path: root/contrib/libs/llvm12/lib/Transforms
author     orivej <orivej@yandex-team.ru>  2022-02-10 16:45:01 +0300
committer  Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:45:01 +0300
commit     2d37894b1b037cf24231090eda8589bbb44fb6fc (patch)
tree       be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/libs/llvm12/lib/Transforms
parent     718c552901d703c502ccbefdfc3c9028d608b947 (diff)
download   ydb-2d37894b1b037cf24231090eda8589bbb44fb6fc.tar.gz
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms')
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp  744
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h  248
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp  862
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make  38
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp  600
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make  36
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp  330
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp  2302
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp  4248
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp  14100
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp  94
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp  416
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp  868
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp  572
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp  350
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp  2246
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp  204
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp  328
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp  238
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp  3078
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp  2714
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp  920
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp  6380
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp  392
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp  1442
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp  276
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp  170
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp  248
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp  2012
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp  582
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp  378
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp  4484
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp  1902
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp  2610
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp  2758
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp  2354
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp  464
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp  186
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp  3722
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp  176
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp  744
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp  288
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp  1092
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp  4404
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ya.make  110
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp  4266
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp  6220
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp  306
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp  4422
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp  5052
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp  12026
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h  1468
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp  2886
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp  2888
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp  856
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp  2494
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp  5588
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp  2384
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp  2812
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp  4558
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp  7606
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make  64
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp  6774
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp  508
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h  588
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp  302
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp  4172
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp  3234
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp  2020
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp  2726
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp  882
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp  424
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp  2112
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp  256
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp  10020
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp  3674
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp  1036
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp  706
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp  1884
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp  1446
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp  158
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h  154
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc  194
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make  70
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h  284
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h  234
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp  478
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h  164
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp  74
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h  186
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp  282
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp  1312
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp  176
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp  4658
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp  338
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h  160
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp  184
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp  868
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h  424
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make  52
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp  1490
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp  658
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp  412
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp  1178
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp  1966
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp  1648
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp  334
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp  4112
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp  788
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp  2776
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp  180
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp  1100
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp  5314
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp  1106
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp  1852
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp  1752
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp  42
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp  3592
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp  3834
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp  2214
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp  294
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp  5682
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp  4268
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp  48
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp  836
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp  520
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp  2160
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp  3220
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  3608
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp  514
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp  3186
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp  1394
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp  132
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp  2476
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp  3368
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp  234
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp  1516
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp  716
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp  11626
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp  1056
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp  2882
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp  3204
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp  1214
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp  354
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp  352
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp  836
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp  180
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp  3726
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp  172
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp  216
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp  2316
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp  1868
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp  846
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp  1000
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp  8306
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp  372
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp  1372
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp  5016
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp  150
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp  5776
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp  3874
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp  9268
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp  570
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp  1938
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  2650
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  6028
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp  518
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp  508
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp  1652
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp  678
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp  1420
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp  2026
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp  1676
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp  294
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/ya.make  182
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp  486
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp  304
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp  554
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp  1232
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp  2632
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp  960
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp  2632
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp  964
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp  330
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp  1180
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp  210
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp  496
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp  1728
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp  414
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp  3514
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp  812
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp  318
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp  998
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp  306
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp  328
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp  188
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp  1442
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp  656
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp  1090
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp  1920
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp  668
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp  384
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp  250
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp  326
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp  4752
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp  82
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp  1346
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp  932
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp  1124
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp  5626
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp  1474
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp  1872
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp  1808
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp  1944
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp  1914
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp  2954
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp  548
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp  192
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp  934
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp  814
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp  232
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp  166
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp  640
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp  240
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp  1682
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp  1908
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp  952
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp  380
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp  214
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp  4540
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp  10980
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp  1928
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp  6938
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp  222
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp  568
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp  118
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp  70
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp  1174
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp  140
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp  412
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp  192
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp  122
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp  1220
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp  2254
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ya.make  172
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp  2608
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp  2456
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h  580
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp  14244
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp  13140
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h  310
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp  1820
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h  3528
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h  82
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp  708
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h  142
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h  88
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp  492
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h  148
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp  918
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp  164
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h  66
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h  394
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp  256
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h  82
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp  1342
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp  84
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make  58
268 files changed, 242637 insertions(+), 242637 deletions(-)
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index cee0726d70..a7ae10d156 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1,96 +1,96 @@
-//===- AggressiveInstCombine.cpp ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the aggressive expression pattern combiner classes.
-// Currently, it handles expression patterns for:
-// * Truncate instruction
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
-#include "AggressiveInstCombineInternal.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/AggressiveInstCombine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- AggressiveInstCombine.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the aggressive expression pattern combiner classes.
+// Currently, it handles expression patterns for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "AggressiveInstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/AggressiveInstCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "aggressive-instcombine"
-
-STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
-STATISTIC(NumGuardedRotates,
- "Number of guarded rotates transformed into funnel shifts");
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
+STATISTIC(NumGuardedRotates,
+ "Number of guarded rotates transformed into funnel shifts");
STATISTIC(NumGuardedFunnelShifts,
"Number of guarded funnel shifts transformed into funnel shifts");
-STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
-
-namespace {
-/// Contains expression pattern combiner logic.
-/// This class provides both the logic to combine expression patterns and
-/// combine them. It differs from InstCombiner class in that each pattern
-/// combiner runs only once as opposed to InstCombine's multi-iteration,
-/// which allows pattern combiner to have higher complexity than the O(1)
-/// required by the instruction combiner.
-class AggressiveInstCombinerLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- AggressiveInstCombinerLegacyPass() : FunctionPass(ID) {
- initializeAggressiveInstCombinerLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- /// Run all expression pattern optimizations on the given /p F function.
- ///
- /// \param F function to optimize.
- /// \returns true if the IR is changed.
- bool runOnFunction(Function &F) override;
-};
-} // namespace
-
+STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
+
+namespace {
+/// Contains expression pattern combiner logic.
+/// This class provides both the logic to combine expression patterns and
+/// combine them. It differs from InstCombiner class in that each pattern
+/// combiner runs only once as opposed to InstCombine's multi-iteration,
+/// which allows pattern combiner to have higher complexity than the O(1)
+/// required by the instruction combiner.
+class AggressiveInstCombinerLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ AggressiveInstCombinerLegacyPass() : FunctionPass(ID) {
+ initializeAggressiveInstCombinerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Run all expression pattern optimizations on the given /p F function.
+ ///
+ /// \param F function to optimize.
+ /// \returns true if the IR is changed.
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
/// Match a pattern for a bitwise funnel/rotate operation that partially guards
/// against undefined behavior by branching around the funnel-shift/rotation
/// when the shift amount is 0.
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
- if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
- return false;
-
- // As with the one-use checks below, this is not strictly necessary, but we
- // are being cautious to avoid potential perf regressions on targets that
+ if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
+ return false;
+
+ // As with the one-use checks below, this is not strictly necessary, but we
+ // are being cautious to avoid potential perf regressions on targets that
// do not actually have a funnel/rotate instruction (where the funnel shift
// would be expanded back into math/shift/logic ops).
- if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
- return false;
-
+ if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
+ return false;
+
// Match V to funnel shift left/right and capture the source operands and
// shift amount.
auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
Value *&ShAmt) {
Value *SubAmt;
- unsigned Width = V->getType()->getScalarSizeInBits();
-
+ unsigned Width = V->getType()->getScalarSizeInBits();
+
// fshl(ShVal0, ShVal1, ShAmt)
// == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt))
if (match(V, m_OneUse(m_c_Or(
@@ -99,8 +99,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
m_Sub(m_SpecificInt(Width), m_Value(SubAmt))))))) {
if (ShAmt == SubAmt) // TODO: Use m_Specific
return Intrinsic::fshl;
- }
-
+ }
+
// fshr(ShVal0, ShVal1, ShAmt)
// == (ShVal0 >> ShAmt) | (ShVal1 << (Width - ShAmt))
if (match(V,
@@ -109,19 +109,19 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
m_LShr(m_Value(ShVal1), m_Value(ShAmt)))))) {
if (ShAmt == SubAmt) // TODO: Use m_Specific
return Intrinsic::fshr;
- }
-
- return Intrinsic::not_intrinsic;
- };
-
+ }
+
+ return Intrinsic::not_intrinsic;
+ };
+
// One phi operand must be a funnel/rotate operation, and the other phi
// operand must be the source value of that funnel/rotate operation:
// phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
// phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
// phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
- PHINode &Phi = cast<PHINode>(I);
+ PHINode &Phi = cast<PHINode>(I);
unsigned FunnelOp = 0, GuardOp = 1;
- Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
+ Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
Value *ShVal0, *ShVal1, *ShAmt;
Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
if (IID == Intrinsic::not_intrinsic ||
@@ -131,33 +131,33 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
if (IID == Intrinsic::not_intrinsic ||
(IID == Intrinsic::fshl && ShVal0 != P0) ||
(IID == Intrinsic::fshr && ShVal1 != P0))
- return false;
- assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
- "Pattern must match funnel shift left or right");
+ return false;
+ assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
+ "Pattern must match funnel shift left or right");
std::swap(FunnelOp, GuardOp);
- }
-
- // The incoming block with our source operand must be the "guard" block.
+ }
+
+ // The incoming block with our source operand must be the "guard" block.
// That must contain a cmp+branch to avoid the funnel/rotate when the shift
// amount is equal to 0. The other incoming block is the block with the
// funnel/rotate.
BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
- Instruction *TermI = GuardBB->getTerminator();
+ Instruction *TermI = GuardBB->getTerminator();
// Ensure that the shift values dominate each block.
if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
return false;
- ICmpInst::Predicate Pred;
- BasicBlock *PhiBB = Phi.getParent();
+ ICmpInst::Predicate Pred;
+ BasicBlock *PhiBB = Phi.getParent();
if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()),
m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
- return false;
-
- if (Pred != CmpInst::ICMP_EQ)
- return false;
-
+ return false;
+
+ if (Pred != CmpInst::ICMP_EQ)
+ return false;
+
IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
if (ShVal0 == ShVal1)
@@ -175,8 +175,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
ShVal0 = Builder.CreateFreeze(ShVal0);
}
- // We matched a variation of this IR pattern:
- // GuardBB:
+ // We matched a variation of this IR pattern:
+ // GuardBB:
// %cmp = icmp eq i32 %ShAmt, 0
// br i1 %cmp, label %PhiBB, label %FunnelBB
// FunnelBB:
@@ -184,280 +184,280 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
// %shr = lshr i32 %ShVal1, %sub
// %shl = shl i32 %ShVal0, %ShAmt
// %fsh = or i32 %shr, %shl
- // br label %PhiBB
- // PhiBB:
+ // br label %PhiBB
+ // PhiBB:
// %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
- // -->
+ // -->
// llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
- Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
+ Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt}));
- return true;
-}
-
-/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
-/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
-/// of 'and' ops, then we also need to capture the fact that we saw an
-/// "and X, 1", so that's an extra return value for that case.
-struct MaskOps {
- Value *Root;
- APInt Mask;
- bool MatchAndChain;
- bool FoundAnd1;
-
- MaskOps(unsigned BitWidth, bool MatchAnds)
- : Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
- MatchAndChain(MatchAnds), FoundAnd1(false) {}
-};
-
-/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
-/// chain of 'and' or 'or' instructions looking for shift ops of a common source
-/// value. Examples:
-/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
-/// returns { X, 0x129 }
-/// and (and (X >> 1), 1), (X >> 4)
-/// returns { X, 0x12 }
-static bool matchAndOrChain(Value *V, MaskOps &MOps) {
- Value *Op0, *Op1;
- if (MOps.MatchAndChain) {
- // Recurse through a chain of 'and' operands. This requires an extra check
- // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
- // in the chain to know that all of the high bits are cleared.
- if (match(V, m_And(m_Value(Op0), m_One()))) {
- MOps.FoundAnd1 = true;
- return matchAndOrChain(Op0, MOps);
- }
- if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
- return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
- } else {
- // Recurse through a chain of 'or' operands.
- if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
- return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
- }
-
- // We need a shift-right or a bare value representing a compare of bit 0 of
- // the original source operand.
- Value *Candidate;
+ return true;
+}
+
+/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
+/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
+/// of 'and' ops, then we also need to capture the fact that we saw an
+/// "and X, 1", so that's an extra return value for that case.
+struct MaskOps {
+ Value *Root;
+ APInt Mask;
+ bool MatchAndChain;
+ bool FoundAnd1;
+
+ MaskOps(unsigned BitWidth, bool MatchAnds)
+ : Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
+ MatchAndChain(MatchAnds), FoundAnd1(false) {}
+};
+
+/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
+/// chain of 'and' or 'or' instructions looking for shift ops of a common source
+/// value. Examples:
+/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
+/// returns { X, 0x129 }
+/// and (and (X >> 1), 1), (X >> 4)
+/// returns { X, 0x12 }
+static bool matchAndOrChain(Value *V, MaskOps &MOps) {
+ Value *Op0, *Op1;
+ if (MOps.MatchAndChain) {
+ // Recurse through a chain of 'and' operands. This requires an extra check
+ // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
+ // in the chain to know that all of the high bits are cleared.
+ if (match(V, m_And(m_Value(Op0), m_One()))) {
+ MOps.FoundAnd1 = true;
+ return matchAndOrChain(Op0, MOps);
+ }
+ if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ } else {
+ // Recurse through a chain of 'or' operands.
+ if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ }
+
+ // We need a shift-right or a bare value representing a compare of bit 0 of
+ // the original source operand.
+ Value *Candidate;
const APInt *BitIndex = nullptr;
if (!match(V, m_LShr(m_Value(Candidate), m_APInt(BitIndex))))
- Candidate = V;
-
- // Initialize result source operand.
- if (!MOps.Root)
- MOps.Root = Candidate;
-
- // The shift constant is out-of-range? This code hasn't been simplified.
+ Candidate = V;
+
+ // Initialize result source operand.
+ if (!MOps.Root)
+ MOps.Root = Candidate;
+
+ // The shift constant is out-of-range? This code hasn't been simplified.
if (BitIndex && BitIndex->uge(MOps.Mask.getBitWidth()))
- return false;
-
- // Fill in the mask bit derived from the shift constant.
+ return false;
+
+ // Fill in the mask bit derived from the shift constant.
MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
- return MOps.Root == Candidate;
-}
-
-/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
-/// These will include a chain of 'or' or 'and'-shifted bits from a
-/// common source value:
-/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
-/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
-/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
-/// that differ only with a final 'not' of the result. We expect that final
-/// 'not' to be folded with the compare that we create here (invert predicate).
-static bool foldAnyOrAllBitsSet(Instruction &I) {
- // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
- // final "and X, 1" instruction must be the final op in the sequence.
- bool MatchAllBitsSet;
- if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
- MatchAllBitsSet = true;
- else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
- MatchAllBitsSet = false;
- else
- return false;
-
- MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
- if (MatchAllBitsSet) {
- if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
- return false;
- } else {
- if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
- return false;
- }
-
- // The pattern was found. Create a masked compare that replaces all of the
- // shift and logic ops.
- IRBuilder<> Builder(&I);
- Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
- Value *And = Builder.CreateAnd(MOps.Root, Mask);
- Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
- : Builder.CreateIsNotNull(And);
- Value *Zext = Builder.CreateZExt(Cmp, I.getType());
- I.replaceAllUsesWith(Zext);
- ++NumAnyOrAllBitsSet;
- return true;
-}
-
-// Try to recognize below function as popcount intrinsic.
-// This is the "best" algorithm from
-// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-// Also used in TargetLowering::expandCTPOP().
-//
-// int popcount(unsigned int i) {
-// i = i - ((i >> 1) & 0x55555555);
-// i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
-// i = ((i + (i >> 4)) & 0x0F0F0F0F);
-// return (i * 0x01010101) >> 24;
-// }
-static bool tryToRecognizePopCount(Instruction &I) {
- if (I.getOpcode() != Instruction::LShr)
- return false;
-
- Type *Ty = I.getType();
- if (!Ty->isIntOrIntVectorTy())
- return false;
-
- unsigned Len = Ty->getScalarSizeInBits();
- // FIXME: fix Len == 8 and other irregular type lengths.
- if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
- return false;
-
- APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
- APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
- APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
- APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
- APInt MaskShift = APInt(Len, Len - 8);
-
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *MulOp0;
- // Matching "(i * 0x01010101...) >> 24".
- if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
- match(Op1, m_SpecificInt(MaskShift))) {
- Value *ShiftOp0;
- // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
- if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
- m_Deferred(ShiftOp0)),
- m_SpecificInt(Mask0F)))) {
- Value *AndOp0;
- // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
- if (match(ShiftOp0,
- m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
- m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
- m_SpecificInt(Mask33))))) {
- Value *Root, *SubOp1;
- // Matching "i - ((i >> 1) & 0x55555555...)".
- if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
- match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
- m_SpecificInt(Mask55)))) {
- LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
- IRBuilder<> Builder(&I);
- Function *Func = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::ctpop, I.getType());
- I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
- ++NumPopCountRecognized;
- return true;
- }
- }
- }
- }
-
- return false;
-}
-
-/// This is the entry point for folds that could be implemented in regular
-/// InstCombine, but they are separated because they are not expected to
-/// occur frequently and/or have more than a constant-length pattern match.
-static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
- bool MadeChange = false;
- for (BasicBlock &BB : F) {
- // Ignore unreachable basic blocks.
- if (!DT.isReachableFromEntry(&BB))
- continue;
- // Do not delete instructions under here and invalidate the iterator.
- // Walk the block backwards for efficiency. We're matching a chain of
- // use->defs, so we're more likely to succeed by starting from the bottom.
- // Also, we want to avoid matching partial patterns.
- // TODO: It would be more efficient if we removed dead instructions
- // iteratively in this loop rather than waiting until the end.
- for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
- MadeChange |= foldAnyOrAllBitsSet(I);
+ return MOps.Root == Candidate;
+}
+
+/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
+/// These will include a chain of 'or' or 'and'-shifted bits from a
+/// common source value:
+/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
+/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
+/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
+/// that differ only with a final 'not' of the result. We expect that final
+/// 'not' to be folded with the compare that we create here (invert predicate).
+static bool foldAnyOrAllBitsSet(Instruction &I) {
+ // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
+ // final "and X, 1" instruction must be the final op in the sequence.
+ bool MatchAllBitsSet;
+ if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
+ MatchAllBitsSet = true;
+ else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
+ MatchAllBitsSet = false;
+ else
+ return false;
+
+ MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
+ if (MatchAllBitsSet) {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
+ return false;
+ } else {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
+ return false;
+ }
+
+ // The pattern was found. Create a masked compare that replaces all of the
+ // shift and logic ops.
+ IRBuilder<> Builder(&I);
+ Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
+ Value *And = Builder.CreateAnd(MOps.Root, Mask);
+ Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
+ : Builder.CreateIsNotNull(And);
+ Value *Zext = Builder.CreateZExt(Cmp, I.getType());
+ I.replaceAllUsesWith(Zext);
+ ++NumAnyOrAllBitsSet;
+ return true;
+}
+
+// Try to recognize below function as popcount intrinsic.
+// This is the "best" algorithm from
+// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+// Also used in TargetLowering::expandCTPOP().
+//
+// int popcount(unsigned int i) {
+// i = i - ((i >> 1) & 0x55555555);
+// i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+// i = ((i + (i >> 4)) & 0x0F0F0F0F);
+// return (i * 0x01010101) >> 24;
+// }
+static bool tryToRecognizePopCount(Instruction &I) {
+ if (I.getOpcode() != Instruction::LShr)
+ return false;
+
+ Type *Ty = I.getType();
+ if (!Ty->isIntOrIntVectorTy())
+ return false;
+
+ unsigned Len = Ty->getScalarSizeInBits();
+ // FIXME: fix Len == 8 and other irregular type lengths.
+ if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
+ return false;
+
+ APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
+ APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
+ APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
+ APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
+ APInt MaskShift = APInt(Len, Len - 8);
+
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *MulOp0;
+ // Matching "(i * 0x01010101...) >> 24".
+ if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
+ match(Op1, m_SpecificInt(MaskShift))) {
+ Value *ShiftOp0;
+ // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
+ if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
+ m_Deferred(ShiftOp0)),
+ m_SpecificInt(Mask0F)))) {
+ Value *AndOp0;
+ // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
+ if (match(ShiftOp0,
+ m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
+ m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
+ m_SpecificInt(Mask33))))) {
+ Value *Root, *SubOp1;
+ // Matching "i - ((i >> 1) & 0x55555555...)".
+ if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
+ match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
+ m_SpecificInt(Mask55)))) {
+ LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
+ IRBuilder<> Builder(&I);
+ Function *Func = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::ctpop, I.getType());
+ I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
+ ++NumPopCountRecognized;
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// This is the entry point for folds that could be implemented in regular
+/// InstCombine, but they are separated because they are not expected to
+/// occur frequently and/or have more than a constant-length pattern match.
+static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ // Do not delete instructions under here and invalidate the iterator.
+ // Walk the block backwards for efficiency. We're matching a chain of
+ // use->defs, so we're more likely to succeed by starting from the bottom.
+ // Also, we want to avoid matching partial patterns.
+ // TODO: It would be more efficient if we removed dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
+ MadeChange |= foldAnyOrAllBitsSet(I);
MadeChange |= foldGuardedFunnelShift(I, DT);
- MadeChange |= tryToRecognizePopCount(I);
- }
- }
-
- // We're done with transforms, so remove dead instructions.
- if (MadeChange)
- for (BasicBlock &BB : F)
- SimplifyInstructionsInBlock(&BB);
-
- return MadeChange;
-}
-
-/// This is the entry point for all transforms. Pass manager differences are
-/// handled in the callers of this function.
-static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
- bool MadeChange = false;
- const DataLayout &DL = F.getParent()->getDataLayout();
- TruncInstCombine TIC(TLI, DL, DT);
- MadeChange |= TIC.run(F);
- MadeChange |= foldUnusualPatterns(F, DT);
- return MadeChange;
-}
-
-void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
- AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
-}
-
-bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return runImpl(F, TLI, DT);
-}
-
-PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, TLI, DT)) {
- // No changes, all analyses are preserved.
- return PreservedAnalyses::all();
- }
- // Mark all the analyses that instcombine updates as preserved.
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char AggressiveInstCombinerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
- "aggressive-instcombine",
- "Combine pattern based expressions", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
- "Combine pattern based expressions", false, false)
-
-// Initialization Routines
-void llvm::initializeAggressiveInstCombine(PassRegistry &Registry) {
- initializeAggressiveInstCombinerLegacyPassPass(Registry);
-}
-
-void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R) {
- initializeAggressiveInstCombinerLegacyPassPass(*unwrap(R));
-}
-
-FunctionPass *llvm::createAggressiveInstCombinerPass() {
- return new AggressiveInstCombinerLegacyPass();
-}
-
-void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAggressiveInstCombinerPass());
-}
+ MadeChange |= tryToRecognizePopCount(I);
+ }
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
+ bool MadeChange = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TruncInstCombine TIC(TLI, DL, DT);
+ MadeChange |= TIC.run(F);
+ MadeChange |= foldUnusualPatterns(F, DT);
+ return MadeChange;
+}
+
+void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return runImpl(F, TLI, DT);
+}
+
+PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT)) {
+ // No changes, all analyses are preserved.
+ return PreservedAnalyses::all();
+ }
+ // Mark all the analyses that instcombine updates as preserved.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char AggressiveInstCombinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
+ "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
+
+// Initialization Routines
+void llvm::initializeAggressiveInstCombine(PassRegistry &Registry) {
+ initializeAggressiveInstCombinerLegacyPassPass(Registry);
+}
+
+void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R) {
+ initializeAggressiveInstCombinerLegacyPassPass(*unwrap(R));
+}
+
+FunctionPass *llvm::createAggressiveInstCombinerPass() {
+ return new AggressiveInstCombinerLegacyPass();
+}
+
+void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveInstCombinerPass());
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
index 92620bde16..42bcadfc7d 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -1,124 +1,124 @@
-//===- AggressiveInstCombineInternal.h --------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the instruction pattern combiner classes.
-// Currently, it handles pattern expressions for:
-// * Truncate instruction
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
-#define LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
-
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallVector.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// TruncInstCombine - looks for expression dags dominated by trunc instructions
-// and for each eligible dag, it will create a reduced bit-width expression and
-// replace the old expression with this new one and remove the old one.
-// Eligible expression dag is such that:
-// 1. Contains only supported instructions.
-// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
-// 3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
-// 4. All instructions in the dag must not have users outside the dag.
-// Only exception is for {ZExt, SExt}Inst with operand type equal to the
-// new reduced type chosen in (3).
-//
-// The motivation for this optimization is that evaluating and expression using
-// smaller bit-width is preferable, especially for vectorization where we can
-// fit more values in one vectorized instruction. In addition, this optimization
-// may decrease the number of cast instructions, but will not increase it.
-//===----------------------------------------------------------------------===//
-
-namespace llvm {
- class DataLayout;
- class DominatorTree;
- class Function;
- class Instruction;
- class TargetLibraryInfo;
- class TruncInst;
- class Type;
- class Value;
-
-class TruncInstCombine {
- TargetLibraryInfo &TLI;
- const DataLayout &DL;
- const DominatorTree &DT;
-
- /// List of all TruncInst instructions to be processed.
- SmallVector<TruncInst *, 4> Worklist;
-
- /// Current processed TruncInst instruction.
- TruncInst *CurrentTruncInst;
-
- /// Information per each instruction in the expression dag.
- struct Info {
- /// Number of LSBs that are needed to generate a valid expression.
- unsigned ValidBitWidth = 0;
- /// Minimum number of LSBs needed to generate the ValidBitWidth.
- unsigned MinBitWidth = 0;
- /// The reduced value generated to replace the old instruction.
- Value *NewValue = nullptr;
- };
- /// An ordered map representing expression dag post-dominated by current
- /// processed TruncInst. It maps each instruction in the dag to its Info
- /// structure. The map is ordered such that each instruction appears before
- /// all other instructions in the dag that uses it.
- MapVector<Instruction *, Info> InstInfoMap;
-
-public:
- TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL,
- const DominatorTree &DT)
- : TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
-
- /// Perform TruncInst pattern optimization on given function.
- bool run(Function &F);
-
-private:
- /// Build expression dag dominated by the /p CurrentTruncInst and append it to
- /// the InstInfoMap container.
- ///
- /// \return true only if succeed to generate an eligible sub expression dag.
- bool buildTruncExpressionDag();
-
- /// Calculate the minimal allowed bit-width of the chain ending with the
- /// currently visited truncate's operand.
- ///
- /// \return minimum number of bits to which the chain ending with the
- /// truncate's operand can be shrunk to.
- unsigned getMinBitWidth();
-
- /// Build an expression dag dominated by the current processed TruncInst and
- /// Check if it is eligible to be reduced to a smaller type.
- ///
- /// \return the scalar version of the new type to be used for the reduced
- /// expression dag, or nullptr if the expression dag is not eligible
- /// to be reduced.
- Type *getBestTruncatedType();
-
- /// Given a \p V value and a \p SclTy scalar type return the generated reduced
- /// value of \p V based on the type \p SclTy.
- ///
- /// \param V value to be reduced.
- /// \param SclTy scalar version of new type to reduce to.
- /// \return the new reduced value.
- Value *getReducedOperand(Value *V, Type *SclTy);
-
- /// Create a new expression dag using the reduced \p SclTy type and replace
- /// the old expression dag with it. Also erase all instructions in the old
- /// dag, except those that are still needed outside the dag.
- ///
- /// \param SclTy scalar version of new type to reduce expression dag into.
- void ReduceExpressionDag(Type *SclTy);
-};
-} // end namespace llvm.
-
-#endif
+//===- AggressiveInstCombineInternal.h --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the instruction pattern combiner classes.
+// Currently, it handles pattern expressions for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TruncInstCombine - looks for expression dags dominated by trunc instructions
+// and for each eligible dag, it will create a reduced bit-width expression and
+// replace the old expression with this new one and remove the old one.
+// Eligible expression dag is such that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
+// 4. All instructions in the dag must not have users outside the dag.
+// The only exception is for {ZExt, SExt}Inst with operand type equal to the
+// new reduced type chosen in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// a smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+ class DataLayout;
+ class DominatorTree;
+ class Function;
+ class Instruction;
+ class TargetLibraryInfo;
+ class TruncInst;
+ class Type;
+ class Value;
+
+class TruncInstCombine {
+ TargetLibraryInfo &TLI;
+ const DataLayout &DL;
+ const DominatorTree &DT;
+
+ /// List of all TruncInst instructions to be processed.
+ SmallVector<TruncInst *, 4> Worklist;
+
+ /// Currently processed TruncInst instruction.
+ TruncInst *CurrentTruncInst;
+
+ /// Information for each instruction in the expression dag.
+ struct Info {
+ /// Number of LSBs that are needed to generate a valid expression.
+ unsigned ValidBitWidth = 0;
+ /// Minimum number of LSBs needed to generate the ValidBitWidth.
+ unsigned MinBitWidth = 0;
+ /// The reduced value generated to replace the old instruction.
+ Value *NewValue = nullptr;
+ };
+ /// An ordered map representing the expression dag post-dominated by the
+ /// currently processed TruncInst. It maps each instruction in the dag to its
+ /// Info structure. The map is ordered such that each instruction appears
+ /// before all other instructions in the dag that use it.
+ MapVector<Instruction *, Info> InstInfoMap;
+
+public:
+ TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL,
+ const DominatorTree &DT)
+ : TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
+
+ /// Perform TruncInst pattern optimization on given function.
+ bool run(Function &F);
+
+private:
+ /// Build the expression dag dominated by \p CurrentTruncInst and append it to
+ /// the InstInfoMap container.
+ ///
+ /// \return true only if an eligible sub-expression dag was generated.
+ bool buildTruncExpressionDag();
+
+ /// Calculate the minimal allowed bit-width of the chain ending with the
+ /// currently visited truncate's operand.
+ ///
+ /// \return the minimum number of bits to which the chain ending with the
+ /// truncate's operand can be shrunk.
+ unsigned getMinBitWidth();
+
+ /// Build an expression dag dominated by the currently processed TruncInst and
+ /// check if it is eligible to be reduced to a smaller type.
+ ///
+ /// \return the scalar version of the new type to be used for the reduced
+ /// expression dag, or nullptr if the expression dag is not eligible
+ /// to be reduced.
+ Type *getBestTruncatedType();
+
+ /// Given a \p V value and a \p SclTy scalar type, return the generated reduced
+ /// value of \p V based on the type \p SclTy.
+ ///
+ /// \param V value to be reduced.
+ /// \param SclTy scalar version of new type to reduce to.
+ /// \return the new reduced value.
+ Value *getReducedOperand(Value *V, Type *SclTy);
+
+ /// Create a new expression dag using the reduced \p SclTy type and replace
+ /// the old expression dag with it. Also erase all instructions in the old
+ /// dag, except those that are still needed outside the dag.
+ ///
+ /// \param SclTy scalar version of new type to reduce expression dag into.
+ void ReduceExpressionDag(Type *SclTy);
+};
+} // end namespace llvm.
+
+#endif
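
For orientation, here is a minimal, hypothetical C++ snippet (not part of this patch; the function name blend is assumed) showing the kind of pattern the header above targets: the byte operands are promoted to 32-bit integers, combined, and the result is truncated back to 8 bits, so the whole trunc-dominated expression dag can be evaluated in the narrower type.

#include <cstdint>

// Assumed example: in straightforward IR, A, B and C are zero-extended to
// i32, the add and mul are performed in i32, and the result is truncated to
// i8. That trunc-dominated dag is exactly what TruncInstCombine can shrink
// back to 8-bit arithmetic.
uint8_t blend(uint8_t A, uint8_t B, uint8_t C) {
  return static_cast<uint8_t>((A + B) * C);
}
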
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 193057aaab..16b82219e8 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -1,436 +1,436 @@
-//===- TruncInstCombine.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// TruncInstCombine - looks for expression dags post-dominated by TruncInst and
-// for each eligible dag, it will create a reduced bit-width expression, replace
-// the old expression with this new one and remove the old expression.
-// Eligible expression dag is such that:
-// 1. Contains only supported instructions.
-// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
-// 3. Can be evaluated into type with reduced legal bit-width.
-// 4. All instructions in the dag must not have users outside the dag.
-// The only exception is for {ZExt, SExt}Inst with operand type equal to
-// the new reduced type evaluated in (3).
-//
-// The motivation for this optimization is that evaluating an expression using
-// a smaller bit-width is preferable, especially for vectorization where we can
-// fit more values in one vectorized instruction. In addition, this optimization
-// may decrease the number of cast instructions, but will not increase it.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AggressiveInstCombineInternal.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
+//===- TruncInstCombine.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TruncInstCombine - looks for expression dags post-dominated by TruncInst and
+// for each eligible dag, it will create a reduced bit-width expression, replace
+// the old expression with this new one and remove the old expression.
+// Eligible expression dag is such that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width.
+// 4. All instructions in the dag must not have users outside the dag.
+// The only exception is for {ZExt, SExt}Inst with operand type equal to
+// the new reduced type evaluated in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// a smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "aggressive-instcombine"
-
-STATISTIC(
- NumDAGsReduced,
- "Number of truncations eliminated by reducing bit width of expression DAG");
-STATISTIC(NumInstrsReduced,
- "Number of instructions whose bit width was reduced");
-
-/// Given an instruction and a container, it fills all the relevant operands of
-/// that instruction, with respect to the Trunc expression dag optimization.
-static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // These CastInsts are considered leaves of the evaluated expression; thus,
- // their operands are not relevant.
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- Ops.push_back(I->getOperand(0));
- Ops.push_back(I->getOperand(1));
- break;
- case Instruction::Select:
- Ops.push_back(I->getOperand(1));
- Ops.push_back(I->getOperand(2));
- break;
- default:
- llvm_unreachable("Unreachable!");
- }
-}
-
-bool TruncInstCombine::buildTruncExpressionDag() {
- SmallVector<Value *, 8> Worklist;
- SmallVector<Instruction *, 8> Stack;
- // Clear old expression dag.
- InstInfoMap.clear();
-
- Worklist.push_back(CurrentTruncInst->getOperand(0));
-
- while (!Worklist.empty()) {
- Value *Curr = Worklist.back();
-
- if (isa<Constant>(Curr)) {
- Worklist.pop_back();
- continue;
- }
-
- auto *I = dyn_cast<Instruction>(Curr);
- if (!I)
- return false;
-
- if (!Stack.empty() && Stack.back() == I) {
- // Already handled all instruction operands, can remove it from both the
- // Worklist and the Stack, and add it to the instruction info map.
- Worklist.pop_back();
- Stack.pop_back();
- // Insert I to the Info map.
- InstInfoMap.insert(std::make_pair(I, Info()));
- continue;
- }
-
- if (InstInfoMap.count(I)) {
- Worklist.pop_back();
- continue;
- }
-
- // Add the instruction to the stack before starting to handle its operands.
- Stack.push_back(I);
-
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // trunc(trunc(x)) -> trunc(x)
- // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
- // trunc(ext(x)) -> trunc(x) if the source type is larger than the new
- // dest
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Select: {
- SmallVector<Value *, 2> Operands;
- getRelevantOperands(I, Operands);
+#include "llvm/IR/Instruction.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+STATISTIC(
+ NumDAGsReduced,
+ "Number of truncations eliminated by reducing bit width of expression DAG");
+STATISTIC(NumInstrsReduced,
+ "Number of instructions whose bit width was reduced");
+
+/// Given an instruction and a container, it fills all the relevant operands of
+/// that instruction, with respect to the Trunc expression dag optimization.
+static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // These CastInsts are considered leaves of the evaluated expression; thus,
+ // their operands are not relevant.
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ Ops.push_back(I->getOperand(0));
+ Ops.push_back(I->getOperand(1));
+ break;
+ case Instruction::Select:
+ Ops.push_back(I->getOperand(1));
+ Ops.push_back(I->getOperand(2));
+ break;
+ default:
+ llvm_unreachable("Unreachable!");
+ }
+}
+
+bool TruncInstCombine::buildTruncExpressionDag() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
+ // Clear old expression dag.
+ InstInfoMap.clear();
+
+ Worklist.push_back(CurrentTruncInst->getOperand(0));
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ auto *I = dyn_cast<Instruction>(Curr);
+ if (!I)
+ return false;
+
+ if (!Stack.empty() && Stack.back() == I) {
+ // Already handled all instruction operands, can remove it from both the
+ // Worklist and the Stack, and add it to the instruction info map.
+ Worklist.pop_back();
+ Stack.pop_back();
+ // Insert I to the Info map.
+ InstInfoMap.insert(std::make_pair(I, Info()));
+ continue;
+ }
+
+ if (InstInfoMap.count(I)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ // Add the instruction to the stack before starting to handle its operands.
+ Stack.push_back(I);
+
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // trunc(trunc(x)) -> trunc(x)
+ // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+ // trunc(ext(x)) -> trunc(x) if the source type is larger than the new
+ // dest
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Select: {
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
append_range(Worklist, Operands);
- break;
- }
- default:
- // TODO: Can handle more cases here:
- // 1. shufflevector, extractelement, insertelement
- // 2. udiv, urem
- // 3. shl, lshr, ashr
- // 4. phi node (and loop handling)
- // ...
- return false;
- }
- }
- return true;
-}
-
-unsigned TruncInstCombine::getMinBitWidth() {
- SmallVector<Value *, 8> Worklist;
- SmallVector<Instruction *, 8> Stack;
-
- Value *Src = CurrentTruncInst->getOperand(0);
- Type *DstTy = CurrentTruncInst->getType();
- unsigned TruncBitWidth = DstTy->getScalarSizeInBits();
- unsigned OrigBitWidth =
- CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
-
- if (isa<Constant>(Src))
- return TruncBitWidth;
-
- Worklist.push_back(Src);
- InstInfoMap[cast<Instruction>(Src)].ValidBitWidth = TruncBitWidth;
-
- while (!Worklist.empty()) {
- Value *Curr = Worklist.back();
-
- if (isa<Constant>(Curr)) {
- Worklist.pop_back();
- continue;
- }
-
- // Otherwise, it must be an instruction.
- auto *I = cast<Instruction>(Curr);
-
- auto &Info = InstInfoMap[I];
-
- SmallVector<Value *, 2> Operands;
- getRelevantOperands(I, Operands);
-
- if (!Stack.empty() && Stack.back() == I) {
- // Already handled all instruction operands, can remove it from both the
- // Worklist and the Stack, and update MinBitWidth.
- Worklist.pop_back();
- Stack.pop_back();
- for (auto *Operand : Operands)
- if (auto *IOp = dyn_cast<Instruction>(Operand))
- Info.MinBitWidth =
- std::max(Info.MinBitWidth, InstInfoMap[IOp].MinBitWidth);
- continue;
- }
-
- // Add the instruction to the stack before starting to handle its operands.
- Stack.push_back(I);
- unsigned ValidBitWidth = Info.ValidBitWidth;
-
- // Update minimum bit-width before handling its operands. This is required
- // when the instruction is part of a loop.
- Info.MinBitWidth = std::max(Info.MinBitWidth, Info.ValidBitWidth);
-
- for (auto *Operand : Operands)
- if (auto *IOp = dyn_cast<Instruction>(Operand)) {
- // If we already calculated the minimum bit-width for this valid
- // bit-width, or for a smaller valid bit-width, then just keep the
- // answer we already calculated.
- unsigned IOpBitwidth = InstInfoMap.lookup(IOp).ValidBitWidth;
- if (IOpBitwidth >= ValidBitWidth)
- continue;
- InstInfoMap[IOp].ValidBitWidth = ValidBitWidth;
- Worklist.push_back(IOp);
- }
- }
- unsigned MinBitWidth = InstInfoMap.lookup(cast<Instruction>(Src)).MinBitWidth;
- assert(MinBitWidth >= TruncBitWidth);
-
- if (MinBitWidth > TruncBitWidth) {
- // In this case reducing expression with vector type might generate a new
- // vector type, which is not preferable as it might result in generating
- // sub-optimal code.
- if (DstTy->isVectorTy())
- return OrigBitWidth;
- // Use the smallest integer type in the range [MinBitWidth, OrigBitWidth).
- Type *Ty = DL.getSmallestLegalIntType(DstTy->getContext(), MinBitWidth);
- // Update minimum bit-width with the new destination type bit-width if
- // succeeded to find such, otherwise, with original bit-width.
- MinBitWidth = Ty ? Ty->getScalarSizeInBits() : OrigBitWidth;
- } else { // MinBitWidth == TruncBitWidth
- // In this case the expression can be evaluated with the trunc instruction
- // destination type, and trunc instruction can be omitted. However, we
- // should not perform the evaluation if the original type is a legal scalar
- // type and the target type is illegal.
- bool FromLegal = MinBitWidth == 1 || DL.isLegalInteger(OrigBitWidth);
- bool ToLegal = MinBitWidth == 1 || DL.isLegalInteger(MinBitWidth);
- if (!DstTy->isVectorTy() && FromLegal && !ToLegal)
- return OrigBitWidth;
- }
- return MinBitWidth;
-}
-
-Type *TruncInstCombine::getBestTruncatedType() {
- if (!buildTruncExpressionDag())
- return nullptr;
-
- // We don't want to duplicate instructions, which isn't profitable. Thus, we
- // can't shrink something that has multiple users, unless all users are
- // post-dominated by the trunc instruction, i.e., were visited during the
- // expression evaluation.
- unsigned DesiredBitWidth = 0;
- for (auto Itr : InstInfoMap) {
- Instruction *I = Itr.first;
- if (I->hasOneUse())
- continue;
- bool IsExtInst = (isa<ZExtInst>(I) || isa<SExtInst>(I));
- for (auto *U : I->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- if (UI != CurrentTruncInst && !InstInfoMap.count(UI)) {
- if (!IsExtInst)
- return nullptr;
- // If this is an extension from the dest type, we can eliminate it,
- // even if it has multiple users. Thus, update the DesiredBitWidth and check
- // that all extension instructions agree on the same DesiredBitWidth.
- unsigned ExtInstBitWidth =
- I->getOperand(0)->getType()->getScalarSizeInBits();
- if (DesiredBitWidth && DesiredBitWidth != ExtInstBitWidth)
- return nullptr;
- DesiredBitWidth = ExtInstBitWidth;
- }
- }
-
- unsigned OrigBitWidth =
- CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
-
- // Calculate the minimum bit-width allowed for shrinking the currently
- // visited truncate's operand.
- unsigned MinBitWidth = getMinBitWidth();
-
- // Check that we can shrink to a smaller bit-width than the original one and
- // that it matches the DesiredBitWidth, if such exists.
- if (MinBitWidth >= OrigBitWidth ||
- (DesiredBitWidth && DesiredBitWidth != MinBitWidth))
- return nullptr;
-
- return IntegerType::get(CurrentTruncInst->getContext(), MinBitWidth);
-}
-
-/// Given a reduced scalar type \p Ty and a \p V value, return a reduced type
-/// for \p V, according to its type: if it is a vector type, return the vector
-/// version of \p Ty; otherwise, return \p Ty.
-static Type *getReducedType(Value *V, Type *Ty) {
- assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here:
+ // 1. shufflevector, extractelement, insertelement
+ // 2. udiv, urem
+ // 3. shl, lshr, ashr
+ // 4. phi node (and loop handling)
+ // ...
+ return false;
+ }
+ }
+ return true;
+}
+
+unsigned TruncInstCombine::getMinBitWidth() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
+
+ Value *Src = CurrentTruncInst->getOperand(0);
+ Type *DstTy = CurrentTruncInst->getType();
+ unsigned TruncBitWidth = DstTy->getScalarSizeInBits();
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+ if (isa<Constant>(Src))
+ return TruncBitWidth;
+
+ Worklist.push_back(Src);
+ InstInfoMap[cast<Instruction>(Src)].ValidBitWidth = TruncBitWidth;
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ // Otherwise, it must be an instruction.
+ auto *I = cast<Instruction>(Curr);
+
+ auto &Info = InstInfoMap[I];
+
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
+
+ if (!Stack.empty() && Stack.back() == I) {
+ // Already handled all instruction operands, can remove it from both the
+ // Worklist and the Stack, and update MinBitWidth.
+ Worklist.pop_back();
+ Stack.pop_back();
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand))
+ Info.MinBitWidth =
+ std::max(Info.MinBitWidth, InstInfoMap[IOp].MinBitWidth);
+ continue;
+ }
+
+ // Add the instruction to the stack before starting to handle its operands.
+ Stack.push_back(I);
+ unsigned ValidBitWidth = Info.ValidBitWidth;
+
+ // Update minimum bit-width before handling its operands. This is required
+ // when the instruction is part of a loop.
+ Info.MinBitWidth = std::max(Info.MinBitWidth, Info.ValidBitWidth);
+
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand)) {
+ // If we already calculated the minimum bit-width for this valid
+ // bit-width, or for a smaller valid bit-width, then just keep the
+ // answer we already calculated.
+ unsigned IOpBitwidth = InstInfoMap.lookup(IOp).ValidBitWidth;
+ if (IOpBitwidth >= ValidBitWidth)
+ continue;
+ InstInfoMap[IOp].ValidBitWidth = ValidBitWidth;
+ Worklist.push_back(IOp);
+ }
+ }
+ unsigned MinBitWidth = InstInfoMap.lookup(cast<Instruction>(Src)).MinBitWidth;
+ assert(MinBitWidth >= TruncBitWidth);
+
+ if (MinBitWidth > TruncBitWidth) {
+ // In this case reducing expression with vector type might generate a new
+ // vector type, which is not preferable as it might result in generating
+ // sub-optimal code.
+ if (DstTy->isVectorTy())
+ return OrigBitWidth;
+ // Use the smallest integer type in the range [MinBitWidth, OrigBitWidth).
+ Type *Ty = DL.getSmallestLegalIntType(DstTy->getContext(), MinBitWidth);
+ // Update minimum bit-width with the new destination type bit-width if
+ // succeeded to find such, otherwise, with original bit-width.
+ MinBitWidth = Ty ? Ty->getScalarSizeInBits() : OrigBitWidth;
+ } else { // MinBitWidth == TruncBitWidth
+ // In this case the expression can be evaluated with the trunc instruction
+ // destination type, and trunc instruction can be omitted. However, we
+ // should not perform the evaluation if the original type is a legal scalar
+ // type and the target type is illegal.
+ bool FromLegal = MinBitWidth == 1 || DL.isLegalInteger(OrigBitWidth);
+ bool ToLegal = MinBitWidth == 1 || DL.isLegalInteger(MinBitWidth);
+ if (!DstTy->isVectorTy() && FromLegal && !ToLegal)
+ return OrigBitWidth;
+ }
+ return MinBitWidth;
+}
+
+Type *TruncInstCombine::getBestTruncatedType() {
+ if (!buildTruncExpressionDag())
+ return nullptr;
+
+ // We don't want to duplicate instructions, which isn't profitable. Thus, we
+ // can't shrink something that has multiple users, unless all users are
+ // post-dominated by the trunc instruction, i.e., were visited during the
+ // expression evaluation.
+ unsigned DesiredBitWidth = 0;
+ for (auto Itr : InstInfoMap) {
+ Instruction *I = Itr.first;
+ if (I->hasOneUse())
+ continue;
+ bool IsExtInst = (isa<ZExtInst>(I) || isa<SExtInst>(I));
+ for (auto *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ if (UI != CurrentTruncInst && !InstInfoMap.count(UI)) {
+ if (!IsExtInst)
+ return nullptr;
+ // If this is an extension from the dest type, we can eliminate it,
+ // even if it has multiple users. Thus, update the DesiredBitWidth and check
+ // that all extension instructions agree on the same DesiredBitWidth.
+ unsigned ExtInstBitWidth =
+ I->getOperand(0)->getType()->getScalarSizeInBits();
+ if (DesiredBitWidth && DesiredBitWidth != ExtInstBitWidth)
+ return nullptr;
+ DesiredBitWidth = ExtInstBitWidth;
+ }
+ }
+
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+ // Calculate the minimum bit-width allowed for shrinking the currently
+ // visited truncate's operand.
+ unsigned MinBitWidth = getMinBitWidth();
+
+ // Check that we can shrink to a smaller bit-width than the original one and
+ // that it matches the DesiredBitWidth, if such exists.
+ if (MinBitWidth >= OrigBitWidth ||
+ (DesiredBitWidth && DesiredBitWidth != MinBitWidth))
+ return nullptr;
+
+ return IntegerType::get(CurrentTruncInst->getContext(), MinBitWidth);
+}
+
+/// Given a reduced scalar type \p Ty and a \p V value, return a reduced type
+/// for \p V, according to its type: if it is a vector type, return the vector
+/// version of \p Ty; otherwise, return \p Ty.
+static Type *getReducedType(Value *V, Type *Ty) {
+ assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
if (auto *VTy = dyn_cast<VectorType>(V->getType()))
return VectorType::get(Ty, VTy->getElementCount());
- return Ty;
-}
-
-Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
- Type *Ty = getReducedType(V, SclTy);
- if (auto *C = dyn_cast<Constant>(V)) {
- C = ConstantExpr::getIntegerCast(C, Ty, false);
- // If we got a constantexpr back, try to simplify it with DL info.
- return ConstantFoldConstant(C, DL, &TLI);
- }
-
- auto *I = cast<Instruction>(V);
- Info Entry = InstInfoMap.lookup(I);
- assert(Entry.NewValue);
- return Entry.NewValue;
-}
-
-void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
- NumInstrsReduced += InstInfoMap.size();
- for (auto &Itr : InstInfoMap) { // Forward
- Instruction *I = Itr.first;
- TruncInstCombine::Info &NodeInfo = Itr.second;
-
- assert(!NodeInfo.NewValue && "Instruction has been evaluated");
-
- IRBuilder<> Builder(I);
- Value *Res = nullptr;
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt: {
- Type *Ty = getReducedType(I, SclTy);
- // If the source type of the cast is the type we're trying for then we can
- // just return the source. There's no need to insert it because it is not
- // new.
- if (I->getOperand(0)->getType() == Ty) {
- assert(!isa<TruncInst>(I) && "Cannot reach here with TruncInst");
- NodeInfo.NewValue = I->getOperand(0);
- continue;
- }
- // Otherwise, must be the same type of cast, so just reinsert a new one.
- // This also handles the case of zext(trunc(x)) -> zext(x).
- Res = Builder.CreateIntCast(I->getOperand(0), Ty,
- Opc == Instruction::SExt);
-
- // Update Worklist entries with new value if needed.
- // There are three possible changes to the Worklist:
- // 1. Update Old-TruncInst -> New-TruncInst.
- // 2. Remove Old-TruncInst (if New node is not TruncInst).
- // 3. Add New-TruncInst (if Old node was not TruncInst).
+ return Ty;
+}
+
+Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
+ Type *Ty = getReducedType(V, SclTy);
+ if (auto *C = dyn_cast<Constant>(V)) {
+ C = ConstantExpr::getIntegerCast(C, Ty, false);
+ // If we got a constantexpr back, try to simplify it with DL info.
+ return ConstantFoldConstant(C, DL, &TLI);
+ }
+
+ auto *I = cast<Instruction>(V);
+ Info Entry = InstInfoMap.lookup(I);
+ assert(Entry.NewValue);
+ return Entry.NewValue;
+}
+
+void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
+ NumInstrsReduced += InstInfoMap.size();
+ for (auto &Itr : InstInfoMap) { // Forward
+ Instruction *I = Itr.first;
+ TruncInstCombine::Info &NodeInfo = Itr.second;
+
+ assert(!NodeInfo.NewValue && "Instruction has been evaluated");
+
+ IRBuilder<> Builder(I);
+ Value *Res = nullptr;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt: {
+ Type *Ty = getReducedType(I, SclTy);
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty) {
+ assert(!isa<TruncInst>(I) && "Cannot reach here with TruncInst");
+ NodeInfo.NewValue = I->getOperand(0);
+ continue;
+ }
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ // This also handles the case of zext(trunc(x)) -> zext(x).
+ Res = Builder.CreateIntCast(I->getOperand(0), Ty,
+ Opc == Instruction::SExt);
+
+ // Update Worklist entries with new value if needed.
+ // There are three possible changes to the Worklist:
+ // 1. Update Old-TruncInst -> New-TruncInst.
+ // 2. Remove Old-TruncInst (if New node is not TruncInst).
+ // 3. Add New-TruncInst (if Old node was not TruncInst).
auto *Entry = find(Worklist, I);
- if (Entry != Worklist.end()) {
- if (auto *NewCI = dyn_cast<TruncInst>(Res))
- *Entry = NewCI;
- else
- Worklist.erase(Entry);
- } else if (auto *NewCI = dyn_cast<TruncInst>(Res))
- Worklist.push_back(NewCI);
- break;
- }
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
- Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
- Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
- break;
- }
- case Instruction::Select: {
- Value *Op0 = I->getOperand(0);
- Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
- Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
- Res = Builder.CreateSelect(Op0, LHS, RHS);
- break;
- }
- default:
- llvm_unreachable("Unhandled instruction");
- }
-
- NodeInfo.NewValue = Res;
- if (auto *ResI = dyn_cast<Instruction>(Res))
- ResI->takeName(I);
- }
-
- Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
- Type *DstTy = CurrentTruncInst->getType();
- if (Res->getType() != DstTy) {
- IRBuilder<> Builder(CurrentTruncInst);
- Res = Builder.CreateIntCast(Res, DstTy, false);
- if (auto *ResI = dyn_cast<Instruction>(Res))
- ResI->takeName(CurrentTruncInst);
- }
- CurrentTruncInst->replaceAllUsesWith(Res);
-
- // Erase old expression dag, which was replaced by the reduced expression dag.
- // We iterate backward, which means we visit the instruction before we visit
- // any of its operands; this way, when we get to an operand, we have already
- // removed the instructions (from the expression dag) that use it.
- CurrentTruncInst->eraseFromParent();
- for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
- // We still need to check that the instruction has no users before we erase
- // it, because a {SExt, ZExt}Inst instruction might have other users that were
- // not reduced; in that case, we need to keep that instruction.
- if (I->first->use_empty())
- I->first->eraseFromParent();
- }
-}
-
-bool TruncInstCombine::run(Function &F) {
- bool MadeIRChange = false;
-
- // Collect all TruncInst in the function into the Worklist for evaluating.
- for (auto &BB : F) {
- // Ignore unreachable basic block.
- if (!DT.isReachableFromEntry(&BB))
- continue;
- for (auto &I : BB)
- if (auto *CI = dyn_cast<TruncInst>(&I))
- Worklist.push_back(CI);
- }
-
- // Process all TruncInst in the Worklist, for each instruction:
- // 1. Check if it dominates an eligible expression dag to be reduced.
- // 2. Create a reduced expression dag and replace the old one with it.
- while (!Worklist.empty()) {
- CurrentTruncInst = Worklist.pop_back_val();
-
- if (Type *NewDstSclTy = getBestTruncatedType()) {
- LLVM_DEBUG(
- dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
- "dominated by: "
- << CurrentTruncInst << '\n');
- ReduceExpressionDag(NewDstSclTy);
- ++NumDAGsReduced;
- MadeIRChange = true;
- }
- }
-
- return MadeIRChange;
-}
+ if (Entry != Worklist.end()) {
+ if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ *Entry = NewCI;
+ else
+ Worklist.erase(Entry);
+ } else if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ Worklist.push_back(NewCI);
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
+ Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
+ Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ case Instruction::Select: {
+ Value *Op0 = I->getOperand(0);
+ Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
+ Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
+ Res = Builder.CreateSelect(Op0, LHS, RHS);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled instruction");
+ }
+
+ NodeInfo.NewValue = Res;
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(I);
+ }
+
+ Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
+ Type *DstTy = CurrentTruncInst->getType();
+ if (Res->getType() != DstTy) {
+ IRBuilder<> Builder(CurrentTruncInst);
+ Res = Builder.CreateIntCast(Res, DstTy, false);
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(CurrentTruncInst);
+ }
+ CurrentTruncInst->replaceAllUsesWith(Res);
+
+ // Erase old expression dag, which was replaced by the reduced expression dag.
+ // We iterate backward, which means we visit the instruction before we visit
+ // any of its operands; this way, when we get to an operand, we have already
+ // removed the instructions (from the expression dag) that use it.
+ CurrentTruncInst->eraseFromParent();
+ for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
+ // We still need to check that the instruction has no users before we erase
+ // it, because a {SExt, ZExt}Inst instruction might have other users that were
+ // not reduced; in that case, we need to keep that instruction.
+ if (I->first->use_empty())
+ I->first->eraseFromParent();
+ }
+}
+
+bool TruncInstCombine::run(Function &F) {
+ bool MadeIRChange = false;
+
+ // Collect all TruncInst in the function into the Worklist for evaluating.
+ for (auto &BB : F) {
+ // Ignore unreachable basic block.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ for (auto &I : BB)
+ if (auto *CI = dyn_cast<TruncInst>(&I))
+ Worklist.push_back(CI);
+ }
+
+ // Process all TruncInst in the Worklist, for each instruction:
+ // 1. Check if it dominates an eligible expression dag to be reduced.
+ // 2. Create a reduced expression dag and replace the old one with it.
+ while (!Worklist.empty()) {
+ CurrentTruncInst = Worklist.pop_back_val();
+
+ if (Type *NewDstSclTy = getBestTruncatedType()) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
+ "dominated by: "
+ << CurrentTruncInst << '\n');
+ ReduceExpressionDag(NewDstSclTy);
+ ++NumDAGsReduced;
+ MadeIRChange = true;
+ }
+ }
+
+ return MadeIRChange;
+}
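
The two traversals above (buildTruncExpressionDag and getMinBitWidth) share the same iterative post-order idiom: a value stays on the Worklist until it reappears at the top of the Stack, at which point all of its operands have been finished. The stripped-down sketch below illustrates that idiom using a hypothetical Node type and postOrderVisit helper rather than LLVM's Instruction; it is an editorial aid, not code from this patch.

#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
};

// Visit every node of a DAG in post-order (operands before users), mirroring
// the Worklist/Stack pattern used by TruncInstCombine. The Done set plays the
// role InstInfoMap plays in the pass: it stops shared operands from being
// visited twice.
template <typename Visitor>
void postOrderVisit(Node *Root, Visitor Visit) {
  std::vector<Node *> Worklist{Root};
  std::vector<Node *> Stack;
  std::unordered_set<Node *> Done;
  while (!Worklist.empty()) {
    Node *Curr = Worklist.back();
    if (!Stack.empty() && Stack.back() == Curr) {
      // All operands handled: pop from both containers and record the node.
      Worklist.pop_back();
      Stack.pop_back();
      Done.insert(Curr);
      Visit(Curr);
      continue;
    }
    if (Done.count(Curr)) {
      // Shared operand already finished on another path through the dag.
      Worklist.pop_back();
      continue;
    }
    // First visit: leave it on the Worklist, remember it on the Stack, and
    // queue its operands so they are processed first.
    Stack.push_back(Curr);
    for (Node *Op : Curr->Operands)
      Worklist.push_back(Op);
  }
}
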
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make
index fced984059..c472a2054a 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make
@@ -1,36 +1,36 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AggressiveInstCombine.cpp
- TruncInstCombine.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AggressiveInstCombine.cpp
+ TruncInstCombine.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp b/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp
index b102c7d2ce..96c083a144 100644
--- a/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp
@@ -1,300 +1,300 @@
-//===-- CFGuard.cpp - Control Flow Guard checks -----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the IR transform to add Microsoft's Control Flow Guard
-/// checks on Windows targets.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/CFGuard.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-using OperandBundleDef = OperandBundleDefT<Value *>;
-
-#define DEBUG_TYPE "cfguard"
-
-STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added");
-
-namespace {
-
-/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes.
-/// These checks ensure that the target address corresponds to the start of an
-/// address-taken function. X86_64 targets use the CF_Dispatch mechanism. X86,
-/// ARM, and AArch64 targets use the CF_Check mechanism.
-class CFGuard : public FunctionPass {
-public:
- static char ID;
-
- enum Mechanism { CF_Check, CF_Dispatch };
-
- // Default constructor required for the INITIALIZE_PASS macro.
- CFGuard() : FunctionPass(ID) {
- initializeCFGuardPass(*PassRegistry::getPassRegistry());
- // By default, use the guard check mechanism.
- GuardMechanism = CF_Check;
- }
-
- // Recommended constructor used to specify the type of guard mechanism.
- CFGuard(Mechanism Var) : FunctionPass(ID) {
- initializeCFGuardPass(*PassRegistry::getPassRegistry());
- GuardMechanism = Var;
- }
-
- /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
- /// check mechanism. When the image is loaded, the loader puts the appropriate
- /// guard check function pointer in the __guard_check_icall_fptr global
- /// symbol. This checks that the target address is a valid address-taken
- /// function. The address of the target function is passed to the guard check
- /// function in an architecture-specific register (e.g. ECX on 32-bit X86,
-/// X15 on AArch64, and R0 on ARM). The guard check function has no return
-/// value (if the target is invalid, the guard check function will raise an
- /// error).
- ///
- /// For example, the following LLVM IR:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = call i32 %0()
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = load void (i8*)*, void (i8*)** @__guard_check_icall_fptr
- /// %2 = bitcast i32 ()* %0 to i8*
- /// call cfguard_checkcc void %1(i8* %2)
- /// %3 = call i32 %0()
- /// \endcode
- ///
- /// For example, the following X86 assembly code:
- /// \code
- /// movl $_target_func, %eax
- /// calll *%eax
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// movl $_target_func, %ecx
- /// calll *___guard_check_icall_fptr
- /// calll *%ecx
- /// \endcode
- ///
- /// \param CB indirect call to instrument.
- void insertCFGuardCheck(CallBase *CB);
-
- /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
- /// dispatch mechanism. When the image is loaded, the loader puts the
- /// appropriate guard check function pointer in the
- /// __guard_dispatch_icall_fptr global symbol. This checks that the target
- /// address is a valid address-taken function and, if so, tail calls the
- /// target. The target address is passed in an architecture-specific register
- /// (e.g. RAX on X86_64), with all other arguments for the target function
- /// passed as usual.
- ///
- /// For example, the following LLVM IR:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = call i32 %0()
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = load i32 ()*, i32 ()** @__guard_dispatch_icall_fptr
- /// %2 = call i32 %1() [ "cfguardtarget"(i32 ()* %0) ]
- /// \endcode
- ///
- /// For example, the following X86_64 assembly code:
- /// \code
- /// leaq target_func(%rip), %rax
- /// callq *%rax
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// leaq target_func(%rip), %rax
- /// callq *__guard_dispatch_icall_fptr(%rip)
- /// \endcode
- ///
- /// \param CB indirect call to instrument.
- void insertCFGuardDispatch(CallBase *CB);
-
- bool doInitialization(Module &M) override;
- bool runOnFunction(Function &F) override;
-
-private:
- // Only add checks if the module has the cfguard=2 flag.
- int cfguard_module_flag = 0;
- Mechanism GuardMechanism = CF_Check;
- FunctionType *GuardFnType = nullptr;
- PointerType *GuardFnPtrType = nullptr;
- Constant *GuardFnGlobal = nullptr;
-};
-
-} // end anonymous namespace
-
-void CFGuard::insertCFGuardCheck(CallBase *CB) {
-
- assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
- "Only applicable for Windows targets");
- assert(CB->isIndirectCall() &&
- "Control Flow Guard checks can only be added to indirect calls");
-
- IRBuilder<> B(CB);
- Value *CalledOperand = CB->getCalledOperand();
-
- // Load the global symbol as a pointer to the check function.
- LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal);
-
- // Create new call instruction. The CFGuard check should always be a call,
- // even if the original CallBase is an Invoke or CallBr instruction.
- CallInst *GuardCheck =
- B.CreateCall(GuardFnType, GuardCheckLoad,
- {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())});
-
- // Ensure that the first argument is passed in the correct register
- // (e.g. ECX on 32-bit X86 targets).
- GuardCheck->setCallingConv(CallingConv::CFGuard_Check);
-}
-
-void CFGuard::insertCFGuardDispatch(CallBase *CB) {
-
- assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
- "Only applicable for Windows targets");
- assert(CB->isIndirectCall() &&
- "Control Flow Guard checks can only be added to indirect calls");
-
- IRBuilder<> B(CB);
- Value *CalledOperand = CB->getCalledOperand();
- Type *CalledOperandType = CalledOperand->getType();
-
- // Cast the guard dispatch global to the type of the called operand.
- PointerType *PTy = PointerType::get(CalledOperandType, 0);
- if (GuardFnGlobal->getType() != PTy)
- GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy);
-
- // Load the global as a pointer to a function of the same type.
- LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal);
-
- // Add the original call target as a cfguardtarget operand bundle.
- SmallVector<llvm::OperandBundleDef, 1> Bundles;
- CB->getOperandBundlesAsDefs(Bundles);
- Bundles.emplace_back("cfguardtarget", CalledOperand);
-
- // Create a copy of the call/invoke instruction and add the new bundle.
- assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
- "Unknown indirect call type");
- CallBase *NewCB = CallBase::Create(CB, Bundles, CB);
-
- // Change the target of the call to be the guard dispatch function.
- NewCB->setCalledOperand(GuardDispatchLoad);
-
- // Replace the original call/invoke with the new instruction.
- CB->replaceAllUsesWith(NewCB);
-
- // Delete the original call/invoke.
- CB->eraseFromParent();
-}
-
-bool CFGuard::doInitialization(Module &M) {
-
- // Check if this module has the cfguard flag and read its value.
- if (auto *MD =
- mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
- cfguard_module_flag = MD->getZExtValue();
-
- // Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
- return false;
-
- // Set up prototypes for the guard check and dispatch functions.
- GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()),
- {Type::getInt8PtrTy(M.getContext())}, false);
- GuardFnPtrType = PointerType::get(GuardFnType, 0);
-
- // Get or insert the guard check or dispatch global symbols.
- if (GuardMechanism == CF_Check) {
- GuardFnGlobal =
- M.getOrInsertGlobal("__guard_check_icall_fptr", GuardFnPtrType);
- } else {
- assert(GuardMechanism == CF_Dispatch && "Invalid CFGuard mechanism");
- GuardFnGlobal =
- M.getOrInsertGlobal("__guard_dispatch_icall_fptr", GuardFnPtrType);
- }
-
- return true;
-}
-
-bool CFGuard::runOnFunction(Function &F) {
-
- // Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
- return false;
-
- SmallVector<CallBase *, 8> IndirectCalls;
-
- // Iterate over the instructions to find all indirect call/invoke/callbr
- // instructions. Make a separate list of pointers to indirect
- // call/invoke/callbr instructions because the original instructions will be
- // deleted as the checks are added.
- for (BasicBlock &BB : F.getBasicBlockList()) {
- for (Instruction &I : BB.getInstList()) {
- auto *CB = dyn_cast<CallBase>(&I);
- if (CB && CB->isIndirectCall() && !CB->hasFnAttr("guard_nocf")) {
- IndirectCalls.push_back(CB);
- CFGuardCounter++;
- }
- }
- }
-
- // If no checks are needed, return early.
- if (IndirectCalls.empty()) {
- return false;
- }
-
- // For each indirect call/invoke, add the appropriate dispatch or check.
- if (GuardMechanism == CF_Dispatch) {
- for (CallBase *CB : IndirectCalls) {
- insertCFGuardDispatch(CB);
- }
- } else {
- for (CallBase *CB : IndirectCalls) {
- insertCFGuardCheck(CB);
- }
- }
-
- return true;
-}
-
-char CFGuard::ID = 0;
-INITIALIZE_PASS(CFGuard, "CFGuard", "CFGuard", false, false)
-
-FunctionPass *llvm::createCFGuardCheckPass() {
- return new CFGuard(CFGuard::CF_Check);
-}
-
-FunctionPass *llvm::createCFGuardDispatchPass() {
- return new CFGuard(CFGuard::CF_Dispatch);
-}
+//===-- CFGuard.cpp - Control Flow Guard checks -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the IR transform to add Microsoft's Control Flow Guard
+/// checks on Windows targets.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/CFGuard.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+using OperandBundleDef = OperandBundleDefT<Value *>;
+
+#define DEBUG_TYPE "cfguard"
+
+STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added");
+
+namespace {
+
+/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes.
+/// These checks ensure that the target address corresponds to the start of an
+/// address-taken function. X86_64 targets use the CF_Dispatch mechanism. X86,
+/// ARM, and AArch64 targets use the CF_Check mechanism.
+class CFGuard : public FunctionPass {
+public:
+ static char ID;
+
+ enum Mechanism { CF_Check, CF_Dispatch };
+
+ // Default constructor required for the INITIALIZE_PASS macro.
+ CFGuard() : FunctionPass(ID) {
+ initializeCFGuardPass(*PassRegistry::getPassRegistry());
+ // By default, use the guard check mechanism.
+ GuardMechanism = CF_Check;
+ }
+
+ // Recommended constructor used to specify the type of guard mechanism.
+ CFGuard(Mechanism Var) : FunctionPass(ID) {
+ initializeCFGuardPass(*PassRegistry::getPassRegistry());
+ GuardMechanism = Var;
+ }
+
+ /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
+ /// check mechanism. When the image is loaded, the loader puts the appropriate
+ /// guard check function pointer in the __guard_check_icall_fptr global
+ /// symbol. This checks that the target address is a valid address-taken
+ /// function. The address of the target function is passed to the guard check
+ /// function in an architecture-specific register (e.g. ECX on 32-bit X86,
+/// X15 on AArch64, and R0 on ARM). The guard check function has no return
+/// value (if the target is invalid, the guard check function will raise an
+ /// error).
+ ///
+ /// For example, the following LLVM IR:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = call i32 %0()
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = load void (i8*)*, void (i8*)** @__guard_check_icall_fptr
+ /// %2 = bitcast i32 ()* %0 to i8*
+ /// call cfguard_checkcc void %1(i8* %2)
+ /// %3 = call i32 %0()
+ /// \endcode
+ ///
+ /// For example, the following X86 assembly code:
+ /// \code
+ /// movl $_target_func, %eax
+ /// calll *%eax
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// movl $_target_func, %ecx
+ /// calll *___guard_check_icall_fptr
+ /// calll *%ecx
+ /// \endcode
+ ///
+ /// \param CB indirect call to instrument.
+ void insertCFGuardCheck(CallBase *CB);
+
+ /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
+ /// dispatch mechanism. When the image is loaded, the loader puts the
+ /// appropriate guard check function pointer in the
+ /// __guard_dispatch_icall_fptr global symbol. This checks that the target
+ /// address is a valid address-taken function and, if so, tail calls the
+ /// target. The target address is passed in an architecture-specific register
+ /// (e.g. RAX on X86_64), with all other arguments for the target function
+ /// passed as usual.
+ ///
+ /// For example, the following LLVM IR:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = call i32 %0()
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = load i32 ()*, i32 ()** @__guard_dispatch_icall_fptr
+ /// %2 = call i32 %1() [ "cfguardtarget"(i32 ()* %0) ]
+ /// \endcode
+ ///
+ /// For example, the following X86_64 assembly code:
+ /// \code
+ /// leaq target_func(%rip), %rax
+ /// callq *%rax
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// leaq target_func(%rip), %rax
+ /// callq *__guard_dispatch_icall_fptr(%rip)
+ /// \endcode
+ ///
+ /// \param CB indirect call to instrument.
+ void insertCFGuardDispatch(CallBase *CB);
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Only add checks if the module has the cfguard=2 flag.
+ int cfguard_module_flag = 0;
+ Mechanism GuardMechanism = CF_Check;
+ FunctionType *GuardFnType = nullptr;
+ PointerType *GuardFnPtrType = nullptr;
+ Constant *GuardFnGlobal = nullptr;
+};
+
+} // end anonymous namespace
+
+void CFGuard::insertCFGuardCheck(CallBase *CB) {
+
+ assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
+ "Only applicable for Windows targets");
+ assert(CB->isIndirectCall() &&
+ "Control Flow Guard checks can only be added to indirect calls");
+
+ IRBuilder<> B(CB);
+ Value *CalledOperand = CB->getCalledOperand();
+
+ // Load the global symbol as a pointer to the check function.
+ LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal);
+
+ // Create new call instruction. The CFGuard check should always be a call,
+ // even if the original CallBase is an Invoke or CallBr instruction.
+ CallInst *GuardCheck =
+ B.CreateCall(GuardFnType, GuardCheckLoad,
+ {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())});
+
+ // Ensure that the first argument is passed in the correct register
+ // (e.g. ECX on 32-bit X86 targets).
+ GuardCheck->setCallingConv(CallingConv::CFGuard_Check);
+}
+
+void CFGuard::insertCFGuardDispatch(CallBase *CB) {
+
+ assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
+ "Only applicable for Windows targets");
+ assert(CB->isIndirectCall() &&
+ "Control Flow Guard checks can only be added to indirect calls");
+
+ IRBuilder<> B(CB);
+ Value *CalledOperand = CB->getCalledOperand();
+ Type *CalledOperandType = CalledOperand->getType();
+
+ // Cast the guard dispatch global to the type of the called operand.
+ PointerType *PTy = PointerType::get(CalledOperandType, 0);
+ if (GuardFnGlobal->getType() != PTy)
+ GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy);
+
+ // Load the global as a pointer to a function of the same type.
+ LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal);
+
+ // Add the original call target as a cfguardtarget operand bundle.
+ SmallVector<llvm::OperandBundleDef, 1> Bundles;
+ CB->getOperandBundlesAsDefs(Bundles);
+ Bundles.emplace_back("cfguardtarget", CalledOperand);
+
+ // Create a copy of the call/invoke instruction and add the new bundle.
+ assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
+ "Unknown indirect call type");
+ CallBase *NewCB = CallBase::Create(CB, Bundles, CB);
+
+ // Change the target of the call to be the guard dispatch function.
+ NewCB->setCalledOperand(GuardDispatchLoad);
+
+ // Replace the original call/invoke with the new instruction.
+ CB->replaceAllUsesWith(NewCB);
+
+ // Delete the original call/invoke.
+ CB->eraseFromParent();
+}
+
+bool CFGuard::doInitialization(Module &M) {
+
+ // Check if this module has the cfguard flag and read its value.
+ if (auto *MD =
+ mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
+ cfguard_module_flag = MD->getZExtValue();
+
+ // Skip modules for which CFGuard checks have been disabled.
+ if (cfguard_module_flag != 2)
+ return false;
+
+ // Set up prototypes for the guard check and dispatch functions.
+ GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()),
+ {Type::getInt8PtrTy(M.getContext())}, false);
+ GuardFnPtrType = PointerType::get(GuardFnType, 0);
+
+ // Get or insert the guard check or dispatch global symbols.
+ if (GuardMechanism == CF_Check) {
+ GuardFnGlobal =
+ M.getOrInsertGlobal("__guard_check_icall_fptr", GuardFnPtrType);
+ } else {
+ assert(GuardMechanism == CF_Dispatch && "Invalid CFGuard mechanism");
+ GuardFnGlobal =
+ M.getOrInsertGlobal("__guard_dispatch_icall_fptr", GuardFnPtrType);
+ }
+
+ return true;
+}
+
+bool CFGuard::runOnFunction(Function &F) {
+
+ // Skip modules for which CFGuard checks have been disabled.
+ if (cfguard_module_flag != 2)
+ return false;
+
+ SmallVector<CallBase *, 8> IndirectCalls;
+
+ // Iterate over the instructions to find all indirect call/invoke/callbr
+ // instructions. Make a separate list of pointers to indirect
+ // call/invoke/callbr instructions because the original instructions will be
+ // deleted as the checks are added.
+ for (BasicBlock &BB : F.getBasicBlockList()) {
+ for (Instruction &I : BB.getInstList()) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (CB && CB->isIndirectCall() && !CB->hasFnAttr("guard_nocf")) {
+ IndirectCalls.push_back(CB);
+ CFGuardCounter++;
+ }
+ }
+ }
+
+ // If no checks are needed, return early.
+ if (IndirectCalls.empty()) {
+ return false;
+ }
+
+ // For each indirect call/invoke, add the appropriate dispatch or check.
+ if (GuardMechanism == CF_Dispatch) {
+ for (CallBase *CB : IndirectCalls) {
+ insertCFGuardDispatch(CB);
+ }
+ } else {
+ for (CallBase *CB : IndirectCalls) {
+ insertCFGuardCheck(CB);
+ }
+ }
+
+ return true;
+}
+
+char CFGuard::ID = 0;
+INITIALIZE_PASS(CFGuard, "CFGuard", "CFGuard", false, false)
+
+FunctionPass *llvm::createCFGuardCheckPass() {
+ return new CFGuard(CFGuard::CF_Check);
+}
+
+FunctionPass *llvm::createCFGuardDispatchPass() {
+ return new CFGuard(CFGuard::CF_Dispatch);
+}
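
For context only (not part of the patch above), here is a minimal C++ sketch of how this pass might be driven outside a frontend: CFGuard::doInitialization only activates when the module's "cfguard" flag equals 2, and the two factory functions above select the check or dispatch mechanism. The driver function name, the Warning merge behavior for the flag, and the assumption of a Windows target triple are all illustrative choices, not taken from this diff.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/CFGuard.h"

using namespace llvm;

// Hypothetical driver: mark a Windows-targeted module as CFGuard-enabled and
// run the dispatch flavour of the pass over its functions.
static void runCFGuardDispatch(Module &M) {
  // CFGuard::doInitialization skips modules whose "cfguard" flag is not 2.
  // The Warning merge behavior here is an assumption for illustration.
  if (!M.getModuleFlag("cfguard"))
    M.addModuleFlag(Module::Warning, "cfguard", 2);

  legacy::FunctionPassManager FPM(&M);
  FPM.add(createCFGuardDispatchPass()); // or createCFGuardCheckPass()
  FPM.doInitialization();               // reads the module flag once
  for (Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F); // rewrites indirect calls through the guard dispatch pointer
  FPM.doFinalization();
}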
diff --git a/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make b/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make
index fa6d03488d..37fe9ccc94 100644
--- a/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make
@@ -1,33 +1,33 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/CFGuard
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- CFGuard.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ CFGuard.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp
index c7bb0803e3..532599b42e 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -1,51 +1,51 @@
-//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a custom inliner that handles only functions that
-// are marked as "always inline".
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/ADT/SetVector.h"
+//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a custom inliner that handles only functions that
+// are marked as "always inline".
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Inliner.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-PreservedAnalyses AlwaysInlinerPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- // Add inline assumptions during code generation.
- FunctionAnalysisManager &FAM =
- MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+PreservedAnalyses AlwaysInlinerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ // Add inline assumptions during code generation.
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
auto &PSI = MAM.getResult<ProfileSummaryAnalysis>(M);
-
- SmallSetVector<CallBase *, 16> Calls;
- bool Changed = false;
- SmallVector<Function *, 16> InlinedFunctions;
+
+ SmallSetVector<CallBase *, 16> Calls;
+ bool Changed = false;
+ SmallVector<Function *, 16> InlinedFunctions;
for (Function &F : M) {
// When callee coroutine function is inlined into caller coroutine function
// before coro-split pass,
@@ -54,15 +54,15 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
if (F.isPresplitCoroutine())
continue;
- if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
- isInlineViable(F).isSuccess()) {
- Calls.clear();
-
- for (User *U : F.users())
- if (auto *CB = dyn_cast<CallBase>(U))
- if (CB->getCalledFunction() == &F)
- Calls.insert(CB);
-
+ if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
+ isInlineViable(F).isSuccess()) {
+ Calls.clear();
+
+ for (User *U : F.users())
+ if (auto *CB = dyn_cast<CallBase>(U))
+ if (CB->getCalledFunction() == &F)
+ Calls.insert(CB);
+
for (CallBase *CB : Calls) {
Function *Caller = CB->getCaller();
OptimizationRemarkEmitter ORE(Caller);
@@ -75,7 +75,7 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
assert(OIC);
emitInlinedInto(ORE, CB->getDebugLoc(), CB->getParent(), F, *Caller,
*OIC, false, DEBUG_TYPE);
-
+
InlineFunctionInfo IFI(
/*cg=*/nullptr, GetAssumptionCache, &PSI,
&FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
@@ -92,104 +92,104 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
Changed = true;
}
- // Remember to try and delete this function afterward. This both avoids
- // re-walking the rest of the module and avoids dealing with any iterator
- // invalidation issues while deleting functions.
- InlinedFunctions.push_back(&F);
- }
+ // Remember to try and delete this function afterward. This both avoids
+ // re-walking the rest of the module and avoids dealing with any iterator
+ // invalidation issues while deleting functions.
+ InlinedFunctions.push_back(&F);
+ }
+ }
+
+ // Remove any live functions.
+ erase_if(InlinedFunctions, [&](Function *F) {
+ F->removeDeadConstantUsers();
+ return !F->isDefTriviallyDead();
+ });
+
+ // Delete the non-comdat ones from the module and also from our vector.
+ auto NonComdatBegin = partition(
+ InlinedFunctions, [&](Function *F) { return F->hasComdat(); });
+ for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end()))
+ M.getFunctionList().erase(F);
+ InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end());
+
+ if (!InlinedFunctions.empty()) {
+ // Now we just have the comdat functions. Filter out the ones whose comdats
+ // are not actually dead.
+ filterDeadComdatFunctions(M, InlinedFunctions);
+ // The remaining functions are actually dead.
+ for (Function *F : InlinedFunctions)
+ M.getFunctionList().erase(F);
+ }
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+namespace {
+
+/// Inliner pass which only handles "always inline" functions.
+///
+/// Unlike the \c AlwaysInlinerPass, this uses the more heavyweight \c Inliner
+/// base class to provide several facilities such as array alloca merging.
+class AlwaysInlinerLegacyPass : public LegacyInlinerBase {
+
+public:
+ AlwaysInlinerLegacyPass() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ AlwaysInlinerLegacyPass(bool InsertLifetime)
+ : LegacyInlinerBase(ID, InsertLifetime) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Main run interface method. We override here to avoid calling skipSCC().
+ bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallBase &CB) override;
+
+ using llvm::Pass::doFinalization;
+ bool doFinalization(CallGraph &CG) override {
+ return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
}
-
- // Remove any live functions.
- erase_if(InlinedFunctions, [&](Function *F) {
- F->removeDeadConstantUsers();
- return !F->isDefTriviallyDead();
- });
-
- // Delete the non-comdat ones from the module and also from our vector.
- auto NonComdatBegin = partition(
- InlinedFunctions, [&](Function *F) { return F->hasComdat(); });
- for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end()))
- M.getFunctionList().erase(F);
- InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end());
-
- if (!InlinedFunctions.empty()) {
- // Now we just have the comdat functions. Filter out the ones whose comdats
- // are not actually dead.
- filterDeadComdatFunctions(M, InlinedFunctions);
- // The remaining functions are actually dead.
- for (Function *F : InlinedFunctions)
- M.getFunctionList().erase(F);
- }
-
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-namespace {
-
-/// Inliner pass which only handles "always inline" functions.
-///
-/// Unlike the \c AlwaysInlinerPass, this uses the more heavyweight \c Inliner
-/// base class to provide several facilities such as array alloca merging.
-class AlwaysInlinerLegacyPass : public LegacyInlinerBase {
-
-public:
- AlwaysInlinerLegacyPass() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
- initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- AlwaysInlinerLegacyPass(bool InsertLifetime)
- : LegacyInlinerBase(ID, InsertLifetime) {
- initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- /// Main run interface method. We override here to avoid calling skipSCC().
- bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
-
- static char ID; // Pass identification, replacement for typeid
-
- InlineCost getInlineCost(CallBase &CB) override;
-
- using llvm::Pass::doFinalization;
- bool doFinalization(CallGraph &CG) override {
- return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
- }
-};
-}
-
-char AlwaysInlinerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(AlwaysInlinerLegacyPass, "always-inline",
- "Inliner for always_inline functions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AlwaysInlinerLegacyPass, "always-inline",
- "Inliner for always_inline functions", false, false)
-
-Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
- return new AlwaysInlinerLegacyPass(InsertLifetime);
-}
-
-/// Get the inline cost for the always-inliner.
-///
-/// The always inliner *only* handles functions which are marked with the
-/// attribute to force inlining. As such, it is dramatically simpler and avoids
-/// using the powerful (but expensive) inline cost analysis. Instead it uses
-/// a very simple and boring direct walk of the instructions looking for
-/// impossible-to-inline constructs.
-///
-/// Note, it would be possible to go to some lengths to cache the information
-/// computed here, but as we only expect to do this for relatively few and
-/// small functions which have the explicit attribute to force inlining, it is
-/// likely not worth it in practice.
-InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) {
- Function *Callee = CB.getCalledFunction();
-
- // Only inline direct calls to functions with always-inline attributes
- // that are viable for inlining.
- if (!Callee)
- return InlineCost::getNever("indirect call");
-
+};
+}
+
+char AlwaysInlinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+
+Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
+ return new AlwaysInlinerLegacyPass(InsertLifetime);
+}
+
+/// Get the inline cost for the always-inliner.
+///
+/// The always inliner *only* handles functions which are marked with the
+/// attribute to force inlining. As such, it is dramatically simpler and avoids
+/// using the powerful (but expensive) inline cost analysis. Instead it uses
+/// a very simple and boring direct walk of the instructions looking for
+/// impossible-to-inline constructs.
+///
+/// Note, it would be possible to go to some lengths to cache the information
+/// computed here, but as we only expect to do this for relatively few and
+/// small functions which have the explicit attribute to force inlining, it is
+/// likely not worth it in practice.
+InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) {
+ Function *Callee = CB.getCalledFunction();
+
+ // Only inline direct calls to functions with always-inline attributes
+ // that are viable for inlining.
+ if (!Callee)
+ return InlineCost::getNever("indirect call");
+
// When callee coroutine function is inlined into caller coroutine function
// before coro-split pass,
// coro-early pass can not handle this quiet well.
@@ -197,16 +197,16 @@ InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) {
if (Callee->isPresplitCoroutine())
return InlineCost::getNever("unsplited coroutine call");
- // FIXME: We shouldn't even get here for declarations.
- if (Callee->isDeclaration())
- return InlineCost::getNever("no definition");
-
- if (!CB.hasFnAttr(Attribute::AlwaysInline))
- return InlineCost::getNever("no alwaysinline attribute");
-
- auto IsViable = isInlineViable(*Callee);
- if (!IsViable.isSuccess())
- return InlineCost::getNever(IsViable.getFailureReason());
-
- return InlineCost::getAlways("always inliner");
-}
+ // FIXME: We shouldn't even get here for declarations.
+ if (Callee->isDeclaration())
+ return InlineCost::getNever("no definition");
+
+ if (!CB.hasFnAttr(Attribute::AlwaysInline))
+ return InlineCost::getNever("no alwaysinline attribute");
+
+ auto IsViable = isInlineViable(*Callee);
+ if (!IsViable.isSuccess())
+ return InlineCost::getNever(IsViable.getFailureReason());
+
+ return InlineCost::getAlways("always inliner");
+}
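
As a usage note (a sketch, not part of the diff): AlwaysInlinerPass::run above pulls its assumption-cache, profile-summary, and block-frequency results from the module analysis manager, so a standalone driver has to register the standard analyses before invoking it. The helper name below is hypothetical; the InsertLifetime flag matches the constructor's default.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"

using namespace llvm;

// Hypothetical helper: inline every call to an always_inline function in M.
static void inlineAlwaysInlineFunctions(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the analyses AlwaysInlinerPass::run queries (ProfileSummary,
  // AssumptionAnalysis, BlockFrequency via the function-analysis proxy).
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AlwaysInlinerPass(/*InsertLifetime=*/true));
  MPM.run(M, MAM);
}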
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp
index 47fea8047d..7998a1ae5c 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -1,1175 +1,1175 @@
-//===- ArgumentPromotion.cpp - Promote by-reference arguments -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass promotes "by reference" arguments to be "by value" arguments. In
-// practice, this means looking for internal functions that have pointer
-// arguments. If it can prove, through the use of alias analysis, that an
-// argument is *only* loaded, then it can pass the value into the function
-// instead of the address of the value. This can cause recursive simplification
-// of code and lead to the elimination of allocas (especially in C++ template
-// code like the STL).
-//
-// This pass also handles aggregate arguments that are passed into a function,
-// scalarizing them if the elements of the aggregate are only loaded. Note that
-// by default it refuses to scalarize aggregates which would require passing in
-// more than three operands to the function, because passing thousands of
-// operands for a large array or structure is unprofitable! This limit can be
-// configured or disabled, however.
-//
-// Note that this transformation could also be done for arguments that are only
-// stored to (returning the value instead), but does not currently. This case
-// would be best handled when and if LLVM begins supporting multiple return
-// values from functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ArgumentPromotion.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
+//===- ArgumentPromotion.cpp - Promote by-reference arguments -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass promotes "by reference" arguments to be "by value" arguments. In
+// practice, this means looking for internal functions that have pointer
+// arguments. If it can prove, through the use of alias analysis, that an
+// argument is *only* loaded, then it can pass the value into the function
+// instead of the address of the value. This can cause recursive simplification
+// of code and lead to the elimination of allocas (especially in C++ template
+// code like the STL).
+//
+// This pass also handles aggregate arguments that are passed into a function,
+// scalarizing them if the elements of the aggregate are only loaded. Note that
+// by default it refuses to scalarize aggregates which would require passing in
+// more than three operands to the function, because passing thousands of
+// operands for a large array or structure is unprofitable! This limit can be
+// configured or disabled, however.
+//
+// Note that this transformation could also be done for arguments that are only
+// stored to (returning the value instead), but does not currently. This case
+// would be best handled when and if LLVM begins supporting multiple return
+// values from functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <iterator>
-#include <map>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "argpromotion"
-
-STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
-STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
-STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
-STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
-
-/// A vector used to hold the indices of a single GEP instruction
-using IndicesVector = std::vector<uint64_t>;
-
-/// DoPromotion - This method actually performs the promotion of the specified
-/// arguments, and returns the new function. At this point, we know that it's
-/// safe to do so.
-static Function *
-doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
- Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
- ReplaceCallSite) {
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but has modified arguments.
- FunctionType *FTy = F->getFunctionType();
- std::vector<Type *> Params;
-
- using ScalarizeTable = std::set<std::pair<Type *, IndicesVector>>;
-
- // ScalarizedElements - If we are promoting a pointer that has elements
- // accessed out of it, keep track of which elements are accessed so that we
- // can add one argument for each.
- //
- // Arguments that are directly loaded will have a zero element value here, to
- // handle cases where there are both a direct load and GEP accesses.
- std::map<Argument *, ScalarizeTable> ScalarizedElements;
-
- // OriginalLoads - Keep track of a representative load instruction from the
- // original function so that we can tell the alias analysis implementation
- // what the new GEP/Load instructions we are inserting look like.
- // We need to keep the original loads for each argument and the elements
- // of the argument that are accessed.
- std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
-
- // Attribute - Keep track of the parameter attributes for the arguments
- // that we are *not* promoting. For the ones that we do promote, the parameter
- // attributes are lost
- SmallVector<AttributeSet, 8> ArgAttrVec;
- AttributeList PAL = F->getAttributes();
-
- // First, determine the new argument list
- unsigned ArgNo = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++ArgNo) {
- if (ByValArgsToTransform.count(&*I)) {
- // Simple byval argument? Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "argpromotion"
+
+STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
+STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
+STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
+
+/// A vector used to hold the indices of a single GEP instruction
+using IndicesVector = std::vector<uint64_t>;
+
+/// DoPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function. At this point, we know that it's
+/// safe to do so.
+static Function *
+doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
+ Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
+ ReplaceCallSite) {
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has modified arguments.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type *> Params;
+
+ using ScalarizeTable = std::set<std::pair<Type *, IndicesVector>>;
+
+ // ScalarizedElements - If we are promoting a pointer that has elements
+ // accessed out of it, keep track of which elements are accessed so that we
+ // can add one argument for each.
+ //
+ // Arguments that are directly loaded will have a zero element value here, to
+ // handle cases where there are both a direct load and GEP accesses.
+ std::map<Argument *, ScalarizeTable> ScalarizedElements;
+
+ // OriginalLoads - Keep track of a representative load instruction from the
+ // original function so that we can tell the alias analysis implementation
+ // what the new GEP/Load instructions we are inserting look like.
+ // We need to keep the original loads for each argument and the elements
+ // of the argument that are accessed.
+ std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
+
+ // Attribute - Keep track of the parameter attributes for the arguments
+ // that we are *not* promoting. For the ones that we do promote, the parameter
+ // attributes are lost
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ AttributeList PAL = F->getAttributes();
+
+ // First, determine the new argument list
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgNo) {
+ if (ByValArgsToTransform.count(&*I)) {
+ // Simple byval argument? Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
llvm::append_range(Params, STy->elements());
- ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
- AttributeSet());
- ++NumByValArgsPromoted;
- } else if (!ArgsToPromote.count(&*I)) {
- // Unchanged argument
- Params.push_back(I->getType());
- ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
- } else if (I->use_empty()) {
- // Dead argument (which are always marked as promotable)
- ++NumArgumentsDead;
- } else {
- // Okay, this is being promoted. This means that the only uses are loads
- // or GEPs which are only used by loads
-
- // In this table, we will track which indices are loaded from the argument
- // (where direct loads are tracked as no indices).
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
+ AttributeSet());
+ ++NumByValArgsPromoted;
+ } else if (!ArgsToPromote.count(&*I)) {
+ // Unchanged argument
+ Params.push_back(I->getType());
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
+ } else if (I->use_empty()) {
+ // Dead argument (which are always marked as promotable)
+ ++NumArgumentsDead;
+ } else {
+ // Okay, this is being promoted. This means that the only uses are loads
+ // or GEPs which are only used by loads
+
+ // In this table, we will track which indices are loaded from the argument
+ // (where direct loads are tracked as no indices).
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
for (User *U : make_early_inc_range(I->users())) {
- Instruction *UI = cast<Instruction>(U);
- Type *SrcTy;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- SrcTy = L->getType();
- else
- SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
+ Instruction *UI = cast<Instruction>(U);
+ Type *SrcTy;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ SrcTy = L->getType();
+ else
+ SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
// Skip dead GEPs and remove them.
if (isa<GetElementPtrInst>(UI) && UI->use_empty()) {
UI->eraseFromParent();
continue;
}
- IndicesVector Indices;
- Indices.reserve(UI->getNumOperands() - 1);
- // Since loads will only have a single operand, and GEPs only a single
- // non-index operand, this will record direct loads without any indices,
- // and gep+loads with the GEP indices.
- for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
- II != IE; ++II)
- Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
- // GEPs with a single 0 index can be merged with direct loads
- if (Indices.size() == 1 && Indices.front() == 0)
- Indices.clear();
- ArgIndices.insert(std::make_pair(SrcTy, Indices));
- LoadInst *OrigLoad;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- OrigLoad = L;
- else
- // Take any load, we will use it only to update Alias Analysis
- OrigLoad = cast<LoadInst>(UI->user_back());
- OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
- }
-
- // Add a parameter to the function for each element passed in.
- for (const auto &ArgIndex : ArgIndices) {
- // not allowed to dereference ->begin() if size() is 0
- Params.push_back(GetElementPtrInst::getIndexedType(
- cast<PointerType>(I->getType())->getElementType(),
- ArgIndex.second));
- ArgAttrVec.push_back(AttributeSet());
- assert(Params.back());
- }
-
- if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
- ++NumArgumentsPromoted;
- else
- ++NumAggregatesPromoted;
- }
- }
-
- Type *RetTy = FTy->getReturnType();
-
- // Construct the new function type using the new arguments.
- FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
-
- // Create the new function body and insert it into the module.
- Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
- F->getName());
- NF->copyAttributesFrom(F);
+ IndicesVector Indices;
+ Indices.reserve(UI->getNumOperands() - 1);
+ // Since loads will only have a single operand, and GEPs only a single
+ // non-index operand, this will record direct loads without any indices,
+ // and gep+loads with the GEP indices.
+ for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
+ II != IE; ++II)
+ Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Indices.size() == 1 && Indices.front() == 0)
+ Indices.clear();
+ ArgIndices.insert(std::make_pair(SrcTy, Indices));
+ LoadInst *OrigLoad;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ OrigLoad = L;
+ else
+ // Take any load, we will use it only to update Alias Analysis
+ OrigLoad = cast<LoadInst>(UI->user_back());
+ OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
+ }
+
+ // Add a parameter to the function for each element passed in.
+ for (const auto &ArgIndex : ArgIndices) {
+ // not allowed to dereference ->begin() if size() is 0
+ Params.push_back(GetElementPtrInst::getIndexedType(
+ cast<PointerType>(I->getType())->getElementType(),
+ ArgIndex.second));
+ ArgAttrVec.push_back(AttributeSet());
+ assert(Params.back());
+ }
+
+ if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
+ ++NumArgumentsPromoted;
+ else
+ ++NumAggregatesPromoted;
+ }
+ }
+
+ Type *RetTy = FTy->getReturnType();
+
+ // Construct the new function type using the new arguments.
+ FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+
+ // Create the new function body and insert it into the module.
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
+ F->getName());
+ NF->copyAttributesFrom(F);
NF->copyMetadata(F, 0);
-
+
// The new function will have the !dbg metadata copied from the original
// function. The original function may not be deleted, and dbg metadata need
// to be unique so we need to drop it.
- F->setSubprogram(nullptr);
-
- LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
- << "From: " << *F);
-
- // Recompute the parameter attributes list based on the new arguments for
- // the function.
- NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
- PAL.getRetAttributes(), ArgAttrVec));
- ArgAttrVec.clear();
-
- F->getParent()->getFunctionList().insert(F->getIterator(), NF);
- NF->takeName(F);
-
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in the loaded pointers.
- //
- SmallVector<Value *, 16> Args;
- while (!F->use_empty()) {
- CallBase &CB = cast<CallBase>(*F->user_back());
- assert(CB.getCalledFunction() == F);
- const AttributeList &CallPAL = CB.getAttributes();
- IRBuilder<NoFolder> IRB(&CB);
-
- // Loop over the operands, inserting GEP and loads in the caller as
- // appropriate.
- auto AI = CB.arg_begin();
- ArgNo = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++AI, ++ArgNo)
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- Args.push_back(*AI); // Unmodified argument
- ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
- } else if (ByValArgsToTransform.count(&*I)) {
- // Emit a GEP and load for each element of the struct.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {
- ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- auto *Idx =
- IRB.CreateGEP(STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i));
- // TODO: Tell AA about the new values?
- Args.push_back(IRB.CreateLoad(STy->getElementType(i), Idx,
- Idx->getName() + ".val"));
- ArgAttrVec.push_back(AttributeSet());
- }
- } else if (!I->use_empty()) {
- // Non-dead argument: insert GEPs and loads as appropriate.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
- // Store the Value* version of the indices in here, but declare it now
- // for reuse.
- std::vector<Value *> Ops;
- for (const auto &ArgIndex : ArgIndices) {
- Value *V = *AI;
- LoadInst *OrigLoad =
- OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
- if (!ArgIndex.second.empty()) {
- Ops.reserve(ArgIndex.second.size());
- Type *ElTy = V->getType();
- for (auto II : ArgIndex.second) {
- // Use i32 to index structs, and i64 for others (pointers/arrays).
- // This satisfies GEP constraints.
- Type *IdxTy =
- (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
- : Type::getInt64Ty(F->getContext()));
- Ops.push_back(ConstantInt::get(IdxTy, II));
- // Keep track of the type we're currently indexing.
- if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
- ElTy = ElPTy->getElementType();
- else
- ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
- }
- // And create a GEP to extract those indices.
- V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx");
- Ops.clear();
- }
- // Since we're replacing a load make sure we take the alignment
- // of the previous load.
- LoadInst *newLoad =
- IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val");
- newLoad->setAlignment(OrigLoad->getAlign());
- // Transfer the AA info too.
- AAMDNodes AAInfo;
- OrigLoad->getAAMetadata(AAInfo);
- newLoad->setAAMetadata(AAInfo);
-
- Args.push_back(newLoad);
- ArgAttrVec.push_back(AttributeSet());
- }
- }
-
- // Push any varargs arguments on the list.
- for (; AI != CB.arg_end(); ++AI, ++ArgNo) {
- Args.push_back(*AI);
- ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
- }
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB.getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCS = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", &CB);
- } else {
- auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", &CB);
- NewCall->setTailCallKind(cast<CallInst>(&CB)->getTailCallKind());
- NewCS = NewCall;
- }
- NewCS->setCallingConv(CB.getCallingConv());
- NewCS->setAttributes(
- AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
- CallPAL.getRetAttributes(), ArgAttrVec));
- NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
- Args.clear();
- ArgAttrVec.clear();
-
- // Update the callgraph to know that the callsite has been transformed.
- if (ReplaceCallSite)
- (*ReplaceCallSite)(CB, *NewCS);
-
- if (!CB.use_empty()) {
- CB.replaceAllUsesWith(NewCS);
- NewCS->takeName(&CB);
- }
-
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- CB.eraseFromParent();
- }
-
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
-
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments, also transferring over the names as well.
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
- I2 = NF->arg_begin();
- I != E; ++I) {
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- // If this is an unmodified argument, move the name and users over to the
- // new version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- ++I2;
- continue;
- }
-
- if (ByValArgsToTransform.count(&*I)) {
- // In the callee, we create an alloca, and store each of the new incoming
- // arguments into the alloca.
- Instruction *InsertPt = &NF->begin()->front();
-
- // Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- Value *TheAlloca = new AllocaInst(
- AgTy, DL.getAllocaAddrSpace(), nullptr,
- I->getParamAlign().getValueOr(DL.getPrefTypeAlign(AgTy)), "",
- InsertPt);
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
- nullptr};
-
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(
- AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
- InsertPt);
- I2->setName(I->getName() + "." + Twine(i));
- new StoreInst(&*I2++, Idx, InsertPt);
- }
-
- // Anything that used the arg should now use the alloca.
- I->replaceAllUsesWith(TheAlloca);
- TheAlloca->takeName(&*I);
-
- // If the alloca is used in a call, we must clear the tail flag since
- // the callee now uses an alloca from the caller.
- for (User *U : TheAlloca->users()) {
- CallInst *Call = dyn_cast<CallInst>(U);
- if (!Call)
- continue;
- Call->setTailCall(false);
- }
- continue;
- }
-
+ F->setSubprogram(nullptr);
+
+ LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
+ << "From: " << *F);
+
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrVec));
+ ArgAttrVec.clear();
+
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in the loaded pointers.
+ //
+ SmallVector<Value *, 16> Args;
+ while (!F->use_empty()) {
+ CallBase &CB = cast<CallBase>(*F->user_back());
+ assert(CB.getCalledFunction() == F);
+ const AttributeList &CallPAL = CB.getAttributes();
+ IRBuilder<NoFolder> IRB(&CB);
+
+ // Loop over the operands, inserting GEP and loads in the caller as
+ // appropriate.
+ auto AI = CB.arg_begin();
+ ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++AI, ++ArgNo)
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ Args.push_back(*AI); // Unmodified argument
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ } else if (ByValArgsToTransform.count(&*I)) {
+ // Emit a GEP and load for each element of the struct.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {
+ ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ auto *Idx =
+ IRB.CreateGEP(STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i));
+ // TODO: Tell AA about the new values?
+ Args.push_back(IRB.CreateLoad(STy->getElementType(i), Idx,
+ Idx->getName() + ".val"));
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ } else if (!I->use_empty()) {
+ // Non-dead argument: insert GEPs and loads as appropriate.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ // Store the Value* version of the indices in here, but declare it now
+ // for reuse.
+ std::vector<Value *> Ops;
+ for (const auto &ArgIndex : ArgIndices) {
+ Value *V = *AI;
+ LoadInst *OrigLoad =
+ OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
+ if (!ArgIndex.second.empty()) {
+ Ops.reserve(ArgIndex.second.size());
+ Type *ElTy = V->getType();
+ for (auto II : ArgIndex.second) {
+ // Use i32 to index structs, and i64 for others (pointers/arrays).
+ // This satisfies GEP constraints.
+ Type *IdxTy =
+ (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
+ : Type::getInt64Ty(F->getContext()));
+ Ops.push_back(ConstantInt::get(IdxTy, II));
+ // Keep track of the type we're currently indexing.
+ if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
+ ElTy = ElPTy->getElementType();
+ else
+ ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
+ }
+ // And create a GEP to extract those indices.
+ V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx");
+ Ops.clear();
+ }
+ // Since we're replacing a load make sure we take the alignment
+ // of the previous load.
+ LoadInst *newLoad =
+ IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val");
+ newLoad->setAlignment(OrigLoad->getAlign());
+ // Transfer the AA info too.
+ AAMDNodes AAInfo;
+ OrigLoad->getAAMetadata(AAInfo);
+ newLoad->setAAMetadata(AAInfo);
+
+ Args.push_back(newLoad);
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ }
+
+ // Push any varargs arguments on the list.
+ for (; AI != CB.arg_end(); ++AI, ++ArgNo) {
+ Args.push_back(*AI);
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB.getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCS = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", &CB);
+ } else {
+ auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", &CB);
+ NewCall->setTailCallKind(cast<CallInst>(&CB)->getTailCallKind());
+ NewCS = NewCall;
+ }
+ NewCS->setCallingConv(CB.getCallingConv());
+ NewCS->setAttributes(
+ AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
+ CallPAL.getRetAttributes(), ArgAttrVec));
+ NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+ Args.clear();
+ ArgAttrVec.clear();
+
+ // Update the callgraph to know that the callsite has been transformed.
+ if (ReplaceCallSite)
+ (*ReplaceCallSite)(CB, *NewCS);
+
+ if (!CB.use_empty()) {
+ CB.replaceAllUsesWith(NewCS);
+ NewCS->takeName(&CB);
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ CB.eraseFromParent();
+ }
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over to
+ // the new arguments, also transferring over the names as well.
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin();
+ I != E; ++I) {
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ // If this is an unmodified argument, move the name and users over to the
+ // new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ continue;
+ }
+
+ if (ByValArgsToTransform.count(&*I)) {
+ // In the callee, we create an alloca, and store each of the new incoming
+ // arguments into the alloca.
+ Instruction *InsertPt = &NF->begin()->front();
+
+ // Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ Value *TheAlloca = new AllocaInst(
+ AgTy, DL.getAllocaAddrSpace(), nullptr,
+ I->getParamAlign().getValueOr(DL.getPrefTypeAlign(AgTy)), "",
+ InsertPt);
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
+ nullptr};
+
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
+ InsertPt);
+ I2->setName(I->getName() + "." + Twine(i));
+ new StoreInst(&*I2++, Idx, InsertPt);
+ }
+
+ // Anything that used the arg should now use the alloca.
+ I->replaceAllUsesWith(TheAlloca);
+ TheAlloca->takeName(&*I);
+
+ // If the alloca is used in a call, we must clear the tail flag since
+ // the callee now uses an alloca from the caller.
+ for (User *U : TheAlloca->users()) {
+ CallInst *Call = dyn_cast<CallInst>(U);
+ if (!Call)
+ continue;
+ Call->setTailCall(false);
+ }
+ continue;
+ }
+
// There potentially are metadata uses for things like llvm.dbg.value.
// Replace them with undef, after handling the other regular uses.
auto RauwUndefMetadata = make_scope_exit(
[&]() { I->replaceAllUsesWith(UndefValue::get(I->getType())); });
- if (I->use_empty())
- continue;
-
- // Otherwise, if we promoted this argument, then all users are load
- // instructions (or GEPs with only load users), and all loads should be
- // using the new argument that we added.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
-
- while (!I->use_empty()) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
- assert(ArgIndices.begin()->second.empty() &&
- "Load element should sort to front!");
- I2->setName(I->getName() + ".val");
- LI->replaceAllUsesWith(&*I2);
- LI->eraseFromParent();
- LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
- << "' in function '" << F->getName() << "'\n");
- } else {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
+ if (I->use_empty())
+ continue;
+
+ // Otherwise, if we promoted this argument, then all users are load
+ // instructions (or GEPs with only load users), and all loads should be
+ // using the new argument that we added.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+
+ while (!I->use_empty()) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
+ assert(ArgIndices.begin()->second.empty() &&
+ "Load element should sort to front!");
+ I2->setName(I->getName() + ".val");
+ LI->replaceAllUsesWith(&*I2);
+ LI->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n");
+ } else {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
assert(!GEP->use_empty() &&
"GEPs without uses should be cleaned up already");
- IndicesVector Operands;
- Operands.reserve(GEP->getNumIndices());
- for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
- II != IE; ++II)
- Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
-
- // GEPs with a single 0 index can be merged with direct loads
- if (Operands.size() == 1 && Operands.front() == 0)
- Operands.clear();
-
- Function::arg_iterator TheArg = I2;
- for (ScalarizeTable::iterator It = ArgIndices.begin();
- It->second != Operands; ++It, ++TheArg) {
- assert(It != ArgIndices.end() && "GEP not handled??");
- }
-
- TheArg->setName(formatv("{0}.{1:$[.]}.val", I->getName(),
- make_range(Operands.begin(), Operands.end())));
-
- LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
- << "' of function '" << NF->getName() << "'\n");
-
- // All of the uses must be load instructions. Replace them all with
- // the argument specified by ArgNo.
- while (!GEP->use_empty()) {
- LoadInst *L = cast<LoadInst>(GEP->user_back());
- L->replaceAllUsesWith(&*TheArg);
- L->eraseFromParent();
- }
- GEP->eraseFromParent();
- }
- }
- // Increment I2 past all of the arguments added for this promoted pointer.
- std::advance(I2, ArgIndices.size());
- }
-
- return NF;
-}
-
-/// Return true if we can prove that all callees pass in a valid pointer for the
-/// specified function argument.
-static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) {
- Function *Callee = Arg->getParent();
- const DataLayout &DL = Callee->getParent()->getDataLayout();
-
- unsigned ArgNo = Arg->getArgNo();
-
- // Look at all call sites of the function. At this point we know we only have
- // direct callees.
- for (User *U : Callee->users()) {
- CallBase &CB = cast<CallBase>(*U);
-
- if (!isDereferenceablePointer(CB.getArgOperand(ArgNo), Ty, DL))
- return false;
- }
- return true;
-}
-
-/// Returns true if Prefix is a prefix of longer. That means, Longer has a size
-/// that is greater than or equal to the size of prefix, and each of the
-/// elements in Prefix is the same as the corresponding elements in Longer.
-///
-/// This means it also returns true when Prefix and Longer are equal!
-static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
- if (Prefix.size() > Longer.size())
- return false;
- return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
-}
-
-/// Checks if Indices, or a prefix of Indices, is in Set.
-static bool prefixIn(const IndicesVector &Indices,
- std::set<IndicesVector> &Set) {
- std::set<IndicesVector>::iterator Low;
- Low = Set.upper_bound(Indices);
- if (Low != Set.begin())
- Low--;
- // Low is now the last element smaller than or equal to Indices. This means
- // it points to a prefix of Indices (possibly Indices itself), if such
- // prefix exists.
- //
- // This load is safe if any prefix of its operands is safe to load.
- return Low != Set.end() && isPrefix(*Low, Indices);
-}
-
-/// Mark the given indices (ToMark) as safe in the given set of indices
-/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
-/// is already a prefix of Indices in Safe, Indices are implicitely marked safe
-/// already. Furthermore, any indices that Indices is itself a prefix of, are
-/// removed from Safe (since they are implicitely safe because of Indices now).
-static void markIndicesSafe(const IndicesVector &ToMark,
- std::set<IndicesVector> &Safe) {
- std::set<IndicesVector>::iterator Low;
- Low = Safe.upper_bound(ToMark);
- // Guard against the case where Safe is empty
- if (Low != Safe.begin())
- Low--;
- // Low is now the last element smaller than or equal to Indices. This
- // means it points to a prefix of Indices (possibly Indices itself), if
- // such prefix exists.
- if (Low != Safe.end()) {
- if (isPrefix(*Low, ToMark))
- // If there is already a prefix of these indices (or exactly these
- // indices) marked a safe, don't bother adding these indices
- return;
-
- // Increment Low, so we can use it as a "insert before" hint
- ++Low;
- }
- // Insert
- Low = Safe.insert(Low, ToMark);
- ++Low;
- // If there we're a prefix of longer index list(s), remove those
- std::set<IndicesVector>::iterator End = Safe.end();
- while (Low != End && isPrefix(ToMark, *Low)) {
- std::set<IndicesVector>::iterator Remove = Low;
- ++Low;
- Safe.erase(Remove);
- }
-}
-
-/// isSafeToPromoteArgument - As you might guess from the name of this method,
-/// it checks to see if it is both safe and useful to promote the argument.
-/// This method limits promotion of aggregates to only promote up to three
-/// elements of the aggregate in order to avoid exploding the number of
-/// arguments passed in.
-static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR,
- unsigned MaxElements) {
- using GEPIndicesSet = std::set<IndicesVector>;
-
- // Quick exit for unused arguments
- if (Arg->use_empty())
- return true;
-
- // We can only promote this argument if all of the uses are loads, or are GEP
- // instructions (with constant indices) that are subsequently loaded.
- //
- // Promoting the argument causes it to be loaded in the caller
- // unconditionally. This is only safe if we can prove that either the load
- // would have happened in the callee anyway (ie, there is a load in the entry
- // block) or the pointer passed in at every call site is guaranteed to be
- // valid.
- // In the former case, invalid loads can happen, but would have happened
- // anyway, in the latter case, invalid loads won't happen. This prevents us
- // from introducing an invalid load that wouldn't have happened in the
- // original code.
- //
- // This set will contain all sets of indices that are loaded in the entry
- // block, and thus are safe to unconditionally load in the caller.
- GEPIndicesSet SafeToUnconditionallyLoad;
-
- // This set contains all the sets of indices that we are planning to promote.
- // This makes it possible to limit the number of arguments added.
- GEPIndicesSet ToPromote;
-
- // If the pointer is always valid, any load with first index 0 is valid.
-
- if (ByValTy)
- SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
-
- // Whenever a new underlying type for the operand is found, make sure it's
- // consistent with the GEPs and loads we've already seen and, if necessary,
- // use it to see if all incoming pointers are valid (which implies the 0-index
- // is safe).
- Type *BaseTy = ByValTy;
- auto UpdateBaseTy = [&](Type *NewBaseTy) {
- if (BaseTy)
- return BaseTy == NewBaseTy;
-
- BaseTy = NewBaseTy;
- if (allCallersPassValidPointerForArgument(Arg, BaseTy)) {
- assert(SafeToUnconditionallyLoad.empty());
- SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
- }
-
- return true;
- };
-
- // First, iterate the entry block and mark loads of (geps of) arguments as
- // safe.
- BasicBlock &EntryBlock = Arg->getParent()->front();
- // Declare this here so we can reuse it
- IndicesVector Indices;
- for (Instruction &I : EntryBlock)
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- Value *V = LI->getPointerOperand();
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
- V = GEP->getPointerOperand();
- if (V == Arg) {
- // This load actually loads (part of) Arg? Check the indices then.
- Indices.reserve(GEP->getNumIndices());
- for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
- II != IE; ++II)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
- Indices.push_back(CI->getSExtValue());
- else
- // We found a non-constant GEP index for this argument? Bail out
- // right away, can't promote this argument at all.
- return false;
-
- if (!UpdateBaseTy(GEP->getSourceElementType()))
- return false;
-
- // Indices checked out, mark them as safe
- markIndicesSafe(Indices, SafeToUnconditionallyLoad);
- Indices.clear();
- }
- } else if (V == Arg) {
- // Direct loads are equivalent to a GEP with a single 0 index.
- markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
-
- if (BaseTy && LI->getType() != BaseTy)
- return false;
-
- BaseTy = LI->getType();
- }
- }
-
- // Now, iterate all uses of the argument to see if there are any uses that are
- // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
- SmallVector<LoadInst *, 16> Loads;
- IndicesVector Operands;
- for (Use &U : Arg->uses()) {
- User *UR = U.getUser();
- Operands.clear();
- if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
- // Don't hack volatile/atomic loads
- if (!LI->isSimple())
- return false;
- Loads.push_back(LI);
- // Direct loads are equivalent to a GEP with a zero index and then a load.
- Operands.push_back(0);
-
- if (!UpdateBaseTy(LI->getType()))
- return false;
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
- if (GEP->use_empty()) {
- // Dead GEPs cause trouble later. Just remove them if we run into
- // them.
+ IndicesVector Operands;
+ Operands.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Operands.size() == 1 && Operands.front() == 0)
+ Operands.clear();
+
+ Function::arg_iterator TheArg = I2;
+ for (ScalarizeTable::iterator It = ArgIndices.begin();
+ It->second != Operands; ++It, ++TheArg) {
+ assert(It != ArgIndices.end() && "GEP not handled??");
+ }
+
+ TheArg->setName(formatv("{0}.{1:$[.]}.val", I->getName(),
+ make_range(Operands.begin(), Operands.end())));
+
+ LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n");
+
+ // All of the uses must be load instructions. Replace them all with
+ // the argument specified by ArgNo.
+ while (!GEP->use_empty()) {
+ LoadInst *L = cast<LoadInst>(GEP->user_back());
+ L->replaceAllUsesWith(&*TheArg);
+ L->eraseFromParent();
+ }
+ GEP->eraseFromParent();
+ }
+ }
+ // Increment I2 past all of the arguments added for this promoted pointer.
+ std::advance(I2, ArgIndices.size());
+ }
+
+ return NF;
+}
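For a promoted load through a GEP with constant indices [0, 2] on an argument named s, the formatv call above yields a replacement argument named s.0.2.val. A plain-C++ sketch of that naming scheme (hypothetical helper using std::string instead of llvm::formatv; illustrative only):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Builds "<arg>.<idx0>.<idx1>...val", the name given to the scalar argument
// that replaces loads of those GEP indices.
static std::string promotedArgName(const std::string &Base,
                                   const std::vector<uint64_t> &Indices) {
  std::string Name = Base;
  for (uint64_t Idx : Indices)
    Name += "." + std::to_string(Idx);
  return Name + ".val";
}

int main() {
  std::cout << promotedArgName("s", {0, 2}) << "\n"; // prints "s.0.2.val"
  return 0;
}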
+
+/// Return true if we can prove that all callees pass in a valid pointer for the
+/// specified function argument.
+static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) {
+ Function *Callee = Arg->getParent();
+ const DataLayout &DL = Callee->getParent()->getDataLayout();
+
+ unsigned ArgNo = Arg->getArgNo();
+
+ // Look at all call sites of the function. At this point we know we only have
+ // direct callees.
+ for (User *U : Callee->users()) {
+ CallBase &CB = cast<CallBase>(*U);
+
+ if (!isDereferenceablePointer(CB.getArgOperand(ArgNo), Ty, DL))
+ return false;
+ }
+ return true;
+}
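The check above has to hold at every direct call site. A small C++ sketch of the two caller shapes it distinguishes (hypothetical names, illustrative only): passing the address of a local object is trivially dereferenceable, while forwarding an arbitrary incoming pointer is not provably so.

struct Pair { int a, b; };

static int callee(const Pair *p) { return p ? p->a : 0; }

// Passing the address of a local object: the pointer is known dereferenceable.
int alwaysValid() { Pair Local{1, 2}; return callee(&Local); }

// Forwarding an arbitrary incoming pointer: nothing proves dereferenceability,
// so a check like the one above must conservatively answer "false".
int maybeNull(const Pair *Q) { return callee(Q); }

int main() { return (alwaysValid() == 1 && maybeNull(nullptr) == 0) ? 0 : 1; }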
+
+/// Returns true if Prefix is a prefix of Longer. That means Longer has a size
+/// that is greater than or equal to the size of Prefix, and each of the
+/// elements in Prefix is the same as the corresponding element in Longer.
+///
+/// This means it also returns true when Prefix and Longer are equal!
+static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
+ if (Prefix.size() > Longer.size())
+ return false;
+ return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
+}
+
+/// Checks if Indices, or a prefix of Indices, is in Set.
+static bool prefixIn(const IndicesVector &Indices,
+ std::set<IndicesVector> &Set) {
+ std::set<IndicesVector>::iterator Low;
+ Low = Set.upper_bound(Indices);
+ if (Low != Set.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to Indices. This means
+ // it points to a prefix of Indices (possibly Indices itself), if such
+ // prefix exists.
+ //
+ // This load is safe if any prefix of its operands is safe to load.
+ return Low != Set.end() && isPrefix(*Low, Indices);
+}
+
+/// Mark the given indices (ToMark) as safe in the given set of indices
+/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
+/// is already a prefix of ToMark in Safe, ToMark is implicitly marked safe
+/// already. Furthermore, any indices that ToMark is itself a prefix of are
+/// removed from Safe (since they are implicitly safe because of ToMark now).
+static void markIndicesSafe(const IndicesVector &ToMark,
+ std::set<IndicesVector> &Safe) {
+ std::set<IndicesVector>::iterator Low;
+ Low = Safe.upper_bound(ToMark);
+ // Guard against the case where Safe is empty
+ if (Low != Safe.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to ToMark. This
+ // means it points to a prefix of ToMark (possibly ToMark itself), if
+ // such prefix exists.
+ if (Low != Safe.end()) {
+ if (isPrefix(*Low, ToMark))
+ // If there is already a prefix of these indices (or exactly these
+ // indices) marked as safe, don't bother adding these indices
+ return;
+
+ // Increment Low, so we can use it as an "insert before" hint
+ ++Low;
+ }
+ // Insert
+ Low = Safe.insert(Low, ToMark);
+ ++Low;
+ // If ToMark is a prefix of longer index list(s), remove those
+ std::set<IndicesVector>::iterator End = Safe.end();
+ while (Low != End && isPrefix(ToMark, *Low)) {
+ std::set<IndicesVector>::iterator Remove = Low;
+ ++Low;
+ Safe.erase(Remove);
+ }
+}
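A standalone, deliberately simplified equivalent of the prefix bookkeeping implemented by isPrefix/prefixIn/markIndicesSafe above, using a linear scan instead of the upper_bound hint so the resulting set contents are easy to check; the helper names and the O(n) formulation are illustrative only:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <set>
#include <vector>

using IndicesVector = std::vector<uint64_t>;

static bool isPrefixOf(const IndicesVector &P, const IndicesVector &L) {
  return P.size() <= L.size() && std::equal(P.begin(), P.end(), L.begin());
}

// Keep the set minimal and prefix-closed: skip ToMark if one of its prefixes
// (or ToMark itself) is already present, otherwise insert it and drop every
// longer entry that ToMark is a prefix of.
static void markSafe(const IndicesVector &ToMark, std::set<IndicesVector> &Safe) {
  for (const IndicesVector &S : Safe)
    if (isPrefixOf(S, ToMark))
      return;
  for (auto It = Safe.begin(); It != Safe.end();)
    It = isPrefixOf(ToMark, *It) ? Safe.erase(It) : std::next(It);
  Safe.insert(ToMark);
}

int main() {
  std::set<IndicesVector> Safe = {{0, 1}, {0, 2}, {1}};
  markSafe({0}, Safe); // {0} subsumes {0,1} and {0,2}
  assert((Safe == std::set<IndicesVector>{{0}, {1}}));
  markSafe({1, 3}, Safe); // no-op: prefix {1} is already marked safe
  assert((Safe == std::set<IndicesVector>{{0}, {1}}));
  return 0;
}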
+
+/// isSafeToPromoteArgument - As you might guess from the name of this method,
+/// it checks to see if it is both safe and useful to promote the argument.
+/// This method limits promotion of aggregates to only promote up to three
+/// elements of the aggregate in order to avoid exploding the number of
+/// arguments passed in.
+static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR,
+ unsigned MaxElements) {
+ using GEPIndicesSet = std::set<IndicesVector>;
+
+ // Quick exit for unused arguments
+ if (Arg->use_empty())
+ return true;
+
+ // We can only promote this argument if all of the uses are loads, or are GEP
+ // instructions (with constant indices) that are subsequently loaded.
+ //
+ // Promoting the argument causes it to be loaded in the caller
+ // unconditionally. This is only safe if we can prove that either the load
+ // would have happened in the callee anyway (i.e., there is a load in the entry
+ // block) or the pointer passed in at every call site is guaranteed to be
+ // valid.
+ // In the former case, invalid loads can happen, but would have happened
+ // anyway; in the latter case, invalid loads won't happen. This prevents us
+ // from introducing an invalid load that wouldn't have happened in the
+ // original code.
+ //
+ // This set will contain all sets of indices that are loaded in the entry
+ // block, and thus are safe to unconditionally load in the caller.
+ GEPIndicesSet SafeToUnconditionallyLoad;
+
+ // This set contains all the sets of indices that we are planning to promote.
+ // This makes it possible to limit the number of arguments added.
+ GEPIndicesSet ToPromote;
+
+ // If the pointer is always valid, any load with first index 0 is valid.
+
+ if (ByValTy)
+ SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+
+ // Whenever a new underlying type for the operand is found, make sure it's
+ // consistent with the GEPs and loads we've already seen and, if necessary,
+ // use it to see if all incoming pointers are valid (which implies the 0-index
+ // is safe).
+ Type *BaseTy = ByValTy;
+ auto UpdateBaseTy = [&](Type *NewBaseTy) {
+ if (BaseTy)
+ return BaseTy == NewBaseTy;
+
+ BaseTy = NewBaseTy;
+ if (allCallersPassValidPointerForArgument(Arg, BaseTy)) {
+ assert(SafeToUnconditionallyLoad.empty());
+ SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+ }
+
+ return true;
+ };
+
+ // First, iterate the entry block and mark loads of (geps of) arguments as
+ // safe.
+ BasicBlock &EntryBlock = Arg->getParent()->front();
+ // Declare this here so we can reuse it
+ IndicesVector Indices;
+ for (Instruction &I : EntryBlock)
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ Value *V = LI->getPointerOperand();
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ V = GEP->getPointerOperand();
+ if (V == Arg) {
+ // This load actually loads (part of) Arg? Check the indices then.
+ Indices.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
+ Indices.push_back(CI->getSExtValue());
+ else
+ // We found a non-constant GEP index for this argument? Bail out
+ // right away, can't promote this argument at all.
+ return false;
+
+ if (!UpdateBaseTy(GEP->getSourceElementType()))
+ return false;
+
+ // Indices checked out, mark them as safe
+ markIndicesSafe(Indices, SafeToUnconditionallyLoad);
+ Indices.clear();
+ }
+ } else if (V == Arg) {
+ // Direct loads are equivalent to a GEP with a single 0 index.
+ markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+
+ if (BaseTy && LI->getType() != BaseTy)
+ return false;
+
+ BaseTy = LI->getType();
+ }
+ }
+
+ // Now, iterate all uses of the argument to see if there are any uses that are
+ // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
+ SmallVector<LoadInst *, 16> Loads;
+ IndicesVector Operands;
+ for (Use &U : Arg->uses()) {
+ User *UR = U.getUser();
+ Operands.clear();
+ if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple())
+ return false;
+ Loads.push_back(LI);
+ // Direct loads are equivalent to a GEP with a zero index and then a load.
+ Operands.push_back(0);
+
+ if (!UpdateBaseTy(LI->getType()))
+ return false;
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
+ if (GEP->use_empty()) {
+ // Dead GEPs cause trouble later. Just remove them if we run into
+ // them.
continue;
- }
-
- if (!UpdateBaseTy(GEP->getSourceElementType()))
- return false;
-
- // Ensure that all of the indices are constants.
- for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
- ++i)
- if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
- Operands.push_back(C->getSExtValue());
- else
- return false; // Not a constant operand GEP!
-
- // Ensure that the only users of the GEP are load instructions.
- for (User *GEPU : GEP->users())
- if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
- // Don't hack volatile/atomic loads
- if (!LI->isSimple())
- return false;
- Loads.push_back(LI);
- } else {
- // Other uses than load?
- return false;
- }
- } else {
- return false; // Not a load or a GEP.
- }
-
- // Now, see if it is safe to promote this load / loads of this GEP. Loading
- // is safe if Operands, or a prefix of Operands, is marked as safe.
- if (!prefixIn(Operands, SafeToUnconditionallyLoad))
- return false;
-
- // See if we are already promoting a load with these indices. If not, check
- // to make sure that we aren't promoting too many elements. If so, nothing
- // to do.
- if (ToPromote.find(Operands) == ToPromote.end()) {
- if (MaxElements > 0 && ToPromote.size() == MaxElements) {
- LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '"
- << Arg->getName()
- << "' because it would require adding more "
- << "than " << MaxElements
- << " arguments to the function.\n");
- // We limit aggregate promotion to only promoting up to a fixed number
- // of elements of the aggregate.
- return false;
- }
- ToPromote.insert(std::move(Operands));
- }
- }
-
- if (Loads.empty())
- return true; // No users, this is a dead argument.
-
- // Okay, now we know that the argument is only used by load instructions and
- // it is safe to unconditionally perform all of them. Use alias analysis to
- // check to see if the pointer is guaranteed to not be modified from entry of
- // the function to each of the load instructions.
-
- // Because there could be several/many load instructions, remember which
- // blocks we know to be transparent to the load.
- df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
-
- for (LoadInst *Load : Loads) {
- // Check to see if the load is invalidated from the start of the block to
- // the load itself.
- BasicBlock *BB = Load->getParent();
-
- MemoryLocation Loc = MemoryLocation::get(Load);
- if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, ModRefInfo::Mod))
- return false; // Pointer is invalidated!
-
- // Now check every path from the entry block to the load for transparency.
- // To do this, we perform a depth first search on the inverse CFG from the
- // loading block.
- for (BasicBlock *P : predecessors(BB)) {
- for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
- if (AAR.canBasicBlockModify(*TranspBB, Loc))
- return false;
- }
- }
-
- // If the path from the entry of the function to each load is free of
- // instructions that potentially invalidate the load, we can make the
- // transformation!
- return true;
-}
-
-bool ArgumentPromotionPass::isDenselyPacked(Type *type, const DataLayout &DL) {
- // There is no size information, so be conservative.
- if (!type->isSized())
- return false;
-
- // If the alloc size is not equal to the storage size, then there are padding
- // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
- if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
- return false;
-
- // FIXME: This isn't the right way to check for padding in vectors with
- // non-byte-size elements.
- if (VectorType *seqTy = dyn_cast<VectorType>(type))
- return isDenselyPacked(seqTy->getElementType(), DL);
-
- // For array types, check for padding within members.
- if (ArrayType *seqTy = dyn_cast<ArrayType>(type))
- return isDenselyPacked(seqTy->getElementType(), DL);
-
- if (!isa<StructType>(type))
- return true;
-
- // Check for padding within and between elements of a struct.
- StructType *StructTy = cast<StructType>(type);
- const StructLayout *Layout = DL.getStructLayout(StructTy);
- uint64_t StartPos = 0;
- for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
- Type *ElTy = StructTy->getElementType(i);
- if (!isDenselyPacked(ElTy, DL))
- return false;
- if (StartPos != Layout->getElementOffsetInBits(i))
- return false;
- StartPos += DL.getTypeAllocSizeInBits(ElTy);
- }
-
- return true;
-}
-
-/// Checks if the padding bytes of an argument could be accessed.
-static bool canPaddingBeAccessed(Argument *arg) {
- assert(arg->hasByValAttr());
-
- // Track all the pointers to the argument to make sure they are not captured.
- SmallPtrSet<Value *, 16> PtrValues;
- PtrValues.insert(arg);
-
- // Track all of the stores.
- SmallVector<StoreInst *, 16> Stores;
-
- // Scan through the uses recursively to make sure the pointer is always used
- // sanely.
+ }
+
+ if (!UpdateBaseTy(GEP->getSourceElementType()))
+ return false;
+
+ // Ensure that all of the indices are constants.
+ for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
+ ++i)
+ if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
+ Operands.push_back(C->getSExtValue());
+ else
+ return false; // Not a constant operand GEP!
+
+ // Ensure that the only users of the GEP are load instructions.
+ for (User *GEPU : GEP->users())
+ if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple())
+ return false;
+ Loads.push_back(LI);
+ } else {
+ // Other uses than load?
+ return false;
+ }
+ } else {
+ return false; // Not a load or a GEP.
+ }
+
+ // Now, see if it is safe to promote this load / loads of this GEP. Loading
+ // is safe if Operands, or a prefix of Operands, is marked as safe.
+ if (!prefixIn(Operands, SafeToUnconditionallyLoad))
+ return false;
+
+ // See if we are already promoting a load with these indices. If not, check
+ // to make sure that we aren't promoting too many elements. If so, nothing
+ // to do.
+ if (ToPromote.find(Operands) == ToPromote.end()) {
+ if (MaxElements > 0 && ToPromote.size() == MaxElements) {
+ LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '"
+ << Arg->getName()
+ << "' because it would require adding more "
+ << "than " << MaxElements
+ << " arguments to the function.\n");
+ // We limit aggregate promotion to only promoting up to a fixed number
+ // of elements of the aggregate.
+ return false;
+ }
+ ToPromote.insert(std::move(Operands));
+ }
+ }
+
+ if (Loads.empty())
+ return true; // No users, this is a dead argument.
+
+ // Okay, now we know that the argument is only used by load instructions and
+ // it is safe to unconditionally perform all of them. Use alias analysis to
+ // check to see if the pointer is guaranteed to not be modified from entry of
+ // the function to each of the load instructions.
+
+ // Because there could be several/many load instructions, remember which
+ // blocks we know to be transparent to the load.
+ df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
+
+ for (LoadInst *Load : Loads) {
+ // Check to see if the load is invalidated from the start of the block to
+ // the load itself.
+ BasicBlock *BB = Load->getParent();
+
+ MemoryLocation Loc = MemoryLocation::get(Load);
+ if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, ModRefInfo::Mod))
+ return false; // Pointer is invalidated!
+
+ // Now check every path from the entry block to the load for transparency.
+ // To do this, we perform a depth first search on the inverse CFG from the
+ // loading block.
+ for (BasicBlock *P : predecessors(BB)) {
+ for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
+ if (AAR.canBasicBlockModify(*TranspBB, Loc))
+ return false;
+ }
+ }
+
+ // If the path from the entry of the function to each load is free of
+ // instructions that potentially invalidate the load, we can make the
+ // transformation!
+ return true;
+}
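The entry-block condition above can be seen on a small example, sketched in C++ rather than IR (hypothetical functions, illustrative only): promoting p in unconditional is always safe because the load executes on every path through the callee, whereas guarded only loads p on some paths, so hoisting the load to the callers needs the separate "all callers pass a valid pointer" proof.

// Loaded unconditionally in the entry block: callers can do the load instead.
static int unconditional(const int *p) { return *p + 1; }

// Loaded only when c is true: hoisting the load to callers could introduce a
// trap (e.g. p == nullptr together with c == false) that the original program
// never had, unless every caller is known to pass a dereferenceable pointer.
static int guarded(const int *p, bool c) { return c ? *p : 0; }

int main() {
  int x = 41;
  return unconditional(&x) + guarded(&x, false) == 42 ? 0 : 1;
}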
+
+bool ArgumentPromotionPass::isDenselyPacked(Type *type, const DataLayout &DL) {
+ // There is no size information, so be conservative.
+ if (!type->isSized())
+ return false;
+
+ // If the alloc size is not equal to the storage size, then there are padding
+ // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
+ if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
+ return false;
+
+ // FIXME: This isn't the right way to check for padding in vectors with
+ // non-byte-size elements.
+ if (VectorType *seqTy = dyn_cast<VectorType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
+
+ // For array types, check for padding within members.
+ if (ArrayType *seqTy = dyn_cast<ArrayType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
+
+ if (!isa<StructType>(type))
+ return true;
+
+ // Check for padding within and between elements of a struct.
+ StructType *StructTy = cast<StructType>(type);
+ const StructLayout *Layout = DL.getStructLayout(StructTy);
+ uint64_t StartPos = 0;
+ for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
+ Type *ElTy = StructTy->getElementType(i);
+ if (!isDenselyPacked(ElTy, DL))
+ return false;
+ if (StartPos != Layout->getElementOffsetInBits(i))
+ return false;
+ StartPos += DL.getTypeAllocSizeInBits(ElTy);
+ }
+
+ return true;
+}
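As a worked example, assuming a typical x86-64 data layout (an assumption; the check itself only compares DataLayout sizes and offsets): a struct of two 32-bit ints is densely packed, while a 32-bit int followed by a 64-bit int is not, because the second member is aligned to offset 8 while the running StartPos is only 4 at that point.

#include <cstddef>
#include <cstdint>

struct Packed { int32_t a; int32_t b; }; // offsets 0 and 4, no padding
struct Padded { int32_t a; int64_t b; }; // 4 padding bytes before b on x86-64

// These hold under the common LP64/x86-64 ABI assumed above.
static_assert(sizeof(Packed) == 8, "no padding expected");
static_assert(offsetof(Padded, b) == 8 && sizeof(Padded) == 16,
              "alignment of b introduces padding");

int main() { return 0; }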
+
+/// Checks if the padding bytes of an argument could be accessed.
+static bool canPaddingBeAccessed(Argument *arg) {
+ assert(arg->hasByValAttr());
+
+ // Track all the pointers to the argument to make sure they are not captured.
+ SmallPtrSet<Value *, 16> PtrValues;
+ PtrValues.insert(arg);
+
+ // Track all of the stores.
+ SmallVector<StoreInst *, 16> Stores;
+
+ // Scan through the uses recursively to make sure the pointer is always used
+ // sanely.
SmallVector<Value *, 16> WorkList(arg->users());
- while (!WorkList.empty()) {
+ while (!WorkList.empty()) {
Value *V = WorkList.pop_back_val();
- if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
- if (PtrValues.insert(V).second)
+ if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
+ if (PtrValues.insert(V).second)
llvm::append_range(WorkList, V->users());
- } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
- Stores.push_back(Store);
- } else if (!isa<LoadInst>(V)) {
- return true;
- }
- }
-
- // Check to make sure the pointers aren't captured
- for (StoreInst *Store : Stores)
- if (PtrValues.count(Store->getValueOperand()))
- return true;
-
- return false;
-}
-
-bool ArgumentPromotionPass::areFunctionArgsABICompatible(
- const Function &F, const TargetTransformInfo &TTI,
- SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform) {
- for (const Use &U : F.uses()) {
- CallBase *CB = dyn_cast<CallBase>(U.getUser());
- if (!CB)
- return false;
- const Function *Caller = CB->getCaller();
- const Function *Callee = CB->getCalledFunction();
- if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) ||
- !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform))
- return false;
- }
- return true;
-}
-
-/// PromoteArguments - This method checks the specified function to see if there
-/// are any promotable arguments and if it is safe to promote the function (for
-/// example, all callers are direct). If safe to promote some arguments, it
-/// calls the DoPromotion method.
-static Function *
-promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements,
- Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
- ReplaceCallSite,
- const TargetTransformInfo &TTI) {
- // Don't perform argument promotion for naked functions; otherwise we can end
- // up removing parameters that are seemingly 'not used' as they are referred
- // to in the assembly.
- if (F->hasFnAttribute(Attribute::Naked))
- return nullptr;
-
- // Make sure that it is local to this module.
- if (!F->hasLocalLinkage())
- return nullptr;
-
- // Don't promote arguments for variadic functions. Adding, removing, or
- // changing non-pack parameters can change the classification of pack
- // parameters. Frontends encode that classification at the call site in the
- // IR, while in the callee the classification is determined dynamically based
- // on the number of registers consumed so far.
- if (F->isVarArg())
- return nullptr;
-
- // Don't transform functions that receive inallocas, as the transformation may
- // not be safe depending on calling convention.
- if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
- return nullptr;
-
- // First check: see if there are any pointer arguments! If not, quick exit.
- SmallVector<Argument *, 16> PointerArgs;
- for (Argument &I : F->args())
- if (I.getType()->isPointerTy())
- PointerArgs.push_back(&I);
- if (PointerArgs.empty())
- return nullptr;
-
- // Second check: make sure that all callers are direct callers. We can't
- // transform functions that have indirect callers. Also see if the function
- // is self-recursive and check that target features are compatible.
- bool isSelfRecursive = false;
- for (Use &U : F->uses()) {
- CallBase *CB = dyn_cast<CallBase>(U.getUser());
- // Must be a direct call.
- if (CB == nullptr || !CB->isCallee(&U))
- return nullptr;
-
- // Can't change signature of musttail callee
- if (CB->isMustTailCall())
- return nullptr;
-
- if (CB->getParent()->getParent() == F)
- isSelfRecursive = true;
- }
-
- // Can't change signature of musttail caller
- // FIXME: Support promoting whole chain of musttail functions
- for (BasicBlock &BB : *F)
- if (BB.getTerminatingMustTailCall())
- return nullptr;
-
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- AAResults &AAR = AARGetter(*F);
-
- // Check to see which arguments are promotable. If an argument is promotable,
- // add it to ArgsToPromote.
- SmallPtrSet<Argument *, 8> ArgsToPromote;
- SmallPtrSet<Argument *, 8> ByValArgsToTransform;
- for (Argument *PtrArg : PointerArgs) {
- Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
-
- // Replace sret attribute with noalias. This reduces register pressure by
- // avoiding a register copy.
- if (PtrArg->hasStructRetAttr()) {
- unsigned ArgNo = PtrArg->getArgNo();
- F->removeParamAttr(ArgNo, Attribute::StructRet);
- F->addParamAttr(ArgNo, Attribute::NoAlias);
- for (Use &U : F->uses()) {
- CallBase &CB = cast<CallBase>(*U.getUser());
- CB.removeParamAttr(ArgNo, Attribute::StructRet);
- CB.addParamAttr(ArgNo, Attribute::NoAlias);
- }
- }
-
- // If this is a byval argument, and if the aggregate type is small, just
- // pass the elements, which is always safe, if the passed value is densely
- // packed or if we can prove the padding bytes are never accessed.
- bool isSafeToPromote = PtrArg->hasByValAttr() &&
- (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) ||
- !canPaddingBeAccessed(PtrArg));
- if (isSafeToPromote) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
- LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '"
- << PtrArg->getName()
- << "' because it would require adding more"
- << " than " << MaxElements
- << " arguments to the function.\n");
- continue;
- }
-
- // If all the elements are single-value types, we can promote it.
- bool AllSimple = true;
- for (const auto *EltTy : STy->elements()) {
- if (!EltTy->isSingleValueType()) {
- AllSimple = false;
- break;
- }
- }
-
- // Safe to transform, don't even bother trying to "promote" it.
- // Passing the elements as a scalar will allow sroa to hack on
- // the new alloca we introduce.
- if (AllSimple) {
- ByValArgsToTransform.insert(PtrArg);
- continue;
- }
- }
- }
-
- // If the argument is a recursive type and we're in a recursive
- // function, we could end up infinitely peeling the function argument.
- if (isSelfRecursive) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- bool RecursiveType = false;
- for (const auto *EltTy : STy->elements()) {
- if (EltTy == PtrArg->getType()) {
- RecursiveType = true;
- break;
- }
- }
- if (RecursiveType)
- continue;
- }
- }
-
- // Otherwise, see if we can promote the pointer to its value.
- Type *ByValTy =
- PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr;
- if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements))
- ArgsToPromote.insert(PtrArg);
- }
-
- // No promotable pointer arguments.
- if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
- return nullptr;
-
- if (!ArgumentPromotionPass::areFunctionArgsABICompatible(
- *F, TTI, ArgsToPromote, ByValArgsToTransform))
- return nullptr;
-
- return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
-}
-
-PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG,
- CGSCCUpdateResult &UR) {
- bool Changed = false, LocalChange;
-
- // Iterate until we stop promoting from this SCC.
- do {
- LocalChange = false;
-
- for (LazyCallGraph::Node &N : C) {
- Function &OldF = N.getFunction();
-
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
- // FIXME: This lambda must only be used with this function. We should
- // skip the lambda and just get the AA results directly.
- auto AARGetter = [&](Function &F) -> AAResults & {
- assert(&F == &OldF && "Called with an unexpected function!");
- return FAM.getResult<AAManager>(F);
- };
-
- const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(OldF);
- Function *NewF =
- promoteArguments(&OldF, AARGetter, MaxElements, None, TTI);
- if (!NewF)
- continue;
- LocalChange = true;
-
- // Directly substitute the functions in the call graph. Note that this
- // requires the old function to be completely dead and completely
- // replaced by the new function. It does no call graph updates, it merely
- // swaps out the particular function mapped to a particular node in the
- // graph.
- C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
- OldF.eraseFromParent();
- }
-
- Changed |= LocalChange;
- } while (LocalChange);
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
-struct ArgPromotion : public CallGraphSCCPass {
- // Pass identification, replacement for typeid
- static char ID;
-
- explicit ArgPromotion(unsigned MaxElements = 3)
- : CallGraphSCCPass(ID), MaxElements(MaxElements) {
- initializeArgPromotionPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override;
-
-private:
- using llvm::Pass::doInitialization;
-
- bool doInitialization(CallGraph &CG) override;
-
- /// The maximum number of elements to expand, or 0 for unlimited.
- unsigned MaxElements;
-};
-
-} // end anonymous namespace
-
-char ArgPromotion::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false, false)
-
-Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
- return new ArgPromotion(MaxElements);
-}
-
-bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
-
- // Get the callgraph information that we need to update to reflect our
- // changes.
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
-
- LegacyAARGetter AARGetter(*this);
-
- bool Changed = false, LocalChange;
-
- // Iterate until we stop promoting from this SCC.
- do {
- LocalChange = false;
- // Attempt to promote arguments from all functions in this SCC.
- for (CallGraphNode *OldNode : SCC) {
- Function *OldF = OldNode->getFunction();
- if (!OldF)
- continue;
-
- auto ReplaceCallSite = [&](CallBase &OldCS, CallBase &NewCS) {
- Function *Caller = OldCS.getParent()->getParent();
- CallGraphNode *NewCalleeNode =
- CG.getOrInsertFunction(NewCS.getCalledFunction());
- CallGraphNode *CallerNode = CG[Caller];
- CallerNode->replaceCallEdge(cast<CallBase>(OldCS),
- cast<CallBase>(NewCS), NewCalleeNode);
- };
-
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*OldF);
- if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
- {ReplaceCallSite}, TTI)) {
- LocalChange = true;
-
- // Update the call graph for the newly promoted function.
- CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
- NewNode->stealCalledFunctionsFrom(OldNode);
- if (OldNode->getNumReferences() == 0)
- delete CG.removeFunctionFromModule(OldNode);
- else
- OldF->setLinkage(Function::ExternalLinkage);
-
- // And update the SCC we're iterating as well.
- SCC.ReplaceNode(OldNode, NewNode);
- }
- }
- // Remember that we changed something.
- Changed |= LocalChange;
- } while (LocalChange);
-
- return Changed;
-}
-
-bool ArgPromotion::doInitialization(CallGraph &CG) {
- return CallGraphSCCPass::doInitialization(CG);
-}
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+ Stores.push_back(Store);
+ } else if (!isa<LoadInst>(V)) {
+ return true;
+ }
+ }
+
+ // Check to make sure the pointers aren't captured
+ for (StoreInst *Store : Stores)
+ if (PtrValues.count(Store->getValueOperand()))
+ return true;
+
+ return false;
+}
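A sketch of the pattern the walk above is guarding against (hypothetical C++ code, illustrative only): once a pointer derived from the argument is stored somewhere as a value, the copy's bytes, padding included, may later be inspected through the escaped pointer, so the analysis has to report that padding can be accessed.

struct S { char c; int i; }; // 3 padding bytes between c and i

static const S *Escaped = nullptr;

// The tracked pointer (&Copy) is the *value* operand of a store: padding of
// the copy could later be read through Escaped, so the answer must be "true".
// (The dangling pointer is never dereferenced in this sketch.)
static void escapes(S Copy) { Escaped = &Copy; }

// Only loads through the pointer: the padding bytes can never be observed.
static int onlyLoads(const S *P) { return P->i; }

int main() {
  S Val{'x', 7};
  escapes(Val);
  return onlyLoads(&Val) == 7 ? 0 : 1;
}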
+
+bool ArgumentPromotionPass::areFunctionArgsABICompatible(
+ const Function &F, const TargetTransformInfo &TTI,
+ SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform) {
+ for (const Use &U : F.uses()) {
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB)
+ return false;
+ const Function *Caller = CB->getCaller();
+ const Function *Callee = CB->getCalledFunction();
+ if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) ||
+ !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform))
+ return false;
+ }
+ return true;
+}
+
+/// PromoteArguments - This method checks the specified function to see if there
+/// are any promotable arguments and if it is safe to promote the function (for
+/// example, all callers are direct). If safe to promote some arguments, it
+/// calls the DoPromotion method.
+static Function *
+promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements,
+ Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
+ ReplaceCallSite,
+ const TargetTransformInfo &TTI) {
+ // Don't perform argument promotion for naked functions; otherwise we can end
+ // up removing parameters that are seemingly 'not used' as they are referred
+ // to in the assembly.
+ if (F->hasFnAttribute(Attribute::Naked))
+ return nullptr;
+
+ // Make sure that it is local to this module.
+ if (!F->hasLocalLinkage())
+ return nullptr;
+
+ // Don't promote arguments for variadic functions. Adding, removing, or
+ // changing non-pack parameters can change the classification of pack
+ // parameters. Frontends encode that classification at the call site in the
+ // IR, while in the callee the classification is determined dynamically based
+ // on the number of registers consumed so far.
+ if (F->isVarArg())
+ return nullptr;
+
+ // Don't transform functions that receive inallocas, as the transformation may
+ // not be safe depending on calling convention.
+ if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
+ return nullptr;
+
+ // First check: see if there are any pointer arguments! If not, quick exit.
+ SmallVector<Argument *, 16> PointerArgs;
+ for (Argument &I : F->args())
+ if (I.getType()->isPointerTy())
+ PointerArgs.push_back(&I);
+ if (PointerArgs.empty())
+ return nullptr;
+
+ // Second check: make sure that all callers are direct callers. We can't
+ // transform functions that have indirect callers. Also see if the function
+ // is self-recursive and check that target features are compatible.
+ bool isSelfRecursive = false;
+ for (Use &U : F->uses()) {
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ // Must be a direct call.
+ if (CB == nullptr || !CB->isCallee(&U))
+ return nullptr;
+
+ // Can't change signature of musttail callee
+ if (CB->isMustTailCall())
+ return nullptr;
+
+ if (CB->getParent()->getParent() == F)
+ isSelfRecursive = true;
+ }
+
+ // Can't change signature of musttail caller
+ // FIXME: Support promoting whole chain of musttail functions
+ for (BasicBlock &BB : *F)
+ if (BB.getTerminatingMustTailCall())
+ return nullptr;
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ AAResults &AAR = AARGetter(*F);
+
+ // Check to see which arguments are promotable. If an argument is promotable,
+ // add it to ArgsToPromote.
+ SmallPtrSet<Argument *, 8> ArgsToPromote;
+ SmallPtrSet<Argument *, 8> ByValArgsToTransform;
+ for (Argument *PtrArg : PointerArgs) {
+ Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+
+ // Replace sret attribute with noalias. This reduces register pressure by
+ // avoiding a register copy.
+ if (PtrArg->hasStructRetAttr()) {
+ unsigned ArgNo = PtrArg->getArgNo();
+ F->removeParamAttr(ArgNo, Attribute::StructRet);
+ F->addParamAttr(ArgNo, Attribute::NoAlias);
+ for (Use &U : F->uses()) {
+ CallBase &CB = cast<CallBase>(*U.getUser());
+ CB.removeParamAttr(ArgNo, Attribute::StructRet);
+ CB.addParamAttr(ArgNo, Attribute::NoAlias);
+ }
+ }
+
+ // If this is a byval argument, and if the aggregate type is small, just
+ // pass the elements, which is always safe, if the passed value is densely
+ // packed or if we can prove the padding bytes are never accessed.
+ bool isSafeToPromote = PtrArg->hasByValAttr() &&
+ (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) ||
+ !canPaddingBeAccessed(PtrArg));
+ if (isSafeToPromote) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
+ LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '"
+ << PtrArg->getName()
+ << "' because it would require adding more"
+ << " than " << MaxElements
+ << " arguments to the function.\n");
+ continue;
+ }
+
+ // If all the elements are single-value types, we can promote it.
+ bool AllSimple = true;
+ for (const auto *EltTy : STy->elements()) {
+ if (!EltTy->isSingleValueType()) {
+ AllSimple = false;
+ break;
+ }
+ }
+
+ // Safe to transform, don't even bother trying to "promote" it.
+ // Passing the elements as a scalar will allow sroa to hack on
+ // the new alloca we introduce.
+ if (AllSimple) {
+ ByValArgsToTransform.insert(PtrArg);
+ continue;
+ }
+ }
+ }
+
+ // If the argument is a recursive type and we're in a recursive
+ // function, we could end up infinitely peeling the function argument.
+ if (isSelfRecursive) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ bool RecursiveType = false;
+ for (const auto *EltTy : STy->elements()) {
+ if (EltTy == PtrArg->getType()) {
+ RecursiveType = true;
+ break;
+ }
+ }
+ if (RecursiveType)
+ continue;
+ }
+ }
+
+ // Otherwise, see if we can promote the pointer to its value.
+ Type *ByValTy =
+ PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr;
+ if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements))
+ ArgsToPromote.insert(PtrArg);
+ }
+
+ // No promotable pointer arguments.
+ if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
+ return nullptr;
+
+ if (!ArgumentPromotionPass::areFunctionArgsABICompatible(
+ *F, TTI, ArgsToPromote, ByValArgsToTransform))
+ return nullptr;
+
+ return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+}
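A minimal callee that passes every check above (internal linkage, direct callers only, no varargs, inalloca, or musttail, and only constant-index loads that execute unconditionally), sketched in C++; with two promotable loads it stays under the default MaxElements of 3, so the pass would typically rewrite it to take two ints instead of the pointer. Hypothetical example, not from this diff.

static int sumFirstTwo(const int *p) { return p[0] + p[1]; }

int main() {
  int Vals[2] = {1, 2};
  return sumFirstTwo(Vals) == 3 ? 0 : 1;
}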
+
+PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ bool Changed = false, LocalChange;
+
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+
+ for (LazyCallGraph::Node &N : C) {
+ Function &OldF = N.getFunction();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ // FIXME: This lambda must only be used with this function. We should
+ // skip the lambda and just get the AA results directly.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ assert(&F == &OldF && "Called with an unexpected function!");
+ return FAM.getResult<AAManager>(F);
+ };
+
+ const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(OldF);
+ Function *NewF =
+ promoteArguments(&OldF, AARGetter, MaxElements, None, TTI);
+ if (!NewF)
+ continue;
+ LocalChange = true;
+
+ // Directly substitute the functions in the call graph. Note that this
+ // requires the old function to be completely dead and completely
+ // replaced by the new function. It does no call graph updates, it merely
+ // swaps out the particular function mapped to a particular node in the
+ // graph.
+ C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
+ OldF.eraseFromParent();
+ }
+
+ Changed |= LocalChange;
+ } while (LocalChange);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
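A sketch of scheduling this new-PM pass explicitly (assumes the LLVM 12 headers and the signatures shown here; for quick experiments, opt -passes=argpromotion runs the same pass):

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/ArgumentPromotion.h"

// Builds a module pipeline that runs argument promotion over each SCC in
// post order via the CGSCC-to-module adaptor.
static llvm::ModulePassManager buildArgPromoPipeline() {
  llvm::CGSCCPassManager CGPM;
  CGPM.addPass(llvm::ArgumentPromotionPass(/*MaxElements=*/3));
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
  return MPM;
}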
+
+namespace {
+
+/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+struct ArgPromotion : public CallGraphSCCPass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ explicit ArgPromotion(unsigned MaxElements = 3)
+ : CallGraphSCCPass(ID), MaxElements(MaxElements) {
+ initializeArgPromotionPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+private:
+ using llvm::Pass::doInitialization;
+
+ bool doInitialization(CallGraph &CG) override;
+
+ /// The maximum number of elements to expand, or 0 for unlimited.
+ unsigned MaxElements;
+};
+
+} // end anonymous namespace
+
+char ArgPromotion::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false, false)
+
+Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
+ return new ArgPromotion(MaxElements);
+}
+
+bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+
+ // Get the callgraph information that we need to update to reflect our
+ // changes.
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ LegacyAARGetter AARGetter(*this);
+
+ bool Changed = false, LocalChange;
+
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+ // Attempt to promote arguments from all functions in this SCC.
+ for (CallGraphNode *OldNode : SCC) {
+ Function *OldF = OldNode->getFunction();
+ if (!OldF)
+ continue;
+
+ auto ReplaceCallSite = [&](CallBase &OldCS, CallBase &NewCS) {
+ Function *Caller = OldCS.getParent()->getParent();
+ CallGraphNode *NewCalleeNode =
+ CG.getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = CG[Caller];
+ CallerNode->replaceCallEdge(cast<CallBase>(OldCS),
+ cast<CallBase>(NewCS), NewCalleeNode);
+ };
+
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*OldF);
+ if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
+ {ReplaceCallSite}, TTI)) {
+ LocalChange = true;
+
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
+ NewNode->stealCalledFunctionsFrom(OldNode);
+ if (OldNode->getNumReferences() == 0)
+ delete CG.removeFunctionFromModule(OldNode);
+ else
+ OldF->setLinkage(Function::ExternalLinkage);
+
+ // And update the SCC we're iterating as well.
+ SCC.ReplaceNode(OldNode, NewNode);
+ }
+ }
+ // Remember that we changed something.
+ Changed |= LocalChange;
+ } while (LocalChange);
+
+ return Changed;
+}
+
+bool ArgPromotion::doInitialization(CallGraph &CG) {
+ return CallGraphSCCPass::doInitialization(CG);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp
index fa23176c17..03ad451350 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp
@@ -1,82 +1,82 @@
-//===- Attributor.cpp - Module-wide attribute deduction -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements an interprocedural pass that deduces and/or propagates
-// attributes. This is done in an abstract interpretation style fixpoint
-// iteration. See the Attributor.h file comment and the class descriptions in
-// that file for more information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Attributor.h"
-
+//===- Attributor.cpp - Module-wide attribute deduction -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an interprocedural pass that deduces and/or propagates
+// attributes. This is done in an abstract interpretation style fixpoint
+// iteration. See the Attributor.h file comment and the class descriptions in
+// that file for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Attributor.h"
+
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <cassert>
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <cassert>
#include <string>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "attributor"
-
+
+using namespace llvm;
+
+#define DEBUG_TYPE "attributor"
+
DEBUG_COUNTER(ManifestDBGCounter, "attributor-manifest",
"Determine what attributes are manifested in the IR");
-STATISTIC(NumFnDeleted, "Number of functions deleted");
-STATISTIC(NumFnWithExactDefinition,
- "Number of functions with exact definitions");
-STATISTIC(NumFnWithoutExactDefinition,
- "Number of functions without exact definitions");
+STATISTIC(NumFnDeleted, "Number of functions deleted");
+STATISTIC(NumFnWithExactDefinition,
+ "Number of functions with exact definitions");
+STATISTIC(NumFnWithoutExactDefinition,
+ "Number of functions without exact definitions");
STATISTIC(NumFnShallowWrappersCreated, "Number of shallow wrappers created");
-STATISTIC(NumAttributesTimedOut,
- "Number of abstract attributes timed out before fixpoint");
-STATISTIC(NumAttributesValidFixpoint,
- "Number of abstract attributes in a valid fixpoint state");
-STATISTIC(NumAttributesManifested,
- "Number of abstract attributes manifested in IR");
-STATISTIC(NumAttributesFixedDueToRequiredDependences,
- "Number of abstract attributes fixed due to required dependences");
-
-// TODO: Determine a good default value.
-//
-// In the LLVM-TS and SPEC2006, 32 seems to not induce compile time overheads
-// (when run with the first 5 abstract attributes). The results also indicate
-// that we never reach 32 iterations but always find a fixpoint sooner.
-//
-// This will become more evolved once we perform two interleaved fixpoint
-// iterations: bottom-up and top-down.
-static cl::opt<unsigned>
- MaxFixpointIterations("attributor-max-iterations", cl::Hidden,
- cl::desc("Maximal number of fixpoint iterations."),
- cl::init(32));
+STATISTIC(NumAttributesTimedOut,
+ "Number of abstract attributes timed out before fixpoint");
+STATISTIC(NumAttributesValidFixpoint,
+ "Number of abstract attributes in a valid fixpoint state");
+STATISTIC(NumAttributesManifested,
+ "Number of abstract attributes manifested in IR");
+STATISTIC(NumAttributesFixedDueToRequiredDependences,
+ "Number of abstract attributes fixed due to required dependences");
+
+// TODO: Determine a good default value.
+//
+// In the LLVM-TS and SPEC2006, 32 seems to not induce compile time overheads
+// (when run with the first 5 abstract attributes). The results also indicate
+// that we never reach 32 iterations but always find a fixpoint sooner.
+//
+// This will become more evolved once we perform two interleaved fixpoint
+// iterations: bottom-up and top-down.
+static cl::opt<unsigned>
+ MaxFixpointIterations("attributor-max-iterations", cl::Hidden,
+ cl::desc("Maximal number of fixpoint iterations."),
+ cl::init(32));
static cl::opt<unsigned, true> MaxInitializationChainLengthX(
"attributor-max-initialization-chain-length", cl::Hidden,
@@ -85,24 +85,24 @@ static cl::opt<unsigned, true> MaxInitializationChainLengthX(
cl::location(MaxInitializationChainLength), cl::init(1024));
unsigned llvm::MaxInitializationChainLength;
-static cl::opt<bool> VerifyMaxFixpointIterations(
- "attributor-max-iterations-verify", cl::Hidden,
- cl::desc("Verify that max-iterations is a tight bound for a fixpoint"),
- cl::init(false));
-
-static cl::opt<bool> AnnotateDeclarationCallSites(
- "attributor-annotate-decl-cs", cl::Hidden,
- cl::desc("Annotate call sites of function declarations."), cl::init(false));
-
-static cl::opt<bool> EnableHeapToStack("enable-heap-to-stack-conversion",
- cl::init(true), cl::Hidden);
-
-static cl::opt<bool>
- AllowShallowWrappers("attributor-allow-shallow-wrappers", cl::Hidden,
- cl::desc("Allow the Attributor to create shallow "
- "wrappers for non-exact definitions."),
- cl::init(false));
-
+static cl::opt<bool> VerifyMaxFixpointIterations(
+ "attributor-max-iterations-verify", cl::Hidden,
+ cl::desc("Verify that max-iterations is a tight bound for a fixpoint"),
+ cl::init(false));
+
+static cl::opt<bool> AnnotateDeclarationCallSites(
+ "attributor-annotate-decl-cs", cl::Hidden,
+ cl::desc("Annotate call sites of function declarations."), cl::init(false));
+
+static cl::opt<bool> EnableHeapToStack("enable-heap-to-stack-conversion",
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+ AllowShallowWrappers("attributor-allow-shallow-wrappers", cl::Hidden,
+ cl::desc("Allow the Attributor to create shallow "
+ "wrappers for non-exact definitions."),
+ cl::init(false));
+
static cl::opt<bool>
AllowDeepWrapper("attributor-allow-deep-wrappers", cl::Hidden,
cl::desc("Allow the Attributor to use IP information "
@@ -111,12 +111,12 @@ static cl::opt<bool>
// These options can only used for debug builds.
#ifndef NDEBUG
-static cl::list<std::string>
- SeedAllowList("attributor-seed-allow-list", cl::Hidden,
+static cl::list<std::string>
+ SeedAllowList("attributor-seed-allow-list", cl::Hidden,
cl::desc("Comma seperated list of attribute names that are "
- "allowed to be seeded."),
- cl::ZeroOrMore, cl::CommaSeparated);
-
+ "allowed to be seeded."),
+ cl::ZeroOrMore, cl::CommaSeparated);
+
static cl::list<std::string> FunctionSeedAllowList(
"attributor-function-seed-allow-list", cl::Hidden,
cl::desc("Comma seperated list of function names that are "
@@ -141,194 +141,194 @@ static cl::opt<bool> PrintDependencies("attributor-print-dep", cl::Hidden,
cl::desc("Print attribute dependencies"),
cl::init(false));
-/// Logic operators for the change status enum class.
-///
-///{
+/// Logic operators for the change status enum class.
+///
+///{
ChangeStatus llvm::operator|(ChangeStatus L, ChangeStatus R) {
return L == ChangeStatus::CHANGED ? L : R;
-}
+}
ChangeStatus llvm::operator&(ChangeStatus L, ChangeStatus R) {
return L == ChangeStatus::UNCHANGED ? L : R;
-}
-///}
-
-/// Return true if \p New is equal or worse than \p Old.
-static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
- if (!Old.isIntAttribute())
- return true;
-
- return Old.getValueAsInt() >= New.getValueAsInt();
-}
-
-/// Return true if the information provided by \p Attr was added to the
-/// attribute list \p Attrs. This is only the case if it was not already present
-/// in \p Attrs at the position described by \p PK and \p AttrIdx.
-static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
- AttributeList &Attrs, int AttrIdx) {
-
- if (Attr.isEnumAttribute()) {
- Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
- if (Attr.isStringAttribute()) {
- StringRef Kind = Attr.getKindAsString();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
- if (Attr.isIntAttribute()) {
- Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
-
- llvm_unreachable("Expected enum or string attribute!");
-}
-
-Argument *IRPosition::getAssociatedArgument() const {
- if (getPositionKind() == IRP_ARGUMENT)
- return cast<Argument>(&getAnchorValue());
-
- // Not an Argument and no argument number means this is not a call site
- // argument, thus we cannot find a callback argument to return.
+}
+///}
+
+/// Return true if \p New is equal or worse than \p Old.
+static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
+ if (!Old.isIntAttribute())
+ return true;
+
+ return Old.getValueAsInt() >= New.getValueAsInt();
+}
+
+/// Return true if the information provided by \p Attr was added to the
+/// attribute list \p Attrs. This is only the case if it was not already present
+/// in \p Attrs at the position described by \p PK and \p AttrIdx.
+static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
+ AttributeList &Attrs, int AttrIdx) {
+
+ if (Attr.isEnumAttribute()) {
+ Attribute::AttrKind Kind = Attr.getKindAsEnum();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+ if (Attr.isStringAttribute()) {
+ StringRef Kind = Attr.getKindAsString();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+ if (Attr.isIntAttribute()) {
+ Attribute::AttrKind Kind = Attr.getKindAsEnum();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+
+ llvm_unreachable("Expected enum or string attribute!");
+}
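For integer attributes the helper above only lets a new value in when it is strictly stronger; e.g. trying to add dereferenceable(8) when the list already carries dereferenceable(16) is dropped, while dereferenceable(32) replaces it. A standalone sketch of that comparison rule (plain integers, not the LLVM Attribute API):

#include <cassert>
#include <cstdint>

// Mirrors the integer-attribute rule above: the incoming value only replaces
// the existing one when it is strictly larger, e.g. for dereferenceable(<n>).
static bool newValueIsEqualOrWorse(uint64_t NewVal, uint64_t OldVal) {
  return OldVal >= NewVal;
}

int main() {
  assert(newValueIsEqualOrWorse(8, 16));   // dereferenceable(16) is kept
  assert(!newValueIsEqualOrWorse(32, 16)); // upgraded to dereferenceable(32)
  return 0;
}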
+
+Argument *IRPosition::getAssociatedArgument() const {
+ if (getPositionKind() == IRP_ARGUMENT)
+ return cast<Argument>(&getAnchorValue());
+
+ // Not an Argument and no argument number means this is not a call site
+ // argument, thus we cannot find a callback argument to return.
int ArgNo = getCallSiteArgNo();
- if (ArgNo < 0)
- return nullptr;
-
- // Use abstract call sites to make the connection between the call site
- // values and the ones in callbacks. If a callback was found that makes use
- // of the underlying call site operand, we want the corresponding callback
- // callee argument and not the direct callee argument.
- Optional<Argument *> CBCandidateArg;
- SmallVector<const Use *, 4> CallbackUses;
- const auto &CB = cast<CallBase>(getAnchorValue());
- AbstractCallSite::getCallbackUses(CB, CallbackUses);
- for (const Use *U : CallbackUses) {
- AbstractCallSite ACS(U);
- assert(ACS && ACS.isCallbackCall());
- if (!ACS.getCalledFunction())
- continue;
-
- for (unsigned u = 0, e = ACS.getNumArgOperands(); u < e; u++) {
-
- // Test if the underlying call site operand is argument number u of the
- // callback callee.
- if (ACS.getCallArgOperandNo(u) != ArgNo)
- continue;
-
- assert(ACS.getCalledFunction()->arg_size() > u &&
- "ACS mapped into var-args arguments!");
- if (CBCandidateArg.hasValue()) {
- CBCandidateArg = nullptr;
- break;
- }
- CBCandidateArg = ACS.getCalledFunction()->getArg(u);
- }
- }
-
- // If we found a unique callback candidate argument, return it.
- if (CBCandidateArg.hasValue() && CBCandidateArg.getValue())
- return CBCandidateArg.getValue();
-
- // If no callbacks were found, or none used the underlying call site operand
- // exclusively, use the direct callee argument if available.
- const Function *Callee = CB.getCalledFunction();
- if (Callee && Callee->arg_size() > unsigned(ArgNo))
- return Callee->getArg(ArgNo);
-
- return nullptr;
-}
-
-ChangeStatus AbstractAttribute::update(Attributor &A) {
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- if (getState().isAtFixpoint())
- return HasChanged;
-
- LLVM_DEBUG(dbgs() << "[Attributor] Update: " << *this << "\n");
-
- HasChanged = updateImpl(A);
-
- LLVM_DEBUG(dbgs() << "[Attributor] Update " << HasChanged << " " << *this
- << "\n");
-
- return HasChanged;
-}
-
-ChangeStatus
-IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
- const ArrayRef<Attribute> &DeducedAttrs) {
- Function *ScopeFn = IRP.getAnchorScope();
- IRPosition::Kind PK = IRP.getPositionKind();
-
-  // In the following is some generic code that will manifest attributes in
- // DeducedAttrs if they improve the current IR. Due to the different
- // annotation positions we use the underlying AttributeList interface.
-
- AttributeList Attrs;
- switch (PK) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- return ChangeStatus::UNCHANGED;
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_FUNCTION:
- case IRPosition::IRP_RETURNED:
- Attrs = ScopeFn->getAttributes();
- break;
- case IRPosition::IRP_CALL_SITE:
- case IRPosition::IRP_CALL_SITE_RETURNED:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- Attrs = cast<CallBase>(IRP.getAnchorValue()).getAttributes();
- break;
- }
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- LLVMContext &Ctx = IRP.getAnchorValue().getContext();
- for (const Attribute &Attr : DeducedAttrs) {
- if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx()))
- continue;
-
- HasChanged = ChangeStatus::CHANGED;
- }
-
- if (HasChanged == ChangeStatus::UNCHANGED)
- return HasChanged;
-
- switch (PK) {
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_FUNCTION:
- case IRPosition::IRP_RETURNED:
- ScopeFn->setAttributes(Attrs);
- break;
- case IRPosition::IRP_CALL_SITE:
- case IRPosition::IRP_CALL_SITE_RETURNED:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- cast<CallBase>(IRP.getAnchorValue()).setAttributes(Attrs);
- break;
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- break;
- }
-
- return HasChanged;
-}
-
-const IRPosition IRPosition::EmptyKey(DenseMapInfo<void *>::getEmptyKey());
-const IRPosition
- IRPosition::TombstoneKey(DenseMapInfo<void *>::getTombstoneKey());
-
-SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
- IRPositions.emplace_back(IRP);
-
+ if (ArgNo < 0)
+ return nullptr;
+
+ // Use abstract call sites to make the connection between the call site
+ // values and the ones in callbacks. If a callback was found that makes use
+ // of the underlying call site operand, we want the corresponding callback
+ // callee argument and not the direct callee argument.
+ Optional<Argument *> CBCandidateArg;
+ SmallVector<const Use *, 4> CallbackUses;
+ const auto &CB = cast<CallBase>(getAnchorValue());
+ AbstractCallSite::getCallbackUses(CB, CallbackUses);
+ for (const Use *U : CallbackUses) {
+ AbstractCallSite ACS(U);
+ assert(ACS && ACS.isCallbackCall());
+ if (!ACS.getCalledFunction())
+ continue;
+
+ for (unsigned u = 0, e = ACS.getNumArgOperands(); u < e; u++) {
+
+ // Test if the underlying call site operand is argument number u of the
+ // callback callee.
+ if (ACS.getCallArgOperandNo(u) != ArgNo)
+ continue;
+
+ assert(ACS.getCalledFunction()->arg_size() > u &&
+ "ACS mapped into var-args arguments!");
+ if (CBCandidateArg.hasValue()) {
+ CBCandidateArg = nullptr;
+ break;
+ }
+ CBCandidateArg = ACS.getCalledFunction()->getArg(u);
+ }
+ }
+
+ // If we found a unique callback candidate argument, return it.
+ if (CBCandidateArg.hasValue() && CBCandidateArg.getValue())
+ return CBCandidateArg.getValue();
+
+ // If no callbacks were found, or none used the underlying call site operand
+ // exclusively, use the direct callee argument if available.
+ const Function *Callee = CB.getCalledFunction();
+ if (Callee && Callee->arg_size() > unsigned(ArgNo))
+ return Callee->getArg(ArgNo);
+
+ return nullptr;
+}
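+
+// Editorial illustration (not part of the upstream sources): resolving the
+// formal argument behind a call site argument position. `CB` is assumed to be
+// a CallBase in scope; for broker calls with callback metadata the returned
+// argument belongs to the unique callback callee, if there is one.
+// \code
+//   IRPosition CSArgPos = IRPosition::callsite_argument(CB, /* ArgNo */ 0);
+//   if (Argument *Arg = CSArgPos.getAssociatedArgument())
+//     (void)Arg; // Formal parameter of the (callback) callee.
+// \endcode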
+
+ChangeStatus AbstractAttribute::update(Attributor &A) {
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ if (getState().isAtFixpoint())
+ return HasChanged;
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Update: " << *this << "\n");
+
+ HasChanged = updateImpl(A);
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Update " << HasChanged << " " << *this
+ << "\n");
+
+ return HasChanged;
+}
+
+ChangeStatus
+IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
+ const ArrayRef<Attribute> &DeducedAttrs) {
+ Function *ScopeFn = IRP.getAnchorScope();
+ IRPosition::Kind PK = IRP.getPositionKind();
+
+  // In the following is some generic code that will manifest attributes in
+ // DeducedAttrs if they improve the current IR. Due to the different
+ // annotation positions we use the underlying AttributeList interface.
+
+ AttributeList Attrs;
+ switch (PK) {
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ return ChangeStatus::UNCHANGED;
+ case IRPosition::IRP_ARGUMENT:
+ case IRPosition::IRP_FUNCTION:
+ case IRPosition::IRP_RETURNED:
+ Attrs = ScopeFn->getAttributes();
+ break;
+ case IRPosition::IRP_CALL_SITE:
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ case IRPosition::IRP_CALL_SITE_ARGUMENT:
+ Attrs = cast<CallBase>(IRP.getAnchorValue()).getAttributes();
+ break;
+ }
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ LLVMContext &Ctx = IRP.getAnchorValue().getContext();
+ for (const Attribute &Attr : DeducedAttrs) {
+ if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx()))
+ continue;
+
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ if (HasChanged == ChangeStatus::UNCHANGED)
+ return HasChanged;
+
+ switch (PK) {
+ case IRPosition::IRP_ARGUMENT:
+ case IRPosition::IRP_FUNCTION:
+ case IRPosition::IRP_RETURNED:
+ ScopeFn->setAttributes(Attrs);
+ break;
+ case IRPosition::IRP_CALL_SITE:
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ case IRPosition::IRP_CALL_SITE_ARGUMENT:
+ cast<CallBase>(IRP.getAnchorValue()).setAttributes(Attrs);
+ break;
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ break;
+ }
+
+ return HasChanged;
+}
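+
+// Editorial illustration (not part of the upstream sources): manifesting a
+// deduced attribute through the generic helper above. `A`, `F`, and `Ctx` are
+// assumed to be an Attributor, a Function, and its LLVMContext.
+// \code
+//   ChangeStatus CS = IRAttributeManifest::manifestAttrs(
+//       A, IRPosition::function(F),
+//       {Attribute::get(Ctx, Attribute::NoUnwind)});
+//   // CS is CHANGED only if 'nounwind' was not already present.
+// \endcode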
+
+const IRPosition IRPosition::EmptyKey(DenseMapInfo<void *>::getEmptyKey());
+const IRPosition
+ IRPosition::TombstoneKey(DenseMapInfo<void *>::getTombstoneKey());
+
+SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
+ IRPositions.emplace_back(IRP);
+
   // Helper to determine if operand bundles on a call site are benign or
// potentially problematic. We handle only llvm.assume for now.
auto CanIgnoreOperandBundles = [](const CallBase &CB) {
@@ -336,843 +336,843 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
             cast<IntrinsicInst>(CB).getIntrinsicID() == Intrinsic::assume);
};
- const auto *CB = dyn_cast<CallBase>(&IRP.getAnchorValue());
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_FUNCTION:
- return;
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_RETURNED:
- IRPositions.emplace_back(IRPosition::function(*IRP.getAnchorScope()));
- return;
- case IRPosition::IRP_CALL_SITE:
- assert(CB && "Expected call site!");
- // TODO: We need to look at the operand bundles similar to the redirection
- // in CallBase.
+ const auto *CB = dyn_cast<CallBase>(&IRP.getAnchorValue());
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ case IRPosition::IRP_FUNCTION:
+ return;
+ case IRPosition::IRP_ARGUMENT:
+ case IRPosition::IRP_RETURNED:
+ IRPositions.emplace_back(IRPosition::function(*IRP.getAnchorScope()));
+ return;
+ case IRPosition::IRP_CALL_SITE:
+ assert(CB && "Expected call site!");
+ // TODO: We need to look at the operand bundles similar to the redirection
+ // in CallBase.
if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB))
- if (const Function *Callee = CB->getCalledFunction())
- IRPositions.emplace_back(IRPosition::function(*Callee));
- return;
- case IRPosition::IRP_CALL_SITE_RETURNED:
- assert(CB && "Expected call site!");
- // TODO: We need to look at the operand bundles similar to the redirection
- // in CallBase.
+ if (const Function *Callee = CB->getCalledFunction())
+ IRPositions.emplace_back(IRPosition::function(*Callee));
+ return;
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ assert(CB && "Expected call site!");
+ // TODO: We need to look at the operand bundles similar to the redirection
+ // in CallBase.
if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB)) {
- if (const Function *Callee = CB->getCalledFunction()) {
- IRPositions.emplace_back(IRPosition::returned(*Callee));
- IRPositions.emplace_back(IRPosition::function(*Callee));
- for (const Argument &Arg : Callee->args())
- if (Arg.hasReturnedAttr()) {
- IRPositions.emplace_back(
- IRPosition::callsite_argument(*CB, Arg.getArgNo()));
- IRPositions.emplace_back(
- IRPosition::value(*CB->getArgOperand(Arg.getArgNo())));
- IRPositions.emplace_back(IRPosition::argument(Arg));
- }
- }
- }
- IRPositions.emplace_back(IRPosition::callsite_function(*CB));
- return;
- case IRPosition::IRP_CALL_SITE_ARGUMENT: {
+ if (const Function *Callee = CB->getCalledFunction()) {
+ IRPositions.emplace_back(IRPosition::returned(*Callee));
+ IRPositions.emplace_back(IRPosition::function(*Callee));
+ for (const Argument &Arg : Callee->args())
+ if (Arg.hasReturnedAttr()) {
+ IRPositions.emplace_back(
+ IRPosition::callsite_argument(*CB, Arg.getArgNo()));
+ IRPositions.emplace_back(
+ IRPosition::value(*CB->getArgOperand(Arg.getArgNo())));
+ IRPositions.emplace_back(IRPosition::argument(Arg));
+ }
+ }
+ }
+ IRPositions.emplace_back(IRPosition::callsite_function(*CB));
+ return;
+ case IRPosition::IRP_CALL_SITE_ARGUMENT: {
assert(CB && "Expected call site!");
- // TODO: We need to look at the operand bundles similar to the redirection
- // in CallBase.
+ // TODO: We need to look at the operand bundles similar to the redirection
+ // in CallBase.
if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB)) {
- const Function *Callee = CB->getCalledFunction();
+ const Function *Callee = CB->getCalledFunction();
if (Callee) {
if (Argument *Arg = IRP.getAssociatedArgument())
IRPositions.emplace_back(IRPosition::argument(*Arg));
- IRPositions.emplace_back(IRPosition::function(*Callee));
+ IRPositions.emplace_back(IRPosition::function(*Callee));
}
- }
- IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue()));
- return;
- }
- }
-}
-
-bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs,
- bool IgnoreSubsumingPositions, Attributor *A) const {
- SmallVector<Attribute, 4> Attrs;
- for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
- for (Attribute::AttrKind AK : AKs)
- if (EquivIRP.getAttrsFromIRAttr(AK, Attrs))
- return true;
- // The first position returned by the SubsumingPositionIterator is
- // always the position itself. If we ignore subsuming positions we
- // are done after the first iteration.
- if (IgnoreSubsumingPositions)
- break;
- }
- if (A)
- for (Attribute::AttrKind AK : AKs)
- if (getAttrsFromAssumes(AK, Attrs, *A))
- return true;
- return false;
-}
-
-void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs,
- SmallVectorImpl<Attribute> &Attrs,
- bool IgnoreSubsumingPositions, Attributor *A) const {
- for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
- for (Attribute::AttrKind AK : AKs)
- EquivIRP.getAttrsFromIRAttr(AK, Attrs);
- // The first position returned by the SubsumingPositionIterator is
- // always the position itself. If we ignore subsuming positions we
- // are done after the first iteration.
- if (IgnoreSubsumingPositions)
- break;
- }
- if (A)
- for (Attribute::AttrKind AK : AKs)
- getAttrsFromAssumes(AK, Attrs, *A);
-}
-
-bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK,
- SmallVectorImpl<Attribute> &Attrs) const {
- if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT)
- return false;
-
- AttributeList AttrList;
- if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue()))
- AttrList = CB->getAttributes();
- else
- AttrList = getAssociatedFunction()->getAttributes();
-
- bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK);
- if (HasAttr)
- Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK));
- return HasAttr;
-}
-
-bool IRPosition::getAttrsFromAssumes(Attribute::AttrKind AK,
- SmallVectorImpl<Attribute> &Attrs,
- Attributor &A) const {
- assert(getPositionKind() != IRP_INVALID && "Did expect a valid position!");
- Value &AssociatedValue = getAssociatedValue();
-
- const Assume2KnowledgeMap &A2K =
- A.getInfoCache().getKnowledgeMap().lookup({&AssociatedValue, AK});
-
- // Check if we found any potential assume use, if not we don't need to create
- // explorer iterators.
- if (A2K.empty())
- return false;
-
- LLVMContext &Ctx = AssociatedValue.getContext();
- unsigned AttrsSize = Attrs.size();
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
- auto EIt = Explorer.begin(getCtxI()), EEnd = Explorer.end(getCtxI());
- for (auto &It : A2K)
- if (Explorer.findInContextOf(It.first, EIt, EEnd))
- Attrs.push_back(Attribute::get(Ctx, AK, It.second.Max));
- return AttrsSize != Attrs.size();
-}
-
-void IRPosition::verify() {
-#ifdef EXPENSIVE_CHECKS
- switch (getPositionKind()) {
- case IRP_INVALID:
- assert(!Enc.getOpaqueValue() &&
- "Expected a nullptr for an invalid position!");
- return;
- case IRP_FLOAT:
- assert((!isa<CallBase>(&getAssociatedValue()) &&
- !isa<Argument>(&getAssociatedValue())) &&
- "Expected specialized kind for call base and argument values!");
- return;
- case IRP_RETURNED:
- assert(isa<Function>(getAsValuePtr()) &&
- "Expected function for a 'returned' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_CALL_SITE_RETURNED:
- assert((isa<CallBase>(getAsValuePtr())) &&
- "Expected call base for 'call site returned' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_CALL_SITE:
- assert((isa<CallBase>(getAsValuePtr())) &&
- "Expected call base for 'call site function' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_FUNCTION:
- assert(isa<Function>(getAsValuePtr()) &&
- "Expected function for a 'function' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_ARGUMENT:
-    assert(isa<Argument>(getAsValuePtr()) &&
-           "Expected argument for an 'argument' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_CALL_SITE_ARGUMENT: {
- Use *U = getAsUsePtr();
- assert(U && "Expected use for a 'call site argument' position!");
- assert(isa<CallBase>(U->getUser()) &&
- "Expected call base user for a 'call site argument' position!");
- assert(cast<CallBase>(U->getUser())->isArgOperand(U) &&
- "Expected call base argument operand for a 'call site argument' "
- "position");
- assert(cast<CallBase>(U->getUser())->getArgOperandNo(U) ==
+ }
+ IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue()));
+ return;
+ }
+ }
+}
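+
+// Editorial illustration (not part of the upstream sources): for a call site
+// argument position the sketch below visits the position itself, then the
+// matching callee argument (if it can be resolved), the callee function, and
+// finally the floating value position of the passed operand. `CSArgPos` is a
+// hypothetical call site argument position created elsewhere.
+// \code
+//   for (const IRPosition &Subsuming : SubsumingPositionIterator(CSArgPos))
+//     (void)Subsuming; // Consult attributes at each subsuming position.
+// \endcode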
+
+bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs,
+ bool IgnoreSubsumingPositions, Attributor *A) const {
+ SmallVector<Attribute, 4> Attrs;
+ for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
+ for (Attribute::AttrKind AK : AKs)
+ if (EquivIRP.getAttrsFromIRAttr(AK, Attrs))
+ return true;
+ // The first position returned by the SubsumingPositionIterator is
+ // always the position itself. If we ignore subsuming positions we
+ // are done after the first iteration.
+ if (IgnoreSubsumingPositions)
+ break;
+ }
+ if (A)
+ for (Attribute::AttrKind AK : AKs)
+ if (getAttrsFromAssumes(AK, Attrs, *A))
+ return true;
+ return false;
+}
+
+void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs,
+ SmallVectorImpl<Attribute> &Attrs,
+ bool IgnoreSubsumingPositions, Attributor *A) const {
+ for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
+ for (Attribute::AttrKind AK : AKs)
+ EquivIRP.getAttrsFromIRAttr(AK, Attrs);
+ // The first position returned by the SubsumingPositionIterator is
+ // always the position itself. If we ignore subsuming positions we
+ // are done after the first iteration.
+ if (IgnoreSubsumingPositions)
+ break;
+ }
+ if (A)
+ for (Attribute::AttrKind AK : AKs)
+ getAttrsFromAssumes(AK, Attrs, *A);
+}
+
+bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK,
+ SmallVectorImpl<Attribute> &Attrs) const {
+ if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT)
+ return false;
+
+ AttributeList AttrList;
+ if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue()))
+ AttrList = CB->getAttributes();
+ else
+ AttrList = getAssociatedFunction()->getAttributes();
+
+ bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK);
+ if (HasAttr)
+ Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK));
+ return HasAttr;
+}
+
+bool IRPosition::getAttrsFromAssumes(Attribute::AttrKind AK,
+ SmallVectorImpl<Attribute> &Attrs,
+ Attributor &A) const {
+ assert(getPositionKind() != IRP_INVALID && "Did expect a valid position!");
+ Value &AssociatedValue = getAssociatedValue();
+
+ const Assume2KnowledgeMap &A2K =
+ A.getInfoCache().getKnowledgeMap().lookup({&AssociatedValue, AK});
+
+ // Check if we found any potential assume use, if not we don't need to create
+ // explorer iterators.
+ if (A2K.empty())
+ return false;
+
+ LLVMContext &Ctx = AssociatedValue.getContext();
+ unsigned AttrsSize = Attrs.size();
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+ auto EIt = Explorer.begin(getCtxI()), EEnd = Explorer.end(getCtxI());
+ for (auto &It : A2K)
+ if (Explorer.findInContextOf(It.first, EIt, EEnd))
+ Attrs.push_back(Attribute::get(Ctx, AK, It.second.Max));
+ return AttrsSize != Attrs.size();
+}
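+
+// Editorial illustration (not part of the upstream sources): callers reach
+// the assume-based path above by passing an Attributor to hasAttr/getAttrs.
+// The knowledge map is fed from llvm.assume operand bundles (e.g. a
+// "nonnull"(%p) bundle), and such knowledge only counts if the assume lies in
+// the must-be-executed context of the position. `V` and `A` are assumed to be
+// a Value and an Attributor in scope.
+// \code
+//   bool KnownNonNull = IRPosition::value(V).hasAttr(
+//       {Attribute::NonNull}, /* IgnoreSubsumingPositions */ false, &A);
+// \endcode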
+
+void IRPosition::verify() {
+#ifdef EXPENSIVE_CHECKS
+ switch (getPositionKind()) {
+ case IRP_INVALID:
+ assert(!Enc.getOpaqueValue() &&
+ "Expected a nullptr for an invalid position!");
+ return;
+ case IRP_FLOAT:
+ assert((!isa<CallBase>(&getAssociatedValue()) &&
+ !isa<Argument>(&getAssociatedValue())) &&
+ "Expected specialized kind for call base and argument values!");
+ return;
+ case IRP_RETURNED:
+ assert(isa<Function>(getAsValuePtr()) &&
+ "Expected function for a 'returned' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_CALL_SITE_RETURNED:
+ assert((isa<CallBase>(getAsValuePtr())) &&
+ "Expected call base for 'call site returned' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_CALL_SITE:
+ assert((isa<CallBase>(getAsValuePtr())) &&
+ "Expected call base for 'call site function' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_FUNCTION:
+ assert(isa<Function>(getAsValuePtr()) &&
+ "Expected function for a 'function' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_ARGUMENT:
+    assert(isa<Argument>(getAsValuePtr()) &&
+           "Expected argument for an 'argument' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_CALL_SITE_ARGUMENT: {
+ Use *U = getAsUsePtr();
+ assert(U && "Expected use for a 'call site argument' position!");
+ assert(isa<CallBase>(U->getUser()) &&
+ "Expected call base user for a 'call site argument' position!");
+ assert(cast<CallBase>(U->getUser())->isArgOperand(U) &&
+ "Expected call base argument operand for a 'call site argument' "
+ "position");
+ assert(cast<CallBase>(U->getUser())->getArgOperandNo(U) ==
unsigned(getCallSiteArgNo()) &&
- "Argument number mismatch!");
- assert(U->get() == &getAssociatedValue() && "Associated value mismatch!");
- return;
- }
- }
-#endif
-}
-
-Optional<Constant *>
-Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
- bool &UsedAssumedInformation) {
- const auto &ValueSimplifyAA = getAAFor<AAValueSimplify>(
- AA, IRPosition::value(V), /* TrackDependence */ false);
- Optional<Value *> SimplifiedV =
- ValueSimplifyAA.getAssumedSimplifiedValue(*this);
- bool IsKnown = ValueSimplifyAA.isKnown();
- UsedAssumedInformation |= !IsKnown;
- if (!SimplifiedV.hasValue()) {
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return llvm::None;
- }
- if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) {
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return llvm::None;
- }
- Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.getValue());
- if (CI && CI->getType() != V.getType()) {
-    // TODO: Check for a safe conversion.
- return nullptr;
- }
- if (CI)
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return CI;
-}
-
-Attributor::~Attributor() {
- // The abstract attributes are allocated via the BumpPtrAllocator Allocator,
- // thus we cannot delete them. We can, and want to, destruct them though.
+ "Argument number mismatch!");
+ assert(U->get() == &getAssociatedValue() && "Associated value mismatch!");
+ return;
+ }
+ }
+#endif
+}
+
+Optional<Constant *>
+Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
+ bool &UsedAssumedInformation) {
+ const auto &ValueSimplifyAA = getAAFor<AAValueSimplify>(
+ AA, IRPosition::value(V), /* TrackDependence */ false);
+ Optional<Value *> SimplifiedV =
+ ValueSimplifyAA.getAssumedSimplifiedValue(*this);
+ bool IsKnown = ValueSimplifyAA.isKnown();
+ UsedAssumedInformation |= !IsKnown;
+ if (!SimplifiedV.hasValue()) {
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return llvm::None;
+ }
+ if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) {
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return llvm::None;
+ }
+ Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.getValue());
+ if (CI && CI->getType() != V.getType()) {
+    // TODO: Check for a safe conversion.
+ return nullptr;
+ }
+ if (CI)
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return CI;
+}
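+
+// Editorial illustration (not part of the upstream sources): typical use of
+// getAssumedConstant from an abstract attribute's update, mirroring the use
+// in checkForAllUses below. `A` is an Attributor, `V` a Value, and `*this`
+// the querying AbstractAttribute; all are assumed to be in scope.
+// \code
+//   bool UsedAssumedInformation = false;
+//   Optional<Constant *> C =
+//       A.getAssumedConstant(V, *this, UsedAssumedInformation);
+//   if (C.hasValue() && C.getValue())
+//     (void)C.getValue(); // V may be treated as this constant from here on.
+// \endcode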
+
+Attributor::~Attributor() {
+ // The abstract attributes are allocated via the BumpPtrAllocator Allocator,
+ // thus we cannot delete them. We can, and want to, destruct them though.
for (auto &DepAA : DG.SyntheticRoot.Deps) {
AbstractAttribute *AA = cast<AbstractAttribute>(DepAA.getPointer());
- AA->~AbstractAttribute();
- }
-}
-
-bool Attributor::isAssumedDead(const AbstractAttribute &AA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- const IRPosition &IRP = AA.getIRPosition();
- if (!Functions.count(IRP.getAnchorScope()))
- return false;
- return isAssumedDead(IRP, &AA, FnLivenessAA, CheckBBLivenessOnly, DepClass);
-}
-
-bool Attributor::isAssumedDead(const Use &U,
- const AbstractAttribute *QueryingAA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- Instruction *UserI = dyn_cast<Instruction>(U.getUser());
- if (!UserI)
- return isAssumedDead(IRPosition::value(*U.get()), QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
-
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- // For call site argument uses we can check if the argument is
- // unused/dead.
- if (CB->isArgOperand(&U)) {
- const IRPosition &CSArgPos =
- IRPosition::callsite_argument(*CB, CB->getArgOperandNo(&U));
- return isAssumedDead(CSArgPos, QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
- }
- } else if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) {
- const IRPosition &RetPos = IRPosition::returned(*RI->getFunction());
- return isAssumedDead(RetPos, QueryingAA, FnLivenessAA, CheckBBLivenessOnly,
- DepClass);
- } else if (PHINode *PHI = dyn_cast<PHINode>(UserI)) {
- BasicBlock *IncomingBB = PHI->getIncomingBlock(U);
- return isAssumedDead(*IncomingBB->getTerminator(), QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
- }
-
- return isAssumedDead(IRPosition::value(*UserI), QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
-}
-
-bool Attributor::isAssumedDead(const Instruction &I,
- const AbstractAttribute *QueryingAA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- if (!FnLivenessAA)
- FnLivenessAA = lookupAAFor<AAIsDead>(IRPosition::function(*I.getFunction()),
- QueryingAA,
- /* TrackDependence */ false);
-
- // If we have a context instruction and a liveness AA we use it.
- if (FnLivenessAA &&
- FnLivenessAA->getIRPosition().getAnchorScope() == I.getFunction() &&
- FnLivenessAA->isAssumedDead(&I)) {
- if (QueryingAA)
- recordDependence(*FnLivenessAA, *QueryingAA, DepClass);
- return true;
- }
-
- if (CheckBBLivenessOnly)
- return false;
-
- const AAIsDead &IsDeadAA = getOrCreateAAFor<AAIsDead>(
- IRPosition::value(I), QueryingAA, /* TrackDependence */ false);
- // Don't check liveness for AAIsDead.
- if (QueryingAA == &IsDeadAA)
- return false;
-
- if (IsDeadAA.isAssumedDead()) {
- if (QueryingAA)
- recordDependence(IsDeadAA, *QueryingAA, DepClass);
- return true;
- }
-
- return false;
-}
-
-bool Attributor::isAssumedDead(const IRPosition &IRP,
- const AbstractAttribute *QueryingAA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- Instruction *CtxI = IRP.getCtxI();
- if (CtxI &&
- isAssumedDead(*CtxI, QueryingAA, FnLivenessAA,
- /* CheckBBLivenessOnly */ true,
- CheckBBLivenessOnly ? DepClass : DepClassTy::OPTIONAL))
- return true;
-
- if (CheckBBLivenessOnly)
- return false;
-
- // If we haven't succeeded we query the specific liveness info for the IRP.
- const AAIsDead *IsDeadAA;
- if (IRP.getPositionKind() == IRPosition::IRP_CALL_SITE)
- IsDeadAA = &getOrCreateAAFor<AAIsDead>(
- IRPosition::callsite_returned(cast<CallBase>(IRP.getAssociatedValue())),
- QueryingAA, /* TrackDependence */ false);
- else
- IsDeadAA = &getOrCreateAAFor<AAIsDead>(IRP, QueryingAA,
- /* TrackDependence */ false);
- // Don't check liveness for AAIsDead.
- if (QueryingAA == IsDeadAA)
- return false;
-
- if (IsDeadAA->isAssumedDead()) {
- if (QueryingAA)
- recordDependence(*IsDeadAA, *QueryingAA, DepClass);
- return true;
- }
-
- return false;
-}
-
-bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
- const AbstractAttribute &QueryingAA,
- const Value &V, DepClassTy LivenessDepClass) {
-
- // Check the trivial case first as it catches void values.
- if (V.use_empty())
- return true;
-
- // If the value is replaced by another one, for now a constant, we do not have
- // uses. Note that this requires users of `checkForAllUses` to not recurse but
- // instead use the `follow` callback argument to look at transitive users,
- // however, that should be clear from the presence of the argument.
- bool UsedAssumedInformation = false;
- Optional<Constant *> C =
- getAssumedConstant(V, QueryingAA, UsedAssumedInformation);
- if (C.hasValue() && C.getValue()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Value is simplified, uses skipped: " << V
- << " -> " << *C.getValue() << "\n");
- return true;
- }
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- SmallVector<const Use *, 16> Worklist;
- SmallPtrSet<const Use *, 16> Visited;
-
- for (const Use &U : V.uses())
- Worklist.push_back(&U);
-
- LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size()
- << " initial uses to check\n");
-
- const Function *ScopeFn = IRP.getAnchorScope();
- const auto *LivenessAA =
- ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn),
- /* TrackDependence */ false)
- : nullptr;
-
- while (!Worklist.empty()) {
- const Use *U = Worklist.pop_back_val();
- if (!Visited.insert(U).second)
- continue;
- LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << " in "
- << *U->getUser() << "\n");
- if (isAssumedDead(*U, &QueryingAA, LivenessAA,
- /* CheckBBLivenessOnly */ false, LivenessDepClass)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
- continue;
- }
- if (U->getUser()->isDroppable()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Droppable user, skip!\n");
- continue;
- }
-
- bool Follow = false;
- if (!Pred(*U, Follow))
- return false;
- if (!Follow)
- continue;
- for (const Use &UU : U->getUser()->uses())
- Worklist.push_back(&UU);
- }
-
- return true;
-}
-
-bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
- const AbstractAttribute &QueryingAA,
- bool RequireAllCallSites,
- bool &AllCallSitesKnown) {
- // We can try to determine information from
-  // the call sites. However, this is only possible if all call sites are known,
- // hence the function has internal linkage.
- const IRPosition &IRP = QueryingAA.getIRPosition();
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction) {
- LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP
- << "\n");
- AllCallSitesKnown = false;
- return false;
- }
-
- return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites,
- &QueryingAA, AllCallSitesKnown);
-}
-
-bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
- const Function &Fn,
- bool RequireAllCallSites,
- const AbstractAttribute *QueryingAA,
- bool &AllCallSitesKnown) {
- if (RequireAllCallSites && !Fn.hasLocalLinkage()) {
- LLVM_DEBUG(
- dbgs()
- << "[Attributor] Function " << Fn.getName()
- << " has no internal linkage, hence not all call sites are known\n");
- AllCallSitesKnown = false;
- return false;
- }
-
- // If we do not require all call sites we might not see all.
- AllCallSitesKnown = RequireAllCallSites;
-
- SmallVector<const Use *, 8> Uses(make_pointer_range(Fn.uses()));
- for (unsigned u = 0; u < Uses.size(); ++u) {
- const Use &U = *Uses[u];
- LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << *U << " in "
- << *U.getUser() << "\n");
- if (isAssumedDead(U, QueryingAA, nullptr, /* CheckBBLivenessOnly */ true)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
- continue;
- }
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
- if (CE->isCast() && CE->getType()->isPointerTy() &&
- CE->getType()->getPointerElementType()->isFunctionTy()) {
- for (const Use &CEU : CE->uses())
- Uses.push_back(&CEU);
- continue;
- }
- }
-
- AbstractCallSite ACS(&U);
- if (!ACS) {
- LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName()
- << " has non call site use " << *U.get() << " in "
- << *U.getUser() << "\n");
- // BlockAddress users are allowed.
- if (isa<BlockAddress>(U.getUser()))
- continue;
- return false;
- }
-
- const Use *EffectiveUse =
- ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U;
- if (!ACS.isCallee(EffectiveUse)) {
- if (!RequireAllCallSites)
- continue;
- LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser()
- << " is an invalid use of " << Fn.getName() << "\n");
- return false;
- }
-
- // Make sure the arguments that can be matched between the call site and the
-    // callee agree on their type. It is unlikely they do not and it doesn't
- // make sense for all attributes to know/care about this.
- assert(&Fn == ACS.getCalledFunction() && "Expected known callee");
- unsigned MinArgsParams =
- std::min(size_t(ACS.getNumArgOperands()), Fn.arg_size());
- for (unsigned u = 0; u < MinArgsParams; ++u) {
- Value *CSArgOp = ACS.getCallArgOperand(u);
- if (CSArgOp && Fn.getArg(u)->getType() != CSArgOp->getType()) {
- LLVM_DEBUG(
- dbgs() << "[Attributor] Call site / callee argument type mismatch ["
- << u << "@" << Fn.getName() << ": "
- << *Fn.getArg(u)->getType() << " vs. "
- << *ACS.getCallArgOperand(u)->getType() << "\n");
- return false;
- }
- }
-
- if (Pred(ACS))
- continue;
-
- LLVM_DEBUG(dbgs() << "[Attributor] Call site callback failed for "
- << *ACS.getInstruction() << "\n");
- return false;
- }
-
- return true;
-}
-
-bool Attributor::checkForAllReturnedValuesAndReturnInsts(
- function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred,
- const AbstractAttribute &QueryingAA) {
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- // Since we need to provide return instructions we have to have an exact
- // definition.
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // If this is a call site query we use the call site specific return values
- // and liveness information.
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
- const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
- if (!AARetVal.getState().isValidState())
- return false;
-
- return AARetVal.checkForAllReturnedValuesAndReturnInsts(Pred);
-}
-
-bool Attributor::checkForAllReturnedValues(
- function_ref<bool(Value &)> Pred, const AbstractAttribute &QueryingAA) {
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
- const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
- if (!AARetVal.getState().isValidState())
- return false;
-
- return AARetVal.checkForAllReturnedValuesAndReturnInsts(
- [&](Value &RV, const SmallSetVector<ReturnInst *, 4> &) {
- return Pred(RV);
- });
-}
-
-static bool checkForAllInstructionsImpl(
- Attributor *A, InformationCache::OpcodeInstMapTy &OpcodeInstMap,
- function_ref<bool(Instruction &)> Pred, const AbstractAttribute *QueryingAA,
- const AAIsDead *LivenessAA, const ArrayRef<unsigned> &Opcodes,
- bool CheckBBLivenessOnly = false) {
- for (unsigned Opcode : Opcodes) {
- // Check if we have instructions with this opcode at all first.
- auto *Insts = OpcodeInstMap.lookup(Opcode);
- if (!Insts)
- continue;
-
- for (Instruction *I : *Insts) {
- // Skip dead instructions.
- if (A && A->isAssumedDead(IRPosition::value(*I), QueryingAA, LivenessAA,
- CheckBBLivenessOnly))
- continue;
-
- if (!Pred(*I))
- return false;
- }
- }
- return true;
-}
-
-bool Attributor::checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
- const AbstractAttribute &QueryingAA,
- const ArrayRef<unsigned> &Opcodes,
- bool CheckBBLivenessOnly) {
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- // Since we need to provide instructions we have to have an exact definition.
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ AA->~AbstractAttribute();
+ }
+}
+
+bool Attributor::isAssumedDead(const AbstractAttribute &AA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ const IRPosition &IRP = AA.getIRPosition();
+ if (!Functions.count(IRP.getAnchorScope()))
+ return false;
+ return isAssumedDead(IRP, &AA, FnLivenessAA, CheckBBLivenessOnly, DepClass);
+}
+
+bool Attributor::isAssumedDead(const Use &U,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ Instruction *UserI = dyn_cast<Instruction>(U.getUser());
+ if (!UserI)
+ return isAssumedDead(IRPosition::value(*U.get()), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ // For call site argument uses we can check if the argument is
+ // unused/dead.
+ if (CB->isArgOperand(&U)) {
+ const IRPosition &CSArgPos =
+ IRPosition::callsite_argument(*CB, CB->getArgOperandNo(&U));
+ return isAssumedDead(CSArgPos, QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+ }
+ } else if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) {
+ const IRPosition &RetPos = IRPosition::returned(*RI->getFunction());
+ return isAssumedDead(RetPos, QueryingAA, FnLivenessAA, CheckBBLivenessOnly,
+ DepClass);
+ } else if (PHINode *PHI = dyn_cast<PHINode>(UserI)) {
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(U);
+ return isAssumedDead(*IncomingBB->getTerminator(), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+ }
+
+ return isAssumedDead(IRPosition::value(*UserI), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+}
+
+bool Attributor::isAssumedDead(const Instruction &I,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ if (!FnLivenessAA)
+ FnLivenessAA = lookupAAFor<AAIsDead>(IRPosition::function(*I.getFunction()),
+ QueryingAA,
+ /* TrackDependence */ false);
+
+ // If we have a context instruction and a liveness AA we use it.
+ if (FnLivenessAA &&
+ FnLivenessAA->getIRPosition().getAnchorScope() == I.getFunction() &&
+ FnLivenessAA->isAssumedDead(&I)) {
+ if (QueryingAA)
+ recordDependence(*FnLivenessAA, *QueryingAA, DepClass);
+ return true;
+ }
+
+ if (CheckBBLivenessOnly)
+ return false;
+
+ const AAIsDead &IsDeadAA = getOrCreateAAFor<AAIsDead>(
+ IRPosition::value(I), QueryingAA, /* TrackDependence */ false);
+ // Don't check liveness for AAIsDead.
+ if (QueryingAA == &IsDeadAA)
+ return false;
+
+ if (IsDeadAA.isAssumedDead()) {
+ if (QueryingAA)
+ recordDependence(IsDeadAA, *QueryingAA, DepClass);
+ return true;
+ }
+
+ return false;
+}
+
+bool Attributor::isAssumedDead(const IRPosition &IRP,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ Instruction *CtxI = IRP.getCtxI();
+ if (CtxI &&
+ isAssumedDead(*CtxI, QueryingAA, FnLivenessAA,
+ /* CheckBBLivenessOnly */ true,
+ CheckBBLivenessOnly ? DepClass : DepClassTy::OPTIONAL))
+ return true;
+
+ if (CheckBBLivenessOnly)
+ return false;
+
+ // If we haven't succeeded we query the specific liveness info for the IRP.
+ const AAIsDead *IsDeadAA;
+ if (IRP.getPositionKind() == IRPosition::IRP_CALL_SITE)
+ IsDeadAA = &getOrCreateAAFor<AAIsDead>(
+ IRPosition::callsite_returned(cast<CallBase>(IRP.getAssociatedValue())),
+ QueryingAA, /* TrackDependence */ false);
+ else
+ IsDeadAA = &getOrCreateAAFor<AAIsDead>(IRP, QueryingAA,
+ /* TrackDependence */ false);
+ // Don't check liveness for AAIsDead.
+ if (QueryingAA == IsDeadAA)
+ return false;
+
+ if (IsDeadAA->isAssumedDead()) {
+ if (QueryingAA)
+ recordDependence(*IsDeadAA, *QueryingAA, DepClass);
+ return true;
+ }
+
+ return false;
+}
+
+bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
+ const AbstractAttribute &QueryingAA,
+ const Value &V, DepClassTy LivenessDepClass) {
+
+ // Check the trivial case first as it catches void values.
+ if (V.use_empty())
+ return true;
+
+ // If the value is replaced by another one, for now a constant, we do not have
+ // uses. Note that this requires users of `checkForAllUses` to not recurse but
+ // instead use the `follow` callback argument to look at transitive users,
+ // however, that should be clear from the presence of the argument.
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ getAssumedConstant(V, QueryingAA, UsedAssumedInformation);
+ if (C.hasValue() && C.getValue()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Value is simplified, uses skipped: " << V
+ << " -> " << *C.getValue() << "\n");
+ return true;
+ }
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ SmallVector<const Use *, 16> Worklist;
+ SmallPtrSet<const Use *, 16> Visited;
+
+ for (const Use &U : V.uses())
+ Worklist.push_back(&U);
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size()
+ << " initial uses to check\n");
+
+ const Function *ScopeFn = IRP.getAnchorScope();
+ const auto *LivenessAA =
+ ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn),
+ /* TrackDependence */ false)
+ : nullptr;
+
+ while (!Worklist.empty()) {
+ const Use *U = Worklist.pop_back_val();
+ if (!Visited.insert(U).second)
+ continue;
+ LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << " in "
+ << *U->getUser() << "\n");
+ if (isAssumedDead(*U, &QueryingAA, LivenessAA,
+ /* CheckBBLivenessOnly */ false, LivenessDepClass)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
+ continue;
+ }
+ if (U->getUser()->isDroppable()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Droppable user, skip!\n");
+ continue;
+ }
+
+ bool Follow = false;
+ if (!Pred(*U, Follow))
+ return false;
+ if (!Follow)
+ continue;
+ for (const Use &UU : U->getUser()->uses())
+ Worklist.push_back(&UU);
+ }
+
+ return true;
+}
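+
+// Editorial illustration (not part of the upstream sources): a use visitor
+// for checkForAllUses. Setting `Follow` asks the walker to also enqueue the
+// uses of the current user. `A` is an Attributor, `*this` the querying
+// abstract attribute, and `V` the value whose uses are inspected.
+// \code
+//   auto UsePred = [](const Use &U, bool &Follow) {
+//     if (auto *SI = dyn_cast<StoreInst>(U.getUser()))
+//       return U.get() != SI->getValueOperand(); // bail if V is stored away
+//     Follow = isa<BitCastInst>(U.getUser());    // look through bitcasts
+//     return true;
+//   };
+//   bool AllUsesOK =
+//       A.checkForAllUses(UsePred, *this, V, DepClassTy::OPTIONAL);
+// \endcode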
+
+bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
+ const AbstractAttribute &QueryingAA,
+ bool RequireAllCallSites,
+ bool &AllCallSitesKnown) {
+ // We can try to determine information from
+  // the call sites. However, this is only possible if all call sites are known,
+ // hence the function has internal linkage.
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction) {
+ LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP
+ << "\n");
+ AllCallSitesKnown = false;
+ return false;
+ }
+
+ return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites,
+ &QueryingAA, AllCallSitesKnown);
+}
+
+bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
+ const Function &Fn,
+ bool RequireAllCallSites,
+ const AbstractAttribute *QueryingAA,
+ bool &AllCallSitesKnown) {
+ if (RequireAllCallSites && !Fn.hasLocalLinkage()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "[Attributor] Function " << Fn.getName()
+ << " has no internal linkage, hence not all call sites are known\n");
+ AllCallSitesKnown = false;
+ return false;
+ }
+
+ // If we do not require all call sites we might not see all.
+ AllCallSitesKnown = RequireAllCallSites;
+
+ SmallVector<const Use *, 8> Uses(make_pointer_range(Fn.uses()));
+ for (unsigned u = 0; u < Uses.size(); ++u) {
+ const Use &U = *Uses[u];
+ LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << *U << " in "
+ << *U.getUser() << "\n");
+ if (isAssumedDead(U, QueryingAA, nullptr, /* CheckBBLivenessOnly */ true)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
+ continue;
+ }
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
+ if (CE->isCast() && CE->getType()->isPointerTy() &&
+ CE->getType()->getPointerElementType()->isFunctionTy()) {
+ for (const Use &CEU : CE->uses())
+ Uses.push_back(&CEU);
+ continue;
+ }
+ }
+
+ AbstractCallSite ACS(&U);
+ if (!ACS) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName()
+ << " has non call site use " << *U.get() << " in "
+ << *U.getUser() << "\n");
+ // BlockAddress users are allowed.
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+ return false;
+ }
+
+ const Use *EffectiveUse =
+ ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U;
+ if (!ACS.isCallee(EffectiveUse)) {
+ if (!RequireAllCallSites)
+ continue;
+ LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser()
+ << " is an invalid use of " << Fn.getName() << "\n");
+ return false;
+ }
+
+ // Make sure the arguments that can be matched between the call site and the
+    // callee agree on their type. It is unlikely they do not and it doesn't
+ // make sense for all attributes to know/care about this.
+ assert(&Fn == ACS.getCalledFunction() && "Expected known callee");
+ unsigned MinArgsParams =
+ std::min(size_t(ACS.getNumArgOperands()), Fn.arg_size());
+ for (unsigned u = 0; u < MinArgsParams; ++u) {
+ Value *CSArgOp = ACS.getCallArgOperand(u);
+ if (CSArgOp && Fn.getArg(u)->getType() != CSArgOp->getType()) {
+ LLVM_DEBUG(
+ dbgs() << "[Attributor] Call site / callee argument type mismatch ["
+ << u << "@" << Fn.getName() << ": "
+ << *Fn.getArg(u)->getType() << " vs. "
+ << *ACS.getCallArgOperand(u)->getType() << "\n");
+ return false;
+ }
+ }
+
+ if (Pred(ACS))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Call site callback failed for "
+ << *ACS.getInstruction() << "\n");
+ return false;
+ }
+
+ return true;
+}
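+
+// Editorial illustration (not part of the upstream sources): a call site
+// visitor for checkForAllCallSites. `A` is an Attributor and `*this` the
+// querying abstract attribute; the predicate must hold for every known call
+// site, and AllCallSitesKnown reports whether all of them were seen.
+// \code
+//   auto CallSitePred = [](AbstractCallSite ACS) {
+//     return !ACS.getInstruction()->getFunction()->hasFnAttribute(
+//         Attribute::OptimizeNone);
+//   };
+//   bool AllCallSitesKnown = false;
+//   bool AllOK = A.checkForAllCallSites(CallSitePred, *this,
+//                                       /* RequireAllCallSites */ true,
+//                                       AllCallSitesKnown);
+// \endcode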
+
+bool Attributor::checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred,
+ const AbstractAttribute &QueryingAA) {
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ // Since we need to provide return instructions we have to have an exact
+ // definition.
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // If this is a call site query we use the call site specific return values
+ // and liveness information.
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
+ if (!AARetVal.getState().isValidState())
+ return false;
+
+ return AARetVal.checkForAllReturnedValuesAndReturnInsts(Pred);
+}
+
+bool Attributor::checkForAllReturnedValues(
+ function_ref<bool(Value &)> Pred, const AbstractAttribute &QueryingAA) {
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
+ if (!AARetVal.getState().isValidState())
+ return false;
+
+ return AARetVal.checkForAllReturnedValuesAndReturnInsts(
+ [&](Value &RV, const SmallSetVector<ReturnInst *, 4> &) {
+ return Pred(RV);
+ });
+}
+
+static bool checkForAllInstructionsImpl(
+ Attributor *A, InformationCache::OpcodeInstMapTy &OpcodeInstMap,
+ function_ref<bool(Instruction &)> Pred, const AbstractAttribute *QueryingAA,
+ const AAIsDead *LivenessAA, const ArrayRef<unsigned> &Opcodes,
+ bool CheckBBLivenessOnly = false) {
+ for (unsigned Opcode : Opcodes) {
+ // Check if we have instructions with this opcode at all first.
+ auto *Insts = OpcodeInstMap.lookup(Opcode);
+ if (!Insts)
+ continue;
+
+ for (Instruction *I : *Insts) {
+ // Skip dead instructions.
+ if (A && A->isAssumedDead(IRPosition::value(*I), QueryingAA, LivenessAA,
+ CheckBBLivenessOnly))
+ continue;
+
+ if (!Pred(*I))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool Attributor::checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
+ const AbstractAttribute &QueryingAA,
+ const ArrayRef<unsigned> &Opcodes,
+ bool CheckBBLivenessOnly) {
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ // Since we need to provide instructions we have to have an exact definition.
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
const auto *LivenessAA =
CheckBBLivenessOnly ? nullptr
: &(getAAFor<AAIsDead>(QueryingAA, QueryIRP,
/* TrackDependence */ false));
-
- auto &OpcodeInstMap =
- InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction);
- if (!checkForAllInstructionsImpl(this, OpcodeInstMap, Pred, &QueryingAA,
+
+ auto &OpcodeInstMap =
+ InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction);
+ if (!checkForAllInstructionsImpl(this, OpcodeInstMap, Pred, &QueryingAA,
LivenessAA, Opcodes, CheckBBLivenessOnly))
- return false;
-
- return true;
-}
-
-bool Attributor::checkForAllReadWriteInstructions(
- function_ref<bool(Instruction &)> Pred, AbstractAttribute &QueryingAA) {
-
- const Function *AssociatedFunction =
- QueryingAA.getIRPosition().getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
- const auto &LivenessAA =
- getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false);
-
- for (Instruction *I :
- InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) {
- // Skip dead instructions.
- if (isAssumedDead(IRPosition::value(*I), &QueryingAA, &LivenessAA))
- continue;
-
- if (!Pred(*I))
- return false;
- }
-
- return true;
-}
-
-void Attributor::runTillFixpoint() {
+ return false;
+
+ return true;
+}
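+
+// Editorial illustration (not part of the upstream sources): scanning all
+// call-like instructions of the associated function. `A` is an Attributor
+// and `*this` the querying abstract attribute; dead instructions are skipped
+// by the implementation above.
+// \code
+//   auto CheckCall = [](Instruction &I) {
+//     return !cast<CallBase>(I).isConvergent();
+//   };
+//   bool AllOK = A.checkForAllInstructions(
+//       CheckCall, *this,
+//       {(unsigned)Instruction::Invoke, (unsigned)Instruction::Call},
+//       /* CheckBBLivenessOnly */ false);
+// \endcode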
+
+bool Attributor::checkForAllReadWriteInstructions(
+ function_ref<bool(Instruction &)> Pred, AbstractAttribute &QueryingAA) {
+
+ const Function *AssociatedFunction =
+ QueryingAA.getIRPosition().getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ const auto &LivenessAA =
+ getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false);
+
+ for (Instruction *I :
+ InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) {
+ // Skip dead instructions.
+ if (isAssumedDead(IRPosition::value(*I), &QueryingAA, &LivenessAA))
+ continue;
+
+ if (!Pred(*I))
+ return false;
+ }
+
+ return true;
+}
+
+void Attributor::runTillFixpoint() {
TimeTraceScope TimeScope("Attributor::runTillFixpoint");
- LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized "
+ LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized "
<< DG.SyntheticRoot.Deps.size()
- << " abstract attributes.\n");
-
- // Now that all abstract attributes are collected and initialized we start
- // the abstract analysis.
-
- unsigned IterationCounter = 1;
-
- SmallVector<AbstractAttribute *, 32> ChangedAAs;
- SetVector<AbstractAttribute *> Worklist, InvalidAAs;
+ << " abstract attributes.\n");
+
+ // Now that all abstract attributes are collected and initialized we start
+ // the abstract analysis.
+
+ unsigned IterationCounter = 1;
+
+ SmallVector<AbstractAttribute *, 32> ChangedAAs;
+ SetVector<AbstractAttribute *> Worklist, InvalidAAs;
Worklist.insert(DG.SyntheticRoot.begin(), DG.SyntheticRoot.end());
-
- do {
- // Remember the size to determine new attributes.
+
+ do {
+ // Remember the size to determine new attributes.
size_t NumAAs = DG.SyntheticRoot.Deps.size();
- LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter
- << ", Worklist size: " << Worklist.size() << "\n");
-
- // For invalid AAs we can fix dependent AAs that have a required dependence,
- // thereby folding long dependence chains in a single step without the need
- // to run updates.
- for (unsigned u = 0; u < InvalidAAs.size(); ++u) {
- AbstractAttribute *InvalidAA = InvalidAAs[u];
-
- // Check the dependences to fast track invalidation.
- LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has "
- << InvalidAA->Deps.size()
- << " required & optional dependences\n");
- while (!InvalidAA->Deps.empty()) {
- const auto &Dep = InvalidAA->Deps.back();
- InvalidAA->Deps.pop_back();
+ LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter
+ << ", Worklist size: " << Worklist.size() << "\n");
+
+ // For invalid AAs we can fix dependent AAs that have a required dependence,
+ // thereby folding long dependence chains in a single step without the need
+ // to run updates.
+ for (unsigned u = 0; u < InvalidAAs.size(); ++u) {
+ AbstractAttribute *InvalidAA = InvalidAAs[u];
+
+ // Check the dependences to fast track invalidation.
+ LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has "
+ << InvalidAA->Deps.size()
+ << " required & optional dependences\n");
+ while (!InvalidAA->Deps.empty()) {
+ const auto &Dep = InvalidAA->Deps.back();
+ InvalidAA->Deps.pop_back();
AbstractAttribute *DepAA = cast<AbstractAttribute>(Dep.getPointer());
- if (Dep.getInt() == unsigned(DepClassTy::OPTIONAL)) {
- Worklist.insert(DepAA);
- continue;
- }
- DepAA->getState().indicatePessimisticFixpoint();
- assert(DepAA->getState().isAtFixpoint() && "Expected fixpoint state!");
- if (!DepAA->getState().isValidState())
- InvalidAAs.insert(DepAA);
- else
- ChangedAAs.push_back(DepAA);
- }
- }
-
- // Add all abstract attributes that are potentially dependent on one that
- // changed to the work list.
- for (AbstractAttribute *ChangedAA : ChangedAAs)
- while (!ChangedAA->Deps.empty()) {
+ if (Dep.getInt() == unsigned(DepClassTy::OPTIONAL)) {
+ Worklist.insert(DepAA);
+ continue;
+ }
+ DepAA->getState().indicatePessimisticFixpoint();
+ assert(DepAA->getState().isAtFixpoint() && "Expected fixpoint state!");
+ if (!DepAA->getState().isValidState())
+ InvalidAAs.insert(DepAA);
+ else
+ ChangedAAs.push_back(DepAA);
+ }
+ }
+
+ // Add all abstract attributes that are potentially dependent on one that
+ // changed to the work list.
+ for (AbstractAttribute *ChangedAA : ChangedAAs)
+ while (!ChangedAA->Deps.empty()) {
Worklist.insert(
cast<AbstractAttribute>(ChangedAA->Deps.back().getPointer()));
- ChangedAA->Deps.pop_back();
- }
-
- LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter
- << ", Worklist+Dependent size: " << Worklist.size()
- << "\n");
-
- // Reset the changed and invalid set.
- ChangedAAs.clear();
- InvalidAAs.clear();
-
-    // Update all abstract attributes in the work list and record the ones that
- // changed.
- for (AbstractAttribute *AA : Worklist) {
- const auto &AAState = AA->getState();
- if (!AAState.isAtFixpoint())
- if (updateAA(*AA) == ChangeStatus::CHANGED)
- ChangedAAs.push_back(AA);
-
- // Use the InvalidAAs vector to propagate invalid states fast transitively
- // without requiring updates.
- if (!AAState.isValidState())
- InvalidAAs.insert(AA);
- }
-
- // Add attributes to the changed set if they have been created in the last
- // iteration.
+ ChangedAA->Deps.pop_back();
+ }
+
+ LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter
+ << ", Worklist+Dependent size: " << Worklist.size()
+ << "\n");
+
+ // Reset the changed and invalid set.
+ ChangedAAs.clear();
+ InvalidAAs.clear();
+
+    // Update all abstract attributes in the work list and record the ones that
+ // changed.
+ for (AbstractAttribute *AA : Worklist) {
+ const auto &AAState = AA->getState();
+ if (!AAState.isAtFixpoint())
+ if (updateAA(*AA) == ChangeStatus::CHANGED)
+ ChangedAAs.push_back(AA);
+
+ // Use the InvalidAAs vector to propagate invalid states fast transitively
+ // without requiring updates.
+ if (!AAState.isValidState())
+ InvalidAAs.insert(AA);
+ }
+
+ // Add attributes to the changed set if they have been created in the last
+ // iteration.
ChangedAAs.append(DG.SyntheticRoot.begin() + NumAAs,
DG.SyntheticRoot.end());
-
- // Reset the work list and repopulate with the changed abstract attributes.
- // Note that dependent ones are added above.
- Worklist.clear();
- Worklist.insert(ChangedAAs.begin(), ChangedAAs.end());
-
- } while (!Worklist.empty() && (IterationCounter++ < MaxFixpointIterations ||
- VerifyMaxFixpointIterations));
-
- LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
- << IterationCounter << "/" << MaxFixpointIterations
- << " iterations\n");
-
-  // Reset abstract attributes not settled in a sound fixpoint by now. This
- // happens when we stopped the fixpoint iteration early. Note that only the
- // ones marked as "changed" *and* the ones transitively depending on them
- // need to be reverted to a pessimistic state. Others might not be in a
- // fixpoint state but we can use the optimistic results for them anyway.
- SmallPtrSet<AbstractAttribute *, 32> Visited;
- for (unsigned u = 0; u < ChangedAAs.size(); u++) {
- AbstractAttribute *ChangedAA = ChangedAAs[u];
- if (!Visited.insert(ChangedAA).second)
- continue;
-
- AbstractState &State = ChangedAA->getState();
- if (!State.isAtFixpoint()) {
- State.indicatePessimisticFixpoint();
-
- NumAttributesTimedOut++;
- }
-
- while (!ChangedAA->Deps.empty()) {
+
+ // Reset the work list and repopulate with the changed abstract attributes.
+ // Note that dependent ones are added above.
+ Worklist.clear();
+ Worklist.insert(ChangedAAs.begin(), ChangedAAs.end());
+
+ } while (!Worklist.empty() && (IterationCounter++ < MaxFixpointIterations ||
+ VerifyMaxFixpointIterations));
+
+ LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
+ << IterationCounter << "/" << MaxFixpointIterations
+ << " iterations\n");
+
+  // Reset abstract attributes not settled in a sound fixpoint by now. This
+ // happens when we stopped the fixpoint iteration early. Note that only the
+ // ones marked as "changed" *and* the ones transitively depending on them
+ // need to be reverted to a pessimistic state. Others might not be in a
+ // fixpoint state but we can use the optimistic results for them anyway.
+ SmallPtrSet<AbstractAttribute *, 32> Visited;
+ for (unsigned u = 0; u < ChangedAAs.size(); u++) {
+ AbstractAttribute *ChangedAA = ChangedAAs[u];
+ if (!Visited.insert(ChangedAA).second)
+ continue;
+
+ AbstractState &State = ChangedAA->getState();
+ if (!State.isAtFixpoint()) {
+ State.indicatePessimisticFixpoint();
+
+ NumAttributesTimedOut++;
+ }
+
+ while (!ChangedAA->Deps.empty()) {
ChangedAAs.push_back(
cast<AbstractAttribute>(ChangedAA->Deps.back().getPointer()));
- ChangedAA->Deps.pop_back();
- }
- }
-
- LLVM_DEBUG({
- if (!Visited.empty())
- dbgs() << "\n[Attributor] Finalized " << Visited.size()
- << " abstract attributes.\n";
- });
-
- if (VerifyMaxFixpointIterations &&
- IterationCounter != MaxFixpointIterations) {
- errs() << "\n[Attributor] Fixpoint iteration done after: "
- << IterationCounter << "/" << MaxFixpointIterations
- << " iterations\n";
- llvm_unreachable("The fixpoint was not reached with exactly the number of "
- "specified iterations!");
- }
-}
-
-ChangeStatus Attributor::manifestAttributes() {
+ ChangedAA->Deps.pop_back();
+ }
+ }
+
+ LLVM_DEBUG({
+ if (!Visited.empty())
+ dbgs() << "\n[Attributor] Finalized " << Visited.size()
+ << " abstract attributes.\n";
+ });
+
+ if (VerifyMaxFixpointIterations &&
+ IterationCounter != MaxFixpointIterations) {
+ errs() << "\n[Attributor] Fixpoint iteration done after: "
+ << IterationCounter << "/" << MaxFixpointIterations
+ << " iterations\n";
+ llvm_unreachable("The fixpoint was not reached with exactly the number of "
+ "specified iterations!");
+ }
+}
+
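The loop restored above is a textbook worklist-driven fixpoint iteration: update every queued attribute, remember the ones that changed, re-seed the worklist with them and their dependents, and stop once nothing changes or the iteration budget runs out. A minimal standalone C++ sketch of only that pattern follows; Node, update(), and the iteration cap are hypothetical stand-ins for AbstractAttribute, updateAA(), and MaxFixpointIterations, not the real Attributor API.

// fixpoint_sketch.cpp -- illustrative pattern only.
#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

struct Node {
  int Value = 0;                  // current (optimistic) state
  std::vector<Node *> Dependents; // nodes to revisit whenever this one changes

  // Returns true if the state changed, mirroring ChangeStatus::CHANGED.
  bool update() {
    int Old = Value;
    Value = std::min(Value + 1, 3); // toy transfer function, fixpoint at 3
    return Value != Old;
  }
};

// Update everything in the worklist, then re-seed it with the changed nodes
// and their dependents until nothing changes or the budget is exhausted.
void runTillFixpoint(std::vector<Node *> Worklist, unsigned MaxIterations) {
  unsigned Iteration = 0;
  while (!Worklist.empty() && Iteration < MaxIterations) {
    ++Iteration;
    std::set<Node *> Next;
    for (Node *N : Worklist)
      if (N->update()) {
        Next.insert(N); // a changed node may change again
        Next.insert(N->Dependents.begin(), N->Dependents.end());
      }
    Worklist.assign(Next.begin(), Next.end());
  }
  std::printf("done after %u iterations\n", Iteration);
}

int main() {
  Node A, B;
  A.Dependents.push_back(&B); // B must be revisited when A changes
  runTillFixpoint({&A, &B}, 32);
}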
+ChangeStatus Attributor::manifestAttributes() {
TimeTraceScope TimeScope("Attributor::manifestAttributes");
size_t NumFinalAAs = DG.SyntheticRoot.Deps.size();
-
- unsigned NumManifested = 0;
- unsigned NumAtFixpoint = 0;
- ChangeStatus ManifestChange = ChangeStatus::UNCHANGED;
+
+ unsigned NumManifested = 0;
+ unsigned NumAtFixpoint = 0;
+ ChangeStatus ManifestChange = ChangeStatus::UNCHANGED;
for (auto &DepAA : DG.SyntheticRoot.Deps) {
AbstractAttribute *AA = cast<AbstractAttribute>(DepAA.getPointer());
- AbstractState &State = AA->getState();
-
- // If there is not already a fixpoint reached, we can now take the
- // optimistic state. This is correct because we enforced a pessimistic one
- // on abstract attributes that were transitively dependent on a changed one
- // already above.
- if (!State.isAtFixpoint())
- State.indicateOptimisticFixpoint();
-
- // If the state is invalid, we do not try to manifest it.
- if (!State.isValidState())
- continue;
-
- // Skip dead code.
- if (isAssumedDead(*AA, nullptr, /* CheckBBLivenessOnly */ true))
- continue;
+ AbstractState &State = AA->getState();
+
+ // If there is not already a fixpoint reached, we can now take the
+ // optimistic state. This is correct because we enforced a pessimistic one
+ // on abstract attributes that were transitively dependent on a changed one
+ // already above.
+ if (!State.isAtFixpoint())
+ State.indicateOptimisticFixpoint();
+
+ // If the state is invalid, we do not try to manifest it.
+ if (!State.isValidState())
+ continue;
+
+ // Skip dead code.
+ if (isAssumedDead(*AA, nullptr, /* CheckBBLivenessOnly */ true))
+ continue;
    // Check the manifest debug counter that allows skipping the manifestation
    // of AAs.
if (!DebugCounter::shouldExecute(ManifestDBGCounter))
continue;
- // Manifest the state and record if we changed the IR.
- ChangeStatus LocalChange = AA->manifest(*this);
- if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled())
- AA->trackStatistics();
- LLVM_DEBUG(dbgs() << "[Attributor] Manifest " << LocalChange << " : " << *AA
- << "\n");
-
- ManifestChange = ManifestChange | LocalChange;
-
- NumAtFixpoint++;
- NumManifested += (LocalChange == ChangeStatus::CHANGED);
- }
-
- (void)NumManifested;
- (void)NumAtFixpoint;
- LLVM_DEBUG(dbgs() << "\n[Attributor] Manifested " << NumManifested
- << " arguments while " << NumAtFixpoint
- << " were in a valid fixpoint state\n");
-
- NumAttributesManifested += NumManifested;
- NumAttributesValidFixpoint += NumAtFixpoint;
-
- (void)NumFinalAAs;
+ // Manifest the state and record if we changed the IR.
+ ChangeStatus LocalChange = AA->manifest(*this);
+ if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled())
+ AA->trackStatistics();
+ LLVM_DEBUG(dbgs() << "[Attributor] Manifest " << LocalChange << " : " << *AA
+ << "\n");
+
+ ManifestChange = ManifestChange | LocalChange;
+
+ NumAtFixpoint++;
+ NumManifested += (LocalChange == ChangeStatus::CHANGED);
+ }
+
+ (void)NumManifested;
+ (void)NumAtFixpoint;
+ LLVM_DEBUG(dbgs() << "\n[Attributor] Manifested " << NumManifested
+ << " arguments while " << NumAtFixpoint
+ << " were in a valid fixpoint state\n");
+
+ NumAttributesManifested += NumManifested;
+ NumAttributesValidFixpoint += NumAtFixpoint;
+
+ (void)NumFinalAAs;
if (NumFinalAAs != DG.SyntheticRoot.Deps.size()) {
for (unsigned u = NumFinalAAs; u < DG.SyntheticRoot.Deps.size(); ++u)
errs() << "Unexpected abstract attribute: "
<< cast<AbstractAttribute>(DG.SyntheticRoot.Deps[u].getPointer())
- << " :: "
+ << " :: "
<< cast<AbstractAttribute>(DG.SyntheticRoot.Deps[u].getPointer())
->getIRPosition()
.getAssociatedValue()
- << "\n";
- llvm_unreachable("Expected the final number of abstract attributes to "
- "remain unchanged!");
- }
- return ManifestChange;
-}
-
+ << "\n";
+ llvm_unreachable("Expected the final number of abstract attributes to "
+ "remain unchanged!");
+ }
+ return ManifestChange;
+}
+
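manifestAttributes() folds the per-attribute results into one answer by OR-ing ChangeStatus values while skipping invalid or dead states. A minimal sketch of that accumulation, assuming toy stand-ins: Attr and its Valid flag play the role of AbstractAttribute and isValidState(), and the manifest() body is invented for illustration.

// manifest_sketch.cpp -- illustrative pattern only.
#include <cstdio>
#include <vector>

enum class ChangeStatus { UNCHANGED = 0, CHANGED = 1 };
ChangeStatus operator|(ChangeStatus A, ChangeStatus B) {
  return (A == ChangeStatus::CHANGED || B == ChangeStatus::CHANGED)
             ? ChangeStatus::CHANGED
             : ChangeStatus::UNCHANGED;
}

struct Attr {
  bool Valid = true; // stand-in for isValidState()
  // Toy manifest: pretend every valid attribute changes the IR.
  ChangeStatus manifest() {
    return Valid ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
  }
};

int main() {
  std::vector<Attr> Attrs(4);
  Attrs[2].Valid = false; // invalid states are skipped, never manifested
  ChangeStatus Combined = ChangeStatus::UNCHANGED;
  for (Attr &A : Attrs) {
    if (!A.Valid)
      continue;
    Combined = Combined | A.manifest(); // accumulate IR changes
  }
  std::printf("changed: %d\n", Combined == ChangeStatus::CHANGED);
}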
void Attributor::identifyDeadInternalFunctions() {
// Identify dead internal functions and delete them. This happens outside
// the other fixpoint analysis as we might treat potentially dead functions
@@ -1215,133 +1215,133 @@ void Attributor::identifyDeadInternalFunctions() {
ToBeDeletedFunctions.insert(F);
}
-ChangeStatus Attributor::cleanupIR() {
+ChangeStatus Attributor::cleanupIR() {
TimeTraceScope TimeScope("Attributor::cleanupIR");
-  // Delete stuff at the end to avoid invalid references, in a nice order.
- LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least "
- << ToBeDeletedFunctions.size() << " functions and "
- << ToBeDeletedBlocks.size() << " blocks and "
- << ToBeDeletedInsts.size() << " instructions and "
- << ToBeChangedUses.size() << " uses\n");
-
- SmallVector<WeakTrackingVH, 32> DeadInsts;
- SmallVector<Instruction *, 32> TerminatorsToFold;
-
- for (auto &It : ToBeChangedUses) {
- Use *U = It.first;
- Value *NewV = It.second;
- Value *OldV = U->get();
-
- // Do not replace uses in returns if the value is a must-tail call we will
- // not delete.
- if (isa<ReturnInst>(U->getUser()))
- if (auto *CI = dyn_cast<CallInst>(OldV->stripPointerCasts()))
- if (CI->isMustTailCall() && !ToBeDeletedInsts.count(CI))
- continue;
-
- LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser()
- << " instead of " << *OldV << "\n");
- U->set(NewV);
- // Do not modify call instructions outside the SCC.
- if (auto *CB = dyn_cast<CallBase>(OldV))
- if (!Functions.count(CB->getCaller()))
- continue;
- if (Instruction *I = dyn_cast<Instruction>(OldV)) {
- CGModifiedFunctions.insert(I->getFunction());
- if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) &&
- isInstructionTriviallyDead(I))
- DeadInsts.push_back(I);
- }
- if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) {
- Instruction *UserI = cast<Instruction>(U->getUser());
- if (isa<UndefValue>(NewV)) {
- ToBeChangedToUnreachableInsts.insert(UserI);
- } else {
- TerminatorsToFold.push_back(UserI);
- }
- }
- }
- for (auto &V : InvokeWithDeadSuccessor)
- if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) {
- bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind);
- bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn);
- bool Invoke2CallAllowed =
- !AAIsDead::mayCatchAsynchronousExceptions(*II->getFunction());
- assert((UnwindBBIsDead || NormalBBIsDead) &&
- "Invoke does not have dead successors!");
- BasicBlock *BB = II->getParent();
- BasicBlock *NormalDestBB = II->getNormalDest();
- if (UnwindBBIsDead) {
- Instruction *NormalNextIP = &NormalDestBB->front();
- if (Invoke2CallAllowed) {
- changeToCall(II);
- NormalNextIP = BB->getTerminator();
- }
- if (NormalBBIsDead)
- ToBeChangedToUnreachableInsts.insert(NormalNextIP);
- } else {
- assert(NormalBBIsDead && "Broken invariant!");
- if (!NormalDestBB->getUniquePredecessor())
- NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead");
- ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front());
- }
- }
- for (Instruction *I : TerminatorsToFold) {
- CGModifiedFunctions.insert(I->getFunction());
- ConstantFoldTerminator(I->getParent());
- }
- for (auto &V : ToBeChangedToUnreachableInsts)
- if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
- CGModifiedFunctions.insert(I->getFunction());
- changeToUnreachable(I, /* UseLLVMTrap */ false);
- }
-
- for (auto &V : ToBeDeletedInsts) {
- if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
- I->dropDroppableUses();
- CGModifiedFunctions.insert(I->getFunction());
- if (!I->getType()->isVoidTy())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- if (!isa<PHINode>(I) && isInstructionTriviallyDead(I))
- DeadInsts.push_back(I);
- else
- I->eraseFromParent();
- }
- }
-
- LLVM_DEBUG(dbgs() << "[Attributor] DeadInsts size: " << DeadInsts.size()
- << "\n");
-
- RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
-
- if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) {
- SmallVector<BasicBlock *, 8> ToBeDeletedBBs;
- ToBeDeletedBBs.reserve(NumDeadBlocks);
- for (BasicBlock *BB : ToBeDeletedBlocks) {
- CGModifiedFunctions.insert(BB->getParent());
- ToBeDeletedBBs.push_back(BB);
- }
-    // We do not actually delete the blocks but squash them into a single
-    // unreachable instruction; untangling the branches that jump here is
-    // something we need to do in a more generic way.
- DetatchDeadBlocks(ToBeDeletedBBs, nullptr);
- }
-
+  // Delete stuff at the end to avoid invalid references, in a nice order.
+ LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least "
+ << ToBeDeletedFunctions.size() << " functions and "
+ << ToBeDeletedBlocks.size() << " blocks and "
+ << ToBeDeletedInsts.size() << " instructions and "
+ << ToBeChangedUses.size() << " uses\n");
+
+ SmallVector<WeakTrackingVH, 32> DeadInsts;
+ SmallVector<Instruction *, 32> TerminatorsToFold;
+
+ for (auto &It : ToBeChangedUses) {
+ Use *U = It.first;
+ Value *NewV = It.second;
+ Value *OldV = U->get();
+
+ // Do not replace uses in returns if the value is a must-tail call we will
+ // not delete.
+ if (isa<ReturnInst>(U->getUser()))
+ if (auto *CI = dyn_cast<CallInst>(OldV->stripPointerCasts()))
+ if (CI->isMustTailCall() && !ToBeDeletedInsts.count(CI))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser()
+ << " instead of " << *OldV << "\n");
+ U->set(NewV);
+ // Do not modify call instructions outside the SCC.
+ if (auto *CB = dyn_cast<CallBase>(OldV))
+ if (!Functions.count(CB->getCaller()))
+ continue;
+ if (Instruction *I = dyn_cast<Instruction>(OldV)) {
+ CGModifiedFunctions.insert(I->getFunction());
+ if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) &&
+ isInstructionTriviallyDead(I))
+ DeadInsts.push_back(I);
+ }
+ if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) {
+ Instruction *UserI = cast<Instruction>(U->getUser());
+ if (isa<UndefValue>(NewV)) {
+ ToBeChangedToUnreachableInsts.insert(UserI);
+ } else {
+ TerminatorsToFold.push_back(UserI);
+ }
+ }
+ }
+ for (auto &V : InvokeWithDeadSuccessor)
+ if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) {
+ bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind);
+ bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn);
+ bool Invoke2CallAllowed =
+ !AAIsDead::mayCatchAsynchronousExceptions(*II->getFunction());
+ assert((UnwindBBIsDead || NormalBBIsDead) &&
+ "Invoke does not have dead successors!");
+ BasicBlock *BB = II->getParent();
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ if (UnwindBBIsDead) {
+ Instruction *NormalNextIP = &NormalDestBB->front();
+ if (Invoke2CallAllowed) {
+ changeToCall(II);
+ NormalNextIP = BB->getTerminator();
+ }
+ if (NormalBBIsDead)
+ ToBeChangedToUnreachableInsts.insert(NormalNextIP);
+ } else {
+ assert(NormalBBIsDead && "Broken invariant!");
+ if (!NormalDestBB->getUniquePredecessor())
+ NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead");
+ ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front());
+ }
+ }
+ for (Instruction *I : TerminatorsToFold) {
+ CGModifiedFunctions.insert(I->getFunction());
+ ConstantFoldTerminator(I->getParent());
+ }
+ for (auto &V : ToBeChangedToUnreachableInsts)
+ if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ CGModifiedFunctions.insert(I->getFunction());
+ changeToUnreachable(I, /* UseLLVMTrap */ false);
+ }
+
+ for (auto &V : ToBeDeletedInsts) {
+ if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ I->dropDroppableUses();
+ CGModifiedFunctions.insert(I->getFunction());
+ if (!I->getType()->isVoidTy())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ if (!isa<PHINode>(I) && isInstructionTriviallyDead(I))
+ DeadInsts.push_back(I);
+ else
+ I->eraseFromParent();
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "[Attributor] DeadInsts size: " << DeadInsts.size()
+ << "\n");
+
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+
+ if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) {
+ SmallVector<BasicBlock *, 8> ToBeDeletedBBs;
+ ToBeDeletedBBs.reserve(NumDeadBlocks);
+ for (BasicBlock *BB : ToBeDeletedBlocks) {
+ CGModifiedFunctions.insert(BB->getParent());
+ ToBeDeletedBBs.push_back(BB);
+ }
+    // We do not actually delete the blocks but squash them into a single
+    // unreachable instruction; untangling the branches that jump here is
+    // something we need to do in a more generic way.
+ DetatchDeadBlocks(ToBeDeletedBBs, nullptr);
+ }
+
identifyDeadInternalFunctions();
-
- // Rewrite the functions as requested during manifest.
- ChangeStatus ManifestChange = rewriteFunctionSignatures(CGModifiedFunctions);
-
- for (Function *Fn : CGModifiedFunctions)
+
+ // Rewrite the functions as requested during manifest.
+ ChangeStatus ManifestChange = rewriteFunctionSignatures(CGModifiedFunctions);
+
+ for (Function *Fn : CGModifiedFunctions)
if (!ToBeDeletedFunctions.count(Fn))
CGUpdater.reanalyzeFunction(*Fn);
-
+
for (Function *Fn : ToBeDeletedFunctions) {
if (!Functions.count(Fn))
continue;
- CGUpdater.removeFunction(*Fn);
+ CGUpdater.removeFunction(*Fn);
}
-
+
if (!ToBeChangedUses.empty())
ManifestChange = ChangeStatus::CHANGED;
@@ -1363,27 +1363,27 @@ ChangeStatus Attributor::cleanupIR() {
if (!DeadInsts.empty())
ManifestChange = ChangeStatus::CHANGED;
- NumFnDeleted += ToBeDeletedFunctions.size();
-
+ NumFnDeleted += ToBeDeletedFunctions.size();
+
LLVM_DEBUG(dbgs() << "[Attributor] Deleted " << ToBeDeletedFunctions.size()
- << " functions after manifest.\n");
-
-#ifdef EXPENSIVE_CHECKS
- for (Function *F : Functions) {
- if (ToBeDeletedFunctions.count(F))
- continue;
- assert(!verifyFunction(*F, &errs()) && "Module verification failed!");
- }
-#endif
-
- return ManifestChange;
-}
-
-ChangeStatus Attributor::run() {
+ << " functions after manifest.\n");
+
+#ifdef EXPENSIVE_CHECKS
+ for (Function *F : Functions) {
+ if (ToBeDeletedFunctions.count(F))
+ continue;
+ assert(!verifyFunction(*F, &errs()) && "Module verification failed!");
+ }
+#endif
+
+ return ManifestChange;
+}
+
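cleanupIR() deliberately defers all deletions to a single batch at the end so earlier decisions never dereference already-freed IR. A small standard-library sketch of the same two-phase pattern; Inst, Program, and ToBeDeleted are illustrative names, not the Attributor's own containers.

// deferred_cleanup_sketch.cpp -- illustrative pattern only.
#include <algorithm>
#include <cstdio>
#include <memory>
#include <set>
#include <vector>

struct Inst { int Id; };

int main() {
  // Owning storage for the "instructions"; the analysis phase only records
  // what should go away instead of deleting eagerly.
  std::vector<std::unique_ptr<Inst>> Program;
  for (int I = 0; I < 5; ++I)
    Program.push_back(std::make_unique<Inst>(Inst{I}));

  std::set<const Inst *> ToBeDeleted; // filled while the "analysis" runs
  ToBeDeleted.insert(Program[1].get());
  ToBeDeleted.insert(Program[3].get());

  // Cleanup phase: erase everything in one batch at the very end so no
  // earlier step ever observes a dangling pointer.
  Program.erase(std::remove_if(Program.begin(), Program.end(),
                               [&](const std::unique_ptr<Inst> &P) {
                                 return ToBeDeleted.count(P.get()) != 0;
                               }),
                Program.end());
  std::printf("%zu instructions remain\n", Program.size());
}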
+ChangeStatus Attributor::run() {
TimeTraceScope TimeScope("Attributor::run");
Phase = AttributorPhase::UPDATE;
- runTillFixpoint();
+ runTillFixpoint();
// dump graphs on demand
if (DumpDepGraph)
@@ -1396,95 +1396,95 @@ ChangeStatus Attributor::run() {
DG.print();
Phase = AttributorPhase::MANIFEST;
- ChangeStatus ManifestChange = manifestAttributes();
+ ChangeStatus ManifestChange = manifestAttributes();
Phase = AttributorPhase::CLEANUP;
- ChangeStatus CleanupChange = cleanupIR();
+ ChangeStatus CleanupChange = cleanupIR();
- return ManifestChange | CleanupChange;
-}
-
-ChangeStatus Attributor::updateAA(AbstractAttribute &AA) {
+ return ManifestChange | CleanupChange;
+}
+
+ChangeStatus Attributor::updateAA(AbstractAttribute &AA) {
TimeTraceScope TimeScope(
AA.getName() + std::to_string(AA.getIRPosition().getPositionKind()) +
"::updateAA");
assert(Phase == AttributorPhase::UPDATE &&
"We can update AA only in the update stage!");
- // Use a new dependence vector for this update.
- DependenceVector DV;
- DependenceStack.push_back(&DV);
-
- auto &AAState = AA.getState();
- ChangeStatus CS = ChangeStatus::UNCHANGED;
- if (!isAssumedDead(AA, nullptr, /* CheckBBLivenessOnly */ true))
- CS = AA.update(*this);
-
- if (DV.empty()) {
- // If the attribute did not query any non-fix information, the state
- // will not change and we can indicate that right away.
- AAState.indicateOptimisticFixpoint();
- }
-
- if (!AAState.isAtFixpoint())
- rememberDependences();
-
-  // Verify the stack was used properly, that is, we pop the dependence vector
-  // we put there earlier.
- DependenceVector *PoppedDV = DependenceStack.pop_back_val();
- (void)PoppedDV;
- assert(PoppedDV == &DV && "Inconsistent usage of the dependence stack!");
-
- return CS;
-}
-
+ // Use a new dependence vector for this update.
+ DependenceVector DV;
+ DependenceStack.push_back(&DV);
+
+ auto &AAState = AA.getState();
+ ChangeStatus CS = ChangeStatus::UNCHANGED;
+ if (!isAssumedDead(AA, nullptr, /* CheckBBLivenessOnly */ true))
+ CS = AA.update(*this);
+
+ if (DV.empty()) {
+ // If the attribute did not query any non-fix information, the state
+ // will not change and we can indicate that right away.
+ AAState.indicateOptimisticFixpoint();
+ }
+
+ if (!AAState.isAtFixpoint())
+ rememberDependences();
+
+  // Verify the stack was used properly, that is, we pop the dependence vector
+  // we put there earlier.
+ DependenceVector *PoppedDV = DependenceStack.pop_back_val();
+ (void)PoppedDV;
+ assert(PoppedDV == &DV && "Inconsistent usage of the dependence stack!");
+
+ return CS;
+}
+
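updateAA() pushes a fresh dependence vector, runs the update, treats an empty vector as proof that the state can no longer change, and asserts that the vector it pops is the one it pushed. A compact sketch of that stack discipline; the integer IDs and updateOne() are invented for illustration and do not mirror the real AbstractAttribute types.

// dependence_stack_sketch.cpp -- illustrative pattern only.
#include <cassert>
#include <cstdio>
#include <vector>

struct Dep { int FromId, ToId; };

// Stack of per-update dependence vectors, mirroring DependenceStack above.
static std::vector<std::vector<Dep> *> DependenceStack;

void recordDependence(int FromId, int ToId) {
  if (DependenceStack.empty()) // outside of an update: nothing to track
    return;
  DependenceStack.back()->push_back({FromId, ToId});
}

void updateOne(int Id) {
  std::vector<Dep> DV;          // fresh vector used for exactly this update
  DependenceStack.push_back(&DV);

  recordDependence(Id, Id + 1); // the "update" queried another attribute

  if (DV.empty())               // nothing queried: state cannot change again
    std::printf("attribute %d reached an optimistic fixpoint\n", Id);

  assert(DependenceStack.back() == &DV && "inconsistent stack usage");
  DependenceStack.pop_back();   // pop the vector we pushed above
  std::printf("attribute %d recorded %zu dependences\n", Id, DV.size());
}

int main() { updateOne(0); }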
void Attributor::createShallowWrapper(Function &F) {
- assert(!F.isDeclaration() && "Cannot create a wrapper around a declaration!");
-
- Module &M = *F.getParent();
- LLVMContext &Ctx = M.getContext();
- FunctionType *FnTy = F.getFunctionType();
-
- Function *Wrapper =
- Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName());
- F.setName(""); // set the inside function anonymous
- M.getFunctionList().insert(F.getIterator(), Wrapper);
-
- F.setLinkage(GlobalValue::InternalLinkage);
-
- F.replaceAllUsesWith(Wrapper);
- assert(F.use_empty() && "Uses remained after wrapper was created!");
-
- // Move the COMDAT section to the wrapper.
- // TODO: Check if we need to keep it for F as well.
- Wrapper->setComdat(F.getComdat());
- F.setComdat(nullptr);
-
- // Copy all metadata and attributes but keep them on F as well.
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- F.getAllMetadata(MDs);
- for (auto MDIt : MDs)
- Wrapper->addMetadata(MDIt.first, *MDIt.second);
- Wrapper->setAttributes(F.getAttributes());
-
- // Create the call in the wrapper.
- BasicBlock *EntryBB = BasicBlock::Create(Ctx, "entry", Wrapper);
-
- SmallVector<Value *, 8> Args;
+ assert(!F.isDeclaration() && "Cannot create a wrapper around a declaration!");
+
+ Module &M = *F.getParent();
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FnTy = F.getFunctionType();
+
+ Function *Wrapper =
+ Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName());
+ F.setName(""); // set the inside function anonymous
+ M.getFunctionList().insert(F.getIterator(), Wrapper);
+
+ F.setLinkage(GlobalValue::InternalLinkage);
+
+ F.replaceAllUsesWith(Wrapper);
+ assert(F.use_empty() && "Uses remained after wrapper was created!");
+
+ // Move the COMDAT section to the wrapper.
+ // TODO: Check if we need to keep it for F as well.
+ Wrapper->setComdat(F.getComdat());
+ F.setComdat(nullptr);
+
+ // Copy all metadata and attributes but keep them on F as well.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F.getAllMetadata(MDs);
+ for (auto MDIt : MDs)
+ Wrapper->addMetadata(MDIt.first, *MDIt.second);
+ Wrapper->setAttributes(F.getAttributes());
+
+ // Create the call in the wrapper.
+ BasicBlock *EntryBB = BasicBlock::Create(Ctx, "entry", Wrapper);
+
+ SmallVector<Value *, 8> Args;
Argument *FArgIt = F.arg_begin();
- for (Argument &Arg : Wrapper->args()) {
- Args.push_back(&Arg);
- Arg.setName((FArgIt++)->getName());
- }
-
- CallInst *CI = CallInst::Create(&F, Args, "", EntryBB);
- CI->setTailCall(true);
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
- ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB);
-
+ for (Argument &Arg : Wrapper->args()) {
+ Args.push_back(&Arg);
+ Arg.setName((FArgIt++)->getName());
+ }
+
+ CallInst *CI = CallInst::Create(&F, Args, "", EntryBB);
+ CI->setTailCall(true);
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
+ ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB);
+
NumFnShallowWrappersCreated++;
-}
-
+}
+
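createShallowWrapper() renames the original function, gives it internal linkage, and installs a new function with the old name whose only job is to forward all arguments via a tail call. The sketch below shows the shape of that wrapper in plain C++; AddImpl and Add are hypothetical examples, not LLVM entities.

// shallow_wrapper_sketch.cpp -- illustrative pattern only.
#include <cstdio>

// Think of this as F after it was renamed and made "internal" (static), and
// therefore free to be analysed and specialised aggressively.
static int AddImpl(int A, int B) { return A + B; }

// The shallow wrapper keeps the original, externally visible name and
// signature; its only job is to forward every argument to the real body.
int Add(int A, int B) {
  return AddImpl(A, B); // conceptually a tail call, as in createShallowWrapper
}

int main() { std::printf("%d\n", Add(2, 3)); }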
/// Make another copy of the function \p F such that the copied version has
/// internal linkage afterwards and can be analysed. Then we replace all uses
/// of the original function to the copied one
@@ -1535,106 +1535,106 @@ static Function *internalizeFunction(Function &F) {
return Copied;
}
-bool Attributor::isValidFunctionSignatureRewrite(
- Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
-
- auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
- // Forbid the call site to cast the function return type. If we need to
- // rewrite these functions we need to re-create a cast for the new call site
- // (if the old had uses).
- if (!ACS.getCalledFunction() ||
- ACS.getInstruction()->getType() !=
- ACS.getCalledFunction()->getReturnType())
- return false;
- // Forbid must-tail calls for now.
- return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall();
- };
-
- Function *Fn = Arg.getParent();
- // Avoid var-arg functions for now.
- if (Fn->isVarArg()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n");
- return false;
- }
-
- // Avoid functions with complicated argument passing semantics.
- AttributeList FnAttributeList = Fn->getAttributes();
- if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) ||
- FnAttributeList.hasAttrSomewhere(Attribute::StructRet) ||
- FnAttributeList.hasAttrSomewhere(Attribute::InAlloca) ||
- FnAttributeList.hasAttrSomewhere(Attribute::Preallocated)) {
- LLVM_DEBUG(
- dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n");
- return false;
- }
-
- // Avoid callbacks for now.
- bool AllCallSitesKnown;
- if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr,
- AllCallSitesKnown)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n");
- return false;
- }
-
- auto InstPred = [](Instruction &I) {
- if (auto *CI = dyn_cast<CallInst>(&I))
- return !CI->isMustTailCall();
- return true;
- };
-
- // Forbid must-tail calls for now.
- // TODO:
- auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn);
- if (!checkForAllInstructionsImpl(nullptr, OpcodeInstMap, InstPred, nullptr,
- nullptr, {Instruction::Call})) {
- LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n");
- return false;
- }
-
- return true;
-}
-
-bool Attributor::registerFunctionSignatureRewrite(
- Argument &Arg, ArrayRef<Type *> ReplacementTypes,
- ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB,
- ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) {
- LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
- << Arg.getParent()->getName() << " with "
- << ReplacementTypes.size() << " replacements\n");
- assert(isValidFunctionSignatureRewrite(Arg, ReplacementTypes) &&
- "Cannot register an invalid rewrite");
-
- Function *Fn = Arg.getParent();
- SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
- ArgumentReplacementMap[Fn];
- if (ARIs.empty())
- ARIs.resize(Fn->arg_size());
-
- // If we have a replacement already with less than or equal new arguments,
- // ignore this request.
- std::unique_ptr<ArgumentReplacementInfo> &ARI = ARIs[Arg.getArgNo()];
- if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n");
- return false;
- }
-
- // If we have a replacement already but we like the new one better, delete
- // the old.
- ARI.reset();
-
- LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
- << Arg.getParent()->getName() << " with "
- << ReplacementTypes.size() << " replacements\n");
-
- // Remember the replacement.
- ARI.reset(new ArgumentReplacementInfo(*this, Arg, ReplacementTypes,
- std::move(CalleeRepairCB),
- std::move(ACSRepairCB)));
-
- return true;
-}
-
-bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
+bool Attributor::isValidFunctionSignatureRewrite(
+ Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
+
+ auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
+ // Forbid the call site to cast the function return type. If we need to
+ // rewrite these functions we need to re-create a cast for the new call site
+ // (if the old had uses).
+ if (!ACS.getCalledFunction() ||
+ ACS.getInstruction()->getType() !=
+ ACS.getCalledFunction()->getReturnType())
+ return false;
+ // Forbid must-tail calls for now.
+ return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall();
+ };
+
+ Function *Fn = Arg.getParent();
+ // Avoid var-arg functions for now.
+ if (Fn->isVarArg()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n");
+ return false;
+ }
+
+ // Avoid functions with complicated argument passing semantics.
+ AttributeList FnAttributeList = Fn->getAttributes();
+ if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) ||
+ FnAttributeList.hasAttrSomewhere(Attribute::StructRet) ||
+ FnAttributeList.hasAttrSomewhere(Attribute::InAlloca) ||
+ FnAttributeList.hasAttrSomewhere(Attribute::Preallocated)) {
+ LLVM_DEBUG(
+ dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n");
+ return false;
+ }
+
+ // Avoid callbacks for now.
+ bool AllCallSitesKnown;
+ if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr,
+ AllCallSitesKnown)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n");
+ return false;
+ }
+
+ auto InstPred = [](Instruction &I) {
+ if (auto *CI = dyn_cast<CallInst>(&I))
+ return !CI->isMustTailCall();
+ return true;
+ };
+
+ // Forbid must-tail calls for now.
+ // TODO:
+ auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn);
+ if (!checkForAllInstructionsImpl(nullptr, OpcodeInstMap, InstPred, nullptr,
+ nullptr, {Instruction::Call})) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool Attributor::registerFunctionSignatureRewrite(
+ Argument &Arg, ArrayRef<Type *> ReplacementTypes,
+ ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB,
+ ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
+ << Arg.getParent()->getName() << " with "
+ << ReplacementTypes.size() << " replacements\n");
+ assert(isValidFunctionSignatureRewrite(Arg, ReplacementTypes) &&
+ "Cannot register an invalid rewrite");
+
+ Function *Fn = Arg.getParent();
+ SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
+ ArgumentReplacementMap[Fn];
+ if (ARIs.empty())
+ ARIs.resize(Fn->arg_size());
+
+ // If we have a replacement already with less than or equal new arguments,
+ // ignore this request.
+ std::unique_ptr<ArgumentReplacementInfo> &ARI = ARIs[Arg.getArgNo()];
+ if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n");
+ return false;
+ }
+
+ // If we have a replacement already but we like the new one better, delete
+ // the old.
+ ARI.reset();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
+ << Arg.getParent()->getName() << " with "
+ << ReplacementTypes.size() << " replacements\n");
+
+ // Remember the replacement.
+ ARI.reset(new ArgumentReplacementInfo(*this, Arg, ReplacementTypes,
+ std::move(CalleeRepairCB),
+ std::move(ACSRepairCB)));
+
+ return true;
+}
+
+bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
bool Result = true;
#ifndef NDEBUG
if (SeedAllowList.size() != 0)
@@ -1646,583 +1646,583 @@ bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
FunctionSeedAllowList.end(), Fn->getName());
#endif
return Result;
-}
-
-ChangeStatus Attributor::rewriteFunctionSignatures(
- SmallPtrSetImpl<Function *> &ModifiedFns) {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- for (auto &It : ArgumentReplacementMap) {
- Function *OldFn = It.getFirst();
-
- // Deleted functions do not require rewrites.
+}
+
+ChangeStatus Attributor::rewriteFunctionSignatures(
+ SmallPtrSetImpl<Function *> &ModifiedFns) {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ for (auto &It : ArgumentReplacementMap) {
+ Function *OldFn = It.getFirst();
+
+ // Deleted functions do not require rewrites.
if (!Functions.count(OldFn) || ToBeDeletedFunctions.count(OldFn))
- continue;
-
- const SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
- It.getSecond();
- assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!");
-
- SmallVector<Type *, 16> NewArgumentTypes;
- SmallVector<AttributeSet, 16> NewArgumentAttributes;
-
- // Collect replacement argument types and copy over existing attributes.
- AttributeList OldFnAttributeList = OldFn->getAttributes();
- for (Argument &Arg : OldFn->args()) {
- if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
- ARIs[Arg.getArgNo()]) {
- NewArgumentTypes.append(ARI->ReplacementTypes.begin(),
- ARI->ReplacementTypes.end());
- NewArgumentAttributes.append(ARI->getNumReplacementArgs(),
- AttributeSet());
- } else {
- NewArgumentTypes.push_back(Arg.getType());
- NewArgumentAttributes.push_back(
- OldFnAttributeList.getParamAttributes(Arg.getArgNo()));
- }
- }
-
- FunctionType *OldFnTy = OldFn->getFunctionType();
- Type *RetTy = OldFnTy->getReturnType();
-
- // Construct the new function type using the new arguments types.
- FunctionType *NewFnTy =
- FunctionType::get(RetTy, NewArgumentTypes, OldFnTy->isVarArg());
-
- LLVM_DEBUG(dbgs() << "[Attributor] Function rewrite '" << OldFn->getName()
- << "' from " << *OldFn->getFunctionType() << " to "
- << *NewFnTy << "\n");
-
- // Create the new function body and insert it into the module.
- Function *NewFn = Function::Create(NewFnTy, OldFn->getLinkage(),
- OldFn->getAddressSpace(), "");
- OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn);
- NewFn->takeName(OldFn);
- NewFn->copyAttributesFrom(OldFn);
-
- // Patch the pointer to LLVM function in debug info descriptor.
- NewFn->setSubprogram(OldFn->getSubprogram());
- OldFn->setSubprogram(nullptr);
-
- // Recompute the parameter attributes list based on the new arguments for
- // the function.
- LLVMContext &Ctx = OldFn->getContext();
- NewFn->setAttributes(AttributeList::get(
- Ctx, OldFnAttributeList.getFnAttributes(),
- OldFnAttributeList.getRetAttributes(), NewArgumentAttributes));
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NewFn->getBasicBlockList().splice(NewFn->begin(),
- OldFn->getBasicBlockList());
-
- // Fixup block addresses to reference new function.
- SmallVector<BlockAddress *, 8u> BlockAddresses;
- for (User *U : OldFn->users())
- if (auto *BA = dyn_cast<BlockAddress>(U))
- BlockAddresses.push_back(BA);
- for (auto *BA : BlockAddresses)
- BA->replaceAllUsesWith(BlockAddress::get(NewFn, BA->getBasicBlock()));
-
- // Set of all "call-like" instructions that invoke the old function mapped
- // to their new replacements.
- SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs;
-
- // Callback to create a new "call-like" instruction for a given one.
- auto CallSiteReplacementCreator = [&](AbstractCallSite ACS) {
- CallBase *OldCB = cast<CallBase>(ACS.getInstruction());
- const AttributeList &OldCallAttributeList = OldCB->getAttributes();
-
- // Collect the new argument operands for the replacement call site.
- SmallVector<Value *, 16> NewArgOperands;
- SmallVector<AttributeSet, 16> NewArgOperandAttributes;
- for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) {
- unsigned NewFirstArgNum = NewArgOperands.size();
- (void)NewFirstArgNum; // only used inside assert.
- if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
- ARIs[OldArgNum]) {
- if (ARI->ACSRepairCB)
- ARI->ACSRepairCB(*ARI, ACS, NewArgOperands);
- assert(ARI->getNumReplacementArgs() + NewFirstArgNum ==
- NewArgOperands.size() &&
-                 "ACS repair callback did not provide as many operands as new "
- "types were registered!");
-        // TODO: Expose the attribute set to the ACS repair callback
- NewArgOperandAttributes.append(ARI->ReplacementTypes.size(),
- AttributeSet());
- } else {
- NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum));
- NewArgOperandAttributes.push_back(
- OldCallAttributeList.getParamAttributes(OldArgNum));
- }
- }
-
- assert(NewArgOperands.size() == NewArgOperandAttributes.size() &&
- "Mismatch # argument operands vs. # argument operand attributes!");
- assert(NewArgOperands.size() == NewFn->arg_size() &&
- "Mismatch # argument operands vs. # function arguments!");
-
- SmallVector<OperandBundleDef, 4> OperandBundleDefs;
- OldCB->getOperandBundlesAsDefs(OperandBundleDefs);
-
- // Create a new call or invoke instruction to replace the old one.
- CallBase *NewCB;
- if (InvokeInst *II = dyn_cast<InvokeInst>(OldCB)) {
- NewCB =
- InvokeInst::Create(NewFn, II->getNormalDest(), II->getUnwindDest(),
- NewArgOperands, OperandBundleDefs, "", OldCB);
- } else {
- auto *NewCI = CallInst::Create(NewFn, NewArgOperands, OperandBundleDefs,
- "", OldCB);
- NewCI->setTailCallKind(cast<CallInst>(OldCB)->getTailCallKind());
- NewCB = NewCI;
- }
-
- // Copy over various properties and the new attributes.
- NewCB->copyMetadata(*OldCB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
- NewCB->setCallingConv(OldCB->getCallingConv());
- NewCB->takeName(OldCB);
- NewCB->setAttributes(AttributeList::get(
- Ctx, OldCallAttributeList.getFnAttributes(),
- OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes));
-
- CallSitePairs.push_back({OldCB, NewCB});
- return true;
- };
-
- // Use the CallSiteReplacementCreator to create replacement call sites.
- bool AllCallSitesKnown;
- bool Success = checkForAllCallSites(CallSiteReplacementCreator, *OldFn,
- true, nullptr, AllCallSitesKnown);
- (void)Success;
- assert(Success && "Assumed call site replacement to succeed!");
-
- // Rewire the arguments.
+ continue;
+
+ const SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
+ It.getSecond();
+ assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!");
+
+ SmallVector<Type *, 16> NewArgumentTypes;
+ SmallVector<AttributeSet, 16> NewArgumentAttributes;
+
+ // Collect replacement argument types and copy over existing attributes.
+ AttributeList OldFnAttributeList = OldFn->getAttributes();
+ for (Argument &Arg : OldFn->args()) {
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[Arg.getArgNo()]) {
+ NewArgumentTypes.append(ARI->ReplacementTypes.begin(),
+ ARI->ReplacementTypes.end());
+ NewArgumentAttributes.append(ARI->getNumReplacementArgs(),
+ AttributeSet());
+ } else {
+ NewArgumentTypes.push_back(Arg.getType());
+ NewArgumentAttributes.push_back(
+ OldFnAttributeList.getParamAttributes(Arg.getArgNo()));
+ }
+ }
+
+ FunctionType *OldFnTy = OldFn->getFunctionType();
+ Type *RetTy = OldFnTy->getReturnType();
+
+ // Construct the new function type using the new arguments types.
+ FunctionType *NewFnTy =
+ FunctionType::get(RetTy, NewArgumentTypes, OldFnTy->isVarArg());
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Function rewrite '" << OldFn->getName()
+ << "' from " << *OldFn->getFunctionType() << " to "
+ << *NewFnTy << "\n");
+
+ // Create the new function body and insert it into the module.
+ Function *NewFn = Function::Create(NewFnTy, OldFn->getLinkage(),
+ OldFn->getAddressSpace(), "");
+ OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn);
+ NewFn->takeName(OldFn);
+ NewFn->copyAttributesFrom(OldFn);
+
+ // Patch the pointer to LLVM function in debug info descriptor.
+ NewFn->setSubprogram(OldFn->getSubprogram());
+ OldFn->setSubprogram(nullptr);
+
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ LLVMContext &Ctx = OldFn->getContext();
+ NewFn->setAttributes(AttributeList::get(
+ Ctx, OldFnAttributeList.getFnAttributes(),
+ OldFnAttributeList.getRetAttributes(), NewArgumentAttributes));
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NewFn->getBasicBlockList().splice(NewFn->begin(),
+ OldFn->getBasicBlockList());
+
+ // Fixup block addresses to reference new function.
+ SmallVector<BlockAddress *, 8u> BlockAddresses;
+ for (User *U : OldFn->users())
+ if (auto *BA = dyn_cast<BlockAddress>(U))
+ BlockAddresses.push_back(BA);
+ for (auto *BA : BlockAddresses)
+ BA->replaceAllUsesWith(BlockAddress::get(NewFn, BA->getBasicBlock()));
+
+ // Set of all "call-like" instructions that invoke the old function mapped
+ // to their new replacements.
+ SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs;
+
+ // Callback to create a new "call-like" instruction for a given one.
+ auto CallSiteReplacementCreator = [&](AbstractCallSite ACS) {
+ CallBase *OldCB = cast<CallBase>(ACS.getInstruction());
+ const AttributeList &OldCallAttributeList = OldCB->getAttributes();
+
+ // Collect the new argument operands for the replacement call site.
+ SmallVector<Value *, 16> NewArgOperands;
+ SmallVector<AttributeSet, 16> NewArgOperandAttributes;
+ for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) {
+ unsigned NewFirstArgNum = NewArgOperands.size();
+ (void)NewFirstArgNum; // only used inside assert.
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[OldArgNum]) {
+ if (ARI->ACSRepairCB)
+ ARI->ACSRepairCB(*ARI, ACS, NewArgOperands);
+ assert(ARI->getNumReplacementArgs() + NewFirstArgNum ==
+ NewArgOperands.size() &&
+                 "ACS repair callback did not provide as many operands as new "
+ "types were registered!");
+        // TODO: Expose the attribute set to the ACS repair callback
+ NewArgOperandAttributes.append(ARI->ReplacementTypes.size(),
+ AttributeSet());
+ } else {
+ NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum));
+ NewArgOperandAttributes.push_back(
+ OldCallAttributeList.getParamAttributes(OldArgNum));
+ }
+ }
+
+ assert(NewArgOperands.size() == NewArgOperandAttributes.size() &&
+ "Mismatch # argument operands vs. # argument operand attributes!");
+ assert(NewArgOperands.size() == NewFn->arg_size() &&
+ "Mismatch # argument operands vs. # function arguments!");
+
+ SmallVector<OperandBundleDef, 4> OperandBundleDefs;
+ OldCB->getOperandBundlesAsDefs(OperandBundleDefs);
+
+ // Create a new call or invoke instruction to replace the old one.
+ CallBase *NewCB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(OldCB)) {
+ NewCB =
+ InvokeInst::Create(NewFn, II->getNormalDest(), II->getUnwindDest(),
+ NewArgOperands, OperandBundleDefs, "", OldCB);
+ } else {
+ auto *NewCI = CallInst::Create(NewFn, NewArgOperands, OperandBundleDefs,
+ "", OldCB);
+ NewCI->setTailCallKind(cast<CallInst>(OldCB)->getTailCallKind());
+ NewCB = NewCI;
+ }
+
+ // Copy over various properties and the new attributes.
+ NewCB->copyMetadata(*OldCB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+ NewCB->setCallingConv(OldCB->getCallingConv());
+ NewCB->takeName(OldCB);
+ NewCB->setAttributes(AttributeList::get(
+ Ctx, OldCallAttributeList.getFnAttributes(),
+ OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes));
+
+ CallSitePairs.push_back({OldCB, NewCB});
+ return true;
+ };
+
+ // Use the CallSiteReplacementCreator to create replacement call sites.
+ bool AllCallSitesKnown;
+ bool Success = checkForAllCallSites(CallSiteReplacementCreator, *OldFn,
+ true, nullptr, AllCallSitesKnown);
+ (void)Success;
+ assert(Success && "Assumed call site replacement to succeed!");
+
+ // Rewire the arguments.
Argument *OldFnArgIt = OldFn->arg_begin();
Argument *NewFnArgIt = NewFn->arg_begin();
- for (unsigned OldArgNum = 0; OldArgNum < ARIs.size();
- ++OldArgNum, ++OldFnArgIt) {
- if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
- ARIs[OldArgNum]) {
- if (ARI->CalleeRepairCB)
- ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt);
- NewFnArgIt += ARI->ReplacementTypes.size();
- } else {
- NewFnArgIt->takeName(&*OldFnArgIt);
- OldFnArgIt->replaceAllUsesWith(&*NewFnArgIt);
- ++NewFnArgIt;
- }
- }
-
- // Eliminate the instructions *after* we visited all of them.
- for (auto &CallSitePair : CallSitePairs) {
- CallBase &OldCB = *CallSitePair.first;
- CallBase &NewCB = *CallSitePair.second;
- assert(OldCB.getType() == NewCB.getType() &&
- "Cannot handle call sites with different types!");
- ModifiedFns.insert(OldCB.getFunction());
- CGUpdater.replaceCallSite(OldCB, NewCB);
- OldCB.replaceAllUsesWith(&NewCB);
- OldCB.eraseFromParent();
- }
-
- // Replace the function in the call graph (if any).
- CGUpdater.replaceFunctionWith(*OldFn, *NewFn);
-
- // If the old function was modified and needed to be reanalyzed, the new one
- // does now.
- if (ModifiedFns.erase(OldFn))
- ModifiedFns.insert(NewFn);
-
- Changed = ChangeStatus::CHANGED;
- }
-
- return Changed;
-}
-
-void InformationCache::initializeInformationCache(const Function &CF,
- FunctionInfo &FI) {
- // As we do not modify the function here we can remove the const
-  // without breaking implicit assumptions. At the end of the day, we could
- // initialize the cache eagerly which would look the same to the users.
- Function &F = const_cast<Function &>(CF);
-
- // Walk all instructions to find interesting instructions that might be
- // queried by abstract attributes during their initialization or update.
- // This has to happen before we create attributes.
-
- for (Instruction &I : instructions(&F)) {
- bool IsInterestingOpcode = false;
-
- // To allow easy access to all instructions in a function with a given
- // opcode we store them in the InfoCache. As not all opcodes are interesting
- // to concrete attributes we only cache the ones that are as identified in
- // the following switch.
- // Note: There are no concrete attributes now so this is initially empty.
- switch (I.getOpcode()) {
- default:
- assert(!isa<CallBase>(&I) &&
- "New call base instruction type needs to be known in the "
- "Attributor.");
- break;
- case Instruction::Call:
- // Calls are interesting on their own, additionally:
- // For `llvm.assume` calls we also fill the KnowledgeMap as we find them.
- // For `must-tail` calls we remember the caller and callee.
- if (IntrinsicInst *Assume = dyn_cast<IntrinsicInst>(&I)) {
- if (Assume->getIntrinsicID() == Intrinsic::assume)
- fillMapFromAssume(*Assume, KnowledgeMap);
- } else if (cast<CallInst>(I).isMustTailCall()) {
- FI.ContainsMustTailCall = true;
- if (const Function *Callee = cast<CallInst>(I).getCalledFunction())
- getFunctionInfo(*Callee).CalledViaMustTail = true;
- }
- LLVM_FALLTHROUGH;
- case Instruction::CallBr:
- case Instruction::Invoke:
- case Instruction::CleanupRet:
- case Instruction::CatchSwitch:
- case Instruction::AtomicRMW:
- case Instruction::AtomicCmpXchg:
- case Instruction::Br:
- case Instruction::Resume:
- case Instruction::Ret:
- case Instruction::Load:
- // The alignment of a pointer is interesting for loads.
- case Instruction::Store:
- // The alignment of a pointer is interesting for stores.
- IsInterestingOpcode = true;
- }
- if (IsInterestingOpcode) {
- auto *&Insts = FI.OpcodeInstMap[I.getOpcode()];
- if (!Insts)
- Insts = new (Allocator) InstructionVectorTy();
- Insts->push_back(&I);
- }
- if (I.mayReadOrWriteMemory())
- FI.RWInsts.push_back(&I);
- }
-
- if (F.hasFnAttribute(Attribute::AlwaysInline) &&
- isInlineViable(F).isSuccess())
- InlineableFunctions.insert(&F);
-}
-
+ for (unsigned OldArgNum = 0; OldArgNum < ARIs.size();
+ ++OldArgNum, ++OldFnArgIt) {
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[OldArgNum]) {
+ if (ARI->CalleeRepairCB)
+ ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt);
+ NewFnArgIt += ARI->ReplacementTypes.size();
+ } else {
+ NewFnArgIt->takeName(&*OldFnArgIt);
+ OldFnArgIt->replaceAllUsesWith(&*NewFnArgIt);
+ ++NewFnArgIt;
+ }
+ }
+
+ // Eliminate the instructions *after* we visited all of them.
+ for (auto &CallSitePair : CallSitePairs) {
+ CallBase &OldCB = *CallSitePair.first;
+ CallBase &NewCB = *CallSitePair.second;
+ assert(OldCB.getType() == NewCB.getType() &&
+ "Cannot handle call sites with different types!");
+ ModifiedFns.insert(OldCB.getFunction());
+ CGUpdater.replaceCallSite(OldCB, NewCB);
+ OldCB.replaceAllUsesWith(&NewCB);
+ OldCB.eraseFromParent();
+ }
+
+ // Replace the function in the call graph (if any).
+ CGUpdater.replaceFunctionWith(*OldFn, *NewFn);
+
+ // If the old function was modified and needed to be reanalyzed, the new one
+ // does now.
+ if (ModifiedFns.erase(OldFn))
+ ModifiedFns.insert(NewFn);
+
+ Changed = ChangeStatus::CHANGED;
+ }
+
+ return Changed;
+}
+
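The core bookkeeping in rewriteFunctionSignatures() is per-argument: an ArgumentReplacementInfo either expands one old argument into several replacement types or lets it pass through unchanged, and the same walk is repeated at every call site. A sketch of just that expansion step, with string type names, Replacement, and buildNewSignature() as illustrative stand-ins rather than the real data structures.

// signature_rewrite_sketch.cpp -- illustrative pattern only.
#include <cstdio>
#include <string>
#include <vector>

// One entry per old argument: empty means "keep as is", otherwise the types
// that replace it (e.g. a pointer expanded into its loaded members).
using Replacement = std::vector<std::string>;

std::vector<std::string>
buildNewSignature(const std::vector<std::string> &OldArgs,
                  const std::vector<Replacement> &ARIs) {
  std::vector<std::string> NewArgs;
  for (size_t I = 0; I < OldArgs.size(); ++I) {
    if (!ARIs[I].empty())
      NewArgs.insert(NewArgs.end(), ARIs[I].begin(), ARIs[I].end());
    else
      NewArgs.push_back(OldArgs[I]); // untouched argument carries over
  }
  return NewArgs;
}

int main() {
  std::vector<std::string> OldArgs = {"i32", "ptr", "float"};
  std::vector<Replacement> ARIs = {{}, {"i32", "i64"}, {}}; // expand arg #1
  for (const std::string &T : buildNewSignature(OldArgs, ARIs))
    std::printf("%s ", T.c_str());
  std::printf("\n");
}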
+void InformationCache::initializeInformationCache(const Function &CF,
+ FunctionInfo &FI) {
+ // As we do not modify the function here we can remove the const
+  // without breaking implicit assumptions. At the end of the day, we could
+ // initialize the cache eagerly which would look the same to the users.
+ Function &F = const_cast<Function &>(CF);
+
+ // Walk all instructions to find interesting instructions that might be
+ // queried by abstract attributes during their initialization or update.
+ // This has to happen before we create attributes.
+
+ for (Instruction &I : instructions(&F)) {
+ bool IsInterestingOpcode = false;
+
+ // To allow easy access to all instructions in a function with a given
+ // opcode we store them in the InfoCache. As not all opcodes are interesting
+ // to concrete attributes we only cache the ones that are as identified in
+ // the following switch.
+ // Note: There are no concrete attributes now so this is initially empty.
+ switch (I.getOpcode()) {
+ default:
+ assert(!isa<CallBase>(&I) &&
+ "New call base instruction type needs to be known in the "
+ "Attributor.");
+ break;
+ case Instruction::Call:
+ // Calls are interesting on their own, additionally:
+ // For `llvm.assume` calls we also fill the KnowledgeMap as we find them.
+ // For `must-tail` calls we remember the caller and callee.
+ if (IntrinsicInst *Assume = dyn_cast<IntrinsicInst>(&I)) {
+ if (Assume->getIntrinsicID() == Intrinsic::assume)
+ fillMapFromAssume(*Assume, KnowledgeMap);
+ } else if (cast<CallInst>(I).isMustTailCall()) {
+ FI.ContainsMustTailCall = true;
+ if (const Function *Callee = cast<CallInst>(I).getCalledFunction())
+ getFunctionInfo(*Callee).CalledViaMustTail = true;
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::CallBr:
+ case Instruction::Invoke:
+ case Instruction::CleanupRet:
+ case Instruction::CatchSwitch:
+ case Instruction::AtomicRMW:
+ case Instruction::AtomicCmpXchg:
+ case Instruction::Br:
+ case Instruction::Resume:
+ case Instruction::Ret:
+ case Instruction::Load:
+ // The alignment of a pointer is interesting for loads.
+ case Instruction::Store:
+ // The alignment of a pointer is interesting for stores.
+ IsInterestingOpcode = true;
+ }
+ if (IsInterestingOpcode) {
+ auto *&Insts = FI.OpcodeInstMap[I.getOpcode()];
+ if (!Insts)
+ Insts = new (Allocator) InstructionVectorTy();
+ Insts->push_back(&I);
+ }
+ if (I.mayReadOrWriteMemory())
+ FI.RWInsts.push_back(&I);
+ }
+
+ if (F.hasFnAttribute(Attribute::AlwaysInline) &&
+ isInlineViable(F).isSuccess())
+ InlineableFunctions.insert(&F);
+}
+
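initializeInformationCache() walks every instruction once and buckets only the "interesting" opcodes into OpcodeInstMap, so a later query for, say, all calls is a single lookup instead of a rescan. A tiny sketch of that bucketing; Opcode, Inst, and the chosen set of opcodes are invented for illustration.

// opcode_cache_sketch.cpp -- illustrative pattern only.
#include <cstdio>
#include <map>
#include <vector>

enum class Opcode { Call, Load, Store, Add };
struct Inst { Opcode Op; int Id; };

int main() {
  std::vector<Inst> Body = {{Opcode::Add, 0}, {Opcode::Call, 1},
                            {Opcode::Load, 2}, {Opcode::Call, 3}};

  // Cache only the opcodes the analysis cares about, keyed by opcode.
  std::map<Opcode, std::vector<const Inst *>> OpcodeInstMap;
  for (const Inst &I : Body)
    if (I.Op == Opcode::Call || I.Op == Opcode::Load || I.Op == Opcode::Store)
      OpcodeInstMap[I.Op].push_back(&I);

  std::printf("cached %zu call instructions\n",
              OpcodeInstMap[Opcode::Call].size());
}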
AAResults *InformationCache::getAAResultsForFunction(const Function &F) {
return AG.getAnalysis<AAManager>(F);
}
-InformationCache::FunctionInfo::~FunctionInfo() {
- // The instruction vectors are allocated using a BumpPtrAllocator, we need to
- // manually destroy them.
- for (auto &It : OpcodeInstMap)
- It.getSecond()->~InstructionVectorTy();
-}
-
-void Attributor::recordDependence(const AbstractAttribute &FromAA,
- const AbstractAttribute &ToAA,
- DepClassTy DepClass) {
- // If we are outside of an update, thus before the actual fixpoint iteration
- // started (= when we create AAs), we do not track dependences because we will
- // put all AAs into the initial worklist anyway.
- if (DependenceStack.empty())
- return;
- if (FromAA.getState().isAtFixpoint())
- return;
- DependenceStack.back()->push_back({&FromAA, &ToAA, DepClass});
-}
-
-void Attributor::rememberDependences() {
- assert(!DependenceStack.empty() && "No dependences to remember!");
-
- for (DepInfo &DI : *DependenceStack.back()) {
- auto &DepAAs = const_cast<AbstractAttribute &>(*DI.FromAA).Deps;
- DepAAs.push_back(AbstractAttribute::DepTy(
- const_cast<AbstractAttribute *>(DI.ToAA), unsigned(DI.DepClass)));
- }
-}
-
-void Attributor::identifyDefaultAbstractAttributes(Function &F) {
- if (!VisitedFunctions.insert(&F).second)
- return;
- if (F.isDeclaration())
- return;
-
- // In non-module runs we need to look at the call sites of a function to
- // determine if it is part of a must-tail call edge. This will influence what
- // attributes we can derive.
- InformationCache::FunctionInfo &FI = InfoCache.getFunctionInfo(F);
- if (!isModulePass() && !FI.CalledViaMustTail) {
- for (const Use &U : F.uses())
- if (const auto *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U) && CB->isMustTailCall())
- FI.CalledViaMustTail = true;
- }
-
- IRPosition FPos = IRPosition::function(F);
-
- // Check for dead BasicBlocks in every function.
- // We need dead instruction detection because we do not want to deal with
- // broken IR in which SSA rules do not apply.
- getOrCreateAAFor<AAIsDead>(FPos);
-
- // Every function might be "will-return".
- getOrCreateAAFor<AAWillReturn>(FPos);
-
- // Every function might contain instructions that cause "undefined behavior".
- getOrCreateAAFor<AAUndefinedBehavior>(FPos);
-
- // Every function can be nounwind.
- getOrCreateAAFor<AANoUnwind>(FPos);
-
- // Every function might be marked "nosync"
- getOrCreateAAFor<AANoSync>(FPos);
-
- // Every function might be "no-free".
- getOrCreateAAFor<AANoFree>(FPos);
-
- // Every function might be "no-return".
- getOrCreateAAFor<AANoReturn>(FPos);
-
- // Every function might be "no-recurse".
- getOrCreateAAFor<AANoRecurse>(FPos);
-
- // Every function might be "readnone/readonly/writeonly/...".
- getOrCreateAAFor<AAMemoryBehavior>(FPos);
-
- // Every function can be "readnone/argmemonly/inaccessiblememonly/...".
- getOrCreateAAFor<AAMemoryLocation>(FPos);
-
- // Every function might be applicable for Heap-To-Stack conversion.
- if (EnableHeapToStack)
- getOrCreateAAFor<AAHeapToStack>(FPos);
-
- // Return attributes are only appropriate if the return type is non void.
- Type *ReturnType = F.getReturnType();
- if (!ReturnType->isVoidTy()) {
- // Argument attribute "returned" --- Create only one per function even
- // though it is an argument attribute.
- getOrCreateAAFor<AAReturnedValues>(FPos);
-
- IRPosition RetPos = IRPosition::returned(F);
-
- // Every returned value might be dead.
- getOrCreateAAFor<AAIsDead>(RetPos);
-
- // Every function might be simplified.
- getOrCreateAAFor<AAValueSimplify>(RetPos);
-
+InformationCache::FunctionInfo::~FunctionInfo() {
+ // The instruction vectors are allocated using a BumpPtrAllocator, we need to
+ // manually destroy them.
+ for (auto &It : OpcodeInstMap)
+ It.getSecond()->~InstructionVectorTy();
+}
+
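FunctionInfo::~FunctionInfo() has to call the instruction-vector destructors by hand because the vectors live in a BumpPtrAllocator, which reclaims memory in bulk without running destructors. The sketch below reproduces that obligation with a hand-rolled arena; Arena and InstructionVector are illustrative stand-ins, not the LLVM classes.

// arena_dtor_sketch.cpp -- illustrative pattern only.
#include <cstddef>
#include <cstdio>
#include <new>
#include <vector>

struct Arena { // stand-in for a bump-pointer allocator
  alignas(std::max_align_t) unsigned char Buffer[1024];
  std::size_t Offset = 0;
  void *allocate(std::size_t N) {
    void *P = Buffer + Offset;
    Offset += N;
    return P; // memory is reclaimed wholesale, never per object
  }
};

struct InstructionVector {
  std::vector<int> Insts;
  ~InstructionVector() { std::printf("destroying cached instruction vector\n"); }
};

int main() {
  Arena A;
  // Placement-new into the arena: because the arena frees memory in bulk, the
  // destructor is *not* run automatically and has to be invoked by hand,
  // much like FunctionInfo::~FunctionInfo does for its OpcodeInstMap entries.
  auto *IV = new (A.allocate(sizeof(InstructionVector))) InstructionVector();
  IV->Insts.push_back(42);
  IV->~InstructionVector(); // manual destruction before the arena goes away
}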
+void Attributor::recordDependence(const AbstractAttribute &FromAA,
+ const AbstractAttribute &ToAA,
+ DepClassTy DepClass) {
+ // If we are outside of an update, thus before the actual fixpoint iteration
+ // started (= when we create AAs), we do not track dependences because we will
+ // put all AAs into the initial worklist anyway.
+ if (DependenceStack.empty())
+ return;
+ if (FromAA.getState().isAtFixpoint())
+ return;
+ DependenceStack.back()->push_back({&FromAA, &ToAA, DepClass});
+}
+
+void Attributor::rememberDependences() {
+ assert(!DependenceStack.empty() && "No dependences to remember!");
+
+ for (DepInfo &DI : *DependenceStack.back()) {
+ auto &DepAAs = const_cast<AbstractAttribute &>(*DI.FromAA).Deps;
+ DepAAs.push_back(AbstractAttribute::DepTy(
+ const_cast<AbstractAttribute *>(DI.ToAA), unsigned(DI.DepClass)));
+ }
+}
+
+void Attributor::identifyDefaultAbstractAttributes(Function &F) {
+ if (!VisitedFunctions.insert(&F).second)
+ return;
+ if (F.isDeclaration())
+ return;
+
+ // In non-module runs we need to look at the call sites of a function to
+ // determine if it is part of a must-tail call edge. This will influence what
+ // attributes we can derive.
+ InformationCache::FunctionInfo &FI = InfoCache.getFunctionInfo(F);
+ if (!isModulePass() && !FI.CalledViaMustTail) {
+ for (const Use &U : F.uses())
+ if (const auto *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U) && CB->isMustTailCall())
+ FI.CalledViaMustTail = true;
+ }
+
+ IRPosition FPos = IRPosition::function(F);
+
+ // Check for dead BasicBlocks in every function.
+ // We need dead instruction detection because we do not want to deal with
+ // broken IR in which SSA rules do not apply.
+ getOrCreateAAFor<AAIsDead>(FPos);
+
+ // Every function might be "will-return".
+ getOrCreateAAFor<AAWillReturn>(FPos);
+
+ // Every function might contain instructions that cause "undefined behavior".
+ getOrCreateAAFor<AAUndefinedBehavior>(FPos);
+
+ // Every function can be nounwind.
+ getOrCreateAAFor<AANoUnwind>(FPos);
+
+ // Every function might be marked "nosync"
+ getOrCreateAAFor<AANoSync>(FPos);
+
+ // Every function might be "no-free".
+ getOrCreateAAFor<AANoFree>(FPos);
+
+ // Every function might be "no-return".
+ getOrCreateAAFor<AANoReturn>(FPos);
+
+ // Every function might be "no-recurse".
+ getOrCreateAAFor<AANoRecurse>(FPos);
+
+ // Every function might be "readnone/readonly/writeonly/...".
+ getOrCreateAAFor<AAMemoryBehavior>(FPos);
+
+ // Every function can be "readnone/argmemonly/inaccessiblememonly/...".
+ getOrCreateAAFor<AAMemoryLocation>(FPos);
+
+ // Every function might be applicable for Heap-To-Stack conversion.
+ if (EnableHeapToStack)
+ getOrCreateAAFor<AAHeapToStack>(FPos);
+
+ // Return attributes are only appropriate if the return type is non void.
+ Type *ReturnType = F.getReturnType();
+ if (!ReturnType->isVoidTy()) {
+ // Argument attribute "returned" --- Create only one per function even
+ // though it is an argument attribute.
+ getOrCreateAAFor<AAReturnedValues>(FPos);
+
+ IRPosition RetPos = IRPosition::returned(F);
+
+ // Every returned value might be dead.
+ getOrCreateAAFor<AAIsDead>(RetPos);
+
+ // Every function might be simplified.
+ getOrCreateAAFor<AAValueSimplify>(RetPos);
+
// Every returned value might be marked noundef.
getOrCreateAAFor<AANoUndef>(RetPos);
- if (ReturnType->isPointerTy()) {
-
- // Every function with pointer return type might be marked align.
- getOrCreateAAFor<AAAlign>(RetPos);
-
- // Every function with pointer return type might be marked nonnull.
- getOrCreateAAFor<AANonNull>(RetPos);
-
- // Every function with pointer return type might be marked noalias.
- getOrCreateAAFor<AANoAlias>(RetPos);
-
- // Every function with pointer return type might be marked
- // dereferenceable.
- getOrCreateAAFor<AADereferenceable>(RetPos);
- }
- }
-
- for (Argument &Arg : F.args()) {
- IRPosition ArgPos = IRPosition::argument(Arg);
-
- // Every argument might be simplified.
- getOrCreateAAFor<AAValueSimplify>(ArgPos);
-
- // Every argument might be dead.
- getOrCreateAAFor<AAIsDead>(ArgPos);
-
+ if (ReturnType->isPointerTy()) {
+
+ // Every function with pointer return type might be marked align.
+ getOrCreateAAFor<AAAlign>(RetPos);
+
+ // Every function with pointer return type might be marked nonnull.
+ getOrCreateAAFor<AANonNull>(RetPos);
+
+ // Every function with pointer return type might be marked noalias.
+ getOrCreateAAFor<AANoAlias>(RetPos);
+
+ // Every function with pointer return type might be marked
+ // dereferenceable.
+ getOrCreateAAFor<AADereferenceable>(RetPos);
+ }
+ }
+
+ for (Argument &Arg : F.args()) {
+ IRPosition ArgPos = IRPosition::argument(Arg);
+
+ // Every argument might be simplified.
+ getOrCreateAAFor<AAValueSimplify>(ArgPos);
+
+ // Every argument might be dead.
+ getOrCreateAAFor<AAIsDead>(ArgPos);
+
// Every argument might be marked noundef.
getOrCreateAAFor<AANoUndef>(ArgPos);
- if (Arg.getType()->isPointerTy()) {
- // Every argument with pointer type might be marked nonnull.
- getOrCreateAAFor<AANonNull>(ArgPos);
-
- // Every argument with pointer type might be marked noalias.
- getOrCreateAAFor<AANoAlias>(ArgPos);
-
- // Every argument with pointer type might be marked dereferenceable.
- getOrCreateAAFor<AADereferenceable>(ArgPos);
-
- // Every argument with pointer type might be marked align.
- getOrCreateAAFor<AAAlign>(ArgPos);
-
- // Every argument with pointer type might be marked nocapture.
- getOrCreateAAFor<AANoCapture>(ArgPos);
-
- // Every argument with pointer type might be marked
- // "readnone/readonly/writeonly/..."
- getOrCreateAAFor<AAMemoryBehavior>(ArgPos);
-
- // Every argument with pointer type might be marked nofree.
- getOrCreateAAFor<AANoFree>(ArgPos);
-
- // Every argument with pointer type might be privatizable (or promotable)
- getOrCreateAAFor<AAPrivatizablePtr>(ArgPos);
- }
- }
-
- auto CallSitePred = [&](Instruction &I) -> bool {
- auto &CB = cast<CallBase>(I);
- IRPosition CBRetPos = IRPosition::callsite_returned(CB);
-
-    // Call sites might be dead if they have no side effects and no live
- // users. The return value might be dead if there are no live users.
- getOrCreateAAFor<AAIsDead>(CBRetPos);
-
- Function *Callee = CB.getCalledFunction();
- // TODO: Even if the callee is not known now we might be able to simplify
- // the call/callee.
- if (!Callee)
- return true;
-
- // Skip declarations except if annotations on their call sites were
- // explicitly requested.
- if (!AnnotateDeclarationCallSites && Callee->isDeclaration() &&
- !Callee->hasMetadata(LLVMContext::MD_callback))
- return true;
-
- if (!Callee->getReturnType()->isVoidTy() && !CB.use_empty()) {
-
- IRPosition CBRetPos = IRPosition::callsite_returned(CB);
-
- // Call site return integer values might be limited by a constant range.
- if (Callee->getReturnType()->isIntegerTy())
- getOrCreateAAFor<AAValueConstantRange>(CBRetPos);
- }
-
- for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) {
-
- IRPosition CBArgPos = IRPosition::callsite_argument(CB, I);
-
- // Every call site argument might be dead.
- getOrCreateAAFor<AAIsDead>(CBArgPos);
-
- // Call site argument might be simplified.
- getOrCreateAAFor<AAValueSimplify>(CBArgPos);
-
+ if (Arg.getType()->isPointerTy()) {
+ // Every argument with pointer type might be marked nonnull.
+ getOrCreateAAFor<AANonNull>(ArgPos);
+
+ // Every argument with pointer type might be marked noalias.
+ getOrCreateAAFor<AANoAlias>(ArgPos);
+
+ // Every argument with pointer type might be marked dereferenceable.
+ getOrCreateAAFor<AADereferenceable>(ArgPos);
+
+ // Every argument with pointer type might be marked align.
+ getOrCreateAAFor<AAAlign>(ArgPos);
+
+ // Every argument with pointer type might be marked nocapture.
+ getOrCreateAAFor<AANoCapture>(ArgPos);
+
+ // Every argument with pointer type might be marked
+ // "readnone/readonly/writeonly/..."
+ getOrCreateAAFor<AAMemoryBehavior>(ArgPos);
+
+ // Every argument with pointer type might be marked nofree.
+ getOrCreateAAFor<AANoFree>(ArgPos);
+
+ // Every argument with pointer type might be privatizable (or promotable)
+ getOrCreateAAFor<AAPrivatizablePtr>(ArgPos);
+ }
+ }
+
+ auto CallSitePred = [&](Instruction &I) -> bool {
+ auto &CB = cast<CallBase>(I);
+ IRPosition CBRetPos = IRPosition::callsite_returned(CB);
+
+    // Call sites might be dead if they have no side effects and no live
+ // users. The return value might be dead if there are no live users.
+ getOrCreateAAFor<AAIsDead>(CBRetPos);
+
+ Function *Callee = CB.getCalledFunction();
+ // TODO: Even if the callee is not known now we might be able to simplify
+ // the call/callee.
+ if (!Callee)
+ return true;
+
+ // Skip declarations except if annotations on their call sites were
+ // explicitly requested.
+ if (!AnnotateDeclarationCallSites && Callee->isDeclaration() &&
+ !Callee->hasMetadata(LLVMContext::MD_callback))
+ return true;
+
+ if (!Callee->getReturnType()->isVoidTy() && !CB.use_empty()) {
+
+ IRPosition CBRetPos = IRPosition::callsite_returned(CB);
+
+ // Call site return integer values might be limited by a constant range.
+ if (Callee->getReturnType()->isIntegerTy())
+ getOrCreateAAFor<AAValueConstantRange>(CBRetPos);
+ }
+
+ for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) {
+
+ IRPosition CBArgPos = IRPosition::callsite_argument(CB, I);
+
+ // Every call site argument might be dead.
+ getOrCreateAAFor<AAIsDead>(CBArgPos);
+
+ // Call site argument might be simplified.
+ getOrCreateAAFor<AAValueSimplify>(CBArgPos);
+
// Every call site argument might be marked "noundef".
getOrCreateAAFor<AANoUndef>(CBArgPos);
- if (!CB.getArgOperand(I)->getType()->isPointerTy())
- continue;
-
- // Call site argument attribute "non-null".
- getOrCreateAAFor<AANonNull>(CBArgPos);
-
- // Call site argument attribute "nocapture".
- getOrCreateAAFor<AANoCapture>(CBArgPos);
-
- // Call site argument attribute "no-alias".
- getOrCreateAAFor<AANoAlias>(CBArgPos);
-
- // Call site argument attribute "dereferenceable".
- getOrCreateAAFor<AADereferenceable>(CBArgPos);
-
- // Call site argument attribute "align".
- getOrCreateAAFor<AAAlign>(CBArgPos);
-
- // Call site argument attribute
- // "readnone/readonly/writeonly/..."
- getOrCreateAAFor<AAMemoryBehavior>(CBArgPos);
-
- // Call site argument attribute "nofree".
- getOrCreateAAFor<AANoFree>(CBArgPos);
- }
- return true;
- };
-
- auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F);
- bool Success;
- Success = checkForAllInstructionsImpl(
- nullptr, OpcodeInstMap, CallSitePred, nullptr, nullptr,
- {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
- (unsigned)Instruction::Call});
- (void)Success;
- assert(Success && "Expected the check call to be successful!");
-
- auto LoadStorePred = [&](Instruction &I) -> bool {
- if (isa<LoadInst>(I))
- getOrCreateAAFor<AAAlign>(
- IRPosition::value(*cast<LoadInst>(I).getPointerOperand()));
- else
- getOrCreateAAFor<AAAlign>(
- IRPosition::value(*cast<StoreInst>(I).getPointerOperand()));
- return true;
- };
- Success = checkForAllInstructionsImpl(
- nullptr, OpcodeInstMap, LoadStorePred, nullptr, nullptr,
- {(unsigned)Instruction::Load, (unsigned)Instruction::Store});
- (void)Success;
- assert(Success && "Expected the check call to be successful!");
-}
-
-/// Helpers to ease debugging through output streams and print calls.
-///
-///{
-raw_ostream &llvm::operator<<(raw_ostream &OS, ChangeStatus S) {
- return OS << (S == ChangeStatus::CHANGED ? "changed" : "unchanged");
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) {
- switch (AP) {
- case IRPosition::IRP_INVALID:
- return OS << "inv";
- case IRPosition::IRP_FLOAT:
- return OS << "flt";
- case IRPosition::IRP_RETURNED:
- return OS << "fn_ret";
- case IRPosition::IRP_CALL_SITE_RETURNED:
- return OS << "cs_ret";
- case IRPosition::IRP_FUNCTION:
- return OS << "fn";
- case IRPosition::IRP_CALL_SITE:
- return OS << "cs";
- case IRPosition::IRP_ARGUMENT:
- return OS << "arg";
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- return OS << "cs_arg";
- }
- llvm_unreachable("Unknown attribute position!");
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) {
- const Value &AV = Pos.getAssociatedValue();
- return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " ["
+ if (!CB.getArgOperand(I)->getType()->isPointerTy())
+ continue;
+
+ // Call site argument attribute "non-null".
+ getOrCreateAAFor<AANonNull>(CBArgPos);
+
+ // Call site argument attribute "nocapture".
+ getOrCreateAAFor<AANoCapture>(CBArgPos);
+
+ // Call site argument attribute "no-alias".
+ getOrCreateAAFor<AANoAlias>(CBArgPos);
+
+ // Call site argument attribute "dereferenceable".
+ getOrCreateAAFor<AADereferenceable>(CBArgPos);
+
+ // Call site argument attribute "align".
+ getOrCreateAAFor<AAAlign>(CBArgPos);
+
+ // Call site argument attribute
+ // "readnone/readonly/writeonly/..."
+ getOrCreateAAFor<AAMemoryBehavior>(CBArgPos);
+
+ // Call site argument attribute "nofree".
+ getOrCreateAAFor<AANoFree>(CBArgPos);
+ }
+ return true;
+ };
+
+ auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F);
+ bool Success;
+ Success = checkForAllInstructionsImpl(
+ nullptr, OpcodeInstMap, CallSitePred, nullptr, nullptr,
+ {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
+ (unsigned)Instruction::Call});
+ (void)Success;
+ assert(Success && "Expected the check call to be successful!");
+
+ auto LoadStorePred = [&](Instruction &I) -> bool {
+ if (isa<LoadInst>(I))
+ getOrCreateAAFor<AAAlign>(
+ IRPosition::value(*cast<LoadInst>(I).getPointerOperand()));
+ else
+ getOrCreateAAFor<AAAlign>(
+ IRPosition::value(*cast<StoreInst>(I).getPointerOperand()));
+ return true;
+ };
+ Success = checkForAllInstructionsImpl(
+ nullptr, OpcodeInstMap, LoadStorePred, nullptr, nullptr,
+ {(unsigned)Instruction::Load, (unsigned)Instruction::Store});
+ (void)Success;
+ assert(Success && "Expected the check call to be successful!");
+}
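// A minimal sketch of how the seeding above is typically driven (assuming a
// module M plus an InformationCache and CallGraphUpdater set up as in
// runAttributorOnFunctions further below): register the functions, seed the
// default abstract attributes, then run the fixpoint iteration.
//
//   SetVector<Function *> Functions;
//   for (Function &Fn : M)
//     Functions.insert(&Fn);
//   Attributor A(Functions, InfoCache, CGUpdater);
//   for (Function *Fn : Functions)
//     A.identifyDefaultAbstractAttributes(*Fn);
//   ChangeStatus Changed = A.run();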
+
+/// Helpers to ease debugging through output streams and print calls.
+///
+///{
+raw_ostream &llvm::operator<<(raw_ostream &OS, ChangeStatus S) {
+ return OS << (S == ChangeStatus::CHANGED ? "changed" : "unchanged");
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) {
+ switch (AP) {
+ case IRPosition::IRP_INVALID:
+ return OS << "inv";
+ case IRPosition::IRP_FLOAT:
+ return OS << "flt";
+ case IRPosition::IRP_RETURNED:
+ return OS << "fn_ret";
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ return OS << "cs_ret";
+ case IRPosition::IRP_FUNCTION:
+ return OS << "fn";
+ case IRPosition::IRP_CALL_SITE:
+ return OS << "cs";
+ case IRPosition::IRP_ARGUMENT:
+ return OS << "arg";
+ case IRPosition::IRP_CALL_SITE_ARGUMENT:
+ return OS << "cs_arg";
+ }
+ llvm_unreachable("Unknown attribute position!");
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) {
+ const Value &AV = Pos.getAssociatedValue();
+ return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " ["
<< Pos.getAnchorValue().getName() << "@" << Pos.getCallSiteArgNo()
<< "]}";
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) {
- OS << "range-state(" << S.getBitWidth() << ")<";
- S.getKnown().print(OS);
- OS << " / ";
- S.getAssumed().print(OS);
- OS << ">";
-
- return OS << static_cast<const AbstractState &>(S);
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) {
- return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? "fix" : ""));
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) {
- AA.print(OS);
- return OS;
-}
-
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) {
+ OS << "range-state(" << S.getBitWidth() << ")<";
+ S.getKnown().print(OS);
+ OS << " / ";
+ S.getAssumed().print(OS);
+ OS << ">";
+
+ return OS << static_cast<const AbstractState &>(S);
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) {
+ return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? "fix" : ""));
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) {
+ AA.print(OS);
+ return OS;
+}
+
raw_ostream &llvm::operator<<(raw_ostream &OS,
const PotentialConstantIntValuesState &S) {
OS << "set-state(< {";
@@ -2239,7 +2239,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS,
return OS;
}
-void AbstractAttribute::print(raw_ostream &OS) const {
+void AbstractAttribute::print(raw_ostream &OS) const {
OS << "[";
OS << getName();
OS << "] for CtxI ";
@@ -2253,7 +2253,7 @@ void AbstractAttribute::print(raw_ostream &OS) const {
OS << " at position " << getIRPosition() << " with state " << getAsStr()
<< '\n';
-}
+}
void AbstractAttribute::printWithDeps(raw_ostream &OS) const {
print(OS);
@@ -2266,32 +2266,32 @@ void AbstractAttribute::printWithDeps(raw_ostream &OS) const {
OS << '\n';
}
-///}
-
-/// ----------------------------------------------------------------------------
-/// Pass (Manager) Boilerplate
-/// ----------------------------------------------------------------------------
-
-static bool runAttributorOnFunctions(InformationCache &InfoCache,
- SetVector<Function *> &Functions,
- AnalysisGetter &AG,
- CallGraphUpdater &CGUpdater) {
- if (Functions.empty())
- return false;
-
- LLVM_DEBUG(dbgs() << "[Attributor] Run on module with " << Functions.size()
- << " functions.\n");
-
- // Create an Attributor and initially empty information cache that is filled
- // while we identify default attribute opportunities.
- Attributor A(Functions, InfoCache, CGUpdater);
-
- // Create shallow wrappers for all functions that are not IPO amendable
- if (AllowShallowWrappers)
- for (Function *F : Functions)
- if (!A.isFunctionIPOAmendable(*F))
+///}
+
+/// ----------------------------------------------------------------------------
+/// Pass (Manager) Boilerplate
+/// ----------------------------------------------------------------------------
+
+static bool runAttributorOnFunctions(InformationCache &InfoCache,
+ SetVector<Function *> &Functions,
+ AnalysisGetter &AG,
+ CallGraphUpdater &CGUpdater) {
+ if (Functions.empty())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Run on module with " << Functions.size()
+ << " functions.\n");
+
+ // Create an Attributor and initially empty information cache that is filled
+ // while we identify default attribute opportunities.
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ // Create shallow wrappers for all functions that are not IPO amendable
+ if (AllowShallowWrappers)
+ for (Function *F : Functions)
+ if (!A.isFunctionIPOAmendable(*F))
Attributor::createShallowWrapper(*F);
-
+
// Internalize non-exact functions
// TODO: for now we eagerly internalize functions without calculating the
// cost, we need a cost interface to determine whether internalizing
@@ -2316,36 +2316,36 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
}
}
- for (Function *F : Functions) {
- if (F->hasExactDefinition())
- NumFnWithExactDefinition++;
- else
- NumFnWithoutExactDefinition++;
-
- // We look at internal functions only on-demand but if any use is not a
+ for (Function *F : Functions) {
+ if (F->hasExactDefinition())
+ NumFnWithExactDefinition++;
+ else
+ NumFnWithoutExactDefinition++;
+
+ // We look at internal functions only on-demand but if any use is not a
// direct call or outside the current set of analyzed functions, we have
// to do it eagerly.
- if (F->hasLocalLinkage()) {
- if (llvm::all_of(F->uses(), [&Functions](const Use &U) {
- const auto *CB = dyn_cast<CallBase>(U.getUser());
- return CB && CB->isCallee(&U) &&
- Functions.count(const_cast<Function *>(CB->getCaller()));
- }))
- continue;
- }
-
- // Populate the Attributor with abstract attribute opportunities in the
- // function and the information cache with IR information.
- A.identifyDefaultAbstractAttributes(*F);
- }
-
- ChangeStatus Changed = A.run();
-
- LLVM_DEBUG(dbgs() << "[Attributor] Done with " << Functions.size()
- << " functions, result: " << Changed << ".\n");
- return Changed == ChangeStatus::CHANGED;
-}
-
+ if (F->hasLocalLinkage()) {
+ if (llvm::all_of(F->uses(), [&Functions](const Use &U) {
+ const auto *CB = dyn_cast<CallBase>(U.getUser());
+ return CB && CB->isCallee(&U) &&
+ Functions.count(const_cast<Function *>(CB->getCaller()));
+ }))
+ continue;
+ }
+
+ // Populate the Attributor with abstract attribute opportunities in the
+ // function and the information cache with IR information.
+ A.identifyDefaultAbstractAttributes(*F);
+ }
+
+ ChangeStatus Changed = A.run();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Done with " << Functions.size()
+ << " functions, result: " << Changed << ".\n");
+ return Changed == ChangeStatus::CHANGED;
+}
+
void AADepGraph::viewGraph() { llvm::ViewGraph(this, "Dependency Graph"); }
void AADepGraph::dumpGraph() {
@@ -2375,54 +2375,54 @@ void AADepGraph::print() {
cast<AbstractAttribute>(DepAA.getPointer())->printWithDeps(outs());
}
-PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- AnalysisGetter AG(FAM);
-
- SetVector<Function *> Functions;
- for (Function &F : M)
- Functions.insert(&F);
-
- CallGraphUpdater CGUpdater;
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
- if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
- // FIXME: Think about passes we will preserve and add them here.
- return PreservedAnalyses::none();
- }
- return PreservedAnalyses::all();
-}
-
-PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG,
- CGSCCUpdateResult &UR) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
- AnalysisGetter AG(FAM);
-
- SetVector<Function *> Functions;
- for (LazyCallGraph::Node &N : C)
- Functions.insert(&N.getFunction());
-
- if (Functions.empty())
- return PreservedAnalyses::all();
-
- Module &M = *Functions.back()->getParent();
- CallGraphUpdater CGUpdater;
- CGUpdater.initialize(CG, C, AM, UR);
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
- if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
- // FIXME: Think about passes we will preserve and add them here.
+PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ AnalysisGetter AG(FAM);
+
+ SetVector<Function *> Functions;
+ for (Function &F : M)
+ Functions.insert(&F);
+
+ CallGraphUpdater CGUpdater;
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
+ if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
+ // FIXME: Think about passes we will preserve and add them here.
+ return PreservedAnalyses::none();
+ }
+ return PreservedAnalyses::all();
+}
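// Usage sketch for the new-pass-manager entry point above (assuming a
// ModuleAnalysisManager MAM wired up through a PassBuilder):
//
//   ModulePassManager MPM;
//   MPM.addPass(AttributorPass());
//   MPM.run(M, MAM);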
+
+PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ AnalysisGetter AG(FAM);
+
+ SetVector<Function *> Functions;
+ for (LazyCallGraph::Node &N : C)
+ Functions.insert(&N.getFunction());
+
+ if (Functions.empty())
+ return PreservedAnalyses::all();
+
+ Module &M = *Functions.back()->getParent();
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, C, AM, UR);
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
+ if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
+ // FIXME: Think about passes we will preserve and add them here.
PreservedAnalyses PA;
PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
return PA;
- }
- return PreservedAnalyses::all();
-}
-
+ }
+ return PreservedAnalyses::all();
+}
+
namespace llvm {
template <> struct GraphTraits<AADepGraphNode *> {
@@ -2468,93 +2468,93 @@ template <> struct DOTGraphTraits<AADepGraph *> : public DefaultDOTGraphTraits {
} // end namespace llvm
-namespace {
-
-struct AttributorLegacyPass : public ModulePass {
- static char ID;
-
- AttributorLegacyPass() : ModulePass(ID) {
- initializeAttributorLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- AnalysisGetter AG;
- SetVector<Function *> Functions;
- for (Function &F : M)
- Functions.insert(&F);
-
- CallGraphUpdater CGUpdater;
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
- return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // FIXME: Think about passes we will preserve and add them here.
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-struct AttributorCGSCCLegacyPass : public CallGraphSCCPass {
- static char ID;
-
- AttributorCGSCCLegacyPass() : CallGraphSCCPass(ID) {
- initializeAttributorCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override {
- if (skipSCC(SCC))
- return false;
-
- SetVector<Function *> Functions;
- for (CallGraphNode *CGN : SCC)
- if (Function *Fn = CGN->getFunction())
- if (!Fn->isDeclaration())
- Functions.insert(Fn);
-
- if (Functions.empty())
- return false;
-
- AnalysisGetter AG;
- CallGraph &CG = const_cast<CallGraph &>(SCC.getCallGraph());
+namespace {
+
+struct AttributorLegacyPass : public ModulePass {
+ static char ID;
+
+ AttributorLegacyPass() : ModulePass(ID) {
+ initializeAttributorLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ AnalysisGetter AG;
+ SetVector<Function *> Functions;
+ for (Function &F : M)
+ Functions.insert(&F);
+
CallGraphUpdater CGUpdater;
- CGUpdater.initialize(CG, SCC);
- Module &M = *Functions.back()->getParent();
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
- return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // FIXME: Think about passes we will preserve and add them here.
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); }
-Pass *llvm::createAttributorCGSCCLegacyPass() {
- return new AttributorCGSCCLegacyPass();
-}
-
-char AttributorLegacyPass::ID = 0;
-char AttributorCGSCCLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor",
- "Deduce and propagate attributes", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AttributorLegacyPass, "attributor",
- "Deduce and propagate attributes", false, false)
-INITIALIZE_PASS_BEGIN(AttributorCGSCCLegacyPass, "attributor-cgscc",
- "Deduce and propagate attributes (CGSCC pass)", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(AttributorCGSCCLegacyPass, "attributor-cgscc",
- "Deduce and propagate attributes (CGSCC pass)", false,
- false)
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
+ return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // FIXME: Think about passes we will preserve and add them here.
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+struct AttributorCGSCCLegacyPass : public CallGraphSCCPass {
+ static char ID;
+
+ AttributorCGSCCLegacyPass() : CallGraphSCCPass(ID) {
+ initializeAttributorCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override {
+ if (skipSCC(SCC))
+ return false;
+
+ SetVector<Function *> Functions;
+ for (CallGraphNode *CGN : SCC)
+ if (Function *Fn = CGN->getFunction())
+ if (!Fn->isDeclaration())
+ Functions.insert(Fn);
+
+ if (Functions.empty())
+ return false;
+
+ AnalysisGetter AG;
+ CallGraph &CG = const_cast<CallGraph &>(SCC.getCallGraph());
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, SCC);
+ Module &M = *Functions.back()->getParent();
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
+ return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // FIXME: Think about passes we will preserve and add them here.
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); }
+Pass *llvm::createAttributorCGSCCLegacyPass() {
+ return new AttributorCGSCCLegacyPass();
+}
+
+char AttributorLegacyPass::ID = 0;
+char AttributorCGSCCLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor",
+ "Deduce and propagate attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AttributorLegacyPass, "attributor",
+ "Deduce and propagate attributes", false, false)
+INITIALIZE_PASS_BEGIN(AttributorCGSCCLegacyPass, "attributor-cgscc",
+ "Deduce and propagate attributes (CGSCC pass)", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(AttributorCGSCCLegacyPass, "attributor-cgscc",
+ "Deduce and propagate attributes (CGSCC pass)", false,
+ false)
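// Usage sketch for the legacy pass-manager wiring above (a hypothetical
// driver; a Module M is assumed):
//
//   legacy::PassManager PM;
//   PM.add(llvm::createAttributorLegacyPass());
//   PM.run(M);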
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp
index e83d2df7d2..d6127a8df6 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -1,52 +1,52 @@
-//===- AttributorAttributes.cpp - Attributes for Attributor deduction -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// See the Attributor.h file comment and the class descriptions in that file for
-// more information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Attributor.h"
-
+//===- AttributorAttributes.cpp - Attributes for Attributor deduction -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// See the Attributor.h file comment and the class descriptions in that file for
+// more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Attributor.h"
+
#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO/ArgumentPromotion.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <cassert>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "attributor"
-
-static cl::opt<bool> ManifestInternal(
- "attributor-manifest-internal", cl::Hidden,
- cl::desc("Manifest Attributor internal string attributes."),
- cl::init(false));
-
-static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128),
- cl::Hidden);
-
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "attributor"
+
+static cl::opt<bool> ManifestInternal(
+ "attributor-manifest-internal", cl::Hidden,
+ cl::desc("Manifest Attributor internal string attributes."),
+ cl::init(false));
+
+static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128),
+ cl::Hidden);
+
template <>
unsigned llvm::PotentialConstantIntValuesState::MaxPotentialValues = 0;
@@ -57,1655 +57,1655 @@ static cl::opt<unsigned, true> MaxPotentialValues(
cl::location(llvm::PotentialConstantIntValuesState::MaxPotentialValues),
cl::init(7));
-STATISTIC(NumAAs, "Number of abstract attributes created");
-
-// Some helper macros to deal with statistics tracking.
-//
-// Usage:
-// For simple IR attribute tracking overload trackStatistics in the abstract
-// attribute and choose the right STATS_DECLTRACK_********* macro,
-// e.g.,:
-// void trackStatistics() const override {
-// STATS_DECLTRACK_ARG_ATTR(returned)
-// }
-// If there is a single "increment" side one can use the macro
-// STATS_DECLTRACK with a custom message. If there are multiple increment
-// sides, STATS_DECL and STATS_TRACK can also be used separately.
-//
-#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \
- ("Number of " #TYPE " marked '" #NAME "'")
-#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME
-#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG);
-#define STATS_DECL(NAME, TYPE, MSG) \
- STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG);
-#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE));
-#define STATS_DECLTRACK(NAME, TYPE, MSG) \
- { \
- STATS_DECL(NAME, TYPE, MSG) \
- STATS_TRACK(NAME, TYPE) \
- }
-#define STATS_DECLTRACK_ARG_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME))
-#define STATS_DECLTRACK_CSARG_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CSArguments, \
- BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME))
-#define STATS_DECLTRACK_FN_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME))
-#define STATS_DECLTRACK_CS_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME))
-#define STATS_DECLTRACK_FNRET_ATTR(NAME) \
- STATS_DECLTRACK(NAME, FunctionReturn, \
- BUILD_STAT_MSG_IR_ATTR(function returns, NAME))
-#define STATS_DECLTRACK_CSRET_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CSReturn, \
- BUILD_STAT_MSG_IR_ATTR(call site returns, NAME))
-#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Floating, \
- ("Number of floating values known to be '" #NAME "'"))
-
-// Specialization of the operator<< for abstract attributes subclasses. This
-// disambiguates situations where multiple operators are applicable.
-namespace llvm {
-#define PIPE_OPERATOR(CLASS) \
- raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \
- return OS << static_cast<const AbstractAttribute &>(AA); \
- }
-
-PIPE_OPERATOR(AAIsDead)
-PIPE_OPERATOR(AANoUnwind)
-PIPE_OPERATOR(AANoSync)
-PIPE_OPERATOR(AANoRecurse)
-PIPE_OPERATOR(AAWillReturn)
-PIPE_OPERATOR(AANoReturn)
-PIPE_OPERATOR(AAReturnedValues)
-PIPE_OPERATOR(AANonNull)
-PIPE_OPERATOR(AANoAlias)
-PIPE_OPERATOR(AADereferenceable)
-PIPE_OPERATOR(AAAlign)
-PIPE_OPERATOR(AANoCapture)
-PIPE_OPERATOR(AAValueSimplify)
-PIPE_OPERATOR(AANoFree)
-PIPE_OPERATOR(AAHeapToStack)
-PIPE_OPERATOR(AAReachability)
-PIPE_OPERATOR(AAMemoryBehavior)
-PIPE_OPERATOR(AAMemoryLocation)
-PIPE_OPERATOR(AAValueConstantRange)
-PIPE_OPERATOR(AAPrivatizablePtr)
-PIPE_OPERATOR(AAUndefinedBehavior)
+STATISTIC(NumAAs, "Number of abstract attributes created");
+
+// Some helper macros to deal with statistics tracking.
+//
+// Usage:
+// For simple IR attribute tracking overload trackStatistics in the abstract
+// attribute and choose the right STATS_DECLTRACK_********* macro,
+// e.g.,:
+// void trackStatistics() const override {
+// STATS_DECLTRACK_ARG_ATTR(returned)
+// }
+// If there is a single "increment" side one can use the macro
+// STATS_DECLTRACK with a custom message. If there are multiple increment
+// sides, STATS_DECL and STATS_TRACK can also be used separately.
+//
+#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \
+ ("Number of " #TYPE " marked '" #NAME "'")
+#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME
+#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG);
+#define STATS_DECL(NAME, TYPE, MSG) \
+ STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG);
+#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE));
+#define STATS_DECLTRACK(NAME, TYPE, MSG) \
+ { \
+ STATS_DECL(NAME, TYPE, MSG) \
+ STATS_TRACK(NAME, TYPE) \
+ }
+#define STATS_DECLTRACK_ARG_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME))
+#define STATS_DECLTRACK_CSARG_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CSArguments, \
+ BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME))
+#define STATS_DECLTRACK_FN_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME))
+#define STATS_DECLTRACK_CS_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME))
+#define STATS_DECLTRACK_FNRET_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, FunctionReturn, \
+ BUILD_STAT_MSG_IR_ATTR(function returns, NAME))
+#define STATS_DECLTRACK_CSRET_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CSReturn, \
+ BUILD_STAT_MSG_IR_ATTR(call site returns, NAME))
+#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Floating, \
+ ("Number of floating values known to be '" #NAME "'"))
+
+// Specialization of the operator<< for abstract attributes subclasses. This
+// disambiguates situations where multiple operators are applicable.
+namespace llvm {
+#define PIPE_OPERATOR(CLASS) \
+ raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \
+ return OS << static_cast<const AbstractAttribute &>(AA); \
+ }
+
+PIPE_OPERATOR(AAIsDead)
+PIPE_OPERATOR(AANoUnwind)
+PIPE_OPERATOR(AANoSync)
+PIPE_OPERATOR(AANoRecurse)
+PIPE_OPERATOR(AAWillReturn)
+PIPE_OPERATOR(AANoReturn)
+PIPE_OPERATOR(AAReturnedValues)
+PIPE_OPERATOR(AANonNull)
+PIPE_OPERATOR(AANoAlias)
+PIPE_OPERATOR(AADereferenceable)
+PIPE_OPERATOR(AAAlign)
+PIPE_OPERATOR(AANoCapture)
+PIPE_OPERATOR(AAValueSimplify)
+PIPE_OPERATOR(AANoFree)
+PIPE_OPERATOR(AAHeapToStack)
+PIPE_OPERATOR(AAReachability)
+PIPE_OPERATOR(AAMemoryBehavior)
+PIPE_OPERATOR(AAMemoryLocation)
+PIPE_OPERATOR(AAValueConstantRange)
+PIPE_OPERATOR(AAPrivatizablePtr)
+PIPE_OPERATOR(AAUndefinedBehavior)
PIPE_OPERATOR(AAPotentialValues)
PIPE_OPERATOR(AANoUndef)
-
-#undef PIPE_OPERATOR
-} // namespace llvm
-
-namespace {
-
-static Optional<ConstantInt *>
-getAssumedConstantInt(Attributor &A, const Value &V,
- const AbstractAttribute &AA,
- bool &UsedAssumedInformation) {
- Optional<Constant *> C = A.getAssumedConstant(V, AA, UsedAssumedInformation);
- if (C.hasValue())
- return dyn_cast_or_null<ConstantInt>(C.getValue());
- return llvm::None;
-}
-
-/// Get pointer operand of memory accessing instruction. If \p I is
-/// not a memory accessing instruction, return nullptr. If \p AllowVolatile
-/// is set to false and the instruction is volatile, return nullptr.
-static const Value *getPointerOperand(const Instruction *I,
- bool AllowVolatile) {
- if (auto *LI = dyn_cast<LoadInst>(I)) {
- if (!AllowVolatile && LI->isVolatile())
- return nullptr;
- return LI->getPointerOperand();
- }
-
- if (auto *SI = dyn_cast<StoreInst>(I)) {
- if (!AllowVolatile && SI->isVolatile())
- return nullptr;
- return SI->getPointerOperand();
- }
-
- if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!AllowVolatile && CXI->isVolatile())
- return nullptr;
- return CXI->getPointerOperand();
- }
-
- if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) {
- if (!AllowVolatile && RMWI->isVolatile())
- return nullptr;
- return RMWI->getPointerOperand();
- }
-
- return nullptr;
-}
-
-/// Helper function to create a pointer of type \p ResTy, based on \p Ptr, and
-/// advanced by \p Offset bytes. To aid later analysis the method tries to build
-/// getelementptr instructions that traverse the natural type of \p Ptr if
-/// possible. If that fails, the remaining offset is adjusted byte-wise, hence
-/// through a cast to i8*.
-///
-/// TODO: This could probably live somewhere more prominently if it doesn't
-/// already exist.
-static Value *constructPointer(Type *ResTy, Value *Ptr, int64_t Offset,
- IRBuilder<NoFolder> &IRB, const DataLayout &DL) {
- assert(Offset >= 0 && "Negative offset not supported yet!");
- LLVM_DEBUG(dbgs() << "Construct pointer: " << *Ptr << " + " << Offset
- << "-bytes as " << *ResTy << "\n");
-
- // The initial type we are trying to traverse to get nice GEPs.
- Type *Ty = Ptr->getType();
-
- SmallVector<Value *, 4> Indices;
- std::string GEPName = Ptr->getName().str();
- while (Offset) {
- uint64_t Idx, Rem;
-
- if (auto *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- if (int64_t(SL->getSizeInBytes()) < Offset)
- break;
- Idx = SL->getElementContainingOffset(Offset);
- assert(Idx < STy->getNumElements() && "Offset calculation error!");
- Rem = Offset - SL->getElementOffset(Idx);
- Ty = STy->getElementType(Idx);
- } else if (auto *PTy = dyn_cast<PointerType>(Ty)) {
- Ty = PTy->getElementType();
- if (!Ty->isSized())
- break;
- uint64_t ElementSize = DL.getTypeAllocSize(Ty);
- assert(ElementSize && "Expected type with size!");
- Idx = Offset / ElementSize;
- Rem = Offset % ElementSize;
- } else {
- // Non-aggregate type, we cast and make byte-wise progress now.
- break;
- }
-
- LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset
- << " Idx: " << Idx << " Rem: " << Rem << "\n");
-
- GEPName += "." + std::to_string(Idx);
- Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx));
- Offset = Rem;
- }
-
- // Create a GEP if we collected indices above.
- if (Indices.size())
- Ptr = IRB.CreateGEP(Ptr, Indices, GEPName);
-
- // If an offset is left we use byte-wise adjustment.
- if (Offset) {
- Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy());
- Ptr = IRB.CreateGEP(Ptr, IRB.getInt32(Offset),
- GEPName + ".b" + Twine(Offset));
- }
-
- // Ensure the result has the requested type.
- Ptr = IRB.CreateBitOrPointerCast(Ptr, ResTy, Ptr->getName() + ".cast");
-
- LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n");
- return Ptr;
-}
-
-/// Recursively visit all values that might become \p IRP at some point. This
-/// will be done by looking through cast instructions, selects, phis, and calls
-/// with the "returned" attribute. Once we cannot look through the value any
-/// further, the callback \p VisitValueCB is invoked and passed the current
-/// value, the \p State, and a flag to indicate if we stripped anything.
-/// Stripped means that we unpacked the value associated with \p IRP at least
-/// once. Note that the value used for the callback may still be the value
-/// associated with \p IRP (due to PHIs). To limit how much effort is invested,
-/// we will never visit more values than specified by \p MaxValues.
-template <typename AAType, typename StateTy>
-static bool genericValueTraversal(
- Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State,
- function_ref<bool(Value &, const Instruction *, StateTy &, bool)>
- VisitValueCB,
- const Instruction *CtxI, bool UseValueSimplify = true, int MaxValues = 16,
- function_ref<Value *(Value *)> StripCB = nullptr) {
-
- const AAIsDead *LivenessAA = nullptr;
- if (IRP.getAnchorScope())
- LivenessAA = &A.getAAFor<AAIsDead>(
- QueryingAA, IRPosition::function(*IRP.getAnchorScope()),
- /* TrackDependence */ false);
- bool AnyDead = false;
-
- using Item = std::pair<Value *, const Instruction *>;
- SmallSet<Item, 16> Visited;
- SmallVector<Item, 16> Worklist;
- Worklist.push_back({&IRP.getAssociatedValue(), CtxI});
-
- int Iteration = 0;
- do {
- Item I = Worklist.pop_back_val();
- Value *V = I.first;
- CtxI = I.second;
- if (StripCB)
- V = StripCB(V);
-
- // Check if we should process the current value. To prevent endless
- // recursion keep a record of the values we followed!
- if (!Visited.insert(I).second)
- continue;
-
- // Make sure we limit the compile time for complex expressions.
- if (Iteration++ >= MaxValues)
- return false;
-
- // Explicitly look through calls with a "returned" attribute if we do
- // not have a pointer as stripPointerCasts only works on them.
- Value *NewV = nullptr;
- if (V->getType()->isPointerTy()) {
- NewV = V->stripPointerCasts();
- } else {
- auto *CB = dyn_cast<CallBase>(V);
- if (CB && CB->getCalledFunction()) {
- for (Argument &Arg : CB->getCalledFunction()->args())
- if (Arg.hasReturnedAttr()) {
- NewV = CB->getArgOperand(Arg.getArgNo());
- break;
- }
- }
- }
- if (NewV && NewV != V) {
- Worklist.push_back({NewV, CtxI});
- continue;
- }
-
- // Look through select instructions, visit both potential values.
- if (auto *SI = dyn_cast<SelectInst>(V)) {
- Worklist.push_back({SI->getTrueValue(), CtxI});
- Worklist.push_back({SI->getFalseValue(), CtxI});
- continue;
- }
-
- // Look through phi nodes, visit all live operands.
- if (auto *PHI = dyn_cast<PHINode>(V)) {
- assert(LivenessAA &&
- "Expected liveness in the presence of instructions!");
- for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
- BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
- if (A.isAssumedDead(*IncomingBB->getTerminator(), &QueryingAA,
- LivenessAA,
- /* CheckBBLivenessOnly */ true)) {
- AnyDead = true;
- continue;
- }
- Worklist.push_back(
- {PHI->getIncomingValue(u), IncomingBB->getTerminator()});
- }
- continue;
- }
-
- if (UseValueSimplify && !isa<Constant>(V)) {
- bool UsedAssumedInformation = false;
- Optional<Constant *> C =
- A.getAssumedConstant(*V, QueryingAA, UsedAssumedInformation);
- if (!C.hasValue())
- continue;
- if (Value *NewV = C.getValue()) {
- Worklist.push_back({NewV, CtxI});
- continue;
- }
- }
-
- // Once a leaf is reached we inform the user through the callback.
- if (!VisitValueCB(*V, CtxI, State, Iteration > 1))
- return false;
- } while (!Worklist.empty());
-
-  // If we actually used liveness information, we have to record a dependence.
- if (AnyDead)
- A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
-
- // All values have been visited.
- return true;
-}
-
-const Value *stripAndAccumulateMinimalOffsets(
- Attributor &A, const AbstractAttribute &QueryingAA, const Value *Val,
- const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
- bool UseAssumed = false) {
-
- auto AttributorAnalysis = [&](Value &V, APInt &ROffset) -> bool {
- const IRPosition &Pos = IRPosition::value(V);
- // Only track dependence if we are going to use the assumed info.
- const AAValueConstantRange &ValueConstantRangeAA =
- A.getAAFor<AAValueConstantRange>(QueryingAA, Pos,
- /* TrackDependence */ UseAssumed);
- ConstantRange Range = UseAssumed ? ValueConstantRangeAA.getAssumed()
- : ValueConstantRangeAA.getKnown();
- // We can only use the lower part of the range because the upper part can
- // be higher than what the value can really be.
- ROffset = Range.getSignedMin();
- return true;
- };
-
- return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds,
- AttributorAnalysis);
-}
-
-static const Value *getMinimalBaseOfAccsesPointerOperand(
- Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
- int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
- APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
- const Value *Base = stripAndAccumulateMinimalOffsets(
- A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
-
- BytesOffset = OffsetAPInt.getSExtValue();
- return Base;
-}
-
-static const Value *
-getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
- const DataLayout &DL,
- bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
-
- return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
- AllowNonInbounds);
-}
-
-/// Helper function to clamp a state \p S of type \p StateType with the
-/// information in \p R and indicate/return if \p S did change (as-in update is
-/// required to be run again).
-template <typename StateType>
-ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) {
- auto Assumed = S.getAssumed();
- S ^= R;
- return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
-}
-
-/// Clamp the information known for all returned values of a function
-/// (identified by \p QueryingAA) into \p S.
-template <typename AAType, typename StateType = typename AAType::StateType>
-static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA,
- StateType &S) {
- LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for "
- << QueryingAA << " into " << S << "\n");
-
- assert((QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_RETURNED ||
- QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_CALL_SITE_RETURNED) &&
- "Can only clamp returned value states for a function returned or call "
- "site returned position!");
-
- // Use an optional state as there might not be any return values and we want
- // to join (IntegerState::operator&) the state of all there are.
- Optional<StateType> T;
-
- // Callback for each possibly returned value.
- auto CheckReturnValue = [&](Value &RV) -> bool {
- const IRPosition &RVPos = IRPosition::value(RV);
- const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
- LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
- << " @ " << RVPos << "\n");
+
+#undef PIPE_OPERATOR
+} // namespace llvm
+
+namespace {
+
+static Optional<ConstantInt *>
+getAssumedConstantInt(Attributor &A, const Value &V,
+ const AbstractAttribute &AA,
+ bool &UsedAssumedInformation) {
+ Optional<Constant *> C = A.getAssumedConstant(V, AA, UsedAssumedInformation);
+ if (C.hasValue())
+ return dyn_cast_or_null<ConstantInt>(C.getValue());
+ return llvm::None;
+}
+
+/// Get pointer operand of memory accessing instruction. If \p I is
+/// not a memory accessing instruction, return nullptr. If \p AllowVolatile
+/// is set to false and the instruction is volatile, return nullptr.
+static const Value *getPointerOperand(const Instruction *I,
+ bool AllowVolatile) {
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (!AllowVolatile && LI->isVolatile())
+ return nullptr;
+ return LI->getPointerOperand();
+ }
+
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (!AllowVolatile && SI->isVolatile())
+ return nullptr;
+ return SI->getPointerOperand();
+ }
+
+ if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!AllowVolatile && CXI->isVolatile())
+ return nullptr;
+ return CXI->getPointerOperand();
+ }
+
+ if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ if (!AllowVolatile && RMWI->isVolatile())
+ return nullptr;
+ return RMWI->getPointerOperand();
+ }
+
+ return nullptr;
+}
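// Usage sketch (hypothetical instruction I): volatile accesses are skipped
// unless explicitly allowed.
//
//   if (const Value *Ptr = getPointerOperand(&I, /* AllowVolatile */ false))
//     analyzeAccess(Ptr); // analyzeAccess is a placeholder for the caller.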
+
+/// Helper function to create a pointer of type \p ResTy, based on \p Ptr, and
+/// advanced by \p Offset bytes. To aid later analysis the method tries to build
+/// getelementptr instructions that traverse the natural type of \p Ptr if
+/// possible. If that fails, the remaining offset is adjusted byte-wise, hence
+/// through a cast to i8*.
+///
+/// TODO: This could probably live somewhere more prominently if it doesn't
+/// already exist.
+static Value *constructPointer(Type *ResTy, Value *Ptr, int64_t Offset,
+ IRBuilder<NoFolder> &IRB, const DataLayout &DL) {
+ assert(Offset >= 0 && "Negative offset not supported yet!");
+ LLVM_DEBUG(dbgs() << "Construct pointer: " << *Ptr << " + " << Offset
+ << "-bytes as " << *ResTy << "\n");
+
+ // The initial type we are trying to traverse to get nice GEPs.
+ Type *Ty = Ptr->getType();
+
+ SmallVector<Value *, 4> Indices;
+ std::string GEPName = Ptr->getName().str();
+ while (Offset) {
+ uint64_t Idx, Rem;
+
+ if (auto *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (int64_t(SL->getSizeInBytes()) < Offset)
+ break;
+ Idx = SL->getElementContainingOffset(Offset);
+ assert(Idx < STy->getNumElements() && "Offset calculation error!");
+ Rem = Offset - SL->getElementOffset(Idx);
+ Ty = STy->getElementType(Idx);
+ } else if (auto *PTy = dyn_cast<PointerType>(Ty)) {
+ Ty = PTy->getElementType();
+ if (!Ty->isSized())
+ break;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ assert(ElementSize && "Expected type with size!");
+ Idx = Offset / ElementSize;
+ Rem = Offset % ElementSize;
+ } else {
+ // Non-aggregate type, we cast and make byte-wise progress now.
+ break;
+ }
+
+ LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset
+ << " Idx: " << Idx << " Rem: " << Rem << "\n");
+
+ GEPName += "." + std::to_string(Idx);
+ Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx));
+ Offset = Rem;
+ }
+
+ // Create a GEP if we collected indices above.
+ if (Indices.size())
+ Ptr = IRB.CreateGEP(Ptr, Indices, GEPName);
+
+ // If an offset is left we use byte-wise adjustment.
+ if (Offset) {
+ Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy());
+ Ptr = IRB.CreateGEP(Ptr, IRB.getInt32(Offset),
+ GEPName + ".b" + Twine(Offset));
+ }
+
+ // Ensure the result has the requested type.
+ Ptr = IRB.CreateBitOrPointerCast(Ptr, ResTy, Ptr->getName() + ".cast");
+
+ LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n");
+ return Ptr;
+}
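// Worked example (hypothetical types, typical 64-bit DataLayout): for Ptr of
// type { i32, i32, i64 }* and Offset == 8, the loop above yields GEP indices
// (0, 2), i.e. the i64 field, with no byte-wise fixup; for Offset == 10 the
// remaining 2 bytes are added via the i8* GEP fallback before the final cast
// to ResTy.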
+
+/// Recursively visit all values that might become \p IRP at some point. This
+/// will be done by looking through cast instructions, selects, phis, and calls
+/// with the "returned" attribute. Once we cannot look through the value any
+/// further, the callback \p VisitValueCB is invoked and passed the current
+/// value, the \p State, and a flag to indicate if we stripped anything.
+/// Stripped means that we unpacked the value associated with \p IRP at least
+/// once. Note that the value used for the callback may still be the value
+/// associated with \p IRP (due to PHIs). To limit how much effort is invested,
+/// we will never visit more values than specified by \p MaxValues.
+template <typename AAType, typename StateTy>
+static bool genericValueTraversal(
+ Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State,
+ function_ref<bool(Value &, const Instruction *, StateTy &, bool)>
+ VisitValueCB,
+ const Instruction *CtxI, bool UseValueSimplify = true, int MaxValues = 16,
+ function_ref<Value *(Value *)> StripCB = nullptr) {
+
+ const AAIsDead *LivenessAA = nullptr;
+ if (IRP.getAnchorScope())
+ LivenessAA = &A.getAAFor<AAIsDead>(
+ QueryingAA, IRPosition::function(*IRP.getAnchorScope()),
+ /* TrackDependence */ false);
+ bool AnyDead = false;
+
+ using Item = std::pair<Value *, const Instruction *>;
+ SmallSet<Item, 16> Visited;
+ SmallVector<Item, 16> Worklist;
+ Worklist.push_back({&IRP.getAssociatedValue(), CtxI});
+
+ int Iteration = 0;
+ do {
+ Item I = Worklist.pop_back_val();
+ Value *V = I.first;
+ CtxI = I.second;
+ if (StripCB)
+ V = StripCB(V);
+
+ // Check if we should process the current value. To prevent endless
+ // recursion keep a record of the values we followed!
+ if (!Visited.insert(I).second)
+ continue;
+
+ // Make sure we limit the compile time for complex expressions.
+ if (Iteration++ >= MaxValues)
+ return false;
+
+ // Explicitly look through calls with a "returned" attribute if we do
+ // not have a pointer as stripPointerCasts only works on them.
+ Value *NewV = nullptr;
+ if (V->getType()->isPointerTy()) {
+ NewV = V->stripPointerCasts();
+ } else {
+ auto *CB = dyn_cast<CallBase>(V);
+ if (CB && CB->getCalledFunction()) {
+ for (Argument &Arg : CB->getCalledFunction()->args())
+ if (Arg.hasReturnedAttr()) {
+ NewV = CB->getArgOperand(Arg.getArgNo());
+ break;
+ }
+ }
+ }
+ if (NewV && NewV != V) {
+ Worklist.push_back({NewV, CtxI});
+ continue;
+ }
+
+ // Look through select instructions, visit both potential values.
+ if (auto *SI = dyn_cast<SelectInst>(V)) {
+ Worklist.push_back({SI->getTrueValue(), CtxI});
+ Worklist.push_back({SI->getFalseValue(), CtxI});
+ continue;
+ }
+
+ // Look through phi nodes, visit all live operands.
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ assert(LivenessAA &&
+ "Expected liveness in the presence of instructions!");
+ for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
+ if (A.isAssumedDead(*IncomingBB->getTerminator(), &QueryingAA,
+ LivenessAA,
+ /* CheckBBLivenessOnly */ true)) {
+ AnyDead = true;
+ continue;
+ }
+ Worklist.push_back(
+ {PHI->getIncomingValue(u), IncomingBB->getTerminator()});
+ }
+ continue;
+ }
+
+ if (UseValueSimplify && !isa<Constant>(V)) {
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ A.getAssumedConstant(*V, QueryingAA, UsedAssumedInformation);
+ if (!C.hasValue())
+ continue;
+ if (Value *NewV = C.getValue()) {
+ Worklist.push_back({NewV, CtxI});
+ continue;
+ }
+ }
+
+ // Once a leaf is reached we inform the user through the callback.
+ if (!VisitValueCB(*V, CtxI, State, Iteration > 1))
+ return false;
+ } while (!Worklist.empty());
+
+  // If we actually used liveness information, we have to record a dependence.
+ if (AnyDead)
+ A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
+
+ // All values have been visited.
+ return true;
+}
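// Caller sketch (StateTy and AAType stand for whatever the querying abstract
// attribute uses; IRP, QueryingAA, S, and CtxI are the parameters named above):
//
//   auto VisitValueCB = [&](Value &V, const Instruction *CtxI, StateTy &S,
//                           bool Stripped) -> bool {
//     // Inspect the underlying value V and update S; returning false aborts
//     // the traversal early.
//     return true;
//   };
//   genericValueTraversal<AAType, StateTy>(A, IRP, QueryingAA, S,
//                                          VisitValueCB, CtxI);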
+
+const Value *stripAndAccumulateMinimalOffsets(
+ Attributor &A, const AbstractAttribute &QueryingAA, const Value *Val,
+ const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
+ bool UseAssumed = false) {
+
+ auto AttributorAnalysis = [&](Value &V, APInt &ROffset) -> bool {
+ const IRPosition &Pos = IRPosition::value(V);
+ // Only track dependence if we are going to use the assumed info.
+ const AAValueConstantRange &ValueConstantRangeAA =
+ A.getAAFor<AAValueConstantRange>(QueryingAA, Pos,
+ /* TrackDependence */ UseAssumed);
+ ConstantRange Range = UseAssumed ? ValueConstantRangeAA.getAssumed()
+ : ValueConstantRangeAA.getKnown();
+ // We can only use the lower part of the range because the upper part can
+ // be higher than what the value can really be.
+ ROffset = Range.getSignedMin();
+ return true;
+ };
+
+ return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds,
+ AttributorAnalysis);
+}
+
+static const Value *getMinimalBaseOfAccsesPointerOperand(
+ Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
+ int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
+ const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
+ if (!Ptr)
+ return nullptr;
+ APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ const Value *Base = stripAndAccumulateMinimalOffsets(
+ A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
+
+ BytesOffset = OffsetAPInt.getSExtValue();
+ return Base;
+}
+
+static const Value *
+getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
+ const DataLayout &DL,
+ bool AllowNonInbounds = false) {
+ const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
+ if (!Ptr)
+ return nullptr;
+
+ return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
+ AllowNonInbounds);
+}
+
+/// Helper function to clamp a state \p S of type \p StateType with the
+/// information in \p R and indicate/return if \p S did change (as-in update is
+/// required to be run again).
+template <typename StateType>
+ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) {
+ auto Assumed = S.getAssumed();
+ S ^= R;
+ return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+}
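// Example (mirroring the updateImpl methods below): clamp a freshly computed
// state S into the attribute's current state and report whether it changed.
//
//   ChangeStatus Changed =
//       clampStateAndIndicateChange<StateType>(this->getState(), S);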
+
+/// Clamp the information known for all returned values of a function
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA,
+ StateType &S) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for "
+ << QueryingAA << " into " << S << "\n");
+
+ assert((QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_RETURNED ||
+ QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_CALL_SITE_RETURNED) &&
+ "Can only clamp returned value states for a function returned or call "
+ "site returned position!");
+
+ // Use an optional state as there might not be any return values and we want
+ // to join (IntegerState::operator&) the state of all there are.
+ Optional<StateType> T;
+
+ // Callback for each possibly returned value.
+ auto CheckReturnValue = [&](Value &RV) -> bool {
+ const IRPosition &RVPos = IRPosition::value(RV);
+ const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
+ LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
+ << " @ " << RVPos << "\n");
const StateType &AAS = AA.getState();
- if (T.hasValue())
- *T &= AAS;
- else
- T = AAS;
- LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
- << "\n");
- return T->isValidState();
- };
-
- if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
- S.indicatePessimisticFixpoint();
- else if (T.hasValue())
- S ^= *T;
-}
-
-/// Helper class for generic deduction: return value -> returned position.
-template <typename AAType, typename BaseType,
- typename StateType = typename BaseType::StateType>
-struct AAReturnedFromReturnedValues : public BaseType {
- AAReturnedFromReturnedValues(const IRPosition &IRP, Attributor &A)
- : BaseType(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- StateType S(StateType::getBestState(this->getState()));
- clampReturnedValueStates<AAType, StateType>(A, *this, S);
- // TODO: If we know we visited all returned values, thus none are assumed
- // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange<StateType>(this->getState(), S);
- }
-};
-
-/// Clamp the information known at all call sites for a given argument
-/// (identified by \p QueryingAA) into \p S.
-template <typename AAType, typename StateType = typename AAType::StateType>
-static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
- StateType &S) {
- LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
- << QueryingAA << " into " << S << "\n");
-
- assert(QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_ARGUMENT &&
- "Can only clamp call site argument states for an argument position!");
-
- // Use an optional state as there might not be any call sites and we want
- // to join (IntegerState::operator&) the states of all there are.
- Optional<StateType> T;
-
- // The argument number which is also the call site argument number.
+ if (T.hasValue())
+ *T &= AAS;
+ else
+ T = AAS;
+ LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
+ << "\n");
+ return T->isValidState();
+ };
+
+ if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
+ S.indicatePessimisticFixpoint();
+ else if (T.hasValue())
+ S ^= *T;
+}
+
+/// Helper class for generic deduction: return value -> returned position.
+template <typename AAType, typename BaseType,
+ typename StateType = typename BaseType::StateType>
+struct AAReturnedFromReturnedValues : public BaseType {
+ AAReturnedFromReturnedValues(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ StateType S(StateType::getBestState(this->getState()));
+ clampReturnedValueStates<AAType, StateType>(A, *this, S);
+ // TODO: If we know we visited all returned values, thus none are assumed
+ // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange<StateType>(this->getState(), S);
+ }
+};
+
+/// Clamp the information known at all call sites for a given argument
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
+ StateType &S) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
+ << QueryingAA << " into " << S << "\n");
+
+ assert(QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_ARGUMENT &&
+ "Can only clamp call site argument states for an argument position!");
+
+ // Use an optional state as there might not be any call sites and we want
+ // to join (IntegerState::operator&) the states of all there are.
+ Optional<StateType> T;
+
+ // The argument number which is also the call site argument number.
unsigned ArgNo = QueryingAA.getIRPosition().getCallSiteArgNo();
-
- auto CallSiteCheck = [&](AbstractCallSite ACS) {
- const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
- // Check if a corresponding argument was found or if it is not associated
- // (which can happen for callback calls).
- if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
- return false;
-
- const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos);
- LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction()
- << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n");
+
+ auto CallSiteCheck = [&](AbstractCallSite ACS) {
+ const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+ // Check if a corresponding argument was found or if it is not associated
+ // (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos);
+ LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction()
+ << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n");
const StateType &AAS = AA.getState();
- if (T.hasValue())
- *T &= AAS;
- else
- T = AAS;
- LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T
- << "\n");
- return T->isValidState();
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true,
- AllCallSitesKnown))
- S.indicatePessimisticFixpoint();
- else if (T.hasValue())
- S ^= *T;
-}
-
-/// Helper class for generic deduction: call site argument -> argument position.
-template <typename AAType, typename BaseType,
- typename StateType = typename AAType::StateType>
-struct AAArgumentFromCallSiteArguments : public BaseType {
- AAArgumentFromCallSiteArguments(const IRPosition &IRP, Attributor &A)
- : BaseType(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- StateType S(StateType::getBestState(this->getState()));
- clampCallSiteArgumentStates<AAType, StateType>(A, *this, S);
- // TODO: If we know we visited all incoming values, thus none are assumed
- // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange<StateType>(this->getState(), S);
- }
-};
-
-/// Helper class for generic replication: function returned -> cs returned.
-template <typename AAType, typename BaseType,
- typename StateType = typename BaseType::StateType>
-struct AACallSiteReturnedFromReturned : public BaseType {
- AACallSiteReturnedFromReturned(const IRPosition &IRP, Attributor &A)
- : BaseType(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- assert(this->getIRPosition().getPositionKind() ==
- IRPosition::IRP_CALL_SITE_RETURNED &&
- "Can only wrap function returned positions for call site returned "
- "positions!");
- auto &S = this->getState();
-
- const Function *AssociatedFunction =
- this->getIRPosition().getAssociatedFunction();
- if (!AssociatedFunction)
- return S.indicatePessimisticFixpoint();
-
- IRPosition FnPos = IRPosition::returned(*AssociatedFunction);
- const AAType &AA = A.getAAFor<AAType>(*this, FnPos);
+ if (T.hasValue())
+ *T &= AAS;
+ else
+ T = AAS;
+ LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T
+ << "\n");
+ return T->isValidState();
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true,
+ AllCallSitesKnown))
+ S.indicatePessimisticFixpoint();
+ else if (T.hasValue())
+ S ^= *T;
+}
+
+/// Helper class for generic deduction: call site argument -> argument position.
+template <typename AAType, typename BaseType,
+ typename StateType = typename AAType::StateType>
+struct AAArgumentFromCallSiteArguments : public BaseType {
+ AAArgumentFromCallSiteArguments(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ StateType S(StateType::getBestState(this->getState()));
+ clampCallSiteArgumentStates<AAType, StateType>(A, *this, S);
+ // TODO: If we know we visited all incoming values, thus none are assumed
+ // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange<StateType>(this->getState(), S);
+ }
+};
+
+/// Helper class for generic replication: function returned -> cs returned.
+template <typename AAType, typename BaseType,
+ typename StateType = typename BaseType::StateType>
+struct AACallSiteReturnedFromReturned : public BaseType {
+ AACallSiteReturnedFromReturned(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ assert(this->getIRPosition().getPositionKind() ==
+ IRPosition::IRP_CALL_SITE_RETURNED &&
+ "Can only wrap function returned positions for call site returned "
+ "positions!");
+ auto &S = this->getState();
+
+ const Function *AssociatedFunction =
+ this->getIRPosition().getAssociatedFunction();
+ if (!AssociatedFunction)
+ return S.indicatePessimisticFixpoint();
+
+ IRPosition FnPos = IRPosition::returned(*AssociatedFunction);
+ const AAType &AA = A.getAAFor<AAType>(*this, FnPos);
return clampStateAndIndicateChange(S, AA.getState());
- }
-};
-
-/// Helper function to accumulate uses.
-template <class AAType, typename StateType = typename AAType::StateType>
-static void followUsesInContext(AAType &AA, Attributor &A,
- MustBeExecutedContextExplorer &Explorer,
- const Instruction *CtxI,
- SetVector<const Use *> &Uses,
- StateType &State) {
- auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI);
- for (unsigned u = 0; u < Uses.size(); ++u) {
- const Use *U = Uses[u];
- if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) {
- bool Found = Explorer.findInContextOf(UserI, EIt, EEnd);
- if (Found && AA.followUseInMBEC(A, U, UserI, State))
- for (const Use &Us : UserI->uses())
- Uses.insert(&Us);
- }
- }
-}
-
-/// Use the must-be-executed-context around \p I to add information into \p S.
-/// The AAType class is required to have a `followUseInMBEC` method with the
-/// following signature and behaviour:
-///
-/// bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I)
-/// U - Underlying use.
-/// I - The user of the \p U.
-/// Returns true if the value should be tracked transitively.
-///
-template <class AAType, typename StateType = typename AAType::StateType>
-static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
- Instruction &CtxI) {
-
- // Container for (transitive) uses of the associated value.
- SetVector<const Use *> Uses;
- for (const Use &U : AA.getIRPosition().getAssociatedValue().uses())
- Uses.insert(&U);
-
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
-
- followUsesInContext<AAType>(AA, A, Explorer, &CtxI, Uses, S);
-
- if (S.isAtFixpoint())
- return;
-
- SmallVector<const BranchInst *, 4> BrInsts;
- auto Pred = [&](const Instruction *I) {
- if (const BranchInst *Br = dyn_cast<BranchInst>(I))
- if (Br->isConditional())
- BrInsts.push_back(Br);
- return true;
- };
-
- // Here, accumulate conditional branch instructions in the context. We
- // explore the child paths and collect the known states. The disjunction of
- // those states can be merged to its own state. Let ParentState_i be a state
- // to indicate the known information for an i-th branch instruction in the
- // context. ChildStates are created for its successors respectively.
- //
- // ParentS_1 = ChildS_{1, 1} /\ ChildS_{1, 2} /\ ... /\ ChildS_{1, n_1}
- // ParentS_2 = ChildS_{2, 1} /\ ChildS_{2, 2} /\ ... /\ ChildS_{2, n_2}
- // ...
- // ParentS_m = ChildS_{m, 1} /\ ChildS_{m, 2} /\ ... /\ ChildS_{m, n_m}
- //
- // Known State |= ParentS_1 \/ ParentS_2 \/... \/ ParentS_m
- //
- // FIXME: Currently, recursive branches are not handled. For example, we
- // can't deduce that ptr must be dereferenced in below function.
- //
- // void f(int a, int b, int *ptr) {
- // if(a)
- // if (b) {
- // *ptr = 0;
- // } else {
- // *ptr = 1;
- // }
- // else {
- // if (b) {
- // *ptr = 0;
- // } else {
- // *ptr = 1;
- // }
- // }
- // }
-
- Explorer.checkForAllContext(&CtxI, Pred);
- for (const BranchInst *Br : BrInsts) {
- StateType ParentState;
-
- // The known state of the parent state is a conjunction of children's
- // known states so it is initialized with a best state.
- ParentState.indicateOptimisticFixpoint();
-
- for (const BasicBlock *BB : Br->successors()) {
- StateType ChildState;
-
- size_t BeforeSize = Uses.size();
- followUsesInContext(AA, A, Explorer, &BB->front(), Uses, ChildState);
-
- // Erase uses which only appear in the child.
- for (auto It = Uses.begin() + BeforeSize; It != Uses.end();)
- It = Uses.erase(It);
-
- ParentState &= ChildState;
- }
-
- // Use only known state.
- S += ParentState;
- }
-}
-
-/// -----------------------NoUnwind Function Attribute--------------------------
-
-struct AANoUnwindImpl : AANoUnwind {
- AANoUnwindImpl(const IRPosition &IRP, Attributor &A) : AANoUnwind(IRP, A) {}
-
- const std::string getAsStr() const override {
- return getAssumed() ? "nounwind" : "may-unwind";
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto Opcodes = {
- (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
- (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet,
- (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume};
-
- auto CheckForNoUnwind = [&](Instruction &I) {
- if (!I.mayThrow())
- return true;
-
- if (const auto *CB = dyn_cast<CallBase>(&I)) {
- const auto &NoUnwindAA =
- A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(*CB));
- return NoUnwindAA.isAssumedNoUnwind();
- }
- return false;
- };
-
- if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-};
-
-struct AANoUnwindFunction final : public AANoUnwindImpl {
- AANoUnwindFunction(const IRPosition &IRP, Attributor &A)
- : AANoUnwindImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) }
-};
-
-/// NoUnwind attribute deduction for call sites.
-struct AANoUnwindCallSite final : AANoUnwindImpl {
- AANoUnwindCallSite(const IRPosition &IRP, Attributor &A)
- : AANoUnwindImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoUnwindImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ }
+};
+
+/// Helper function to accumulate uses.
+template <class AAType, typename StateType = typename AAType::StateType>
+static void followUsesInContext(AAType &AA, Attributor &A,
+ MustBeExecutedContextExplorer &Explorer,
+ const Instruction *CtxI,
+ SetVector<const Use *> &Uses,
+ StateType &State) {
+ auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI);
+ for (unsigned u = 0; u < Uses.size(); ++u) {
+ const Use *U = Uses[u];
+ if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) {
+ bool Found = Explorer.findInContextOf(UserI, EIt, EEnd);
+ if (Found && AA.followUseInMBEC(A, U, UserI, State))
+ for (const Use &Us : UserI->uses())
+ Uses.insert(&Us);
+ }
+ }
+}
+
+/// Use the must-be-executed-context around \p I to add information into \p S.
+/// The AAType class is required to have a `followUseInMBEC` method with the
+/// following signature and behaviour:
+///
+/// bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I)
+/// U - Underlying use.
+/// I - The user of the \p U.
+/// Returns true if the value should be tracked transitively.
+///
+template <class AAType, typename StateType = typename AAType::StateType>
+static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
+ Instruction &CtxI) {
+
+ // Container for (transitive) uses of the associated value.
+ SetVector<const Use *> Uses;
+ for (const Use &U : AA.getIRPosition().getAssociatedValue().uses())
+ Uses.insert(&U);
+
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+
+ followUsesInContext<AAType>(AA, A, Explorer, &CtxI, Uses, S);
+
+ if (S.isAtFixpoint())
+ return;
+
+ SmallVector<const BranchInst *, 4> BrInsts;
+ auto Pred = [&](const Instruction *I) {
+ if (const BranchInst *Br = dyn_cast<BranchInst>(I))
+ if (Br->isConditional())
+ BrInsts.push_back(Br);
+ return true;
+ };
+
+ // Here, accumulate conditional branch instructions in the context. We
+ // explore the child paths and collect the known states. The disjunction of
+ // those states can be merged to its own state. Let ParentState_i be a state
+ // to indicate the known information for an i-th branch instruction in the
+ // context. ChildStates are created for its successors respectively.
+ //
+ // ParentS_1 = ChildS_{1, 1} /\ ChildS_{1, 2} /\ ... /\ ChildS_{1, n_1}
+ // ParentS_2 = ChildS_{2, 1} /\ ChildS_{2, 2} /\ ... /\ ChildS_{2, n_2}
+ // ...
+ // ParentS_m = ChildS_{m, 1} /\ ChildS_{m, 2} /\ ... /\ ChildS_{m, n_m}
+ //
+ // Known State |= ParentS_1 \/ ParentS_2 \/... \/ ParentS_m
+ //
+ // FIXME: Currently, recursive branches are not handled. For example, we
+ // can't deduce that ptr must be dereferenced in below function.
+ //
+ // void f(int a, int b, int *ptr) {
+ // if(a)
+ // if (b) {
+ // *ptr = 0;
+ // } else {
+ // *ptr = 1;
+ // }
+ // else {
+ // if (b) {
+ // *ptr = 0;
+ // } else {
+ // *ptr = 1;
+ // }
+ // }
+ // }
+
+ Explorer.checkForAllContext(&CtxI, Pred);
+ for (const BranchInst *Br : BrInsts) {
+ StateType ParentState;
+
+ // The known state of the parent state is a conjunction of children's
+ // known states so it is initialized with a best state.
+ ParentState.indicateOptimisticFixpoint();
+
+ for (const BasicBlock *BB : Br->successors()) {
+ StateType ChildState;
+
+ size_t BeforeSize = Uses.size();
+ followUsesInContext(AA, A, Explorer, &BB->front(), Uses, ChildState);
+
+ // Erase uses which only appear in the child.
+ for (auto It = Uses.begin() + BeforeSize; It != Uses.end();)
+ It = Uses.erase(It);
+
+ ParentState &= ChildState;
+ }
+
+ // Use only known state.
+ S += ParentState;
+ }
+}
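+
+// As a rough sketch (names and the state update are illustrative only), an
+// AAType that wants to be driven by followUsesInMBEC provides a member such as:
+//
+//   bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *UserI,
+//                        StateType &State) {
+//     // Fold whatever this use implies about the associated value into State,
+//     // then return true if the uses of UserI should be followed as well.
+//     return isa<LoadInst>(UserI) || isa<StoreInst>(UserI);
+//   }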
+
+/// -----------------------NoUnwind Function Attribute--------------------------
+
+struct AANoUnwindImpl : AANoUnwind {
+ AANoUnwindImpl(const IRPosition &IRP, Attributor &A) : AANoUnwind(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nounwind" : "may-unwind";
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto Opcodes = {
+ (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
+ (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet,
+ (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume};
+
+ auto CheckForNoUnwind = [&](Instruction &I) {
+ if (!I.mayThrow())
+ return true;
+
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const auto &NoUnwindAA =
+ A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(*CB));
+ return NoUnwindAA.isAssumedNoUnwind();
+ }
+ return false;
+ };
+
+ if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
+
+struct AANoUnwindFunction final : public AANoUnwindImpl {
+ AANoUnwindFunction(const IRPosition &IRP, Attributor &A)
+ : AANoUnwindImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) }
+};
+
+/// NoUnwind attribute deduction for call sites.
+struct AANoUnwindCallSite final : AANoUnwindImpl {
+ AANoUnwindCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoUnwindImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoUnwindImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); }
-};
-
-/// --------------------- Function Return Values -------------------------------
-
-/// "Attribute" that collects all potential returned values and the return
-/// instructions that they arise from.
-///
-/// If there is a unique returned value R, the manifest method will:
-/// - mark R with the "returned" attribute, if R is an argument.
-class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState {
-
- /// Mapping of values potentially returned by the associated function to the
- /// return instructions that might return them.
- MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues;
-
- /// Mapping to remember the number of returned values for a call site such
- /// that we can avoid updates if nothing changed.
- DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA;
-
- /// Set of unresolved calls returned by the associated function.
- SmallSetVector<CallBase *, 4> UnresolvedCalls;
-
- /// State flags
- ///
- ///{
- bool IsFixed = false;
- bool IsValidState = true;
- ///}
-
-public:
- AAReturnedValuesImpl(const IRPosition &IRP, Attributor &A)
- : AAReturnedValues(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // Reset the state.
- IsFixed = false;
- IsValidState = true;
- ReturnedValues.clear();
-
- Function *F = getAssociatedFunction();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); }
+};
+
+/// --------------------- Function Return Values -------------------------------
+
+/// "Attribute" that collects all potential returned values and the return
+/// instructions that they arise from.
+///
+/// If there is a unique returned value R, the manifest method will:
+/// - mark R with the "returned" attribute, if R is an argument.
+class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState {
+
+ /// Mapping of values potentially returned by the associated function to the
+ /// return instructions that might return them.
+ MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues;
+
+ /// Mapping to remember the number of returned values for a call site such
+ /// that we can avoid updates if nothing changed.
+ DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA;
+
+ /// Set of unresolved calls returned by the associated function.
+ SmallSetVector<CallBase *, 4> UnresolvedCalls;
+
+ /// State flags
+ ///
+ ///{
+ bool IsFixed = false;
+ bool IsValidState = true;
+ ///}
+
+public:
+ AAReturnedValuesImpl(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValues(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // Reset the state.
+ IsFixed = false;
+ IsValidState = true;
+ ReturnedValues.clear();
+
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration()) {
- indicatePessimisticFixpoint();
- return;
- }
- assert(!F->getReturnType()->isVoidTy() &&
- "Did not expect a void return type!");
-
- // The map from instruction opcodes to those instructions in the function.
- auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F);
-
- // Look through all arguments, if one is marked as returned we are done.
- for (Argument &Arg : F->args()) {
- if (Arg.hasReturnedAttr()) {
- auto &ReturnInstSet = ReturnedValues[&Arg];
- if (auto *Insts = OpcodeInstMap.lookup(Instruction::Ret))
- for (Instruction *RI : *Insts)
- ReturnInstSet.insert(cast<ReturnInst>(RI));
-
- indicateOptimisticFixpoint();
- return;
- }
- }
-
- if (!A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override;
-
- /// See AbstractAttribute::getState(...).
- AbstractState &getState() override { return *this; }
-
- /// See AbstractAttribute::getState(...).
- const AbstractState &getState() const override { return *this; }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- ChangeStatus updateImpl(Attributor &A) override;
-
- llvm::iterator_range<iterator> returned_values() override {
- return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
- }
-
- llvm::iterator_range<const_iterator> returned_values() const override {
- return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
- }
-
- const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override {
- return UnresolvedCalls;
- }
-
- /// Return the number of potential return values, -1 if unknown.
- size_t getNumReturnValues() const override {
- return isValidState() ? ReturnedValues.size() : -1;
- }
-
- /// Return an assumed unique return value if a single candidate is found. If
- /// there cannot be one, return a nullptr. If it is not clear yet, return the
- /// Optional::NoneType.
- Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const;
-
- /// See AbstractState::checkForAllReturnedValues(...).
- bool checkForAllReturnedValuesAndReturnInsts(
- function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
- const override;
-
- /// Pretty print the attribute similar to the IR representation.
- const std::string getAsStr() const override;
-
- /// See AbstractState::isAtFixpoint().
- bool isAtFixpoint() const override { return IsFixed; }
-
- /// See AbstractState::isValidState().
- bool isValidState() const override { return IsValidState; }
-
- /// See AbstractState::indicateOptimisticFixpoint(...).
- ChangeStatus indicateOptimisticFixpoint() override {
- IsFixed = true;
- return ChangeStatus::UNCHANGED;
- }
-
- ChangeStatus indicatePessimisticFixpoint() override {
- IsFixed = true;
- IsValidState = false;
- return ChangeStatus::CHANGED;
- }
-};
-
-ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- // Bookkeeping.
- assert(isValidState());
- STATS_DECLTRACK(KnownReturnValues, FunctionReturn,
- "Number of functions with known return values");
-
- // Check if we have an assumed unique return value that we could manifest.
- Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A);
-
- if (!UniqueRV.hasValue() || !UniqueRV.getValue())
- return Changed;
-
- // Bookkeeping.
- STATS_DECLTRACK(UniqueReturnValue, FunctionReturn,
- "Number of functions with unique return");
-
- // Callback to replace the uses of CB with the constant C.
- auto ReplaceCallSiteUsersWith = [&A](CallBase &CB, Constant &C) {
- if (CB.use_empty())
- return ChangeStatus::UNCHANGED;
- if (A.changeValueAfterManifest(CB, C))
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- };
-
- // If the assumed unique return value is an argument, annotate it.
- if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) {
- if (UniqueRVArg->getType()->canLosslesslyBitCastTo(
- getAssociatedFunction()->getReturnType())) {
- getIRPosition() = IRPosition::argument(*UniqueRVArg);
- Changed = IRAttribute::manifest(A);
- }
- } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) {
- // We can replace the returned value with the unique returned constant.
- Value &AnchorValue = getAnchorValue();
- if (Function *F = dyn_cast<Function>(&AnchorValue)) {
- for (const Use &U : F->uses())
- if (CallBase *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U)) {
- Constant *RVCCast =
- CB->getType() == RVC->getType()
- ? RVC
- : ConstantExpr::getTruncOrBitCast(RVC, CB->getType());
- Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed;
- }
- } else {
- assert(isa<CallBase>(AnchorValue) &&
- "Expected a function or call base anchor!");
- Constant *RVCCast =
- AnchorValue.getType() == RVC->getType()
- ? RVC
- : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType());
- Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast);
- }
- if (Changed == ChangeStatus::CHANGED)
- STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn,
- "Number of function returns replaced by constant return");
- }
-
- return Changed;
-}
-
-const std::string AAReturnedValuesImpl::getAsStr() const {
- return (isAtFixpoint() ? "returns(#" : "may-return(#") +
- (isValidState() ? std::to_string(getNumReturnValues()) : "?") +
- ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]";
-}
-
-Optional<Value *>
-AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const {
- // If checkForAllReturnedValues provides a unique value, ignoring potential
- // undef values that can also be present, it is assumed to be the actual
- // return value and forwarded to the caller of this method. If there are
- // multiple, a nullptr is returned indicating there cannot be a unique
- // returned value.
- Optional<Value *> UniqueRV;
-
- auto Pred = [&](Value &RV) -> bool {
- // If we found a second returned value and neither the current nor the saved
- // one is an undef, there is no unique returned value. Undefs are special
- // since we can pretend they have any value.
- if (UniqueRV.hasValue() && UniqueRV != &RV &&
- !(isa<UndefValue>(RV) || isa<UndefValue>(UniqueRV.getValue()))) {
- UniqueRV = nullptr;
- return false;
- }
-
- // Do not overwrite a value with an undef.
- if (!UniqueRV.hasValue() || !isa<UndefValue>(RV))
- UniqueRV = &RV;
-
- return true;
- };
-
- if (!A.checkForAllReturnedValues(Pred, *this))
- UniqueRV = nullptr;
-
- return UniqueRV;
-}
-
-bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts(
- function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
- const {
- if (!isValidState())
- return false;
-
- // Check all returned values but ignore call sites as long as we have not
- // encountered an overdefined one during an update.
- for (auto &It : ReturnedValues) {
- Value *RV = It.first;
-
- CallBase *CB = dyn_cast<CallBase>(RV);
- if (CB && !UnresolvedCalls.count(CB))
- continue;
-
- if (!Pred(*RV, It.second))
- return false;
- }
-
- return true;
-}
-
-ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {
- size_t NumUnresolvedCalls = UnresolvedCalls.size();
- bool Changed = false;
-
- // State used in the value traversals starting in returned values.
- struct RVState {
- // The map in which we collect return values -> return instrs.
- decltype(ReturnedValues) &RetValsMap;
- // The flag to indicate a change.
- bool &Changed;
- // The return instrs we come from.
- SmallSetVector<ReturnInst *, 4> RetInsts;
- };
-
- // Callback for a leaf value returned by the associated function.
- auto VisitValueCB = [](Value &Val, const Instruction *, RVState &RVS,
- bool) -> bool {
- auto Size = RVS.RetValsMap[&Val].size();
- RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end());
- bool Inserted = RVS.RetValsMap[&Val].size() != Size;
- RVS.Changed |= Inserted;
- LLVM_DEBUG({
- if (Inserted)
- dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val
- << " => " << RVS.RetInsts.size() << "\n";
- });
- return true;
- };
-
- // Helper method to invoke the generic value traversal.
- auto VisitReturnedValue = [&](Value &RV, RVState &RVS,
- const Instruction *CtxI) {
- IRPosition RetValPos = IRPosition::value(RV);
- return genericValueTraversal<AAReturnedValues, RVState>(
- A, RetValPos, *this, RVS, VisitValueCB, CtxI,
- /* UseValueSimplify */ false);
- };
-
- // Callback for all "return instructions" live in the associated function.
- auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) {
- ReturnInst &Ret = cast<ReturnInst>(I);
- RVState RVS({ReturnedValues, Changed, {}});
- RVS.RetInsts.insert(&Ret);
- return VisitReturnedValue(*Ret.getReturnValue(), RVS, &I);
- };
-
- // Start by discovering returned values from all live return instructions in
- // the associated function.
- if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}))
- return indicatePessimisticFixpoint();
-
- // Once returned values "directly" present in the code are handled we try to
- // resolve returned calls. To avoid modifications to the ReturnedValues map
- // while we iterate over it we keep a record of potential new entries in a copy
- // map, NewRVsMap.
- decltype(ReturnedValues) NewRVsMap;
-
+ indicatePessimisticFixpoint();
+ return;
+ }
+ assert(!F->getReturnType()->isVoidTy() &&
+ "Did not expect a void return type!");
+
+ // The map from instruction opcodes to those instructions in the function.
+ auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F);
+
+ // Look through all arguments, if one is marked as returned we are done.
+ for (Argument &Arg : F->args()) {
+ if (Arg.hasReturnedAttr()) {
+ auto &ReturnInstSet = ReturnedValues[&Arg];
+ if (auto *Insts = OpcodeInstMap.lookup(Instruction::Ret))
+ for (Instruction *RI : *Insts)
+ ReturnInstSet.insert(cast<ReturnInst>(RI));
+
+ indicateOptimisticFixpoint();
+ return;
+ }
+ }
+
+ if (!A.isFunctionIPOAmendable(*F))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override;
+
+ /// See AbstractAttribute::getState(...).
+ AbstractState &getState() override { return *this; }
+
+ /// See AbstractAttribute::getState(...).
+ const AbstractState &getState() const override { return *this; }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ llvm::iterator_range<iterator> returned_values() override {
+ return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
+ }
+
+ llvm::iterator_range<const_iterator> returned_values() const override {
+ return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
+ }
+
+ const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override {
+ return UnresolvedCalls;
+ }
+
+ /// Return the number of potential return values, -1 if unknown.
+ size_t getNumReturnValues() const override {
+ return isValidState() ? ReturnedValues.size() : -1;
+ }
+
+ /// Return an assumed unique return value if a single candidate is found. If
+ /// there cannot be one, return a nullptr. If it is not clear yet, return the
+ /// Optional::NoneType.
+ Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const;
+
+ /// See AbstractState::checkForAllReturnedValues(...).
+ bool checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
+ const override;
+
+ /// Pretty print the attribute similar to the IR representation.
+ const std::string getAsStr() const override;
+
+ /// See AbstractState::isAtFixpoint().
+ bool isAtFixpoint() const override { return IsFixed; }
+
+ /// See AbstractState::isValidState().
+ bool isValidState() const override { return IsValidState; }
+
+ /// See AbstractState::indicateOptimisticFixpoint(...).
+ ChangeStatus indicateOptimisticFixpoint() override {
+ IsFixed = true;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ IsFixed = true;
+ IsValidState = false;
+ return ChangeStatus::CHANGED;
+ }
+};
+
+ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ // Bookkeeping.
+ assert(isValidState());
+ STATS_DECLTRACK(KnownReturnValues, FunctionReturn,
+ "Number of functions with known return values");
+
+ // Check if we have an assumed unique return value that we could manifest.
+ Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A);
+
+ if (!UniqueRV.hasValue() || !UniqueRV.getValue())
+ return Changed;
+
+ // Bookkeeping.
+ STATS_DECLTRACK(UniqueReturnValue, FunctionReturn,
+ "Number of functions with unique return");
+
+ // Callback to replace the uses of CB with the constant C.
+ auto ReplaceCallSiteUsersWith = [&A](CallBase &CB, Constant &C) {
+ if (CB.use_empty())
+ return ChangeStatus::UNCHANGED;
+ if (A.changeValueAfterManifest(CB, C))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ };
+
+ // If the assumed unique return value is an argument, annotate it.
+ if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) {
+ if (UniqueRVArg->getType()->canLosslesslyBitCastTo(
+ getAssociatedFunction()->getReturnType())) {
+ getIRPosition() = IRPosition::argument(*UniqueRVArg);
+ Changed = IRAttribute::manifest(A);
+ }
+ } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) {
+ // We can replace the returned value with the unique returned constant.
+ Value &AnchorValue = getAnchorValue();
+ if (Function *F = dyn_cast<Function>(&AnchorValue)) {
+ for (const Use &U : F->uses())
+ if (CallBase *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U)) {
+ Constant *RVCCast =
+ CB->getType() == RVC->getType()
+ ? RVC
+ : ConstantExpr::getTruncOrBitCast(RVC, CB->getType());
+ Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed;
+ }
+ } else {
+ assert(isa<CallBase>(AnchorValue) &&
+ "Expected a function or call base anchor!");
+ Constant *RVCCast =
+ AnchorValue.getType() == RVC->getType()
+ ? RVC
+ : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType());
+ Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast);
+ }
+ if (Changed == ChangeStatus::CHANGED)
+ STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn,
+ "Number of function returns replaced by constant return");
+ }
+
+ return Changed;
+}
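+
+// For intuition, a hedged example of what this manifest step can achieve: for a
+// function that merely forwards its argument, e.g.
+//
+//   static int *forward(int *P) { return P; }   // hypothetical example
+//
+// the assumed unique return value is the argument P, so the argument position is
+// annotated with the `returned` attribute; if instead all returns yield a single
+// constant, uses of calls to the function are rewritten to that constant.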
+
+const std::string AAReturnedValuesImpl::getAsStr() const {
+ return (isAtFixpoint() ? "returns(#" : "may-return(#") +
+ (isValidState() ? std::to_string(getNumReturnValues()) : "?") +
+ ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]";
+}
+
+Optional<Value *>
+AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const {
+ // If checkForAllReturnedValues provides a unique value, ignoring potential
+ // undef values that can also be present, it is assumed to be the actual
+ // return value and forwarded to the caller of this method. If there are
+ // multiple, a nullptr is returned indicating there cannot be a unique
+ // returned value.
+ Optional<Value *> UniqueRV;
+
+ auto Pred = [&](Value &RV) -> bool {
+ // If we found a second returned value and neither the current nor the saved
+ // one is an undef, there is no unique returned value. Undefs are special
+ // since we can pretend they have any value.
+ if (UniqueRV.hasValue() && UniqueRV != &RV &&
+ !(isa<UndefValue>(RV) || isa<UndefValue>(UniqueRV.getValue()))) {
+ UniqueRV = nullptr;
+ return false;
+ }
+
+ // Do not overwrite a value with an undef.
+ if (!UniqueRV.hasValue() || !isa<UndefValue>(RV))
+ UniqueRV = &RV;
+
+ return true;
+ };
+
+ if (!A.checkForAllReturnedValues(Pred, *this))
+ UniqueRV = nullptr;
+
+ return UniqueRV;
+}
+
+bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
+ const {
+ if (!isValidState())
+ return false;
+
+ // Check all returned values but ignore call sites as long as we have not
+ // encountered an overdefined one during an update.
+ for (auto &It : ReturnedValues) {
+ Value *RV = It.first;
+
+ CallBase *CB = dyn_cast<CallBase>(RV);
+ if (CB && !UnresolvedCalls.count(CB))
+ continue;
+
+ if (!Pred(*RV, It.second))
+ return false;
+ }
+
+ return true;
+}
+
+ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {
+ size_t NumUnresolvedCalls = UnresolvedCalls.size();
+ bool Changed = false;
+
+ // State used in the value traversals starting in returned values.
+ struct RVState {
+ // The map in which we collect return values -> return instrs.
+ decltype(ReturnedValues) &RetValsMap;
+ // The flag to indicate a change.
+ bool &Changed;
+ // The return instrs we come from.
+ SmallSetVector<ReturnInst *, 4> RetInsts;
+ };
+
+ // Callback for a leaf value returned by the associated function.
+ auto VisitValueCB = [](Value &Val, const Instruction *, RVState &RVS,
+ bool) -> bool {
+ auto Size = RVS.RetValsMap[&Val].size();
+ RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end());
+ bool Inserted = RVS.RetValsMap[&Val].size() != Size;
+ RVS.Changed |= Inserted;
+ LLVM_DEBUG({
+ if (Inserted)
+ dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val
+ << " => " << RVS.RetInsts.size() << "\n";
+ });
+ return true;
+ };
+
+ // Helper method to invoke the generic value traversal.
+ auto VisitReturnedValue = [&](Value &RV, RVState &RVS,
+ const Instruction *CtxI) {
+ IRPosition RetValPos = IRPosition::value(RV);
+ return genericValueTraversal<AAReturnedValues, RVState>(
+ A, RetValPos, *this, RVS, VisitValueCB, CtxI,
+ /* UseValueSimplify */ false);
+ };
+
+ // Callback for all "return instructions" live in the associated function.
+ auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) {
+ ReturnInst &Ret = cast<ReturnInst>(I);
+ RVState RVS({ReturnedValues, Changed, {}});
+ RVS.RetInsts.insert(&Ret);
+ return VisitReturnedValue(*Ret.getReturnValue(), RVS, &I);
+ };
+
+ // Start by discovering returned values from all live return instructions in
+ // the associated function.
+ if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}))
+ return indicatePessimisticFixpoint();
+
+ // Once returned values "directly" present in the code are handled we try to
+ // resolve returned calls. To avoid modifications to the ReturnedValues map
+ // while we iterate over it we keep a record of potential new entries in a copy
+ // map, NewRVsMap.
+ decltype(ReturnedValues) NewRVsMap;
+
auto HandleReturnValue = [&](Value *RV,
SmallSetVector<ReturnInst *, 4> &RIs) {
LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *RV << " by #"
<< RIs.size() << " RIs\n");
- CallBase *CB = dyn_cast<CallBase>(RV);
- if (!CB || UnresolvedCalls.count(CB))
- return;
-
- if (!CB->getCalledFunction()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
- << "\n");
- UnresolvedCalls.insert(CB);
- return;
- }
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const auto &RetValAA = A.getAAFor<AAReturnedValues>(
- *this, IRPosition::function(*CB->getCalledFunction()));
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: "
- << RetValAA << "\n");
-
- // Skip dead ends, that is, if we do not know anything about the returned
- // call we mark it as unresolved and it will stay that way.
- if (!RetValAA.getState().isValidState()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
- << "\n");
- UnresolvedCalls.insert(CB);
- return;
- }
-
- // Do not try to learn partial information. If the callee has unresolved
- // return values we will treat the call as unresolved/opaque.
- auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls();
- if (!RetValAAUnresolvedCalls.empty()) {
- UnresolvedCalls.insert(CB);
- return;
- }
-
- // Now check if we can track transitively returned values. If possible, that
- // is, if all return values can be represented in the current scope, do so.
- bool Unresolved = false;
- for (auto &RetValAAIt : RetValAA.returned_values()) {
- Value *RetVal = RetValAAIt.first;
- if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) ||
- isa<Constant>(RetVal))
- continue;
- // Anything that did not fit in the above categories cannot be resolved,
- // mark the call as unresolved.
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value "
- "cannot be translated: "
- << *RetVal << "\n");
- UnresolvedCalls.insert(CB);
- Unresolved = true;
- break;
- }
-
- if (Unresolved)
- return;
-
- // Now track transitively returned values.
- unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB];
- if (NumRetAA == RetValAA.getNumReturnValues()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not "
- "changed since it was seen last\n");
- return;
- }
- NumRetAA = RetValAA.getNumReturnValues();
-
- for (auto &RetValAAIt : RetValAA.returned_values()) {
- Value *RetVal = RetValAAIt.first;
- if (Argument *Arg = dyn_cast<Argument>(RetVal)) {
- // Arguments are mapped to call site operands and we begin the traversal
- // again.
- bool Unused = false;
- RVState RVS({NewRVsMap, Unused, RetValAAIt.second});
- VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS, CB);
- continue;
+ CallBase *CB = dyn_cast<CallBase>(RV);
+ if (!CB || UnresolvedCalls.count(CB))
+ return;
+
+ if (!CB->getCalledFunction()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
+ << "\n");
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const auto &RetValAA = A.getAAFor<AAReturnedValues>(
+ *this, IRPosition::function(*CB->getCalledFunction()));
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: "
+ << RetValAA << "\n");
+
+ // Skip dead ends, that is, if we do not know anything about the returned
+ // call we mark it as unresolved and it will stay that way.
+ if (!RetValAA.getState().isValidState()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
+ << "\n");
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // Do not try to learn partial information. If the callee has unresolved
+ // return values we will treat the call as unresolved/opaque.
+ auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls();
+ if (!RetValAAUnresolvedCalls.empty()) {
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // Now check if we can track transitively returned values. If possible, that
+ // is, if all return values can be represented in the current scope, do so.
+ bool Unresolved = false;
+ for (auto &RetValAAIt : RetValAA.returned_values()) {
+ Value *RetVal = RetValAAIt.first;
+ if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) ||
+ isa<Constant>(RetVal))
+ continue;
+ // Anything that did not fit in the above categories cannot be resolved,
+ // mark the call as unresolved.
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value "
+ "cannot be translated: "
+ << *RetVal << "\n");
+ UnresolvedCalls.insert(CB);
+ Unresolved = true;
+ break;
+ }
+
+ if (Unresolved)
+ return;
+
+ // Now track transitively returned values.
+ unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB];
+ if (NumRetAA == RetValAA.getNumReturnValues()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not "
+ "changed since it was seen last\n");
+ return;
+ }
+ NumRetAA = RetValAA.getNumReturnValues();
+
+ for (auto &RetValAAIt : RetValAA.returned_values()) {
+ Value *RetVal = RetValAAIt.first;
+ if (Argument *Arg = dyn_cast<Argument>(RetVal)) {
+ // Arguments are mapped to call site operands and we begin the traversal
+ // again.
+ bool Unused = false;
+ RVState RVS({NewRVsMap, Unused, RetValAAIt.second});
+ VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS, CB);
+ continue;
}
if (isa<CallBase>(RetVal)) {
- // Call sites are resolved by the callee attribute over time, no need to
- // do anything for us.
- continue;
+ // Call sites are resolved by the callee attribute over time, no need to
+ // do anything for us.
+ continue;
}
if (isa<Constant>(RetVal)) {
- // Constants are valid everywhere, we can simply take them.
- NewRVsMap[RetVal].insert(RIs.begin(), RIs.end());
- continue;
- }
- }
- };
-
- for (auto &It : ReturnedValues)
- HandleReturnValue(It.first, It.second);
-
- // Because processing the new information can again lead to new return values
- // we have to be careful and iterate until this iteration is complete. The
- // idea is that we are in a stable state at the end of an update. All return
- // values have been handled and properly categorized. We might not update
- // again if we have not requested a non-fix attribute so we cannot "wait" for
- // the next update to analyze a new return value.
- while (!NewRVsMap.empty()) {
- auto It = std::move(NewRVsMap.back());
- NewRVsMap.pop_back();
-
- assert(!It.second.empty() && "Entry does not add anything.");
- auto &ReturnInsts = ReturnedValues[It.first];
- for (ReturnInst *RI : It.second)
- if (ReturnInsts.insert(RI)) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value "
- << *It.first << " => " << *RI << "\n");
- HandleReturnValue(It.first, ReturnInsts);
- Changed = true;
- }
- }
-
- Changed |= (NumUnresolvedCalls != UnresolvedCalls.size());
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
-}
-
-struct AAReturnedValuesFunction final : public AAReturnedValuesImpl {
- AAReturnedValuesFunction(const IRPosition &IRP, Attributor &A)
- : AAReturnedValuesImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) }
-};
-
-/// Returned values information for call sites.
-struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
- AAReturnedValuesCallSite(const IRPosition &IRP, Attributor &A)
- : AAReturnedValuesImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites instead of
- // redirecting requests to the callee.
- llvm_unreachable("Abstract attributes for returned values are not "
- "supported for call sites yet!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// ------------------------ NoSync Function Attribute -------------------------
-
-struct AANoSyncImpl : AANoSync {
- AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {}
-
- const std::string getAsStr() const override {
- return getAssumed() ? "nosync" : "may-sync";
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// Helper function used to determine whether an instruction is non-relaxed
- /// atomic. In other words, whether an atomic instruction has an ordering
- /// other than unordered or monotonic.
- static bool isNonRelaxedAtomic(Instruction *I);
-
- /// Helper function used to determine whether an instruction is volatile.
- static bool isVolatile(Instruction *I);
-
- /// Helper function used to check if an intrinsic is volatile (memcpy, memmove,
- /// memset).
- static bool isNoSyncIntrinsic(Instruction *I);
-};
-
-bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) {
- if (!I->isAtomic())
- return false;
-
- AtomicOrdering Ordering;
- switch (I->getOpcode()) {
- case Instruction::AtomicRMW:
- Ordering = cast<AtomicRMWInst>(I)->getOrdering();
- break;
- case Instruction::Store:
- Ordering = cast<StoreInst>(I)->getOrdering();
- break;
- case Instruction::Load:
- Ordering = cast<LoadInst>(I)->getOrdering();
- break;
- case Instruction::Fence: {
- auto *FI = cast<FenceInst>(I);
- if (FI->getSyncScopeID() == SyncScope::SingleThread)
- return false;
- Ordering = FI->getOrdering();
- break;
- }
- case Instruction::AtomicCmpXchg: {
- AtomicOrdering Success = cast<AtomicCmpXchgInst>(I)->getSuccessOrdering();
- AtomicOrdering Failure = cast<AtomicCmpXchgInst>(I)->getFailureOrdering();
- // Only if both orderings are relaxed can the operation be treated as
- // relaxed. Otherwise it is non-relaxed.
- if (Success != AtomicOrdering::Unordered &&
- Success != AtomicOrdering::Monotonic)
- return true;
- if (Failure != AtomicOrdering::Unordered &&
- Failure != AtomicOrdering::Monotonic)
- return true;
- return false;
- }
- default:
- llvm_unreachable(
- "New atomic operations need to be known in the attributor.");
- }
-
- // Relaxed.
- if (Ordering == AtomicOrdering::Unordered ||
- Ordering == AtomicOrdering::Monotonic)
- return false;
- return true;
-}
-
-/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics.
-/// FIXME: We should improve the handling of intrinsics.
-bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) {
- if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- /// Element wise atomic memory intrinsics can only be unordered,
- /// therefore nosync.
- case Intrinsic::memset_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memcpy_element_unordered_atomic:
- return true;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- if (!cast<MemIntrinsic>(II)->isVolatile())
- return true;
- return false;
- default:
- return false;
- }
- }
- return false;
-}
-
-bool AANoSyncImpl::isVolatile(Instruction *I) {
- assert(!isa<CallBase>(I) && "Calls should not be checked here");
-
- switch (I->getOpcode()) {
- case Instruction::AtomicRMW:
- return cast<AtomicRMWInst>(I)->isVolatile();
- case Instruction::Store:
- return cast<StoreInst>(I)->isVolatile();
- case Instruction::Load:
- return cast<LoadInst>(I)->isVolatile();
- case Instruction::AtomicCmpXchg:
- return cast<AtomicCmpXchgInst>(I)->isVolatile();
- default:
- return false;
- }
-}
-
-ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
-
- auto CheckRWInstForNoSync = [&](Instruction &I) {
- /// We are looking for volatile instructions or Non-Relaxed atomics.
- /// FIXME: We should improve the handling of intrinsics.
-
- if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I))
- return true;
-
- if (const auto *CB = dyn_cast<CallBase>(&I)) {
- if (CB->hasFnAttr(Attribute::NoSync))
- return true;
-
- const auto &NoSyncAA =
- A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(*CB));
- if (NoSyncAA.isAssumedNoSync())
- return true;
- return false;
- }
-
- if (!isVolatile(&I) && !isNonRelaxedAtomic(&I))
- return true;
-
- return false;
- };
-
- auto CheckForNoSync = [&](Instruction &I) {
- // At this point we handled all read/write effects and they are all
- // nosync, so they can be skipped.
- if (I.mayReadOrWriteMemory())
- return true;
-
- // non-convergent and readnone imply nosync.
- return !cast<CallBase>(I).isConvergent();
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) ||
- !A.checkForAllCallLikeInstructions(CheckForNoSync, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
-}
-
-struct AANoSyncFunction final : public AANoSyncImpl {
- AANoSyncFunction(const IRPosition &IRP, Attributor &A)
- : AANoSyncImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) }
-};
-
-/// NoSync attribute deduction for call sites.
-struct AANoSyncCallSite final : AANoSyncImpl {
- AANoSyncCallSite(const IRPosition &IRP, Attributor &A)
- : AANoSyncImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoSyncImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ // Constants are valid everywhere, we can simply take them.
+ NewRVsMap[RetVal].insert(RIs.begin(), RIs.end());
+ continue;
+ }
+ }
+ };
+
+ for (auto &It : ReturnedValues)
+ HandleReturnValue(It.first, It.second);
+
+ // Because processing the new information can again lead to new return values
+ // we have to be careful and iterate until this iteration is complete. The
+ // idea is that we are in a stable state at the end of an update. All return
+ // values have been handled and properly categorized. We might not update
+ // again if we have not requested a non-fix attribute so we cannot "wait" for
+ // the next update to analyze a new return value.
+ while (!NewRVsMap.empty()) {
+ auto It = std::move(NewRVsMap.back());
+ NewRVsMap.pop_back();
+
+ assert(!It.second.empty() && "Entry does not add anything.");
+ auto &ReturnInsts = ReturnedValues[It.first];
+ for (ReturnInst *RI : It.second)
+ if (ReturnInsts.insert(RI)) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value "
+ << *It.first << " => " << *RI << "\n");
+ HandleReturnValue(It.first, ReturnInsts);
+ Changed = true;
+ }
+ }
+
+ Changed |= (NumUnresolvedCalls != UnresolvedCalls.size());
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+}
+
+struct AAReturnedValuesFunction final : public AAReturnedValuesImpl {
+ AAReturnedValuesFunction(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) }
+};
+
+/// Returned values information for a call site.
+struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
+ AAReturnedValuesCallSite(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites instead of
+ // redirecting requests to the callee.
+ llvm_unreachable("Abstract attributes for returned values are not "
+ "supported for call sites yet!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// ------------------------ NoSync Function Attribute -------------------------
+
+struct AANoSyncImpl : AANoSync {
+ AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nosync" : "may-sync";
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// Helper function used to determine whether an instruction is a non-relaxed
+ /// atomic, i.e., an atomic instruction that does not have unordered or
+ /// monotonic ordering.
+ static bool isNonRelaxedAtomic(Instruction *I);
+
+ /// Helper function used to determine whether an instruction is volatile.
+ static bool isVolatile(Instruction *I);
+
+ /// Helper function used to check whether an intrinsic (currently only
+ /// memcpy, memmove, and memset, plus their element-wise atomic variants)
+ /// is nosync.
+ static bool isNoSyncIntrinsic(Instruction *I);
+};
+
+bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) {
+ if (!I->isAtomic())
+ return false;
+
+ AtomicOrdering Ordering;
+ switch (I->getOpcode()) {
+ case Instruction::AtomicRMW:
+ Ordering = cast<AtomicRMWInst>(I)->getOrdering();
+ break;
+ case Instruction::Store:
+ Ordering = cast<StoreInst>(I)->getOrdering();
+ break;
+ case Instruction::Load:
+ Ordering = cast<LoadInst>(I)->getOrdering();
+ break;
+ case Instruction::Fence: {
+ auto *FI = cast<FenceInst>(I);
+ if (FI->getSyncScopeID() == SyncScope::SingleThread)
+ return false;
+ Ordering = FI->getOrdering();
+ break;
+ }
+ case Instruction::AtomicCmpXchg: {
+ AtomicOrdering Success = cast<AtomicCmpXchgInst>(I)->getSuccessOrdering();
+ AtomicOrdering Failure = cast<AtomicCmpXchgInst>(I)->getFailureOrdering();
+ // Only if both orderings are relaxed can the operation be treated as
+ // relaxed; otherwise it is non-relaxed.
+ if (Success != AtomicOrdering::Unordered &&
+ Success != AtomicOrdering::Monotonic)
+ return true;
+ if (Failure != AtomicOrdering::Unordered &&
+ Failure != AtomicOrdering::Monotonic)
+ return true;
+ return false;
+ }
+ default:
+ llvm_unreachable(
+ "New atomic operations need to be known in the attributor.");
+ }
+
+ // Relaxed.
+ if (Ordering == AtomicOrdering::Unordered ||
+ Ordering == AtomicOrdering::Monotonic)
+ return false;
+ return true;
+}
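+// Illustrative examples for the helper above (informal, not exhaustive):
+//   %v = load atomic i32, i32* %p monotonic, align 4   ; relaxed          -> false
+//   %v = load atomic i32, i32* %p acquire, align 4     ; non-relaxed      -> true
+//   fence syncscope("singlethread") seq_cst            ; single-threaded  -> false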
+
+/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics.
+/// FIXME: We should improve the handling of intrinsics.
+bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ /// Element-wise atomic memory intrinsics can only be unordered and are
+ /// therefore nosync.
+ case Intrinsic::memset_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ return true;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ if (!cast<MemIntrinsic>(II)->isVolatile())
+ return true;
+ return false;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
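+// Illustrative examples for isNoSyncIntrinsic (informal):
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 8, i1 false)  -> true
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 8, i1 true)   -> false (volatile)
+//   any other intrinsic                                                    -> false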
+
+bool AANoSyncImpl::isVolatile(Instruction *I) {
+ assert(!isa<CallBase>(I) && "Calls should not be checked here");
+
+ switch (I->getOpcode()) {
+ case Instruction::AtomicRMW:
+ return cast<AtomicRMWInst>(I)->isVolatile();
+ case Instruction::Store:
+ return cast<StoreInst>(I)->isVolatile();
+ case Instruction::Load:
+ return cast<LoadInst>(I)->isVolatile();
+ case Instruction::AtomicCmpXchg:
+ return cast<AtomicCmpXchgInst>(I)->isVolatile();
+ default:
+ return false;
+ }
+}
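+// Illustrative examples for isVolatile (informal): "load volatile i32, i32* %p"
+// yields true, a plain "load i32, i32* %p" yields false; only loads, stores,
+// cmpxchg and atomicrmw are inspected, everything else defaults to false.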
+
+ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
+
+ auto CheckRWInstForNoSync = [&](Instruction &I) {
+ /// We are looking for volatile instructions or Non-Relaxed atomics.
+ /// FIXME: We should improve the handling of intrinsics.
+
+ if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I))
+ return true;
+
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->hasFnAttr(Attribute::NoSync))
+ return true;
+
+ const auto &NoSyncAA =
+ A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(*CB));
+ if (NoSyncAA.isAssumedNoSync())
+ return true;
+ return false;
+ }
+
+ if (!isVolatile(&I) && !isNonRelaxedAtomic(&I))
+ return true;
+
+ return false;
+ };
+
+ auto CheckForNoSync = [&](Instruction &I) {
+ // At this point we handled all read/write effects and they are all
+ // nosync, so they can be skipped.
+ if (I.mayReadOrWriteMemory())
+ return true;
+
+ // non-convergent and readnone imply nosync.
+ return !cast<CallBase>(I).isConvergent();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) ||
+ !A.checkForAllCallLikeInstructions(CheckForNoSync, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+}
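+// Minimal sketch of the deduction above (hypothetical IR, not taken from the
+// source):
+//   define void @f(i32* %p) {
+//     %v = load i32, i32* %p
+//     ret void
+//   }
+// All read/write effects are non-volatile and non-atomic and there are no
+// call-like instructions, so both checks succeed and the assumed "nosync"
+// state is kept.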
+
+struct AANoSyncFunction final : public AANoSyncImpl {
+ AANoSyncFunction(const IRPosition &IRP, Attributor &A)
+ : AANoSyncImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) }
+};
+
+/// NoSync attribute deduction for a call site.
+struct AANoSyncCallSite final : AANoSyncImpl {
+ AANoSyncCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoSyncImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoSyncImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); }
-};
-
-/// ------------------------ No-Free Attributes ----------------------------
-
-struct AANoFreeImpl : public AANoFree {
- AANoFreeImpl(const IRPosition &IRP, Attributor &A) : AANoFree(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForNoFree = [&](Instruction &I) {
- const auto &CB = cast<CallBase>(I);
- if (CB.hasFnAttr(Attribute::NoFree))
- return true;
-
- const auto &NoFreeAA =
- A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(CB));
- return NoFreeAA.isAssumedNoFree();
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "nofree" : "may-free";
- }
-};
-
-struct AANoFreeFunction final : public AANoFreeImpl {
- AANoFreeFunction(const IRPosition &IRP, Attributor &A)
- : AANoFreeImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) }
-};
-
-/// NoFree attribute deduction for a call site.
-struct AANoFreeCallSite final : AANoFreeImpl {
- AANoFreeCallSite(const IRPosition &IRP, Attributor &A)
- : AANoFreeImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoFreeImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); }
+};
+
+/// ------------------------ No-Free Attributes ----------------------------
+
+struct AANoFreeImpl : public AANoFree {
+ AANoFreeImpl(const IRPosition &IRP, Attributor &A) : AANoFree(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForNoFree = [&](Instruction &I) {
+ const auto &CB = cast<CallBase>(I);
+ if (CB.hasFnAttr(Attribute::NoFree))
+ return true;
+
+ const auto &NoFreeAA =
+ A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(CB));
+ return NoFreeAA.isAssumedNoFree();
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nofree" : "may-free";
+ }
+};
+
+struct AANoFreeFunction final : public AANoFreeImpl {
+ AANoFreeFunction(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) }
+};
+
+/// NoFree attribute deduction for a call site.
+struct AANoFreeCallSite final : AANoFreeImpl {
+ AANoFreeCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoFreeImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); }
-};
-
-/// NoFree attribute for floating values.
-struct AANoFreeFloating : AANoFreeImpl {
- AANoFreeFloating(const IRPosition &IRP, Attributor &A)
- : AANoFreeImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const IRPosition &IRP = getIRPosition();
-
- const auto &NoFreeAA =
- A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP));
- if (NoFreeAA.isAssumedNoFree())
- return ChangeStatus::UNCHANGED;
-
- Value &AssociatedValue = getIRPosition().getAssociatedValue();
- auto Pred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- if (CB->isBundleOperand(&U))
- return false;
- if (!CB->isArgOperand(&U))
- return true;
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoFreeArg = A.getAAFor<AANoFree>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
- return NoFreeArg.isAssumedNoFree();
- }
-
- if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
- isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
- Follow = true;
- return true;
- }
- if (isa<ReturnInst>(UserI))
- return true;
-
- // Unknown user.
- return false;
- };
- if (!A.checkForAllUses(Pred, *this, AssociatedValue))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-};
-
-/// NoFree attribute for a function argument.
-struct AANoFreeArgument final : AANoFreeFloating {
- AANoFreeArgument(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) }
-};
-
-/// NoFree attribute for call site arguments.
-struct AANoFreeCallSiteArgument final : AANoFreeFloating {
- AANoFreeCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); }
+};
+
+/// NoFree attribute for floating values.
+struct AANoFreeFloating : AANoFreeImpl {
+ AANoFreeFloating(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const IRPosition &IRP = getIRPosition();
+
+ const auto &NoFreeAA =
+ A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP));
+ if (NoFreeAA.isAssumedNoFree())
+ return ChangeStatus::UNCHANGED;
+
+ Value &AssociatedValue = getIRPosition().getAssociatedValue();
+ auto Pred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (CB->isBundleOperand(&U))
+ return false;
+ if (!CB->isArgOperand(&U))
+ return true;
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoFreeArg = A.getAAFor<AANoFree>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+ return NoFreeArg.isAssumedNoFree();
+ }
+
+ if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
+ isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+ Follow = true;
+ return true;
+ }
+ if (isa<ReturnInst>(UserI))
+ return true;
+
+ // Unknown user.
+ return false;
+ };
+ if (!A.checkForAllUses(Pred, *this, AssociatedValue))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
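+// Illustrative note on the use-walk above (informal): GEPs, bitcasts, PHIs and
+// selects are followed transitively, a call argument defers to that call site
+// argument's nofree information, and any other user of the pointer (e.g.
+// storing it to memory) counts as unknown and forces the pessimistic fixpoint.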
+
+/// NoFree attribute for a function argument.
+struct AANoFreeArgument final : AANoFreeFloating {
+ AANoFreeArgument(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) }
+};
+
+/// NoFree attribute for call site arguments.
+struct AANoFreeCallSiteArgument final : AANoFreeFloating {
+ AANoFreeCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)};
-};
-
-/// NoFree attribute for function return value.
-struct AANoFreeReturned final : AANoFreeFloating {
- AANoFreeReturned(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// NoFree attribute deduction for a call site return value.
-struct AANoFreeCallSiteReturned final : AANoFreeFloating {
- AANoFreeCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {}
-
- ChangeStatus manifest(Attributor &A) override {
- return ChangeStatus::UNCHANGED;
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) }
-};
-
-/// ------------------------ NonNull Argument Attribute ------------------------
-static int64_t getKnownNonNullAndDerefBytesForUse(
- Attributor &A, const AbstractAttribute &QueryingAA, Value &AssociatedValue,
- const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) {
- TrackUse = false;
-
- const Value *UseV = U->get();
- if (!UseV->getType()->isPointerTy())
- return 0;
-
- Type *PtrTy = UseV->getType();
- const Function *F = I->getFunction();
- bool NullPointerIsDefined =
- F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true;
- const DataLayout &DL = A.getInfoCache().getDL();
- if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (CB->isBundleOperand(U)) {
- if (RetainedKnowledge RK = getKnowledgeFromUse(
- U, {Attribute::NonNull, Attribute::Dereferenceable})) {
- IsNonNull |=
- (RK.AttrKind == Attribute::NonNull || !NullPointerIsDefined);
- return RK.ArgValue;
- }
- return 0;
- }
-
- if (CB->isCallee(U)) {
- IsNonNull |= !NullPointerIsDefined;
- return 0;
- }
-
- unsigned ArgNo = CB->getArgOperandNo(U);
- IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
- // As long as we only use known information there is no need to track
- // dependences here.
- auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP,
- /* TrackDependence */ false);
- IsNonNull |= DerefAA.isKnownNonNull();
- return DerefAA.getKnownDereferenceableBytes();
- }
-
- // We need to follow common pointer manipulation uses to the accesses they
- // feed into. We can try to be smart to avoid looking through things we do not
- // like for now, e.g., non-inbounds GEPs.
- if (isa<CastInst>(I)) {
- TrackUse = true;
- return 0;
- }
-
- if (isa<GetElementPtrInst>(I)) {
- TrackUse = true;
- return 0;
- }
-
- int64_t Offset;
- const Value *Base =
- getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL);
- if (Base) {
- if (Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
-
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
- }
-
- /// Corner case when an offset is 0.
- Base = getBasePointerOfAccessPointerOperand(I, Offset, DL,
- /*AllowNonInbounds*/ true);
- if (Base) {
- if (Offset == 0 && Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
- }
-
- return 0;
-}
-
-struct AANonNullImpl : AANonNull {
- AANonNullImpl(const IRPosition &IRP, Attributor &A)
- : AANonNull(IRP, A),
- NullIsDefined(NullPointerIsDefined(
- getAnchorScope(),
- getAssociatedValue().getType()->getPointerAddressSpace())) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Value &V = getAssociatedValue();
- if (!NullIsDefined &&
- hasAttr({Attribute::NonNull, Attribute::Dereferenceable},
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)};
+};
+
+/// NoFree attribute for function return value.
+struct AANoFreeReturned final : AANoFreeFloating {
+ AANoFreeReturned(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// NoFree attribute deduction for a call site return value.
+struct AANoFreeCallSiteReturned final : AANoFreeFloating {
+ AANoFreeCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ ChangeStatus manifest(Attributor &A) override {
+ return ChangeStatus::UNCHANGED;
+ }
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) }
+};
+
+/// ------------------------ NonNull Argument Attribute ------------------------
+static int64_t getKnownNonNullAndDerefBytesForUse(
+ Attributor &A, const AbstractAttribute &QueryingAA, Value &AssociatedValue,
+ const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) {
+ TrackUse = false;
+
+ const Value *UseV = U->get();
+ if (!UseV->getType()->isPointerTy())
+ return 0;
+
+ Type *PtrTy = UseV->getType();
+ const Function *F = I->getFunction();
+ bool NullPointerIsDefined =
+ F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true;
+ const DataLayout &DL = A.getInfoCache().getDL();
+ if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->isBundleOperand(U)) {
+ if (RetainedKnowledge RK = getKnowledgeFromUse(
+ U, {Attribute::NonNull, Attribute::Dereferenceable})) {
+ IsNonNull |=
+ (RK.AttrKind == Attribute::NonNull || !NullPointerIsDefined);
+ return RK.ArgValue;
+ }
+ return 0;
+ }
+
+ if (CB->isCallee(U)) {
+ IsNonNull |= !NullPointerIsDefined;
+ return 0;
+ }
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
+ // As long as we only use known information there is no need to track
+ // dependences here.
+ auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP,
+ /* TrackDependence */ false);
+ IsNonNull |= DerefAA.isKnownNonNull();
+ return DerefAA.getKnownDereferenceableBytes();
+ }
+
+ // We need to follow common pointer manipulation uses to the accesses they
+ // feed into. We can try to be smart to avoid looking through things we do not
+ // like for now, e.g., non-inbounds GEPs.
+ if (isa<CastInst>(I)) {
+ TrackUse = true;
+ return 0;
+ }
+
+ if (isa<GetElementPtrInst>(I)) {
+ TrackUse = true;
+ return 0;
+ }
+
+ int64_t Offset;
+ const Value *Base =
+ getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL);
+ if (Base) {
+ if (Base == &AssociatedValue &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ int64_t DerefBytes =
+ (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
+
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
+ }
+ }
+
+ /// Corner case when an offset is 0.
+ Base = getBasePointerOfAccessPointerOperand(I, Offset, DL,
+ /*AllowNonInbounds*/ true);
+ if (Base) {
+ if (Offset == 0 && Base == &AssociatedValue &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ int64_t DerefBytes =
+ (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
+ }
+ }
+
+ return 0;
+}
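+// Illustrative sketch for the helper above (hypothetical use, assumes i32 is
+// 4 bytes in the data layout): for "store i32 0, i32* %arg" in an address
+// space where null is not defined, the access implies %arg is nonnull and
+// dereferenceable for at least 4 bytes; casts and GEPs merely set TrackUse so
+// the walk continues to the eventual access.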
+
+struct AANonNullImpl : AANonNull {
+ AANonNullImpl(const IRPosition &IRP, Attributor &A)
+ : AANonNull(IRP, A),
+ NullIsDefined(NullPointerIsDefined(
+ getAnchorScope(),
+ getAssociatedValue().getType()->getPointerAddressSpace())) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Value &V = getAssociatedValue();
+ if (!NullIsDefined &&
+ hasAttr({Attribute::NonNull, Attribute::Dereferenceable},
/* IgnoreSubsumingPositions */ false, &A)) {
- indicateOptimisticFixpoint();
+ indicateOptimisticFixpoint();
return;
}
if (isa<ConstantPointerNull>(V)) {
- indicatePessimisticFixpoint();
+ indicatePessimisticFixpoint();
return;
}
-
+
AANonNull::initialize(A);
- bool CanBeNull = true;
+ bool CanBeNull = true;
if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull)) {
if (!CanBeNull) {
- indicateOptimisticFixpoint();
+ indicateOptimisticFixpoint();
return;
}
}
-
+
if (isa<GlobalValue>(&getAssociatedValue())) {
indicatePessimisticFixpoint();
return;
@@ -1713,293 +1713,293 @@ struct AANonNullImpl : AANonNull {
if (Instruction *CtxI = getCtxI())
followUsesInMBEC(*this, A, getState(), *CtxI);
- }
-
- /// See followUsesInMBEC
- bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
- AANonNull::StateType &State) {
- bool IsNonNull = false;
- bool TrackUse = false;
- getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I,
- IsNonNull, TrackUse);
- State.setKnown(IsNonNull);
- return TrackUse;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "nonnull" : "may-null";
- }
-
- /// Flag to determine if the underlying value can be null and still allow
- /// valid accesses.
- const bool NullIsDefined;
-};
-
-/// NonNull attribute for a floating value.
-struct AANonNullFloating : public AANonNullImpl {
- AANonNullFloating(const IRPosition &IRP, Attributor &A)
- : AANonNullImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const DataLayout &DL = A.getDataLayout();
-
- DominatorTree *DT = nullptr;
- AssumptionCache *AC = nullptr;
- InformationCache &InfoCache = A.getInfoCache();
- if (const Function *Fn = getAnchorScope()) {
- DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn);
- AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*Fn);
- }
-
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
- AANonNull::StateType &T, bool Stripped) -> bool {
- const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
- if (!isKnownNonZero(&V, DL, 0, AC, CtxI, DT))
- T.indicatePessimisticFixpoint();
- } else {
- // Use abstract attribute information.
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AANonNull::StateType &State) {
+ bool IsNonNull = false;
+ bool TrackUse = false;
+ getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I,
+ IsNonNull, TrackUse);
+ State.setKnown(IsNonNull);
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nonnull" : "may-null";
+ }
+
+ /// Flag to determine if the underlying value can be null and still allow
+ /// valid accesses.
+ const bool NullIsDefined;
+};
+
+/// NonNull attribute for a floating value.
+struct AANonNullFloating : public AANonNullImpl {
+ AANonNullFloating(const IRPosition &IRP, Attributor &A)
+ : AANonNullImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ DominatorTree *DT = nullptr;
+ AssumptionCache *AC = nullptr;
+ InformationCache &InfoCache = A.getInfoCache();
+ if (const Function *Fn = getAnchorScope()) {
+ DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn);
+ AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*Fn);
+ }
+
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
+ AANonNull::StateType &T, bool Stripped) -> bool {
+ const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
+ if (!isKnownNonZero(&V, DL, 0, AC, CtxI, DT))
+ T.indicatePessimisticFixpoint();
+ } else {
+ // Use abstract attribute information.
const AANonNull::StateType &NS = AA.getState();
- T ^= NS;
- }
- return T.isValidState();
- };
-
- StateType T;
- if (!genericValueTraversal<AANonNull, StateType>(
- A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
-};
-
-/// NonNull attribute for function return value.
-struct AANonNullReturned final
+ T ^= NS;
+ }
+ return T.isValidState();
+ };
+
+ StateType T;
+ if (!genericValueTraversal<AANonNull, StateType>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
+};
+
+/// NonNull attribute for function return value.
+struct AANonNullReturned final
: AAReturnedFromReturnedValues<AANonNull, AANonNull> {
- AANonNullReturned(const IRPosition &IRP, Attributor &A)
+ AANonNullReturned(const IRPosition &IRP, Attributor &A)
: AAReturnedFromReturnedValues<AANonNull, AANonNull>(IRP, A) {}
-
+
/// See AbstractAttribute::getAsStr().
const std::string getAsStr() const override {
return getAssumed() ? "nonnull" : "may-null";
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
-};
-
-/// NonNull attribute for function argument.
-struct AANonNullArgument final
- : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl> {
- AANonNullArgument(const IRPosition &IRP, Attributor &A)
- : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl>(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) }
-};
-
-struct AANonNullCallSiteArgument final : AANonNullFloating {
- AANonNullCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANonNullFloating(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) }
-};
-
-/// NonNull attribute for a call site return position.
-struct AANonNullCallSiteReturned final
- : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl> {
- AANonNullCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl>(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) }
-};
-
-/// ------------------------ No-Recurse Attributes ----------------------------
-
-struct AANoRecurseImpl : public AANoRecurse {
- AANoRecurseImpl(const IRPosition &IRP, Attributor &A) : AANoRecurse(IRP, A) {}
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "norecurse" : "may-recurse";
- }
-};
-
-struct AANoRecurseFunction final : AANoRecurseImpl {
- AANoRecurseFunction(const IRPosition &IRP, Attributor &A)
- : AANoRecurseImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoRecurseImpl::initialize(A);
- if (const Function *F = getAnchorScope())
- if (A.getInfoCache().getSccSize(*F) != 1)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
-
- // If all live call sites are known to be no-recurse, we are as well.
- auto CallSitePred = [&](AbstractCallSite ACS) {
- const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
- *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
- /* TrackDependence */ false, DepClassTy::OPTIONAL);
- return NoRecurseAA.isKnownNoRecurse();
- };
- bool AllCallSitesKnown;
- if (A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) {
- // If we know all call sites and all are known no-recurse, we are done.
- // If all known call sites, which might not be all that exist, are known
- // to be no-recurse, we are not done but we can continue to assume
- // no-recurse. If one of the call sites we have not visited will become
- // live, another update is triggered.
- if (AllCallSitesKnown)
- indicateOptimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- // If the above check does not hold anymore we look at the calls.
- auto CheckForNoRecurse = [&](Instruction &I) {
- const auto &CB = cast<CallBase>(I);
- if (CB.hasFnAttr(Attribute::NoRecurse))
- return true;
-
- const auto &NoRecurseAA =
- A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(CB));
- if (!NoRecurseAA.isAssumedNoRecurse())
- return false;
-
- // Recursion to the same function
- if (CB.getCalledFunction() == getAnchorScope())
- return false;
-
- return true;
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) }
-};
-
-/// NoRecurse attribute deduction for a call site.
-struct AANoRecurseCallSite final : AANoRecurseImpl {
- AANoRecurseCallSite(const IRPosition &IRP, Attributor &A)
- : AANoRecurseImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoRecurseImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
+};
+
+/// NonNull attribute for function argument.
+struct AANonNullArgument final
+ : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl> {
+ AANonNullArgument(const IRPosition &IRP, Attributor &A)
+ : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) }
+};
+
+struct AANonNullCallSiteArgument final : AANonNullFloating {
+ AANonNullCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANonNullFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) }
+};
+
+/// NonNull attribute for a call site return position.
+struct AANonNullCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl> {
+ AANonNullCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) }
+};
+
+/// ------------------------ No-Recurse Attributes ----------------------------
+
+struct AANoRecurseImpl : public AANoRecurse {
+ AANoRecurseImpl(const IRPosition &IRP, Attributor &A) : AANoRecurse(IRP, A) {}
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "norecurse" : "may-recurse";
+ }
+};
+
+struct AANoRecurseFunction final : AANoRecurseImpl {
+ AANoRecurseFunction(const IRPosition &IRP, Attributor &A)
+ : AANoRecurseImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoRecurseImpl::initialize(A);
+ if (const Function *F = getAnchorScope())
+ if (A.getInfoCache().getSccSize(*F) != 1)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+
+ // If all live call sites are known to be no-recurse, we are as well.
+ auto CallSitePred = [&](AbstractCallSite ACS) {
+ const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
+ *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
+ /* TrackDependence */ false, DepClassTy::OPTIONAL);
+ return NoRecurseAA.isKnownNoRecurse();
+ };
+ bool AllCallSitesKnown;
+ if (A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) {
+ // If we know all call sites and all are known no-recurse, we are done.
+ // If all known call sites, which might not be all that exist, are known
+ // to be no-recurse, we are not done but we can continue to assume
+ // no-recurse. If one of the call sites we have not visited will become
+ // live, another update is triggered.
+ if (AllCallSitesKnown)
+ indicateOptimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // If the above check does not hold anymore we look at the calls.
+ auto CheckForNoRecurse = [&](Instruction &I) {
+ const auto &CB = cast<CallBase>(I);
+ if (CB.hasFnAttr(Attribute::NoRecurse))
+ return true;
+
+ const auto &NoRecurseAA =
+ A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(CB));
+ if (!NoRecurseAA.isAssumedNoRecurse())
+ return false;
+
+ // Recursion to the same function
+ if (CB.getCalledFunction() == getAnchorScope())
+ return false;
+
+ return true;
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) }
+};
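+// Illustrative note on the deduction above (informal): if every known call
+// site of the function is known to be norecurse, the attribute is fixed
+// optimistically; otherwise each call inside the function must be assumed
+// norecurse, and any direct self-call immediately blocks the deduction.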
+
+/// NoRecurse attribute deduction for a call site.
+struct AANoRecurseCallSite final : AANoRecurseImpl {
+ AANoRecurseCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoRecurseImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoRecurseImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); }
-};
-
-/// -------------------- Undefined-Behavior Attributes ------------------------
-
-struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
- AAUndefinedBehaviorImpl(const IRPosition &IRP, Attributor &A)
- : AAUndefinedBehavior(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- // through a pointer (i.e. also branches etc.)
- ChangeStatus updateImpl(Attributor &A) override {
- const size_t UBPrevSize = KnownUBInsts.size();
- const size_t NoUBPrevSize = AssumedNoUBInsts.size();
-
- auto InspectMemAccessInstForUB = [&](Instruction &I) {
- // Skip instructions that are already saved.
- if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
- return true;
-
- // If we reach here, we know we have an instruction
- // that accesses memory through a pointer operand,
- // for which getPointerOperand() should give it to us.
- const Value *PtrOp = getPointerOperand(&I, /* AllowVolatile */ true);
- assert(PtrOp &&
- "Expected pointer operand of memory accessing instruction");
-
- // Either we stopped and the appropriate action was taken,
- // or we got back a simplified value to continue.
- Optional<Value *> SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I);
- if (!SimplifiedPtrOp.hasValue())
- return true;
- const Value *PtrOpVal = SimplifiedPtrOp.getValue();
-
- // A memory access through a pointer is considered UB
- // only if the pointer has constant null value.
- // TODO: Expand it to not only check constant values.
- if (!isa<ConstantPointerNull>(PtrOpVal)) {
- AssumedNoUBInsts.insert(&I);
- return true;
- }
- const Type *PtrTy = PtrOpVal->getType();
-
- // Because we only consider instructions inside functions,
- // assume that a parent function exists.
- const Function *F = I.getFunction();
-
- // A memory access using constant null pointer is only considered UB
- // if null pointer is _not_ defined for the target platform.
- if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()))
- AssumedNoUBInsts.insert(&I);
- else
- KnownUBInsts.insert(&I);
- return true;
- };
-
- auto InspectBrInstForUB = [&](Instruction &I) {
- // A conditional branch instruction is considered UB if it has `undef`
- // condition.
-
- // Skip instructions that are already saved.
- if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
- return true;
-
- // We know we have a branch instruction.
- auto BrInst = cast<BranchInst>(&I);
-
- // Unconditional branches are never considered UB.
- if (BrInst->isUnconditional())
- return true;
-
- // Either we stopped and the appropriate action was taken,
- // or we got back a simplified value to continue.
- Optional<Value *> SimplifiedCond =
- stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst);
- if (!SimplifiedCond.hasValue())
- return true;
- AssumedNoUBInsts.insert(&I);
- return true;
- };
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); }
+};
+
+/// -------------------- Undefined-Behavior Attributes ------------------------
+
+struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
+ AAUndefinedBehaviorImpl(const IRPosition &IRP, Attributor &A)
+ : AAUndefinedBehavior(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ // through a pointer (i.e. also branches etc.)
+ ChangeStatus updateImpl(Attributor &A) override {
+ const size_t UBPrevSize = KnownUBInsts.size();
+ const size_t NoUBPrevSize = AssumedNoUBInsts.size();
+
+ auto InspectMemAccessInstForUB = [&](Instruction &I) {
+ // Skip instructions that are already saved.
+ if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
+ return true;
+
+ // If we reach here, we know we have an instruction
+ // that accesses memory through a pointer operand,
+ // for which getPointerOperand() should give it to us.
+ const Value *PtrOp = getPointerOperand(&I, /* AllowVolatile */ true);
+ assert(PtrOp &&
+ "Expected pointer operand of memory accessing instruction");
+
+ // Either we stopped and the appropriate action was taken,
+ // or we got back a simplified value to continue.
+ Optional<Value *> SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I);
+ if (!SimplifiedPtrOp.hasValue())
+ return true;
+ const Value *PtrOpVal = SimplifiedPtrOp.getValue();
+
+ // A memory access through a pointer is considered UB
+ // only if the pointer has constant null value.
+ // TODO: Expand it to not only check constant values.
+ if (!isa<ConstantPointerNull>(PtrOpVal)) {
+ AssumedNoUBInsts.insert(&I);
+ return true;
+ }
+ const Type *PtrTy = PtrOpVal->getType();
+
+ // Because we only consider instructions inside functions,
+ // assume that a parent function exists.
+ const Function *F = I.getFunction();
+
+ // A memory access using constant null pointer is only considered UB
+ // if null pointer is _not_ defined for the target platform.
+ if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()))
+ AssumedNoUBInsts.insert(&I);
+ else
+ KnownUBInsts.insert(&I);
+ return true;
+ };
+
+ auto InspectBrInstForUB = [&](Instruction &I) {
+ // A conditional branch instruction is considered UB if it has `undef`
+ // condition.
+
+ // Skip instructions that are already saved.
+ if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
+ return true;
+
+ // We know we have a branch instruction.
+ auto BrInst = cast<BranchInst>(&I);
+
+ // Unconditional branches are never considered UB.
+ if (BrInst->isUnconditional())
+ return true;
+
+ // Either we stopped and the appropriate action was taken,
+ // or we got back a simplified value to continue.
+ Optional<Value *> SimplifiedCond =
+ stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst);
+ if (!SimplifiedCond.hasValue())
+ return true;
+ AssumedNoUBInsts.insert(&I);
+ return true;
+ };
+
auto InspectCallSiteForUB = [&](Instruction &I) {
// Check whether a callsite always cause UB or not
@@ -2092,13 +2092,13 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
return true;
};
- A.checkForAllInstructions(InspectMemAccessInstForUB, *this,
- {Instruction::Load, Instruction::Store,
- Instruction::AtomicCmpXchg,
- Instruction::AtomicRMW},
- /* CheckBBLivenessOnly */ true);
- A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br},
- /* CheckBBLivenessOnly */ true);
+ A.checkForAllInstructions(InspectMemAccessInstForUB, *this,
+ {Instruction::Load, Instruction::Store,
+ Instruction::AtomicCmpXchg,
+ Instruction::AtomicRMW},
+ /* CheckBBLivenessOnly */ true);
+ A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br},
+ /* CheckBBLivenessOnly */ true);
A.checkForAllCallLikeInstructions(InspectCallSiteForUB, *this);
  // If the returned position of the anchor scope has noundef attribute, check
@@ -2115,575 +2115,575 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
}
}
- if (NoUBPrevSize != AssumedNoUBInsts.size() ||
- UBPrevSize != KnownUBInsts.size())
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- bool isKnownToCauseUB(Instruction *I) const override {
- return KnownUBInsts.count(I);
- }
-
- bool isAssumedToCauseUB(Instruction *I) const override {
- // In simple words, if an instruction is not in the set of instructions
- // assumed to _not_ cause UB, then it is assumed to cause UB (that includes
- // those in the KnownUBInsts set). The rest of the boilerplate is there to
- // ensure that it is one of the instructions we test for UB.
-
- switch (I->getOpcode()) {
- case Instruction::Load:
- case Instruction::Store:
- case Instruction::AtomicCmpXchg:
- case Instruction::AtomicRMW:
- return !AssumedNoUBInsts.count(I);
- case Instruction::Br: {
- auto BrInst = cast<BranchInst>(I);
- if (BrInst->isUnconditional())
- return false;
- return !AssumedNoUBInsts.count(I);
- } break;
- default:
- return false;
- }
- return false;
- }
-
- ChangeStatus manifest(Attributor &A) override {
- if (KnownUBInsts.empty())
- return ChangeStatus::UNCHANGED;
- for (Instruction *I : KnownUBInsts)
- A.changeToUnreachableAfterManifest(I);
- return ChangeStatus::CHANGED;
- }
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "undefined-behavior" : "no-ub";
- }
-
- /// Note: The correctness of this analysis depends on the fact that the
- /// following 2 sets will stop changing after some point.
- /// "Change" here means that their size changes.
- /// The size of each set is monotonically increasing
- /// (we only add items to them) and it is upper bounded by the number of
- /// instructions in the processed function (we can never save more
- /// elements in either set than this number). Hence, at some point,
- /// they will stop increasing.
- /// Consequently, at some point, both sets will have stopped
- /// changing, effectively making the analysis reach a fixpoint.
-
- /// Note: These 2 sets are disjoint and an instruction can be considered
- /// one of 3 things:
- /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in
- /// the KnownUBInsts set.
- /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior
- /// has a reason to assume it).
- /// 3) Assumed to not cause UB. Every other instruction - AAUndefinedBehavior
- /// could not find a reason to assume or prove that it can cause UB,
- /// hence it assumes it doesn't. We have a set for these instructions
- /// so that we don't reprocess them in every update.
- /// Note however that instructions in this set may cause UB.
-
-protected:
- /// A set of all live instructions _known_ to cause UB.
- SmallPtrSet<Instruction *, 8> KnownUBInsts;
-
-private:
- /// A set of all the (live) instructions that are assumed to _not_ cause UB.
- SmallPtrSet<Instruction *, 8> AssumedNoUBInsts;
-
- // Should be called on updates in which if we're processing an instruction
- // \p I that depends on a value \p V, one of the following has to happen:
- // - If the value is assumed, then stop.
- // - If the value is known but undef, then consider it UB.
- // - Otherwise, do specific processing with the simplified value.
- // We return None in the first 2 cases to signify that an appropriate
- // action was taken and the caller should stop.
- // Otherwise, we return the simplified value that the caller should
- // use for specific processing.
- Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V,
- Instruction *I) {
- const auto &ValueSimplifyAA =
- A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V));
- Optional<Value *> SimplifiedV =
- ValueSimplifyAA.getAssumedSimplifiedValue(A);
- if (!ValueSimplifyAA.isKnown()) {
- // Don't depend on assumed values.
- return llvm::None;
- }
- if (!SimplifiedV.hasValue()) {
- // If it is known (which we tested above) but it doesn't have a value,
- // then we can assume `undef` and hence the instruction is UB.
- KnownUBInsts.insert(I);
- return llvm::None;
- }
- Value *Val = SimplifiedV.getValue();
- if (isa<UndefValue>(Val)) {
- KnownUBInsts.insert(I);
- return llvm::None;
- }
- return Val;
- }
-};
-
-struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl {
- AAUndefinedBehaviorFunction(const IRPosition &IRP, Attributor &A)
- : AAUndefinedBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECL(UndefinedBehaviorInstruction, Instruction,
- "Number of instructions known to have UB");
- BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) +=
- KnownUBInsts.size();
- }
-};
-
-/// ------------------------ Will-Return Attributes ----------------------------
-
-// Helper function that checks whether a function has any cycle for which we
-// do not know whether it is bounded.
-// Loops with a known maximum trip count are considered bounded; any other
-// cycle is not.
-static bool mayContainUnboundedCycle(Function &F, Attributor &A) {
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(F);
- LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(F);
- // If either SCEV or LoopInfo is not available for the function then we
- // assume any cycle to be an unbounded cycle.
- // We use scc_iterator, which uses Tarjan's algorithm, to find all the
- // maximal SCCs. To detect if there's a cycle, we only need to find the
- // maximal ones.
- if (!SE || !LI) {
- for (scc_iterator<Function *> SCCI = scc_begin(&F); !SCCI.isAtEnd(); ++SCCI)
- if (SCCI.hasCycle())
- return true;
- return false;
- }
-
- // If there's irreducible control, the function may contain non-loop cycles.
- if (mayContainIrreducibleControl(F, LI))
- return true;
-
- // Any loop that does not have a max trip count is considered an unbounded
- // cycle.
- for (auto *L : LI->getLoopsInPreorder()) {
- if (!SE->getSmallConstantMaxTripCount(L))
- return true;
- }
- return false;
-}
-
-struct AAWillReturnImpl : public AAWillReturn {
- AAWillReturnImpl(const IRPosition &IRP, Attributor &A)
- : AAWillReturn(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAWillReturn::initialize(A);
-
- Function *F = getAnchorScope();
+ if (NoUBPrevSize != AssumedNoUBInsts.size() ||
+ UBPrevSize != KnownUBInsts.size())
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ bool isKnownToCauseUB(Instruction *I) const override {
+ return KnownUBInsts.count(I);
+ }
+
+ bool isAssumedToCauseUB(Instruction *I) const override {
+ // In simple words, if an instruction is not in the set of instructions
+ // assumed to _not_ cause UB, then it is assumed to cause UB (that includes
+ // those in the KnownUBInsts set). The rest of the boilerplate is there to
+ // ensure that it is one of the instructions we test for UB.
+
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ return !AssumedNoUBInsts.count(I);
+ case Instruction::Br: {
+ auto BrInst = cast<BranchInst>(I);
+ if (BrInst->isUnconditional())
+ return false;
+ return !AssumedNoUBInsts.count(I);
+ } break;
+ default:
+ return false;
+ }
+ return false;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ if (KnownUBInsts.empty())
+ return ChangeStatus::UNCHANGED;
+ for (Instruction *I : KnownUBInsts)
+ A.changeToUnreachableAfterManifest(I);
+ return ChangeStatus::CHANGED;
+ }
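+ // Illustrative note (informal): a load or store through a pointer that
+ // simplifies to a constant null in an address space where null is not
+ // defined, or a conditional branch whose condition is known to be undef,
+ // ends up in KnownUBInsts and is turned into unreachable by manifest().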
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "undefined-behavior" : "no-ub";
+ }
+
+ /// Note: The correctness of this analysis depends on the fact that the
+ /// following 2 sets will stop changing after some point.
+ /// "Change" here means that their size changes.
+ /// The size of each set is monotonically increasing
+ /// (we only add items to them) and it is upper bounded by the number of
+ /// instructions in the processed function (we can never save more
+ /// elements in either set than this number). Hence, at some point,
+ /// they will stop increasing.
+ /// Consequently, at some point, both sets will have stopped
+ /// changing, effectively making the analysis reach a fixpoint.
+
+ /// Note: These 2 sets are disjoint and an instruction can be considered
+ /// one of 3 things:
+ /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in
+ /// the KnownUBInsts set.
+ /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior
+ /// has a reason to assume it).
+ /// 3) Assumed to not cause UB. Every other instruction - AAUndefinedBehavior
+ /// could not find a reason to assume or prove that it can cause UB,
+ /// hence it assumes it doesn't. We have a set for these instructions
+ /// so that we don't reprocess them in every update.
+ /// Note however that instructions in this set may cause UB.
+
+protected:
+ /// A set of all live instructions _known_ to cause UB.
+ SmallPtrSet<Instruction *, 8> KnownUBInsts;
+
+private:
+ /// A set of all the (live) instructions that are assumed to _not_ cause UB.
+ SmallPtrSet<Instruction *, 8> AssumedNoUBInsts;
+
+  // Should be called on updates in which, if we're processing an instruction
+  // \p I that depends on a value \p V, one of the following has to happen:
+ // - If the value is assumed, then stop.
+ // - If the value is known but undef, then consider it UB.
+ // - Otherwise, do specific processing with the simplified value.
+ // We return None in the first 2 cases to signify that an appropriate
+ // action was taken and the caller should stop.
+ // Otherwise, we return the simplified value that the caller should
+ // use for specific processing.
+ Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V,
+ Instruction *I) {
+ const auto &ValueSimplifyAA =
+ A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V));
+ Optional<Value *> SimplifiedV =
+ ValueSimplifyAA.getAssumedSimplifiedValue(A);
+ if (!ValueSimplifyAA.isKnown()) {
+ // Don't depend on assumed values.
+ return llvm::None;
+ }
+ if (!SimplifiedV.hasValue()) {
+ // If it is known (which we tested above) but it doesn't have a value,
+ // then we can assume `undef` and hence the instruction is UB.
+ KnownUBInsts.insert(I);
+ return llvm::None;
+ }
+ Value *Val = SimplifiedV.getValue();
+ if (isa<UndefValue>(Val)) {
+ KnownUBInsts.insert(I);
+ return llvm::None;
+ }
+ return Val;
+ }
+};
+
+struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl {
+ AAUndefinedBehaviorFunction(const IRPosition &IRP, Attributor &A)
+ : AAUndefinedBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECL(UndefinedBehaviorInstruction, Instruction,
+ "Number of instructions known to have UB");
+ BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) +=
+ KnownUBInsts.size();
+ }
+};
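+
+// Usage sketch (illustrative only, not part of this pass's logic): assuming an
+// Attributor `A`, a querying abstract attribute `QueryingAA`, and a function
+// `F`, a client could ask the deduced attribute which instructions are known
+// to trigger UB, e.g.:
+//
+//   const auto &UBAA =
+//       A.getAAFor<AAUndefinedBehavior>(QueryingAA, IRPosition::function(F));
+//   for (BasicBlock &BB : F)
+//     for (Instruction &I : BB)
+//       if (UBAA.isKnownToCauseUB(&I))
+//         ; // manifest() will replace such instructions with `unreachable`.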
+
+/// ------------------------ Will-Return Attributes ----------------------------
+
+// Helper function that checks whether a function has any cycle that is not
+// known to be bounded.
+// Loops with a computable maximum trip count are considered bounded; any other
+// cycle is not.
+static bool mayContainUnboundedCycle(Function &F, Attributor &A) {
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(F);
+ LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(F);
+  // If either SCEV or LoopInfo is not available for the function, we
+  // conservatively assume every cycle to be unbounded.
+  // We use scc_iterator, which uses Tarjan's algorithm to find all the maximal
+  // SCCs. To detect if there's a cycle, we only need to find the maximal ones.
+ if (!SE || !LI) {
+ for (scc_iterator<Function *> SCCI = scc_begin(&F); !SCCI.isAtEnd(); ++SCCI)
+ if (SCCI.hasCycle())
+ return true;
+ return false;
+ }
+
+ // If there's irreducible control, the function may contain non-loop cycles.
+ if (mayContainIrreducibleControl(F, LI))
+ return true;
+
+  // Any loop that does not have a max trip count is considered an unbounded cycle.
+ for (auto *L : LI->getLoopsInPreorder()) {
+ if (!SE->getSmallConstantMaxTripCount(L))
+ return true;
+ }
+ return false;
+}
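+
+// For intuition, an illustrative source-level example (with
+// `external_predicate` a hypothetical opaque function): a loop whose maximum
+// trip count ScalarEvolution can compute is treated as bounded, while a loop
+// without such a bound makes the enclosing function ineligible for
+// `willreturn` below.
+//
+//   int bounded() {                     // max trip count of 16 is computable
+//     int S = 0;
+//     for (int I = 0; I != 16; ++I)
+//       S += I;
+//     return S;
+//   }
+//   int unbounded() {                   // no computable max trip count
+//     while (external_predicate())
+//       ;
+//     return 0;
+//   }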
+
+struct AAWillReturnImpl : public AAWillReturn {
+ AAWillReturnImpl(const IRPosition &IRP, Attributor &A)
+ : AAWillReturn(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAWillReturn::initialize(A);
+
+ Function *F = getAnchorScope();
if (!F || F->isDeclaration() || mayContainUnboundedCycle(*F, A))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForWillReturn = [&](Instruction &I) {
- IRPosition IPos = IRPosition::callsite_function(cast<CallBase>(I));
- const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos);
- if (WillReturnAA.isKnownWillReturn())
- return true;
- if (!WillReturnAA.isAssumedWillReturn())
- return false;
- const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos);
- return NoRecurseAA.isAssumedNoRecurse();
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "willreturn" : "may-noreturn";
- }
-};
-
-struct AAWillReturnFunction final : AAWillReturnImpl {
- AAWillReturnFunction(const IRPosition &IRP, Attributor &A)
- : AAWillReturnImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) }
-};
-
-/// WillReturn attribute deduction for a call site.
-struct AAWillReturnCallSite final : AAWillReturnImpl {
- AAWillReturnCallSite(const IRPosition &IRP, Attributor &A)
- : AAWillReturnImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForWillReturn = [&](Instruction &I) {
+ IRPosition IPos = IRPosition::callsite_function(cast<CallBase>(I));
+ const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos);
+ if (WillReturnAA.isKnownWillReturn())
+ return true;
+ if (!WillReturnAA.isAssumedWillReturn())
+ return false;
+ const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos);
+ return NoRecurseAA.isAssumedNoRecurse();
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "willreturn" : "may-noreturn";
+ }
+};
+
+struct AAWillReturnFunction final : AAWillReturnImpl {
+ AAWillReturnFunction(const IRPosition &IRP, Attributor &A)
+ : AAWillReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) }
+};
+
+/// WillReturn attribute deduction for a call site.
+struct AAWillReturnCallSite final : AAWillReturnImpl {
+ AAWillReturnCallSite(const IRPosition &IRP, Attributor &A)
+ : AAWillReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
AAWillReturn::initialize(A);
- Function *F = getAssociatedFunction();
+ Function *F = getAssociatedFunction();
if (!F || !A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
-    //       sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); }
-};
-
-/// -------------------AAReachability Attribute--------------------------
-
-struct AAReachabilityImpl : AAReachability {
- AAReachabilityImpl(const IRPosition &IRP, Attributor &A)
- : AAReachability(IRP, A) {}
-
- const std::string getAsStr() const override {
- // TODO: Return the number of reachable queries.
- return "reachable";
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override { indicatePessimisticFixpoint(); }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-};
-
-struct AAReachabilityFunction final : public AAReachabilityImpl {
- AAReachabilityFunction(const IRPosition &IRP, Attributor &A)
- : AAReachabilityImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); }
-};
-
-/// ------------------------ NoAlias Argument Attribute ------------------------
-
-struct AANoAliasImpl : AANoAlias {
- AANoAliasImpl(const IRPosition &IRP, Attributor &A) : AANoAlias(IRP, A) {
- assert(getAssociatedType()->isPointerTy() &&
- "Noalias is a pointer attribute");
- }
-
- const std::string getAsStr() const override {
- return getAssumed() ? "noalias" : "may-alias";
- }
-};
-
-/// NoAlias attribute for a floating value.
-struct AANoAliasFloating final : AANoAliasImpl {
- AANoAliasFloating(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoAliasImpl::initialize(A);
- Value *Val = &getAssociatedValue();
- do {
- CastInst *CI = dyn_cast<CastInst>(Val);
- if (!CI)
- break;
- Value *Base = CI->getOperand(0);
- if (!Base->hasOneUse())
- break;
- Val = Base;
- } while (true);
-
- if (!Val->getType()->isPointerTy()) {
- indicatePessimisticFixpoint();
- return;
- }
-
- if (isa<AllocaInst>(Val))
- indicateOptimisticFixpoint();
- else if (isa<ConstantPointerNull>(Val) &&
- !NullPointerIsDefined(getAnchorScope(),
- Val->getType()->getPointerAddressSpace()))
- indicateOptimisticFixpoint();
- else if (Val != &getAssociatedValue()) {
- const auto &ValNoAliasAA =
- A.getAAFor<AANoAlias>(*this, IRPosition::value(*Val));
- if (ValNoAliasAA.isKnownNoAlias())
- indicateOptimisticFixpoint();
- }
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Implement this.
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(noalias)
- }
-};
-
-/// NoAlias attribute for an argument.
-struct AANoAliasArgument final
- : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> {
- using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>;
- AANoAliasArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Base::initialize(A);
- // See callsite argument attribute and callee argument attribute.
- if (hasAttr({Attribute::ByVal}))
- indicateOptimisticFixpoint();
- }
-
- /// See AbstractAttribute::update(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // We have to make sure no-alias on the argument does not break
- // synchronization when this is a callback argument, see also [1] below.
- // If synchronization cannot be affected, we delegate to the base updateImpl
- // function, otherwise we give up for now.
-
- // If the function is no-sync, no-alias cannot break synchronization.
- const auto &NoSyncAA = A.getAAFor<AANoSync>(
- *this, IRPosition::function_scope(getIRPosition()));
- if (NoSyncAA.isAssumedNoSync())
- return Base::updateImpl(A);
-
- // If the argument is read-only, no-alias cannot break synchronization.
- const auto &MemBehaviorAA =
- A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
- if (MemBehaviorAA.isAssumedReadOnly())
- return Base::updateImpl(A);
-
- // If the argument is never passed through callbacks, no-alias cannot break
- // synchronization.
- bool AllCallSitesKnown;
- if (A.checkForAllCallSites(
- [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this,
- true, AllCallSitesKnown))
- return Base::updateImpl(A);
-
- // TODO: add no-alias but make sure it doesn't break synchronization by
- // introducing fake uses. See:
- // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. Finkel,
- // International Workshop on OpenMP 2018,
- // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf
-
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) }
-};
-
-struct AANoAliasCallSiteArgument final : AANoAliasImpl {
- AANoAliasCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // See callsite argument attribute and callee argument attribute.
- const auto &CB = cast<CallBase>(getAnchorValue());
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); }
+};
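+
+// Usage sketch (illustrative only; `A`, `QueryingAA`, and `CB` are assumed to
+// be an Attributor, the querying abstract attribute, and a CallBase): whether
+// a particular call is guaranteed to eventually return can be queried at the
+// call-site position directly:
+//
+//   const auto &WRAA = A.getAAFor<AAWillReturn>(
+//       QueryingAA, IRPosition::callsite_function(CB));
+//   bool GuaranteedToReturn = WRAA.isKnownWillReturn();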
+
+/// -------------------AAReachability Attribute--------------------------
+
+struct AAReachabilityImpl : AAReachability {
+ AAReachabilityImpl(const IRPosition &IRP, Attributor &A)
+ : AAReachability(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ // TODO: Return the number of reachable queries.
+ return "reachable";
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override { indicatePessimisticFixpoint(); }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+};
+
+struct AAReachabilityFunction final : public AAReachabilityImpl {
+ AAReachabilityFunction(const IRPosition &IRP, Attributor &A)
+ : AAReachabilityImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); }
+};
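+
+// Usage sketch (illustrative only; names are assumed): the no-alias deduction
+// below only needs to reason about uses that may execute before the call site,
+// which it checks through this attribute roughly as follows, given a scope
+// function `ScopeFn`, a user instruction `UserI`, and a context instruction
+// `CtxI`:
+//
+//   const auto &RA = A.getAAFor<AAReachability>(
+//       QueryingAA, IRPosition::function(*ScopeFn));
+//   bool MayReachCtx = RA.isAssumedReachable(A, *UserI, *CtxI);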
+
+/// ------------------------ NoAlias Argument Attribute ------------------------
+
+struct AANoAliasImpl : AANoAlias {
+ AANoAliasImpl(const IRPosition &IRP, Attributor &A) : AANoAlias(IRP, A) {
+ assert(getAssociatedType()->isPointerTy() &&
+ "Noalias is a pointer attribute");
+ }
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "noalias" : "may-alias";
+ }
+};
+
+/// NoAlias attribute for a floating value.
+struct AANoAliasFloating final : AANoAliasImpl {
+ AANoAliasFloating(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoAliasImpl::initialize(A);
+ Value *Val = &getAssociatedValue();
+ do {
+ CastInst *CI = dyn_cast<CastInst>(Val);
+ if (!CI)
+ break;
+ Value *Base = CI->getOperand(0);
+ if (!Base->hasOneUse())
+ break;
+ Val = Base;
+ } while (true);
+
+ if (!Val->getType()->isPointerTy()) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (isa<AllocaInst>(Val))
+ indicateOptimisticFixpoint();
+ else if (isa<ConstantPointerNull>(Val) &&
+ !NullPointerIsDefined(getAnchorScope(),
+ Val->getType()->getPointerAddressSpace()))
+ indicateOptimisticFixpoint();
+ else if (Val != &getAssociatedValue()) {
+ const auto &ValNoAliasAA =
+ A.getAAFor<AANoAlias>(*this, IRPosition::value(*Val));
+ if (ValNoAliasAA.isKnownNoAlias())
+ indicateOptimisticFixpoint();
+ }
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Implement this.
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(noalias)
+ }
+};
+
+/// NoAlias attribute for an argument.
+struct AANoAliasArgument final
+ : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> {
+ using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>;
+ AANoAliasArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Base::initialize(A);
+ // See callsite argument attribute and callee argument attribute.
+ if (hasAttr({Attribute::ByVal}))
+ indicateOptimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::update(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // We have to make sure no-alias on the argument does not break
+ // synchronization when this is a callback argument, see also [1] below.
+ // If synchronization cannot be affected, we delegate to the base updateImpl
+ // function, otherwise we give up for now.
+
+ // If the function is no-sync, no-alias cannot break synchronization.
+ const auto &NoSyncAA = A.getAAFor<AANoSync>(
+ *this, IRPosition::function_scope(getIRPosition()));
+ if (NoSyncAA.isAssumedNoSync())
+ return Base::updateImpl(A);
+
+ // If the argument is read-only, no-alias cannot break synchronization.
+ const auto &MemBehaviorAA =
+ A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
+ if (MemBehaviorAA.isAssumedReadOnly())
+ return Base::updateImpl(A);
+
+ // If the argument is never passed through callbacks, no-alias cannot break
+ // synchronization.
+ bool AllCallSitesKnown;
+ if (A.checkForAllCallSites(
+ [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this,
+ true, AllCallSitesKnown))
+ return Base::updateImpl(A);
+
+ // TODO: add no-alias but make sure it doesn't break synchronization by
+ // introducing fake uses. See:
+ // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. Finkel,
+ // International Workshop on OpenMP 2018,
+ // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf
+
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) }
+};
+
+struct AANoAliasCallSiteArgument final : AANoAliasImpl {
+ AANoAliasCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // See callsite argument attribute and callee argument attribute.
+ const auto &CB = cast<CallBase>(getAnchorValue());
if (CB.paramHasAttr(getCallSiteArgNo(), Attribute::NoAlias))
- indicateOptimisticFixpoint();
- Value &Val = getAssociatedValue();
- if (isa<ConstantPointerNull>(Val) &&
- !NullPointerIsDefined(getAnchorScope(),
- Val.getType()->getPointerAddressSpace()))
- indicateOptimisticFixpoint();
- }
-
- /// Determine if the underlying value may alias with the call site argument
-  /// \p OtherArgNo of \p CB (= the underlying call site).
- bool mayAliasWithArgument(Attributor &A, AAResults *&AAR,
- const AAMemoryBehavior &MemBehaviorAA,
- const CallBase &CB, unsigned OtherArgNo) {
- // We do not need to worry about aliasing with the underlying IRP.
+ indicateOptimisticFixpoint();
+ Value &Val = getAssociatedValue();
+ if (isa<ConstantPointerNull>(Val) &&
+ !NullPointerIsDefined(getAnchorScope(),
+ Val.getType()->getPointerAddressSpace()))
+ indicateOptimisticFixpoint();
+ }
+
+ /// Determine if the underlying value may alias with the call site argument
+  /// \p OtherArgNo of \p CB (= the underlying call site).
+ bool mayAliasWithArgument(Attributor &A, AAResults *&AAR,
+ const AAMemoryBehavior &MemBehaviorAA,
+ const CallBase &CB, unsigned OtherArgNo) {
+ // We do not need to worry about aliasing with the underlying IRP.
if (this->getCalleeArgNo() == (int)OtherArgNo)
- return false;
-
- // If it is not a pointer or pointer vector we do not alias.
- const Value *ArgOp = CB.getArgOperand(OtherArgNo);
- if (!ArgOp->getType()->isPtrOrPtrVectorTy())
- return false;
-
- auto &CBArgMemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, IRPosition::callsite_argument(CB, OtherArgNo),
- /* TrackDependence */ false);
-
- // If the argument is readnone, there is no read-write aliasing.
- if (CBArgMemBehaviorAA.isAssumedReadNone()) {
- A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return false;
- }
-
- // If the argument is readonly and the underlying value is readonly, there
- // is no read-write aliasing.
- bool IsReadOnly = MemBehaviorAA.isAssumedReadOnly();
- if (CBArgMemBehaviorAA.isAssumedReadOnly() && IsReadOnly) {
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return false;
- }
-
- // We have to utilize actual alias analysis queries so we need the object.
- if (!AAR)
- AAR = A.getInfoCache().getAAResultsForFunction(*getAnchorScope());
-
- // Try to rule it out at the call site.
- bool IsAliasing = !AAR || !AAR->isNoAlias(&getAssociatedValue(), ArgOp);
- LLVM_DEBUG(dbgs() << "[NoAliasCSArg] Check alias between "
- "callsite arguments: "
- << getAssociatedValue() << " " << *ArgOp << " => "
- << (IsAliasing ? "" : "no-") << "alias \n");
-
- return IsAliasing;
- }
-
- bool
- isKnownNoAliasDueToNoAliasPreservation(Attributor &A, AAResults *&AAR,
- const AAMemoryBehavior &MemBehaviorAA,
- const AANoAlias &NoAliasAA) {
- // We can deduce "noalias" if the following conditions hold.
- // (i) Associated value is assumed to be noalias in the definition.
- // (ii) Associated value is assumed to be no-capture in all the uses
- // possibly executed before this callsite.
- // (iii) There is no other pointer argument which could alias with the
- // value.
-
- bool AssociatedValueIsNoAliasAtDef = NoAliasAA.isAssumedNoAlias();
- if (!AssociatedValueIsNoAliasAtDef) {
- LLVM_DEBUG(dbgs() << "[AANoAlias] " << getAssociatedValue()
- << " is not no-alias at the definition\n");
- return false;
- }
-
- A.recordDependence(NoAliasAA, *this, DepClassTy::OPTIONAL);
-
- const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
+ return false;
+
+ // If it is not a pointer or pointer vector we do not alias.
+ const Value *ArgOp = CB.getArgOperand(OtherArgNo);
+ if (!ArgOp->getType()->isPtrOrPtrVectorTy())
+ return false;
+
+ auto &CBArgMemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, IRPosition::callsite_argument(CB, OtherArgNo),
+ /* TrackDependence */ false);
+
+ // If the argument is readnone, there is no read-write aliasing.
+ if (CBArgMemBehaviorAA.isAssumedReadNone()) {
+ A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return false;
+ }
+
+ // If the argument is readonly and the underlying value is readonly, there
+ // is no read-write aliasing.
+ bool IsReadOnly = MemBehaviorAA.isAssumedReadOnly();
+ if (CBArgMemBehaviorAA.isAssumedReadOnly() && IsReadOnly) {
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return false;
+ }
+
+ // We have to utilize actual alias analysis queries so we need the object.
+ if (!AAR)
+ AAR = A.getInfoCache().getAAResultsForFunction(*getAnchorScope());
+
+ // Try to rule it out at the call site.
+ bool IsAliasing = !AAR || !AAR->isNoAlias(&getAssociatedValue(), ArgOp);
+ LLVM_DEBUG(dbgs() << "[NoAliasCSArg] Check alias between "
+ "callsite arguments: "
+ << getAssociatedValue() << " " << *ArgOp << " => "
+ << (IsAliasing ? "" : "no-") << "alias \n");
+
+ return IsAliasing;
+ }
+
+ bool
+ isKnownNoAliasDueToNoAliasPreservation(Attributor &A, AAResults *&AAR,
+ const AAMemoryBehavior &MemBehaviorAA,
+ const AANoAlias &NoAliasAA) {
+ // We can deduce "noalias" if the following conditions hold.
+ // (i) Associated value is assumed to be noalias in the definition.
+ // (ii) Associated value is assumed to be no-capture in all the uses
+ // possibly executed before this callsite.
+ // (iii) There is no other pointer argument which could alias with the
+ // value.
+
+ bool AssociatedValueIsNoAliasAtDef = NoAliasAA.isAssumedNoAlias();
+ if (!AssociatedValueIsNoAliasAtDef) {
+ LLVM_DEBUG(dbgs() << "[AANoAlias] " << getAssociatedValue()
+ << " is not no-alias at the definition\n");
+ return false;
+ }
+
+ A.recordDependence(NoAliasAA, *this, DepClassTy::OPTIONAL);
+
+ const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
const Function *ScopeFn = VIRP.getAnchorScope();
- auto &NoCaptureAA =
- A.getAAFor<AANoCapture>(*this, VIRP, /* TrackDependence */ false);
- // Check whether the value is captured in the scope using AANoCapture.
- // Look at CFG and check only uses possibly executed before this
- // callsite.
- auto UsePred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
-
+ auto &NoCaptureAA =
+ A.getAAFor<AANoCapture>(*this, VIRP, /* TrackDependence */ false);
+ // Check whether the value is captured in the scope using AANoCapture.
+ // Look at CFG and check only uses possibly executed before this
+ // callsite.
+ auto UsePred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+
    // If UserI is the current instruction and there is a single potential use
    // of the value in UserI, we allow the use.
// TODO: We should inspect the operands and allow those that cannot alias
// with the value.
if (UserI == getCtxI() && UserI->getNumOperands() == 1)
- return true;
-
- if (ScopeFn) {
- const auto &ReachabilityAA =
- A.getAAFor<AAReachability>(*this, IRPosition::function(*ScopeFn));
-
+ return true;
+
+ if (ScopeFn) {
+ const auto &ReachabilityAA =
+ A.getAAFor<AAReachability>(*this, IRPosition::function(*ScopeFn));
+
if (!ReachabilityAA.isAssumedReachable(A, *UserI, *getCtxI()))
- return true;
-
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- if (CB->isArgOperand(&U)) {
-
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- if (NoCaptureAA.isAssumedNoCapture())
- return true;
- }
- }
- }
-
- // For cases which can potentially have more users
- if (isa<GetElementPtrInst>(U) || isa<BitCastInst>(U) || isa<PHINode>(U) ||
- isa<SelectInst>(U)) {
- Follow = true;
- return true;
- }
-
- LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *U << "\n");
- return false;
- };
-
- if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) {
- LLVM_DEBUG(
- dbgs() << "[AANoAliasCSArg] " << getAssociatedValue()
- << " cannot be noalias as it is potentially captured\n");
- return false;
- }
- }
- A.recordDependence(NoCaptureAA, *this, DepClassTy::OPTIONAL);
-
- // Check there is no other pointer argument which could alias with the
- // value passed at this call site.
- // TODO: AbstractCallSite
- const auto &CB = cast<CallBase>(getAnchorValue());
- for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands();
- OtherArgNo++)
- if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo))
- return false;
-
- return true;
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // If the argument is readnone we are done as there are no accesses via the
- // argument.
- auto &MemBehaviorAA =
- A.getAAFor<AAMemoryBehavior>(*this, getIRPosition(),
- /* TrackDependence */ false);
- if (MemBehaviorAA.isAssumedReadNone()) {
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return ChangeStatus::UNCHANGED;
- }
-
- const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
- const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, VIRP,
- /* TrackDependence */ false);
-
- AAResults *AAR = nullptr;
- if (isKnownNoAliasDueToNoAliasPreservation(A, AAR, MemBehaviorAA,
- NoAliasAA)) {
- LLVM_DEBUG(
- dbgs() << "[AANoAlias] No-Alias deduced via no-alias preservation\n");
- return ChangeStatus::UNCHANGED;
- }
-
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) }
-};
-
-/// NoAlias attribute for function return value.
-struct AANoAliasReturned final : AANoAliasImpl {
- AANoAliasReturned(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
+ return true;
+
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (CB->isArgOperand(&U)) {
+
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ if (NoCaptureAA.isAssumedNoCapture())
+ return true;
+ }
+ }
+ }
+
+ // For cases which can potentially have more users
+ if (isa<GetElementPtrInst>(U) || isa<BitCastInst>(U) || isa<PHINode>(U) ||
+ isa<SelectInst>(U)) {
+ Follow = true;
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *U << "\n");
+ return false;
+ };
+
+ if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) {
+ LLVM_DEBUG(
+ dbgs() << "[AANoAliasCSArg] " << getAssociatedValue()
+ << " cannot be noalias as it is potentially captured\n");
+ return false;
+ }
+ }
+ A.recordDependence(NoCaptureAA, *this, DepClassTy::OPTIONAL);
+
+ // Check there is no other pointer argument which could alias with the
+ // value passed at this call site.
+ // TODO: AbstractCallSite
+ const auto &CB = cast<CallBase>(getAnchorValue());
+ for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands();
+ OtherArgNo++)
+ if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo))
+ return false;
+
+ return true;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // If the argument is readnone we are done as there are no accesses via the
+ // argument.
+ auto &MemBehaviorAA =
+ A.getAAFor<AAMemoryBehavior>(*this, getIRPosition(),
+ /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadNone()) {
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return ChangeStatus::UNCHANGED;
+ }
+
+ const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
+ const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, VIRP,
+ /* TrackDependence */ false);
+
+ AAResults *AAR = nullptr;
+ if (isKnownNoAliasDueToNoAliasPreservation(A, AAR, MemBehaviorAA,
+ NoAliasAA)) {
+ LLVM_DEBUG(
+ dbgs() << "[AANoAlias] No-Alias deduced via no-alias preservation\n");
+ return ChangeStatus::UNCHANGED;
+ }
+
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) }
+};
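+
+// For intuition, an illustrative source-level example (hypothetical, not taken
+// from a test): in
+//
+//   void callee(int *P);
+//   void caller() {
+//     int Local;       // an alloca, `noalias` at its definition
+//     callee(&Local);  // &Local is not captured before the call and no other
+//   }                  // pointer argument can alias it, so this call-site
+//                      // argument can be marked `noalias`.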
+
+/// NoAlias attribute for function return value.
+struct AANoAliasReturned final : AANoAliasImpl {
+ AANoAliasReturned(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AANoAliasImpl::initialize(A);
@@ -2692,371 +2692,371 @@ struct AANoAliasReturned final : AANoAliasImpl {
indicatePessimisticFixpoint();
}
- /// See AbstractAttribute::updateImpl(...).
- virtual ChangeStatus updateImpl(Attributor &A) override {
-
- auto CheckReturnValue = [&](Value &RV) -> bool {
- if (Constant *C = dyn_cast<Constant>(&RV))
- if (C->isNullValue() || isa<UndefValue>(C))
- return true;
-
- /// For now, we can only deduce noalias if we have call sites.
- /// FIXME: add more support.
- if (!isa<CallBase>(&RV))
- return false;
-
- const IRPosition &RVPos = IRPosition::value(RV);
- const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos);
- if (!NoAliasAA.isAssumedNoAlias())
- return false;
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos);
- return NoCaptureAA.isAssumedNoCaptureMaybeReturned();
- };
-
- if (!A.checkForAllReturnedValues(CheckReturnValue, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) }
-};
-
-/// NoAlias attribute deduction for a call site return value.
-struct AANoAliasCallSiteReturned final : AANoAliasImpl {
- AANoAliasCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoAliasImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::updateImpl(...).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+
+ auto CheckReturnValue = [&](Value &RV) -> bool {
+ if (Constant *C = dyn_cast<Constant>(&RV))
+ if (C->isNullValue() || isa<UndefValue>(C))
+ return true;
+
+ /// For now, we can only deduce noalias if we have call sites.
+ /// FIXME: add more support.
+ if (!isa<CallBase>(&RV))
+ return false;
+
+ const IRPosition &RVPos = IRPosition::value(RV);
+ const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos);
+ if (!NoAliasAA.isAssumedNoAlias())
+ return false;
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos);
+ return NoCaptureAA.isAssumedNoCaptureMaybeReturned();
+ };
+
+ if (!A.checkForAllReturnedValues(CheckReturnValue, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) }
+};
+
+/// NoAlias attribute deduction for a call site return value.
+struct AANoAliasCallSiteReturned final : AANoAliasImpl {
+ AANoAliasCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoAliasImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
-    //       sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::returned(*F);
- auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::returned(*F);
+ auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); }
-};
-
-/// -------------------AAIsDead Function Attribute-----------------------
-
-struct AAIsDeadValueImpl : public AAIsDead {
- AAIsDeadValueImpl(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
-
- /// See AAIsDead::isAssumedDead().
- bool isAssumedDead() const override { return getAssumed(); }
-
- /// See AAIsDead::isKnownDead().
- bool isKnownDead() const override { return getKnown(); }
-
- /// See AAIsDead::isAssumedDead(BasicBlock *).
- bool isAssumedDead(const BasicBlock *BB) const override { return false; }
-
- /// See AAIsDead::isKnownDead(BasicBlock *).
- bool isKnownDead(const BasicBlock *BB) const override { return false; }
-
- /// See AAIsDead::isAssumedDead(Instruction *I).
- bool isAssumedDead(const Instruction *I) const override {
- return I == getCtxI() && isAssumedDead();
- }
-
- /// See AAIsDead::isKnownDead(Instruction *I).
- bool isKnownDead(const Instruction *I) const override {
- return isAssumedDead(I) && getKnown();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return isAssumedDead() ? "assumed-dead" : "assumed-live";
- }
-
- /// Check if all uses are assumed dead.
- bool areAllUsesAssumedDead(Attributor &A, Value &V) {
- auto UsePred = [&](const Use &U, bool &Follow) { return false; };
-    // Explicitly set the dependence class to required because we want a long
-    // chain of N dependent instructions to be considered live as soon as one
-    // of them is, without going through N update cycles. This is not required
-    // for correctness.
- return A.checkForAllUses(UsePred, *this, V, DepClassTy::REQUIRED);
- }
-
- /// Determine if \p I is assumed to be side-effect free.
- bool isAssumedSideEffectFree(Attributor &A, Instruction *I) {
- if (!I || wouldInstructionBeTriviallyDead(I))
- return true;
-
- auto *CB = dyn_cast<CallBase>(I);
- if (!CB || isa<IntrinsicInst>(CB))
- return false;
-
- const IRPosition &CallIRP = IRPosition::callsite_function(*CB);
- const auto &NoUnwindAA = A.getAndUpdateAAFor<AANoUnwind>(
- *this, CallIRP, /* TrackDependence */ false);
- if (!NoUnwindAA.isAssumedNoUnwind())
- return false;
- if (!NoUnwindAA.isKnownNoUnwind())
- A.recordDependence(NoUnwindAA, *this, DepClassTy::OPTIONAL);
-
- const auto &MemBehaviorAA = A.getAndUpdateAAFor<AAMemoryBehavior>(
- *this, CallIRP, /* TrackDependence */ false);
- if (MemBehaviorAA.isAssumedReadOnly()) {
- if (!MemBehaviorAA.isKnownReadOnly())
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return true;
- }
- return false;
- }
-};
-
-struct AAIsDeadFloating : public AAIsDeadValueImpl {
- AAIsDeadFloating(const IRPosition &IRP, Attributor &A)
- : AAIsDeadValueImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (isa<UndefValue>(getAssociatedValue())) {
- indicatePessimisticFixpoint();
- return;
- }
-
- Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
- if (!isAssumedSideEffectFree(A, I))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
- if (!isAssumedSideEffectFree(A, I))
- return indicatePessimisticFixpoint();
-
- if (!areAllUsesAssumedDead(A, getAssociatedValue()))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- Value &V = getAssociatedValue();
- if (auto *I = dyn_cast<Instruction>(&V)) {
-      // If we get here we basically know that all users are dead. We re-check
-      // isAssumedSideEffectFree because it may no longer hold, in which case
-      // only the users are dead but the instruction (= the call) itself is
-      // still needed.
- if (isAssumedSideEffectFree(A, I) && !isa<InvokeInst>(I)) {
- A.deleteAfterManifest(*I);
- return ChangeStatus::CHANGED;
- }
- }
- if (V.use_empty())
- return ChangeStatus::UNCHANGED;
-
- bool UsedAssumedInformation = false;
- Optional<Constant *> C =
- A.getAssumedConstant(V, *this, UsedAssumedInformation);
- if (C.hasValue() && C.getValue())
- return ChangeStatus::UNCHANGED;
-
- // Replace the value with undef as it is dead but keep droppable uses around
- // as they provide information we don't want to give up on just yet.
- UndefValue &UV = *UndefValue::get(V.getType());
- bool AnyChange =
-        A.changeValueAfterManifest(V, UV, /* ChangeDroppable */ false);
- return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(IsDead)
- }
-};
-
-struct AAIsDeadArgument : public AAIsDeadFloating {
- AAIsDeadArgument(const IRPosition &IRP, Attributor &A)
- : AAIsDeadFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (!A.isFunctionIPOAmendable(*getAnchorScope()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = AAIsDeadFloating::manifest(A);
- Argument &Arg = *getAssociatedArgument();
- if (A.isValidFunctionSignatureRewrite(Arg, /* ReplacementTypes */ {}))
- if (A.registerFunctionSignatureRewrite(
- Arg, /* ReplacementTypes */ {},
- Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{},
- Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) {
- Arg.dropDroppableUses();
- return ChangeStatus::CHANGED;
- }
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) }
-};
-
-struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl {
- AAIsDeadCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAIsDeadValueImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (isa<UndefValue>(getAssociatedValue()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
-    //       sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); }
+};
+
+/// -------------------AAIsDead Function Attribute-----------------------
+
+struct AAIsDeadValueImpl : public AAIsDead {
+ AAIsDeadValueImpl(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
+
+ /// See AAIsDead::isAssumedDead().
+ bool isAssumedDead() const override { return getAssumed(); }
+
+ /// See AAIsDead::isKnownDead().
+ bool isKnownDead() const override { return getKnown(); }
+
+ /// See AAIsDead::isAssumedDead(BasicBlock *).
+ bool isAssumedDead(const BasicBlock *BB) const override { return false; }
+
+ /// See AAIsDead::isKnownDead(BasicBlock *).
+ bool isKnownDead(const BasicBlock *BB) const override { return false; }
+
+ /// See AAIsDead::isAssumedDead(Instruction *I).
+ bool isAssumedDead(const Instruction *I) const override {
+ return I == getCtxI() && isAssumedDead();
+ }
+
+ /// See AAIsDead::isKnownDead(Instruction *I).
+ bool isKnownDead(const Instruction *I) const override {
+ return isAssumedDead(I) && getKnown();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return isAssumedDead() ? "assumed-dead" : "assumed-live";
+ }
+
+ /// Check if all uses are assumed dead.
+ bool areAllUsesAssumedDead(Attributor &A, Value &V) {
+ auto UsePred = [&](const Use &U, bool &Follow) { return false; };
+    // Explicitly set the dependence class to required because we want a long
+    // chain of N dependent instructions to be considered live as soon as one
+    // of them is, without going through N update cycles. This is not required
+    // for correctness.
+ return A.checkForAllUses(UsePred, *this, V, DepClassTy::REQUIRED);
+ }
+
+ /// Determine if \p I is assumed to be side-effect free.
+ bool isAssumedSideEffectFree(Attributor &A, Instruction *I) {
+ if (!I || wouldInstructionBeTriviallyDead(I))
+ return true;
+
+ auto *CB = dyn_cast<CallBase>(I);
+ if (!CB || isa<IntrinsicInst>(CB))
+ return false;
+
+ const IRPosition &CallIRP = IRPosition::callsite_function(*CB);
+ const auto &NoUnwindAA = A.getAndUpdateAAFor<AANoUnwind>(
+ *this, CallIRP, /* TrackDependence */ false);
+ if (!NoUnwindAA.isAssumedNoUnwind())
+ return false;
+ if (!NoUnwindAA.isKnownNoUnwind())
+ A.recordDependence(NoUnwindAA, *this, DepClassTy::OPTIONAL);
+
+ const auto &MemBehaviorAA = A.getAndUpdateAAFor<AAMemoryBehavior>(
+ *this, CallIRP, /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadOnly()) {
+ if (!MemBehaviorAA.isKnownReadOnly())
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return true;
+ }
+ return false;
+ }
+};
+
+struct AAIsDeadFloating : public AAIsDeadValueImpl {
+ AAIsDeadFloating(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ if (!isAssumedSideEffectFree(A, I))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ if (!isAssumedSideEffectFree(A, I))
+ return indicatePessimisticFixpoint();
+
+ if (!areAllUsesAssumedDead(A, getAssociatedValue()))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ Value &V = getAssociatedValue();
+ if (auto *I = dyn_cast<Instruction>(&V)) {
+      // If we get here we basically know that all users are dead. We re-check
+      // isAssumedSideEffectFree because it may no longer hold, in which case
+      // only the users are dead but the instruction (= the call) itself is
+      // still needed.
+ if (isAssumedSideEffectFree(A, I) && !isa<InvokeInst>(I)) {
+ A.deleteAfterManifest(*I);
+ return ChangeStatus::CHANGED;
+ }
+ }
+ if (V.use_empty())
+ return ChangeStatus::UNCHANGED;
+
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ A.getAssumedConstant(V, *this, UsedAssumedInformation);
+ if (C.hasValue() && C.getValue())
+ return ChangeStatus::UNCHANGED;
+
+ // Replace the value with undef as it is dead but keep droppable uses around
+ // as they provide information we don't want to give up on just yet.
+ UndefValue &UV = *UndefValue::get(V.getType());
+ bool AnyChange =
+        A.changeValueAfterManifest(V, UV, /* ChangeDroppable */ false);
+ return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(IsDead)
+ }
+};
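+
+// Usage sketch (illustrative only; `A`, `QueryingAA`, `V`, and `I` are assumed
+// names): other abstract attributes typically consult liveness before spending
+// work on a value or instruction, roughly:
+//
+//   const auto &LivenessAA =
+//       A.getAAFor<AAIsDead>(QueryingAA, IRPosition::value(V));
+//   if (LivenessAA.isAssumedDead(&I))
+//     ; // Skip I; it is assumed dead and may be deleted at manifest time.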
+
+struct AAIsDeadArgument : public AAIsDeadFloating {
+ AAIsDeadArgument(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (!A.isFunctionIPOAmendable(*getAnchorScope()))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = AAIsDeadFloating::manifest(A);
+ Argument &Arg = *getAssociatedArgument();
+ if (A.isValidFunctionSignatureRewrite(Arg, /* ReplacementTypes */ {}))
+ if (A.registerFunctionSignatureRewrite(
+ Arg, /* ReplacementTypes */ {},
+ Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{},
+ Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) {
+ Arg.dropDroppableUses();
+ return ChangeStatus::CHANGED;
+ }
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) }
+};
+
+struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl {
+ AAIsDeadCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue()))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- CallBase &CB = cast<CallBase>(getAnchorValue());
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ CallBase &CB = cast<CallBase>(getAnchorValue());
Use &U = CB.getArgOperandUse(getCallSiteArgNo());
- assert(!isa<UndefValue>(U.get()) &&
- "Expected undef values to be filtered out!");
- UndefValue &UV = *UndefValue::get(U->getType());
- if (A.changeUseAfterManifest(U, UV))
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) }
-};
-
-struct AAIsDeadCallSiteReturned : public AAIsDeadFloating {
- AAIsDeadCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAIsDeadFloating(IRP, A), IsAssumedSideEffectFree(true) {}
-
- /// See AAIsDead::isAssumedDead().
- bool isAssumedDead() const override {
- return AAIsDeadFloating::isAssumedDead() && IsAssumedSideEffectFree;
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (isa<UndefValue>(getAssociatedValue())) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // We track this separately as a secondary state.
- IsAssumedSideEffectFree = isAssumedSideEffectFree(A, getCtxI());
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- if (IsAssumedSideEffectFree && !isAssumedSideEffectFree(A, getCtxI())) {
- IsAssumedSideEffectFree = false;
- Changed = ChangeStatus::CHANGED;
- }
-
- if (!areAllUsesAssumedDead(A, getAssociatedValue()))
- return indicatePessimisticFixpoint();
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (IsAssumedSideEffectFree)
- STATS_DECLTRACK_CSRET_ATTR(IsDead)
- else
- STATS_DECLTRACK_CSRET_ATTR(UnusedResult)
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return isAssumedDead()
- ? "assumed-dead"
- : (getAssumed() ? "assumed-dead-users" : "assumed-live");
- }
-
-private:
- bool IsAssumedSideEffectFree;
-};
-
-struct AAIsDeadReturned : public AAIsDeadValueImpl {
- AAIsDeadReturned(const IRPosition &IRP, Attributor &A)
- : AAIsDeadValueImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
-
- A.checkForAllInstructions([](Instruction &) { return true; }, *this,
- {Instruction::Ret});
-
- auto PredForCallSite = [&](AbstractCallSite ACS) {
- if (ACS.isCallbackCall() || !ACS.getInstruction())
- return false;
- return areAllUsesAssumedDead(A, *ACS.getInstruction());
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(PredForCallSite, *this, true,
- AllCallSitesKnown))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // TODO: Rewrite the signature to return void?
- bool AnyChange = false;
- UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType());
- auto RetInstPred = [&](Instruction &I) {
- ReturnInst &RI = cast<ReturnInst>(I);
- if (!isa<UndefValue>(RI.getReturnValue()))
- AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV);
- return true;
- };
- A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret});
- return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) }
-};
-
-struct AAIsDeadFunction : public AAIsDead {
- AAIsDeadFunction(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- const Function *F = getAnchorScope();
- if (F && !F->isDeclaration()) {
+ assert(!isa<UndefValue>(U.get()) &&
+ "Expected undef values to be filtered out!");
+ UndefValue &UV = *UndefValue::get(U->getType());
+ if (A.changeUseAfterManifest(U, UV))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) }
+};
+
+struct AAIsDeadCallSiteReturned : public AAIsDeadFloating {
+ AAIsDeadCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFloating(IRP, A), IsAssumedSideEffectFree(true) {}
+
+ /// See AAIsDead::isAssumedDead().
+ bool isAssumedDead() const override {
+ return AAIsDeadFloating::isAssumedDead() && IsAssumedSideEffectFree;
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // We track this separately as a secondary state.
+ IsAssumedSideEffectFree = isAssumedSideEffectFree(A, getCtxI());
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+ if (IsAssumedSideEffectFree && !isAssumedSideEffectFree(A, getCtxI())) {
+ IsAssumedSideEffectFree = false;
+ Changed = ChangeStatus::CHANGED;
+ }
+
+ if (!areAllUsesAssumedDead(A, getAssociatedValue()))
+ return indicatePessimisticFixpoint();
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (IsAssumedSideEffectFree)
+ STATS_DECLTRACK_CSRET_ATTR(IsDead)
+ else
+ STATS_DECLTRACK_CSRET_ATTR(UnusedResult)
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return isAssumedDead()
+ ? "assumed-dead"
+ : (getAssumed() ? "assumed-dead-users" : "assumed-live");
+ }
+
+private:
+ bool IsAssumedSideEffectFree;
+};
+
+struct AAIsDeadReturned : public AAIsDeadValueImpl {
+ AAIsDeadReturned(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+
+ A.checkForAllInstructions([](Instruction &) { return true; }, *this,
+ {Instruction::Ret});
+
+ auto PredForCallSite = [&](AbstractCallSite ACS) {
+ if (ACS.isCallbackCall() || !ACS.getInstruction())
+ return false;
+ return areAllUsesAssumedDead(A, *ACS.getInstruction());
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(PredForCallSite, *this, true,
+ AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // TODO: Rewrite the signature to return void?
+ bool AnyChange = false;
+ UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType());
+ auto RetInstPred = [&](Instruction &I) {
+ ReturnInst &RI = cast<ReturnInst>(I);
+ if (!isa<UndefValue>(RI.getReturnValue()))
+ AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV);
+ return true;
+ };
+ A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret});
+ return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) }
+};
+
+struct AAIsDeadFunction : public AAIsDead {
+ AAIsDeadFunction(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ const Function *F = getAnchorScope();
+ if (F && !F->isDeclaration()) {
// We only want to compute liveness once. If the function is not part of
// the SCC, skip it.
if (A.isRunOn(*const_cast<Function *>(F))) {
@@ -3065,267 +3065,267 @@ struct AAIsDeadFunction : public AAIsDead {
} else {
indicatePessimisticFixpoint();
}
- }
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" +
- std::to_string(getAnchorScope()->size()) + "][#TBEP " +
- std::to_string(ToBeExploredFrom.size()) + "][#KDE " +
- std::to_string(KnownDeadEnds.size()) + "]";
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- assert(getState().isValidState() &&
- "Attempted to manifest an invalid state!");
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- Function &F = *getAnchorScope();
-
- if (AssumedLiveBlocks.empty()) {
- A.deleteAfterManifest(F);
- return ChangeStatus::CHANGED;
- }
-
- // Flag to determine if we can change an invoke to a call assuming the
- // callee is nounwind. This is not possible if the personality of the
-    // function allows catching asynchronous exceptions.
- bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F);
-
- KnownDeadEnds.set_union(ToBeExploredFrom);
- for (const Instruction *DeadEndI : KnownDeadEnds) {
- auto *CB = dyn_cast<CallBase>(DeadEndI);
- if (!CB)
- continue;
- const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
- *this, IRPosition::callsite_function(*CB), /* TrackDependence */ true,
- DepClassTy::OPTIONAL);
- bool MayReturn = !NoReturnAA.isAssumedNoReturn();
- if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB)))
- continue;
-
- if (auto *II = dyn_cast<InvokeInst>(DeadEndI))
- A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II));
- else
- A.changeToUnreachableAfterManifest(
- const_cast<Instruction *>(DeadEndI->getNextNode()));
- HasChanged = ChangeStatus::CHANGED;
- }
-
- STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted.");
- for (BasicBlock &BB : F)
- if (!AssumedLiveBlocks.count(&BB)) {
- A.deleteAfterManifest(BB);
- ++BUILD_STAT_NAME(AAIsDead, BasicBlock);
- }
-
- return HasChanged;
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
+ }
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" +
+ std::to_string(getAnchorScope()->size()) + "][#TBEP " +
+ std::to_string(ToBeExploredFrom.size()) + "][#KDE " +
+ std::to_string(KnownDeadEnds.size()) + "]";
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ assert(getState().isValidState() &&
+ "Attempted to manifest an invalid state!");
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ Function &F = *getAnchorScope();
+
+ if (AssumedLiveBlocks.empty()) {
+ A.deleteAfterManifest(F);
+ return ChangeStatus::CHANGED;
+ }
+
+ // Flag to determine if we can change an invoke to a call assuming the
+ // callee is nounwind. This is not possible if the personality of the
+    // function allows catching asynchronous exceptions.
+ bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F);
+
+ KnownDeadEnds.set_union(ToBeExploredFrom);
+ for (const Instruction *DeadEndI : KnownDeadEnds) {
+ auto *CB = dyn_cast<CallBase>(DeadEndI);
+ if (!CB)
+ continue;
+ const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
+ *this, IRPosition::callsite_function(*CB), /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ bool MayReturn = !NoReturnAA.isAssumedNoReturn();
+ if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB)))
+ continue;
+
+ if (auto *II = dyn_cast<InvokeInst>(DeadEndI))
+ A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II));
+ else
+ A.changeToUnreachableAfterManifest(
+ const_cast<Instruction *>(DeadEndI->getNextNode()));
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted.");
+ for (BasicBlock &BB : F)
+ if (!AssumedLiveBlocks.count(&BB)) {
+ A.deleteAfterManifest(BB);
+ ++BUILD_STAT_NAME(AAIsDead, BasicBlock);
+ }
+
+ return HasChanged;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const override {
return !AssumedLiveEdges.count(std::make_pair(From, To));
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-
- /// Returns true if the function is assumed dead.
- bool isAssumedDead() const override { return false; }
-
- /// See AAIsDead::isKnownDead().
- bool isKnownDead() const override { return false; }
-
- /// See AAIsDead::isAssumedDead(BasicBlock *).
- bool isAssumedDead(const BasicBlock *BB) const override {
- assert(BB->getParent() == getAnchorScope() &&
- "BB must be in the same anchor scope function.");
-
- if (!getAssumed())
- return false;
- return !AssumedLiveBlocks.count(BB);
- }
-
- /// See AAIsDead::isKnownDead(BasicBlock *).
- bool isKnownDead(const BasicBlock *BB) const override {
- return getKnown() && isAssumedDead(BB);
- }
-
- /// See AAIsDead::isAssumed(Instruction *I).
- bool isAssumedDead(const Instruction *I) const override {
- assert(I->getParent()->getParent() == getAnchorScope() &&
- "Instruction must be in the same anchor scope function.");
-
- if (!getAssumed())
- return false;
-
-    // If it is not in AssumedLiveBlocks then it is for sure dead.
-    // Otherwise, it can still be after a noreturn call in a live block.
- if (!AssumedLiveBlocks.count(I->getParent()))
- return true;
-
- // If it is not after a liveness barrier it is live.
- const Instruction *PrevI = I->getPrevNode();
- while (PrevI) {
- if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI))
- return true;
- PrevI = PrevI->getPrevNode();
- }
- return false;
- }
-
- /// See AAIsDead::isKnownDead(Instruction *I).
- bool isKnownDead(const Instruction *I) const override {
- return getKnown() && isAssumedDead(I);
- }
-
- /// Assume \p BB is (partially) live now and indicate to the Attributor \p A
-  /// that internal functions called from \p BB should now be looked at.
- bool assumeLive(Attributor &A, const BasicBlock &BB) {
- if (!AssumedLiveBlocks.insert(&BB).second)
- return false;
-
- // We assume that all of BB is (probably) live now and if there are calls to
- // internal functions we will assume that those are now live as well. This
- // is a performance optimization for blocks with calls to a lot of internal
- // functions. It can however cause dead functions to be treated as live.
- for (const Instruction &I : BB)
- if (const auto *CB = dyn_cast<CallBase>(&I))
- if (const Function *F = CB->getCalledFunction())
- if (F->hasLocalLinkage())
- A.markLiveInternalFunction(*F);
- return true;
- }
-
- /// Collection of instructions that need to be explored again, e.g., we
- /// did assume they do not transfer control to (one of their) successors.
- SmallSetVector<const Instruction *, 8> ToBeExploredFrom;
-
- /// Collection of instructions that are known to not transfer control.
- SmallSetVector<const Instruction *, 8> KnownDeadEnds;
-
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// Returns true if the function is assumed dead.
+ bool isAssumedDead() const override { return false; }
+
+ /// See AAIsDead::isKnownDead().
+ bool isKnownDead() const override { return false; }
+
+ /// See AAIsDead::isAssumedDead(BasicBlock *).
+ bool isAssumedDead(const BasicBlock *BB) const override {
+ assert(BB->getParent() == getAnchorScope() &&
+ "BB must be in the same anchor scope function.");
+
+ if (!getAssumed())
+ return false;
+ return !AssumedLiveBlocks.count(BB);
+ }
+
+ /// See AAIsDead::isKnownDead(BasicBlock *).
+ bool isKnownDead(const BasicBlock *BB) const override {
+ return getKnown() && isAssumedDead(BB);
+ }
+
+ /// See AAIsDead::isAssumed(Instruction *I).
+ bool isAssumedDead(const Instruction *I) const override {
+ assert(I->getParent()->getParent() == getAnchorScope() &&
+ "Instruction must be in the same anchor scope function.");
+
+ if (!getAssumed())
+ return false;
+
+    // If it is not in AssumedLiveBlocks then it is for sure dead.
+    // Otherwise, it can still be after a noreturn call in a live block.
+ if (!AssumedLiveBlocks.count(I->getParent()))
+ return true;
+
+ // If it is not after a liveness barrier it is live.
+ const Instruction *PrevI = I->getPrevNode();
+ while (PrevI) {
+ if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI))
+ return true;
+ PrevI = PrevI->getPrevNode();
+ }
+ return false;
+ }
+
+ /// See AAIsDead::isKnownDead(Instruction *I).
+ bool isKnownDead(const Instruction *I) const override {
+ return getKnown() && isAssumedDead(I);
+ }
+
+ /// Assume \p BB is (partially) live now and indicate to the Attributor \p A
+  /// that internal functions called from \p BB should now be looked at.
+ bool assumeLive(Attributor &A, const BasicBlock &BB) {
+ if (!AssumedLiveBlocks.insert(&BB).second)
+ return false;
+
+ // We assume that all of BB is (probably) live now and if there are calls to
+ // internal functions we will assume that those are now live as well. This
+ // is a performance optimization for blocks with calls to a lot of internal
+ // functions. It can however cause dead functions to be treated as live.
+ for (const Instruction &I : BB)
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ if (const Function *F = CB->getCalledFunction())
+ if (F->hasLocalLinkage())
+ A.markLiveInternalFunction(*F);
+ return true;
+ }
+
+ /// Collection of instructions that need to be explored again, e.g., we
+ /// did assume they do not transfer control to (one of their) successors.
+ SmallSetVector<const Instruction *, 8> ToBeExploredFrom;
+
+ /// Collection of instructions that are known to not transfer control.
+ SmallSetVector<const Instruction *, 8> KnownDeadEnds;
+
/// Collection of all assumed live edges
DenseSet<std::pair<const BasicBlock *, const BasicBlock *>> AssumedLiveEdges;
- /// Collection of all assumed live BasicBlocks.
- DenseSet<const BasicBlock *> AssumedLiveBlocks;
-};
-
-static bool
-identifyAliveSuccessors(Attributor &A, const CallBase &CB,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- const IRPosition &IPos = IRPosition::callsite_function(CB);
-
- const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
- AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (NoReturnAA.isAssumedNoReturn())
- return !NoReturnAA.isKnownNoReturn();
- if (CB.isTerminator())
- AliveSuccessors.push_back(&CB.getSuccessor(0)->front());
- else
- AliveSuccessors.push_back(CB.getNextNode());
- return false;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const InvokeInst &II,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation =
- identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors);
-
- // First, determine if we can change an invoke to a call assuming the
- // callee is nounwind. This is not possible if the personality of the
-  // function allows catching asynchronous exceptions.
- if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) {
- AliveSuccessors.push_back(&II.getUnwindDest()->front());
- } else {
- const IRPosition &IPos = IRPosition::callsite_function(II);
- const auto &AANoUnw = A.getAndUpdateAAFor<AANoUnwind>(
- AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (AANoUnw.isAssumedNoUnwind()) {
- UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind();
- } else {
- AliveSuccessors.push_back(&II.getUnwindDest()->front());
- }
- }
- return UsedAssumedInformation;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation = false;
- if (BI.getNumSuccessors() == 1) {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
- } else {
- Optional<ConstantInt *> CI = getAssumedConstantInt(
- A, *BI.getCondition(), AA, UsedAssumedInformation);
- if (!CI.hasValue()) {
- // No value yet, assume both edges are dead.
- } else if (CI.getValue()) {
- const BasicBlock *SuccBB =
- BI.getSuccessor(1 - CI.getValue()->getZExtValue());
- AliveSuccessors.push_back(&SuccBB->front());
- } else {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
- AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
- UsedAssumedInformation = false;
- }
- }
- return UsedAssumedInformation;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const SwitchInst &SI,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation = false;
- Optional<ConstantInt *> CI =
- getAssumedConstantInt(A, *SI.getCondition(), AA, UsedAssumedInformation);
- if (!CI.hasValue()) {
- // No value yet, assume all edges are dead.
- } else if (CI.getValue()) {
- for (auto &CaseIt : SI.cases()) {
- if (CaseIt.getCaseValue() == CI.getValue()) {
- AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front());
- return UsedAssumedInformation;
- }
- }
- AliveSuccessors.push_back(&SI.getDefaultDest()->front());
- return UsedAssumedInformation;
- } else {
- for (const BasicBlock *SuccBB : successors(SI.getParent()))
- AliveSuccessors.push_back(&SuccBB->front());
- }
- return UsedAssumedInformation;
-}
-
-ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
- ChangeStatus Change = ChangeStatus::UNCHANGED;
-
- LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/"
- << getAnchorScope()->size() << "] BBs and "
- << ToBeExploredFrom.size() << " exploration points and "
- << KnownDeadEnds.size() << " known dead ends\n");
-
- // Copy and clear the list of instructions we need to explore from. It is
- // refilled with instructions the next update has to look at.
- SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(),
- ToBeExploredFrom.end());
- decltype(ToBeExploredFrom) NewToBeExploredFrom;
-
- SmallVector<const Instruction *, 8> AliveSuccessors;
- while (!Worklist.empty()) {
- const Instruction *I = Worklist.pop_back_val();
- LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n");
-
+ /// Collection of all assumed live BasicBlocks.
+ DenseSet<const BasicBlock *> AssumedLiveBlocks;
+};
+
+static bool
+identifyAliveSuccessors(Attributor &A, const CallBase &CB,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ const IRPosition &IPos = IRPosition::callsite_function(CB);
+
+ const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
+ AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (NoReturnAA.isAssumedNoReturn())
+ return !NoReturnAA.isKnownNoReturn();
+ if (CB.isTerminator())
+ AliveSuccessors.push_back(&CB.getSuccessor(0)->front());
+ else
+ AliveSuccessors.push_back(CB.getNextNode());
+ return false;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const InvokeInst &II,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation =
+ identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors);
+
+ // First, determine if we can change an invoke to a call assuming the
+ // callee is nounwind. This is not possible if the personality of the
+  // function allows catching asynchronous exceptions.
+ if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) {
+ AliveSuccessors.push_back(&II.getUnwindDest()->front());
+ } else {
+ const IRPosition &IPos = IRPosition::callsite_function(II);
+ const auto &AANoUnw = A.getAndUpdateAAFor<AANoUnwind>(
+ AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (AANoUnw.isAssumedNoUnwind()) {
+ UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind();
+ } else {
+ AliveSuccessors.push_back(&II.getUnwindDest()->front());
+ }
+ }
+ return UsedAssumedInformation;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation = false;
+ if (BI.getNumSuccessors() == 1) {
+ AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ } else {
+ Optional<ConstantInt *> CI = getAssumedConstantInt(
+ A, *BI.getCondition(), AA, UsedAssumedInformation);
+ if (!CI.hasValue()) {
+ // No value yet, assume both edges are dead.
+ } else if (CI.getValue()) {
+ const BasicBlock *SuccBB =
+ BI.getSuccessor(1 - CI.getValue()->getZExtValue());
+ AliveSuccessors.push_back(&SuccBB->front());
+ } else {
+ AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
+ UsedAssumedInformation = false;
+ }
+ }
+ return UsedAssumedInformation;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const SwitchInst &SI,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation = false;
+ Optional<ConstantInt *> CI =
+ getAssumedConstantInt(A, *SI.getCondition(), AA, UsedAssumedInformation);
+ if (!CI.hasValue()) {
+ // No value yet, assume all edges are dead.
+ } else if (CI.getValue()) {
+ for (auto &CaseIt : SI.cases()) {
+ if (CaseIt.getCaseValue() == CI.getValue()) {
+ AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front());
+ return UsedAssumedInformation;
+ }
+ }
+ AliveSuccessors.push_back(&SI.getDefaultDest()->front());
+ return UsedAssumedInformation;
+ } else {
+ for (const BasicBlock *SuccBB : successors(SI.getParent()))
+ AliveSuccessors.push_back(&SuccBB->front());
+ }
+ return UsedAssumedInformation;
+}
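Both the branch and the switch handling above follow the same three-way pattern: if the condition has not been simplified yet, no successor is marked alive (optimistically, pending more information); if it folds to a constant, only the taken successor is alive; otherwise every successor is alive. The short sketch below restates that decision for a two-way branch; CondState and aliveBranchSuccessors are assumed, illustrative names rather than LLVM types.

#include <cstdio>
#include <vector>

// Three-state condition, mirroring the Optional<ConstantInt *> used above:
//  - Pending:  no simplified value yet   -> assume all edges dead for now
//  - Constant: condition folds to a bool -> only the taken edge is alive
//  - Unknown:  condition is not constant -> every successor is alive
enum class CondState { Pending, Constant, Unknown };

static std::vector<int> aliveBranchSuccessors(CondState State, bool Taken,
                                              int TrueSucc, int FalseSucc) {
  switch (State) {
  case CondState::Pending:
    return {}; // optimistically dead until more is known
  case CondState::Constant:
    return {Taken ? TrueSucc : FalseSucc};
  case CondState::Unknown:
    return {TrueSucc, FalseSucc};
  }
  return {};
}

int main() {
  for (int S : aliveBranchSuccessors(CondState::Constant, true, 1, 2))
    std::printf("alive successor: %d\n", S); // only block 1
  for (int S : aliveBranchSuccessors(CondState::Unknown, false, 1, 2))
    std::printf("alive successor: %d\n", S); // blocks 1 and 2
}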
+
+ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/"
+ << getAnchorScope()->size() << "] BBs and "
+ << ToBeExploredFrom.size() << " exploration points and "
+ << KnownDeadEnds.size() << " known dead ends\n");
+
+ // Copy and clear the list of instructions we need to explore from. It is
+ // refilled with instructions the next update has to look at.
+ SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(),
+ ToBeExploredFrom.end());
+ decltype(ToBeExploredFrom) NewToBeExploredFrom;
+
+ SmallVector<const Instruction *, 8> AliveSuccessors;
+ while (!Worklist.empty()) {
+ const Instruction *I = Worklist.pop_back_val();
+ LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n");
+
// Fast forward for uninteresting instructions. We could look for UB here
// though.
while (!I->isTerminator() && !isa<CallBase>(I)) {
@@ -3333,525 +3333,525 @@ ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
I = I->getNextNode();
}
- AliveSuccessors.clear();
-
- bool UsedAssumedInformation = false;
- switch (I->getOpcode()) {
- // TODO: look for (assumed) UB to backwards propagate "deadness".
- default:
+ AliveSuccessors.clear();
+
+ bool UsedAssumedInformation = false;
+ switch (I->getOpcode()) {
+ // TODO: look for (assumed) UB to backwards propagate "deadness".
+ default:
assert(I->isTerminator() &&
"Expected non-terminators to be handled already!");
for (const BasicBlock *SuccBB : successors(I->getParent()))
AliveSuccessors.push_back(&SuccBB->front());
- break;
- case Instruction::Call:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Invoke:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Br:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Switch:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I),
- *this, AliveSuccessors);
- break;
- }
-
- if (UsedAssumedInformation) {
- NewToBeExploredFrom.insert(I);
- } else {
- Change = ChangeStatus::CHANGED;
- if (AliveSuccessors.empty() ||
- (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors()))
- KnownDeadEnds.insert(I);
- }
-
- LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: "
- << AliveSuccessors.size() << " UsedAssumedInformation: "
- << UsedAssumedInformation << "\n");
-
- for (const Instruction *AliveSuccessor : AliveSuccessors) {
- if (!I->isTerminator()) {
- assert(AliveSuccessors.size() == 1 &&
- "Non-terminator expected to have a single successor!");
- Worklist.push_back(AliveSuccessor);
- } else {
+ break;
+ case Instruction::Call:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Invoke:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Br:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Switch:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ }
+
+ if (UsedAssumedInformation) {
+ NewToBeExploredFrom.insert(I);
+ } else {
+ Change = ChangeStatus::CHANGED;
+ if (AliveSuccessors.empty() ||
+ (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors()))
+ KnownDeadEnds.insert(I);
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: "
+ << AliveSuccessors.size() << " UsedAssumedInformation: "
+ << UsedAssumedInformation << "\n");
+
+ for (const Instruction *AliveSuccessor : AliveSuccessors) {
+ if (!I->isTerminator()) {
+ assert(AliveSuccessors.size() == 1 &&
+ "Non-terminator expected to have a single successor!");
+ Worklist.push_back(AliveSuccessor);
+ } else {
// record the assumed live edge
AssumedLiveEdges.insert(
std::make_pair(I->getParent(), AliveSuccessor->getParent()));
- if (assumeLive(A, *AliveSuccessor->getParent()))
- Worklist.push_back(AliveSuccessor);
- }
- }
- }
-
- ToBeExploredFrom = std::move(NewToBeExploredFrom);
-
- // If we know everything is live there is no need to query for liveness.
- // Instead, indicating a pessimistic fixpoint will cause the state to be
- // "invalid" and all queries to be answered conservatively without lookups.
-  // To be in this state we must (1) have finished the exploration, (2) not
-  // have discovered any non-trivial dead end, and (3) not have ruled
-  // unreachable code dead.
- if (ToBeExploredFrom.empty() &&
- getAnchorScope()->size() == AssumedLiveBlocks.size() &&
- llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) {
- return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0;
- }))
- return indicatePessimisticFixpoint();
- return Change;
-}
-
-/// Liveness information for a call site.
-struct AAIsDeadCallSite final : AAIsDeadFunction {
- AAIsDeadCallSite(const IRPosition &IRP, Attributor &A)
- : AAIsDeadFunction(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites instead of
- // redirecting requests to the callee.
- llvm_unreachable("Abstract attributes for liveness are not "
- "supported for call sites yet!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// -------------------- Dereferenceable Argument Attribute --------------------
-
-template <>
-ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S,
- const DerefState &R) {
- ChangeStatus CS0 =
- clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState);
- ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState);
- return CS0 | CS1;
-}
-
-struct AADereferenceableImpl : AADereferenceable {
- AADereferenceableImpl(const IRPosition &IRP, Attributor &A)
- : AADereferenceable(IRP, A) {}
- using StateType = DerefState;
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SmallVector<Attribute, 4> Attrs;
- getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull},
- Attrs, /* IgnoreSubsumingPositions */ false, &A);
- for (const Attribute &Attr : Attrs)
- takeKnownDerefBytesMaximum(Attr.getValueAsInt());
-
- const IRPosition &IRP = this->getIRPosition();
- NonNullAA = &A.getAAFor<AANonNull>(*this, IRP,
- /* TrackDependence */ false);
-
- bool CanBeNull;
- takeKnownDerefBytesMaximum(
- IRP.getAssociatedValue().getPointerDereferenceableBytes(
- A.getDataLayout(), CanBeNull));
-
- bool IsFnInterface = IRP.isFnInterfaceKind();
- Function *FnScope = IRP.getAnchorScope();
- if (IsFnInterface && (!FnScope || !A.isFunctionIPOAmendable(*FnScope))) {
- indicatePessimisticFixpoint();
- return;
- }
-
- if (Instruction *CtxI = getCtxI())
- followUsesInMBEC(*this, A, getState(), *CtxI);
- }
-
- /// See AbstractAttribute::getState()
- /// {
- StateType &getState() override { return *this; }
- const StateType &getState() const override { return *this; }
- /// }
-
- /// Helper function for collecting accessed bytes in must-be-executed-context
- void addAccessedBytesForUse(Attributor &A, const Use *U, const Instruction *I,
- DerefState &State) {
- const Value *UseV = U->get();
- if (!UseV->getType()->isPointerTy())
- return;
-
- Type *PtrTy = UseV->getType();
- const DataLayout &DL = A.getDataLayout();
- int64_t Offset;
- if (const Value *Base = getBasePointerOfAccessPointerOperand(
- I, Offset, DL, /*AllowNonInbounds*/ true)) {
- if (Base == &getAssociatedValue() &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
- State.addAccessedBytes(Offset, Size);
- }
- }
- }
-
- /// See followUsesInMBEC
- bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
- AADereferenceable::StateType &State) {
- bool IsNonNull = false;
- bool TrackUse = false;
- int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse(
- A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse);
- LLVM_DEBUG(dbgs() << "[AADereferenceable] Deref bytes: " << DerefBytes
- << " for instruction " << *I << "\n");
-
- addAccessedBytesForUse(A, U, I, State);
- State.takeKnownDerefBytesMaximum(DerefBytes);
- return TrackUse;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Change = AADereferenceable::manifest(A);
- if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) {
- removeAttrs({Attribute::DereferenceableOrNull});
- return ChangeStatus::CHANGED;
- }
- return Change;
- }
-
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- // TODO: Add *_globally support
- if (isAssumedNonNull())
- Attrs.emplace_back(Attribute::getWithDereferenceableBytes(
- Ctx, getAssumedDereferenceableBytes()));
- else
- Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes(
- Ctx, getAssumedDereferenceableBytes()));
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- if (!getAssumedDereferenceableBytes())
- return "unknown-dereferenceable";
- return std::string("dereferenceable") +
- (isAssumedNonNull() ? "" : "_or_null") +
- (isAssumedGlobal() ? "_globally" : "") + "<" +
- std::to_string(getKnownDereferenceableBytes()) + "-" +
- std::to_string(getAssumedDereferenceableBytes()) + ">";
- }
-};
-
-/// Dereferenceable attribute for a floating value.
-struct AADereferenceableFloating : AADereferenceableImpl {
- AADereferenceableFloating(const IRPosition &IRP, Attributor &A)
- : AADereferenceableImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const DataLayout &DL = A.getDataLayout();
-
- auto VisitValueCB = [&](const Value &V, const Instruction *, DerefState &T,
- bool Stripped) -> bool {
- unsigned IdxWidth =
- DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
- APInt Offset(IdxWidth, 0);
- const Value *Base =
- stripAndAccumulateMinimalOffsets(A, *this, &V, DL, Offset, false);
-
- const auto &AA =
- A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base));
- int64_t DerefBytes = 0;
- if (!Stripped && this == &AA) {
- // Use IR information if we did not strip anything.
- // TODO: track globally.
- bool CanBeNull;
- DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull);
- T.GlobalState.indicatePessimisticFixpoint();
- } else {
+ if (assumeLive(A, *AliveSuccessor->getParent()))
+ Worklist.push_back(AliveSuccessor);
+ }
+ }
+ }
+
+ ToBeExploredFrom = std::move(NewToBeExploredFrom);
+
+ // If we know everything is live there is no need to query for liveness.
+ // Instead, indicating a pessimistic fixpoint will cause the state to be
+ // "invalid" and all queries to be answered conservatively without lookups.
+  // To be in this state we must (1) have finished the exploration, (2) not
+  // have discovered any non-trivial dead end, and (3) not have ruled
+  // unreachable code dead.
+ if (ToBeExploredFrom.empty() &&
+ getAnchorScope()->size() == AssumedLiveBlocks.size() &&
+ llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) {
+ return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0;
+ }))
+ return indicatePessimisticFixpoint();
+ return Change;
+}
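The update above is an optimistic worklist exploration: only blocks reached from an already-live block become live, and instructions whose successor set relied on assumed (not yet known) information are re-queued via NewToBeExploredFrom for the next round. The toy sketch below reproduces that shape on a hand-built CFG of plain structs; Block and exploreLiveBlocks are made-up names, not LLVM types, and the re-queueing of assumed results is omitted for brevity.

#include <cstdio>
#include <set>
#include <vector>

struct Block {
  int Id;
  std::vector<const Block *> Succs; // successor blocks
};

// Optimistic liveness: only blocks reachable from the entry along edges we
// already consider live are marked live; everything else stays assumed dead.
static std::set<const Block *> exploreLiveBlocks(const Block &Entry) {
  std::set<const Block *> AssumedLive;
  std::vector<const Block *> Worklist{&Entry};
  while (!Worklist.empty()) {
    const Block *B = Worklist.back();
    Worklist.pop_back();
    if (!AssumedLive.insert(B).second)
      continue; // already explored
    for (const Block *Succ : B->Succs)
      Worklist.push_back(Succ);
  }
  return AssumedLive;
}

int main() {
  Block Exit{3, {}};
  Block Then{1, {&Exit}};
  Block Else{2, {&Exit}};  // never reached if the branch condition folded
  Block Entry{0, {&Then}}; // only the taken edge is recorded as live
  std::printf("assumed-live blocks: %zu\n", exploreLiveBlocks(Entry).size());
  // prints 3: Entry, Then and Exit; Else stays assumed dead
}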
+
+/// Liveness information for a call site.
+struct AAIsDeadCallSite final : AAIsDeadFunction {
+ AAIsDeadCallSite(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFunction(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites instead of
+ // redirecting requests to the callee.
+ llvm_unreachable("Abstract attributes for liveness are not "
+ "supported for call sites yet!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// -------------------- Dereferenceable Argument Attribute --------------------
+
+template <>
+ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S,
+ const DerefState &R) {
+ ChangeStatus CS0 =
+ clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState);
+ ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState);
+ return CS0 | CS1;
+}
+
+struct AADereferenceableImpl : AADereferenceable {
+ AADereferenceableImpl(const IRPosition &IRP, Attributor &A)
+ : AADereferenceable(IRP, A) {}
+ using StateType = DerefState;
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SmallVector<Attribute, 4> Attrs;
+ getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull},
+ Attrs, /* IgnoreSubsumingPositions */ false, &A);
+ for (const Attribute &Attr : Attrs)
+ takeKnownDerefBytesMaximum(Attr.getValueAsInt());
+
+ const IRPosition &IRP = this->getIRPosition();
+ NonNullAA = &A.getAAFor<AANonNull>(*this, IRP,
+ /* TrackDependence */ false);
+
+ bool CanBeNull;
+ takeKnownDerefBytesMaximum(
+ IRP.getAssociatedValue().getPointerDereferenceableBytes(
+ A.getDataLayout(), CanBeNull));
+
+ bool IsFnInterface = IRP.isFnInterfaceKind();
+ Function *FnScope = IRP.getAnchorScope();
+ if (IsFnInterface && (!FnScope || !A.isFunctionIPOAmendable(*FnScope))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (Instruction *CtxI = getCtxI())
+ followUsesInMBEC(*this, A, getState(), *CtxI);
+ }
+
+ /// See AbstractAttribute::getState()
+ /// {
+ StateType &getState() override { return *this; }
+ const StateType &getState() const override { return *this; }
+ /// }
+
+ /// Helper function for collecting accessed bytes in must-be-executed-context
+ void addAccessedBytesForUse(Attributor &A, const Use *U, const Instruction *I,
+ DerefState &State) {
+ const Value *UseV = U->get();
+ if (!UseV->getType()->isPointerTy())
+ return;
+
+ Type *PtrTy = UseV->getType();
+ const DataLayout &DL = A.getDataLayout();
+ int64_t Offset;
+ if (const Value *Base = getBasePointerOfAccessPointerOperand(
+ I, Offset, DL, /*AllowNonInbounds*/ true)) {
+ if (Base == &getAssociatedValue() &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
+ State.addAccessedBytes(Offset, Size);
+ }
+ }
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AADereferenceable::StateType &State) {
+ bool IsNonNull = false;
+ bool TrackUse = false;
+ int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse(
+ A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse);
+ LLVM_DEBUG(dbgs() << "[AADereferenceable] Deref bytes: " << DerefBytes
+ << " for instruction " << *I << "\n");
+
+ addAccessedBytesForUse(A, U, I, State);
+ State.takeKnownDerefBytesMaximum(DerefBytes);
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Change = AADereferenceable::manifest(A);
+ if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) {
+ removeAttrs({Attribute::DereferenceableOrNull});
+ return ChangeStatus::CHANGED;
+ }
+ return Change;
+ }
+
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ // TODO: Add *_globally support
+ if (isAssumedNonNull())
+ Attrs.emplace_back(Attribute::getWithDereferenceableBytes(
+ Ctx, getAssumedDereferenceableBytes()));
+ else
+ Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes(
+ Ctx, getAssumedDereferenceableBytes()));
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ if (!getAssumedDereferenceableBytes())
+ return "unknown-dereferenceable";
+ return std::string("dereferenceable") +
+ (isAssumedNonNull() ? "" : "_or_null") +
+ (isAssumedGlobal() ? "_globally" : "") + "<" +
+ std::to_string(getKnownDereferenceableBytes()) + "-" +
+ std::to_string(getAssumedDereferenceableBytes()) + ">";
+ }
+};
+
+/// Dereferenceable attribute for a floating value.
+struct AADereferenceableFloating : AADereferenceableImpl {
+ AADereferenceableFloating(const IRPosition &IRP, Attributor &A)
+ : AADereferenceableImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ auto VisitValueCB = [&](const Value &V, const Instruction *, DerefState &T,
+ bool Stripped) -> bool {
+ unsigned IdxWidth =
+ DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
+ APInt Offset(IdxWidth, 0);
+ const Value *Base =
+ stripAndAccumulateMinimalOffsets(A, *this, &V, DL, Offset, false);
+
+ const auto &AA =
+ A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base));
+ int64_t DerefBytes = 0;
+ if (!Stripped && this == &AA) {
+ // Use IR information if we did not strip anything.
+ // TODO: track globally.
+ bool CanBeNull;
+ DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull);
+ T.GlobalState.indicatePessimisticFixpoint();
+ } else {
const DerefState &DS = AA.getState();
- DerefBytes = DS.DerefBytesState.getAssumed();
- T.GlobalState &= DS.GlobalState;
- }
-
- // For now we do not try to "increase" dereferenceability due to negative
- // indices as we first have to come up with code to deal with loops and
-      // with overflows of the dereferenceable bytes.
- int64_t OffsetSExt = Offset.getSExtValue();
- if (OffsetSExt < 0)
- OffsetSExt = 0;
-
- T.takeAssumedDerefBytesMinimum(
- std::max(int64_t(0), DerefBytes - OffsetSExt));
-
- if (this == &AA) {
- if (!Stripped) {
- // If nothing was stripped IR information is all we got.
- T.takeKnownDerefBytesMaximum(
- std::max(int64_t(0), DerefBytes - OffsetSExt));
- T.indicatePessimisticFixpoint();
- } else if (OffsetSExt > 0) {
- // If something was stripped but there is circular reasoning we look
- // for the offset. If it is positive we basically decrease the
-          // dereferenceable bytes in a circular loop now, which will simply
- // drive them down to the known value in a very slow way which we
- // can accelerate.
- T.indicatePessimisticFixpoint();
- }
- }
-
- return T.isValidState();
- };
-
- DerefState T;
- if (!genericValueTraversal<AADereferenceable, DerefState>(
- A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for a return value.
-struct AADereferenceableReturned final
- : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl> {
- AADereferenceableReturned(const IRPosition &IRP, Attributor &A)
- : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl>(
- IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for an argument
-struct AADereferenceableArgument final
- : AAArgumentFromCallSiteArguments<AADereferenceable,
- AADereferenceableImpl> {
- using Base =
- AAArgumentFromCallSiteArguments<AADereferenceable, AADereferenceableImpl>;
- AADereferenceableArgument(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for a call site argument.
-struct AADereferenceableCallSiteArgument final : AADereferenceableFloating {
- AADereferenceableCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AADereferenceableFloating(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute deduction for a call site return value.
-struct AADereferenceableCallSiteReturned final
- : AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl> {
- using Base =
- AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl>;
- AADereferenceableCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CS_ATTR(dereferenceable);
- }
-};
-
-// ------------------------ Align Argument Attribute ------------------------
-
-static unsigned getKnownAlignForUse(Attributor &A,
- AbstractAttribute &QueryingAA,
- Value &AssociatedValue, const Use *U,
- const Instruction *I, bool &TrackUse) {
- // We need to follow common pointer manipulation uses to the accesses they
- // feed into.
- if (isa<CastInst>(I)) {
- // Follow all but ptr2int casts.
- TrackUse = !isa<PtrToIntInst>(I);
- return 0;
- }
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- if (GEP->hasAllConstantIndices()) {
- TrackUse = true;
- return 0;
- }
- }
-
- MaybeAlign MA;
- if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (CB->isBundleOperand(U) || CB->isCallee(U))
- return 0;
-
- unsigned ArgNo = CB->getArgOperandNo(U);
- IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
- // As long as we only use known information there is no need to track
- // dependences here.
- auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP,
- /* TrackDependence */ false);
- MA = MaybeAlign(AlignAA.getKnownAlign());
- }
-
- const DataLayout &DL = A.getDataLayout();
- const Value *UseV = U->get();
- if (auto *SI = dyn_cast<StoreInst>(I)) {
- if (SI->getPointerOperand() == UseV)
- MA = SI->getAlign();
- } else if (auto *LI = dyn_cast<LoadInst>(I)) {
- if (LI->getPointerOperand() == UseV)
- MA = LI->getAlign();
- }
-
- if (!MA || *MA <= 1)
- return 0;
-
- unsigned Alignment = MA->value();
- int64_t Offset;
-
- if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) {
- if (Base == &AssociatedValue) {
- // BasePointerAddr + Offset = Alignment * Q for some integer Q.
- // So we can say that the maximum power of two which is a divisor of
- // gcd(Offset, Alignment) is an alignment.
-
- uint32_t gcd =
- greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment);
- Alignment = llvm::PowerOf2Floor(gcd);
- }
- }
-
- return Alignment;
-}
-
-struct AAAlignImpl : AAAlign {
- AAAlignImpl(const IRPosition &IRP, Attributor &A) : AAAlign(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SmallVector<Attribute, 4> Attrs;
- getAttrs({Attribute::Alignment}, Attrs);
- for (const Attribute &Attr : Attrs)
- takeKnownMaximum(Attr.getValueAsInt());
-
- Value &V = getAssociatedValue();
-    // TODO: This is a HACK to avoid getPointerAlignment introducing a ptr2int
- // use of the function pointer. This was caused by D73131. We want to
- // avoid this for function pointers especially because we iterate
- // their uses and int2ptr is not handled. It is not a correctness
- // problem though!
- if (!V.getType()->getPointerElementType()->isFunctionTy())
- takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value());
-
- if (getIRPosition().isFnInterfaceKind() &&
- (!getAnchorScope() ||
- !A.isFunctionIPOAmendable(*getAssociatedFunction()))) {
- indicatePessimisticFixpoint();
- return;
- }
-
- if (Instruction *CtxI = getCtxI())
- followUsesInMBEC(*this, A, getState(), *CtxI);
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus LoadStoreChanged = ChangeStatus::UNCHANGED;
-
- // Check for users that allow alignment annotations.
- Value &AssociatedValue = getAssociatedValue();
- for (const Use &U : AssociatedValue.uses()) {
- if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
- if (SI->getPointerOperand() == &AssociatedValue)
- if (SI->getAlignment() < getAssumedAlign()) {
- STATS_DECLTRACK(AAAlign, Store,
- "Number of times alignment added to a store");
- SI->setAlignment(Align(getAssumedAlign()));
- LoadStoreChanged = ChangeStatus::CHANGED;
- }
- } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
- if (LI->getPointerOperand() == &AssociatedValue)
- if (LI->getAlignment() < getAssumedAlign()) {
- LI->setAlignment(Align(getAssumedAlign()));
- STATS_DECLTRACK(AAAlign, Load,
- "Number of times alignment added to a load");
- LoadStoreChanged = ChangeStatus::CHANGED;
- }
- }
- }
-
- ChangeStatus Changed = AAAlign::manifest(A);
-
- Align InheritAlign =
- getAssociatedValue().getPointerAlignment(A.getDataLayout());
- if (InheritAlign >= getAssumedAlign())
- return LoadStoreChanged;
- return Changed | LoadStoreChanged;
- }
-
-  // TODO: Provide a helper to determine the implied ABI alignment and check
-  // that value in the existing manifest method and in a new one for
-  // AAAlignImpl to avoid making the alignment explicit if it did not improve.
-
- /// See AbstractAttribute::getDeducedAttributes
- virtual void
- getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- if (getAssumedAlign() > 1)
- Attrs.emplace_back(
- Attribute::getWithAlignment(Ctx, Align(getAssumedAlign())));
- }
-
- /// See followUsesInMBEC
- bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
- AAAlign::StateType &State) {
- bool TrackUse = false;
-
- unsigned int KnownAlign =
- getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse);
- State.takeKnownMaximum(KnownAlign);
-
- return TrackUse;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) +
- "-" + std::to_string(getAssumedAlign()) + ">")
- : "unknown-align";
- }
-};
-
-/// Align attribute for a floating value.
-struct AAAlignFloating : AAAlignImpl {
- AAAlignFloating(const IRPosition &IRP, Attributor &A) : AAAlignImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const DataLayout &DL = A.getDataLayout();
-
- auto VisitValueCB = [&](Value &V, const Instruction *,
- AAAlign::StateType &T, bool Stripped) -> bool {
- const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
+ DerefBytes = DS.DerefBytesState.getAssumed();
+ T.GlobalState &= DS.GlobalState;
+ }
+
+ // For now we do not try to "increase" dereferenceability due to negative
+ // indices as we first have to come up with code to deal with loops and
+      // with overflows of the dereferenceable bytes.
+ int64_t OffsetSExt = Offset.getSExtValue();
+ if (OffsetSExt < 0)
+ OffsetSExt = 0;
+
+ T.takeAssumedDerefBytesMinimum(
+ std::max(int64_t(0), DerefBytes - OffsetSExt));
+
+ if (this == &AA) {
+ if (!Stripped) {
+ // If nothing was stripped IR information is all we got.
+ T.takeKnownDerefBytesMaximum(
+ std::max(int64_t(0), DerefBytes - OffsetSExt));
+ T.indicatePessimisticFixpoint();
+ } else if (OffsetSExt > 0) {
+ // If something was stripped but there is circular reasoning we look
+ // for the offset. If it is positive we basically decrease the
+          // dereferenceable bytes in a circular loop now, which will simply
+ // drive them down to the known value in a very slow way which we
+ // can accelerate.
+ T.indicatePessimisticFixpoint();
+ }
+ }
+
+ return T.isValidState();
+ };
+
+ DerefState T;
+ if (!genericValueTraversal<AADereferenceable, DerefState>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(dereferenceable)
+ }
+};
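The arithmetic in the visitor above reduces the dereferenceable bytes known for a base pointer by the non-negative offset that was stripped, clamping at zero and deliberately not growing dereferenceability for negative offsets. A worked version of just that clamping follows, with hypothetical names: 64 dereferenceable bytes at the base yield 48 at offset 16, while a negative offset leaves the 64 unchanged.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Dereferenceable bytes at (Base + Offset), given the bytes known for Base.
// Negative offsets are clamped to zero, mirroring the conservative handling
// above: we never "increase" dereferenceability through negative indices.
static int64_t derefBytesAtOffset(int64_t BaseDerefBytes, int64_t Offset) {
  int64_t Off = std::max<int64_t>(0, Offset);
  return std::max<int64_t>(0, BaseDerefBytes - Off);
}

int main() {
  std::printf("%lld\n", (long long)derefBytesAtOffset(64, 16));  // 48
  std::printf("%lld\n", (long long)derefBytesAtOffset(64, -8));  // 64 (clamped)
  std::printf("%lld\n", (long long)derefBytesAtOffset(64, 100)); // 0
}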
+
+/// Dereferenceable attribute for a return value.
+struct AADereferenceableReturned final
+ : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl> {
+ AADereferenceableReturned(const IRPosition &IRP, Attributor &A)
+ : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl>(
+ IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute for an argument
+struct AADereferenceableArgument final
+ : AAArgumentFromCallSiteArguments<AADereferenceable,
+ AADereferenceableImpl> {
+ using Base =
+ AAArgumentFromCallSiteArguments<AADereferenceable, AADereferenceableImpl>;
+ AADereferenceableArgument(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute for a call site argument.
+struct AADereferenceableCallSiteArgument final : AADereferenceableFloating {
+ AADereferenceableCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AADereferenceableFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute deduction for a call site return value.
+struct AADereferenceableCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl> {
+ using Base =
+ AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl>;
+ AADereferenceableCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CS_ATTR(dereferenceable);
+ }
+};
+
+// ------------------------ Align Argument Attribute ------------------------
+
+static unsigned getKnownAlignForUse(Attributor &A,
+ AbstractAttribute &QueryingAA,
+ Value &AssociatedValue, const Use *U,
+ const Instruction *I, bool &TrackUse) {
+ // We need to follow common pointer manipulation uses to the accesses they
+ // feed into.
+ if (isa<CastInst>(I)) {
+ // Follow all but ptr2int casts.
+ TrackUse = !isa<PtrToIntInst>(I);
+ return 0;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (GEP->hasAllConstantIndices()) {
+ TrackUse = true;
+ return 0;
+ }
+ }
+
+ MaybeAlign MA;
+ if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->isBundleOperand(U) || CB->isCallee(U))
+ return 0;
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
+ // As long as we only use known information there is no need to track
+ // dependences here.
+ auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP,
+ /* TrackDependence */ false);
+ MA = MaybeAlign(AlignAA.getKnownAlign());
+ }
+
+ const DataLayout &DL = A.getDataLayout();
+ const Value *UseV = U->get();
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getPointerOperand() == UseV)
+ MA = SI->getAlign();
+ } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (LI->getPointerOperand() == UseV)
+ MA = LI->getAlign();
+ }
+
+ if (!MA || *MA <= 1)
+ return 0;
+
+ unsigned Alignment = MA->value();
+ int64_t Offset;
+
+ if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) {
+ if (Base == &AssociatedValue) {
+ // BasePointerAddr + Offset = Alignment * Q for some integer Q.
+ // So we can say that the maximum power of two which is a divisor of
+ // gcd(Offset, Alignment) is an alignment.
+
+ uint32_t gcd =
+ greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment);
+ Alignment = llvm::PowerOf2Floor(gcd);
+ }
+ }
+
+ return Alignment;
+}
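The gcd step above encodes a small number-theoretic fact: if Base + Offset is accessed with a power-of-two alignment A, then Base + Offset = A * Q for some integer Q, so A divides Base + Offset and gcd(|Offset|, A) divides Base; that gcd is itself a power of two. The standalone sketch below (baseAlignmentFromAccess is an invented name) shows the computation: an 8-aligned access at offset 12 proves 4-byte alignment of the base, while offset 16 preserves the full 8.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <numeric>

// If (Base + Offset) is Alignment-aligned and Alignment is a power of two,
// then Base is aligned to gcd(|Offset|, Alignment), itself a power of two.
static uint64_t baseAlignmentFromAccess(int64_t Offset, uint64_t Alignment) {
  if (Offset == 0)
    return Alignment;
  return std::gcd(static_cast<uint64_t>(std::llabs(Offset)), Alignment);
}

int main() {
  // 8-aligned access of (Base + 12) => Base is at least 4-byte aligned.
  std::printf("%llu\n", (unsigned long long)baseAlignmentFromAccess(12, 8));
  // 8-aligned access of (Base + 16) => the full 8-byte alignment survives.
  std::printf("%llu\n", (unsigned long long)baseAlignmentFromAccess(16, 8));
}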
+
+struct AAAlignImpl : AAAlign {
+ AAAlignImpl(const IRPosition &IRP, Attributor &A) : AAAlign(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SmallVector<Attribute, 4> Attrs;
+ getAttrs({Attribute::Alignment}, Attrs);
+ for (const Attribute &Attr : Attrs)
+ takeKnownMaximum(Attr.getValueAsInt());
+
+ Value &V = getAssociatedValue();
+    // TODO: This is a HACK to avoid getPointerAlignment introducing a ptr2int
+ // use of the function pointer. This was caused by D73131. We want to
+ // avoid this for function pointers especially because we iterate
+ // their uses and int2ptr is not handled. It is not a correctness
+ // problem though!
+ if (!V.getType()->getPointerElementType()->isFunctionTy())
+ takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value());
+
+ if (getIRPosition().isFnInterfaceKind() &&
+ (!getAnchorScope() ||
+ !A.isFunctionIPOAmendable(*getAssociatedFunction()))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (Instruction *CtxI = getCtxI())
+ followUsesInMBEC(*this, A, getState(), *CtxI);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus LoadStoreChanged = ChangeStatus::UNCHANGED;
+
+ // Check for users that allow alignment annotations.
+ Value &AssociatedValue = getAssociatedValue();
+ for (const Use &U : AssociatedValue.uses()) {
+ if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
+ if (SI->getPointerOperand() == &AssociatedValue)
+ if (SI->getAlignment() < getAssumedAlign()) {
+ STATS_DECLTRACK(AAAlign, Store,
+ "Number of times alignment added to a store");
+ SI->setAlignment(Align(getAssumedAlign()));
+ LoadStoreChanged = ChangeStatus::CHANGED;
+ }
+ } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
+ if (LI->getPointerOperand() == &AssociatedValue)
+ if (LI->getAlignment() < getAssumedAlign()) {
+ LI->setAlignment(Align(getAssumedAlign()));
+ STATS_DECLTRACK(AAAlign, Load,
+ "Number of times alignment added to a load");
+ LoadStoreChanged = ChangeStatus::CHANGED;
+ }
+ }
+ }
+
+ ChangeStatus Changed = AAAlign::manifest(A);
+
+ Align InheritAlign =
+ getAssociatedValue().getPointerAlignment(A.getDataLayout());
+ if (InheritAlign >= getAssumedAlign())
+ return LoadStoreChanged;
+ return Changed | LoadStoreChanged;
+ }
+
+  // TODO: Provide a helper to determine the implied ABI alignment and check
+  // that value in the existing manifest method and in a new one for
+  // AAAlignImpl to avoid making the alignment explicit if it did not improve.
+
+ /// See AbstractAttribute::getDeducedAttributes
+ virtual void
+ getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ if (getAssumedAlign() > 1)
+ Attrs.emplace_back(
+ Attribute::getWithAlignment(Ctx, Align(getAssumedAlign())));
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AAAlign::StateType &State) {
+ bool TrackUse = false;
+
+ unsigned int KnownAlign =
+ getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse);
+ State.takeKnownMaximum(KnownAlign);
+
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) +
+ "-" + std::to_string(getAssumedAlign()) + ">")
+ : "unknown-align";
+ }
+};
+
+/// Align attribute for a floating value.
+struct AAAlignFloating : AAAlignImpl {
+ AAAlignFloating(const IRPosition &IRP, Attributor &A) : AAAlignImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ auto VisitValueCB = [&](Value &V, const Instruction *,
+ AAAlign::StateType &T, bool Stripped) -> bool {
+ const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
int64_t Offset;
unsigned Alignment = 1;
if (const Value *Base =
@@ -3867,37 +3867,37 @@ struct AAAlignFloating : AAAlignImpl {
} else {
Alignment = V.getPointerAlignment(DL).value();
}
- // Use only IR information if we did not strip anything.
+ // Use only IR information if we did not strip anything.
T.takeKnownMaximum(Alignment);
- T.indicatePessimisticFixpoint();
- } else {
- // Use abstract attribute information.
+ T.indicatePessimisticFixpoint();
+ } else {
+ // Use abstract attribute information.
const AAAlign::StateType &DS = AA.getState();
- T ^= DS;
- }
- return T.isValidState();
- };
-
- StateType T;
- if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T,
- VisitValueCB, getCtxI()))
- return indicatePessimisticFixpoint();
-
-    // TODO: If we know we visited all incoming values, thus none are assumed
-    // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) }
-};
-
-/// Align attribute for function return value.
-struct AAAlignReturned final
- : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> {
+ T ^= DS;
+ }
+ return T.isValidState();
+ };
+
+ StateType T;
+ if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T,
+ VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+    // TODO: If we know we visited all incoming values, thus none are assumed
+    // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) }
+};
+
+/// Align attribute for function return value.
+struct AAAlignReturned final
+ : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> {
using Base = AAReturnedFromReturnedValues<AAAlign, AAAlignImpl>;
AAAlignReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
-
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
Base::initialize(A);
@@ -3906,126 +3906,126 @@ struct AAAlignReturned final
indicatePessimisticFixpoint();
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) }
-};
-
-/// Align attribute for function argument.
-struct AAAlignArgument final
- : AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl> {
- using Base = AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl>;
- AAAlignArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // If the associated argument is involved in a must-tail call we give up
- // because we would need to keep the argument alignments of caller and
- // callee in-sync. Just does not seem worth the trouble right now.
- if (A.getInfoCache().isInvolvedInMustTailCall(*getAssociatedArgument()))
- return ChangeStatus::UNCHANGED;
- return Base::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) }
-};
-
-struct AAAlignCallSiteArgument final : AAAlignFloating {
- AAAlignCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAAlignFloating(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // If the associated argument is involved in a must-tail call we give up
- // because we would need to keep the argument alignments of caller and
- // callee in-sync. Just does not seem worth the trouble right now.
- if (Argument *Arg = getAssociatedArgument())
- if (A.getInfoCache().isInvolvedInMustTailCall(*Arg))
- return ChangeStatus::UNCHANGED;
- ChangeStatus Changed = AAAlignImpl::manifest(A);
- Align InheritAlign =
- getAssociatedValue().getPointerAlignment(A.getDataLayout());
- if (InheritAlign >= getAssumedAlign())
- Changed = ChangeStatus::UNCHANGED;
- return Changed;
- }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = AAAlignFloating::updateImpl(A);
- if (Argument *Arg = getAssociatedArgument()) {
- // We only take known information from the argument
- // so we do not need to track a dependence.
- const auto &ArgAlignAA = A.getAAFor<AAAlign>(
- *this, IRPosition::argument(*Arg), /* TrackDependence */ false);
- takeKnownMaximum(ArgAlignAA.getKnownAlign());
- }
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) }
-};
-
-/// Align attribute deduction for a call site return value.
-struct AAAlignCallSiteReturned final
- : AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl> {
- using Base = AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl>;
- AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Base::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) }
+};
+
+/// Align attribute for function argument.
+struct AAAlignArgument final
+ : AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl> {
+ using Base = AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl>;
+ AAAlignArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // If the associated argument is involved in a must-tail call we give up
+ // because we would need to keep the argument alignments of caller and
+ // callee in-sync. Just does not seem worth the trouble right now.
+ if (A.getInfoCache().isInvolvedInMustTailCall(*getAssociatedArgument()))
+ return ChangeStatus::UNCHANGED;
+ return Base::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) }
+};
+
+struct AAAlignCallSiteArgument final : AAAlignFloating {
+ AAAlignCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAAlignFloating(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // If the associated argument is involved in a must-tail call we give up
+ // because we would need to keep the argument alignments of caller and
+ // callee in-sync. Just does not seem worth the trouble right now.
+ if (Argument *Arg = getAssociatedArgument())
+ if (A.getInfoCache().isInvolvedInMustTailCall(*Arg))
+ return ChangeStatus::UNCHANGED;
+ ChangeStatus Changed = AAAlignImpl::manifest(A);
+ Align InheritAlign =
+ getAssociatedValue().getPointerAlignment(A.getDataLayout());
+ if (InheritAlign >= getAssumedAlign())
+ Changed = ChangeStatus::UNCHANGED;
+ return Changed;
+ }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Changed = AAAlignFloating::updateImpl(A);
+ if (Argument *Arg = getAssociatedArgument()) {
+ // We only take known information from the argument
+ // so we do not need to track a dependence.
+ const auto &ArgAlignAA = A.getAAFor<AAAlign>(
+ *this, IRPosition::argument(*Arg), /* TrackDependence */ false);
+ takeKnownMaximum(ArgAlignAA.getKnownAlign());
+ }
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) }
+};
+
+/// Align attribute deduction for a call site return value.
+struct AAAlignCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl> {
+ using Base = AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl>;
+ AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Base::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
-};
-
-/// ------------------ Function No-Return Attribute ----------------------------
-struct AANoReturnImpl : public AANoReturn {
- AANoReturnImpl(const IRPosition &IRP, Attributor &A) : AANoReturn(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoReturn::initialize(A);
- Function *F = getAssociatedFunction();
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
+};
+
+/// ------------------ Function No-Return Attribute ----------------------------
+struct AANoReturnImpl : public AANoReturn {
+ AANoReturnImpl(const IRPosition &IRP, Attributor &A) : AANoReturn(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoReturn::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "noreturn" : "may-return";
- }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForNoReturn = [](Instruction &) { return false; };
- if (!A.checkForAllInstructions(CheckForNoReturn, *this,
- {(unsigned)Instruction::Ret}))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-};
-
-struct AANoReturnFunction final : AANoReturnImpl {
- AANoReturnFunction(const IRPosition &IRP, Attributor &A)
- : AANoReturnImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) }
-};
-
-/// NoReturn attribute deduction for call sites.
-struct AANoReturnCallSite final : AANoReturnImpl {
- AANoReturnCallSite(const IRPosition &IRP, Attributor &A)
- : AANoReturnImpl(IRP, A) {}
-
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "noreturn" : "may-return";
+ }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForNoReturn = [](Instruction &) { return false; };
+ if (!A.checkForAllInstructions(CheckForNoReturn, *this,
+ {(unsigned)Instruction::Ret}))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+};
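The noreturn deduction above is compact enough to be easy to misread: the callback handed to checkForAllInstructions rejects every return instruction, so the check only succeeds when no live return exists. A minimal stand-alone sketch of that rule, using illustrative names rather than the real Attributor API:

#include <vector>

// Illustrative stand-in (not the real checkForAllInstructions interface):
// a function may be assumed noreturn exactly when none of its live
// instructions is a return.
bool mayAssumeNoReturn(const std::vector<bool> &LiveInstIsReturn) {
  for (bool IsRet : LiveInstIsReturn)
    if (IsRet)
      return false; // a reachable return refutes noreturn
  return true;
}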
+
+struct AANoReturnFunction final : AANoReturnImpl {
+ AANoReturnFunction(const IRPosition &IRP, Attributor &A)
+ : AANoReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) }
+};
+
+/// NoReturn attribute deduction for call sites.
+struct AANoReturnCallSite final : AANoReturnImpl {
+ AANoReturnCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoReturnImpl(IRP, A) {}
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AANoReturnImpl::initialize(A);
@@ -4037,542 +4037,542 @@ struct AANoReturnCallSite final : AANoReturnImpl {
}
}
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos);
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); }
-};
-
-/// ----------------------- Variable Capturing ---------------------------------
-
-/// A class to hold the state for no-capture attributes.
-struct AANoCaptureImpl : public AANoCapture {
- AANoCaptureImpl(const IRPosition &IRP, Attributor &A) : AANoCapture(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) {
- indicateOptimisticFixpoint();
- return;
- }
- Function *AnchorScope = getAnchorScope();
- if (isFnInterfaceKind() &&
- (!AnchorScope || !A.isFunctionIPOAmendable(*AnchorScope))) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // You cannot "capture" null in the default address space.
- if (isa<ConstantPointerNull>(getAssociatedValue()) &&
- getAssociatedValue().getType()->getPointerAddressSpace() == 0) {
- indicateOptimisticFixpoint();
- return;
- }
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); }
+};
+
+/// ----------------------- Variable Capturing ---------------------------------
+
+/// A class to hold the state for no-capture attributes.
+struct AANoCaptureImpl : public AANoCapture {
+ AANoCaptureImpl(const IRPosition &IRP, Attributor &A) : AANoCapture(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+ Function *AnchorScope = getAnchorScope();
+ if (isFnInterfaceKind() &&
+ (!AnchorScope || !A.isFunctionIPOAmendable(*AnchorScope))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // You cannot "capture" null in the default address space.
+ if (isa<ConstantPointerNull>(getAssociatedValue()) &&
+ getAssociatedValue().getType()->getPointerAddressSpace() == 0) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+
const Function *F =
isArgumentPosition() ? getAssociatedFunction() : AnchorScope;
-
- // Check what state the associated function can actually capture.
- if (F)
- determineFunctionCaptureCapabilities(getIRPosition(), *F, *this);
- else
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...).
- virtual void
- getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- if (!isAssumedNoCaptureMaybeReturned())
- return;
-
+
+ // Check what state the associated function can actually capture.
+ if (F)
+ determineFunctionCaptureCapabilities(getIRPosition(), *F, *this);
+ else
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...).
+ virtual void
+ getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ if (!isAssumedNoCaptureMaybeReturned())
+ return;
+
if (isArgumentPosition()) {
- if (isAssumedNoCapture())
- Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture));
- else if (ManifestInternal)
- Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned"));
- }
- }
-
- /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p State
- /// depending on the ability of the function associated with \p IRP to capture
- /// state in memory and through "returning/throwing", respectively.
- static void determineFunctionCaptureCapabilities(const IRPosition &IRP,
- const Function &F,
- BitIntegerState &State) {
- // TODO: Once we have memory behavior attributes we should use them here.
-
- // If we know we cannot communicate or write to memory, we do not care about
- // ptr2int anymore.
- if (F.onlyReadsMemory() && F.doesNotThrow() &&
- F.getReturnType()->isVoidTy()) {
- State.addKnownBits(NO_CAPTURE);
- return;
- }
-
- // A function cannot capture state in memory if it only reads memory, it can
- // however return/throw state and the state might be influenced by the
- // pointer value, e.g., loading from a returned pointer might reveal a bit.
- if (F.onlyReadsMemory())
- State.addKnownBits(NOT_CAPTURED_IN_MEM);
-
- // A function cannot communicate state back if it does not throw
- // exceptions and does not return values.
- if (F.doesNotThrow() && F.getReturnType()->isVoidTy())
- State.addKnownBits(NOT_CAPTURED_IN_RET);
-
- // Check existing "returned" attributes.
+ if (isAssumedNoCapture())
+ Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture));
+ else if (ManifestInternal)
+ Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned"));
+ }
+ }
+
+ /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p State
+ /// depending on the ability of the function associated with \p IRP to capture
+ /// state in memory and through "returning/throwing", respectively.
+ static void determineFunctionCaptureCapabilities(const IRPosition &IRP,
+ const Function &F,
+ BitIntegerState &State) {
+ // TODO: Once we have memory behavior attributes we should use them here.
+
+ // If we know we cannot communicate or write to memory, we do not care about
+ // ptr2int anymore.
+ if (F.onlyReadsMemory() && F.doesNotThrow() &&
+ F.getReturnType()->isVoidTy()) {
+ State.addKnownBits(NO_CAPTURE);
+ return;
+ }
+
+ // A function cannot capture state in memory if it only reads memory, it can
+ // however return/throw state and the state might be influenced by the
+ // pointer value, e.g., loading from a returned pointer might reveal a bit.
+ if (F.onlyReadsMemory())
+ State.addKnownBits(NOT_CAPTURED_IN_MEM);
+
+ // A function cannot communicate state back if it does not throw
+ // exceptions and does not return values.
+ if (F.doesNotThrow() && F.getReturnType()->isVoidTy())
+ State.addKnownBits(NOT_CAPTURED_IN_RET);
+
+ // Check existing "returned" attributes.
int ArgNo = IRP.getCalleeArgNo();
- if (F.doesNotThrow() && ArgNo >= 0) {
- for (unsigned u = 0, e = F.arg_size(); u < e; ++u)
- if (F.hasParamAttribute(u, Attribute::Returned)) {
- if (u == unsigned(ArgNo))
- State.removeAssumedBits(NOT_CAPTURED_IN_RET);
- else if (F.onlyReadsMemory())
- State.addKnownBits(NO_CAPTURE);
- else
- State.addKnownBits(NOT_CAPTURED_IN_RET);
- break;
- }
- }
- }
-
- /// See AbstractState::getAsStr().
- const std::string getAsStr() const override {
- if (isKnownNoCapture())
- return "known not-captured";
- if (isAssumedNoCapture())
- return "assumed not-captured";
- if (isKnownNoCaptureMaybeReturned())
- return "known not-captured-maybe-returned";
- if (isAssumedNoCaptureMaybeReturned())
- return "assumed not-captured-maybe-returned";
- return "assumed-captured";
- }
-};
-
-/// Attributor-aware capture tracker.
-struct AACaptureUseTracker final : public CaptureTracker {
-
- /// Create a capture tracker that can look up in-flight abstract attributes
- /// through the Attributor \p A.
- ///
- /// If a use leads to a potential capture, \p CapturedInMemory is set and the
- /// search is stopped. If a use leads to a return instruction,
- /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed.
- /// If a use leads to a ptr2int which may capture the value,
- /// \p CapturedInInteger is set. If a use is found that is currently assumed
- /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies
- /// set. All values in \p PotentialCopies are later tracked as well. For every
- /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0,
- /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger
- /// conservatively set to true.
- AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA,
- const AAIsDead &IsDeadAA, AANoCapture::StateType &State,
- SmallVectorImpl<const Value *> &PotentialCopies,
- unsigned &RemainingUsesToExplore)
- : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State),
- PotentialCopies(PotentialCopies),
- RemainingUsesToExplore(RemainingUsesToExplore) {}
-
- /// Determine if \p V may be captured. *Also updates the state!*
- bool valueMayBeCaptured(const Value *V) {
- if (V->getType()->isPointerTy()) {
- PointerMayBeCaptured(V, this);
- } else {
- State.indicatePessimisticFixpoint();
- }
- return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
- }
-
- /// See CaptureTracker::tooManyUses().
- void tooManyUses() override {
- State.removeAssumedBits(AANoCapture::NO_CAPTURE);
- }
-
- bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override {
- if (CaptureTracker::isDereferenceableOrNull(O, DL))
- return true;
- const auto &DerefAA = A.getAAFor<AADereferenceable>(
- NoCaptureAA, IRPosition::value(*O), /* TrackDependence */ true,
- DepClassTy::OPTIONAL);
- return DerefAA.getAssumedDereferenceableBytes();
- }
-
- /// See CaptureTracker::captured(...).
- bool captured(const Use *U) override {
- Instruction *UInst = cast<Instruction>(U->getUser());
- LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst
- << "\n");
-
- // Because we may reuse the tracker multiple times we keep track of the
- // number of explored uses ourselves as well.
- if (RemainingUsesToExplore-- == 0) {
- LLVM_DEBUG(dbgs() << " - too many uses to explore!\n");
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
- }
-
- // Deal with ptr2int by following uses.
- if (isa<PtrToIntInst>(UInst)) {
- LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n");
- return valueMayBeCaptured(UInst);
- }
-
- // Explicitly catch return instructions.
- if (isa<ReturnInst>(UInst))
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ true);
-
- // For now we only use special logic for call sites. However, the tracker
- // itself knows about a lot of other non-capturing cases already.
- auto *CB = dyn_cast<CallBase>(UInst);
- if (!CB || !CB->isArgOperand(U))
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
-
- unsigned ArgNo = CB->getArgOperandNo(U);
- const IRPosition &CSArgPos = IRPosition::callsite_argument(*CB, ArgNo);
- // If we have an abstract no-capture attribute for the argument we can use
- // it to justify a non-capture attribute here. This allows recursion!
- auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos);
- if (ArgNoCaptureAA.isAssumedNoCapture())
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ false);
- if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- addPotentialCopy(*CB);
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ false);
- }
-
- // Lastly, we could not find a reason no-capture can be assumed so we don't.
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
- }
-
- /// Register \p CB as a potential copy of the value we are checking.
- void addPotentialCopy(CallBase &CB) { PotentialCopies.push_back(&CB); }
-
- /// See CaptureTracker::shouldExplore(...).
- bool shouldExplore(const Use *U) override {
- // Check liveness and ignore droppable users.
- return !U->getUser()->isDroppable() &&
- !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA);
- }
-
- /// Update the state according to \p CapturedInMem, \p CapturedInInt, and
- /// \p CapturedInRet, then return the appropriate value for use in the
- /// CaptureTracker::captured() interface.
- bool isCapturedIn(bool CapturedInMem, bool CapturedInInt,
- bool CapturedInRet) {
- LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int "
- << CapturedInInt << "|Ret " << CapturedInRet << "]\n");
- if (CapturedInMem)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM);
- if (CapturedInInt)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT);
- if (CapturedInRet)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET);
- return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
- }
-
-private:
- /// The attributor providing in-flight abstract attributes.
- Attributor &A;
-
- /// The abstract attribute currently updated.
- AANoCapture &NoCaptureAA;
-
- /// The abstract liveness state.
- const AAIsDead &IsDeadAA;
-
- /// The state currently updated.
- AANoCapture::StateType &State;
-
- /// Set of potential copies of the tracked value.
- SmallVectorImpl<const Value *> &PotentialCopies;
-
- /// Global counter to limit the number of explored uses.
- unsigned &RemainingUsesToExplore;
-};
-
-ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
- const IRPosition &IRP = getIRPosition();
+ if (F.doesNotThrow() && ArgNo >= 0) {
+ for (unsigned u = 0, e = F.arg_size(); u < e; ++u)
+ if (F.hasParamAttribute(u, Attribute::Returned)) {
+ if (u == unsigned(ArgNo))
+ State.removeAssumedBits(NOT_CAPTURED_IN_RET);
+ else if (F.onlyReadsMemory())
+ State.addKnownBits(NO_CAPTURE);
+ else
+ State.addKnownBits(NOT_CAPTURED_IN_RET);
+ break;
+ }
+ }
+ }
+
+ /// See AbstractState::getAsStr().
+ const std::string getAsStr() const override {
+ if (isKnownNoCapture())
+ return "known not-captured";
+ if (isAssumedNoCapture())
+ return "assumed not-captured";
+ if (isKnownNoCaptureMaybeReturned())
+ return "known not-captured-maybe-returned";
+ if (isAssumedNoCaptureMaybeReturned())
+ return "assumed not-captured-maybe-returned";
+ return "assumed-captured";
+ }
+};
+
+/// Attributor-aware capture tracker.
+struct AACaptureUseTracker final : public CaptureTracker {
+
+ /// Create a capture tracker that can look up in-flight abstract attributes
+ /// through the Attributor \p A.
+ ///
+ /// If a use leads to a potential capture, \p CapturedInMemory is set and the
+ /// search is stopped. If a use leads to a return instruction,
+ /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed.
+ /// If a use leads to a ptr2int which may capture the value,
+ /// \p CapturedInInteger is set. If a use is found that is currently assumed
+ /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies
+ /// set. All values in \p PotentialCopies are later tracked as well. For every
+ /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0,
+ /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger
+ /// conservatively set to true.
+ AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA,
+ const AAIsDead &IsDeadAA, AANoCapture::StateType &State,
+ SmallVectorImpl<const Value *> &PotentialCopies,
+ unsigned &RemainingUsesToExplore)
+ : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State),
+ PotentialCopies(PotentialCopies),
+ RemainingUsesToExplore(RemainingUsesToExplore) {}
+
+ /// Determine if \p V may be captured. *Also updates the state!*
+ bool valueMayBeCaptured(const Value *V) {
+ if (V->getType()->isPointerTy()) {
+ PointerMayBeCaptured(V, this);
+ } else {
+ State.indicatePessimisticFixpoint();
+ }
+ return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
+ }
+
+ /// See CaptureTracker::tooManyUses().
+ void tooManyUses() override {
+ State.removeAssumedBits(AANoCapture::NO_CAPTURE);
+ }
+
+ bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override {
+ if (CaptureTracker::isDereferenceableOrNull(O, DL))
+ return true;
+ const auto &DerefAA = A.getAAFor<AADereferenceable>(
+ NoCaptureAA, IRPosition::value(*O), /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ return DerefAA.getAssumedDereferenceableBytes();
+ }
+
+ /// See CaptureTracker::captured(...).
+ bool captured(const Use *U) override {
+ Instruction *UInst = cast<Instruction>(U->getUser());
+ LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst
+ << "\n");
+
+ // Because we may reuse the tracker multiple times we keep track of the
+ // number of explored uses ourselves as well.
+ if (RemainingUsesToExplore-- == 0) {
+ LLVM_DEBUG(dbgs() << " - too many uses to explore!\n");
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+ }
+
+ // Deal with ptr2int by following uses.
+ if (isa<PtrToIntInst>(UInst)) {
+ LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n");
+ return valueMayBeCaptured(UInst);
+ }
+
+ // Explicitly catch return instructions.
+ if (isa<ReturnInst>(UInst))
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ true);
+
+ // For now we only use special logic for call sites. However, the tracker
+ // itself knows about a lot of other non-capturing cases already.
+ auto *CB = dyn_cast<CallBase>(UInst);
+ if (!CB || !CB->isArgOperand(U))
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ const IRPosition &CSArgPos = IRPosition::callsite_argument(*CB, ArgNo);
+ // If we have an abstract no-capture attribute for the argument we can use
+ // it to justify a non-capture attribute here. This allows recursion!
+ auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos);
+ if (ArgNoCaptureAA.isAssumedNoCapture())
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ false);
+ if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ addPotentialCopy(*CB);
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ false);
+ }
+
+ // Lastly, we could not find a reason no-capture can be assumed so we don't.
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+ }
+
+ /// Register \p CB as a potential copy of the value we are checking.
+ void addPotentialCopy(CallBase &CB) { PotentialCopies.push_back(&CB); }
+
+ /// See CaptureTracker::shouldExplore(...).
+ bool shouldExplore(const Use *U) override {
+ // Check liveness and ignore droppable users.
+ return !U->getUser()->isDroppable() &&
+ !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA);
+ }
+
+ /// Update the state according to \p CapturedInMem, \p CapturedInInt, and
+ /// \p CapturedInRet, then return the appropriate value for use in the
+ /// CaptureTracker::captured() interface.
+ bool isCapturedIn(bool CapturedInMem, bool CapturedInInt,
+ bool CapturedInRet) {
+ LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int "
+ << CapturedInInt << "|Ret " << CapturedInRet << "]\n");
+ if (CapturedInMem)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM);
+ if (CapturedInInt)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT);
+ if (CapturedInRet)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET);
+ return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
+ }
+
+private:
+ /// The attributor providing in-flight abstract attributes.
+ Attributor &A;
+
+ /// The abstract attribute currently updated.
+ AANoCapture &NoCaptureAA;
+
+ /// The abstract liveness state.
+ const AAIsDead &IsDeadAA;
+
+ /// The state currently updated.
+ AANoCapture::StateType &State;
+
+ /// Set of potential copies of the tracked value.
+ SmallVectorImpl<const Value *> &PotentialCopies;
+
+ /// Global counter to limit the number of explored uses.
+ unsigned &RemainingUsesToExplore;
+};
+
+ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
+ const IRPosition &IRP = getIRPosition();
const Value *V = isArgumentPosition() ? IRP.getAssociatedArgument()
: &IRP.getAssociatedValue();
- if (!V)
- return indicatePessimisticFixpoint();
-
- const Function *F =
+ if (!V)
+ return indicatePessimisticFixpoint();
+
+ const Function *F =
isArgumentPosition() ? IRP.getAssociatedFunction() : IRP.getAnchorScope();
- assert(F && "Expected a function!");
- const IRPosition &FnPos = IRPosition::function(*F);
- const auto &IsDeadAA =
- A.getAAFor<AAIsDead>(*this, FnPos, /* TrackDependence */ false);
-
- AANoCapture::StateType T;
-
- // Readonly means we cannot capture through memory.
- const auto &FnMemAA =
- A.getAAFor<AAMemoryBehavior>(*this, FnPos, /* TrackDependence */ false);
- if (FnMemAA.isAssumedReadOnly()) {
- T.addKnownBits(NOT_CAPTURED_IN_MEM);
- if (FnMemAA.isKnownReadOnly())
- addKnownBits(NOT_CAPTURED_IN_MEM);
- else
- A.recordDependence(FnMemAA, *this, DepClassTy::OPTIONAL);
- }
-
- // Make sure all returned values are different than the underlying value.
- // TODO: we could do this in a more sophisticated way inside
- // AAReturnedValues, e.g., track all values that escape through returns
- // directly somehow.
- auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) {
- bool SeenConstant = false;
- for (auto &It : RVAA.returned_values()) {
- if (isa<Constant>(It.first)) {
- if (SeenConstant)
- return false;
- SeenConstant = true;
- } else if (!isa<Argument>(It.first) ||
- It.first == getAssociatedArgument())
- return false;
- }
- return true;
- };
-
- const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(
- *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (NoUnwindAA.isAssumedNoUnwind()) {
- bool IsVoidTy = F->getReturnType()->isVoidTy();
- const AAReturnedValues *RVAA =
- IsVoidTy ? nullptr
- : &A.getAAFor<AAReturnedValues>(*this, FnPos,
- /* TrackDependence */ true,
- DepClassTy::OPTIONAL);
- if (IsVoidTy || CheckReturnedArgs(*RVAA)) {
- T.addKnownBits(NOT_CAPTURED_IN_RET);
- if (T.isKnown(NOT_CAPTURED_IN_MEM))
- return ChangeStatus::UNCHANGED;
- if (NoUnwindAA.isKnownNoUnwind() &&
- (IsVoidTy || RVAA->getState().isAtFixpoint())) {
- addKnownBits(NOT_CAPTURED_IN_RET);
- if (isKnown(NOT_CAPTURED_IN_MEM))
- return indicateOptimisticFixpoint();
- }
- }
- }
-
- // Use the CaptureTracker interface and logic with the specialized tracker,
- // defined in AACaptureUseTracker, that can look at in-flight abstract
- // attributes and directly updates the assumed state.
- SmallVector<const Value *, 4> PotentialCopies;
- unsigned RemainingUsesToExplore =
- getDefaultMaxUsesToExploreForCaptureTracking();
- AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies,
- RemainingUsesToExplore);
-
- // Check all potential copies of the associated value until we can assume
- // none will be captured or we have to assume at least one might be.
- unsigned Idx = 0;
- PotentialCopies.push_back(V);
- while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size())
- Tracker.valueMayBeCaptured(PotentialCopies[Idx++]);
-
- AANoCapture::StateType &S = getState();
- auto Assumed = S.getAssumed();
- S.intersectAssumedBits(T.getAssumed());
- if (!isAssumedNoCaptureMaybeReturned())
- return indicatePessimisticFixpoint();
- return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
-}
-
-/// NoCapture attribute for function arguments.
-struct AANoCaptureArgument final : AANoCaptureImpl {
- AANoCaptureArgument(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) }
-};
-
-/// NoCapture attribute for call site arguments.
-struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
- AANoCaptureCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (Argument *Arg = getAssociatedArgument())
- if (Arg->hasByValAttr())
- indicateOptimisticFixpoint();
- AANoCaptureImpl::initialize(A);
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos);
+ assert(F && "Expected a function!");
+ const IRPosition &FnPos = IRPosition::function(*F);
+ const auto &IsDeadAA =
+ A.getAAFor<AAIsDead>(*this, FnPos, /* TrackDependence */ false);
+
+ AANoCapture::StateType T;
+
+ // Readonly means we cannot capture through memory.
+ const auto &FnMemAA =
+ A.getAAFor<AAMemoryBehavior>(*this, FnPos, /* TrackDependence */ false);
+ if (FnMemAA.isAssumedReadOnly()) {
+ T.addKnownBits(NOT_CAPTURED_IN_MEM);
+ if (FnMemAA.isKnownReadOnly())
+ addKnownBits(NOT_CAPTURED_IN_MEM);
+ else
+ A.recordDependence(FnMemAA, *this, DepClassTy::OPTIONAL);
+ }
+
+ // Make sure all returned values are different than the underlying value.
+ // TODO: we could do this in a more sophisticated way inside
+ // AAReturnedValues, e.g., track all values that escape through returns
+ // directly somehow.
+ auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) {
+ bool SeenConstant = false;
+ for (auto &It : RVAA.returned_values()) {
+ if (isa<Constant>(It.first)) {
+ if (SeenConstant)
+ return false;
+ SeenConstant = true;
+ } else if (!isa<Argument>(It.first) ||
+ It.first == getAssociatedArgument())
+ return false;
+ }
+ return true;
+ };
+
+ const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(
+ *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (NoUnwindAA.isAssumedNoUnwind()) {
+ bool IsVoidTy = F->getReturnType()->isVoidTy();
+ const AAReturnedValues *RVAA =
+ IsVoidTy ? nullptr
+ : &A.getAAFor<AAReturnedValues>(*this, FnPos,
+ /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ if (IsVoidTy || CheckReturnedArgs(*RVAA)) {
+ T.addKnownBits(NOT_CAPTURED_IN_RET);
+ if (T.isKnown(NOT_CAPTURED_IN_MEM))
+ return ChangeStatus::UNCHANGED;
+ if (NoUnwindAA.isKnownNoUnwind() &&
+ (IsVoidTy || RVAA->getState().isAtFixpoint())) {
+ addKnownBits(NOT_CAPTURED_IN_RET);
+ if (isKnown(NOT_CAPTURED_IN_MEM))
+ return indicateOptimisticFixpoint();
+ }
+ }
+ }
+
+ // Use the CaptureTracker interface and logic with the specialized tracker,
+ // defined in AACaptureUseTracker, that can look at in-flight abstract
+ // attributes and directly updates the assumed state.
+ SmallVector<const Value *, 4> PotentialCopies;
+ unsigned RemainingUsesToExplore =
+ getDefaultMaxUsesToExploreForCaptureTracking();
+ AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies,
+ RemainingUsesToExplore);
+
+ // Check all potential copies of the associated value until we can assume
+ // none will be captured or we have to assume at least one might be.
+ unsigned Idx = 0;
+ PotentialCopies.push_back(V);
+ while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size())
+ Tracker.valueMayBeCaptured(PotentialCopies[Idx++]);
+
+ AANoCapture::StateType &S = getState();
+ auto Assumed = S.getAssumed();
+ S.intersectAssumedBits(T.getAssumed());
+ if (!isAssumedNoCaptureMaybeReturned())
+ return indicatePessimisticFixpoint();
+ return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+}
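updateImpl above drives the tracker with a plain worklist: the associated value seeds PotentialCopies, and every value that escapes only into a no-capture-maybe-returned call-site argument is appended and inspected in turn, until the list is exhausted or the assumed state drops below no-capture-maybe-returned. A sketch of that loop shape, with illustrative names in place of the Attributor types:

#include <cstddef>
#include <vector>

// Illustrative worklist (not the real AACaptureUseTracker types): inspect the
// root value and every registered potential copy until a capture is found.
template <typename ValueT, typename InspectFn>
bool anyCopyCaptured(ValueT Root, InspectFn MayBeCaptured) {
  std::vector<ValueT> Worklist{Root};
  for (std::size_t Idx = 0; Idx < Worklist.size(); ++Idx)
    if (MayBeCaptured(Worklist[Idx], Worklist)) // may append further copies
      return true;                              // stop at the first capture
  return false;                                 // no copy was captured
}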
+
+/// NoCapture attribute for function arguments.
+struct AANoCaptureArgument final : AANoCaptureImpl {
+ AANoCaptureArgument(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) }
+};
+
+/// NoCapture attribute for call site arguments.
+struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
+ AANoCaptureCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (Argument *Arg = getAssociatedArgument())
+ if (Arg->hasByValAttr())
+ indicateOptimisticFixpoint();
+ AANoCaptureImpl::initialize(A);
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)};
-};
-
-/// NoCapture attribute for floating values.
-struct AANoCaptureFloating final : AANoCaptureImpl {
- AANoCaptureFloating(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(nocapture)
- }
-};
-
-/// NoCapture attribute for function return value.
-struct AANoCaptureReturned final : AANoCaptureImpl {
- AANoCaptureReturned(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// NoCapture attribute deduction for a call site return value.
-struct AANoCaptureCallSiteReturned final : AANoCaptureImpl {
- AANoCaptureCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(nocapture)
- }
-};
-
-/// ------------------ Value Simplify Attribute ----------------------------
-struct AAValueSimplifyImpl : AAValueSimplify {
- AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A)
- : AAValueSimplify(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (getAssociatedValue().getType()->isVoidTy())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple")
- : "not-simple";
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-
- /// See AAValueSimplify::getAssumedSimplifiedValue()
- Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override {
- if (!getAssumed())
- return const_cast<Value *>(&getAssociatedValue());
- return SimplifiedAssociatedValue;
- }
-
- /// Helper function for querying AAValueSimplify and updating the candidate.
- /// \param QueryingValue Value trying to unify with SimplifiedValue
- /// \param AccumulatedSimplifiedValue Current simplification result.
- static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA,
- Value &QueryingValue,
- Optional<Value *> &AccumulatedSimplifiedValue) {
- // FIXME: Add typecast support.
-
- auto &ValueSimplifyAA = A.getAAFor<AAValueSimplify>(
- QueryingAA, IRPosition::value(QueryingValue));
-
- Optional<Value *> QueryingValueSimplified =
- ValueSimplifyAA.getAssumedSimplifiedValue(A);
-
- if (!QueryingValueSimplified.hasValue())
- return true;
-
- if (!QueryingValueSimplified.getValue())
- return false;
-
- Value &QueryingValueSimplifiedUnwrapped =
- *QueryingValueSimplified.getValue();
-
- if (AccumulatedSimplifiedValue.hasValue() &&
- !isa<UndefValue>(AccumulatedSimplifiedValue.getValue()) &&
- !isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
- return AccumulatedSimplifiedValue == QueryingValueSimplified;
- if (AccumulatedSimplifiedValue.hasValue() &&
- isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
- return true;
-
- LLVM_DEBUG(dbgs() << "[ValueSimplify] " << QueryingValue
- << " is assumed to be "
- << QueryingValueSimplifiedUnwrapped << "\n");
-
- AccumulatedSimplifiedValue = QueryingValueSimplified;
- return true;
- }
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)};
+};
+
+/// NoCapture attribute for floating values.
+struct AANoCaptureFloating final : AANoCaptureImpl {
+ AANoCaptureFloating(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(nocapture)
+ }
+};
+
+/// NoCapture attribute for function return value.
+struct AANoCaptureReturned final : AANoCaptureImpl {
+ AANoCaptureReturned(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// NoCapture attribute deduction for a call site return value.
+struct AANoCaptureCallSiteReturned final : AANoCaptureImpl {
+ AANoCaptureCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(nocapture)
+ }
+};
+
+/// ------------------ Value Simplify Attribute ----------------------------
+struct AAValueSimplifyImpl : AAValueSimplify {
+ AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplify(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (getAssociatedValue().getType()->isVoidTy())
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple")
+ : "not-simple";
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// See AAValueSimplify::getAssumedSimplifiedValue()
+ Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override {
+ if (!getAssumed())
+ return const_cast<Value *>(&getAssociatedValue());
+ return SimplifiedAssociatedValue;
+ }
+
+ /// Helper function for querying AAValueSimplify and updating the candidate.
+ /// \param QueryingValue Value trying to unify with SimplifiedValue
+ /// \param AccumulatedSimplifiedValue Current simplification result.
+ static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA,
+ Value &QueryingValue,
+ Optional<Value *> &AccumulatedSimplifiedValue) {
+ // FIXME: Add typecast support.
+
+ auto &ValueSimplifyAA = A.getAAFor<AAValueSimplify>(
+ QueryingAA, IRPosition::value(QueryingValue));
+
+ Optional<Value *> QueryingValueSimplified =
+ ValueSimplifyAA.getAssumedSimplifiedValue(A);
+
+ if (!QueryingValueSimplified.hasValue())
+ return true;
+
+ if (!QueryingValueSimplified.getValue())
+ return false;
+
+ Value &QueryingValueSimplifiedUnwrapped =
+ *QueryingValueSimplified.getValue();
+
+ if (AccumulatedSimplifiedValue.hasValue() &&
+ !isa<UndefValue>(AccumulatedSimplifiedValue.getValue()) &&
+ !isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
+ return AccumulatedSimplifiedValue == QueryingValueSimplified;
+ if (AccumulatedSimplifiedValue.hasValue() &&
+ isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
+ return true;
+
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << QueryingValue
+ << " is assumed to be "
+ << QueryingValueSimplifiedUnwrapped << "\n");
+
+ AccumulatedSimplifiedValue = QueryingValueSimplified;
+ return true;
+ }
+
 /// Returns whether a candidate was found or not.
template <typename AAType> bool askSimplifiedValueFor(Attributor &A) {
- if (!getAssociatedValue().getType()->isIntegerTy())
- return false;
-
+ if (!getAssociatedValue().getType()->isIntegerTy())
+ return false;
+
const auto &AA =
A.getAAFor<AAType>(*this, getIRPosition(), /* TrackDependence */ false);
-
+
Optional<ConstantInt *> COpt = AA.getAssumedConstantInt(A);
if (!COpt.hasValue()) {
- SimplifiedAssociatedValue = llvm::None;
+ SimplifiedAssociatedValue = llvm::None;
A.recordDependence(AA, *this, DepClassTy::OPTIONAL);
return true;
- }
+ }
if (auto *C = COpt.getValue()) {
SimplifiedAssociatedValue = C;
A.recordDependence(AA, *this, DepClassTy::OPTIONAL);
return true;
}
return false;
- }
-
+ }
+
bool askSimplifiedValueForOtherAAs(Attributor &A) {
if (askSimplifiedValueFor<AAValueConstantRange>(A))
return true;
@@ -4581,207 +4581,207 @@ struct AAValueSimplifyImpl : AAValueSimplify {
return false;
}
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- if (SimplifiedAssociatedValue.hasValue() &&
- !SimplifiedAssociatedValue.getValue())
- return Changed;
-
- Value &V = getAssociatedValue();
- auto *C = SimplifiedAssociatedValue.hasValue()
- ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
- : UndefValue::get(V.getType());
- if (C) {
- // We can replace the AssociatedValue with the constant.
- if (!V.user_empty() && &V != C && V.getType() == C->getType()) {
- LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *C
- << " :: " << *this << "\n");
- if (A.changeValueAfterManifest(V, *C))
- Changed = ChangeStatus::CHANGED;
- }
- }
-
- return Changed | AAValueSimplify::manifest(A);
- }
-
- /// See AbstractState::indicatePessimisticFixpoint(...).
- ChangeStatus indicatePessimisticFixpoint() override {
- // NOTE: Associated value will be returned in a pessimistic fixpoint and is
- // regarded as known. That's why `indicateOptimisticFixpoint` is called.
- SimplifiedAssociatedValue = &getAssociatedValue();
- indicateOptimisticFixpoint();
- return ChangeStatus::CHANGED;
- }
-
-protected:
- // An assumed simplified value. Initially, it is set to Optional::None, which
- // means that the value is not clear under current assumption. If in the
- // pessimistic state, getAssumedSimplifiedValue doesn't return this value but
- // returns the original associated value.
- Optional<Value *> SimplifiedAssociatedValue;
-};
-
-struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
- AAValueSimplifyArgument(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- void initialize(Attributor &A) override {
- AAValueSimplifyImpl::initialize(A);
- if (!getAnchorScope() || getAnchorScope()->isDeclaration())
- indicatePessimisticFixpoint();
- if (hasAttr({Attribute::InAlloca, Attribute::Preallocated,
- Attribute::StructRet, Attribute::Nest},
- /* IgnoreSubsumingPositions */ true))
- indicatePessimisticFixpoint();
-
- // FIXME: This is a hack to prevent us from propagating function pointers in
- // the new pass manager CGSCC pass as it creates call edges the
- // CallGraphUpdater cannot handle yet.
- Value &V = getAssociatedValue();
- if (V.getType()->isPointerTy() &&
- V.getType()->getPointerElementType()->isFunctionTy() &&
- !A.isModulePass())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // Byval is only replaceable if it is readonly; otherwise we would write into
- // the replaced value and not the copy that byval creates implicitly.
- Argument *Arg = getAssociatedArgument();
- if (Arg->hasByValAttr()) {
- // TODO: We probably need to verify synchronization is not an issue, e.g.,
- // there is no race by not copying a constant byval.
- const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
- if (!MemAA.isAssumedReadOnly())
- return indicatePessimisticFixpoint();
- }
-
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
- auto PredForCallSite = [&](AbstractCallSite ACS) {
- const IRPosition &ACSArgPos =
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
+ // We can replace the AssociatedValue with the constant.
+ if (!V.user_empty() && &V != C && V.getType() == C->getType()) {
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *C
+ << " :: " << *this << "\n");
+ if (A.changeValueAfterManifest(V, *C))
+ Changed = ChangeStatus::CHANGED;
+ }
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ /// See AbstractState::indicatePessimisticFixpoint(...).
+ ChangeStatus indicatePessimisticFixpoint() override {
+ // NOTE: Associated value will be returned in a pessimistic fixpoint and is
+ // regarded as known. That's why `indicateOptimisticFixpoint` is called.
+ SimplifiedAssociatedValue = &getAssociatedValue();
+ indicateOptimisticFixpoint();
+ return ChangeStatus::CHANGED;
+ }
+
+protected:
+ // An assumed simplified value. Initially, it is set to Optional::None, which
+ // means that the value is not clear under current assumption. If in the
+ // pessimistic state, getAssumedSimplifiedValue doesn't return this value but
+ // returns the original associated value.
+ Optional<Value *> SimplifiedAssociatedValue;
+};
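The Optional member above effectively encodes three states, which manifest() then interprets: an empty Optional means no incoming value has constrained the position yet (so undef may be substituted), a null pointer means simplification failed and nothing is touched, and a non-null pointer is the replacement candidate. A small sketch of that interpretation, with simplified types standing in for the LLVM ones:

#include <optional>
#include <string>

// Simplified stand-in for the tri-state meaning of SimplifiedAssociatedValue
// (LLVM's Optional<Value *> is replaced by std::optional<const void *>).
std::string describeSimplification(const std::optional<const void *> &S) {
  if (!S)
    return "unconstrained: manifest may substitute undef";
  if (!*S)
    return "not simplifiable: manifest leaves the value alone";
  return "simplified: manifest replaces uses with the constant candidate";
}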
+
+struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
+ AAValueSimplifyArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ AAValueSimplifyImpl::initialize(A);
+ if (!getAnchorScope() || getAnchorScope()->isDeclaration())
+ indicatePessimisticFixpoint();
+ if (hasAttr({Attribute::InAlloca, Attribute::Preallocated,
+ Attribute::StructRet, Attribute::Nest},
+ /* IgnoreSubsumingPositions */ true))
+ indicatePessimisticFixpoint();
+
+ // FIXME: This is a hack to prevent us from propagating function pointers in
+ // the new pass manager CGSCC pass as it creates call edges the
+ // CallGraphUpdater cannot handle yet.
+ Value &V = getAssociatedValue();
+ if (V.getType()->isPointerTy() &&
+ V.getType()->getPointerElementType()->isFunctionTy() &&
+ !A.isModulePass())
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // Byval is only replaceable if it is readonly; otherwise we would write into
+ // the replaced value and not the copy that byval creates implicitly.
+ Argument *Arg = getAssociatedArgument();
+ if (Arg->hasByValAttr()) {
+ // TODO: We probably need to verify synchronization is not an issue, e.g.,
+ // there is no race by not copying a constant byval.
+ const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
+ if (!MemAA.isAssumedReadOnly())
+ return indicatePessimisticFixpoint();
+ }
+
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
+ auto PredForCallSite = [&](AbstractCallSite ACS) {
+ const IRPosition &ACSArgPos =
IRPosition::callsite_argument(ACS, getCallSiteArgNo());
- // Check if a corresponding argument was found or if it is not
- // associated (which can happen for callback calls).
- if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
- return false;
-
- // We can only propagate thread independent values through callbacks.
- // This is different to direct/indirect call sites because for them we
- // know the thread executing the caller and callee is the same. For
- // callbacks this is not guaranteed, thus a thread dependent value could
- // be different for the caller and callee, making it invalid to propagate.
- Value &ArgOp = ACSArgPos.getAssociatedValue();
- if (ACS.isCallbackCall())
- if (auto *C = dyn_cast<Constant>(&ArgOp))
- if (C->isThreadDependent())
- return false;
- return checkAndUpdate(A, *this, ArgOp, SimplifiedAssociatedValue);
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(PredForCallSite, *this, true,
- AllCallSitesKnown))
+ // Check if a corresponding argument was found or if it is not
+ // associated (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ // We can only propagate thread independent values through callbacks.
+ // This is different to direct/indirect call sites because for them we
+ // know the thread executing the caller and callee is the same. For
+ // callbacks this is not guaranteed, thus a thread dependent value could
+ // be different for the caller and callee, making it invalid to propagate.
+ Value &ArgOp = ACSArgPos.getAssociatedValue();
+ if (ACS.isCallbackCall())
+ if (auto *C = dyn_cast<Constant>(&ArgOp))
+ if (C->isThreadDependent())
+ return false;
+ return checkAndUpdate(A, *this, ArgOp, SimplifiedAssociatedValue);
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(PredForCallSite, *this, true,
+ AllCallSitesKnown))
if (!askSimplifiedValueForOtherAAs(A))
- return indicatePessimisticFixpoint();
-
- // If a candidate was found in this update, return CHANGED.
- return HasValueBefore == SimplifiedAssociatedValue.hasValue()
- ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyReturned : AAValueSimplifyImpl {
- AAValueSimplifyReturned(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
- auto PredForReturned = [&](Value &V) {
- return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
- };
-
- if (!A.checkForAllReturnedValues(PredForReturned, *this))
+ return indicatePessimisticFixpoint();
+
+ // If a candidate was found in this update, return CHANGED.
+ return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+ ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyReturned : AAValueSimplifyImpl {
+ AAValueSimplifyReturned(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
+ auto PredForReturned = [&](Value &V) {
+ return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
+ };
+
+ if (!A.checkForAllReturnedValues(PredForReturned, *this))
if (!askSimplifiedValueForOtherAAs(A))
- return indicatePessimisticFixpoint();
-
- // If a candidate was found in this update, return CHANGED.
- return HasValueBefore == SimplifiedAssociatedValue.hasValue()
- ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
-
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- if (SimplifiedAssociatedValue.hasValue() &&
- !SimplifiedAssociatedValue.getValue())
- return Changed;
-
- Value &V = getAssociatedValue();
- auto *C = SimplifiedAssociatedValue.hasValue()
- ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
- : UndefValue::get(V.getType());
- if (C) {
- auto PredForReturned =
- [&](Value &V, const SmallSetVector<ReturnInst *, 4> &RetInsts) {
- // We can replace the AssociatedValue with the constant.
- if (&V == C || V.getType() != C->getType() || isa<UndefValue>(V))
- return true;
-
- for (ReturnInst *RI : RetInsts) {
- if (RI->getFunction() != getAnchorScope())
- continue;
- auto *RC = C;
- if (RC->getType() != RI->getReturnValue()->getType())
- RC = ConstantExpr::getBitCast(RC,
- RI->getReturnValue()->getType());
- LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *RC
- << " in " << *RI << " :: " << *this << "\n");
- if (A.changeUseAfterManifest(RI->getOperandUse(0), *RC))
- Changed = ChangeStatus::CHANGED;
- }
- return true;
- };
- A.checkForAllReturnedValuesAndReturnInsts(PredForReturned, *this);
- }
-
- return Changed | AAValueSimplify::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyFloating : AAValueSimplifyImpl {
- AAValueSimplifyFloating(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // FIXME: This might have exposed a SCC iterator update bug in the old PM.
- // Needs investigation.
- // AAValueSimplifyImpl::initialize(A);
- Value &V = getAnchorValue();
-
-    // TODO: add other stuff
- if (isa<Constant>(V))
- indicatePessimisticFixpoint();
- }
-
+ return indicatePessimisticFixpoint();
+
+    // If a candidate was found in this update, return CHANGED.
+    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+               ? ChangeStatus::UNCHANGED
+               : ChangeStatus::CHANGED;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
+ auto PredForReturned =
+ [&](Value &V, const SmallSetVector<ReturnInst *, 4> &RetInsts) {
+ // We can replace the AssociatedValue with the constant.
+ if (&V == C || V.getType() != C->getType() || isa<UndefValue>(V))
+ return true;
+
+ for (ReturnInst *RI : RetInsts) {
+ if (RI->getFunction() != getAnchorScope())
+ continue;
+ auto *RC = C;
+ if (RC->getType() != RI->getReturnValue()->getType())
+ RC = ConstantExpr::getBitCast(RC,
+ RI->getReturnValue()->getType());
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *RC
+ << " in " << *RI << " :: " << *this << "\n");
+ if (A.changeUseAfterManifest(RI->getOperandUse(0), *RC))
+ Changed = ChangeStatus::CHANGED;
+ }
+ return true;
+ };
+ A.checkForAllReturnedValuesAndReturnInsts(PredForReturned, *this);
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(value_simplify)
+ }
+};
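// Illustrative sketch (editorial, not taken from this patch): returned-value
// simplification. Names are hypothetical. When every return statement of a
// function yields the same constant, the returned position simplifies to it
// and the uses at call sites can be rewritten with that constant.
namespace value_simplify_returned_example {
static int answer(bool flag) {
  if (flag)
    return 42;
  return 42; // all returns agree, so the returned value simplifies to 42
}
int user() { return answer(true) + answer(false); } // may fold to 84
} // namespace value_simplify_returned_example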
+
+struct AAValueSimplifyFloating : AAValueSimplifyImpl {
+ AAValueSimplifyFloating(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // FIXME: This might have exposed a SCC iterator update bug in the old PM.
+ // Needs investigation.
+ // AAValueSimplifyImpl::initialize(A);
+ Value &V = getAnchorValue();
+
+    // TODO: add other stuff
+ if (isa<Constant>(V))
+ indicatePessimisticFixpoint();
+ }
+
/// Check if \p ICmp is an equality comparison (==/!=) with at least one
/// nullptr. If so, try to simplify it using AANonNull on the other operand.
/// Return true if successful, in that case SimplifiedAssociatedValue will be
@@ -4843,1201 +4843,1201 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl {
return true;
}
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
ChangeStatus Changed;
if (checkForNullPtrCompare(A, dyn_cast<ICmpInst>(&getAnchorValue()),
Changed))
return Changed;
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI, bool &,
- bool Stripped) -> bool {
- auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
-        // TODO: Look at the instruction and check recursively.
-
- LLVM_DEBUG(dbgs() << "[ValueSimplify] Can't be stripped more : " << V
- << "\n");
- return false;
- }
- return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
- };
-
- bool Dummy = false;
- if (!genericValueTraversal<AAValueSimplify, bool>(
- A, getIRPosition(), *this, Dummy, VisitValueCB, getCtxI(),
- /* UseValueSimplify */ false))
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI, bool &,
+ bool Stripped) -> bool {
+ auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
+        // TODO: Look at the instruction and check recursively.
+
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] Can't be stripped more : " << V
+ << "\n");
+ return false;
+ }
+ return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
+ };
+
+ bool Dummy = false;
+ if (!genericValueTraversal<AAValueSimplify, bool>(
+ A, getIRPosition(), *this, Dummy, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ false))
if (!askSimplifiedValueForOtherAAs(A))
- return indicatePessimisticFixpoint();
-
-    // If a candidate was found in this update, return CHANGED.
-
-    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
-               ? ChangeStatus::UNCHANGED
-               : ChangeStatus::CHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyFunction : AAValueSimplifyImpl {
- AAValueSimplifyFunction(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SimplifiedAssociatedValue = &getAnchorValue();
- indicateOptimisticFixpoint();
- }
-  /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable(
- "AAValueSimplify(Function|CallSite)::updateImpl will not be called");
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FN_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyCallSite : AAValueSimplifyFunction {
- AAValueSimplifyCallSite(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyFunction(IRP, A) {}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CS_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned {
- AAValueSimplifyCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyReturned(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- return AAValueSimplifyImpl::manifest(A);
- }
-
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(value_simplify)
- }
-};
-struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating {
- AAValueSimplifyCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyFloating(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- if (SimplifiedAssociatedValue.hasValue() &&
- !SimplifiedAssociatedValue.getValue())
- return Changed;
-
- Value &V = getAssociatedValue();
- auto *C = SimplifiedAssociatedValue.hasValue()
- ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
- : UndefValue::get(V.getType());
- if (C) {
+ return indicatePessimisticFixpoint();
+
+    // If a candidate was found in this update, return CHANGED.
+
+    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+               ? ChangeStatus::UNCHANGED
+               : ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyFunction : AAValueSimplifyImpl {
+ AAValueSimplifyFunction(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SimplifiedAssociatedValue = &getAnchorValue();
+ indicateOptimisticFixpoint();
+ }
+  /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable(
+ "AAValueSimplify(Function|CallSite)::updateImpl will not be called");
+ }
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FN_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyCallSite : AAValueSimplifyFunction {
+ AAValueSimplifyCallSite(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyFunction(IRP, A) {}
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CS_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned {
+ AAValueSimplifyCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyReturned(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ return AAValueSimplifyImpl::manifest(A);
+ }
+
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(value_simplify)
+ }
+};
+struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating {
+ AAValueSimplifyCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyFloating(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
Use &U = cast<CallBase>(&getAnchorValue())
->getArgOperandUse(getCallSiteArgNo());
- // We can replace the AssociatedValue with the constant.
- if (&V != C && V.getType() == C->getType()) {
- if (A.changeUseAfterManifest(U, *C))
- Changed = ChangeStatus::CHANGED;
- }
- }
-
- return Changed | AAValueSimplify::manifest(A);
- }
-
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(value_simplify)
- }
-};
-
-/// ----------------------- Heap-To-Stack Conversion ---------------------------
-struct AAHeapToStackImpl : public AAHeapToStack {
- AAHeapToStackImpl(const IRPosition &IRP, Attributor &A)
- : AAHeapToStack(IRP, A) {}
-
- const std::string getAsStr() const override {
- return "[H2S] Mallocs: " + std::to_string(MallocCalls.size());
- }
-
- ChangeStatus manifest(Attributor &A) override {
- assert(getState().isValidState() &&
- "Attempted to manifest an invalid state!");
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- Function *F = getAnchorScope();
- const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
-
- for (Instruction *MallocCall : MallocCalls) {
- // This malloc cannot be replaced.
- if (BadMallocCalls.count(MallocCall))
- continue;
-
- for (Instruction *FreeCall : FreesForMalloc[MallocCall]) {
- LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n");
- A.deleteAfterManifest(*FreeCall);
- HasChanged = ChangeStatus::CHANGED;
- }
-
- LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall
- << "\n");
-
- Align Alignment;
- Constant *Size;
- if (isCallocLikeFn(MallocCall, TLI)) {
- auto *Num = cast<ConstantInt>(MallocCall->getOperand(0));
- auto *SizeT = cast<ConstantInt>(MallocCall->getOperand(1));
- APInt TotalSize = SizeT->getValue() * Num->getValue();
- Size =
- ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize);
- } else if (isAlignedAllocLikeFn(MallocCall, TLI)) {
- Size = cast<ConstantInt>(MallocCall->getOperand(1));
- Alignment = MaybeAlign(cast<ConstantInt>(MallocCall->getOperand(0))
- ->getValue()
- .getZExtValue())
- .valueOrOne();
- } else {
- Size = cast<ConstantInt>(MallocCall->getOperand(0));
- }
-
- unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace();
- Instruction *AI =
- new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
- "", MallocCall->getNextNode());
-
- if (AI->getType() != MallocCall->getType())
- AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc",
- AI->getNextNode());
-
- A.changeValueAfterManifest(*MallocCall, *AI);
-
- if (auto *II = dyn_cast<InvokeInst>(MallocCall)) {
- auto *NBB = II->getNormalDest();
- BranchInst::Create(NBB, MallocCall->getParent());
- A.deleteAfterManifest(*MallocCall);
- } else {
- A.deleteAfterManifest(*MallocCall);
- }
-
- // Zero out the allocated memory if it was a calloc.
- if (isCallocLikeFn(MallocCall, TLI)) {
- auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc",
- AI->getNextNode());
- Value *Ops[] = {
- BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
- ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
-
- Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()};
- Module *M = F->getParent();
- Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
- CallInst::Create(Fn, Ops, "", BI->getNextNode());
- }
- HasChanged = ChangeStatus::CHANGED;
- }
-
- return HasChanged;
- }
-
- /// Collection of all malloc calls in a function.
- SmallSetVector<Instruction *, 4> MallocCalls;
-
- /// Collection of malloc calls that cannot be converted.
- DenseSet<const Instruction *> BadMallocCalls;
-
- /// A map for each malloc call to the set of associated free calls.
- DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc;
-
- ChangeStatus updateImpl(Attributor &A) override;
-};
-
-ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) {
- const Function *F = getAnchorScope();
- const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
-
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
-
- auto FreeCheck = [&](Instruction &I) {
- const auto &Frees = FreesForMalloc.lookup(&I);
- if (Frees.size() != 1)
- return false;
- Instruction *UniqueFree = *Frees.begin();
- return Explorer.findInContextOf(UniqueFree, I.getNextNode());
- };
-
- auto UsesCheck = [&](Instruction &I) {
- bool ValidUsesOnly = true;
- bool MustUse = true;
- auto Pred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (isa<LoadInst>(UserI))
- return true;
- if (auto *SI = dyn_cast<StoreInst>(UserI)) {
- if (SI->getValueOperand() == U.get()) {
- LLVM_DEBUG(dbgs()
- << "[H2S] escaping store to memory: " << *UserI << "\n");
- ValidUsesOnly = false;
- } else {
- // A store into the malloc'ed memory is fine.
- }
- return true;
- }
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd())
- return true;
-        // Record the free call for this malloc.
- if (isFreeCall(UserI, TLI)) {
- if (MustUse) {
- FreesForMalloc[&I].insert(UserI);
- } else {
- LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: "
- << *UserI << "\n");
- ValidUsesOnly = false;
- }
- return true;
- }
-
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- // If a callsite argument use is nofree, we are fine.
- const auto &ArgNoFreeAA = A.getAAFor<AANoFree>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- if (!NoCaptureAA.isAssumedNoCapture() ||
- !ArgNoFreeAA.isAssumedNoFree()) {
- LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n");
- ValidUsesOnly = false;
- }
- return true;
- }
-
- if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
- isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
- MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI));
- Follow = true;
- return true;
- }
-      // Unknown user for which we cannot track uses further (in a way that
- // makes sense).
- LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n");
- ValidUsesOnly = false;
- return true;
- };
- A.checkForAllUses(Pred, *this, I);
- return ValidUsesOnly;
- };
-
- auto MallocCallocCheck = [&](Instruction &I) {
- if (BadMallocCalls.count(&I))
- return true;
-
- bool IsMalloc = isMallocLikeFn(&I, TLI);
- bool IsAlignedAllocLike = isAlignedAllocLikeFn(&I, TLI);
- bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI);
- if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) {
- BadMallocCalls.insert(&I);
- return true;
- }
-
- if (IsMalloc) {
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0)))
- if (Size->getValue().ule(MaxHeapToStackSize))
- if (UsesCheck(I) || FreeCheck(I)) {
- MallocCalls.insert(&I);
- return true;
- }
- } else if (IsAlignedAllocLike && isa<ConstantInt>(I.getOperand(0))) {
- // Only if the alignment and sizes are constant.
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
- if (Size->getValue().ule(MaxHeapToStackSize))
- if (UsesCheck(I) || FreeCheck(I)) {
- MallocCalls.insert(&I);
- return true;
- }
- } else if (IsCalloc) {
- bool Overflow = false;
- if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0)))
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
- if ((Size->getValue().umul_ov(Num->getValue(), Overflow))
- .ule(MaxHeapToStackSize))
- if (!Overflow && (UsesCheck(I) || FreeCheck(I))) {
- MallocCalls.insert(&I);
- return true;
- }
- }
-
- BadMallocCalls.insert(&I);
- return true;
- };
-
- size_t NumBadMallocs = BadMallocCalls.size();
-
- A.checkForAllCallLikeInstructions(MallocCallocCheck, *this);
-
- if (NumBadMallocs != BadMallocCalls.size())
- return ChangeStatus::CHANGED;
-
- return ChangeStatus::UNCHANGED;
-}
-
-struct AAHeapToStackFunction final : public AAHeapToStackImpl {
- AAHeapToStackFunction(const IRPosition &IRP, Attributor &A)
- : AAHeapToStackImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics().
- void trackStatistics() const override {
- STATS_DECL(
- MallocCalls, Function,
- "Number of malloc/calloc/aligned_alloc calls converted to allocas");
- for (auto *C : MallocCalls)
- if (!BadMallocCalls.count(C))
- ++BUILD_STAT_NAME(MallocCalls, Function);
- }
-};
-
-/// ----------------------- Privatizable Pointers ------------------------------
-struct AAPrivatizablePtrImpl : public AAPrivatizablePtr {
- AAPrivatizablePtrImpl(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtr(IRP, A), PrivatizableType(llvm::None) {}
-
- ChangeStatus indicatePessimisticFixpoint() override {
- AAPrivatizablePtr::indicatePessimisticFixpoint();
- PrivatizableType = nullptr;
- return ChangeStatus::CHANGED;
- }
-
-  /// Identify the type we can choose for a private copy of the underlying
- /// argument. None means it is not clear yet, nullptr means there is none.
- virtual Optional<Type *> identifyPrivatizableType(Attributor &A) = 0;
-
- /// Return a privatizable type that encloses both T0 and T1.
- /// TODO: This is merely a stub for now as we should manage a mapping as well.
- Optional<Type *> combineTypes(Optional<Type *> T0, Optional<Type *> T1) {
- if (!T0.hasValue())
- return T1;
- if (!T1.hasValue())
- return T0;
- if (T0 == T1)
- return T0;
- return nullptr;
- }
-
- Optional<Type *> getPrivatizableType() const override {
- return PrivatizableType;
- }
-
- const std::string getAsStr() const override {
- return isAssumedPrivatizablePtr() ? "[priv]" : "[no-priv]";
- }
-
-protected:
- Optional<Type *> PrivatizableType;
-};
-
-// TODO: Do this for call site arguments (probably also other values) as well.
-
-struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
- AAPrivatizablePtrArgument(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrImpl(IRP, A) {}
-
- /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
- Optional<Type *> identifyPrivatizableType(Attributor &A) override {
- // If this is a byval argument and we know all the call sites (so we can
- // rewrite them), there is no need to check them explicitly.
- bool AllCallSitesKnown;
- if (getIRPosition().hasAttr(Attribute::ByVal) &&
- A.checkForAllCallSites([](AbstractCallSite ACS) { return true; }, *this,
- true, AllCallSitesKnown))
- return getAssociatedValue().getType()->getPointerElementType();
-
- Optional<Type *> Ty;
+ // We can replace the AssociatedValue with the constant.
+ if (&V != C && V.getType() == C->getType()) {
+ if (A.changeUseAfterManifest(U, *C))
+ Changed = ChangeStatus::CHANGED;
+ }
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(value_simplify)
+ }
+};
+
+/// ----------------------- Heap-To-Stack Conversion ---------------------------
+struct AAHeapToStackImpl : public AAHeapToStack {
+ AAHeapToStackImpl(const IRPosition &IRP, Attributor &A)
+ : AAHeapToStack(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return "[H2S] Mallocs: " + std::to_string(MallocCalls.size());
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ assert(getState().isValidState() &&
+ "Attempted to manifest an invalid state!");
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+
+ for (Instruction *MallocCall : MallocCalls) {
+ // This malloc cannot be replaced.
+ if (BadMallocCalls.count(MallocCall))
+ continue;
+
+ for (Instruction *FreeCall : FreesForMalloc[MallocCall]) {
+ LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n");
+ A.deleteAfterManifest(*FreeCall);
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall
+ << "\n");
+
+ Align Alignment;
+ Constant *Size;
+ if (isCallocLikeFn(MallocCall, TLI)) {
+ auto *Num = cast<ConstantInt>(MallocCall->getOperand(0));
+ auto *SizeT = cast<ConstantInt>(MallocCall->getOperand(1));
+ APInt TotalSize = SizeT->getValue() * Num->getValue();
+ Size =
+ ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize);
+ } else if (isAlignedAllocLikeFn(MallocCall, TLI)) {
+ Size = cast<ConstantInt>(MallocCall->getOperand(1));
+ Alignment = MaybeAlign(cast<ConstantInt>(MallocCall->getOperand(0))
+ ->getValue()
+ .getZExtValue())
+ .valueOrOne();
+ } else {
+ Size = cast<ConstantInt>(MallocCall->getOperand(0));
+ }
+
+ unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace();
+ Instruction *AI =
+ new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
+ "", MallocCall->getNextNode());
+
+ if (AI->getType() != MallocCall->getType())
+ AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc",
+ AI->getNextNode());
+
+ A.changeValueAfterManifest(*MallocCall, *AI);
+
+ if (auto *II = dyn_cast<InvokeInst>(MallocCall)) {
+ auto *NBB = II->getNormalDest();
+ BranchInst::Create(NBB, MallocCall->getParent());
+ A.deleteAfterManifest(*MallocCall);
+ } else {
+ A.deleteAfterManifest(*MallocCall);
+ }
+
+ // Zero out the allocated memory if it was a calloc.
+ if (isCallocLikeFn(MallocCall, TLI)) {
+ auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc",
+ AI->getNextNode());
+ Value *Ops[] = {
+ BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
+ ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
+
+ Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()};
+ Module *M = F->getParent();
+ Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
+ CallInst::Create(Fn, Ops, "", BI->getNextNode());
+ }
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ return HasChanged;
+ }
+
+ /// Collection of all malloc calls in a function.
+ SmallSetVector<Instruction *, 4> MallocCalls;
+
+ /// Collection of malloc calls that cannot be converted.
+ DenseSet<const Instruction *> BadMallocCalls;
+
+ /// A map for each malloc call to the set of associated free calls.
+ DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc;
+
+ ChangeStatus updateImpl(Attributor &A) override;
+};
+
+ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) {
+ const Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+
+ auto FreeCheck = [&](Instruction &I) {
+ const auto &Frees = FreesForMalloc.lookup(&I);
+ if (Frees.size() != 1)
+ return false;
+ Instruction *UniqueFree = *Frees.begin();
+ return Explorer.findInContextOf(UniqueFree, I.getNextNode());
+ };
+
+ auto UsesCheck = [&](Instruction &I) {
+ bool ValidUsesOnly = true;
+ bool MustUse = true;
+ auto Pred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (isa<LoadInst>(UserI))
+ return true;
+ if (auto *SI = dyn_cast<StoreInst>(UserI)) {
+ if (SI->getValueOperand() == U.get()) {
+ LLVM_DEBUG(dbgs()
+ << "[H2S] escaping store to memory: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ } else {
+ // A store into the malloc'ed memory is fine.
+ }
+ return true;
+ }
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd())
+ return true;
+        // Record the free call for this malloc.
+ if (isFreeCall(UserI, TLI)) {
+ if (MustUse) {
+ FreesForMalloc[&I].insert(UserI);
+ } else {
+ LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: "
+ << *UserI << "\n");
+ ValidUsesOnly = false;
+ }
+ return true;
+ }
+
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ // If a callsite argument use is nofree, we are fine.
+ const auto &ArgNoFreeAA = A.getAAFor<AANoFree>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ if (!NoCaptureAA.isAssumedNoCapture() ||
+ !ArgNoFreeAA.isAssumedNoFree()) {
+ LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ }
+ return true;
+ }
+
+ if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
+ isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+ MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI));
+ Follow = true;
+ return true;
+ }
+      // Unknown user for which we cannot track uses further (in a way that
+ // makes sense).
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ return true;
+ };
+ A.checkForAllUses(Pred, *this, I);
+ return ValidUsesOnly;
+ };
+
+ auto MallocCallocCheck = [&](Instruction &I) {
+ if (BadMallocCalls.count(&I))
+ return true;
+
+ bool IsMalloc = isMallocLikeFn(&I, TLI);
+ bool IsAlignedAllocLike = isAlignedAllocLikeFn(&I, TLI);
+ bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI);
+ if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) {
+ BadMallocCalls.insert(&I);
+ return true;
+ }
+
+ if (IsMalloc) {
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0)))
+ if (Size->getValue().ule(MaxHeapToStackSize))
+ if (UsesCheck(I) || FreeCheck(I)) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ } else if (IsAlignedAllocLike && isa<ConstantInt>(I.getOperand(0))) {
+ // Only if the alignment and sizes are constant.
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
+ if (Size->getValue().ule(MaxHeapToStackSize))
+ if (UsesCheck(I) || FreeCheck(I)) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ } else if (IsCalloc) {
+ bool Overflow = false;
+ if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0)))
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
+ if ((Size->getValue().umul_ov(Num->getValue(), Overflow))
+ .ule(MaxHeapToStackSize))
+ if (!Overflow && (UsesCheck(I) || FreeCheck(I))) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ }
+
+ BadMallocCalls.insert(&I);
+ return true;
+ };
+
+ size_t NumBadMallocs = BadMallocCalls.size();
+
+ A.checkForAllCallLikeInstructions(MallocCallocCheck, *this);
+
+ if (NumBadMallocs != BadMallocCalls.size())
+ return ChangeStatus::CHANGED;
+
+ return ChangeStatus::UNCHANGED;
+}
+
+struct AAHeapToStackFunction final : public AAHeapToStackImpl {
+ AAHeapToStackFunction(const IRPosition &IRP, Attributor &A)
+ : AAHeapToStackImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics().
+ void trackStatistics() const override {
+ STATS_DECL(
+ MallocCalls, Function,
+ "Number of malloc/calloc/aligned_alloc calls converted to allocas");
+ for (auto *C : MallocCalls)
+ if (!BadMallocCalls.count(C))
+ ++BUILD_STAT_NAME(MallocCalls, Function);
+ }
+};
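// Illustrative sketch (editorial, not taken from this patch), written as a
// standalone translation unit: the observable effect of the heap-to-stack
// rewrite at the C++ level. `use` is hypothetical and assumed to behave as a
// nocapture/nofree callee, so the buffer does not escape (cf. UsesCheck).
#include <cstdlib>
#include <cstring>
namespace heap_to_stack_example {
static void use(char *Buf, unsigned N) { Buf[0] = static_cast<char>(N); }
void beforeH2S() {
  char *P = static_cast<char *>(std::calloc(4, 4)); // 16 bytes, zeroed
  use(P, 16);
  std::free(P); // a unique free on every path pairs with the allocation
}
void afterH2S() {          // roughly what the manifest above produces
  char P[16];              // stack slot (alloca in the entry block)
  std::memset(P, 0, 16);   // calloc semantics preserved via a memset
  use(P, 16);
}                          // the paired free call is simply deleted
// A pointer that escapes (stored out, or passed to a capturing callee) fails
// UsesCheck and the allocation stays on the heap.
} // namespace heap_to_stack_example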
+
+/// ----------------------- Privatizable Pointers ------------------------------
+struct AAPrivatizablePtrImpl : public AAPrivatizablePtr {
+ AAPrivatizablePtrImpl(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtr(IRP, A), PrivatizableType(llvm::None) {}
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ AAPrivatizablePtr::indicatePessimisticFixpoint();
+ PrivatizableType = nullptr;
+ return ChangeStatus::CHANGED;
+ }
+
+  /// Identify the type we can choose for a private copy of the underlying
+ /// argument. None means it is not clear yet, nullptr means there is none.
+ virtual Optional<Type *> identifyPrivatizableType(Attributor &A) = 0;
+
+ /// Return a privatizable type that encloses both T0 and T1.
+ /// TODO: This is merely a stub for now as we should manage a mapping as well.
+ Optional<Type *> combineTypes(Optional<Type *> T0, Optional<Type *> T1) {
+ if (!T0.hasValue())
+ return T1;
+ if (!T1.hasValue())
+ return T0;
+ if (T0 == T1)
+ return T0;
+ return nullptr;
+ }
+
+ Optional<Type *> getPrivatizableType() const override {
+ return PrivatizableType;
+ }
+
+ const std::string getAsStr() const override {
+ return isAssumedPrivatizablePtr() ? "[priv]" : "[no-priv]";
+ }
+
+protected:
+ Optional<Type *> PrivatizableType;
+};
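// Illustrative sketch (editorial, not taken from this patch): what pointer
// privatization aims for at the source level. Names are hypothetical. A
// pointer argument whose pointee type all call sites agree on can be replaced
// by its constituent values, with a private copy rebuilt inside the callee
// (compare createInitialization/createReplacementValues further below).
namespace privatizable_ptr_example {
struct Pair { int a; float b; };
static int before(const Pair *P) { return P->a; }  // pointer argument
static int after(int a, float b) {                 // expanded constituents
  Pair Local{a, b};                                // private copy in callee
  return Local.a;
}
int caller(Pair V) { return before(&V) + after(V.a, V.b); }
} // namespace privatizable_ptr_example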
+
+// TODO: Do this for call site arguments (probably also other values) as well.
+
+struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
+ AAPrivatizablePtrArgument(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrImpl(IRP, A) {}
+
+ /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
+ Optional<Type *> identifyPrivatizableType(Attributor &A) override {
+ // If this is a byval argument and we know all the call sites (so we can
+ // rewrite them), there is no need to check them explicitly.
+ bool AllCallSitesKnown;
+ if (getIRPosition().hasAttr(Attribute::ByVal) &&
+ A.checkForAllCallSites([](AbstractCallSite ACS) { return true; }, *this,
+ true, AllCallSitesKnown))
+ return getAssociatedValue().getType()->getPointerElementType();
+
+ Optional<Type *> Ty;
unsigned ArgNo = getIRPosition().getCallSiteArgNo();
-
- // Make sure the associated call site argument has the same type at all call
-    // sites and it is an allocation we know is safe to privatize; for now that
- // means we only allow alloca instructions.
- // TODO: We can additionally analyze the accesses in the callee to create
- // the type from that information instead. That is a little more
- // involved and will be done in a follow up patch.
- auto CallSiteCheck = [&](AbstractCallSite ACS) {
- IRPosition ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
-      // Check if a corresponding argument was found or if it is one not
- // associated (which can happen for callback calls).
- if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
- return false;
-
- // Check that all call sites agree on a type.
- auto &PrivCSArgAA = A.getAAFor<AAPrivatizablePtr>(*this, ACSArgPos);
- Optional<Type *> CSTy = PrivCSArgAA.getPrivatizableType();
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: ";
- if (CSTy.hasValue() && CSTy.getValue())
- CSTy.getValue()->print(dbgs());
- else if (CSTy.hasValue())
- dbgs() << "<nullptr>";
- else
- dbgs() << "<none>";
- });
-
- Ty = combineTypes(Ty, CSTy);
-
- LLVM_DEBUG({
- dbgs() << " : New Type: ";
- if (Ty.hasValue() && Ty.getValue())
- Ty.getValue()->print(dbgs());
- else if (Ty.hasValue())
- dbgs() << "<nullptr>";
- else
- dbgs() << "<none>";
- dbgs() << "\n";
- });
-
- return !Ty.hasValue() || Ty.getValue();
- };
-
- if (!A.checkForAllCallSites(CallSiteCheck, *this, true, AllCallSitesKnown))
- return nullptr;
- return Ty;
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- PrivatizableType = identifyPrivatizableType(A);
- if (!PrivatizableType.hasValue())
- return ChangeStatus::UNCHANGED;
- if (!PrivatizableType.getValue())
- return indicatePessimisticFixpoint();
-
- // The dependence is optional so we don't give up once we give up on the
- // alignment.
- A.getAAFor<AAAlign>(*this, IRPosition::value(getAssociatedValue()),
- /* TrackDependence */ true, DepClassTy::OPTIONAL);
-
- // Avoid arguments with padding for now.
- if (!getIRPosition().hasAttr(Attribute::ByVal) &&
- !ArgumentPromotionPass::isDenselyPacked(PrivatizableType.getValue(),
- A.getInfoCache().getDL())) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Padding detected\n");
- return indicatePessimisticFixpoint();
- }
-
- // Verify callee and caller agree on how the promoted argument would be
- // passed.
- // TODO: The use of the ArgumentPromotion interface here is ugly, we need a
- // specialized form of TargetTransformInfo::areFunctionArgsABICompatible
- // which doesn't require the arguments ArgumentPromotion wanted to pass.
- Function &Fn = *getIRPosition().getAnchorScope();
- SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy;
- ArgsToPromote.insert(getAssociatedArgument());
- const auto *TTI =
- A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn);
- if (!TTI ||
- !ArgumentPromotionPass::areFunctionArgsABICompatible(
- Fn, *TTI, ArgsToPromote, Dummy) ||
- ArgsToPromote.empty()) {
- LLVM_DEBUG(
- dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for "
- << Fn.getName() << "\n");
- return indicatePessimisticFixpoint();
- }
-
- // Collect the types that will replace the privatizable type in the function
- // signature.
- SmallVector<Type *, 16> ReplacementTypes;
- identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
-
- // Register a rewrite of the argument.
- Argument *Arg = getAssociatedArgument();
- if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Rewrite not valid\n");
- return indicatePessimisticFixpoint();
- }
-
- unsigned ArgNo = Arg->getArgNo();
-
- // Helper to check if for the given call site the associated argument is
- // passed to a callback where the privatization would be different.
- auto IsCompatiblePrivArgOfCallback = [&](CallBase &CB) {
- SmallVector<const Use *, 4> CallbackUses;
- AbstractCallSite::getCallbackUses(CB, CallbackUses);
- for (const Use *U : CallbackUses) {
- AbstractCallSite CBACS(U);
- assert(CBACS && CBACS.isCallbackCall());
- for (Argument &CBArg : CBACS.getCalledFunction()->args()) {
- int CBArgNo = CBACS.getCallArgOperandNo(CBArg);
-
- LLVM_DEBUG({
- dbgs()
- << "[AAPrivatizablePtr] Argument " << *Arg
-                << " check if it can be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "callback ("
- << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
- << ")\n[AAPrivatizablePtr] " << CBArg << " : "
- << CBACS.getCallArgOperand(CBArg) << " vs "
- << CB.getArgOperand(ArgNo) << "\n"
- << "[AAPrivatizablePtr] " << CBArg << " : "
- << CBACS.getCallArgOperandNo(CBArg) << " vs " << ArgNo << "\n";
- });
-
- if (CBArgNo != int(ArgNo))
- continue;
- const auto &CBArgPrivAA =
- A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(CBArg));
- if (CBArgPrivAA.isValidState()) {
- auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType();
- if (!CBArgPrivTy.hasValue())
- continue;
- if (CBArgPrivTy.getValue() == PrivatizableType)
- continue;
- }
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
- << " cannot be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "callback ("
- << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
- << ").\n[AAPrivatizablePtr] for which the argument "
- "privatization is not compatible.\n";
- });
- return false;
- }
- }
- return true;
- };
-
- // Helper to check if for the given call site the associated argument is
- // passed to a direct call where the privatization would be different.
- auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) {
- CallBase *DC = cast<CallBase>(ACS.getInstruction());
- int DCArgNo = ACS.getCallArgOperandNo(ArgNo);
- assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() &&
- "Expected a direct call operand for callback call operand");
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
-               << " check if it can be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "direct call of ("
- << DCArgNo << "@" << DC->getCalledFunction()->getName()
- << ").\n";
- });
-
- Function *DCCallee = DC->getCalledFunction();
- if (unsigned(DCArgNo) < DCCallee->arg_size()) {
- const auto &DCArgPrivAA = A.getAAFor<AAPrivatizablePtr>(
- *this, IRPosition::argument(*DCCallee->getArg(DCArgNo)));
- if (DCArgPrivAA.isValidState()) {
- auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType();
- if (!DCArgPrivTy.hasValue())
- return true;
- if (DCArgPrivTy.getValue() == PrivatizableType)
- return true;
- }
- }
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
- << " cannot be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "direct call of ("
- << ACS.getInstruction()->getCalledFunction()->getName()
- << ").\n[AAPrivatizablePtr] for which the argument "
- "privatization is not compatible.\n";
- });
- return false;
- };
-
- // Helper to check if the associated argument is used at the given abstract
- // call site in a way that is incompatible with the privatization assumed
- // here.
- auto IsCompatiblePrivArgOfOtherCallSite = [&](AbstractCallSite ACS) {
- if (ACS.isDirectCall())
- return IsCompatiblePrivArgOfCallback(*ACS.getInstruction());
- if (ACS.isCallbackCall())
- return IsCompatiblePrivArgOfDirectCS(ACS);
- return false;
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true,
- AllCallSitesKnown))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
-  /// Given a type to privatize \p PrivType, collect the constituents (which are
- /// used) in \p ReplacementTypes.
- static void
- identifyReplacementTypes(Type *PrivType,
- SmallVectorImpl<Type *> &ReplacementTypes) {
- // TODO: For now we expand the privatization type to the fullest which can
- // lead to dead arguments that need to be removed later.
- assert(PrivType && "Expected privatizable type!");
-
-    // Traverse the type, extract constituent types on the outermost level.
- if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
- for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++)
- ReplacementTypes.push_back(PrivStructType->getElementType(u));
- } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
- ReplacementTypes.append(PrivArrayType->getNumElements(),
- PrivArrayType->getElementType());
- } else {
- ReplacementTypes.push_back(PrivType);
- }
- }
-
- /// Initialize \p Base according to the type \p PrivType at position \p IP.
- /// The values needed are taken from the arguments of \p F starting at
- /// position \p ArgNo.
- static void createInitialization(Type *PrivType, Value &Base, Function &F,
- unsigned ArgNo, Instruction &IP) {
- assert(PrivType && "Expected privatizable type!");
-
- IRBuilder<NoFolder> IRB(&IP);
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- // Traverse the type, build GEPs and stores.
- if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
- const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
- for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
- Type *PointeeTy = PrivStructType->getElementType(u)->getPointerTo();
- Value *Ptr = constructPointer(
- PointeeTy, &Base, PrivStructLayout->getElementOffset(u), IRB, DL);
- new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
- }
- } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+
+ // Make sure the associated call site argument has the same type at all call
+    // sites and it is an allocation we know is safe to privatize; for now that
+ // means we only allow alloca instructions.
+ // TODO: We can additionally analyze the accesses in the callee to create
+ // the type from that information instead. That is a little more
+ // involved and will be done in a follow up patch.
+ auto CallSiteCheck = [&](AbstractCallSite ACS) {
+ IRPosition ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+      // Check if a corresponding argument was found or if it is one not
+ // associated (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ // Check that all call sites agree on a type.
+ auto &PrivCSArgAA = A.getAAFor<AAPrivatizablePtr>(*this, ACSArgPos);
+ Optional<Type *> CSTy = PrivCSArgAA.getPrivatizableType();
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: ";
+ if (CSTy.hasValue() && CSTy.getValue())
+ CSTy.getValue()->print(dbgs());
+ else if (CSTy.hasValue())
+ dbgs() << "<nullptr>";
+ else
+ dbgs() << "<none>";
+ });
+
+ Ty = combineTypes(Ty, CSTy);
+
+ LLVM_DEBUG({
+ dbgs() << " : New Type: ";
+ if (Ty.hasValue() && Ty.getValue())
+ Ty.getValue()->print(dbgs());
+ else if (Ty.hasValue())
+ dbgs() << "<nullptr>";
+ else
+ dbgs() << "<none>";
+ dbgs() << "\n";
+ });
+
+ return !Ty.hasValue() || Ty.getValue();
+ };
+
+ if (!A.checkForAllCallSites(CallSiteCheck, *this, true, AllCallSitesKnown))
+ return nullptr;
+ return Ty;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ PrivatizableType = identifyPrivatizableType(A);
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ if (!PrivatizableType.getValue())
+ return indicatePessimisticFixpoint();
+
+ // The dependence is optional so we don't give up once we give up on the
+ // alignment.
+ A.getAAFor<AAAlign>(*this, IRPosition::value(getAssociatedValue()),
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+
+ // Avoid arguments with padding for now.
+ if (!getIRPosition().hasAttr(Attribute::ByVal) &&
+ !ArgumentPromotionPass::isDenselyPacked(PrivatizableType.getValue(),
+ A.getInfoCache().getDL())) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Padding detected\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ // Verify callee and caller agree on how the promoted argument would be
+ // passed.
+ // TODO: The use of the ArgumentPromotion interface here is ugly, we need a
+ // specialized form of TargetTransformInfo::areFunctionArgsABICompatible
+ // which doesn't require the arguments ArgumentPromotion wanted to pass.
+ Function &Fn = *getIRPosition().getAnchorScope();
+ SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy;
+ ArgsToPromote.insert(getAssociatedArgument());
+ const auto *TTI =
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn);
+ if (!TTI ||
+ !ArgumentPromotionPass::areFunctionArgsABICompatible(
+ Fn, *TTI, ArgsToPromote, Dummy) ||
+ ArgsToPromote.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for "
+ << Fn.getName() << "\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ // Collect the types that will replace the privatizable type in the function
+ // signature.
+ SmallVector<Type *, 16> ReplacementTypes;
+ identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
+
+ // Register a rewrite of the argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Rewrite not valid\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ unsigned ArgNo = Arg->getArgNo();
+
+ // Helper to check if for the given call site the associated argument is
+ // passed to a callback where the privatization would be different.
+ auto IsCompatiblePrivArgOfCallback = [&](CallBase &CB) {
+ SmallVector<const Use *, 4> CallbackUses;
+ AbstractCallSite::getCallbackUses(CB, CallbackUses);
+ for (const Use *U : CallbackUses) {
+ AbstractCallSite CBACS(U);
+ assert(CBACS && CBACS.isCallbackCall());
+ for (Argument &CBArg : CBACS.getCalledFunction()->args()) {
+ int CBArgNo = CBACS.getCallArgOperandNo(CBArg);
+
+ LLVM_DEBUG({
+ dbgs()
+ << "[AAPrivatizablePtr] Argument " << *Arg
+                << " check if it can be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "callback ("
+ << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
+ << ")\n[AAPrivatizablePtr] " << CBArg << " : "
+ << CBACS.getCallArgOperand(CBArg) << " vs "
+ << CB.getArgOperand(ArgNo) << "\n"
+ << "[AAPrivatizablePtr] " << CBArg << " : "
+ << CBACS.getCallArgOperandNo(CBArg) << " vs " << ArgNo << "\n";
+ });
+
+ if (CBArgNo != int(ArgNo))
+ continue;
+ const auto &CBArgPrivAA =
+ A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(CBArg));
+ if (CBArgPrivAA.isValidState()) {
+ auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType();
+ if (!CBArgPrivTy.hasValue())
+ continue;
+ if (CBArgPrivTy.getValue() == PrivatizableType)
+ continue;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+ << " cannot be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "callback ("
+ << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
+ << ").\n[AAPrivatizablePtr] for which the argument "
+ "privatization is not compatible.\n";
+ });
+ return false;
+ }
+ }
+ return true;
+ };
+
+ // Helper to check if for the given call site the associated argument is
+ // passed to a direct call where the privatization would be different.
+ auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) {
+ CallBase *DC = cast<CallBase>(ACS.getInstruction());
+ int DCArgNo = ACS.getCallArgOperandNo(ArgNo);
+ assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() &&
+ "Expected a direct call operand for callback call operand");
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+               << " check if it can be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "direct call of ("
+ << DCArgNo << "@" << DC->getCalledFunction()->getName()
+ << ").\n";
+ });
+
+ Function *DCCallee = DC->getCalledFunction();
+ if (unsigned(DCArgNo) < DCCallee->arg_size()) {
+ const auto &DCArgPrivAA = A.getAAFor<AAPrivatizablePtr>(
+ *this, IRPosition::argument(*DCCallee->getArg(DCArgNo)));
+ if (DCArgPrivAA.isValidState()) {
+ auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType();
+ if (!DCArgPrivTy.hasValue())
+ return true;
+ if (DCArgPrivTy.getValue() == PrivatizableType)
+ return true;
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+ << " cannot be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "direct call of ("
+ << ACS.getInstruction()->getCalledFunction()->getName()
+ << ").\n[AAPrivatizablePtr] for which the argument "
+ "privatization is not compatible.\n";
+ });
+ return false;
+ };
+
+ // Helper to check if the associated argument is used at the given abstract
+ // call site in a way that is incompatible with the privatization assumed
+ // here.
+ auto IsCompatiblePrivArgOfOtherCallSite = [&](AbstractCallSite ACS) {
+ if (ACS.isDirectCall())
+ return IsCompatiblePrivArgOfCallback(*ACS.getInstruction());
+ if (ACS.isCallbackCall())
+ return IsCompatiblePrivArgOfDirectCS(ACS);
+ return false;
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true,
+ AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+  /// Given a type to privatize \p PrivType, collect the constituents (which are
+ /// used) in \p ReplacementTypes.
+ static void
+ identifyReplacementTypes(Type *PrivType,
+ SmallVectorImpl<Type *> &ReplacementTypes) {
+ // TODO: For now we expand the privatization type to the fullest which can
+ // lead to dead arguments that need to be removed later.
+ assert(PrivType && "Expected privatizable type!");
+
+    // Traverse the type, extract constituent types on the outermost level.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++)
+ ReplacementTypes.push_back(PrivStructType->getElementType(u));
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+ ReplacementTypes.append(PrivArrayType->getNumElements(),
+ PrivArrayType->getElementType());
+ } else {
+ ReplacementTypes.push_back(PrivType);
+ }
+ }
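// Editorial note (not from this patch): examples of the expansion performed
// by identifyReplacementTypes above, written as LLVM IR types:
//   { i32, float }  ->  ReplacementTypes = [ i32, float ]
//   [4 x i8]        ->  ReplacementTypes = [ i8, i8, i8, i8 ]
//   double          ->  ReplacementTypes = [ double ]
// createInitialization and createReplacementValues below then store/load
// these constituents through GEPs at the matching struct/array offsets.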
+
+ /// Initialize \p Base according to the type \p PrivType at position \p IP.
+ /// The values needed are taken from the arguments of \p F starting at
+ /// position \p ArgNo.
+ static void createInitialization(Type *PrivType, Value &Base, Function &F,
+ unsigned ArgNo, Instruction &IP) {
+ assert(PrivType && "Expected privatizable type!");
+
+ IRBuilder<NoFolder> IRB(&IP);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ // Traverse the type, build GEPs and stores.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
+ Type *PointeeTy = PrivStructType->getElementType(u)->getPointerTo();
+ Value *Ptr = constructPointer(
+ PointeeTy, &Base, PrivStructLayout->getElementOffset(u), IRB, DL);
+ new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
+ }
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
Type *PointeeTy = PrivArrayType->getElementType();
Type *PointeePtrTy = PointeeTy->getPointerTo();
uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
- for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
- Value *Ptr =
- constructPointer(PointeePtrTy, &Base, u * PointeeTySize, IRB, DL);
- new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
- }
- } else {
- new StoreInst(F.getArg(ArgNo), &Base, &IP);
- }
- }
-
- /// Extract values from \p Base according to the type \p PrivType at the
- /// call position \p ACS. The values are appended to \p ReplacementValues.
- void createReplacementValues(Align Alignment, Type *PrivType,
- AbstractCallSite ACS, Value *Base,
- SmallVectorImpl<Value *> &ReplacementValues) {
- assert(Base && "Expected base value!");
- assert(PrivType && "Expected privatizable type!");
- Instruction *IP = ACS.getInstruction();
-
- IRBuilder<NoFolder> IRB(IP);
- const DataLayout &DL = IP->getModule()->getDataLayout();
-
- if (Base->getType()->getPointerElementType() != PrivType)
- Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(),
- "", ACS.getInstruction());
-
- // Traverse the type, build GEPs and loads.
- if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
- const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
- for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
- Type *PointeeTy = PrivStructType->getElementType(u);
- Value *Ptr =
- constructPointer(PointeeTy->getPointerTo(), Base,
- PrivStructLayout->getElementOffset(u), IRB, DL);
- LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
- L->setAlignment(Alignment);
- ReplacementValues.push_back(L);
- }
- } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
- Type *PointeeTy = PrivArrayType->getElementType();
- uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
- Type *PointeePtrTy = PointeeTy->getPointerTo();
- for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
- Value *Ptr =
- constructPointer(PointeePtrTy, Base, u * PointeeTySize, IRB, DL);
+ for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
+ Value *Ptr =
+ constructPointer(PointeePtrTy, &Base, u * PointeeTySize, IRB, DL);
+ new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
+ }
+ } else {
+ new StoreInst(F.getArg(ArgNo), &Base, &IP);
+ }
+ }
+
+ /// Extract values from \p Base according to the type \p PrivType at the
+ /// call position \p ACS. The values are appended to \p ReplacementValues.
+ void createReplacementValues(Align Alignment, Type *PrivType,
+ AbstractCallSite ACS, Value *Base,
+ SmallVectorImpl<Value *> &ReplacementValues) {
+ assert(Base && "Expected base value!");
+ assert(PrivType && "Expected privatizable type!");
+ Instruction *IP = ACS.getInstruction();
+
+ IRBuilder<NoFolder> IRB(IP);
+ const DataLayout &DL = IP->getModule()->getDataLayout();
+
+ if (Base->getType()->getPointerElementType() != PrivType)
+ Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(),
+ "", ACS.getInstruction());
+
+ // Traverse the type, build GEPs and loads.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
+ Type *PointeeTy = PrivStructType->getElementType(u);
+ Value *Ptr =
+ constructPointer(PointeeTy->getPointerTo(), Base,
+ PrivStructLayout->getElementOffset(u), IRB, DL);
+ LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+ Type *PointeeTy = PrivArrayType->getElementType();
+ uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
+ Type *PointeePtrTy = PointeeTy->getPointerTo();
+ for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
+ Value *Ptr =
+ constructPointer(PointeePtrTy, Base, u * PointeeTySize, IRB, DL);
LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
- L->setAlignment(Alignment);
- ReplacementValues.push_back(L);
- }
- } else {
- LoadInst *L = new LoadInst(PrivType, Base, "", IP);
- L->setAlignment(Alignment);
- ReplacementValues.push_back(L);
- }
- }
-
- /// See AbstractAttribute::manifest(...)
- ChangeStatus manifest(Attributor &A) override {
- if (!PrivatizableType.hasValue())
- return ChangeStatus::UNCHANGED;
- assert(PrivatizableType.getValue() && "Expected privatizable type!");
-
- // Collect all tail calls in the function as we cannot allow new allocas to
- // escape into tail recursion.
- // TODO: Be smarter about new allocas escaping into tail calls.
- SmallVector<CallInst *, 16> TailCalls;
- if (!A.checkForAllInstructions(
- [&](Instruction &I) {
- CallInst &CI = cast<CallInst>(I);
- if (CI.isTailCall())
- TailCalls.push_back(&CI);
- return true;
- },
- *this, {Instruction::Call}))
- return ChangeStatus::UNCHANGED;
-
- Argument *Arg = getAssociatedArgument();
- // Query AAAlign attribute for alignment of associated argument to
- // determine the best alignment of loads.
- const auto &AlignAA = A.getAAFor<AAAlign>(*this, IRPosition::value(*Arg));
-
- // Callback to repair the associated function. A new alloca is placed at the
- // beginning and initialized with the values passed through arguments. The
- // new alloca replaces the use of the old pointer argument.
- Attributor::ArgumentReplacementInfo::CalleeRepairCBTy FnRepairCB =
- [=](const Attributor::ArgumentReplacementInfo &ARI,
- Function &ReplacementFn, Function::arg_iterator ArgIt) {
- BasicBlock &EntryBB = ReplacementFn.getEntryBlock();
- Instruction *IP = &*EntryBB.getFirstInsertionPt();
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ } else {
+ LoadInst *L = new LoadInst(PrivType, Base, "", IP);
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ }
+
+ /// See AbstractAttribute::manifest(...)
+ ChangeStatus manifest(Attributor &A) override {
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ assert(PrivatizableType.getValue() && "Expected privatizable type!");
+
+ // Collect all tail calls in the function as we cannot allow new allocas to
+ // escape into tail recursion.
+ // TODO: Be smarter about new allocas escaping into tail calls.
+ SmallVector<CallInst *, 16> TailCalls;
+ if (!A.checkForAllInstructions(
+ [&](Instruction &I) {
+ CallInst &CI = cast<CallInst>(I);
+ if (CI.isTailCall())
+ TailCalls.push_back(&CI);
+ return true;
+ },
+ *this, {Instruction::Call}))
+ return ChangeStatus::UNCHANGED;
+
+ Argument *Arg = getAssociatedArgument();
+ // Query AAAlign attribute for alignment of associated argument to
+ // determine the best alignment of loads.
+ const auto &AlignAA = A.getAAFor<AAAlign>(*this, IRPosition::value(*Arg));
+
+ // Callback to repair the associated function. A new alloca is placed at the
+ // beginning and initialized with the values passed through arguments. The
+ // new alloca replaces the use of the old pointer argument.
+ Attributor::ArgumentReplacementInfo::CalleeRepairCBTy FnRepairCB =
+ [=](const Attributor::ArgumentReplacementInfo &ARI,
+ Function &ReplacementFn, Function::arg_iterator ArgIt) {
+ BasicBlock &EntryBB = ReplacementFn.getEntryBlock();
+ Instruction *IP = &*EntryBB.getFirstInsertionPt();
Instruction *AI = new AllocaInst(PrivatizableType.getValue(), 0,
Arg->getName() + ".priv", IP);
- createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn,
- ArgIt->getArgNo(), *IP);
+ createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn,
+ ArgIt->getArgNo(), *IP);
if (AI->getType() != Arg->getType())
AI =
BitCastInst::CreateBitOrPointerCast(AI, Arg->getType(), "", IP);
- Arg->replaceAllUsesWith(AI);
-
- for (CallInst *CI : TailCalls)
- CI->setTailCall(false);
- };
-
- // Callback to repair a call site of the associated function. The elements
- // of the privatizable type are loaded prior to the call and passed to the
- // new function version.
- Attributor::ArgumentReplacementInfo::ACSRepairCBTy ACSRepairCB =
- [=, &AlignAA](const Attributor::ArgumentReplacementInfo &ARI,
- AbstractCallSite ACS,
- SmallVectorImpl<Value *> &NewArgOperands) {
- // When no alignment is specified for the load instruction,
- // natural alignment is assumed.
- createReplacementValues(
- assumeAligned(AlignAA.getAssumedAlign()),
- PrivatizableType.getValue(), ACS,
- ACS.getCallArgOperand(ARI.getReplacedArg().getArgNo()),
- NewArgOperands);
- };
-
- // Collect the types that will replace the privatizable type in the function
- // signature.
- SmallVector<Type *, 16> ReplacementTypes;
- identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
-
- // Register a rewrite of the argument.
- if (A.registerFunctionSignatureRewrite(*Arg, ReplacementTypes,
- std::move(FnRepairCB),
- std::move(ACSRepairCB)))
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl {
- AAPrivatizablePtrFloating(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- virtual void initialize(Attributor &A) override {
- // TODO: We can privatize more than arguments.
- indicatePessimisticFixpoint();
- }
-
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("AAPrivatizablePtr(Floating|Returned|CallSiteReturned)::"
- "updateImpl will not be called");
- }
-
- /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
- Optional<Type *> identifyPrivatizableType(Attributor &A) override {
+ Arg->replaceAllUsesWith(AI);
+
+ for (CallInst *CI : TailCalls)
+ CI->setTailCall(false);
+ };
+
+ // Callback to repair a call site of the associated function. The elements
+ // of the privatizable type are loaded prior to the call and passed to the
+ // new function version.
+ Attributor::ArgumentReplacementInfo::ACSRepairCBTy ACSRepairCB =
+ [=, &AlignAA](const Attributor::ArgumentReplacementInfo &ARI,
+ AbstractCallSite ACS,
+ SmallVectorImpl<Value *> &NewArgOperands) {
+ // When no alignment is specified for the load instruction,
+ // natural alignment is assumed.
+ createReplacementValues(
+ assumeAligned(AlignAA.getAssumedAlign()),
+ PrivatizableType.getValue(), ACS,
+ ACS.getCallArgOperand(ARI.getReplacedArg().getArgNo()),
+ NewArgOperands);
+ };
+
+ // Collect the types that will replace the privatizable type in the function
+ // signature.
+ SmallVector<Type *, 16> ReplacementTypes;
+ identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
+
+ // Register a rewrite of the argument.
+ if (A.registerFunctionSignatureRewrite(*Arg, ReplacementTypes,
+ std::move(FnRepairCB),
+ std::move(ACSRepairCB)))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl {
+ AAPrivatizablePtrFloating(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ virtual void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("AAPrivatizablePtr(Floating|Returned|CallSiteReturned)::"
+ "updateImpl will not be called");
+ }
+
+ /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
+ Optional<Type *> identifyPrivatizableType(Attributor &A) override {
Value *Obj = getUnderlyingObject(&getAssociatedValue());
- if (!Obj) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] No underlying object found!\n");
- return nullptr;
- }
-
- if (auto *AI = dyn_cast<AllocaInst>(Obj))
- if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
- if (CI->isOne())
- return Obj->getType()->getPointerElementType();
- if (auto *Arg = dyn_cast<Argument>(Obj)) {
- auto &PrivArgAA =
- A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(*Arg));
- if (PrivArgAA.isAssumedPrivatizablePtr())
- return Obj->getType()->getPointerElementType();
- }
-
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Underlying object neither valid "
- "alloca nor privatizable argument: "
- << *Obj << "!\n");
- return nullptr;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrCallSiteArgument final
- : public AAPrivatizablePtrFloating {
- AAPrivatizablePtrCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (getIRPosition().hasAttr(Attribute::ByVal))
- indicateOptimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- PrivatizableType = identifyPrivatizableType(A);
- if (!PrivatizableType.hasValue())
- return ChangeStatus::UNCHANGED;
- if (!PrivatizableType.getValue())
- return indicatePessimisticFixpoint();
-
- const IRPosition &IRP = getIRPosition();
- auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP);
- if (!NoCaptureAA.isAssumedNoCapture()) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might be captured!\n");
- return indicatePessimisticFixpoint();
- }
-
- auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP);
- if (!NoAliasAA.isAssumedNoAlias()) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might alias!\n");
- return indicatePessimisticFixpoint();
- }
-
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, IRP);
- if (!MemBehaviorAA.isAssumedReadOnly()) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer is written!\n");
- return indicatePessimisticFixpoint();
- }
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrCallSiteReturned final
- : public AAPrivatizablePtrFloating {
- AAPrivatizablePtrCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: We can privatize more than arguments.
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrReturned final : public AAPrivatizablePtrFloating {
- AAPrivatizablePtrReturned(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: We can privatize more than arguments.
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(privatizable_ptr);
- }
-};
-
-/// -------------------- Memory Behavior Attributes ----------------------------
-/// Includes read-none, read-only, and write-only.
-/// ----------------------------------------------------------------------------
-struct AAMemoryBehaviorImpl : public AAMemoryBehavior {
- AAMemoryBehaviorImpl(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehavior(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- getKnownStateFromValue(getIRPosition(), getState());
+ if (!Obj) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] No underlying object found!\n");
+ return nullptr;
+ }
+
+ if (auto *AI = dyn_cast<AllocaInst>(Obj))
+ if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
+ if (CI->isOne())
+ return Obj->getType()->getPointerElementType();
+ if (auto *Arg = dyn_cast<Argument>(Obj)) {
+ auto &PrivArgAA =
+ A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(*Arg));
+ if (PrivArgAA.isAssumedPrivatizablePtr())
+ return Obj->getType()->getPointerElementType();
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Underlying object neither valid "
+ "alloca nor privatizable argument: "
+ << *Obj << "!\n");
+ return nullptr;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrCallSiteArgument final
+ : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (getIRPosition().hasAttr(Attribute::ByVal))
+ indicateOptimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ PrivatizableType = identifyPrivatizableType(A);
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ if (!PrivatizableType.getValue())
+ return indicatePessimisticFixpoint();
+
+ const IRPosition &IRP = getIRPosition();
+ auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP);
+ if (!NoCaptureAA.isAssumedNoCapture()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might be captured!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP);
+ if (!NoAliasAA.isAssumedNoAlias()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might alias!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, IRP);
+ if (!MemBehaviorAA.isAssumedReadOnly()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer is written!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrCallSiteReturned final
+ : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrReturned final : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrReturned(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(privatizable_ptr);
+ }
+};
+
+/// -------------------- Memory Behavior Attributes ----------------------------
+/// Includes read-none, read-only, and write-only.
+/// ----------------------------------------------------------------------------
+struct AAMemoryBehaviorImpl : public AAMemoryBehavior {
+ AAMemoryBehaviorImpl(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehavior(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ getKnownStateFromValue(getIRPosition(), getState());
AAMemoryBehavior::initialize(A);
- }
-
- /// Return the memory behavior information encoded in the IR for \p IRP.
- static void getKnownStateFromValue(const IRPosition &IRP,
- BitIntegerState &State,
- bool IgnoreSubsumingPositions = false) {
- SmallVector<Attribute, 2> Attrs;
- IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
- for (const Attribute &Attr : Attrs) {
- switch (Attr.getKindAsEnum()) {
- case Attribute::ReadNone:
- State.addKnownBits(NO_ACCESSES);
- break;
- case Attribute::ReadOnly:
- State.addKnownBits(NO_WRITES);
- break;
- case Attribute::WriteOnly:
- State.addKnownBits(NO_READS);
- break;
- default:
- llvm_unreachable("Unexpected attribute!");
- }
- }
-
- if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) {
- if (!I->mayReadFromMemory())
- State.addKnownBits(NO_READS);
- if (!I->mayWriteToMemory())
- State.addKnownBits(NO_WRITES);
- }
- }
-
- /// See AbstractAttribute::getDeducedAttributes(...).
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- assert(Attrs.size() == 0);
- if (isAssumedReadNone())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
- else if (isAssumedReadOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly));
- else if (isAssumedWriteOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly));
- assert(Attrs.size() <= 1);
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- if (hasAttr(Attribute::ReadNone, /* IgnoreSubsumingPositions */ true))
- return ChangeStatus::UNCHANGED;
-
- const IRPosition &IRP = getIRPosition();
-
- // Check if we would improve the existing attributes first.
- SmallVector<Attribute, 4> DeducedAttrs;
- getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
- if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
- return IRP.hasAttr(Attr.getKindAsEnum(),
- /* IgnoreSubsumingPositions */ true);
- }))
- return ChangeStatus::UNCHANGED;
-
- // Clear existing attributes.
- IRP.removeAttrs(AttrKinds);
-
- // Use the generic manifest method.
- return IRAttribute::manifest(A);
- }
-
- /// See AbstractState::getAsStr().
- const std::string getAsStr() const override {
- if (isAssumedReadNone())
- return "readnone";
- if (isAssumedReadOnly())
- return "readonly";
- if (isAssumedWriteOnly())
- return "writeonly";
- return "may-read/write";
- }
-
- /// The set of IR attributes AAMemoryBehavior deals with.
- static const Attribute::AttrKind AttrKinds[3];
-};
-
-const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = {
- Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly};
-
-/// Memory behavior attribute for a floating value.
-struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl {
- AAMemoryBehaviorFloating(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryBehaviorImpl::initialize(A);
+ }
+
+ /// Return the memory behavior information encoded in the IR for \p IRP.
+ static void getKnownStateFromValue(const IRPosition &IRP,
+ BitIntegerState &State,
+ bool IgnoreSubsumingPositions = false) {
+ SmallVector<Attribute, 2> Attrs;
+ IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
+ for (const Attribute &Attr : Attrs) {
+ switch (Attr.getKindAsEnum()) {
+ case Attribute::ReadNone:
+ State.addKnownBits(NO_ACCESSES);
+ break;
+ case Attribute::ReadOnly:
+ State.addKnownBits(NO_WRITES);
+ break;
+ case Attribute::WriteOnly:
+ State.addKnownBits(NO_READS);
+ break;
+ default:
+ llvm_unreachable("Unexpected attribute!");
+ }
+ }
+
+ if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) {
+ if (!I->mayReadFromMemory())
+ State.addKnownBits(NO_READS);
+ if (!I->mayWriteToMemory())
+ State.addKnownBits(NO_WRITES);
+ }
+ }
+
+ /// See AbstractAttribute::getDeducedAttributes(...).
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ assert(Attrs.size() == 0);
+ if (isAssumedReadNone())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
+ else if (isAssumedReadOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly));
+ else if (isAssumedWriteOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly));
+ assert(Attrs.size() <= 1);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ if (hasAttr(Attribute::ReadNone, /* IgnoreSubsumingPositions */ true))
+ return ChangeStatus::UNCHANGED;
+
+ const IRPosition &IRP = getIRPosition();
+
+ // Check if we would improve the existing attributes first.
+ SmallVector<Attribute, 4> DeducedAttrs;
+ getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
+ if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
+ return IRP.hasAttr(Attr.getKindAsEnum(),
+ /* IgnoreSubsumingPositions */ true);
+ }))
+ return ChangeStatus::UNCHANGED;
+
+ // Clear existing attributes.
+ IRP.removeAttrs(AttrKinds);
+
+ // Use the generic manifest method.
+ return IRAttribute::manifest(A);
+ }
+
+ /// See AbstractState::getAsStr().
+ const std::string getAsStr() const override {
+ if (isAssumedReadNone())
+ return "readnone";
+ if (isAssumedReadOnly())
+ return "readonly";
+ if (isAssumedWriteOnly())
+ return "writeonly";
+ return "may-read/write";
+ }
+
+ /// The set of IR attributes AAMemoryBehavior deals with.
+ static const Attribute::AttrKind AttrKinds[3];
+};
+
+const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = {
+ Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly};
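The readnone/readonly/writeonly reasoning above runs on a two-level known/assumed bit lattice. The following is a simplified standalone sketch of that lattice, not the actual LLVM BitIntegerState API:

#include <cassert>
#include <cstdint>

// Simplified model: "known" bits are proven facts, "assumed" bits are
// optimistic and may only shrink, and known bits are never dropped from the
// assumed set.
struct MemBehaviorState {
  static constexpr uint8_t NO_READS = 1 << 0;
  static constexpr uint8_t NO_WRITES = 1 << 1;
  static constexpr uint8_t NO_ACCESSES = NO_READS | NO_WRITES;

  uint8_t Known = 0;             // seeded from existing IR attributes
  uint8_t Assumed = NO_ACCESSES; // start optimistic ("readnone")

  void addKnownBits(uint8_t Bits) { Known |= Bits; Assumed |= Bits; }
  void removeAssumedBits(uint8_t Bits) { Assumed = (Assumed & ~Bits) | Known; }
  void intersectAssumedBits(uint8_t Bits) { Assumed = (Assumed & Bits) | Known; }

  bool isAssumedReadOnly() const { return Assumed & NO_WRITES; }
  bool isAssumedWriteOnly() const { return Assumed & NO_READS; }
  bool isAssumedReadNone() const { return (Assumed & NO_ACCESSES) == NO_ACCESSES; }
};

int main() {
  MemBehaviorState S;
  S.removeAssumedBits(MemBehaviorState::NO_READS); // a load was observed
  assert(S.isAssumedReadOnly() && !S.isAssumedReadNone());
  return 0;
}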
+
+/// Memory behavior attribute for a floating value.
+struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl {
+ AAMemoryBehaviorFloating(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryBehaviorImpl::initialize(A);
addUsesOf(A, getAssociatedValue());
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FLOATING_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_FLOATING_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_FLOATING_ATTR(writeonly)
- }
-
-private:
- /// Return true if users of \p UserI might access the underlying
- /// variable/location described by \p U and should therefore be analyzed.
- bool followUsersOfUseIn(Attributor &A, const Use *U,
- const Instruction *UserI);
-
- /// Update the state according to the effect of use \p U in \p UserI.
- void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI);
-
-protected:
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FLOATING_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_FLOATING_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_FLOATING_ATTR(writeonly)
+ }
+
+private:
+ /// Return true if users of \p UserI might access the underlying
+ /// variable/location described by \p U and should therefore be analyzed.
+ bool followUsersOfUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI);
+
+ /// Update the state according to the effect of use \p U in \p UserI.
+ void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI);
+
+protected:
/// Add the uses of \p V to the `Uses` set we look at during the update step.
void addUsesOf(Attributor &A, const Value &V);
- /// Container for (transitive) uses of the associated argument.
+ /// Container for (transitive) uses of the associated argument.
SmallVector<const Use *, 8> Uses;
/// Set to remember the uses we already traversed.
SmallPtrSet<const Use *, 8> Visited;
-};
-
-/// Memory behavior attribute for function argument.
-struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating {
- AAMemoryBehaviorArgument(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- const IRPosition &IRP = getIRPosition();
- // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we
- // can query it when we use has/getAttr. That would allow us to reuse the
- // initialize of the base class here.
- bool HasByVal =
- IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true);
- getKnownStateFromValue(IRP, getState(),
- /* IgnoreSubsumingPositions */ HasByVal);
-
- // Initialize the use vector with all direct uses of the associated value.
- Argument *Arg = getAssociatedArgument();
- if (!Arg || !A.isFunctionIPOAmendable(*(Arg->getParent()))) {
- indicatePessimisticFixpoint();
- } else {
+};
+
+/// Memory behavior attribute for function argument.
+struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating {
+ AAMemoryBehaviorArgument(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ const IRPosition &IRP = getIRPosition();
+ // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we
+ // can query it when we use has/getAttr. That would allow us to reuse the
+ // initialize of the base class here.
+ bool HasByVal =
+ IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true);
+ getKnownStateFromValue(IRP, getState(),
+ /* IgnoreSubsumingPositions */ HasByVal);
+
+ // Initialize the use vector with all direct uses of the associated value.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg || !A.isFunctionIPOAmendable(*(Arg->getParent()))) {
+ indicatePessimisticFixpoint();
+ } else {
addUsesOf(A, *Arg);
- }
- }
-
- ChangeStatus manifest(Attributor &A) override {
- // TODO: Pointer arguments are not supported on vectors of pointers yet.
- if (!getAssociatedValue().getType()->isPointerTy())
- return ChangeStatus::UNCHANGED;
-
- // TODO: From readattrs.ll: "inalloca parameters are always
- // considered written"
- if (hasAttr({Attribute::InAlloca, Attribute::Preallocated})) {
- removeKnownBits(NO_WRITES);
- removeAssumedBits(NO_WRITES);
- }
- return AAMemoryBehaviorFloating::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_ARG_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_ARG_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_ARG_ATTR(writeonly)
- }
-};
-
-struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument {
- AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorArgument(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
+ }
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ // TODO: Pointer arguments are not supported on vectors of pointers yet.
+ if (!getAssociatedValue().getType()->isPointerTy())
+ return ChangeStatus::UNCHANGED;
+
+ // TODO: From readattrs.ll: "inalloca parameters are always
+ // considered written"
+ if (hasAttr({Attribute::InAlloca, Attribute::Preallocated})) {
+ removeKnownBits(NO_WRITES);
+ removeAssumedBits(NO_WRITES);
+ }
+ return AAMemoryBehaviorFloating::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_ARG_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_ARG_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_ARG_ATTR(writeonly)
+ }
+};
+
+struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument {
+ AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorArgument(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
// If we don't have an associated attribute this is either a variadic call
// or an indirect call, either way, nothing to do here.
Argument *Arg = getAssociatedArgument();
if (!Arg) {
indicatePessimisticFixpoint();
return;
- }
+ }
if (Arg->hasByValAttr()) {
addKnownBits(NO_WRITES);
removeKnownBits(NO_READS);
removeAssumedBits(NO_READS);
}
- AAMemoryBehaviorArgument::initialize(A);
+ AAMemoryBehaviorArgument::initialize(A);
if (getAssociatedFunction()->isDeclaration())
indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CSARG_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_CSARG_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_CSARG_ATTR(writeonly)
- }
-};
-
-/// Memory behavior attribute for a call site return position.
-struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
- AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorFloating(IRP, A) {}
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CSARG_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_CSARG_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_CSARG_ATTR(writeonly)
+ }
+};
+
+/// Memory behavior attribute for a call site return position.
+struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
+ AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorFloating(IRP, A) {}
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AAMemoryBehaviorImpl::initialize(A);
@@ -6046,181 +6046,181 @@ struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
indicatePessimisticFixpoint();
}
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // We do not annotate returned values.
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// An AA to represent the memory behavior function attributes.
-struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl {
- AAMemoryBehaviorFunction(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- Function &F = cast<Function>(getAnchorValue());
- if (isAssumedReadNone()) {
- F.removeFnAttr(Attribute::ArgMemOnly);
- F.removeFnAttr(Attribute::InaccessibleMemOnly);
- F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
- }
- return AAMemoryBehaviorImpl::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FN_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_FN_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_FN_ATTR(writeonly)
- }
-};
-
-/// AAMemoryBehavior attribute for call sites.
-struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl {
- AAMemoryBehaviorCallSite(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryBehaviorImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // We do not annotate returned values.
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// An AA to represent the memory behavior function attributes.
+struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl {
+ AAMemoryBehaviorFunction(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override;
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ Function &F = cast<Function>(getAnchorValue());
+ if (isAssumedReadNone()) {
+ F.removeFnAttr(Attribute::ArgMemOnly);
+ F.removeFnAttr(Attribute::InaccessibleMemOnly);
+ F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+ }
+ return AAMemoryBehaviorImpl::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FN_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_FN_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_FN_ATTR(writeonly)
+ }
+};
+
+/// AAMemoryBehavior attribute for call sites.
+struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl {
+ AAMemoryBehaviorCallSite(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryBehaviorImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CS_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_CS_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_CS_ATTR(writeonly)
- }
-};
-
-ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) {
-
- // The current assumed state used to determine a change.
- auto AssumedState = getAssumed();
-
- auto CheckRWInst = [&](Instruction &I) {
- // If the instruction has an own memory behavior state, use it to restrict
- // the local state. No further analysis is required as the other memory
- // state is as optimistic as it gets.
- if (const auto *CB = dyn_cast<CallBase>(&I)) {
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, IRPosition::callsite_function(*CB));
- intersectAssumedBits(MemBehaviorAA.getAssumed());
- return !isAtFixpoint();
- }
-
- // Remove access kind modifiers if necessary.
- if (I.mayReadFromMemory())
- removeAssumedBits(NO_READS);
- if (I.mayWriteToMemory())
- removeAssumedBits(NO_WRITES);
- return !isAtFixpoint();
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
- return indicatePessimisticFixpoint();
-
- return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
- : ChangeStatus::UNCHANGED;
-}
-
-ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) {
-
- const IRPosition &IRP = getIRPosition();
- const IRPosition &FnPos = IRPosition::function_scope(IRP);
- AAMemoryBehavior::StateType &S = getState();
-
- // First, check the function scope. We take the known information and we avoid
- // work if the assumed information implies the current assumed information for
- // this attribute. This is valid for all but byval arguments.
- Argument *Arg = IRP.getAssociatedArgument();
- AAMemoryBehavior::base_t FnMemAssumedState =
- AAMemoryBehavior::StateType::getWorstState();
- if (!Arg || !Arg->hasByValAttr()) {
- const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(
- *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- FnMemAssumedState = FnMemAA.getAssumed();
- S.addKnownBits(FnMemAA.getKnown());
- if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed())
- return ChangeStatus::UNCHANGED;
- }
-
- // Make sure the value is not captured (except through "return"); if
- // it is, any information derived would be irrelevant anyway as we cannot
- // check the potential aliases introduced by the capture. However, there is
- // no need to fall back to anything less optimistic than the function state.
- const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- S.intersectAssumedBits(FnMemAssumedState);
- return ChangeStatus::CHANGED;
- }
-
- // The current assumed state used to determine a change.
- auto AssumedState = S.getAssumed();
-
- // Liveness information to exclude dead users.
- // TODO: Take the FnPos once we have call site specific liveness information.
- const auto &LivenessAA = A.getAAFor<AAIsDead>(
- *this, IRPosition::function(*IRP.getAssociatedFunction()),
- /* TrackDependence */ false);
-
- // Visit and expand uses until all are analyzed or a fixpoint is reached.
- for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) {
- const Use *U = Uses[i];
- Instruction *UserI = cast<Instruction>(U->getUser());
- LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI
- << " [Dead: " << (A.isAssumedDead(*U, this, &LivenessAA))
- << "]\n");
- if (A.isAssumedDead(*U, this, &LivenessAA))
- continue;
-
- // Droppable users, e.g., llvm::assume, do not actually perform any action.
- if (UserI->isDroppable())
- continue;
-
- // Check if the users of UserI should also be visited.
- if (followUsersOfUseIn(A, U, UserI))
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CS_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_CS_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_CS_ATTR(writeonly)
+ }
+};
+
+ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) {
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = getAssumed();
+
+ auto CheckRWInst = [&](Instruction &I) {
+ // If the instruction has an own memory behavior state, use it to restrict
+ // the local state. No further analysis is required as the other memory
+ // state is as optimistic as it gets.
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, IRPosition::callsite_function(*CB));
+ intersectAssumedBits(MemBehaviorAA.getAssumed());
+ return !isAtFixpoint();
+ }
+
+ // Remove access kind modifiers if necessary.
+ if (I.mayReadFromMemory())
+ removeAssumedBits(NO_READS);
+ if (I.mayWriteToMemory())
+ removeAssumedBits(NO_WRITES);
+ return !isAtFixpoint();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
+ return indicatePessimisticFixpoint();
+
+ return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
+ : ChangeStatus::UNCHANGED;
+}
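The function above follows the usual fixpoint-update shape: snapshot the assumed state, clamp it against every call site and read/write instruction, and report a change only if a bit was lost. A standalone caricature of that pattern, with hypothetical InstEffect inputs rather than LLVM IR:

#include <vector>

enum : unsigned { NO_READS = 1u, NO_WRITES = 2u };

struct InstEffect { bool MayRead, MayWrite; };

// Returns true ("CHANGED") iff clamping the assumed bits against the
// instruction effects removed something, mirroring the shape of
// AAMemoryBehaviorFunction::updateImpl.
bool update(unsigned &Assumed, const std::vector<InstEffect> &Insts) {
  unsigned Before = Assumed;
  for (const InstEffect &I : Insts) {
    if (I.MayRead)
      Assumed &= ~NO_READS;
    if (I.MayWrite)
      Assumed &= ~NO_WRITES;
    if (Assumed == 0) // already at the pessimistic fixpoint
      break;
  }
  return Assumed != Before;
}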
+
+ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) {
+
+ const IRPosition &IRP = getIRPosition();
+ const IRPosition &FnPos = IRPosition::function_scope(IRP);
+ AAMemoryBehavior::StateType &S = getState();
+
+ // First, check the function scope. We take the known information and we avoid
+ // work if the assumed information implies the current assumed information for
+ // this attribute. This is valid for all but byval arguments.
+ Argument *Arg = IRP.getAssociatedArgument();
+ AAMemoryBehavior::base_t FnMemAssumedState =
+ AAMemoryBehavior::StateType::getWorstState();
+ if (!Arg || !Arg->hasByValAttr()) {
+ const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(
+ *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ FnMemAssumedState = FnMemAA.getAssumed();
+ S.addKnownBits(FnMemAA.getKnown());
+ if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed())
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // Make sure the value is not captured (except through "return"); if
+ // it is, any information derived would be irrelevant anyway as we cannot
+ // check the potential aliases introduced by the capture. However, there is
+ // no need to fall back to anything less optimistic than the function state.
+ const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ S.intersectAssumedBits(FnMemAssumedState);
+ return ChangeStatus::CHANGED;
+ }
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = S.getAssumed();
+
+ // Liveness information to exclude dead users.
+ // TODO: Take the FnPos once we have call site specific liveness information.
+ const auto &LivenessAA = A.getAAFor<AAIsDead>(
+ *this, IRPosition::function(*IRP.getAssociatedFunction()),
+ /* TrackDependence */ false);
+
+ // Visit and expand uses until all are analyzed or a fixpoint is reached.
+ for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) {
+ const Use *U = Uses[i];
+ Instruction *UserI = cast<Instruction>(U->getUser());
+ LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI
+ << " [Dead: " << (A.isAssumedDead(*U, this, &LivenessAA))
+ << "]\n");
+ if (A.isAssumedDead(*U, this, &LivenessAA))
+ continue;
+
+ // Droppable users, e.g., llvm::assume, do not actually perform any action.
+ if (UserI->isDroppable())
+ continue;
+
+ // Check if the users of UserI should also be visited.
+ if (followUsersOfUseIn(A, U, UserI))
addUsesOf(A, *UserI);
-
- // If UserI might touch memory we analyze the use in detail.
- if (UserI->mayReadOrWriteMemory())
- analyzeUseIn(A, U, UserI);
- }
-
- return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
- : ChangeStatus::UNCHANGED;
-}
-
+
+ // If UserI might touch memory we analyze the use in detail.
+ if (UserI->mayReadOrWriteMemory())
+ analyzeUseIn(A, U, UserI);
+ }
+
+ return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
+ : ChangeStatus::UNCHANGED;
+}
+
void AAMemoryBehaviorFloating::addUsesOf(Attributor &A, const Value &V) {
SmallVector<const Use *, 8> WL;
for (const Use &U : V.uses())
@@ -6243,320 +6243,320 @@ void AAMemoryBehaviorFloating::addUsesOf(Attributor &A, const Value &V) {
}
}
-bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U,
- const Instruction *UserI) {
- // The loaded value is unrelated to the pointer argument, no need to
- // follow the users of the load.
- if (isa<LoadInst>(UserI))
- return false;
-
- // By default we follow all uses assuming UserI might leak information on U;
- // we have special handling for call site operands though.
- const auto *CB = dyn_cast<CallBase>(UserI);
- if (!CB || !CB->isArgOperand(U))
- return true;
-
- // If the use is a call argument known not to be captured, the users of
- // the call do not need to be visited because they have to be unrelated to
- // the input. Note that this check is not trivial even though we disallow
- // general capturing of the underlying argument. The reason is that the
- // call might capture the argument "through return", which we allow and for
- // which we need to check call users.
- if (U->get()->getType()->isPointerTy()) {
- unsigned ArgNo = CB->getArgOperandNo(U);
- const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRPosition::callsite_argument(*CB, ArgNo),
- /* TrackDependence */ true, DepClassTy::OPTIONAL);
- return !ArgNoCaptureAA.isAssumedNoCapture();
- }
-
- return true;
-}
-
-void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U,
- const Instruction *UserI) {
- assert(UserI->mayReadOrWriteMemory());
-
- switch (UserI->getOpcode()) {
- default:
- // TODO: Handle all atomics and other side-effect operations we know of.
- break;
- case Instruction::Load:
- // Loads cause the NO_READS property to disappear.
- removeAssumedBits(NO_READS);
- return;
-
- case Instruction::Store:
- // Stores cause the NO_WRITES property to disappear if the use is the
- // pointer operand. Note that we do assume that capturing was taken care of
- // somewhere else.
- if (cast<StoreInst>(UserI)->getPointerOperand() == U->get())
- removeAssumedBits(NO_WRITES);
- return;
-
- case Instruction::Call:
- case Instruction::CallBr:
- case Instruction::Invoke: {
- // For call sites we look at the argument memory behavior attribute (this
- // could be recursive!) in order to restrict our own state.
- const auto *CB = cast<CallBase>(UserI);
-
- // Give up on operand bundles.
- if (CB->isBundleOperand(U)) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // Calling a function does read the function pointer, maybe write it if the
- // function is self-modifying.
- if (CB->isCallee(U)) {
- removeAssumedBits(NO_READS);
- break;
- }
-
- // Adjust the possible access behavior based on the information on the
- // argument.
- IRPosition Pos;
- if (U->get()->getType()->isPointerTy())
- Pos = IRPosition::callsite_argument(*CB, CB->getArgOperandNo(U));
- else
- Pos = IRPosition::callsite_function(*CB);
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, Pos,
- /* TrackDependence */ true, DepClassTy::OPTIONAL);
- // "assumed" has at most the same bits as the MemBehaviorAA assumed
- // and at least "known".
- intersectAssumedBits(MemBehaviorAA.getAssumed());
- return;
- }
- };
-
- // Generally, look at the "may-properties" and adjust the assumed state if we
- // did not trigger special handling before.
- if (UserI->mayReadFromMemory())
- removeAssumedBits(NO_READS);
- if (UserI->mayWriteToMemory())
- removeAssumedBits(NO_WRITES);
-}
-
-} // namespace
-
-/// -------------------- Memory Locations Attributes ---------------------------
-/// Includes read-none, argmemonly, inaccessiblememonly,
-/// inaccessiblememorargmemonly
-/// ----------------------------------------------------------------------------
-
-std::string AAMemoryLocation::getMemoryLocationsAsStr(
- AAMemoryLocation::MemoryLocationsKind MLK) {
- if (0 == (MLK & AAMemoryLocation::NO_LOCATIONS))
- return "all memory";
- if (MLK == AAMemoryLocation::NO_LOCATIONS)
- return "no memory";
- std::string S = "memory:";
- if (0 == (MLK & AAMemoryLocation::NO_LOCAL_MEM))
- S += "stack,";
- if (0 == (MLK & AAMemoryLocation::NO_CONST_MEM))
- S += "constant,";
- if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_INTERNAL_MEM))
- S += "internal global,";
- if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_EXTERNAL_MEM))
- S += "external global,";
- if (0 == (MLK & AAMemoryLocation::NO_ARGUMENT_MEM))
- S += "argument,";
- if (0 == (MLK & AAMemoryLocation::NO_INACCESSIBLE_MEM))
- S += "inaccessible,";
- if (0 == (MLK & AAMemoryLocation::NO_MALLOCED_MEM))
- S += "malloced,";
- if (0 == (MLK & AAMemoryLocation::NO_UNKOWN_MEM))
- S += "unknown,";
- S.pop_back();
- return S;
-}
-
-namespace {
-struct AAMemoryLocationImpl : public AAMemoryLocation {
-
- AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A)
- : AAMemoryLocation(IRP, A), Allocator(A.Allocator) {
- for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
- AccessKind2Accesses[u] = nullptr;
- }
-
- ~AAMemoryLocationImpl() {
- // The AccessSets are allocated via a BumpPtrAllocator, so we call
- // the destructors manually.
- for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
- if (AccessKind2Accesses[u])
- AccessKind2Accesses[u]->~AccessSet();
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- getKnownStateFromValue(A, getIRPosition(), getState());
+bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI) {
+ // The loaded value is unrelated to the pointer argument, no need to
+ // follow the users of the load.
+ if (isa<LoadInst>(UserI))
+ return false;
+
+ // By default we follow all uses assuming UserI might leak information on U;
+ // we have special handling for call site operands though.
+ const auto *CB = dyn_cast<CallBase>(UserI);
+ if (!CB || !CB->isArgOperand(U))
+ return true;
+
+ // If the use is a call argument known not to be captured, the users of
+ // the call do not need to be visited because they have to be unrelated to
+ // the input. Note that this check is not trivial even though we disallow
+ // general capturing of the underlying argument. The reason is that the
+ // call might capture the argument "through return", which we allow and for
+ // which we need to check call users.
+ if (U->get()->getType()->isPointerTy()) {
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo),
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ return !ArgNoCaptureAA.isAssumedNoCapture();
+ }
+
+ return true;
+}
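Roughly, the decision above sorts users of a pointer argument as in the following source-level sketch; the helper declarations are hypothetical and assume nocapture has already been deduced for readsOnly's parameter:

// Hypothetical helpers, only to illustrate which users are followed.
extern void readsOnly(const int *P);
extern int *stash(int *P); // may capture / return its argument

int example(int *P) {
  int V = *P;        // load: users of the loaded value V are NOT followed
  readsOnly(P);      // nocapture call argument: users of the call not followed
  int *Q = stash(P); // may capture "through return": users of the call ARE
                     // followed, so the store below still counts against P
  *Q = V;            // this is what would clear NO_WRITES for P
  return V;
}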
+
+void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI) {
+ assert(UserI->mayReadOrWriteMemory());
+
+ switch (UserI->getOpcode()) {
+ default:
+ // TODO: Handle all atomics and other side-effect operations we know of.
+ break;
+ case Instruction::Load:
+ // Loads cause the NO_READS property to disappear.
+ removeAssumedBits(NO_READS);
+ return;
+
+ case Instruction::Store:
+ // Stores cause the NO_WRITES property to disappear if the use is the
+ // pointer operand. Note that we do assume that capturing was taken care of
+ // somewhere else.
+ if (cast<StoreInst>(UserI)->getPointerOperand() == U->get())
+ removeAssumedBits(NO_WRITES);
+ return;
+
+ case Instruction::Call:
+ case Instruction::CallBr:
+ case Instruction::Invoke: {
+ // For call sites we look at the argument memory behavior attribute (this
+ // could be recursive!) in order to restrict our own state.
+ const auto *CB = cast<CallBase>(UserI);
+
+ // Give up on operand bundles.
+ if (CB->isBundleOperand(U)) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // Calling a function does read the function pointer, maybe write it if the
+ // function is self-modifying.
+ if (CB->isCallee(U)) {
+ removeAssumedBits(NO_READS);
+ break;
+ }
+
+ // Adjust the possible access behavior based on the information on the
+ // argument.
+ IRPosition Pos;
+ if (U->get()->getType()->isPointerTy())
+ Pos = IRPosition::callsite_argument(*CB, CB->getArgOperandNo(U));
+ else
+ Pos = IRPosition::callsite_function(*CB);
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, Pos,
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ // "assumed" has at most the same bits as the MemBehaviorAA assumed
+ // and at least "known".
+ intersectAssumedBits(MemBehaviorAA.getAssumed());
+ return;
+ }
+ };
+
+ // Generally, look at the "may-properties" and adjust the assumed state if we
+ // did not trigger special handling before.
+ if (UserI->mayReadFromMemory())
+ removeAssumedBits(NO_READS);
+ if (UserI->mayWriteToMemory())
+ removeAssumedBits(NO_WRITES);
+}
+
+} // namespace
+
+/// -------------------- Memory Locations Attributes ---------------------------
+/// Includes read-none, argmemonly, inaccessiblememonly,
+/// inaccessiblememorargmemonly
+/// ----------------------------------------------------------------------------
+
+std::string AAMemoryLocation::getMemoryLocationsAsStr(
+ AAMemoryLocation::MemoryLocationsKind MLK) {
+ if (0 == (MLK & AAMemoryLocation::NO_LOCATIONS))
+ return "all memory";
+ if (MLK == AAMemoryLocation::NO_LOCATIONS)
+ return "no memory";
+ std::string S = "memory:";
+ if (0 == (MLK & AAMemoryLocation::NO_LOCAL_MEM))
+ S += "stack,";
+ if (0 == (MLK & AAMemoryLocation::NO_CONST_MEM))
+ S += "constant,";
+ if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_INTERNAL_MEM))
+ S += "internal global,";
+ if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_EXTERNAL_MEM))
+ S += "external global,";
+ if (0 == (MLK & AAMemoryLocation::NO_ARGUMENT_MEM))
+ S += "argument,";
+ if (0 == (MLK & AAMemoryLocation::NO_INACCESSIBLE_MEM))
+ S += "inaccessible,";
+ if (0 == (MLK & AAMemoryLocation::NO_MALLOCED_MEM))
+ S += "malloced,";
+ if (0 == (MLK & AAMemoryLocation::NO_UNKOWN_MEM))
+ S += "unknown,";
+ S.pop_back();
+ return S;
+}
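The encoding is easy to misread: a set NO_* bit means the location is proven not accessed, so the string lists the complement. A trimmed-down analogue with a reduced location set, shown only to illustrate the expected output:

#include <iostream>
#include <string>

// Simplified stand-in for the real MemoryLocationsKind bit set.
enum : unsigned {
  NO_LOCAL = 1, NO_CONST = 2, NO_ARG = 4, NO_UNKNOWN = 8,
  NO_LOCATIONS = NO_LOCAL | NO_CONST | NO_ARG | NO_UNKNOWN
};

std::string asStr(unsigned MLK) {
  if ((MLK & NO_LOCATIONS) == 0) return "all memory";
  if (MLK == NO_LOCATIONS) return "no memory";
  std::string S = "memory:";
  if (!(MLK & NO_LOCAL)) S += "stack,";
  if (!(MLK & NO_CONST)) S += "constant,";
  if (!(MLK & NO_ARG)) S += "argument,";
  if (!(MLK & NO_UNKNOWN)) S += "unknown,";
  S.pop_back(); // drop the trailing comma
  return S;
}

int main() {
  // A function assumed to touch only its arguments and the stack:
  std::cout << asStr(NO_CONST | NO_UNKNOWN) << "\n"; // "memory:stack,argument"
}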
+
+namespace {
+struct AAMemoryLocationImpl : public AAMemoryLocation {
+
+ AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocation(IRP, A), Allocator(A.Allocator) {
+ for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
+ AccessKind2Accesses[u] = nullptr;
+ }
+
+ ~AAMemoryLocationImpl() {
+ // The AccessSets are allocated via a BumpPtrAllocator, so we call
+ // the destructors manually.
+ for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
+ if (AccessKind2Accesses[u])
+ AccessKind2Accesses[u]->~AccessSet();
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ getKnownStateFromValue(A, getIRPosition(), getState());
AAMemoryLocation::initialize(A);
- }
-
- /// Return the memory behavior information encoded in the IR for \p IRP.
- static void getKnownStateFromValue(Attributor &A, const IRPosition &IRP,
- BitIntegerState &State,
- bool IgnoreSubsumingPositions = false) {
- // For internal functions we ignore `argmemonly` and
- // `inaccessiblememorargmemonly` as we might break them via interprocedural
- // constant propagation. It is unclear if this is the best way but it is
- // unlikely this will cause real performance problems. If we are deriving
- // attributes for the anchor function we even remove the attribute in
- // addition to ignoring it.
- bool UseArgMemOnly = true;
- Function *AnchorFn = IRP.getAnchorScope();
- if (AnchorFn && A.isRunOn(*AnchorFn))
- UseArgMemOnly = !AnchorFn->hasLocalLinkage();
-
- SmallVector<Attribute, 2> Attrs;
- IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
- for (const Attribute &Attr : Attrs) {
- switch (Attr.getKindAsEnum()) {
- case Attribute::ReadNone:
- State.addKnownBits(NO_LOCAL_MEM | NO_CONST_MEM);
- break;
- case Attribute::InaccessibleMemOnly:
- State.addKnownBits(inverseLocation(NO_INACCESSIBLE_MEM, true, true));
- break;
- case Attribute::ArgMemOnly:
- if (UseArgMemOnly)
- State.addKnownBits(inverseLocation(NO_ARGUMENT_MEM, true, true));
- else
- IRP.removeAttrs({Attribute::ArgMemOnly});
- break;
- case Attribute::InaccessibleMemOrArgMemOnly:
- if (UseArgMemOnly)
- State.addKnownBits(inverseLocation(
- NO_INACCESSIBLE_MEM | NO_ARGUMENT_MEM, true, true));
- else
- IRP.removeAttrs({Attribute::InaccessibleMemOrArgMemOnly});
- break;
- default:
- llvm_unreachable("Unexpected attribute!");
- }
- }
- }
-
- /// See AbstractAttribute::getDeducedAttributes(...).
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- assert(Attrs.size() == 0);
- if (isAssumedReadNone()) {
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
- } else if (getIRPosition().getPositionKind() == IRPosition::IRP_FUNCTION) {
- if (isAssumedInaccessibleMemOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::InaccessibleMemOnly));
- else if (isAssumedArgMemOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ArgMemOnly));
- else if (isAssumedInaccessibleOrArgMemOnly())
- Attrs.push_back(
- Attribute::get(Ctx, Attribute::InaccessibleMemOrArgMemOnly));
- }
- assert(Attrs.size() <= 1);
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- const IRPosition &IRP = getIRPosition();
-
- // Check if we would improve the existing attributes first.
- SmallVector<Attribute, 4> DeducedAttrs;
- getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
- if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
- return IRP.hasAttr(Attr.getKindAsEnum(),
- /* IgnoreSubsumingPositions */ true);
- }))
- return ChangeStatus::UNCHANGED;
-
- // Clear existing attributes.
- IRP.removeAttrs(AttrKinds);
- if (isAssumedReadNone())
- IRP.removeAttrs(AAMemoryBehaviorImpl::AttrKinds);
-
- // Use the generic manifest method.
- return IRAttribute::manifest(A);
- }
-
- /// See AAMemoryLocation::checkForAllAccessesToMemoryKind(...).
- bool checkForAllAccessesToMemoryKind(
- function_ref<bool(const Instruction *, const Value *, AccessKind,
- MemoryLocationsKind)>
- Pred,
- MemoryLocationsKind RequestedMLK) const override {
- if (!isValidState())
- return false;
-
- MemoryLocationsKind AssumedMLK = getAssumedNotAccessedLocation();
- if (AssumedMLK == NO_LOCATIONS)
- return true;
-
- unsigned Idx = 0;
- for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS;
- CurMLK *= 2, ++Idx) {
- if (CurMLK & RequestedMLK)
- continue;
-
- if (const AccessSet *Accesses = AccessKind2Accesses[Idx])
- for (const AccessInfo &AI : *Accesses)
- if (!Pred(AI.I, AI.Ptr, AI.Kind, CurMLK))
- return false;
- }
-
- return true;
- }
-
- ChangeStatus indicatePessimisticFixpoint() override {
- // If we give up and indicate a pessimistic fixpoint this instruction will
- // become an access for all potential access kinds:
- // TODO: Add pointers for argmemonly and globals to improve the results of
- // checkForAllAccessesToMemoryKind.
- bool Changed = false;
- MemoryLocationsKind KnownMLK = getKnown();
- Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
- for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2)
- if (!(CurMLK & KnownMLK))
- updateStateAndAccessesMap(getState(), CurMLK, I, nullptr, Changed,
- getAccessKindFromInst(I));
- return AAMemoryLocation::indicatePessimisticFixpoint();
- }
-
-protected:
- /// Helper struct to tie together an instruction that has a read or write
- /// effect with the pointer it accesses (if any).
- struct AccessInfo {
-
- /// The instruction that caused the access.
- const Instruction *I;
-
- /// The base pointer that is accessed, or null if unknown.
- const Value *Ptr;
-
- /// The kind of access (read/write/read+write).
- AccessKind Kind;
-
- bool operator==(const AccessInfo &RHS) const {
- return I == RHS.I && Ptr == RHS.Ptr && Kind == RHS.Kind;
- }
- bool operator()(const AccessInfo &LHS, const AccessInfo &RHS) const {
- if (LHS.I != RHS.I)
- return LHS.I < RHS.I;
- if (LHS.Ptr != RHS.Ptr)
- return LHS.Ptr < RHS.Ptr;
- if (LHS.Kind != RHS.Kind)
- return LHS.Kind < RHS.Kind;
- return false;
- }
- };
-
- /// Mapping from *single* memory location kinds, e.g., LOCAL_MEM with the
- /// value of NO_LOCAL_MEM, to the accesses encountered for this memory kind.
- using AccessSet = SmallSet<AccessInfo, 2, AccessInfo>;
- AccessSet *AccessKind2Accesses[llvm::CTLog2<VALID_STATE>()];
-
+ }
+
+ /// Return the memory location information encoded in the IR for \p IRP.
+ static void getKnownStateFromValue(Attributor &A, const IRPosition &IRP,
+ BitIntegerState &State,
+ bool IgnoreSubsumingPositions = false) {
+ // For internal functions we ignore `argmemonly` and
+ // `inaccessiblememorargmemonly` as we might break them via interprocedural
+ // constant propagation. It is unclear if this is the best way, but it is
+ // unlikely this will cause real performance problems. If we are deriving
+ // attributes for the anchor function we even remove the attribute in
+ // addition to ignoring it.
+ bool UseArgMemOnly = true;
+ Function *AnchorFn = IRP.getAnchorScope();
+ if (AnchorFn && A.isRunOn(*AnchorFn))
+ UseArgMemOnly = !AnchorFn->hasLocalLinkage();
+
+ SmallVector<Attribute, 2> Attrs;
+ IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
+ for (const Attribute &Attr : Attrs) {
+ switch (Attr.getKindAsEnum()) {
+ case Attribute::ReadNone:
+ State.addKnownBits(NO_LOCAL_MEM | NO_CONST_MEM);
+ break;
+ case Attribute::InaccessibleMemOnly:
+ State.addKnownBits(inverseLocation(NO_INACCESSIBLE_MEM, true, true));
+ break;
+ case Attribute::ArgMemOnly:
+ if (UseArgMemOnly)
+ State.addKnownBits(inverseLocation(NO_ARGUMENT_MEM, true, true));
+ else
+ IRP.removeAttrs({Attribute::ArgMemOnly});
+ break;
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ if (UseArgMemOnly)
+ State.addKnownBits(inverseLocation(
+ NO_INACCESSIBLE_MEM | NO_ARGUMENT_MEM, true, true));
+ else
+ IRP.removeAttrs({Attribute::InaccessibleMemOrArgMemOnly});
+ break;
+ default:
+ llvm_unreachable("Unexpected attribute!");
+ }
+ }
+ }
+
+ /// See AbstractAttribute::getDeducedAttributes(...).
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ assert(Attrs.size() == 0);
+ if (isAssumedReadNone()) {
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
+ } else if (getIRPosition().getPositionKind() == IRPosition::IRP_FUNCTION) {
+ if (isAssumedInaccessibleMemOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::InaccessibleMemOnly));
+ else if (isAssumedArgMemOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ArgMemOnly));
+ else if (isAssumedInaccessibleOrArgMemOnly())
+ Attrs.push_back(
+ Attribute::get(Ctx, Attribute::InaccessibleMemOrArgMemOnly));
+ }
+ assert(Attrs.size() <= 1);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ const IRPosition &IRP = getIRPosition();
+
+ // Check if we would improve the existing attributes first.
+ SmallVector<Attribute, 4> DeducedAttrs;
+ getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
+ if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
+ return IRP.hasAttr(Attr.getKindAsEnum(),
+ /* IgnoreSubsumingPositions */ true);
+ }))
+ return ChangeStatus::UNCHANGED;
+
+ // Clear existing attributes.
+ IRP.removeAttrs(AttrKinds);
+ if (isAssumedReadNone())
+ IRP.removeAttrs(AAMemoryBehaviorImpl::AttrKinds);
+
+ // Use the generic manifest method.
+ return IRAttribute::manifest(A);
+ }
+
+ /// See AAMemoryLocation::checkForAllAccessesToMemoryKind(...).
+ bool checkForAllAccessesToMemoryKind(
+ function_ref<bool(const Instruction *, const Value *, AccessKind,
+ MemoryLocationsKind)>
+ Pred,
+ MemoryLocationsKind RequestedMLK) const override {
+ if (!isValidState())
+ return false;
+
+ MemoryLocationsKind AssumedMLK = getAssumedNotAccessedLocation();
+ if (AssumedMLK == NO_LOCATIONS)
+ return true;
+
+ unsigned Idx = 0;
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS;
+ CurMLK *= 2, ++Idx) {
+ if (CurMLK & RequestedMLK)
+ continue;
+
+ if (const AccessSet *Accesses = AccessKind2Accesses[Idx])
+ for (const AccessInfo &AI : *Accesses)
+ if (!Pred(AI.I, AI.Ptr, AI.Kind, CurMLK))
+ return false;
+ }
+
+ return true;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ // If we give up and indicate a pessimistic fixpoint this instruction will
+ // become an access for all potential access kinds:
+ // TODO: Add pointers for argmemonly and globals to improve the results of
+ // checkForAllAccessesToMemoryKind.
+ bool Changed = false;
+ MemoryLocationsKind KnownMLK = getKnown();
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2)
+ if (!(CurMLK & KnownMLK))
+ updateStateAndAccessesMap(getState(), CurMLK, I, nullptr, Changed,
+ getAccessKindFromInst(I));
+ return AAMemoryLocation::indicatePessimisticFixpoint();
+ }
+
+protected:
+ /// Helper struct to tie together an instruction that has a read or write
+ /// effect with the pointer it accesses (if any).
+ struct AccessInfo {
+
+ /// The instruction that caused the access.
+ const Instruction *I;
+
+ /// The base pointer that is accessed, or null if unknown.
+ const Value *Ptr;
+
+ /// The kind of access (read/write/read+write).
+ AccessKind Kind;
+
+ bool operator==(const AccessInfo &RHS) const {
+ return I == RHS.I && Ptr == RHS.Ptr && Kind == RHS.Kind;
+ }
+ bool operator()(const AccessInfo &LHS, const AccessInfo &RHS) const {
+ if (LHS.I != RHS.I)
+ return LHS.I < RHS.I;
+ if (LHS.Ptr != RHS.Ptr)
+ return LHS.Ptr < RHS.Ptr;
+ if (LHS.Kind != RHS.Kind)
+ return LHS.Kind < RHS.Kind;
+ return false;
+ }
+ };
+
+ /// Mapping from *single* memory location kinds, e.g., LOCAL_MEM with the
+ /// value of NO_LOCAL_MEM, to the accesses encountered for this memory kind.
+ using AccessSet = SmallSet<AccessInfo, 2, AccessInfo>;
+ AccessSet *AccessKind2Accesses[llvm::CTLog2<VALID_STATE>()];
+
/// Categorize the pointer arguments of CB that might access memory in
/// AccessedLoc and update the state and access map accordingly.
void
@@ -6564,82 +6564,82 @@ protected:
AAMemoryLocation::StateType &AccessedLocs,
bool &Changed);
- /// Return the kind(s) of location that may be accessed by \p I.
- AAMemoryLocation::MemoryLocationsKind
- categorizeAccessedLocations(Attributor &A, Instruction &I, bool &Changed);
-
- /// Return the access kind as determined by \p I.
- AccessKind getAccessKindFromInst(const Instruction *I) {
- AccessKind AK = READ_WRITE;
- if (I) {
- AK = I->mayReadFromMemory() ? READ : NONE;
- AK = AccessKind(AK | (I->mayWriteToMemory() ? WRITE : NONE));
- }
- return AK;
- }
-
- /// Update the state \p State and the AccessKind2Accesses given that \p I is
- /// an access of kind \p AK to a \p MLK memory location with the access
- /// pointer \p Ptr.
- void updateStateAndAccessesMap(AAMemoryLocation::StateType &State,
- MemoryLocationsKind MLK, const Instruction *I,
- const Value *Ptr, bool &Changed,
- AccessKind AK = READ_WRITE) {
-
- assert(isPowerOf2_32(MLK) && "Expected a single location set!");
- auto *&Accesses = AccessKind2Accesses[llvm::Log2_32(MLK)];
- if (!Accesses)
- Accesses = new (Allocator) AccessSet();
- Changed |= Accesses->insert(AccessInfo{I, Ptr, AK}).second;
- State.removeAssumedBits(MLK);
- }
-
- /// Determine the underlying location kinds for \p Ptr, e.g., globals or
- /// arguments, and update the state and access map accordingly.
- void categorizePtrValue(Attributor &A, const Instruction &I, const Value &Ptr,
- AAMemoryLocation::StateType &State, bool &Changed);
-
- /// Used to allocate access sets.
- BumpPtrAllocator &Allocator;
-
- /// The set of IR attributes AAMemoryLocation deals with.
- static const Attribute::AttrKind AttrKinds[4];
-};
-
-const Attribute::AttrKind AAMemoryLocationImpl::AttrKinds[] = {
- Attribute::ReadNone, Attribute::InaccessibleMemOnly, Attribute::ArgMemOnly,
- Attribute::InaccessibleMemOrArgMemOnly};
-
-void AAMemoryLocationImpl::categorizePtrValue(
- Attributor &A, const Instruction &I, const Value &Ptr,
- AAMemoryLocation::StateType &State, bool &Changed) {
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize pointer locations for "
- << Ptr << " ["
- << getMemoryLocationsAsStr(State.getAssumed()) << "]\n");
-
- auto StripGEPCB = [](Value *V) -> Value * {
- auto *GEP = dyn_cast<GEPOperator>(V);
- while (GEP) {
- V = GEP->getPointerOperand();
- GEP = dyn_cast<GEPOperator>(V);
- }
- return V;
- };
-
- auto VisitValueCB = [&](Value &V, const Instruction *,
- AAMemoryLocation::StateType &T,
- bool Stripped) -> bool {
+ /// Return the kind(s) of location that may be accessed by \p I.
+ AAMemoryLocation::MemoryLocationsKind
+ categorizeAccessedLocations(Attributor &A, Instruction &I, bool &Changed);
+
+ /// Return the access kind as determined by \p I.
+ AccessKind getAccessKindFromInst(const Instruction *I) {
+ AccessKind AK = READ_WRITE;
+ if (I) {
+ AK = I->mayReadFromMemory() ? READ : NONE;
+ AK = AccessKind(AK | (I->mayWriteToMemory() ? WRITE : NONE));
+ }
+ return AK;
+ }
+
+ /// Update the state \p State and the AccessKind2Accesses given that \p I is
+ /// an access of kind \p AK to a \p MLK memory location with the access
+ /// pointer \p Ptr.
+ void updateStateAndAccessesMap(AAMemoryLocation::StateType &State,
+ MemoryLocationsKind MLK, const Instruction *I,
+ const Value *Ptr, bool &Changed,
+ AccessKind AK = READ_WRITE) {
+
+ assert(isPowerOf2_32(MLK) && "Expected a single location set!");
+ auto *&Accesses = AccessKind2Accesses[llvm::Log2_32(MLK)];
+ if (!Accesses)
+ Accesses = new (Allocator) AccessSet();
+ Changed |= Accesses->insert(AccessInfo{I, Ptr, AK}).second;
+ State.removeAssumedBits(MLK);
+ }
+
+ /// Determine the underlying location kinds for \p Ptr, e.g., globals or
+ /// arguments, and update the state and access map accordingly.
+ void categorizePtrValue(Attributor &A, const Instruction &I, const Value &Ptr,
+ AAMemoryLocation::StateType &State, bool &Changed);
+
+ /// Used to allocate access sets.
+ BumpPtrAllocator &Allocator;
+
+ /// The set of IR attributes AAMemoryLocation deals with.
+ static const Attribute::AttrKind AttrKinds[4];
+};
+
+const Attribute::AttrKind AAMemoryLocationImpl::AttrKinds[] = {
+ Attribute::ReadNone, Attribute::InaccessibleMemOnly, Attribute::ArgMemOnly,
+ Attribute::InaccessibleMemOrArgMemOnly};
+
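
To make the bit bookkeeping above easier to follow, here is a minimal standalone sketch of the encoding it relies on: each "single" location kind is a power-of-two NO_* bit, so the log2 of that bit doubles as a dense index into a per-kind access table (the role AccessKind2Accesses plays via llvm::CTLog2/Log2_32). The enum values, helper names, and the integer access ids below are illustrative assumptions for this sketch, not LLVM code.

// Standalone model of the NO_* bit encoding (illustrative, not LLVM code).
#include <cassert>
#include <cstdint>
#include <vector>

enum MemLoc : uint32_t {
  NO_LOCAL_MEM = 1u << 0,
  NO_CONST_MEM = 1u << 1,
  NO_ARGUMENT_MEM = 1u << 2,
  NO_UNKNOWN_MEM = 1u << 3,
  NO_LOCATIONS = (1u << 4) - 1, // all "not accessed" bits set
};

// Bit position of a single set bit; stands in for llvm::Log2_32.
static unsigned log2u(uint32_t X) {
  unsigned Idx = 0;
  while (X >>= 1)
    ++Idx;
  return Idx;
}

int main() {
  // One access list per single location kind, indexed by bit position.
  std::vector<std::vector<int>> AccessesPerKind(4);
  uint32_t AssumedNotAccessed = NO_LOCATIONS;

  // Record an access (id 42) to argument memory: file it under the dense
  // index derived from the bit and clear the "not accessed" bit.
  uint32_t MLK = NO_ARGUMENT_MEM;
  assert((MLK & (MLK - 1)) == 0 && "expected a single location bit");
  AccessesPerKind[log2u(MLK)].push_back(42);
  AssumedNotAccessed &= ~MLK;

  assert(AssumedNotAccessed == (NO_LOCAL_MEM | NO_CONST_MEM | NO_UNKNOWN_MEM));
  return 0;
}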
+void AAMemoryLocationImpl::categorizePtrValue(
+ Attributor &A, const Instruction &I, const Value &Ptr,
+ AAMemoryLocation::StateType &State, bool &Changed) {
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize pointer locations for "
+ << Ptr << " ["
+ << getMemoryLocationsAsStr(State.getAssumed()) << "]\n");
+
+ auto StripGEPCB = [](Value *V) -> Value * {
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ while (GEP) {
+ V = GEP->getPointerOperand();
+ GEP = dyn_cast<GEPOperator>(V);
+ }
+ return V;
+ };
+
+ auto VisitValueCB = [&](Value &V, const Instruction *,
+ AAMemoryLocation::StateType &T,
+ bool Stripped) -> bool {
// TODO: recognize the TBAA used for constant accesses.
- MemoryLocationsKind MLK = NO_LOCATIONS;
- assert(!isa<GEPOperator>(V) && "GEPs should have been stripped.");
- if (isa<UndefValue>(V))
- return true;
- if (auto *Arg = dyn_cast<Argument>(&V)) {
- if (Arg->hasByValAttr())
- MLK = NO_LOCAL_MEM;
- else
- MLK = NO_ARGUMENT_MEM;
- } else if (auto *GV = dyn_cast<GlobalValue>(&V)) {
+ MemoryLocationsKind MLK = NO_LOCATIONS;
+ assert(!isa<GEPOperator>(V) && "GEPs should have been stripped.");
+ if (isa<UndefValue>(V))
+ return true;
+ if (auto *Arg = dyn_cast<Argument>(&V)) {
+ if (Arg->hasByValAttr())
+ MLK = NO_LOCAL_MEM;
+ else
+ MLK = NO_ARGUMENT_MEM;
+ } else if (auto *GV = dyn_cast<GlobalValue>(&V)) {
// Reading constant memory is not treated as a read "effect" by the
// function attr pass, so we won't either. Constants defined by TBAA are
// similar. (We know we do not write it because it is constant.)
@@ -6647,52 +6647,52 @@ void AAMemoryLocationImpl::categorizePtrValue(
if (GVar->isConstant())
return true;
- if (GV->hasLocalLinkage())
- MLK = NO_GLOBAL_INTERNAL_MEM;
- else
- MLK = NO_GLOBAL_EXTERNAL_MEM;
- } else if (isa<ConstantPointerNull>(V) &&
- !NullPointerIsDefined(getAssociatedFunction(),
- V.getType()->getPointerAddressSpace())) {
- return true;
- } else if (isa<AllocaInst>(V)) {
- MLK = NO_LOCAL_MEM;
- } else if (const auto *CB = dyn_cast<CallBase>(&V)) {
- const auto &NoAliasAA =
- A.getAAFor<AANoAlias>(*this, IRPosition::callsite_returned(*CB));
- if (NoAliasAA.isAssumedNoAlias())
- MLK = NO_MALLOCED_MEM;
- else
- MLK = NO_UNKOWN_MEM;
- } else {
- MLK = NO_UNKOWN_MEM;
- }
-
- assert(MLK != NO_LOCATIONS && "No location specified!");
- updateStateAndAccessesMap(T, MLK, &I, &V, Changed,
- getAccessKindFromInst(&I));
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Ptr value cannot be categorized: "
- << V << " -> " << getMemoryLocationsAsStr(T.getAssumed())
- << "\n");
- return true;
- };
-
- if (!genericValueTraversal<AAMemoryLocation, AAMemoryLocation::StateType>(
- A, IRPosition::value(Ptr), *this, State, VisitValueCB, getCtxI(),
- /* UseValueSimplify */ true,
- /* MaxValues */ 32, StripGEPCB)) {
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Pointer locations not categorized\n");
- updateStateAndAccessesMap(State, NO_UNKOWN_MEM, &I, nullptr, Changed,
- getAccessKindFromInst(&I));
- } else {
- LLVM_DEBUG(
- dbgs()
- << "[AAMemoryLocation] Accessed locations with pointer locations: "
- << getMemoryLocationsAsStr(State.getAssumed()) << "\n");
- }
-}
-
+ if (GV->hasLocalLinkage())
+ MLK = NO_GLOBAL_INTERNAL_MEM;
+ else
+ MLK = NO_GLOBAL_EXTERNAL_MEM;
+ } else if (isa<ConstantPointerNull>(V) &&
+ !NullPointerIsDefined(getAssociatedFunction(),
+ V.getType()->getPointerAddressSpace())) {
+ return true;
+ } else if (isa<AllocaInst>(V)) {
+ MLK = NO_LOCAL_MEM;
+ } else if (const auto *CB = dyn_cast<CallBase>(&V)) {
+ const auto &NoAliasAA =
+ A.getAAFor<AANoAlias>(*this, IRPosition::callsite_returned(*CB));
+ if (NoAliasAA.isAssumedNoAlias())
+ MLK = NO_MALLOCED_MEM;
+ else
+ MLK = NO_UNKOWN_MEM;
+ } else {
+ MLK = NO_UNKOWN_MEM;
+ }
+
+ assert(MLK != NO_LOCATIONS && "No location specified!");
+ updateStateAndAccessesMap(T, MLK, &I, &V, Changed,
+ getAccessKindFromInst(&I));
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Ptr value cannot be categorized: "
+ << V << " -> " << getMemoryLocationsAsStr(T.getAssumed())
+ << "\n");
+ return true;
+ };
+
+ if (!genericValueTraversal<AAMemoryLocation, AAMemoryLocation::StateType>(
+ A, IRPosition::value(Ptr), *this, State, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ true,
+ /* MaxValues */ 32, StripGEPCB)) {
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Pointer locations not categorized\n");
+ updateStateAndAccessesMap(State, NO_UNKOWN_MEM, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ } else {
+ LLVM_DEBUG(
+ dbgs()
+ << "[AAMemoryLocation] Accessed locations with pointer locations: "
+ << getMemoryLocationsAsStr(State.getAssumed()) << "\n");
+ }
+}
+
void AAMemoryLocationImpl::categorizeArgumentPointerLocations(
Attributor &A, CallBase &CB, AAMemoryLocation::StateType &AccessedLocs,
bool &Changed) {
@@ -6717,689 +6717,689 @@ void AAMemoryLocationImpl::categorizeArgumentPointerLocations(
}
}
-AAMemoryLocation::MemoryLocationsKind
-AAMemoryLocationImpl::categorizeAccessedLocations(Attributor &A, Instruction &I,
- bool &Changed) {
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize accessed locations for "
- << I << "\n");
-
- AAMemoryLocation::StateType AccessedLocs;
- AccessedLocs.intersectAssumedBits(NO_LOCATIONS);
-
- if (auto *CB = dyn_cast<CallBase>(&I)) {
-
- // First check if we assume any memory access is visible.
- const auto &CBMemLocationAA =
- A.getAAFor<AAMemoryLocation>(*this, IRPosition::callsite_function(*CB));
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize call site: " << I
- << " [" << CBMemLocationAA << "]\n");
-
- if (CBMemLocationAA.isAssumedReadNone())
- return NO_LOCATIONS;
-
- if (CBMemLocationAA.isAssumedInaccessibleMemOnly()) {
- updateStateAndAccessesMap(AccessedLocs, NO_INACCESSIBLE_MEM, &I, nullptr,
- Changed, getAccessKindFromInst(&I));
- return AccessedLocs.getAssumed();
- }
-
- uint32_t CBAssumedNotAccessedLocs =
- CBMemLocationAA.getAssumedNotAccessedLocation();
-
- // Set the argmemonly and global bits as we handle them separately below.
- uint32_t CBAssumedNotAccessedLocsNoArgMem =
- CBAssumedNotAccessedLocs | NO_ARGUMENT_MEM | NO_GLOBAL_MEM;
-
- for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2) {
- if (CBAssumedNotAccessedLocsNoArgMem & CurMLK)
- continue;
- updateStateAndAccessesMap(AccessedLocs, CurMLK, &I, nullptr, Changed,
- getAccessKindFromInst(&I));
- }
-
- // Now handle global memory if it might be accessed. This is slightly tricky
- // as NO_GLOBAL_MEM has multiple bits set.
- bool HasGlobalAccesses = ((~CBAssumedNotAccessedLocs) & NO_GLOBAL_MEM);
- if (HasGlobalAccesses) {
- auto AccessPred = [&](const Instruction *, const Value *Ptr,
- AccessKind Kind, MemoryLocationsKind MLK) {
- updateStateAndAccessesMap(AccessedLocs, MLK, &I, Ptr, Changed,
- getAccessKindFromInst(&I));
- return true;
- };
- if (!CBMemLocationAA.checkForAllAccessesToMemoryKind(
- AccessPred, inverseLocation(NO_GLOBAL_MEM, false, false)))
- return AccessedLocs.getWorstState();
- }
-
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Accessed state before argument handling: "
- << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
-
- // Now handle argument memory if it might be accessed.
- bool HasArgAccesses = ((~CBAssumedNotAccessedLocs) & NO_ARGUMENT_MEM);
+AAMemoryLocation::MemoryLocationsKind
+AAMemoryLocationImpl::categorizeAccessedLocations(Attributor &A, Instruction &I,
+ bool &Changed) {
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize accessed locations for "
+ << I << "\n");
+
+ AAMemoryLocation::StateType AccessedLocs;
+ AccessedLocs.intersectAssumedBits(NO_LOCATIONS);
+
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+
+ // First check if we assume any memory access is visible.
+ const auto &CBMemLocationAA =
+ A.getAAFor<AAMemoryLocation>(*this, IRPosition::callsite_function(*CB));
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize call site: " << I
+ << " [" << CBMemLocationAA << "]\n");
+
+ if (CBMemLocationAA.isAssumedReadNone())
+ return NO_LOCATIONS;
+
+ if (CBMemLocationAA.isAssumedInaccessibleMemOnly()) {
+ updateStateAndAccessesMap(AccessedLocs, NO_INACCESSIBLE_MEM, &I, nullptr,
+ Changed, getAccessKindFromInst(&I));
+ return AccessedLocs.getAssumed();
+ }
+
+ uint32_t CBAssumedNotAccessedLocs =
+ CBMemLocationAA.getAssumedNotAccessedLocation();
+
+ // Set the argmemonly and global bits as we handle them separately below.
+ uint32_t CBAssumedNotAccessedLocsNoArgMem =
+ CBAssumedNotAccessedLocs | NO_ARGUMENT_MEM | NO_GLOBAL_MEM;
+
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2) {
+ if (CBAssumedNotAccessedLocsNoArgMem & CurMLK)
+ continue;
+ updateStateAndAccessesMap(AccessedLocs, CurMLK, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ }
+
+ // Now handle global memory if it might be accessed. This is slightly tricky
+ // as NO_GLOBAL_MEM has multiple bits set.
+ bool HasGlobalAccesses = ((~CBAssumedNotAccessedLocs) & NO_GLOBAL_MEM);
+ if (HasGlobalAccesses) {
+ auto AccessPred = [&](const Instruction *, const Value *Ptr,
+ AccessKind Kind, MemoryLocationsKind MLK) {
+ updateStateAndAccessesMap(AccessedLocs, MLK, &I, Ptr, Changed,
+ getAccessKindFromInst(&I));
+ return true;
+ };
+ if (!CBMemLocationAA.checkForAllAccessesToMemoryKind(
+ AccessPred, inverseLocation(NO_GLOBAL_MEM, false, false)))
+ return AccessedLocs.getWorstState();
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Accessed state before argument handling: "
+ << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
+
+ // Now handle argument memory if it might be accessed.
+ bool HasArgAccesses = ((~CBAssumedNotAccessedLocs) & NO_ARGUMENT_MEM);
if (HasArgAccesses)
categorizeArgumentPointerLocations(A, *CB, AccessedLocs, Changed);
-
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Accessed state after argument handling: "
- << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
-
- return AccessedLocs.getAssumed();
- }
-
- if (const Value *Ptr = getPointerOperand(&I, /* AllowVolatile */ true)) {
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Categorize memory access with pointer: "
- << I << " [" << *Ptr << "]\n");
- categorizePtrValue(A, I, *Ptr, AccessedLocs, Changed);
- return AccessedLocs.getAssumed();
- }
-
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Failed to categorize instruction: "
- << I << "\n");
- updateStateAndAccessesMap(AccessedLocs, NO_UNKOWN_MEM, &I, nullptr, Changed,
- getAccessKindFromInst(&I));
- return AccessedLocs.getAssumed();
-}
-
-/// An AA to represent the memory behavior function attributes.
-struct AAMemoryLocationFunction final : public AAMemoryLocationImpl {
- AAMemoryLocationFunction(const IRPosition &IRP, Attributor &A)
- : AAMemoryLocationImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override {
-
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, getIRPosition(), /* TrackDependence */ false);
- if (MemBehaviorAA.isAssumedReadNone()) {
- if (MemBehaviorAA.isKnownReadNone())
- return indicateOptimisticFixpoint();
- assert(isAssumedReadNone() &&
- "AAMemoryLocation was not read-none but AAMemoryBehavior was!");
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return ChangeStatus::UNCHANGED;
- }
-
- // The current assumed state used to determine a change.
- auto AssumedState = getAssumed();
- bool Changed = false;
-
- auto CheckRWInst = [&](Instruction &I) {
- MemoryLocationsKind MLK = categorizeAccessedLocations(A, I, Changed);
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Accessed locations for " << I
- << ": " << getMemoryLocationsAsStr(MLK) << "\n");
- removeAssumedBits(inverseLocation(MLK, false, false));
+
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Accessed state after argument handling: "
+ << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
+
+ return AccessedLocs.getAssumed();
+ }
+
+ if (const Value *Ptr = getPointerOperand(&I, /* AllowVolatile */ true)) {
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Categorize memory access with pointer: "
+ << I << " [" << *Ptr << "]\n");
+ categorizePtrValue(A, I, *Ptr, AccessedLocs, Changed);
+ return AccessedLocs.getAssumed();
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Failed to categorize instruction: "
+ << I << "\n");
+ updateStateAndAccessesMap(AccessedLocs, NO_UNKOWN_MEM, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ return AccessedLocs.getAssumed();
+}
+
+/// An AA to represent the memory behavior function attributes.
+struct AAMemoryLocationFunction final : public AAMemoryLocationImpl {
+ AAMemoryLocationFunction(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocationImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, getIRPosition(), /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadNone()) {
+ if (MemBehaviorAA.isKnownReadNone())
+ return indicateOptimisticFixpoint();
+ assert(isAssumedReadNone() &&
+ "AAMemoryLocation was not read-none but AAMemoryBehavior was!");
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = getAssumed();
+ bool Changed = false;
+
+ auto CheckRWInst = [&](Instruction &I) {
+ MemoryLocationsKind MLK = categorizeAccessedLocations(A, I, Changed);
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Accessed locations for " << I
+ << ": " << getMemoryLocationsAsStr(MLK) << "\n");
+ removeAssumedBits(inverseLocation(MLK, false, false));
// Stop once only the valid bit is set in the *not assumed location*, thus
// once we don't actually exclude any memory locations in the state.
return getAssumedNotAccessedLocation() != VALID_STATE;
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
- return indicatePessimisticFixpoint();
-
- Changed |= AssumedState != getAssumed();
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FN_ATTR(readnone)
- else if (isAssumedArgMemOnly())
- STATS_DECLTRACK_FN_ATTR(argmemonly)
- else if (isAssumedInaccessibleMemOnly())
- STATS_DECLTRACK_FN_ATTR(inaccessiblememonly)
- else if (isAssumedInaccessibleOrArgMemOnly())
- STATS_DECLTRACK_FN_ATTR(inaccessiblememorargmemonly)
- }
-};
-
-/// AAMemoryLocation attribute for call sites.
-struct AAMemoryLocationCallSite final : AAMemoryLocationImpl {
- AAMemoryLocationCallSite(const IRPosition &IRP, Attributor &A)
- : AAMemoryLocationImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryLocationImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
+ return indicatePessimisticFixpoint();
+
+ Changed |= AssumedState != getAssumed();
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FN_ATTR(readnone)
+ else if (isAssumedArgMemOnly())
+ STATS_DECLTRACK_FN_ATTR(argmemonly)
+ else if (isAssumedInaccessibleMemOnly())
+ STATS_DECLTRACK_FN_ATTR(inaccessiblememonly)
+ else if (isAssumedInaccessibleOrArgMemOnly())
+ STATS_DECLTRACK_FN_ATTR(inaccessiblememorargmemonly)
+ }
+};
+
+/// AAMemoryLocation attribute for call sites.
+struct AAMemoryLocationCallSite final : AAMemoryLocationImpl {
+ AAMemoryLocationCallSite(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocationImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryLocationImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAMemoryLocation>(*this, FnPos);
- bool Changed = false;
- auto AccessPred = [&](const Instruction *I, const Value *Ptr,
- AccessKind Kind, MemoryLocationsKind MLK) {
- updateStateAndAccessesMap(getState(), MLK, I, Ptr, Changed,
- getAccessKindFromInst(I));
- return true;
- };
- if (!FnAA.checkForAllAccessesToMemoryKind(AccessPred, ALL_LOCATIONS))
- return indicatePessimisticFixpoint();
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CS_ATTR(readnone)
- }
-};
-
-/// ------------------ Value Constant Range Attribute -------------------------
-
-struct AAValueConstantRangeImpl : AAValueConstantRange {
- using StateType = IntegerRangeState;
- AAValueConstantRangeImpl(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRange(IRP, A) {}
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- std::string Str;
- llvm::raw_string_ostream OS(Str);
- OS << "range(" << getBitWidth() << ")<";
- getKnown().print(OS);
- OS << " / ";
- getAssumed().print(OS);
- OS << ">";
- return OS.str();
- }
-
- /// Helper function to get a SCEV expr for the associated value at program
- /// point \p I.
- const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const {
- if (!getAnchorScope())
- return nullptr;
-
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
- *getAnchorScope());
-
- LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(
- *getAnchorScope());
-
- if (!SE || !LI)
- return nullptr;
-
- const SCEV *S = SE->getSCEV(&getAssociatedValue());
- if (!I)
- return S;
-
- return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent()));
- }
-
- /// Helper function to get a range from SCEV for the associated value at
- /// program point \p I.
- ConstantRange getConstantRangeFromSCEV(Attributor &A,
- const Instruction *I = nullptr) const {
- if (!getAnchorScope())
- return getWorstState(getBitWidth());
-
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
- *getAnchorScope());
-
- const SCEV *S = getSCEV(A, I);
- if (!SE || !S)
- return getWorstState(getBitWidth());
-
- return SE->getUnsignedRange(S);
- }
-
- /// Helper function to get a range from LVI for the associated value at
- /// program point \p I.
- ConstantRange
- getConstantRangeFromLVI(Attributor &A,
- const Instruction *CtxI = nullptr) const {
- if (!getAnchorScope())
- return getWorstState(getBitWidth());
-
- LazyValueInfo *LVI =
- A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>(
- *getAnchorScope());
-
- if (!LVI || !CtxI)
- return getWorstState(getBitWidth());
- return LVI->getConstantRange(&getAssociatedValue(),
- const_cast<Instruction *>(CtxI));
- }
-
- /// See AAValueConstantRange::getKnownConstantRange(..).
- ConstantRange
- getKnownConstantRange(Attributor &A,
- const Instruction *CtxI = nullptr) const override {
- if (!CtxI || CtxI == getCtxI())
- return getKnown();
-
- ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
- ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
- return getKnown().intersectWith(SCEVR).intersectWith(LVIR);
- }
-
- /// See AAValueConstantRange::getAssumedConstantRange(..).
- ConstantRange
- getAssumedConstantRange(Attributor &A,
- const Instruction *CtxI = nullptr) const override {
- // TODO: Make SCEV use Attributor assumption.
- // We may be able to bound a variable range via assumptions in
- // Attributor. E.g., if x is assumed to be in [1, 3] and y is known to
- // evolve to x^2 + x, then we can say that y is in [2, 12].
-
- if (!CtxI || CtxI == getCtxI())
- return getAssumed();
-
- ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
- ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
- return getAssumed().intersectWith(SCEVR).intersectWith(LVIR);
- }
-
- /// See AbstractAttribute::initialize(..).
- void initialize(Attributor &A) override {
- // Intersect a range given by SCEV.
- intersectKnown(getConstantRangeFromSCEV(A, getCtxI()));
-
- // Intersect a range given by LVI.
- intersectKnown(getConstantRangeFromLVI(A, getCtxI()));
- }
-
- /// Helper function to create MDNode for range metadata.
- static MDNode *
- getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx,
- const ConstantRange &AssumedConstantRange) {
- Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get(
- Ty, AssumedConstantRange.getLower())),
- ConstantAsMetadata::get(ConstantInt::get(
- Ty, AssumedConstantRange.getUpper()))};
- return MDNode::get(Ctx, LowAndHigh);
- }
-
- /// Return true if \p Assumed is included in \p KnownRanges.
- static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) {
-
- if (Assumed.isFullSet())
- return false;
-
- if (!KnownRanges)
- return true;
-
- // If multiple ranges are annotated in IR, we give up annotating the assumed
- // range for now.
-
- // TODO: If there exists a known range which contains the assumed range, we
- // can say the assumed range is better.
- if (KnownRanges->getNumOperands() > 2)
- return false;
-
- ConstantInt *Lower =
- mdconst::extract<ConstantInt>(KnownRanges->getOperand(0));
- ConstantInt *Upper =
- mdconst::extract<ConstantInt>(KnownRanges->getOperand(1));
-
- ConstantRange Known(Lower->getValue(), Upper->getValue());
- return Known.contains(Assumed) && Known != Assumed;
- }
-
- /// Helper function to set range metadata.
- static bool
- setRangeMetadataIfisBetterRange(Instruction *I,
- const ConstantRange &AssumedConstantRange) {
- auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range);
- if (isBetterRange(AssumedConstantRange, OldRangeMD)) {
- if (!AssumedConstantRange.isEmptySet()) {
- I->setMetadata(LLVMContext::MD_range,
- getMDNodeForConstantRange(I->getType(), I->getContext(),
- AssumedConstantRange));
- return true;
- }
- }
- return false;
- }
-
- /// See AbstractAttribute::manifest()
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- ConstantRange AssumedConstantRange = getAssumedConstantRange(A);
- assert(!AssumedConstantRange.isFullSet() && "Invalid state");
-
- auto &V = getAssociatedValue();
- if (!AssumedConstantRange.isEmptySet() &&
- !AssumedConstantRange.isSingleElement()) {
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAMemoryLocation>(*this, FnPos);
+ bool Changed = false;
+ auto AccessPred = [&](const Instruction *I, const Value *Ptr,
+ AccessKind Kind, MemoryLocationsKind MLK) {
+ updateStateAndAccessesMap(getState(), MLK, I, Ptr, Changed,
+ getAccessKindFromInst(I));
+ return true;
+ };
+ if (!FnAA.checkForAllAccessesToMemoryKind(AccessPred, ALL_LOCATIONS))
+ return indicatePessimisticFixpoint();
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CS_ATTR(readnone)
+ }
+};
+
+/// ------------------ Value Constant Range Attribute -------------------------
+
+struct AAValueConstantRangeImpl : AAValueConstantRange {
+ using StateType = IntegerRangeState;
+ AAValueConstantRangeImpl(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRange(IRP, A) {}
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ std::string Str;
+ llvm::raw_string_ostream OS(Str);
+ OS << "range(" << getBitWidth() << ")<";
+ getKnown().print(OS);
+ OS << " / ";
+ getAssumed().print(OS);
+ OS << ">";
+ return OS.str();
+ }
+
+ /// Helper function to get a SCEV expr for the associated value at program
+ /// point \p I.
+ const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const {
+ if (!getAnchorScope())
+ return nullptr;
+
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
+ *getAnchorScope());
+
+ LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(
+ *getAnchorScope());
+
+ if (!SE || !LI)
+ return nullptr;
+
+ const SCEV *S = SE->getSCEV(&getAssociatedValue());
+ if (!I)
+ return S;
+
+ return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent()));
+ }
+
+ /// Helper function to get a range from SCEV for the associated value at
+ /// program point \p I.
+ ConstantRange getConstantRangeFromSCEV(Attributor &A,
+ const Instruction *I = nullptr) const {
+ if (!getAnchorScope())
+ return getWorstState(getBitWidth());
+
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
+ *getAnchorScope());
+
+ const SCEV *S = getSCEV(A, I);
+ if (!SE || !S)
+ return getWorstState(getBitWidth());
+
+ return SE->getUnsignedRange(S);
+ }
+
+ /// Helper function to get a range from LVI for the associated value at
+ /// program point \p I.
+ ConstantRange
+ getConstantRangeFromLVI(Attributor &A,
+ const Instruction *CtxI = nullptr) const {
+ if (!getAnchorScope())
+ return getWorstState(getBitWidth());
+
+ LazyValueInfo *LVI =
+ A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>(
+ *getAnchorScope());
+
+ if (!LVI || !CtxI)
+ return getWorstState(getBitWidth());
+ return LVI->getConstantRange(&getAssociatedValue(),
+ const_cast<Instruction *>(CtxI));
+ }
+
+ /// See AAValueConstantRange::getKnownConstantRange(..).
+ ConstantRange
+ getKnownConstantRange(Attributor &A,
+ const Instruction *CtxI = nullptr) const override {
+ if (!CtxI || CtxI == getCtxI())
+ return getKnown();
+
+ ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
+ ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
+ return getKnown().intersectWith(SCEVR).intersectWith(LVIR);
+ }
+
+ /// See AAValueConstantRange::getAssumedConstantRange(..).
+ ConstantRange
+ getAssumedConstantRange(Attributor &A,
+ const Instruction *CtxI = nullptr) const override {
+ // TODO: Make SCEV use Attributor assumption.
+ // We may be able to bound a variable range via assumptions in
+ // Attributor. E.g., if x is assumed to be in [1, 3] and y is known to
+ // evolve to x^2 + x, then we can say that y is in [2, 12].
+
+ if (!CtxI || CtxI == getCtxI())
+ return getAssumed();
+
+ ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
+ ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
+ return getAssumed().intersectWith(SCEVR).intersectWith(LVIR);
+ }
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ // Intersect a range given by SCEV.
+ intersectKnown(getConstantRangeFromSCEV(A, getCtxI()));
+
+ // Intersect a range given by LVI.
+ intersectKnown(getConstantRangeFromLVI(A, getCtxI()));
+ }
+
+ /// Helper function to create MDNode for range metadata.
+ static MDNode *
+ getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx,
+ const ConstantRange &AssumedConstantRange) {
+ Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get(
+ Ty, AssumedConstantRange.getLower())),
+ ConstantAsMetadata::get(ConstantInt::get(
+ Ty, AssumedConstantRange.getUpper()))};
+ return MDNode::get(Ctx, LowAndHigh);
+ }
+
+ /// Return true if \p Assumed is included in \p KnownRanges.
+ static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) {
+
+ if (Assumed.isFullSet())
+ return false;
+
+ if (!KnownRanges)
+ return true;
+
+ // If multiple ranges are annotated in IR, we give up annotating the assumed
+ // range for now.
+
+ // TODO: If there exists a known range which contains the assumed range, we
+ // can say the assumed range is better.
+ if (KnownRanges->getNumOperands() > 2)
+ return false;
+
+ ConstantInt *Lower =
+ mdconst::extract<ConstantInt>(KnownRanges->getOperand(0));
+ ConstantInt *Upper =
+ mdconst::extract<ConstantInt>(KnownRanges->getOperand(1));
+
+ ConstantRange Known(Lower->getValue(), Upper->getValue());
+ return Known.contains(Assumed) && Known != Assumed;
+ }
+
+ /// Helper function to set range metadata.
+ static bool
+ setRangeMetadataIfisBetterRange(Instruction *I,
+ const ConstantRange &AssumedConstantRange) {
+ auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range);
+ if (isBetterRange(AssumedConstantRange, OldRangeMD)) {
+ if (!AssumedConstantRange.isEmptySet()) {
+ I->setMetadata(LLVMContext::MD_range,
+ getMDNodeForConstantRange(I->getType(), I->getContext(),
+ AssumedConstantRange));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /// See AbstractAttribute::manifest()
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+ ConstantRange AssumedConstantRange = getAssumedConstantRange(A);
+ assert(!AssumedConstantRange.isFullSet() && "Invalid state");
+
+ auto &V = getAssociatedValue();
+ if (!AssumedConstantRange.isEmptySet() &&
+ !AssumedConstantRange.isSingleElement()) {
if (Instruction *I = dyn_cast<Instruction>(&V)) {
assert(I == getCtxI() && "Should not annotate an instruction which is "
"not the context instruction");
- if (isa<CallInst>(I) || isa<LoadInst>(I))
- if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange))
- Changed = ChangeStatus::CHANGED;
+ if (isa<CallInst>(I) || isa<LoadInst>(I))
+ if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange))
+ Changed = ChangeStatus::CHANGED;
}
- }
-
- return Changed;
- }
-};
-
-struct AAValueConstantRangeArgument final
- : AAArgumentFromCallSiteArguments<
- AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState> {
- using Base = AAArgumentFromCallSiteArguments<
- AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState>;
- AAValueConstantRangeArgument(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(..).
- void initialize(Attributor &A) override {
- if (!getAnchorScope() || getAnchorScope()->isDeclaration()) {
- indicatePessimisticFixpoint();
- } else {
- Base::initialize(A);
- }
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeReturned
- : AAReturnedFromReturnedValues<AAValueConstantRange,
- AAValueConstantRangeImpl> {
- using Base = AAReturnedFromReturnedValues<AAValueConstantRange,
- AAValueConstantRangeImpl>;
- AAValueConstantRangeReturned(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
- AAValueConstantRangeFloating(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAValueConstantRangeImpl::initialize(A);
- Value &V = getAssociatedValue();
-
- if (auto *C = dyn_cast<ConstantInt>(&V)) {
- unionAssumed(ConstantRange(C->getValue()));
- indicateOptimisticFixpoint();
- return;
- }
-
- if (isa<UndefValue>(&V)) {
- // Collapse the undef state to 0.
- unionAssumed(ConstantRange(APInt(getBitWidth(), 0)));
- indicateOptimisticFixpoint();
- return;
- }
-
+ }
+
+ return Changed;
+ }
+};
+
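
As a rough illustration of the isBetterRange/setRangeMetadataIfisBetterRange logic above, the following standalone sketch models the decision with a simplified half-open interval type in place of llvm::ConstantRange. The Interval type, its fields, and the explicit full-set flag are assumptions made for this example, not the real API.

// Standalone model of the "better range" test for !range metadata
// (illustrative, not LLVM code).
#include <cassert>
#include <cstdint>

struct Interval { // half-open [Lo, Hi), assumed non-wrapping for simplicity
  int64_t Lo, Hi;
  bool contains(const Interval &O) const { return Lo <= O.Lo && O.Hi <= Hi; }
  bool operator==(const Interval &O) const { return Lo == O.Lo && Hi == O.Hi; }
};

// Mirrors the shape of isBetterRange: a full-set assumed range is never
// better; with no existing metadata anything non-trivial is better; otherwise
// the assumed range must be strictly tighter than the known one.
static bool isBetterRange(const Interval &Assumed, const Interval *Known,
                          bool AssumedIsFullSet) {
  if (AssumedIsFullSet)
    return false;
  if (!Known)
    return true;
  return Known->contains(Assumed) && !(*Known == Assumed);
}

int main() {
  Interval Known{0, 100}, Assumed{10, 20};
  assert(isBetterRange(Assumed, &Known, /*AssumedIsFullSet=*/false));
  assert(!isBetterRange(Known, &Known, /*AssumedIsFullSet=*/false));
  assert(isBetterRange(Assumed, nullptr, /*AssumedIsFullSet=*/false));
  return 0;
}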
+struct AAValueConstantRangeArgument final
+ : AAArgumentFromCallSiteArguments<
+ AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState> {
+ using Base = AAArgumentFromCallSiteArguments<
+ AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState>;
+ AAValueConstantRangeArgument(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ if (!getAnchorScope() || getAnchorScope()->isDeclaration()) {
+ indicatePessimisticFixpoint();
+ } else {
+ Base::initialize(A);
+ }
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeReturned
+ : AAReturnedFromReturnedValues<AAValueConstantRange,
+ AAValueConstantRangeImpl> {
+ using Base = AAReturnedFromReturnedValues<AAValueConstantRange,
+ AAValueConstantRangeImpl>;
+ AAValueConstantRangeReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
+ AAValueConstantRangeFloating(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAValueConstantRangeImpl::initialize(A);
+ Value &V = getAssociatedValue();
+
+ if (auto *C = dyn_cast<ConstantInt>(&V)) {
+ unionAssumed(ConstantRange(C->getValue()));
+ indicateOptimisticFixpoint();
+ return;
+ }
+
+ if (isa<UndefValue>(&V)) {
+ // Collapse the undef state to 0.
+ unionAssumed(ConstantRange(APInt(getBitWidth(), 0)));
+ indicateOptimisticFixpoint();
+ return;
+ }
+
if (isa<CallBase>(&V))
return;
- if (isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<CastInst>(&V))
- return;
- // If it is a load instruction with range metadata, use it.
- if (LoadInst *LI = dyn_cast<LoadInst>(&V))
- if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) {
- intersectKnown(getConstantRangeFromMetadata(*RangeMD));
- return;
- }
-
- // We can work with PHI and select instructions as we traverse their operands
- // during update.
- if (isa<SelectInst>(V) || isa<PHINode>(V))
- return;
-
- // Otherwise we give up.
- indicatePessimisticFixpoint();
-
- LLVM_DEBUG(dbgs() << "[AAValueConstantRange] We give up: "
- << getAssociatedValue() << "\n");
- }
-
- bool calculateBinaryOperator(
- Attributor &A, BinaryOperator *BinOp, IntegerRangeState &T,
- const Instruction *CtxI,
- SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
- Value *LHS = BinOp->getOperand(0);
- Value *RHS = BinOp->getOperand(1);
- // TODO: Allow non integers as well.
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return false;
-
- auto &LHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
- QuerriedAAs.push_back(&LHSAA);
- auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
-
- auto &RHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
- QuerriedAAs.push_back(&RHSAA);
- auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
-
- auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange);
-
- T.unionAssumed(AssumedRange);
-
- // TODO: Track a known state too.
-
- return T.isValidState();
- }
-
- bool calculateCastInst(
- Attributor &A, CastInst *CastI, IntegerRangeState &T,
- const Instruction *CtxI,
- SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
- assert(CastI->getNumOperands() == 1 && "Expected cast to be unary!");
- // TODO: Allow non integers as well.
- Value &OpV = *CastI->getOperand(0);
- if (!OpV.getType()->isIntegerTy())
- return false;
-
- auto &OpAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(OpV));
- QuerriedAAs.push_back(&OpAA);
- T.unionAssumed(
- OpAA.getAssumed().castOp(CastI->getOpcode(), getState().getBitWidth()));
- return T.isValidState();
- }
-
- bool
- calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T,
- const Instruction *CtxI,
- SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
- Value *LHS = CmpI->getOperand(0);
- Value *RHS = CmpI->getOperand(1);
- // TODO: Allow non integers as well.
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return false;
-
- auto &LHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
- QuerriedAAs.push_back(&LHSAA);
- auto &RHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
- QuerriedAAs.push_back(&RHSAA);
-
- auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
- auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
-
- // If one of them is empty set, we can't decide.
- if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet())
- return true;
-
- bool MustTrue = false, MustFalse = false;
-
- auto AllowedRegion =
- ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange);
-
- auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion(
- CmpI->getPredicate(), RHSAARange);
-
- if (AllowedRegion.intersectWith(LHSAARange).isEmptySet())
- MustFalse = true;
-
- if (SatisfyingRegion.contains(LHSAARange))
- MustTrue = true;
-
- assert((!MustTrue || !MustFalse) &&
- "Either MustTrue or MustFalse should be false!");
-
- if (MustTrue)
- T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1)));
- else if (MustFalse)
- T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0)));
- else
- T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true));
-
- LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA
- << " " << RHSAA << "\n");
-
- // TODO: Track a known state too.
- return T.isValidState();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
- IntegerRangeState &T, bool Stripped) -> bool {
- Instruction *I = dyn_cast<Instruction>(&V);
- if (!I || isa<CallBase>(I)) {
-
- // If the value is not an instruction, we query the AA from the Attributor.
- const auto &AA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V));
-
- // The clamp operator is not used here so that the program point CtxI is utilized.
- T.unionAssumed(AA.getAssumedConstantRange(A, CtxI));
-
- return T.isValidState();
- }
-
- SmallVector<const AAValueConstantRange *, 4> QuerriedAAs;
- if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
- if (!calculateBinaryOperator(A, BinOp, T, CtxI, QuerriedAAs))
- return false;
- } else if (auto *CmpI = dyn_cast<CmpInst>(I)) {
- if (!calculateCmpInst(A, CmpI, T, CtxI, QuerriedAAs))
- return false;
- } else if (auto *CastI = dyn_cast<CastInst>(I)) {
- if (!calculateCastInst(A, CastI, T, CtxI, QuerriedAAs))
- return false;
- } else {
- // Give up with other instructions.
- // TODO: Add other instructions
-
- T.indicatePessimisticFixpoint();
- return false;
- }
-
- // Catch circular reasoning in a pessimistic way for now.
- // TODO: Check how the range evolves and if we stripped anything, see also
- // AADereferenceable or AAAlign for similar situations.
- for (const AAValueConstantRange *QueriedAA : QuerriedAAs) {
- if (QueriedAA != this)
- continue;
- // If we are in a steady state we do not need to worry.
- if (T.getAssumed() == getState().getAssumed())
- continue;
- T.indicatePessimisticFixpoint();
- }
-
- return T.isValidState();
- };
-
- IntegerRangeState T(getBitWidth());
-
- if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>(
- A, getIRPosition(), *this, T, VisitValueCB, getCtxI(),
- /* UseValueSimplify */ false))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
- AAValueConstantRangeFunction(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will "
- "not be called");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) }
-};
-
-struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction {
- AAValueConstantRangeCallSite(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeFunction(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) }
-};
-
-struct AAValueConstantRangeCallSiteReturned
- : AACallSiteReturnedFromReturned<AAValueConstantRange,
- AAValueConstantRangeImpl> {
- AAValueConstantRangeCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AACallSiteReturnedFromReturned<AAValueConstantRange,
- AAValueConstantRangeImpl>(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // If it is a call instruction with range metadata, use the metadata.
- if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue()))
- if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range))
- intersectKnown(getConstantRangeFromMetadata(*RangeMD));
-
- AAValueConstantRangeImpl::initialize(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(value_range)
- }
-};
-struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating {
- AAValueConstantRangeCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeFloating(IRP, A) {}
-
+ if (isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<CastInst>(&V))
+ return;
+ // If it is a load instruction with range metadata, use it.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&V))
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) {
+ intersectKnown(getConstantRangeFromMetadata(*RangeMD));
+ return;
+ }
+
+ // We can work with PHI and select instructions as we traverse their operands
+ // during update.
+ if (isa<SelectInst>(V) || isa<PHINode>(V))
+ return;
+
+ // Otherwise we give up.
+ indicatePessimisticFixpoint();
+
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] We give up: "
+ << getAssociatedValue() << "\n");
+ }
+
+ bool calculateBinaryOperator(
+ Attributor &A, BinaryOperator *BinOp, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ // TODO: Allow non integers as well.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return false;
+
+ auto &LHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
+ QuerriedAAs.push_back(&LHSAA);
+ auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
+
+ auto &RHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
+ QuerriedAAs.push_back(&RHSAA);
+ auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
+
+ auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange);
+
+ T.unionAssumed(AssumedRange);
+
+ // TODO: Track a known state too.
+
+ return T.isValidState();
+ }
+
+ bool calculateCastInst(
+ Attributor &A, CastInst *CastI, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ assert(CastI->getNumOperands() == 1 && "Expected cast to be unary!");
+ // TODO: Allow non integers as well.
+ Value &OpV = *CastI->getOperand(0);
+ if (!OpV.getType()->isIntegerTy())
+ return false;
+
+ auto &OpAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(OpV));
+ QuerriedAAs.push_back(&OpAA);
+ T.unionAssumed(
+ OpAA.getAssumed().castOp(CastI->getOpcode(), getState().getBitWidth()));
+ return T.isValidState();
+ }
+
+ bool
+ calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ Value *LHS = CmpI->getOperand(0);
+ Value *RHS = CmpI->getOperand(1);
+ // TODO: Allow non integers as well.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return false;
+
+ auto &LHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
+ QuerriedAAs.push_back(&LHSAA);
+ auto &RHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
+ QuerriedAAs.push_back(&RHSAA);
+
+ auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
+ auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
+
+ // If one of them is empty set, we can't decide.
+ if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet())
+ return true;
+
+ bool MustTrue = false, MustFalse = false;
+
+ auto AllowedRegion =
+ ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange);
+
+ auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion(
+ CmpI->getPredicate(), RHSAARange);
+
+ if (AllowedRegion.intersectWith(LHSAARange).isEmptySet())
+ MustFalse = true;
+
+ if (SatisfyingRegion.contains(LHSAARange))
+ MustTrue = true;
+
+ assert((!MustTrue || !MustFalse) &&
+ "Either MustTrue or MustFalse should be false!");
+
+ if (MustTrue)
+ T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1)));
+ else if (MustFalse)
+ T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0)));
+ else
+ T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true));
+
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA
+ << " " << RHSAA << "\n");
+
+ // TODO: Track a known state too.
+ return T.isValidState();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
+ IntegerRangeState &T, bool Stripped) -> bool {
+ Instruction *I = dyn_cast<Instruction>(&V);
+ if (!I || isa<CallBase>(I)) {
+
+ // If the value is not an instruction, we query the AA from the Attributor.
+ const auto &AA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V));
+
+ // The clamp operator is not used here so that the program point CtxI is utilized.
+ T.unionAssumed(AA.getAssumedConstantRange(A, CtxI));
+
+ return T.isValidState();
+ }
+
+ SmallVector<const AAValueConstantRange *, 4> QuerriedAAs;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
+ if (!calculateBinaryOperator(A, BinOp, T, CtxI, QuerriedAAs))
+ return false;
+ } else if (auto *CmpI = dyn_cast<CmpInst>(I)) {
+ if (!calculateCmpInst(A, CmpI, T, CtxI, QuerriedAAs))
+ return false;
+ } else if (auto *CastI = dyn_cast<CastInst>(I)) {
+ if (!calculateCastInst(A, CastI, T, CtxI, QuerriedAAs))
+ return false;
+ } else {
+        // Give up on other instructions.
+        // TODO: Handle other instruction kinds.
+
+ T.indicatePessimisticFixpoint();
+ return false;
+ }
+
+ // Catch circular reasoning in a pessimistic way for now.
+ // TODO: Check how the range evolves and if we stripped anything, see also
+ // AADereferenceable or AAAlign for similar situations.
+ for (const AAValueConstantRange *QueriedAA : QuerriedAAs) {
+ if (QueriedAA != this)
+ continue;
+        // If we are in a steady state we do not need to worry.
+ if (T.getAssumed() == getState().getAssumed())
+ continue;
+ T.indicatePessimisticFixpoint();
+ }
+
+ return T.isValidState();
+ };
+
+ IntegerRangeState T(getBitWidth());
+
+ if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ false))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
+ AAValueConstantRangeFunction(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeImpl(IRP, A) {}
+
+  /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will "
+ "not be called");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) }
+};
+
+struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction {
+ AAValueConstantRangeCallSite(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeFunction(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) }
+};
+
+struct AAValueConstantRangeCallSiteReturned
+ : AACallSiteReturnedFromReturned<AAValueConstantRange,
+ AAValueConstantRangeImpl> {
+ AAValueConstantRangeCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AACallSiteReturnedFromReturned<AAValueConstantRange,
+ AAValueConstantRangeImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // If it is a load instruction with range metadata, use the metadata.
+ if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue()))
+ if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range))
+ intersectKnown(getConstantRangeFromMetadata(*RangeMD));
+
+ AAValueConstantRangeImpl::initialize(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(value_range)
+ }
+};
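For reference, a standalone sketch (not part of this commit) of how !range metadata is turned into a ConstantRange via getConstantRangeFromMetadata, the helper initialize() above uses; the interval is arbitrary:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  MDBuilder MDB(Ctx);
  // !range metadata encoding the half-open interval [0, 100).
  MDNode *RangeMD = MDB.createRange(APInt(32, 0), APInt(32, 100));
  // The same helper the initialize() above uses to seed the known range.
  ConstantRange CR = getConstantRangeFromMetadata(*RangeMD);
  CR.print(errs());
  errs() << "\n";
  return 0;
}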
+struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating {
+ AAValueConstantRangeCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeFloating(IRP, A) {}
+
/// See AbstractAttribute::manifest()
ChangeStatus manifest(Attributor &A) override {
return ChangeStatus::UNCHANGED;
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(value_range)
- }
-};
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(value_range)
+ }
+};
/// ------------------ Potential Values Attribute -------------------------
@@ -8025,157 +8025,157 @@ struct AANoUndefCallSiteReturned final
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noundef) }
};
-} // namespace
-
-const char AAReturnedValues::ID = 0;
-const char AANoUnwind::ID = 0;
-const char AANoSync::ID = 0;
-const char AANoFree::ID = 0;
-const char AANonNull::ID = 0;
-const char AANoRecurse::ID = 0;
-const char AAWillReturn::ID = 0;
-const char AAUndefinedBehavior::ID = 0;
-const char AANoAlias::ID = 0;
-const char AAReachability::ID = 0;
-const char AANoReturn::ID = 0;
-const char AAIsDead::ID = 0;
-const char AADereferenceable::ID = 0;
-const char AAAlign::ID = 0;
-const char AANoCapture::ID = 0;
-const char AAValueSimplify::ID = 0;
-const char AAHeapToStack::ID = 0;
-const char AAPrivatizablePtr::ID = 0;
-const char AAMemoryBehavior::ID = 0;
-const char AAMemoryLocation::ID = 0;
-const char AAValueConstantRange::ID = 0;
+} // namespace
+
+const char AAReturnedValues::ID = 0;
+const char AANoUnwind::ID = 0;
+const char AANoSync::ID = 0;
+const char AANoFree::ID = 0;
+const char AANonNull::ID = 0;
+const char AANoRecurse::ID = 0;
+const char AAWillReturn::ID = 0;
+const char AAUndefinedBehavior::ID = 0;
+const char AANoAlias::ID = 0;
+const char AAReachability::ID = 0;
+const char AANoReturn::ID = 0;
+const char AAIsDead::ID = 0;
+const char AADereferenceable::ID = 0;
+const char AAAlign::ID = 0;
+const char AANoCapture::ID = 0;
+const char AAValueSimplify::ID = 0;
+const char AAHeapToStack::ID = 0;
+const char AAPrivatizablePtr::ID = 0;
+const char AAMemoryBehavior::ID = 0;
+const char AAMemoryLocation::ID = 0;
+const char AAValueConstantRange::ID = 0;
const char AAPotentialValues::ID = 0;
const char AANoUndef::ID = 0;
-
-// Macro magic to create the static generator function for attributes that
-// follow the naming scheme.
-
-#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \
- case IRPosition::PK: \
- llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!");
-
-#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \
- case IRPosition::PK: \
- AA = new (A.Allocator) CLASS##SUFFIX(IRP, A); \
- ++NumAAs; \
- break;
-
-#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
- SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- } \
- return *AA; \
- }
-
-#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
-
-#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
-
-#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
- SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- } \
- return *AA; \
- }
-
-#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
-
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation)
-
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPrivatizablePtr)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange)
+
+// Macro magic to create the static generator function for attributes that
+// follow the naming scheme.
+
+#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \
+ case IRPosition::PK: \
+ llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!");
+
+#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \
+ case IRPosition::PK: \
+ AA = new (A.Allocator) CLASS##SUFFIX(IRP, A); \
+ ++NumAAs; \
+ break;
+
+#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
+ SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
+ SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation)
+
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPrivatizablePtr)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef)
-
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify)
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead)
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree)
-
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack)
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability)
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior)
-
-CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior)
-
-#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef SWITCH_PK_CREATE
-#undef SWITCH_PK_INV
+
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify)
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead)
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree)
+
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack)
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability)
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior)
+
+CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior)
+
+#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef SWITCH_PK_CREATE
+#undef SWITCH_PK_INV
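For orientation, this is roughly what CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind) expands to, taken in the context of this file; the elided invalid-position cases each expand to the same llvm_unreachable pattern as IRP_INVALID:

// Roughly the expansion of
// CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind). The elided
// kinds (IRP_FLOAT, IRP_ARGUMENT, IRP_RETURNED, IRP_CALL_SITE_RETURNED,
// IRP_CALL_SITE_ARGUMENT) follow the IRP_INVALID pattern.
AANoUnwind &AANoUnwind::createForPosition(const IRPosition &IRP, Attributor &A) {
  AANoUnwind *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
    llvm_unreachable("Cannot create AANoUnwind for a invalid position!");
  // ... other invalid position kinds elided ...
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AANoUnwindFunction(IRP, A);
    ++NumAAs;
    break;
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AANoUnwindCallSite(IRP, A);
    ++NumAAs;
    break;
  }
  return *AA;
}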
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp
index 73c0791e1a..b49a92ad16 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -1,47 +1,47 @@
-//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// NOTE: DO NOT USE THIS IF AVOIDABLE
-//
-// This pass is a nonce pass intended to allow manipulation of the implicitly
-// nesting pass manager. For example, it can be used to cause a CGSCC pass
-// manager to be closed prior to running a new collection of function passes.
-//
-// FIXME: This is a huge HACK. This should be removed when the pass manager's
-// nesting is made explicit instead of implicit.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-using namespace llvm;
-
-namespace {
-/// A nonce module pass used to place a barrier in a pass manager.
-///
-/// There is no mechanism for ending a CGSCC pass manager once one is started.
-/// This prevents extension points from having clear deterministic ordering
-/// when they are phrased as non-module passes.
-class BarrierNoop : public ModulePass {
-public:
- static char ID; // Pass identification.
-
- BarrierNoop() : ModulePass(ID) {
- initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override { return false; }
-};
-}
-
-ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
-
-char BarrierNoop::ID = 0;
-INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
- false, false)
+//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: DO NOT USE THIS IF AVOIDABLE
+//
+// This pass is a nonce pass intended to allow manipulation of the implicitly
+// nesting pass manager. For example, it can be used to cause a CGSCC pass
+// manager to be closed prior to running a new collection of function passes.
+//
+// FIXME: This is a huge HACK. This should be removed when the pass manager's
+// nesting is made explicit instead of implicit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+/// A nonce module pass used to place a barrier in a pass manager.
+///
+/// There is no mechanism for ending a CGSCC pass manager once one is started.
+/// This prevents extension points from having clear deterministic ordering
+/// when they are phrased as non-module passes.
+class BarrierNoop : public ModulePass {
+public:
+ static char ID; // Pass identification.
+
+ BarrierNoop() : ModulePass(ID) {
+ initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override { return false; }
+};
+}
+
+ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
+
+char BarrierNoop::ID = 0;
+INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
+ false, false)
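For context, a minimal sketch of how this barrier is typically inserted with the legacy pass manager; the surrounding pipeline is a placeholder:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO.h"
using namespace llvm;

static void buildPipeline(legacy::PassManager &PM) {
  // CGSCC passes queued before this point share one implicit CGSCC manager.
  PM.add(createBarrierNoopPass()); // close that manager here
  // Function or module passes added afterwards start from a fresh manager.
}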
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp
index 0cff82113a..c6e222a096 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp
@@ -1,61 +1,61 @@
-//===- BlockExtractor.cpp - Extracts blocks into their own functions ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass extracts the specified basic blocks from the module into their
-// own functions.
-//
-//===----------------------------------------------------------------------===//
-
+//===- BlockExtractor.cpp - Extracts blocks into their own functions ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts the specified basic blocks from the module into their
+// own functions.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/IPO/BlockExtractor.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "block-extractor"
-
-STATISTIC(NumExtracted, "Number of basic blocks extracted");
-
-static cl::opt<std::string> BlockExtractorFile(
- "extract-blocks-file", cl::value_desc("filename"),
- cl::desc("A file containing list of basic blocks to extract"), cl::Hidden);
-
-cl::opt<bool> BlockExtractorEraseFuncs("extract-blocks-erase-funcs",
- cl::desc("Erase the existing functions"),
- cl::Hidden);
-namespace {
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "block-extractor"
+
+STATISTIC(NumExtracted, "Number of basic blocks extracted");
+
+static cl::opt<std::string> BlockExtractorFile(
+ "extract-blocks-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of basic blocks to extract"), cl::Hidden);
+
+cl::opt<bool> BlockExtractorEraseFuncs("extract-blocks-erase-funcs",
+ cl::desc("Erase the existing functions"),
+ cl::Hidden);
+namespace {
class BlockExtractor {
public:
BlockExtractor(bool EraseFunctions) : EraseFunctions(EraseFunctions) {}
bool runOnModule(Module &M);
- void init(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
- &GroupsOfBlocksToExtract) {
- for (const SmallVectorImpl<BasicBlock *> &GroupOfBlocks :
- GroupsOfBlocksToExtract) {
- SmallVector<BasicBlock *, 16> NewGroup;
- NewGroup.append(GroupOfBlocks.begin(), GroupOfBlocks.end());
- GroupsOfBlocks.emplace_back(NewGroup);
- }
- if (!BlockExtractorFile.empty())
- loadFile();
- }
-
+ void init(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
+ &GroupsOfBlocksToExtract) {
+ for (const SmallVectorImpl<BasicBlock *> &GroupOfBlocks :
+ GroupsOfBlocksToExtract) {
+ SmallVector<BasicBlock *, 16> NewGroup;
+ NewGroup.append(GroupOfBlocks.begin(), GroupOfBlocks.end());
+ GroupsOfBlocks.emplace_back(NewGroup);
+ }
+ if (!BlockExtractorFile.empty())
+ loadFile();
+ }
+
private:
SmallVector<SmallVector<BasicBlock *, 16>, 4> GroupsOfBlocks;
bool EraseFunctions;
@@ -71,181 +71,181 @@ class BlockExtractorLegacyPass : public ModulePass {
BlockExtractor BE;
bool runOnModule(Module &M) override;
-public:
- static char ID;
+public:
+ static char ID;
BlockExtractorLegacyPass(const SmallVectorImpl<BasicBlock *> &BlocksToExtract,
bool EraseFunctions)
: ModulePass(ID), BE(EraseFunctions) {
- // We want one group per element of the input list.
- SmallVector<SmallVector<BasicBlock *, 16>, 4> MassagedGroupsOfBlocks;
- for (BasicBlock *BB : BlocksToExtract) {
- SmallVector<BasicBlock *, 16> NewGroup;
- NewGroup.push_back(BB);
- MassagedGroupsOfBlocks.push_back(NewGroup);
- }
+ // We want one group per element of the input list.
+ SmallVector<SmallVector<BasicBlock *, 16>, 4> MassagedGroupsOfBlocks;
+ for (BasicBlock *BB : BlocksToExtract) {
+ SmallVector<BasicBlock *, 16> NewGroup;
+ NewGroup.push_back(BB);
+ MassagedGroupsOfBlocks.push_back(NewGroup);
+ }
BE.init(MassagedGroupsOfBlocks);
- }
-
+ }
+
BlockExtractorLegacyPass(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
&GroupsOfBlocksToExtract,
bool EraseFunctions)
: ModulePass(ID), BE(EraseFunctions) {
BE.init(GroupsOfBlocksToExtract);
- }
-
+ }
+
BlockExtractorLegacyPass()
: BlockExtractorLegacyPass(SmallVector<BasicBlock *, 0>(), false) {}
};
-
-} // end anonymous namespace
-
+
+} // end anonymous namespace
+
char BlockExtractorLegacyPass::ID = 0;
INITIALIZE_PASS(BlockExtractorLegacyPass, "extract-blocks",
- "Extract basic blocks from module", false, false)
-
+ "Extract basic blocks from module", false, false)
+
ModulePass *llvm::createBlockExtractorPass() {
return new BlockExtractorLegacyPass();
}
-ModulePass *llvm::createBlockExtractorPass(
- const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions) {
+ModulePass *llvm::createBlockExtractorPass(
+ const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions) {
return new BlockExtractorLegacyPass(BlocksToExtract, EraseFunctions);
-}
-ModulePass *llvm::createBlockExtractorPass(
- const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
- &GroupsOfBlocksToExtract,
- bool EraseFunctions) {
+}
+ModulePass *llvm::createBlockExtractorPass(
+ const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
+ &GroupsOfBlocksToExtract,
+ bool EraseFunctions) {
return new BlockExtractorLegacyPass(GroupsOfBlocksToExtract, EraseFunctions);
-}
-
-/// Gets all of the blocks specified in the input file.
-void BlockExtractor::loadFile() {
- auto ErrOrBuf = MemoryBuffer::getFile(BlockExtractorFile);
- if (ErrOrBuf.getError())
- report_fatal_error("BlockExtractor couldn't load the file.");
- // Read the file.
- auto &Buf = *ErrOrBuf;
- SmallVector<StringRef, 16> Lines;
- Buf->getBuffer().split(Lines, '\n', /*MaxSplit=*/-1,
- /*KeepEmpty=*/false);
- for (const auto &Line : Lines) {
- SmallVector<StringRef, 4> LineSplit;
- Line.split(LineSplit, ' ', /*MaxSplit=*/-1,
- /*KeepEmpty=*/false);
- if (LineSplit.empty())
- continue;
- if (LineSplit.size()!=2)
- report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'");
- SmallVector<StringRef, 4> BBNames;
- LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1,
- /*KeepEmpty=*/false);
- if (BBNames.empty())
- report_fatal_error("Missing bbs name");
- BlocksByName.push_back(
- {std::string(LineSplit[0]), {BBNames.begin(), BBNames.end()}});
- }
-}
-
-/// Extracts the landing pads to make sure all of them have only one
-/// predecessor.
-void BlockExtractor::splitLandingPadPreds(Function &F) {
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (!isa<InvokeInst>(&I))
- continue;
- InvokeInst *II = cast<InvokeInst>(&I);
- BasicBlock *Parent = II->getParent();
- BasicBlock *LPad = II->getUnwindDest();
-
- // Look through the landing pad's predecessors. If one of them ends in an
- // 'invoke', then we want to split the landing pad.
- bool Split = false;
- for (auto PredBB : predecessors(LPad)) {
- if (PredBB->isLandingPad() && PredBB != Parent &&
- isa<InvokeInst>(Parent->getTerminator())) {
- Split = true;
- break;
- }
- }
-
- if (!Split)
- continue;
-
- SmallVector<BasicBlock *, 2> NewBBs;
- SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
- }
- }
-}
-
-bool BlockExtractor::runOnModule(Module &M) {
-
- bool Changed = false;
-
- // Get all the functions.
- SmallVector<Function *, 4> Functions;
- for (Function &F : M) {
- splitLandingPadPreds(F);
- Functions.push_back(&F);
- }
-
- // Get all the blocks specified in the input file.
- unsigned NextGroupIdx = GroupsOfBlocks.size();
- GroupsOfBlocks.resize(NextGroupIdx + BlocksByName.size());
- for (const auto &BInfo : BlocksByName) {
- Function *F = M.getFunction(BInfo.first);
- if (!F)
- report_fatal_error("Invalid function name specified in the input file");
- for (const auto &BBInfo : BInfo.second) {
- auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
- return BB.getName().equals(BBInfo);
- });
- if (Res == F->end())
- report_fatal_error("Invalid block name specified in the input file");
- GroupsOfBlocks[NextGroupIdx].push_back(&*Res);
- }
- ++NextGroupIdx;
- }
-
- // Extract each group of basic blocks.
- for (auto &BBs : GroupsOfBlocks) {
- SmallVector<BasicBlock *, 32> BlocksToExtractVec;
- for (BasicBlock *BB : BBs) {
- // Check if the module contains BB.
- if (BB->getParent()->getParent() != &M)
- report_fatal_error("Invalid basic block");
- LLVM_DEBUG(dbgs() << "BlockExtractor: Extracting "
- << BB->getParent()->getName() << ":" << BB->getName()
- << "\n");
- BlocksToExtractVec.push_back(BB);
- if (const InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
- BlocksToExtractVec.push_back(II->getUnwindDest());
- ++NumExtracted;
- Changed = true;
- }
- CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent());
- Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC);
- if (F)
- LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName()
- << "' in: " << F->getName() << '\n');
- else
- LLVM_DEBUG(dbgs() << "Failed to extract for group '"
- << (*BBs.begin())->getName() << "'\n");
- }
-
- // Erase the functions.
- if (EraseFunctions || BlockExtractorEraseFuncs) {
- for (Function *F : Functions) {
- LLVM_DEBUG(dbgs() << "BlockExtractor: Trying to delete " << F->getName()
- << "\n");
- F->deleteBody();
- }
- // Set linkage as ExternalLinkage to avoid erasing unreachable functions.
- for (Function &F : M)
- F.setLinkage(GlobalValue::ExternalLinkage);
- Changed = true;
- }
-
- return Changed;
-}
+}
+
+/// Gets all of the blocks specified in the input file.
+void BlockExtractor::loadFile() {
+ auto ErrOrBuf = MemoryBuffer::getFile(BlockExtractorFile);
+ if (ErrOrBuf.getError())
+ report_fatal_error("BlockExtractor couldn't load the file.");
+ // Read the file.
+ auto &Buf = *ErrOrBuf;
+ SmallVector<StringRef, 16> Lines;
+ Buf->getBuffer().split(Lines, '\n', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ for (const auto &Line : Lines) {
+ SmallVector<StringRef, 4> LineSplit;
+ Line.split(LineSplit, ' ', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ if (LineSplit.empty())
+ continue;
+ if (LineSplit.size()!=2)
+ report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'");
+ SmallVector<StringRef, 4> BBNames;
+ LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ if (BBNames.empty())
+ report_fatal_error("Missing bbs name");
+ BlocksByName.push_back(
+ {std::string(LineSplit[0]), {BBNames.begin(), BBNames.end()}});
+ }
+}
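As an illustration, a hypothetical input file for -extract-blocks-file would follow the 'funcname bb1[;bb2..]' format parsed above, one group of blocks per line (the function and block names below are made up):

foo entry;if.then
bar for.body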
+
+/// Extracts the landing pads to make sure all of them have only one
+/// predecessor.
+void BlockExtractor::splitLandingPadPreds(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (!isa<InvokeInst>(&I))
+ continue;
+ InvokeInst *II = cast<InvokeInst>(&I);
+ BasicBlock *Parent = II->getParent();
+ BasicBlock *LPad = II->getUnwindDest();
+
+ // Look through the landing pad's predecessors. If one of them ends in an
+ // 'invoke', then we want to split the landing pad.
+ bool Split = false;
+ for (auto PredBB : predecessors(LPad)) {
+ if (PredBB->isLandingPad() && PredBB != Parent &&
+ isa<InvokeInst>(Parent->getTerminator())) {
+ Split = true;
+ break;
+ }
+ }
+
+ if (!Split)
+ continue;
+
+ SmallVector<BasicBlock *, 2> NewBBs;
+ SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
+ }
+ }
+}
+
+bool BlockExtractor::runOnModule(Module &M) {
+
+ bool Changed = false;
+
+ // Get all the functions.
+ SmallVector<Function *, 4> Functions;
+ for (Function &F : M) {
+ splitLandingPadPreds(F);
+ Functions.push_back(&F);
+ }
+
+ // Get all the blocks specified in the input file.
+ unsigned NextGroupIdx = GroupsOfBlocks.size();
+ GroupsOfBlocks.resize(NextGroupIdx + BlocksByName.size());
+ for (const auto &BInfo : BlocksByName) {
+ Function *F = M.getFunction(BInfo.first);
+ if (!F)
+ report_fatal_error("Invalid function name specified in the input file");
+ for (const auto &BBInfo : BInfo.second) {
+ auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
+ return BB.getName().equals(BBInfo);
+ });
+ if (Res == F->end())
+ report_fatal_error("Invalid block name specified in the input file");
+ GroupsOfBlocks[NextGroupIdx].push_back(&*Res);
+ }
+ ++NextGroupIdx;
+ }
+
+ // Extract each group of basic blocks.
+ for (auto &BBs : GroupsOfBlocks) {
+ SmallVector<BasicBlock *, 32> BlocksToExtractVec;
+ for (BasicBlock *BB : BBs) {
+ // Check if the module contains BB.
+ if (BB->getParent()->getParent() != &M)
+ report_fatal_error("Invalid basic block");
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Extracting "
+ << BB->getParent()->getName() << ":" << BB->getName()
+ << "\n");
+ BlocksToExtractVec.push_back(BB);
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ BlocksToExtractVec.push_back(II->getUnwindDest());
+ ++NumExtracted;
+ Changed = true;
+ }
+ CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent());
+ Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC);
+ if (F)
+ LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName()
+ << "' in: " << F->getName() << '\n');
+ else
+ LLVM_DEBUG(dbgs() << "Failed to extract for group '"
+ << (*BBs.begin())->getName() << "'\n");
+ }
+
+ // Erase the functions.
+ if (EraseFunctions || BlockExtractorEraseFuncs) {
+ for (Function *F : Functions) {
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Trying to delete " << F->getName()
+ << "\n");
+ F->deleteBody();
+ }
+ // Set linkage as ExternalLinkage to avoid erasing unreachable functions.
+ for (Function &F : M)
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ Changed = true;
+ }
+
+ return Changed;
+}
bool BlockExtractorLegacyPass::runOnModule(Module &M) {
return BE.runOnModule(M);
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp
index 778e017f4d..74f11fa309 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -1,434 +1,434 @@
-//===- CalledValuePropagation.cpp - Propagate called values -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a transformation that attaches !callees metadata to
-// indirect call sites. For a given call site, the metadata, if present,
-// indicates the set of functions the call site could possibly target at
-// run-time. This metadata is added to indirect call sites when the set of
-// possible targets can be determined by analysis and is known to be small. The
-// analysis driving the transformation is similar to constant propagation and
-// makes uses of the generic sparse propagation solver.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/CalledValuePropagation.h"
-#include "llvm/Analysis/SparsePropagation.h"
-#include "llvm/Analysis/ValueLatticeUtils.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "called-value-propagation"
-
-/// The maximum number of functions to track per lattice value. Once the number
-/// of functions a call site can possibly target exceeds this threshold, its
-/// lattice value becomes overdefined. The number of possible lattice values is
-/// bounded by Ch(F, M), where F is the number of functions in the module and M
-/// is MaxFunctionsPerValue. As such, this value should be kept very small. We
-/// likely can't do anything useful for call sites with a large number of
-/// possible targets, anyway.
-static cl::opt<unsigned> MaxFunctionsPerValue(
- "cvp-max-functions-per-value", cl::Hidden, cl::init(4),
- cl::desc("The maximum number of functions to track per lattice value"));
-
-namespace {
-/// To enable interprocedural analysis, we assign LLVM values to the following
-/// groups. The register group represents SSA registers, the return group
-/// represents the return values of functions, and the memory group represents
-/// in-memory values. An LLVM Value can technically be in more than one group.
-/// It's necessary to distinguish these groups so we can, for example, track a
-/// global variable separately from the value stored at its location.
-enum class IPOGrouping { Register, Return, Memory };
-
-/// Our LatticeKeys are PointerIntPairs composed of LLVM values and groupings.
-using CVPLatticeKey = PointerIntPair<Value *, 2, IPOGrouping>;
-
-/// The lattice value type used by our custom lattice function. It holds the
-/// lattice state, and a set of functions.
-class CVPLatticeVal {
-public:
- /// The states of the lattice values. Only the FunctionSet state is
- /// interesting. It indicates the set of functions to which an LLVM value may
- /// refer.
- enum CVPLatticeStateTy { Undefined, FunctionSet, Overdefined, Untracked };
-
- /// Comparator for sorting the functions set. We want to keep the order
- /// deterministic for testing, etc.
- struct Compare {
- bool operator()(const Function *LHS, const Function *RHS) const {
- return LHS->getName() < RHS->getName();
- }
- };
-
- CVPLatticeVal() : LatticeState(Undefined) {}
- CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
- CVPLatticeVal(std::vector<Function *> &&Functions)
- : LatticeState(FunctionSet), Functions(std::move(Functions)) {
- assert(llvm::is_sorted(this->Functions, Compare()));
- }
-
- /// Get a reference to the functions held by this lattice value. The number
- /// of functions will be zero for states other than FunctionSet.
- const std::vector<Function *> &getFunctions() const {
- return Functions;
- }
-
- /// Returns true if the lattice value is in the FunctionSet state.
- bool isFunctionSet() const { return LatticeState == FunctionSet; }
-
- bool operator==(const CVPLatticeVal &RHS) const {
- return LatticeState == RHS.LatticeState && Functions == RHS.Functions;
- }
-
- bool operator!=(const CVPLatticeVal &RHS) const {
- return LatticeState != RHS.LatticeState || Functions != RHS.Functions;
- }
-
-private:
- /// Holds the state this lattice value is in.
- CVPLatticeStateTy LatticeState;
-
- /// Holds functions indicating the possible targets of call sites. This set
- /// is empty for lattice values in the undefined, overdefined, and untracked
- /// states. The maximum size of the set is controlled by
- /// MaxFunctionsPerValue. Since most LLVM values are expected to be in
- /// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be
- /// small and efficiently copyable.
- // FIXME: This could be a TinyPtrVector and/or merge with LatticeState.
- std::vector<Function *> Functions;
-};
-
-/// The custom lattice function used by the generic sparse propagation solver.
-/// It handles merging lattice values and computing new lattice values for
-/// constants, arguments, values returned from trackable functions, and values
-/// located in trackable global variables. It also computes the lattice values
-/// that change as a result of executing instructions.
-class CVPLatticeFunc
- : public AbstractLatticeFunction<CVPLatticeKey, CVPLatticeVal> {
-public:
- CVPLatticeFunc()
- : AbstractLatticeFunction(CVPLatticeVal(CVPLatticeVal::Undefined),
- CVPLatticeVal(CVPLatticeVal::Overdefined),
- CVPLatticeVal(CVPLatticeVal::Untracked)) {}
-
- /// Compute and return a CVPLatticeVal for the given CVPLatticeKey.
- CVPLatticeVal ComputeLatticeVal(CVPLatticeKey Key) override {
- switch (Key.getInt()) {
- case IPOGrouping::Register:
- if (isa<Instruction>(Key.getPointer())) {
- return getUndefVal();
- } else if (auto *A = dyn_cast<Argument>(Key.getPointer())) {
- if (canTrackArgumentsInterprocedurally(A->getParent()))
- return getUndefVal();
- } else if (auto *C = dyn_cast<Constant>(Key.getPointer())) {
- return computeConstant(C);
- }
- return getOverdefinedVal();
- case IPOGrouping::Memory:
- case IPOGrouping::Return:
- if (auto *GV = dyn_cast<GlobalVariable>(Key.getPointer())) {
- if (canTrackGlobalVariableInterprocedurally(GV))
- return computeConstant(GV->getInitializer());
- } else if (auto *F = cast<Function>(Key.getPointer()))
- if (canTrackReturnsInterprocedurally(F))
- return getUndefVal();
- }
- return getOverdefinedVal();
- }
-
- /// Merge the two given lattice values. The interesting cases are merging two
- /// FunctionSet values and a FunctionSet value with an Undefined value. For
- /// these cases, we simply union the function sets. If the size of the union
- /// is greater than the maximum functions we track, the merged value is
- /// overdefined.
- CVPLatticeVal MergeValues(CVPLatticeVal X, CVPLatticeVal Y) override {
- if (X == getOverdefinedVal() || Y == getOverdefinedVal())
- return getOverdefinedVal();
- if (X == getUndefVal() && Y == getUndefVal())
- return getUndefVal();
- std::vector<Function *> Union;
- std::set_union(X.getFunctions().begin(), X.getFunctions().end(),
- Y.getFunctions().begin(), Y.getFunctions().end(),
- std::back_inserter(Union), CVPLatticeVal::Compare{});
- if (Union.size() > MaxFunctionsPerValue)
- return getOverdefinedVal();
- return CVPLatticeVal(std::move(Union));
- }
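A standalone illustration (plain C++, not LLVM code) of the sorted set_union-with-comparator pattern used by MergeValues above; both inputs must already be sorted by the same comparator:

#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

int main() {
  // Both inputs are sorted by the comparator; duplicates appear once in Union.
  auto Less = [](const std::string &L, const std::string &R) { return L < R; };
  std::vector<std::string> X = {"bar", "foo"};
  std::vector<std::string> Y = {"baz", "foo"};
  std::vector<std::string> Union;
  std::set_union(X.begin(), X.end(), Y.begin(), Y.end(),
                 std::back_inserter(Union), Less);
  // Union == {"bar", "baz", "foo"}
  return 0;
}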
-
- /// Compute the lattice values that change as a result of executing the given
- /// instruction. The changed values are stored in \p ChangedValues. We handle
- /// just a few kinds of instructions since we're only propagating values that
- /// can be called.
- void ComputeInstructionState(
- Instruction &I, DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) override {
- switch (I.getOpcode()) {
- case Instruction::Call:
- case Instruction::Invoke:
- return visitCallBase(cast<CallBase>(I), ChangedValues, SS);
- case Instruction::Load:
- return visitLoad(*cast<LoadInst>(&I), ChangedValues, SS);
- case Instruction::Ret:
- return visitReturn(*cast<ReturnInst>(&I), ChangedValues, SS);
- case Instruction::Select:
- return visitSelect(*cast<SelectInst>(&I), ChangedValues, SS);
- case Instruction::Store:
- return visitStore(*cast<StoreInst>(&I), ChangedValues, SS);
- default:
- return visitInst(I, ChangedValues, SS);
- }
- }
-
- /// Print the given CVPLatticeVal to the specified stream.
- void PrintLatticeVal(CVPLatticeVal LV, raw_ostream &OS) override {
- if (LV == getUndefVal())
- OS << "Undefined ";
- else if (LV == getOverdefinedVal())
- OS << "Overdefined";
- else if (LV == getUntrackedVal())
- OS << "Untracked ";
- else
- OS << "FunctionSet";
- }
-
- /// Print the given CVPLatticeKey to the specified stream.
- void PrintLatticeKey(CVPLatticeKey Key, raw_ostream &OS) override {
- if (Key.getInt() == IPOGrouping::Register)
- OS << "<reg> ";
- else if (Key.getInt() == IPOGrouping::Memory)
- OS << "<mem> ";
- else if (Key.getInt() == IPOGrouping::Return)
- OS << "<ret> ";
- if (isa<Function>(Key.getPointer()))
- OS << Key.getPointer()->getName();
- else
- OS << *Key.getPointer();
- }
-
- /// We collect a set of indirect calls when visiting call sites. This method
- /// returns a reference to that set.
- SmallPtrSetImpl<CallBase *> &getIndirectCalls() { return IndirectCalls; }
-
-private:
- /// Holds the indirect calls we encounter during the analysis. We will attach
- /// metadata to these calls after the analysis indicating the functions the
- /// calls can possibly target.
- SmallPtrSet<CallBase *, 32> IndirectCalls;
-
- /// Compute a new lattice value for the given constant. The constant, after
- /// stripping any pointer casts, should be a Function. We ignore null
- /// pointers as an optimization, since calling these values is undefined
- /// behavior.
- CVPLatticeVal computeConstant(Constant *C) {
- if (isa<ConstantPointerNull>(C))
- return CVPLatticeVal(CVPLatticeVal::FunctionSet);
- if (auto *F = dyn_cast<Function>(C->stripPointerCasts()))
- return CVPLatticeVal({F});
- return getOverdefinedVal();
- }
-
- /// Handle return instructions. The function's return state is the merge of
- /// the returned value state and the function's return state.
- void visitReturn(ReturnInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- Function *F = I.getParent()->getParent();
- if (F->getReturnType()->isVoidTy())
- return;
- auto RegI = CVPLatticeKey(I.getReturnValue(), IPOGrouping::Register);
- auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
- ChangedValues[RetF] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
- }
-
- /// Handle call sites. The state of a called function's formal arguments is
-  /// the merge of the argument state with the call site's corresponding actual
- /// argument state. The call site state is the merge of the call site state
- /// with the returned value state of the called function.
- void visitCallBase(CallBase &CB,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- Function *F = CB.getCalledFunction();
- auto RegI = CVPLatticeKey(&CB, IPOGrouping::Register);
-
- // If this is an indirect call, save it so we can quickly revisit it when
- // attaching metadata.
- if (!F)
- IndirectCalls.insert(&CB);
-
- // If we can't track the function's return values, there's nothing to do.
- if (!F || !canTrackReturnsInterprocedurally(F)) {
-      // Void return; no need to create and update the CVPLattice state as no
-      // one can use it.
- if (CB.getType()->isVoidTy())
- return;
- ChangedValues[RegI] = getOverdefinedVal();
- return;
- }
-
- // Inform the solver that the called function is executable, and perform
- // the merges for the arguments and return value.
- SS.MarkBlockExecutable(&F->front());
- auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
- for (Argument &A : F->args()) {
- auto RegFormal = CVPLatticeKey(&A, IPOGrouping::Register);
- auto RegActual =
- CVPLatticeKey(CB.getArgOperand(A.getArgNo()), IPOGrouping::Register);
- ChangedValues[RegFormal] =
- MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
- }
-
-    // Void return; no need to create and update the CVPLattice state as no one
-    // can use it.
- if (CB.getType()->isVoidTy())
- return;
-
- ChangedValues[RegI] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
- }
-
- /// Handle select instructions. The select instruction state is the merge the
- /// true and false value states.
- void visitSelect(SelectInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
- auto RegT = CVPLatticeKey(I.getTrueValue(), IPOGrouping::Register);
- auto RegF = CVPLatticeKey(I.getFalseValue(), IPOGrouping::Register);
- ChangedValues[RegI] =
- MergeValues(SS.getValueState(RegT), SS.getValueState(RegF));
- }
-
- /// Handle load instructions. If the pointer operand of the load is a global
- /// variable, we attempt to track the value. The loaded value state is the
- /// merge of the loaded value state with the global variable state.
- void visitLoad(LoadInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
- if (auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand())) {
- auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
- ChangedValues[RegI] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
- } else {
- ChangedValues[RegI] = getOverdefinedVal();
- }
- }
-
- /// Handle store instructions. If the pointer operand of the store is a
- /// global variable, we attempt to track the value. The global variable state
- /// is the merge of the stored value state with the global variable state.
- void visitStore(StoreInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand());
- if (!GV)
- return;
- auto RegI = CVPLatticeKey(I.getValueOperand(), IPOGrouping::Register);
- auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
- ChangedValues[MemGV] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
- }
-
- /// Handle all other instructions. All other instructions are marked
- /// overdefined.
- void visitInst(Instruction &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- // Simply bail if this instruction has no user.
- if (I.use_empty())
- return;
- auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
- ChangedValues[RegI] = getOverdefinedVal();
- }
-};
-} // namespace
-
-namespace llvm {
-/// A specialization of LatticeKeyInfo for CVPLatticeKeys. The generic solver
-/// must translate between LatticeKeys and LLVM Values when adding Values to
-/// its work list and inspecting the state of control-flow related values.
-template <> struct LatticeKeyInfo<CVPLatticeKey> {
- static inline Value *getValueFromLatticeKey(CVPLatticeKey Key) {
- return Key.getPointer();
- }
- static inline CVPLatticeKey getLatticeKeyFromValue(Value *V) {
- return CVPLatticeKey(V, IPOGrouping::Register);
- }
-};
-} // namespace llvm
-
-static bool runCVP(Module &M) {
- // Our custom lattice function and generic sparse propagation solver.
- CVPLatticeFunc Lattice;
- SparseSolver<CVPLatticeKey, CVPLatticeVal> Solver(&Lattice);
-
- // For each function in the module, if we can't track its arguments, let the
- // generic solver assume it is executable.
- for (Function &F : M)
- if (!F.isDeclaration() && !canTrackArgumentsInterprocedurally(&F))
- Solver.MarkBlockExecutable(&F.front());
-
-  // Solve our custom lattice. In doing so, we will also build a set of
- // indirect call sites.
- Solver.Solve();
-
- // Attach metadata to the indirect call sites that were collected indicating
- // the set of functions they can possibly target.
- bool Changed = false;
- MDBuilder MDB(M.getContext());
- for (CallBase *C : Lattice.getIndirectCalls()) {
- auto RegI = CVPLatticeKey(C->getCalledOperand(), IPOGrouping::Register);
- CVPLatticeVal LV = Solver.getExistingValueState(RegI);
- if (!LV.isFunctionSet() || LV.getFunctions().empty())
- continue;
- MDNode *Callees = MDB.createCallees(LV.getFunctions());
- C->setMetadata(LLVMContext::MD_callees, Callees);
- Changed = true;
- }
-
- return Changed;
-}
-
-PreservedAnalyses CalledValuePropagationPass::run(Module &M,
- ModuleAnalysisManager &) {
- runCVP(M);
- return PreservedAnalyses::all();
-}
-
-namespace {
-class CalledValuePropagationLegacyPass : public ModulePass {
-public:
- static char ID;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- CalledValuePropagationLegacyPass() : ModulePass(ID) {
- initializeCalledValuePropagationLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- return runCVP(M);
- }
-};
-} // namespace
-
-char CalledValuePropagationLegacyPass::ID = 0;
-INITIALIZE_PASS(CalledValuePropagationLegacyPass, "called-value-propagation",
- "Called Value Propagation", false, false)
-
-ModulePass *llvm::createCalledValuePropagationPass() {
- return new CalledValuePropagationLegacyPass();
-}
+//===- CalledValuePropagation.cpp - Propagate called values -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that attaches !callees metadata to
+// indirect call sites. For a given call site, the metadata, if present,
+// indicates the set of functions the call site could possibly target at
+// run-time. This metadata is added to indirect call sites when the set of
+// possible targets can be determined by analysis and is known to be small. The
+// analysis driving the transformation is similar to constant propagation and
+// makes uses of the generic sparse propagation solver.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/CalledValuePropagation.h"
+#include "llvm/Analysis/SparsePropagation.h"
+#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "called-value-propagation"
+
+/// The maximum number of functions to track per lattice value. Once the number
+/// of functions a call site can possibly target exceeds this threshold, its
+/// lattice value becomes overdefined. The number of possible lattice values is
+/// bounded by Ch(F, M), where F is the number of functions in the module and M
+/// is MaxFunctionsPerValue. As such, this value should be kept very small. We
+/// likely can't do anything useful for call sites with a large number of
+/// possible targets, anyway.
+static cl::opt<unsigned> MaxFunctionsPerValue(
+ "cvp-max-functions-per-value", cl::Hidden, cl::init(4),
+ cl::desc("The maximum number of functions to track per lattice value"));
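Assuming an input module test.ll, an illustrative way to exercise this threshold with the legacy pass manager is:

  opt -called-value-propagation -cvp-max-functions-per-value=2 -S test.ll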
+
+namespace {
+/// To enable interprocedural analysis, we assign LLVM values to the following
+/// groups. The register group represents SSA registers, the return group
+/// represents the return values of functions, and the memory group represents
+/// in-memory values. An LLVM Value can technically be in more than one group.
+/// It's necessary to distinguish these groups so we can, for example, track a
+/// global variable separately from the value stored at its location.
+enum class IPOGrouping { Register, Return, Memory };
+
+/// Our LatticeKeys are PointerIntPairs composed of LLVM values and groupings.
+using CVPLatticeKey = PointerIntPair<Value *, 2, IPOGrouping>;
+
+/// The lattice value type used by our custom lattice function. It holds the
+/// lattice state, and a set of functions.
+class CVPLatticeVal {
+public:
+ /// The states of the lattice values. Only the FunctionSet state is
+ /// interesting. It indicates the set of functions to which an LLVM value may
+ /// refer.
+ enum CVPLatticeStateTy { Undefined, FunctionSet, Overdefined, Untracked };
+
+ /// Comparator for sorting the functions set. We want to keep the order
+ /// deterministic for testing, etc.
+ struct Compare {
+ bool operator()(const Function *LHS, const Function *RHS) const {
+ return LHS->getName() < RHS->getName();
+ }
+ };
+
+ CVPLatticeVal() : LatticeState(Undefined) {}
+ CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
+ CVPLatticeVal(std::vector<Function *> &&Functions)
+ : LatticeState(FunctionSet), Functions(std::move(Functions)) {
+ assert(llvm::is_sorted(this->Functions, Compare()));
+ }
+
+ /// Get a reference to the functions held by this lattice value. The number
+ /// of functions will be zero for states other than FunctionSet.
+ const std::vector<Function *> &getFunctions() const {
+ return Functions;
+ }
+
+ /// Returns true if the lattice value is in the FunctionSet state.
+ bool isFunctionSet() const { return LatticeState == FunctionSet; }
+
+ bool operator==(const CVPLatticeVal &RHS) const {
+ return LatticeState == RHS.LatticeState && Functions == RHS.Functions;
+ }
+
+ bool operator!=(const CVPLatticeVal &RHS) const {
+ return LatticeState != RHS.LatticeState || Functions != RHS.Functions;
+ }
+
+private:
+ /// Holds the state this lattice value is in.
+ CVPLatticeStateTy LatticeState;
+
+ /// Holds functions indicating the possible targets of call sites. This set
+ /// is empty for lattice values in the undefined, overdefined, and untracked
+ /// states. The maximum size of the set is controlled by
+ /// MaxFunctionsPerValue. Since most LLVM values are expected to be in
+ /// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be
+ /// small and efficiently copyable.
+ // FIXME: This could be a TinyPtrVector and/or merge with LatticeState.
+ std::vector<Function *> Functions;
+};
+
+/// The custom lattice function used by the generic sparse propagation solver.
+/// It handles merging lattice values and computing new lattice values for
+/// constants, arguments, values returned from trackable functions, and values
+/// located in trackable global variables. It also computes the lattice values
+/// that change as a result of executing instructions.
+class CVPLatticeFunc
+ : public AbstractLatticeFunction<CVPLatticeKey, CVPLatticeVal> {
+public:
+ CVPLatticeFunc()
+ : AbstractLatticeFunction(CVPLatticeVal(CVPLatticeVal::Undefined),
+ CVPLatticeVal(CVPLatticeVal::Overdefined),
+ CVPLatticeVal(CVPLatticeVal::Untracked)) {}
+
+ /// Compute and return a CVPLatticeVal for the given CVPLatticeKey.
+ CVPLatticeVal ComputeLatticeVal(CVPLatticeKey Key) override {
+ switch (Key.getInt()) {
+ case IPOGrouping::Register:
+ if (isa<Instruction>(Key.getPointer())) {
+ return getUndefVal();
+ } else if (auto *A = dyn_cast<Argument>(Key.getPointer())) {
+ if (canTrackArgumentsInterprocedurally(A->getParent()))
+ return getUndefVal();
+ } else if (auto *C = dyn_cast<Constant>(Key.getPointer())) {
+ return computeConstant(C);
+ }
+ return getOverdefinedVal();
+ case IPOGrouping::Memory:
+ case IPOGrouping::Return:
+ if (auto *GV = dyn_cast<GlobalVariable>(Key.getPointer())) {
+ if (canTrackGlobalVariableInterprocedurally(GV))
+ return computeConstant(GV->getInitializer());
+ } else if (auto *F = cast<Function>(Key.getPointer()))
+ if (canTrackReturnsInterprocedurally(F))
+ return getUndefVal();
+ }
+ return getOverdefinedVal();
+ }
+
+ /// Merge the two given lattice values. The interesting cases are merging two
+ /// FunctionSet values and a FunctionSet value with an Undefined value. For
+ /// these cases, we simply union the function sets. If the size of the union
+ /// is greater than the maximum number of functions we track, the merged value is
+ /// overdefined.
+ CVPLatticeVal MergeValues(CVPLatticeVal X, CVPLatticeVal Y) override {
+ if (X == getOverdefinedVal() || Y == getOverdefinedVal())
+ return getOverdefinedVal();
+ if (X == getUndefVal() && Y == getUndefVal())
+ return getUndefVal();
+ std::vector<Function *> Union;
+ std::set_union(X.getFunctions().begin(), X.getFunctions().end(),
+ Y.getFunctions().begin(), Y.getFunctions().end(),
+ std::back_inserter(Union), CVPLatticeVal::Compare{});
+ if (Union.size() > MaxFunctionsPerValue)
+ return getOverdefinedVal();
+ return CVPLatticeVal(std::move(Union));
+ }
+
+ /// Compute the lattice values that change as a result of executing the given
+ /// instruction. The changed values are stored in \p ChangedValues. We handle
+ /// just a few kinds of instructions since we're only propagating values that
+ /// can be called.
+ void ComputeInstructionState(
+ Instruction &I, DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) override {
+ switch (I.getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke:
+ return visitCallBase(cast<CallBase>(I), ChangedValues, SS);
+ case Instruction::Load:
+ return visitLoad(*cast<LoadInst>(&I), ChangedValues, SS);
+ case Instruction::Ret:
+ return visitReturn(*cast<ReturnInst>(&I), ChangedValues, SS);
+ case Instruction::Select:
+ return visitSelect(*cast<SelectInst>(&I), ChangedValues, SS);
+ case Instruction::Store:
+ return visitStore(*cast<StoreInst>(&I), ChangedValues, SS);
+ default:
+ return visitInst(I, ChangedValues, SS);
+ }
+ }
+
+ /// Print the given CVPLatticeVal to the specified stream.
+ void PrintLatticeVal(CVPLatticeVal LV, raw_ostream &OS) override {
+ if (LV == getUndefVal())
+ OS << "Undefined ";
+ else if (LV == getOverdefinedVal())
+ OS << "Overdefined";
+ else if (LV == getUntrackedVal())
+ OS << "Untracked ";
+ else
+ OS << "FunctionSet";
+ }
+
+ /// Print the given CVPLatticeKey to the specified stream.
+ void PrintLatticeKey(CVPLatticeKey Key, raw_ostream &OS) override {
+ if (Key.getInt() == IPOGrouping::Register)
+ OS << "<reg> ";
+ else if (Key.getInt() == IPOGrouping::Memory)
+ OS << "<mem> ";
+ else if (Key.getInt() == IPOGrouping::Return)
+ OS << "<ret> ";
+ if (isa<Function>(Key.getPointer()))
+ OS << Key.getPointer()->getName();
+ else
+ OS << *Key.getPointer();
+ }
+
+ /// We collect a set of indirect calls when visiting call sites. This method
+ /// returns a reference to that set.
+ SmallPtrSetImpl<CallBase *> &getIndirectCalls() { return IndirectCalls; }
+
+private:
+ /// Holds the indirect calls we encounter during the analysis. We will attach
+ /// metadata to these calls after the analysis indicating the functions the
+ /// calls can possibly target.
+ SmallPtrSet<CallBase *, 32> IndirectCalls;
+
+ /// Compute a new lattice value for the given constant. The constant, after
+ /// stripping any pointer casts, should be a Function. We ignore null
+ /// pointers as an optimization, since calling these values is undefined
+ /// behavior.
+ CVPLatticeVal computeConstant(Constant *C) {
+ if (isa<ConstantPointerNull>(C))
+ return CVPLatticeVal(CVPLatticeVal::FunctionSet);
+ if (auto *F = dyn_cast<Function>(C->stripPointerCasts()))
+ return CVPLatticeVal({F});
+ return getOverdefinedVal();
+ }
+
+ /// Handle return instructions. The function's return state is the merge of
+ /// the returned value state and the function's current return state.
+ void visitReturn(ReturnInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ Function *F = I.getParent()->getParent();
+ if (F->getReturnType()->isVoidTy())
+ return;
+ auto RegI = CVPLatticeKey(I.getReturnValue(), IPOGrouping::Register);
+ auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
+ ChangedValues[RetF] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
+ }
+
+ /// Handle call sites. The state of a called function's formal arguments is
+ /// the merge of the argument state with the call site's corresponding actual
+ /// argument state. The call site state is the merge of the call site state
+ /// with the returned value state of the called function.
+ void visitCallBase(CallBase &CB,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ Function *F = CB.getCalledFunction();
+ auto RegI = CVPLatticeKey(&CB, IPOGrouping::Register);
+
+ // If this is an indirect call, save it so we can quickly revisit it when
+ // attaching metadata.
+ if (!F)
+ IndirectCalls.insert(&CB);
+
+ // If we can't track the function's return values, there's nothing to do.
+ if (!F || !canTrackReturnsInterprocedurally(F)) {
+ // Void return; no need to create or update CVPLattice state, as no one
+ // can use it.
+ if (CB.getType()->isVoidTy())
+ return;
+ ChangedValues[RegI] = getOverdefinedVal();
+ return;
+ }
+
+ // Inform the solver that the called function is executable, and perform
+ // the merges for the arguments and return value.
+ SS.MarkBlockExecutable(&F->front());
+ auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
+ for (Argument &A : F->args()) {
+ auto RegFormal = CVPLatticeKey(&A, IPOGrouping::Register);
+ auto RegActual =
+ CVPLatticeKey(CB.getArgOperand(A.getArgNo()), IPOGrouping::Register);
+ ChangedValues[RegFormal] =
+ MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
+ }
+
+ // Void return; no need to create or update CVPLattice state, as no one can
+ // use it.
+ if (CB.getType()->isVoidTy())
+ return;
+
+ ChangedValues[RegI] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
+ }
+
+ /// Handle select instructions. The select instruction state is the merge of the
+ /// true and false value states.
+ void visitSelect(SelectInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+ auto RegT = CVPLatticeKey(I.getTrueValue(), IPOGrouping::Register);
+ auto RegF = CVPLatticeKey(I.getFalseValue(), IPOGrouping::Register);
+ ChangedValues[RegI] =
+ MergeValues(SS.getValueState(RegT), SS.getValueState(RegF));
+ }
+
+ /// Handle load instructions. If the pointer operand of the load is a global
+ /// variable, we attempt to track the value. The loaded value state is the
+ /// merge of the loaded value state with the global variable state.
+ void visitLoad(LoadInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+ if (auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand())) {
+ auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
+ ChangedValues[RegI] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
+ } else {
+ ChangedValues[RegI] = getOverdefinedVal();
+ }
+ }
+
+ /// Handle store instructions. If the pointer operand of the store is a
+ /// global variable, we attempt to track the value. The global variable state
+ /// is the merge of the stored value state with the global variable state.
+ void visitStore(StoreInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand());
+ if (!GV)
+ return;
+ auto RegI = CVPLatticeKey(I.getValueOperand(), IPOGrouping::Register);
+ auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
+ ChangedValues[MemGV] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
+ }
+
+ /// Handle all other instructions. All other instructions are marked
+ /// overdefined.
+ void visitInst(Instruction &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ // Simply bail if this instruction has no user.
+ if (I.use_empty())
+ return;
+ auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+ ChangedValues[RegI] = getOverdefinedVal();
+ }
+};
+} // namespace
+
+namespace llvm {
+/// A specialization of LatticeKeyInfo for CVPLatticeKeys. The generic solver
+/// must translate between LatticeKeys and LLVM Values when adding Values to
+/// its work list and inspecting the state of control-flow related values.
+template <> struct LatticeKeyInfo<CVPLatticeKey> {
+ static inline Value *getValueFromLatticeKey(CVPLatticeKey Key) {
+ return Key.getPointer();
+ }
+ static inline CVPLatticeKey getLatticeKeyFromValue(Value *V) {
+ return CVPLatticeKey(V, IPOGrouping::Register);
+ }
+};
+} // namespace llvm
+
+static bool runCVP(Module &M) {
+ // Our custom lattice function and generic sparse propagation solver.
+ CVPLatticeFunc Lattice;
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> Solver(&Lattice);
+
+ // For each function in the module, if we can't track its arguments, let the
+ // generic solver assume it is executable.
+ for (Function &F : M)
+ if (!F.isDeclaration() && !canTrackArgumentsInterprocedurally(&F))
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Solve our custom lattice. In doing so, we will also build a set of
+ // indirect call sites.
+ Solver.Solve();
+
+ // Attach metadata to the indirect call sites that were collected indicating
+ // the set of functions they can possibly target.
+ bool Changed = false;
+ MDBuilder MDB(M.getContext());
+ for (CallBase *C : Lattice.getIndirectCalls()) {
+ auto RegI = CVPLatticeKey(C->getCalledOperand(), IPOGrouping::Register);
+ CVPLatticeVal LV = Solver.getExistingValueState(RegI);
+ if (!LV.isFunctionSet() || LV.getFunctions().empty())
+ continue;
+ MDNode *Callees = MDB.createCallees(LV.getFunctions());
+ C->setMetadata(LLVMContext::MD_callees, Callees);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses CalledValuePropagationPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ runCVP(M);
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class CalledValuePropagationLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ CalledValuePropagationLegacyPass() : ModulePass(ID) {
+ initializeCalledValuePropagationLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ return runCVP(M);
+ }
+};
+} // namespace
+
+char CalledValuePropagationLegacyPass::ID = 0;
+INITIALIZE_PASS(CalledValuePropagationLegacyPass, "called-value-propagation",
+ "Called Value Propagation", false, false)
+
+ModulePass *llvm::createCalledValuePropagationPass() {
+ return new CalledValuePropagationLegacyPass();
+}
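
As a rough illustration of how a later pass might consume the !callees metadata this file attaches, here is a minimal sketch against the LLVM 12 C++ API; the helper name possibleCallees is hypothetical and not part of this patch.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Collect the possible targets recorded on an indirect call, if any.
static SmallVector<Function *, 4> possibleCallees(CallBase &CB) {
  SmallVector<Function *, 4> Callees;
  if (MDNode *MD = CB.getMetadata(LLVMContext::MD_callees))
    for (const MDOperand &Op : MD->operands())
      // Each operand wraps a Function constant; the pass emits them sorted
      // by name to keep the output deterministic.
      if (auto *CAM = dyn_cast_or_null<ConstantAsMetadata>(Op.get()))
        if (auto *F = dyn_cast<Function>(CAM->getValue()))
          Callees.push_back(F);
  return Callees;
}

An empty result simply means no !callees metadata was attached, for example because the lattice value went overdefined or the target set exceeded cvp-max-functions-per-value.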
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp
index 41f4f4da81..8e81f4bad4 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp
@@ -1,288 +1,288 @@
-//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interface to a pass that merges duplicate global
-// constants together into a single constant that is shared. This is useful
-// because some passes (e.g., TraceValues) insert a lot of string constants into
-// the program, regardless of whether or not an existing string is available.
-//
-// Algorithm: ConstantMerge is designed to build up a map of available constants
-// and eliminate duplicates when it is initialized.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ConstantMerge.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/IPO.h"
-#include <algorithm>
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "constmerge"
-
-STATISTIC(NumIdenticalMerged, "Number of identical global constants merged");
-
-/// Find values that are marked as llvm.used.
-static void FindUsedValues(GlobalVariable *LLVMUsed,
- SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
- if (!LLVMUsed) return;
- ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
-
- for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
- Value *Operand = Inits->getOperand(i)->stripPointerCasts();
- GlobalValue *GV = cast<GlobalValue>(Operand);
- UsedValues.insert(GV);
- }
-}
-
-// True if A is better than B.
-static bool IsBetterCanonical(const GlobalVariable &A,
- const GlobalVariable &B) {
- if (!A.hasLocalLinkage() && B.hasLocalLinkage())
- return true;
-
- if (A.hasLocalLinkage() && !B.hasLocalLinkage())
- return false;
-
- return A.hasGlobalUnnamedAddr();
-}
-
-static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- GV->getAllMetadata(MDs);
- for (const auto &V : MDs)
- if (V.first != LLVMContext::MD_dbg)
- return true;
- return false;
-}
-
-static void copyDebugLocMetadata(const GlobalVariable *From,
- GlobalVariable *To) {
- SmallVector<DIGlobalVariableExpression *, 1> MDs;
- From->getDebugInfo(MDs);
- for (auto MD : MDs)
- To->addDebugInfo(MD);
-}
-
-static Align getAlign(GlobalVariable *GV) {
- return GV->getAlign().getValueOr(
- GV->getParent()->getDataLayout().getPreferredAlign(GV));
-}
-
-static bool
-isUnmergeableGlobal(GlobalVariable *GV,
- const SmallPtrSetImpl<const GlobalValue *> &UsedGlobals) {
- // Only process constants with initializers in the default address space.
- return !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
- GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface to a pass that merges duplicate global
+// constants together into a single constant that is shared. This is useful
+// because some passes (e.g., TraceValues) insert a lot of string constants into
+// the program, regardless of whether or not an existing string is available.
+//
+// Algorithm: ConstantMerge is designed to build up a map of available constants
+// and eliminate duplicates when it is initialized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ConstantMerge.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/IPO.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "constmerge"
+
+STATISTIC(NumIdenticalMerged, "Number of identical global constants merged");
+
+/// Find values that are marked as llvm.used.
+static void FindUsedValues(GlobalVariable *LLVMUsed,
+ SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
+ if (!LLVMUsed) return;
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
+ Value *Operand = Inits->getOperand(i)->stripPointerCasts();
+ GlobalValue *GV = cast<GlobalValue>(Operand);
+ UsedValues.insert(GV);
+ }
+}
+
+// True if A is better than B.
+static bool IsBetterCanonical(const GlobalVariable &A,
+ const GlobalVariable &B) {
+ if (!A.hasLocalLinkage() && B.hasLocalLinkage())
+ return true;
+
+ if (A.hasLocalLinkage() && !B.hasLocalLinkage())
+ return false;
+
+ return A.hasGlobalUnnamedAddr();
+}
+
+static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ GV->getAllMetadata(MDs);
+ for (const auto &V : MDs)
+ if (V.first != LLVMContext::MD_dbg)
+ return true;
+ return false;
+}
+
+static void copyDebugLocMetadata(const GlobalVariable *From,
+ GlobalVariable *To) {
+ SmallVector<DIGlobalVariableExpression *, 1> MDs;
+ From->getDebugInfo(MDs);
+ for (auto MD : MDs)
+ To->addDebugInfo(MD);
+}
+
+static Align getAlign(GlobalVariable *GV) {
+ return GV->getAlign().getValueOr(
+ GV->getParent()->getDataLayout().getPreferredAlign(GV));
+}
+
+static bool
+isUnmergeableGlobal(GlobalVariable *GV,
+ const SmallPtrSetImpl<const GlobalValue *> &UsedGlobals) {
+ // Only process constants with initializers in the default address space.
+ return !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+ GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
// Don't touch thread-local variables.
GV->isThreadLocal() ||
- // Don't touch values marked with attribute(used).
- UsedGlobals.count(GV);
-}
-
-enum class CanMerge { No, Yes };
-static CanMerge makeMergeable(GlobalVariable *Old, GlobalVariable *New) {
- if (!Old->hasGlobalUnnamedAddr() && !New->hasGlobalUnnamedAddr())
- return CanMerge::No;
- if (hasMetadataOtherThanDebugLoc(Old))
- return CanMerge::No;
- assert(!hasMetadataOtherThanDebugLoc(New));
- if (!Old->hasGlobalUnnamedAddr())
- New->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
- return CanMerge::Yes;
-}
-
-static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) {
- Constant *NewConstant = New;
-
- LLVM_DEBUG(dbgs() << "Replacing global: @" << Old->getName() << " -> @"
- << New->getName() << "\n");
-
- // Bump the alignment if necessary.
- if (Old->getAlign() || New->getAlign())
- New->setAlignment(std::max(getAlign(Old), getAlign(New)));
-
- copyDebugLocMetadata(Old, New);
- Old->replaceAllUsesWith(NewConstant);
-
- // Delete the global value from the module.
- assert(Old->hasLocalLinkage() &&
- "Refusing to delete an externally visible global variable.");
- Old->eraseFromParent();
-}
-
-static bool mergeConstants(Module &M) {
- // Find all the globals that are marked "used". These cannot be merged.
- SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
- FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
- FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
-
- // Map unique constants to globals.
- DenseMap<Constant *, GlobalVariable *> CMap;
-
- SmallVector<std::pair<GlobalVariable *, GlobalVariable *>, 32>
- SameContentReplacements;
-
- size_t ChangesMade = 0;
- size_t OldChangesMade = 0;
-
- // Iterate constant merging while we are still making progress. Merging two
- // constants together may allow us to merge other constants together if the
- // second level constants have initializers which point to the globals that
- // were just merged.
- while (true) {
- // Find the canonical constants others will be merged with.
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
-
- // If this GV is dead, remove it.
- GV->removeDeadConstantUsers();
- if (GV->use_empty() && GV->hasLocalLinkage()) {
- GV->eraseFromParent();
- ++ChangesMade;
- continue;
- }
-
- if (isUnmergeableGlobal(GV, UsedGlobals))
- continue;
-
- // This transformation is legal for weak ODR globals in the sense it
- // doesn't change semantics, but we really don't want to perform it
- // anyway; it's likely to pessimize code generation, and some tools
- // (like the Darwin linker in cases involving CFString) don't expect it.
- if (GV->isWeakForLinker())
- continue;
-
- // Don't touch globals with metadata other than !dbg.
- if (hasMetadataOtherThanDebugLoc(GV))
- continue;
-
- Constant *Init = GV->getInitializer();
-
- // Check to see if the initializer is already known.
- GlobalVariable *&Slot = CMap[Init];
-
- // If this is the first constant we find or if the old one is local,
- // replace with the current one. If the current is externally visible
- // it cannot be replaced, but it can be the canonical constant we merge with.
- bool FirstConstantFound = !Slot;
- if (FirstConstantFound || IsBetterCanonical(*GV, *Slot)) {
- Slot = GV;
- LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV->getName()
- << (FirstConstantFound ? "\n" : " (updated)\n"));
- }
- }
-
- // Identify all globals that can be merged together, filling in the
- // SameContentReplacements vector. We cannot do the replacement in this pass
- // because doing so may cause initializers of other globals to be rewritten,
- // invalidating the Constant* pointers in CMap.
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
-
- if (isUnmergeableGlobal(GV, UsedGlobals))
- continue;
-
- // We can only replace constants with local linkage.
- if (!GV->hasLocalLinkage())
- continue;
-
- Constant *Init = GV->getInitializer();
-
- // Check to see if the initializer is already known.
- auto Found = CMap.find(Init);
- if (Found == CMap.end())
- continue;
-
- GlobalVariable *Slot = Found->second;
- if (Slot == GV)
- continue;
-
- if (makeMergeable(GV, Slot) == CanMerge::No)
- continue;
-
- // Make all uses of the duplicate constant use the canonical version.
- LLVM_DEBUG(dbgs() << "Will replace: @" << GV->getName() << " -> @"
- << Slot->getName() << "\n");
- SameContentReplacements.push_back(std::make_pair(GV, Slot));
- }
-
- // Now that we have figured out which replacements must be made, do them all
- // now. This avoids invalidating the pointers in CMap, which are unneeded
- // now.
- for (unsigned i = 0, e = SameContentReplacements.size(); i != e; ++i) {
- GlobalVariable *Old = SameContentReplacements[i].first;
- GlobalVariable *New = SameContentReplacements[i].second;
- replace(M, Old, New);
- ++ChangesMade;
- ++NumIdenticalMerged;
- }
-
- if (ChangesMade == OldChangesMade)
- break;
- OldChangesMade = ChangesMade;
-
- SameContentReplacements.clear();
- CMap.clear();
- }
-
- return ChangesMade;
-}
-
-PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) {
- if (!mergeConstants(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-struct ConstantMergeLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- ConstantMergeLegacyPass() : ModulePass(ID) {
- initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // For this pass, process all of the globals in the module, eliminating
- // duplicate constants.
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- return mergeConstants(M);
- }
-};
-
-} // end anonymous namespace
-
-char ConstantMergeLegacyPass::ID = 0;
-
-INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge",
- "Merge Duplicate Global Constants", false, false)
-
-ModulePass *llvm::createConstantMergePass() {
- return new ConstantMergeLegacyPass();
-}
+ // Don't touch values marked with attribute(used).
+ UsedGlobals.count(GV);
+}
+
+enum class CanMerge { No, Yes };
+static CanMerge makeMergeable(GlobalVariable *Old, GlobalVariable *New) {
+ if (!Old->hasGlobalUnnamedAddr() && !New->hasGlobalUnnamedAddr())
+ return CanMerge::No;
+ if (hasMetadataOtherThanDebugLoc(Old))
+ return CanMerge::No;
+ assert(!hasMetadataOtherThanDebugLoc(New));
+ if (!Old->hasGlobalUnnamedAddr())
+ New->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+ return CanMerge::Yes;
+}
+
+static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) {
+ Constant *NewConstant = New;
+
+ LLVM_DEBUG(dbgs() << "Replacing global: @" << Old->getName() << " -> @"
+ << New->getName() << "\n");
+
+ // Bump the alignment if necessary.
+ if (Old->getAlign() || New->getAlign())
+ New->setAlignment(std::max(getAlign(Old), getAlign(New)));
+
+ copyDebugLocMetadata(Old, New);
+ Old->replaceAllUsesWith(NewConstant);
+
+ // Delete the global value from the module.
+ assert(Old->hasLocalLinkage() &&
+ "Refusing to delete an externally visible global variable.");
+ Old->eraseFromParent();
+}
+
+static bool mergeConstants(Module &M) {
+ // Find all the globals that are marked "used". These cannot be merged.
+ SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
+ FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
+ FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
+
+ // Map unique constants to globals.
+ DenseMap<Constant *, GlobalVariable *> CMap;
+
+ SmallVector<std::pair<GlobalVariable *, GlobalVariable *>, 32>
+ SameContentReplacements;
+
+ size_t ChangesMade = 0;
+ size_t OldChangesMade = 0;
+
+ // Iterate constant merging while we are still making progress. Merging two
+ // constants together may allow us to merge other constants together if the
+ // second level constants have initializers which point to the globals that
+ // were just merged.
+ while (true) {
+ // Find the canonical constants others will be merged with.
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+
+ // If this GV is dead, remove it.
+ GV->removeDeadConstantUsers();
+ if (GV->use_empty() && GV->hasLocalLinkage()) {
+ GV->eraseFromParent();
+ ++ChangesMade;
+ continue;
+ }
+
+ if (isUnmergeableGlobal(GV, UsedGlobals))
+ continue;
+
+ // This transformation is legal for weak ODR globals in the sense it
+ // doesn't change semantics, but we really don't want to perform it
+ // anyway; it's likely to pessimize code generation, and some tools
+ // (like the Darwin linker in cases involving CFString) don't expect it.
+ if (GV->isWeakForLinker())
+ continue;
+
+ // Don't touch globals with metadata other than !dbg.
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
+ Constant *Init = GV->getInitializer();
+
+ // Check to see if the initializer is already known.
+ GlobalVariable *&Slot = CMap[Init];
+
+ // If this is the first constant we find or if the old one is local,
+ // replace with the current one. If the current is externally visible
+ // it cannot be replaced, but it can be the canonical constant we merge with.
+ bool FirstConstantFound = !Slot;
+ if (FirstConstantFound || IsBetterCanonical(*GV, *Slot)) {
+ Slot = GV;
+ LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV->getName()
+ << (FirstConstantFound ? "\n" : " (updated)\n"));
+ }
+ }
+
+ // Identify all globals that can be merged together, filling in the
+ // SameContentReplacements vector. We cannot do the replacement in this pass
+ // because doing so may cause initializers of other globals to be rewritten,
+ // invalidating the Constant* pointers in CMap.
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+
+ if (isUnmergeableGlobal(GV, UsedGlobals))
+ continue;
+
+ // We can only replace constants with local linkage.
+ if (!GV->hasLocalLinkage())
+ continue;
+
+ Constant *Init = GV->getInitializer();
+
+ // Check to see if the initializer is already known.
+ auto Found = CMap.find(Init);
+ if (Found == CMap.end())
+ continue;
+
+ GlobalVariable *Slot = Found->second;
+ if (Slot == GV)
+ continue;
+
+ if (makeMergeable(GV, Slot) == CanMerge::No)
+ continue;
+
+ // Make all uses of the duplicate constant use the canonical version.
+ LLVM_DEBUG(dbgs() << "Will replace: @" << GV->getName() << " -> @"
+ << Slot->getName() << "\n");
+ SameContentReplacements.push_back(std::make_pair(GV, Slot));
+ }
+
+ // Now that we have figured out which replacements must be made, do them all
+ // now. This avoids invalidating the pointers in CMap, which are unneeded
+ // now.
+ for (unsigned i = 0, e = SameContentReplacements.size(); i != e; ++i) {
+ GlobalVariable *Old = SameContentReplacements[i].first;
+ GlobalVariable *New = SameContentReplacements[i].second;
+ replace(M, Old, New);
+ ++ChangesMade;
+ ++NumIdenticalMerged;
+ }
+
+ if (ChangesMade == OldChangesMade)
+ break;
+ OldChangesMade = ChangesMade;
+
+ SameContentReplacements.clear();
+ CMap.clear();
+ }
+
+ return ChangesMade;
+}
+
+PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) {
+ if (!mergeConstants(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+struct ConstantMergeLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ ConstantMergeLegacyPass() : ModulePass(ID) {
+ initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // For this pass, process all of the globals in the module, eliminating
+ // duplicate constants.
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ return mergeConstants(M);
+ }
+};
+
+} // end anonymous namespace
+
+char ConstantMergeLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge",
+ "Merge Duplicate Global Constants", false, false)
+
+ModulePass *llvm::createConstantMergePass() {
+ return new ConstantMergeLegacyPass();
+}
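
A quick way to exercise the pass outside the legacy pass manager is to schedule ConstantMergePass directly; this is a minimal sketch, assuming a Module M that already contains, say, two identical private unnamed_addr string constants the pass can collapse into one.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/ConstantMerge.h"
using namespace llvm;

static void runConstantMerge(Module &M) {
  PassBuilder PB;
  ModuleAnalysisManager MAM;
  // Registers PassInstrumentationAnalysis and friends so MPM.run() can query them.
  PB.registerModuleAnalyses(MAM);
  ModulePassManager MPM;
  MPM.addPass(ConstantMergePass());
  MPM.run(M, MAM);
}

Only constants with local linkage are ever deleted; an externally visible duplicate can at most serve as the canonical copy that the local duplicates are folded into.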
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp
index 88b9cc5fe4..2fe9a59ad2 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -1,175 +1,175 @@
-//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass exports all llvm.bitsets found in the module in the form of a
-// __cfi_check function, which can be used to verify cross-DSO call targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/CrossDSOCFI.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "cross-dso-cfi"
-
-STATISTIC(NumTypeIds, "Number of unique type identifiers");
-
-namespace {
-
-struct CrossDSOCFI : public ModulePass {
- static char ID;
- CrossDSOCFI() : ModulePass(ID) {
- initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry());
- }
-
- MDNode *VeryLikelyWeights;
-
- ConstantInt *extractNumericTypeId(MDNode *MD);
- void buildCFICheck(Module &M);
- bool runOnModule(Module &M) override;
-};
-
-} // anonymous namespace
-
-INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false,
- false)
-INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false)
-char CrossDSOCFI::ID = 0;
-
-ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; }
-
-/// Extracts a numeric type identifier from an MDNode containing type metadata.
-ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) {
- // This check excludes vtables for classes inside anonymous namespaces.
- auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(1));
- if (!TM)
- return nullptr;
- auto C = dyn_cast_or_null<ConstantInt>(TM->getValue());
- if (!C) return nullptr;
- // We are looking for i64 constants.
- if (C->getBitWidth() != 64) return nullptr;
-
- return C;
-}
-
-/// buildCFICheck - emits __cfi_check for the current module.
-void CrossDSOCFI::buildCFICheck(Module &M) {
- // FIXME: verify that __cfi_check ends up near the end of the code section,
- // but before the jump slots created in LowerTypeTests.
- SetVector<uint64_t> TypeIds;
- SmallVector<MDNode *, 2> Types;
- for (GlobalObject &GO : M.global_objects()) {
- Types.clear();
- GO.getMetadata(LLVMContext::MD_type, Types);
- for (MDNode *Type : Types)
- if (ConstantInt *TypeId = extractNumericTypeId(Type))
- TypeIds.insert(TypeId->getZExtValue());
- }
-
- NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
- if (CfiFunctionsMD) {
- for (auto Func : CfiFunctionsMD->operands()) {
- assert(Func->getNumOperands() >= 2);
- for (unsigned I = 2; I < Func->getNumOperands(); ++I)
- if (ConstantInt *TypeId =
- extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
- TypeIds.insert(TypeId->getZExtValue());
- }
- }
-
- LLVMContext &Ctx = M.getContext();
- FunctionCallee C = M.getOrInsertFunction(
- "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
- Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
- Function *F = cast<Function>(C.getCallee());
- // Take over the existing function. The frontend emits a weak stub so that the
- // linker knows about the symbol; this pass replaces the function body.
- F->deleteBody();
- F->setAlignment(Align(4096));
-
- Triple T(M.getTargetTriple());
- if (T.isARM() || T.isThumb())
- F->addFnAttr("target-features", "+thumb-mode");
-
- auto args = F->arg_begin();
- Value &CallSiteTypeId = *(args++);
- CallSiteTypeId.setName("CallSiteTypeId");
- Value &Addr = *(args++);
- Addr.setName("Addr");
- Value &CFICheckFailData = *(args++);
- CFICheckFailData.setName("CFICheckFailData");
- assert(args == F->arg_end());
-
- BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
- BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
-
- BasicBlock *TrapBB = BasicBlock::Create(Ctx, "fail", F);
- IRBuilder<> IRBFail(TrapBB);
- FunctionCallee CFICheckFailFn =
- M.getOrInsertFunction("__cfi_check_fail", Type::getVoidTy(Ctx),
- Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
- IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
- IRBFail.CreateBr(ExitBB);
-
- IRBuilder<> IRBExit(ExitBB);
- IRBExit.CreateRetVoid();
-
- IRBuilder<> IRB(BB);
- SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, TypeIds.size());
- for (uint64_t TypeId : TypeIds) {
- ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
- BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
- IRBuilder<> IRBTest(TestBB);
- Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
-
- Value *Test = IRBTest.CreateCall(
- BitsetTestFn, {&Addr, MetadataAsValue::get(
- Ctx, ConstantAsMetadata::get(CaseTypeId))});
- BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
- BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
-
- SI->addCase(CaseTypeId, TestBB);
- ++NumTypeIds;
- }
-}
-
-bool CrossDSOCFI::runOnModule(Module &M) {
- VeryLikelyWeights =
- MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
- if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
- return false;
- buildCFICheck(M);
- return true;
-}
-
-PreservedAnalyses CrossDSOCFIPass::run(Module &M, ModuleAnalysisManager &AM) {
- CrossDSOCFI Impl;
- bool Changed = Impl.runOnModule(M);
- if (!Changed)
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass exports all llvm.bitsets found in the module in the form of a
+// __cfi_check function, which can be used to verify cross-DSO call targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/CrossDSOCFI.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "cross-dso-cfi"
+
+STATISTIC(NumTypeIds, "Number of unique type identifiers");
+
+namespace {
+
+struct CrossDSOCFI : public ModulePass {
+ static char ID;
+ CrossDSOCFI() : ModulePass(ID) {
+ initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry());
+ }
+
+ MDNode *VeryLikelyWeights;
+
+ ConstantInt *extractNumericTypeId(MDNode *MD);
+ void buildCFICheck(Module &M);
+ bool runOnModule(Module &M) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false,
+ false)
+INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false)
+char CrossDSOCFI::ID = 0;
+
+ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; }
+
+/// Extracts a numeric type identifier from an MDNode containing type metadata.
+ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) {
+ // This check excludes vtables for classes inside anonymous namespaces.
+ auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(1));
+ if (!TM)
+ return nullptr;
+ auto C = dyn_cast_or_null<ConstantInt>(TM->getValue());
+ if (!C) return nullptr;
+ // We are looking for i64 constants.
+ if (C->getBitWidth() != 64) return nullptr;
+
+ return C;
+}
+
+/// buildCFICheck - emits __cfi_check for the current module.
+void CrossDSOCFI::buildCFICheck(Module &M) {
+ // FIXME: verify that __cfi_check ends up near the end of the code section,
+ // but before the jump slots created in LowerTypeTests.
+ SetVector<uint64_t> TypeIds;
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalObject &GO : M.global_objects()) {
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types)
+ if (ConstantInt *TypeId = extractNumericTypeId(Type))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto Func : CfiFunctionsMD->operands()) {
+ assert(Func->getNumOperands() >= 2);
+ for (unsigned I = 2; I < Func->getNumOperands(); ++I)
+ if (ConstantInt *TypeId =
+ extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+ }
+
+ LLVMContext &Ctx = M.getContext();
+ FunctionCallee C = M.getOrInsertFunction(
+ "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
+ Function *F = cast<Function>(C.getCallee());
+ // Take over the existing function. The frontend emits a weak stub so that the
+ // linker knows about the symbol; this pass replaces the function body.
+ F->deleteBody();
+ F->setAlignment(Align(4096));
+
+ Triple T(M.getTargetTriple());
+ if (T.isARM() || T.isThumb())
+ F->addFnAttr("target-features", "+thumb-mode");
+
+ auto args = F->arg_begin();
+ Value &CallSiteTypeId = *(args++);
+ CallSiteTypeId.setName("CallSiteTypeId");
+ Value &Addr = *(args++);
+ Addr.setName("Addr");
+ Value &CFICheckFailData = *(args++);
+ CFICheckFailData.setName("CFICheckFailData");
+ assert(args == F->arg_end());
+
+ BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+ BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
+
+ BasicBlock *TrapBB = BasicBlock::Create(Ctx, "fail", F);
+ IRBuilder<> IRBFail(TrapBB);
+ FunctionCallee CFICheckFailFn =
+ M.getOrInsertFunction("__cfi_check_fail", Type::getVoidTy(Ctx),
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
+ IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
+ IRBFail.CreateBr(ExitBB);
+
+ IRBuilder<> IRBExit(ExitBB);
+ IRBExit.CreateRetVoid();
+
+ IRBuilder<> IRB(BB);
+ SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, TypeIds.size());
+ for (uint64_t TypeId : TypeIds) {
+ ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
+ BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
+ IRBuilder<> IRBTest(TestBB);
+ Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
+
+ Value *Test = IRBTest.CreateCall(
+ BitsetTestFn, {&Addr, MetadataAsValue::get(
+ Ctx, ConstantAsMetadata::get(CaseTypeId))});
+ BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
+ BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
+
+ SI->addCase(CaseTypeId, TestBB);
+ ++NumTypeIds;
+ }
+}
+
+bool CrossDSOCFI::runOnModule(Module &M) {
+ VeryLikelyWeights =
+ MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
+ if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
+ return false;
+ buildCFICheck(M);
+ return true;
+}
+
+PreservedAnalyses CrossDSOCFIPass::run(Module &M, ModuleAnalysisManager &AM) {
+ CrossDSOCFI Impl;
+ bool Changed = Impl.runOnModule(M);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
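
For reference, the pass is a no-op unless the module carries the "Cross-DSO CFI" module flag checked in runOnModule above. A minimal sketch of opting a module in from C++ (normally the frontend does this, typically clang when cross-DSO CFI sanitization is enabled):

#include "llvm/IR/Module.h"
using namespace llvm;

static void enableCrossDsoCfi(Module &M) {
  // buildCFICheck() only runs when this flag is present; Override behavior is
  // assumed here to mirror what the frontend emits for this flag.
  M.addModuleFlag(Module::Override, "Cross-DSO CFI", 1);
}

With the flag set, running the pass emits a 4096-byte-aligned __cfi_check that switches on the incoming type id and branches to __cfi_check_fail for addresses that fail the llvm.type.test.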
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp
index bfb1a83473..0b763e423f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1,1124 +1,1124 @@
-//===- DeadArgumentElimination.cpp - Eliminate dead arguments -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass deletes dead arguments from internal functions. Dead argument
-// elimination removes arguments which are directly dead, as well as arguments
-// only passed into function calls as dead arguments of other functions. This
-// pass also deletes dead return values in a similar way.
-//
-// This pass is often useful as a cleanup pass to run after aggressive
-// interprocedural passes, which add possibly-dead arguments or return values.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "deadargelim"
-
-STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
-STATISTIC(NumRetValsEliminated , "Number of unused return values removed");
-STATISTIC(NumArgumentsReplacedWithUndef,
- "Number of unread args replaced with undef");
-
-namespace {
-
- /// DAE - The dead argument elimination pass.
- class DAE : public ModulePass {
- protected:
- // DAH uses this to specify a different ID.
- explicit DAE(char &ID) : ModulePass(ID) {}
-
- public:
- static char ID; // Pass identification, replacement for typeid
-
- DAE() : ModulePass(ID) {
- initializeDAEPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- DeadArgumentEliminationPass DAEP(ShouldHackArguments());
- ModuleAnalysisManager DummyMAM;
- PreservedAnalyses PA = DAEP.run(M, DummyMAM);
- return !PA.areAllPreserved();
- }
-
- virtual bool ShouldHackArguments() const { return false; }
- };
-
-} // end anonymous namespace
-
-char DAE::ID = 0;
-
-INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false)
-
-namespace {
-
- /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
- /// deletes arguments to functions which are external. This is only for use
- /// by bugpoint.
- struct DAH : public DAE {
- static char ID;
-
- DAH() : DAE(ID) {}
-
- bool ShouldHackArguments() const override { return true; }
- };
-
-} // end anonymous namespace
-
-char DAH::ID = 0;
-
-INITIALIZE_PASS(DAH, "deadarghaX0r",
- "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)",
- false, false)
-
-/// createDeadArgEliminationPass - This pass removes arguments from functions
-/// which are not used by the body of the function.
-ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
-
-ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
-
-/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
-/// llvm.vastart is never called, the varargs list is dead for the function.
-bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
- assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
- if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
-
- // Ensure that the function is only directly called.
- if (Fn.hasAddressTaken())
- return false;
-
- // Don't touch naked functions. The assembly might be using an argument, or
- // otherwise rely on the frame layout in a way that this analysis will not
- // see.
- if (Fn.hasFnAttribute(Attribute::Naked)) {
- return false;
- }
-
- // Okay, we know we can transform this function if safe. Scan its body
- // looking for calls marked musttail or calls to llvm.vastart.
- for (BasicBlock &BB : Fn) {
- for (Instruction &I : BB) {
- CallInst *CI = dyn_cast<CallInst>(&I);
- if (!CI)
- continue;
- if (CI->isMustTailCall())
- return false;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- if (II->getIntrinsicID() == Intrinsic::vastart)
- return false;
- }
- }
- }
-
- // If we get here, there are no calls to llvm.vastart in the function body;
- // remove the "..." and adjust all the calls.
-
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but doesn't have isVarArg set.
- FunctionType *FTy = Fn.getFunctionType();
-
- std::vector<Type *> Params(FTy->param_begin(), FTy->param_end());
- FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
- Params, false);
- unsigned NumArgs = Params.size();
-
- // Create the new function body and insert it into the module...
- Function *NF = Function::Create(NFTy, Fn.getLinkage(), Fn.getAddressSpace());
- NF->copyAttributesFrom(&Fn);
- NF->setComdat(Fn.getComdat());
- Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
- NF->takeName(&Fn);
-
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in a smaller number of arguments into the new function.
- //
- std::vector<Value *> Args;
- for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
- CallBase *CB = dyn_cast<CallBase>(*I++);
- if (!CB)
- continue;
-
- // Pass all the same arguments.
- Args.assign(CB->arg_begin(), CB->arg_begin() + NumArgs);
-
- // Drop any attributes that were on the vararg arguments.
- AttributeList PAL = CB->getAttributes();
- if (!PAL.isEmpty()) {
- SmallVector<AttributeSet, 8> ArgAttrs;
- for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
- ArgAttrs.push_back(PAL.getParamAttributes(ArgNo));
- PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(),
- PAL.getRetAttributes(), ArgAttrs);
- }
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB->getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCB = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
- NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", CB);
- } else {
- NewCB = CallInst::Create(NF, Args, OpBundles, "", CB);
- cast<CallInst>(NewCB)->setTailCallKind(
- cast<CallInst>(CB)->getTailCallKind());
- }
- NewCB->setCallingConv(CB->getCallingConv());
- NewCB->setAttributes(PAL);
- NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
-
- Args.clear();
-
- if (!CB->use_empty())
- CB->replaceAllUsesWith(NewCB);
-
- NewCB->takeName(CB);
-
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- CB->eraseFromParent();
- }
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
-
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments, transferring over the names as well. While we're at
- // it, remove the dead arguments from the DeadArguments list.
- for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
- I2 = NF->arg_begin(); I != E; ++I, ++I2) {
- // Move the name and users over to the new version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- }
-
- // Clone metadata from the old function, including debug info descriptor.
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- Fn.getAllMetadata(MDs);
- for (auto MD : MDs)
- NF->addMetadata(MD.first, *MD.second);
-
- // Fix up any BlockAddresses that refer to the function.
- Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
- // Delete the bitcast that we just created, so that NF does not
- // appear to be address-taken.
- NF->removeDeadConstantUsers();
- // Finally, nuke the old function.
- Fn.eraseFromParent();
- return true;
-}
-
-/// RemoveDeadArgumentsFromCallers - Checks if the given function has any
-/// arguments that are unused, and changes the caller parameters to be undefined
-/// instead.
-bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
- // We cannot change the arguments if this TU does not define the function or
- // if the linker may choose a function body from another TU, even if the
- // nominal linkage indicates that other copies of the function have the same
- // semantics. In the below example, the dead load from %p may not have been
- // eliminated from the linker-chosen copy of f, so replacing %p with undef
- // in callers may introduce undefined behavior.
- //
- // define linkonce_odr void @f(i32* %p) {
- // %v = load i32 %p
- // ret void
- // }
- if (!Fn.hasExactDefinition())
- return false;
-
- // Functions with local linkage should already have been handled, except the
- // fragile (variadic) ones which we can improve here.
- if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
- return false;
-
- // Don't touch naked functions. The assembly might be using an argument, or
- // otherwise rely on the frame layout in a way that this analysis will not
- // see.
- if (Fn.hasFnAttribute(Attribute::Naked))
- return false;
-
- if (Fn.use_empty())
- return false;
-
- SmallVector<unsigned, 8> UnusedArgs;
- bool Changed = false;
-
- for (Argument &Arg : Fn.args()) {
- if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
+//===- DeadArgumentElimination.cpp - Eliminate dead arguments -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass deletes dead arguments from internal functions. Dead argument
+// elimination removes arguments which are directly dead, as well as arguments
+// only passed into function calls as dead arguments of other functions. This
+// pass also deletes dead return values in a similar way.
+//
+// This pass is often useful as a cleanup pass to run after aggressive
+// interprocedural passes, which add possibly-dead arguments or return values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "deadargelim"
+
+STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
+STATISTIC(NumRetValsEliminated , "Number of unused return values removed");
+STATISTIC(NumArgumentsReplacedWithUndef,
+ "Number of unread args replaced with undef");
+
+namespace {
+
+ /// DAE - The dead argument elimination pass.
+ class DAE : public ModulePass {
+ protected:
+ // DAH uses this to specify a different ID.
+ explicit DAE(char &ID) : ModulePass(ID) {}
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ DAE() : ModulePass(ID) {
+ initializeDAEPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ DeadArgumentEliminationPass DAEP(ShouldHackArguments());
+ ModuleAnalysisManager DummyMAM;
+ PreservedAnalyses PA = DAEP.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
+
+ virtual bool ShouldHackArguments() const { return false; }
+ };
+
+} // end anonymous namespace
+
+char DAE::ID = 0;
+
+INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false)
+
+namespace {
+
+ /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
+ /// deletes arguments to functions which are external. This is only for use
+ /// by bugpoint.
+ struct DAH : public DAE {
+ static char ID;
+
+ DAH() : DAE(ID) {}
+
+ bool ShouldHackArguments() const override { return true; }
+ };
+
+} // end anonymous namespace
+
+char DAH::ID = 0;
+
+INITIALIZE_PASS(DAH, "deadarghaX0r",
+ "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)",
+ false, false)
+
+/// createDeadArgEliminationPass - This pass removes arguments from functions
+/// which are not used by the body of the function.
+ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
+
+ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
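+
+// Minimal usage sketch (illustrative, not part of this file): the legacy pass
+// can be scheduled through a legacy::PassManager, e.g.
+//
+//   legacy::PassManager PM;
+//   PM.add(createDeadArgEliminationPass());
+//   PM.run(M);  // M is an llvm::Module
+//
+// while the new pass manager runs DeadArgumentEliminationPass directly from a
+// ModulePassManager.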
+
+/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
+/// llvm.vastart is never called, the varargs list is dead for the function.
+bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
+ assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
+ if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
+
+ // Ensure that the function is only directly called.
+ if (Fn.hasAddressTaken())
+ return false;
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (Fn.hasFnAttribute(Attribute::Naked)) {
+ return false;
+ }
+
+ // Okay, we know we can transform this function if safe. Scan its body
+ // looking for calls marked musttail or calls to llvm.vastart.
+ for (BasicBlock &BB : Fn) {
+ for (Instruction &I : BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+ if (CI->isMustTailCall())
+ return false;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (II->getIntrinsicID() == Intrinsic::vastart)
+ return false;
+ }
+ }
+ }
+
+ // If we get here, there are no calls to llvm.vastart in the function body;
+ // remove the "..." and adjust all the calls.
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but doesn't have isVarArg set.
+ FunctionType *FTy = Fn.getFunctionType();
+
+ std::vector<Type *> Params(FTy->param_begin(), FTy->param_end());
+ FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
+ Params, false);
+ unsigned NumArgs = Params.size();
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, Fn.getLinkage(), Fn.getAddressSpace());
+ NF->copyAttributesFrom(&Fn);
+ NF->setComdat(Fn.getComdat());
+ Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
+ NF->takeName(&Fn);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in a smaller number of arguments into the new function.
+ //
+ std::vector<Value *> Args;
+ for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
+ CallBase *CB = dyn_cast<CallBase>(*I++);
+ if (!CB)
+ continue;
+
+ // Pass all the same arguments.
+ Args.assign(CB->arg_begin(), CB->arg_begin() + NumArgs);
+
+ // Drop any attributes that were on the vararg arguments.
+ AttributeList PAL = CB->getAttributes();
+ if (!PAL.isEmpty()) {
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
+ ArgAttrs.push_back(PAL.getParamAttributes(ArgNo));
+ PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrs);
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB->getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCB = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
+ NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", CB);
+ } else {
+ NewCB = CallInst::Create(NF, Args, OpBundles, "", CB);
+ cast<CallInst>(NewCB)->setTailCallKind(
+ cast<CallInst>(CB)->getTailCallKind());
+ }
+ NewCB->setCallingConv(CB->getCallingConv());
+ NewCB->setAttributes(PAL);
+ NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+
+ Args.clear();
+
+ if (!CB->use_empty())
+ CB->replaceAllUsesWith(NewCB);
+
+ NewCB->takeName(CB);
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ CB->eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over to
+ // the new arguments and transferring the names over as well. While we're at
+ // it, remove the dead arguments from the DeadArguments list.
+ for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
+ I2 = NF->arg_begin(); I != E; ++I, ++I2) {
+ // Move the name and users over to the new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ }
+
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ Fn.getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
+
+ // Fix up any BlockAddresses that refer to the function.
+ Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
+ // Delete the bitcast that we just created, so that NF does not
+ // appear to be address-taken.
+ NF->removeDeadConstantUsers();
+ // Finally, nuke the old function.
+ Fn.eraseFromParent();
+ return true;
+}
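+
+// Illustrative sketch (an assumed example): an internal varargs function that
+// never calls llvm.va_start, such as
+//
+//   define internal void @log(i8* %fmt, ...) { ret void }
+//   call void (i8*, ...) @log(i8* %p, i32 7)
+//
+// is rewritten by DeleteDeadVarargs to drop the "..." and the extra call
+// operands:
+//
+//   define internal void @log(i8* %fmt) { ret void }
+//   call void @log(i8* %p)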
+
+/// RemoveDeadArgumentsFromCallers - Checks if the given function has any
+/// arguments that are unused, and replaces the corresponding call-site
+/// arguments with undef instead.
+bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
+ // We cannot change the arguments if this TU does not define the function or
+ // if the linker may choose a function body from another TU, even if the
+ // nominal linkage indicates that other copies of the function have the same
+ // semantics. In the below example, the dead load from %p may not have been
+ // eliminated from the linker-chosen copy of f, so replacing %p with undef
+ // in callers may introduce undefined behavior.
+ //
+ // define linkonce_odr void @f(i32* %p) {
+ // %v = load i32 %p
+ // ret void
+ // }
+ if (!Fn.hasExactDefinition())
+ return false;
+
+ // Functions with local linkage should already have been handled, except the
+ // fragile (variadic) ones which we can improve here.
+ if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
+ return false;
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (Fn.hasFnAttribute(Attribute::Naked))
+ return false;
+
+ if (Fn.use_empty())
+ return false;
+
+ SmallVector<unsigned, 8> UnusedArgs;
+ bool Changed = false;
+
+ for (Argument &Arg : Fn.args()) {
+ if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
!Arg.hasPassPointeeByValueCopyAttr()) {
- if (Arg.isUsedByMetadata()) {
- Arg.replaceAllUsesWith(UndefValue::get(Arg.getType()));
- Changed = true;
- }
- UnusedArgs.push_back(Arg.getArgNo());
- }
- }
-
- if (UnusedArgs.empty())
- return false;
-
- for (Use &U : Fn.uses()) {
- CallBase *CB = dyn_cast<CallBase>(U.getUser());
- if (!CB || !CB->isCallee(&U))
- continue;
-
- // Now go through all unused args and replace them with "undef".
- for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) {
- unsigned ArgNo = UnusedArgs[I];
-
- Value *Arg = CB->getArgOperand(ArgNo);
- CB->setArgOperand(ArgNo, UndefValue::get(Arg->getType()));
- ++NumArgumentsReplacedWithUndef;
- Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// Convenience function that returns the number of return values. It returns 0
-/// for void functions and 1 for functions not returning a struct or array. It
-/// returns the number of elements for functions returning a struct or array.
-static unsigned NumRetVals(const Function *F) {
- Type *RetTy = F->getReturnType();
- if (RetTy->isVoidTy())
- return 0;
- else if (StructType *STy = dyn_cast<StructType>(RetTy))
- return STy->getNumElements();
- else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
- return ATy->getNumElements();
- else
- return 1;
-}
-
-/// Returns the sub-type a function will return at a given Idx. Should
-/// correspond to the result type of an ExtractValue instruction executed with
-/// just that one Idx (i.e. only top-level structure is considered).
-static Type *getRetComponentType(const Function *F, unsigned Idx) {
- Type *RetTy = F->getReturnType();
- assert(!RetTy->isVoidTy() && "void type has no subtype");
-
- if (StructType *STy = dyn_cast<StructType>(RetTy))
- return STy->getElementType(Idx);
- else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
- return ATy->getElementType();
- else
- return RetTy;
-}
-
-/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
-/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
-/// liveness of Use.
-DeadArgumentEliminationPass::Liveness
-DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use,
- UseVector &MaybeLiveUses) {
- // We're live if our use or its Function is already marked as live.
- if (IsLive(Use))
- return Live;
-
- // We're maybe live otherwise, but remember that we must become live if
- // Use becomes live.
- MaybeLiveUses.push_back(Use);
- return MaybeLive;
-}
-
-/// SurveyUse - This looks at a single use of an argument or return value
-/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
-/// if it causes the used value to become MaybeLive.
-///
-/// RetValNum is the return value number to use when this use is used in a
-/// return instruction. This is used in the recursion; you should always leave
-/// it at 0.
-DeadArgumentEliminationPass::Liveness
-DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
- unsigned RetValNum) {
- const User *V = U->getUser();
- if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
- // The value is returned from a function. It's only live when the
- // function's return value is live. We use RetValNum here, for the case
- // that U is really a use of an insertvalue instruction that uses the
- // original Use.
- const Function *F = RI->getParent()->getParent();
- if (RetValNum != -1U) {
- RetOrArg Use = CreateRet(F, RetValNum);
- // We might be live, depending on the liveness of Use.
- return MarkIfNotLive(Use, MaybeLiveUses);
- } else {
- DeadArgumentEliminationPass::Liveness Result = MaybeLive;
- for (unsigned Ri = 0; Ri < NumRetVals(F); ++Ri) {
- RetOrArg Use = CreateRet(F, Ri);
- // We might be live, depending on the liveness of Use. If any
- // sub-value is live, then the entire value is considered live. This
- // is a conservative choice, and better tracking is possible.
- DeadArgumentEliminationPass::Liveness SubResult =
- MarkIfNotLive(Use, MaybeLiveUses);
- if (Result != Live)
- Result = SubResult;
- }
- return Result;
- }
- }
- if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
- if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex()
- && IV->hasIndices())
- // The use we are examining is inserted into an aggregate. Our liveness
- // depends on all uses of that aggregate, but if it is used as a return
- // value, only the index at which we were inserted counts.
- RetValNum = *IV->idx_begin();
-
- // Note that if we are used as the aggregate operand to the insertvalue,
- // we don't change RetValNum, but do survey all our uses.
-
- Liveness Result = MaybeLive;
- for (const Use &UU : IV->uses()) {
- Result = SurveyUse(&UU, MaybeLiveUses, RetValNum);
- if (Result == Live)
- break;
- }
- return Result;
- }
-
- if (const auto *CB = dyn_cast<CallBase>(V)) {
- const Function *F = CB->getCalledFunction();
- if (F) {
- // Used in a direct call.
-
- // The function argument is live if it is used as a bundle operand.
- if (CB->isBundleOperand(U))
- return Live;
-
- // Find the argument number. We know for sure that this use is an
- // argument, since if it were the called function this would be an
- // indirect call, and we know we can't be looking at a value of the
- // label type (for the invoke instruction).
- unsigned ArgNo = CB->getArgOperandNo(U);
-
- if (ArgNo >= F->getFunctionType()->getNumParams())
- // The value is passed in through a vararg! Must be live.
- return Live;
-
- assert(CB->getArgOperand(ArgNo) == CB->getOperand(U->getOperandNo()) &&
- "Argument is not where we expected it");
-
- // Value passed to a normal call. It's only live when the corresponding
- // argument to the called function turns out live.
- RetOrArg Use = CreateArg(F, ArgNo);
- return MarkIfNotLive(Use, MaybeLiveUses);
- }
- }
- // Used in any other way? Value must be live.
- return Live;
-}
-
-/// SurveyUses - This looks at all the uses of the given value and returns the
-/// Liveness deduced from those uses.
-///
-/// Adds all uses that cause the result to be MaybeLive to MaybeLiveUses. If
-/// the result is Live, MaybeLiveUses might be modified but its content should
-/// be ignored (since it might not be complete).
-DeadArgumentEliminationPass::Liveness
-DeadArgumentEliminationPass::SurveyUses(const Value *V,
- UseVector &MaybeLiveUses) {
- // Assume it's dead (which will only hold if there are no uses at all).
- Liveness Result = MaybeLive;
- // Check each use.
- for (const Use &U : V->uses()) {
- Result = SurveyUse(&U, MaybeLiveUses);
- if (Result == Live)
- break;
- }
- return Result;
-}
-
-// SurveyFunction - This performs the initial survey of the specified function,
-// checking whether it uses any of its incoming arguments or whether
-// any callers use the return value. This fills in the LiveValues set and Uses
-// map.
-//
-// We consider arguments of non-internal functions to be intrinsically alive as
-// well as arguments to functions which have their "address taken".
-void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
- // Functions with inalloca/preallocated parameters are expecting args in a
- // particular register and memory layout.
- if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
- F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
- MarkLive(F);
- return;
- }
-
- // Don't touch naked functions. The assembly might be using an argument, or
- // otherwise rely on the frame layout in a way that this analysis will not
- // see.
- if (F.hasFnAttribute(Attribute::Naked)) {
- MarkLive(F);
- return;
- }
-
- unsigned RetCount = NumRetVals(&F);
-
- // Assume all return values are dead
- using RetVals = SmallVector<Liveness, 5>;
-
- RetVals RetValLiveness(RetCount, MaybeLive);
-
- using RetUses = SmallVector<UseVector, 5>;
-
- // These vectors map each return value to the uses that make it MaybeLive, so
- // we can add those to the Uses map if the return value really turns out to be
- // MaybeLive. Initialized to a list of RetCount empty lists.
- RetUses MaybeLiveRetUses(RetCount);
-
- bool HasMustTailCalls = false;
-
- for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
- if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
- != F.getFunctionType()->getReturnType()) {
- // We don't support old style multiple return values.
- MarkLive(F);
- return;
- }
- }
-
- // If we have any returns of `musttail` results, the signature can't
- // change.
- if (BB->getTerminatingMustTailCall() != nullptr)
- HasMustTailCalls = true;
- }
-
- if (HasMustTailCalls) {
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
- << " has musttail calls\n");
- }
-
- if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
- MarkLive(F);
- return;
- }
-
- LLVM_DEBUG(
- dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
- << F.getName() << "\n");
- // Keep track of the number of live retvals, so we can skip checks once all
- // of them turn out to be live.
- unsigned NumLiveRetVals = 0;
-
- bool HasMustTailCallers = false;
-
- // Loop all uses of the function.
- for (const Use &U : F.uses()) {
- // If the function is PASSED IN as an argument, its address has been
- // taken.
- const auto *CB = dyn_cast<CallBase>(U.getUser());
- if (!CB || !CB->isCallee(&U)) {
- MarkLive(F);
- return;
- }
-
- // The number of arguments for a `musttail` call must match the number of
- // arguments of the caller.
- if (CB->isMustTailCall())
- HasMustTailCallers = true;
-
- // If we end up here, we are looking at a direct call to our function.
-
- // Now, check how our return value(s) is/are used in this caller. Don't
- // bother checking return values if all of them are live already.
- if (NumLiveRetVals == RetCount)
- continue;
-
- // Check all uses of the return value.
- for (const Use &U : CB->uses()) {
- if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) {
- // This use uses a part of our return value; survey the uses of
- // that part and store the results for this index only.
- unsigned Idx = *Ext->idx_begin();
- if (RetValLiveness[Idx] != Live) {
- RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
- if (RetValLiveness[Idx] == Live)
- NumLiveRetVals++;
- }
- } else {
- // Used by something other than extractvalue. Survey, but assume that the
- // result applies to all sub-values.
- UseVector MaybeLiveAggregateUses;
- if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) {
- NumLiveRetVals = RetCount;
- RetValLiveness.assign(RetCount, Live);
- break;
- } else {
- for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
- if (RetValLiveness[Ri] != Live)
- MaybeLiveRetUses[Ri].append(MaybeLiveAggregateUses.begin(),
- MaybeLiveAggregateUses.end());
- }
- }
- }
- }
- }
-
- if (HasMustTailCallers) {
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
- << " has musttail callers\n");
- }
-
- // Now we've inspected all callers, record the liveness of our return values.
- for (unsigned Ri = 0; Ri != RetCount; ++Ri)
- MarkValue(CreateRet(&F, Ri), RetValLiveness[Ri], MaybeLiveRetUses[Ri]);
-
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
- << F.getName() << "\n");
-
- // Now, check all of our arguments.
- unsigned ArgI = 0;
- UseVector MaybeLiveArgUses;
- for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end();
- AI != E; ++AI, ++ArgI) {
- Liveness Result;
- if (F.getFunctionType()->isVarArg() || HasMustTailCallers ||
- HasMustTailCalls) {
- // Variadic functions will already have a va_arg function expanded inside
- // them, making them potentially very sensitive to ABI changes resulting
- // from removing arguments entirely, so don't. For example, AArch64 handles
- // register and stack HFAs very differently, and this is reflected in the
- // IR which has already been generated.
- //
- // `musttail` calls to this function restrict argument removal attempts.
- // The signature of the caller must match the signature of the function.
- //
- // `musttail` calls in this function prevent us from changing its
- // signature.
- Result = Live;
- } else {
- // See what the effect of this use is (recording any uses that cause
- // MaybeLive in MaybeLiveArgUses).
- Result = SurveyUses(&*AI, MaybeLiveArgUses);
- }
-
- // Mark the result.
- MarkValue(CreateArg(&F, ArgI), Result, MaybeLiveArgUses);
- // Clear the vector again for the next iteration.
- MaybeLiveArgUses.clear();
- }
-}
-
-/// MarkValue - This function marks the liveness of RA depending on L. If L is
-/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
-/// such that RA will be marked live if any use in MaybeLiveUses gets marked
-/// live later on.
-void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
- const UseVector &MaybeLiveUses) {
- switch (L) {
- case Live:
- MarkLive(RA);
- break;
- case MaybeLive:
- assert(!IsLive(RA) && "Use is already live!");
- for (const auto &MaybeLiveUse : MaybeLiveUses) {
- if (IsLive(MaybeLiveUse)) {
- // A use is live, so this value is live.
- MarkLive(RA);
- break;
- } else {
- // Note any uses of this value, so this value can be
- // marked live whenever one of the uses becomes live.
- Uses.insert(std::make_pair(MaybeLiveUse, RA));
- }
- }
- break;
- }
-}
-
-/// MarkLive - Mark the given Function as alive, meaning that it cannot be
-/// changed in any way. Additionally,
-/// mark any values that are used as this function's parameters or by its return
-/// values (according to Uses) live as well.
-void DeadArgumentEliminationPass::MarkLive(const Function &F) {
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
- << F.getName() << "\n");
- // Mark the function as live.
- LiveFunctions.insert(&F);
- // Mark all arguments as live.
- for (unsigned ArgI = 0, E = F.arg_size(); ArgI != E; ++ArgI)
- PropagateLiveness(CreateArg(&F, ArgI));
- // Mark all return values as live.
- for (unsigned Ri = 0, E = NumRetVals(&F); Ri != E; ++Ri)
- PropagateLiveness(CreateRet(&F, Ri));
-}
-
-/// MarkLive - Mark the given return value or argument as live. Additionally,
-/// mark any values that are used by this value (according to Uses) live as
-/// well.
-void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
- if (IsLive(RA))
- return; // Already marked Live.
-
- LiveValues.insert(RA);
-
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
- << RA.getDescription() << " live\n");
- PropagateLiveness(RA);
-}
-
-bool DeadArgumentEliminationPass::IsLive(const RetOrArg &RA) {
- return LiveFunctions.count(RA.F) || LiveValues.count(RA);
-}
-
-/// PropagateLiveness - Given that RA is a live value, propagate its liveness
-/// to any other values it uses (according to Uses).
-void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) {
- // We don't use upper_bound (or equal_range) here, because our recursive call
- // to ourselves is likely to cause the upper_bound (which is the first value
- // not belonging to RA) to become erased and the iterator invalidated.
- UseMap::iterator Begin = Uses.lower_bound(RA);
- UseMap::iterator E = Uses.end();
- UseMap::iterator I;
- for (I = Begin; I != E && I->first == RA; ++I)
- MarkLive(I->second);
-
- // Erase RA from the Uses map (from the lower bound to wherever we ended up
- // after the loop).
- Uses.erase(Begin, I);
-}
-
-// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
-// that are not in LiveValues. Transform the function and all of the callers of
-// the function to not have these arguments and return values.
-//
-bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
- // Don't modify fully live functions
- if (LiveFunctions.count(F))
- return false;
-
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but has fewer arguments and a different return type.
- FunctionType *FTy = F->getFunctionType();
- std::vector<Type*> Params;
-
- // Keep track of whether we have a live 'returned' argument.
- bool HasLiveReturnedArg = false;
-
- // Set up to build a new list of parameter attributes.
- SmallVector<AttributeSet, 8> ArgAttrVec;
- const AttributeList &PAL = F->getAttributes();
-
- // Remember which arguments are still alive.
- SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
- // Construct the new parameter list from non-dead arguments. Also construct
- // a new set of parameter attributes to correspond. Skip the first parameter
- // attribute, since that belongs to the return value.
- unsigned ArgI = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++ArgI) {
- RetOrArg Arg = CreateArg(F, ArgI);
- if (LiveValues.erase(Arg)) {
- Params.push_back(I->getType());
- ArgAlive[ArgI] = true;
- ArgAttrVec.push_back(PAL.getParamAttributes(ArgI));
- HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned);
- } else {
- ++NumArgumentsEliminated;
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
- << ArgI << " (" << I->getName() << ") from "
- << F->getName() << "\n");
- }
- }
-
- // Find out the new return value.
- Type *RetTy = FTy->getReturnType();
- Type *NRetTy = nullptr;
- unsigned RetCount = NumRetVals(F);
-
- // -1 means unused, other numbers are the new index
- SmallVector<int, 5> NewRetIdxs(RetCount, -1);
- std::vector<Type*> RetTypes;
-
- // If there is a function with a live 'returned' argument but a dead return
- // value, then there are two possible actions:
- // 1) Eliminate the return value and take off the 'returned' attribute on the
- // argument.
- // 2) Retain the 'returned' attribute and treat the return value (but not the
- // entire function) as live so that it is not eliminated.
- //
- // It's not clear in the general case which option is more profitable because,
- // even in the absence of explicit uses of the return value, code generation
- // is free to use the 'returned' attribute to do things like eliding
- // save/restores of registers across calls. Whether or not this happens is
- // target and ABI-specific as well as depending on the amount of register
- // pressure, so there's no good way for an IR-level pass to figure this out.
- //
- // Fortunately, the only places where 'returned' is currently generated by
- // the FE are places where 'returned' is basically free and almost always a
- // performance win, so the second option can simply always be used for now.
- //
- // This should be revisited if 'returned' is ever applied more liberally.
- if (RetTy->isVoidTy() || HasLiveReturnedArg) {
- NRetTy = RetTy;
- } else {
- // Look at each of the original return values individually.
- for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
- RetOrArg Ret = CreateRet(F, Ri);
- if (LiveValues.erase(Ret)) {
- RetTypes.push_back(getRetComponentType(F, Ri));
- NewRetIdxs[Ri] = RetTypes.size() - 1;
- } else {
- ++NumRetValsEliminated;
- LLVM_DEBUG(
- dbgs() << "DeadArgumentEliminationPass - Removing return value "
- << Ri << " from " << F->getName() << "\n");
- }
- }
- if (RetTypes.size() > 1) {
- // More than one return type? Reduce it down to size.
- if (StructType *STy = dyn_cast<StructType>(RetTy)) {
- // Make the new struct packed if we used to return a packed struct
- // already.
- NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
- } else {
- assert(isa<ArrayType>(RetTy) && "unexpected multi-value return");
- NRetTy = ArrayType::get(RetTypes[0], RetTypes.size());
- }
- } else if (RetTypes.size() == 1)
- // One return type? Just a simple value then, but only if we didn't use to
- // return a struct with that simple value before.
- NRetTy = RetTypes.front();
- else if (RetTypes.empty())
- // No return types? Make it void, but only if we didn't use to return {}.
- NRetTy = Type::getVoidTy(F->getContext());
- }
-
- assert(NRetTy && "No new return type found?");
-
- // The existing function return attributes.
- AttrBuilder RAttrs(PAL.getRetAttributes());
-
- // Remove any incompatible attributes, but only if we removed all return
- // values. Otherwise, ensure that we don't have any conflicting attributes
- // here. Currently, this should not be possible, but special handling might be
- // required when new return value attributes are added.
- if (NRetTy->isVoidTy())
- RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
- else
- assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
- "Return attributes no longer compatible?");
-
- AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
-
- // Strip allocsize attributes. They might refer to the deleted arguments.
- AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute(
- F->getContext(), Attribute::AllocSize);
-
- // Reconstruct the AttributesList based on the vector we constructed.
- assert(ArgAttrVec.size() == Params.size());
- AttributeList NewPAL =
- AttributeList::get(F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
-
- // Create the new function type based on the recomputed parameters.
- FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
-
- // No change?
- if (NFTy == FTy)
- return false;
-
- // Create the new function body and insert it into the module...
- Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace());
- NF->copyAttributesFrom(F);
- NF->setComdat(F->getComdat());
- NF->setAttributes(NewPAL);
- // Insert the new function before the old function, so we won't be processing
- // it again.
- F->getParent()->getFunctionList().insert(F->getIterator(), NF);
- NF->takeName(F);
-
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in a smaller number of arguments into the new function.
- std::vector<Value*> Args;
- while (!F->use_empty()) {
- CallBase &CB = cast<CallBase>(*F->user_back());
-
- ArgAttrVec.clear();
- const AttributeList &CallPAL = CB.getAttributes();
-
- // Adjust the call return attributes in case the function was changed to
- // return void.
- AttrBuilder RAttrs(CallPAL.getRetAttributes());
- RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
- AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
-
- // Declare these outside of the loops, so we can reuse them for the second
- // loop, which loops over the varargs.
- auto I = CB.arg_begin();
- unsigned Pi = 0;
- // Loop over those operands, corresponding to the normal arguments to the
- // original function, and add those that are still alive.
- for (unsigned E = FTy->getNumParams(); Pi != E; ++I, ++Pi)
- if (ArgAlive[Pi]) {
- Args.push_back(*I);
- // Get original parameter attributes, but skip return attributes.
- AttributeSet Attrs = CallPAL.getParamAttributes(Pi);
- if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
- // If the return type has changed, then get rid of 'returned' on the
- // call site. The alternative is to make all 'returned' attributes on
- // call sites keep the return value alive just like 'returned'
- // attributes on function declarations, but it's less clearly a win and
- // this is not an expected case anyway.
- ArgAttrVec.push_back(AttributeSet::get(
- F->getContext(),
- AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
- } else {
- // Otherwise, use the original attributes.
- ArgAttrVec.push_back(Attrs);
- }
- }
-
- // Push any varargs arguments on the list. Don't forget their attributes.
- for (auto E = CB.arg_end(); I != E; ++I, ++Pi) {
- Args.push_back(*I);
- ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi));
- }
-
- // Reconstruct the AttributesList based on the vector we constructed.
- assert(ArgAttrVec.size() == Args.size());
-
- // Again, be sure to remove any allocsize attributes, since their indices
- // may now be incorrect.
- AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute(
- F->getContext(), Attribute::AllocSize);
-
- AttributeList NewCallPAL = AttributeList::get(
- F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB.getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCB = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", CB.getParent());
- } else {
- NewCB = CallInst::Create(NFTy, NF, Args, OpBundles, "", &CB);
- cast<CallInst>(NewCB)->setTailCallKind(
- cast<CallInst>(&CB)->getTailCallKind());
- }
- NewCB->setCallingConv(CB.getCallingConv());
- NewCB->setAttributes(NewCallPAL);
- NewCB->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
- Args.clear();
- ArgAttrVec.clear();
-
- if (!CB.use_empty() || CB.isUsedByMetadata()) {
- if (NewCB->getType() == CB.getType()) {
- // Return type not changed? Just replace users then.
- CB.replaceAllUsesWith(NewCB);
- NewCB->takeName(&CB);
- } else if (NewCB->getType()->isVoidTy()) {
- // If the return value is dead, replace any uses of it with undef
- // (any non-debug value uses will get removed later on).
- if (!CB.getType()->isX86_MMXTy())
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- } else {
- assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
- "Return type changed, but not into a void. The old return type"
- " must have been a struct or an array!");
- Instruction *InsertPt = &CB;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- BasicBlock *NewEdge =
- SplitEdge(NewCB->getParent(), II->getNormalDest());
- InsertPt = &*NewEdge->getFirstInsertionPt();
- }
-
- // We used to return a struct or array. Instead of doing smart stuff
- // with all the uses, we will just rebuild it using extract/insertvalue
- // chaining and let instcombine clean that up.
- //
- // Start out building up our return value from undef
- Value *RetVal = UndefValue::get(RetTy);
- for (unsigned Ri = 0; Ri != RetCount; ++Ri)
- if (NewRetIdxs[Ri] != -1) {
- Value *V;
- IRBuilder<NoFolder> IRB(InsertPt);
- if (RetTypes.size() > 1)
- // We are still returning a struct, so extract the value from our
- // return value
- V = IRB.CreateExtractValue(NewCB, NewRetIdxs[Ri], "newret");
- else
- // We are now returning a single element, so just insert that
- V = NewCB;
- // Insert the value at the old position
- RetVal = IRB.CreateInsertValue(RetVal, V, Ri, "oldret");
- }
- // Now, replace all uses of the old call instruction with the return
- // struct we built
- CB.replaceAllUsesWith(RetVal);
- NewCB->takeName(&CB);
- }
- }
-
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- CB.eraseFromParent();
- }
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
-
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments and transferring the names over as well.
- ArgI = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
- I2 = NF->arg_begin();
- I != E; ++I, ++ArgI)
- if (ArgAlive[ArgI]) {
- // If this is a live argument, move the name and users over to the new
- // version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- ++I2;
- } else {
- // If this argument is dead, replace any uses of it with undef
- // (any non-debug value uses will get removed later on).
- if (!I->getType()->isX86_MMXTy())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- }
-
- // If we change the return value of the function we must rewrite any return
- // instructions. Check this now.
- if (F->getReturnType() != NF->getReturnType())
- for (BasicBlock &BB : *NF)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
- IRBuilder<NoFolder> IRB(RI);
- Value *RetVal = nullptr;
-
- if (!NFTy->getReturnType()->isVoidTy()) {
- assert(RetTy->isStructTy() || RetTy->isArrayTy());
- // The original return value was a struct or array; insert
- // extractvalue/insertvalue chains to extract only the values we need
- // to return and insert them into our new result.
- // This does generate messy code, but we'll leave it to instcombine to
- // clean that up.
- Value *OldRet = RI->getOperand(0);
- // Start out building up our return value from undef
- RetVal = UndefValue::get(NRetTy);
- for (unsigned RetI = 0; RetI != RetCount; ++RetI)
- if (NewRetIdxs[RetI] != -1) {
- Value *EV = IRB.CreateExtractValue(OldRet, RetI, "oldret");
-
- if (RetTypes.size() > 1) {
- // We're still returning a struct, so reinsert the value into
- // our new return value at the new index
-
- RetVal = IRB.CreateInsertValue(RetVal, EV, NewRetIdxs[RetI],
- "newret");
- } else {
- // We are now only returning a simple value, so just return the
- // extracted value.
- RetVal = EV;
- }
- }
- }
- // Replace the return instruction with one returning the new return
- // value (possibly 0 if we became void).
- auto *NewRet = ReturnInst::Create(F->getContext(), RetVal, RI);
- NewRet->setDebugLoc(RI->getDebugLoc());
- BB.getInstList().erase(RI);
- }
-
- // Clone metadata from the old function, including the debug info descriptor.
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- F->getAllMetadata(MDs);
- for (auto MD : MDs)
- NF->addMetadata(MD.first, *MD.second);
-
- // Now that the old function is dead, delete it.
- F->eraseFromParent();
-
- return true;
-}
-
-PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
- ModuleAnalysisManager &) {
- bool Changed = false;
-
- // First pass: Do a simple check to see if any functions can have their "..."
- // removed. We can do this if they never call va_start. This loop cannot be
- // fused with the next loop, because deleting a function invalidates
- // information computed while surveying other functions.
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- Function &F = *I++;
- if (F.getFunctionType()->isVarArg())
- Changed |= DeleteDeadVarargs(F);
- }
-
- // Second phase: loop through the module, determining which arguments are live.
- // We assume all arguments are dead unless proven otherwise (allowing us to
- // determine that dead arguments passed into recursive functions are dead).
- //
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
- for (auto &F : M)
- SurveyFunction(F);
-
- // Now, remove all dead arguments and return values from each function in
- // turn.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- // Increment now, because the function will probably get removed (i.e.
- // replaced by a new one).
- Function *F = &*I++;
- Changed |= RemoveDeadStuffFromFunction(F);
- }
-
- // Finally, look for any unused parameters in functions with non-local
- // linkage and replace the passed in parameters with undef.
- for (auto &F : M)
- Changed |= RemoveDeadArgumentsFromCallers(F);
-
- if (!Changed)
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+ if (Arg.isUsedByMetadata()) {
+ Arg.replaceAllUsesWith(UndefValue::get(Arg.getType()));
+ Changed = true;
+ }
+ UnusedArgs.push_back(Arg.getArgNo());
+ }
+ }
+
+ if (UnusedArgs.empty())
+ return false;
+
+ for (Use &U : Fn.uses()) {
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB || !CB->isCallee(&U))
+ continue;
+
+ // Now go through all unused args and replace them with "undef".
+ for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) {
+ unsigned ArgNo = UnusedArgs[I];
+
+ Value *Arg = CB->getArgOperand(ArgNo);
+ CB->setArgOperand(ArgNo, UndefValue::get(Arg->getType()));
+ ++NumArgumentsReplacedWithUndef;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
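+
+// Illustrative sketch (an assumed example): for an exactly-defined function
+// whose parameter is never read,
+//
+//   define void @f(i32 %unused) { ret void }
+//   call void @f(i32 %x)
+//
+// the call site is rewritten to pass undef for the dead parameter,
+//
+//   call void @f(i32 undef)
+//
+// while the signature of @f itself is left unchanged.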
+
+/// Convenience function that returns the number of return values. It returns 0
+/// for void functions and 1 for functions not returning a struct or array. It
+/// returns the number of elements for functions returning a struct or array.
+static unsigned NumRetVals(const Function *F) {
+ Type *RetTy = F->getReturnType();
+ if (RetTy->isVoidTy())
+ return 0;
+ else if (StructType *STy = dyn_cast<StructType>(RetTy))
+ return STy->getNumElements();
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+ return ATy->getNumElements();
+ else
+ return 1;
+}
+
+/// Returns the sub-type a function will return at a given Idx. Should
+/// correspond to the result type of an ExtractValue instruction executed with
+/// just that one Idx (i.e. only top-level structure is considered).
+static Type *getRetComponentType(const Function *F, unsigned Idx) {
+ Type *RetTy = F->getReturnType();
+ assert(!RetTy->isVoidTy() && "void type has no subtype");
+
+ if (StructType *STy = dyn_cast<StructType>(RetTy))
+ return STy->getElementType(Idx);
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+ return ATy->getElementType();
+ else
+ return RetTy;
+}
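+
+// Illustrative example (assumed): for
+//
+//   declare { i32, float } @h()
+//
+// NumRetVals returns 2, getRetComponentType(F, 0) is i32 and
+// getRetComponentType(F, 1) is float; for a plain i32 return they yield 1 and
+// i32, and for a void return NumRetVals is 0.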
+
+/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
+/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
+/// liveness of Use.
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use,
+ UseVector &MaybeLiveUses) {
+ // We're live if our use or its Function is already marked as live.
+ if (IsLive(Use))
+ return Live;
+
+ // We're maybe live otherwise, but remember that we must become live if
+ // Use becomes live.
+ MaybeLiveUses.push_back(Use);
+ return MaybeLive;
+}
+
+/// SurveyUse - This looks at a single use of an argument or return value
+/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
+/// if it causes the used value to become MaybeLive.
+///
+/// RetValNum is the return value number to use when this use is used in a
+/// return instruction. This is used in the recursion; you should always leave
+/// it at 0.
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
+ unsigned RetValNum) {
+ const User *V = U->getUser();
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
+ // The value is returned from a function. It's only live when the
+ // function's return value is live. We use RetValNum here, for the case
+ // that U is really a use of an insertvalue instruction that uses the
+ // original Use.
+ const Function *F = RI->getParent()->getParent();
+ if (RetValNum != -1U) {
+ RetOrArg Use = CreateRet(F, RetValNum);
+ // We might be live, depending on the liveness of Use.
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ } else {
+ DeadArgumentEliminationPass::Liveness Result = MaybeLive;
+ for (unsigned Ri = 0; Ri < NumRetVals(F); ++Ri) {
+ RetOrArg Use = CreateRet(F, Ri);
+ // We might be live, depending on the liveness of Use. If any
+ // sub-value is live, then the entire value is considered live. This
+ // is a conservative choice, and better tracking is possible.
+ DeadArgumentEliminationPass::Liveness SubResult =
+ MarkIfNotLive(Use, MaybeLiveUses);
+ if (Result != Live)
+ Result = SubResult;
+ }
+ return Result;
+ }
+ }
+ if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
+ if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+ && IV->hasIndices())
+ // The use we are examining is inserted into an aggregate. Our liveness
+ // depends on all uses of that aggregate, but if it is used as a return
+ // value, only the index at which we were inserted counts.
+ RetValNum = *IV->idx_begin();
+
+ // Note that if we are used as the aggregate operand to the insertvalue,
+ // we don't change RetValNum, but do survey all our uses.
+
+ Liveness Result = MaybeLive;
+ for (const Use &UU : IV->uses()) {
+ Result = SurveyUse(&UU, MaybeLiveUses, RetValNum);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+ }
+
+ if (const auto *CB = dyn_cast<CallBase>(V)) {
+ const Function *F = CB->getCalledFunction();
+ if (F) {
+ // Used in a direct call.
+
+ // The function argument is live if it is used as a bundle operand.
+ if (CB->isBundleOperand(U))
+ return Live;
+
+ // Find the argument number. We know for sure that this use is an
+ // argument, since if it were the called function this would be an
+ // indirect call, and we know we can't be looking at a value of the
+ // label type (for the invoke instruction).
+ unsigned ArgNo = CB->getArgOperandNo(U);
+
+ if (ArgNo >= F->getFunctionType()->getNumParams())
+ // The value is passed in through a vararg! Must be live.
+ return Live;
+
+ assert(CB->getArgOperand(ArgNo) == CB->getOperand(U->getOperandNo()) &&
+ "Argument is not where we expected it");
+
+ // Value passed to a normal call. It's only live when the corresponding
+ // argument to the called function turns out live.
+ RetOrArg Use = CreateArg(F, ArgNo);
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ }
+ }
+ // Used in any other way? Value must be live.
+ return Live;
+}
+
+/// SurveyUses - This looks at all the uses of the given value and returns the
+/// Liveness deduced from those uses.
+///
+/// Adds all uses that cause the result to be MaybeLive to MaybeLiveUses. If
+/// the result is Live, MaybeLiveUses might be modified but its content should
+/// be ignored (since it might not be complete).
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUses(const Value *V,
+ UseVector &MaybeLiveUses) {
+ // Assume it's dead (which will only hold if there are no uses at all).
+ Liveness Result = MaybeLive;
+ // Check each use.
+ for (const Use &U : V->uses()) {
+ Result = SurveyUse(&U, MaybeLiveUses);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+}
+
+// SurveyFunction - This performs the initial survey of the specified function,
+// checking whether it uses any of its incoming arguments or whether
+// any callers use the return value. This fills in the LiveValues set and Uses
+// map.
+//
+// We consider arguments of non-internal functions to be intrinsically alive as
+// well as arguments to functions which have their "address taken".
+void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
+ // Functions with inalloca/preallocated parameters are expecting args in a
+ // particular register and memory layout.
+ if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+ MarkLive(F);
+ return;
+ }
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (F.hasFnAttribute(Attribute::Naked)) {
+ MarkLive(F);
+ return;
+ }
+
+ unsigned RetCount = NumRetVals(&F);
+
+ // Assume all return values are dead
+ using RetVals = SmallVector<Liveness, 5>;
+
+ RetVals RetValLiveness(RetCount, MaybeLive);
+
+ using RetUses = SmallVector<UseVector, 5>;
+
+ // These vectors map each return value to the uses that make it MaybeLive, so
+ // we can add those to the Uses map if the return value really turns out to be
+ // MaybeLive. Initialized to a list of RetCount empty lists.
+ RetUses MaybeLiveRetUses(RetCount);
+
+ bool HasMustTailCalls = false;
+
+ for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
+ != F.getFunctionType()->getReturnType()) {
+ // We don't support old style multiple return values.
+ MarkLive(F);
+ return;
+ }
+ }
+
+ // If we have any returns of `musttail` results, the signature can't
+ // change.
+ if (BB->getTerminatingMustTailCall() != nullptr)
+ HasMustTailCalls = true;
+ }
+
+ if (HasMustTailCalls) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail calls\n");
+ }
+
+ if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
+ MarkLive(F);
+ return;
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
+ << F.getName() << "\n");
+ // Keep track of the number of live retvals, so we can skip checks once all
+ // of them turn out to be live.
+ unsigned NumLiveRetVals = 0;
+
+ bool HasMustTailCallers = false;
+
+ // Loop all uses of the function.
+ for (const Use &U : F.uses()) {
+ // If the function is PASSED IN as an argument, its address has been
+ // taken.
+ const auto *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB || !CB->isCallee(&U)) {
+ MarkLive(F);
+ return;
+ }
+
+ // The number of arguments for a `musttail` call must match the number of
+ // arguments of the caller.
+ if (CB->isMustTailCall())
+ HasMustTailCallers = true;
+
+ // If we end up here, we are looking at a direct call to our function.
+
+ // Now, check how our return value(s) is/are used in this caller. Don't
+ // bother checking return values if all of them are live already.
+ if (NumLiveRetVals == RetCount)
+ continue;
+
+ // Check all uses of the return value.
+ for (const Use &U : CB->uses()) {
+ if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) {
+ // This use uses a part of our return value; survey the uses of
+ // that part and store the results for this index only.
+ unsigned Idx = *Ext->idx_begin();
+ if (RetValLiveness[Idx] != Live) {
+ RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
+ if (RetValLiveness[Idx] == Live)
+ NumLiveRetVals++;
+ }
+ } else {
+ // Used by something other than extractvalue. Survey, but assume that the
+ // result applies to all sub-values.
+ UseVector MaybeLiveAggregateUses;
+ if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) {
+ NumLiveRetVals = RetCount;
+ RetValLiveness.assign(RetCount, Live);
+ break;
+ } else {
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
+ if (RetValLiveness[Ri] != Live)
+ MaybeLiveRetUses[Ri].append(MaybeLiveAggregateUses.begin(),
+ MaybeLiveAggregateUses.end());
+ }
+ }
+ }
+ }
+ }
+
+ if (HasMustTailCallers) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail callers\n");
+ }
+
+ // Now we've inspected all callers, record the liveness of our return values.
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri)
+ MarkValue(CreateRet(&F, Ri), RetValLiveness[Ri], MaybeLiveRetUses[Ri]);
+
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
+ << F.getName() << "\n");
+
+ // Now, check all of our arguments.
+ unsigned ArgI = 0;
+ UseVector MaybeLiveArgUses;
+ for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end();
+ AI != E; ++AI, ++ArgI) {
+ Liveness Result;
+ if (F.getFunctionType()->isVarArg() || HasMustTailCallers ||
+ HasMustTailCalls) {
+ // Variadic functions will already have a va_arg function expanded inside
+ // them, making them potentially very sensitive to ABI changes resulting
+ // from removing arguments entirely, so don't. For example, AArch64 handles
+ // register and stack HFAs very differently, and this is reflected in the
+ // IR which has already been generated.
+ //
+ // `musttail` calls to this function restrict argument removal attempts.
+ // The signature of the caller must match the signature of the function.
+ //
+ // `musttail` calls in this function prevent us from changing its
+ // signature.
+ Result = Live;
+ } else {
+ // See what the effect of this use is (recording any uses that cause
+ // MaybeLive in MaybeLiveArgUses).
+ Result = SurveyUses(&*AI, MaybeLiveArgUses);
+ }
+
+ // Mark the result.
+ MarkValue(CreateArg(&F, ArgI), Result, MaybeLiveArgUses);
+ // Clear the vector again for the next iteration.
+ MaybeLiveArgUses.clear();
+ }
+}
+
+/// MarkValue - This function marks the liveness of RA depending on L. If L is
+/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
+/// such that RA will be marked live if any use in MaybeLiveUses gets marked
+/// live later on.
+void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
+ const UseVector &MaybeLiveUses) {
+ switch (L) {
+ case Live:
+ MarkLive(RA);
+ break;
+ case MaybeLive:
+ assert(!IsLive(RA) && "Use is already live!");
+ for (const auto &MaybeLiveUse : MaybeLiveUses) {
+ if (IsLive(MaybeLiveUse)) {
+ // A use is live, so this value is live.
+ MarkLive(RA);
+ break;
+ } else {
+ // Note any uses of this value, so this value can be
+ // marked live whenever one of the uses becomes live.
+ Uses.insert(std::make_pair(MaybeLiveUse, RA));
+ }
+ }
+ break;
+ }
+}
+
+/// MarkLive - Mark the given Function as alive, meaning that it cannot be
+/// changed in any way. Additionally,
+/// mark any values that are used as this function's parameters or by its return
+/// values (according to Uses) live as well.
+void DeadArgumentEliminationPass::MarkLive(const Function &F) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
+ << F.getName() << "\n");
+ // Mark the function as live.
+ LiveFunctions.insert(&F);
+ // Mark all arguments as live.
+ for (unsigned ArgI = 0, E = F.arg_size(); ArgI != E; ++ArgI)
+ PropagateLiveness(CreateArg(&F, ArgI));
+ // Mark all return values as live.
+ for (unsigned Ri = 0, E = NumRetVals(&F); Ri != E; ++Ri)
+ PropagateLiveness(CreateRet(&F, Ri));
+}
+
+/// MarkLive - Mark the given return value or argument as live. Additionally,
+/// mark any values that are used by this value (according to Uses) live as
+/// well.
+void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
+ if (IsLive(RA))
+ return; // Already marked Live.
+
+ LiveValues.insert(RA);
+
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
+ << RA.getDescription() << " live\n");
+ PropagateLiveness(RA);
+}
+
+bool DeadArgumentEliminationPass::IsLive(const RetOrArg &RA) {
+ return LiveFunctions.count(RA.F) || LiveValues.count(RA);
+}
+
+/// PropagateLiveness - Given that RA is a live value, propagate its liveness
+/// to any other values it uses (according to Uses).
+void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) {
+ // We don't use upper_bound (or equal_range) here, because our recursive call
+ // to ourselves is likely to cause the upper_bound (which is the first value
+ // not belonging to RA) to become erased and the iterator invalidated.
+ UseMap::iterator Begin = Uses.lower_bound(RA);
+ UseMap::iterator E = Uses.end();
+ UseMap::iterator I;
+ for (I = Begin; I != E && I->first == RA; ++I)
+ MarkLive(I->second);
+
+ // Erase RA from the Uses map (from the lower bound to wherever we ended up
+ // after the loop).
+ Uses.erase(Begin, I);
+}
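+
+// Illustrative sketch (an assumed example): if argument 0 of @g is only ever
+// passed as argument 1 of @f, the survey records the pair
+// (arg 1 of @f -> arg 0 of @g) in Uses; if arg 1 of @f is later marked live,
+// PropagateLiveness visits that entry and MarkLive makes arg 0 of @g live as
+// well.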
+
+// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
+// that are not in LiveValues. Transform the function and all of the callers of
+// the function to not have these arguments and return values.
+//
+bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
+ // Don't modify fully live functions
+ if (LiveFunctions.count(F))
+ return false;
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has fewer arguments and a different return type.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type*> Params;
+
+ // Keep track of whether we have a live 'returned' argument.
+ bool HasLiveReturnedArg = false;
+
+ // Set up to build a new list of parameter attributes.
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ const AttributeList &PAL = F->getAttributes();
+
+ // Remember which arguments are still alive.
+ SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+ // Construct the new parameter list from non-dead arguments. Also construct
+ // a new set of parameter attributes to correspond. Skip the first parameter
+ // attribute, since that belongs to the return value.
+ unsigned ArgI = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgI) {
+ RetOrArg Arg = CreateArg(F, ArgI);
+ if (LiveValues.erase(Arg)) {
+ Params.push_back(I->getType());
+ ArgAlive[ArgI] = true;
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgI));
+ HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned);
+ } else {
+ ++NumArgumentsEliminated;
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
+ << ArgI << " (" << I->getName() << ") from "
+ << F->getName() << "\n");
+ }
+ }
+
+ // Find out the new return value.
+ Type *RetTy = FTy->getReturnType();
+ Type *NRetTy = nullptr;
+ unsigned RetCount = NumRetVals(F);
+
+ // -1 means unused, other numbers are the new index
+ SmallVector<int, 5> NewRetIdxs(RetCount, -1);
+ std::vector<Type*> RetTypes;
+
+ // If there is a function with a live 'returned' argument but a dead return
+ // value, then there are two possible actions:
+ // 1) Eliminate the return value and take off the 'returned' attribute on the
+ // argument.
+ // 2) Retain the 'returned' attribute and treat the return value (but not the
+ // entire function) as live so that it is not eliminated.
+ //
+ // It's not clear in the general case which option is more profitable because,
+ // even in the absence of explicit uses of the return value, code generation
+ // is free to use the 'returned' attribute to do things like eliding
+ // save/restores of registers across calls. Whether or not this happens is
+ // target and ABI-specific as well as depending on the amount of register
+ // pressure, so there's no good way for an IR-level pass to figure this out.
+ //
+ // Fortunately, the only places where 'returned' is currently generated by
+ // the FE are places where 'returned' is basically free and almost always a
+ // performance win, so the second option can just be used always for now.
+ //
+ // This should be revisited if 'returned' is ever applied more liberally.
+ if (RetTy->isVoidTy() || HasLiveReturnedArg) {
+ NRetTy = RetTy;
+ } else {
+ // Look at each of the original return values individually.
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
+ RetOrArg Ret = CreateRet(F, Ri);
+ if (LiveValues.erase(Ret)) {
+ RetTypes.push_back(getRetComponentType(F, Ri));
+ NewRetIdxs[Ri] = RetTypes.size() - 1;
+ } else {
+ ++NumRetValsEliminated;
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Removing return value "
+ << Ri << " from " << F->getName() << "\n");
+ }
+ }
+ if (RetTypes.size() > 1) {
+ // More than one return type? Reduce it down to size.
+ if (StructType *STy = dyn_cast<StructType>(RetTy)) {
+ // Make the new struct packed if we used to return a packed struct
+ // already.
+ NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
+ } else {
+ assert(isa<ArrayType>(RetTy) && "unexpected multi-value return");
+ NRetTy = ArrayType::get(RetTypes[0], RetTypes.size());
+ }
+ } else if (RetTypes.size() == 1)
+ // One return type? Just a simple value then, but only if we didn't use to
+ // return a struct with that simple value before.
+ NRetTy = RetTypes.front();
+ else if (RetTypes.empty())
+ // No return types? Make it void, but only if we didn't use to return {}.
+ NRetTy = Type::getVoidTy(F->getContext());
+ }
+
+ assert(NRetTy && "No new return type found?");
+
+ // The existing function return attributes.
+ AttrBuilder RAttrs(PAL.getRetAttributes());
+
+ // Remove any incompatible attributes, but only if we removed all return
+ // values. Otherwise, ensure that we don't have any conflicting attributes
+ // here. Currently, this should not be possible, but special handling might be
+ // required when new return value attributes are added.
+ if (NRetTy->isVoidTy())
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ else
+ assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
+ "Return attributes no longer compatible?");
+
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+
+ // Strip allocsize attributes. They might refer to the deleted arguments.
+ AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ assert(ArgAttrVec.size() == Params.size());
+ AttributeList NewPAL =
+ AttributeList::get(F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
+
+ // Create the new function type based on the recomputed parameters.
+ FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
+
+ // No change?
+ if (NFTy == FTy)
+ return false;
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace());
+ NF->copyAttributesFrom(F);
+ NF->setComdat(F->getComdat());
+ NF->setAttributes(NewPAL);
+ // Insert the new function before the old function, so we won't be processing
+ // it again.
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass a smaller number of arguments to the new function.
+ std::vector<Value*> Args;
+ while (!F->use_empty()) {
+ CallBase &CB = cast<CallBase>(*F->user_back());
+
+ ArgAttrVec.clear();
+ const AttributeList &CallPAL = CB.getAttributes();
+
+ // Adjust the call return attributes in case the function was changed to
+ // return void.
+ AttrBuilder RAttrs(CallPAL.getRetAttributes());
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+
+ // Declare these outside of the loops, so we can reuse them for the second
+ // loop, which loops over the varargs.
+ auto I = CB.arg_begin();
+ unsigned Pi = 0;
+ // Loop over those operands, corresponding to the normal arguments to the
+ // original function, and add those that are still alive.
+ for (unsigned E = FTy->getNumParams(); Pi != E; ++I, ++Pi)
+ if (ArgAlive[Pi]) {
+ Args.push_back(*I);
+ // Get original parameter attributes, but skip return attributes.
+ AttributeSet Attrs = CallPAL.getParamAttributes(Pi);
+ if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
+ // If the return type has changed, then get rid of 'returned' on the
+ // call site. The alternative is to make all 'returned' attributes on
+ // call sites keep the return value alive, just like 'returned'
+ // attributes on function declarations, but that is less clearly a win
+ // and this is not an expected case anyway.
+ ArgAttrVec.push_back(AttributeSet::get(
+ F->getContext(),
+ AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+ } else {
+ // Otherwise, use the original attributes.
+ ArgAttrVec.push_back(Attrs);
+ }
+ }
+
+ // Push any varargs arguments on the list. Don't forget their attributes.
+ for (auto E = CB.arg_end(); I != E; ++I, ++Pi) {
+ Args.push_back(*I);
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi));
+ }
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ assert(ArgAttrVec.size() == Args.size());
+
+ // Again, be sure to remove any allocsize attributes, since their indices
+ // may now be incorrect.
+ AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
+ AttributeList NewCallPAL = AttributeList::get(
+ F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB.getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCB = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", CB.getParent());
+ } else {
+ NewCB = CallInst::Create(NFTy, NF, Args, OpBundles, "", &CB);
+ cast<CallInst>(NewCB)->setTailCallKind(
+ cast<CallInst>(&CB)->getTailCallKind());
+ }
+ NewCB->setCallingConv(CB.getCallingConv());
+ NewCB->setAttributes(NewCallPAL);
+ NewCB->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+ Args.clear();
+ ArgAttrVec.clear();
+
+ if (!CB.use_empty() || CB.isUsedByMetadata()) {
+ if (NewCB->getType() == CB.getType()) {
+ // Return type not changed? Just replace users then.
+ CB.replaceAllUsesWith(NewCB);
+ NewCB->takeName(&CB);
+ } else if (NewCB->getType()->isVoidTy()) {
+ // If the return value is dead, replace any uses of it with undef
+ // (any non-debug value uses will get removed later on).
+ if (!CB.getType()->isX86_MMXTy())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ } else {
+ assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
+ "Return type changed, but not into a void. The old return type"
+ " must have been a struct or an array!");
+ Instruction *InsertPt = &CB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ BasicBlock *NewEdge =
+ SplitEdge(NewCB->getParent(), II->getNormalDest());
+ InsertPt = &*NewEdge->getFirstInsertionPt();
+ }
+
+ // We used to return a struct or array. Instead of doing smart stuff
+ // with all the uses, we will just rebuild it using extract/insertvalue
+ // chaining and let instcombine clean that up.
+ //
+ // Start out building up our return value from undef
+ Value *RetVal = UndefValue::get(RetTy);
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri)
+ if (NewRetIdxs[Ri] != -1) {
+ Value *V;
+ IRBuilder<NoFolder> IRB(InsertPt);
+ if (RetTypes.size() > 1)
+ // We are still returning a struct, so extract the value from our
+ // return value
+ V = IRB.CreateExtractValue(NewCB, NewRetIdxs[Ri], "newret");
+ else
+ // We are now returning a single element, so just insert that
+ V = NewCB;
+ // Insert the value at the old position
+ RetVal = IRB.CreateInsertValue(RetVal, V, Ri, "oldret");
+ }
+ // Now, replace all uses of the old call instruction with the return
+ // struct we built
+ CB.replaceAllUsesWith(RetVal);
+ NewCB->takeName(&CB);
+ }
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ CB.eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over
+ // to the new arguments, and transferring the names over as well.
+ ArgI = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin();
+ I != E; ++I, ++ArgI)
+ if (ArgAlive[ArgI]) {
+ // If this is a live argument, move the name and users over to the new
+ // version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ } else {
+ // If this argument is dead, replace any uses of it with undef
+ // (any non-debug value uses will get removed later on).
+ if (!I->getType()->isX86_MMXTy())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+
+ // If we change the return value of the function we must rewrite any return
+ // instructions. Check this now.
+ if (F->getReturnType() != NF->getReturnType())
+ for (BasicBlock &BB : *NF)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ IRBuilder<NoFolder> IRB(RI);
+ Value *RetVal = nullptr;
+
+ if (!NFTy->getReturnType()->isVoidTy()) {
+ assert(RetTy->isStructTy() || RetTy->isArrayTy());
+ // The original return value was a struct or array, insert
+ // extractvalue/insertvalue chains to extract only the values we need
+ // to return and insert them into our new result.
+ // This does generate messy code, but we'll leave it to instcombine to
+ // clean that up.
+ Value *OldRet = RI->getOperand(0);
+ // Start out building up our return value from undef
+ RetVal = UndefValue::get(NRetTy);
+ for (unsigned RetI = 0; RetI != RetCount; ++RetI)
+ if (NewRetIdxs[RetI] != -1) {
+ Value *EV = IRB.CreateExtractValue(OldRet, RetI, "oldret");
+
+ if (RetTypes.size() > 1) {
+ // We're still returning a struct, so reinsert the value into
+ // our new return value at the new index
+
+ RetVal = IRB.CreateInsertValue(RetVal, EV, NewRetIdxs[RetI],
+ "newret");
+ } else {
+ // We are now only returning a simple value, so just return the
+ // extracted value.
+ RetVal = EV;
+ }
+ }
+ }
+ // Replace the return instruction with one returning the new return
+ // value (possibly 0 if we became void).
+ auto *NewRet = ReturnInst::Create(F->getContext(), RetVal, RI);
+ NewRet->setDebugLoc(RI->getDebugLoc());
+ BB.getInstList().erase(RI);
+ }
+
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F->getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
+
+ // Now that the old function is dead, delete it.
+ F->eraseFromParent();
+
+ return true;
+}
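
The extractvalue/insertvalue rebuilding used above (start from undef, re-insert only the surviving components) can be seen in isolation in the following sketch. It builds a throwaway function with a hypothetical name in a fresh module; it illustrates the idiom, it is not part of the pass:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("dae-demo", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);
  // The "old" return type had two elements; pretend only element 1 survived.
  StructType *OldRetTy = StructType::get(Ctx, {I32, I32});

  FunctionType *FTy = FunctionType::get(OldRetTy, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "dae_demo", &M);
  IRBuilder<NoFolder> IRB(BasicBlock::Create(Ctx, "entry", F));

  // Stand-in for the value produced by the slimmed-down call.
  Value *NewRet = ConstantInt::get(I32, 42);

  // Rebuild the old aggregate from undef; dead slots simply stay undef.
  Value *RetVal = UndefValue::get(OldRetTy);
  RetVal = IRB.CreateInsertValue(RetVal, NewRet, 1, "oldret");
  IRB.CreateRet(RetVal);

  verifyFunction(*F, &errs());
  F->print(outs());
  return 0;
}
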
+
+PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ bool Changed = false;
+
+ // First pass: Do a simple check to see if any functions can have their "..."
+ // removed. We can do this if they never call va_start. This loop cannot be
+ // fused with the next loop, because deleting a function invalidates
+ // information computed while surveying other functions.
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function &F = *I++;
+ if (F.getFunctionType()->isVarArg())
+ Changed |= DeleteDeadVarargs(F);
+ }
+
+ // Second phase: loop through the module, determining which arguments are
+ // live. We assume all arguments are dead unless proven otherwise (allowing us
+ // to determine that dead arguments passed into recursive functions are dead).
+ //
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
+ for (auto &F : M)
+ SurveyFunction(F);
+
+ // Now, remove all dead arguments and return values from each function in
+ // turn.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ // Increment now, because the function will probably get removed (i.e.,
+ // replaced by a new one).
+ Function *F = &*I++;
+ Changed |= RemoveDeadStuffFromFunction(F);
+ }
+
+ // Finally, look for any unused parameters in functions with non-local
+ // linkage and replace the passed in parameters with undef.
+ for (auto &F : M)
+ Changed |= RemoveDeadArgumentsFromCallers(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
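
As a usage sketch (not part of this file), the pass can be driven through the new pass manager roughly as follows; the helper name and the assumption that M is an already-parsed, valid module are hypothetical:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"

using namespace llvm;

// Run DeadArgumentEliminationPass on M with a freshly built analysis stack.
void runDeadArgElim(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(DeadArgumentEliminationPass());
  MPM.run(M, MAM);
}
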
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp
index 1fc6114af3..7f138d206f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -1,102 +1,102 @@
-//===- ElimAvailExtern.cpp - DCE unreachable internal functions -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transform is designed to eliminate available external global
-// definitions from the program, turning them into declarations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ElimAvailExtern.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "elim-avail-extern"
-
-STATISTIC(NumFunctions, "Number of functions removed");
-STATISTIC(NumVariables, "Number of global variables removed");
-
-static bool eliminateAvailableExternally(Module &M) {
- bool Changed = false;
-
- // Drop initializers of available externally global variables.
- for (GlobalVariable &GV : M.globals()) {
- if (!GV.hasAvailableExternallyLinkage())
- continue;
- if (GV.hasInitializer()) {
- Constant *Init = GV.getInitializer();
- GV.setInitializer(nullptr);
- if (isSafeToDestroyConstant(Init))
- Init->destroyConstant();
- }
- GV.removeDeadConstantUsers();
- GV.setLinkage(GlobalValue::ExternalLinkage);
- NumVariables++;
- Changed = true;
- }
-
- // Drop the bodies of available externally functions.
- for (Function &F : M) {
- if (!F.hasAvailableExternallyLinkage())
- continue;
- if (!F.isDeclaration())
- // This will set the linkage to external
- F.deleteBody();
- F.removeDeadConstantUsers();
- NumFunctions++;
- Changed = true;
- }
-
- return Changed;
-}
-
-PreservedAnalyses
-EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) {
- if (!eliminateAvailableExternally(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-struct EliminateAvailableExternallyLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- EliminateAvailableExternallyLegacyPass() : ModulePass(ID) {
- initializeEliminateAvailableExternallyLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- // run - Do the EliminateAvailableExternally pass on the specified module,
- // optionally updating the specified callgraph to reflect the changes.
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- return eliminateAvailableExternally(M);
- }
-};
-
-} // end anonymous namespace
-
-char EliminateAvailableExternallyLegacyPass::ID = 0;
-
-INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern",
- "Eliminate Available Externally Globals", false, false)
-
-ModulePass *llvm::createEliminateAvailableExternallyPass() {
- return new EliminateAvailableExternallyLegacyPass();
-}
+//===- ElimAvailExtern.cpp - Convert available_externally to decls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate available_externally global
+// definitions from the program, turning them into declarations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ElimAvailExtern.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "elim-avail-extern"
+
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+static bool eliminateAvailableExternally(Module &M) {
+ bool Changed = false;
+
+ // Drop initializers of available externally global variables.
+ for (GlobalVariable &GV : M.globals()) {
+ if (!GV.hasAvailableExternallyLinkage())
+ continue;
+ if (GV.hasInitializer()) {
+ Constant *Init = GV.getInitializer();
+ GV.setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ GV.removeDeadConstantUsers();
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ NumVariables++;
+ Changed = true;
+ }
+
+ // Drop the bodies of available externally functions.
+ for (Function &F : M) {
+ if (!F.hasAvailableExternallyLinkage())
+ continue;
+ if (!F.isDeclaration())
+ // This will set the linkage to external
+ F.deleteBody();
+ F.removeDeadConstantUsers();
+ NumFunctions++;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) {
+ if (!eliminateAvailableExternally(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+struct EliminateAvailableExternallyLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ EliminateAvailableExternallyLegacyPass() : ModulePass(ID) {
+ initializeEliminateAvailableExternallyLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ // run - Do the EliminateAvailableExternally pass on the specified module,
+ // optionally updating the specified callgraph to reflect the changes.
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ return eliminateAvailableExternally(M);
+ }
+};
+
+} // end anonymous namespace
+
+char EliminateAvailableExternallyLegacyPass::ID = 0;
+
+INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern",
+ "Eliminate Available Externally Globals", false, false)
+
+ModulePass *llvm::createEliminateAvailableExternallyPass() {
+ return new EliminateAvailableExternallyLegacyPass();
+}
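
For reference, a minimal sketch of invoking this pass programmatically through the legacy pass manager; the wrapper function name is hypothetical and M is assumed to be a valid module:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

using namespace llvm;

// Turn every available_externally definition in M into a bare declaration.
void dropAvailableExternally(Module &M) {
  legacy::PassManager PM;
  PM.add(createEliminateAvailableExternallyPass());
  PM.run(M);
}
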
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp
index 2958fb0308..b45766a8e7 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp
@@ -1,164 +1,164 @@
-//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass extracts global values
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SetVector.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-#include <algorithm>
-using namespace llvm;
-
-/// Make sure GV is visible from both modules. Delete is true if it is
-/// being deleted from this module.
-/// This also makes sure GV cannot be dropped so that references from
-/// the split module remain valid.
-static void makeVisible(GlobalValue &GV, bool Delete) {
- bool Local = GV.hasLocalLinkage();
- if (Local || Delete) {
- GV.setLinkage(GlobalValue::ExternalLinkage);
- if (Local)
- GV.setVisibility(GlobalValue::HiddenVisibility);
- return;
- }
-
- if (!GV.hasLinkOnceLinkage()) {
- assert(!GV.isDiscardableIfUnused());
- return;
- }
-
- // Map linkonce* to weak* so that llvm doesn't drop this GV.
- switch(GV.getLinkage()) {
- default:
- llvm_unreachable("Unexpected linkage");
- case GlobalValue::LinkOnceAnyLinkage:
- GV.setLinkage(GlobalValue::WeakAnyLinkage);
- return;
- case GlobalValue::LinkOnceODRLinkage:
- GV.setLinkage(GlobalValue::WeakODRLinkage);
- return;
- }
-}
-
-namespace {
- /// A pass to extract specific global values and their dependencies.
- class GVExtractorPass : public ModulePass {
- SetVector<GlobalValue *> Named;
- bool deleteStuff;
- bool keepConstInit;
- public:
- static char ID; // Pass identification, replacement for typeid
-
- /// If deleteS is true, this pass deletes the specified global values.
- /// Otherwise, it deletes as much of the module as possible, except for the
- /// global values specified.
- explicit GVExtractorPass(std::vector<GlobalValue*> &GVs,
- bool deleteS = true, bool keepConstInit = false)
- : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS),
- keepConstInit(keepConstInit) {}
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- // Visit the global inline asm.
- if (!deleteStuff)
- M.setModuleInlineAsm("");
-
- // For simplicity, just give all GlobalValues ExternalLinkage. A trickier
- // implementation could figure out which GlobalValues are actually
- // referenced by the Named set, and which GlobalValues in the rest of
- // the module are referenced by the NamedSet, and get away with leaving
- // more internal and private things internal and private. But for now,
- // be conservative and simple.
-
- // Visit the GlobalVariables.
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
- bool Delete =
- deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration() &&
- (!I->isConstant() || !keepConstInit);
- if (!Delete) {
- if (I->hasAvailableExternallyLinkage())
- continue;
- if (I->getName() == "llvm.global_ctors")
- continue;
- }
-
- makeVisible(*I, Delete);
-
- if (Delete) {
- // Make this a declaration and drop it's comdat.
- I->setInitializer(nullptr);
- I->setComdat(nullptr);
- }
- }
-
- // Visit the Functions.
- for (Function &F : M) {
- bool Delete =
- deleteStuff == (bool)Named.count(&F) && !F.isDeclaration();
- if (!Delete) {
- if (F.hasAvailableExternallyLinkage())
- continue;
- }
-
- makeVisible(F, Delete);
-
- if (Delete) {
- // Make this a declaration and drop it's comdat.
- F.deleteBody();
- F.setComdat(nullptr);
- }
- }
-
- // Visit the Aliases.
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E;) {
- Module::alias_iterator CurI = I;
- ++I;
-
- bool Delete = deleteStuff == (bool)Named.count(&*CurI);
- makeVisible(*CurI, Delete);
-
- if (Delete) {
- Type *Ty = CurI->getValueType();
-
- CurI->removeFromParent();
- llvm::Value *Declaration;
- if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
- Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
- CurI->getAddressSpace(),
- CurI->getName(), &M);
-
- } else {
- Declaration =
- new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
- nullptr, CurI->getName());
-
- }
- CurI->replaceAllUsesWith(Declaration);
- delete &*CurI;
- }
- }
-
- return true;
- }
- };
-
- char GVExtractorPass::ID = 0;
-}
-
-ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue *> &GVs,
- bool deleteFn, bool keepConstInit) {
- return new GVExtractorPass(GVs, deleteFn, keepConstInit);
-}
+//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts global values
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include <algorithm>
+using namespace llvm;
+
+/// Make sure GV is visible from both modules. Delete is true if it is
+/// being deleted from this module.
+/// This also makes sure GV cannot be dropped so that references from
+/// the split module remain valid.
+static void makeVisible(GlobalValue &GV, bool Delete) {
+ bool Local = GV.hasLocalLinkage();
+ if (Local || Delete) {
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ if (Local)
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+ return;
+ }
+
+ if (!GV.hasLinkOnceLinkage()) {
+ assert(!GV.isDiscardableIfUnused());
+ return;
+ }
+
+ // Map linkonce* to weak* so that llvm doesn't drop this GV.
+ switch(GV.getLinkage()) {
+ default:
+ llvm_unreachable("Unexpected linkage");
+ case GlobalValue::LinkOnceAnyLinkage:
+ GV.setLinkage(GlobalValue::WeakAnyLinkage);
+ return;
+ case GlobalValue::LinkOnceODRLinkage:
+ GV.setLinkage(GlobalValue::WeakODRLinkage);
+ return;
+ }
+}
+
+namespace {
+ /// A pass to extract specific global values and their dependencies.
+ class GVExtractorPass : public ModulePass {
+ SetVector<GlobalValue *> Named;
+ bool deleteStuff;
+ bool keepConstInit;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ /// If deleteS is true, this pass deletes the specified global values.
+ /// Otherwise, it deletes as much of the module as possible, except for the
+ /// global values specified.
+ explicit GVExtractorPass(std::vector<GlobalValue*> &GVs,
+ bool deleteS = true, bool keepConstInit = false)
+ : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS),
+ keepConstInit(keepConstInit) {}
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // Visit the global inline asm.
+ if (!deleteStuff)
+ M.setModuleInlineAsm("");
+
+ // For simplicity, just give all GlobalValues ExternalLinkage. A trickier
+ // implementation could figure out which GlobalValues are actually
+ // referenced by the Named set, and which GlobalValues in the rest of
+ // the module are referenced by the NamedSet, and get away with leaving
+ // more internal and private things internal and private. But for now,
+ // be conservative and simple.
+
+ // Visit the GlobalVariables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ bool Delete =
+ deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration() &&
+ (!I->isConstant() || !keepConstInit);
+ if (!Delete) {
+ if (I->hasAvailableExternallyLinkage())
+ continue;
+ if (I->getName() == "llvm.global_ctors")
+ continue;
+ }
+
+ makeVisible(*I, Delete);
+
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
+ I->setInitializer(nullptr);
+ I->setComdat(nullptr);
+ }
+ }
+
+ // Visit the Functions.
+ for (Function &F : M) {
+ bool Delete =
+ deleteStuff == (bool)Named.count(&F) && !F.isDeclaration();
+ if (!Delete) {
+ if (F.hasAvailableExternallyLinkage())
+ continue;
+ }
+
+ makeVisible(F, Delete);
+
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
+ F.deleteBody();
+ F.setComdat(nullptr);
+ }
+ }
+
+ // Visit the Aliases.
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ Module::alias_iterator CurI = I;
+ ++I;
+
+ bool Delete = deleteStuff == (bool)Named.count(&*CurI);
+ makeVisible(*CurI, Delete);
+
+ if (Delete) {
+ Type *Ty = CurI->getValueType();
+
+ CurI->removeFromParent();
+ llvm::Value *Declaration;
+ if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
+ Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
+ CurI->getAddressSpace(),
+ CurI->getName(), &M);
+
+ } else {
+ Declaration =
+ new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, CurI->getName());
+
+ }
+ CurI->replaceAllUsesWith(Declaration);
+ delete &*CurI;
+ }
+ }
+
+ return true;
+ }
+ };
+
+ char GVExtractorPass::ID = 0;
+}
+
+ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue *> &GVs,
+ bool deleteFn, bool keepConstInit) {
+ return new GVExtractorPass(GVs, deleteFn, keepConstInit);
+}
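
A small usage sketch for the factory function above (roughly what llvm-extract does): keep a single named function and strip the rest of the module. The helper name is hypothetical; the factory signature is the one declared in llvm/Transforms/IPO.h:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include <vector>

using namespace llvm;

// Keep only the named function (plus whatever it references); delete the rest.
bool extractSingleFunction(Module &M, StringRef Name) {
  GlobalValue *GV = M.getFunction(Name);
  if (!GV)
    return false;
  std::vector<GlobalValue *> Keep{GV};
  legacy::PassManager PM;
  // deleteFn = false: the listed values are kept, everything else is dropped.
  PM.add(createGVExtractionPass(Keep, /*deleteFn=*/false));
  PM.run(M);
  return true;
}
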
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index f0aa837e30..1a8bb225a6 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -1,31 +1,31 @@
-//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "forceattrs"
-
-static cl::list<std::string>
- ForceAttributes("force-attribute", cl::Hidden,
- cl::desc("Add an attribute to a function. This should be a "
- "pair of 'function-name:attribute-name', for "
- "example -force-attribute=foo:noinline. This "
- "option can be specified multiple times."));
-
+//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "forceattrs"
+
+static cl::list<std::string>
+ ForceAttributes("force-attribute", cl::Hidden,
+ cl::desc("Add an attribute to a function. This should be a "
+ "pair of 'function-name:attribute-name', for "
+ "example -force-attribute=foo:noinline. This "
+ "option can be specified multiple times."));
+
static cl::list<std::string> ForceRemoveAttributes(
"force-remove-attribute", cl::Hidden,
cl::desc("Remove an attribute from a function. This should be a "
@@ -33,73 +33,73 @@ static cl::list<std::string> ForceRemoveAttributes(
"example -force-remove-attribute=foo:noinline. This "
"option can be specified multiple times."));
-static Attribute::AttrKind parseAttrKind(StringRef Kind) {
- return StringSwitch<Attribute::AttrKind>(Kind)
- .Case("alwaysinline", Attribute::AlwaysInline)
- .Case("builtin", Attribute::Builtin)
- .Case("cold", Attribute::Cold)
- .Case("convergent", Attribute::Convergent)
- .Case("inlinehint", Attribute::InlineHint)
- .Case("jumptable", Attribute::JumpTable)
- .Case("minsize", Attribute::MinSize)
- .Case("naked", Attribute::Naked)
- .Case("nobuiltin", Attribute::NoBuiltin)
- .Case("noduplicate", Attribute::NoDuplicate)
- .Case("noimplicitfloat", Attribute::NoImplicitFloat)
- .Case("noinline", Attribute::NoInline)
- .Case("nonlazybind", Attribute::NonLazyBind)
- .Case("noredzone", Attribute::NoRedZone)
- .Case("noreturn", Attribute::NoReturn)
- .Case("nocf_check", Attribute::NoCfCheck)
- .Case("norecurse", Attribute::NoRecurse)
- .Case("nounwind", Attribute::NoUnwind)
- .Case("optforfuzzing", Attribute::OptForFuzzing)
- .Case("optnone", Attribute::OptimizeNone)
- .Case("optsize", Attribute::OptimizeForSize)
- .Case("readnone", Attribute::ReadNone)
- .Case("readonly", Attribute::ReadOnly)
- .Case("argmemonly", Attribute::ArgMemOnly)
- .Case("returns_twice", Attribute::ReturnsTwice)
- .Case("safestack", Attribute::SafeStack)
- .Case("shadowcallstack", Attribute::ShadowCallStack)
- .Case("sanitize_address", Attribute::SanitizeAddress)
- .Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
- .Case("sanitize_memory", Attribute::SanitizeMemory)
- .Case("sanitize_thread", Attribute::SanitizeThread)
- .Case("sanitize_memtag", Attribute::SanitizeMemTag)
- .Case("speculative_load_hardening", Attribute::SpeculativeLoadHardening)
- .Case("ssp", Attribute::StackProtect)
- .Case("sspreq", Attribute::StackProtectReq)
- .Case("sspstrong", Attribute::StackProtectStrong)
- .Case("strictfp", Attribute::StrictFP)
- .Case("uwtable", Attribute::UWTable)
- .Default(Attribute::None);
-}
-
-/// If F has any forced attributes given on the command line, add them.
+static Attribute::AttrKind parseAttrKind(StringRef Kind) {
+ return StringSwitch<Attribute::AttrKind>(Kind)
+ .Case("alwaysinline", Attribute::AlwaysInline)
+ .Case("builtin", Attribute::Builtin)
+ .Case("cold", Attribute::Cold)
+ .Case("convergent", Attribute::Convergent)
+ .Case("inlinehint", Attribute::InlineHint)
+ .Case("jumptable", Attribute::JumpTable)
+ .Case("minsize", Attribute::MinSize)
+ .Case("naked", Attribute::Naked)
+ .Case("nobuiltin", Attribute::NoBuiltin)
+ .Case("noduplicate", Attribute::NoDuplicate)
+ .Case("noimplicitfloat", Attribute::NoImplicitFloat)
+ .Case("noinline", Attribute::NoInline)
+ .Case("nonlazybind", Attribute::NonLazyBind)
+ .Case("noredzone", Attribute::NoRedZone)
+ .Case("noreturn", Attribute::NoReturn)
+ .Case("nocf_check", Attribute::NoCfCheck)
+ .Case("norecurse", Attribute::NoRecurse)
+ .Case("nounwind", Attribute::NoUnwind)
+ .Case("optforfuzzing", Attribute::OptForFuzzing)
+ .Case("optnone", Attribute::OptimizeNone)
+ .Case("optsize", Attribute::OptimizeForSize)
+ .Case("readnone", Attribute::ReadNone)
+ .Case("readonly", Attribute::ReadOnly)
+ .Case("argmemonly", Attribute::ArgMemOnly)
+ .Case("returns_twice", Attribute::ReturnsTwice)
+ .Case("safestack", Attribute::SafeStack)
+ .Case("shadowcallstack", Attribute::ShadowCallStack)
+ .Case("sanitize_address", Attribute::SanitizeAddress)
+ .Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
+ .Case("sanitize_memory", Attribute::SanitizeMemory)
+ .Case("sanitize_thread", Attribute::SanitizeThread)
+ .Case("sanitize_memtag", Attribute::SanitizeMemTag)
+ .Case("speculative_load_hardening", Attribute::SpeculativeLoadHardening)
+ .Case("ssp", Attribute::StackProtect)
+ .Case("sspreq", Attribute::StackProtectReq)
+ .Case("sspstrong", Attribute::StackProtectStrong)
+ .Case("strictfp", Attribute::StrictFP)
+ .Case("uwtable", Attribute::UWTable)
+ .Default(Attribute::None);
+}
+
+/// If F has any forced attributes given on the command line, add them.
/// If F has any forced remove attributes given on the command line, remove
/// them. When both force and force-remove are given to a function, the latter
/// takes precedence.
static void forceAttributes(Function &F) {
auto ParseFunctionAndAttr = [&](StringRef S) {
auto Kind = Attribute::None;
- auto KV = StringRef(S).split(':');
- if (KV.first != F.getName())
+ auto KV = StringRef(S).split(':');
+ if (KV.first != F.getName())
return Kind;
Kind = parseAttrKind(KV.second);
- if (Kind == Attribute::None) {
- LLVM_DEBUG(dbgs() << "ForcedAttribute: " << KV.second
- << " unknown or not handled!\n");
- }
+ if (Kind == Attribute::None) {
+ LLVM_DEBUG(dbgs() << "ForcedAttribute: " << KV.second
+ << " unknown or not handled!\n");
+ }
return Kind;
};
for (auto &S : ForceAttributes) {
auto Kind = ParseFunctionAndAttr(S);
if (Kind == Attribute::None || F.hasFnAttribute(Kind))
- continue;
- F.addFnAttr(Kind);
- }
+ continue;
+ F.addFnAttr(Kind);
+ }
for (auto &S : ForceRemoveAttributes) {
auto Kind = ParseFunctionAndAttr(S);
@@ -107,49 +107,49 @@ static void forceAttributes(Function &F) {
continue;
F.removeFnAttr(Kind);
}
-}
-
+}
+
static bool hasForceAttributes() {
return !ForceAttributes.empty() || !ForceRemoveAttributes.empty();
}
-PreservedAnalyses ForceFunctionAttrsPass::run(Module &M,
- ModuleAnalysisManager &) {
+PreservedAnalyses ForceFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &) {
if (!hasForceAttributes())
- return PreservedAnalyses::all();
-
- for (Function &F : M.functions())
+ return PreservedAnalyses::all();
+
+ for (Function &F : M.functions())
forceAttributes(F);
-
- // Just conservatively invalidate analyses, this isn't likely to be important.
- return PreservedAnalyses::none();
-}
-
-namespace {
-struct ForceFunctionAttrsLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
- ForceFunctionAttrsLegacyPass() : ModulePass(ID) {
- initializeForceFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
+
+ // Just conservatively invalidate analyses; this isn't likely to be important.
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct ForceFunctionAttrsLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ ForceFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeForceFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
if (!hasForceAttributes())
- return false;
-
- for (Function &F : M.functions())
+ return false;
+
+ for (Function &F : M.functions())
forceAttributes(F);
-
- // Conservatively assume we changed something.
- return true;
- }
-};
-}
-
-char ForceFunctionAttrsLegacyPass::ID = 0;
-INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs",
- "Force set function attributes", false, false)
-
-Pass *llvm::createForceFunctionAttrsLegacyPass() {
- return new ForceFunctionAttrsLegacyPass();
-}
+
+ // Conservatively assume we changed something.
+ return true;
+ }
+};
+}
+
+char ForceFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs",
+ "Force set function attributes", false, false)
+
+Pass *llvm::createForceFunctionAttrsLegacyPass() {
+ return new ForceFunctionAttrsLegacyPass();
+}
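
To illustrate the 'function-name:attribute-name' format these options accept, here is a small sketch that applies one such entry directly with the IR API. The helper name is hypothetical, and only two attribute spellings are handled, unlike the full parseAttrKind table above:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Apply one "function-name:attribute-name" entry, e.g. "foo:noinline".
bool applyForcedAttribute(Module &M, StringRef Spec) {
  auto KV = Spec.split(':'); // "foo:noinline" -> ("foo", "noinline")
  Function *F = M.getFunction(KV.first);
  if (!F)
    return false;
  Attribute::AttrKind Kind = StringSwitch<Attribute::AttrKind>(KV.second)
                                 .Case("noinline", Attribute::NoInline)
                                 .Case("alwaysinline", Attribute::AlwaysInline)
                                 .Default(Attribute::None);
  if (Kind == Attribute::None || F->hasFnAttribute(Kind))
    return false;
  F->addFnAttr(Kind);
  return true;
}
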
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp
index 86a30355bb..6730824e86 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1,154 +1,154 @@
-//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file implements interprocedural passes which walk the
-/// call-graph deducing and/or propagating function attributes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
+//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements interprocedural passes which walk the
+/// call-graph deducing and/or propagating function attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include <cassert>
-#include <iterator>
-#include <map>
-#include <vector>
-
-using namespace llvm;
-
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include <cassert>
+#include <iterator>
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
#define DEBUG_TYPE "function-attrs"
-
-STATISTIC(NumReadNone, "Number of functions marked readnone");
-STATISTIC(NumReadOnly, "Number of functions marked readonly");
-STATISTIC(NumWriteOnly, "Number of functions marked writeonly");
-STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
-STATISTIC(NumReturned, "Number of arguments marked returned");
-STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
-STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
-STATISTIC(NumNoAlias, "Number of function returns marked noalias");
-STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
-STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
-STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
-STATISTIC(NumNoFree, "Number of functions marked as nofree");
+
+STATISTIC(NumReadNone, "Number of functions marked readnone");
+STATISTIC(NumReadOnly, "Number of functions marked readonly");
+STATISTIC(NumWriteOnly, "Number of functions marked writeonly");
+STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumReturned, "Number of arguments marked returned");
+STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
+STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
+STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
+STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
+STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
+STATISTIC(NumNoFree, "Number of functions marked as nofree");
STATISTIC(NumWillReturn, "Number of functions marked as willreturn");
-
-static cl::opt<bool> EnableNonnullArgPropagation(
- "enable-nonnull-arg-prop", cl::init(true), cl::Hidden,
- cl::desc("Try to propagate nonnull argument attributes from callsites to "
- "caller functions."));
-
-static cl::opt<bool> DisableNoUnwindInference(
- "disable-nounwind-inference", cl::Hidden,
- cl::desc("Stop inferring nounwind attribute during function-attrs pass"));
-
-static cl::opt<bool> DisableNoFreeInference(
- "disable-nofree-inference", cl::Hidden,
- cl::desc("Stop inferring nofree attribute during function-attrs pass"));
-
-namespace {
-
-using SCCNodeSet = SmallSetVector<Function *, 8>;
-
-} // end anonymous namespace
-
-/// Returns the memory access attribute for function F using AAR for AA results,
-/// where SCCNodes is the current SCC.
-///
-/// If ThisBody is true, this function may examine the function body and will
-/// return a result pertaining to this copy of the function. If it is false, the
-/// result will be based only on AA results for the function declaration; it
-/// will be assumed that some other (perhaps less optimized) version of the
-/// function may be selected at link time.
-static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
- AAResults &AAR,
- const SCCNodeSet &SCCNodes) {
- FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
- if (MRB == FMRB_DoesNotAccessMemory)
- // Already perfect!
- return MAK_ReadNone;
-
- if (!ThisBody) {
- if (AliasAnalysis::onlyReadsMemory(MRB))
- return MAK_ReadOnly;
-
- if (AliasAnalysis::doesNotReadMemory(MRB))
- return MAK_WriteOnly;
-
- // Conservatively assume it reads and writes to memory.
- return MAK_MayWrite;
- }
-
- // Scan the function body for instructions that may read or write memory.
- bool ReadsMemory = false;
- bool WritesMemory = false;
- for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
- Instruction *I = &*II;
-
- // Some instructions can be ignored even if they read or write memory.
- // Detect these now, skipping to the next instruction if one is found.
- if (auto *Call = dyn_cast<CallBase>(I)) {
- // Ignore calls to functions in the same SCC, as long as the call sites
- // don't have operand bundles. Calls with operand bundles are allowed to
- // have memory effects not described by the memory effects of the call
- // target.
- if (!Call->hasOperandBundles() && Call->getCalledFunction() &&
- SCCNodes.count(Call->getCalledFunction()))
- continue;
- FunctionModRefBehavior MRB = AAR.getModRefBehavior(Call);
- ModRefInfo MRI = createModRefInfo(MRB);
-
- // If the call doesn't access memory, we're done.
- if (isNoModRef(MRI))
- continue;
-
+
+static cl::opt<bool> EnableNonnullArgPropagation(
+ "enable-nonnull-arg-prop", cl::init(true), cl::Hidden,
+ cl::desc("Try to propagate nonnull argument attributes from callsites to "
+ "caller functions."));
+
+static cl::opt<bool> DisableNoUnwindInference(
+ "disable-nounwind-inference", cl::Hidden,
+ cl::desc("Stop inferring nounwind attribute during function-attrs pass"));
+
+static cl::opt<bool> DisableNoFreeInference(
+ "disable-nofree-inference", cl::Hidden,
+ cl::desc("Stop inferring nofree attribute during function-attrs pass"));
+
+namespace {
+
+using SCCNodeSet = SmallSetVector<Function *, 8>;
+
+} // end anonymous namespace
+
+/// Returns the memory access attribute for function F using AAR for AA results,
+/// where SCCNodes is the current SCC.
+///
+/// If ThisBody is true, this function may examine the function body and will
+/// return a result pertaining to this copy of the function. If it is false, the
+/// result will be based only on AA results for the function declaration; it
+/// will be assumed that some other (perhaps less optimized) version of the
+/// function may be selected at link time.
+static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
+ AAResults &AAR,
+ const SCCNodeSet &SCCNodes) {
+ FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
+ if (MRB == FMRB_DoesNotAccessMemory)
+ // Already perfect!
+ return MAK_ReadNone;
+
+ if (!ThisBody) {
+ if (AliasAnalysis::onlyReadsMemory(MRB))
+ return MAK_ReadOnly;
+
+ if (AliasAnalysis::doesNotReadMemory(MRB))
+ return MAK_WriteOnly;
+
+ // Conservatively assume it reads and writes to memory.
+ return MAK_MayWrite;
+ }
+
+ // Scan the function body for instructions that may read or write memory.
+ bool ReadsMemory = false;
+ bool WritesMemory = false;
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+ Instruction *I = &*II;
+
+ // Some instructions can be ignored even if they read or write memory.
+ // Detect these now, skipping to the next instruction if one is found.
+ if (auto *Call = dyn_cast<CallBase>(I)) {
+ // Ignore calls to functions in the same SCC, as long as the call sites
+ // don't have operand bundles. Calls with operand bundles are allowed to
+ // have memory effects not described by the memory effects of the call
+ // target.
+ if (!Call->hasOperandBundles() && Call->getCalledFunction() &&
+ SCCNodes.count(Call->getCalledFunction()))
+ continue;
+ FunctionModRefBehavior MRB = AAR.getModRefBehavior(Call);
+ ModRefInfo MRI = createModRefInfo(MRB);
+
+ // If the call doesn't access memory, we're done.
+ if (isNoModRef(MRI))
+ continue;
+
// A pseudo probe call shouldn't change any function attribute since it
// doesn't translate to a real instruction. It comes with a memory access
// tag to prevent itself being removed by optimizations and not block
@@ -156,1158 +156,1158 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
if (isa<PseudoProbeInst>(I))
continue;
- if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
- // The call could access any memory. If that includes writes, note it.
- if (isModSet(MRI))
- WritesMemory = true;
- // If it reads, note it.
- if (isRefSet(MRI))
- ReadsMemory = true;
- continue;
- }
-
- // Check whether all pointer arguments point to local memory, and
- // ignore calls that only access local memory.
- for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) {
- Value *Arg = *CI;
- if (!Arg->getType()->isPtrOrPtrVectorTy())
- continue;
-
- AAMDNodes AAInfo;
- I->getAAMetadata(AAInfo);
+ if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ // The call could access any memory. If that includes writes, note it.
+ if (isModSet(MRI))
+ WritesMemory = true;
+ // If it reads, note it.
+ if (isRefSet(MRI))
+ ReadsMemory = true;
+ continue;
+ }
+
+ // Check whether all pointer arguments point to local memory, and
+ // ignore calls that only access local memory.
+ for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) {
+ Value *Arg = *CI;
+ if (!Arg->getType()->isPtrOrPtrVectorTy())
+ continue;
+
+ AAMDNodes AAInfo;
+ I->getAAMetadata(AAInfo);
MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(Arg, AAInfo);
-
- // Skip accesses to local or constant memory as they don't impact the
- // externally visible mod/ref behavior.
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
-
- if (isModSet(MRI))
- // Writes non-local memory.
- WritesMemory = true;
- if (isRefSet(MRI))
- // Ok, it reads non-local memory.
- ReadsMemory = true;
- }
- continue;
- } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- // Ignore non-volatile loads from local memory. (Atomic is okay here.)
- if (!LI->isVolatile()) {
- MemoryLocation Loc = MemoryLocation::get(LI);
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Ignore non-volatile stores to local memory. (Atomic is okay here.)
- if (!SI->isVolatile()) {
- MemoryLocation Loc = MemoryLocation::get(SI);
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
- }
- } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
- // Ignore vaargs on local memory.
- MemoryLocation Loc = MemoryLocation::get(VI);
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
- }
-
- // Any remaining instructions need to be taken seriously! Check if they
- // read or write memory.
- //
- // Writes memory, remember that.
- WritesMemory |= I->mayWriteToMemory();
-
- // If this instruction may read memory, remember that.
- ReadsMemory |= I->mayReadFromMemory();
- }
-
- if (WritesMemory) {
- if (!ReadsMemory)
- return MAK_WriteOnly;
- else
- return MAK_MayWrite;
- }
-
- return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
-}
-
-MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
- AAResults &AAR) {
- return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
-}
-
-/// Deduce readonly/readnone attributes for the SCC.
-template <typename AARGetterT>
-static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
- // Check if any of the functions in the SCC read or write memory. If they
- // write memory then they can't be marked readnone or readonly.
- bool ReadsMemory = false;
- bool WritesMemory = false;
- for (Function *F : SCCNodes) {
- // Call the callable parameter to look up AA results for this function.
- AAResults &AAR = AARGetter(*F);
-
- // Non-exact function definitions may not be selected at link time, and an
- // alternative version that writes to memory may be selected. See the
- // comment on GlobalValue::isDefinitionExact for more details.
- switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
- AAR, SCCNodes)) {
- case MAK_MayWrite:
- return false;
- case MAK_ReadOnly:
- ReadsMemory = true;
- break;
- case MAK_WriteOnly:
- WritesMemory = true;
- break;
- case MAK_ReadNone:
- // Nothing to do!
- break;
- }
- }
-
- // If the SCC contains both functions that read and functions that write, then
- // we cannot add readonly attributes.
- if (ReadsMemory && WritesMemory)
- return false;
-
- // Success! Functions in this SCC do not access memory, or only read memory.
- // Give them the appropriate attribute.
- bool MadeChange = false;
-
- for (Function *F : SCCNodes) {
- if (F->doesNotAccessMemory())
- // Already perfect!
- continue;
-
- if (F->onlyReadsMemory() && ReadsMemory)
- // No change.
- continue;
-
- if (F->doesNotReadMemory() && WritesMemory)
- continue;
-
- MadeChange = true;
-
- // Clear out any existing attributes.
+
+ // Skip accesses to local or constant memory as they don't impact the
+ // externally visible mod/ref behavior.
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+
+ if (isModSet(MRI))
+ // Writes non-local memory.
+ WritesMemory = true;
+ if (isRefSet(MRI))
+ // Ok, it reads non-local memory.
+ ReadsMemory = true;
+ }
+ continue;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore non-volatile loads from local memory. (Atomic is okay here.)
+ if (!LI->isVolatile()) {
+ MemoryLocation Loc = MemoryLocation::get(LI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Ignore non-volatile stores to local memory. (Atomic is okay here.)
+ if (!SI->isVolatile()) {
+ MemoryLocation Loc = MemoryLocation::get(SI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+ } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
+ // Ignore vaargs on local memory.
+ MemoryLocation Loc = MemoryLocation::get(VI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+
+ // Any remaining instructions need to be taken seriously! Check if they
+ // read or write memory.
+ //
+ // Writes memory, remember that.
+ WritesMemory |= I->mayWriteToMemory();
+
+ // If this instruction may read memory, remember that.
+ ReadsMemory |= I->mayReadFromMemory();
+ }
+
+ if (WritesMemory) {
+ if (!ReadsMemory)
+ return MAK_WriteOnly;
+ else
+ return MAK_MayWrite;
+ }
+
+ return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
+}
+
+MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
+ AAResults &AAR) {
+ return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
+}
+
+/// Deduce readonly/readnone attributes for the SCC.
+template <typename AARGetterT>
+static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
+ // Check if any of the functions in the SCC read or write memory. If they
+ // write memory then they can't be marked readnone or readonly.
+ bool ReadsMemory = false;
+ bool WritesMemory = false;
+ for (Function *F : SCCNodes) {
+ // Call the callable parameter to look up AA results for this function.
+ AAResults &AAR = AARGetter(*F);
+
+ // Non-exact function definitions may not be selected at link time, and an
+ // alternative version that writes to memory may be selected. See the
+ // comment on GlobalValue::isDefinitionExact for more details.
+ switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
+ AAR, SCCNodes)) {
+ case MAK_MayWrite:
+ return false;
+ case MAK_ReadOnly:
+ ReadsMemory = true;
+ break;
+ case MAK_WriteOnly:
+ WritesMemory = true;
+ break;
+ case MAK_ReadNone:
+ // Nothing to do!
+ break;
+ }
+ }
+
+ // If the SCC contains both functions that read and functions that write, then
+ // we cannot add readonly attributes.
+ if (ReadsMemory && WritesMemory)
+ return false;
+
+ // Success! Functions in this SCC do not access memory, or only read memory.
+ // Give them the appropriate attribute.
+ bool MadeChange = false;
+
+ for (Function *F : SCCNodes) {
+ if (F->doesNotAccessMemory())
+ // Already perfect!
+ continue;
+
+ if (F->onlyReadsMemory() && ReadsMemory)
+ // No change.
+ continue;
+
+ if (F->doesNotReadMemory() && WritesMemory)
+ continue;
+
+ MadeChange = true;
+
+ // Clear out any existing attributes.
AttrBuilder AttrsToRemove;
AttrsToRemove.addAttribute(Attribute::ReadOnly);
AttrsToRemove.addAttribute(Attribute::ReadNone);
AttrsToRemove.addAttribute(Attribute::WriteOnly);
-
- if (!WritesMemory && !ReadsMemory) {
- // Clear out any "access range attributes" if readnone was deduced.
+
+ if (!WritesMemory && !ReadsMemory) {
+ // Clear out any "access range attributes" if readnone was deduced.
AttrsToRemove.addAttribute(Attribute::ArgMemOnly);
AttrsToRemove.addAttribute(Attribute::InaccessibleMemOnly);
AttrsToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
- }
+ }
F->removeAttributes(AttributeList::FunctionIndex, AttrsToRemove);
-
- // Add in the new attribute.
- if (WritesMemory && !ReadsMemory)
- F->addFnAttr(Attribute::WriteOnly);
- else
- F->addFnAttr(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
-
- if (WritesMemory && !ReadsMemory)
- ++NumWriteOnly;
- else if (ReadsMemory)
- ++NumReadOnly;
- else
- ++NumReadNone;
- }
-
- return MadeChange;
-}
-
-namespace {
-
-/// For a given pointer Argument, this retains a list of Arguments of functions
-/// in the same SCC that the pointer data flows into. We use this to build an
-/// SCC of the arguments.
-struct ArgumentGraphNode {
- Argument *Definition;
- SmallVector<ArgumentGraphNode *, 4> Uses;
-};
-
-class ArgumentGraph {
- // We store pointers to ArgumentGraphNode objects, so it's important
- // that they not move around upon insert.
- using ArgumentMapTy = std::map<Argument *, ArgumentGraphNode>;
-
- ArgumentMapTy ArgumentMap;
-
- // There is no root node for the argument graph, in fact:
- // void f(int *x, int *y) { if (...) f(x, y); }
- // is an example where the graph is disconnected. The SCCIterator requires a
- // single entry point, so we maintain a fake ("synthetic") root node that
- // uses every node. Because the graph is directed and nothing points into
- // the root, it will not participate in any SCCs (except for its own).
- ArgumentGraphNode SyntheticRoot;
-
-public:
- ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
-
- using iterator = SmallVectorImpl<ArgumentGraphNode *>::iterator;
-
- iterator begin() { return SyntheticRoot.Uses.begin(); }
- iterator end() { return SyntheticRoot.Uses.end(); }
- ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
-
- ArgumentGraphNode *operator[](Argument *A) {
- ArgumentGraphNode &Node = ArgumentMap[A];
- Node.Definition = A;
- SyntheticRoot.Uses.push_back(&Node);
- return &Node;
- }
-};
-
-/// This tracker checks whether callees are in the SCC, and if so it does not
-/// consider that a capture, instead adding it to the "Uses" list and
-/// continuing with the analysis.
-struct ArgumentUsesTracker : public CaptureTracker {
- ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {}
-
- void tooManyUses() override { Captured = true; }
-
- bool captured(const Use *U) override {
- CallBase *CB = dyn_cast<CallBase>(U->getUser());
- if (!CB) {
- Captured = true;
- return true;
- }
-
- Function *F = CB->getCalledFunction();
- if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) {
- Captured = true;
- return true;
- }
-
- // Note: the callee and the two successor blocks *follow* the argument
- // operands. This means there is no need to adjust UseIndex to account for
- // these.
-
- unsigned UseIndex =
- std::distance(const_cast<const Use *>(CB->arg_begin()), U);
-
- assert(UseIndex < CB->data_operands_size() &&
- "Indirect function calls should have been filtered above!");
-
- if (UseIndex >= CB->getNumArgOperands()) {
- // Data operand, but not an argument operand -- must be a bundle operand
- assert(CB->hasOperandBundles() && "Must be!");
-
- // CaptureTracking told us that we're being captured by an operand bundle
- // use. In this case it does not matter if the callee is within our SCC
- // or not -- we've been captured in some unknown way, and we have to be
- // conservative.
- Captured = true;
- return true;
- }
-
- if (UseIndex >= F->arg_size()) {
- assert(F->isVarArg() && "More params than args in non-varargs call");
- Captured = true;
- return true;
- }
-
- Uses.push_back(&*std::next(F->arg_begin(), UseIndex));
- return false;
- }
-
- // True only if certainly captured (used outside our SCC).
- bool Captured = false;
-
- // Uses within our SCC.
- SmallVector<Argument *, 4> Uses;
-
- const SCCNodeSet &SCCNodes;
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct GraphTraits<ArgumentGraphNode *> {
- using NodeRef = ArgumentGraphNode *;
- using ChildIteratorType = SmallVectorImpl<ArgumentGraphNode *>::iterator;
-
- static NodeRef getEntryNode(NodeRef A) { return A; }
- static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); }
- static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); }
-};
-
-template <>
-struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
- static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); }
-
- static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
- return AG->begin();
- }
-
- static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); }
-};
-
-} // end namespace llvm
-
-/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
-static Attribute::AttrKind
-determinePointerReadAttrs(Argument *A,
- const SmallPtrSet<Argument *, 8> &SCCNodes) {
- SmallVector<Use *, 32> Worklist;
- SmallPtrSet<Use *, 32> Visited;
-
- // inalloca arguments are always clobbered by the call.
- if (A->hasInAllocaAttr() || A->hasPreallocatedAttr())
- return Attribute::None;
-
- bool IsRead = false;
- // We don't need to track IsWritten. If A is written to, return immediately.
-
- for (Use &U : A->uses()) {
- Visited.insert(&U);
- Worklist.push_back(&U);
- }
-
- while (!Worklist.empty()) {
- Use *U = Worklist.pop_back_val();
- Instruction *I = cast<Instruction>(U->getUser());
-
- switch (I->getOpcode()) {
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::PHI:
- case Instruction::Select:
- case Instruction::AddrSpaceCast:
- // The original value is not read/written via this if the new value isn't.
- for (Use &UU : I->uses())
- if (Visited.insert(&UU).second)
- Worklist.push_back(&UU);
- break;
-
- case Instruction::Call:
- case Instruction::Invoke: {
- bool Captures = true;
-
- if (I->getType()->isVoidTy())
- Captures = false;
-
- auto AddUsersToWorklistIfCapturing = [&] {
- if (Captures)
- for (Use &UU : I->uses())
- if (Visited.insert(&UU).second)
- Worklist.push_back(&UU);
- };
-
- CallBase &CB = cast<CallBase>(*I);
- if (CB.doesNotAccessMemory()) {
- AddUsersToWorklistIfCapturing();
- continue;
- }
-
- Function *F = CB.getCalledFunction();
- if (!F) {
- if (CB.onlyReadsMemory()) {
- IsRead = true;
- AddUsersToWorklistIfCapturing();
- continue;
- }
- return Attribute::None;
- }
-
- // Note: the callee and the two successor blocks *follow* the argument
- // operands. This means there is no need to adjust UseIndex to account
- // for these.
-
- unsigned UseIndex = std::distance(CB.arg_begin(), U);
-
- // U cannot be the callee operand use: since we're exploring the
- // transitive uses of an Argument, having such a use be a callee would
- // imply the call site is an indirect call or invoke; and we'd take the
- // early exit above.
- assert(UseIndex < CB.data_operands_size() &&
- "Data operand use expected!");
-
- bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands();
-
- if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
- assert(F->isVarArg() && "More params than args in non-varargs call");
- return Attribute::None;
- }
-
- Captures &= !CB.doesNotCapture(UseIndex);
-
- // Since the optimizer (by design) cannot see the data flow corresponding
- // to an operand bundle use, these cannot participate in the optimistic SCC
- // analysis. Instead, we model the operand bundle uses as arguments in a
- // call to a function external to the SCC.
- if (IsOperandBundleUse ||
- !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
-
- // The accessors used on call site here do the right thing for calls and
- // invokes with operand bundles.
-
- if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
- return Attribute::None;
- if (!CB.doesNotAccessMemory(UseIndex))
- IsRead = true;
- }
-
- AddUsersToWorklistIfCapturing();
- break;
- }
-
- case Instruction::Load:
- // A volatile load has side effects beyond what the readonly attribute
- // allows us to assume.
- if (cast<LoadInst>(I)->isVolatile())
- return Attribute::None;
-
- IsRead = true;
- break;
-
- case Instruction::ICmp:
- case Instruction::Ret:
- break;
-
- default:
- return Attribute::None;
- }
- }
-
- return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
-}
-
-/// Deduce returned attributes for the SCC.
-static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
- // Check each function in turn, determining if an argument is always returned.
- for (Function *F : SCCNodes) {
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- continue;
-
- if (F->getReturnType()->isVoidTy())
- continue;
-
- // There is nothing to do if an argument is already marked as 'returned'.
- if (llvm::any_of(F->args(),
- [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
- continue;
-
- auto FindRetArg = [&]() -> Value * {
- Value *RetArg = nullptr;
- for (BasicBlock &BB : *F)
- if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
- // Note that stripPointerCasts should look through functions with
- // returned arguments.
- Value *RetVal = Ret->getReturnValue()->stripPointerCasts();
- if (!isa<Argument>(RetVal) || RetVal->getType() != F->getReturnType())
- return nullptr;
-
- if (!RetArg)
- RetArg = RetVal;
- else if (RetArg != RetVal)
- return nullptr;
- }
-
- return RetArg;
- };
-
- if (Value *RetArg = FindRetArg()) {
- auto *A = cast<Argument>(RetArg);
- A->addAttr(Attribute::Returned);
- ++NumReturned;
- Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// If a callsite has arguments that are also arguments to the parent function,
-/// try to propagate attributes from the callsite's arguments to the parent's
-/// arguments. This may be important because inlining can cause information loss
-/// when attribute knowledge disappears with the inlined call.
-static bool addArgumentAttrsFromCallsites(Function &F) {
- if (!EnableNonnullArgPropagation)
- return false;
-
- bool Changed = false;
-
- // For an argument attribute to transfer from a callsite to the parent, the
- // call must be guaranteed to execute every time the parent is called.
- // Conservatively, just check for calls in the entry block that are guaranteed
- // to execute.
- // TODO: This could be enhanced by testing if the callsite post-dominates the
- // entry block or by doing simple forward walks or backward walks to the
- // callsite.
- BasicBlock &Entry = F.getEntryBlock();
- for (Instruction &I : Entry) {
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (auto *CalledFunc = CB->getCalledFunction()) {
- for (auto &CSArg : CalledFunc->args()) {
+
+ // Add in the new attribute.
+ if (WritesMemory && !ReadsMemory)
+ F->addFnAttr(Attribute::WriteOnly);
+ else
+ F->addFnAttr(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
+
+ if (WritesMemory && !ReadsMemory)
+ ++NumWriteOnly;
+ else if (ReadsMemory)
+ ++NumReadOnly;
+ else
+ ++NumReadNone;
+ }
+
+ return MadeChange;
+}
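
For illustration only, here is a minimal hypothetical source-level function (invented name) of the kind the deduction above targets: its body contains nothing but loads, so checkFunctionMemoryAccess would classify it as read-only and addReadAttrs could mark the function readonly.

    // Hypothetical example: the loop only loads through 'n'; there are no
    // stores and no calls that may write, so the body is read-only.
    struct Node { Node *next; };

    int length(const Node *n) {
      int count = 0;
      for (const Node *p = n; p; p = p->next)   // loads only
        ++count;
      return count;
    }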
+
+namespace {
+
+/// For a given pointer Argument, this retains a list of Arguments of functions
+/// in the same SCC that the pointer data flows into. We use this to build an
+/// SCC of the arguments.
+struct ArgumentGraphNode {
+ Argument *Definition;
+ SmallVector<ArgumentGraphNode *, 4> Uses;
+};
+
+class ArgumentGraph {
+ // We store pointers to ArgumentGraphNode objects, so it's important
+ // that they not move around upon insert.
+ using ArgumentMapTy = std::map<Argument *, ArgumentGraphNode>;
+
+ ArgumentMapTy ArgumentMap;
+
+ // There is no root node for the argument graph, in fact:
+ // void f(int *x, int *y) { if (...) f(x, y); }
+ // is an example where the graph is disconnected. The SCCIterator requires a
+ // single entry point, so we maintain a fake ("synthetic") root node that
+ // uses every node. Because the graph is directed and nothing points into
+ // the root, it will not participate in any SCCs (except for its own).
+ ArgumentGraphNode SyntheticRoot;
+
+public:
+ ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
+
+ using iterator = SmallVectorImpl<ArgumentGraphNode *>::iterator;
+
+ iterator begin() { return SyntheticRoot.Uses.begin(); }
+ iterator end() { return SyntheticRoot.Uses.end(); }
+ ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
+
+ ArgumentGraphNode *operator[](Argument *A) {
+ ArgumentGraphNode &Node = ArgumentMap[A];
+ Node.Definition = A;
+ SyntheticRoot.Uses.push_back(&Node);
+ return &Node;
+ }
+};
+
+/// This tracker checks whether callees are in the SCC, and if so it does not
+/// consider that a capture, instead adding it to the "Uses" list and
+/// continuing with the analysis.
+struct ArgumentUsesTracker : public CaptureTracker {
+ ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {}
+
+ void tooManyUses() override { Captured = true; }
+
+ bool captured(const Use *U) override {
+ CallBase *CB = dyn_cast<CallBase>(U->getUser());
+ if (!CB) {
+ Captured = true;
+ return true;
+ }
+
+ Function *F = CB->getCalledFunction();
+ if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) {
+ Captured = true;
+ return true;
+ }
+
+ // Note: the callee and the two successor blocks *follow* the argument
+ // operands. This means there is no need to adjust UseIndex to account for
+ // these.
+
+ unsigned UseIndex =
+ std::distance(const_cast<const Use *>(CB->arg_begin()), U);
+
+ assert(UseIndex < CB->data_operands_size() &&
+ "Indirect function calls should have been filtered above!");
+
+ if (UseIndex >= CB->getNumArgOperands()) {
+ // Data operand, but not an argument operand -- must be a bundle operand
+ assert(CB->hasOperandBundles() && "Must be!");
+
+ // CaptureTracking told us that we're being captured by an operand bundle
+ // use. In this case it does not matter if the callee is within our SCC
+ // or not -- we've been captured in some unknown way, and we have to be
+ // conservative.
+ Captured = true;
+ return true;
+ }
+
+ if (UseIndex >= F->arg_size()) {
+ assert(F->isVarArg() && "More params than args in non-varargs call");
+ Captured = true;
+ return true;
+ }
+
+ Uses.push_back(&*std::next(F->arg_begin(), UseIndex));
+ return false;
+ }
+
+ // True only if certainly captured (used outside our SCC).
+ bool Captured = false;
+
+ // Uses within our SCC.
+ SmallVector<Argument *, 4> Uses;
+
+ const SCCNodeSet &SCCNodes;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct GraphTraits<ArgumentGraphNode *> {
+ using NodeRef = ArgumentGraphNode *;
+ using ChildIteratorType = SmallVectorImpl<ArgumentGraphNode *>::iterator;
+
+ static NodeRef getEntryNode(NodeRef A) { return A; }
+ static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); }
+ static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); }
+};
+
+template <>
+struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
+ static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); }
+
+ static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
+ return AG->begin();
+ }
+
+ static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); }
+};
+
+} // end namespace llvm
+
+/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
+static Attribute::AttrKind
+determinePointerReadAttrs(Argument *A,
+ const SmallPtrSet<Argument *, 8> &SCCNodes) {
+ SmallVector<Use *, 32> Worklist;
+ SmallPtrSet<Use *, 32> Visited;
+
+ // inalloca arguments are always clobbered by the call.
+ if (A->hasInAllocaAttr() || A->hasPreallocatedAttr())
+ return Attribute::None;
+
+ bool IsRead = false;
+ // We don't need to track IsWritten. If A is written to, return immediately.
+
+ for (Use &U : A->uses()) {
+ Visited.insert(&U);
+ Worklist.push_back(&U);
+ }
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ // The original value is not read/written via this if the new value isn't.
+ for (Use &UU : I->uses())
+ if (Visited.insert(&UU).second)
+ Worklist.push_back(&UU);
+ break;
+
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ bool Captures = true;
+
+ if (I->getType()->isVoidTy())
+ Captures = false;
+
+ auto AddUsersToWorklistIfCapturing = [&] {
+ if (Captures)
+ for (Use &UU : I->uses())
+ if (Visited.insert(&UU).second)
+ Worklist.push_back(&UU);
+ };
+
+ CallBase &CB = cast<CallBase>(*I);
+ if (CB.doesNotAccessMemory()) {
+ AddUsersToWorklistIfCapturing();
+ continue;
+ }
+
+ Function *F = CB.getCalledFunction();
+ if (!F) {
+ if (CB.onlyReadsMemory()) {
+ IsRead = true;
+ AddUsersToWorklistIfCapturing();
+ continue;
+ }
+ return Attribute::None;
+ }
+
+ // Note: the callee and the two successor blocks *follow* the argument
+ // operands. This means there is no need to adjust UseIndex to account
+ // for these.
+
+ unsigned UseIndex = std::distance(CB.arg_begin(), U);
+
+ // U cannot be the callee operand use: since we're exploring the
+ // transitive uses of an Argument, having such a use be a callee would
+ // imply the call site is an indirect call or invoke; and we'd take the
+ // early exit above.
+ assert(UseIndex < CB.data_operands_size() &&
+ "Data operand use expected!");
+
+ bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands();
+
+ if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
+ assert(F->isVarArg() && "More params than args in non-varargs call");
+ return Attribute::None;
+ }
+
+ Captures &= !CB.doesNotCapture(UseIndex);
+
+ // Since the optimizer (by design) cannot see the data flow corresponding
+ // to an operand bundle use, these cannot participate in the optimistic SCC
+ // analysis. Instead, we model the operand bundle uses as arguments in a
+ // call to a function external to the SCC.
+ if (IsOperandBundleUse ||
+ !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
+
+ // The accessors used on call site here do the right thing for calls and
+ // invokes with operand bundles.
+
+ if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
+ return Attribute::None;
+ if (!CB.doesNotAccessMemory(UseIndex))
+ IsRead = true;
+ }
+
+ AddUsersToWorklistIfCapturing();
+ break;
+ }
+
+ case Instruction::Load:
+ // A volatile load has side effects beyond what the readonly attribute
+ // allows us to assume.
+ if (cast<LoadInst>(I)->isVolatile())
+ return Attribute::None;
+
+ IsRead = true;
+ break;
+
+ case Instruction::ICmp:
+ case Instruction::Ret:
+ break;
+
+ default:
+ return Attribute::None;
+ }
+ }
+
+ return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
+}
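
A hypothetical illustration of the per-argument classification above (invented names): 'p' is only loaded from, while 'q' is only compared and never dereferenced, so they would come back as ReadOnly and ReadNone respectively.

    // 'p': reached only by a load            -> Attribute::ReadOnly
    // 'q': reached only by an icmp, no loads -> Attribute::ReadNone
    int first_unless_same(const int *p, const int *q) {
      if (p == q)   // pointer comparison; no memory access through q
        return 0;
      return *p;    // read through p; never written
    }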
+
+/// Deduce returned attributes for the SCC.
+static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
+ bool Changed = false;
+
+ // Check each function in turn, determining if an argument is always returned.
+ for (Function *F : SCCNodes) {
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ continue;
+
+ if (F->getReturnType()->isVoidTy())
+ continue;
+
+ // There is nothing to do if an argument is already marked as 'returned'.
+ if (llvm::any_of(F->args(),
+ [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
+ continue;
+
+ auto FindRetArg = [&]() -> Value * {
+ Value *RetArg = nullptr;
+ for (BasicBlock &BB : *F)
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ // Note that stripPointerCasts should look through functions with
+ // returned arguments.
+ Value *RetVal = Ret->getReturnValue()->stripPointerCasts();
+ if (!isa<Argument>(RetVal) || RetVal->getType() != F->getReturnType())
+ return nullptr;
+
+ if (!RetArg)
+ RetArg = RetVal;
+ else if (RetArg != RetVal)
+ return nullptr;
+ }
+
+ return RetArg;
+ };
+
+ if (Value *RetArg = FindRetArg()) {
+ auto *A = cast<Argument>(RetArg);
+ A->addAttr(Attribute::Returned);
+ ++NumReturned;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
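
A hypothetical source-level example for the deduction above (invented name): every return statement yields the same pointer argument, so 'dst' would be a candidate for the 'returned' attribute.

    // Each ReturnInst returns 'dst' itself, so FindRetArg would settle on a
    // single argument and it could be marked Attribute::Returned.
    char *mark(char *dst, bool addDot) {
      if (!addDot)
        return dst;   // returns the argument
      *dst = '.';
      return dst;     // returns the same argument again
    }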
+
+/// If a callsite has arguments that are also arguments to the parent function,
+/// try to propagate attributes from the callsite's arguments to the parent's
+/// arguments. This may be important because inlining can cause information loss
+/// when attribute knowledge disappears with the inlined call.
+static bool addArgumentAttrsFromCallsites(Function &F) {
+ if (!EnableNonnullArgPropagation)
+ return false;
+
+ bool Changed = false;
+
+ // For an argument attribute to transfer from a callsite to the parent, the
+ // call must be guaranteed to execute every time the parent is called.
+ // Conservatively, just check for calls in the entry block that are guaranteed
+ // to execute.
+ // TODO: This could be enhanced by testing if the callsite post-dominates the
+ // entry block or by doing simple forward walks or backward walks to the
+ // callsite.
+ BasicBlock &Entry = F.getEntryBlock();
+ for (Instruction &I : Entry) {
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ if (auto *CalledFunc = CB->getCalledFunction()) {
+ for (auto &CSArg : CalledFunc->args()) {
if (!CSArg.hasNonNullAttr(/* AllowUndefOrPoison */ false))
- continue;
-
- // If the non-null callsite argument operand is an argument to 'F'
- // (the caller) and the call is guaranteed to execute, then the value
- // must be non-null throughout 'F'.
- auto *FArg = dyn_cast<Argument>(CB->getArgOperand(CSArg.getArgNo()));
- if (FArg && !FArg->hasNonNullAttr()) {
- FArg->addAttr(Attribute::NonNull);
- Changed = true;
- }
- }
- }
- }
- if (!isGuaranteedToTransferExecutionToSuccessor(&I))
- break;
- }
-
- return Changed;
-}
-
-static bool addReadAttr(Argument *A, Attribute::AttrKind R) {
- assert((R == Attribute::ReadOnly || R == Attribute::ReadNone)
- && "Must be a Read attribute.");
- assert(A && "Argument must not be null.");
-
- // If the argument already has the attribute, nothing needs to be done.
- if (A->hasAttribute(R))
- return false;
-
- // Otherwise, remove potentially conflicting attributes, add the new one,
- // and update statistics.
- A->removeAttr(Attribute::WriteOnly);
- A->removeAttr(Attribute::ReadOnly);
- A->removeAttr(Attribute::ReadNone);
- A->addAttr(R);
- R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
- return true;
-}
-
-/// Deduce nocapture attributes for the SCC.
-static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
- ArgumentGraph AG;
-
- // Check each function in turn, determining which pointer arguments are not
- // captured.
- for (Function *F : SCCNodes) {
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- continue;
-
- Changed |= addArgumentAttrsFromCallsites(*F);
-
- // Functions that are readonly (or readnone) and nounwind and don't return
- // a value can't capture arguments. Don't analyze them.
- if (F->onlyReadsMemory() && F->doesNotThrow() &&
- F->getReturnType()->isVoidTy()) {
- for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
- ++A) {
- if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- }
- }
- continue;
- }
-
- for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
- ++A) {
- if (!A->getType()->isPointerTy())
- continue;
- bool HasNonLocalUses = false;
- if (!A->hasNoCaptureAttr()) {
- ArgumentUsesTracker Tracker(SCCNodes);
- PointerMayBeCaptured(&*A, &Tracker);
- if (!Tracker.Captured) {
- if (Tracker.Uses.empty()) {
- // If it's trivially not captured, mark it nocapture now.
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- } else {
- // If it's not trivially captured and not trivially not captured,
- // then it must be calling into another function in our SCC. Save
- // its particulars for Argument-SCC analysis later.
- ArgumentGraphNode *Node = AG[&*A];
- for (Argument *Use : Tracker.Uses) {
- Node->Uses.push_back(AG[Use]);
- if (Use != &*A)
- HasNonLocalUses = true;
- }
- }
- }
- // Otherwise, it's captured. Don't bother doing SCC analysis on it.
- }
- if (!HasNonLocalUses && !A->onlyReadsMemory()) {
- // Can we determine that it's readonly/readnone without doing an SCC?
- // Note that we don't allow any calls at all here, or else our result
- // will be dependent on the iteration order through the functions in the
- // SCC.
- SmallPtrSet<Argument *, 8> Self;
- Self.insert(&*A);
- Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
- if (R != Attribute::None)
- Changed = addReadAttr(A, R);
- }
- }
- }
-
- // The graph we've collected is partial because we stopped scanning for
- // argument uses once we solved the argument trivially. These partial nodes
- // show up as ArgumentGraphNode objects with an empty Uses list, and for
- // these nodes the final decision about whether they capture has already been
- // made. If the definition doesn't have a 'nocapture' attribute by now, it
- // captures.
-
- for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) {
- const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I;
- if (ArgumentSCC.size() == 1) {
- if (!ArgumentSCC[0]->Definition)
- continue; // synthetic root node
-
- // e.g. "void f(int* x) { if (...) f(x); }"
- if (ArgumentSCC[0]->Uses.size() == 1 &&
- ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
- Argument *A = ArgumentSCC[0]->Definition;
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- }
- continue;
- }
-
- bool SCCCaptured = false;
- for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
- I != E && !SCCCaptured; ++I) {
- ArgumentGraphNode *Node = *I;
- if (Node->Uses.empty()) {
- if (!Node->Definition->hasNoCaptureAttr())
- SCCCaptured = true;
- }
- }
- if (SCCCaptured)
- continue;
-
- SmallPtrSet<Argument *, 8> ArgumentSCCNodes;
- // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for
- // quickly looking up whether a given Argument is in this ArgumentSCC.
- for (ArgumentGraphNode *I : ArgumentSCC) {
- ArgumentSCCNodes.insert(I->Definition);
- }
-
- for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
- I != E && !SCCCaptured; ++I) {
- ArgumentGraphNode *N = *I;
- for (ArgumentGraphNode *Use : N->Uses) {
- Argument *A = Use->Definition;
- if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
- continue;
- SCCCaptured = true;
- break;
- }
- }
- if (SCCCaptured)
- continue;
-
- for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
- Argument *A = ArgumentSCC[i]->Definition;
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- }
-
- // We also want to compute readonly/readnone. With a small number of false
- // negatives, we can assume that any pointer which is captured isn't going
- // to be provably readonly or readnone, since by definition we can't
- // analyze all uses of a captured pointer.
- //
- // The false negatives happen when the pointer is captured by a function
- // that promises readonly/readnone behaviour on the pointer, then the
- // pointer's lifetime ends before anything that writes to arbitrary memory.
- // Also, a readonly/readnone pointer may be returned, but returning a
- // pointer is capturing it.
-
- Attribute::AttrKind ReadAttr = Attribute::ReadNone;
- for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
- Argument *A = ArgumentSCC[i]->Definition;
- Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
- if (K == Attribute::ReadNone)
- continue;
- if (K == Attribute::ReadOnly) {
- ReadAttr = Attribute::ReadOnly;
- continue;
- }
- ReadAttr = K;
- break;
- }
-
- if (ReadAttr != Attribute::None) {
- for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
- Argument *A = ArgumentSCC[i]->Definition;
- Changed = addReadAttr(A, ReadAttr);
- }
- }
- }
-
- return Changed;
-}
-
-/// Tests whether a function is "malloc-like".
-///
-/// A function is "malloc-like" if it returns either null or a pointer that
-/// doesn't alias any other pointer visible to the caller.
-static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
- SmallSetVector<Value *, 8> FlowsToReturn;
- for (BasicBlock &BB : *F)
- if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
- FlowsToReturn.insert(Ret->getReturnValue());
-
- for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
- Value *RetVal = FlowsToReturn[i];
-
- if (Constant *C = dyn_cast<Constant>(RetVal)) {
- if (!C->isNullValue() && !isa<UndefValue>(C))
- return false;
-
- continue;
- }
-
- if (isa<Argument>(RetVal))
- return false;
-
- if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
- switch (RVI->getOpcode()) {
- // Extend the analysis by looking upwards.
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::AddrSpaceCast:
- FlowsToReturn.insert(RVI->getOperand(0));
- continue;
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(RVI);
- FlowsToReturn.insert(SI->getTrueValue());
- FlowsToReturn.insert(SI->getFalseValue());
- continue;
- }
- case Instruction::PHI: {
- PHINode *PN = cast<PHINode>(RVI);
- for (Value *IncValue : PN->incoming_values())
- FlowsToReturn.insert(IncValue);
- continue;
- }
-
- // Check whether the pointer came from an allocation.
- case Instruction::Alloca:
- break;
- case Instruction::Call:
- case Instruction::Invoke: {
- CallBase &CB = cast<CallBase>(*RVI);
- if (CB.hasRetAttr(Attribute::NoAlias))
- break;
- if (CB.getCalledFunction() && SCCNodes.count(CB.getCalledFunction()))
- break;
- LLVM_FALLTHROUGH;
- }
- default:
- return false; // Did not come from an allocation.
- }
-
- if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
- return false;
- }
-
- return true;
-}
-
-/// Deduce noalias attributes for the SCC.
-static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
- // Check each function in turn, determining which functions return noalias
- // pointers.
- for (Function *F : SCCNodes) {
- // Already noalias.
- if (F->returnDoesNotAlias())
- continue;
-
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- return false;
-
- // We annotate noalias return values, which are only applicable to
- // pointer types.
- if (!F->getReturnType()->isPointerTy())
- continue;
-
- if (!isFunctionMallocLike(F, SCCNodes))
- return false;
- }
-
- bool MadeChange = false;
- for (Function *F : SCCNodes) {
- if (F->returnDoesNotAlias() ||
- !F->getReturnType()->isPointerTy())
- continue;
-
- F->setReturnDoesNotAlias();
- ++NumNoAlias;
- MadeChange = true;
- }
-
- return MadeChange;
-}
-
-/// Tests whether this function is known to not return null.
-///
-/// Requires that the function returns a pointer.
-///
-/// Returns true if it believes the function will not return a null, and sets
-/// \p Speculative based on whether the returned conclusion is a speculative
-/// conclusion due to SCC calls.
-static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
- bool &Speculative) {
- assert(F->getReturnType()->isPointerTy() &&
- "nonnull only meaningful on pointer types");
- Speculative = false;
-
- SmallSetVector<Value *, 8> FlowsToReturn;
- for (BasicBlock &BB : *F)
- if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
- FlowsToReturn.insert(Ret->getReturnValue());
-
- auto &DL = F->getParent()->getDataLayout();
-
- for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
- Value *RetVal = FlowsToReturn[i];
-
- // If this value is locally known to be non-null, we're good
- if (isKnownNonZero(RetVal, DL))
- continue;
-
- // Otherwise, we need to look upwards since we can't make any local
- // conclusions.
- Instruction *RVI = dyn_cast<Instruction>(RetVal);
- if (!RVI)
- return false;
- switch (RVI->getOpcode()) {
- // Extend the analysis by looking upwards.
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::AddrSpaceCast:
- FlowsToReturn.insert(RVI->getOperand(0));
- continue;
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(RVI);
- FlowsToReturn.insert(SI->getTrueValue());
- FlowsToReturn.insert(SI->getFalseValue());
- continue;
- }
- case Instruction::PHI: {
- PHINode *PN = cast<PHINode>(RVI);
- for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- FlowsToReturn.insert(PN->getIncomingValue(i));
- continue;
- }
- case Instruction::Call:
- case Instruction::Invoke: {
- CallBase &CB = cast<CallBase>(*RVI);
- Function *Callee = CB.getCalledFunction();
- // A call to a node within the SCC is assumed to return null until
- // proven otherwise
- if (Callee && SCCNodes.count(Callee)) {
- Speculative = true;
- continue;
- }
- return false;
- }
- default:
- return false; // Unknown source, may be null
- };
- llvm_unreachable("should have either continued or returned");
- }
-
- return true;
-}
-
-/// Deduce nonnull attributes for the SCC.
-static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
- // Speculate that all functions in the SCC return only nonnull
- // pointers. We may refute this as we analyze functions.
- bool SCCReturnsNonNull = true;
-
- bool MadeChange = false;
-
- // Check each function in turn, determining which functions return nonnull
- // pointers.
- for (Function *F : SCCNodes) {
- // Already nonnull.
- if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::NonNull))
- continue;
-
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- return false;
-
- // We annotate nonnull return values, which are only applicable to
- // pointer types.
- if (!F->getReturnType()->isPointerTy())
- continue;
-
- bool Speculative = false;
- if (isReturnNonNull(F, SCCNodes, Speculative)) {
- if (!Speculative) {
- // Mark the function eagerly since we may discover a function
- // which prevents us from speculating about the entire SCC
- LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName()
- << " as nonnull\n");
- F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
- ++NumNonNullReturn;
- MadeChange = true;
- }
- continue;
- }
- // At least one function returns something which could be null, can't
- // speculate any more.
- SCCReturnsNonNull = false;
- }
-
- if (SCCReturnsNonNull) {
- for (Function *F : SCCNodes) {
- if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::NonNull) ||
- !F->getReturnType()->isPointerTy())
- continue;
-
- LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
- F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
- ++NumNonNullReturn;
- MadeChange = true;
- }
- }
-
- return MadeChange;
-}
-
-namespace {
-
-/// Collects a set of attribute inference requests and performs them all in one
-/// go on a single SCC Node. Inference involves scanning function bodies
-/// looking for instructions that violate attribute assumptions.
-/// As soon as all the bodies are fine we are free to set the attribute.
-/// Customization of inference for individual attributes is performed by
-/// providing a handful of predicates for each attribute.
-class AttributeInferer {
-public:
- /// Describes a request for inference of a single attribute.
- struct InferenceDescriptor {
-
- /// Returns true if this function does not have to be handled.
- /// The general intent of this predicate is to provide an optimization
- /// for functions that do not need this attribute inference at all
- /// (say, for functions that already have the attribute).
- std::function<bool(const Function &)> SkipFunction;
-
- /// Returns true if this instruction violates attribute assumptions.
- std::function<bool(Instruction &)> InstrBreaksAttribute;
-
- /// Sets the inferred attribute for this function.
- std::function<void(Function &)> SetAttribute;
-
- /// Attribute we derive.
- Attribute::AttrKind AKind;
-
- /// If true, only "exact" definitions can be used to infer this attribute.
- /// See GlobalValue::isDefinitionExact.
- bool RequiresExactDefinition;
-
- InferenceDescriptor(Attribute::AttrKind AK,
- std::function<bool(const Function &)> SkipFunc,
- std::function<bool(Instruction &)> InstrScan,
- std::function<void(Function &)> SetAttr,
- bool ReqExactDef)
- : SkipFunction(SkipFunc), InstrBreaksAttribute(InstrScan),
- SetAttribute(SetAttr), AKind(AK),
- RequiresExactDefinition(ReqExactDef) {}
- };
-
-private:
- SmallVector<InferenceDescriptor, 4> InferenceDescriptors;
-
-public:
- void registerAttrInference(InferenceDescriptor AttrInference) {
- InferenceDescriptors.push_back(AttrInference);
- }
-
- bool run(const SCCNodeSet &SCCNodes);
-};
-
-/// Perform all the requested attribute inference actions according to the
-/// attribute predicates stored before.
-bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
- SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
- // Go through all the functions in SCC and check corresponding attribute
- // assumptions for each of them. Attributes that are invalid for this SCC
- // will be removed from InferInSCC.
- for (Function *F : SCCNodes) {
-
- // No attributes whose assumptions are still valid - done.
- if (InferInSCC.empty())
- return false;
-
- // Check if our attributes ever need scanning/can be scanned.
- llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) {
- if (ID.SkipFunction(*F))
- return false;
-
- // Remove from further inference (invalidate) when visiting a function
- // that has no instructions to scan/has an unsuitable definition.
- return F->isDeclaration() ||
- (ID.RequiresExactDefinition && !F->hasExactDefinition());
- });
-
- // For each attribute still in InferInSCC that doesn't explicitly skip F,
- // set up the F instructions scan to verify assumptions of the attribute.
- SmallVector<InferenceDescriptor, 4> InferInThisFunc;
- llvm::copy_if(
- InferInSCC, std::back_inserter(InferInThisFunc),
- [F](const InferenceDescriptor &ID) { return !ID.SkipFunction(*F); });
-
- if (InferInThisFunc.empty())
- continue;
-
- // Start instruction scan.
- for (Instruction &I : instructions(*F)) {
- llvm::erase_if(InferInThisFunc, [&](const InferenceDescriptor &ID) {
- if (!ID.InstrBreaksAttribute(I))
- return false;
- // Remove attribute from further inference on any other functions
- // because attribute assumptions have just been violated.
- llvm::erase_if(InferInSCC, [&ID](const InferenceDescriptor &D) {
- return D.AKind == ID.AKind;
- });
- // Remove attribute from the rest of current instruction scan.
- return true;
- });
-
- if (InferInThisFunc.empty())
- break;
- }
- }
-
- if (InferInSCC.empty())
- return false;
-
- bool Changed = false;
- for (Function *F : SCCNodes)
- // At this point InferInSCC contains only functions that were either:
- // - explicitly skipped from scan/inference, or
- // - verified to have no instructions that break attribute assumptions.
- // Hence we just go and force the attribute for all non-skipped functions.
- for (auto &ID : InferInSCC) {
- if (ID.SkipFunction(*F))
- continue;
- Changed = true;
- ID.SetAttribute(*F);
- }
- return Changed;
-}
-
+ continue;
+
+ // If the non-null callsite argument operand is an argument to 'F'
+ // (the caller) and the call is guaranteed to execute, then the value
+ // must be non-null throughout 'F'.
+ auto *FArg = dyn_cast<Argument>(CB->getArgOperand(CSArg.getArgNo()));
+ if (FArg && !FArg->hasNonNullAttr()) {
+ FArg->addAttr(Attribute::NonNull);
+ Changed = true;
+ }
+ }
+ }
+ }
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ }
+
+ return Changed;
+}
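
A hedged sketch of the propagation above; the callee and its annotation are assumptions, not part of this code. If the caller's entry block unconditionally passes its own argument to a callee whose matching parameter already carries a non-null guarantee at the IR level, the caller's argument can inherit nonnull.

    // Assumed: 'log_line' is defined elsewhere and its parameter is marked
    // non-null (e.g. via the GNU nonnull attribute) in the emitted IR.
    void log_line(const char *msg) __attribute__((nonnull(1)));

    void handle(const char *msg) {
      log_line(msg);   // guaranteed to execute from the entry block, so 'msg'
                       // in 'handle' is a candidate for nonnull as well
    }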
+
+static bool addReadAttr(Argument *A, Attribute::AttrKind R) {
+ assert((R == Attribute::ReadOnly || R == Attribute::ReadNone)
+ && "Must be a Read attribute.");
+ assert(A && "Argument must not be null.");
+
+ // If the argument already has the attribute, nothing needs to be done.
+ if (A->hasAttribute(R))
+ return false;
+
+ // Otherwise, remove potentially conflicting attributes, add the new one,
+ // and update statistics.
+ A->removeAttr(Attribute::WriteOnly);
+ A->removeAttr(Attribute::ReadOnly);
+ A->removeAttr(Attribute::ReadNone);
+ A->addAttr(R);
+ R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+ return true;
+}
+
+/// Deduce nocapture attributes for the SCC.
+static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
+ bool Changed = false;
+
+ ArgumentGraph AG;
+
+ // Check each function in turn, determining which pointer arguments are not
+ // captured.
+ for (Function *F : SCCNodes) {
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ continue;
+
+ Changed |= addArgumentAttrsFromCallsites(*F);
+
+ // Functions that are readonly (or readnone) and nounwind and don't return
+ // a value can't capture arguments. Don't analyze them.
+ if (F->onlyReadsMemory() && F->doesNotThrow() &&
+ F->getReturnType()->isVoidTy()) {
+ for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
+ ++A) {
+ if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+ }
+ continue;
+ }
+
+ for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
+ ++A) {
+ if (!A->getType()->isPointerTy())
+ continue;
+ bool HasNonLocalUses = false;
+ if (!A->hasNoCaptureAttr()) {
+ ArgumentUsesTracker Tracker(SCCNodes);
+ PointerMayBeCaptured(&*A, &Tracker);
+ if (!Tracker.Captured) {
+ if (Tracker.Uses.empty()) {
+ // If it's trivially not captured, mark it nocapture now.
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ } else {
+ // If it's not trivially captured and not trivially not captured,
+ // then it must be calling into another function in our SCC. Save
+ // its particulars for Argument-SCC analysis later.
+ ArgumentGraphNode *Node = AG[&*A];
+ for (Argument *Use : Tracker.Uses) {
+ Node->Uses.push_back(AG[Use]);
+ if (Use != &*A)
+ HasNonLocalUses = true;
+ }
+ }
+ }
+ // Otherwise, it's captured. Don't bother doing SCC analysis on it.
+ }
+ if (!HasNonLocalUses && !A->onlyReadsMemory()) {
+ // Can we determine that it's readonly/readnone without doing an SCC?
+ // Note that we don't allow any calls at all here, or else our result
+ // will be dependent on the iteration order through the functions in the
+ // SCC.
+ SmallPtrSet<Argument *, 8> Self;
+ Self.insert(&*A);
+ Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
+ if (R != Attribute::None)
+ Changed = addReadAttr(A, R);
+ }
+ }
+ }
+
+ // The graph we've collected is partial because we stopped scanning for
+ // argument uses once we solved the argument trivially. These partial nodes
+ // show up as ArgumentGraphNode objects with an empty Uses list, and for
+ // these nodes the final decision about whether they capture has already been
+ // made. If the definition doesn't have a 'nocapture' attribute by now, it
+ // captures.
+
+ for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) {
+ const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I;
+ if (ArgumentSCC.size() == 1) {
+ if (!ArgumentSCC[0]->Definition)
+ continue; // synthetic root node
+
+ // e.g. "void f(int* x) { if (...) f(x); }"
+ if (ArgumentSCC[0]->Uses.size() == 1 &&
+ ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
+ Argument *A = ArgumentSCC[0]->Definition;
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+ continue;
+ }
+
+ bool SCCCaptured = false;
+ for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+ I != E && !SCCCaptured; ++I) {
+ ArgumentGraphNode *Node = *I;
+ if (Node->Uses.empty()) {
+ if (!Node->Definition->hasNoCaptureAttr())
+ SCCCaptured = true;
+ }
+ }
+ if (SCCCaptured)
+ continue;
+
+ SmallPtrSet<Argument *, 8> ArgumentSCCNodes;
+ // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for
+ // quickly looking up whether a given Argument is in this ArgumentSCC.
+ for (ArgumentGraphNode *I : ArgumentSCC) {
+ ArgumentSCCNodes.insert(I->Definition);
+ }
+
+ for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+ I != E && !SCCCaptured; ++I) {
+ ArgumentGraphNode *N = *I;
+ for (ArgumentGraphNode *Use : N->Uses) {
+ Argument *A = Use->Definition;
+ if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
+ continue;
+ SCCCaptured = true;
+ break;
+ }
+ }
+ if (SCCCaptured)
+ continue;
+
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+
+ // We also want to compute readonly/readnone. With a small number of false
+ // negatives, we can assume that any pointer which is captured isn't going
+ // to be provably readonly or readnone, since by definition we can't
+ // analyze all uses of a captured pointer.
+ //
+ // The false negatives happen when the pointer is captured by a function
+ // that promises readonly/readnone behaviour on the pointer, then the
+ // pointer's lifetime ends before anything that writes to arbitrary memory.
+ // Also, a readonly/readnone pointer may be returned, but returning a
+ // pointer is capturing it.
+
+ Attribute::AttrKind ReadAttr = Attribute::ReadNone;
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
+ if (K == Attribute::ReadNone)
+ continue;
+ if (K == Attribute::ReadOnly) {
+ ReadAttr = Attribute::ReadOnly;
+ continue;
+ }
+ ReadAttr = K;
+ break;
+ }
+
+ if (ReadAttr != Attribute::None) {
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ Changed = addReadAttr(A, ReadAttr);
+ }
+ }
+ }
+
+ return Changed;
+}
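
For the nocapture half of the deduction above, a small hypothetical example (invented name): the pointer is dereferenced, but its value is never stored, returned, or passed to an unknown callee, so it does not escape.

    // 'p' is only used to load from and store through; the pointer value
    // itself never escapes, so it is a candidate for Attribute::NoCapture.
    void bump(int *p) {
      *p += 1;
    }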
+
+/// Tests whether a function is "malloc-like".
+///
+/// A function is "malloc-like" if it returns either null or a pointer that
+/// doesn't alias any other pointer visible to the caller.
+static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
+ SmallSetVector<Value *, 8> FlowsToReturn;
+ for (BasicBlock &BB : *F)
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+ FlowsToReturn.insert(Ret->getReturnValue());
+
+ for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+ Value *RetVal = FlowsToReturn[i];
+
+ if (Constant *C = dyn_cast<Constant>(RetVal)) {
+ if (!C->isNullValue() && !isa<UndefValue>(C))
+ return false;
+
+ continue;
+ }
+
+ if (isa<Argument>(RetVal))
+ return false;
+
+ if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
+ switch (RVI->getOpcode()) {
+ // Extend the analysis by looking upwards.
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::AddrSpaceCast:
+ FlowsToReturn.insert(RVI->getOperand(0));
+ continue;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(RVI);
+ FlowsToReturn.insert(SI->getTrueValue());
+ FlowsToReturn.insert(SI->getFalseValue());
+ continue;
+ }
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(RVI);
+ for (Value *IncValue : PN->incoming_values())
+ FlowsToReturn.insert(IncValue);
+ continue;
+ }
+
+ // Check whether the pointer came from an allocation.
+ case Instruction::Alloca:
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallBase &CB = cast<CallBase>(*RVI);
+ if (CB.hasRetAttr(Attribute::NoAlias))
+ break;
+ if (CB.getCalledFunction() && SCCNodes.count(CB.getCalledFunction()))
+ break;
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return false; // Did not come from an allocation.
+ }
+
+ if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
+ return false;
+ }
+
+ return true;
+}
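
A hypothetical function that would pass the malloc-like test above, assuming malloc is recognized as a noalias-returning call (as it normally is): every value flowing to a return is either null or a fresh allocation.

    #include <cstdlib>

    // Returns either null or the result of a noalias call (malloc), so the
    // function itself is a candidate for a noalias ("malloc-like") return.
    int *make_buffer(unsigned n) {
      if (n == 0)
        return nullptr;
      return static_cast<int *>(std::malloc(n * sizeof(int)));
    }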
+
+/// Deduce noalias attributes for the SCC.
+static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
+ // Check each function in turn, determining which functions return noalias
+ // pointers.
+ for (Function *F : SCCNodes) {
+ // Already noalias.
+ if (F->returnDoesNotAlias())
+ continue;
+
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ return false;
+
+ // We annotate noalias return values, which are only applicable to
+ // pointer types.
+ if (!F->getReturnType()->isPointerTy())
+ continue;
+
+ if (!isFunctionMallocLike(F, SCCNodes))
+ return false;
+ }
+
+ bool MadeChange = false;
+ for (Function *F : SCCNodes) {
+ if (F->returnDoesNotAlias() ||
+ !F->getReturnType()->isPointerTy())
+ continue;
+
+ F->setReturnDoesNotAlias();
+ ++NumNoAlias;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// Tests whether this function is known to not return null.
+///
+/// Requires that the function returns a pointer.
+///
+/// Returns true if it believes the function will not return a null, and sets
+/// \p Speculative based on whether the returned conclusion is a speculative
+/// conclusion due to SCC calls.
+static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
+ bool &Speculative) {
+ assert(F->getReturnType()->isPointerTy() &&
+ "nonnull only meaningful on pointer types");
+ Speculative = false;
+
+ SmallSetVector<Value *, 8> FlowsToReturn;
+ for (BasicBlock &BB : *F)
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+ FlowsToReturn.insert(Ret->getReturnValue());
+
+ auto &DL = F->getParent()->getDataLayout();
+
+ for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+ Value *RetVal = FlowsToReturn[i];
+
+ // If this value is locally known to be non-null, we're good
+ if (isKnownNonZero(RetVal, DL))
+ continue;
+
+ // Otherwise, we need to look upwards since we can't make any local
+ // conclusions.
+ Instruction *RVI = dyn_cast<Instruction>(RetVal);
+ if (!RVI)
+ return false;
+ switch (RVI->getOpcode()) {
+ // Extend the analysis by looking upwards.
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::AddrSpaceCast:
+ FlowsToReturn.insert(RVI->getOperand(0));
+ continue;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(RVI);
+ FlowsToReturn.insert(SI->getTrueValue());
+ FlowsToReturn.insert(SI->getFalseValue());
+ continue;
+ }
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(RVI);
+ for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ FlowsToReturn.insert(PN->getIncomingValue(i));
+ continue;
+ }
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallBase &CB = cast<CallBase>(*RVI);
+ Function *Callee = CB.getCalledFunction();
+ // A call to a node within the SCC is assumed to return null until
+ // proven otherwise
+ if (Callee && SCCNodes.count(Callee)) {
+ Speculative = true;
+ continue;
+ }
+ return false;
+ }
+ default:
+ return false; // Unknown source, may be null
+ };
+ llvm_unreachable("should have either continued or returned");
+ }
+
+ return true;
+}
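
A hypothetical example for the non-null return test above (invented name): the only value flowing to the return is the address of a static object, which is locally known to be non-null, so no speculation about other SCC members is needed.

    // The address of a static is known non-null locally, so isReturnNonNull
    // would succeed without setting Speculative.
    int *shared_counter() {
      static int counter = 0;
      return &counter;
    }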
+
+/// Deduce nonnull attributes for the SCC.
+static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
+ // Speculate that all functions in the SCC return only nonnull
+ // pointers. We may refute this as we analyze functions.
+ bool SCCReturnsNonNull = true;
+
+ bool MadeChange = false;
+
+ // Check each function in turn, determining which functions return nonnull
+ // pointers.
+ for (Function *F : SCCNodes) {
+ // Already nonnull.
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::NonNull))
+ continue;
+
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ return false;
+
+ // We annotate nonnull return values, which are only applicable to
+ // pointer types.
+ if (!F->getReturnType()->isPointerTy())
+ continue;
+
+ bool Speculative = false;
+ if (isReturnNonNull(F, SCCNodes, Speculative)) {
+ if (!Speculative) {
+ // Mark the function eagerly since we may discover a function
+ // which prevents us from speculating about the entire SCC
+ LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName()
+ << " as nonnull\n");
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ ++NumNonNullReturn;
+ MadeChange = true;
+ }
+ continue;
+ }
+ // At least one function returns something which could be null; we can't
+ // speculate any more.
+ SCCReturnsNonNull = false;
+ }
+
+ if (SCCReturnsNonNull) {
+ for (Function *F : SCCNodes) {
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::NonNull) ||
+ !F->getReturnType()->isPointerTy())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ ++NumNonNullReturn;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
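Illustration only (names invented): a sketch of the speculative case handled above. The two functions form one SCC and return each other's results, so each return is at first only speculatively non-null; because neither ever produces a null, the whole SCC survives the scan and both functions are marked in the second loop.

static int Storage[2];

int *pick_b(int n);

// Mutually recursive returns: the call back into the SCC is treated as
// speculatively non-null, and the fallback returns a global's address.
int *pick_a(int n) { return n > 0 ? pick_b(n - 1) : &Storage[0]; }
int *pick_b(int n) { return n > 0 ? pick_a(n - 1) : &Storage[1]; }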
+
+namespace {
+
+/// Collects a set of attribute inference requests and performs them all in one
+/// go on a single SCC Node. Inference involves scanning function bodies
+/// looking for instructions that violate attribute assumptions.
+/// As soon as all the bodies are fine we are free to set the attribute.
+/// Customization of inference for individual attributes is performed by
+/// providing a handful of predicates for each attribute.
+class AttributeInferer {
+public:
+ /// Describes a request for inference of a single attribute.
+ struct InferenceDescriptor {
+
+ /// Returns true if this function does not have to be handled.
+ /// General intent for this predicate is to provide an optimization
+ /// for functions that do not need this attribute inference at all
+ /// (say, for functions that already have the attribute).
+ std::function<bool(const Function &)> SkipFunction;
+
+ /// Returns true if this instruction violates attribute assumptions.
+ std::function<bool(Instruction &)> InstrBreaksAttribute;
+
+ /// Sets the inferred attribute for this function.
+ std::function<void(Function &)> SetAttribute;
+
+ /// Attribute we derive.
+ Attribute::AttrKind AKind;
+
+ /// If true, only "exact" definitions can be used to infer this attribute.
+ /// See GlobalValue::isDefinitionExact.
+ bool RequiresExactDefinition;
+
+ InferenceDescriptor(Attribute::AttrKind AK,
+ std::function<bool(const Function &)> SkipFunc,
+ std::function<bool(Instruction &)> InstrScan,
+ std::function<void(Function &)> SetAttr,
+ bool ReqExactDef)
+ : SkipFunction(SkipFunc), InstrBreaksAttribute(InstrScan),
+ SetAttribute(SetAttr), AKind(AK),
+ RequiresExactDefinition(ReqExactDef) {}
+ };
+
+private:
+ SmallVector<InferenceDescriptor, 4> InferenceDescriptors;
+
+public:
+ void registerAttrInference(InferenceDescriptor AttrInference) {
+ InferenceDescriptors.push_back(AttrInference);
+ }
+
+ bool run(const SCCNodeSet &SCCNodes);
+};
+
+/// Perform all the requested attribute inference actions according to the
+/// attribute predicates stored before.
+bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
+ SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
+ // Go through all the functions in SCC and check corresponding attribute
+ // assumptions for each of them. Attributes that are invalid for this SCC
+ // will be removed from InferInSCC.
+ for (Function *F : SCCNodes) {
+
+ // No attributes whose assumptions are still valid - done.
+ if (InferInSCC.empty())
+ return false;
+
+ // Check if our attributes ever need scanning/can be scanned.
+ llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) {
+ if (ID.SkipFunction(*F))
+ return false;
+
+ // Remove from further inference (invalidate) when visiting a function
+ // that has no instructions to scan/has an unsuitable definition.
+ return F->isDeclaration() ||
+ (ID.RequiresExactDefinition && !F->hasExactDefinition());
+ });
+
+ // For each attribute still in InferInSCC that doesn't explicitly skip F,
+ // set up an instruction scan of F to verify that attribute's assumptions.
+ SmallVector<InferenceDescriptor, 4> InferInThisFunc;
+ llvm::copy_if(
+ InferInSCC, std::back_inserter(InferInThisFunc),
+ [F](const InferenceDescriptor &ID) { return !ID.SkipFunction(*F); });
+
+ if (InferInThisFunc.empty())
+ continue;
+
+ // Start instruction scan.
+ for (Instruction &I : instructions(*F)) {
+ llvm::erase_if(InferInThisFunc, [&](const InferenceDescriptor &ID) {
+ if (!ID.InstrBreaksAttribute(I))
+ return false;
+ // Remove attribute from further inference on any other functions
+ // because attribute assumptions have just been violated.
+ llvm::erase_if(InferInSCC, [&ID](const InferenceDescriptor &D) {
+ return D.AKind == ID.AKind;
+ });
+ // Remove attribute from the rest of current instruction scan.
+ return true;
+ });
+
+ if (InferInThisFunc.empty())
+ break;
+ }
+ }
+
+ if (InferInSCC.empty())
+ return false;
+
+ bool Changed = false;
+ for (Function *F : SCCNodes)
+ // At this point InferInSCC contains only functions that were either:
+ // - explicitly skipped from scan/inference, or
+ // - verified to have no instructions that break attribute assumptions.
+ // Hence we just go and force the attribute for all non-skipped functions.
+ for (auto &ID : InferInSCC) {
+ if (ID.SkipFunction(*F))
+ continue;
+ Changed = true;
+ ID.SetAttribute(*F);
+ }
+ return Changed;
+}
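A standalone sketch (plain C++, not LLVM code; all names invented) of the scan-and-invalidate pattern used above: each descriptor is dropped as soon as one scanned item violates its predicate, and whatever survives the scan is applied at the end.

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

struct Descriptor {
  std::function<bool(int)> Breaks; // does this item violate the assumption?
  std::function<void()> Apply;     // applied only if nothing violated it
};

int main() {
  std::vector<int> Items = {2, 4, 7, 8};
  std::vector<Descriptor> Pending = {
      {[](int V) { return V % 2 != 0; }, [] { std::cout << "all even\n"; }},
      {[](int V) { return V > 100; }, [] { std::cout << "all small\n"; }}};

  for (int V : Items)
    Pending.erase(std::remove_if(Pending.begin(), Pending.end(),
                                 [&](const Descriptor &D) { return D.Breaks(V); }),
                  Pending.end());

  for (const Descriptor &D : Pending)
    D.Apply(); // prints "all small"; "all even" was invalidated by 7
}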
+
struct SCCNodesResult {
SCCNodeSet SCCNodes;
bool HasUnknownCall;
};
-} // end anonymous namespace
-
-/// Helper for non-Convergent inference predicate InstrBreaksAttribute.
-static bool InstrBreaksNonConvergent(Instruction &I,
- const SCCNodeSet &SCCNodes) {
- const CallBase *CB = dyn_cast<CallBase>(&I);
- // Breaks the non-convergent assumption if CB is a convergent call to a function
- // not in the SCC.
- return CB && CB->isConvergent() &&
- SCCNodes.count(CB->getCalledFunction()) == 0;
-}
-
-/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
-static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) {
- if (!I.mayThrow())
- return false;
- if (const auto *CI = dyn_cast<CallInst>(&I)) {
- if (Function *Callee = CI->getCalledFunction()) {
- // I is a may-throw call to a function inside our SCC. This doesn't
- // invalidate our current working assumption that the SCC is no-throw; we
- // just have to scan that other function.
+} // end anonymous namespace
+
+/// Helper for non-Convergent inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonConvergent(Instruction &I,
+ const SCCNodeSet &SCCNodes) {
+ const CallBase *CB = dyn_cast<CallBase>(&I);
+ // Breaks the non-convergent assumption if CB is a convergent call to a function
+ // not in the SCC.
+ return CB && CB->isConvergent() &&
+ SCCNodes.count(CB->getCalledFunction()) == 0;
+}
+
+/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) {
+ if (!I.mayThrow())
+ return false;
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ // I is a may-throw call to a function inside our SCC. This doesn't
+ // invalidate our current working assumption that the SCC is no-throw; we
+ // just have to scan that other function.
if (SCCNodes.contains(Callee))
- return false;
- }
- }
- return true;
-}
-
-/// Helper for NoFree inference predicate InstrBreaksAttribute.
-static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) {
- CallBase *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- return false;
-
- Function *Callee = CB->getCalledFunction();
- if (!Callee)
- return true;
-
- if (Callee->doesNotFreeMemory())
- return false;
-
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Helper for NoFree inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) {
+ CallBase *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ return false;
+
+ Function *Callee = CB->getCalledFunction();
+ if (!Callee)
+ return true;
+
+ if (Callee->doesNotFreeMemory())
+ return false;
+
if (SCCNodes.contains(Callee))
- return false;
-
- return true;
-}
-
+ return false;
+
+ return true;
+}
+
/// Attempt to remove convergent function attribute when possible.
-///
-/// Returns true if any changes to function attributes were made.
+///
+/// Returns true if any changes to function attributes were made.
static bool inferConvergent(const SCCNodeSet &SCCNodes) {
- AttributeInferer AI;
-
- // Request to remove the convergent attribute from all functions in the SCC
- // if every callsite within the SCC is not convergent (except for calls
- // to functions within the SCC).
- // Note: Removal of the attr from the callsites will happen in
- // InstCombineCalls separately.
- AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
- Attribute::Convergent,
- // Skip non-convergent functions.
- [](const Function &F) { return !F.isConvergent(); },
- // Instructions that break non-convergent assumption.
- [SCCNodes](Instruction &I) {
- return InstrBreaksNonConvergent(I, SCCNodes);
- },
- [](Function &F) {
- LLVM_DEBUG(dbgs() << "Removing convergent attr from fn " << F.getName()
- << "\n");
- F.setNotConvergent();
- },
- /* RequiresExactDefinition= */ false});
+ AttributeInferer AI;
+
+ // Request to remove the convergent attribute from all functions in the SCC
+ // if every callsite within the SCC is not convergent (except for calls
+ // to functions within the SCC).
+ // Note: Removal of the attr from the callsites will happen in
+ // InstCombineCalls separately.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::Convergent,
+ // Skip non-convergent functions.
+ [](const Function &F) { return !F.isConvergent(); },
+ // Instructions that break non-convergent assumption.
+ [SCCNodes](Instruction &I) {
+ return InstrBreaksNonConvergent(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from fn " << F.getName()
+ << "\n");
+ F.setNotConvergent();
+ },
+ /* RequiresExactDefinition= */ false});
// Perform all the requested attribute inference actions.
return AI.run(SCCNodes);
}
-
+
/// Infer attributes from all functions in the SCC by scanning every
/// instruction for compliance to the attribute assumptions. Currently it
/// does:
@@ -1317,86 +1317,86 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) {
static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
AttributeInferer AI;
- if (!DisableNoUnwindInference)
- // Request to infer nounwind attribute for all the functions in the SCC if
- // every callsite within the SCC is not throwing (except for calls to
- // functions within the SCC). Note that nounwind attribute suffers from
- // derefinement - results may change depending on how functions are
- // optimized. Thus it can be inferred only from exact definitions.
- AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
- Attribute::NoUnwind,
- // Skip non-throwing functions.
- [](const Function &F) { return F.doesNotThrow(); },
- // Instructions that break non-throwing assumption.
- [&SCCNodes](Instruction &I) {
- return InstrBreaksNonThrowing(I, SCCNodes);
- },
- [](Function &F) {
- LLVM_DEBUG(dbgs()
- << "Adding nounwind attr to fn " << F.getName() << "\n");
- F.setDoesNotThrow();
- ++NumNoUnwind;
- },
- /* RequiresExactDefinition= */ true});
-
- if (!DisableNoFreeInference)
- // Request to infer nofree attribute for all the functions in the SCC if
- // every callsite within the SCC does not directly or indirectly free
- // memory (except for calls to functions within the SCC). Note that nofree
- // attribute suffers from derefinement - results may change depending on
- // how functions are optimized. Thus it can be inferred only from exact
- // definitions.
- AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
- Attribute::NoFree,
- // Skip functions known not to free memory.
- [](const Function &F) { return F.doesNotFreeMemory(); },
- // Instructions that break non-deallocating assumption.
- [&SCCNodes](Instruction &I) {
- return InstrBreaksNoFree(I, SCCNodes);
- },
- [](Function &F) {
- LLVM_DEBUG(dbgs()
- << "Adding nofree attr to fn " << F.getName() << "\n");
- F.setDoesNotFreeMemory();
- ++NumNoFree;
- },
- /* RequiresExactDefinition= */ true});
-
- // Perform all the requested attribute inference actions.
- return AI.run(SCCNodes);
-}
-
-static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
- // Try and identify functions that do not recurse.
-
- // If the SCC contains multiple nodes we know for sure there is recursion.
- if (SCCNodes.size() != 1)
- return false;
-
- Function *F = *SCCNodes.begin();
- if (!F || !F->hasExactDefinition() || F->doesNotRecurse())
- return false;
-
- // If all of the calls in F are identifiable and are to norecurse functions, F
- // is norecurse. This check also detects self-recursion as F is not currently
- // marked norecurse, so any call from F to F will not be marked norecurse.
- for (auto &BB : *F)
- for (auto &I : BB.instructionsWithoutDebug())
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee == F || !Callee->doesNotRecurse())
- // Function calls a potentially recursive function.
- return false;
- }
-
- // Every call was to a non-recursive function other than this function, and
- // we have no indirect recursion as the SCC size is one. This function cannot
- // recurse.
+ if (!DisableNoUnwindInference)
+ // Request to infer nounwind attribute for all the functions in the SCC if
+ // every callsite within the SCC is not throwing (except for calls to
+ // functions within the SCC). Note that nounwind attribute suffers from
+ // derefinement - results may change depending on how functions are
+ // optimized. Thus it can be inferred only from exact definitions.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::NoUnwind,
+ // Skip non-throwing functions.
+ [](const Function &F) { return F.doesNotThrow(); },
+ // Instructions that break non-throwing assumption.
+ [&SCCNodes](Instruction &I) {
+ return InstrBreaksNonThrowing(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs()
+ << "Adding nounwind attr to fn " << F.getName() << "\n");
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ },
+ /* RequiresExactDefinition= */ true});
+
+ if (!DisableNoFreeInference)
+ // Request to infer nofree attribute for all the functions in the SCC if
+ // every callsite within the SCC does not directly or indirectly free
+ // memory (except for calls to functions within the SCC). Note that nofree
+ // attribute suffers from derefinement - results may change depending on
+ // how functions are optimized. Thus it can be inferred only from exact
+ // definitions.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::NoFree,
+ // Skip functions known not to free memory.
+ [](const Function &F) { return F.doesNotFreeMemory(); },
+ // Instructions that break non-deallocating assumption.
+ [&SCCNodes](Instruction &I) {
+ return InstrBreaksNoFree(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs()
+ << "Adding nofree attr to fn " << F.getName() << "\n");
+ F.setDoesNotFreeMemory();
+ ++NumNoFree;
+ },
+ /* RequiresExactDefinition= */ true});
+
+ // Perform all the requested attribute inference actions.
+ return AI.run(SCCNodes);
+}
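Illustration only (names invented): source-level examples of the two predicates registered above. The first function contains no may-throw and no freeing instructions, so neither assumption is ever broken while scanning it; the second trips both.

#include <cstdlib>
#include <stdexcept>

// Nothing here may throw or free memory, so the nounwind/nofree descriptors
// are never invalidated while this body is scanned.
int quiet(int x) { return x * x; }

// The throw breaks the non-throwing assumption and the free() call breaks the
// non-deallocating one, so neither attribute can be inferred for this SCC.
int noisy(int *p) {
  if (!p)
    throw std::runtime_error("null");
  std::free(p);
  return 0;
}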
+
+static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
+ // Try and identify functions that do not recurse.
+
+ // If the SCC contains multiple nodes we know for sure there is recursion.
+ if (SCCNodes.size() != 1)
+ return false;
+
+ Function *F = *SCCNodes.begin();
+ if (!F || !F->hasExactDefinition() || F->doesNotRecurse())
+ return false;
+
+ // If all of the calls in F are identifiable and are to norecurse functions, F
+ // is norecurse. This check also detects self-recursion as F is not currently
+ // marked norecurse, so any call from F to F will not be marked norecurse.
+ for (auto &BB : *F)
+ for (auto &I : BB.instructionsWithoutDebug())
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ Function *Callee = CB->getCalledFunction();
+ if (!Callee || Callee == F || !Callee->doesNotRecurse())
+ // Function calls a potentially recursive function.
+ return false;
+ }
+
+ // Every call was to a non-recursive function other than this function, and
+ // we have no indirect recursion as the SCC size is one. This function cannot
+ // recurse.
F->setDoesNotRecurse();
++NumNoRecurse;
return true;
-}
-
+}
+
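Illustration only (names invented): a sketch of the single-node-SCC reasoning above.

// leaf() makes no calls at all, so it is trivially norecurse.
int leaf(int x) { return x + 1; }

// helper() sits in a single-node SCC and calls only norecurse functions, so
// the loop above finds nothing suspicious and can mark it norecurse too.
int helper(int x) { return leaf(x) * 2; }

// self() calls itself; the Callee == F check above bails out immediately.
int self(int x) { return x > 0 ? self(x - 1) : 0; }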
static bool instructionDoesNotReturn(Instruction &I) {
if (auto *CB = dyn_cast<CallBase>(&I)) {
Function *Callee = CB->getCalledFunction();
@@ -1501,220 +1501,220 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
return Res;
}
-template <typename AARGetterT>
+template <typename AARGetterT>
static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions,
AARGetterT &&AARGetter) {
SCCNodesResult Nodes = createSCCNodeSet(Functions);
- bool Changed = false;
-
- // Bail if the SCC only contains optnone functions.
+ bool Changed = false;
+
+ // Bail if the SCC only contains optnone functions.
if (Nodes.SCCNodes.empty())
- return Changed;
-
+ return Changed;
+
Changed |= addArgumentReturnedAttrs(Nodes.SCCNodes);
Changed |= addReadAttrs(Nodes.SCCNodes, AARGetter);
Changed |= addArgumentAttrs(Nodes.SCCNodes);
Changed |= inferConvergent(Nodes.SCCNodes);
Changed |= addNoReturnAttrs(Nodes.SCCNodes);
Changed |= addWillReturn(Nodes.SCCNodes);
-
- // If we have no external nodes participating in the SCC, we can deduce some
- // more precise attributes as well.
+
+ // If we have no external nodes participating in the SCC, we can deduce some
+ // more precise attributes as well.
if (!Nodes.HasUnknownCall) {
Changed |= addNoAliasAttrs(Nodes.SCCNodes);
Changed |= addNonNullAttrs(Nodes.SCCNodes);
Changed |= inferAttrsFromFunctionBodies(Nodes.SCCNodes);
Changed |= addNoRecurseAttrs(Nodes.SCCNodes);
- }
-
- return Changed;
-}
-
-PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG,
- CGSCCUpdateResult &) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
-
- // We pass a lambda into functions to wire them up to the analysis manager
- // for getting function analyses.
- auto AARGetter = [&](Function &F) -> AAResults & {
- return FAM.getResult<AAManager>(F);
- };
-
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+ // We pass a lambda into functions to wire them up to the analysis manager
+ // for getting function analyses.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+
SmallVector<Function *, 8> Functions;
- for (LazyCallGraph::Node &N : C) {
+ for (LazyCallGraph::Node &N : C) {
Functions.push_back(&N.getFunction());
- }
-
+ }
+
if (deriveAttrsInPostOrder(Functions, AARGetter))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
- // Pass identification, replacement for typeid
- static char ID;
-
- PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
- initializePostOrderFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char PostOrderFunctionAttrsLegacyPass::ID = 0;
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
+ initializePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char PostOrderFunctionAttrsLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "function-attrs",
- "Deduce function attributes", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+ "Deduce function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "function-attrs",
- "Deduce function attributes", false, false)
-
-Pass *llvm::createPostOrderFunctionAttrsLegacyPass() {
- return new PostOrderFunctionAttrsLegacyPass();
-}
-
-template <typename AARGetterT>
-static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
+ "Deduce function attributes", false, false)
+
+Pass *llvm::createPostOrderFunctionAttrsLegacyPass() {
+ return new PostOrderFunctionAttrsLegacyPass();
+}
+
+template <typename AARGetterT>
+static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
SmallVector<Function *, 8> Functions;
- for (CallGraphNode *I : SCC) {
+ for (CallGraphNode *I : SCC) {
Functions.push_back(I->getFunction());
- }
-
+ }
+
return deriveAttrsInPostOrder(Functions, AARGetter);
-}
-
-bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
- return runImpl(SCC, LegacyAARGetter(*this));
-}
-
-namespace {
-
-struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
- // Pass identification, replacement for typeid
- static char ID;
-
- ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
- initializeReversePostOrderFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<CallGraphWrapperPass>();
- AU.addPreserved<CallGraphWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char ReversePostOrderFunctionAttrsLegacyPass::ID = 0;
-
+}
+
+bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ return runImpl(SCC, LegacyAARGetter(*this));
+}
+
+namespace {
+
+struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<CallGraphWrapperPass>();
+ AU.addPreserved<CallGraphWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ReversePostOrderFunctionAttrsLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass,
"rpo-function-attrs", "Deduce function attributes in RPO",
false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass,
"rpo-function-attrs", "Deduce function attributes in RPO",
false, false)
-
-Pass *llvm::createReversePostOrderFunctionAttrsPass() {
- return new ReversePostOrderFunctionAttrsLegacyPass();
-}
-
-static bool addNoRecurseAttrsTopDown(Function &F) {
- // We check the preconditions for the function prior to calling this to avoid
- // the cost of building up a reversible post-order list. We assert them here
- // to make sure none of the invariants this relies on were violated.
- assert(!F.isDeclaration() && "Cannot deduce norecurse without a definition!");
- assert(!F.doesNotRecurse() &&
- "This function has already been deduced as norecurs!");
- assert(F.hasInternalLinkage() &&
- "Can only do top-down deduction for internal linkage functions!");
-
- // If F is internal and all of its uses are calls from non-recursive
- // functions, then none of its calls could in fact recurse without going
- // through a function marked norecurse, and so we can mark this function too
- // as norecurse. Note that the uses must actually be calls -- otherwise
- // a pointer to this function could be returned from a norecurse function but
- // this function could be recursively (indirectly) called. Note that this
- // also detects if F is directly recursive as F is not yet marked as
- // a norecurse function.
- for (auto *U : F.users()) {
- auto *I = dyn_cast<Instruction>(U);
- if (!I)
- return false;
- CallBase *CB = dyn_cast<CallBase>(I);
- if (!CB || !CB->getParent()->getParent()->doesNotRecurse())
- return false;
- }
+
+Pass *llvm::createReversePostOrderFunctionAttrsPass() {
+ return new ReversePostOrderFunctionAttrsLegacyPass();
+}
+
+static bool addNoRecurseAttrsTopDown(Function &F) {
+ // We check the preconditions for the function prior to calling this to avoid
+ // the cost of building up a reversible post-order list. We assert them here
+ // to make sure none of the invariants this relies on were violated.
+ assert(!F.isDeclaration() && "Cannot deduce norecurse without a definition!");
+ assert(!F.doesNotRecurse() &&
+ "This function has already been deduced as norecurs!");
+ assert(F.hasInternalLinkage() &&
+ "Can only do top-down deduction for internal linkage functions!");
+
+ // If F is internal and all of its uses are calls from non-recursive
+ // functions, then none of its calls could in fact recurse without going
+ // through a function marked norecurse, and so we can mark this function too
+ // as norecurse. Note that the uses must actually be calls -- otherwise
+ // a pointer to this function could be returned from a norecurse function but
+ // this function could be recursively (indirectly) called. Note that this
+ // also detects if F is directly recursive as F is not yet marked as
+ // a norecurse function.
+ for (auto *U : F.users()) {
+ auto *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+ CallBase *CB = dyn_cast<CallBase>(I);
+ if (!CB || !CB->getParent()->getParent()->doesNotRecurse())
+ return false;
+ }
F.setDoesNotRecurse();
++NumNoRecurse;
return true;
-}
-
-static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) {
- // We only have a post-order SCC traversal (because SCCs are inherently
- // discovered in post-order), so we accumulate them in a vector and then walk
- // it in reverse. This is simpler than using the RPO iterator infrastructure
- // because we need to combine SCC detection and the PO walk of the call
- // graph. We can also cheat egregiously because we're primarily interested in
- // synthesizing norecurse and so we can only save the singular SCCs as SCCs
- // with multiple functions in them will clearly be recursive.
- SmallVector<Function *, 16> Worklist;
- for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
- if (I->size() != 1)
- continue;
-
- Function *F = I->front()->getFunction();
- if (F && !F->isDeclaration() && !F->doesNotRecurse() &&
- F->hasInternalLinkage())
- Worklist.push_back(F);
- }
-
- bool Changed = false;
- for (auto *F : llvm::reverse(Worklist))
- Changed |= addNoRecurseAttrsTopDown(*F);
-
- return Changed;
-}
-
-bool ReversePostOrderFunctionAttrsLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
-
- return deduceFunctionAttributeInRPO(M, CG);
-}
-
-PreservedAnalyses
-ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
- auto &CG = AM.getResult<CallGraphAnalysis>(M);
-
- if (!deduceFunctionAttributeInRPO(M, CG))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<CallGraphAnalysis>();
- return PA;
-}
+}
+
+static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) {
+ // We only have a post-order SCC traversal (because SCCs are inherently
+ // discovered in post-order), so we accumulate them in a vector and then walk
+ // it in reverse. This is simpler than using the RPO iterator infrastructure
+ // because we need to combine SCC detection and the PO walk of the call
+ // graph. We can also cheat egregiously because we're primarily interested in
+ // synthesizing norecurse and so we can only save the singular SCCs as SCCs
+ // with multiple functions in them will clearly be recursive.
+ SmallVector<Function *, 16> Worklist;
+ for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+ if (I->size() != 1)
+ continue;
+
+ Function *F = I->front()->getFunction();
+ if (F && !F->isDeclaration() && !F->doesNotRecurse() &&
+ F->hasInternalLinkage())
+ Worklist.push_back(F);
+ }
+
+ bool Changed = false;
+ for (auto *F : llvm::reverse(Worklist))
+ Changed |= addNoRecurseAttrsTopDown(*F);
+
+ return Changed;
+}
+
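Illustration only (names invented, and the norecurse status of the caller is assumed rather than derived here): the kind of case the top-down walk above is for.

extern void log_line(const char *Msg); // assumed external, unanalyzable callee

namespace {
// helper() calls an unknown external function, so the bottom-up SCC pass
// cannot prove it norecurse on its own.
void helper() { log_line("tick"); }
} // namespace

// Assume driver() already carries norecurse and is helper()'s only user via a
// direct call; then no call chain can re-enter helper(), and the top-down walk
// may mark the internal-linkage helper() norecurse as well.
void driver() { helper(); }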
+bool ReversePostOrderFunctionAttrsLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ return deduceFunctionAttributeInRPO(M, CG);
+}
+
+PreservedAnalyses
+ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &CG = AM.getResult<CallGraphAnalysis>(M);
+
+ if (!deduceFunctionAttributeInRPO(M, CG))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp
index f99358e70b..18343030bc 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp
@@ -1,558 +1,558 @@
-//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements Function import based on summaries.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/FunctionImport.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/IR/AutoUpgrade.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Linker/IRMover.h"
-#include "llvm/Object/ModuleSymbolTable.h"
-#include "llvm/Object/SymbolicFile.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO/Internalize.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/FunctionImportUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <cassert>
-#include <memory>
-#include <set>
-#include <string>
-#include <system_error>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "function-import"
-
-STATISTIC(NumImportedFunctionsThinLink,
- "Number of functions thin link decided to import");
-STATISTIC(NumImportedHotFunctionsThinLink,
- "Number of hot functions thin link decided to import");
-STATISTIC(NumImportedCriticalFunctionsThinLink,
- "Number of critical functions thin link decided to import");
-STATISTIC(NumImportedGlobalVarsThinLink,
- "Number of global variables thin link decided to import");
-STATISTIC(NumImportedFunctions, "Number of functions imported in backend");
-STATISTIC(NumImportedGlobalVars,
- "Number of global variables imported in backend");
-STATISTIC(NumImportedModules, "Number of modules imported from");
-STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
-STATISTIC(NumLiveSymbols, "Number of live symbols in index");
-
-/// Limit on instruction count of imported functions.
-static cl::opt<unsigned> ImportInstrLimit(
- "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
- cl::desc("Only import functions with less than N instructions"));
-
-static cl::opt<int> ImportCutoff(
- "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
- cl::desc("Only import first N functions if N>=0 (default -1)"));
-
-static cl::opt<float>
- ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
- cl::Hidden, cl::value_desc("x"),
- cl::desc("As we import functions, multiply the "
- "`import-instr-limit` threshold by this factor "
- "before processing newly imported functions"));
-
-static cl::opt<float> ImportHotInstrFactor(
- "import-hot-evolution-factor", cl::init(1.0), cl::Hidden,
- cl::value_desc("x"),
- cl::desc("As we import functions called from hot callsite, multiply the "
- "`import-instr-limit` threshold by this factor "
- "before processing newly imported functions"));
-
-static cl::opt<float> ImportHotMultiplier(
- "import-hot-multiplier", cl::init(10.0), cl::Hidden, cl::value_desc("x"),
- cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
-
-static cl::opt<float> ImportCriticalMultiplier(
- "import-critical-multiplier", cl::init(100.0), cl::Hidden,
- cl::value_desc("x"),
- cl::desc(
- "Multiply the `import-instr-limit` threshold for critical callsites"));
-
-// FIXME: This multiplier was not really tuned up.
-static cl::opt<float> ImportColdMultiplier(
- "import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"),
- cl::desc("Multiply the `import-instr-limit` threshold for cold callsites"));
-
-static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
- cl::desc("Print imported functions"));
-
-static cl::opt<bool> PrintImportFailures(
- "print-import-failures", cl::init(false), cl::Hidden,
- cl::desc("Print information for functions rejected for importing"));
-
-static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
- cl::desc("Compute dead symbols"));
-
-static cl::opt<bool> EnableImportMetadata(
+//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Function import based on summaries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionImport.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Linker/IRMover.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "function-import"
+
+STATISTIC(NumImportedFunctionsThinLink,
+ "Number of functions thin link decided to import");
+STATISTIC(NumImportedHotFunctionsThinLink,
+ "Number of hot functions thin link decided to import");
+STATISTIC(NumImportedCriticalFunctionsThinLink,
+ "Number of critical functions thin link decided to import");
+STATISTIC(NumImportedGlobalVarsThinLink,
+ "Number of global variables thin link decided to import");
+STATISTIC(NumImportedFunctions, "Number of functions imported in backend");
+STATISTIC(NumImportedGlobalVars,
+ "Number of global variables imported in backend");
+STATISTIC(NumImportedModules, "Number of modules imported from");
+STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
+STATISTIC(NumLiveSymbols, "Number of live symbols in index");
+
+/// Limit on instruction count of imported functions.
+static cl::opt<unsigned> ImportInstrLimit(
+ "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Only import functions with less than N instructions"));
+
+static cl::opt<int> ImportCutoff(
+ "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Only import first N functions if N>=0 (default -1)"));
+
+static cl::opt<float>
+ ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
+ cl::Hidden, cl::value_desc("x"),
+ cl::desc("As we import functions, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<float> ImportHotInstrFactor(
+ "import-hot-evolution-factor", cl::init(1.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc("As we import functions called from hot callsite, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<float> ImportHotMultiplier(
+ "import-hot-multiplier", cl::init(10.0), cl::Hidden, cl::value_desc("x"),
+ cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
+
+static cl::opt<float> ImportCriticalMultiplier(
+ "import-critical-multiplier", cl::init(100.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc(
+ "Multiply the `import-instr-limit` threshold for critical callsites"));
+
+// FIXME: This multiplier was not really tuned up.
+static cl::opt<float> ImportColdMultiplier(
+ "import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Multiply the `import-instr-limit` threshold for cold callsites"));
+
+static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
+ cl::desc("Print imported functions"));
+
+static cl::opt<bool> PrintImportFailures(
+ "print-import-failures", cl::init(false), cl::Hidden,
+ cl::desc("Print information for functions rejected for importing"));
+
+static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
+ cl::desc("Compute dead symbols"));
+
+static cl::opt<bool> EnableImportMetadata(
"enable-import-metadata", cl::init(false), cl::Hidden,
cl::desc("Enable import metadata like 'thinlto_src_module'"));
-
-/// Summary file to use for function importing when using -function-import from
-/// the command line.
-static cl::opt<std::string>
- SummaryFile("summary-file",
- cl::desc("The summary file to use for function importing."));
-
-/// Used when testing importing from distributed indexes via opt
-/// -function-import.
-static cl::opt<bool>
- ImportAllIndex("import-all-index",
- cl::desc("Import all external functions in index."));
-
-// Lazily load a module from \p FileName in \p Context.
-static std::unique_ptr<Module> loadFile(const std::string &FileName,
- LLVMContext &Context) {
- SMDiagnostic Err;
- LLVM_DEBUG(dbgs() << "Loading '" << FileName << "'\n");
- // Metadata isn't loaded until functions are imported, to minimize
- // the memory overhead.
- std::unique_ptr<Module> Result =
- getLazyIRFileModule(FileName, Err, Context,
- /* ShouldLazyLoadMetadata = */ true);
- if (!Result) {
- Err.print("function-import", errs());
- report_fatal_error("Abort");
- }
-
- return Result;
-}
-
-/// Given a list of possible callee implementations for a call site, select one
-/// that fits the \p Threshold.
-///
-/// FIXME: select "best" instead of first that fits. But what is "best"?
-/// - The smallest: more likely to be inlined.
-/// - The one with the least outgoing edges (already well optimized).
-/// - One from a module already being imported from in order to reduce the
-/// number of source modules parsed/linked.
-/// - One that has PGO data attached.
-/// - [insert your fancy metric here]
-static const GlobalValueSummary *
-selectCallee(const ModuleSummaryIndex &Index,
- ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
- unsigned Threshold, StringRef CallerModulePath,
- FunctionImporter::ImportFailureReason &Reason,
- GlobalValue::GUID GUID) {
- Reason = FunctionImporter::ImportFailureReason::None;
- auto It = llvm::find_if(
- CalleeSummaryList,
- [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
- auto *GVSummary = SummaryPtr.get();
- if (!Index.isGlobalValueLive(GVSummary)) {
- Reason = FunctionImporter::ImportFailureReason::NotLive;
- return false;
- }
-
- // For SamplePGO, in computeImportForFunction the OriginalId
- // may have been used to locate the callee summary list (See
- // comment there).
- // The mapping from OriginalId to GUID may return a GUID
- // that corresponds to a static variable. Filter it out here.
- // This can happen when
- // 1) There is a call to a library function which is not defined
- // in the index.
- // 2) There is a static variable with the OriginalGUID identical
- // to the GUID of the library function in 1);
- // When this happens, the logic for SamplePGO kicks in and
- // the static variable in 2) will be found, which needs to be
- // filtered out.
- if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) {
- Reason = FunctionImporter::ImportFailureReason::GlobalVar;
- return false;
- }
- if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) {
- Reason = FunctionImporter::ImportFailureReason::InterposableLinkage;
- // There is no point in importing these, we can't inline them
- return false;
- }
-
- auto *Summary = cast<FunctionSummary>(GVSummary->getBaseObject());
-
- // If this is a local function, make sure we import the copy
- // in the caller's module. The only time a local function can
- // share an entry in the index is if there is a local with the same name
- // in another module that had the same source file name (in a different
- // directory), where each was compiled in its own directory so there
- // was no distinguishing path.
- // However, do the import from another module if there is only one
- // entry in the list - in that case this must be a reference due
- // to indirect call profile data, since a function pointer can point to
- // a local in another module.
- if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
- CalleeSummaryList.size() > 1 &&
- Summary->modulePath() != CallerModulePath) {
- Reason =
- FunctionImporter::ImportFailureReason::LocalLinkageNotInModule;
- return false;
- }
-
- if ((Summary->instCount() > Threshold) &&
- !Summary->fflags().AlwaysInline) {
- Reason = FunctionImporter::ImportFailureReason::TooLarge;
- return false;
- }
-
- // Skip if it isn't legal to import (e.g. may reference unpromotable
- // locals).
- if (Summary->notEligibleToImport()) {
- Reason = FunctionImporter::ImportFailureReason::NotEligible;
- return false;
- }
-
- // Don't bother importing if we can't inline it anyway.
- if (Summary->fflags().NoInline) {
- Reason = FunctionImporter::ImportFailureReason::NoInline;
- return false;
- }
-
- return true;
- });
- if (It == CalleeSummaryList.end())
- return nullptr;
-
- return cast<GlobalValueSummary>(It->get());
-}
-
-namespace {
-
+
+/// Summary file to use for function importing when using -function-import from
+/// the command line.
+static cl::opt<std::string>
+ SummaryFile("summary-file",
+ cl::desc("The summary file to use for function importing."));
+
+/// Used when testing importing from distributed indexes via opt
+/// -function-import.
+static cl::opt<bool>
+ ImportAllIndex("import-all-index",
+ cl::desc("Import all external functions in index."));
+
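A small standalone sketch (plain C++, names invented) of how the options declared above combine at their defaults: the base -import-instr-limit of 100 is scaled per callsite, so hot callsites tolerate callees of up to 1000 instructions and critical ones up to 10000.

#include <iostream>

int main() {
  const unsigned ImportInstrLimit = 100;         // -import-instr-limit default
  const float ImportHotMultiplier = 10.0f;       // -import-hot-multiplier default
  const float ImportCriticalMultiplier = 100.0f; // -import-critical-multiplier default

  std::cout << "hot callsite threshold: "
            << ImportInstrLimit * ImportHotMultiplier << "\n"
            << "critical callsite threshold: "
            << ImportInstrLimit * ImportCriticalMultiplier << "\n";
}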
+// Lazily load a module from \p FileName in \p Context.
+static std::unique_ptr<Module> loadFile(const std::string &FileName,
+ LLVMContext &Context) {
+ SMDiagnostic Err;
+ LLVM_DEBUG(dbgs() << "Loading '" << FileName << "'\n");
+ // Metadata isn't loaded until functions are imported, to minimize
+ // the memory overhead.
+ std::unique_ptr<Module> Result =
+ getLazyIRFileModule(FileName, Err, Context,
+ /* ShouldLazyLoadMetadata = */ true);
+ if (!Result) {
+ Err.print("function-import", errs());
+ report_fatal_error("Abort");
+ }
+
+ return Result;
+}
+
+/// Given a list of possible callee implementations for a call site, select one
+/// that fits the \p Threshold.
+///
+/// FIXME: select "best" instead of first that fits. But what is "best"?
+/// - The smallest: more likely to be inlined.
+/// - The one with the least outgoing edges (already well optimized).
+/// - One from a module already being imported from in order to reduce the
+/// number of source modules parsed/linked.
+/// - One that has PGO data attached.
+/// - [insert your fancy metric here]
+static const GlobalValueSummary *
+selectCallee(const ModuleSummaryIndex &Index,
+ ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
+ unsigned Threshold, StringRef CallerModulePath,
+ FunctionImporter::ImportFailureReason &Reason,
+ GlobalValue::GUID GUID) {
+ Reason = FunctionImporter::ImportFailureReason::None;
+ auto It = llvm::find_if(
+ CalleeSummaryList,
+ [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
+ auto *GVSummary = SummaryPtr.get();
+ if (!Index.isGlobalValueLive(GVSummary)) {
+ Reason = FunctionImporter::ImportFailureReason::NotLive;
+ return false;
+ }
+
+ // For SamplePGO, in computeImportForFunction the OriginalId
+ // may have been used to locate the callee summary list (See
+ // comment there).
+ // The mapping from OriginalId to GUID may return a GUID
+ // that corresponds to a static variable. Filter it out here.
+ // This can happen when
+ // 1) There is a call to a library function which is not defined
+ // in the index.
+ // 2) There is a static variable with the OriginalGUID identical
+ // to the GUID of the library function in 1);
+ // When this happens, the logic for SamplePGO kicks in and
+ // the static variable in 2) will be found, which needs to be
+ // filtered out.
+ if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) {
+ Reason = FunctionImporter::ImportFailureReason::GlobalVar;
+ return false;
+ }
+ if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) {
+ Reason = FunctionImporter::ImportFailureReason::InterposableLinkage;
+ // There is no point in importing these, we can't inline them
+ return false;
+ }
+
+ auto *Summary = cast<FunctionSummary>(GVSummary->getBaseObject());
+
+ // If this is a local function, make sure we import the copy
+ // in the caller's module. The only time a local function can
+ // share an entry in the index is if there is a local with the same name
+ // in another module that had the same source file name (in a different
+ // directory), where each was compiled in its own directory so there
+ // was no distinguishing path.
+ // However, do the import from another module if there is only one
+ // entry in the list - in that case this must be a reference due
+ // to indirect call profile data, since a function pointer can point to
+ // a local in another module.
+ if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
+ CalleeSummaryList.size() > 1 &&
+ Summary->modulePath() != CallerModulePath) {
+ Reason =
+ FunctionImporter::ImportFailureReason::LocalLinkageNotInModule;
+ return false;
+ }
+
+ if ((Summary->instCount() > Threshold) &&
+ !Summary->fflags().AlwaysInline) {
+ Reason = FunctionImporter::ImportFailureReason::TooLarge;
+ return false;
+ }
+
+ // Skip if it isn't legal to import (e.g. may reference unpromotable
+ // locals).
+ if (Summary->notEligibleToImport()) {
+ Reason = FunctionImporter::ImportFailureReason::NotEligible;
+ return false;
+ }
+
+ // Don't bother importing if we can't inline it anyway.
+ if (Summary->fflags().NoInline) {
+ Reason = FunctionImporter::ImportFailureReason::NoInline;
+ return false;
+ }
+
+ return true;
+ });
+ if (It == CalleeSummaryList.end())
+ return nullptr;
+
+ return cast<GlobalValueSummary>(It->get());
+}
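A standalone sketch (plain C++, not LLVM code) of the first-that-fits policy the FIXME above describes: candidates are scanned in order and the first summary whose size is within the threshold wins, even if a smaller one appears later in the list.

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  const std::vector<unsigned> CandidateSizes = {250, 80, 40}; // instruction counts
  const unsigned Threshold = 100;

  auto It = std::find_if(CandidateSizes.begin(), CandidateSizes.end(),
                         [&](unsigned Size) { return Size <= Threshold; });
  if (It != CandidateSizes.end())
    std::cout << "selected candidate of size " << *It << "\n"; // prints 80, not 40
}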
+
+namespace {
+
using EdgeInfo =
std::tuple<const GlobalValueSummary *, unsigned /* Threshold */>;
-
-} // anonymous namespace
-
-static ValueInfo
-updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
- if (!VI.getSummaryList().empty())
- return VI;
- // For SamplePGO, the indirect call targets for local functions will
- // have its original name annotated in profile. We try to find the
- // corresponding PGOFuncName as the GUID.
- // FIXME: Consider updating the edges in the graph after building
- // it, rather than needing to perform this mapping on each walk.
- auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
- if (GUID == 0)
- return ValueInfo();
- return Index.getValueInfo(GUID);
-}
-
-static void computeImportForReferencedGlobals(
+
+} // anonymous namespace
+
+static ValueInfo
+updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
+ if (!VI.getSummaryList().empty())
+ return VI;
+ // For SamplePGO, the indirect call targets for local functions will
+ // have its original name annotated in profile. We try to find the
+ // corresponding PGOFuncName as the GUID.
+ // FIXME: Consider updating the edges in the graph after building
+ // it, rather than needing to perform this mapping on each walk.
+ auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
+ if (GUID == 0)
+ return ValueInfo();
+ return Index.getValueInfo(GUID);
+}
+
+static void computeImportForReferencedGlobals(
const GlobalValueSummary &Summary, const ModuleSummaryIndex &Index,
- const GVSummaryMapTy &DefinedGVSummaries,
+ const GVSummaryMapTy &DefinedGVSummaries,
SmallVectorImpl<EdgeInfo> &Worklist,
- FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists) {
- for (auto &VI : Summary.refs()) {
- if (DefinedGVSummaries.count(VI.getGUID())) {
- LLVM_DEBUG(
- dbgs() << "Ref ignored! Target already in destination module.\n");
- continue;
- }
-
- LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
-
- // If this is a local variable, make sure we import the copy
- // in the caller's module. The only time a local variable can
- // share an entry in the index is if there is a local with the same name
- // in another module that had the same source file name (in a different
- // directory), where each was compiled in its own directory so there
- // was no distinguishing path.
- auto LocalNotInModule = [&](const GlobalValueSummary *RefSummary) -> bool {
- return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
- RefSummary->modulePath() != Summary.modulePath();
- };
-
- for (auto &RefSummary : VI.getSummaryList())
- if (isa<GlobalVarSummary>(RefSummary.get()) &&
- Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) &&
- !LocalNotInModule(RefSummary.get())) {
- auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
- // Only update stat and exports if we haven't already imported this
- // variable.
- if (!ILI.second)
- break;
- NumImportedGlobalVarsThinLink++;
- // Any references made by this variable will be marked exported later,
- // in ComputeCrossModuleImport, after import decisions are complete,
- // which is more efficient than adding them here.
- if (ExportLists)
- (*ExportLists)[RefSummary->modulePath()].insert(VI);
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists) {
+ for (auto &VI : Summary.refs()) {
+ if (DefinedGVSummaries.count(VI.getGUID())) {
+ LLVM_DEBUG(
+ dbgs() << "Ref ignored! Target already in destination module.\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
+
+ // If this is a local variable, make sure we import the copy
+ // in the caller's module. The only time a local variable can
+ // share an entry in the index is if there is a local with the same name
+ // in another module that had the same source file name (in a different
+ // directory), where each was compiled in its own directory so there
+ // was no distinguishing path.
+ auto LocalNotInModule = [&](const GlobalValueSummary *RefSummary) -> bool {
+ return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
+ RefSummary->modulePath() != Summary.modulePath();
+ };
+
+ for (auto &RefSummary : VI.getSummaryList())
+ if (isa<GlobalVarSummary>(RefSummary.get()) &&
+ Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) &&
+ !LocalNotInModule(RefSummary.get())) {
+ auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+ // Only update stat and exports if we haven't already imported this
+ // variable.
+ if (!ILI.second)
+ break;
+ NumImportedGlobalVarsThinLink++;
+ // Any references made by this variable will be marked exported later,
+ // in ComputeCrossModuleImport, after import decisions are complete,
+ // which is more efficient than adding them here.
+ if (ExportLists)
+ (*ExportLists)[RefSummary->modulePath()].insert(VI);
// If variable is not writeonly we attempt to recursively analyze
// its references in order to import referenced constants.
if (!Index.isWriteOnly(cast<GlobalVarSummary>(RefSummary.get())))
Worklist.emplace_back(RefSummary.get(), 0);
- break;
- }
- }
-}
-
-static const char *
-getFailureName(FunctionImporter::ImportFailureReason Reason) {
- switch (Reason) {
- case FunctionImporter::ImportFailureReason::None:
- return "None";
- case FunctionImporter::ImportFailureReason::GlobalVar:
- return "GlobalVar";
- case FunctionImporter::ImportFailureReason::NotLive:
- return "NotLive";
- case FunctionImporter::ImportFailureReason::TooLarge:
- return "TooLarge";
- case FunctionImporter::ImportFailureReason::InterposableLinkage:
- return "InterposableLinkage";
- case FunctionImporter::ImportFailureReason::LocalLinkageNotInModule:
- return "LocalLinkageNotInModule";
- case FunctionImporter::ImportFailureReason::NotEligible:
- return "NotEligible";
- case FunctionImporter::ImportFailureReason::NoInline:
- return "NoInline";
- }
- llvm_unreachable("invalid reason");
-}
-
-/// Compute the list of functions to import for a given caller. Mark these
-/// imported functions and the symbols they reference in their source module as
-/// exported from their source module.
-static void computeImportForFunction(
- const FunctionSummary &Summary, const ModuleSummaryIndex &Index,
- const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
- SmallVectorImpl<EdgeInfo> &Worklist,
- FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists,
- FunctionImporter::ImportThresholdsTy &ImportThresholds) {
- computeImportForReferencedGlobals(Summary, Index, DefinedGVSummaries,
+ break;
+ }
+ }
+}
+
+static const char *
+getFailureName(FunctionImporter::ImportFailureReason Reason) {
+ switch (Reason) {
+ case FunctionImporter::ImportFailureReason::None:
+ return "None";
+ case FunctionImporter::ImportFailureReason::GlobalVar:
+ return "GlobalVar";
+ case FunctionImporter::ImportFailureReason::NotLive:
+ return "NotLive";
+ case FunctionImporter::ImportFailureReason::TooLarge:
+ return "TooLarge";
+ case FunctionImporter::ImportFailureReason::InterposableLinkage:
+ return "InterposableLinkage";
+ case FunctionImporter::ImportFailureReason::LocalLinkageNotInModule:
+ return "LocalLinkageNotInModule";
+ case FunctionImporter::ImportFailureReason::NotEligible:
+ return "NotEligible";
+ case FunctionImporter::ImportFailureReason::NoInline:
+ return "NoInline";
+ }
+ llvm_unreachable("invalid reason");
+}
+
+/// Compute the list of functions to import for a given caller. Mark these
+/// imported functions and the symbols they reference in their source module as
+/// exported from their source module.
+static void computeImportForFunction(
+ const FunctionSummary &Summary, const ModuleSummaryIndex &Index,
+ const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
+ SmallVectorImpl<EdgeInfo> &Worklist,
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists,
+ FunctionImporter::ImportThresholdsTy &ImportThresholds) {
+ computeImportForReferencedGlobals(Summary, Index, DefinedGVSummaries,
Worklist, ImportList, ExportLists);
- static int ImportCount = 0;
- for (auto &Edge : Summary.calls()) {
- ValueInfo VI = Edge.first;
- LLVM_DEBUG(dbgs() << " edge -> " << VI << " Threshold:" << Threshold
- << "\n");
-
- if (ImportCutoff >= 0 && ImportCount >= ImportCutoff) {
- LLVM_DEBUG(dbgs() << "ignored! import-cutoff value of " << ImportCutoff
- << " reached.\n");
- continue;
- }
-
- VI = updateValueInfoForIndirectCalls(Index, VI);
- if (!VI)
- continue;
-
- if (DefinedGVSummaries.count(VI.getGUID())) {
- LLVM_DEBUG(dbgs() << "ignored! Target already in destination module.\n");
- continue;
- }
-
- auto GetBonusMultiplier = [](CalleeInfo::HotnessType Hotness) -> float {
- if (Hotness == CalleeInfo::HotnessType::Hot)
- return ImportHotMultiplier;
- if (Hotness == CalleeInfo::HotnessType::Cold)
- return ImportColdMultiplier;
- if (Hotness == CalleeInfo::HotnessType::Critical)
- return ImportCriticalMultiplier;
- return 1.0;
- };
-
- const auto NewThreshold =
- Threshold * GetBonusMultiplier(Edge.second.getHotness());
-
- auto IT = ImportThresholds.insert(std::make_pair(
- VI.getGUID(), std::make_tuple(NewThreshold, nullptr, nullptr)));
- bool PreviouslyVisited = !IT.second;
- auto &ProcessedThreshold = std::get<0>(IT.first->second);
- auto &CalleeSummary = std::get<1>(IT.first->second);
- auto &FailureInfo = std::get<2>(IT.first->second);
-
- bool IsHotCallsite =
- Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
- bool IsCriticalCallsite =
- Edge.second.getHotness() == CalleeInfo::HotnessType::Critical;
-
- const FunctionSummary *ResolvedCalleeSummary = nullptr;
- if (CalleeSummary) {
- assert(PreviouslyVisited);
- // Since the traversal of the call graph is DFS, we can revisit a function
- // a second time with a higher threshold. In this case, it is added back
- // to the worklist with the new threshold (so that its own callee chains
- // can be considered with the higher threshold).
- if (NewThreshold <= ProcessedThreshold) {
- LLVM_DEBUG(
- dbgs() << "ignored! Target was already imported with Threshold "
- << ProcessedThreshold << "\n");
- continue;
- }
- // Update with new larger threshold.
- ProcessedThreshold = NewThreshold;
- ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
- } else {
- // If we already rejected importing a callee at the same or higher
- // threshold, don't waste time calling selectCallee.
- if (PreviouslyVisited && NewThreshold <= ProcessedThreshold) {
- LLVM_DEBUG(
- dbgs() << "ignored! Target was already rejected with Threshold "
- << ProcessedThreshold << "\n");
- if (PrintImportFailures) {
- assert(FailureInfo &&
- "Expected FailureInfo for previously rejected candidate");
- FailureInfo->Attempts++;
- }
- continue;
- }
-
- FunctionImporter::ImportFailureReason Reason;
- CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
- Summary.modulePath(), Reason, VI.getGUID());
- if (!CalleeSummary) {
- // Update with new larger threshold if this was a retry (otherwise
- // we would have already inserted with NewThreshold above). Also
- // update failure info if requested.
- if (PreviouslyVisited) {
- ProcessedThreshold = NewThreshold;
- if (PrintImportFailures) {
- assert(FailureInfo &&
- "Expected FailureInfo for previously rejected candidate");
- FailureInfo->Reason = Reason;
- FailureInfo->Attempts++;
- FailureInfo->MaxHotness =
- std::max(FailureInfo->MaxHotness, Edge.second.getHotness());
- }
- } else if (PrintImportFailures) {
- assert(!FailureInfo &&
- "Expected no FailureInfo for newly rejected candidate");
- FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>(
- VI, Edge.second.getHotness(), Reason, 1);
- }
- LLVM_DEBUG(
- dbgs() << "ignored! No qualifying callee with summary found.\n");
- continue;
- }
-
- // "Resolve" the summary
- CalleeSummary = CalleeSummary->getBaseObject();
- ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
-
- assert((ResolvedCalleeSummary->fflags().AlwaysInline ||
- (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
- "selectCallee() didn't honor the threshold");
-
- auto ExportModulePath = ResolvedCalleeSummary->modulePath();
- auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
- // We previously decided to import this GUID definition if it was already
- // inserted in the set of imports from the exporting module.
- bool PreviouslyImported = !ILI.second;
- if (!PreviouslyImported) {
- NumImportedFunctionsThinLink++;
- if (IsHotCallsite)
- NumImportedHotFunctionsThinLink++;
- if (IsCriticalCallsite)
- NumImportedCriticalFunctionsThinLink++;
- }
-
- // Any calls/references made by this function will be marked exported
- // later, in ComputeCrossModuleImport, after import decisions are
- // complete, which is more efficient than adding them here.
- if (ExportLists)
- (*ExportLists)[ExportModulePath].insert(VI);
- }
-
- auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
- // Adjust the threshold for next level of imported functions.
- // The threshold is different for hot callsites because we can then
- // inline chains of hot calls.
- if (IsHotCallsite)
- return Threshold * ImportHotInstrFactor;
- return Threshold * ImportInstrFactor;
- };
-
- const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
-
- ImportCount++;
-
- // Insert the newly imported function to the worklist.
+ static int ImportCount = 0;
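+ // Running count of accepted import candidates, checked against the
+ // import-cutoff debugging option below to optionally stop importing early.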
+ for (auto &Edge : Summary.calls()) {
+ ValueInfo VI = Edge.first;
+ LLVM_DEBUG(dbgs() << " edge -> " << VI << " Threshold:" << Threshold
+ << "\n");
+
+ if (ImportCutoff >= 0 && ImportCount >= ImportCutoff) {
+ LLVM_DEBUG(dbgs() << "ignored! import-cutoff value of " << ImportCutoff
+ << " reached.\n");
+ continue;
+ }
+
+ VI = updateValueInfoForIndirectCalls(Index, VI);
+ if (!VI)
+ continue;
+
+ if (DefinedGVSummaries.count(VI.getGUID())) {
+ LLVM_DEBUG(dbgs() << "ignored! Target already in destination module.\n");
+ continue;
+ }
+
+ auto GetBonusMultiplier = [](CalleeInfo::HotnessType Hotness) -> float {
+ if (Hotness == CalleeInfo::HotnessType::Hot)
+ return ImportHotMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Cold)
+ return ImportColdMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Critical)
+ return ImportCriticalMultiplier;
+ return 1.0;
+ };
+
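+ // Scale the threshold by the callsite hotness bonus; selectCallee later
+ // checks the callee's instruction count against this threshold.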
+ const auto NewThreshold =
+ Threshold * GetBonusMultiplier(Edge.second.getHotness());
+
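+ // Each cached entry tracks the highest threshold processed so far, the
+ // callee summary once an import has been selected, and failure info for
+ // rejected candidates (used when printing import failures).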
+ auto IT = ImportThresholds.insert(std::make_pair(
+ VI.getGUID(), std::make_tuple(NewThreshold, nullptr, nullptr)));
+ bool PreviouslyVisited = !IT.second;
+ auto &ProcessedThreshold = std::get<0>(IT.first->second);
+ auto &CalleeSummary = std::get<1>(IT.first->second);
+ auto &FailureInfo = std::get<2>(IT.first->second);
+
+ bool IsHotCallsite =
+ Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
+ bool IsCriticalCallsite =
+ Edge.second.getHotness() == CalleeInfo::HotnessType::Critical;
+
+ const FunctionSummary *ResolvedCalleeSummary = nullptr;
+ if (CalleeSummary) {
+ assert(PreviouslyVisited);
+ // Since the traversal of the call graph is DFS, we can revisit a function
+ // a second time with a higher threshold. In this case, it is added back
+ // to the worklist with the new threshold (so that its own callee chains
+ // can be considered with the higher threshold).
+ if (NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already imported with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
+ }
+ // Update with new larger threshold.
+ ProcessedThreshold = NewThreshold;
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+ } else {
+ // If we already rejected importing a callee at the same or higher
+ // threshold, don't waste time calling selectCallee.
+ if (PreviouslyVisited && NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already rejected with Threshold "
+ << ProcessedThreshold << "\n");
+ if (PrintImportFailures) {
+ assert(FailureInfo &&
+ "Expected FailureInfo for previously rejected candidate");
+ FailureInfo->Attempts++;
+ }
+ continue;
+ }
+
+ FunctionImporter::ImportFailureReason Reason;
+ CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
+ Summary.modulePath(), Reason, VI.getGUID());
+ if (!CalleeSummary) {
+ // Update with new larger threshold if this was a retry (otherwise
+ // we would have already inserted with NewThreshold above). Also
+ // update failure info if requested.
+ if (PreviouslyVisited) {
+ ProcessedThreshold = NewThreshold;
+ if (PrintImportFailures) {
+ assert(FailureInfo &&
+ "Expected FailureInfo for previously rejected candidate");
+ FailureInfo->Reason = Reason;
+ FailureInfo->Attempts++;
+ FailureInfo->MaxHotness =
+ std::max(FailureInfo->MaxHotness, Edge.second.getHotness());
+ }
+ } else if (PrintImportFailures) {
+ assert(!FailureInfo &&
+ "Expected no FailureInfo for newly rejected candidate");
+ FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>(
+ VI, Edge.second.getHotness(), Reason, 1);
+ }
+ LLVM_DEBUG(
+ dbgs() << "ignored! No qualifying callee with summary found.\n");
+ continue;
+ }
+
+ // "Resolve" the summary
+ CalleeSummary = CalleeSummary->getBaseObject();
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+
+ assert((ResolvedCalleeSummary->fflags().AlwaysInline ||
+ (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
+ "selectCallee() didn't honor the threshold");
+
+ auto ExportModulePath = ResolvedCalleeSummary->modulePath();
+ auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
+ // We previously decided to import this GUID definition if it was already
+ // inserted in the set of imports from the exporting module.
+ bool PreviouslyImported = !ILI.second;
+ if (!PreviouslyImported) {
+ NumImportedFunctionsThinLink++;
+ if (IsHotCallsite)
+ NumImportedHotFunctionsThinLink++;
+ if (IsCriticalCallsite)
+ NumImportedCriticalFunctionsThinLink++;
+ }
+
+ // Any calls/references made by this function will be marked exported
+ // later, in ComputeCrossModuleImport, after import decisions are
+ // complete, which is more efficient than adding them here.
+ if (ExportLists)
+ (*ExportLists)[ExportModulePath].insert(VI);
+ }
+
+ auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
+ // Adjust the threshold for next level of imported functions.
+ // The threshold is different for hot callsites because we can then
+ // inline chains of hot calls.
+ if (IsHotCallsite)
+ return Threshold * ImportHotInstrFactor;
+ return Threshold * ImportInstrFactor;
+ };
+
+ const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
+
+ ImportCount++;
+
+ // Insert the newly imported function to the worklist.
Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold);
- }
-}
-
-/// Given the list of globals defined in a module, compute the list of imports
-/// as well as the list of "exports", i.e. the list of symbols referenced from
-/// another module (that may require promotion).
-static void ComputeImportForModule(
- const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
- StringRef ModName, FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
- // Worklist contains the list of functions imported in this module, for which
- // we will analyse the callees and may import further down the call graph.
- SmallVector<EdgeInfo, 128> Worklist;
- FunctionImporter::ImportThresholdsTy ImportThresholds;
-
- // Populate the worklist with the import for the functions in the current
- // module
- for (auto &GVSummary : DefinedGVSummaries) {
-#ifndef NDEBUG
- // FIXME: Change the GVSummaryMapTy to hold ValueInfo instead of GUID
- // so this map lookup (and possibly others) can be avoided.
- auto VI = Index.getValueInfo(GVSummary.first);
-#endif
- if (!Index.isGlobalValueLive(GVSummary.second)) {
- LLVM_DEBUG(dbgs() << "Ignores Dead GUID: " << VI << "\n");
- continue;
- }
- auto *FuncSummary =
- dyn_cast<FunctionSummary>(GVSummary.second->getBaseObject());
- if (!FuncSummary)
- // Skip import for global variables
- continue;
- LLVM_DEBUG(dbgs() << "Initialize import for " << VI << "\n");
- computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
- DefinedGVSummaries, Worklist, ImportList,
- ExportLists, ImportThresholds);
- }
-
- // Process the newly imported functions and add callees to the worklist.
- while (!Worklist.empty()) {
+ }
+}
+
+/// Given the list of globals defined in a module, compute the list of imports
+/// as well as the list of "exports", i.e. the list of symbols referenced from
+/// another module (that may require promotion).
+static void ComputeImportForModule(
+ const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
+ StringRef ModName, FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ // Worklist contains the list of functions imported in this module, for which
+ // we will analyse the callees and may import further down the call graph.
+ SmallVector<EdgeInfo, 128> Worklist;
+ FunctionImporter::ImportThresholdsTy ImportThresholds;
+
+ // Populate the worklist with the import for the functions in the current
+ // module
+ for (auto &GVSummary : DefinedGVSummaries) {
+#ifndef NDEBUG
+ // FIXME: Change the GVSummaryMapTy to hold ValueInfo instead of GUID
+ // so this map lookup (and possibly others) can be avoided.
+ auto VI = Index.getValueInfo(GVSummary.first);
+#endif
+ if (!Index.isGlobalValueLive(GVSummary.second)) {
+ LLVM_DEBUG(dbgs() << "Ignores Dead GUID: " << VI << "\n");
+ continue;
+ }
+ auto *FuncSummary =
+ dyn_cast<FunctionSummary>(GVSummary.second->getBaseObject());
+ if (!FuncSummary)
+ // Skip import for global variables
+ continue;
+ LLVM_DEBUG(dbgs() << "Initialize import for " << VI << "\n");
+ computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
+ DefinedGVSummaries, Worklist, ImportList,
+ ExportLists, ImportThresholds);
+ }
+
+ // Process the newly imported functions and add callees to the worklist.
+ while (!Worklist.empty()) {
auto GVInfo = Worklist.pop_back_val();
auto *Summary = std::get<0>(GVInfo);
auto Threshold = std::get<1>(GVInfo);
-
+
if (auto *FS = dyn_cast<FunctionSummary>(Summary))
computeImportForFunction(*FS, Index, Threshold, DefinedGVSummaries,
Worklist, ImportList, ExportLists,
@@ -560,823 +560,823 @@ static void ComputeImportForModule(
else
computeImportForReferencedGlobals(*Summary, Index, DefinedGVSummaries,
Worklist, ImportList, ExportLists);
- }
-
- // Print stats about functions considered but rejected for importing
- // when requested.
- if (PrintImportFailures) {
- dbgs() << "Missed imports into module " << ModName << "\n";
- for (auto &I : ImportThresholds) {
- auto &ProcessedThreshold = std::get<0>(I.second);
- auto &CalleeSummary = std::get<1>(I.second);
- auto &FailureInfo = std::get<2>(I.second);
- if (CalleeSummary)
- continue; // We are going to import.
- assert(FailureInfo);
- FunctionSummary *FS = nullptr;
- if (!FailureInfo->VI.getSummaryList().empty())
- FS = dyn_cast<FunctionSummary>(
- FailureInfo->VI.getSummaryList()[0]->getBaseObject());
- dbgs() << FailureInfo->VI
- << ": Reason = " << getFailureName(FailureInfo->Reason)
- << ", Threshold = " << ProcessedThreshold
- << ", Size = " << (FS ? (int)FS->instCount() : -1)
- << ", MaxHotness = " << getHotnessName(FailureInfo->MaxHotness)
- << ", Attempts = " << FailureInfo->Attempts << "\n";
- }
- }
-}
-
-#ifndef NDEBUG
-static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, ValueInfo VI) {
- auto SL = VI.getSummaryList();
- return SL.empty()
- ? false
- : SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind;
-}
-
-static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
- GlobalValue::GUID G) {
- if (const auto &VI = Index.getValueInfo(G))
- return isGlobalVarSummary(Index, VI);
- return false;
-}
-
-template <class T>
-static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
- T &Cont) {
- unsigned NumGVS = 0;
- for (auto &V : Cont)
- if (isGlobalVarSummary(Index, V))
- ++NumGVS;
- return NumGVS;
-}
-#endif
-
-#ifndef NDEBUG
-static bool
-checkVariableImport(const ModuleSummaryIndex &Index,
- StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
-
- DenseSet<GlobalValue::GUID> FlattenedImports;
-
- for (auto &ImportPerModule : ImportLists)
- for (auto &ExportPerModule : ImportPerModule.second)
- FlattenedImports.insert(ExportPerModule.second.begin(),
- ExportPerModule.second.end());
-
- // Checks that all GUIDs of read/writeonly vars we see in export lists
- // are also in the import lists. Otherwise we may face linker undefs,
- // because readonly and writeonly vars are internalized in their
- // source modules.
- auto IsReadOrWriteOnlyVar = [&](StringRef ModulePath, const ValueInfo &VI) {
- auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
- Index.findSummaryInModule(VI, ModulePath));
- return GVS && (Index.isReadOnly(GVS) || Index.isWriteOnly(GVS));
- };
-
- for (auto &ExportPerModule : ExportLists)
- for (auto &VI : ExportPerModule.second)
- if (!FlattenedImports.count(VI.getGUID()) &&
- IsReadOrWriteOnlyVar(ExportPerModule.first(), VI))
- return false;
-
- return true;
-}
-#endif
-
-/// Compute all the import and export for every module using the Index.
-void llvm::ComputeCrossModuleImport(
- const ModuleSummaryIndex &Index,
- const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
- // For each module that has functions defined, compute the import/export lists.
- for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
- auto &ImportList = ImportLists[DefinedGVSummaries.first()];
- LLVM_DEBUG(dbgs() << "Computing import for Module '"
- << DefinedGVSummaries.first() << "'\n");
- ComputeImportForModule(DefinedGVSummaries.second, Index,
- DefinedGVSummaries.first(), ImportList,
- &ExportLists);
- }
-
- // When computing imports we only added the variables and functions being
- // imported to the export list. We also need to mark any references and calls
- // they make as exported as well. We do this here, as it is more efficient
- // since we may import the same values multiple times into different modules
- // during the import computation.
- for (auto &ELI : ExportLists) {
- FunctionImporter::ExportSetTy NewExports;
- const auto &DefinedGVSummaries =
- ModuleToDefinedGVSummaries.lookup(ELI.first());
- for (auto &EI : ELI.second) {
- // Find the copy defined in the exporting module so that we can mark the
- // values it references in that specific definition as exported.
- // Below we will add all references and called values, without regard to
- // whether they are also defined in this module. We subsequently prune the
- // list to only include those defined in the exporting module, see comment
- // there as to why.
- auto DS = DefinedGVSummaries.find(EI.getGUID());
- // Anything marked exported during the import computation must have been
- // defined in the exporting module.
- assert(DS != DefinedGVSummaries.end());
- auto *S = DS->getSecond();
- S = S->getBaseObject();
- if (auto *GVS = dyn_cast<GlobalVarSummary>(S)) {
- // Export referenced functions and variables. We don't export/promote
- // objects referenced by writeonly variable initializer, because
- // we convert such variables initializers to "zeroinitializer".
- // See processGlobalForThinLTO.
- if (!Index.isWriteOnly(GVS))
- for (const auto &VI : GVS->refs())
- NewExports.insert(VI);
- } else {
- auto *FS = cast<FunctionSummary>(S);
- for (auto &Edge : FS->calls())
- NewExports.insert(Edge.first);
- for (auto &Ref : FS->refs())
- NewExports.insert(Ref);
- }
- }
- // Prune list computed above to only include values defined in the exporting
- // module. We do this after the above insertion since we may hit the same
- // ref/call target multiple times in above loop, and it is more efficient to
- // avoid a set lookup each time.
- for (auto EI = NewExports.begin(); EI != NewExports.end();) {
- if (!DefinedGVSummaries.count(EI->getGUID()))
- NewExports.erase(EI++);
- else
- ++EI;
- }
- ELI.second.insert(NewExports.begin(), NewExports.end());
- }
-
- assert(checkVariableImport(Index, ImportLists, ExportLists));
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
- << " modules:\n");
- for (auto &ModuleImports : ImportLists) {
- auto ModName = ModuleImports.first();
- auto &Exports = ExportLists[ModName];
- unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
- LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
- << Exports.size() - NumGVS << " functions and " << NumGVS
- << " vars. Imports from " << ModuleImports.second.size()
- << " modules.\n");
- for (auto &Src : ModuleImports.second) {
- auto SrcModName = Src.first();
- unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
- LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
- << " functions imported from " << SrcModName << "\n");
- LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
- << " global vars imported from " << SrcModName << "\n");
- }
- }
-#endif
-}
-
-#ifndef NDEBUG
-static void dumpImportListForModule(const ModuleSummaryIndex &Index,
- StringRef ModulePath,
- FunctionImporter::ImportMapTy &ImportList) {
- LLVM_DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
- << ImportList.size() << " modules.\n");
- for (auto &Src : ImportList) {
- auto SrcModName = Src.first();
- unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
- LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
- << " functions imported from " << SrcModName << "\n");
- LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
- << SrcModName << "\n");
- }
-}
-#endif
-
-/// Compute all the imports for the given module in the Index.
-void llvm::ComputeCrossModuleImportForModule(
- StringRef ModulePath, const ModuleSummaryIndex &Index,
- FunctionImporter::ImportMapTy &ImportList) {
- // Collect the list of functions this module defines.
- // GUID -> Summary
- GVSummaryMapTy FunctionSummaryMap;
- Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
-
- // Compute the import list for this module.
- LLVM_DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
- ComputeImportForModule(FunctionSummaryMap, Index, ModulePath, ImportList);
-
-#ifndef NDEBUG
- dumpImportListForModule(Index, ModulePath, ImportList);
-#endif
-}
-
-// Mark all external summaries in Index for import into the given module.
-// Used for distributed builds using a distributed index.
-void llvm::ComputeCrossModuleImportForModuleFromIndex(
- StringRef ModulePath, const ModuleSummaryIndex &Index,
- FunctionImporter::ImportMapTy &ImportList) {
- for (auto &GlobalList : Index) {
- // Ignore entries for undefined references.
- if (GlobalList.second.SummaryList.empty())
- continue;
-
- auto GUID = GlobalList.first;
- assert(GlobalList.second.SummaryList.size() == 1 &&
- "Expected individual combined index to have one summary per GUID");
- auto &Summary = GlobalList.second.SummaryList[0];
- // Skip the summaries for the importing module. These are included to
- // e.g. record required linkage changes.
- if (Summary->modulePath() == ModulePath)
- continue;
- // Add an entry to provoke importing by thinBackend.
- ImportList[Summary->modulePath()].insert(GUID);
- }
-#ifndef NDEBUG
- dumpImportListForModule(Index, ModulePath, ImportList);
-#endif
-}
-
-void llvm::computeDeadSymbols(
- ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
- function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) {
- assert(!Index.withGlobalValueDeadStripping());
- if (!ComputeDead)
- return;
- if (GUIDPreservedSymbols.empty())
- // Don't do anything when nothing is live; this is friendly to tests.
- return;
- unsigned LiveSymbols = 0;
- SmallVector<ValueInfo, 128> Worklist;
- Worklist.reserve(GUIDPreservedSymbols.size() * 2);
- for (auto GUID : GUIDPreservedSymbols) {
- ValueInfo VI = Index.getValueInfo(GUID);
- if (!VI)
- continue;
- for (auto &S : VI.getSummaryList())
- S->setLive(true);
- }
-
- // Add values flagged in the index as live roots to the worklist.
- for (const auto &Entry : Index) {
- auto VI = Index.getValueInfo(Entry);
- for (auto &S : Entry.second.SummaryList)
- if (S->isLive()) {
- LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n");
- Worklist.push_back(VI);
- ++LiveSymbols;
- break;
- }
- }
-
- // Make value live and add it to the worklist if it was not live before.
- auto visit = [&](ValueInfo VI, bool IsAliasee) {
- // FIXME: If we knew which edges were created for indirect call profiles,
- // we could skip them here. Any that are live should be reached via
- // other edges, e.g. reference edges. Otherwise, using a profile collected
- // on a slightly different binary might provoke preserving, importing
- // and ultimately promoting calls to functions not linked into this
- // binary, which increases the binary size unnecessarily. Note that
- // if this code changes, the importer needs to change so that edges
- // to functions marked dead are skipped.
- VI = updateValueInfoForIndirectCalls(Index, VI);
- if (!VI)
- return;
-
- if (llvm::any_of(VI.getSummaryList(),
- [](const std::unique_ptr<llvm::GlobalValueSummary> &S) {
- return S->isLive();
- }))
- return;
-
- // We only keep live symbols that are known to be non-prevailing if any are
- // available_externally, linkonceodr, weakodr. Those symbols are discarded
- // later in the EliminateAvailableExternally pass and setting them to
- // not-live could break downstreams users of liveness information (PR36483)
- // or limit optimization opportunities.
- if (isPrevailing(VI.getGUID()) == PrevailingType::No) {
- bool KeepAliveLinkage = false;
- bool Interposable = false;
- for (auto &S : VI.getSummaryList()) {
- if (S->linkage() == GlobalValue::AvailableExternallyLinkage ||
- S->linkage() == GlobalValue::WeakODRLinkage ||
- S->linkage() == GlobalValue::LinkOnceODRLinkage)
- KeepAliveLinkage = true;
- else if (GlobalValue::isInterposableLinkage(S->linkage()))
- Interposable = true;
- }
-
- if (!IsAliasee) {
- if (!KeepAliveLinkage)
- return;
-
- if (Interposable)
- report_fatal_error(
- "Interposable and available_externally/linkonce_odr/weak_odr "
- "symbol");
- }
- }
-
- for (auto &S : VI.getSummaryList())
- S->setLive(true);
- ++LiveSymbols;
- Worklist.push_back(VI);
- };
-
- while (!Worklist.empty()) {
- auto VI = Worklist.pop_back_val();
- for (auto &Summary : VI.getSummaryList()) {
+ }
+
+ // Print stats about functions considered but rejected for importing
+ // when requested.
+ if (PrintImportFailures) {
+ dbgs() << "Missed imports into module " << ModName << "\n";
+ for (auto &I : ImportThresholds) {
+ auto &ProcessedThreshold = std::get<0>(I.second);
+ auto &CalleeSummary = std::get<1>(I.second);
+ auto &FailureInfo = std::get<2>(I.second);
+ if (CalleeSummary)
+ continue; // We are going to import.
+ assert(FailureInfo);
+ FunctionSummary *FS = nullptr;
+ if (!FailureInfo->VI.getSummaryList().empty())
+ FS = dyn_cast<FunctionSummary>(
+ FailureInfo->VI.getSummaryList()[0]->getBaseObject());
+ dbgs() << FailureInfo->VI
+ << ": Reason = " << getFailureName(FailureInfo->Reason)
+ << ", Threshold = " << ProcessedThreshold
+ << ", Size = " << (FS ? (int)FS->instCount() : -1)
+ << ", MaxHotness = " << getHotnessName(FailureInfo->MaxHotness)
+ << ", Attempts = " << FailureInfo->Attempts << "\n";
+ }
+ }
+}
+
+#ifndef NDEBUG
+static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, ValueInfo VI) {
+ auto SL = VI.getSummaryList();
+ return SL.empty()
+ ? false
+ : SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind;
+}
+
+static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
+ GlobalValue::GUID G) {
+ if (const auto &VI = Index.getValueInfo(G))
+ return isGlobalVarSummary(Index, VI);
+ return false;
+}
+
+template <class T>
+static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
+ T &Cont) {
+ unsigned NumGVS = 0;
+ for (auto &V : Cont)
+ if (isGlobalVarSummary(Index, V))
+ ++NumGVS;
+ return NumGVS;
+}
+#endif
+
+#ifndef NDEBUG
+static bool
+checkVariableImport(const ModuleSummaryIndex &Index,
+ StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
+
+ DenseSet<GlobalValue::GUID> FlattenedImports;
+
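+ // Flatten every per-module import list into a single set of GUIDs that are
+ // imported somewhere, so the export check below is a simple set lookup.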
+ for (auto &ImportPerModule : ImportLists)
+ for (auto &ExportPerModule : ImportPerModule.second)
+ FlattenedImports.insert(ExportPerModule.second.begin(),
+ ExportPerModule.second.end());
+
+ // Checks that all GUIDs of read/writeonly vars we see in export lists
+ // are also in the import lists. Otherwise we may face linker undefs,
+ // because readonly and writeonly vars are internalized in their
+ // source modules.
+ auto IsReadOrWriteOnlyVar = [&](StringRef ModulePath, const ValueInfo &VI) {
+ auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
+ Index.findSummaryInModule(VI, ModulePath));
+ return GVS && (Index.isReadOnly(GVS) || Index.isWriteOnly(GVS));
+ };
+
+ for (auto &ExportPerModule : ExportLists)
+ for (auto &VI : ExportPerModule.second)
+ if (!FlattenedImports.count(VI.getGUID()) &&
+ IsReadOrWriteOnlyVar(ExportPerModule.first(), VI))
+ return false;
+
+ return true;
+}
+#endif
+
+/// Compute all the import and export for every module using the Index.
+void llvm::ComputeCrossModuleImport(
+ const ModuleSummaryIndex &Index,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
+ // For each module that has functions defined, compute the import/export lists.
+ for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
+ auto &ImportList = ImportLists[DefinedGVSummaries.first()];
+ LLVM_DEBUG(dbgs() << "Computing import for Module '"
+ << DefinedGVSummaries.first() << "'\n");
+ ComputeImportForModule(DefinedGVSummaries.second, Index,
+ DefinedGVSummaries.first(), ImportList,
+ &ExportLists);
+ }
+
+ // When computing imports we only added the variables and functions being
+ // imported to the export list. We also need to mark any references and calls
+ // they make as exported as well. We do this here, as it is more efficient
+ // since we may import the same values multiple times into different modules
+ // during the import computation.
+ for (auto &ELI : ExportLists) {
+ FunctionImporter::ExportSetTy NewExports;
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ELI.first());
+ for (auto &EI : ELI.second) {
+ // Find the copy defined in the exporting module so that we can mark the
+ // values it references in that specific definition as exported.
+ // Below we will add all references and called values, without regard to
+ // whether they are also defined in this module. We subsequently prune the
+ // list to only include those defined in the exporting module, see comment
+ // there as to why.
+ auto DS = DefinedGVSummaries.find(EI.getGUID());
+ // Anything marked exported during the import computation must have been
+ // defined in the exporting module.
+ assert(DS != DefinedGVSummaries.end());
+ auto *S = DS->getSecond();
+ S = S->getBaseObject();
+ if (auto *GVS = dyn_cast<GlobalVarSummary>(S)) {
+ // Export referenced functions and variables. We don't export/promote
+ // objects referenced by writeonly variable initializer, because
+ // we convert such variables initializers to "zeroinitializer".
+ // See processGlobalForThinLTO.
+ if (!Index.isWriteOnly(GVS))
+ for (const auto &VI : GVS->refs())
+ NewExports.insert(VI);
+ } else {
+ auto *FS = cast<FunctionSummary>(S);
+ for (auto &Edge : FS->calls())
+ NewExports.insert(Edge.first);
+ for (auto &Ref : FS->refs())
+ NewExports.insert(Ref);
+ }
+ }
+ // Prune list computed above to only include values defined in the exporting
+ // module. We do this after the above insertion since we may hit the same
+ // ref/call target multiple times in above loop, and it is more efficient to
+ // avoid a set lookup each time.
+ for (auto EI = NewExports.begin(); EI != NewExports.end();) {
+ if (!DefinedGVSummaries.count(EI->getGUID()))
+ NewExports.erase(EI++);
+ else
+ ++EI;
+ }
+ ELI.second.insert(NewExports.begin(), NewExports.end());
+ }
+
+ assert(checkVariableImport(Index, ImportLists, ExportLists));
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
+ << " modules:\n");
+ for (auto &ModuleImports : ImportLists) {
+ auto ModName = ModuleImports.first();
+ auto &Exports = ExportLists[ModName];
+ unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
+ LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
+ << Exports.size() - NumGVS << " functions and " << NumGVS
+ << " vars. Imports from " << ModuleImports.second.size()
+ << " modules.\n");
+ for (auto &Src : ModuleImports.second) {
+ auto SrcModName = Src.first();
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
+ << " global vars imported from " << SrcModName << "\n");
+ }
+ }
+#endif
+}
+
+#ifndef NDEBUG
+static void dumpImportListForModule(const ModuleSummaryIndex &Index,
+ StringRef ModulePath,
+ FunctionImporter::ImportMapTy &ImportList) {
+ LLVM_DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
+ << ImportList.size() << " modules.\n");
+ for (auto &Src : ImportList) {
+ auto SrcModName = Src.first();
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
+ << SrcModName << "\n");
+ }
+}
+#endif
+
+/// Compute all the imports for the given module in the Index.
+void llvm::ComputeCrossModuleImportForModule(
+ StringRef ModulePath, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportList) {
+ // Collect the list of functions this module defines.
+ // GUID -> Summary
+ GVSummaryMapTy FunctionSummaryMap;
+ Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
+
+ // Compute the import list for this module.
+ LLVM_DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
+ ComputeImportForModule(FunctionSummaryMap, Index, ModulePath, ImportList);
+
+#ifndef NDEBUG
+ dumpImportListForModule(Index, ModulePath, ImportList);
+#endif
+}
+
+// Mark all external summaries in Index for import into the given module.
+// Used for distributed builds using a distributed index.
+void llvm::ComputeCrossModuleImportForModuleFromIndex(
+ StringRef ModulePath, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportList) {
+ for (auto &GlobalList : Index) {
+ // Ignore entries for undefined references.
+ if (GlobalList.second.SummaryList.empty())
+ continue;
+
+ auto GUID = GlobalList.first;
+ assert(GlobalList.second.SummaryList.size() == 1 &&
+ "Expected individual combined index to have one summary per GUID");
+ auto &Summary = GlobalList.second.SummaryList[0];
+ // Skip the summaries for the importing module. These are included to
+ // e.g. record required linkage changes.
+ if (Summary->modulePath() == ModulePath)
+ continue;
+ // Add an entry to provoke importing by thinBackend.
+ ImportList[Summary->modulePath()].insert(GUID);
+ }
+#ifndef NDEBUG
+ dumpImportListForModule(Index, ModulePath, ImportList);
+#endif
+}
+
+void llvm::computeDeadSymbols(
+ ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) {
+ assert(!Index.withGlobalValueDeadStripping());
+ if (!ComputeDead)
+ return;
+ if (GUIDPreservedSymbols.empty())
+ // Don't do anything when nothing is live; this is friendly to tests.
+ return;
+ unsigned LiveSymbols = 0;
+ SmallVector<ValueInfo, 128> Worklist;
+ Worklist.reserve(GUIDPreservedSymbols.size() * 2);
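+ // Mark every copy of the preserved symbols live; together with the live
+ // roots gathered below, they seed the liveness traversal.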
+ for (auto GUID : GUIDPreservedSymbols) {
+ ValueInfo VI = Index.getValueInfo(GUID);
+ if (!VI)
+ continue;
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ }
+
+ // Add values flagged in the index as live roots to the worklist.
+ for (const auto &Entry : Index) {
+ auto VI = Index.getValueInfo(Entry);
+ for (auto &S : Entry.second.SummaryList)
+ if (S->isLive()) {
+ LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n");
+ Worklist.push_back(VI);
+ ++LiveSymbols;
+ break;
+ }
+ }
+
+ // Make value live and add it to the worklist if it was not live before.
+ auto visit = [&](ValueInfo VI, bool IsAliasee) {
+ // FIXME: If we knew which edges were created for indirect call profiles,
+ // we could skip them here. Any that are live should be reached via
+ // other edges, e.g. reference edges. Otherwise, using a profile collected
+ // on a slightly different binary might provoke preserving, importing
+ // and ultimately promoting calls to functions not linked into this
+ // binary, which increases the binary size unnecessarily. Note that
+ // if this code changes, the importer needs to change so that edges
+ // to functions marked dead are skipped.
+ VI = updateValueInfoForIndirectCalls(Index, VI);
+ if (!VI)
+ return;
+
+ if (llvm::any_of(VI.getSummaryList(),
+ [](const std::unique_ptr<llvm::GlobalValueSummary> &S) {
+ return S->isLive();
+ }))
+ return;
+
+ // We only keep live symbols that are known to be non-prevailing if any are
+ // available_externally, linkonceodr, weakodr. Those symbols are discarded
+ // later in the EliminateAvailableExternally pass and setting them to
+ // not-live could break downstreams users of liveness information (PR36483)
+ // or limit optimization opportunities.
+ if (isPrevailing(VI.getGUID()) == PrevailingType::No) {
+ bool KeepAliveLinkage = false;
+ bool Interposable = false;
+ for (auto &S : VI.getSummaryList()) {
+ if (S->linkage() == GlobalValue::AvailableExternallyLinkage ||
+ S->linkage() == GlobalValue::WeakODRLinkage ||
+ S->linkage() == GlobalValue::LinkOnceODRLinkage)
+ KeepAliveLinkage = true;
+ else if (GlobalValue::isInterposableLinkage(S->linkage()))
+ Interposable = true;
+ }
+
+ if (!IsAliasee) {
+ if (!KeepAliveLinkage)
+ return;
+
+ if (Interposable)
+ report_fatal_error(
+ "Interposable and available_externally/linkonce_odr/weak_odr "
+ "symbol");
+ }
+ }
+
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ ++LiveSymbols;
+ Worklist.push_back(VI);
+ };
+
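+ // Transitively propagate liveness: walk references, calls and aliasees from
+ // the live roots, marking every summary reached.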
+ while (!Worklist.empty()) {
+ auto VI = Worklist.pop_back_val();
+ for (auto &Summary : VI.getSummaryList()) {
Summary->setLive(true);
- if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
- // If this is an alias, visit the aliasee VI to ensure that all copies
- // are marked live and it is added to the worklist for further
- // processing of its references.
- visit(AS->getAliaseeVI(), true);
- continue;
- }
- for (auto Ref : Summary->refs())
- visit(Ref, false);
- if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
- for (auto Call : FS->calls())
- visit(Call.first, false);
- }
- }
- Index.setWithGlobalValueDeadStripping();
-
- unsigned DeadSymbols = Index.size() - LiveSymbols;
- LLVM_DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
- << " symbols Dead \n");
- NumDeadSymbols += DeadSymbols;
- NumLiveSymbols += LiveSymbols;
-}
-
-// Compute dead symbols and propagate constants in combined index.
-void llvm::computeDeadSymbolsWithConstProp(
- ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
- function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
- bool ImportEnabled) {
- computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
- if (ImportEnabled)
- Index.propagateAttributes(GUIDPreservedSymbols);
-}
-
-/// Compute the set of summaries needed for a ThinLTO backend compilation of
-/// \p ModulePath.
-void llvm::gatherImportedSummariesForModule(
- StringRef ModulePath,
- const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- const FunctionImporter::ImportMapTy &ImportList,
- std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
- // Include all summaries from the importing module.
- ModuleToSummariesForIndex[std::string(ModulePath)] =
- ModuleToDefinedGVSummaries.lookup(ModulePath);
- // Include summaries for imports.
- for (auto &ILI : ImportList) {
- auto &SummariesForIndex =
- ModuleToSummariesForIndex[std::string(ILI.first())];
- const auto &DefinedGVSummaries =
- ModuleToDefinedGVSummaries.lookup(ILI.first());
- for (auto &GI : ILI.second) {
- const auto &DS = DefinedGVSummaries.find(GI);
- assert(DS != DefinedGVSummaries.end() &&
- "Expected a defined summary for imported global value");
- SummariesForIndex[GI] = DS->second;
- }
- }
-}
-
-/// Emit the files \p ModulePath will import from into \p OutputFilename.
-std::error_code llvm::EmitImportsFiles(
- StringRef ModulePath, StringRef OutputFilename,
- const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
- std::error_code EC;
- raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
- if (EC)
- return EC;
- for (auto &ILI : ModuleToSummariesForIndex)
- // The ModuleToSummariesForIndex map includes an entry for the current
- // Module (needed for writing out the index files). We don't want to
- // include it in the imports file, however, so filter it out.
- if (ILI.first != ModulePath)
- ImportsOS << ILI.first << "\n";
- return std::error_code();
-}
-
-bool llvm::convertToDeclaration(GlobalValue &GV) {
- LLVM_DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName()
- << "\n");
- if (Function *F = dyn_cast<Function>(&GV)) {
- F->deleteBody();
- F->clearMetadata();
- F->setComdat(nullptr);
- } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
- V->setInitializer(nullptr);
- V->setLinkage(GlobalValue::ExternalLinkage);
- V->clearMetadata();
- V->setComdat(nullptr);
- } else {
- GlobalValue *NewGV;
- if (GV.getValueType()->isFunctionTy())
- NewGV =
- Function::Create(cast<FunctionType>(GV.getValueType()),
- GlobalValue::ExternalLinkage, GV.getAddressSpace(),
- "", GV.getParent());
- else
- NewGV =
- new GlobalVariable(*GV.getParent(), GV.getValueType(),
- /*isConstant*/ false, GlobalValue::ExternalLinkage,
- /*init*/ nullptr, "",
- /*insertbefore*/ nullptr, GV.getThreadLocalMode(),
- GV.getType()->getAddressSpace());
- NewGV->takeName(&GV);
- GV.replaceAllUsesWith(NewGV);
- return false;
- }
- if (!GV.isImplicitDSOLocal())
- GV.setDSOLocal(false);
- return true;
-}
-
-/// Fixup prevailing symbol linkages in \p TheModule based on summary analysis.
-void llvm::thinLTOResolvePrevailingInModule(
- Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
- auto updateLinkage = [&](GlobalValue &GV) {
- // See if the global summary analysis computed a new resolved linkage.
- const auto &GS = DefinedGlobals.find(GV.getGUID());
- if (GS == DefinedGlobals.end())
- return;
- auto NewLinkage = GS->second->linkage();
- if (NewLinkage == GV.getLinkage())
- return;
- if (GlobalValue::isLocalLinkage(GV.getLinkage()) ||
- // Don't internalize anything here, because the code below
- // lacks necessary correctness checks. Leave this job to
- // the LLVM 'internalize' pass.
- GlobalValue::isLocalLinkage(NewLinkage) ||
- // In case it was dead and already converted to declaration.
- GV.isDeclaration())
- return;
-
- // Check for a non-prevailing def that has interposable linkage
- // (e.g. non-odr weak or linkonce). In that case we can't simply
- // convert to available_externally, since it would lose the
- // interposable property and possibly get inlined. Simply drop
- // the definition in that case.
- if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
- GlobalValue::isInterposableLinkage(GV.getLinkage())) {
- if (!convertToDeclaration(GV))
- // FIXME: Change this to collect replaced GVs and later erase
- // them from the parent module once thinLTOResolvePrevailingGUID is
- // changed to enable this for aliases.
- llvm_unreachable("Expected GV to be converted");
- } else {
- // If all copies of the original symbol had global unnamed addr and
- // linkonce_odr linkage, it should be an auto hide symbol. In that case
- // the thin link would have marked it as CanAutoHide. Add hidden visibility
- // to the symbol to preserve the property.
- if (NewLinkage == GlobalValue::WeakODRLinkage &&
- GS->second->canAutoHide()) {
- assert(GV.hasLinkOnceODRLinkage() && GV.hasGlobalUnnamedAddr());
- GV.setVisibility(GlobalValue::HiddenVisibility);
- }
-
- LLVM_DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName()
- << "` from " << GV.getLinkage() << " to " << NewLinkage
- << "\n");
- GV.setLinkage(NewLinkage);
- }
- // Remove declarations from comdats, including available_externally
- // as this is a declaration for the linker, and will be dropped eventually.
- // It is illegal for comdats to contain declarations.
- auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
- if (GO && GO->isDeclarationForLinker() && GO->hasComdat())
- GO->setComdat(nullptr);
- };
-
- // Process functions and globals now
- for (auto &GV : TheModule)
- updateLinkage(GV);
- for (auto &GV : TheModule.globals())
- updateLinkage(GV);
- for (auto &GV : TheModule.aliases())
- updateLinkage(GV);
-}
-
-/// Run internalization on \p TheModule based on summary analysis.
-void llvm::thinLTOInternalizeModule(Module &TheModule,
- const GVSummaryMapTy &DefinedGlobals) {
- // Declare a callback for the internalize pass that will ask for every
- // candidate GlobalValue if it can be internalized or not.
- auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
- // Lookup the linkage recorded in the summaries during global analysis.
- auto GS = DefinedGlobals.find(GV.getGUID());
- if (GS == DefinedGlobals.end()) {
- // Must have been promoted (possibly conservatively). Find original
- // name so that we can access the correct summary and see if it can
- // be internalized again.
- // FIXME: Eventually we should control promotion instead of promoting
- // and internalizing again.
- StringRef OrigName =
- ModuleSummaryIndex::getOriginalNameBeforePromote(GV.getName());
- std::string OrigId = GlobalValue::getGlobalIdentifier(
- OrigName, GlobalValue::InternalLinkage,
- TheModule.getSourceFileName());
- GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
- if (GS == DefinedGlobals.end()) {
- // Also check the original non-promoted non-globalized name. In some
- // cases a preempted weak value is linked in as a local copy because
- // it is referenced by an alias (IRLinker::linkGlobalValueProto).
- // In that case, since it was originally not a local value, it was
- // recorded in the index using the original name.
- // FIXME: This may not be needed once PR27866 is fixed.
- GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
- assert(GS != DefinedGlobals.end());
- }
- }
- return !GlobalValue::isLocalLinkage(GS->second->linkage());
- };
-
- // FIXME: See if we can just internalize directly here via linkage changes
- // based on the index, rather than invoking internalizeModule.
- internalizeModule(TheModule, MustPreserveGV);
-}
-
-/// Make alias a clone of its aliasee.
-static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
- Function *Fn = cast<Function>(GA->getBaseObject());
-
- ValueToValueMapTy VMap;
- Function *NewFn = CloneFunction(Fn, VMap);
- // Clone should use the original alias's linkage, visibility and name, and we
- // ensure all uses of alias instead use the new clone (casted if necessary).
- NewFn->setLinkage(GA->getLinkage());
- NewFn->setVisibility(GA->getVisibility());
- GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewFn, GA->getType()));
- NewFn->takeName(GA);
- return NewFn;
-}
-
-// Internalize values that we marked with specific attribute
-// in processGlobalForThinLTO.
-static void internalizeGVsAfterImport(Module &M) {
- for (auto &GV : M.globals())
- // Skip GVs which have been converted to declarations
- // by dropDeadSymbols.
- if (!GV.isDeclaration() && GV.hasAttribute("thinlto-internalize")) {
- GV.setLinkage(GlobalValue::InternalLinkage);
- GV.setVisibility(GlobalValue::DefaultVisibility);
- }
-}
-
-// Automatically import functions in Module \p DestModule based on the summaries
-// index.
-Expected<bool> FunctionImporter::importFunctions(
- Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
- LLVM_DEBUG(dbgs() << "Starting import for Module "
- << DestModule.getModuleIdentifier() << "\n");
- unsigned ImportedCount = 0, ImportedGVCount = 0;
-
- IRMover Mover(DestModule);
- // Do the actual import of functions now, one Module at a time
- std::set<StringRef> ModuleNameOrderedList;
- for (auto &FunctionsToImportPerModule : ImportList) {
- ModuleNameOrderedList.insert(FunctionsToImportPerModule.first());
- }
- for (auto &Name : ModuleNameOrderedList) {
- // Get the module for the import
- const auto &FunctionsToImportPerModule = ImportList.find(Name);
- assert(FunctionsToImportPerModule != ImportList.end());
- Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(Name);
- if (!SrcModuleOrErr)
- return SrcModuleOrErr.takeError();
- std::unique_ptr<Module> SrcModule = std::move(*SrcModuleOrErr);
- assert(&DestModule.getContext() == &SrcModule->getContext() &&
- "Context mismatch");
-
- // If modules were created with lazy metadata loading, materialize it
- // now, before linking it (otherwise this will be a noop).
- if (Error Err = SrcModule->materializeMetadata())
- return std::move(Err);
-
- auto &ImportGUIDs = FunctionsToImportPerModule->second;
- // Find the globals to import
- SetVector<GlobalValue *> GlobalsToImport;
- for (Function &F : *SrcModule) {
- if (!F.hasName())
- continue;
- auto GUID = F.getGUID();
- auto Import = ImportGUIDs.count(GUID);
- LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
- << GUID << " " << F.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (Import) {
- if (Error Err = F.materialize())
- return std::move(Err);
- if (EnableImportMetadata) {
- // Add 'thinlto_src_module' metadata for statistics and debugging.
- F.setMetadata(
- "thinlto_src_module",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getSourceFileName())}));
- }
- GlobalsToImport.insert(&F);
- }
- }
- for (GlobalVariable &GV : SrcModule->globals()) {
- if (!GV.hasName())
- continue;
- auto GUID = GV.getGUID();
- auto Import = ImportGUIDs.count(GUID);
- LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
- << GUID << " " << GV.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (Import) {
- if (Error Err = GV.materialize())
- return std::move(Err);
- ImportedGVCount += GlobalsToImport.insert(&GV);
- }
- }
- for (GlobalAlias &GA : SrcModule->aliases()) {
- if (!GA.hasName())
- continue;
- auto GUID = GA.getGUID();
- auto Import = ImportGUIDs.count(GUID);
- LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
- << GUID << " " << GA.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (Import) {
- if (Error Err = GA.materialize())
- return std::move(Err);
- // Import alias as a copy of its aliasee.
- GlobalObject *Base = GA.getBaseObject();
- if (Error Err = Base->materialize())
- return std::move(Err);
- auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
- LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
- << " " << Base->getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (EnableImportMetadata) {
- // Add 'thinlto_src_module' metadata for statistics and debugging.
- Fn->setMetadata(
- "thinlto_src_module",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getSourceFileName())}));
- }
- GlobalsToImport.insert(Fn);
- }
- }
-
- // Upgrade debug info after we're done materializing all the globals and we
- // have loaded all the required metadata!
- UpgradeDebugInfo(*SrcModule);
-
- // Set the partial sample profile ratio in the profile summary module flag
- // of the imported source module, if applicable, so that the profile summary
- // module flag will match with that of the destination module when it's
- // imported.
- SrcModule->setPartialSampleProfileRatio(Index);
-
- // Link in the specified functions.
- if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
- &GlobalsToImport))
- return true;
-
- if (PrintImports) {
- for (const auto *GV : GlobalsToImport)
- dbgs() << DestModule.getSourceFileName() << ": Import " << GV->getName()
- << " from " << SrcModule->getSourceFileName() << "\n";
- }
-
- if (Error Err = Mover.move(
- std::move(SrcModule), GlobalsToImport.getArrayRef(),
- [](GlobalValue &, IRMover::ValueAdder) {},
- /*IsPerformingImport=*/true))
- report_fatal_error("Function Import: link error: " +
- toString(std::move(Err)));
-
- ImportedCount += GlobalsToImport.size();
- NumImportedModules++;
- }
-
- internalizeGVsAfterImport(DestModule);
-
- NumImportedFunctions += (ImportedCount - ImportedGVCount);
- NumImportedGlobalVars += ImportedGVCount;
-
- LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
- << " functions for Module "
- << DestModule.getModuleIdentifier() << "\n");
- LLVM_DEBUG(dbgs() << "Imported " << ImportedGVCount
- << " global variables for Module "
- << DestModule.getModuleIdentifier() << "\n");
- return ImportedCount;
-}
-
-static bool doImportingForModule(Module &M) {
- if (SummaryFile.empty())
- report_fatal_error("error: -function-import requires -summary-file\n");
- Expected<std::unique_ptr<ModuleSummaryIndex>> IndexPtrOrErr =
- getModuleSummaryIndexForFile(SummaryFile);
- if (!IndexPtrOrErr) {
- logAllUnhandledErrors(IndexPtrOrErr.takeError(), errs(),
- "Error loading file '" + SummaryFile + "': ");
- return false;
- }
- std::unique_ptr<ModuleSummaryIndex> Index = std::move(*IndexPtrOrErr);
-
- // First step is collecting the import list.
- FunctionImporter::ImportMapTy ImportList;
- // If requested, simply import all functions in the index. This is used
- // when testing distributed backend handling via the opt tool, when
- // we have distributed indexes containing exactly the summaries to import.
- if (ImportAllIndex)
- ComputeCrossModuleImportForModuleFromIndex(M.getModuleIdentifier(), *Index,
- ImportList);
- else
- ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
- ImportList);
-
- // Conservatively mark all internal values as promoted. This interface is
- // only used when doing importing via the function importing pass. The pass
- // is only enabled when testing importing via the 'opt' tool, which does
- // not do the ThinLink that would normally determine what values to promote.
- for (auto &I : *Index) {
- for (auto &S : I.second.SummaryList) {
- if (GlobalValue::isLocalLinkage(S->linkage()))
- S->setLinkage(GlobalValue::ExternalLinkage);
- }
- }
-
- // Next we need to promote to global scope and rename any local values that
- // are potentially exported to other modules.
+ if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
+ // If this is an alias, visit the aliasee VI to ensure that all copies
+ // are marked live and it is added to the worklist for further
+ // processing of its references.
+ visit(AS->getAliaseeVI(), true);
+ continue;
+ }
+ for (auto Ref : Summary->refs())
+ visit(Ref, false);
+ if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
+ for (auto Call : FS->calls())
+ visit(Call.first, false);
+ }
+ }
+ Index.setWithGlobalValueDeadStripping();
+
+ unsigned DeadSymbols = Index.size() - LiveSymbols;
+ LLVM_DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
+ << " symbols Dead \n");
+ NumDeadSymbols += DeadSymbols;
+ NumLiveSymbols += LiveSymbols;
+}
+
+// Compute dead symbols and propagate constants in combined index.
+void llvm::computeDeadSymbolsWithConstProp(
+ ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
+ bool ImportEnabled) {
+ computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
+ if (ImportEnabled)
+ Index.propagateAttributes(GUIDPreservedSymbols);
+}
+
+/// Compute the set of summaries needed for a ThinLTO backend compilation of
+/// \p ModulePath.
+void llvm::gatherImportedSummariesForModule(
+ StringRef ModulePath,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ const FunctionImporter::ImportMapTy &ImportList,
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ // Include all summaries from the importing module.
+ ModuleToSummariesForIndex[std::string(ModulePath)] =
+ ModuleToDefinedGVSummaries.lookup(ModulePath);
+ // Include summaries for imports.
+ for (auto &ILI : ImportList) {
+ auto &SummariesForIndex =
+ ModuleToSummariesForIndex[std::string(ILI.first())];
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ILI.first());
+ for (auto &GI : ILI.second) {
+ const auto &DS = DefinedGVSummaries.find(GI);
+ assert(DS != DefinedGVSummaries.end() &&
+ "Expected a defined summary for imported global value");
+ SummariesForIndex[GI] = DS->second;
+ }
+ }
+}
+
+/// Emit the files \p ModulePath will import from into \p OutputFilename.
+std::error_code llvm::EmitImportsFiles(
+ StringRef ModulePath, StringRef OutputFilename,
+ const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ std::error_code EC;
+ raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
+ if (EC)
+ return EC;
+ for (auto &ILI : ModuleToSummariesForIndex)
+ // The ModuleToSummariesForIndex map includes an entry for the current
+ // Module (needed for writing out the index files). We don't want to
+ // include it in the imports file, however, so filter it out.
+ if (ILI.first != ModulePath)
+ ImportsOS << ILI.first << "\n";
+ return std::error_code();
+}
+
+bool llvm::convertToDeclaration(GlobalValue &GV) {
+ LLVM_DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName()
+ << "\n");
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ F->clearMetadata();
+ F->setComdat(nullptr);
+ } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ V->setInitializer(nullptr);
+ V->setLinkage(GlobalValue::ExternalLinkage);
+ V->clearMetadata();
+ V->setComdat(nullptr);
+ } else {
+ GlobalValue *NewGV;
+ if (GV.getValueType()->isFunctionTy())
+ NewGV =
+ Function::Create(cast<FunctionType>(GV.getValueType()),
+ GlobalValue::ExternalLinkage, GV.getAddressSpace(),
+ "", GV.getParent());
+ else
+ NewGV =
+ new GlobalVariable(*GV.getParent(), GV.getValueType(),
+ /*isConstant*/ false, GlobalValue::ExternalLinkage,
+ /*init*/ nullptr, "",
+ /*insertbefore*/ nullptr, GV.getThreadLocalMode(),
+ GV.getType()->getAddressSpace());
+ NewGV->takeName(&GV);
+ GV.replaceAllUsesWith(NewGV);
+ return false;
+ }
+ if (!GV.isImplicitDSOLocal())
+ GV.setDSOLocal(false);
+ return true;
+}
+
+/// Fixup prevailing symbol linkages in \p TheModule based on summary analysis.
+void llvm::thinLTOResolvePrevailingInModule(
+ Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
+ auto updateLinkage = [&](GlobalValue &GV) {
+ // See if the global summary analysis computed a new resolved linkage.
+ const auto &GS = DefinedGlobals.find(GV.getGUID());
+ if (GS == DefinedGlobals.end())
+ return;
+ auto NewLinkage = GS->second->linkage();
+ if (NewLinkage == GV.getLinkage())
+ return;
+ if (GlobalValue::isLocalLinkage(GV.getLinkage()) ||
+ // Don't internalize anything here, because the code below
+ // lacks necessary correctness checks. Leave this job to
+ // LLVM 'internalize' pass.
+ GlobalValue::isLocalLinkage(NewLinkage) ||
+ // In case it was dead and already converted to declaration.
+ GV.isDeclaration())
+ return;
+
+ // Check for a non-prevailing def that has interposable linkage
+ // (e.g. non-odr weak or linkonce). In that case we can't simply
+ // convert to available_externally, since it would lose the
+ // interposable property and possibly get inlined. Simply drop
+ // the definition in that case.
+ if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
+ GlobalValue::isInterposableLinkage(GV.getLinkage())) {
+ if (!convertToDeclaration(GV))
+ // FIXME: Change this to collect replaced GVs and later erase
+ // them from the parent module once thinLTOResolvePrevailingGUID is
+ // changed to enable this for aliases.
+ llvm_unreachable("Expected GV to be converted");
+ } else {
+ // If all copies of the original symbol had global unnamed addr and
+ // linkonce_odr linkage, it should be an auto hide symbol. In that case
+ // the thin link would have marked it as CanAutoHide. Add hidden visibility
+ // to the symbol to preserve the property.
+ if (NewLinkage == GlobalValue::WeakODRLinkage &&
+ GS->second->canAutoHide()) {
+ assert(GV.hasLinkOnceODRLinkage() && GV.hasGlobalUnnamedAddr());
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+ }
+
+ LLVM_DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName()
+ << "` from " << GV.getLinkage() << " to " << NewLinkage
+ << "\n");
+ GV.setLinkage(NewLinkage);
+ }
+ // Remove declarations from comdats, including available_externally
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat())
+ GO->setComdat(nullptr);
+ };
+
+  // Process functions, globals and aliases now.
+ for (auto &GV : TheModule)
+ updateLinkage(GV);
+ for (auto &GV : TheModule.globals())
+ updateLinkage(GV);
+ for (auto &GV : TheModule.aliases())
+ updateLinkage(GV);
+}
+
+/// Run internalization on \p TheModule based on summary analysis.
+void llvm::thinLTOInternalizeModule(Module &TheModule,
+ const GVSummaryMapTy &DefinedGlobals) {
+ // Declare a callback for the internalize pass that will ask for every
+ // candidate GlobalValue if it can be internalized or not.
+ auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
+ // Lookup the linkage recorded in the summaries during global analysis.
+ auto GS = DefinedGlobals.find(GV.getGUID());
+ if (GS == DefinedGlobals.end()) {
+ // Must have been promoted (possibly conservatively). Find original
+ // name so that we can access the correct summary and see if it can
+ // be internalized again.
+ // FIXME: Eventually we should control promotion instead of promoting
+ // and internalizing again.
+ StringRef OrigName =
+ ModuleSummaryIndex::getOriginalNameBeforePromote(GV.getName());
+ std::string OrigId = GlobalValue::getGlobalIdentifier(
+ OrigName, GlobalValue::InternalLinkage,
+ TheModule.getSourceFileName());
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ if (GS == DefinedGlobals.end()) {
+ // Also check the original non-promoted non-globalized name. In some
+ // cases a preempted weak value is linked in as a local copy because
+ // it is referenced by an alias (IRLinker::linkGlobalValueProto).
+ // In that case, since it was originally not a local value, it was
+ // recorded in the index using the original name.
+ // FIXME: This may not be needed once PR27866 is fixed.
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ assert(GS != DefinedGlobals.end());
+ }
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
+ };
+
+ // FIXME: See if we can just internalize directly here via linkage changes
+ // based on the index, rather than invoking internalizeModule.
+ internalizeModule(TheModule, MustPreserveGV);
+}
+
+/// Make alias a clone of its aliasee.
+static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
+ Function *Fn = cast<Function>(GA->getBaseObject());
+
+ ValueToValueMapTy VMap;
+ Function *NewFn = CloneFunction(Fn, VMap);
+ // Clone should use the original alias's linkage, visibility and name, and we
+ // ensure all uses of alias instead use the new clone (casted if necessary).
+ NewFn->setLinkage(GA->getLinkage());
+ NewFn->setVisibility(GA->getVisibility());
+ GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewFn, GA->getType()));
+ NewFn->takeName(GA);
+ return NewFn;
+}
+
+// Internalize values that we marked with a specific attribute
+// in processGlobalForThinLTO.
+static void internalizeGVsAfterImport(Module &M) {
+ for (auto &GV : M.globals())
+ // Skip GVs which have been converted to declarations
+ // by dropDeadSymbols.
+ if (!GV.isDeclaration() && GV.hasAttribute("thinlto-internalize")) {
+ GV.setLinkage(GlobalValue::InternalLinkage);
+ GV.setVisibility(GlobalValue::DefaultVisibility);
+ }
+}
+
+// Automatically import functions in Module \p DestModule based on the summaries
+// index.
+Expected<bool> FunctionImporter::importFunctions(
+ Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
+ LLVM_DEBUG(dbgs() << "Starting import for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ unsigned ImportedCount = 0, ImportedGVCount = 0;
+
+ IRMover Mover(DestModule);
+ // Do the actual import of functions now, one Module at a time
+ std::set<StringRef> ModuleNameOrderedList;
+ for (auto &FunctionsToImportPerModule : ImportList) {
+ ModuleNameOrderedList.insert(FunctionsToImportPerModule.first());
+ }
+ for (auto &Name : ModuleNameOrderedList) {
+ // Get the module for the import
+ const auto &FunctionsToImportPerModule = ImportList.find(Name);
+ assert(FunctionsToImportPerModule != ImportList.end());
+ Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(Name);
+ if (!SrcModuleOrErr)
+ return SrcModuleOrErr.takeError();
+ std::unique_ptr<Module> SrcModule = std::move(*SrcModuleOrErr);
+ assert(&DestModule.getContext() == &SrcModule->getContext() &&
+ "Context mismatch");
+
+ // If modules were created with lazy metadata loading, materialize it
+ // now, before linking it (otherwise this will be a noop).
+ if (Error Err = SrcModule->materializeMetadata())
+ return std::move(Err);
+
+ auto &ImportGUIDs = FunctionsToImportPerModule->second;
+ // Find the globals to import
+ SetVector<GlobalValue *> GlobalsToImport;
+ for (Function &F : *SrcModule) {
+ if (!F.hasName())
+ continue;
+ auto GUID = F.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
+ << GUID << " " << F.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = F.materialize())
+ return std::move(Err);
+ if (EnableImportMetadata) {
+ // Add 'thinlto_src_module' metadata for statistics and debugging.
+ F.setMetadata(
+ "thinlto_src_module",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(&F);
+ }
+ }
+ for (GlobalVariable &GV : SrcModule->globals()) {
+ if (!GV.hasName())
+ continue;
+ auto GUID = GV.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
+ << GUID << " " << GV.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = GV.materialize())
+ return std::move(Err);
+ ImportedGVCount += GlobalsToImport.insert(&GV);
+ }
+ }
+ for (GlobalAlias &GA : SrcModule->aliases()) {
+ if (!GA.hasName())
+ continue;
+ auto GUID = GA.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
+ << GUID << " " << GA.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = GA.materialize())
+ return std::move(Err);
+ // Import alias as a copy of its aliasee.
+ GlobalObject *Base = GA.getBaseObject();
+ if (Error Err = Base->materialize())
+ return std::move(Err);
+ auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
+ LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
+ << " " << Base->getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (EnableImportMetadata) {
+ // Add 'thinlto_src_module' metadata for statistics and debugging.
+ Fn->setMetadata(
+ "thinlto_src_module",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(Fn);
+ }
+ }
+
+ // Upgrade debug info after we're done materializing all the globals and we
+ // have loaded all the required metadata!
+ UpgradeDebugInfo(*SrcModule);
+
+ // Set the partial sample profile ratio in the profile summary module flag
+ // of the imported source module, if applicable, so that the profile summary
+ // module flag will match with that of the destination module when it's
+ // imported.
+ SrcModule->setPartialSampleProfileRatio(Index);
+
+ // Link in the specified functions.
+ if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
+ &GlobalsToImport))
+ return true;
+
+ if (PrintImports) {
+ for (const auto *GV : GlobalsToImport)
+ dbgs() << DestModule.getSourceFileName() << ": Import " << GV->getName()
+ << " from " << SrcModule->getSourceFileName() << "\n";
+ }
+
+ if (Error Err = Mover.move(
+ std::move(SrcModule), GlobalsToImport.getArrayRef(),
+ [](GlobalValue &, IRMover::ValueAdder) {},
+ /*IsPerformingImport=*/true))
+ report_fatal_error("Function Import: link error: " +
+ toString(std::move(Err)));
+
+ ImportedCount += GlobalsToImport.size();
+ NumImportedModules++;
+ }
+
+ internalizeGVsAfterImport(DestModule);
+
+ NumImportedFunctions += (ImportedCount - ImportedGVCount);
+ NumImportedGlobalVars += ImportedGVCount;
+
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
+ << " functions for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedGVCount
+ << " global variables for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ return ImportedCount;
+}
+
+static bool doImportingForModule(Module &M) {
+ if (SummaryFile.empty())
+ report_fatal_error("error: -function-import requires -summary-file\n");
+ Expected<std::unique_ptr<ModuleSummaryIndex>> IndexPtrOrErr =
+ getModuleSummaryIndexForFile(SummaryFile);
+ if (!IndexPtrOrErr) {
+ logAllUnhandledErrors(IndexPtrOrErr.takeError(), errs(),
+ "Error loading file '" + SummaryFile + "': ");
+ return false;
+ }
+ std::unique_ptr<ModuleSummaryIndex> Index = std::move(*IndexPtrOrErr);
+
+ // First step is collecting the import list.
+ FunctionImporter::ImportMapTy ImportList;
+ // If requested, simply import all functions in the index. This is used
+ // when testing distributed backend handling via the opt tool, when
+ // we have distributed indexes containing exactly the summaries to import.
+ if (ImportAllIndex)
+ ComputeCrossModuleImportForModuleFromIndex(M.getModuleIdentifier(), *Index,
+ ImportList);
+ else
+ ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
+ ImportList);
+
+ // Conservatively mark all internal values as promoted. This interface is
+ // only used when doing importing via the function importing pass. The pass
+ // is only enabled when testing importing via the 'opt' tool, which does
+ // not do the ThinLink that would normally determine what values to promote.
+ for (auto &I : *Index) {
+ for (auto &S : I.second.SummaryList) {
+ if (GlobalValue::isLocalLinkage(S->linkage()))
+ S->setLinkage(GlobalValue::ExternalLinkage);
+ }
+ }
+
+ // Next we need to promote to global scope and rename any local values that
+ // are potentially exported to other modules.
if (renameModuleForThinLTO(M, *Index, /*ClearDSOLocalOnDeclarations=*/false,
- /*GlobalsToImport=*/nullptr)) {
- errs() << "Error renaming module\n";
- return false;
- }
-
- // Perform the import now.
- auto ModuleLoader = [&M](StringRef Identifier) {
- return loadFile(std::string(Identifier), M.getContext());
- };
- FunctionImporter Importer(*Index, ModuleLoader,
- /*ClearDSOLocalOnDeclarations=*/false);
- Expected<bool> Result = Importer.importFunctions(M, ImportList);
-
- // FIXME: Probably need to propagate Errors through the pass manager.
- if (!Result) {
- logAllUnhandledErrors(Result.takeError(), errs(),
- "Error importing module: ");
- return false;
- }
-
- return *Result;
-}
-
-namespace {
-
-/// Pass that performs cross-module function import provided a summary file.
-class FunctionImportLegacyPass : public ModulePass {
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit FunctionImportLegacyPass() : ModulePass(ID) {}
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Function Importing"; }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- return doImportingForModule(M);
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses FunctionImportPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (!doImportingForModule(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-char FunctionImportLegacyPass::ID = 0;
-INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
- "Summary Based Function Import", false, false)
-
-namespace llvm {
-
-Pass *createFunctionImportPass() {
- return new FunctionImportLegacyPass();
-}
-
-} // end namespace llvm
+ /*GlobalsToImport=*/nullptr)) {
+ errs() << "Error renaming module\n";
+ return false;
+ }
+
+ // Perform the import now.
+ auto ModuleLoader = [&M](StringRef Identifier) {
+ return loadFile(std::string(Identifier), M.getContext());
+ };
+ FunctionImporter Importer(*Index, ModuleLoader,
+ /*ClearDSOLocalOnDeclarations=*/false);
+ Expected<bool> Result = Importer.importFunctions(M, ImportList);
+
+ // FIXME: Probably need to propagate Errors through the pass manager.
+ if (!Result) {
+ logAllUnhandledErrors(Result.takeError(), errs(),
+ "Error importing module: ");
+ return false;
+ }
+
+ return *Result;
+}
+
+namespace {
+
+/// Pass that performs cross-module function import provided a summary file.
+class FunctionImportLegacyPass : public ModulePass {
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit FunctionImportLegacyPass() : ModulePass(ID) {}
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Function Importing"; }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return doImportingForModule(M);
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses FunctionImportPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!doImportingForModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+char FunctionImportLegacyPass::ID = 0;
+INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
+ "Summary Based Function Import", false, false)
+
+namespace llvm {
+
+Pass *createFunctionImportPass() {
+ return new FunctionImportLegacyPass();
+}
+
+} // end namespace llvm
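A minimal sketch of how the exported entry points above compose in a per-module ThinLTO-style backend, assuming the thin link has already produced the combined summary Index, this module's ImportList, and its DefinedGlobals map, and that the caller supplies a module-loader callback (FunctionImporter::ModuleLoaderTy from FunctionImport.h). It follows the same resolve-prevailing / import / internalize order that doImportingForModule uses; the helper name and parameter bundling are illustrative, not in-tree API:

// Sketch only: Index, ImportList, DefinedGlobals and LoadModule are assumed
// to be supplied by the thin link / the caller.
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/Error.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
using namespace llvm;

static Expected<bool>
runBackendSketch(Module &M, ModuleSummaryIndex &Index,
                 const FunctionImporter::ImportMapTy &ImportList,
                 const GVSummaryMapTy &DefinedGlobals,
                 FunctionImporter::ModuleLoaderTy LoadModule) {
  // Apply the linkage decisions recorded in the summary before importing,
  // so non-prevailing copies are dropped or demoted first.
  thinLTOResolvePrevailingInModule(M, DefinedGlobals);

  // Import the functions, globals and aliases selected for this module.
  FunctionImporter Importer(Index, std::move(LoadModule),
                            /*ClearDSOLocalOnDeclarations=*/false);
  Expected<bool> Changed = Importer.importFunctions(M, ImportList);
  if (!Changed)
    return Changed.takeError();

  // Internalize whatever the summary says is not exported.
  thinLTOInternalizeModule(M, DefinedGlobals);
  return *Changed;
}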
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp
index 6322e51552..fb4cb23b83 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp
@@ -1,460 +1,460 @@
-//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transform is designed to eliminate unreachable internal globals from the
-// program. It uses an aggressive algorithm, searching out globals that are
-// known to be alive. After it finds all of the globals which are needed, it
-// deletes whatever is left over. This allows it to delete recursive chunks of
-// the program which are unreachable.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/GlobalDCE.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/CtorUtils.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "globaldce"
-
-static cl::opt<bool>
- ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore,
- cl::desc("Enable virtual function elimination"));
-
-STATISTIC(NumAliases , "Number of global aliases removed");
-STATISTIC(NumFunctions, "Number of functions removed");
-STATISTIC(NumIFuncs, "Number of indirect functions removed");
-STATISTIC(NumVariables, "Number of global variables removed");
-STATISTIC(NumVFuncs, "Number of virtual functions removed");
-
-namespace {
- class GlobalDCELegacyPass : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- GlobalDCELegacyPass() : ModulePass(ID) {
- initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // run - Do the GlobalDCE pass on the specified module, optionally updating
- // the specified callgraph to reflect the changes.
- //
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- // We need a minimally functional dummy module analysis manager. It needs
- // to at least know about the possibility of proxying a function analysis
- // manager.
- FunctionAnalysisManager DummyFAM;
- ModuleAnalysisManager DummyMAM;
- DummyMAM.registerPass(
- [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); });
-
- auto PA = Impl.run(M, DummyMAM);
- return !PA.areAllPreserved();
- }
-
- private:
- GlobalDCEPass Impl;
- };
-}
-
-char GlobalDCELegacyPass::ID = 0;
-INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce",
- "Dead Global Elimination", false, false)
-
-// Public interface to the GlobalDCEPass.
-ModulePass *llvm::createGlobalDCEPass() {
- return new GlobalDCELegacyPass();
-}
-
-/// Returns true if F is effectively empty.
-static bool isEmptyFunction(Function *F) {
- BasicBlock &Entry = F->getEntryBlock();
- for (auto &I : Entry) {
- if (isa<DbgInfoIntrinsic>(I))
- continue;
- if (auto *RI = dyn_cast<ReturnInst>(&I))
- return !RI->getReturnValue();
- break;
- }
- return false;
-}
-
-/// Compute the set of GlobalValues that depend on V.
-/// The recursion stops as soon as a GlobalValue is met.
-void GlobalDCEPass::ComputeDependencies(Value *V,
- SmallPtrSetImpl<GlobalValue *> &Deps) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- Function *Parent = I->getParent()->getParent();
- Deps.insert(Parent);
- } else if (auto *GV = dyn_cast<GlobalValue>(V)) {
- Deps.insert(GV);
- } else if (auto *CE = dyn_cast<Constant>(V)) {
- // Avoid walking the whole tree of a big ConstantExprs multiple times.
- auto Where = ConstantDependenciesCache.find(CE);
- if (Where != ConstantDependenciesCache.end()) {
- auto const &K = Where->second;
- Deps.insert(K.begin(), K.end());
- } else {
- SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE];
- for (User *CEUser : CE->users())
- ComputeDependencies(CEUser, LocalDeps);
- Deps.insert(LocalDeps.begin(), LocalDeps.end());
- }
- }
-}
-
-void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
- SmallPtrSet<GlobalValue *, 8> Deps;
- for (User *User : GV.users())
- ComputeDependencies(User, Deps);
- Deps.erase(&GV); // Remove self-reference.
- for (GlobalValue *GVU : Deps) {
- // If this is a dep from a vtable to a virtual function, and we have
- // complete information about all virtual call sites which could call
-    // through this vtable, then skip it, because the call site information will
- // be more precise.
- if (VFESafeVTables.count(GVU) && isa<Function>(&GV)) {
- LLVM_DEBUG(dbgs() << "Ignoring dep " << GVU->getName() << " -> "
- << GV.getName() << "\n");
- continue;
- }
- GVDependencies[GVU].insert(&GV);
- }
-}
-
-/// Mark the GlobalValue as live.
-void GlobalDCEPass::MarkLive(GlobalValue &GV,
- SmallVectorImpl<GlobalValue *> *Updates) {
- auto const Ret = AliveGlobals.insert(&GV);
- if (!Ret.second)
- return;
-
- if (Updates)
- Updates->push_back(&GV);
- if (Comdat *C = GV.getComdat()) {
- for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
- MarkLive(*CM.second, Updates); // Recursion depth is only two because only
- // globals in the same comdat are visited.
- }
- }
-}
-
-void GlobalDCEPass::ScanVTables(Module &M) {
- SmallVector<MDNode *, 2> Types;
- LLVM_DEBUG(dbgs() << "Building type info -> vtable map\n");
-
- auto *LTOPostLinkMD =
- cast_or_null<ConstantAsMetadata>(M.getModuleFlag("LTOPostLink"));
- bool LTOPostLink =
- LTOPostLinkMD &&
- (cast<ConstantInt>(LTOPostLinkMD->getValue())->getZExtValue() != 0);
-
- for (GlobalVariable &GV : M.globals()) {
- Types.clear();
- GV.getMetadata(LLVMContext::MD_type, Types);
- if (GV.isDeclaration() || Types.empty())
- continue;
-
- // Use the typeid metadata on the vtable to build a mapping from typeids to
- // the list of (GV, offset) pairs which are the possible vtables for that
- // typeid.
- for (MDNode *Type : Types) {
- Metadata *TypeID = Type->getOperand(1).get();
-
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
-
- TypeIdMap[TypeID].insert(std::make_pair(&GV, Offset));
- }
-
- // If the type corresponding to the vtable is private to this translation
- // unit, we know that we can see all virtual functions which might use it,
- // so VFE is safe.
- if (auto GO = dyn_cast<GlobalObject>(&GV)) {
- GlobalObject::VCallVisibility TypeVis = GO->getVCallVisibility();
- if (TypeVis == GlobalObject::VCallVisibilityTranslationUnit ||
- (LTOPostLink &&
- TypeVis == GlobalObject::VCallVisibilityLinkageUnit)) {
- LLVM_DEBUG(dbgs() << GV.getName() << " is safe for VFE\n");
- VFESafeVTables.insert(&GV);
- }
- }
- }
-}
-
-void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId,
- uint64_t CallOffset) {
- for (auto &VTableInfo : TypeIdMap[TypeId]) {
- GlobalVariable *VTable = VTableInfo.first;
- uint64_t VTableOffset = VTableInfo.second;
-
- Constant *Ptr =
- getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset,
- *Caller->getParent());
- if (!Ptr) {
- LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n");
- VFESafeVTables.erase(VTable);
- return;
- }
-
- auto Callee = dyn_cast<Function>(Ptr->stripPointerCasts());
- if (!Callee) {
- LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n");
- VFESafeVTables.erase(VTable);
- return;
- }
-
- LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> "
- << Callee->getName() << "\n");
- GVDependencies[Caller].insert(Callee);
- }
-}
-
-void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) {
- LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n");
- Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
-
- if (!TypeCheckedLoadFunc)
- return;
-
- for (auto U : TypeCheckedLoadFunc->users()) {
- auto CI = dyn_cast<CallInst>(U);
- if (!CI)
- continue;
-
- auto *Offset = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- Value *TypeIdValue = CI->getArgOperand(2);
- auto *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
-
- if (Offset) {
- ScanVTableLoad(CI->getFunction(), TypeId, Offset->getZExtValue());
- } else {
- // type.checked.load with a non-constant offset, so assume every entry in
- // every matching vtable is used.
- for (auto &VTableInfo : TypeIdMap[TypeId]) {
- VFESafeVTables.erase(VTableInfo.first);
- }
- }
- }
-}
-
-void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) {
- if (!ClEnableVFE)
- return;
-
- // If the Virtual Function Elim module flag is present and set to zero, then
- // the vcall_visibility metadata was inserted for another optimization (WPD)
- // and we may not have type checked loads on all accesses to the vtable.
- // Don't attempt VFE in that case.
- auto *Val = mdconst::dyn_extract_or_null<ConstantInt>(
- M.getModuleFlag("Virtual Function Elim"));
- if (!Val || Val->getZExtValue() == 0)
- return;
-
- ScanVTables(M);
-
- if (VFESafeVTables.empty())
- return;
-
- ScanTypeCheckedLoadIntrinsics(M);
-
- LLVM_DEBUG(
- dbgs() << "VFE safe vtables:\n";
- for (auto *VTable : VFESafeVTables)
- dbgs() << " " << VTable->getName() << "\n";
- );
-}
-
-PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
- bool Changed = false;
-
- // The algorithm first computes the set L of global variables that are
- // trivially live. Then it walks the initialization of these variables to
- // compute the globals used to initialize them, which effectively builds a
- // directed graph where nodes are global variables, and an edge from A to B
- // means B is used to initialize A. Finally, it propagates the liveness
-  // information through the graph starting from the nodes in L. Nodes not
- // marked as alive are discarded.
-
- // Remove empty functions from the global ctors list.
- Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
-
- // Collect the set of members for each comdat.
- for (Function &F : M)
- if (Comdat *C = F.getComdat())
- ComdatMembers.insert(std::make_pair(C, &F));
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GV));
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GA));
-
- // Add dependencies between virtual call sites and the virtual functions they
- // might call, if we have that information.
- AddVirtualFunctionDependencies(M);
-
- // Loop over the module, adding globals which are obviously necessary.
- for (GlobalObject &GO : M.global_objects()) {
- Changed |= RemoveUnusedGlobalValue(GO);
- // Functions with external linkage are needed if they have a body.
- // Externally visible & appending globals are needed, if they have an
- // initializer.
- if (!GO.isDeclaration())
- if (!GO.isDiscardableIfUnused())
- MarkLive(GO);
-
- UpdateGVDependencies(GO);
- }
-
- // Compute direct dependencies of aliases.
- for (GlobalAlias &GA : M.aliases()) {
- Changed |= RemoveUnusedGlobalValue(GA);
- // Externally visible aliases are needed.
- if (!GA.isDiscardableIfUnused())
- MarkLive(GA);
-
- UpdateGVDependencies(GA);
- }
-
- // Compute direct dependencies of ifuncs.
- for (GlobalIFunc &GIF : M.ifuncs()) {
- Changed |= RemoveUnusedGlobalValue(GIF);
- // Externally visible ifuncs are needed.
- if (!GIF.isDiscardableIfUnused())
- MarkLive(GIF);
-
- UpdateGVDependencies(GIF);
- }
-
- // Propagate liveness from collected Global Values through the computed
- // dependencies.
- SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(),
- AliveGlobals.end()};
- while (!NewLiveGVs.empty()) {
- GlobalValue *LGV = NewLiveGVs.pop_back_val();
- for (auto *GVD : GVDependencies[LGV])
- MarkLive(*GVD, &NewLiveGVs);
- }
-
- // Now that all globals which are needed are in the AliveGlobals set, we loop
- // through the program, deleting those which are not alive.
- //
-
- // The first pass is to drop initializers of global variables which are dead.
- std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals
- for (GlobalVariable &GV : M.globals())
- if (!AliveGlobals.count(&GV)) {
- DeadGlobalVars.push_back(&GV); // Keep track of dead globals
- if (GV.hasInitializer()) {
- Constant *Init = GV.getInitializer();
- GV.setInitializer(nullptr);
- if (isSafeToDestroyConstant(Init))
- Init->destroyConstant();
- }
- }
-
- // The second pass drops the bodies of functions which are dead...
- std::vector<Function *> DeadFunctions;
- for (Function &F : M)
- if (!AliveGlobals.count(&F)) {
-      DeadFunctions.push_back(&F); // Keep track of dead functions
- if (!F.isDeclaration())
- F.deleteBody();
- }
-
- // The third pass drops targets of aliases which are dead...
- std::vector<GlobalAlias*> DeadAliases;
- for (GlobalAlias &GA : M.aliases())
- if (!AliveGlobals.count(&GA)) {
- DeadAliases.push_back(&GA);
- GA.setAliasee(nullptr);
- }
-
- // The fourth pass drops targets of ifuncs which are dead...
- std::vector<GlobalIFunc*> DeadIFuncs;
- for (GlobalIFunc &GIF : M.ifuncs())
- if (!AliveGlobals.count(&GIF)) {
- DeadIFuncs.push_back(&GIF);
- GIF.setResolver(nullptr);
- }
-
- // Now that all interferences have been dropped, delete the actual objects
- // themselves.
- auto EraseUnusedGlobalValue = [&](GlobalValue *GV) {
- RemoveUnusedGlobalValue(*GV);
- GV->eraseFromParent();
- Changed = true;
- };
-
- NumFunctions += DeadFunctions.size();
- for (Function *F : DeadFunctions) {
- if (!F->use_empty()) {
- // Virtual functions might still be referenced by one or more vtables,
- // but if we've proven them to be unused then it's safe to replace the
- // virtual function pointers with null, allowing us to remove the
- // function itself.
- ++NumVFuncs;
- F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType()));
- }
- EraseUnusedGlobalValue(F);
- }
-
- NumVariables += DeadGlobalVars.size();
- for (GlobalVariable *GV : DeadGlobalVars)
- EraseUnusedGlobalValue(GV);
-
- NumAliases += DeadAliases.size();
- for (GlobalAlias *GA : DeadAliases)
- EraseUnusedGlobalValue(GA);
-
- NumIFuncs += DeadIFuncs.size();
- for (GlobalIFunc *GIF : DeadIFuncs)
- EraseUnusedGlobalValue(GIF);
-
- // Make sure that all memory is released
- AliveGlobals.clear();
- ConstantDependenciesCache.clear();
- GVDependencies.clear();
- ComdatMembers.clear();
- TypeIdMap.clear();
- VFESafeVTables.clear();
-
- if (Changed)
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
-// GlobalValue, looking for the constant pointer ref that may be pointing to it.
-// If found, check to see if the constant pointer ref is safe to destroy, and if
-// so, nuke it. This will reduce the reference count on the global value, which
-// might make it deader.
-//
-bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) {
- if (GV.use_empty())
- return false;
- GV.removeDeadConstantUsers();
- return GV.use_empty();
-}
+//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate unreachable internal globals from the
+// program. It uses an aggressive algorithm, searching out globals that are
+// known to be alive. After it finds all of the globals which are needed, it
+// deletes whatever is left over. This allows it to delete recursive chunks of
+// the program which are unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "globaldce"
+
+static cl::opt<bool>
+ ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore,
+ cl::desc("Enable virtual function elimination"));
+
+STATISTIC(NumAliases , "Number of global aliases removed");
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumIFuncs, "Number of indirect functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+STATISTIC(NumVFuncs, "Number of virtual functions removed");
+
+namespace {
+ class GlobalDCELegacyPass : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ GlobalDCELegacyPass() : ModulePass(ID) {
+ initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // run - Do the GlobalDCE pass on the specified module, optionally updating
+ // the specified callgraph to reflect the changes.
+ //
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // We need a minimally functional dummy module analysis manager. It needs
+ // to at least know about the possibility of proxying a function analysis
+ // manager.
+ FunctionAnalysisManager DummyFAM;
+ ModuleAnalysisManager DummyMAM;
+ DummyMAM.registerPass(
+ [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); });
+
+ auto PA = Impl.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
+
+ private:
+ GlobalDCEPass Impl;
+ };
+}
+
+char GlobalDCELegacyPass::ID = 0;
+INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce",
+ "Dead Global Elimination", false, false)
+
+// Public interface to the GlobalDCEPass.
+ModulePass *llvm::createGlobalDCEPass() {
+ return new GlobalDCELegacyPass();
+}
+
+/// Returns true if F is effectively empty.
+static bool isEmptyFunction(Function *F) {
+ BasicBlock &Entry = F->getEntryBlock();
+ for (auto &I : Entry) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ if (auto *RI = dyn_cast<ReturnInst>(&I))
+ return !RI->getReturnValue();
+ break;
+ }
+ return false;
+}
+
+/// Compute the set of GlobalValues that depend on V.
+/// The recursion stops as soon as a GlobalValue is met.
+void GlobalDCEPass::ComputeDependencies(Value *V,
+ SmallPtrSetImpl<GlobalValue *> &Deps) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Function *Parent = I->getParent()->getParent();
+ Deps.insert(Parent);
+ } else if (auto *GV = dyn_cast<GlobalValue>(V)) {
+ Deps.insert(GV);
+ } else if (auto *CE = dyn_cast<Constant>(V)) {
+ // Avoid walking the whole tree of a big ConstantExprs multiple times.
+ auto Where = ConstantDependenciesCache.find(CE);
+ if (Where != ConstantDependenciesCache.end()) {
+ auto const &K = Where->second;
+ Deps.insert(K.begin(), K.end());
+ } else {
+ SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE];
+ for (User *CEUser : CE->users())
+ ComputeDependencies(CEUser, LocalDeps);
+ Deps.insert(LocalDeps.begin(), LocalDeps.end());
+ }
+ }
+}
+
+void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
+ SmallPtrSet<GlobalValue *, 8> Deps;
+ for (User *User : GV.users())
+ ComputeDependencies(User, Deps);
+ Deps.erase(&GV); // Remove self-reference.
+ for (GlobalValue *GVU : Deps) {
+ // If this is a dep from a vtable to a virtual function, and we have
+ // complete information about all virtual call sites which could call
+    // through this vtable, then skip it, because the call site information will
+ // be more precise.
+ if (VFESafeVTables.count(GVU) && isa<Function>(&GV)) {
+ LLVM_DEBUG(dbgs() << "Ignoring dep " << GVU->getName() << " -> "
+ << GV.getName() << "\n");
+ continue;
+ }
+ GVDependencies[GVU].insert(&GV);
+ }
+}
+
+/// Mark the GlobalValue as live.
+void GlobalDCEPass::MarkLive(GlobalValue &GV,
+ SmallVectorImpl<GlobalValue *> *Updates) {
+ auto const Ret = AliveGlobals.insert(&GV);
+ if (!Ret.second)
+ return;
+
+ if (Updates)
+ Updates->push_back(&GV);
+ if (Comdat *C = GV.getComdat()) {
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
+ MarkLive(*CM.second, Updates); // Recursion depth is only two because only
+ // globals in the same comdat are visited.
+ }
+ }
+}
+
+void GlobalDCEPass::ScanVTables(Module &M) {
+ SmallVector<MDNode *, 2> Types;
+ LLVM_DEBUG(dbgs() << "Building type info -> vtable map\n");
+
+ auto *LTOPostLinkMD =
+ cast_or_null<ConstantAsMetadata>(M.getModuleFlag("LTOPostLink"));
+ bool LTOPostLink =
+ LTOPostLinkMD &&
+ (cast<ConstantInt>(LTOPostLinkMD->getValue())->getZExtValue() != 0);
+
+ for (GlobalVariable &GV : M.globals()) {
+ Types.clear();
+ GV.getMetadata(LLVMContext::MD_type, Types);
+ if (GV.isDeclaration() || Types.empty())
+ continue;
+
+ // Use the typeid metadata on the vtable to build a mapping from typeids to
+ // the list of (GV, offset) pairs which are the possible vtables for that
+ // typeid.
+ for (MDNode *Type : Types) {
+ Metadata *TypeID = Type->getOperand(1).get();
+
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+
+ TypeIdMap[TypeID].insert(std::make_pair(&GV, Offset));
+ }
+
+ // If the type corresponding to the vtable is private to this translation
+ // unit, we know that we can see all virtual functions which might use it,
+ // so VFE is safe.
+ if (auto GO = dyn_cast<GlobalObject>(&GV)) {
+ GlobalObject::VCallVisibility TypeVis = GO->getVCallVisibility();
+ if (TypeVis == GlobalObject::VCallVisibilityTranslationUnit ||
+ (LTOPostLink &&
+ TypeVis == GlobalObject::VCallVisibilityLinkageUnit)) {
+ LLVM_DEBUG(dbgs() << GV.getName() << " is safe for VFE\n");
+ VFESafeVTables.insert(&GV);
+ }
+ }
+ }
+}
+
+void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId,
+ uint64_t CallOffset) {
+ for (auto &VTableInfo : TypeIdMap[TypeId]) {
+ GlobalVariable *VTable = VTableInfo.first;
+ uint64_t VTableOffset = VTableInfo.second;
+
+ Constant *Ptr =
+ getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset,
+ *Caller->getParent());
+ if (!Ptr) {
+ LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n");
+ VFESafeVTables.erase(VTable);
+ return;
+ }
+
+ auto Callee = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!Callee) {
+ LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n");
+ VFESafeVTables.erase(VTable);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> "
+ << Callee->getName() << "\n");
+ GVDependencies[Caller].insert(Callee);
+ }
+}
+
+void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) {
+ LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n");
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+
+ if (!TypeCheckedLoadFunc)
+ return;
+
+ for (auto U : TypeCheckedLoadFunc->users()) {
+ auto CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ auto *Offset = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ Value *TypeIdValue = CI->getArgOperand(2);
+ auto *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
+
+ if (Offset) {
+ ScanVTableLoad(CI->getFunction(), TypeId, Offset->getZExtValue());
+ } else {
+ // type.checked.load with a non-constant offset, so assume every entry in
+ // every matching vtable is used.
+ for (auto &VTableInfo : TypeIdMap[TypeId]) {
+ VFESafeVTables.erase(VTableInfo.first);
+ }
+ }
+ }
+}
+
+void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) {
+ if (!ClEnableVFE)
+ return;
+
+ // If the Virtual Function Elim module flag is present and set to zero, then
+ // the vcall_visibility metadata was inserted for another optimization (WPD)
+ // and we may not have type checked loads on all accesses to the vtable.
+ // Don't attempt VFE in that case.
+ auto *Val = mdconst::dyn_extract_or_null<ConstantInt>(
+ M.getModuleFlag("Virtual Function Elim"));
+ if (!Val || Val->getZExtValue() == 0)
+ return;
+
+ ScanVTables(M);
+
+ if (VFESafeVTables.empty())
+ return;
+
+ ScanTypeCheckedLoadIntrinsics(M);
+
+ LLVM_DEBUG(
+ dbgs() << "VFE safe vtables:\n";
+ for (auto *VTable : VFESafeVTables)
+ dbgs() << " " << VTable->getName() << "\n";
+ );
+}
+
+PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
+ bool Changed = false;
+
+ // The algorithm first computes the set L of global variables that are
+ // trivially live. Then it walks the initialization of these variables to
+ // compute the globals used to initialize them, which effectively builds a
+ // directed graph where nodes are global variables, and an edge from A to B
+ // means B is used to initialize A. Finally, it propagates the liveness
+  // information through the graph starting from the nodes in L. Nodes not
+ // marked as alive are discarded.
+
+ // Remove empty functions from the global ctors list.
+ Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
+
+ // Collect the set of members for each comdat.
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
+
+ // Add dependencies between virtual call sites and the virtual functions they
+ // might call, if we have that information.
+ AddVirtualFunctionDependencies(M);
+
+ // Loop over the module, adding globals which are obviously necessary.
+ for (GlobalObject &GO : M.global_objects()) {
+ Changed |= RemoveUnusedGlobalValue(GO);
+ // Functions with external linkage are needed if they have a body.
+ // Externally visible & appending globals are needed, if they have an
+ // initializer.
+ if (!GO.isDeclaration())
+ if (!GO.isDiscardableIfUnused())
+ MarkLive(GO);
+
+ UpdateGVDependencies(GO);
+ }
+
+ // Compute direct dependencies of aliases.
+ for (GlobalAlias &GA : M.aliases()) {
+ Changed |= RemoveUnusedGlobalValue(GA);
+ // Externally visible aliases are needed.
+ if (!GA.isDiscardableIfUnused())
+ MarkLive(GA);
+
+ UpdateGVDependencies(GA);
+ }
+
+ // Compute direct dependencies of ifuncs.
+ for (GlobalIFunc &GIF : M.ifuncs()) {
+ Changed |= RemoveUnusedGlobalValue(GIF);
+ // Externally visible ifuncs are needed.
+ if (!GIF.isDiscardableIfUnused())
+ MarkLive(GIF);
+
+ UpdateGVDependencies(GIF);
+ }
+
+ // Propagate liveness from collected Global Values through the computed
+ // dependencies.
+ SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(),
+ AliveGlobals.end()};
+ while (!NewLiveGVs.empty()) {
+ GlobalValue *LGV = NewLiveGVs.pop_back_val();
+ for (auto *GVD : GVDependencies[LGV])
+ MarkLive(*GVD, &NewLiveGVs);
+ }
+
+ // Now that all globals which are needed are in the AliveGlobals set, we loop
+ // through the program, deleting those which are not alive.
+ //
+
+ // The first pass is to drop initializers of global variables which are dead.
+ std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals
+ for (GlobalVariable &GV : M.globals())
+ if (!AliveGlobals.count(&GV)) {
+ DeadGlobalVars.push_back(&GV); // Keep track of dead globals
+ if (GV.hasInitializer()) {
+ Constant *Init = GV.getInitializer();
+ GV.setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ }
+
+ // The second pass drops the bodies of functions which are dead...
+ std::vector<Function *> DeadFunctions;
+ for (Function &F : M)
+ if (!AliveGlobals.count(&F)) {
+      DeadFunctions.push_back(&F); // Keep track of dead functions
+ if (!F.isDeclaration())
+ F.deleteBody();
+ }
+
+ // The third pass drops targets of aliases which are dead...
+ std::vector<GlobalAlias*> DeadAliases;
+ for (GlobalAlias &GA : M.aliases())
+ if (!AliveGlobals.count(&GA)) {
+ DeadAliases.push_back(&GA);
+ GA.setAliasee(nullptr);
+ }
+
+ // The fourth pass drops targets of ifuncs which are dead...
+ std::vector<GlobalIFunc*> DeadIFuncs;
+ for (GlobalIFunc &GIF : M.ifuncs())
+ if (!AliveGlobals.count(&GIF)) {
+ DeadIFuncs.push_back(&GIF);
+ GIF.setResolver(nullptr);
+ }
+
+ // Now that all interferences have been dropped, delete the actual objects
+ // themselves.
+ auto EraseUnusedGlobalValue = [&](GlobalValue *GV) {
+ RemoveUnusedGlobalValue(*GV);
+ GV->eraseFromParent();
+ Changed = true;
+ };
+
+ NumFunctions += DeadFunctions.size();
+ for (Function *F : DeadFunctions) {
+ if (!F->use_empty()) {
+ // Virtual functions might still be referenced by one or more vtables,
+ // but if we've proven them to be unused then it's safe to replace the
+ // virtual function pointers with null, allowing us to remove the
+ // function itself.
+ ++NumVFuncs;
+ F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType()));
+ }
+ EraseUnusedGlobalValue(F);
+ }
+
+ NumVariables += DeadGlobalVars.size();
+ for (GlobalVariable *GV : DeadGlobalVars)
+ EraseUnusedGlobalValue(GV);
+
+ NumAliases += DeadAliases.size();
+ for (GlobalAlias *GA : DeadAliases)
+ EraseUnusedGlobalValue(GA);
+
+ NumIFuncs += DeadIFuncs.size();
+ for (GlobalIFunc *GIF : DeadIFuncs)
+ EraseUnusedGlobalValue(GIF);
+
+ // Make sure that all memory is released
+ AliveGlobals.clear();
+ ConstantDependenciesCache.clear();
+ GVDependencies.clear();
+ ComdatMembers.clear();
+ TypeIdMap.clear();
+ VFESafeVTables.clear();
+
+ if (Changed)
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
+// GlobalValue, looking for the constant pointer ref that may be pointing to it.
+// If found, check to see if the constant pointer ref is safe to destroy, and if
+// so, nuke it. This will reduce the reference count on the global value, which
+// might make it deader.
+//
+bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) {
+ if (GV.use_empty())
+ return false;
+ GV.removeDeadConstantUsers();
+ return GV.use_empty();
+}
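The legacy wrapper above already shows the only analysis-manager plumbing GlobalDCEPass needs: a module analysis manager that knows how to proxy a function analysis manager. The same setup works when driving the new-PM pass directly; a standalone sketch (the helper name is illustrative):

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
using namespace llvm;

// Run GlobalDCEPass over M and report whether anything was eliminated.
static bool runGlobalDCESketch(Module &M) {
  FunctionAnalysisManager FAM;
  ModuleAnalysisManager MAM;
  // Mirror the proxy registration done by the legacy wrapper above.
  MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
  PreservedAnalyses PA = GlobalDCEPass().run(M, MAM);
  return !PA.areAllPreserved();
}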
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp
index ecc0634a9e..223a05e8ea 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1,3204 +1,3204 @@
-//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass transforms simple global variables that never have their address
-// taken. If obviously true, it marks read/write globals as constant, deletes
-// variables only stored to, etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/GlobalOpt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/CtorUtils.h"
-#include "llvm/Transforms/Utils/Evaluator.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "globalopt"
-
-STATISTIC(NumMarked , "Number of globals marked constant");
-STATISTIC(NumUnnamed , "Number of globals marked unnamed_addr");
-STATISTIC(NumSRA , "Number of aggregate globals broken into scalars");
-STATISTIC(NumHeapSRA , "Number of heap objects SRA'd");
-STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
-STATISTIC(NumDeleted , "Number of globals deleted");
-STATISTIC(NumGlobUses , "Number of global uses devirtualized");
-STATISTIC(NumLocalized , "Number of globals localized");
-STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans");
-STATISTIC(NumFastCallFns , "Number of functions converted to fastcc");
-STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
-STATISTIC(NumNestRemoved , "Number of nest attributes removed");
-STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
-STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
-STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
-STATISTIC(NumInternalFunc, "Number of internal functions");
-STATISTIC(NumColdCC, "Number of functions marked coldcc");
-
-static cl::opt<bool>
- EnableColdCCStressTest("enable-coldcc-stress-test",
- cl::desc("Enable stress test of coldcc by adding "
- "calling conv to all internal functions."),
- cl::init(false), cl::Hidden);
-
-static cl::opt<int> ColdCCRelFreq(
- "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
- cl::desc(
- "Maximum block frequency, expressed as a percentage of caller's "
- "entry frequency, for a call site to be considered cold for enabling"
- "coldcc"));
-
-/// Is this global variable possibly used by a leak checker as a root? If so,
-/// we might not really want to eliminate the stores to it.
-static bool isLeakCheckerRoot(GlobalVariable *GV) {
- // A global variable is a root if it is a pointer, or could plausibly contain
- // a pointer. There are two challenges; one is that we could have a struct
-  // that has an inner member which is a pointer. We recurse through the type to
- // detect these (up to a point). The other is that we may actually be a union
- // of a pointer and another type, and so our LLVM type is an integer which
- // gets converted into a pointer, or our type is an [i8 x #] with a pointer
- // potentially contained here.
-
- if (GV->hasPrivateLinkage())
- return false;
-
- SmallVector<Type *, 4> Types;
- Types.push_back(GV->getValueType());
-
- unsigned Limit = 20;
- do {
- Type *Ty = Types.pop_back_val();
- switch (Ty->getTypeID()) {
- default: break;
- case Type::PointerTyID:
- return true;
- case Type::FixedVectorTyID:
- case Type::ScalableVectorTyID:
- if (cast<VectorType>(Ty)->getElementType()->isPointerTy())
- return true;
- break;
- case Type::ArrayTyID:
- Types.push_back(cast<ArrayType>(Ty)->getElementType());
- break;
- case Type::StructTyID: {
- StructType *STy = cast<StructType>(Ty);
- if (STy->isOpaque()) return true;
- for (StructType::element_iterator I = STy->element_begin(),
- E = STy->element_end(); I != E; ++I) {
- Type *InnerTy = *I;
- if (isa<PointerType>(InnerTy)) return true;
- if (isa<StructType>(InnerTy) || isa<ArrayType>(InnerTy) ||
- isa<VectorType>(InnerTy))
- Types.push_back(InnerTy);
- }
- break;
- }
- }
- if (--Limit == 0) return true;
- } while (!Types.empty());
- return false;
-}
-
-/// Given a value that is stored to a global but never read, determine whether
-/// it's safe to remove the store and the chain of computation that feeds the
-/// store.
-static bool IsSafeComputationToRemove(
- Value *V, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- do {
- if (isa<Constant>(V))
- return true;
- if (!V->hasOneUse())
- return false;
- if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
- isa<GlobalValue>(V))
- return false;
- if (isAllocationFn(V, GetTLI))
- return true;
-
- Instruction *I = cast<Instruction>(V);
- if (I->mayHaveSideEffects())
- return false;
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
- if (!GEP->hasAllConstantIndices())
- return false;
- } else if (I->getNumOperands() != 1) {
- return false;
- }
-
- V = I->getOperand(0);
- } while (true);
-}
-
-/// This GV is a pointer root. Loop over all users of the global and clean up
-/// any that obviously don't assign the global a dynamically allocated value.
-static bool
-CleanupPointerRootUsers(GlobalVariable *GV,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- // A brief explanation of leak checkers. The goal is to find bugs where
- // pointers are forgotten, causing an accumulating growth in memory
- // usage over time. The common strategy for leak checkers is to explicitly
- // allow the memory pointed to by globals at exit. This is popular because it
- // also solves another problem where the main thread of a C++ program may shut
- // down before other threads that are still expecting to use those globals. To
- // handle that case, we expect the program may create a singleton and never
- // destroy it.
-
- bool Changed = false;
-
- // If Dead[n].first is the only use of a malloc result, we can delete its
- // chain of computation and the store to the global in Dead[n].second.
- SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
-
- // Constants can't be pointers to dynamically allocated memory.
- for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
- UI != E;) {
- User *U = *UI++;
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- Value *V = SI->getValueOperand();
- if (isa<Constant>(V)) {
- Changed = true;
- SI->eraseFromParent();
- } else if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->hasOneUse())
- Dead.push_back(std::make_pair(I, SI));
- }
- } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
- if (isa<Constant>(MSI->getValue())) {
- Changed = true;
- MSI->eraseFromParent();
- } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
- if (I->hasOneUse())
- Dead.push_back(std::make_pair(I, MSI));
- }
- } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
- GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
- if (MemSrc && MemSrc->isConstant()) {
- Changed = true;
- MTI->eraseFromParent();
- } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) {
- if (I->hasOneUse())
- Dead.push_back(std::make_pair(I, MTI));
- }
- } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
- if (CE->use_empty()) {
- CE->destroyConstant();
- Changed = true;
- }
- } else if (Constant *C = dyn_cast<Constant>(U)) {
- if (isSafeToDestroyConstant(C)) {
- C->destroyConstant();
- // This could have invalidated UI, start over from scratch.
- Dead.clear();
- CleanupPointerRootUsers(GV, GetTLI);
- return true;
- }
- }
- }
-
- for (int i = 0, e = Dead.size(); i != e; ++i) {
- if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) {
- Dead[i].second->eraseFromParent();
- Instruction *I = Dead[i].first;
- do {
- if (isAllocationFn(I, GetTLI))
- break;
- Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
- if (!J)
- break;
- I->eraseFromParent();
- I = J;
- } while (true);
- I->eraseFromParent();
+//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms simple global variables that never have their address
+// taken. If obviously true, it marks read/write globals as constant, deletes
+// variables only stored to, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalOpt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "globalopt"
+
+STATISTIC(NumMarked , "Number of globals marked constant");
+STATISTIC(NumUnnamed , "Number of globals marked unnamed_addr");
+STATISTIC(NumSRA , "Number of aggregate globals broken into scalars");
+STATISTIC(NumHeapSRA , "Number of heap objects SRA'd");
+STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
+STATISTIC(NumDeleted , "Number of globals deleted");
+STATISTIC(NumGlobUses , "Number of global uses devirtualized");
+STATISTIC(NumLocalized , "Number of globals localized");
+STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans");
+STATISTIC(NumFastCallFns , "Number of functions converted to fastcc");
+STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
+STATISTIC(NumNestRemoved , "Number of nest attributes removed");
+STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
+STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
+STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
+STATISTIC(NumInternalFunc, "Number of internal functions");
+STATISTIC(NumColdCC, "Number of functions marked coldcc");
+
+static cl::opt<bool>
+ EnableColdCCStressTest("enable-coldcc-stress-test",
+ cl::desc("Enable stress test of coldcc by adding "
+ "calling conv to all internal functions."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<int> ColdCCRelFreq(
+ "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+ cl::desc(
+ "Maximum block frequency, expressed as a percentage of caller's "
+ "entry frequency, for a call site to be considered cold for enabling"
+ "coldcc"));
+
+/// Is this global variable possibly used by a leak checker as a root? If so,
+/// we might not really want to eliminate the stores to it.
+static bool isLeakCheckerRoot(GlobalVariable *GV) {
+ // A global variable is a root if it is a pointer, or could plausibly contain
+ // a pointer. There are two challenges; one is that we could have a struct
+ // that has an inner member which is a pointer. We recurse through the type to
+ // detect these (up to a point). The other is that we may actually be a union
+ // of a pointer and another type, and so our LLVM type is an integer which
+ // gets converted into a pointer, or our type is an [i8 x #] with a pointer
+ // potentially contained here.
+
+ if (GV->hasPrivateLinkage())
+ return false;
+
+ SmallVector<Type *, 4> Types;
+ Types.push_back(GV->getValueType());
+
+ unsigned Limit = 20;
+ do {
+ Type *Ty = Types.pop_back_val();
+ switch (Ty->getTypeID()) {
+ default: break;
+ case Type::PointerTyID:
+ return true;
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
+ if (cast<VectorType>(Ty)->getElementType()->isPointerTy())
+ return true;
+ break;
+ case Type::ArrayTyID:
+ Types.push_back(cast<ArrayType>(Ty)->getElementType());
+ break;
+ case Type::StructTyID: {
+ StructType *STy = cast<StructType>(Ty);
+ if (STy->isOpaque()) return true;
+ for (StructType::element_iterator I = STy->element_begin(),
+ E = STy->element_end(); I != E; ++I) {
+ Type *InnerTy = *I;
+ if (isa<PointerType>(InnerTy)) return true;
+ if (isa<StructType>(InnerTy) || isa<ArrayType>(InnerTy) ||
+ isa<VectorType>(InnerTy))
+ Types.push_back(InnerTy);
+ }
+ break;
+ }
+ }
+ if (--Limit == 0) return true;
+ } while (!Types.empty());
+ return false;
+}
+
+/// Given a value that is stored to a global but never read, determine whether
+/// it's safe to remove the store and the chain of computation that feeds the
+/// store.
+static bool IsSafeComputationToRemove(
+ Value *V, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ do {
+ if (isa<Constant>(V))
+ return true;
+ if (!V->hasOneUse())
+ return false;
+ if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
+ isa<GlobalValue>(V))
+ return false;
+ if (isAllocationFn(V, GetTLI))
+ return true;
+
+ Instruction *I = cast<Instruction>(V);
+ if (I->mayHaveSideEffects())
+ return false;
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllConstantIndices())
+ return false;
+ } else if (I->getNumOperands() != 1) {
+ return false;
+ }
+
+ V = I->getOperand(0);
+ } while (true);
+}
+
+/// This GV is a pointer root. Loop over all users of the global and clean up
+/// any that obviously don't assign the global a dynamically allocated value.
+static bool
+CleanupPointerRootUsers(GlobalVariable *GV,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ // A brief explanation of leak checkers. The goal is to find bugs where
+ // pointers are forgotten, causing an accumulating growth in memory
+ // usage over time. The common strategy for leak checkers is to explicitly
+ // allow the memory pointed to by globals at exit. This is popular because it
+ // also solves another problem where the main thread of a C++ program may shut
+ // down before other threads that are still expecting to use those globals. To
+ // handle that case, we expect the program may create a singleton and never
+ // destroy it.
+
+ bool Changed = false;
+
+ // If Dead[n].first is the only use of a malloc result, we can delete its
+ // chain of computation and the store to the global in Dead[n].second.
+ SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
+
+ // Constants can't be pointers to dynamically allocated memory.
+ for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
+ UI != E;) {
+ User *U = *UI++;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ Value *V = SI->getValueOperand();
+ if (isa<Constant>(V)) {
+ Changed = true;
+ SI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, SI));
+ }
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
+ if (isa<Constant>(MSI->getValue())) {
+ Changed = true;
+ MSI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MSI));
+ }
+ } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
+ GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
+ if (MemSrc && MemSrc->isConstant()) {
+ Changed = true;
+ MTI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MTI));
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ if (isSafeToDestroyConstant(C)) {
+ C->destroyConstant();
+ // This could have invalidated UI, start over from scratch.
+ Dead.clear();
+ CleanupPointerRootUsers(GV, GetTLI);
+ return true;
+ }
+ }
+ }
+
+ for (int i = 0, e = Dead.size(); i != e; ++i) {
+ if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) {
+ Dead[i].second->eraseFromParent();
+ Instruction *I = Dead[i].first;
+ do {
+ if (isAllocationFn(I, GetTLI))
+ break;
+ Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
+ if (!J)
+ break;
+ I->eraseFromParent();
+ I = J;
+ } while (true);
+ I->eraseFromParent();
Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// We just marked GV constant. Loop over all users of the global, cleaning up
-/// the obvious ones. This is largely just a quick scan over the use list to
-/// clean up the easy and obvious cruft. This returns true if it made a change.
-static bool CleanupConstantGlobalUsers(
- Value *V, Constant *Init, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- bool Changed = false;
- // Note that we need to use a weak value handle for the worklist items. When
- // we delete a constant array, we may also be holding a pointer to one of its
- // elements (or an element of one of its elements if we're dealing with an
- // array of arrays) in the worklist.
+ }
+ }
+
+ return Changed;
+}
+
+/// We just marked GV constant. Loop over all users of the global, cleaning up
+/// the obvious ones. This is largely just a quick scan over the use list to
+/// clean up the easy and obvious cruft. This returns true if it made a change.
+static bool CleanupConstantGlobalUsers(
+ Value *V, Constant *Init, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ bool Changed = false;
+ // Note that we need to use a weak value handle for the worklist items. When
+ // we delete a constant array, we may also be holding a pointer to one of its
+ // elements (or an element of one of its elements if we're dealing with an
+ // array of arrays) in the worklist.
SmallVector<WeakTrackingVH, 8> WorkList(V->users());
- while (!WorkList.empty()) {
- Value *UV = WorkList.pop_back_val();
- if (!UV)
- continue;
-
- User *U = cast<User>(UV);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (Init) {
- // Replace the load with the initializer.
- LI->replaceAllUsesWith(Init);
- LI->eraseFromParent();
- Changed = true;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- // Store must be unreachable or storing Init into the global.
- SI->eraseFromParent();
- Changed = true;
- } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- Constant *SubInit = nullptr;
- if (Init)
- SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
- Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI);
- } else if ((CE->getOpcode() == Instruction::BitCast &&
- CE->getType()->isPointerTy()) ||
- CE->getOpcode() == Instruction::AddrSpaceCast) {
- // Pointer cast, delete any stores and memsets to the global.
- Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI);
- }
-
- if (CE->use_empty()) {
- CE->destroyConstant();
- Changed = true;
- }
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
- // Do not transform "gepinst (gep constexpr (GV))" here, because forming
- // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
- // and will invalidate our notion of what Init is.
- Constant *SubInit = nullptr;
- if (!isa<ConstantExpr>(GEP->getOperand(0))) {
- ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
- ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction())));
- if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
- SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
-
- // If the initializer is an all-null value and we have an inbounds GEP,
- // we already know what the result of any load from that GEP is.
- // TODO: Handle splats.
- if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
- SubInit = Constant::getNullValue(GEP->getResultElementType());
- }
- Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI);
-
- if (GEP->use_empty()) {
- GEP->eraseFromParent();
- Changed = true;
- }
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
- if (MI->getRawDest() == V) {
- MI->eraseFromParent();
- Changed = true;
- }
-
- } else if (Constant *C = dyn_cast<Constant>(U)) {
- // If we have a chain of dead constantexprs or other things dangling from
- // us, and if they are all dead, nuke them without remorse.
- if (isSafeToDestroyConstant(C)) {
- C->destroyConstant();
- CleanupConstantGlobalUsers(V, Init, DL, GetTLI);
- return true;
- }
- }
- }
- return Changed;
-}
-
-static bool isSafeSROAElementUse(Value *V);
-
-/// Return true if the specified GEP is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAGEP(User *U) {
- // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
- // don't like < 3 operand CE's, and we don't like non-constant integer
- // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
- // value of C.
- if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
- !cast<Constant>(U->getOperand(1))->isNullValue())
- return false;
-
- gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
- ++GEPI; // Skip over the pointer index.
-
- // For all other levels we require that the indices are constant and in range.
- // In particular, consider: A[0][i]. We cannot know that the user isn't doing
- // invalid things like allowing i to index an out-of-range subscript that
- // accesses A[1]. This can also happen between different members of a struct
- // in llvm IR.
- for (; GEPI != E; ++GEPI) {
- if (GEPI.isStruct())
- continue;
-
- ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
- if (!IdxVal || (GEPI.isBoundedSequential() &&
- IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
- return false;
- }
-
- return llvm::all_of(U->users(),
- [](User *UU) { return isSafeSROAElementUse(UU); });
-}
-
-/// Return true if the specified instruction is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAElementUse(Value *V) {
- // We might have a dead and dangling constant hanging off of here.
- if (Constant *C = dyn_cast<Constant>(V))
- return isSafeToDestroyConstant(C);
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // Loads are ok.
- if (isa<LoadInst>(I)) return true;
-
- // Stores *to* the pointer are ok.
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getOperand(0) != V;
-
- // Otherwise, it must be a GEP. Check it and its users are safe to SRA.
- return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I);
-}
-
-/// Look at all uses of the global and decide whether it is safe for us to
-/// perform this transformation.
-static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
- for (User *U : GV->users()) {
- // The user of the global must be a GEP Inst or a ConstantExpr GEP.
- if (!isa<GetElementPtrInst>(U) &&
- (!isa<ConstantExpr>(U) ||
- cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
- return false;
-
- // Check that the GEP and its users are safe to SRA.
- if (!isSafeSROAGEP(U))
- return false;
- }
-
- return true;
-}
-
-static bool IsSRASequential(Type *T) {
- return isa<ArrayType>(T) || isa<VectorType>(T);
-}
-static uint64_t GetSRASequentialNumElements(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getNumElements();
- return cast<FixedVectorType>(T)->getNumElements();
-}
-static Type *GetSRASequentialElementType(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getElementType();
- return cast<VectorType>(T)->getElementType();
-}
-static bool CanDoGlobalSRA(GlobalVariable *GV) {
- Constant *Init = GV->getInitializer();
-
- if (isa<StructType>(Init->getType())) {
- // nothing to check
- } else if (IsSRASequential(Init->getType())) {
- if (GetSRASequentialNumElements(Init->getType()) > 16 &&
- GV->hasNUsesOrMore(16))
- return false; // It's not worth it.
- } else
- return false;
-
- return GlobalUsersSafeToSRA(GV);
-}
-
-/// Copy over the debug info for a variable to its SRA replacements.
-static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
- uint64_t FragmentOffsetInBits,
- uint64_t FragmentSizeInBits,
- uint64_t VarSize) {
- SmallVector<DIGlobalVariableExpression *, 1> GVs;
- GV->getDebugInfo(GVs);
- for (auto *GVE : GVs) {
- DIVariable *Var = GVE->getVariable();
- DIExpression *Expr = GVE->getExpression();
- // If the FragmentSize is smaller than the variable,
- // emit a fragment expression.
- if (FragmentSizeInBits < VarSize) {
- if (auto E = DIExpression::createFragmentExpression(
- Expr, FragmentOffsetInBits, FragmentSizeInBits))
- Expr = *E;
- else
- return;
- }
- auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
- NGV->addDebugInfo(NGVE);
- }
-}
-
-/// Perform scalar replacement of aggregates on the specified global variable.
-/// This opens the door for other optimizations by exposing the behavior of the
-/// program in a more fine-grained way. We have determined that this
-/// transformation is safe already. We return the first global variable we
-/// insert so that the caller can reprocess it.
-static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
- // Make sure this global only has simple uses that we can SRA.
- if (!CanDoGlobalSRA(GV))
- return nullptr;
-
- assert(GV->hasLocalLinkage());
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- uint64_t VarSize = DL.getTypeSizeInBits(Ty);
-
- std::map<unsigned, GlobalVariable *> NewGlobals;
-
- // Get the alignment of the global, either explicit or target-specific.
- Align StartAlignment =
- DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());
-
- // Loop over all users and create replacement variables for used aggregate
- // elements.
- for (User *GEP : GV->users()) {
- assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() ==
- Instruction::GetElementPtr) ||
- isa<GetElementPtrInst>(GEP)) &&
- "NonGEP CE's are not SRAable!");
-
- // Ignore operand 1, which has to be zero or else the program is quite
- // broken (undefined). Get operand 2, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- if (NewGlobals.count(ElementIdx) == 1)
- continue; // we've already created a replacement variable
- assert(NewGlobals.count(ElementIdx) == 0);
-
- Type *ElTy = nullptr;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- ElTy = STy->getElementType(ElementIdx);
- else
- ElTy = GetSRASequentialElementType(Ty);
- assert(ElTy);
-
- Constant *In = Init->getAggregateElement(ElementIdx);
- assert(In && "Couldn't get element of initializer?");
-
- GlobalVariable *NGV = new GlobalVariable(
- ElTy, false, GlobalVariable::InternalLinkage, In,
- GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(),
- GV->getType()->getAddressSpace());
- NGV->setExternallyInitialized(GV->isExternallyInitialized());
- NGV->copyAttributesFrom(GV);
- NewGlobals.insert(std::make_pair(ElementIdx, NGV));
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout &Layout = *DL.getStructLayout(STy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
- Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
- if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
- NGV->setAlignment(NewAlign);
-
- // Copy over the debug info for the variable.
- uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
- uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
- transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize);
- } else {
- uint64_t EltSize = DL.getTypeAllocSize(ElTy);
- Align EltAlign = DL.getABITypeAlign(ElTy);
- uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
- if (NewAlign > EltAlign)
- NGV->setAlignment(NewAlign);
- transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
- FragmentSizeInBits, VarSize);
- }
- }
-
- if (NewGlobals.empty())
- return nullptr;
-
- Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
- for (auto NewGlobalVar : NewGlobals)
- Globals.push_back(NewGlobalVar.second);
-
- LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
-
- Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
-
- // Loop over all of the uses of the global, replacing the constantexpr geps,
- // with smaller constantexpr geps or direct references.
- while (!GV->use_empty()) {
- User *GEP = GV->user_back();
- assert(((isa<ConstantExpr>(GEP) &&
- cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
- isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
-
- // Ignore operand 1, which has to be zero or else the program is quite
- // broken (undefined). Get operand 2, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- assert(NewGlobals.count(ElementIdx) == 1);
-
- Value *NewPtr = NewGlobals[ElementIdx];
- Type *NewTy = NewGlobals[ElementIdx]->getValueType();
-
- // Form a shorter GEP if needed.
- if (GEP->getNumOperands() > 3) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
- SmallVector<Constant*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
- Idxs.push_back(CE->getOperand(i));
- NewPtr =
- ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
- } else {
- GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
- SmallVector<Value*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
- Idxs.push_back(GEPI->getOperand(i));
- NewPtr = GetElementPtrInst::Create(
- NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx),
- GEPI);
- }
- }
- GEP->replaceAllUsesWith(NewPtr);
-
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
- GEPI->eraseFromParent();
- else
- cast<ConstantExpr>(GEP)->destroyConstant();
- }
-
- // Delete the old global, now that it is dead.
- Globals.erase(GV);
- ++NumSRA;
-
- assert(NewGlobals.size() > 0);
- return NewGlobals.begin()->second;
-}
-
-/// Return true if all users of the specified value will trap if the value is
-/// dynamically null. PHIs keeps track of any phi nodes we've seen to avoid
-/// reprocessing them.
-static bool AllUsesOfValueWillTrapIfNull(const Value *V,
- SmallPtrSetImpl<const PHINode*> &PHIs) {
- for (const User *U : V->users()) {
- if (const Instruction *I = dyn_cast<Instruction>(U)) {
- // If null pointer is considered valid, then all uses are non-trapping.
- // Non address-space 0 globals have already been pruned by the caller.
- if (NullPointerIsDefined(I->getFunction()))
- return false;
- }
- if (isa<LoadInst>(U)) {
- // Will trap.
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getOperand(0) == V) {
- //cerr << "NONTRAPPING USE: " << *U;
- return false; // Storing the value.
- }
- } else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
- if (CI->getCalledOperand() != V) {
- //cerr << "NONTRAPPING USE: " << *U;
- return false; // Not calling the ptr
- }
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) {
- if (II->getCalledOperand() != V) {
- //cerr << "NONTRAPPING USE: " << *U;
- return false; // Not calling the ptr
- }
- } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) {
- if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
- } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
- if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
- } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
- // If we've already seen this phi node, ignore it, it has already been
- // checked.
- if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
- return false;
- } else {
- //cerr << "NONTRAPPING USE: " << *U;
- return false;
- }
- }
- return true;
-}
-
-/// Return true if all uses of any loads from GV will trap if the loaded value
-/// is null. Note that this also permits comparisons of the loaded value
-/// against null, as a special case.
-static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
- for (const User *U : GV->users())
- if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
- SmallPtrSet<const PHINode*, 8> PHIs;
- if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
- return false;
- } else if (isa<StoreInst>(U)) {
- // Ignore stores to the global.
- } else {
- // We don't know or understand this user, bail out.
- //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
- return false;
- }
- return true;
-}
-
-static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
- bool Changed = false;
- for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
- Instruction *I = cast<Instruction>(*UI++);
- // Uses are non-trapping if null pointer is considered valid.
- // Non address-space 0 globals are already pruned by the caller.
- if (NullPointerIsDefined(I->getFunction()))
- return false;
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- LI->setOperand(0, NewV);
- Changed = true;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (SI->getOperand(1) == V) {
- SI->setOperand(1, NewV);
- Changed = true;
- }
- } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- CallBase *CB = cast<CallBase>(I);
- if (CB->getCalledOperand() == V) {
- // Calling through the pointer! Turn into a direct call, but be careful
- // that the pointer is not also being passed as an argument.
- CB->setCalledOperand(NewV);
- Changed = true;
- bool PassedAsArg = false;
- for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
- if (CB->getArgOperand(i) == V) {
- PassedAsArg = true;
- CB->setArgOperand(i, NewV);
- }
-
- if (PassedAsArg) {
- // Being passed as an argument also. Be careful to not invalidate UI!
- UI = V->user_begin();
- }
- }
- } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
- Changed |= OptimizeAwayTrappingUsesOfValue(CI,
- ConstantExpr::getCast(CI->getOpcode(),
- NewV, CI->getType()));
- if (CI->use_empty()) {
- Changed = true;
- CI->eraseFromParent();
- }
- } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
- // Should handle GEP here.
- SmallVector<Constant*, 8> Idxs;
- Idxs.reserve(GEPI->getNumOperands()-1);
- for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
- i != e; ++i)
- if (Constant *C = dyn_cast<Constant>(*i))
- Idxs.push_back(C);
- else
- break;
- if (Idxs.size() == GEPI->getNumOperands()-1)
- Changed |= OptimizeAwayTrappingUsesOfValue(
- GEPI, ConstantExpr::getGetElementPtr(GEPI->getSourceElementType(),
- NewV, Idxs));
- if (GEPI->use_empty()) {
- Changed = true;
- GEPI->eraseFromParent();
- }
- }
- }
-
- return Changed;
-}
-
-/// The specified global has only one non-null value stored into it. If there
-/// are uses of the loaded value that would trap if the loaded value is
-/// dynamically null, then we know that they cannot be reachable with a null
-/// value, so we can optimize away the load.
-static bool OptimizeAwayTrappingUsesOfLoads(
- GlobalVariable *GV, Constant *LV, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- bool Changed = false;
-
- // Keep track of whether we are able to remove all the uses of the global
- // other than the store that defines it.
- bool AllNonStoreUsesGone = true;
-
- // Replace all uses of loads with uses of uses of the stored value.
- for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){
- User *GlobalUser = *GUI++;
- if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
- Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
- // If we were able to delete all uses of the loads
- if (LI->use_empty()) {
- LI->eraseFromParent();
- Changed = true;
- } else {
- AllNonStoreUsesGone = false;
- }
- } else if (isa<StoreInst>(GlobalUser)) {
- // Ignore the store that stores "LV" to the global.
- assert(GlobalUser->getOperand(1) == GV &&
- "Must be storing *to* the global");
- } else {
- AllNonStoreUsesGone = false;
-
- // If we get here we could have other crazy uses that are transitively
- // loaded.
- assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
- isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
- isa<BitCastInst>(GlobalUser) ||
- isa<GetElementPtrInst>(GlobalUser)) &&
- "Only expect load and stores!");
- }
- }
-
- if (Changed) {
- LLVM_DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
- << "\n");
- ++NumGlobUses;
- }
-
- // If we nuked all of the loads, then none of the stores are needed either,
- // nor is the global.
- if (AllNonStoreUsesGone) {
- if (isLeakCheckerRoot(GV)) {
- Changed |= CleanupPointerRootUsers(GV, GetTLI);
- } else {
- Changed = true;
- CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI);
- }
- if (GV->use_empty()) {
- LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
- Changed = true;
- GV->eraseFromParent();
- ++NumDeleted;
- }
- }
- return Changed;
-}
-
-/// Walk the use list of V, constant folding all of the instructions that are
-/// foldable.
-static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
- if (Instruction *I = dyn_cast<Instruction>(*UI++))
- if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) {
- I->replaceAllUsesWith(NewC);
-
- // Advance UI to the next non-I use to avoid invalidating it!
- // Instructions could multiply use V.
- while (UI != E && *UI == I)
- ++UI;
- if (isInstructionTriviallyDead(I, TLI))
- I->eraseFromParent();
- }
-}
-
-/// This function takes the specified global variable, and transforms the
-/// program as if it always contained the result of the specified malloc.
-/// Because it is always the result of the specified malloc, there is no reason
-/// to actually DO the malloc. Instead, turn the malloc into a global, and
-/// rewrite any loads of GV as uses of the new global.
-static GlobalVariable *
-OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
- ConstantInt *NElements, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
- << '\n');
-
- Type *GlobalType;
- if (NElements->getZExtValue() == 1)
- GlobalType = AllocTy;
- else
- // If we have an array allocation, the global variable is of array type.
- GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
-
- // Create the new global variable. The contents of the malloc'd memory are
- // undefined, so initialize with an undef value.
- GlobalVariable *NewGV = new GlobalVariable(
- *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
- UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
- GV->getThreadLocalMode());
-
- // If there are bitcast users of the malloc (which is typical, usually we have
- // a malloc + bitcast) then replace them with uses of the new global. Update
- // other users to use the global as well.
- BitCastInst *TheBC = nullptr;
- while (!CI->use_empty()) {
- Instruction *User = cast<Instruction>(CI->user_back());
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
- if (BCI->getType() == NewGV->getType()) {
- BCI->replaceAllUsesWith(NewGV);
- BCI->eraseFromParent();
- } else {
- BCI->setOperand(0, NewGV);
- }
- } else {
- if (!TheBC)
- TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
- User->replaceUsesOfWith(CI, TheBC);
- }
- }
-
- Constant *RepValue = NewGV;
- if (NewGV->getType() != GV->getValueType())
- RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());
-
- // If there is a comparison against null, we will insert a global bool to
- // keep track of whether the global was initialized yet or not.
- GlobalVariable *InitBool =
- new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
- GlobalValue::InternalLinkage,
- ConstantInt::getFalse(GV->getContext()),
- GV->getName()+".init", GV->getThreadLocalMode());
- bool InitBoolUsed = false;
-
- // Loop over all uses of GV, processing them in turn.
- while (!GV->use_empty()) {
- if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
- // The global is initialized when the store to it occurs.
- new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false,
- Align(1), SI->getOrdering(), SI->getSyncScopeID(), SI);
- SI->eraseFromParent();
- continue;
- }
-
- LoadInst *LI = cast<LoadInst>(GV->user_back());
- while (!LI->use_empty()) {
- Use &LoadUse = *LI->use_begin();
- ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser());
- if (!ICI) {
- LoadUse = RepValue;
- continue;
- }
-
- // Replace the cmp X, 0 with a use of the bool value.
- // Sink the load to where the compare was, if atomic rules allow us to.
- Value *LV = new LoadInst(InitBool->getValueType(), InitBool,
- InitBool->getName() + ".val", false, Align(1),
- LI->getOrdering(), LI->getSyncScopeID(),
- LI->isUnordered() ? (Instruction *)ICI : LI);
- InitBoolUsed = true;
- switch (ICI->getPredicate()) {
- default: llvm_unreachable("Unknown ICmp Predicate!");
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT: // X < null -> always false
- LV = ConstantInt::getFalse(GV->getContext());
- break;
- case ICmpInst::ICMP_ULE:
- case ICmpInst::ICMP_SLE:
- case ICmpInst::ICMP_EQ:
- LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
- break;
- case ICmpInst::ICMP_NE:
- case ICmpInst::ICMP_UGE:
- case ICmpInst::ICMP_SGE:
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT:
- break; // no change.
- }
- ICI->replaceAllUsesWith(LV);
- ICI->eraseFromParent();
- }
- LI->eraseFromParent();
- }
-
- // If the initialization boolean was used, insert it, otherwise delete it.
- if (!InitBoolUsed) {
- while (!InitBool->use_empty()) // Delete initializations
- cast<StoreInst>(InitBool->user_back())->eraseFromParent();
- delete InitBool;
- } else
- GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
-
- // Now that the GV is dead, nuke it and the malloc.
- GV->eraseFromParent();
- CI->eraseFromParent();
-
- // To further other optimizations, loop over all users of NewGV and try to
- // constant prop them. This will promote GEP instructions with constant
- // indices into GEP constant-exprs, which will allow global-opt to hack on it.
- ConstantPropUsersOf(NewGV, DL, TLI);
- if (RepValue != NewGV)
- ConstantPropUsersOf(RepValue, DL, TLI);
-
- return NewGV;
-}
-
-/// Scan the use-list of V checking to make sure that there are no complex uses
-/// of V. We permit simple things like dereferencing the pointer, but not
-/// storing through the address, unless it is to the specified global.
-static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
- const GlobalVariable *GV,
- SmallPtrSetImpl<const PHINode*> &PHIs) {
- for (const User *U : V->users()) {
- const Instruction *Inst = cast<Instruction>(U);
-
- if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
- continue; // Fine, ignore.
- }
-
- if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
- return false; // Storing the pointer itself... bad.
- continue; // Otherwise, storing through it, or storing into GV... fine.
- }
-
- // Must index into the array and into the struct.
- if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
- return false;
- continue;
- }
-
- if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
- // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI
- // cycles.
- if (PHIs.insert(PN).second)
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
- return false;
- continue;
- }
-
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
- return false;
- continue;
- }
-
- return false;
- }
- return true;
-}
-
-/// The Alloc pointer is stored into GV somewhere. Transform all uses of the
-/// allocation into loads from the global and uses of the resultant pointer.
- /// Further, delete the store into GV. This assumes that these values pass the
-/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
-static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
- GlobalVariable *GV) {
- while (!Alloc->use_empty()) {
- Instruction *U = cast<Instruction>(*Alloc->user_begin());
- Instruction *InsertPt = U;
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- // If this is the store of the allocation into the global, remove it.
- if (SI->getOperand(1) == GV) {
- SI->eraseFromParent();
- continue;
- }
- } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
- // Insert the load in the corresponding predecessor, not right before the
- // PHI.
- InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator();
- } else if (isa<BitCastInst>(U)) {
- // Must be bitcast between the malloc and store to initialize the global.
- ReplaceUsesOfMallocWithGlobal(U, GV);
- U->eraseFromParent();
- continue;
- } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
- // If this is a "GEP bitcast" and the user is a store to the global, then
- // just process it as a bitcast.
- if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
- if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->user_back()))
- if (SI->getOperand(1) == GV) {
- // Must be bitcast GEP between the malloc and store to initialize
- // the global.
- ReplaceUsesOfMallocWithGlobal(GEPI, GV);
- GEPI->eraseFromParent();
- continue;
- }
- }
-
- // Insert a load from the global, and use it instead of the malloc.
- Value *NL =
- new LoadInst(GV->getValueType(), GV, GV->getName() + ".val", InsertPt);
- U->replaceUsesOfWith(Alloc, NL);
- }
-}
-
-/// Verify that all uses of V (a load, or a phi of a load) are simple enough to
-/// perform heap SRA on. This permits GEP's that index through the array and
-/// struct field, icmps of null, and PHIs.
-static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
- SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
- SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
- // We permit two users of the load: setcc comparing against the null
- // pointer, and a getelementptr of a specific form.
- for (const User *U : V->users()) {
- const Instruction *UI = cast<Instruction>(U);
-
- // Comparison against null is ok.
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UI)) {
- if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
- return false;
- continue;
- }
-
- // getelementptr is also ok, but only a simple form.
- if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
- // Must index into the array and into the struct.
- if (GEPI->getNumOperands() < 3)
- return false;
-
- // Otherwise the GEP is ok.
- continue;
- }
-
- if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
- if (!LoadUsingPHIsPerLoad.insert(PN).second)
- // This means some phi nodes are dependent on each other.
- // Avoid infinite looping!
- return false;
- if (!LoadUsingPHIs.insert(PN).second)
- // If we have already analyzed this PHI, then it is safe.
- continue;
-
- // Make sure all uses of the PHI are simple enough to transform.
- if (!LoadUsesSimpleEnoughForHeapSRA(PN,
- LoadUsingPHIs, LoadUsingPHIsPerLoad))
- return false;
-
- continue;
- }
-
- // Otherwise we don't know what this is, not ok.
- return false;
- }
-
- return true;
-}
-
-/// If all users of values loaded from GV are simple enough to perform HeapSRA,
-/// return true.
-static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
- Instruction *StoredVal) {
- SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
- SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;
- for (const User *U : GV->users())
- if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
- LoadUsingPHIsPerLoad))
- return false;
- LoadUsingPHIsPerLoad.clear();
- }
-
- // If we reach here, we know that all uses of the loads and transitive uses
- // (through PHI nodes) are simple enough to transform. However, we don't know
- // that all inputs to the PHI nodes are in the same equivalence sets.
- // Check to verify that all operands of the PHIs are either PHIs that can be
- // transformed, loads from GV, or MI itself.
- for (const PHINode *PN : LoadUsingPHIs) {
- for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
- Value *InVal = PN->getIncomingValue(op);
-
- // PHI of the stored value itself is ok.
- if (InVal == StoredVal) continue;
-
- if (const PHINode *InPN = dyn_cast<PHINode>(InVal)) {
- // One of the PHIs in our set is (optimistically) ok.
- if (LoadUsingPHIs.count(InPN))
- continue;
- return false;
- }
-
- // Load from GV is ok.
- if (const LoadInst *LI = dyn_cast<LoadInst>(InVal))
- if (LI->getOperand(0) == GV)
- continue;
-
- // UNDEF? NULL?
-
- // Anything else is rejected.
- return false;
- }
- }
-
- return true;
-}
-
-static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
- DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
- std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
- std::vector<Value *> &FieldVals = InsertedScalarizedValues[V];
-
- if (FieldNo >= FieldVals.size())
- FieldVals.resize(FieldNo+1);
-
- // If we already have this value, just reuse the previously scalarized
- // version.
- if (Value *FieldVal = FieldVals[FieldNo])
- return FieldVal;
-
- // Depending on what instruction this is, we have several cases.
- Value *Result;
- if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
- // This is a scalarized version of the load from the global. Just create
- // a new Load of the scalarized global.
- Value *V = GetHeapSROAValue(LI->getOperand(0), FieldNo,
- InsertedScalarizedValues, PHIsToRewrite);
- Result = new LoadInst(V->getType()->getPointerElementType(), V,
- LI->getName() + ".f" + Twine(FieldNo), LI);
- } else {
- PHINode *PN = cast<PHINode>(V);
- // PN's type is pointer to struct. Make a new PHI of pointer to struct
- // field.
-
- PointerType *PTy = cast<PointerType>(PN->getType());
- StructType *ST = cast<StructType>(PTy->getElementType());
-
- unsigned AS = PTy->getAddressSpace();
- PHINode *NewPN =
- PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS),
- PN->getNumIncomingValues(),
- PN->getName()+".f"+Twine(FieldNo), PN);
- Result = NewPN;
- PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
- }
-
- return FieldVals[FieldNo] = Result;
-}
-
-/// Given a load instruction and a value derived from the load, rewrite the
-/// derived value to use the HeapSRoA'd load.
-static void RewriteHeapSROALoadUser(Instruction *LoadUser,
- DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
- std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
- // If this is a comparison against null, handle it.
- if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
- assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
- // If we have a setcc of the loaded pointer, we can use a setcc of any
- // field.
- Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
- InsertedScalarizedValues, PHIsToRewrite);
-
- Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
- Constant::getNullValue(NPtr->getType()),
- SCI->getName());
- SCI->replaceAllUsesWith(New);
- SCI->eraseFromParent();
- return;
- }
-
- // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
- assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
- && "Unexpected GEPI!");
-
- // Load the pointer for this field.
- unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
- Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
- InsertedScalarizedValues, PHIsToRewrite);
-
- // Create the new GEP idx vector.
- SmallVector<Value*, 8> GEPIdx;
- GEPIdx.push_back(GEPI->getOperand(1));
- GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
-
- Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx,
- GEPI->getName(), GEPI);
- GEPI->replaceAllUsesWith(NGEPI);
- GEPI->eraseFromParent();
- return;
- }
-
- // Recursively transform the users of PHI nodes. This will lazily create the
- // PHIs that are needed for individual elements. Keep track of what PHIs we
- // see in InsertedScalarizedValues so that we don't get infinite loops (very
- // antisocial). If the PHI is already in InsertedScalarizedValues, it has
- // already been seen first by another load, so its uses have already been
- // processed.
- PHINode *PN = cast<PHINode>(LoadUser);
- if (!InsertedScalarizedValues.insert(std::make_pair(PN,
- std::vector<Value *>())).second)
- return;
-
- // If this is the first time we've seen this PHI, recursively process all
- // users.
- for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
- Instruction *User = cast<Instruction>(*UI++);
- RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
- }
-}
-
-/// We are performing Heap SRoA on a global. Ptr is a value loaded from the
-/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead.
-/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
-static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
- DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
- std::vector<std::pair<PHINode *, unsigned> > &PHIsToRewrite) {
- for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) {
- Instruction *User = cast<Instruction>(*UI++);
- RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
- }
-
- if (Load->use_empty()) {
- Load->eraseFromParent();
- InsertedScalarizedValues.erase(Load);
- }
-}
-
-/// CI is an allocation of an array of structures. Break it up into multiple
-/// allocations of arrays of the fields.
-static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
- Value *NElems, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- LLVM_DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI
- << '\n');
- Type *MAT = getMallocAllocatedType(CI, TLI);
- StructType *STy = cast<StructType>(MAT);
-
- // There is guaranteed to be at least one use of the malloc (storing
- // it into GV). If there are other uses, change them to be uses of
- // the global to simplify later code. This also deletes the store
- // into GV.
- ReplaceUsesOfMallocWithGlobal(CI, GV);
-
- // Okay, at this point, there are no users of the malloc. Insert N
- // new mallocs at the same place as CI, and N globals.
- std::vector<Value *> FieldGlobals;
- std::vector<Value *> FieldMallocs;
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
-
- unsigned AS = GV->getType()->getPointerAddressSpace();
- for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
- Type *FieldTy = STy->getElementType(FieldNo);
- PointerType *PFieldTy = PointerType::get(FieldTy, AS);
-
- GlobalVariable *NGV = new GlobalVariable(
- *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
- Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
- nullptr, GV->getThreadLocalMode());
- NGV->copyAttributesFrom(GV);
- FieldGlobals.push_back(NGV);
-
- unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
- if (StructType *ST = dyn_cast<StructType>(FieldTy))
- TypeSize = DL.getStructLayout(ST)->getSizeInBytes();
- Type *IntPtrTy = DL.getIntPtrType(CI->getType());
- Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
- ConstantInt::get(IntPtrTy, TypeSize),
- NElems, OpBundles, nullptr,
- CI->getName() + ".f" + Twine(FieldNo));
- FieldMallocs.push_back(NMI);
- new StoreInst(NMI, NGV, CI);
- }
-
- // The tricky aspect of this transformation is handling the case when malloc
- // fails. In the original code, malloc failing would set the result pointer
- // of malloc to null. In this case, some mallocs could succeed and others
- // could fail. As such, we emit code that looks like this:
- // F0 = malloc(field0)
- // F1 = malloc(field1)
- // F2 = malloc(field2)
- // if (F0 == 0 || F1 == 0 || F2 == 0) {
- // if (F0) { free(F0); F0 = 0; }
- // if (F1) { free(F1); F1 = 0; }
- // if (F2) { free(F2); F2 = 0; }
- // }
- // The malloc can also fail if its argument is too large.
- Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
- Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
- ConstantZero, "isneg");
- for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
- Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
- Constant::getNullValue(FieldMallocs[i]->getType()),
- "isnull");
- RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
- }
-
- // Split the basic block at the old malloc.
- BasicBlock *OrigBB = CI->getParent();
- BasicBlock *ContBB =
- OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");
-
- // Create the block to check the first condition. Put all these blocks at the
- // end of the function as they are unlikely to be executed.
- BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
- "malloc_ret_null",
- OrigBB->getParent());
-
- // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
- // branch on RunningOr.
- OrigBB->getTerminator()->eraseFromParent();
- BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
-
- // Within the NullPtrBlock, we need to emit a comparison and branch for each
- // pointer, because some may be null while others are not.
- for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
- Value *GVVal =
- new LoadInst(cast<GlobalVariable>(FieldGlobals[i])->getValueType(),
- FieldGlobals[i], "tmp", NullPtrBlock);
- Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
- Constant::getNullValue(GVVal->getType()));
- BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
- OrigBB->getParent());
- BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
- OrigBB->getParent());
- Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
- Cmp, NullPtrBlock);
-
- // Fill in FreeBlock.
- CallInst::CreateFree(GVVal, OpBundles, BI);
- new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
- FreeBlock);
- BranchInst::Create(NextBlock, FreeBlock);
-
- NullPtrBlock = NextBlock;
- }
-
- BranchInst::Create(ContBB, NullPtrBlock);
-
- // CI is no longer needed, remove it.
- CI->eraseFromParent();
-
- /// As we process loads, if we can't immediately update all uses of the load,
- /// keep track of what scalarized loads are inserted for a given load.
- DenseMap<Value *, std::vector<Value *>> InsertedScalarizedValues;
- InsertedScalarizedValues[GV] = FieldGlobals;
-
- std::vector<std::pair<PHINode *, unsigned>> PHIsToRewrite;
-
- // Okay, the malloc site is completely handled. All of the uses of GV are now
- // loads, and all uses of those loads are simple. Rewrite them to use loads
- // of the per-field globals instead.
- for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) {
- Instruction *User = cast<Instruction>(*UI++);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
- continue;
- }
-
- // Must be a store of null.
- StoreInst *SI = cast<StoreInst>(User);
- assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
- "Unexpected heap-sra user!");
-
- // Insert a store of null into each global.
- for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
- Type *ValTy = cast<GlobalValue>(FieldGlobals[i])->getValueType();
- Constant *Null = Constant::getNullValue(ValTy);
- new StoreInst(Null, FieldGlobals[i], SI);
- }
- // Erase the original store.
- SI->eraseFromParent();
- }
-
- // While we have PHIs that are interesting to rewrite, do it.
- while (!PHIsToRewrite.empty()) {
- PHINode *PN = PHIsToRewrite.back().first;
- unsigned FieldNo = PHIsToRewrite.back().second;
- PHIsToRewrite.pop_back();
- PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
- assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
-
- // Add all the incoming values. This can materialize more phis.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *InVal = PN->getIncomingValue(i);
- InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
- PHIsToRewrite);
- FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
- }
- }
-
- // Drop all inter-phi links and any loads that made it this far.
- for (DenseMap<Value *, std::vector<Value *>>::iterator
- I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
- I != E; ++I) {
- if (PHINode *PN = dyn_cast<PHINode>(I->first))
- PN->dropAllReferences();
- else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
- LI->dropAllReferences();
- }
-
- // Delete all the phis and loads now that inter-references are dead.
- for (DenseMap<Value *, std::vector<Value *>>::iterator
- I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
- I != E; ++I) {
- if (PHINode *PN = dyn_cast<PHINode>(I->first))
- PN->eraseFromParent();
- else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
- LI->eraseFromParent();
- }
-
- // The old global is now dead, remove it.
- GV->eraseFromParent();
-
- ++NumHeapSRA;
- return cast<GlobalVariable>(FieldGlobals[0]);
-}
-
-/// This function is called when we see a pointer global variable with a single
-/// value stored into it that is a malloc or a cast of a malloc.
-static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
- Type *AllocTy,
- AtomicOrdering Ordering,
- const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- // If this is a malloc of an abstract type, don't touch it.
- if (!AllocTy->isSized())
- return false;
-
- // We can't optimize this global unless all uses of it are *known* to be
- // of the malloc value, not of the null initializer value (consider a use
- // that compares the global's value against zero to see if the malloc has
- // been reached). To do this, we check to see if all uses of the global
- // would trap if the global were null: this proves that they must all
- // happen after the malloc.
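- // For illustration only (hypothetical IR, simplified): a use such as
- //   %p = load i8*, i8** @g
- //   %c = icmp eq i8* %p, null   ; does not trap when %p is null
- // blocks this optimization, whereas
- //   %v = load i8, i8* %p        ; would trap when %p is null
- // is acceptable, since it proves the use happens after the malloc.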
- if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
- return false;
-
- // We can't optimize this if the malloc itself is used in a complex way,
- // for example, being stored into multiple globals. This allows the
- // malloc to be stored into the specified global, loaded, icmp'd, and
- // GEP'd. These are all uses we could transform to use the global
- // instead.
- SmallPtrSet<const PHINode*, 8> PHIs;
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
- return false;
-
- // If we have a global that is only initialized with a fixed size malloc,
- // transform the program to use global memory instead of malloc'd memory.
- // This eliminates dynamic allocation, avoids an indirection accessing the
- // data, and exposes the resultant global to further GlobalOpt.
- // We cannot optimize the malloc if we cannot determine malloc array size.
- Value *NElems = getMallocArraySize(CI, DL, TLI, true);
- if (!NElems)
- return false;
-
- if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
- // Restrict this transformation to only working on small allocations
- // (2048 bytes currently), as we don't want to introduce a 16M global or
- // something.
- if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
- OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
- return true;
- }
-
- // If the allocation is an array of structures, consider transforming this
- // into multiple malloc'd arrays, one for each field. This is basically
- // SRoA for malloc'd memory.
-
- if (Ordering != AtomicOrdering::NotAtomic)
- return false;
-
- // If this is an allocation of a fixed size array of structs, analyze as a
- // variable size array. malloc [100 x struct],1 -> malloc struct, 100
- if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
- if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
- AllocTy = AT->getElementType();
-
- StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
- if (!AllocSTy)
- return false;
-
- // If the structure has an unreasonable number of fields, leave it
- // alone.
- if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
- AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {
-
- // If this is a fixed size array, transform the Malloc to be an alloc of
- // structs. malloc [100 x struct],1 -> malloc struct, 100
- if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
- Type *IntPtrTy = DL.getIntPtrType(CI->getType());
- unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
- Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
- Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- Instruction *Malloc =
- CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements,
- OpBundles, nullptr, CI->getName());
- Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
- CI->replaceAllUsesWith(Cast);
- CI->eraseFromParent();
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
- CI = cast<CallInst>(BCI->getOperand(0));
- else
- CI = cast<CallInst>(Malloc);
- }
-
- PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL,
- TLI);
- return true;
- }
-
- return false;
-}
-
-// Try to optimize globals based on the knowledge that only one value (besides
-// its initializer) is ever stored to the global.
-static bool
-optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
- AtomicOrdering Ordering, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- // Ignore no-op GEPs and bitcasts.
- StoredOnceVal = StoredOnceVal->stripPointerCasts();
-
- // If we are dealing with a pointer global that is initialized to null and
- // only has one (non-null) value stored into it, then we can optimize any
- // users of the loaded value (often calls and loads) that would trap if the
- // value was null.
- if (GV->getInitializer()->getType()->isPointerTy() &&
- GV->getInitializer()->isNullValue() &&
- !NullPointerIsDefined(
- nullptr /* F */,
- GV->getInitializer()->getType()->getPointerAddressSpace())) {
- if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
- if (GV->getInitializer()->getType() != SOVC->getType())
- SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
-
- // Optimize away any trapping uses of the loaded value.
- if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI))
- return true;
- } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) {
- auto *TLI = &GetTLI(*CI->getFunction());
- Type *MallocType = getMallocAllocatedType(CI, TLI);
- if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
- Ordering, DL, TLI))
- return true;
- }
- }
-
- return false;
-}
-
-/// At this point, we have learned that the only two values ever stored into GV
-/// are its initializer and OtherVal. See if we can shrink the global into a
-/// boolean and select between the two values whenever it is used. This exposes
-/// the values to other scalar optimizations.
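-/// For illustration only (hypothetical IR, simplified): a global such as
-///   @g = internal global i32 10
-/// that only ever has 10 (its initializer) or 20 stored into it can become
-///   @g.b = internal global i1 false
-/// with stores rewritten to store i1 false/true and loads rewritten to
-///   %b = load i1, i1* @g.b
-///   %v = select i1 %b, i32 20, i32 10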
-static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
- Type *GVElType = GV->getValueType();
-
- // If GVElType is already i1, it is already shrunk. If the type of the GV is
- // an FP value, pointer or vector, don't do this optimization because a select
- // between them is very expensive and unlikely to lead to later
- // simplification. In these cases, we typically end up with "cond ? v1 : v2"
- // where v1 and v2 both require constant pool loads, a big loss.
- if (GVElType == Type::getInt1Ty(GV->getContext()) ||
- GVElType->isFloatingPointTy() ||
- GVElType->isPointerTy() || GVElType->isVectorTy())
- return false;
-
- // Walk the use list of the global seeing if all the uses are load or store.
- // If there is anything else, bail out.
- for (User *U : GV->users())
- if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
- return false;
-
- LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
-
- // Create the new global, initializing it to false.
- GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
- false,
- GlobalValue::InternalLinkage,
- ConstantInt::getFalse(GV->getContext()),
- GV->getName()+".b",
- GV->getThreadLocalMode(),
- GV->getType()->getAddressSpace());
- NewGV->copyAttributesFrom(GV);
- GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);
-
- Constant *InitVal = GV->getInitializer();
- assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
- "No reason to shrink to bool!");
-
- SmallVector<DIGlobalVariableExpression *, 1> GVs;
- GV->getDebugInfo(GVs);
-
- // If initialized to zero and storing one into the global, we can use a cast
- // instead of a select to synthesize the desired value.
- bool IsOneZero = false;
- bool EmitOneOrZero = true;
- auto *CI = dyn_cast<ConstantInt>(OtherVal);
- if (CI && CI->getValue().getActiveBits() <= 64) {
- IsOneZero = InitVal->isNullValue() && CI->isOne();
-
- auto *CIInit = dyn_cast<ConstantInt>(GV->getInitializer());
- if (CIInit && CIInit->getValue().getActiveBits() <= 64) {
- uint64_t ValInit = CIInit->getZExtValue();
- uint64_t ValOther = CI->getZExtValue();
- uint64_t ValMinus = ValOther - ValInit;
-
- for(auto *GVe : GVs){
- DIGlobalVariable *DGV = GVe->getVariable();
- DIExpression *E = GVe->getExpression();
- const DataLayout &DL = GV->getParent()->getDataLayout();
- unsigned SizeInOctets =
- DL.getTypeAllocSizeInBits(NewGV->getType()->getElementType()) / 8;
-
- // It is expected that the address of the optimized global variable is
- // on top of the stack. After the optimization, the value of that
- // variable will be either 0 (the initial value) or 1 (the other value).
- // The following expression returns a constant integer value that
- // depends on the value at the global object's address:
- // val * (ValOther - ValInit) + ValInit:
- // DW_OP_deref DW_OP_constu <ValMinus>
- // DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
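- // For illustration only (hypothetical values): with ValInit = 10 and
- // ValOther = 25, ValMinus = 15, so the expression evaluates to
- // 0 * 15 + 10 = 10 when the bool is 0 and to 1 * 15 + 10 = 25 when it
- // is 1, recovering the original values for the debugger.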
- SmallVector<uint64_t, 12> Ops = {
- dwarf::DW_OP_deref_size, SizeInOctets,
- dwarf::DW_OP_constu, ValMinus,
- dwarf::DW_OP_mul, dwarf::DW_OP_constu, ValInit,
- dwarf::DW_OP_plus};
- bool WithStackValue = true;
- E = DIExpression::prependOpcodes(E, Ops, WithStackValue);
- DIGlobalVariableExpression *DGVE =
- DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
- NewGV->addDebugInfo(DGVE);
- }
- EmitOneOrZero = false;
- }
- }
-
- if (EmitOneOrZero) {
- // FIXME: This will only emit the address for the debugger; the value
- // written there will only ever be 0 or 1.
- for(auto *GV : GVs)
- NewGV->addDebugInfo(GV);
- }
-
- while (!GV->use_empty()) {
- Instruction *UI = cast<Instruction>(GV->user_back());
- if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
- // Change the store into a boolean store.
- bool StoringOther = SI->getOperand(0) == OtherVal;
- // Only do this if we weren't storing a loaded value.
- Value *StoreVal;
- if (StoringOther || SI->getOperand(0) == InitVal) {
- StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
- StoringOther);
- } else {
- // Otherwise, we are storing a previously loaded copy. To do this,
- // change the copy from copying the original value to just copying the
- // bool.
- Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
-
- // If we've already replaced the input, StoredVal will be a cast or
- // select instruction. If not, it will be a load of the original
- // global.
- if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
- assert(LI->getOperand(0) == GV && "Not a copy!");
- // Insert a new load, to preserve the saved value.
- StoreVal = new LoadInst(NewGV->getValueType(), NewGV,
- LI->getName() + ".b", false, Align(1),
- LI->getOrdering(), LI->getSyncScopeID(), LI);
- } else {
- assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
- "This is not a form that we understand!");
- StoreVal = StoredVal->getOperand(0);
- assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
- }
- }
- StoreInst *NSI =
- new StoreInst(StoreVal, NewGV, false, Align(1), SI->getOrdering(),
- SI->getSyncScopeID(), SI);
- NSI->setDebugLoc(SI->getDebugLoc());
- } else {
- // Change the load into a load of bool then a select.
- LoadInst *LI = cast<LoadInst>(UI);
- LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV,
- LI->getName() + ".b", false, Align(1),
- LI->getOrdering(), LI->getSyncScopeID(), LI);
- Instruction *NSI;
- if (IsOneZero)
- NSI = new ZExtInst(NLI, LI->getType(), "", LI);
- else
- NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
- NSI->takeName(LI);
- // Since LI is split into two instructions, NLI and NSI both inherit the
- // same DebugLoc
- NLI->setDebugLoc(LI->getDebugLoc());
- NSI->setDebugLoc(LI->getDebugLoc());
- LI->replaceAllUsesWith(NSI);
- }
- UI->eraseFromParent();
- }
-
- // Retain the name of the old global variable. People who are debugging their
- // programs may expect these variables to be named the same.
- NewGV->takeName(GV);
- GV->eraseFromParent();
- return true;
-}
-
-static bool deleteIfDead(
- GlobalValue &GV, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
- GV.removeDeadConstantUsers();
-
- if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
- return false;
-
- if (const Comdat *C = GV.getComdat())
- if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C))
- return false;
-
- bool Dead;
- if (auto *F = dyn_cast<Function>(&GV))
- Dead = (F->isDeclaration() && F->use_empty()) || F->isDefTriviallyDead();
- else
- Dead = GV.use_empty();
- if (!Dead)
- return false;
-
- LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
- GV.eraseFromParent();
- ++NumDeleted;
- return true;
-}
-
-static bool isPointerValueDeadOnEntryToFunction(
- const Function *F, GlobalValue *GV,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- // Find all uses of GV. We expect them all to be in F, and if we can't
- // identify any of the uses we bail out.
- //
- // On each of these uses, identify if the memory that GV points to is
- // used/required/live at the start of the function. If it is not, for example
- // if the first thing the function does is store to the GV, the GV can
- // possibly be demoted.
- //
- // We don't do an exhaustive search for memory operations - simply look
- // through bitcasts as they're quite common and benign.
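- // For illustration only (hypothetical IR, simplified): if the only uses of
- // @g inside F are
- //   store i32 0, i32* @g
- //   %v = load i32, i32* @g   ; dominated by the store above
- // then the value of @g at function entry is never observed, and @g is a
- // candidate for demotion into F.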
- const DataLayout &DL = GV->getParent()->getDataLayout();
- SmallVector<LoadInst *, 4> Loads;
- SmallVector<StoreInst *, 4> Stores;
- for (auto *U : GV->users()) {
- if (Operator::getOpcode(U) == Instruction::BitCast) {
- for (auto *UU : U->users()) {
- if (auto *LI = dyn_cast<LoadInst>(UU))
- Loads.push_back(LI);
- else if (auto *SI = dyn_cast<StoreInst>(UU))
- Stores.push_back(SI);
- else
- return false;
- }
- continue;
- }
-
- Instruction *I = dyn_cast<Instruction>(U);
- if (!I)
- return false;
- assert(I->getParent()->getParent() == F);
-
- if (auto *LI = dyn_cast<LoadInst>(I))
- Loads.push_back(LI);
- else if (auto *SI = dyn_cast<StoreInst>(I))
- Stores.push_back(SI);
- else
- return false;
- }
-
- // We have identified all uses of GV into loads and stores. Now check if all
- // of them are known not to depend on the value of the global at the function
- // entry point. We do this by ensuring that every load is dominated by at
- // least one store.
- auto &DT = LookupDomTree(*const_cast<Function *>(F));
-
- // The below check is quadratic. Check we're not going to do too many tests.
- // FIXME: Even though this will always have worst-case quadratic time, we
- // could put effort into minimizing the average time by putting stores that
- // have been shown to dominate at least one load at the beginning of the
- // Stores array, making subsequent dominance checks more likely to succeed
- // early.
- //
- // The threshold here is fairly large because global->local demotion is a
- // very powerful optimization should it fire.
- const unsigned Threshold = 100;
- if (Loads.size() * Stores.size() > Threshold)
- return false;
-
- for (auto *L : Loads) {
- auto *LTy = L->getType();
- if (none_of(Stores, [&](const StoreInst *S) {
- auto *STy = S->getValueOperand()->getType();
- // The load is only dominated by the store if DomTree says so
- // and the number of bits loaded in L is less than or equal to
- // the number of bits stored in S.
- return DT.dominates(S, L) &&
+ while (!WorkList.empty()) {
+ Value *UV = WorkList.pop_back_val();
+ if (!UV)
+ continue;
+
+ User *U = cast<User>(UV);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (Init) {
+ // Replace the load with the initializer.
+ LI->replaceAllUsesWith(Init);
+ LI->eraseFromParent();
+ Changed = true;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // Store must be unreachable or storing Init into the global.
+ SI->eraseFromParent();
+ Changed = true;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ Constant *SubInit = nullptr;
+ if (Init)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+ Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI);
+ } else if ((CE->getOpcode() == Instruction::BitCast &&
+ CE->getType()->isPointerTy()) ||
+ CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Pointer cast, delete any stores and memsets to the global.
+ Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI);
+ }
+
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ // Do not transform "gepinst (gep constexpr (GV))" here, because forming
+ // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
+ // and will invalidate our notion of what Init is.
+ Constant *SubInit = nullptr;
+ if (!isa<ConstantExpr>(GEP->getOperand(0))) {
+ ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
+ ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction())));
+ if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+
+ // If the initializer is an all-null value and we have an inbounds GEP,
+ // we already know what the result of any load from that GEP is.
+ // TODO: Handle splats.
+ if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
+ SubInit = Constant::getNullValue(GEP->getResultElementType());
+ }
+ Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI);
+
+ if (GEP->use_empty()) {
+ GEP->eraseFromParent();
+ Changed = true;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
+ if (MI->getRawDest() == V) {
+ MI->eraseFromParent();
+ Changed = true;
+ }
+
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ // If we have a chain of dead constantexprs or other things dangling from
+ // us, and if they are all dead, nuke them without remorse.
+ if (isSafeToDestroyConstant(C)) {
+ C->destroyConstant();
+ CleanupConstantGlobalUsers(V, Init, DL, GetTLI);
+ return true;
+ }
+ }
+ }
+ return Changed;
+}
+
+static bool isSafeSROAElementUse(Value *V);
+
+/// Return true if the specified GEP is a safe user of a derived
+/// expression from a global that we want to SROA.
+static bool isSafeSROAGEP(User *U) {
+ // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
+ // don't like < 3 operand CE's, and we don't like non-constant integer
+ // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
+ // value of C.
+ if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
+ !cast<Constant>(U->getOperand(1))->isNullValue())
+ return false;
+
+ gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
+ ++GEPI; // Skip over the pointer index.
+
+ // For all other levels we require that the indices are constant and in range.
+ // In particular, consider: A[0][i]. We cannot know that the user isn't doing
+ // invalid things like allowing i to index an out-of-range subscript that
+ // accesses A[1]. This can also happen between different members of a struct
+ // in llvm IR.
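+ // For illustration only (hypothetical IR, simplified):
+ //   getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 2   ; accepted
+ //   getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 %i  ; rejected
+ // The second form is rejected because %i is not a constant in-range index.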
+ for (; GEPI != E; ++GEPI) {
+ if (GEPI.isStruct())
+ continue;
+
+ ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
+ if (!IdxVal || (GEPI.isBoundedSequential() &&
+ IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
+ return false;
+ }
+
+ return llvm::all_of(U->users(),
+ [](User *UU) { return isSafeSROAElementUse(UU); });
+}
+
+/// Return true if the specified instruction is a safe user of a derived
+/// expression from a global that we want to SROA.
+static bool isSafeSROAElementUse(Value *V) {
+ // We might have a dead and dangling constant hanging off of here.
+ if (Constant *C = dyn_cast<Constant>(V))
+ return isSafeToDestroyConstant(C);
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Loads are ok.
+ if (isa<LoadInst>(I)) return true;
+
+ // Stores *to* the pointer are ok.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getOperand(0) != V;
+
+ // Otherwise, it must be a GEP. Check it and its users are safe to SRA.
+ return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I);
+}
+
+/// Look at all uses of the global and decide whether it is safe for us to
+/// perform this transformation.
+static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
+ for (User *U : GV->users()) {
+ // The user of the global must be a GEP Inst or a ConstantExpr GEP.
+ if (!isa<GetElementPtrInst>(U) &&
+ (!isa<ConstantExpr>(U) ||
+ cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
+ return false;
+
+ // Check that the GEP and its users are safe to SRA.
+ if (!isSafeSROAGEP(U))
+ return false;
+ }
+
+ return true;
+}
+
+static bool IsSRASequential(Type *T) {
+ return isa<ArrayType>(T) || isa<VectorType>(T);
+}
+static uint64_t GetSRASequentialNumElements(Type *T) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return AT->getNumElements();
+ return cast<FixedVectorType>(T)->getNumElements();
+}
+static Type *GetSRASequentialElementType(Type *T) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return AT->getElementType();
+ return cast<VectorType>(T)->getElementType();
+}
+static bool CanDoGlobalSRA(GlobalVariable *GV) {
+ Constant *Init = GV->getInitializer();
+
+ if (isa<StructType>(Init->getType())) {
+ // nothing to check
+ } else if (IsSRASequential(Init->getType())) {
+ if (GetSRASequentialNumElements(Init->getType()) > 16 &&
+ GV->hasNUsesOrMore(16))
+ return false; // It's not worth it.
+ } else
+ return false;
+
+ return GlobalUsersSafeToSRA(GV);
+}
+
+/// Copy over the debug info for a variable to its SRA replacements.
+static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
+ uint64_t FragmentOffsetInBits,
+ uint64_t FragmentSizeInBits,
+ uint64_t VarSize) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ GV->getDebugInfo(GVs);
+ for (auto *GVE : GVs) {
+ DIVariable *Var = GVE->getVariable();
+ DIExpression *Expr = GVE->getExpression();
+ // If the FragmentSize is smaller than the variable,
+ // emit a fragment expression.
+ if (FragmentSizeInBits < VarSize) {
+ if (auto E = DIExpression::createFragmentExpression(
+ Expr, FragmentOffsetInBits, FragmentSizeInBits))
+ Expr = *E;
+ else
+ return;
+ }
+ auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
+ NGV->addDebugInfo(NGVE);
+ }
+}
+
+/// Perform scalar replacement of aggregates on the specified global variable.
+/// This opens the door for other optimizations by exposing the behavior of the
+/// program in a more fine-grained way. We have determined that this
+/// transformation is safe already. We return the first global variable we
+/// insert so that the caller can reprocess it.
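+/// For illustration only (hypothetical IR, simplified):
+///   @g = internal global { i32, double } zeroinitializer
+/// whose only uses are GEPs of the form "gep @g, 0, C" can be split into
+///   @g.0 = internal global i32 0
+///   @g.1 = internal global double 0.0
+/// with each GEP rewritten to reference the matching new global.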
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
+ // Make sure this global only has simple uses that we can SRA.
+ if (!CanDoGlobalSRA(GV))
+ return nullptr;
+
+ assert(GV->hasLocalLinkage());
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ uint64_t VarSize = DL.getTypeSizeInBits(Ty);
+
+ std::map<unsigned, GlobalVariable *> NewGlobals;
+
+ // Get the alignment of the global, either explicit or target-specific.
+ Align StartAlignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());
+
+ // Loop over all users and create replacement variables for used aggregate
+ // elements.
+ for (User *GEP : GV->users()) {
+ assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() ==
+ Instruction::GetElementPtr) ||
+ isa<GetElementPtrInst>(GEP)) &&
+ "NonGEP CE's are not SRAable!");
+
+ // Ignore the first index operand (operand 1), which has to be zero or else
+ // the program is quite broken (undefined). Get the 2nd operand, which is the
+ // structure or array index.
+ unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (NewGlobals.count(ElementIdx) == 1)
+ continue; // We've already created a replacement variable.
+ assert(NewGlobals.count(ElementIdx) == 0);
+
+ Type *ElTy = nullptr;
+ if (StructType *STy = dyn_cast<StructType>(Ty))
+ ElTy = STy->getElementType(ElementIdx);
+ else
+ ElTy = GetSRASequentialElementType(Ty);
+ assert(ElTy);
+
+ Constant *In = Init->getAggregateElement(ElementIdx);
+ assert(In && "Couldn't get element of initializer?");
+
+ GlobalVariable *NGV = new GlobalVariable(
+ ElTy, false, GlobalVariable::InternalLinkage, In,
+ GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
+ NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ NGV->copyAttributesFrom(GV);
+ NewGlobals.insert(std::make_pair(ElementIdx, NGV));
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout &Layout = *DL.getStructLayout(STy);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
+ Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
+ if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
+ NGV->setAlignment(NewAlign);
+
+ // Copy over the debug info for the variable.
+ uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
+ uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
+ transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize);
+ } else {
+ uint64_t EltSize = DL.getTypeAllocSize(ElTy);
+ Align EltAlign = DL.getABITypeAlign(ElTy);
+ uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
+ if (NewAlign > EltAlign)
+ NGV->setAlignment(NewAlign);
+ transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
+ FragmentSizeInBits, VarSize);
+ }
+ }
+
+ if (NewGlobals.empty())
+ return nullptr;
+
+ Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
+ for (auto NewGlobalVar : NewGlobals)
+ Globals.push_back(NewGlobalVar.second);
+
+ LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
+
+ Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
+
+ // Loop over all of the uses of the global, replacing the constantexpr geps,
+ // with smaller constantexpr geps or direct references.
+ while (!GV->use_empty()) {
+ User *GEP = GV->user_back();
+ assert(((isa<ConstantExpr>(GEP) &&
+ cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
+ isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
+
+ // Ignore the first index operand (operand 1), which has to be zero or else
+ // the program is quite broken (undefined). Get the 2nd operand, which is the
+ // structure or array index.
+ unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ assert(NewGlobals.count(ElementIdx) == 1);
+
+ Value *NewPtr = NewGlobals[ElementIdx];
+ Type *NewTy = NewGlobals[ElementIdx]->getValueType();
+
+ // Form a shorter GEP if needed.
+ if (GEP->getNumOperands() > 3) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
+ Idxs.push_back(CE->getOperand(i));
+ NewPtr =
+ ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
+ } else {
+ GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
+ SmallVector<Value*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
+ Idxs.push_back(GEPI->getOperand(i));
+ NewPtr = GetElementPtrInst::Create(
+ NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx),
+ GEPI);
+ }
+ }
+ GEP->replaceAllUsesWith(NewPtr);
+
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
+ GEPI->eraseFromParent();
+ else
+ cast<ConstantExpr>(GEP)->destroyConstant();
+ }
+
+ // Delete the old global, now that it is dead.
+ Globals.erase(GV);
+ ++NumSRA;
+
+ assert(NewGlobals.size() > 0);
+ return NewGlobals.begin()->second;
+}
+
+/// Return true if all users of the specified value will trap if the value is
+/// dynamically null. The PHIs set keeps track of any phi nodes we've seen, to
+/// avoid reprocessing them.
+static bool AllUsesOfValueWillTrapIfNull(const Value *V,
+ SmallPtrSetImpl<const PHINode*> &PHIs) {
+ for (const User *U : V->users()) {
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ // If null pointer is considered valid, then all uses are non-trapping.
+ // Non address-space 0 globals have already been pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
+ }
+ if (isa<LoadInst>(U)) {
+ // Will trap.
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Storing the value.
+ }
+ } else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
+ if (CI->getCalledOperand() != V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Not calling the ptr
+ }
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) {
+ if (II->getCalledOperand() != V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Not calling the ptr
+ }
+ } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) {
+ if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
+ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
+ // If we've already seen this phi node, ignore it, it has already been
+ // checked.
+ if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
+ return false;
+ } else {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Return true if all uses of any loads from GV will trap if the loaded value
+/// is null. Note that this also permits comparisons of the loaded value
+/// against null, as a special case.
+static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
+ for (const User *U : GV->users())
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ SmallPtrSet<const PHINode*, 8> PHIs;
+ if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+ return false;
+ } else if (isa<StoreInst>(U)) {
+ // Ignore stores to the global.
+ } else {
+ // We don't know or understand this user, bail out.
+ //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
+ return false;
+ }
+ return true;
+}
+
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
+ bool Changed = false;
+ for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
+ Instruction *I = cast<Instruction>(*UI++);
+ // Uses are non-trapping if null pointer is considered valid.
+ // Non address-space 0 globals are already pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LI->setOperand(0, NewV);
+ Changed = true;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) == V) {
+ SI->setOperand(1, NewV);
+ Changed = true;
+ }
+ } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ CallBase *CB = cast<CallBase>(I);
+ if (CB->getCalledOperand() == V) {
+ // Calling through the pointer! Turn into a direct call, but be careful
+ // that the pointer is not also being passed as an argument.
+ CB->setCalledOperand(NewV);
+ Changed = true;
+ bool PassedAsArg = false;
+ for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+ if (CB->getArgOperand(i) == V) {
+ PassedAsArg = true;
+ CB->setArgOperand(i, NewV);
+ }
+
+ if (PassedAsArg) {
+ // Being passed as an argument also. Be careful to not invalidate UI!
+ UI = V->user_begin();
+ }
+ }
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(CI,
+ ConstantExpr::getCast(CI->getOpcode(),
+ NewV, CI->getType()));
+ if (CI->use_empty()) {
+ Changed = true;
+ CI->eraseFromParent();
+ }
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ // Should handle GEP here.
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.reserve(GEPI->getNumOperands()-1);
+ for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+ i != e; ++i)
+ if (Constant *C = dyn_cast<Constant>(*i))
+ Idxs.push_back(C);
+ else
+ break;
+ if (Idxs.size() == GEPI->getNumOperands()-1)
+ Changed |= OptimizeAwayTrappingUsesOfValue(
+ GEPI, ConstantExpr::getGetElementPtr(GEPI->getSourceElementType(),
+ NewV, Idxs));
+ if (GEPI->use_empty()) {
+ Changed = true;
+ GEPI->eraseFromParent();
+ }
+ }
+ }
+
+ return Changed;
+}
+
+/// The specified global has only one non-null value stored into it. If there
+/// are uses of the loaded value that would trap if the loaded value is
+/// dynamically null, then we know that they cannot be reached with a null
+/// value, so we can optimize away the load.
+static bool OptimizeAwayTrappingUsesOfLoads(
+ GlobalVariable *GV, Constant *LV, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ bool Changed = false;
+
+ // Keep track of whether we are able to remove all the uses of the global
+ // other than the store that defines it.
+ bool AllNonStoreUsesGone = true;
+
+ // Replace all uses of loads with uses of uses of the stored value.
+ for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){
+ User *GlobalUser = *GUI++;
+ if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
+ // If we were able to delete all uses of the loads
+ if (LI->use_empty()) {
+ LI->eraseFromParent();
+ Changed = true;
+ } else {
+ AllNonStoreUsesGone = false;
+ }
+ } else if (isa<StoreInst>(GlobalUser)) {
+ // Ignore the store that stores "LV" to the global.
+ assert(GlobalUser->getOperand(1) == GV &&
+ "Must be storing *to* the global");
+ } else {
+ AllNonStoreUsesGone = false;
+
+ // If we get here we could have other crazy uses that are transitively
+ // loaded.
+ assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
+ isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
+ isa<BitCastInst>(GlobalUser) ||
+ isa<GetElementPtrInst>(GlobalUser)) &&
+ "Only expect load and stores!");
+ }
+ }
+
+ if (Changed) {
+ LLVM_DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
+ << "\n");
+ ++NumGlobUses;
+ }
+
+ // If we nuked all of the loads, then none of the stores are needed either,
+ // nor is the global.
+ if (AllNonStoreUsesGone) {
+ if (isLeakCheckerRoot(GV)) {
+ Changed |= CleanupPointerRootUsers(GV, GetTLI);
+ } else {
+ Changed = true;
+ CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI);
+ }
+ if (GV->use_empty()) {
+ LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
+ Changed = true;
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+ }
+ return Changed;
+}
+
+/// Walk the use list of V, constant folding all of the instructions that are
+/// foldable.
+static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
+ if (Instruction *I = dyn_cast<Instruction>(*UI++))
+ if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) {
+ I->replaceAllUsesWith(NewC);
+
+ // Advance UI to the next non-I use to avoid invalidating it!
+ // Instructions could multiply use V.
+ while (UI != E && *UI == I)
+ ++UI;
+ if (isInstructionTriviallyDead(I, TLI))
+ I->eraseFromParent();
+ }
+}
+
+/// This function takes the specified global variable, and transforms the
+/// program as if it always contained the result of the specified malloc.
+/// Because it is always the result of the specified malloc, there is no reason
+/// to actually DO the malloc. Instead, turn the malloc into a global, and
+/// rewrite any loads of GV as uses of the new global.
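+/// For illustration only (hypothetical IR, simplified): for
+///   @g = internal global i32* null
+/// whose single stored value is the result of "malloc(4)", the allocation is
+/// replaced by
+///   @g.body = internal global i32 undef
+/// and, if @g is ever compared against null, an extra
+///   @g.init = internal global i1 false
+/// records whether the store (the "initialization") has happened yet.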
+static GlobalVariable *
+OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
+ ConstantInt *NElements, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
+ << '\n');
+
+ Type *GlobalType;
+ if (NElements->getZExtValue() == 1)
+ GlobalType = AllocTy;
+ else
+ // If we have an array allocation, the global variable has array type.
+ GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+
+ // Create the new global variable. The contents of the malloc'd memory is
+ // undefined, so initialize with an undef value.
+ GlobalVariable *NewGV = new GlobalVariable(
+ *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
+ UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
+ GV->getThreadLocalMode());
+
+ // If there are bitcast users of the malloc (which is typical, usually we have
+ // a malloc + bitcast) then replace them with uses of the new global. Update
+ // other users to use the global as well.
+ BitCastInst *TheBC = nullptr;
+ while (!CI->use_empty()) {
+ Instruction *User = cast<Instruction>(CI->user_back());
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+ if (BCI->getType() == NewGV->getType()) {
+ BCI->replaceAllUsesWith(NewGV);
+ BCI->eraseFromParent();
+ } else {
+ BCI->setOperand(0, NewGV);
+ }
+ } else {
+ if (!TheBC)
+ TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
+ User->replaceUsesOfWith(CI, TheBC);
+ }
+ }
+
+ Constant *RepValue = NewGV;
+ if (NewGV->getType() != GV->getValueType())
+ RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());
+
+ // If there is a comparison against null, we will insert a global bool to
+ // keep track of whether the global was initialized yet or not.
+ GlobalVariable *InitBool =
+ new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(GV->getContext()),
+ GV->getName()+".init", GV->getThreadLocalMode());
+ bool InitBoolUsed = false;
+
+ // Loop over all uses of GV, processing them in turn.
+ while (!GV->use_empty()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
+ // The global is initialized when the store to it occurs.
+ new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false,
+ Align(1), SI->getOrdering(), SI->getSyncScopeID(), SI);
+ SI->eraseFromParent();
+ continue;
+ }
+
+ LoadInst *LI = cast<LoadInst>(GV->user_back());
+ while (!LI->use_empty()) {
+ Use &LoadUse = *LI->use_begin();
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser());
+ if (!ICI) {
+ LoadUse = RepValue;
+ continue;
+ }
+
+ // Replace the cmp X, 0 with a use of the bool value.
+ // Sink the load to where the compare was, if atomic rules allow us to.
+ Value *LV = new LoadInst(InitBool->getValueType(), InitBool,
+ InitBool->getName() + ".val", false, Align(1),
+ LI->getOrdering(), LI->getSyncScopeID(),
+ LI->isUnordered() ? (Instruction *)ICI : LI);
+ InitBoolUsed = true;
+ switch (ICI->getPredicate()) {
+ default: llvm_unreachable("Unknown ICmp Predicate!");
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: // X < null -> always false
+ LV = ConstantInt::getFalse(GV->getContext());
+ break;
+ case ICmpInst::ICMP_ULE:
+ case ICmpInst::ICMP_SLE:
+ case ICmpInst::ICMP_EQ:
+ LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
+ break;
+ case ICmpInst::ICMP_NE:
+ case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_SGE:
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ break; // no change.
+ }
+ ICI->replaceAllUsesWith(LV);
+ ICI->eraseFromParent();
+ }
+ LI->eraseFromParent();
+ }
+
+ // If the initialization boolean was used, insert it, otherwise delete it.
+ if (!InitBoolUsed) {
+ while (!InitBool->use_empty()) // Delete initializations
+ cast<StoreInst>(InitBool->user_back())->eraseFromParent();
+ delete InitBool;
+ } else
+ GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
+
+ // Now that the GV is dead, nuke it and the malloc.
+ GV->eraseFromParent();
+ CI->eraseFromParent();
+
+ // To further other optimizations, loop over all users of NewGV and try to
+ // constant prop them. This will promote GEP instructions with constant
+ // indices into GEP constant-exprs, which will allow global-opt to hack on it.
+ ConstantPropUsersOf(NewGV, DL, TLI);
+ if (RepValue != NewGV)
+ ConstantPropUsersOf(RepValue, DL, TLI);
+
+ return NewGV;
+}
+
+/// Scan the use-list of V checking to make sure that there are no complex uses
+/// of V. We permit simple things like dereferencing the pointer, but not
+/// storing through the address, unless it is to the specified global.
+static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
+ const GlobalVariable *GV,
+ SmallPtrSetImpl<const PHINode*> &PHIs) {
+ for (const User *U : V->users()) {
+ const Instruction *Inst = cast<Instruction>(U);
+
+ if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
+ continue; // Fine, ignore.
+ }
+
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
+ return false; // Storing the pointer itself... bad.
+ continue; // Otherwise, storing through it, or storing into GV... fine.
+ }
+
+ // Must index into the array and into the struct.
+ if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
+ // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI
+ // cycles.
+ if (PHIs.insert(PN).second)
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ return false;
+ }
+ return true;
+}
+
+/// The Alloc pointer is stored into GV somewhere. Transform all uses of the
+/// allocation into loads from the global and uses of the resultant pointer.
+/// Further, delete the store into GV. This assumes that these values pass the
+/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
+static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
+ GlobalVariable *GV) {
+ while (!Alloc->use_empty()) {
+ Instruction *U = cast<Instruction>(*Alloc->user_begin());
+ Instruction *InsertPt = U;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // If this is the store of the allocation into the global, remove it.
+ if (SI->getOperand(1) == GV) {
+ SI->eraseFromParent();
+ continue;
+ }
+ } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // Insert the load in the corresponding predecessor, not right before the
+ // PHI.
+ InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator();
+ } else if (isa<BitCastInst>(U)) {
+ // Must be bitcast between the malloc and store to initialize the global.
+ ReplaceUsesOfMallocWithGlobal(U, GV);
+ U->eraseFromParent();
+ continue;
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ // If this is a "GEP bitcast" and the user is a store to the global, then
+ // just process it as a bitcast.
+ if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
+ if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->user_back()))
+ if (SI->getOperand(1) == GV) {
+ // Must be bitcast GEP between the malloc and store to initialize
+ // the global.
+ ReplaceUsesOfMallocWithGlobal(GEPI, GV);
+ GEPI->eraseFromParent();
+ continue;
+ }
+ }
+
+ // Insert a load from the global, and use it instead of the malloc.
+ Value *NL =
+ new LoadInst(GV->getValueType(), GV, GV->getName() + ".val", InsertPt);
+ U->replaceUsesOfWith(Alloc, NL);
+ }
+}
+
+/// Verify that all uses of V (a load, or a phi of a load) are simple enough to
+/// perform heap SRA on. This permits GEP's that index through the array and
+/// struct field, icmps of null, and PHIs.
+static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
+ SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
+ SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
+ // We permit two users of the load: setcc comparing against the null
+ // pointer, and a getelementptr of a specific form.
+ for (const User *U : V->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+
+ // Comparison against null is ok.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UI)) {
+ if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+ return false;
+ continue;
+ }
+
+ // getelementptr is also ok, but only a simple form.
+ if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
+ // Must index into the array and into the struct.
+ if (GEPI->getNumOperands() < 3)
+ return false;
+
+ // Otherwise the GEP is ok.
+ continue;
+ }
+
+ if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ if (!LoadUsingPHIsPerLoad.insert(PN).second)
+ // This means some phi nodes are dependent on each other.
+ // Avoid infinite looping!
+ return false;
+ if (!LoadUsingPHIs.insert(PN).second)
+ // If we have already analyzed this PHI, then it is safe.
+ continue;
+
+ // Make sure all uses of the PHI are simple enough to transform.
+ if (!LoadUsesSimpleEnoughForHeapSRA(PN,
+ LoadUsingPHIs, LoadUsingPHIsPerLoad))
+ return false;
+
+ continue;
+ }
+
+ // Otherwise we don't know what this is, not ok.
+ return false;
+ }
+
+ return true;
+}
+
+/// If all users of values loaded from GV are simple enough to perform HeapSRA,
+/// return true.
+static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
+ Instruction *StoredVal) {
+ SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
+ SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;
+ for (const User *U : GV->users())
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
+ LoadUsingPHIsPerLoad))
+ return false;
+ LoadUsingPHIsPerLoad.clear();
+ }
+
+ // If we reach here, we know that all uses of the loads and transitive uses
+ // (through PHI nodes) are simple enough to transform. However, we don't know
+ // that all inputs to the PHI nodes are in the same equivalence sets.
+ // Check to verify that all operands of the PHIs are either PHIs that can be
+ // transformed, loads from GV, or the stored value itself.
+ for (const PHINode *PN : LoadUsingPHIs) {
+ for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
+ Value *InVal = PN->getIncomingValue(op);
+
+ // PHI of the stored value itself is ok.
+ if (InVal == StoredVal) continue;
+
+ if (const PHINode *InPN = dyn_cast<PHINode>(InVal)) {
+ // One of the PHIs in our set is (optimistically) ok.
+ if (LoadUsingPHIs.count(InPN))
+ continue;
+ return false;
+ }
+
+ // Load from GV is ok.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(InVal))
+ if (LI->getOperand(0) == GV)
+ continue;
+
+ // UNDEF? NULL?
+
+ // Anything else is rejected.
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
+ DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
+ std::vector<Value *> &FieldVals = InsertedScalarizedValues[V];
+
+ if (FieldNo >= FieldVals.size())
+ FieldVals.resize(FieldNo+1);
+
+ // If we already have this value, just reuse the previously scalarized
+ // version.
+ if (Value *FieldVal = FieldVals[FieldNo])
+ return FieldVal;
+
+ // Depending on what instruction this is, we have several cases.
+ Value *Result;
+ if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+ // This is a scalarized version of the load from the global. Just create
+ // a new Load of the scalarized global.
+ Value *V = GetHeapSROAValue(LI->getOperand(0), FieldNo,
+ InsertedScalarizedValues, PHIsToRewrite);
+ Result = new LoadInst(V->getType()->getPointerElementType(), V,
+ LI->getName() + ".f" + Twine(FieldNo), LI);
+ } else {
+ PHINode *PN = cast<PHINode>(V);
+ // PN's type is pointer to struct. Make a new PHI of pointer to struct
+ // field.
+
+ PointerType *PTy = cast<PointerType>(PN->getType());
+ StructType *ST = cast<StructType>(PTy->getElementType());
+
+ unsigned AS = PTy->getAddressSpace();
+ PHINode *NewPN =
+ PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS),
+ PN->getNumIncomingValues(),
+ PN->getName()+".f"+Twine(FieldNo), PN);
+ Result = NewPN;
+ PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
+ }
+
+ return FieldVals[FieldNo] = Result;
+}
+
+/// Given a load instruction and a value derived from the load, rewrite the
+/// derived value to use the HeapSRoA'd load.
+static void RewriteHeapSROALoadUser(Instruction *LoadUser,
+ DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
+ // If this is a comparison against null, handle it.
+ if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
+ assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
+ // If we have a setcc of the loaded pointer, we can use a setcc of any
+ // field.
+ Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
+ Constant::getNullValue(NPtr->getType()),
+ SCI->getName());
+ SCI->replaceAllUsesWith(New);
+ SCI->eraseFromParent();
+ return;
+ }
+
+ // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
+ assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
+ && "Unexpected GEPI!");
+
+ // Load the pointer for this field.
+ unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+ Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ // Create the new GEP idx vector.
+ SmallVector<Value*, 8> GEPIdx;
+ GEPIdx.push_back(GEPI->getOperand(1));
+ GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
+
+ Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx,
+ GEPI->getName(), GEPI);
+ GEPI->replaceAllUsesWith(NGEPI);
+ GEPI->eraseFromParent();
+ return;
+ }
+
+ // Recursively transform the users of PHI nodes. This will lazily create the
+ // PHIs that are needed for individual elements. Keep track of what PHIs we
+ // see in InsertedScalarizedValues so that we don't get infinite loops (very
+ // antisocial). If the PHI is already in InsertedScalarizedValues, it has
+ // already been seen first by another load, so its uses have already been
+ // processed.
+ PHINode *PN = cast<PHINode>(LoadUser);
+ if (!InsertedScalarizedValues.insert(std::make_pair(PN,
+ std::vector<Value *>())).second)
+ return;
+
+ // If this is the first time we've seen this PHI, recursively process all
+ // users.
+ for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+}
+
+/// We are performing Heap SRoA on a global. Ptr is a value loaded from the
+/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead.
+/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
+ DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode *, unsigned> > &PHIsToRewrite) {
+ for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+
+ if (Load->use_empty()) {
+ Load->eraseFromParent();
+ InsertedScalarizedValues.erase(Load);
+ }
+}
+
+/// CI is an allocation of an array of structures. Break it up into multiple
+/// allocations of arrays of the fields.
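+/// For illustration only (hypothetical IR, simplified): a malloc of
+///   [N x { i32, double }]
+/// stored into @g becomes two mallocs, one of N i32s and one of N doubles,
+/// whose results are stored into new pointer globals @g.f0 and @g.f1.
+/// Every "gep Ptr, I, 0" / "gep Ptr, I, 1" on the loaded pointer is then
+/// rewritten against the corresponding field allocation.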
+static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
+ Value *NElems, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ LLVM_DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI
+ << '\n');
+ Type *MAT = getMallocAllocatedType(CI, TLI);
+ StructType *STy = cast<StructType>(MAT);
+
+ // There is guaranteed to be at least one use of the malloc (storing
+ // it into GV). If there are other uses, change them to be uses of
+ // the global to simplify later code. This also deletes the store
+ // into GV.
+ ReplaceUsesOfMallocWithGlobal(CI, GV);
+
+ // Okay, at this point, there are no users of the malloc. Insert N
+ // new mallocs at the same place as CI, and N globals.
+ std::vector<Value *> FieldGlobals;
+ std::vector<Value *> FieldMallocs;
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ unsigned AS = GV->getType()->getPointerAddressSpace();
+ for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
+ Type *FieldTy = STy->getElementType(FieldNo);
+ PointerType *PFieldTy = PointerType::get(FieldTy, AS);
+
+ GlobalVariable *NGV = new GlobalVariable(
+ *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
+ nullptr, GV->getThreadLocalMode());
+ NGV->copyAttributesFrom(GV);
+ FieldGlobals.push_back(NGV);
+
+ unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
+ if (StructType *ST = dyn_cast<StructType>(FieldTy))
+ TypeSize = DL.getStructLayout(ST)->getSizeInBytes();
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
+ Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
+ ConstantInt::get(IntPtrTy, TypeSize),
+ NElems, OpBundles, nullptr,
+ CI->getName() + ".f" + Twine(FieldNo));
+ FieldMallocs.push_back(NMI);
+ new StoreInst(NMI, NGV, CI);
+ }
+
+ // The tricky aspect of this transformation is handling the case when malloc
+ // fails. In the original code, malloc failing would set the result pointer
+ // of malloc to null. In this case, some mallocs could succeed and others
+ // could fail. As such, we emit code that looks like this:
+ // F0 = malloc(field0)
+ // F1 = malloc(field1)
+ // F2 = malloc(field2)
+ // if (F0 == 0 || F1 == 0 || F2 == 0) {
+ // if (F0) { free(F0); F0 = 0; }
+ // if (F1) { free(F1); F1 = 0; }
+ // if (F2) { free(F2); F2 = 0; }
+ // }
+ // The malloc can also fail if its argument is too large.
+ Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
+ Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
+ ConstantZero, "isneg");
+ for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
+ Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
+ Constant::getNullValue(FieldMallocs[i]->getType()),
+ "isnull");
+ RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
+ }
+
+ // Split the basic block at the old malloc.
+ BasicBlock *OrigBB = CI->getParent();
+ BasicBlock *ContBB =
+ OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");
+
+ // Create the block to check the first condition. Put all these blocks at the
+ // end of the function as they are unlikely to be executed.
+ BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
+ "malloc_ret_null",
+ OrigBB->getParent());
+
+ // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
+ // branch on RunningOr.
+ OrigBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
+
+ // Within the NullPtrBlock, we need to emit a comparison and branch for each
+ // pointer, because some may be null while others are not.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ Value *GVVal =
+ new LoadInst(cast<GlobalVariable>(FieldGlobals[i])->getValueType(),
+ FieldGlobals[i], "tmp", NullPtrBlock);
+ Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
+ Constant::getNullValue(GVVal->getType()));
+ BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
+ OrigBB->getParent());
+ BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
+ OrigBB->getParent());
+ Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
+ Cmp, NullPtrBlock);
+
+ // Fill in FreeBlock.
+ CallInst::CreateFree(GVVal, OpBundles, BI);
+ new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
+ FreeBlock);
+ BranchInst::Create(NextBlock, FreeBlock);
+
+ NullPtrBlock = NextBlock;
+ }
+
+ BranchInst::Create(ContBB, NullPtrBlock);
+
+ // CI is no longer needed, remove it.
+ CI->eraseFromParent();
+
+ /// As we process loads, if we can't immediately update all uses of the load,
+ /// keep track of what scalarized loads are inserted for a given load.
+ DenseMap<Value *, std::vector<Value *>> InsertedScalarizedValues;
+ InsertedScalarizedValues[GV] = FieldGlobals;
+
+ std::vector<std::pair<PHINode *, unsigned>> PHIsToRewrite;
+
+ // Okay, the malloc site is completely handled. All of the uses of GV are now
+ // loads, and all uses of those loads are simple. Rewrite them to use loads
+ // of the per-field globals instead.
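+ // For example (roughly), code that loaded the struct pointer from GV and
+ // then GEP'd to field 1 is redirected to load the field-1 pointer from the
+ // new per-field global GV.f1 instead.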
+ for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
+ continue;
+ }
+
+ // Must be a store of null.
+ StoreInst *SI = cast<StoreInst>(User);
+ assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
+ "Unexpected heap-sra user!");
+
+ // Insert a store of null into each global.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ Type *ValTy = cast<GlobalValue>(FieldGlobals[i])->getValueType();
+ Constant *Null = Constant::getNullValue(ValTy);
+ new StoreInst(Null, FieldGlobals[i], SI);
+ }
+ // Erase the original store.
+ SI->eraseFromParent();
+ }
+
+ // While we have PHIs that are interesting to rewrite, do it.
+ while (!PHIsToRewrite.empty()) {
+ PHINode *PN = PHIsToRewrite.back().first;
+ unsigned FieldNo = PHIsToRewrite.back().second;
+ PHIsToRewrite.pop_back();
+ PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
+ assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
+
+ // Add all the incoming values. This can materialize more phis.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
+ PHIsToRewrite);
+ FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
+ }
+ }
+
+ // Drop all inter-phi links and any loads that made it this far.
+ for (DenseMap<Value *, std::vector<Value *>>::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->dropAllReferences();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->dropAllReferences();
+ }
+
+ // Delete all the phis and loads now that inter-references are dead.
+ for (DenseMap<Value *, std::vector<Value *>>::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->eraseFromParent();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->eraseFromParent();
+ }
+
+ // The old global is now dead, remove it.
+ GV->eraseFromParent();
+
+ ++NumHeapSRA;
+ return cast<GlobalVariable>(FieldGlobals[0]);
+}
+
+/// This function is called when we see a pointer global variable with a single
+/// value stored into it that is a malloc or a cast of malloc.
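+/// A rough sketch of the pattern handled here (hypothetical IR):
+///   @g = internal global %T* null
+///   %mem = call i8* @malloc(i64 %size)
+///   %p = bitcast i8* %mem to %T*
+///   store %T* %p, %T** @g
+/// If every use of the value loaded from @g would trap when the pointer is
+/// null, the allocation can be folded into a global (or split per field).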
+static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
+ Type *AllocTy,
+ AtomicOrdering Ordering,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ // If this is a malloc of an abstract type, don't touch it.
+ if (!AllocTy->isSized())
+ return false;
+
+ // We can't optimize this global unless all uses of it are *known* to be
+ // of the malloc value, not of the null initializer value (consider a use
+ // that compares the global's value against zero to see if the malloc has
+ // been reached). To do this, we check to see if all uses of the global
+ // would trap if the global were null: this proves that they must all
+ // happen after the malloc.
+ if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
+ return false;
+
+ // We can't optimize this if the malloc itself is used in a complex way,
+ // for example, being stored into multiple globals. This allows the
+ // malloc to be stored into the specified global, loaded, icmp'd, and
+ // GEP'd. These are all things we could transform to use the global
+ // for.
+ SmallPtrSet<const PHINode*, 8> PHIs;
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
+ return false;
+
+ // If we have a global that is only initialized with a fixed size malloc,
+ // transform the program to use global memory instead of malloc'd memory.
+ // This eliminates dynamic allocation, avoids an indirection accessing the
+ // data, and exposes the resultant global to further GlobalOpt.
+ // We cannot optimize the malloc if we cannot determine malloc array size.
+ Value *NElems = getMallocArraySize(CI, DL, TLI, true);
+ if (!NElems)
+ return false;
+
+ if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
+ // Restrict this transformation to only working on small allocations
+ // (2048 bytes currently), as we don't want to introduce a 16M global or
+ // something.
+ if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
+ OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
+ return true;
+ }
+
+ // If the allocation is an array of structures, consider transforming this
+ // into multiple malloc'd arrays, one for each field. This is basically
+ // SRoA for malloc'd memory.
+
+ if (Ordering != AtomicOrdering::NotAtomic)
+ return false;
+
+ // If this is an allocation of a fixed size array of structs, analyze as a
+ // variable size array. malloc [100 x struct],1 -> malloc struct, 100
+ if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
+ if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+ AllocTy = AT->getElementType();
+
+ StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
+ if (!AllocSTy)
+ return false;
+
+ // If the structure has an unreasonable number of fields, leave it
+ // alone.
+ if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
+ AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {
+
+ // If this is a fixed size array, transform the Malloc to be an alloc of
+ // structs. malloc [100 x struct],1 -> malloc struct, 100
+ if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
+ unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
+ Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
+ Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Instruction *Malloc =
+ CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements,
+ OpBundles, nullptr, CI->getName());
+ Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
+ CI->replaceAllUsesWith(Cast);
+ CI->eraseFromParent();
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
+ CI = cast<CallInst>(BCI->getOperand(0));
+ else
+ CI = cast<CallInst>(Malloc);
+ }
+
+ PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL,
+ TLI);
+ return true;
+ }
+
+ return false;
+}
+
+// Try to optimize globals based on the knowledge that only one value (besides
+// its initializer) is ever stored to the global.
+static bool
+optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
+ AtomicOrdering Ordering, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ // Ignore no-op GEPs and bitcasts.
+ StoredOnceVal = StoredOnceVal->stripPointerCasts();
+
+ // If we are dealing with a pointer global that is initialized to null and
+ // only has one (non-null) value stored into it, then we can optimize any
+ // users of the loaded value (often calls and loads) that would trap if the
+ // value was null.
+ if (GV->getInitializer()->getType()->isPointerTy() &&
+ GV->getInitializer()->isNullValue() &&
+ !NullPointerIsDefined(
+ nullptr /* F */,
+ GV->getInitializer()->getType()->getPointerAddressSpace())) {
+ if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
+ if (GV->getInitializer()->getType() != SOVC->getType())
+ SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+
+ // Optimize away any trapping uses of the loaded value.
+ if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI))
+ return true;
+ } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) {
+ auto *TLI = &GetTLI(*CI->getFunction());
+ Type *MallocType = getMallocAllocatedType(CI, TLI);
+ if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
+ Ordering, DL, TLI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// At this point, we have learned that the only two values ever stored into GV
+/// are its initializer and OtherVal. See if we can shrink the global into a
+/// boolean and select between the two values whenever it is used. This exposes
+/// the values to other scalar optimizations.
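+/// For example (hypothetical IR), if only 0 and 42 are ever stored to @g:
+///   @g = internal global i32 0        becomes  @g.b = internal global i1 false
+///   store i32 42, i32* @g             becomes  store i1 true, i1* @g.b
+///   %v = load i32, i32* @g            becomes  %b = load i1, i1* @g.b
+///                                              %v = select i1 %b, i32 42, i32 0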
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
+ Type *GVElType = GV->getValueType();
+
+ // If GVElType is already i1, it is already shrunk. If the type of the GV is
+ // an FP value, pointer or vector, don't do this optimization because a select
+ // between them is very expensive and unlikely to lead to later
+ // simplification. In these cases, we typically end up with "cond ? v1 : v2"
+ // where v1 and v2 both require constant pool loads, a big loss.
+ if (GVElType == Type::getInt1Ty(GV->getContext()) ||
+ GVElType->isFloatingPointTy() ||
+ GVElType->isPointerTy() || GVElType->isVectorTy())
+ return false;
+
+ // Walk the use list of the global seeing if all the uses are load or store.
+ // If there is anything else, bail out.
+ for (User *U : GV->users())
+ if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
+ return false;
+
+ LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
+
+ // Create the new global, initializing it to false.
+ GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
+ false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(GV->getContext()),
+ GV->getName()+".b",
+ GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);
+
+ Constant *InitVal = GV->getInitializer();
+ assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
+ "No reason to shrink to bool!");
+
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ GV->getDebugInfo(GVs);
+
+ // If initialized to zero and storing one into the global, we can use a cast
+ // instead of a select to synthesize the desired value.
+ bool IsOneZero = false;
+ bool EmitOneOrZero = true;
+ auto *CI = dyn_cast<ConstantInt>(OtherVal);
+ if (CI && CI->getValue().getActiveBits() <= 64) {
+ IsOneZero = InitVal->isNullValue() && CI->isOne();
+
+ auto *CIInit = dyn_cast<ConstantInt>(GV->getInitializer());
+ if (CIInit && CIInit->getValue().getActiveBits() <= 64) {
+ uint64_t ValInit = CIInit->getZExtValue();
+ uint64_t ValOther = CI->getZExtValue();
+ uint64_t ValMinus = ValOther - ValInit;
+
+ for(auto *GVe : GVs){
+ DIGlobalVariable *DGV = GVe->getVariable();
+ DIExpression *E = GVe->getExpression();
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ unsigned SizeInOctets =
+ DL.getTypeAllocSizeInBits(NewGV->getType()->getElementType()) / 8;
+
+ // It is expected that the address of the optimized global variable is
+ // on top of the stack. After optimization, the value of that variable
+ // will be either 0 for the initial value or 1 for the other value. The
+ // following expression should return a constant integer value depending
+ // on the value at the global object's address:
+ // val * (ValOther - ValInit) + ValInit:
+ // DW_OP_deref DW_OP_constu <ValMinus>
+ // DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
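+ // For instance, with ValInit == 0 and ValOther == 42, the expression
+ // evaluates to either 0 or 42 depending on the i1 stored in the new global.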
+ SmallVector<uint64_t, 12> Ops = {
+ dwarf::DW_OP_deref_size, SizeInOctets,
+ dwarf::DW_OP_constu, ValMinus,
+ dwarf::DW_OP_mul, dwarf::DW_OP_constu, ValInit,
+ dwarf::DW_OP_plus};
+ bool WithStackValue = true;
+ E = DIExpression::prependOpcodes(E, Ops, WithStackValue);
+ DIGlobalVariableExpression *DGVE =
+ DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
+ NewGV->addDebugInfo(DGVE);
+ }
+ EmitOneOrZero = false;
+ }
+ }
+
+ if (EmitOneOrZero) {
+ // FIXME: This will only emit the address for the debugger; the value
+ // written there will only be 0 or 1.
+ for(auto *GV : GVs)
+ NewGV->addDebugInfo(GV);
+ }
+
+ while (!GV->use_empty()) {
+ Instruction *UI = cast<Instruction>(GV->user_back());
+ if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ // Change the store into a boolean store.
+ bool StoringOther = SI->getOperand(0) == OtherVal;
+ // Only do this if we weren't storing a loaded value.
+ Value *StoreVal;
+ if (StoringOther || SI->getOperand(0) == InitVal) {
+ StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
+ StoringOther);
+ } else {
+ // Otherwise, we are storing a previously loaded copy. To do this,
+ // change the copy from copying the original value to just copying the
+ // bool.
+ Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
+
+ // If we've already replaced the input, StoredVal will be a cast or
+ // select instruction. If not, it will be a load of the original
+ // global.
+ if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+ assert(LI->getOperand(0) == GV && "Not a copy!");
+ // Insert a new load, to preserve the saved value.
+ StoreVal = new LoadInst(NewGV->getValueType(), NewGV,
+ LI->getName() + ".b", false, Align(1),
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
+ } else {
+ assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
+ "This is not a form that we understand!");
+ StoreVal = StoredVal->getOperand(0);
+ assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
+ }
+ }
+ StoreInst *NSI =
+ new StoreInst(StoreVal, NewGV, false, Align(1), SI->getOrdering(),
+ SI->getSyncScopeID(), SI);
+ NSI->setDebugLoc(SI->getDebugLoc());
+ } else {
+ // Change the load into a load of bool then a select.
+ LoadInst *LI = cast<LoadInst>(UI);
+ LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV,
+ LI->getName() + ".b", false, Align(1),
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
+ Instruction *NSI;
+ if (IsOneZero)
+ NSI = new ZExtInst(NLI, LI->getType(), "", LI);
+ else
+ NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
+ NSI->takeName(LI);
+ // Since LI is split into two instructions, NLI and NSI both inherit the
+ // same DebugLoc
+ NLI->setDebugLoc(LI->getDebugLoc());
+ NSI->setDebugLoc(LI->getDebugLoc());
+ LI->replaceAllUsesWith(NSI);
+ }
+ UI->eraseFromParent();
+ }
+
+ // Retain the name of the old global variable. People who are debugging their
+ // programs may expect these variables to be named the same.
+ NewGV->takeName(GV);
+ GV->eraseFromParent();
+ return true;
+}
+
+static bool deleteIfDead(
+ GlobalValue &GV, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+ GV.removeDeadConstantUsers();
+
+ if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
+ return false;
+
+ if (const Comdat *C = GV.getComdat())
+ if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C))
+ return false;
+
+ bool Dead;
+ if (auto *F = dyn_cast<Function>(&GV))
+ Dead = (F->isDeclaration() && F->use_empty()) || F->isDefTriviallyDead();
+ else
+ Dead = GV.use_empty();
+ if (!Dead)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
+ GV.eraseFromParent();
+ ++NumDeleted;
+ return true;
+}
+
+static bool isPointerValueDeadOnEntryToFunction(
+ const Function *F, GlobalValue *GV,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ // Find all uses of GV. We expect them all to be in F, and if we can't
+ // identify any of the uses we bail out.
+ //
+ // On each of these uses, identify if the memory that GV points to is
+ // used/required/live at the start of the function. If it is not, for example
+ // if the first thing the function does is store to the GV, the GV can
+ // possibly be demoted.
+ //
+ // We don't do an exhaustive search for memory operations - simply look
+ // through bitcasts as they're quite common and benign.
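+ // As a rough example (hypothetical C), consider a global used only as
+ // function-local scratch:
+ //   static int g;  void f() { g = compute(); ...; use(g); }
+ // Every load of g inside f is dominated by a store, so the memory is dead
+ // on entry to f and the global can (possibly) be demoted to a local alloca.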
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+ for (auto *U : GV->users()) {
+ if (Operator::getOpcode(U) == Instruction::BitCast) {
+ for (auto *UU : U->users()) {
+ if (auto *LI = dyn_cast<LoadInst>(UU))
+ Loads.push_back(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(UU))
+ Stores.push_back(SI);
+ else
+ return false;
+ }
+ continue;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+ assert(I->getParent()->getParent() == F);
+
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ Loads.push_back(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ Stores.push_back(SI);
+ else
+ return false;
+ }
+
+ // We have identified all uses of GV into loads and stores. Now check if all
+ // of them are known not to depend on the value of the global at the function
+ // entry point. We do this by ensuring that every load is dominated by at
+ // least one store.
+ auto &DT = LookupDomTree(*const_cast<Function *>(F));
+
+ // The below check is quadratic. Check we're not going to do too many tests.
+ // FIXME: Even though this will always have worst-case quadratic time, we
+ // could put effort into minimizing the average time by putting stores that
+ // have been shown to dominate at least one load at the beginning of the
+ // Stores array, making subsequent dominance checks more likely to succeed
+ // early.
+ //
+ // The threshold here is fairly large because global->local demotion is a
+ // very powerful optimization should it fire.
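+ // For instance, with a threshold of 100, a global with 10 loads and 10
+ // stores (100 dominance queries) is still examined, while 11 loads and 10
+ // stores would bail out immediately.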
+ const unsigned Threshold = 100;
+ if (Loads.size() * Stores.size() > Threshold)
+ return false;
+
+ for (auto *L : Loads) {
+ auto *LTy = L->getType();
+ if (none_of(Stores, [&](const StoreInst *S) {
+ auto *STy = S->getValueOperand()->getType();
+ // The load is only dominated by the store if DomTree says so
+ // and the number of bits loaded in L is less than or equal to
+ // the number of bits stored in S.
+ return DT.dominates(S, L) &&
DL.getTypeStoreSize(LTy).getFixedSize() <=
DL.getTypeStoreSize(STy).getFixedSize();
- }))
- return false;
- }
- // All loads have known dependences inside F, so the global can be localized.
- return true;
-}
-
-/// C may have non-instruction users. Can all of those users be turned into
-/// instructions?
-static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) {
- // We don't do this exhaustively. The most common pattern that we really need
- // to care about is a constant GEP or constant bitcast - so just looking
- // through one single ConstantExpr.
- //
- // The set of constants that this function returns true for must be able to be
- // handled by makeAllConstantUsesInstructions.
- for (auto *U : C->users()) {
- if (isa<Instruction>(U))
- continue;
- if (!isa<ConstantExpr>(U))
- // Non instruction, non-constantexpr user; cannot convert this.
- return false;
- for (auto *UU : U->users())
- if (!isa<Instruction>(UU))
- // A constantexpr used by another constant. We don't try and recurse any
- // further but just bail out at this point.
- return false;
- }
-
- return true;
-}
-
-/// C may have non-instruction users, and
-/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the
-/// non-instruction users to instructions.
-static void makeAllConstantUsesInstructions(Constant *C) {
- SmallVector<ConstantExpr*,4> Users;
- for (auto *U : C->users()) {
- if (isa<ConstantExpr>(U))
- Users.push_back(cast<ConstantExpr>(U));
- else
- // We should never get here; allNonInstructionUsersCanBeMadeInstructions
- // should not have returned true for C.
- assert(
- isa<Instruction>(U) &&
- "Can't transform non-constantexpr non-instruction to instruction!");
- }
-
- SmallVector<Value*,4> UUsers;
- for (auto *U : Users) {
- UUsers.clear();
+ }))
+ return false;
+ }
+ // All loads have known dependences inside F, so the global can be localized.
+ return true;
+}
+
+/// C may have non-instruction users. Can all of those users be turned into
+/// instructions?
+static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) {
+ // We don't do this exhaustively. The most common pattern that we really need
+ // to care about is a constant GEP or constant bitcast - so just looking
+ // through one single ConstantExpr.
+ //
+ // The set of constants that this function returns true for must be able to be
+ // handled by makeAllConstantUsesInstructions.
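+ // For instance (hypothetical IR), a user such as
+ //   store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @g, i64 0, i64 1)
+ // is a ConstantExpr GEP of @g whose only users are instructions, so it can
+ // later be turned into an explicit GEP instruction before each use.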
+ for (auto *U : C->users()) {
+ if (isa<Instruction>(U))
+ continue;
+ if (!isa<ConstantExpr>(U))
+ // Non instruction, non-constantexpr user; cannot convert this.
+ return false;
+ for (auto *UU : U->users())
+ if (!isa<Instruction>(UU))
+ // A constantexpr used by another constant. We don't try and recurse any
+ // further but just bail out at this point.
+ return false;
+ }
+
+ return true;
+}
+
+/// C may have non-instruction users, and
+/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the
+/// non-instruction users to instructions.
+static void makeAllConstantUsesInstructions(Constant *C) {
+ SmallVector<ConstantExpr*,4> Users;
+ for (auto *U : C->users()) {
+ if (isa<ConstantExpr>(U))
+ Users.push_back(cast<ConstantExpr>(U));
+ else
+ // We should never get here; allNonInstructionUsersCanBeMadeInstructions
+ // should not have returned true for C.
+ assert(
+ isa<Instruction>(U) &&
+ "Can't transform non-constantexpr non-instruction to instruction!");
+ }
+
+ SmallVector<Value*,4> UUsers;
+ for (auto *U : Users) {
+ UUsers.clear();
append_range(UUsers, U->users());
- for (auto *UU : UUsers) {
- Instruction *UI = cast<Instruction>(UU);
- Instruction *NewU = U->getAsInstruction();
- NewU->insertBefore(UI);
- UI->replaceUsesOfWith(U, NewU);
- }
- // We've replaced all the uses, so destroy the constant. (destroyConstant
- // will update value handles and metadata.)
- U->destroyConstant();
- }
-}
-
-/// Analyze the specified global variable and optimize
-/// it if possible. If we make a change, return true.
-static bool
-processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- auto &DL = GV->getParent()->getDataLayout();
- // If this is a first class global and has only one accessing function and
- // this function is non-recursive, we replace the global with a local alloca
- // in this function.
- //
- // NOTE: It doesn't make sense to promote non-single-value types since we
- // are just replacing static memory with stack memory.
- //
- // If the global is in different address space, don't bring it to stack.
- if (!GS.HasMultipleAccessingFunctions &&
- GS.AccessingFunction &&
- GV->getValueType()->isSingleValueType() &&
- GV->getType()->getAddressSpace() == 0 &&
- !GV->isExternallyInitialized() &&
- allNonInstructionUsersCanBeMadeInstructions(GV) &&
- GS.AccessingFunction->doesNotRecurse() &&
- isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
- LookupDomTree)) {
- const DataLayout &DL = GV->getParent()->getDataLayout();
-
- LLVM_DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
- Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
- ->getEntryBlock().begin());
- Type *ElemTy = GV->getValueType();
- // FIXME: Pass Global's alignment when globals have alignment
- AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
- GV->getName(), &FirstI);
- if (!isa<UndefValue>(GV->getInitializer()))
- new StoreInst(GV->getInitializer(), Alloca, &FirstI);
-
- makeAllConstantUsesInstructions(GV);
-
- GV->replaceAllUsesWith(Alloca);
- GV->eraseFromParent();
- ++NumLocalized;
- return true;
- }
-
+ for (auto *UU : UUsers) {
+ Instruction *UI = cast<Instruction>(UU);
+ Instruction *NewU = U->getAsInstruction();
+ NewU->insertBefore(UI);
+ UI->replaceUsesOfWith(U, NewU);
+ }
+ // We've replaced all the uses, so destroy the constant. (destroyConstant
+ // will update value handles and metadata.)
+ U->destroyConstant();
+ }
+}
+
+/// Analyze the specified global variable and optimize
+/// it if possible. If we make a change, return true.
+static bool
+processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ auto &DL = GV->getParent()->getDataLayout();
+ // If this is a first class global and has only one accessing function and
+ // this function is non-recursive, we replace the global with a local alloca
+ // in this function.
+ //
+ // NOTE: It doesn't make sense to promote non-single-value types since we
+ // are just replacing static memory with stack memory.
+ //
+ // If the global is in different address space, don't bring it to stack.
+ if (!GS.HasMultipleAccessingFunctions &&
+ GS.AccessingFunction &&
+ GV->getValueType()->isSingleValueType() &&
+ GV->getType()->getAddressSpace() == 0 &&
+ !GV->isExternallyInitialized() &&
+ allNonInstructionUsersCanBeMadeInstructions(GV) &&
+ GS.AccessingFunction->doesNotRecurse() &&
+ isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
+ LookupDomTree)) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+
+ LLVM_DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
+ Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
+ ->getEntryBlock().begin());
+ Type *ElemTy = GV->getValueType();
+ // FIXME: Pass Global's alignment when globals have alignment
+ AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
+ GV->getName(), &FirstI);
+ if (!isa<UndefValue>(GV->getInitializer()))
+ new StoreInst(GV->getInitializer(), Alloca, &FirstI);
+
+ makeAllConstantUsesInstructions(GV);
+
+ GV->replaceAllUsesWith(Alloca);
+ GV->eraseFromParent();
+ ++NumLocalized;
+ return true;
+ }
+
bool Changed = false;
- // If the global is never loaded (but may be stored to), it is dead.
- // Delete it now.
- if (!GS.IsLoaded) {
- LLVM_DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
-
- if (isLeakCheckerRoot(GV)) {
- // Delete any constant stores to the global.
- Changed = CleanupPointerRootUsers(GV, GetTLI);
- } else {
- // Delete any stores we can find to the global. We may not be able to
- // make it completely dead though.
- Changed =
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
- }
-
- // If the global is dead now, delete it.
- if (GV->use_empty()) {
- GV->eraseFromParent();
- ++NumDeleted;
- Changed = true;
- }
- return Changed;
-
- }
- if (GS.StoredType <= GlobalStatus::InitializerStored) {
- LLVM_DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
-
- // Don't actually mark a global constant if it's atomic because atomic loads
- // are implemented by a trivial cmpxchg in some edge-cases and that usually
- // requires write access to the variable even if it's not actually changed.
+ // If the global is never loaded (but may be stored to), it is dead.
+ // Delete it now.
+ if (!GS.IsLoaded) {
+ LLVM_DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
+
+ if (isLeakCheckerRoot(GV)) {
+ // Delete any constant stores to the global.
+ Changed = CleanupPointerRootUsers(GV, GetTLI);
+ } else {
+ // Delete any stores we can find to the global. We may not be able to
+ // make it completely dead though.
+ Changed =
+ CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+ }
+
+ // If the global is dead now, delete it.
+ if (GV->use_empty()) {
+ GV->eraseFromParent();
+ ++NumDeleted;
+ Changed = true;
+ }
+ return Changed;
+
+ }
+ if (GS.StoredType <= GlobalStatus::InitializerStored) {
+ LLVM_DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
+
+ // Don't actually mark a global constant if it's atomic because atomic loads
+ // are implemented by a trivial cmpxchg in some edge-cases and that usually
+ // requires write access to the variable even if it's not actually changed.
if (GS.Ordering == AtomicOrdering::NotAtomic) {
assert(!GV->isConstant() && "Expected a non-constant global");
- GV->setConstant(true);
+ GV->setConstant(true);
Changed = true;
}
-
- // Clean up any obviously simplifiable users now.
+
+ // Clean up any obviously simplifiable users now.
Changed |= CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
-
- // If the global is dead now, just nuke it.
- if (GV->use_empty()) {
- LLVM_DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
- << "all users and delete global!\n");
- GV->eraseFromParent();
- ++NumDeleted;
- return true;
- }
-
- // Fall through to the next check; see if we can optimize further.
- ++NumMarked;
- }
- if (!GV->getInitializer()->getType()->isSingleValueType()) {
- const DataLayout &DL = GV->getParent()->getDataLayout();
- if (SRAGlobal(GV, DL))
- return true;
- }
- if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
- // If the initial value for the global was an undef value, and if only
- // one other value was stored into it, we can just change the
- // initializer to be the stored value, then delete all stores to the
- // global. This allows us to mark it constant.
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
- if (isa<UndefValue>(GV->getInitializer())) {
- // Change the initial value here.
- GV->setInitializer(SOVConstant);
-
- // Clean up any obviously simplifiable users now.
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
-
- if (GV->use_empty()) {
- LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
- << "simplify all users and delete global!\n");
- GV->eraseFromParent();
- ++NumDeleted;
- }
- ++NumSubstitute;
- return true;
- }
-
- // Try to optimize globals based on the knowledge that only one value
- // (besides its initializer) is ever stored to the global.
- if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL,
- GetTLI))
- return true;
-
- // Otherwise, if the global was not a boolean, we can shrink it to be a
- // boolean.
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
- if (GS.Ordering == AtomicOrdering::NotAtomic) {
- if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
- ++NumShrunkToBool;
- return true;
- }
- }
- }
- }
-
+
+ // If the global is dead now, just nuke it.
+ if (GV->use_empty()) {
+ LLVM_DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
+ << "all users and delete global!\n");
+ GV->eraseFromParent();
+ ++NumDeleted;
+ return true;
+ }
+
+ // Fall through to the next check; see if we can optimize further.
+ ++NumMarked;
+ }
+ if (!GV->getInitializer()->getType()->isSingleValueType()) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ if (SRAGlobal(GV, DL))
+ return true;
+ }
+ if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
+ // If the initial value for the global was an undef value, and if only
+ // one other value was stored into it, we can just change the
+ // initializer to be the stored value, then delete all stores to the
+ // global. This allows us to mark it constant.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+ if (isa<UndefValue>(GV->getInitializer())) {
+ // Change the initial value here.
+ GV->setInitializer(SOVConstant);
+
+ // Clean up any obviously simplifiable users now.
+ CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+
+ if (GV->use_empty()) {
+ LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
+ << "simplify all users and delete global!\n");
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+ ++NumSubstitute;
+ return true;
+ }
+
+ // Try to optimize globals based on the knowledge that only one value
+ // (besides its initializer) is ever stored to the global.
+ if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL,
+ GetTLI))
+ return true;
+
+ // Otherwise, if the global was not a boolean, we can shrink it to be a
+ // boolean.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
+ if (GS.Ordering == AtomicOrdering::NotAtomic) {
+ if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+ ++NumShrunkToBool;
+ return true;
+ }
+ }
+ }
+ }
+
return Changed;
-}
-
-/// Analyze the specified global variable and optimize it if possible. If we
-/// make a change, return true.
-static bool
-processGlobal(GlobalValue &GV,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- if (GV.getName().startswith("llvm."))
- return false;
-
- GlobalStatus GS;
-
- if (GlobalStatus::analyzeGlobal(&GV, GS))
- return false;
-
- bool Changed = false;
- if (!GS.IsCompared && !GV.hasGlobalUnnamedAddr()) {
- auto NewUnnamedAddr = GV.hasLocalLinkage() ? GlobalValue::UnnamedAddr::Global
- : GlobalValue::UnnamedAddr::Local;
- if (NewUnnamedAddr != GV.getUnnamedAddr()) {
- GV.setUnnamedAddr(NewUnnamedAddr);
- NumUnnamed++;
- Changed = true;
- }
- }
-
- // Do more involved optimizations if the global is internal.
- if (!GV.hasLocalLinkage())
- return Changed;
-
- auto *GVar = dyn_cast<GlobalVariable>(&GV);
- if (!GVar)
- return Changed;
-
- if (GVar->isConstant() || !GVar->hasInitializer())
- return Changed;
-
- return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed;
-}
-
-/// Walk all of the direct calls of the specified function, changing them to
-/// FastCC.
-static void ChangeCalleesToFastCall(Function *F) {
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
- }
-}
-
-static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
- Attribute::AttrKind A) {
- unsigned AttrIndex;
- if (Attrs.hasAttrSomewhere(A, &AttrIndex))
- return Attrs.removeAttribute(C, AttrIndex, A);
- return Attrs;
-}
-
-static void RemoveAttribute(Function *F, Attribute::AttrKind A) {
- F->setAttributes(StripAttr(F->getContext(), F->getAttributes(), A));
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- CallBase *CB = cast<CallBase>(U);
- CB->setAttributes(StripAttr(F->getContext(), CB->getAttributes(), A));
- }
-}
-
-/// Return true if this is a calling convention that we'd like to change. The
-/// idea here is that we don't want to mess with the convention if the user
-/// explicitly requested something with performance implications like coldcc,
-/// GHC, or anyregcc.
-static bool hasChangeableCC(Function *F) {
- CallingConv::ID CC = F->getCallingConv();
-
- // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
- if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall)
- return false;
-
- // FIXME: Change CC for the whole chain of musttail calls when possible.
- //
- // Can't change the CC of a function that either has musttail calls or is a
- // musttail callee itself.
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- CallInst* CI = dyn_cast<CallInst>(U);
- if (!CI)
- continue;
-
- if (CI->isMustTailCall())
- return false;
- }
-
- for (BasicBlock &BB : *F)
- if (BB.getTerminatingMustTailCall())
- return false;
-
- return true;
-}
-
-/// Return true if the block containing the call site has a BlockFrequency of
-/// less than ColdCCRelFreq% of the entry block.
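-/// For example, with ColdCCRelFreq == 2, a call site whose block frequency is
-/// below 2% of its caller's entry-block frequency is treated as cold.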
-static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI) {
- const BranchProbability ColdProb(ColdCCRelFreq, 100);
- auto *CallSiteBB = CB.getParent();
- auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
- auto CallerEntryFreq =
- CallerBFI.getBlockFreq(&(CB.getCaller()->getEntryBlock()));
- return CallSiteFreq < CallerEntryFreq * ColdProb;
-}
-
-// This function checks if the input function F is cold at all call sites. It
-// also looks at each call site's containing function, returning false if the
-// caller function contains other non-cold calls. The input vector AllCallsCold
-// contains a list of functions that only have call sites in cold blocks.
-static bool
-isValidCandidateForColdCC(Function &F,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- const std::vector<Function *> &AllCallsCold) {
-
- if (F.user_empty())
- return false;
-
- for (User *U : F.users()) {
- if (isa<BlockAddress>(U))
- continue;
-
- CallBase &CB = cast<CallBase>(*U);
- Function *CallerFunc = CB.getParent()->getParent();
- BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
- if (!isColdCallSite(CB, CallerBFI))
- return false;
+}
+
+/// Analyze the specified global variable and optimize it if possible. If we
+/// make a change, return true.
+static bool
+processGlobal(GlobalValue &GV,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ if (GV.getName().startswith("llvm."))
+ return false;
+
+ GlobalStatus GS;
+
+ if (GlobalStatus::analyzeGlobal(&GV, GS))
+ return false;
+
+ bool Changed = false;
+ if (!GS.IsCompared && !GV.hasGlobalUnnamedAddr()) {
+ auto NewUnnamedAddr = GV.hasLocalLinkage() ? GlobalValue::UnnamedAddr::Global
+ : GlobalValue::UnnamedAddr::Local;
+ if (NewUnnamedAddr != GV.getUnnamedAddr()) {
+ GV.setUnnamedAddr(NewUnnamedAddr);
+ NumUnnamed++;
+ Changed = true;
+ }
+ }
+
+ // Do more involved optimizations if the global is internal.
+ if (!GV.hasLocalLinkage())
+ return Changed;
+
+ auto *GVar = dyn_cast<GlobalVariable>(&GV);
+ if (!GVar)
+ return Changed;
+
+ if (GVar->isConstant() || !GVar->hasInitializer())
+ return Changed;
+
+ return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed;
+}
+
+/// Walk all of the direct calls of the specified function, changing them to
+/// FastCC.
+static void ChangeCalleesToFastCall(Function *F) {
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
+ }
+}
+
+static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
+ Attribute::AttrKind A) {
+ unsigned AttrIndex;
+ if (Attrs.hasAttrSomewhere(A, &AttrIndex))
+ return Attrs.removeAttribute(C, AttrIndex, A);
+ return Attrs;
+}
+
+static void RemoveAttribute(Function *F, Attribute::AttrKind A) {
+ F->setAttributes(StripAttr(F->getContext(), F->getAttributes(), A));
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallBase *CB = cast<CallBase>(U);
+ CB->setAttributes(StripAttr(F->getContext(), CB->getAttributes(), A));
+ }
+}
+
+/// Return true if this is a calling convention that we'd like to change. The
+/// idea here is that we don't want to mess with the convention if the user
+/// explicitly requested something with performance implications like coldcc,
+/// GHC, or anyregcc.
+static bool hasChangeableCC(Function *F) {
+ CallingConv::ID CC = F->getCallingConv();
+
+ // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
+ if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall)
+ return false;
+
+ // FIXME: Change CC for the whole chain of musttail calls when possible.
+ //
+ // Can't change the CC of a function that either has musttail calls or is a
+ // musttail callee itself.
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallInst* CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ if (CI->isMustTailCall())
+ return false;
+ }
+
+ for (BasicBlock &BB : *F)
+ if (BB.getTerminatingMustTailCall())
+ return false;
+
+ return true;
+}
+
+/// Return true if the block containing the call site has a BlockFrequency of
+/// less than ColdCCRelFreq% of the entry block.
+static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI) {
+ const BranchProbability ColdProb(ColdCCRelFreq, 100);
+ auto *CallSiteBB = CB.getParent();
+ auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
+ auto CallerEntryFreq =
+ CallerBFI.getBlockFreq(&(CB.getCaller()->getEntryBlock()));
+ return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
+
+// This function checks if the input function F is cold at all call sites. It
+// also looks at each call site's containing function, returning false if the
+// caller function contains other non-cold calls. The input vector AllCallsCold
+// contains a list of functions that only have call sites in cold blocks.
+static bool
+isValidCandidateForColdCC(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ const std::vector<Function *> &AllCallsCold) {
+
+ if (F.user_empty())
+ return false;
+
+ for (User *U : F.users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+
+ CallBase &CB = cast<CallBase>(*U);
+ Function *CallerFunc = CB.getParent()->getParent();
+ BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
+ if (!isColdCallSite(CB, CallerBFI))
+ return false;
if (!llvm::is_contained(AllCallsCold, CallerFunc))
- return false;
- }
- return true;
-}
-
-static void changeCallSitesToColdCC(Function *F) {
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
- }
-}
-
-// This function iterates over all the call instructions in the input Function
-// and checks that all call sites are in cold blocks and are allowed to use the
-// coldcc calling convention.
-static bool
-hasOnlyColdCalls(Function &F,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- // Skip over inline asm instructions since they aren't function calls.
- if (CI->isInlineAsm())
- continue;
- Function *CalledFn = CI->getCalledFunction();
- if (!CalledFn)
- return false;
- if (!CalledFn->hasLocalLinkage())
- return false;
- // Skip over intrinsics since they won't remain as function calls.
- if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
- continue;
- // Check if it's valid to use coldcc calling convention.
- if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
- CalledFn->hasAddressTaken())
- return false;
- BlockFrequencyInfo &CallerBFI = GetBFI(F);
- if (!isColdCallSite(*CI, CallerBFI))
- return false;
- }
- }
- }
- return true;
-}
-
-static bool hasMustTailCallers(Function *F) {
- for (User *U : F->users()) {
- CallBase *CB = dyn_cast<CallBase>(U);
- if (!CB) {
- assert(isa<BlockAddress>(U) &&
- "Expected either CallBase or BlockAddress");
- continue;
- }
- if (CB->isMustTailCall())
- return true;
- }
- return false;
-}
-
-static bool hasInvokeCallers(Function *F) {
- for (User *U : F->users())
- if (isa<InvokeInst>(U))
- return true;
- return false;
-}
-
-static void RemovePreallocated(Function *F) {
- RemoveAttribute(F, Attribute::Preallocated);
-
- auto *M = F->getParent();
-
- IRBuilder<> Builder(M->getContext());
-
- // Cannot modify users() while iterating over it, so make a copy.
- SmallVector<User *, 4> PreallocatedCalls(F->users());
- for (User *U : PreallocatedCalls) {
- CallBase *CB = dyn_cast<CallBase>(U);
- if (!CB)
- continue;
-
- assert(
- !CB->isMustTailCall() &&
- "Shouldn't call RemotePreallocated() on a musttail preallocated call");
- // Create copy of call without "preallocated" operand bundle.
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB->getOperandBundlesAsDefs(OpBundles);
- CallBase *PreallocatedSetup = nullptr;
- for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
- if (It->getTag() == "preallocated") {
- PreallocatedSetup = cast<CallBase>(*It->input_begin());
- OpBundles.erase(It);
- break;
- }
- }
- assert(PreallocatedSetup && "Did not find preallocated bundle");
- uint64_t ArgCount =
- cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();
-
- assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
- "Unknown indirect call type");
- CallBase *NewCB = CallBase::Create(CB, OpBundles, CB);
- CB->replaceAllUsesWith(NewCB);
- NewCB->takeName(CB);
- CB->eraseFromParent();
-
- Builder.SetInsertPoint(PreallocatedSetup);
- auto *StackSave =
- Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
-
- Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
- Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
- StackSave);
-
- // Replace @llvm.call.preallocated.arg() with alloca.
- // Cannot modify users() while iterating over it, so make a copy.
- // @llvm.call.preallocated.arg() can be called with the same index multiple
- // times. So for each @llvm.call.preallocated.arg(), we see if we have
- // already created a Value* for the index, and if not, create an alloca and
- // bitcast right after the @llvm.call.preallocated.setup() so that it
- // dominates all uses.
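- // Roughly (hypothetical IR), a pair such as
- //   %t = call token @llvm.call.preallocated.setup(i32 1)
- //   %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%ty)
- // becomes an alloca of %ty (bitcast to i8*) placed right after the setup
- // call so that it dominates every use of %a.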
- SmallVector<Value *, 2> ArgAllocas(ArgCount);
- SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
- for (auto *User : PreallocatedArgs) {
- auto *UseCall = cast<CallBase>(User);
- assert(UseCall->getCalledFunction()->getIntrinsicID() ==
- Intrinsic::call_preallocated_arg &&
- "preallocated token use was not a llvm.call.preallocated.arg");
- uint64_t AllocArgIndex =
- cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
- Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
- if (!AllocaReplacement) {
- auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
- auto *ArgType = UseCall
- ->getAttribute(AttributeList::FunctionIndex,
- Attribute::Preallocated)
- .getValueAsType();
- auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
- Builder.SetInsertPoint(InsertBefore);
- auto *Alloca =
- Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
- auto *BitCast = Builder.CreateBitCast(
- Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
- ArgAllocas[AllocArgIndex] = BitCast;
- AllocaReplacement = BitCast;
- }
-
- UseCall->replaceAllUsesWith(AllocaReplacement);
- UseCall->eraseFromParent();
- }
- // Remove @llvm.call.preallocated.setup().
- cast<Instruction>(PreallocatedSetup)->eraseFromParent();
- }
-}
-
-static bool
-OptimizeFunctions(Module &M,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<TargetTransformInfo &(Function &)> GetTTI,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
-
- bool Changed = false;
-
- std::vector<Function *> AllCallsCold;
- for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
- Function *F = &*FI++;
- if (hasOnlyColdCalls(*F, GetBFI))
- AllCallsCold.push_back(F);
- }
-
- // Optimize functions.
- for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
- Function *F = &*FI++;
-
- // Don't perform global opt pass on naked functions; we don't want fast
- // calling conventions for naked functions.
- if (F->hasFnAttribute(Attribute::Naked))
- continue;
-
- // Functions without names cannot be referenced outside this module.
- if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
- F->setLinkage(GlobalValue::InternalLinkage);
-
- if (deleteIfDead(*F, NotDiscardableComdats)) {
- Changed = true;
- continue;
- }
-
- // LLVM's definition of dominance allows instructions that are cyclic
- // in unreachable blocks, e.g.:
- // %pat = select i1 %condition, @global, i16* %pat
- // because any instruction dominates an instruction in a block that's
- // not reachable from entry.
- // So, remove unreachable blocks from the function, because a) there's
- // no point in analyzing them and b) GlobalOpt should otherwise grow
- // some more complicated logic to break these cycles.
- // Removing unreachable blocks might invalidate the dominator so we
- // recalculate it.
- if (!F->isDeclaration()) {
- if (removeUnreachableBlocks(*F)) {
- auto &DT = LookupDomTree(*F);
- DT.recalculate(*F);
- Changed = true;
- }
- }
-
- Changed |= processGlobal(*F, GetTLI, LookupDomTree);
-
- if (!F->hasLocalLinkage())
- continue;
-
- // If we have an inalloca parameter that we can safely remove the
- // inalloca attribute from, do so. This unlocks optimizations that
- // wouldn't be safe in the presence of inalloca.
- // FIXME: We should also hoist alloca affected by this to the entry
- // block if possible.
- if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
- !F->hasAddressTaken() && !hasMustTailCallers(F)) {
- RemoveAttribute(F, Attribute::InAlloca);
- Changed = true;
- }
-
- // FIXME: handle invokes
- // FIXME: handle musttail
- if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
- if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
- !hasInvokeCallers(F)) {
- RemovePreallocated(F);
- Changed = true;
- }
- continue;
- }
-
- if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
- NumInternalFunc++;
- TargetTransformInfo &TTI = GetTTI(*F);
- // Change the calling convention to coldcc if either stress testing is
- // enabled or the target would like to use coldcc on functions which are
- // cold at all call sites and the callers contain no other non coldcc
- // calls.
- if (EnableColdCCStressTest ||
- (TTI.useColdCCForColdCall(*F) &&
- isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) {
- F->setCallingConv(CallingConv::Cold);
- changeCallSitesToColdCC(F);
- Changed = true;
- NumColdCC++;
- }
- }
-
- if (hasChangeableCC(F) && !F->isVarArg() &&
- !F->hasAddressTaken()) {
- // If this function has a calling convention worth changing, is not a
- // varargs function, and is only called directly, promote it to use the
- // Fast calling convention.
- F->setCallingConv(CallingConv::Fast);
- ChangeCalleesToFastCall(F);
- ++NumFastCallFns;
- Changed = true;
- }
-
- if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
- !F->hasAddressTaken()) {
- // The function is not used by a trampoline intrinsic, so it is safe
- // to remove the 'nest' attribute.
- RemoveAttribute(F, Attribute::Nest);
- ++NumNestRemoved;
- Changed = true;
- }
- }
- return Changed;
-}
-
-static bool
-OptimizeGlobalVars(Module &M,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
- bool Changed = false;
-
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
- // Global variables without names cannot be referenced outside this module.
- if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
- GV->setLinkage(GlobalValue::InternalLinkage);
- // Simplify the initializer.
- if (GV->hasInitializer())
- if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
- auto &DL = M.getDataLayout();
- // TLI is not used in the case of a Constant, so use default nullptr
- // for that optional parameter, since we don't have a Function to
- // provide GetTLI anyway.
- Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr);
- if (New != C)
- GV->setInitializer(New);
- }
-
- if (deleteIfDead(*GV, NotDiscardableComdats)) {
- Changed = true;
- continue;
- }
-
- Changed |= processGlobal(*GV, GetTLI, LookupDomTree);
- }
- return Changed;
-}
-
-/// Evaluate a piece of a constantexpr store into a global initializer. This
-/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
-/// GEP operands of Addr [0, OpNo) have been stepped into.
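-/// For example (hypothetically), storing the constant 7 through
-///   getelementptr ([2 x [3 x i32]], [2 x [3 x i32]]* @g, i64 0, i64 1, i64 2)
-/// rebuilds @g's initializer with element [1][2] replaced by 7.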
-static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
- ConstantExpr *Addr, unsigned OpNo) {
- // Base case of the recursion.
- if (OpNo == Addr->getNumOperands()) {
- assert(Val->getType() == Init->getType() && "Type mismatch!");
- return Val;
- }
-
- SmallVector<Constant*, 32> Elts;
- if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
- // Break up the constant into its elements.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- // Replace the element that we are supposed to.
- ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
- unsigned Idx = CU->getZExtValue();
- assert(Idx < STy->getNumElements() && "Struct index out of range!");
- Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
-
- // Return the modified struct.
- return ConstantStruct::get(STy, Elts);
- }
-
- ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
- uint64_t NumElts;
- if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();
-
- // Break up the array into elements.
- for (uint64_t i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- assert(CI->getZExtValue() < NumElts);
- Elts[CI->getZExtValue()] =
- EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
-
- if (Init->getType()->isArrayTy())
- return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
- return ConstantVector::get(Elts);
-}
-
-/// We have decided that Addr (which satisfies the predicate
-/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
-static void CommitValueTo(Constant *Val, Constant *Addr) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- assert(GV->hasInitializer());
- GV->setInitializer(Val);
- return;
- }
-
- ConstantExpr *CE = cast<ConstantExpr>(Addr);
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
-}
-
-/// Given a map of address -> value, where addresses are expected to be some form
-/// of either a global or a constant GEP, set the initializer for the address to
-/// be the value. This performs mostly the same function as CommitValueTo()
-/// and EvaluateStoreInto() but is optimized to be more efficient for the common
-/// case where the set of addresses are GEPs sharing the same underlying global,
-/// processing the GEPs in batches rather than individually.
-///
-/// To give an example, consider the following C++ code adapted from the clang
-/// regression tests:
-/// struct S {
-/// int n = 10;
-/// int m = 2 * n;
-/// S(int a) : n(a) {}
-/// };
-///
-/// template<typename T>
-/// struct U {
-/// T *r = &q;
-/// T q = 42;
-/// U *p = this;
-/// };
-///
-/// U<S> e;
-///
-/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
-/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm'
-/// members. This batch algorithm will simply use general CommitValueTo() method
-/// to handle the complex nested S struct initialization of 'q', before
-/// processing the outermost members in a single batch. Using CommitValueTo() to
-/// handle members in the outer struct is inefficient when the struct/array is
-/// very large, as we end up creating and destroying constant arrays for each
-/// initialization.
-/// For the above case, we expect the following IR to be generated:
-///
-/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
-/// %struct.S = type { i32, i32 }
-/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
-/// i64 0, i32 1),
-/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
-/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
-/// constant expression, while the other two elements of @e are "simple".
-static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
- SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
- SimpleCEs.reserve(Mem.size());
-
- for (const auto &I : Mem) {
- if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
- GVs.push_back(std::make_pair(GV, I.second));
- } else {
- ConstantExpr *GEP = cast<ConstantExpr>(I.first);
- // We don't handle the deeply recursive case using the batch method.
- if (GEP->getNumOperands() > 3)
- ComplexCEs.push_back(std::make_pair(GEP, I.second));
- else
- SimpleCEs.push_back(std::make_pair(GEP, I.second));
- }
- }
-
- // The algorithm below doesn't handle cases like nested structs, so use the
- // slower fully general method if we have to.
- for (auto ComplexCE : ComplexCEs)
- CommitValueTo(ComplexCE.second, ComplexCE.first);
-
- for (auto GVPair : GVs) {
- assert(GVPair.first->hasInitializer());
- GVPair.first->setInitializer(GVPair.second);
- }
-
- if (SimpleCEs.empty())
- return;
-
- // We cache a single global's initializer elements in the case where the
- // subsequent address/val pair uses the same one. This avoids throwing away and
- // rebuilding the constant struct/vector/array just because one element is
- // modified at a time.
- SmallVector<Constant *, 32> Elts;
- Elts.reserve(SimpleCEs.size());
- GlobalVariable *CurrentGV = nullptr;
-
- auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- if (Update) {
- if (CurrentGV) {
- assert(CurrentGV && "Expected a GV to commit to!");
- Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
- // We have a valid cache that needs to be committed.
- if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
- else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
- else
- CurrentGV->setInitializer(ConstantVector::get(Elts));
- }
- if (CurrentGV == GV)
- return;
- // Need to clear and set up cache for new initializer.
- CurrentGV = GV;
- Elts.clear();
- unsigned NumElts;
- if (auto *STy = dyn_cast<StructType>(Ty))
- NumElts = STy->getNumElements();
- else if (auto *ATy = dyn_cast<ArrayType>(Ty))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Ty)->getNumElements();
- for (unsigned i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
- }
- };
-
- for (auto CEPair : SimpleCEs) {
- ConstantExpr *GEP = CEPair.first;
- Constant *Val = CEPair.second;
-
- GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
- commitAndSetupCache(GV, GV != CurrentGV);
- ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
- Elts[CI->getZExtValue()] = Val;
- }
- // The last initializer in the list needs to be committed, others
- // will be committed on a new initializer being processed.
- commitAndSetupCache(CurrentGV, true);
-}
-
-/// Evaluate static constructors in the function, if we can. Return true if we
-/// can, false otherwise.
-static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- // Call the function.
- Evaluator Eval(DL, TLI);
- Constant *RetValDummy;
- bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
- SmallVector<Constant*, 0>());
-
- if (EvalSuccess) {
- ++NumCtorsEvaluated;
-
- // We succeeded at evaluation: commit the result.
- LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
- << F->getName() << "' to "
- << Eval.getMutatedMemory().size() << " stores.\n");
- BatchCommitValueTo(Eval.getMutatedMemory());
- for (GlobalVariable *GV : Eval.getInvariants())
- GV->setConstant(true);
- }
-
- return EvalSuccess;
-}
-
-static int compareNames(Constant *const *A, Constant *const *B) {
- Value *AStripped = (*A)->stripPointerCasts();
- Value *BStripped = (*B)->stripPointerCasts();
- return AStripped->getName().compare(BStripped->getName());
-}
-
-static void setUsedInitializer(GlobalVariable &V,
- const SmallPtrSetImpl<GlobalValue *> &Init) {
- if (Init.empty()) {
- V.eraseFromParent();
- return;
- }
-
- // Type of pointer to the array of pointers.
- PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);
-
- SmallVector<Constant *, 8> UsedArray;
- for (GlobalValue *GV : Init) {
- Constant *Cast
- = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
- UsedArray.push_back(Cast);
- }
- // Sort to get deterministic order.
- array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames);
- ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size());
-
- Module *M = V.getParent();
- V.removeFromParent();
- GlobalVariable *NV =
- new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,
- ConstantArray::get(ATy, UsedArray), "");
- NV->takeName(&V);
- NV->setSection("llvm.metadata");
- delete &V;
-}
-
-namespace {
-
-/// An easy to access representation of llvm.used and llvm.compiler.used.
-class LLVMUsed {
- SmallPtrSet<GlobalValue *, 8> Used;
- SmallPtrSet<GlobalValue *, 8> CompilerUsed;
- GlobalVariable *UsedV;
- GlobalVariable *CompilerUsedV;
-
-public:
- LLVMUsed(Module &M) {
- UsedV = collectUsedGlobalVariables(M, Used, false);
- CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
- }
-
- using iterator = SmallPtrSet<GlobalValue *, 8>::iterator;
- using used_iterator_range = iterator_range<iterator>;
-
- iterator usedBegin() { return Used.begin(); }
- iterator usedEnd() { return Used.end(); }
-
- used_iterator_range used() {
- return used_iterator_range(usedBegin(), usedEnd());
- }
-
- iterator compilerUsedBegin() { return CompilerUsed.begin(); }
- iterator compilerUsedEnd() { return CompilerUsed.end(); }
-
- used_iterator_range compilerUsed() {
- return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
- }
-
- bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
-
- bool compilerUsedCount(GlobalValue *GV) const {
- return CompilerUsed.count(GV);
- }
-
- bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
- bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
- bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
-
- bool compilerUsedInsert(GlobalValue *GV) {
- return CompilerUsed.insert(GV).second;
- }
-
- void syncVariablesAndSets() {
- if (UsedV)
- setUsedInitializer(*UsedV, Used);
- if (CompilerUsedV)
- setUsedInitializer(*CompilerUsedV, CompilerUsed);
- }
-};
-
-} // end anonymous namespace
-
-static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
- if (GA.use_empty()) // No use at all.
- return false;
-
- assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) &&
- "We should have removed the duplicated "
- "element from llvm.compiler.used");
- if (!GA.hasOneUse())
- // Strictly more than one use. So at least one is not in llvm.used and
- // llvm.compiler.used.
- return true;
-
- // Exactly one use. Check if it is in llvm.used or llvm.compiler.used.
- return !U.usedCount(&GA) && !U.compilerUsedCount(&GA);
-}
-
-static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V,
- const LLVMUsed &U) {
- unsigned N = 2;
- assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) &&
- "We should have removed the duplicated "
- "element from llvm.compiler.used");
- if (U.usedCount(&V) || U.compilerUsedCount(&V))
- ++N;
- return V.hasNUsesOrMore(N);
-}
-
-static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) {
- if (!GA.hasLocalLinkage())
- return true;
-
- return U.usedCount(&GA) || U.compilerUsedCount(&GA);
-}
-
-static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
- bool &RenameTarget) {
- RenameTarget = false;
- bool Ret = false;
- if (hasUseOtherThanLLVMUsed(GA, U))
- Ret = true;
-
- // If the alias is externally visible, we may still be able to simplify it.
- if (!mayHaveOtherReferences(GA, U))
- return Ret;
-
- // If the aliasee has internal linkage, give it the name and linkage
- // of the alias, and delete the alias. This turns:
- // define internal ... @f(...)
- // @a = alias ... @f
- // into:
- // define ... @a(...)
- Constant *Aliasee = GA.getAliasee();
- GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
- if (!Target->hasLocalLinkage())
- return Ret;
-
- // Do not perform the transform if multiple aliases potentially target the
- // aliasee. This check also ensures that it is safe to replace the section
- // and other attributes of the aliasee with those of the alias.
- if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U))
- return Ret;
-
- RenameTarget = true;
- return true;
-}
-
-static bool
-OptimizeGlobalAliases(Module &M,
- SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
- bool Changed = false;
- LLVMUsed Used(M);
-
- for (GlobalValue *GV : Used.used())
- Used.compilerUsedErase(GV);
-
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E;) {
- GlobalAlias *J = &*I++;
-
- // Aliases without names cannot be referenced outside this module.
- if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
- J->setLinkage(GlobalValue::InternalLinkage);
-
- if (deleteIfDead(*J, NotDiscardableComdats)) {
- Changed = true;
- continue;
- }
-
- // If the alias can change at link time, nothing can be done - bail out.
- if (J->isInterposable())
- continue;
-
- Constant *Aliasee = J->getAliasee();
- GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
- // We can't trivially replace the alias with the aliasee if the aliasee is
- // non-trivial in some way.
- // TODO: Try to handle non-zero GEPs of local aliasees.
- if (!Target)
- continue;
- Target->removeDeadConstantUsers();
-
- // Make all users of the alias use the aliasee instead.
- bool RenameTarget;
- if (!hasUsesToReplace(*J, Used, RenameTarget))
- continue;
-
- J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType()));
- ++NumAliasesResolved;
- Changed = true;
-
- if (RenameTarget) {
- // Give the aliasee the name, linkage and other attributes of the alias.
- Target->takeName(&*J);
- Target->setLinkage(J->getLinkage());
- Target->setDSOLocal(J->isDSOLocal());
- Target->setVisibility(J->getVisibility());
- Target->setDLLStorageClass(J->getDLLStorageClass());
-
- if (Used.usedErase(&*J))
- Used.usedInsert(Target);
-
- if (Used.compilerUsedErase(&*J))
- Used.compilerUsedInsert(Target);
- } else if (mayHaveOtherReferences(*J, Used))
- continue;
-
- // Delete the alias.
- M.getAliasList().erase(J);
- ++NumAliasesRemoved;
- Changed = true;
- }
-
- Used.syncVariablesAndSets();
-
- return Changed;
-}
-
-static Function *
-FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
-  // Hack to get a default TLI before we have an actual Function.
- auto FuncIter = M.begin();
- if (FuncIter == M.end())
- return nullptr;
- auto *TLI = &GetTLI(*FuncIter);
-
- LibFunc F = LibFunc_cxa_atexit;
- if (!TLI->has(F))
- return nullptr;
-
- Function *Fn = M.getFunction(TLI->getName(F));
- if (!Fn)
- return nullptr;
-
- // Now get the actual TLI for Fn.
- TLI = &GetTLI(*Fn);
-
- // Make sure that the function has the correct prototype.
- if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
- return nullptr;
-
- return Fn;
-}
-
-/// Returns whether the given function is an empty C++ destructor and can
-/// therefore be eliminated.
-/// Note that we assume that other optimization passes have already simplified
-/// the code so we simply check for 'ret'.
-static bool cxxDtorIsEmpty(const Function &Fn) {
- // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
- // nounwind, but that doesn't seem worth doing.
- if (Fn.isDeclaration())
- return false;
-
- for (auto &I : Fn.getEntryBlock()) {
- if (isa<DbgInfoIntrinsic>(I))
- continue;
- if (isa<ReturnInst>(I))
- return true;
- break;
- }
- return false;
-}
-
-static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
- /// Itanium C++ ABI p3.3.5:
- ///
- /// After constructing a global (or local static) object, that will require
- /// destruction on exit, a termination function is registered as follows:
- ///
- /// extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d );
- ///
- /// This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the
- /// call f(p) when DSO d is unloaded, before all such termination calls
- /// registered before this one. It returns zero if registration is
- /// successful, nonzero on failure.
-
- // This pass will look for calls to __cxa_atexit where the function is trivial
- // and remove them.
- bool Changed = false;
-
- for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end();
- I != E;) {
- // We're only interested in calls. Theoretically, we could handle invoke
- // instructions as well, but neither llvm-gcc nor clang generate invokes
- // to __cxa_atexit.
- CallInst *CI = dyn_cast<CallInst>(*I++);
- if (!CI)
- continue;
-
- Function *DtorFn =
- dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
- if (!DtorFn || !cxxDtorIsEmpty(*DtorFn))
- continue;
-
- // Just remove the call.
- CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
- CI->eraseFromParent();
-
- ++NumCXXDtorsRemoved;
-
- Changed |= true;
- }
-
- return Changed;
-}
-
-static bool optimizeGlobalsInModule(
- Module &M, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<TargetTransformInfo &(Function &)> GetTTI,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- SmallPtrSet<const Comdat *, 8> NotDiscardableComdats;
- bool Changed = false;
- bool LocalChange = true;
- while (LocalChange) {
- LocalChange = false;
-
- NotDiscardableComdats.clear();
- for (const GlobalVariable &GV : M.globals())
- if (const Comdat *C = GV.getComdat())
- if (!GV.isDiscardableIfUnused() || !GV.use_empty())
- NotDiscardableComdats.insert(C);
- for (Function &F : M)
- if (const Comdat *C = F.getComdat())
- if (!F.isDefTriviallyDead())
- NotDiscardableComdats.insert(C);
- for (GlobalAlias &GA : M.aliases())
- if (const Comdat *C = GA.getComdat())
- if (!GA.isDiscardableIfUnused() || !GA.use_empty())
- NotDiscardableComdats.insert(C);
-
- // Delete functions that are trivially dead, ccc -> fastcc
- LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree,
- NotDiscardableComdats);
-
- // Optimize global_ctors list.
- LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
- return EvaluateStaticConstructor(F, DL, &GetTLI(*F));
- });
-
- // Optimize non-address-taken globals.
- LocalChange |=
- OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats);
-
- // Resolve aliases, when possible.
- LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);
-
- // Try to remove trivial global destructors if they are not removed
- // already.
- Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI);
- if (CXAAtExitFn)
- LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
-
- Changed |= LocalChange;
- }
-
- // TODO: Move all global ctors functions to the end of the module for code
- // layout.
-
- return Changed;
-}
-
-PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
- auto &DL = M.getDataLayout();
- auto &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
- return FAM.getResult<DominatorTreeAnalysis>(F);
- };
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-struct GlobalOptLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- GlobalOptLegacyPass() : ModulePass(ID) {
- initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- auto &DL = M.getDataLayout();
- auto LookupDomTree = [this](Function &F) -> DominatorTree & {
- return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
- return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- };
-
- auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
- return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
-
- return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI,
- LookupDomTree);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char GlobalOptLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
- "Global Variable Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
- "Global Variable Optimizer", false, false)
-
-ModulePass *llvm::createGlobalOptimizerPass() {
- return new GlobalOptLegacyPass();
-}
+ return false;
+ }
+ return true;
+}
+
+static void changeCallSitesToColdCC(Function *F) {
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
+ }
+}
+
+// This function iterates over all the call instructions in the input Function
+// and checks that all call sites are in cold blocks and are allowed to use the
+// coldcc calling convention.
+static bool
+hasOnlyColdCalls(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        // Skip over inline asm instructions since they aren't function calls.
+ if (CI->isInlineAsm())
+ continue;
+ Function *CalledFn = CI->getCalledFunction();
+ if (!CalledFn)
+ return false;
+ if (!CalledFn->hasLocalLinkage())
+ return false;
+        // Skip over intrinsics since they won't remain as function calls.
+ if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
+ continue;
+ // Check if it's valid to use coldcc calling convention.
+ if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
+ CalledFn->hasAddressTaken())
+ return false;
+ BlockFrequencyInfo &CallerBFI = GetBFI(F);
+ if (!isColdCallSite(*CI, CallerBFI))
+ return false;
+ }
+ }
+ }
+ return true;
+}
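For illustration, a minimal and hypothetical C++ sketch of the pattern the coldcc logic above looks for: an internal, non-variadic function whose address is never taken and whose only call site sits on a branch hinted (here via __builtin_expect) as cold. The names are invented, and whether the calling convention actually changes depends on the target's useColdCCForColdCall() answer and on BlockFrequencyInfo, so treat this purely as a sketch of a plausible candidate.

    // cold_candidate.cpp (hypothetical example)
    //   clang++ -O2 -c cold_candidate.cpp
    // 'report_overflow' has internal linkage, is not variadic, and its address
    // is never taken, so hasChangeableCC() would allow retagging it; its single
    // call site sits in a block that BFI should consider cold.
    #include <cstdio>

    namespace {
    void report_overflow(long value) {              // plausible coldcc candidate
      std::fprintf(stderr, "overflow on %ld\n", value);
    }
    } // namespace

    long accumulate(const long *data, int n) {
      long sum = 0;
      for (int i = 0; i < n; ++i) {
        if (__builtin_expect(__builtin_add_overflow(sum, data[i], &sum), 0))
          report_overflow(data[i]);                 // only call site, cold path
      }
      return sum;
    }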
+
+static bool hasMustTailCallers(Function *F) {
+ for (User *U : F->users()) {
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB) {
+ assert(isa<BlockAddress>(U) &&
+ "Expected either CallBase or BlockAddress");
+ continue;
+ }
+ if (CB->isMustTailCall())
+ return true;
+ }
+ return false;
+}
+
+static bool hasInvokeCallers(Function *F) {
+ for (User *U : F->users())
+ if (isa<InvokeInst>(U))
+ return true;
+ return false;
+}
+
+static void RemovePreallocated(Function *F) {
+ RemoveAttribute(F, Attribute::Preallocated);
+
+ auto *M = F->getParent();
+
+ IRBuilder<> Builder(M->getContext());
+
+ // Cannot modify users() while iterating over it, so make a copy.
+ SmallVector<User *, 4> PreallocatedCalls(F->users());
+ for (User *U : PreallocatedCalls) {
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB)
+ continue;
+
+ assert(
+ !CB->isMustTailCall() &&
+ "Shouldn't call RemotePreallocated() on a musttail preallocated call");
+ // Create copy of call without "preallocated" operand bundle.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB->getOperandBundlesAsDefs(OpBundles);
+ CallBase *PreallocatedSetup = nullptr;
+ for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
+ if (It->getTag() == "preallocated") {
+ PreallocatedSetup = cast<CallBase>(*It->input_begin());
+ OpBundles.erase(It);
+ break;
+ }
+ }
+ assert(PreallocatedSetup && "Did not find preallocated bundle");
+ uint64_t ArgCount =
+ cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();
+
+ assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
+ "Unknown indirect call type");
+ CallBase *NewCB = CallBase::Create(CB, OpBundles, CB);
+ CB->replaceAllUsesWith(NewCB);
+ NewCB->takeName(CB);
+ CB->eraseFromParent();
+
+ Builder.SetInsertPoint(PreallocatedSetup);
+ auto *StackSave =
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
+
+ Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
+ StackSave);
+
+ // Replace @llvm.call.preallocated.arg() with alloca.
+ // Cannot modify users() while iterating over it, so make a copy.
+ // @llvm.call.preallocated.arg() can be called with the same index multiple
+ // times. So for each @llvm.call.preallocated.arg(), we see if we have
+ // already created a Value* for the index, and if not, create an alloca and
+ // bitcast right after the @llvm.call.preallocated.setup() so that it
+ // dominates all uses.
+ SmallVector<Value *, 2> ArgAllocas(ArgCount);
+ SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
+ for (auto *User : PreallocatedArgs) {
+ auto *UseCall = cast<CallBase>(User);
+ assert(UseCall->getCalledFunction()->getIntrinsicID() ==
+ Intrinsic::call_preallocated_arg &&
+ "preallocated token use was not a llvm.call.preallocated.arg");
+ uint64_t AllocArgIndex =
+ cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
+ Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
+ if (!AllocaReplacement) {
+ auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
+ auto *ArgType = UseCall
+ ->getAttribute(AttributeList::FunctionIndex,
+ Attribute::Preallocated)
+ .getValueAsType();
+ auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
+ Builder.SetInsertPoint(InsertBefore);
+ auto *Alloca =
+ Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
+ auto *BitCast = Builder.CreateBitCast(
+ Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
+ ArgAllocas[AllocArgIndex] = BitCast;
+ AllocaReplacement = BitCast;
+ }
+
+ UseCall->replaceAllUsesWith(AllocaReplacement);
+ UseCall->eraseFromParent();
+ }
+ // Remove @llvm.call.preallocated.setup().
+ cast<Instruction>(PreallocatedSetup)->eraseFromParent();
+ }
+}
+
+static bool
+OptimizeFunctions(Module &M,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+
+ bool Changed = false;
+
+ std::vector<Function *> AllCallsCold;
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
+ Function *F = &*FI++;
+ if (hasOnlyColdCalls(*F, GetBFI))
+ AllCallsCold.push_back(F);
+ }
+
+ // Optimize functions.
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
+ Function *F = &*FI++;
+
+    // Don't perform the global opt pass on naked functions; we don't want
+    // fast calling conventions for naked functions.
+ if (F->hasFnAttribute(Attribute::Naked))
+ continue;
+
+ // Functions without names cannot be referenced outside this module.
+ if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
+ F->setLinkage(GlobalValue::InternalLinkage);
+
+ if (deleteIfDead(*F, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ // LLVM's definition of dominance allows instructions that are cyclic
+ // in unreachable blocks, e.g.:
+ // %pat = select i1 %condition, @global, i16* %pat
+ // because any instruction dominates an instruction in a block that's
+ // not reachable from entry.
+ // So, remove unreachable blocks from the function, because a) there's
+ // no point in analyzing them and b) GlobalOpt should otherwise grow
+ // some more complicated logic to break these cycles.
+ // Removing unreachable blocks might invalidate the dominator so we
+ // recalculate it.
+ if (!F->isDeclaration()) {
+ if (removeUnreachableBlocks(*F)) {
+ auto &DT = LookupDomTree(*F);
+ DT.recalculate(*F);
+ Changed = true;
+ }
+ }
+
+ Changed |= processGlobal(*F, GetTLI, LookupDomTree);
+
+ if (!F->hasLocalLinkage())
+ continue;
+
+ // If we have an inalloca parameter that we can safely remove the
+ // inalloca attribute from, do so. This unlocks optimizations that
+ // wouldn't be safe in the presence of inalloca.
+ // FIXME: We should also hoist alloca affected by this to the entry
+ // block if possible.
+ if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
+ !F->hasAddressTaken() && !hasMustTailCallers(F)) {
+ RemoveAttribute(F, Attribute::InAlloca);
+ Changed = true;
+ }
+
+ // FIXME: handle invokes
+ // FIXME: handle musttail
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+ if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
+ !hasInvokeCallers(F)) {
+ RemovePreallocated(F);
+ Changed = true;
+ }
+ continue;
+ }
+
+ if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
+ NumInternalFunc++;
+ TargetTransformInfo &TTI = GetTTI(*F);
+ // Change the calling convention to coldcc if either stress testing is
+ // enabled or the target would like to use coldcc on functions which are
+      // cold at all call sites and the callers contain no other non-coldcc
+ // calls.
+ if (EnableColdCCStressTest ||
+ (TTI.useColdCCForColdCall(*F) &&
+ isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) {
+ F->setCallingConv(CallingConv::Cold);
+ changeCallSitesToColdCC(F);
+ Changed = true;
+ NumColdCC++;
+ }
+ }
+
+ if (hasChangeableCC(F) && !F->isVarArg() &&
+ !F->hasAddressTaken()) {
+ // If this function has a calling convention worth changing, is not a
+ // varargs function, and is only called directly, promote it to use the
+ // Fast calling convention.
+ F->setCallingConv(CallingConv::Fast);
+ ChangeCalleesToFastCall(F);
+ ++NumFastCallFns;
+ Changed = true;
+ }
+
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+ !F->hasAddressTaken()) {
+ // The function is not used by a trampoline intrinsic, so it is safe
+ // to remove the 'nest' attribute.
+ RemoveAttribute(F, Attribute::Nest);
+ ++NumNestRemoved;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
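Similarly, a minimal hypothetical sketch of the ccc -> fastcc promotion performed above: a file-static helper that is only ever called directly, never has its address taken, and takes a fixed argument list can have its calling convention and all of its call sites switched to CallingConv::Fast. In practice the inliner may consume such a tiny helper before GlobalOpt sees it; the sketch only illustrates the conditions checked above.

    // fastcc_candidate.cpp (hypothetical example)
    // 'clamp_index' has local linkage, a fixed argument list, and its address
    // is never stored or compared, so GlobalOpt may retag the function and its
    // direct call sites with the Fast calling convention.
    static int clamp_index(int i, int n) {
      if (i < 0)
        return 0;
      return i < n ? i : n - 1;
    }

    int lookup(const int *table, int n, int i) {
      return table[clamp_index(i, n)];    // direct call; address not taken
    }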
+
+static bool
+OptimizeGlobalVars(Module &M,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+ bool Changed = false;
+
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+ // Global variables without names cannot be referenced outside this module.
+ if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
+ GV->setLinkage(GlobalValue::InternalLinkage);
+ // Simplify the initializer.
+ if (GV->hasInitializer())
+ if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
+ auto &DL = M.getDataLayout();
+ // TLI is not used in the case of a Constant, so use default nullptr
+ // for that optional parameter, since we don't have a Function to
+ // provide GetTLI anyway.
+ Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr);
+ if (New != C)
+ GV->setInitializer(New);
+ }
+
+ if (deleteIfDead(*GV, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ Changed |= processGlobal(*GV, GetTLI, LookupDomTree);
+ }
+ return Changed;
+}
+
+/// Evaluate a piece of a constantexpr store into a global initializer. This
+/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
+/// GEP operands of Addr [0, OpNo) have been stepped into.
+static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
+ ConstantExpr *Addr, unsigned OpNo) {
+ // Base case of the recursion.
+ if (OpNo == Addr->getNumOperands()) {
+ assert(Val->getType() == Init->getType() && "Type mismatch!");
+ return Val;
+ }
+
+ SmallVector<Constant*, 32> Elts;
+ if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
+ // Break up the constant into its elements.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+
+ // Replace the element that we are supposed to.
+ ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
+ unsigned Idx = CU->getZExtValue();
+ assert(Idx < STy->getNumElements() && "Struct index out of range!");
+ Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
+
+ // Return the modified struct.
+ return ConstantStruct::get(STy, Elts);
+ }
+
+ ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
+ uint64_t NumElts;
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
+ NumElts = ATy->getNumElements();
+ else
+ NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();
+
+ // Break up the array into elements.
+ for (uint64_t i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+
+ assert(CI->getZExtValue() < NumElts);
+ Elts[CI->getZExtValue()] =
+ EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
+
+ if (Init->getType()->isArrayTy())
+ return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
+ return ConstantVector::get(Elts);
+}
+
+/// We have decided that Addr (which satisfies the predicate
+/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
+static void CommitValueTo(Constant *Val, Constant *Addr) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ assert(GV->hasInitializer());
+ GV->setInitializer(Val);
+ return;
+ }
+
+ ConstantExpr *CE = cast<ConstantExpr>(Addr);
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
+}
+
+/// Given a map of address -> value, where addresses are expected to be some form
+/// of either a global or a constant GEP, set the initializer for the address to
+/// be the value. This performs mostly the same function as CommitValueTo()
+/// and EvaluateStoreInto() but is optimized to be more efficient for the common
+/// case where the set of addresses are GEPs sharing the same underlying global,
+/// processing the GEPs in batches rather than individually.
+///
+/// To give an example, consider the following C++ code adapted from the clang
+/// regression tests:
+/// struct S {
+/// int n = 10;
+/// int m = 2 * n;
+/// S(int a) : n(a) {}
+/// };
+///
+/// template<typename T>
+/// struct U {
+/// T *r = &q;
+/// T q = 42;
+/// U *p = this;
+/// };
+///
+/// U<S> e;
+///
+/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
+/// the outer struct, while also initializing the 'n' and 'm' members of the
+/// inner 'q' struct. This batch algorithm will simply use the general
+/// CommitValueTo() method to handle the complex nested initialization of the
+/// S struct 'q', before processing the outermost members in a single batch.
+/// Using CommitValueTo() for each member of the outer struct is inefficient
+/// when the struct/array is very large, as we end up creating and destroying
+/// constant arrays for each initialization.
+/// For the above case, we expect the following IR to be generated:
+///
+/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
+/// %struct.S = type { i32, i32 }
+/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
+/// i64 0, i32 1),
+/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
+/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
+/// constant expression, while the other two elements of @e are "simple".
+static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
+ SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
+ SimpleCEs.reserve(Mem.size());
+
+ for (const auto &I : Mem) {
+ if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
+ GVs.push_back(std::make_pair(GV, I.second));
+ } else {
+ ConstantExpr *GEP = cast<ConstantExpr>(I.first);
+ // We don't handle the deeply recursive case using the batch method.
+ if (GEP->getNumOperands() > 3)
+ ComplexCEs.push_back(std::make_pair(GEP, I.second));
+ else
+ SimpleCEs.push_back(std::make_pair(GEP, I.second));
+ }
+ }
+
+ // The algorithm below doesn't handle cases like nested structs, so use the
+ // slower fully general method if we have to.
+ for (auto ComplexCE : ComplexCEs)
+ CommitValueTo(ComplexCE.second, ComplexCE.first);
+
+ for (auto GVPair : GVs) {
+ assert(GVPair.first->hasInitializer());
+ GVPair.first->setInitializer(GVPair.second);
+ }
+
+ if (SimpleCEs.empty())
+ return;
+
+ // We cache a single global's initializer elements in the case where the
+ // subsequent address/val pair uses the same one. This avoids throwing away and
+ // rebuilding the constant struct/vector/array just because one element is
+ // modified at a time.
+ SmallVector<Constant *, 32> Elts;
+ Elts.reserve(SimpleCEs.size());
+ GlobalVariable *CurrentGV = nullptr;
+
+ auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ if (Update) {
+ if (CurrentGV) {
+ assert(CurrentGV && "Expected a GV to commit to!");
+ Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
+ // We have a valid cache that needs to be committed.
+ if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
+ else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
+ else
+ CurrentGV->setInitializer(ConstantVector::get(Elts));
+ }
+ if (CurrentGV == GV)
+ return;
+ // Need to clear and set up cache for new initializer.
+ CurrentGV = GV;
+ Elts.clear();
+ unsigned NumElts;
+ if (auto *STy = dyn_cast<StructType>(Ty))
+ NumElts = STy->getNumElements();
+ else if (auto *ATy = dyn_cast<ArrayType>(Ty))
+ NumElts = ATy->getNumElements();
+ else
+ NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+ }
+ };
+
+ for (auto CEPair : SimpleCEs) {
+ ConstantExpr *GEP = CEPair.first;
+ Constant *Val = CEPair.second;
+
+ GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
+ commitAndSetupCache(GV, GV != CurrentGV);
+ ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
+ Elts[CI->getZExtValue()] = Val;
+ }
+ // The last initializer in the list needs to be committed, others
+ // will be committed on a new initializer being processed.
+ commitAndSetupCache(CurrentGV, true);
+}
+
+/// Evaluate static constructors in the function, if we can. Return true if we
+/// can, false otherwise.
+static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ // Call the function.
+ Evaluator Eval(DL, TLI);
+ Constant *RetValDummy;
+ bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
+ SmallVector<Constant*, 0>());
+
+ if (EvalSuccess) {
+ ++NumCtorsEvaluated;
+
+ // We succeeded at evaluation: commit the result.
+ LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+ << F->getName() << "' to "
+ << Eval.getMutatedMemory().size() << " stores.\n");
+ BatchCommitValueTo(Eval.getMutatedMemory());
+ for (GlobalVariable *GV : Eval.getInvariants())
+ GV->setConstant(true);
+ }
+
+ return EvalSuccess;
+}
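To make the evaluator's job concrete, here is a small, hypothetical C++ example of a dynamic initializer it can typically fold: every store performed by the constructor targets the global being constructed with compile-time-computable values, so the mutated memory can be committed back into the initializer by BatchCommitValueTo() and the entry dropped from llvm.global_ctors. The exact outcome depends on the optimization level and on the Evaluator's limits.

    // static_ctor_eval.cpp (hypothetical example)
    // The constructor only writes i*i into 'table' itself, so the whole
    // dynamic initializer can be simulated and 'table' ends up with a
    // constant aggregate initializer instead of a runtime constructor.
    struct LookupTable {
      int squares[8];
      LookupTable() {
        for (int i = 0; i != 8; ++i)
          squares[i] = i * i;
      }
    };

    LookupTable table;                 // candidate for static evaluation

    int square_of(int i) { return table.squares[i]; }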
+
+static int compareNames(Constant *const *A, Constant *const *B) {
+ Value *AStripped = (*A)->stripPointerCasts();
+ Value *BStripped = (*B)->stripPointerCasts();
+ return AStripped->getName().compare(BStripped->getName());
+}
+
+static void setUsedInitializer(GlobalVariable &V,
+ const SmallPtrSetImpl<GlobalValue *> &Init) {
+ if (Init.empty()) {
+ V.eraseFromParent();
+ return;
+ }
+
+ // Type of pointer to the array of pointers.
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);
+
+ SmallVector<Constant *, 8> UsedArray;
+ for (GlobalValue *GV : Init) {
+ Constant *Cast
+ = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
+ UsedArray.push_back(Cast);
+ }
+ // Sort to get deterministic order.
+ array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames);
+ ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size());
+
+ Module *M = V.getParent();
+ V.removeFromParent();
+ GlobalVariable *NV =
+ new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, UsedArray), "");
+ NV->takeName(&V);
+ NV->setSection("llvm.metadata");
+ delete &V;
+}
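For reference, a hedged example of where the llvm.used array rebuilt above typically comes from: with clang, the GCC-style used attribute normally records the annotated symbol in @llvm.used, an appending-linkage array of i8* kept in the "llvm.metadata" section, which is exactly the shape setUsedInitializer() recreates. The symbol name below is invented.

    // llvm_used_example.cpp (hypothetical example)
    // The 'used' attribute typically adds a pointer to 'build_tag' to
    // @llvm.used, keeping the string alive even though nothing in the
    // module reads it.
    __attribute__((used))
    static const char build_tag[] = "demo-build-2022";

    int main() { return 0; }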
+
+namespace {
+
+/// An easy to access representation of llvm.used and llvm.compiler.used.
+class LLVMUsed {
+ SmallPtrSet<GlobalValue *, 8> Used;
+ SmallPtrSet<GlobalValue *, 8> CompilerUsed;
+ GlobalVariable *UsedV;
+ GlobalVariable *CompilerUsedV;
+
+public:
+ LLVMUsed(Module &M) {
+ UsedV = collectUsedGlobalVariables(M, Used, false);
+ CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
+ }
+
+ using iterator = SmallPtrSet<GlobalValue *, 8>::iterator;
+ using used_iterator_range = iterator_range<iterator>;
+
+ iterator usedBegin() { return Used.begin(); }
+ iterator usedEnd() { return Used.end(); }
+
+ used_iterator_range used() {
+ return used_iterator_range(usedBegin(), usedEnd());
+ }
+
+ iterator compilerUsedBegin() { return CompilerUsed.begin(); }
+ iterator compilerUsedEnd() { return CompilerUsed.end(); }
+
+ used_iterator_range compilerUsed() {
+ return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
+ }
+
+ bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
+
+ bool compilerUsedCount(GlobalValue *GV) const {
+ return CompilerUsed.count(GV);
+ }
+
+ bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
+ bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
+ bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
+
+ bool compilerUsedInsert(GlobalValue *GV) {
+ return CompilerUsed.insert(GV).second;
+ }
+
+ void syncVariablesAndSets() {
+ if (UsedV)
+ setUsedInitializer(*UsedV, Used);
+ if (CompilerUsedV)
+ setUsedInitializer(*CompilerUsedV, CompilerUsed);
+ }
+};
+
+} // end anonymous namespace
+
+static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
+ if (GA.use_empty()) // No use at all.
+ return false;
+
+ assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) &&
+ "We should have removed the duplicated "
+ "element from llvm.compiler.used");
+ if (!GA.hasOneUse())
+ // Strictly more than one use. So at least one is not in llvm.used and
+ // llvm.compiler.used.
+ return true;
+
+ // Exactly one use. Check if it is in llvm.used or llvm.compiler.used.
+ return !U.usedCount(&GA) && !U.compilerUsedCount(&GA);
+}
+
+static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V,
+ const LLVMUsed &U) {
+ unsigned N = 2;
+ assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) &&
+ "We should have removed the duplicated "
+ "element from llvm.compiler.used");
+ if (U.usedCount(&V) || U.compilerUsedCount(&V))
+ ++N;
+ return V.hasNUsesOrMore(N);
+}
+
+static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) {
+ if (!GA.hasLocalLinkage())
+ return true;
+
+ return U.usedCount(&GA) || U.compilerUsedCount(&GA);
+}
+
+static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
+ bool &RenameTarget) {
+ RenameTarget = false;
+ bool Ret = false;
+ if (hasUseOtherThanLLVMUsed(GA, U))
+ Ret = true;
+
+ // If the alias is externally visible, we may still be able to simplify it.
+ if (!mayHaveOtherReferences(GA, U))
+ return Ret;
+
+ // If the aliasee has internal linkage, give it the name and linkage
+ // of the alias, and delete the alias. This turns:
+ // define internal ... @f(...)
+ // @a = alias ... @f
+ // into:
+ // define ... @a(...)
+ Constant *Aliasee = GA.getAliasee();
+ GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+ if (!Target->hasLocalLinkage())
+ return Ret;
+
+ // Do not perform the transform if multiple aliases potentially target the
+ // aliasee. This check also ensures that it is safe to replace the section
+ // and other attributes of the aliasee with those of the alias.
+ if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U))
+ return Ret;
+
+ RenameTarget = true;
+ return true;
+}
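As a concrete, hypothetical instance of the alias pattern described in the comment above, consider GCC-style alias attributes on an ELF toolchain: the externally visible name is an alias for an internal definition, so once the alias's uses are accounted for, OptimizeGlobalAliases() can rename the internal function to the alias's name, copy its linkage and visibility, and drop the alias. Whether the attribute is accepted for an internal aliasee is toolchain-dependent, so treat this as a sketch.

    // alias_resolution.cpp (hypothetical example; ELF, clang or GCC)
    extern "C" {
    // Internal definition: the aliasee with local linkage.
    static void do_work_impl(void) {
      // ... real implementation ...
    }
    // Externally visible alias for the internal definition.
    void do_work(void) __attribute__((alias("do_work_impl")));
    }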
+
+static bool
+OptimizeGlobalAliases(Module &M,
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+ bool Changed = false;
+ LLVMUsed Used(M);
+
+ for (GlobalValue *GV : Used.used())
+ Used.compilerUsedErase(GV);
+
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ GlobalAlias *J = &*I++;
+
+ // Aliases without names cannot be referenced outside this module.
+ if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
+ J->setLinkage(GlobalValue::InternalLinkage);
+
+ if (deleteIfDead(*J, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ // If the alias can change at link time, nothing can be done - bail out.
+ if (J->isInterposable())
+ continue;
+
+ Constant *Aliasee = J->getAliasee();
+ GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
+ // We can't trivially replace the alias with the aliasee if the aliasee is
+ // non-trivial in some way.
+ // TODO: Try to handle non-zero GEPs of local aliasees.
+ if (!Target)
+ continue;
+ Target->removeDeadConstantUsers();
+
+ // Make all users of the alias use the aliasee instead.
+ bool RenameTarget;
+ if (!hasUsesToReplace(*J, Used, RenameTarget))
+ continue;
+
+ J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType()));
+ ++NumAliasesResolved;
+ Changed = true;
+
+ if (RenameTarget) {
+ // Give the aliasee the name, linkage and other attributes of the alias.
+ Target->takeName(&*J);
+ Target->setLinkage(J->getLinkage());
+ Target->setDSOLocal(J->isDSOLocal());
+ Target->setVisibility(J->getVisibility());
+ Target->setDLLStorageClass(J->getDLLStorageClass());
+
+ if (Used.usedErase(&*J))
+ Used.usedInsert(Target);
+
+ if (Used.compilerUsedErase(&*J))
+ Used.compilerUsedInsert(Target);
+ } else if (mayHaveOtherReferences(*J, Used))
+ continue;
+
+ // Delete the alias.
+ M.getAliasList().erase(J);
+ ++NumAliasesRemoved;
+ Changed = true;
+ }
+
+ Used.syncVariablesAndSets();
+
+ return Changed;
+}
+
+static Function *
+FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+  // Hack to get a default TLI before we have an actual Function.
+ auto FuncIter = M.begin();
+ if (FuncIter == M.end())
+ return nullptr;
+ auto *TLI = &GetTLI(*FuncIter);
+
+ LibFunc F = LibFunc_cxa_atexit;
+ if (!TLI->has(F))
+ return nullptr;
+
+ Function *Fn = M.getFunction(TLI->getName(F));
+ if (!Fn)
+ return nullptr;
+
+ // Now get the actual TLI for Fn.
+ TLI = &GetTLI(*Fn);
+
+ // Make sure that the function has the correct prototype.
+ if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
+ return nullptr;
+
+ return Fn;
+}
+
+/// Returns whether the given function is an empty C++ destructor and can
+/// therefore be eliminated.
+/// Note that we assume that other optimization passes have already simplified
+/// the code so we simply check for 'ret'.
+static bool cxxDtorIsEmpty(const Function &Fn) {
+ // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
+ // nounwind, but that doesn't seem worth doing.
+ if (Fn.isDeclaration())
+ return false;
+
+ for (auto &I : Fn.getEntryBlock()) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ if (isa<ReturnInst>(I))
+ return true;
+ break;
+ }
+ return false;
+}
+
+static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
+ /// Itanium C++ ABI p3.3.5:
+ ///
+ /// After constructing a global (or local static) object, that will require
+ /// destruction on exit, a termination function is registered as follows:
+ ///
+ /// extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d );
+ ///
+ /// This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the
+ /// call f(p) when DSO d is unloaded, before all such termination calls
+ /// registered before this one. It returns zero if registration is
+ /// successful, nonzero on failure.
+
+ // This pass will look for calls to __cxa_atexit where the function is trivial
+ // and remove them.
+ bool Changed = false;
+
+ for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end();
+ I != E;) {
+ // We're only interested in calls. Theoretically, we could handle invoke
+ // instructions as well, but neither llvm-gcc nor clang generate invokes
+ // to __cxa_atexit.
+ CallInst *CI = dyn_cast<CallInst>(*I++);
+ if (!CI)
+ continue;
+
+ Function *DtorFn =
+ dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
+ if (!DtorFn || !cxxDtorIsEmpty(*DtorFn))
+ continue;
+
+ // Just remove the call.
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ CI->eraseFromParent();
+
+ ++NumCXXDtorsRemoved;
+
+ Changed |= true;
+ }
+
+ return Changed;
+}
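A hedged C++ example of what this removal targets: a global object whose destructor body is empty (or becomes empty after earlier simplification), so the Itanium-ABI registration, roughly __cxa_atexit(&Session::~Session, &g_session, &__dso_handle), does nothing useful at exit and the call can be deleted. The class name is invented.

    // empty_dtor.cpp (hypothetical example)
    struct Session {
      int id = 0;
      ~Session() {}      // reduces to a bare 'ret'; cxxDtorIsEmpty() accepts it
    };

    Session g_session;    // registered via __cxa_atexit during static init

    int session_id() { return g_session.id; }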
+
+static bool optimizeGlobalsInModule(
+ Module &M, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ SmallPtrSet<const Comdat *, 8> NotDiscardableComdats;
+ bool Changed = false;
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+ NotDiscardableComdats.clear();
+ for (const GlobalVariable &GV : M.globals())
+ if (const Comdat *C = GV.getComdat())
+ if (!GV.isDiscardableIfUnused() || !GV.use_empty())
+ NotDiscardableComdats.insert(C);
+ for (Function &F : M)
+ if (const Comdat *C = F.getComdat())
+ if (!F.isDefTriviallyDead())
+ NotDiscardableComdats.insert(C);
+ for (GlobalAlias &GA : M.aliases())
+ if (const Comdat *C = GA.getComdat())
+ if (!GA.isDiscardableIfUnused() || !GA.use_empty())
+ NotDiscardableComdats.insert(C);
+
+ // Delete functions that are trivially dead, ccc -> fastcc
+ LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree,
+ NotDiscardableComdats);
+
+ // Optimize global_ctors list.
+ LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
+ return EvaluateStaticConstructor(F, DL, &GetTLI(*F));
+ });
+
+ // Optimize non-address-taken globals.
+ LocalChange |=
+ OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats);
+
+ // Resolve aliases, when possible.
+ LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);
+
+ // Try to remove trivial global destructors if they are not removed
+ // already.
+ Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI);
+ if (CXAAtExitFn)
+ LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
+
+ Changed |= LocalChange;
+ }
+
+ // TODO: Move all global ctors functions to the end of the module for code
+ // layout.
+
+ return Changed;
+}
+
+PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &DL = M.getDataLayout();
+ auto &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+struct GlobalOptLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ GlobalOptLegacyPass() : ModulePass(ID) {
+ initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ auto &DL = M.getDataLayout();
+ auto LookupDomTree = [this](Function &F) -> DominatorTree & {
+ return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
+ return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI,
+ LookupDomTree);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char GlobalOptLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+
+ModulePass *llvm::createGlobalOptimizerPass() {
+ return new GlobalOptLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp
index 289099af3a..365b269dc3 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp
@@ -1,196 +1,196 @@
-//===- GlobalSplit.cpp - global variable splitter -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass uses inrange annotations on GEP indices to split globals where
-// beneficial. Clang currently attaches these annotations to references to
-// virtual table globals under the Itanium ABI for the benefit of the
-// whole-program virtual call optimization and control flow integrity passes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/GlobalSplit.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/IPO.h"
-#include <cstdint>
-#include <vector>
-
-using namespace llvm;
-
-static bool splitGlobal(GlobalVariable &GV) {
- // If the address of the global is taken outside of the module, we cannot
- // apply this transformation.
- if (!GV.hasLocalLinkage())
- return false;
-
- // We currently only know how to split ConstantStructs.
- auto *Init = dyn_cast_or_null<ConstantStruct>(GV.getInitializer());
- if (!Init)
- return false;
-
- // Verify that each user of the global is an inrange getelementptr constant.
- // From this it follows that any loads from or stores to that global must use
- // a pointer derived from an inrange getelementptr constant, which is
- // sufficient to allow us to apply the splitting transform.
- for (User *U : GV.users()) {
- if (!isa<Constant>(U))
- return false;
-
- auto *GEP = dyn_cast<GEPOperator>(U);
- if (!GEP || !GEP->getInRangeIndex() || *GEP->getInRangeIndex() != 1 ||
- !isa<ConstantInt>(GEP->getOperand(1)) ||
- !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
- !isa<ConstantInt>(GEP->getOperand(2)))
- return false;
- }
-
- SmallVector<MDNode *, 2> Types;
- GV.getMetadata(LLVMContext::MD_type, Types);
-
- const DataLayout &DL = GV.getParent()->getDataLayout();
- const StructLayout *SL = DL.getStructLayout(Init->getType());
-
- IntegerType *Int32Ty = Type::getInt32Ty(GV.getContext());
-
- std::vector<GlobalVariable *> SplitGlobals(Init->getNumOperands());
- for (unsigned I = 0; I != Init->getNumOperands(); ++I) {
- // Build a global representing this split piece.
- auto *SplitGV =
- new GlobalVariable(*GV.getParent(), Init->getOperand(I)->getType(),
- GV.isConstant(), GlobalValue::PrivateLinkage,
- Init->getOperand(I), GV.getName() + "." + utostr(I));
- SplitGlobals[I] = SplitGV;
-
- unsigned SplitBegin = SL->getElementOffset(I);
- unsigned SplitEnd = (I == Init->getNumOperands() - 1)
- ? SL->getSizeInBytes()
- : SL->getElementOffset(I + 1);
-
- // Rebuild type metadata, adjusting by the split offset.
- // FIXME: See if we can use DW_OP_piece to preserve debug metadata here.
- for (MDNode *Type : Types) {
- uint64_t ByteOffset = cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
- // Type metadata may be attached one byte after the end of the vtable, for
-      // classes without virtual methods in the Itanium ABI. AFAIK, it is never
- // attached to the first byte of a vtable. Subtract one to get the right
- // slice.
- // This is making an assumption that vtable groups are the only kinds of
- // global variables that !type metadata can be attached to, and that they
- // are either Itanium ABI vtable groups or contain a single vtable (i.e.
- // Microsoft ABI vtables).
- uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
- if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
- continue;
- SplitGV->addMetadata(
- LLVMContext::MD_type,
- *MDNode::get(GV.getContext(),
- {ConstantAsMetadata::get(
- ConstantInt::get(Int32Ty, ByteOffset - SplitBegin)),
- Type->getOperand(1)}));
- }
-
- if (GV.hasMetadata(LLVMContext::MD_vcall_visibility))
- SplitGV->setVCallVisibilityMetadata(GV.getVCallVisibility());
- }
-
- for (User *U : GV.users()) {
- auto *GEP = cast<GEPOperator>(U);
- unsigned I = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- if (I >= SplitGlobals.size())
- continue;
-
- SmallVector<Value *, 4> Ops;
- Ops.push_back(ConstantInt::get(Int32Ty, 0));
- for (unsigned I = 3; I != GEP->getNumOperands(); ++I)
- Ops.push_back(GEP->getOperand(I));
-
- auto *NewGEP = ConstantExpr::getGetElementPtr(
- SplitGlobals[I]->getInitializer()->getType(), SplitGlobals[I], Ops,
- GEP->isInBounds());
- GEP->replaceAllUsesWith(NewGEP);
- }
-
- // Finally, remove the original global. Any remaining uses refer to invalid
- // elements of the global, so replace with undef.
- if (!GV.use_empty())
- GV.replaceAllUsesWith(UndefValue::get(GV.getType()));
- GV.eraseFromParent();
- return true;
-}
-
-static bool splitGlobals(Module &M) {
- // First, see if the module uses either of the llvm.type.test or
- // llvm.type.checked.load intrinsics, which indicates that splitting globals
- // may be beneficial.
- Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
- if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
- (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
- return false;
-
- bool Changed = false;
- for (auto I = M.global_begin(); I != M.global_end();) {
- GlobalVariable &GV = *I;
- ++I;
- Changed |= splitGlobal(GV);
- }
- return Changed;
-}
-
-namespace {
-
-struct GlobalSplit : public ModulePass {
- static char ID;
-
- GlobalSplit() : ModulePass(ID) {
- initializeGlobalSplitPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- return splitGlobals(M);
- }
-};
-
-} // end anonymous namespace
-
-char GlobalSplit::ID = 0;
-
-INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
-
-ModulePass *llvm::createGlobalSplitPass() {
- return new GlobalSplit;
-}
-
-PreservedAnalyses GlobalSplitPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!splitGlobals(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+//===- GlobalSplit.cpp - global variable splitter -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass uses inrange annotations on GEP indices to split globals where
+// beneficial. Clang currently attaches these annotations to references to
+// virtual table globals under the Itanium ABI for the benefit of the
+// whole-program virtual call optimization and control flow integrity passes.
+//
+//===----------------------------------------------------------------------===//
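To ground the description above, a heavily hedged C++ sketch of the kind of global this pass splits: a vtable emitted for a polymorphic class when whole-program vtable optimization or CFI is enabled. With flags along the lines of -flto -fvisibility=hidden -fsanitize=cfi-vcall (exact spelling varies by toolchain), clang attaches !type metadata to the vtable and references it through inrange getelementptr constants; if LTO internalization then gives the vtable local linkage, it matches the preconditions splitGlobal() checks. None of this is guaranteed for a particular build, so treat it purely as an illustration.

    // vtable_split.cpp (hypothetical example)
    struct Shape {
      virtual ~Shape() {}
      virtual double area() const { return 0.0; }
    };

    struct Circle final : Shape {
      explicit Circle(double r) : radius(r) {}
      double area() const override { return 3.14159 * radius * radius; }
      double radius;
    };

    // Indirect call through the vtable; with CFI this is preceded by a
    // type test against the !type metadata on the (possibly split) vtable.
    double area_of(const Shape &s) { return s.area(); }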
+
+#include "llvm/Transforms/IPO/GlobalSplit.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/IPO.h"
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+static bool splitGlobal(GlobalVariable &GV) {
+ // If the address of the global is taken outside of the module, we cannot
+ // apply this transformation.
+ if (!GV.hasLocalLinkage())
+ return false;
+
+ // We currently only know how to split ConstantStructs.
+ auto *Init = dyn_cast_or_null<ConstantStruct>(GV.getInitializer());
+ if (!Init)
+ return false;
+
+ // Verify that each user of the global is an inrange getelementptr constant.
+ // From this it follows that any loads from or stores to that global must use
+ // a pointer derived from an inrange getelementptr constant, which is
+ // sufficient to allow us to apply the splitting transform.
+ for (User *U : GV.users()) {
+ if (!isa<Constant>(U))
+ return false;
+
+ auto *GEP = dyn_cast<GEPOperator>(U);
+ if (!GEP || !GEP->getInRangeIndex() || *GEP->getInRangeIndex() != 1 ||
+ !isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+ !isa<ConstantInt>(GEP->getOperand(2)))
+ return false;
+ }
+
+ SmallVector<MDNode *, 2> Types;
+ GV.getMetadata(LLVMContext::MD_type, Types);
+
+ const DataLayout &DL = GV.getParent()->getDataLayout();
+ const StructLayout *SL = DL.getStructLayout(Init->getType());
+
+ IntegerType *Int32Ty = Type::getInt32Ty(GV.getContext());
+
+ std::vector<GlobalVariable *> SplitGlobals(Init->getNumOperands());
+ for (unsigned I = 0; I != Init->getNumOperands(); ++I) {
+ // Build a global representing this split piece.
+ auto *SplitGV =
+ new GlobalVariable(*GV.getParent(), Init->getOperand(I)->getType(),
+ GV.isConstant(), GlobalValue::PrivateLinkage,
+ Init->getOperand(I), GV.getName() + "." + utostr(I));
+ SplitGlobals[I] = SplitGV;
+
+ unsigned SplitBegin = SL->getElementOffset(I);
+ unsigned SplitEnd = (I == Init->getNumOperands() - 1)
+ ? SL->getSizeInBytes()
+ : SL->getElementOffset(I + 1);
+
+ // Rebuild type metadata, adjusting by the split offset.
+ // FIXME: See if we can use DW_OP_piece to preserve debug metadata here.
+ for (MDNode *Type : Types) {
+ uint64_t ByteOffset = cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ // Type metadata may be attached one byte after the end of the vtable, for
+ // classes without virtual methods in Itanium ABI. AFAIK, it is never
+ // attached to the first byte of a vtable. Subtract one to get the right
+ // slice.
+ // This is making an assumption that vtable groups are the only kinds of
+ // global variables that !type metadata can be attached to, and that they
+ // are either Itanium ABI vtable groups or contain a single vtable (i.e.
+ // Microsoft ABI vtables).
+ uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
+ if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
+ continue;
+ SplitGV->addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(GV.getContext(),
+ {ConstantAsMetadata::get(
+ ConstantInt::get(Int32Ty, ByteOffset - SplitBegin)),
+ Type->getOperand(1)}));
+ }
+
+ if (GV.hasMetadata(LLVMContext::MD_vcall_visibility))
+ SplitGV->setVCallVisibilityMetadata(GV.getVCallVisibility());
+ }
+
+ for (User *U : GV.users()) {
+ auto *GEP = cast<GEPOperator>(U);
+ unsigned I = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (I >= SplitGlobals.size())
+ continue;
+
+ SmallVector<Value *, 4> Ops;
+ Ops.push_back(ConstantInt::get(Int32Ty, 0));
+ for (unsigned I = 3; I != GEP->getNumOperands(); ++I)
+ Ops.push_back(GEP->getOperand(I));
+
+ auto *NewGEP = ConstantExpr::getGetElementPtr(
+ SplitGlobals[I]->getInitializer()->getType(), SplitGlobals[I], Ops,
+ GEP->isInBounds());
+ GEP->replaceAllUsesWith(NewGEP);
+ }
+
+ // Finally, remove the original global. Any remaining uses refer to invalid
+ // elements of the global, so replace with undef.
+ if (!GV.use_empty())
+ GV.replaceAllUsesWith(UndefValue::get(GV.getType()));
+ GV.eraseFromParent();
+ return true;
+}
+
+static bool splitGlobals(Module &M) {
+ // First, see if the module uses either of the llvm.type.test or
+ // llvm.type.checked.load intrinsics, which indicates that splitting globals
+ // may be beneficial.
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ bool Changed = false;
+ for (auto I = M.global_begin(); I != M.global_end();) {
+ GlobalVariable &GV = *I;
+ ++I;
+ Changed |= splitGlobal(GV);
+ }
+ return Changed;
+}
+
+namespace {
+
+struct GlobalSplit : public ModulePass {
+ static char ID;
+
+ GlobalSplit() : ModulePass(ID) {
+ initializeGlobalSplitPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return splitGlobals(M);
+ }
+};
+
+} // end anonymous namespace
+
+char GlobalSplit::ID = 0;
+
+INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
+
+ModulePass *llvm::createGlobalSplitPass() {
+ return new GlobalSplit;
+}
+
+PreservedAnalyses GlobalSplitPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!splitGlobals(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
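
For context on the restored file: splitGlobal above rewrites a local ConstantStruct vtable group into one private global per element (named <original>.<index>), and the transform is exposed both as the legacy "globalsplit" pass and via GlobalSplitPass in the new pass manager. A minimal, illustrative driver sketch, not taken from the file above and assuming only stock LLVM 12 new-pass-manager headers, with an invented function name:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"

// Sketch: run just the global splitter over a module, roughly what
// `opt -passes=globalsplit` would do.
static void runGlobalSplitOnly(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;

  // Register the stock analyses; GlobalSplitPass itself only needs the
  // module analysis manager to satisfy the pass-manager plumbing.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::GlobalSplitPass());
  MPM.run(M, MAM);
}

Note that splitGlobals bails out early unless the module actually uses llvm.type.test or llvm.type.checked.load, so running the pass on ordinary IR is a no-op.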
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp
index 35dcaf85db..aa708ee520 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -1,91 +1,91 @@
-//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// The goal of hot/cold splitting is to improve the memory locality of code.
-/// The splitting pass does this by identifying cold blocks and moving them into
-/// separate functions.
-///
-/// When the splitting pass finds a cold block (referred to as "the sink"), it
-/// grows a maximal cold region around that block. The maximal region contains
-/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
-/// cold as the sink. Once a region is found, it's split out of the original
-/// function provided it's profitable to do so.
-///
-/// [*] In practice, there is some added complexity because some blocks are not
-/// safe to extract.
-///
-/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
-/// TODO: Reorder outlined functions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/HotColdSplitting.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
+//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// The goal of hot/cold splitting is to improve the memory locality of code.
+/// The splitting pass does this by identifying cold blocks and moving them into
+/// separate functions.
+///
+/// When the splitting pass finds a cold block (referred to as "the sink"), it
+/// grows a maximal cold region around that block. The maximal region contains
+/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
+/// cold as the sink. Once a region is found, it's split out of the original
+/// function provided it's profitable to do so.
+///
+/// [*] In practice, there is some added complexity because some blocks are not
+/// safe to extract.
+///
+/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
+/// TODO: Reorder outlined functions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/HotColdSplitting.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
#include <limits>
-#include <cassert>
+#include <cassert>
#include <string>
-
-#define DEBUG_TYPE "hotcoldsplit"
-
-STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
-STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
-
-using namespace llvm;
-
+
+#define DEBUG_TYPE "hotcoldsplit"
+
+STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
+STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
+
+using namespace llvm;
+
static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
cl::init(true), cl::Hidden);
-
-static cl::opt<int>
- SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
- cl::desc("Base penalty for splitting cold code (as a "
- "multiple of TCC_Basic)"));
-
+
+static cl::opt<int>
+ SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
+ cl::desc("Base penalty for splitting cold code (as a "
+ "multiple of TCC_Basic)"));
+
static cl::opt<bool> EnableColdSection(
"enable-cold-section", cl::init(false), cl::Hidden,
cl::desc("Enable placement of extracted cold functions"
@@ -101,187 +101,187 @@ static cl::opt<int> MaxParametersForSplit(
"hotcoldsplit-max-params", cl::init(4), cl::Hidden,
cl::desc("Maximum number of parameters for a split function"));
-namespace {
-// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
-// this function unless you modify the MBB version as well.
-//
-/// A no successor, non-return block probably ends in unreachable and is cold.
-/// Also consider a block that ends in an indirect branch to be a return block,
-/// since many targets use plain indirect branches to return.
-bool blockEndsInUnreachable(const BasicBlock &BB) {
- if (!succ_empty(&BB))
- return false;
- if (BB.empty())
- return true;
- const Instruction *I = BB.getTerminator();
- return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
-}
-
-bool unlikelyExecuted(BasicBlock &BB) {
- // Exception handling blocks are unlikely executed.
- if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
- return true;
-
- // The block is cold if it calls/invokes a cold function. However, do not
- // mark sanitizer traps as cold.
- for (Instruction &I : BB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
- return true;
-
- // The block is cold if it has an unreachable terminator, unless it's
- // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
- if (blockEndsInUnreachable(BB)) {
- if (auto *CI =
- dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
- if (CI->hasFnAttr(Attribute::NoReturn))
- return false;
- return true;
- }
-
- return false;
-}
-
-/// Check whether it's safe to outline \p BB.
-static bool mayExtractBlock(const BasicBlock &BB) {
- // EH pads are unsafe to outline because doing so breaks EH type tables. It
- // follows that invoke instructions cannot be extracted, because CodeExtractor
- // requires unwind destinations to be within the extraction region.
- //
- // Resumes that are not reachable from a cleanup landing pad are considered to
- // be unreachable. It’s not safe to split them out either.
- auto Term = BB.getTerminator();
- return !BB.hasAddressTaken() && !BB.isEHPad() && !isa<InvokeInst>(Term) &&
- !isa<ResumeInst>(Term);
-}
-
-/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
-/// If \p UpdateEntryCount is true (set when this is a new split function and
-/// module has profile data), set entry count to 0 to ensure treated as cold.
-/// Return true if the function is changed.
-static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
- assert(!F.hasOptNone() && "Can't mark this cold");
- bool Changed = false;
- if (!F.hasFnAttribute(Attribute::Cold)) {
- F.addFnAttr(Attribute::Cold);
- Changed = true;
- }
- if (!F.hasFnAttribute(Attribute::MinSize)) {
- F.addFnAttr(Attribute::MinSize);
- Changed = true;
- }
- if (UpdateEntryCount) {
- // Set the entry count to 0 to ensure it is placed in the unlikely text
- // section when function sections are enabled.
- F.setEntryCount(0);
- Changed = true;
- }
-
- return Changed;
-}
-
-class HotColdSplittingLegacyPass : public ModulePass {
-public:
- static char ID;
- HotColdSplittingLegacyPass() : ModulePass(ID) {
- initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addUsedIfAvailable<AssumptionCacheTracker>();
- }
-
- bool runOnModule(Module &M) override;
-};
-
-} // end anonymous namespace
-
-/// Check whether \p F is inherently cold.
-bool HotColdSplitting::isFunctionCold(const Function &F) const {
- if (F.hasFnAttribute(Attribute::Cold))
- return true;
-
- if (F.getCallingConv() == CallingConv::Cold)
- return true;
-
- if (PSI->isFunctionEntryCold(&F))
- return true;
-
- return false;
-}
-
-// Returns false if the function should not be considered for hot-cold split
-// optimization.
-bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
- if (F.hasFnAttribute(Attribute::AlwaysInline))
- return false;
-
- if (F.hasFnAttribute(Attribute::NoInline))
- return false;
-
- // A function marked `noreturn` may contain unreachable terminators: these
- // should not be considered cold, as the function may be a trampoline.
- if (F.hasFnAttribute(Attribute::NoReturn))
- return false;
-
- if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
- F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
- F.hasFnAttribute(Attribute::SanitizeThread) ||
- F.hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- return true;
-}
-
-/// Get the benefit score of outlining \p Region.
+namespace {
+// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
+// this function unless you modify the MBB version as well.
+//
+/// A no successor, non-return block probably ends in unreachable and is cold.
+/// Also consider a block that ends in an indirect branch to be a return block,
+/// since many targets use plain indirect branches to return.
+bool blockEndsInUnreachable(const BasicBlock &BB) {
+ if (!succ_empty(&BB))
+ return false;
+ if (BB.empty())
+ return true;
+ const Instruction *I = BB.getTerminator();
+ return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
+}
+
+bool unlikelyExecuted(BasicBlock &BB) {
+ // Exception handling blocks are unlikely executed.
+ if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
+ return true;
+
+ // The block is cold if it calls/invokes a cold function. However, do not
+ // mark sanitizer traps as cold.
+ for (Instruction &I : BB)
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
+ return true;
+
+ // The block is cold if it has an unreachable terminator, unless it's
+ // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
+ if (blockEndsInUnreachable(BB)) {
+ if (auto *CI =
+ dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
+ if (CI->hasFnAttr(Attribute::NoReturn))
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+/// Check whether it's safe to outline \p BB.
+static bool mayExtractBlock(const BasicBlock &BB) {
+ // EH pads are unsafe to outline because doing so breaks EH type tables. It
+ // follows that invoke instructions cannot be extracted, because CodeExtractor
+ // requires unwind destinations to be within the extraction region.
+ //
+ // Resumes that are not reachable from a cleanup landing pad are considered to
+ // be unreachable. It’s not safe to split them out either.
+ auto Term = BB.getTerminator();
+ return !BB.hasAddressTaken() && !BB.isEHPad() && !isa<InvokeInst>(Term) &&
+ !isa<ResumeInst>(Term);
+}
+
+/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
+/// If \p UpdateEntryCount is true (set when this is a new split function and
+/// module has profile data), set entry count to 0 to ensure treated as cold.
+/// Return true if the function is changed.
+static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
+ assert(!F.hasOptNone() && "Can't mark this cold");
+ bool Changed = false;
+ if (!F.hasFnAttribute(Attribute::Cold)) {
+ F.addFnAttr(Attribute::Cold);
+ Changed = true;
+ }
+ if (!F.hasFnAttribute(Attribute::MinSize)) {
+ F.addFnAttr(Attribute::MinSize);
+ Changed = true;
+ }
+ if (UpdateEntryCount) {
+ // Set the entry count to 0 to ensure it is placed in the unlikely text
+ // section when function sections are enabled.
+ F.setEntryCount(0);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+class HotColdSplittingLegacyPass : public ModulePass {
+public:
+ static char ID;
+ HotColdSplittingLegacyPass() : ModulePass(ID) {
+ initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addUsedIfAvailable<AssumptionCacheTracker>();
+ }
+
+ bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+/// Check whether \p F is inherently cold.
+bool HotColdSplitting::isFunctionCold(const Function &F) const {
+ if (F.hasFnAttribute(Attribute::Cold))
+ return true;
+
+ if (F.getCallingConv() == CallingConv::Cold)
+ return true;
+
+ if (PSI->isFunctionEntryCold(&F))
+ return true;
+
+ return false;
+}
+
+// Returns false if the function should not be considered for hot-cold split
+// optimization.
+bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
+ if (F.hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
+ if (F.hasFnAttribute(Attribute::NoInline))
+ return false;
+
+ // A function marked `noreturn` may contain unreachable terminators: these
+ // should not be considered cold, as the function may be a trampoline.
+ if (F.hasFnAttribute(Attribute::NoReturn))
+ return false;
+
+ if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeThread) ||
+ F.hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ return true;
+}
+
+/// Get the benefit score of outlining \p Region.
static InstructionCost getOutliningBenefit(ArrayRef<BasicBlock *> Region,
TargetTransformInfo &TTI) {
- // Sum up the code size costs of non-terminator instructions. Tight coupling
- // with \ref getOutliningPenalty is needed to model the costs of terminators.
+ // Sum up the code size costs of non-terminator instructions. Tight coupling
+ // with \ref getOutliningPenalty is needed to model the costs of terminators.
InstructionCost Benefit = 0;
- for (BasicBlock *BB : Region)
- for (Instruction &I : BB->instructionsWithoutDebug())
- if (&I != BB->getTerminator())
- Benefit +=
- TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
-
- return Benefit;
-}
-
-/// Get the penalty score for outlining \p Region.
-static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
- unsigned NumInputs, unsigned NumOutputs) {
- int Penalty = SplittingThreshold;
- LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
-
- // If the splitting threshold is set at or below zero, skip the usual
- // profitability check.
- if (SplittingThreshold <= 0)
- return Penalty;
-
- // Find the number of distinct exit blocks for the region. Use a conservative
- // check to determine whether control returns from the region.
- bool NoBlocksReturn = true;
- SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
- for (BasicBlock *BB : Region) {
- // If a block has no successors, only assume it does not return if it's
- // unreachable.
- if (succ_empty(BB)) {
- NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
- continue;
- }
-
- for (BasicBlock *SuccBB : successors(BB)) {
+ for (BasicBlock *BB : Region)
+ for (Instruction &I : BB->instructionsWithoutDebug())
+ if (&I != BB->getTerminator())
+ Benefit +=
+ TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+
+ return Benefit;
+}
+
+/// Get the penalty score for outlining \p Region.
+static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
+ unsigned NumInputs, unsigned NumOutputs) {
+ int Penalty = SplittingThreshold;
+ LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
+
+ // If the splitting threshold is set at or below zero, skip the usual
+ // profitability check.
+ if (SplittingThreshold <= 0)
+ return Penalty;
+
+ // Find the number of distinct exit blocks for the region. Use a conservative
+ // check to determine whether control returns from the region.
+ bool NoBlocksReturn = true;
+ SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
+ for (BasicBlock *BB : Region) {
+ // If a block has no successors, only assume it does not return if it's
+ // unreachable.
+ if (succ_empty(BB)) {
+ NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
+ continue;
+ }
+
+ for (BasicBlock *SuccBB : successors(BB)) {
if (!is_contained(Region, SuccBB)) {
- NoBlocksReturn = false;
- SuccsOutsideRegion.insert(SuccBB);
- }
- }
- }
-
+ NoBlocksReturn = false;
+ SuccsOutsideRegion.insert(SuccBB);
+ }
+ }
+ }
+
// Count the number of phis in exit blocks with >= 2 incoming values from the
// outlining region. These phis are split (\ref severSplitPHINodesOfExits),
// and new outputs are created to supply the split phis. CodeExtractor can't
@@ -324,473 +324,473 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
- // Apply a `noreturn` bonus.
- if (NoBlocksReturn) {
- LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
- << " non-returning terminators\n");
- Penalty -= Region.size();
- }
-
- // Apply a penalty for having more than one successor outside of the region.
- // This penalty accounts for the switch needed in the caller.
+ // Apply a `noreturn` bonus.
+ if (NoBlocksReturn) {
+ LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
+ << " non-returning terminators\n");
+ Penalty -= Region.size();
+ }
+
+ // Apply a penalty for having more than one successor outside of the region.
+ // This penalty accounts for the switch needed in the caller.
if (SuccsOutsideRegion.size() > 1) {
- LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
- << " non-region successors\n");
- Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
- }
-
- return Penalty;
-}
-
-Function *HotColdSplitting::extractColdRegion(
- const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
- DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
- OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
- assert(!Region.empty());
-
- // TODO: Pass BFI and BPI to update profile information.
- CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
- /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
- /* AllowAlloca */ false,
- /* Suffix */ "cold." + std::to_string(Count));
-
- // Perform a simple cost/benefit analysis to decide whether or not to permit
- // splitting.
- SetVector<Value *> Inputs, Outputs, Sinks;
- CE.findInputsOutputs(Inputs, Outputs, Sinks);
+ LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
+ << " non-region successors\n");
+ Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
+ }
+
+ return Penalty;
+}
+
+Function *HotColdSplitting::extractColdRegion(
+ const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
+ DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
+ OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
+ assert(!Region.empty());
+
+ // TODO: Pass BFI and BPI to update profile information.
+ CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
+ /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
+ /* AllowAlloca */ false,
+ /* Suffix */ "cold." + std::to_string(Count));
+
+ // Perform a simple cost/benefit analysis to decide whether or not to permit
+ // splitting.
+ SetVector<Value *> Inputs, Outputs, Sinks;
+ CE.findInputsOutputs(Inputs, Outputs, Sinks);
InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
- int OutliningPenalty =
- getOutliningPenalty(Region, Inputs.size(), Outputs.size());
- LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
- << ", penalty = " << OutliningPenalty << "\n");
+ int OutliningPenalty =
+ getOutliningPenalty(Region, Inputs.size(), Outputs.size());
+ LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
+ << ", penalty = " << OutliningPenalty << "\n");
if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
- return nullptr;
-
- Function *OrigF = Region[0]->getParent();
- if (Function *OutF = CE.extractCodeRegion(CEAC)) {
- User *U = *OutF->user_begin();
- CallInst *CI = cast<CallInst>(U);
- NumColdRegionsOutlined++;
- if (TTI.useColdCCForColdCall(*OutF)) {
- OutF->setCallingConv(CallingConv::Cold);
- CI->setCallingConv(CallingConv::Cold);
- }
- CI->setIsNoInline();
-
+ return nullptr;
+
+ Function *OrigF = Region[0]->getParent();
+ if (Function *OutF = CE.extractCodeRegion(CEAC)) {
+ User *U = *OutF->user_begin();
+ CallInst *CI = cast<CallInst>(U);
+ NumColdRegionsOutlined++;
+ if (TTI.useColdCCForColdCall(*OutF)) {
+ OutF->setCallingConv(CallingConv::Cold);
+ CI->setCallingConv(CallingConv::Cold);
+ }
+ CI->setIsNoInline();
+
if (EnableColdSection)
OutF->setSection(ColdSectionName);
else {
if (OrigF->hasSection())
OutF->setSection(OrigF->getSection());
}
-
- markFunctionCold(*OutF, BFI != nullptr);
-
- LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
- &*Region[0]->begin())
- << ore::NV("Original", OrigF) << " split cold code into "
- << ore::NV("Split", OutF);
- });
- return OutF;
- }
-
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &*Region[0]->begin())
- << "Failed to extract region at block "
- << ore::NV("Block", Region.front());
- });
- return nullptr;
-}
-
-/// A pair of (basic block, score).
-using BlockTy = std::pair<BasicBlock *, unsigned>;
-
-namespace {
-/// A maximal outlining region. This contains all blocks post-dominated by a
-/// sink block, the sink block itself, and all blocks dominated by the sink.
-/// If sink-predecessors and sink-successors cannot be extracted in one region,
-/// the static constructor returns a list of suitable extraction regions.
-class OutliningRegion {
- /// A list of (block, score) pairs. A block's score is non-zero iff it's a
- /// viable sub-region entry point. Blocks with higher scores are better entry
- /// points (i.e. they are more distant ancestors of the sink block).
- SmallVector<BlockTy, 0> Blocks = {};
-
- /// The suggested entry point into the region. If the region has multiple
- /// entry points, all blocks within the region may not be reachable from this
- /// entry point.
- BasicBlock *SuggestedEntryPoint = nullptr;
-
- /// Whether the entire function is cold.
- bool EntireFunctionCold = false;
-
- /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
- static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
- return mayExtractBlock(BB) ? Score : 0;
- }
-
- /// These scores should be lower than the score for predecessor blocks,
- /// because regions starting at predecessor blocks are typically larger.
- static constexpr unsigned ScoreForSuccBlock = 1;
- static constexpr unsigned ScoreForSinkBlock = 1;
-
- OutliningRegion(const OutliningRegion &) = delete;
- OutliningRegion &operator=(const OutliningRegion &) = delete;
-
-public:
- OutliningRegion() = default;
- OutliningRegion(OutliningRegion &&) = default;
- OutliningRegion &operator=(OutliningRegion &&) = default;
-
- static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
- const DominatorTree &DT,
- const PostDominatorTree &PDT) {
- std::vector<OutliningRegion> Regions;
- SmallPtrSet<BasicBlock *, 4> RegionBlocks;
-
- Regions.emplace_back();
- OutliningRegion *ColdRegion = &Regions.back();
-
- auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
- RegionBlocks.insert(BB);
- ColdRegion->Blocks.emplace_back(BB, Score);
- };
-
- // The ancestor farthest-away from SinkBB, and also post-dominated by it.
- unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
- ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
- unsigned BestScore = SinkScore;
-
- // Visit SinkBB's ancestors using inverse DFS.
- auto PredIt = ++idf_begin(&SinkBB);
- auto PredEnd = idf_end(&SinkBB);
- while (PredIt != PredEnd) {
- BasicBlock &PredBB = **PredIt;
- bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
-
- // If the predecessor is cold and has no predecessors, the entire
- // function must be cold.
- if (SinkPostDom && pred_empty(&PredBB)) {
- ColdRegion->EntireFunctionCold = true;
- return Regions;
- }
-
- // If SinkBB does not post-dominate a predecessor, do not mark the
- // predecessor (or any of its predecessors) cold.
- if (!SinkPostDom || !mayExtractBlock(PredBB)) {
- PredIt.skipChildren();
- continue;
- }
-
- // Keep track of the post-dominated ancestor farthest away from the sink.
- // The path length is always >= 2, ensuring that predecessor blocks are
- // considered as entry points before the sink block.
- unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
- if (PredScore > BestScore) {
- ColdRegion->SuggestedEntryPoint = &PredBB;
- BestScore = PredScore;
- }
-
- addBlockToRegion(&PredBB, PredScore);
- ++PredIt;
- }
-
- // If the sink can be added to the cold region, do so. It's considered as
- // an entry point before any sink-successor blocks.
- //
- // Otherwise, split cold sink-successor blocks using a separate region.
- // This satisfies the requirement that all extraction blocks other than the
- // first have predecessors within the extraction region.
- if (mayExtractBlock(SinkBB)) {
- addBlockToRegion(&SinkBB, SinkScore);
- if (pred_empty(&SinkBB)) {
- ColdRegion->EntireFunctionCold = true;
- return Regions;
- }
- } else {
- Regions.emplace_back();
- ColdRegion = &Regions.back();
- BestScore = 0;
- }
-
- // Find all successors of SinkBB dominated by SinkBB using DFS.
- auto SuccIt = ++df_begin(&SinkBB);
- auto SuccEnd = df_end(&SinkBB);
- while (SuccIt != SuccEnd) {
- BasicBlock &SuccBB = **SuccIt;
- bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
-
- // Don't allow the backwards & forwards DFSes to mark the same block.
- bool DuplicateBlock = RegionBlocks.count(&SuccBB);
-
- // If SinkBB does not dominate a successor, do not mark the successor (or
- // any of its successors) cold.
- if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
- SuccIt.skipChildren();
- continue;
- }
-
- unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
- if (SuccScore > BestScore) {
- ColdRegion->SuggestedEntryPoint = &SuccBB;
- BestScore = SuccScore;
- }
-
- addBlockToRegion(&SuccBB, SuccScore);
- ++SuccIt;
- }
-
- return Regions;
- }
-
- /// Whether this region has nothing to extract.
- bool empty() const { return !SuggestedEntryPoint; }
-
- /// The blocks in this region.
- ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
-
- /// Whether the entire function containing this region is cold.
- bool isEntireFunctionCold() const { return EntireFunctionCold; }
-
- /// Remove a sub-region from this region and return it as a block sequence.
- BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
- assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
-
- // Remove blocks dominated by the suggested entry point from this region.
- // During the removal, identify the next best entry point into the region.
- // Ensure that the first extracted block is the suggested entry point.
- BlockSequence SubRegion = {SuggestedEntryPoint};
- BasicBlock *NextEntryPoint = nullptr;
- unsigned NextScore = 0;
- auto RegionEndIt = Blocks.end();
- auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
- BasicBlock *BB = Block.first;
- unsigned Score = Block.second;
- bool InSubRegion =
- BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
- if (!InSubRegion && Score > NextScore) {
- NextEntryPoint = BB;
- NextScore = Score;
- }
- if (InSubRegion && BB != SuggestedEntryPoint)
- SubRegion.push_back(BB);
- return InSubRegion;
- });
- Blocks.erase(RegionStartIt, RegionEndIt);
-
- // Update the suggested entry point.
- SuggestedEntryPoint = NextEntryPoint;
-
- return SubRegion;
- }
-};
-} // namespace
-
-bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
- bool Changed = false;
-
- // The set of cold blocks.
- SmallPtrSet<BasicBlock *, 4> ColdBlocks;
-
- // The worklist of non-intersecting regions left to outline.
- SmallVector<OutliningRegion, 2> OutliningWorklist;
-
- // Set up an RPO traversal. Experimentally, this performs better (outlines
- // more) than a PO traversal, because we prevent region overlap by keeping
- // the first region to contain a block.
- ReversePostOrderTraversal<Function *> RPOT(&F);
-
- // Calculate domtrees lazily. This reduces compile-time significantly.
- std::unique_ptr<DominatorTree> DT;
- std::unique_ptr<PostDominatorTree> PDT;
-
- // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
- // reduces compile-time significantly. TODO: When we *do* use BFI, we should
- // be able to salvage its domtrees instead of recomputing them.
- BlockFrequencyInfo *BFI = nullptr;
- if (HasProfileSummary)
- BFI = GetBFI(F);
-
- TargetTransformInfo &TTI = GetTTI(F);
- OptimizationRemarkEmitter &ORE = (*GetORE)(F);
- AssumptionCache *AC = LookupAC(F);
-
- // Find all cold regions.
- for (BasicBlock *BB : RPOT) {
- // This block is already part of some outlining region.
- if (ColdBlocks.count(BB))
- continue;
-
- bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
+
+ markFunctionCold(*OutF, BFI != nullptr);
+
+ LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
+ &*Region[0]->begin())
+ << ore::NV("Original", OrigF) << " split cold code into "
+ << ore::NV("Split", OutF);
+ });
+ return OutF;
+ }
+
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &*Region[0]->begin())
+ << "Failed to extract region at block "
+ << ore::NV("Block", Region.front());
+ });
+ return nullptr;
+}
+
+/// A pair of (basic block, score).
+using BlockTy = std::pair<BasicBlock *, unsigned>;
+
+namespace {
+/// A maximal outlining region. This contains all blocks post-dominated by a
+/// sink block, the sink block itself, and all blocks dominated by the sink.
+/// If sink-predecessors and sink-successors cannot be extracted in one region,
+/// the static constructor returns a list of suitable extraction regions.
+class OutliningRegion {
+ /// A list of (block, score) pairs. A block's score is non-zero iff it's a
+ /// viable sub-region entry point. Blocks with higher scores are better entry
+ /// points (i.e. they are more distant ancestors of the sink block).
+ SmallVector<BlockTy, 0> Blocks = {};
+
+ /// The suggested entry point into the region. If the region has multiple
+ /// entry points, all blocks within the region may not be reachable from this
+ /// entry point.
+ BasicBlock *SuggestedEntryPoint = nullptr;
+
+ /// Whether the entire function is cold.
+ bool EntireFunctionCold = false;
+
+ /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
+ static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
+ return mayExtractBlock(BB) ? Score : 0;
+ }
+
+ /// These scores should be lower than the score for predecessor blocks,
+ /// because regions starting at predecessor blocks are typically larger.
+ static constexpr unsigned ScoreForSuccBlock = 1;
+ static constexpr unsigned ScoreForSinkBlock = 1;
+
+ OutliningRegion(const OutliningRegion &) = delete;
+ OutliningRegion &operator=(const OutliningRegion &) = delete;
+
+public:
+ OutliningRegion() = default;
+ OutliningRegion(OutliningRegion &&) = default;
+ OutliningRegion &operator=(OutliningRegion &&) = default;
+
+ static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT) {
+ std::vector<OutliningRegion> Regions;
+ SmallPtrSet<BasicBlock *, 4> RegionBlocks;
+
+ Regions.emplace_back();
+ OutliningRegion *ColdRegion = &Regions.back();
+
+ auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
+ RegionBlocks.insert(BB);
+ ColdRegion->Blocks.emplace_back(BB, Score);
+ };
+
+ // The ancestor farthest-away from SinkBB, and also post-dominated by it.
+ unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
+ ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
+ unsigned BestScore = SinkScore;
+
+ // Visit SinkBB's ancestors using inverse DFS.
+ auto PredIt = ++idf_begin(&SinkBB);
+ auto PredEnd = idf_end(&SinkBB);
+ while (PredIt != PredEnd) {
+ BasicBlock &PredBB = **PredIt;
+ bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
+
+ // If the predecessor is cold and has no predecessors, the entire
+ // function must be cold.
+ if (SinkPostDom && pred_empty(&PredBB)) {
+ ColdRegion->EntireFunctionCold = true;
+ return Regions;
+ }
+
+ // If SinkBB does not post-dominate a predecessor, do not mark the
+ // predecessor (or any of its predecessors) cold.
+ if (!SinkPostDom || !mayExtractBlock(PredBB)) {
+ PredIt.skipChildren();
+ continue;
+ }
+
+ // Keep track of the post-dominated ancestor farthest away from the sink.
+ // The path length is always >= 2, ensuring that predecessor blocks are
+ // considered as entry points before the sink block.
+ unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
+ if (PredScore > BestScore) {
+ ColdRegion->SuggestedEntryPoint = &PredBB;
+ BestScore = PredScore;
+ }
+
+ addBlockToRegion(&PredBB, PredScore);
+ ++PredIt;
+ }
+
+ // If the sink can be added to the cold region, do so. It's considered as
+ // an entry point before any sink-successor blocks.
+ //
+ // Otherwise, split cold sink-successor blocks using a separate region.
+ // This satisfies the requirement that all extraction blocks other than the
+ // first have predecessors within the extraction region.
+ if (mayExtractBlock(SinkBB)) {
+ addBlockToRegion(&SinkBB, SinkScore);
+ if (pred_empty(&SinkBB)) {
+ ColdRegion->EntireFunctionCold = true;
+ return Regions;
+ }
+ } else {
+ Regions.emplace_back();
+ ColdRegion = &Regions.back();
+ BestScore = 0;
+ }
+
+ // Find all successors of SinkBB dominated by SinkBB using DFS.
+ auto SuccIt = ++df_begin(&SinkBB);
+ auto SuccEnd = df_end(&SinkBB);
+ while (SuccIt != SuccEnd) {
+ BasicBlock &SuccBB = **SuccIt;
+ bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
+
+ // Don't allow the backwards & forwards DFSes to mark the same block.
+ bool DuplicateBlock = RegionBlocks.count(&SuccBB);
+
+ // If SinkBB does not dominate a successor, do not mark the successor (or
+ // any of its successors) cold.
+ if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
+ SuccIt.skipChildren();
+ continue;
+ }
+
+ unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
+ if (SuccScore > BestScore) {
+ ColdRegion->SuggestedEntryPoint = &SuccBB;
+ BestScore = SuccScore;
+ }
+
+ addBlockToRegion(&SuccBB, SuccScore);
+ ++SuccIt;
+ }
+
+ return Regions;
+ }
+
+ /// Whether this region has nothing to extract.
+ bool empty() const { return !SuggestedEntryPoint; }
+
+ /// The blocks in this region.
+ ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
+
+ /// Whether the entire function containing this region is cold.
+ bool isEntireFunctionCold() const { return EntireFunctionCold; }
+
+ /// Remove a sub-region from this region and return it as a block sequence.
+ BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
+ assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
+
+ // Remove blocks dominated by the suggested entry point from this region.
+ // During the removal, identify the next best entry point into the region.
+ // Ensure that the first extracted block is the suggested entry point.
+ BlockSequence SubRegion = {SuggestedEntryPoint};
+ BasicBlock *NextEntryPoint = nullptr;
+ unsigned NextScore = 0;
+ auto RegionEndIt = Blocks.end();
+ auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
+ BasicBlock *BB = Block.first;
+ unsigned Score = Block.second;
+ bool InSubRegion =
+ BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
+ if (!InSubRegion && Score > NextScore) {
+ NextEntryPoint = BB;
+ NextScore = Score;
+ }
+ if (InSubRegion && BB != SuggestedEntryPoint)
+ SubRegion.push_back(BB);
+ return InSubRegion;
+ });
+ Blocks.erase(RegionStartIt, RegionEndIt);
+
+ // Update the suggested entry point.
+ SuggestedEntryPoint = NextEntryPoint;
+
+ return SubRegion;
+ }
+};
+} // namespace
+
+bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
+ bool Changed = false;
+
+ // The set of cold blocks.
+ SmallPtrSet<BasicBlock *, 4> ColdBlocks;
+
+ // The worklist of non-intersecting regions left to outline.
+ SmallVector<OutliningRegion, 2> OutliningWorklist;
+
+ // Set up an RPO traversal. Experimentally, this performs better (outlines
+ // more) than a PO traversal, because we prevent region overlap by keeping
+ // the first region to contain a block.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // Calculate domtrees lazily. This reduces compile-time significantly.
+ std::unique_ptr<DominatorTree> DT;
+ std::unique_ptr<PostDominatorTree> PDT;
+
+ // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
+ // reduces compile-time significantly. TODO: When we *do* use BFI, we should
+ // be able to salvage its domtrees instead of recomputing them.
+ BlockFrequencyInfo *BFI = nullptr;
+ if (HasProfileSummary)
+ BFI = GetBFI(F);
+
+ TargetTransformInfo &TTI = GetTTI(F);
+ OptimizationRemarkEmitter &ORE = (*GetORE)(F);
+ AssumptionCache *AC = LookupAC(F);
+
+ // Find all cold regions.
+ for (BasicBlock *BB : RPOT) {
+ // This block is already part of some outlining region.
+ if (ColdBlocks.count(BB))
+ continue;
+
+ bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
(EnableStaticAnalysis && unlikelyExecuted(*BB));
- if (!Cold)
- continue;
-
- LLVM_DEBUG({
- dbgs() << "Found a cold block:\n";
- BB->dump();
- });
-
- if (!DT)
- DT = std::make_unique<DominatorTree>(F);
- if (!PDT)
- PDT = std::make_unique<PostDominatorTree>(F);
-
- auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
- for (OutliningRegion &Region : Regions) {
- if (Region.empty())
- continue;
-
- if (Region.isEntireFunctionCold()) {
- LLVM_DEBUG(dbgs() << "Entire function is cold\n");
- return markFunctionCold(F);
- }
-
- // If this outlining region intersects with another, drop the new region.
- //
- // TODO: It's theoretically possible to outline more by only keeping the
- // largest region which contains a block, but the extra bookkeeping to do
- // this is tricky/expensive.
- bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
- return !ColdBlocks.insert(Block.first).second;
- });
- if (RegionsOverlap)
- continue;
-
- OutliningWorklist.emplace_back(std::move(Region));
- ++NumColdRegionsFound;
- }
- }
-
- if (OutliningWorklist.empty())
- return Changed;
-
- // Outline single-entry cold regions, splitting up larger regions as needed.
- unsigned OutlinedFunctionID = 1;
- // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
- CodeExtractorAnalysisCache CEAC(F);
- do {
- OutliningRegion Region = OutliningWorklist.pop_back_val();
- assert(!Region.empty() && "Empty outlining region in worklist");
- do {
- BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
- LLVM_DEBUG({
- dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
- for (BasicBlock *BB : SubRegion)
- BB->dump();
- });
-
- Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
- ORE, AC, OutlinedFunctionID);
- if (Outlined) {
- ++OutlinedFunctionID;
- Changed = true;
- }
- } while (!Region.empty());
- } while (!OutliningWorklist.empty());
-
- return Changed;
-}
-
-bool HotColdSplitting::run(Module &M) {
- bool Changed = false;
- bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
- for (auto It = M.begin(), End = M.end(); It != End; ++It) {
- Function &F = *It;
-
- // Do not touch declarations.
- if (F.isDeclaration())
- continue;
-
- // Do not modify `optnone` functions.
- if (F.hasOptNone())
- continue;
-
- // Detect inherently cold functions and mark them as such.
- if (isFunctionCold(F)) {
- Changed |= markFunctionCold(F);
- continue;
- }
-
- if (!shouldOutlineFrom(F)) {
- LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
- continue;
- }
-
- LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
- Changed |= outlineColdRegions(F, HasProfileSummary);
- }
- return Changed;
-}
-
-bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto GTTI = [this](Function &F) -> TargetTransformInfo & {
- return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- };
- auto GBFI = [this](Function &F) {
- return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
- std::unique_ptr<OptimizationRemarkEmitter> ORE;
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&ORE](Function &F) -> OptimizationRemarkEmitter & {
- ORE.reset(new OptimizationRemarkEmitter(&F));
- return *ORE.get();
- };
- auto LookupAC = [this](Function &F) -> AssumptionCache * {
- if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
- return ACT->lookupAssumptionCache(F);
- return nullptr;
- };
-
- return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
-}
-
-PreservedAnalyses
-HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
- auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
- return FAM.getCachedResult<AssumptionAnalysis>(F);
- };
-
- auto GBFI = [&FAM](Function &F) {
- return &FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- std::function<TargetTransformInfo &(Function &)> GTTI =
- [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- std::unique_ptr<OptimizationRemarkEmitter> ORE;
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&ORE](Function &F) -> OptimizationRemarkEmitter & {
- ORE.reset(new OptimizationRemarkEmitter(&F));
- return *ORE.get();
- };
-
- ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-char HotColdSplittingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
- "Hot Cold Splitting", false, false)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
- "Hot Cold Splitting", false, false)
-
-ModulePass *llvm::createHotColdSplittingPass() {
- return new HotColdSplittingLegacyPass();
-}
+ if (!Cold)
+ continue;
+
+ LLVM_DEBUG({
+ dbgs() << "Found a cold block:\n";
+ BB->dump();
+ });
+
+ if (!DT)
+ DT = std::make_unique<DominatorTree>(F);
+ if (!PDT)
+ PDT = std::make_unique<PostDominatorTree>(F);
+
+ auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
+ for (OutliningRegion &Region : Regions) {
+ if (Region.empty())
+ continue;
+
+ if (Region.isEntireFunctionCold()) {
+ LLVM_DEBUG(dbgs() << "Entire function is cold\n");
+ return markFunctionCold(F);
+ }
+
+ // If this outlining region intersects with another, drop the new region.
+ //
+ // TODO: It's theoretically possible to outline more by only keeping the
+ // largest region which contains a block, but the extra bookkeeping to do
+ // this is tricky/expensive.
+ bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
+ return !ColdBlocks.insert(Block.first).second;
+ });
+ if (RegionsOverlap)
+ continue;
+
+ OutliningWorklist.emplace_back(std::move(Region));
+ ++NumColdRegionsFound;
+ }
+ }
+
+ if (OutliningWorklist.empty())
+ return Changed;
+
+ // Outline single-entry cold regions, splitting up larger regions as needed.
+ unsigned OutlinedFunctionID = 1;
+ // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
+ CodeExtractorAnalysisCache CEAC(F);
+ do {
+ OutliningRegion Region = OutliningWorklist.pop_back_val();
+ assert(!Region.empty() && "Empty outlining region in worklist");
+ do {
+ BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
+ LLVM_DEBUG({
+ dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
+ for (BasicBlock *BB : SubRegion)
+ BB->dump();
+ });
+
+ Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
+ ORE, AC, OutlinedFunctionID);
+ if (Outlined) {
+ ++OutlinedFunctionID;
+ Changed = true;
+ }
+ } while (!Region.empty());
+ } while (!OutliningWorklist.empty());
+
+ return Changed;
+}
+
+bool HotColdSplitting::run(Module &M) {
+ bool Changed = false;
+ bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
+ for (auto It = M.begin(), End = M.end(); It != End; ++It) {
+ Function &F = *It;
+
+ // Do not touch declarations.
+ if (F.isDeclaration())
+ continue;
+
+ // Do not modify `optnone` functions.
+ if (F.hasOptNone())
+ continue;
+
+ // Detect inherently cold functions and mark them as such.
+ if (isFunctionCold(F)) {
+ Changed |= markFunctionCold(F);
+ continue;
+ }
+
+ if (!shouldOutlineFrom(F)) {
+ LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
+ continue;
+ }
+
+ LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
+ Changed |= outlineColdRegions(F, HasProfileSummary);
+ }
+ return Changed;
+}
+
+bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto GTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+ auto GBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
+ [&ORE](Function &F) -> OptimizationRemarkEmitter & {
+ ORE.reset(new OptimizationRemarkEmitter(&F));
+ return *ORE.get();
+ };
+ auto LookupAC = [this](Function &F) -> AssumptionCache * {
+ if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
+ return ACT->lookupAssumptionCache(F);
+ return nullptr;
+ };
+
+ return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
+}
+
+PreservedAnalyses
+HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
+ return FAM.getCachedResult<AssumptionAnalysis>(F);
+ };
+
+ auto GBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ std::function<TargetTransformInfo &(Function &)> GTTI =
+ [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
+ [&ORE](Function &F) -> OptimizationRemarkEmitter & {
+ ORE.reset(new OptimizationRemarkEmitter(&F));
+ return *ORE.get();
+ };
+
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+char HotColdSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
+ "Hot Cold Splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
+ "Hot Cold Splitting", false, false)
+
+ModulePass *llvm::createHotColdSplittingPass() {
+ return new HotColdSplittingLegacyPass();
+}
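
At the source level, one of the triggers unlikelyExecuted looks for in this file is a call to a function carrying the cold attribute (blocks ending in unreachable, EH pads, and profile-cold blocks are the other sources; sanitizer traps tagged with !nosanitize metadata are excluded). An illustrative C++ fragment, assuming Clang/GCC attribute syntax and with invented function names:

// Hypothetical user code: the error path calls a 'cold' function, so its
// block is treated as unlikely and becomes a candidate sink for outlining
// into a separate function (typically named <parent>.cold.<N>).
__attribute__((cold, noinline)) void logAndAbort(const char *Msg);

int parseHeader(const unsigned char *Buf, int Len) {
  if (Len < 4) {                       // rarely taken error path
    logAndAbort("truncated header");   // cold call => block considered cold
    return -1;
  }
  return Buf[0] | (Buf[1] << 8);       // hot path stays in parseHeader
}

Whether such a region is actually extracted is then decided by the getOutliningBenefit/getOutliningPenalty comparison above, tunable via -hotcoldsplit-threshold and -hot-cold-static-analysis, while -enable-cold-section controls the section the extracted function is placed in; in a new-pass-manager pipeline the transform itself is added with MPM.addPass(HotColdSplittingPass()).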
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp
index 8b670b6c98..f4c12dd7f4 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp
@@ -1,142 +1,142 @@
-//===-- IPO.cpp -----------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the common infrastructure (including C bindings) for
-// libLLVMIPO.a, which implements several transformations over the LLVM
-// intermediate representation.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm-c/Transforms/IPO.h"
-#include "llvm-c/Initialization.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-
-using namespace llvm;
-
-void llvm::initializeIPO(PassRegistry &Registry) {
- initializeOpenMPOptLegacyPassPass(Registry);
- initializeArgPromotionPass(Registry);
+//===-- IPO.cpp -----------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for
+// libLLVMIPO.a, which implements several transformations over the LLVM
+// intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/IPO.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+
+using namespace llvm;
+
+void llvm::initializeIPO(PassRegistry &Registry) {
+ initializeOpenMPOptLegacyPassPass(Registry);
+ initializeArgPromotionPass(Registry);
initializeAnnotation2MetadataLegacyPass(Registry);
- initializeCalledValuePropagationLegacyPassPass(Registry);
- initializeConstantMergeLegacyPassPass(Registry);
- initializeCrossDSOCFIPass(Registry);
- initializeDAEPass(Registry);
- initializeDAHPass(Registry);
- initializeForceFunctionAttrsLegacyPassPass(Registry);
- initializeGlobalDCELegacyPassPass(Registry);
- initializeGlobalOptLegacyPassPass(Registry);
- initializeGlobalSplitPass(Registry);
- initializeHotColdSplittingLegacyPassPass(Registry);
+ initializeCalledValuePropagationLegacyPassPass(Registry);
+ initializeConstantMergeLegacyPassPass(Registry);
+ initializeCrossDSOCFIPass(Registry);
+ initializeDAEPass(Registry);
+ initializeDAHPass(Registry);
+ initializeForceFunctionAttrsLegacyPassPass(Registry);
+ initializeGlobalDCELegacyPassPass(Registry);
+ initializeGlobalOptLegacyPassPass(Registry);
+ initializeGlobalSplitPass(Registry);
+ initializeHotColdSplittingLegacyPassPass(Registry);
initializeIROutlinerLegacyPassPass(Registry);
- initializeAlwaysInlinerLegacyPassPass(Registry);
- initializeSimpleInlinerPass(Registry);
- initializeInferFunctionAttrsLegacyPassPass(Registry);
- initializeInternalizeLegacyPassPass(Registry);
+ initializeAlwaysInlinerLegacyPassPass(Registry);
+ initializeSimpleInlinerPass(Registry);
+ initializeInferFunctionAttrsLegacyPassPass(Registry);
+ initializeInternalizeLegacyPassPass(Registry);
initializeLoopExtractorLegacyPassPass(Registry);
initializeBlockExtractorLegacyPassPass(Registry);
- initializeSingleLoopExtractorPass(Registry);
- initializeLowerTypeTestsPass(Registry);
- initializeMergeFunctionsLegacyPassPass(Registry);
- initializePartialInlinerLegacyPassPass(Registry);
- initializeAttributorLegacyPassPass(Registry);
- initializeAttributorCGSCCLegacyPassPass(Registry);
- initializePostOrderFunctionAttrsLegacyPassPass(Registry);
- initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
- initializePruneEHPass(Registry);
- initializeIPSCCPLegacyPassPass(Registry);
- initializeStripDeadPrototypesLegacyPassPass(Registry);
- initializeStripSymbolsPass(Registry);
- initializeStripDebugDeclarePass(Registry);
- initializeStripDeadDebugInfoPass(Registry);
- initializeStripNonDebugSymbolsPass(Registry);
- initializeBarrierNoopPass(Registry);
- initializeEliminateAvailableExternallyLegacyPassPass(Registry);
- initializeSampleProfileLoaderLegacyPassPass(Registry);
- initializeFunctionImportLegacyPassPass(Registry);
- initializeWholeProgramDevirtPass(Registry);
-}
-
-void LLVMInitializeIPO(LLVMPassRegistryRef R) {
- initializeIPO(*unwrap(R));
-}
-
-void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createArgumentPromotionPass());
-}
-
-void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createCalledValuePropagationPass());
-}
-
-void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createConstantMergePass());
-}
-
-void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDeadArgEliminationPass());
-}
-
-void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPostOrderFunctionAttrsLegacyPass());
-}
-
-void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createFunctionInliningPass());
-}
-
-void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(llvm::createAlwaysInlinerLegacyPass());
-}
-
-void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGlobalDCEPass());
-}
-
-void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGlobalOptimizerPass());
-}
-
-void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPruneEHPass());
-}
-
-void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createIPSCCPPass());
-}
-
-void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createMergeFunctionsPass());
-}
-
-void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
- auto PreserveMain = [=](const GlobalValue &GV) {
- return AllButMain && GV.getName() == "main";
- };
- unwrap(PM)->add(createInternalizePass(PreserveMain));
-}
-
-void LLVMAddInternalizePassWithMustPreservePredicate(
- LLVMPassManagerRef PM,
- void *Context,
- LLVMBool (*Pred)(LLVMValueRef, void *)) {
- unwrap(PM)->add(createInternalizePass([=](const GlobalValue &GV) {
- return Pred(wrap(&GV), Context) == 0 ? false : true;
- }));
-}
-
-void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createStripDeadPrototypesPass());
-}
-
-void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createStripSymbolsPass());
-}
+ initializeSingleLoopExtractorPass(Registry);
+ initializeLowerTypeTestsPass(Registry);
+ initializeMergeFunctionsLegacyPassPass(Registry);
+ initializePartialInlinerLegacyPassPass(Registry);
+ initializeAttributorLegacyPassPass(Registry);
+ initializeAttributorCGSCCLegacyPassPass(Registry);
+ initializePostOrderFunctionAttrsLegacyPassPass(Registry);
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
+ initializePruneEHPass(Registry);
+ initializeIPSCCPLegacyPassPass(Registry);
+ initializeStripDeadPrototypesLegacyPassPass(Registry);
+ initializeStripSymbolsPass(Registry);
+ initializeStripDebugDeclarePass(Registry);
+ initializeStripDeadDebugInfoPass(Registry);
+ initializeStripNonDebugSymbolsPass(Registry);
+ initializeBarrierNoopPass(Registry);
+ initializeEliminateAvailableExternallyLegacyPassPass(Registry);
+ initializeSampleProfileLoaderLegacyPassPass(Registry);
+ initializeFunctionImportLegacyPassPass(Registry);
+ initializeWholeProgramDevirtPass(Registry);
+}
+
+void LLVMInitializeIPO(LLVMPassRegistryRef R) {
+ initializeIPO(*unwrap(R));
+}
+
+void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createArgumentPromotionPass());
+}
+
+void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCalledValuePropagationPass());
+}
+
+void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createConstantMergePass());
+}
+
+void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadArgEliminationPass());
+}
+
+void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPostOrderFunctionAttrsLegacyPass());
+}
+
+void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createFunctionInliningPass());
+}
+
+void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(llvm::createAlwaysInlinerLegacyPass());
+}
+
+void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalDCEPass());
+}
+
+void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalOptimizerPass());
+}
+
+void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPruneEHPass());
+}
+
+void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIPSCCPPass());
+}
+
+void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMergeFunctionsPass());
+}
+
+void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
+ auto PreserveMain = [=](const GlobalValue &GV) {
+ return AllButMain && GV.getName() == "main";
+ };
+ unwrap(PM)->add(createInternalizePass(PreserveMain));
+}
+
+void LLVMAddInternalizePassWithMustPreservePredicate(
+ LLVMPassManagerRef PM,
+ void *Context,
+ LLVMBool (*Pred)(LLVMValueRef, void *)) {
+ unwrap(PM)->add(createInternalizePass([=](const GlobalValue &GV) {
+ return Pred(wrap(&GV), Context) == 0 ? false : true;
+ }));
+}
+
+void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripDeadPrototypesPass());
+}
+
+void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripSymbolsPass());
+}
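
A minimal sketch of driving the C bindings defined above, assuming an existing LLVMModuleRef; the pass selection and wrapper name are illustrative:

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/IPO.h"

    // Illustrative helper: Mod is assumed to be an existing LLVMModuleRef.
    void runIPOPipeline(LLVMModuleRef Mod) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddInternalizePass(PM, /*AllButMain=*/1); // keep only "main" external
      LLVMAddFunctionAttrsPass(PM);                 // post-order function attrs
      LLVMAddGlobalDCEPass(PM);                     // drop unreferenced globals
      LLVMRunPassManager(PM, Mod);                  // nonzero if the module changed
      LLVMDisposePassManager(PM);
    }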
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp
index 327d411ea4..685f8f7d7a 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -1,85 +1,85 @@
-//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "inferattrs"
-
-static bool inferAllPrototypeAttributes(
- Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- bool Changed = false;
-
- for (Function &F : M.functions())
- // We only infer things using the prototype and the name; we don't need
- // definitions.
- if (F.isDeclaration() && !F.hasOptNone())
- Changed |= inferLibFuncAttributes(F, GetTLI(F));
-
- return Changed;
-}
-
-PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
-
- if (!inferAllPrototypeAttributes(M, GetTLI))
- // If we didn't infer anything, preserve all analyses.
- return PreservedAnalyses::all();
-
- // Otherwise, we may have changed fundamental function attributes, so clear
- // out all the passes.
- return PreservedAnalyses::none();
-}
-
-namespace {
-struct InferFunctionAttrsLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
- InferFunctionAttrsLegacyPass() : ModulePass(ID) {
- initializeInferFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- return inferAllPrototypeAttributes(M, GetTLI);
- }
-};
-}
-
-char InferFunctionAttrsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs",
- "Infer set function attributes", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs",
- "Infer set function attributes", false, false)
-
-Pass *llvm::createInferFunctionAttrsLegacyPass() {
- return new InferFunctionAttrsLegacyPass();
-}
+//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "inferattrs"
+
+static bool inferAllPrototypeAttributes(
+ Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ bool Changed = false;
+
+ for (Function &F : M.functions())
+ // We only infer things using the prototype and the name; we don't need
+ // definitions.
+ if (F.isDeclaration() && !F.hasOptNone())
+ Changed |= inferLibFuncAttributes(F, GetTLI(F));
+
+ return Changed;
+}
+
+PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ if (!inferAllPrototypeAttributes(M, GetTLI))
+ // If we didn't infer anything, preserve all analyses.
+ return PreservedAnalyses::all();
+
+ // Otherwise, we may have changed fundamental function attributes, so clear
+ // out all the passes.
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct InferFunctionAttrsLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ InferFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeInferFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ return inferAllPrototypeAttributes(M, GetTLI);
+ }
+};
+}
+
+char InferFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs",
+ "Infer set function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs",
+ "Infer set function attributes", false, false)
+
+Pass *llvm::createInferFunctionAttrsLegacyPass() {
+ return new InferFunctionAttrsLegacyPass();
+}
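
A minimal sketch of running the new-PM InferFunctionAttrsPass over a module, assuming PassBuilder's default analysis registrations (which make TargetLibraryAnalysis reachable through the function-analysis proxy); the wrapper name is illustrative:

    #include "llvm/IR/Module.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/IPO/InferFunctionAttrs.h"

    // Illustrative helper: M is assumed to be an existing llvm::Module.
    void runInferAttrs(llvm::Module &M) {
      llvm::PassBuilder PB;
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      llvm::ModulePassManager MPM;
      MPM.addPass(llvm::InferFunctionAttrsPass());
      MPM.run(M, MAM);
    }

crossRegisterProxies is what lets this module pass reach per-function results such as TargetLibraryAnalysis via the FunctionAnalysisManagerModuleProxy used in run() above.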
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp
index 51659f659c..76f1d0c54d 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp
@@ -1,124 +1,124 @@
-//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements bottom-up inlining of functions into their callers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Inliner.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-namespace {
-
-/// Actual inliner pass implementation.
-///
-/// The common implementation of the inlining logic is shared between this
-/// inliner pass and the always inliner pass. The two passes use different cost
-/// analyses to determine when to inline.
-class SimpleInliner : public LegacyInlinerBase {
-
- InlineParams Params;
-
-public:
- SimpleInliner() : LegacyInlinerBase(ID), Params(llvm::getInlineParams()) {
- initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
- }
-
- explicit SimpleInliner(InlineParams Params)
- : LegacyInlinerBase(ID), Params(std::move(Params)) {
- initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
- }
-
- static char ID; // Pass identification, replacement for typeid
-
- InlineCost getInlineCost(CallBase &CB) override {
- Function *Callee = CB.getCalledFunction();
- TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
-
- bool RemarksEnabled = false;
- const auto &BBs = CB.getCaller()->getBasicBlockList();
- if (!BBs.empty()) {
- auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
- if (DI.isEnabled())
- RemarksEnabled = true;
- }
- OptimizationRemarkEmitter ORE(CB.getCaller());
-
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
- return llvm::getInlineCost(CB, Params, TTI, GetAssumptionCache, GetTLI,
- /*GetBFI=*/nullptr, PSI,
- RemarksEnabled ? &ORE : nullptr);
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-private:
- TargetTransformInfoWrapperPass *TTIWP;
-
-};
-
-} // end anonymous namespace
-
-char SimpleInliner::ID = 0;
-INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining",
- false, false)
-
-Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
-
-Pass *llvm::createFunctionInliningPass(int Threshold) {
- return new SimpleInliner(llvm::getInlineParams(Threshold));
-}
-
-Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
- unsigned SizeOptLevel,
- bool DisableInlineHotCallSite) {
- auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
- if (DisableInlineHotCallSite)
- Param.HotCallSiteThreshold = 0;
- return new SimpleInliner(Param);
-}
-
-Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
- return new SimpleInliner(Params);
-}
-
-bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
- TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- return LegacyInlinerBase::runOnSCC(SCC);
-}
-
-void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- LegacyInlinerBase::getAnalysisUsage(AU);
-}
+//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bottom-up inlining of functions into their callers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+namespace {
+
+/// Actual inliner pass implementation.
+///
+/// The common implementation of the inlining logic is shared between this
+/// inliner pass and the always inliner pass. The two passes use different cost
+/// analyses to determine when to inline.
+class SimpleInliner : public LegacyInlinerBase {
+
+ InlineParams Params;
+
+public:
+ SimpleInliner() : LegacyInlinerBase(ID), Params(llvm::getInlineParams()) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit SimpleInliner(InlineParams Params)
+ : LegacyInlinerBase(ID), Params(std::move(Params)) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallBase &CB) override {
+ Function *Callee = CB.getCalledFunction();
+ TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
+
+ bool RemarksEnabled = false;
+ const auto &BBs = CB.getCaller()->getBasicBlockList();
+ if (!BBs.empty()) {
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
+ if (DI.isEnabled())
+ RemarksEnabled = true;
+ }
+ OptimizationRemarkEmitter ORE(CB.getCaller());
+
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return llvm::getInlineCost(CB, Params, TTI, GetAssumptionCache, GetTLI,
+ /*GetBFI=*/nullptr, PSI,
+ RemarksEnabled ? &ORE : nullptr);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ TargetTransformInfoWrapperPass *TTIWP;
+
+};
+
+} // end anonymous namespace
+
+char SimpleInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
+
+Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
+
+Pass *llvm::createFunctionInliningPass(int Threshold) {
+ return new SimpleInliner(llvm::getInlineParams(Threshold));
+}
+
+Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
+ unsigned SizeOptLevel,
+ bool DisableInlineHotCallSite) {
+ auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
+ if (DisableInlineHotCallSite)
+ Param.HotCallSiteThreshold = 0;
+ return new SimpleInliner(Param);
+}
+
+Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
+ return new SimpleInliner(Params);
+}
+
+bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
+ TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+ return LegacyInlinerBase::runOnSCC(SCC);
+}
+
+void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ LegacyInlinerBase::getAnalysisUsage(AU);
+}
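
A minimal sketch of building the legacy inliner with an explicit threshold through createFunctionInliningPass, assuming an existing llvm::Module; the threshold value and wrapper name are illustrative:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    // Illustrative helper: M is assumed to be an existing llvm::Module. The
    // threshold value is an arbitrary example, not a recommended setting.
    void runSimpleInliner(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createFunctionInliningPass(/*Threshold=*/275));
      PM.run(M);
    }

As the overload above shows, this routes through llvm::getInlineParams(Threshold), so only the base threshold is overridden while the other InlineParams keep their defaults.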
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp
index 133a6e2a85..e91b6c9b1d 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp
@@ -1,658 +1,658 @@
-//===- Inliner.cpp - Code common to all inliners --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the mechanics required to implement inlining without
-// missing any calls and updating the call graph. The decisions of which calls
-// are profitable to inline are implemented elsewhere.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Inliner.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InlineAdvisor.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
+//===- Inliner.cpp - Code common to all inliners --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mechanics required to implement inlining without
+// missing any calls and updating the call graph. The decisions of which calls
+// are profitable to inline are implemented elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <functional>
-#include <sstream>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-STATISTIC(NumInlined, "Number of functions inlined");
-STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
-STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
-STATISTIC(NumMergedAllocas, "Number of allocas merged together");
-
-/// Flag to disable manual alloca merging.
-///
-/// Merging of allocas was originally done as a stack-size saving technique
-/// prior to LLVM's code generator having support for stack coloring based on
-/// lifetime markers. It is now in the process of being removed. To experiment
-/// with disabling it and relying fully on lifetime marker based stack
-/// coloring, you can pass this flag to LLVM.
-static cl::opt<bool>
- DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
- cl::init(false), cl::Hidden);
-
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <sstream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+STATISTIC(NumMergedAllocas, "Number of allocas merged together");
+
+/// Flag to disable manual alloca merging.
+///
+/// Merging of allocas was originally done as a stack-size saving technique
+/// prior to LLVM's code generator having support for stack coloring based on
+/// lifetime markers. It is now in the process of being removed. To experiment
+/// with disabling it and relying fully on lifetime marker based stack
+/// coloring, you can pass this flag to LLVM.
+static cl::opt<bool>
+ DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
+ cl::init(false), cl::Hidden);
+
extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats;
-
+
static cl::opt<std::string> CGSCCInlineReplayFile(
"cgscc-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
"Optimization remarks file containing inline remarks to be replayed "
"by inlining from cgscc inline remarks."),
cl::Hidden);
-
-LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {}
-
-LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
- : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
-
-/// For this class, we declare that we require and preserve the call graph.
-/// If the derived class implements this method, it should
-/// always explicitly call the implementation here.
-void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
-}
-
-using InlinedArrayAllocasTy = DenseMap<ArrayType *, std::vector<AllocaInst *>>;
-
-/// Look at all of the allocas that we inlined through this call site. If we
-/// have already inlined other allocas through other calls into this function,
-/// then we know that they have disjoint lifetimes and that we can merge them.
-///
-/// There are many heuristics possible for merging these allocas, and the
-/// different options have different tradeoffs. One thing that we *really*
-/// don't want to hurt is SRoA: once inlining happens, often allocas are no
-/// longer address taken and so they can be promoted.
-///
-/// Our "solution" for that is to only merge allocas whose outermost type is an
-/// array type. These are usually not promoted because someone is using a
-/// variable index into them. These are also often the most important ones to
-/// merge.
-///
-/// A better solution would be to have real memory lifetime markers in the IR
-/// and not have the inliner do any merging of allocas at all. This would
-/// allow the backend to do proper stack slot coloring of all allocas that
-/// *actually make it to the backend*, which is really what we want.
-///
-/// Because we don't have this information, we do this simple and useful hack.
-static void mergeInlinedArrayAllocas(Function *Caller, InlineFunctionInfo &IFI,
- InlinedArrayAllocasTy &InlinedArrayAllocas,
- int InlineHistory) {
- SmallPtrSet<AllocaInst *, 16> UsedAllocas;
-
- // When processing our SCC, check to see if the call site was inlined from
- // some other call site. For example, if we're processing "A" in this code:
- // A() { B() }
- // B() { x = alloca ... C() }
- // C() { y = alloca ... }
- // Assume that C was not inlined into B initially, and so we're processing A
- // and decide to inline B into A. Doing this makes an alloca available for
- // reuse and makes a callsite (C) available for inlining. When we process
- // the C call site we don't want to do any alloca merging between X and Y
- // because their scopes are not disjoint. We could make this smarter by
- // keeping track of the inline history for each alloca in the
- // InlinedArrayAllocas but this isn't likely to be a significant win.
- if (InlineHistory != -1) // Only do merging for top-level call sites in SCC.
- return;
-
- // Loop over all the allocas we have so far and see if they can be merged with
- // a previously inlined alloca. If not, remember that we had it.
- for (unsigned AllocaNo = 0, E = IFI.StaticAllocas.size(); AllocaNo != E;
- ++AllocaNo) {
- AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
-
- // Don't bother trying to merge array allocations (they will usually be
- // canonicalized to be an allocation *of* an array), or allocations whose
- // type is not itself an array (because we're afraid of pessimizing SRoA).
- ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
- if (!ATy || AI->isArrayAllocation())
- continue;
-
- // Get the list of all available allocas for this array type.
- std::vector<AllocaInst *> &AllocasForType = InlinedArrayAllocas[ATy];
-
- // Loop over the allocas in AllocasForType to see if we can reuse one. Note
- // that we have to be careful not to reuse the same "available" alloca for
- // multiple different allocas that we just inlined, we use the 'UsedAllocas'
- // set to keep track of which "available" allocas are being used by this
- // function. Also, AllocasForType can be empty of course!
- bool MergedAwayAlloca = false;
- for (AllocaInst *AvailableAlloca : AllocasForType) {
- Align Align1 = AI->getAlign();
- Align Align2 = AvailableAlloca->getAlign();
-
- // The available alloca has to be in the right function, not in some other
- // function in this SCC.
- if (AvailableAlloca->getParent() != AI->getParent())
- continue;
-
- // If the inlined function already uses this alloca then we can't reuse
- // it.
- if (!UsedAllocas.insert(AvailableAlloca).second)
- continue;
-
- // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
- // success!
- LLVM_DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
- << "\n\t\tINTO: " << *AvailableAlloca << '\n');
-
- // Move affected dbg.declare calls immediately after the new alloca to
- // avoid the situation when a dbg.declare precedes its alloca.
- if (auto *L = LocalAsMetadata::getIfExists(AI))
- if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
- for (User *U : MDV->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDI->moveBefore(AvailableAlloca->getNextNode());
-
- AI->replaceAllUsesWith(AvailableAlloca);
-
- if (Align1 > Align2)
- AvailableAlloca->setAlignment(AI->getAlign());
-
- AI->eraseFromParent();
- MergedAwayAlloca = true;
- ++NumMergedAllocas;
- IFI.StaticAllocas[AllocaNo] = nullptr;
- break;
- }
-
- // If we already nuked the alloca, we're done with it.
- if (MergedAwayAlloca)
- continue;
-
- // If we were unable to merge away the alloca either because there are no
- // allocas of the right type available or because we reused them all
- // already, remember that this alloca came from an inlined function and mark
- // it used so we don't reuse it for other allocas from this inline
- // operation.
- AllocasForType.push_back(AI);
- UsedAllocas.insert(AI);
- }
-}
-
-/// If it is possible to inline the specified call site,
-/// do so and update the CallGraph for this operation.
-///
-/// This function also does some basic book-keeping to update the IR. The
-/// InlinedArrayAllocas map keeps track of any allocas that are already
-/// available from other functions inlined into the caller. If we are able to
-/// inline this call site we attempt to reuse already available allocas or add
-/// any new allocas to the set if not possible.
-static InlineResult inlineCallIfPossible(
- CallBase &CB, InlineFunctionInfo &IFI,
- InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory,
- bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter,
- ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
- Function *Callee = CB.getCalledFunction();
- Function *Caller = CB.getCaller();
-
- AAResults &AAR = AARGetter(*Callee);
-
- // Try to inline the function. Get the list of static allocas that were
- // inlined.
- InlineResult IR = InlineFunction(CB, IFI, &AAR, InsertLifetime);
- if (!IR.isSuccess())
- return IR;
-
- if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
- ImportedFunctionsStats.recordInline(*Caller, *Callee);
-
- AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);
-
- if (!DisableInlinedAllocaMerging)
- mergeInlinedArrayAllocas(Caller, IFI, InlinedArrayAllocas, InlineHistory);
-
- return IR; // success
-}
-
-/// Return true if the specified inline history ID
-/// indicates an inline history that includes the specified function.
-static bool inlineHistoryIncludes(
- Function *F, int InlineHistoryID,
- const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
- while (InlineHistoryID != -1) {
- assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
- "Invalid inline history ID");
- if (InlineHistory[InlineHistoryID].first == F)
- return true;
- InlineHistoryID = InlineHistory[InlineHistoryID].second;
- }
- return false;
-}
-
-bool LegacyInlinerBase::doInitialization(CallGraph &CG) {
- if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
- ImportedFunctionsStats.setModuleInfo(CG.getModule());
- return false; // No changes to CallGraph.
-}
-
-bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
- return inlineCalls(SCC);
-}
-
-static bool
-inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
- std::function<AssumptionCache &(Function &)> GetAssumptionCache,
- ProfileSummaryInfo *PSI,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI,
- bool InsertLifetime,
- function_ref<InlineCost(CallBase &CB)> GetInlineCost,
- function_ref<AAResults &(Function &)> AARGetter,
- ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
- SmallPtrSet<Function *, 8> SCCFunctions;
- LLVM_DEBUG(dbgs() << "Inliner visiting SCC:");
- for (CallGraphNode *Node : SCC) {
- Function *F = Node->getFunction();
- if (F)
- SCCFunctions.insert(F);
- LLVM_DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
- }
-
- // Scan through and identify all call sites ahead of time so that we only
- // inline call sites in the original functions, not call sites that result
- // from inlining other functions.
- SmallVector<std::pair<CallBase *, int>, 16> CallSites;
-
- // When inlining a callee produces new call sites, we want to keep track of
- // the fact that they were inlined from the callee. This allows us to avoid
- // infinite inlining in some obscure cases. To represent this, we use an
- // index into the InlineHistory vector.
- SmallVector<std::pair<Function *, int>, 8> InlineHistory;
-
- for (CallGraphNode *Node : SCC) {
- Function *F = Node->getFunction();
- if (!F || F->isDeclaration())
- continue;
-
- OptimizationRemarkEmitter ORE(F);
- for (BasicBlock &BB : *F)
- for (Instruction &I : BB) {
- auto *CB = dyn_cast<CallBase>(&I);
- // If this isn't a call, or it is a call to an intrinsic, it can
- // never be inlined.
- if (!CB || isa<IntrinsicInst>(I))
- continue;
-
- // If this is a direct call to an external function, we can never inline
- // it. If it is an indirect call, inlining may resolve it to be a
- // direct call, so we keep it.
- if (Function *Callee = CB->getCalledFunction())
- if (Callee->isDeclaration()) {
- using namespace ore;
-
- setInlineRemark(*CB, "unavailable definition");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", CB->getCaller())
- << " because its definition is unavailable"
- << setIsVerbose();
- });
- continue;
- }
-
- CallSites.push_back(std::make_pair(CB, -1));
- }
- }
-
- LLVM_DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
-
- // If there are no calls in this function, exit early.
- if (CallSites.empty())
- return false;
-
- // Now that we have all of the call sites, move the ones to functions in the
- // current SCC to the end of the list.
- unsigned FirstCallInSCC = CallSites.size();
- for (unsigned I = 0; I < FirstCallInSCC; ++I)
- if (Function *F = CallSites[I].first->getCalledFunction())
- if (SCCFunctions.count(F))
- std::swap(CallSites[I--], CallSites[--FirstCallInSCC]);
-
- InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, GetAssumptionCache, PSI);
-
- // Now that we have all of the call sites, loop over them and inline them if
- // it looks profitable to do so.
- bool Changed = false;
- bool LocalChange;
- do {
- LocalChange = false;
- // Iterate over the outer loop because inlining functions can cause indirect
- // calls to become direct calls.
- // CallSites may be modified inside the loop, so a range-based for loop cannot be used.
- for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
- auto &P = CallSites[CSi];
- CallBase &CB = *P.first;
- const int InlineHistoryID = P.second;
-
- Function *Caller = CB.getCaller();
- Function *Callee = CB.getCalledFunction();
-
- // We can only inline direct calls to non-declarations.
- if (!Callee || Callee->isDeclaration())
- continue;
-
- bool IsTriviallyDead = isInstructionTriviallyDead(&CB, &GetTLI(*Caller));
-
- if (!IsTriviallyDead) {
- // If this call site was obtained by inlining another function, verify
- // that the inline history for this call site does not include the callee
- // itself. If so, we'd be recursively inlining the same function,
- // which would provide the same callsites, which would cause us to
- // infinitely inline.
- if (InlineHistoryID != -1 &&
- inlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) {
- setInlineRemark(CB, "recursive");
- continue;
- }
- }
-
- // FIXME for new PM: because of the old PM we currently generate ORE and
- // in turn BFI on demand. With the new PM, the ORE dependency should
- // just become a regular analysis dependency.
- OptimizationRemarkEmitter ORE(Caller);
-
- auto OIC = shouldInline(CB, GetInlineCost, ORE);
- // If the policy determines that we should not inline this call site,
- // move on to the next one.
- if (!OIC)
- continue;
-
- // If this call site is dead and it is to a readonly function, we should
- // just delete the call instead of trying to inline it, regardless of
- // size. This happens because IPSCCP propagates the result out of the
- // call and then we're left with the dead call.
- if (IsTriviallyDead) {
- LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << CB << "\n");
- // Update the call graph by deleting the edge from Callee to Caller.
- setInlineRemark(CB, "trivially dead");
- CG[Caller]->removeCallEdgeFor(CB);
- CB.eraseFromParent();
- ++NumCallsDeleted;
- } else {
- // Get DebugLoc to report. CB will be invalid after Inliner.
- DebugLoc DLoc = CB.getDebugLoc();
- BasicBlock *Block = CB.getParent();
-
- // Attempt to inline the function.
- using namespace ore;
-
- InlineResult IR = inlineCallIfPossible(
- CB, InlineInfo, InlinedArrayAllocas, InlineHistoryID,
- InsertLifetime, AARGetter, ImportedFunctionsStats);
- if (!IR.isSuccess()) {
- setInlineRemark(CB, std::string(IR.getFailureReason()) + "; " +
- inlineCostStr(*OIC));
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc,
- Block)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", Caller) << ": "
- << NV("Reason", IR.getFailureReason());
- });
- continue;
- }
- ++NumInlined;
-
- emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
-
- // If inlining this function gave us any new call sites, throw them
- // onto our worklist to process. They are useful inline candidates.
- if (!InlineInfo.InlinedCalls.empty()) {
- // Create a new inline history entry for this, so that we remember
- // that these new callsites came about due to inlining Callee.
- int NewHistoryID = InlineHistory.size();
- InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));
-
-#ifndef NDEBUG
- // Make sure there are no duplicates in the inline candidates. This could
- // happen when a callsite is simplified to reuse the return value
- // of another callsite during function cloning, thus the other
- // callsite will be reconsidered here.
- DenseSet<CallBase *> DbgCallSites;
- for (auto &II : CallSites)
- DbgCallSites.insert(II.first);
-#endif
-
- for (Value *Ptr : InlineInfo.InlinedCalls) {
-#ifndef NDEBUG
- assert(DbgCallSites.count(dyn_cast<CallBase>(Ptr)) == 0);
-#endif
- CallSites.push_back(
- std::make_pair(dyn_cast<CallBase>(Ptr), NewHistoryID));
- }
- }
- }
-
- // If we inlined or deleted the last possible call site to the function,
- // delete the function body now.
- if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
- // TODO: Can remove if in SCC now.
- !SCCFunctions.count(Callee) &&
- // The function may be apparently dead, but if there are indirect
- // callgraph references to the node, we cannot delete it yet, this
- // could invalidate the CGSCC iterator.
- CG[Callee]->getNumReferences() == 0) {
- LLVM_DEBUG(dbgs() << " -> Deleting dead function: "
- << Callee->getName() << "\n");
- CallGraphNode *CalleeNode = CG[Callee];
-
- // Remove any call graph edges from the callee to its callees.
- CalleeNode->removeAllCalledFunctions();
-
- // Remove the callee's node from the call graph and delete it.
- delete CG.removeFunctionFromModule(CalleeNode);
- ++NumDeleted;
- }
-
- // Remove this call site from the list. If possible, use
- // swap/pop_back for efficiency, but do not use it if doing so would
- // move a call site to a function in this SCC before the
- // 'FirstCallInSCC' barrier.
- if (SCC.isSingular()) {
- CallSites[CSi] = CallSites.back();
- CallSites.pop_back();
- } else {
- CallSites.erase(CallSites.begin() + CSi);
- }
- --CSi;
-
- Changed = true;
- LocalChange = true;
- }
- } while (LocalChange);
-
- return Changed;
-}
-
-bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- ACT = &getAnalysis<AssumptionCacheTracker>();
- PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
- return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
- return inlineCallsImpl(
- SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime,
- [&](CallBase &CB) { return getInlineCost(CB); }, LegacyAARGetter(*this),
- ImportedFunctionsStats);
-}
-
-/// Remove now-dead linkonce functions at the end of
-/// processing to avoid breaking the SCC traversal.
-bool LegacyInlinerBase::doFinalization(CallGraph &CG) {
- if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
- ImportedFunctionsStats.dump(InlinerFunctionImportStats ==
- InlinerFunctionImportStatsOpts::Verbose);
- return removeDeadFunctions(CG);
-}
-
-/// Remove dead functions that are not included in DNR (Do Not Remove) list.
-bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
- bool AlwaysInlineOnly) {
- SmallVector<CallGraphNode *, 16> FunctionsToRemove;
- SmallVector<Function *, 16> DeadFunctionsInComdats;
-
- auto RemoveCGN = [&](CallGraphNode *CGN) {
- // Remove any call graph edges from the function to its callees.
- CGN->removeAllCalledFunctions();
-
- // Remove any edges from the external node to the function's call graph
- // node. These edges might have been made irrelevant due to
- // optimization of the program.
- CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
-
- // Remove the callee's node from the call graph and delete it.
- FunctionsToRemove.push_back(CGN);
- };
-
- // Scan for all of the functions, looking for ones that should now be removed
- // from the program. Insert the dead ones in the FunctionsToRemove set.
- for (const auto &I : CG) {
- CallGraphNode *CGN = I.second.get();
- Function *F = CGN->getFunction();
- if (!F || F->isDeclaration())
- continue;
-
- // Handle the case when this function is called and we only want to care
- // about always-inline functions. This is a bit of a hack to share code
- // between here and the InlineAlways pass.
- if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline))
- continue;
-
- // If the only remaining users of the function are dead constants, remove
- // them.
- F->removeDeadConstantUsers();
-
- if (!F->isDefTriviallyDead())
- continue;
-
- // It is unsafe to drop a function with discardable linkage from a COMDAT
- // without also dropping the other members of the COMDAT.
- // The inliner doesn't visit non-function entities which are in COMDAT
- // groups so it is unsafe to do so *unless* the linkage is local.
- if (!F->hasLocalLinkage()) {
- if (F->hasComdat()) {
- DeadFunctionsInComdats.push_back(F);
- continue;
- }
- }
-
- RemoveCGN(CGN);
- }
- if (!DeadFunctionsInComdats.empty()) {
- // Filter out the functions whose comdats remain alive.
- filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
- // Remove the rest.
- for (Function *F : DeadFunctionsInComdats)
- RemoveCGN(CG[F]);
- }
-
- if (FunctionsToRemove.empty())
- return false;
-
- // Now that we know which functions to delete, do so. We didn't want to do
- // this inline, because that would invalidate our CallGraph::iterator
- // objects. :(
- //
- // Note that it doesn't matter that we are iterating over a non-stable order
- // here to do this, it doesn't matter which order the functions are deleted
- // in.
- array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
- FunctionsToRemove.erase(
- std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()),
- FunctionsToRemove.end());
- for (CallGraphNode *CGN : FunctionsToRemove) {
- delete CG.removeFunctionFromModule(CGN);
- ++NumDeleted;
- }
- return true;
-}
-
-InlineAdvisor &
-InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
- FunctionAnalysisManager &FAM, Module &M) {
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {}
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
+ : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
+
+/// For this class, we declare that we require and preserve the call graph.
+/// If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+}
+
+using InlinedArrayAllocasTy = DenseMap<ArrayType *, std::vector<AllocaInst *>>;
+
+/// Look at all of the allocas that we inlined through this call site. If we
+/// have already inlined other allocas through other calls into this function,
+/// then we know that they have disjoint lifetimes and that we can merge them.
+///
+/// There are many heuristics possible for merging these allocas, and the
+/// different options have different tradeoffs. One thing that we *really*
+/// don't want to hurt is SRoA: once inlining happens, often allocas are no
+/// longer address taken and so they can be promoted.
+///
+/// Our "solution" for that is to only merge allocas whose outermost type is an
+/// array type. These are usually not promoted because someone is using a
+/// variable index into them. These are also often the most important ones to
+/// merge.
+///
+/// A better solution would be to have real memory lifetime markers in the IR
+/// and not have the inliner do any merging of allocas at all. This would
+/// allow the backend to do proper stack slot coloring of all allocas that
+/// *actually make it to the backend*, which is really what we want.
+///
+/// Because we don't have this information, we do this simple and useful hack.
+static void mergeInlinedArrayAllocas(Function *Caller, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas,
+ int InlineHistory) {
+ SmallPtrSet<AllocaInst *, 16> UsedAllocas;
+
+ // When processing our SCC, check to see if the call site was inlined from
+ // some other call site. For example, if we're processing "A" in this code:
+ // A() { B() }
+ // B() { x = alloca ... C() }
+ // C() { y = alloca ... }
+ // Assume that C was not inlined into B initially, and so we're processing A
+ // and decide to inline B into A. Doing this makes an alloca available for
+ // reuse and makes a callsite (C) available for inlining. When we process
+ // the C call site we don't want to do any alloca merging between X and Y
+ // because their scopes are not disjoint. We could make this smarter by
+ // keeping track of the inline history for each alloca in the
+ // InlinedArrayAllocas but this isn't likely to be a significant win.
+ if (InlineHistory != -1) // Only do merging for top-level call sites in SCC.
+ return;
+
+ // Loop over all the allocas we have so far and see if they can be merged with
+ // a previously inlined alloca. If not, remember that we had it.
+ for (unsigned AllocaNo = 0, E = IFI.StaticAllocas.size(); AllocaNo != E;
+ ++AllocaNo) {
+ AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
+
+ // Don't bother trying to merge array allocations (they will usually be
+ // canonicalized to be an allocation *of* an array), or allocations whose
+ // type is not itself an array (because we're afraid of pessimizing SRoA).
+ ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
+ if (!ATy || AI->isArrayAllocation())
+ continue;
+
+ // Get the list of all available allocas for this array type.
+ std::vector<AllocaInst *> &AllocasForType = InlinedArrayAllocas[ATy];
+
+ // Loop over the allocas in AllocasForType to see if we can reuse one. Note
+ // that we have to be careful not to reuse the same "available" alloca for
+ // multiple different allocas that we just inlined, we use the 'UsedAllocas'
+ // set to keep track of which "available" allocas are being used by this
+ // function. Also, AllocasForType can be empty of course!
+ bool MergedAwayAlloca = false;
+ for (AllocaInst *AvailableAlloca : AllocasForType) {
+ Align Align1 = AI->getAlign();
+ Align Align2 = AvailableAlloca->getAlign();
+
+ // The available alloca has to be in the right function, not in some other
+ // function in this SCC.
+ if (AvailableAlloca->getParent() != AI->getParent())
+ continue;
+
+ // If the inlined function already uses this alloca then we can't reuse
+ // it.
+ if (!UsedAllocas.insert(AvailableAlloca).second)
+ continue;
+
+ // Otherwise, we *can* reuse it: RAUW AI into AvailableAlloca and declare
+ // success!
+ LLVM_DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
+ << "\n\t\tINTO: " << *AvailableAlloca << '\n');
+
+ // Move affected dbg.declare calls immediately after the new alloca to
+ // avoid the situation when a dbg.declare precedes its alloca.
+ if (auto *L = LocalAsMetadata::getIfExists(AI))
+ if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDI->moveBefore(AvailableAlloca->getNextNode());
+
+ AI->replaceAllUsesWith(AvailableAlloca);
+
+ if (Align1 > Align2)
+ AvailableAlloca->setAlignment(AI->getAlign());
+
+ AI->eraseFromParent();
+ MergedAwayAlloca = true;
+ ++NumMergedAllocas;
+ IFI.StaticAllocas[AllocaNo] = nullptr;
+ break;
+ }
+
+ // If we already nuked the alloca, we're done with it.
+ if (MergedAwayAlloca)
+ continue;
+
+ // If we were unable to merge away the alloca either because there are no
+ // allocas of the right type available or because we reused them all
+ // already, remember that this alloca came from an inlined function and mark
+ // it used so we don't reuse it for other allocas from this inline
+ // operation.
+ AllocasForType.push_back(AI);
+ UsedAllocas.insert(AI);
+ }
+}
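+
+// Illustrative example (editorial note, not from the original source): if two
+// different callees each containing "%buf = alloca [64 x i32]" are inlined
+// into the same caller at top level, the second inlined %buf is RAUW'd into
+// the first by the routine above, so the caller keeps a single [64 x i32]
+// stack slot instead of two slots whose lifetimes never overlap.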
+
+/// If it is possible to inline the specified call site,
+/// do so and update the CallGraph for this operation.
+///
+/// This function also does some basic book-keeping to update the IR. The
+/// InlinedArrayAllocas map keeps track of any allocas that are already
+/// available from other functions inlined into the caller. If we are able to
+/// inline this call site we attempt to reuse already available allocas or add
+/// any new allocas to the set if not possible.
+static InlineResult inlineCallIfPossible(
+ CallBase &CB, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory,
+ bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ Function *Callee = CB.getCalledFunction();
+ Function *Caller = CB.getCaller();
+
+ AAResults &AAR = AARGetter(*Callee);
+
+ // Try to inline the function. Get the list of static allocas that were
+ // inlined.
+ InlineResult IR = InlineFunction(CB, IFI, &AAR, InsertLifetime);
+ if (!IR.isSuccess())
+ return IR;
+
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.recordInline(*Caller, *Callee);
+
+ AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);
+
+ if (!DisableInlinedAllocaMerging)
+ mergeInlinedArrayAllocas(Caller, IFI, InlinedArrayAllocas, InlineHistory);
+
+ return IR; // success
+}
+
+/// Return true if the specified inline history ID
+/// indicates an inline history that includes the specified function.
+static bool inlineHistoryIncludes(
+ Function *F, int InlineHistoryID,
+ const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
+ while (InlineHistoryID != -1) {
+ assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
+ "Invalid inline history ID");
+ if (InlineHistory[InlineHistoryID].first == F)
+ return true;
+ InlineHistoryID = InlineHistory[InlineHistoryID].second;
+ }
+ return false;
+}
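+
+// Illustrative example (editorial note, hypothetical functions B and C): with
+//   InlineHistory = { {B, -1}, {C, 0} }
+// history ID 1 encodes "a call site introduced by inlining C, whose own call
+// site came from inlining B at top level". inlineHistoryIncludes(B, 1, ...)
+// walks 1 -> 0 -> -1, finds B at index 0, and returns true, so re-inlining B
+// along this chain is rejected as recursive.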
+
+bool LegacyInlinerBase::doInitialization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.setModuleInfo(CG.getModule());
+ return false; // No changes to CallGraph.
+}
+
+bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ return inlineCalls(SCC);
+}
+
+static bool
+inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache,
+ ProfileSummaryInfo *PSI,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
+ bool InsertLifetime,
+ function_ref<InlineCost(CallBase &CB)> GetInlineCost,
+ function_ref<AAResults &(Function &)> AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ SmallPtrSet<Function *, 8> SCCFunctions;
+ LLVM_DEBUG(dbgs() << "Inliner visiting SCC:");
+ for (CallGraphNode *Node : SCC) {
+ Function *F = Node->getFunction();
+ if (F)
+ SCCFunctions.insert(F);
+ LLVM_DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
+ }
+
+ // Scan through and identify all call sites ahead of time so that we only
+ // inline call sites in the original functions, not call sites that result
+ // from inlining other functions.
+ SmallVector<std::pair<CallBase *, int>, 16> CallSites;
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 8> InlineHistory;
+
+ for (CallGraphNode *Node : SCC) {
+ Function *F = Node->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ OptimizationRemarkEmitter ORE(F);
+ for (BasicBlock &BB : *F)
+ for (Instruction &I : BB) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ // If this isn't a call, or it is a call to an intrinsic, it can
+ // never be inlined.
+ if (!CB || isa<IntrinsicInst>(I))
+ continue;
+
+ // If this is a direct call to an external function, we can never inline
+ // it. If it is an indirect call, inlining may resolve it to be a
+ // direct call, so we keep it.
+ if (Function *Callee = CB->getCalledFunction())
+ if (Callee->isDeclaration()) {
+ using namespace ore;
+
+ setInlineRemark(*CB, "unavailable definition");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CB->getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose();
+ });
+ continue;
+ }
+
+ CallSites.push_back(std::make_pair(CB, -1));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
+
+ // If there are no calls in this SCC, exit early.
+ if (CallSites.empty())
+ return false;
+
+ // Now that we have all of the call sites, move the ones to functions in the
+ // current SCC to the end of the list.
+ unsigned FirstCallInSCC = CallSites.size();
+ for (unsigned I = 0; I < FirstCallInSCC; ++I)
+ if (Function *F = CallSites[I].first->getCalledFunction())
+ if (SCCFunctions.count(F))
+ std::swap(CallSites[I--], CallSites[--FirstCallInSCC]);
+
+ InlinedArrayAllocasTy InlinedArrayAllocas;
+ InlineFunctionInfo InlineInfo(&CG, GetAssumptionCache, PSI);
+
+ // Now that we have all of the call sites, loop over them and inline them if
+ // it looks profitable to do so.
+ bool Changed = false;
+ bool LocalChange;
+ do {
+ LocalChange = false;
+ // Iterate over the outer loop because inlining functions can cause indirect
+ // calls to become direct calls.
+ // CallSites may be modified inside the loop, so a ranged for loop cannot
+ // be used.
+ for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
+ auto &P = CallSites[CSi];
+ CallBase &CB = *P.first;
+ const int InlineHistoryID = P.second;
+
+ Function *Caller = CB.getCaller();
+ Function *Callee = CB.getCalledFunction();
+
+ // We can only inline direct calls to non-declarations.
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ bool IsTriviallyDead = isInstructionTriviallyDead(&CB, &GetTLI(*Caller));
+
+ if (!IsTriviallyDead) {
+ // If this call site was obtained by inlining another function, verify
+ // that the inline path for the call site did not include the callee
+ // itself. If so, we'd be recursively inlining the same function,
+ // which would provide the same callsites, which would cause us to
+ // infinitely inline.
+ if (InlineHistoryID != -1 &&
+ inlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) {
+ setInlineRemark(CB, "recursive");
+ continue;
+ }
+ }
+
+ // FIXME for new PM: because of the old PM we currently generate ORE and
+ // in turn BFI on demand. With the new PM, the ORE dependency should
+ // just become a regular analysis dependency.
+ OptimizationRemarkEmitter ORE(Caller);
+
+ auto OIC = shouldInline(CB, GetInlineCost, ORE);
+ // If the policy determines that we should not inline this call site,
+ // move on to the next one.
+ if (!OIC)
+ continue;
+
+ // If this call site is dead and it is to a readonly function, we should
+ // just delete the call instead of trying to inline it, regardless of
+ // size. This happens because IPSCCP propagates the result out of the
+ // call and then we're left with the dead call.
+ if (IsTriviallyDead) {
+ LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << CB << "\n");
+ // Update the call graph by deleting the edge from Callee to Caller.
+ setInlineRemark(CB, "trivially dead");
+ CG[Caller]->removeCallEdgeFor(CB);
+ CB.eraseFromParent();
+ ++NumCallsDeleted;
+ } else {
+ // Get DebugLoc to report. CB will be invalid after inlining.
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *Block = CB.getParent();
+
+ // Attempt to inline the function.
+ using namespace ore;
+
+ InlineResult IR = inlineCallIfPossible(
+ CB, InlineInfo, InlinedArrayAllocas, InlineHistoryID,
+ InsertLifetime, AARGetter, ImportedFunctionsStats);
+ if (!IR.isSuccess()) {
+ setInlineRemark(CB, std::string(IR.getFailureReason()) + "; " +
+ inlineCostStr(*OIC));
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc,
+ Block)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", Caller) << ": "
+ << NV("Reason", IR.getFailureReason());
+ });
+ continue;
+ }
+ ++NumInlined;
+
+ emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+
+ // If inlining this function gave us any new call sites, throw them
+ // onto our worklist to process. They are useful inline candidates.
+ if (!InlineInfo.InlinedCalls.empty()) {
+ // Create a new inline history entry for this, so that we remember
+ // that these new callsites came about due to inlining Callee.
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));
+
+#ifndef NDEBUG
+ // Make sure there are no duplicates in the inline candidates. This
+ // could happen when a callsite is simplified to reuse the return
+ // value of another callsite during function cloning; the other
+ // callsite will then be reconsidered here.
+ DenseSet<CallBase *> DbgCallSites;
+ for (auto &II : CallSites)
+ DbgCallSites.insert(II.first);
+#endif
+
+ for (Value *Ptr : InlineInfo.InlinedCalls) {
+#ifndef NDEBUG
+ assert(DbgCallSites.count(dyn_cast<CallBase>(Ptr)) == 0);
+#endif
+ CallSites.push_back(
+ std::make_pair(dyn_cast<CallBase>(Ptr), NewHistoryID));
+ }
+ }
+ }
+
+ // If we inlined or deleted the last possible call site to the function,
+ // delete the function body now.
+ if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
+ // TODO: Can remove if in SCC now.
+ !SCCFunctions.count(Callee) &&
+ // The function may be apparently dead, but if there are indirect
+ // callgraph references to the node, we cannot delete it yet, as this
+ // could invalidate the CGSCC iterator.
+ CG[Callee]->getNumReferences() == 0) {
+ LLVM_DEBUG(dbgs() << " -> Deleting dead function: "
+ << Callee->getName() << "\n");
+ CallGraphNode *CalleeNode = CG[Callee];
+
+ // Remove any call graph edges from the callee to its callees.
+ CalleeNode->removeAllCalledFunctions();
+
+ // Remove the node for the callee from the call graph and delete it.
+ delete CG.removeFunctionFromModule(CalleeNode);
+ ++NumDeleted;
+ }
+
+ // Remove this call site from the list. If possible, use
+ // swap/pop_back for efficiency, but do not use it if doing so would
+ // move a call site to a function in this SCC before the
+ // 'FirstCallInSCC' barrier.
+ if (SCC.isSingular()) {
+ CallSites[CSi] = CallSites.back();
+ CallSites.pop_back();
+ } else {
+ CallSites.erase(CallSites.begin() + CSi);
+ }
+ --CSi;
+
+ Changed = true;
+ LocalChange = true;
+ }
+ } while (LocalChange);
+
+ return Changed;
+}
+
+bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ ACT = &getAnalysis<AssumptionCacheTracker>();
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
+ return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return inlineCallsImpl(
+ SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime,
+ [&](CallBase &CB) { return getInlineCost(CB); }, LegacyAARGetter(*this),
+ ImportedFunctionsStats);
+}
+
+/// Remove now-dead linkonce functions at the end of
+/// processing to avoid breaking the SCC traversal.
+bool LegacyInlinerBase::doFinalization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.dump(InlinerFunctionImportStats ==
+ InlinerFunctionImportStatsOpts::Verbose);
+ return removeDeadFunctions(CG);
+}
+
+/// Remove dead functions that are not included in the DNR (Do Not Remove) list.
+bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
+ bool AlwaysInlineOnly) {
+ SmallVector<CallGraphNode *, 16> FunctionsToRemove;
+ SmallVector<Function *, 16> DeadFunctionsInComdats;
+
+ auto RemoveCGN = [&](CallGraphNode *CGN) {
+ // Remove any call graph edges from the function to its callees.
+ CGN->removeAllCalledFunctions();
+
+ // Remove any edges from the external node to the function's call graph
+ // node. These edges might have been made irrelevant due to
+ // optimization of the program.
+ CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
+
+ // Queue the node for removal from the call graph; it is deleted below.
+ FunctionsToRemove.push_back(CGN);
+ };
+
+ // Scan for all of the functions, looking for ones that should now be removed
+ // from the program. Insert the dead ones in the FunctionsToRemove set.
+ for (const auto &I : CG) {
+ CallGraphNode *CGN = I.second.get();
+ Function *F = CGN->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ // Handle the case when this function is called and we only want to care
+ // about always-inline functions. This is a bit of a hack to share code
+ // between here and the InlineAlways pass.
+ if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline))
+ continue;
+
+ // If the only remaining users of the function are dead constants, remove
+ // them.
+ F->removeDeadConstantUsers();
+
+ if (!F->isDefTriviallyDead())
+ continue;
+
+ // It is unsafe to drop a function with discardable linkage from a COMDAT
+ // without also dropping the other members of the COMDAT.
+ // The inliner doesn't visit non-function entities which are in COMDAT
+ // groups so it is unsafe to do so *unless* the linkage is local.
+ if (!F->hasLocalLinkage()) {
+ if (F->hasComdat()) {
+ DeadFunctionsInComdats.push_back(F);
+ continue;
+ }
+ }
+
+ RemoveCGN(CGN);
+ }
+ if (!DeadFunctionsInComdats.empty()) {
+ // Filter out the functions whose comdats remain alive.
+ filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
+ // Remove the rest.
+ for (Function *F : DeadFunctionsInComdats)
+ RemoveCGN(CG[F]);
+ }
+
+ if (FunctionsToRemove.empty())
+ return false;
+
+ // Now that we know which functions to delete, do so. We didn't want to do
+ // this inline, because that would invalidate our CallGraph::iterator
+ // objects. :(
+ //
+ // Note that it doesn't matter that we are iterating over a non-stable order
+ // here, because the order in which the functions are deleted is
+ // irrelevant.
+ array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
+ FunctionsToRemove.erase(
+ std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()),
+ FunctionsToRemove.end());
+ for (CallGraphNode *CGN : FunctionsToRemove) {
+ delete CG.removeFunctionFromModule(CGN);
+ ++NumDeleted;
+ }
+ return true;
+}
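+
+// Usage sketch (editorial assumption): a mandatory-only inliner such as the
+// legacy always-inliner would reuse this helper with AlwaysInlineOnly=true so
+// that only now-unreferenced alwaysinline functions are dropped, e.g.:
+//   removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);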
+
+InlineAdvisor &
+InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
+ FunctionAnalysisManager &FAM, Module &M) {
if (OwnedAdvisor)
return *OwnedAdvisor;
- auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
- if (!IAA) {
- // It should still be possible to run the inliner as a stand-alone SCC pass,
- // for test scenarios. In that case, we default to the
- // DefaultInlineAdvisor, which doesn't need to keep state between SCC pass
- // runs. It also uses just the default InlineParams.
- // In this case, we need to use the provided FAM, which is valid for the
- // duration of the inliner pass, and thus the lifetime of the owned advisor.
- // The one we would get from the MAM can be invalidated as a result of the
- // inliner's activity.
+ auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
+ if (!IAA) {
+ // It should still be possible to run the inliner as a stand-alone SCC pass,
+ // for test scenarios. In that case, we default to the
+ // DefaultInlineAdvisor, which doesn't need to keep state between SCC pass
+ // runs. It also uses just the default InlineParams.
+ // In this case, we need to use the provided FAM, which is valid for the
+ // duration of the inliner pass, and thus the lifetime of the owned advisor.
+ // The one we would get from the MAM can be invalidated as a result of the
+ // inliner's activity.
OwnedAdvisor =
std::make_unique<DefaultInlineAdvisor>(M, FAM, getInlineParams());
@@ -663,376 +663,376 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
/*EmitRemarks=*/true);
return *OwnedAdvisor;
- }
- assert(IAA->getAdvisor() &&
- "Expected a present InlineAdvisorAnalysis also have an "
- "InlineAdvisor initialized");
- return *IAA->getAdvisor();
-}
-
-PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
- CGSCCAnalysisManager &AM, LazyCallGraph &CG,
- CGSCCUpdateResult &UR) {
- const auto &MAMProxy =
- AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG);
- bool Changed = false;
-
- assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
- Module &M = *InitialC.begin()->getFunction().getParent();
- ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(M);
-
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
- .getManager();
-
- InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
- Advisor.onPassEntry();
-
- auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
-
- // We use a single common worklist for calls across the entire SCC. We
- // process these in-order and append new calls introduced during inlining to
- // the end.
- //
- // Note that this particular order of processing is actually critical to
- // avoid very bad behaviors. Consider *highly connected* call graphs where
+ }
+ assert(IAA->getAdvisor() &&
+ "Expected a present InlineAdvisorAnalysis also have an "
+ "InlineAdvisor initialized");
+ return *IAA->getAdvisor();
+}
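+
+// Illustrative sketch (editorial, guarded out of the build): running the
+// inliner as a stand-alone CGSCC pass, roughly what `opt -passes='cgscc(inline)'`
+// sets up. This exercises the owned DefaultInlineAdvisor fallback above,
+// because no InlineAdvisorAnalysis result is cached in the module analysis
+// manager.
+#if 0
+ModulePassManager MPM;
+MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(InlinerPass()));
+#endif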
+
+PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
+ CGSCCAnalysisManager &AM, LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ const auto &MAMProxy =
+ AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG);
+ bool Changed = false;
+
+ assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
+ Module &M = *InitialC.begin()->getFunction().getParent();
+ ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(M);
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+ .getManager();
+
+ InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
+ Advisor.onPassEntry();
+
+ auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
+
+ // We use a single common worklist for calls across the entire SCC. We
+ // process these in-order and append new calls introduced during inlining to
+ // the end.
+ //
+ // Note that this particular order of processing is actually critical to
+ // avoid very bad behaviors. Consider *highly connected* call graphs where
// each function contains a small amount of code and a couple of calls to
- // other functions. Because the LLVM inliner is fundamentally a bottom-up
- // inliner, it can handle gracefully the fact that these all appear to be
- // reasonable inlining candidates as it will flatten things until they become
- // too big to inline, and then move on and flatten another batch.
- //
- // However, when processing call edges *within* an SCC we cannot rely on this
- // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
- // functions we can end up incrementally inlining N calls into each of
- // N functions because each incremental inlining decision looks good and we
- // don't have a topological ordering to prevent explosions.
- //
- // To compensate for this, we don't process transitive edges made immediate
- // by inlining until we've done one pass of inlining across the entire SCC.
- // Large, highly connected SCCs still lead to some amount of code bloat in
- // this model, but it is uniformly spread across all the functions in the SCC
- // and eventually they all become too large to inline, rather than
- // incrementally making a single function grow in a super-linear fashion.
- SmallVector<std::pair<CallBase *, int>, 16> Calls;
-
- // Populate the initial list of calls in this SCC.
- for (auto &N : InitialC) {
- auto &ORE =
- FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
- // We want to generally process call sites top-down in order for
- // simplifications stemming from replacing the call with the returned value
- // after inlining to be visible to subsequent inlining decisions.
- // FIXME: Using instructions sequence is a really bad way to do this.
- // Instead we should do an actual RPO walk of the function body.
- for (Instruction &I : instructions(N.getFunction()))
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (Function *Callee = CB->getCalledFunction()) {
- if (!Callee->isDeclaration())
- Calls.push_back({CB, -1});
- else if (!isa<IntrinsicInst>(I)) {
- using namespace ore;
- setInlineRemark(*CB, "unavailable definition");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", CB->getCaller())
- << " because its definition is unavailable"
- << setIsVerbose();
- });
- }
- }
- }
- if (Calls.empty())
- return PreservedAnalyses::all();
-
+ // other functions. Because the LLVM inliner is fundamentally a bottom-up
+ // inliner, it can handle gracefully the fact that these all appear to be
+ // reasonable inlining candidates as it will flatten things until they become
+ // too big to inline, and then move on and flatten another batch.
+ //
+ // However, when processing call edges *within* an SCC we cannot rely on this
+ // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+ // functions we can end up incrementally inlining N calls into each of
+ // N functions because each incremental inlining decision looks good and we
+ // don't have a topological ordering to prevent explosions.
+ //
+ // To compensate for this, we don't process transitive edges made immediate
+ // by inlining until we've done one pass of inlining across the entire SCC.
+ // Large, highly connected SCCs still lead to some amount of code bloat in
+ // this model, but it is uniformly spread across all the functions in the SCC
+ // and eventually they all become too large to inline, rather than
+ // incrementally making a single function grow in a super-linear fashion.
+ SmallVector<std::pair<CallBase *, int>, 16> Calls;
+
+ // Populate the initial list of calls in this SCC.
+ for (auto &N : InitialC) {
+ auto &ORE =
+ FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
+ // We want to generally process call sites top-down in order for
+ // simplifications stemming from replacing the call with the returned value
+ // after inlining to be visible to subsequent inlining decisions.
+ // FIXME: Using instructions sequence is a really bad way to do this.
+ // Instead we should do an actual RPO walk of the function body.
+ for (Instruction &I : instructions(N.getFunction()))
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CB->getCalledFunction()) {
+ if (!Callee->isDeclaration())
+ Calls.push_back({CB, -1});
+ else if (!isa<IntrinsicInst>(I)) {
+ using namespace ore;
+ setInlineRemark(*CB, "unavailable definition");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CB->getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose();
+ });
+ }
+ }
+ }
+ if (Calls.empty())
+ return PreservedAnalyses::all();
+
// Capture updatable variable for the current SCC.
- auto *C = &InitialC;
-
- // When inlining a callee produces new call sites, we want to keep track of
- // the fact that they were inlined from the callee. This allows us to avoid
- // infinite inlining in some obscure cases. To represent this, we use an
- // index into the InlineHistory vector.
- SmallVector<std::pair<Function *, int>, 16> InlineHistory;
-
- // Track a set vector of inlined callees so that we can augment the caller
- // with all of their edges in the call graph before pruning out the ones that
- // got simplified away.
- SmallSetVector<Function *, 4> InlinedCallees;
-
- // Track the dead functions to delete once finished with inlining calls. We
- // defer deleting these to make it easier to handle the call graph updates.
- SmallVector<Function *, 4> DeadFunctions;
-
- // Loop forward over all of the calls. Note that we cannot cache the size as
- // inlining can introduce new calls that need to be processed.
- for (int I = 0; I < (int)Calls.size(); ++I) {
- // We expect the calls to typically be batched with sequences of calls that
- // have the same caller, so we first set up some shared infrastructure for
- // this caller. We also do any pruning we can at this layer on the caller
- // alone.
- Function &F = *Calls[I].first->getCaller();
- LazyCallGraph::Node &N = *CG.lookup(F);
- if (CG.lookupSCC(N) != C)
- continue;
-
- LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
-
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
-
- // Now process as many calls as we have within this caller in the sequence.
- // We bail out as soon as the caller has to change so we can update the
- // call graph and prepare the context of that new caller.
- bool DidInline = false;
- for (; I < (int)Calls.size() && Calls[I].first->getCaller() == &F; ++I) {
- auto &P = Calls[I];
- CallBase *CB = P.first;
- const int InlineHistoryID = P.second;
- Function &Callee = *CB->getCalledFunction();
-
- if (InlineHistoryID != -1 &&
- inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
- setInlineRemark(*CB, "recursive");
- continue;
- }
-
- // Check if this inlining may break apart an SCC that has
- // already been split once before. In that case, inlining here may
- // trigger infinite inlining, much like is prevented within the inliner
- // itself by the InlineHistory above, but spread across CGSCC iterations
- // and thus hidden from the full inline history.
- if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
- UR.InlinedInternalEdges.count({&N, C})) {
- LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
- "previously split out of this SCC by inlining: "
- << F.getName() << " -> " << Callee.getName() << "\n");
- setInlineRemark(*CB, "recursive SCC split");
- continue;
- }
-
+ auto *C = &InitialC;
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 16> InlineHistory;
+
+ // Track a set vector of inlined callees so that we can augment the caller
+ // with all of their edges in the call graph before pruning out the ones that
+ // got simplified away.
+ SmallSetVector<Function *, 4> InlinedCallees;
+
+ // Track the dead functions to delete once finished with inlining calls. We
+ // defer deleting these to make it easier to handle the call graph updates.
+ SmallVector<Function *, 4> DeadFunctions;
+
+ // Loop forward over all of the calls. Note that we cannot cache the size as
+ // inlining can introduce new calls that need to be processed.
+ for (int I = 0; I < (int)Calls.size(); ++I) {
+ // We expect the calls to typically be batched with sequences of calls that
+ // have the same caller, so we first set up some shared infrastructure for
+ // this caller. We also do any pruning we can at this layer on the caller
+ // alone.
+ Function &F = *Calls[I].first->getCaller();
+ LazyCallGraph::Node &N = *CG.lookup(F);
+ if (CG.lookupSCC(N) != C)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
+
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ // Now process as many calls as we have within this caller in the sequence.
+ // We bail out as soon as the caller has to change so we can update the
+ // call graph and prepare the context of that new caller.
+ bool DidInline = false;
+ for (; I < (int)Calls.size() && Calls[I].first->getCaller() == &F; ++I) {
+ auto &P = Calls[I];
+ CallBase *CB = P.first;
+ const int InlineHistoryID = P.second;
+ Function &Callee = *CB->getCalledFunction();
+
+ if (InlineHistoryID != -1 &&
+ inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
+ setInlineRemark(*CB, "recursive");
+ continue;
+ }
+
+ // Check if this inlining may break apart an SCC that has
+ // already been split once before. In that case, inlining here may
+ // trigger infinite inlining, much like is prevented within the inliner
+ // itself by the InlineHistory above, but spread across CGSCC iterations
+ // and thus hidden from the full inline history.
+ if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
+ UR.InlinedInternalEdges.count({&N, C})) {
+ LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+ "previously split out of this SCC by inlining: "
+ << F.getName() << " -> " << Callee.getName() << "\n");
+ setInlineRemark(*CB, "recursive SCC split");
+ continue;
+ }
+
auto Advice = Advisor.getAdvice(*CB, OnlyMandatory);
- // Check whether we want to inline this callsite.
- if (!Advice->isInliningRecommended()) {
- Advice->recordUnattemptedInlining();
- continue;
- }
-
- // Setup the data structure used to plumb customization into the
- // `InlineFunction` routine.
- InlineFunctionInfo IFI(
- /*cg=*/nullptr, GetAssumptionCache, PSI,
- &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
- &FAM.getResult<BlockFrequencyAnalysis>(Callee));
-
+ // Check whether we want to inline this callsite.
+ if (!Advice->isInliningRecommended()) {
+ Advice->recordUnattemptedInlining();
+ continue;
+ }
+
+ // Setup the data structure used to plumb customization into the
+ // `InlineFunction` routine.
+ InlineFunctionInfo IFI(
+ /*cg=*/nullptr, GetAssumptionCache, PSI,
+ &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
+ &FAM.getResult<BlockFrequencyAnalysis>(Callee));
+
InlineResult IR =
InlineFunction(*CB, IFI, &FAM.getResult<AAManager>(*CB->getCaller()));
- if (!IR.isSuccess()) {
- Advice->recordUnsuccessfulInlining(IR);
- continue;
- }
-
- DidInline = true;
- InlinedCallees.insert(&Callee);
- ++NumInlined;
-
- // Add any new callsites to defined functions to the worklist.
- if (!IFI.InlinedCallSites.empty()) {
- int NewHistoryID = InlineHistory.size();
- InlineHistory.push_back({&Callee, InlineHistoryID});
-
- for (CallBase *ICB : reverse(IFI.InlinedCallSites)) {
- Function *NewCallee = ICB->getCalledFunction();
- if (!NewCallee) {
- // Try to promote an indirect (virtual) call without waiting for
- // the post-inline cleanup and the next DevirtSCCRepeatedPass
- // iteration because the next iteration may not happen and we may
- // miss inlining it.
- if (tryPromoteCall(*ICB))
- NewCallee = ICB->getCalledFunction();
- }
- if (NewCallee)
- if (!NewCallee->isDeclaration())
- Calls.push_back({ICB, NewHistoryID});
- }
- }
-
- // Merge the attributes based on the inlining.
- AttributeFuncs::mergeAttributesForInlining(F, Callee);
-
- // For local functions, check whether this makes the callee trivially
- // dead. In that case, we can drop the body of the function eagerly
- // which may reduce the number of callers of other functions to one,
- // changing inline cost thresholds.
- bool CalleeWasDeleted = false;
- if (Callee.hasLocalLinkage()) {
- // To check this we also need to nuke any dead constant uses (perhaps
- // made dead by this operation on other functions).
- Callee.removeDeadConstantUsers();
- if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
- Calls.erase(
- std::remove_if(Calls.begin() + I + 1, Calls.end(),
- [&](const std::pair<CallBase *, int> &Call) {
- return Call.first->getCaller() == &Callee;
- }),
- Calls.end());
- // Clear the body and queue the function itself for deletion when we
- // finish inlining and call graph updates.
- // Note that after this point, it is an error to do anything other
- // than use the callee's address or delete it.
- Callee.dropAllReferences();
+ if (!IR.isSuccess()) {
+ Advice->recordUnsuccessfulInlining(IR);
+ continue;
+ }
+
+ DidInline = true;
+ InlinedCallees.insert(&Callee);
+ ++NumInlined;
+
+ // Add any new callsites to defined functions to the worklist.
+ if (!IFI.InlinedCallSites.empty()) {
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back({&Callee, InlineHistoryID});
+
+ for (CallBase *ICB : reverse(IFI.InlinedCallSites)) {
+ Function *NewCallee = ICB->getCalledFunction();
+ if (!NewCallee) {
+ // Try to promote an indirect (virtual) call without waiting for
+ // the post-inline cleanup and the next DevirtSCCRepeatedPass
+ // iteration because the next iteration may not happen and we may
+ // miss inlining it.
+ if (tryPromoteCall(*ICB))
+ NewCallee = ICB->getCalledFunction();
+ }
+ if (NewCallee)
+ if (!NewCallee->isDeclaration())
+ Calls.push_back({ICB, NewHistoryID});
+ }
+ }
+
+ // Merge the attributes based on the inlining.
+ AttributeFuncs::mergeAttributesForInlining(F, Callee);
+
+ // For local functions, check whether this makes the callee trivially
+ // dead. In that case, we can drop the body of the function eagerly
+ // which may reduce the number of callers of other functions to one,
+ // changing inline cost thresholds.
+ bool CalleeWasDeleted = false;
+ if (Callee.hasLocalLinkage()) {
+ // To check this we also need to nuke any dead constant uses (perhaps
+ // made dead by this operation on other functions).
+ Callee.removeDeadConstantUsers();
+ if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
+ Calls.erase(
+ std::remove_if(Calls.begin() + I + 1, Calls.end(),
+ [&](const std::pair<CallBase *, int> &Call) {
+ return Call.first->getCaller() == &Callee;
+ }),
+ Calls.end());
+ // Clear the body and queue the function itself for deletion when we
+ // finish inlining and call graph updates.
+ // Note that after this point, it is an error to do anything other
+ // than use the callee's address or delete it.
+ Callee.dropAllReferences();
assert(!is_contained(DeadFunctions, &Callee) &&
- "Cannot put cause a function to become dead twice!");
- DeadFunctions.push_back(&Callee);
- CalleeWasDeleted = true;
- }
- }
- if (CalleeWasDeleted)
- Advice->recordInliningWithCalleeDeleted();
- else
- Advice->recordInlining();
- }
-
- // Back the call index up by one to put us in a good position to go around
- // the outer loop.
- --I;
-
- if (!DidInline)
- continue;
- Changed = true;
-
- // At this point, since we have made changes we have at least removed
- // a call instruction. However, in the process we do some incremental
- // simplification of the surrounding code. This simplification can
- // essentially do all of the same things as a function pass and we can
- // re-use the exact same logic for updating the call graph to reflect the
- // change.
-
- // Inside the update, we also update the FunctionAnalysisManager in the
- // proxy for this particular SCC. We do this as the SCC may have changed and
- // as we're going to mutate this particular function we want to make sure
- // the proxy is in place to forward any invalidation events.
- LazyCallGraph::SCC *OldC = C;
+ "Cannot put cause a function to become dead twice!");
+ DeadFunctions.push_back(&Callee);
+ CalleeWasDeleted = true;
+ }
+ }
+ if (CalleeWasDeleted)
+ Advice->recordInliningWithCalleeDeleted();
+ else
+ Advice->recordInlining();
+ }
+
+ // Back the call index up by one to put us in a good position to go around
+ // the outer loop.
+ --I;
+
+ if (!DidInline)
+ continue;
+ Changed = true;
+
+ // At this point, since we have made changes we have at least removed
+ // a call instruction. However, in the process we do some incremental
+ // simplification of the surrounding code. This simplification can
+ // essentially do all of the same things as a function pass and we can
+ // re-use the exact same logic for updating the call graph to reflect the
+ // change.
+
+ // Inside the update, we also update the FunctionAnalysisManager in the
+ // proxy for this particular SCC. We do this as the SCC may have changed and
+ // as we're going to mutate this particular function we want to make sure
+ // the proxy is in place to forward any invalidation events.
+ LazyCallGraph::SCC *OldC = C;
C = &updateCGAndAnalysisManagerForCGSCCPass(CG, *C, N, AM, UR, FAM);
- LLVM_DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
-
- // If this causes an SCC to split apart into multiple smaller SCCs, there
- // is a subtle risk we need to prepare for. Other transformations may
- // expose an "infinite inlining" opportunity later, and because of the SCC
- // mutation, we will revisit this function and potentially re-inline. If we
- // do, and that re-inlining also has the potential to mutate the SCC
- // structure, the infinite inlining problem can manifest through infinite
- // SCC splits and merges. To avoid this, we capture the originating caller
- // node and the SCC containing the call edge. This is a slight over
- // approximation of the possible inlining decisions that must be avoided,
- // but is relatively efficient to store. We use C != OldC to know when
- // a new SCC is generated and the original SCC may be generated via merge
- // in later iterations.
- //
- // It is also possible that even if no new SCC is generated
- // (i.e., C == OldC), the original SCC could be split and then merged
- // into the same one as itself, and the original SCC will be added into
- // UR.CWorklist again; we want to catch such cases too.
- //
- // FIXME: This seems like a very heavyweight way of retaining the inline
- // history, we should look for a more efficient way of tracking it.
- if ((C != OldC || UR.CWorklist.count(OldC)) &&
- llvm::any_of(InlinedCallees, [&](Function *Callee) {
- return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
- })) {
- LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
- "retaining this to avoid infinite inlining.\n");
- UR.InlinedInternalEdges.insert({&N, OldC});
- }
- InlinedCallees.clear();
- }
-
- // Now that we've finished inlining all of the calls across this SCC, delete
- // all of the trivially dead functions, updating the call graph and the CGSCC
- // pass manager in the process.
- //
- // Note that this walks a pointer set which has non-deterministic order but
- // that is OK as all we do is delete things and add pointers to unordered
- // sets.
- for (Function *DeadF : DeadFunctions) {
- // Get the necessary information out of the call graph and nuke the
- // function there. Also, clear out any cached analyses.
- auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
- FAM.clear(*DeadF, DeadF->getName());
- AM.clear(DeadC, DeadC.getName());
- auto &DeadRC = DeadC.getOuterRefSCC();
- CG.removeDeadFunction(*DeadF);
-
- // Mark the relevant parts of the call graph as invalid so we don't visit
- // them.
- UR.InvalidatedSCCs.insert(&DeadC);
- UR.InvalidatedRefSCCs.insert(&DeadRC);
-
- // And delete the actual function from the module.
- // The Advisor may use Function pointers to efficiently index various
- // internal maps, e.g. for memoization. Function cleanup passes like
- // argument promotion create new functions. It is possible for a new
- // function to be allocated at the address of a deleted function. We could
- // index using names, but that's inefficient. Alternatively, we let the
- // Advisor free the functions when it sees fit.
- DeadF->getBasicBlockList().clear();
- M.getFunctionList().remove(DeadF);
-
- ++NumDeleted;
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- // Even if we change the IR, we update the core CGSCC data structures and so
- // can preserve the proxy to the function analysis manager.
- PreservedAnalyses PA;
- PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
- return PA;
-}
-
-ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
- bool Debugging,
+ LLVM_DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
+
+ // If this causes an SCC to split apart into multiple smaller SCCs, there
+ // is a subtle risk we need to prepare for. Other transformations may
+ // expose an "infinite inlining" opportunity later, and because of the SCC
+ // mutation, we will revisit this function and potentially re-inline. If we
+ // do, and that re-inlining also has the potential to mutate the SCC
+ // structure, the infinite inlining problem can manifest through infinite
+ // SCC splits and merges. To avoid this, we capture the originating caller
+ // node and the SCC containing the call edge. This is a slight over
+ // approximation of the possible inlining decisions that must be avoided,
+ // but is relatively efficient to store. We use C != OldC to know when
+ // a new SCC is generated and the original SCC may be generated via merge
+ // in later iterations.
+ //
+ // It is also possible that even if no new SCC is generated
+ // (i.e., C == OldC), the original SCC could be split and then merged
+ // into the same one as itself, and the original SCC will be added into
+ // UR.CWorklist again; we want to catch such cases too.
+ //
+ // FIXME: This seems like a very heavyweight way of retaining the inline
+ // history, we should look for a more efficient way of tracking it.
+ if ((C != OldC || UR.CWorklist.count(OldC)) &&
+ llvm::any_of(InlinedCallees, [&](Function *Callee) {
+ return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
+ })) {
+ LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+ "retaining this to avoid infinite inlining.\n");
+ UR.InlinedInternalEdges.insert({&N, OldC});
+ }
+ InlinedCallees.clear();
+ }
+
+ // Now that we've finished inlining all of the calls across this SCC, delete
+ // all of the trivially dead functions, updating the call graph and the CGSCC
+ // pass manager in the process.
+ //
+ // Note that this walks a pointer set which has non-deterministic order but
+ // that is OK as all we do is delete things and add pointers to unordered
+ // sets.
+ for (Function *DeadF : DeadFunctions) {
+ // Get the necessary information out of the call graph and nuke the
+ // function there. Also, clear out any cached analyses.
+ auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
+ FAM.clear(*DeadF, DeadF->getName());
+ AM.clear(DeadC, DeadC.getName());
+ auto &DeadRC = DeadC.getOuterRefSCC();
+ CG.removeDeadFunction(*DeadF);
+
+ // Mark the relevant parts of the call graph as invalid so we don't visit
+ // them.
+ UR.InvalidatedSCCs.insert(&DeadC);
+ UR.InvalidatedRefSCCs.insert(&DeadRC);
+
+ // And delete the actual function from the module.
+ // The Advisor may use Function pointers to efficiently index various
+ // internal maps, e.g. for memoization. Function cleanup passes like
+ // argument promotion create new functions. It is possible for a new
+ // function to be allocated at the address of a deleted function. We could
+ // index using names, but that's inefficient. Alternatively, we let the
+ // Advisor free the functions when it sees fit.
+ DeadF->getBasicBlockList().clear();
+ M.getFunctionList().remove(DeadF);
+
+ ++NumDeleted;
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // Even if we change the IR, we update the core CGSCC data structures and so
+ // can preserve the proxy to the function analysis manager.
+ PreservedAnalyses PA;
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ return PA;
+}
+
+ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
+ bool Debugging,
bool MandatoryFirst,
- InliningAdvisorMode Mode,
- unsigned MaxDevirtIterations)
- : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations),
- PM(Debugging), MPM(Debugging) {
- // Run the inliner first. The theory is that we are walking bottom-up and so
- // the callees have already been fully optimized, and we want to inline them
- // into the callers so that our optimizations can reflect that.
- // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO
- // because it makes profile annotation in the backend inaccurate.
+ InliningAdvisorMode Mode,
+ unsigned MaxDevirtIterations)
+ : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations),
+ PM(Debugging), MPM(Debugging) {
+ // Run the inliner first. The theory is that we are walking bottom-up and so
+ // the callees have already been fully optimized, and we want to inline them
+ // into the callers so that our optimizations can reflect that.
+ // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO
+ // because it makes profile annotation in the backend inaccurate.
if (MandatoryFirst)
PM.addPass(InlinerPass(/*OnlyMandatory*/ true));
- PM.addPass(InlinerPass());
-}
-
-PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
+ PM.addPass(InlinerPass());
+}
+
+PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
if (!IAA.tryCreate(Params, Mode, CGSCCInlineReplayFile)) {
- M.getContext().emitError(
- "Could not setup Inlining Advisor for the requested "
- "mode and/or options");
- return PreservedAnalyses::all();
- }
-
- // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
- // to detect when we devirtualize indirect calls and iterate the SCC passes
- // in that case to try and catch knock-on inlining or function attrs
- // opportunities. Then we add it to the module pipeline by walking the SCCs
- // in postorder (or bottom-up).
- // If MaxDevirtIterations is 0, we just don't use the devirtualization
- // wrapper.
- if (MaxDevirtIterations == 0)
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(PM)));
- else
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
- createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations)));
- auto Ret = MPM.run(M, MAM);
-
- IAA.clear();
- return Ret;
-}
+ M.getContext().emitError(
+ "Could not setup Inlining Advisor for the requested "
+ "mode and/or options");
+ return PreservedAnalyses::all();
+ }
+
+ // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
+ // to detect when we devirtualize indirect calls and iterate the SCC passes
+ // in that case to try and catch knock-on inlining or function attrs
+ // opportunities. Then we add it to the module pipeline by walking the SCCs
+ // in postorder (or bottom-up).
+ // If MaxDevirtIterations is 0, we just don't use the devirtualization
+ // wrapper.
+ if (MaxDevirtIterations == 0)
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(PM)));
+ else
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations)));
+ auto Ret = MPM.run(M, MAM);
+
+ IAA.clear();
+ return Ret;
+}
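+
+// Illustrative sketch (editorial, guarded out of the build): constructing a
+// module pipeline around ModuleInlinerWrapperPass. The helper name and the
+// parameter values below are assumptions chosen only for the example.
+#if 0
+static ModulePassManager buildExampleInlinerPipeline() {
+  ModulePassManager MPM;
+  InlineParams Params = getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/0);
+  MPM.addPass(ModuleInlinerWrapperPass(Params, /*Debugging=*/false,
+                                       /*MandatoryFirst=*/true,
+                                       InliningAdvisorMode::Default,
+                                       /*MaxDevirtIterations=*/4));
+  return MPM;
+}
+#endif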
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp
index 77c13436a5..e1644819af 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp
@@ -1,291 +1,291 @@
-//===-- Internalize.cpp - Mark functions internal -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass loops over all of the functions and variables in the input module.
-// If the function or variable does not need to be preserved according to the
-// client supplied callback, it is marked as internal.
-//
-// This transformation would not be legal in a regular compilation, but it gets
-// extra information from the linker about what is safe.
-//
-// For example, internalizing a function with external linkage is safe only if
-// we are told that it is used solely from within this module.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Internalize.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "internalize"
-
-STATISTIC(NumAliases, "Number of aliases internalized");
-STATISTIC(NumFunctions, "Number of functions internalized");
-STATISTIC(NumGlobals, "Number of global vars internalized");
-
-// APIFile - A file which contains a list of symbols that should not be marked
-// internal.
-static cl::opt<std::string>
- APIFile("internalize-public-api-file", cl::value_desc("filename"),
- cl::desc("A file containing list of symbol names to preserve"));
-
-// APIList - A list of symbols that should not be marked internal.
-static cl::list<std::string>
- APIList("internalize-public-api-list", cl::value_desc("list"),
- cl::desc("A list of symbol names to preserve"), cl::CommaSeparated);
-
-namespace {
-// Helper to load an API list to preserve from file and expose it as a functor
-// for internalization.
-class PreserveAPIList {
-public:
- PreserveAPIList() {
- if (!APIFile.empty())
- LoadFile(APIFile);
- ExternalNames.insert(APIList.begin(), APIList.end());
- }
-
- bool operator()(const GlobalValue &GV) {
- return ExternalNames.count(GV.getName());
- }
-
-private:
- // Contains the set of symbols loaded from file
- StringSet<> ExternalNames;
-
- void LoadFile(StringRef Filename) {
- // Load the APIFile...
- ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
- MemoryBuffer::getFile(Filename);
- if (!Buf) {
- errs() << "WARNING: Internalize couldn't load file '" << Filename
- << "'! Continuing as if it's empty.\n";
- return; // Just continue as if the file were empty
- }
- for (line_iterator I(*Buf->get(), true), E; I != E; ++I)
- ExternalNames.insert(*I);
- }
-};
-} // end anonymous namespace
-
-bool InternalizePass::shouldPreserveGV(const GlobalValue &GV) {
- // Function must be defined here
- if (GV.isDeclaration())
- return true;
-
- // Available externally is really just a "declaration with a body".
- if (GV.hasAvailableExternallyLinkage())
- return true;
-
- // Assume that dllexported symbols are referenced elsewhere
- if (GV.hasDLLExportStorageClass())
- return true;
-
- // Already local, has nothing to do.
- if (GV.hasLocalLinkage())
- return false;
-
- // Check some special cases
- if (AlwaysPreserved.count(GV.getName()))
- return true;
-
- return MustPreserveGV(GV);
-}
-
-bool InternalizePass::maybeInternalize(
- GlobalValue &GV, const DenseSet<const Comdat *> &ExternalComdats) {
- if (Comdat *C = GV.getComdat()) {
- if (ExternalComdats.count(C))
- return false;
-
- // If a comdat is not externally visible we can drop it.
- if (auto GO = dyn_cast<GlobalObject>(&GV))
- GO->setComdat(nullptr);
-
- if (GV.hasLocalLinkage())
- return false;
- } else {
- if (GV.hasLocalLinkage())
- return false;
-
- if (shouldPreserveGV(GV))
- return false;
- }
-
- GV.setVisibility(GlobalValue::DefaultVisibility);
- GV.setLinkage(GlobalValue::InternalLinkage);
- return true;
-}
-
-// If GV is part of a comdat and is externally visible, keep track of its
-// comdat so that we don't internalize any of its members.
-void InternalizePass::checkComdatVisibility(
- GlobalValue &GV, DenseSet<const Comdat *> &ExternalComdats) {
- Comdat *C = GV.getComdat();
- if (!C)
- return;
-
- if (shouldPreserveGV(GV))
- ExternalComdats.insert(C);
-}
-
-bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
- bool Changed = false;
- CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
-
- SmallPtrSet<GlobalValue *, 8> Used;
- collectUsedGlobalVariables(M, Used, false);
-
- // Collect comdat visibility information for the module.
- DenseSet<const Comdat *> ExternalComdats;
- if (!M.getComdatSymbolTable().empty()) {
- for (Function &F : M)
- checkComdatVisibility(F, ExternalComdats);
- for (GlobalVariable &GV : M.globals())
- checkComdatVisibility(GV, ExternalComdats);
- for (GlobalAlias &GA : M.aliases())
- checkComdatVisibility(GA, ExternalComdats);
- }
-
- // We must assume that globals in llvm.used have a reference that not even
- // the linker can see, so we don't internalize them.
- // For llvm.compiler.used the situation is a bit fuzzy. The assembler and
- // linker can drop those symbols. If this pass is running as part of LTO,
- // one might think that it could just drop llvm.compiler.used. The problem
- // is that even in LTO llvm doesn't see every reference. For example,
- // we don't see references from function local inline assembly. To be
- // conservative, we internalize symbols in llvm.compiler.used, but we
- // keep llvm.compiler.used so that the symbol is not deleted by llvm.
- for (GlobalValue *V : Used) {
- AlwaysPreserved.insert(V->getName());
- }
-
- // Mark all functions not in the api as internal.
- for (Function &I : M) {
- if (!maybeInternalize(I, ExternalComdats))
- continue;
- Changed = true;
-
- if (ExternalNode)
- // Remove a callgraph edge from the external node to this function.
- ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
-
- ++NumFunctions;
- LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
- }
-
- // Never internalize the llvm.used symbol. It is used to implement
- // attribute((used)).
- // FIXME: Shouldn't this just filter on llvm.metadata section??
- AlwaysPreserved.insert("llvm.used");
- AlwaysPreserved.insert("llvm.compiler.used");
-
- // Never internalize anchors used by the machine module info, else the info
- // won't find them. (see MachineModuleInfo.)
- AlwaysPreserved.insert("llvm.global_ctors");
- AlwaysPreserved.insert("llvm.global_dtors");
- AlwaysPreserved.insert("llvm.global.annotations");
-
- // Never internalize symbols code-gen inserts.
- // FIXME: We should probably add this (and the __stack_chk_guard) via some
- // type of call-back in CodeGen.
- AlwaysPreserved.insert("__stack_chk_fail");
- AlwaysPreserved.insert("__stack_chk_guard");
-
- // Mark all global variables with initializers that are not in the api as
- // internal as well.
- for (auto &GV : M.globals()) {
- if (!maybeInternalize(GV, ExternalComdats))
- continue;
- Changed = true;
-
- ++NumGlobals;
- LLVM_DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
- }
-
- // Mark all aliases that are not in the api as internal as well.
- for (auto &GA : M.aliases()) {
- if (!maybeInternalize(GA, ExternalComdats))
- continue;
- Changed = true;
-
- ++NumAliases;
- LLVM_DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
- }
-
- return Changed;
-}
-
-InternalizePass::InternalizePass() : MustPreserveGV(PreserveAPIList()) {}
-
-PreservedAnalyses InternalizePass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!internalizeModule(M, AM.getCachedResult<CallGraphAnalysis>(M)))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<CallGraphAnalysis>();
- return PA;
-}
-
-namespace {
-class InternalizeLegacyPass : public ModulePass {
-  // Client-supplied callback to control whether a symbol must be preserved.
- std::function<bool(const GlobalValue &)> MustPreserveGV;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- InternalizeLegacyPass() : ModulePass(ID), MustPreserveGV(PreserveAPIList()) {}
-
- InternalizeLegacyPass(std::function<bool(const GlobalValue &)> MustPreserveGV)
- : ModulePass(ID), MustPreserveGV(std::move(MustPreserveGV)) {
- initializeInternalizeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- CallGraphWrapperPass *CGPass =
- getAnalysisIfAvailable<CallGraphWrapperPass>();
- CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
- return internalizeModule(M, MustPreserveGV, CG);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<CallGraphWrapperPass>();
- }
-};
-}
-
-char InternalizeLegacyPass::ID = 0;
-INITIALIZE_PASS(InternalizeLegacyPass, "internalize",
- "Internalize Global Symbols", false, false)
-
-ModulePass *llvm::createInternalizePass() {
- return new InternalizeLegacyPass();
-}
-
-ModulePass *llvm::createInternalizePass(
- std::function<bool(const GlobalValue &)> MustPreserveGV) {
- return new InternalizeLegacyPass(std::move(MustPreserveGV));
-}
+//===-- Internalize.cpp - Mark functions internal -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions and variables in the input module.
+// If the function or variable does not need to be preserved according to the
+// client supplied callback, it is marked as internal.
+//
+// This transformation would not be legal in a regular compilation, but it gets
+// extra information from the linker about what is safe.
+//
+// For example, internalizing a function with external linkage is safe only if
+// we are told that it is used solely from within this module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "internalize"
+
+STATISTIC(NumAliases, "Number of aliases internalized");
+STATISTIC(NumFunctions, "Number of functions internalized");
+STATISTIC(NumGlobals, "Number of global vars internalized");
+
+// APIFile - A file which contains a list of symbols that should not be marked
+// internal (i.e. that should be preserved).
+static cl::opt<std::string>
+ APIFile("internalize-public-api-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of symbol names to preserve"));
+
+// APIList - A list of symbols that should not be marked internal.
+static cl::list<std::string>
+ APIList("internalize-public-api-list", cl::value_desc("list"),
+ cl::desc("A list of symbol names to preserve"), cl::CommaSeparated);
+
+namespace {
+// Helper to load an API list to preserve from file and expose it as a functor
+// for internalization.
+class PreserveAPIList {
+public:
+ PreserveAPIList() {
+ if (!APIFile.empty())
+ LoadFile(APIFile);
+ ExternalNames.insert(APIList.begin(), APIList.end());
+ }
+
+ bool operator()(const GlobalValue &GV) {
+ return ExternalNames.count(GV.getName());
+ }
+
+private:
+ // Contains the set of symbols loaded from file
+ StringSet<> ExternalNames;
+
+ void LoadFile(StringRef Filename) {
+ // Load the APIFile...
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+ MemoryBuffer::getFile(Filename);
+ if (!Buf) {
+ errs() << "WARNING: Internalize couldn't load file '" << Filename
+ << "'! Continuing as if it's empty.\n";
+ return; // Just continue as if the file were empty
+ }
+ for (line_iterator I(*Buf->get(), true), E; I != E; ++I)
+ ExternalNames.insert(*I);
+ }
+};
+} // end anonymous namespace
+
+bool InternalizePass::shouldPreserveGV(const GlobalValue &GV) {
+ // Function must be defined here
+ if (GV.isDeclaration())
+ return true;
+
+ // Available externally is really just a "declaration with a body".
+ if (GV.hasAvailableExternallyLinkage())
+ return true;
+
+ // Assume that dllexported symbols are referenced elsewhere
+ if (GV.hasDLLExportStorageClass())
+ return true;
+
+ // Already local, has nothing to do.
+ if (GV.hasLocalLinkage())
+ return false;
+
+ // Check some special cases
+ if (AlwaysPreserved.count(GV.getName()))
+ return true;
+
+ return MustPreserveGV(GV);
+}
+
+bool InternalizePass::maybeInternalize(
+ GlobalValue &GV, const DenseSet<const Comdat *> &ExternalComdats) {
+ if (Comdat *C = GV.getComdat()) {
+ if (ExternalComdats.count(C))
+ return false;
+
+ // If a comdat is not externally visible we can drop it.
+ if (auto GO = dyn_cast<GlobalObject>(&GV))
+ GO->setComdat(nullptr);
+
+ if (GV.hasLocalLinkage())
+ return false;
+ } else {
+ if (GV.hasLocalLinkage())
+ return false;
+
+ if (shouldPreserveGV(GV))
+ return false;
+ }
+
+ GV.setVisibility(GlobalValue::DefaultVisibility);
+ GV.setLinkage(GlobalValue::InternalLinkage);
+ return true;
+}
+
+// If GV is part of a comdat and is externally visible, keep track of its
+// comdat so that we don't internalize any of its members.
+void InternalizePass::checkComdatVisibility(
+ GlobalValue &GV, DenseSet<const Comdat *> &ExternalComdats) {
+ Comdat *C = GV.getComdat();
+ if (!C)
+ return;
+
+ if (shouldPreserveGV(GV))
+ ExternalComdats.insert(C);
+}
+
+bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
+ bool Changed = false;
+ CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
+
+ SmallPtrSet<GlobalValue *, 8> Used;
+ collectUsedGlobalVariables(M, Used, false);
+
+  // Collect comdat visibility information for the module.
+ DenseSet<const Comdat *> ExternalComdats;
+ if (!M.getComdatSymbolTable().empty()) {
+ for (Function &F : M)
+ checkComdatVisibility(F, ExternalComdats);
+ for (GlobalVariable &GV : M.globals())
+ checkComdatVisibility(GV, ExternalComdats);
+ for (GlobalAlias &GA : M.aliases())
+ checkComdatVisibility(GA, ExternalComdats);
+ }
+
+ // We must assume that globals in llvm.used have a reference that not even
+ // the linker can see, so we don't internalize them.
+ // For llvm.compiler.used the situation is a bit fuzzy. The assembler and
+ // linker can drop those symbols. If this pass is running as part of LTO,
+ // one might think that it could just drop llvm.compiler.used. The problem
+ // is that even in LTO llvm doesn't see every reference. For example,
+ // we don't see references from function local inline assembly. To be
+ // conservative, we internalize symbols in llvm.compiler.used, but we
+ // keep llvm.compiler.used so that the symbol is not deleted by llvm.
+ for (GlobalValue *V : Used) {
+ AlwaysPreserved.insert(V->getName());
+ }
+
+ // Mark all functions not in the api as internal.
+ for (Function &I : M) {
+ if (!maybeInternalize(I, ExternalComdats))
+ continue;
+ Changed = true;
+
+ if (ExternalNode)
+ // Remove a callgraph edge from the external node to this function.
+ ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
+
+ ++NumFunctions;
+ LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
+ }
+
+ // Never internalize the llvm.used symbol. It is used to implement
+ // attribute((used)).
+ // FIXME: Shouldn't this just filter on llvm.metadata section??
+ AlwaysPreserved.insert("llvm.used");
+ AlwaysPreserved.insert("llvm.compiler.used");
+
+ // Never internalize anchors used by the machine module info, else the info
+ // won't find them. (see MachineModuleInfo.)
+ AlwaysPreserved.insert("llvm.global_ctors");
+ AlwaysPreserved.insert("llvm.global_dtors");
+ AlwaysPreserved.insert("llvm.global.annotations");
+
+ // Never internalize symbols code-gen inserts.
+ // FIXME: We should probably add this (and the __stack_chk_guard) via some
+ // type of call-back in CodeGen.
+ AlwaysPreserved.insert("__stack_chk_fail");
+ AlwaysPreserved.insert("__stack_chk_guard");
+
+ // Mark all global variables with initializers that are not in the api as
+ // internal as well.
+ for (auto &GV : M.globals()) {
+ if (!maybeInternalize(GV, ExternalComdats))
+ continue;
+ Changed = true;
+
+ ++NumGlobals;
+ LLVM_DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
+ }
+
+ // Mark all aliases that are not in the api as internal as well.
+ for (auto &GA : M.aliases()) {
+ if (!maybeInternalize(GA, ExternalComdats))
+ continue;
+ Changed = true;
+
+ ++NumAliases;
+ LLVM_DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
+ }
+
+ return Changed;
+}
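
A minimal caller-side sketch of the llvm.used behaviour handled above, not part of this patch: anything a hypothetical pinSymbol() helper appends to llvm.used ends up in the Used set collected by internalizeModule() and is therefore added to AlwaysPreserved rather than internalized.

    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/ModuleUtils.h"

    // Hypothetical helper: globals placed on llvm.used survive internalization.
    static void pinSymbol(llvm::Module &M, llvm::GlobalValue &GV) {
      llvm::appendToUsed(M, {&GV});
    }
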
+
+InternalizePass::InternalizePass() : MustPreserveGV(PreserveAPIList()) {}
+
+PreservedAnalyses InternalizePass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!internalizeModule(M, AM.getCachedResult<CallGraphAnalysis>(M)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
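
For orientation, a small new-pass-manager driver sketch follows. It is illustrative only; runInternalize is a hypothetical function, "main" is an assumed symbol to keep external, and it relies on the callback-taking InternalizePass constructor declared in Internalize.h plus the usual PassBuilder analysis registration.

    #include "llvm/Analysis/CGSCCPassManager.h"
    #include "llvm/Analysis/LoopAnalysisManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/IPO/Internalize.h"

    static void runInternalize(llvm::Module &M) {
      llvm::PassBuilder PB;
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      llvm::ModulePassManager MPM;
      // Preserve only "main"; every other defined symbol becomes internal.
      MPM.addPass(llvm::InternalizePass(
          [](const llvm::GlobalValue &GV) { return GV.getName() == "main"; }));
      MPM.run(M, MAM);
    }
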
+
+namespace {
+class InternalizeLegacyPass : public ModulePass {
+  // Client-supplied callback to control whether a symbol must be preserved.
+ std::function<bool(const GlobalValue &)> MustPreserveGV;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ InternalizeLegacyPass() : ModulePass(ID), MustPreserveGV(PreserveAPIList()) {}
+
+ InternalizeLegacyPass(std::function<bool(const GlobalValue &)> MustPreserveGV)
+ : ModulePass(ID), MustPreserveGV(std::move(MustPreserveGV)) {
+ initializeInternalizeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ CallGraphWrapperPass *CGPass =
+ getAnalysisIfAvailable<CallGraphWrapperPass>();
+ CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
+ return internalizeModule(M, MustPreserveGV, CG);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<CallGraphWrapperPass>();
+ }
+};
+}
+
+char InternalizeLegacyPass::ID = 0;
+INITIALIZE_PASS(InternalizeLegacyPass, "internalize",
+ "Internalize Global Symbols", false, false)
+
+ModulePass *llvm::createInternalizePass() {
+ return new InternalizeLegacyPass();
+}
+
+ModulePass *llvm::createInternalizePass(
+ std::function<bool(const GlobalValue &)> MustPreserveGV) {
+ return new InternalizeLegacyPass(std::move(MustPreserveGV));
+}
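
The legacy entry points above can be driven the same way. A hedged sketch of a legacy-pass-manager client (internalizeLegacy is a hypothetical function and ExportedNames an assumed caller-supplied StringSet<>):

    #include "llvm/ADT/StringSet.h"
    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    static void internalizeLegacy(llvm::Module &M,
                                  const llvm::StringSet<> &ExportedNames) {
      llvm::legacy::PassManager PM;
      // Keep every symbol whose name appears in ExportedNames.
      PM.add(llvm::createInternalizePass([&](const llvm::GlobalValue &GV) {
        return ExportedNames.count(GV.getName()) > 0;
      }));
      PM.run(M);
    }

When no callback is supplied, the default-constructed pass falls back to the PreserveAPIList functor, which honors the -internalize-public-api-file and -internalize-public-api-list options defined near the top of this file.
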
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp
index 79cfa45924..a497c0390b 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp
@@ -1,55 +1,55 @@
-//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A pass wrapper around the ExtractLoop() scalar transformation to extract each
-// top-level loop into its own new function. If the loop is the ONLY loop in a
-// given function, it is not touched. This is a pass most useful for debugging
-// via bugpoint.
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass wrapper around the ExtractLoop() scalar transformation to extract each
+// top-level loop into its own new function. If the loop is the ONLY loop in a
+// given function, it is not touched. This is a pass most useful for debugging
+// via bugpoint.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/IPO/LoopExtractor.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include <fstream>
-#include <set>
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-extract"
-
-STATISTIC(NumExtracted, "Number of loops extracted");
-
-namespace {
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-extract"
+
+STATISTIC(NumExtracted, "Number of loops extracted");
+
+namespace {
struct LoopExtractorLegacyPass : public ModulePass {
static char ID; // Pass identification, replacement for typeid
-
+
unsigned NumLoops;
-
+
explicit LoopExtractorLegacyPass(unsigned NumLoops = ~0)
: ModulePass(ID), NumLoops(NumLoops) {
initializeLoopExtractorLegacyPassPass(*PassRegistry::getPassRegistry());
}
-
+
bool runOnModule(Module &M) override;
-
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequiredID(BreakCriticalEdgesID);
AU.addRequired<DominatorTreeWrapperPass>();
@@ -59,7 +59,7 @@ struct LoopExtractorLegacyPass : public ModulePass {
AU.addUsedIfAvailable<AssumptionCacheTracker>();
}
};
-
+
struct LoopExtractor {
explicit LoopExtractor(
unsigned NumLoops,
@@ -70,7 +70,7 @@ struct LoopExtractor {
LookupLoopInfo(LookupLoopInfo),
LookupAssumptionCache(LookupAssumptionCache) {}
bool runOnModule(Module &M);
-
+
private:
// The number of natural loops to extract from the program into functions.
unsigned NumLoops;
@@ -89,35 +89,35 @@ private:
char LoopExtractorLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopExtractorLegacyPass, "loop-extract",
- "Extract loops into new functions", false, false)
-INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+ "Extract loops into new functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopExtractorLegacyPass, "loop-extract",
- "Extract loops into new functions", false, false)
-
-namespace {
- /// SingleLoopExtractor - For bugpoint.
+ "Extract loops into new functions", false, false)
+
+namespace {
+ /// SingleLoopExtractor - For bugpoint.
struct SingleLoopExtractor : public LoopExtractorLegacyPass {
static char ID; // Pass identification, replacement for typeid
SingleLoopExtractor() : LoopExtractorLegacyPass(1) {}
};
-} // End anonymous namespace
-
-char SingleLoopExtractor::ID = 0;
-INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
- "Extract at most one loop into a new function", false, false)
-
-// createLoopExtractorPass - This pass extracts all natural loops from the
-// program into a function if it can.
-//
+} // End anonymous namespace
+
+char SingleLoopExtractor::ID = 0;
+INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
+ "Extract at most one loop into a new function", false, false)
+
+// createLoopExtractorPass - This pass extracts all natural loops from the
+// program into a function if it can.
+//
Pass *llvm::createLoopExtractorPass() { return new LoopExtractorLegacyPass(); }
-
+
bool LoopExtractorLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
+ if (skipModule(M))
+ return false;
+
bool Changed = false;
auto LookupDomTree = [this](Function &F) -> DominatorTree & {
return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
@@ -136,132 +136,132 @@ bool LoopExtractorLegacyPass::runOnModule(Module &M) {
}
bool LoopExtractor::runOnModule(Module &M) {
- if (M.empty())
- return false;
-
- if (!NumLoops)
- return false;
-
- bool Changed = false;
-
- // The end of the function list may change (new functions will be added at the
- // end), so we run from the first to the current last.
- auto I = M.begin(), E = --M.end();
- while (true) {
- Function &F = *I;
-
- Changed |= runOnFunction(F);
- if (!NumLoops)
- break;
-
- // If this is the last function.
- if (I == E)
- break;
-
- ++I;
- }
- return Changed;
-}
-
-bool LoopExtractor::runOnFunction(Function &F) {
- // Do not modify `optnone` functions.
- if (F.hasOptNone())
- return false;
-
- if (F.empty())
- return false;
-
- bool Changed = false;
+ if (M.empty())
+ return false;
+
+ if (!NumLoops)
+ return false;
+
+ bool Changed = false;
+
+ // The end of the function list may change (new functions will be added at the
+ // end), so we run from the first to the current last.
+ auto I = M.begin(), E = --M.end();
+ while (true) {
+ Function &F = *I;
+
+ Changed |= runOnFunction(F);
+ if (!NumLoops)
+ break;
+
+ // If this is the last function.
+ if (I == E)
+ break;
+
+ ++I;
+ }
+ return Changed;
+}
+
+bool LoopExtractor::runOnFunction(Function &F) {
+ // Do not modify `optnone` functions.
+ if (F.hasOptNone())
+ return false;
+
+ if (F.empty())
+ return false;
+
+ bool Changed = false;
LoopInfo &LI = LookupLoopInfo(F);
-
- // If there are no loops in the function.
- if (LI.empty())
- return Changed;
-
+
+ // If there are no loops in the function.
+ if (LI.empty())
+ return Changed;
+
DominatorTree &DT = LookupDomTree(F);
-
- // If there is more than one top-level loop in this function, extract all of
- // the loops.
- if (std::next(LI.begin()) != LI.end())
- return Changed | extractLoops(LI.begin(), LI.end(), LI, DT);
-
- // Otherwise there is exactly one top-level loop.
- Loop *TLL = *LI.begin();
-
- // If the loop is in LoopSimplify form, then extract it only if this function
- // is more than a minimal wrapper around the loop.
- if (TLL->isLoopSimplifyForm()) {
- bool ShouldExtractLoop = false;
-
- // Extract the loop if the entry block doesn't branch to the loop header.
- Instruction *EntryTI = F.getEntryBlock().getTerminator();
- if (!isa<BranchInst>(EntryTI) ||
- !cast<BranchInst>(EntryTI)->isUnconditional() ||
- EntryTI->getSuccessor(0) != TLL->getHeader()) {
- ShouldExtractLoop = true;
- } else {
- // Check to see if any exits from the loop are more than just return
- // blocks.
- SmallVector<BasicBlock *, 8> ExitBlocks;
- TLL->getExitBlocks(ExitBlocks);
- for (auto *ExitBlock : ExitBlocks)
- if (!isa<ReturnInst>(ExitBlock->getTerminator())) {
- ShouldExtractLoop = true;
- break;
- }
- }
-
- if (ShouldExtractLoop)
- return Changed | extractLoop(TLL, LI, DT);
- }
-
- // Okay, this function is a minimal container around the specified loop.
- // If we extract the loop, we will continue to just keep extracting it
- // infinitely... so don't extract it. However, if the loop contains any
- // sub-loops, extract them.
- return Changed | extractLoops(TLL->begin(), TLL->end(), LI, DT);
-}
-
-bool LoopExtractor::extractLoops(Loop::iterator From, Loop::iterator To,
- LoopInfo &LI, DominatorTree &DT) {
- bool Changed = false;
- SmallVector<Loop *, 8> Loops;
-
- // Save the list of loops, as it may change.
- Loops.assign(From, To);
- for (Loop *L : Loops) {
- // If LoopSimplify form is not available, stay out of trouble.
- if (!L->isLoopSimplifyForm())
- continue;
-
- Changed |= extractLoop(L, LI, DT);
- if (!NumLoops)
- break;
- }
- return Changed;
-}
-
-bool LoopExtractor::extractLoop(Loop *L, LoopInfo &LI, DominatorTree &DT) {
- assert(NumLoops != 0);
- Function &Func = *L->getHeader()->getParent();
+
+ // If there is more than one top-level loop in this function, extract all of
+ // the loops.
+ if (std::next(LI.begin()) != LI.end())
+ return Changed | extractLoops(LI.begin(), LI.end(), LI, DT);
+
+ // Otherwise there is exactly one top-level loop.
+ Loop *TLL = *LI.begin();
+
+ // If the loop is in LoopSimplify form, then extract it only if this function
+ // is more than a minimal wrapper around the loop.
+ if (TLL->isLoopSimplifyForm()) {
+ bool ShouldExtractLoop = false;
+
+ // Extract the loop if the entry block doesn't branch to the loop header.
+ Instruction *EntryTI = F.getEntryBlock().getTerminator();
+ if (!isa<BranchInst>(EntryTI) ||
+ !cast<BranchInst>(EntryTI)->isUnconditional() ||
+ EntryTI->getSuccessor(0) != TLL->getHeader()) {
+ ShouldExtractLoop = true;
+ } else {
+ // Check to see if any exits from the loop are more than just return
+ // blocks.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ TLL->getExitBlocks(ExitBlocks);
+ for (auto *ExitBlock : ExitBlocks)
+ if (!isa<ReturnInst>(ExitBlock->getTerminator())) {
+ ShouldExtractLoop = true;
+ break;
+ }
+ }
+
+ if (ShouldExtractLoop)
+ return Changed | extractLoop(TLL, LI, DT);
+ }
+
+ // Okay, this function is a minimal container around the specified loop.
+ // If we extract the loop, we will continue to just keep extracting it
+ // infinitely... so don't extract it. However, if the loop contains any
+ // sub-loops, extract them.
+ return Changed | extractLoops(TLL->begin(), TLL->end(), LI, DT);
+}
+
+bool LoopExtractor::extractLoops(Loop::iterator From, Loop::iterator To,
+ LoopInfo &LI, DominatorTree &DT) {
+ bool Changed = false;
+ SmallVector<Loop *, 8> Loops;
+
+ // Save the list of loops, as it may change.
+ Loops.assign(From, To);
+ for (Loop *L : Loops) {
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ continue;
+
+ Changed |= extractLoop(L, LI, DT);
+ if (!NumLoops)
+ break;
+ }
+ return Changed;
+}
+
+bool LoopExtractor::extractLoop(Loop *L, LoopInfo &LI, DominatorTree &DT) {
+ assert(NumLoops != 0);
+ Function &Func = *L->getHeader()->getParent();
AssumptionCache *AC = LookupAssumptionCache(Func);
- CodeExtractorAnalysisCache CEAC(Func);
- CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC);
- if (Extractor.extractCodeRegion(CEAC)) {
- LI.erase(L);
- --NumLoops;
- ++NumExtracted;
- return true;
- }
- return false;
-}
-
-// createSingleLoopExtractorPass - This pass extracts one natural loop from the
-// program into a function if it can. This is used by bugpoint.
-//
-Pass *llvm::createSingleLoopExtractorPass() {
- return new SingleLoopExtractor();
-}
+ CodeExtractorAnalysisCache CEAC(Func);
+ CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC);
+ if (Extractor.extractCodeRegion(CEAC)) {
+ LI.erase(L);
+ --NumLoops;
+ ++NumExtracted;
+ return true;
+ }
+ return false;
+}
+
+// createSingleLoopExtractorPass - This pass extracts one natural loop from the
+// program into a function if it can. This is used by bugpoint.
+//
+Pass *llvm::createSingleLoopExtractorPass() {
+ return new SingleLoopExtractor();
+}
PreservedAnalyses LoopExtractorPass::run(Module &M, ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
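
As with the internalize pass, the legacy createLoopExtractorPass() entry point above is the simplest way to drive loop extraction programmatically. An illustrative sketch (extractAllLoops is a hypothetical driver function, not part of this patch); the legacy pass manager schedules the required BreakCriticalEdges and LoopSimplify passes automatically via the getAnalysisUsage() shown above.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    // Extract every top-level loop in M into its own new function.
    static void extractAllLoops(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createLoopExtractorPass());
      PM.run(M);
    }
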
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp
index 96a4dfd176..8bd3036f1f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1,2260 +1,2260 @@
-//===- LowerTypeTests.cpp - type metadata lowering pass -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
-// It also ensures that globals are properly laid out for the
-// llvm.icall.branch.funnel intrinsic.
-// See http://llvm.org/docs/TypeMetadata.html for more information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/LowerTypeTests.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/PointerUnion.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
-#include "llvm/IR/ModuleSummaryIndexYAML.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/TrailingObjects.h"
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <set>
-#include <string>
-#include <system_error>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace lowertypetests;
-
-#define DEBUG_TYPE "lowertypetests"
-
-STATISTIC(ByteArraySizeBits, "Byte array size in bits");
-STATISTIC(ByteArraySizeBytes, "Byte array size in bytes");
-STATISTIC(NumByteArraysCreated, "Number of byte arrays created");
-STATISTIC(NumTypeTestCallsLowered, "Number of type test calls lowered");
-STATISTIC(NumTypeIdDisjointSets, "Number of disjoint sets of type identifiers");
-
-static cl::opt<bool> AvoidReuse(
- "lowertypetests-avoid-reuse",
- cl::desc("Try to avoid reuse of byte array addresses using aliases"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<PassSummaryAction> ClSummaryAction(
- "lowertypetests-summary-action",
- cl::desc("What to do with the summary when running this pass"),
- cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
- clEnumValN(PassSummaryAction::Import, "import",
- "Import typeid resolutions from summary and globals"),
- clEnumValN(PassSummaryAction::Export, "export",
- "Export typeid resolutions to summary and globals")),
- cl::Hidden);
-
-static cl::opt<std::string> ClReadSummary(
- "lowertypetests-read-summary",
- cl::desc("Read summary from given YAML file before running pass"),
- cl::Hidden);
-
-static cl::opt<std::string> ClWriteSummary(
- "lowertypetests-write-summary",
- cl::desc("Write summary to given YAML file after running pass"),
- cl::Hidden);
-
-bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
- if (Offset < ByteOffset)
- return false;
-
- if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0)
- return false;
-
- uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2;
- if (BitOffset >= BitSize)
- return false;
-
- return Bits.count(BitOffset);
-}
-
-void BitSetInfo::print(raw_ostream &OS) const {
- OS << "offset " << ByteOffset << " size " << BitSize << " align "
- << (1 << AlignLog2);
-
- if (isAllOnes()) {
- OS << " all-ones\n";
- return;
- }
-
- OS << " { ";
- for (uint64_t B : Bits)
- OS << B << ' ';
- OS << "}\n";
-}
-
-BitSetInfo BitSetBuilder::build() {
- if (Min > Max)
- Min = 0;
-
- // Normalize each offset against the minimum observed offset, and compute
- // the bitwise OR of each of the offsets. The number of trailing zeros
- // in the mask gives us the log2 of the alignment of all offsets, which
- // allows us to compress the bitset by only storing one bit per aligned
- // address.
- uint64_t Mask = 0;
- for (uint64_t &Offset : Offsets) {
- Offset -= Min;
- Mask |= Offset;
- }
-
- BitSetInfo BSI;
- BSI.ByteOffset = Min;
-
- BSI.AlignLog2 = 0;
- if (Mask != 0)
- BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined);
-
- // Build the compressed bitset while normalizing the offsets against the
- // computed alignment.
- BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1;
- for (uint64_t Offset : Offsets) {
- Offset >>= BSI.AlignLog2;
- BSI.Bits.insert(Offset);
- }
-
- return BSI;
-}
-
-void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
- // Create a new fragment to hold the layout for F.
- Fragments.emplace_back();
- std::vector<uint64_t> &Fragment = Fragments.back();
- uint64_t FragmentIndex = Fragments.size() - 1;
-
- for (auto ObjIndex : F) {
- uint64_t OldFragmentIndex = FragmentMap[ObjIndex];
- if (OldFragmentIndex == 0) {
- // We haven't seen this object index before, so just add it to the current
- // fragment.
- Fragment.push_back(ObjIndex);
- } else {
- // This index belongs to an existing fragment. Copy the elements of the
- // old fragment into this one and clear the old fragment. We don't update
- // the fragment map just yet, this ensures that any further references to
- // indices from the old fragment in this fragment do not insert any more
- // indices.
- std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex];
+//===- LowerTypeTests.cpp - type metadata lowering pass -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
+// It also ensures that globals are properly laid out for the
+// llvm.icall.branch.funnel intrinsic.
+// See http://llvm.org/docs/TypeMetadata.html for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TrailingObjects.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace lowertypetests;
+
+#define DEBUG_TYPE "lowertypetests"
+
+STATISTIC(ByteArraySizeBits, "Byte array size in bits");
+STATISTIC(ByteArraySizeBytes, "Byte array size in bytes");
+STATISTIC(NumByteArraysCreated, "Number of byte arrays created");
+STATISTIC(NumTypeTestCallsLowered, "Number of type test calls lowered");
+STATISTIC(NumTypeIdDisjointSets, "Number of disjoint sets of type identifiers");
+
+static cl::opt<bool> AvoidReuse(
+ "lowertypetests-avoid-reuse",
+ cl::desc("Try to avoid reuse of byte array addresses using aliases"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "lowertypetests-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "lowertypetests-read-summary",
+ cl::desc("Read summary from given YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "lowertypetests-write-summary",
+ cl::desc("Write summary to given YAML file after running pass"),
+ cl::Hidden);
+
+bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
+ if (Offset < ByteOffset)
+ return false;
+
+ if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0)
+ return false;
+
+ uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2;
+ if (BitOffset >= BitSize)
+ return false;
+
+ return Bits.count(BitOffset);
+}
+
+void BitSetInfo::print(raw_ostream &OS) const {
+ OS << "offset " << ByteOffset << " size " << BitSize << " align "
+ << (1 << AlignLog2);
+
+ if (isAllOnes()) {
+ OS << " all-ones\n";
+ return;
+ }
+
+ OS << " { ";
+ for (uint64_t B : Bits)
+ OS << B << ' ';
+ OS << "}\n";
+}
+
+BitSetInfo BitSetBuilder::build() {
+ if (Min > Max)
+ Min = 0;
+
+ // Normalize each offset against the minimum observed offset, and compute
+ // the bitwise OR of each of the offsets. The number of trailing zeros
+ // in the mask gives us the log2 of the alignment of all offsets, which
+ // allows us to compress the bitset by only storing one bit per aligned
+ // address.
+ uint64_t Mask = 0;
+ for (uint64_t &Offset : Offsets) {
+ Offset -= Min;
+ Mask |= Offset;
+ }
+
+ BitSetInfo BSI;
+ BSI.ByteOffset = Min;
+
+ BSI.AlignLog2 = 0;
+ if (Mask != 0)
+ BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined);
+
+ // Build the compressed bitset while normalizing the offsets against the
+ // computed alignment.
+ BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1;
+ for (uint64_t Offset : Offsets) {
+ Offset >>= BSI.AlignLog2;
+ BSI.Bits.insert(Offset);
+ }
+
+ return BSI;
+}
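
A worked example (editorial, not part of this patch) of the compression build() performs, using the BitSetBuilder/BitSetInfo API declared in LowerTypeTests.h; bitSetExample is a hypothetical test function:

    #include "llvm/Transforms/IPO/LowerTypeTests.h"
    #include <cassert>

    static void bitSetExample() {
      llvm::lowertypetests::BitSetBuilder BSB;
      BSB.addOffset(0);
      BSB.addOffset(8);
      BSB.addOffset(16);
      llvm::lowertypetests::BitSetInfo BSI = BSB.build();
      // Mask = 0 | 8 | 16 = 24, so AlignLog2 = countTrailingZeros(24) = 3,
      // BitSize = ((16 - 0) >> 3) + 1 = 3, Bits = {0, 1, 2}, ByteOffset = 0.
      assert(BSI.AlignLog2 == 3 && BSI.BitSize == 3);
      assert(BSI.containsGlobalOffset(8));
      assert(!BSI.containsGlobalOffset(4)); // 4 is not 8-byte aligned
    }
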
+
+void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
+ // Create a new fragment to hold the layout for F.
+ Fragments.emplace_back();
+ std::vector<uint64_t> &Fragment = Fragments.back();
+ uint64_t FragmentIndex = Fragments.size() - 1;
+
+ for (auto ObjIndex : F) {
+ uint64_t OldFragmentIndex = FragmentMap[ObjIndex];
+ if (OldFragmentIndex == 0) {
+ // We haven't seen this object index before, so just add it to the current
+ // fragment.
+ Fragment.push_back(ObjIndex);
+ } else {
+ // This index belongs to an existing fragment. Copy the elements of the
+ // old fragment into this one and clear the old fragment. We don't update
+      // the fragment map just yet; this ensures that any further references to
+ // indices from the old fragment in this fragment do not insert any more
+ // indices.
+ std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex];
llvm::append_range(Fragment, OldFragment);
- OldFragment.clear();
- }
- }
-
- // Update the fragment map to point our object indices to this fragment.
- for (uint64_t ObjIndex : Fragment)
- FragmentMap[ObjIndex] = FragmentIndex;
-}
-
-void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits,
- uint64_t BitSize, uint64_t &AllocByteOffset,
- uint8_t &AllocMask) {
- // Find the smallest current allocation.
- unsigned Bit = 0;
- for (unsigned I = 1; I != BitsPerByte; ++I)
- if (BitAllocs[I] < BitAllocs[Bit])
- Bit = I;
-
- AllocByteOffset = BitAllocs[Bit];
-
- // Add our size to it.
- unsigned ReqSize = AllocByteOffset + BitSize;
- BitAllocs[Bit] = ReqSize;
- if (Bytes.size() < ReqSize)
- Bytes.resize(ReqSize);
-
- // Set our bits.
- AllocMask = 1 << Bit;
- for (uint64_t B : Bits)
- Bytes[AllocByteOffset + B] |= AllocMask;
-}
-
-bool lowertypetests::isJumpTableCanonical(Function *F) {
- if (F->isDeclarationForLinker())
- return false;
- auto *CI = mdconst::extract_or_null<ConstantInt>(
- F->getParent()->getModuleFlag("CFI Canonical Jump Tables"));
- if (!CI || CI->getZExtValue() != 0)
- return true;
- return F->hasFnAttribute("cfi-canonical-jump-table");
-}
-
-namespace {
-
-struct ByteArrayInfo {
- std::set<uint64_t> Bits;
- uint64_t BitSize;
- GlobalVariable *ByteArray;
- GlobalVariable *MaskGlobal;
- uint8_t *MaskPtr = nullptr;
-};
-
-/// A POD-like structure that we use to store a global reference together with
-/// its metadata types. In this pass we frequently need to query the set of
-/// metadata types referenced by a global, which at the IR level is an expensive
-/// operation involving a map lookup; this data structure helps to reduce the
-/// number of times we need to do this lookup.
-class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
- friend TrailingObjects;
-
- GlobalObject *GO;
- size_t NTypes;
-
- // For functions: true if the jump table is canonical. This essentially means
- // whether the canonical address (i.e. the symbol table entry) of the function
- // is provided by the local jump table. This is normally the same as whether
- // the function is defined locally, but if canonical jump tables are disabled
- // by the user then the jump table never provides a canonical definition.
- bool IsJumpTableCanonical;
-
- // For functions: true if this function is either defined or used in a thinlto
- // module and its jumptable entry needs to be exported to thinlto backends.
- bool IsExported;
-
- size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
-
-public:
- static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
- bool IsJumpTableCanonical, bool IsExported,
- ArrayRef<MDNode *> Types) {
- auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
- totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
- GTM->GO = GO;
- GTM->NTypes = Types.size();
- GTM->IsJumpTableCanonical = IsJumpTableCanonical;
- GTM->IsExported = IsExported;
- std::uninitialized_copy(Types.begin(), Types.end(),
- GTM->getTrailingObjects<MDNode *>());
- return GTM;
- }
-
- GlobalObject *getGlobal() const {
- return GO;
- }
-
- bool isJumpTableCanonical() const {
- return IsJumpTableCanonical;
- }
-
- bool isExported() const {
- return IsExported;
- }
-
- ArrayRef<MDNode *> types() const {
- return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
- }
-};
-
-struct ICallBranchFunnel final
- : TrailingObjects<ICallBranchFunnel, GlobalTypeMember *> {
- static ICallBranchFunnel *create(BumpPtrAllocator &Alloc, CallInst *CI,
- ArrayRef<GlobalTypeMember *> Targets,
- unsigned UniqueId) {
- auto *Call = static_cast<ICallBranchFunnel *>(
- Alloc.Allocate(totalSizeToAlloc<GlobalTypeMember *>(Targets.size()),
- alignof(ICallBranchFunnel)));
- Call->CI = CI;
- Call->UniqueId = UniqueId;
- Call->NTargets = Targets.size();
- std::uninitialized_copy(Targets.begin(), Targets.end(),
- Call->getTrailingObjects<GlobalTypeMember *>());
- return Call;
- }
-
- CallInst *CI;
- ArrayRef<GlobalTypeMember *> targets() const {
- return makeArrayRef(getTrailingObjects<GlobalTypeMember *>(), NTargets);
- }
-
- unsigned UniqueId;
-
-private:
- size_t NTargets;
-};
-
-struct ScopedSaveAliaseesAndUsed {
- Module &M;
- SmallPtrSet<GlobalValue *, 16> Used, CompilerUsed;
- std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases;
-
- ScopedSaveAliaseesAndUsed(Module &M) : M(M) {
- // The users of this class want to replace all function references except
- // for aliases and llvm.used/llvm.compiler.used with references to a jump
- // table. We avoid replacing aliases in order to avoid introducing a double
- // indirection (or an alias pointing to a declaration in ThinLTO mode), and
- // we avoid replacing llvm.used/llvm.compiler.used because these global
- // variables describe properties of the global, not the jump table (besides,
-    // offset references to the jump table in llvm.used are invalid).
- // Unfortunately, LLVM doesn't have a "RAUW except for these (possibly
- // indirect) users", so what we do is save the list of globals referenced by
- // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW
- // replace the aliasees and then set them back to their original values at
- // the end.
- if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false))
- GV->eraseFromParent();
- if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true))
- GV->eraseFromParent();
-
- for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) {
- // FIXME: This should look past all aliases not just interposable ones,
- // see discussion on D65118.
- if (auto *F =
- dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts()))
- FunctionAliases.push_back({&GIS, F});
- }
- }
-
- ~ScopedSaveAliaseesAndUsed() {
- appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end()));
- appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(),
- CompilerUsed.end()));
-
- for (auto P : FunctionAliases)
- P.first->setIndirectSymbol(
- ConstantExpr::getBitCast(P.second, P.first->getType()));
- }
-};
-
-class LowerTypeTestsModule {
- Module &M;
-
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
- // Set when the client has invoked this to simply drop all type test assume
- // sequences.
- bool DropTypeTests;
-
- Triple::ArchType Arch;
- Triple::OSType OS;
- Triple::ObjectFormatType ObjectFormat;
-
- IntegerType *Int1Ty = Type::getInt1Ty(M.getContext());
- IntegerType *Int8Ty = Type::getInt8Ty(M.getContext());
- PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
- ArrayType *Int8Arr0Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), 0);
- IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
- PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty);
- IntegerType *Int64Ty = Type::getInt64Ty(M.getContext());
- IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext(), 0);
-
- // Indirect function call index assignment counter for WebAssembly
- uint64_t IndirectIndex = 1;
-
- // Mapping from type identifiers to the call sites that test them, as well as
- // whether the type identifier needs to be exported to ThinLTO backends as
- // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId).
- struct TypeIdUserInfo {
- std::vector<CallInst *> CallSites;
- bool IsExported = false;
- };
- DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers;
-
- /// This structure describes how to lower type tests for a particular type
- /// identifier. It is either built directly from the global analysis (during
- /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
- /// identifier summaries and external symbol references (in ThinLTO backends).
- struct TypeIdLowering {
- TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
-
- /// All except Unsat: the start address within the combined global.
- Constant *OffsetedGlobal;
-
- /// ByteArray, Inline, AllOnes: log2 of the required global alignment
- /// relative to the start address.
- Constant *AlignLog2;
-
- /// ByteArray, Inline, AllOnes: one less than the size of the memory region
- /// covering members of this type identifier as a multiple of 2^AlignLog2.
- Constant *SizeM1;
-
- /// ByteArray: the byte array to test the address against.
- Constant *TheByteArray;
-
- /// ByteArray: the bit mask to apply to bytes loaded from the byte array.
- Constant *BitMask;
-
- /// Inline: the bit mask to test the address against.
- Constant *InlineBits;
- };
-
- std::vector<ByteArrayInfo> ByteArrayInfos;
-
- Function *WeakInitializerFn = nullptr;
-
- bool shouldExportConstantsAsAbsoluteSymbols();
- uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
- TypeIdLowering importTypeId(StringRef TypeId);
- void importTypeTest(CallInst *CI);
- void importFunction(Function *F, bool isJumpTableCanonical,
- std::vector<GlobalAlias *> &AliasesToErase);
-
- BitSetInfo
- buildBitSet(Metadata *TypeId,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
- ByteArrayInfo *createByteArray(BitSetInfo &BSI);
- void allocateByteArrays();
- Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
- Value *BitOffset);
- void lowerTypeTestCalls(
- ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
- Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
- const TypeIdLowering &TIL);
-
- void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Globals);
- unsigned getJumpTableEntrySize();
- Type *getJumpTableEntryType();
- void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS,
- Triple::ArchType JumpTableArch,
- SmallVectorImpl<Value *> &AsmArgs, Function *Dest);
- void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
- void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
- void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
- void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
- void
- buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Globals,
- ArrayRef<ICallBranchFunnel *> ICallBranchFunnels);
-
- void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT,
- bool IsJumpTableCanonical);
- void moveInitializerToModuleConstructor(GlobalVariable *GV);
- void findGlobalVariableUsersOf(Constant *C,
- SmallSetVector<GlobalVariable *, 8> &Out);
-
- void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
-
- /// replaceCfiUses - Go through the uses list for this definition
- /// and make each use point to "V" instead of "this" when the use is outside
- /// the block. 'This's use list is expected to have at least one element.
- /// Unlike replaceAllUsesWith this function skips blockaddr and direct call
- /// uses.
- void replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical);
-
- /// replaceDirectCalls - Go through the uses list for this definition and
- /// replace each use, which is a direct function call.
- void replaceDirectCalls(Value *Old, Value *New);
-
-public:
- LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary,
- bool DropTypeTests);
-
- bool lower();
-
- // Lower the module using the action and summary passed as command line
- // arguments. For testing purposes only.
- static bool runForTesting(Module &M);
-};
-
-struct LowerTypeTests : public ModulePass {
- static char ID;
-
- bool UseCommandLine = false;
-
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
- bool DropTypeTests;
-
- LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
- initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
- }
-
- LowerTypeTests(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
- : ModulePass(ID), ExportSummary(ExportSummary),
- ImportSummary(ImportSummary), DropTypeTests(DropTypeTests) {
- initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (UseCommandLine)
- return LowerTypeTestsModule::runForTesting(M);
- return LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
- .lower();
- }
-};
-
-} // end anonymous namespace
-
-char LowerTypeTests::ID = 0;
-
-INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
- false)
-
-ModulePass *
-llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary,
- bool DropTypeTests) {
- return new LowerTypeTests(ExportSummary, ImportSummary, DropTypeTests);
-}
-
-/// Build a bit set for TypeId using the object layouts in
-/// GlobalLayout.
-BitSetInfo LowerTypeTestsModule::buildBitSet(
- Metadata *TypeId,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
- BitSetBuilder BSB;
-
- // Compute the byte offset of each address associated with this type
- // identifier.
- for (auto &GlobalAndOffset : GlobalLayout) {
- for (MDNode *Type : GlobalAndOffset.first->types()) {
- if (Type->getOperand(1) != TypeId)
- continue;
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
- BSB.addOffset(GlobalAndOffset.second + Offset);
- }
- }
-
- return BSB.build();
-}
-
-/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in
-/// Bits. This pattern matches to the bt instruction on x86.
-static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
- Value *BitOffset) {
- auto BitsType = cast<IntegerType>(Bits->getType());
- unsigned BitWidth = BitsType->getBitWidth();
-
- BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType);
- Value *BitIndex =
- B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1));
- Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex);
- Value *MaskedBits = B.CreateAnd(Bits, BitMask);
- return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
-}
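
A plain C++ mirror (editorial sketch, hypothetical maskedBitTest helper) of the IR the function above emits, to make the modulo-bit-width probe concrete:

    #include <cstdint>

    // For a 32-bit Bits constant and BitOffset = 37: BitIndex = 37 & 31 = 5,
    // BitMask = 1 << 5, so the test reads bit 5 of Bits - the same single-bit
    // probe that x86 lowers to a BT instruction.
    static bool maskedBitTest(uint32_t Bits, uint64_t BitOffset) {
      uint32_t BitIndex = static_cast<uint32_t>(BitOffset) & 31u;
      uint32_t BitMask = 1u << BitIndex;
      return (Bits & BitMask) != 0;
    }
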
-
-ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
- // Create globals to stand in for byte arrays and masks. These never actually
- // get initialized, we RAUW and erase them later in allocateByteArrays() once
- // we know the offset and mask to use.
- auto ByteArrayGlobal = new GlobalVariable(
- M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
- auto MaskGlobal = new GlobalVariable(M, Int8Ty, /*isConstant=*/true,
- GlobalValue::PrivateLinkage, nullptr);
-
- ByteArrayInfos.emplace_back();
- ByteArrayInfo *BAI = &ByteArrayInfos.back();
-
- BAI->Bits = BSI.Bits;
- BAI->BitSize = BSI.BitSize;
- BAI->ByteArray = ByteArrayGlobal;
- BAI->MaskGlobal = MaskGlobal;
- return BAI;
-}
-
-void LowerTypeTestsModule::allocateByteArrays() {
- llvm::stable_sort(ByteArrayInfos,
- [](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
- return BAI1.BitSize > BAI2.BitSize;
- });
-
- std::vector<uint64_t> ByteArrayOffsets(ByteArrayInfos.size());
-
- ByteArrayBuilder BAB;
- for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
- ByteArrayInfo *BAI = &ByteArrayInfos[I];
-
- uint8_t Mask;
- BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask);
-
- BAI->MaskGlobal->replaceAllUsesWith(
- ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy));
- BAI->MaskGlobal->eraseFromParent();
- if (BAI->MaskPtr)
- *BAI->MaskPtr = Mask;
- }
-
- Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes);
- auto ByteArray =
- new GlobalVariable(M, ByteArrayConst->getType(), /*isConstant=*/true,
- GlobalValue::PrivateLinkage, ByteArrayConst);
-
- for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
- ByteArrayInfo *BAI = &ByteArrayInfos[I];
-
- Constant *Idxs[] = {ConstantInt::get(IntPtrTy, 0),
- ConstantInt::get(IntPtrTy, ByteArrayOffsets[I])};
- Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(
- ByteArrayConst->getType(), ByteArray, Idxs);
-
- // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures
- // that the pc-relative displacement is folded into the lea instead of the
- // test instruction getting another displacement.
- GlobalAlias *Alias = GlobalAlias::create(
- Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
- BAI->ByteArray->replaceAllUsesWith(Alias);
- BAI->ByteArray->eraseFromParent();
- }
-
- ByteArraySizeBits = BAB.BitAllocs[0] + BAB.BitAllocs[1] + BAB.BitAllocs[2] +
- BAB.BitAllocs[3] + BAB.BitAllocs[4] + BAB.BitAllocs[5] +
- BAB.BitAllocs[6] + BAB.BitAllocs[7];
- ByteArraySizeBytes = BAB.Bytes.size();
-}
-
-/// Build a test that bit BitOffset is set in the type identifier that was
-/// lowered to TIL, which must be either an Inline or a ByteArray.
-Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
- const TypeIdLowering &TIL,
- Value *BitOffset) {
- if (TIL.TheKind == TypeTestResolution::Inline) {
- // If the bit set is sufficiently small, we can avoid a load by bit testing
- // a constant.
- return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
- } else {
- Constant *ByteArray = TIL.TheByteArray;
- if (AvoidReuse && !ImportSummary) {
- // Each use of the byte array uses a different alias. This makes the
- // backend less likely to reuse previously computed byte array addresses,
- // improving the security of the CFI mechanism based on this pass.
- // This won't work when importing because TheByteArray is external.
- ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
- "bits_use", ByteArray, &M);
- }
-
- Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset);
- Value *Byte = B.CreateLoad(Int8Ty, ByteAddr);
-
- Value *ByteAndMask =
- B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty));
- return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0));
- }
-}
-
-static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL,
- Value *V, uint64_t COffset) {
- if (auto GV = dyn_cast<GlobalObject>(V)) {
- SmallVector<MDNode *, 2> Types;
- GV->getMetadata(LLVMContext::MD_type, Types);
- for (MDNode *Type : Types) {
- if (Type->getOperand(1) != TypeId)
- continue;
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
- if (COffset == Offset)
- return true;
- }
- return false;
- }
-
- if (auto GEP = dyn_cast<GEPOperator>(V)) {
- APInt APOffset(DL.getPointerSizeInBits(0), 0);
- bool Result = GEP->accumulateConstantOffset(DL, APOffset);
- if (!Result)
- return false;
- COffset += APOffset.getZExtValue();
- return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset);
- }
-
- if (auto Op = dyn_cast<Operator>(V)) {
- if (Op->getOpcode() == Instruction::BitCast)
- return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset);
-
- if (Op->getOpcode() == Instruction::Select)
- return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) &&
- isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset);
- }
-
- return false;
-}
-
-/// Lower a llvm.type.test call to its implementation. Returns the value to
-/// replace the call with.
-Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
- const TypeIdLowering &TIL) {
- // Delay lowering if the resolution is currently unknown.
- if (TIL.TheKind == TypeTestResolution::Unknown)
- return nullptr;
- if (TIL.TheKind == TypeTestResolution::Unsat)
- return ConstantInt::getFalse(M.getContext());
-
- Value *Ptr = CI->getArgOperand(0);
- const DataLayout &DL = M.getDataLayout();
- if (isKnownTypeIdMember(TypeId, DL, Ptr, 0))
- return ConstantInt::getTrue(M.getContext());
-
- BasicBlock *InitialBB = CI->getParent();
-
- IRBuilder<> B(CI);
-
- Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
-
- Constant *OffsetedGlobalAsInt =
- ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy);
- if (TIL.TheKind == TypeTestResolution::Single)
- return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
-
- Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
-
- // We need to check that the offset both falls within our range and is
- // suitably aligned. We can check both properties at the same time by
- // performing a right rotate by log2(alignment) followed by an integer
- // comparison against the bitset size. The rotate will move the lower
- // order bits that need to be zero into the higher order bits of the
- // result, causing the comparison to fail if they are nonzero. The rotate
- // also conveniently gives us a bit offset to use during the load from
- // the bitset.
- Value *OffsetSHR =
- B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy));
- Value *OffsetSHL = B.CreateShl(
- PtrOffset, ConstantExpr::getZExt(
- ConstantExpr::getSub(
- ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)),
- TIL.AlignLog2),
- IntPtrTy));
- Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
-
- Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1);
-
- // If the bit set is all ones, testing against it is unnecessary.
- if (TIL.TheKind == TypeTestResolution::AllOnes)
- return OffsetInRange;
-
- // See if the intrinsic is used in the following common pattern:
- // br(llvm.type.test(...), thenbb, elsebb)
- // where nothing happens between the type test and the br.
- // If so, create slightly simpler IR.
- if (CI->hasOneUse())
- if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin()))
- if (CI->getNextNode() == Br) {
- BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator());
- BasicBlock *Else = Br->getSuccessor(1);
- BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange);
- NewBr->setMetadata(LLVMContext::MD_prof,
- Br->getMetadata(LLVMContext::MD_prof));
- ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
-
- // Update phis in Else resulting from InitialBB being split
- for (auto &Phi : Else->phis())
- Phi.addIncoming(Phi.getIncomingValueForBlock(Then), InitialBB);
-
- IRBuilder<> ThenB(CI);
- return createBitSetTest(ThenB, TIL, BitOffset);
- }
-
- IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false));
-
- // Now that we know that the offset is in range and aligned, load the
- // appropriate bit from the bitset.
- Value *Bit = createBitSetTest(ThenB, TIL, BitOffset);
-
- // The value we want is 0 if we came directly from the initial block
- // (having failed the range or alignment checks), or the loaded bit if
- // we came from the block in which we loaded it.
- B.SetInsertPoint(CI);
- PHINode *P = B.CreatePHI(Int1Ty, 2);
- P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
- P->addIncoming(Bit, ThenB.GetInsertBlock());
- return P;
-}
-
-/// Given a disjoint set of type identifiers and globals, lay out the globals,
-/// build the bit sets and lower the llvm.type.test calls.
-void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
- // Build a new global with the combined contents of the referenced globals.
- // This global is a struct whose even-indexed elements contain the original
- // contents of the referenced globals and whose odd-indexed elements contain
- // any padding required to align the next element to the next power of 2 plus
- // any additional padding required to meet its alignment requirements.
- std::vector<Constant *> GlobalInits;
- const DataLayout &DL = M.getDataLayout();
- DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
- Align MaxAlign;
- uint64_t CurOffset = 0;
- uint64_t DesiredPadding = 0;
- for (GlobalTypeMember *G : Globals) {
- auto *GV = cast<GlobalVariable>(G->getGlobal());
- Align Alignment =
- DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
- MaxAlign = std::max(MaxAlign, Alignment);
- uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, Alignment);
- GlobalLayout[G] = GVOffset;
- if (GVOffset != 0) {
- uint64_t Padding = GVOffset - CurOffset;
- GlobalInits.push_back(
- ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
- }
-
- GlobalInits.push_back(GV->getInitializer());
- uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
- CurOffset = GVOffset + InitSize;
-
- // Compute the amount of padding that we'd like for the next element.
- DesiredPadding = NextPowerOf2(InitSize - 1) - InitSize;
-
-    // Experiments with different caps, using Chromium on both x64 and ARM64,
-    // have shown that the 32-byte cap generates the smallest binary on both
-    // platforms, while the various caps yield similar performance.
- // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html)
- if (DesiredPadding > 32)
- DesiredPadding = alignTo(InitSize, 32) - InitSize;
- }
-
- Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
- auto *CombinedGlobal =
- new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
- GlobalValue::PrivateLinkage, NewInit);
- CombinedGlobal->setAlignment(MaxAlign);
-
- StructType *NewTy = cast<StructType>(NewInit->getType());
- lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout);
-
- // Build aliases pointing to offsets into the combined global for each
- // global from which we built the combined global, and replace references
- // to the original globals with references to the aliases.
- for (unsigned I = 0; I != Globals.size(); ++I) {
- GlobalVariable *GV = cast<GlobalVariable>(Globals[I]->getGlobal());
-
- // Multiply by 2 to account for padding elements.
- Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
- ConstantInt::get(Int32Ty, I * 2)};
- Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
- NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
- assert(GV->getType()->getAddressSpace() == 0);
- GlobalAlias *GAlias =
- GlobalAlias::create(NewTy->getElementType(I * 2), 0, GV->getLinkage(),
- "", CombinedGlobalElemPtr, &M);
- GAlias->setVisibility(GV->getVisibility());
- GAlias->takeName(GV);
- GV->replaceAllUsesWith(GAlias);
- GV->eraseFromParent();
- }
-}
-
-bool LowerTypeTestsModule::shouldExportConstantsAsAbsoluteSymbols() {
- return (Arch == Triple::x86 || Arch == Triple::x86_64) &&
- ObjectFormat == Triple::ELF;
-}
-
-/// Export the given type identifier so that ThinLTO backends may import it.
-/// Type identifiers are exported by adding coarse-grained information about how
-/// to test the type identifier to the summary, and creating symbols in the
-/// object file (aliases and absolute symbols) containing fine-grained
-/// information about the type identifier.
-///
-/// Returns a pointer to the location in which to store the bitmask, if
-/// applicable.
-uint8_t *LowerTypeTestsModule::exportTypeId(StringRef TypeId,
- const TypeIdLowering &TIL) {
- TypeTestResolution &TTRes =
- ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
- TTRes.TheKind = TIL.TheKind;
-
- auto ExportGlobal = [&](StringRef Name, Constant *C) {
- GlobalAlias *GA =
- GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
- "__typeid_" + TypeId + "_" + Name, C, &M);
- GA->setVisibility(GlobalValue::HiddenVisibility);
- };
-
- auto ExportConstant = [&](StringRef Name, uint64_t &Storage, Constant *C) {
- if (shouldExportConstantsAsAbsoluteSymbols())
- ExportGlobal(Name, ConstantExpr::getIntToPtr(C, Int8PtrTy));
- else
- Storage = cast<ConstantInt>(C)->getZExtValue();
- };
-
- if (TIL.TheKind != TypeTestResolution::Unsat)
- ExportGlobal("global_addr", TIL.OffsetedGlobal);
-
- if (TIL.TheKind == TypeTestResolution::ByteArray ||
- TIL.TheKind == TypeTestResolution::Inline ||
- TIL.TheKind == TypeTestResolution::AllOnes) {
- ExportConstant("align", TTRes.AlignLog2, TIL.AlignLog2);
- ExportConstant("size_m1", TTRes.SizeM1, TIL.SizeM1);
-
- uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
- if (TIL.TheKind == TypeTestResolution::Inline)
- TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6;
- else
- TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32;
- }
-
- if (TIL.TheKind == TypeTestResolution::ByteArray) {
- ExportGlobal("byte_array", TIL.TheByteArray);
- if (shouldExportConstantsAsAbsoluteSymbols())
- ExportGlobal("bit_mask", TIL.BitMask);
- else
- return &TTRes.BitMask;
- }
-
- if (TIL.TheKind == TypeTestResolution::Inline)
- ExportConstant("inline_bits", TTRes.InlineBits, TIL.InlineBits);
-
- return nullptr;
-}
-
-LowerTypeTestsModule::TypeIdLowering
-LowerTypeTestsModule::importTypeId(StringRef TypeId) {
- const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
- if (!TidSummary)
- return {}; // Unsat: no globals match this type id.
- const TypeTestResolution &TTRes = TidSummary->TTRes;
-
- TypeIdLowering TIL;
- TIL.TheKind = TTRes.TheKind;
-
- auto ImportGlobal = [&](StringRef Name) {
- // Give the global a type of length 0 so that it is not assumed not to alias
- // with any other global.
- Constant *C = M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(),
- Int8Arr0Ty);
- if (auto *GV = dyn_cast<GlobalVariable>(C))
- GV->setVisibility(GlobalValue::HiddenVisibility);
- C = ConstantExpr::getBitCast(C, Int8PtrTy);
- return C;
- };
-
- auto ImportConstant = [&](StringRef Name, uint64_t Const, unsigned AbsWidth,
- Type *Ty) {
- if (!shouldExportConstantsAsAbsoluteSymbols()) {
- Constant *C =
- ConstantInt::get(isa<IntegerType>(Ty) ? Ty : Int64Ty, Const);
- if (!isa<IntegerType>(Ty))
- C = ConstantExpr::getIntToPtr(C, Ty);
- return C;
- }
-
- Constant *C = ImportGlobal(Name);
- auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
- if (isa<IntegerType>(Ty))
- C = ConstantExpr::getPtrToInt(C, Ty);
- if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
- return C;
-
- auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
- auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
- auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
- GV->setMetadata(LLVMContext::MD_absolute_symbol,
- MDNode::get(M.getContext(), {MinC, MaxC}));
- };
- if (AbsWidth == IntPtrTy->getBitWidth())
- SetAbsRange(~0ull, ~0ull); // Full set.
- else
- SetAbsRange(0, 1ull << AbsWidth);
- return C;
- };
-
- if (TIL.TheKind != TypeTestResolution::Unsat)
- TIL.OffsetedGlobal = ImportGlobal("global_addr");
-
- if (TIL.TheKind == TypeTestResolution::ByteArray ||
- TIL.TheKind == TypeTestResolution::Inline ||
- TIL.TheKind == TypeTestResolution::AllOnes) {
- TIL.AlignLog2 = ImportConstant("align", TTRes.AlignLog2, 8, Int8Ty);
- TIL.SizeM1 =
- ImportConstant("size_m1", TTRes.SizeM1, TTRes.SizeM1BitWidth, IntPtrTy);
- }
-
- if (TIL.TheKind == TypeTestResolution::ByteArray) {
- TIL.TheByteArray = ImportGlobal("byte_array");
- TIL.BitMask = ImportConstant("bit_mask", TTRes.BitMask, 8, Int8PtrTy);
- }
-
- if (TIL.TheKind == TypeTestResolution::Inline)
- TIL.InlineBits = ImportConstant(
- "inline_bits", TTRes.InlineBits, 1 << TTRes.SizeM1BitWidth,
- TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
-
- return TIL;
-}
-
-void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
- auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
- if (!TypeIdMDVal)
- report_fatal_error("Second argument of llvm.type.test must be metadata");
-
- auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
- // If this is a local unpromoted type, which doesn't have a metadata string,
- // treat as Unknown and delay lowering, so that we can still utilize it for
- // later optimizations.
- if (!TypeIdStr)
- return;
-
- TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
- Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
- if (Lowered) {
- CI->replaceAllUsesWith(Lowered);
- CI->eraseFromParent();
- }
-}
-
-// ThinLTO backend: the function F has a jump table entry; update this module
-// accordingly. isJumpTableCanonical describes the type of the jump table entry.
-void LowerTypeTestsModule::importFunction(
- Function *F, bool isJumpTableCanonical,
- std::vector<GlobalAlias *> &AliasesToErase) {
- assert(F->getType()->getAddressSpace() == 0);
-
- GlobalValue::VisibilityTypes Visibility = F->getVisibility();
- std::string Name = std::string(F->getName());
-
- if (F->isDeclarationForLinker() && isJumpTableCanonical) {
-    // Non-dso_local functions may be overridden at run time,
-    // so don't short-circuit them.
- if (F->isDSOLocal()) {
- Function *RealF = Function::Create(F->getFunctionType(),
- GlobalValue::ExternalLinkage,
- F->getAddressSpace(),
- Name + ".cfi", &M);
- RealF->setVisibility(GlobalVariable::HiddenVisibility);
- replaceDirectCalls(F, RealF);
- }
- return;
- }
-
- Function *FDecl;
- if (!isJumpTableCanonical) {
- // Either a declaration of an external function or a reference to a locally
- // defined jump table.
- FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
- F->getAddressSpace(), Name + ".cfi_jt", &M);
- FDecl->setVisibility(GlobalValue::HiddenVisibility);
- } else {
- F->setName(Name + ".cfi");
- F->setLinkage(GlobalValue::ExternalLinkage);
- FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
- F->getAddressSpace(), Name, &M);
- FDecl->setVisibility(Visibility);
- Visibility = GlobalValue::HiddenVisibility;
-
- // Delete aliases pointing to this function, they'll be re-created in the
- // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed
- // will want to reset the aliasees first.
- for (auto &U : F->uses()) {
- if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
- Function *AliasDecl = Function::Create(
- F->getFunctionType(), GlobalValue::ExternalLinkage,
- F->getAddressSpace(), "", &M);
- AliasDecl->takeName(A);
- A->replaceAllUsesWith(AliasDecl);
- AliasesToErase.push_back(A);
- }
- }
- }
-
- if (F->hasExternalWeakLinkage())
- replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isJumpTableCanonical);
- else
- replaceCfiUses(F, FDecl, isJumpTableCanonical);
-
- // Set visibility late because it's used in replaceCfiUses() to determine
-  // whether uses need to be replaced.
- F->setVisibility(Visibility);
-}
-
-void LowerTypeTestsModule::lowerTypeTestCalls(
- ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
- CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy);
-
- // For each type identifier in this disjoint set...
- for (Metadata *TypeId : TypeIds) {
- // Build the bitset.
- BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
- LLVM_DEBUG({
- if (auto MDS = dyn_cast<MDString>(TypeId))
- dbgs() << MDS->getString() << ": ";
- else
- dbgs() << "<unnamed>: ";
- BSI.print(dbgs());
- });
-
- ByteArrayInfo *BAI = nullptr;
- TypeIdLowering TIL;
- TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
- Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
- TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
- TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1);
- if (BSI.isAllOnes()) {
- TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
- : TypeTestResolution::AllOnes;
- } else if (BSI.BitSize <= 64) {
- TIL.TheKind = TypeTestResolution::Inline;
- uint64_t InlineBits = 0;
- for (auto Bit : BSI.Bits)
- InlineBits |= uint64_t(1) << Bit;
- if (InlineBits == 0)
- TIL.TheKind = TypeTestResolution::Unsat;
- else
- TIL.InlineBits = ConstantInt::get(
- (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
- } else {
- TIL.TheKind = TypeTestResolution::ByteArray;
- ++NumByteArraysCreated;
- BAI = createByteArray(BSI);
- TIL.TheByteArray = BAI->ByteArray;
- TIL.BitMask = BAI->MaskGlobal;
- }
-
- TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
-
- if (TIUI.IsExported) {
- uint8_t *MaskPtr = exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
- if (BAI)
- BAI->MaskPtr = MaskPtr;
- }
-
- // Lower each call to llvm.type.test for this type identifier.
- for (CallInst *CI : TIUI.CallSites) {
- ++NumTypeTestCallsLowered;
- Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
- if (Lowered) {
- CI->replaceAllUsesWith(Lowered);
- CI->eraseFromParent();
- }
- }
- }
-}
-
-void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
- if (Type->getNumOperands() != 2)
- report_fatal_error("All operands of type metadata must have 2 elements");
-
- if (GO->isThreadLocal())
- report_fatal_error("Bit set element may not be thread-local");
- if (isa<GlobalVariable>(GO) && GO->hasSection())
- report_fatal_error(
- "A member of a type identifier may not have an explicit section");
-
- // FIXME: We previously checked that global var member of a type identifier
- // must be a definition, but the IR linker may leave type metadata on
- // declarations. We should restore this check after fixing PR31759.
-
- auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
- if (!OffsetConstMD)
- report_fatal_error("Type offset must be a constant");
- auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue());
- if (!OffsetInt)
- report_fatal_error("Type offset must be an integer constant");
-}
-
-static const unsigned kX86JumpTableEntrySize = 8;
-static const unsigned kARMJumpTableEntrySize = 4;
+ OldFragment.clear();
+ }
+ }
+
+ // Update the fragment map to point our object indices to this fragment.
+ for (uint64_t ObjIndex : Fragment)
+ FragmentMap[ObjIndex] = FragmentIndex;
+}
+
+void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits,
+ uint64_t BitSize, uint64_t &AllocByteOffset,
+ uint8_t &AllocMask) {
+ // Find the smallest current allocation.
+ unsigned Bit = 0;
+ for (unsigned I = 1; I != BitsPerByte; ++I)
+ if (BitAllocs[I] < BitAllocs[Bit])
+ Bit = I;
+
+ AllocByteOffset = BitAllocs[Bit];
+
+ // Add our size to it.
+ unsigned ReqSize = AllocByteOffset + BitSize;
+ BitAllocs[Bit] = ReqSize;
+ if (Bytes.size() < ReqSize)
+ Bytes.resize(ReqSize);
+
+ // Set our bits.
+ AllocMask = 1 << Bit;
+ for (uint64_t B : Bits)
+ Bytes[AllocByteOffset + B] |= AllocMask;
+}
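
A standalone sketch (plain C++, not part of the pass; every name below is made up) of the allocation scheme above: up to eight bit sets share one byte array, each set claims the bit lane whose running byte count is currently smallest, and membership is later tested as (Bytes[ByteOffset + BitIndex] & Mask) != 0, which is what createBitSetTest() emits further down.

#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Miniature ByteArrayBuilder: 8 independent bit lanes share one byte vector.
struct TinyByteArrayBuilder {
  std::vector<uint8_t> Bytes;
  uint64_t BitAllocs[8] = {};

  void allocate(const std::set<uint64_t> &Bits, uint64_t BitSize,
                uint64_t &ByteOffset, uint8_t &Mask) {
    unsigned Lane = 0;
    for (unsigned I = 1; I != 8; ++I) // lane with the smallest allocation wins
      if (BitAllocs[I] < BitAllocs[Lane])
        Lane = I;
    ByteOffset = BitAllocs[Lane];
    BitAllocs[Lane] = ByteOffset + BitSize;
    if (Bytes.size() < BitAllocs[Lane])
      Bytes.resize(BitAllocs[Lane]);
    Mask = uint8_t(1) << Lane;
    for (uint64_t B : Bits)
      Bytes[ByteOffset + B] |= Mask; // mark the members of this bit set
  }
};

int main() {
  TinyByteArrayBuilder BAB;
  uint64_t Off1, Off2;
  uint8_t Mask1, Mask2;
  BAB.allocate({0, 2, 5}, 6, Off1, Mask1); // first set -> lane 0, mask 0x01
  BAB.allocate({1, 3}, 4, Off2, Mask2);    // second set -> lane 1, mask 0x02
  assert((BAB.Bytes[Off1 + 2] & Mask1) != 0); // offset 2 is in the first set
  assert((BAB.Bytes[Off2 + 2] & Mask2) == 0); // but not in the second one
  return 0;
}
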
+
+bool lowertypetests::isJumpTableCanonical(Function *F) {
+ if (F->isDeclarationForLinker())
+ return false;
+ auto *CI = mdconst::extract_or_null<ConstantInt>(
+ F->getParent()->getModuleFlag("CFI Canonical Jump Tables"));
+ if (!CI || CI->getZExtValue() != 0)
+ return true;
+ return F->hasFnAttribute("cfi-canonical-jump-table");
+}
+
+namespace {
+
+struct ByteArrayInfo {
+ std::set<uint64_t> Bits;
+ uint64_t BitSize;
+ GlobalVariable *ByteArray;
+ GlobalVariable *MaskGlobal;
+ uint8_t *MaskPtr = nullptr;
+};
+
+/// A POD-like structure that we use to store a global reference together with
+/// its metadata types. In this pass we frequently need to query the set of
+/// metadata types referenced by a global, which at the IR level is an expensive
+/// operation involving a map lookup; this data structure helps to reduce the
+/// number of times we need to do this lookup.
+class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
+ friend TrailingObjects;
+
+ GlobalObject *GO;
+ size_t NTypes;
+
+ // For functions: true if the jump table is canonical. This essentially means
+ // whether the canonical address (i.e. the symbol table entry) of the function
+ // is provided by the local jump table. This is normally the same as whether
+ // the function is defined locally, but if canonical jump tables are disabled
+ // by the user then the jump table never provides a canonical definition.
+ bool IsJumpTableCanonical;
+
+ // For functions: true if this function is either defined or used in a thinlto
+ // module and its jumptable entry needs to be exported to thinlto backends.
+ bool IsExported;
+
+ size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
+
+public:
+ static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+ bool IsJumpTableCanonical, bool IsExported,
+ ArrayRef<MDNode *> Types) {
+ auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
+ totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
+ GTM->GO = GO;
+ GTM->NTypes = Types.size();
+ GTM->IsJumpTableCanonical = IsJumpTableCanonical;
+ GTM->IsExported = IsExported;
+ std::uninitialized_copy(Types.begin(), Types.end(),
+ GTM->getTrailingObjects<MDNode *>());
+ return GTM;
+ }
+
+ GlobalObject *getGlobal() const {
+ return GO;
+ }
+
+ bool isJumpTableCanonical() const {
+ return IsJumpTableCanonical;
+ }
+
+ bool isExported() const {
+ return IsExported;
+ }
+
+ ArrayRef<MDNode *> types() const {
+ return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
+ }
+};
+
+struct ICallBranchFunnel final
+ : TrailingObjects<ICallBranchFunnel, GlobalTypeMember *> {
+ static ICallBranchFunnel *create(BumpPtrAllocator &Alloc, CallInst *CI,
+ ArrayRef<GlobalTypeMember *> Targets,
+ unsigned UniqueId) {
+ auto *Call = static_cast<ICallBranchFunnel *>(
+ Alloc.Allocate(totalSizeToAlloc<GlobalTypeMember *>(Targets.size()),
+ alignof(ICallBranchFunnel)));
+ Call->CI = CI;
+ Call->UniqueId = UniqueId;
+ Call->NTargets = Targets.size();
+ std::uninitialized_copy(Targets.begin(), Targets.end(),
+ Call->getTrailingObjects<GlobalTypeMember *>());
+ return Call;
+ }
+
+ CallInst *CI;
+ ArrayRef<GlobalTypeMember *> targets() const {
+ return makeArrayRef(getTrailingObjects<GlobalTypeMember *>(), NTargets);
+ }
+
+ unsigned UniqueId;
+
+private:
+ size_t NTargets;
+};
+
+struct ScopedSaveAliaseesAndUsed {
+ Module &M;
+ SmallPtrSet<GlobalValue *, 16> Used, CompilerUsed;
+ std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases;
+
+ ScopedSaveAliaseesAndUsed(Module &M) : M(M) {
+ // The users of this class want to replace all function references except
+ // for aliases and llvm.used/llvm.compiler.used with references to a jump
+ // table. We avoid replacing aliases in order to avoid introducing a double
+ // indirection (or an alias pointing to a declaration in ThinLTO mode), and
+ // we avoid replacing llvm.used/llvm.compiler.used because these global
+ // variables describe properties of the global, not the jump table (besides,
+    // offset references to the jump table in llvm.used are invalid).
+ // Unfortunately, LLVM doesn't have a "RAUW except for these (possibly
+ // indirect) users", so what we do is save the list of globals referenced by
+ // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW
+ // replace the aliasees and then set them back to their original values at
+ // the end.
+ if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false))
+ GV->eraseFromParent();
+ if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true))
+ GV->eraseFromParent();
+
+ for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) {
+ // FIXME: This should look past all aliases not just interposable ones,
+ // see discussion on D65118.
+ if (auto *F =
+ dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts()))
+ FunctionAliases.push_back({&GIS, F});
+ }
+ }
+
+ ~ScopedSaveAliaseesAndUsed() {
+ appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end()));
+ appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(),
+ CompilerUsed.end()));
+
+ for (auto P : FunctionAliases)
+ P.first->setIndirectSymbol(
+ ConstantExpr::getBitCast(P.second, P.first->getType()));
+ }
+};
+
+class LowerTypeTestsModule {
+ Module &M;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+ // Set when the client has invoked this to simply drop all type test assume
+ // sequences.
+ bool DropTypeTests;
+
+ Triple::ArchType Arch;
+ Triple::OSType OS;
+ Triple::ObjectFormatType ObjectFormat;
+
+ IntegerType *Int1Ty = Type::getInt1Ty(M.getContext());
+ IntegerType *Int8Ty = Type::getInt8Ty(M.getContext());
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+ ArrayType *Int8Arr0Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), 0);
+ IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
+ PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty);
+ IntegerType *Int64Ty = Type::getInt64Ty(M.getContext());
+ IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext(), 0);
+
+ // Indirect function call index assignment counter for WebAssembly
+ uint64_t IndirectIndex = 1;
+
+ // Mapping from type identifiers to the call sites that test them, as well as
+ // whether the type identifier needs to be exported to ThinLTO backends as
+ // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId).
+ struct TypeIdUserInfo {
+ std::vector<CallInst *> CallSites;
+ bool IsExported = false;
+ };
+ DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers;
+
+ /// This structure describes how to lower type tests for a particular type
+ /// identifier. It is either built directly from the global analysis (during
+ /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
+ /// identifier summaries and external symbol references (in ThinLTO backends).
+ struct TypeIdLowering {
+ TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
+
+ /// All except Unsat: the start address within the combined global.
+ Constant *OffsetedGlobal;
+
+ /// ByteArray, Inline, AllOnes: log2 of the required global alignment
+ /// relative to the start address.
+ Constant *AlignLog2;
+
+ /// ByteArray, Inline, AllOnes: one less than the size of the memory region
+ /// covering members of this type identifier as a multiple of 2^AlignLog2.
+ Constant *SizeM1;
+
+ /// ByteArray: the byte array to test the address against.
+ Constant *TheByteArray;
+
+ /// ByteArray: the bit mask to apply to bytes loaded from the byte array.
+ Constant *BitMask;
+
+ /// Inline: the bit mask to test the address against.
+ Constant *InlineBits;
+ };
+
+ std::vector<ByteArrayInfo> ByteArrayInfos;
+
+ Function *WeakInitializerFn = nullptr;
+
+ bool shouldExportConstantsAsAbsoluteSymbols();
+ uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
+ TypeIdLowering importTypeId(StringRef TypeId);
+ void importTypeTest(CallInst *CI);
+ void importFunction(Function *F, bool isJumpTableCanonical,
+ std::vector<GlobalAlias *> &AliasesToErase);
+
+ BitSetInfo
+ buildBitSet(Metadata *TypeId,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
+ ByteArrayInfo *createByteArray(BitSetInfo &BSI);
+ void allocateByteArrays();
+ Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
+ Value *BitOffset);
+ void lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
+ Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL);
+
+ void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals);
+ unsigned getJumpTableEntrySize();
+ Type *getJumpTableEntryType();
+ void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ Triple::ArchType JumpTableArch,
+ SmallVectorImpl<Value *> &AsmArgs, Function *Dest);
+ void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
+ void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void
+ buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels);
+
+ void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT,
+ bool IsJumpTableCanonical);
+ void moveInitializerToModuleConstructor(GlobalVariable *GV);
+ void findGlobalVariableUsersOf(Constant *C,
+ SmallSetVector<GlobalVariable *, 8> &Out);
+
+ void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
+
+ /// replaceCfiUses - Go through the uses list for this definition
+ /// and make each use point to "V" instead of "this" when the use is outside
+ /// the block. 'This's use list is expected to have at least one element.
+ /// Unlike replaceAllUsesWith this function skips blockaddr and direct call
+ /// uses.
+ void replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical);
+
+ /// replaceDirectCalls - Go through the uses list for this definition and
+ /// replace each use, which is a direct function call.
+ void replaceDirectCalls(Value *Old, Value *New);
+
+public:
+ LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary,
+ bool DropTypeTests);
+
+ bool lower();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M);
+};
+
+struct LowerTypeTests : public ModulePass {
+ static char ID;
+
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+ bool DropTypeTests;
+
+ LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ LowerTypeTests(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary), DropTypeTests(DropTypeTests) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (UseCommandLine)
+ return LowerTypeTestsModule::runForTesting(M);
+ return LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
+ .lower();
+ }
+};
+
+} // end anonymous namespace
+
+char LowerTypeTests::ID = 0;
+
+INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
+ false)
+
+ModulePass *
+llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary,
+ bool DropTypeTests) {
+ return new LowerTypeTests(ExportSummary, ImportSummary, DropTypeTests);
+}
+
+/// Build a bit set for TypeId using the object layouts in
+/// GlobalLayout.
+BitSetInfo LowerTypeTestsModule::buildBitSet(
+ Metadata *TypeId,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+ BitSetBuilder BSB;
+
+ // Compute the byte offset of each address associated with this type
+ // identifier.
+ for (auto &GlobalAndOffset : GlobalLayout) {
+ for (MDNode *Type : GlobalAndOffset.first->types()) {
+ if (Type->getOperand(1) != TypeId)
+ continue;
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ BSB.addOffset(GlobalAndOffset.second + Offset);
+ }
+ }
+
+ return BSB.build();
+}
+
+/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in
+/// Bits. This pattern matches to the bt instruction on x86.
+static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
+ Value *BitOffset) {
+ auto BitsType = cast<IntegerType>(Bits->getType());
+ unsigned BitWidth = BitsType->getBitWidth();
+
+ BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType);
+ Value *BitIndex =
+ B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1));
+ Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex);
+ Value *MaskedBits = B.CreateAnd(Bits, BitMask);
+ return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
+}
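
A scalar sketch (illustrative only; the helper below is not part of the pass) of what the IR built by createMaskedBitTest computes; on x86 the shift/and/compare sequence typically lowers to a single bt instruction.

#include <cassert>
#include <cstdint>

// Test bit (BitOffset mod BitWidth) of the inline constant Bits.
static bool maskedBitTest(uint64_t Bits, uint64_t BitOffset) {
  const unsigned BitWidth = 64; // width of the Bits constant
  uint64_t BitIndex = BitOffset & (BitWidth - 1);
  uint64_t BitMask = uint64_t(1) << BitIndex;
  return (Bits & BitMask) != 0;
}

int main() {
  uint64_t Bits = 0b1011; // members at bit offsets 0, 1 and 3
  assert(maskedBitTest(Bits, 1));
  assert(!maskedBitTest(Bits, 2));
  assert(maskedBitTest(Bits, 3 + 64)); // offsets wrap modulo the bit width
  return 0;
}
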
+
+ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
+ // Create globals to stand in for byte arrays and masks. These never actually
+ // get initialized, we RAUW and erase them later in allocateByteArrays() once
+ // we know the offset and mask to use.
+ auto ByteArrayGlobal = new GlobalVariable(
+ M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
+ auto MaskGlobal = new GlobalVariable(M, Int8Ty, /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, nullptr);
+
+ ByteArrayInfos.emplace_back();
+ ByteArrayInfo *BAI = &ByteArrayInfos.back();
+
+ BAI->Bits = BSI.Bits;
+ BAI->BitSize = BSI.BitSize;
+ BAI->ByteArray = ByteArrayGlobal;
+ BAI->MaskGlobal = MaskGlobal;
+ return BAI;
+}
+
+void LowerTypeTestsModule::allocateByteArrays() {
+ llvm::stable_sort(ByteArrayInfos,
+ [](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
+ return BAI1.BitSize > BAI2.BitSize;
+ });
+
+ std::vector<uint64_t> ByteArrayOffsets(ByteArrayInfos.size());
+
+ ByteArrayBuilder BAB;
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ uint8_t Mask;
+ BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask);
+
+ BAI->MaskGlobal->replaceAllUsesWith(
+ ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy));
+ BAI->MaskGlobal->eraseFromParent();
+ if (BAI->MaskPtr)
+ *BAI->MaskPtr = Mask;
+ }
+
+ Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes);
+ auto ByteArray =
+ new GlobalVariable(M, ByteArrayConst->getType(), /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, ByteArrayConst);
+
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ Constant *Idxs[] = {ConstantInt::get(IntPtrTy, 0),
+ ConstantInt::get(IntPtrTy, ByteArrayOffsets[I])};
+ Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(
+ ByteArrayConst->getType(), ByteArray, Idxs);
+
+ // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures
+ // that the pc-relative displacement is folded into the lea instead of the
+ // test instruction getting another displacement.
+ GlobalAlias *Alias = GlobalAlias::create(
+ Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
+ BAI->ByteArray->replaceAllUsesWith(Alias);
+ BAI->ByteArray->eraseFromParent();
+ }
+
+ ByteArraySizeBits = BAB.BitAllocs[0] + BAB.BitAllocs[1] + BAB.BitAllocs[2] +
+ BAB.BitAllocs[3] + BAB.BitAllocs[4] + BAB.BitAllocs[5] +
+ BAB.BitAllocs[6] + BAB.BitAllocs[7];
+ ByteArraySizeBytes = BAB.Bytes.size();
+}
+
+/// Build a test that bit BitOffset is set in the type identifier that was
+/// lowered to TIL, which must be either an Inline or a ByteArray.
+Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
+ const TypeIdLowering &TIL,
+ Value *BitOffset) {
+ if (TIL.TheKind == TypeTestResolution::Inline) {
+ // If the bit set is sufficiently small, we can avoid a load by bit testing
+ // a constant.
+ return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
+ } else {
+ Constant *ByteArray = TIL.TheByteArray;
+ if (AvoidReuse && !ImportSummary) {
+ // Each use of the byte array uses a different alias. This makes the
+ // backend less likely to reuse previously computed byte array addresses,
+ // improving the security of the CFI mechanism based on this pass.
+ // This won't work when importing because TheByteArray is external.
+ ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
+ "bits_use", ByteArray, &M);
+ }
+
+ Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset);
+ Value *Byte = B.CreateLoad(Int8Ty, ByteAddr);
+
+ Value *ByteAndMask =
+ B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty));
+ return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0));
+ }
+}
+
+static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL,
+ Value *V, uint64_t COffset) {
+ if (auto GV = dyn_cast<GlobalObject>(V)) {
+ SmallVector<MDNode *, 2> Types;
+ GV->getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ if (Type->getOperand(1) != TypeId)
+ continue;
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ if (COffset == Offset)
+ return true;
+ }
+ return false;
+ }
+
+ if (auto GEP = dyn_cast<GEPOperator>(V)) {
+ APInt APOffset(DL.getPointerSizeInBits(0), 0);
+ bool Result = GEP->accumulateConstantOffset(DL, APOffset);
+ if (!Result)
+ return false;
+ COffset += APOffset.getZExtValue();
+ return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset);
+ }
+
+ if (auto Op = dyn_cast<Operator>(V)) {
+ if (Op->getOpcode() == Instruction::BitCast)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset);
+
+ if (Op->getOpcode() == Instruction::Select)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) &&
+ isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset);
+ }
+
+ return false;
+}
+
+/// Lower a llvm.type.test call to its implementation. Returns the value to
+/// replace the call with.
+Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL) {
+ // Delay lowering if the resolution is currently unknown.
+ if (TIL.TheKind == TypeTestResolution::Unknown)
+ return nullptr;
+ if (TIL.TheKind == TypeTestResolution::Unsat)
+ return ConstantInt::getFalse(M.getContext());
+
+ Value *Ptr = CI->getArgOperand(0);
+ const DataLayout &DL = M.getDataLayout();
+ if (isKnownTypeIdMember(TypeId, DL, Ptr, 0))
+ return ConstantInt::getTrue(M.getContext());
+
+ BasicBlock *InitialBB = CI->getParent();
+
+ IRBuilder<> B(CI);
+
+ Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
+
+ Constant *OffsetedGlobalAsInt =
+ ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy);
+ if (TIL.TheKind == TypeTestResolution::Single)
+ return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
+
+ Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
+
+ // We need to check that the offset both falls within our range and is
+ // suitably aligned. We can check both properties at the same time by
+ // performing a right rotate by log2(alignment) followed by an integer
+ // comparison against the bitset size. The rotate will move the lower
+ // order bits that need to be zero into the higher order bits of the
+ // result, causing the comparison to fail if they are nonzero. The rotate
+ // also conveniently gives us a bit offset to use during the load from
+ // the bitset.
+ Value *OffsetSHR =
+ B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy));
+ Value *OffsetSHL = B.CreateShl(
+ PtrOffset, ConstantExpr::getZExt(
+ ConstantExpr::getSub(
+ ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)),
+ TIL.AlignLog2),
+ IntPtrTy));
+ Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
+
+ Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1);
+
+ // If the bit set is all ones, testing against it is unnecessary.
+ if (TIL.TheKind == TypeTestResolution::AllOnes)
+ return OffsetInRange;
+
+ // See if the intrinsic is used in the following common pattern:
+ // br(llvm.type.test(...), thenbb, elsebb)
+ // where nothing happens between the type test and the br.
+ // If so, create slightly simpler IR.
+ if (CI->hasOneUse())
+ if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin()))
+ if (CI->getNextNode() == Br) {
+ BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator());
+ BasicBlock *Else = Br->getSuccessor(1);
+ BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange);
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ Br->getMetadata(LLVMContext::MD_prof));
+ ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
+
+ // Update phis in Else resulting from InitialBB being split
+ for (auto &Phi : Else->phis())
+ Phi.addIncoming(Phi.getIncomingValueForBlock(Then), InitialBB);
+
+ IRBuilder<> ThenB(CI);
+ return createBitSetTest(ThenB, TIL, BitOffset);
+ }
+
+ IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false));
+
+ // Now that we know that the offset is in range and aligned, load the
+ // appropriate bit from the bitset.
+ Value *Bit = createBitSetTest(ThenB, TIL, BitOffset);
+
+ // The value we want is 0 if we came directly from the initial block
+ // (having failed the range or alignment checks), or the loaded bit if
+ // we came from the block in which we loaded it.
+ B.SetInsertPoint(CI);
+ PHINode *P = B.CreatePHI(Int1Ty, 2);
+ P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
+ P->addIncoming(Bit, ThenB.GetInsertBlock());
+ return P;
+}
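
The rotate trick described in the comments above can be checked with a small standalone model (illustrative only; offsetInRange and its parameters are stand-ins for the IR values built by lowerTypeTestCall, a 64-bit pointer width is assumed, and AlignLog2 is assumed to be strictly between 0 and the pointer width so that both shifts are well-defined).

#include <cassert>
#include <cstdint>

// Scalar model of the combined range-and-alignment check. AlignLog2 and
// SizeM1 correspond to TIL.AlignLog2 and TIL.SizeM1.
static bool offsetInRange(uint64_t PtrOffset, unsigned AlignLog2,
                          uint64_t SizeM1, uint64_t &BitOffset) {
  const unsigned PtrBits = 64;
  // Right rotate by AlignLog2: any misaligned low bits land in the high bits,
  // so the unsigned comparison against SizeM1 fails for misaligned pointers.
  BitOffset =
      (PtrOffset >> AlignLog2) | (PtrOffset << (PtrBits - AlignLog2));
  return BitOffset <= SizeM1;
}

int main() {
  uint64_t Bit;
  // Four 8-byte aligned members: AlignLog2 = 3, SizeM1 = 3.
  assert(offsetInRange(16, 3, 3, Bit) && Bit == 2); // aligned and in range
  assert(!offsetInRange(40, 3, 3, Bit));            // aligned but out of range
  assert(!offsetInRange(12, 3, 3, Bit));            // misaligned offset
  return 0;
}
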
+
+/// Given a disjoint set of type identifiers and globals, lay out the globals,
+/// build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
+ // Build a new global with the combined contents of the referenced globals.
+ // This global is a struct whose even-indexed elements contain the original
+ // contents of the referenced globals and whose odd-indexed elements contain
+ // any padding required to align the next element to the next power of 2 plus
+ // any additional padding required to meet its alignment requirements.
+ std::vector<Constant *> GlobalInits;
+ const DataLayout &DL = M.getDataLayout();
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+ Align MaxAlign;
+ uint64_t CurOffset = 0;
+ uint64_t DesiredPadding = 0;
+ for (GlobalTypeMember *G : Globals) {
+ auto *GV = cast<GlobalVariable>(G->getGlobal());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+ MaxAlign = std::max(MaxAlign, Alignment);
+ uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, Alignment);
+ GlobalLayout[G] = GVOffset;
+ if (GVOffset != 0) {
+ uint64_t Padding = GVOffset - CurOffset;
+ GlobalInits.push_back(
+ ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
+ }
+
+ GlobalInits.push_back(GV->getInitializer());
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
+ CurOffset = GVOffset + InitSize;
+
+ // Compute the amount of padding that we'd like for the next element.
+ DesiredPadding = NextPowerOf2(InitSize - 1) - InitSize;
+
+    // Experiments with different caps, using Chromium on both x64 and ARM64,
+    // have shown that the 32-byte cap generates the smallest binary on both
+    // platforms, while the various caps yield similar performance.
+ // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html)
+ if (DesiredPadding > 32)
+ DesiredPadding = alignTo(InitSize, 32) - InitSize;
+ }
+
+ Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
+ auto *CombinedGlobal =
+ new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, NewInit);
+ CombinedGlobal->setAlignment(MaxAlign);
+
+ StructType *NewTy = cast<StructType>(NewInit->getType());
+ lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout);
+
+ // Build aliases pointing to offsets into the combined global for each
+ // global from which we built the combined global, and replace references
+ // to the original globals with references to the aliases.
+ for (unsigned I = 0; I != Globals.size(); ++I) {
+ GlobalVariable *GV = cast<GlobalVariable>(Globals[I]->getGlobal());
+
+ // Multiply by 2 to account for padding elements.
+ Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, I * 2)};
+ Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
+ NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
+ assert(GV->getType()->getAddressSpace() == 0);
+ GlobalAlias *GAlias =
+ GlobalAlias::create(NewTy->getElementType(I * 2), 0, GV->getLinkage(),
+ "", CombinedGlobalElemPtr, &M);
+ GAlias->setVisibility(GV->getVisibility());
+ GAlias->takeName(GV);
+ GV->replaceAllUsesWith(GAlias);
+ GV->eraseFromParent();
+ }
+}
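
The layout arithmetic in the loop above can be traced with a short standalone example (made-up sizes and alignments; alignTo and nextPowerOf2 below imitate the LLVM helpers used by the pass).

#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}
static uint64_t nextPowerOf2(uint64_t A) { // smallest power of 2 > A
  uint64_t P = 1;
  while (P <= A)
    P <<= 1;
  return P;
}

int main() {
  // Hypothetical globals as {size, alignment} pairs.
  struct { uint64_t Size, Align; } Globals[] = {{4, 4}, {24, 8}, {1, 1}};
  uint64_t CurOffset = 0, DesiredPadding = 0, Offsets[3];
  for (int I = 0; I != 3; ++I) {
    uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, Globals[I].Align);
    Offsets[I] = GVOffset;
    CurOffset = GVOffset + Globals[I].Size;
    DesiredPadding = nextPowerOf2(Globals[I].Size - 1) - Globals[I].Size;
    if (DesiredPadding > 32) // cap chosen from the Chromium experiments
      DesiredPadding = alignTo(Globals[I].Size, 32) - Globals[I].Size;
  }
  assert(Offsets[0] == 0);  // first global, already aligned
  assert(Offsets[1] == 8);  // 8-byte alignment right after the 4-byte global
  assert(Offsets[2] == 40); // the 24-byte global asked for 8 bytes of padding
  return 0;
}
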
+
+bool LowerTypeTestsModule::shouldExportConstantsAsAbsoluteSymbols() {
+ return (Arch == Triple::x86 || Arch == Triple::x86_64) &&
+ ObjectFormat == Triple::ELF;
+}
+
+/// Export the given type identifier so that ThinLTO backends may import it.
+/// Type identifiers are exported by adding coarse-grained information about how
+/// to test the type identifier to the summary, and creating symbols in the
+/// object file (aliases and absolute symbols) containing fine-grained
+/// information about the type identifier.
+///
+/// Returns a pointer to the location in which to store the bitmask, if
+/// applicable.
+uint8_t *LowerTypeTestsModule::exportTypeId(StringRef TypeId,
+ const TypeIdLowering &TIL) {
+ TypeTestResolution &TTRes =
+ ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
+ TTRes.TheKind = TIL.TheKind;
+
+ auto ExportGlobal = [&](StringRef Name, Constant *C) {
+ GlobalAlias *GA =
+ GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ "__typeid_" + TypeId + "_" + Name, C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+ };
+
+ auto ExportConstant = [&](StringRef Name, uint64_t &Storage, Constant *C) {
+ if (shouldExportConstantsAsAbsoluteSymbols())
+ ExportGlobal(Name, ConstantExpr::getIntToPtr(C, Int8PtrTy));
+ else
+ Storage = cast<ConstantInt>(C)->getZExtValue();
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ ExportGlobal("global_addr", TIL.OffsetedGlobal);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ ExportConstant("align", TTRes.AlignLog2, TIL.AlignLog2);
+ ExportConstant("size_m1", TTRes.SizeM1, TIL.SizeM1);
+
+ uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6;
+ else
+ TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32;
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ ExportGlobal("byte_array", TIL.TheByteArray);
+ if (shouldExportConstantsAsAbsoluteSymbols())
+ ExportGlobal("bit_mask", TIL.BitMask);
+ else
+ return &TTRes.BitMask;
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ ExportConstant("inline_bits", TTRes.InlineBits, TIL.InlineBits);
+
+ return nullptr;
+}
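
Putting the naming scheme together: a ThinLTO backend importing this type identifier looks for hidden symbols of the form "__typeid_<TypeId>_<field>". The sketch below is illustrative only; "_ZTS1A" is an arbitrary example type id, and which fields actually exist depends on TTRes.TheKind and on shouldExportConstantsAsAbsoluteSymbols().

#include <iostream>
#include <string>

// Compose the symbol name exportTypeId() creates for one field.
static std::string exportedSymbolName(const std::string &TypeId,
                                      const std::string &Field) {
  return "__typeid_" + TypeId + "_" + Field;
}

int main() {
  const char *Fields[] = {"global_addr", "align",    "size_m1",
                          "byte_array",  "bit_mask", "inline_bits"};
  for (const char *F : Fields) // prints e.g. __typeid__ZTS1A_global_addr
    std::cout << exportedSymbolName("_ZTS1A", F) << "\n";
  return 0;
}
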
+
+LowerTypeTestsModule::TypeIdLowering
+LowerTypeTestsModule::importTypeId(StringRef TypeId) {
+ const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
+ if (!TidSummary)
+ return {}; // Unsat: no globals match this type id.
+ const TypeTestResolution &TTRes = TidSummary->TTRes;
+
+ TypeIdLowering TIL;
+ TIL.TheKind = TTRes.TheKind;
+
+ auto ImportGlobal = [&](StringRef Name) {
+ // Give the global a type of length 0 so that it is not assumed not to alias
+ // with any other global.
+ Constant *C = M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(),
+ Int8Arr0Ty);
+ if (auto *GV = dyn_cast<GlobalVariable>(C))
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ C = ConstantExpr::getBitCast(C, Int8PtrTy);
+ return C;
+ };
+
+ auto ImportConstant = [&](StringRef Name, uint64_t Const, unsigned AbsWidth,
+ Type *Ty) {
+ if (!shouldExportConstantsAsAbsoluteSymbols()) {
+ Constant *C =
+ ConstantInt::get(isa<IntegerType>(Ty) ? Ty : Int64Ty, Const);
+ if (!isa<IntegerType>(Ty))
+ C = ConstantExpr::getIntToPtr(C, Ty);
+ return C;
+ }
+
+ Constant *C = ImportGlobal(Name);
+ auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
+ if (isa<IntegerType>(Ty))
+ C = ConstantExpr::getPtrToInt(C, Ty);
+ if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
+ return C;
+
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else
+ SetAbsRange(0, 1ull << AbsWidth);
+ return C;
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ TIL.OffsetedGlobal = ImportGlobal("global_addr");
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ TIL.AlignLog2 = ImportConstant("align", TTRes.AlignLog2, 8, Int8Ty);
+ TIL.SizeM1 =
+ ImportConstant("size_m1", TTRes.SizeM1, TTRes.SizeM1BitWidth, IntPtrTy);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ TIL.TheByteArray = ImportGlobal("byte_array");
+ TIL.BitMask = ImportConstant("bit_mask", TTRes.BitMask, 8, Int8PtrTy);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TIL.InlineBits = ImportConstant(
+ "inline_bits", TTRes.InlineBits, 1 << TTRes.SizeM1BitWidth,
+ TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
+
+ return TIL;
+}
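
The !absolute_symbol ranges attached by SetAbsRange above can be summarized with a small model (illustrative only; a 64-bit pointer width is assumed): an AbsWidth equal to the pointer width is encoded as the {-1, -1} "full set" sentinel, anything narrower as the half-open range [0, 2^AbsWidth).

#include <cassert>
#include <cstdint>
#include <utility>

// Model of the metadata range for an imported constant of the given width.
static std::pair<uint64_t, uint64_t> absSymbolRange(unsigned AbsWidth) {
  const unsigned PtrBits = 64;  // assumed pointer width
  if (AbsWidth == PtrBits)
    return {~0ull, ~0ull};      // sentinel meaning "full set"
  return {0, 1ull << AbsWidth}; // [0, 2^AbsWidth)
}

int main() {
  auto R = absSymbolRange(8); // e.g. the 8-bit "align" constant
  assert(R.first == 0 && R.second == 256);
  assert(absSymbolRange(64).first == ~0ull); // full pointer width
  return 0;
}
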
+
+void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+
+ auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
+ // If this is a local unpromoted type, which doesn't have a metadata string,
+ // treat as Unknown and delay lowering, so that we can still utilize it for
+ // later optimizations.
+ if (!TypeIdStr)
+ return;
+
+ TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
+ Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
+ if (Lowered) {
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+ }
+}
+
+// ThinLTO backend: the function F has a jump table entry; update this module
+// accordingly. isJumpTableCanonical describes the type of the jump table entry.
+void LowerTypeTestsModule::importFunction(
+ Function *F, bool isJumpTableCanonical,
+ std::vector<GlobalAlias *> &AliasesToErase) {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ GlobalValue::VisibilityTypes Visibility = F->getVisibility();
+ std::string Name = std::string(F->getName());
+
+ if (F->isDeclarationForLinker() && isJumpTableCanonical) {
+    // Non-dso_local functions may be overridden at run time,
+    // so don't short-circuit them.
+ if (F->isDSOLocal()) {
+ Function *RealF = Function::Create(F->getFunctionType(),
+ GlobalValue::ExternalLinkage,
+ F->getAddressSpace(),
+ Name + ".cfi", &M);
+ RealF->setVisibility(GlobalVariable::HiddenVisibility);
+ replaceDirectCalls(F, RealF);
+ }
+ return;
+ }
+
+ Function *FDecl;
+ if (!isJumpTableCanonical) {
+ // Either a declaration of an external function or a reference to a locally
+ // defined jump table.
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ F->getAddressSpace(), Name + ".cfi_jt", &M);
+ FDecl->setVisibility(GlobalValue::HiddenVisibility);
+ } else {
+ F->setName(Name + ".cfi");
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ F->getAddressSpace(), Name, &M);
+ FDecl->setVisibility(Visibility);
+ Visibility = GlobalValue::HiddenVisibility;
+
+ // Delete aliases pointing to this function, they'll be re-created in the
+ // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed
+ // will want to reset the aliasees first.
+ for (auto &U : F->uses()) {
+ if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
+ Function *AliasDecl = Function::Create(
+ F->getFunctionType(), GlobalValue::ExternalLinkage,
+ F->getAddressSpace(), "", &M);
+ AliasDecl->takeName(A);
+ A->replaceAllUsesWith(AliasDecl);
+ AliasesToErase.push_back(A);
+ }
+ }
+ }
+
+ if (F->hasExternalWeakLinkage())
+ replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isJumpTableCanonical);
+ else
+ replaceCfiUses(F, FDecl, isJumpTableCanonical);
+
+ // Set visibility late because it's used in replaceCfiUses() to determine
+  // whether uses need to be replaced.
+ F->setVisibility(Visibility);
+}
+
+void LowerTypeTestsModule::lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+ CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy);
+
+ // For each type identifier in this disjoint set...
+ for (Metadata *TypeId : TypeIds) {
+ // Build the bitset.
+ BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
+ LLVM_DEBUG({
+ if (auto MDS = dyn_cast<MDString>(TypeId))
+ dbgs() << MDS->getString() << ": ";
+ else
+ dbgs() << "<unnamed>: ";
+ BSI.print(dbgs());
+ });
+
+ ByteArrayInfo *BAI = nullptr;
+ TypeIdLowering TIL;
+ TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
+ Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
+ TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
+ TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1);
+ if (BSI.isAllOnes()) {
+ TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
+ : TypeTestResolution::AllOnes;
+ } else if (BSI.BitSize <= 64) {
+ TIL.TheKind = TypeTestResolution::Inline;
+ uint64_t InlineBits = 0;
+ for (auto Bit : BSI.Bits)
+ InlineBits |= uint64_t(1) << Bit;
+ if (InlineBits == 0)
+ TIL.TheKind = TypeTestResolution::Unsat;
+ else
+ TIL.InlineBits = ConstantInt::get(
+ (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
+ } else {
+ TIL.TheKind = TypeTestResolution::ByteArray;
+ ++NumByteArraysCreated;
+ BAI = createByteArray(BSI);
+ TIL.TheByteArray = BAI->ByteArray;
+ TIL.BitMask = BAI->MaskGlobal;
+ }
+
+ TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
+
+ if (TIUI.IsExported) {
+ uint8_t *MaskPtr = exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
+ if (BAI)
+ BAI->MaskPtr = MaskPtr;
+ }
+
+ // Lower each call to llvm.type.test for this type identifier.
+ for (CallInst *CI : TIUI.CallSites) {
+ ++NumTypeTestCallsLowered;
+ Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
+ if (Lowered) {
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+ }
+ }
+ }
+}
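
A condensed sketch (illustrative, over a plain std::set instead of the pass's BitSetInfo) of how the resolution kind is chosen above: a single member becomes Single, an all-ones set becomes AllOnes, anything that fits in 64 bits becomes Inline, and everything else falls back to a ByteArray.

#include <cassert>
#include <cstdint>
#include <set>

enum class Kind { Single, AllOnes, Inline, ByteArray, Unsat };

// Mirror of the decision tree in lowerTypeTestCalls.
static Kind chooseKind(const std::set<uint64_t> &Bits, uint64_t BitSize) {
  bool AllOnes = Bits.size() == BitSize; // every offset in range is a member
  if (AllOnes)
    return BitSize == 1 ? Kind::Single : Kind::AllOnes;
  if (BitSize <= 64) {
    uint64_t InlineBits = 0;
    for (uint64_t Bit : Bits)
      InlineBits |= uint64_t(1) << Bit;
    return InlineBits == 0 ? Kind::Unsat : Kind::Inline;
  }
  return Kind::ByteArray;
}

int main() {
  assert(chooseKind({0}, 1) == Kind::Single);         // one member only
  assert(chooseKind({0, 1, 2}, 3) == Kind::AllOnes);  // every slot occupied
  assert(chooseKind({0, 2}, 3) == Kind::Inline);      // sparse but small
  assert(chooseKind({0, 2}, 100) == Kind::ByteArray); // too wide to inline
  return 0;
}
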
+
+void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
+ if (Type->getNumOperands() != 2)
+ report_fatal_error("All operands of type metadata must have 2 elements");
+
+ if (GO->isThreadLocal())
+ report_fatal_error("Bit set element may not be thread-local");
+ if (isa<GlobalVariable>(GO) && GO->hasSection())
+ report_fatal_error(
+ "A member of a type identifier may not have an explicit section");
+
+ // FIXME: We previously checked that global var member of a type identifier
+ // must be a definition, but the IR linker may leave type metadata on
+ // declarations. We should restore this check after fixing PR31759.
+
+ auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
+ if (!OffsetConstMD)
+ report_fatal_error("Type offset must be a constant");
+ auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue());
+ if (!OffsetInt)
+ report_fatal_error("Type offset must be an integer constant");
+}
+
+static const unsigned kX86JumpTableEntrySize = 8;
+static const unsigned kARMJumpTableEntrySize = 4;
static const unsigned kARMBTIJumpTableEntrySize = 8;
-
-unsigned LowerTypeTestsModule::getJumpTableEntrySize() {
- switch (Arch) {
- case Triple::x86:
- case Triple::x86_64:
- return kX86JumpTableEntrySize;
- case Triple::arm:
- case Triple::thumb:
+
+unsigned LowerTypeTestsModule::getJumpTableEntrySize() {
+ switch (Arch) {
+ case Triple::x86:
+ case Triple::x86_64:
+ return kX86JumpTableEntrySize;
+ case Triple::arm:
+ case Triple::thumb:
return kARMJumpTableEntrySize;
- case Triple::aarch64:
+ case Triple::aarch64:
if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
M.getModuleFlag("branch-target-enforcement")))
if (BTE->getZExtValue())
return kARMBTIJumpTableEntrySize;
- return kARMJumpTableEntrySize;
- default:
- report_fatal_error("Unsupported architecture for jump tables");
- }
-}
-
-// Create a jump table entry for the target. This consists of an instruction
-// sequence containing a relative branch to Dest. Appends inline asm text,
-// constraints and arguments to AsmOS, ConstraintOS and AsmArgs.
-void LowerTypeTestsModule::createJumpTableEntry(
- raw_ostream &AsmOS, raw_ostream &ConstraintOS,
- Triple::ArchType JumpTableArch, SmallVectorImpl<Value *> &AsmArgs,
- Function *Dest) {
- unsigned ArgIndex = AsmArgs.size();
-
- if (JumpTableArch == Triple::x86 || JumpTableArch == Triple::x86_64) {
- AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n";
- AsmOS << "int3\nint3\nint3\n";
+ return kARMJumpTableEntrySize;
+ default:
+ report_fatal_error("Unsupported architecture for jump tables");
+ }
+}
+
+// Create a jump table entry for the target. This consists of an instruction
+// sequence containing a relative branch to Dest. Appends inline asm text,
+// constraints and arguments to AsmOS, ConstraintOS and AsmArgs.
+void LowerTypeTestsModule::createJumpTableEntry(
+ raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ Triple::ArchType JumpTableArch, SmallVectorImpl<Value *> &AsmArgs,
+ Function *Dest) {
+ unsigned ArgIndex = AsmArgs.size();
+
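+  // Each destination is passed through the "s" (symbolic immediate) constraint;
+  // on x86 the ${N:c} operand modifier drops the immediate prefix so the bare
+  // symbol can be used as a branch target.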
+ if (JumpTableArch == Triple::x86 || JumpTableArch == Triple::x86_64) {
+ AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n";
+ AsmOS << "int3\nint3\nint3\n";
} else if (JumpTableArch == Triple::arm) {
- AsmOS << "b $" << ArgIndex << "\n";
+ AsmOS << "b $" << ArgIndex << "\n";
} else if (JumpTableArch == Triple::aarch64) {
if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
Dest->getParent()->getModuleFlag("branch-target-enforcement")))
if (BTE->getZExtValue())
AsmOS << "bti c\n";
AsmOS << "b $" << ArgIndex << "\n";
- } else if (JumpTableArch == Triple::thumb) {
- AsmOS << "b.w $" << ArgIndex << "\n";
- } else {
- report_fatal_error("Unsupported architecture for jump tables");
- }
-
- ConstraintOS << (ArgIndex > 0 ? ",s" : "s");
- AsmArgs.push_back(Dest);
-}
-
-Type *LowerTypeTestsModule::getJumpTableEntryType() {
- return ArrayType::get(Int8Ty, getJumpTableEntrySize());
-}
-
-/// Given a disjoint set of type identifiers and functions, build the bit sets
-/// and lower the llvm.type.test calls, architecture dependently.
-void LowerTypeTestsModule::buildBitSetsFromFunctions(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
- if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm ||
- Arch == Triple::thumb || Arch == Triple::aarch64)
- buildBitSetsFromFunctionsNative(TypeIds, Functions);
- else if (Arch == Triple::wasm32 || Arch == Triple::wasm64)
- buildBitSetsFromFunctionsWASM(TypeIds, Functions);
- else
- report_fatal_error("Unsupported architecture for jump tables");
-}
-
-void LowerTypeTestsModule::moveInitializerToModuleConstructor(
- GlobalVariable *GV) {
- if (WeakInitializerFn == nullptr) {
- WeakInitializerFn = Function::Create(
- FunctionType::get(Type::getVoidTy(M.getContext()),
- /* IsVarArg */ false),
- GlobalValue::InternalLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- "__cfi_global_var_init", &M);
- BasicBlock *BB =
- BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn);
- ReturnInst::Create(M.getContext(), BB);
- WeakInitializerFn->setSection(
- ObjectFormat == Triple::MachO
- ? "__TEXT,__StaticInit,regular,pure_instructions"
- : ".text.startup");
- // This code is equivalent to relocation application, and should run at the
- // earliest possible time (i.e. with the highest priority).
- appendToGlobalCtors(M, WeakInitializerFn, /* Priority */ 0);
- }
-
- IRBuilder<> IRB(WeakInitializerFn->getEntryBlock().getTerminator());
- GV->setConstant(false);
- IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlign());
- GV->setInitializer(Constant::getNullValue(GV->getValueType()));
-}
-
-void LowerTypeTestsModule::findGlobalVariableUsersOf(
- Constant *C, SmallSetVector<GlobalVariable *, 8> &Out) {
- for (auto *U : C->users()){
- if (auto *GV = dyn_cast<GlobalVariable>(U))
- Out.insert(GV);
- else if (auto *C2 = dyn_cast<Constant>(U))
- findGlobalVariableUsersOf(C2, Out);
- }
-}
-
-// Replace all uses of F with (F ? JT : 0).
-void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
- Function *F, Constant *JT, bool IsJumpTableCanonical) {
- // The target expression can not appear in a constant initializer on most
- // (all?) targets. Switch to a runtime initializer.
- SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
- findGlobalVariableUsersOf(F, GlobalVarUsers);
- for (auto GV : GlobalVarUsers)
- moveInitializerToModuleConstructor(GV);
-
- // Can not RAUW F with an expression that uses F. Replace with a temporary
- // placeholder first.
- Function *PlaceholderFn =
- Function::Create(cast<FunctionType>(F->getValueType()),
- GlobalValue::ExternalWeakLinkage,
- F->getAddressSpace(), "", &M);
- replaceCfiUses(F, PlaceholderFn, IsJumpTableCanonical);
-
- Constant *Target = ConstantExpr::getSelect(
- ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
- Constant::getNullValue(F->getType())),
- JT, Constant::getNullValue(F->getType()));
- PlaceholderFn->replaceAllUsesWith(Target);
- PlaceholderFn->eraseFromParent();
-}
-
-static bool isThumbFunction(Function *F, Triple::ArchType ModuleArch) {
- Attribute TFAttr = F->getFnAttribute("target-features");
+ } else if (JumpTableArch == Triple::thumb) {
+ AsmOS << "b.w $" << ArgIndex << "\n";
+ } else {
+ report_fatal_error("Unsupported architecture for jump tables");
+ }
+
+ ConstraintOS << (ArgIndex > 0 ? ",s" : "s");
+ AsmArgs.push_back(Dest);
+}
+
+Type *LowerTypeTestsModule::getJumpTableEntryType() {
+ return ArrayType::get(Int8Ty, getJumpTableEntrySize());
+}
+
+/// Given a disjoint set of type identifiers and functions, build the bit sets
+/// and lower the llvm.type.test calls, architecture dependently.
+void LowerTypeTestsModule::buildBitSetsFromFunctions(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm ||
+ Arch == Triple::thumb || Arch == Triple::aarch64)
+ buildBitSetsFromFunctionsNative(TypeIds, Functions);
+ else if (Arch == Triple::wasm32 || Arch == Triple::wasm64)
+ buildBitSetsFromFunctionsWASM(TypeIds, Functions);
+ else
+ report_fatal_error("Unsupported architecture for jump tables");
+}
+
+void LowerTypeTestsModule::moveInitializerToModuleConstructor(
+ GlobalVariable *GV) {
+ if (WeakInitializerFn == nullptr) {
+ WeakInitializerFn = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::InternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ "__cfi_global_var_init", &M);
+ BasicBlock *BB =
+ BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn);
+ ReturnInst::Create(M.getContext(), BB);
+ WeakInitializerFn->setSection(
+ ObjectFormat == Triple::MachO
+ ? "__TEXT,__StaticInit,regular,pure_instructions"
+ : ".text.startup");
+ // This code is equivalent to relocation application, and should run at the
+ // earliest possible time (i.e. with the highest priority).
+ appendToGlobalCtors(M, WeakInitializerFn, /* Priority */ 0);
+ }
+
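+  // Store the original initializer at startup; the static initializer itself
+  // is replaced with zero below, so the value is only produced at run time.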
+ IRBuilder<> IRB(WeakInitializerFn->getEntryBlock().getTerminator());
+ GV->setConstant(false);
+ IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlign());
+ GV->setInitializer(Constant::getNullValue(GV->getValueType()));
+}
+
+void LowerTypeTestsModule::findGlobalVariableUsersOf(
+ Constant *C, SmallSetVector<GlobalVariable *, 8> &Out) {
+  for (auto *U : C->users()) {
+ if (auto *GV = dyn_cast<GlobalVariable>(U))
+ Out.insert(GV);
+ else if (auto *C2 = dyn_cast<Constant>(U))
+ findGlobalVariableUsersOf(C2, Out);
+ }
+}
+
+// Replace all uses of F with (F ? JT : 0).
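+// F is an extern_weak declaration that may resolve to null at link time, so the
+// jump table address must only be substituted where F is actually defined.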
+void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
+ Function *F, Constant *JT, bool IsJumpTableCanonical) {
+ // The target expression can not appear in a constant initializer on most
+ // (all?) targets. Switch to a runtime initializer.
+ SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
+ findGlobalVariableUsersOf(F, GlobalVarUsers);
+ for (auto GV : GlobalVarUsers)
+ moveInitializerToModuleConstructor(GV);
+
+ // Can not RAUW F with an expression that uses F. Replace with a temporary
+ // placeholder first.
+ Function *PlaceholderFn =
+ Function::Create(cast<FunctionType>(F->getValueType()),
+ GlobalValue::ExternalWeakLinkage,
+ F->getAddressSpace(), "", &M);
+ replaceCfiUses(F, PlaceholderFn, IsJumpTableCanonical);
+
+ Constant *Target = ConstantExpr::getSelect(
+ ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
+ Constant::getNullValue(F->getType())),
+ JT, Constant::getNullValue(F->getType()));
+ PlaceholderFn->replaceAllUsesWith(Target);
+ PlaceholderFn->eraseFromParent();
+}
+
+static bool isThumbFunction(Function *F, Triple::ArchType ModuleArch) {
+ Attribute TFAttr = F->getFnAttribute("target-features");
if (TFAttr.isValid()) {
- SmallVector<StringRef, 6> Features;
- TFAttr.getValueAsString().split(Features, ',');
- for (StringRef Feature : Features) {
- if (Feature == "-thumb-mode")
- return false;
- else if (Feature == "+thumb-mode")
- return true;
- }
- }
-
- return ModuleArch == Triple::thumb;
-}
-
-// Each jump table must be either ARM or Thumb as a whole for the bit-test math
-// to work. Pick one that matches the majority of members to minimize interop
-// veneers inserted by the linker.
-static Triple::ArchType
-selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions,
- Triple::ArchType ModuleArch) {
- if (ModuleArch != Triple::arm && ModuleArch != Triple::thumb)
- return ModuleArch;
-
- unsigned ArmCount = 0, ThumbCount = 0;
- for (const auto GTM : Functions) {
- if (!GTM->isJumpTableCanonical()) {
- // PLT stubs are always ARM.
- // FIXME: This is the wrong heuristic for non-canonical jump tables.
- ++ArmCount;
- continue;
- }
-
- Function *F = cast<Function>(GTM->getGlobal());
- ++(isThumbFunction(F, ModuleArch) ? ThumbCount : ArmCount);
- }
-
- return ArmCount > ThumbCount ? Triple::arm : Triple::thumb;
-}
-
-void LowerTypeTestsModule::createJumpTable(
- Function *F, ArrayRef<GlobalTypeMember *> Functions) {
- std::string AsmStr, ConstraintStr;
- raw_string_ostream AsmOS(AsmStr), ConstraintOS(ConstraintStr);
- SmallVector<Value *, 16> AsmArgs;
- AsmArgs.reserve(Functions.size() * 2);
-
- Triple::ArchType JumpTableArch = selectJumpTableArmEncoding(Functions, Arch);
-
- for (unsigned I = 0; I != Functions.size(); ++I)
- createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs,
- cast<Function>(Functions[I]->getGlobal()));
-
- // Align the whole table by entry size.
- F->setAlignment(Align(getJumpTableEntrySize()));
- // Skip prologue.
- // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3.
- // Luckily, this function does not get any prologue even without the
- // attribute.
- if (OS != Triple::Win32)
- F->addFnAttr(Attribute::Naked);
- if (JumpTableArch == Triple::arm)
- F->addFnAttr("target-features", "-thumb-mode");
- if (JumpTableArch == Triple::thumb) {
- F->addFnAttr("target-features", "+thumb-mode");
- // Thumb jump table assembly needs Thumb2. The following attribute is added
- // by Clang for -march=armv7.
- F->addFnAttr("target-cpu", "cortex-a8");
- }
+ SmallVector<StringRef, 6> Features;
+ TFAttr.getValueAsString().split(Features, ',');
+ for (StringRef Feature : Features) {
+ if (Feature == "-thumb-mode")
+ return false;
+ else if (Feature == "+thumb-mode")
+ return true;
+ }
+ }
+
+ return ModuleArch == Triple::thumb;
+}
+
+// Each jump table must be either ARM or Thumb as a whole for the bit-test math
+// to work. Pick one that matches the majority of members to minimize interop
+// veneers inserted by the linker.
+static Triple::ArchType
+selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions,
+ Triple::ArchType ModuleArch) {
+ if (ModuleArch != Triple::arm && ModuleArch != Triple::thumb)
+ return ModuleArch;
+
+ unsigned ArmCount = 0, ThumbCount = 0;
+ for (const auto GTM : Functions) {
+ if (!GTM->isJumpTableCanonical()) {
+ // PLT stubs are always ARM.
+ // FIXME: This is the wrong heuristic for non-canonical jump tables.
+ ++ArmCount;
+ continue;
+ }
+
+ Function *F = cast<Function>(GTM->getGlobal());
+ ++(isThumbFunction(F, ModuleArch) ? ThumbCount : ArmCount);
+ }
+
+ return ArmCount > ThumbCount ? Triple::arm : Triple::thumb;
+}
+
+void LowerTypeTestsModule::createJumpTable(
+ Function *F, ArrayRef<GlobalTypeMember *> Functions) {
+ std::string AsmStr, ConstraintStr;
+ raw_string_ostream AsmOS(AsmStr), ConstraintOS(ConstraintStr);
+ SmallVector<Value *, 16> AsmArgs;
+ AsmArgs.reserve(Functions.size() * 2);
+
+ Triple::ArchType JumpTableArch = selectJumpTableArmEncoding(Functions, Arch);
+
+ for (unsigned I = 0; I != Functions.size(); ++I)
+ createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs,
+ cast<Function>(Functions[I]->getGlobal()));
+
+ // Align the whole table by entry size.
+ F->setAlignment(Align(getJumpTableEntrySize()));
+ // Skip prologue.
+ // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3.
+ // Luckily, this function does not get any prologue even without the
+ // attribute.
+ if (OS != Triple::Win32)
+ F->addFnAttr(Attribute::Naked);
+ if (JumpTableArch == Triple::arm)
+ F->addFnAttr("target-features", "-thumb-mode");
+ if (JumpTableArch == Triple::thumb) {
+ F->addFnAttr("target-features", "+thumb-mode");
+ // Thumb jump table assembly needs Thumb2. The following attribute is added
+ // by Clang for -march=armv7.
+ F->addFnAttr("target-cpu", "cortex-a8");
+ }
if (JumpTableArch == Triple::aarch64) {
F->addFnAttr("branch-target-enforcement", "false");
F->addFnAttr("sign-return-address", "none");
}
- // Make sure we don't emit .eh_frame for this function.
- F->addFnAttr(Attribute::NoUnwind);
-
- BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
- IRBuilder<> IRB(BB);
-
- SmallVector<Type *, 16> ArgTypes;
- ArgTypes.reserve(AsmArgs.size());
- for (const auto &Arg : AsmArgs)
- ArgTypes.push_back(Arg->getType());
- InlineAsm *JumpTableAsm =
- InlineAsm::get(FunctionType::get(IRB.getVoidTy(), ArgTypes, false),
- AsmOS.str(), ConstraintOS.str(),
- /*hasSideEffects=*/true);
-
- IRB.CreateCall(JumpTableAsm, AsmArgs);
- IRB.CreateUnreachable();
-}
-
-/// Given a disjoint set of type identifiers and functions, build a jump table
-/// for the functions, build the bit sets and lower the llvm.type.test calls.
-void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
- // Unlike the global bitset builder, the function bitset builder cannot
- // re-arrange functions in a particular order and base its calculations on the
- // layout of the functions' entry points, as we have no idea how large a
- // particular function will end up being (the size could even depend on what
- // this pass does!) Instead, we build a jump table, which is a block of code
- // consisting of one branch instruction for each of the functions in the bit
- // set that branches to the target function, and redirect any taken function
- // addresses to the corresponding jump table entry. In the object file's
- // symbol table, the symbols for the target functions also refer to the jump
- // table entries, so that addresses taken outside the module will pass any
- // verification done inside the module.
- //
- // In more concrete terms, suppose we have three functions f, g, h which are
- // of the same type, and a function foo that returns their addresses:
- //
- // f:
- // mov 0, %eax
- // ret
- //
- // g:
- // mov 1, %eax
- // ret
- //
- // h:
- // mov 2, %eax
- // ret
- //
- // foo:
- // mov f, %eax
- // mov g, %edx
- // mov h, %ecx
- // ret
- //
- // We output the jump table as module-level inline asm string. The end result
- // will (conceptually) look like this:
- //
- // f = .cfi.jumptable
- // g = .cfi.jumptable + 4
- // h = .cfi.jumptable + 8
- // .cfi.jumptable:
- // jmp f.cfi ; 5 bytes
- // int3 ; 1 byte
- // int3 ; 1 byte
- // int3 ; 1 byte
- // jmp g.cfi ; 5 bytes
- // int3 ; 1 byte
- // int3 ; 1 byte
- // int3 ; 1 byte
- // jmp h.cfi ; 5 bytes
- // int3 ; 1 byte
- // int3 ; 1 byte
- // int3 ; 1 byte
- //
- // f.cfi:
- // mov 0, %eax
- // ret
- //
- // g.cfi:
- // mov 1, %eax
- // ret
- //
- // h.cfi:
- // mov 2, %eax
- // ret
- //
- // foo:
- // mov f, %eax
- // mov g, %edx
- // mov h, %ecx
- // ret
- //
- // Because the addresses of f, g, h are evenly spaced at a power of 2, in the
- // normal case the check can be carried out using the same kind of simple
- // arithmetic that we normally use for globals.
-
- // FIXME: find a better way to represent the jumptable in the IR.
- assert(!Functions.empty());
-
- // Build a simple layout based on the regular layout of jump tables.
- DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
- unsigned EntrySize = getJumpTableEntrySize();
- for (unsigned I = 0; I != Functions.size(); ++I)
- GlobalLayout[Functions[I]] = I * EntrySize;
-
- Function *JumpTableFn =
- Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()),
- /* IsVarArg */ false),
- GlobalValue::PrivateLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- ".cfi.jumptable", &M);
- ArrayType *JumpTableType =
- ArrayType::get(getJumpTableEntryType(), Functions.size());
- auto JumpTable =
- ConstantExpr::getPointerCast(JumpTableFn, JumpTableType->getPointerTo(0));
-
- lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout);
-
- {
- ScopedSaveAliaseesAndUsed S(M);
-
- // Build aliases pointing to offsets into the jump table, and replace
- // references to the original functions with references to the aliases.
- for (unsigned I = 0; I != Functions.size(); ++I) {
- Function *F = cast<Function>(Functions[I]->getGlobal());
- bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical();
-
- Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
- ConstantExpr::getInBoundsGetElementPtr(
- JumpTableType, JumpTable,
- ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
- ConstantInt::get(IntPtrTy, I)}),
- F->getType());
- if (Functions[I]->isExported()) {
- if (IsJumpTableCanonical) {
- ExportSummary->cfiFunctionDefs().insert(std::string(F->getName()));
- } else {
- GlobalAlias *JtAlias = GlobalAlias::create(
- F->getValueType(), 0, GlobalValue::ExternalLinkage,
- F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
- JtAlias->setVisibility(GlobalValue::HiddenVisibility);
- ExportSummary->cfiFunctionDecls().insert(std::string(F->getName()));
- }
- }
- if (!IsJumpTableCanonical) {
- if (F->hasExternalWeakLinkage())
- replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr,
- IsJumpTableCanonical);
- else
- replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical);
- } else {
- assert(F->getType()->getAddressSpace() == 0);
-
- GlobalAlias *FAlias =
- GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "",
- CombinedGlobalElemPtr, &M);
- FAlias->setVisibility(F->getVisibility());
- FAlias->takeName(F);
- if (FAlias->hasName())
- F->setName(FAlias->getName() + ".cfi");
- replaceCfiUses(F, FAlias, IsJumpTableCanonical);
- if (!F->hasLocalLinkage())
- F->setVisibility(GlobalVariable::HiddenVisibility);
- }
- }
- }
-
- createJumpTable(JumpTableFn, Functions);
-}
-
-/// Assign a dummy layout using an incrementing counter, tag each function
-/// with its index represented as metadata, and lower each type test to an
-/// integer range comparison. During generation of the indirect function call
-/// table in the backend, it will assign the given indexes.
-/// Note: Dynamic linking is not supported, as the WebAssembly ABI has not yet
-/// been finalized.
-void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
- assert(!Functions.empty());
-
- // Build consecutive monotonic integer ranges for each call target set
- DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
-
- for (GlobalTypeMember *GTM : Functions) {
- Function *F = cast<Function>(GTM->getGlobal());
-
- // Skip functions that are not address taken, to avoid bloating the table
- if (!F->hasAddressTaken())
- continue;
-
- // Store metadata with the index for each function
- MDNode *MD = MDNode::get(F->getContext(),
- ArrayRef<Metadata *>(ConstantAsMetadata::get(
- ConstantInt::get(Int64Ty, IndirectIndex))));
- F->setMetadata("wasm.index", MD);
-
- // Assign the counter value
- GlobalLayout[GTM] = IndirectIndex++;
- }
-
- // The indirect function table index space starts at zero, so pass a NULL
- // pointer as the subtracted "jump table" offset.
- lowerTypeTestCalls(TypeIds, ConstantPointerNull::get(Int32PtrTy),
- GlobalLayout);
-}
-
-void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals,
- ArrayRef<ICallBranchFunnel *> ICallBranchFunnels) {
- DenseMap<Metadata *, uint64_t> TypeIdIndices;
- for (unsigned I = 0; I != TypeIds.size(); ++I)
- TypeIdIndices[TypeIds[I]] = I;
-
- // For each type identifier, build a set of indices that refer to members of
- // the type identifier.
- std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
- unsigned GlobalIndex = 0;
- DenseMap<GlobalTypeMember *, uint64_t> GlobalIndices;
- for (GlobalTypeMember *GTM : Globals) {
- for (MDNode *Type : GTM->types()) {
- // Type = { offset, type identifier }
- auto I = TypeIdIndices.find(Type->getOperand(1));
- if (I != TypeIdIndices.end())
- TypeMembers[I->second].insert(GlobalIndex);
- }
- GlobalIndices[GTM] = GlobalIndex;
- GlobalIndex++;
- }
-
- for (ICallBranchFunnel *JT : ICallBranchFunnels) {
- TypeMembers.emplace_back();
- std::set<uint64_t> &TMSet = TypeMembers.back();
- for (GlobalTypeMember *T : JT->targets())
- TMSet.insert(GlobalIndices[T]);
- }
-
- // Order the sets of indices by size. The GlobalLayoutBuilder works best
- // when given small index sets first.
- llvm::stable_sort(TypeMembers, [](const std::set<uint64_t> &O1,
- const std::set<uint64_t> &O2) {
- return O1.size() < O2.size();
- });
-
- // Create a GlobalLayoutBuilder and provide it with index sets as layout
- // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as
- // close together as possible.
- GlobalLayoutBuilder GLB(Globals.size());
- for (auto &&MemSet : TypeMembers)
- GLB.addFragment(MemSet);
-
- // Build a vector of globals with the computed layout.
- bool IsGlobalSet =
- Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal());
- std::vector<GlobalTypeMember *> OrderedGTMs(Globals.size());
- auto OGTMI = OrderedGTMs.begin();
- for (auto &&F : GLB.Fragments) {
- for (auto &&Offset : F) {
- if (IsGlobalSet != isa<GlobalVariable>(Globals[Offset]->getGlobal()))
- report_fatal_error("Type identifier may not contain both global "
- "variables and functions");
- *OGTMI++ = Globals[Offset];
- }
- }
-
- // Build the bitsets from this disjoint set.
- if (IsGlobalSet)
- buildBitSetsFromGlobalVariables(TypeIds, OrderedGTMs);
- else
- buildBitSetsFromFunctions(TypeIds, OrderedGTMs);
-}
-
-/// Lower all type tests in this module.
-LowerTypeTestsModule::LowerTypeTestsModule(
- Module &M, ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
- : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary),
- DropTypeTests(DropTypeTests) {
- assert(!(ExportSummary && ImportSummary));
- Triple TargetTriple(M.getTargetTriple());
- Arch = TargetTriple.getArch();
- OS = TargetTriple.getOS();
- ObjectFormat = TargetTriple.getObjectFormat();
-}
-
-bool LowerTypeTestsModule::runForTesting(Module &M) {
- ModuleSummaryIndex Summary(/*HaveGVs=*/false);
-
- // Handle the command-line summary arguments. This code is for testing
- // purposes only, so we handle errors directly.
- if (!ClReadSummary.empty()) {
- ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
- ": ");
- auto ReadSummaryFile =
- ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
-
- yaml::Input In(ReadSummaryFile->getBuffer());
- In >> Summary;
- ExitOnErr(errorCodeToError(In.error()));
- }
-
- bool Changed =
- LowerTypeTestsModule(
- M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
- ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr,
- /*DropTypeTests*/ false)
- .lower();
-
- if (!ClWriteSummary.empty()) {
- ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
- ": ");
- std::error_code EC;
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
- ExitOnErr(errorCodeToError(EC));
-
- yaml::Output Out(OS);
- Out << Summary;
- }
-
- return Changed;
-}
-
-static bool isDirectCall(Use& U) {
- auto *Usr = dyn_cast<CallInst>(U.getUser());
- if (Usr) {
- auto *CB = dyn_cast<CallBase>(Usr);
- if (CB && CB->isCallee(&U))
- return true;
- }
- return false;
-}
-
-void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
- bool IsJumpTableCanonical) {
- SmallSetVector<Constant *, 4> Constants;
- auto UI = Old->use_begin(), E = Old->use_end();
- for (; UI != E;) {
- Use &U = *UI;
- ++UI;
-
- // Skip block addresses
- if (isa<BlockAddress>(U.getUser()))
- continue;
-
- // Skip direct calls to externally defined or non-dso_local functions
- if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical))
- continue;
-
- // Must handle Constants specially, we cannot call replaceUsesOfWith on a
- // constant because they are uniqued.
- if (auto *C = dyn_cast<Constant>(U.getUser())) {
- if (!isa<GlobalValue>(C)) {
- // Save unique users to avoid processing operand replacement
- // more than once.
- Constants.insert(C);
- continue;
- }
- }
-
- U.set(New);
- }
-
- // Process operand replacement of saved constants.
- for (auto *C : Constants)
- C->handleOperandChange(Old, New);
-}
-
-void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
- Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); });
-}
-
-bool LowerTypeTestsModule::lower() {
- Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
-
- if (DropTypeTests && TypeTestFunc) {
- for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
- UI != UE;) {
- auto *CI = cast<CallInst>((*UI++).getUser());
- // Find and erase llvm.assume intrinsics for this llvm.type.test call.
- for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;) {
- if (auto *AssumeCI = dyn_cast<CallInst>((*CIU++).getUser())) {
- Function *F = AssumeCI->getCalledFunction();
- if (F && F->getIntrinsicID() == Intrinsic::assume)
- AssumeCI->eraseFromParent();
- }
- }
- CI->eraseFromParent();
- }
-
- // We have deleted the type intrinsics, so we no longer have enough
- // information to reason about the liveness of virtual function pointers
- // in GlobalDCE.
- for (GlobalVariable &GV : M.globals())
- GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
-
- return true;
- }
-
- // If only some of the modules were split, we cannot correctly perform
- // this transformation. We already checked for the presense of type tests
- // with partially split modules during the thin link, and would have emitted
- // an error if any were found, so here we can simply return.
- if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
- (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
- return false;
-
- Function *ICallBranchFunnelFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
- if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
- (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
- !ExportSummary && !ImportSummary)
- return false;
-
- if (ImportSummary) {
- if (TypeTestFunc) {
- for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
- UI != UE;) {
- auto *CI = cast<CallInst>((*UI++).getUser());
- importTypeTest(CI);
- }
- }
-
- if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty())
- report_fatal_error(
- "unexpected call to llvm.icall.branch.funnel during import phase");
-
- SmallVector<Function *, 8> Defs;
- SmallVector<Function *, 8> Decls;
- for (auto &F : M) {
- // CFI functions are either external, or promoted. A local function may
- // have the same name, but it's not the one we are looking for.
- if (F.hasLocalLinkage())
- continue;
- if (ImportSummary->cfiFunctionDefs().count(std::string(F.getName())))
- Defs.push_back(&F);
- else if (ImportSummary->cfiFunctionDecls().count(
- std::string(F.getName())))
- Decls.push_back(&F);
- }
-
- std::vector<GlobalAlias *> AliasesToErase;
- {
- ScopedSaveAliaseesAndUsed S(M);
- for (auto F : Defs)
- importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase);
- for (auto F : Decls)
- importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase);
- }
- for (GlobalAlias *GA : AliasesToErase)
- GA->eraseFromParent();
-
- return true;
- }
-
- // Equivalence class set containing type identifiers and the globals that
- // reference them. This is used to partition the set of type identifiers in
- // the module into disjoint sets.
- using GlobalClassesTy = EquivalenceClasses<
- PointerUnion<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>;
- GlobalClassesTy GlobalClasses;
-
- // Verify the type metadata and build a few data structures to let us
- // efficiently enumerate the type identifiers associated with a global:
- // a list of GlobalTypeMembers (a GlobalObject stored alongside a vector
- // of associated type metadata) and a mapping from type identifiers to their
- // list of GlobalTypeMembers and last observed index in the list of globals.
- // The indices will be used later to deterministically order the list of type
- // identifiers.
- BumpPtrAllocator Alloc;
- struct TIInfo {
- unsigned UniqueId;
- std::vector<GlobalTypeMember *> RefGlobals;
- };
- DenseMap<Metadata *, TIInfo> TypeIdInfo;
- unsigned CurUniqueId = 0;
- SmallVector<MDNode *, 2> Types;
-
- // Cross-DSO CFI emits jumptable entries for exported functions as well as
- // address taken functions in case they are address taken in other modules.
- const bool CrossDsoCfi = M.getModuleFlag("Cross-DSO CFI") != nullptr;
-
- struct ExportedFunctionInfo {
- CfiFunctionLinkage Linkage;
- MDNode *FuncMD; // {name, linkage, type[, type...]}
- };
- DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
- if (ExportSummary) {
- // A set of all functions that are address taken by a live global object.
- DenseSet<GlobalValue::GUID> AddressTaken;
- for (auto &I : *ExportSummary)
- for (auto &GVS : I.second.SummaryList)
- if (GVS->isLive())
- for (auto &Ref : GVS->refs())
- AddressTaken.insert(Ref.getGUID());
-
- NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
- if (CfiFunctionsMD) {
- for (auto FuncMD : CfiFunctionsMD->operands()) {
- assert(FuncMD->getNumOperands() >= 2);
- StringRef FunctionName =
- cast<MDString>(FuncMD->getOperand(0))->getString();
- CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
- cast<ConstantAsMetadata>(FuncMD->getOperand(1))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
- const GlobalValue::GUID GUID = GlobalValue::getGUID(
- GlobalValue::dropLLVMManglingEscape(FunctionName));
- // Do not emit jumptable entries for functions that are not-live and
- // have no live references (and are not exported with cross-DSO CFI.)
- if (!ExportSummary->isGUIDLive(GUID))
- continue;
- if (!AddressTaken.count(GUID)) {
- if (!CrossDsoCfi || Linkage != CFL_Definition)
- continue;
-
- bool Exported = false;
- if (auto VI = ExportSummary->getValueInfo(GUID))
- for (auto &GVS : VI.getSummaryList())
- if (GVS->isLive() && !GlobalValue::isLocalLinkage(GVS->linkage()))
- Exported = true;
-
- if (!Exported)
- continue;
- }
- auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
- if (!P.second && P.first->second.Linkage != CFL_Definition)
- P.first->second = {Linkage, FuncMD};
- }
-
- for (const auto &P : ExportedFunctions) {
- StringRef FunctionName = P.first;
- CfiFunctionLinkage Linkage = P.second.Linkage;
- MDNode *FuncMD = P.second.FuncMD;
- Function *F = M.getFunction(FunctionName);
- if (F && F->hasLocalLinkage()) {
- // Locally defined function that happens to have the same name as a
- // function defined in a ThinLTO module. Rename it to move it out of
- // the way of the external reference that we're about to create.
- // Note that setName will find a unique name for the function, so even
- // if there is an existing function with the suffix there won't be a
- // name collision.
- F->setName(F->getName() + ".1");
- F = nullptr;
- }
-
- if (!F)
- F = Function::Create(
- FunctionType::get(Type::getVoidTy(M.getContext()), false),
- GlobalVariable::ExternalLinkage,
- M.getDataLayout().getProgramAddressSpace(), FunctionName, &M);
-
- // If the function is available_externally, remove its definition so
- // that it is handled the same way as a declaration. Later we will try
- // to create an alias using this function's linkage, which will fail if
- // the linkage is available_externally. This will also result in us
- // following the code path below to replace the type metadata.
- if (F->hasAvailableExternallyLinkage()) {
- F->setLinkage(GlobalValue::ExternalLinkage);
- F->deleteBody();
- F->setComdat(nullptr);
- F->clearMetadata();
- }
-
- // Update the linkage for extern_weak declarations when a definition
- // exists.
- if (Linkage == CFL_Definition && F->hasExternalWeakLinkage())
- F->setLinkage(GlobalValue::ExternalLinkage);
-
- // If the function in the full LTO module is a declaration, replace its
- // type metadata with the type metadata we found in cfi.functions. That
- // metadata is presumed to be more accurate than the metadata attached
- // to the declaration.
- if (F->isDeclaration()) {
- if (Linkage == CFL_WeakDeclaration)
- F->setLinkage(GlobalValue::ExternalWeakLinkage);
-
- F->eraseMetadata(LLVMContext::MD_type);
- for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
- F->addMetadata(LLVMContext::MD_type,
- *cast<MDNode>(FuncMD->getOperand(I).get()));
- }
- }
- }
- }
-
- DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers;
- for (GlobalObject &GO : M.global_objects()) {
- if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
- continue;
-
- Types.clear();
- GO.getMetadata(LLVMContext::MD_type, Types);
-
- bool IsJumpTableCanonical = false;
- bool IsExported = false;
- if (Function *F = dyn_cast<Function>(&GO)) {
- IsJumpTableCanonical = isJumpTableCanonical(F);
- if (ExportedFunctions.count(F->getName())) {
- IsJumpTableCanonical |=
- ExportedFunctions[F->getName()].Linkage == CFL_Definition;
- IsExported = true;
- // TODO: The logic here checks only that the function is address taken,
- // not that the address takers are live. This can be updated to check
- // their liveness and emit fewer jumptable entries once monolithic LTO
- // builds also emit summaries.
- } else if (!F->hasAddressTaken()) {
- if (!CrossDsoCfi || !IsJumpTableCanonical || F->hasLocalLinkage())
- continue;
- }
- }
-
- auto *GTM = GlobalTypeMember::create(Alloc, &GO, IsJumpTableCanonical,
- IsExported, Types);
- GlobalTypeMembers[&GO] = GTM;
- for (MDNode *Type : Types) {
- verifyTypeMDNode(&GO, Type);
- auto &Info = TypeIdInfo[Type->getOperand(1)];
- Info.UniqueId = ++CurUniqueId;
- Info.RefGlobals.push_back(GTM);
- }
- }
-
- auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & {
- // Add the call site to the list of call sites for this type identifier. We
- // also use TypeIdUsers to keep track of whether we have seen this type
- // identifier before. If we have, we don't need to re-add the referenced
- // globals to the equivalence class.
- auto Ins = TypeIdUsers.insert({TypeId, {}});
- if (Ins.second) {
- // Add the type identifier to the equivalence class.
- GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId);
- GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
-
- // Add the referenced globals to the type identifier's equivalence class.
- for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals)
- CurSet = GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
- }
-
- return Ins.first->second;
- };
-
- if (TypeTestFunc) {
- for (const Use &U : TypeTestFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
-
- auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
- if (!TypeIdMDVal)
- report_fatal_error("Second argument of llvm.type.test must be metadata");
- auto TypeId = TypeIdMDVal->getMetadata();
- AddTypeIdUse(TypeId).CallSites.push_back(CI);
- }
- }
-
- if (ICallBranchFunnelFunc) {
- for (const Use &U : ICallBranchFunnelFunc->uses()) {
- if (Arch != Triple::x86_64)
- report_fatal_error(
- "llvm.icall.branch.funnel not supported on this target");
-
- auto CI = cast<CallInst>(U.getUser());
-
- std::vector<GlobalTypeMember *> Targets;
- if (CI->getNumArgOperands() % 2 != 1)
- report_fatal_error("number of arguments should be odd");
-
- GlobalClassesTy::member_iterator CurSet;
- for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) {
- int64_t Offset;
- auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
- CI->getOperand(I), Offset, M.getDataLayout()));
- if (!Base)
- report_fatal_error(
- "Expected branch funnel operand to be global value");
-
- GlobalTypeMember *GTM = GlobalTypeMembers[Base];
- Targets.push_back(GTM);
- GlobalClassesTy::member_iterator NewSet =
- GlobalClasses.findLeader(GlobalClasses.insert(GTM));
- if (I == 1)
- CurSet = NewSet;
- else
- CurSet = GlobalClasses.unionSets(CurSet, NewSet);
- }
-
- GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(
- GlobalClasses.insert(ICallBranchFunnel::create(
- Alloc, CI, Targets, ++CurUniqueId))));
- }
- }
-
- if (ExportSummary) {
- DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
- for (auto &P : TypeIdInfo) {
- if (auto *TypeId = dyn_cast<MDString>(P.first))
- MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
- TypeId);
- }
-
- for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
- if (!ExportSummary->isGlobalValueLive(S.get()))
- continue;
- if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()))
- for (GlobalValue::GUID G : FS->type_tests())
- for (Metadata *MD : MetadataByGUID[G])
- AddTypeIdUse(MD).IsExported = true;
- }
- }
- }
-
- if (GlobalClasses.empty())
- return false;
-
- // Build a list of disjoint sets ordered by their maximum global index for
- // determinism.
- std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets;
- for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
- E = GlobalClasses.end();
- I != E; ++I) {
- if (!I->isLeader())
- continue;
- ++NumTypeIdDisjointSets;
-
- unsigned MaxUniqueId = 0;
- for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
- MI != GlobalClasses.member_end(); ++MI) {
- if (auto *MD = MI->dyn_cast<Metadata *>())
- MaxUniqueId = std::max(MaxUniqueId, TypeIdInfo[MD].UniqueId);
- else if (auto *BF = MI->dyn_cast<ICallBranchFunnel *>())
- MaxUniqueId = std::max(MaxUniqueId, BF->UniqueId);
- }
- Sets.emplace_back(I, MaxUniqueId);
- }
- llvm::sort(Sets,
- [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
- const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
- return S1.second < S2.second;
- });
-
- // For each disjoint set we found...
- for (const auto &S : Sets) {
- // Build the list of type identifiers in this disjoint set.
- std::vector<Metadata *> TypeIds;
- std::vector<GlobalTypeMember *> Globals;
- std::vector<ICallBranchFunnel *> ICallBranchFunnels;
- for (GlobalClassesTy::member_iterator MI =
- GlobalClasses.member_begin(S.first);
- MI != GlobalClasses.member_end(); ++MI) {
- if (MI->is<Metadata *>())
- TypeIds.push_back(MI->get<Metadata *>());
- else if (MI->is<GlobalTypeMember *>())
- Globals.push_back(MI->get<GlobalTypeMember *>());
- else
- ICallBranchFunnels.push_back(MI->get<ICallBranchFunnel *>());
- }
-
- // Order type identifiers by unique ID for determinism. This ordering is
- // stable as there is a one-to-one mapping between metadata and unique IDs.
- llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) {
- return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId;
- });
-
- // Same for the branch funnels.
- llvm::sort(ICallBranchFunnels,
- [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) {
- return F1->UniqueId < F2->UniqueId;
- });
-
- // Build bitsets for this disjoint set.
- buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels);
- }
-
- allocateByteArrays();
-
- // Parse alias data to replace stand-in function declarations for aliases
- // with an alias to the intended target.
- if (ExportSummary) {
- if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
- for (auto AliasMD : AliasesMD->operands()) {
- assert(AliasMD->getNumOperands() >= 4);
- StringRef AliasName =
- cast<MDString>(AliasMD->getOperand(0))->getString();
- StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString();
-
- if (!ExportedFunctions.count(Aliasee) ||
- ExportedFunctions[Aliasee].Linkage != CFL_Definition ||
- !M.getNamedAlias(Aliasee))
- continue;
-
- GlobalValue::VisibilityTypes Visibility =
- static_cast<GlobalValue::VisibilityTypes>(
- cast<ConstantAsMetadata>(AliasMD->getOperand(2))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
- bool Weak =
- static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
-
- auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee));
- Alias->setVisibility(Visibility);
- if (Weak)
- Alias->setLinkage(GlobalValue::WeakAnyLinkage);
-
- if (auto *F = M.getFunction(AliasName)) {
- Alias->takeName(F);
- F->replaceAllUsesWith(Alias);
- F->eraseFromParent();
- } else {
- Alias->setName(AliasName);
- }
- }
- }
- }
-
- // Emit .symver directives for exported functions, if they exist.
- if (ExportSummary) {
- if (NamedMDNode *SymversMD = M.getNamedMetadata("symvers")) {
- for (auto Symver : SymversMD->operands()) {
- assert(Symver->getNumOperands() >= 2);
- StringRef SymbolName =
- cast<MDString>(Symver->getOperand(0))->getString();
- StringRef Alias = cast<MDString>(Symver->getOperand(1))->getString();
-
- if (!ExportedFunctions.count(SymbolName))
- continue;
-
- M.appendModuleInlineAsm(
- (llvm::Twine(".symver ") + SymbolName + ", " + Alias).str());
- }
- }
- }
-
- return true;
-}
-
-PreservedAnalyses LowerTypeTestsPass::run(Module &M,
- ModuleAnalysisManager &AM) {
+ // Make sure we don't emit .eh_frame for this function.
+ F->addFnAttr(Attribute::NoUnwind);
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
+ IRBuilder<> IRB(BB);
+
+ SmallVector<Type *, 16> ArgTypes;
+ ArgTypes.reserve(AsmArgs.size());
+ for (const auto &Arg : AsmArgs)
+ ArgTypes.push_back(Arg->getType());
+ InlineAsm *JumpTableAsm =
+ InlineAsm::get(FunctionType::get(IRB.getVoidTy(), ArgTypes, false),
+ AsmOS.str(), ConstraintOS.str(),
+ /*hasSideEffects=*/true);
+
+ IRB.CreateCall(JumpTableAsm, AsmArgs);
+ IRB.CreateUnreachable();
+}
+
+/// Given a disjoint set of type identifiers and functions, build a jump table
+/// for the functions, build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ // Unlike the global bitset builder, the function bitset builder cannot
+ // re-arrange functions in a particular order and base its calculations on the
+ // layout of the functions' entry points, as we have no idea how large a
+ // particular function will end up being (the size could even depend on what
+ // this pass does!) Instead, we build a jump table, which is a block of code
+ // consisting of one branch instruction for each of the functions in the bit
+ // set that branches to the target function, and redirect any taken function
+ // addresses to the corresponding jump table entry. In the object file's
+ // symbol table, the symbols for the target functions also refer to the jump
+ // table entries, so that addresses taken outside the module will pass any
+ // verification done inside the module.
+ //
+ // In more concrete terms, suppose we have three functions f, g, h which are
+ // of the same type, and a function foo that returns their addresses:
+ //
+ // f:
+ // mov 0, %eax
+ // ret
+ //
+ // g:
+ // mov 1, %eax
+ // ret
+ //
+ // h:
+ // mov 2, %eax
+ // ret
+ //
+ // foo:
+ // mov f, %eax
+ // mov g, %edx
+ // mov h, %ecx
+ // ret
+ //
+  // We output the jump table as a module-level inline asm string. The end result
+ // will (conceptually) look like this:
+ //
+ // f = .cfi.jumptable
+ // g = .cfi.jumptable + 4
+ // h = .cfi.jumptable + 8
+ // .cfi.jumptable:
+ // jmp f.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // jmp g.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // jmp h.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ //
+ // f.cfi:
+ // mov 0, %eax
+ // ret
+ //
+ // g.cfi:
+ // mov 1, %eax
+ // ret
+ //
+ // h.cfi:
+ // mov 2, %eax
+ // ret
+ //
+ // foo:
+ // mov f, %eax
+ // mov g, %edx
+ // mov h, %ecx
+ // ret
+ //
+ // Because the addresses of f, g, h are evenly spaced at a power of 2, in the
+ // normal case the check can be carried out using the same kind of simple
+ // arithmetic that we normally use for globals.
+
+ // FIXME: find a better way to represent the jumptable in the IR.
+ assert(!Functions.empty());
+
+ // Build a simple layout based on the regular layout of jump tables.
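+  // Function I is placed at byte offset I * EntrySize within the jump table.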
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+ unsigned EntrySize = getJumpTableEntrySize();
+ for (unsigned I = 0; I != Functions.size(); ++I)
+ GlobalLayout[Functions[I]] = I * EntrySize;
+
+ Function *JumpTableFn =
+ Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::PrivateLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ ".cfi.jumptable", &M);
+ ArrayType *JumpTableType =
+ ArrayType::get(getJumpTableEntryType(), Functions.size());
+ auto JumpTable =
+ ConstantExpr::getPointerCast(JumpTableFn, JumpTableType->getPointerTo(0));
+
+ lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout);
+
+ {
+ ScopedSaveAliaseesAndUsed S(M);
+
+ // Build aliases pointing to offsets into the jump table, and replace
+ // references to the original functions with references to the aliases.
+ for (unsigned I = 0; I != Functions.size(); ++I) {
+ Function *F = cast<Function>(Functions[I]->getGlobal());
+ bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical();
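+      // Address of this function's slot in the jump table, cast back to the
+      // original function pointer type.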
+
+ Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
+ ConstantExpr::getInBoundsGetElementPtr(
+ JumpTableType, JumpTable,
+ ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
+ ConstantInt::get(IntPtrTy, I)}),
+ F->getType());
+ if (Functions[I]->isExported()) {
+ if (IsJumpTableCanonical) {
+ ExportSummary->cfiFunctionDefs().insert(std::string(F->getName()));
+ } else {
+ GlobalAlias *JtAlias = GlobalAlias::create(
+ F->getValueType(), 0, GlobalValue::ExternalLinkage,
+ F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+ JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+ ExportSummary->cfiFunctionDecls().insert(std::string(F->getName()));
+ }
+ }
+ if (!IsJumpTableCanonical) {
+ if (F->hasExternalWeakLinkage())
+ replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr,
+ IsJumpTableCanonical);
+ else
+ replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical);
+ } else {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ GlobalAlias *FAlias =
+ GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "",
+ CombinedGlobalElemPtr, &M);
+ FAlias->setVisibility(F->getVisibility());
+ FAlias->takeName(F);
+ if (FAlias->hasName())
+ F->setName(FAlias->getName() + ".cfi");
+ replaceCfiUses(F, FAlias, IsJumpTableCanonical);
+ if (!F->hasLocalLinkage())
+ F->setVisibility(GlobalVariable::HiddenVisibility);
+ }
+ }
+ }
+
+ createJumpTable(JumpTableFn, Functions);
+}
+
+/// Assign a dummy layout using an incrementing counter, tag each function
+/// with its index represented as metadata, and lower each type test to an
+/// integer range comparison. During generation of the indirect function call
+/// table in the backend, it will assign the given indexes.
+/// Note: Dynamic linking is not supported, as the WebAssembly ABI has not yet
+/// been finalized.
+void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ assert(!Functions.empty());
+
+ // Build consecutive monotonic integer ranges for each call target set
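+  // Because each type's members receive consecutive indices, every type test
+  // can be lowered to a simple integer range comparison (see function comment).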
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+
+ for (GlobalTypeMember *GTM : Functions) {
+ Function *F = cast<Function>(GTM->getGlobal());
+
+ // Skip functions that are not address taken, to avoid bloating the table
+ if (!F->hasAddressTaken())
+ continue;
+
+ // Store metadata with the index for each function
+ MDNode *MD = MDNode::get(F->getContext(),
+ ArrayRef<Metadata *>(ConstantAsMetadata::get(
+ ConstantInt::get(Int64Ty, IndirectIndex))));
+ F->setMetadata("wasm.index", MD);
+
+ // Assign the counter value
+ GlobalLayout[GTM] = IndirectIndex++;
+ }
+
+ // The indirect function table index space starts at zero, so pass a NULL
+ // pointer as the subtracted "jump table" offset.
+ lowerTypeTestCalls(TypeIds, ConstantPointerNull::get(Int32PtrTy),
+ GlobalLayout);
+}
+
+void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels) {
+ DenseMap<Metadata *, uint64_t> TypeIdIndices;
+ for (unsigned I = 0; I != TypeIds.size(); ++I)
+ TypeIdIndices[TypeIds[I]] = I;
+
+ // For each type identifier, build a set of indices that refer to members of
+ // the type identifier.
+ std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
+ unsigned GlobalIndex = 0;
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalIndices;
+ for (GlobalTypeMember *GTM : Globals) {
+ for (MDNode *Type : GTM->types()) {
+ // Type = { offset, type identifier }
+ auto I = TypeIdIndices.find(Type->getOperand(1));
+ if (I != TypeIdIndices.end())
+ TypeMembers[I->second].insert(GlobalIndex);
+ }
+ GlobalIndices[GTM] = GlobalIndex;
+ GlobalIndex++;
+ }
+
+ for (ICallBranchFunnel *JT : ICallBranchFunnels) {
+ TypeMembers.emplace_back();
+ std::set<uint64_t> &TMSet = TypeMembers.back();
+ for (GlobalTypeMember *T : JT->targets())
+ TMSet.insert(GlobalIndices[T]);
+ }
+
+ // Order the sets of indices by size. The GlobalLayoutBuilder works best
+ // when given small index sets first.
+ llvm::stable_sort(TypeMembers, [](const std::set<uint64_t> &O1,
+ const std::set<uint64_t> &O2) {
+ return O1.size() < O2.size();
+ });
+
+ // Create a GlobalLayoutBuilder and provide it with index sets as layout
+ // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as
+ // close together as possible.
+ GlobalLayoutBuilder GLB(Globals.size());
+ for (auto &&MemSet : TypeMembers)
+ GLB.addFragment(MemSet);
+
+ // Build a vector of globals with the computed layout.
+ bool IsGlobalSet =
+ Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal());
+ std::vector<GlobalTypeMember *> OrderedGTMs(Globals.size());
+ auto OGTMI = OrderedGTMs.begin();
+ for (auto &&F : GLB.Fragments) {
+ for (auto &&Offset : F) {
+ if (IsGlobalSet != isa<GlobalVariable>(Globals[Offset]->getGlobal()))
+ report_fatal_error("Type identifier may not contain both global "
+ "variables and functions");
+ *OGTMI++ = Globals[Offset];
+ }
+ }
+
+ // Build the bitsets from this disjoint set.
+ if (IsGlobalSet)
+ buildBitSetsFromGlobalVariables(TypeIds, OrderedGTMs);
+ else
+ buildBitSetsFromFunctions(TypeIds, OrderedGTMs);
+}
+
+/// Lower all type tests in this module.
+LowerTypeTestsModule::LowerTypeTestsModule(
+ Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
+ : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary),
+ DropTypeTests(DropTypeTests) {
+ assert(!(ExportSummary && ImportSummary));
+ Triple TargetTriple(M.getTargetTriple());
+ Arch = TargetTriple.getArch();
+ OS = TargetTriple.getOS();
+ ObjectFormat = TargetTriple.getObjectFormat();
+}
+
+bool LowerTypeTestsModule::runForTesting(Module &M) {
+ ModuleSummaryIndex Summary(/*HaveGVs=*/false);
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed =
+ LowerTypeTestsModule(
+ M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr,
+ /*DropTypeTests*/ false)
+ .lower();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
+ ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
+
+ return Changed;
+}
+
+static bool isDirectCall(Use& U) {
+ auto *Usr = dyn_cast<CallInst>(U.getUser());
+ if (Usr) {
+ auto *CB = dyn_cast<CallBase>(Usr);
+ if (CB && CB->isCallee(&U))
+ return true;
+ }
+ return false;
+}
+
+void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
+ bool IsJumpTableCanonical) {
+ SmallSetVector<Constant *, 4> Constants;
+ auto UI = Old->use_begin(), E = Old->use_end();
+ for (; UI != E;) {
+ Use &U = *UI;
+ ++UI;
+
+ // Skip block addresses
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+
+ // Skip direct calls to externally defined or non-dso_local functions
+ if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical))
+ continue;
+
+    // Constants must be handled specially; we cannot call replaceUsesOfWith on
+    // a constant because constants are uniqued.
+ if (auto *C = dyn_cast<Constant>(U.getUser())) {
+ if (!isa<GlobalValue>(C)) {
+ // Save unique users to avoid processing operand replacement
+ // more than once.
+ Constants.insert(C);
+ continue;
+ }
+ }
+
+ U.set(New);
+ }
+
+ // Process operand replacement of saved constants.
+ for (auto *C : Constants)
+ C->handleOperandChange(Old, New);
+}
+
+void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
+ Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); });
+}
+
+bool LowerTypeTestsModule::lower() {
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+
+ if (DropTypeTests && TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ // Find and erase llvm.assume intrinsics for this llvm.type.test call.
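+      // The assumes consume the test's i1 result, so they must be erased
+      // before the llvm.type.test call itself can be removed.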
+ for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;) {
+ if (auto *AssumeCI = dyn_cast<CallInst>((*CIU++).getUser())) {
+ Function *F = AssumeCI->getCalledFunction();
+ if (F && F->getIntrinsicID() == Intrinsic::assume)
+ AssumeCI->eraseFromParent();
+ }
+ }
+ CI->eraseFromParent();
+ }
+
+ // We have deleted the type intrinsics, so we no longer have enough
+ // information to reason about the liveness of virtual function pointers
+ // in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
+ return true;
+ }
+
+ // If only some of the modules were split, we cannot correctly perform
+  // this transformation. We already checked for the presence of type tests
+ // with partially split modules during the thin link, and would have emitted
+ // an error if any were found, so here we can simply return.
+ if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
+ (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
+ return false;
+
+ Function *ICallBranchFunnelFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
+ !ExportSummary && !ImportSummary)
+ return false;
+
+ if (ImportSummary) {
+ if (TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ importTypeTest(CI);
+ }
+ }
+
+ if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty())
+ report_fatal_error(
+ "unexpected call to llvm.icall.branch.funnel during import phase");
+
+ SmallVector<Function *, 8> Defs;
+ SmallVector<Function *, 8> Decls;
+ for (auto &F : M) {
+      // CFI functions are either external or promoted. A local function may
+      // have the same name, but it's not the one we are looking for.
+ if (F.hasLocalLinkage())
+ continue;
+ if (ImportSummary->cfiFunctionDefs().count(std::string(F.getName())))
+ Defs.push_back(&F);
+ else if (ImportSummary->cfiFunctionDecls().count(
+ std::string(F.getName())))
+ Decls.push_back(&F);
+ }
+
+ std::vector<GlobalAlias *> AliasesToErase;
+ {
+ ScopedSaveAliaseesAndUsed S(M);
+ for (auto F : Defs)
+ importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase);
+ for (auto F : Decls)
+ importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase);
+ }
+ for (GlobalAlias *GA : AliasesToErase)
+ GA->eraseFromParent();
+
+ return true;
+ }
+
+ // Equivalence class set containing type identifiers and the globals that
+ // reference them. This is used to partition the set of type identifiers in
+ // the module into disjoint sets.
+ using GlobalClassesTy = EquivalenceClasses<
+ PointerUnion<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>;
+ GlobalClassesTy GlobalClasses;
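For orientation, llvm::EquivalenceClasses is a union-find container: unionSets merges the classes of its two arguments, and iteration visits one leader per disjoint set. A small self-contained sketch, with integers standing in for the Metadata*/GlobalTypeMember* pointers used here:

#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  EquivalenceClasses<int> EC;
  EC.unionSets(1, 2); // {1, 2}
  EC.unionSets(3, 4); // {3, 4}
  EC.unionSets(2, 3); // {1, 2, 3, 4}
  EC.insert(5);       // {5} stays a singleton class
  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
    if (!I->isLeader())
      continue; // visit each disjoint set exactly once
    outs() << "class:";
    for (auto MI = EC.member_begin(I); MI != EC.member_end(); ++MI)
      outs() << ' ' << *MI;
    outs() << '\n';
  }
  return 0;
}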
+
+ // Verify the type metadata and build a few data structures to let us
+ // efficiently enumerate the type identifiers associated with a global:
+ // a list of GlobalTypeMembers (a GlobalObject stored alongside a vector
+ // of associated type metadata) and a mapping from type identifiers to their
+ // list of GlobalTypeMembers and last observed index in the list of globals.
+ // The indices will be used later to deterministically order the list of type
+ // identifiers.
+ BumpPtrAllocator Alloc;
+ struct TIInfo {
+ unsigned UniqueId;
+ std::vector<GlobalTypeMember *> RefGlobals;
+ };
+ DenseMap<Metadata *, TIInfo> TypeIdInfo;
+ unsigned CurUniqueId = 0;
+ SmallVector<MDNode *, 2> Types;
+
+ // Cross-DSO CFI emits jumptable entries for exported functions as well as
+ // address taken functions in case they are address taken in other modules.
+ const bool CrossDsoCfi = M.getModuleFlag("Cross-DSO CFI") != nullptr;
+
+ struct ExportedFunctionInfo {
+ CfiFunctionLinkage Linkage;
+ MDNode *FuncMD; // {name, linkage, type[, type...]}
+ };
+ DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
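As a hedged illustration of the {name, linkage, type[, type...]} layout parsed below: a sketch that builds one such operand by hand. The function name, the type-id string, and the use of 0 for CFL_Definition are assumptions made for the example, not values taken from this patch.

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Append one entry to the module-level "cfi.functions" named metadata,
// mirroring the layout the loop below reads back.
static void addCfiFunctionEntry(Module &M) {
  LLVMContext &Ctx = M.getContext();
  NamedMDNode *CfiFunctionsMD = M.getOrInsertNamedMetadata("cfi.functions");
  Metadata *TypeEntry = MDNode::get(
      Ctx, {ConstantAsMetadata::get(ConstantInt::get(Type::getInt64Ty(Ctx), 0)),
            MDString::get(Ctx, "_ZTSFvvE")}); // hypothetical type id
  Metadata *Ops[] = {
      MDString::get(Ctx, "my_exported_fn"), // name (hypothetical)
      ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt8Ty(Ctx), 0)), // 0 == CFL_Definition
      TypeEntry};
  CfiFunctionsMD->addOperand(MDNode::get(Ctx, Ops));
}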
+ if (ExportSummary) {
+ // A set of all functions that are address taken by a live global object.
+ DenseSet<GlobalValue::GUID> AddressTaken;
+ for (auto &I : *ExportSummary)
+ for (auto &GVS : I.second.SummaryList)
+ if (GVS->isLive())
+ for (auto &Ref : GVS->refs())
+ AddressTaken.insert(Ref.getGUID());
+
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto FuncMD : CfiFunctionsMD->operands()) {
+ assert(FuncMD->getNumOperands() >= 2);
+ StringRef FunctionName =
+ cast<MDString>(FuncMD->getOperand(0))->getString();
+ CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
+ cast<ConstantAsMetadata>(FuncMD->getOperand(1))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ const GlobalValue::GUID GUID = GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(FunctionName));
+ // Do not emit jumptable entries for functions that are not-live and
+ // have no live references (and are not exported with cross-DSO CFI.)
+ if (!ExportSummary->isGUIDLive(GUID))
+ continue;
+ if (!AddressTaken.count(GUID)) {
+ if (!CrossDsoCfi || Linkage != CFL_Definition)
+ continue;
+
+ bool Exported = false;
+ if (auto VI = ExportSummary->getValueInfo(GUID))
+ for (auto &GVS : VI.getSummaryList())
+ if (GVS->isLive() && !GlobalValue::isLocalLinkage(GVS->linkage()))
+ Exported = true;
+
+ if (!Exported)
+ continue;
+ }
+ auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
+ if (!P.second && P.first->second.Linkage != CFL_Definition)
+ P.first->second = {Linkage, FuncMD};
+ }
+
+ for (const auto &P : ExportedFunctions) {
+ StringRef FunctionName = P.first;
+ CfiFunctionLinkage Linkage = P.second.Linkage;
+ MDNode *FuncMD = P.second.FuncMD;
+ Function *F = M.getFunction(FunctionName);
+ if (F && F->hasLocalLinkage()) {
+ // Locally defined function that happens to have the same name as a
+ // function defined in a ThinLTO module. Rename it to move it out of
+ // the way of the external reference that we're about to create.
+ // Note that setName will find a unique name for the function, so even
+ // if there is an existing function with the suffix there won't be a
+ // name collision.
+ F->setName(F->getName() + ".1");
+ F = nullptr;
+ }
+
+ if (!F)
+ F = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalVariable::ExternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(), FunctionName, &M);
+
+ // If the function is available_externally, remove its definition so
+ // that it is handled the same way as a declaration. Later we will try
+ // to create an alias using this function's linkage, which will fail if
+ // the linkage is available_externally. This will also result in us
+ // following the code path below to replace the type metadata.
+ if (F->hasAvailableExternallyLinkage()) {
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ F->deleteBody();
+ F->setComdat(nullptr);
+ F->clearMetadata();
+ }
+
+ // Update the linkage for extern_weak declarations when a definition
+ // exists.
+ if (Linkage == CFL_Definition && F->hasExternalWeakLinkage())
+ F->setLinkage(GlobalValue::ExternalLinkage);
+
+ // If the function in the full LTO module is a declaration, replace its
+ // type metadata with the type metadata we found in cfi.functions. That
+ // metadata is presumed to be more accurate than the metadata attached
+ // to the declaration.
+ if (F->isDeclaration()) {
+ if (Linkage == CFL_WeakDeclaration)
+ F->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+ F->eraseMetadata(LLVMContext::MD_type);
+ for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
+ F->addMetadata(LLVMContext::MD_type,
+ *cast<MDNode>(FuncMD->getOperand(I).get()));
+ }
+ }
+ }
+ }
+
+ DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers;
+ for (GlobalObject &GO : M.global_objects()) {
+ if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
+ continue;
+
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+
+ bool IsJumpTableCanonical = false;
+ bool IsExported = false;
+ if (Function *F = dyn_cast<Function>(&GO)) {
+ IsJumpTableCanonical = isJumpTableCanonical(F);
+ if (ExportedFunctions.count(F->getName())) {
+ IsJumpTableCanonical |=
+ ExportedFunctions[F->getName()].Linkage == CFL_Definition;
+ IsExported = true;
+ // TODO: The logic here checks only that the function is address taken,
+ // not that the address takers are live. This can be updated to check
+ // their liveness and emit fewer jumptable entries once monolithic LTO
+ // builds also emit summaries.
+ } else if (!F->hasAddressTaken()) {
+ if (!CrossDsoCfi || !IsJumpTableCanonical || F->hasLocalLinkage())
+ continue;
+ }
+ }
+
+ auto *GTM = GlobalTypeMember::create(Alloc, &GO, IsJumpTableCanonical,
+ IsExported, Types);
+ GlobalTypeMembers[&GO] = GTM;
+ for (MDNode *Type : Types) {
+ verifyTypeMDNode(&GO, Type);
+ auto &Info = TypeIdInfo[Type->getOperand(1)];
+ Info.UniqueId = ++CurUniqueId;
+ Info.RefGlobals.push_back(GTM);
+ }
+ }
+
+ auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & {
+ // Add the call site to the list of call sites for this type identifier. We
+ // also use TypeIdUsers to keep track of whether we have seen this type
+ // identifier before. If we have, we don't need to re-add the referenced
+ // globals to the equivalence class.
+ auto Ins = TypeIdUsers.insert({TypeId, {}});
+ if (Ins.second) {
+ // Add the type identifier to the equivalence class.
+ GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId);
+ GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+
+ // Add the referenced globals to the type identifier's equivalence class.
+ for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals)
+ CurSet = GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
+ }
+
+ return Ins.first->second;
+ };
+
+ if (TypeTestFunc) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+ auto TypeId = TypeIdMDVal->getMetadata();
+ AddTypeIdUse(TypeId).CallSites.push_back(CI);
+ }
+ }
+
+ if (ICallBranchFunnelFunc) {
+ for (const Use &U : ICallBranchFunnelFunc->uses()) {
+ if (Arch != Triple::x86_64)
+ report_fatal_error(
+ "llvm.icall.branch.funnel not supported on this target");
+
+ auto CI = cast<CallInst>(U.getUser());
+
+ std::vector<GlobalTypeMember *> Targets;
+ if (CI->getNumArgOperands() % 2 != 1)
+ report_fatal_error("number of arguments should be odd");
+
+ GlobalClassesTy::member_iterator CurSet;
+ for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) {
+ int64_t Offset;
+ auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ CI->getOperand(I), Offset, M.getDataLayout()));
+ if (!Base)
+ report_fatal_error(
+ "Expected branch funnel operand to be global value");
+
+ GlobalTypeMember *GTM = GlobalTypeMembers[Base];
+ Targets.push_back(GTM);
+ GlobalClassesTy::member_iterator NewSet =
+ GlobalClasses.findLeader(GlobalClasses.insert(GTM));
+ if (I == 1)
+ CurSet = NewSet;
+ else
+ CurSet = GlobalClasses.unionSets(CurSet, NewSet);
+ }
+
+ GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(
+ GlobalClasses.insert(ICallBranchFunnel::create(
+ Alloc, CI, Targets, ++CurUniqueId))));
+ }
+ }
+
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdInfo) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ if (!ExportSummary->isGlobalValueLive(S.get()))
+ continue;
+ if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()))
+ for (GlobalValue::GUID G : FS->type_tests())
+ for (Metadata *MD : MetadataByGUID[G])
+ AddTypeIdUse(MD).IsExported = true;
+ }
+ }
+ }
+
+ if (GlobalClasses.empty())
+ return false;
+
+ // Build a list of disjoint sets ordered by their maximum global index for
+ // determinism.
+ std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets;
+ for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
+ E = GlobalClasses.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+ ++NumTypeIdDisjointSets;
+
+ unsigned MaxUniqueId = 0;
+ for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
+ MI != GlobalClasses.member_end(); ++MI) {
+ if (auto *MD = MI->dyn_cast<Metadata *>())
+ MaxUniqueId = std::max(MaxUniqueId, TypeIdInfo[MD].UniqueId);
+ else if (auto *BF = MI->dyn_cast<ICallBranchFunnel *>())
+ MaxUniqueId = std::max(MaxUniqueId, BF->UniqueId);
+ }
+ Sets.emplace_back(I, MaxUniqueId);
+ }
+ llvm::sort(Sets,
+ [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
+ const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
+ return S1.second < S2.second;
+ });
+
+ // For each disjoint set we found...
+ for (const auto &S : Sets) {
+ // Build the list of type identifiers in this disjoint set.
+ std::vector<Metadata *> TypeIds;
+ std::vector<GlobalTypeMember *> Globals;
+ std::vector<ICallBranchFunnel *> ICallBranchFunnels;
+ for (GlobalClassesTy::member_iterator MI =
+ GlobalClasses.member_begin(S.first);
+ MI != GlobalClasses.member_end(); ++MI) {
+ if (MI->is<Metadata *>())
+ TypeIds.push_back(MI->get<Metadata *>());
+ else if (MI->is<GlobalTypeMember *>())
+ Globals.push_back(MI->get<GlobalTypeMember *>());
+ else
+ ICallBranchFunnels.push_back(MI->get<ICallBranchFunnel *>());
+ }
+
+ // Order type identifiers by unique ID for determinism. This ordering is
+ // stable as there is a one-to-one mapping between metadata and unique IDs.
+ llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) {
+ return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId;
+ });
+
+ // Same for the branch funnels.
+ llvm::sort(ICallBranchFunnels,
+ [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) {
+ return F1->UniqueId < F2->UniqueId;
+ });
+
+ // Build bitsets for this disjoint set.
+ buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels);
+ }
+
+ allocateByteArrays();
+
+ // Parse alias data to replace stand-in function declarations for aliases
+ // with an alias to the intended target.
+ if (ExportSummary) {
+ if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
+ for (auto AliasMD : AliasesMD->operands()) {
+ assert(AliasMD->getNumOperands() >= 4);
+ StringRef AliasName =
+ cast<MDString>(AliasMD->getOperand(0))->getString();
+ StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(Aliasee) ||
+ ExportedFunctions[Aliasee].Linkage != CFL_Definition ||
+ !M.getNamedAlias(Aliasee))
+ continue;
+
+ GlobalValue::VisibilityTypes Visibility =
+ static_cast<GlobalValue::VisibilityTypes>(
+ cast<ConstantAsMetadata>(AliasMD->getOperand(2))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ bool Weak =
+ static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+
+ auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee));
+ Alias->setVisibility(Visibility);
+ if (Weak)
+ Alias->setLinkage(GlobalValue::WeakAnyLinkage);
+
+ if (auto *F = M.getFunction(AliasName)) {
+ Alias->takeName(F);
+ F->replaceAllUsesWith(Alias);
+ F->eraseFromParent();
+ } else {
+ Alias->setName(AliasName);
+ }
+ }
+ }
+ }
+
+ // Emit .symver directives for exported functions, if they exist.
+ if (ExportSummary) {
+ if (NamedMDNode *SymversMD = M.getNamedMetadata("symvers")) {
+ for (auto Symver : SymversMD->operands()) {
+ assert(Symver->getNumOperands() >= 2);
+ StringRef SymbolName =
+ cast<MDString>(Symver->getOperand(0))->getString();
+ StringRef Alias = cast<MDString>(Symver->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(SymbolName))
+ continue;
+
+ M.appendModuleInlineAsm(
+ (llvm::Twine(".symver ") + SymbolName + ", " + Alias).str());
+ }
+ }
+ }
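The .symver emission above relies only on module-level inline assembly. A minimal sketch of that mechanism, with hypothetical symbol names, assuming LLVM 12 headers:

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("symver_demo", Ctx);
  // The directive is stored as module asm and emitted verbatim into the
  // object file, binding foo_impl to the versioned name foo@@V1.
  M.appendModuleInlineAsm(".symver foo_impl, foo@@V1");
  M.print(outs(), /*AAW=*/nullptr);
  return 0;
}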
+
+ return true;
+}
+
+PreservedAnalyses LowerTypeTestsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
bool Changed;
if (UseCommandLine)
Changed = LowerTypeTestsModule::runForTesting(M);
@@ -2262,7 +2262,7 @@ PreservedAnalyses LowerTypeTestsPass::run(Module &M,
Changed =
LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
.lower();
- if (!Changed)
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp
index 7ec8de7715..ec5d86b72a 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp
@@ -1,955 +1,955 @@
-//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass looks for equivalent functions that are mergeable and folds them.
-//
-// An order relation is defined on the set of functions. It is established
-// through a special function comparison procedure that returns
-// 0 when the functions are equal,
-// -1 when the left function is less than the right function, and
-// 1 in the opposite case. We need a total ordering, so we need to maintain
-// four properties on the functions set:
-// a <= a (reflexivity)
-// if a <= b and b <= a then a = b (antisymmetry)
-// if a <= b and b <= c then a <= c (transitivity).
-// for all a and b: a <= b or b <= a (totality).
-//
-// The comparison iterates through each instruction in each basic block.
-// Functions are kept in a binary tree. For each new function F we perform
-// a lookup in the binary tree.
-// In practice it works the following way:
-// -- We define a Function* container class with a custom "operator<"
-//    (FunctionPtr).
-// -- "FunctionPtr" instances are stored in a std::set, so every
-//    std::set::insert operation gives a result in log(N) time.
-//
-// As an optimization, a hash of the function structure is calculated first, and
-// two functions are only compared if they have the same hash. This hash is
-// cheap to compute, and has the property that if function F == G according to
-// the comparison function, then hash(F) == hash(G). This consistency property
-// is critical to ensuring all possible merging opportunities are exploited.
-// Collisions in the hash affect the speed of the pass but not the correctness
-// or determinism of the resulting transformation.
-//
-// When a match is found the functions are folded. If both functions are
-// overridable, we move the functionality into a new internal function and
-// leave two overridable thunks to it.
-//
-//===----------------------------------------------------------------------===//
-//
-// Future work:
-//
-// * virtual functions.
-//
-// Many functions have their address taken by the virtual function table for
-// the object they belong to. However, as long as it's only used for a lookup
-// and call, this is irrelevant, and we'd like to fold such functions.
-//
-// * be smarter about bitcasts.
-//
-// In order to fold functions, we will sometimes add either bitcast instructions
-// or bitcast constant expressions. Unfortunately, this can confound further
-// analysis since the two functions differ where one has a bitcast and the
-// other doesn't. We should learn to look through bitcasts.
-//
-// * Compare complex types with pointer types inside.
-// * Compare cross-reference cases.
-// * Compare complex expressions.
-//
-// All three issues above could be described as the ability to prove that
-// fA == fB == fC == fE == fF == fG in the example below:
-//
-// void fA() {
-// fB();
-// }
-// void fB() {
-// fA();
-// }
-//
-// void fE() {
-// fF();
-// }
-// void fF() {
-// fG();
-// }
-// void fG() {
-// fE();
-// }
-//
-// The simplest cross-reference case (fA <--> fB) was implemented in previous
-// versions of MergeFunctions, though it occurred in only two function pairs
-// in the test-suite (which counts >50k functions).
-// The ability to detect complex cross-referencing (e.g.: A->B->C->D->A)
-// could cover many more cases.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/MergeFunctions.h"
-#include "llvm/Transforms/Utils/FunctionComparator.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <set>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mergefunc"
-
-STATISTIC(NumFunctionsMerged, "Number of functions merged");
-STATISTIC(NumThunksWritten, "Number of thunks generated");
-STATISTIC(NumAliasesWritten, "Number of aliases generated");
-STATISTIC(NumDoubleWeak, "Number of new functions created");
-
-static cl::opt<unsigned> NumFunctionsForSanityCheck(
- "mergefunc-sanity",
- cl::desc("How many functions in module could be used for "
- "MergeFunctions pass sanity check. "
- "'0' disables this check. Works only with '-debug' key."),
- cl::init(0), cl::Hidden);
-
-// Under option -mergefunc-preserve-debug-info we:
-// - Do not create a new function for a thunk.
-// - Retain the debug info for a thunk's parameters (and associated
-// instructions for the debug info) from the entry block.
-// Note: -debug will display the algorithm at work.
-// - Create debug-info for the call (to the shared implementation) made by
-// a thunk and its return value.
-// - Erase the rest of the function, retaining the (minimally sized) entry
-// block to create a thunk.
-// - Preserve a thunk's call site to point to the thunk even when both occur
-// within the same translation unit, to aid debuggability. Note that this
-// behaviour differs from the underlying -mergefunc implementation which
-// modifies the thunk's call site to point to the shared implementation
-// when both occur within the same translation unit.
-static cl::opt<bool>
- MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden,
- cl::init(false),
- cl::desc("Preserve debug info in thunk when mergefunc "
- "transformations are made."));
-
-static cl::opt<bool>
- MergeFunctionsAliases("mergefunc-use-aliases", cl::Hidden,
- cl::init(false),
- cl::desc("Allow mergefunc to create aliases"));
-
-namespace {
-
-class FunctionNode {
- mutable AssertingVH<Function> F;
- FunctionComparator::FunctionHash Hash;
-
-public:
- // Note the hash is recalculated potentially multiple times, but it is cheap.
- FunctionNode(Function *F)
- : F(F), Hash(FunctionComparator::functionHash(*F)) {}
-
- Function *getFunc() const { return F; }
- FunctionComparator::FunctionHash getHash() const { return Hash; }
-
- /// Replace the reference to the function F by the function G, assuming their
- /// implementations are equal.
- void replaceBy(Function *G) const {
- F = G;
- }
-};
-
-/// MergeFunctions finds functions which will generate identical machine code,
-/// by considering all pointer types to be equivalent. Once identified,
-/// MergeFunctions will fold them by replacing a call to one with a call to a
-/// bitcast of the other.
-class MergeFunctions {
-public:
- MergeFunctions() : FnTree(FunctionNodeCmp(&GlobalNumbers)) {
- }
-
- bool runOnModule(Module &M);
-
-private:
- // The function comparison operator is provided here so that FunctionNodes do
- // not need to become larger with another pointer.
- class FunctionNodeCmp {
- GlobalNumberState* GlobalNumbers;
-
- public:
- FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {}
-
- bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const {
- // Order first by hashes, then full function comparison.
- if (LHS.getHash() != RHS.getHash())
- return LHS.getHash() < RHS.getHash();
- FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers);
- return FCmp.compare() == -1;
- }
- };
- using FnTreeType = std::set<FunctionNode, FunctionNodeCmp>;
-
- GlobalNumberState GlobalNumbers;
-
- /// A work queue of functions that may have been modified and should be
- /// analyzed again.
- std::vector<WeakTrackingVH> Deferred;
-
-#ifndef NDEBUG
- /// Checks the rules of the order relation introduced on the function set.
- /// Returns true if the sanity check passed, and false if it failed.
- bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
-#endif
-
- /// Insert a ComparableFunction into the FnTree, or merge it away if it's
- /// equal to one that's already present.
- bool insert(Function *NewFunction);
-
- /// Remove a Function from the FnTree and queue it up for a second sweep of
- /// analysis.
- void remove(Function *F);
-
- /// Find the functions that use this Value and remove them from FnTree and
- /// queue the functions.
- void removeUsers(Value *V);
-
- /// Replace all direct calls of Old with calls of New. Will bitcast New if
- /// necessary to make types match.
- void replaceDirectCallers(Function *Old, Function *New);
-
- /// Merge two equivalent functions. Upon completion, G may be deleted, or may
- /// be converted into a thunk. In either case, it should never be visited
- /// again.
- void mergeTwoFunctions(Function *F, Function *G);
-
- /// Fill PDIUnrelatedWL with instructions from the entry block that are
- /// unrelated to parameter related debug info.
- void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
- std::vector<Instruction *> &PDIUnrelatedWL);
-
- /// Erase the rest of the CFG (i.e. barring the entry block).
- void eraseTail(Function *G);
-
- /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
- /// parameter debug info, from the entry block.
- void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL);
-
- /// Replace G with a simple tail call to bitcast(F). Also (unless
- /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
- /// delete G.
- void writeThunk(Function *F, Function *G);
-
- // Replace G with an alias to F (deleting function G)
- void writeAlias(Function *F, Function *G);
-
- // Replace G with an alias to F if possible, otherwise with a thunk to F.
- // Returns false if neither is possible.
- bool writeThunkOrAlias(Function *F, Function *G);
-
- /// Replace function F with function G in the function tree.
- void replaceFunctionInTree(const FunctionNode &FN, Function *G);
-
- /// The set of all distinct functions. Use the insert() and remove() methods
- /// to modify it. The map allows efficient lookup and deferring of Functions.
- FnTreeType FnTree;
-
- // Map functions to the iterators of the FunctionNode which contains them
- // in the FnTree. This must be updated carefully whenever the FnTree is
- // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
- // dangling iterators into FnTree. The invariant that preserves this is that
- // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
- DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree;
-};
-
-class MergeFunctionsLegacyPass : public ModulePass {
-public:
- static char ID;
-
- MergeFunctionsLegacyPass(): ModulePass(ID) {
- initializeMergeFunctionsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- MergeFunctions MF;
- return MF.runOnModule(M);
- }
-};
-
-} // end anonymous namespace
-
-char MergeFunctionsLegacyPass::ID = 0;
-INITIALIZE_PASS(MergeFunctionsLegacyPass, "mergefunc",
- "Merge Functions", false, false)
-
-ModulePass *llvm::createMergeFunctionsPass() {
- return new MergeFunctionsLegacyPass();
-}
-
-PreservedAnalyses MergeFunctionsPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- MergeFunctions MF;
- if (!MF.runOnModule(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-#ifndef NDEBUG
-bool MergeFunctions::doSanityCheck(std::vector<WeakTrackingVH> &Worklist) {
- if (const unsigned Max = NumFunctionsForSanityCheck) {
- unsigned TripleNumber = 0;
- bool Valid = true;
-
- dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
-
- unsigned i = 0;
- for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
- E = Worklist.end();
- I != E && i < Max; ++I, ++i) {
- unsigned j = i;
- for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
- ++J, ++j) {
- Function *F1 = cast<Function>(*I);
- Function *F2 = cast<Function>(*J);
- int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare();
- int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare();
-
- // If F1 <= F2, then F2 >= F1, otherwise report failure.
- if (Res1 != -Res2) {
- dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
- << "\n";
- dbgs() << *F1 << '\n' << *F2 << '\n';
- Valid = false;
- }
-
- if (Res1 == 0)
- continue;
-
- unsigned k = j;
- for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
- ++k, ++K, ++TripleNumber) {
- if (K == J)
- continue;
-
- Function *F3 = cast<Function>(*K);
- int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare();
- int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare();
-
- bool Transitive = true;
-
- if (Res1 != 0 && Res1 == Res4) {
- // F1 > F2, F2 > F3 => F1 > F3
- Transitive = Res3 == Res1;
- } else if (Res3 != 0 && Res3 == -Res4) {
- // F1 > F3, F3 > F2 => F1 > F2
- Transitive = Res3 == Res1;
- } else if (Res4 != 0 && -Res3 == Res4) {
- // F2 > F3, F3 > F1 => F2 > F1
- Transitive = Res4 == -Res1;
- }
-
- if (!Transitive) {
- dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
- << TripleNumber << "\n";
- dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
- << Res4 << "\n";
- dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n';
- Valid = false;
- }
- }
- }
- }
-
- dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
- return Valid;
- }
- return true;
-}
-#endif
-
-/// Check whether \p F is eligible for function merging.
-static bool isEligibleForMerging(Function &F) {
- return !F.isDeclaration() && !F.hasAvailableExternallyLinkage();
-}
-
-bool MergeFunctions::runOnModule(Module &M) {
- bool Changed = false;
-
- // All functions in the module, ordered by hash. Functions with a unique
- // hash value are easily eliminated.
- std::vector<std::pair<FunctionComparator::FunctionHash, Function *>>
- HashedFuncs;
- for (Function &Func : M) {
- if (isEligibleForMerging(Func)) {
- HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func});
- }
- }
-
- llvm::stable_sort(HashedFuncs, less_first());
-
- auto S = HashedFuncs.begin();
- for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) {
- // If the hash value matches the previous value or the next one, we must
- // consider merging it. Otherwise it is dropped and never considered again.
- if ((I != S && std::prev(I)->first == I->first) ||
- (std::next(I) != IE && std::next(I)->first == I->first) ) {
- Deferred.push_back(WeakTrackingVH(I->second));
- }
- }
-
- do {
- std::vector<WeakTrackingVH> Worklist;
- Deferred.swap(Worklist);
-
- LLVM_DEBUG(doSanityCheck(Worklist));
-
- LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n');
- LLVM_DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
-
- // Insert functions and merge them.
- for (WeakTrackingVH &I : Worklist) {
- if (!I)
- continue;
- Function *F = cast<Function>(I);
- if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) {
- Changed |= insert(F);
- }
- }
- LLVM_DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
- } while (!Deferred.empty());
-
- FnTree.clear();
- FNodesInTree.clear();
- GlobalNumbers.clear();
-
- return Changed;
-}
-
-// Replace direct callers of Old with New.
-void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
- Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
- for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) {
- Use *U = &*UI;
- ++UI;
- CallBase *CB = dyn_cast<CallBase>(U->getUser());
- if (CB && CB->isCallee(U)) {
- // Do not copy attributes from the called function to the call-site.
- // Function comparison ensures that the attributes are the same up to
- // type congruences in byval(), in which case we need to keep the byval
- // type of the call-site, not the callee function.
- remove(CB->getFunction());
- U->set(BitcastNew);
- }
- }
-}
-
-// Helper for writeThunk: selects the proper bitcast operation,
-// but is a bit simpler than CastInst::getCastOpcode.
-static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
- Type *SrcTy = V->getType();
- if (SrcTy->isStructTy()) {
- assert(DestTy->isStructTy());
- assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
- Value *Result = UndefValue::get(DestTy);
- for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
- Value *Element = createCast(
- Builder, Builder.CreateExtractValue(V, makeArrayRef(I)),
- DestTy->getStructElementType(I));
-
- Result =
- Builder.CreateInsertValue(Result, Element, makeArrayRef(I));
- }
- return Result;
- }
- assert(!DestTy->isStructTy());
- if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
- return Builder.CreateIntToPtr(V, DestTy);
- else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
- return Builder.CreatePtrToInt(V, DestTy);
- else
- return Builder.CreateBitCast(V, DestTy);
-}
-
-// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
-// parameter debug info, from the entry block.
-void MergeFunctions::eraseInstsUnrelatedToPDI(
- std::vector<Instruction *> &PDIUnrelatedWL) {
- LLVM_DEBUG(
- dbgs() << " Erasing instructions (in reverse order of appearance in "
- "entry block) unrelated to parameter debug info from entry "
- "block: {\n");
- while (!PDIUnrelatedWL.empty()) {
- Instruction *I = PDIUnrelatedWL.back();
- LLVM_DEBUG(dbgs() << " Deleting Instruction: ");
- LLVM_DEBUG(I->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- I->eraseFromParent();
- PDIUnrelatedWL.pop_back();
- }
- LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
- "debug info from entry block. \n");
-}
-
-// Reduce G to its entry block.
-void MergeFunctions::eraseTail(Function *G) {
- std::vector<BasicBlock *> WorklistBB;
- for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
- BBI != BBE; ++BBI) {
- BBI->dropAllReferences();
- WorklistBB.push_back(&*BBI);
- }
- while (!WorklistBB.empty()) {
- BasicBlock *BB = WorklistBB.back();
- BB->eraseFromParent();
- WorklistBB.pop_back();
- }
-}
-
-// We are interested in the following instructions from the entry block as being
-// related to parameter debug info:
-// - @llvm.dbg.declare
-// - stores from the incoming parameters to locations on the stack-frame
-// - allocas that create these locations on the stack-frame
-// - @llvm.dbg.value
-// - the entry block's terminator
-// The rest are unrelated to debug info for the parameters; fill up
-// PDIUnrelatedWL with such instructions.
-void MergeFunctions::filterInstsUnrelatedToPDI(
- BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
- std::set<Instruction *> PDIRelated;
- for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
- BI != BIE; ++BI) {
- if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
- LLVM_DEBUG(dbgs() << " Deciding: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- DILocalVariable *DILocVar = DVI->getVariable();
- if (DILocVar->isParameter()) {
- LLVM_DEBUG(dbgs() << " Include (parameter): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(&*BI);
- } else {
- LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
- LLVM_DEBUG(dbgs() << " Deciding: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- DILocalVariable *DILocVar = DDI->getVariable();
- if (DILocVar->isParameter()) {
- LLVM_DEBUG(dbgs() << " Parameter: ");
- LLVM_DEBUG(DILocVar->print(dbgs()));
- AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
- if (AI) {
- LLVM_DEBUG(dbgs() << " Processing alloca users: ");
- LLVM_DEBUG(dbgs() << "\n");
- for (User *U : AI->users()) {
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (Value *Arg = SI->getValueOperand()) {
- if (dyn_cast<Argument>(Arg)) {
- LLVM_DEBUG(dbgs() << " Include: ");
- LLVM_DEBUG(AI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(AI);
- LLVM_DEBUG(dbgs() << " Include (parameter): ");
- LLVM_DEBUG(SI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(SI);
- LLVM_DEBUG(dbgs() << " Include: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(&*BI);
- } else {
- LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
- LLVM_DEBUG(SI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- } else {
- LLVM_DEBUG(dbgs() << " Defer: ");
- LLVM_DEBUG(U->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- } else {
- LLVM_DEBUG(dbgs() << " Delete (alloca NULL): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- } else {
- LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) {
- LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(&*BI);
- } else {
- LLVM_DEBUG(dbgs() << " Defer: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- LLVM_DEBUG(
- dbgs()
- << " Report parameter debug info related/related instructions: {\n");
- for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
- BI != BE; ++BI) {
-
- Instruction *I = &*BI;
- if (PDIRelated.find(I) == PDIRelated.end()) {
- LLVM_DEBUG(dbgs() << " !PDIRelated: ");
- LLVM_DEBUG(I->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIUnrelatedWL.push_back(I);
- } else {
- LLVM_DEBUG(dbgs() << " PDIRelated: ");
- LLVM_DEBUG(I->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- LLVM_DEBUG(dbgs() << " }\n");
-}
-
-/// Whether this function may be replaced by a forwarding thunk.
-static bool canCreateThunkFor(Function *F) {
- if (F->isVarArg())
- return false;
-
- // Don't merge tiny functions using a thunk, since it can just end up
- // making the function larger.
- if (F->size() == 1) {
- if (F->front().size() <= 2) {
- LLVM_DEBUG(dbgs() << "canCreateThunkFor: " << F->getName()
- << " is too small to bother creating a thunk for\n");
- return false;
- }
- }
- return true;
-}
-
-// Replace G with a simple tail call to bitcast(F). Also (unless
-// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
-// delete G. Under MergeFunctionsPDI, we use G itself for creating
-// the thunk as we preserve the debug info (and associated instructions)
-// from G's entry block pertaining to G's incoming arguments which are
-// passed on as corresponding arguments in the call that G makes to F.
-// For better debuggability, under MergeFunctionsPDI, we do not modify G's
-// call sites to point to F even when within the same translation unit.
-void MergeFunctions::writeThunk(Function *F, Function *G) {
- BasicBlock *GEntryBlock = nullptr;
- std::vector<Instruction *> PDIUnrelatedWL;
- BasicBlock *BB = nullptr;
- Function *NewG = nullptr;
- if (MergeFunctionsPDI) {
- LLVM_DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
- "function as thunk; retain original: "
- << G->getName() << "()\n");
- GEntryBlock = &G->getEntryBlock();
- LLVM_DEBUG(
- dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
- "debug info for "
- << G->getName() << "() {\n");
- filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
- GEntryBlock->getTerminator()->eraseFromParent();
- BB = GEntryBlock;
- } else {
- NewG = Function::Create(G->getFunctionType(), G->getLinkage(),
- G->getAddressSpace(), "", G->getParent());
- NewG->setComdat(G->getComdat());
- BB = BasicBlock::Create(F->getContext(), "", NewG);
- }
-
- IRBuilder<> Builder(BB);
- Function *H = MergeFunctionsPDI ? G : NewG;
- SmallVector<Value *, 16> Args;
- unsigned i = 0;
- FunctionType *FFTy = F->getFunctionType();
- for (Argument &AI : H->args()) {
- Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
- ++i;
- }
-
- CallInst *CI = Builder.CreateCall(F, Args);
- ReturnInst *RI = nullptr;
- CI->setTailCall();
- CI->setCallingConv(F->getCallingConv());
- CI->setAttributes(F->getAttributes());
- if (H->getReturnType()->isVoidTy()) {
- RI = Builder.CreateRetVoid();
- } else {
- RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType()));
- }
-
- if (MergeFunctionsPDI) {
- DISubprogram *DIS = G->getSubprogram();
- if (DIS) {
+//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for equivalent functions that are mergeable and folds them.
+//
+// An order relation is defined on the set of functions. It is established
+// through a special function comparison procedure that returns
+// 0 when the functions are equal,
+// -1 when the left function is less than the right function, and
+// 1 in the opposite case. We need a total ordering, so we need to maintain
+// four properties on the functions set:
+// a <= a (reflexivity)
+// if a <= b and b <= a then a = b (antisymmetry)
+// if a <= b and b <= c then a <= c (transitivity).
+// for all a and b: a <= b or b <= a (totality).
+//
+// The comparison iterates through each instruction in each basic block.
+// Functions are kept in a binary tree. For each new function F we perform
+// a lookup in the binary tree.
+// In practice it works the following way:
+// -- We define a Function* container class with a custom "operator<"
+//    (FunctionPtr).
+// -- "FunctionPtr" instances are stored in a std::set, so every
+//    std::set::insert operation gives a result in log(N) time.
+//
+// As an optimization, a hash of the function structure is calculated first, and
+// two functions are only compared if they have the same hash. This hash is
+// cheap to compute, and has the property that if function F == G according to
+// the comparison function, then hash(F) == hash(G). This consistency property
+// is critical to ensuring all possible merging opportunities are exploited.
+// Collisions in the hash affect the speed of the pass but not the correctness
+// or determinism of the resulting transformation.
+//
+// When a match is found the functions are folded. If both functions are
+// overridable, we move the functionality into a new internal function and
+// leave two overridable thunks to it.
+//
+//===----------------------------------------------------------------------===//
+//
+// Future work:
+//
+// * virtual functions.
+//
+// Many functions have their address taken by the virtual function table for
+// the object they belong to. However, as long as it's only used for a lookup
+// and call, this is irrelevant, and we'd like to fold such functions.
+//
+// * be smarter about bitcasts.
+//
+// In order to fold functions, we will sometimes add either bitcast instructions
+// or bitcast constant expressions. Unfortunately, this can confound further
+// analysis since the two functions differ where one has a bitcast and the
+// other doesn't. We should learn to look through bitcasts.
+//
+// * Compare complex types with pointer types inside.
+// * Compare cross-reference cases.
+// * Compare complex expressions.
+//
+// All three issues above could be described as the ability to prove that
+// fA == fB == fC == fE == fF == fG in the example below:
+//
+// void fA() {
+// fB();
+// }
+// void fB() {
+// fA();
+// }
+//
+// void fE() {
+// fF();
+// }
+// void fF() {
+// fG();
+// }
+// void fG() {
+// fE();
+// }
+//
+// The simplest cross-reference case (fA <--> fB) was implemented in previous
+// versions of MergeFunctions, though it occurred in only two function pairs
+// in the test-suite (which counts >50k functions).
+// The ability to detect complex cross-referencing (e.g.: A->B->C->D->A)
+// could cover many more cases.
+//
+//===----------------------------------------------------------------------===//
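To make the ordering scheme in the comment above concrete: a toy, self-contained sketch in which strings stand in for functions, a three-way comparison returns 0/-1/1, and a std::set built on the derived operator< provides the log(N) insert-and-detect-duplicate behaviour described for FunctionPtr. This is only an analogy; the real pass compares IR, not strings.

#include <iostream>
#include <set>
#include <string>

// Three-way comparison: 0 when equal, -1 when L orders before R, 1 otherwise.
static int compareLike(const std::string &L, const std::string &R) {
  if (L == R)
    return 0;
  return L < R ? -1 : 1;
}

// Strict weak ordering derived from the three-way comparison.
struct CmpAdapter {
  bool operator()(const std::string &L, const std::string &R) const {
    return compareLike(L, R) == -1;
  }
};

int main() {
  std::set<std::string, CmpAdapter> Tree;
  for (const char *F : {"fA", "fB", "fA"}) {
    bool Inserted = Tree.insert(F).second;
    std::cout << F << (Inserted ? ": new entry\n" : ": duplicate, would be merged\n");
  }
  return 0;
}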
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/MergeFunctions.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mergefunc"
+
+STATISTIC(NumFunctionsMerged, "Number of functions merged");
+STATISTIC(NumThunksWritten, "Number of thunks generated");
+STATISTIC(NumAliasesWritten, "Number of aliases generated");
+STATISTIC(NumDoubleWeak, "Number of new functions created");
+
+static cl::opt<unsigned> NumFunctionsForSanityCheck(
+ "mergefunc-sanity",
+ cl::desc("How many functions in module could be used for "
+ "MergeFunctions pass sanity check. "
+ "'0' disables this check. Works only with '-debug' key."),
+ cl::init(0), cl::Hidden);
+
+// Under option -mergefunc-preserve-debug-info we:
+// - Do not create a new function for a thunk.
+// - Retain the debug info for a thunk's parameters (and associated
+// instructions for the debug info) from the entry block.
+// Note: -debug will display the algorithm at work.
+// - Create debug-info for the call (to the shared implementation) made by
+// a thunk and its return value.
+// - Erase the rest of the function, retaining the (minimally sized) entry
+// block to create a thunk.
+// - Preserve a thunk's call site to point to the thunk even when both occur
+// within the same translation unit, to aid debuggability. Note that this
+// behaviour differs from the underlying -mergefunc implementation which
+// modifies the thunk's call site to point to the shared implementation
+// when both occur within the same translation unit.
+static cl::opt<bool>
+ MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden,
+ cl::init(false),
+ cl::desc("Preserve debug info in thunk when mergefunc "
+ "transformations are made."));
+
+static cl::opt<bool>
+ MergeFunctionsAliases("mergefunc-use-aliases", cl::Hidden,
+ cl::init(false),
+ cl::desc("Allow mergefunc to create aliases"));
+
+namespace {
+
+class FunctionNode {
+ mutable AssertingVH<Function> F;
+ FunctionComparator::FunctionHash Hash;
+
+public:
+ // Note the hash is recalculated potentially multiple times, but it is cheap.
+ FunctionNode(Function *F)
+ : F(F), Hash(FunctionComparator::functionHash(*F)) {}
+
+ Function *getFunc() const { return F; }
+ FunctionComparator::FunctionHash getHash() const { return Hash; }
+
+ /// Replace the reference to the function F by the function G, assuming their
+ /// implementations are equal.
+ void replaceBy(Function *G) const {
+ F = G;
+ }
+};
+
+/// MergeFunctions finds functions which will generate identical machine code,
+/// by considering all pointer types to be equivalent. Once identified,
+/// MergeFunctions will fold them by replacing a call to one with a call to a
+/// bitcast of the other.
+class MergeFunctions {
+public:
+ MergeFunctions() : FnTree(FunctionNodeCmp(&GlobalNumbers)) {
+ }
+
+ bool runOnModule(Module &M);
+
+private:
+ // The function comparison operator is provided here so that FunctionNodes do
+ // not need to become larger with another pointer.
+ class FunctionNodeCmp {
+ GlobalNumberState* GlobalNumbers;
+
+ public:
+ FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {}
+
+ bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const {
+ // Order first by hashes, then full function comparison.
+ if (LHS.getHash() != RHS.getHash())
+ return LHS.getHash() < RHS.getHash();
+ FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers);
+ return FCmp.compare() == -1;
+ }
+ };
+ using FnTreeType = std::set<FunctionNode, FunctionNodeCmp>;
+
+ GlobalNumberState GlobalNumbers;
+
+ /// A work queue of functions that may have been modified and should be
+ /// analyzed again.
+ std::vector<WeakTrackingVH> Deferred;
+
+#ifndef NDEBUG
+ /// Checks the rules of the order relation introduced on the function set.
+ /// Returns true if the sanity check passed, and false if it failed.
+ bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
+#endif
+
+ /// Insert a ComparableFunction into the FnTree, or merge it away if it's
+ /// equal to one that's already present.
+ bool insert(Function *NewFunction);
+
+ /// Remove a Function from the FnTree and queue it up for a second sweep of
+ /// analysis.
+ void remove(Function *F);
+
+ /// Find the functions that use this Value and remove them from FnTree and
+ /// queue the functions.
+ void removeUsers(Value *V);
+
+ /// Replace all direct calls of Old with calls of New. Will bitcast New if
+ /// necessary to make types match.
+ void replaceDirectCallers(Function *Old, Function *New);
+
+ /// Merge two equivalent functions. Upon completion, G may be deleted, or may
+ /// be converted into a thunk. In either case, it should never be visited
+ /// again.
+ void mergeTwoFunctions(Function *F, Function *G);
+
+ /// Fill PDIUnrelatedWL with instructions from the entry block that are
+ /// unrelated to parameter related debug info.
+ void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
+ std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Erase the rest of the CFG (i.e. barring the entry block).
+ void eraseTail(Function *G);
+
+ /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
+ /// parameter debug info, from the entry block.
+ void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Replace G with a simple tail call to bitcast(F). Also (unless
+ /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
+ /// delete G.
+ void writeThunk(Function *F, Function *G);
+
+ // Replace G with an alias to F (deleting function G)
+ void writeAlias(Function *F, Function *G);
+
+ // Replace G with an alias to F if possible, otherwise with a thunk to F.
+ // Returns false if neither is possible.
+ bool writeThunkOrAlias(Function *F, Function *G);
+
+ /// Replace function F with function G in the function tree.
+ void replaceFunctionInTree(const FunctionNode &FN, Function *G);
+
+ /// The set of all distinct functions. Use the insert() and remove() methods
+ /// to modify it. The map allows efficient lookup and deferring of Functions.
+ FnTreeType FnTree;
+
+ // Map functions to the iterators of the FunctionNode which contains them
+ // in the FnTree. This must be updated carefully whenever the FnTree is
+ // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
+ // dangling iterators into FnTree. The invariant that preserves this is that
+ // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
+ DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree;
+};
+
+class MergeFunctionsLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ MergeFunctionsLegacyPass(): ModulePass(ID) {
+ initializeMergeFunctionsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ MergeFunctions MF;
+ return MF.runOnModule(M);
+ }
+};
+
+} // end anonymous namespace
+
+char MergeFunctionsLegacyPass::ID = 0;
+INITIALIZE_PASS(MergeFunctionsLegacyPass, "mergefunc",
+ "Merge Functions", false, false)
+
+ModulePass *llvm::createMergeFunctionsPass() {
+ return new MergeFunctionsLegacyPass();
+}
+
+PreservedAnalyses MergeFunctionsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ MergeFunctions MF;
+ if (!MF.runOnModule(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+#ifndef NDEBUG
+bool MergeFunctions::doSanityCheck(std::vector<WeakTrackingVH> &Worklist) {
+ if (const unsigned Max = NumFunctionsForSanityCheck) {
+ unsigned TripleNumber = 0;
+ bool Valid = true;
+
+ dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
+
+ unsigned i = 0;
+ for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
+ E = Worklist.end();
+ I != E && i < Max; ++I, ++i) {
+ unsigned j = i;
+ for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
+ ++J, ++j) {
+ Function *F1 = cast<Function>(*I);
+ Function *F2 = cast<Function>(*J);
+ int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare();
+ int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare();
+
+ // If F1 <= F2, then F2 >= F1, otherwise report failure.
+ if (Res1 != -Res2) {
+ dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
+ << "\n";
+ dbgs() << *F1 << '\n' << *F2 << '\n';
+ Valid = false;
+ }
+
+ if (Res1 == 0)
+ continue;
+
+ unsigned k = j;
+ for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
+ ++k, ++K, ++TripleNumber) {
+ if (K == J)
+ continue;
+
+ Function *F3 = cast<Function>(*K);
+ int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare();
+ int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare();
+
+ bool Transitive = true;
+
+ if (Res1 != 0 && Res1 == Res4) {
+ // F1 > F2, F2 > F3 => F1 > F3
+ Transitive = Res3 == Res1;
+ } else if (Res3 != 0 && Res3 == -Res4) {
+ // F1 > F3, F3 > F2 => F1 > F2
+ Transitive = Res3 == Res1;
+ } else if (Res4 != 0 && -Res3 == Res4) {
+ // F2 > F3, F3 > F1 => F2 > F1
+ Transitive = Res4 == -Res1;
+ }
+
+ if (!Transitive) {
+ dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
+ << TripleNumber << "\n";
+ dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
+ << Res4 << "\n";
+ dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n';
+ Valid = false;
+ }
+ }
+ }
+ }
+
+ dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
+ return Valid;
+ }
+ return true;
+}
+#endif
+
+/// Check whether \p F is eligible for function merging.
+static bool isEligibleForMerging(Function &F) {
+ return !F.isDeclaration() && !F.hasAvailableExternallyLinkage();
+}
+
+bool MergeFunctions::runOnModule(Module &M) {
+ bool Changed = false;
+
+ // All functions in the module, ordered by hash. Functions with a unique
+ // hash value are easily eliminated.
+ std::vector<std::pair<FunctionComparator::FunctionHash, Function *>>
+ HashedFuncs;
+ for (Function &Func : M) {
+ if (isEligibleForMerging(Func)) {
+ HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func});
+ }
+ }
+
+ llvm::stable_sort(HashedFuncs, less_first());
+
+ auto S = HashedFuncs.begin();
+ for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) {
+ // If the hash value matches the previous value or the next one, we must
+ // consider merging it. Otherwise it is dropped and never considered again.
+ if ((I != S && std::prev(I)->first == I->first) ||
+ (std::next(I) != IE && std::next(I)->first == I->first) ) {
+ Deferred.push_back(WeakTrackingVH(I->second));
+ }
+ }
+
+ do {
+ std::vector<WeakTrackingVH> Worklist;
+ Deferred.swap(Worklist);
+
+ LLVM_DEBUG(doSanityCheck(Worklist));
+
+ LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
+
+ // Insert functions and merge them.
+ for (WeakTrackingVH &I : Worklist) {
+ if (!I)
+ continue;
+ Function *F = cast<Function>(I);
+ if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) {
+ Changed |= insert(F);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
+ } while (!Deferred.empty());
+
+ FnTree.clear();
+ FNodesInTree.clear();
+ GlobalNumbers.clear();
+
+ return Changed;
+}
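A plain-C++ sketch of the hash pre-filter performed by runOnModule above: hash every candidate, stable-sort by hash, and keep only entries whose hash matches a neighbour, since a unique hash can never take part in a merge. std::hash of a hypothetical body string stands in for FunctionComparator::functionHash.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<std::size_t, std::string>> Hashed = {
      {std::hash<std::string>{}("ret 0"), "fA"},
      {std::hash<std::string>{}("ret 0"), "fB"},
      {std::hash<std::string>{}("ret 1"), "fC"},
  };
  std::stable_sort(Hashed.begin(), Hashed.end(),
                   [](const auto &L, const auto &R) { return L.first < R.first; });
  for (auto I = Hashed.begin(); I != Hashed.end(); ++I) {
    // Only entries whose hash matches the previous or next one are deferred
    // for the expensive full comparison.
    bool MatchesPrev = I != Hashed.begin() && std::prev(I)->first == I->first;
    bool MatchesNext = std::next(I) != Hashed.end() && std::next(I)->first == I->first;
    std::cout << I->second
              << (MatchesPrev || MatchesNext ? ": kept for full comparison\n"
                                             : ": unique hash, dropped\n");
  }
  return 0;
}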
+
+// Replace direct callers of Old with New.
+void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
+ Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
+ for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) {
+ Use *U = &*UI;
+ ++UI;
+ CallBase *CB = dyn_cast<CallBase>(U->getUser());
+ if (CB && CB->isCallee(U)) {
+ // Do not copy attributes from the called function to the call-site.
+ // Function comparison ensures that the attributes are the same up to
+ // type congruences in byval(), in which case we need to keep the byval
+ // type of the call-site, not the callee function.
+ remove(CB->getFunction());
+ U->set(BitcastNew);
+ }
+ }
+}
+
+// Helper for writeThunk: selects the proper bitcast operation,
+// but is a bit simpler than CastInst::getCastOpcode.
+static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
+ Type *SrcTy = V->getType();
+ if (SrcTy->isStructTy()) {
+ assert(DestTy->isStructTy());
+ assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
+ Value *Result = UndefValue::get(DestTy);
+ for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
+ Value *Element = createCast(
+ Builder, Builder.CreateExtractValue(V, makeArrayRef(I)),
+ DestTy->getStructElementType(I));
+
+ Result =
+ Builder.CreateInsertValue(Result, Element, makeArrayRef(I));
+ }
+ return Result;
+ }
+ assert(!DestTy->isStructTy());
+ if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+ return Builder.CreateIntToPtr(V, DestTy);
+ else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+ return Builder.CreatePtrToInt(V, DestTy);
+ else
+ return Builder.CreateBitCast(V, DestTy);
+}
+
+// Erase the instructions in PDIUnrelatedWL from the entry block; they are
+// unrelated to the parameter debug info.
+void MergeFunctions::eraseInstsUnrelatedToPDI(
+ std::vector<Instruction *> &PDIUnrelatedWL) {
+ LLVM_DEBUG(
+ dbgs() << " Erasing instructions (in reverse order of appearance in "
+ "entry block) unrelated to parameter debug info from entry "
+ "block: {\n");
+ while (!PDIUnrelatedWL.empty()) {
+ Instruction *I = PDIUnrelatedWL.back();
+ LLVM_DEBUG(dbgs() << " Deleting Instruction: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ I->eraseFromParent();
+ PDIUnrelatedWL.pop_back();
+ }
+ LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
+ "debug info from entry block. \n");
+}
+
+// Reduce G to its entry block.
+void MergeFunctions::eraseTail(Function *G) {
+ std::vector<BasicBlock *> WorklistBB;
+ for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
+ BBI != BBE; ++BBI) {
+ BBI->dropAllReferences();
+ WorklistBB.push_back(&*BBI);
+ }
+ while (!WorklistBB.empty()) {
+ BasicBlock *BB = WorklistBB.back();
+ BB->eraseFromParent();
+ WorklistBB.pop_back();
+ }
+}
+
+// We are interested in the following instructions from the entry block as being
+// related to parameter debug info:
+// - @llvm.dbg.declare
+// - stores from the incoming parameters to locations on the stack-frame
+// - allocas that create these locations on the stack-frame
+// - @llvm.dbg.value
+// - the entry block's terminator
+// The rest are unrelated to debug info for the parameters; fill up
+// PDIUnrelatedWL with such instructions.
+void MergeFunctions::filterInstsUnrelatedToPDI(
+ BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
+ std::set<Instruction *> PDIRelated;
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
+ BI != BIE; ++BI) {
+ if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DVI->getVariable();
+ if (DILocVar->isParameter()) {
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DDI->getVariable();
+ if (DILocVar->isParameter()) {
+ LLVM_DEBUG(dbgs() << " Parameter: ");
+ LLVM_DEBUG(DILocVar->print(dbgs()));
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ if (AI) {
+ LLVM_DEBUG(dbgs() << " Processing alloca users: ");
+ LLVM_DEBUG(dbgs() << "\n");
+ for (User *U : AI->users()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (Value *Arg = SI->getValueOperand()) {
+ if (dyn_cast<Argument>(Arg)) {
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(AI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(AI);
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(SI);
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(U->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (alloca NULL): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) {
+ LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ LLVM_DEBUG(
+ dbgs()
+      << " Report parameter debug info related/unrelated instructions: {\n");
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
+ BI != BE; ++BI) {
+
+ Instruction *I = &*BI;
+ if (PDIRelated.find(I) == PDIRelated.end()) {
+ LLVM_DEBUG(dbgs() << " !PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIUnrelatedWL.push_back(I);
+ } else {
+ LLVM_DEBUG(dbgs() << " PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ LLVM_DEBUG(dbgs() << " }\n");
+}
+
+/// Whether this function may be replaced by a forwarding thunk.
+static bool canCreateThunkFor(Function *F) {
+ if (F->isVarArg())
+ return false;
+
+ // Don't merge tiny functions using a thunk, since it can just end up
+ // making the function larger.
+ if (F->size() == 1) {
+ if (F->front().size() <= 2) {
+ LLVM_DEBUG(dbgs() << "canCreateThunkFor: " << F->getName()
+ << " is too small to bother creating a thunk for\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+// Replace G with a simple tail call to bitcast(F). Also (unless
+// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F) and
+// delete G. Under MergeFunctionsPDI, we use G itself for creating
+// the thunk as we preserve the debug info (and associated instructions)
+// from G's entry block pertaining to G's incoming arguments which are
+// passed on as corresponding arguments in the call that G makes to F.
+// For better debuggability, under MergeFunctionsPDI, we do not modify G's
+// call sites to point to F even when within the same translation unit.
+void MergeFunctions::writeThunk(Function *F, Function *G) {
+ BasicBlock *GEntryBlock = nullptr;
+ std::vector<Instruction *> PDIUnrelatedWL;
+ BasicBlock *BB = nullptr;
+ Function *NewG = nullptr;
+ if (MergeFunctionsPDI) {
+ LLVM_DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
+ "function as thunk; retain original: "
+ << G->getName() << "()\n");
+ GEntryBlock = &G->getEntryBlock();
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
+ "debug info for "
+ << G->getName() << "() {\n");
+ filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
+ GEntryBlock->getTerminator()->eraseFromParent();
+ BB = GEntryBlock;
+ } else {
+ NewG = Function::Create(G->getFunctionType(), G->getLinkage(),
+ G->getAddressSpace(), "", G->getParent());
+ NewG->setComdat(G->getComdat());
+ BB = BasicBlock::Create(F->getContext(), "", NewG);
+ }
+
+ IRBuilder<> Builder(BB);
+ Function *H = MergeFunctionsPDI ? G : NewG;
+ SmallVector<Value *, 16> Args;
+ unsigned i = 0;
+ FunctionType *FFTy = F->getFunctionType();
+ for (Argument &AI : H->args()) {
+ Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
+ ++i;
+ }
+
+ CallInst *CI = Builder.CreateCall(F, Args);
+ ReturnInst *RI = nullptr;
+ CI->setTailCall();
+ CI->setCallingConv(F->getCallingConv());
+ CI->setAttributes(F->getAttributes());
+ if (H->getReturnType()->isVoidTy()) {
+ RI = Builder.CreateRetVoid();
+ } else {
+ RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType()));
+ }
+
+ if (MergeFunctionsPDI) {
+ DISubprogram *DIS = G->getSubprogram();
+ if (DIS) {
DebugLoc CIDbgLoc =
DILocation::get(DIS->getContext(), DIS->getScopeLine(), 0, DIS);
DebugLoc RIDbgLoc =
DILocation::get(DIS->getContext(), DIS->getScopeLine(), 0, DIS);
- CI->setDebugLoc(CIDbgLoc);
- RI->setDebugLoc(RIDbgLoc);
- } else {
- LLVM_DEBUG(
- dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
- << G->getName() << "()\n");
- }
- eraseTail(G);
- eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
- LLVM_DEBUG(
- dbgs() << "} // End of parameter related debug info filtering for: "
- << G->getName() << "()\n");
- } else {
- NewG->copyAttributesFrom(G);
- NewG->takeName(G);
- removeUsers(G);
- G->replaceAllUsesWith(NewG);
- G->eraseFromParent();
- }
-
- LLVM_DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
- ++NumThunksWritten;
-}
-
-// Whether this function may be replaced by an alias
-static bool canCreateAliasFor(Function *F) {
- if (!MergeFunctionsAliases || !F->hasGlobalUnnamedAddr())
- return false;
-
- // We should only see linkages supported by aliases here
- assert(F->hasLocalLinkage() || F->hasExternalLinkage()
- || F->hasWeakLinkage() || F->hasLinkOnceLinkage());
- return true;
-}
-
-// Replace G with an alias to F (deleting function G)
-void MergeFunctions::writeAlias(Function *F, Function *G) {
- Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
- PointerType *PtrType = G->getType();
- auto *GA = GlobalAlias::create(
- PtrType->getElementType(), PtrType->getAddressSpace(),
- G->getLinkage(), "", BitcastF, G->getParent());
-
- F->setAlignment(MaybeAlign(std::max(F->getAlignment(), G->getAlignment())));
- GA->takeName(G);
- GA->setVisibility(G->getVisibility());
- GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-
- removeUsers(G);
- G->replaceAllUsesWith(GA);
- G->eraseFromParent();
-
- LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
- ++NumAliasesWritten;
-}
-
-// Replace G with an alias to F if possible, or a thunk to F if
-// profitable. Returns false if neither is the case.
-bool MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
- if (canCreateAliasFor(G)) {
- writeAlias(F, G);
- return true;
- }
- if (canCreateThunkFor(F)) {
- writeThunk(F, G);
- return true;
- }
- return false;
-}
-
-// Merge two equivalent functions. Upon completion, Function G is deleted.
-void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
- if (F->isInterposable()) {
- assert(G->isInterposable());
-
- // Both writeThunkOrAlias() calls below must succeed, either because we can
- // create aliases for G and NewF, or because a thunk for F is profitable.
- // F here has the same signature as NewF below, so that's what we check.
- if (!canCreateThunkFor(F) &&
- (!canCreateAliasFor(F) || !canCreateAliasFor(G)))
- return;
-
- // Make them both thunks to the same internal function.
- Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
- F->getAddressSpace(), "", F->getParent());
- NewF->copyAttributesFrom(F);
- NewF->takeName(F);
- removeUsers(F);
- F->replaceAllUsesWith(NewF);
-
- MaybeAlign MaxAlignment(std::max(G->getAlignment(), NewF->getAlignment()));
-
- writeThunkOrAlias(F, G);
- writeThunkOrAlias(F, NewF);
-
- F->setAlignment(MaxAlignment);
- F->setLinkage(GlobalValue::PrivateLinkage);
- ++NumDoubleWeak;
- ++NumFunctionsMerged;
- } else {
-    // For better debuggability, under MergeFunctionsPDI, we do not modify G's
- // call sites to point to F even when within the same translation unit.
- if (!G->isInterposable() && !MergeFunctionsPDI) {
- if (G->hasGlobalUnnamedAddr()) {
- // G might have been a key in our GlobalNumberState, and it's illegal
- // to replace a key in ValueMap<GlobalValue *> with a non-global.
- GlobalNumbers.erase(G);
- // If G's address is not significant, replace it entirely.
- Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
- removeUsers(G);
- G->replaceAllUsesWith(BitcastF);
- } else {
- // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
- // above).
- replaceDirectCallers(G, F);
- }
- }
-
- // If G was internal then we may have replaced all uses of G with F. If so,
- // stop here and delete G. There's no need for a thunk. (See note on
- // MergeFunctionsPDI above).
- if (G->isDiscardableIfUnused() && G->use_empty() && !MergeFunctionsPDI) {
- G->eraseFromParent();
- ++NumFunctionsMerged;
- return;
- }
-
- if (writeThunkOrAlias(F, G)) {
- ++NumFunctionsMerged;
- }
- }
-}
-
-/// Replace function F by function G.
-void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
- Function *G) {
- Function *F = FN.getFunc();
- assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 &&
- "The two functions must be equal");
-
- auto I = FNodesInTree.find(F);
- assert(I != FNodesInTree.end() && "F should be in FNodesInTree");
- assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G");
-
- FnTreeType::iterator IterToFNInFnTree = I->second;
- assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree.");
- // Remove F -> FN and insert G -> FN
- FNodesInTree.erase(I);
- FNodesInTree.insert({G, IterToFNInFnTree});
- // Replace F with G in FN, which is stored inside the FnTree.
- FN.replaceBy(G);
-}
-
-// Ordering for functions that are equal under FunctionComparator
-static bool isFuncOrderCorrect(const Function *F, const Function *G) {
- if (F->isInterposable() != G->isInterposable()) {
- // Strong before weak, because the weak function may call the strong
- // one, but not the other way around.
- return !F->isInterposable();
- }
- if (F->hasLocalLinkage() != G->hasLocalLinkage()) {
- // External before local, because we definitely have to keep the external
- // function, but may be able to drop the local one.
- return !F->hasLocalLinkage();
- }
- // Impose a total order (by name) on the replacement of functions. This is
- // important when operating on more than one module independently to prevent
- // cycles of thunks calling each other when the modules are linked together.
- return F->getName() <= G->getName();
-}
-
-// Insert a ComparableFunction into the FnTree, or merge it away if equal to one
-// that was already inserted.
-bool MergeFunctions::insert(Function *NewFunction) {
- std::pair<FnTreeType::iterator, bool> Result =
- FnTree.insert(FunctionNode(NewFunction));
-
- if (Result.second) {
- assert(FNodesInTree.count(NewFunction) == 0);
- FNodesInTree.insert({NewFunction, Result.first});
- LLVM_DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName()
- << '\n');
- return false;
- }
-
- const FunctionNode &OldF = *Result.first;
-
- if (!isFuncOrderCorrect(OldF.getFunc(), NewFunction)) {
- // Swap the two functions.
- Function *F = OldF.getFunc();
- replaceFunctionInTree(*Result.first, NewFunction);
- NewFunction = F;
- assert(OldF.getFunc() != F && "Must have swapped the functions.");
- }
-
- LLVM_DEBUG(dbgs() << " " << OldF.getFunc()->getName()
- << " == " << NewFunction->getName() << '\n');
-
- Function *DeleteF = NewFunction;
- mergeTwoFunctions(OldF.getFunc(), DeleteF);
- return true;
-}
-
-// Remove a function from FnTree. If it was already in FnTree, add
-// it to Deferred so that we'll look at it in the next round.
-void MergeFunctions::remove(Function *F) {
- auto I = FNodesInTree.find(F);
- if (I != FNodesInTree.end()) {
- LLVM_DEBUG(dbgs() << "Deferred " << F->getName() << ".\n");
- FnTree.erase(I->second);
- // I->second has been invalidated, remove it from the FNodesInTree map to
- // preserve the invariant.
- FNodesInTree.erase(I);
- Deferred.emplace_back(F);
- }
-}
-
-// For each instruction used by the value, remove() the function that contains
-// the instruction. This should happen right before a call to RAUW.
-void MergeFunctions::removeUsers(Value *V) {
- for (User *U : V->users())
- if (auto *I = dyn_cast<Instruction>(U))
- remove(I->getFunction());
-}
+ CI->setDebugLoc(CIDbgLoc);
+ RI->setDebugLoc(RIDbgLoc);
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
+ << G->getName() << "()\n");
+ }
+ eraseTail(G);
+ eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
+ LLVM_DEBUG(
+ dbgs() << "} // End of parameter related debug info filtering for: "
+ << G->getName() << "()\n");
+ } else {
+ NewG->copyAttributesFrom(G);
+ NewG->takeName(G);
+ removeUsers(G);
+ G->replaceAllUsesWith(NewG);
+ G->eraseFromParent();
+ }
+
+ LLVM_DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
+ ++NumThunksWritten;
+}
+
+// Whether this function may be replaced by an alias
+static bool canCreateAliasFor(Function *F) {
+ if (!MergeFunctionsAliases || !F->hasGlobalUnnamedAddr())
+ return false;
+
+ // We should only see linkages supported by aliases here
+ assert(F->hasLocalLinkage() || F->hasExternalLinkage()
+ || F->hasWeakLinkage() || F->hasLinkOnceLinkage());
+ return true;
+}
+
+// Replace G with an alias to F (deleting function G)
+void MergeFunctions::writeAlias(Function *F, Function *G) {
+ Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+ PointerType *PtrType = G->getType();
+ auto *GA = GlobalAlias::create(
+ PtrType->getElementType(), PtrType->getAddressSpace(),
+ G->getLinkage(), "", BitcastF, G->getParent());
+
+ F->setAlignment(MaybeAlign(std::max(F->getAlignment(), G->getAlignment())));
+ GA->takeName(G);
+ GA->setVisibility(G->getVisibility());
+ GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ removeUsers(G);
+ G->replaceAllUsesWith(GA);
+ G->eraseFromParent();
+
+ LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
+ ++NumAliasesWritten;
+}
+
+// Replace G with an alias to F if possible, or a thunk to F if
+// profitable. Returns false if neither is the case.
+bool MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
+ if (canCreateAliasFor(G)) {
+ writeAlias(F, G);
+ return true;
+ }
+ if (canCreateThunkFor(F)) {
+ writeThunk(F, G);
+ return true;
+ }
+ return false;
+}
+
+// Merge two equivalent functions. Upon completion, Function G is deleted.
+void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
+ if (F->isInterposable()) {
+ assert(G->isInterposable());
+
+ // Both writeThunkOrAlias() calls below must succeed, either because we can
+ // create aliases for G and NewF, or because a thunk for F is profitable.
+ // F here has the same signature as NewF below, so that's what we check.
+ if (!canCreateThunkFor(F) &&
+ (!canCreateAliasFor(F) || !canCreateAliasFor(G)))
+ return;
+
+ // Make them both thunks to the same internal function.
+ Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
+ F->getAddressSpace(), "", F->getParent());
+ NewF->copyAttributesFrom(F);
+ NewF->takeName(F);
+ removeUsers(F);
+ F->replaceAllUsesWith(NewF);
+
+ MaybeAlign MaxAlignment(std::max(G->getAlignment(), NewF->getAlignment()));
+
+ writeThunkOrAlias(F, G);
+ writeThunkOrAlias(F, NewF);
+
+ F->setAlignment(MaxAlignment);
+ F->setLinkage(GlobalValue::PrivateLinkage);
+ ++NumDoubleWeak;
+ ++NumFunctionsMerged;
+ } else {
+    // For better debuggability, under MergeFunctionsPDI, we do not modify G's
+ // call sites to point to F even when within the same translation unit.
+ if (!G->isInterposable() && !MergeFunctionsPDI) {
+ if (G->hasGlobalUnnamedAddr()) {
+ // G might have been a key in our GlobalNumberState, and it's illegal
+ // to replace a key in ValueMap<GlobalValue *> with a non-global.
+ GlobalNumbers.erase(G);
+ // If G's address is not significant, replace it entirely.
+ Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+ removeUsers(G);
+ G->replaceAllUsesWith(BitcastF);
+ } else {
+ // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
+ // above).
+ replaceDirectCallers(G, F);
+ }
+ }
+
+ // If G was internal then we may have replaced all uses of G with F. If so,
+ // stop here and delete G. There's no need for a thunk. (See note on
+ // MergeFunctionsPDI above).
+ if (G->isDiscardableIfUnused() && G->use_empty() && !MergeFunctionsPDI) {
+ G->eraseFromParent();
+ ++NumFunctionsMerged;
+ return;
+ }
+
+ if (writeThunkOrAlias(F, G)) {
+ ++NumFunctionsMerged;
+ }
+ }
+}
+
+/// Replace function F by function G.
+void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
+ Function *G) {
+ Function *F = FN.getFunc();
+ assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 &&
+ "The two functions must be equal");
+
+ auto I = FNodesInTree.find(F);
+ assert(I != FNodesInTree.end() && "F should be in FNodesInTree");
+ assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G");
+
+ FnTreeType::iterator IterToFNInFnTree = I->second;
+ assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree.");
+ // Remove F -> FN and insert G -> FN
+ FNodesInTree.erase(I);
+ FNodesInTree.insert({G, IterToFNInFnTree});
+ // Replace F with G in FN, which is stored inside the FnTree.
+ FN.replaceBy(G);
+}
+
+// Ordering for functions that are equal under FunctionComparator
+static bool isFuncOrderCorrect(const Function *F, const Function *G) {
+ if (F->isInterposable() != G->isInterposable()) {
+ // Strong before weak, because the weak function may call the strong
+ // one, but not the other way around.
+ return !F->isInterposable();
+ }
+ if (F->hasLocalLinkage() != G->hasLocalLinkage()) {
+ // External before local, because we definitely have to keep the external
+ // function, but may be able to drop the local one.
+ return !F->hasLocalLinkage();
+ }
+ // Impose a total order (by name) on the replacement of functions. This is
+ // important when operating on more than one module independently to prevent
+ // cycles of thunks calling each other when the modules are linked together.
+ return F->getName() <= G->getName();
+}
+
+// Insert a ComparableFunction into the FnTree, or merge it away if equal to one
+// that was already inserted.
+bool MergeFunctions::insert(Function *NewFunction) {
+ std::pair<FnTreeType::iterator, bool> Result =
+ FnTree.insert(FunctionNode(NewFunction));
+
+ if (Result.second) {
+ assert(FNodesInTree.count(NewFunction) == 0);
+ FNodesInTree.insert({NewFunction, Result.first});
+ LLVM_DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName()
+ << '\n');
+ return false;
+ }
+
+ const FunctionNode &OldF = *Result.first;
+
+ if (!isFuncOrderCorrect(OldF.getFunc(), NewFunction)) {
+ // Swap the two functions.
+ Function *F = OldF.getFunc();
+ replaceFunctionInTree(*Result.first, NewFunction);
+ NewFunction = F;
+ assert(OldF.getFunc() != F && "Must have swapped the functions.");
+ }
+
+ LLVM_DEBUG(dbgs() << " " << OldF.getFunc()->getName()
+ << " == " << NewFunction->getName() << '\n');
+
+ Function *DeleteF = NewFunction;
+ mergeTwoFunctions(OldF.getFunc(), DeleteF);
+ return true;
+}
+
+// Remove a function from FnTree. If it was already in FnTree, add
+// it to Deferred so that we'll look at it in the next round.
+void MergeFunctions::remove(Function *F) {
+ auto I = FNodesInTree.find(F);
+ if (I != FNodesInTree.end()) {
+ LLVM_DEBUG(dbgs() << "Deferred " << F->getName() << ".\n");
+ FnTree.erase(I->second);
+ // I->second has been invalidated, remove it from the FNodesInTree map to
+ // preserve the invariant.
+ FNodesInTree.erase(I);
+ Deferred.emplace_back(F);
+ }
+}
+
+// For each instruction used by the value, remove() the function that contains
+// the instruction. This should happen right before a call to RAUW.
+void MergeFunctions::removeUsers(Value *V) {
+ for (User *U : V->users())
+ if (auto *I = dyn_cast<Instruction>(U))
+ remove(I->getFunction());
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp
index f213859928..a5ba6edb9a 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1,323 +1,323 @@
-//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// OpenMP specific optimizations:
-//
-// - Deduplication of runtime calls, e.g., omp_get_thread_num.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/OpenMPOpt.h"
-
-#include "llvm/ADT/EnumeratedArray.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OpenMP specific optimizations:
+//
+// - Deduplication of runtime calls, e.g., omp_get_thread_num.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/OpenMPOpt.h"
+
+#include "llvm/ADT/EnumeratedArray.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
-
-using namespace llvm;
-using namespace omp;
-
-#define DEBUG_TYPE "openmp-opt"
-
-static cl::opt<bool> DisableOpenMPOptimizations(
- "openmp-opt-disable", cl::ZeroOrMore,
- cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
- cl::init(false));
-
+
+using namespace llvm;
+using namespace omp;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> DisableOpenMPOptimizations(
+ "openmp-opt-disable", cl::ZeroOrMore,
+ cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
+ cl::init(false));
+
static cl::opt<bool> EnableParallelRegionMerging(
"openmp-opt-enable-merging", cl::ZeroOrMore,
cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
cl::init(false));
-static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
- cl::Hidden);
-static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
- cl::init(false), cl::Hidden);
-
+static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
+ cl::Hidden);
+static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
+ cl::init(false), cl::Hidden);
+
static cl::opt<bool> HideMemoryTransferLatency(
"openmp-hide-memory-transfer-latency",
cl::desc("[WIP] Tries to hide the latency of host to device memory"
" transfers"),
cl::Hidden, cl::init(false));
-STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
- "Number of OpenMP runtime calls deduplicated");
-STATISTIC(NumOpenMPParallelRegionsDeleted,
- "Number of OpenMP parallel regions deleted");
-STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
- "Number of OpenMP runtime functions identified");
-STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
- "Number of OpenMP runtime function uses identified");
-STATISTIC(NumOpenMPTargetRegionKernels,
- "Number of OpenMP target region entry points (=kernels) identified");
-STATISTIC(
- NumOpenMPParallelRegionsReplacedInGPUStateMachine,
- "Number of OpenMP parallel regions replaced with ID in GPU state machines");
+STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
+ "Number of OpenMP runtime calls deduplicated");
+STATISTIC(NumOpenMPParallelRegionsDeleted,
+ "Number of OpenMP parallel regions deleted");
+STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
+ "Number of OpenMP runtime functions identified");
+STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
+ "Number of OpenMP runtime function uses identified");
+STATISTIC(NumOpenMPTargetRegionKernels,
+ "Number of OpenMP target region entry points (=kernels) identified");
+STATISTIC(
+ NumOpenMPParallelRegionsReplacedInGPUStateMachine,
+ "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
"Number of OpenMP parallel regions merged");
-
-#if !defined(NDEBUG)
-static constexpr auto TAG = "[" DEBUG_TYPE "]";
-#endif
-
-namespace {
-
-struct AAICVTracker;
-
-/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
-/// Attributor runs.
-struct OMPInformationCache : public InformationCache {
- OMPInformationCache(Module &M, AnalysisGetter &AG,
- BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
- SmallPtrSetImpl<Kernel> &Kernels)
- : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
- Kernels(Kernels) {
-
- OMPBuilder.initialize();
- initializeRuntimeFunctions();
- initializeInternalControlVars();
- }
-
- /// Generic information that describes an internal control variable.
- struct InternalControlVarInfo {
- /// The kind, as described by InternalControlVar enum.
- InternalControlVar Kind;
-
- /// The name of the ICV.
- StringRef Name;
-
- /// Environment variable associated with this ICV.
- StringRef EnvVarName;
-
- /// Initial value kind.
- ICVInitValue InitKind;
-
- /// Initial value.
- ConstantInt *InitValue;
-
- /// Setter RTL function associated with this ICV.
- RuntimeFunction Setter;
-
- /// Getter RTL function associated with this ICV.
- RuntimeFunction Getter;
-
- /// RTL Function corresponding to the override clause of this ICV
- RuntimeFunction Clause;
- };
-
- /// Generic information that describes a runtime function
- struct RuntimeFunctionInfo {
-
- /// The kind, as described by the RuntimeFunction enum.
- RuntimeFunction Kind;
-
- /// The name of the function.
- StringRef Name;
-
- /// Flag to indicate a variadic function.
- bool IsVarArg;
-
- /// The return type of the function.
- Type *ReturnType;
-
- /// The argument types of the function.
- SmallVector<Type *, 8> ArgumentTypes;
-
- /// The declaration if available.
- Function *Declaration = nullptr;
-
- /// Uses of this runtime function per function containing the use.
- using UseVector = SmallVector<Use *, 16>;
-
- /// Clear UsesMap for runtime function.
- void clearUsesMap() { UsesMap.clear(); }
-
- /// Boolean conversion that is true if the runtime function was found.
- operator bool() const { return Declaration; }
-
- /// Return the vector of uses in function \p F.
- UseVector &getOrCreateUseVector(Function *F) {
- std::shared_ptr<UseVector> &UV = UsesMap[F];
- if (!UV)
- UV = std::make_shared<UseVector>();
- return *UV;
- }
-
- /// Return the vector of uses in function \p F or `nullptr` if there are
- /// none.
- const UseVector *getUseVector(Function &F) const {
- auto I = UsesMap.find(&F);
- if (I != UsesMap.end())
- return I->second.get();
- return nullptr;
- }
-
- /// Return how many functions contain uses of this runtime function.
- size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
-
- /// Return the number of arguments (or the minimal number for variadic
- /// functions).
- size_t getNumArgs() const { return ArgumentTypes.size(); }
-
- /// Run the callback \p CB on each use and forget the use if the result is
- /// true. The callback will be fed the function in which the use was
-    /// encountered as its second argument.
- void foreachUse(SmallVectorImpl<Function *> &SCC,
- function_ref<bool(Use &, Function &)> CB) {
- for (Function *F : SCC)
- foreachUse(CB, F);
- }
-
- /// Run the callback \p CB on each use within the function \p F and forget
- /// the use if the result is true.
- void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
- SmallVector<unsigned, 8> ToBeDeleted;
- ToBeDeleted.clear();
-
- unsigned Idx = 0;
- UseVector &UV = getOrCreateUseVector(F);
-
- for (Use *U : UV) {
- if (CB(*U, *F))
- ToBeDeleted.push_back(Idx);
- ++Idx;
- }
-
- // Remove the to-be-deleted indices in reverse order as prior
- // modifications will not modify the smaller indices.
- while (!ToBeDeleted.empty()) {
- unsigned Idx = ToBeDeleted.pop_back_val();
- UV[Idx] = UV.back();
- UV.pop_back();
- }
- }
-
- private:
- /// Map from functions to all uses of this runtime function contained in
- /// them.
- DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
- };
-
- /// An OpenMP-IR-Builder instance
- OpenMPIRBuilder OMPBuilder;
-
- /// Map from runtime function kind to the runtime function description.
- EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
- RuntimeFunction::OMPRTL___last>
- RFIs;
-
- /// Map from ICV kind to the ICV description.
- EnumeratedArray<InternalControlVarInfo, InternalControlVar,
- InternalControlVar::ICV___last>
- ICVs;
-
- /// Helper to initialize all internal control variable information for those
- /// defined in OMPKinds.def.
- void initializeInternalControlVars() {
-#define ICV_RT_SET(_Name, RTL) \
- { \
- auto &ICV = ICVs[_Name]; \
- ICV.Setter = RTL; \
- }
-#define ICV_RT_GET(Name, RTL) \
- { \
- auto &ICV = ICVs[Name]; \
- ICV.Getter = RTL; \
- }
-#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
- { \
- auto &ICV = ICVs[Enum]; \
- ICV.Name = _Name; \
- ICV.Kind = Enum; \
- ICV.InitKind = Init; \
- ICV.EnvVarName = _EnvVarName; \
- switch (ICV.InitKind) { \
- case ICV_IMPLEMENTATION_DEFINED: \
- ICV.InitValue = nullptr; \
- break; \
- case ICV_ZERO: \
- ICV.InitValue = ConstantInt::get( \
- Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
- break; \
- case ICV_FALSE: \
- ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
- break; \
- case ICV_LAST: \
- break; \
- } \
- }
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
- }
-
- /// Returns true if the function declaration \p F matches the runtime
- /// function types, that is, return type \p RTFRetType, and argument types
- /// \p RTFArgTypes.
- static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
- SmallVector<Type *, 8> &RTFArgTypes) {
- // TODO: We should output information to the user (under debug output
- // and via remarks).
-
- if (!F)
- return false;
- if (F->getReturnType() != RTFRetType)
- return false;
- if (F->arg_size() != RTFArgTypes.size())
- return false;
-
- auto RTFTyIt = RTFArgTypes.begin();
- for (Argument &Arg : F->args()) {
- if (Arg.getType() != *RTFTyIt)
- return false;
-
- ++RTFTyIt;
- }
-
- return true;
- }
-
- // Helper to collect all uses of the declaration in the UsesMap.
- unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
- unsigned NumUses = 0;
- if (!RFI.Declaration)
- return NumUses;
- OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
-
- if (CollectStats) {
- NumOpenMPRuntimeFunctionsIdentified += 1;
- NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
- }
-
- // TODO: We directly convert uses into proper calls and unknown uses.
- for (Use &U : RFI.Declaration->uses()) {
- if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
- if (ModuleSlice.count(UserI->getFunction())) {
- RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
- ++NumUses;
- }
- } else {
- RFI.getOrCreateUseVector(nullptr).push_back(&U);
- ++NumUses;
- }
- }
- return NumUses;
- }
-
+
+#if !defined(NDEBUG)
+static constexpr auto TAG = "[" DEBUG_TYPE "]";
+#endif
+
+namespace {
+
+struct AAICVTracker;
+
+/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
+/// Attributor runs.
+struct OMPInformationCache : public InformationCache {
+ OMPInformationCache(Module &M, AnalysisGetter &AG,
+ BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
+ SmallPtrSetImpl<Kernel> &Kernels)
+ : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
+ Kernels(Kernels) {
+
+ OMPBuilder.initialize();
+ initializeRuntimeFunctions();
+ initializeInternalControlVars();
+ }
+
+ /// Generic information that describes an internal control variable.
+ struct InternalControlVarInfo {
+ /// The kind, as described by InternalControlVar enum.
+ InternalControlVar Kind;
+
+ /// The name of the ICV.
+ StringRef Name;
+
+ /// Environment variable associated with this ICV.
+ StringRef EnvVarName;
+
+ /// Initial value kind.
+ ICVInitValue InitKind;
+
+ /// Initial value.
+ ConstantInt *InitValue;
+
+ /// Setter RTL function associated with this ICV.
+ RuntimeFunction Setter;
+
+ /// Getter RTL function associated with this ICV.
+ RuntimeFunction Getter;
+
+ /// RTL Function corresponding to the override clause of this ICV
+ RuntimeFunction Clause;
+ };
+
+ /// Generic information that describes a runtime function
+ struct RuntimeFunctionInfo {
+
+ /// The kind, as described by the RuntimeFunction enum.
+ RuntimeFunction Kind;
+
+ /// The name of the function.
+ StringRef Name;
+
+ /// Flag to indicate a variadic function.
+ bool IsVarArg;
+
+ /// The return type of the function.
+ Type *ReturnType;
+
+ /// The argument types of the function.
+ SmallVector<Type *, 8> ArgumentTypes;
+
+ /// The declaration if available.
+ Function *Declaration = nullptr;
+
+ /// Uses of this runtime function per function containing the use.
+ using UseVector = SmallVector<Use *, 16>;
+
+ /// Clear UsesMap for runtime function.
+ void clearUsesMap() { UsesMap.clear(); }
+
+ /// Boolean conversion that is true if the runtime function was found.
+ operator bool() const { return Declaration; }
+
+ /// Return the vector of uses in function \p F.
+ UseVector &getOrCreateUseVector(Function *F) {
+ std::shared_ptr<UseVector> &UV = UsesMap[F];
+ if (!UV)
+ UV = std::make_shared<UseVector>();
+ return *UV;
+ }
+
+ /// Return the vector of uses in function \p F or `nullptr` if there are
+ /// none.
+ const UseVector *getUseVector(Function &F) const {
+ auto I = UsesMap.find(&F);
+ if (I != UsesMap.end())
+ return I->second.get();
+ return nullptr;
+ }
+
+ /// Return how many functions contain uses of this runtime function.
+ size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
+
+ /// Return the number of arguments (or the minimal number for variadic
+ /// functions).
+ size_t getNumArgs() const { return ArgumentTypes.size(); }
+
+ /// Run the callback \p CB on each use and forget the use if the result is
+ /// true. The callback will be fed the function in which the use was
+    /// encountered as its second argument.
+ void foreachUse(SmallVectorImpl<Function *> &SCC,
+ function_ref<bool(Use &, Function &)> CB) {
+ for (Function *F : SCC)
+ foreachUse(CB, F);
+ }
+
+ /// Run the callback \p CB on each use within the function \p F and forget
+ /// the use if the result is true.
+ void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
+ SmallVector<unsigned, 8> ToBeDeleted;
+ ToBeDeleted.clear();
+
+ unsigned Idx = 0;
+ UseVector &UV = getOrCreateUseVector(F);
+
+ for (Use *U : UV) {
+ if (CB(*U, *F))
+ ToBeDeleted.push_back(Idx);
+ ++Idx;
+ }
+
+ // Remove the to-be-deleted indices in reverse order as prior
+ // modifications will not modify the smaller indices.
+ while (!ToBeDeleted.empty()) {
+ unsigned Idx = ToBeDeleted.pop_back_val();
+ UV[Idx] = UV.back();
+ UV.pop_back();
+ }
+ }
+
+ private:
+ /// Map from functions to all uses of this runtime function contained in
+ /// them.
+ DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
+ };
+
+ /// An OpenMP-IR-Builder instance
+ OpenMPIRBuilder OMPBuilder;
+
+ /// Map from runtime function kind to the runtime function description.
+ EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
+ RuntimeFunction::OMPRTL___last>
+ RFIs;
+
+ /// Map from ICV kind to the ICV description.
+ EnumeratedArray<InternalControlVarInfo, InternalControlVar,
+ InternalControlVar::ICV___last>
+ ICVs;
+
+ /// Helper to initialize all internal control variable information for those
+ /// defined in OMPKinds.def.
+ void initializeInternalControlVars() {
+#define ICV_RT_SET(_Name, RTL) \
+ { \
+ auto &ICV = ICVs[_Name]; \
+ ICV.Setter = RTL; \
+ }
+#define ICV_RT_GET(Name, RTL) \
+ { \
+ auto &ICV = ICVs[Name]; \
+ ICV.Getter = RTL; \
+ }
+#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
+ { \
+ auto &ICV = ICVs[Enum]; \
+ ICV.Name = _Name; \
+ ICV.Kind = Enum; \
+ ICV.InitKind = Init; \
+ ICV.EnvVarName = _EnvVarName; \
+ switch (ICV.InitKind) { \
+ case ICV_IMPLEMENTATION_DEFINED: \
+ ICV.InitValue = nullptr; \
+ break; \
+ case ICV_ZERO: \
+ ICV.InitValue = ConstantInt::get( \
+ Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
+ break; \
+ case ICV_FALSE: \
+ ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
+ break; \
+ case ICV_LAST: \
+ break; \
+ } \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+
+ /// Returns true if the function declaration \p F matches the runtime
+ /// function types, that is, return type \p RTFRetType, and argument types
+ /// \p RTFArgTypes.
+ static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
+ SmallVector<Type *, 8> &RTFArgTypes) {
+ // TODO: We should output information to the user (under debug output
+ // and via remarks).
+
+ if (!F)
+ return false;
+ if (F->getReturnType() != RTFRetType)
+ return false;
+ if (F->arg_size() != RTFArgTypes.size())
+ return false;
+
+ auto RTFTyIt = RTFArgTypes.begin();
+ for (Argument &Arg : F->args()) {
+ if (Arg.getType() != *RTFTyIt)
+ return false;
+
+ ++RTFTyIt;
+ }
+
+ return true;
+ }
+
+ // Helper to collect all uses of the declaration in the UsesMap.
+ unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
+ unsigned NumUses = 0;
+ if (!RFI.Declaration)
+ return NumUses;
+ OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
+
+ if (CollectStats) {
+ NumOpenMPRuntimeFunctionsIdentified += 1;
+ NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
+ }
+
+ // TODO: We directly convert uses into proper calls and unknown uses.
+ for (Use &U : RFI.Declaration->uses()) {
+ if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
+ if (ModuleSlice.count(UserI->getFunction())) {
+ RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
+ ++NumUses;
+ }
+ } else {
+ RFI.getOrCreateUseVector(nullptr).push_back(&U);
+ ++NumUses;
+ }
+ }
+ return NumUses;
+ }
+
// Helper function to recollect uses of a runtime function.
void recollectUsesForFunction(RuntimeFunction RTF) {
auto &RFI = RFIs[RTF];
@@ -325,73 +325,73 @@ struct OMPInformationCache : public InformationCache {
collectUses(RFI, /*CollectStats*/ false);
}
- // Helper function to recollect uses of all runtime functions.
- void recollectUses() {
+ // Helper function to recollect uses of all runtime functions.
+ void recollectUses() {
for (int Idx = 0; Idx < RFIs.size(); ++Idx)
recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
- }
-
- /// Helper to initialize all runtime function information for those defined
- /// in OpenMPKinds.def.
- void initializeRuntimeFunctions() {
- Module &M = *((*ModuleSlice.begin())->getParent());
-
- // Helper macros for handling __VA_ARGS__ in OMP_RTL
-#define OMP_TYPE(VarName, ...) \
- Type *VarName = OMPBuilder.VarName; \
- (void)VarName;
-
-#define OMP_ARRAY_TYPE(VarName, ...) \
- ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
- (void)VarName##Ty; \
- PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
- (void)VarName##PtrTy;
-
-#define OMP_FUNCTION_TYPE(VarName, ...) \
- FunctionType *VarName = OMPBuilder.VarName; \
- (void)VarName; \
- PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
- (void)VarName##Ptr;
-
-#define OMP_STRUCT_TYPE(VarName, ...) \
- StructType *VarName = OMPBuilder.VarName; \
- (void)VarName; \
- PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
- (void)VarName##Ptr;
-
-#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
- { \
- SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
- Function *F = M.getFunction(_Name); \
- if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
- auto &RFI = RFIs[_Enum]; \
- RFI.Kind = _Enum; \
- RFI.Name = _Name; \
- RFI.IsVarArg = _IsVarArg; \
- RFI.ReturnType = OMPBuilder._ReturnType; \
- RFI.ArgumentTypes = std::move(ArgsTypes); \
- RFI.Declaration = F; \
- unsigned NumUses = collectUses(RFI); \
- (void)NumUses; \
- LLVM_DEBUG({ \
- dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
- << " found\n"; \
- if (RFI.Declaration) \
- dbgs() << TAG << "-> got " << NumUses << " uses in " \
- << RFI.getNumFunctionsWithUses() \
- << " different functions.\n"; \
- }); \
- } \
- }
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
-
- // TODO: We should attach the attributes defined in OMPKinds.def.
- }
-
- /// Collection of known kernels (\see Kernel) in the module.
- SmallPtrSetImpl<Kernel> &Kernels;
-};
-
+ }
+
+ /// Helper to initialize all runtime function information for those defined
+ /// in OpenMPKinds.def.
+ void initializeRuntimeFunctions() {
+ Module &M = *((*ModuleSlice.begin())->getParent());
+
+ // Helper macros for handling __VA_ARGS__ in OMP_RTL
+#define OMP_TYPE(VarName, ...) \
+ Type *VarName = OMPBuilder.VarName; \
+ (void)VarName;
+
+#define OMP_ARRAY_TYPE(VarName, ...) \
+ ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
+ (void)VarName##Ty; \
+ PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
+ (void)VarName##PtrTy;
+
+#define OMP_FUNCTION_TYPE(VarName, ...) \
+ FunctionType *VarName = OMPBuilder.VarName; \
+ (void)VarName; \
+ PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
+ (void)VarName##Ptr;
+
+#define OMP_STRUCT_TYPE(VarName, ...) \
+ StructType *VarName = OMPBuilder.VarName; \
+ (void)VarName; \
+ PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
+ (void)VarName##Ptr;
+
+#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
+ { \
+ SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
+ Function *F = M.getFunction(_Name); \
+ if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
+ auto &RFI = RFIs[_Enum]; \
+ RFI.Kind = _Enum; \
+ RFI.Name = _Name; \
+ RFI.IsVarArg = _IsVarArg; \
+ RFI.ReturnType = OMPBuilder._ReturnType; \
+ RFI.ArgumentTypes = std::move(ArgsTypes); \
+ RFI.Declaration = F; \
+ unsigned NumUses = collectUses(RFI); \
+ (void)NumUses; \
+ LLVM_DEBUG({ \
+ dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
+ << " found\n"; \
+ if (RFI.Declaration) \
+ dbgs() << TAG << "-> got " << NumUses << " uses in " \
+ << RFI.getNumFunctionsWithUses() \
+ << " different functions.\n"; \
+ }); \
+ } \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+
+ // TODO: We should attach the attributes defined in OMPKinds.def.
+ }
+
+ /// Collection of known kernels (\see Kernel) in the module.
+ SmallPtrSetImpl<Kernel> &Kernels;
+};
+
/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
@@ -477,122 +477,122 @@ private:
}
};
-struct OpenMPOpt {
-
- using OptimizationRemarkGetter =
- function_ref<OptimizationRemarkEmitter &(Function *)>;
-
- OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
- OptimizationRemarkGetter OREGetter,
- OMPInformationCache &OMPInfoCache, Attributor &A)
- : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
- OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
-
+struct OpenMPOpt {
+
+ using OptimizationRemarkGetter =
+ function_ref<OptimizationRemarkEmitter &(Function *)>;
+
+ OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
+ OptimizationRemarkGetter OREGetter,
+ OMPInformationCache &OMPInfoCache, Attributor &A)
+ : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
+ OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
+
/// Check if any remarks are enabled for openmp-opt
bool remarksEnabled() {
auto &Ctx = M.getContext();
return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
}
- /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
- bool run() {
- if (SCC.empty())
- return false;
-
- bool Changed = false;
-
- LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
- << " functions in a slice with "
- << OMPInfoCache.ModuleSlice.size() << " functions\n");
-
- if (PrintICVValues)
- printICVs();
- if (PrintOpenMPKernels)
- printKernels();
-
- Changed |= rewriteDeviceCodeStateMachine();
-
- Changed |= runAttributor();
-
- // Recollect uses, in case Attributor deleted any.
- OMPInfoCache.recollectUses();
-
+ /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
+ bool run() {
+ if (SCC.empty())
+ return false;
+
+ bool Changed = false;
+
+ LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
+ << " functions in a slice with "
+ << OMPInfoCache.ModuleSlice.size() << " functions\n");
+
+ if (PrintICVValues)
+ printICVs();
+ if (PrintOpenMPKernels)
+ printKernels();
+
+ Changed |= rewriteDeviceCodeStateMachine();
+
+ Changed |= runAttributor();
+
+ // Recollect uses, in case Attributor deleted any.
+ OMPInfoCache.recollectUses();
+
Changed |= deleteParallelRegions();
if (HideMemoryTransferLatency)
Changed |= hideMemTransfersLatency();
if (remarksEnabled())
analysisGlobalization();
- Changed |= deduplicateRuntimeCalls();
+ Changed |= deduplicateRuntimeCalls();
if (EnableParallelRegionMerging) {
if (mergeParallelRegions()) {
deduplicateRuntimeCalls();
Changed = true;
}
}
-
- return Changed;
- }
-
- /// Print initial ICV values for testing.
- /// FIXME: This should be done from the Attributor once it is added.
- void printICVs() const {
+
+ return Changed;
+ }
+
+ /// Print initial ICV values for testing.
+ /// FIXME: This should be done from the Attributor once it is added.
+ void printICVs() const {
InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
ICV_proc_bind};
-
- for (Function *F : OMPInfoCache.ModuleSlice) {
- for (auto ICV : ICVs) {
- auto ICVInfo = OMPInfoCache.ICVs[ICV];
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
- << " Value: "
- << (ICVInfo.InitValue
- ? ICVInfo.InitValue->getValue().toString(10, true)
- : "IMPLEMENTATION_DEFINED");
- };
-
- emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
- }
- }
- }
-
- /// Print OpenMP GPU kernels for testing.
- void printKernels() const {
- for (Function *F : SCC) {
- if (!OMPInfoCache.Kernels.count(F))
- continue;
-
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "OpenMP GPU kernel "
- << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
- };
-
- emitRemarkOnFunction(F, "OpenMPGPU", Remark);
- }
- }
-
- /// Return the call if \p U is a callee use in a regular call. If \p RFI is
-  /// given, it has to be the callee or a nullptr is returned.
- static CallInst *getCallIfRegularCall(
- Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
- CallInst *CI = dyn_cast<CallInst>(U.getUser());
- if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
- (!RFI || CI->getCalledFunction() == RFI->Declaration))
- return CI;
- return nullptr;
- }
-
-  /// Return the call if \p V is a regular call. If \p RFI is given, it has to
-  /// be the callee or a nullptr is returned.
- static CallInst *getCallIfRegularCall(
- Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
- CallInst *CI = dyn_cast<CallInst>(&V);
- if (CI && !CI->hasOperandBundles() &&
- (!RFI || CI->getCalledFunction() == RFI->Declaration))
- return CI;
- return nullptr;
- }
-
-private:
+
+ for (Function *F : OMPInfoCache.ModuleSlice) {
+ for (auto ICV : ICVs) {
+ auto ICVInfo = OMPInfoCache.ICVs[ICV];
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
+ << " Value: "
+ << (ICVInfo.InitValue
+ ? ICVInfo.InitValue->getValue().toString(10, true)
+ : "IMPLEMENTATION_DEFINED");
+ };
+
+ emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
+ }
+ }
+ }
+
+ /// Print OpenMP GPU kernels for testing.
+ void printKernels() const {
+ for (Function *F : SCC) {
+ if (!OMPInfoCache.Kernels.count(F))
+ continue;
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP GPU kernel "
+ << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
+ };
+
+ emitRemarkOnFunction(F, "OpenMPGPU", Remark);
+ }
+ }
+
+ /// Return the call if \p U is a callee use in a regular call. If \p RFI is
+  /// given, it has to be the callee or a nullptr is returned.
+ static CallInst *getCallIfRegularCall(
+ Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
+ CallInst *CI = dyn_cast<CallInst>(U.getUser());
+ if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
+ (!RFI || CI->getCalledFunction() == RFI->Declaration))
+ return CI;
+ return nullptr;
+ }
+
+  /// Return the call if \p V is a regular call. If \p RFI is given, it has to
+  /// be the callee or a nullptr is returned.
+ static CallInst *getCallIfRegularCall(
+ Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
+ CallInst *CI = dyn_cast<CallInst>(&V);
+ if (CI && !CI->hasOperandBundles() &&
+ (!RFI || CI->getCalledFunction() == RFI->Declaration))
+ return CI;
+ return nullptr;
+ }
+
+private:
/// Merge parallel regions when it is safe.
bool mergeParallelRegions() {
const unsigned CallbackCalleeOperand = 2;
@@ -981,101 +981,101 @@ private:
return Changed;
}
- /// Try to delete parallel regions if possible.
- bool deleteParallelRegions() {
- const unsigned CallbackCalleeOperand = 2;
-
- OMPInformationCache::RuntimeFunctionInfo &RFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
-
- if (!RFI.Declaration)
- return false;
-
- bool Changed = false;
- auto DeleteCallCB = [&](Use &U, Function &) {
- CallInst *CI = getCallIfRegularCall(U);
- if (!CI)
- return false;
- auto *Fn = dyn_cast<Function>(
- CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
- if (!Fn)
- return false;
- if (!Fn->onlyReadsMemory())
- return false;
- if (!Fn->hasFnAttribute(Attribute::WillReturn))
- return false;
-
- LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
- << CI->getCaller()->getName() << "\n");
-
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Parallel region in "
- << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
- << " deleted";
- };
- emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
- Remark);
-
- CGUpdater.removeCallSite(*CI);
- CI->eraseFromParent();
- Changed = true;
- ++NumOpenMPParallelRegionsDeleted;
- return true;
- };
-
- RFI.foreachUse(SCC, DeleteCallCB);
-
- return Changed;
- }
-
- /// Try to eliminate runtime calls by reusing existing ones.
- bool deduplicateRuntimeCalls() {
- bool Changed = false;
-
- RuntimeFunction DeduplicableRuntimeCallIDs[] = {
- OMPRTL_omp_get_num_threads,
- OMPRTL_omp_in_parallel,
- OMPRTL_omp_get_cancellation,
- OMPRTL_omp_get_thread_limit,
- OMPRTL_omp_get_supported_active_levels,
- OMPRTL_omp_get_level,
- OMPRTL_omp_get_ancestor_thread_num,
- OMPRTL_omp_get_team_size,
- OMPRTL_omp_get_active_level,
- OMPRTL_omp_in_final,
- OMPRTL_omp_get_proc_bind,
- OMPRTL_omp_get_num_places,
- OMPRTL_omp_get_num_procs,
- OMPRTL_omp_get_place_num,
- OMPRTL_omp_get_partition_num_places,
- OMPRTL_omp_get_partition_place_nums};
-
- // Global-tid is handled separately.
- SmallSetVector<Value *, 16> GTIdArgs;
- collectGlobalThreadIdArguments(GTIdArgs);
- LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
- << " global thread ID arguments\n");
-
- for (Function *F : SCC) {
- for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
+ /// Try to delete parallel regions if possible.
+ bool deleteParallelRegions() {
+ const unsigned CallbackCalleeOperand = 2;
+
+ OMPInformationCache::RuntimeFunctionInfo &RFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+ if (!RFI.Declaration)
+ return false;
+
+ bool Changed = false;
+ auto DeleteCallCB = [&](Use &U, Function &) {
+ CallInst *CI = getCallIfRegularCall(U);
+ if (!CI)
+ return false;
+ auto *Fn = dyn_cast<Function>(
+ CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
+ if (!Fn)
+ return false;
+ if (!Fn->onlyReadsMemory())
+ return false;
+ if (!Fn->hasFnAttribute(Attribute::WillReturn))
+ return false;
+
+ LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
+ << CI->getCaller()->getName() << "\n");
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region in "
+ << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
+ << " deleted";
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
+ Remark);
+
+ CGUpdater.removeCallSite(*CI);
+ CI->eraseFromParent();
+ Changed = true;
+ ++NumOpenMPParallelRegionsDeleted;
+ return true;
+ };
+
+ RFI.foreachUse(SCC, DeleteCallCB);
+
+ return Changed;
+ }
+
+ /// Try to eliminate runtime calls by reusing existing ones.
+ bool deduplicateRuntimeCalls() {
+ bool Changed = false;
+
+ RuntimeFunction DeduplicableRuntimeCallIDs[] = {
+ OMPRTL_omp_get_num_threads,
+ OMPRTL_omp_in_parallel,
+ OMPRTL_omp_get_cancellation,
+ OMPRTL_omp_get_thread_limit,
+ OMPRTL_omp_get_supported_active_levels,
+ OMPRTL_omp_get_level,
+ OMPRTL_omp_get_ancestor_thread_num,
+ OMPRTL_omp_get_team_size,
+ OMPRTL_omp_get_active_level,
+ OMPRTL_omp_in_final,
+ OMPRTL_omp_get_proc_bind,
+ OMPRTL_omp_get_num_places,
+ OMPRTL_omp_get_num_procs,
+ OMPRTL_omp_get_place_num,
+ OMPRTL_omp_get_partition_num_places,
+ OMPRTL_omp_get_partition_place_nums};
+
+ // Global-tid is handled separately.
+ SmallSetVector<Value *, 16> GTIdArgs;
+ collectGlobalThreadIdArguments(GTIdArgs);
+ LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
+ << " global thread ID arguments\n");
+
+ for (Function *F : SCC) {
+ for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
Changed |= deduplicateRuntimeCalls(
*F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
-
- // __kmpc_global_thread_num is special as we can replace it with an
- // argument in enough cases to make it worth trying.
- Value *GTIdArg = nullptr;
- for (Argument &Arg : F->args())
- if (GTIdArgs.count(&Arg)) {
- GTIdArg = &Arg;
- break;
- }
- Changed |= deduplicateRuntimeCalls(
- *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
- }
-
- return Changed;
- }
-
+
+ // __kmpc_global_thread_num is special as we can replace it with an
+ // argument in enough cases to make it worth trying.
+ Value *GTIdArg = nullptr;
+ for (Argument &Arg : F->args())
+ if (GTIdArgs.count(&Arg)) {
+ GTIdArg = &Arg;
+ break;
+ }
+ Changed |= deduplicateRuntimeCalls(
+ *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
+ }
+
+ return Changed;
+ }
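
The runtime functions listed above are queries whose result cannot change within a single function invocation, which is what makes collapsing repeated calls into one legal. A small self-contained model of that idea, caching the first result of an idempotent query instead of re-entering the runtime (plain C++, not the pass's IR rewriting):

#include <functional>
#include <iostream>
#include <optional>

// Minimal model: within one function, a query such as omp_get_num_threads()
// is assumed to return the same value every time, so the first call's result
// can serve every later call site. Names are illustrative only.
class DedupedQuery {
  std::function<int()> Runtime;    // the real runtime entry point
  std::optional<int> CachedResult; // stands in for the one surviving call

public:
  explicit DedupedQuery(std::function<int()> RT) : Runtime(std::move(RT)) {}

  int get() {
    if (!CachedResult)
      CachedResult = Runtime(); // a single real call ...
    return *CachedResult;       // ... feeds every later "call site"
  }
};

int main() {
  int RuntimeCalls = 0;
  DedupedQuery NumThreads([&RuntimeCalls] { ++RuntimeCalls; return 8; });
  int Sum = NumThreads.get() + NumThreads.get() + NumThreads.get();
  std::cout << "sum " << Sum << " with " << RuntimeCalls << " runtime call(s)\n";
  return 0;
}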
+
/// Tries to hide the latency of runtime calls that involve host to
/// device memory transfers by splitting them into their "issue" and "wait"
/// versions. The "issue" is moved upwards as much as possible. The "wait" is
@@ -1293,294 +1293,294 @@ private:
return true;
}
- static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
- bool GlobalOnly, bool &SingleChoice) {
- if (CurrentIdent == NextIdent)
- return CurrentIdent;
-
- // TODO: Figure out how to actually combine multiple debug locations. For
- // now we just keep an existing one if there is a single choice.
- if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
- SingleChoice = !CurrentIdent;
- return NextIdent;
- }
- return nullptr;
- }
-
- /// Return a `struct ident_t*` value that represents the ones used in the
- /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
- /// return a local `struct ident_t*`. For now, if we cannot find a suitable
- /// return value we create one from scratch. We also do not yet combine
- /// information, e.g., the source locations, see combinedIdentStruct.
- Value *
- getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
- Function &F, bool GlobalOnly) {
- bool SingleChoice = true;
- Value *Ident = nullptr;
- auto CombineIdentStruct = [&](Use &U, Function &Caller) {
- CallInst *CI = getCallIfRegularCall(U, &RFI);
- if (!CI || &F != &Caller)
- return false;
- Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
- /* GlobalOnly */ true, SingleChoice);
- return false;
- };
- RFI.foreachUse(SCC, CombineIdentStruct);
-
- if (!Ident || !SingleChoice) {
- // The IRBuilder uses the insertion block to get to the module, this is
- // unfortunate but we work around it for now.
- if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
- OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
- &F.getEntryBlock(), F.getEntryBlock().begin()));
- // Create a fallback location if none was found.
- // TODO: Use the debug locations of the calls instead.
- Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
- Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
- }
- return Ident;
- }
-
- /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
- /// \p ReplVal if given.
- bool deduplicateRuntimeCalls(Function &F,
- OMPInformationCache::RuntimeFunctionInfo &RFI,
- Value *ReplVal = nullptr) {
- auto *UV = RFI.getUseVector(F);
- if (!UV || UV->size() + (ReplVal != nullptr) < 2)
- return false;
-
- LLVM_DEBUG(
- dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
- << (ReplVal ? " with an existing value\n" : "\n") << "\n");
-
- assert((!ReplVal || (isa<Argument>(ReplVal) &&
- cast<Argument>(ReplVal)->getParent() == &F)) &&
- "Unexpected replacement value!");
-
- // TODO: Use dominance to find a good position instead.
- auto CanBeMoved = [this](CallBase &CB) {
- unsigned NumArgs = CB.getNumArgOperands();
- if (NumArgs == 0)
- return true;
- if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
- return false;
- for (unsigned u = 1; u < NumArgs; ++u)
- if (isa<Instruction>(CB.getArgOperand(u)))
- return false;
- return true;
- };
-
- if (!ReplVal) {
- for (Use *U : *UV)
- if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
- if (!CanBeMoved(*CI))
- continue;
-
- auto Remark = [&](OptimizationRemark OR) {
- auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
- return OR << "OpenMP runtime call "
- << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
- << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
- };
- emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
-
- CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
- ReplVal = CI;
- break;
- }
- if (!ReplVal)
- return false;
- }
-
- // If we use a call as a replacement value we need to make sure the ident is
- // valid at the new location. For now we just pick a global one, either
- // existing and used by one of the calls, or created from scratch.
- if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
- if (CI->getNumArgOperands() > 0 &&
- CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
- Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
- /* GlobalOnly */ true);
- CI->setArgOperand(0, Ident);
- }
- }
-
- bool Changed = false;
- auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
- CallInst *CI = getCallIfRegularCall(U, &RFI);
- if (!CI || CI == ReplVal || &F != &Caller)
- return false;
- assert(CI->getCaller() == &F && "Unexpected call!");
-
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "OpenMP runtime call "
- << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
- };
- emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
-
- CGUpdater.removeCallSite(*CI);
- CI->replaceAllUsesWith(ReplVal);
- CI->eraseFromParent();
- ++NumOpenMPRuntimeCallsDeduplicated;
- Changed = true;
- return true;
- };
- RFI.foreachUse(SCC, ReplaceAndDeleteCB);
-
- return Changed;
- }
-
- /// Collect arguments that represent the global thread id in \p GTIdArgs.
- void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
- // TODO: Below we basically perform a fixpoint iteration with a pessimistic
- // initialization. We could define an AbstractAttribute instead and
- // run the Attributor here once it can be run as an SCC pass.
-
- // Helper to check the argument \p ArgNo at all call sites of \p F for
- // a GTId.
- auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
- if (!F.hasLocalLinkage())
- return false;
- for (Use &U : F.uses()) {
- if (CallInst *CI = getCallIfRegularCall(U)) {
- Value *ArgOp = CI->getArgOperand(ArgNo);
- if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
- getCallIfRegularCall(
- *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
- continue;
- }
- return false;
- }
- return true;
- };
-
- // Helper to identify uses of a GTId as GTId arguments.
- auto AddUserArgs = [&](Value &GTId) {
- for (Use &U : GTId.uses())
- if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
- if (CI->isArgOperand(&U))
- if (Function *Callee = CI->getCalledFunction())
- if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
- GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
- };
-
- // The argument users of __kmpc_global_thread_num calls are GTIds.
- OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
-
- GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
- if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
- AddUserArgs(*CI);
- return false;
- });
-
- // Transitively search for more arguments by looking at the users of the
- // ones we know already. During the search the GTIdArgs vector is extended
- // so we cannot cache the size nor can we use a range based for.
- for (unsigned u = 0; u < GTIdArgs.size(); ++u)
- AddUserArgs(*GTIdArgs[u]);
- }
-
- /// Kernel (=GPU) optimizations and utility functions
- ///
- ///{{
-
- /// Check if \p F is a kernel, hence entry point for target offloading.
- bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
-
- /// Cache to remember the unique kernel for a function.
- DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
-
- /// Find the unique kernel that will execute \p F, if any.
- Kernel getUniqueKernelFor(Function &F);
-
- /// Find the unique kernel that will execute \p I, if any.
- Kernel getUniqueKernelFor(Instruction &I) {
- return getUniqueKernelFor(*I.getFunction());
- }
-
- /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
- /// the cases where we can avoid taking the address of a function.
- bool rewriteDeviceCodeStateMachine();
-
- ///
- ///}}
-
- /// Emit a remark generically
- ///
- /// This template function can be used to generically emit a remark. The
- /// RemarkKind should be one of the following:
- /// - OptimizationRemark to indicate a successful optimization attempt
- /// - OptimizationRemarkMissed to report a failed optimization attempt
- /// - OptimizationRemarkAnalysis to provide additional information about an
- /// optimization attempt
- ///
- /// The remark is built using a callback function provided by the caller that
- /// takes a RemarkKind as input and returns a RemarkKind.
- template <typename RemarkKind,
- typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
- void emitRemark(Instruction *Inst, StringRef RemarkName,
- RemarkCallBack &&RemarkCB) const {
- Function *F = Inst->getParent()->getParent();
- auto &ORE = OREGetter(F);
-
- ORE.emit(
- [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
- }
-
- /// Emit a remark on a function. Since only OptimizationRemark supports this,
- /// it can't be made generic.
- void
- emitRemarkOnFunction(Function *F, StringRef RemarkName,
- function_ref<OptimizationRemark(OptimizationRemark &&)>
- &&RemarkCB) const {
- auto &ORE = OREGetter(F);
-
- ORE.emit([&]() {
- return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
- });
- }
-
- /// The underlying module.
- Module &M;
-
- /// The SCC we are operating on.
- SmallVectorImpl<Function *> &SCC;
-
- /// Callback to update the call graph, the first argument is a removed call,
- /// the second an optional replacement call.
- CallGraphUpdater &CGUpdater;
-
- /// Callback to get an OptimizationRemarkEmitter from a Function *
- OptimizationRemarkGetter OREGetter;
-
- /// OpenMP-specific information cache. Also used for Attributor runs.
- OMPInformationCache &OMPInfoCache;
-
- /// Attributor instance.
- Attributor &A;
-
- /// Helper function to run Attributor on SCC.
- bool runAttributor() {
- if (SCC.empty())
- return false;
-
- registerAAs();
-
- ChangeStatus Changed = A.run();
-
- LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
- << " functions, result: " << Changed << ".\n");
-
- return Changed == ChangeStatus::CHANGED;
- }
-
- /// Populate the Attributor with abstract attribute opportunities in the
- /// function.
- void registerAAs() {
+ static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
+ bool GlobalOnly, bool &SingleChoice) {
+ if (CurrentIdent == NextIdent)
+ return CurrentIdent;
+
+ // TODO: Figure out how to actually combine multiple debug locations. For
+ // now we just keep an existing one if there is a single choice.
+ if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
+ SingleChoice = !CurrentIdent;
+ return NextIdent;
+ }
+ return nullptr;
+ }
+
+ /// Return a `struct ident_t*` value that represents the ones used in the
+ /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
+ /// return a local `struct ident_t*`. For now, if we cannot find a suitable
+ /// return value we create one from scratch. We also do not yet combine
+ /// information, e.g., the source locations, see combinedIdentStruct.
+ Value *
+ getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
+ Function &F, bool GlobalOnly) {
+ bool SingleChoice = true;
+ Value *Ident = nullptr;
+ auto CombineIdentStruct = [&](Use &U, Function &Caller) {
+ CallInst *CI = getCallIfRegularCall(U, &RFI);
+ if (!CI || &F != &Caller)
+ return false;
+ Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
+ /* GlobalOnly */ true, SingleChoice);
+ return false;
+ };
+ RFI.foreachUse(SCC, CombineIdentStruct);
+
+ if (!Ident || !SingleChoice) {
+ // The IRBuilder uses the insertion block to get to the module, this is
+ // unfortunate but we work around it for now.
+ if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
+ OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
+ &F.getEntryBlock(), F.getEntryBlock().begin()));
+ // Create a fallback location if none was found.
+ // TODO: Use the debug locations of the calls instead.
+ Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
+ Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
+ }
+ return Ident;
+ }
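
getCombinedIdentFromCallUsesIn keeps a candidate `ident_t*` only when it is the single choice across all call sites and otherwise builds a default source location. A simplified, standalone version of that "single choice or fallback" selection, with strings standing in for the ident values:

#include <iostream>
#include <string>
#include <vector>

// Keep a candidate only if every call site agrees on it (or offers nothing);
// a second distinct candidate forces the fallback. This mirrors the
// SingleChoice bookkeeping above in a reduced form.
static std::string combineIdents(const std::vector<std::string> &Candidates,
                                 const std::string &Fallback) {
  std::string Chosen;
  bool SingleChoice = true;
  for (const std::string &Next : Candidates) {
    if (Next == Chosen)
      continue;
    SingleChoice = Chosen.empty(); // a second distinct value kills uniqueness
    Chosen = Next;
  }
  return (!Chosen.empty() && SingleChoice) ? Chosen : Fallback;
}

int main() {
  std::cout << combineIdents({"loc.A", "loc.A"}, "loc.default") << "\n"; // loc.A
  std::cout << combineIdents({"loc.A", "loc.B"}, "loc.default") << "\n"; // loc.default
  return 0;
}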
+
+ /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
+ /// \p ReplVal if given.
+ bool deduplicateRuntimeCalls(Function &F,
+ OMPInformationCache::RuntimeFunctionInfo &RFI,
+ Value *ReplVal = nullptr) {
+ auto *UV = RFI.getUseVector(F);
+ if (!UV || UV->size() + (ReplVal != nullptr) < 2)
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
+ << (ReplVal ? " with an existing value\n" : "\n") << "\n");
+
+ assert((!ReplVal || (isa<Argument>(ReplVal) &&
+ cast<Argument>(ReplVal)->getParent() == &F)) &&
+ "Unexpected replacement value!");
+
+ // TODO: Use dominance to find a good position instead.
+ auto CanBeMoved = [this](CallBase &CB) {
+ unsigned NumArgs = CB.getNumArgOperands();
+ if (NumArgs == 0)
+ return true;
+ if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
+ return false;
+ for (unsigned u = 1; u < NumArgs; ++u)
+ if (isa<Instruction>(CB.getArgOperand(u)))
+ return false;
+ return true;
+ };
+
+ if (!ReplVal) {
+ for (Use *U : *UV)
+ if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
+ if (!CanBeMoved(*CI))
+ continue;
+
+ auto Remark = [&](OptimizationRemark OR) {
+ auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
+ return OR << "OpenMP runtime call "
+ << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
+ << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
+
+ CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
+ ReplVal = CI;
+ break;
+ }
+ if (!ReplVal)
+ return false;
+ }
+
+ // If we use a call as a replacement value we need to make sure the ident is
+ // valid at the new location. For now we just pick a global one, either
+ // existing and used by one of the calls, or created from scratch.
+ if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
+ if (CI->getNumArgOperands() > 0 &&
+ CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
+ Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
+ /* GlobalOnly */ true);
+ CI->setArgOperand(0, Ident);
+ }
+ }
+
+ bool Changed = false;
+ auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
+ CallInst *CI = getCallIfRegularCall(U, &RFI);
+ if (!CI || CI == ReplVal || &F != &Caller)
+ return false;
+ assert(CI->getCaller() == &F && "Unexpected call!");
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP runtime call "
+ << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
+
+ CGUpdater.removeCallSite(*CI);
+ CI->replaceAllUsesWith(ReplVal);
+ CI->eraseFromParent();
+ ++NumOpenMPRuntimeCallsDeduplicated;
+ Changed = true;
+ return true;
+ };
+ RFI.foreachUse(SCC, ReplaceAndDeleteCB);
+
+ return Changed;
+ }
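
One detail worth calling out is the CanBeMoved check: a representative call may be hoisted to the function entry only if all of its arguments are already available there. A reduced model of that test follows; the operand kinds are illustrative placeholders, and the special handling of the leading ident operand is omitted:

#include <iostream>
#include <vector>

// A call can be hoisted to the function entry only if every argument is
// already available there, i.e. no argument is produced by an instruction
// inside the body. Operand kinds are placeholders, not LLVM value classes.
enum class OperandKind { Constant, FunctionArgument, Instruction };

static bool canHoistToEntry(const std::vector<OperandKind> &Args) {
  for (OperandKind K : Args)
    if (K == OperandKind::Instruction)
      return false; // defined later in the body; hoisting would break it
  return true;
}

int main() {
  std::cout << canHoistToEntry({OperandKind::Constant,
                                OperandKind::FunctionArgument}) // 1
            << canHoistToEntry({OperandKind::Instruction})      // 0
            << "\n";
  return 0;
}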
+
+ /// Collect arguments that represent the global thread id in \p GTIdArgs.
+ void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
+ // TODO: Below we basically perform a fixpoint iteration with a pessimistic
+ // initialization. We could define an AbstractAttribute instead and
+ // run the Attributor here once it can be run as an SCC pass.
+
+ // Helper to check the argument \p ArgNo at all call sites of \p F for
+ // a GTId.
+ auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
+ if (!F.hasLocalLinkage())
+ return false;
+ for (Use &U : F.uses()) {
+ if (CallInst *CI = getCallIfRegularCall(U)) {
+ Value *ArgOp = CI->getArgOperand(ArgNo);
+ if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
+ getCallIfRegularCall(
+ *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
+ continue;
+ }
+ return false;
+ }
+ return true;
+ };
+
+ // Helper to identify uses of a GTId as GTId arguments.
+ auto AddUserArgs = [&](Value &GTId) {
+ for (Use &U : GTId.uses())
+ if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
+ if (CI->isArgOperand(&U))
+ if (Function *Callee = CI->getCalledFunction())
+ if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
+ GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
+ };
+
+ // The argument users of __kmpc_global_thread_num calls are GTIds.
+ OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
+
+ GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
+ if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
+ AddUserArgs(*CI);
+ return false;
+ });
+
+ // Transitively search for more arguments by looking at the users of the
+ // ones we know already. During the search the GTIdArgs vector is extended
+ // so we cannot cache the size nor can we use a range based for.
+ for (unsigned u = 0; u < GTIdArgs.size(); ++u)
+ AddUserArgs(*GTIdArgs[u]);
+ }
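
The collection above is a pessimistic fixpoint: seed the set with direct results of __kmpc_global_thread_num, then repeatedly add callee parameters that are only ever fed known thread IDs until nothing new appears. A toy worklist version of that transitive closure over a made-up flow graph:

#include <cstddef>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Toy flow graph: value -> callee parameters it is passed to. Edges are
  // assumed to be the only way those parameters receive values, matching
  // what CallArgOpIsGTId verifies for internal functions.
  std::map<std::string, std::vector<std::string>> FlowsTo = {
      {"tid.call", {"outlined.gtid"}},     // a __kmpc_global_thread_num result
      {"outlined.gtid", {"helper.gtid"}}}; // forwarded one level deeper

  std::vector<std::string> Worklist = {"tid.call"}; // the seeds
  std::set<std::string> GTIds(Worklist.begin(), Worklist.end());

  // The set only grows, so indexing while appending is safe and terminates.
  for (std::size_t I = 0; I < Worklist.size(); ++I)
    for (const std::string &Next : FlowsTo[Worklist[I]])
      if (GTIds.insert(Next).second)
        Worklist.push_back(Next);

  for (const std::string &V : GTIds)
    std::cout << V << " is a global thread id\n";
  return 0;
}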
+
+ /// Kernel (=GPU) optimizations and utility functions
+ ///
+ ///{{
+
+ /// Check if \p F is a kernel, hence entry point for target offloading.
+ bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
+
+ /// Cache to remember the unique kernel for a function.
+ DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
+
+ /// Find the unique kernel that will execute \p F, if any.
+ Kernel getUniqueKernelFor(Function &F);
+
+ /// Find the unique kernel that will execute \p I, if any.
+ Kernel getUniqueKernelFor(Instruction &I) {
+ return getUniqueKernelFor(*I.getFunction());
+ }
+
+ /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
+ /// the cases where we can avoid taking the address of a function.
+ bool rewriteDeviceCodeStateMachine();
+
+ ///
+ ///}}
+
+ /// Emit a remark generically
+ ///
+ /// This template function can be used to generically emit a remark. The
+ /// RemarkKind should be one of the following:
+ /// - OptimizationRemark to indicate a successful optimization attempt
+ /// - OptimizationRemarkMissed to report a failed optimization attempt
+ /// - OptimizationRemarkAnalysis to provide additional information about an
+ /// optimization attempt
+ ///
+ /// The remark is built using a callback function provided by the caller that
+ /// takes a RemarkKind as input and returns a RemarkKind.
+ template <typename RemarkKind,
+ typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
+ void emitRemark(Instruction *Inst, StringRef RemarkName,
+ RemarkCallBack &&RemarkCB) const {
+ Function *F = Inst->getParent()->getParent();
+ auto &ORE = OREGetter(F);
+
+ ORE.emit(
+ [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
+ }
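
The point of taking a callback rather than a finished message is that the remark text is only built when the emitter is actually enabled. A tiny generic model of that lazy-emission pattern (Remark and Emitter are placeholders, not the LLVM remark machinery):

#include <iostream>
#include <string>

// Minimal model of callback-based remark emission: the message is assembled
// by a caller-supplied callback that only runs if emission is enabled.
struct Remark {
  std::string Pass, Name, Text;
  Remark &operator<<(const std::string &S) {
    Text += S;
    return *this;
  }
};

class Emitter {
  bool Enabled;

public:
  explicit Emitter(bool E) : Enabled(E) {}

  template <typename CallbackT> void emit(CallbackT &&CB) {
    if (!Enabled)
      return; // the callback, and all its string building, is skipped
    Remark R = CB();
    std::cout << "[" << R.Pass << ":" << R.Name << "] " << R.Text << "\n";
  }
};

int main() {
  Emitter ORE(/*Enabled=*/true);
  ORE.emit([]() -> Remark {
    Remark R{"openmp-opt", "Demo"};
    R << "OpenMP runtime call " << "deduplicated";
    return R;
  });
  return 0;
}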
+
+ /// Emit a remark on a function. Since only OptimizationRemark supports this,
+ /// it can't be made generic.
+ void
+ emitRemarkOnFunction(Function *F, StringRef RemarkName,
+ function_ref<OptimizationRemark(OptimizationRemark &&)>
+ &&RemarkCB) const {
+ auto &ORE = OREGetter(F);
+
+ ORE.emit([&]() {
+ return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
+ });
+ }
+
+ /// The underlying module.
+ Module &M;
+
+ /// The SCC we are operating on.
+ SmallVectorImpl<Function *> &SCC;
+
+ /// Callback to update the call graph, the first argument is a removed call,
+ /// the second an optional replacement call.
+ CallGraphUpdater &CGUpdater;
+
+ /// Callback to get an OptimizationRemarkEmitter from a Function *
+ OptimizationRemarkGetter OREGetter;
+
+ /// OpenMP-specific information cache. Also used for Attributor runs.
+ OMPInformationCache &OMPInfoCache;
+
+ /// Attributor instance.
+ Attributor &A;
+
+ /// Helper function to run Attributor on SCC.
+ bool runAttributor() {
+ if (SCC.empty())
+ return false;
+
+ registerAAs();
+
+ ChangeStatus Changed = A.run();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
+ << " functions, result: " << Changed << ".\n");
+
+ return Changed == ChangeStatus::CHANGED;
+ }
+
+ /// Populate the Attributor with abstract attribute opportunities in the
+ /// function.
+ void registerAAs() {
if (SCC.empty())
return;
-
+
// Create CallSite AA for all Getters.
for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
@@ -1600,29 +1600,29 @@ private:
};
GetterRFI.foreachUse(SCC, CreateAA);
- }
- }
-};
-
-Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
- if (!OMPInfoCache.ModuleSlice.count(&F))
- return nullptr;
-
- // Use a scope to keep the lifetime of the CachedKernel short.
- {
- Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
- if (CachedKernel)
- return *CachedKernel;
-
- // TODO: We should use an AA to create an (optimistic and callback
- // call-aware) call graph. For now we stick to simple patterns that
- // are less powerful, basically the worst fixpoint.
- if (isKernel(F)) {
- CachedKernel = Kernel(&F);
- return *CachedKernel;
- }
-
- CachedKernel = nullptr;
+ }
+ }
+};
+
+Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
+ if (!OMPInfoCache.ModuleSlice.count(&F))
+ return nullptr;
+
+ // Use a scope to keep the lifetime of the CachedKernel short.
+ {
+ Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
+ if (CachedKernel)
+ return *CachedKernel;
+
+ // TODO: We should use an AA to create an (optimistic and callback
+ // call-aware) call graph. For now we stick to simple patterns that
+ // are less powerful, basically the worst fixpoint.
+ if (isKernel(F)) {
+ CachedKernel = Kernel(&F);
+ return *CachedKernel;
+ }
+
+ CachedKernel = nullptr;
if (!F.hasLocalLinkage()) {
// See https://openmp.llvm.org/remarks/OptimizationRemarks.html
@@ -1631,206 +1631,206 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
};
emitRemarkOnFunction(&F, "OMP100", Remark);
- return nullptr;
+ return nullptr;
}
- }
-
- auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
- if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
- // Allow use in equality comparisons.
- if (Cmp->isEquality())
- return getUniqueKernelFor(*Cmp);
- return nullptr;
- }
- if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
- // Allow direct calls.
- if (CB->isCallee(&U))
- return getUniqueKernelFor(*CB);
- // Allow the use in __kmpc_kernel_prepare_parallel calls.
- if (Function *Callee = CB->getCalledFunction())
- if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
- return getUniqueKernelFor(*CB);
- return nullptr;
- }
- // Disallow every other use.
- return nullptr;
- };
-
- // TODO: In the future we want to track more than just a unique kernel.
- SmallPtrSet<Kernel, 2> PotentialKernels;
+ }
+
+ auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
+ if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+ // Allow use in equality comparisons.
+ if (Cmp->isEquality())
+ return getUniqueKernelFor(*Cmp);
+ return nullptr;
+ }
+ if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
+ // Allow direct calls.
+ if (CB->isCallee(&U))
+ return getUniqueKernelFor(*CB);
+ // Allow the use in __kmpc_kernel_prepare_parallel calls.
+ if (Function *Callee = CB->getCalledFunction())
+ if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
+ return getUniqueKernelFor(*CB);
+ return nullptr;
+ }
+ // Disallow every other use.
+ return nullptr;
+ };
+
+ // TODO: In the future we want to track more than just a unique kernel.
+ SmallPtrSet<Kernel, 2> PotentialKernels;
OMPInformationCache::foreachUse(F, [&](const Use &U) {
- PotentialKernels.insert(GetUniqueKernelForUse(U));
- });
-
- Kernel K = nullptr;
- if (PotentialKernels.size() == 1)
- K = *PotentialKernels.begin();
-
- // Cache the result.
- UniqueKernelMap[&F] = K;
-
- return K;
-}
-
-bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
- OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
-
- bool Changed = false;
- if (!KernelPrepareParallelRFI)
- return Changed;
-
- for (Function *F : SCC) {
-
- // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
- // all.
- bool UnknownUse = false;
- bool KernelPrepareUse = false;
- unsigned NumDirectCalls = 0;
-
- SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
+ PotentialKernels.insert(GetUniqueKernelForUse(U));
+ });
+
+ Kernel K = nullptr;
+ if (PotentialKernels.size() == 1)
+ K = *PotentialKernels.begin();
+
+ // Cache the result.
+ UniqueKernelMap[&F] = K;
+
+ return K;
+}
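
The lookup above memoizes per function and uses the Optional map value to distinguish "never computed" from "computed, but there is no unique kernel". A generic sketch of that caching shape, with strings instead of Function and Kernel pointers and an empty string as the "no unique kernel" sentinel:

#include <iostream>
#include <map>
#include <optional>
#include <string>

// Cache with a three-way state per key: no entry / entry known to have no
// unique kernel / entry naming the unique kernel. Names are illustrative.
class UniqueKernelCache {
  std::map<std::string, std::optional<std::string>> Cache;
  int Recomputations = 0;

  std::string compute(const std::string &Fn) {
    ++Recomputations; // pretend this walked all uses of Fn
    return Fn == "parallel.body" ? "kernel_A" : "";
  }

public:
  std::string get(const std::string &Fn) {
    std::optional<std::string> &Slot = Cache[Fn];
    if (!Slot)
      Slot = compute(Fn); // negative answers are cached too
    return *Slot;
  }
  int recomputations() const { return Recomputations; }
};

int main() {
  UniqueKernelCache C;
  std::cout << C.get("parallel.body") << "\n"; // computed
  std::cout << C.get("parallel.body") << "\n"; // served from the cache
  std::cout << "computed " << C.recomputations() << " time(s)\n";
  return 0;
}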
+
+bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
+ OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
+
+ bool Changed = false;
+ if (!KernelPrepareParallelRFI)
+ return Changed;
+
+ for (Function *F : SCC) {
+
+ // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
+ // all.
+ bool UnknownUse = false;
+ bool KernelPrepareUse = false;
+ unsigned NumDirectCalls = 0;
+
+ SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
OMPInformationCache::foreachUse(*F, [&](Use &U) {
- if (auto *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U)) {
- ++NumDirectCalls;
- return;
- }
-
- if (isa<ICmpInst>(U.getUser())) {
- ToBeReplacedStateMachineUses.push_back(&U);
- return;
- }
- if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
- *U.getUser(), &KernelPrepareParallelRFI)) {
- KernelPrepareUse = true;
- ToBeReplacedStateMachineUses.push_back(&U);
- return;
- }
- UnknownUse = true;
- });
-
- // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
- // use.
- if (!KernelPrepareUse)
- continue;
-
- {
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Found a parallel region that is called in a target "
- "region but not part of a combined target construct nor "
- "nested inside a target construct without intermediate "
- "code. This can lead to excessive register usage for "
- "unrelated target regions in the same translation unit "
- "due to spurious call edges assumed by ptxas.";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
- }
-
- // If this ever hits, we should investigate.
- // TODO: Checking the number of uses is not a necessary restriction and
- // should be lifted.
- if (UnknownUse || NumDirectCalls != 1 ||
- ToBeReplacedStateMachineUses.size() != 2) {
- {
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Parallel region is used in "
- << (UnknownUse ? "unknown" : "unexpected")
- << " ways; will not attempt to rewrite the state machine.";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
- }
- continue;
- }
-
- // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
- // up if the function is not called from a unique kernel.
- Kernel K = getUniqueKernelFor(*F);
- if (!K) {
- {
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Parallel region is not known to be called from a "
- "unique single target region, maybe the surrounding "
- "function has external linkage?; will not attempt to "
- "rewrite the state machine use.";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
- Remark);
- }
- continue;
- }
-
- // We now know F is a parallel body function called only from the kernel K.
- // We also identified the state machine uses in which we replace the
- // function pointer by a new global symbol for identification purposes. This
- // ensures only direct calls to the function are left.
-
- {
- auto RemarkParalleRegion = [&](OptimizationRemark OR) {
- return OR << "Specialize parallel region that is only reached from a "
- "single target region to avoid spurious call edges and "
- "excessive register usage in other target regions. "
- "(parallel region ID: "
- << ore::NV("OpenMPParallelRegion", F->getName())
- << ", kernel ID: "
- << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
- RemarkParalleRegion);
- auto RemarkKernel = [&](OptimizationRemark OR) {
- return OR << "Target region containing the parallel region that is "
- "specialized. (parallel region ID: "
- << ore::NV("OpenMPParallelRegion", F->getName())
- << ", kernel ID: "
- << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
- };
- emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
- }
-
- Module &M = *F->getParent();
- Type *Int8Ty = Type::getInt8Ty(M.getContext());
-
- auto *ID = new GlobalVariable(
- M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
- UndefValue::get(Int8Ty), F->getName() + ".ID");
-
- for (Use *U : ToBeReplacedStateMachineUses)
- U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
-
- ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
-
- Changed = true;
- }
-
- return Changed;
-}
-
-/// Abstract Attribute for tracking ICV values.
-struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
-
+ if (auto *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U)) {
+ ++NumDirectCalls;
+ return;
+ }
+
+ if (isa<ICmpInst>(U.getUser())) {
+ ToBeReplacedStateMachineUses.push_back(&U);
+ return;
+ }
+ if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
+ *U.getUser(), &KernelPrepareParallelRFI)) {
+ KernelPrepareUse = true;
+ ToBeReplacedStateMachineUses.push_back(&U);
+ return;
+ }
+ UnknownUse = true;
+ });
+
+ // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
+ // use.
+ if (!KernelPrepareUse)
+ continue;
+
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Found a parallel region that is called in a target "
+ "region but not part of a combined target construct nor "
+ "nested inside a target construct without intermediate "
+ "code. This can lead to excessive register usage for "
+ "unrelated target regions in the same translation unit "
+ "due to spurious call edges assumed by ptxas.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+ }
+
+ // If this ever hits, we should investigate.
+ // TODO: Checking the number of uses is not a necessary restriction and
+ // should be lifted.
+ if (UnknownUse || NumDirectCalls != 1 ||
+ ToBeReplacedStateMachineUses.size() != 2) {
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region is used in "
+ << (UnknownUse ? "unknown" : "unexpected")
+ << " ways; will not attempt to rewrite the state machine.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+ }
+ continue;
+ }
+
+ // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
+ // up if the function is not called from a unique kernel.
+ Kernel K = getUniqueKernelFor(*F);
+ if (!K) {
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region is not known to be called from a "
+ "unique single target region, maybe the surrounding "
+ "function has external linkage?; will not attempt to "
+ "rewrite the state machine use.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
+ Remark);
+ }
+ continue;
+ }
+
+ // We now know F is a parallel body function called only from the kernel K.
+ // We also identified the state machine uses in which we replace the
+ // function pointer by a new global symbol for identification purposes. This
+ // ensures only direct calls to the function are left.
+
+ {
+ auto RemarkParalleRegion = [&](OptimizationRemark OR) {
+ return OR << "Specialize parallel region that is only reached from a "
+ "single target region to avoid spurious call edges and "
+ "excessive register usage in other target regions. "
+ "(parallel region ID: "
+ << ore::NV("OpenMPParallelRegion", F->getName())
+ << ", kernel ID: "
+ << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
+ RemarkParalleRegion);
+ auto RemarkKernel = [&](OptimizationRemark OR) {
+ return OR << "Target region containing the parallel region that is "
+ "specialized. (parallel region ID: "
+ << ore::NV("OpenMPParallelRegion", F->getName())
+ << ", kernel ID: "
+ << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+ };
+ emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
+ }
+
+ Module &M = *F->getParent();
+ Type *Int8Ty = Type::getInt8Ty(M.getContext());
+
+ auto *ID = new GlobalVariable(
+ M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
+ UndefValue::get(Int8Ty), F->getName() + ".ID");
+
+ for (Use *U : ToBeReplacedStateMachineUses)
+ U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
+
+ ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
+
+ Changed = true;
+ }
+
+ return Changed;
+}
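
The effect of the rewrite is that the state machine's pointer comparisons test a dedicated one-byte token instead of the parallel region's real address, so the only remaining uses of the function are direct calls. A self-contained model of that address-as-identifier swap (ordinary host C++ rather than the generated GPU code):

#include <iostream>

// The state machine only needs the parallel region's address as an identity
// token, so a dedicated one-byte global can stand in for it; the function
// itself is then only reached through direct calls.
static void parallel_body() { std::cout << "parallel body runs\n"; }

// Stands in for the private global <function name>.ID created by the pass.
static char parallel_body_ID;

static void state_machine(const void *WorkID) {
  if (WorkID == &parallel_body_ID) // identity check against the token ...
    parallel_body();               // ... followed by a direct call only
}

int main() {
  state_machine(&parallel_body_ID);
  return 0;
}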
+
+/// Abstract Attribute for tracking ICV values.
+struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
void initialize(Attributor &A) override {
Function *F = getAnchorScope();
if (!F || !A.isFunctionIPOAmendable(*F))
indicatePessimisticFixpoint();
}
- /// Returns true if value is assumed to be tracked.
- bool isAssumedTracked() const { return getAssumed(); }
-
- /// Returns true if value is known to be tracked.
- bool isKnownTracked() const { return getAssumed(); }
-
- /// Create an abstract attribute view for the position \p IRP.
- static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
-
- /// Return the value with which \p I can be replaced for specific \p ICV.
+ /// Returns true if value is assumed to be tracked.
+ bool isAssumedTracked() const { return getAssumed(); }
+
+ /// Returns true if value is known to be tracked.
+ bool isKnownTracked() const { return getAssumed(); }
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
+
+ /// Return the value with which \p I can be replaced for specific \p ICV.
virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
const Instruction *I,
Attributor &A) const {
return None;
}
-
+
/// Return an assumed unique ICV value if a single candidate is found. If
/// there cannot be one, return a nullptr. If it is not clear yet, return the
/// Optional::NoneType.
@@ -1841,64 +1841,64 @@ struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
// this array will only grow with time.
InternalControlVar TrackableICVs[1] = {ICV_nthreads};
- /// See AbstractAttribute::getName()
- const std::string getName() const override { return "AAICVTracker"; }
-
- /// See AbstractAttribute::getIdAddr()
- const char *getIdAddr() const override { return &ID; }
-
- /// This function should return true if the type of the \p AA is AAICVTracker
- static bool classof(const AbstractAttribute *AA) {
- return (AA->getIdAddr() == &ID);
- }
-
- static const char ID;
-};
-
-struct AAICVTrackerFunction : public AAICVTracker {
- AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
- : AAICVTracker(IRP, A) {}
-
- // FIXME: come up with better string.
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAICVTracker"; }
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is AAICVTracker
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ static const char ID;
+};
+
+struct AAICVTrackerFunction : public AAICVTracker {
+ AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
+ : AAICVTracker(IRP, A) {}
+
+ // FIXME: come up with better string.
const std::string getAsStr() const override { return "ICVTrackerFunction"; }
-
- // FIXME: come up with some stats.
- void trackStatistics() const override {}
-
+
+ // FIXME: come up with some stats.
+ void trackStatistics() const override {}
+
/// We don't manifest anything for this AA.
- ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus manifest(Attributor &A) override {
return ChangeStatus::UNCHANGED;
- }
-
- // Map of ICV to their values at specific program point.
+ }
+
+ // Map of ICV to their values at specific program point.
EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
- InternalControlVar::ICV___last>
+ InternalControlVar::ICV___last>
ICVReplacementValuesMap;
-
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
-
- Function *F = getAnchorScope();
-
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
-
- for (InternalControlVar ICV : TrackableICVs) {
- auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
-
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+
+ Function *F = getAnchorScope();
+
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+ for (InternalControlVar ICV : TrackableICVs) {
+ auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
+
auto &ValuesMap = ICVReplacementValuesMap[ICV];
- auto TrackValues = [&](Use &U, Function &) {
- CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
- if (!CI)
- return false;
-
- // FIXME: handle setters with more than one argument.
- /// Track new value.
+ auto TrackValues = [&](Use &U, Function &) {
+ CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
+ if (!CI)
+ return false;
+
+ // FIXME: handle setters with more than one argument.
+ /// Track new value.
if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
- HasChanged = ChangeStatus::CHANGED;
-
- return false;
- };
-
+ HasChanged = ChangeStatus::CHANGED;
+
+ return false;
+ };
+
auto CallCheck = [&](Instruction &I) {
Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
if (ReplVal.hasValue() &&
@@ -1909,7 +1909,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
};
// Track all changes of an ICV.
- SetterRFI.foreachUse(TrackValues, F);
+ SetterRFI.foreachUse(TrackValues, F);
A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
/* CheckBBLivenessOnly */ true);
@@ -1919,26 +1919,26 @@ struct AAICVTrackerFunction : public AAICVTracker {
Instruction *Entry = &F->getEntryBlock().front();
if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
ValuesMap.insert(std::make_pair(Entry, nullptr));
- }
-
- return HasChanged;
- }
-
+ }
+
+ return HasChanged;
+ }
+
/// Helper to check if \p I is a call and get the value for it if it is
/// unique.
Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
InternalControlVar &ICV) const {
-
+
const auto *CB = dyn_cast<CallBase>(I);
if (!CB || CB->hasFnAttr("no_openmp") ||
CB->hasFnAttr("no_openmp_routines"))
return None;
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
Function *CalledFunction = CB->getCalledFunction();
-
+
// Indirect call, assume ICV changes.
if (CalledFunction == nullptr)
return nullptr;
@@ -1947,7 +1947,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
if (CalledFunction == SetterRFI.Declaration) {
if (ICVReplacementValuesMap[ICV].count(I))
return ICVReplacementValuesMap[ICV].lookup(I);
-
+
return nullptr;
}
@@ -2006,11 +2006,11 @@ struct AAICVTrackerFunction : public AAICVTracker {
// If we found a new value, we can't know the icv value anymore.
if (NewReplVal.hasValue())
if (ReplVal != NewReplVal)
- return nullptr;
-
+ return nullptr;
+
break;
- }
-
+ }
+
Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
if (!NewReplVal.hasValue())
continue;
@@ -2025,7 +2025,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
// We found a new value, we can't know the icv value anymore.
if (ReplVal != NewReplVal)
return nullptr;
- }
+ }
// If we are in the same BB and we have a value, we are done.
if (CurrBB == I->getParent() && ReplVal.hasValue())
@@ -2035,11 +2035,11 @@ struct AAICVTrackerFunction : public AAICVTracker {
for (const BasicBlock *Pred : predecessors(CurrBB))
if (const Instruction *Terminator = Pred->getTerminator())
Worklist.push_back(Terminator);
- }
-
+ }
+
return ReplVal;
- }
-};
+ }
+};
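
AAICVTrackerFunction records, per ICV, the value written at each setter call and later answers getter queries from that record. The sketch below models only the straight-line case for the nthreads ICV, with call names as strings; the real pass additionally walks predecessor blocks and gives up on calls it cannot see through:

#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Straight-line trace of runtime calls: setters record the nthreads ICV,
  // the getter is answered from the recorded value instead of the runtime.
  std::vector<std::pair<std::string, int>> Trace = {
      {"omp_set_num_threads", 4},
      {"omp_set_num_threads", 8},
      {"omp_get_max_threads", 0}}; // query; the int is unused here

  std::optional<int> Known; // unknown until the first tracked setter
  for (const auto &[Call, Arg] : Trace) {
    if (Call == "omp_set_num_threads")
      Known = Arg; // track the most recent ICV value
    else if (Call == "omp_get_max_threads")
      std::cout << (Known ? "folded to " + std::to_string(*Known)
                          : std::string("unknown, keep the call"))
                << "\n";
  }
  return 0;
}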
struct AAICVTrackerFunctionReturned : AAICVTracker {
AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
@@ -2231,52 +2231,52 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker {
return Changed;
}
};
-} // namespace
-
-const char AAICVTracker::ID = 0;
-
-AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
- Attributor &A) {
- AAICVTracker *AA = nullptr;
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_ARGUMENT:
+} // namespace
+
+const char AAICVTracker::ID = 0;
+
+AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ AAICVTracker *AA = nullptr;
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ case IRPosition::IRP_ARGUMENT:
case IRPosition::IRP_CALL_SITE_ARGUMENT:
llvm_unreachable("ICVTracker can only be created for function position!");
- case IRPosition::IRP_RETURNED:
+ case IRPosition::IRP_RETURNED:
AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
break;
- case IRPosition::IRP_CALL_SITE_RETURNED:
+ case IRPosition::IRP_CALL_SITE_RETURNED:
AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
break;
- case IRPosition::IRP_CALL_SITE:
+ case IRPosition::IRP_CALL_SITE:
AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
break;
- case IRPosition::IRP_FUNCTION:
- AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
- break;
- }
-
- return *AA;
-}
-
-PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG, CGSCCUpdateResult &UR) {
- if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
- return PreservedAnalyses::all();
-
- if (DisableOpenMPOptimizations)
- return PreservedAnalyses::all();
-
- SmallVector<Function *, 16> SCC;
+ case IRPosition::IRP_FUNCTION:
+ AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
+ break;
+ }
+
+ return *AA;
+}
+
+PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG, CGSCCUpdateResult &UR) {
+ if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
+ return PreservedAnalyses::all();
+
+ if (DisableOpenMPOptimizations)
+ return PreservedAnalyses::all();
+
+ SmallVector<Function *, 16> SCC;
// If there are kernels in the module, we have to run on all SCC's.
bool SCCIsInteresting = !OMPInModule.getKernels().empty();
for (LazyCallGraph::Node &N : C) {
Function *Fn = &N.getFunction();
SCC.push_back(Fn);
-
+
// Do we already know that the SCC contains kernels,
// or that OpenMP functions are called from this SCC?
if (SCCIsInteresting)
@@ -2286,63 +2286,63 @@ PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
}
if (!SCCIsInteresting || SCC.empty())
- return PreservedAnalyses::all();
-
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
-
- AnalysisGetter AG(FAM);
-
- auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
- };
-
- CallGraphUpdater CGUpdater;
- CGUpdater.initialize(CG, C, AM, UR);
-
- SetVector<Function *> Functions(SCC.begin(), SCC.end());
- BumpPtrAllocator Allocator;
- OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
- /*CGSCC*/ Functions, OMPInModule.getKernels());
-
- Attributor A(Functions, InfoCache, CGUpdater);
-
- OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- bool Changed = OMPOpt.run();
- if (Changed)
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-struct OpenMPOptLegacyPass : public CallGraphSCCPass {
- CallGraphUpdater CGUpdater;
- OpenMPInModule OMPInModule;
- static char ID;
-
- OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
- initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-
- bool doInitialization(CallGraph &CG) override {
- // Disable the pass if there is no OpenMP (runtime call) in the module.
- containsOpenMP(CG.getModule(), OMPInModule);
- return false;
- }
-
- bool runOnSCC(CallGraphSCC &CGSCC) override {
- if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
- return false;
- if (DisableOpenMPOptimizations || skipSCC(CGSCC))
- return false;
-
- SmallVector<Function *, 16> SCC;
+ return PreservedAnalyses::all();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+ AnalysisGetter AG(FAM);
+
+ auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
+ return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+ };
+
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, C, AM, UR);
+
+ SetVector<Function *> Functions(SCC.begin(), SCC.end());
+ BumpPtrAllocator Allocator;
+ OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
+ /*CGSCC*/ Functions, OMPInModule.getKernels());
+
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
+ bool Changed = OMPOpt.run();
+ if (Changed)
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+struct OpenMPOptLegacyPass : public CallGraphSCCPass {
+ CallGraphUpdater CGUpdater;
+ OpenMPInModule OMPInModule;
+ static char ID;
+
+ OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
+ initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(CallGraph &CG) override {
+ // Disable the pass if there is no OpenMP (runtime call) in the module.
+ containsOpenMP(CG.getModule(), OMPInModule);
+ return false;
+ }
+
+ bool runOnSCC(CallGraphSCC &CGSCC) override {
+ if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
+ return false;
+ if (DisableOpenMPOptimizations || skipSCC(CGSCC))
+ return false;
+
+ SmallVector<Function *, 16> SCC;
// If there are kernels in the module, we have to run on all SCC's.
bool SCCIsInteresting = !OMPInModule.getKernels().empty();
for (CallGraphNode *CGN : CGSCC) {
@@ -2350,7 +2350,7 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass {
if (!Fn || Fn->isDeclaration())
continue;
SCC.push_back(Fn);
-
+
// Do we already know that the SCC contains kernels,
// or that OpenMP functions are called from this SCC?
if (SCCIsInteresting)
@@ -2360,100 +2360,100 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass {
}
if (!SCCIsInteresting || SCC.empty())
- return false;
-
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- CGUpdater.initialize(CG, CGSCC);
-
- // Maintain a map of functions to avoid rebuilding the ORE
- DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
- auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
- std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
- if (!ORE)
- ORE = std::make_unique<OptimizationRemarkEmitter>(F);
- return *ORE;
- };
-
- AnalysisGetter AG;
- SetVector<Function *> Functions(SCC.begin(), SCC.end());
- BumpPtrAllocator Allocator;
- OMPInformationCache InfoCache(
- *(Functions.back()->getParent()), AG, Allocator,
- /*CGSCC*/ Functions, OMPInModule.getKernels());
-
- Attributor A(Functions, InfoCache, CGUpdater);
-
- OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- return OMPOpt.run();
- }
-
- bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
-};
-
-} // end anonymous namespace
-
-void OpenMPInModule::identifyKernels(Module &M) {
-
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- if (!MD)
- return;
-
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() < 2)
- continue;
- MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
- if (!KindID || KindID->getString() != "kernel")
- continue;
-
- Function *KernelFn =
- mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
- if (!KernelFn)
- continue;
-
- ++NumOpenMPTargetRegionKernels;
-
- Kernels.insert(KernelFn);
- }
-}
-
-bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
- if (OMPInModule.isKnown())
- return OMPInModule;
-
+ return false;
+
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ CGUpdater.initialize(CG, CGSCC);
+
+ // Maintain a map of functions to avoid rebuilding the ORE
+ DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
+ auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
+ std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
+ if (!ORE)
+ ORE = std::make_unique<OptimizationRemarkEmitter>(F);
+ return *ORE;
+ };
+
+ AnalysisGetter AG;
+ SetVector<Function *> Functions(SCC.begin(), SCC.end());
+ BumpPtrAllocator Allocator;
+ OMPInformationCache InfoCache(
+ *(Functions.back()->getParent()), AG, Allocator,
+ /*CGSCC*/ Functions, OMPInModule.getKernels());
+
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
+ return OMPOpt.run();
+ }
+
+ bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
+};
+
+} // end anonymous namespace
+
+void OpenMPInModule::identifyKernels(Module &M) {
+
+ NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+ if (!MD)
+ return;
+
+ for (auto *Op : MD->operands()) {
+ if (Op->getNumOperands() < 2)
+ continue;
+ MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+ if (!KindID || KindID->getString() != "kernel")
+ continue;
+
+ Function *KernelFn =
+ mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
+ if (!KernelFn)
+ continue;
+
+ ++NumOpenMPTargetRegionKernels;
+
+ Kernels.insert(KernelFn);
+ }
+}
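
identifyKernels walks the nvvm.annotations metadata and records every function whose annotation kind is the string "kernel". A toy version of that scan over a hand-written annotation list (the struct stands in for a metadata operand):

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  // Each entry stands in for one nvvm.annotations operand: the annotated
  // function and the annotation kind string.
  struct Annotation {
    std::string Fn, Kind;
  };
  std::vector<Annotation> Annotations = {
      {"__omp_offloading_main_l12", "kernel"},
      {"helper_fn", "maxntidx"}}; // unrelated annotation, ignored

  std::set<std::string> Kernels;
  for (const Annotation &A : Annotations)
    if (A.Kind == "kernel")
      Kernels.insert(A.Fn);

  for (const std::string &K : Kernels)
    std::cout << K << " is a kernel entry point\n";
  return 0;
}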
+
+bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
+ if (OMPInModule.isKnown())
+ return OMPInModule;
+
auto RecordFunctionsContainingUsesOf = [&](Function *F) {
for (User *U : F->users())
if (auto *I = dyn_cast<Instruction>(U))
OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
};
- // MSVC doesn't like long if-else chains for some reason and instead just
- // issues an error. Work around it..
- do {
-#define OMP_RTL(_Enum, _Name, ...) \
+ // MSVC doesn't like long if-else chains for some reason and instead just
+ // issues an error. Work around it..
+ do {
+#define OMP_RTL(_Enum, _Name, ...) \
if (Function *F = M.getFunction(_Name)) { \
RecordFunctionsContainingUsesOf(F); \
- OMPInModule = true; \
- }
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
- } while (false);
-
- // Identify kernels once. TODO: We should split the OMPInformationCache into a
- // module and an SCC part. The kernel information, among other things, could
- // go into the module part.
- if (OMPInModule.isKnown() && OMPInModule) {
- OMPInModule.identifyKernels(M);
- return true;
- }
-
- return OMPInModule = false;
-}
-
-char OpenMPOptLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
- "OpenMP specific optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
- "OpenMP specific optimizations", false, false)
-
-Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
+ OMPInModule = true; \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ } while (false);
+
+ // Identify kernels once. TODO: We should split the OMPInformationCache into a
+ // module and an SCC part. The kernel information, among other things, could
+ // go into the module part.
+ if (OMPInModule.isKnown() && OMPInModule) {
+ OMPInModule.identifyKernels(M);
+ return true;
+ }
+
+ return OMPInModule = false;
+}
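
The OMP_RTL expansion above is an X-macro: OMPKinds.def lists every runtime entry point once, and including it with a locally defined macro turns that table into a chain of module lookups (the do/while merely placates MSVC). A generic, self-contained illustration of the same technique with a made-up three-entry table:

#include <iostream>
#include <set>
#include <string>

// The table is written once; expanding it with different macro definitions
// generates different code. Here the table is inlined instead of living in a
// separate .def file, purely for illustration.
#define DEMO_RTL_TABLE(X)                                                      \
  X(__kmpc_fork_call)                                                          \
  X(omp_get_num_threads)                                                       \
  X(omp_set_num_threads)

static bool moduleContainsOpenMP(const std::set<std::string> &DeclaredFns) {
  bool Found = false;
#define DEMO_RTL(Name)                                                         \
  if (DeclaredFns.count(#Name))                                                \
    Found = true;
  DEMO_RTL_TABLE(DEMO_RTL)
#undef DEMO_RTL
  return Found;
}

int main() {
  std::cout << moduleContainsOpenMP({"printf", "omp_get_num_threads"}) // 1
            << moduleContainsOpenMP({"printf"})                        // 0
            << "\n";
  return 0;
}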
+
+char OpenMPOptLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
+ "OpenMP specific optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
+ "OpenMP specific optimizations", false, false)
+
+Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp
index e0a77e26b2..2bbf4bf110 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp
@@ -1,880 +1,880 @@
-//===- PartialInlining.cpp - Inline parts of functions --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs partial inlining, typically by inlining an if statement
-// that surrounds the body of the function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/PartialInlining.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/User.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <iterator>
-#include <memory>
-#include <tuple>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "partial-inlining"
-
-STATISTIC(NumPartialInlined,
- "Number of callsites functions partially inlined into.");
-STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
- "cold outlined regions were partially "
- "inlined into its caller(s).");
-STATISTIC(NumColdRegionsFound,
- "Number of cold single entry/exit regions found.");
-STATISTIC(NumColdRegionsOutlined,
- "Number of cold single entry/exit regions outlined.");
-
-// Command line option to disable partial-inlining. The default is false:
-static cl::opt<bool>
- DisablePartialInlining("disable-partial-inlining", cl::init(false),
- cl::Hidden, cl::desc("Disable partial inlining"));
-// Command line option to disable multi-region partial-inlining. The default is
-// false:
-static cl::opt<bool> DisableMultiRegionPartialInline(
- "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
- cl::desc("Disable multi-region partial inlining"));
-
-// Command line option to force outlining in regions with live exit variables.
-// The default is false:
-static cl::opt<bool>
- ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
- cl::desc("Force outline regions with live exits"));
-
-// Command line option to enable marking outline functions with Cold Calling
-// Convention. The default is false:
-static cl::opt<bool>
- MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
- cl::desc("Mark outline function calls with ColdCC"));
-
-// This is an option used by testing:
-static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
- cl::init(false), cl::ZeroOrMore,
- cl::ReallyHidden,
- cl::desc("Skip Cost Analysis"));
-// Used to determine if a cold region is worth outlining based on
-// its inlining cost compared to the original function. Default is set at 10%.
-// ie. if the cold region reduces the inlining cost of the original function by
-// at least 10%.
-static cl::opt<float> MinRegionSizeRatio(
- "min-region-size-ratio", cl::init(0.1), cl::Hidden,
- cl::desc("Minimum ratio comparing relative sizes of each "
- "outline candidate and original function"));
-// Used to tune the minimum number of execution counts needed in the predecessor
-// block to the cold edge. ie. confidence interval.
-static cl::opt<unsigned>
- MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
- cl::desc("Minimum block executions to consider "
- "its BranchProbabilityInfo valid"));
-// Used to determine when an edge is considered cold. Default is set to 10%. ie.
-// if the branch probability is 10% or less, then it is deemed as 'cold'.
-static cl::opt<float> ColdBranchRatio(
- "cold-branch-ratio", cl::init(0.1), cl::Hidden,
- cl::desc("Minimum BranchProbability to consider a region cold."));
-
-static cl::opt<unsigned> MaxNumInlineBlocks(
- "max-num-inline-blocks", cl::init(5), cl::Hidden,
- cl::desc("Max number of blocks to be partially inlined"));
-
-// Command line option to set the maximum number of partial inlining allowed
-// for the module. The default value of -1 means no limit.
-static cl::opt<int> MaxNumPartialInlining(
- "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of partial inlining. The default is unlimited"));
-
-// Used only when PGO or user annotated branch data is absent. It is
-// the least value that is used to weigh the outline region. If BFI
-// produces larger value, the BFI value will be used.
-static cl::opt<int>
- OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("Relative frequency of outline region to "
- "the entry block"));
-
-static cl::opt<unsigned> ExtraOutliningPenalty(
- "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
- cl::desc("A debug option to add additional penalty to the computed one."));
-
-namespace {
-
-struct FunctionOutliningInfo {
- FunctionOutliningInfo() = default;
-
- // Returns the number of blocks to be inlined including all blocks
- // in Entries and one return block.
+//===- PartialInlining.cpp - Inline parts of functions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs partial inlining, typically by inlining an if statement
+// that surrounds the body of the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/PartialInlining.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partial-inlining"
+
+STATISTIC(NumPartialInlined,
+ "Number of callsites functions partially inlined into.");
+STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
+ "cold outlined regions were partially "
+ "inlined into its caller(s).");
+STATISTIC(NumColdRegionsFound,
+ "Number of cold single entry/exit regions found.");
+STATISTIC(NumColdRegionsOutlined,
+ "Number of cold single entry/exit regions outlined.");
+
+// Command line option to disable partial-inlining. The default is false:
+static cl::opt<bool>
+ DisablePartialInlining("disable-partial-inlining", cl::init(false),
+ cl::Hidden, cl::desc("Disable partial inlining"));
+// Command line option to disable multi-region partial-inlining. The default is
+// false:
+static cl::opt<bool> DisableMultiRegionPartialInline(
+ "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
+ cl::desc("Disable multi-region partial inlining"));
+
+// Command line option to force outlining in regions with live exit variables.
+// The default is false:
+static cl::opt<bool>
+ ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
+ cl::desc("Force outline regions with live exits"));
+
+// Command line option to enable marking outline functions with Cold Calling
+// Convention. The default is false:
+static cl::opt<bool>
+ MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
+ cl::desc("Mark outline function calls with ColdCC"));
+
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
+// Used to determine if a cold region is worth outlining based on
+// its inlining cost compared to the original function. Default is set at 10%.
+// ie. if the cold region reduces the inlining cost of the original function by
+// at least 10%.
+static cl::opt<float> MinRegionSizeRatio(
+ "min-region-size-ratio", cl::init(0.1), cl::Hidden,
+ cl::desc("Minimum ratio comparing relative sizes of each "
+ "outline candidate and original function"));
+// Used to tune the minimum number of execution counts needed in the predecessor
+// block to the cold edge. ie. confidence interval.
+static cl::opt<unsigned>
+ MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
+ cl::desc("Minimum block executions to consider "
+ "its BranchProbabilityInfo valid"));
+// Used to determine when an edge is considered cold. Default is set to 10%. ie.
+// if the branch probability is 10% or less, then it is deemed as 'cold'.
+static cl::opt<float> ColdBranchRatio(
+ "cold-branch-ratio", cl::init(0.1), cl::Hidden,
+ cl::desc("Minimum BranchProbability to consider a region cold."));
+
+static cl::opt<unsigned> MaxNumInlineBlocks(
+ "max-num-inline-blocks", cl::init(5), cl::Hidden,
+ cl::desc("Max number of blocks to be partially inlined"));
+
+// Command line option to set the maximum number of partial inlining allowed
+// for the module. The default value of -1 means no limit.
+static cl::opt<int> MaxNumPartialInlining(
+ "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of partial inlining. The default is unlimited"));
+
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
+static cl::opt<unsigned> ExtraOutliningPenalty(
+ "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
+ cl::desc("A debug option to add additional penalty to the computed one."));
+
+namespace {
+
+struct FunctionOutliningInfo {
+ FunctionOutliningInfo() = default;
+
+ // Returns the number of blocks to be inlined including all blocks
+ // in Entries and one return block.
unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
-
- // A set of blocks including the function entry that guard
- // the region to be outlined.
- SmallVector<BasicBlock *, 4> Entries;
-
- // The return block that is not included in the outlined region.
- BasicBlock *ReturnBlock = nullptr;
-
- // The dominating block of the region to be outlined.
- BasicBlock *NonReturnBlock = nullptr;
-
- // The set of blocks in Entries that are predecessors to ReturnBlock
- SmallVector<BasicBlock *, 4> ReturnBlockPreds;
-};
-
-struct FunctionOutliningMultiRegionInfo {
- FunctionOutliningMultiRegionInfo()
- : ORI() {}
-
- // Container for outline regions
- struct OutlineRegionInfo {
- OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
- BasicBlock *EntryBlock, BasicBlock *ExitBlock,
- BasicBlock *ReturnBlock)
- : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
- ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
- SmallVector<BasicBlock *, 8> Region;
- BasicBlock *EntryBlock;
- BasicBlock *ExitBlock;
- BasicBlock *ReturnBlock;
- };
-
- SmallVector<OutlineRegionInfo, 4> ORI;
-};
-
-struct PartialInlinerImpl {
-
- PartialInlinerImpl(
- function_ref<AssumptionCache &(Function &)> GetAC,
- function_ref<AssumptionCache *(Function &)> LookupAC,
- function_ref<TargetTransformInfo &(Function &)> GTTI,
- function_ref<const TargetLibraryInfo &(Function &)> GTLI,
- ProfileSummaryInfo &ProfSI,
- function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
- : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
- GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
-
- bool run(Module &M);
- // Main part of the transformation that calls helper functions to find
- // outlining candidates, clone & outline the function, and attempt to
- // partially inline the resulting function. Returns true if
- // inlining was successful, false otherwise. Also returns the outline
- // function (only if we partially inlined early returns) as there is a
- // possibility to further "peel" early return statements that were left in the
- // outline function due to code size.
+
+ // A set of blocks including the function entry that guard
+ // the region to be outlined.
+ SmallVector<BasicBlock *, 4> Entries;
+
+ // The return block that is not included in the outlined region.
+ BasicBlock *ReturnBlock = nullptr;
+
+ // The dominating block of the region to be outlined.
+ BasicBlock *NonReturnBlock = nullptr;
+
+ // The set of blocks in Entries that are predecessors to ReturnBlock
+ SmallVector<BasicBlock *, 4> ReturnBlockPreds;
+};
+
+struct FunctionOutliningMultiRegionInfo {
+ FunctionOutliningMultiRegionInfo()
+ : ORI() {}
+
+ // Container for outline regions
+ struct OutlineRegionInfo {
+ OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
+ BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+ BasicBlock *ReturnBlock)
+ : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
+ ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
+ SmallVector<BasicBlock *, 8> Region;
+ BasicBlock *EntryBlock;
+ BasicBlock *ExitBlock;
+ BasicBlock *ReturnBlock;
+ };
+
+ SmallVector<OutlineRegionInfo, 4> ORI;
+};
+
+struct PartialInlinerImpl {
+
+ PartialInlinerImpl(
+ function_ref<AssumptionCache &(Function &)> GetAC,
+ function_ref<AssumptionCache *(Function &)> LookupAC,
+ function_ref<TargetTransformInfo &(Function &)> GTTI,
+ function_ref<const TargetLibraryInfo &(Function &)> GTLI,
+ ProfileSummaryInfo &ProfSI,
+ function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
+ : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
+ GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
+
+ bool run(Module &M);
+ // Main part of the transformation that calls helper functions to find
+ // outlining candidates, clone & outline the function, and attempt to
+ // partially inline the resulting function. Returns true if
+ // inlining was successful, false otherwise. Also returns the outline
+ // function (only if we partially inlined early returns) as there is a
+ // possibility to further "peel" early return statements that were left in the
+ // outline function due to code size.
std::pair<bool, Function *> unswitchFunction(Function &F);
-
- // This class speculatively clones the function to be partial inlined.
- // At the end of partial inlining, the remaining callsites to the cloned
- // function that are not partially inlined will be fixed up to reference
- // the original function, and the cloned function will be erased.
- struct FunctionCloner {
- // Two constructors, one for single region outlining, the other for
- // multi-region outlining.
- FunctionCloner(Function *F, FunctionOutliningInfo *OI,
- OptimizationRemarkEmitter &ORE,
+
+ // This class speculatively clones the function to be partial inlined.
+ // At the end of partial inlining, the remaining callsites to the cloned
+ // function that are not partially inlined will be fixed up to reference
+ // the original function, and the cloned function will be erased.
+ struct FunctionCloner {
+ // Two constructors, one for single region outlining, the other for
+ // multi-region outlining.
+ FunctionCloner(Function *F, FunctionOutliningInfo *OI,
+ OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI);
- FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
- OptimizationRemarkEmitter &ORE,
+ FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
+ OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI);
- ~FunctionCloner();
-
- // Prepare for function outlining: making sure there is only
- // one incoming edge from the extracted/outlined region to
- // the return block.
+ ~FunctionCloner();
+
+ // Prepare for function outlining: making sure there is only
+ // one incoming edge from the extracted/outlined region to
+ // the return block.
void normalizeReturnBlock() const;
-
- // Do function outlining for cold regions.
- bool doMultiRegionFunctionOutlining();
- // Do function outlining for region after early return block(s).
- // NOTE: For vararg functions that do the vararg handling in the outlined
- // function, we temporarily generate IR that does not properly
- // forward varargs to the outlined function. Calling InlineFunction
- // will update calls to the outlined functions to properly forward
- // the varargs.
- Function *doSingleRegionFunctionOutlining();
-
- Function *OrigFunc = nullptr;
- Function *ClonedFunc = nullptr;
-
- typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
- // Keep track of Outlined Functions and the basic block they're called from.
- SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
-
- // ClonedFunc is inlined in one of its callers after function
- // outlining.
- bool IsFunctionInlined = false;
- // The cost of the region to be outlined.
- int OutlinedRegionCost = 0;
- // ClonedOI is specific to outlining non-early return blocks.
- std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
- // ClonedOMRI is specific to outlining cold regions.
- std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
- std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
- OptimizationRemarkEmitter &ORE;
- function_ref<AssumptionCache *(Function &)> LookupAC;
+
+ // Do function outlining for cold regions.
+ bool doMultiRegionFunctionOutlining();
+ // Do function outlining for region after early return block(s).
+ // NOTE: For vararg functions that do the vararg handling in the outlined
+ // function, we temporarily generate IR that does not properly
+ // forward varargs to the outlined function. Calling InlineFunction
+ // will update calls to the outlined functions to properly forward
+ // the varargs.
+ Function *doSingleRegionFunctionOutlining();
+
+ Function *OrigFunc = nullptr;
+ Function *ClonedFunc = nullptr;
+
+ typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
+ // Keep track of Outlined Functions and the basic block they're called from.
+ SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
+
+ // ClonedFunc is inlined in one of its callers after function
+ // outlining.
+ bool IsFunctionInlined = false;
+ // The cost of the region to be outlined.
+ int OutlinedRegionCost = 0;
+ // ClonedOI is specific to outlining non-early return blocks.
+ std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+ // ClonedOMRI is specific to outlining cold regions.
+ std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+ OptimizationRemarkEmitter &ORE;
+ function_ref<AssumptionCache *(Function &)> LookupAC;
function_ref<TargetTransformInfo &(Function &)> GetTTI;
- };
-
-private:
- int NumPartialInlining = 0;
- function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
- function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
- function_ref<TargetTransformInfo &(Function &)> GetTTI;
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
- ProfileSummaryInfo &PSI;
-
- // Return the frequency of the OutliningCallBB relative to F's entry point.
- // The result is no larger than 1 and is represented using BP.
- // (Note that the outlined region's 'head' block can only have incoming
- // edges from the guarding entry blocks).
+ };
+
+private:
+ int NumPartialInlining = 0;
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
+ function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
+ function_ref<TargetTransformInfo &(Function &)> GetTTI;
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
+ ProfileSummaryInfo &PSI;
+
+ // Return the frequency of the OutliningCallBB relative to F's entry point.
+ // The result is no larger than 1 and is represented using BP.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
-
- // Return true if the callee of CB should be partially inlined with
- // profit.
- bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost,
+
+ // Return true if the callee of CB should be partially inlined with
+ // profit.
+ bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
+ BlockFrequency WeightedOutliningRcost,
OptimizationRemarkEmitter &ORE) const;
-
- // Try to inline DuplicateFunction (cloned from F with a call to
- // the OutlinedFunction) into its callers. Return true
- // if there is any successful inlining.
- bool tryPartialInline(FunctionCloner &Cloner);
-
- // Compute the mapping from each use site of DuplicateFunction to the enclosing
- // BB's profile count.
+
+ // Try to inline DuplicateFunction (cloned from F with a call to
+ // the OutlinedFunction) into its callers. Return true
+ // if there is any successful inlining.
+ bool tryPartialInline(FunctionCloner &Cloner);
+
+ // Compute the mapping from each use site of DuplicateFunction to the enclosing
+ // BB's profile count.
void
computeCallsiteToProfCountMap(Function *DuplicateFunction,
DenseMap<User *, uint64_t> &SiteCountMap) const;
-
+
bool isLimitReached() const {
- return (MaxNumPartialInlining != -1 &&
- NumPartialInlining >= MaxNumPartialInlining);
- }
-
- static CallBase *getSupportedCallBase(User *U) {
- if (isa<CallInst>(U) || isa<InvokeInst>(U))
- return cast<CallBase>(U);
- llvm_unreachable("All uses must be calls");
- return nullptr;
- }
-
+ return (MaxNumPartialInlining != -1 &&
+ NumPartialInlining >= MaxNumPartialInlining);
+ }
+
+ static CallBase *getSupportedCallBase(User *U) {
+ if (isa<CallInst>(U) || isa<InvokeInst>(U))
+ return cast<CallBase>(U);
+ llvm_unreachable("All uses must be calls");
+ return nullptr;
+ }
+
static CallBase *getOneCallSiteTo(Function &F) {
User *User = *F.user_begin();
- return getSupportedCallBase(User);
- }
-
+ return getSupportedCallBase(User);
+ }
+
std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
- CallBase *CB = getOneCallSiteTo(F);
- DebugLoc DLoc = CB->getDebugLoc();
- BasicBlock *Block = CB->getParent();
- return std::make_tuple(DLoc, Block);
- }
-
- // Returns the costs associated with function outlining:
- // - The first value is the non-weighted runtime cost for making the call
- //   to the outlined function, including the additional setup cost in the
- // outlined function itself;
- // - The second value is the estimated size of the new call sequence in
- // basic block Cloner.OutliningCallBB;
+ CallBase *CB = getOneCallSiteTo(F);
+ DebugLoc DLoc = CB->getDebugLoc();
+ BasicBlock *Block = CB->getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the non-weighted runtime cost for making the call
+ //   to the outlined function, including the additional setup cost in the
+ // outlined function itself;
+ // - The second value is the estimated size of the new call sequence in
+ // basic block Cloner.OutliningCallBB;
std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner) const;
-
- // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
- // approximate both the size and runtime cost (Note that in the current
- // inline cost analysis, there is no clear distinction there either).
+
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+ // approximate both the size and runtime cost (Note that in the current
+ // inline cost analysis, there is no clear distinction there either).
static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI);
-
+
std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function &F) const;
- std::unique_ptr<FunctionOutliningMultiRegionInfo>
+ std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function &F,
OptimizationRemarkEmitter &ORE) const;
-};
-
-struct PartialInlinerLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- PartialInlinerLegacyPass() : ModulePass(ID) {
- initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
- TargetTransformInfoWrapperPass *TTIWP =
- &getAnalysis<TargetTransformInfoWrapperPass>();
- ProfileSummaryInfo &PSI =
- getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
-
- auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
- return ACT->lookupAssumptionCache(F);
- };
-
- auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
- return TTIWP->getTTI(F);
- };
-
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
-
- return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
- GetTLI, PSI)
- .run(M);
- }
-};
-
-} // end anonymous namespace
-
-std::unique_ptr<FunctionOutliningMultiRegionInfo>
+};
+
+struct PartialInlinerLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PartialInlinerLegacyPass() : ModulePass(ID) {
+ initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
+ TargetTransformInfoWrapperPass *TTIWP =
+ &getAnalysis<TargetTransformInfoWrapperPass>();
+ ProfileSummaryInfo &PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+
+ auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
+ return ACT->lookupAssumptionCache(F);
+ };
+
+ auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ };
+
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+
+ return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
+ GetTLI, PSI)
+ .run(M);
+ }
+};
+
+} // end anonymous namespace
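As a point of reference (not part of the diff): the new-pass-manager wrapper, PartialInlinerPass from llvm/Transforms/IPO/PartialInlining.h, forwards to the same PartialInlinerImpl::run. A minimal sketch of driving it directly, assuming a standalone analysis setup:

  #include "llvm/IR/Module.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/IPO/PartialInlining.h"

  // Hypothetical driver: run partial inlining on its own via the new pass manager.
  void runPartialInlinerOn(llvm::Module &M) {
    llvm::PassBuilder PB;
    llvm::LoopAnalysisManager LAM;
    llvm::FunctionAnalysisManager FAM;
    llvm::CGSCCAnalysisManager CGAM;
    llvm::ModuleAnalysisManager MAM;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    llvm::ModulePassManager MPM;
    MPM.addPass(llvm::PartialInlinerPass()); // pulls ProfileSummary, TTI, etc. from the managers
    MPM.run(M, MAM);
  }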
+
+std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(
Function &F, OptimizationRemarkEmitter &ORE) const {
BasicBlock *EntryBlock = &F.front();
-
+
DominatorTree DT(F);
- LoopInfo LI(DT);
+ LoopInfo LI(DT);
BranchProbabilityInfo BPI(F, LI);
- std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
- BlockFrequencyInfo *BFI;
- if (!GetBFI) {
+ std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
+ BlockFrequencyInfo *BFI;
+ if (!GetBFI) {
ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
- BFI = ScopedBFI.get();
- } else
+ BFI = ScopedBFI.get();
+ } else
BFI = &(GetBFI(F));
-
- // Return if we don't have profiling information.
- if (!PSI.hasInstrumentationProfile())
- return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
-
- std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
- std::make_unique<FunctionOutliningMultiRegionInfo>();
-
- auto IsSingleExit =
- [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
- BasicBlock *ExitBlock = nullptr;
- for (auto *Block : BlockList) {
- for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
- if (!is_contained(BlockList, *SI)) {
- if (ExitBlock) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
- &SI->front())
- << "Region dominated by "
- << ore::NV("Block", BlockList.front()->getName())
- << " has more than one region exit edge.";
- });
- return nullptr;
+
+ // Return if we don't have profiling information.
+ if (!PSI.hasInstrumentationProfile())
+ return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
+
+ std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
+ std::make_unique<FunctionOutliningMultiRegionInfo>();
+
+ auto IsSingleExit =
+ [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
+ BasicBlock *ExitBlock = nullptr;
+ for (auto *Block : BlockList) {
+ for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
+ if (!is_contained(BlockList, *SI)) {
+ if (ExitBlock) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
+ &SI->front())
+ << "Region dominated by "
+ << ore::NV("Block", BlockList.front()->getName())
+ << " has more than one region exit edge.";
+ });
+ return nullptr;
}
ExitBlock = Block;
- }
- }
- }
- return ExitBlock;
- };
-
- auto BBProfileCount = [BFI](BasicBlock *BB) {
- return BFI->getBlockProfileCount(BB)
- ? BFI->getBlockProfileCount(BB).getValue()
- : 0;
- };
-
- // Use the same computeBBInlineCost function to compute the cost savings of
- // outlining the candidate region.
+ }
+ }
+ }
+ return ExitBlock;
+ };
+
+ auto BBProfileCount = [BFI](BasicBlock *BB) {
+ return BFI->getBlockProfileCount(BB)
+ ? BFI->getBlockProfileCount(BB).getValue()
+ : 0;
+ };
+
+ // Use the same computeBBInlineCost function to compute the cost savings of
+ // outlining the candidate region.
TargetTransformInfo *FTTI = &GetTTI(F);
- int OverallFunctionCost = 0;
+ int OverallFunctionCost = 0;
for (auto &BB : F)
OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
-
+
LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
<< "\n";);
- int MinOutlineRegionCost =
- static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
- BranchProbability MinBranchProbability(
- static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
- MinBlockCounterExecution);
- bool ColdCandidateFound = false;
- BasicBlock *CurrEntry = EntryBlock;
- std::vector<BasicBlock *> DFS;
- DenseMap<BasicBlock *, bool> VisitedMap;
- DFS.push_back(CurrEntry);
- VisitedMap[CurrEntry] = true;
-
- // Use Depth First Search on the basic blocks to find CFG edges that are
- // considered cold.
- // Cold regions considered must also have their inline cost compared to the
- // overall inline cost of the original function. The region is outlined only
- // if it reduces the inline cost of the function by 'MinOutlineRegionCost' or
- // more.
- while (!DFS.empty()) {
+ int MinOutlineRegionCost =
+ static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
+ BranchProbability MinBranchProbability(
+ static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
+ MinBlockCounterExecution);
+ bool ColdCandidateFound = false;
+ BasicBlock *CurrEntry = EntryBlock;
+ std::vector<BasicBlock *> DFS;
+ DenseMap<BasicBlock *, bool> VisitedMap;
+ DFS.push_back(CurrEntry);
+ VisitedMap[CurrEntry] = true;
+
+ // Use Depth First Search on the basic blocks to find CFG edges that are
+ // considered cold.
+ // Cold regions considered must also have their inline cost compared to the
+ // overall inline cost of the original function. The region is outlined only
+ // if it reduces the inline cost of the function by 'MinOutlineRegionCost' or
+ // more.
+ while (!DFS.empty()) {
auto *ThisBB = DFS.back();
- DFS.pop_back();
- // Only consider regions with predecessor blocks that are considered
- // not-cold (default: part of the top 99.99% of all block counters)
- // AND greater than our minimum block execution count (default: 100).
+ DFS.pop_back();
+ // Only consider regions with predecessor blocks that are considered
+ // not-cold (default: part of the top 99.99% of all block counters)
+ // AND greater than our minimum block execution count (default: 100).
if (PSI.isColdBlock(ThisBB, BFI) ||
BBProfileCount(ThisBB) < MinBlockCounterExecution)
- continue;
+ continue;
for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
- if (VisitedMap[*SI])
- continue;
- VisitedMap[*SI] = true;
- DFS.push_back(*SI);
- // If branch isn't cold, we skip to the next one.
+ if (VisitedMap[*SI])
+ continue;
+ VisitedMap[*SI] = true;
+ DFS.push_back(*SI);
+ // If branch isn't cold, we skip to the next one.
BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
- if (SuccProb > MinBranchProbability)
- continue;
+ if (SuccProb > MinBranchProbability)
+ continue;
LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
<< SI->getName()
<< "\nBranch Probability = " << SuccProb << "\n";);
- SmallVector<BasicBlock *, 8> DominateVector;
- DT.getDescendants(*SI, DominateVector);
+ SmallVector<BasicBlock *, 8> DominateVector;
+ DT.getDescendants(*SI, DominateVector);
assert(!DominateVector.empty() &&
"SI should be reachable and have at least itself as descendant");
- // We can only outline single entry regions (for now).
+ // We can only outline single entry regions (for now).
if (!DominateVector.front()->hasNPredecessors(1)) {
LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
<< " doesn't have a single predecessor in the "
"dominator tree\n";);
- continue;
+ continue;
}
- BasicBlock *ExitBlock = nullptr;
- // We can only outline single exit regions (for now).
+ BasicBlock *ExitBlock = nullptr;
+ // We can only outline single exit regions (for now).
if (!(ExitBlock = IsSingleExit(DominateVector))) {
LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
<< " doesn't have a unique successor\n";);
- continue;
+ continue;
}
- int OutlineRegionCost = 0;
- for (auto *BB : DominateVector)
+ int OutlineRegionCost = 0;
+ for (auto *BB : DominateVector)
OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
-
+
LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
<< "\n";);
-
+
if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
- &SI->front())
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
+ &SI->front())
<< ore::NV("Callee", &F)
<< " inline cost-savings smaller than "
- << ore::NV("Cost", MinOutlineRegionCost);
- });
+ << ore::NV("Cost", MinOutlineRegionCost);
+ });
LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
<< MinOutlineRegionCost << "\n";);
- continue;
- }
-
- // For now, ignore blocks that belong to a SISE region that is a
- // candidate for outlining. In the future, we may want to look
- // at inner regions because the outer region may have live-exit
- // variables.
- for (auto *BB : DominateVector)
- VisitedMap[BB] = true;
-
- // ReturnBlock here means the block after the outline call
- BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
- FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
- DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
- OutliningInfo->ORI.push_back(RegInfo);
+ continue;
+ }
+
+ // For now, ignore blocks that belong to a SISE region that is a
+ // candidate for outlining. In the future, we may want to look
+ // at inner regions because the outer region may have live-exit
+ // variables.
+ for (auto *BB : DominateVector)
+ VisitedMap[BB] = true;
+
+ // ReturnBlock here means the block after the outline call
+ BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
+ FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
+ DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
+ OutliningInfo->ORI.push_back(RegInfo);
LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
<< DominateVector.front()->getName() << "\n";);
- ColdCandidateFound = true;
- NumColdRegionsFound++;
- }
- }
+ ColdCandidateFound = true;
+ NumColdRegionsFound++;
+ }
+ }
- if (ColdCandidateFound)
- return OutliningInfo;
+ if (ColdCandidateFound)
+ return OutliningInfo;
return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
-}
-
-std::unique_ptr<FunctionOutliningInfo>
+}
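To make the default thresholds used above concrete, a small self-contained illustration (the overall cost number is hypothetical; only the arithmetic mirrors the code):

  #include <cstdio>

  int main() {
    // Hypothetical function cost; the cl::opt defaults above are 0.1, 0.1 and 100.
    const int      OverallFunctionCost      = 400; // sum of computeBBInlineCost over F
    const float    MinRegionSizeRatio       = 0.1f;
    const float    ColdBranchRatio          = 0.1f;
    const unsigned MinBlockCounterExecution = 100;

    // A region is only worth outlining if it carries at least 10% of the cost.
    int MinOutlineRegionCost =
        static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); // 40

    // An edge is "cold" if its branch probability is at most 10/100 = 10%,
    // and only predecessor blocks executed at least 100 times are considered.
    int MinBranchProbNumerator =
        static_cast<int>(ColdBranchRatio * MinBlockCounterExecution); // 10

    std::printf("min region cost %d, cold edge <= %d/%u\n",
                MinOutlineRegionCost, MinBranchProbNumerator,
                MinBlockCounterExecution);
    return 0;
  }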
+
+std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
BasicBlock *EntryBlock = &F.front();
- BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
- if (!BR || BR->isUnconditional())
- return std::unique_ptr<FunctionOutliningInfo>();
-
- // Returns true if Succ is BB's successor
- auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
- return is_contained(successors(BB), Succ);
- };
-
- auto IsReturnBlock = [](BasicBlock *BB) {
- Instruction *TI = BB->getTerminator();
- return isa<ReturnInst>(TI);
- };
-
- auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
- if (IsReturnBlock(Succ1))
- return std::make_tuple(Succ1, Succ2);
- if (IsReturnBlock(Succ2))
- return std::make_tuple(Succ2, Succ1);
-
- return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
- };
-
- // Detect a triangular shape:
- auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
- if (IsSuccessor(Succ1, Succ2))
- return std::make_tuple(Succ1, Succ2);
- if (IsSuccessor(Succ2, Succ1))
- return std::make_tuple(Succ2, Succ1);
-
- return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
- };
-
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- std::make_unique<FunctionOutliningInfo>();
-
- BasicBlock *CurrEntry = EntryBlock;
- bool CandidateFound = false;
- do {
- // The number of blocks to be inlined has already reached
- // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
- // disables partial inlining for the function.
+ BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
+ if (!BR || BR->isUnconditional())
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Returns true if Succ is BB's successor
+ auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
+ return is_contained(successors(BB), Succ);
+ };
+
+ auto IsReturnBlock = [](BasicBlock *BB) {
+ Instruction *TI = BB->getTerminator();
+ return isa<ReturnInst>(TI);
+ };
+
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsReturnBlock(Succ1))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsReturnBlock(Succ2))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
+
+ // Detect a triangular shape:
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsSuccessor(Succ1, Succ2))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsSuccessor(Succ2, Succ1))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
+
+ std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
+ std::make_unique<FunctionOutliningInfo>();
+
+ BasicBlock *CurrEntry = EntryBlock;
+ bool CandidateFound = false;
+ do {
+ // The number of blocks to be inlined has already reached
+ // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
+ // disables partial inlining for the function.
if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
- break;
-
- if (succ_size(CurrEntry) != 2)
- break;
-
- BasicBlock *Succ1 = *succ_begin(CurrEntry);
- BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
-
- BasicBlock *ReturnBlock, *NonReturnBlock;
- std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
-
- if (ReturnBlock) {
- OutliningInfo->Entries.push_back(CurrEntry);
- OutliningInfo->ReturnBlock = ReturnBlock;
- OutliningInfo->NonReturnBlock = NonReturnBlock;
- CandidateFound = true;
- break;
- }
-
+ break;
+
+ if (succ_size(CurrEntry) != 2)
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(CurrEntry);
+ BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+
+ if (ReturnBlock) {
+ OutliningInfo->Entries.push_back(CurrEntry);
+ OutliningInfo->ReturnBlock = ReturnBlock;
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ CandidateFound = true;
+ break;
+ }
+
BasicBlock *CommSucc, *OtherSucc;
- std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
-
- if (!CommSucc)
- break;
-
- OutliningInfo->Entries.push_back(CurrEntry);
- CurrEntry = OtherSucc;
- } while (true);
-
- if (!CandidateFound)
- return std::unique_ptr<FunctionOutliningInfo>();
-
- // Do sanity check of the entries: there should not
- // be any successors (not in the entry set) other than
- // {ReturnBlock, NonReturnBlock}
+ std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
+
+ if (!CommSucc)
+ break;
+
+ OutliningInfo->Entries.push_back(CurrEntry);
+ CurrEntry = OtherSucc;
+ } while (true);
+
+ if (!CandidateFound)
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Do sanity check of the entries: there should not
+ // be any successors (not in the entry set) other than
+ // {ReturnBlock, NonReturnBlock}
assert(OutliningInfo->Entries[0] == &F.front() &&
- "Function Entry must be the first in Entries vector");
- DenseSet<BasicBlock *> Entries;
- for (BasicBlock *E : OutliningInfo->Entries)
- Entries.insert(E);
-
- // Returns true if BB has a predecessor which is not
- // in the Entries set.
- auto HasNonEntryPred = [Entries](BasicBlock *BB) {
+ "Function Entry must be the first in Entries vector");
+ DenseSet<BasicBlock *> Entries;
+ for (BasicBlock *E : OutliningInfo->Entries)
+ Entries.insert(E);
+
+ // Returns true if BB has a predecessor which is not
+ // in the Entries set.
+ auto HasNonEntryPred = [Entries](BasicBlock *BB) {
for (auto *Pred : predecessors(BB)) {
- if (!Entries.count(Pred))
- return true;
- }
- return false;
- };
- auto CheckAndNormalizeCandidate =
- [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
- for (BasicBlock *E : OutliningInfo->Entries) {
+ if (!Entries.count(Pred))
+ return true;
+ }
+ return false;
+ };
+ auto CheckAndNormalizeCandidate =
+ [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
+ for (BasicBlock *E : OutliningInfo->Entries) {
for (auto *Succ : successors(E)) {
- if (Entries.count(Succ))
- continue;
- if (Succ == OutliningInfo->ReturnBlock)
- OutliningInfo->ReturnBlockPreds.push_back(E);
- else if (Succ != OutliningInfo->NonReturnBlock)
- return false;
- }
- // There should not be any outside incoming edges either:
- if (HasNonEntryPred(E))
- return false;
- }
- return true;
- };
-
- if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
- return std::unique_ptr<FunctionOutliningInfo>();
-
- // Now further growing the candidate's inlining region by
- // peeling off dominating blocks from the outlining region:
+ if (Entries.count(Succ))
+ continue;
+ if (Succ == OutliningInfo->ReturnBlock)
+ OutliningInfo->ReturnBlockPreds.push_back(E);
+ else if (Succ != OutliningInfo->NonReturnBlock)
+ return false;
+ }
+ // There should not be any outside incoming edges either:
+ if (HasNonEntryPred(E))
+ return false;
+ }
+ return true;
+ };
+
+ if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Now further growing the candidate's inlining region by
+ // peeling off dominating blocks from the outlining region:
while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
- BasicBlock *Cand = OutliningInfo->NonReturnBlock;
- if (succ_size(Cand) != 2)
- break;
-
- if (HasNonEntryPred(Cand))
- break;
-
- BasicBlock *Succ1 = *succ_begin(Cand);
- BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
-
- BasicBlock *ReturnBlock, *NonReturnBlock;
- std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
- if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
- break;
-
- if (NonReturnBlock->getSinglePredecessor() != Cand)
- break;
-
- // Now grow and update OutliningInfo:
- OutliningInfo->Entries.push_back(Cand);
- OutliningInfo->NonReturnBlock = NonReturnBlock;
- OutliningInfo->ReturnBlockPreds.push_back(Cand);
- Entries.insert(Cand);
- }
-
- return OutliningInfo;
-}
-
-// Check if there is PGO data or user annotated branch data:
+ BasicBlock *Cand = OutliningInfo->NonReturnBlock;
+ if (succ_size(Cand) != 2)
+ break;
+
+ if (HasNonEntryPred(Cand))
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(Cand);
+ BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+ if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
+ break;
+
+ if (NonReturnBlock->getSinglePredecessor() != Cand)
+ break;
+
+    // Now grow and update OutliningInfo:
+ OutliningInfo->Entries.push_back(Cand);
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ OutliningInfo->ReturnBlockPreds.push_back(Cand);
+ Entries.insert(Cand);
+ }
+
+ return OutliningInfo;
+}
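For readers less familiar with the single-region heuristic above, here is a hypothetical source-level function with the shape it recognizes: a cheap guard whose taken arm returns, followed by a heavier body that becomes the outlined region (illustrative only).

  #include <cstdio>

  // Entry block: conditional branch; the early-return side is the ReturnBlock,
  // the heavy remainder is the NonReturnBlock region that gets outlined, and
  // only the guard ends up inlined into callers.
  void maybeLog(bool Verbose, const char *Msg) {
    if (!Verbose)
      return;
    std::printf("verbose: %s\n", Msg);
  }

  int main() {
    maybeLog(false, "skipped");
    maybeLog(true, "printed");
    return 0;
  }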
+
+// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
if (F.hasProfileData())
- return true;
- // Now check if any of the entry block has MD_prof data:
+ return true;
+ // Now check if any of the entry block has MD_prof data:
for (auto *E : OI.Entries) {
- BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
- if (!BR || BR->isUnconditional())
- continue;
- uint64_t T, F;
- if (BR->extractProfMetadata(T, F))
- return true;
- }
- return false;
-}
-
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
FunctionCloner &Cloner) const {
- BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
- auto EntryFreq =
- Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
- auto OutliningCallFreq =
- Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
- // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
- // we outlined any regions, so we may encounter situations where the
- // OutliningCallFreq is *slightly* bigger than the EntryFreq.
+ BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
+ auto EntryFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
+ auto OutliningCallFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
+ // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
+ // we outlined any regions, so we may encounter situations where the
+ // OutliningCallFreq is *slightly* bigger than the EntryFreq.
if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
- OutliningCallFreq = EntryFreq;
+ OutliningCallFreq = EntryFreq;
+
+ auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
+ OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
- auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
- OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
-
if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
- return OutlineRegionRelFreq;
-
- // When profile data is not available, we need to be conservative in
- // estimating the overall savings. Static branch prediction can usually
- // guess the branch direction right (taken/non-taken), but the guessed
- // branch probability is usually not biased enough. In case when the
- // outlined region is predicted to be likely, its probability needs
- // to be made higher (more biased) to not under-estimate the cost of
- // function outlining. On the other hand, if the outlined region
- // is predicted to be less likely, the predicted probability is usually
- // higher than the actual. For instance, the actual probability of the
- // less likely target is only 5%, but the guessed probability can be
- // 40%. In the latter case, there is no need for further adjustment.
- // FIXME: add an option for this.
- if (OutlineRegionRelFreq < BranchProbability(45, 100))
- return OutlineRegionRelFreq;
-
- OutlineRegionRelFreq = std::max(
- OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
-
- return OutlineRegionRelFreq;
-}
-
-bool PartialInlinerImpl::shouldPartialInline(
- CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
+ return OutlineRegionRelFreq;
+
+ // When profile data is not available, we need to be conservative in
+ // estimating the overall savings. Static branch prediction can usually
+ // guess the branch direction right (taken/non-taken), but the guessed
+ // branch probability is usually not biased enough. In case when the
+ // outlined region is predicted to be likely, its probability needs
+ // to be made higher (more biased) to not under-estimate the cost of
+ // function outlining. On the other hand, if the outlined region
+ // is predicted to be less likely, the predicted probability is usually
+ // higher than the actual. For instance, the actual probability of the
+ // less likely target is only 5%, but the guessed probability can be
+ // 40%. In the latter case, there is no need for further adjustment.
+ // FIXME: add an option for this.
+ if (OutlineRegionRelFreq < BranchProbability(45, 100))
+ return OutlineRegionRelFreq;
+
+ OutlineRegionRelFreq = std::max(
+ OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
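A small illustration of the no-profile adjustment performed above, using hypothetical statically guessed frequencies; only the 45% cutoff and the -outline-region-freq-percent floor mirror the code:

  #include <algorithm>
  #include <cstdio>

  int main() {
    const int OutlineRegionFreqPercent = 75; // cl::opt default above
    // Guessed relative frequency of the outlined call vs. the entry block.
    for (int GuessedPercent : {30, 60, 90}) {
      // Below 45% the guess is trusted; otherwise it is raised to at least 75%
      // so the outlining overhead is not under-estimated without profile data.
      int UsedPercent = (GuessedPercent < 45)
                            ? GuessedPercent
                            : std::max(GuessedPercent, OutlineRegionFreqPercent);
      std::printf("guessed %2d%% -> used %2d%%\n", GuessedPercent, UsedPercent);
    }
    return 0;
  }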
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
OptimizationRemarkEmitter &ORE) const {
- using namespace ore;
-
- Function *Callee = CB.getCalledFunction();
- assert(Callee == Cloner.ClonedFunc);
-
- if (SkipCostAnalysis)
- return isInlineViable(*Callee).isSuccess();
-
- Function *Caller = CB.getCaller();
- auto &CalleeTTI = GetTTI(*Callee);
- bool RemarksEnabled =
- Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
- DEBUG_TYPE);
- InlineCost IC =
- getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
- GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
-
- if (IC.isAlways()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
- << NV("Callee", Cloner.OrigFunc)
- << " should always be fully inlined, not partially";
- });
- return false;
- }
-
- if (IC.isNever()) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller)
- << " because it should never be inlined (cost=never)";
- });
- return false;
- }
-
- if (!IC) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller) << " because too costly to inline (cost="
- << NV("Cost", IC.getCost()) << ", threshold="
- << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
- });
- return false;
- }
- const DataLayout &DL = Caller->getParent()->getDataLayout();
-
- // The savings of eliminating the call:
- int NonWeightedSavings = getCallsiteCost(CB, DL);
- BlockFrequency NormWeightedSavings(NonWeightedSavings);
-
- // If the weighted savings are smaller than the weighted cost, return false.
- if (NormWeightedSavings < WeightedOutliningRcost) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
- &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller) << " runtime overhead (overhead="
- << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
- << ", savings="
- << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
- << ")"
- << " of making the outlined call is too high";
- });
-
- return false;
- }
-
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
- << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
- << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
- << " (threshold="
- << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
- });
- return true;
-}
-
-// TODO: Ideally we should share Inliner's InlineCost Analysis code.
-// For now use a simplified version. The returned 'InlineCost' will be used
-// to estimate the size cost as well as runtime cost of the BB.
+ using namespace ore;
+
+ Function *Callee = CB.getCalledFunction();
+ assert(Callee == Cloner.ClonedFunc);
+
+ if (SkipCostAnalysis)
+ return isInlineViable(*Callee).isSuccess();
+
+ Function *Caller = CB.getCaller();
+ auto &CalleeTTI = GetTTI(*Callee);
+ bool RemarksEnabled =
+ Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
+ DEBUG_TYPE);
+ InlineCost IC =
+ getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
+ GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
+
+ if (IC.isAlways()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
+ << NV("Callee", Cloner.OrigFunc)
+ << " should always be fully inlined, not partially";
+ });
+ return false;
+ }
+
+ if (IC.isNever()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller)
+ << " because it should never be inlined (cost=never)";
+ });
+ return false;
+ }
+
+ if (!IC) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " because too costly to inline (cost="
+ << NV("Cost", IC.getCost()) << ", threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
+ });
+ return false;
+ }
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CB, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ // If the weighted savings are smaller than the weighted cost, return false.
+ if (NormWeightedSavings < WeightedOutliningRcost) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
+ &CB)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
+ << ")"
+ << " of making the outlined call is too high";
+ });
+
+ return false;
+ }
+
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
+ << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
+ << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
+ << " (threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
+ });
+ return true;
+}
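An illustrative back-of-the-envelope version of the final profitability check above. All numbers are hypothetical, and the scaling of the outlining overhead by the relative frequency of the outlining call block is done by the caller before this function runs:

  #include <cstdio>

  int main() {
    const unsigned CallsiteSavings          = 25;   // getCallsiteCost(CB, DL): cost removed by inlining
    const unsigned OutliningRuntimeOverhead = 60;   // cost of calling the outlined function
    const double   OutlineRegionRelFreq     = 0.75; // how often the outlined call executes

    const double WeightedOutliningRcost =
        OutliningRuntimeOverhead * OutlineRegionRelFreq;                // 45
    const bool Profitable = CallsiteSavings >= WeightedOutliningRcost;  // 25 < 45 -> no

    std::printf("savings %u vs weighted overhead %.0f -> %s\n",
                CallsiteSavings, WeightedOutliningRcost,
                Profitable ? "partially inline" : "skip this callsite");
    return 0;
  }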
+
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
TargetTransformInfo *TTI) {
- int InlineCost = 0;
- const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- // Skip free instructions.
- switch (I.getOpcode()) {
- case Instruction::BitCast:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::Alloca:
- case Instruction::PHI:
- continue;
- case Instruction::GetElementPtr:
- if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
- continue;
- break;
- default:
- break;
- }
-
- if (I.isLifetimeStartOrEnd())
- continue;
-
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ // Skip free instructions.
+ switch (I.getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Alloca:
+ case Instruction::PHI:
+ continue;
+ case Instruction::GetElementPtr:
+ if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (I.isLifetimeStartOrEnd())
+ continue;
+
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
Intrinsic::ID IID = II->getIntrinsicID();
SmallVector<Type *, 4> Tys;
@@ -890,657 +890,657 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
continue;
}
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- InlineCost += getCallsiteCost(*CI, DL);
- continue;
- }
-
- if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
- InlineCost += getCallsiteCost(*II, DL);
- continue;
- }
-
- if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
- InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
- continue;
- }
- InlineCost += InlineConstants::InstrCost;
- }
- return InlineCost;
-}
-
-std::tuple<int, int>
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ InlineCost += getCallsiteCost(*CI, DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ InlineCost += getCallsiteCost(*II, DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
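The per-block cost model above charges nothing for "free" instructions (bitcasts, phis, allocas, zero-index GEPs, lifetime markers), the call-site cost for calls and invokes, (NumCases + 1) * InstrCost for switches, and one InstrCost for everything else. A compact mock of that accumulation in plain C++ (the enum, struct, and the value 5 used for InstrCost are stand-ins, not LLVM definitions):

    #include <vector>

    namespace sketch {
    constexpr int InstrCost = 5; // stand-in for InlineConstants::InstrCost

    enum class Kind { Free, Call, Switch, Other };
    struct Instr { Kind K; int CallsiteCost; int NumSwitchCases; };

    int blockInlineCost(const std::vector<Instr> &Block) {
      int Cost = 0;
      for (const Instr &I : Block) {
        switch (I.K) {
        case Kind::Free:   break;                                      // bitcast, phi, alloca, ...
        case Kind::Call:   Cost += I.CallsiteCost; break;              // calls/invokes
        case Kind::Switch: Cost += (I.NumSwitchCases + 1) * InstrCost; break;
        case Kind::Other:  Cost += InstrCost; break;
        }
      }
      return Cost;
    }
    } // namespace sketch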
+std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
- int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
- for (auto FuncBBPair : Cloner.OutlinedFunctions) {
- Function *OutlinedFunc = FuncBBPair.first;
- BasicBlock* OutliningCallBB = FuncBBPair.second;
- // Now compute the cost of the call sequence to the outlined function
- // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
+ for (auto FuncBBPair : Cloner.OutlinedFunctions) {
+ Function *OutlinedFunc = FuncBBPair.first;
+ BasicBlock* OutliningCallBB = FuncBBPair.second;
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
OutliningFuncCallCost +=
computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
-
- // Now compute the cost of the extracted/outlined function itself:
- for (BasicBlock &BB : *OutlinedFunc)
+
+ // Now compute the cost of the extracted/outlined function itself:
+ for (BasicBlock &BB : *OutlinedFunc)
OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
- }
- assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
- "Outlined function cost should be no less than the outlined region");
-
- // The code extractor introduces a new root and exit stub blocks with
- // additional unconditional branches. Those branches will be eliminated
- // later with bb layout. The cost should be adjusted accordingly:
- OutlinedFunctionCost -=
- 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
-
- int OutliningRuntimeOverhead =
- OutliningFuncCallCost +
- (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
- ExtraOutliningPenalty;
-
- return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
-}
-
-// Create the callsite to profile count map which is
-// used to update the original function's entry count,
-// after the function is partially inlined into the callsite.
-void PartialInlinerImpl::computeCallsiteToProfCountMap(
- Function *DuplicateFunction,
+ }
+ assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+
+ // The code extractor introduces a new root and exit stub blocks with
+ // additional unconditional branches. Those branches will be eliminated
+ // later with bb layout. The cost should be adjusted accordingly:
+ OutlinedFunctionCost -=
+ 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
+
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost +
+ (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+ ExtraOutliningPenalty;
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
+}
+
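The runtime overhead returned above is the cost of the call sequence plus whatever the extracted function costs beyond the original region, plus a tunable penalty, after discounting the two unconditional branches introduced by the extractor's root and exit stubs. A worked sketch with invented numbers:

    #include <cassert>

    int outliningRuntimeOverheadSketch() {
      const int InstrCost = 5;         // stand-in for InlineConstants::InstrCost
      int OutliningFuncCallCost = 25;  // cost of the call sequence to the outlined function
      int OutlinedFunctionCost  = 140; // cost of the extracted function body
      int OutlinedRegionCost    = 120; // cost of the region as it sat in the original function
      int ExtraOutliningPenalty = 0;   // tunable penalty (a cl::opt in this file)
      int NumOutlinedFunctions  = 1;

      // The new root/exit stubs add two unconditional branches that later basic
      // block layout removes, so they are not charged.
      OutlinedFunctionCost -= 2 * InstrCost * NumOutlinedFunctions;
      assert(OutlinedFunctionCost >= OutlinedRegionCost);

      return OutliningFuncCallCost +
             (OutlinedFunctionCost - OutlinedRegionCost) + ExtraOutliningPenalty;
    }

With these numbers the overhead is 25 + (130 - 120) + 0 = 35, i.e. the extra work a caller pays at runtime for taking the outlined path.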
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
- std::vector<User *> Users(DuplicateFunction->user_begin(),
- DuplicateFunction->user_end());
- Function *CurrentCaller = nullptr;
- std::unique_ptr<BlockFrequencyInfo> TempBFI;
- BlockFrequencyInfo *CurrentCallerBFI = nullptr;
-
- auto ComputeCurrBFI = [&,this](Function *Caller) {
- // For the old pass manager:
- if (!GetBFI) {
- DominatorTree DT(*Caller);
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*Caller, LI);
- TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
- CurrentCallerBFI = TempBFI.get();
- } else {
- // New pass manager:
- CurrentCallerBFI = &(GetBFI(*Caller));
- }
- };
-
- for (User *User : Users) {
- CallBase *CB = getSupportedCallBase(User);
- Function *Caller = CB->getCaller();
- if (CurrentCaller != Caller) {
- CurrentCaller = Caller;
- ComputeCurrBFI(Caller);
- } else {
- assert(CurrentCallerBFI && "CallerBFI is not set");
- }
- BasicBlock *CallBB = CB->getParent();
- auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
- if (Count)
- CallSiteToProfCountMap[User] = *Count;
- else
- CallSiteToProfCountMap[User] = 0;
- }
-}
-
-PartialInlinerImpl::FunctionCloner::FunctionCloner(
- Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> TempBFI;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
+ CurrentCallerBFI = TempBFI.get();
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(GetBFI(*Caller));
+ }
+ };
+
+ for (User *User : Users) {
+ CallBase *CB = getSupportedCallBase(User);
+ Function *Caller = CB->getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CB->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+}
+
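The map above records one profile count per call-site user, taken from the block containing the call, and recomputes block-frequency information only when the caller changes from one user to the next. A simplified mock of that bookkeeping with plain structs (no LLVM types):

    #include <cstdint>
    #include <map>
    #include <vector>

    struct CallSiteSketch { int CallerId; bool HasCount; uint64_t BlockCount; };

    std::map<int, uint64_t> callSiteToCount(const std::vector<CallSiteSketch> &Sites) {
      std::map<int, uint64_t> Result;
      int CurrentCaller = -1;
      for (int I = 0; I < (int)Sites.size(); ++I) {
        if (Sites[I].CallerId != CurrentCaller)
          CurrentCaller = Sites[I].CallerId;     // the real code (re)computes BFI here
        Result[I] = Sites[I].HasCount ? Sites[I].BlockCount : 0; // missing counts become 0
      }
      return Result;
    }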
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+ Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
- ClonedOI = std::make_unique<FunctionOutliningInfo>();
-
- // Clone the function, so that we can hack away on it.
- ValueToValueMapTy VMap;
- ClonedFunc = CloneFunction(F, VMap);
-
- ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
- ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
+ ClonedOI = std::make_unique<FunctionOutliningInfo>();
+
+ // Clone the function, so that we can hack away on it.
+ ValueToValueMapTy VMap;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
for (BasicBlock *BB : OI->Entries)
- ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
-
- for (BasicBlock *E : OI->ReturnBlockPreds) {
- BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
- ClonedOI->ReturnBlockPreds.push_back(NewE);
- }
- // Go ahead and update all uses to the duplicate, so that we can just
- // use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(ClonedFunc);
-}
-
-PartialInlinerImpl::FunctionCloner::FunctionCloner(
- Function *F, FunctionOutliningMultiRegionInfo *OI,
- OptimizationRemarkEmitter &ORE,
+ ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
+ BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+ ClonedOI->ReturnBlockPreds.push_back(NewE);
+ }
+ // Go ahead and update all uses to the duplicate, so that we can just
+ // use the inliner functionality when we're done hacking.
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+ Function *F, FunctionOutliningMultiRegionInfo *OI,
+ OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
- ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
-
- // Clone the function, so that we can hack away on it.
- ValueToValueMapTy VMap;
- ClonedFunc = CloneFunction(F, VMap);
-
- // Go through all Outline Candidate Regions and update all BasicBlock
- // information.
- for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
- OI->ORI) {
- SmallVector<BasicBlock *, 8> Region;
+ ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
+
+ // Clone the function, so that we can hack away on it.
+ ValueToValueMapTy VMap;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ // Go through all Outline Candidate Regions and update all BasicBlock
+ // information.
+ for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+ OI->ORI) {
+ SmallVector<BasicBlock *, 8> Region;
for (BasicBlock *BB : RegionInfo.Region)
- Region.push_back(cast<BasicBlock>(VMap[BB]));
-
- BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
- BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
- BasicBlock *NewReturnBlock = nullptr;
- if (RegionInfo.ReturnBlock)
- NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
- FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
- Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
- ClonedOMRI->ORI.push_back(MappedRegionInfo);
- }
- // Go ahead and update all uses to the duplicate, so that we can just
- // use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(ClonedFunc);
-}
-
+ Region.push_back(cast<BasicBlock>(VMap[BB]));
+
+ BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
+ BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
+ BasicBlock *NewReturnBlock = nullptr;
+ if (RegionInfo.ReturnBlock)
+ NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
+ FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
+ Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
+ ClonedOMRI->ORI.push_back(MappedRegionInfo);
+ }
+ // Go ahead and update all uses to the duplicate, so that we can just
+ // use the inliner functionality when we're done hacking.
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
auto GetFirstPHI = [](BasicBlock *BB) {
- BasicBlock::iterator I = BB->begin();
- PHINode *FirstPhi = nullptr;
- while (I != BB->end()) {
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (!Phi)
- break;
- if (!FirstPhi) {
- FirstPhi = Phi;
- break;
- }
- }
- return FirstPhi;
- };
-
- // Shouldn't need to normalize PHIs if we're not outlining non-early return
- // blocks.
- if (!ClonedOI)
- return;
-
- // Special hackery is needed with PHI nodes that have inputs from more than
- // one extracted block. For simplicity, just split the PHIs into a two-level
- // sequence of PHIs, some of which will go in the extracted region, and some
- // of which will go outside.
- BasicBlock *PreReturn = ClonedOI->ReturnBlock;
- // only split block when necessary:
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+
+ // Shouldn't need to normalize PHIs if we're not outlining non-early return
+ // blocks.
+ if (!ClonedOI)
+ return;
+
+ // Special hackery is needed with PHI nodes that have inputs from more than
+ // one extracted block. For simplicity, just split the PHIs into a two-level
+ // sequence of PHIs, some of which will go in the extracted region, and some
+ // of which will go outside.
+ BasicBlock *PreReturn = ClonedOI->ReturnBlock;
+ // only split block when necessary:
PHINode *FirstPhi = GetFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
-
- if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
- return;
-
- auto IsTrivialPhi = [](PHINode *PN) -> Value * {
- Value *CommonValue = PN->getIncomingValue(0);
- if (all_of(PN->incoming_values(),
- [&](Value *V) { return V == CommonValue; }))
- return CommonValue;
- return nullptr;
- };
-
- ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
- ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
- BasicBlock::iterator I = PreReturn->begin();
- Instruction *Ins = &ClonedOI->ReturnBlock->front();
- SmallVector<Instruction *, 4> DeadPhis;
- while (I != PreReturn->end()) {
- PHINode *OldPhi = dyn_cast<PHINode>(I);
- if (!OldPhi)
- break;
-
- PHINode *RetPhi =
- PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
- OldPhi->replaceAllUsesWith(RetPhi);
- Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
-
- RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
- RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
- OldPhi->removeIncomingValue(E);
- }
-
-  // After splitting the incoming values, the old phi may become trivial.
-  // Keeping the trivial phi can introduce a definition inside the outlined
-  // region that is live-out, causing unnecessary overhead (load, store,
-  // arg passing, etc.).
- if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
- OldPhi->replaceAllUsesWith(OldPhiVal);
- DeadPhis.push_back(OldPhi);
- }
- ++I;
- }
- for (auto *DP : DeadPhis)
- DP->eraseFromParent();
-
+ unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+ if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+ return;
+
+ auto IsTrivialPhi = [](PHINode *PN) -> Value * {
+ Value *CommonValue = PN->getIncomingValue(0);
+ if (all_of(PN->incoming_values(),
+ [&](Value *V) { return V == CommonValue; }))
+ return CommonValue;
+ return nullptr;
+ };
+
+ ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+ ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
+ BasicBlock::iterator I = PreReturn->begin();
+ Instruction *Ins = &ClonedOI->ReturnBlock->front();
+ SmallVector<Instruction *, 4> DeadPhis;
+ while (I != PreReturn->end()) {
+ PHINode *OldPhi = dyn_cast<PHINode>(I);
+ if (!OldPhi)
+ break;
+
+ PHINode *RetPhi =
+ PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
+ OldPhi->replaceAllUsesWith(RetPhi);
+ Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
+
+ RetPhi->addIncoming(&*I, PreReturn);
+ for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+ RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+ OldPhi->removeIncomingValue(E);
+ }
+
+  // After splitting the incoming values, the old phi may become trivial.
+  // Keeping the trivial phi can introduce a definition inside the outlined
+  // region that is live-out, causing unnecessary overhead (load, store,
+  // arg passing, etc.).
+ if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+ OldPhi->replaceAllUsesWith(OldPhiVal);
+ DeadPhis.push_back(OldPhi);
+ }
+ ++I;
+ }
+ for (auto *DP : DeadPhis)
+ DP->eraseFromParent();
+
for (auto *E : ClonedOI->ReturnBlockPreds)
- E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
-}
-
-bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
-
+ E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
+}
+
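The IsTrivialPhi helper above treats a phi as trivial when every incoming value is the same, so the phi can simply be replaced by that value and dropped. A standalone sketch of the same check over plain value ids:

    #include <optional>
    #include <vector>

    std::optional<int> trivialPhiValue(const std::vector<int> &IncomingValueIds) {
      if (IncomingValueIds.empty())
        return std::nullopt;
      for (int Id : IncomingValueIds)
        if (Id != IncomingValueIds.front())
          return std::nullopt;           // at least two distinct inputs: keep the phi
      return IncomingValueIds.front();   // all inputs agree: the phi is redundant
    }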
+bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
+
auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) {
- int Cost = 0;
- for (BasicBlock* BB : Region)
+ int Cost = 0;
+ for (BasicBlock* BB : Region)
Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
- return Cost;
- };
-
- assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
-
- if (ClonedOMRI->ORI.empty())
- return false;
-
- // The CodeExtractor needs a dominator tree.
- DominatorTree DT;
- DT.recalculate(*ClonedFunc);
-
- // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*ClonedFunc, LI);
- ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
-
- // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
- CodeExtractorAnalysisCache CEAC(*ClonedFunc);
-
- SetVector<Value *> Inputs, Outputs, Sinks;
- for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
- ClonedOMRI->ORI) {
- int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
-
- CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
- ClonedFuncBFI.get(), &BPI,
- LookupAC(*RegionInfo.EntryBlock->getParent()),
- /* AllowVarargs */ false);
-
- CE.findInputsOutputs(Inputs, Outputs, Sinks);
-
+ return Cost;
+ };
+
+ assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
+
+ if (ClonedOMRI->ORI.empty())
+ return false;
+
+ // The CodeExtractor needs a dominator tree.
+ DominatorTree DT;
+ DT.recalculate(*ClonedFunc);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
+ // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
+ CodeExtractorAnalysisCache CEAC(*ClonedFunc);
+
+ SetVector<Value *> Inputs, Outputs, Sinks;
+ for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+ ClonedOMRI->ORI) {
+ int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
+
+ CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI,
+ LookupAC(*RegionInfo.EntryBlock->getParent()),
+ /* AllowVarargs */ false);
+
+ CE.findInputsOutputs(Inputs, Outputs, Sinks);
+
LLVM_DEBUG({
- dbgs() << "inputs: " << Inputs.size() << "\n";
- dbgs() << "outputs: " << Outputs.size() << "\n";
- for (Value *value : Inputs)
- dbgs() << "value used in func: " << *value << "\n";
- for (Value *output : Outputs)
- dbgs() << "instr used in func: " << *output << "\n";
+ dbgs() << "inputs: " << Inputs.size() << "\n";
+ dbgs() << "outputs: " << Outputs.size() << "\n";
+ for (Value *value : Inputs)
+ dbgs() << "value used in func: " << *value << "\n";
+ for (Value *output : Outputs)
+ dbgs() << "instr used in func: " << *output << "\n";
});
- // Do not extract regions that have live exit variables.
- if (Outputs.size() > 0 && !ForceLiveExit)
- continue;
-
+ // Do not extract regions that have live exit variables.
+ if (Outputs.size() > 0 && !ForceLiveExit)
+ continue;
+
if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
- BasicBlock *OutliningCallBB = OCS->getParent();
- assert(OutliningCallBB->getParent() == ClonedFunc);
- OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
- NumColdRegionsOutlined++;
- OutlinedRegionCost += CurrentOutlinedRegionCost;
-
- if (MarkOutlinedColdCC) {
- OutlinedFunc->setCallingConv(CallingConv::Cold);
- OCS->setCallingConv(CallingConv::Cold);
- }
- } else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &RegionInfo.Region.front()->front())
- << "Failed to extract region at block "
- << ore::NV("Block", RegionInfo.Region.front());
- });
- }
-
- return !OutlinedFunctions.empty();
-}
-
-Function *
-PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
-  // Returns true if the block is to be partially inlined into the caller
- // (i.e. not to be extracted to the out of line function)
- auto ToBeInlined = [&, this](BasicBlock *BB) {
- return BB == ClonedOI->ReturnBlock ||
+ BasicBlock *OutliningCallBB = OCS->getParent();
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
+ NumColdRegionsOutlined++;
+ OutlinedRegionCost += CurrentOutlinedRegionCost;
+
+ if (MarkOutlinedColdCC) {
+ OutlinedFunc->setCallingConv(CallingConv::Cold);
+ OCS->setCallingConv(CallingConv::Cold);
+ }
+ } else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &RegionInfo.Region.front()->front())
+ << "Failed to extract region at block "
+ << ore::NV("Block", RegionInfo.Region.front());
+ });
+ }
+
+ return !OutlinedFunctions.empty();
+}
+
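Each candidate region above is costed and then either extracted or skipped; regions with live exit values are skipped unless extraction is explicitly forced, and the cost of every region that is actually extracted is accumulated. A small sketch of that loop over mock regions:

    #include <vector>

    struct RegionSketch { int NumLiveExitValues; int Cost; };

    int totalOutlinedCost(const std::vector<RegionSketch> &Regions, bool ForceLiveExit) {
      int Outlined = 0;
      for (const RegionSketch &R : Regions) {
        if (R.NumLiveExitValues > 0 && !ForceLiveExit)
          continue;           // live-out values would need extra argument plumbing
        Outlined += R.Cost;   // the real code runs CodeExtractor here
      }
      return Outlined;
    }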
+Function *
+PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
+  // Returns true if the block is to be partially inlined into the caller
+ // (i.e. not to be extracted to the out of line function)
+ auto ToBeInlined = [&, this](BasicBlock *BB) {
+ return BB == ClonedOI->ReturnBlock ||
llvm::is_contained(ClonedOI->Entries, BB);
- };
-
- assert(ClonedOI && "Expecting OutlineInfo for single region outline");
- // The CodeExtractor needs a dominator tree.
- DominatorTree DT;
- DT.recalculate(*ClonedFunc);
-
- // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*ClonedFunc, LI);
- ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
-
- // Gather up the blocks that we're going to extract.
- std::vector<BasicBlock *> ToExtract;
+ };
+
+ assert(ClonedOI && "Expecting OutlineInfo for single region outline");
+ // The CodeExtractor needs a dominator tree.
+ DominatorTree DT;
+ DT.recalculate(*ClonedFunc);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
+ // Gather up the blocks that we're going to extract.
+ std::vector<BasicBlock *> ToExtract;
auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
- ToExtract.push_back(ClonedOI->NonReturnBlock);
+ ToExtract.push_back(ClonedOI->NonReturnBlock);
OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
ClonedOI->NonReturnBlock, ClonedFuncTTI);
- for (BasicBlock &BB : *ClonedFunc)
- if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
- ToExtract.push_back(&BB);
- // FIXME: the code extractor may hoist/sink more code
- // into the outlined function which may make the outlining
- // overhead (the difference of the outlined function cost
- // and OutliningRegionCost) look larger.
+ for (BasicBlock &BB : *ClonedFunc)
+ if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
+ ToExtract.push_back(&BB);
+ // FIXME: the code extractor may hoist/sink more code
+ // into the outlined function which may make the outlining
+ // overhead (the difference of the outlined function cost
+ // and OutliningRegionCost) look larger.
OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
- }
-
- // Extract the body of the if.
- CodeExtractorAnalysisCache CEAC(*ClonedFunc);
- Function *OutlinedFunc =
- CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
- ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
- /* AllowVarargs */ true)
- .extractCodeRegion(CEAC);
-
- if (OutlinedFunc) {
- BasicBlock *OutliningCallBB =
+ }
+
+ // Extract the body of the if.
+ CodeExtractorAnalysisCache CEAC(*ClonedFunc);
+ Function *OutlinedFunc =
+ CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
+ /* AllowVarargs */ true)
+ .extractCodeRegion(CEAC);
+
+ if (OutlinedFunc) {
+ BasicBlock *OutliningCallBB =
PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
- assert(OutliningCallBB->getParent() == ClonedFunc);
- OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
- } else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &ToExtract.front()->front())
- << "Failed to extract region at block "
- << ore::NV("Block", ToExtract.front());
- });
-
- return OutlinedFunc;
-}
-
-PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- ClonedFunc->replaceAllUsesWith(OrigFunc);
- ClonedFunc->eraseFromParent();
- if (!IsFunctionInlined) {
- // Remove each function that was speculatively created if there is no
- // reference.
- for (auto FuncBBPair : OutlinedFunctions) {
- Function *Func = FuncBBPair.first;
- Func->eraseFromParent();
- }
- }
-}
-
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
+ } else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &ToExtract.front()->front())
+ << "Failed to extract region at block "
+ << ore::NV("Block", ToExtract.front());
+ });
+
+ return OutlinedFunc;
+}
+
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ ClonedFunc->replaceAllUsesWith(OrigFunc);
+ ClonedFunc->eraseFromParent();
+ if (!IsFunctionInlined) {
+ // Remove each function that was speculatively created if there is no
+ // reference.
+ for (auto FuncBBPair : OutlinedFunctions) {
+ Function *Func = FuncBBPair.first;
+ Func->eraseFromParent();
+ }
+ }
+}
+
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
if (F.hasAddressTaken())
- return {false, nullptr};
-
- // Let inliner handle it
+ return {false, nullptr};
+
+ // Let inliner handle it
if (F.hasFnAttribute(Attribute::AlwaysInline))
- return {false, nullptr};
-
+ return {false, nullptr};
+
if (F.hasFnAttribute(Attribute::NoInline))
- return {false, nullptr};
-
+ return {false, nullptr};
+
if (PSI.isFunctionEntryCold(&F))
- return {false, nullptr};
-
+ return {false, nullptr};
+
if (F.users().empty())
- return {false, nullptr};
-
+ return {false, nullptr};
+
OptimizationRemarkEmitter ORE(&F);
-
- // Only try to outline cold regions if we have a profile summary, which
- // implies we have profiling information.
+
+ // Only try to outline cold regions if we have a profile summary, which
+ // implies we have profiling information.
if (PSI.hasProfileSummary() && F.hasProfileData() &&
- !DisableMultiRegionPartialInline) {
- std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
- computeOutliningColdRegionsInfo(F, ORE);
- if (OMRI) {
+ !DisableMultiRegionPartialInline) {
+ std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
+ computeOutliningColdRegionsInfo(F, ORE);
+ if (OMRI) {
FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
-
+
LLVM_DEBUG({
- dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
- dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
- << "\n";
+ dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
+ dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
+ << "\n";
});
- bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
-
- if (DidOutline) {
+ bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
+
+ if (DidOutline) {
LLVM_DEBUG({
- dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
- Cloner.ClonedFunc->print(dbgs());
- dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
+ dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
+ Cloner.ClonedFunc->print(dbgs());
+ dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
});
-
- if (tryPartialInline(Cloner))
- return {true, nullptr};
- }
- }
- }
-
- // Fall-thru to regular partial inlining if we:
- // i) can't find any cold regions to outline, or
- // ii) can't inline the outlined function anywhere.
- std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- if (!OI)
- return {false, nullptr};
-
+
+ if (tryPartialInline(Cloner))
+ return {true, nullptr};
+ }
+ }
+ }
+
+ // Fall-thru to regular partial inlining if we:
+ // i) can't find any cold regions to outline, or
+ // ii) can't inline the outlined function anywhere.
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+ if (!OI)
+ return {false, nullptr};
+
FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
Cloner.normalizeReturnBlock();
-
- Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
-
- if (!OutlinedFunction)
- return {false, nullptr};
-
+
+ Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
+
+ if (!OutlinedFunction)
+ return {false, nullptr};
+
if (tryPartialInline(Cloner))
- return {true, OutlinedFunction};
-
- return {false, nullptr};
-}
-
-bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
- if (Cloner.OutlinedFunctions.empty())
- return false;
-
- int SizeCost = 0;
- BlockFrequency WeightedRcost;
- int NonWeightedRcost;
- std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
-
- // Only calculate RelativeToEntryFreq when we are doing single region
- // outlining.
- BranchProbability RelativeToEntryFreq;
+ return {true, OutlinedFunction};
+
+ return {false, nullptr};
+}
+
+bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
+ if (Cloner.OutlinedFunctions.empty())
+ return false;
+
+ int SizeCost = 0;
+ BlockFrequency WeightedRcost;
+ int NonWeightedRcost;
+ std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
+
+ // Only calculate RelativeToEntryFreq when we are doing single region
+ // outlining.
+ BranchProbability RelativeToEntryFreq;
if (Cloner.ClonedOI)
- RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+ RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
else
- // RelativeToEntryFreq doesn't make sense when we have more than one
- // outlined call because each call will have a different relative frequency
- // to the entry block. We can consider using the average, but the
- // usefulness of that information is questionable. For now, assume we never
- // execute the calls to outlined functions.
- RelativeToEntryFreq = BranchProbability(0, 1);
-
- WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
-
-  // If the call sequence(s) to the outlined function(s) are larger than the sum
-  // of the original outlined region size(s), outlining does not increase the
-  // chances of inlining the function (the inliner uses the size increase to
-  // model the cost of inlining a callee).
- if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
- OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
- DebugLoc DLoc;
- BasicBlock *Block;
+ // RelativeToEntryFreq doesn't make sense when we have more than one
+ // outlined call because each call will have a different relative frequency
+ // to the entry block. We can consider using the average, but the
+ // usefulness of that information is questionable. For now, assume we never
+ // execute the calls to outlined functions.
+ RelativeToEntryFreq = BranchProbability(0, 1);
+
+ WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
+
+  // If the call sequence(s) to the outlined function(s) are larger than the sum
+  // of the original outlined region size(s), outlining does not increase the
+  // chances of inlining the function (the inliner uses the size increase to
+  // model the cost of inlining a callee).
+ if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
+ DebugLoc DLoc;
+ BasicBlock *Block;
std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
- OrigFuncORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
- DLoc, Block)
- << ore::NV("Function", Cloner.OrigFunc)
- << " not partially inlined into callers (Original Size = "
- << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
- << ", Size of call sequence to outlined function = "
- << ore::NV("NewSize", SizeCost) << ")";
- });
- return false;
- }
-
- assert(Cloner.OrigFunc->users().empty() &&
- "F's users should all be replaced!");
-
- std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
- Cloner.ClonedFunc->user_end());
-
- DenseMap<User *, uint64_t> CallSiteToProfCountMap;
- auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
- if (CalleeEntryCount)
- computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
-
- uint64_t CalleeEntryCountV =
- (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
-
- bool AnyInline = false;
- for (User *User : Users) {
- CallBase *CB = getSupportedCallBase(User);
-
+ OrigFuncORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", Cloner.OrigFunc)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")";
+ });
+ return false;
+ }
+
+ assert(Cloner.OrigFunc->users().empty() &&
+ "F's users should all be replaced!");
+
+ std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
+ Cloner.ClonedFunc->user_end());
+
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
+ if (CalleeEntryCount)
+ computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
+
+ uint64_t CalleeEntryCountV =
+ (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
+
+ bool AnyInline = false;
+ for (User *User : Users) {
+ CallBase *CB = getSupportedCallBase(User);
+
if (isLimitReached())
- continue;
-
- OptimizationRemarkEmitter CallerORE(CB->getCaller());
- if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
- continue;
-
- // Construct remark before doing the inlining, as after successful inlining
- // the callsite is removed.
- OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
- OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
- << ore::NV("Caller", CB->getCaller());
-
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
- // We can only forward varargs when we outlined a single region, else we
- // bail on vararg functions.
- if (!InlineFunction(*CB, IFI, nullptr, true,
- (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
- : nullptr))
- .isSuccess())
- continue;
-
- CallerORE.emit(OR);
-
- // Now update the entry count:
- if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
- uint64_t CallSiteCount = CallSiteToProfCountMap[User];
- CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
- }
-
- AnyInline = true;
- NumPartialInlining++;
- // Update the stats
- if (Cloner.ClonedOI)
- NumPartialInlined++;
- else
- NumColdOutlinePartialInlined++;
- }
-
- if (AnyInline) {
- Cloner.IsFunctionInlined = true;
- if (CalleeEntryCount)
- Cloner.OrigFunc->setEntryCount(
- CalleeEntryCount.setCount(CalleeEntryCountV));
- OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
- OrigFuncORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
- << "Partially inlined into at least one caller";
- });
- }
-
- return AnyInline;
-}
-
-bool PartialInlinerImpl::run(Module &M) {
- if (DisablePartialInlining)
- return false;
-
- std::vector<Function *> Worklist;
- Worklist.reserve(M.size());
- for (Function &F : M)
- if (!F.use_empty() && !F.isDeclaration())
- Worklist.push_back(&F);
-
- bool Changed = false;
- while (!Worklist.empty()) {
- Function *CurrFunc = Worklist.back();
- Worklist.pop_back();
-
- if (CurrFunc->use_empty())
- continue;
-
- bool Recursive = false;
- for (User *U : CurrFunc->users())
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->getParent()->getParent() == CurrFunc) {
- Recursive = true;
- break;
- }
- if (Recursive)
- continue;
-
+ continue;
+
+ OptimizationRemarkEmitter CallerORE(CB->getCaller());
+ if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
+ continue;
+
+ // Construct remark before doing the inlining, as after successful inlining
+ // the callsite is removed.
+ OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
+ OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
+ << ore::NV("Caller", CB->getCaller());
+
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
+ // We can only forward varargs when we outlined a single region, else we
+ // bail on vararg functions.
+ if (!InlineFunction(*CB, IFI, nullptr, true,
+ (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
+ : nullptr))
+ .isSuccess())
+ continue;
+
+ CallerORE.emit(OR);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
+ NumPartialInlining++;
+ // Update the stats
+ if (Cloner.ClonedOI)
+ NumPartialInlined++;
+ else
+ NumColdOutlinePartialInlined++;
+ }
+
+ if (AnyInline) {
+ Cloner.IsFunctionInlined = true;
+ if (CalleeEntryCount)
+ Cloner.OrigFunc->setEntryCount(
+ CalleeEntryCount.setCount(CalleeEntryCountV));
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
+ OrigFuncORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
+ << "Partially inlined into at least one caller";
+ });
+ }
+
+ return AnyInline;
+}
+
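Two pieces of arithmetic drive the logic above: the runtime cost is weighted by the relative frequency of the block containing the outlining call, and the callee's entry count is reduced by the profile counts of the call sites that were successfully partially inlined. A plain-integer sketch of both (helper names invented for the example):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    uint64_t weightedRuntimeCost(uint64_t NonWeightedRcost, uint64_t CallBBFreq,
                                 uint64_t EntryFreq) {
      // Mirrors BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq.
      return EntryFreq ? NonWeightedRcost * CallBBFreq / EntryFreq : 0;
    }

    uint64_t remainingEntryCount(uint64_t CalleeEntryCount,
                                 const std::vector<uint64_t> &InlinedCallSiteCounts) {
      for (uint64_t CallSiteCount : InlinedCallSiteCounts)
        CalleeEntryCount -= std::min(CalleeEntryCount, CallSiteCount); // never underflow
      return CalleeEntryCount;
    }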
+bool PartialInlinerImpl::run(Module &M) {
+ if (DisablePartialInlining)
+ return false;
+
+ std::vector<Function *> Worklist;
+ Worklist.reserve(M.size());
+ for (Function &F : M)
+ if (!F.use_empty() && !F.isDeclaration())
+ Worklist.push_back(&F);
+
+ bool Changed = false;
+ while (!Worklist.empty()) {
+ Function *CurrFunc = Worklist.back();
+ Worklist.pop_back();
+
+ if (CurrFunc->use_empty())
+ continue;
+
+ bool Recursive = false;
+ for (User *U : CurrFunc->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent()->getParent() == CurrFunc) {
+ Recursive = true;
+ break;
+ }
+ if (Recursive)
+ continue;
+
std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
- if (Result.second)
- Worklist.push_back(Result.second);
- Changed |= Result.first;
- }
-
- return Changed;
-}
-
-char PartialInlinerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
- "Partial Inliner", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
- "Partial Inliner", false, false)
-
-ModulePass *llvm::createPartialInliningPass() {
- return new PartialInlinerLegacyPass();
-}
-
-PreservedAnalyses PartialInlinerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
- auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
-
- auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
- return FAM.getCachedResult<AssumptionAnalysis>(F);
- };
-
- auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
-
- ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
- GetTLI, PSI, GetBFI)
- .run(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
+ if (Result.second)
+ Worklist.push_back(Result.second);
+ Changed |= Result.first;
+ }
+
+ return Changed;
+}
+
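The driver above is a plain worklist: functions with no remaining uses or with direct recursion are skipped, and a freshly outlined function is pushed back onto the worklist so it can be considered as a candidate in its own right. A compact mock of that loop:

    #include <functional>
    #include <string>
    #include <utility>
    #include <vector>

    struct CandidateSketch { std::string Name; bool Recursive; };

    bool runWorklistSketch(
        std::vector<CandidateSketch> Worklist,
        const std::function<std::pair<bool, CandidateSketch>(CandidateSketch &)> &Unswitch) {
      bool Changed = false;
      while (!Worklist.empty()) {
        CandidateSketch F = Worklist.back();
        Worklist.pop_back();
        if (F.Recursive)
          continue; // partial inlining of directly recursive functions is not attempted
        auto [DidInline, NewFunc] = Unswitch(F);
        if (!NewFunc.Name.empty())
          Worklist.push_back(NewFunc); // outlined functions become candidates too
        Changed |= DidInline;
      }
      return Changed;
    }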
+char PartialInlinerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+
+ModulePass *llvm::createPartialInliningPass() {
+ return new PartialInlinerLegacyPass();
+}
+
+PreservedAnalyses PartialInlinerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
+ return FAM.getCachedResult<AssumptionAnalysis>(F);
+ };
+
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
+ GetTLI, PSI, GetBFI)
+ .run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
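As a rough usage note, the pass is registered under the name partial-inliner, so it can typically be invoked as opt -partial-inliner (legacy pass manager) or opt -passes=partial-inliner (new pass manager); exact spellings may differ between LLVM releases. Programmatically, one minimal way to exercise the legacy pass is to add it to a legacy::PassManager, as in the sketch below (the module is empty, so the run is effectively a no-op):

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    int main() {
      llvm::LLVMContext Ctx;
      llvm::Module M("demo", Ctx);               // normally parsed from IR instead
      llvm::legacy::PassManager PM;
      PM.add(llvm::createPartialInliningPass()); // the factory defined above
      PM.run(M);
      return 0;
    }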
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp
index 520456e912..068328391d 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -1,311 +1,311 @@
-//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PassManagerBuilder class, which is used to set up a
-// "standard" optimization sequence suitable for languages like C and C++.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm-c/Transforms/PassManagerBuilder.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
-#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ScopedNoAliasAA.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Attributor.h"
-#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
-#include "llvm/Transforms/Scalar/LICM.h"
-#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-#include "llvm/Transforms/Vectorize/VectorCombine.h"
-
-using namespace llvm;
-
+//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PassManagerBuilder class, which is used to set up a
+// "standard" optimization sequence suitable for languages like C and C++.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
+
+using namespace llvm;
+
cl::opt<bool> RunPartialInlining("enable-partial-inlining", cl::init(false),
cl::Hidden, cl::ZeroOrMore,
cl::desc("Run Partial inlinining pass"));
-
-static cl::opt<bool>
-UseGVNAfterVectorization("use-gvn-after-vectorization",
- cl::init(false), cl::Hidden,
- cl::desc("Run GVN instead of Early CSE after vectorization passes"));
-
+
+static cl::opt<bool>
+UseGVNAfterVectorization("use-gvn-after-vectorization",
+ cl::init(false), cl::Hidden,
+ cl::desc("Run GVN instead of Early CSE after vectorization passes"));
+
cl::opt<bool> ExtraVectorizerPasses(
- "extra-vectorizer-passes", cl::init(false), cl::Hidden,
- cl::desc("Run cleanup optimization passes after vectorization."));
-
-static cl::opt<bool>
-RunLoopRerolling("reroll-loops", cl::Hidden,
- cl::desc("Run the loop rerolling pass"));
-
+ "extra-vectorizer-passes", cl::init(false), cl::Hidden,
+ cl::desc("Run cleanup optimization passes after vectorization."));
+
+static cl::opt<bool>
+RunLoopRerolling("reroll-loops", cl::Hidden,
+ cl::desc("Run the loop rerolling pass"));
+
cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
cl::desc("Run the NewGVN pass"));
-
-// Experimental option to use CFL-AA
-enum class CFLAAType { None, Steensgaard, Andersen, Both };
+
+// Experimental option to use CFL-AA
+enum class CFLAAType { None, Steensgaard, Andersen, Both };
static cl::opt<::CFLAAType>
UseCFLAA("use-cfl-aa", cl::init(::CFLAAType::None), cl::Hidden,
- cl::desc("Enable the new, experimental CFL alias analysis"),
+ cl::desc("Enable the new, experimental CFL alias analysis"),
cl::values(clEnumValN(::CFLAAType::None, "none", "Disable CFL-AA"),
clEnumValN(::CFLAAType::Steensgaard, "steens",
- "Enable unification-based CFL-AA"),
+ "Enable unification-based CFL-AA"),
clEnumValN(::CFLAAType::Andersen, "anders",
- "Enable inclusion-based CFL-AA"),
+ "Enable inclusion-based CFL-AA"),
clEnumValN(::CFLAAType::Both, "both",
- "Enable both variants of CFL-AA")));
-
-static cl::opt<bool> EnableLoopInterchange(
- "enable-loopinterchange", cl::init(false), cl::Hidden,
- cl::desc("Enable the new, experimental LoopInterchange Pass"));
-
+ "Enable both variants of CFL-AA")));
+
+static cl::opt<bool> EnableLoopInterchange(
+ "enable-loopinterchange", cl::init(false), cl::Hidden,
+ cl::desc("Enable the new, experimental LoopInterchange Pass"));
+
cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false),
cl::Hidden,
cl::desc("Enable Unroll And Jam Pass"));
-
+
cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
cl::Hidden,
cl::desc("Enable the LoopFlatten Pass"));
-static cl::opt<bool>
- EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
- cl::desc("Enable preparation for ThinLTO."));
-
-static cl::opt<bool>
- EnablePerformThinLTO("perform-thinlto", cl::init(false), cl::Hidden,
- cl::desc("Enable performing ThinLTO."));
-
-cl::opt<bool> EnableHotColdSplit("hot-cold-split", cl::init(false),
- cl::ZeroOrMore, cl::desc("Enable hot-cold splitting pass"));
-
+static cl::opt<bool>
+ EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
+ cl::desc("Enable preparation for ThinLTO."));
+
+static cl::opt<bool>
+ EnablePerformThinLTO("perform-thinlto", cl::init(false), cl::Hidden,
+ cl::desc("Enable performing ThinLTO."));
+
+cl::opt<bool> EnableHotColdSplit("hot-cold-split", cl::init(false),
+ cl::ZeroOrMore, cl::desc("Enable hot-cold splitting pass"));
+
cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false), cl::Hidden,
cl::desc("Enable ir outliner pass"));
-static cl::opt<bool> UseLoopVersioningLICM(
- "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
- cl::desc("Enable the experimental Loop Versioning LICM pass"));
-
+static cl::opt<bool> UseLoopVersioningLICM(
+ "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
+ cl::desc("Enable the experimental Loop Versioning LICM pass"));
+
cl::opt<bool>
- DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
- cl::desc("Disable pre-instrumentation inliner"));
-
+ DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
+ cl::desc("Disable pre-instrumentation inliner"));
+
cl::opt<int> PreInlineThreshold(
- "preinline-threshold", cl::Hidden, cl::init(75), cl::ZeroOrMore,
- cl::desc("Control the amount of inlining in pre-instrumentation inliner "
- "(default = 75)"));
-
+ "preinline-threshold", cl::Hidden, cl::init(75), cl::ZeroOrMore,
+ cl::desc("Control the amount of inlining in pre-instrumentation inliner "
+ "(default = 75)"));
+
cl::opt<bool>
EnableGVNHoist("enable-gvn-hoist", cl::init(false), cl::ZeroOrMore,
cl::desc("Enable the GVN hoisting pass (default = off)"));
-
-static cl::opt<bool>
- DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
- cl::Hidden,
- cl::desc("Disable shrink-wrap library calls"));
-
-static cl::opt<bool> EnableSimpleLoopUnswitch(
- "enable-simple-loop-unswitch", cl::init(false), cl::Hidden,
- cl::desc("Enable the simple loop unswitch pass. Also enables independent "
- "cleanup passes integrated into the loop pass manager pipeline."));
-
+
+static cl::opt<bool>
+ DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable shrink-wrap library calls"));
+
+static cl::opt<bool> EnableSimpleLoopUnswitch(
+ "enable-simple-loop-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Enable the simple loop unswitch pass. Also enables independent "
+ "cleanup passes integrated into the loop pass manager pipeline."));
+
cl::opt<bool>
EnableGVNSink("enable-gvn-sink", cl::init(false), cl::ZeroOrMore,
cl::desc("Enable the GVN sinking pass (default = off)"));
-
-// This option is used in simplifying testing SampleFDO optimizations for
-// profile loading.
+
+// This option is used in simplifying testing SampleFDO optimizations for
+// profile loading.
cl::opt<bool>
- EnableCHR("enable-chr", cl::init(true), cl::Hidden,
- cl::desc("Enable control height reduction optimization (CHR)"));
-
-cl::opt<bool> FlattenedProfileUsed(
- "flattened-profile-used", cl::init(false), cl::Hidden,
- cl::desc("Indicate the sample profile being used is flattened, i.e., "
-    "no inline hierarchy exists in the profile. "));
-
-cl::opt<bool> EnableOrderFileInstrumentation(
- "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
- cl::desc("Enable order file instrumentation (default = off)"));
-
+ EnableCHR("enable-chr", cl::init(true), cl::Hidden,
+ cl::desc("Enable control height reduction optimization (CHR)"));
+
+cl::opt<bool> FlattenedProfileUsed(
+ "flattened-profile-used", cl::init(false), cl::Hidden,
+ cl::desc("Indicate the sample profile being used is flattened, i.e., "
+    "no inline hierarchy exists in the profile. "));
+
+cl::opt<bool> EnableOrderFileInstrumentation(
+ "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
+ cl::desc("Enable order file instrumentation (default = off)"));
+
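All of the flags above follow the same llvm::cl pattern: a static cl::opt with a name, default value, visibility, and description, read later as a plain value. A minimal self-contained sketch (the flag name here is invented for illustration):

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<bool> EnableDemoFeature(
        "enable-demo-feature", llvm::cl::init(false), llvm::cl::Hidden,
        llvm::cl::desc("Example flag declared in the same style as the options above"));

    int main(int argc, char **argv) {
      llvm::cl::ParseCommandLineOptions(argc, argv);
      return EnableDemoFeature ? 0 : 1; // cl::opt<bool> converts to its value
    }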
cl::opt<bool> EnableMatrix(
"enable-matrix", cl::init(false), cl::Hidden,
cl::desc("Enable lowering of the matrix intrinsics"));
-
+
cl::opt<bool> EnableConstraintElimination(
"enable-constraint-elimination", cl::init(false), cl::Hidden,
cl::desc(
"Enable pass to eliminate conditions based on linear constraints."));
-cl::opt<AttributorRunOption> AttributorRun(
- "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE),
- cl::desc("Enable the attributor inter-procedural deduction pass."),
- cl::values(clEnumValN(AttributorRunOption::ALL, "all",
- "enable all attributor runs"),
- clEnumValN(AttributorRunOption::MODULE, "module",
- "enable module-wide attributor runs"),
- clEnumValN(AttributorRunOption::CGSCC, "cgscc",
- "enable call graph SCC attributor runs"),
- clEnumValN(AttributorRunOption::NONE, "none",
- "disable attributor runs")));
-
-extern cl::opt<bool> EnableKnowledgeRetention;
-
-PassManagerBuilder::PassManagerBuilder() {
- OptLevel = 2;
- SizeLevel = 0;
- LibraryInfo = nullptr;
- Inliner = nullptr;
- DisableUnrollLoops = false;
- SLPVectorize = false;
- LoopVectorize = true;
- LoopsInterleaved = true;
- RerollLoops = RunLoopRerolling;
- NewGVN = RunNewGVN;
- LicmMssaOptCap = SetLicmMssaOptCap;
- LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
- DisableGVNLoadPRE = false;
- ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
- VerifyInput = false;
- VerifyOutput = false;
- MergeFunctions = false;
- PrepareForLTO = false;
- EnablePGOInstrGen = false;
- EnablePGOCSInstrGen = false;
- EnablePGOCSInstrUse = false;
- PGOInstrGen = "";
- PGOInstrUse = "";
- PGOSampleUse = "";
- PrepareForThinLTO = EnablePrepareForThinLTO;
- PerformThinLTO = EnablePerformThinLTO;
- DivergentTarget = false;
- CallGraphProfile = true;
-}
-
-PassManagerBuilder::~PassManagerBuilder() {
- delete LibraryInfo;
- delete Inliner;
-}
-
-/// Set of global extensions, automatically added as part of the standard set.
-static ManagedStatic<
- SmallVector<std::tuple<PassManagerBuilder::ExtensionPointTy,
- PassManagerBuilder::ExtensionFn,
- PassManagerBuilder::GlobalExtensionID>,
- 8>>
- GlobalExtensions;
-static PassManagerBuilder::GlobalExtensionID GlobalExtensionsCounter;
-
-/// Check if GlobalExtensions is constructed and not empty.
-/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger
-/// the construction of the object.
-static bool GlobalExtensionsNotEmpty() {
- return GlobalExtensions.isConstructed() && !GlobalExtensions->empty();
-}
-
-PassManagerBuilder::GlobalExtensionID
-PassManagerBuilder::addGlobalExtension(PassManagerBuilder::ExtensionPointTy Ty,
- PassManagerBuilder::ExtensionFn Fn) {
- auto ExtensionID = GlobalExtensionsCounter++;
- GlobalExtensions->push_back(std::make_tuple(Ty, std::move(Fn), ExtensionID));
- return ExtensionID;
-}
-
-void PassManagerBuilder::removeGlobalExtension(
- PassManagerBuilder::GlobalExtensionID ExtensionID) {
- // RegisterStandardPasses may try to call this function after GlobalExtensions
- // has already been destroyed; doing so should not generate an error.
- if (!GlobalExtensions.isConstructed())
- return;
-
- auto GlobalExtension =
- llvm::find_if(*GlobalExtensions, [ExtensionID](const auto &elem) {
- return std::get<2>(elem) == ExtensionID;
- });
- assert(GlobalExtension != GlobalExtensions->end() &&
- "The extension ID to be removed should always be valid.");
-
- GlobalExtensions->erase(GlobalExtension);
-}
-
-void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
- Extensions.push_back(std::make_pair(Ty, std::move(Fn)));
-}
-
-void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
- legacy::PassManagerBase &PM) const {
- if (GlobalExtensionsNotEmpty()) {
- for (auto &Ext : *GlobalExtensions) {
- if (std::get<0>(Ext) == ETy)
- std::get<1>(Ext)(*this, PM);
- }
- }
- for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
- if (Extensions[i].first == ETy)
- Extensions[i].second(*this, PM);
-}
-
-void PassManagerBuilder::addInitialAliasAnalysisPasses(
- legacy::PassManagerBase &PM) const {
- switch (UseCFLAA) {
+cl::opt<AttributorRunOption> AttributorRun(
+ "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE),
+ cl::desc("Enable the attributor inter-procedural deduction pass."),
+ cl::values(clEnumValN(AttributorRunOption::ALL, "all",
+ "enable all attributor runs"),
+ clEnumValN(AttributorRunOption::MODULE, "module",
+ "enable module-wide attributor runs"),
+ clEnumValN(AttributorRunOption::CGSCC, "cgscc",
+ "enable call graph SCC attributor runs"),
+ clEnumValN(AttributorRunOption::NONE, "none",
+ "disable attributor runs")));
+
+extern cl::opt<bool> EnableKnowledgeRetention;
+
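In a standalone tool that links this library, the cl::opt declarations above become ordinary command-line flags once cl::ParseCommandLineOptions has run. A minimal sketch follows (the tool name and arguments are illustrative, not part of this file):

#include "llvm/Support/CommandLine.h"

int main(int argc, char **argv) {
  // e.g.: mytool -enable-matrix -enable-constraint-elimination foo.ll
  llvm::cl::ParseCommandLineOptions(argc, argv, "pipeline flag demo\n");
  return 0;
}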
+PassManagerBuilder::PassManagerBuilder() {
+ OptLevel = 2;
+ SizeLevel = 0;
+ LibraryInfo = nullptr;
+ Inliner = nullptr;
+ DisableUnrollLoops = false;
+ SLPVectorize = false;
+ LoopVectorize = true;
+ LoopsInterleaved = true;
+ RerollLoops = RunLoopRerolling;
+ NewGVN = RunNewGVN;
+ LicmMssaOptCap = SetLicmMssaOptCap;
+ LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
+ DisableGVNLoadPRE = false;
+ ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
+ VerifyInput = false;
+ VerifyOutput = false;
+ MergeFunctions = false;
+ PrepareForLTO = false;
+ EnablePGOInstrGen = false;
+ EnablePGOCSInstrGen = false;
+ EnablePGOCSInstrUse = false;
+ PGOInstrGen = "";
+ PGOInstrUse = "";
+ PGOSampleUse = "";
+ PrepareForThinLTO = EnablePrepareForThinLTO;
+ PerformThinLTO = EnablePerformThinLTO;
+ DivergentTarget = false;
+ CallGraphProfile = true;
+}
+
+PassManagerBuilder::~PassManagerBuilder() {
+ delete LibraryInfo;
+ delete Inliner;
+}
+
+/// Set of global extensions, automatically added as part of the standard set.
+static ManagedStatic<
+ SmallVector<std::tuple<PassManagerBuilder::ExtensionPointTy,
+ PassManagerBuilder::ExtensionFn,
+ PassManagerBuilder::GlobalExtensionID>,
+ 8>>
+ GlobalExtensions;
+static PassManagerBuilder::GlobalExtensionID GlobalExtensionsCounter;
+
+/// Check if GlobalExtensions is constructed and not empty.
+/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger
+/// the construction of the object.
+static bool GlobalExtensionsNotEmpty() {
+ return GlobalExtensions.isConstructed() && !GlobalExtensions->empty();
+}
+
+PassManagerBuilder::GlobalExtensionID
+PassManagerBuilder::addGlobalExtension(PassManagerBuilder::ExtensionPointTy Ty,
+ PassManagerBuilder::ExtensionFn Fn) {
+ auto ExtensionID = GlobalExtensionsCounter++;
+ GlobalExtensions->push_back(std::make_tuple(Ty, std::move(Fn), ExtensionID));
+ return ExtensionID;
+}
+
+void PassManagerBuilder::removeGlobalExtension(
+ PassManagerBuilder::GlobalExtensionID ExtensionID) {
+ // RegisterStandardPasses may try to call this function after GlobalExtensions
+ // has already been destroyed; doing so should not generate an error.
+ if (!GlobalExtensions.isConstructed())
+ return;
+
+ auto GlobalExtension =
+ llvm::find_if(*GlobalExtensions, [ExtensionID](const auto &elem) {
+ return std::get<2>(elem) == ExtensionID;
+ });
+ assert(GlobalExtension != GlobalExtensions->end() &&
+ "The extension ID to be removed should always be valid.");
+
+ GlobalExtensions->erase(GlobalExtension);
+}
+
+void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
+ Extensions.push_back(std::make_pair(Ty, std::move(Fn)));
+}
+
+void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
+ legacy::PassManagerBase &PM) const {
+ if (GlobalExtensionsNotEmpty()) {
+ for (auto &Ext : *GlobalExtensions) {
+ if (std::get<0>(Ext) == ETy)
+ std::get<1>(Ext)(*this, PM);
+ }
+ }
+ for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
+ if (Extensions[i].first == ETy)
+ Extensions[i].second(*this, PM);
+}
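A minimal sketch of how out-of-tree code typically feeds these extension points: RegisterStandardPasses (declared next to PassManagerBuilder) wraps addGlobalExtension() and keeps the returned GlobalExtensionID so the callback can be unregistered again on destruction. createMyPeepholePass is a hypothetical factory used only for illustration.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

static void addMyPeephole(const llvm::PassManagerBuilder &Builder,
                          llvm::legacy::PassManagerBase &PM) {
  PM.add(createMyPeepholePass()); // hypothetical pass factory
}

static llvm::RegisterStandardPasses
    RegisterMyPeephole(llvm::PassManagerBuilder::EP_Peephole, addMyPeephole);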
+
+void PassManagerBuilder::addInitialAliasAnalysisPasses(
+ legacy::PassManagerBase &PM) const {
+ switch (UseCFLAA) {
case ::CFLAAType::Steensgaard:
- PM.add(createCFLSteensAAWrapperPass());
- break;
+ PM.add(createCFLSteensAAWrapperPass());
+ break;
case ::CFLAAType::Andersen:
- PM.add(createCFLAndersAAWrapperPass());
- break;
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
case ::CFLAAType::Both:
- PM.add(createCFLSteensAAWrapperPass());
- PM.add(createCFLAndersAAWrapperPass());
- break;
- default:
- break;
- }
-
- // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
- // BasicAliasAnalysis wins if they disagree. This is intended to help
- // support "obvious" type-punning idioms.
- PM.add(createTypeBasedAAWrapperPass());
- PM.add(createScopedNoAliasAAWrapperPass());
-}
-
-void PassManagerBuilder::populateFunctionPassManager(
- legacy::FunctionPassManager &FPM) {
- addExtensionsToPM(EP_EarlyAsPossible, FPM);
- FPM.add(createEntryExitInstrumenterPass());
-
- // Add LibraryInfo if we have some.
- if (LibraryInfo)
- FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
+ PM.add(createCFLSteensAAWrapperPass());
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
+ default:
+ break;
+ }
+
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ PM.add(createTypeBasedAAWrapperPass());
+ PM.add(createScopedNoAliasAAWrapperPass());
+}
+
+void PassManagerBuilder::populateFunctionPassManager(
+ legacy::FunctionPassManager &FPM) {
+ addExtensionsToPM(EP_EarlyAsPossible, FPM);
+ FPM.add(createEntryExitInstrumenterPass());
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo)
+ FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
// The backends do not handle matrix intrinsics currently.
// Make sure they are also lowered in O0.
// FIXME: A lightweight version of the pass should run in the backend
@@ -313,34 +313,34 @@ void PassManagerBuilder::populateFunctionPassManager(
if (EnableMatrix && OptLevel == 0)
FPM.add(createLowerMatrixIntrinsicsMinimalPass());
- if (OptLevel == 0) return;
-
- addInitialAliasAnalysisPasses(FPM);
-
- FPM.add(createCFGSimplificationPass());
- FPM.add(createSROAPass());
- FPM.add(createEarlyCSEPass());
- FPM.add(createLowerExpectIntrinsicPass());
-}
-
-// Do PGO instrumentation generation or use pass as the option specified.
-void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM,
- bool IsCS = false) {
- if (IsCS) {
- if (!EnablePGOCSInstrGen && !EnablePGOCSInstrUse)
- return;
- } else if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty())
- return;
-
- // Perform the preinline and cleanup passes for O1 and above.
- // We will not do this inline for context sensitive PGO (when IsCS is true).
+ if (OptLevel == 0) return;
+
+ addInitialAliasAnalysisPasses(FPM);
+
+ FPM.add(createCFGSimplificationPass());
+ FPM.add(createSROAPass());
+ FPM.add(createEarlyCSEPass());
+ FPM.add(createLowerExpectIntrinsicPass());
+}
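A minimal sketch (hypothetical driver code, assuming a Module M has already been parsed) of how a frontend usually drives the populate*() entry points defined in this file:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

static void runLegacyPipeline(llvm::Module &M) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  PMB.SizeLevel = 0;
  PMB.Inliner = llvm::createFunctionInliningPass(
      PMB.OptLevel, PMB.SizeLevel, /*DisableInlineHotCallSite=*/false);

  llvm::legacy::FunctionPassManager FPM(&M);
  llvm::legacy::PassManager MPM;
  PMB.populateFunctionPassManager(FPM);
  PMB.populateModulePassManager(MPM);

  FPM.doInitialization();
  for (llvm::Function &F : M)
    FPM.run(F);
  FPM.doFinalization();
  MPM.run(M);
}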
+
+// Add the PGO instrumentation generation or use passes, as specified by the options.
+void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM,
+ bool IsCS = false) {
+ if (IsCS) {
+ if (!EnablePGOCSInstrGen && !EnablePGOCSInstrUse)
+ return;
+ } else if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty())
+ return;
+
+ // Perform the preinline and cleanup passes for O1 and above.
+ // We will not do this inline for context sensitive PGO (when IsCS is true).
if (OptLevel > 0 && !DisablePreInliner && PGOSampleUse.empty() && !IsCS) {
- // Create preinline pass. We construct an InlineParams object and specify
- // the threshold here to avoid the command line options of the regular
- // inliner to influence pre-inlining. The only fields of InlineParams we
- // care about are DefaultThreshold and HintThreshold.
- InlineParams IP;
- IP.DefaultThreshold = PreInlineThreshold;
+ // Create preinline pass. We construct an InlineParams object and specify
+ // the threshold here to prevent the command line options of the regular
+ // inliner from influencing pre-inlining. The only fields of InlineParams we
+ // care about are DefaultThreshold and HintThreshold.
+ InlineParams IP;
+ IP.DefaultThreshold = PreInlineThreshold;
// FIXME: The hint threshold has the same value used by the regular inliner
// when not optimizing for size. This should probably be lowered after
// performance testing.
@@ -348,476 +348,476 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM,
// the instrumented binary unusably large. Even if PreInlineThreshold is not
// the correct threshold for -Oz, it is better than not running the preinliner.
IP.HintThreshold = SizeLevel > 0 ? PreInlineThreshold : 325;
-
- MPM.add(createFunctionInliningPass(IP));
- MPM.add(createSROAPass());
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- MPM.add(createInstructionCombiningPass()); // Combine silly seq's
- addExtensionsToPM(EP_Peephole, MPM);
- }
- if ((EnablePGOInstrGen && !IsCS) || (EnablePGOCSInstrGen && IsCS)) {
- MPM.add(createPGOInstrumentationGenLegacyPass(IsCS));
- // Add the profile lowering pass.
- InstrProfOptions Options;
- if (!PGOInstrGen.empty())
- Options.InstrProfileOutput = PGOInstrGen;
- Options.DoCounterPromotion = true;
- Options.UseBFIInPromotion = IsCS;
- MPM.add(createLoopRotatePass());
- MPM.add(createInstrProfilingLegacyPass(Options, IsCS));
- }
- if (!PGOInstrUse.empty())
- MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse, IsCS));
- // Indirect call promotion that promotes intra-module targets only.
- // For ThinLTO this is done earlier due to interactions with globalopt
- // for imported functions. We don't run this at -O0.
- if (OptLevel > 0 && !IsCS)
- MPM.add(
- createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
-}
-void PassManagerBuilder::addFunctionSimplificationPasses(
- legacy::PassManagerBase &MPM) {
- // Start of function pass.
- // Break up aggregate allocas, using SSAUpdater.
- assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!");
- MPM.add(createSROAPass());
- MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies
- if (EnableKnowledgeRetention)
- MPM.add(createAssumeSimplifyPass());
-
- if (OptLevel > 1) {
- if (EnableGVNHoist)
- MPM.add(createGVNHoistPass());
- if (EnableGVNSink) {
- MPM.add(createGVNSinkPass());
- MPM.add(createCFGSimplificationPass());
- }
- }
-
+
+ MPM.add(createFunctionInliningPass(IP));
+ MPM.add(createSROAPass());
+ MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Combine silly seq's
+ addExtensionsToPM(EP_Peephole, MPM);
+ }
+ if ((EnablePGOInstrGen && !IsCS) || (EnablePGOCSInstrGen && IsCS)) {
+ MPM.add(createPGOInstrumentationGenLegacyPass(IsCS));
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ if (!PGOInstrGen.empty())
+ Options.InstrProfileOutput = PGOInstrGen;
+ Options.DoCounterPromotion = true;
+ Options.UseBFIInPromotion = IsCS;
+ MPM.add(createLoopRotatePass());
+ MPM.add(createInstrProfilingLegacyPass(Options, IsCS));
+ }
+ if (!PGOInstrUse.empty())
+ MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse, IsCS));
+ // Indirect call promotion that promotes intra-module targets only.
+ // For ThinLTO this is done earlier due to interactions with globalopt
+ // for imported functions. We don't run this at -O0.
+ if (OptLevel > 0 && !IsCS)
+ MPM.add(
+ createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
+}
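A hedged sketch of how a driver might set the builder fields this function consumes; the profile file names are purely illustrative.

static void configurePGO(llvm::PassManagerBuilder &PMB, bool Instrument) {
  if (Instrument) {
    PMB.EnablePGOInstrGen = true;
    PMB.PGOInstrGen = "default_%m.profraw"; // illustrative raw-profile output
  } else {
    PMB.PGOInstrUse = "merged.profdata";    // illustrative .profdata path
  }
}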
+void PassManagerBuilder::addFunctionSimplificationPasses(
+ legacy::PassManagerBase &MPM) {
+ // Start of function pass.
+ // Break up aggregate allocas, using SSAUpdater.
+ assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!");
+ MPM.add(createSROAPass());
+ MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies
+ if (EnableKnowledgeRetention)
+ MPM.add(createAssumeSimplifyPass());
+
+ if (OptLevel > 1) {
+ if (EnableGVNHoist)
+ MPM.add(createGVNHoistPass());
+ if (EnableGVNSink) {
+ MPM.add(createGVNSinkPass());
+ MPM.add(createCFGSimplificationPass());
+ }
+ }
+
if (EnableConstraintElimination)
MPM.add(createConstraintEliminationPass());
- if (OptLevel > 1) {
- // Speculative execution if the target has divergent branches; otherwise nop.
- MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
-
- MPM.add(createJumpThreadingPass()); // Thread jumps.
- MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
- }
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- // Combine silly seq's
- if (OptLevel > 2)
- MPM.add(createAggressiveInstCombinerPass());
- MPM.add(createInstructionCombiningPass());
- if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
- MPM.add(createLibCallsShrinkWrapPass());
- addExtensionsToPM(EP_Peephole, MPM);
-
- // Optimize memory intrinsic calls based on the profiled size information.
- if (SizeLevel == 0)
- MPM.add(createPGOMemOPSizeOptLegacyPass());
-
- // TODO: Investigate the cost/benefit of tail call elimination on debugging.
- if (OptLevel > 1)
- MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- MPM.add(createReassociatePass()); // Reassociate expressions
-
- // Begin the loop pass pipeline.
- if (EnableSimpleLoopUnswitch) {
- // The simple loop unswitch pass relies on separate cleanup passes. Schedule
- // them first so when we re-process a loop they run before other loop
- // passes.
- MPM.add(createLoopInstSimplifyPass());
- MPM.add(createLoopSimplifyCFGPass());
- }
- // Rotate Loop - disable header duplication at -Oz
+ if (OptLevel > 1) {
+ // Speculative execution if the target has divergent branches; otherwise nop.
+ MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
+
+ MPM.add(createJumpThreadingPass()); // Thread jumps.
+ MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+ }
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ // Combine silly seq's
+ if (OptLevel > 2)
+ MPM.add(createAggressiveInstCombinerPass());
+ MPM.add(createInstructionCombiningPass());
+ if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
+ MPM.add(createLibCallsShrinkWrapPass());
+ addExtensionsToPM(EP_Peephole, MPM);
+
+ // Optimize memory intrinsic calls based on the profiled size information.
+ if (SizeLevel == 0)
+ MPM.add(createPGOMemOPSizeOptLegacyPass());
+
+ // TODO: Investigate the cost/benefit of tail call elimination on debugging.
+ if (OptLevel > 1)
+ MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createReassociatePass()); // Reassociate expressions
+
+ // Begin the loop pass pipeline.
+ if (EnableSimpleLoopUnswitch) {
+ // The simple loop unswitch pass relies on separate cleanup passes. Schedule
+ // them first so when we re-process a loop they run before other loop
+ // passes.
+ MPM.add(createLoopInstSimplifyPass());
+ MPM.add(createLoopSimplifyCFGPass());
+ }
+ // Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
- // TODO: Investigate promotion cap for O1.
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- if (EnableSimpleLoopUnswitch)
- MPM.add(createSimpleLoopUnswitchLegacyPass());
- else
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
- // FIXME: We break the loop pass pipeline here in order to do full
- // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
- // need for this.
- MPM.add(createCFGSimplificationPass());
- MPM.add(createInstructionCombiningPass());
- // We resume loop passes creating a second loop pipeline here.
+ // TODO: Investigate promotion cap for O1.
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ if (EnableSimpleLoopUnswitch)
+ MPM.add(createSimpleLoopUnswitchLegacyPass());
+ else
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ // FIXME: We break the loop pass pipeline here in order to do full
+ // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
+ // need for this.
+ MPM.add(createCFGSimplificationPass());
+ MPM.add(createInstructionCombiningPass());
+ // We resume loop passes creating a second loop pipeline here.
if (EnableLoopFlatten) {
MPM.add(createLoopFlattenPass()); // Flatten loops
MPM.add(createLoopSimplifyCFGPass());
}
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
- MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
- addExtensionsToPM(EP_LateLoopOptimizations, MPM);
- MPM.add(createLoopDeletionPass()); // Delete dead loops
-
- if (EnableLoopInterchange)
- MPM.add(createLoopInterchangePass()); // Interchange loops
-
+ MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+ addExtensionsToPM(EP_LateLoopOptimizations, MPM);
+ MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+ if (EnableLoopInterchange)
+ MPM.add(createLoopInterchangePass()); // Interchange loops
+
// Unroll small loops and perform peeling.
- MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
- addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
- // This ends the loop pass pipelines.
-
+ MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
+ addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+ // This ends the loop pass pipelines.
+
// Break up allocas that may now be splittable after loop unrolling.
MPM.add(createSROAPass());
- if (OptLevel > 1) {
- MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
- MPM.add(NewGVN ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
- }
- MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
- MPM.add(createSCCPPass()); // Constant prop with SCCP
-
+ if (OptLevel > 1) {
+ MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
+ MPM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+ }
+ MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
+ MPM.add(createSCCPPass()); // Constant prop with SCCP
+
if (EnableConstraintElimination)
MPM.add(createConstraintEliminationPass());
- // Delete dead bit computations (instcombine runs after to fold away the dead
- // computations, and then ADCE will run later to exploit any new DCE
- // opportunities that creates).
- MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations
-
- // Run instcombine after redundancy elimination to exploit opportunities
- // opened up by them.
- MPM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, MPM);
- if (OptLevel > 1) {
- MPM.add(createJumpThreadingPass()); // Thread jumps
- MPM.add(createCorrelatedValuePropagationPass());
+ // Delete dead bit computations (instcombine runs after to fold away the dead
+ // computations, and then ADCE will run later to exploit any new DCE
+ // opportunities that creates).
+ MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations
+
+ // Run instcombine after redundancy elimination to exploit opportunities
+ // opened up by them.
+ MPM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, MPM);
+ if (OptLevel > 1) {
+ MPM.add(createJumpThreadingPass()); // Thread jumps
+ MPM.add(createCorrelatedValuePropagationPass());
}
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
// TODO: Investigate if this is too expensive at O1.
if (OptLevel > 1) {
- MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- }
-
- addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
-
- if (RerollLoops)
- MPM.add(createLoopRerollPass());
-
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- // Clean up after everything.
- MPM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, MPM);
-
- if (EnableCHR && OptLevel >= 3 &&
- (!PGOInstrUse.empty() || !PGOSampleUse.empty() || EnablePGOCSInstrGen))
- MPM.add(createControlHeightReductionLegacyPass());
-}
-
-void PassManagerBuilder::populateModulePassManager(
- legacy::PassManagerBase &MPM) {
- // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
- // is handled separately, so just check this is not the ThinLTO post-link.
- bool DefaultOrPreLinkPipeline = !PerformThinLTO;
-
+ MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ }
+
+ addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
+
+ if (RerollLoops)
+ MPM.add(createLoopRerollPass());
+
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ // Clean up after everything.
+ MPM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, MPM);
+
+ if (EnableCHR && OptLevel >= 3 &&
+ (!PGOInstrUse.empty() || !PGOSampleUse.empty() || EnablePGOCSInstrGen))
+ MPM.add(createControlHeightReductionLegacyPass());
+}
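The hooks used above (EP_Peephole, EP_LateLoopOptimizations, EP_LoopOptimizerEnd, EP_ScalarOptimizerLate) can also be attached per-builder rather than globally, via addExtension(). A hedged sketch, where createMyLoopAnalysisPrinter is a hypothetical factory:

static void hookLoopOptimizerEnd(llvm::PassManagerBuilder &PMB) {
  PMB.addExtension(llvm::PassManagerBuilder::EP_LoopOptimizerEnd,
                   [](const llvm::PassManagerBuilder &,
                      llvm::legacy::PassManagerBase &PM) {
                     PM.add(createMyLoopAnalysisPrinter()); // hypothetical pass
                   });
}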
+
+void PassManagerBuilder::populateModulePassManager(
+ legacy::PassManagerBase &MPM) {
+ // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
+ // is handled separately, so just check this is not the ThinLTO post-link.
+ bool DefaultOrPreLinkPipeline = !PerformThinLTO;
+
MPM.add(createAnnotation2MetadataLegacyPass());
- if (!PGOSampleUse.empty()) {
- MPM.add(createPruneEHPass());
- // In ThinLTO mode, when flattened profile is used, all the available
- // profile information will be annotated in PreLink phase so there is
- // no need to load the profile again in PostLink.
- if (!(FlattenedProfileUsed && PerformThinLTO))
- MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
- }
-
- // Allow forcing function attributes as a debugging and tuning aid.
- MPM.add(createForceFunctionAttrsLegacyPass());
-
- // If all optimizations are disabled, just run the always-inline pass and,
- // if enabled, the function merging pass.
- if (OptLevel == 0) {
- addPGOInstrPasses(MPM);
- if (Inliner) {
- MPM.add(Inliner);
- Inliner = nullptr;
- }
-
- // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
- // creates a CGSCC pass manager, but we don't want to add extensions into
- // that pass manager. To prevent this we insert a no-op module pass to reset
- // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
- // builds. The function merging pass is
- if (MergeFunctions)
- MPM.add(createMergeFunctionsPass());
- else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
- MPM.add(createBarrierNoopPass());
-
- if (PerformThinLTO) {
- MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
- // Drop available_externally and unreferenced globals. This is necessary
- // with ThinLTO in order to avoid leaving undefined references to dead
- // globals in the object file.
- MPM.add(createEliminateAvailableExternallyPass());
- MPM.add(createGlobalDCEPass());
- }
-
- addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
-
- if (PrepareForLTO || PrepareForThinLTO) {
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to export them in the summary.
- // This has to be done after we add the extensions to the pass manager
- // as there could be passes (e.g. Adddress sanitizer) which introduce
- // new unnamed globals.
- MPM.add(createNameAnonGlobalPass());
- }
+ if (!PGOSampleUse.empty()) {
+ MPM.add(createPruneEHPass());
+ // In ThinLTO mode, when a flattened profile is used, all the available
+ // profile information will be annotated in the PreLink phase, so there is
+ // no need to load the profile again in the PostLink phase.
+ if (!(FlattenedProfileUsed && PerformThinLTO))
+ MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
+ }
+
+ // Allow forcing function attributes as a debugging and tuning aid.
+ MPM.add(createForceFunctionAttrsLegacyPass());
+
+ // If all optimizations are disabled, just run the always-inline pass and,
+ // if enabled, the function merging pass.
+ if (OptLevel == 0) {
+ addPGOInstrPasses(MPM);
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ }
+
+ // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
+ // creates a CGSCC pass manager, but we don't want to add extensions into
+ // that pass manager. To prevent this we insert a no-op module pass to reset
+ // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
+ // builds. The function merging pass is
+ if (MergeFunctions)
+ MPM.add(createMergeFunctionsPass());
+ else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
+ MPM.add(createBarrierNoopPass());
+
+ if (PerformThinLTO) {
+ MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
+ // Drop available_externally and unreferenced globals. This is necessary
+ // with ThinLTO in order to avoid leaving undefined references to dead
+ // globals in the object file.
+ MPM.add(createEliminateAvailableExternallyPass());
+ MPM.add(createGlobalDCEPass());
+ }
+
+ addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
+
+ if (PrepareForLTO || PrepareForThinLTO) {
+ MPM.add(createCanonicalizeAliasesPass());
+ // Rename anon globals to be able to export them in the summary.
+ // This has to be done after we add the extensions to the pass manager
+ // as there could be passes (e.g. Address Sanitizer) which introduce
+ // new unnamed globals.
+ MPM.add(createNameAnonGlobalPass());
+ }
MPM.add(createAnnotationRemarksLegacyPass());
- return;
- }
-
- // Add LibraryInfo if we have some.
- if (LibraryInfo)
- MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
- addInitialAliasAnalysisPasses(MPM);
-
- // For ThinLTO there are two passes of indirect call promotion. The
- // first is during the compile phase when PerformThinLTO=false and
- // intra-module indirect call targets are promoted. The second is during
- // the ThinLTO backend when PerformThinLTO=true, when we promote imported
- // inter-module indirect calls. For that we perform indirect call promotion
- // earlier in the pass pipeline, here before globalopt. Otherwise imported
- // available_externally functions look unreferenced and are removed.
- if (PerformThinLTO) {
- MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
- !PGOSampleUse.empty()));
- MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
- }
-
- // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops
- // as it will change the CFG too much to make the 2nd profile annotation
- // in backend more difficult.
- bool PrepareForThinLTOUsingPGOSampleProfile =
- PrepareForThinLTO && !PGOSampleUse.empty();
- if (PrepareForThinLTOUsingPGOSampleProfile)
- DisableUnrollLoops = true;
-
- // Infer attributes about declarations if possible.
- MPM.add(createInferFunctionAttrsLegacyPass());
-
- // Infer attributes on declarations, call sites, arguments, etc.
- if (AttributorRun & AttributorRunOption::MODULE)
- MPM.add(createAttributorLegacyPass());
-
- addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
-
- if (OptLevel > 2)
- MPM.add(createCallSiteSplittingPass());
-
- MPM.add(createIPSCCPPass()); // IP SCCP
- MPM.add(createCalledValuePropagationPass());
-
- MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
- // Promote any localized global vars.
- MPM.add(createPromoteMemoryToRegisterPass());
-
- MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
-
- MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE
- addExtensionsToPM(EP_Peephole, MPM);
- MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
-
- // For SamplePGO in ThinLTO compile phase, we do not want to do indirect
- // call promotion as it will change the CFG too much to make the 2nd
- // profile annotation in backend more difficult.
- // PGO instrumentation is added during the compile phase for ThinLTO, do
- // not run it a second time
- if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile)
- addPGOInstrPasses(MPM);
-
- // Create profile COMDAT variables. Lld linker wants to see all variables
- // before the LTO/ThinLTO link since it needs to resolve symbols/comdats.
- if (!PerformThinLTO && EnablePGOCSInstrGen)
- MPM.add(createPGOInstrumentationGenCreateVarLegacyPass(PGOInstrGen));
-
- // We add a module alias analysis pass here. In part due to bugs in the
- // analysis infrastructure this "works" in that the analysis stays alive
- // for the entire SCC pass run below.
- MPM.add(createGlobalsAAWrapperPass());
-
- // Start of CallGraph SCC passes.
- MPM.add(createPruneEHPass()); // Remove dead EH info
- bool RunInliner = false;
- if (Inliner) {
- MPM.add(Inliner);
- Inliner = nullptr;
- RunInliner = true;
- }
-
- // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
- if (AttributorRun & AttributorRunOption::CGSCC)
- MPM.add(createAttributorCGSCCLegacyPass());
-
- // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
- // there are no OpenMP runtime calls present in the module.
- if (OptLevel > 1)
- MPM.add(createOpenMPOptLegacyPass());
-
- MPM.add(createPostOrderFunctionAttrsLegacyPass());
- if (OptLevel > 2)
- MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
-
- addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
- addFunctionSimplificationPasses(MPM);
-
- // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
- // pass manager that we are specifically trying to avoid. To prevent this
- // we must insert a no-op module pass to reset the pass manager.
- MPM.add(createBarrierNoopPass());
-
- if (RunPartialInlining)
- MPM.add(createPartialInliningPass());
-
- if (OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO)
- // Remove avail extern fns and globals definitions if we aren't
- // compiling an object file for later LTO. For LTO we want to preserve
- // these so they are eligible for inlining at link-time. Note if they
- // are unreferenced they will be removed by GlobalDCE later, so
- // this only impacts referenced available externally globals.
- // Eventually they will be suppressed during codegen, but eliminating
- // here enables more opportunity for GlobalDCE as it may make
- // globals referenced by available external functions dead
- // and saves running remaining passes on the eliminated functions.
- MPM.add(createEliminateAvailableExternallyPass());
-
- // CSFDO instrumentation and use pass. Don't invoke this for Prepare pass
- // for LTO and ThinLTO -- The actual pass will be called after all inlines
- // are performed.
- // Need to do this after COMDAT variables have been eliminated,
- // (i.e. after EliminateAvailableExternallyPass).
- if (!(PrepareForLTO || PrepareForThinLTO))
- addPGOInstrPasses(MPM, /* IsCS */ true);
-
- if (EnableOrderFileInstrumentation)
- MPM.add(createInstrOrderFilePass());
-
- MPM.add(createReversePostOrderFunctionAttrsPass());
-
- // The inliner performs some kind of dead code elimination as it goes,
- // but there are cases that are not really caught by it. We might
- // at some point consider teaching the inliner about them, but it
- // is OK for now to run GlobalOpt + GlobalDCE in tandem as their
- // benefits generally outweight the cost, making the whole pipeline
- // faster.
- if (RunInliner) {
- MPM.add(createGlobalOptimizerPass());
- MPM.add(createGlobalDCEPass());
- }
-
- // If we are planning to perform ThinLTO later, let's not bloat the code with
- // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
- // during ThinLTO and perform the rest of the optimizations afterward.
- if (PrepareForThinLTO) {
- // Ensure we perform any last passes, but do so before renaming anonymous
- // globals in case the passes add any.
- addExtensionsToPM(EP_OptimizerLast, MPM);
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to export them in the summary.
- MPM.add(createNameAnonGlobalPass());
- return;
- }
-
- if (PerformThinLTO)
- // Optimize globals now when performing ThinLTO, this enables more
- // optimizations later.
- MPM.add(createGlobalOptimizerPass());
-
- // Scheduling LoopVersioningLICM when inlining is over, because after that
- // we may see more accurate aliasing. Reason to run this late is that too
- // early versioning may prevent further inlining due to increase of code
- // size. By placing it just after inlining other optimizations which runs
- // later might get benefit of no-alias assumption in clone loop.
- if (UseLoopVersioningLICM) {
- MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- }
-
- // We add a fresh GlobalsModRef run at this point. This is particularly
- // useful as the above will have inlined, DCE'ed, and function-attr
- // propagated everything. We should at this point have a reasonably minimal
- // and richly annotated call graph. By computing aliasing and mod/ref
- // information for all local globals here, the late loop passes and notably
- // the vectorizer will be able to use them to help recognize vectorizable
- // memory operations.
- //
- // Note that this relies on a bug in the pass manager which preserves
- // a module analysis into a function pass pipeline (and throughout it) so
- // long as the first function pass doesn't invalidate the module analysis.
- // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
- // this to work. Fortunately, it is trivial to preserve AliasAnalysis
- // (doing nothing preserves it as it is required to be conservatively
- // correct in the face of IR changes).
- MPM.add(createGlobalsAAWrapperPass());
-
- MPM.add(createFloat2IntPass());
- MPM.add(createLowerConstantIntrinsicsPass());
-
- if (EnableMatrix) {
- MPM.add(createLowerMatrixIntrinsicsPass());
- // CSE the pointer arithmetic of the column vectors. This allows alias
- // analysis to establish no-aliasing between loads and stores of different
- // columns of the same matrix.
- MPM.add(createEarlyCSEPass(false));
- }
-
- addExtensionsToPM(EP_VectorizerStart, MPM);
-
- // Re-rotate loops in all our loop nests. These may have fallout out of
- // rotated form due to GVN or other transformations, and the vectorizer relies
- // on the rotated form. Disable header duplication at -Oz.
+ return;
+ }
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo)
+ MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ addInitialAliasAnalysisPasses(MPM);
+
+ // For ThinLTO there are two passes of indirect call promotion. The
+ // first is during the compile phase when PerformThinLTO=false and
+ // intra-module indirect call targets are promoted. The second is during
+ // the ThinLTO backend when PerformThinLTO=true, when we promote imported
+ // inter-module indirect calls. For that we perform indirect call promotion
+ // earlier in the pass pipeline, here before globalopt. Otherwise imported
+ // available_externally functions look unreferenced and are removed.
+ if (PerformThinLTO) {
+ MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
+ !PGOSampleUse.empty()));
+ MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
+ }
+
+ // For SamplePGO in the ThinLTO compile phase, we do not want to unroll
+ // loops, as that would change the CFG too much and make the 2nd profile
+ // annotation in the backend more difficult.
+ bool PrepareForThinLTOUsingPGOSampleProfile =
+ PrepareForThinLTO && !PGOSampleUse.empty();
+ if (PrepareForThinLTOUsingPGOSampleProfile)
+ DisableUnrollLoops = true;
+
+ // Infer attributes about declarations if possible.
+ MPM.add(createInferFunctionAttrsLegacyPass());
+
+ // Infer attributes on declarations, call sites, arguments, etc.
+ if (AttributorRun & AttributorRunOption::MODULE)
+ MPM.add(createAttributorLegacyPass());
+
+ addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+
+ if (OptLevel > 2)
+ MPM.add(createCallSiteSplittingPass());
+
+ MPM.add(createIPSCCPPass()); // IP SCCP
+ MPM.add(createCalledValuePropagationPass());
+
+ MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
+ // Promote any localized global vars.
+ MPM.add(createPromoteMemoryToRegisterPass());
+
+ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+
+ MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
+
+ // For SamplePGO in the ThinLTO compile phase, we do not want to do indirect
+ // call promotion, as that would change the CFG too much and make the 2nd
+ // profile annotation in the backend more difficult.
+ // PGO instrumentation is added during the compile phase for ThinLTO; do
+ // not run it a second time.
+ if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile)
+ addPGOInstrPasses(MPM);
+
+ // Create profile COMDAT variables. Lld linker wants to see all variables
+ // before the LTO/ThinLTO link since it needs to resolve symbols/comdats.
+ if (!PerformThinLTO && EnablePGOCSInstrGen)
+ MPM.add(createPGOInstrumentationGenCreateVarLegacyPass(PGOInstrGen));
+
+ // We add a module alias analysis pass here. In part due to bugs in the
+ // analysis infrastructure this "works" in that the analysis stays alive
+ // for the entire SCC pass run below.
+ MPM.add(createGlobalsAAWrapperPass());
+
+ // Start of CallGraph SCC passes.
+ MPM.add(createPruneEHPass()); // Remove dead EH info
+ bool RunInliner = false;
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ RunInliner = true;
+ }
+
+ // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ MPM.add(createAttributorCGSCCLegacyPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (OptLevel > 1)
+ MPM.add(createOpenMPOptLegacyPass());
+
+ MPM.add(createPostOrderFunctionAttrsLegacyPass());
+ if (OptLevel > 2)
+ MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
+
+ addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
+ addFunctionSimplificationPasses(MPM);
+
+ // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
+ // pass manager that we are specifically trying to avoid. To prevent this
+ // we must insert a no-op module pass to reset the pass manager.
+ MPM.add(createBarrierNoopPass());
+
+ if (RunPartialInlining)
+ MPM.add(createPartialInliningPass());
+
+ if (OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO)
+ // Remove avail extern fns and globals definitions if we aren't
+ // compiling an object file for later LTO. For LTO we want to preserve
+ // these so they are eligible for inlining at link-time. Note if they
+ // are unreferenced they will be removed by GlobalDCE later, so
+ // this only impacts referenced available externally globals.
+ // Eventually they will be suppressed during codegen, but eliminating
+ // here enables more opportunity for GlobalDCE as it may make
+ // globals referenced by available external functions dead
+ // and saves running remaining passes on the eliminated functions.
+ MPM.add(createEliminateAvailableExternallyPass());
+
+ // CSFDO instrumentation and use pass. Don't invoke this for Prepare pass
+ // for LTO and ThinLTO -- The actual pass will be called after all inlines
+ // are performed.
+ // Need to do this after COMDAT variables have been eliminated,
+ // (i.e. after EliminateAvailableExternallyPass).
+ if (!(PrepareForLTO || PrepareForThinLTO))
+ addPGOInstrPasses(MPM, /* IsCS */ true);
+
+ if (EnableOrderFileInstrumentation)
+ MPM.add(createInstrOrderFilePass());
+
+ MPM.add(createReversePostOrderFunctionAttrsPass());
+
+ // The inliner performs some kind of dead code elimination as it goes,
+ // but there are cases that are not really caught by it. We might
+ // at some point consider teaching the inliner about them, but it
+ // is OK for now to run GlobalOpt + GlobalDCE in tandem as their
+ // benefits generally outweigh the cost, making the whole pipeline
+ // faster.
+ if (RunInliner) {
+ MPM.add(createGlobalOptimizerPass());
+ MPM.add(createGlobalDCEPass());
+ }
+
+ // If we are planning to perform ThinLTO later, let's not bloat the code with
+ // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
+ // during ThinLTO and perform the rest of the optimizations afterward.
+ if (PrepareForThinLTO) {
+ // Ensure we perform any last passes, but do so before renaming anonymous
+ // globals in case the passes add any.
+ addExtensionsToPM(EP_OptimizerLast, MPM);
+ MPM.add(createCanonicalizeAliasesPass());
+ // Rename anon globals to be able to export them in the summary.
+ MPM.add(createNameAnonGlobalPass());
+ return;
+ }
+
+ if (PerformThinLTO)
+ // Optimize globals now when performing ThinLTO, this enables more
+ // optimizations later.
+ MPM.add(createGlobalOptimizerPass());
+
+ // Schedule LoopVersioningLICM once inlining is over, because after that we
+ // may see more accurate aliasing. The reason to run this late is that too
+ // early versioning may prevent further inlining due to the increase in code
+ // size. By placing it just after inlining, later optimizations can benefit
+ // from the no-alias assumption in the cloned loop.
+ if (UseLoopVersioningLICM) {
+ MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ }
+
+ // We add a fresh GlobalsModRef run at this point. This is particularly
+ // useful as the above will have inlined, DCE'ed, and function-attr
+ // propagated everything. We should at this point have a reasonably minimal
+ // and richly annotated call graph. By computing aliasing and mod/ref
+ // information for all local globals here, the late loop passes and notably
+ // the vectorizer will be able to use them to help recognize vectorizable
+ // memory operations.
+ //
+ // Note that this relies on a bug in the pass manager which preserves
+ // a module analysis into a function pass pipeline (and throughout it) so
+ // long as the first function pass doesn't invalidate the module analysis.
+ // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
+ // this to work. Fortunately, it is trivial to preserve AliasAnalysis
+ // (doing nothing preserves it as it is required to be conservatively
+ // correct in the face of IR changes).
+ MPM.add(createGlobalsAAWrapperPass());
+
+ MPM.add(createFloat2IntPass());
+ MPM.add(createLowerConstantIntrinsicsPass());
+
+ if (EnableMatrix) {
+ MPM.add(createLowerMatrixIntrinsicsPass());
+ // CSE the pointer arithmetic of the column vectors. This allows alias
+ // analysis to establish no-aliasing between loads and stores of different
+ // columns of the same matrix.
+ MPM.add(createEarlyCSEPass(false));
+ }
+
+ addExtensionsToPM(EP_VectorizerStart, MPM);
+
+ // Re-rotate loops in all our loop nests. These may have fallen out of
+ // rotated form due to GVN or other transformations, and the vectorizer relies
+ // on the rotated form. Disable header duplication at -Oz.
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-
- // Distribute loops to allow partial vectorization. I.e. isolate dependences
- // into separate loop that would otherwise inhibit vectorization. This is
- // currently only performed for loops marked with the metadata
- // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
- MPM.add(createLoopDistributePass());
-
- MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
- // Eliminate loads by forwarding stores from the previous iteration to loads
- // of the current iteration.
- MPM.add(createLoopLoadEliminationPass());
-
- // FIXME: Because of #pragma vectorize enable, the passes below are always
- // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
- // on -O1 and no #pragma is found). Would be good to have these two passes
- // as function calls, so that we can only pass them when the vectorizer
- // changed the code.
- MPM.add(createInstructionCombiningPass());
- if (OptLevel > 1 && ExtraVectorizerPasses) {
- // At higher optimization levels, try to clean up any runtime overlap and
- // alignment checks inserted by the vectorizer. We want to track correllated
- // runtime checks for two inner loops in the same outer loop, fold any
- // common computations, hoist loop-invariant aspects out of any outer loop,
- // and unswitch the runtime checks if possible. Once hoisted, we may have
- // dead (or speculatable) control flows or more combining opportunities.
- MPM.add(createEarlyCSEPass());
- MPM.add(createCorrelatedValuePropagationPass());
- MPM.add(createInstructionCombiningPass());
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
- MPM.add(createCFGSimplificationPass());
- MPM.add(createInstructionCombiningPass());
- }
-
- // Cleanup after loop vectorization, etc. Simplification passes like CVP and
- // GVN, loop transforms, and others have already run, so it's now better to
- // convert to more optimized IR using more aggressive simplify CFG options.
- // The extra sinking transform can create larger basic blocks, so do this
- // before SLP vectorization.
+
+ // Distribute loops to allow partial vectorization. I.e. isolate dependences
+ // into separate loop that would otherwise inhibit vectorization. This is
+ // currently only performed for loops marked with the metadata
+ // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
+ MPM.add(createLoopDistributePass());
+
+ MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+ // Eliminate loads by forwarding stores from the previous iteration to loads
+ // of the current iteration.
+ MPM.add(createLoopLoadEliminationPass());
+
+ // FIXME: Because of #pragma vectorize enable, the passes below are always
+ // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
+ // on -O1 and no #pragma is found). Would be good to have these two passes
+ // as function calls, so that we can only pass them when the vectorizer
+ // changed the code.
+ MPM.add(createInstructionCombiningPass());
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ // At higher optimization levels, try to clean up any runtime overlap and
+ // alignment checks inserted by the vectorizer. We want to track correlated
+ // runtime checks for two inner loops in the same outer loop, fold any
+ // common computations, hoist loop-invariant aspects out of any outer loop,
+ // and unswitch the runtime checks if possible. Once hoisted, we may have
+ // dead (or speculatable) control flows or more combining opportunities.
+ MPM.add(createEarlyCSEPass());
+ MPM.add(createCorrelatedValuePropagationPass());
+ MPM.add(createInstructionCombiningPass());
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ MPM.add(createCFGSimplificationPass());
+ MPM.add(createInstructionCombiningPass());
+ }
+
+ // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+ // GVN, loop transforms, and others have already run, so it's now better to
+ // convert to more optimized IR using more aggressive simplify CFG options.
+ // The extra sinking transform can create larger basic blocks, so do this
+ // before SLP vectorization.
// FIXME: study whether hoisting and/or sinking of common instructions should
// be delayed until after SLP vectorizer.
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
@@ -826,464 +826,464 @@ void PassManagerBuilder::populateModulePassManager(
.needCanonicalLoops(false)
.hoistCommonInsts(true)
.sinkCommonInsts(true)));
-
- if (SLPVectorize) {
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
- if (OptLevel > 1 && ExtraVectorizerPasses) {
- MPM.add(createEarlyCSEPass());
- }
- }
-
- // Enhance/cleanup vector code.
- MPM.add(createVectorCombinePass());
-
- addExtensionsToPM(EP_Peephole, MPM);
- MPM.add(createInstructionCombiningPass());
-
- if (EnableUnrollAndJam && !DisableUnrollLoops) {
- // Unroll and Jam. We do this before unroll but need to be in a separate
- // loop pass manager in order for the outer loop to be processed by
- // unroll and jam before the inner loop is unrolled.
- MPM.add(createLoopUnrollAndJamPass(OptLevel));
- }
-
- // Unroll small loops
- MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
-
- if (!DisableUnrollLoops) {
- // LoopUnroll may generate some redundency to cleanup.
- MPM.add(createInstructionCombiningPass());
-
- // Runtime unrolling will introduce runtime check in loop prologue. If the
- // unrolled loop is a inner loop, then the prologue will be inside the
- // outer loop. LICM pass can help to promote the runtime check out if the
- // checked value is loop invariant.
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- }
-
- MPM.add(createWarnMissedTransformationsPass());
-
- // After vectorization and unrolling, assume intrinsics may tell us more
- // about pointer alignments.
- MPM.add(createAlignmentFromAssumptionsPass());
-
- // FIXME: We shouldn't bother with this anymore.
- MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
-
- // GlobalOpt already deletes dead functions and globals, at -O2 try a
- // late pass of GlobalDCE. It is capable of deleting dead cycles.
- if (OptLevel > 1) {
- MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
- MPM.add(createConstantMergePass()); // Merge dup global constants
- }
-
- // See comment in the new PM for justification of scheduling splitting at
- // this stage (\ref buildModuleSimplificationPipeline).
- if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO))
- MPM.add(createHotColdSplittingPass());
-
+
+ if (SLPVectorize) {
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ MPM.add(createEarlyCSEPass());
+ }
+ }
+
+ // Enhance/cleanup vector code.
+ MPM.add(createVectorCombinePass());
+
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createInstructionCombiningPass());
+
+ if (EnableUnrollAndJam && !DisableUnrollLoops) {
+ // Unroll and Jam. We do this before unroll but need to be in a separate
+ // loop pass manager in order for the outer loop to be processed by
+ // unroll and jam before the inner loop is unrolled.
+ MPM.add(createLoopUnrollAndJamPass(OptLevel));
+ }
+
+ // Unroll small loops
+ MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
+
+ if (!DisableUnrollLoops) {
+ // LoopUnroll may generate some redundancy to clean up.
+ MPM.add(createInstructionCombiningPass());
+
+ // Runtime unrolling will introduce a runtime check in the loop prologue. If
+ // the unrolled loop is an inner loop, then the prologue will be inside the
+ // outer loop. The LICM pass can help to promote the runtime check out if the
+ // checked value is loop invariant.
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ }
+
+ MPM.add(createWarnMissedTransformationsPass());
+
+ // After vectorization and unrolling, assume intrinsics may tell us more
+ // about pointer alignments.
+ MPM.add(createAlignmentFromAssumptionsPass());
+
+ // FIXME: We shouldn't bother with this anymore.
+ MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+
+ // GlobalOpt already deletes dead functions and globals, at -O2 try a
+ // late pass of GlobalDCE. It is capable of deleting dead cycles.
+ if (OptLevel > 1) {
+ MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
+ MPM.add(createConstantMergePass()); // Merge dup global constants
+ }
+
+ // See comment in the new PM for justification of scheduling splitting at
+ // this stage (\ref buildModuleSimplificationPipeline).
+ if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO))
+ MPM.add(createHotColdSplittingPass());
+
if (EnableIROutliner)
MPM.add(createIROutlinerPass());
- if (MergeFunctions)
- MPM.add(createMergeFunctionsPass());
-
- // Add Module flag "CG Profile" based on Branch Frequency Information.
- if (CallGraphProfile)
- MPM.add(createCGProfileLegacyPass());
-
- // LoopSink pass sinks instructions hoisted by LICM, which serves as a
- // canonicalization pass that enables other optimizations. As a result,
- // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
- // result too early.
- MPM.add(createLoopSinkPass());
- // Get rid of LCSSA nodes.
- MPM.add(createInstSimplifyLegacyPass());
-
- // This hoists/decomposes div/rem ops. It should run after other sink/hoist
- // passes to avoid re-sinking, but before SimplifyCFG because it can allow
- // flattening of blocks.
- MPM.add(createDivRemPairsPass());
-
- // LoopSink (and other loop passes since the last simplifyCFG) might have
- // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
- MPM.add(createCFGSimplificationPass());
-
- addExtensionsToPM(EP_OptimizerLast, MPM);
-
- if (PrepareForLTO) {
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to handle them in the summary
- MPM.add(createNameAnonGlobalPass());
- }
+ if (MergeFunctions)
+ MPM.add(createMergeFunctionsPass());
+
+ // Add Module flag "CG Profile" based on Branch Frequency Information.
+ if (CallGraphProfile)
+ MPM.add(createCGProfileLegacyPass());
+
+ // LoopSink pass sinks instructions hoisted by LICM, which serves as a
+ // canonicalization pass that enables other optimizations. As a result,
+ // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
+ // result too early.
+ MPM.add(createLoopSinkPass());
+ // Get rid of LCSSA nodes.
+ MPM.add(createInstSimplifyLegacyPass());
+
+ // This hoists/decomposes div/rem ops. It should run after other sink/hoist
+ // passes to avoid re-sinking, but before SimplifyCFG because it can allow
+ // flattening of blocks.
+ MPM.add(createDivRemPairsPass());
+
+ // LoopSink (and other loop passes since the last simplifyCFG) might have
+ // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
+ MPM.add(createCFGSimplificationPass());
+
+ addExtensionsToPM(EP_OptimizerLast, MPM);
+
+ if (PrepareForLTO) {
+ MPM.add(createCanonicalizeAliasesPass());
+ // Rename anon globals to be able to handle them in the summary
+ MPM.add(createNameAnonGlobalPass());
+ }
MPM.add(createAnnotationRemarksLegacyPass());
-}
-
-void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
- // Load sample profile before running the LTO optimization pipeline.
- if (!PGOSampleUse.empty()) {
- PM.add(createPruneEHPass());
- PM.add(createSampleProfileLoaderPass(PGOSampleUse));
- }
-
- // Remove unused virtual tables to improve the quality of code generated by
- // whole-program devirtualization and bitset lowering.
- PM.add(createGlobalDCEPass());
-
- // Provide AliasAnalysis services for optimizations.
- addInitialAliasAnalysisPasses(PM);
-
- // Allow forcing function attributes as a debugging and tuning aid.
- PM.add(createForceFunctionAttrsLegacyPass());
-
- // Infer attributes about declarations if possible.
- PM.add(createInferFunctionAttrsLegacyPass());
-
- if (OptLevel > 1) {
- // Split call-site with more constrained arguments.
- PM.add(createCallSiteSplittingPass());
-
- // Indirect call promotion. This should promote all the targets that are
- // left by the earlier promotion pass that promotes intra-module targets.
- // This two-step promotion is to save the compile time. For LTO, it should
- // produce the same result as if we only do promotion here.
- PM.add(
- createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
-
- // Propagate constants at call sites into the functions they call. This
- // opens opportunities for globalopt (and inlining) by substituting function
- // pointers passed as arguments to direct uses of functions.
- PM.add(createIPSCCPPass());
-
- // Attach metadata to indirect call sites indicating the set of functions
- // they may target at run-time. This should follow IPSCCP.
- PM.add(createCalledValuePropagationPass());
-
- // Infer attributes on declarations, call sites, arguments, etc.
- if (AttributorRun & AttributorRunOption::MODULE)
- PM.add(createAttributorLegacyPass());
- }
-
- // Infer attributes about definitions. The readnone attribute in particular is
- // required for virtual constant propagation.
- PM.add(createPostOrderFunctionAttrsLegacyPass());
- PM.add(createReversePostOrderFunctionAttrsPass());
-
- // Split globals using inrange annotations on GEP indices. This can help
- // improve the quality of generated code when virtual constant propagation or
- // control flow integrity are enabled.
- PM.add(createGlobalSplitPass());
-
- // Apply whole-program devirtualization and virtual constant propagation.
- PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
-
- // That's all we need at opt level 1.
- if (OptLevel == 1)
- return;
-
- // Now that we internalized some globals, see if we can hack on them!
- PM.add(createGlobalOptimizerPass());
- // Promote any localized global vars.
- PM.add(createPromoteMemoryToRegisterPass());
-
- // Linking modules together can lead to duplicated global constants, only
- // keep one copy of each constant.
- PM.add(createConstantMergePass());
-
- // Remove unused arguments from functions.
- PM.add(createDeadArgEliminationPass());
-
- // Reduce the code after globalopt and ipsccp. Both can open up significant
- // simplification opportunities, and both can propagate functions through
- // function pointers. When this happens, we often have to resolve varargs
- // calls, etc, so let instcombine do this.
- if (OptLevel > 2)
- PM.add(createAggressiveInstCombinerPass());
- PM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, PM);
-
- // Inline small functions
- bool RunInliner = Inliner;
- if (RunInliner) {
- PM.add(Inliner);
- Inliner = nullptr;
- }
-
- PM.add(createPruneEHPass()); // Remove dead EH info.
-
- // CSFDO instrumentation and use pass.
- addPGOInstrPasses(PM, /* IsCS */ true);
-
- // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
- if (AttributorRun & AttributorRunOption::CGSCC)
- PM.add(createAttributorCGSCCLegacyPass());
-
- // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
- // there are no OpenMP runtime calls present in the module.
- if (OptLevel > 1)
- PM.add(createOpenMPOptLegacyPass());
-
- // Optimize globals again if we ran the inliner.
- if (RunInliner)
- PM.add(createGlobalOptimizerPass());
- PM.add(createGlobalDCEPass()); // Remove dead functions.
-
- // If we didn't decide to inline a function, check to see if we can
- // transform it to pass arguments by value instead of by reference.
- PM.add(createArgumentPromotionPass());
-
- // The IPO passes may leave cruft around. Clean up after them.
- PM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, PM);
+}
+
+void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
+ // Load sample profile before running the LTO optimization pipeline.
+ if (!PGOSampleUse.empty()) {
+ PM.add(createPruneEHPass());
+ PM.add(createSampleProfileLoaderPass(PGOSampleUse));
+ }
+
+ // Remove unused virtual tables to improve the quality of code generated by
+ // whole-program devirtualization and bitset lowering.
+ PM.add(createGlobalDCEPass());
+
+ // Provide AliasAnalysis services for optimizations.
+ addInitialAliasAnalysisPasses(PM);
+
+ // Allow forcing function attributes as a debugging and tuning aid.
+ PM.add(createForceFunctionAttrsLegacyPass());
+
+ // Infer attributes about declarations if possible.
+ PM.add(createInferFunctionAttrsLegacyPass());
+
+ if (OptLevel > 1) {
+ // Split call-site with more constrained arguments.
+ PM.add(createCallSiteSplittingPass());
+
+ // Indirect call promotion. This should promote all the targets that are
+ // left by the earlier promotion pass that promotes intra-module targets.
+    // This two-step promotion is done to save compile time. For LTO, it should
+ // produce the same result as if we only do promotion here.
+ PM.add(
+ createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
+
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ PM.add(createIPSCCPPass());
+
+ // Attach metadata to indirect call sites indicating the set of functions
+ // they may target at run-time. This should follow IPSCCP.
+ PM.add(createCalledValuePropagationPass());
+
+ // Infer attributes on declarations, call sites, arguments, etc.
+ if (AttributorRun & AttributorRunOption::MODULE)
+ PM.add(createAttributorLegacyPass());
+ }
+
+ // Infer attributes about definitions. The readnone attribute in particular is
+ // required for virtual constant propagation.
+ PM.add(createPostOrderFunctionAttrsLegacyPass());
+ PM.add(createReversePostOrderFunctionAttrsPass());
+
+ // Split globals using inrange annotations on GEP indices. This can help
+ // improve the quality of generated code when virtual constant propagation or
+ // control flow integrity are enabled.
+ PM.add(createGlobalSplitPass());
+
+ // Apply whole-program devirtualization and virtual constant propagation.
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
+
+ // That's all we need at opt level 1.
+ if (OptLevel == 1)
+ return;
+
+ // Now that we internalized some globals, see if we can hack on them!
+ PM.add(createGlobalOptimizerPass());
+ // Promote any localized global vars.
+ PM.add(createPromoteMemoryToRegisterPass());
+
+ // Linking modules together can lead to duplicated global constants, only
+ // keep one copy of each constant.
+ PM.add(createConstantMergePass());
+
+ // Remove unused arguments from functions.
+ PM.add(createDeadArgEliminationPass());
+
+ // Reduce the code after globalopt and ipsccp. Both can open up significant
+ // simplification opportunities, and both can propagate functions through
+ // function pointers. When this happens, we often have to resolve varargs
+ // calls, etc, so let instcombine do this.
+ if (OptLevel > 2)
+ PM.add(createAggressiveInstCombinerPass());
+ PM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, PM);
+
+ // Inline small functions
+ bool RunInliner = Inliner;
+ if (RunInliner) {
+ PM.add(Inliner);
+ Inliner = nullptr;
+ }
+
+ PM.add(createPruneEHPass()); // Remove dead EH info.
+
+ // CSFDO instrumentation and use pass.
+ addPGOInstrPasses(PM, /* IsCS */ true);
+
+ // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ PM.add(createAttributorCGSCCLegacyPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (OptLevel > 1)
+ PM.add(createOpenMPOptLegacyPass());
+
+ // Optimize globals again if we ran the inliner.
+ if (RunInliner)
+ PM.add(createGlobalOptimizerPass());
+ PM.add(createGlobalDCEPass()); // Remove dead functions.
+
+ // If we didn't decide to inline a function, check to see if we can
+ // transform it to pass arguments by value instead of by reference.
+ PM.add(createArgumentPromotionPass());
+
+ // The IPO passes may leave cruft around. Clean up after them.
+ PM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, PM);
PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));
-
- // Break up allocas
- PM.add(createSROAPass());
-
- // LTO provides additional opportunities for tailcall elimination due to
- // link-time inlining, and visibility of nocapture attribute.
- if (OptLevel > 1)
- PM.add(createTailCallEliminationPass());
-
- // Infer attributes on declarations, call sites, arguments, etc.
- PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.
- // Run a few AA driven optimizations here and now, to cleanup the code.
- PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
-
- PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- PM.add(NewGVN ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
- PM.add(createMemCpyOptPass()); // Remove dead memcpys.
-
- // Nuke dead stores.
- PM.add(createDeadStoreEliminationPass());
+
+ // Break up allocas
+ PM.add(createSROAPass());
+
+ // LTO provides additional opportunities for tailcall elimination due to
+ // link-time inlining, and visibility of nocapture attribute.
+ if (OptLevel > 1)
+ PM.add(createTailCallEliminationPass());
+
+ // Infer attributes on declarations, call sites, arguments, etc.
+ PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.
+ // Run a few AA driven optimizations here and now, to cleanup the code.
+ PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
+
+ PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ PM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
+ PM.add(createMemCpyOptPass()); // Remove dead memcpys.
+
+ // Nuke dead stores.
+ PM.add(createDeadStoreEliminationPass());
PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
-
- // More loops are countable; try to optimize them.
+
+ // More loops are countable; try to optimize them.
if (EnableLoopFlatten)
PM.add(createLoopFlattenPass());
- PM.add(createIndVarSimplifyPass());
- PM.add(createLoopDeletionPass());
- if (EnableLoopInterchange)
- PM.add(createLoopInterchangePass());
-
+ PM.add(createIndVarSimplifyPass());
+ PM.add(createLoopDeletionPass());
+ if (EnableLoopInterchange)
+ PM.add(createLoopInterchangePass());
+
if (EnableConstraintElimination)
PM.add(createConstraintEliminationPass());
// Unroll small loops and perform peeling.
- PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
+ PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
PM.add(createLoopDistributePass());
- PM.add(createLoopVectorizePass(true, !LoopVectorize));
- // The vectorizer may have significantly shortened a loop body; unroll again.
- PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
-
- PM.add(createWarnMissedTransformationsPass());
-
- // Now that we've optimized loops (in particular loop induction variables),
- // we may have exposed more scalar opportunities. Run parts of the scalar
- // optimizer again at this point.
- PM.add(createInstructionCombiningPass()); // Initial cleanup
+ PM.add(createLoopVectorizePass(true, !LoopVectorize));
+ // The vectorizer may have significantly shortened a loop body; unroll again.
+ PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
+
+ PM.add(createWarnMissedTransformationsPass());
+
+ // Now that we've optimized loops (in particular loop induction variables),
+ // we may have exposed more scalar opportunities. Run parts of the scalar
+ // optimizer again at this point.
+ PM.add(createInstructionCombiningPass()); // Initial cleanup
PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert
.hoistCommonInsts(true)));
- PM.add(createSCCPPass()); // Propagate exposed constants
- PM.add(createInstructionCombiningPass()); // Clean up again
- PM.add(createBitTrackingDCEPass());
-
- // More scalar chains could be vectorized due to more alias information
- if (SLPVectorize)
- PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
- PM.add(createVectorCombinePass()); // Clean up partial vectorization.
-
- // After vectorization, assume intrinsics may tell us more about pointer
- // alignments.
- PM.add(createAlignmentFromAssumptionsPass());
-
- // Cleanup and simplify the code after the scalar optimizations.
- PM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, PM);
-
+ PM.add(createSCCPPass()); // Propagate exposed constants
+ PM.add(createInstructionCombiningPass()); // Clean up again
+ PM.add(createBitTrackingDCEPass());
+
+ // More scalar chains could be vectorized due to more alias information
+ if (SLPVectorize)
+ PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+
+ PM.add(createVectorCombinePass()); // Clean up partial vectorization.
+
+ // After vectorization, assume intrinsics may tell us more about pointer
+ // alignments.
+ PM.add(createAlignmentFromAssumptionsPass());
+
+ // Cleanup and simplify the code after the scalar optimizations.
+ PM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, PM);
+
PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));
-}
-
-void PassManagerBuilder::addLateLTOOptimizationPasses(
- legacy::PassManagerBase &PM) {
- // See comment in the new PM for justification of scheduling splitting at
- // this stage (\ref buildLTODefaultPipeline).
- if (EnableHotColdSplit)
- PM.add(createHotColdSplittingPass());
-
- // Delete basic blocks, which optimization passes may have killed.
+}
+
+void PassManagerBuilder::addLateLTOOptimizationPasses(
+ legacy::PassManagerBase &PM) {
+ // See comment in the new PM for justification of scheduling splitting at
+ // this stage (\ref buildLTODefaultPipeline).
+ if (EnableHotColdSplit)
+ PM.add(createHotColdSplittingPass());
+
+ // Delete basic blocks, which optimization passes may have killed.
PM.add(
createCFGSimplificationPass(SimplifyCFGOptions().hoistCommonInsts(true)));
-
- // Drop bodies of available externally objects to improve GlobalDCE.
- PM.add(createEliminateAvailableExternallyPass());
-
- // Now that we have optimized the program, discard unreachable functions.
- PM.add(createGlobalDCEPass());
-
- // FIXME: this is profitable (for compiler time) to do at -O0 too, but
- // currently it damages debug info.
- if (MergeFunctions)
- PM.add(createMergeFunctionsPass());
-}
-
-void PassManagerBuilder::populateThinLTOPassManager(
- legacy::PassManagerBase &PM) {
- PerformThinLTO = true;
- if (LibraryInfo)
- PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
- if (VerifyInput)
- PM.add(createVerifierPass());
-
- if (ImportSummary) {
- // This pass imports type identifier resolutions for whole-program
- // devirtualization and CFI. It must run early because other passes may
- // disturb the specific instruction patterns that these passes look for,
- // creating dependencies on resolutions that may not appear in the summary.
- //
- // For example, GVN may transform the pattern assume(type.test) appearing in
- // two basic blocks into assume(phi(type.test, type.test)), which would
- // transform a dependency on a WPD resolution into a dependency on a type
- // identifier resolution for CFI.
- //
- // Also, WPD has access to more precise information than ICP and can
- // devirtualize more effectively, so it should operate on the IR first.
- PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
- PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
- }
-
- populateModulePassManager(PM);
-
- if (VerifyOutput)
- PM.add(createVerifierPass());
- PerformThinLTO = false;
-}
-
-void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
- if (LibraryInfo)
- PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
- if (VerifyInput)
- PM.add(createVerifierPass());
-
- addExtensionsToPM(EP_FullLinkTimeOptimizationEarly, PM);
-
- if (OptLevel != 0)
- addLTOOptimizationPasses(PM);
- else {
- // The whole-program-devirt pass needs to run at -O0 because only it knows
- // about the llvm.type.checked.load intrinsic: it needs to both lower the
- // intrinsic itself and handle it in the summary.
- PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
- }
-
- // Create a function that performs CFI checks for cross-DSO calls with targets
- // in the current module.
- PM.add(createCrossDSOCFIPass());
-
- // Lower type metadata and the type.test intrinsic. This pass supports Clang's
- // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
- // link time if CFI is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
- // Run a second time to clean up any type tests left behind by WPD for use
- // in ICP (which is performed earlier than this in the regular LTO pipeline).
- PM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
-
- if (OptLevel != 0)
- addLateLTOOptimizationPasses(PM);
-
- addExtensionsToPM(EP_FullLinkTimeOptimizationLast, PM);
-
+
+ // Drop bodies of available externally objects to improve GlobalDCE.
+ PM.add(createEliminateAvailableExternallyPass());
+
+ // Now that we have optimized the program, discard unreachable functions.
+ PM.add(createGlobalDCEPass());
+
+ // FIXME: this is profitable (for compiler time) to do at -O0 too, but
+ // currently it damages debug info.
+ if (MergeFunctions)
+ PM.add(createMergeFunctionsPass());
+}
+
+void PassManagerBuilder::populateThinLTOPassManager(
+ legacy::PassManagerBase &PM) {
+ PerformThinLTO = true;
+ if (LibraryInfo)
+ PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ if (VerifyInput)
+ PM.add(createVerifierPass());
+
+ if (ImportSummary) {
+ // This pass imports type identifier resolutions for whole-program
+ // devirtualization and CFI. It must run early because other passes may
+ // disturb the specific instruction patterns that these passes look for,
+ // creating dependencies on resolutions that may not appear in the summary.
+ //
+ // For example, GVN may transform the pattern assume(type.test) appearing in
+ // two basic blocks into assume(phi(type.test, type.test)), which would
+ // transform a dependency on a WPD resolution into a dependency on a type
+ // identifier resolution for CFI.
+ //
+ // Also, WPD has access to more precise information than ICP and can
+ // devirtualize more effectively, so it should operate on the IR first.
+ PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
+ PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
+ }
+
+ populateModulePassManager(PM);
+
+ if (VerifyOutput)
+ PM.add(createVerifierPass());
+ PerformThinLTO = false;
+}
+
+void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
+ if (LibraryInfo)
+ PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ if (VerifyInput)
+ PM.add(createVerifierPass());
+
+ addExtensionsToPM(EP_FullLinkTimeOptimizationEarly, PM);
+
+ if (OptLevel != 0)
+ addLTOOptimizationPasses(PM);
+ else {
+ // The whole-program-devirt pass needs to run at -O0 because only it knows
+ // about the llvm.type.checked.load intrinsic: it needs to both lower the
+ // intrinsic itself and handle it in the summary.
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
+ }
+
+ // Create a function that performs CFI checks for cross-DSO calls with targets
+ // in the current module.
+ PM.add(createCrossDSOCFIPass());
+
+ // Lower type metadata and the type.test intrinsic. This pass supports Clang's
+ // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
+ // link time if CFI is enabled. The pass does nothing if CFI is disabled.
+ PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP (which is performed earlier than this in the regular LTO pipeline).
+ PM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
+
+ if (OptLevel != 0)
+ addLateLTOOptimizationPasses(PM);
+
+ addExtensionsToPM(EP_FullLinkTimeOptimizationLast, PM);
+
PM.add(createAnnotationRemarksLegacyPass());
- if (VerifyOutput)
- PM.add(createVerifierPass());
-}
-
-LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
- PassManagerBuilder *PMB = new PassManagerBuilder();
- return wrap(PMB);
-}
-
-void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) {
- PassManagerBuilder *Builder = unwrap(PMB);
- delete Builder;
-}
-
-void
-LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB,
- unsigned OptLevel) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->OptLevel = OptLevel;
-}
-
-void
-LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
- unsigned SizeLevel) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->SizeLevel = SizeLevel;
-}
-
-void
-LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
- LLVMBool Value) {
- // NOTE: The DisableUnitAtATime switch has been removed.
-}
-
-void
-LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
- LLVMBool Value) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->DisableUnrollLoops = Value;
-}
-
-void
-LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
- LLVMBool Value) {
- // NOTE: The simplify-libcalls pass has been removed.
-}
-
-void
-LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB,
- unsigned Threshold) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->Inliner = createFunctionInliningPass(Threshold);
-}
-
-void
-LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
- LLVMPassManagerRef PM) {
- PassManagerBuilder *Builder = unwrap(PMB);
- legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM);
- Builder->populateFunctionPassManager(*FPM);
-}
-
-void
-LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
- LLVMPassManagerRef PM) {
- PassManagerBuilder *Builder = unwrap(PMB);
- legacy::PassManagerBase *MPM = unwrap(PM);
- Builder->populateModulePassManager(*MPM);
-}
-
-void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
- LLVMPassManagerRef PM,
- LLVMBool Internalize,
- LLVMBool RunInliner) {
- PassManagerBuilder *Builder = unwrap(PMB);
- legacy::PassManagerBase *LPM = unwrap(PM);
-
- // A small backwards compatibility hack. populateLTOPassManager used to take
-  // a RunInliner option.
- if (RunInliner && !Builder->Inliner)
- Builder->Inliner = createFunctionInliningPass();
-
- Builder->populateLTOPassManager(*LPM);
-}
+ if (VerifyOutput)
+ PM.add(createVerifierPass());
+}
+
+LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
+ PassManagerBuilder *PMB = new PassManagerBuilder();
+ return wrap(PMB);
+}
+
+void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ delete Builder;
+}
+
+void
+LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned OptLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->OptLevel = OptLevel;
+}
+
+void
+LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned SizeLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->SizeLevel = SizeLevel;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ // NOTE: The DisableUnitAtATime switch has been removed.
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->DisableUnrollLoops = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void
+LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB,
+ unsigned Threshold) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->Inliner = createFunctionInliningPass(Threshold);
+}
+
+void
+LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM);
+ Builder->populateFunctionPassManager(*FPM);
+}
+
+void
+LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::PassManagerBase *MPM = unwrap(PM);
+ Builder->populateModulePassManager(*MPM);
+}
+
+void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM,
+ LLVMBool Internalize,
+ LLVMBool RunInliner) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::PassManagerBase *LPM = unwrap(PM);
+
+ // A small backwards compatibility hack. populateLTOPassManager used to take
+  // a RunInliner option.
+ if (RunInliner && !Builder->Inliner)
+ Builder->Inliner = createFunctionInliningPass();
+
+ Builder->populateLTOPassManager(*LPM);
+}
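
For reference, the C API wrappers above are typically driven like the following minimal sketch. It is not part of this diff; the module handle M and the threshold 225 (the usual -O2 inlining threshold) are assumptions for illustration only.

// Hedged usage sketch for the LLVM-C PassManagerBuilder wrappers above.
// Assumes an LLVMModuleRef M obtained elsewhere (e.g. via LLVMParseIRInContext).
#include "llvm-c/Core.h"
#include "llvm-c/Transforms/PassManagerBuilder.h"

static void optimizeModule(LLVMModuleRef M) {
  LLVMPassManagerBuilderRef PMB = LLVMPassManagerBuilderCreate();
  LLVMPassManagerBuilderSetOptLevel(PMB, 2);               // roughly -O2
  LLVMPassManagerBuilderUseInlinerWithThreshold(PMB, 225); // enable the inliner
  LLVMPassManagerRef MPM = LLVMCreatePassManager();
  LLVMPassManagerBuilderPopulateModulePassManager(PMB, MPM);
  LLVMRunPassManager(MPM, M);                              // run the populated pipeline
  LLVMDisposePassManager(MPM);
  LLVMPassManagerBuilderDispose(PMB);
}
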
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp
index 0e50d45979..3f3b18771c 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp
@@ -1,264 +1,264 @@
-//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a simple interprocedural pass which walks the
-// call-graph, turning invoke instructions into calls, iff the callee cannot
-// throw an exception, and marking functions 'nounwind' if they cannot throw.
-// It implements this as a bottom-up traversal of the call-graph.
-//
-//===----------------------------------------------------------------------===//
-
+//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, turning invoke instructions into calls, iff the callee cannot
+// throw an exception, and marking functions 'nounwind' if they cannot throw.
+// It implements this as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "prune-eh"
-
-STATISTIC(NumRemoved, "Number of invokes removed");
-STATISTIC(NumUnreach, "Number of noreturn calls optimized");
-
-namespace {
- struct PruneEH : public CallGraphSCCPass {
- static char ID; // Pass identification, replacement for typeid
- PruneEH() : CallGraphSCCPass(ID) {
- initializePruneEHPass(*PassRegistry::getPassRegistry());
- }
-
- // runOnSCC - Analyze the SCC, performing the transformation if possible.
- bool runOnSCC(CallGraphSCC &SCC) override;
- };
-}
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "prune-eh"
+
+STATISTIC(NumRemoved, "Number of invokes removed");
+STATISTIC(NumUnreach, "Number of noreturn calls optimized");
+
+namespace {
+ struct PruneEH : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ PruneEH() : CallGraphSCCPass(ID) {
+ initializePruneEHPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnSCC - Analyze the SCC, performing the transformation if possible.
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ };
+}
static bool SimplifyFunction(Function *F, CallGraphUpdater &CGU);
static void DeleteBasicBlock(BasicBlock *BB, CallGraphUpdater &CGU);
-
-char PruneEH::ID = 0;
-INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
- "Remove unused exception handling info", false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(PruneEH, "prune-eh",
- "Remove unused exception handling info", false, false)
-
-Pass *llvm::createPruneEHPass() { return new PruneEH(); }
-
+
+char PruneEH::ID = 0;
+INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
+ "Remove unused exception handling info", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(PruneEH, "prune-eh",
+ "Remove unused exception handling info", false, false)
+
+Pass *llvm::createPruneEHPass() { return new PruneEH(); }
+
static bool runImpl(CallGraphUpdater &CGU, SetVector<Function *> &Functions) {
#ifndef NDEBUG
for (auto *F : Functions)
assert(F && "null Function");
#endif
- bool MadeChange = false;
-
- // First pass, scan all of the functions in the SCC, simplifying them
- // according to what we know.
+ bool MadeChange = false;
+
+ // First pass, scan all of the functions in the SCC, simplifying them
+ // according to what we know.
for (Function *F : Functions)
MadeChange |= SimplifyFunction(F, CGU);
-
- // Next, check to see if any callees might throw or if there are any external
- // functions in this SCC: if so, we cannot prune any functions in this SCC.
- // Definitions that are weak and not declared non-throwing might be
- // overridden at linktime with something that throws, so assume that.
- // If this SCC includes the unwind instruction, we KNOW it throws, so
- // obviously the SCC might throw.
- //
- bool SCCMightUnwind = false, SCCMightReturn = false;
+
+ // Next, check to see if any callees might throw or if there are any external
+ // functions in this SCC: if so, we cannot prune any functions in this SCC.
+ // Definitions that are weak and not declared non-throwing might be
+ // overridden at linktime with something that throws, so assume that.
+ // If this SCC includes the unwind instruction, we KNOW it throws, so
+ // obviously the SCC might throw.
+ //
+ bool SCCMightUnwind = false, SCCMightReturn = false;
for (Function *F : Functions) {
if (!F->hasExactDefinition()) {
- SCCMightUnwind |= !F->doesNotThrow();
- SCCMightReturn |= !F->doesNotReturn();
- } else {
- bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
- bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
- // Determine if we should scan for InlineAsm in a naked function as it
- // is the only way to return without a ReturnInst. Only do this for
- // no-inline functions as functions which may be inlined cannot
- // meaningfully return via assembly.
- bool CheckReturnViaAsm = CheckReturn &&
- F->hasFnAttribute(Attribute::Naked) &&
- F->hasFnAttribute(Attribute::NoInline);
-
- if (!CheckUnwind && !CheckReturn)
- continue;
-
- for (const BasicBlock &BB : *F) {
- const Instruction *TI = BB.getTerminator();
- if (CheckUnwind && TI->mayThrow()) {
- SCCMightUnwind = true;
- } else if (CheckReturn && isa<ReturnInst>(TI)) {
- SCCMightReturn = true;
- }
-
- for (const Instruction &I : BB) {
- if ((!CheckUnwind || SCCMightUnwind) &&
- (!CheckReturnViaAsm || SCCMightReturn))
- break;
-
- // Check to see if this function performs an unwind or calls an
- // unwinding function.
- if (CheckUnwind && !SCCMightUnwind && I.mayThrow()) {
- bool InstMightUnwind = true;
- if (const auto *CI = dyn_cast<CallInst>(&I)) {
- if (Function *Callee = CI->getCalledFunction()) {
- // If the callee is outside our current SCC then we may throw
- // because it might. If it is inside, do nothing.
+ SCCMightUnwind |= !F->doesNotThrow();
+ SCCMightReturn |= !F->doesNotReturn();
+ } else {
+ bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
+ bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
+ // Determine if we should scan for InlineAsm in a naked function as it
+ // is the only way to return without a ReturnInst. Only do this for
+ // no-inline functions as functions which may be inlined cannot
+ // meaningfully return via assembly.
+ bool CheckReturnViaAsm = CheckReturn &&
+ F->hasFnAttribute(Attribute::Naked) &&
+ F->hasFnAttribute(Attribute::NoInline);
+
+ if (!CheckUnwind && !CheckReturn)
+ continue;
+
+ for (const BasicBlock &BB : *F) {
+ const Instruction *TI = BB.getTerminator();
+ if (CheckUnwind && TI->mayThrow()) {
+ SCCMightUnwind = true;
+ } else if (CheckReturn && isa<ReturnInst>(TI)) {
+ SCCMightReturn = true;
+ }
+
+ for (const Instruction &I : BB) {
+ if ((!CheckUnwind || SCCMightUnwind) &&
+ (!CheckReturnViaAsm || SCCMightReturn))
+ break;
+
+ // Check to see if this function performs an unwind or calls an
+ // unwinding function.
+ if (CheckUnwind && !SCCMightUnwind && I.mayThrow()) {
+ bool InstMightUnwind = true;
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ // If the callee is outside our current SCC then we may throw
+ // because it might. If it is inside, do nothing.
if (Functions.contains(Callee))
- InstMightUnwind = false;
- }
- }
- SCCMightUnwind |= InstMightUnwind;
- }
- if (CheckReturnViaAsm && !SCCMightReturn)
- if (const auto *CB = dyn_cast<CallBase>(&I))
- if (const auto *IA = dyn_cast<InlineAsm>(CB->getCalledOperand()))
- if (IA->hasSideEffects())
- SCCMightReturn = true;
- }
+ InstMightUnwind = false;
+ }
+ }
+ SCCMightUnwind |= InstMightUnwind;
+ }
+ if (CheckReturnViaAsm && !SCCMightReturn)
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ if (const auto *IA = dyn_cast<InlineAsm>(CB->getCalledOperand()))
+ if (IA->hasSideEffects())
+ SCCMightReturn = true;
+ }
}
- if (SCCMightUnwind && SCCMightReturn)
- break;
- }
- }
-
- // If the SCC doesn't unwind or doesn't throw, note this fact.
- if (!SCCMightUnwind || !SCCMightReturn)
+ if (SCCMightUnwind && SCCMightReturn)
+ break;
+ }
+ }
+
+ // If the SCC doesn't unwind or doesn't throw, note this fact.
+ if (!SCCMightUnwind || !SCCMightReturn)
for (Function *F : Functions) {
- if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) {
- F->addFnAttr(Attribute::NoUnwind);
- MadeChange = true;
- }
-
- if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) {
- F->addFnAttr(Attribute::NoReturn);
- MadeChange = true;
- }
- }
-
+ if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) {
+ F->addFnAttr(Attribute::NoUnwind);
+ MadeChange = true;
+ }
+
+ if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) {
+ F->addFnAttr(Attribute::NoReturn);
+ MadeChange = true;
+ }
+ }
+
for (Function *F : Functions) {
- // Convert any invoke instructions to non-throwing functions in this node
- // into call instructions with a branch. This makes the exception blocks
- // dead.
+ // Convert any invoke instructions to non-throwing functions in this node
+ // into call instructions with a branch. This makes the exception blocks
+ // dead.
MadeChange |= SimplifyFunction(F, CGU);
- }
-
- return MadeChange;
-}
-
-bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
+ }
+
+ return MadeChange;
+}
+
+bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
SetVector<Function *> Functions;
for (auto &N : SCC) {
if (auto *F = N->getFunction())
Functions.insert(F);
}
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
CallGraphUpdater CGU;
CGU.initialize(CG, SCC);
return runImpl(CGU, Functions);
-}
-
-
-// SimplifyFunction - Given information about callees, simplify the specified
-// function if we have invokes to non-unwinding functions or code after calls to
-// no-return functions.
+}
+
+
+// SimplifyFunction - Given information about callees, simplify the specified
+// function if we have invokes to non-unwinding functions or code after calls to
+// no-return functions.
static bool SimplifyFunction(Function *F, CallGraphUpdater &CGU) {
- bool MadeChange = false;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
- if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) {
- BasicBlock *UnwindBlock = II->getUnwindDest();
- removeUnwindEdge(&*BB);
-
- // If the unwind block is now dead, nuke it.
- if (pred_empty(UnwindBlock))
+ bool MadeChange = false;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) {
+ BasicBlock *UnwindBlock = II->getUnwindDest();
+ removeUnwindEdge(&*BB);
+
+ // If the unwind block is now dead, nuke it.
+ if (pred_empty(UnwindBlock))
DeleteBasicBlock(UnwindBlock, CGU); // Delete the new BB.
-
- ++NumRemoved;
- MadeChange = true;
- }
-
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
- if (CallInst *CI = dyn_cast<CallInst>(I++))
- if (CI->doesNotReturn() && !CI->isMustTailCall() &&
- !isa<UnreachableInst>(I)) {
- // This call calls a function that cannot return. Insert an
- // unreachable instruction after it and simplify the code. Do this
- // by splitting the BB, adding the unreachable, then deleting the
- // new BB.
- BasicBlock *New = BB->splitBasicBlock(I);
-
- // Remove the uncond branch and add an unreachable.
- BB->getInstList().pop_back();
- new UnreachableInst(BB->getContext(), &*BB);
-
+
+ ++NumRemoved;
+ MadeChange = true;
+ }
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+ if (CallInst *CI = dyn_cast<CallInst>(I++))
+ if (CI->doesNotReturn() && !CI->isMustTailCall() &&
+ !isa<UnreachableInst>(I)) {
+ // This call calls a function that cannot return. Insert an
+ // unreachable instruction after it and simplify the code. Do this
+ // by splitting the BB, adding the unreachable, then deleting the
+ // new BB.
+ BasicBlock *New = BB->splitBasicBlock(I);
+
+ // Remove the uncond branch and add an unreachable.
+ BB->getInstList().pop_back();
+ new UnreachableInst(BB->getContext(), &*BB);
+
DeleteBasicBlock(New, CGU); // Delete the new BB.
- MadeChange = true;
- ++NumUnreach;
- break;
- }
- }
-
- return MadeChange;
-}
-
-/// DeleteBasicBlock - remove the specified basic block from the program,
-/// updating the callgraph to reflect any now-obsolete edges due to calls that
-/// exist in the BB.
+ MadeChange = true;
+ ++NumUnreach;
+ break;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// DeleteBasicBlock - remove the specified basic block from the program,
+/// updating the callgraph to reflect any now-obsolete edges due to calls that
+/// exist in the BB.
static void DeleteBasicBlock(BasicBlock *BB, CallGraphUpdater &CGU) {
- assert(pred_empty(BB) && "BB is not dead!");
-
- Instruction *TokenInst = nullptr;
-
- for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
- --I;
-
- if (I->getType()->isTokenTy()) {
- TokenInst = &*I;
- break;
- }
-
- if (auto *Call = dyn_cast<CallBase>(&*I)) {
- const Function *Callee = Call->getCalledFunction();
- if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
+ assert(pred_empty(BB) && "BB is not dead!");
+
+ Instruction *TokenInst = nullptr;
+
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
+ --I;
+
+ if (I->getType()->isTokenTy()) {
+ TokenInst = &*I;
+ break;
+ }
+
+ if (auto *Call = dyn_cast<CallBase>(&*I)) {
+ const Function *Callee = Call->getCalledFunction();
+ if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
CGU.removeCallSite(*Call);
- else if (!Callee->isIntrinsic())
+ else if (!Callee->isIntrinsic())
CGU.removeCallSite(*Call);
- }
-
- if (!I->use_empty())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- }
-
- if (TokenInst) {
- if (!TokenInst->isTerminator())
- changeToUnreachable(TokenInst->getNextNode(), /*UseLLVMTrap=*/false);
- } else {
- // Get the list of successors of this block.
- std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
-
- for (unsigned i = 0, e = Succs.size(); i != e; ++i)
- Succs[i]->removePredecessor(BB);
-
- BB->eraseFromParent();
- }
-}
+ }
+
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+
+ if (TokenInst) {
+ if (!TokenInst->isTerminator())
+ changeToUnreachable(TokenInst->getNextNode(), /*UseLLVMTrap=*/false);
+ } else {
+ // Get the list of successors of this block.
+ std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+ Succs[i]->removePredecessor(BB);
+
+ BB->eraseFromParent();
+ }
+}
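
As a rough, stand-alone illustration of the prune-eh pass above (not part of this diff), the transformation can be run by itself through the legacy pass manager; how the Module M is produced (e.g. with parseIRFile) is left out as an assumption.

// Hedged sketch: run only prune-eh on a module. Invokes of provably nounwind
// callees get rewritten into plain calls, as implemented in PruneEH.cpp above.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

using namespace llvm;

static void runPruneEHOnly(Module &M) {
  legacy::PassManager PM;
  PM.add(createPruneEHPass()); // CallGraphSCCPass defined above
  PM.run(M);
}
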
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp
index c8be482716..fdffffba0c 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp
@@ -1,93 +1,93 @@
-#include "llvm/Transforms/IPO/SCCP.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar/SCCP.h"
-
-using namespace llvm;
-
-PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
- const DataLayout &DL = M.getDataLayout();
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn {
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- return {
- std::make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)),
- &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F)};
- };
-
- if (!runIPSCCP(M, DL, GetTLI, getAnalysis))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- PA.preserve<FunctionAnalysisManagerModuleProxy>();
- return PA;
-}
-
-namespace {
-
-//===--------------------------------------------------------------------===//
-//
-/// IPSCCP Class - This class implements interprocedural Sparse Conditional
-/// Constant Propagation.
-///
-class IPSCCPLegacyPass : public ModulePass {
-public:
- static char ID;
-
- IPSCCPLegacyPass() : ModulePass(ID) {
- initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- const DataLayout &DL = M.getDataLayout();
- auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn {
- DominatorTree &DT =
- this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- return {
- std::make_unique<PredicateInfo>(
- F, DT,
- this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- F)),
- nullptr, // We cannot preserve the DT or PDT with the legacy pass
- nullptr}; // manager, so set them to nullptr.
- };
-
- return runIPSCCP(M, DL, GetTLI, getAnalysis);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char IPSCCPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-
-// createIPSCCPPass - This is the public interface to this file.
-ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+
+using namespace llvm;
+
+PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
+ const DataLayout &DL = M.getDataLayout();
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn {
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ return {
+ std::make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)),
+ &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F)};
+ };
+
+ if (!runIPSCCP(M, DL, GetTLI, getAnalysis))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ PA.preserve<FunctionAnalysisManagerModuleProxy>();
+ return PA;
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// IPSCCP Class - This class implements interprocedural Sparse Conditional
+/// Constant Propagation.
+///
+class IPSCCPLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ IPSCCPLegacyPass() : ModulePass(ID) {
+ initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ const DataLayout &DL = M.getDataLayout();
+ auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn {
+ DominatorTree &DT =
+ this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ return {
+ std::make_unique<PredicateInfo>(
+ F, DT,
+ this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ F)),
+ nullptr, // We cannot preserve the DT or PDT with the legacy pass
+ nullptr}; // manager, so set them to nullptr.
+ };
+
+ return runIPSCCP(M, DL, GetTLI, getAnalysis);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char IPSCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
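
For context (again not part of this diff), the new-pass-manager IPSCCPPass defined above can be scheduled with the usual PassBuilder boilerplate; where the Module M comes from is assumed.

// Hedged sketch: run IPSCCP via the new pass manager. The analysis-manager
// setup mirrors what PassBuilder normally does for a full pipeline.
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/SCCP.h"

using namespace llvm;

static void runIPSCCP(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(IPSCCPPass()); // the pass whose run() method appears above
  MPM.run(M, MAM);
}
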
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp
index e2a097bfaa..a6a419bfe7 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp
@@ -1,116 +1,116 @@
-//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the SampleProfileLoader transformation. This pass
-// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
-// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
-// profile information in the given profile.
-//
-// This pass generates branch weight annotations on the IR:
-//
-// - prof: Represents branch weights. This annotation is added to branches
-// to indicate the weights of each edge coming out of the branch.
-// The weight of each edge is the weight of the target block for
-// that edge. The weight of a block B is computed as the maximum
-// number of samples found in B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/SampleProfile.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
+//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SampleProfileLoader transformation. This pass
+// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
+// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
+// profile information in the given profile.
+//
+// This pass generates branch weight annotations on the IR:
+//
+// - prof: Represents branch weights. This annotation is added to branches
+// to indicate the weights of each edge coming out of the branch.
+// The weight of each edge is the weight of the target block for
+// that edge. The weight of a block B is computed as the maximum
+// number of samples found in B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SampleProfile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/PriorityQueue.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/InlineAdvisor.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ReplayInlineAdvisor.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/ProfileData/SampleProf.h"
-#include "llvm/ProfileData/SampleProfReader.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/SampleContextTracker.h"
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <limits>
-#include <map>
-#include <memory>
-#include <queue>
-#include <string>
-#include <system_error>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace sampleprof;
-using ProfileCount = Function::ProfileCount;
-#define DEBUG_TYPE "sample-profile"
-#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
-
-STATISTIC(NumCSInlined,
- "Number of functions inlined with context sensitive profile");
-STATISTIC(NumCSNotInlined,
- "Number of functions not inlined with context sensitive profile");
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace sampleprof;
+using ProfileCount = Function::ProfileCount;
+#define DEBUG_TYPE "sample-profile"
+#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
+
+STATISTIC(NumCSInlined,
+ "Number of functions inlined with context sensitive profile");
+STATISTIC(NumCSNotInlined,
+ "Number of functions not inlined with context sensitive profile");
STATISTIC(NumMismatchedProfile,
"Number of functions with CFG mismatched profile");
STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
STATISTIC(NumDuplicatedInlinesite,
"Number of inlined callsites with a partial distribution factor");
-
+
STATISTIC(NumCSInlinedHitMinLimit,
"Number of functions with FDO inline stopped due to min size limit");
STATISTIC(NumCSInlinedHitMaxLimit,
@@ -119,64 +119,64 @@ STATISTIC(
NumCSInlinedHitGrowthLimit,
"Number of functions with FDO inline stopped due to growth size limit");
-// Command line option to specify the file to read samples from. This is
-// mainly used for debugging.
-static cl::opt<std::string> SampleProfileFile(
- "sample-profile-file", cl::init(""), cl::value_desc("filename"),
- cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
-
-// The named file contains a set of transformations that may have been applied
-// to the symbol names between the program from which the sample data was
-// collected and the current program's symbols.
-static cl::opt<std::string> SampleProfileRemappingFile(
- "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
- cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
-
-static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
- "sample-profile-max-propagate-iterations", cl::init(100),
- cl::desc("Maximum number of iterations to go through when propagating "
- "sample block/edge weights through the CFG."));
-
-static cl::opt<unsigned> SampleProfileRecordCoverage(
- "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
- cl::desc("Emit a warning if less than N% of records in the input profile "
- "are matched to the IR."));
-
-static cl::opt<unsigned> SampleProfileSampleCoverage(
- "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
- cl::desc("Emit a warning if less than N% of samples in the input profile "
- "are matched to the IR."));
-
-static cl::opt<bool> NoWarnSampleUnused(
- "no-warn-sample-unused", cl::init(false), cl::Hidden,
- cl::desc("Use this option to turn off/on warnings about function with "
- "samples but without debug information to use those samples. "));
-
-static cl::opt<bool> ProfileSampleAccurate(
- "profile-sample-accurate", cl::Hidden, cl::init(false),
- cl::desc("If the sample profile is accurate, we will mark all un-sampled "
- "callsite and function as having 0 samples. Otherwise, treat "
- "un-sampled callsites and functions conservatively as unknown. "));
-
-static cl::opt<bool> ProfileAccurateForSymsInList(
- "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
- cl::init(true),
- cl::desc("For symbols in profile symbol list, regard their profiles to "
- "be accurate. It may be overriden by profile-sample-accurate. "));
-
-static cl::opt<bool> ProfileMergeInlinee(
- "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
- cl::desc("Merge past inlinee's profile to outline version if sample "
- "profile loader decided not to inline a call site. It will "
- "only be enabled when top-down order of profile loading is "
- "enabled. "));
-
-static cl::opt<bool> ProfileTopDownLoad(
- "sample-profile-top-down-load", cl::Hidden, cl::init(true),
- cl::desc("Do profile annotation and inlining for functions in top-down "
- "order of call graph during sample profile loading. It only "
- "works for new pass manager. "));
-
+// Command line option to specify the file to read samples from. This is
+// mainly used for debugging.
+static cl::opt<std::string> SampleProfileFile(
+ "sample-profile-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+
+// The named file contains a set of transformations that may have been applied
+// to the symbol names between the program from which the sample data was
+// collected and the current program's symbols.
+static cl::opt<std::string> SampleProfileRemappingFile(
+ "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
+
+static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
+ "sample-profile-max-propagate-iterations", cl::init(100),
+ cl::desc("Maximum number of iterations to go through when propagating "
+ "sample block/edge weights through the CFG."));
+
+static cl::opt<unsigned> SampleProfileRecordCoverage(
+ "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
+ cl::desc("Emit a warning if less than N% of records in the input profile "
+ "are matched to the IR."));
+
+static cl::opt<unsigned> SampleProfileSampleCoverage(
+ "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
+ cl::desc("Emit a warning if less than N% of samples in the input profile "
+ "are matched to the IR."));
+
+static cl::opt<bool> NoWarnSampleUnused(
+ "no-warn-sample-unused", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on warnings about function with "
+ "samples but without debug information to use those samples. "));
+
+static cl::opt<bool> ProfileSampleAccurate(
+ "profile-sample-accurate", cl::Hidden, cl::init(false),
+ cl::desc("If the sample profile is accurate, we will mark all un-sampled "
+ "callsite and function as having 0 samples. Otherwise, treat "
+ "un-sampled callsites and functions conservatively as unknown. "));
+
+static cl::opt<bool> ProfileAccurateForSymsInList(
+ "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
+ cl::init(true),
+ cl::desc("For symbols in profile symbol list, regard their profiles to "
+ "be accurate. It may be overriden by profile-sample-accurate. "));
+
+static cl::opt<bool> ProfileMergeInlinee(
+ "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
+ cl::desc("Merge past inlinee's profile to outline version if sample "
+ "profile loader decided not to inline a call site. It will "
+ "only be enabled when top-down order of profile loading is "
+ "enabled. "));
+
+static cl::opt<bool> ProfileTopDownLoad(
+ "sample-profile-top-down-load", cl::Hidden, cl::init(true),
+ cl::desc("Do profile annotation and inlining for functions in top-down "
+ "order of call graph during sample profile loading. It only "
+ "works for new pass manager. "));
+
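For readers unfamiliar with LLVM's command-line machinery, the minimal sketch below shows how an option like the ones re-added above is declared and read. It is illustrative only and not part of the patch; the flag name and default value are invented.

#include "llvm/Support/CommandLine.h"

// Hypothetical flag, declared the same way as the options above; the name
// and default are made up purely for illustration.
static llvm::cl::opt<unsigned> SketchSampleThreshold(
    "sketch-sample-threshold", llvm::cl::init(100), llvm::cl::Hidden,
    llvm::cl::desc("Hypothetical threshold used only in this sketch."));

// Reading the flag is an implicit conversion to its value type:
//   if (Count >= SketchSampleThreshold) { ... }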
static cl::opt<bool> UseProfileIndirectCallEdges(
"use-profile-indirect-call-edges", cl::init(true), cl::Hidden,
cl::desc("Considering indirect call samples from profile when top-down "
@@ -187,11 +187,11 @@ static cl::opt<bool> UseProfileTopDownOrder(
cl::desc("Process functions in one SCC in a top-down order "
"based on the input profile."));
-static cl::opt<bool> ProfileSizeInline(
- "sample-profile-inline-size", cl::Hidden, cl::init(false),
- cl::desc("Inline cold call sites in profile loader if it's beneficial "
- "for code size."));
-
+static cl::opt<bool> ProfileSizeInline(
+ "sample-profile-inline-size", cl::Hidden, cl::init(false),
+ cl::desc("Inline cold call sites in profile loader if it's beneficial "
+ "for code size."));
+
static cl::opt<int> ProfileInlineGrowthLimit(
"sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
cl::desc("The size growth ratio limit for proirity-based sample profile "
@@ -224,10 +224,10 @@ static cl::opt<bool> CallsitePrioritizedInline(
cl::desc("Use call site prioritized inlining for sample profile loader."
"Currently only CSSPGO is supported."));
-static cl::opt<int> SampleColdCallSiteThreshold(
- "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
- cl::desc("Threshold for inlining cold callsites"));
-
+static cl::opt<int> SampleColdCallSiteThreshold(
+ "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
+ cl::desc("Threshold for inlining cold callsites"));
+
static cl::opt<std::string> ProfileInlineReplayFile(
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
@@ -235,137 +235,137 @@ static cl::opt<std::string> ProfileInlineReplayFile(
"by inlining from sample profile loader."),
cl::Hidden);
-namespace {
-
-using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
-using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
-using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
-using EdgeWeightMap = DenseMap<Edge, uint64_t>;
-using BlockEdgeMap =
- DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
-
-class SampleProfileLoader;
-
-class SampleCoverageTracker {
-public:
- SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){};
-
- bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
- uint32_t Discriminator, uint64_t Samples);
- unsigned computeCoverage(unsigned Used, unsigned Total) const;
- unsigned countUsedRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const;
- unsigned countBodyRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const;
- uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
- uint64_t countBodySamples(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const;
-
- void clear() {
- SampleCoverage.clear();
- TotalUsedSamples = 0;
- }
-
-private:
- using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
- using FunctionSamplesCoverageMap =
- DenseMap<const FunctionSamples *, BodySampleCoverageMap>;
-
- /// Coverage map for sampling records.
- ///
- /// This map keeps a record of sampling records that have been matched to
- /// an IR instruction. This is used to detect some form of staleness in
- /// profiles (see flag -sample-profile-check-coverage).
- ///
- /// Each entry in the map corresponds to a FunctionSamples instance. This is
- /// another map that counts how many times the sample record at the
- /// given location has been used.
- FunctionSamplesCoverageMap SampleCoverage;
-
- /// Number of samples used from the profile.
- ///
- /// When a sampling record is used for the first time, the samples from
- /// that record are added to this accumulator. Coverage is later computed
- /// based on the total number of samples available in this function and
- /// its callsites.
- ///
- /// Note that this accumulator tracks samples used from a single function
- /// and all the inlined callsites. Strictly, we should have a map of counters
- /// keyed by FunctionSamples pointers, but these stats are cleared after
- /// every function, so we just need to keep a single counter.
- uint64_t TotalUsedSamples = 0;
-
- SampleProfileLoader &SPLoader;
-};
-
-class GUIDToFuncNameMapper {
-public:
- GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
- DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
- : CurrentReader(Reader), CurrentModule(M),
- CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
- if (!CurrentReader.useMD5())
- return;
-
- for (const auto &F : CurrentModule) {
- StringRef OrigName = F.getName();
- CurrentGUIDToFuncNameMap.insert(
- {Function::getGUID(OrigName), OrigName});
-
- // Local-to-global variable promotion used by optimizations like ThinLTO
- // renames the variable and adds a suffix like ".llvm.xxx" to the
- // original local name. In the sample profile, such suffixes of function
- // names are all stripped. Since the mapper may be built in the
- // post-thin-link phase, after variable promotion has been done, we
- // also need to add the function name without the suffix to the
- // GUIDToFuncNameMap.
- StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
- if (CanonName != OrigName)
- CurrentGUIDToFuncNameMap.insert(
- {Function::getGUID(CanonName), CanonName});
- }
-
- // Update GUIDToFuncNameMap for each function including inlinees.
- SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
- }
-
- ~GUIDToFuncNameMapper() {
- if (!CurrentReader.useMD5())
- return;
-
- CurrentGUIDToFuncNameMap.clear();
-
- // Reset the GUIDToFuncNameMap of each function as it is no
- // longer valid at this point.
- SetGUIDToFuncNameMapForAll(nullptr);
- }
-
-private:
- void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
- std::queue<FunctionSamples *> FSToUpdate;
- for (auto &IFS : CurrentReader.getProfiles()) {
- FSToUpdate.push(&IFS.second);
- }
-
- while (!FSToUpdate.empty()) {
- FunctionSamples *FS = FSToUpdate.front();
- FSToUpdate.pop();
- FS->GUIDToFuncNameMap = Map;
- for (const auto &ICS : FS->getCallsiteSamples()) {
- const FunctionSamplesMap &FSMap = ICS.second;
- for (auto &IFS : FSMap) {
- FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
- FSToUpdate.push(&FS);
- }
- }
- }
- }
-
- SampleProfileReader &CurrentReader;
- Module &CurrentModule;
- DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
-};
-
+namespace {
+
+using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
+using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
+using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
+using EdgeWeightMap = DenseMap<Edge, uint64_t>;
+using BlockEdgeMap =
+ DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
+
+class SampleProfileLoader;
+
+class SampleCoverageTracker {
+public:
+ SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){};
+
+ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
+ uint32_t Discriminator, uint64_t Samples);
+ unsigned computeCoverage(unsigned Used, unsigned Total) const;
+ unsigned countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ unsigned countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
+ uint64_t countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+
+ void clear() {
+ SampleCoverage.clear();
+ TotalUsedSamples = 0;
+ }
+
+private:
+ using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
+ using FunctionSamplesCoverageMap =
+ DenseMap<const FunctionSamples *, BodySampleCoverageMap>;
+
+ /// Coverage map for sampling records.
+ ///
+ /// This map keeps a record of sampling records that have been matched to
+ /// an IR instruction. This is used to detect some form of staleness in
+ /// profiles (see flag -sample-profile-check-coverage).
+ ///
+ /// Each entry in the map corresponds to a FunctionSamples instance. This is
+ /// another map that counts how many times the sample record at the
+ /// given location has been used.
+ FunctionSamplesCoverageMap SampleCoverage;
+
+ /// Number of samples used from the profile.
+ ///
+ /// When a sampling record is used for the first time, the samples from
+ /// that record are added to this accumulator. Coverage is later computed
+ /// based on the total number of samples available in this function and
+ /// its callsites.
+ ///
+ /// Note that this accumulator tracks samples used from a single function
+ /// and all the inlined callsites. Strictly, we should have a map of counters
+ /// keyed by FunctionSamples pointers, but these stats are cleared after
+ /// every function, so we just need to keep a single counter.
+ uint64_t TotalUsedSamples = 0;
+
+ SampleProfileLoader &SPLoader;
+};
+
+class GUIDToFuncNameMapper {
+public:
+ GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
+ DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
+ : CurrentReader(Reader), CurrentModule(M),
+ CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
+ if (!CurrentReader.useMD5())
+ return;
+
+ for (const auto &F : CurrentModule) {
+ StringRef OrigName = F.getName();
+ CurrentGUIDToFuncNameMap.insert(
+ {Function::getGUID(OrigName), OrigName});
+
+ // Local-to-global variable promotion used by optimizations like ThinLTO
+ // renames the variable and adds a suffix like ".llvm.xxx" to the
+ // original local name. In the sample profile, such suffixes of function
+ // names are all stripped. Since the mapper may be built in the
+ // post-thin-link phase, after variable promotion has been done, we
+ // also need to add the function name without the suffix to the
+ // GUIDToFuncNameMap.
+ StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
+ if (CanonName != OrigName)
+ CurrentGUIDToFuncNameMap.insert(
+ {Function::getGUID(CanonName), CanonName});
+ }
+
+ // Update GUIDToFuncNameMap for each function including inlinees.
+ SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
+ }
+
+ ~GUIDToFuncNameMapper() {
+ if (!CurrentReader.useMD5())
+ return;
+
+ CurrentGUIDToFuncNameMap.clear();
+
+ // Reset the GUIDToFuncNameMap of each function as it is no
+ // longer valid at this point.
+ SetGUIDToFuncNameMapForAll(nullptr);
+ }
+
+private:
+ void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
+ std::queue<FunctionSamples *> FSToUpdate;
+ for (auto &IFS : CurrentReader.getProfiles()) {
+ FSToUpdate.push(&IFS.second);
+ }
+
+ while (!FSToUpdate.empty()) {
+ FunctionSamples *FS = FSToUpdate.front();
+ FSToUpdate.pop();
+ FS->GUIDToFuncNameMap = Map;
+ for (const auto &ICS : FS->getCallsiteSamples()) {
+ const FunctionSamplesMap &FSMap = ICS.second;
+ for (auto &IFS : FSMap) {
+ FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
+ FSToUpdate.push(&FS);
+ }
+ }
+ }
+ }
+
+ SampleProfileReader &CurrentReader;
+ Module &CurrentModule;
+ DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
+};
+
// Inline candidate used by iterative callsite prioritized inliner
struct InlineCandidate {
CallBase *CallInstr;
@@ -398,50 +398,50 @@ using CandidateQueue =
PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
CandidateComparer>;
-/// Sample profile pass.
-///
-/// This pass reads profile data from the file specified by
-/// -sample-profile-file and annotates every affected function with the
-/// profile information found in that file.
-class SampleProfileLoader {
-public:
- SampleProfileLoader(
+/// Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader {
+public:
+ SampleProfileLoader(
StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
- std::function<AssumptionCache &(Function &)> GetAssumptionCache,
- std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI)
- : GetAC(std::move(GetAssumptionCache)),
- GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
- CoverageTracker(*this), Filename(std::string(Name)),
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache,
+ std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI)
+ : GetAC(std::move(GetAssumptionCache)),
+ GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
+ CoverageTracker(*this), Filename(std::string(Name)),
RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {}
-
+
bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
- bool runOnModule(Module &M, ModuleAnalysisManager *AM,
- ProfileSummaryInfo *_PSI, CallGraph *CG);
-
- void dump() { Reader->dump(); }
-
-protected:
- friend class SampleCoverageTracker;
-
- bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
- unsigned getFunctionLoc(Function &F);
- bool emitAnnotations(Function &F);
- ErrorOr<uint64_t> getInstWeight(const Instruction &I);
+ bool runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI, CallGraph *CG);
+
+ void dump() { Reader->dump(); }
+
+protected:
+ friend class SampleCoverageTracker;
+
+ bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
+ unsigned getFunctionLoc(Function &F);
+ bool emitAnnotations(Function &F);
+ ErrorOr<uint64_t> getInstWeight(const Instruction &I);
ErrorOr<uint64_t> getProbeWeight(const Instruction &I);
- ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
- const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
- std::vector<const FunctionSamples *>
- findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
- mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
- const FunctionSamples *findFunctionSamples(const Instruction &I) const;
+ ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
+ const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
+ std::vector<const FunctionSamples *>
+ findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
+ mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
+ const FunctionSamples *findFunctionSamples(const Instruction &I) const;
// Attempt to promote indirect call and also inline the promoted call
bool tryPromoteAndInlineCandidate(
Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
uint64_t &Sum, DenseSet<Instruction *> &PromotedInsns,
SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
- bool inlineHotFunctions(Function &F,
- DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ bool inlineHotFunctions(Function &F,
+ DenseSet<GlobalValue::GUID> &InlinedGUIDs);
InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
bool
@@ -450,442 +450,442 @@ protected:
bool
inlineHotFunctionsWithPriority(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
- // Inline cold/small functions in addition to hot ones
- bool shouldInlineColdCallee(CallBase &CallInst);
- void emitOptimizationRemarksForInlineCandidates(
- const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
- bool Hot);
- void printEdgeWeight(raw_ostream &OS, Edge E);
- void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
- void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
- bool computeBlockWeights(Function &F);
- void findEquivalenceClasses(Function &F);
- template <bool IsPostDom>
- void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
- DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
-
- void propagateWeights(Function &F);
- uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
- void buildEdges(Function &F);
- std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
+ // Inline cold/small functions in addition to hot ones
+ bool shouldInlineColdCallee(CallBase &CallInst);
+ void emitOptimizationRemarksForInlineCandidates(
+ const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
+ bool Hot);
+ void printEdgeWeight(raw_ostream &OS, Edge E);
+ void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
+ void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
+ bool computeBlockWeights(Function &F);
+ void findEquivalenceClasses(Function &F);
+ template <bool IsPostDom>
+ void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
+
+ void propagateWeights(Function &F);
+ uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
+ void buildEdges(Function &F);
+ std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples);
void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
- bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
- void computeDominanceAndLoopInfo(Function &F);
- void clearFunctionData();
- bool callsiteIsHot(const FunctionSamples *CallsiteFS,
- ProfileSummaryInfo *PSI);
-
- /// Map basic blocks to their computed weights.
- ///
- /// The weight of a basic block is defined to be the maximum
- /// of all the instruction weights in that block.
- BlockWeightMap BlockWeights;
-
- /// Map edges to their computed weights.
- ///
- /// Edge weights are computed by propagating basic block weights in
- /// SampleProfile::propagateWeights.
- EdgeWeightMap EdgeWeights;
-
- /// Set of visited blocks during propagation.
- SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
-
- /// Set of visited edges during propagation.
- SmallSet<Edge, 32> VisitedEdges;
-
- /// Equivalence classes for block weights.
- ///
- /// Two blocks BB1 and BB2 are in the same equivalence class if they
- /// dominate and post-dominate each other, and they are in the same loop
- /// nest. When this happens, the two blocks are guaranteed to execute
- /// the same number of times.
- EquivalenceClassMap EquivalenceClass;
-
- /// Map from function name to Function *. Used to find the function from
- /// the function name. If the function name contains a suffix, an additional
- /// entry is added to map from the stripped name to the function when the
- /// mapping is one-to-one.
- StringMap<Function *> SymbolMap;
-
- /// Dominance, post-dominance and loop information.
- std::unique_ptr<DominatorTree> DT;
- std::unique_ptr<PostDominatorTree> PDT;
- std::unique_ptr<LoopInfo> LI;
-
- std::function<AssumptionCache &(Function &)> GetAC;
- std::function<TargetTransformInfo &(Function &)> GetTTI;
- std::function<const TargetLibraryInfo &(Function &)> GetTLI;
-
- /// Predecessors for each basic block in the CFG.
- BlockEdgeMap Predecessors;
-
- /// Successors for each basic block in the CFG.
- BlockEdgeMap Successors;
-
- SampleCoverageTracker CoverageTracker;
-
- /// Profile reader object.
- std::unique_ptr<SampleProfileReader> Reader;
-
+ bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
+ void computeDominanceAndLoopInfo(Function &F);
+ void clearFunctionData();
+ bool callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI);
+
+ /// Map basic blocks to their computed weights.
+ ///
+ /// The weight of a basic block is defined to be the maximum
+ /// of all the instruction weights in that block.
+ BlockWeightMap BlockWeights;
+
+ /// Map edges to their computed weights.
+ ///
+ /// Edge weights are computed by propagating basic block weights in
+ /// SampleProfile::propagateWeights.
+ EdgeWeightMap EdgeWeights;
+
+ /// Set of visited blocks during propagation.
+ SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
+
+ /// Set of visited edges during propagation.
+ SmallSet<Edge, 32> VisitedEdges;
+
+ /// Equivalence classes for block weights.
+ ///
+ /// Two blocks BB1 and BB2 are in the same equivalence class if they
+ /// dominate and post-dominate each other, and they are in the same loop
+ /// nest. When this happens, the two blocks are guaranteed to execute
+ /// the same number of times.
+ EquivalenceClassMap EquivalenceClass;
+
+ /// Map from function name to Function *. Used to find the function from
+ /// the function name. If the function name contains a suffix, an additional
+ /// entry is added to map from the stripped name to the function when the
+ /// mapping is one-to-one.
+ StringMap<Function *> SymbolMap;
+
+ /// Dominance, post-dominance and loop information.
+ std::unique_ptr<DominatorTree> DT;
+ std::unique_ptr<PostDominatorTree> PDT;
+ std::unique_ptr<LoopInfo> LI;
+
+ std::function<AssumptionCache &(Function &)> GetAC;
+ std::function<TargetTransformInfo &(Function &)> GetTTI;
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI;
+
+ /// Predecessors for each basic block in the CFG.
+ BlockEdgeMap Predecessors;
+
+ /// Successors for each basic block in the CFG.
+ BlockEdgeMap Successors;
+
+ SampleCoverageTracker CoverageTracker;
+
+ /// Profile reader object.
+ std::unique_ptr<SampleProfileReader> Reader;
+
/// Profile tracker for different context.
std::unique_ptr<SampleContextTracker> ContextTracker;
- /// Samples collected for the body of this function.
- FunctionSamples *Samples = nullptr;
-
- /// Name of the profile file to load.
- std::string Filename;
-
- /// Name of the profile remapping file to load.
- std::string RemappingFilename;
-
- /// Flag indicating whether the profile input loaded successfully.
- bool ProfileIsValid = false;
-
+ /// Samples collected for the body of this function.
+ FunctionSamples *Samples = nullptr;
+
+ /// Name of the profile file to load.
+ std::string Filename;
+
+ /// Name of the profile remapping file to load.
+ std::string RemappingFilename;
+
+ /// Flag indicating whether the profile input loaded successfully.
+ bool ProfileIsValid = false;
+
/// Flag indicating whether input profile is context-sensitive
bool ProfileIsCS = false;
/// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
- ///
+ ///
/// We need to know the LTO phase because for example in ThinLTOPrelink
/// phase, in annotation, we should not promote indirect calls. Instead,
/// we will mark GUIDs that needs to be annotated to the function.
ThinOrFullLTOPhase LTOPhase;
-
- /// Profile Summary Info computed from sample profile.
- ProfileSummaryInfo *PSI = nullptr;
-
- /// Profile symbol list tells whether a function name appears in the binary
- /// used to generate the current profile.
- std::unique_ptr<ProfileSymbolList> PSL;
-
- /// Total number of samples collected in this profile.
- ///
- /// This is the sum of all the samples collected in all the functions executed
- /// at runtime.
- uint64_t TotalCollectedSamples = 0;
-
- /// Optimization Remark Emitter used to emit diagnostic remarks.
- OptimizationRemarkEmitter *ORE = nullptr;
-
- // Information recorded when we declined to inline a call site because we
- // determined it was too cold, accumulated for each callee function.
- // Initially this is just the entry count.
- struct NotInlinedProfileInfo {
- uint64_t entryCount;
- };
- DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
-
- // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
- // all the function symbols defined or declared in current module.
- DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
-
- // All the Names used in FunctionSamples including outline function
- // names, inline instance names and call target names.
- StringSet<> NamesInProfile;
-
- // For symbols in the profile symbol list, whether to regard their profiles
- // as accurate. This is mainly decided by the existence of a profile symbol
- // list and the -profile-accurate-for-symsinlist flag, but it can be
- // overridden by -profile-sample-accurate or the profile-sample-accurate
- // attribute.
- bool ProfAccForSymsInList;
+
+ /// Profile Summary Info computed from sample profile.
+ ProfileSummaryInfo *PSI = nullptr;
+
+ /// Profile symbol list tells whether a function name appears in the binary
+ /// used to generate the current profile.
+ std::unique_ptr<ProfileSymbolList> PSL;
+
+ /// Total number of samples collected in this profile.
+ ///
+ /// This is the sum of all the samples collected in all the functions executed
+ /// at runtime.
+ uint64_t TotalCollectedSamples = 0;
+
+ /// Optimization Remark Emitter used to emit diagnostic remarks.
+ OptimizationRemarkEmitter *ORE = nullptr;
+
+ // Information recorded when we declined to inline a call site because we
+ // determined it was too cold, accumulated for each callee function.
+ // Initially this is just the entry count.
+ struct NotInlinedProfileInfo {
+ uint64_t entryCount;
+ };
+ DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
+
+ // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
+ // all the function symbols defined or declared in current module.
+ DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
+
+ // All the Names used in FunctionSamples including outline function
+ // names, inline instance names and call target names.
+ StringSet<> NamesInProfile;
+
+ // For symbols in the profile symbol list, whether to regard their profiles
+ // as accurate. This is mainly decided by the existence of a profile symbol
+ // list and the -profile-accurate-for-symsinlist flag, but it can be
+ // overridden by -profile-sample-accurate or the profile-sample-accurate
+ // attribute.
+ bool ProfAccForSymsInList;
// External inline advisor used to replay inline decision from remarks.
std::unique_ptr<ReplayInlineAdvisor> ExternalInlineAdvisor;
// A pseudo probe helper to correlate the imported sample counts.
std::unique_ptr<PseudoProbeManager> ProbeManager;
-};
-
-class SampleProfileLoaderLegacyPass : public ModulePass {
-public:
- // Class identification, replacement for typeinfo
- static char ID;
-
+};
+
+class SampleProfileLoaderLegacyPass : public ModulePass {
+public:
+ // Class identification, replacement for typeinfo
+ static char ID;
+
SampleProfileLoaderLegacyPass(
StringRef Name = SampleProfileFile,
ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
- : ModulePass(ID), SampleLoader(
+ : ModulePass(ID), SampleLoader(
Name, SampleProfileRemappingFile, LTOPhase,
- [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- },
- [&](Function &F) -> TargetTransformInfo & {
- return TTIWP->getTTI(F);
- },
- [&](Function &F) -> TargetLibraryInfo & {
- return TLIWP->getTLI(F);
- }) {
- initializeSampleProfileLoaderLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void dump() { SampleLoader.dump(); }
-
- bool doInitialization(Module &M) override {
- return SampleLoader.doInitialization(M);
- }
-
- StringRef getPassName() const override { return "Sample profile pass"; }
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-
-private:
- SampleProfileLoader SampleLoader;
- AssumptionCacheTracker *ACT = nullptr;
- TargetTransformInfoWrapperPass *TTIWP = nullptr;
- TargetLibraryInfoWrapperPass *TLIWP = nullptr;
-};
-
-} // end anonymous namespace
-
-/// Return true if the given callsite is hot with respect to the hot cutoff threshold.
-///
-/// Functions that were inlined in the original binary will be represented
-/// in the inline stack in the sample profile. If the profile shows that
-/// the original inline decision was "good" (i.e., the callsite is executed
-/// frequently), then we will recreate the inline decision and apply the
-/// profile from the inlined callsite.
-///
-/// To decide whether an inlined callsite is hot, we compare the callsite
-/// sample count with the hot cutoff computed by ProfileSummaryInfo; the
-/// callsite is regarded as hot if the count is above the cutoff value.
-///
-/// When ProfileAccurateForSymsInList is enabled and a profile symbol list
-/// is present, functions that are in the list but have no profile will be
-/// regarded as cold, and much less inlining will happen in the CGSCC inlining
-/// pass. We therefore lower the hot criteria here to allow more early
-/// inlining for warm callsites, which helps performance.
-bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
- ProfileSummaryInfo *PSI) {
- if (!CallsiteFS)
- return false; // The callsite was not inlined in the original binary.
-
- assert(PSI && "PSI is expected to be non null");
- uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
- if (ProfAccForSymsInList)
- return !PSI->isColdCount(CallsiteTotalSamples);
- else
- return PSI->isHotCount(CallsiteTotalSamples);
-}
-
-/// Mark as used the sample record for the given function samples at
-/// (LineOffset, Discriminator).
-///
-/// \returns true if this is the first time we mark the given record.
-bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
- uint32_t LineOffset,
- uint32_t Discriminator,
- uint64_t Samples) {
- LineLocation Loc(LineOffset, Discriminator);
- unsigned &Count = SampleCoverage[FS][Loc];
- bool FirstTime = (++Count == 1);
- if (FirstTime)
- TotalUsedSamples += Samples;
- return FirstTime;
-}
-
-/// Return the number of sample records that were applied from this profile.
-///
-/// This count does not include records from cold inlined callsites.
-unsigned
-SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const {
- auto I = SampleCoverage.find(FS);
-
- // The size of the coverage map for FS represents the number of records
- // that were marked used at least once.
- unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
-
- // If there are inlined callsites in this function, count the samples found
- // in the respective bodies. However, do not bother counting callees with 0
- // total samples; these are callees that were never invoked at runtime.
- for (const auto &I : FS->getCallsiteSamples())
- for (const auto &J : I.second) {
- const FunctionSamples *CalleeSamples = &J.second;
- if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
- Count += countUsedRecords(CalleeSamples, PSI);
- }
-
- return Count;
-}
-
-/// Return the number of sample records in the body of this profile.
-///
-/// This count does not include records from cold inlined callsites.
-unsigned
-SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const {
- unsigned Count = FS->getBodySamples().size();
-
- // Only count records in hot callsites.
- for (const auto &I : FS->getCallsiteSamples())
- for (const auto &J : I.second) {
- const FunctionSamples *CalleeSamples = &J.second;
- if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
- Count += countBodyRecords(CalleeSamples, PSI);
- }
-
- return Count;
-}
-
-/// Return the number of samples collected in the body of this profile.
-///
-/// This count does not include samples from cold inlined callsites.
-uint64_t
-SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const {
- uint64_t Total = 0;
- for (const auto &I : FS->getBodySamples())
- Total += I.second.getSamples();
-
- // Only count samples in hot callsites.
- for (const auto &I : FS->getCallsiteSamples())
- for (const auto &J : I.second) {
- const FunctionSamples *CalleeSamples = &J.second;
- if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
- Total += countBodySamples(CalleeSamples, PSI);
- }
-
- return Total;
-}
-
-/// Return the fraction of sample records used in this profile.
-///
-/// The returned value is an unsigned integer in the range 0-100 indicating
-/// the percentage of sample records that were used while applying this
-/// profile to the associated function.
-unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
- unsigned Total) const {
- assert(Used <= Total &&
- "number of used records cannot exceed the total number of records");
- return Total > 0 ? Used * 100 / Total : 100;
-}
-
-/// Clear all the per-function data used to load samples and propagate weights.
-void SampleProfileLoader::clearFunctionData() {
- BlockWeights.clear();
- EdgeWeights.clear();
- VisitedBlocks.clear();
- VisitedEdges.clear();
- EquivalenceClass.clear();
- DT = nullptr;
- PDT = nullptr;
- LI = nullptr;
- Predecessors.clear();
- Successors.clear();
- CoverageTracker.clear();
-}
-
-#ifndef NDEBUG
-/// Print the weight of edge \p E on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param E Edge to print.
-void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
- OS << "weight[" << E.first->getName() << "->" << E.second->getName()
- << "]: " << EdgeWeights[E] << "\n";
-}
-
-/// Print the equivalence class of block \p BB on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param BB Block to print.
-void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
- const BasicBlock *BB) {
- const BasicBlock *Equiv = EquivalenceClass[BB];
- OS << "equivalence[" << BB->getName()
- << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
-}
-
-/// Print the weight of block \p BB on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param BB Block to print.
-void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
- const BasicBlock *BB) const {
- const auto &I = BlockWeights.find(BB);
- uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
- OS << "weight[" << BB->getName() << "]: " << W << "\n";
-}
-#endif
-
-/// Get the weight for an instruction.
-///
-/// The "weight" of an instruction \p Inst is the number of samples
-/// collected on that instruction at runtime. To retrieve it, we
-/// need to compute the line number of \p Inst relative to the start of its
-/// function. We use HeaderLineno to compute the offset. We then
-/// look up the samples collected for \p Inst using BodySamples.
-///
-/// \param Inst Instruction to query.
-///
-/// \returns the weight of \p Inst.
-ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
+ [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ },
+ [&](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ },
+ [&](Function &F) -> TargetLibraryInfo & {
+ return TLIWP->getTLI(F);
+ }) {
+ initializeSampleProfileLoaderLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void dump() { SampleLoader.dump(); }
+
+ bool doInitialization(Module &M) override {
+ return SampleLoader.doInitialization(M);
+ }
+
+ StringRef getPassName() const override { return "Sample profile pass"; }
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
+
+private:
+ SampleProfileLoader SampleLoader;
+ AssumptionCacheTracker *ACT = nullptr;
+ TargetTransformInfoWrapperPass *TTIWP = nullptr;
+ TargetLibraryInfoWrapperPass *TLIWP = nullptr;
+};
+
+} // end anonymous namespace
+
+/// Return true if the given callsite is hot with respect to the hot cutoff threshold.
+///
+/// Functions that were inlined in the original binary will be represented
+/// in the inline stack in the sample profile. If the profile shows that
+/// the original inline decision was "good" (i.e., the callsite is executed
+/// frequently), then we will recreate the inline decision and apply the
+/// profile from the inlined callsite.
+///
+/// To decide whether an inlined callsite is hot, we compare the callsite
+/// sample count with the hot cutoff computed by ProfileSummaryInfo; the
+/// callsite is regarded as hot if the count is above the cutoff value.
+///
+/// When ProfileAccurateForSymsInList is enabled and a profile symbol list
+/// is present, functions that are in the list but have no profile will be
+/// regarded as cold, and much less inlining will happen in the CGSCC inlining
+/// pass. We therefore lower the hot criteria here to allow more early
+/// inlining for warm callsites, which helps performance.
+bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI) {
+ if (!CallsiteFS)
+ return false; // The callsite was not inlined in the original binary.
+
+ assert(PSI && "PSI is expected to be non null");
+ uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
+ if (ProfAccForSymsInList)
+ return !PSI->isColdCount(CallsiteTotalSamples);
+ else
+ return PSI->isHotCount(CallsiteTotalSamples);
+}
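A minimal sketch of the hotness decision re-added above, with invented cutoff values standing in for what ProfileSummaryInfo would compute; it is not part of the patch and the names are hypothetical.

#include <cstdint>

// Sketch only: approximates callsiteIsHot() with made-up cutoffs instead of
// a real ProfileSummaryInfo.
static bool sketchCallsiteIsHot(uint64_t CallsiteTotalSamples,
                                bool ProfAccForSymsInList) {
  const uint64_t AssumedColdCutoff = 10;  // hypothetical cold-count cutoff
  const uint64_t AssumedHotCutoff = 1000; // hypothetical hot-count cutoff
  if (ProfAccForSymsInList)
    return CallsiteTotalSamples > AssumedColdCutoff; // merely "not cold"
  return CallsiteTotalSamples >= AssumedHotCutoff;   // must clear the hot bar
}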
+
+/// Mark as used the sample record for the given function samples at
+/// (LineOffset, Discriminator).
+///
+/// \returns true if this is the first time we mark the given record.
+bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
+ uint32_t LineOffset,
+ uint32_t Discriminator,
+ uint64_t Samples) {
+ LineLocation Loc(LineOffset, Discriminator);
+ unsigned &Count = SampleCoverage[FS][Loc];
+ bool FirstTime = (++Count == 1);
+ if (FirstTime)
+ TotalUsedSamples += Samples;
+ return FirstTime;
+}
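The bookkeeping above can be illustrated with a stripped-down coverage map: records are keyed by (LineOffset, Discriminator) and samples accumulate only the first time a record is marked. The sketch below uses hypothetical names and is not part of the patch.

#include <cstdint>
#include <map>
#include <utility>

// Sketch-only stand-ins for the coverage map and the used-samples counter.
static std::map<std::pair<uint32_t, uint32_t>, unsigned> SketchCoverage;
static uint64_t SketchTotalUsedSamples = 0;

static bool sketchMarkSamplesUsed(uint32_t LineOffset, uint32_t Discriminator,
                                  uint64_t Samples) {
  unsigned &Count = SketchCoverage[{LineOffset, Discriminator}];
  bool FirstTime = (++Count == 1);
  if (FirstTime)
    SketchTotalUsedSamples += Samples; // samples are only counted once
  return FirstTime;
}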
+
+/// Return the number of sample records that were applied from this profile.
+///
+/// This count does not include records from cold inlined callsites.
+unsigned
+SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
+ auto I = SampleCoverage.find(FS);
+
+ // The size of the coverage map for FS represents the number of records
+ // that were marked used at least once.
+ unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
+
+ // If there are inlined callsites in this function, count the samples found
+ // in the respective bodies. However, do not bother counting callees with 0
+ // total samples; these are callees that were never invoked at runtime.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
+ Count += countUsedRecords(CalleeSamples, PSI);
+ }
+
+ return Count;
+}
+
+/// Return the number of sample records in the body of this profile.
+///
+/// This count does not include records from cold inlined callsites.
+unsigned
+SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
+ unsigned Count = FS->getBodySamples().size();
+
+ // Only count records in hot callsites.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
+ Count += countBodyRecords(CalleeSamples, PSI);
+ }
+
+ return Count;
+}
+
+/// Return the number of samples collected in the body of this profile.
+///
+/// This count does not include samples from cold inlined callsites.
+uint64_t
+SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
+ uint64_t Total = 0;
+ for (const auto &I : FS->getBodySamples())
+ Total += I.second.getSamples();
+
+ // Only count samples in hot callsites.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
+ Total += countBodySamples(CalleeSamples, PSI);
+ }
+
+ return Total;
+}
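A sketch of the recursive accumulation performed by the three counting helpers above, with the profile tree reduced to a bare struct and the hotness check replaced by an invented cutoff; it is not part of the patch.

#include <cstdint>
#include <vector>

// Sketch only: a drastically simplified stand-in for FunctionSamples.
struct SketchSamples {
  uint64_t BodyTotal = 0;
  std::vector<SketchSamples> InlinedCallsites;
};

static uint64_t sketchCountBodySamples(const SketchSamples &FS) {
  uint64_t Total = FS.BodyTotal;
  for (const auto &Callee : FS.InlinedCallsites)
    if (Callee.BodyTotal > 10) // invented stand-in for callsiteIsHot()
      Total += sketchCountBodySamples(Callee);
  return Total;
}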
+
+/// Return the fraction of sample records used in this profile.
+///
+/// The returned value is an unsigned integer in the range 0-100 indicating
+/// the percentage of sample records that were used while applying this
+/// profile to the associated function.
+unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
+ unsigned Total) const {
+ assert(Used <= Total &&
+ "number of used records cannot exceed the total number of records");
+ return Total > 0 ? Used * 100 / Total : 100;
+}
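A worked example of the percentage formula above, assuming plain integer division as in the code; it is not part of the patch.

// 45 used records out of 60 total gives 45 * 100 / 60 == 75, i.e. 75%
// coverage, which would trip a -sample-profile-check-record-coverage=80
// warning.
static_assert(45 * 100 / 60 == 75, "75% record coverage in this example");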
+
+/// Clear all the per-function data used to load samples and propagate weights.
+void SampleProfileLoader::clearFunctionData() {
+ BlockWeights.clear();
+ EdgeWeights.clear();
+ VisitedBlocks.clear();
+ VisitedEdges.clear();
+ EquivalenceClass.clear();
+ DT = nullptr;
+ PDT = nullptr;
+ LI = nullptr;
+ Predecessors.clear();
+ Successors.clear();
+ CoverageTracker.clear();
+}
+
+#ifndef NDEBUG
+/// Print the weight of edge \p E on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param E Edge to print.
+void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
+ OS << "weight[" << E.first->getName() << "->" << E.second->getName()
+ << "]: " << EdgeWeights[E] << "\n";
+}
+
+/// Print the equivalence class of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
+ const BasicBlock *BB) {
+ const BasicBlock *Equiv = EquivalenceClass[BB];
+ OS << "equivalence[" << BB->getName()
+ << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
+}
+
+/// Print the weight of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
+ const BasicBlock *BB) const {
+ const auto &I = BlockWeights.find(BB);
+ uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
+ OS << "weight[" << BB->getName() << "]: " << W << "\n";
+}
+#endif
+
+/// Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// function. We use HeaderLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using BodySamples.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the weight of \p Inst.
+ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
if (FunctionSamples::ProfileIsProbeBased)
return getProbeWeight(Inst);
- const DebugLoc &DLoc = Inst.getDebugLoc();
- if (!DLoc)
- return std::error_code();
-
- const FunctionSamples *FS = findFunctionSamples(Inst);
- if (!FS)
- return std::error_code();
-
- // Ignore all intrinsics, phi nodes and branch instructions.
- // Branch and phi node instructions usually carry debug info from sources
- // outside of the basic block they reside in, so we ignore them during annotation.
- if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
- return std::error_code();
-
- // If a direct call/invoke instruction is inlined in profile
- // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
- // it means that the inlined callsite has no sample, thus the call
- // instruction should have 0 count.
+ const DebugLoc &DLoc = Inst.getDebugLoc();
+ if (!DLoc)
+ return std::error_code();
+
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (!FS)
+ return std::error_code();
+
+ // Ignore all intrinsics, phi nodes and branch instructions.
+ // Branch and phi node instructions usually carry debug info from sources
+ // outside of the basic block they reside in, so we ignore them during annotation.
+ if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
+ return std::error_code();
+
+ // If a direct call/invoke instruction is inlined in profile
+ // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
+ // it means that the inlined callsite has no sample, thus the call
+ // instruction should have 0 count.
if (!ProfileIsCS)
if (const auto *CB = dyn_cast<CallBase>(&Inst))
if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
return 0;
-
- const DILocation *DIL = DLoc;
- uint32_t LineOffset = FunctionSamples::getOffset(DIL);
- uint32_t Discriminator = DIL->getBaseDiscriminator();
- ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
- if (R) {
- bool FirstMark =
- CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
- if (FirstMark) {
- ORE->emit([&]() {
- OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
- Remark << "Applied " << ore::NV("NumSamples", *R);
- Remark << " samples from profile (offset: ";
- Remark << ore::NV("LineOffset", LineOffset);
- if (Discriminator) {
- Remark << ".";
- Remark << ore::NV("Discriminator", Discriminator);
- }
- Remark << ")";
- return Remark;
- });
- }
- LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
- << DIL->getBaseDiscriminator() << ":" << Inst
- << " (line offset: " << LineOffset << "."
- << DIL->getBaseDiscriminator() << " - weight: " << R.get()
- << ")\n");
- }
- return R;
-}
-
+
+ const DILocation *DIL = DLoc;
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+ ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
+ if (R) {
+ bool FirstMark =
+ CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
+ if (FirstMark) {
+ ORE->emit([&]() {
+ OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
+ Remark << "Applied " << ore::NV("NumSamples", *R);
+ Remark << " samples from profile (offset: ";
+ Remark << ore::NV("LineOffset", LineOffset);
+ if (Discriminator) {
+ Remark << ".";
+ Remark << ore::NV("Discriminator", Discriminator);
+ }
+ Remark << ")";
+ return Remark;
+ });
+ }
+ LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
+ << ")\n");
+ }
+ return R;
+}
+
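A minimal sketch of the (line offset, discriminator) lookup described above, with an invented sample table; unlike the real code, misses simply return 0 rather than an error. It is not part of the patch.

#include <cstdint>
#include <map>
#include <utility>

// Sketch only: body samples keyed by (line offset from the function header,
// discriminator). Table contents are made up.
static uint64_t sketchInstWeight(uint32_t InstLine, uint32_t HeaderLine,
                                 uint32_t Discriminator) {
  static const std::map<std::pair<uint32_t, uint32_t>, uint64_t> BodySamples =
      {{{2, 0}, 120}, {{3, 0}, 40}, {{3, 1}, 15}};
  uint32_t LineOffset = InstLine - HeaderLine; // assumes InstLine >= HeaderLine
  auto It = BodySamples.find({LineOffset, Discriminator});
  return It == BodySamples.end() ? 0 : It->second;
}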
ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
assert(FunctionSamples::ProfileIsProbeBased &&
"Profile is not pseudo probe based");
@@ -931,96 +931,96 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
return R;
}
-/// Compute the weight of a basic block.
-///
-/// The weight of basic block \p BB is the maximum weight of all the
-/// instructions in BB.
-///
-/// \param BB The basic block to query.
-///
-/// \returns the weight for \p BB.
-ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
- uint64_t Max = 0;
- bool HasWeight = false;
- for (auto &I : BB->getInstList()) {
- const ErrorOr<uint64_t> &R = getInstWeight(I);
- if (R) {
- Max = std::max(Max, R.get());
- HasWeight = true;
- }
- }
- return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
-}
-
-/// Compute and store the weights of every basic block.
-///
-/// This populates the BlockWeights map by computing
-/// the weights of every basic block in the CFG.
-///
-/// \param F The function to query.
-bool SampleProfileLoader::computeBlockWeights(Function &F) {
- bool Changed = false;
- LLVM_DEBUG(dbgs() << "Block weights\n");
- for (const auto &BB : F) {
- ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
- if (Weight) {
- BlockWeights[&BB] = Weight.get();
- VisitedBlocks.insert(&BB);
- Changed = true;
- }
- LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
- }
-
- return Changed;
-}
-
-/// Get the FunctionSamples for a call instruction.
-///
-/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
-/// instance that the call instruction calls into. It contains
-/// all samples that reside in the inlined instance. We first find the
-/// inlined instance in which the call instruction is from, then we
-/// traverse its children to find the callsite with the matching
-/// location.
-///
-/// \param Inst Call/Invoke instruction to query.
-///
-/// \returns The FunctionSamples pointer to the inlined instance.
-const FunctionSamples *
-SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
- const DILocation *DIL = Inst.getDebugLoc();
- if (!DIL) {
- return nullptr;
- }
-
- StringRef CalleeName;
+/// Compute the weight of a basic block.
+///
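A minimal sketch of the suffix stripping discussed in the mapper above; the real helper is FunctionSamples::getCanonicalFnName, and this stand-in only handles the ".llvm.<hash>" case added by local-to-global promotion. It is not part of the patch.

#include <string>

// Sketch only: drop a ".llvm.<hash>" suffix, if present, to recover the
// name used in the sample profile.
static std::string sketchCanonicalName(const std::string &Name) {
  std::string::size_type Pos = Name.find(".llvm.");
  return Pos == std::string::npos ? Name : Name.substr(0, Pos);
}

// sketchCanonicalName("foo.llvm.123456") == "foo"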
+/// The weight of basic block \p BB is the maximum weight of all the
+/// instructions in BB.
+///
+/// \param BB The basic block to query.
+///
+/// \returns the weight for \p BB.
+ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
+ uint64_t Max = 0;
+ bool HasWeight = false;
+ for (auto &I : BB->getInstList()) {
+ const ErrorOr<uint64_t> &R = getInstWeight(I);
+ if (R) {
+ Max = std::max(Max, R.get());
+ HasWeight = true;
+ }
+ }
+ return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
+}
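A minimal sketch of the block-weight rule above: take the maximum over the instructions that have a weight, and leave the block unknown otherwise. It is not part of the patch.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

// Sketch only: each element is an instruction's weight, or empty if the
// instruction had no usable sample record.
static std::optional<uint64_t>
sketchBlockWeight(const std::vector<std::optional<uint64_t>> &InstWeights) {
  std::optional<uint64_t> Max;
  for (const auto &W : InstWeights)
    if (W)
      Max = std::max(Max.value_or(0), *W);
  return Max;
}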
+
+/// Compute and store the weights of every basic block.
+///
+/// This populates the BlockWeights map by computing
+/// the weights of every basic block in the CFG.
+///
+/// \param F The function to query.
+bool SampleProfileLoader::computeBlockWeights(Function &F) {
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "Block weights\n");
+ for (const auto &BB : F) {
+ ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
+ if (Weight) {
+ BlockWeights[&BB] = Weight.get();
+ VisitedBlocks.insert(&BB);
+ Changed = true;
+ }
+ LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
+ }
+
+ return Changed;
+}
+
+/// Get the FunctionSamples for a call instruction.
+///
+/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
+/// instance that the call instruction calls into. It contains
+/// all samples that reside in the inlined instance. We first find the
+/// inlined instance in which the call instruction is from, then we
+/// traverse its children to find the callsite with the matching
+/// location.
+///
+/// \param Inst Call/Invoke instruction to query.
+///
+/// \returns The FunctionSamples pointer to the inlined instance.
+const FunctionSamples *
+SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ if (!DIL) {
+ return nullptr;
+ }
+
+ StringRef CalleeName;
if (Function *Callee = Inst.getCalledFunction())
CalleeName = FunctionSamples::getCanonicalFnName(*Callee);
-
+
if (ProfileIsCS)
return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
- const FunctionSamples *FS = findFunctionSamples(Inst);
- if (FS == nullptr)
- return nullptr;
-
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return nullptr;
+
return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
CalleeName, Reader->getRemapper());
-}
-
-/// Returns a vector of FunctionSamples that are the indirect call targets
-/// of \p Inst. The vector is sorted by the total number of samples. Stores
-/// the total call count of the indirect call in \p Sum.
-std::vector<const FunctionSamples *>
-SampleProfileLoader::findIndirectCallFunctionSamples(
- const Instruction &Inst, uint64_t &Sum) const {
- const DILocation *DIL = Inst.getDebugLoc();
- std::vector<const FunctionSamples *> R;
-
- if (!DIL) {
- return R;
- }
-
+}
+
+/// Returns a vector of FunctionSamples that are the indirect call targets
+/// of \p Inst. The vector is sorted by the total number of samples. Stores
+/// the total call count of the indirect call in \p Sum.
+std::vector<const FunctionSamples *>
+SampleProfileLoader::findIndirectCallFunctionSamples(
+ const Instruction &Inst, uint64_t &Sum) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ std::vector<const FunctionSamples *> R;
+
+ if (!DIL) {
+ return R;
+ }
+
auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
assert(L && R && "Expect non-null FunctionSamples");
if (L->getEntrySamples() != R->getEntrySamples())
@@ -1046,50 +1046,50 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
return R;
}
- const FunctionSamples *FS = findFunctionSamples(Inst);
- if (FS == nullptr)
- return R;
-
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return R;
+
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
auto T = FS->findCallTargetMapAt(CallSite);
- Sum = 0;
- if (T)
- for (const auto &T_C : T.get())
- Sum += T_C.second;
+ Sum = 0;
+ if (T)
+ for (const auto &T_C : T.get())
+ Sum += T_C.second;
if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
- if (M->empty())
- return R;
- for (const auto &NameFS : *M) {
- Sum += NameFS.second.getEntrySamples();
- R.push_back(&NameFS.second);
- }
+ if (M->empty())
+ return R;
+ for (const auto &NameFS : *M) {
+ Sum += NameFS.second.getEntrySamples();
+ R.push_back(&NameFS.second);
+ }
llvm::sort(R, FSCompare);
- }
- return R;
-}
-
-/// Get the FunctionSamples for an instruction.
-///
-/// The FunctionSamples of an instruction \p Inst is the inlined instance
-/// that the instruction comes from. We traverse the inline stack
-/// of that instruction, and match it with the tree nodes in the profile.
-///
-/// \param Inst Instruction to query.
-///
-/// \returns the FunctionSamples pointer to the inlined instance.
-const FunctionSamples *
-SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
+ }
+ return R;
+}
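
The ordering used above (targets sorted by sample count, with Sum accumulating the total call count) can be restated as a small self-contained sketch; the ToyCalleeProfile struct and function name below are hypothetical stand-ins for FunctionSamples, not the actual interfaces.

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct ToyCalleeProfile {
  std::string Name;
  uint64_t EntrySamples;
};

// Accumulate the total call count into Sum and return the candidate callee
// profiles sorted by entry samples, largest first (name breaks ties so the
// order is deterministic).
std::vector<ToyCalleeProfile>
toySortIndirectTargets(std::vector<ToyCalleeProfile> Targets, uint64_t &Sum) {
  Sum = 0;
  for (const auto &T : Targets)
    Sum += T.EntrySamples;
  std::sort(Targets.begin(), Targets.end(),
            [](const ToyCalleeProfile &L, const ToyCalleeProfile &R) {
              if (L.EntrySamples != R.EntrySamples)
                return L.EntrySamples > R.EntrySamples;
              return L.Name < R.Name;
            });
  return Targets;
}
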
+
+/// Get the FunctionSamples for an instruction.
+///
+/// The FunctionSamples of an instruction \p Inst is the inlined instance
+/// that the instruction comes from. We traverse the inline stack
+/// of that instruction, and match it with the tree nodes in the profile.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the FunctionSamples pointer to the inlined instance.
+const FunctionSamples *
+SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
if (FunctionSamples::ProfileIsProbeBased) {
Optional<PseudoProbe> Probe = extractProbe(Inst);
if (!Probe)
return nullptr;
}
- const DILocation *DIL = Inst.getDebugLoc();
- if (!DIL)
- return Samples;
-
- auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
+ const DILocation *DIL = Inst.getDebugLoc();
+ if (!DIL)
+ return Samples;
+
+ auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
if (it.second) {
if (ProfileIsCS)
it.first->second = ContextTracker->getContextSamplesFor(DIL);
@@ -1097,9 +1097,9 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
it.first->second =
Samples->findFunctionSamples(DIL, Reader->getRemapper());
}
- return it.first->second;
-}
-
+ return it.first->second;
+}
+
/// Attempt to promote indirect call and also inline the promoted call.
///
/// \param F Caller function.
@@ -1158,175 +1158,175 @@ bool SampleProfileLoader::tryPromoteAndInlineCandidate(
LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
<< Candidate.CalleeSamples->getFuncName() << " because "
<< Reason << "\n");
- }
- return false;
-}
-
-bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
- if (!ProfileSizeInline)
- return false;
-
- Function *Callee = CallInst.getCalledFunction();
- if (Callee == nullptr)
- return false;
-
- InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
- GetAC, GetTLI);
-
+ }
+ return false;
+}
+
+bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
+ if (!ProfileSizeInline)
+ return false;
+
+ Function *Callee = CallInst.getCalledFunction();
+ if (Callee == nullptr)
+ return false;
+
+ InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
+ GetAC, GetTLI);
+
if (Cost.isNever())
return false;
if (Cost.isAlways())
return true;
- return Cost.getCost() <= SampleColdCallSiteThreshold;
-}
-
-void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
- const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
- bool Hot) {
- for (auto I : Candidates) {
- Function *CalledFunction = I->getCalledFunction();
- if (CalledFunction) {
- ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
- I->getDebugLoc(), I->getParent())
- << "previous inlining reattempted for "
- << (Hot ? "hotness: '" : "size: '")
- << ore::NV("Callee", CalledFunction) << "' into '"
- << ore::NV("Caller", &F) << "'");
- }
- }
-}
-
-/// Iteratively inline hot callsites of a function.
-///
-/// Iteratively traverse all callsites of the function \p F, and determine
-/// whether the corresponding inlined instance exists and is hot in the
-/// profile. If it is hot enough, inline the callsite and add the new
-/// callsites of the callee into the caller. If the call is an indirect call,
-/// first promote it to a direct call. Each indirect call is limited to a
-/// single target.
-///
-/// \param F function to perform iterative inlining.
-/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
-/// inlined in the profiled binary.
-///
-/// \returns True if any inlining happened.
-bool SampleProfileLoader::inlineHotFunctions(
- Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
- DenseSet<Instruction *> PromotedInsns;
-
- // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
- // Profile symbol list is ignored when profile-sample-accurate is on.
- assert((!ProfAccForSymsInList ||
- (!ProfileSampleAccurate &&
- !F.hasFnAttribute("profile-sample-accurate"))) &&
- "ProfAccForSymsInList should be false when profile-sample-accurate "
- "is enabled");
-
+ return Cost.getCost() <= SampleColdCallSiteThreshold;
+}
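
The decision above reduces to a three-way check on the inline cost; the sketch below restates it with hypothetical types (ToyCostKind, plain ints for the cost and threshold) rather than the real InlineCost interface.

// Never/Always verdicts short-circuit; otherwise a cold callsite is inlined
// only when its cost stays within the size-oriented cold threshold.
enum class ToyCostKind { Never, Always, Variable };

bool toyShouldInlineCold(ToyCostKind Kind, int Cost, int ColdThreshold) {
  if (Kind == ToyCostKind::Never)
    return false;
  if (Kind == ToyCostKind::Always)
    return true;
  return Cost <= ColdThreshold;
}
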
+
+void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
+ const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
+ bool Hot) {
+ for (auto I : Candidates) {
+ Function *CalledFunction = I->getCalledFunction();
+ if (CalledFunction) {
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
+ I->getDebugLoc(), I->getParent())
+ << "previous inlining reattempted for "
+ << (Hot ? "hotness: '" : "size: '")
+ << ore::NV("Callee", CalledFunction) << "' into '"
+ << ore::NV("Caller", &F) << "'");
+ }
+ }
+}
+
+/// Iteratively inline hot callsites of a function.
+///
+/// Iteratively traverse all callsites of the function \p F, and determine
+/// whether the corresponding inlined instance exists and is hot in the
+/// profile. If it is hot enough, inline the callsite and add the new
+/// callsites of the callee into the caller. If the call is an indirect call,
+/// first promote it to a direct call. Each indirect call is limited to a
+/// single target.
+///
+/// \param F function to perform iterative inlining.
+/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
+/// inlined in the profiled binary.
+///
+/// \returns True if any inlining happened.
+bool SampleProfileLoader::inlineHotFunctions(
+ Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+ DenseSet<Instruction *> PromotedInsns;
+
+ // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
+ // Profile symbol list is ignored when profile-sample-accurate is on.
+ assert((!ProfAccForSymsInList ||
+ (!ProfileSampleAccurate &&
+ !F.hasFnAttribute("profile-sample-accurate"))) &&
+ "ProfAccForSymsInList should be false when profile-sample-accurate "
+ "is enabled");
+
DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
- bool Changed = false;
+ bool Changed = false;
bool LocalChanged = true;
while (LocalChanged) {
LocalChanged = false;
- SmallVector<CallBase *, 10> CIS;
- for (auto &BB : F) {
- bool Hot = false;
- SmallVector<CallBase *, 10> AllCandidates;
- SmallVector<CallBase *, 10> ColdCandidates;
- for (auto &I : BB.getInstList()) {
- const FunctionSamples *FS = nullptr;
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
+ SmallVector<CallBase *, 10> CIS;
+ for (auto &BB : F) {
+ bool Hot = false;
+ SmallVector<CallBase *, 10> AllCandidates;
+ SmallVector<CallBase *, 10> ColdCandidates;
+ for (auto &I : BB.getInstList()) {
+ const FunctionSamples *FS = nullptr;
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
"GUIDToFuncNameMap has to be populated");
- AllCandidates.push_back(CB);
+ AllCandidates.push_back(CB);
if (FS->getEntrySamples() > 0 || ProfileIsCS)
LocalNotInlinedCallSites.try_emplace(CB, FS);
- if (callsiteIsHot(FS, PSI))
- Hot = true;
- else if (shouldInlineColdCallee(*CB))
- ColdCandidates.push_back(CB);
- }
- }
- }
+ if (callsiteIsHot(FS, PSI))
+ Hot = true;
+ else if (shouldInlineColdCallee(*CB))
+ ColdCandidates.push_back(CB);
+ }
+ }
+ }
if (Hot || ExternalInlineAdvisor) {
- CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
- emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
- } else {
- CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
- emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
- }
- }
- for (CallBase *I : CIS) {
- Function *CalledFunction = I->getCalledFunction();
+ CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
+ emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
+ } else {
+ CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
+ emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
+ }
+ }
+ for (CallBase *I : CIS) {
+ Function *CalledFunction = I->getCalledFunction();
InlineCandidate Candidate = {
I,
LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I]
: nullptr,
0 /* dummy count */, 1.0 /* dummy distribution factor */};
- // Do not inline recursive calls.
- if (CalledFunction == &F)
- continue;
- if (I->isIndirectCall()) {
- if (PromotedInsns.count(I))
- continue;
- uint64_t Sum;
- for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
+ // Do not inline recursive calls.
+ if (CalledFunction == &F)
+ continue;
+ if (I->isIndirectCall()) {
+ if (PromotedInsns.count(I))
+ continue;
+ uint64_t Sum;
+ for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
uint64_t SumOrigin = Sum;
if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
- PSI->getOrCompHotCountThreshold());
- continue;
- }
- if (!callsiteIsHot(FS, PSI))
- continue;
-
+ FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
+ PSI->getOrCompHotCountThreshold());
+ continue;
+ }
+ if (!callsiteIsHot(FS, PSI))
+ continue;
+
Candidate = {I, FS, FS->getEntrySamples(), 1.0};
if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
PromotedInsns)) {
LocalNotInlinedCallSites.erase(I);
LocalChanged = true;
- }
- }
- } else if (CalledFunction && CalledFunction->getSubprogram() &&
- !CalledFunction->isDeclaration()) {
+ }
+ }
+ } else if (CalledFunction && CalledFunction->getSubprogram() &&
+ !CalledFunction->isDeclaration()) {
if (tryInlineCandidate(Candidate)) {
LocalNotInlinedCallSites.erase(I);
- LocalChanged = true;
- }
+ LocalChanged = true;
+ }
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- findCalleeFunctionSamples(*I)->findInlinedFunctions(
- InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
- }
- }
+ findCalleeFunctionSamples(*I)->findInlinedFunctions(
+ InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
+ }
+ }
Changed |= LocalChanged;
- }
-
+ }
+
   // For CS profile, the profile of a not-inlined context will be merged when
   // the base profile is being retrieved.
if (ProfileIsCS)
return Changed;
- // Accumulate not inlined callsite information into notInlinedSamples
+ // Accumulate not inlined callsite information into notInlinedSamples
for (const auto &Pair : LocalNotInlinedCallSites) {
- CallBase *I = Pair.getFirst();
- Function *Callee = I->getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- continue;
-
- ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
- I->getDebugLoc(), I->getParent())
- << "previous inlining not repeated: '"
- << ore::NV("Callee", Callee) << "' into '"
- << ore::NV("Caller", &F) << "'");
-
- ++NumCSNotInlined;
- const FunctionSamples *FS = Pair.getSecond();
- if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
- continue;
- }
-
- if (ProfileMergeInlinee) {
+ CallBase *I = Pair.getFirst();
+ Function *Callee = I->getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
+ I->getDebugLoc(), I->getParent())
+ << "previous inlining not repeated: '"
+ << ore::NV("Callee", Callee) << "' into '"
+ << ore::NV("Caller", &F) << "'");
+
+ ++NumCSNotInlined;
+ const FunctionSamples *FS = Pair.getSecond();
+ if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
+ continue;
+ }
+
+ if (ProfileMergeInlinee) {
// A function call can be replicated by optimizations like callsite
// splitting or jump threading and the replicates end up sharing the
// sample nested callee profile instead of slicing the original inlinee's
@@ -1337,22 +1337,22 @@ bool SampleProfileLoader::inlineHotFunctions(
// don't have head samples.
const_cast<FunctionSamples *>(FS)->addHeadSamples(
FS->getEntrySamples());
-
+
// Note that we have to do the merge right after processing function.
// This allows OutlineFS's profile to be used for annotation during
// top-down processing of functions' annotation.
FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
OutlineFS->merge(*FS);
}
- } else {
- auto pair =
- notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
- pair.first->second.entryCount += FS->getEntrySamples();
- }
- }
- return Changed;
-}
-
+ } else {
+ auto pair =
+ notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
+ pair.first->second.entryCount += FS->getEntrySamples();
+ }
+ }
+ return Changed;
+}
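
The per-block candidate selection in the loop above can be summarized on its own: if any profiled callsite in the block is hot (or an external inline advisor is installed), every profiled callsite of that block is queued, otherwise only the cold ones that pass the size check are. The ToyCallSite record and helper below are illustrative assumptions, not part of the pass.

#include <vector>

struct ToyCallSite {
  int Id;
  bool Hot;        // result of callsiteIsHot on its inline-instance profile
  bool InlineCold; // result of shouldInlineColdCallee
};

std::vector<ToyCallSite>
toySelectCandidates(const std::vector<ToyCallSite> &BlockCandidates,
                    bool HasExternalAdvisor) {
  bool AnyHot = HasExternalAdvisor;
  std::vector<ToyCallSite> Cold;
  for (const auto &CS : BlockCandidates) {
    if (CS.Hot)
      AnyHot = true;
    else if (CS.InlineCold)
      Cold.push_back(CS);
  }
  // Hot blocks queue every candidate; cold blocks only the size-worthy ones.
  return AnyHot ? BlockCandidates : Cold;
}
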
+
bool SampleProfileLoader::tryInlineCandidate(
InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
@@ -1613,428 +1613,428 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
return Changed;
}
-/// Find equivalence classes for the given block.
-///
-/// This finds all the blocks that are guaranteed to execute the same
-/// number of times as \p BB1. To do this, it traverses all the
-/// descendants of \p BB1 in the dominator or post-dominator tree.
-///
-/// A block BB2 will be in the same equivalence class as \p BB1 if
-/// the following holds:
-///
-/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
-/// is a descendant of \p BB1 in the dominator tree, then BB2 should
-/// dominate BB1 in the post-dominator tree.
-///
-/// 2- Both BB2 and \p BB1 must be in the same loop.
-///
-/// For every block BB2 that meets those two requirements, we set BB2's
-/// equivalence class to \p BB1.
-///
-/// \param BB1 Block to check.
-/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
-/// \param DomTree Opposite dominator tree. If \p Descendants is filled
-/// with blocks from \p BB1's dominator tree, then
-/// this is the post-dominator tree, and vice versa.
-template <bool IsPostDom>
-void SampleProfileLoader::findEquivalencesFor(
- BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
- DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
- const BasicBlock *EC = EquivalenceClass[BB1];
- uint64_t Weight = BlockWeights[EC];
- for (const auto *BB2 : Descendants) {
- bool IsDomParent = DomTree->dominates(BB2, BB1);
- bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
- if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
- EquivalenceClass[BB2] = EC;
- // If BB2 is visited, then the entire EC should be marked as visited.
- if (VisitedBlocks.count(BB2)) {
- VisitedBlocks.insert(EC);
- }
-
- // If BB2 is heavier than BB1, make BB2 have the same weight
- // as BB1.
- //
- // Note that we don't worry about the opposite situation here
- // (when BB2 is lighter than BB1). We will deal with this
- // during the propagation phase. Right now, we just want to
- // make sure that BB1 has the largest weight of all the
- // members of its equivalence set.
- Weight = std::max(Weight, BlockWeights[BB2]);
- }
- }
- if (EC == &EC->getParent()->getEntryBlock()) {
- BlockWeights[EC] = Samples->getHeadSamples() + 1;
- } else {
- BlockWeights[EC] = Weight;
- }
-}
-
-/// Find equivalence classes.
-///
-/// Since samples may be missing from blocks, we can fill in the gaps by setting
-/// the weights of all the blocks in the same equivalence class to the same
-/// weight. To compute the concept of equivalence, we use dominance and loop
-/// information. Two blocks B1 and B2 are in the same equivalence class if B1
-/// dominates B2, B2 post-dominates B1 and both are in the same loop.
-///
-/// \param F The function to query.
-void SampleProfileLoader::findEquivalenceClasses(Function &F) {
- SmallVector<BasicBlock *, 8> DominatedBBs;
- LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
- // Find equivalence sets based on dominance and post-dominance information.
- for (auto &BB : F) {
- BasicBlock *BB1 = &BB;
-
- // Compute BB1's equivalence class once.
- if (EquivalenceClass.count(BB1)) {
- LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
- continue;
- }
-
- // By default, blocks are in their own equivalence class.
- EquivalenceClass[BB1] = BB1;
-
- // Traverse all the blocks dominated by BB1. We are looking for
- // every basic block BB2 such that:
- //
- // 1- BB1 dominates BB2.
- // 2- BB2 post-dominates BB1.
- // 3- BB1 and BB2 are in the same loop nest.
- //
- // If all those conditions hold, it means that BB2 is executed
- // as many times as BB1, so they are placed in the same equivalence
- // class by making BB2's equivalence class be BB1.
- DominatedBBs.clear();
- DT->getDescendants(BB1, DominatedBBs);
- findEquivalencesFor(BB1, DominatedBBs, PDT.get());
-
- LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
- }
-
- // Assign weights to equivalence classes.
- //
- // All the basic blocks in the same equivalence class will execute
- // the same number of times. Since we know that the head block in
- // each equivalence class has the largest weight, assign that weight
- // to all the blocks in that equivalence class.
- LLVM_DEBUG(
- dbgs() << "\nAssign the same weight to all blocks in the same class\n");
- for (auto &BI : F) {
- const BasicBlock *BB = &BI;
- const BasicBlock *EquivBB = EquivalenceClass[BB];
- if (BB != EquivBB)
- BlockWeights[BB] = BlockWeights[EquivBB];
- LLVM_DEBUG(printBlockWeight(dbgs(), BB));
- }
-}
-
-/// Visit the given edge to decide if it has a valid weight.
-///
-/// If \p E has not been visited before, we copy to \p UnknownEdge
-/// and increment the count of unknown edges.
-///
-/// \param E Edge to visit.
-/// \param NumUnknownEdges Current number of unknown edges.
-/// \param UnknownEdge Set if E has not been visited before.
-///
-/// \returns E's weight, if known. Otherwise, return 0.
-uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
- Edge *UnknownEdge) {
- if (!VisitedEdges.count(E)) {
- (*NumUnknownEdges)++;
- *UnknownEdge = E;
- return 0;
- }
-
- return EdgeWeights[E];
-}
-
-/// Propagate weights through incoming/outgoing edges.
-///
-/// If the weight of a basic block is known, and there is only one edge
-/// with an unknown weight, we can calculate the weight of that edge.
-///
-/// Similarly, if all the edges have a known count, we can calculate the
-/// count of the basic block, if needed.
-///
-/// \param F Function to process.
-/// \param UpdateBlockCount Whether we should update basic block counts that
-///                         have already been annotated.
-///
-/// \returns True if new weights were assigned to edges or blocks.
-bool SampleProfileLoader::propagateThroughEdges(Function &F,
- bool UpdateBlockCount) {
- bool Changed = false;
- LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
- for (const auto &BI : F) {
- const BasicBlock *BB = &BI;
- const BasicBlock *EC = EquivalenceClass[BB];
-
- // Visit all the predecessor and successor edges to determine
- // which ones have a weight assigned already. Note that it doesn't
- // matter that we only keep track of a single unknown edge. The
- // only case we are interested in handling is when only a single
- // edge is unknown (see setEdgeOrBlockWeight).
- for (unsigned i = 0; i < 2; i++) {
- uint64_t TotalWeight = 0;
- unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
- Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
-
- if (i == 0) {
- // First, visit all predecessor edges.
- NumTotalEdges = Predecessors[BB].size();
- for (auto *Pred : Predecessors[BB]) {
- Edge E = std::make_pair(Pred, BB);
- TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
- if (E.first == E.second)
- SelfReferentialEdge = E;
- }
- if (NumTotalEdges == 1) {
- SingleEdge = std::make_pair(Predecessors[BB][0], BB);
- }
- } else {
- // On the second round, visit all successor edges.
- NumTotalEdges = Successors[BB].size();
- for (auto *Succ : Successors[BB]) {
- Edge E = std::make_pair(BB, Succ);
- TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
- }
- if (NumTotalEdges == 1) {
- SingleEdge = std::make_pair(BB, Successors[BB][0]);
- }
- }
-
- // After visiting all the edges, there are three cases that we
- // can handle immediately:
- //
- // - All the edge weights are known (i.e., NumUnknownEdges == 0).
- // In this case, we simply check that the sum of all the edges
- // is the same as BB's weight. If not, we change BB's weight
- // to match. Additionally, if BB had not been visited before,
- // we mark it visited.
- //
- // - Only one edge is unknown and BB has already been visited.
- // In this case, we can compute the weight of the edge by
- // subtracting the total block weight from all the known
-  //     edge weights. If the known edges weigh more than BB, then the
-  //     weight of the last remaining edge is set to zero.
- //
- // - There exists a self-referential edge and the weight of BB is
- // known. In this case, this edge can be based on BB's weight.
- // We add up all the other known edges and set the weight on
- // the self-referential edge as we did in the previous case.
- //
- // In any other case, we must continue iterating. Eventually,
- // all edges will get a weight, or iteration will stop when
- // it reaches SampleProfileMaxPropagateIterations.
- if (NumUnknownEdges <= 1) {
- uint64_t &BBWeight = BlockWeights[EC];
- if (NumUnknownEdges == 0) {
- if (!VisitedBlocks.count(EC)) {
- // If we already know the weight of all edges, the weight of the
- // basic block can be computed. It should be no larger than the sum
- // of all edge weights.
- if (TotalWeight > BBWeight) {
- BBWeight = TotalWeight;
- Changed = true;
- LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
- << " known. Set weight for block: ";
- printBlockWeight(dbgs(), BB););
- }
- } else if (NumTotalEdges == 1 &&
- EdgeWeights[SingleEdge] < BlockWeights[EC]) {
- // If there is only one edge for the visited basic block, use the
- // block weight to adjust edge weight if edge weight is smaller.
- EdgeWeights[SingleEdge] = BlockWeights[EC];
- Changed = true;
- }
- } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
- // If there is a single unknown edge and the block has been
- // visited, then we can compute E's weight.
- if (BBWeight >= TotalWeight)
- EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
- else
- EdgeWeights[UnknownEdge] = 0;
- const BasicBlock *OtherEC;
- if (i == 0)
- OtherEC = EquivalenceClass[UnknownEdge.first];
- else
- OtherEC = EquivalenceClass[UnknownEdge.second];
-          // An edge weight should never exceed the weights of the BBs it connects.
- if (VisitedBlocks.count(OtherEC) &&
- EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
- EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
- VisitedEdges.insert(UnknownEdge);
- Changed = true;
- LLVM_DEBUG(dbgs() << "Set weight for edge: ";
- printEdgeWeight(dbgs(), UnknownEdge));
- }
- } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
-        // If a block weighs 0, all its in/out edges should weigh 0.
- if (i == 0) {
- for (auto *Pred : Predecessors[BB]) {
- Edge E = std::make_pair(Pred, BB);
- EdgeWeights[E] = 0;
- VisitedEdges.insert(E);
- }
- } else {
- for (auto *Succ : Successors[BB]) {
- Edge E = std::make_pair(BB, Succ);
- EdgeWeights[E] = 0;
- VisitedEdges.insert(E);
- }
- }
- } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
- uint64_t &BBWeight = BlockWeights[BB];
- // We have a self-referential edge and the weight of BB is known.
- if (BBWeight >= TotalWeight)
- EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
- else
- EdgeWeights[SelfReferentialEdge] = 0;
- VisitedEdges.insert(SelfReferentialEdge);
- Changed = true;
- LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
- printEdgeWeight(dbgs(), SelfReferentialEdge));
- }
- if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
- BlockWeights[EC] = TotalWeight;
- VisitedBlocks.insert(EC);
- Changed = true;
- }
- }
- }
-
- return Changed;
-}
-
-/// Build in/out edge lists for each basic block in the CFG.
-///
-/// We are interested in unique edges. If a block B1 has multiple
-/// edges to another block B2, we only add a single B1->B2 edge.
-void SampleProfileLoader::buildEdges(Function &F) {
- for (auto &BI : F) {
- BasicBlock *B1 = &BI;
-
- // Add predecessors for B1.
- SmallPtrSet<BasicBlock *, 16> Visited;
- if (!Predecessors[B1].empty())
- llvm_unreachable("Found a stale predecessors list in a basic block.");
- for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
- BasicBlock *B2 = *PI;
- if (Visited.insert(B2).second)
- Predecessors[B1].push_back(B2);
- }
-
- // Add successors for B1.
- Visited.clear();
- if (!Successors[B1].empty())
- llvm_unreachable("Found a stale successors list in a basic block.");
- for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
- BasicBlock *B2 = *SI;
- if (Visited.insert(B2).second)
- Successors[B1].push_back(B2);
- }
- }
-}
-
-/// Returns the CallTargetMap \p M sorted by count in descending order.
-static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(
- const SampleRecord::CallTargetMap & M) {
- SmallVector<InstrProfValueData, 2> R;
- for (const auto &I : SampleRecord::SortCallTargets(M)) {
- R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
- }
- return R;
-}
-
-/// Propagate weights into edges
-///
-/// The following rules are applied to every block BB in the CFG:
-///
-/// - If BB has a single predecessor/successor, then the weight
-/// of that edge is the weight of the block.
-///
-/// - If all incoming or outgoing edges are known except one, and the
-/// weight of the block is already known, the weight of the unknown
-/// edge will be the weight of the block minus the sum of all the known
-/// edges. If the sum of all the known edges is larger than BB's weight,
-/// we set the unknown edge weight to zero.
-///
-/// - If there is a self-referential edge, and the weight of the block is
-/// known, the weight for that edge is set to the weight of the block
-/// minus the weight of the other incoming edges to that block (if
-/// known).
-void SampleProfileLoader::propagateWeights(Function &F) {
- bool Changed = true;
- unsigned I = 0;
-
- // If BB weight is larger than its corresponding loop's header BB weight,
- // use the BB weight to replace the loop header BB weight.
- for (auto &BI : F) {
- BasicBlock *BB = &BI;
- Loop *L = LI->getLoopFor(BB);
- if (!L) {
- continue;
- }
- BasicBlock *Header = L->getHeader();
- if (Header && BlockWeights[BB] > BlockWeights[Header]) {
- BlockWeights[Header] = BlockWeights[BB];
- }
- }
-
- // Before propagation starts, build, for each block, a list of
- // unique predecessors and successors. This is necessary to handle
- // identical edges in multiway branches. Since we visit all blocks and all
- // edges of the CFG, it is cleaner to build these lists once at the start
- // of the pass.
- buildEdges(F);
-
- // Propagate until we converge or we go past the iteration limit.
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, false);
- }
-
- // The first propagation propagates BB counts from annotated BBs to unknown
-  // BBs. The 2nd propagation pass resets edge weights, and uses all BB weights
- // to propagate edge weights.
- VisitedEdges.clear();
- Changed = true;
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, false);
- }
-
-  // The 3rd propagation pass allows adjusting annotated BB weights that are
- // obviously wrong.
- Changed = true;
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, true);
- }
-
- // Generate MD_prof metadata for every branch instruction using the
- // edge weights computed during propagation.
- LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
- LLVMContext &Ctx = F.getContext();
- MDBuilder MDB(Ctx);
- for (auto &BI : F) {
- BasicBlock *BB = &BI;
-
- if (BlockWeights[BB]) {
- for (auto &I : BB->getInstList()) {
- if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
- continue;
- if (!cast<CallBase>(I).getCalledFunction()) {
- const DebugLoc &DLoc = I.getDebugLoc();
- if (!DLoc)
- continue;
- const DILocation *DIL = DLoc;
- const FunctionSamples *FS = findFunctionSamples(I);
- if (!FS)
- continue;
+/// Find equivalence classes for the given block.
+///
+/// This finds all the blocks that are guaranteed to execute the same
+/// number of times as \p BB1. To do this, it traverses all the
+/// descendants of \p BB1 in the dominator or post-dominator tree.
+///
+/// A block BB2 will be in the same equivalence class as \p BB1 if
+/// the following holds:
+///
+/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
+/// is a descendant of \p BB1 in the dominator tree, then BB2 should
+/// dominate BB1 in the post-dominator tree.
+///
+/// 2- Both BB2 and \p BB1 must be in the same loop.
+///
+/// For every block BB2 that meets those two requirements, we set BB2's
+/// equivalence class to \p BB1.
+///
+/// \param BB1 Block to check.
+/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
+/// \param DomTree Opposite dominator tree. If \p Descendants is filled
+/// with blocks from \p BB1's dominator tree, then
+/// this is the post-dominator tree, and vice versa.
+template <bool IsPostDom>
+void SampleProfileLoader::findEquivalencesFor(
+ BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
+ const BasicBlock *EC = EquivalenceClass[BB1];
+ uint64_t Weight = BlockWeights[EC];
+ for (const auto *BB2 : Descendants) {
+ bool IsDomParent = DomTree->dominates(BB2, BB1);
+ bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
+ if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
+ EquivalenceClass[BB2] = EC;
+ // If BB2 is visited, then the entire EC should be marked as visited.
+ if (VisitedBlocks.count(BB2)) {
+ VisitedBlocks.insert(EC);
+ }
+
+ // If BB2 is heavier than BB1, make BB2 have the same weight
+ // as BB1.
+ //
+ // Note that we don't worry about the opposite situation here
+ // (when BB2 is lighter than BB1). We will deal with this
+ // during the propagation phase. Right now, we just want to
+ // make sure that BB1 has the largest weight of all the
+ // members of its equivalence set.
+ Weight = std::max(Weight, BlockWeights[BB2]);
+ }
+ }
+ if (EC == &EC->getParent()->getEntryBlock()) {
+ BlockWeights[EC] = Samples->getHeadSamples() + 1;
+ } else {
+ BlockWeights[EC] = Weight;
+ }
+}
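
The membership test behind the equivalence classes, stated in isolation: BB2 joins BB1's class when BB1 dominates BB2, BB2 post-dominates BB1, and both sit in the same loop. The toy representation below (string block names, relation sets, loop ids) is an assumption made for the sketch, not how the pass stores this information.

#include <map>
#include <set>
#include <string>
#include <utility>

struct ToyCFGInfo {
  std::set<std::pair<std::string, std::string>> Dominates;     // (A, B): A dominates B
  std::set<std::pair<std::string, std::string>> PostDominates; // (A, B): A post-dominates B
  std::map<std::string, int> LoopId;                           // innermost loop per block
};

bool toySameEquivalenceClass(const ToyCFGInfo &CFG, const std::string &BB1,
                             const std::string &BB2) {
  return CFG.Dominates.count({BB1, BB2}) &&
         CFG.PostDominates.count({BB2, BB1}) &&
         CFG.LoopId.at(BB1) == CFG.LoopId.at(BB2);
}
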
+
+/// Find equivalence classes.
+///
+/// Since samples may be missing from blocks, we can fill in the gaps by setting
+/// the weights of all the blocks in the same equivalence class to the same
+/// weight. To compute the concept of equivalence, we use dominance and loop
+/// information. Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// \param F The function to query.
+void SampleProfileLoader::findEquivalenceClasses(Function &F) {
+ SmallVector<BasicBlock *, 8> DominatedBBs;
+ LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
+ // Find equivalence sets based on dominance and post-dominance information.
+ for (auto &BB : F) {
+ BasicBlock *BB1 = &BB;
+
+ // Compute BB1's equivalence class once.
+ if (EquivalenceClass.count(BB1)) {
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
+ continue;
+ }
+
+ // By default, blocks are in their own equivalence class.
+ EquivalenceClass[BB1] = BB1;
+
+ // Traverse all the blocks dominated by BB1. We are looking for
+ // every basic block BB2 such that:
+ //
+ // 1- BB1 dominates BB2.
+ // 2- BB2 post-dominates BB1.
+ // 3- BB1 and BB2 are in the same loop nest.
+ //
+ // If all those conditions hold, it means that BB2 is executed
+ // as many times as BB1, so they are placed in the same equivalence
+ // class by making BB2's equivalence class be BB1.
+ DominatedBBs.clear();
+ DT->getDescendants(BB1, DominatedBBs);
+ findEquivalencesFor(BB1, DominatedBBs, PDT.get());
+
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
+ }
+
+ // Assign weights to equivalence classes.
+ //
+ // All the basic blocks in the same equivalence class will execute
+ // the same number of times. Since we know that the head block in
+ // each equivalence class has the largest weight, assign that weight
+ // to all the blocks in that equivalence class.
+ LLVM_DEBUG(
+ dbgs() << "\nAssign the same weight to all blocks in the same class\n");
+ for (auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EquivBB = EquivalenceClass[BB];
+ if (BB != EquivBB)
+ BlockWeights[BB] = BlockWeights[EquivBB];
+ LLVM_DEBUG(printBlockWeight(dbgs(), BB));
+ }
+}
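
Once classes are formed, every member simply inherits the weight of its class head. A standalone restatement with hypothetical string keys (the map types here are an assumption for the sketch):

#include <cstdint>
#include <map>
#include <string>

// ClassOf maps each block to the head of its equivalence class; after the
// loop above, every member takes the head's (maximal) weight.
void toyAssignClassWeights(const std::map<std::string, std::string> &ClassOf,
                           std::map<std::string, uint64_t> &Weights) {
  for (const auto &Entry : ClassOf)
    Weights[Entry.first] = Weights[Entry.second];
}
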
+
+/// Visit the given edge to decide if it has a valid weight.
+///
+/// If \p E has not been visited before, we copy to \p UnknownEdge
+/// and increment the count of unknown edges.
+///
+/// \param E Edge to visit.
+/// \param NumUnknownEdges Current number of unknown edges.
+/// \param UnknownEdge Set if E has not been visited before.
+///
+/// \returns E's weight, if known. Otherwise, return 0.
+uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
+ Edge *UnknownEdge) {
+ if (!VisitedEdges.count(E)) {
+ (*NumUnknownEdges)++;
+ *UnknownEdge = E;
+ return 0;
+ }
+
+ return EdgeWeights[E];
+}
+
+/// Propagate weights through incoming/outgoing edges.
+///
+/// If the weight of a basic block is known, and there is only one edge
+/// with an unknown weight, we can calculate the weight of that edge.
+///
+/// Similarly, if all the edges have a known count, we can calculate the
+/// count of the basic block, if needed.
+///
+/// \param F Function to process.
+/// \param UpdateBlockCount Whether we should update basic block counts that
+///                         have already been annotated.
+///
+/// \returns True if new weights were assigned to edges or blocks.
+bool SampleProfileLoader::propagateThroughEdges(Function &F,
+ bool UpdateBlockCount) {
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
+ for (const auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EC = EquivalenceClass[BB];
+
+ // Visit all the predecessor and successor edges to determine
+ // which ones have a weight assigned already. Note that it doesn't
+ // matter that we only keep track of a single unknown edge. The
+ // only case we are interested in handling is when only a single
+ // edge is unknown (see setEdgeOrBlockWeight).
+ for (unsigned i = 0; i < 2; i++) {
+ uint64_t TotalWeight = 0;
+ unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
+ Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
+
+ if (i == 0) {
+ // First, visit all predecessor edges.
+ NumTotalEdges = Predecessors[BB].size();
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ if (E.first == E.second)
+ SelfReferentialEdge = E;
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(Predecessors[BB][0], BB);
+ }
+ } else {
+ // On the second round, visit all successor edges.
+ NumTotalEdges = Successors[BB].size();
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(BB, Successors[BB][0]);
+ }
+ }
+
+ // After visiting all the edges, there are three cases that we
+ // can handle immediately:
+ //
+ // - All the edge weights are known (i.e., NumUnknownEdges == 0).
+ // In this case, we simply check that the sum of all the edges
+ // is the same as BB's weight. If not, we change BB's weight
+ // to match. Additionally, if BB had not been visited before,
+ // we mark it visited.
+ //
+ // - Only one edge is unknown and BB has already been visited.
+ // In this case, we can compute the weight of the edge by
+ // subtracting the total block weight from all the known
+  //     edge weights. If the known edges weigh more than BB, then the
+  //     weight of the last remaining edge is set to zero.
+ //
+ // - There exists a self-referential edge and the weight of BB is
+ // known. In this case, this edge can be based on BB's weight.
+ // We add up all the other known edges and set the weight on
+ // the self-referential edge as we did in the previous case.
+ //
+ // In any other case, we must continue iterating. Eventually,
+ // all edges will get a weight, or iteration will stop when
+ // it reaches SampleProfileMaxPropagateIterations.
+ if (NumUnknownEdges <= 1) {
+ uint64_t &BBWeight = BlockWeights[EC];
+ if (NumUnknownEdges == 0) {
+ if (!VisitedBlocks.count(EC)) {
+ // If we already know the weight of all edges, the weight of the
+ // basic block can be computed. It should be no larger than the sum
+ // of all edge weights.
+ if (TotalWeight > BBWeight) {
+ BBWeight = TotalWeight;
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
+ << " known. Set weight for block: ";
+ printBlockWeight(dbgs(), BB););
+ }
+ } else if (NumTotalEdges == 1 &&
+ EdgeWeights[SingleEdge] < BlockWeights[EC]) {
+ // If there is only one edge for the visited basic block, use the
+ // block weight to adjust edge weight if edge weight is smaller.
+ EdgeWeights[SingleEdge] = BlockWeights[EC];
+ Changed = true;
+ }
+ } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
+ // If there is a single unknown edge and the block has been
+ // visited, then we can compute E's weight.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[UnknownEdge] = 0;
+ const BasicBlock *OtherEC;
+ if (i == 0)
+ OtherEC = EquivalenceClass[UnknownEdge.first];
+ else
+ OtherEC = EquivalenceClass[UnknownEdge.second];
+          // An edge weight should never exceed the weights of the BBs it connects.
+ if (VisitedBlocks.count(OtherEC) &&
+ EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
+ EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
+ VisitedEdges.insert(UnknownEdge);
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Set weight for edge: ";
+ printEdgeWeight(dbgs(), UnknownEdge));
+ }
+ } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
+        // If a block weighs 0, all its in/out edges should weigh 0.
+ if (i == 0) {
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ } else {
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ }
+ } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
+ uint64_t &BBWeight = BlockWeights[BB];
+ // We have a self-referential edge and the weight of BB is known.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[SelfReferentialEdge] = 0;
+ VisitedEdges.insert(SelfReferentialEdge);
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
+ printEdgeWeight(dbgs(), SelfReferentialEdge));
+ }
+ if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
+ BlockWeights[EC] = TotalWeight;
+ VisitedBlocks.insert(EC);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
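
The central arithmetic of the single-unknown-edge case, isolated as a small example: with the block weight known and all but one edge known, the missing edge receives the remainder, clamped at zero. The helper name is hypothetical.

#include <cstdint>
#include <vector>

uint64_t toySolveUnknownEdge(uint64_t BlockWeight,
                             const std::vector<uint64_t> &KnownEdgeWeights) {
  uint64_t Known = 0;
  for (uint64_t W : KnownEdgeWeights)
    Known += W;
  // Clamp at zero when the known edges already exceed the block weight.
  return BlockWeight >= Known ? BlockWeight - Known : 0;
}

// Example: BlockWeight = 100 with known incoming edges {60, 25} leaves 15
// for the single unknown edge; {70, 50} would clamp it to 0.
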
+
+/// Build in/out edge lists for each basic block in the CFG.
+///
+/// We are interested in unique edges. If a block B1 has multiple
+/// edges to another block B2, we only add a single B1->B2 edge.
+void SampleProfileLoader::buildEdges(Function &F) {
+ for (auto &BI : F) {
+ BasicBlock *B1 = &BI;
+
+ // Add predecessors for B1.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ if (!Predecessors[B1].empty())
+ llvm_unreachable("Found a stale predecessors list in a basic block.");
+ for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
+ BasicBlock *B2 = *PI;
+ if (Visited.insert(B2).second)
+ Predecessors[B1].push_back(B2);
+ }
+
+ // Add successors for B1.
+ Visited.clear();
+ if (!Successors[B1].empty())
+ llvm_unreachable("Found a stale successors list in a basic block.");
+ for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
+ BasicBlock *B2 = *SI;
+ if (Visited.insert(B2).second)
+ Successors[B1].push_back(B2);
+ }
+ }
+}
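
Deduplication of multiway-branch edges is the only subtlety in buildEdges; a minimal standalone equivalent using an insert-returns-bool visited set (names are illustrative):

#include <set>
#include <string>
#include <vector>

// Keep only the first occurrence of each target block, so two switch arms
// jumping to the same block contribute a single CFG edge.
std::vector<std::string>
toyUniqueTargets(const std::vector<std::string> &RawTargets) {
  std::set<std::string> Visited;
  std::vector<std::string> Unique;
  for (const auto &B : RawTargets)
    if (Visited.insert(B).second)
      Unique.push_back(B);
  return Unique;
}
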
+
+/// Returns the CallTargetMap \p M sorted by count in descending order.
+static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(
+ const SampleRecord::CallTargetMap & M) {
+ SmallVector<InstrProfValueData, 2> R;
+ for (const auto &I : SampleRecord::SortCallTargets(M)) {
+ R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
+ }
+ return R;
+}
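
The same descending-by-count ordering, shown on plain (GUID, count) pairs; InstrProfValueData and the GUID hashing are replaced here by assumed simple types.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using ToyValueData = std::pair<uint64_t, uint64_t>; // (callee GUID, count)

std::vector<ToyValueData>
toySortCallTargets(std::vector<ToyValueData> Targets) {
  std::sort(Targets.begin(), Targets.end(),
            [](const ToyValueData &L, const ToyValueData &R) {
              return L.second > R.second; // larger counts first
            });
  return Targets;
}
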
+
+/// Propagate weights into edges
+///
+/// The following rules are applied to every block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all incoming or outgoing edges are known except one, and the
+/// weight of the block is already known, the weight of the unknown
+/// edge will be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+void SampleProfileLoader::propagateWeights(Function &F) {
+ bool Changed = true;
+ unsigned I = 0;
+
+ // If BB weight is larger than its corresponding loop's header BB weight,
+ // use the BB weight to replace the loop header BB weight.
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+ Loop *L = LI->getLoopFor(BB);
+ if (!L) {
+ continue;
+ }
+ BasicBlock *Header = L->getHeader();
+ if (Header && BlockWeights[BB] > BlockWeights[Header]) {
+ BlockWeights[Header] = BlockWeights[BB];
+ }
+ }
+
+ // Before propagation starts, build, for each block, a list of
+ // unique predecessors and successors. This is necessary to handle
+ // identical edges in multiway branches. Since we visit all blocks and all
+ // edges of the CFG, it is cleaner to build these lists once at the start
+ // of the pass.
+ buildEdges(F);
+
+ // Propagate until we converge or we go past the iteration limit.
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+ // The first propagation propagates BB counts from annotated BBs to unknown
+  // BBs. The 2nd propagation pass resets edge weights, and uses all BB weights
+ // to propagate edge weights.
+ VisitedEdges.clear();
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+  // The 3rd propagation pass allows adjusting annotated BB weights that are
+ // obviously wrong.
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, true);
+ }
+
+ // Generate MD_prof metadata for every branch instruction using the
+ // edge weights computed during propagation.
+ LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
+ LLVMContext &Ctx = F.getContext();
+ MDBuilder MDB(Ctx);
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+
+ if (BlockWeights[BB]) {
+ for (auto &I : BB->getInstList()) {
+ if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
+ continue;
+ if (!cast<CallBase>(I).getCalledFunction()) {
+ const DebugLoc &DLoc = I.getDebugLoc();
+ if (!DLoc)
+ continue;
+ const DILocation *DIL = DLoc;
+ const FunctionSamples *FS = findFunctionSamples(I);
+ if (!FS)
+ continue;
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
auto T = FS->findCallTargetMapAt(CallSite);
- if (!T || T.get().empty())
- continue;
+ if (!T || T.get().empty())
+ continue;
// Prorate the callsite counts to reflect what is already done to the
          // callsite, such as ICP or callsite cloning.
if (FunctionSamples::ProfileIsProbeBased) {
@@ -2043,167 +2043,167 @@ void SampleProfileLoader::propagateWeights(Function &F) {
T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
}
}
- SmallVector<InstrProfValueData, 2> SortedCallTargets =
- GetSortedValueDataFromCallTargets(T.get());
- uint64_t Sum;
- findIndirectCallFunctionSamples(I, Sum);
- annotateValueSite(*I.getParent()->getParent()->getParent(), I,
- SortedCallTargets, Sum, IPVK_IndirectCallTarget,
- SortedCallTargets.size());
- } else if (!isa<IntrinsicInst>(&I)) {
- I.setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(
- {static_cast<uint32_t>(BlockWeights[BB])}));
- }
- }
- }
- Instruction *TI = BB->getTerminator();
- if (TI->getNumSuccessors() == 1)
- continue;
- if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
- continue;
-
- DebugLoc BranchLoc = TI->getDebugLoc();
- LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
- << ((BranchLoc) ? Twine(BranchLoc.getLine())
- : Twine("<UNKNOWN LOCATION>"))
- << ".\n");
- SmallVector<uint32_t, 4> Weights;
- uint32_t MaxWeight = 0;
- Instruction *MaxDestInst;
- for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
- BasicBlock *Succ = TI->getSuccessor(I);
- Edge E = std::make_pair(BB, Succ);
- uint64_t Weight = EdgeWeights[E];
- LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
- // Use uint32_t saturated arithmetic to adjust the incoming weights,
- // if needed. Sample counts in profiles are 64-bit unsigned values,
- // but internally branch weights are expressed as 32-bit values.
- if (Weight > std::numeric_limits<uint32_t>::max()) {
- LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
- Weight = std::numeric_limits<uint32_t>::max();
- }
-      // Weight is incremented by one to avoid propagation errors introduced by
- // 0 weights.
- Weights.push_back(static_cast<uint32_t>(Weight + 1));
- if (Weight != 0) {
- if (Weight > MaxWeight) {
- MaxWeight = Weight;
- MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
- }
- }
- }
-
- uint64_t TempWeight;
- // Only set weights if there is at least one non-zero weight.
- // In any other case, let the analyzer set weights.
- // Do not set weights if the weights are present. In ThinLTO, the profile
- // annotation is done twice. If the first annotation already set the
- // weights, the second pass does not need to set it.
- if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
- LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
- TI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(Weights));
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
- << "most popular destination for conditional branches at "
- << ore::NV("CondBranchesLoc", BranchLoc);
- });
- } else {
- LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
- }
- }
-}
-
-/// Get the line number for the function header.
-///
-/// This looks up function \p F in the current compilation unit and
-/// retrieves the line number where the function is defined. This is
-/// line 0 for all the samples read from the profile file. Every line
-/// number is relative to this line.
-///
-/// \param F Function object to query.
-///
-/// \returns the line number where \p F is defined. If it returns 0,
-/// it means that there is no debug information available for \p F.
-unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
- if (DISubprogram *S = F.getSubprogram())
- return S->getLine();
-
- if (NoWarnSampleUnused)
- return 0;
-
- // If the start of \p F is missing, emit a diagnostic to inform the user
- // about the missed opportunity.
- F.getContext().diagnose(DiagnosticInfoSampleProfile(
- "No debug information found in function " + F.getName() +
- ": Function profile not used",
- DS_Warning));
- return 0;
-}
-
-void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
- DT.reset(new DominatorTree);
- DT->recalculate(F);
-
- PDT.reset(new PostDominatorTree(F));
-
- LI.reset(new LoopInfo);
- LI->analyze(*DT);
-}
-
-/// Generate branch weight metadata for all branches in \p F.
-///
-/// Branch weights are computed out of instruction samples using a
-/// propagation heuristic. Propagation proceeds in 3 phases:
-///
-/// 1- Assignment of block weights. All the basic blocks in the function
-/// are initial assigned the same weight as their most frequently
-/// executed instruction.
-///
-/// 2- Creation of equivalence classes. Since samples may be missing from
-/// blocks, we can fill in the gaps by setting the weights of all the
-/// blocks in the same equivalence class to the same weight. To compute
-/// the concept of equivalence, we use dominance and loop information.
-/// Two blocks B1 and B2 are in the same equivalence class if B1
-/// dominates B2, B2 post-dominates B1 and both are in the same loop.
-///
-/// 3- Propagation of block weights into edges. This uses a simple
-/// propagation heuristic. The following rules are applied to every
-/// block BB in the CFG:
-///
-/// - If BB has a single predecessor/successor, then the weight
-/// of that edge is the weight of the block.
-///
-/// - If all the edges are known except one, and the weight of the
-/// block is already known, the weight of the unknown edge will
-/// be the weight of the block minus the sum of all the known
-/// edges. If the sum of all the known edges is larger than BB's weight,
-/// we set the unknown edge weight to zero.
-///
-/// - If there is a self-referential edge, and the weight of the block is
-/// known, the weight for that edge is set to the weight of the block
-/// minus the weight of the other incoming edges to that block (if
-/// known).
-///
-/// Since this propagation is not guaranteed to finalize for every CFG, we
-/// only allow it to proceed for a limited number of iterations (controlled
-/// by -sample-profile-max-propagate-iterations).
-///
-/// FIXME: Try to replace this propagation heuristic with a scheme
-/// that is guaranteed to finalize. A work-list approach similar to
-/// the standard value propagation algorithm used by SSA-CCP might
-/// work here.
-///
-/// Once all the branch weights are computed, we emit the MD_prof
-/// metadata on BB using the computed values for each of its branches.
-///
-/// \param F The function to query.
-///
-/// \returns true if \p F was modified. Returns false, otherwise.
-bool SampleProfileLoader::emitAnnotations(Function &F) {
- bool Changed = false;
-
+ SmallVector<InstrProfValueData, 2> SortedCallTargets =
+ GetSortedValueDataFromCallTargets(T.get());
+ uint64_t Sum;
+ findIndirectCallFunctionSamples(I, Sum);
+ annotateValueSite(*I.getParent()->getParent()->getParent(), I,
+ SortedCallTargets, Sum, IPVK_IndirectCallTarget,
+ SortedCallTargets.size());
+ } else if (!isa<IntrinsicInst>(&I)) {
+ I.setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(
+ {static_cast<uint32_t>(BlockWeights[BB])}));
+ }
+ }
+ }
+ Instruction *TI = BB->getTerminator();
+ if (TI->getNumSuccessors() == 1)
+ continue;
+ if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+ continue;
+
+ DebugLoc BranchLoc = TI->getDebugLoc();
+ LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
+ << ((BranchLoc) ? Twine(BranchLoc.getLine())
+ : Twine("<UNKNOWN LOCATION>"))
+ << ".\n");
+ SmallVector<uint32_t, 4> Weights;
+ uint32_t MaxWeight = 0;
+ Instruction *MaxDestInst;
+ for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
+ BasicBlock *Succ = TI->getSuccessor(I);
+ Edge E = std::make_pair(BB, Succ);
+ uint64_t Weight = EdgeWeights[E];
+ LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
+ // Use uint32_t saturated arithmetic to adjust the incoming weights,
+ // if needed. Sample counts in profiles are 64-bit unsigned values,
+ // but internally branch weights are expressed as 32-bit values.
+ if (Weight > std::numeric_limits<uint32_t>::max()) {
+ LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
+ Weight = std::numeric_limits<uint32_t>::max();
+ }
+      // Weight is incremented by one to avoid propagation errors introduced by
+ // 0 weights.
+ Weights.push_back(static_cast<uint32_t>(Weight + 1));
+ if (Weight != 0) {
+ if (Weight > MaxWeight) {
+ MaxWeight = Weight;
+ MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
+ }
+ }
+ }
+
+ uint64_t TempWeight;
+ // Only set weights if there is at least one non-zero weight.
+ // In any other case, let the analyzer set weights.
+ // Do not set weights if the weights are present. In ThinLTO, the profile
+ // annotation is done twice. If the first annotation already set the
+ // weights, the second pass does not need to set it.
+ if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
+ LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
+ TI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(Weights));
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
+ << "most popular destination for conditional branches at "
+ << ore::NV("CondBranchesLoc", BranchLoc);
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+ }
+ }
+}
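
The branch-weight emission above narrows 64-bit sample counts into 32-bit MD_prof weights and offsets each by one so that zero-count edges stay representable. Below is a self-contained sketch of that conversion; it caps one below UINT32_MAX so the +1 offset cannot wrap, which is a simplification rather than a copy of the pass's exact saturation.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

std::vector<uint32_t>
toyBranchWeights(const std::vector<uint64_t> &EdgeCounts) {
  std::vector<uint32_t> Weights;
  Weights.reserve(EdgeCounts.size());
  for (uint64_t W : EdgeCounts) {
    // Saturate into the 32-bit range, leaving room for the +1 offset.
    uint64_t Capped =
        std::min<uint64_t>(W, std::numeric_limits<uint32_t>::max() - 1);
    Weights.push_back(static_cast<uint32_t>(Capped + 1));
  }
  return Weights;
}
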
+
+/// Get the line number for the function header.
+///
+/// This looks up function \p F in the current compilation unit and
+/// retrieves the line number where the function is defined. This is
+/// line 0 for all the samples read from the profile file. Every line
+/// number is relative to this line.
+///
+/// \param F Function object to query.
+///
+/// \returns the line number where \p F is defined. If it returns 0,
+/// it means that there is no debug information available for \p F.
+unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
+ if (DISubprogram *S = F.getSubprogram())
+ return S->getLine();
+
+ if (NoWarnSampleUnused)
+ return 0;
+
+ // If the start of \p F is missing, emit a diagnostic to inform the user
+ // about the missed opportunity.
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ "No debug information found in function " + F.getName() +
+ ": Function profile not used",
+ DS_Warning));
+ return 0;
+}
+
+void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
+ DT.reset(new DominatorTree);
+ DT->recalculate(F);
+
+ PDT.reset(new PostDominatorTree(F));
+
+ LI.reset(new LoopInfo);
+ LI->analyze(*DT);
+}
+
+/// Generate branch weight metadata for all branches in \p F.
+///
+/// Branch weights are computed out of instruction samples using a
+/// propagation heuristic. Propagation proceeds in 3 phases:
+///
+/// 1- Assignment of block weights. All the basic blocks in the function
+///    are initially assigned the same weight as their most frequently
+/// executed instruction.
+///
+/// 2- Creation of equivalence classes. Since samples may be missing from
+/// blocks, we can fill in the gaps by setting the weights of all the
+/// blocks in the same equivalence class to the same weight. To compute
+/// the concept of equivalence, we use dominance and loop information.
+/// Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// 3- Propagation of block weights into edges. This uses a simple
+/// propagation heuristic. The following rules are applied to every
+/// block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all the edges are known except one, and the weight of the
+/// block is already known, the weight of the unknown edge will
+/// be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+///
+/// Since this propagation is not guaranteed to finalize for every CFG, we
+/// only allow it to proceed for a limited number of iterations (controlled
+/// by -sample-profile-max-propagate-iterations).
+///
+/// FIXME: Try to replace this propagation heuristic with a scheme
+/// that is guaranteed to finalize. A work-list approach similar to
+/// the standard value propagation algorithm used by SSA-CCP might
+/// work here.
+///
+/// Once all the branch weights are computed, we emit the MD_prof
+/// metadata on BB using the computed values for each of its branches.
+///
+/// \param F The function to query.
+///
+/// \returns true if \p F was modified. Returns false, otherwise.
+bool SampleProfileLoader::emitAnnotations(Function &F) {
+ bool Changed = false;
+
if (FunctionSamples::ProfileIsProbeBased) {
if (!ProbeManager->profileIsValid(F, *Samples)) {
LLVM_DEBUG(
@@ -2216,80 +2216,80 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
} else {
if (getFunctionLoc(F) == 0)
return false;
-
+
LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
<< F.getName() << ": " << getFunctionLoc(F) << "\n");
}
-
- DenseSet<GlobalValue::GUID> InlinedGUIDs;
+
+ DenseSet<GlobalValue::GUID> InlinedGUIDs;
if (ProfileIsCS && CallsitePrioritizedInline)
Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
else
Changed |= inlineHotFunctions(F, InlinedGUIDs);
-
- // Compute basic block weights.
- Changed |= computeBlockWeights(F);
-
- if (Changed) {
- // Add an entry count to the function using the samples gathered at the
- // function entry.
- // Sets the GUIDs that are inlined in the profiled binary. This is used
- // for ThinLink to make correct liveness analysis, and also make the IR
- // match the profiled binary before annotation.
- F.setEntryCount(
- ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
- &InlinedGUIDs);
-
- // Compute dominance and loop info needed for propagation.
- computeDominanceAndLoopInfo(F);
-
- // Find equivalence classes.
- findEquivalenceClasses(F);
-
- // Propagate weights to all edges.
- propagateWeights(F);
- }
-
- // If coverage checking was requested, compute it now.
- if (SampleProfileRecordCoverage) {
- unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
- unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
- unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
- if (Coverage < SampleProfileRecordCoverage) {
- F.getContext().diagnose(DiagnosticInfoSampleProfile(
- F.getSubprogram()->getFilename(), getFunctionLoc(F),
- Twine(Used) + " of " + Twine(Total) + " available profile records (" +
- Twine(Coverage) + "%) were applied",
- DS_Warning));
- }
- }
-
- if (SampleProfileSampleCoverage) {
- uint64_t Used = CoverageTracker.getTotalUsedSamples();
- uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
- unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
- if (Coverage < SampleProfileSampleCoverage) {
- F.getContext().diagnose(DiagnosticInfoSampleProfile(
- F.getSubprogram()->getFilename(), getFunctionLoc(F),
- Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
- Twine(Coverage) + "%) were applied",
- DS_Warning));
- }
- }
- return Changed;
-}
-
-char SampleProfileLoaderLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
-
+
+ // Compute basic block weights.
+ Changed |= computeBlockWeights(F);
+
+ if (Changed) {
+ // Add an entry count to the function using the samples gathered at the
+ // function entry.
+ // Sets the GUIDs that are inlined in the profiled binary. This is used
+    // for ThinLink to perform correct liveness analysis, and also to make the IR
+ // match the profiled binary before annotation.
+ F.setEntryCount(
+ ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
+ &InlinedGUIDs);
+
+ // Compute dominance and loop info needed for propagation.
+ computeDominanceAndLoopInfo(F);
+
+ // Find equivalence classes.
+ findEquivalenceClasses(F);
+
+ // Propagate weights to all edges.
+ propagateWeights(F);
+ }
+
+ // If coverage checking was requested, compute it now.
+ if (SampleProfileRecordCoverage) {
+ unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
+ unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileRecordCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile records (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+
+ if (SampleProfileSampleCoverage) {
+ uint64_t Used = CoverageTracker.getTotalUsedSamples();
+ uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileSampleCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+ return Changed;
+}
+
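One of the propagation rules documented above emitAnnotations, "all edges known except one", reduces to saturating arithmetic: the unknown edge gets the block weight minus the sum of the known edges, clamped at zero. A self-contained sketch of just that rule, separate from the pass's actual edge bookkeeping:

#include <cstdint>
#include <numeric>
#include <vector>

// Weight for the single unknown edge of a block whose other edges are known.
static uint64_t inferUnknownEdgeWeight(uint64_t BlockWeight,
                                       const std::vector<uint64_t> &KnownEdges) {
  uint64_t KnownSum =
      std::accumulate(KnownEdges.begin(), KnownEdges.end(), uint64_t(0));
  return KnownSum >= BlockWeight ? 0 : BlockWeight - KnownSum;
}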
+char SampleProfileLoaderLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
+
// Add inlined profile call edges to the call graph.
void SampleProfileLoader::addCallGraphEdges(CallGraph &CG,
const FunctionSamples &Samples) {
@@ -2329,28 +2329,28 @@ void SampleProfileLoader::replaceCallGraphEdges(
}
}
-std::vector<Function *>
-SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
- std::vector<Function *> FunctionOrderList;
- FunctionOrderList.reserve(M.size());
-
- if (!ProfileTopDownLoad || CG == nullptr) {
- if (ProfileMergeInlinee) {
- // Disable ProfileMergeInlinee if profile is not loaded in top down order,
- // because the profile for a function may be used for the profile
- // annotation of its outline copy before the profile merging of its
- // non-inlined inline instances, and that is not the way how
- // ProfileMergeInlinee is supposed to work.
- ProfileMergeInlinee = false;
- }
-
- for (Function &F : M)
- if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
- FunctionOrderList.push_back(&F);
- return FunctionOrderList;
- }
-
- assert(&CG->getModule() == &M);
+std::vector<Function *>
+SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
+ std::vector<Function *> FunctionOrderList;
+ FunctionOrderList.reserve(M.size());
+
+ if (!ProfileTopDownLoad || CG == nullptr) {
+ if (ProfileMergeInlinee) {
+ // Disable ProfileMergeInlinee if profile is not loaded in top down order,
+ // because the profile for a function may be used for the profile
+ // annotation of its outline copy before the profile merging of its
+      // non-inlined inline instances, which is not how ProfileMergeInlinee
+      // is supposed to work.
+ ProfileMergeInlinee = false;
+ }
+
+ for (Function &F : M)
+ if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
+ FunctionOrderList.push_back(&F);
+ return FunctionOrderList;
+ }
+
+ assert(&CG->getModule() == &M);
// Add indirect call edges from profile to augment the static call graph.
// Functions will be processed in a top-down order defined by the static call
@@ -2414,14 +2414,14 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
}
}
- scc_iterator<CallGraph *> CGI = scc_begin(CG);
- while (!CGI.isAtEnd()) {
+ scc_iterator<CallGraph *> CGI = scc_begin(CG);
+ while (!CGI.isAtEnd()) {
uint64_t Start = FunctionOrderList.size();
for (CallGraphNode *Node : *CGI) {
auto *F = Node->getFunction();
- if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
- FunctionOrderList.push_back(F);
- }
+ if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
+ FunctionOrderList.push_back(F);
+ }
// Sort nodes in SCC based on the profile top-down order.
if (!ProfileOrderMap.empty()) {
@@ -2432,9 +2432,9 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
});
}
- ++CGI;
- }
-
+ ++CGI;
+ }
+
LLVM_DEBUG({
dbgs() << "Function processing order:\n";
for (auto F : reverse(FunctionOrderList)) {
@@ -2442,41 +2442,41 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
}
});
- std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
- return FunctionOrderList;
-}
-
+ std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
+ return FunctionOrderList;
+}
+
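buildFunctionOrder above leans on the fact that scc_begin walks call-graph SCCs in bottom-up post-order, so collecting functions in visit order and reversing the list yields the top-down order the loader wants. A stripped-down sketch of that idea, without the profile-based SCC reordering or the "use-sample-profile" filtering:

#include <algorithm>
#include <vector>
#include "llvm/ADT/SCCIterator.h"
#include "llvm/Analysis/CallGraph.h"

static std::vector<llvm::Function *> topDownOrder(llvm::CallGraph &CG) {
  std::vector<llvm::Function *> Order;
  // Bottom-up visit: callees are reached before their callers.
  for (llvm::scc_iterator<llvm::CallGraph *> I = llvm::scc_begin(&CG);
       !I.isAtEnd(); ++I)
    for (llvm::CallGraphNode *Node : *I)
      if (llvm::Function *F = Node->getFunction())
        Order.push_back(F);
  // Reverse so callers come before their callees (top-down).
  std::reverse(Order.begin(), Order.end());
  return Order;
}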
bool SampleProfileLoader::doInitialization(Module &M,
FunctionAnalysisManager *FAM) {
- auto &Ctx = M.getContext();
-
- auto ReaderOrErr =
- SampleProfileReader::create(Filename, Ctx, RemappingFilename);
- if (std::error_code EC = ReaderOrErr.getError()) {
- std::string Msg = "Could not open profile: " + EC.message();
- Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
- return false;
- }
- Reader = std::move(ReaderOrErr.get());
+ auto &Ctx = M.getContext();
+
+ auto ReaderOrErr =
+ SampleProfileReader::create(Filename, Ctx, RemappingFilename);
+ if (std::error_code EC = ReaderOrErr.getError()) {
+ std::string Msg = "Could not open profile: " + EC.message();
+ Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+ return false;
+ }
+ Reader = std::move(ReaderOrErr.get());
Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
- Reader->collectFuncsFrom(M);
+ Reader->collectFuncsFrom(M);
if (std::error_code EC = Reader->read()) {
std::string Msg = "profile reading failed: " + EC.message();
Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
return false;
}
- PSL = Reader->getProfileSymbolList();
-
- // While profile-sample-accurate is on, ignore symbol list.
- ProfAccForSymsInList =
- ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;
- if (ProfAccForSymsInList) {
- NamesInProfile.clear();
- if (auto NameTable = Reader->getNameTable())
- NamesInProfile.insert(NameTable->begin(), NameTable->end());
- }
-
+ PSL = Reader->getProfileSymbolList();
+
+ // While profile-sample-accurate is on, ignore symbol list.
+ ProfAccForSymsInList =
+ ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;
+ if (ProfAccForSymsInList) {
+ NamesInProfile.clear();
+ if (auto NameTable = Reader->getNameTable())
+ NamesInProfile.insert(NameTable->begin(), NameTable->end());
+ }
+
if (FAM && !ProfileInlineReplayFile.empty()) {
ExternalInlineAdvisor = std::make_unique<ReplayInlineAdvisor>(
M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, ProfileInlineReplayFile,
@@ -2512,51 +2512,51 @@ bool SampleProfileLoader::doInitialization(Module &M,
}
}
- return true;
-}
-
-ModulePass *llvm::createSampleProfileLoaderPass() {
- return new SampleProfileLoaderLegacyPass();
-}
-
-ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
- return new SampleProfileLoaderLegacyPass(Name);
-}
-
-bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
- ProfileSummaryInfo *_PSI, CallGraph *CG) {
- GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
-
- PSI = _PSI;
- if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
- M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
- ProfileSummary::PSK_Sample);
- PSI->refresh();
- }
- // Compute the total number of samples collected in this profile.
- for (const auto &I : Reader->getProfiles())
- TotalCollectedSamples += I.second.getTotalSamples();
-
+ return true;
+}
+
+ModulePass *llvm::createSampleProfileLoaderPass() {
+ return new SampleProfileLoaderLegacyPass();
+}
+
+ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
+ return new SampleProfileLoaderLegacyPass(Name);
+}
+
+bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI, CallGraph *CG) {
+ GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
+
+ PSI = _PSI;
+ if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
+ ProfileSummary::PSK_Sample);
+ PSI->refresh();
+ }
+ // Compute the total number of samples collected in this profile.
+ for (const auto &I : Reader->getProfiles())
+ TotalCollectedSamples += I.second.getTotalSamples();
+
auto Remapper = Reader->getRemapper();
- // Populate the symbol map.
- for (const auto &N_F : M.getValueSymbolTable()) {
- StringRef OrigName = N_F.getKey();
- Function *F = dyn_cast<Function>(N_F.getValue());
- if (F == nullptr)
- continue;
- SymbolMap[OrigName] = F;
- auto pos = OrigName.find('.');
- if (pos != StringRef::npos) {
- StringRef NewName = OrigName.substr(0, pos);
- auto r = SymbolMap.insert(std::make_pair(NewName, F));
- // Failiing to insert means there is already an entry in SymbolMap,
- // thus there are multiple functions that are mapped to the same
- // stripped name. In this case of name conflicting, set the value
- // to nullptr to avoid confusion.
- if (!r.second)
- r.first->second = nullptr;
+ // Populate the symbol map.
+ for (const auto &N_F : M.getValueSymbolTable()) {
+ StringRef OrigName = N_F.getKey();
+ Function *F = dyn_cast<Function>(N_F.getValue());
+ if (F == nullptr)
+ continue;
+ SymbolMap[OrigName] = F;
+ auto pos = OrigName.find('.');
+ if (pos != StringRef::npos) {
+ StringRef NewName = OrigName.substr(0, pos);
+ auto r = SymbolMap.insert(std::make_pair(NewName, F));
+      // Failing to insert means there is already an entry in SymbolMap,
+      // thus there are multiple functions that are mapped to the same
+      // stripped name. In case of such a name conflict, set the value
+ // to nullptr to avoid confusion.
+ if (!r.second)
+ r.first->second = nullptr;
OrigName = NewName;
- }
+ }
// Insert the remapped names into SymbolMap.
if (Remapper) {
if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
@@ -2565,129 +2565,129 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
SymbolMap.insert(std::make_pair(*MapName, F));
}
}
- }
-
- bool retval = false;
- for (auto F : buildFunctionOrder(M, CG)) {
- assert(!F->isDeclaration());
- clearFunctionData();
- retval |= runOnFunction(*F, AM);
- }
-
- // Account for cold calls not inlined....
+ }
+
+ bool retval = false;
+ for (auto F : buildFunctionOrder(M, CG)) {
+ assert(!F->isDeclaration());
+ clearFunctionData();
+ retval |= runOnFunction(*F, AM);
+ }
+
+ // Account for cold calls not inlined....
if (!ProfileIsCS)
for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
notInlinedCallInfo)
updateProfileCallee(pair.first, pair.second.entryCount);
-
- return retval;
-}
-
-bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
- ACT = &getAnalysis<AssumptionCacheTracker>();
- TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
-}
-
-bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
+
+ return retval;
+}
+
+bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
+ ACT = &getAnalysis<AssumptionCacheTracker>();
+ TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+ TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
+}
+
+bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
- DILocation2SampleMap.clear();
- // By default the entry count is initialized to -1, which will be treated
- // conservatively by getEntryCount as the same as unknown (None). This is
- // to avoid newly added code to be treated as cold. If we have samples
- // this will be overwritten in emitAnnotations.
- uint64_t initialEntryCount = -1;
-
- ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
- if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
- // initialize all the function entry counts to 0. It means all the
- // functions without profile will be regarded as cold.
- initialEntryCount = 0;
- // profile-sample-accurate is a user assertion which has a higher precedence
- // than symbol list. When profile-sample-accurate is on, ignore symbol list.
- ProfAccForSymsInList = false;
- }
-
- // PSL -- profile symbol list include all the symbols in sampled binary.
- // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
- // old functions without samples being cold, without having to worry
- // about new and hot functions being mistakenly treated as cold.
- if (ProfAccForSymsInList) {
- // Initialize the entry count to 0 for functions in the list.
- if (PSL->contains(F.getName()))
- initialEntryCount = 0;
-
- // Function in the symbol list but without sample will be regarded as
- // cold. To minimize the potential negative performance impact it could
- // have, we want to be a little conservative here saying if a function
- // shows up in the profile, no matter as outline function, inline instance
- // or call targets, treat the function as not being cold. This will handle
- // the cases such as most callsites of a function are inlined in sampled
- // binary but not inlined in current build (because of source code drift,
- // imprecise debug information, or the callsites are all cold individually
- // but not cold accumulatively...), so the outline function showing up as
- // cold in sampled binary will actually not be cold after current build.
- StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
- if (NamesInProfile.count(CanonName))
- initialEntryCount = -1;
- }
-
+ DILocation2SampleMap.clear();
+ // By default the entry count is initialized to -1, which will be treated
+ // conservatively by getEntryCount as the same as unknown (None). This is
+  // to avoid newly added code being treated as cold. If we have samples,
+  // this will be overwritten in emitAnnotations.
+ uint64_t initialEntryCount = -1;
+
+ ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
+ if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
+    // Initialize all the function entry counts to 0, which means all the
+    // functions without a profile will be regarded as cold.
+ initialEntryCount = 0;
+    // profile-sample-accurate is a user assertion which takes precedence over
+    // the symbol list. When profile-sample-accurate is on, ignore the symbol list.
+ ProfAccForSymsInList = false;
+ }
+
+  // PSL -- the profile symbol list includes all the symbols in the sampled
+  // binary. If ProfileAccurateForSymsInList is enabled, PSL is used to treat
+  // old functions without samples as cold, without having to worry
+ // about new and hot functions being mistakenly treated as cold.
+ if (ProfAccForSymsInList) {
+ // Initialize the entry count to 0 for functions in the list.
+ if (PSL->contains(F.getName()))
+ initialEntryCount = 0;
+
+    // A function that is in the symbol list but has no samples will be
+    // regarded as cold. To minimize the potential negative performance
+    // impact, we are a little conservative here: if a function shows up in
+    // the profile at all -- as an outline function, an inline instance, or a
+    // call target -- treat it as not cold. This handles cases where most
+    // callsites of a function are inlined in the sampled binary but not in
+    // the current build (because of source code drift, imprecise debug
+    // information, or callsites that are individually cold but not cold in
+    // aggregate), so an outline function that appears cold in the sampled
+    // binary may actually not be cold in the current build.
+ StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
+ if (NamesInProfile.count(CanonName))
+ initialEntryCount = -1;
+ }
+
// Initialize entry count when the function has no existing entry
// count value.
if (!F.getEntryCount().hasValue())
F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- if (AM) {
- auto &FAM =
- AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
- .getManager();
- ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- } else {
- OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
- ORE = OwnedORE.get();
- }
+ std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
+ if (AM) {
+ auto &FAM =
+ AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
+ .getManager();
+ ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ } else {
+ OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
+ ORE = OwnedORE.get();
+ }
if (ProfileIsCS)
Samples = ContextTracker->getBaseSamplesFor(F);
else
Samples = Reader->getSamplesFor(F);
- if (Samples && !Samples->empty())
- return emitAnnotations(F);
- return false;
-}
-
-PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
- auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
- auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
-
- SampleProfileLoader SampleLoader(
- ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
- ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
- : ProfileRemappingFileName,
+ if (Samples && !Samples->empty())
+ return emitAnnotations(F);
+ return false;
+}
+
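The entry-count policy spelled out in the comments of runOnFunction above boils down to a three-way decision: unknown by default, cold when the user asserts an accurate profile, and cold for listed-but-unsampled functions. A simplified sketch; the boolean parameters are hypothetical and collapse several checks the pass actually performs:

#include <cstdint>

// -1 is treated as "unknown" by getEntryCount; 0 marks the function cold.
static uint64_t chooseInitialEntryCount(bool ProfileSampleAccurate,
                                        bool InProfileSymbolList,
                                        bool AppearsInProfile) {
  if (ProfileSampleAccurate)
    return 0; // the user asserts the profile is complete: no samples => cold
  if (InProfileSymbolList && !AppearsInProfile)
    return 0; // the sampled binary knew the symbol, yet it has no samples
  return static_cast<uint64_t>(-1); // unknown; annotation may override later
}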
+PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+ auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ SampleProfileLoader SampleLoader(
+ ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
+ ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
+ : ProfileRemappingFileName,
LTOPhase, GetAssumptionCache, GetTTI, GetTLI);
-
+
if (!SampleLoader.doInitialization(M, &FAM))
- return PreservedAnalyses::all();
-
- ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
- CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
- if (!SampleLoader.runOnModule(M, &AM, PSI, &CG))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
+ return PreservedAnalyses::all();
+
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+ CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
+ if (!SampleLoader.runOnModule(M, &AM, PSI, &CG))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp
index 1e9cbeac6d..655a7a4049 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -1,88 +1,88 @@
-//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass loops over all of the functions in the input module, looking for
-// dead declarations and removes them. Dead declarations are declarations of
-// functions for which no implementation is available (i.e., declarations for
-// unused library functions).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "strip-dead-prototypes"
-
-STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
-
-static bool stripDeadPrototypes(Module &M) {
- bool MadeChange = false;
-
- // Erase dead function prototypes.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- Function *F = &*I++;
- // Function must be a prototype and unused.
- if (F->isDeclaration() && F->use_empty()) {
- F->eraseFromParent();
- ++NumDeadPrototypes;
- MadeChange = true;
- }
- }
-
- // Erase dead global var prototypes.
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ) {
- GlobalVariable *GV = &*I++;
- // Global must be a prototype and unused.
- if (GV->isDeclaration() && GV->use_empty())
- GV->eraseFromParent();
- }
-
- // Return an indication of whether we changed anything or not.
- return MadeChange;
-}
-
-PreservedAnalyses StripDeadPrototypesPass::run(Module &M,
- ModuleAnalysisManager &) {
- if (stripDeadPrototypes(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-class StripDeadPrototypesLegacyPass : public ModulePass {
-public:
- static char ID; // Pass identification, replacement for typeid
- StripDeadPrototypesLegacyPass() : ModulePass(ID) {
- initializeStripDeadPrototypesLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- return stripDeadPrototypes(M);
- }
-};
-
-} // end anonymous namespace
-
-char StripDeadPrototypesLegacyPass::ID = 0;
-INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes",
- "Strip Unused Function Prototypes", false, false)
-
-ModulePass *llvm::createStripDeadPrototypesPass() {
- return new StripDeadPrototypesLegacyPass();
-}
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for
+// dead declarations and removing them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "strip-dead-prototypes"
+
+STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
+
+static bool stripDeadPrototypes(Module &M) {
+ bool MadeChange = false;
+
+ // Erase dead function prototypes.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function *F = &*I++;
+ // Function must be a prototype and unused.
+ if (F->isDeclaration() && F->use_empty()) {
+ F->eraseFromParent();
+ ++NumDeadPrototypes;
+ MadeChange = true;
+ }
+ }
+
+ // Erase dead global var prototypes.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ) {
+ GlobalVariable *GV = &*I++;
+ // Global must be a prototype and unused.
+ if (GV->isDeclaration() && GV->use_empty())
+ GV->eraseFromParent();
+ }
+
+ // Return an indication of whether we changed anything or not.
+ return MadeChange;
+}
+
+PreservedAnalyses StripDeadPrototypesPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ if (stripDeadPrototypes(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+class StripDeadPrototypesLegacyPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ StripDeadPrototypesLegacyPass() : ModulePass(ID) {
+ initializeStripDeadPrototypesLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return stripDeadPrototypes(M);
+ }
+};
+
+} // end anonymous namespace
+
+char StripDeadPrototypesLegacyPass::ID = 0;
+INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes",
+ "Strip Unused Function Prototypes", false, false)
+
+ModulePass *llvm::createStripDeadPrototypesPass() {
+ return new StripDeadPrototypesLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp
index d35f785a31..4fc71847a0 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp
@@ -1,382 +1,382 @@
-//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The StripSymbols transformation implements code stripping. Specifically, it
-// can delete:
-//
-// * names for virtual registers
-// * symbols for internal globals and functions
-// * debug information
-//
-// Note that this transformation makes code much less readable, so it should
-// only be used in situations where the 'strip' utility would be used, such as
-// reducing code size or making it harder to reverse engineer code.
-//
-//===----------------------------------------------------------------------===//
-
+//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The StripSymbols transformation implements code stripping. Specifically, it
+// can delete:
+//
+// * names for virtual registers
+// * symbols for internal globals and functions
+// * debug information
+//
+// Note that this transformation makes code much less readable, so it should
+// only be used in situations where the 'strip' utility would be used, such as
+// reducing code size or making it harder to reverse engineer code.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/IPO/StripSymbols.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/TypeFinder.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-namespace {
- class StripSymbols : public ModulePass {
- bool OnlyDebugInfo;
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripSymbols(bool ODI = false)
- : ModulePass(ID), OnlyDebugInfo(ODI) {
- initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-
- class StripNonDebugSymbols : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripNonDebugSymbols()
- : ModulePass(ID) {
- initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-
- class StripDebugDeclare : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripDebugDeclare()
- : ModulePass(ID) {
- initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-
- class StripDeadDebugInfo : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripDeadDebugInfo()
- : ModulePass(ID) {
- initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-}
-
-char StripSymbols::ID = 0;
-INITIALIZE_PASS(StripSymbols, "strip",
- "Strip all symbols from a module", false, false)
-
-ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
- return new StripSymbols(OnlyDebugInfo);
-}
-
-char StripNonDebugSymbols::ID = 0;
-INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug",
- "Strip all symbols, except dbg symbols, from a module",
- false, false)
-
-ModulePass *llvm::createStripNonDebugSymbolsPass() {
- return new StripNonDebugSymbols();
-}
-
-char StripDebugDeclare::ID = 0;
-INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare",
- "Strip all llvm.dbg.declare intrinsics", false, false)
-
-ModulePass *llvm::createStripDebugDeclarePass() {
- return new StripDebugDeclare();
-}
-
-char StripDeadDebugInfo::ID = 0;
-INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info",
- "Strip debug info for unused symbols", false, false)
-
-ModulePass *llvm::createStripDeadDebugInfoPass() {
- return new StripDeadDebugInfo();
-}
-
-/// OnlyUsedBy - Return true if V is only used by Usr.
-static bool OnlyUsedBy(Value *V, Value *Usr) {
- for (User *U : V->users())
- if (U != Usr)
- return false;
-
- return true;
-}
-
-static void RemoveDeadConstant(Constant *C) {
- assert(C->use_empty() && "Constant is not dead!");
- SmallPtrSet<Constant*, 4> Operands;
- for (Value *Op : C->operands())
- if (OnlyUsedBy(Op, C))
- Operands.insert(cast<Constant>(Op));
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
- if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals.
- GV->eraseFromParent();
- } else if (!isa<Function>(C)) {
- // FIXME: Why does the type of the constant matter here?
- if (isa<StructType>(C->getType()) || isa<ArrayType>(C->getType()) ||
- isa<VectorType>(C->getType()))
- C->destroyConstant();
- }
-
- // If the constant referenced anything, see if we can delete it as well.
- for (Constant *O : Operands)
- RemoveDeadConstant(O);
-}
-
-// Strip the symbol table of its names.
-//
-static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
- for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
- Value *V = VI->getValue();
- ++VI;
- if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
- if (!PreserveDbgInfo || !V->getName().startswith("llvm.dbg"))
- // Set name to "", removing from symbol table!
- V->setName("");
- }
- }
-}
-
-// Strip any named types of their names.
-static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
- TypeFinder StructTypes;
- StructTypes.run(M, false);
-
- for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
- StructType *STy = StructTypes[i];
- if (STy->isLiteral() || STy->getName().empty()) continue;
-
- if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
- continue;
-
- STy->setName("");
- }
-}
-
-/// Find values that are marked as llvm.used.
-static void findUsedValues(GlobalVariable *LLVMUsed,
- SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
- if (!LLVMUsed) return;
- UsedValues.insert(LLVMUsed);
-
- ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
-
- for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
- if (GlobalValue *GV =
- dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
- UsedValues.insert(GV);
-}
-
-/// StripSymbolNames - Strip symbol names.
-static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
-
- SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
- findUsedValues(M.getGlobalVariable("llvm.used"), llvmUsedValues);
- findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
-
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
- if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)
- if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
- I->setName(""); // Internal symbols can't participate in linkage
- }
-
- for (Function &I : M) {
- if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
- if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
- I.setName(""); // Internal symbols can't participate in linkage
- if (auto *Symtab = I.getValueSymbolTable())
- StripSymtab(*Symtab, PreserveDbgInfo);
- }
-
- // Remove all names from types.
- StripTypeNames(M, PreserveDbgInfo);
-
- return true;
-}
-
-bool StripSymbols::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- bool Changed = false;
- Changed |= StripDebugInfo(M);
- if (!OnlyDebugInfo)
- Changed |= StripSymbolNames(M, false);
- return Changed;
-}
-
-bool StripNonDebugSymbols::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- return StripSymbolNames(M, true);
-}
-
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+namespace {
+ class StripSymbols : public ModulePass {
+ bool OnlyDebugInfo;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripSymbols(bool ODI = false)
+ : ModulePass(ID), OnlyDebugInfo(ODI) {
+ initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripNonDebugSymbols : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripNonDebugSymbols()
+ : ModulePass(ID) {
+ initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripDebugDeclare : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripDebugDeclare()
+ : ModulePass(ID) {
+ initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripDeadDebugInfo : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripDeadDebugInfo()
+ : ModulePass(ID) {
+ initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char StripSymbols::ID = 0;
+INITIALIZE_PASS(StripSymbols, "strip",
+ "Strip all symbols from a module", false, false)
+
+ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
+ return new StripSymbols(OnlyDebugInfo);
+}
+
+char StripNonDebugSymbols::ID = 0;
+INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug",
+ "Strip all symbols, except dbg symbols, from a module",
+ false, false)
+
+ModulePass *llvm::createStripNonDebugSymbolsPass() {
+ return new StripNonDebugSymbols();
+}
+
+char StripDebugDeclare::ID = 0;
+INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare",
+ "Strip all llvm.dbg.declare intrinsics", false, false)
+
+ModulePass *llvm::createStripDebugDeclarePass() {
+ return new StripDebugDeclare();
+}
+
+char StripDeadDebugInfo::ID = 0;
+INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info",
+ "Strip debug info for unused symbols", false, false)
+
+ModulePass *llvm::createStripDeadDebugInfoPass() {
+ return new StripDeadDebugInfo();
+}
+
+/// OnlyUsedBy - Return true if V is only used by Usr.
+static bool OnlyUsedBy(Value *V, Value *Usr) {
+ for (User *U : V->users())
+ if (U != Usr)
+ return false;
+
+ return true;
+}
+
+static void RemoveDeadConstant(Constant *C) {
+ assert(C->use_empty() && "Constant is not dead!");
+ SmallPtrSet<Constant*, 4> Operands;
+ for (Value *Op : C->operands())
+ if (OnlyUsedBy(Op, C))
+ Operands.insert(cast<Constant>(Op));
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals.
+ GV->eraseFromParent();
+ } else if (!isa<Function>(C)) {
+ // FIXME: Why does the type of the constant matter here?
+ if (isa<StructType>(C->getType()) || isa<ArrayType>(C->getType()) ||
+ isa<VectorType>(C->getType()))
+ C->destroyConstant();
+ }
+
+ // If the constant referenced anything, see if we can delete it as well.
+ for (Constant *O : Operands)
+ RemoveDeadConstant(O);
+}
+
+// Strip the symbol table of its names.
+//
+static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
+ for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
+ Value *V = VI->getValue();
+ ++VI;
+ if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
+ if (!PreserveDbgInfo || !V->getName().startswith("llvm.dbg"))
+ // Set name to "", removing from symbol table!
+ V->setName("");
+ }
+ }
+}
+
+// Strip any named types of their names.
+static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
+ TypeFinder StructTypes;
+ StructTypes.run(M, false);
+
+ for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
+ StructType *STy = StructTypes[i];
+ if (STy->isLiteral() || STy->getName().empty()) continue;
+
+ if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
+ continue;
+
+ STy->setName("");
+ }
+}
+
+/// Find values that are marked as llvm.used.
+static void findUsedValues(GlobalVariable *LLVMUsed,
+ SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
+ if (!LLVMUsed) return;
+ UsedValues.insert(LLVMUsed);
+
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+ if (GlobalValue *GV =
+ dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+ UsedValues.insert(GV);
+}
+
+/// StripSymbolNames - Strip symbol names.
+static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
+
+ SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+ findUsedValues(M.getGlobalVariable("llvm.used"), llvmUsedValues);
+ findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)
+ if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
+ I->setName(""); // Internal symbols can't participate in linkage
+ }
+
+ for (Function &I : M) {
+ if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
+ if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
+ I.setName(""); // Internal symbols can't participate in linkage
+ if (auto *Symtab = I.getValueSymbolTable())
+ StripSymtab(*Symtab, PreserveDbgInfo);
+ }
+
+ // Remove all names from types.
+ StripTypeNames(M, PreserveDbgInfo);
+
+ return true;
+}
+
+bool StripSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ bool Changed = false;
+ Changed |= StripDebugInfo(M);
+ if (!OnlyDebugInfo)
+ Changed |= StripSymbolNames(M, false);
+ return Changed;
+}
+
+bool StripNonDebugSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return StripSymbolNames(M, true);
+}
+
static bool stripDebugDeclareImpl(Module &M) {
-
- Function *Declare = M.getFunction("llvm.dbg.declare");
- std::vector<Constant*> DeadConstants;
-
- if (Declare) {
- while (!Declare->use_empty()) {
- CallInst *CI = cast<CallInst>(Declare->user_back());
- Value *Arg1 = CI->getArgOperand(0);
- Value *Arg2 = CI->getArgOperand(1);
- assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
- CI->eraseFromParent();
- if (Arg1->use_empty()) {
- if (Constant *C = dyn_cast<Constant>(Arg1))
- DeadConstants.push_back(C);
- else
- RecursivelyDeleteTriviallyDeadInstructions(Arg1);
- }
- if (Arg2->use_empty())
- if (Constant *C = dyn_cast<Constant>(Arg2))
- DeadConstants.push_back(C);
- }
- Declare->eraseFromParent();
- }
-
- while (!DeadConstants.empty()) {
- Constant *C = DeadConstants.back();
- DeadConstants.pop_back();
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
- if (GV->hasLocalLinkage())
- RemoveDeadConstant(GV);
- } else
- RemoveDeadConstant(C);
- }
-
- return true;
-}
-
+
+ Function *Declare = M.getFunction("llvm.dbg.declare");
+ std::vector<Constant*> DeadConstants;
+
+ if (Declare) {
+ while (!Declare->use_empty()) {
+ CallInst *CI = cast<CallInst>(Declare->user_back());
+ Value *Arg1 = CI->getArgOperand(0);
+ Value *Arg2 = CI->getArgOperand(1);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg1->use_empty()) {
+ if (Constant *C = dyn_cast<Constant>(Arg1))
+ DeadConstants.push_back(C);
+ else
+ RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+ }
+ if (Arg2->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg2))
+ DeadConstants.push_back(C);
+ }
+ Declare->eraseFromParent();
+ }
+
+ while (!DeadConstants.empty()) {
+ Constant *C = DeadConstants.back();
+ DeadConstants.pop_back();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->hasLocalLinkage())
+ RemoveDeadConstant(GV);
+ } else
+ RemoveDeadConstant(C);
+ }
+
+ return true;
+}
+
bool StripDebugDeclare::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
+ if (skipModule(M))
+ return false;
return stripDebugDeclareImpl(M);
}
-
+
static bool stripDeadDebugInfoImpl(Module &M) {
- bool Changed = false;
-
- LLVMContext &C = M.getContext();
-
- // Find all debug info in F. This is actually overkill in terms of what we
- // want to do, but we want to try and be as resilient as possible in the face
- // of potential debug info changes by using the formal interfaces given to us
- // as much as possible.
- DebugInfoFinder F;
- F.processModule(M);
-
- // For each compile unit, find the live set of global variables/functions and
- // replace the current list of potentially dead global variables/functions
- // with the live list.
- SmallVector<Metadata *, 64> LiveGlobalVariables;
- DenseSet<DIGlobalVariableExpression *> VisitedSet;
-
- std::set<DIGlobalVariableExpression *> LiveGVs;
- for (GlobalVariable &GV : M.globals()) {
- SmallVector<DIGlobalVariableExpression *, 1> GVEs;
- GV.getDebugInfo(GVEs);
- for (auto *GVE : GVEs)
- LiveGVs.insert(GVE);
- }
-
- std::set<DICompileUnit *> LiveCUs;
- // Any CU referenced from a subprogram is live.
- for (DISubprogram *SP : F.subprograms()) {
- if (SP->getUnit())
- LiveCUs.insert(SP->getUnit());
- }
-
- bool HasDeadCUs = false;
- for (DICompileUnit *DIC : F.compile_units()) {
- // Create our live global variable list.
- bool GlobalVariableChange = false;
- for (auto *DIG : DIC->getGlobalVariables()) {
- if (DIG->getExpression() && DIG->getExpression()->isConstant())
- LiveGVs.insert(DIG);
-
- // Make sure we only visit each global variable only once.
- if (!VisitedSet.insert(DIG).second)
- continue;
-
- // If a global variable references DIG, the global variable is live.
- if (LiveGVs.count(DIG))
- LiveGlobalVariables.push_back(DIG);
- else
- GlobalVariableChange = true;
- }
-
- if (!LiveGlobalVariables.empty())
- LiveCUs.insert(DIC);
- else if (!LiveCUs.count(DIC))
- HasDeadCUs = true;
-
- // If we found dead global variables, replace the current global
- // variable list with our new live global variable list.
- if (GlobalVariableChange) {
- DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables));
- Changed = true;
- }
-
- // Reset lists for the next iteration.
- LiveGlobalVariables.clear();
- }
-
- if (HasDeadCUs) {
- // Delete the old node and replace it with a new one
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
- NMD->clearOperands();
- if (!LiveCUs.empty()) {
- for (DICompileUnit *CU : LiveCUs)
- NMD->addOperand(CU);
- }
- Changed = true;
- }
-
- return Changed;
-}
+ bool Changed = false;
+
+ LLVMContext &C = M.getContext();
+
+ // Find all debug info in F. This is actually overkill in terms of what we
+  // want to do, but we want to try to be as resilient as possible in the face
+ // of potential debug info changes by using the formal interfaces given to us
+ // as much as possible.
+ DebugInfoFinder F;
+ F.processModule(M);
+
+ // For each compile unit, find the live set of global variables/functions and
+ // replace the current list of potentially dead global variables/functions
+ // with the live list.
+ SmallVector<Metadata *, 64> LiveGlobalVariables;
+ DenseSet<DIGlobalVariableExpression *> VisitedSet;
+
+ std::set<DIGlobalVariableExpression *> LiveGVs;
+ for (GlobalVariable &GV : M.globals()) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+ GV.getDebugInfo(GVEs);
+ for (auto *GVE : GVEs)
+ LiveGVs.insert(GVE);
+ }
+
+ std::set<DICompileUnit *> LiveCUs;
+ // Any CU referenced from a subprogram is live.
+ for (DISubprogram *SP : F.subprograms()) {
+ if (SP->getUnit())
+ LiveCUs.insert(SP->getUnit());
+ }
+
+ bool HasDeadCUs = false;
+ for (DICompileUnit *DIC : F.compile_units()) {
+ // Create our live global variable list.
+ bool GlobalVariableChange = false;
+ for (auto *DIG : DIC->getGlobalVariables()) {
+ if (DIG->getExpression() && DIG->getExpression()->isConstant())
+ LiveGVs.insert(DIG);
+
+      // Make sure we visit each global variable only once.
+ if (!VisitedSet.insert(DIG).second)
+ continue;
+
+ // If a global variable references DIG, the global variable is live.
+ if (LiveGVs.count(DIG))
+ LiveGlobalVariables.push_back(DIG);
+ else
+ GlobalVariableChange = true;
+ }
+
+ if (!LiveGlobalVariables.empty())
+ LiveCUs.insert(DIC);
+ else if (!LiveCUs.count(DIC))
+ HasDeadCUs = true;
+
+ // If we found dead global variables, replace the current global
+ // variable list with our new live global variable list.
+ if (GlobalVariableChange) {
+ DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables));
+ Changed = true;
+ }
+
+ // Reset lists for the next iteration.
+ LiveGlobalVariables.clear();
+ }
+
+ if (HasDeadCUs) {
+ // Delete the old node and replace it with a new one
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+ NMD->clearOperands();
+ if (!LiveCUs.empty()) {
+ for (DICompileUnit *CU : LiveCUs)
+ NMD->addOperand(CU);
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
/// Remove any debug info for global variables/functions in the given module for
/// which said global variable/function no longer exists (i.e. is null).
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
index c29ea77791..1b1e91cafa 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
@@ -1,144 +1,144 @@
-//=- SyntheticCountsPropagation.cpp - Propagate function counts --*- C++ -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a transformation that synthesizes entry counts for
-// functions and attaches !prof metadata to functions with the synthesized
-// counts. The presence of !prof metadata with counter name set to
-// 'synthesized_function_entry_count' indicate that the value of the counter is
-// an estimation of the likely execution count of the function. This transform
-// is applied only in non PGO mode as functions get 'real' profile-based
-// function entry counts in the PGO mode.
-//
-// The transformation works by first assigning some initial values to the entry
-// counts of all functions and then doing a top-down traversal of the
-// callgraph-scc to propagate the counts. For each function the set of callsites
-// and their relative block frequency is gathered. The relative block frequency
-// multiplied by the entry count of the caller and added to the callee's entry
-// count. For non-trivial SCCs, the new counts are computed from the previous
-// counts and updated in one shot.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/SyntheticCountsUtils.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-using Scaled64 = ScaledNumber<uint64_t>;
-using ProfileCount = Function::ProfileCount;
-
-#define DEBUG_TYPE "synthetic-counts-propagation"
-
-/// Initial synthetic count assigned to functions.
-cl::opt<int>
- InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10),
- cl::ZeroOrMore,
- cl::desc("Initial value of synthetic entry count."));
-
-/// Initial synthetic count assigned to inline functions.
-static cl::opt<int> InlineSyntheticCount(
- "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore,
- cl::desc("Initial synthetic entry count for inline functions."));
-
-/// Initial synthetic count assigned to cold functions.
-static cl::opt<int> ColdSyntheticCount(
- "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore,
- cl::desc("Initial synthetic entry count for cold functions."));
-
-// Assign initial synthetic entry counts to functions.
-static void
-initializeCounts(Module &M, function_ref<void(Function *, uint64_t)> SetCount) {
- auto MayHaveIndirectCalls = [](Function &F) {
- for (auto *U : F.users()) {
- if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
- return true;
- }
- return false;
- };
-
- for (Function &F : M) {
- uint64_t InitialCount = InitialSyntheticCount;
- if (F.isDeclaration())
- continue;
- if (F.hasFnAttribute(Attribute::AlwaysInline) ||
- F.hasFnAttribute(Attribute::InlineHint)) {
- // Use a higher value for inline functions to account for the fact that
- // these are usually beneficial to inline.
- InitialCount = InlineSyntheticCount;
- } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) {
- // Local functions without inline hints get counts only through
- // propagation.
- InitialCount = 0;
- } else if (F.hasFnAttribute(Attribute::Cold) ||
- F.hasFnAttribute(Attribute::NoInline)) {
- // Use a lower value for noinline and cold functions.
- InitialCount = ColdSyntheticCount;
- }
- SetCount(&F, InitialCount);
- }
-}
-
-PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
- ModuleAnalysisManager &MAM) {
- FunctionAnalysisManager &FAM =
- MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- DenseMap<Function *, Scaled64> Counts;
- // Set initial entry counts.
- initializeCounts(
- M, [&](Function *F, uint64_t Count) { Counts[F] = Scaled64(Count, 0); });
-
- // Edge includes information about the source. Hence ignore the first
- // parameter.
- auto GetCallSiteProfCount = [&](const CallGraphNode *,
- const CallGraphNode::CallRecord &Edge) {
- Optional<Scaled64> Res = None;
- if (!Edge.first)
- return Res;
- CallBase &CB = *cast<CallBase>(*Edge.first);
- Function *Caller = CB.getCaller();
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
-
- // Now compute the callsite count from relative frequency and
- // entry count:
- BasicBlock *CSBB = CB.getParent();
- Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
- Scaled64 BBCount(BFI.getBlockFreq(CSBB).getFrequency(), 0);
- BBCount /= EntryFreq;
- BBCount *= Counts[Caller];
- return Optional<Scaled64>(BBCount);
- };
-
- CallGraph CG(M);
-  // Propagate the entry counts on the callgraph.
- SyntheticCountsUtils<const CallGraph *>::propagate(
- &CG, GetCallSiteProfCount, [&](const CallGraphNode *N, Scaled64 New) {
- auto F = N->getFunction();
- if (!F || F->isDeclaration())
- return;
-
- Counts[F] += New;
- });
-
- // Set the counts as metadata.
- for (auto Entry : Counts) {
- Entry.first->setEntryCount(ProfileCount(
- Entry.second.template toInt<uint64_t>(), Function::PCT_Synthetic));
- }
-
- return PreservedAnalyses::all();
-}
+//=- SyntheticCountsPropagation.cpp - Propagate function counts --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that synthesizes entry counts for
+// functions and attaches !prof metadata to functions with the synthesized
+// counts. The presence of !prof metadata with counter name set to
+// 'synthesized_function_entry_count' indicates that the value of the counter
+// is an estimate of the likely execution count of the function. This transform
+// is applied only in non-PGO mode, as functions get 'real' profile-based
+// function entry counts in PGO mode.
+//
+// The transformation works by first assigning some initial values to the entry
+// counts of all functions and then doing a top-down traversal of the
+// callgraph SCCs to propagate the counts. For each function, the set of
+// callsites and their relative block frequencies is gathered. The relative
+// block frequency is multiplied by the entry count of the caller and added to
+// the callee's entry count. For non-trivial SCCs, the new counts are computed
+// from the previous counts and updated in one shot.
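+//
+// For example (an illustrative sketch, not taken from any particular module):
+// if a caller has a synthetic entry count of 100 and contains a callsite whose
+// block frequency is twice the entry block frequency, that callsite contributes
+// 100 * 2 = 200 to the callee's entry count, in addition to the callee's own
+// initial count.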
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/SyntheticCountsUtils.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using Scaled64 = ScaledNumber<uint64_t>;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "synthetic-counts-propagation"
+
+/// Initial synthetic count assigned to functions.
+cl::opt<int>
+ InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10),
+ cl::ZeroOrMore,
+ cl::desc("Initial value of synthetic entry count."));
+
+/// Initial synthetic count assigned to inline functions.
+static cl::opt<int> InlineSyntheticCount(
+ "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for inline functions."));
+
+/// Initial synthetic count assigned to cold functions.
+static cl::opt<int> ColdSyntheticCount(
+ "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for cold functions."));
+
+// Assign initial synthetic entry counts to functions.
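+//
+// With the default option values this amounts to (illustrative summary, in the
+// order the checks below are applied): always_inline/inlinehint functions
+// start at 15, internal functions that are only ever called directly start at
+// 0, cold or noinline functions start at 5, and everything else starts at 10.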
+static void
+initializeCounts(Module &M, function_ref<void(Function *, uint64_t)> SetCount) {
+ auto MayHaveIndirectCalls = [](Function &F) {
+ for (auto *U : F.users()) {
+ if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
+ return true;
+ }
+ return false;
+ };
+
+ for (Function &F : M) {
+ uint64_t InitialCount = InitialSyntheticCount;
+ if (F.isDeclaration())
+ continue;
+ if (F.hasFnAttribute(Attribute::AlwaysInline) ||
+ F.hasFnAttribute(Attribute::InlineHint)) {
+ // Use a higher value for inline functions to account for the fact that
+ // these are usually beneficial to inline.
+ InitialCount = InlineSyntheticCount;
+ } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) {
+ // Local functions without inline hints get counts only through
+ // propagation.
+ InitialCount = 0;
+ } else if (F.hasFnAttribute(Attribute::Cold) ||
+ F.hasFnAttribute(Attribute::NoInline)) {
+ // Use a lower value for noinline and cold functions.
+ InitialCount = ColdSyntheticCount;
+ }
+ SetCount(&F, InitialCount);
+ }
+}
+
+PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ DenseMap<Function *, Scaled64> Counts;
+ // Set initial entry counts.
+ initializeCounts(
+ M, [&](Function *F, uint64_t Count) { Counts[F] = Scaled64(Count, 0); });
+
+ // Edge includes information about the source. Hence ignore the first
+ // parameter.
+ auto GetCallSiteProfCount = [&](const CallGraphNode *,
+ const CallGraphNode::CallRecord &Edge) {
+ Optional<Scaled64> Res = None;
+ if (!Edge.first)
+ return Res;
+ CallBase &CB = *cast<CallBase>(*Edge.first);
+ Function *Caller = CB.getCaller();
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+
+ // Now compute the callsite count from relative frequency and
+ // entry count:
+ BasicBlock *CSBB = CB.getParent();
+ Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
+ Scaled64 BBCount(BFI.getBlockFreq(CSBB).getFrequency(), 0);
+ BBCount /= EntryFreq;
+ BBCount *= Counts[Caller];
+ return Optional<Scaled64>(BBCount);
+ };
+
+ CallGraph CG(M);
+  // Propagate the entry counts on the callgraph.
+ SyntheticCountsUtils<const CallGraph *>::propagate(
+ &CG, GetCallSiteProfCount, [&](const CallGraphNode *N, Scaled64 New) {
+ auto F = N->getFunction();
+ if (!F || F->isDeclaration())
+ return;
+
+ Counts[F] += New;
+ });
+
+ // Set the counts as metadata.
+ for (auto Entry : Counts) {
+ Entry.first->setEntryCount(ProfileCount(
+ Entry.second.template toInt<uint64_t>(), Function::PCT_Synthetic));
+ }
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 24891b3392..225b4fe95f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -1,549 +1,549 @@
-//===- ThinLTOBitcodeWriter.cpp - Bitcode writing pass for ThinLTO --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/ModuleSummaryAnalysis.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+//===- ThinLTOBitcodeWriter.cpp - Bitcode writing pass for ThinLTO --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Object/ModuleSymbolTable.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/IPO/FunctionImport.h"
-#include "llvm/Transforms/IPO/LowerTypeTests.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-using namespace llvm;
-
-namespace {
-
-// Promote each local-linkage entity defined by ExportM and used by ImportM by
-// changing visibility and appending the given ModuleId.
-void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
- SetVector<GlobalValue *> &PromoteExtra) {
- DenseMap<const Comdat *, Comdat *> RenamedComdats;
- for (auto &ExportGV : ExportM.global_values()) {
- if (!ExportGV.hasLocalLinkage())
- continue;
-
- auto Name = ExportGV.getName();
- GlobalValue *ImportGV = nullptr;
- if (!PromoteExtra.count(&ExportGV)) {
- ImportGV = ImportM.getNamedValue(Name);
- if (!ImportGV)
- continue;
- ImportGV->removeDeadConstantUsers();
- if (ImportGV->use_empty()) {
- ImportGV->eraseFromParent();
- continue;
- }
- }
-
- std::string NewName = (Name + ModuleId).str();
-
- if (const auto *C = ExportGV.getComdat())
- if (C->getName() == Name)
- RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
-
- ExportGV.setName(NewName);
- ExportGV.setLinkage(GlobalValue::ExternalLinkage);
- ExportGV.setVisibility(GlobalValue::HiddenVisibility);
-
- if (ImportGV) {
- ImportGV->setName(NewName);
- ImportGV->setVisibility(GlobalValue::HiddenVisibility);
- }
- }
-
- if (!RenamedComdats.empty())
- for (auto &GO : ExportM.global_objects())
- if (auto *C = GO.getComdat()) {
- auto Replacement = RenamedComdats.find(C);
- if (Replacement != RenamedComdats.end())
- GO.setComdat(Replacement->second);
- }
-}
-
-// Promote all internal (i.e. distinct) type ids used by the module by replacing
-// them with external type ids formed using the module id.
-//
-// Note that this needs to be done before we clone the module because each clone
-// will receive its own set of distinct metadata nodes.
-void promoteTypeIds(Module &M, StringRef ModuleId) {
- DenseMap<Metadata *, Metadata *> LocalToGlobal;
- auto ExternalizeTypeId = [&](CallInst *CI, unsigned ArgNo) {
- Metadata *MD =
- cast<MetadataAsValue>(CI->getArgOperand(ArgNo))->getMetadata();
-
- if (isa<MDNode>(MD) && cast<MDNode>(MD)->isDistinct()) {
- Metadata *&GlobalMD = LocalToGlobal[MD];
- if (!GlobalMD) {
- std::string NewName = (Twine(LocalToGlobal.size()) + ModuleId).str();
- GlobalMD = MDString::get(M.getContext(), NewName);
- }
-
- CI->setArgOperand(ArgNo,
- MetadataAsValue::get(M.getContext(), GlobalMD));
- }
- };
-
- if (Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
- for (const Use &U : TypeTestFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
- ExternalizeTypeId(CI, 1);
- }
- }
-
- if (Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
- for (const Use &U : TypeCheckedLoadFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
- ExternalizeTypeId(CI, 2);
- }
- }
-
- for (GlobalObject &GO : M.global_objects()) {
- SmallVector<MDNode *, 1> MDs;
- GO.getMetadata(LLVMContext::MD_type, MDs);
-
- GO.eraseMetadata(LLVMContext::MD_type);
- for (auto MD : MDs) {
- auto I = LocalToGlobal.find(MD->getOperand(1));
- if (I == LocalToGlobal.end()) {
- GO.addMetadata(LLVMContext::MD_type, *MD);
- continue;
- }
- GO.addMetadata(
- LLVMContext::MD_type,
- *MDNode::get(M.getContext(), {MD->getOperand(0), I->second}));
- }
- }
-}
-
-// Drop unused globals, and drop type information from function declarations.
-// FIXME: If we made functions typeless then there would be no need to do this.
-void simplifyExternals(Module &M) {
- FunctionType *EmptyFT =
- FunctionType::get(Type::getVoidTy(M.getContext()), false);
-
- for (auto I = M.begin(), E = M.end(); I != E;) {
- Function &F = *I++;
- if (F.isDeclaration() && F.use_empty()) {
- F.eraseFromParent();
- continue;
- }
-
- if (!F.isDeclaration() || F.getFunctionType() == EmptyFT ||
- // Changing the type of an intrinsic may invalidate the IR.
- F.getName().startswith("llvm."))
- continue;
-
- Function *NewF =
- Function::Create(EmptyFT, GlobalValue::ExternalLinkage,
- F.getAddressSpace(), "", &M);
- NewF->setVisibility(F.getVisibility());
- NewF->takeName(&F);
- F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
- F.eraseFromParent();
- }
-
- for (auto I = M.global_begin(), E = M.global_end(); I != E;) {
- GlobalVariable &GV = *I++;
- if (GV.isDeclaration() && GV.use_empty()) {
- GV.eraseFromParent();
- continue;
- }
- }
-}
-
-static void
-filterModule(Module *M,
- function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
- std::vector<GlobalValue *> V;
- for (GlobalValue &GV : M->global_values())
- if (!ShouldKeepDefinition(&GV))
- V.push_back(&GV);
-
- for (GlobalValue *GV : V)
- if (!convertToDeclaration(*GV))
- GV->eraseFromParent();
-}
-
-void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
- if (auto *F = dyn_cast<Function>(C))
- return Fn(F);
- if (isa<GlobalValue>(C))
- return;
- for (Value *Op : C->operands())
- forEachVirtualFunction(cast<Constant>(Op), Fn);
-}
-
-// If it's possible to split M into regular and thin LTO parts, do so and write
-// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
-// regular LTO bitcode file to OS.
-void splitAndWriteThinLTOBitcode(
- raw_ostream &OS, raw_ostream *ThinLinkOS,
- function_ref<AAResults &(Function &)> AARGetter, Module &M) {
- std::string ModuleId = getUniqueModuleId(&M);
- if (ModuleId.empty()) {
-    // We couldn't generate a module ID for this module, so write it out as a
- // regular LTO module with an index for summary-based dead stripping.
- ProfileSummaryInfo PSI(M);
- M.addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
- WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, &Index);
-
- if (ThinLinkOS)
- // We don't have a ThinLTO part, but still write the module to the
- // ThinLinkOS if requested so that the expected output file is produced.
- WriteBitcodeToFile(M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
- &Index);
-
- return;
- }
-
- promoteTypeIds(M, ModuleId);
-
- // Returns whether a global or its associated global has attached type
- // metadata. The former may participate in CFI or whole-program
-  // devirtualization, so it needs to appear in the merged module instead of
- // the thin LTO module. Similarly, globals that are associated with globals
- // with type metadata need to appear in the merged module because they will
- // reference the global's section directly.
- auto HasTypeMetadata = [](const GlobalObject *GO) {
- if (MDNode *MD = GO->getMetadata(LLVMContext::MD_associated))
- if (auto *AssocVM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(0)))
- if (auto *AssocGO = dyn_cast<GlobalObject>(AssocVM->getValue()))
- if (AssocGO->hasMetadata(LLVMContext::MD_type))
- return true;
- return GO->hasMetadata(LLVMContext::MD_type);
- };
-
- // Collect the set of virtual functions that are eligible for virtual constant
- // propagation. Each eligible function must not access memory, must return
- // an integer of width <=64 bits, must take at least one argument, must not
- // use its first argument (assumed to be "this") and all arguments other than
- // the first one must be of <=64 bit integer type.
- //
- // Note that we test whether this copy of the function is readnone, rather
- // than testing function attributes, which must hold for any copy of the
- // function, even a less optimized version substituted at link time. This is
- // sound because the virtual constant propagation optimizations effectively
- // inline all implementations of the virtual function into each call site,
- // rather than using function attributes to perform local optimization.
- DenseSet<const Function *> EligibleVirtualFns;
- // If any member of a comdat lives in MergedM, put all members of that
- // comdat in MergedM to keep the comdat together.
- DenseSet<const Comdat *> MergedMComdats;
- for (GlobalVariable &GV : M.globals())
- if (HasTypeMetadata(&GV)) {
- if (const auto *C = GV.getComdat())
- MergedMComdats.insert(C);
- forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
- auto *RT = dyn_cast<IntegerType>(F->getReturnType());
- if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
- !F->arg_begin()->use_empty())
- return;
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionImport.h"
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+using namespace llvm;
+
+namespace {
+
+// Promote each local-linkage entity defined by ExportM and used by ImportM by
+// changing visibility and appending the given ModuleId.
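+//
+// For example (names here are hypothetical): an internal global @foo defined
+// in ExportM and referenced from ImportM is renamed to @foo<ModuleId>, given
+// external linkage and hidden visibility, and the matching declaration in
+// ImportM is renamed and made hidden as well.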
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
+ SetVector<GlobalValue *> &PromoteExtra) {
+ DenseMap<const Comdat *, Comdat *> RenamedComdats;
+ for (auto &ExportGV : ExportM.global_values()) {
+ if (!ExportGV.hasLocalLinkage())
+ continue;
+
+ auto Name = ExportGV.getName();
+ GlobalValue *ImportGV = nullptr;
+ if (!PromoteExtra.count(&ExportGV)) {
+ ImportGV = ImportM.getNamedValue(Name);
+ if (!ImportGV)
+ continue;
+ ImportGV->removeDeadConstantUsers();
+ if (ImportGV->use_empty()) {
+ ImportGV->eraseFromParent();
+ continue;
+ }
+ }
+
+ std::string NewName = (Name + ModuleId).str();
+
+ if (const auto *C = ExportGV.getComdat())
+ if (C->getName() == Name)
+ RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
+
+ ExportGV.setName(NewName);
+ ExportGV.setLinkage(GlobalValue::ExternalLinkage);
+ ExportGV.setVisibility(GlobalValue::HiddenVisibility);
+
+ if (ImportGV) {
+ ImportGV->setName(NewName);
+ ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+ }
+
+ if (!RenamedComdats.empty())
+ for (auto &GO : ExportM.global_objects())
+ if (auto *C = GO.getComdat()) {
+ auto Replacement = RenamedComdats.find(C);
+ if (Replacement != RenamedComdats.end())
+ GO.setComdat(Replacement->second);
+ }
+}
+
+// Promote all internal (i.e. distinct) type ids used by the module by replacing
+// them with external type ids formed using the module id.
+//
+// Note that this needs to be done before we clone the module because each clone
+// will receive its own set of distinct metadata nodes.
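+//
+// For example (illustrative): a distinct MDNode type id referenced from an
+// llvm.type.test or llvm.type.checked.load call is rewritten to a string type
+// id of the form <counter><ModuleId>, so both halves of the split module end
+// up referring to the same, non-distinct id.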
+void promoteTypeIds(Module &M, StringRef ModuleId) {
+ DenseMap<Metadata *, Metadata *> LocalToGlobal;
+ auto ExternalizeTypeId = [&](CallInst *CI, unsigned ArgNo) {
+ Metadata *MD =
+ cast<MetadataAsValue>(CI->getArgOperand(ArgNo))->getMetadata();
+
+ if (isa<MDNode>(MD) && cast<MDNode>(MD)->isDistinct()) {
+ Metadata *&GlobalMD = LocalToGlobal[MD];
+ if (!GlobalMD) {
+ std::string NewName = (Twine(LocalToGlobal.size()) + ModuleId).str();
+ GlobalMD = MDString::get(M.getContext(), NewName);
+ }
+
+ CI->setArgOperand(ArgNo,
+ MetadataAsValue::get(M.getContext(), GlobalMD));
+ }
+ };
+
+ if (Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 1);
+ }
+ }
+
+ if (Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
+ for (const Use &U : TypeCheckedLoadFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 2);
+ }
+ }
+
+ for (GlobalObject &GO : M.global_objects()) {
+ SmallVector<MDNode *, 1> MDs;
+ GO.getMetadata(LLVMContext::MD_type, MDs);
+
+ GO.eraseMetadata(LLVMContext::MD_type);
+ for (auto MD : MDs) {
+ auto I = LocalToGlobal.find(MD->getOperand(1));
+ if (I == LocalToGlobal.end()) {
+ GO.addMetadata(LLVMContext::MD_type, *MD);
+ continue;
+ }
+ GO.addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(M.getContext(), {MD->getOperand(0), I->second}));
+ }
+ }
+}
+
+// Drop unused globals, and drop type information from function declarations.
+// FIXME: If we made functions typeless then there would be no need to do this.
+void simplifyExternals(Module &M) {
+ FunctionType *EmptyFT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), false);
+
+ for (auto I = M.begin(), E = M.end(); I != E;) {
+ Function &F = *I++;
+ if (F.isDeclaration() && F.use_empty()) {
+ F.eraseFromParent();
+ continue;
+ }
+
+ if (!F.isDeclaration() || F.getFunctionType() == EmptyFT ||
+ // Changing the type of an intrinsic may invalidate the IR.
+ F.getName().startswith("llvm."))
+ continue;
+
+ Function *NewF =
+ Function::Create(EmptyFT, GlobalValue::ExternalLinkage,
+ F.getAddressSpace(), "", &M);
+ NewF->setVisibility(F.getVisibility());
+ NewF->takeName(&F);
+ F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
+ F.eraseFromParent();
+ }
+
+ for (auto I = M.global_begin(), E = M.global_end(); I != E;) {
+ GlobalVariable &GV = *I++;
+ if (GV.isDeclaration() && GV.use_empty()) {
+ GV.eraseFromParent();
+ continue;
+ }
+ }
+}
+
+static void
+filterModule(Module *M,
+ function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
+ std::vector<GlobalValue *> V;
+ for (GlobalValue &GV : M->global_values())
+ if (!ShouldKeepDefinition(&GV))
+ V.push_back(&GV);
+
+ for (GlobalValue *GV : V)
+ if (!convertToDeclaration(*GV))
+ GV->eraseFromParent();
+}
+
+void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
+ if (auto *F = dyn_cast<Function>(C))
+ return Fn(F);
+ if (isa<GlobalValue>(C))
+ return;
+ for (Value *Op : C->operands())
+ forEachVirtualFunction(cast<Constant>(Op), Fn);
+}
+
+// If it's possible to split M into regular and thin LTO parts, do so and write
+// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
+// regular LTO bitcode file to OS.
+void splitAndWriteThinLTOBitcode(
+ raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter, Module &M) {
+ std::string ModuleId = getUniqueModuleId(&M);
+ if (ModuleId.empty()) {
+    // We couldn't generate a module ID for this module, so write it out as a
+ // regular LTO module with an index for summary-based dead stripping.
+ ProfileSummaryInfo PSI(M);
+ M.addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, &Index);
+
+ if (ThinLinkOS)
+ // We don't have a ThinLTO part, but still write the module to the
+ // ThinLinkOS if requested so that the expected output file is produced.
+ WriteBitcodeToFile(M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+ &Index);
+
+ return;
+ }
+
+ promoteTypeIds(M, ModuleId);
+
+ // Returns whether a global or its associated global has attached type
+ // metadata. The former may participate in CFI or whole-program
+  // devirtualization, so it needs to appear in the merged module instead of
+ // the thin LTO module. Similarly, globals that are associated with globals
+ // with type metadata need to appear in the merged module because they will
+ // reference the global's section directly.
+ auto HasTypeMetadata = [](const GlobalObject *GO) {
+ if (MDNode *MD = GO->getMetadata(LLVMContext::MD_associated))
+ if (auto *AssocVM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(0)))
+ if (auto *AssocGO = dyn_cast<GlobalObject>(AssocVM->getValue()))
+ if (AssocGO->hasMetadata(LLVMContext::MD_type))
+ return true;
+ return GO->hasMetadata(LLVMContext::MD_type);
+ };
+
+ // Collect the set of virtual functions that are eligible for virtual constant
+ // propagation. Each eligible function must not access memory, must return
+ // an integer of width <=64 bits, must take at least one argument, must not
+ // use its first argument (assumed to be "this") and all arguments other than
+ // the first one must be of <=64 bit integer type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
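+  //
+  // For example (illustrative): a readnone implementation such as
+  //   uint32_t Impl::getKind() const { return 3; }
+  // (which never touches its implicit 'this' argument) is eligible, whereas an
+  // implementation that returns a pointer, takes a 128-bit integer argument,
+  // or reads memory is not.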
+ DenseSet<const Function *> EligibleVirtualFns;
+ // If any member of a comdat lives in MergedM, put all members of that
+ // comdat in MergedM to keep the comdat together.
+ DenseSet<const Comdat *> MergedMComdats;
+ for (GlobalVariable &GV : M.globals())
+ if (HasTypeMetadata(&GV)) {
+ if (const auto *C = GV.getComdat())
+ MergedMComdats.insert(C);
+ forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
+ auto *RT = dyn_cast<IntegerType>(F->getReturnType());
+ if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
+ !F->arg_begin()->use_empty())
+ return;
for (auto &Arg : drop_begin(F->args())) {
- auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
- if (!ArgT || ArgT->getBitWidth() > 64)
- return;
- }
- if (!F->isDeclaration() &&
- computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
- EligibleVirtualFns.insert(F);
- });
- }
-
- ValueToValueMapTy VMap;
- std::unique_ptr<Module> MergedM(
- CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
- if (const auto *C = GV->getComdat())
- if (MergedMComdats.count(C))
- return true;
- if (auto *F = dyn_cast<Function>(GV))
- return EligibleVirtualFns.count(F);
- if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
- return HasTypeMetadata(GVar);
- return false;
- }));
- StripDebugInfo(*MergedM);
- MergedM->setModuleInlineAsm("");
-
- for (Function &F : *MergedM)
- if (!F.isDeclaration()) {
- // Reset the linkage of all functions eligible for virtual constant
- // propagation. The canonical definitions live in the thin LTO module so
- // that they can be imported.
- F.setLinkage(GlobalValue::AvailableExternallyLinkage);
- F.setComdat(nullptr);
- }
-
- SetVector<GlobalValue *> CfiFunctions;
- for (auto &F : M)
- if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
- CfiFunctions.insert(&F);
-
- // Remove all globals with type metadata, globals with comdats that live in
- // MergedM, and aliases pointing to such globals from the thin LTO module.
- filterModule(&M, [&](const GlobalValue *GV) {
- if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
- if (HasTypeMetadata(GVar))
- return false;
- if (const auto *C = GV->getComdat())
- if (MergedMComdats.count(C))
- return false;
- return true;
- });
-
- promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
- promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
-
- auto &Ctx = MergedM->getContext();
- SmallVector<MDNode *, 8> CfiFunctionMDs;
- for (auto V : CfiFunctions) {
- Function &F = *cast<Function>(V);
- SmallVector<MDNode *, 2> Types;
- F.getMetadata(LLVMContext::MD_type, Types);
-
- SmallVector<Metadata *, 4> Elts;
- Elts.push_back(MDString::get(Ctx, F.getName()));
- CfiFunctionLinkage Linkage;
- if (lowertypetests::isJumpTableCanonical(&F))
- Linkage = CFL_Definition;
- else if (F.hasExternalWeakLinkage())
- Linkage = CFL_WeakDeclaration;
- else
- Linkage = CFL_Declaration;
- Elts.push_back(ConstantAsMetadata::get(
- llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
+ auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
+ if (!ArgT || ArgT->getBitWidth() > 64)
+ return;
+ }
+ if (!F->isDeclaration() &&
+ computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
+ EligibleVirtualFns.insert(F);
+ });
+ }
+
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MergedM(
+ CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return true;
+ if (auto *F = dyn_cast<Function>(GV))
+ return EligibleVirtualFns.count(F);
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ return HasTypeMetadata(GVar);
+ return false;
+ }));
+ StripDebugInfo(*MergedM);
+ MergedM->setModuleInlineAsm("");
+
+ for (Function &F : *MergedM)
+ if (!F.isDeclaration()) {
+ // Reset the linkage of all functions eligible for virtual constant
+ // propagation. The canonical definitions live in the thin LTO module so
+ // that they can be imported.
+ F.setLinkage(GlobalValue::AvailableExternallyLinkage);
+ F.setComdat(nullptr);
+ }
+
+ SetVector<GlobalValue *> CfiFunctions;
+ for (auto &F : M)
+ if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
+ CfiFunctions.insert(&F);
+
+ // Remove all globals with type metadata, globals with comdats that live in
+ // MergedM, and aliases pointing to such globals from the thin LTO module.
+ filterModule(&M, [&](const GlobalValue *GV) {
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ if (HasTypeMetadata(GVar))
+ return false;
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return false;
+ return true;
+ });
+
+ promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
+ promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+
+ auto &Ctx = MergedM->getContext();
+ SmallVector<MDNode *, 8> CfiFunctionMDs;
+ for (auto V : CfiFunctions) {
+ Function &F = *cast<Function>(V);
+ SmallVector<MDNode *, 2> Types;
+ F.getMetadata(LLVMContext::MD_type, Types);
+
+ SmallVector<Metadata *, 4> Elts;
+ Elts.push_back(MDString::get(Ctx, F.getName()));
+ CfiFunctionLinkage Linkage;
+ if (lowertypetests::isJumpTableCanonical(&F))
+ Linkage = CFL_Definition;
+ else if (F.hasExternalWeakLinkage())
+ Linkage = CFL_WeakDeclaration;
+ else
+ Linkage = CFL_Declaration;
+ Elts.push_back(ConstantAsMetadata::get(
+ llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
append_range(Elts, Types);
- CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
- }
-
-  if (!CfiFunctionMDs.empty()) {
- NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
- for (auto MD : CfiFunctionMDs)
- NMD->addOperand(MD);
- }
-
- SmallVector<MDNode *, 8> FunctionAliases;
- for (auto &A : M.aliases()) {
- if (!isa<Function>(A.getAliasee()))
- continue;
-
- auto *F = cast<Function>(A.getAliasee());
-
- Metadata *Elts[] = {
- MDString::get(Ctx, A.getName()),
- MDString::get(Ctx, F->getName()),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())),
- };
-
- FunctionAliases.push_back(MDTuple::get(Ctx, Elts));
- }
-
- if (!FunctionAliases.empty()) {
- NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases");
- for (auto MD : FunctionAliases)
- NMD->addOperand(MD);
- }
-
- SmallVector<MDNode *, 8> Symvers;
- ModuleSymbolTable::CollectAsmSymvers(M, [&](StringRef Name, StringRef Alias) {
- Function *F = M.getFunction(Name);
- if (!F || F->use_empty())
- return;
-
- Symvers.push_back(MDTuple::get(
- Ctx, {MDString::get(Ctx, Name), MDString::get(Ctx, Alias)}));
- });
-
- if (!Symvers.empty()) {
- NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("symvers");
- for (auto MD : Symvers)
- NMD->addOperand(MD);
- }
-
- simplifyExternals(*MergedM);
-
- // FIXME: Try to re-use BSI and PFI from the original module here.
- ProfileSummaryInfo PSI(M);
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
-
- // Mark the merged module as requiring full LTO. We still want an index for
- // it though, so that it can participate in summary-based dead stripping.
- MergedM->addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
- ModuleSummaryIndex MergedMIndex =
- buildModuleSummaryIndex(*MergedM, nullptr, &PSI);
-
- SmallVector<char, 0> Buffer;
-
- BitcodeWriter W(Buffer);
- // Save the module hash produced for the full bitcode, which will
- // be used in the backends, and use that in the minimized bitcode
- // produced for the full link.
- ModuleHash ModHash = {{0}};
- W.writeModule(M, /*ShouldPreserveUseListOrder=*/false, &Index,
- /*GenerateHash=*/true, &ModHash);
- W.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false, &MergedMIndex);
- W.writeSymtab();
- W.writeStrtab();
- OS << Buffer;
-
- // If a minimized bitcode module was requested for the thin link, only
-  // the information that is needed by the thin link will be written to the
- // given OS (the merged module will be written as usual).
- if (ThinLinkOS) {
- Buffer.clear();
- BitcodeWriter W2(Buffer);
- StripDebugInfo(M);
- W2.writeThinLinkBitcode(M, Index, ModHash);
- W2.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false,
- &MergedMIndex);
- W2.writeSymtab();
- W2.writeStrtab();
- *ThinLinkOS << Buffer;
- }
-}
-
-// Check whether LTO unit splitting has been enabled.
-bool enableSplitLTOUnit(Module &M) {
- bool EnableSplitLTOUnit = false;
- if (auto *MD = mdconst::extract_or_null<ConstantInt>(
- M.getModuleFlag("EnableSplitLTOUnit")))
- EnableSplitLTOUnit = MD->getZExtValue();
- return EnableSplitLTOUnit;
-}
-
-// Returns whether this module needs to be split because it uses type metadata.
-bool hasTypeMetadata(Module &M) {
- for (auto &GO : M.global_objects()) {
- if (GO.hasMetadata(LLVMContext::MD_type))
- return true;
- }
- return false;
-}
-
-void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
- function_ref<AAResults &(Function &)> AARGetter,
- Module &M, const ModuleSummaryIndex *Index) {
- std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr;
- // See if this module has any type metadata. If so, we try to split it
- // or at least promote type ids to enable WPD.
- if (hasTypeMetadata(M)) {
- if (enableSplitLTOUnit(M))
- return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
- // Promote type ids as needed for index-based WPD.
- std::string ModuleId = getUniqueModuleId(&M);
- if (!ModuleId.empty()) {
- promoteTypeIds(M, ModuleId);
- // Need to rebuild the index so that it contains type metadata
- // for the newly promoted type ids.
- // FIXME: Probably should not bother building the index at all
- // in the caller of writeThinLTOBitcode (which does so via the
- // ModuleSummaryIndexAnalysis pass), since we have to rebuild it
- // anyway whenever there is type metadata (here or in
- // splitAndWriteThinLTOBitcode). Just always build it once via the
- // buildModuleSummaryIndex when Module(s) are ready.
- ProfileSummaryInfo PSI(M);
- NewIndex = std::make_unique<ModuleSummaryIndex>(
- buildModuleSummaryIndex(M, nullptr, &PSI));
- Index = NewIndex.get();
- }
- }
-
- // Write it out as an unsplit ThinLTO module.
-
- // Save the module hash produced for the full bitcode, which will
- // be used in the backends, and use that in the minimized bitcode
- // produced for the full link.
- ModuleHash ModHash = {{0}};
- WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
- /*GenerateHash=*/true, &ModHash);
- // If a minimized bitcode module was requested for the thin link, only
-  // the information that is needed by the thin link will be written to the
- // given OS.
- if (ThinLinkOS && Index)
- WriteThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash);
-}
-
-class WriteThinLTOBitcode : public ModulePass {
- raw_ostream &OS; // raw_ostream to print on
- // The output stream on which to emit a minimized module for use
- // just in the thin link, if requested.
- raw_ostream *ThinLinkOS;
-
-public:
- static char ID; // Pass identification, replacement for typeid
- WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
- initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
- }
-
- explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
- : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
- initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; }
-
- bool runOnModule(Module &M) override {
- const ModuleSummaryIndex *Index =
- &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
- writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
- return true;
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ModuleSummaryIndexWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-} // anonymous namespace
-
-char WriteThinLTOBitcode::ID = 0;
-INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
- "Write ThinLTO Bitcode", false, true)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
- "Write ThinLTO Bitcode", false, true)
-
-ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
- raw_ostream *ThinLinkOS) {
- return new WriteThinLTOBitcode(Str, ThinLinkOS);
-}
-
-PreservedAnalyses
-llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- writeThinLTOBitcode(OS, ThinLinkOS,
- [&FAM](Function &F) -> AAResults & {
- return FAM.getResult<AAManager>(F);
- },
- M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
- return PreservedAnalyses::all();
-}
+ CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
+ }
+
+  if (!CfiFunctionMDs.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
+ for (auto MD : CfiFunctionMDs)
+ NMD->addOperand(MD);
+ }
+
+ SmallVector<MDNode *, 8> FunctionAliases;
+ for (auto &A : M.aliases()) {
+ if (!isa<Function>(A.getAliasee()))
+ continue;
+
+ auto *F = cast<Function>(A.getAliasee());
+
+ Metadata *Elts[] = {
+ MDString::get(Ctx, A.getName()),
+ MDString::get(Ctx, F->getName()),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())),
+ };
+
+ FunctionAliases.push_back(MDTuple::get(Ctx, Elts));
+ }
+
+ if (!FunctionAliases.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases");
+ for (auto MD : FunctionAliases)
+ NMD->addOperand(MD);
+ }
+
+ SmallVector<MDNode *, 8> Symvers;
+ ModuleSymbolTable::CollectAsmSymvers(M, [&](StringRef Name, StringRef Alias) {
+ Function *F = M.getFunction(Name);
+ if (!F || F->use_empty())
+ return;
+
+ Symvers.push_back(MDTuple::get(
+ Ctx, {MDString::get(Ctx, Name), MDString::get(Ctx, Alias)}));
+ });
+
+ if (!Symvers.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("symvers");
+ for (auto MD : Symvers)
+ NMD->addOperand(MD);
+ }
+
+ simplifyExternals(*MergedM);
+
+ // FIXME: Try to re-use BSI and PFI from the original module here.
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
+
+ // Mark the merged module as requiring full LTO. We still want an index for
+ // it though, so that it can participate in summary-based dead stripping.
+ MergedM->addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex MergedMIndex =
+ buildModuleSummaryIndex(*MergedM, nullptr, &PSI);
+
+ SmallVector<char, 0> Buffer;
+
+ BitcodeWriter W(Buffer);
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ W.writeModule(M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/true, &ModHash);
+ W.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false, &MergedMIndex);
+ W.writeSymtab();
+ W.writeStrtab();
+ OS << Buffer;
+
+ // If a minimized bitcode module was requested for the thin link, only
+  // the information that is needed by the thin link will be written to the
+ // given OS (the merged module will be written as usual).
+ if (ThinLinkOS) {
+ Buffer.clear();
+ BitcodeWriter W2(Buffer);
+ StripDebugInfo(M);
+ W2.writeThinLinkBitcode(M, Index, ModHash);
+ W2.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false,
+ &MergedMIndex);
+ W2.writeSymtab();
+ W2.writeStrtab();
+ *ThinLinkOS << Buffer;
+ }
+}
+
+// Check whether LTO unit splitting has been enabled.
+bool enableSplitLTOUnit(Module &M) {
+ bool EnableSplitLTOUnit = false;
+ if (auto *MD = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("EnableSplitLTOUnit")))
+ EnableSplitLTOUnit = MD->getZExtValue();
+ return EnableSplitLTOUnit;
+}
+
+// Returns whether this module needs to be split because it uses type metadata.
+bool hasTypeMetadata(Module &M) {
+ for (auto &GO : M.global_objects()) {
+ if (GO.hasMetadata(LLVMContext::MD_type))
+ return true;
+ }
+ return false;
+}
+
+void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter,
+ Module &M, const ModuleSummaryIndex *Index) {
+ std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr;
+ // See if this module has any type metadata. If so, we try to split it
+ // or at least promote type ids to enable WPD.
+ if (hasTypeMetadata(M)) {
+ if (enableSplitLTOUnit(M))
+ return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
+ // Promote type ids as needed for index-based WPD.
+ std::string ModuleId = getUniqueModuleId(&M);
+ if (!ModuleId.empty()) {
+ promoteTypeIds(M, ModuleId);
+ // Need to rebuild the index so that it contains type metadata
+ // for the newly promoted type ids.
+ // FIXME: Probably should not bother building the index at all
+ // in the caller of writeThinLTOBitcode (which does so via the
+ // ModuleSummaryIndexAnalysis pass), since we have to rebuild it
+ // anyway whenever there is type metadata (here or in
+ // splitAndWriteThinLTOBitcode). Just always build it once via the
+ // buildModuleSummaryIndex when Module(s) are ready.
+ ProfileSummaryInfo PSI(M);
+ NewIndex = std::make_unique<ModuleSummaryIndex>(
+ buildModuleSummaryIndex(M, nullptr, &PSI));
+ Index = NewIndex.get();
+ }
+ }
+
+ // Write it out as an unsplit ThinLTO module.
+
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
+ /*GenerateHash=*/true, &ModHash);
+ // If a minimized bitcode module was requested for the thin link, only
+  // the information that is needed by the thin link will be written to the
+ // given OS.
+ if (ThinLinkOS && Index)
+ WriteThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash);
+}
+
+class WriteThinLTOBitcode : public ModulePass {
+ raw_ostream &OS; // raw_ostream to print on
+ // The output stream on which to emit a minimized module for use
+ // just in the thin link, if requested.
+ raw_ostream *ThinLinkOS;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
+ : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; }
+
+ bool runOnModule(Module &M) override {
+ const ModuleSummaryIndex *Index =
+ &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
+ writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
+ return true;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ModuleSummaryIndexWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+} // anonymous namespace
+
+char WriteThinLTOBitcode::ID = 0;
+INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+
+ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
+ raw_ostream *ThinLinkOS) {
+ return new WriteThinLTOBitcode(Str, ThinLinkOS);
+}
+
+PreservedAnalyses
+llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ writeThinLTOBitcode(OS, ThinLinkOS,
+ [&FAM](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ },
+ M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp
index d515fe9ed9..cf1ff405c4 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1,2216 +1,2216 @@
-//===- WholeProgramDevirt.cpp - Whole program virtual call optimization ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements whole program optimization of virtual calls in cases
-// where we know (via !type metadata) that the list of callees is fixed. This
-// includes the following:
-// - Single implementation devirtualization: if a virtual call has a single
-// possible callee, replace all calls with a direct call to that callee.
-// - Virtual constant propagation: if the virtual function's return type is an
-// integer <=64 bits and all possible callees are readnone, for each class and
-// each list of constant arguments: evaluate the function, store the return
-// value alongside the virtual table, and rewrite each virtual call as a load
-// from the virtual table.
-// - Uniform return value optimization: if the conditions for virtual constant
-// propagation hold and each function returns the same constant value, replace
-// each virtual call with that constant.
-// - Unique return value optimization for i1 return values: if the conditions
-// for virtual constant propagation hold and a single vtable's function
-// returns 0, or a single vtable's function returns 1, replace each virtual
-// call with a comparison of the vptr against that vtable's address.
-//
-// This pass is intended to be used during the regular and thin LTO pipelines:
-//
-// During regular LTO, the pass determines the best optimization for each
-// virtual call and applies the resolutions directly to virtual calls that are
-// eligible for virtual call optimization (i.e. calls that use either of the
-// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics).
-//
-// During hybrid Regular/ThinLTO, the pass operates in two phases:
-// - Export phase: this is run during the thin link over a single merged module
-// that contains all vtables with !type metadata that participate in the link.
-// The pass computes a resolution for each virtual call and stores it in the
-// type identifier summary.
-// - Import phase: this is run during the thin backends over the individual
-// modules. The pass applies the resolutions previously computed during the
-//   export phase to each eligible virtual call.
-//
-// During ThinLTO, the pass operates in two phases:
-// - Export phase: this is run during the thin link over the index which
-// contains a summary of all vtables with !type metadata that participate in
-// the link. It computes a resolution for each virtual call and stores it in
-// the type identifier summary. Only single implementation devirtualization
-// is supported.
-// - Import phase: (same as with hybrid case above).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/iterator_range.h"
+//===- WholeProgramDevirt.cpp - Whole program virtual call optimization ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements whole program optimization of virtual calls in cases
+// where we know (via !type metadata) that the list of callees is fixed. This
+// includes the following:
+// - Single implementation devirtualization: if a virtual call has a single
+// possible callee, replace all calls with a direct call to that callee.
+// - Virtual constant propagation: if the virtual function's return type is an
+// integer <=64 bits and all possible callees are readnone, for each class and
+// each list of constant arguments: evaluate the function, store the return
+// value alongside the virtual table, and rewrite each virtual call as a load
+// from the virtual table.
+// - Uniform return value optimization: if the conditions for virtual constant
+// propagation hold and each function returns the same constant value, replace
+// each virtual call with that constant.
+// - Unique return value optimization for i1 return values: if the conditions
+// for virtual constant propagation hold and a single vtable's function
+// returns 0, or a single vtable's function returns 1, replace each virtual
+// call with a comparison of the vptr against that vtable's address.
+//
+// This pass is intended to be used during the regular and thin LTO pipelines:
+//
+// During regular LTO, the pass determines the best optimization for each
+// virtual call and applies the resolutions directly to virtual calls that are
+// eligible for virtual call optimization (i.e. calls that use either of the
+// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics).
+//
+// During hybrid Regular/ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over a single merged module
+// that contains all vtables with !type metadata that participate in the link.
+// The pass computes a resolution for each virtual call and stores it in the
+// type identifier summary.
+// - Import phase: this is run during the thin backends over the individual
+// modules. The pass applies the resolutions previously computed during the
+//   export phase to each eligible virtual call.
+//
+// During ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over the index which
+// contains a summary of all vtables with !type metadata that participate in
+// the link. It computes a resolution for each virtual call and stores it in
+// the type identifier summary. Only single implementation devirtualization
+// is supported.
+// - Import phase: (same as with hybrid case above).
+//
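+// For example (an illustrative sketch; the class and values are hypothetical):
+//
+//   struct A { virtual int f() { return 42; } };
+//   int g(A *p) { return p->f(); }
+//
+// If A::f is the only possible callee of the call in g, single implementation
+// devirtualization turns p->f() into a direct call to A::f; and because A::f
+// is readnone and every possible callee returns the same constant, the uniform
+// return value optimization can replace the call with the constant 42.
+//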
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ModuleSummaryIndexYAML.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/GlobPattern.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/Utils/Evaluator.h"
-#include <algorithm>
-#include <cstddef>
-#include <map>
-#include <set>
-#include <string>
-
-using namespace llvm;
-using namespace wholeprogramdevirt;
-
-#define DEBUG_TYPE "wholeprogramdevirt"
-
-static cl::opt<PassSummaryAction> ClSummaryAction(
- "wholeprogramdevirt-summary-action",
- cl::desc("What to do with the summary when running this pass"),
- cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
- clEnumValN(PassSummaryAction::Import, "import",
- "Import typeid resolutions from summary and globals"),
- clEnumValN(PassSummaryAction::Export, "export",
- "Export typeid resolutions to summary and globals")),
- cl::Hidden);
-
-static cl::opt<std::string> ClReadSummary(
- "wholeprogramdevirt-read-summary",
- cl::desc(
- "Read summary from given bitcode or YAML file before running pass"),
- cl::Hidden);
-
-static cl::opt<std::string> ClWriteSummary(
- "wholeprogramdevirt-write-summary",
- cl::desc("Write summary to given bitcode or YAML file after running pass. "
- "Output file format is deduced from extension: *.bc means writing "
- "bitcode, otherwise YAML"),
- cl::Hidden);
-
-static cl::opt<unsigned>
- ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
- cl::init(10), cl::ZeroOrMore,
- cl::desc("Maximum number of call targets per "
- "call site to enable branch funnels"));
-
-static cl::opt<bool>
- PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden,
- cl::init(false), cl::ZeroOrMore,
- cl::desc("Print index-based devirtualization messages"));
-
-/// Provide a way to force enable whole program visibility in tests.
-/// This is needed to support legacy tests that don't contain
-/// !vcall_visibility metadata (the mere presence of type tests
-/// previously implied hidden visibility).
-cl::opt<bool>
- WholeProgramVisibility("whole-program-visibility", cl::init(false),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("Enable whole program visibility"));
-
-/// Provide a way to force disable whole program visibility for debugging or
-/// workarounds when it has been enabled via the linker.
-cl::opt<bool> DisableWholeProgramVisibility(
- "disable-whole-program-visibility", cl::init(false), cl::Hidden,
- cl::ZeroOrMore,
- cl::desc("Disable whole program visibility (overrides enabling options)"));
-
-/// Provide a way to prevent certain functions from being devirtualized.
-cl::list<std::string>
- SkipFunctionNames("wholeprogramdevirt-skip",
- cl::desc("Prevent function(s) from being devirtualized"),
- cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated);
-
-namespace {
-struct PatternList {
- std::vector<GlobPattern> Patterns;
- template <class T> void init(const T &StringList) {
- for (const auto &S : StringList)
- if (Expected<GlobPattern> Pat = GlobPattern::create(S))
- Patterns.push_back(std::move(*Pat));
- }
- bool match(StringRef S) {
- for (const GlobPattern &P : Patterns)
- if (P.match(S))
- return true;
- return false;
- }
-};
-} // namespace
-
-// Find the minimum offset that we may store a value of size Size bits at. If
-// IsAfter is set, look for an offset after the object, otherwise look for an
-// offset before the object.
-uint64_t
-wholeprogramdevirt::findLowestOffset(ArrayRef<VirtualCallTarget> Targets,
- bool IsAfter, uint64_t Size) {
- // Find a minimum offset taking into account only vtable sizes.
- uint64_t MinByte = 0;
- for (const VirtualCallTarget &Target : Targets) {
- if (IsAfter)
- MinByte = std::max(MinByte, Target.minAfterBytes());
- else
- MinByte = std::max(MinByte, Target.minBeforeBytes());
- }
-
- // Build a vector of arrays of bytes covering, for each target, a slice of the
- // used region (see AccumBitVector::BytesUsed in
- // llvm/Transforms/IPO/WholeProgramDevirt.h) starting at MinByte. Effectively,
- // this aligns the used regions to start at MinByte.
- //
- // In this example, A, B and C are vtables, # is a byte already allocated for
- // a virtual function pointer, AAAA... (etc.) are the used regions for the
- // vtables and Offset(X) is the value computed for the Offset variable below
- // for X.
- //
- // Offset(A)
- // | |
- // |MinByte
- // A: ################AAAAAAAA|AAAAAAAA
- // B: ########BBBBBBBBBBBBBBBB|BBBB
- // C: ########################|CCCCCCCCCCCCCCCC
- // | Offset(B) |
- //
- // This code produces the slices of A, B and C that appear after the divider
- // at MinByte.
- std::vector<ArrayRef<uint8_t>> Used;
- for (const VirtualCallTarget &Target : Targets) {
- ArrayRef<uint8_t> VTUsed = IsAfter ? Target.TM->Bits->After.BytesUsed
- : Target.TM->Bits->Before.BytesUsed;
- uint64_t Offset = IsAfter ? MinByte - Target.minAfterBytes()
- : MinByte - Target.minBeforeBytes();
-
- // Disregard used regions that are smaller than Offset. These are
- // effectively all-free regions that do not need to be checked.
- if (VTUsed.size() > Offset)
- Used.push_back(VTUsed.slice(Offset));
- }
-
- if (Size == 1) {
- // Find a free bit in each member of Used.
- for (unsigned I = 0;; ++I) {
- uint8_t BitsUsed = 0;
- for (auto &&B : Used)
- if (I < B.size())
- BitsUsed |= B[I];
- if (BitsUsed != 0xff)
- return (MinByte + I) * 8 +
- countTrailingZeros(uint8_t(~BitsUsed), ZB_Undefined);
- }
- } else {
- // Find a free (Size/8) byte region in each member of Used.
- // FIXME: see if alignment helps.
- for (unsigned I = 0;; ++I) {
- for (auto &&B : Used) {
- unsigned Byte = 0;
- while ((I + Byte) < B.size() && Byte < (Size / 8)) {
- if (B[I + Byte])
- goto NextI;
- ++Byte;
- }
- }
- return (MinByte + I) * 8;
- NextI:;
- }
- }
-}
-
-void wholeprogramdevirt::setBeforeReturnValues(
- MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocBefore,
- unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
- if (BitWidth == 1)
- OffsetByte = -(AllocBefore / 8 + 1);
- else
- OffsetByte = -((AllocBefore + 7) / 8 + (BitWidth + 7) / 8);
- OffsetBit = AllocBefore % 8;
-
- for (VirtualCallTarget &Target : Targets) {
- if (BitWidth == 1)
- Target.setBeforeBit(AllocBefore);
- else
- Target.setBeforeBytes(AllocBefore, (BitWidth + 7) / 8);
- }
-}
-
-void wholeprogramdevirt::setAfterReturnValues(
- MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocAfter,
- unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
- if (BitWidth == 1)
- OffsetByte = AllocAfter / 8;
- else
- OffsetByte = (AllocAfter + 7) / 8;
- OffsetBit = AllocAfter % 8;
-
- for (VirtualCallTarget &Target : Targets) {
- if (BitWidth == 1)
- Target.setAfterBit(AllocAfter);
- else
- Target.setAfterBytes(AllocAfter, (BitWidth + 7) / 8);
- }
-}
-
-VirtualCallTarget::VirtualCallTarget(Function *Fn, const TypeMemberInfo *TM)
- : Fn(Fn), TM(TM),
- IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()), WasDevirt(false) {}
-
-namespace {
-
-// A slot in a set of virtual tables. The TypeID identifies the set of virtual
-// tables, and the ByteOffset is the offset in bytes from the address point to
-// the virtual function pointer.
-struct VTableSlot {
- Metadata *TypeID;
- uint64_t ByteOffset;
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct DenseMapInfo<VTableSlot> {
- static VTableSlot getEmptyKey() {
- return {DenseMapInfo<Metadata *>::getEmptyKey(),
- DenseMapInfo<uint64_t>::getEmptyKey()};
- }
- static VTableSlot getTombstoneKey() {
- return {DenseMapInfo<Metadata *>::getTombstoneKey(),
- DenseMapInfo<uint64_t>::getTombstoneKey()};
- }
- static unsigned getHashValue(const VTableSlot &I) {
- return DenseMapInfo<Metadata *>::getHashValue(I.TypeID) ^
- DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
- }
- static bool isEqual(const VTableSlot &LHS,
- const VTableSlot &RHS) {
- return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
- }
-};
-
-template <> struct DenseMapInfo<VTableSlotSummary> {
- static VTableSlotSummary getEmptyKey() {
- return {DenseMapInfo<StringRef>::getEmptyKey(),
- DenseMapInfo<uint64_t>::getEmptyKey()};
- }
- static VTableSlotSummary getTombstoneKey() {
- return {DenseMapInfo<StringRef>::getTombstoneKey(),
- DenseMapInfo<uint64_t>::getTombstoneKey()};
- }
- static unsigned getHashValue(const VTableSlotSummary &I) {
- return DenseMapInfo<StringRef>::getHashValue(I.TypeID) ^
- DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
- }
- static bool isEqual(const VTableSlotSummary &LHS,
- const VTableSlotSummary &RHS) {
- return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
- }
-};
-
-} // end namespace llvm
-
-namespace {
-
-// A virtual call site. VTable is the loaded virtual table pointer, and CS is
-// the indirect virtual call.
-struct VirtualCallSite {
- Value *VTable = nullptr;
- CallBase &CB;
-
- // If non-null, this field points to the associated unsafe use count stored in
- // the DevirtModule::NumUnsafeUsesForTypeTest map below. See the description
- // of that field for details.
- unsigned *NumUnsafeUses = nullptr;
-
- void
- emitRemark(const StringRef OptName, const StringRef TargetName,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
- Function *F = CB.getCaller();
- DebugLoc DLoc = CB.getDebugLoc();
- BasicBlock *Block = CB.getParent();
-
- using namespace ore;
- OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
- << NV("Optimization", OptName)
- << ": devirtualized a call to "
- << NV("FunctionName", TargetName));
- }
-
- void replaceAndErase(
- const StringRef OptName, const StringRef TargetName, bool RemarksEnabled,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- Value *New) {
- if (RemarksEnabled)
- emitRemark(OptName, TargetName, OREGetter);
- CB.replaceAllUsesWith(New);
- if (auto *II = dyn_cast<InvokeInst>(&CB)) {
- BranchInst::Create(II->getNormalDest(), &CB);
- II->getUnwindDest()->removePredecessor(II->getParent());
- }
- CB.eraseFromParent();
- // This use is no longer unsafe.
- if (NumUnsafeUses)
- --*NumUnsafeUses;
- }
-};
-
-// Call site information collected for a specific VTableSlot and possibly a list
-// of constant integer arguments. The grouping by arguments is handled by the
-// VTableSlotInfo class.
-struct CallSiteInfo {
- /// The set of call sites for this slot. Used during regular LTO and the
- /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
- /// call sites that appear in the merged module itself); in each of these
- /// cases we are directly operating on the call sites at the IR level.
- std::vector<VirtualCallSite> CallSites;
-
- /// Whether all call sites represented by this CallSiteInfo, including those
- /// in summaries, have been devirtualized. This starts off as true because a
- /// default constructed CallSiteInfo represents no call sites.
- bool AllCallSitesDevirted = true;
-
- // These fields are used during the export phase of ThinLTO and reflect
- // information collected from function summaries.
-
- /// Whether any function summary contains an llvm.assume(llvm.type.test) for
- /// this slot.
- bool SummaryHasTypeTestAssumeUsers = false;
-
- /// CFI-specific: a vector containing the list of function summaries that use
- /// the llvm.type.checked.load intrinsic and therefore will require
- /// resolutions for llvm.type.test in order to implement CFI checks if
- /// devirtualization was unsuccessful. If devirtualization was successful, the
- /// pass will clear this vector by calling markDevirt(). If at the end of the
- /// pass the vector is non-empty, we will need to add a use of llvm.type.test
- /// to each of the function summaries in the vector.
- std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
- std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers;
-
- bool isExported() const {
- return SummaryHasTypeTestAssumeUsers ||
- !SummaryTypeCheckedLoadUsers.empty();
- }
-
- void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
- SummaryTypeCheckedLoadUsers.push_back(FS);
- AllCallSitesDevirted = false;
- }
-
- void addSummaryTypeTestAssumeUser(FunctionSummary *FS) {
- SummaryTypeTestAssumeUsers.push_back(FS);
- SummaryHasTypeTestAssumeUsers = true;
- AllCallSitesDevirted = false;
- }
-
- void markDevirt() {
- AllCallSitesDevirted = true;
-
- // As explained in the comment for SummaryTypeCheckedLoadUsers.
- SummaryTypeCheckedLoadUsers.clear();
- }
-};
-
-// Call site information collected for a specific VTableSlot.
-struct VTableSlotInfo {
- // The set of call sites which do not have all constant integer arguments
- // (excluding "this").
- CallSiteInfo CSInfo;
-
- // The set of call sites with all constant integer arguments (excluding
- // "this"), grouped by argument list.
- std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
-
- void addCallSite(Value *VTable, CallBase &CB, unsigned *NumUnsafeUses);
-
-private:
- CallSiteInfo &findCallSiteInfo(CallBase &CB);
-};
-
-CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallBase &CB) {
- std::vector<uint64_t> Args;
- auto *CBType = dyn_cast<IntegerType>(CB.getType());
- if (!CBType || CBType->getBitWidth() > 64 || CB.arg_empty())
- return CSInfo;
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/GlobPattern.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <set>
+#include <string>
+
+using namespace llvm;
+using namespace wholeprogramdevirt;
+
+#define DEBUG_TYPE "wholeprogramdevirt"
+
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "wholeprogramdevirt-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "wholeprogramdevirt-read-summary",
+ cl::desc(
+ "Read summary from given bitcode or YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "wholeprogramdevirt-write-summary",
+ cl::desc("Write summary to given bitcode or YAML file after running pass. "
+ "Output file format is deduced from extension: *.bc means writing "
+ "bitcode, otherwise YAML"),
+ cl::Hidden);
+
+static cl::opt<unsigned>
+ ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
+ cl::init(10), cl::ZeroOrMore,
+ cl::desc("Maximum number of call targets per "
+ "call site to enable branch funnels"));
+
+static cl::opt<bool>
+ PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden,
+ cl::init(false), cl::ZeroOrMore,
+ cl::desc("Print index-based devirtualization messages"));
+
+/// Provide a way to force enable whole program visibility in tests.
+/// This is needed to support legacy tests that don't contain
+/// !vcall_visibility metadata (the mere presence of type tests
+/// previously implied hidden visibility).
+cl::opt<bool>
+ WholeProgramVisibility("whole-program-visibility", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Enable whole program visibility"));
+
+/// Provide a way to force disable whole program visibility for debugging or
+/// workarounds when it has been enabled via the linker.
+cl::opt<bool> DisableWholeProgramVisibility(
+ "disable-whole-program-visibility", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("Disable whole program visibility (overrides enabling options)"));
+
+/// Provide a way to prevent certain functions from being devirtualized.
+cl::list<std::string>
+ SkipFunctionNames("wholeprogramdevirt-skip",
+ cl::desc("Prevent function(s) from being devirtualized"),
+ cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated);
+
+namespace {
+struct PatternList {
+ std::vector<GlobPattern> Patterns;
+ template <class T> void init(const T &StringList) {
+ for (const auto &S : StringList)
+ if (Expected<GlobPattern> Pat = GlobPattern::create(S))
+ Patterns.push_back(std::move(*Pat));
+ }
+ bool match(StringRef S) {
+ for (const GlobPattern &P : Patterns)
+ if (P.match(S))
+ return true;
+ return false;
+ }
+};
+} // namespace
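+
+// Illustrative sketch (not part of the pass) of how the
+// -wholeprogramdevirt-skip globs above are consumed: DevirtModule and
+// DevirtIndex call FunctionsToSkip.init(SkipFunctionNames) in their
+// constructors and FunctionsToSkip.match(Fn->getName()) when filtering call
+// targets. The names in the snippet below are hypothetical.
+#if 0
+  PatternList Skip;
+  Skip.init(std::vector<std::string>{"_ZN3Foo*", "*bar*"}); // glob patterns
+  bool Skipped = Skip.match("_ZN3Foo3runEv");               // true
+#endif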
+
+// Find the minimum offset that we may store a value of size Size bits at. If
+// IsAfter is set, look for an offset after the object, otherwise look for an
+// offset before the object.
+uint64_t
+wholeprogramdevirt::findLowestOffset(ArrayRef<VirtualCallTarget> Targets,
+ bool IsAfter, uint64_t Size) {
+ // Find a minimum offset taking into account only vtable sizes.
+ uint64_t MinByte = 0;
+ for (const VirtualCallTarget &Target : Targets) {
+ if (IsAfter)
+ MinByte = std::max(MinByte, Target.minAfterBytes());
+ else
+ MinByte = std::max(MinByte, Target.minBeforeBytes());
+ }
+
+ // Build a vector of arrays of bytes covering, for each target, a slice of the
+ // used region (see AccumBitVector::BytesUsed in
+ // llvm/Transforms/IPO/WholeProgramDevirt.h) starting at MinByte. Effectively,
+ // this aligns the used regions to start at MinByte.
+ //
+ // In this example, A, B and C are vtables, # is a byte already allocated for
+ // a virtual function pointer, AAAA... (etc.) are the used regions for the
+ // vtables and Offset(X) is the value computed for the Offset variable below
+ // for X.
+ //
+ // Offset(A)
+ // | |
+ // |MinByte
+ // A: ################AAAAAAAA|AAAAAAAA
+ // B: ########BBBBBBBBBBBBBBBB|BBBB
+ // C: ########################|CCCCCCCCCCCCCCCC
+ // | Offset(B) |
+ //
+ // This code produces the slices of A, B and C that appear after the divider
+ // at MinByte.
+ std::vector<ArrayRef<uint8_t>> Used;
+ for (const VirtualCallTarget &Target : Targets) {
+ ArrayRef<uint8_t> VTUsed = IsAfter ? Target.TM->Bits->After.BytesUsed
+ : Target.TM->Bits->Before.BytesUsed;
+ uint64_t Offset = IsAfter ? MinByte - Target.minAfterBytes()
+ : MinByte - Target.minBeforeBytes();
+
+ // Disregard used regions that are smaller than Offset. These are
+ // effectively all-free regions that do not need to be checked.
+ if (VTUsed.size() > Offset)
+ Used.push_back(VTUsed.slice(Offset));
+ }
+
+ if (Size == 1) {
+ // Find a free bit in each member of Used.
+ for (unsigned I = 0;; ++I) {
+ uint8_t BitsUsed = 0;
+ for (auto &&B : Used)
+ if (I < B.size())
+ BitsUsed |= B[I];
+ if (BitsUsed != 0xff)
+ return (MinByte + I) * 8 +
+ countTrailingZeros(uint8_t(~BitsUsed), ZB_Undefined);
+ }
+ } else {
+ // Find a free (Size/8) byte region in each member of Used.
+ // FIXME: see if alignment helps.
+ for (unsigned I = 0;; ++I) {
+ for (auto &&B : Used) {
+ unsigned Byte = 0;
+ while ((I + Byte) < B.size() && Byte < (Size / 8)) {
+ if (B[I + Byte])
+ goto NextI;
+ ++Byte;
+ }
+ }
+ return (MinByte + I) * 8;
+ NextI:;
+ }
+ }
+}
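+
+// Worked example (illustrative only): with IsAfter set, Size == 1, and two
+// targets whose sliced "after" regions are {0xff, 0x0f} and {0xff}, byte 0 is
+// fully used in both, while byte 1 has only its low four bits used (by the
+// first target), so the function returns (MinByte + 1) * 8 + 4, the lowest
+// bit offset that is free in every vtable's accumulated bit vector.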
+
+void wholeprogramdevirt::setBeforeReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocBefore,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = -(AllocBefore / 8 + 1);
+ else
+ OffsetByte = -((AllocBefore + 7) / 8 + (BitWidth + 7) / 8);
+ OffsetBit = AllocBefore % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setBeforeBit(AllocBefore);
+ else
+ Target.setBeforeBytes(AllocBefore, (BitWidth + 7) / 8);
+ }
+}
+
+void wholeprogramdevirt::setAfterReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocAfter,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = AllocAfter / 8;
+ else
+ OffsetByte = (AllocAfter + 7) / 8;
+ OffsetBit = AllocAfter % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setAfterBit(AllocAfter);
+ else
+ Target.setAfterBytes(AllocAfter, (BitWidth + 7) / 8);
+ }
+}
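+
+// Worked example (illustrative only) of the byte/bit split above: for
+// AllocAfter == 12 and BitWidth == 1, setAfterReturnValues yields
+// OffsetByte == 12 / 8 == 1 and OffsetBit == 12 % 8 == 4, i.e. bit 4 of the
+// second byte after the end of the vtable. For BitWidth == 32 it instead
+// reserves whole bytes, with OffsetByte == (12 + 7) / 8 == 2.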
+
+VirtualCallTarget::VirtualCallTarget(Function *Fn, const TypeMemberInfo *TM)
+ : Fn(Fn), TM(TM),
+ IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()), WasDevirt(false) {}
+
+namespace {
+
+// A slot in a set of virtual tables. The TypeID identifies the set of virtual
+// tables, and the ByteOffset is the offset in bytes from the address point to
+// the virtual function pointer.
+struct VTableSlot {
+ Metadata *TypeID;
+ uint64_t ByteOffset;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<VTableSlot> {
+ static VTableSlot getEmptyKey() {
+ return {DenseMapInfo<Metadata *>::getEmptyKey(),
+ DenseMapInfo<uint64_t>::getEmptyKey()};
+ }
+ static VTableSlot getTombstoneKey() {
+ return {DenseMapInfo<Metadata *>::getTombstoneKey(),
+ DenseMapInfo<uint64_t>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const VTableSlot &I) {
+ return DenseMapInfo<Metadata *>::getHashValue(I.TypeID) ^
+ DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
+ }
+ static bool isEqual(const VTableSlot &LHS,
+ const VTableSlot &RHS) {
+ return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
+ }
+};
+
+template <> struct DenseMapInfo<VTableSlotSummary> {
+ static VTableSlotSummary getEmptyKey() {
+ return {DenseMapInfo<StringRef>::getEmptyKey(),
+ DenseMapInfo<uint64_t>::getEmptyKey()};
+ }
+ static VTableSlotSummary getTombstoneKey() {
+ return {DenseMapInfo<StringRef>::getTombstoneKey(),
+ DenseMapInfo<uint64_t>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const VTableSlotSummary &I) {
+ return DenseMapInfo<StringRef>::getHashValue(I.TypeID) ^
+ DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
+ }
+ static bool isEqual(const VTableSlotSummary &LHS,
+ const VTableSlotSummary &RHS) {
+ return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
+ }
+};
+
+} // end namespace llvm
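+
+// Illustrative sketch (hypothetical, not part of the pass): the DenseMapInfo
+// specializations above are what let VTableSlot and VTableSlotSummary act as
+// DenseMap/MapVector keys, e.g. for the CallSlots maps declared further down.
+#if 0
+  DenseMap<VTableSlot, unsigned> SlotIds;
+  Metadata *TypeId = nullptr;              // stands in for a type-id MDString
+  SlotIds[{TypeId, /*ByteOffset=*/0}] = 0; // keyed by (type id, byte offset)
+#endif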
+
+namespace {
+
+// A virtual call site. VTable is the loaded virtual table pointer, and CS is
+// the indirect virtual call.
+struct VirtualCallSite {
+ Value *VTable = nullptr;
+ CallBase &CB;
+
+ // If non-null, this field points to the associated unsafe use count stored in
+ // the DevirtModule::NumUnsafeUsesForTypeTest map below. See the description
+ // of that field for details.
+ unsigned *NumUnsafeUses = nullptr;
+
+ void
+ emitRemark(const StringRef OptName, const StringRef TargetName,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
+ Function *F = CB.getCaller();
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *Block = CB.getParent();
+
+ using namespace ore;
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
+ << NV("Optimization", OptName)
+ << ": devirtualized a call to "
+ << NV("FunctionName", TargetName));
+ }
+
+ void replaceAndErase(
+ const StringRef OptName, const StringRef TargetName, bool RemarksEnabled,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ Value *New) {
+ if (RemarksEnabled)
+ emitRemark(OptName, TargetName, OREGetter);
+ CB.replaceAllUsesWith(New);
+ if (auto *II = dyn_cast<InvokeInst>(&CB)) {
+ BranchInst::Create(II->getNormalDest(), &CB);
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ }
+ CB.eraseFromParent();
+ // This use is no longer unsafe.
+ if (NumUnsafeUses)
+ --*NumUnsafeUses;
+ }
+};
+
+// Call site information collected for a specific VTableSlot and possibly a list
+// of constant integer arguments. The grouping by arguments is handled by the
+// VTableSlotInfo class.
+struct CallSiteInfo {
+ /// The set of call sites for this slot. Used during regular LTO and the
+ /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
+ /// call sites that appear in the merged module itself); in each of these
+ /// cases we are directly operating on the call sites at the IR level.
+ std::vector<VirtualCallSite> CallSites;
+
+ /// Whether all call sites represented by this CallSiteInfo, including those
+ /// in summaries, have been devirtualized. This starts off as true because a
+ /// default constructed CallSiteInfo represents no call sites.
+ bool AllCallSitesDevirted = true;
+
+ // These fields are used during the export phase of ThinLTO and reflect
+ // information collected from function summaries.
+
+ /// Whether any function summary contains an llvm.assume(llvm.type.test) for
+ /// this slot.
+ bool SummaryHasTypeTestAssumeUsers = false;
+
+ /// CFI-specific: a vector containing the list of function summaries that use
+ /// the llvm.type.checked.load intrinsic and therefore will require
+ /// resolutions for llvm.type.test in order to implement CFI checks if
+ /// devirtualization was unsuccessful. If devirtualization was successful, the
+ /// pass will clear this vector by calling markDevirt(). If at the end of the
+ /// pass the vector is non-empty, we will need to add a use of llvm.type.test
+ /// to each of the function summaries in the vector.
+ std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+ std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers;
+
+ bool isExported() const {
+ return SummaryHasTypeTestAssumeUsers ||
+ !SummaryTypeCheckedLoadUsers.empty();
+ }
+
+ void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
+ SummaryTypeCheckedLoadUsers.push_back(FS);
+ AllCallSitesDevirted = false;
+ }
+
+ void addSummaryTypeTestAssumeUser(FunctionSummary *FS) {
+ SummaryTypeTestAssumeUsers.push_back(FS);
+ SummaryHasTypeTestAssumeUsers = true;
+ AllCallSitesDevirted = false;
+ }
+
+ void markDevirt() {
+ AllCallSitesDevirted = true;
+
+ // As explained in the comment for SummaryTypeCheckedLoadUsers.
+ SummaryTypeCheckedLoadUsers.clear();
+ }
+};
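+
+// Illustrative sketch (hypothetical driver, not part of the pass) of the
+// bookkeeping above: recording a summary user marks the slot as not fully
+// devirtualized and, for checked loads, queues the summary so an
+// llvm.type.test use can be added back if needed; markDevirt() resets both.
+// FS below stands for some FunctionSummary pointer.
+#if 0
+  CallSiteInfo CSI;                      // AllCallSitesDevirted starts true
+  CSI.addSummaryTypeCheckedLoadUser(FS); // now false, FS queued
+  CSI.markDevirt();                      // devirtualized, queue cleared
+#endif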
+
+// Call site information collected for a specific VTableSlot.
+struct VTableSlotInfo {
+ // The set of call sites which do not have all constant integer arguments
+ // (excluding "this").
+ CallSiteInfo CSInfo;
+
+ // The set of call sites with all constant integer arguments (excluding
+ // "this"), grouped by argument list.
+ std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
+
+ void addCallSite(Value *VTable, CallBase &CB, unsigned *NumUnsafeUses);
+
+private:
+ CallSiteInfo &findCallSiteInfo(CallBase &CB);
+};
+
+CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallBase &CB) {
+ std::vector<uint64_t> Args;
+ auto *CBType = dyn_cast<IntegerType>(CB.getType());
+ if (!CBType || CBType->getBitWidth() > 64 || CB.arg_empty())
+ return CSInfo;
for (auto &&Arg : drop_begin(CB.args())) {
- auto *CI = dyn_cast<ConstantInt>(Arg);
- if (!CI || CI->getBitWidth() > 64)
- return CSInfo;
- Args.push_back(CI->getZExtValue());
- }
- return ConstCSInfo[Args];
-}
-
-void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB,
- unsigned *NumUnsafeUses) {
- auto &CSI = findCallSiteInfo(CB);
- CSI.AllCallSitesDevirted = false;
- CSI.CallSites.push_back({VTable, CB, NumUnsafeUses});
-}
-
-struct DevirtModule {
- Module &M;
- function_ref<AAResults &(Function &)> AARGetter;
- function_ref<DominatorTree &(Function &)> LookupDomTree;
-
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
-
- IntegerType *Int8Ty;
- PointerType *Int8PtrTy;
- IntegerType *Int32Ty;
- IntegerType *Int64Ty;
- IntegerType *IntPtrTy;
- /// Sizeless array type, used for imported vtables. This provides a signal
- /// to analyzers that these imports may alias, as they do for example
- /// when multiple unique return values occur in the same vtable.
- ArrayType *Int8Arr0Ty;
-
- bool RemarksEnabled;
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
-
- MapVector<VTableSlot, VTableSlotInfo> CallSlots;
-
- // This map keeps track of the number of "unsafe" uses of a loaded function
- // pointer. The key is the associated llvm.type.test intrinsic call generated
- // by this pass. An unsafe use is one that calls the loaded function pointer
- // directly. Every time we eliminate an unsafe use (for example, by
- // devirtualizing it or by applying virtual constant propagation), we
- // decrement the value stored in this map. If a value reaches zero, we can
- // eliminate the type check by RAUWing the associated llvm.type.test call with
- // true.
- std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
- PatternList FunctionsToSkip;
-
- DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree,
- ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
- : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree),
- ExportSummary(ExportSummary), ImportSummary(ImportSummary),
- Int8Ty(Type::getInt8Ty(M.getContext())),
- Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
- Int32Ty(Type::getInt32Ty(M.getContext())),
- Int64Ty(Type::getInt64Ty(M.getContext())),
- IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
- Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)),
- RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
- assert(!(ExportSummary && ImportSummary));
- FunctionsToSkip.init(SkipFunctionNames);
- }
-
- bool areRemarksEnabled();
-
- void
- scanTypeTestUsers(Function *TypeTestFunc,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
- void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
-
- void buildTypeIdentifierMap(
- std::vector<VTableBits> &Bits,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
- bool
- tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
- const std::set<TypeMemberInfo> &TypeMemberInfos,
- uint64_t ByteOffset);
-
- void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
- bool &IsExported);
- bool trySingleImplDevirt(ModuleSummaryIndex *ExportSummary,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res);
-
- void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
- bool &IsExported);
- void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot);
-
- bool tryEvaluateFunctionsWithArgs(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<uint64_t> Args);
-
- void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
- uint64_t TheRetVal);
- bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- CallSiteInfo &CSInfo,
- WholeProgramDevirtResolution::ByArg *Res);
-
- // Returns the global symbol name that is used to export information about the
- // given vtable slot and list of arguments.
- std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name);
-
- bool shouldExportConstantsAsAbsoluteSymbols();
-
- // This function is called during the export phase to create a symbol
- // definition containing information about the given vtable slot and list of
- // arguments.
- void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
- Constant *C);
- void exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
- uint32_t Const, uint32_t &Storage);
-
- // This function is called during the import phase to create a reference to
- // the symbol definition created during the export phase.
- Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name);
- Constant *importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, IntegerType *IntTy,
- uint32_t Storage);
-
- Constant *getMemberAddr(const TypeMemberInfo *M);
-
- void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
- Constant *UniqueMemberAddr);
- bool tryUniqueRetValOpt(unsigned BitWidth,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- CallSiteInfo &CSInfo,
- WholeProgramDevirtResolution::ByArg *Res,
- VTableSlot Slot, ArrayRef<uint64_t> Args);
-
- void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
- Constant *Byte, Constant *Bit);
- bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot);
-
- void rebuildGlobal(VTableBits &B);
-
- // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
- void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
-
- // If we were able to eliminate all unsafe uses for a type checked load,
- // eliminate the associated type tests by replacing them with true.
- void removeRedundantTypeTests();
-
- bool run();
-
- // Lower the module using the action and summary passed as command line
- // arguments. For testing purposes only.
- static bool
- runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree);
-};
-
-struct DevirtIndex {
- ModuleSummaryIndex &ExportSummary;
- // The set in which to record GUIDs exported from their module by
- // devirtualization, used by client to ensure they are not internalized.
- std::set<GlobalValue::GUID> &ExportedGUIDs;
- // A map in which to record the information necessary to locate the WPD
- // resolution for local targets in case they are exported by cross module
- // importing.
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap;
-
- MapVector<VTableSlotSummary, VTableSlotInfo> CallSlots;
-
- PatternList FunctionsToSkip;
-
- DevirtIndex(
- ModuleSummaryIndex &ExportSummary,
- std::set<GlobalValue::GUID> &ExportedGUIDs,
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap)
- : ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs),
- LocalWPDTargetsMap(LocalWPDTargetsMap) {
- FunctionsToSkip.init(SkipFunctionNames);
- }
-
- bool tryFindVirtualCallTargets(std::vector<ValueInfo> &TargetsForSlot,
- const TypeIdCompatibleVtableInfo TIdInfo,
- uint64_t ByteOffset);
-
- bool trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
- VTableSlotSummary &SlotSummary,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res,
- std::set<ValueInfo> &DevirtTargets);
-
- void run();
-};
-
-struct WholeProgramDevirt : public ModulePass {
- static char ID;
-
- bool UseCommandLine = false;
-
- ModuleSummaryIndex *ExportSummary = nullptr;
- const ModuleSummaryIndex *ImportSummary = nullptr;
-
- WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
- initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
- }
-
- WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
- : ModulePass(ID), ExportSummary(ExportSummary),
- ImportSummary(ImportSummary) {
- initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- // In the new pass manager, we can request the optimization
- // remark emitter pass on a per-function-basis, which the
- // OREGetter will do for us.
- // In the old pass manager, this is harder, so we just build
- // an optimization remark emitter on the fly, when we need it.
- std::unique_ptr<OptimizationRemarkEmitter> ORE;
- auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
- ORE = std::make_unique<OptimizationRemarkEmitter>(F);
- return *ORE;
- };
-
- auto LookupDomTree = [this](Function &F) -> DominatorTree & {
- return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
-
- if (UseCommandLine)
- return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter,
- LookupDomTree);
-
- return DevirtModule(M, LegacyAARGetter(*this), OREGetter, LookupDomTree,
- ExportSummary, ImportSummary)
- .run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
-char WholeProgramDevirt::ID = 0;
-
-ModulePass *
-llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary) {
- return new WholeProgramDevirt(ExportSummary, ImportSummary);
-}
-
-PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto AARGetter = [&](Function &F) -> AAResults & {
- return FAM.getResult<AAManager>(F);
- };
- auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
- };
- auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
- return FAM.getResult<DominatorTreeAnalysis>(F);
- };
+ auto *CI = dyn_cast<ConstantInt>(Arg);
+ if (!CI || CI->getBitWidth() > 64)
+ return CSInfo;
+ Args.push_back(CI->getZExtValue());
+ }
+ return ConstCSInfo[Args];
+}
+
+void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB,
+ unsigned *NumUnsafeUses) {
+ auto &CSI = findCallSiteInfo(CB);
+ CSI.AllCallSitesDevirted = false;
+ CSI.CallSites.push_back({VTable, CB, NumUnsafeUses});
+}
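+
+// Illustrative example of the grouping performed by findCallSiteInfo above:
+// a virtual call whose return type is an integer no wider than 64 bits and
+// whose arguments after "this" are all constant integers, e.g. p->f(1, 2),
+// is filed under ConstCSInfo[{1, 2}]; any other call site for the slot falls
+// back to the shared CSInfo bucket.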
+
+struct DevirtModule {
+ Module &M;
+ function_ref<AAResults &(Function &)> AARGetter;
+ function_ref<DominatorTree &(Function &)> LookupDomTree;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ IntegerType *Int8Ty;
+ PointerType *Int8PtrTy;
+ IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
+ /// Sizeless array type, used for imported vtables. This provides a signal
+ /// to analyzers that these imports may alias, as they do for example
+ /// when multiple unique return values occur in the same vtable.
+ ArrayType *Int8Arr0Ty;
+
+ bool RemarksEnabled;
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
+
+ MapVector<VTableSlot, VTableSlotInfo> CallSlots;
+
+ // This map keeps track of the number of "unsafe" uses of a loaded function
+ // pointer. The key is the associated llvm.type.test intrinsic call generated
+ // by this pass. An unsafe use is one that calls the loaded function pointer
+ // directly. Every time we eliminate an unsafe use (for example, by
+ // devirtualizing it or by applying virtual constant propagation), we
+ // decrement the value stored in this map. If a value reaches zero, we can
+ // eliminate the type check by RAUWing the associated llvm.type.test call with
+ // true.
+ std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
+ PatternList FunctionsToSkip;
+
+ DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree),
+ ExportSummary(ExportSummary), ImportSummary(ImportSummary),
+ Int8Ty(Type::getInt8Ty(M.getContext())),
+ Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
+ Int32Ty(Type::getInt32Ty(M.getContext())),
+ Int64Ty(Type::getInt64Ty(M.getContext())),
+ IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
+ Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)),
+ RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
+ assert(!(ExportSummary && ImportSummary));
+ FunctionsToSkip.init(SkipFunctionNames);
+ }
+
+ bool areRemarksEnabled();
+
+ void
+ scanTypeTestUsers(Function *TypeTestFunc,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+ void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
+
+ void buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+ bool
+ tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos,
+ uint64_t ByteOffset);
+
+ void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
+ bool &IsExported);
+ bool trySingleImplDevirt(ModuleSummaryIndex *ExportSummary,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res);
+
+ void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
+ bool &IsExported);
+ void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
+
+ bool tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<uint64_t> Args);
+
+ void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal);
+ bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res);
+
+ // Returns the global symbol name that is used to export information about the
+ // given vtable slot and list of arguments.
+ std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name);
+
+ bool shouldExportConstantsAsAbsoluteSymbols();
+
+ // This function is called during the export phase to create a symbol
+ // definition containing information about the given vtable slot and list of
+ // arguments.
+ void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+ Constant *C);
+ void exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+ uint32_t Const, uint32_t &Storage);
+
+ // This function is called during the import phase to create a reference to
+ // the symbol definition created during the export phase.
+ Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name);
+ Constant *importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, IntegerType *IntTy,
+ uint32_t Storage);
+
+ Constant *getMemberAddr(const TypeMemberInfo *M);
+
+ void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
+ Constant *UniqueMemberAddr);
+ bool tryUniqueRetValOpt(unsigned BitWidth,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args);
+
+ void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit);
+ bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
+
+ void rebuildGlobal(VTableBits &B);
+
+ // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
+ void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
+
+ // If we were able to eliminate all unsafe uses for a type checked load,
+ // eliminate the associated type tests by replacing them with true.
+ void removeRedundantTypeTests();
+
+ bool run();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool
+ runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ function_ref<DominatorTree &(Function &)> LookupDomTree);
+};
+
+struct DevirtIndex {
+ ModuleSummaryIndex &ExportSummary;
+ // The set in which to record GUIDs exported from their module by
+ // devirtualization, used by client to ensure they are not internalized.
+ std::set<GlobalValue::GUID> &ExportedGUIDs;
+ // A map in which to record the information necessary to locate the WPD
+ // resolution for local targets in case they are exported by cross module
+ // importing.
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap;
+
+ MapVector<VTableSlotSummary, VTableSlotInfo> CallSlots;
+
+ PatternList FunctionsToSkip;
+
+ DevirtIndex(
+ ModuleSummaryIndex &ExportSummary,
+ std::set<GlobalValue::GUID> &ExportedGUIDs,
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap)
+ : ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs),
+ LocalWPDTargetsMap(LocalWPDTargetsMap) {
+ FunctionsToSkip.init(SkipFunctionNames);
+ }
+
+ bool tryFindVirtualCallTargets(std::vector<ValueInfo> &TargetsForSlot,
+ const TypeIdCompatibleVtableInfo TIdInfo,
+ uint64_t ByteOffset);
+
+ bool trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
+ VTableSlotSummary &SlotSummary,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res,
+ std::set<ValueInfo> &DevirtTargets);
+
+ void run();
+};
+
+struct WholeProgramDevirt : public ModulePass {
+ static char ID;
+
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary = nullptr;
+ const ModuleSummaryIndex *ImportSummary = nullptr;
+
+ WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // In the new pass manager, we can request the optimization
+ // remark emitter pass on a per-function-basis, which the
+ // OREGetter will do for us.
+ // In the old pass manager, this is harder, so we just build
+ // an optimization remark emitter on the fly, when we need it.
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+ ORE = std::make_unique<OptimizationRemarkEmitter>(F);
+ return *ORE;
+ };
+
+ auto LookupDomTree = [this](Function &F) -> DominatorTree & {
+ return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+
+ if (UseCommandLine)
+ return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter,
+ LookupDomTree);
+
+ return DevirtModule(M, LegacyAARGetter(*this), OREGetter, LookupDomTree,
+ ExportSummary, ImportSummary)
+ .run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+char WholeProgramDevirt::ID = 0;
+
+ModulePass *
+llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new WholeProgramDevirt(ExportSummary, ImportSummary);
+}
+
+PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+ auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+ return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+ };
+ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ };
if (UseCommandLine) {
if (DevirtModule::runForTesting(M, AARGetter, OREGetter, LookupDomTree))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
- if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary,
- ImportSummary)
- .run())
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-// Enable whole program visibility if enabled by client (e.g. linker) or
-// internal option, and not force disabled.
-static bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) {
- return (WholeProgramVisibilityEnabledInLTO || WholeProgramVisibility) &&
- !DisableWholeProgramVisibility;
-}
-
-namespace llvm {
-
-/// If whole program visibility asserted, then upgrade all public vcall
-/// visibility metadata on vtable definitions to linkage unit visibility in
-/// Module IR (for regular or hybrid LTO).
-void updateVCallVisibilityInModule(Module &M,
- bool WholeProgramVisibilityEnabledInLTO) {
- if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
- return;
- for (GlobalVariable &GV : M.globals())
- // Add linkage unit visibility to any variable with type metadata, which are
- // the vtable definitions. We won't have an existing vcall_visibility
- // metadata on vtable definitions with public visibility.
- if (GV.hasMetadata(LLVMContext::MD_type) &&
- GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
- GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
-}
-
-/// If whole program visibility asserted, then upgrade all public vcall
-/// visibility metadata on vtable definition summaries to linkage unit
-/// visibility in Module summary index (for ThinLTO).
-void updateVCallVisibilityInIndex(ModuleSummaryIndex &Index,
- bool WholeProgramVisibilityEnabledInLTO) {
- if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
- return;
- for (auto &P : Index) {
- for (auto &S : P.second.SummaryList) {
- auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
- if (!GVar || GVar->vTableFuncs().empty() ||
- GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
- continue;
- GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
- }
- }
-}
-
-void runWholeProgramDevirtOnIndex(
- ModuleSummaryIndex &Summary, std::set<GlobalValue::GUID> &ExportedGUIDs,
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
- DevirtIndex(Summary, ExportedGUIDs, LocalWPDTargetsMap).run();
-}
-
-void updateIndexWPDForExports(
- ModuleSummaryIndex &Summary,
- function_ref<bool(StringRef, ValueInfo)> isExported,
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
- for (auto &T : LocalWPDTargetsMap) {
- auto &VI = T.first;
- // This was enforced earlier during trySingleImplDevirt.
- assert(VI.getSummaryList().size() == 1 &&
- "Devirt of local target has more than one copy");
- auto &S = VI.getSummaryList()[0];
- if (!isExported(S->modulePath(), VI))
- continue;
-
- // It's been exported by a cross module import.
- for (auto &SlotSummary : T.second) {
- auto *TIdSum = Summary.getTypeIdSummary(SlotSummary.TypeID);
- assert(TIdSum);
- auto WPDRes = TIdSum->WPDRes.find(SlotSummary.ByteOffset);
- assert(WPDRes != TIdSum->WPDRes.end());
- WPDRes->second.SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
- WPDRes->second.SingleImplName,
- Summary.getModuleHash(S->modulePath()));
- }
- }
-}
-
-} // end namespace llvm
-
-static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
- // Check that summary index contains regular LTO module when performing
- // export to prevent occasional use of index from pure ThinLTO compilation
- // (-fno-split-lto-module). This kind of summary index is passed to
- // DevirtIndex::run, not to DevirtModule::run used by opt/runForTesting.
- const auto &ModPaths = Summary->modulePaths();
- if (ClSummaryAction != PassSummaryAction::Import &&
- ModPaths.find(ModuleSummaryIndex::getRegularLTOModuleName()) ==
- ModPaths.end())
- return createStringError(
- errc::invalid_argument,
- "combined summary should contain Regular LTO module");
- return ErrorSuccess();
-}
-
-bool DevirtModule::runForTesting(
- Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- std::unique_ptr<ModuleSummaryIndex> Summary =
- std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
-
- // Handle the command-line summary arguments. This code is for testing
- // purposes only, so we handle errors directly.
- if (!ClReadSummary.empty()) {
- ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
- ": ");
- auto ReadSummaryFile =
- ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
- if (Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr =
- getModuleSummaryIndex(*ReadSummaryFile)) {
- Summary = std::move(*SummaryOrErr);
- ExitOnErr(checkCombinedSummaryForTesting(Summary.get()));
- } else {
- // Try YAML if we've failed with bitcode.
- consumeError(SummaryOrErr.takeError());
- yaml::Input In(ReadSummaryFile->getBuffer());
- In >> *Summary;
- ExitOnErr(errorCodeToError(In.error()));
- }
- }
-
- bool Changed =
- DevirtModule(M, AARGetter, OREGetter, LookupDomTree,
- ClSummaryAction == PassSummaryAction::Export ? Summary.get()
- : nullptr,
- ClSummaryAction == PassSummaryAction::Import ? Summary.get()
- : nullptr)
- .run();
-
- if (!ClWriteSummary.empty()) {
- ExitOnError ExitOnErr(
- "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
- std::error_code EC;
- if (StringRef(ClWriteSummary).endswith(".bc")) {
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_None);
- ExitOnErr(errorCodeToError(EC));
- WriteIndexToFile(*Summary, OS);
- } else {
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
- ExitOnErr(errorCodeToError(EC));
- yaml::Output Out(OS);
- Out << *Summary;
- }
- }
-
- return Changed;
-}
-
-void DevirtModule::buildTypeIdentifierMap(
- std::vector<VTableBits> &Bits,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
- DenseMap<GlobalVariable *, VTableBits *> GVToBits;
- Bits.reserve(M.getGlobalList().size());
- SmallVector<MDNode *, 2> Types;
- for (GlobalVariable &GV : M.globals()) {
- Types.clear();
- GV.getMetadata(LLVMContext::MD_type, Types);
- if (GV.isDeclaration() || Types.empty())
- continue;
-
- VTableBits *&BitsPtr = GVToBits[&GV];
- if (!BitsPtr) {
- Bits.emplace_back();
- Bits.back().GV = &GV;
- Bits.back().ObjectSize =
- M.getDataLayout().getTypeAllocSize(GV.getInitializer()->getType());
- BitsPtr = &Bits.back();
- }
-
- for (MDNode *Type : Types) {
- auto TypeID = Type->getOperand(1).get();
-
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
-
- TypeIdMap[TypeID].insert({BitsPtr, Offset});
- }
- }
-}
-
-bool DevirtModule::tryFindVirtualCallTargets(
- std::vector<VirtualCallTarget> &TargetsForSlot,
- const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
- for (const TypeMemberInfo &TM : TypeMemberInfos) {
- if (!TM.Bits->GV->isConstant())
- return false;
-
- // We cannot perform whole program devirtualization analysis on a vtable
- // with public LTO visibility.
- if (TM.Bits->GV->getVCallVisibility() ==
- GlobalObject::VCallVisibilityPublic)
- return false;
-
- Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
- TM.Offset + ByteOffset, M);
- if (!Ptr)
- return false;
-
- auto Fn = dyn_cast<Function>(Ptr->stripPointerCasts());
- if (!Fn)
- return false;
-
- if (FunctionsToSkip.match(Fn->getName()))
- return false;
-
- // We can disregard __cxa_pure_virtual as a possible call target, as
- // calls to pure virtuals are UB.
- if (Fn->getName() == "__cxa_pure_virtual")
- continue;
-
- TargetsForSlot.push_back({Fn, &TM});
- }
-
- // Give up if we couldn't find any targets.
- return !TargetsForSlot.empty();
-}
-
-bool DevirtIndex::tryFindVirtualCallTargets(
- std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo,
- uint64_t ByteOffset) {
- for (const TypeIdOffsetVtableInfo &P : TIdInfo) {
- // Find the first non-available_externally linkage vtable initializer.
- // We can have multiple available_externally, linkonce_odr and weak_odr
- // vtable initializers, however we want to skip available_externally as they
- // do not have type metadata attached, and therefore the summary will not
- // contain any vtable functions. We can also have multiple external
- // vtable initializers in the case of comdats, which we cannot check here.
- // The linker should give an error in this case.
- //
- // Also, handle the case of same-named local Vtables with the same path
- // and therefore the same GUID. This can happen if there isn't enough
- // distinguishing path when compiling the source file. In that case we
- // conservatively return false early.
- const GlobalVarSummary *VS = nullptr;
- bool LocalFound = false;
- for (auto &S : P.VTableVI.getSummaryList()) {
- if (GlobalValue::isLocalLinkage(S->linkage())) {
- if (LocalFound)
- return false;
- LocalFound = true;
- }
- if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) {
- VS = cast<GlobalVarSummary>(S->getBaseObject());
- // We cannot perform whole program devirtualization analysis on a vtable
- // with public LTO visibility.
- if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
- return false;
- }
- }
- if (!VS->isLive())
- continue;
- for (auto VTP : VS->vTableFuncs()) {
- if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset)
- continue;
-
- TargetsForSlot.push_back(VTP.FuncVI);
- }
- }
-
- // Give up if we couldn't find any targets.
- return !TargetsForSlot.empty();
-}
-
-void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
- Constant *TheFn, bool &IsExported) {
+ if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary,
+ ImportSummary)
+ .run())
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+// Enable whole program visibility if enabled by client (e.g. linker) or
+// internal option, and not force disabled.
+static bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) {
+ return (WholeProgramVisibilityEnabledInLTO || WholeProgramVisibility) &&
+ !DisableWholeProgramVisibility;
+}
+
+namespace llvm {
+
+/// If whole program visibility asserted, then upgrade all public vcall
+/// visibility metadata on vtable definitions to linkage unit visibility in
+/// Module IR (for regular or hybrid LTO).
+void updateVCallVisibilityInModule(Module &M,
+ bool WholeProgramVisibilityEnabledInLTO) {
+ if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
+ return;
+ for (GlobalVariable &GV : M.globals())
+    // Add linkage unit visibility to any variables with type metadata, which
+    // are the vtable definitions. We won't have existing vcall_visibility
+    // metadata on vtable definitions with public visibility.
+ if (GV.hasMetadata(LLVMContext::MD_type) &&
+ GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
+ GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
+}
+
+/// If whole program visibility asserted, then upgrade all public vcall
+/// visibility metadata on vtable definition summaries to linkage unit
+/// visibility in Module summary index (for ThinLTO).
+void updateVCallVisibilityInIndex(ModuleSummaryIndex &Index,
+ bool WholeProgramVisibilityEnabledInLTO) {
+ if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
+ return;
+ for (auto &P : Index) {
+ for (auto &S : P.second.SummaryList) {
+ auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
+ if (!GVar || GVar->vTableFuncs().empty() ||
+ GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
+ continue;
+ GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
+ }
+ }
+}
+
+void runWholeProgramDevirtOnIndex(
+ ModuleSummaryIndex &Summary, std::set<GlobalValue::GUID> &ExportedGUIDs,
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
+ DevirtIndex(Summary, ExportedGUIDs, LocalWPDTargetsMap).run();
+}
+
+void updateIndexWPDForExports(
+ ModuleSummaryIndex &Summary,
+ function_ref<bool(StringRef, ValueInfo)> isExported,
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
+ for (auto &T : LocalWPDTargetsMap) {
+ auto &VI = T.first;
+ // This was enforced earlier during trySingleImplDevirt.
+ assert(VI.getSummaryList().size() == 1 &&
+ "Devirt of local target has more than one copy");
+ auto &S = VI.getSummaryList()[0];
+ if (!isExported(S->modulePath(), VI))
+ continue;
+
+ // It's been exported by a cross module import.
+ for (auto &SlotSummary : T.second) {
+ auto *TIdSum = Summary.getTypeIdSummary(SlotSummary.TypeID);
+ assert(TIdSum);
+ auto WPDRes = TIdSum->WPDRes.find(SlotSummary.ByteOffset);
+ assert(WPDRes != TIdSum->WPDRes.end());
+ WPDRes->second.SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
+ WPDRes->second.SingleImplName,
+ Summary.getModuleHash(S->modulePath()));
+ }
+ }
+}
+
+} // end namespace llvm
+
+static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
+  // Check that the summary index contains a regular LTO module when
+  // performing an export, to prevent accidental use of an index from a pure
+  // ThinLTO compilation (-fno-split-lto-module). This kind of summary index
+  // is passed to DevirtIndex::run, not to DevirtModule::run used by
+  // opt/runForTesting.
+ const auto &ModPaths = Summary->modulePaths();
+ if (ClSummaryAction != PassSummaryAction::Import &&
+ ModPaths.find(ModuleSummaryIndex::getRegularLTOModuleName()) ==
+ ModPaths.end())
+ return createStringError(
+ errc::invalid_argument,
+ "combined summary should contain Regular LTO module");
+ return ErrorSuccess();
+}
+
+bool DevirtModule::runForTesting(
+ Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ std::unique_ptr<ModuleSummaryIndex> Summary =
+ std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+ if (Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr =
+ getModuleSummaryIndex(*ReadSummaryFile)) {
+ Summary = std::move(*SummaryOrErr);
+ ExitOnErr(checkCombinedSummaryForTesting(Summary.get()));
+ } else {
+ // Try YAML if we've failed with bitcode.
+ consumeError(SummaryOrErr.takeError());
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> *Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+ }
+
+ bool Changed =
+ DevirtModule(M, AARGetter, OREGetter, LookupDomTree,
+ ClSummaryAction == PassSummaryAction::Export ? Summary.get()
+ : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? Summary.get()
+ : nullptr)
+ .run();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr(
+ "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
+ std::error_code EC;
+ if (StringRef(ClWriteSummary).endswith(".bc")) {
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_None);
+ ExitOnErr(errorCodeToError(EC));
+ WriteIndexToFile(*Summary, OS);
+ } else {
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
+ ExitOnErr(errorCodeToError(EC));
+ yaml::Output Out(OS);
+ Out << *Summary;
+ }
+ }
+
+ return Changed;
+}
+
+void DevirtModule::buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
+ DenseMap<GlobalVariable *, VTableBits *> GVToBits;
+ Bits.reserve(M.getGlobalList().size());
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalVariable &GV : M.globals()) {
+ Types.clear();
+ GV.getMetadata(LLVMContext::MD_type, Types);
+ if (GV.isDeclaration() || Types.empty())
+ continue;
+
+ VTableBits *&BitsPtr = GVToBits[&GV];
+ if (!BitsPtr) {
+ Bits.emplace_back();
+ Bits.back().GV = &GV;
+ Bits.back().ObjectSize =
+ M.getDataLayout().getTypeAllocSize(GV.getInitializer()->getType());
+ BitsPtr = &Bits.back();
+ }
+
+ for (MDNode *Type : Types) {
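+      // Each !type annotation is a (byte offset, type identifier) pair:
+      // operand 0 holds the offset of the address point and operand 1 the
+      // type id.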
+ auto TypeID = Type->getOperand(1).get();
+
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+
+ TypeIdMap[TypeID].insert({BitsPtr, Offset});
+ }
+ }
+}
+
+bool DevirtModule::tryFindVirtualCallTargets(
+ std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
+ for (const TypeMemberInfo &TM : TypeMemberInfos) {
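+    // Only constant vtables can be analyzed: reading a function pointer out
+    // of the initializer is only valid if the vtable is never written.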
+ if (!TM.Bits->GV->isConstant())
+ return false;
+
+ // We cannot perform whole program devirtualization analysis on a vtable
+ // with public LTO visibility.
+ if (TM.Bits->GV->getVCallVisibility() ==
+ GlobalObject::VCallVisibilityPublic)
+ return false;
+
+ Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
+ TM.Offset + ByteOffset, M);
+ if (!Ptr)
+ return false;
+
+ auto Fn = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!Fn)
+ return false;
+
+ if (FunctionsToSkip.match(Fn->getName()))
+ return false;
+
+ // We can disregard __cxa_pure_virtual as a possible call target, as
+ // calls to pure virtuals are UB.
+ if (Fn->getName() == "__cxa_pure_virtual")
+ continue;
+
+ TargetsForSlot.push_back({Fn, &TM});
+ }
+
+ // Give up if we couldn't find any targets.
+ return !TargetsForSlot.empty();
+}
+
+bool DevirtIndex::tryFindVirtualCallTargets(
+ std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo,
+ uint64_t ByteOffset) {
+ for (const TypeIdOffsetVtableInfo &P : TIdInfo) {
+ // Find the first non-available_externally linkage vtable initializer.
+ // We can have multiple available_externally, linkonce_odr and weak_odr
+    // vtable initializers; however, we want to skip available_externally as they
+ // do not have type metadata attached, and therefore the summary will not
+ // contain any vtable functions. We can also have multiple external
+ // vtable initializers in the case of comdats, which we cannot check here.
+ // The linker should give an error in this case.
+ //
+ // Also, handle the case of same-named local Vtables with the same path
+ // and therefore the same GUID. This can happen if there isn't enough
+ // distinguishing path when compiling the source file. In that case we
+ // conservatively return false early.
+ const GlobalVarSummary *VS = nullptr;
+ bool LocalFound = false;
+ for (auto &S : P.VTableVI.getSummaryList()) {
+ if (GlobalValue::isLocalLinkage(S->linkage())) {
+ if (LocalFound)
+ return false;
+ LocalFound = true;
+ }
+ if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) {
+ VS = cast<GlobalVarSummary>(S->getBaseObject());
+ // We cannot perform whole program devirtualization analysis on a vtable
+ // with public LTO visibility.
+ if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
+ return false;
+ }
+ }
+ if (!VS->isLive())
+ continue;
+ for (auto VTP : VS->vTableFuncs()) {
+ if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset)
+ continue;
+
+ TargetsForSlot.push_back(VTP.FuncVI);
+ }
+ }
+
+ // Give up if we couldn't find any targets.
+ return !TargetsForSlot.empty();
+}
+
+void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
+ Constant *TheFn, bool &IsExported) {
// Don't devirtualize function if we're told to skip it
// in -wholeprogramdevirt-skip.
if (FunctionsToSkip.match(TheFn->stripPointerCasts()->getName()))
return;
- auto Apply = [&](CallSiteInfo &CSInfo) {
- for (auto &&VCallSite : CSInfo.CallSites) {
- if (RemarksEnabled)
- VCallSite.emitRemark("single-impl",
- TheFn->stripPointerCasts()->getName(), OREGetter);
- VCallSite.CB.setCalledOperand(ConstantExpr::getBitCast(
- TheFn, VCallSite.CB.getCalledOperand()->getType()));
- // This use is no longer unsafe.
- if (VCallSite.NumUnsafeUses)
- --*VCallSite.NumUnsafeUses;
- }
- if (CSInfo.isExported())
- IsExported = true;
- CSInfo.markDevirt();
- };
- Apply(SlotInfo.CSInfo);
- for (auto &P : SlotInfo.ConstCSInfo)
- Apply(P.second);
-}
-
-static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
- // We can't add calls if we haven't seen a definition
- if (Callee.getSummaryList().empty())
- return false;
-
- // Insert calls into the summary index so that the devirtualized targets
- // are eligible for import.
- // FIXME: Annotate type tests with hotness. For now, mark these as hot
- // to better ensure we have the opportunity to inline them.
- bool IsExported = false;
- auto &S = Callee.getSummaryList()[0];
- CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0);
- auto AddCalls = [&](CallSiteInfo &CSInfo) {
- for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) {
- FS->addCall({Callee, CI});
- IsExported |= S->modulePath() != FS->modulePath();
- }
- for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) {
- FS->addCall({Callee, CI});
- IsExported |= S->modulePath() != FS->modulePath();
- }
- };
- AddCalls(SlotInfo.CSInfo);
- for (auto &P : SlotInfo.ConstCSInfo)
- AddCalls(P.second);
- return IsExported;
-}
-
-bool DevirtModule::trySingleImplDevirt(
- ModuleSummaryIndex *ExportSummary,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res) {
- // See if the program contains a single implementation of this virtual
- // function.
- Function *TheFn = TargetsForSlot[0].Fn;
- for (auto &&Target : TargetsForSlot)
- if (TheFn != Target.Fn)
- return false;
-
- // If so, update each call site to call that implementation directly.
- if (RemarksEnabled)
- TargetsForSlot[0].WasDevirt = true;
-
- bool IsExported = false;
- applySingleImplDevirt(SlotInfo, TheFn, IsExported);
- if (!IsExported)
- return false;
-
- // If the only implementation has local linkage, we must promote to external
- // to make it visible to thin LTO objects. We can only get here during the
- // ThinLTO export phase.
- if (TheFn->hasLocalLinkage()) {
- std::string NewName = (TheFn->getName() + "$merged").str();
-
- // Since we are renaming the function, any comdats with the same name must
- // also be renamed. This is required when targeting COFF, as the comdat name
- // must match one of the names of the symbols in the comdat.
- if (Comdat *C = TheFn->getComdat()) {
- if (C->getName() == TheFn->getName()) {
- Comdat *NewC = M.getOrInsertComdat(NewName);
- NewC->setSelectionKind(C->getSelectionKind());
- for (GlobalObject &GO : M.global_objects())
- if (GO.getComdat() == C)
- GO.setComdat(NewC);
- }
- }
-
- TheFn->setLinkage(GlobalValue::ExternalLinkage);
- TheFn->setVisibility(GlobalValue::HiddenVisibility);
- TheFn->setName(NewName);
- }
- if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID()))
- // Any needed promotion of 'TheFn' has already been done during
-    // LTO unit split, so we can ignore the return value of AddCalls.
- AddCalls(SlotInfo, TheFnVI);
-
- Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
- Res->SingleImplName = std::string(TheFn->getName());
-
- return true;
-}
-
-bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
- VTableSlotSummary &SlotSummary,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res,
- std::set<ValueInfo> &DevirtTargets) {
- // See if the program contains a single implementation of this virtual
- // function.
- auto TheFn = TargetsForSlot[0];
- for (auto &&Target : TargetsForSlot)
- if (TheFn != Target)
- return false;
-
-  // Don't devirtualize if we don't have a target definition.
- auto Size = TheFn.getSummaryList().size();
- if (!Size)
- return false;
-
- // Don't devirtualize function if we're told to skip it
- // in -wholeprogramdevirt-skip.
- if (FunctionsToSkip.match(TheFn.name()))
- return false;
-
- // If the summary list contains multiple summaries where at least one is
- // a local, give up, as we won't know which (possibly promoted) name to use.
- for (auto &S : TheFn.getSummaryList())
- if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1)
- return false;
-
- // Collect functions devirtualized at least for one call site for stats.
- if (PrintSummaryDevirt)
- DevirtTargets.insert(TheFn);
-
- auto &S = TheFn.getSummaryList()[0];
- bool IsExported = AddCalls(SlotInfo, TheFn);
- if (IsExported)
- ExportedGUIDs.insert(TheFn.getGUID());
-
- // Record in summary for use in devirtualization during the ThinLTO import
- // step.
- Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
- if (GlobalValue::isLocalLinkage(S->linkage())) {
- if (IsExported)
-      // If the target is a local function and we are exporting it by
- // devirtualizing a call in another module, we need to record the
- // promoted name.
- Res->SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
- TheFn.name(), ExportSummary.getModuleHash(S->modulePath()));
- else {
- LocalWPDTargetsMap[TheFn].push_back(SlotSummary);
- Res->SingleImplName = std::string(TheFn.name());
- }
- } else
- Res->SingleImplName = std::string(TheFn.name());
-
-  // Name will be empty if this thin link is driven off of a serialized
-  // combined index (e.g. llvm-lto). However, WPD is not supported/invoked for
-  // the legacy LTO API anyway.
- assert(!Res->SingleImplName.empty());
-
- return true;
-}
-
-void DevirtModule::tryICallBranchFunnel(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot) {
- Triple T(M.getTargetTriple());
- if (T.getArch() != Triple::x86_64)
- return;
-
- if (TargetsForSlot.size() > ClThreshold)
- return;
-
- bool HasNonDevirt = !SlotInfo.CSInfo.AllCallSitesDevirted;
- if (!HasNonDevirt)
- for (auto &P : SlotInfo.ConstCSInfo)
- if (!P.second.AllCallSitesDevirted) {
- HasNonDevirt = true;
- break;
- }
-
- if (!HasNonDevirt)
- return;
-
- FunctionType *FT =
- FunctionType::get(Type::getVoidTy(M.getContext()), {Int8PtrTy}, true);
- Function *JT;
- if (isa<MDString>(Slot.TypeID)) {
- JT = Function::Create(FT, Function::ExternalLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- getGlobalName(Slot, {}, "branch_funnel"), &M);
- JT->setVisibility(GlobalValue::HiddenVisibility);
- } else {
- JT = Function::Create(FT, Function::InternalLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- "branch_funnel", &M);
- }
- JT->addAttribute(1, Attribute::Nest);
-
- std::vector<Value *> JTArgs;
- JTArgs.push_back(JT->arg_begin());
- for (auto &T : TargetsForSlot) {
- JTArgs.push_back(getMemberAddr(T.TM));
- JTArgs.push_back(T.Fn);
- }
-
- BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr);
- Function *Intr =
- Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {});
-
- auto *CI = CallInst::Create(Intr, JTArgs, "", BB);
- CI->setTailCallKind(CallInst::TCK_MustTail);
- ReturnInst::Create(M.getContext(), nullptr, BB);
-
- bool IsExported = false;
- applyICallBranchFunnel(SlotInfo, JT, IsExported);
- if (IsExported)
- Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
-}
-
-void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
- Constant *JT, bool &IsExported) {
- auto Apply = [&](CallSiteInfo &CSInfo) {
- if (CSInfo.isExported())
- IsExported = true;
- if (CSInfo.AllCallSitesDevirted)
- return;
- for (auto &&VCallSite : CSInfo.CallSites) {
- CallBase &CB = VCallSite.CB;
-
- // Jump tables are only profitable if the retpoline mitigation is enabled.
- Attribute FSAttr = CB.getCaller()->getFnAttribute("target-features");
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ if (RemarksEnabled)
+ VCallSite.emitRemark("single-impl",
+ TheFn->stripPointerCasts()->getName(), OREGetter);
+ VCallSite.CB.setCalledOperand(ConstantExpr::getBitCast(
+ TheFn, VCallSite.CB.getCalledOperand()->getType()));
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ if (CSInfo.isExported())
+ IsExported = true;
+ CSInfo.markDevirt();
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
+
+static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
+ // We can't add calls if we haven't seen a definition
+ if (Callee.getSummaryList().empty())
+ return false;
+
+ // Insert calls into the summary index so that the devirtualized targets
+ // are eligible for import.
+ // FIXME: Annotate type tests with hotness. For now, mark these as hot
+ // to better ensure we have the opportunity to inline them.
+ bool IsExported = false;
+ auto &S = Callee.getSummaryList()[0];
+ CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0);
+ auto AddCalls = [&](CallSiteInfo &CSInfo) {
+ for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) {
+ FS->addCall({Callee, CI});
+ IsExported |= S->modulePath() != FS->modulePath();
+ }
+ for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) {
+ FS->addCall({Callee, CI});
+ IsExported |= S->modulePath() != FS->modulePath();
+ }
+ };
+ AddCalls(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ AddCalls(P.second);
+ return IsExported;
+}
+
+bool DevirtModule::trySingleImplDevirt(
+ ModuleSummaryIndex *ExportSummary,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res) {
+ // See if the program contains a single implementation of this virtual
+ // function.
+ Function *TheFn = TargetsForSlot[0].Fn;
+ for (auto &&Target : TargetsForSlot)
+ if (TheFn != Target.Fn)
+ return false;
+
+ // If so, update each call site to call that implementation directly.
+ if (RemarksEnabled)
+ TargetsForSlot[0].WasDevirt = true;
+
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, TheFn, IsExported);
+ if (!IsExported)
+ return false;
+
+ // If the only implementation has local linkage, we must promote to external
+ // to make it visible to thin LTO objects. We can only get here during the
+ // ThinLTO export phase.
+ if (TheFn->hasLocalLinkage()) {
+ std::string NewName = (TheFn->getName() + "$merged").str();
+
+ // Since we are renaming the function, any comdats with the same name must
+ // also be renamed. This is required when targeting COFF, as the comdat name
+ // must match one of the names of the symbols in the comdat.
+ if (Comdat *C = TheFn->getComdat()) {
+ if (C->getName() == TheFn->getName()) {
+ Comdat *NewC = M.getOrInsertComdat(NewName);
+ NewC->setSelectionKind(C->getSelectionKind());
+ for (GlobalObject &GO : M.global_objects())
+ if (GO.getComdat() == C)
+ GO.setComdat(NewC);
+ }
+ }
+
+ TheFn->setLinkage(GlobalValue::ExternalLinkage);
+ TheFn->setVisibility(GlobalValue::HiddenVisibility);
+ TheFn->setName(NewName);
+ }
+ if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID()))
+ // Any needed promotion of 'TheFn' has already been done during
+    // LTO unit split, so we can ignore the return value of AddCalls.
+ AddCalls(SlotInfo, TheFnVI);
+
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+ Res->SingleImplName = std::string(TheFn->getName());
+
+ return true;
+}
+
+bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
+ VTableSlotSummary &SlotSummary,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res,
+ std::set<ValueInfo> &DevirtTargets) {
+ // See if the program contains a single implementation of this virtual
+ // function.
+ auto TheFn = TargetsForSlot[0];
+ for (auto &&Target : TargetsForSlot)
+ if (TheFn != Target)
+ return false;
+
+  // Don't devirtualize if we don't have a target definition.
+ auto Size = TheFn.getSummaryList().size();
+ if (!Size)
+ return false;
+
+ // Don't devirtualize function if we're told to skip it
+ // in -wholeprogramdevirt-skip.
+ if (FunctionsToSkip.match(TheFn.name()))
+ return false;
+
+ // If the summary list contains multiple summaries where at least one is
+ // a local, give up, as we won't know which (possibly promoted) name to use.
+ for (auto &S : TheFn.getSummaryList())
+ if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1)
+ return false;
+
+ // Collect functions devirtualized at least for one call site for stats.
+ if (PrintSummaryDevirt)
+ DevirtTargets.insert(TheFn);
+
+ auto &S = TheFn.getSummaryList()[0];
+ bool IsExported = AddCalls(SlotInfo, TheFn);
+ if (IsExported)
+ ExportedGUIDs.insert(TheFn.getGUID());
+
+ // Record in summary for use in devirtualization during the ThinLTO import
+ // step.
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+ if (GlobalValue::isLocalLinkage(S->linkage())) {
+ if (IsExported)
+      // If the target is a local function and we are exporting it by
+ // devirtualizing a call in another module, we need to record the
+ // promoted name.
+ Res->SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
+ TheFn.name(), ExportSummary.getModuleHash(S->modulePath()));
+ else {
+ LocalWPDTargetsMap[TheFn].push_back(SlotSummary);
+ Res->SingleImplName = std::string(TheFn.name());
+ }
+ } else
+ Res->SingleImplName = std::string(TheFn.name());
+
+  // Name will be empty if this thin link is driven off of a serialized
+  // combined index (e.g. llvm-lto). However, WPD is not supported/invoked for
+  // the legacy LTO API anyway.
+ assert(!Res->SingleImplName.empty());
+
+ return true;
+}
+
+void DevirtModule::tryICallBranchFunnel(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
+ Triple T(M.getTargetTriple());
+ if (T.getArch() != Triple::x86_64)
+ return;
+
+ if (TargetsForSlot.size() > ClThreshold)
+ return;
+
+ bool HasNonDevirt = !SlotInfo.CSInfo.AllCallSitesDevirted;
+ if (!HasNonDevirt)
+ for (auto &P : SlotInfo.ConstCSInfo)
+ if (!P.second.AllCallSitesDevirted) {
+ HasNonDevirt = true;
+ break;
+ }
+
+ if (!HasNonDevirt)
+ return;
+
+ FunctionType *FT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), {Int8PtrTy}, true);
+ Function *JT;
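+  // For an MDString type id the funnel gets a deterministic, hidden external
+  // name via getGlobalName so that importing modules can reference it;
+  // otherwise it stays internal to this module.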
+ if (isa<MDString>(Slot.TypeID)) {
+ JT = Function::Create(FT, Function::ExternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ getGlobalName(Slot, {}, "branch_funnel"), &M);
+ JT->setVisibility(GlobalValue::HiddenVisibility);
+ } else {
+ JT = Function::Create(FT, Function::InternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ "branch_funnel", &M);
+ }
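+  // The funnel's first parameter carries the vtable address in the nest
+  // register; see applyICallBranchFunnel below.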
+ JT->addAttribute(1, Attribute::Nest);
+
+ std::vector<Value *> JTArgs;
+ JTArgs.push_back(JT->arg_begin());
+ for (auto &T : TargetsForSlot) {
+ JTArgs.push_back(getMemberAddr(T.TM));
+ JTArgs.push_back(T.Fn);
+ }
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr);
+ Function *Intr =
+ Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {});
+
+ auto *CI = CallInst::Create(Intr, JTArgs, "", BB);
+ CI->setTailCallKind(CallInst::TCK_MustTail);
+ ReturnInst::Create(M.getContext(), nullptr, BB);
+
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ if (IsExported)
+ Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
+}
+
+void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
+ Constant *JT, bool &IsExported) {
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ if (CSInfo.isExported())
+ IsExported = true;
+ if (CSInfo.AllCallSitesDevirted)
+ return;
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ CallBase &CB = VCallSite.CB;
+
+ // Jump tables are only profitable if the retpoline mitigation is enabled.
+ Attribute FSAttr = CB.getCaller()->getFnAttribute("target-features");
if (!FSAttr.isValid() ||
- !FSAttr.getValueAsString().contains("+retpoline"))
- continue;
-
- if (RemarksEnabled)
- VCallSite.emitRemark("branch-funnel",
- JT->stripPointerCasts()->getName(), OREGetter);
-
- // Pass the address of the vtable in the nest register, which is r10 on
- // x86_64.
- std::vector<Type *> NewArgs;
- NewArgs.push_back(Int8PtrTy);
+ !FSAttr.getValueAsString().contains("+retpoline"))
+ continue;
+
+ if (RemarksEnabled)
+ VCallSite.emitRemark("branch-funnel",
+ JT->stripPointerCasts()->getName(), OREGetter);
+
+ // Pass the address of the vtable in the nest register, which is r10 on
+ // x86_64.
+ std::vector<Type *> NewArgs;
+ NewArgs.push_back(Int8PtrTy);
append_range(NewArgs, CB.getFunctionType()->params());
- FunctionType *NewFT =
- FunctionType::get(CB.getFunctionType()->getReturnType(), NewArgs,
- CB.getFunctionType()->isVarArg());
- PointerType *NewFTPtr = PointerType::getUnqual(NewFT);
-
- IRBuilder<> IRB(&CB);
- std::vector<Value *> Args;
- Args.push_back(IRB.CreateBitCast(VCallSite.VTable, Int8PtrTy));
+ FunctionType *NewFT =
+ FunctionType::get(CB.getFunctionType()->getReturnType(), NewArgs,
+ CB.getFunctionType()->isVarArg());
+ PointerType *NewFTPtr = PointerType::getUnqual(NewFT);
+
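+      // Rebuild the call to go through the funnel: the vtable pointer is
+      // prepended to the arguments and the funnel is bitcast to the widened
+      // function type.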
+ IRBuilder<> IRB(&CB);
+ std::vector<Value *> Args;
+ Args.push_back(IRB.CreateBitCast(VCallSite.VTable, Int8PtrTy));
llvm::append_range(Args, CB.args());
-
- CallBase *NewCS = nullptr;
- if (isa<CallInst>(CB))
- NewCS = IRB.CreateCall(NewFT, IRB.CreateBitCast(JT, NewFTPtr), Args);
- else
- NewCS = IRB.CreateInvoke(NewFT, IRB.CreateBitCast(JT, NewFTPtr),
- cast<InvokeInst>(CB).getNormalDest(),
- cast<InvokeInst>(CB).getUnwindDest(), Args);
- NewCS->setCallingConv(CB.getCallingConv());
-
- AttributeList Attrs = CB.getAttributes();
- std::vector<AttributeSet> NewArgAttrs;
- NewArgAttrs.push_back(AttributeSet::get(
- M.getContext(), ArrayRef<Attribute>{Attribute::get(
- M.getContext(), Attribute::Nest)}));
- for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
- NewArgAttrs.push_back(Attrs.getParamAttributes(I));
- NewCS->setAttributes(
- AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), NewArgAttrs));
-
- CB.replaceAllUsesWith(NewCS);
- CB.eraseFromParent();
-
- // This use is no longer unsafe.
- if (VCallSite.NumUnsafeUses)
- --*VCallSite.NumUnsafeUses;
- }
- // Don't mark as devirtualized because there may be callers compiled without
- // retpoline mitigation, which would mean that they are lowered to
- // llvm.type.test and therefore require an llvm.type.test resolution for the
- // type identifier.
- };
- Apply(SlotInfo.CSInfo);
- for (auto &P : SlotInfo.ConstCSInfo)
- Apply(P.second);
-}
-
-bool DevirtModule::tryEvaluateFunctionsWithArgs(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<uint64_t> Args) {
- // Evaluate each function and store the result in each target's RetVal
- // field.
- for (VirtualCallTarget &Target : TargetsForSlot) {
- if (Target.Fn->arg_size() != Args.size() + 1)
- return false;
-
- Evaluator Eval(M.getDataLayout(), nullptr);
- SmallVector<Constant *, 2> EvalArgs;
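-    // Pass a null 'this' pointer; the caller has already verified that none
-    // of the targets use their first argument.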
- EvalArgs.push_back(
- Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
- for (unsigned I = 0; I != Args.size(); ++I) {
- auto *ArgTy = dyn_cast<IntegerType>(
- Target.Fn->getFunctionType()->getParamType(I + 1));
- if (!ArgTy)
- return false;
- EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
- }
-
- Constant *RetVal;
- if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
- !isa<ConstantInt>(RetVal))
- return false;
- Target.RetVal = cast<ConstantInt>(RetVal)->getZExtValue();
- }
- return true;
-}
-
-void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
- uint64_t TheRetVal) {
- for (auto Call : CSInfo.CallSites)
- Call.replaceAndErase(
- "uniform-ret-val", FnName, RemarksEnabled, OREGetter,
- ConstantInt::get(cast<IntegerType>(Call.CB.getType()), TheRetVal));
- CSInfo.markDevirt();
-}
-
-bool DevirtModule::tryUniformRetValOpt(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
- WholeProgramDevirtResolution::ByArg *Res) {
- // Uniform return value optimization. If all functions return the same
- // constant, replace all calls with that constant.
- uint64_t TheRetVal = TargetsForSlot[0].RetVal;
- for (const VirtualCallTarget &Target : TargetsForSlot)
- if (Target.RetVal != TheRetVal)
- return false;
-
- if (CSInfo.isExported()) {
- Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
- Res->Info = TheRetVal;
- }
-
- applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
- if (RemarksEnabled)
- for (auto &&Target : TargetsForSlot)
- Target.WasDevirt = true;
- return true;
-}
-
-std::string DevirtModule::getGlobalName(VTableSlot Slot,
- ArrayRef<uint64_t> Args,
- StringRef Name) {
- std::string FullName = "__typeid_";
- raw_string_ostream OS(FullName);
- OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
- for (uint64_t Arg : Args)
- OS << '_' << Arg;
- OS << '_' << Name;
- return OS.str();
-}
-
-bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() {
- Triple T(M.getTargetTriple());
- return T.isX86() && T.getObjectFormat() == Triple::ELF;
-}
-
-void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, Constant *C) {
- GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
- getGlobalName(Slot, Args, Name), C, &M);
- GA->setVisibility(GlobalValue::HiddenVisibility);
-}
-
-void DevirtModule::exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, uint32_t Const,
- uint32_t &Storage) {
- if (shouldExportConstantsAsAbsoluteSymbols()) {
- exportGlobal(
- Slot, Args, Name,
- ConstantExpr::getIntToPtr(ConstantInt::get(Int32Ty, Const), Int8PtrTy));
- return;
- }
-
- Storage = Const;
-}
-
-Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name) {
- Constant *C =
- M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Arr0Ty);
- auto *GV = dyn_cast<GlobalVariable>(C);
- if (GV)
- GV->setVisibility(GlobalValue::HiddenVisibility);
- return C;
-}
-
-Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, IntegerType *IntTy,
- uint32_t Storage) {
- if (!shouldExportConstantsAsAbsoluteSymbols())
- return ConstantInt::get(IntTy, Storage);
-
- Constant *C = importGlobal(Slot, Args, Name);
- auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
- C = ConstantExpr::getPtrToInt(C, IntTy);
-
- // We only need to set metadata if the global is newly created, in which
- // case it would not have hidden visibility.
- if (GV->hasMetadata(LLVMContext::MD_absolute_symbol))
- return C;
-
- auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
- auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
- auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
- GV->setMetadata(LLVMContext::MD_absolute_symbol,
- MDNode::get(M.getContext(), {MinC, MaxC}));
- };
- unsigned AbsWidth = IntTy->getBitWidth();
- if (AbsWidth == IntPtrTy->getBitWidth())
- SetAbsRange(~0ull, ~0ull); // Full set.
- else
- SetAbsRange(0, 1ull << AbsWidth);
- return C;
-}
-
-void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
- bool IsOne,
- Constant *UniqueMemberAddr) {
- for (auto &&Call : CSInfo.CallSites) {
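-    // Replace the call with a comparison of the vtable pointer against the
-    // unique member address; the zext of that comparison reproduces the
-    // virtual function's return value.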
- IRBuilder<> B(&Call.CB);
- Value *Cmp =
- B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, Call.VTable,
- B.CreateBitCast(UniqueMemberAddr, Call.VTable->getType()));
- Cmp = B.CreateZExt(Cmp, Call.CB.getType());
- Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter,
- Cmp);
- }
- CSInfo.markDevirt();
-}
-
-Constant *DevirtModule::getMemberAddr(const TypeMemberInfo *M) {
- Constant *C = ConstantExpr::getBitCast(M->Bits->GV, Int8PtrTy);
- return ConstantExpr::getGetElementPtr(Int8Ty, C,
- ConstantInt::get(Int64Ty, M->Offset));
-}
-
-bool DevirtModule::tryUniqueRetValOpt(
- unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
- VTableSlot Slot, ArrayRef<uint64_t> Args) {
- // IsOne controls whether we look for a 0 or a 1.
- auto tryUniqueRetValOptFor = [&](bool IsOne) {
- const TypeMemberInfo *UniqueMember = nullptr;
- for (const VirtualCallTarget &Target : TargetsForSlot) {
- if (Target.RetVal == (IsOne ? 1 : 0)) {
- if (UniqueMember)
- return false;
- UniqueMember = Target.TM;
- }
- }
-
- // We should have found a unique member or bailed out by now. We already
- // checked for a uniform return value in tryUniformRetValOpt.
- assert(UniqueMember);
-
- Constant *UniqueMemberAddr = getMemberAddr(UniqueMember);
- if (CSInfo.isExported()) {
- Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
- Res->Info = IsOne;
-
- exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
- }
-
- // Replace each call with the comparison.
- applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
- UniqueMemberAddr);
-
- // Update devirtualization statistics for targets.
- if (RemarksEnabled)
- for (auto &&Target : TargetsForSlot)
- Target.WasDevirt = true;
-
- return true;
- };
-
- if (BitWidth == 1) {
- if (tryUniqueRetValOptFor(true))
- return true;
- if (tryUniqueRetValOptFor(false))
- return true;
- }
- return false;
-}
-
-void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
- Constant *Byte, Constant *Bit) {
- for (auto Call : CSInfo.CallSites) {
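-    // Replace the call with a load of the precomputed value stored at
-    // vtable + Byte; i1 return values are instead tested against the Bit
-    // mask.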
- auto *RetType = cast<IntegerType>(Call.CB.getType());
- IRBuilder<> B(&Call.CB);
- Value *Addr =
- B.CreateGEP(Int8Ty, B.CreateBitCast(Call.VTable, Int8PtrTy), Byte);
- if (RetType->getBitWidth() == 1) {
- Value *Bits = B.CreateLoad(Int8Ty, Addr);
- Value *BitsAndBit = B.CreateAnd(Bits, Bit);
- auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
- Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
- OREGetter, IsBitSet);
- } else {
- Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
- Value *Val = B.CreateLoad(RetType, ValAddr);
- Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled,
- OREGetter, Val);
- }
- }
- CSInfo.markDevirt();
-}
-
-bool DevirtModule::tryVirtualConstProp(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot) {
- // This only works if the function returns an integer.
- auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
- if (!RetType)
- return false;
- unsigned BitWidth = RetType->getBitWidth();
- if (BitWidth > 64)
- return false;
-
- // Make sure that each function is defined, does not access memory, takes at
- // least one argument, does not use its first argument (which we assume is
- // 'this'), and has the same return type.
- //
- // Note that we test whether this copy of the function is readnone, rather
- // than testing function attributes, which must hold for any copy of the
- // function, even a less optimized version substituted at link time. This is
- // sound because the virtual constant propagation optimizations effectively
- // inline all implementations of the virtual function into each call site,
- // rather than using function attributes to perform local optimization.
- for (VirtualCallTarget &Target : TargetsForSlot) {
- if (Target.Fn->isDeclaration() ||
- computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
- MAK_ReadNone ||
- Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
- Target.Fn->getReturnType() != RetType)
- return false;
- }
-
- for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
- if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
- continue;
-
- WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
- if (Res)
- ResByArg = &Res->ResByArg[CSByConstantArg.first];
-
- if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
- continue;
-
- if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
- ResByArg, Slot, CSByConstantArg.first))
- continue;
-
- // Find an allocation offset in bits in all vtables associated with the
- // type.
- uint64_t AllocBefore =
- findLowestOffset(TargetsForSlot, /*IsAfter=*/false, BitWidth);
- uint64_t AllocAfter =
- findLowestOffset(TargetsForSlot, /*IsAfter=*/true, BitWidth);
-
- // Calculate the total amount of padding needed to store a value at both
- // ends of the object.
- uint64_t TotalPaddingBefore = 0, TotalPaddingAfter = 0;
- for (auto &&Target : TargetsForSlot) {
- TotalPaddingBefore += std::max<int64_t>(
- (AllocBefore + 7) / 8 - Target.allocatedBeforeBytes() - 1, 0);
- TotalPaddingAfter += std::max<int64_t>(
- (AllocAfter + 7) / 8 - Target.allocatedAfterBytes() - 1, 0);
- }
-
- // If the amount of padding is too large, give up.
- // FIXME: do something smarter here.
- if (std::min(TotalPaddingBefore, TotalPaddingAfter) > 128)
- continue;
-
- // Calculate the offset to the value as a (possibly negative) byte offset
- // and (if applicable) a bit offset, and store the values in the targets.
- int64_t OffsetByte;
- uint64_t OffsetBit;
- if (TotalPaddingBefore <= TotalPaddingAfter)
- setBeforeReturnValues(TargetsForSlot, AllocBefore, BitWidth, OffsetByte,
- OffsetBit);
- else
- setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte,
- OffsetBit);
-
- if (RemarksEnabled)
- for (auto &&Target : TargetsForSlot)
- Target.WasDevirt = true;
-
- if (CSByConstantArg.second.isExported()) {
- ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
- exportConstant(Slot, CSByConstantArg.first, "byte", OffsetByte,
- ResByArg->Byte);
- exportConstant(Slot, CSByConstantArg.first, "bit", 1ULL << OffsetBit,
- ResByArg->Bit);
- }
-
- // Rewrite each call to a load from OffsetByte/OffsetBit.
- Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
- Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
- applyVirtualConstProp(CSByConstantArg.second,
- TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
- }
- return true;
-}
-
-void DevirtModule::rebuildGlobal(VTableBits &B) {
- if (B.Before.Bytes.empty() && B.After.Bytes.empty())
- return;
-
- // Align the before byte array to the global's minimum alignment so that we
- // don't break any alignment requirements on the global.
- Align Alignment = M.getDataLayout().getValueOrABITypeAlignment(
- B.GV->getAlign(), B.GV->getValueType());
- B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment));
-
- // Before was stored in reverse order; flip it now.
- for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I)
- std::swap(B.Before.Bytes[I], B.Before.Bytes[Size - 1 - I]);
-
- // Build an anonymous global containing the before bytes, followed by the
- // original initializer, followed by the after bytes.
- auto NewInit = ConstantStruct::getAnon(
- {ConstantDataArray::get(M.getContext(), B.Before.Bytes),
- B.GV->getInitializer(),
- ConstantDataArray::get(M.getContext(), B.After.Bytes)});
- auto NewGV =
- new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
- GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
- NewGV->setSection(B.GV->getSection());
- NewGV->setComdat(B.GV->getComdat());
- NewGV->setAlignment(MaybeAlign(B.GV->getAlignment()));
-
- // Copy the original vtable's metadata to the anonymous global, adjusting
- // offsets as required.
- NewGV->copyMetadata(B.GV, B.Before.Bytes.size());
-
- // Build an alias named after the original global, pointing at the second
- // element (the original initializer).
- auto Alias = GlobalAlias::create(
- B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
- ConstantExpr::getGetElementPtr(
- NewInit->getType(), NewGV,
- ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
- ConstantInt::get(Int32Ty, 1)}),
- &M);
- Alias->setVisibility(B.GV->getVisibility());
- Alias->takeName(B.GV);
-
- B.GV->replaceAllUsesWith(Alias);
- B.GV->eraseFromParent();
-}
-
-bool DevirtModule::areRemarksEnabled() {
- const auto &FL = M.getFunctionList();
- for (const Function &Fn : FL) {
- const auto &BBL = Fn.getBasicBlockList();
- if (BBL.empty())
- continue;
- auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
- return DI.isEnabled();
- }
- return false;
-}
-
-void DevirtModule::scanTypeTestUsers(
- Function *TypeTestFunc,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
- // Find all virtual calls via a virtual table pointer %p under an assumption
- // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
- // points to a member of the type identifier %md. Group calls by (type ID,
- // offset) pair (effectively the identity of the virtual function) and store
- // to CallSlots.
- for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
- I != E;) {
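-    // Grab the user and advance the use iterator now; the type test call may
-    // be erased below by RemoveTypeTestAssumes.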
- auto CI = dyn_cast<CallInst>(I->getUser());
- ++I;
- if (!CI)
- continue;
-
- // Search for virtual calls based on %p and add them to DevirtCalls.
- SmallVector<DevirtCallSite, 1> DevirtCalls;
- SmallVector<CallInst *, 1> Assumes;
- auto &DT = LookupDomTree(*CI->getFunction());
- findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
-
- Metadata *TypeId =
- cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
- // If we found any, add them to CallSlots.
- if (!Assumes.empty()) {
- Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
- for (DevirtCallSite Call : DevirtCalls)
- CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB, nullptr);
- }
-
- auto RemoveTypeTestAssumes = [&]() {
- // We no longer need the assumes or the type test.
- for (auto Assume : Assumes)
- Assume->eraseFromParent();
- // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
- // may use the vtable argument later.
- if (CI->use_empty())
- CI->eraseFromParent();
- };
-
- // At this point we could remove all type test assume sequences, as they
- // were originally inserted for WPD. However, we can keep these in the
- // code stream for later analysis (e.g. to help drive more efficient ICP
- // sequences). They will eventually be removed by a second LowerTypeTests
- // invocation that cleans them up. In order to do this correctly, the first
- // LowerTypeTests invocation needs to know that they have "Unknown" type
- // test resolution, so that they aren't treated as Unsat and lowered to
- // False, which will break any uses on assumes. Below we remove any type
- // test assumes that will not be treated as Unknown by LTT.
-
- // The type test assumes will be treated by LTT as Unsat if the type id is
- // not used on a global (in which case it has no entry in the TypeIdMap).
- if (!TypeIdMap.count(TypeId))
- RemoveTypeTestAssumes();
-
- // For ThinLTO importing, we need to remove the type test assumes if this is
- // an MDString type id without a corresponding TypeIdSummary. Any
- // non-MDString type ids are ignored and treated as Unknown by LTT, so their
- // type test assumes can be kept. If the MDString type id is missing a
- // TypeIdSummary (e.g. because there was no use on a vcall, preventing the
- // exporting phase of WPD from analyzing it), then it would be treated as
- // Unsat by LTT and we need to remove its type test assumes here. If not
- // used on a vcall we don't need them for later optimization use in any
- // case.
- else if (ImportSummary && isa<MDString>(TypeId)) {
- const TypeIdSummary *TidSummary =
- ImportSummary->getTypeIdSummary(cast<MDString>(TypeId)->getString());
- if (!TidSummary)
- RemoveTypeTestAssumes();
- else
- // If one was created it should not be Unsat, because if we reached here
- // the type id was used on a global.
- assert(TidSummary->TTRes.TheKind != TypeTestResolution::Unsat);
- }
- }
-}
-
-void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
- Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
-
- for (auto I = TypeCheckedLoadFunc->use_begin(),
- E = TypeCheckedLoadFunc->use_end();
- I != E;) {
- auto CI = dyn_cast<CallInst>(I->getUser());
- ++I;
- if (!CI)
- continue;
-
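-    // llvm.type.checked.load takes the vtable pointer, a byte offset and a
-    // type id, and produces the loaded function pointer together with an i1
-    // type test result.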
- Value *Ptr = CI->getArgOperand(0);
- Value *Offset = CI->getArgOperand(1);
- Value *TypeIdValue = CI->getArgOperand(2);
- Metadata *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
-
- SmallVector<DevirtCallSite, 1> DevirtCalls;
- SmallVector<Instruction *, 1> LoadedPtrs;
- SmallVector<Instruction *, 1> Preds;
- bool HasNonCallUses = false;
- auto &DT = LookupDomTree(*CI->getFunction());
- findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
- HasNonCallUses, CI, DT);
-
- // Start by generating "pessimistic" code that explicitly loads the function
- // pointer from the vtable and performs the type check. If possible, we will
- // eliminate the load and the type check later.
-
- // If possible, only generate the load at the point where it is used.
- // This helps avoid unnecessary spills.
- IRBuilder<> LoadB(
- (LoadedPtrs.size() == 1 && !HasNonCallUses) ? LoadedPtrs[0] : CI);
- Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
- Value *GEPPtr = LoadB.CreateBitCast(GEP, PointerType::getUnqual(Int8PtrTy));
- Value *LoadedValue = LoadB.CreateLoad(Int8PtrTy, GEPPtr);
-
- for (Instruction *LoadedPtr : LoadedPtrs) {
- LoadedPtr->replaceAllUsesWith(LoadedValue);
- LoadedPtr->eraseFromParent();
- }
-
- // Likewise for the type test.
- IRBuilder<> CallB((Preds.size() == 1 && !HasNonCallUses) ? Preds[0] : CI);
- CallInst *TypeTestCall = CallB.CreateCall(TypeTestFunc, {Ptr, TypeIdValue});
-
- for (Instruction *Pred : Preds) {
- Pred->replaceAllUsesWith(TypeTestCall);
- Pred->eraseFromParent();
- }
-
- // We have already erased any extractvalue instructions that refer to the
- // intrinsic call, but the intrinsic may have other non-extractvalue uses
- // (although this is unlikely). In that case, explicitly build a pair and
- // RAUW it.
- if (!CI->use_empty()) {
- Value *Pair = UndefValue::get(CI->getType());
- IRBuilder<> B(CI);
- Pair = B.CreateInsertValue(Pair, LoadedValue, {0});
- Pair = B.CreateInsertValue(Pair, TypeTestCall, {1});
- CI->replaceAllUsesWith(Pair);
- }
-
- // The number of unsafe uses is initially the number of uses.
- auto &NumUnsafeUses = NumUnsafeUsesForTypeTest[TypeTestCall];
- NumUnsafeUses = DevirtCalls.size();
-
- // If the function pointer has a non-call user, we cannot eliminate the type
- // check, as one of those users may eventually call the pointer. Increment
- // the unsafe use count to make sure it cannot reach zero.
- if (HasNonCallUses)
- ++NumUnsafeUses;
- for (DevirtCallSite Call : DevirtCalls) {
- CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB,
- &NumUnsafeUses);
- }
-
- CI->eraseFromParent();
- }
-}
-
-void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
- auto *TypeId = dyn_cast<MDString>(Slot.TypeID);
- if (!TypeId)
- return;
- const TypeIdSummary *TidSummary =
- ImportSummary->getTypeIdSummary(TypeId->getString());
- if (!TidSummary)
- return;
- auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
- if (ResI == TidSummary->WPDRes.end())
- return;
- const WholeProgramDevirtResolution &Res = ResI->second;
-
- if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
- assert(!Res.SingleImplName.empty());
- // The type of the function in the declaration is irrelevant because every
- // call site will cast it to the correct type.
- Constant *SingleImpl =
- cast<Constant>(M.getOrInsertFunction(Res.SingleImplName,
- Type::getVoidTy(M.getContext()))
- .getCallee());
-
- // This is the import phase so we should not be exporting anything.
- bool IsExported = false;
- applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
- assert(!IsExported);
- }
-
- for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
- auto I = Res.ResByArg.find(CSByConstantArg.first);
- if (I == Res.ResByArg.end())
- continue;
- auto &ResByArg = I->second;
- // FIXME: We should figure out what to do about the "function name" argument
- // to the apply* functions, as the function names are unavailable during the
- // importing phase. For now we just pass the empty string. This does not
- // impact correctness because the function names are just used for remarks.
- switch (ResByArg.TheKind) {
- case WholeProgramDevirtResolution::ByArg::UniformRetVal:
- applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
- break;
- case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
- Constant *UniqueMemberAddr =
- importGlobal(Slot, CSByConstantArg.first, "unique_member");
- applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
- UniqueMemberAddr);
- break;
- }
- case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
- Constant *Byte = importConstant(Slot, CSByConstantArg.first, "byte",
- Int32Ty, ResByArg.Byte);
- Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,
- ResByArg.Bit);
- applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
- break;
- }
- default:
- break;
- }
- }
-
- if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
- // The type of the function is irrelevant, because it's bitcast at calls
- // anyhow.
- Constant *JT = cast<Constant>(
- M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
- Type::getVoidTy(M.getContext()))
- .getCallee());
- bool IsExported = false;
- applyICallBranchFunnel(SlotInfo, JT, IsExported);
- assert(!IsExported);
- }
-}
-
-void DevirtModule::removeRedundantTypeTests() {
- auto True = ConstantInt::getTrue(M.getContext());
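-  // A zero count means every virtual call guarded by this type test was
-  // devirtualized, so the test can be folded to true.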
- for (auto &&U : NumUnsafeUsesForTypeTest) {
- if (U.second == 0) {
- U.first->replaceAllUsesWith(True);
- U.first->eraseFromParent();
- }
- }
-}
-
-bool DevirtModule::run() {
- // If only some of the modules were split, we cannot correctly perform
-  // this transformation. We already checked for the presence of type tests
- // with partially split modules during the thin link, and would have emitted
- // an error if any were found, so here we can simply return.
- if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
- (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
- return false;
-
- Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
- Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
-
- // Normally if there are no users of the devirtualization intrinsics in the
- // module, this pass has nothing to do. But if we are exporting, we also need
- // to handle any users that appear only in the function summaries.
- if (!ExportSummary &&
- (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
- AssumeFunc->use_empty()) &&
- (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
- return false;
-
- // Rebuild type metadata into a map for easy lookup.
- std::vector<VTableBits> Bits;
- DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
- buildTypeIdentifierMap(Bits, TypeIdMap);
-
- if (TypeTestFunc && AssumeFunc)
- scanTypeTestUsers(TypeTestFunc, TypeIdMap);
-
- if (TypeCheckedLoadFunc)
- scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
-
- if (ImportSummary) {
- for (auto &S : CallSlots)
- importResolution(S.first, S.second);
-
- removeRedundantTypeTests();
-
-    // We have lowered or deleted the type intrinsics, so we will no
- // longer have enough information to reason about the liveness of virtual
- // function pointers in GlobalDCE.
- for (GlobalVariable &GV : M.globals())
- GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
-
- // The rest of the code is only necessary when exporting or during regular
- // LTO, so we are done.
- return true;
- }
-
- if (TypeIdMap.empty())
- return true;
-
- // Collect information from summary about which calls to try to devirtualize.
- if (ExportSummary) {
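-    // Summaries refer to type ids by GUID, so build a map back from GUID to
-    // the MDString type ids used in this module's metadata.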
- DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
- for (auto &P : TypeIdMap) {
- if (auto *TypeId = dyn_cast<MDString>(P.first))
- MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
- TypeId);
- }
-
- for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
- auto *FS = dyn_cast<FunctionSummary>(S.get());
- if (!FS)
- continue;
- // FIXME: Only add live functions.
- for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_test_assume_const_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
- CallSlots[{MD, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_checked_load_const_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
- CallSlots[{MD, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeCheckedLoadUser(FS);
- }
- }
- }
- }
- }
-
- // For each (type, offset) pair:
- bool DidVirtualConstProp = false;
- std::map<std::string, Function*> DevirtTargets;
- for (auto &S : CallSlots) {
- // Search each of the members of the type identifier for the virtual
- // function implementation at offset S.first.ByteOffset, and add to
- // TargetsForSlot.
- std::vector<VirtualCallTarget> TargetsForSlot;
- WholeProgramDevirtResolution *Res = nullptr;
- const std::set<TypeMemberInfo> &TypeMemberInfos = TypeIdMap[S.first.TypeID];
- if (ExportSummary && isa<MDString>(S.first.TypeID) &&
- TypeMemberInfos.size())
- // For any type id used on a global's type metadata, create the type id
- // summary resolution regardless of whether we can devirtualize, so that
- // lower type tests knows the type id is not Unsat. If it was not used on
- // a global's type metadata, the TypeIdMap entry set will be empty, and
- // we don't want to create an entry (with the default Unknown type
- // resolution), which can prevent detection of the Unsat.
- Res = &ExportSummary
- ->getOrInsertTypeIdSummary(
- cast<MDString>(S.first.TypeID)->getString())
- .WPDRes[S.first.ByteOffset];
- if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
- S.first.ByteOffset)) {
-
- if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
- DidVirtualConstProp |=
- tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
-
- tryICallBranchFunnel(TargetsForSlot, S.second, Res, S.first);
- }
-
- // Collect functions devirtualized at least for one call site for stats.
- if (RemarksEnabled)
- for (const auto &T : TargetsForSlot)
- if (T.WasDevirt)
- DevirtTargets[std::string(T.Fn->getName())] = T.Fn;
- }
-
- // CFI-specific: if we are exporting and any llvm.type.checked.load
- // intrinsics were *not* devirtualized, we need to add the resulting
- // llvm.type.test intrinsics to the function summaries so that the
- // LowerTypeTests pass will export them.
- if (ExportSummary && isa<MDString>(S.first.TypeID)) {
- auto GUID =
- GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
- for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
- FS->addTypeTest(GUID);
- for (auto &CCS : S.second.ConstCSInfo)
- for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
- FS->addTypeTest(GUID);
- }
- }
-
- if (RemarksEnabled) {
- // Generate remarks for each devirtualized function.
- for (const auto &DT : DevirtTargets) {
- Function *F = DT.second;
-
- using namespace ore;
- OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
- << "devirtualized "
- << NV("FunctionName", DT.first));
- }
- }
-
- removeRedundantTypeTests();
-
- // Rebuild each global we touched as part of virtual constant propagation to
- // include the before and after bytes.
- if (DidVirtualConstProp)
- for (VTableBits &B : Bits)
- rebuildGlobal(B);
-
- // We have lowered or deleted the type instrinsics, so we will no
- // longer have enough information to reason about the liveness of virtual
- // function pointers in GlobalDCE.
- for (GlobalVariable &GV : M.globals())
- GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
-
- return true;
-}
-
-void DevirtIndex::run() {
- if (ExportSummary.typeIdCompatibleVtableMap().empty())
- return;
-
- DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID;
- for (auto &P : ExportSummary.typeIdCompatibleVtableMap()) {
- NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first);
- }
-
- // Collect information from summary about which calls to try to devirtualize.
- for (auto &P : ExportSummary) {
- for (auto &S : P.second.SummaryList) {
- auto *FS = dyn_cast<FunctionSummary>(S.get());
- if (!FS)
- continue;
- // FIXME: Only add live functions.
- for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
- for (StringRef Name : NameByGUID[VF.GUID]) {
- CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
- for (StringRef Name : NameByGUID[VF.GUID]) {
- CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_test_assume_const_vcalls()) {
- for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
- CallSlots[{Name, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_checked_load_const_vcalls()) {
- for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
- CallSlots[{Name, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeCheckedLoadUser(FS);
- }
- }
- }
- }
-
- std::set<ValueInfo> DevirtTargets;
- // For each (type, offset) pair:
- for (auto &S : CallSlots) {
- // Search each of the members of the type identifier for the virtual
- // function implementation at offset S.first.ByteOffset, and add to
- // TargetsForSlot.
- std::vector<ValueInfo> TargetsForSlot;
- auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID);
- assert(TidSummary);
- // Create the type id summary resolution regardlness of whether we can
- // devirtualize, so that lower type tests knows the type id is used on
- // a global and not Unsat.
- WholeProgramDevirtResolution *Res =
- &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID)
- .WPDRes[S.first.ByteOffset];
- if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary,
- S.first.ByteOffset)) {
-
- if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res,
- DevirtTargets))
- continue;
- }
- }
-
- // Optionally have the thin link print message for each devirtualized
- // function.
- if (PrintSummaryDevirt)
- for (const auto &DT : DevirtTargets)
- errs() << "Devirtualized call to " << DT << "\n";
-}
+
+ CallBase *NewCS = nullptr;
+ if (isa<CallInst>(CB))
+ NewCS = IRB.CreateCall(NewFT, IRB.CreateBitCast(JT, NewFTPtr), Args);
+ else
+ NewCS = IRB.CreateInvoke(NewFT, IRB.CreateBitCast(JT, NewFTPtr),
+ cast<InvokeInst>(CB).getNormalDest(),
+ cast<InvokeInst>(CB).getUnwindDest(), Args);
+ NewCS->setCallingConv(CB.getCallingConv());
+
+ AttributeList Attrs = CB.getAttributes();
+ std::vector<AttributeSet> NewArgAttrs;
+ NewArgAttrs.push_back(AttributeSet::get(
+ M.getContext(), ArrayRef<Attribute>{Attribute::get(
+ M.getContext(), Attribute::Nest)}));
+ for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
+ NewArgAttrs.push_back(Attrs.getParamAttributes(I));
+ NewCS->setAttributes(
+ AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs));
+
+ CB.replaceAllUsesWith(NewCS);
+ CB.eraseFromParent();
+
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ // Don't mark as devirtualized because there may be callers compiled without
+ // retpoline mitigation, which would mean that they are lowered to
+ // llvm.type.test and therefore require an llvm.type.test resolution for the
+ // type identifier.
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
+
+bool DevirtModule::tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<uint64_t> Args) {
+ // Evaluate each function and store the result in each target's RetVal
+ // field.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.Fn->arg_size() != Args.size() + 1)
+ return false;
+
+ Evaluator Eval(M.getDataLayout(), nullptr);
+ SmallVector<Constant *, 2> EvalArgs;
+ EvalArgs.push_back(
+ Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
+ for (unsigned I = 0; I != Args.size(); ++I) {
+ auto *ArgTy = dyn_cast<IntegerType>(
+ Target.Fn->getFunctionType()->getParamType(I + 1));
+ if (!ArgTy)
+ return false;
+ EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
+ }
+
+ Constant *RetVal;
+ if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
+ !isa<ConstantInt>(RetVal))
+ return false;
+ Target.RetVal = cast<ConstantInt>(RetVal)->getZExtValue();
+ }
+ return true;
+}
+
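+// Replace every call site in CSInfo with the constant return value common to
+// all targets, then mark the call sites as devirtualized.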
+void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal) {
+ for (auto Call : CSInfo.CallSites)
+ Call.replaceAndErase(
+ "uniform-ret-val", FnName, RemarksEnabled, OREGetter,
+ ConstantInt::get(cast<IntegerType>(Call.CB.getType()), TheRetVal));
+ CSInfo.markDevirt();
+}
+
+bool DevirtModule::tryUniformRetValOpt(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res) {
+ // Uniform return value optimization. If all functions return the same
+ // constant, replace all calls with that constant.
+ uint64_t TheRetVal = TargetsForSlot[0].RetVal;
+ for (const VirtualCallTarget &Target : TargetsForSlot)
+ if (Target.RetVal != TheRetVal)
+ return false;
+
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
+ Res->Info = TheRetVal;
+ }
+
+ applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+ return true;
+}
+
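+// Build the name of the symbol used to export or import a resolution for this
+// slot: "__typeid_" followed by the type id, the byte offset, any constant
+// call arguments, and a resolution-specific suffix.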
+std::string DevirtModule::getGlobalName(VTableSlot Slot,
+ ArrayRef<uint64_t> Args,
+ StringRef Name) {
+ std::string FullName = "__typeid_";
+ raw_string_ostream OS(FullName);
+ OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
+ for (uint64_t Arg : Args)
+ OS << '_' << Arg;
+ OS << '_' << Name;
+ return OS.str();
+}
+
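+// Whether to export constants as absolute symbols; this is only done for
+// x86 ELF targets.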
+bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() {
+ Triple T(M.getTargetTriple());
+ return T.isX86() && T.getObjectFormat() == Triple::ELF;
+}
+
+void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, Constant *C) {
+ GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ getGlobalName(Slot, Args, Name), C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+}
+
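+// Export a constant either as an absolute symbol or, when absolute symbols
+// are not used, by recording it in the summary field referenced by Storage.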
+void DevirtModule::exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, uint32_t Const,
+ uint32_t &Storage) {
+ if (shouldExportConstantsAsAbsoluteSymbols()) {
+ exportGlobal(
+ Slot, Args, Name,
+ ConstantExpr::getIntToPtr(ConstantInt::get(Int32Ty, Const), Int8PtrTy));
+ return;
+ }
+
+ Storage = Const;
+}
+
+Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name) {
+ Constant *C =
+ M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Arr0Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ if (GV)
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ return C;
+}
+
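+// Import a constant written by exportConstant: either read it back from the
+// summary storage or reference the exported symbol, attaching
+// !absolute_symbol metadata describing its value range.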
+Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, IntegerType *IntTy,
+ uint32_t Storage) {
+ if (!shouldExportConstantsAsAbsoluteSymbols())
+ return ConstantInt::get(IntTy, Storage);
+
+ Constant *C = importGlobal(Slot, Args, Name);
+ auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
+ C = ConstantExpr::getPtrToInt(C, IntTy);
+
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (GV->hasMetadata(LLVMContext::MD_absolute_symbol))
+ return C;
+
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ unsigned AbsWidth = IntTy->getBitWidth();
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else
+ SetAbsRange(0, 1ull << AbsWidth);
+ return C;
+}
+
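+// Replace each call with a comparison of its vtable pointer against the
+// address of the unique member that returns the sought value, zero-extended
+// to the call's return type.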
+void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ bool IsOne,
+ Constant *UniqueMemberAddr) {
+ for (auto &&Call : CSInfo.CallSites) {
+ IRBuilder<> B(&Call.CB);
+ Value *Cmp =
+ B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, Call.VTable,
+ B.CreateBitCast(UniqueMemberAddr, Call.VTable->getType()));
+ Cmp = B.CreateZExt(Cmp, Call.CB.getType());
+ Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter,
+ Cmp);
+ }
+ CSInfo.markDevirt();
+}
+
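+// Compute the address of a type member as a byte offset into its vtable
+// global.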
+Constant *DevirtModule::getMemberAddr(const TypeMemberInfo *M) {
+ Constant *C = ConstantExpr::getBitCast(M->Bits->GV, Int8PtrTy);
+ return ConstantExpr::getGetElementPtr(Int8Ty, C,
+ ConstantInt::get(Int64Ty, M->Offset));
+}
+
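+// Unique return value optimization: for i1 return types, if exactly one
+// target returns 1 (or 0), each call can be replaced with a comparison of the
+// vtable pointer against that member's address.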
+bool DevirtModule::tryUniqueRetValOpt(
+ unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args) {
+ // IsOne controls whether we look for a 0 or a 1.
+ auto tryUniqueRetValOptFor = [&](bool IsOne) {
+ const TypeMemberInfo *UniqueMember = nullptr;
+ for (const VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.RetVal == (IsOne ? 1 : 0)) {
+ if (UniqueMember)
+ return false;
+ UniqueMember = Target.TM;
+ }
+ }
+
+ // We should have found a unique member or bailed out by now. We already
+ // checked for a uniform return value in tryUniformRetValOpt.
+ assert(UniqueMember);
+
+ Constant *UniqueMemberAddr = getMemberAddr(UniqueMember);
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
+ Res->Info = IsOne;
+
+ exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
+ }
+
+ // Replace each call with the comparison.
+ applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
+ UniqueMemberAddr);
+
+ // Update devirtualization statistics for targets.
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
+ return true;
+ };
+
+ if (BitWidth == 1) {
+ if (tryUniqueRetValOptFor(true))
+ return true;
+ if (tryUniqueRetValOptFor(false))
+ return true;
+ }
+ return false;
+}
+
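+// Rewrite each call as a load of the propagated constant stored alongside the
+// vtable: i1 returns become a masked byte load, wider integers a load of the
+// full return type at the given byte offset.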
+void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit) {
+ for (auto Call : CSInfo.CallSites) {
+ auto *RetType = cast<IntegerType>(Call.CB.getType());
+ IRBuilder<> B(&Call.CB);
+ Value *Addr =
+ B.CreateGEP(Int8Ty, B.CreateBitCast(Call.VTable, Int8PtrTy), Byte);
+ if (RetType->getBitWidth() == 1) {
+ Value *Bits = B.CreateLoad(Int8Ty, Addr);
+ Value *BitsAndBit = B.CreateAnd(Bits, Bit);
+ auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
+ Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
+ OREGetter, IsBitSet);
+ } else {
+ Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
+ Value *Val = B.CreateLoad(RetType, ValAddr);
+ Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled,
+ OREGetter, Val);
+ }
+ }
+ CSInfo.markDevirt();
+}
+
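+// Virtual constant propagation: when each target can be constant-evaluated
+// for a given set of constant call arguments, fold the result into the call
+// sites, either directly (uniform/unique return values) or by storing the
+// per-vtable results next to each vtable and loading them at the calls.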
+bool DevirtModule::tryVirtualConstProp(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
+ // This only works if the function returns an integer.
+ auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
+ if (!RetType)
+ return false;
+ unsigned BitWidth = RetType->getBitWidth();
+ if (BitWidth > 64)
+ return false;
+
+ // Make sure that each function is defined, does not access memory, takes at
+ // least one argument, does not use its first argument (which we assume is
+ // 'this'), and has the same return type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.Fn->isDeclaration() ||
+ computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
+ MAK_ReadNone ||
+ Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
+ Target.Fn->getReturnType() != RetType)
+ return false;
+ }
+
+ for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
+ if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
+ continue;
+
+ WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
+ if (Res)
+ ResByArg = &Res->ResByArg[CSByConstantArg.first];
+
+ if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
+ continue;
+
+ if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
+ ResByArg, Slot, CSByConstantArg.first))
+ continue;
+
+ // Find an allocation offset in bits in all vtables associated with the
+ // type.
+ uint64_t AllocBefore =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/false, BitWidth);
+ uint64_t AllocAfter =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/true, BitWidth);
+
+ // Calculate the total amount of padding needed to store a value at both
+ // ends of the object.
+ uint64_t TotalPaddingBefore = 0, TotalPaddingAfter = 0;
+ for (auto &&Target : TargetsForSlot) {
+ TotalPaddingBefore += std::max<int64_t>(
+ (AllocBefore + 7) / 8 - Target.allocatedBeforeBytes() - 1, 0);
+ TotalPaddingAfter += std::max<int64_t>(
+ (AllocAfter + 7) / 8 - Target.allocatedAfterBytes() - 1, 0);
+ }
+
+ // If the amount of padding is too large, give up.
+ // FIXME: do something smarter here.
+ if (std::min(TotalPaddingBefore, TotalPaddingAfter) > 128)
+ continue;
+
+ // Calculate the offset to the value as a (possibly negative) byte offset
+ // and (if applicable) a bit offset, and store the values in the targets.
+ int64_t OffsetByte;
+ uint64_t OffsetBit;
+ if (TotalPaddingBefore <= TotalPaddingAfter)
+ setBeforeReturnValues(TargetsForSlot, AllocBefore, BitWidth, OffsetByte,
+ OffsetBit);
+ else
+ setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte,
+ OffsetBit);
+
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
+ if (CSByConstantArg.second.isExported()) {
+ ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
+ exportConstant(Slot, CSByConstantArg.first, "byte", OffsetByte,
+ ResByArg->Byte);
+ exportConstant(Slot, CSByConstantArg.first, "bit", 1ULL << OffsetBit,
+ ResByArg->Bit);
+ }
+
+ // Rewrite each call to a load from OffsetByte/OffsetBit.
+ Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
+ Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
+ applyVirtualConstProp(CSByConstantArg.second,
+ TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
+ }
+ return true;
+}
+
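+// Rebuild a vtable global whose Before/After byte arrays were populated by
+// virtual constant propagation: create a new private global containing the
+// extra bytes around the original initializer, and alias the original name to
+// the initializer within it.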
+void DevirtModule::rebuildGlobal(VTableBits &B) {
+ if (B.Before.Bytes.empty() && B.After.Bytes.empty())
+ return;
+
+ // Align the before byte array to the global's minimum alignment so that we
+ // don't break any alignment requirements on the global.
+ Align Alignment = M.getDataLayout().getValueOrABITypeAlignment(
+ B.GV->getAlign(), B.GV->getValueType());
+ B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment));
+
+ // Before was stored in reverse order; flip it now.
+ for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I)
+ std::swap(B.Before.Bytes[I], B.Before.Bytes[Size - 1 - I]);
+
+ // Build an anonymous global containing the before bytes, followed by the
+ // original initializer, followed by the after bytes.
+ auto NewInit = ConstantStruct::getAnon(
+ {ConstantDataArray::get(M.getContext(), B.Before.Bytes),
+ B.GV->getInitializer(),
+ ConstantDataArray::get(M.getContext(), B.After.Bytes)});
+ auto NewGV =
+ new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
+ GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
+ NewGV->setSection(B.GV->getSection());
+ NewGV->setComdat(B.GV->getComdat());
+ NewGV->setAlignment(MaybeAlign(B.GV->getAlignment()));
+
+ // Copy the original vtable's metadata to the anonymous global, adjusting
+ // offsets as required.
+ NewGV->copyMetadata(B.GV, B.Before.Bytes.size());
+
+ // Build an alias named after the original global, pointing at the second
+ // element (the original initializer).
+ auto Alias = GlobalAlias::create(
+ B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
+ ConstantExpr::getGetElementPtr(
+ NewInit->getType(), NewGV,
+ ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, 1)}),
+ &M);
+ Alias->setVisibility(B.GV->getVisibility());
+ Alias->takeName(B.GV);
+
+ B.GV->replaceAllUsesWith(Alias);
+ B.GV->eraseFromParent();
+}
+
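+// Determine whether optimization remarks are enabled by constructing a remark
+// against the first function in the module that has a body.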
+bool DevirtModule::areRemarksEnabled() {
+ const auto &FL = M.getFunctionList();
+ for (const Function &Fn : FL) {
+ const auto &BBL = Fn.getBasicBlockList();
+ if (BBL.empty())
+ continue;
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
+ return DI.isEnabled();
+ }
+ return false;
+}
+
+void DevirtModule::scanTypeTestUsers(
+ Function *TypeTestFunc,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
+ // Find all virtual calls via a virtual table pointer %p under an assumption
+ // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
+ // points to a member of the type identifier %md. Group calls by (type ID,
+ // offset) pair (effectively the identity of the virtual function) and store
+ // to CallSlots.
+ for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ // Search for virtual calls based on %p and add them to DevirtCalls.
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<CallInst *, 1> Assumes;
+ auto &DT = LookupDomTree(*CI->getFunction());
+ findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
+
+ Metadata *TypeId =
+ cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
+ // If we found any, add them to CallSlots.
+ if (!Assumes.empty()) {
+ Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
+ for (DevirtCallSite Call : DevirtCalls)
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB, nullptr);
+ }
+
+ auto RemoveTypeTestAssumes = [&]() {
+ // We no longer need the assumes or the type test.
+ for (auto Assume : Assumes)
+ Assume->eraseFromParent();
+ // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
+ // may use the vtable argument later.
+ if (CI->use_empty())
+ CI->eraseFromParent();
+ };
+
+ // At this point we could remove all type test assume sequences, as they
+ // were originally inserted for WPD. However, we can keep these in the
+ // code stream for later analysis (e.g. to help drive more efficient ICP
+ // sequences). They will eventually be removed by a second LowerTypeTests
+ // invocation that cleans them up. In order to do this correctly, the first
+ // LowerTypeTests invocation needs to know that they have "Unknown" type
+ // test resolution, so that they aren't treated as Unsat and lowered to
+ // False, which will break any uses on assumes. Below we remove any type
+    // False, which would break any uses of the assumes. Below we remove any
+    // type test assumes that will not be treated as Unknown by LTT.
+ // The type test assumes will be treated by LTT as Unsat if the type id is
+ // not used on a global (in which case it has no entry in the TypeIdMap).
+ if (!TypeIdMap.count(TypeId))
+ RemoveTypeTestAssumes();
+
+ // For ThinLTO importing, we need to remove the type test assumes if this is
+ // an MDString type id without a corresponding TypeIdSummary. Any
+ // non-MDString type ids are ignored and treated as Unknown by LTT, so their
+ // type test assumes can be kept. If the MDString type id is missing a
+ // TypeIdSummary (e.g. because there was no use on a vcall, preventing the
+ // exporting phase of WPD from analyzing it), then it would be treated as
+ // Unsat by LTT and we need to remove its type test assumes here. If not
+ // used on a vcall we don't need them for later optimization use in any
+ // case.
+ else if (ImportSummary && isa<MDString>(TypeId)) {
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(cast<MDString>(TypeId)->getString());
+ if (!TidSummary)
+ RemoveTypeTestAssumes();
+ else
+ // If one was created it should not be Unsat, because if we reached here
+ // the type id was used on a global.
+ assert(TidSummary->TTRes.TheKind != TypeTestResolution::Unsat);
+ }
+ }
+}
+
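+// Lower each llvm.type.checked.load to an explicit vtable load plus an
+// llvm.type.test, and record the devirtualizable call sites in CallSlots,
+// tracking the number of unsafe (non-devirtualized) uses of each type test.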
+void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
+ Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
+
+ for (auto I = TypeCheckedLoadFunc->use_begin(),
+ E = TypeCheckedLoadFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Offset = CI->getArgOperand(1);
+ Value *TypeIdValue = CI->getArgOperand(2);
+ Metadata *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
+
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<Instruction *, 1> LoadedPtrs;
+ SmallVector<Instruction *, 1> Preds;
+ bool HasNonCallUses = false;
+ auto &DT = LookupDomTree(*CI->getFunction());
+ findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
+ HasNonCallUses, CI, DT);
+
+ // Start by generating "pessimistic" code that explicitly loads the function
+ // pointer from the vtable and performs the type check. If possible, we will
+ // eliminate the load and the type check later.
+
+ // If possible, only generate the load at the point where it is used.
+ // This helps avoid unnecessary spills.
+ IRBuilder<> LoadB(
+ (LoadedPtrs.size() == 1 && !HasNonCallUses) ? LoadedPtrs[0] : CI);
+ Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
+ Value *GEPPtr = LoadB.CreateBitCast(GEP, PointerType::getUnqual(Int8PtrTy));
+ Value *LoadedValue = LoadB.CreateLoad(Int8PtrTy, GEPPtr);
+
+ for (Instruction *LoadedPtr : LoadedPtrs) {
+ LoadedPtr->replaceAllUsesWith(LoadedValue);
+ LoadedPtr->eraseFromParent();
+ }
+
+ // Likewise for the type test.
+ IRBuilder<> CallB((Preds.size() == 1 && !HasNonCallUses) ? Preds[0] : CI);
+ CallInst *TypeTestCall = CallB.CreateCall(TypeTestFunc, {Ptr, TypeIdValue});
+
+ for (Instruction *Pred : Preds) {
+ Pred->replaceAllUsesWith(TypeTestCall);
+ Pred->eraseFromParent();
+ }
+
+ // We have already erased any extractvalue instructions that refer to the
+ // intrinsic call, but the intrinsic may have other non-extractvalue uses
+ // (although this is unlikely). In that case, explicitly build a pair and
+ // RAUW it.
+ if (!CI->use_empty()) {
+ Value *Pair = UndefValue::get(CI->getType());
+ IRBuilder<> B(CI);
+ Pair = B.CreateInsertValue(Pair, LoadedValue, {0});
+ Pair = B.CreateInsertValue(Pair, TypeTestCall, {1});
+ CI->replaceAllUsesWith(Pair);
+ }
+
+ // The number of unsafe uses is initially the number of uses.
+ auto &NumUnsafeUses = NumUnsafeUsesForTypeTest[TypeTestCall];
+ NumUnsafeUses = DevirtCalls.size();
+
+ // If the function pointer has a non-call user, we cannot eliminate the type
+ // check, as one of those users may eventually call the pointer. Increment
+ // the unsafe use count to make sure it cannot reach zero.
+ if (HasNonCallUses)
+ ++NumUnsafeUses;
+ for (DevirtCallSite Call : DevirtCalls) {
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB,
+ &NumUnsafeUses);
+ }
+
+ CI->eraseFromParent();
+ }
+}
+
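+// Apply a resolution imported from the summary index (computed during the
+// thin link) to the call sites of this vtable slot.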
+void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
+ auto *TypeId = dyn_cast<MDString>(Slot.TypeID);
+ if (!TypeId)
+ return;
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(TypeId->getString());
+ if (!TidSummary)
+ return;
+ auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
+ if (ResI == TidSummary->WPDRes.end())
+ return;
+ const WholeProgramDevirtResolution &Res = ResI->second;
+
+ if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
+ assert(!Res.SingleImplName.empty());
+ // The type of the function in the declaration is irrelevant because every
+ // call site will cast it to the correct type.
+ Constant *SingleImpl =
+ cast<Constant>(M.getOrInsertFunction(Res.SingleImplName,
+ Type::getVoidTy(M.getContext()))
+ .getCallee());
+
+ // This is the import phase so we should not be exporting anything.
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
+ assert(!IsExported);
+ }
+
+ for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
+ auto I = Res.ResByArg.find(CSByConstantArg.first);
+ if (I == Res.ResByArg.end())
+ continue;
+ auto &ResByArg = I->second;
+ // FIXME: We should figure out what to do about the "function name" argument
+ // to the apply* functions, as the function names are unavailable during the
+ // importing phase. For now we just pass the empty string. This does not
+ // impact correctness because the function names are just used for remarks.
+ switch (ResByArg.TheKind) {
+ case WholeProgramDevirtResolution::ByArg::UniformRetVal:
+ applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
+ break;
+ case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
+ Constant *UniqueMemberAddr =
+ importGlobal(Slot, CSByConstantArg.first, "unique_member");
+ applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
+ UniqueMemberAddr);
+ break;
+ }
+ case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
+ Constant *Byte = importConstant(Slot, CSByConstantArg.first, "byte",
+ Int32Ty, ResByArg.Byte);
+ Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,
+ ResByArg.Bit);
+ applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
+ // The type of the function is irrelevant, because it's bitcast at calls
+ // anyhow.
+ Constant *JT = cast<Constant>(
+ M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
+ Type::getVoidTy(M.getContext()))
+ .getCallee());
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ assert(!IsExported);
+ }
+}
+
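+// Replace type tests whose uses were all devirtualized (no unsafe uses
+// remain) with true and erase them.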
+void DevirtModule::removeRedundantTypeTests() {
+ auto True = ConstantInt::getTrue(M.getContext());
+ for (auto &&U : NumUnsafeUsesForTypeTest) {
+ if (U.second == 0) {
+ U.first->replaceAllUsesWith(True);
+ U.first->eraseFromParent();
+ }
+ }
+}
+
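+// Module-level entry point: scan the type intrinsics, then either apply
+// imported resolutions (ThinLTO import phase) or compute and apply new ones,
+// and finally clean up redundant type tests and vcall visibility metadata.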
+bool DevirtModule::run() {
+ // If only some of the modules were split, we cannot correctly perform
+  // this transformation. We already checked for the presence of type tests
+ // with partially split modules during the thin link, and would have emitted
+ // an error if any were found, so here we can simply return.
+ if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
+ (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
+ return false;
+
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
+
+ // Normally if there are no users of the devirtualization intrinsics in the
+ // module, this pass has nothing to do. But if we are exporting, we also need
+ // to handle any users that appear only in the function summaries.
+ if (!ExportSummary &&
+ (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
+ AssumeFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ // Rebuild type metadata into a map for easy lookup.
+ std::vector<VTableBits> Bits;
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
+ buildTypeIdentifierMap(Bits, TypeIdMap);
+
+ if (TypeTestFunc && AssumeFunc)
+ scanTypeTestUsers(TypeTestFunc, TypeIdMap);
+
+ if (TypeCheckedLoadFunc)
+ scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
+
+ if (ImportSummary) {
+ for (auto &S : CallSlots)
+ importResolution(S.first, S.second);
+
+ removeRedundantTypeTests();
+
+    // We have lowered or deleted the type intrinsics, so we will no
+ // longer have enough information to reason about the liveness of virtual
+ // function pointers in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
+ // The rest of the code is only necessary when exporting or during regular
+ // LTO, so we are done.
+ return true;
+ }
+
+ if (TypeIdMap.empty())
+ return true;
+
+ // Collect information from summary about which calls to try to devirtualize.
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdMap) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_test_assume_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_checked_load_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ }
+ }
+ }
+
+ // For each (type, offset) pair:
+ bool DidVirtualConstProp = false;
+ std::map<std::string, Function*> DevirtTargets;
+ for (auto &S : CallSlots) {
+ // Search each of the members of the type identifier for the virtual
+ // function implementation at offset S.first.ByteOffset, and add to
+ // TargetsForSlot.
+ std::vector<VirtualCallTarget> TargetsForSlot;
+ WholeProgramDevirtResolution *Res = nullptr;
+ const std::set<TypeMemberInfo> &TypeMemberInfos = TypeIdMap[S.first.TypeID];
+ if (ExportSummary && isa<MDString>(S.first.TypeID) &&
+ TypeMemberInfos.size())
+ // For any type id used on a global's type metadata, create the type id
+ // summary resolution regardless of whether we can devirtualize, so that
+ // lower type tests knows the type id is not Unsat. If it was not used on
+ // a global's type metadata, the TypeIdMap entry set will be empty, and
+ // we don't want to create an entry (with the default Unknown type
+ // resolution), which can prevent detection of the Unsat.
+ Res = &ExportSummary
+ ->getOrInsertTypeIdSummary(
+ cast<MDString>(S.first.TypeID)->getString())
+ .WPDRes[S.first.ByteOffset];
+ if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
+ S.first.ByteOffset)) {
+
+ if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
+ DidVirtualConstProp |=
+ tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
+
+ tryICallBranchFunnel(TargetsForSlot, S.second, Res, S.first);
+ }
+
+      // Collect functions devirtualized for at least one call site, for stats.
+ if (RemarksEnabled)
+ for (const auto &T : TargetsForSlot)
+ if (T.WasDevirt)
+ DevirtTargets[std::string(T.Fn->getName())] = T.Fn;
+ }
+
+ // CFI-specific: if we are exporting and any llvm.type.checked.load
+ // intrinsics were *not* devirtualized, we need to add the resulting
+ // llvm.type.test intrinsics to the function summaries so that the
+ // LowerTypeTests pass will export them.
+ if (ExportSummary && isa<MDString>(S.first.TypeID)) {
+ auto GUID =
+ GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
+ for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ for (auto &CCS : S.second.ConstCSInfo)
+ for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ }
+ }
+
+ if (RemarksEnabled) {
+ // Generate remarks for each devirtualized function.
+ for (const auto &DT : DevirtTargets) {
+ Function *F = DT.second;
+
+ using namespace ore;
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
+ << "devirtualized "
+ << NV("FunctionName", DT.first));
+ }
+ }
+
+ removeRedundantTypeTests();
+
+ // Rebuild each global we touched as part of virtual constant propagation to
+ // include the before and after bytes.
+ if (DidVirtualConstProp)
+ for (VTableBits &B : Bits)
+ rebuildGlobal(B);
+
+  // We have lowered or deleted the type intrinsics, so we will no
+ // longer have enough information to reason about the liveness of virtual
+ // function pointers in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
+ return true;
+}
+
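+// Index-only entry point: devirtualize using only the combined summary, as is
+// done during the thin link.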
+void DevirtIndex::run() {
+ if (ExportSummary.typeIdCompatibleVtableMap().empty())
+ return;
+
+ DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID;
+ for (auto &P : ExportSummary.typeIdCompatibleVtableMap()) {
+ NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first);
+ }
+
+ // Collect information from summary about which calls to try to devirtualize.
+ for (auto &P : ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+ for (StringRef Name : NameByGUID[VF.GUID]) {
+ CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+ for (StringRef Name : NameByGUID[VF.GUID]) {
+ CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_test_assume_const_vcalls()) {
+ for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
+ CallSlots[{Name, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_checked_load_const_vcalls()) {
+ for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
+ CallSlots[{Name, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ }
+ }
+
+ std::set<ValueInfo> DevirtTargets;
+ // For each (type, offset) pair:
+ for (auto &S : CallSlots) {
+ // Search each of the members of the type identifier for the virtual
+ // function implementation at offset S.first.ByteOffset, and add to
+ // TargetsForSlot.
+ std::vector<ValueInfo> TargetsForSlot;
+ auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID);
+ assert(TidSummary);
+    // Create the type id summary resolution regardless of whether we can
+ // devirtualize, so that lower type tests knows the type id is used on
+ // a global and not Unsat.
+ WholeProgramDevirtResolution *Res =
+ &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID)
+ .WPDRes[S.first.ByteOffset];
+ if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary,
+ S.first.ByteOffset)) {
+
+ if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res,
+ DevirtTargets))
+ continue;
+ }
+ }
+
+ // Optionally have the thin link print message for each devirtualized
+ // function.
+ if (PrintSummaryDevirt)
+ for (const auto &DT : DevirtTargets)
+ errs() << "Devirtualized call to " << DT << "\n";
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ya.make b/contrib/libs/llvm12/lib/Transforms/IPO/ya.make
index 8e38815374..5b078050fe 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ya.make
@@ -1,17 +1,17 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
@@ -30,59 +30,59 @@ PEERDIR(
contrib/libs/llvm12/lib/Transforms/Scalar
contrib/libs/llvm12/lib/Transforms/Utils
contrib/libs/llvm12/lib/Transforms/Vectorize
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/IPO
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AlwaysInliner.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AlwaysInliner.cpp
Annotation2Metadata.cpp
- ArgumentPromotion.cpp
- Attributor.cpp
- AttributorAttributes.cpp
- BarrierNoopPass.cpp
- BlockExtractor.cpp
- CalledValuePropagation.cpp
- ConstantMerge.cpp
- CrossDSOCFI.cpp
- DeadArgumentElimination.cpp
- ElimAvailExtern.cpp
- ExtractGV.cpp
- ForceFunctionAttrs.cpp
- FunctionAttrs.cpp
- FunctionImport.cpp
- GlobalDCE.cpp
- GlobalOpt.cpp
- GlobalSplit.cpp
- HotColdSplitting.cpp
- IPO.cpp
+ ArgumentPromotion.cpp
+ Attributor.cpp
+ AttributorAttributes.cpp
+ BarrierNoopPass.cpp
+ BlockExtractor.cpp
+ CalledValuePropagation.cpp
+ ConstantMerge.cpp
+ CrossDSOCFI.cpp
+ DeadArgumentElimination.cpp
+ ElimAvailExtern.cpp
+ ExtractGV.cpp
+ ForceFunctionAttrs.cpp
+ FunctionAttrs.cpp
+ FunctionImport.cpp
+ GlobalDCE.cpp
+ GlobalOpt.cpp
+ GlobalSplit.cpp
+ HotColdSplitting.cpp
+ IPO.cpp
IROutliner.cpp
- InferFunctionAttrs.cpp
- InlineSimple.cpp
- Inliner.cpp
- Internalize.cpp
- LoopExtractor.cpp
- LowerTypeTests.cpp
- MergeFunctions.cpp
- OpenMPOpt.cpp
- PartialInlining.cpp
- PassManagerBuilder.cpp
- PruneEH.cpp
- SCCP.cpp
+ InferFunctionAttrs.cpp
+ InlineSimple.cpp
+ Inliner.cpp
+ Internalize.cpp
+ LoopExtractor.cpp
+ LowerTypeTests.cpp
+ MergeFunctions.cpp
+ OpenMPOpt.cpp
+ PartialInlining.cpp
+ PassManagerBuilder.cpp
+ PruneEH.cpp
+ SCCP.cpp
SampleContextTracker.cpp
- SampleProfile.cpp
+ SampleProfile.cpp
SampleProfileProbe.cpp
- StripDeadPrototypes.cpp
- StripSymbols.cpp
- SyntheticCountsPropagation.cpp
- ThinLTOBitcodeWriter.cpp
- WholeProgramDevirt.cpp
-)
-
-END()
+ StripDeadPrototypes.cpp
+ StripSymbols.cpp
+ SyntheticCountsPropagation.cpp
+ ThinLTOBitcodeWriter.cpp
+ WholeProgramDevirt.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index f63a508659..bacb868989 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1,929 +1,929 @@
-//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for add, fadd, sub, and fsub.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/AlignOf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/KnownBits.h"
+//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for add, fadd, sub, and fsub.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-namespace {
-
- /// Class representing coefficient of floating-point addend.
- /// This class needs to be highly efficient, which is especially true for
- /// the constructor. As of I write this comment, the cost of the default
- /// constructor is merely 4-byte-store-zero (Assuming compiler is able to
- /// perform write-merging).
- ///
- class FAddendCoef {
- public:
- // The constructor has to initialize a APFloat, which is unnecessary for
- // most addends which have coefficient either 1 or -1. So, the constructor
- // is expensive. In order to avoid the cost of the constructor, we should
- // reuse some instances whenever possible. The pre-created instances
- // FAddCombine::Add[0-5] embodies this idea.
- FAddendCoef() = default;
- ~FAddendCoef();
-
- // If possible, don't define operator+/operator- etc because these
- // operators inevitably call FAddendCoef's constructor which is not cheap.
- void operator=(const FAddendCoef &A);
- void operator+=(const FAddendCoef &A);
- void operator*=(const FAddendCoef &S);
-
- void set(short C) {
- assert(!insaneIntVal(C) && "Insane coefficient");
- IsFp = false; IntVal = C;
- }
-
- void set(const APFloat& C);
-
- void negate();
-
- bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
- Value *getValue(Type *) const;
-
- bool isOne() const { return isInt() && IntVal == 1; }
- bool isTwo() const { return isInt() && IntVal == 2; }
- bool isMinusOne() const { return isInt() && IntVal == -1; }
- bool isMinusTwo() const { return isInt() && IntVal == -2; }
-
- private:
- bool insaneIntVal(int V) { return V > 4 || V < -4; }
-
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+namespace {
+
+ /// Class representing coefficient of floating-point addend.
+ /// This class needs to be highly efficient, which is especially true for
+  /// the constructor. As of this writing, the cost of the default
+  /// constructor is merely a 4-byte store of zero (assuming the compiler is
+  /// able to perform write-merging).
+ ///
+ class FAddendCoef {
+ public:
+    // The constructor has to initialize an APFloat, which is unnecessary for
+    // most addends, whose coefficient is either 1 or -1. So the constructor
+ // is expensive. In order to avoid the cost of the constructor, we should
+ // reuse some instances whenever possible. The pre-created instances
+ // FAddCombine::Add[0-5] embodies this idea.
+ FAddendCoef() = default;
+ ~FAddendCoef();
+
+    // If possible, don't define operator+/operator- etc. because these
+    // operators inevitably call FAddendCoef's constructor, which is not cheap.
+ void operator=(const FAddendCoef &A);
+ void operator+=(const FAddendCoef &A);
+ void operator*=(const FAddendCoef &S);
+
+ void set(short C) {
+ assert(!insaneIntVal(C) && "Insane coefficient");
+ IsFp = false; IntVal = C;
+ }
+
+ void set(const APFloat& C);
+
+ void negate();
+
+ bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
+ Value *getValue(Type *) const;
+
+ bool isOne() const { return isInt() && IntVal == 1; }
+ bool isTwo() const { return isInt() && IntVal == 2; }
+ bool isMinusOne() const { return isInt() && IntVal == -1; }
+ bool isMinusTwo() const { return isInt() && IntVal == -2; }
+
+ private:
+ bool insaneIntVal(int V) { return V > 4 || V < -4; }
+
APFloat *getFpValPtr() { return reinterpret_cast<APFloat *>(&FpValBuf); }
-
+
const APFloat *getFpValPtr() const {
return reinterpret_cast<const APFloat *>(&FpValBuf);
}
-
- const APFloat &getFpVal() const {
- assert(IsFp && BufHasFpVal && "Incorret state");
- return *getFpValPtr();
- }
-
- APFloat &getFpVal() {
- assert(IsFp && BufHasFpVal && "Incorret state");
- return *getFpValPtr();
- }
-
- bool isInt() const { return !IsFp; }
-
- // If the coefficient is represented by an integer, promote it to a
- // floating point.
- void convertToFpType(const fltSemantics &Sem);
-
- // Construct an APFloat from a signed integer.
- // TODO: We should get rid of this function when APFloat can be constructed
- // from an *SIGNED* integer.
- APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
-
- bool IsFp = false;
-
- // True iff FpValBuf contains an instance of APFloat.
- bool BufHasFpVal = false;
-
- // The integer coefficient of an individual addend is either 1 or -1,
- // and we try to simplify at most 4 addends from neighboring at most
- // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt
- // is overkill of this end.
- short IntVal = 0;
-
- AlignedCharArrayUnion<APFloat> FpValBuf;
- };
-
- /// FAddend is used to represent floating-point addend. An addend is
- /// represented as <C, V>, where the V is a symbolic value, and C is a
- /// constant coefficient. A constant addend is represented as <C, 0>.
- class FAddend {
- public:
- FAddend() = default;
-
- void operator+=(const FAddend &T) {
- assert((Val == T.Val) && "Symbolic-values disagree");
- Coeff += T.Coeff;
- }
-
- Value *getSymVal() const { return Val; }
- const FAddendCoef &getCoef() const { return Coeff; }
-
- bool isConstant() const { return Val == nullptr; }
- bool isZero() const { return Coeff.isZero(); }
-
- void set(short Coefficient, Value *V) {
- Coeff.set(Coefficient);
- Val = V;
- }
- void set(const APFloat &Coefficient, Value *V) {
- Coeff.set(Coefficient);
- Val = V;
- }
- void set(const ConstantFP *Coefficient, Value *V) {
- Coeff.set(Coefficient->getValueAPF());
- Val = V;
- }
-
- void negate() { Coeff.negate(); }
-
- /// Drill down the U-D chain one step to find the definition of V, and
- /// try to break the definition into one or two addends.
- static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
-
- /// Similar to FAddend::drillDownOneStep() except that the value being
- /// splitted is the addend itself.
- unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
-
- private:
- void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
-
- // This addend has the value of "Coeff * Val".
- Value *Val = nullptr;
- FAddendCoef Coeff;
- };
-
- /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
- /// with its neighboring at most two instructions.
- ///
- class FAddCombine {
- public:
- FAddCombine(InstCombiner::BuilderTy &B) : Builder(B) {}
-
- Value *simplify(Instruction *FAdd);
-
- private:
- using AddendVect = SmallVector<const FAddend *, 4>;
-
- Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
-
- /// Convert given addend to a Value
- Value *createAddendVal(const FAddend &A, bool& NeedNeg);
-
- /// Return the number of instructions needed to emit the N-ary addition.
- unsigned calcInstrNumber(const AddendVect& Vect);
-
- Value *createFSub(Value *Opnd0, Value *Opnd1);
- Value *createFAdd(Value *Opnd0, Value *Opnd1);
- Value *createFMul(Value *Opnd0, Value *Opnd1);
- Value *createFNeg(Value *V);
- Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
- void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
-
- // Debugging stuff are clustered here.
- #ifndef NDEBUG
- unsigned CreateInstrNum;
- void initCreateInstNum() { CreateInstrNum = 0; }
- void incCreateInstNum() { CreateInstrNum++; }
- #else
- void initCreateInstNum() {}
- void incCreateInstNum() {}
- #endif
-
- InstCombiner::BuilderTy &Builder;
- Instruction *Instr = nullptr;
- };
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-//
-// Implementation of
-// {FAddendCoef, FAddend, FAddition, FAddCombine}.
-//
-//===----------------------------------------------------------------------===//
-FAddendCoef::~FAddendCoef() {
- if (BufHasFpVal)
- getFpValPtr()->~APFloat();
-}
-
-void FAddendCoef::set(const APFloat& C) {
- APFloat *P = getFpValPtr();
-
- if (isInt()) {
- // As the buffer is meanless byte stream, we cannot call
- // APFloat::operator=().
- new(P) APFloat(C);
- } else
- *P = C;
-
- IsFp = BufHasFpVal = true;
-}
-
-void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
- if (!isInt())
- return;
-
- APFloat *P = getFpValPtr();
- if (IntVal > 0)
- new(P) APFloat(Sem, IntVal);
- else {
- new(P) APFloat(Sem, 0 - IntVal);
- P->changeSign();
- }
- IsFp = BufHasFpVal = true;
-}
-
-APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
- if (Val >= 0)
- return APFloat(Sem, Val);
-
- APFloat T(Sem, 0 - Val);
- T.changeSign();
-
- return T;
-}
-
-void FAddendCoef::operator=(const FAddendCoef &That) {
- if (That.isInt())
- set(That.IntVal);
- else
- set(That.getFpVal());
-}
-
-void FAddendCoef::operator+=(const FAddendCoef &That) {
- RoundingMode RndMode = RoundingMode::NearestTiesToEven;
- if (isInt() == That.isInt()) {
- if (isInt())
- IntVal += That.IntVal;
- else
- getFpVal().add(That.getFpVal(), RndMode);
- return;
- }
-
- if (isInt()) {
- const APFloat &T = That.getFpVal();
- convertToFpType(T.getSemantics());
- getFpVal().add(T, RndMode);
- return;
- }
-
- APFloat &T = getFpVal();
- T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
-}
-
-void FAddendCoef::operator*=(const FAddendCoef &That) {
- if (That.isOne())
- return;
-
- if (That.isMinusOne()) {
- negate();
- return;
- }
-
- if (isInt() && That.isInt()) {
- int Res = IntVal * (int)That.IntVal;
- assert(!insaneIntVal(Res) && "Insane int value");
- IntVal = Res;
- return;
- }
-
- const fltSemantics &Semantic =
- isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
-
- if (isInt())
- convertToFpType(Semantic);
- APFloat &F0 = getFpVal();
-
- if (That.isInt())
- F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
- APFloat::rmNearestTiesToEven);
- else
- F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
-}
-
-void FAddendCoef::negate() {
- if (isInt())
- IntVal = 0 - IntVal;
- else
- getFpVal().changeSign();
-}
-
-Value *FAddendCoef::getValue(Type *Ty) const {
- return isInt() ?
- ConstantFP::get(Ty, float(IntVal)) :
- ConstantFP::get(Ty->getContext(), getFpVal());
-}
-
-// The definition of <Val> Addends
-// =========================================
-// A + B <1, A>, <1,B>
-// A - B <1, A>, <1,B>
-// 0 - B <-1, B>
-// C * A, <C, A>
-// A + C <1, A> <C, NULL>
-// 0 +/- 0 <0, NULL> (corner case)
-//
-// Legend: A and B are not constant, C is constant
-unsigned FAddend::drillValueDownOneStep
- (Value *Val, FAddend &Addend0, FAddend &Addend1) {
- Instruction *I = nullptr;
- if (!Val || !(I = dyn_cast<Instruction>(Val)))
- return 0;
-
- unsigned Opcode = I->getOpcode();
-
- if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
- ConstantFP *C0, *C1;
- Value *Opnd0 = I->getOperand(0);
- Value *Opnd1 = I->getOperand(1);
- if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
- Opnd0 = nullptr;
-
- if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
- Opnd1 = nullptr;
-
- if (Opnd0) {
- if (!C0)
- Addend0.set(1, Opnd0);
- else
- Addend0.set(C0, nullptr);
- }
-
- if (Opnd1) {
- FAddend &Addend = Opnd0 ? Addend1 : Addend0;
- if (!C1)
- Addend.set(1, Opnd1);
- else
- Addend.set(C1, nullptr);
- if (Opcode == Instruction::FSub)
- Addend.negate();
- }
-
- if (Opnd0 || Opnd1)
- return Opnd0 && Opnd1 ? 2 : 1;
-
- // Both operands are zero. Weird!
- Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr);
- return 1;
- }
-
- if (I->getOpcode() == Instruction::FMul) {
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
- if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
- Addend0.set(C, V1);
- return 1;
- }
-
- if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
- Addend0.set(C, V0);
- return 1;
- }
- }
-
- return 0;
-}
-
-// Try to break *this* addend into two addends. e.g. Suppose this addend is
-// <2.3, V>, and V = X + Y, by calling this function, we obtain two addends,
-// i.e. <2.3, X> and <2.3, Y>.
-unsigned FAddend::drillAddendDownOneStep
- (FAddend &Addend0, FAddend &Addend1) const {
- if (isConstant())
- return 0;
-
- unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
- if (!BreakNum || Coeff.isOne())
- return BreakNum;
-
- Addend0.Scale(Coeff);
-
- if (BreakNum == 2)
- Addend1.Scale(Coeff);
-
- return BreakNum;
-}
-
-Value *FAddCombine::simplify(Instruction *I) {
- assert(I->hasAllowReassoc() && I->hasNoSignedZeros() &&
- "Expected 'reassoc'+'nsz' instruction");
-
- // Currently we are not able to handle vector type.
- if (I->getType()->isVectorTy())
- return nullptr;
-
- assert((I->getOpcode() == Instruction::FAdd ||
- I->getOpcode() == Instruction::FSub) && "Expect add/sub");
-
- // Save the instruction before calling other member-functions.
- Instr = I;
-
- FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
-
- unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
-
- // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
- unsigned Opnd0_ExpNum = 0;
- unsigned Opnd1_ExpNum = 0;
-
- if (!Opnd0.isConstant())
- Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
-
- // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
- if (OpndNum == 2 && !Opnd1.isConstant())
- Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
-
- // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
- if (Opnd0_ExpNum && Opnd1_ExpNum) {
- AddendVect AllOpnds;
- AllOpnds.push_back(&Opnd0_0);
- AllOpnds.push_back(&Opnd1_0);
- if (Opnd0_ExpNum == 2)
- AllOpnds.push_back(&Opnd0_1);
- if (Opnd1_ExpNum == 2)
- AllOpnds.push_back(&Opnd1_1);
-
- // Compute instruction quota. We should save at least one instruction.
- unsigned InstQuota = 0;
-
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
- InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
- (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
-
- if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
- return R;
- }
-
- if (OpndNum != 2) {
- // The input instruction is : "I=0.0 +/- V". If the "V" were able to be
- // splitted into two addends, say "V = X - Y", the instruction would have
- // been optimized into "I = Y - X" in the previous steps.
- //
- const FAddendCoef &CE = Opnd0.getCoef();
- return CE.isOne() ? Opnd0.getSymVal() : nullptr;
- }
-
- // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
- if (Opnd1_ExpNum) {
- AddendVect AllOpnds;
- AllOpnds.push_back(&Opnd0);
- AllOpnds.push_back(&Opnd1_0);
- if (Opnd1_ExpNum == 2)
- AllOpnds.push_back(&Opnd1_1);
-
- if (Value *R = simplifyFAdd(AllOpnds, 1))
- return R;
- }
-
- // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
- if (Opnd0_ExpNum) {
- AddendVect AllOpnds;
- AllOpnds.push_back(&Opnd1);
- AllOpnds.push_back(&Opnd0_0);
- if (Opnd0_ExpNum == 2)
- AllOpnds.push_back(&Opnd0_1);
-
- if (Value *R = simplifyFAdd(AllOpnds, 1))
- return R;
- }
-
- return nullptr;
-}
-
-Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
- unsigned AddendNum = Addends.size();
- assert(AddendNum <= 4 && "Too many addends");
-
- // For saving intermediate results;
- unsigned NextTmpIdx = 0;
- FAddend TmpResult[3];
-
- // Points to the constant addend of the resulting simplified expression.
- // If the resulting expr has constant-addend, this constant-addend is
- // desirable to reside at the top of the resulting expression tree. Placing
- // constant close to super-expr(s) will potentially reveal some optimization
- // opportunities in super-expr(s).
- const FAddend *ConstAdd = nullptr;
-
- // Simplified addends are placed in <SimpVect>.
- AddendVect SimpVect;
-
- // The outer loop works on one symbolic-value at a time. Suppose the input
- // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
- // The symbolic-values will be processed in this order: x, y, z.
- for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
-
- const FAddend *ThisAddend = Addends[SymIdx];
- if (!ThisAddend) {
- // This addend was processed before.
- continue;
- }
-
- Value *Val = ThisAddend->getSymVal();
- unsigned StartIdx = SimpVect.size();
- SimpVect.push_back(ThisAddend);
-
- // The inner loop collects addends sharing the same symbolic value; these
- // addends will later be folded into a single addend. Following the above
- // example, if the symbolic value "y" is being processed, the inner loop
- // will collect the two addends "<b1,y>" and "<b2,y>", which will later be
- // folded into "<b1+b2, y>".
- for (unsigned SameSymIdx = SymIdx + 1;
- SameSymIdx < AddendNum; SameSymIdx++) {
- const FAddend *T = Addends[SameSymIdx];
- if (T && T->getSymVal() == Val) {
- // Set null such that next iteration of the outer loop will not process
- // this addend again.
- Addends[SameSymIdx] = nullptr;
- SimpVect.push_back(T);
- }
- }
-
- // If multiple addends share same symbolic value, fold them together.
- if (StartIdx + 1 != SimpVect.size()) {
- FAddend &R = TmpResult[NextTmpIdx ++];
- R = *SimpVect[StartIdx];
- for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
- R += *SimpVect[Idx];
-
- // Pop all addends being folded and push the resulting folded addend.
- SimpVect.resize(StartIdx);
- if (Val) {
- if (!R.isZero()) {
- SimpVect.push_back(&R);
- }
- } else {
- // Don't push constant addend at this time. It will be the last element
- // of <SimpVect>.
- ConstAdd = &R;
- }
- }
- }
-
- assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
- "out-of-bound access");
-
- if (ConstAdd)
- SimpVect.push_back(ConstAdd);
-
- Value *Result;
- if (!SimpVect.empty())
- Result = createNaryFAdd(SimpVect, InstrQuota);
- else {
- // The addition is folded to 0.0.
- Result = ConstantFP::get(Instr->getType(), 0.0);
- }
-
- return Result;
-}
-
-Value *FAddCombine::createNaryFAdd
- (const AddendVect &Opnds, unsigned InstrQuota) {
- assert(!Opnds.empty() && "Expect at least one addend");
-
- // Step 1: Check if the # of instructions needed exceeds the quota.
-
- unsigned InstrNeeded = calcInstrNumber(Opnds);
- if (InstrNeeded > InstrQuota)
- return nullptr;
-
- initCreateInstNum();
-
- // step 2: Emit the N-ary addition.
- // Note that at most three instructions are involved in Fadd-InstCombine: the
- // addition in question, and at most two neighboring instructions.
- // The resulting optimized addition should have at least one less instruction
- // than the original addition expression tree. This implies that the resulting
- // N-ary addition has at most two instructions, and we don't need to worry
- // about tree-height when constructing the N-ary addition.
-
- Value *LastVal = nullptr;
- bool LastValNeedNeg = false;
-
- // Iterate the addends, creating fadd/fsub using adjacent two addends.
- for (const FAddend *Opnd : Opnds) {
- bool NeedNeg;
- Value *V = createAddendVal(*Opnd, NeedNeg);
- if (!LastVal) {
- LastVal = V;
- LastValNeedNeg = NeedNeg;
- continue;
- }
-
- if (LastValNeedNeg == NeedNeg) {
- LastVal = createFAdd(LastVal, V);
- continue;
- }
-
- if (LastValNeedNeg)
- LastVal = createFSub(V, LastVal);
- else
- LastVal = createFSub(LastVal, V);
-
- LastValNeedNeg = false;
- }
-
- if (LastValNeedNeg) {
- LastVal = createFNeg(LastVal);
- }
-
-#ifndef NDEBUG
- assert(CreateInstrNum == InstrNeeded &&
- "Inconsistent in instruction numbers");
-#endif
-
- return LastVal;
-}
-
-Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder.CreateFSub(Opnd0, Opnd1);
- if (Instruction *I = dyn_cast<Instruction>(V))
- createInstPostProc(I);
- return V;
-}
-
-Value *FAddCombine::createFNeg(Value *V) {
- Value *NewV = Builder.CreateFNeg(V);
- if (Instruction *I = dyn_cast<Instruction>(NewV))
- createInstPostProc(I, true); // fneg's don't receive instruction numbers.
- return NewV;
-}
-
-Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder.CreateFAdd(Opnd0, Opnd1);
- if (Instruction *I = dyn_cast<Instruction>(V))
- createInstPostProc(I);
- return V;
-}
-
-Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder.CreateFMul(Opnd0, Opnd1);
- if (Instruction *I = dyn_cast<Instruction>(V))
- createInstPostProc(I);
- return V;
-}
-
-void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
- NewInstr->setDebugLoc(Instr->getDebugLoc());
-
- // Keep track of the number of instructions created.
- if (!NoNumber)
- incCreateInstNum();
-
- // Propagate fast-math flags
- NewInstr->setFastMathFlags(Instr->getFastMathFlags());
-}
-
-// Return the number of instructions needed to emit the N-ary addition.
-// NOTE: Keep this function in sync with createAddendVal().
-unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
- unsigned OpndNum = Opnds.size();
- unsigned InstrNeeded = OpndNum - 1;
-
- // The number of addends in the form of "(-1)*x".
- unsigned NegOpndNum = 0;
-
- // Adjust the number of instructions needed to emit the N-ary add.
- for (const FAddend *Opnd : Opnds) {
- if (Opnd->isConstant())
- continue;
-
- // The constant check above is really for a few special constant
- // coefficients.
- if (isa<UndefValue>(Opnd->getSymVal()))
- continue;
-
- const FAddendCoef &CE = Opnd->getCoef();
- if (CE.isMinusOne() || CE.isMinusTwo())
- NegOpndNum++;
-
- // Let the addend be "c * x". If "c == +/-1", the value of the addend
- // is immediately available; otherwise, it needs exactly one instruction
- // to evaluate the value.
- if (!CE.isMinusOne() && !CE.isOne())
- InstrNeeded++;
- }
- return InstrNeeded;
-}
-
-// Input Addend Value NeedNeg(output)
-// ================================================================
-// Constant C C false
-// <+/-1, V> V coefficient is -1
-// <2/-2, V> "fadd V, V" coefficient is -2
-// <C, V> "fmul V, C" false
-//
-// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
-Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
- const FAddendCoef &Coeff = Opnd.getCoef();
-
- if (Opnd.isConstant()) {
- NeedNeg = false;
- return Coeff.getValue(Instr->getType());
- }
-
- Value *OpndVal = Opnd.getSymVal();
-
- if (Coeff.isMinusOne() || Coeff.isOne()) {
- NeedNeg = Coeff.isMinusOne();
- return OpndVal;
- }
-
- if (Coeff.isTwo() || Coeff.isMinusTwo()) {
- NeedNeg = Coeff.isMinusTwo();
- return createFAdd(OpndVal, OpndVal);
- }
-
- NeedNeg = false;
- return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
-}
-
-// Checks if any operand is negative and we can convert add to sub.
-// This function checks for the following negative patterns
-// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
-// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
-// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
-static Value *checkForNegativeOperand(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-
- // This function creates 2 instructions to replace ADD, so we need at least
- // one of LHS or RHS to have one use to ensure the transform is profitable.
- if (!LHS->hasOneUse() && !RHS->hasOneUse())
- return nullptr;
-
- Value *X = nullptr, *Y = nullptr, *Z = nullptr;
- const APInt *C1 = nullptr, *C2 = nullptr;
-
- // if ONE is on other side, swap
- if (match(RHS, m_Add(m_Value(X), m_One())))
- std::swap(LHS, RHS);
-
- if (match(LHS, m_Add(m_Value(X), m_One()))) {
- // if XOR on other side, swap
- if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
- std::swap(X, RHS);
-
- if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) {
- // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
- // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
- if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
- Value *NewAnd = Builder.CreateAnd(Z, *C1);
- return Builder.CreateSub(RHS, NewAnd, "sub");
- } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
- // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
- // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
- Value *NewOr = Builder.CreateOr(Z, ~(*C1));
- return Builder.CreateSub(RHS, NewOr, "sub");
- }
- }
- }
-
- // Restore LHS and RHS
- LHS = I.getOperand(0);
- RHS = I.getOperand(1);
-
- // if XOR is on other side, swap
- if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
- std::swap(LHS, RHS);
-
- // C1 is odd, so C2 == C1 - 1 is even
- // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2))
- // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
- if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
- if (C1->countTrailingZeros() == 0)
- if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
- Value *NewOr = Builder.CreateOr(Z, ~(*C2));
- return Builder.CreateSub(RHS, NewOr, "sub");
- }
- return nullptr;
-}
-
-/// Wrapping flags may allow combining constants separated by an extend.
-static Instruction *foldNoWrapAdd(BinaryOperator &Add,
- InstCombiner::BuilderTy &Builder) {
- Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
- Type *Ty = Add.getType();
- Constant *Op1C;
- if (!match(Op1, m_Constant(Op1C)))
- return nullptr;
-
- // Try this match first because it results in an add in the narrow type.
- // (zext (X +nuw C2)) + C1 --> zext (X + (C2 + trunc(C1)))
- Value *X;
- const APInt *C1, *C2;
- if (match(Op1, m_APInt(C1)) &&
- match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2))))) &&
- C1->isNegative() && C1->sge(-C2->sext(C1->getBitWidth()))) {
- Constant *NewC =
- ConstantInt::get(X->getType(), *C2 + C1->trunc(C2->getBitWidth()));
- return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
- }
-
- // More general combining of constants in the wide type.
- // (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C)
- Constant *NarrowC;
- if (match(Op0, m_OneUse(m_SExt(m_NSWAdd(m_Value(X), m_Constant(NarrowC)))))) {
- Constant *WideC = ConstantExpr::getSExt(NarrowC, Ty);
- Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
- Value *WideX = Builder.CreateSExt(X, Ty);
- return BinaryOperator::CreateAdd(WideX, NewC);
- }
- // (zext (X +nuw NarrowC)) + C --> (zext X) + (zext(NarrowC) + C)
- if (match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_Constant(NarrowC)))))) {
- Constant *WideC = ConstantExpr::getZExt(NarrowC, Ty);
- Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
- Value *WideX = Builder.CreateZExt(X, Ty);
- return BinaryOperator::CreateAdd(WideX, NewC);
- }
-
- return nullptr;
-}
-
+
+ const APFloat &getFpVal() const {
+ assert(IsFp && BufHasFpVal && "Incorrect state");
+ return *getFpValPtr();
+ }
+
+ APFloat &getFpVal() {
+ assert(IsFp && BufHasFpVal && "Incorrect state");
+ return *getFpValPtr();
+ }
+
+ bool isInt() const { return !IsFp; }
+
+ // If the coefficient is represented by an integer, promote it to a
+ // floating point.
+ void convertToFpType(const fltSemantics &Sem);
+
+ // Construct an APFloat from a signed integer.
+ // TODO: We should get rid of this function when APFloat can be constructed
+ // from a *SIGNED* integer.
+ APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
+
+ bool IsFp = false;
+
+ // True iff FpValBuf contains an instance of APFloat.
+ bool BufHasFpVal = false;
+
+ // The integer coefficient of an individual addend is either 1 or -1,
+ // and we try to simplify at most 4 addends coming from at most two
+ // neighboring instructions. So the range of <IntVal> falls in [-4, 4];
+ // APInt would be overkill for this purpose.
+ short IntVal = 0;
+
+ AlignedCharArrayUnion<APFloat> FpValBuf;
+ };
+
+ /// FAddend is used to represent a floating-point addend. An addend is
+ /// represented as <C, V>, where V is a symbolic value and C is a
+ /// constant coefficient. A constant addend is represented as <C, 0>.
+ class FAddend {
+ public:
+ FAddend() = default;
+
+ void operator+=(const FAddend &T) {
+ assert((Val == T.Val) && "Symbolic-values disagree");
+ Coeff += T.Coeff;
+ }
+
+ Value *getSymVal() const { return Val; }
+ const FAddendCoef &getCoef() const { return Coeff; }
+
+ bool isConstant() const { return Val == nullptr; }
+ bool isZero() const { return Coeff.isZero(); }
+
+ void set(short Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const APFloat &Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const ConstantFP *Coefficient, Value *V) {
+ Coeff.set(Coefficient->getValueAPF());
+ Val = V;
+ }
+
+ void negate() { Coeff.negate(); }
+
+ /// Drill down the U-D chain one step to find the definition of V, and
+ /// try to break the definition into one or two addends.
+ static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+ /// Similar to FAddend::drillValueDownOneStep() except that the value
+ /// being split is the addend itself.
+ unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+ private:
+ void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+ // This addend has the value of "Coeff * Val".
+ Value *Val = nullptr;
+ FAddendCoef Coeff;
+ };
+
+ /// FAddCombine is the class for optimizing an unsafe fadd/fsub together
+ /// with at most two of its neighboring instructions.
+ ///
+ class FAddCombine {
+ public:
+ FAddCombine(InstCombiner::BuilderTy &B) : Builder(B) {}
+
+ Value *simplify(Instruction *FAdd);
+
+ private:
+ using AddendVect = SmallVector<const FAddend *, 4>;
+
+ Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+ /// Convert given addend to a Value
+ Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+
+ /// Return the number of instructions needed to emit the N-ary addition.
+ unsigned calcInstrNumber(const AddendVect& Vect);
+
+ Value *createFSub(Value *Opnd0, Value *Opnd1);
+ Value *createFAdd(Value *Opnd0, Value *Opnd1);
+ Value *createFMul(Value *Opnd0, Value *Opnd1);
+ Value *createFNeg(Value *V);
+ Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+ void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
+
+ // Debugging stuff is clustered here.
+ #ifndef NDEBUG
+ unsigned CreateInstrNum;
+ void initCreateInstNum() { CreateInstrNum = 0; }
+ void incCreateInstNum() { CreateInstrNum++; }
+ #else
+ void initCreateInstNum() {}
+ void incCreateInstNum() {}
+ #endif
+
+ InstCombiner::BuilderTy &Builder;
+ Instruction *Instr = nullptr;
+ };
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+// {FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+FAddendCoef::~FAddendCoef() {
+ if (BufHasFpVal)
+ getFpValPtr()->~APFloat();
+}
+
+void FAddendCoef::set(const APFloat& C) {
+ APFloat *P = getFpValPtr();
+
+ if (isInt()) {
+ // As the buffer is a meaningless byte stream, we cannot call
+ // APFloat::operator=().
+ new(P) APFloat(C);
+ } else
+ *P = C;
+
+ IsFp = BufHasFpVal = true;
+}
+
+void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
+ if (!isInt())
+ return;
+
+ APFloat *P = getFpValPtr();
+ if (IntVal > 0)
+ new(P) APFloat(Sem, IntVal);
+ else {
+ new(P) APFloat(Sem, 0 - IntVal);
+ P->changeSign();
+ }
+ IsFp = BufHasFpVal = true;
+}
+
+APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
+ if (Val >= 0)
+ return APFloat(Sem, Val);
+
+ APFloat T(Sem, 0 - Val);
+ T.changeSign();
+
+ return T;
+}
+
+void FAddendCoef::operator=(const FAddendCoef &That) {
+ if (That.isInt())
+ set(That.IntVal);
+ else
+ set(That.getFpVal());
+}
+
+void FAddendCoef::operator+=(const FAddendCoef &That) {
+ RoundingMode RndMode = RoundingMode::NearestTiesToEven;
+ if (isInt() == That.isInt()) {
+ if (isInt())
+ IntVal += That.IntVal;
+ else
+ getFpVal().add(That.getFpVal(), RndMode);
+ return;
+ }
+
+ if (isInt()) {
+ const APFloat &T = That.getFpVal();
+ convertToFpType(T.getSemantics());
+ getFpVal().add(T, RndMode);
+ return;
+ }
+
+ APFloat &T = getFpVal();
+ T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
+}
+
+void FAddendCoef::operator*=(const FAddendCoef &That) {
+ if (That.isOne())
+ return;
+
+ if (That.isMinusOne()) {
+ negate();
+ return;
+ }
+
+ if (isInt() && That.isInt()) {
+ int Res = IntVal * (int)That.IntVal;
+ assert(!insaneIntVal(Res) && "Insane int value");
+ IntVal = Res;
+ return;
+ }
+
+ const fltSemantics &Semantic =
+ isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
+
+ if (isInt())
+ convertToFpType(Semantic);
+ APFloat &F0 = getFpVal();
+
+ if (That.isInt())
+ F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
+ APFloat::rmNearestTiesToEven);
+ else
+ F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
+}
+
+void FAddendCoef::negate() {
+ if (isInt())
+ IntVal = 0 - IntVal;
+ else
+ getFpVal().changeSign();
+}
+
+Value *FAddendCoef::getValue(Type *Ty) const {
+ return isInt() ?
+ ConstantFP::get(Ty, float(IntVal)) :
+ ConstantFP::get(Ty->getContext(), getFpVal());
+}
+
+// The definition of <Val> Addends
+// =========================================
+// A + B <1, A>, <1,B>
+// A - B <1, A>, <-1, B>
+// 0 - B <-1, B>
+// C * A, <C, A>
+// A + C <1, A> <C, NULL>
+// 0 +/- 0 <0, NULL> (corner case)
+//
+// Legend: A and B are not constant, C is constant
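+//
+// For illustration (editor's examples, not exhaustive):
+//   "%t = fsub float %x, %y"  decomposes into <1, %x> and <-1, %y>
+//   "%t = fmul float %x, 4.0" decomposes into the single addend <4.0, %x>
+//   "%t = fadd float %x, 2.0" decomposes into <1, %x> and <2.0, NULL>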
+unsigned FAddend::drillValueDownOneStep
+ (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+ Instruction *I = nullptr;
+ if (!Val || !(I = dyn_cast<Instruction>(Val)))
+ return 0;
+
+ unsigned Opcode = I->getOpcode();
+
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+ ConstantFP *C0, *C1;
+ Value *Opnd0 = I->getOperand(0);
+ Value *Opnd1 = I->getOperand(1);
+ if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+ Opnd0 = nullptr;
+
+ if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+ Opnd1 = nullptr;
+
+ if (Opnd0) {
+ if (!C0)
+ Addend0.set(1, Opnd0);
+ else
+ Addend0.set(C0, nullptr);
+ }
+
+ if (Opnd1) {
+ FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+ if (!C1)
+ Addend.set(1, Opnd1);
+ else
+ Addend.set(C1, nullptr);
+ if (Opcode == Instruction::FSub)
+ Addend.negate();
+ }
+
+ if (Opnd0 || Opnd1)
+ return Opnd0 && Opnd1 ? 2 : 1;
+
+ // Both operands are zero. Weird!
+ Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr);
+ return 1;
+ }
+
+ if (I->getOpcode() == Instruction::FMul) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+ Addend0.set(C, V1);
+ return 1;
+ }
+
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+ Addend0.set(C, V0);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+// Try to break *this* addend into two addends. E.g., suppose this addend is
+// <2.3, V> and V = X + Y; calling this function yields the two addends
+// <2.3, X> and <2.3, Y>.
+unsigned FAddend::drillAddendDownOneStep
+ (FAddend &Addend0, FAddend &Addend1) const {
+ if (isConstant())
+ return 0;
+
+ unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
+ if (!BreakNum || Coeff.isOne())
+ return BreakNum;
+
+ Addend0.Scale(Coeff);
+
+ if (BreakNum == 2)
+ Addend1.Scale(Coeff);
+
+ return BreakNum;
+}
+
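+// Editor's sketch of the overall flow (assuming every instruction involved
+// carries the 'reassoc' and 'nsz' flags and %a/%b each have a single use):
+//   %a = fadd fast float %x, 1.0
+//   %b = fadd fast float %x, 2.0
+//   %r = fadd fast float %a, %b
+// %r decomposes into the addends <1,%x>, <1.0,NULL>, <1,%x>, <2.0,NULL>;
+// simplifyFAdd() folds them to <2,%x> and <3.0,NULL>, so %r can be rebuilt
+// as "fadd (fadd %x, %x), 3.0" -- one instruction fewer than the original
+// three-instruction tree.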
+Value *FAddCombine::simplify(Instruction *I) {
+ assert(I->hasAllowReassoc() && I->hasNoSignedZeros() &&
+ "Expected 'reassoc'+'nsz' instruction");
+
+ // Currently we are not able to handle vector types.
+ if (I->getType()->isVectorTy())
+ return nullptr;
+
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+
+ // Save the instruction before calling other member-functions.
+ Instr = I;
+
+ FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+ unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
+
+ // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
+ unsigned Opnd0_ExpNum = 0;
+ unsigned Opnd1_ExpNum = 0;
+
+ if (!Opnd0.isConstant())
+ Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
+
+ // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+ if (OpndNum == 2 && !Opnd1.isConstant())
+ Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
+
+ // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+ if (Opnd0_ExpNum && Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0_0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ // Compute instruction quota. We should save at least one instruction.
+ unsigned InstQuota = 0;
+
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
+ (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+ if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+ return R;
+ }
+
+ if (OpndNum != 2) {
+ // The input instruction is "I = 0.0 +/- V". If "V" could have been
+ // split into two addends, say "V = X - Y", the instruction would have
+ // been optimized into "I = Y - X" in the previous steps.
+ //
+ const FAddendCoef &CE = Opnd0.getCoef();
+ return CE.isOne() ? Opnd0.getSymVal() : nullptr;
+ }
+
+ // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+ if (Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+ if (Opnd0_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd1);
+ AllOpnds.push_back(&Opnd0_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ return nullptr;
+}
+
+Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
+ unsigned AddendNum = Addends.size();
+ assert(AddendNum <= 4 && "Too many addends");
+
+ // For saving intermediate results;
+ unsigned NextTmpIdx = 0;
+ FAddend TmpResult[3];
+
+ // Points to the constant addend of the resulting simplified expression.
+ // If the resulting expr has constant-addend, this constant-addend is
+ // desirable to reside at the top of the resulting expression tree. Placing
+ // constant close to super-expr(s) will potentially reveal some optimization
+ // opportunities in super-expr(s).
+ const FAddend *ConstAdd = nullptr;
+
+ // Simplified addends are placed in <SimpVect>.
+ AddendVect SimpVect;
+
+ // The outer loop works on one symbolic-value at a time. Suppose the input
+ // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
+ // The symbolic-values will be processed in this order: x, y, z.
+ for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
+
+ const FAddend *ThisAddend = Addends[SymIdx];
+ if (!ThisAddend) {
+ // This addend was processed before.
+ continue;
+ }
+
+ Value *Val = ThisAddend->getSymVal();
+ unsigned StartIdx = SimpVect.size();
+ SimpVect.push_back(ThisAddend);
+
+ // The inner loop collects addends sharing the same symbolic value; these
+ // addends will later be folded into a single addend. Following the above
+ // example, if the symbolic value "y" is being processed, the inner loop
+ // will collect the two addends "<b1,y>" and "<b2,y>", which will later be
+ // folded into "<b1+b2, y>".
+ for (unsigned SameSymIdx = SymIdx + 1;
+ SameSymIdx < AddendNum; SameSymIdx++) {
+ const FAddend *T = Addends[SameSymIdx];
+ if (T && T->getSymVal() == Val) {
+ // Set null such that next iteration of the outer loop will not process
+ // this addend again.
+ Addends[SameSymIdx] = nullptr;
+ SimpVect.push_back(T);
+ }
+ }
+
+ // If multiple addends share same symbolic value, fold them together.
+ if (StartIdx + 1 != SimpVect.size()) {
+ FAddend &R = TmpResult[NextTmpIdx ++];
+ R = *SimpVect[StartIdx];
+ for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
+ R += *SimpVect[Idx];
+
+ // Pop all addends being folded and push the resulting folded addend.
+ SimpVect.resize(StartIdx);
+ if (Val) {
+ if (!R.isZero()) {
+ SimpVect.push_back(&R);
+ }
+ } else {
+ // Don't push constant addend at this time. It will be the last element
+ // of <SimpVect>.
+ ConstAdd = &R;
+ }
+ }
+ }
+
+ assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
+ "out-of-bound access");
+
+ if (ConstAdd)
+ SimpVect.push_back(ConstAdd);
+
+ Value *Result;
+ if (!SimpVect.empty())
+ Result = createNaryFAdd(SimpVect, InstrQuota);
+ else {
+ // The addition is folded to 0.0.
+ Result = ConstantFP::get(Instr->getType(), 0.0);
+ }
+
+ return Result;
+}
+
+Value *FAddCombine::createNaryFAdd
+ (const AddendVect &Opnds, unsigned InstrQuota) {
+ assert(!Opnds.empty() && "Expect at least one addend");
+
+ // Step 1: Check if the # of instructions needed exceeds the quota.
+
+ unsigned InstrNeeded = calcInstrNumber(Opnds);
+ if (InstrNeeded > InstrQuota)
+ return nullptr;
+
+ initCreateInstNum();
+
+ // step 2: Emit the N-ary addition.
+ // Note that at most three instructions are involved in Fadd-InstCombine: the
+ // addition in question, and at most two neighboring instructions.
+ // The resulting optimized addition should have at least one less instruction
+ // than the original addition expression tree. This implies that the resulting
+ // N-ary addition has at most two instructions, and we don't need to worry
+ // about tree-height when constructing the N-ary addition.
+
+ Value *LastVal = nullptr;
+ bool LastValNeedNeg = false;
+
+ // Iterate the addends, creating fadd/fsub using adjacent two addends.
+ for (const FAddend *Opnd : Opnds) {
+ bool NeedNeg;
+ Value *V = createAddendVal(*Opnd, NeedNeg);
+ if (!LastVal) {
+ LastVal = V;
+ LastValNeedNeg = NeedNeg;
+ continue;
+ }
+
+ if (LastValNeedNeg == NeedNeg) {
+ LastVal = createFAdd(LastVal, V);
+ continue;
+ }
+
+ if (LastValNeedNeg)
+ LastVal = createFSub(V, LastVal);
+ else
+ LastVal = createFSub(LastVal, V);
+
+ LastValNeedNeg = false;
+ }
+
+ if (LastValNeedNeg) {
+ LastVal = createFNeg(LastVal);
+ }
+
+#ifndef NDEBUG
+ assert(CreateInstrNum == InstrNeeded &&
+ "Inconsistent in instruction numbers");
+#endif
+
+ return LastVal;
+}
+
+Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFSub(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFNeg(Value *V) {
+ Value *NewV = Builder.CreateFNeg(V);
+ if (Instruction *I = dyn_cast<Instruction>(NewV))
+ createInstPostProc(I, true); // fneg's don't receive instruction numbers.
+ return NewV;
+}
+
+Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFAdd(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFMul(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
+ NewInstr->setDebugLoc(Instr->getDebugLoc());
+
+ // Keep track of the number of instructions created.
+ if (!NoNumber)
+ incCreateInstNum();
+
+ // Propagate fast-math flags
+ NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instructions needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
+ unsigned OpndNum = Opnds.size();
+ unsigned InstrNeeded = OpndNum - 1;
+
+ // The number of addends in the form of "(-1)*x".
+ unsigned NegOpndNum = 0;
+
+ // Adjust the number of instructions needed to emit the N-ary add.
+ for (const FAddend *Opnd : Opnds) {
+ if (Opnd->isConstant())
+ continue;
+
+ // The constant check above is really for a few special constant
+ // coefficients.
+ if (isa<UndefValue>(Opnd->getSymVal()))
+ continue;
+
+ const FAddendCoef &CE = Opnd->getCoef();
+ if (CE.isMinusOne() || CE.isMinusTwo())
+ NegOpndNum++;
+
+ // Let the addend be "c * x". If "c == +/-1", the value of the addend
+ // is immediately available; otherwise, it needs exactly one instruction
+ // to evaluate the value.
+ if (!CE.isMinusOne() && !CE.isOne())
+ InstrNeeded++;
+ }
+ return InstrNeeded;
+}
+
+// Input Addend Value NeedNeg(output)
+// ================================================================
+// Constant C C false
+// <+/-1, V> V coefficient is -1
+// <2/-2, V> "fadd V, V" coefficient is -2
+// <C, V> "fmul V, C" false
+//
+// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
+Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
+ const FAddendCoef &Coeff = Opnd.getCoef();
+
+ if (Opnd.isConstant()) {
+ NeedNeg = false;
+ return Coeff.getValue(Instr->getType());
+ }
+
+ Value *OpndVal = Opnd.getSymVal();
+
+ if (Coeff.isMinusOne() || Coeff.isOne()) {
+ NeedNeg = Coeff.isMinusOne();
+ return OpndVal;
+ }
+
+ if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+ NeedNeg = Coeff.isMinusTwo();
+ return createFAdd(OpndVal, OpndVal);
+ }
+
+ NeedNeg = false;
+ return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
+// Checks if any operand is negative and we can convert add to sub.
+// This function checks for the following negative patterns
+// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
+// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
+// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
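+//
+// Editor's example with C == 3:
+//   ((Z & 3) ^ 3) + 1 == -(Z | -4), so "add (add (xor (and Z, 3), 3), 1), R"
+//   becomes "sub R, (or Z, -4)".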
+static Value *checkForNegativeOperand(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+ // This function creates 2 instructions to replace ADD, so we need at least
+ // one of LHS or RHS to have one use to ensure the transform is profitable.
+ if (!LHS->hasOneUse() && !RHS->hasOneUse())
+ return nullptr;
+
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+ const APInt *C1 = nullptr, *C2 = nullptr;
+
+ // if ONE is on other side, swap
+ if (match(RHS, m_Add(m_Value(X), m_One())))
+ std::swap(LHS, RHS);
+
+ if (match(LHS, m_Add(m_Value(X), m_One()))) {
+ // if XOR on other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(X, RHS);
+
+ if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) {
+ // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
+ if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
+ Value *NewAnd = Builder.CreateAnd(Z, *C1);
+ return Builder.CreateSub(RHS, NewAnd, "sub");
+ } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
+ // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
+ Value *NewOr = Builder.CreateOr(Z, ~(*C1));
+ return Builder.CreateSub(RHS, NewOr, "sub");
+ }
+ }
+ }
+
+ // Restore LHS and RHS
+ LHS = I.getOperand(0);
+ RHS = I.getOperand(1);
+
+ // if XOR is on other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(LHS, RHS);
+
+ // C1 is odd, so C2 == C1 - 1 is even
+ // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2))
+ // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
+ if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ if (C1->countTrailingZeros() == 0)
+ if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
+ Value *NewOr = Builder.CreateOr(Z, ~(*C2));
+ return Builder.CreateSub(RHS, NewOr, "sub");
+ }
+ return nullptr;
+}
+
+/// Wrapping flags may allow combining constants separated by an extend.
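+/// For instance (editor's illustration of the first pattern below):
+/// "(zext i32 (add nuw i8 %x, 10)) + -5" can be narrowed to
+/// "zext i32 (add nuw i8 %x, 5)".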
+static Instruction *foldNoWrapAdd(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ Type *Ty = Add.getType();
+ Constant *Op1C;
+ if (!match(Op1, m_Constant(Op1C)))
+ return nullptr;
+
+ // Try this match first because it results in an add in the narrow type.
+ // (zext (X +nuw C2)) + C1 --> zext (X + (C2 + trunc(C1)))
+ Value *X;
+ const APInt *C1, *C2;
+ if (match(Op1, m_APInt(C1)) &&
+ match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2))))) &&
+ C1->isNegative() && C1->sge(-C2->sext(C1->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C1->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ // More general combining of constants in the wide type.
+ // (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C)
+ Constant *NarrowC;
+ if (match(Op0, m_OneUse(m_SExt(m_NSWAdd(m_Value(X), m_Constant(NarrowC)))))) {
+ Constant *WideC = ConstantExpr::getSExt(NarrowC, Ty);
+ Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
+ Value *WideX = Builder.CreateSExt(X, Ty);
+ return BinaryOperator::CreateAdd(WideX, NewC);
+ }
+ // (zext (X +nuw NarrowC)) + C --> (zext X) + (zext(NarrowC) + C)
+ if (match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_Constant(NarrowC)))))) {
+ Constant *WideC = ConstantExpr::getZExt(NarrowC, Ty);
+ Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
+ Value *WideX = Builder.CreateZExt(X, Ty);
+ return BinaryOperator::CreateAdd(WideX, NewC);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
- Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
- Constant *Op1C;
- if (!match(Op1, m_Constant(Op1C)))
- return nullptr;
-
- if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add))
- return NV;
-
- Value *X;
- Constant *Op00C;
-
- // add (sub C1, X), C2 --> sub (add C1, C2), X
- if (match(Op0, m_Sub(m_Constant(Op00C), m_Value(X))))
- return BinaryOperator::CreateSub(ConstantExpr::getAdd(Op00C, Op1C), X);
-
- Value *Y;
-
- // add (sub X, Y), -1 --> add (not Y), X
- if (match(Op0, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))) &&
- match(Op1, m_AllOnes()))
- return BinaryOperator::CreateAdd(Builder.CreateNot(Y), X);
-
- // zext(bool) + C -> bool ? C + 1 : C
- if (match(Op0, m_ZExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ Constant *Op1C;
+ if (!match(Op1, m_Constant(Op1C)))
+ return nullptr;
+
+ if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add))
+ return NV;
+
+ Value *X;
+ Constant *Op00C;
+
+ // add (sub C1, X), C2 --> sub (add C1, C2), X
+ if (match(Op0, m_Sub(m_Constant(Op00C), m_Value(X))))
+ return BinaryOperator::CreateSub(ConstantExpr::getAdd(Op00C, Op1C), X);
+
+ Value *Y;
+
+ // add (sub X, Y), -1 --> add (not Y), X
+ if (match(Op0, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))) &&
+ match(Op1, m_AllOnes()))
+ return BinaryOperator::CreateAdd(Builder.CreateNot(Y), X);
+
+ // zext(bool) + C -> bool ? C + 1 : C
+ if (match(Op0, m_ZExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1);
- // sext(bool) + C -> bool ? C - 1 : C
- if (match(Op0, m_SExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
+ // sext(bool) + C -> bool ? C - 1 : C
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1);
-
- // ~X + C --> (C-1) - X
- if (match(Op0, m_Not(m_Value(X))))
+
+ // ~X + C --> (C-1) - X
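+ // (e.g. "~x + 10" becomes "9 - x"; editor's illustration)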
+ if (match(Op0, m_Not(m_Value(X))))
return BinaryOperator::CreateSub(InstCombiner::SubOne(Op1C), X);
-
- const APInt *C;
- if (!match(Op1, m_APInt(C)))
- return nullptr;
-
- // (X | C2) + C --> (X | C2) ^ C2 iff (C2 == -C)
- const APInt *C2;
- if (match(Op0, m_Or(m_Value(), m_APInt(C2))) && *C2 == -*C)
- return BinaryOperator::CreateXor(Op0, ConstantInt::get(Add.getType(), *C2));
-
- if (C->isSignMask()) {
- // If wrapping is not allowed, then the addition must set the sign bit:
- // X + (signmask) --> X | signmask
- if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
- return BinaryOperator::CreateOr(Op0, Op1);
-
- // If wrapping is allowed, then the addition flips the sign bit of LHS:
- // X + (signmask) --> X ^ signmask
- return BinaryOperator::CreateXor(Op0, Op1);
- }
-
- // Is this add the last step in a convoluted sext?
- // add(zext(xor i16 X, -32768), -32768) --> sext X
- Type *Ty = Add.getType();
- if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
- C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
- return CastInst::Create(Instruction::SExt, X, Ty);
-
+
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ // (X | C2) + C --> (X | C2) ^ C2 iff (C2 == -C)
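+ // (e.g. "(x | 8) + -8" becomes "(x | 8) ^ 8"; the 'or' guarantees the bits
+ // of C2 are set, so subtracting -C == C2 simply clears them -- editor's note)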
+ const APInt *C2;
+ if (match(Op0, m_Or(m_Value(), m_APInt(C2))) && *C2 == -*C)
+ return BinaryOperator::CreateXor(Op0, ConstantInt::get(Add.getType(), *C2));
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ Type *Ty = Add.getType();
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
if (match(Op0, m_Xor(m_Value(X), m_APInt(C2)))) {
// (X ^ signmask) + C --> (X + (signmask ^ C))
if (C2->isSignMask())
@@ -957,26 +957,26 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
}
}
- if (C->isOneValue() && Op0->hasOneUse()) {
- // add (sext i1 X), 1 --> zext (not X)
- // TODO: The smallest IR representation is (select X, 0, 1), and that would
- // not require the one-use check. But we need to remove a transform in
- // visitSelect and make sure that IR value tracking for select is equal or
- // better than for these ops.
- if (match(Op0, m_SExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
- return new ZExtInst(Builder.CreateNot(X), Ty);
-
- // Shifts and add used to flip and mask off the low bit:
- // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
- const APInt *C3;
- if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) &&
- C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
- Value *NotX = Builder.CreateNot(X);
- return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
- }
- }
-
+ if (C->isOneValue() && Op0->hasOneUse()) {
+ // add (sext i1 X), 1 --> zext (not X)
+ // TODO: The smallest IR representation is (select X, 0, 1), and that would
+ // not require the one-use check. But we need to remove a transform in
+ // visitSelect and make sure that IR value tracking for select is equal or
+ // better than for these ops.
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
+ return new ZExtInst(Builder.CreateNot(X), Ty);
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+ }
+
// If all bits affected by the add are included in a high-bit-mask, do the
// add before the mask op:
// (X & 0xFF00) + xx00 --> (X + xx00) & 0xFF00
@@ -986,261 +986,261 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
return BinaryOperator::CreateAnd(NewAdd, ConstantInt::get(Ty, *C2));
}
- return nullptr;
-}
-
-// Matches multiplication expression Op * C where C is a constant. Returns the
-// constant value in C and the other operand in Op. Returns true if such a
-// match is found.
-static bool MatchMul(Value *E, Value *&Op, APInt &C) {
- const APInt *AI;
- if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) {
- C = APInt(AI->getBitWidth(), 1);
- C <<= *AI;
- return true;
- }
- return false;
-}
-
-// Matches remainder expression Op % C where C is a constant. Returns the
-// constant value in C and the other operand in Op. Returns the signedness of
-// the remainder operation in IsSigned. Returns true if such a match is
-// found.
-static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) {
- const APInt *AI;
- IsSigned = false;
- if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) {
- IsSigned = true;
- C = *AI;
- return true;
- }
- if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) {
- C = *AI + 1;
- return true;
- }
- return false;
-}
-
-// Matches division expression Op / C with the given signedness as indicated
-// by IsSigned, where C is a constant. Returns the constant value in C and the
-// other operand in Op. Returns true if such a match is found.
-static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) {
- const APInt *AI;
- if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (!IsSigned) {
- if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) {
- C = APInt(AI->getBitWidth(), 1);
- C <<= *AI;
- return true;
- }
- }
- return false;
-}
-
-// Returns whether C0 * C1 with the given signedness overflows.
-static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
- bool overflow;
- if (IsSigned)
- (void)C0.smul_ov(C1, overflow);
- else
- (void)C0.umul_ov(C1, overflow);
- return overflow;
-}
-
-// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
-// does not overflow.
+ return nullptr;
+}
+
+// Matches multiplication expression Op * C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns true if such a
+// match is found.
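+// (A left-shift also counts: "shl i32 %x, 3" matches with Op == %x and
+// C == 8 -- editor's note.)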
+static bool MatchMul(Value *E, Value *&Op, APInt &C) {
+ const APInt *AI;
+ if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ return false;
+}
+
+// Matches remainder expression Op % C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns the signedness of
+// the remainder operation in IsSigned. Returns true if such a match is
+// found.
+static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) {
+ const APInt *AI;
+ IsSigned = false;
+ if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) {
+ IsSigned = true;
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) {
+ C = *AI + 1;
+ return true;
+ }
+ return false;
+}
+
+// Matches division expression Op / C with the given signedness as indicated
+// by IsSigned, where C is a constant. Returns the constant value in C and the
+// other operand in Op. Returns true if such a match is found.
+static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) {
+ const APInt *AI;
+ if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (!IsSigned) {
+ if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns whether C0 * C1 with the given signedness overflows.
+static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
+ bool overflow;
+ if (IsSigned)
+ (void)C0.smul_ov(C1, overflow);
+ else
+ (void)C0.umul_ov(C1, overflow);
+ return overflow;
+}
+
+// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
+// does not overflow.
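+// Editor's example (unsigned case): "(x urem 4) + ((x udiv 4) urem 8) * 4"
+// simplifies to "x urem 32", since 4 * 8 == 32 does not overflow.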
Value *InstCombinerImpl::SimplifyAddWithRemainder(BinaryOperator &I) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- Value *X, *MulOpV;
- APInt C0, MulOpC;
- bool IsSigned;
- // Match I = X % C0 + MulOpV * C0
- if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) ||
- (MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) &&
- C0 == MulOpC) {
- Value *RemOpV;
- APInt C1;
- bool Rem2IsSigned;
- // Match MulOpC = RemOpV % C1
- if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) &&
- IsSigned == Rem2IsSigned) {
- Value *DivOpV;
- APInt DivOpC;
- // Match RemOpV = X / C0
- if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
- C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
- Value *NewDivisor = ConstantInt::get(X->getType(), C0 * C1);
- return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
- : Builder.CreateURem(X, NewDivisor, "urem");
- }
- }
- }
-
- return nullptr;
-}
-
-/// Fold
-/// (1 << NBits) - 1
-/// Into:
-/// ~(-(1 << NBits))
-/// Because a 'not' is better for bit-tracking analysis and other transforms
-/// than an 'add'. The new shl is always nsw, and is nuw if the old `add` was.
-static Instruction *canonicalizeLowbitMask(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *NBits;
- if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes())))
- return nullptr;
-
- Constant *MinusOne = Constant::getAllOnesValue(NBits->getType());
- Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask");
- // Be wary of constant folding.
- if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) {
- // Always NSW. But NUW propagates from `add`.
- BOp->setHasNoSignedWrap();
- BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- }
-
- return BinaryOperator::CreateNot(NotMask, I.getName());
-}
-
-static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) {
- assert(I.getOpcode() == Instruction::Add && "Expecting add instruction");
- Type *Ty = I.getType();
- auto getUAddSat = [&]() {
- return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty);
- };
-
- // add (umin X, ~Y), Y --> uaddsat X, Y
- Value *X, *Y;
- if (match(&I, m_c_Add(m_c_UMin(m_Value(X), m_Not(m_Value(Y))),
- m_Deferred(Y))))
- return CallInst::Create(getUAddSat(), { X, Y });
-
- // add (umin X, ~C), C --> uaddsat X, C
- const APInt *C, *NotC;
- if (match(&I, m_Add(m_UMin(m_Value(X), m_APInt(NotC)), m_APInt(C))) &&
- *C == ~*NotC)
- return CallInst::Create(getUAddSat(), { X, ConstantInt::get(Ty, *C) });
-
- return nullptr;
-}
-
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Value *X, *MulOpV;
+ APInt C0, MulOpC;
+ bool IsSigned;
+ // Match I = X % C0 + MulOpV * C0
+ if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) ||
+ (MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) &&
+ C0 == MulOpC) {
+ Value *RemOpV;
+ APInt C1;
+ bool Rem2IsSigned;
+ // Match MulOpC = RemOpV % C1
+ if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) &&
+ IsSigned == Rem2IsSigned) {
+ Value *DivOpV;
+ APInt DivOpC;
+ // Match RemOpV = X / C0
+ if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
+ C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
+ Value *NewDivisor = ConstantInt::get(X->getType(), C0 * C1);
+ return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
+ : Builder.CreateURem(X, NewDivisor, "urem");
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold
+/// (1 << NBits) - 1
+/// Into:
+/// ~(-(1 << NBits))
+/// Because a 'not' is better for bit-tracking analysis and other transforms
+/// than an 'add'. The new shl is always nsw, and is nuw if the old `add` was.
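+/// Editor's illustration: "add (shl i32 1, %n), -1" becomes
+/// "xor (shl i32 -1, %n), -1", i.e. ~(-1 << %n), which equals the same
+/// low-bit mask (1 << %n) - 1.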
+static Instruction *canonicalizeLowbitMask(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *NBits;
+ if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes())))
+ return nullptr;
+
+ Constant *MinusOne = Constant::getAllOnesValue(NBits->getType());
+ Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask");
+ // Be wary of constant folding.
+ if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) {
+ // Always NSW. But NUW propagates from `add`.
+ BOp->setHasNoSignedWrap();
+ BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ }
+
+ return BinaryOperator::CreateNot(NotMask, I.getName());
+}
+
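+// Editor's note on the fold below: "add (umin X, ~Y), Y" cannot overflow,
+// because umin(X, ~Y) <= ~Y == UINT_MAX - Y. When X <= ~Y the sum is exactly
+// X + Y; otherwise it is (UINT_MAX - Y) + Y == UINT_MAX, which matches the
+// semantics of llvm.uadd.sat(X, Y).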
+static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) {
+ assert(I.getOpcode() == Instruction::Add && "Expecting add instruction");
+ Type *Ty = I.getType();
+ auto getUAddSat = [&]() {
+ return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty);
+ };
+
+ // add (umin X, ~Y), Y --> uaddsat X, Y
+ Value *X, *Y;
+ if (match(&I, m_c_Add(m_c_UMin(m_Value(X), m_Not(m_Value(Y))),
+ m_Deferred(Y))))
+ return CallInst::Create(getUAddSat(), { X, Y });
+
+ // add (umin X, ~C), C --> uaddsat X, C
+ const APInt *C, *NotC;
+ if (match(&I, m_Add(m_UMin(m_Value(X), m_APInt(NotC)), m_APInt(C))) &&
+ *C == ~*NotC)
+ return CallInst::Create(getUAddSat(), { X, ConstantInt::get(Ty, *C) });
+
+ return nullptr;
+}
+
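+// Editor's sketch of the (add/or, untruncated) shape handled below:
+//   %skip = sub i32 32, %nbits
+//   %hi   = lshr i32 %x, %skip
+//   %neg  = icmp slt i32 %x, 0
+//   %ones = shl i32 -1, %nbits
+//   %sign = select i1 %neg, i32 %ones, i32 0
+//   %r    = or i32 %hi, %sign
+// is rewritten to "%r = ashr i32 %x, %skip": the arithmetic shift both
+// extracts the high %nbits bits and sign-extends them.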
Instruction *InstCombinerImpl::
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
BinaryOperator &I) {
- assert((I.getOpcode() == Instruction::Add ||
- I.getOpcode() == Instruction::Or ||
- I.getOpcode() == Instruction::Sub) &&
- "Expecting add/or/sub instruction");
-
- // We have a subtraction/addition between a (potentially truncated) *logical*
- // right-shift of X and a "select".
- Value *X, *Select;
- Instruction *LowBitsToSkip, *Extract;
- if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd(
- m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)),
- m_Instruction(Extract))),
- m_Value(Select))))
- return nullptr;
-
- // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS.
- if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select)
- return nullptr;
-
- Type *XTy = X->getType();
- bool HadTrunc = I.getType() != XTy;
-
- // If there was a truncation of extracted value, then we'll need to produce
- // one extra instruction, so we need to ensure one instruction will go away.
- if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
- return nullptr;
-
- // Extraction should extract high NBits bits, with shift amount calculated as:
- // low bits to skip = shift bitwidth - high bits to extract
- // The shift amount itself may be extended, and we need to look past zero-ext
- // when matching NBits, that will matter for matching later.
- Constant *C;
- Value *NBits;
- if (!match(
- LowBitsToSkip,
- m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) ||
- !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(C->getType()->getScalarSizeInBits(),
- X->getType()->getScalarSizeInBits()))))
- return nullptr;
-
- // Sign-extending value can be zero-extended if we `sub`tract it,
- // or sign-extended otherwise.
- auto SkipExtInMagic = [&I](Value *&V) {
- if (I.getOpcode() == Instruction::Sub)
- match(V, m_ZExtOrSelf(m_Value(V)));
- else
- match(V, m_SExtOrSelf(m_Value(V)));
- };
-
- // Now, finally validate the sign-extending magic.
- // `select` itself may be appropriately extended, look past that.
- SkipExtInMagic(Select);
-
- ICmpInst::Predicate Pred;
- const APInt *Thr;
- Value *SignExtendingValue, *Zero;
- bool ShouldSignext;
- // It must be a select between two values we will later establish to be a
- // sign-extending value and a zero constant. The condition guarding the
- // sign-extension must be based on a sign bit of the same X we had in `lshr`.
- if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)),
- m_Value(SignExtendingValue), m_Value(Zero))) ||
- !isSignBitCheck(Pred, *Thr, ShouldSignext))
- return nullptr;
-
- // icmp-select pair is commutative.
- if (!ShouldSignext)
- std::swap(SignExtendingValue, Zero);
-
- // If we should not perform sign-extension then we must add/or/subtract zero.
- if (!match(Zero, m_Zero()))
- return nullptr;
- // Otherwise, it should be some constant, left-shifted by the same NBits we
- // had in `lshr`. Said left-shift can also be appropriately extended.
- // Again, we must look past zero-ext when looking for NBits.
- SkipExtInMagic(SignExtendingValue);
- Constant *SignExtendingValueBaseConstant;
- if (!match(SignExtendingValue,
- m_Shl(m_Constant(SignExtendingValueBaseConstant),
- m_ZExtOrSelf(m_Specific(NBits)))))
- return nullptr;
- // If we `sub`, then the constant should be one, else it should be all-ones.
- if (I.getOpcode() == Instruction::Sub
- ? !match(SignExtendingValueBaseConstant, m_One())
- : !match(SignExtendingValueBaseConstant, m_AllOnes()))
- return nullptr;
-
- auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip,
- Extract->getName() + ".sext");
- NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness.
- if (!HadTrunc)
- return NewAShr;
-
- Builder.Insert(NewAShr);
- return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType());
-}
-
+ assert((I.getOpcode() == Instruction::Add ||
+ I.getOpcode() == Instruction::Or ||
+ I.getOpcode() == Instruction::Sub) &&
+ "Expecting add/or/sub instruction");
+
+ // We have a subtraction/addition between a (potentially truncated) *logical*
+ // right-shift of X and a "select".
+ Value *X, *Select;
+ Instruction *LowBitsToSkip, *Extract;
+ if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd(
+ m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)),
+ m_Instruction(Extract))),
+ m_Value(Select))))
+ return nullptr;
+
+ // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS.
+ if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select)
+ return nullptr;
+
+ Type *XTy = X->getType();
+ bool HadTrunc = I.getType() != XTy;
+
+ // If there was a truncation of extracted value, then we'll need to produce
+ // one extra instruction, so we need to ensure one instruction will go away.
+ if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+ return nullptr;
+
+ // Extraction should extract high NBits bits, with shift amount calculated as:
+ // low bits to skip = shift bitwidth - high bits to extract
+ // The shift amount itself may be extended, and we need to look past zero-ext
+ // when matching NBits, that will matter for matching later.
+ Constant *C;
+ Value *NBits;
+ if (!match(
+ LowBitsToSkip,
+ m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) ||
+ !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(C->getType()->getScalarSizeInBits(),
+ X->getType()->getScalarSizeInBits()))))
+ return nullptr;
+
+ // Sign-extending value can be zero-extended if we `sub`tract it,
+ // or sign-extended otherwise.
+ auto SkipExtInMagic = [&I](Value *&V) {
+ if (I.getOpcode() == Instruction::Sub)
+ match(V, m_ZExtOrSelf(m_Value(V)));
+ else
+ match(V, m_SExtOrSelf(m_Value(V)));
+ };
+
+ // Now, finally validate the sign-extending magic.
+ // `select` itself may be appropriately extended, look past that.
+ SkipExtInMagic(Select);
+
+ ICmpInst::Predicate Pred;
+ const APInt *Thr;
+ Value *SignExtendingValue, *Zero;
+ bool ShouldSignext;
+ // It must be a select between two values we will later establish to be a
+ // sign-extending value and a zero constant. The condition guarding the
+ // sign-extension must be based on a sign bit of the same X we had in `lshr`.
+ if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)),
+ m_Value(SignExtendingValue), m_Value(Zero))) ||
+ !isSignBitCheck(Pred, *Thr, ShouldSignext))
+ return nullptr;
+
+ // icmp-select pair is commutative.
+ if (!ShouldSignext)
+ std::swap(SignExtendingValue, Zero);
+
+ // If we should not perform sign-extension then we must add/or/subtract zero.
+ if (!match(Zero, m_Zero()))
+ return nullptr;
+ // Otherwise, it should be some constant, left-shifted by the same NBits we
+ // had in `lshr`. Said left-shift can also be appropriately extended.
+ // Again, we must look past zero-ext when looking for NBits.
+ SkipExtInMagic(SignExtendingValue);
+ Constant *SignExtendingValueBaseConstant;
+ if (!match(SignExtendingValue,
+ m_Shl(m_Constant(SignExtendingValueBaseConstant),
+ m_ZExtOrSelf(m_Specific(NBits)))))
+ return nullptr;
+ // If we `sub`, then the constant should be one, else it should be all-ones.
+ if (I.getOpcode() == Instruction::Sub
+ ? !match(SignExtendingValueBaseConstant, m_One())
+ : !match(SignExtendingValueBaseConstant, m_AllOnes()))
+ return nullptr;
+
+ auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip,
+ Extract->getName() + ".sext");
+ NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness.
+ if (!HadTrunc)
+ return NewAShr;
+
+ Builder.Insert(NewAShr);
+ return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType());
+}
+
/// This is a specialization of a more general transform from
/// SimplifyUsingDistributiveLaws. If that code can be made to work optimally
/// for multi-use cases or propagating nsw/nuw, then we would not need this.
@@ -1279,161 +1279,161 @@ static Instruction *factorizeMathWithShlOps(BinaryOperator &I,
}
Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
- if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
- I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // (A*B)+(A*C) -> A*(B+C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
+ if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // (A*B)+(A*C) -> A*(B+C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
if (Instruction *R = factorizeMathWithShlOps(I, Builder))
return R;
- if (Instruction *X = foldAddWithConstant(I))
- return X;
-
- if (Instruction *X = foldNoWrapAdd(I, Builder))
- return X;
-
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- Type *Ty = I.getType();
- if (Ty->isIntOrIntVectorTy(1))
- return BinaryOperator::CreateXor(LHS, RHS);
-
- // X + X --> X << 1
- if (LHS == RHS) {
- auto *Shl = BinaryOperator::CreateShl(LHS, ConstantInt::get(Ty, 1));
- Shl->setHasNoSignedWrap(I.hasNoSignedWrap());
- Shl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- return Shl;
- }
-
- Value *A, *B;
- if (match(LHS, m_Neg(m_Value(A)))) {
- // -A + -B --> -(A + B)
- if (match(RHS, m_Neg(m_Value(B))))
- return BinaryOperator::CreateNeg(Builder.CreateAdd(A, B));
-
- // -A + B --> B - A
- return BinaryOperator::CreateSub(RHS, A);
- }
-
- // A + -B --> A - B
- if (match(RHS, m_Neg(m_Value(B))))
- return BinaryOperator::CreateSub(LHS, B);
-
- if (Value *V = checkForNegativeOperand(I, Builder))
- return replaceInstUsesWith(I, V);
-
- // (A + 1) + ~B --> A - B
- // ~B + (A + 1) --> A - B
- // (~B + A) + 1 --> A - B
- // (A + ~B) + 1 --> A - B
- if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))) ||
- match(&I, m_BinOp(m_c_Add(m_Not(m_Value(B)), m_Value(A)), m_One())))
- return BinaryOperator::CreateSub(A, B);
-
- // (A + RHS) + RHS --> A + (RHS << 1)
- if (match(LHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(RHS)))))
- return BinaryOperator::CreateAdd(A, Builder.CreateShl(RHS, 1, "reass.add"));
-
- // LHS + (A + LHS) --> A + (LHS << 1)
- if (match(RHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(LHS)))))
- return BinaryOperator::CreateAdd(A, Builder.CreateShl(LHS, 1, "reass.add"));
-
- // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
- if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
-
- // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2
- const APInt *C1, *C2;
- if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) {
- APInt one(C2->getBitWidth(), 1);
- APInt minusC1 = -(*C1);
- if (minusC1 == (one << *C2)) {
- Constant *NewRHS = ConstantInt::get(RHS->getType(), minusC1);
- return BinaryOperator::CreateSRem(RHS, NewRHS);
- }
- }
-
- // A+B --> A|B iff A and B have no bits set in common.
- if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
- return BinaryOperator::CreateOr(LHS, RHS);
-
- // add (select X 0 (sub n A)) A --> select X A n
- {
- SelectInst *SI = dyn_cast<SelectInst>(LHS);
- Value *A = RHS;
- if (!SI) {
- SI = dyn_cast<SelectInst>(RHS);
- A = LHS;
- }
- if (SI && SI->hasOneUse()) {
- Value *TV = SI->getTrueValue();
- Value *FV = SI->getFalseValue();
- Value *N;
-
- // Can we fold the add into the argument of the select?
- // We check both true and false select arguments for a matching subtract.
- if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
- // Fold the add into the true select value.
- return SelectInst::Create(SI->getCondition(), N, A);
-
- if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
- // Fold the add into the false select value.
- return SelectInst::Create(SI->getCondition(), A, N);
- }
- }
-
- if (Instruction *Ext = narrowMathIfNoOverflow(I))
- return Ext;
-
- // (add (xor A, B) (and A, B)) --> (or A, B)
- // (add (and A, B) (xor A, B)) --> (or A, B)
- if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
- m_c_And(m_Deferred(A), m_Deferred(B)))))
- return BinaryOperator::CreateOr(A, B);
-
- // (add (or A, B) (and A, B)) --> (add A, B)
- // (add (and A, B) (or A, B)) --> (add A, B)
- if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
- m_c_And(m_Deferred(A), m_Deferred(B))))) {
- // Replacing operands in-place to preserve nuw/nsw flags.
- replaceOperand(I, 0, A);
- replaceOperand(I, 1, B);
- return &I;
- }
-
- // TODO(jingyue): Consider willNotOverflowSignedAdd and
- // willNotOverflowUnsignedAdd to reduce the number of invocations of
- // computeKnownBits.
- bool Changed = false;
- if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
- Changed = true;
- I.setHasNoSignedWrap(true);
- }
- if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
- Changed = true;
- I.setHasNoUnsignedWrap(true);
- }
-
- if (Instruction *V = canonicalizeLowbitMask(I, Builder))
- return V;
-
- if (Instruction *V =
- canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
- return V;
-
- if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I))
- return SatAdd;
-
+ if (Instruction *X = foldAddWithConstant(I))
+ return X;
+
+ if (Instruction *X = foldNoWrapAdd(I, Builder))
+ return X;
+
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Type *Ty = I.getType();
+ if (Ty->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateXor(LHS, RHS);
+
+ // X + X --> X << 1
+ if (LHS == RHS) {
+ auto *Shl = BinaryOperator::CreateShl(LHS, ConstantInt::get(Ty, 1));
+ Shl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ Shl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ return Shl;
+ }
+
+ Value *A, *B;
+ if (match(LHS, m_Neg(m_Value(A)))) {
+ // -A + -B --> -(A + B)
+ if (match(RHS, m_Neg(m_Value(B))))
+ return BinaryOperator::CreateNeg(Builder.CreateAdd(A, B));
+
+ // -A + B --> B - A
+ return BinaryOperator::CreateSub(RHS, A);
+ }
+
+ // A + -B --> A - B
+ if (match(RHS, m_Neg(m_Value(B))))
+ return BinaryOperator::CreateSub(LHS, B);
+
+ if (Value *V = checkForNegativeOperand(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ // (A + 1) + ~B --> A - B
+ // ~B + (A + 1) --> A - B
+ // (~B + A) + 1 --> A - B
+ // (A + ~B) + 1 --> A - B
+ if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))) ||
+ match(&I, m_BinOp(m_c_Add(m_Not(m_Value(B)), m_Value(A)), m_One())))
+ return BinaryOperator::CreateSub(A, B);
+
+ // (A + RHS) + RHS --> A + (RHS << 1)
+ if (match(LHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(RHS)))))
+ return BinaryOperator::CreateAdd(A, Builder.CreateShl(RHS, 1, "reass.add"));
+
+ // LHS + (A + LHS) --> A + (LHS << 1)
+ if (match(RHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(LHS)))))
+ return BinaryOperator::CreateAdd(A, Builder.CreateShl(LHS, 1, "reass.add"));
+
+ // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
+ if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
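+
+ // A quick numeric check of the remainder recombination above (illustrative
+ // only; the constants are hypothetical): with X = 137, C0 = 10, C1 = 5,
+ // 137 % 10 + ((137 / 10) % 5) * 10 == 7 + 3 * 10 == 37 == 137 % 50.
+ static_assert(137 % 10 + ((137 / 10) % 5) * 10 == 137 % (10 * 5),
+ "X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1) for X=137, C0=10, C1=5");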
+
+ // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2
+ const APInt *C1, *C2;
+ if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) {
+ APInt one(C2->getBitWidth(), 1);
+ APInt minusC1 = -(*C1);
+ if (minusC1 == (one << *C2)) {
+ Constant *NewRHS = ConstantInt::get(RHS->getType(), minusC1);
+ return BinaryOperator::CreateSRem(RHS, NewRHS);
+ }
+ }
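+
+ // Concrete instance of the srem fold above (illustrative, hand-picked
+ // values): with C1 = -4 and C2 = 2 we have -C1 == 1 << C2 == 4, and
+ // ((X s/ -4) << 2) + X equals X s% 4 for either sign of X. The << 2 is
+ // spelled * 4 below to keep the check in plain, well-defined C++.
+ static_assert((11 / -4) * 4 + 11 == 11 % 4 &&
+ (-11 / -4) * 4 + (-11) == (-11) % 4,
+ "((X s/ C1) << C2) + X == X s% -C1 when -C1 == 1 << C2");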
+
+ // A+B --> A|B iff A and B have no bits set in common.
+ if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
+ return BinaryOperator::CreateOr(LHS, RHS);
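+
+ // For instance (illustrative constants): disjoint bit masks cannot produce
+ // a carry, so their sum and their bitwise or coincide.
+ static_assert((0xF0u + 0x0Fu) == (0xF0u | 0x0Fu),
+ "add equals or when no bits are set in common");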
+
+ // add (select X 0 (sub n A)) A --> select X A n
+ {
+ SelectInst *SI = dyn_cast<SelectInst>(LHS);
+ Value *A = RHS;
+ if (!SI) {
+ SI = dyn_cast<SelectInst>(RHS);
+ A = LHS;
+ }
+ if (SI && SI->hasOneUse()) {
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ Value *N;
+
+ // Can we fold the add into the argument of the select?
+ // We check both true and false select arguments for a matching subtract.
+ if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the true select value.
+ return SelectInst::Create(SI->getCondition(), N, A);
+
+ if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the false select value.
+ return SelectInst::Create(SI->getCondition(), A, N);
+ }
+ }
+
+ if (Instruction *Ext = narrowMathIfNoOverflow(I))
+ return Ext;
+
+ // (add (xor A, B) (and A, B)) --> (or A, B)
+ // (add (and A, B) (xor A, B)) --> (or A, B)
+ if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B)))))
+ return BinaryOperator::CreateOr(A, B);
+
+ // (add (or A, B) (and A, B)) --> (add A, B)
+ // (add (and A, B) (or A, B)) --> (add A, B)
+ if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B))))) {
+ // Replacing operands in-place to preserve nuw/nsw flags.
+ replaceOperand(I, 0, A);
+ replaceOperand(I, 1, B);
+ return &I;
+ }
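+
+ // Both identities can be spot-checked with hypothetical constants A = 12,
+ // B = 10: xor keeps the bits set in exactly one operand, 'and' keeps the
+ // bits set in both, and 'or' keeps the bits set in at least one, so
+ // (A ^ B) + (A & B) == (A | B) and (A | B) + (A & B) == A + B.
+ static_assert((12 ^ 10) + (12 & 10) == (12 | 10) &&
+ (12 | 10) + (12 & 10) == 12 + 10,
+ "xor/and and or/and sums recombine into or and add");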
+
+ // TODO(jingyue): Consider willNotOverflowSignedAdd and
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
+ // computeKnownBits.
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ if (Instruction *V = canonicalizeLowbitMask(I, Builder))
+ return V;
+
+ if (Instruction *V =
+ canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
+ return V;
+
+ if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I))
+ return SatAdd;
+
// usub.sat(A, B) + B => umax(A, B)
if (match(&I, m_c_BinOp(
m_OneUse(m_Intrinsic<Intrinsic::usub_sat>(m_Value(A), m_Value(B))),
@@ -1442,286 +1442,286 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
Builder.CreateIntrinsic(Intrinsic::umax, {I.getType()}, {A, B}));
}
- return Changed ? &I : nullptr;
-}
-
-/// Eliminate an op from a linear interpolation (lerp) pattern.
-static Instruction *factorizeLerp(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *X, *Y, *Z;
- if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y),
- m_OneUse(m_FSub(m_FPOne(),
- m_Value(Z))))),
- m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z))))))
- return nullptr;
-
- // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants]
- Value *XY = Builder.CreateFSubFMF(X, Y, &I);
- Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I);
- return BinaryOperator::CreateFAddFMF(Y, MulZ, &I);
-}
-
-/// Factor a common operand out of fadd/fsub of fmul/fdiv.
-static Instruction *factorizeFAddFSub(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert((I.getOpcode() == Instruction::FAdd ||
- I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub");
- assert(I.hasAllowReassoc() && I.hasNoSignedZeros() &&
- "FP factorization requires FMF");
-
- if (Instruction *Lerp = factorizeLerp(I, Builder))
- return Lerp;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *X, *Y, *Z;
- bool IsFMul;
- if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) ||
- (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))))
- IsFMul = true;
- else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z)))))
- IsFMul = false;
- else
- return nullptr;
-
- // (X * Z) + (Y * Z) --> (X + Y) * Z
- // (X * Z) - (Y * Z) --> (X - Y) * Z
- // (X / Z) + (Y / Z) --> (X + Y) / Z
- // (X / Z) - (Y / Z) --> (X - Y) / Z
- bool IsFAdd = I.getOpcode() == Instruction::FAdd;
- Value *XY = IsFAdd ? Builder.CreateFAddFMF(X, Y, &I)
- : Builder.CreateFSubFMF(X, Y, &I);
-
- // Bail out if we just created a denormal constant.
- // TODO: This is copied from a previous implementation. Is it necessary?
- const APFloat *C;
- if (match(XY, m_APFloat(C)) && !C->isNormal())
- return nullptr;
-
- return IsFMul ? BinaryOperator::CreateFMulFMF(XY, Z, &I)
- : BinaryOperator::CreateFDivFMF(XY, Z, &I);
-}
-
+ return Changed ? &I : nullptr;
+}
+
+/// Eliminate an op from a linear interpolation (lerp) pattern.
+static Instruction *factorizeLerp(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X, *Y, *Z;
+ if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y),
+ m_OneUse(m_FSub(m_FPOne(),
+ m_Value(Z))))),
+ m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z))))))
+ return nullptr;
+
+ // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants]
+ Value *XY = Builder.CreateFSubFMF(X, Y, &I);
+ Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I);
+ return BinaryOperator::CreateFAddFMF(Y, MulZ, &I);
+}
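+
+// A numeric sanity check of the lerp refactoring (illustrative values, all
+// exactly representable, so the reassociated form is bit-identical here; in
+// general the fold leans on the reassoc/nsz fast-math flags):
+// Y*(1-Z) + X*Z == Y + Z*(X-Y) with Y=2, X=10, Z=0.25 gives 4.0 on both sides.
+static_assert(2.0 * (1.0 - 0.25) + 10.0 * 0.25 == 2.0 + 0.25 * (10.0 - 2.0),
+ "lerp forms agree for exactly representable constants");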
+
+/// Factor a common operand out of fadd/fsub of fmul/fdiv.
+static Instruction *factorizeFAddFSub(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert((I.getOpcode() == Instruction::FAdd ||
+ I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub");
+ assert(I.hasAllowReassoc() && I.hasNoSignedZeros() &&
+ "FP factorization requires FMF");
+
+ if (Instruction *Lerp = factorizeLerp(I, Builder))
+ return Lerp;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X, *Y, *Z;
+ bool IsFMul;
+ if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) &&
+ match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) ||
+ (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) &&
+ match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))))
+ IsFMul = true;
+ else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) &&
+ match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z)))))
+ IsFMul = false;
+ else
+ return nullptr;
+
+ // (X * Z) + (Y * Z) --> (X + Y) * Z
+ // (X * Z) - (Y * Z) --> (X - Y) * Z
+ // (X / Z) + (Y / Z) --> (X + Y) / Z
+ // (X / Z) - (Y / Z) --> (X - Y) / Z
+ bool IsFAdd = I.getOpcode() == Instruction::FAdd;
+ Value *XY = IsFAdd ? Builder.CreateFAddFMF(X, Y, &I)
+ : Builder.CreateFSubFMF(X, Y, &I);
+
+ // Bail out if we just created a denormal constant.
+ // TODO: This is copied from a previous implementation. Is it necessary?
+ const APFloat *C;
+ if (match(XY, m_APFloat(C)) && !C->isNormal())
+ return nullptr;
+
+ return IsFMul ? BinaryOperator::CreateFMulFMF(XY, Z, &I)
+ : BinaryOperator::CreateFDivFMF(XY, Z, &I);
+}
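+
+// Example of the factorization with hypothetical, exactly representable
+// constants (the real transform is guarded by the reassoc/nsz checks above):
+// (3*2) + (5*2) == (3+5)*2 and (3/2) + (5/2) == (3+5)/2.
+static_assert(3.0 * 2.0 + 5.0 * 2.0 == (3.0 + 5.0) * 2.0 &&
+ 3.0 / 2.0 + 5.0 / 2.0 == (3.0 + 5.0) / 2.0,
+ "factoring the common fmul/fdiv operand out of an fadd");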
+
Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
- if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
- return FoldedFAdd;
-
- // (-X) + Y --> Y - X
- Value *X, *Y;
- if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y))))
- return BinaryOperator::CreateFSubFMF(Y, X, &I);
-
- // Similar to above, but look through fmul/fdiv for the negated term.
- // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants]
- Value *Z;
- if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))),
- m_Value(Z)))) {
- Value *XY = Builder.CreateFMulFMF(X, Y, &I);
- return BinaryOperator::CreateFSubFMF(Z, XY, &I);
- }
- // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants]
- // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants]
- if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))),
- m_Value(Z))) ||
- match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))),
- m_Value(Z)))) {
- Value *XY = Builder.CreateFDivFMF(X, Y, &I);
- return BinaryOperator::CreateFSubFMF(Z, XY, &I);
- }
-
- // Check for (fadd double (sitofp x), y), see if we can merge this into an
- // integer add followed by a promotion.
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
- Value *LHSIntVal = LHSConv->getOperand(0);
- Type *FPType = LHSConv->getType();
-
- // TODO: This check is overly conservative. In many cases known bits
- // analysis can tell us that the result of the addition has less significant
- // bits than the integer type can hold.
- auto IsValidPromotion = [](Type *FTy, Type *ITy) {
- Type *FScalarTy = FTy->getScalarType();
- Type *IScalarTy = ITy->getScalarType();
-
- // Do we have enough bits in the significand to represent the result of
- // the integer addition?
- unsigned MaxRepresentableBits =
- APFloat::semanticsPrecision(FScalarTy->getFltSemantics());
- return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits;
- };
-
- // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
- // ... if the constant fits in the integer value. This is useful for things
- // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
- // requires a constant pool load, and generally allows the add to be better
- // instcombined.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
- if (IsValidPromotion(FPType, LHSIntVal->getType())) {
- Constant *CI =
- ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
- if (LHSConv->hasOneUse() &&
- ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
- willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
- // Insert the new integer add.
- Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");
- return new SIToFPInst(NewAdd, I.getType());
- }
- }
-
- // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
- if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
- Value *RHSIntVal = RHSConv->getOperand(0);
- // It's enough to check LHS types only because we require int types to
- // be the same for this transform.
- if (IsValidPromotion(FPType, LHSIntVal->getType())) {
- // Only do this if x/y have the same type, if at least one of them has a
- // single use (so we don't increase the number of int->fp conversions),
- // and if the integer add will not overflow.
- if (LHSIntVal->getType() == RHSIntVal->getType() &&
- (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
- // Insert the new integer add.
- Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv");
- return new SIToFPInst(NewAdd, I.getType());
- }
- }
- }
- }
-
- // Handle specials cases for FAdd with selects feeding the operation
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
- return replaceInstUsesWith(I, V);
-
- if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
- if (Instruction *F = factorizeFAddFSub(I, Builder))
- return F;
- if (Value *V = FAddCombine(Builder).simplify(&I))
- return replaceInstUsesWith(I, V);
- }
-
- return nullptr;
-}
-
-/// Optimize pointer differences into the same array into a size. Consider:
-/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
-/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
+ if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
+ return FoldedFAdd;
+
+ // (-X) + Y --> Y - X
+ Value *X, *Y;
+ if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateFSubFMF(Y, X, &I);
+
+ // Similar to above, but look through fmul/fdiv for the negated term.
+ // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants]
+ Value *Z;
+ if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))),
+ m_Value(Z)))) {
+ Value *XY = Builder.CreateFMulFMF(X, Y, &I);
+ return BinaryOperator::CreateFSubFMF(Z, XY, &I);
+ }
+ // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants]
+ // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants]
+ if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))),
+ m_Value(Z))) ||
+ match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))),
+ m_Value(Z)))) {
+ Value *XY = Builder.CreateFDivFMF(X, Y, &I);
+ return BinaryOperator::CreateFSubFMF(Z, XY, &I);
+ }
+
+ // Check for (fadd double (sitofp x), y) and see if we can merge this into an
+ // integer add followed by a promotion.
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+ Value *LHSIntVal = LHSConv->getOperand(0);
+ Type *FPType = LHSConv->getType();
+
+ // TODO: This check is overly conservative. In many cases known bits
+ // analysis can tell us that the result of the addition has fewer significant
+ // bits than the integer type can hold.
+ auto IsValidPromotion = [](Type *FTy, Type *ITy) {
+ Type *FScalarTy = FTy->getScalarType();
+ Type *IScalarTy = ITy->getScalarType();
+
+ // Do we have enough bits in the significand to represent the result of
+ // the integer addition?
+ unsigned MaxRepresentableBits =
+ APFloat::semanticsPrecision(FScalarTy->getFltSemantics());
+ return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits;
+ };
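+
+ // For example (assuming IEEE semantics; the numbers are for illustration
+ // only): a double carries a 53-bit significand, so promoting an i32 add
+ // (32 <= 53) is exact, while an i64 add (64 > 53) could drop low bits and
+ // is rejected.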
+
+ // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
+ // ... if the constant fits in the integer value. This is useful for things
+ // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
+ // requires a constant pool load, and generally allows the add to be better
+ // instcombined.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ Constant *CI =
+ ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
+ if (LHSConv->hasOneUse() &&
+ ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+ willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+
+ // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
+ if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
+ Value *RHSIntVal = RHSConv->getOperand(0);
+ // It's enough to check LHS types only because we require int types to
+ // be the same for this transform.
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of int->fp conversions),
+ // and if the integer add will not overflow.
+ if (LHSIntVal->getType() == RHSIntVal->getType() &&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+ }
+ }
+
+ // Handle special cases for FAdd with selects feeding the operation
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
+ return replaceInstUsesWith(I, V);
+
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
+ if (Instruction *F = factorizeFAddFSub(I, Builder))
+ return F;
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return replaceInstUsesWith(I, V);
+ }
+
+ return nullptr;
+}
+
+/// Optimize pointer differences within the same array into a size. Consider:
+/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
+/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
Type *Ty, bool IsNUW) {
- // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
- // this.
- bool Swapped = false;
- GEPOperator *GEP1 = nullptr, *GEP2 = nullptr;
+ // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
+ // this.
+ bool Swapped = false;
+ GEPOperator *GEP1 = nullptr, *GEP2 = nullptr;
if (!isa<GEPOperator>(LHS) && isa<GEPOperator>(RHS)) {
std::swap(LHS, RHS);
Swapped = true;
}
-
+
// Require at least one GEP with a common base pointer on both sides.
if (auto *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
- // (gep X, ...) - X
- if (LHSGEP->getOperand(0) == RHS) {
- GEP1 = LHSGEP;
+ // (gep X, ...) - X
+ if (LHSGEP->getOperand(0) == RHS) {
+ GEP1 = LHSGEP;
} else if (auto *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
- // (gep X, ...) - (gep X, ...)
- if (LHSGEP->getOperand(0)->stripPointerCasts() ==
+ // (gep X, ...) - (gep X, ...)
+ if (LHSGEP->getOperand(0)->stripPointerCasts() ==
RHSGEP->getOperand(0)->stripPointerCasts()) {
GEP1 = LHSGEP;
- GEP2 = RHSGEP;
- }
- }
- }
-
- if (!GEP1)
- return nullptr;
-
- if (GEP2) {
- // (gep X, ...) - (gep X, ...)
- //
- // Avoid duplicating the arithmetic if there are more than one non-constant
- // indices between the two GEPs and either GEP has a non-constant index and
- // multiple users. If zero non-constant index, the result is a constant and
- // there is no duplication. If one non-constant index, the result is an add
- // or sub with a constant, which is no larger than the original code, and
- // there's no duplicated arithmetic, even if either GEP has multiple
- // users. If more than one non-constant indices combined, as long as the GEP
- // with at least one non-constant index doesn't have multiple users, there
- // is no duplication.
- unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
- unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
- if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
- ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
- (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
- return nullptr;
- }
- }
-
- // Emit the offset of the GEP and an intptr_t.
- Value *Result = EmitGEPOffset(GEP1);
-
- // If this is a single inbounds GEP and the original sub was nuw,
+ GEP2 = RHSGEP;
+ }
+ }
+ }
+
+ if (!GEP1)
+ return nullptr;
+
+ if (GEP2) {
+ // (gep X, ...) - (gep X, ...)
+ //
+ // Avoid duplicating the arithmetic if there is more than one non-constant
+ // index between the two GEPs and either GEP has a non-constant index and
+ // multiple users. With zero non-constant indices, the result is a constant
+ // and there is no duplication. With one non-constant index, the result is
+ // an add or sub with a constant, which is no larger than the original code,
+ // and there's no duplicated arithmetic, even if either GEP has multiple
+ // users. With more than one non-constant index, there is still no
+ // duplication as long as the GEP with at least one non-constant index
+ // doesn't have multiple users.
+ unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
+ unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
+ if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
+ ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
+ (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
+ return nullptr;
+ }
+ }
+
+ // Emit the offset of the GEP and an intptr_t.
+ Value *Result = EmitGEPOffset(GEP1);
+
+ // If this is a single inbounds GEP and the original sub was nuw,
// then the final multiplication is also nuw.
if (auto *I = dyn_cast<Instruction>(Result))
if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() &&
I->getOpcode() == Instruction::Mul)
I->setHasNoUnsignedWrap();
-
+
// If we have a 2nd GEP of the same base pointer, subtract the offsets.
// If both GEPs are inbounds, then the subtract does not have signed overflow.
- if (GEP2) {
- Value *Offset = EmitGEPOffset(GEP2);
+ if (GEP2) {
+ Value *Offset = EmitGEPOffset(GEP2);
Result = Builder.CreateSub(Result, Offset, "gepdiff", /* NUW */ false,
GEP1->isInBounds() && GEP2->isInBounds());
- }
-
- // If we have p - gep(p, ...) then we have to negate the result.
- if (Swapped)
- Result = Builder.CreateNeg(Result, "diff.neg");
-
- return Builder.CreateIntCast(Result, Ty, true);
-}
-
+ }
+
+ // If we have p - gep(p, ...) then we have to negate the result.
+ if (Swapped)
+ Result = Builder.CreateNeg(Result, "diff.neg");
+
+ return Builder.CreateIntCast(Result, Ty, true);
+}
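+
+// Sketch of the arithmetic this performs (hypothetical indices, assuming a
+// 4-byte i32 in the data layout): for "gep i32, %A, 10" minus "gep i32, %A, 3"
+// the emitted byte offsets are 40 and 12, so the reported difference is 28.
+static_assert(10 * 4 - 3 * 4 == 28, "difference of gep byte offsets");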
+
Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
- if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
- I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- // If this is a 'B = x-(-A)', change to B = x+A.
- // We deal with this without involving Negator to preserve NSW flag.
- if (Value *V = dyn_castNegVal(Op1)) {
- BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
-
- if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
- assert(BO->getOpcode() == Instruction::Sub &&
- "Expected a subtraction operator!");
- if (BO->hasNoSignedWrap() && I.hasNoSignedWrap())
- Res->setHasNoSignedWrap(true);
- } else {
- if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap())
- Res->setHasNoSignedWrap(true);
- }
-
- return Res;
- }
-
+ if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // If this is a 'B = x-(-A)', change to B = x+A.
+ // We deal with this without involving Negator to preserve NSW flag.
+ if (Value *V = dyn_castNegVal(Op1)) {
+ BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
+
+ if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
+ assert(BO->getOpcode() == Instruction::Sub &&
+ "Expected a subtraction operator!");
+ if (BO->hasNoSignedWrap() && I.hasNoSignedWrap())
+ Res->setHasNoSignedWrap(true);
+ } else {
+ if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap())
+ Res->setHasNoSignedWrap(true);
+ }
+
+ return Res;
+ }
+
// Try this before Negator to preserve NSW flag.
if (Instruction *R = factorizeMathWithShlOps(I, Builder))
return R;
@@ -1735,144 +1735,144 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
}
- auto TryToNarrowDeduceFlags = [this, &I, &Op0, &Op1]() -> Instruction * {
- if (Instruction *Ext = narrowMathIfNoOverflow(I))
- return Ext;
-
- bool Changed = false;
- if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoSignedWrap(true);
- }
- if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoUnsignedWrap(true);
- }
-
- return Changed ? &I : nullptr;
- };
-
- // First, let's try to interpret `sub a, b` as `add a, (sub 0, b)`,
- // and let's try to sink `(sub 0, b)` into `b` itself. But only if this isn't
- // a pure negation used by a select that looks like abs/nabs.
- bool IsNegation = match(Op0, m_ZeroInt());
- if (!IsNegation || none_of(I.users(), [&I, Op1](const User *U) {
- const Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI)
- return false;
- return match(UI,
- m_Select(m_Value(), m_Specific(Op1), m_Specific(&I))) ||
- match(UI, m_Select(m_Value(), m_Specific(&I), m_Specific(Op1)));
- })) {
- if (Value *NegOp1 = Negator::Negate(IsNegation, Op1, *this))
- return BinaryOperator::CreateAdd(NegOp1, Op0);
- }
- if (IsNegation)
- return TryToNarrowDeduceFlags(); // Should have been handled in Negator!
-
- // (A*B)-(A*C) -> A*(B-C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- if (I.getType()->isIntOrIntVectorTy(1))
- return BinaryOperator::CreateXor(Op0, Op1);
-
- // Replace (-1 - A) with (~A).
- if (match(Op0, m_AllOnes()))
- return BinaryOperator::CreateNot(Op1);
-
- // (~X) - (~Y) --> Y - X
- Value *X, *Y;
- if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
- return BinaryOperator::CreateSub(Y, X);
-
- // (X + -1) - Y --> ~Y + X
- if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
- return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
-
- // Reassociate sub/add sequences to create more add instructions and
- // reduce dependency chains:
- // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
- Value *Z;
- if (match(Op0, m_OneUse(m_c_Add(m_OneUse(m_Sub(m_Value(X), m_Value(Y))),
- m_Value(Z))))) {
- Value *XZ = Builder.CreateAdd(X, Z);
- Value *YW = Builder.CreateAdd(Y, Op1);
- return BinaryOperator::CreateSub(XZ, YW);
- }
-
- auto m_AddRdx = [](Value *&Vec) {
+ auto TryToNarrowDeduceFlags = [this, &I, &Op0, &Op1]() -> Instruction * {
+ if (Instruction *Ext = narrowMathIfNoOverflow(I))
+ return Ext;
+
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+ };
+
+ // First, let's try to interpret `sub a, b` as `add a, (sub 0, b)`,
+ // and let's try to sink `(sub 0, b)` into `b` itself. But only if this isn't
+ // a pure negation used by a select that looks like abs/nabs.
+ bool IsNegation = match(Op0, m_ZeroInt());
+ if (!IsNegation || none_of(I.users(), [&I, Op1](const User *U) {
+ const Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI)
+ return false;
+ return match(UI,
+ m_Select(m_Value(), m_Specific(Op1), m_Specific(&I))) ||
+ match(UI, m_Select(m_Value(), m_Specific(&I), m_Specific(Op1)));
+ })) {
+ if (Value *NegOp1 = Negator::Negate(IsNegation, Op1, *this))
+ return BinaryOperator::CreateAdd(NegOp1, Op0);
+ }
+ if (IsNegation)
+ return TryToNarrowDeduceFlags(); // Should have been handled in Negator!
+
+ // (A*B)-(A*C) -> A*(B-C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (I.getType()->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateXor(Op0, Op1);
+
+ // Replace (-1 - A) with (~A).
+ if (match(Op0, m_AllOnes()))
+ return BinaryOperator::CreateNot(Op1);
+
+ // (~X) - (~Y) --> Y - X
+ Value *X, *Y;
+ if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
+ return BinaryOperator::CreateSub(Y, X);
+
+ // (X + -1) - Y --> ~Y + X
+ if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
+ return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
+
+ // Reassociate sub/add sequences to create more add instructions and
+ // reduce dependency chains:
+ // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+ Value *Z;
+ if (match(Op0, m_OneUse(m_c_Add(m_OneUse(m_Sub(m_Value(X), m_Value(Y))),
+ m_Value(Z))))) {
+ Value *XZ = Builder.CreateAdd(X, Z);
+ Value *YW = Builder.CreateAdd(Y, Op1);
+ return BinaryOperator::CreateSub(XZ, YW);
+ }
+
+ auto m_AddRdx = [](Value *&Vec) {
return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(Vec)));
- };
- Value *V0, *V1;
- if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
- V0->getType() == V1->getType()) {
- // Difference of sums is sum of differences:
- // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
- Value *Sub = Builder.CreateSub(V0, V1);
+ };
+ Value *V0, *V1;
+ if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
+ V0->getType() == V1->getType()) {
+ // Difference of sums is sum of differences:
+ // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
+ Value *Sub = Builder.CreateSub(V0, V1);
Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_add,
{Sub->getType()}, {Sub});
- return replaceInstUsesWith(I, Rdx);
- }
-
- if (Constant *C = dyn_cast<Constant>(Op0)) {
- Value *X;
- if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- // C - (zext bool) --> bool ? C - 1 : C
+ return replaceInstUsesWith(I, Rdx);
+ }
+
+ if (Constant *C = dyn_cast<Constant>(Op0)) {
+ Value *X;
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ // C - (zext bool) --> bool ? C - 1 : C
return SelectInst::Create(X, InstCombiner::SubOne(C), C);
- if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- // C - (sext bool) --> bool ? C + 1 : C
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ // C - (sext bool) --> bool ? C + 1 : C
return SelectInst::Create(X, InstCombiner::AddOne(C), C);
-
- // C - ~X == X + (1+C)
- if (match(Op1, m_Not(m_Value(X))))
+
+ // C - ~X == X + (1+C)
+ if (match(Op1, m_Not(m_Value(X))))
return BinaryOperator::CreateAdd(X, InstCombiner::AddOne(C));
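+
+ // Spot check with hypothetical values C = 10, X = 3: since ~X == -X - 1,
+ // C - ~X == C + X + 1 == X + (1 + C), i.e. 10 - (-4) == 3 + 11 == 14.
+ static_assert(10 - (-3 - 1) == 3 + (1 + 10), "C - ~X == X + (1+C)");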
-
- // Try to fold constant sub into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- // Try to fold constant sub into PHI values.
- if (PHINode *PN = dyn_cast<PHINode>(Op1))
- if (Instruction *R = foldOpIntoPhi(I, PN))
- return R;
-
- Constant *C2;
-
- // C-(C2-X) --> X+(C-C2)
+
+ // Try to fold constant sub into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ // Try to fold constant sub into PHI values.
+ if (PHINode *PN = dyn_cast<PHINode>(Op1))
+ if (Instruction *R = foldOpIntoPhi(I, PN))
+ return R;
+
+ Constant *C2;
+
+ // C-(C2-X) --> X+(C-C2)
if (match(Op1, m_Sub(m_ImmConstant(C2), m_Value(X))))
- return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));
- }
-
- const APInt *Op0C;
- if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) {
- // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
- // zero.
- KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
- if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
- return BinaryOperator::CreateXor(Op1, Op0);
- }
-
- {
- Value *Y;
- // X-(X+Y) == -Y X-(Y+X) == -Y
- if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
- return BinaryOperator::CreateNeg(Y);
-
- // (X-Y)-X == -Y
- if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
- return BinaryOperator::CreateNeg(Y);
- }
-
- // (sub (or A, B) (and A, B)) --> (xor A, B)
- {
- Value *A, *B;
- if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
- match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
- }
-
+ return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));
+ }
+
+ const APInt *Op0C;
+ if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) {
+ // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
+ // zero.
+ KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
+ if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
+ return BinaryOperator::CreateXor(Op1, Op0);
+ }
+
+ {
+ Value *Y;
+ // X-(X+Y) == -Y X-(Y+X) == -Y
+ if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Y);
+
+ // (X-Y)-X == -Y
+ if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Y);
+ }
+
+ // (sub (or A, B) (and A, B)) --> (xor A, B)
+ {
+ Value *A, *B;
+ if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateXor(A, B);
+ }
+
// (sub (add A, B) (or A, B)) --> (and A, B)
{
Value *A, *B;
@@ -1889,184 +1889,184 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateOr(A, B);
}
- // (sub (and A, B) (or A, B)) --> neg (xor A, B)
- {
- Value *A, *B;
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
- (Op0->hasOneUse() || Op1->hasOneUse()))
- return BinaryOperator::CreateNeg(Builder.CreateXor(A, B));
- }
-
- // (sub (or A, B), (xor A, B)) --> (and A, B)
- {
- Value *A, *B;
- if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateAnd(A, B);
- }
-
- // (sub (xor A, B) (or A, B)) --> neg (and A, B)
- {
- Value *A, *B;
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
- (Op0->hasOneUse() || Op1->hasOneUse()))
- return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B));
- }
-
- {
- Value *Y;
- // ((X | Y) - X) --> (~X & Y)
- if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
- return BinaryOperator::CreateAnd(
- Y, Builder.CreateNot(Op1, Op1->getName() + ".not"));
- }
-
- {
- // (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1))
- Value *X;
- if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1),
- m_OneUse(m_Neg(m_Value(X))))))) {
- return BinaryOperator::CreateNeg(Builder.CreateAnd(
- Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType()))));
- }
- }
-
- {
- // (sub (and Op1, C), Op1) --> neg (and Op1, ~C)
- Constant *C;
- if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) {
- return BinaryOperator::CreateNeg(
- Builder.CreateAnd(Op1, Builder.CreateNot(C)));
- }
- }
-
- {
- // If we have a subtraction between some value and a select between
- // said value and something else, sink subtraction into select hands, i.e.:
- // sub (select %Cond, %TrueVal, %FalseVal), %Op1
- // ->
- // select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1)
- // or
- // sub %Op0, (select %Cond, %TrueVal, %FalseVal)
- // ->
- // select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal)
- // This will result in select between new subtraction and 0.
- auto SinkSubIntoSelect =
- [Ty = I.getType()](Value *Select, Value *OtherHandOfSub,
- auto SubBuilder) -> Instruction * {
- Value *Cond, *TrueVal, *FalseVal;
- if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal),
- m_Value(FalseVal)))))
- return nullptr;
- if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal)
- return nullptr;
- // While it is really tempting to just create two subtractions and let
- // InstCombine fold one of those to 0, it isn't possible to do so
- // because of worklist visitation order. So ugly it is.
- bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal;
- Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal);
- Constant *Zero = Constant::getNullValue(Ty);
- SelectInst *NewSel =
- SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub,
- OtherHandOfSubIsTrueVal ? NewSub : Zero);
- // Preserve prof metadata if any.
- NewSel->copyMetadata(cast<Instruction>(*Select));
- return NewSel;
- };
- if (Instruction *NewSel = SinkSubIntoSelect(
- /*Select=*/Op0, /*OtherHandOfSub=*/Op1,
- [Builder = &Builder, Op1](Value *OtherHandOfSelect) {
- return Builder->CreateSub(OtherHandOfSelect,
- /*OtherHandOfSub=*/Op1);
- }))
- return NewSel;
- if (Instruction *NewSel = SinkSubIntoSelect(
- /*Select=*/Op1, /*OtherHandOfSub=*/Op0,
- [Builder = &Builder, Op0](Value *OtherHandOfSelect) {
- return Builder->CreateSub(/*OtherHandOfSub=*/Op0,
- OtherHandOfSelect);
- }))
- return NewSel;
- }
-
- // (X - (X & Y)) --> (X & ~Y)
- if (match(Op1, m_c_And(m_Specific(Op0), m_Value(Y))) &&
- (Op1->hasOneUse() || isa<Constant>(Y)))
- return BinaryOperator::CreateAnd(
- Op0, Builder.CreateNot(Y, Y->getName() + ".not"));
-
- {
- // ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A
- // ~A - Min/Max(O, ~A) -> Max/Min(A, ~O) - A
- // Min/Max(~A, O) - ~A -> A - Max/Min(A, ~O)
- // Min/Max(O, ~A) - ~A -> A - Max/Min(A, ~O)
- // So long as O here is freely invertible, this will be neutral or a win.
- Value *LHS, *RHS, *A;
- Value *NotA = Op0, *MinMax = Op1;
- SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
- if (!SelectPatternResult::isMinOrMax(SPF)) {
- NotA = Op1;
- MinMax = Op0;
- SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
- }
- if (SelectPatternResult::isMinOrMax(SPF) &&
- match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) {
- if (NotA == LHS)
- std::swap(LHS, RHS);
- // LHS is now O above and expected to have at least 2 uses (the min/max)
- // NotA is epected to have 2 uses from the min/max and 1 from the sub.
- if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
- !NotA->hasNUsesOrMore(4)) {
- // Note: We don't generate the inverse max/min, just create the not of
- // it and let other folds do the rest.
- Value *Not = Builder.CreateNot(MinMax);
- if (NotA == Op0)
- return BinaryOperator::CreateSub(Not, A);
- else
- return BinaryOperator::CreateSub(A, Not);
- }
- }
- }
-
- // Optimize pointer differences into the same array into a size. Consider:
- // &A[10] - &A[0]: we should compile this to "10".
- Value *LHSOp, *RHSOp;
- if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
- match(Op1, m_PtrToInt(m_Value(RHSOp))))
- if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
- I.hasNoUnsignedWrap()))
- return replaceInstUsesWith(I, Res);
-
- // trunc(p)-trunc(q) -> trunc(p-q)
- if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
- match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
- if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
- /* IsNUW */ false))
- return replaceInstUsesWith(I, Res);
-
- // Canonicalize a shifty way to code absolute value to the common pattern.
- // There are 2 potential commuted variants.
- // We're relying on the fact that we only do this transform when the shift has
- // exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase
- // instructions).
- Value *A;
- const APInt *ShAmt;
- Type *Ty = I.getType();
- if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
- Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
- match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) {
- // B = ashr i32 A, 31 ; smear the sign bit
- // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1)
- // --> (A < 0) ? -A : A
- Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
- // Copy the nuw/nsw flags from the sub to the negate.
- Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(),
- I.hasNoSignedWrap());
- return SelectInst::Create(Cmp, Neg, A);
- }
-
+ // (sub (and A, B) (or A, B)) --> neg (xor A, B)
+ {
+ Value *A, *B;
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse()))
+ return BinaryOperator::CreateNeg(Builder.CreateXor(A, B));
+ }
+
+ // (sub (or A, B), (xor A, B)) --> (and A, B)
+ {
+ Value *A, *B;
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+ }
+
+ // (sub (xor A, B) (or A, B)) --> neg (and A, B)
+ {
+ Value *A, *B;
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse()))
+ return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B));
+ }
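+
+ // The three or/xor/and differences above can be spot-checked with
+ // hypothetical constants A = 12, B = 10, where (A|B) == 14, (A^B) == 6 and
+ // (A&B) == 8.
+ static_assert((12 | 10) - (12 ^ 10) == (12 & 10) &&
+ (12 & 10) - (12 | 10) == -(12 ^ 10) &&
+ (12 ^ 10) - (12 | 10) == -(12 & 10),
+ "or/xor/and subtraction identities");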
+
+ {
+ Value *Y;
+ // ((X | Y) - X) --> (~X & Y)
+ if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(
+ Y, Builder.CreateNot(Op1, Op1->getName() + ".not"));
+ }
+
+ {
+ // (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1))
+ Value *X;
+ if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1),
+ m_OneUse(m_Neg(m_Value(X))))))) {
+ return BinaryOperator::CreateNeg(Builder.CreateAnd(
+ Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType()))));
+ }
+ }
+
+ {
+ // (sub (and Op1, C), Op1) --> neg (and Op1, ~C)
+ Constant *C;
+ if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) {
+ return BinaryOperator::CreateNeg(
+ Builder.CreateAnd(Op1, Builder.CreateNot(C)));
+ }
+ }
+
+ {
+ // If we have a subtraction between some value and a select between
+ // said value and something else, sink subtraction into select hands, i.e.:
+ // sub (select %Cond, %TrueVal, %FalseVal), %Op1
+ // ->
+ // select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1)
+ // or
+ // sub %Op0, (select %Cond, %TrueVal, %FalseVal)
+ // ->
+ // select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal)
+ // This will result in select between new subtraction and 0.
+ auto SinkSubIntoSelect =
+ [Ty = I.getType()](Value *Select, Value *OtherHandOfSub,
+ auto SubBuilder) -> Instruction * {
+ Value *Cond, *TrueVal, *FalseVal;
+ if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal),
+ m_Value(FalseVal)))))
+ return nullptr;
+ if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal)
+ return nullptr;
+ // While it is really tempting to just create two subtractions and let
+ // InstCombine fold one of those to 0, it isn't possible to do so
+ // because of worklist visitation order. So ugly it is.
+ bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal;
+ Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal);
+ Constant *Zero = Constant::getNullValue(Ty);
+ SelectInst *NewSel =
+ SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub,
+ OtherHandOfSubIsTrueVal ? NewSub : Zero);
+ // Preserve prof metadata if any.
+ NewSel->copyMetadata(cast<Instruction>(*Select));
+ return NewSel;
+ };
+ if (Instruction *NewSel = SinkSubIntoSelect(
+ /*Select=*/Op0, /*OtherHandOfSub=*/Op1,
+ [Builder = &Builder, Op1](Value *OtherHandOfSelect) {
+ return Builder->CreateSub(OtherHandOfSelect,
+ /*OtherHandOfSub=*/Op1);
+ }))
+ return NewSel;
+ if (Instruction *NewSel = SinkSubIntoSelect(
+ /*Select=*/Op1, /*OtherHandOfSub=*/Op0,
+ [Builder = &Builder, Op0](Value *OtherHandOfSelect) {
+ return Builder->CreateSub(/*OtherHandOfSub=*/Op0,
+ OtherHandOfSelect);
+ }))
+ return NewSel;
+ }
+
+ // (X - (X & Y)) --> (X & ~Y)
+ if (match(Op1, m_c_And(m_Specific(Op0), m_Value(Y))) &&
+ (Op1->hasOneUse() || isa<Constant>(Y)))
+ return BinaryOperator::CreateAnd(
+ Op0, Builder.CreateNot(Y, Y->getName() + ".not"));
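+
+ // E.g. (hypothetical constants X = 13, Y = 11): X & Y == 9 and 13 - 9 == 4,
+ // which is 13 & ~11 -- i.e. Y's bits cleared out of X.
+ static_assert(13u - (13u & 11u) == (13u & ~11u), "X - (X & Y) == X & ~Y");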
+
+ {
+ // ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A
+ // ~A - Min/Max(O, ~A) -> Max/Min(A, ~O) - A
+ // Min/Max(~A, O) - ~A -> A - Max/Min(A, ~O)
+ // Min/Max(O, ~A) - ~A -> A - Max/Min(A, ~O)
+ // So long as O here is freely invertible, this will be neutral or a win.
+ Value *LHS, *RHS, *A;
+ Value *NotA = Op0, *MinMax = Op1;
+ SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
+ if (!SelectPatternResult::isMinOrMax(SPF)) {
+ NotA = Op1;
+ MinMax = Op0;
+ SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
+ }
+ if (SelectPatternResult::isMinOrMax(SPF) &&
+ match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) {
+ if (NotA == LHS)
+ std::swap(LHS, RHS);
+ // LHS is now O above and expected to have at least 2 uses (the min/max)
+ // NotA is expected to have 2 uses from the min/max and 1 from the sub.
+ if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
+ !NotA->hasNUsesOrMore(4)) {
+ // Note: We don't generate the inverse max/min, just create the not of
+ // it and let other folds do the rest.
+ Value *Not = Builder.CreateNot(MinMax);
+ if (NotA == Op0)
+ return BinaryOperator::CreateSub(Not, A);
+ else
+ return BinaryOperator::CreateSub(A, Not);
+ }
+ }
+ }
+
+ // Optimize pointer differences within the same array into a size. Consider:
+ // &A[10] - &A[0]: we should compile this to "10".
+ Value *LHSOp, *RHSOp;
+ if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
+ match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
+ I.hasNoUnsignedWrap()))
+ return replaceInstUsesWith(I, Res);
+
+ // trunc(p)-trunc(q) -> trunc(p-q)
+ if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
+ match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
+ /* IsNUW */ false))
+ return replaceInstUsesWith(I, Res);
+
+ // Canonicalize a shifty way to code absolute value to the common pattern.
+ // There are 2 potential commuted variants.
+ // We're relying on the fact that we only do this transform when the shift has
+ // exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase
+ // instructions).
+ Value *A;
+ const APInt *ShAmt;
+ Type *Ty = I.getType();
+ if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) {
+ // B = ashr i32 A, 31 ; smear the sign bit
+ // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1)
+ // --> (A < 0) ? -A : A
+ Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
+ // Copy the nuw/nsw flags from the sub to the negate.
+ Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(),
+ I.hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
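+
+ // Worked example of the shifty-abs pattern (illustrative i32 values): for
+ // A = -7 the smeared sign bit B is -1; xor with all-ones gives -A - 1 == 6,
+ // and subtracting B adds the 1 back, yielding |A| == 7. For non-negative A
+ // both steps are no-ops.
+ static_assert((-(-7) - 1) - (-1) == 7, "xor/sub sign-smear abs of -7 is 7");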
+
// If we are subtracting a low-bit masked subset of some value from an add
// of that same value with no low bits changed, that is clearing some low bits
// of the sum:
@@ -2081,238 +2081,238 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateAnd(Op0, ConstantInt::get(Ty, ~(*AndC)));
}
- if (Instruction *V =
- canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
- return V;
-
- return TryToNarrowDeduceFlags();
-}
-
-/// This eliminates floating-point negation in either 'fneg(X)' or
-/// 'fsub(-0.0, X)' form by combining into a constant operand.
-static Instruction *foldFNegIntoConstant(Instruction &I) {
- Value *X;
- Constant *C;
-
- // Fold negation into constant operand. This is limited with one-use because
- // fneg is assumed better for analysis and cheaper in codegen than fmul/fdiv.
- // -(X * C) --> X * (-C)
- // FIXME: It's arguable whether these should be m_OneUse or not. The current
- // belief is that the FNeg allows for better reassociation opportunities.
- if (match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C))))))
- return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
- // -(X / C) --> X / (-C)
- if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C))))))
- return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
- // -(C / X) --> (-C) / X
- if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
- return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
-
- // With NSZ [ counter-example with -0.0: -(-0.0 + 0.0) != 0.0 + -0.0 ]:
- // -(X + C) --> -X + -C --> -C - X
- if (I.hasNoSignedZeros() &&
- match(&I, m_FNeg(m_OneUse(m_FAdd(m_Value(X), m_Constant(C))))))
- return BinaryOperator::CreateFSubFMF(ConstantExpr::getFNeg(C), X, &I);
-
- return nullptr;
-}
-
-static Instruction *hoistFNegAboveFMulFDiv(Instruction &I,
- InstCombiner::BuilderTy &Builder) {
- Value *FNeg;
- if (!match(&I, m_FNeg(m_Value(FNeg))))
- return nullptr;
-
- Value *X, *Y;
- if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y)))))
- return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
-
- if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))))
- return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
-
- return nullptr;
-}
-
+ if (Instruction *V =
+ canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
+ return V;
+
+ return TryToNarrowDeduceFlags();
+}
+
+/// This eliminates floating-point negation in either 'fneg(X)' or
+/// 'fsub(-0.0, X)' form by combining into a constant operand.
+static Instruction *foldFNegIntoConstant(Instruction &I) {
+ Value *X;
+ Constant *C;
+
+ // Fold negation into constant operand. This is limited with one-use because
+ // fneg is assumed better for analysis and cheaper in codegen than fmul/fdiv.
+ // -(X * C) --> X * (-C)
+ // FIXME: It's arguable whether these should be m_OneUse or not. The current
+ // belief is that the FNeg allows for better reassociation opportunities.
+ if (match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C))))))
+ return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
+ // -(X / C) --> X / (-C)
+ if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C))))))
+ return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
+ // -(C / X) --> (-C) / X
+ if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
+ return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ // With NSZ [ counter-example with -0.0: -(-0.0 + 0.0) != 0.0 + -0.0 ]:
+ // -(X + C) --> -X + -C --> -C - X
+ if (I.hasNoSignedZeros() &&
+ match(&I, m_FNeg(m_OneUse(m_FAdd(m_Value(X), m_Constant(C))))))
+ return BinaryOperator::CreateFSubFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ return nullptr;
+}
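+
+// For example (hypothetical, exactly representable constants): folding the
+// negation into the constant gives -(3.0 * 2.5) == 3.0 * -2.5 and
+// -(3.0 / 2.0) == 3.0 / -2.0.
+static_assert(-(3.0 * 2.5) == 3.0 * -2.5 && -(3.0 / 2.0) == 3.0 / -2.0,
+ "fneg folded into the constant operand of fmul/fdiv");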
+
+static Instruction *hoistFNegAboveFMulFDiv(Instruction &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *FNeg;
+ if (!match(&I, m_FNeg(m_Value(FNeg))))
+ return nullptr;
+
+ Value *X, *Y;
+ if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
+
+ if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) {
- Value *Op = I.getOperand(0);
-
- if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(),
+ Value *Op = I.getOperand(0);
+
+ if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(),
getSimplifyQuery().getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldFNegIntoConstant(I))
- return X;
-
- Value *X, *Y;
-
- // If we can ignore the sign of zeros: -(X - Y) --> (Y - X)
- if (I.hasNoSignedZeros() &&
- match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y)))))
- return BinaryOperator::CreateFSubFMF(Y, X, &I);
-
- if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
- return R;
-
- return nullptr;
-}
-
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldFNegIntoConstant(I))
+ return X;
+
+ Value *X, *Y;
+
+ // If we can ignore the sign of zeros: -(X - Y) --> (Y - X)
+ if (I.hasNoSignedZeros() &&
+ match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateFSubFMF(Y, X, &I);
+
+ if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
+ return R;
+
+ return nullptr;
+}
+
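
Illustration (not part of the change above): the nsz requirement on the -(X - Y) --> (Y - X) fold is exactly about the sign of a zero result; a minimal standalone check:

#include <cassert>
#include <cmath>

int main() {
  double X = 1.25, Y = 0.25, Z = 2.0;
  assert(-(X - Y) == (Y - X));      // agrees whenever the result is nonzero
  assert(std::signbit(-(Z - Z)) &&  // -(2.0 - 2.0) is -0.0, but
         !std::signbit(Z - Z));     // 2.0 - 2.0 is +0.0
  return 0;
}
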
Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) {
- if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
+ if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
getSimplifyQuery().getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Subtraction from -0.0 is the canonical form of fneg.
- // fsub -0.0, X ==> fneg X
- // fsub nsz 0.0, X ==> fneg nsz X
- //
-  // FIXME: This matcher does not respect FTZ or DAZ yet:
- // fsub -0.0, Denorm ==> +-0
- // fneg Denorm ==> -Denorm
- Value *Op;
- if (match(&I, m_FNeg(m_Value(Op))))
- return UnaryOperator::CreateFNegFMF(Op, &I);
-
- if (Instruction *X = foldFNegIntoConstant(I))
- return X;
-
- if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
- return R;
-
- Value *X, *Y;
- Constant *C;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- // If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
- // Canonicalize to fadd to make analysis easier.
- // This can also help codegen because fadd is commutative.
- // Note that if this fsub was really an fneg, the fadd with -0.0 will get
- // killed later. We still limit that particular transform with 'hasOneUse'
- // because an fneg is assumed better/cheaper than a generic fsub.
- if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
- if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
- Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
- return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
- }
- }
-
- // (-X) - Op1 --> -(X + Op1)
- if (I.hasNoSignedZeros() && !isa<ConstantExpr>(Op0) &&
- match(Op0, m_OneUse(m_FNeg(m_Value(X))))) {
- Value *FAdd = Builder.CreateFAddFMF(X, Op1, &I);
- return UnaryOperator::CreateFNegFMF(FAdd, &I);
- }
-
- if (isa<Constant>(Op0))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *NV = FoldOpIntoSelect(I, SI))
- return NV;
-
- // X - C --> X + (-C)
- // But don't transform constant expressions because there's an inverse fold
- // for X + (-Y) --> X - Y.
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Subtraction from -0.0 is the canonical form of fneg.
+ // fsub -0.0, X ==> fneg X
+ // fsub nsz 0.0, X ==> fneg nsz X
+ //
+  // FIXME: This matcher does not respect FTZ or DAZ yet:
+ // fsub -0.0, Denorm ==> +-0
+ // fneg Denorm ==> -Denorm
+ Value *Op;
+ if (match(&I, m_FNeg(m_Value(Op))))
+ return UnaryOperator::CreateFNegFMF(Op, &I);
+
+ if (Instruction *X = foldFNegIntoConstant(I))
+ return X;
+
+ if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
+ return R;
+
+ Value *X, *Y;
+ Constant *C;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ // If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
+ // Canonicalize to fadd to make analysis easier.
+ // This can also help codegen because fadd is commutative.
+ // Note that if this fsub was really an fneg, the fadd with -0.0 will get
+ // killed later. We still limit that particular transform with 'hasOneUse'
+ // because an fneg is assumed better/cheaper than a generic fsub.
+ if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
+ if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
+ Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
+ }
+ }
+
+ // (-X) - Op1 --> -(X + Op1)
+ if (I.hasNoSignedZeros() && !isa<ConstantExpr>(Op0) &&
+ match(Op0, m_OneUse(m_FNeg(m_Value(X))))) {
+ Value *FAdd = Builder.CreateFAddFMF(X, Op1, &I);
+ return UnaryOperator::CreateFNegFMF(FAdd, &I);
+ }
+
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *NV = FoldOpIntoSelect(I, SI))
+ return NV;
+
+ // X - C --> X + (-C)
+ // But don't transform constant expressions because there's an inverse fold
+ // for X + (-Y) --> X - Y.
if (match(Op1, m_ImmConstant(C)))
- return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
-
- // X - (-Y) --> X + Y
- if (match(Op1, m_FNeg(m_Value(Y))))
- return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
-
- // Similar to above, but look through a cast of the negated value:
- // X - (fptrunc(-Y)) --> X + fptrunc(Y)
- Type *Ty = I.getType();
- if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y))))))
- return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPTrunc(Y, Ty), &I);
-
- // X - (fpext(-Y)) --> X + fpext(Y)
- if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y))))))
- return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I);
-
- // Similar to above, but look through fmul/fdiv of the negated value:
- // Op0 - (-X * Y) --> Op0 + (X * Y)
- // Op0 - (Y * -X) --> Op0 + (X * Y)
- if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) {
- Value *FMul = Builder.CreateFMulFMF(X, Y, &I);
- return BinaryOperator::CreateFAddFMF(Op0, FMul, &I);
- }
- // Op0 - (-X / Y) --> Op0 + (X / Y)
- // Op0 - (X / -Y) --> Op0 + (X / Y)
- if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) ||
- match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) {
- Value *FDiv = Builder.CreateFDivFMF(X, Y, &I);
- return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I);
- }
-
- // Handle special cases for FSub with selects feeding the operation
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
- return replaceInstUsesWith(I, V);
-
- if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
- // (Y - X) - Y --> -X
- if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
- return UnaryOperator::CreateFNegFMF(X, &I);
-
- // Y - (X + Y) --> -X
- // Y - (Y + X) --> -X
- if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
- return UnaryOperator::CreateFNegFMF(X, &I);
-
- // (X * C) - X --> X * (C - 1.0)
- if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
- Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0));
- return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I);
- }
- // X - (X * C) --> X * (1.0 - C)
- if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) {
- Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C);
- return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
- }
-
- // Reassociate fsub/fadd sequences to create more fadd instructions and
- // reduce dependency chains:
- // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
- Value *Z;
- if (match(Op0, m_OneUse(m_c_FAdd(m_OneUse(m_FSub(m_Value(X), m_Value(Y))),
- m_Value(Z))))) {
- Value *XZ = Builder.CreateFAddFMF(X, Z, &I);
- Value *YW = Builder.CreateFAddFMF(Y, Op1, &I);
- return BinaryOperator::CreateFSubFMF(XZ, YW, &I);
- }
-
- auto m_FaddRdx = [](Value *&Sum, Value *&Vec) {
+ return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
+
+ // X - (-Y) --> X + Y
+ if (match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
+
+ // Similar to above, but look through a cast of the negated value:
+ // X - (fptrunc(-Y)) --> X + fptrunc(Y)
+ Type *Ty = I.getType();
+ if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y))))))
+ return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPTrunc(Y, Ty), &I);
+
+ // X - (fpext(-Y)) --> X + fpext(Y)
+ if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y))))))
+ return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I);
+
+ // Similar to above, but look through fmul/fdiv of the negated value:
+ // Op0 - (-X * Y) --> Op0 + (X * Y)
+ // Op0 - (Y * -X) --> Op0 + (X * Y)
+ if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) {
+ Value *FMul = Builder.CreateFMulFMF(X, Y, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, FMul, &I);
+ }
+ // Op0 - (-X / Y) --> Op0 + (X / Y)
+ // Op0 - (X / -Y) --> Op0 + (X / Y)
+ if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) ||
+ match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) {
+ Value *FDiv = Builder.CreateFDivFMF(X, Y, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I);
+ }
+
+ // Handle special cases for FSub with selects feeding the operation
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
+ return replaceInstUsesWith(I, V);
+
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
+ // (Y - X) - Y --> -X
+ if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
+ return UnaryOperator::CreateFNegFMF(X, &I);
+
+ // Y - (X + Y) --> -X
+ // Y - (Y + X) --> -X
+ if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
+ return UnaryOperator::CreateFNegFMF(X, &I);
+
+ // (X * C) - X --> X * (C - 1.0)
+ if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
+ Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0));
+ return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I);
+ }
+ // X - (X * C) --> X * (1.0 - C)
+ if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) {
+ Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C);
+ return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
+ }
+
+ // Reassociate fsub/fadd sequences to create more fadd instructions and
+ // reduce dependency chains:
+ // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+ Value *Z;
+ if (match(Op0, m_OneUse(m_c_FAdd(m_OneUse(m_FSub(m_Value(X), m_Value(Y))),
+ m_Value(Z))))) {
+ Value *XZ = Builder.CreateFAddFMF(X, Z, &I);
+ Value *YW = Builder.CreateFAddFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFSubFMF(XZ, YW, &I);
+ }
+
+ auto m_FaddRdx = [](Value *&Sum, Value *&Vec) {
return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_fadd>(m_Value(Sum),
m_Value(Vec)));
- };
- Value *A0, *A1, *V0, *V1;
- if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) &&
- V0->getType() == V1->getType()) {
- // Difference of sums is sum of differences:
- // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1
- Value *Sub = Builder.CreateFSubFMF(V0, V1, &I);
+ };
+ Value *A0, *A1, *V0, *V1;
+ if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) &&
+ V0->getType() == V1->getType()) {
+ // Difference of sums is sum of differences:
+ // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1
+ Value *Sub = Builder.CreateFSubFMF(V0, V1, &I);
Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd,
{Sub->getType()}, {A0, Sub}, &I);
- return BinaryOperator::CreateFSubFMF(Rdx, A1, &I);
- }
-
- if (Instruction *F = factorizeFAddFSub(I, Builder))
- return F;
-
- // TODO: This performs reassociative folds for FP ops. Some fraction of the
- // functionality has been subsumed by simple pattern matching here and in
- // InstSimplify. We should let a dedicated reassociation pass handle more
- // complex pattern matching and remove this from InstCombine.
- if (Value *V = FAddCombine(Builder).simplify(&I))
- return replaceInstUsesWith(I, V);
-
- // (X - Y) - Op1 --> X - (Y + Op1)
- if (match(Op0, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
- Value *FAdd = Builder.CreateFAddFMF(Y, Op1, &I);
- return BinaryOperator::CreateFSubFMF(X, FAdd, &I);
- }
- }
-
- return nullptr;
-}
+ return BinaryOperator::CreateFSubFMF(Rdx, A1, &I);
+ }
+
+ if (Instruction *F = factorizeFAddFSub(I, Builder))
+ return F;
+
+ // TODO: This performs reassociative folds for FP ops. Some fraction of the
+ // functionality has been subsumed by simple pattern matching here and in
+ // InstSimplify. We should let a dedicated reassociation pass handle more
+ // complex pattern matching and remove this from InstCombine.
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return replaceInstUsesWith(I, V);
+
+ // (X - Y) - Op1 --> X - (Y + Op1)
+ if (match(Op0, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
+ Value *FAdd = Builder.CreateFAddFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFSubFMF(X, FAdd, &I);
+ }
+ }
+
+ return nullptr;
+}
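
Illustration (not part of the change above): two of the scalar facts the fsub folds lean on, checked bit-for-bit with ordinary doubles, assuming the default FP environment (no FTZ/DAZ, as the FIXME in the code notes). The sameBits helper is made up here for the comparison:

#include <cassert>
#include <cstdint>
#include <cstring>

// Bitwise comparison so +0.0 and -0.0 are told apart.
static bool sameBits(double A, double B) {
  std::uint64_t IA, IB;
  std::memcpy(&IA, &A, sizeof A);
  std::memcpy(&IB, &B, sizeof B);
  return IA == IB;
}

int main() {
  const double Samples[] = {0.0, -0.0, 1.0, -3.75, 1e-300, -1e300};
  for (double X : Samples) {
    assert(sameBits(-0.0 - X, -X)); // fsub -0.0, X ==> fneg X
    assert(X - 2.5 == X + (-2.5));  // X - C ==> X + (-C)
  }
  return 0;
}
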
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index fcf09f9216..85a7abe211 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1,1632 +1,1632 @@
-//===- InstCombineAndOrXor.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitAnd, visitOr, and visitXor functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/Analysis/CmpInstAnalysis.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/PatternMatch.h"
+//===- InstCombineAndOrXor.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitAnd, visitOr, and visitXor functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
-/// a four bit mask.
-static unsigned getFCmpCode(FCmpInst::Predicate CC) {
- assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE &&
- "Unexpected FCmp predicate!");
- // Take advantage of the bit pattern of FCmpInst::Predicate here.
- // U L G E
- static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0
- static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1
- static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0
- static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1
- static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0
- static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1
- static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0
- static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1
- static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0
- static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1
- static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0
- static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1
- static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0
- static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1
- static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0
- static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1
- return CC;
-}
-
-/// This is the complement of getICmpCode, which turns an opcode and two
-/// operands into either a constant true or false, or a brand new ICmp
-/// instruction. The sign is passed in to determine which kind of predicate to
-/// use in the new icmp instruction.
-static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate NewPred;
- if (Constant *TorF = getPredForICmpCode(Code, Sign, LHS->getType(), NewPred))
- return TorF;
- return Builder.CreateICmp(NewPred, LHS, RHS);
-}
-
-/// This is the complement of getFCmpCode, which turns an opcode and two
-/// operands into either a FCmp instruction, or a true/false constant.
-static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy &Builder) {
- const auto Pred = static_cast<FCmpInst::Predicate>(Code);
- assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
- "Unexpected FCmp predicate!");
- if (Pred == FCmpInst::FCMP_FALSE)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
- if (Pred == FCmpInst::FCMP_TRUE)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
- return Builder.CreateFCmp(Pred, LHS, RHS);
-}
-
-/// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
-/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
-/// \param I Binary operator to transform.
-/// \return Pointer to node that must replace the original binary operator, or
-/// null pointer if no transformation was made.
-static Value *SimplifyBSwap(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.isBitwiseLogicOp() && "Unexpected opcode for bswap simplifying");
-
- Value *OldLHS = I.getOperand(0);
- Value *OldRHS = I.getOperand(1);
-
- Value *NewLHS;
- if (!match(OldLHS, m_BSwap(m_Value(NewLHS))))
- return nullptr;
-
- Value *NewRHS;
- const APInt *C;
-
- if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) {
- // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
- if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse())
- return nullptr;
- // NewRHS initialized by the matcher.
- } else if (match(OldRHS, m_APInt(C))) {
- // OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
- if (!OldLHS->hasOneUse())
- return nullptr;
- NewRHS = ConstantInt::get(I.getType(), C->byteSwap());
- } else
- return nullptr;
-
- Value *BinOp = Builder.CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
- Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap,
- I.getType());
- return Builder.CreateCall(F, BinOp);
-}
-
-/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
-/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates
-/// whether to treat V, Lo, and Hi as signed or not.
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
+/// a four bit mask.
+static unsigned getFCmpCode(FCmpInst::Predicate CC) {
+ assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ // Take advantage of the bit pattern of FCmpInst::Predicate here.
+ // U L G E
+ static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0
+ static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1
+ static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0
+ static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1
+ static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0
+ static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1
+ static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0
+ static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1
+ static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0
+ static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1
+ static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0
+ static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1
+ static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0
+ static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1
+ static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0
+ static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1
+ return CC;
+}
+
+/// This is the complement of getICmpCode, which turns an opcode and two
+/// operands into either a constant true or false, or a brand new ICmp
+/// instruction. The sign is passed in to determine which kind of predicate to
+/// use in the new icmp instruction.
+static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate NewPred;
+ if (Constant *TorF = getPredForICmpCode(Code, Sign, LHS->getType(), NewPred))
+ return TorF;
+ return Builder.CreateICmp(NewPred, LHS, RHS);
+}
+
+/// This is the complement of getFCmpCode, which turns an opcode and two
+/// operands into either a FCmp instruction, or a true/false constant.
+static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ const auto Pred = static_cast<FCmpInst::Predicate>(Code);
+ assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ if (Pred == FCmpInst::FCMP_FALSE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+ if (Pred == FCmpInst::FCMP_TRUE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
+ return Builder.CreateFCmp(Pred, LHS, RHS);
+}
+
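
Illustration (not part of the change above): the four-bit U/L/G/E encoding turns predicate algebra into bit operations, e.g. ORing two codes yields the code for "either predicate holds". A standalone check against the values in the static_asserts above:

#include <cassert>

int main() {
  // Same numbering as the static_asserts in getFCmpCode: U L G E bits.
  enum { OEQ = 1, OGT = 2, OLT = 4, OLE = 5, ONE = 6, ORD = 7,
         UNO = 8, UEQ = 9, TRUE_ = 15 };
  assert((OLT | OGT) == ONE);   // "less or greater" is ordered not-equal
  assert((OLT | OEQ) == OLE);   // "less or equal"
  assert((OEQ | UNO) == UEQ);   // adding the unordered bit
  assert((ORD | UNO) == TRUE_); // ordered or unordered is always true
  return 0;
}
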
+/// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
+/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
+/// \param I Binary operator to transform.
+/// \return Pointer to node that must replace the original binary operator, or
+/// null pointer if no transformation was made.
+static Value *SimplifyBSwap(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bswap simplifying");
+
+ Value *OldLHS = I.getOperand(0);
+ Value *OldRHS = I.getOperand(1);
+
+ Value *NewLHS;
+ if (!match(OldLHS, m_BSwap(m_Value(NewLHS))))
+ return nullptr;
+
+ Value *NewRHS;
+ const APInt *C;
+
+ if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) {
+ // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
+ if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse())
+ return nullptr;
+ // NewRHS initialized by the matcher.
+ } else if (match(OldRHS, m_APInt(C))) {
+ // OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
+ if (!OldLHS->hasOneUse())
+ return nullptr;
+ NewRHS = ConstantInt::get(I.getType(), C->byteSwap());
+ } else
+ return nullptr;
+
+ Value *BinOp = Builder.CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
+ Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap,
+ I.getType());
+ return Builder.CreateCall(F, BinOp);
+}
+
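
Illustration (not part of the change above): the rewrite is sound because a byte swap is a pure byte permutation, so it distributes over bytewise bitwise ops. A standalone check with a hand-rolled 32-bit swap (the bswap32 helper name is made up here):

#include <cassert>
#include <cstdint>

static std::uint32_t bswap32(std::uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0x0000FF00u) |
         ((V << 8) & 0x00FF0000u) | (V << 24);
}

int main() {
  std::uint32_t X = 0x12345678u, Y = 0xA0B0C0D0u, C = 0x000000FFu;
  assert((bswap32(X) & bswap32(Y)) == bswap32(X & Y)); // OP(BSWAP, BSWAP)
  assert((bswap32(X) | bswap32(Y)) == bswap32(X | Y));
  assert((bswap32(X) ^ bswap32(Y)) == bswap32(X ^ Y));
  assert((bswap32(X) & C) == bswap32(X & bswap32(C))); // constant operand case
  return 0;
}
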
+/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
+/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates
+/// whether to treat V, Lo, and Hi as signed or not.
Value *InstCombinerImpl::insertRangeTest(Value *V, const APInt &Lo,
const APInt &Hi, bool isSigned,
bool Inside) {
- assert((isSigned ? Lo.slt(Hi) : Lo.ult(Hi)) &&
- "Lo is not < Hi in range emission code!");
-
- Type *Ty = V->getType();
-
- // V >= Min && V < Hi --> V < Hi
- // V < Min || V >= Hi --> V >= Hi
- ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
- if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) {
- Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred;
- return Builder.CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
- }
-
- // V >= Lo && V < Hi --> V - Lo u< Hi - Lo
- // V < Lo || V >= Hi --> V - Lo u>= Hi - Lo
- Value *VMinusLo =
- Builder.CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
- Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo);
- return Builder.CreateICmp(Pred, VMinusLo, HiMinusLo);
-}
-
-/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
-/// that can be simplified.
-/// One of A and B is considered the mask. The other is the value. This is
-/// described as the "AMask" or "BMask" part of the enum. If the enum contains
-/// only "Mask", then both A and B can be considered masks. If A is the mask,
-/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
-/// If both A and C are constants, this proof is also easy.
-/// For the following explanations, we assume that A is the mask.
-///
-/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
-/// bits of A are set in B.
-/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
-///
-/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
-/// bits of A are cleared in B.
-/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
-///
-/// "Mixed" declares that (A & B) == C and C might or might not contain any
-/// number of one bits and zero bits.
-/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed
-///
-/// "Not" means that in above descriptions "==" should be replaced by "!=".
-/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
-///
-/// If the mask A contains a single bit, then the following is equivalent:
-/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
-/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
-enum MaskedICmpType {
- AMask_AllOnes = 1,
- AMask_NotAllOnes = 2,
- BMask_AllOnes = 4,
- BMask_NotAllOnes = 8,
- Mask_AllZeros = 16,
- Mask_NotAllZeros = 32,
- AMask_Mixed = 64,
- AMask_NotMixed = 128,
- BMask_Mixed = 256,
- BMask_NotMixed = 512
-};
-
-/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C)
-/// satisfies.
-static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
- ICmpInst::Predicate Pred) {
- ConstantInt *ACst = dyn_cast<ConstantInt>(A);
- ConstantInt *BCst = dyn_cast<ConstantInt>(B);
- ConstantInt *CCst = dyn_cast<ConstantInt>(C);
- bool IsEq = (Pred == ICmpInst::ICMP_EQ);
- bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
- bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
- unsigned MaskVal = 0;
- if (CCst && CCst->isZero()) {
- // if C is zero, then both A and B qualify as mask
- MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
- : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
- if (IsAPow2)
- MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
- : (AMask_AllOnes | AMask_Mixed));
- if (IsBPow2)
- MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
- : (BMask_AllOnes | BMask_Mixed));
- return MaskVal;
- }
-
- if (A == C) {
- MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
- : (AMask_NotAllOnes | AMask_NotMixed));
- if (IsAPow2)
- MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
- : (Mask_AllZeros | AMask_Mixed));
- } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
- MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
- }
-
- if (B == C) {
- MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
- : (BMask_NotAllOnes | BMask_NotMixed));
- if (IsBPow2)
- MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
- : (Mask_AllZeros | BMask_Mixed));
- } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
- MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
- }
-
- return MaskVal;
-}
-
-/// Convert an analysis of a masked ICmp into its equivalent if all boolean
-/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
-/// is adjacent to the corresponding normal flag (recording ==), this just
-/// involves swapping those bits over.
-static unsigned conjugateICmpMask(unsigned Mask) {
- unsigned NewMask;
- NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
- AMask_Mixed | BMask_Mixed))
- << 1;
-
- NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
- AMask_NotMixed | BMask_NotMixed))
- >> 1;
-
- return NewMask;
-}
-
-// Adapts the external decomposeBitTestICmp for local use.
-static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred,
- Value *&X, Value *&Y, Value *&Z) {
- APInt Mask;
- if (!llvm::decomposeBitTestICmp(LHS, RHS, Pred, X, Mask))
- return false;
-
- Y = ConstantInt::get(X->getType(), Mask);
- Z = ConstantInt::get(X->getType(), 0);
- return true;
-}
-
-/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
-/// Return the pattern classes (from MaskedICmpType) for the left hand side and
-/// the right hand side as a pair.
-/// LHS and RHS are the left hand side and the right hand side ICmps and PredL
-/// and PredR are their predicates, respectively.
-static
-Optional<std::pair<unsigned, unsigned>>
-getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
- Value *&D, Value *&E, ICmpInst *LHS,
- ICmpInst *RHS,
- ICmpInst::Predicate &PredL,
- ICmpInst::Predicate &PredR) {
- // vectors are not (yet?) supported. Don't support pointers either.
- if (!LHS->getOperand(0)->getType()->isIntegerTy() ||
- !RHS->getOperand(0)->getType()->isIntegerTy())
- return None;
-
- // Here comes the tricky part:
- // LHS might be of the form L11 & L12 == X, X == L21 & L22,
- // and L11 & L12 == L21 & L22. The same goes for RHS.
- // Now we must find those components L** and R**, that are equal, so
- // that we can extract the parameters A, B, C, D, and E for the canonical
-  // form above.
- Value *L1 = LHS->getOperand(0);
- Value *L2 = LHS->getOperand(1);
- Value *L11, *L12, *L21, *L22;
- // Check whether the icmp can be decomposed into a bit test.
- if (decomposeBitTestICmp(L1, L2, PredL, L11, L12, L2)) {
- L21 = L22 = L1 = nullptr;
- } else {
- // Look for ANDs in the LHS icmp.
- if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
- // Any icmp can be viewed as being trivially masked; if it allows us to
- // remove one, it's worth it.
- L11 = L1;
- L12 = Constant::getAllOnesValue(L1->getType());
- }
-
- if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
- L21 = L2;
- L22 = Constant::getAllOnesValue(L2->getType());
- }
- }
-
-  // Bail if LHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(PredL))
- return None;
-
- Value *R1 = RHS->getOperand(0);
- Value *R2 = RHS->getOperand(1);
- Value *R11, *R12;
- bool Ok = false;
- if (decomposeBitTestICmp(R1, R2, PredR, R11, R12, R2)) {
- if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11;
- D = R12;
- } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12;
- D = R11;
- } else {
- return None;
- }
- E = R2;
- R1 = nullptr;
- Ok = true;
- } else {
- if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
- // As before, model no mask as a trivial mask if it'll let us do an
- // optimization.
- R11 = R1;
- R12 = Constant::getAllOnesValue(R1->getType());
- }
-
- if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11;
- D = R12;
- E = R2;
- Ok = true;
- } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12;
- D = R11;
- E = R2;
- Ok = true;
- }
- }
-
-  // Bail if RHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(PredR))
- return None;
-
- // Look for ANDs on the right side of the RHS icmp.
- if (!Ok) {
- if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
- R11 = R2;
- R12 = Constant::getAllOnesValue(R2->getType());
- }
-
- if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11;
- D = R12;
- E = R1;
- Ok = true;
- } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12;
- D = R11;
- E = R1;
- Ok = true;
- } else {
- return None;
- }
- }
- if (!Ok)
- return None;
-
- if (L11 == A) {
- B = L12;
- C = L2;
- } else if (L12 == A) {
- B = L11;
- C = L2;
- } else if (L21 == A) {
- B = L22;
- C = L1;
- } else if (L22 == A) {
- B = L21;
- C = L1;
- }
-
- unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
- unsigned RightType = getMaskedICmpType(A, D, E, PredR);
- return Optional<std::pair<unsigned, unsigned>>(std::make_pair(LeftType, RightType));
-}
-
-/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) into a single
-/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
-/// and the right hand side is of type BMask_Mixed. For example,
-/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
+ assert((isSigned ? Lo.slt(Hi) : Lo.ult(Hi)) &&
+ "Lo is not < Hi in range emission code!");
+
+ Type *Ty = V->getType();
+
+ // V >= Min && V < Hi --> V < Hi
+ // V < Min || V >= Hi --> V >= Hi
+ ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
+ if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) {
+ Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred;
+ return Builder.CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
+ }
+
+ // V >= Lo && V < Hi --> V - Lo u< Hi - Lo
+ // V < Lo || V >= Hi --> V - Lo u>= Hi - Lo
+ Value *VMinusLo =
+ Builder.CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
+ Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo);
+ return Builder.CreateICmp(Pred, VMinusLo, HiMinusLo);
+}
+
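
Illustration (not part of the change above): the "V - Lo u< Hi - Lo" trick can be verified exhaustively at 8 bits, since unsigned wrap-around maps [Lo, Hi) onto [0, Hi - Lo) and everything else beyond it:

#include <cassert>
#include <cstdint>

int main() {
  const std::uint8_t Lo = 37, Hi = 201;   // arbitrary Lo < Hi
  for (unsigned V = 0; V <= 255; ++V) {
    bool Inside = V >= Lo && V < Hi;
    std::uint8_t Off = static_cast<std::uint8_t>(V - Lo); // wraps like 'sub'
    assert(Inside == (Off < static_cast<std::uint8_t>(Hi - Lo)));
  }
  return 0;
}
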
+/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
+/// that can be simplified.
+/// One of A and B is considered the mask. The other is the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum contains
+/// only "Mask", then both A and B can be considered masks. If A is the mask,
+/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
+/// If both A and C are constants, this proof is also easy.
+/// For the following explanations, we assume that A is the mask.
+///
+/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
+/// bits of A are set in B.
+/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
+///
+/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
+/// bits of A are cleared in B.
+/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
+///
+/// "Mixed" declares that (A & B) == C and C might or might not contain any
+/// number of one bits and zero bits.
+/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed
+///
+/// "Not" means that in above descriptions "==" should be replaced by "!=".
+/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
+///
+/// If the mask A contains a single bit, then the following is equivalent:
+/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
+/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
+enum MaskedICmpType {
+ AMask_AllOnes = 1,
+ AMask_NotAllOnes = 2,
+ BMask_AllOnes = 4,
+ BMask_NotAllOnes = 8,
+ Mask_AllZeros = 16,
+ Mask_NotAllZeros = 32,
+ AMask_Mixed = 64,
+ AMask_NotMixed = 128,
+ BMask_Mixed = 256,
+ BMask_NotMixed = 512
+};
+
+/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C)
+/// satisfies.
+static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
+ ICmpInst::Predicate Pred) {
+ ConstantInt *ACst = dyn_cast<ConstantInt>(A);
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+ ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+ bool IsEq = (Pred == ICmpInst::ICMP_EQ);
+ bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
+ bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
+ unsigned MaskVal = 0;
+ if (CCst && CCst->isZero()) {
+ // if C is zero, then both A and B qualify as mask
+ MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
+ : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
+ : (AMask_AllOnes | AMask_Mixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
+ : (BMask_AllOnes | BMask_Mixed));
+ return MaskVal;
+ }
+
+ if (A == C) {
+ MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
+ : (AMask_NotAllOnes | AMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
+ : (Mask_AllZeros | AMask_Mixed));
+ } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
+ MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
+ }
+
+ if (B == C) {
+ MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
+ : (BMask_NotAllOnes | BMask_NotMixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
+ : (Mask_AllZeros | BMask_Mixed));
+ } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
+ MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
+ }
+
+ return MaskVal;
+}
+
+/// Convert an analysis of a masked ICmp into its equivalent if all boolean
+/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
+/// is adjacent to the corresponding normal flag (recording ==), this just
+/// involves swapping those bits over.
+static unsigned conjugateICmpMask(unsigned Mask) {
+ unsigned NewMask;
+ NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
+ AMask_Mixed | BMask_Mixed))
+ << 1;
+
+ NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
+ AMask_NotMixed | BMask_NotMixed))
+ >> 1;
+
+ return NewMask;
+}
+
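
Illustration (not part of the change above): because every NotXXX flag sits one bit above its positive twin, conjugation is a paired bit swap and therefore an involution. A standalone restatement of the same computation:

#include <cassert>

int main() {
  enum {
    AMask_AllOnes = 1,  AMask_NotAllOnes = 2,
    BMask_AllOnes = 4,  BMask_NotAllOnes = 8,
    Mask_AllZeros = 16, Mask_NotAllZeros = 32,
    AMask_Mixed = 64,   AMask_NotMixed = 128,
    BMask_Mixed = 256,  BMask_NotMixed = 512
  };
  auto Conjugate = [](unsigned Mask) {
    unsigned New = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
                            AMask_Mixed | BMask_Mixed)) << 1;
    New |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
                    AMask_NotMixed | BMask_NotMixed)) >> 1;
    return New;
  };
  assert(Conjugate(AMask_AllOnes) == AMask_NotAllOnes);
  assert(Conjugate(Mask_NotAllZeros) == Mask_AllZeros);
  assert(Conjugate(BMask_Mixed | AMask_NotMixed) ==
         (BMask_NotMixed | AMask_Mixed));
  for (unsigned M = 0; M < 1024; ++M)     // conjugation is an involution
    assert(Conjugate(Conjugate(M)) == M);
  return 0;
}
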
+// Adapts the external decomposeBitTestICmp for local use.
+static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred,
+ Value *&X, Value *&Y, Value *&Z) {
+ APInt Mask;
+ if (!llvm::decomposeBitTestICmp(LHS, RHS, Pred, X, Mask))
+ return false;
+
+ Y = ConstantInt::get(X->getType(), Mask);
+ Z = ConstantInt::get(X->getType(), 0);
+ return true;
+}
+
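
Illustration (not part of the change above): the point of decomposing an icmp into a bit test is that sign-style comparisons are single-bit mask checks, which then feed the mask reasoning below. A standalone check of the underlying equivalences (whether the helper recognizes exactly these forms is an implementation detail of llvm::decomposeBitTestICmp):

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    std::int8_t X = static_cast<std::int8_t>(V);
    std::uint8_t U = static_cast<std::uint8_t>(X);
    assert((X < 0)  == ((U & 0x80u) != 0)); // x s< 0   <=>  (x & signbit) != 0
    assert((X > -1) == ((U & 0x80u) == 0)); // x s> -1  <=>  (x & signbit) == 0
  }
  return 0;
}
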
+/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
+/// Return the pattern classes (from MaskedICmpType) for the left hand side and
+/// the right hand side as a pair.
+/// LHS and RHS are the left hand side and the right hand side ICmps and PredL
+/// and PredR are their predicates, respectively.
+static
+Optional<std::pair<unsigned, unsigned>>
+getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+ Value *&D, Value *&E, ICmpInst *LHS,
+ ICmpInst *RHS,
+ ICmpInst::Predicate &PredL,
+ ICmpInst::Predicate &PredR) {
+ // vectors are not (yet?) supported. Don't support pointers either.
+ if (!LHS->getOperand(0)->getType()->isIntegerTy() ||
+ !RHS->getOperand(0)->getType()->isIntegerTy())
+ return None;
+
+ // Here comes the tricky part:
+ // LHS might be of the form L11 & L12 == X, X == L21 & L22,
+ // and L11 & L12 == L21 & L22. The same goes for RHS.
+ // Now we must find those components L** and R**, that are equal, so
+ // that we can extract the parameters A, B, C, D, and E for the canonical
+  // form above.
+ Value *L1 = LHS->getOperand(0);
+ Value *L2 = LHS->getOperand(1);
+ Value *L11, *L12, *L21, *L22;
+ // Check whether the icmp can be decomposed into a bit test.
+ if (decomposeBitTestICmp(L1, L2, PredL, L11, L12, L2)) {
+ L21 = L22 = L1 = nullptr;
+ } else {
+ // Look for ANDs in the LHS icmp.
+ if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+ // Any icmp can be viewed as being trivially masked; if it allows us to
+ // remove one, it's worth it.
+ L11 = L1;
+ L12 = Constant::getAllOnesValue(L1->getType());
+ }
+
+ if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
+ L21 = L2;
+ L22 = Constant::getAllOnesValue(L2->getType());
+ }
+ }
+
+  // Bail if LHS was an icmp that can't be decomposed into an equality.
+ if (!ICmpInst::isEquality(PredL))
+ return None;
+
+ Value *R1 = RHS->getOperand(0);
+ Value *R2 = RHS->getOperand(1);
+ Value *R11, *R12;
+ bool Ok = false;
+ if (decomposeBitTestICmp(R1, R2, PredR, R11, R12, R2)) {
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ } else {
+ return None;
+ }
+ E = R2;
+ R1 = nullptr;
+ Ok = true;
+ } else {
+ if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+ // As before, model no mask as a trivial mask if it'll let us do an
+ // optimization.
+ R11 = R1;
+ R12 = Constant::getAllOnesValue(R1->getType());
+ }
+
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ E = R2;
+ Ok = true;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ E = R2;
+ Ok = true;
+ }
+ }
+
+  // Bail if RHS was an icmp that can't be decomposed into an equality.
+ if (!ICmpInst::isEquality(PredR))
+ return None;
+
+ // Look for ANDs on the right side of the RHS icmp.
+ if (!Ok) {
+ if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+ R11 = R2;
+ R12 = Constant::getAllOnesValue(R2->getType());
+ }
+
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ E = R1;
+ Ok = true;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ E = R1;
+ Ok = true;
+ } else {
+ return None;
+ }
+ }
+ if (!Ok)
+ return None;
+
+ if (L11 == A) {
+ B = L12;
+ C = L2;
+ } else if (L12 == A) {
+ B = L11;
+ C = L2;
+ } else if (L21 == A) {
+ B = L22;
+ C = L1;
+ } else if (L22 == A) {
+ B = L21;
+ C = L1;
+ }
+
+ unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
+ unsigned RightType = getMaskedICmpType(A, D, E, PredR);
+ return Optional<std::pair<unsigned, unsigned>>(std::make_pair(LeftType, RightType));
+}
+
+/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
+/// and the right hand side is of type BMask_Mixed. For example,
+/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C,
Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
InstCombiner::BuilderTy &Builder) {
- // We are given the canonical form:
- // (icmp ne (A & B), 0) & (icmp eq (A & D), E).
- // where D & E == E.
- //
- // If IsAnd is false, we get it in negated form:
- // (icmp eq (A & B), 0) | (icmp ne (A & D), E) ->
- // !((icmp ne (A & B), 0) & (icmp eq (A & D), E)).
- //
-  // We currently handle the case where B, C, D, and E are constant.
- //
+ // We are given the canonical form:
+ // (icmp ne (A & B), 0) & (icmp eq (A & D), E).
+ // where D & E == E.
+ //
+ // If IsAnd is false, we get it in negated form:
+ // (icmp eq (A & B), 0) | (icmp ne (A & D), E) ->
+ // !((icmp ne (A & B), 0) & (icmp eq (A & D), E)).
+ //
+  // We currently handle the case where B, C, D, and E are constant.
+ //
ConstantInt *BCst, *CCst, *DCst, *ECst;
if (!match(B, m_ConstantInt(BCst)) || !match(C, m_ConstantInt(CCst)) ||
!match(D, m_ConstantInt(DCst)) || !match(E, m_ConstantInt(ECst)))
- return nullptr;
-
- ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
-
- // Update E to the canonical form when D is a power of two and RHS is
- // canonicalized as,
- // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or
- // (icmp ne (A & D), D) -> (icmp eq (A & D), 0).
- if (PredR != NewCC)
- ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
-
-  // If B or D is zero, skip: LHS or RHS can then be trivially folded by
-  // other folding rules, and this pattern won't apply any more.
- if (BCst->getValue() == 0 || DCst->getValue() == 0)
- return nullptr;
-
- // If B and D don't intersect, ie. (B & D) == 0, no folding because we can't
- // deduce anything from it.
- // For example,
- // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding.
- if ((BCst->getValue() & DCst->getValue()) == 0)
- return nullptr;
-
- // If the following two conditions are met:
- //
- // 1. mask B covers only a single bit that's not covered by mask D, that is,
- // (B & (B ^ D)) is a power of 2 (in other words, B minus the intersection of
- // B and D has only one bit set) and,
- //
- // 2. RHS (and E) indicates that the rest of B's bits are zero (in other
- // words, the intersection of B and D is zero), that is, ((B & D) & E) == 0
- //
- // then that single bit in B must be one and thus the whole expression can be
- // folded to
- // (A & (B | D)) == (B & (B ^ D)) | E.
- //
- // For example,
- // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
- // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
- if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) &&
- (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) {
- APInt BorD = BCst->getValue() | DCst->getValue();
- APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) |
- ECst->getValue();
- Value *NewMask = ConstantInt::get(BCst->getType(), BorD);
- Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE);
- Value *NewAnd = Builder.CreateAnd(A, NewMask);
- return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue);
- }
-
- auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
- return (C1->getValue() & C2->getValue()) == C1->getValue();
- };
- auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
- return (C1->getValue() & C2->getValue()) == C2->getValue();
- };
-
- // In the following, we consider only the cases where B is a superset of D, B
- // is a subset of D, or B == D because otherwise there's at least one bit
- // covered by B but not D, in which case we can't deduce much from it, so
- // no folding (aside from the single must-be-one bit case right above.)
- // For example,
- // (icmp ne (A & 14), 0) & (icmp eq (A & 3), 1) -> no folding.
- if (!IsSubSetOrEqual(BCst, DCst) && !IsSuperSetOrEqual(BCst, DCst))
- return nullptr;
-
- // At this point, either B is a superset of D, B is a subset of D or B == D.
-
- // If E is zero, if B is a subset of (or equal to) D, LHS and RHS contradict
- // and the whole expression becomes false (or true if negated), otherwise, no
- // folding.
- // For example,
- // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false.
- // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding.
- if (ECst->isZero()) {
- if (IsSubSetOrEqual(BCst, DCst))
- return ConstantInt::get(LHS->getType(), !IsAnd);
- return nullptr;
- }
-
- // At this point, B, D, E aren't zero and (B & D) == B, (B & D) == D or B ==
- // D. If B is a superset of (or equal to) D, since E is not zero, LHS is
- // subsumed by RHS (RHS implies LHS.) So the whole expression becomes
- // RHS. For example,
- // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
- // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
- if (IsSuperSetOrEqual(BCst, DCst))
- return RHS;
- // Otherwise, B is a subset of D. If B and E have a common bit set,
- // ie. (B & E) != 0, then LHS is subsumed by RHS. For example.
- // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
- assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code");
- if ((BCst->getValue() & ECst->getValue()) != 0)
- return RHS;
- // Otherwise, LHS and RHS contradict and the whole expression becomes false
- // (or true if negated.) For example,
- // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false.
- // (icmp ne (A & 6), 0) & (icmp eq (A & 15), 8) -> false.
- return ConstantInt::get(LHS->getType(), !IsAnd);
-}
-
-/// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single
-/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side
-/// aren't of the common mask pattern type.
-static Value *foldLogOpOfMaskedICmpsAsymmetric(
+ return nullptr;
+
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+
+ // Update E to the canonical form when D is a power of two and RHS is
+ // canonicalized as,
+ // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or
+ // (icmp ne (A & D), D) -> (icmp eq (A & D), 0).
+ if (PredR != NewCC)
+ ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+  // If B or D is zero, skip: LHS or RHS can then be trivially folded by
+  // other folding rules, and this pattern won't apply any more.
+ if (BCst->getValue() == 0 || DCst->getValue() == 0)
+ return nullptr;
+
+ // If B and D don't intersect, ie. (B & D) == 0, no folding because we can't
+ // deduce anything from it.
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if ((BCst->getValue() & DCst->getValue()) == 0)
+ return nullptr;
+
+ // If the following two conditions are met:
+ //
+ // 1. mask B covers only a single bit that's not covered by mask D, that is,
+ // (B & (B ^ D)) is a power of 2 (in other words, B minus the intersection of
+ // B and D has only one bit set) and,
+ //
+ // 2. RHS (and E) indicates that the rest of B's bits are zero (in other
+ // words, the intersection of B and D is zero), that is, ((B & D) & E) == 0
+ //
+ // then that single bit in B must be one and thus the whole expression can be
+ // folded to
+ // (A & (B | D)) == (B & (B ^ D)) | E.
+ //
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
+ if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) &&
+ (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) {
+ APInt BorD = BCst->getValue() | DCst->getValue();
+ APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) |
+ ECst->getValue();
+ Value *NewMask = ConstantInt::get(BCst->getType(), BorD);
+ Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE);
+ Value *NewAnd = Builder.CreateAnd(A, NewMask);
+ return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue);
+ }
+
+ auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C1->getValue();
+ };
+ auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C2->getValue();
+ };
+
+ // In the following, we consider only the cases where B is a superset of D, B
+ // is a subset of D, or B == D because otherwise there's at least one bit
+ // covered by B but not D, in which case we can't deduce much from it, so
+ // no folding (aside from the single must-be-one bit case right above.)
+ // For example,
+ // (icmp ne (A & 14), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if (!IsSubSetOrEqual(BCst, DCst) && !IsSuperSetOrEqual(BCst, DCst))
+ return nullptr;
+
+ // At this point, either B is a superset of D, B is a subset of D or B == D.
+
+ // If E is zero, if B is a subset of (or equal to) D, LHS and RHS contradict
+ // and the whole expression becomes false (or true if negated), otherwise, no
+ // folding.
+ // For example,
+ // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false.
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding.
+ if (ECst->isZero()) {
+ if (IsSubSetOrEqual(BCst, DCst))
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+ return nullptr;
+ }
+
+ // At this point, B, D, E aren't zero and (B & D) == B, (B & D) == D or B ==
+ // D. If B is a superset of (or equal to) D, since E is not zero, LHS is
+ // subsumed by RHS (RHS implies LHS.) So the whole expression becomes
+ // RHS. For example,
+ // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ if (IsSuperSetOrEqual(BCst, DCst))
+ return RHS;
+ // Otherwise, B is a subset of D. If B and E have a common bit set,
+ // ie. (B & E) != 0, then LHS is subsumed by RHS. For example.
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code");
+ if ((BCst->getValue() & ECst->getValue()) != 0)
+ return RHS;
+ // Otherwise, LHS and RHS contradict and the whole expression becomes false
+ // (or true if negated.) For example,
+ // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false.
+ // (icmp ne (A & 6), 0) & (icmp eq (A & 15), 8) -> false.
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+}
+
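
Illustration (not part of the change above): the concrete folds quoted in the comments can be verified exhaustively over 8-bit values of A:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
    assert((((A & 12) != 0) && ((A & 7) == 1)) == ((A & 15) == 9));
    // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
    assert((((A & 15) != 0) && ((A & 7) == 0)) == ((A & 15) == 8));
    // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false
    assert(!(((A & 7) != 0) && ((A & 15) == 8)));
  }
  return 0;
}
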
+/// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side
+/// aren't of the common mask pattern type.
+static Value *foldLogOpOfMaskedICmpsAsymmetric(
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C,
Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
unsigned LHSMask, unsigned RHSMask, InstCombiner::BuilderTy &Builder) {
- assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
- "Expected equality predicates for masked type of icmps.");
- // Handle Mask_NotAllZeros-BMask_Mixed cases.
- // (icmp ne/eq (A & B), C) &/| (icmp eq/ne (A & D), E), or
- // (icmp eq/ne (A & B), C) &/| (icmp ne/eq (A & D), E)
- // which gets swapped to
- // (icmp ne/eq (A & D), E) &/| (icmp eq/ne (A & B), C).
- if (!IsAnd) {
- LHSMask = conjugateICmpMask(LHSMask);
- RHSMask = conjugateICmpMask(RHSMask);
- }
- if ((LHSMask & Mask_NotAllZeros) && (RHSMask & BMask_Mixed)) {
- if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
- LHS, RHS, IsAnd, A, B, C, D, E,
- PredL, PredR, Builder)) {
- return V;
- }
- } else if ((LHSMask & BMask_Mixed) && (RHSMask & Mask_NotAllZeros)) {
- if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
- RHS, LHS, IsAnd, A, D, E, B, C,
- PredR, PredL, Builder)) {
- return V;
- }
- }
- return nullptr;
-}
-
-/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
-/// into a single (icmp(A & X) ==/!= Y).
-static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
+ // Handle Mask_NotAllZeros-BMask_Mixed cases.
+ // (icmp ne/eq (A & B), C) &/| (icmp eq/ne (A & D), E), or
+ // (icmp eq/ne (A & B), C) &/| (icmp ne/eq (A & D), E)
+ // which gets swapped to
+ // (icmp ne/eq (A & D), E) &/| (icmp eq/ne (A & B), C).
+ if (!IsAnd) {
+ LHSMask = conjugateICmpMask(LHSMask);
+ RHSMask = conjugateICmpMask(RHSMask);
+ }
+ if ((LHSMask & Mask_NotAllZeros) && (RHSMask & BMask_Mixed)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ LHS, RHS, IsAnd, A, B, C, D, E,
+ PredL, PredR, Builder)) {
+ return V;
+ }
+ } else if ((LHSMask & BMask_Mixed) && (RHSMask & Mask_NotAllZeros)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ RHS, LHS, IsAnd, A, D, E, B, C,
+ PredR, PredL, Builder)) {
+ return V;
+ }
+ }
+ return nullptr;
+}
+
+/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
+/// into a single (icmp(A & X) ==/!= Y).
+static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
InstCombiner::BuilderTy &Builder) {
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- Optional<std::pair<unsigned, unsigned>> MaskPair =
- getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
- if (!MaskPair)
- return nullptr;
- assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
- "Expected equality predicates for masked type of icmps.");
- unsigned LHSMask = MaskPair->first;
- unsigned RHSMask = MaskPair->second;
- unsigned Mask = LHSMask & RHSMask;
- if (Mask == 0) {
- // Even if the two sides don't share a common pattern, check if folding can
- // still happen.
- if (Value *V = foldLogOpOfMaskedICmpsAsymmetric(
- LHS, RHS, IsAnd, A, B, C, D, E, PredL, PredR, LHSMask, RHSMask,
- Builder))
- return V;
- return nullptr;
- }
-
- // In full generality:
- // (icmp (A & B) Op C) | (icmp (A & D) Op E)
- // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
- //
- // If the latter can be converted into (icmp (A & X) Op Y) then the former is
- // equivalent to (icmp (A & X) !Op Y).
- //
- // Therefore, we can pretend for the rest of this function that we're dealing
- // with the conjunction, provided we flip the sense of any comparisons (both
- // input and output).
-
- // In most cases we're going to produce an EQ for the "&&" case.
- ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
- if (!IsAnd) {
- // Convert the masking analysis into its equivalent with negated
- // comparisons.
- Mask = conjugateICmpMask(Mask);
- }
-
- if (Mask & Mask_AllZeros) {
- // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
- // -> (icmp eq (A & (B|D)), 0)
- Value *NewOr = Builder.CreateOr(B, D);
- Value *NewAnd = Builder.CreateAnd(A, NewOr);
- // We can't use C as zero because we might actually handle
- // (icmp ne (A & B), B) & (icmp ne (A & D), D)
- // with B and D, having a single bit set.
- Value *Zero = Constant::getNullValue(A->getType());
- return Builder.CreateICmp(NewCC, NewAnd, Zero);
- }
- if (Mask & BMask_AllOnes) {
- // (icmp eq (A & B), B) & (icmp eq (A & D), D)
- // -> (icmp eq (A & (B|D)), (B|D))
- Value *NewOr = Builder.CreateOr(B, D);
- Value *NewAnd = Builder.CreateAnd(A, NewOr);
- return Builder.CreateICmp(NewCC, NewAnd, NewOr);
- }
- if (Mask & AMask_AllOnes) {
- // (icmp eq (A & B), A) & (icmp eq (A & D), A)
- // -> (icmp eq (A & (B&D)), A)
- Value *NewAnd1 = Builder.CreateAnd(B, D);
- Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1);
- return Builder.CreateICmp(NewCC, NewAnd2, A);
- }
-
- // Remaining cases assume at least that B and D are constant, and depend on
- // their actual values. This isn't strictly necessary, just a "handle the
- // easy cases for now" decision.
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ Optional<std::pair<unsigned, unsigned>> MaskPair =
+ getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
+ if (!MaskPair)
+ return nullptr;
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
+ unsigned LHSMask = MaskPair->first;
+ unsigned RHSMask = MaskPair->second;
+ unsigned Mask = LHSMask & RHSMask;
+ if (Mask == 0) {
+ // Even if the two sides don't share a common pattern, check if folding can
+ // still happen.
+ if (Value *V = foldLogOpOfMaskedICmpsAsymmetric(
+ LHS, RHS, IsAnd, A, B, C, D, E, PredL, PredR, LHSMask, RHSMask,
+ Builder))
+ return V;
+ return nullptr;
+ }
+
+ // In full generality:
+ // (icmp (A & B) Op C) | (icmp (A & D) Op E)
+ // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
+ //
+ // If the latter can be converted into (icmp (A & X) Op Y) then the former is
+ // equivalent to (icmp (A & X) !Op Y).
+ //
+ // Therefore, we can pretend for the rest of this function that we're dealing
+ // with the conjunction, provided we flip the sense of any comparisons (both
+ // input and output).
+
+ // In most cases we're going to produce an EQ for the "&&" case.
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ if (!IsAnd) {
+ // Convert the masking analysis into its equivalent with negated
+ // comparisons.
+ Mask = conjugateICmpMask(Mask);
+ }
+
+ if (Mask & Mask_AllZeros) {
+ // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
+ // -> (icmp eq (A & (B|D)), 0)
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
+ // We can't use C as zero because we might actually handle
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    // with B and D having a single bit set.
+ Value *Zero = Constant::getNullValue(A->getType());
+ return Builder.CreateICmp(NewCC, NewAnd, Zero);
+ }
+ if (Mask & BMask_AllOnes) {
+ // (icmp eq (A & B), B) & (icmp eq (A & D), D)
+ // -> (icmp eq (A & (B|D)), (B|D))
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr);
+ }
+ if (Mask & AMask_AllOnes) {
+ // (icmp eq (A & B), A) & (icmp eq (A & D), A)
+ // -> (icmp eq (A & (B&D)), A)
+ Value *NewAnd1 = Builder.CreateAnd(B, D);
+ Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1);
+ return Builder.CreateICmp(NewCC, NewAnd2, A);
+ }
+
+ // Remaining cases assume at least that B and D are constant, and depend on
+ // their actual values. This isn't strictly necessary, just a "handle the
+ // easy cases for now" decision.
ConstantInt *BCst, *DCst;
if (!match(B, m_ConstantInt(BCst)) || !match(D, m_ConstantInt(DCst)))
- return nullptr;
-
- if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
- // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
- // (icmp ne (A & B), B) & (icmp ne (A & D), D)
- // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
- // Only valid if one of the masks is a superset of the other (check "B&D" is
- // the same as either B or D).
- APInt NewMask = BCst->getValue() & DCst->getValue();
-
- if (NewMask == BCst->getValue())
- return LHS;
- else if (NewMask == DCst->getValue())
- return RHS;
- }
-
- if (Mask & AMask_NotAllOnes) {
- // (icmp ne (A & B), B) & (icmp ne (A & D), D)
- // -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
- // Only valid if one of the masks is a superset of the other (check "B|D" is
- // the same as either B or D).
- APInt NewMask = BCst->getValue() | DCst->getValue();
-
- if (NewMask == BCst->getValue())
- return LHS;
- else if (NewMask == DCst->getValue())
- return RHS;
- }
-
- if (Mask & BMask_Mixed) {
- // (icmp eq (A & B), C) & (icmp eq (A & D), E)
- // We already know that B & C == C && D & E == E.
- // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
- // C and E, which are shared by both the mask B and the mask D, don't
- // contradict, then we can transform to
- // -> (icmp eq (A & (B|D)), (C|E))
- // Currently, we only handle the case of B, C, D, and E being constant.
- // We can't simply use C and E because we might actually handle
- // (icmp ne (A & B), B) & (icmp eq (A & D), D)
-    // with B and D having a single bit set.
+ return nullptr;
+
+ if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
+ // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+ // Only valid if one of the masks is a superset of the other (check "B&D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() & DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
+
+ if (Mask & AMask_NotAllOnes) {
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+ // Only valid if one of the masks is a superset of the other (check "B|D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() | DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
+
+ if (Mask & BMask_Mixed) {
+ // (icmp eq (A & B), C) & (icmp eq (A & D), E)
+ // We already know that B & C == C && D & E == E.
+ // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
+ // C and E, which are shared by both the mask B and the mask D, don't
+ // contradict, then we can transform to
+ // -> (icmp eq (A & (B|D)), (C|E))
+ // Currently, we only handle the case of B, C, D, and E being constant.
+ // We can't simply use C and E because we might actually handle
+ // (icmp ne (A & B), B) & (icmp eq (A & D), D)
+    // with B and D having a single bit set.
ConstantInt *CCst, *ECst;
if (!match(C, m_ConstantInt(CCst)) || !match(E, m_ConstantInt(ECst)))
- return nullptr;
- if (PredL != NewCC)
- CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
- if (PredR != NewCC)
- ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
-
- // If there is a conflict, we should actually return a false for the
- // whole construct.
- if (((BCst->getValue() & DCst->getValue()) &
- (CCst->getValue() ^ ECst->getValue())).getBoolValue())
- return ConstantInt::get(LHS->getType(), !IsAnd);
-
- Value *NewOr1 = Builder.CreateOr(B, D);
- Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
- Value *NewAnd = Builder.CreateAnd(A, NewOr1);
- return Builder.CreateICmp(NewCC, NewAnd, NewOr2);
- }
-
- return nullptr;
-}
-
-/// Try to fold a signed range check with lower bound 0 to an unsigned icmp.
-/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
-/// If \p Inverted is true then the check is for the inverted range, e.g.
-/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+ return nullptr;
+ if (PredL != NewCC)
+ CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
+ if (PredR != NewCC)
+ ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+ // If there is a conflict, we should actually return a false for the
+ // whole construct.
+ if (((BCst->getValue() & DCst->getValue()) &
+ (CCst->getValue() ^ ECst->getValue())).getBoolValue())
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+
+ Value *NewOr1 = Builder.CreateOr(B, D);
+ Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr1);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr2);
+ }
+
+ return nullptr;
+}
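+
+// Standalone illustrative sketch (the helper name and the masks 0x30/0x0C are
+// made up; not part of the original sources): the two simplest identities
+// folded above, brute-forced over all 16-bit inputs.
+#include <cassert>
+#include <cstdint>
+static void checkMaskedICmpFoldExamples() {
+  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
+    // Mask_AllZeros: (A & B) == 0 && (A & D) == 0  <=>  (A & (B|D)) == 0
+    assert((((X & 0x30) == 0) && ((X & 0x0C) == 0)) == ((X & 0x3C) == 0));
+    // BMask_AllOnes: (A & B) == B && (A & D) == D  <=>  (A & (B|D)) == (B|D)
+    assert((((X & 0x30) == 0x30) && ((X & 0x0C) == 0x0C)) ==
+           ((X & 0x3C) == 0x3C));
+  }
+}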
+
+/// Try to fold a signed range check with lower bound 0 to an unsigned icmp.
+/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+/// If \p Inverted is true then the check is for the inverted range, e.g.
+/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
bool Inverted) {
- // Check the lower range comparison, e.g. x >= 0
- // InstCombine already ensured that if there is a constant it's on the RHS.
- ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1));
- if (!RangeStart)
- return nullptr;
-
- ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() :
- Cmp0->getPredicate());
-
- // Accept x > -1 or x >= 0 (after potentially inverting the predicate).
- if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) ||
- (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero())))
- return nullptr;
-
- ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() :
- Cmp1->getPredicate());
-
- Value *Input = Cmp0->getOperand(0);
- Value *RangeEnd;
- if (Cmp1->getOperand(0) == Input) {
- // For the upper range compare we have: icmp x, n
- RangeEnd = Cmp1->getOperand(1);
- } else if (Cmp1->getOperand(1) == Input) {
- // For the upper range compare we have: icmp n, x
- RangeEnd = Cmp1->getOperand(0);
- Pred1 = ICmpInst::getSwappedPredicate(Pred1);
- } else {
- return nullptr;
- }
-
- // Check the upper range comparison, e.g. x < n
- ICmpInst::Predicate NewPred;
- switch (Pred1) {
- case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break;
- case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break;
- default: return nullptr;
- }
-
- // This simplification is only valid if the upper range is not negative.
- KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
- if (!Known.isNonNegative())
- return nullptr;
-
- if (Inverted)
- NewPred = ICmpInst::getInversePredicate(NewPred);
-
- return Builder.CreateICmp(NewPred, Input, RangeEnd);
-}
-
-static Value *
-foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
- bool JoinedByAnd,
- InstCombiner::BuilderTy &Builder) {
- Value *X = LHS->getOperand(0);
- if (X != RHS->getOperand(0))
- return nullptr;
-
- const APInt *C1, *C2;
- if (!match(LHS->getOperand(1), m_APInt(C1)) ||
- !match(RHS->getOperand(1), m_APInt(C2)))
- return nullptr;
-
- // We only handle (X != C1 && X != C2) and (X == C1 || X == C2).
- ICmpInst::Predicate Pred = LHS->getPredicate();
- if (Pred != RHS->getPredicate())
- return nullptr;
- if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
- return nullptr;
- if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
- return nullptr;
-
- // The larger unsigned constant goes on the right.
- if (C1->ugt(*C2))
- std::swap(C1, C2);
-
- APInt Xor = *C1 ^ *C2;
- if (Xor.isPowerOf2()) {
- // If LHSC and RHSC differ by only one bit, then set that bit in X and
- // compare against the larger constant:
- // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2
- // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2
- // We choose an 'or' with a Pow2 constant rather than the inverse mask with
- // 'and' because that may lead to smaller codegen from a smaller constant.
- Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor));
- return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
- }
-
- // Special case: get the ordering right when the values wrap around zero.
-  // I.e., we assumed the constants were unsigned when swapping earlier.
- if (C1->isNullValue() && C2->isAllOnesValue())
- std::swap(C1, C2);
-
- if (*C1 == *C2 - 1) {
- // (X == 13 || X == 14) --> X - 13 <=u 1
- // (X != 13 && X != 14) --> X - 13 >u 1
- // An 'add' is the canonical IR form, so favor that over a 'sub'.
- Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
- auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
- return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
- }
-
- return nullptr;
-}
-
-// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
-// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+ // Check the lower range comparison, e.g. x >= 0
+ // InstCombine already ensured that if there is a constant it's on the RHS.
+ ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1));
+ if (!RangeStart)
+ return nullptr;
+
+ ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() :
+ Cmp0->getPredicate());
+
+ // Accept x > -1 or x >= 0 (after potentially inverting the predicate).
+ if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) ||
+ (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero())))
+ return nullptr;
+
+ ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() :
+ Cmp1->getPredicate());
+
+ Value *Input = Cmp0->getOperand(0);
+ Value *RangeEnd;
+ if (Cmp1->getOperand(0) == Input) {
+ // For the upper range compare we have: icmp x, n
+ RangeEnd = Cmp1->getOperand(1);
+ } else if (Cmp1->getOperand(1) == Input) {
+ // For the upper range compare we have: icmp n, x
+ RangeEnd = Cmp1->getOperand(0);
+ Pred1 = ICmpInst::getSwappedPredicate(Pred1);
+ } else {
+ return nullptr;
+ }
+
+ // Check the upper range comparison, e.g. x < n
+ ICmpInst::Predicate NewPred;
+ switch (Pred1) {
+ case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break;
+ case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break;
+ default: return nullptr;
+ }
+
+ // This simplification is only valid if the upper range is not negative.
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
+ return nullptr;
+
+ if (Inverted)
+ NewPred = ICmpInst::getInversePredicate(NewPred);
+
+ return Builder.CreateICmp(NewPred, Input, RangeEnd);
+}
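+
+// Self-contained sketch (the helper name and the bound 1000 are made up; not
+// part of the original sources): the signed-to-unsigned range-check identity
+// above, (X s>= 0) && (X s< N)  <=>  X u< N for a non-negative N,
+// brute-forced over all 16-bit bit patterns.
+#include <cassert>
+#include <cstdint>
+static void checkRangeCheckExample() {
+  const int N = 1000; // upper bound; must be non-negative for the fold
+  for (uint32_t Bits = 0; Bits <= 0xFFFF; ++Bits) {
+    // Decode the 16-bit pattern as a signed (two's complement) value.
+    int X = (Bits < 0x8000) ? static_cast<int>(Bits)
+                            : static_cast<int>(Bits) - 0x10000;
+    bool SignedCheck = (X >= 0) && (X < N);               // (sge, slt) pair
+    bool UnsignedCheck = Bits < static_cast<uint32_t>(N); // icmp ult x, n
+    assert(SignedCheck == UnsignedCheck);
+  }
+}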
+
+static Value *
+foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X = LHS->getOperand(0);
+ if (X != RHS->getOperand(0))
+ return nullptr;
+
+ const APInt *C1, *C2;
+ if (!match(LHS->getOperand(1), m_APInt(C1)) ||
+ !match(RHS->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // We only handle (X != C1 && X != C2) and (X == C1 || X == C2).
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // The larger unsigned constant goes on the right.
+ if (C1->ugt(*C2))
+ std::swap(C1, C2);
+
+ APInt Xor = *C1 ^ *C2;
+ if (Xor.isPowerOf2()) {
+ // If LHSC and RHSC differ by only one bit, then set that bit in X and
+ // compare against the larger constant:
+ // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2
+ // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2
+ // We choose an 'or' with a Pow2 constant rather than the inverse mask with
+ // 'and' because that may lead to smaller codegen from a smaller constant.
+ Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor));
+ return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
+ }
+
+ // Special case: get the ordering right when the values wrap around zero.
+  // I.e., we assumed the constants were unsigned when swapping earlier.
+ if (C1->isNullValue() && C2->isAllOnesValue())
+ std::swap(C1, C2);
+
+ if (*C1 == *C2 - 1) {
+ // (X == 13 || X == 14) --> X - 13 <=u 1
+ // (X != 13 && X != 14) --> X - 13 >u 1
+ // An 'add' is the canonical IR form, so favor that over a 'sub'.
+ Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
+ return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
+ }
+
+ return nullptr;
+}
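+
+// Self-contained sketch (the helper name and the constants 8, 12, 13, 14 are
+// made up; not part of the original sources): the two constant-pair folds
+// above, brute-forced over all 16-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkEqualityCmpsWithConstantsExamples() {
+  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
+    uint16_t X = static_cast<uint16_t>(V);
+    // C1 ^ C2 is a power of two: (X == 8 || X == 12)  <=>  (X | 4) == 12
+    assert(((X == 8) || (X == 12)) == ((X | 4) == 12));
+    // Consecutive constants: (X == 13 || X == 14)  <=>  (X - 13) u<= 1
+    assert(((X == 13) || (X == 14)) ==
+           (static_cast<uint16_t>(X - 13) <= 1));
+  }
+}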
+
+// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS,
ICmpInst *RHS,
BinaryOperator &Logic) {
- bool JoinedByAnd = Logic.getOpcode() == Instruction::And;
- assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) &&
- "Wrong opcode");
- ICmpInst::Predicate Pred = LHS->getPredicate();
- if (Pred != RHS->getPredicate())
- return nullptr;
- if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
- return nullptr;
- if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
- return nullptr;
-
+ bool JoinedByAnd = Logic.getOpcode() == Instruction::And;
+ assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) &&
+ "Wrong opcode");
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
if (!match(LHS->getOperand(1), m_Zero()) ||
!match(RHS->getOperand(1), m_Zero()))
- return nullptr;
-
- Value *A, *B, *C, *D;
- if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
- match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
- if (A == D || B == D)
- std::swap(C, D);
- if (B == C)
- std::swap(A, B);
-
- if (A == C &&
- isKnownToBeAPowerOfTwo(B, false, 0, &Logic) &&
- isKnownToBeAPowerOfTwo(D, false, 0, &Logic)) {
- Value *Mask = Builder.CreateOr(B, D);
- Value *Masked = Builder.CreateAnd(A, Mask);
- auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
- return Builder.CreateICmp(NewPred, Masked, Mask);
- }
- }
-
- return nullptr;
-}
-
-/// General pattern:
-/// X & Y
-///
-/// Where Y is checking that all the high bits (covered by a mask 4294967168)
-/// are uniform, i.e. %arg & 4294967168 can be either 4294967168 or 0
-/// Pattern can be one of:
-/// %t = add i32 %arg, 128
-/// %r = icmp ult i32 %t, 256
-/// Or
-/// %t0 = shl i32 %arg, 24
-/// %t1 = ashr i32 %t0, 24
-/// %r = icmp eq i32 %t1, %arg
-/// Or
-/// %t0 = trunc i32 %arg to i8
-/// %t1 = sext i8 %t0 to i32
-/// %r = icmp eq i32 %t1, %arg
-/// This pattern is a signed truncation check.
-///
-/// And X is checking that some bit in that same mask is zero.
-/// I.e. can be one of:
-/// %r = icmp sgt i32 %arg, -1
-/// Or
-/// %t = and i32 %arg, 2147483648
-/// %r = icmp eq i32 %t, 0
-///
-/// Since we are checking that all the bits in that mask are the same,
-/// and a particular bit is zero, what we are really checking is that all the
-/// masked bits are zero.
-/// So this should be transformed to:
-/// %r = icmp ult i32 %arg, 128
-static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
- Instruction &CxtI,
- InstCombiner::BuilderTy &Builder) {
- assert(CxtI.getOpcode() == Instruction::And);
-
- // Match icmp ult (add %arg, C01), C1 (C1 == C01 << 1; powers of two)
- auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X,
- APInt &SignBitMask) -> bool {
- CmpInst::Predicate Pred;
- const APInt *I01, *I1; // powers of two; I1 == I01 << 1
- if (!(match(ICmp,
- m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) &&
- Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1))
- return false;
- // Which bit is the new sign bit as per the 'signed truncation' pattern?
- SignBitMask = *I01;
- return true;
- };
-
- // One icmp needs to be 'signed truncation check'.
- // We need to match this first, else we will mismatch commutative cases.
- Value *X1;
- APInt HighestBit;
- ICmpInst *OtherICmp;
- if (tryToMatchSignedTruncationCheck(ICmp1, X1, HighestBit))
- OtherICmp = ICmp0;
- else if (tryToMatchSignedTruncationCheck(ICmp0, X1, HighestBit))
- OtherICmp = ICmp1;
- else
- return nullptr;
-
- assert(HighestBit.isPowerOf2() && "expected to be power of two (non-zero)");
-
- // Try to match/decompose into: icmp eq (X & Mask), 0
- auto tryToDecompose = [](ICmpInst *ICmp, Value *&X,
- APInt &UnsetBitsMask) -> bool {
- CmpInst::Predicate Pred = ICmp->getPredicate();
- // Can it be decomposed into icmp eq (X & Mask), 0 ?
- if (llvm::decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1),
- Pred, X, UnsetBitsMask,
- /*LookThroughTrunc=*/false) &&
- Pred == ICmpInst::ICMP_EQ)
- return true;
- // Is it icmp eq (X & Mask), 0 already?
- const APInt *Mask;
- if (match(ICmp, m_ICmp(Pred, m_And(m_Value(X), m_APInt(Mask)), m_Zero())) &&
- Pred == ICmpInst::ICMP_EQ) {
- UnsetBitsMask = *Mask;
- return true;
- }
- return false;
- };
-
- // And the other icmp needs to be decomposable into a bit test.
- Value *X0;
- APInt UnsetBitsMask;
- if (!tryToDecompose(OtherICmp, X0, UnsetBitsMask))
- return nullptr;
-
- assert(!UnsetBitsMask.isNullValue() && "empty mask makes no sense.");
-
- // Are they working on the same value?
- Value *X;
- if (X1 == X0) {
- // Ok as is.
- X = X1;
- } else if (match(X0, m_Trunc(m_Specific(X1)))) {
- UnsetBitsMask = UnsetBitsMask.zext(X1->getType()->getScalarSizeInBits());
- X = X1;
- } else
- return nullptr;
-
- // So which bits should be uniform as per the 'signed truncation check'?
- // (all the bits starting with (i.e. including) HighestBit)
- APInt SignBitsMask = ~(HighestBit - 1U);
-
- // UnsetBitsMask must have some common bits with SignBitsMask,
- if (!UnsetBitsMask.intersects(SignBitsMask))
- return nullptr;
-
- // Does UnsetBitsMask contain any bits outside of SignBitsMask?
- if (!UnsetBitsMask.isSubsetOf(SignBitsMask)) {
- APInt OtherHighestBit = (~UnsetBitsMask) + 1U;
- if (!OtherHighestBit.isPowerOf2())
- return nullptr;
- HighestBit = APIntOps::umin(HighestBit, OtherHighestBit);
- }
- // Else, if it does not, then all is ok as-is.
-
- // %r = icmp ult %X, SignBit
- return Builder.CreateICmpULT(X, ConstantInt::get(X->getType(), HighestBit),
- CxtI.getName() + ".simplified");
-}
-
-/// Reduce a pair of compares that check if a value has exactly 1 bit set.
-static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd,
- InstCombiner::BuilderTy &Builder) {
- // Handle 'and' / 'or' commutation: make the equality check the first operand.
- if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE)
- std::swap(Cmp0, Cmp1);
- else if (!JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_EQ)
- std::swap(Cmp0, Cmp1);
-
- // (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1
- CmpInst::Predicate Pred0, Pred1;
- Value *X;
- if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
- match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
- m_SpecificInt(2))) &&
- Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) {
- Value *CtPop = Cmp1->getOperand(0);
- return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1));
- }
- // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1
- if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
- match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
- m_SpecificInt(1))) &&
- Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) {
- Value *CtPop = Cmp1->getOperand(0);
- return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1));
- }
- return nullptr;
-}
-
-/// Commuted variants are assumed to be handled by calling this function again
-/// with the parameters swapped.
-static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
- ICmpInst *UnsignedICmp, bool IsAnd,
- const SimplifyQuery &Q,
- InstCombiner::BuilderTy &Builder) {
- Value *ZeroCmpOp;
- ICmpInst::Predicate EqPred;
- if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) ||
- !ICmpInst::isEquality(EqPred))
- return nullptr;
-
- auto IsKnownNonZero = [&](Value *V) {
- return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
- };
-
- ICmpInst::Predicate UnsignedPred;
-
- Value *A, *B;
- if (match(UnsignedICmp,
- m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) &&
- match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) &&
- (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) {
- auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) {
- if (!IsKnownNonZero(NonZero))
- std::swap(NonZero, Other);
- return IsKnownNonZero(NonZero);
- };
-
- // Given ZeroCmpOp = (A + B)
- // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A
- // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A
- //
- // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff
- // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff
- // with X being the value (A/B) that is known to be non-zero,
-    // and Y being the remaining value.
- if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
- IsAnd)
- return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
- if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE &&
- IsAnd && GetKnownNonZeroAndOther(B, A))
- return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
- if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
- !IsAnd)
- return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
- if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ &&
- !IsAnd && GetKnownNonZeroAndOther(B, A))
- return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
- }
-
- Value *Base, *Offset;
- if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset))))
- return nullptr;
-
- if (!match(UnsignedICmp,
- m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) ||
- !ICmpInst::isUnsigned(UnsignedPred))
- return nullptr;
-
- // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset
- // (no overflow and not null)
- if ((UnsignedPred == ICmpInst::ICMP_UGE ||
- UnsignedPred == ICmpInst::ICMP_UGT) &&
- EqPred == ICmpInst::ICMP_NE && IsAnd)
- return Builder.CreateICmpUGT(Base, Offset);
-
- // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset
- // (overflow or null)
- if ((UnsignedPred == ICmpInst::ICMP_ULE ||
- UnsignedPred == ICmpInst::ICMP_ULT) &&
- EqPred == ICmpInst::ICMP_EQ && !IsAnd)
- return Builder.CreateICmpULE(Base, Offset);
-
- // Base <= Offset && (Base - Offset) != 0 --> Base < Offset
- if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
- IsAnd)
- return Builder.CreateICmpULT(Base, Offset);
-
- // Base > Offset || (Base - Offset) == 0 --> Base >= Offset
- if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
- !IsAnd)
- return Builder.CreateICmpUGE(Base, Offset);
-
- return nullptr;
-}
-
-/// Reduce logic-of-compares with equality to a constant by substituting a
-/// common operand with the constant. Callers are expected to call this with
-/// Cmp0/Cmp1 switched to handle logic op commutativity.
-static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
- BinaryOperator &Logic,
- InstCombiner::BuilderTy &Builder,
- const SimplifyQuery &Q) {
- bool IsAnd = Logic.getOpcode() == Instruction::And;
- assert((IsAnd || Logic.getOpcode() == Instruction::Or) && "Wrong logic op");
-
- // Match an equality compare with a non-poison constant as Cmp0.
- // Also, give up if the compare can be constant-folded to avoid looping.
- ICmpInst::Predicate Pred0;
- Value *X;
- Constant *C;
- if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) ||
- !isGuaranteedNotToBeUndefOrPoison(C) || isa<Constant>(X))
- return nullptr;
- if ((IsAnd && Pred0 != ICmpInst::ICMP_EQ) ||
- (!IsAnd && Pred0 != ICmpInst::ICMP_NE))
- return nullptr;
-
- // The other compare must include a common operand (X). Canonicalize the
- // common operand as operand 1 (Pred1 is swapped if the common operand was
- // operand 0).
- Value *Y;
- ICmpInst::Predicate Pred1;
- if (!match(Cmp1, m_c_ICmp(Pred1, m_Value(Y), m_Deferred(X))))
- return nullptr;
-
- // Replace variable with constant value equivalence to remove a variable use:
- // (X == C) && (Y Pred1 X) --> (X == C) && (Y Pred1 C)
- // (X != C) || (Y Pred1 X) --> (X != C) || (Y Pred1 C)
- // Can think of the 'or' substitution with the 'and' bool equivalent:
- // A || B --> A || (!A && B)
- Value *SubstituteCmp = SimplifyICmpInst(Pred1, Y, C, Q);
- if (!SubstituteCmp) {
- // If we need to create a new instruction, require that the old compare can
- // be removed.
- if (!Cmp1->hasOneUse())
- return nullptr;
- SubstituteCmp = Builder.CreateICmp(Pred1, Y, C);
- }
- return Builder.CreateBinOp(Logic.getOpcode(), Cmp0, SubstituteCmp);
-}
-
-/// Fold (icmp)&(icmp) if possible.
+ return nullptr;
+
+ Value *A, *B, *C, *D;
+ if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
+ match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
+ if (A == D || B == D)
+ std::swap(C, D);
+ if (B == C)
+ std::swap(A, B);
+
+ if (A == C &&
+ isKnownToBeAPowerOfTwo(B, false, 0, &Logic) &&
+ isKnownToBeAPowerOfTwo(D, false, 0, &Logic)) {
+ Value *Mask = Builder.CreateOr(B, D);
+ Value *Masked = Builder.CreateAnd(A, Mask);
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ return Builder.CreateICmp(NewPred, Masked, Mask);
+ }
+ }
+
+ return nullptr;
+}
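+
+// Standalone sketch (the helper name and the single-bit masks K1 = 0x10,
+// K2 = 0x02 are made up; not part of the original sources): the one-bit-mask
+// folds above, brute-forced over all 16-bit inputs.
+#include <cassert>
+#include <cstdint>
+static void checkAndOrOfICmpsOfAndWithPow2Example() {
+  const uint32_t K1 = 0x10, K2 = 0x02, Both = K1 | K2;
+  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
+    // (!iszero(A & K1) & !iszero(A & K2))  <=>  (A & (K1|K2)) == (K1|K2)
+    assert((((X & K1) != 0) && ((X & K2) != 0)) == ((X & Both) == Both));
+    // (iszero(A & K1) | iszero(A & K2))    <=>  (A & (K1|K2)) != (K1|K2)
+    assert((((X & K1) == 0) || ((X & K2) == 0)) == ((X & Both) != Both));
+  }
+}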
+
+/// General pattern:
+/// X & Y
+///
+/// Where Y is checking that all the high bits (covered by a mask 4294967168)
+/// are uniform, i.e. %arg & 4294967168 can be either 4294967168 or 0
+/// Pattern can be one of:
+/// %t = add i32 %arg, 128
+/// %r = icmp ult i32 %t, 256
+/// Or
+/// %t0 = shl i32 %arg, 24
+/// %t1 = ashr i32 %t0, 24
+/// %r = icmp eq i32 %t1, %arg
+/// Or
+/// %t0 = trunc i32 %arg to i8
+/// %t1 = sext i8 %t0 to i32
+/// %r = icmp eq i32 %t1, %arg
+/// This pattern is a signed truncation check.
+///
+/// And X is checking that some bit in that same mask is zero.
+/// I.e. can be one of:
+/// %r = icmp sgt i32 %arg, -1
+/// Or
+/// %t = and i32 %arg, 2147483648
+/// %r = icmp eq i32 %t, 0
+///
+/// Since we are checking that all the bits in that mask are the same,
+/// and a particular bit is zero, what we are really checking is that all the
+/// masked bits are zero.
+/// So this should be transformed to:
+/// %r = icmp ult i32 %arg, 128
+static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
+ Instruction &CxtI,
+ InstCombiner::BuilderTy &Builder) {
+ assert(CxtI.getOpcode() == Instruction::And);
+
+ // Match icmp ult (add %arg, C01), C1 (C1 == C01 << 1; powers of two)
+ auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X,
+ APInt &SignBitMask) -> bool {
+ CmpInst::Predicate Pred;
+ const APInt *I01, *I1; // powers of two; I1 == I01 << 1
+ if (!(match(ICmp,
+ m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) &&
+ Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1))
+ return false;
+ // Which bit is the new sign bit as per the 'signed truncation' pattern?
+ SignBitMask = *I01;
+ return true;
+ };
+
+ // One icmp needs to be 'signed truncation check'.
+ // We need to match this first, else we will mismatch commutative cases.
+ Value *X1;
+ APInt HighestBit;
+ ICmpInst *OtherICmp;
+ if (tryToMatchSignedTruncationCheck(ICmp1, X1, HighestBit))
+ OtherICmp = ICmp0;
+ else if (tryToMatchSignedTruncationCheck(ICmp0, X1, HighestBit))
+ OtherICmp = ICmp1;
+ else
+ return nullptr;
+
+ assert(HighestBit.isPowerOf2() && "expected to be power of two (non-zero)");
+
+ // Try to match/decompose into: icmp eq (X & Mask), 0
+ auto tryToDecompose = [](ICmpInst *ICmp, Value *&X,
+ APInt &UnsetBitsMask) -> bool {
+ CmpInst::Predicate Pred = ICmp->getPredicate();
+ // Can it be decomposed into icmp eq (X & Mask), 0 ?
+ if (llvm::decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1),
+ Pred, X, UnsetBitsMask,
+ /*LookThroughTrunc=*/false) &&
+ Pred == ICmpInst::ICMP_EQ)
+ return true;
+ // Is it icmp eq (X & Mask), 0 already?
+ const APInt *Mask;
+ if (match(ICmp, m_ICmp(Pred, m_And(m_Value(X), m_APInt(Mask)), m_Zero())) &&
+ Pred == ICmpInst::ICMP_EQ) {
+ UnsetBitsMask = *Mask;
+ return true;
+ }
+ return false;
+ };
+
+ // And the other icmp needs to be decomposable into a bit test.
+ Value *X0;
+ APInt UnsetBitsMask;
+ if (!tryToDecompose(OtherICmp, X0, UnsetBitsMask))
+ return nullptr;
+
+ assert(!UnsetBitsMask.isNullValue() && "empty mask makes no sense.");
+
+ // Are they working on the same value?
+ Value *X;
+ if (X1 == X0) {
+ // Ok as is.
+ X = X1;
+ } else if (match(X0, m_Trunc(m_Specific(X1)))) {
+ UnsetBitsMask = UnsetBitsMask.zext(X1->getType()->getScalarSizeInBits());
+ X = X1;
+ } else
+ return nullptr;
+
+ // So which bits should be uniform as per the 'signed truncation check'?
+ // (all the bits starting with (i.e. including) HighestBit)
+ APInt SignBitsMask = ~(HighestBit - 1U);
+
+ // UnsetBitsMask must have some common bits with SignBitsMask,
+ if (!UnsetBitsMask.intersects(SignBitsMask))
+ return nullptr;
+
+ // Does UnsetBitsMask contain any bits outside of SignBitsMask?
+ if (!UnsetBitsMask.isSubsetOf(SignBitsMask)) {
+ APInt OtherHighestBit = (~UnsetBitsMask) + 1U;
+ if (!OtherHighestBit.isPowerOf2())
+ return nullptr;
+ HighestBit = APIntOps::umin(HighestBit, OtherHighestBit);
+ }
+ // Else, if it does not, then all is ok as-is.
+
+ // %r = icmp ult %X, SignBit
+ return Builder.CreateICmpULT(X, ConstantInt::get(X->getType(), HighestBit),
+ CxtI.getName() + ".simplified");
+}
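+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): the signed-truncation-check fold above, in the 16-bit analogue
+// of the i32 example from the comment:
+//   Y: %arg & 0xFF80 is uniform, matched as (add %arg, 128) u< 256
+//   X: the sign bit of %arg is clear
+//   X & Y  <=>  %arg u< 128
+// Brute-forced over all 16-bit inputs.
+#include <cassert>
+#include <cstdint>
+static void checkSignedTruncationCheckExample() {
+  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
+    uint16_t Arg = static_cast<uint16_t>(V);
+    bool Y = static_cast<uint16_t>(Arg + 128) < 256; // high bits uniform
+    bool X = (Arg & 0x8000) == 0;                    // sign bit is zero
+    assert((X && Y) == (Arg < 128));
+  }
+}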
+
+/// Reduce a pair of compares that check if a value has exactly 1 bit set.
+static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd,
+ InstCombiner::BuilderTy &Builder) {
+ // Handle 'and' / 'or' commutation: make the equality check the first operand.
+ if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE)
+ std::swap(Cmp0, Cmp1);
+ else if (!JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(Cmp0, Cmp1);
+
+ // (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1
+ CmpInst::Predicate Pred0, Pred1;
+ Value *X;
+ if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
+ match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+ m_SpecificInt(2))) &&
+ Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) {
+ Value *CtPop = Cmp1->getOperand(0);
+ return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1));
+ }
+ // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1
+ if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
+ match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+ m_SpecificInt(1))) &&
+ Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) {
+ Value *CtPop = Cmp1->getOperand(0);
+ return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1));
+ }
+ return nullptr;
+}
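+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): the is-power-of-2 folds above, with std::bitset standing in for
+// the ctpop intrinsic, brute-forced over all 16-bit inputs.
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+static void checkIsPowerOf2Examples() {
+  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
+    unsigned Pop = static_cast<unsigned>(std::bitset<16>(X).count());
+    // (X != 0) && (ctpop(X) u< 2)  -->  ctpop(X) == 1
+    assert(((X != 0) && (Pop < 2)) == (Pop == 1));
+    // (X == 0) || (ctpop(X) u> 1)  -->  ctpop(X) != 1
+    assert(((X == 0) || (Pop > 1)) == (Pop != 1));
+  }
+}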
+
+/// Commuted variants are assumed to be handled by calling this function again
+/// with the parameters swapped.
+static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
+ ICmpInst *UnsignedICmp, bool IsAnd,
+ const SimplifyQuery &Q,
+ InstCombiner::BuilderTy &Builder) {
+ Value *ZeroCmpOp;
+ ICmpInst::Predicate EqPred;
+ if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) ||
+ !ICmpInst::isEquality(EqPred))
+ return nullptr;
+
+ auto IsKnownNonZero = [&](Value *V) {
+ return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
+ };
+
+ ICmpInst::Predicate UnsignedPred;
+
+ Value *A, *B;
+ if (match(UnsignedICmp,
+ m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) &&
+ match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) &&
+ (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) {
+ auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) {
+ if (!IsKnownNonZero(NonZero))
+ std::swap(NonZero, Other);
+ return IsKnownNonZero(NonZero);
+ };
+
+ // Given ZeroCmpOp = (A + B)
+ // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A
+ // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A
+ //
+ // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff
+ // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff
+ // with X being the value (A/B) that is known to be non-zero,
+    // and Y being the remaining value.
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
+ IsAnd)
+ return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
+ if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE &&
+ IsAnd && GetKnownNonZeroAndOther(B, A))
+ return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
+ if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
+ !IsAnd)
+ return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
+ if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ &&
+ !IsAnd && GetKnownNonZeroAndOther(B, A))
+ return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
+ }
+
+ Value *Base, *Offset;
+ if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset))))
+ return nullptr;
+
+ if (!match(UnsignedICmp,
+ m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) ||
+ !ICmpInst::isUnsigned(UnsignedPred))
+ return nullptr;
+
+ // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset
+ // (no overflow and not null)
+ if ((UnsignedPred == ICmpInst::ICMP_UGE ||
+ UnsignedPred == ICmpInst::ICMP_UGT) &&
+ EqPred == ICmpInst::ICMP_NE && IsAnd)
+ return Builder.CreateICmpUGT(Base, Offset);
+
+ // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset
+ // (overflow or null)
+ if ((UnsignedPred == ICmpInst::ICMP_ULE ||
+ UnsignedPred == ICmpInst::ICMP_ULT) &&
+ EqPred == ICmpInst::ICMP_EQ && !IsAnd)
+ return Builder.CreateICmpULE(Base, Offset);
+
+ // Base <= Offset && (Base - Offset) != 0 --> Base < Offset
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
+ IsAnd)
+ return Builder.CreateICmpULT(Base, Offset);
+
+ // Base > Offset || (Base - Offset) == 0 --> Base >= Offset
+ if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
+ !IsAnd)
+ return Builder.CreateICmpUGE(Base, Offset);
+
+ return nullptr;
+}
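+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): the unsigned underflow/overflow identities folded above,
+// brute-forced over all pairs of 8-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkUnsignedUnderflowCheckExamples() {
+  for (uint32_t A = 0; A <= 0xFF; ++A) {
+    for (uint32_t B = 0; B <= 0xFF; ++B) {
+      uint32_t Sum = (A + B) & 0xFF;  // 8-bit wrapping add
+      uint32_t Diff = (A - B) & 0xFF; // 8-bit wrapping sub
+      uint32_t NegB = (0 - B) & 0xFF; // 8-bit negation
+      // (A+B) u<= A && (A+B) != 0  -->  (0-B) u< A
+      assert(((Sum <= A) && (Sum != 0)) == (NegB < A));
+      // Base u>= Offset && (Base - Offset) != 0  -->  Base u> Offset
+      assert(((A >= B) && (Diff != 0)) == (A > B));
+      // Base u< Offset || (Base - Offset) == 0   -->  Base u<= Offset
+      assert(((A < B) || (Diff == 0)) == (A <= B));
+    }
+  }
+}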
+
+/// Reduce logic-of-compares with equality to a constant by substituting a
+/// common operand with the constant. Callers are expected to call this with
+/// Cmp0/Cmp1 switched to handle logic op commutativity.
+static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ BinaryOperator &Logic,
+ InstCombiner::BuilderTy &Builder,
+ const SimplifyQuery &Q) {
+ bool IsAnd = Logic.getOpcode() == Instruction::And;
+ assert((IsAnd || Logic.getOpcode() == Instruction::Or) && "Wrong logic op");
+
+ // Match an equality compare with a non-poison constant as Cmp0.
+ // Also, give up if the compare can be constant-folded to avoid looping.
+ ICmpInst::Predicate Pred0;
+ Value *X;
+ Constant *C;
+ if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) ||
+ !isGuaranteedNotToBeUndefOrPoison(C) || isa<Constant>(X))
+ return nullptr;
+ if ((IsAnd && Pred0 != ICmpInst::ICMP_EQ) ||
+ (!IsAnd && Pred0 != ICmpInst::ICMP_NE))
+ return nullptr;
+
+ // The other compare must include a common operand (X). Canonicalize the
+ // common operand as operand 1 (Pred1 is swapped if the common operand was
+ // operand 0).
+ Value *Y;
+ ICmpInst::Predicate Pred1;
+ if (!match(Cmp1, m_c_ICmp(Pred1, m_Value(Y), m_Deferred(X))))
+ return nullptr;
+
+ // Replace variable with constant value equivalence to remove a variable use:
+ // (X == C) && (Y Pred1 X) --> (X == C) && (Y Pred1 C)
+ // (X != C) || (Y Pred1 X) --> (X != C) || (Y Pred1 C)
+ // Can think of the 'or' substitution with the 'and' bool equivalent:
+ // A || B --> A || (!A && B)
+ Value *SubstituteCmp = SimplifyICmpInst(Pred1, Y, C, Q);
+ if (!SubstituteCmp) {
+ // If we need to create a new instruction, require that the old compare can
+ // be removed.
+ if (!Cmp1->hasOneUse())
+ return nullptr;
+ SubstituteCmp = Builder.CreateICmp(Pred1, Y, C);
+ }
+ return Builder.CreateBinOp(Logic.getOpcode(), Cmp0, SubstituteCmp);
+}
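+
+// Standalone sketch (the helper name, the constant 42, and the u< predicate
+// are made up; not part of the original sources): the substitution above.
+// Once the equality side pins X to the constant, the other compare can use
+// the constant directly. Brute-forced over all pairs of 8-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkConstEqSubstitutionExample() {
+  const uint32_t C = 42;
+  for (uint32_t X = 0; X <= 0xFF; ++X) {
+    for (uint32_t Y = 0; Y <= 0xFF; ++Y) {
+      // (X == C) && (Y u< X)  <=>  (X == C) && (Y u< C)
+      assert(((X == C) && (Y < X)) == ((X == C) && (Y < C)));
+      // (X != C) || (Y u< X)  <=>  (X != C) || (Y u< C)
+      assert(((X != C) || (Y < X)) == ((X != C) || (Y < C)));
+    }
+  }
+}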
+
+/// Fold (icmp)&(icmp) if possible.
Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &And) {
- const SimplifyQuery Q = SQ.getWithInstruction(&And);
-
- // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
- // if K1 and K2 are a one-bit mask.
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, And))
- return V;
-
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-
- // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
- if (predicatesFoldable(PredL, PredR)) {
- if (LHS->getOperand(0) == RHS->getOperand(1) &&
- LHS->getOperand(1) == RHS->getOperand(0))
- LHS->swapOperands();
- if (LHS->getOperand(0) == RHS->getOperand(0) &&
- LHS->getOperand(1) == RHS->getOperand(1)) {
- Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
- unsigned Code = getICmpCode(LHS) & getICmpCode(RHS);
- bool IsSigned = LHS->isSigned() || RHS->isSigned();
- return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
- }
- }
-
- // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E)
- if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
- return V;
-
- if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, And, Builder, Q))
- return V;
- if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, And, Builder, Q))
- return V;
-
- // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
- if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false))
- return V;
-
- // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n
- if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
- return V;
-
- if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
- return V;
-
- if (Value *V = foldSignedTruncationCheck(LHS, RHS, And, Builder))
- return V;
-
- if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder))
- return V;
-
- if (Value *X =
- foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder))
- return X;
- if (Value *X =
- foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder))
- return X;
-
- // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
- Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
+ const SimplifyQuery Q = SQ.getWithInstruction(&And);
+
+ // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+ // if K1 and K2 are a one-bit mask.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, And))
+ return V;
+
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+ // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+ if (predicatesFoldable(PredL, PredR)) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) & getICmpCode(RHS);
+ bool IsSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
+ }
+ }
+
+ // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E)
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
+ return V;
+
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, And, Builder, Q))
+ return V;
+ if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, And, Builder, Q))
+ return V;
+
+ // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+ if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false))
+ return V;
+
+ // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n
+ if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
+ return V;
+
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
+ return V;
+
+ if (Value *V = foldSignedTruncationCheck(LHS, RHS, And, Builder))
+ return V;
+
+ if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder))
+ return V;
+
+ if (Value *X =
+ foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder))
+ return X;
+ if (Value *X =
+ foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder))
+ return X;
+
+ // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
ConstantInt *LHSC, *RHSC;
if (!match(LHS->getOperand(1), m_ConstantInt(LHSC)) ||
!match(RHS->getOperand(1), m_ConstantInt(RHSC)))
- return nullptr;
-
- if (LHSC == RHSC && PredL == PredR) {
- // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
- // where C is a power of 2 or
- // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
- if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
- (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
- Value *NewOr = Builder.CreateOr(LHS0, RHS0);
- return Builder.CreateICmp(PredL, NewOr, LHSC);
- }
- }
-
- // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
- // where CMAX is the all ones value for the truncated type,
- // iff the lower bits of C2 and CA are zero.
- if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
- RHS->hasOneUse()) {
- Value *V;
- ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
-
- // (trunc x) == C1 & (and x, CA) == C2
- // (and x, CA) == C2 & (trunc x) == C1
- if (match(RHS0, m_Trunc(m_Value(V))) &&
- match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
- SmallC = RHSC;
- BigC = LHSC;
- } else if (match(LHS0, m_Trunc(m_Value(V))) &&
- match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
- SmallC = LHSC;
- BigC = RHSC;
- }
-
- if (SmallC && BigC) {
- unsigned BigBitSize = BigC->getType()->getBitWidth();
- unsigned SmallBitSize = SmallC->getType()->getBitWidth();
-
- // Check that the low bits are zero.
- APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
- if ((Low & AndC->getValue()).isNullValue() &&
- (Low & BigC->getValue()).isNullValue()) {
- Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue());
- APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
- Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
- return Builder.CreateICmp(PredL, NewAnd, NewVal);
- }
- }
- }
-
- // From here on, we only handle:
- // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
- if (LHS0 != RHS0)
- return nullptr;
-
- // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
- if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
- PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
- PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
- PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
- return nullptr;
-
- // We can't fold (ugt x, C) & (sgt x, C2).
- if (!predicatesFoldable(PredL, PredR))
- return nullptr;
-
- // Ensure that the larger constant is on the RHS.
- bool ShouldSwap;
- if (CmpInst::isSigned(PredL) ||
- (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
- ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
- else
- ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
-
- if (ShouldSwap) {
- std::swap(LHS, RHS);
- std::swap(LHSC, RHSC);
- std::swap(PredL, PredR);
- }
-
- // At this point, we know we have two icmp instructions
- // comparing a value against two constants and and'ing the result
- // together. Because of the above check, we know that we only have
-  // icmp eq, icmp ne, icmp [su]lt, and icmp [su]gt here. We also know
-  // (from the icmp folding check above) that the two constants
-  // are not equal and that the larger constant is on the RHS.
- assert(LHSC != RHSC && "Compares not folded above?");
-
- switch (PredL) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_ULT:
- // (X != 13 & X u< 14) -> X < 13
- if (LHSC->getValue() == (RHSC->getValue() - 1))
- return Builder.CreateICmpULT(LHS0, LHSC);
- if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- break; // (X != 13 & X u< 15) -> no change
- case ICmpInst::ICMP_SLT:
- // (X != 13 & X s< 14) -> X < 13
- if (LHSC->getValue() == (RHSC->getValue() - 1))
- return Builder.CreateICmpSLT(LHS0, LHSC);
- // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1))
- if (LHSC->isMinValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- true, true);
- break; // (X != 13 & X s< 15) -> no change
- case ICmpInst::ICMP_NE:
- // Potential folds for this case should already be handled.
- break;
- }
- break;
- case ICmpInst::ICMP_UGT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- // (X u> 13 & X != 14) -> X u> 14
- if (RHSC->getValue() == (LHSC->getValue() + 1))
- return Builder.CreateICmp(PredL, LHS0, RHSC);
- // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1)
- if (RHSC->isMaxValue(false))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- break; // (X u> 13 & X != 15) -> no change
- case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- }
- break;
- case ICmpInst::ICMP_SGT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- // (X s> 13 & X != 14) -> X s> 14
- if (RHSC->getValue() == (LHSC->getValue() + 1))
- return Builder.CreateICmp(PredL, LHS0, RHSC);
- // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1)
- if (RHSC->isMaxValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- true, true);
- break; // (X s> 13 & X != 15) -> no change
- case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
- true);
- }
- break;
- }
-
- return nullptr;
-}
-
+ return nullptr;
+
+ if (LHSC == RHSC && PredL == PredR) {
+ // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
+ // where C is a power of 2 or
+ // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
+ if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
+ (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
+ Value *NewOr = Builder.CreateOr(LHS0, RHS0);
+ return Builder.CreateICmp(PredL, NewOr, LHSC);
+ }
+ }
+
+ // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
+ // where CMAX is the all ones value for the truncated type,
+ // iff the lower bits of C2 and CA are zero.
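+  // For example (made-up constants), with x:i32 truncated to i8 (CMAX=0xFF),
+  // C1 = 5, CA = 0xFFFF0000, C2 = 0x10000 (low 8 bits of CA and C2 are 0):
+  //   (trunc x) == 5 & (x & 0xFFFF0000) == 0x10000
+  //     -> (x & 0xFFFF00FF) == 0x10005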
+ if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
+ RHS->hasOneUse()) {
+ Value *V;
+ ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
+
+ // (trunc x) == C1 & (and x, CA) == C2
+ // (and x, CA) == C2 & (trunc x) == C1
+ if (match(RHS0, m_Trunc(m_Value(V))) &&
+ match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = RHSC;
+ BigC = LHSC;
+ } else if (match(LHS0, m_Trunc(m_Value(V))) &&
+ match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = LHSC;
+ BigC = RHSC;
+ }
+
+ if (SmallC && BigC) {
+ unsigned BigBitSize = BigC->getType()->getBitWidth();
+ unsigned SmallBitSize = SmallC->getType()->getBitWidth();
+
+ // Check that the low bits are zero.
+ APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
+ if ((Low & AndC->getValue()).isNullValue() &&
+ (Low & BigC->getValue()).isNullValue()) {
+ Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue());
+ APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
+ Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
+ return Builder.CreateICmp(PredL, NewAnd, NewVal);
+ }
+ }
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
+ if (LHS0 != RHS0)
+ return nullptr;
+
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
+ return nullptr;
+
+ // We can't fold (ugt x, C) & (sgt x, C2).
+ if (!predicatesFoldable(PredL, PredR))
+ return nullptr;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
+ else
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and and'ing the result
+ // together. Because of the above check, we know that we only have
+  // icmp eq, icmp ne, icmp [su]lt, and icmp [su]gt here. We also know
+  // (from the icmp folding check above) that the two constants
+  // are not equal and that the larger constant is on the RHS.
+ assert(LHSC != RHSC && "Compares not folded above?");
+
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_ULT:
+ // (X != 13 & X u< 14) -> X < 13
+ if (LHSC->getValue() == (RHSC->getValue() - 1))
+ return Builder.CreateICmpULT(LHS0, LHSC);
+ if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ break; // (X != 13 & X u< 15) -> no change
+ case ICmpInst::ICMP_SLT:
+ // (X != 13 & X s< 14) -> X < 13
+ if (LHSC->getValue() == (RHSC->getValue() - 1))
+ return Builder.CreateICmpSLT(LHS0, LHSC);
+ // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1))
+ if (LHSC->isMinValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ true, true);
+ break; // (X != 13 & X s< 15) -> no change
+ case ICmpInst::ICMP_NE:
+ // Potential folds for this case should already be handled.
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ // (X u> 13 & X != 14) -> X u> 14
+ if (RHSC->getValue() == (LHSC->getValue() + 1))
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1)
+ if (RHSC->isMaxValue(false))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ break; // (X u> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ // (X s> 13 & X != 14) -> X s> 14
+ if (RHSC->getValue() == (LHSC->getValue() + 1))
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1)
+ if (RHSC->isMaxValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ true, true);
+ break; // (X s> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
+ true);
+ }
+ break;
+ }
+
+ return nullptr;
+}
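+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): two of the constant-range folds performed by foldAndOfICmps
+// above, brute-forced over all 16-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkAndOfICmpsRangeExamples() {
+  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
+    uint16_t X = static_cast<uint16_t>(V);
+    // (X != 13 & X u< 14)  ->  X u< 13
+    assert(((X != 13) && (X < 14)) == (X < 13));
+    // (X u> 13 & X u< 15)  ->  (X - 14) u< 1
+    assert(((X > 13) && (X < 15)) == (static_cast<uint16_t>(X - 14) < 1));
+  }
+}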
+
Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
bool IsAnd) {
- Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
- Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
- FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-
- if (LHS0 == RHS1 && RHS0 == LHS1) {
- // Swap RHS operands to match LHS.
- PredR = FCmpInst::getSwappedPredicate(PredR);
- std::swap(RHS0, RHS1);
- }
-
- // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
- // Suppose the relation between x and y is R, where R is one of
- // U(1000), L(0100), G(0010) or E(0001), and CC0 and CC1 are the bitmasks for
- // testing the desired relations.
- //
- // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
- // bool(R & CC0) && bool(R & CC1)
- // = bool((R & CC0) & (R & CC1))
- // = bool(R & (CC0 & CC1)) <= by re-association, commutation, and idempotency
- //
- // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
- // bool(R & CC0) || bool(R & CC1)
- // = bool((R & CC0) | (R & CC1))
- // = bool(R & (CC0 | CC1)) <= by reversed distribution (contribution? ;)
- if (LHS0 == RHS0 && LHS1 == RHS1) {
- unsigned FCmpCodeL = getFCmpCode(PredL);
- unsigned FCmpCodeR = getFCmpCode(PredR);
- unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR;
- return getFCmpValue(NewPred, LHS0, LHS1, Builder);
- }
-
- if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
- (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
- if (LHS0->getType() != RHS0->getType())
- return nullptr;
-
- // FCmp canonicalization ensures that (fcmp ord/uno X, X) and
- // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0).
- if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP()))
- // Ignore the constants because they are obviously not NANs:
- // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y)
- // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y)
- return Builder.CreateFCmp(PredL, LHS0, RHS0);
- }
-
- return nullptr;
-}
-
-/// This is a limited reassociation for a special case (see above) where we are
-/// checking if two values are either both NAN (unordered) or not-NAN (ordered).
-/// This could be handled more generally in '-reassociation', but it seems like
-/// an unlikely pattern for a large number of logic ops and fcmps.
-static Instruction *reassociateFCmps(BinaryOperator &BO,
- InstCombiner::BuilderTy &Builder) {
- Instruction::BinaryOps Opcode = BO.getOpcode();
- assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
- "Expecting and/or op for fcmp transform");
-
- // There are 4 commuted variants of the pattern. Canonicalize operands of this
- // logic op so an fcmp is operand 0 and a matching logic op is operand 1.
- Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X;
- FCmpInst::Predicate Pred;
- if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP())))
- std::swap(Op0, Op1);
-
- // Match inner binop and the predicate for combining 2 NAN checks into 1.
- BinaryOperator *BO1;
- FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
- : FCmpInst::FCMP_UNO;
- if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
- !match(Op1, m_BinOp(BO1)) || BO1->getOpcode() != Opcode)
- return nullptr;
-
- // The inner logic op must have a matching fcmp operand.
- Value *BO10 = BO1->getOperand(0), *BO11 = BO1->getOperand(1), *Y;
- if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
- Pred != NanPred || X->getType() != Y->getType())
- std::swap(BO10, BO11);
-
- if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
- Pred != NanPred || X->getType() != Y->getType())
- return nullptr;
-
- // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z
- // or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z
- Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y);
- if (auto *NewFCmpInst = dyn_cast<FCmpInst>(NewFCmp)) {
- // Intersect FMF from the 2 source fcmps.
- NewFCmpInst->copyIRFlags(Op0);
- NewFCmpInst->andIRFlags(BO10);
- }
- return BinaryOperator::Create(Opcode, NewFCmp, BO11);
-}
-
-/// Match De Morgan's Laws:
-/// (~A & ~B) == (~(A | B))
-/// (~A | ~B) == (~(A & B))
-static Instruction *matchDeMorgansLaws(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- auto Opcode = I.getOpcode();
- assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
- "Trying to match De Morgan's Laws with something other than and/or");
-
- // Flip the logic operation.
- Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
-
- Value *A, *B;
- if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
- match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
+ Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
+ FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+ if (LHS0 == RHS1 && RHS0 == LHS1) {
+ // Swap RHS operands to match LHS.
+ PredR = FCmpInst::getSwappedPredicate(PredR);
+ std::swap(RHS0, RHS1);
+ }
+
+ // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+ // Suppose the relation between x and y is R, where R is one of
+ // U(1000), L(0100), G(0010) or E(0001), and CC0 and CC1 are the bitmasks for
+ // testing the desired relations.
+ //
+ // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+ // bool(R & CC0) && bool(R & CC1)
+ // = bool((R & CC0) & (R & CC1))
+ // = bool(R & (CC0 & CC1)) <= by re-association, commutation, and idempotency
+ //
+ // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+ // bool(R & CC0) || bool(R & CC1)
+ // = bool((R & CC0) | (R & CC1))
+ // = bool(R & (CC0 | CC1)) <= by reversed distribution (contribution? ;)
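+  // For example: olt is L (0100) and ogt is G (0010), so
+  //   (fcmp olt x, y) & (fcmp ogt x, y) --> mask 0000, i.e. always false, and
+  //   (fcmp olt x, y) | (fcmp ogt x, y) --> mask 0110, i.e. fcmp one x, y.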
+ if (LHS0 == RHS0 && LHS1 == RHS1) {
+ unsigned FCmpCodeL = getFCmpCode(PredL);
+ unsigned FCmpCodeR = getFCmpCode(PredR);
+ unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR;
+ return getFCmpValue(NewPred, LHS0, LHS1, Builder);
+ }
+
+ if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
+ (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
+ if (LHS0->getType() != RHS0->getType())
+ return nullptr;
+
+ // FCmp canonicalization ensures that (fcmp ord/uno X, X) and
+ // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0).
+ if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP()))
+ // Ignore the constants because they are obviously not NANs:
+ // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y)
+ // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y)
+ return Builder.CreateFCmp(PredL, LHS0, RHS0);
+ }
+
+ return nullptr;
+}
+
+/// This is a limited reassociation for a special case (see above) where we are
+/// checking if two values are either both NAN (unordered) or not-NAN (ordered).
+/// This could be handled more generally in '-reassociation', but it seems like
+/// an unlikely pattern for a large number of logic ops and fcmps.
+static Instruction *reassociateFCmps(BinaryOperator &BO,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::BinaryOps Opcode = BO.getOpcode();
+ assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
+ "Expecting and/or op for fcmp transform");
+
+ // There are 4 commuted variants of the pattern. Canonicalize operands of this
+ // logic op so an fcmp is operand 0 and a matching logic op is operand 1.
+ Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X;
+ FCmpInst::Predicate Pred;
+ if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP())))
+ std::swap(Op0, Op1);
+
+ // Match inner binop and the predicate for combining 2 NAN checks into 1.
+ BinaryOperator *BO1;
+ FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
+ : FCmpInst::FCMP_UNO;
+ if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
+ !match(Op1, m_BinOp(BO1)) || BO1->getOpcode() != Opcode)
+ return nullptr;
+
+ // The inner logic op must have a matching fcmp operand.
+ Value *BO10 = BO1->getOperand(0), *BO11 = BO1->getOperand(1), *Y;
+ if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
+ Pred != NanPred || X->getType() != Y->getType())
+ std::swap(BO10, BO11);
+
+ if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
+ Pred != NanPred || X->getType() != Y->getType())
+ return nullptr;
+
+ // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z
+ // or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z
+ Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y);
+ if (auto *NewFCmpInst = dyn_cast<FCmpInst>(NewFCmp)) {
+ // Intersect FMF from the 2 source fcmps.
+ NewFCmpInst->copyIRFlags(Op0);
+ NewFCmpInst->andIRFlags(BO10);
+ }
+ return BinaryOperator::Create(Opcode, NewFCmp, BO11);
+}
+
+/// Match De Morgan's Laws:
+/// (~A & ~B) == (~(A | B))
+/// (~A | ~B) == (~(A & B))
+static Instruction *matchDeMorgansLaws(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ auto Opcode = I.getOpcode();
+ assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
+ "Trying to match De Morgan's Laws with something other than and/or");
+
+ // Flip the logic operation.
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
+
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
!InstCombiner::isFreeToInvert(A, A->hasOneUse()) &&
!InstCombiner::isFreeToInvert(B, B->hasOneUse())) {
- Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
- return BinaryOperator::CreateNot(AndOr);
- }
-
- return nullptr;
-}
-
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
+
+ return nullptr;
+}
+
bool InstCombinerImpl::shouldOptimizeCast(CastInst *CI) {
- Value *CastSrc = CI->getOperand(0);
-
- // Noop casts and casts of constants should be eliminated trivially.
- if (CI->getSrcTy() == CI->getDestTy() || isa<Constant>(CastSrc))
- return false;
-
- // If this cast is paired with another cast that can be eliminated, we prefer
- // to have it eliminated.
- if (const auto *PrecedingCI = dyn_cast<CastInst>(CastSrc))
- if (isEliminableCastPair(PrecedingCI, CI))
- return false;
-
- return true;
-}
-
-/// Fold {and,or,xor} (cast X), C.
-static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
- InstCombiner::BuilderTy &Builder) {
- Constant *C = dyn_cast<Constant>(Logic.getOperand(1));
- if (!C)
- return nullptr;
-
- auto LogicOpc = Logic.getOpcode();
- Type *DestTy = Logic.getType();
- Type *SrcTy = Cast->getSrcTy();
-
- // Move the logic operation ahead of a zext or sext if the constant is
- // unchanged in the smaller source type. Performing the logic in a smaller
- // type may provide more information to later folds, and the smaller logic
- // instruction may be cheaper (particularly in the case of vectors).
- Value *X;
- if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
- Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
- Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy);
- if (ZextTruncC == C) {
- // LogicOpc (zext X), C --> zext (LogicOpc X, C)
- Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
- return new ZExtInst(NewOp, DestTy);
- }
- }
-
- if (match(Cast, m_OneUse(m_SExt(m_Value(X))))) {
- Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
- Constant *SextTruncC = ConstantExpr::getSExt(TruncC, DestTy);
- if (SextTruncC == C) {
- // LogicOpc (sext X), C --> sext (LogicOpc X, C)
- Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
- return new SExtInst(NewOp, DestTy);
- }
- }
-
- return nullptr;
-}
-
-/// Fold {and,or,xor} (cast X), Y.
+ Value *CastSrc = CI->getOperand(0);
+
+ // Noop casts and casts of constants should be eliminated trivially.
+ if (CI->getSrcTy() == CI->getDestTy() || isa<Constant>(CastSrc))
+ return false;
+
+ // If this cast is paired with another cast that can be eliminated, we prefer
+ // to have it eliminated.
+ if (const auto *PrecedingCI = dyn_cast<CastInst>(CastSrc))
+ if (isEliminableCastPair(PrecedingCI, CI))
+ return false;
+
+ return true;
+}
+
+/// Fold {and,or,xor} (cast X), C.
+static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *C = dyn_cast<Constant>(Logic.getOperand(1));
+ if (!C)
+ return nullptr;
+
+ auto LogicOpc = Logic.getOpcode();
+ Type *DestTy = Logic.getType();
+ Type *SrcTy = Cast->getSrcTy();
+
+ // Move the logic operation ahead of a zext or sext if the constant is
+ // unchanged in the smaller source type. Performing the logic in a smaller
+ // type may provide more information to later folds, and the smaller logic
+ // instruction may be cheaper (particularly in the case of vectors).
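+  // For example: and (zext i8 %x to i32), 15 --> zext (and i8 %x, 15) to i32,
+  // since the constant 15 is unchanged by the trunc/zext round-trip through i8.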
+ Value *X;
+ if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
+ Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy);
+ if (ZextTruncC == C) {
+ // LogicOpc (zext X), C --> zext (LogicOpc X, C)
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
+ return new ZExtInst(NewOp, DestTy);
+ }
+ }
+
+ if (match(Cast, m_OneUse(m_SExt(m_Value(X))))) {
+ Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *SextTruncC = ConstantExpr::getSExt(TruncC, DestTy);
+ if (SextTruncC == C) {
+ // LogicOpc (sext X), C --> sext (LogicOpc X, C)
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
+ return new SExtInst(NewOp, DestTy);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold {and,or,xor} (cast X), Y.
Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) {
- auto LogicOpc = I.getOpcode();
- assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding");
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- CastInst *Cast0 = dyn_cast<CastInst>(Op0);
- if (!Cast0)
- return nullptr;
-
- // This must be a cast from an integer or integer vector source type to allow
- // transformation of the logic operation to the source type.
- Type *DestTy = I.getType();
- Type *SrcTy = Cast0->getSrcTy();
- if (!SrcTy->isIntOrIntVectorTy())
- return nullptr;
-
- if (Instruction *Ret = foldLogicCastConstant(I, Cast0, Builder))
- return Ret;
-
- CastInst *Cast1 = dyn_cast<CastInst>(Op1);
- if (!Cast1)
- return nullptr;
-
- // Both operands of the logic operation are casts. The casts must be of the
- // same type for reduction.
- auto CastOpcode = Cast0->getOpcode();
- if (CastOpcode != Cast1->getOpcode() || SrcTy != Cast1->getSrcTy())
- return nullptr;
-
- Value *Cast0Src = Cast0->getOperand(0);
- Value *Cast1Src = Cast1->getOperand(0);
-
- // fold logic(cast(A), cast(B)) -> cast(logic(A, B))
- if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) {
- Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
- I.getName());
- return CastInst::Create(CastOpcode, NewOp, DestTy);
- }
-
- // For now, only 'and'/'or' have optimizations after this.
- if (LogicOpc == Instruction::Xor)
- return nullptr;
-
- // If this is logic(cast(icmp), cast(icmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
- ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
- if (ICmp0 && ICmp1) {
- Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
- : foldOrOfICmps(ICmp0, ICmp1, I);
- if (Res)
- return CastInst::Create(CastOpcode, Res, DestTy);
- return nullptr;
- }
-
- // If this is logic(cast(fcmp), cast(fcmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
- FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
- if (FCmp0 && FCmp1)
- if (Value *R = foldLogicOfFCmps(FCmp0, FCmp1, LogicOpc == Instruction::And))
- return CastInst::Create(CastOpcode, R, DestTy);
-
- return nullptr;
-}
-
-static Instruction *foldAndToXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.getOpcode() == Instruction::And);
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *A, *B;
-
- // Operand complexity canonicalization guarantees that the 'or' is Op0.
- // (A | B) & ~(A & B) --> A ^ B
- // (A | B) & ~(B & A) --> A ^ B
- if (match(&I, m_BinOp(m_Or(m_Value(A), m_Value(B)),
- m_Not(m_c_And(m_Deferred(A), m_Deferred(B))))))
- return BinaryOperator::CreateXor(A, B);
-
- // (A | ~B) & (~A | B) --> ~(A ^ B)
- // (A | ~B) & (B | ~A) --> ~(A ^ B)
- // (~B | A) & (~A | B) --> ~(A ^ B)
- // (~B | A) & (B | ~A) --> ~(A ^ B)
- if (Op0->hasOneUse() || Op1->hasOneUse())
- if (match(&I, m_BinOp(m_c_Or(m_Value(A), m_Not(m_Value(B))),
- m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
- return nullptr;
-}
-
-static Instruction *foldOrToXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.getOpcode() == Instruction::Or);
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *A, *B;
-
- // Operand complexity canonicalization guarantees that the 'and' is Op0.
- // (A & B) | ~(A | B) --> ~(A ^ B)
- // (A & B) | ~(B | A) --> ~(A ^ B)
- if (Op0->hasOneUse() || Op1->hasOneUse())
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
+ auto LogicOpc = I.getOpcode();
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding");
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ CastInst *Cast0 = dyn_cast<CastInst>(Op0);
+ if (!Cast0)
+ return nullptr;
+
+ // This must be a cast from an integer or integer vector source type to allow
+ // transformation of the logic operation to the source type.
+ Type *DestTy = I.getType();
+ Type *SrcTy = Cast0->getSrcTy();
+ if (!SrcTy->isIntOrIntVectorTy())
+ return nullptr;
+
+ if (Instruction *Ret = foldLogicCastConstant(I, Cast0, Builder))
+ return Ret;
+
+ CastInst *Cast1 = dyn_cast<CastInst>(Op1);
+ if (!Cast1)
+ return nullptr;
+
+ // Both operands of the logic operation are casts. The casts must be of the
+ // same type for reduction.
+ auto CastOpcode = Cast0->getOpcode();
+ if (CastOpcode != Cast1->getOpcode() || SrcTy != Cast1->getSrcTy())
+ return nullptr;
+
+ Value *Cast0Src = Cast0->getOperand(0);
+ Value *Cast1Src = Cast1->getOperand(0);
+
+ // fold logic(cast(A), cast(B)) -> cast(logic(A, B))
+ if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) {
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
+ I.getName());
+ return CastInst::Create(CastOpcode, NewOp, DestTy);
+ }
+
+ // For now, only 'and'/'or' have optimizations after this.
+ if (LogicOpc == Instruction::Xor)
+ return nullptr;
+
+ // If this is logic(cast(icmp), cast(icmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
+ ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
+ if (ICmp0 && ICmp1) {
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
+ : foldOrOfICmps(ICmp0, ICmp1, I);
+ if (Res)
+ return CastInst::Create(CastOpcode, Res, DestTy);
+ return nullptr;
+ }
+
+ // If this is logic(cast(fcmp), cast(fcmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
+ FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
+ if (FCmp0 && FCmp1)
+ if (Value *R = foldLogicOfFCmps(FCmp0, FCmp1, LogicOpc == Instruction::And))
+ return CastInst::Create(CastOpcode, R, DestTy);
+
+ return nullptr;
+}
+
+static Instruction *foldAndToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::And);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'or' is Op0.
+ // (A | B) & ~(A & B) --> A ^ B
+ // (A | B) & ~(B & A) --> A ^ B
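+  // (Bitwise check: when A == B both sides are 0; when A != B both sides are 1.)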
+ if (match(&I, m_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_Not(m_c_And(m_Deferred(A), m_Deferred(B))))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A | ~B) & (~A | B) --> ~(A ^ B)
+ // (A | ~B) & (B | ~A) --> ~(A ^ B)
+ // (~B | A) & (~A | B) --> ~(A ^ B)
+ // (~B | A) & (B | ~A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(&I, m_BinOp(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
+
+static Instruction *foldOrToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Or);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'and' is Op0.
+ // (A & B) | ~(A | B) --> ~(A ^ B)
+ // (A & B) | ~(B | A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
// Operand complexity canonicalization guarantees that the 'xor' is Op0.
// (A ^ B) | ~(A | B) --> ~(A & B)
// (A ^ B) | ~(B | A) --> ~(A & B)
@@ -1635,98 +1635,98 @@ static Instruction *foldOrToXor(BinaryOperator &I,
match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
- // (A & ~B) | (~A & B) --> A ^ B
- // (A & ~B) | (B & ~A) --> A ^ B
- // (~B & A) | (~A & B) --> A ^ B
- // (~B & A) | (B & ~A) --> A ^ B
- if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
-
- return nullptr;
-}
-
-/// Return true if a constant shift amount is always less than the specified
-/// bit-width. If not, the shift could create poison in the narrower type.
-static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
+ // (A & ~B) | (~A & B) --> A ^ B
+ // (A & ~B) | (B & ~A) --> A ^ B
+ // (~B & A) | (~A & B) --> A ^ B
+ // (~B & A) | (B & ~A) --> A ^ B
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))
+ return BinaryOperator::CreateXor(A, B);
+
+ return nullptr;
+}
+
+/// Return true if a constant shift amount is always less than the specified
+/// bit-width. If not, the shift could create poison in the narrower type.
+static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
APInt Threshold(C->getType()->getScalarSizeInBits(), BitWidth);
return match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold));
-}
-
-/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and
-/// a common zext operand: and (binop (zext X), C), (zext X).
+}
+
+/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and
+/// a common zext operand: and (binop (zext X), C), (zext X).
Instruction *InstCombinerImpl::narrowMaskedBinOp(BinaryOperator &And) {
- // This transform could also apply to {or, and, xor}, but there are better
- // folds for those cases, so we don't expect those patterns here. AShr is not
- // handled because it should always be transformed to LShr in this sequence.
- // The subtract transform is different because it has a constant on the left.
- // Add/mul commute the constant to RHS; sub with constant RHS becomes add.
- Value *Op0 = And.getOperand(0), *Op1 = And.getOperand(1);
- Constant *C;
- if (!match(Op0, m_OneUse(m_Add(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_Mul(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_LShr(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_Shl(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_Sub(m_Constant(C), m_Specific(Op1)))))
- return nullptr;
-
- Value *X;
- if (!match(Op1, m_ZExt(m_Value(X))) || Op1->hasNUsesOrMore(3))
- return nullptr;
-
- Type *Ty = And.getType();
- if (!isa<VectorType>(Ty) && !shouldChangeType(Ty, X->getType()))
- return nullptr;
-
- // If we're narrowing a shift, the shift amount must be safe (less than the
- // width) in the narrower type. If the shift amount is greater, instsimplify
- // usually handles that case, but we can't guarantee/assert it.
- Instruction::BinaryOps Opc = cast<BinaryOperator>(Op0)->getOpcode();
- if (Opc == Instruction::LShr || Opc == Instruction::Shl)
- if (!canNarrowShiftAmt(C, X->getType()->getScalarSizeInBits()))
- return nullptr;
-
- // and (sub C, (zext X)), (zext X) --> zext (and (sub C', X), X)
- // and (binop (zext X), C), (zext X) --> zext (and (binop X, C'), X)
- Value *NewC = ConstantExpr::getTrunc(C, X->getType());
- Value *NewBO = Opc == Instruction::Sub ? Builder.CreateBinOp(Opc, NewC, X)
- : Builder.CreateBinOp(Opc, X, NewC);
- return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty);
-}
-
-// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
-// here. We should standardize that construct where it is needed or choose some
-// other way to ensure that commutated variants of patterns are not missed.
+ // This transform could also apply to {or, and, xor}, but there are better
+ // folds for those cases, so we don't expect those patterns here. AShr is not
+ // handled because it should always be transformed to LShr in this sequence.
+ // The subtract transform is different because it has a constant on the left.
+ // Add/mul commute the constant to RHS; sub with constant RHS becomes add.
+ Value *Op0 = And.getOperand(0), *Op1 = And.getOperand(1);
+ Constant *C;
+ if (!match(Op0, m_OneUse(m_Add(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Mul(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_LShr(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Shl(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Sub(m_Constant(C), m_Specific(Op1)))))
+ return nullptr;
+
+ Value *X;
+ if (!match(Op1, m_ZExt(m_Value(X))) || Op1->hasNUsesOrMore(3))
+ return nullptr;
+
+ Type *Ty = And.getType();
+ if (!isa<VectorType>(Ty) && !shouldChangeType(Ty, X->getType()))
+ return nullptr;
+
+ // If we're narrowing a shift, the shift amount must be safe (less than the
+ // width) in the narrower type. If the shift amount is greater, instsimplify
+ // usually handles that case, but we can't guarantee/assert it.
+ Instruction::BinaryOps Opc = cast<BinaryOperator>(Op0)->getOpcode();
+ if (Opc == Instruction::LShr || Opc == Instruction::Shl)
+ if (!canNarrowShiftAmt(C, X->getType()->getScalarSizeInBits()))
+ return nullptr;
+
+ // and (sub C, (zext X)), (zext X) --> zext (and (sub C', X), X)
+ // and (binop (zext X), C), (zext X) --> zext (and (binop X, C'), X)
+ Value *NewC = ConstantExpr::getTrunc(C, X->getType());
+ Value *NewBO = Opc == Instruction::Sub ? Builder.CreateBinOp(Opc, NewC, X)
+ : Builder.CreateBinOp(Opc, X, NewC);
+ return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty);
+}
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
Type *Ty = I.getType();
- if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // Do this before using distributive laws to catch simple and/or/not patterns.
- if (Instruction *Xor = foldAndToXor(I, Builder))
- return Xor;
-
- // (A|B)&(A|C) -> A|(B&C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyBSwap(I, Builder))
- return replaceInstUsesWith(I, V);
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldAndToXor(I, Builder))
+ return Xor;
+
+ // (A|B)&(A|C) -> A|(B&C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Value *X, *Y;
if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X)))) &&
@@ -1737,61 +1737,61 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
return new ZExtInst(IsZero, Ty);
}
- const APInt *C;
- if (match(Op1, m_APInt(C))) {
- const APInt *XorC;
- if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_APInt(XorC))))) {
- // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ const APInt *XorC;
+ if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_APInt(XorC))))) {
+ // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
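+      // e.g. with C1 = 0b1100 and C2 = 0b1010:
+      //   (X ^ 0b1100) & 0b1010 == (X & 0b1010) ^ 0b1000,
+      // because '&' distributes over '^'.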
Constant *NewC = ConstantInt::get(Ty, *C & *XorC);
- Value *And = Builder.CreateAnd(X, Op1);
- And->takeName(Op0);
- return BinaryOperator::CreateXor(And, NewC);
- }
-
- const APInt *OrC;
- if (match(Op0, m_OneUse(m_Or(m_Value(X), m_APInt(OrC))))) {
- // (X | C1) & C2 --> (X & C2^(C1&C2)) | (C1&C2)
- // NOTE: This reduces the number of bits set in the & mask, which
- // can expose opportunities for store narrowing for scalars.
- // NOTE: SimplifyDemandedBits should have already removed bits from C1
- // that aren't set in C2. Meaning we can replace (C1&C2) with C1 in
- // above, but this feels safer.
- APInt Together = *C & *OrC;
+ Value *And = Builder.CreateAnd(X, Op1);
+ And->takeName(Op0);
+ return BinaryOperator::CreateXor(And, NewC);
+ }
+
+ const APInt *OrC;
+ if (match(Op0, m_OneUse(m_Or(m_Value(X), m_APInt(OrC))))) {
+ // (X | C1) & C2 --> (X & C2^(C1&C2)) | (C1&C2)
+ // NOTE: This reduces the number of bits set in the & mask, which
+ // can expose opportunities for store narrowing for scalars.
+ // NOTE: SimplifyDemandedBits should have already removed bits from C1
+ // that aren't set in C2. Meaning we can replace (C1&C2) with C1 in
+ // above, but this feels safer.
+ APInt Together = *C & *OrC;
Value *And = Builder.CreateAnd(X, ConstantInt::get(Ty, Together ^ *C));
- And->takeName(Op0);
+ And->takeName(Op0);
return BinaryOperator::CreateOr(And, ConstantInt::get(Ty, Together));
- }
-
- // If the mask is only needed on one incoming arm, push the 'and' op up.
- if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) ||
- match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
- APInt NotAndMask(~(*C));
- BinaryOperator::BinaryOps BinOp = cast<BinaryOperator>(Op0)->getOpcode();
- if (MaskedValueIsZero(X, NotAndMask, 0, &I)) {
- // Not masking anything out for the LHS, move mask to RHS.
- // and ({x}or X, Y), C --> {x}or X, (and Y, C)
- Value *NewRHS = Builder.CreateAnd(Y, Op1, Y->getName() + ".masked");
- return BinaryOperator::Create(BinOp, X, NewRHS);
- }
- if (!isa<Constant>(Y) && MaskedValueIsZero(Y, NotAndMask, 0, &I)) {
- // Not masking anything out for the RHS, move mask to LHS.
- // and ({x}or X, Y), C --> {x}or (and X, C), Y
- Value *NewLHS = Builder.CreateAnd(X, Op1, X->getName() + ".masked");
- return BinaryOperator::Create(BinOp, NewLHS, Y);
- }
- }
+ }
+
+ // If the mask is only needed on one incoming arm, push the 'and' op up.
+ if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) ||
+ match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ APInt NotAndMask(~(*C));
+ BinaryOperator::BinaryOps BinOp = cast<BinaryOperator>(Op0)->getOpcode();
+ if (MaskedValueIsZero(X, NotAndMask, 0, &I)) {
+ // Not masking anything out for the LHS, move mask to RHS.
+ // and ({x}or X, Y), C --> {x}or X, (and Y, C)
+ Value *NewRHS = Builder.CreateAnd(Y, Op1, Y->getName() + ".masked");
+ return BinaryOperator::Create(BinOp, X, NewRHS);
+ }
+ if (!isa<Constant>(Y) && MaskedValueIsZero(Y, NotAndMask, 0, &I)) {
+ // Not masking anything out for the RHS, move mask to LHS.
+ // and ({x}or X, Y), C --> {x}or (and X, C), Y
+ Value *NewLHS = Builder.CreateAnd(X, Op1, X->getName() + ".masked");
+ return BinaryOperator::Create(BinOp, NewLHS, Y);
+ }
+ }
unsigned Width = Ty->getScalarSizeInBits();
- const APInt *ShiftC;
- if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) {
- if (*C == APInt::getLowBitsSet(Width, Width - ShiftC->getZExtValue())) {
- // We are clearing high bits that were potentially set by sext+ashr:
- // and (sext (ashr X, ShiftC)), C --> lshr (sext X), ShiftC
+ const APInt *ShiftC;
+ if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) {
+ if (*C == APInt::getLowBitsSet(Width, Width - ShiftC->getZExtValue())) {
+ // We are clearing high bits that were potentially set by sext+ashr:
+ // and (sext (ashr X, ShiftC)), C --> lshr (sext X), ShiftC
Value *Sext = Builder.CreateSExt(X, Ty);
Constant *ShAmtC = ConstantInt::get(Ty, ShiftC->zext(Width));
- return BinaryOperator::CreateLShr(Sext, ShAmtC);
- }
- }
+ return BinaryOperator::CreateLShr(Sext, ShAmtC);
+ }
+ }
const APInt *AddC;
if (match(Op0, m_Add(m_Value(X), m_APInt(AddC)))) {
@@ -1812,48 +1812,48 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
return BinaryOperator::CreateXor(NewAnd, Op1);
}
}
- }
-
+ }
+
ConstantInt *AndRHS;
if (match(Op1, m_ConstantInt(AndRHS))) {
- const APInt &AndRHSMask = AndRHS->getValue();
-
- // Optimize a variety of ((val OP C1) & C2) combinations...
- if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
- // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if C2 fits in the bitwidth
- // of X and OP behaves well when given trunc(C1) and X.
+ const APInt &AndRHSMask = AndRHS->getValue();
+
+ // Optimize a variety of ((val OP C1) & C2) combinations...
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+ // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if C2 fits in the bitwidth
+ // of X and OP behaves well when given trunc(C1) and X.
// TODO: Do this for vectors by using m_APInt instead of m_ConstantInt.
- switch (Op0I->getOpcode()) {
- default:
- break;
- case Instruction::Xor:
- case Instruction::Or:
- case Instruction::Mul:
- case Instruction::Add:
- case Instruction::Sub:
- Value *X;
- ConstantInt *C1;
- // TODO: The one use restrictions could be relaxed a little if the AND
- // is going to be removed.
- if (match(Op0I, m_OneUse(m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))),
- m_ConstantInt(C1))))) {
- if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
- auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
- Value *BinOp;
- Value *Op0LHS = Op0I->getOperand(0);
- if (isa<ZExtInst>(Op0LHS))
- BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
- else
- BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
- auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
- auto *And = Builder.CreateAnd(BinOp, TruncC2);
+ switch (Op0I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Xor:
+ case Instruction::Or:
+ case Instruction::Mul:
+ case Instruction::Add:
+ case Instruction::Sub:
+ Value *X;
+ ConstantInt *C1;
+ // TODO: The one use restrictions could be relaxed a little if the AND
+ // is going to be removed.
+ if (match(Op0I, m_OneUse(m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))),
+ m_ConstantInt(C1))))) {
+ if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
+ auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
+ Value *BinOp;
+ Value *Op0LHS = Op0I->getOperand(0);
+ if (isa<ZExtInst>(Op0LHS))
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
+ else
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
+ auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
+ auto *And = Builder.CreateAnd(BinOp, TruncC2);
return new ZExtInst(And, Ty);
- }
- }
- }
- }
+ }
+ }
+ }
+ }
}
-
+
if (match(&I, m_And(m_OneUse(m_Shl(m_ZExt(m_Value(X)), m_Value(Y))),
m_SignMask())) &&
match(Y, m_SpecificInt_ICMP(
@@ -1871,26 +1871,26 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
SanitizedSignMask =
Constant::mergeUndefsWith(SanitizedSignMask, cast<Constant>(Y));
return BinaryOperator::CreateAnd(SExt, SanitizedSignMask);
- }
-
- if (Instruction *Z = narrowMaskedBinOp(I))
- return Z;
-
- if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
- return FoldedLogic;
-
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
- return DeMorgan;
-
- {
- Value *A, *B, *C;
- // A & (A ^ B) --> A & ~B
- if (match(Op1, m_OneUse(m_c_Xor(m_Specific(Op0), m_Value(B)))))
- return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(B));
- // (A ^ B) & A --> A & ~B
- if (match(Op0, m_OneUse(m_c_Xor(m_Specific(Op1), m_Value(B)))))
- return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(B));
-
+ }
+
+ if (Instruction *Z = narrowMaskedBinOp(I))
+ return Z;
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
+
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ return DeMorgan;
+
+ {
+ Value *A, *B, *C;
+ // A & (A ^ B) --> A & ~B
+ if (match(Op1, m_OneUse(m_c_Xor(m_Specific(Op0), m_Value(B)))))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(B));
+ // (A ^ B) & A --> A & ~B
+ if (match(Op0, m_OneUse(m_c_Xor(m_Specific(Op1), m_Value(B)))))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(B));
+
// A & ~(A ^ B) --> A & B
if (match(Op1, m_Not(m_c_Xor(m_Specific(Op0), m_Value(B)))))
return BinaryOperator::CreateAnd(Op0, B);
@@ -1898,166 +1898,166 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (match(Op0, m_Not(m_c_Xor(m_Specific(Op1), m_Value(B)))))
return BinaryOperator::CreateAnd(Op1, B);
- // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
- if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
- if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
- return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C));
-
- // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C
- if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
- if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
- if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
- return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
-
- // (A | B) & ((~A) ^ B) -> (A & B)
- // (A | B) & (B ^ (~A)) -> (A & B)
- // (B | A) & ((~A) ^ B) -> (A & B)
- // (B | A) & (B ^ (~A)) -> (A & B)
- if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateAnd(A, B);
-
- // ((~A) ^ B) & (A | B) -> (A & B)
- // ((~A) ^ B) & (B | A) -> (A & B)
- // (B ^ (~A)) & (A | B) -> (A & B)
- // (B ^ (~A)) & (B | A) -> (A & B)
- if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateAnd(A, B);
- }
-
- {
- ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
- ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
- if (LHS && RHS)
- if (Value *Res = foldAndOfICmps(LHS, RHS, I))
- return replaceInstUsesWith(I, Res);
-
- // TODO: Make this recursive; it's a little tricky because an arbitrary
- // number of 'and' instructions might have to be created.
- if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
- }
- if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
- }
- }
-
- if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
- if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = foldLogicOfFCmps(LHS, RHS, true))
- return replaceInstUsesWith(I, Res);
-
- if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
- return FoldedFCmps;
-
- if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
- return CastedAnd;
-
- // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>.
- Value *A;
- if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
+ // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+ if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+ if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C));
+
+ // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C
+ if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+ if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+ if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
+
+ // (A | B) & ((~A) ^ B) -> (A & B)
+ // (A | B) & (B ^ (~A)) -> (A & B)
+ // (B | A) & ((~A) ^ B) -> (A & B)
+ // (B | A) & (B ^ (~A)) -> (A & B)
+ if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+
+ // ((~A) ^ B) & (A | B) -> (A & B)
+ // ((~A) ^ B) & (B | A) -> (A & B)
+ // (B ^ (~A)) & (A | B) -> (A & B)
+ // (B ^ (~A)) & (B | A) -> (A & B)
+ if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+ }
+
+ {
+ ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+ ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+ if (LHS && RHS)
+ if (Value *Res = foldAndOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, Res);
+
+ // TODO: Make this recursive; it's a little tricky because an arbitrary
+ // number of 'and' instructions might have to be created.
+ if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
+ }
+ if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
+ }
+ }
+
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+ if (Value *Res = foldLogicOfFCmps(LHS, RHS, true))
+ return replaceInstUsesWith(I, Res);
+
+ if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
+ return FoldedFCmps;
+
+ if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
+ return CastedAnd;
+
+ // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>.
+ Value *A;
+ if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, Op1, Constant::getNullValue(Ty));
- if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
+ if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, Op0, Constant::getNullValue(Ty));
-
- // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0.
+
+ // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0.
if (match(&I, m_c_And(m_OneUse(m_AShr(
m_NSWSub(m_Value(Y), m_Value(X)),
m_SpecificInt(Ty->getScalarSizeInBits() - 1))),
m_Deferred(X)))) {
Value *NewICmpInst = Builder.CreateICmpSGT(X, Y);
return SelectInst::Create(NewICmpInst, X, ConstantInt::getNullValue(Ty));
- }
-
+ }
+
// (~x) & y --> ~(x | (~y)) iff that gets rid of inversions
if (sinkNotIntoOtherHandOfAndOrOr(I))
return &I;
- return nullptr;
-}
-
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::matchBSwapOrBitReverse(BinaryOperator &Or,
bool MatchBSwaps,
bool MatchBitReversals) {
- assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
- Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
-
- // Look through zero extends.
- if (Instruction *Ext = dyn_cast<ZExtInst>(Op0))
- Op0 = Ext->getOperand(0);
-
- if (Instruction *Ext = dyn_cast<ZExtInst>(Op1))
- Op1 = Ext->getOperand(0);
-
- // (A | B) | C and A | (B | C) -> bswap if possible.
+ assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
+ Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
+
+ // Look through zero extends.
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op0))
+ Op0 = Ext->getOperand(0);
+
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op1))
+ Op1 = Ext->getOperand(0);
+
+ // (A | B) | C and A | (B | C) -> bswap if possible.
bool OrWithOrs = match(Op0, m_Or(m_Value(), m_Value())) ||
match(Op1, m_Or(m_Value(), m_Value()));
-
+
// (A >> B) | C and (A << B) | C -> bswap if possible.
bool OrWithShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) ||
match(Op1, m_LogicalShift(m_Value(), m_Value()));
-
+
// (A & B) | C and A | (B & C) -> bswap if possible.
bool OrWithAnds = match(Op0, m_And(m_Value(), m_Value())) ||
match(Op1, m_And(m_Value(), m_Value()));
-
+
// fshl(A,B,C) | D and A | fshl(B,C,D) -> bswap if possible.
// fshr(A,B,C) | D and A | fshr(B,C,D) -> bswap if possible.
bool OrWithFunnels = match(Op0, m_FShl(m_Value(), m_Value(), m_Value())) ||
match(Op0, m_FShr(m_Value(), m_Value(), m_Value())) ||
match(Op0, m_FShl(m_Value(), m_Value(), m_Value())) ||
match(Op0, m_FShr(m_Value(), m_Value(), m_Value()));
-
+
// TODO: Do we need all these filtering checks or should we just rely on
// recognizeBSwapOrBitReverseIdiom + collectBitParts to reject them quickly?
if (!OrWithOrs && !OrWithShifts && !OrWithAnds && !OrWithFunnels)
- return nullptr;
-
+ return nullptr;
+
SmallVector<Instruction *, 4> Insts;
if (!recognizeBSwapOrBitReverseIdiom(&Or, MatchBSwaps, MatchBitReversals,
Insts))
- return nullptr;
- Instruction *LastInst = Insts.pop_back_val();
- LastInst->removeFromParent();
-
- for (auto *Inst : Insts)
- Worklist.push(Inst);
- return LastInst;
-}
-
+ return nullptr;
+ Instruction *LastInst = Insts.pop_back_val();
+ LastInst->removeFromParent();
+
+ for (auto *Inst : Insts)
+ Worklist.push(Inst);
+ return LastInst;
+}
+
/// Match UB-safe variants of the funnel shift intrinsic.
static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
- // TODO: Can we reduce the code duplication between this and the related
- // rotate matching code under visitSelect and visitTrunc?
- unsigned Width = Or.getType()->getScalarSizeInBits();
-
+ // TODO: Can we reduce the code duplication between this and the related
+ // rotate matching code under visitSelect and visitTrunc?
+ unsigned Width = Or.getType()->getScalarSizeInBits();
+
// First, find an or'd pair of opposite shifts:
// or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1)
- BinaryOperator *Or0, *Or1;
- if (!match(Or.getOperand(0), m_BinOp(Or0)) ||
- !match(Or.getOperand(1), m_BinOp(Or1)))
- return nullptr;
-
+ BinaryOperator *Or0, *Or1;
+ if (!match(Or.getOperand(0), m_BinOp(Or0)) ||
+ !match(Or.getOperand(1), m_BinOp(Or1)))
+ return nullptr;
+
Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1;
if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) ||
!match(Or1, m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1)))) ||
Or0->getOpcode() == Or1->getOpcode())
- return nullptr;
-
+ return nullptr;
+
// Canonicalize to or(shl(ShVal0, ShAmt0), lshr(ShVal1, ShAmt1)).
if (Or0->getOpcode() == BinaryOperator::LShr) {
std::swap(Or0, Or1);
@@ -2067,7 +2067,7 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
assert(Or0->getOpcode() == BinaryOperator::Shl &&
Or1->getOpcode() == BinaryOperator::LShr &&
"Illegal or(shift,shift) pair");
-
+
// Match the shift amount operands for a funnel shift pattern. This always
// matches a subtraction on the R operand.
auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
@@ -2105,327 +2105,327 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
if (!isPowerOf2_32(Width))
return nullptr;
- // The shift amount may be masked with negation:
- // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
- Value *X;
- unsigned Mask = Width - 1;
- if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
- match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
- return X;
-
- // Similar to above, but the shift amount may be extended after masking,
- // so return the extended value as the parameter for the intrinsic.
- if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
- match(R, m_And(m_Neg(m_ZExt(m_And(m_Specific(X), m_SpecificInt(Mask)))),
- m_SpecificInt(Mask))))
- return L;
-
+ // The shift amount may be masked with negation:
+ // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
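+    // e.g. with Width == 32: X == 8 gives shift amounts 8 and (-8 & 31) == 24,
+    // which together cover all 32 bits (a rotate by 8); X == 0 degenerates to
+    // ShVal | ShVal == ShVal, i.e. a rotate by 0.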
+ Value *X;
+ unsigned Mask = Width - 1;
+ if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
+ match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
+ return X;
+
+ // Similar to above, but the shift amount may be extended after masking,
+ // so return the extended value as the parameter for the intrinsic.
+ if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
+ match(R, m_And(m_Neg(m_ZExt(m_And(m_Specific(X), m_SpecificInt(Mask)))),
+ m_SpecificInt(Mask))))
+ return L;
+
if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
return L;
- return nullptr;
- };
-
- Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, Width);
+ return nullptr;
+ };
+
+ Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, Width);
bool IsFshl = true; // Sub on LSHR.
- if (!ShAmt) {
- ShAmt = matchShiftAmount(ShAmt1, ShAmt0, Width);
+ if (!ShAmt) {
+ ShAmt = matchShiftAmount(ShAmt1, ShAmt0, Width);
IsFshl = false; // Sub on SHL.
- }
- if (!ShAmt)
- return nullptr;
-
- Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
- Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType());
+ }
+ if (!ShAmt)
+ return nullptr;
+
+ Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+ Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType());
return IntrinsicInst::Create(F, {ShVal0, ShVal1, ShAmt});
-}
-
-/// Attempt to combine or(zext(x),shl(zext(y),bw/2)) concat packing patterns.
-static Instruction *matchOrConcat(Instruction &Or,
- InstCombiner::BuilderTy &Builder) {
- assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
- Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
- Type *Ty = Or.getType();
-
- unsigned Width = Ty->getScalarSizeInBits();
- if ((Width & 1) != 0)
- return nullptr;
- unsigned HalfWidth = Width / 2;
-
- // Canonicalize zext (lower half) to LHS.
- if (!isa<ZExtInst>(Op0))
- std::swap(Op0, Op1);
-
- // Find lower/upper half.
- Value *LowerSrc, *ShlVal, *UpperSrc;
- const APInt *C;
- if (!match(Op0, m_OneUse(m_ZExt(m_Value(LowerSrc)))) ||
- !match(Op1, m_OneUse(m_Shl(m_Value(ShlVal), m_APInt(C)))) ||
- !match(ShlVal, m_OneUse(m_ZExt(m_Value(UpperSrc)))))
- return nullptr;
- if (*C != HalfWidth || LowerSrc->getType() != UpperSrc->getType() ||
- LowerSrc->getType()->getScalarSizeInBits() != HalfWidth)
- return nullptr;
-
- auto ConcatIntrinsicCalls = [&](Intrinsic::ID id, Value *Lo, Value *Hi) {
- Value *NewLower = Builder.CreateZExt(Lo, Ty);
- Value *NewUpper = Builder.CreateZExt(Hi, Ty);
- NewUpper = Builder.CreateShl(NewUpper, HalfWidth);
- Value *BinOp = Builder.CreateOr(NewLower, NewUpper);
- Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty);
- return Builder.CreateCall(F, BinOp);
- };
-
- // BSWAP: Push the concat down, swapping the lower/upper sources.
- // concat(bswap(x),bswap(y)) -> bswap(concat(x,y))
- Value *LowerBSwap, *UpperBSwap;
- if (match(LowerSrc, m_BSwap(m_Value(LowerBSwap))) &&
- match(UpperSrc, m_BSwap(m_Value(UpperBSwap))))
- return ConcatIntrinsicCalls(Intrinsic::bswap, UpperBSwap, LowerBSwap);
-
- // BITREVERSE: Push the concat down, swapping the lower/upper sources.
- // concat(bitreverse(x),bitreverse(y)) -> bitreverse(concat(x,y))
- Value *LowerBRev, *UpperBRev;
- if (match(LowerSrc, m_BitReverse(m_Value(LowerBRev))) &&
- match(UpperSrc, m_BitReverse(m_Value(UpperBRev))))
- return ConcatIntrinsicCalls(Intrinsic::bitreverse, UpperBRev, LowerBRev);
-
- return nullptr;
-}
-
-/// If all elements of two constant vectors are 0/-1 and inverses, return true.
-static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
+}
+
+/// Attempt to combine or(zext(x),shl(zext(y),bw/2)) concat packing patterns.
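+/// For example, an i32 built from two i16 halves has the form
+///   or (zext(lo)), (shl (zext(hi)), 16)
+/// i.e. the concatenation of hi and lo; the folds below push bswap/bitreverse
+/// through such a concatenation.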
+static Instruction *matchOrConcat(Instruction &Or,
+ InstCombiner::BuilderTy &Builder) {
+ assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
+ Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
+ Type *Ty = Or.getType();
+
+ unsigned Width = Ty->getScalarSizeInBits();
+ if ((Width & 1) != 0)
+ return nullptr;
+ unsigned HalfWidth = Width / 2;
+
+ // Canonicalize zext (lower half) to LHS.
+ if (!isa<ZExtInst>(Op0))
+ std::swap(Op0, Op1);
+
+ // Find lower/upper half.
+ Value *LowerSrc, *ShlVal, *UpperSrc;
+ const APInt *C;
+ if (!match(Op0, m_OneUse(m_ZExt(m_Value(LowerSrc)))) ||
+ !match(Op1, m_OneUse(m_Shl(m_Value(ShlVal), m_APInt(C)))) ||
+ !match(ShlVal, m_OneUse(m_ZExt(m_Value(UpperSrc)))))
+ return nullptr;
+ if (*C != HalfWidth || LowerSrc->getType() != UpperSrc->getType() ||
+ LowerSrc->getType()->getScalarSizeInBits() != HalfWidth)
+ return nullptr;
+
+ auto ConcatIntrinsicCalls = [&](Intrinsic::ID id, Value *Lo, Value *Hi) {
+ Value *NewLower = Builder.CreateZExt(Lo, Ty);
+ Value *NewUpper = Builder.CreateZExt(Hi, Ty);
+ NewUpper = Builder.CreateShl(NewUpper, HalfWidth);
+ Value *BinOp = Builder.CreateOr(NewLower, NewUpper);
+ Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty);
+ return Builder.CreateCall(F, BinOp);
+ };
+
+ // BSWAP: Push the concat down, swapping the lower/upper sources.
+ // concat(bswap(x),bswap(y)) -> bswap(concat(x,y))
+ Value *LowerBSwap, *UpperBSwap;
+ if (match(LowerSrc, m_BSwap(m_Value(LowerBSwap))) &&
+ match(UpperSrc, m_BSwap(m_Value(UpperBSwap))))
+ return ConcatIntrinsicCalls(Intrinsic::bswap, UpperBSwap, LowerBSwap);
+
+ // BITREVERSE: Push the concat down, swapping the lower/upper sources.
+ // concat(bitreverse(x),bitreverse(y)) -> bitreverse(concat(x,y))
+ Value *LowerBRev, *UpperBRev;
+ if (match(LowerSrc, m_BitReverse(m_Value(LowerBRev))) &&
+ match(UpperSrc, m_BitReverse(m_Value(UpperBRev))))
+ return ConcatIntrinsicCalls(Intrinsic::bitreverse, UpperBRev, LowerBRev);
+
+ return nullptr;
+}
+
+/// If all elements of two constant vectors are 0/-1 and inverses, return true.
+static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
unsigned NumElts = cast<FixedVectorType>(C1->getType())->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *EltC1 = C1->getAggregateElement(i);
- Constant *EltC2 = C2->getAggregateElement(i);
- if (!EltC1 || !EltC2)
- return false;
-
- // One element must be all ones, and the other must be all zeros.
- if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
- (match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
- return false;
- }
- return true;
-}
-
-/// We have an expression of the form (A & C) | (B & D). If A is a scalar or
-/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
-/// B, it can be used as the condition operand of a select instruction.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *EltC1 = C1->getAggregateElement(i);
+ Constant *EltC2 = C2->getAggregateElement(i);
+ if (!EltC1 || !EltC2)
+ return false;
+
+ // One element must be all ones, and the other must be all zeros.
+ if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
+ (match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
+ return false;
+ }
+ return true;
+}
+
+/// We have an expression of the form (A & C) | (B & D). If A is a scalar or
+/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
+/// B, it can be used as the condition operand of a select instruction.
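+/// For example, if A == sext(i1 %c) (so A is all-zeros or all-ones) and
+/// B == ~A, then (A & C) | (B & D) yields C when %c is true and D when %c is
+/// false, which is exactly 'select %c, C, D'.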
Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) {
- // Step 1: We may have peeked through bitcasts in the caller.
- // Exit immediately if we don't have (vector) integer types.
- Type *Ty = A->getType();
- if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- // Step 2: We need 0 or all-1's bitmasks.
- if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
- return nullptr;
-
- // Step 3: If B is the 'not' value of A, we have our answer.
- if (match(A, m_Not(m_Specific(B)))) {
- // If these are scalars or vectors of i1, A can be used directly.
- if (Ty->isIntOrIntVectorTy(1))
- return A;
- return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
- }
-
- // If both operands are constants, see if the constants are inverse bitmasks.
- Constant *AConst, *BConst;
- if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
- if (AConst == ConstantExpr::getNot(BConst))
- return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
-
- // Look for more complex patterns. The 'not' op may be hidden behind various
- // casts. Look through sexts and bitcasts to find the booleans.
- Value *Cond;
- Value *NotB;
- if (match(A, m_SExt(m_Value(Cond))) &&
- Cond->getType()->isIntOrIntVectorTy(1) &&
- match(B, m_OneUse(m_Not(m_Value(NotB))))) {
- NotB = peekThroughBitcast(NotB, true);
- if (match(NotB, m_SExt(m_Specific(Cond))))
- return Cond;
- }
-
- // All scalar (and most vector) possibilities should be handled now.
- // Try more matches that only apply to non-splat constant vectors.
- if (!Ty->isVectorTy())
- return nullptr;
-
- // If both operands are xor'd with constants using the same sexted boolean
- // operand, see if the constants are inverse bitmasks.
- // TODO: Use ConstantExpr::getNot()?
- if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
- match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
- Cond->getType()->isIntOrIntVectorTy(1) &&
- areInverseVectorBitmasks(AConst, BConst)) {
- AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
- return Builder.CreateXor(Cond, AConst);
- }
- return nullptr;
-}
-
-/// We have an expression of the form (A & C) | (B & D). Try to simplify this
-/// to "A' ? C : D", where A' is a boolean or vector of booleans.
+ // Step 1: We may have peeked through bitcasts in the caller.
+ // Exit immediately if we don't have (vector) integer types.
+ Type *Ty = A->getType();
+ if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ // Step 2: We need 0 or all-1's bitmasks.
+ if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
+ return nullptr;
+
+ // Step 3: If B is the 'not' value of A, we have our answer.
+ if (match(A, m_Not(m_Specific(B)))) {
+ // If these are scalars or vectors of i1, A can be used directly.
+ if (Ty->isIntOrIntVectorTy(1))
+ return A;
+ return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
+ }
+
+ // If both operands are constants, see if the constants are inverse bitmasks.
+ Constant *AConst, *BConst;
+ if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
+ if (AConst == ConstantExpr::getNot(BConst))
+ return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
+
+ // Look for more complex patterns. The 'not' op may be hidden behind various
+ // casts. Look through sexts and bitcasts to find the booleans.
+ Value *Cond;
+ Value *NotB;
+ if (match(A, m_SExt(m_Value(Cond))) &&
+ Cond->getType()->isIntOrIntVectorTy(1) &&
+ match(B, m_OneUse(m_Not(m_Value(NotB))))) {
+ NotB = peekThroughBitcast(NotB, true);
+ if (match(NotB, m_SExt(m_Specific(Cond))))
+ return Cond;
+ }
+
+ // All scalar (and most vector) possibilities should be handled now.
+ // Try more matches that only apply to non-splat constant vectors.
+ if (!Ty->isVectorTy())
+ return nullptr;
+
+ // If both operands are xor'd with constants using the same sexted boolean
+ // operand, see if the constants are inverse bitmasks.
+ // TODO: Use ConstantExpr::getNot()?
+ if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
+ match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
+ Cond->getType()->isIntOrIntVectorTy(1) &&
+ areInverseVectorBitmasks(AConst, BConst)) {
+ AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
+ return Builder.CreateXor(Cond, AConst);
+ }
+ return nullptr;
+}
+
+/// We have an expression of the form (A & C) | (B & D). Try to simplify this
+/// to "A' ? C : D", where A' is a boolean or vector of booleans.
Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
Value *D) {
- // The potential condition of the select may be bitcasted. In that case, look
- // through its bitcast and the corresponding bitcast of the 'not' condition.
- Type *OrigType = A->getType();
- A = peekThroughBitcast(A, true);
- B = peekThroughBitcast(B, true);
- if (Value *Cond = getSelectCondition(A, B)) {
- // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
- // The bitcasts will either all exist or all not exist. The builder will
- // not create unnecessary casts if the types already match.
- Value *BitcastC = Builder.CreateBitCast(C, A->getType());
- Value *BitcastD = Builder.CreateBitCast(D, A->getType());
- Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
- return Builder.CreateBitCast(Select, OrigType);
- }
-
- return nullptr;
-}
-
-/// Fold (icmp)|(icmp) if possible.
+ // The potential condition of the select may be bitcasted. In that case, look
+ // through its bitcast and the corresponding bitcast of the 'not' condition.
+ Type *OrigType = A->getType();
+ A = peekThroughBitcast(A, true);
+ B = peekThroughBitcast(B, true);
+ if (Value *Cond = getSelectCondition(A, B)) {
+ // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
+ // The bitcasts will either all exist or all not exist. The builder will
+ // not create unnecessary casts if the types already match.
+ Value *BitcastC = Builder.CreateBitCast(C, A->getType());
+ Value *BitcastD = Builder.CreateBitCast(D, A->getType());
+ Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
+ return Builder.CreateBitCast(Select, OrigType);
+ }
+
+ return nullptr;
+}
+
+/// Fold (icmp)|(icmp) if possible.
Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &Or) {
- const SimplifyQuery Q = SQ.getWithInstruction(&Or);
-
- // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
- // if K1 and K2 are one-bit masks.
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, Or))
- return V;
-
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ const SimplifyQuery Q = SQ.getWithInstruction(&Or);
+
+ // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+ // if K1 and K2 are one-bit masks.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, Or))
+ return V;
+
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1);
auto *LHSC = dyn_cast<ConstantInt>(LHS1);
auto *RHSC = dyn_cast<ConstantInt>(RHS1);
-
- // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
- // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
- // The original condition actually refers to the following two ranges:
- // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3]
- // We can fold these two ranges if:
- // 1) C1 and C2 are unsigned greater than C3.
- // 2) The two ranges are separated.
- // 3) C1 ^ C2 is a one-bit mask.
- // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit masks.
- // This implies all values in the two ranges differ by exactly one bit.
- if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
- PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
- LHSC->getType() == RHSC->getType() &&
- LHSC->getValue() == (RHSC->getValue())) {
-
+
+ // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
+ // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
+ // The original condition actually refers to the following two ranges:
+ // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3]
+ // We can fold these two ranges if:
+ // 1) C1 and C2 are unsigned greater than C3.
+ // 2) The two ranges are separated.
+ // 3) C1 ^ C2 is a one-bit mask.
+ // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit masks.
+ // This implies all values in the two ranges differ by exactly one bit.
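+ // Illustrative instance (hypothetical i8 operands):
+ //   (icmp ult (A + 4), 2) | (icmp ult (A + 12), 2)
+ // has C1 = 4, C2 = 12, C3 = 2; C1 ^ C2 = 8 is a one-bit mask, so this becomes
+ //   icmp ult ((A & ~8) + 12), 2
+ // which, like the original, is true exactly for A in {244, 245, 252, 253}.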
+ if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
+ PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
+ LHSC->getType() == RHSC->getType() &&
+ LHSC->getValue() == (RHSC->getValue())) {
+
Value *AddOpnd;
- ConstantInt *LAddC, *RAddC;
+ ConstantInt *LAddC, *RAddC;
if (match(LHS0, m_Add(m_Value(AddOpnd), m_ConstantInt(LAddC))) &&
match(RHS0, m_Add(m_Specific(AddOpnd), m_ConstantInt(RAddC))) &&
- LAddC->getValue().ugt(LHSC->getValue()) &&
- RAddC->getValue().ugt(LHSC->getValue())) {
-
- APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+ LAddC->getValue().ugt(LHSC->getValue()) &&
+ RAddC->getValue().ugt(LHSC->getValue())) {
+
+ APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
if (DiffC.isPowerOf2()) {
- ConstantInt *MaxAddC = nullptr;
- if (LAddC->getValue().ult(RAddC->getValue()))
- MaxAddC = RAddC;
- else
- MaxAddC = LAddC;
-
- APInt RRangeLow = -RAddC->getValue();
- APInt RRangeHigh = RRangeLow + LHSC->getValue();
- APInt LRangeLow = -LAddC->getValue();
- APInt LRangeHigh = LRangeLow + LHSC->getValue();
- APInt LowRangeDiff = RRangeLow ^ LRangeLow;
- APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
- APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
- : RRangeLow - LRangeLow;
-
- if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
- RangeDiff.ugt(LHSC->getValue())) {
- Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
-
+ ConstantInt *MaxAddC = nullptr;
+ if (LAddC->getValue().ult(RAddC->getValue()))
+ MaxAddC = RAddC;
+ else
+ MaxAddC = LAddC;
+
+ APInt RRangeLow = -RAddC->getValue();
+ APInt RRangeHigh = RRangeLow + LHSC->getValue();
+ APInt LRangeLow = -LAddC->getValue();
+ APInt LRangeHigh = LRangeLow + LHSC->getValue();
+ APInt LowRangeDiff = RRangeLow ^ LRangeLow;
+ APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
+ APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
+ : RRangeLow - LRangeLow;
+
+ if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
+ RangeDiff.ugt(LHSC->getValue())) {
+ Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
+
Value *NewAnd = Builder.CreateAnd(AddOpnd, MaskC);
- Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC);
- return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC);
- }
- }
- }
- }
-
- // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
- if (predicatesFoldable(PredL, PredR)) {
+ Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC);
+ return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC);
+ }
+ }
+ }
+ }
+
+ // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
+ if (predicatesFoldable(PredL, PredR)) {
if (LHS0 == RHS1 && LHS1 == RHS0)
- LHS->swapOperands();
+ LHS->swapOperands();
if (LHS0 == RHS0 && LHS1 == RHS1) {
- unsigned Code = getICmpCode(LHS) | getICmpCode(RHS);
- bool IsSigned = LHS->isSigned() || RHS->isSigned();
+ unsigned Code = getICmpCode(LHS) | getICmpCode(RHS);
+ bool IsSigned = LHS->isSigned() || RHS->isSigned();
return getNewICmpValue(Code, IsSigned, LHS0, LHS1, Builder);
- }
- }
-
- // handle (roughly):
- // (icmp ne (A & B), C) | (icmp ne (A & D), E)
- if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
- return V;
-
- if (LHS->hasOneUse() || RHS->hasOneUse()) {
- // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
- // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
- Value *A = nullptr, *B = nullptr;
+ }
+ }
+
+ // handle (roughly):
+ // (icmp ne (A & B), C) | (icmp ne (A & D), E)
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
+ return V;
+
+ if (LHS->hasOneUse() || RHS->hasOneUse()) {
+ // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
+ // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
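+ // Rough justification: if B == 0 then B-1 wraps to the all-ones value, so
+ // A u<= B-1 is trivially true and covers the 'icmp eq' side; otherwise
+ // A u<= B-1 is the same as A u< B.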
+ Value *A = nullptr, *B = nullptr;
if (PredL == ICmpInst::ICMP_EQ && match(LHS1, m_Zero())) {
- B = LHS0;
+ B = LHS0;
if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS1)
- A = RHS0;
- else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
+ A = RHS0;
+ else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
A = RHS1;
- }
- // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
- // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
+ }
+ // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
+ // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
else if (PredR == ICmpInst::ICMP_EQ && match(RHS1, m_Zero())) {
- B = RHS0;
+ B = RHS0;
if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS1)
- A = LHS0;
+ A = LHS0;
else if (PredL == ICmpInst::ICMP_UGT && RHS0 == LHS0)
A = LHS1;
- }
+ }
if (A && B && B->getType()->isIntOrIntVectorTy())
- return Builder.CreateICmp(
- ICmpInst::ICMP_UGE,
+ return Builder.CreateICmp(
+ ICmpInst::ICMP_UGE,
Builder.CreateAdd(B, Constant::getAllOnesValue(B->getType())), A);
- }
-
- if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, Or, Builder, Q))
- return V;
- if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, Or, Builder, Q))
- return V;
-
- // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
- if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true))
- return V;
-
- // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n
- if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
- return V;
-
- if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder))
- return V;
-
- if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder))
- return V;
-
- if (Value *X =
- foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder))
- return X;
- if (Value *X =
- foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder))
- return X;
-
+ }
+
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, Or, Builder, Q))
+ return V;
+ if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, Or, Builder, Q))
+ return V;
+
+ // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+ if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true))
+ return V;
+
+ // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n
+ if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
+ return V;
+
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder))
+ return V;
+
+ if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder))
+ return V;
+
+ if (Value *X =
+ foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder))
+ return X;
+ if (Value *X =
+ foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder))
+ return X;
+
// (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
// TODO: Remove this when foldLogOpOfMaskedICmps can handle vectors.
if (PredL == ICmpInst::ICMP_NE && match(LHS1, m_Zero()) &&
@@ -2437,666 +2437,666 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Constant::getNullValue(NewOr->getType()));
}
- // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
- if (!LHSC || !RHSC)
- return nullptr;
-
- // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
- // iff C2 + CA == C1.
- if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
- ConstantInt *AddC;
- if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
- if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
- return Builder.CreateICmpULE(LHS0, LHSC);
- }
-
- // From here on, we only handle:
- // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
- if (LHS0 != RHS0)
- return nullptr;
-
- // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
- if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
- PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
- PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
- PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
- return nullptr;
-
- // We can't fold (ugt x, C) | (sgt x, C2).
- if (!predicatesFoldable(PredL, PredR))
- return nullptr;
-
- // Ensure that the larger constant is on the RHS.
- bool ShouldSwap;
- if (CmpInst::isSigned(PredL) ||
- (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
- ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
- else
- ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
-
- if (ShouldSwap) {
- std::swap(LHS, RHS);
- std::swap(LHSC, RHSC);
- std::swap(PredL, PredR);
- }
-
- // At this point, we know we have two icmp instructions
- // comparing a value against two constants and or'ing the result
- // together. Because of the above check, we know that we only have
- // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
- // icmp folding check above), that the two constants are not
- // equal.
- assert(LHSC != RHSC && "Compares not folded above?");
-
- switch (PredL) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- // Potential folds for this case should already be handled.
- break;
- case ICmpInst::ICMP_UGT:
- // (X == 0 || X u> C) -> (X-1) u>= C
- if (LHSC->isMinValue(false))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
- false, false);
- // (X == 13 | X u> 14) -> no change
- break;
- case ICmpInst::ICMP_SGT:
- // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN
- if (LHSC->isMinValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
- true, false);
- // (X == 13 | X s> 14) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_ULT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
- // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C
- if (RHSC->isMaxValue(false))
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
- false, false);
- break;
- case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
- assert(!RHSC->isMaxValue(false) && "Missed icmp simplification");
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
- false, false);
- }
- break;
- case ICmpInst::ICMP_SLT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C
- if (RHSC->isMaxValue(true))
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
- true, false);
- // (X s< 13 | X == 14) -> no change
- break;
- case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2
- assert(!RHSC->isMaxValue(true) && "Missed icmp simplification");
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
- false);
- }
- break;
- }
- return nullptr;
-}
-
-// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
-// here. We should standardize that construct where it is needed or choose some
-// other way to ensure that commutated variants of patterns are not missed.
+ // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
+ if (!LHSC || !RHSC)
+ return nullptr;
+
+ // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
+ // iff C2 + CA == C1.
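+ // Illustrative instance (hypothetical constants CA = 5, C1 = 10, C2 = 5):
+ // (X + 5) u< 10 covers X + 5 in [0, 9] and X == 5 adds the single point
+ // X + 5 == 10, so the union is exactly (X + 5) u<= 10.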
+ if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
+ ConstantInt *AddC;
+ if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
+ if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
+ return Builder.CreateICmpULE(LHS0, LHSC);
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
+ if (LHS0 != RHS0)
+ return nullptr;
+
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
+ return nullptr;
+
+ // We can't fold (ugt x, C) | (sgt x, C2).
+ if (!predicatesFoldable(PredL, PredR))
+ return nullptr;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
+ else
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and or'ing the result
+ // together. Because of the above check, we know that we only have
+ // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
+ // icmp folding check above), that the two constants are not
+ // equal.
+ assert(LHSC != RHSC && "Compares not folded above?");
+
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ // Potential folds for this case should already be handled.
+ break;
+ case ICmpInst::ICMP_UGT:
+ // (X == 0 || X u> C) -> (X-1) u>= C
+ if (LHSC->isMinValue(false))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
+ false, false);
+ // (X == 13 | X u> 14) -> no change
+ break;
+ case ICmpInst::ICMP_SGT:
+ // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN
+ if (LHSC->isMinValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
+ true, false);
+ // (X == 13 | X s> 14) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_ULT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
+ // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C
+ if (RHSC->isMaxValue(false))
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
+ false, false);
+ break;
+ case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+ assert(!RHSC->isMaxValue(false) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
+ false, false);
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C
+ if (RHSC->isMaxValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
+ true, false);
+ // (X s< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2
+ assert(!RHSC->isMaxValue(true) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
+ false);
+ }
+ break;
+ }
+ return nullptr;
+}
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
- if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // Do this before using distributive laws to catch simple and/or/not patterns.
- if (Instruction *Xor = foldOrToXor(I, Builder))
- return Xor;
-
- // (A&B)|(A&C) -> A&(B|C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyBSwap(I, Builder))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
- return FoldedLogic;
-
+ if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldOrToXor(I, Builder))
+ return Xor;
+
+ // (A&B)|(A&C) -> A&(B|C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
+
if (Instruction *BSwap = matchBSwapOrBitReverse(I, /*MatchBSwaps*/ true,
/*MatchBitReversals*/ false))
- return BSwap;
-
+ return BSwap;
+
if (Instruction *Funnel = matchFunnelShift(I, *this))
return Funnel;
-
- if (Instruction *Concat = matchOrConcat(I, Builder))
- return replaceInstUsesWith(I, Concat);
-
- Value *X, *Y;
- const APInt *CV;
- if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) &&
- !CV->isAllOnesValue() && MaskedValueIsZero(Y, *CV, 0, &I)) {
- // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0
- // The check for a 'not' op is for efficiency (if Y is known zero --> ~X).
- Value *Or = Builder.CreateOr(X, Y);
- return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV));
- }
-
- // (A & C)|(B & D)
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *A, *B, *C, *D;
- if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
- match(Op1, m_And(m_Value(B), m_Value(D)))) {
+
+ if (Instruction *Concat = matchOrConcat(I, Builder))
+ return replaceInstUsesWith(I, Concat);
+
+ Value *X, *Y;
+ const APInt *CV;
+ if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) &&
+ !CV->isAllOnesValue() && MaskedValueIsZero(Y, *CV, 0, &I)) {
+ // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0
+ // The check for a 'not' op is for efficiency (if Y is known zero --> ~X).
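+ // Rough justification: on bits where C is 0 both sides reduce to X | Y, and
+ // on bits where C is 1 we know Y is 0, so both sides reduce to ~X.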
+ Value *Or = Builder.CreateOr(X, Y);
+ return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV));
+ }
+
+ // (A & C)|(B & D)
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *A, *B, *C, *D;
+ if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+ match(Op1, m_And(m_Value(B), m_Value(D)))) {
// (A & C1)|(B & C2)
ConstantInt *C1, *C2;
if (match(C, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2))) {
- Value *V1 = nullptr, *V2 = nullptr;
- if ((C1->getValue() & C2->getValue()).isNullValue()) {
- // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
- // iff (C1&C2) == 0 and (N&~C1) == 0
- if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
- ((V1 == B &&
- MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N)
- (V2 == B &&
- MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V)
- return BinaryOperator::CreateAnd(A,
- Builder.getInt(C1->getValue()|C2->getValue()));
- // Or commutes, try both ways.
- if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
- ((V1 == A &&
- MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N)
- (V2 == A &&
- MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V)
- return BinaryOperator::CreateAnd(B,
- Builder.getInt(C1->getValue()|C2->getValue()));
-
- // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
- // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
- ConstantInt *C3 = nullptr, *C4 = nullptr;
- if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
- (C3->getValue() & ~C1->getValue()).isNullValue() &&
- match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
- (C4->getValue() & ~C2->getValue()).isNullValue()) {
- V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
- return BinaryOperator::CreateAnd(V2,
- Builder.getInt(C1->getValue()|C2->getValue()));
- }
- }
-
- if (C1->getValue() == ~C2->getValue()) {
- Value *X;
-
- // ((X|B)&C1)|(B&C2) -> (X&C1) | B iff C1 == ~C2
- if (match(A, m_c_Or(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateOr(Builder.CreateAnd(X, C1), B);
- // (A&C2)|((X|A)&C1) -> (X&C2) | A iff C1 == ~C2
- if (match(B, m_c_Or(m_Specific(A), m_Value(X))))
- return BinaryOperator::CreateOr(Builder.CreateAnd(X, C2), A);
-
- // ((X^B)&C1)|(B&C2) -> (X&C1) ^ B iff C1 == ~C2
- if (match(A, m_c_Xor(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, C1), B);
- // (A&C2)|((X^A)&C1) -> (X&C2) ^ A iff C1 == ~C2
- if (match(B, m_c_Xor(m_Specific(A), m_Value(X))))
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, C2), A);
- }
- }
-
- // Don't try to form a select if it's unlikely that we'll get rid of at
- // least one of the operands. A select is generally more expensive than the
- // 'or' that it is replacing.
- if (Op0->hasOneUse() || Op1->hasOneUse()) {
- // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
- if (Value *V = matchSelectFromAndOr(A, C, B, D))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(A, C, D, B))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(C, A, B, D))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(C, A, D, B))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(B, D, A, C))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(B, D, C, A))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(D, B, A, C))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(D, B, C, A))
- return replaceInstUsesWith(I, V);
- }
- }
-
- // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
- if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
- return BinaryOperator::CreateOr(Op0, C);
-
- // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
- if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
- if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
- return BinaryOperator::CreateOr(Op1, C);
-
- // ((B | C) & A) | B -> B | (A & C)
- if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
- return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
-
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
- return DeMorgan;
-
- // Canonicalize xor to the RHS.
- bool SwappedForXor = false;
- if (match(Op0, m_Xor(m_Value(), m_Value()))) {
- std::swap(Op0, Op1);
- SwappedForXor = true;
- }
-
- // A | ( A ^ B) -> A | B
- // A | (~A ^ B) -> A | ~B
- // (A & B) | (A ^ B)
- if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
- if (Op0 == A || Op0 == B)
- return BinaryOperator::CreateOr(A, B);
-
- if (match(Op0, m_And(m_Specific(A), m_Specific(B))) ||
- match(Op0, m_And(m_Specific(B), m_Specific(A))))
- return BinaryOperator::CreateOr(A, B);
-
- if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
- Value *Not = Builder.CreateNot(B, B->getName() + ".not");
- return BinaryOperator::CreateOr(Not, Op0);
- }
- if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) {
- Value *Not = Builder.CreateNot(A, A->getName() + ".not");
- return BinaryOperator::CreateOr(Not, Op0);
- }
- }
-
- // A | ~(A | B) -> A | ~B
- // A | ~(A ^ B) -> A | ~B
- if (match(Op1, m_Not(m_Value(A))))
- if (BinaryOperator *B = dyn_cast<BinaryOperator>(A))
- if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) &&
- Op1->hasOneUse() && (B->getOpcode() == Instruction::Or ||
- B->getOpcode() == Instruction::Xor)) {
- Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
- B->getOperand(0);
- Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not");
- return BinaryOperator::CreateOr(Not, Op0);
- }
-
- if (SwappedForXor)
- std::swap(Op0, Op1);
-
- {
- ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
- ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
- if (LHS && RHS)
- if (Value *Res = foldOrOfICmps(LHS, RHS, I))
- return replaceInstUsesWith(I, Res);
-
- // TODO: Make this recursive; it's a little tricky because an arbitrary
- // number of 'or' instructions might have to be created.
- Value *X, *Y;
- if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
- }
- if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
- }
- }
-
- if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
- if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = foldLogicOfFCmps(LHS, RHS, false))
- return replaceInstUsesWith(I, Res);
-
- if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
- return FoldedFCmps;
-
- if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
- return CastedOr;
-
- // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
- if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
- if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
-
- // Note: If we've gotten to the point of visiting the outer OR, then the
- // inner one couldn't be simplified. If it was a constant, then it won't
- // be simplified by a later pass either, so we try swapping the inner/outer
- // ORs in the hopes that we'll be able to simplify it this way.
- // (X|C) | V --> (X|V) | C
- ConstantInt *CI;
+ Value *V1 = nullptr, *V2 = nullptr;
+ if ((C1->getValue() & C2->getValue()).isNullValue()) {
+ // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
+ // iff (C1&C2) == 0 and (N&~C1) == 0
+ if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
+ ((V1 == B &&
+ MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N)
+ (V2 == B &&
+ MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V)
+ return BinaryOperator::CreateAnd(A,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+ // Or commutes, try both ways.
+ if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
+ ((V1 == A &&
+ MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N)
+ (V2 == A &&
+ MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V)
+ return BinaryOperator::CreateAnd(B,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+
+ // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
+ // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
+ ConstantInt *C3 = nullptr, *C4 = nullptr;
+ if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
+ (C3->getValue() & ~C1->getValue()).isNullValue() &&
+ match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
+ (C4->getValue() & ~C2->getValue()).isNullValue()) {
+ V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
+ return BinaryOperator::CreateAnd(V2,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+ }
+ }
+
+ if (C1->getValue() == ~C2->getValue()) {
+ Value *X;
+
+ // ((X|B)&C1)|(B&C2) -> (X&C1) | B iff C1 == ~C2
+ if (match(A, m_c_Or(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateOr(Builder.CreateAnd(X, C1), B);
+ // (A&C2)|((X|A)&C1) -> (X&C2) | A iff C1 == ~C2
+ if (match(B, m_c_Or(m_Specific(A), m_Value(X))))
+ return BinaryOperator::CreateOr(Builder.CreateAnd(X, C2), A);
+
+ // ((X^B)&C1)|(B&C2) -> (X&C1) ^ B iff C1 == ~C2
+ if (match(A, m_c_Xor(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, C1), B);
+ // (A&C2)|((X^A)&C1) -> (X&C2) ^ A iff C1 == ~C2
+ if (match(B, m_c_Xor(m_Specific(A), m_Value(X))))
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, C2), A);
+ }
+ }
+
+ // Don't try to form a select if it's unlikely that we'll get rid of at
+ // least one of the operands. A select is generally more expensive than the
+ // 'or' that it is replacing.
+ if (Op0->hasOneUse() || Op1->hasOneUse()) {
+ // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
+ if (Value *V = matchSelectFromAndOr(A, C, B, D))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(A, C, D, B))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, B, D))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, D, B))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, A, C))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, C, A))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, A, C))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, C, A))
+ return replaceInstUsesWith(I, V);
+ }
+ }
+
+ // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+ if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+ return BinaryOperator::CreateOr(Op0, C);
+
+ // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
+ if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+ if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+ return BinaryOperator::CreateOr(Op1, C);
+
+ // ((B | C) & A) | B -> B | (A & C)
+ if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
+ return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
+
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ return DeMorgan;
+
+ // Canonicalize xor to the RHS.
+ bool SwappedForXor = false;
+ if (match(Op0, m_Xor(m_Value(), m_Value()))) {
+ std::swap(Op0, Op1);
+ SwappedForXor = true;
+ }
+
+ // A | ( A ^ B) -> A | B
+ // A | (~A ^ B) -> A | ~B
+ // (A & B) | (A ^ B)
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+ if (Op0 == A || Op0 == B)
+ return BinaryOperator::CreateOr(A, B);
+
+ if (match(Op0, m_And(m_Specific(A), m_Specific(B))) ||
+ match(Op0, m_And(m_Specific(B), m_Specific(A))))
+ return BinaryOperator::CreateOr(A, B);
+
+ if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
+ Value *Not = Builder.CreateNot(B, B->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+ if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) {
+ Value *Not = Builder.CreateNot(A, A->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+ }
+
+ // A | ~(A | B) -> A | ~B
+ // A | ~(A ^ B) -> A | ~B
+ if (match(Op1, m_Not(m_Value(A))))
+ if (BinaryOperator *B = dyn_cast<BinaryOperator>(A))
+ if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) &&
+ Op1->hasOneUse() && (B->getOpcode() == Instruction::Or ||
+ B->getOpcode() == Instruction::Xor)) {
+ Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
+ B->getOperand(0);
+ Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+
+ if (SwappedForXor)
+ std::swap(Op0, Op1);
+
+ {
+ ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+ ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+ if (LHS && RHS)
+ if (Value *Res = foldOrOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, Res);
+
+ // TODO: Make this recursive; it's a little tricky because an arbitrary
+ // number of 'or' instructions might have to be created.
+ Value *X, *Y;
+ if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
+ }
+ if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
+ }
+ }
+
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+ if (Value *Res = foldLogicOfFCmps(LHS, RHS, false))
+ return replaceInstUsesWith(I, Res);
+
+ if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
+ return FoldedFCmps;
+
+ if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
+ return CastedOr;
+
+ // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
+ if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
+ if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
+
+ // Note: If we've gotten to the point of visiting the outer OR, then the
+ // inner one couldn't be simplified. If it was a constant, then it won't
+ // be simplified by a later pass either, so we try swapping the inner/outer
+ // ORs in the hopes that we'll be able to simplify it this way.
+ // (X|C) | V --> (X|V) | C
+ ConstantInt *CI;
if (Op0->hasOneUse() && !match(Op1, m_ConstantInt()) &&
- match(Op0, m_Or(m_Value(A), m_ConstantInt(CI)))) {
- Value *Inner = Builder.CreateOr(A, Op1);
- Inner->takeName(Op0);
- return BinaryOperator::CreateOr(Inner, CI);
- }
-
- // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
- // Since this OR statement hasn't been optimized further yet, we hope
- // that this transformation will allow the new ORs to be optimized.
- {
- Value *X = nullptr, *Y = nullptr;
- if (Op0->hasOneUse() && Op1->hasOneUse() &&
- match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
- match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
- Value *orTrue = Builder.CreateOr(A, C);
- Value *orFalse = Builder.CreateOr(B, D);
- return SelectInst::Create(X, orTrue, orFalse);
- }
- }
-
+ match(Op0, m_Or(m_Value(A), m_ConstantInt(CI)))) {
+ Value *Inner = Builder.CreateOr(A, Op1);
+ Inner->takeName(Op0);
+ return BinaryOperator::CreateOr(Inner, CI);
+ }
+
+ // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
+ // Since this OR statement hasn't been optimized further yet, we hope
+ // that this transformation will allow the new ORs to be optimized.
+ {
+ Value *X = nullptr, *Y = nullptr;
+ if (Op0->hasOneUse() && Op1->hasOneUse() &&
+ match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
+ match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
+ Value *orTrue = Builder.CreateOr(A, C);
+ Value *orFalse = Builder.CreateOr(B, D);
+ return SelectInst::Create(X, orTrue, orFalse);
+ }
+ }
+
// or(ashr(subNSW(Y, X), ScalarSizeInBits(Y) - 1), X) --> X s> Y ? -1 : X.
- {
- Value *X, *Y;
- Type *Ty = I.getType();
+ {
+ Value *X, *Y;
+ Type *Ty = I.getType();
if (match(&I, m_c_Or(m_OneUse(m_AShr(
m_NSWSub(m_Value(Y), m_Value(X)),
m_SpecificInt(Ty->getScalarSizeInBits() - 1))),
m_Deferred(X)))) {
- Value *NewICmpInst = Builder.CreateICmpSGT(X, Y);
+ Value *NewICmpInst = Builder.CreateICmpSGT(X, Y);
Value *AllOnes = ConstantInt::getAllOnesValue(Ty);
return SelectInst::Create(NewICmpInst, AllOnes, X);
- }
- }
-
- if (Instruction *V =
- canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
- return V;
-
- CmpInst::Predicate Pred;
- Value *Mul, *Ov, *MulIsNotZero, *UMulWithOv;
- // Check if the OR weakens the overflow condition for umul.with.overflow by
- // treating any non-zero result as overflow. In that case, we overflow if both
- // umul.with.overflow operands are != 0, since in that case the result can
- // only be 0 if the multiplication overflows.
- if (match(&I,
- m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
- m_Value(Ov)),
- m_CombineAnd(m_ICmp(Pred,
- m_CombineAnd(m_ExtractValue<0>(
- m_Deferred(UMulWithOv)),
- m_Value(Mul)),
- m_ZeroInt()),
- m_Value(MulIsNotZero)))) &&
- (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) &&
- Pred == CmpInst::ICMP_NE) {
- Value *A, *B;
- if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
- m_Value(A), m_Value(B)))) {
- Value *NotNullA = Builder.CreateIsNotNull(A);
- Value *NotNullB = Builder.CreateIsNotNull(B);
- return BinaryOperator::CreateAnd(NotNullA, NotNullB);
- }
- }
-
+ }
+ }
+
+ if (Instruction *V =
+ canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
+ return V;
+
+ CmpInst::Predicate Pred;
+ Value *Mul, *Ov, *MulIsNotZero, *UMulWithOv;
+ // Check if the OR weakens the overflow condition for umul.with.overflow by
+ // treating any non-zero result as overflow. In that case, we overflow if both
+ // umul.with.overflow operands are != 0, since in that case the result can
+ // only be 0 if the multiplication overflows.
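+ // Illustrative sketch (hypothetical i32 IR):
+ //   %m   = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+ //   %ov  = extractvalue { i32, i1 } %m, 1
+ //   %mul = extractvalue { i32, i1 } %m, 0
+ //   %nz  = icmp ne i32 %mul, 0
+ //   %r   = or i1 %ov, %nz
+ // becomes the 'and' of (icmp ne i32 %a, 0) and (icmp ne i32 %b, 0).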
+ if (match(&I,
+ m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
+ m_Value(Ov)),
+ m_CombineAnd(m_ICmp(Pred,
+ m_CombineAnd(m_ExtractValue<0>(
+ m_Deferred(UMulWithOv)),
+ m_Value(Mul)),
+ m_ZeroInt()),
+ m_Value(MulIsNotZero)))) &&
+ (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) &&
+ Pred == CmpInst::ICMP_NE) {
+ Value *A, *B;
+ if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
+ m_Value(A), m_Value(B)))) {
+ Value *NotNullA = Builder.CreateIsNotNull(A);
+ Value *NotNullB = Builder.CreateIsNotNull(B);
+ return BinaryOperator::CreateAnd(NotNullA, NotNullB);
+ }
+ }
+
// (~x) | y --> ~(x & (~y)) iff that gets rid of inversions
if (sinkNotIntoOtherHandOfAndOrOr(I))
return &I;
- return nullptr;
-}
-
-/// A ^ B can be specified using other logic ops in a variety of patterns. We
-/// can fold these early and efficiently by morphing an existing instruction.
-static Instruction *foldXorToXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.getOpcode() == Instruction::Xor);
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *A, *B;
-
- // There are 4 commuted variants for each of the basic patterns.
-
- // (A & B) ^ (A | B) -> A ^ B
- // (A & B) ^ (B | A) -> A ^ B
- // (A | B) ^ (A & B) -> A ^ B
- // (A | B) ^ (B & A) -> A ^ B
- if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)),
- m_c_Or(m_Deferred(A), m_Deferred(B)))))
- return BinaryOperator::CreateXor(A, B);
-
- // (A | ~B) ^ (~A | B) -> A ^ B
- // (~B | A) ^ (~A | B) -> A ^ B
- // (~A | B) ^ (A | ~B) -> A ^ B
- // (B | ~A) ^ (A | ~B) -> A ^ B
- if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))),
- m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
- return BinaryOperator::CreateXor(A, B);
-
- // (A & ~B) ^ (~A & B) -> A ^ B
- // (~B & A) ^ (~A & B) -> A ^ B
- // (~A & B) ^ (A & ~B) -> A ^ B
- // (B & ~A) ^ (A & ~B) -> A ^ B
- if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))),
- m_c_And(m_Not(m_Deferred(A)), m_Deferred(B)))))
- return BinaryOperator::CreateXor(A, B);
-
- // For the remaining cases we need to get rid of one of the operands.
- if (!Op0->hasOneUse() && !Op1->hasOneUse())
- return nullptr;
-
- // (A | B) ^ ~(A & B) -> ~(A ^ B)
- // (A | B) ^ ~(B & A) -> ~(A ^ B)
- // (A & B) ^ ~(A | B) -> ~(A ^ B)
- // (A & B) ^ ~(B | A) -> ~(A ^ B)
- // Complexity sorting ensures the not will be on the right side.
- if ((match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) ||
- (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))))
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
- return nullptr;
-}
-
+ return nullptr;
+}
+
+/// A ^ B can be specified using other logic ops in a variety of patterns. We
+/// can fold these early and efficiently by morphing an existing instruction.
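+// Quick check of the first pattern below, (A & B) ^ (A | B) -> A ^ B: per bit,
+// if A and B agree then (A & B) and (A | B) agree too and the xor gives 0; if
+// they differ then (A & B) = 0 and (A | B) = 1 and the xor gives 1, which is
+// exactly A ^ B.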
+static Instruction *foldXorToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Xor);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // There are 4 commuted variants for each of the basic patterns.
+
+ // (A & B) ^ (A | B) -> A ^ B
+ // (A & B) ^ (B | A) -> A ^ B
+ // (A | B) ^ (A & B) -> A ^ B
+ // (A | B) ^ (B & A) -> A ^ B
+ if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)),
+ m_c_Or(m_Deferred(A), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A | ~B) ^ (~A | B) -> A ^ B
+ // (~B | A) ^ (~A | B) -> A ^ B
+ // (~A | B) ^ (A | ~B) -> A ^ B
+ // (B | ~A) ^ (A | ~B) -> A ^ B
+ if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A & ~B) ^ (~A & B) -> A ^ B
+ // (~B & A) ^ (~A & B) -> A ^ B
+ // (~A & B) ^ (A & ~B) -> A ^ B
+ // (B & ~A) ^ (A & ~B) -> A ^ B
+ if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))),
+ m_c_And(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // For the remaining cases we need to get rid of one of the operands.
+ if (!Op0->hasOneUse() && !Op1->hasOneUse())
+ return nullptr;
+
+ // (A | B) ^ ~(A & B) -> ~(A ^ B)
+ // (A | B) ^ ~(B & A) -> ~(A ^ B)
+ // (A & B) ^ ~(A | B) -> ~(A ^ B)
+ // (A & B) ^ ~(B | A) -> ~(A ^ B)
+ // Complexity sorting ensures the not will be on the right side.
+ if ((match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) ||
+ (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
+
Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &I) {
- assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS &&
- I.getOperand(1) == RHS && "Should be 'xor' with these operands");
-
- if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
- if (LHS->getOperand(0) == RHS->getOperand(1) &&
- LHS->getOperand(1) == RHS->getOperand(0))
- LHS->swapOperands();
- if (LHS->getOperand(0) == RHS->getOperand(0) &&
- LHS->getOperand(1) == RHS->getOperand(1)) {
- // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
- Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
- unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
- bool IsSigned = LHS->isSigned() || RHS->isSigned();
- return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
- }
- }
-
- // TODO: This can be generalized to compares of non-signbits using
- // decomposeBitTestICmp(). It could be enhanced more by using (something like)
- // foldLogOpOfMaskedICmps().
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
- Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
- if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
- LHS0->getType() == RHS0->getType() &&
- LHS0->getType()->isIntOrIntVectorTy()) {
- // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
- // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
- if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
- PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
- (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
- PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) {
- Value *Zero = ConstantInt::getNullValue(LHS0->getType());
- return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero);
- }
- // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
- // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
- if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
- PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
- (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
- PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) {
- Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType());
- return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne);
- }
- }
-
- // Instead of trying to imitate the folds for and/or, decompose this 'xor'
- // into those logic ops. That is, try to turn this into an and-of-icmps
- // because we have many folds for that pattern.
- //
- // This is based on a truth table definition of xor:
- // X ^ Y --> (X | Y) & !(X & Y)
- if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
- // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y).
- // TODO: If OrICmp is false, the whole thing is false (InstSimplify?).
- if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
- // TODO: Independently handle cases where the 'and' side is a constant.
- ICmpInst *X = nullptr, *Y = nullptr;
- if (OrICmp == LHS && AndICmp == RHS) {
- // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y
- X = LHS;
- Y = RHS;
- }
- if (OrICmp == RHS && AndICmp == LHS) {
- // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X
- X = RHS;
- Y = LHS;
- }
- if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) {
- // Invert the predicate of 'Y', thus inverting its output.
- Y->setPredicate(Y->getInversePredicate());
- // So, are there other uses of Y?
- if (!Y->hasOneUse()) {
- // We need to adapt other uses of Y though. Get a value that matches
- // the original value of Y before inversion. While this increases
- // immediate instruction count, we have just ensured that all the
- // users are freely-invertible, so that 'not' *will* get folded away.
- BuilderTy::InsertPointGuard Guard(Builder);
- // Set insertion point to right after the Y.
- Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator()));
- Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
- // Replace all uses of Y (excluding the one in NotY!) with NotY.
- Worklist.pushUsersToWorkList(*Y);
- Y->replaceUsesWithIf(NotY,
- [NotY](Use &U) { return U.getUser() != NotY; });
- }
- // All done.
- return Builder.CreateAnd(LHS, RHS);
- }
- }
- }
-
- return nullptr;
-}
-
-/// If we have a masked merge, in the canonical form of:
-/// (assuming that A only has one use.)
-/// | A | |B|
-/// ((x ^ y) & M) ^ y
-/// | D |
-/// * If M is inverted:
-/// | D |
-/// ((x ^ y) & ~M) ^ y
-/// We can canonicalize by swapping the final xor operand
-/// to eliminate the 'not' of the mask.
-/// ((x ^ y) & M) ^ x
-/// * If M is a constant, and D has one use, we transform to 'and' / 'or' ops
-/// because that shortens the dependency chain and improves analysis:
-/// (x & M) | (y & ~M)
-static Instruction *visitMaskedMerge(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *B, *X, *D;
- Value *M;
- if (!match(&I, m_c_Xor(m_Value(B),
- m_OneUse(m_c_And(
- m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)),
- m_Value(D)),
- m_Value(M))))))
- return nullptr;
-
- Value *NotM;
- if (match(M, m_Not(m_Value(NotM)))) {
- // De-invert the mask and swap the value in B part.
- Value *NewA = Builder.CreateAnd(D, NotM);
- return BinaryOperator::CreateXor(NewA, X);
- }
-
- Constant *C;
- if (D->hasOneUse() && match(M, m_Constant(C))) {
- // Propagating undef is unsafe. Clamp undef elements to -1.
- Type *EltTy = C->getType()->getScalarType();
- C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
- // Unfold.
- Value *LHS = Builder.CreateAnd(X, C);
- Value *NotC = Builder.CreateNot(C);
- Value *RHS = Builder.CreateAnd(B, NotC);
- return BinaryOperator::CreateOr(LHS, RHS);
- }
-
- return nullptr;
-}
-
-// Transform
-// ~(x ^ y)
-// into:
-// (~x) ^ y
-// or into
-// x ^ (~y)
-static Instruction *sinkNotIntoXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *X, *Y;
- // FIXME: one-use check is not needed in general, but currently we are unable
- // to fold 'not' into 'icmp', if that 'icmp' has multiple uses. (D35182)
- if (!match(&I, m_Not(m_OneUse(m_Xor(m_Value(X), m_Value(Y))))))
- return nullptr;
-
- // We only want to do the transform if it is free to do.
+ assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS &&
+ I.getOperand(1) == RHS && "Should be 'xor' with these operands");
+
+ if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
+ bool IsSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
+ }
+ }
+
+ // TODO: This can be generalized to compares of non-signbits using
+ // decomposeBitTestICmp(). It could be enhanced more by using (something like)
+ // foldLogOpOfMaskedICmps().
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
+ Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
+ if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
+ LHS0->getType() == RHS0->getType() &&
+ LHS0->getType()->isIntOrIntVectorTy()) {
+ // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
+ // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
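+ // Rough reasoning: each compare tests the sign bit of its operand, and the
+ // xor of the two tests is true exactly when the sign bits differ, i.e. when
+ // the sign bit of X ^ Y is set.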
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) {
+ Value *Zero = ConstantInt::getNullValue(LHS0->getType());
+ return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero);
+ }
+ // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
+ // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) {
+ Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType());
+ return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne);
+ }
+ }
+
+ // Instead of trying to imitate the folds for and/or, decompose this 'xor'
+ // into those logic ops. That is, try to turn this into an and-of-icmps
+ // because we have many folds for that pattern.
+ //
+ // This is based on a truth table definition of xor:
+ // X ^ Y --> (X | Y) & !(X & Y)
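+ // Quick check: for booleans, X ^ Y is true exactly when X and Y differ, i.e.
+ // when at least one is true (X | Y) and not both are true (!(X & Y)).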
+ if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
+ // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y).
+ // TODO: If OrICmp is false, the whole thing is false (InstSimplify?).
+ if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
+ // TODO: Independently handle cases where the 'and' side is a constant.
+ ICmpInst *X = nullptr, *Y = nullptr;
+ if (OrICmp == LHS && AndICmp == RHS) {
+ // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y
+ X = LHS;
+ Y = RHS;
+ }
+ if (OrICmp == RHS && AndICmp == LHS) {
+ // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X
+ X = RHS;
+ Y = LHS;
+ }
+ if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) {
+ // Invert the predicate of 'Y', thus inverting its output.
+ Y->setPredicate(Y->getInversePredicate());
+ // So, are there other uses of Y?
+ if (!Y->hasOneUse()) {
+ // We need to adapt other uses of Y though. Get a value that matches
+ // the original value of Y before inversion. While this increases
+ // immediate instruction count, we have just ensured that all the
+ // users are freely-invertible, so that 'not' *will* get folded away.
+ BuilderTy::InsertPointGuard Guard(Builder);
+ // Set insertion point to right after the Y.
+ Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator()));
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ // Replace all uses of Y (excluding the one in NotY!) with NotY.
+ Worklist.pushUsersToWorkList(*Y);
+ Y->replaceUsesWithIf(NotY,
+ [NotY](Use &U) { return U.getUser() != NotY; });
+ }
+ // All done.
+ return Builder.CreateAnd(LHS, RHS);
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// If we have a masked merge, in the canonical form of:
+/// (assuming that A only has one use.)
+///   |     A     |  |B|
+///   ((x ^ y) & M) ^ y
+///    |  D  |
+/// * If M is inverted:
+///    |  D  |
+///   ((x ^ y) & ~M) ^ y
+/// We can canonicalize by swapping the final xor operand
+/// to eliminate the 'not' of the mask.
+/// ((x ^ y) & M) ^ x
+/// * If M is a constant, and D has one use, we transform to 'and' / 'or' ops
+/// because that shortens the dependency chain and improves analysis:
+/// (x & M) | (y & ~M)
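+/// As an illustration (hypothetical i8 values with a constant mask M = 15),
+/// the constant-mask case rewrites, roughly:
+///   %d = xor i8 %x, %y
+///   %a = and i8 %d, 15
+///   %r = xor i8 %a, %y      ; low nibble from %x, high nibble from %y
+/// into:
+///   %lo = and i8 %x, 15
+///   %hi = and i8 %y, -16
+///   %r  = or i8 %lo, %hi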
+static Instruction *visitMaskedMerge(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *B, *X, *D;
+ Value *M;
+ if (!match(&I, m_c_Xor(m_Value(B),
+ m_OneUse(m_c_And(
+ m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)),
+ m_Value(D)),
+ m_Value(M))))))
+ return nullptr;
+
+ Value *NotM;
+ if (match(M, m_Not(m_Value(NotM)))) {
+ // De-invert the mask and swap the value in B part.
+ Value *NewA = Builder.CreateAnd(D, NotM);
+ return BinaryOperator::CreateXor(NewA, X);
+ }
+
+ Constant *C;
+ if (D->hasOneUse() && match(M, m_Constant(C))) {
+ // Propagating undef is unsafe. Clamp undef elements to -1.
+ Type *EltTy = C->getType()->getScalarType();
+ C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
+ // Unfold.
+ Value *LHS = Builder.CreateAnd(X, C);
+ Value *NotC = Builder.CreateNot(C);
+ Value *RHS = Builder.CreateAnd(B, NotC);
+ return BinaryOperator::CreateOr(LHS, RHS);
+ }
+
+ return nullptr;
+}
+
+// Transform
+// ~(x ^ y)
+// into:
+// (~x) ^ y
+// or into
+// x ^ (~y)
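+// For illustration (hypothetical values): if x is itself an inverted value,
+// say %x = xor i32 %a, -1, then ~(%x ^ %y) is rewritten as (~%x) ^ %y, which
+// subsequently simplifies to a single xor i32 %a, %y.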
+static Instruction *sinkNotIntoXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X, *Y;
+ // FIXME: one-use check is not needed in general, but currently we are unable
+ // to fold 'not' into 'icmp', if that 'icmp' has multiple uses. (D35182)
+ if (!match(&I, m_Not(m_OneUse(m_Xor(m_Value(X), m_Value(Y))))))
+ return nullptr;
+
+ // We only want to do the transform if it is free to do.
if (InstCombiner::isFreeToInvert(X, X->hasOneUse())) {
- // Ok, good.
+ // Ok, good.
} else if (InstCombiner::isFreeToInvert(Y, Y->hasOneUse())) {
- std::swap(X, Y);
- } else
- return nullptr;
-
- Value *NotX = Builder.CreateNot(X, X->getName() + ".not");
- return BinaryOperator::CreateXor(NotX, Y, I.getName() + ".demorgan");
-}
-
+ std::swap(X, Y);
+ } else
+ return nullptr;
+
+ Value *NotX = Builder.CreateNot(X, X->getName() + ".not");
+ return BinaryOperator::CreateXor(NotX, Y, I.getName() + ".demorgan");
+}
+
// Transform
// z = (~x) &/| y
// into:
@@ -3139,118 +3139,118 @@ bool InstCombinerImpl::sinkNotIntoOtherHandOfAndOrOr(BinaryOperator &I) {
return true;
}
-// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
-// here. We should standardize that construct where it is needed or choose some
-// other way to ensure that commutated variants of patterns are not missed.
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
- if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *NewXor = foldXorToXor(I, Builder))
- return NewXor;
-
- // (A&B)^(A&C) -> A&(B^C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- if (Value *V = SimplifyBSwap(I, Builder))
- return replaceInstUsesWith(I, V);
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *NewXor = foldXorToXor(I, Builder))
+ return NewXor;
+
+ // (A&B)^(A&C) -> A&(B^C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
-
- // Fold (X & M) ^ (Y & ~M) -> (X & M) | (Y & ~M)
- // This is a special case in haveNoCommonBitsSet, but the computeKnownBits
- // calls in there are unnecessary as SimplifyDemandedInstructionBits should
- // have already taken care of those cases.
- Value *M;
- if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(M)), m_Value()),
- m_c_And(m_Deferred(M), m_Value()))))
- return BinaryOperator::CreateOr(Op0, Op1);
-
- // Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
- Value *X, *Y;
-
- // We must eliminate the and/or (one-use) for these transforms to not increase
- // the instruction count.
- // ~(~X & Y) --> (X | ~Y)
- // ~(Y & ~X) --> (X | ~Y)
- if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) {
- Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
- return BinaryOperator::CreateOr(X, NotY);
- }
- // ~(~X | Y) --> (X & ~Y)
- // ~(Y | ~X) --> (X & ~Y)
- if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) {
- Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
- return BinaryOperator::CreateAnd(X, NotY);
- }
-
- if (Instruction *Xor = visitMaskedMerge(I, Builder))
- return Xor;
-
- // Is this a 'not' (~) fed by a binary operator?
- BinaryOperator *NotVal;
- if (match(&I, m_Not(m_BinOp(NotVal)))) {
- if (NotVal->getOpcode() == Instruction::And ||
- NotVal->getOpcode() == Instruction::Or) {
- // Apply DeMorgan's Law when inverts are free:
- // ~(X & Y) --> (~X | ~Y)
- // ~(X | Y) --> (~X & ~Y)
- if (isFreeToInvert(NotVal->getOperand(0),
- NotVal->getOperand(0)->hasOneUse()) &&
- isFreeToInvert(NotVal->getOperand(1),
- NotVal->getOperand(1)->hasOneUse())) {
- Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs");
- Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs");
- if (NotVal->getOpcode() == Instruction::And)
- return BinaryOperator::CreateOr(NotX, NotY);
- return BinaryOperator::CreateAnd(NotX, NotY);
- }
- }
-
- // ~(~X >>s Y) --> (X >>s Y)
- if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
- return BinaryOperator::CreateAShr(X, Y);
-
- // If we are inverting a right-shifted constant, we may be able to eliminate
- // the 'not' by inverting the constant and using the opposite shift type.
- // Canonicalization rules ensure that only a negative constant uses 'ashr',
- // but we must check that in case that transform has not fired yet.
-
- // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
- Constant *C;
- if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) &&
- match(C, m_Negative())) {
- // We matched a negative constant, so propagating undef is unsafe.
- // Clamp undef elements to -1.
+
+ // Fold (X & M) ^ (Y & ~M) -> (X & M) | (Y & ~M)
+ // This is a special case in haveNoCommonBitsSet, but the computeKnownBits
+ // calls in there are unnecessary as SimplifyDemandedInstructionBits should
+ // have already taken care of those cases.
+ Value *M;
+ if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(M)), m_Value()),
+ m_c_And(m_Deferred(M), m_Value()))))
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
+ Value *X, *Y;
+
+ // We must eliminate the and/or (one-use) for these transforms to not increase
+ // the instruction count.
+ // ~(~X & Y) --> (X | ~Y)
+ // ~(Y & ~X) --> (X | ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateOr(X, NotY);
+ }
+ // ~(~X | Y) --> (X & ~Y)
+ // ~(Y | ~X) --> (X & ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateAnd(X, NotY);
+ }
+
+ if (Instruction *Xor = visitMaskedMerge(I, Builder))
+ return Xor;
+
+ // Is this a 'not' (~) fed by a binary operator?
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
+ // Apply DeMorgan's Law when inverts are free:
+ // ~(X & Y) --> (~X | ~Y)
+ // ~(X | Y) --> (~X & ~Y)
+ if (isFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ isFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
+ return BinaryOperator::CreateOr(NotX, NotY);
+ return BinaryOperator::CreateAnd(NotX, NotY);
+ }
+ }
+
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
+
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *C;
+ if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_Negative())) {
+ // We matched a negative constant, so propagating undef is unsafe.
+ // Clamp undef elements to -1.
Type *EltTy = Ty->getScalarType();
- C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
- return BinaryOperator::CreateLShr(ConstantExpr::getNot(C), Y);
- }
-
- // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
- if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) &&
- match(C, m_NonNegative())) {
- // We matched a non-negative constant, so propagating undef is unsafe.
- // Clamp undef elements to 0.
+ C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
+ return BinaryOperator::CreateLShr(ConstantExpr::getNot(C), Y);
+ }
+
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_NonNegative())) {
+ // We matched a non-negative constant, so propagating undef is unsafe.
+ // Clamp undef elements to 0.
Type *EltTy = Ty->getScalarType();
- C = Constant::replaceUndefsWith(C, ConstantInt::getNullValue(EltTy));
- return BinaryOperator::CreateAShr(ConstantExpr::getNot(C), Y);
- }
-
+ C = Constant::replaceUndefsWith(C, ConstantInt::getNullValue(EltTy));
+ return BinaryOperator::CreateAShr(ConstantExpr::getNot(C), Y);
+ }
+
// ~(X + C) --> ~C - X
if (match(NotVal, m_c_Add(m_Value(X), m_ImmConstant(C))))
return BinaryOperator::CreateSub(ConstantExpr::getNot(C), X);
@@ -3265,46 +3265,46 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
if (match(NotVal, m_c_Add(m_Not(m_Value(X)), m_Value(Y))))
return BinaryOperator::CreateWithCopiedFlags(Instruction::Sub, X, Y,
NotVal);
- }
-
- // Use DeMorgan and reassociation to eliminate a 'not' op.
- Constant *C1;
- if (match(Op1, m_Constant(C1))) {
- Constant *C2;
- if (match(Op0, m_OneUse(m_Or(m_Not(m_Value(X)), m_Constant(C2))))) {
- // (~X | C2) ^ C1 --> ((X & ~C2) ^ -1) ^ C1 --> (X & ~C2) ^ ~C1
- Value *And = Builder.CreateAnd(X, ConstantExpr::getNot(C2));
- return BinaryOperator::CreateXor(And, ConstantExpr::getNot(C1));
- }
- if (match(Op0, m_OneUse(m_And(m_Not(m_Value(X)), m_Constant(C2))))) {
- // (~X & C2) ^ C1 --> ((X | ~C2) ^ -1) ^ C1 --> (X | ~C2) ^ ~C1
- Value *Or = Builder.CreateOr(X, ConstantExpr::getNot(C2));
- return BinaryOperator::CreateXor(Or, ConstantExpr::getNot(C1));
- }
- }
-
- // not (cmp A, B) = !cmp A, B
- CmpInst::Predicate Pred;
- if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
- cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
- return replaceInstUsesWith(I, Op0);
- }
-
- {
- const APInt *RHSC;
- if (match(Op1, m_APInt(RHSC))) {
- Value *X;
- const APInt *C;
+ }
+
+ // Use DeMorgan and reassociation to eliminate a 'not' op.
+ Constant *C1;
+ if (match(Op1, m_Constant(C1))) {
+ Constant *C2;
+ if (match(Op0, m_OneUse(m_Or(m_Not(m_Value(X)), m_Constant(C2))))) {
+ // (~X | C2) ^ C1 --> ((X & ~C2) ^ -1) ^ C1 --> (X & ~C2) ^ ~C1
+ Value *And = Builder.CreateAnd(X, ConstantExpr::getNot(C2));
+ return BinaryOperator::CreateXor(And, ConstantExpr::getNot(C1));
+ }
+ if (match(Op0, m_OneUse(m_And(m_Not(m_Value(X)), m_Constant(C2))))) {
+ // (~X & C2) ^ C1 --> ((X | ~C2) ^ -1) ^ C1 --> (X | ~C2) ^ ~C1
+ Value *Or = Builder.CreateOr(X, ConstantExpr::getNot(C2));
+ return BinaryOperator::CreateXor(Or, ConstantExpr::getNot(C1));
+ }
+ }
+
+ // not (cmp A, B) = !cmp A, B
+ CmpInst::Predicate Pred;
+ if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
+ cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
+ return replaceInstUsesWith(I, Op0);
+ }
+
+ {
+ const APInt *RHSC;
+ if (match(Op1, m_APInt(RHSC))) {
+ Value *X;
+ const APInt *C;
// (C - X) ^ signmaskC --> (C + signmaskC) - X
if (RHSC->isSignMask() && match(Op0, m_Sub(m_APInt(C), m_Value(X))))
return BinaryOperator::CreateSub(ConstantInt::get(Ty, *C + *RHSC), X);
-
+
// (X + C) ^ signmaskC --> X + (C + signmaskC)
if (RHSC->isSignMask() && match(Op0, m_Add(m_Value(X), m_APInt(C))))
return BinaryOperator::CreateAdd(X, ConstantInt::get(Ty, *C + *RHSC));
// (X | C) ^ RHSC --> X ^ (C ^ RHSC) iff X & C == 0
- if (match(Op0, m_Or(m_Value(X), m_APInt(C))) &&
+ if (match(Op0, m_Or(m_Value(X), m_APInt(C))) &&
MaskedValueIsZero(X, *C, 0, &I))
return BinaryOperator::CreateXor(X, ConstantInt::get(Ty, *C ^ *RHSC));
@@ -3315,7 +3315,7 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
*RHSC == APInt::getAllOnesValue(Ty->getScalarSizeInBits()).shl(*C)) {
Value *NotX = Builder.CreateNot(X);
return BinaryOperator::CreateShl(NotX, ConstantInt::get(Ty, *C));
- }
+ }
// (X >>u C) ^ RHSC --> ~X >>u C
if (match(Op0, m_OneUse(m_LShr(m_Value(X), m_APInt(C)))) &&
*RHSC == APInt::getAllOnesValue(Ty->getScalarSizeInBits()).lshr(*C)) {
@@ -3325,9 +3325,9 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
// TODO: We could handle 'ashr' here as well. That would be matching
// a 'not' op and moving it before the shift. Doing that requires
// preventing the inverse fold in canShiftBinOpWithConstantRHS().
- }
- }
-
+ }
+ }
+
// FIXME: This should not be limited to scalar (pull into APInt match above).
{
Value *X;
@@ -3345,62 +3345,62 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
Opnd0->takeName(cast<Instruction>(Op0));
Opnd0->setDebugLoc(I.getDebugLoc());
return BinaryOperator::CreateXor(Opnd0, ConstantInt::get(Ty, FoldConst));
- }
- }
-
- if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
- return FoldedLogic;
-
- // Y ^ (X | Y) --> X & ~Y
- // Y ^ (Y | X) --> X & ~Y
- if (match(Op1, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op0)))))
- return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op0));
- // (X | Y) ^ Y --> X & ~Y
- // (Y | X) ^ Y --> X & ~Y
- if (match(Op0, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op1)))))
- return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op1));
-
- // Y ^ (X & Y) --> ~X & Y
- // Y ^ (Y & X) --> ~X & Y
- if (match(Op1, m_OneUse(m_c_And(m_Value(X), m_Specific(Op0)))))
- return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(X));
- // (X & Y) ^ Y --> ~X & Y
- // (Y & X) ^ Y --> ~X & Y
- // Canonical form is (X & C) ^ C; don't touch that.
- // TODO: A 'not' op is better for analysis and codegen, but demanded bits must
- // be fixed to prefer that (otherwise we get infinite looping).
- if (!match(Op1, m_Constant()) &&
- match(Op0, m_OneUse(m_c_And(m_Value(X), m_Specific(Op1)))))
- return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(X));
-
- Value *A, *B, *C;
- // (A ^ B) ^ (A | C) --> (~A & C) ^ B -- There are 4 commuted variants.
- if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
- m_OneUse(m_c_Or(m_Deferred(A), m_Value(C))))))
- return BinaryOperator::CreateXor(
- Builder.CreateAnd(Builder.CreateNot(A), C), B);
-
- // (A ^ B) ^ (B | C) --> (~B & C) ^ A -- There are 4 commuted variants.
- if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
- m_OneUse(m_c_Or(m_Deferred(B), m_Value(C))))))
- return BinaryOperator::CreateXor(
- Builder.CreateAnd(Builder.CreateNot(B), C), A);
-
- // (A & B) ^ (A ^ B) -> (A | B)
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateOr(A, B);
- // (A ^ B) ^ (A & B) -> (A | B)
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateOr(A, B);
-
- // (A & ~B) ^ ~A -> ~(A & B)
- // (~B & A) ^ ~A -> ~(A & B)
- if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_Not(m_Specific(A))))
- return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
-
+ }
+ }
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
+
+ // Y ^ (X | Y) --> X & ~Y
+ // Y ^ (Y | X) --> X & ~Y
+ if (match(Op1, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op0)))))
+ return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op0));
+ // (X | Y) ^ Y --> X & ~Y
+ // (Y | X) ^ Y --> X & ~Y
+ if (match(Op0, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op1));
+
+ // Y ^ (X & Y) --> ~X & Y
+ // Y ^ (Y & X) --> ~X & Y
+ if (match(Op1, m_OneUse(m_c_And(m_Value(X), m_Specific(Op0)))))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(X));
+ // (X & Y) ^ Y --> ~X & Y
+ // (Y & X) ^ Y --> ~X & Y
+ // Canonical form is (X & C) ^ C; don't touch that.
+ // TODO: A 'not' op is better for analysis and codegen, but demanded bits must
+ // be fixed to prefer that (otherwise we get infinite looping).
+ if (!match(Op1, m_Constant()) &&
+ match(Op0, m_OneUse(m_c_And(m_Value(X), m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(X));
+
+ Value *A, *B, *C;
+ // (A ^ B) ^ (A | C) --> (~A & C) ^ B -- There are 4 commuted variants.
+ if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
+ m_OneUse(m_c_Or(m_Deferred(A), m_Value(C))))))
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(A), C), B);
+
+ // (A ^ B) ^ (B | C) --> (~B & C) ^ A -- There are 4 commuted variants.
+ if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
+ m_OneUse(m_c_Or(m_Deferred(B), m_Value(C))))))
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(B), C), A);
+
+ // (A & B) ^ (A ^ B) -> (A | B)
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+ // (A ^ B) ^ (A & B) -> (A | B)
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+
+ // (A & ~B) ^ ~A -> ~(A & B)
+ // (~B & A) ^ ~A -> ~(A & B)
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_Not(m_Specific(A))))
+ return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
+
// (~A & B) ^ A --> A | B -- There are 4 commuted variants.
if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(A)), m_Value(B)), m_Deferred(A))))
return BinaryOperator::CreateOr(A, B);
@@ -3420,90 +3420,90 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
}
}
- if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
- if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
- if (Value *V = foldXorOfICmps(LHS, RHS, I))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
- return CastedXor;
-
- // Canonicalize a shifty way to code absolute value to the common pattern.
- // There are 4 potential commuted variants. Move the 'ashr' candidate to Op1.
- // We're relying on the fact that we only do this transform when the shift has
- // exactly 2 uses and the add has exactly 1 use (otherwise, we might increase
- // instructions).
- if (Op0->hasNUses(2))
- std::swap(Op0, Op1);
-
- const APInt *ShAmt;
- if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
- Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
- match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) {
- // B = ashr i32 A, 31 ; smear the sign bit
- // xor (add A, B), B ; add -1 and flip bits if negative
- // --> (A < 0) ? -A : A
- Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
- // Copy the nuw/nsw flags from the add to the negate.
- auto *Add = cast<BinaryOperator>(Op0);
- Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
- Add->hasNoSignedWrap());
- return SelectInst::Create(Cmp, Neg, A);
- }
-
- // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
- //
- // %notx = xor i32 %x, -1
- // %cmp1 = icmp sgt i32 %notx, %y
- // %smax = select i1 %cmp1, i32 %notx, i32 %y
- // %res = xor i32 %smax, -1
- // =>
- // %noty = xor i32 %y, -1
- // %cmp2 = icmp slt %x, %noty
- // %res = select i1 %cmp2, i32 %x, i32 %noty
- //
- // The same applies to smin/umax/umin.
- if (match(Op1, m_AllOnes()) && Op0->hasOneUse()) {
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor;
- if (SelectPatternResult::isMinOrMax(SPF)) {
- // It's possible we get here before the not has been simplified, so make
- // sure the input to the not isn't freely invertible.
- if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) {
- Value *NotY = Builder.CreateNot(RHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
- }
-
- // It's possible we get here before the not has been simplified, so make
- // sure the input to the not isn't freely invertible.
- if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) {
- Value *NotX = Builder.CreateNot(LHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y);
- }
-
- // If both sides are freely invertible, then we can get rid of the xor
- // completely.
- if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
- isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) {
- Value *NotLHS = Builder.CreateNot(LHS);
- Value *NotRHS = Builder.CreateNot(RHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS),
- NotLHS, NotRHS);
- }
- }
-
+ if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+ if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+ if (Value *V = foldXorOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
+ return CastedXor;
+
+ // Canonicalize a shifty way to code absolute value to the common pattern.
+ // There are 4 potential commuted variants. Move the 'ashr' candidate to Op1.
+ // We're relying on the fact that we only do this transform when the shift has
+ // exactly 2 uses and the add has exactly 1 use (otherwise, we might increase
+ // instructions).
+ if (Op0->hasNUses(2))
+ std::swap(Op0, Op1);
+
+ const APInt *ShAmt;
+ if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) {
+ // B = ashr i32 A, 31 ; smear the sign bit
+ // xor (add A, B), B ; add -1 and flip bits if negative
+ // --> (A < 0) ? -A : A
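+    // For illustration with a hypothetical A = -5 (i32): B == -1, A + B == -6,
+    // and -6 ^ -1 == 5 == |A|; for A >= 0, B == 0 and A is left unchanged.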
+ Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
+ // Copy the nuw/nsw flags from the add to the negate.
+ auto *Add = cast<BinaryOperator>(Op0);
+ Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
+ Add->hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
+
+ // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
+ //
+ // %notx = xor i32 %x, -1
+ // %cmp1 = icmp sgt i32 %notx, %y
+ // %smax = select i1 %cmp1, i32 %notx, i32 %y
+ // %res = xor i32 %smax, -1
+ // =>
+ // %noty = xor i32 %y, -1
+ // %cmp2 = icmp slt %x, %noty
+ // %res = select i1 %cmp2, i32 %x, i32 %noty
+ //
+ // The same applies to smin/umax/umin.
+ if (match(Op1, m_AllOnes()) && Op0->hasOneUse()) {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor;
+ if (SelectPatternResult::isMinOrMax(SPF)) {
+ // It's possible we get here before the not has been simplified, so make
+ // sure the input to the not isn't freely invertible.
+ if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) {
+ Value *NotY = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
+ }
+
+ // It's possible we get here before the not has been simplified, so make
+ // sure the input to the not isn't freely invertible.
+ if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) {
+ Value *NotX = Builder.CreateNot(LHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y);
+ }
+
+ // If both sides are freely invertible, then we can get rid of the xor
+ // completely.
+ if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
+ isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) {
+ Value *NotLHS = Builder.CreateNot(LHS);
+ Value *NotRHS = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS),
+ NotLHS, NotRHS);
+ }
+ }
+
// Pull 'not' into operands of select if both operands are one-use compares
 // or one is a one-use compare and the other is a constant.
- // Inverting the predicates eliminates the 'not' operation.
- // Example:
+ // Inverting the predicates eliminates the 'not' operation.
+ // Example:
 // not (select ?, (cmp TPred, ?, ?), (cmp FPred, ?, ?)) -->
- // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?)
+ // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?)
 // not (select ?, (cmp TPred, ?, ?), true) -->
// select ?, (cmp InvTPred, ?, ?), false
- if (auto *Sel = dyn_cast<SelectInst>(Op0)) {
+ if (auto *Sel = dyn_cast<SelectInst>(Op0)) {
Value *TV = Sel->getTrueValue();
Value *FV = Sel->getFalseValue();
auto *CmpT = dyn_cast<CmpInst>(TV);
@@ -3519,14 +3519,14 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
CmpF->setPredicate(CmpF->getInversePredicate());
else
Sel->setFalseValue(ConstantExpr::getNot(cast<Constant>(FV)));
- return replaceInstUsesWith(I, Sel);
- }
- }
- }
-
- if (Instruction *NewXor = sinkNotIntoXor(I, Builder))
- return NewXor;
-
+ return replaceInstUsesWith(I, Sel);
+ }
+ }
+ }
+
+ if (Instruction *NewXor = sinkNotIntoXor(I, Builder))
+ return NewXor;
+
// Otherwise, if all else failed, try to hoist the xor-by-constant:
// (X ^ C) ^ Y --> (X ^ Y) ^ C
// Just like we do in other places, we completely avoid the fold
@@ -3537,5 +3537,5 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
m_Value(Y))))
return BinaryOperator::CreateXor(Builder.CreateXor(X, Y), C1);
- return nullptr;
-}
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
index e9115e2eae..495493aab4 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
@@ -1,159 +1,159 @@
-//===- InstCombineAtomicRMW.cpp -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for atomic rmw instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/IR/Instructions.h"
+//===- InstCombineAtomicRMW.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for atomic rmw instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-
-namespace {
-/// Return true if and only if the given instruction does not modify the memory
-/// location referenced. Note that an idempotent atomicrmw may still have
-/// ordering effects on nearby instructions, or be volatile.
-/// TODO: Common w/ the version in AtomicExpandPass, and change the term used.
-/// Idempotent is confusing in this context.
-bool isIdempotentRMW(AtomicRMWInst& RMWI) {
- if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
- switch(RMWI.getOperation()) {
- case AtomicRMWInst::FAdd: // -0.0
- return CF->isZero() && CF->isNegative();
- case AtomicRMWInst::FSub: // +0.0
- return CF->isZero() && !CF->isNegative();
- default:
- return false;
- };
-
- auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
- if(!C)
- return false;
-
- switch(RMWI.getOperation()) {
- case AtomicRMWInst::Add:
- case AtomicRMWInst::Sub:
- case AtomicRMWInst::Or:
- case AtomicRMWInst::Xor:
- return C->isZero();
- case AtomicRMWInst::And:
- return C->isMinusOne();
- case AtomicRMWInst::Min:
- return C->isMaxValue(true);
- case AtomicRMWInst::Max:
- return C->isMinValue(true);
- case AtomicRMWInst::UMin:
- return C->isMaxValue(false);
- case AtomicRMWInst::UMax:
- return C->isMinValue(false);
- default:
- return false;
- }
-}
-
-/// Return true if the given instruction always produces a value in memory
-/// equivalent to its value operand.
-bool isSaturating(AtomicRMWInst& RMWI) {
- if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
- switch(RMWI.getOperation()) {
- case AtomicRMWInst::FAdd:
- case AtomicRMWInst::FSub:
- return CF->isNaN();
- default:
- return false;
- };
-
- auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
- if(!C)
- return false;
-
- switch(RMWI.getOperation()) {
- default:
- return false;
- case AtomicRMWInst::Xchg:
- return true;
- case AtomicRMWInst::Or:
- return C->isAllOnesValue();
- case AtomicRMWInst::And:
- return C->isZero();
- case AtomicRMWInst::Min:
- return C->isMinValue(true);
- case AtomicRMWInst::Max:
- return C->isMaxValue(true);
- case AtomicRMWInst::UMin:
- return C->isMinValue(false);
- case AtomicRMWInst::UMax:
- return C->isMaxValue(false);
- };
-}
+
+using namespace llvm;
+
+namespace {
+/// Return true if and only if the given instruction does not modify the memory
+/// location referenced. Note that an idempotent atomicrmw may still have
+/// ordering effects on nearby instructions, or be volatile.
+/// TODO: Common w/ the version in AtomicExpandPass, and change the term used.
+/// Idempotent is confusing in this context.
+bool isIdempotentRMW(AtomicRMWInst& RMWI) {
+ if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
+ switch(RMWI.getOperation()) {
+ case AtomicRMWInst::FAdd: // -0.0
+ return CF->isZero() && CF->isNegative();
+ case AtomicRMWInst::FSub: // +0.0
+ return CF->isZero() && !CF->isNegative();
+ default:
+ return false;
+ };
+
+ auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
+ if(!C)
+ return false;
+
+ switch(RMWI.getOperation()) {
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ return C->isZero();
+ case AtomicRMWInst::And:
+ return C->isMinusOne();
+ case AtomicRMWInst::Min:
+ return C->isMaxValue(true);
+ case AtomicRMWInst::Max:
+ return C->isMinValue(true);
+ case AtomicRMWInst::UMin:
+ return C->isMaxValue(false);
+ case AtomicRMWInst::UMax:
+ return C->isMinValue(false);
+ default:
+ return false;
+ }
+}
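+// For illustration (hypothetical pointer %p): "atomicrmw add i32* %p, i32 0
+// monotonic" and "atomicrmw and i32* %p, i32 -1 monotonic" both leave the
+// stored value unchanged, so they are treated as idempotent here (modulo
+// their ordering effects).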
+
+/// Return true if the given instruction always produces a value in memory
+/// equivalent to its value operand.
+bool isSaturating(AtomicRMWInst& RMWI) {
+ if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
+ switch(RMWI.getOperation()) {
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
+ return CF->isNaN();
+ default:
+ return false;
+ };
+
+ auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
+ if(!C)
+ return false;
+
+ switch(RMWI.getOperation()) {
+ default:
+ return false;
+ case AtomicRMWInst::Xchg:
+ return true;
+ case AtomicRMWInst::Or:
+ return C->isAllOnesValue();
+ case AtomicRMWInst::And:
+ return C->isZero();
+ case AtomicRMWInst::Min:
+ return C->isMinValue(true);
+ case AtomicRMWInst::Max:
+ return C->isMaxValue(true);
+ case AtomicRMWInst::UMin:
+ return C->isMinValue(false);
+ case AtomicRMWInst::UMax:
+ return C->isMaxValue(false);
+ };
+}
} // namespace
-
+
Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
-
- // Volatile RMWs perform a load and a store, so we cannot replace this by just a
- // load or just a store. We chose not to canonicalize out of general paranoia
+
+ // Volatile RMWs perform a load and a store, so we cannot replace this by just a
+ // load or just a store. We chose not to canonicalize out of general paranoia
// about user expectations around volatile.
- if (RMWI.isVolatile())
- return nullptr;
-
- // Any atomicrmw op which produces a known result in memory can be
- // replaced w/an atomicrmw xchg.
- if (isSaturating(RMWI) &&
- RMWI.getOperation() != AtomicRMWInst::Xchg) {
- RMWI.setOperation(AtomicRMWInst::Xchg);
- return &RMWI;
- }
-
- AtomicOrdering Ordering = RMWI.getOrdering();
- assert(Ordering != AtomicOrdering::NotAtomic &&
- Ordering != AtomicOrdering::Unordered &&
- "AtomicRMWs don't make sense with Unordered or NotAtomic");
-
- // Any atomicrmw xchg with no uses can be converted to an atomic store if the
+ if (RMWI.isVolatile())
+ return nullptr;
+
+ // Any atomicrmw op which produces a known result in memory can be
+ // replaced w/an atomicrmw xchg.
+ if (isSaturating(RMWI) &&
+ RMWI.getOperation() != AtomicRMWInst::Xchg) {
+ RMWI.setOperation(AtomicRMWInst::Xchg);
+ return &RMWI;
+ }
+
+ AtomicOrdering Ordering = RMWI.getOrdering();
+ assert(Ordering != AtomicOrdering::NotAtomic &&
+ Ordering != AtomicOrdering::Unordered &&
+ "AtomicRMWs don't make sense with Unordered or NotAtomic");
+
+ // Any atomicrmw xchg with no uses can be converted to an atomic store if the
// ordering is compatible.
- if (RMWI.getOperation() == AtomicRMWInst::Xchg &&
- RMWI.use_empty()) {
- if (Ordering != AtomicOrdering::Release &&
- Ordering != AtomicOrdering::Monotonic)
- return nullptr;
- auto *SI = new StoreInst(RMWI.getValOperand(),
- RMWI.getPointerOperand(), &RMWI);
- SI->setAtomic(Ordering, RMWI.getSyncScopeID());
- SI->setAlignment(DL.getABITypeAlign(RMWI.getType()));
- return eraseInstFromFunction(RMWI);
- }
-
- if (!isIdempotentRMW(RMWI))
- return nullptr;
-
- // We chose to canonicalize all idempotent operations to a single
- // operation code and constant. This makes it easier for the rest of the
- // optimizer to match. The choices of or w/0 and fadd w/-0.0 are
+ if (RMWI.getOperation() == AtomicRMWInst::Xchg &&
+ RMWI.use_empty()) {
+ if (Ordering != AtomicOrdering::Release &&
+ Ordering != AtomicOrdering::Monotonic)
+ return nullptr;
+ auto *SI = new StoreInst(RMWI.getValOperand(),
+ RMWI.getPointerOperand(), &RMWI);
+ SI->setAtomic(Ordering, RMWI.getSyncScopeID());
+ SI->setAlignment(DL.getABITypeAlign(RMWI.getType()));
+ return eraseInstFromFunction(RMWI);
+ }
+
+ if (!isIdempotentRMW(RMWI))
+ return nullptr;
+
+ // We chose to canonicalize all idempotent operations to a single
+ // operation code and constant. This makes it easier for the rest of the
+ // optimizer to match. The choices of or w/0 and fadd w/-0.0 are
// arbitrary.
- if (RMWI.getType()->isIntegerTy() &&
- RMWI.getOperation() != AtomicRMWInst::Or) {
- RMWI.setOperation(AtomicRMWInst::Or);
- return replaceOperand(RMWI, 1, ConstantInt::get(RMWI.getType(), 0));
- } else if (RMWI.getType()->isFloatingPointTy() &&
- RMWI.getOperation() != AtomicRMWInst::FAdd) {
- RMWI.setOperation(AtomicRMWInst::FAdd);
- return replaceOperand(RMWI, 1, ConstantFP::getNegativeZero(RMWI.getType()));
- }
-
- // Check if the required ordering is compatible with an atomic load.
- if (Ordering != AtomicOrdering::Acquire &&
- Ordering != AtomicOrdering::Monotonic)
- return nullptr;
-
- LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "",
- false, DL.getABITypeAlign(RMWI.getType()),
- Ordering, RMWI.getSyncScopeID());
- return Load;
-}
+ if (RMWI.getType()->isIntegerTy() &&
+ RMWI.getOperation() != AtomicRMWInst::Or) {
+ RMWI.setOperation(AtomicRMWInst::Or);
+ return replaceOperand(RMWI, 1, ConstantInt::get(RMWI.getType(), 0));
+ } else if (RMWI.getType()->isFloatingPointTy() &&
+ RMWI.getOperation() != AtomicRMWInst::FAdd) {
+ RMWI.setOperation(AtomicRMWInst::FAdd);
+ return replaceOperand(RMWI, 1, ConstantFP::getNegativeZero(RMWI.getType()));
+ }
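+  // For illustration (hypothetical pointer %p): an idempotent
+  // "atomicrmw add i32* %p, i32 0 monotonic" is rewritten above to
+  // "atomicrmw or i32* %p, i32 0 monotonic"; a later visit then reaches the
+  // code below and replaces it with a plain atomic load of %p.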
+
+ // Check if the required ordering is compatible with an atomic load.
+ if (Ordering != AtomicOrdering::Acquire &&
+ Ordering != AtomicOrdering::Monotonic)
+ return nullptr;
+
+ LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "",
+ false, DL.getABITypeAlign(RMWI.getType()),
+ Ordering, RMWI.getSyncScopeID());
+ return Load;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a1fd5f4c4c..5482b944e3 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1,646 +1,646 @@
-//===- InstCombineCalls.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitCall, visitInvoke, and visitCallBr functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/FloatingPointMode.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
+//===- InstCombineCalls.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitCall, visitInvoke, and visitCallBr functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicsAArch64.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IntrinsicsARM.h"
-#include "llvm/IR/IntrinsicsHexagon.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-STATISTIC(NumSimplified, "Number of library calls simplified");
-
-static cl::opt<unsigned> GuardWideningWindow(
- "instcombine-guard-widening-window",
- cl::init(3),
- cl::desc("How wide an instruction window to bypass looking for "
- "another guard"));
-
-/// Return the specified type promoted as it would be to pass through a va_arg
-/// area.
-static Type *getPromotedType(Type *Ty) {
- if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
- if (ITy->getBitWidth() < 32)
- return Type::getInt32Ty(Ty->getContext());
- }
- return Ty;
-}
-
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
+
+static cl::opt<unsigned> GuardWideningWindow(
+ "instcombine-guard-widening-window",
+ cl::init(3),
+ cl::desc("How wide an instruction window to bypass looking for "
+ "another guard"));
+
+/// Return the specified type promoted as it would be to pass through a va_arg
+/// area.
+static Type *getPromotedType(Type *Ty) {
+ if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
+ if (ITy->getBitWidth() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+ }
+ return Ty;
+}
+
Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
- Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
- MaybeAlign CopyDstAlign = MI->getDestAlign();
- if (!CopyDstAlign || *CopyDstAlign < DstAlign) {
- MI->setDestAlignment(DstAlign);
- return MI;
- }
-
- Align SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
- MaybeAlign CopySrcAlign = MI->getSourceAlign();
- if (!CopySrcAlign || *CopySrcAlign < SrcAlign) {
- MI->setSourceAlignment(SrcAlign);
- return MI;
- }
-
- // If we have a store to a location which is known constant, we can conclude
- // that the store must be storing the constant value (else the memory
- // wouldn't be constant), and this must be a noop.
- if (AA->pointsToConstantMemory(MI->getDest())) {
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
- return MI;
- }
-
- // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
- // load/store.
- ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
- if (!MemOpLength) return nullptr;
-
- // Source and destination pointer types are always "i8*" for the intrinsic. See
- // if the size is something we can handle with a single primitive load/store.
- // A single load+store correctly handles overlapping memory in the memmove
- // case.
- uint64_t Size = MemOpLength->getLimitedValue();
- assert(Size && "0-sized memory transferring should be removed already.");
-
- if (Size > 8 || (Size&(Size-1)))
- return nullptr; // If not 1/2/4/8 bytes, exit.
-
- // If it is an atomic op and the alignment is less than the size, then we
- // would introduce an unaligned memory access, which would later be turned
- // into a libcall in CodeGen. That is not an evident performance gain, so
- // disable the transform for now.
- if (isa<AtomicMemTransferInst>(MI))
- if (*CopyDstAlign < Size || *CopySrcAlign < Size)
- return nullptr;
-
- // Use an integer load+store unless we can find something better.
- unsigned SrcAddrSp =
- cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
- unsigned DstAddrSp =
- cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
-
- IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
- Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
- Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
-
- // If the memcpy has metadata describing the members, see if we can get the
- // TBAA tag describing our copy.
- MDNode *CopyMD = nullptr;
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
- CopyMD = M;
- } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
- if (M->getNumOperands() == 3 && M->getOperand(0) &&
- mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
- mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
- M->getOperand(1) &&
- mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
- mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
- Size &&
- M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
- CopyMD = cast<MDNode>(M->getOperand(2));
- }
-
- Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
- Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
- LoadInst *L = Builder.CreateLoad(IntType, Src);
- // Alignment from the mem intrinsic will be better, so use it.
- L->setAlignment(*CopySrcAlign);
- if (CopyMD)
- L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
- MDNode *LoopMemParallelMD =
- MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
- if (LoopMemParallelMD)
- L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
- if (AccessGroupMD)
- L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
-
- StoreInst *S = Builder.CreateStore(L, Dest);
- // Alignment from the mem intrinsic will be better, so use it.
- S->setAlignment(*CopyDstAlign);
- if (CopyMD)
- S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
- if (LoopMemParallelMD)
- S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- if (AccessGroupMD)
- S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
-
- if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
- // non-atomics can be volatile
- L->setVolatile(MT->isVolatile());
- S->setVolatile(MT->isVolatile());
- }
- if (isa<AtomicMemTransferInst>(MI)) {
- // atomics have to be unordered
- L->setOrdering(AtomicOrdering::Unordered);
- S->setOrdering(AtomicOrdering::Unordered);
- }
-
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(MemOpLength->getType()));
- return MI;
-}
-
+ Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
+ MaybeAlign CopyDstAlign = MI->getDestAlign();
+ if (!CopyDstAlign || *CopyDstAlign < DstAlign) {
+ MI->setDestAlignment(DstAlign);
+ return MI;
+ }
+
+ Align SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+ MaybeAlign CopySrcAlign = MI->getSourceAlign();
+ if (!CopySrcAlign || *CopySrcAlign < SrcAlign) {
+ MI->setSourceAlignment(SrcAlign);
+ return MI;
+ }
+
+ // If we have a store to a location which is known constant, we can conclude
+ // that the store must be storing the constant value (else the memory
+ // wouldn't be constant), and this must be a noop.
+ if (AA->pointsToConstantMemory(MI->getDest())) {
+ // Set the size of the copy to 0, it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
+ return MI;
+ }
+
+ // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
+ // load/store.
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
+ if (!MemOpLength) return nullptr;
+
+ // Source and destination pointer types are always "i8*" for the intrinsic. See
+ // if the size is something we can handle with a single primitive load/store.
+ // A single load+store correctly handles overlapping memory in the memmove
+ // case.
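+  // For illustration (hypothetical operands, Size == 4):
+  //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 4, i1 false)
+  // is rewritten, roughly, as bitcasts of %s and %d to i32*, a single i32
+  // load from the source, and a matching i32 store to the destination; the
+  // memcpy's length is then set to 0 so the intrinsic dies on a later pass
+  // over the worklist.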
+ uint64_t Size = MemOpLength->getLimitedValue();
+ assert(Size && "0-sized memory transferring should be removed already.");
+
+ if (Size > 8 || (Size&(Size-1)))
+ return nullptr; // If not 1/2/4/8 bytes, exit.
+
+ // If it is an atomic op and the alignment is less than the size, then we
+ // would introduce an unaligned memory access, which would later be turned
+ // into a libcall in CodeGen. That is not an evident performance gain, so
+ // disable the transform for now.
+ if (isa<AtomicMemTransferInst>(MI))
+ if (*CopyDstAlign < Size || *CopySrcAlign < Size)
+ return nullptr;
+
+ // Use an integer load+store unless we can find something better.
+ unsigned SrcAddrSp =
+ cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
+ unsigned DstAddrSp =
+ cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
+
+ IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
+ Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
+ Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
+
+ // If the memcpy has metadata describing the members, see if we can get the
+ // TBAA tag describing our copy.
+ MDNode *CopyMD = nullptr;
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
+ CopyMD = M;
+ } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (M->getNumOperands() == 3 && M->getOperand(0) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
+ M->getOperand(1) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+ Size &&
+ M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
+ CopyMD = cast<MDNode>(M->getOperand(2));
+ }
+
+ Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
+ Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
+ LoadInst *L = Builder.CreateLoad(IntType, Src);
+ // Alignment from the mem intrinsic will be better, so use it.
+ L->setAlignment(*CopySrcAlign);
+ if (CopyMD)
+ L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ MDNode *LoopMemParallelMD =
+ MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ if (LoopMemParallelMD)
+ L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
+ if (AccessGroupMD)
+ L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
+
+ StoreInst *S = Builder.CreateStore(L, Dest);
+ // Alignment from the mem intrinsic will be better, so use it.
+ S->setAlignment(*CopyDstAlign);
+ if (CopyMD)
+ S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ if (LoopMemParallelMD)
+ S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ if (AccessGroupMD)
+ S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
+
+ if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+ // non-atomics can be volatile
+ L->setVolatile(MT->isVolatile());
+ S->setVolatile(MT->isVolatile());
+ }
+ if (isa<AtomicMemTransferInst>(MI)) {
+ // atomics have to be unordered
+ L->setOrdering(AtomicOrdering::Unordered);
+ S->setOrdering(AtomicOrdering::Unordered);
+ }
+
+ // Set the size of the copy to 0, it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(MemOpLength->getType()));
+ return MI;
+}
+
Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
- const Align KnownAlignment =
- getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
- MaybeAlign MemSetAlign = MI->getDestAlign();
- if (!MemSetAlign || *MemSetAlign < KnownAlignment) {
- MI->setDestAlignment(KnownAlignment);
- return MI;
- }
-
- // If we have a store to a location which is known constant, we can conclude
- // that the store must be storing the constant value (else the memory
- // wouldn't be constant), and this must be a noop.
- if (AA->pointsToConstantMemory(MI->getDest())) {
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
- return MI;
- }
-
- // Extract the length and alignment and fill if they are constant.
- ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
- ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
- if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
- return nullptr;
- const uint64_t Len = LenC->getLimitedValue();
- assert(Len && "0-sized memory setting should be removed already.");
- const Align Alignment = assumeAligned(MI->getDestAlignment());
-
- // If it is an atomic and alignment is less than the size then we will
- // introduce the unaligned memory access which will be later transformed
- // into libcall in CodeGen. This is not evident performance gain so disable
- // it now.
- if (isa<AtomicMemSetInst>(MI))
- if (Alignment < Len)
- return nullptr;
-
- // memset(s,c,n) -> store s, c (for n=1,2,4,8)
- if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
- Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8.
-
- Value *Dest = MI->getDest();
- unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
- Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
- Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
-
- // Extract the fill value and store.
- uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
- StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
- MI->isVolatile());
- S->setAlignment(Alignment);
- if (isa<AtomicMemSetInst>(MI))
- S->setOrdering(AtomicOrdering::Unordered);
-
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(LenC->getType()));
- return MI;
- }
-
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Narrow width by halfs excluding zero/undef lanes
+ const Align KnownAlignment =
+ getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
+ MaybeAlign MemSetAlign = MI->getDestAlign();
+ if (!MemSetAlign || *MemSetAlign < KnownAlignment) {
+ MI->setDestAlignment(KnownAlignment);
+ return MI;
+ }
+
+  // If we have a store to a location that is known to be constant, we can
+  // conclude that the store must be storing the constant value (otherwise the
+  // memory would not be constant), so the store must be a no-op.
+ if (AA->pointsToConstantMemory(MI->getDest())) {
+    // Set the size of the copy to 0; it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
+ return MI;
+ }
+
+ // Extract the length and alignment and fill if they are constant.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
+ ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
+ if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
+ return nullptr;
+ const uint64_t Len = LenC->getLimitedValue();
+ assert(Len && "0-sized memory setting should be removed already.");
+ const Align Alignment = assumeAligned(MI->getDestAlignment());
+
+  // If it is atomic and the alignment is less than the size, we would
+  // introduce an unaligned memory access that is later transformed into a
+  // libcall in CodeGen. That is not an evident performance gain, so disable
+  // the transform for now.
+ if (isa<AtomicMemSetInst>(MI))
+ if (Alignment < Len)
+ return nullptr;
+
+ // memset(s,c,n) -> store s, c (for n=1,2,4,8)
+ if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
+ Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8.
+
+ Value *Dest = MI->getDest();
+ unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
+ Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
+ Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
+
+ // Extract the fill value and store.
+ uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
+ StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
+ MI->isVolatile());
+ S->setAlignment(Alignment);
+ if (isa<AtomicMemSetInst>(MI))
+ S->setOrdering(AtomicOrdering::Unordered);
+
+    // Set the size of the copy to 0; it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(LenC->getType()));
+ return MI;
+ }
+
+ return nullptr;
+}
+
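A small standalone check of the byte-replication trick used for the fill value above (FillC->getZExtValue() * 0x0101010101010101ULL); this is illustrative only and not code from the pass.

    #include <cassert>
    #include <cstdint>

    // Multiplying a byte by 0x0101010101010101 copies it into all eight bytes,
    // so memset(p, 0xAB, 8) can become a single 64-bit store of the splat.
    static uint64_t splatFillByte(uint8_t fill) {
      return uint64_t(fill) * 0x0101010101010101ULL;
    }

    int main() {
      assert(splatFillByte(0xAB) == 0xABABABABABABABABULL);
      assert(splatFillByte(0x00) == 0x0000000000000000ULL);
    }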
+// TODO, Obvious Missing Transforms:
+// * Narrow width by halfs excluding zero/undef lanes
Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
- Value *LoadPtr = II.getArgOperand(0);
- const Align Alignment =
- cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
-
- // If the mask is all ones or undefs, this is a plain vector load of the 1st
- // argument.
- if (maskIsAllOneOrUndef(II.getArgOperand(2)))
- return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
- "unmaskedload");
-
- // If we can unconditionally load from this address, replace with a
- // load/select idiom. TODO: use DT for context sensitive query
+ Value *LoadPtr = II.getArgOperand(0);
+ const Align Alignment =
+ cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+
+ // If the mask is all ones or undefs, this is a plain vector load of the 1st
+ // argument.
+ if (maskIsAllOneOrUndef(II.getArgOperand(2)))
+ return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
+ "unmaskedload");
+
+ // If we can unconditionally load from this address, replace with a
+ // load/select idiom. TODO: use DT for context sensitive query
if (isDereferenceablePointer(LoadPtr, II.getType(),
II.getModule()->getDataLayout(), &II, nullptr)) {
- Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
- "unmaskedload");
- return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
- }
-
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Single constant active lane -> store
-// * Narrow width by halfs excluding zero/undef lanes
+ Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
+ "unmaskedload");
+ return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
+ }
+
+ return nullptr;
+}
+
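A scalar model of the load/select idiom above, assuming the whole vector is known dereferenceable so every lane can be loaded unconditionally; the template and names below are a hypothetical sketch, not the vector intrinsic itself.

    #include <array>
    #include <cstddef>

    template <typename T, std::size_t N>
    std::array<T, N> maskedLoadAsSelect(const T *ptr,
                                        const std::array<bool, N> &mask,
                                        const std::array<T, N> &passthru) {
      std::array<T, N> loaded{};
      for (std::size_t i = 0; i < N; ++i)
        loaded[i] = ptr[i];                              // unconditional load
      std::array<T, N> result{};
      for (std::size_t i = 0; i < N; ++i)
        result[i] = mask[i] ? loaded[i] : passthru[i];   // the select
      return result;
    }

    int main() {
      int data[4] = {1, 2, 3, 4};
      std::array<bool, 4> mask{true, false, true, false};
      std::array<int, 4> passthru{9, 9, 9, 9};
      auto r = maskedLoadAsSelect<int, 4>(data, mask, passthru);
      return r[1] == 9 ? 0 : 1;  // masked-off lane takes the passthru value
    }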
+// TODO, Obvious Missing Transforms:
+// * Single constant active lane -> store
+// * Narrow width by halfs excluding zero/undef lanes
Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
- if (!ConstMask)
- return nullptr;
-
- // If the mask is all zeros, this instruction does nothing.
- if (ConstMask->isNullValue())
- return eraseInstFromFunction(II);
-
- // If the mask is all ones, this is a plain vector store of the 1st argument.
- if (ConstMask->isAllOnesValue()) {
- Value *StorePtr = II.getArgOperand(1);
- Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
- return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
- }
-
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (!ConstMask)
+ return nullptr;
+
+ // If the mask is all zeros, this instruction does nothing.
+ if (ConstMask->isNullValue())
+ return eraseInstFromFunction(II);
+
+ // If the mask is all ones, this is a plain vector store of the 1st argument.
+ if (ConstMask->isAllOnesValue()) {
+ Value *StorePtr = II.getArgOperand(1);
+ Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
+ }
+
if (isa<ScalableVectorType>(ConstMask->getType()))
return nullptr;
- // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
- APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
- APInt UndefElts(DemandedElts.getBitWidth(), 0);
+ // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
+ APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
+ APInt UndefElts(DemandedElts.getBitWidth(), 0);
if (Value *V =
SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts, UndefElts))
- return replaceOperand(II, 0, V);
-
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Single constant active lane load -> load
-// * Dereferenceable address & few lanes -> scalarize speculative load/selects
-// * Adjacent vector addresses -> masked.load
-// * Narrow width by halfs excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar load
-// * Vector incrementing address -> vector masked load
+ return replaceOperand(II, 0, V);
+
+ return nullptr;
+}
+
+// TODO, Obvious Missing Transforms:
+// * Single constant active lane load -> load
+// * Dereferenceable address & few lanes -> scalarize speculative load/selects
+// * Adjacent vector addresses -> masked.load
+// * Narrow width by halfs excluding zero/undef lanes
+// * Vector splat address w/known mask -> scalar load
+// * Vector incrementing address -> vector masked load
Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Single constant active lane -> store
-// * Adjacent vector addresses -> masked.store
-// * Narrow store width by halfs excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar store
-// * Vector incrementing address -> vector masked store
+ return nullptr;
+}
+
+// TODO, Obvious Missing Transforms:
+// * Single constant active lane -> store
+// * Adjacent vector addresses -> masked.store
+// * Narrow store width by halfs excluding zero/undef lanes
+// * Vector splat address w/known mask -> scalar store
+// * Vector incrementing address -> vector masked store
Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
- if (!ConstMask)
- return nullptr;
-
- // If the mask is all zeros, a scatter does nothing.
- if (ConstMask->isNullValue())
- return eraseInstFromFunction(II);
-
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (!ConstMask)
+ return nullptr;
+
+ // If the mask is all zeros, a scatter does nothing.
+ if (ConstMask->isNullValue())
+ return eraseInstFromFunction(II);
+
if (isa<ScalableVectorType>(ConstMask->getType()))
return nullptr;
- // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
- APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
- APInt UndefElts(DemandedElts.getBitWidth(), 0);
+ // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
+ APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
+ APInt UndefElts(DemandedElts.getBitWidth(), 0);
if (Value *V =
SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts, UndefElts))
- return replaceOperand(II, 0, V);
+ return replaceOperand(II, 0, V);
if (Value *V =
SimplifyDemandedVectorElts(II.getOperand(1), DemandedElts, UndefElts))
- return replaceOperand(II, 1, V);
-
- return nullptr;
-}
-
-/// This function transforms launder.invariant.group and strip.invariant.group
-/// like:
-/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
-/// launder(strip(%x)) -> launder(%x)
-/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
-/// strip(launder(%x)) -> strip(%x)
-/// This is legal because it preserves the most recent information about
-/// the presence or absence of invariant.group.
-static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
+ return replaceOperand(II, 1, V);
+
+ return nullptr;
+}
+
+/// This function transforms launder.invariant.group and strip.invariant.group
+/// like:
+/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
+/// launder(strip(%x)) -> launder(%x)
+/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
+/// strip(launder(%x)) -> strip(%x)
+/// This is legal because it preserves the most recent information about
+/// the presence or absence of invariant.group.
+static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
InstCombinerImpl &IC) {
- auto *Arg = II.getArgOperand(0);
- auto *StrippedArg = Arg->stripPointerCasts();
- auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
- if (StrippedArg == StrippedInvariantGroupsArg)
- return nullptr; // No launders/strips to remove.
-
- Value *Result = nullptr;
-
- if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
- Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
- else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
- Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
- else
- llvm_unreachable(
- "simplifyInvariantGroupIntrinsic only handles launder and strip");
- if (Result->getType()->getPointerAddressSpace() !=
- II.getType()->getPointerAddressSpace())
- Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
- if (Result->getType() != II.getType())
- Result = IC.Builder.CreateBitCast(Result, II.getType());
-
- return cast<Instruction>(Result);
-}
-
+ auto *Arg = II.getArgOperand(0);
+ auto *StrippedArg = Arg->stripPointerCasts();
+ auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
+ if (StrippedArg == StrippedInvariantGroupsArg)
+ return nullptr; // No launders/strips to remove.
+
+ Value *Result = nullptr;
+
+ if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
+ Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
+ else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
+ Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
+ else
+ llvm_unreachable(
+ "simplifyInvariantGroupIntrinsic only handles launder and strip");
+ if (Result->getType()->getPointerAddressSpace() !=
+ II.getType()->getPointerAddressSpace())
+ Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
+ if (Result->getType() != II.getType())
+ Result = IC.Builder.CreateBitCast(Result, II.getType());
+
+ return cast<Instruction>(Result);
+}
+
static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
- assert((II.getIntrinsicID() == Intrinsic::cttz ||
- II.getIntrinsicID() == Intrinsic::ctlz) &&
- "Expected cttz or ctlz intrinsic");
- bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
- Value *Op0 = II.getArgOperand(0);
- Value *X;
- // ctlz(bitreverse(x)) -> cttz(x)
- // cttz(bitreverse(x)) -> ctlz(x)
- if (match(Op0, m_BitReverse(m_Value(X)))) {
- Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
- Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
- return CallInst::Create(F, {X, II.getArgOperand(1)});
- }
-
- if (IsTZ) {
- // cttz(-x) -> cttz(x)
- if (match(Op0, m_Neg(m_Value(X))))
- return IC.replaceOperand(II, 0, X);
-
- // cttz(abs(x)) -> cttz(x)
- // cttz(nabs(x)) -> cttz(x)
- Value *Y;
- SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
- if (SPF == SPF_ABS || SPF == SPF_NABS)
- return IC.replaceOperand(II, 0, X);
+ assert((II.getIntrinsicID() == Intrinsic::cttz ||
+ II.getIntrinsicID() == Intrinsic::ctlz) &&
+ "Expected cttz or ctlz intrinsic");
+ bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
+ Value *Op0 = II.getArgOperand(0);
+ Value *X;
+ // ctlz(bitreverse(x)) -> cttz(x)
+ // cttz(bitreverse(x)) -> ctlz(x)
+ if (match(Op0, m_BitReverse(m_Value(X)))) {
+ Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
+ Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
+ return CallInst::Create(F, {X, II.getArgOperand(1)});
+ }
+
+ if (IsTZ) {
+ // cttz(-x) -> cttz(x)
+ if (match(Op0, m_Neg(m_Value(X))))
+ return IC.replaceOperand(II, 0, X);
+
+ // cttz(abs(x)) -> cttz(x)
+ // cttz(nabs(x)) -> cttz(x)
+ Value *Y;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS)
+ return IC.replaceOperand(II, 0, X);
if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
return IC.replaceOperand(II, 0, X);
- }
-
- KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
-
- // Create a mask for bits above (ctlz) or below (cttz) the first known one.
- unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
- : Known.countMaxLeadingZeros();
- unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
- : Known.countMinLeadingZeros();
-
- // If all bits above (ctlz) or below (cttz) the first known one are known
- // zero, this value is constant.
- // FIXME: This should be in InstSimplify because we're replacing an
- // instruction with a constant.
- if (PossibleZeros == DefiniteZeros) {
- auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
- return IC.replaceInstUsesWith(II, C);
- }
-
- // If the input to cttz/ctlz is known to be non-zero,
- // then change the 'ZeroIsUndef' parameter to 'true'
- // because we know the zero behavior can't affect the result.
- if (!Known.One.isNullValue() ||
- isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
- &IC.getDominatorTree())) {
- if (!match(II.getArgOperand(1), m_One()))
- return IC.replaceOperand(II, 1, IC.Builder.getTrue());
- }
-
- // Add range metadata since known bits can't completely reflect what we know.
- // TODO: Handle splat vectors.
- auto *IT = dyn_cast<IntegerType>(Op0->getType());
- if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
- Metadata *LowAndHigh[] = {
- ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
- ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
- II.setMetadata(LLVMContext::MD_range,
- MDNode::get(II.getContext(), LowAndHigh));
- return &II;
- }
-
- return nullptr;
-}
-
+ }
+
+ KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
+
+ // Create a mask for bits above (ctlz) or below (cttz) the first known one.
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
+
+ // If all bits above (ctlz) or below (cttz) the first known one are known
+ // zero, this value is constant.
+ // FIXME: This should be in InstSimplify because we're replacing an
+ // instruction with a constant.
+ if (PossibleZeros == DefiniteZeros) {
+ auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
+ return IC.replaceInstUsesWith(II, C);
+ }
+
+ // If the input to cttz/ctlz is known to be non-zero,
+ // then change the 'ZeroIsUndef' parameter to 'true'
+ // because we know the zero behavior can't affect the result.
+ if (!Known.One.isNullValue() ||
+ isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
+ &IC.getDominatorTree())) {
+ if (!match(II.getArgOperand(1), m_One()))
+ return IC.replaceOperand(II, 1, IC.Builder.getTrue());
+ }
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ // TODO: Handle splat vectors.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
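A brute-force check, on plain 8-bit integers, of the ctlz(bitreverse(x)) == cttz(x) identity behind the swap above; zero is excluded since the intrinsics' zero behavior is a separate question, and the helper names are made up.

    #include <cassert>
    #include <cstdint>

    static uint8_t bitreverse8(uint8_t x) {
      uint8_t r = 0;
      for (int i = 0; i < 8; ++i)
        r = uint8_t((r << 1) | ((x >> i) & 1));
      return r;
    }
    static unsigned cttz8(uint8_t x) {           // precondition: x != 0
      unsigned n = 0;
      while (!(x & 1)) { x = uint8_t(x >> 1); ++n; }
      return n;
    }
    static unsigned ctlz8(uint8_t x) {           // precondition: x != 0
      unsigned n = 0;
      while (!(x & 0x80)) { x = uint8_t(x << 1); ++n; }
      return n;
    }

    int main() {
      for (unsigned v = 1; v < 256; ++v)
        assert(ctlz8(bitreverse8(uint8_t(v))) == cttz8(uint8_t(v)));
    }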
static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
- assert(II.getIntrinsicID() == Intrinsic::ctpop &&
- "Expected ctpop intrinsic");
- Type *Ty = II.getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- Value *Op0 = II.getArgOperand(0);
- Value *X;
-
- // ctpop(bitreverse(x)) -> ctpop(x)
- // ctpop(bswap(x)) -> ctpop(x)
- if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X))))
- return IC.replaceOperand(II, 0, X);
-
- // ctpop(x | -x) -> bitwidth - cttz(x, false)
- if (Op0->hasOneUse() &&
- match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
- Function *F =
- Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
- auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
- auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
- return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
- }
-
- // ctpop(~x & (x - 1)) -> cttz(x, false)
- if (match(Op0,
- m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
- Function *F =
- Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
- return CallInst::Create(F, {X, IC.Builder.getFalse()});
- }
-
- // FIXME: Try to simplify vectors of integers.
- auto *IT = dyn_cast<IntegerType>(Ty);
- if (!IT)
- return nullptr;
-
- KnownBits Known(BitWidth);
- IC.computeKnownBits(Op0, Known, 0, &II);
-
- unsigned MinCount = Known.countMinPopulation();
- unsigned MaxCount = Known.countMaxPopulation();
-
- // Add range metadata since known bits can't completely reflect what we know.
- if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
- Metadata *LowAndHigh[] = {
- ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
- ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
- II.setMetadata(LLVMContext::MD_range,
- MDNode::get(II.getContext(), LowAndHigh));
- return &II;
- }
-
- return nullptr;
-}
-
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- // Bail out if the mask is not a constant.
- auto *C = dyn_cast<Constant>(II.getArgOperand(1));
- if (!C)
- return nullptr;
-
+ assert(II.getIntrinsicID() == Intrinsic::ctpop &&
+ "Expected ctpop intrinsic");
+ Type *Ty = II.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ Value *Op0 = II.getArgOperand(0);
+ Value *X;
+
+ // ctpop(bitreverse(x)) -> ctpop(x)
+ // ctpop(bswap(x)) -> ctpop(x)
+ if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X))))
+ return IC.replaceOperand(II, 0, X);
+
+ // ctpop(x | -x) -> bitwidth - cttz(x, false)
+ if (Op0->hasOneUse() &&
+ match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
+ Function *F =
+ Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
+ auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
+ auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
+ return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
+ }
+
+ // ctpop(~x & (x - 1)) -> cttz(x, false)
+ if (match(Op0,
+ m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
+ Function *F =
+ Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
+ return CallInst::Create(F, {X, IC.Builder.getFalse()});
+ }
+
+ // FIXME: Try to simplify vectors of integers.
+ auto *IT = dyn_cast<IntegerType>(Ty);
+ if (!IT)
+ return nullptr;
+
+ KnownBits Known(BitWidth);
+ IC.computeKnownBits(Op0, Known, 0, &II);
+
+ unsigned MinCount = Known.countMinPopulation();
+ unsigned MaxCount = Known.countMaxPopulation();
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
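An illustrative check of the two ctpop folds above on 16-bit values: ctpop(x | -x) == bitwidth - cttz(x) and ctpop(~x & (x - 1)) == cttz(x), for nonzero x so cttz is well defined. The helpers are written out by hand here; the pass itself works on the IR intrinsics.

    #include <cassert>
    #include <cstdint>

    static unsigned popcount16(uint16_t x) {
      unsigned n = 0;
      for (; x; x &= uint16_t(x - 1)) ++n;       // clear lowest set bit each step
      return n;
    }
    static unsigned cttz16(uint16_t x) {         // precondition: x != 0
      unsigned n = 0;
      while (!(x & 1)) { x = uint16_t(x >> 1); ++n; }
      return n;
    }

    int main() {
      for (unsigned v = 1; v <= 0xFFFF; ++v) {
        uint16_t x = uint16_t(v);
        uint16_t negx = uint16_t(-x);            // two's complement negate
        assert(popcount16(uint16_t(x | negx)) == 16 - cttz16(x));
        assert(popcount16(uint16_t(~x & uint16_t(x - 1))) == cttz16(x));
      }
    }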
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+static Value *simplifyNeonTbl1(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // Bail out if the mask is not a constant.
+ auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!C)
+ return nullptr;
+
auto *VecTy = cast<FixedVectorType>(II.getType());
- unsigned NumElts = VecTy->getNumElements();
-
- // Only perform this transformation for <8 x i8> vector types.
- if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
- return nullptr;
-
- int Indexes[8];
-
- for (unsigned I = 0; I < NumElts; ++I) {
- Constant *COp = C->getAggregateElement(I);
-
- if (!COp || !isa<ConstantInt>(COp))
- return nullptr;
-
- Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
-
- // Make sure the mask indices are in range.
- if ((unsigned)Indexes[I] >= NumElts)
- return nullptr;
- }
-
- auto *V1 = II.getArgOperand(0);
- auto *V2 = Constant::getNullValue(V1->getType());
- return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes));
-}
-
-// Returns true iff the 2 intrinsics have the same operands, limiting the
-// comparison to the first NumOperands.
-static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
- unsigned NumOperands) {
- assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
- assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
- for (unsigned i = 0; i < NumOperands; i++)
- if (I.getArgOperand(i) != E.getArgOperand(i))
- return false;
- return true;
-}
-
-// Remove trivially empty start/end intrinsic ranges, i.e. a start
-// immediately followed by an end (ignoring debuginfo or other
-// start/end intrinsics in between). As this handles only the most trivial
-// cases, tracking the nesting level is not needed:
-//
-// call @llvm.foo.start(i1 0)
-// call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed
-// call @llvm.foo.end(i1 0)
-// call @llvm.foo.end(i1 0) ; &I
+ unsigned NumElts = VecTy->getNumElements();
+
+ // Only perform this transformation for <8 x i8> vector types.
+ if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+ return nullptr;
+
+ int Indexes[8];
+
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = C->getAggregateElement(I);
+
+ if (!COp || !isa<ConstantInt>(COp))
+ return nullptr;
+
+ Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+ // Make sure the mask indices are in range.
+ if ((unsigned)Indexes[I] >= NumElts)
+ return nullptr;
+ }
+
+ auto *V1 = II.getArgOperand(0);
+ auto *V2 = Constant::getNullValue(V1->getType());
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes));
+}
+
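A scalar sketch of what the constant-mask case above becomes: with every mask byte known and in range, the tbl1 lookup is just a fixed permutation of the eight input bytes (the shufflevector), and the mask {7,6,5,4,3,2,1,0} in particular is a byte reverse. Names are illustrative only.

    #include <array>
    #include <cstdint>

    static std::array<uint8_t, 8> tbl1WithConstMask(const std::array<uint8_t, 8> &v,
                                                    const std::array<uint8_t, 8> &mask) {
      std::array<uint8_t, 8> out{};
      for (int i = 0; i < 8; ++i)
        out[i] = v[mask[i]];   // the code above verifies mask[i] < 8 first
      return out;
    }

    int main() {
      std::array<uint8_t, 8> v{1, 2, 3, 4, 5, 6, 7, 8};
      std::array<uint8_t, 8> rev{7, 6, 5, 4, 3, 2, 1, 0};
      auto out = tbl1WithConstMask(v, rev);      // byte-reversed v
      return out[0] == 8 ? 0 : 1;
    }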
+// Returns true iff the 2 intrinsics have the same operands, limiting the
+// comparison to the first NumOperands.
+static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
+ unsigned NumOperands) {
+ assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
+ assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
+ for (unsigned i = 0; i < NumOperands; i++)
+ if (I.getArgOperand(i) != E.getArgOperand(i))
+ return false;
+ return true;
+}
+
+// Remove trivially empty start/end intrinsic ranges, i.e. a start
+// immediately followed by an end (ignoring debuginfo or other
+// start/end intrinsics in between). As this handles only the most trivial
+// cases, tracking the nesting level is not needed:
+//
+// call @llvm.foo.start(i1 0)
+// call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed
+// call @llvm.foo.end(i1 0)
+// call @llvm.foo.end(i1 0) ; &I
static bool
removeTriviallyEmptyRange(IntrinsicInst &EndI, InstCombinerImpl &IC,
std::function<bool(const IntrinsicInst &)> IsStart) {
- // We start from the end intrinsic and scan backwards, so that InstCombine
- // has already processed (and potentially removed) all the instructions
- // before the end intrinsic.
- BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend());
- for (; BI != BE; ++BI) {
- if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) {
- if (isa<DbgInfoIntrinsic>(I) ||
- I->getIntrinsicID() == EndI.getIntrinsicID())
- continue;
- if (IsStart(*I)) {
- if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) {
- IC.eraseInstFromFunction(*I);
- IC.eraseInstFromFunction(EndI);
- return true;
- }
- // Skip start intrinsics that don't pair with this end intrinsic.
- continue;
- }
- }
- break;
- }
-
- return false;
-}
-
+ // We start from the end intrinsic and scan backwards, so that InstCombine
+ // has already processed (and potentially removed) all the instructions
+ // before the end intrinsic.
+ BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend());
+ for (; BI != BE; ++BI) {
+ if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) {
+ if (isa<DbgInfoIntrinsic>(I) ||
+ I->getIntrinsicID() == EndI.getIntrinsicID())
+ continue;
+ if (IsStart(*I)) {
+ if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) {
+ IC.eraseInstFromFunction(*I);
+ IC.eraseInstFromFunction(EndI);
+ return true;
+ }
+ // Skip start intrinsics that don't pair with this end intrinsic.
+ continue;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
Instruction *InstCombinerImpl::visitVAEndInst(VAEndInst &I) {
- removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
- return I.getIntrinsicID() == Intrinsic::vastart ||
- I.getIntrinsicID() == Intrinsic::vacopy;
- });
- return nullptr;
-}
-
+ removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
+ return I.getIntrinsicID() == Intrinsic::vastart ||
+ I.getIntrinsicID() == Intrinsic::vacopy;
+ });
+ return nullptr;
+}
+
static CallInst *canonicalizeConstantArg0ToArg1(CallInst &Call) {
- assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
- Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
- if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
- Call.setArgOperand(0, Arg1);
- Call.setArgOperand(1, Arg0);
- return &Call;
- }
- return nullptr;
-}
-
+ assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
+ Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
+ if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
+ Call.setArgOperand(0, Arg1);
+ Call.setArgOperand(1, Arg0);
+ return &Call;
+ }
+ return nullptr;
+}
+
/// Creates a result tuple for an overflow intrinsic \p II with a given
/// \p Result and a constant \p Overflow value.
static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result,
@@ -653,15 +653,15 @@ static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result,
Instruction *
InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
- WithOverflowInst *WO = cast<WithOverflowInst>(II);
- Value *OperationResult = nullptr;
- Constant *OverflowResult = nullptr;
- if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
- WO->getRHS(), *WO, OperationResult, OverflowResult))
+ WithOverflowInst *WO = cast<WithOverflowInst>(II);
+ Value *OperationResult = nullptr;
+ Constant *OverflowResult = nullptr;
+ if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
+ WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
- return nullptr;
-}
-
+ return nullptr;
+}
+
static Optional<bool> getKnownSign(Value *Op, Instruction *CxtI,
const DataLayout &DL, AssumptionCache *AC,
DominatorTree *DT) {
@@ -675,126 +675,126 @@ static Optional<bool> getKnownSign(Value *Op, Instruction *CxtI,
ICmpInst::ICMP_SLT, Op, Constant::getNullValue(Op->getType()), CxtI, DL);
}
-/// CallInst simplification. This mostly only handles folding of intrinsic
-/// instructions. For normal calls, it allows visitCallBase to do the heavy
-/// lifting.
+/// CallInst simplification. This mostly only handles folding of intrinsic
+/// instructions. For normal calls, it allows visitCallBase to do the heavy
+/// lifting.
Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
- // Don't try to simplify calls without uses. It will not do anything useful,
- // but will result in the following folds being skipped.
- if (!CI.use_empty())
- if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
- return replaceInstUsesWith(CI, V);
-
- if (isFreeCall(&CI, &TLI))
- return visitFree(CI);
-
- // If the caller function is nounwind, mark the call as nounwind, even if the
- // callee isn't.
- if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
- CI.setDoesNotThrow();
- return &CI;
- }
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
- if (!II) return visitCallBase(CI);
-
- // For atomic unordered mem intrinsics if len is not a positive or
- // not a multiple of element size then behavior is undefined.
- if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II))
- if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength()))
- if (NumBytes->getSExtValue() < 0 ||
- (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) {
- CreateNonTerminatorUnreachable(AMI);
- assert(AMI->getType()->isVoidTy() &&
- "non void atomic unordered mem intrinsic");
- return eraseInstFromFunction(*AMI);
- }
-
- // Intrinsics cannot occur in an invoke or a callbr, so handle them here
- // instead of in visitCallBase.
- if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
- bool Changed = false;
-
- // memmove/cpy/set of zero bytes is a noop.
- if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
- if (NumBytes->isNullValue())
- return eraseInstFromFunction(CI);
-
- if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
- if (CI->getZExtValue() == 1) {
- // Replace the instruction with just byte operations. We would
- // transform other cases to loads/stores, but we don't know if
- // alignment is sufficient.
- }
- }
-
- // No other transformations apply to volatile transfers.
- if (auto *M = dyn_cast<MemIntrinsic>(MI))
- if (M->isVolatile())
- return nullptr;
-
- // If we have a memmove and the source operation is a constant global,
- // then the source and dest pointers can't alias, so we can change this
- // into a call to memcpy.
- if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
- if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
- if (GVSrc->isConstant()) {
- Module *M = CI.getModule();
- Intrinsic::ID MemCpyID =
- isa<AtomicMemMoveInst>(MMI)
- ? Intrinsic::memcpy_element_unordered_atomic
- : Intrinsic::memcpy;
- Type *Tys[3] = { CI.getArgOperand(0)->getType(),
- CI.getArgOperand(1)->getType(),
- CI.getArgOperand(2)->getType() };
- CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
- Changed = true;
- }
- }
-
- if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
- // memmove(x,x,size) -> noop.
- if (MTI->getSource() == MTI->getDest())
- return eraseInstFromFunction(CI);
- }
-
- // If we can determine a pointer alignment that is bigger than currently
- // set, update the alignment.
- if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
- if (Instruction *I = SimplifyAnyMemTransfer(MTI))
- return I;
- } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
- if (Instruction *I = SimplifyAnyMemSet(MSI))
- return I;
- }
-
- if (Changed) return II;
- }
-
- // For fixed width vector result intrinsics, use the generic demanded vector
- // support.
- if (auto *IIFVTy = dyn_cast<FixedVectorType>(II->getType())) {
- auto VWidth = IIFVTy->getNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
- if (V != II)
- return replaceInstUsesWith(*II, V);
- return II;
- }
- }
-
+ // Don't try to simplify calls without uses. It will not do anything useful,
+ // but will result in the following folds being skipped.
+ if (!CI.use_empty())
+ if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
+ return replaceInstUsesWith(CI, V);
+
+ if (isFreeCall(&CI, &TLI))
+ return visitFree(CI);
+
+ // If the caller function is nounwind, mark the call as nounwind, even if the
+ // callee isn't.
+ if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
+ CI.setDoesNotThrow();
+ return &CI;
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
+ if (!II) return visitCallBase(CI);
+
+  // For atomic unordered mem intrinsics, if the length is not positive or is
+  // not a multiple of the element size, then the behavior is undefined.
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II))
+ if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength()))
+ if (NumBytes->getSExtValue() < 0 ||
+ (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) {
+ CreateNonTerminatorUnreachable(AMI);
+ assert(AMI->getType()->isVoidTy() &&
+ "non void atomic unordered mem intrinsic");
+ return eraseInstFromFunction(*AMI);
+ }
+
+ // Intrinsics cannot occur in an invoke or a callbr, so handle them here
+ // instead of in visitCallBase.
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
+ bool Changed = false;
+
+ // memmove/cpy/set of zero bytes is a noop.
+ if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
+ if (NumBytes->isNullValue())
+ return eraseInstFromFunction(CI);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
+ if (CI->getZExtValue() == 1) {
+ // Replace the instruction with just byte operations. We would
+ // transform other cases to loads/stores, but we don't know if
+ // alignment is sufficient.
+ }
+ }
+
+ // No other transformations apply to volatile transfers.
+ if (auto *M = dyn_cast<MemIntrinsic>(MI))
+ if (M->isVolatile())
+ return nullptr;
+
+ // If we have a memmove and the source operation is a constant global,
+ // then the source and dest pointers can't alias, so we can change this
+ // into a call to memcpy.
+ if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
+ if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
+ if (GVSrc->isConstant()) {
+ Module *M = CI.getModule();
+ Intrinsic::ID MemCpyID =
+ isa<AtomicMemMoveInst>(MMI)
+ ? Intrinsic::memcpy_element_unordered_atomic
+ : Intrinsic::memcpy;
+ Type *Tys[3] = { CI.getArgOperand(0)->getType(),
+ CI.getArgOperand(1)->getType(),
+ CI.getArgOperand(2)->getType() };
+ CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
+ Changed = true;
+ }
+ }
+
+ if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ // memmove(x,x,size) -> noop.
+ if (MTI->getSource() == MTI->getDest())
+ return eraseInstFromFunction(CI);
+ }
+
+ // If we can determine a pointer alignment that is bigger than currently
+ // set, update the alignment.
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemTransfer(MTI))
+ return I;
+ } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemSet(MSI))
+ return I;
+ }
+
+ if (Changed) return II;
+ }
+
+ // For fixed width vector result intrinsics, use the generic demanded vector
+ // support.
+ if (auto *IIFVTy = dyn_cast<FixedVectorType>(II->getType())) {
+ auto VWidth = IIFVTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ }
+ }
+
if (II->isCommutative()) {
if (CallInst *NewCall = canonicalizeConstantArg0ToArg1(CI))
return NewCall;
}
-
- Intrinsic::ID IID = II->getIntrinsicID();
- switch (IID) {
- case Intrinsic::objectsize:
- if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
- return replaceInstUsesWith(CI, V);
- return nullptr;
+
+ Intrinsic::ID IID = II->getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::objectsize:
+ if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
+ return replaceInstUsesWith(CI, V);
+ return nullptr;
case Intrinsic::abs: {
Value *IIOperand = II->getArgOperand(0);
bool IntMinIsPoison = cast<Constant>(II->getArgOperand(1))->isOneValue();
@@ -854,444 +854,444 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
break;
}
- case Intrinsic::bswap: {
- Value *IIOperand = II->getArgOperand(0);
- Value *X = nullptr;
-
- // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
- if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
+ case Intrinsic::bswap: {
+ Value *IIOperand = II->getArgOperand(0);
+ Value *X = nullptr;
+
+ // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
+ if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
unsigned C = X->getType()->getScalarSizeInBits() -
IIOperand->getType()->getScalarSizeInBits();
- Value *CV = ConstantInt::get(X->getType(), C);
- Value *V = Builder.CreateLShr(X, CV);
- return new TruncInst(V, IIOperand->getType());
- }
- break;
- }
- case Intrinsic::masked_load:
- if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II))
- return replaceInstUsesWith(CI, SimplifiedMaskedOp);
- break;
- case Intrinsic::masked_store:
- return simplifyMaskedStore(*II);
- case Intrinsic::masked_gather:
- return simplifyMaskedGather(*II);
- case Intrinsic::masked_scatter:
- return simplifyMaskedScatter(*II);
- case Intrinsic::launder_invariant_group:
- case Intrinsic::strip_invariant_group:
- if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
- return replaceInstUsesWith(*II, SkippedBarrier);
- break;
- case Intrinsic::powi:
- if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
- // 0 and 1 are handled in instsimplify
- // powi(x, -1) -> 1/x
- if (Power->isMinusOne())
+ Value *CV = ConstantInt::get(X->getType(), C);
+ Value *V = Builder.CreateLShr(X, CV);
+ return new TruncInst(V, IIOperand->getType());
+ }
+ break;
+ }
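An illustrative check, on ordinary integers rather than IR, of the bswap fold above for a 32-bit source truncated to 16 bits, where c = 32 - 16 = 16: bswap16(trunc16(bswap32(x))) == trunc16(x >> 16). The bswap helpers are hand-written stand-ins.

    #include <cassert>
    #include <cstdint>

    static uint32_t bswap32(uint32_t x) {
      return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
             ((x << 8) & 0x00FF0000u) | (x << 24);
    }
    static uint16_t bswap16(uint16_t x) {
      return uint16_t((x >> 8) | (x << 8));
    }

    int main() {
      uint32_t samples[] = {0x12345678u, 0xDEADBEEFu, 0x00000001u, 0xFF000000u};
      for (uint32_t x : samples)
        assert(bswap16(uint16_t(bswap32(x))) == uint16_t(x >> 16));
    }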
+ case Intrinsic::masked_load:
+ if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II))
+ return replaceInstUsesWith(CI, SimplifiedMaskedOp);
+ break;
+ case Intrinsic::masked_store:
+ return simplifyMaskedStore(*II);
+ case Intrinsic::masked_gather:
+ return simplifyMaskedGather(*II);
+ case Intrinsic::masked_scatter:
+ return simplifyMaskedScatter(*II);
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
+ return replaceInstUsesWith(*II, SkippedBarrier);
+ break;
+ case Intrinsic::powi:
+ if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+ // 0 and 1 are handled in instsimplify
+ // powi(x, -1) -> 1/x
+ if (Power->isMinusOne())
return BinaryOperator::CreateFDivFMF(ConstantFP::get(CI.getType(), 1.0),
II->getArgOperand(0), II);
- // powi(x, 2) -> x*x
- if (Power->equalsInt(2))
+ // powi(x, 2) -> x*x
+ if (Power->equalsInt(2))
return BinaryOperator::CreateFMulFMF(II->getArgOperand(0),
II->getArgOperand(0), II);
- }
- break;
-
- case Intrinsic::cttz:
- case Intrinsic::ctlz:
- if (auto *I = foldCttzCtlz(*II, *this))
- return I;
- break;
-
- case Intrinsic::ctpop:
- if (auto *I = foldCtpop(*II, *this))
- return I;
- break;
-
- case Intrinsic::fshl:
- case Intrinsic::fshr: {
- Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
- Type *Ty = II->getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- Constant *ShAmtC;
+ }
+ break;
+
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ if (auto *I = foldCttzCtlz(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::ctpop:
+ if (auto *I = foldCtpop(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::fshl:
+ case Intrinsic::fshr: {
+ Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
+ Type *Ty = II->getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ Constant *ShAmtC;
if (match(II->getArgOperand(2), m_ImmConstant(ShAmtC)) &&
!ShAmtC->containsConstantExpression()) {
- // Canonicalize a shift amount constant operand to modulo the bit-width.
- Constant *WidthC = ConstantInt::get(Ty, BitWidth);
- Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC);
- if (ModuloC != ShAmtC)
- return replaceOperand(*II, 2, ModuloC);
-
- assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) ==
- ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) &&
- "Shift amount expected to be modulo bitwidth");
-
- // Canonicalize funnel shift right by constant to funnel shift left. This
- // is not entirely arbitrary. For historical reasons, the backend may
- // recognize rotate left patterns but miss rotate right patterns.
- if (IID == Intrinsic::fshr) {
- // fshr X, Y, C --> fshl X, Y, (BitWidth - C)
- Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC);
- Module *Mod = II->getModule();
- Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty);
- return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC });
- }
- assert(IID == Intrinsic::fshl &&
- "All funnel shifts by simple constants should go left");
-
- // fshl(X, 0, C) --> shl X, C
- // fshl(X, undef, C) --> shl X, C
- if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef()))
- return BinaryOperator::CreateShl(Op0, ShAmtC);
-
- // fshl(0, X, C) --> lshr X, (BW-C)
- // fshl(undef, X, C) --> lshr X, (BW-C)
- if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef()))
- return BinaryOperator::CreateLShr(Op1,
- ConstantExpr::getSub(WidthC, ShAmtC));
-
- // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form)
- if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) {
- Module *Mod = II->getModule();
- Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty);
- return CallInst::Create(Bswap, { Op0 });
- }
- }
-
- // Left or right might be masked.
- if (SimplifyDemandedInstructionBits(*II))
- return &CI;
-
- // The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
- // so only the low bits of the shift amount are demanded if the bitwidth is
- // a power-of-2.
- if (!isPowerOf2_32(BitWidth))
- break;
- APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth));
- KnownBits Op2Known(BitWidth);
- if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known))
- return &CI;
- break;
- }
- case Intrinsic::uadd_with_overflow:
- case Intrinsic::sadd_with_overflow: {
- if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
- return I;
-
- // Given 2 constant operands whose sum does not overflow:
- // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
- // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
- Value *X;
- const APInt *C0, *C1;
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
- bool IsSigned = IID == Intrinsic::sadd_with_overflow;
- bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0)))
- : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0)));
- if (HasNWAdd && match(Arg1, m_APInt(C1))) {
- bool Overflow;
- APInt NewC =
- IsSigned ? C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow);
- if (!Overflow)
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(
- IID, X, ConstantInt::get(Arg1->getType(), NewC)));
- }
- break;
- }
-
- case Intrinsic::umul_with_overflow:
- case Intrinsic::smul_with_overflow:
- case Intrinsic::usub_with_overflow:
- if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
- return I;
- break;
-
- case Intrinsic::ssub_with_overflow: {
- if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
- return I;
-
- Constant *C;
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
- // Given a constant C that is not the minimum signed value
- // for an integer of a given bit width:
- //
- // ssubo X, C -> saddo X, -C
- if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) {
- Value *NegVal = ConstantExpr::getNeg(C);
- // Build a saddo call that is equivalent to the discovered
- // ssubo call.
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow,
- Arg0, NegVal));
- }
-
- break;
- }
-
- case Intrinsic::uadd_sat:
- case Intrinsic::sadd_sat:
- case Intrinsic::usub_sat:
- case Intrinsic::ssub_sat: {
- SaturatingInst *SI = cast<SaturatingInst>(II);
- Type *Ty = SI->getType();
- Value *Arg0 = SI->getLHS();
- Value *Arg1 = SI->getRHS();
-
- // Make use of known overflow information.
- OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(),
- Arg0, Arg1, SI);
- switch (OR) {
- case OverflowResult::MayOverflow:
- break;
- case OverflowResult::NeverOverflows:
- if (SI->isSigned())
- return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1);
- else
- return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1);
- case OverflowResult::AlwaysOverflowsLow: {
- unsigned BitWidth = Ty->getScalarSizeInBits();
- APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned());
- return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min));
- }
- case OverflowResult::AlwaysOverflowsHigh: {
- unsigned BitWidth = Ty->getScalarSizeInBits();
- APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned());
- return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max));
- }
- }
-
- // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
- Constant *C;
- if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
- C->isNotMinSignedValue()) {
- Value *NegVal = ConstantExpr::getNeg(C);
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(
- Intrinsic::sadd_sat, Arg0, NegVal));
- }
-
- // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2))
- // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2))
- // if Val and Val2 have the same sign
- if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) {
- Value *X;
- const APInt *Val, *Val2;
- APInt NewVal;
- bool IsUnsigned =
- IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat;
- if (Other->getIntrinsicID() == IID &&
- match(Arg1, m_APInt(Val)) &&
- match(Other->getArgOperand(0), m_Value(X)) &&
- match(Other->getArgOperand(1), m_APInt(Val2))) {
- if (IsUnsigned)
- NewVal = Val->uadd_sat(*Val2);
- else if (Val->isNonNegative() == Val2->isNonNegative()) {
- bool Overflow;
- NewVal = Val->sadd_ov(*Val2, Overflow);
- if (Overflow) {
- // Both adds together may add more than SignedMaxValue
- // without saturating the final result.
- break;
- }
- } else {
- // Cannot fold saturated addition with different signs.
- break;
- }
-
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(
- IID, X, ConstantInt::get(II->getType(), NewVal)));
- }
- }
- break;
- }
-
- case Intrinsic::minnum:
- case Intrinsic::maxnum:
- case Intrinsic::minimum:
- case Intrinsic::maximum: {
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
- Value *X, *Y;
- if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
- (Arg0->hasOneUse() || Arg1->hasOneUse())) {
- // If both operands are negated, invert the call and negate the result:
- // min(-X, -Y) --> -(max(X, Y))
- // max(-X, -Y) --> -(min(X, Y))
- Intrinsic::ID NewIID;
- switch (IID) {
- case Intrinsic::maxnum:
- NewIID = Intrinsic::minnum;
- break;
- case Intrinsic::minnum:
- NewIID = Intrinsic::maxnum;
- break;
- case Intrinsic::maximum:
- NewIID = Intrinsic::minimum;
- break;
- case Intrinsic::minimum:
- NewIID = Intrinsic::maximum;
- break;
- default:
- llvm_unreachable("unexpected intrinsic ID");
- }
- Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
- Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall);
- FNeg->copyIRFlags(II);
- return FNeg;
- }
-
- // m(m(X, C2), C1) -> m(X, C)
- const APFloat *C1, *C2;
- if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
- if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
- ((match(M->getArgOperand(0), m_Value(X)) &&
- match(M->getArgOperand(1), m_APFloat(C2))) ||
- (match(M->getArgOperand(1), m_Value(X)) &&
- match(M->getArgOperand(0), m_APFloat(C2))))) {
- APFloat Res(0.0);
- switch (IID) {
- case Intrinsic::maxnum:
- Res = maxnum(*C1, *C2);
- break;
- case Intrinsic::minnum:
- Res = minnum(*C1, *C2);
- break;
- case Intrinsic::maximum:
- Res = maximum(*C1, *C2);
- break;
- case Intrinsic::minimum:
- Res = minimum(*C1, *C2);
- break;
- default:
- llvm_unreachable("unexpected intrinsic ID");
- }
- Instruction *NewCall = Builder.CreateBinaryIntrinsic(
- IID, X, ConstantFP::get(Arg0->getType(), Res), II);
- // TODO: Conservatively intersecting FMF. If Res == C2, the transform
- // was a simplification (so Arg0 and its original flags could
- // propagate?)
- NewCall->andIRFlags(M);
- return replaceInstUsesWith(*II, NewCall);
- }
- }
-
- Value *ExtSrc0;
- Value *ExtSrc1;
-
- // minnum (fpext x), (fpext y) -> minnum x, y
- // maxnum (fpext x), (fpext y) -> maxnum x, y
- if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc0)))) &&
- match(II->getArgOperand(1), m_OneUse(m_FPExt(m_Value(ExtSrc1)))) &&
- ExtSrc0->getType() == ExtSrc1->getType()) {
- Function *F = Intrinsic::getDeclaration(
- II->getModule(), II->getIntrinsicID(), {ExtSrc0->getType()});
- CallInst *NewCall = Builder.CreateCall(F, { ExtSrc0, ExtSrc1 });
- NewCall->copyFastMathFlags(II);
- NewCall->takeName(II);
- return new FPExtInst(NewCall, II->getType());
- }
-
- break;
- }
- case Intrinsic::fmuladd: {
- // Canonicalize fast fmuladd to the separate fmul + fadd.
- if (II->isFast()) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(II->getFastMathFlags());
- Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
- II->getArgOperand(1));
- Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
- Add->takeName(II);
- return replaceInstUsesWith(*II, Add);
- }
-
- // Try to simplify the underlying FMul.
- if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
- II->getFastMathFlags(),
- SQ.getWithInstruction(II))) {
- auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
- FAdd->copyFastMathFlags(II);
- return FAdd;
- }
-
- LLVM_FALLTHROUGH;
- }
- case Intrinsic::fma: {
- // fma fneg(x), fneg(y), z -> fma x, y, z
- Value *Src0 = II->getArgOperand(0);
- Value *Src1 = II->getArgOperand(1);
- Value *X, *Y;
- if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
- replaceOperand(*II, 0, X);
- replaceOperand(*II, 1, Y);
- return II;
- }
-
- // fma fabs(x), fabs(x), z -> fma x, x, z
- if (match(Src0, m_FAbs(m_Value(X))) &&
- match(Src1, m_FAbs(m_Specific(X)))) {
- replaceOperand(*II, 0, X);
- replaceOperand(*II, 1, X);
- return II;
- }
-
- // Try to simplify the underlying FMul. We can only apply simplifications
- // that do not require rounding.
- if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
- II->getFastMathFlags(),
- SQ.getWithInstruction(II))) {
- auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
- FAdd->copyFastMathFlags(II);
- return FAdd;
- }
-
- // fma x, y, 0 -> fmul x, y
- // This is always valid for -0.0, but requires nsz for +0.0 as
- // -0.0 + 0.0 = 0.0, which would not be the same as the fmul on its own.
- if (match(II->getArgOperand(2), m_NegZeroFP()) ||
- (match(II->getArgOperand(2), m_PosZeroFP()) &&
- II->getFastMathFlags().noSignedZeros()))
- return BinaryOperator::CreateFMulFMF(Src0, Src1, II);
-
- break;
- }
- case Intrinsic::copysign: {
+ // Canonicalize a shift amount constant operand to modulo the bit-width.
+ Constant *WidthC = ConstantInt::get(Ty, BitWidth);
+ Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC);
+ if (ModuloC != ShAmtC)
+ return replaceOperand(*II, 2, ModuloC);
+
+ assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) ==
+ ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) &&
+ "Shift amount expected to be modulo bitwidth");
+
+ // Canonicalize funnel shift right by constant to funnel shift left. This
+ // is not entirely arbitrary. For historical reasons, the backend may
+ // recognize rotate left patterns but miss rotate right patterns.
+ if (IID == Intrinsic::fshr) {
+ // fshr X, Y, C --> fshl X, Y, (BitWidth - C)
+ Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC);
+ Module *Mod = II->getModule();
+ Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty);
+ return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC });
+ }
+ assert(IID == Intrinsic::fshl &&
+ "All funnel shifts by simple constants should go left");
+
+ // fshl(X, 0, C) --> shl X, C
+ // fshl(X, undef, C) --> shl X, C
+ if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef()))
+ return BinaryOperator::CreateShl(Op0, ShAmtC);
+
+ // fshl(0, X, C) --> lshr X, (BW-C)
+ // fshl(undef, X, C) --> lshr X, (BW-C)
+ if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef()))
+ return BinaryOperator::CreateLShr(Op1,
+ ConstantExpr::getSub(WidthC, ShAmtC));
+
+ // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form)
+ if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) {
+ Module *Mod = II->getModule();
+ Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty);
+ return CallInst::Create(Bswap, { Op0 });
+ }
+ }
+
+ // Left or right might be masked.
+ if (SimplifyDemandedInstructionBits(*II))
+ return &CI;
+
+ // The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
+ // so only the low bits of the shift amount are demanded if the bitwidth is
+ // a power-of-2.
+ if (!isPowerOf2_32(BitWidth))
+ break;
+ APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth));
+ KnownBits Op2Known(BitWidth);
+ if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known))
+ return &CI;
+ break;
+ }
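A small check of the funnel-shift canonicalization above for the rotate case (fshl/fshr with both inputs equal): rotr(x, c) == rotl(x, 32 - c) for 0 < c < 32, mirroring "fshr X, Y, C --> fshl X, Y, (BitWidth - C)". The rotate helpers are written out by hand for the sketch.

    #include <cassert>
    #include <cstdint>

    static uint32_t rotl32(uint32_t x, unsigned c) { return (x << c) | (x >> (32 - c)); }
    static uint32_t rotr32(uint32_t x, unsigned c) { return (x >> c) | (x << (32 - c)); }

    int main() {
      uint32_t x = 0xCAFEBABEu;
      for (unsigned c = 1; c < 32; ++c)          // c = 0 and c = 32 avoided (shift UB)
        assert(rotr32(x, c) == rotl32(x, 32 - c));
    }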
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow: {
+ if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
+ return I;
+
+ // Given 2 constant operands whose sum does not overflow:
+ // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
+ // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
+ Value *X;
+ const APInt *C0, *C1;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ bool IsSigned = IID == Intrinsic::sadd_with_overflow;
+ bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0)))
+ : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0)));
+ if (HasNWAdd && match(Arg1, m_APInt(C1))) {
+ bool Overflow;
+ APInt NewC =
+ IsSigned ? C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow);
+ if (!Overflow)
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(
+ IID, X, ConstantInt::get(Arg1->getType(), NewC)));
+ }
+ break;
+ }
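
The uaddo/saddo constant-merging fold above is only legal when the two constants can themselves be added without overflow, which is exactly what the APInt uadd_ov/sadd_ov calls check. A small standalone sketch of the same test for the unsigned case, using the GCC/Clang __builtin_add_overflow builtin purely as an illustration (canMergeUAddConstants is a made-up helper name, not part of InstCombine):

    #include <cstdint>
    #include <cstdio>

    // uaddo (X +nuw C0), C1 --> uaddo X, (C0 + C1) is only sound when
    // C0 + C1 itself does not wrap; otherwise the merged constant is wrong.
    static bool canMergeUAddConstants(uint32_t C0, uint32_t C1, uint32_t &Merged) {
      return !__builtin_add_overflow(C0, C1, &Merged);
    }

    int main() {
      uint32_t Merged = 0;
      std::printf("%d\n", canMergeUAddConstants(10u, 20u, Merged));           // 1: fold to uaddo X, 30
      std::printf("%d\n", canMergeUAddConstants(0xFFFFFFF0u, 0x20u, Merged)); // 0: keep both adds
      return 0;
    }
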
+
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
+ return I;
+ break;
+
+ case Intrinsic::ssub_with_overflow: {
+ if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
+ return I;
+
+ Constant *C;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ // Given a constant C that is not the minimum signed value
+ // for an integer of a given bit width:
+ //
+ // ssubo X, C -> saddo X, -C
+ if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) {
+ Value *NegVal = ConstantExpr::getNeg(C);
+ // Build a saddo call that is equivalent to the discovered
+ // ssubo call.
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow,
+ Arg0, NegVal));
+ }
+
+ break;
+ }
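
The ssubo rewrite relies on X - C and X + (-C) overflowing in exactly the same cases, which holds whenever C is not the minimum signed value (so that -C is representable). A hedged standalone sketch of that equivalence using GCC/Clang overflow builtins, not the InstCombine code path itself:

    #include <cassert>
    #include <climits>

    int main() {
      int X = 40, C = 7, Sub = 0, Add = 0;
      // ssubo X, C --> saddo X, -C is valid because both overflow in exactly
      // the same cases, provided C != INT_MIN so that -C is representable.
      assert(C != INT_MIN);
      bool SubOv = __builtin_sub_overflow(X, C, &Sub);
      bool AddOv = __builtin_add_overflow(X, -C, &Add);
      assert(SubOv == AddOv && Sub == Add);
      return 0;
    }
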
+
+ case Intrinsic::uadd_sat:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::ssub_sat: {
+ SaturatingInst *SI = cast<SaturatingInst>(II);
+ Type *Ty = SI->getType();
+ Value *Arg0 = SI->getLHS();
+ Value *Arg1 = SI->getRHS();
+
+ // Make use of known overflow information.
+ OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(),
+ Arg0, Arg1, SI);
+ switch (OR) {
+ case OverflowResult::MayOverflow:
+ break;
+ case OverflowResult::NeverOverflows:
+ if (SI->isSigned())
+ return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1);
+ else
+ return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1);
+ case OverflowResult::AlwaysOverflowsLow: {
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned());
+ return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min));
+ }
+ case OverflowResult::AlwaysOverflowsHigh: {
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned());
+ return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max));
+ }
+ }
+
+ // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
+ Constant *C;
+ if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
+ C->isNotMinSignedValue()) {
+ Value *NegVal = ConstantExpr::getNeg(C);
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(
+ Intrinsic::sadd_sat, Arg0, NegVal));
+ }
+
+ // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2))
+ // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2))
+ // if Val and Val2 have the same sign
+ if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) {
+ Value *X;
+ const APInt *Val, *Val2;
+ APInt NewVal;
+ bool IsUnsigned =
+ IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat;
+ if (Other->getIntrinsicID() == IID &&
+ match(Arg1, m_APInt(Val)) &&
+ match(Other->getArgOperand(0), m_Value(X)) &&
+ match(Other->getArgOperand(1), m_APInt(Val2))) {
+ if (IsUnsigned)
+ NewVal = Val->uadd_sat(*Val2);
+ else if (Val->isNonNegative() == Val2->isNonNegative()) {
+ bool Overflow;
+ NewVal = Val->sadd_ov(*Val2, Overflow);
+ if (Overflow) {
+ // Both adds together may add more than SignedMaxValue
+ // without saturating the final result.
+ break;
+ }
+ } else {
+ // Cannot fold saturated addition with different signs.
+ break;
+ }
+
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(
+ IID, X, ConstantInt::get(II->getType(), NewVal)));
+ }
+ }
+ break;
+ }
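
For the nested-saturation fold, the unsigned case can always merge the two constants with a saturating add, because clamping after each step or once at the end gives the same result. A small reference sketch over 8-bit values that checks this identity exhaustively in X (uadd_sat8 is a helper written here for illustration only):

    #include <cassert>
    #include <cstdint>

    // Reference 8-bit unsigned saturating add (clamps to 255 on overflow).
    static uint8_t uadd_sat8(uint8_t A, uint8_t B) {
      unsigned Sum = unsigned(A) + unsigned(B);
      return Sum > 0xFFu ? uint8_t(0xFF) : uint8_t(Sum);
    }

    int main() {
      // uadd.sat(uadd.sat(X, C2), C1) == uadd.sat(X, uadd.sat(C1, C2)):
      // saturating early or late produces the same clamped result.
      for (unsigned X = 0; X <= 0xFFu; ++X)
        for (unsigned C = 0; C <= 0xFFu; C += 17)
          assert(uadd_sat8(uadd_sat8(uint8_t(X), uint8_t(C)), 100) ==
                 uadd_sat8(uint8_t(X), uadd_sat8(100, uint8_t(C))));
      return 0;
    }
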
+
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum:
+ case Intrinsic::minimum:
+ case Intrinsic::maximum: {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ Value *X, *Y;
+ if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
+ (Arg0->hasOneUse() || Arg1->hasOneUse())) {
+ // If both operands are negated, invert the call and negate the result:
+ // min(-X, -Y) --> -(max(X, Y))
+ // max(-X, -Y) --> -(min(X, Y))
+ Intrinsic::ID NewIID;
+ switch (IID) {
+ case Intrinsic::maxnum:
+ NewIID = Intrinsic::minnum;
+ break;
+ case Intrinsic::minnum:
+ NewIID = Intrinsic::maxnum;
+ break;
+ case Intrinsic::maximum:
+ NewIID = Intrinsic::minimum;
+ break;
+ case Intrinsic::minimum:
+ NewIID = Intrinsic::maximum;
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
+ Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
+ Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall);
+ FNeg->copyIRFlags(II);
+ return FNeg;
+ }
+
+ // m(m(X, C2), C1) -> m(X, C)
+ const APFloat *C1, *C2;
+ if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
+ if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
+ ((match(M->getArgOperand(0), m_Value(X)) &&
+ match(M->getArgOperand(1), m_APFloat(C2))) ||
+ (match(M->getArgOperand(1), m_Value(X)) &&
+ match(M->getArgOperand(0), m_APFloat(C2))))) {
+ APFloat Res(0.0);
+ switch (IID) {
+ case Intrinsic::maxnum:
+ Res = maxnum(*C1, *C2);
+ break;
+ case Intrinsic::minnum:
+ Res = minnum(*C1, *C2);
+ break;
+ case Intrinsic::maximum:
+ Res = maximum(*C1, *C2);
+ break;
+ case Intrinsic::minimum:
+ Res = minimum(*C1, *C2);
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
+ Instruction *NewCall = Builder.CreateBinaryIntrinsic(
+ IID, X, ConstantFP::get(Arg0->getType(), Res), II);
+ // TODO: Conservatively intersecting FMF. If Res == C2, the transform
+ // was a simplification (so Arg0 and its original flags could
+ // propagate?)
+ NewCall->andIRFlags(M);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+ }
+
+ Value *ExtSrc0;
+ Value *ExtSrc1;
+
+ // minnum (fpext x), (fpext y) -> minnum x, y
+ // maxnum (fpext x), (fpext y) -> maxnum x, y
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc0)))) &&
+ match(II->getArgOperand(1), m_OneUse(m_FPExt(m_Value(ExtSrc1)))) &&
+ ExtSrc0->getType() == ExtSrc1->getType()) {
+ Function *F = Intrinsic::getDeclaration(
+ II->getModule(), II->getIntrinsicID(), {ExtSrc0->getType()});
+ CallInst *NewCall = Builder.CreateCall(F, { ExtSrc0, ExtSrc1 });
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return new FPExtInst(NewCall, II->getType());
+ }
+
+ break;
+ }
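
The negation fold above uses the identity max(-X, -Y) == -(min(X, Y)) and its dual. The C library's fmin/fmax have essentially the minnum/maxnum quiet-NaN behaviour, so a quick standalone check of the identity on ordinary values looks like this (illustrative only):

    #include <cassert>
    #include <cmath>

    int main() {
      double X = 1.5, Y = -2.25;
      // maxnum(-X, -Y) --> -(minnum(X, Y)) and minnum(-X, -Y) --> -(maxnum(X, Y)).
      assert(std::fmax(-X, -Y) == -std::fmin(X, Y));
      assert(std::fmin(-X, -Y) == -std::fmax(X, Y));
      return 0;
    }
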
+ case Intrinsic::fmuladd: {
+ // Canonicalize fast fmuladd to the separate fmul + fadd.
+ if (II->isFast()) {
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(II->getFastMathFlags());
+ Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
+ II->getArgOperand(1));
+ Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
+ Add->takeName(II);
+ return replaceInstUsesWith(*II, Add);
+ }
+
+ // Try to simplify the underlying FMul.
+ if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
+ II->getFastMathFlags(),
+ SQ.getWithInstruction(II))) {
+ auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
+ FAdd->copyFastMathFlags(II);
+ return FAdd;
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::fma: {
+ // fma fneg(x), fneg(y), z -> fma x, y, z
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ Value *X, *Y;
+ if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
+ replaceOperand(*II, 0, X);
+ replaceOperand(*II, 1, Y);
+ return II;
+ }
+
+ // fma fabs(x), fabs(x), z -> fma x, x, z
+ if (match(Src0, m_FAbs(m_Value(X))) &&
+ match(Src1, m_FAbs(m_Specific(X)))) {
+ replaceOperand(*II, 0, X);
+ replaceOperand(*II, 1, X);
+ return II;
+ }
+
+ // Try to simplify the underlying FMul. We can only apply simplifications
+ // that do not require rounding.
+ if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
+ II->getFastMathFlags(),
+ SQ.getWithInstruction(II))) {
+ auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
+ FAdd->copyFastMathFlags(II);
+ return FAdd;
+ }
+
+ // fma x, y, 0 -> fmul x, y
+ // This is always valid for -0.0, but requires nsz for +0.0 as
+ // -0.0 + 0.0 = 0.0, which would not be the same as the fmul on its own.
+ if (match(II->getArgOperand(2), m_NegZeroFP()) ||
+ (match(II->getArgOperand(2), m_PosZeroFP()) &&
+ II->getFastMathFlags().noSignedZeros()))
+ return BinaryOperator::CreateFMulFMF(Src0, Src1, II);
+
+ break;
+ }
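
The fma-with-zero fold depends on signed-zero behaviour: adding -0.0 never changes the sign of the product, while adding +0.0 can turn a -0.0 product into +0.0, which is why the +0.0 form needs the nsz flag. A standalone sketch demonstrating that corner case with std::fma (an illustration of the rule, not the transform itself):

    #include <cassert>
    #include <cmath>

    int main() {
      double X = -0.25, Y = 0.0;   // the product X * Y is -0.0
      // fma x, y, -0.0 --> fmul x, y: adding -0.0 never flips the product's sign.
      assert(std::signbit(std::fma(X, Y, -0.0)) == std::signbit(X * Y));
      // fma x, y, +0.0 differs without nsz: -0.0 + 0.0 rounds to +0.0, so the
      // sign bit of the result no longer matches the plain multiply.
      assert(std::signbit(std::fma(X, Y, +0.0)) != std::signbit(X * Y));
      return 0;
    }
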
+ case Intrinsic::copysign: {
Value *Mag = II->getArgOperand(0), *Sign = II->getArgOperand(1);
if (SignBitMustBeZero(Sign, &TLI)) {
- // If we know that the sign argument is positive, reduce to FABS:
+ // If we know that the sign argument is positive, reduce to FABS:
// copysign Mag, +Sign --> fabs Mag
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, Mag, II);
- return replaceInstUsesWith(*II, Fabs);
- }
- // TODO: There should be a ValueTracking sibling like SignBitMustBeOne.
- const APFloat *C;
+ return replaceInstUsesWith(*II, Fabs);
+ }
+ // TODO: There should be a ValueTracking sibling like SignBitMustBeOne.
+ const APFloat *C;
if (match(Sign, m_APFloat(C)) && C->isNegative()) {
- // If we know that the sign argument is negative, reduce to FNABS:
+ // If we know that the sign argument is negative, reduce to FNABS:
// copysign Mag, -Sign --> fneg (fabs Mag)
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, Mag, II);
- return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II));
- }
-
- // Propagate sign argument through nested calls:
+ return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II));
+ }
+
+ // Propagate sign argument through nested calls:
// copysign Mag, (copysign ?, X) --> copysign Mag, X
Value *X;
if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X))))
return replaceOperand(*II, 1, X);
-
+
// Peek through changes of magnitude's sign-bit. This call rewrites those:
// copysign (fabs X), Sign --> copysign X, Sign
// copysign (fneg X), Sign --> copysign X, Sign
if (match(Mag, m_FAbs(m_Value(X))) || match(Mag, m_FNeg(m_Value(X))))
return replaceOperand(*II, 0, X);
- break;
- }
- case Intrinsic::fabs: {
+ break;
+ }
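
The copysign reductions use the fact that a known-positive sign operand yields fabs, a known-negative one yields the negated fabs, and only the sign of a nested copysign matters. A quick standalone check with the C library equivalents of the intrinsics:

    #include <cassert>
    #include <cmath>

    int main() {
      double Mag = -3.5;
      // copysign Mag, +Sign --> fabs Mag
      assert(std::copysign(Mag, 2.0) == std::fabs(Mag));
      // copysign Mag, -Sign --> fneg (fabs Mag)
      assert(std::copysign(Mag, -2.0) == -std::fabs(Mag));
      // copysign Mag, (copysign ?, X) --> copysign Mag, X
      assert(std::copysign(Mag, std::copysign(7.0, -1.0)) == std::copysign(Mag, -1.0));
      return 0;
    }
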
+ case Intrinsic::fabs: {
Value *Cond, *TVal, *FVal;
- if (match(II->getArgOperand(0),
+ if (match(II->getArgOperand(0),
m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))) {
// fabs (select Cond, TrueC, FalseC) --> select Cond, AbsT, AbsF
if (isa<Constant>(TVal) && isa<Constant>(FVal)) {
@@ -1305,276 +1305,276 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// fabs (select Cond, TVal, -TVal) --> fabs TVal
if (match(FVal, m_FNeg(m_Specific(TVal))))
return replaceOperand(*II, 0, TVal);
- }
-
- LLVM_FALLTHROUGH;
- }
- case Intrinsic::ceil:
- case Intrinsic::floor:
- case Intrinsic::round:
- case Intrinsic::roundeven:
- case Intrinsic::nearbyint:
- case Intrinsic::rint:
- case Intrinsic::trunc: {
- Value *ExtSrc;
- if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
- // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
- Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II);
- return new FPExtInst(NarrowII, II->getType());
- }
- break;
- }
- case Intrinsic::cos:
- case Intrinsic::amdgcn_cos: {
- Value *X;
- Value *Src = II->getArgOperand(0);
- if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) {
- // cos(-x) -> cos(x)
- // cos(fabs(x)) -> cos(x)
- return replaceOperand(*II, 0, X);
- }
- break;
- }
- case Intrinsic::sin: {
- Value *X;
- if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) {
- // sin(-x) --> -sin(x)
- Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II);
- Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin);
- FNeg->copyFastMathFlags(II);
- return FNeg;
- }
- break;
- }
-
- case Intrinsic::arm_neon_vtbl1:
- case Intrinsic::aarch64_neon_tbl1:
- if (Value *V = simplifyNeonTbl1(*II, Builder))
- return replaceInstUsesWith(*II, V);
- break;
-
- case Intrinsic::arm_neon_vmulls:
- case Intrinsic::arm_neon_vmullu:
- case Intrinsic::aarch64_neon_smull:
- case Intrinsic::aarch64_neon_umull: {
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
-
- // Handle mul by zero first:
- if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
- return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
- }
-
- // Check for constant LHS & RHS - in this case we just simplify.
- bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
- IID == Intrinsic::aarch64_neon_umull);
- VectorType *NewVT = cast<VectorType>(II->getType());
- if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
- if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
- CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
- CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
-
- return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
- }
-
- // Couldn't simplify - canonicalize constant to the RHS.
- std::swap(Arg0, Arg1);
- }
-
- // Handle mul by one:
- if (Constant *CV1 = dyn_cast<Constant>(Arg1))
- if (ConstantInt *Splat =
- dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
- if (Splat->isOne())
- return CastInst::CreateIntegerCast(Arg0, II->getType(),
- /*isSigned=*/!Zext);
-
- break;
- }
- case Intrinsic::arm_neon_aesd:
- case Intrinsic::arm_neon_aese:
- case Intrinsic::aarch64_crypto_aesd:
- case Intrinsic::aarch64_crypto_aese: {
- Value *DataArg = II->getArgOperand(0);
- Value *KeyArg = II->getArgOperand(1);
-
- // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
- Value *Data, *Key;
- if (match(KeyArg, m_ZeroInt()) &&
- match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
- replaceOperand(*II, 0, Data);
- replaceOperand(*II, 1, Key);
- return II;
- }
- break;
- }
- case Intrinsic::hexagon_V6_vandvrt:
- case Intrinsic::hexagon_V6_vandvrt_128B: {
- // Simplify Q -> V -> Q conversion.
- if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
- Intrinsic::ID ID0 = Op0->getIntrinsicID();
- if (ID0 != Intrinsic::hexagon_V6_vandqrt &&
- ID0 != Intrinsic::hexagon_V6_vandqrt_128B)
- break;
- Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1);
- uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue();
- uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue();
- // Check if every byte has common bits in Bytes and Mask.
- uint64_t C = Bytes1 & Mask1;
- if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000))
- return replaceInstUsesWith(*II, Op0->getArgOperand(0));
- }
- break;
- }
- case Intrinsic::stackrestore: {
- // If the save is right next to the restore, remove the restore. This can
- // happen when variable allocas are DCE'd.
- if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
- if (SS->getIntrinsicID() == Intrinsic::stacksave) {
- // Skip over debug info.
- if (SS->getNextNonDebugInstruction() == II) {
- return eraseInstFromFunction(CI);
- }
- }
- }
-
- // Scan down this block to see if there is another stack restore in the
- // same block without an intervening call/alloca.
- BasicBlock::iterator BI(II);
- Instruction *TI = II->getParent()->getTerminator();
- bool CannotRemove = false;
- for (++BI; &*BI != TI; ++BI) {
- if (isa<AllocaInst>(BI)) {
- CannotRemove = true;
- break;
- }
- if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
- if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) {
- // If there is a stackrestore below this one, remove this one.
- if (II2->getIntrinsicID() == Intrinsic::stackrestore)
- return eraseInstFromFunction(CI);
-
- // Bail if we cross over an intrinsic with side effects, such as
- // llvm.stacksave, or llvm.read_register.
- if (II2->mayHaveSideEffects()) {
- CannotRemove = true;
- break;
- }
- } else {
- // If we found a non-intrinsic call, we can't remove the stack
- // restore.
- CannotRemove = true;
- break;
- }
- }
- }
-
- // If the stack restore is in a return, resume, or unwind block and if there
- // are no allocas or calls between the restore and the return, nuke the
- // restore.
- if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
- return eraseInstFromFunction(CI);
- break;
- }
- case Intrinsic::lifetime_end:
-    // ASan needs to poison memory to detect invalid accesses, which are
-    // possible even for an empty lifetime range.
- if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
- II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) ||
- II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
- break;
-
- if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) {
- return I.getIntrinsicID() == Intrinsic::lifetime_start;
- }))
- return nullptr;
- break;
- case Intrinsic::assume: {
- Value *IIOperand = II->getArgOperand(0);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::trunc: {
+ Value *ExtSrc;
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
+ // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
+ Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II);
+ return new FPExtInst(NarrowII, II->getType());
+ }
+ break;
+ }
+ case Intrinsic::cos:
+ case Intrinsic::amdgcn_cos: {
+ Value *X;
+ Value *Src = II->getArgOperand(0);
+ if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) {
+ // cos(-x) -> cos(x)
+ // cos(fabs(x)) -> cos(x)
+ return replaceOperand(*II, 0, X);
+ }
+ break;
+ }
+ case Intrinsic::sin: {
+ Value *X;
+ if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) {
+ // sin(-x) --> -sin(x)
+ Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II);
+ Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin);
+ FNeg->copyFastMathFlags(II);
+ return FNeg;
+ }
+ break;
+ }
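
The trigonometric folds rely on cosine being even and sine being odd. A tiny standalone sketch that prints both sides of each rewrite; typical libm implementations preserve these symmetries bit-for-bit, though that is an implementation detail rather than a guarantee:

    #include <cmath>
    #include <cstdio>

    int main() {
      double X = 0.75;
      // cos(-x) --> cos(x), cos(fabs(x)) --> cos(x), sin(-x) --> -sin(x)
      std::printf("%a %a\n", std::cos(-X), std::cos(X));
      std::printf("%a %a\n", std::cos(std::fabs(-X)), std::cos(X));
      std::printf("%a %a\n", std::sin(-X), -std::sin(X));
      return 0;
    }
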
+
+ case Intrinsic::arm_neon_vtbl1:
+ case Intrinsic::aarch64_neon_tbl1:
+ if (Value *V = simplifyNeonTbl1(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::arm_neon_vmulls:
+ case Intrinsic::arm_neon_vmullu:
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_umull: {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+
+ // Handle mul by zero first:
+ if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+ return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
+ }
+
+ // Check for constant LHS & RHS - in this case we just simplify.
+ bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
+ IID == Intrinsic::aarch64_neon_umull);
+ VectorType *NewVT = cast<VectorType>(II->getType());
+ if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+ if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+ CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
+ CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
+
+ return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
+ }
+
+ // Couldn't simplify - canonicalize constant to the RHS.
+ std::swap(Arg0, Arg1);
+ }
+
+ // Handle mul by one:
+ if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+ if (ConstantInt *Splat =
+ dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+ if (Splat->isOne())
+ return CastInst::CreateIntegerCast(Arg0, II->getType(),
+ /*isSigned=*/!Zext);
+
+ break;
+ }
+ case Intrinsic::arm_neon_aesd:
+ case Intrinsic::arm_neon_aese:
+ case Intrinsic::aarch64_crypto_aesd:
+ case Intrinsic::aarch64_crypto_aese: {
+ Value *DataArg = II->getArgOperand(0);
+ Value *KeyArg = II->getArgOperand(1);
+
+ // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+ Value *Data, *Key;
+ if (match(KeyArg, m_ZeroInt()) &&
+ match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+ replaceOperand(*II, 0, Data);
+ replaceOperand(*II, 1, Key);
+ return II;
+ }
+ break;
+ }
+ case Intrinsic::hexagon_V6_vandvrt:
+ case Intrinsic::hexagon_V6_vandvrt_128B: {
+ // Simplify Q -> V -> Q conversion.
+ if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+ Intrinsic::ID ID0 = Op0->getIntrinsicID();
+ if (ID0 != Intrinsic::hexagon_V6_vandqrt &&
+ ID0 != Intrinsic::hexagon_V6_vandqrt_128B)
+ break;
+ Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1);
+ uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue();
+ uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue();
+ // Check if every byte has common bits in Bytes and Mask.
+ uint64_t C = Bytes1 & Mask1;
+ if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000))
+ return replaceInstUsesWith(*II, Op0->getArgOperand(0));
+ }
+ break;
+ }
+ case Intrinsic::stackrestore: {
+ // If the save is right next to the restore, remove the restore. This can
+ // happen when variable allocas are DCE'd.
+ if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+ if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+ // Skip over debug info.
+ if (SS->getNextNonDebugInstruction() == II) {
+ return eraseInstFromFunction(CI);
+ }
+ }
+ }
+
+ // Scan down this block to see if there is another stack restore in the
+ // same block without an intervening call/alloca.
+ BasicBlock::iterator BI(II);
+ Instruction *TI = II->getParent()->getTerminator();
+ bool CannotRemove = false;
+ for (++BI; &*BI != TI; ++BI) {
+ if (isa<AllocaInst>(BI)) {
+ CannotRemove = true;
+ break;
+ }
+ if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
+ if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) {
+ // If there is a stackrestore below this one, remove this one.
+ if (II2->getIntrinsicID() == Intrinsic::stackrestore)
+ return eraseInstFromFunction(CI);
+
+ // Bail if we cross over an intrinsic with side effects, such as
+ // llvm.stacksave, or llvm.read_register.
+ if (II2->mayHaveSideEffects()) {
+ CannotRemove = true;
+ break;
+ }
+ } else {
+ // If we found a non-intrinsic call, we can't remove the stack
+ // restore.
+ CannotRemove = true;
+ break;
+ }
+ }
+ }
+
+ // If the stack restore is in a return, resume, or unwind block and if there
+ // are no allocas or calls between the restore and the return, nuke the
+ // restore.
+ if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
+ return eraseInstFromFunction(CI);
+ break;
+ }
+ case Intrinsic::lifetime_end:
+    // ASan needs to poison memory to detect invalid accesses, which are
+    // possible even for an empty lifetime range.
+ if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
+ II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) ||
+ II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
+ break;
+
+ if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) {
+ return I.getIntrinsicID() == Intrinsic::lifetime_start;
+ }))
+ return nullptr;
+ break;
+ case Intrinsic::assume: {
+ Value *IIOperand = II->getArgOperand(0);
SmallVector<OperandBundleDef, 4> OpBundles;
II->getOperandBundlesAsDefs(OpBundles);
bool HasOpBundles = !OpBundles.empty();
- // Remove an assume if it is followed by an identical assume.
- // TODO: Do we need this? Unless there are conflicting assumptions, the
-    // computeKnownBits(IIOperand) below eliminates redundant assumes.
- Instruction *Next = II->getNextNonDebugInstruction();
+ // Remove an assume if it is followed by an identical assume.
+ // TODO: Do we need this? Unless there are conflicting assumptions, the
+    // computeKnownBits(IIOperand) below eliminates redundant assumes.
+ Instruction *Next = II->getNextNonDebugInstruction();
if (HasOpBundles &&
match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))) &&
!cast<IntrinsicInst>(Next)->hasOperandBundles())
- return eraseInstFromFunction(CI);
-
- // Canonicalize assume(a && b) -> assume(a); assume(b);
- // Note: New assumption intrinsics created here are registered by
- // the InstCombineIRInserter object.
- FunctionType *AssumeIntrinsicTy = II->getFunctionType();
- Value *AssumeIntrinsic = II->getCalledOperand();
- Value *A, *B;
+ return eraseInstFromFunction(CI);
+
+ // Canonicalize assume(a && b) -> assume(a); assume(b);
+ // Note: New assumption intrinsics created here are registered by
+ // the InstCombineIRInserter object.
+ FunctionType *AssumeIntrinsicTy = II->getFunctionType();
+ Value *AssumeIntrinsic = II->getCalledOperand();
+ Value *A, *B;
if (match(IIOperand, m_LogicalAnd(m_Value(A), m_Value(B)))) {
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, OpBundles,
II->getName());
- Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName());
- return eraseInstFromFunction(*II);
- }
- // assume(!(a || b)) -> assume(!a); assume(!b);
+ Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName());
+ return eraseInstFromFunction(*II);
+ }
+ // assume(!(a || b)) -> assume(!a); assume(!b);
if (match(IIOperand, m_Not(m_LogicalOr(m_Value(A), m_Value(B))))) {
- Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
+ Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
Builder.CreateNot(A), OpBundles, II->getName());
- Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
- Builder.CreateNot(B), II->getName());
- return eraseInstFromFunction(*II);
- }
-
- // assume( (load addr) != null ) -> add 'nonnull' metadata to load
- // (if assume is valid at the load)
- CmpInst::Predicate Pred;
- Instruction *LHS;
- if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
- Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
- LHS->getType()->isPointerTy() &&
- isValidAssumeForContext(II, LHS, &DT)) {
- MDNode *MD = MDNode::get(II->getContext(), None);
- LHS->setMetadata(LLVMContext::MD_nonnull, MD);
+ Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
+ Builder.CreateNot(B), II->getName());
+ return eraseInstFromFunction(*II);
+ }
+
+ // assume( (load addr) != null ) -> add 'nonnull' metadata to load
+ // (if assume is valid at the load)
+ CmpInst::Predicate Pred;
+ Instruction *LHS;
+ if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
+ Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
+ LHS->getType()->isPointerTy() &&
+ isValidAssumeForContext(II, LHS, &DT)) {
+ MDNode *MD = MDNode::get(II->getContext(), None);
+ LHS->setMetadata(LLVMContext::MD_nonnull, MD);
if (!HasOpBundles)
return eraseInstFromFunction(*II);
-
- // TODO: apply nonnull return attributes to calls and invokes
- // TODO: apply range metadata for range check patterns?
- }
-
- // If there is a dominating assume with the same condition as this one,
- // then this one is redundant, and should be removed.
- KnownBits Known(1);
- computeKnownBits(IIOperand, Known, 0, II);
- if (Known.isAllOnes() && isAssumeWithEmptyBundle(*II))
- return eraseInstFromFunction(*II);
-
- // Update the cache of affected values for this assumption (we might be
- // here because we just simplified the condition).
- AC.updateAffectedValues(II);
- break;
- }
+
+ // TODO: apply nonnull return attributes to calls and invokes
+ // TODO: apply range metadata for range check patterns?
+ }
+
+ // If there is a dominating assume with the same condition as this one,
+ // then this one is redundant, and should be removed.
+ KnownBits Known(1);
+ computeKnownBits(IIOperand, Known, 0, II);
+ if (Known.isAllOnes() && isAssumeWithEmptyBundle(*II))
+ return eraseInstFromFunction(*II);
+
+ // Update the cache of affected values for this assumption (we might be
+ // here because we just simplified the condition).
+ AC.updateAffectedValues(II);
+ break;
+ }
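
The assume(a && b) canonicalization simply splits one assumption into two so that later queries can use each condition independently. At the source level the closest analogue is Clang's __builtin_assume, so the rewrite corresponds roughly to the following sketch (function names are illustrative, and this requires Clang):

    // Before canonicalization: one combined assumption.
    void before(int a, int b) {
      __builtin_assume(a > 0 && b > 0);
    }

    // After canonicalization: two independent assumptions, which are easier
    // for analyses such as computeKnownBits to consume.
    void after(int a, int b) {
      __builtin_assume(a > 0);
      __builtin_assume(b > 0);
    }

    int main() {
      before(1, 2);
      after(1, 2);
      return 0;
    }
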
case Intrinsic::experimental_gc_statepoint: {
GCStatepointInst &GCSP = *cast<GCStatepointInst>(II);
SmallPtrSet<Value *, 32> LiveGcValues;
for (const GCRelocateInst *Reloc : GCSP.getGCRelocates()) {
GCRelocateInst &GCR = *const_cast<GCRelocateInst *>(Reloc);
-
+
// Remove the relocation if unused.
if (GCR.use_empty()) {
eraseInstFromFunction(GCR);
continue;
}
-
+
Value *DerivedPtr = GCR.getDerivedPtr();
Value *BasePtr = GCR.getBasePtr();
-
+
// Undef is undef, even after relocation.
if (isa<UndefValue>(DerivedPtr) || isa<UndefValue>(BasePtr)) {
replaceInstUsesWith(GCR, UndefValue::get(GCR.getType()));
eraseInstFromFunction(GCR);
continue;
}
-
+
if (auto *PT = dyn_cast<PointerType>(GCR.getType())) {
// The relocation of null will be null for most any collector.
// TODO: provide a hook for this in GCStrategy. There might be some
@@ -1585,7 +1585,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
eraseInstFromFunction(GCR);
continue;
}
-
+
// isKnownNonNull -> nonnull attribute
if (!GCR.hasRetAttr(Attribute::NonNull) &&
isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) {
@@ -1594,18 +1594,18 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Worklist.pushUsersToWorkList(GCR);
}
}
-
+
// If we have two copies of the same pointer in the statepoint argument
// list, canonicalize to one. This may let us common gc.relocates.
if (GCR.getBasePtr() == GCR.getDerivedPtr() &&
GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) {
auto *OpIntTy = GCR.getOperand(2)->getType();
GCR.setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex()));
- }
-
+ }
+
// TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
// Canonicalize on the type from the uses to the defs
-
+
// TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
LiveGcValues.insert(BasePtr);
LiveGcValues.insert(DerivedPtr);
@@ -1649,40 +1649,40 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
else
return InvokeInst::CreateWithReplacedBundle(cast<InvokeInst>(II),
NewBundle);
- break;
- }
- case Intrinsic::experimental_guard: {
- // Is this guard followed by another guard? We scan forward over a small
- // fixed window of instructions to handle common cases with conditions
- // computed between guards.
- Instruction *NextInst = II->getNextNonDebugInstruction();
- for (unsigned i = 0; i < GuardWideningWindow; i++) {
- // Note: Using context-free form to avoid compile time blow up
- if (!isSafeToSpeculativelyExecute(NextInst))
- break;
- NextInst = NextInst->getNextNonDebugInstruction();
- }
- Value *NextCond = nullptr;
- if (match(NextInst,
- m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
- Value *CurrCond = II->getArgOperand(0);
-
-      // Remove a guard that is immediately preceded by an identical guard.
- // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
- if (CurrCond != NextCond) {
- Instruction *MoveI = II->getNextNonDebugInstruction();
- while (MoveI != NextInst) {
- auto *Temp = MoveI;
- MoveI = MoveI->getNextNonDebugInstruction();
- Temp->moveBefore(II);
- }
- replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond));
- }
- eraseInstFromFunction(*NextInst);
- return II;
- }
- break;
- }
+ break;
+ }
+ case Intrinsic::experimental_guard: {
+ // Is this guard followed by another guard? We scan forward over a small
+ // fixed window of instructions to handle common cases with conditions
+ // computed between guards.
+ Instruction *NextInst = II->getNextNonDebugInstruction();
+ for (unsigned i = 0; i < GuardWideningWindow; i++) {
+ // Note: Using context-free form to avoid compile time blow up
+ if (!isSafeToSpeculativelyExecute(NextInst))
+ break;
+ NextInst = NextInst->getNextNonDebugInstruction();
+ }
+ Value *NextCond = nullptr;
+ if (match(NextInst,
+ m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
+ Value *CurrCond = II->getArgOperand(0);
+
+      // Remove a guard that is immediately preceded by an identical guard.
+ // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+ if (CurrCond != NextCond) {
+ Instruction *MoveI = II->getNextNonDebugInstruction();
+ while (MoveI != NextInst) {
+ auto *Temp = MoveI;
+ MoveI = MoveI->getNextNonDebugInstruction();
+ Temp->moveBefore(II);
+ }
+ replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond));
+ }
+ eraseInstFromFunction(*NextInst);
+ return II;
+ }
+ break;
+ }
case Intrinsic::experimental_vector_insert: {
Value *Vec = II->getArgOperand(0);
Value *SubVec = II->getArgOperand(1);
@@ -1738,7 +1738,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
return eraseInstFromFunction(CI);
}
break;
- }
+ }
case Intrinsic::experimental_vector_extract: {
Value *Vec = II->getArgOperand(0);
Value *Idx = II->getArgOperand(1);
@@ -1786,804 +1786,804 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
break;
}
}
- return visitCallBase(*II);
-}
-
-// Fence instruction simplification
+ return visitCallBase(*II);
+}
+
+// Fence instruction simplification
Instruction *InstCombinerImpl::visitFenceInst(FenceInst &FI) {
- // Remove identical consecutive fences.
- Instruction *Next = FI.getNextNonDebugInstruction();
- if (auto *NFI = dyn_cast<FenceInst>(Next))
- if (FI.isIdenticalTo(NFI))
- return eraseInstFromFunction(FI);
- return nullptr;
-}
-
-// InvokeInst simplification
+ // Remove identical consecutive fences.
+ Instruction *Next = FI.getNextNonDebugInstruction();
+ if (auto *NFI = dyn_cast<FenceInst>(Next))
+ if (FI.isIdenticalTo(NFI))
+ return eraseInstFromFunction(FI);
+ return nullptr;
+}
+
+// InvokeInst simplification
Instruction *InstCombinerImpl::visitInvokeInst(InvokeInst &II) {
- return visitCallBase(II);
-}
-
-// CallBrInst simplification
+ return visitCallBase(II);
+}
+
+// CallBrInst simplification
Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) {
- return visitCallBase(CBI);
-}
-
-/// If this cast does not affect the value passed through the varargs area, we
-/// can eliminate the use of the cast.
-static bool isSafeToEliminateVarargsCast(const CallBase &Call,
- const DataLayout &DL,
- const CastInst *const CI,
- const int ix) {
- if (!CI->isLosslessCast())
- return false;
-
- // If this is a GC intrinsic, avoid munging types. We need types for
- // statepoint reconstruction in SelectionDAG.
- // TODO: This is probably something which should be expanded to all
- // intrinsics since the entire point of intrinsics is that
- // they are understandable by the optimizer.
- if (isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
- isa<GCResultInst>(Call))
- return false;
-
- // The size of ByVal or InAlloca arguments is derived from the type, so we
- // can't change to a type with a different size. If the size were
- // passed explicitly we could avoid this check.
- if (!Call.isPassPointeeByValueArgument(ix))
- return true;
-
- Type* SrcTy =
- cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
- Type *DstTy = Call.isByValArgument(ix)
- ? Call.getParamByValType(ix)
- : cast<PointerType>(CI->getType())->getElementType();
- if (!SrcTy->isSized() || !DstTy->isSized())
- return false;
- if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
- return false;
- return true;
-}
-
+ return visitCallBase(CBI);
+}
+
+/// If this cast does not affect the value passed through the varargs area, we
+/// can eliminate the use of the cast.
+static bool isSafeToEliminateVarargsCast(const CallBase &Call,
+ const DataLayout &DL,
+ const CastInst *const CI,
+ const int ix) {
+ if (!CI->isLosslessCast())
+ return false;
+
+ // If this is a GC intrinsic, avoid munging types. We need types for
+ // statepoint reconstruction in SelectionDAG.
+ // TODO: This is probably something which should be expanded to all
+ // intrinsics since the entire point of intrinsics is that
+ // they are understandable by the optimizer.
+ if (isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+ isa<GCResultInst>(Call))
+ return false;
+
+ // The size of ByVal or InAlloca arguments is derived from the type, so we
+ // can't change to a type with a different size. If the size were
+ // passed explicitly we could avoid this check.
+ if (!Call.isPassPointeeByValueArgument(ix))
+ return true;
+
+ Type* SrcTy =
+ cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
+ Type *DstTy = Call.isByValArgument(ix)
+ ? Call.getParamByValType(ix)
+ : cast<PointerType>(CI->getType())->getElementType();
+ if (!SrcTy->isSized() || !DstTy->isSized())
+ return false;
+ if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
+ return false;
+ return true;
+}
+
Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) {
- if (!CI->getCalledFunction()) return nullptr;
-
- auto InstCombineRAUW = [this](Instruction *From, Value *With) {
- replaceInstUsesWith(*From, With);
- };
- auto InstCombineErase = [this](Instruction *I) {
- eraseInstFromFunction(*I);
- };
- LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
- InstCombineErase);
- if (Value *With = Simplifier.optimizeCall(CI, Builder)) {
- ++NumSimplified;
- return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
- }
-
- return nullptr;
-}
-
-static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
- // Strip off at most one level of pointer casts, looking for an alloca. This
- // is good enough in practice and simpler than handling any number of casts.
- Value *Underlying = TrampMem->stripPointerCasts();
- if (Underlying != TrampMem &&
- (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
- return nullptr;
- if (!isa<AllocaInst>(Underlying))
- return nullptr;
-
- IntrinsicInst *InitTrampoline = nullptr;
- for (User *U : TrampMem->users()) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
- if (!II)
- return nullptr;
- if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
- if (InitTrampoline)
- // More than one init_trampoline writes to this value. Give up.
- return nullptr;
- InitTrampoline = II;
- continue;
- }
- if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
- // Allow any number of calls to adjust.trampoline.
- continue;
- return nullptr;
- }
-
- // No call to init.trampoline found.
- if (!InitTrampoline)
- return nullptr;
-
- // Check that the alloca is being used in the expected way.
- if (InitTrampoline->getOperand(0) != TrampMem)
- return nullptr;
-
- return InitTrampoline;
-}
-
-static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
- Value *TrampMem) {
-  // Visit all the previous instructions in the basic block, and try to find an
- // init.trampoline which has a direct path to the adjust.trampoline.
- for (BasicBlock::iterator I = AdjustTramp->getIterator(),
- E = AdjustTramp->getParent()->begin();
- I != E;) {
- Instruction *Inst = &*--I;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
- II->getOperand(0) == TrampMem)
- return II;
- if (Inst->mayWriteToMemory())
- return nullptr;
- }
- return nullptr;
-}
-
-// Given a call to llvm.adjust.trampoline, find and return the corresponding
-// call to llvm.init.trampoline if the call to the trampoline can be optimized
-// to a direct call to a function. Otherwise return NULL.
-static IntrinsicInst *findInitTrampoline(Value *Callee) {
- Callee = Callee->stripPointerCasts();
- IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
- if (!AdjustTramp ||
- AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
- return nullptr;
-
- Value *TrampMem = AdjustTramp->getOperand(0);
-
- if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
- return IT;
- if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
- return IT;
- return nullptr;
-}
-
-static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
- unsigned NumArgs = Call.getNumArgOperands();
- ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
- ConstantInt *Op1C =
- (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
- // Bail out if the allocation size is zero (or an invalid alignment of zero
- // with aligned_alloc).
- if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
- return;
-
- if (isMallocLikeFn(&Call, TLI) && Op0C) {
- if (isOpNewLikeFn(&Call, TLI))
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableBytes(
- Call.getContext(), Op0C->getZExtValue()));
- else
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op0C->getZExtValue()));
- } else if (isAlignedAllocLikeFn(&Call, TLI) && Op1C) {
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- // Add alignment attribute if alignment is a power of two constant.
- if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment)) {
- uint64_t AlignmentVal = Op0C->getZExtValue();
- if (llvm::isPowerOf2_64(AlignmentVal))
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithAlignment(Call.getContext(),
- Align(AlignmentVal)));
- }
- } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
- bool Overflow;
- const APInt &N = Op0C->getValue();
- APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
- if (!Overflow)
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Size.getZExtValue()));
- } else if (isStrdupLikeFn(&Call, TLI)) {
- uint64_t Len = GetStringLength(Call.getOperand(0));
- if (Len) {
- // strdup
- if (NumArgs == 1)
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Len));
- // strndup
- else if (NumArgs == 2 && Op1C)
- Call.addAttribute(
- AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
- }
- }
-}
-
-/// Improvements for call, callbr and invoke instructions.
+ if (!CI->getCalledFunction()) return nullptr;
+
+ auto InstCombineRAUW = [this](Instruction *From, Value *With) {
+ replaceInstUsesWith(*From, With);
+ };
+ auto InstCombineErase = [this](Instruction *I) {
+ eraseInstFromFunction(*I);
+ };
+ LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
+ InstCombineErase);
+ if (Value *With = Simplifier.optimizeCall(CI, Builder)) {
+ ++NumSimplified;
+ return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
+ }
+
+ return nullptr;
+}
+
+static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
+ // Strip off at most one level of pointer casts, looking for an alloca. This
+ // is good enough in practice and simpler than handling any number of casts.
+ Value *Underlying = TrampMem->stripPointerCasts();
+ if (Underlying != TrampMem &&
+ (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
+ return nullptr;
+ if (!isa<AllocaInst>(Underlying))
+ return nullptr;
+
+ IntrinsicInst *InitTrampoline = nullptr;
+ for (User *U : TrampMem->users()) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ if (!II)
+ return nullptr;
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
+ if (InitTrampoline)
+ // More than one init_trampoline writes to this value. Give up.
+ return nullptr;
+ InitTrampoline = II;
+ continue;
+ }
+ if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
+ // Allow any number of calls to adjust.trampoline.
+ continue;
+ return nullptr;
+ }
+
+ // No call to init.trampoline found.
+ if (!InitTrampoline)
+ return nullptr;
+
+ // Check that the alloca is being used in the expected way.
+ if (InitTrampoline->getOperand(0) != TrampMem)
+ return nullptr;
+
+ return InitTrampoline;
+}
+
+static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
+ Value *TrampMem) {
+  // Visit all the previous instructions in the basic block, and try to find an
+ // init.trampoline which has a direct path to the adjust.trampoline.
+ for (BasicBlock::iterator I = AdjustTramp->getIterator(),
+ E = AdjustTramp->getParent()->begin();
+ I != E;) {
+ Instruction *Inst = &*--I;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
+ II->getOperand(0) == TrampMem)
+ return II;
+ if (Inst->mayWriteToMemory())
+ return nullptr;
+ }
+ return nullptr;
+}
+
+// Given a call to llvm.adjust.trampoline, find and return the corresponding
+// call to llvm.init.trampoline if the call to the trampoline can be optimized
+// to a direct call to a function. Otherwise return NULL.
+static IntrinsicInst *findInitTrampoline(Value *Callee) {
+ Callee = Callee->stripPointerCasts();
+ IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
+ if (!AdjustTramp ||
+ AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
+ return nullptr;
+
+ Value *TrampMem = AdjustTramp->getOperand(0);
+
+ if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
+ return IT;
+ if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
+ return IT;
+ return nullptr;
+}
+
+static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
+ unsigned NumArgs = Call.getNumArgOperands();
+ ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
+ ConstantInt *Op1C =
+ (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
+ // Bail out if the allocation size is zero (or an invalid alignment of zero
+ // with aligned_alloc).
+ if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
+ return;
+
+ if (isMallocLikeFn(&Call, TLI) && Op0C) {
+ if (isOpNewLikeFn(&Call, TLI))
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableBytes(
+ Call.getContext(), Op0C->getZExtValue()));
+ else
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op0C->getZExtValue()));
+ } else if (isAlignedAllocLikeFn(&Call, TLI) && Op1C) {
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op1C->getZExtValue()));
+ // Add alignment attribute if alignment is a power of two constant.
+ if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment)) {
+ uint64_t AlignmentVal = Op0C->getZExtValue();
+ if (llvm::isPowerOf2_64(AlignmentVal))
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Call.getContext(),
+ Align(AlignmentVal)));
+ }
+ } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op1C->getZExtValue()));
+ } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
+ bool Overflow;
+ const APInt &N = Op0C->getValue();
+ APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
+ if (!Overflow)
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Size.getZExtValue()));
+ } else if (isStrdupLikeFn(&Call, TLI)) {
+ uint64_t Len = GetStringLength(Call.getOperand(0));
+ if (Len) {
+ // strdup
+ if (NumArgs == 1)
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Len));
+ // strndup
+ else if (NumArgs == 2 && Op1C)
+ Call.addAttribute(
+ AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
+ }
+ }
+}
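
annotateAnyAllocSite attaches dereferenceable_or_null(N * Size) to calloc-like calls only when the element count and size multiply without overflow, mirroring the umul_ov check above. A standalone sketch of that size computation using the GCC/Clang __builtin_mul_overflow builtin (totalAllocSize is an illustrative helper, not an LLVM function):

    #include <cstdint>
    #include <cstdio>

    // A calloc-like result is dereferenceable_or_null for N * Size bytes only
    // if that product does not overflow.
    static bool totalAllocSize(uint64_t N, uint64_t Size, uint64_t &Bytes) {
      return !__builtin_mul_overflow(N, Size, &Bytes);
    }

    int main() {
      uint64_t Bytes = 0;
      if (totalAllocSize(16, 32, Bytes))
        std::printf("annotate dereferenceable_or_null(%llu)\n",
                    (unsigned long long)Bytes);
      if (!totalAllocSize(UINT64_MAX, 8, Bytes))
        std::printf("overflow: no annotation\n");
      return 0;
    }
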
+
+/// Improvements for call, callbr and invoke instructions.
Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
- if (isAllocationFn(&Call, &TLI))
- annotateAnyAllocSite(Call, &TLI);
-
- bool Changed = false;
-
- // Mark any parameters that are known to be non-null with the nonnull
- // attribute. This is helpful for inlining calls to functions with null
- // checks on their arguments.
- SmallVector<unsigned, 4> ArgNos;
- unsigned ArgNo = 0;
-
- for (Value *V : Call.args()) {
- if (V->getType()->isPointerTy() &&
- !Call.paramHasAttr(ArgNo, Attribute::NonNull) &&
- isKnownNonZero(V, DL, 0, &AC, &Call, &DT))
- ArgNos.push_back(ArgNo);
- ArgNo++;
- }
-
- assert(ArgNo == Call.arg_size() && "sanity check");
-
- if (!ArgNos.empty()) {
- AttributeList AS = Call.getAttributes();
- LLVMContext &Ctx = Call.getContext();
- AS = AS.addParamAttribute(Ctx, ArgNos,
- Attribute::get(Ctx, Attribute::NonNull));
- Call.setAttributes(AS);
- Changed = true;
- }
-
- // If the callee is a pointer to a function, attempt to move any casts to the
- // arguments of the call/callbr/invoke.
- Value *Callee = Call.getCalledOperand();
- if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
- return nullptr;
-
- if (Function *CalleeF = dyn_cast<Function>(Callee)) {
- // Remove the convergent attr on calls when the callee is not convergent.
- if (Call.isConvergent() && !CalleeF->isConvergent() &&
- !CalleeF->isIntrinsic()) {
- LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call
- << "\n");
- Call.setNotConvergent();
- return &Call;
- }
-
- // If the call and callee calling conventions don't match, this call must
- // be unreachable, as the call is undefined.
- if (CalleeF->getCallingConv() != Call.getCallingConv() &&
- // Only do this for calls to a function with a body. A prototype may
- // not actually end up matching the implementation's calling conv for a
- // variety of reasons (e.g. it may be written in assembly).
- !CalleeF->isDeclaration()) {
- Instruction *OldCall = &Call;
- CreateNonTerminatorUnreachable(OldCall);
+ if (isAllocationFn(&Call, &TLI))
+ annotateAnyAllocSite(Call, &TLI);
+
+ bool Changed = false;
+
+ // Mark any parameters that are known to be non-null with the nonnull
+ // attribute. This is helpful for inlining calls to functions with null
+ // checks on their arguments.
+ SmallVector<unsigned, 4> ArgNos;
+ unsigned ArgNo = 0;
+
+ for (Value *V : Call.args()) {
+ if (V->getType()->isPointerTy() &&
+ !Call.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ isKnownNonZero(V, DL, 0, &AC, &Call, &DT))
+ ArgNos.push_back(ArgNo);
+ ArgNo++;
+ }
+
+ assert(ArgNo == Call.arg_size() && "sanity check");
+
+ if (!ArgNos.empty()) {
+ AttributeList AS = Call.getAttributes();
+ LLVMContext &Ctx = Call.getContext();
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
+ Call.setAttributes(AS);
+ Changed = true;
+ }
+
+ // If the callee is a pointer to a function, attempt to move any casts to the
+ // arguments of the call/callbr/invoke.
+ Value *Callee = Call.getCalledOperand();
+ if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
+ return nullptr;
+
+ if (Function *CalleeF = dyn_cast<Function>(Callee)) {
+ // Remove the convergent attr on calls when the callee is not convergent.
+ if (Call.isConvergent() && !CalleeF->isConvergent() &&
+ !CalleeF->isIntrinsic()) {
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call
+ << "\n");
+ Call.setNotConvergent();
+ return &Call;
+ }
+
+ // If the call and callee calling conventions don't match, this call must
+ // be unreachable, as the call is undefined.
+ if (CalleeF->getCallingConv() != Call.getCallingConv() &&
+ // Only do this for calls to a function with a body. A prototype may
+ // not actually end up matching the implementation's calling conv for a
+ // variety of reasons (e.g. it may be written in assembly).
+ !CalleeF->isDeclaration()) {
+ Instruction *OldCall = &Call;
+ CreateNonTerminatorUnreachable(OldCall);
// If OldCall does not return void then replaceInstUsesWith undef.
- // This allows ValueHandlers and custom metadata to adjust itself.
- if (!OldCall->getType()->isVoidTy())
- replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
- if (isa<CallInst>(OldCall))
- return eraseInstFromFunction(*OldCall);
-
-      // We cannot remove an invoke or a callbr because it would change the
-      // CFG; just change the callee to a null pointer.
- cast<CallBase>(OldCall)->setCalledFunction(
- CalleeF->getFunctionType(),
- Constant::getNullValue(CalleeF->getType()));
- return nullptr;
- }
- }
-
- if ((isa<ConstantPointerNull>(Callee) &&
- !NullPointerIsDefined(Call.getFunction())) ||
- isa<UndefValue>(Callee)) {
+ // This allows ValueHandlers and custom metadata to adjust itself.
+ if (!OldCall->getType()->isVoidTy())
+ replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
+ if (isa<CallInst>(OldCall))
+ return eraseInstFromFunction(*OldCall);
+
+      // We cannot remove an invoke or a callbr because it would change the
+      // CFG; just change the callee to a null pointer.
+ cast<CallBase>(OldCall)->setCalledFunction(
+ CalleeF->getFunctionType(),
+ Constant::getNullValue(CalleeF->getType()));
+ return nullptr;
+ }
+ }
+
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(Call.getFunction())) ||
+ isa<UndefValue>(Callee)) {
// If Call does not return void then replaceInstUsesWith undef.
- // This allows ValueHandlers and custom metadata to adjust itself.
- if (!Call.getType()->isVoidTy())
- replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
-
- if (Call.isTerminator()) {
- // Can't remove an invoke or callbr because we cannot change the CFG.
- return nullptr;
- }
-
- // This instruction is not reachable, just remove it.
- CreateNonTerminatorUnreachable(&Call);
- return eraseInstFromFunction(Call);
- }
-
- if (IntrinsicInst *II = findInitTrampoline(Callee))
- return transformCallThroughTrampoline(Call, *II);
-
- PointerType *PTy = cast<PointerType>(Callee->getType());
- FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
- if (FTy->isVarArg()) {
- int ix = FTy->getNumParams();
- // See if we can optimize any arguments passed through the varargs area of
- // the call.
- for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
- I != E; ++I, ++ix) {
- CastInst *CI = dyn_cast<CastInst>(*I);
- if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
- replaceUse(*I, CI->getOperand(0));
-
- // Update the byval type to match the argument type.
- if (Call.isByValArgument(ix)) {
- Call.removeParamAttr(ix, Attribute::ByVal);
- Call.addParamAttr(
- ix, Attribute::getWithByValType(
- Call.getContext(),
- CI->getOperand(0)->getType()->getPointerElementType()));
- }
- Changed = true;
- }
- }
- }
-
- if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
- // Inline asm calls cannot throw - mark them 'nounwind'.
- Call.setDoesNotThrow();
- Changed = true;
- }
-
-  // Try to optimize the call if possible; we require DataLayout for most of
- // this. None of these calls are seen as possibly dead so go ahead and
- // delete the instruction now.
- if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
- Instruction *I = tryOptimizeCall(CI);
- // If we changed something return the result, etc. Otherwise let
- // the fallthrough check.
- if (I) return eraseInstFromFunction(*I);
- }
-
- if (!Call.use_empty() && !Call.isMustTailCall())
- if (Value *ReturnedArg = Call.getReturnedArgOperand()) {
- Type *CallTy = Call.getType();
- Type *RetArgTy = ReturnedArg->getType();
- if (RetArgTy->canLosslesslyBitCastTo(CallTy))
- return replaceInstUsesWith(
- Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
- }
-
- if (isAllocLikeFn(&Call, &TLI))
- return visitAllocSite(Call);
-
- return Changed ? &Call : nullptr;
-}
-
-/// If the callee is a constexpr cast of a function, attempt to move the cast to
-/// the arguments of the call/callbr/invoke.
+ // This allows ValueHandlers and custom metadata to adjust itself.
+ if (!Call.getType()->isVoidTy())
+ replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
+
+ if (Call.isTerminator()) {
+ // Can't remove an invoke or callbr because we cannot change the CFG.
+ return nullptr;
+ }
+
+ // This instruction is not reachable, just remove it.
+ CreateNonTerminatorUnreachable(&Call);
+ return eraseInstFromFunction(Call);
+ }
+
+ if (IntrinsicInst *II = findInitTrampoline(Callee))
+ return transformCallThroughTrampoline(Call, *II);
+
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ if (FTy->isVarArg()) {
+ int ix = FTy->getNumParams();
+ // See if we can optimize any arguments passed through the varargs area of
+ // the call.
+ for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
+ I != E; ++I, ++ix) {
+ CastInst *CI = dyn_cast<CastInst>(*I);
+ if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
+ replaceUse(*I, CI->getOperand(0));
+
+ // Update the byval type to match the argument type.
+ if (Call.isByValArgument(ix)) {
+ Call.removeParamAttr(ix, Attribute::ByVal);
+ Call.addParamAttr(
+ ix, Attribute::getWithByValType(
+ Call.getContext(),
+ CI->getOperand(0)->getType()->getPointerElementType()));
+ }
+ Changed = true;
+ }
+ }
+ }
+
+ if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
+ // Inline asm calls cannot throw - mark them 'nounwind'.
+ Call.setDoesNotThrow();
+ Changed = true;
+ }
+
+  // Try to optimize the call if possible; we require DataLayout for most of
+ // this. None of these calls are seen as possibly dead so go ahead and
+ // delete the instruction now.
+ if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
+ Instruction *I = tryOptimizeCall(CI);
+ // If we changed something return the result, etc. Otherwise let
+ // the fallthrough check.
+ if (I) return eraseInstFromFunction(*I);
+ }
+
+ if (!Call.use_empty() && !Call.isMustTailCall())
+ if (Value *ReturnedArg = Call.getReturnedArgOperand()) {
+ Type *CallTy = Call.getType();
+ Type *RetArgTy = ReturnedArg->getType();
+ if (RetArgTy->canLosslesslyBitCastTo(CallTy))
+ return replaceInstUsesWith(
+ Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
+ }
+
+ if (isAllocLikeFn(&Call, &TLI))
+ return visitAllocSite(Call);
+
+ return Changed ? &Call : nullptr;
+}
+
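A note on the returned-argument fold a few lines up (getReturnedArgOperand / canLosslesslyBitCastTo): when an argument carries the 'returned' attribute, every use of the call's result can be rewired to a (possibly bit-cast) copy of that argument while the call itself is kept for its side effects. A minimal plain-C++ analogy of that rewrite, with a made-up callee identityWithLogging standing in for such a function (no LLVM API is used here):

#include <cassert>
#include <cstdio>

// Stand-in for a callee whose first argument is documented to be returned
// unchanged (the analogue of an LLVM parameter with the 'returned' attribute).
static int identityWithLogging(int v) {
  std::printf("side effect, v = %d\n", v);
  return v;
}

int main() {
  int x = 42;
  int before = identityWithLogging(x);   // user reads the call's result
  identityWithLogging(x);                // after the fold: call kept for effects,
  int after = x;                         // the use reads the argument directly
  assert(before == after);
  return 0;
}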
+/// If the callee is a constexpr cast of a function, attempt to move the cast to
+/// the arguments of the call/callbr/invoke.
bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
- auto *Callee =
- dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
- if (!Callee)
- return false;
-
- // If this is a call to a thunk function, don't remove the cast. Thunks are
- // used to transparently forward all incoming parameters and outgoing return
- // values, so it's important to leave the cast in place.
- if (Callee->hasFnAttribute("thunk"))
- return false;
-
- // If this is a musttail call, the callee's prototype must match the caller's
- // prototype with the exception of pointee types. The code below doesn't
- // implement that, so we can't do this transform.
- // TODO: Do the transform if it only requires adding pointer casts.
- if (Call.isMustTailCall())
- return false;
-
- Instruction *Caller = &Call;
- const AttributeList &CallerPAL = Call.getAttributes();
-
- // Okay, this is a cast from a function to a different type. Unless doing so
- // would cause a type conversion of one of our arguments, change this call to
- // be a direct call with arguments casted to the appropriate types.
- FunctionType *FT = Callee->getFunctionType();
- Type *OldRetTy = Caller->getType();
- Type *NewRetTy = FT->getReturnType();
-
- // Check to see if we are changing the return type...
- if (OldRetTy != NewRetTy) {
-
- if (NewRetTy->isStructTy())
- return false; // TODO: Handle multiple return values.
-
- if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
- if (Callee->isDeclaration())
- return false; // Cannot transform this return value.
-
- if (!Caller->use_empty() &&
- // void -> non-void is handled specially
- !NewRetTy->isVoidTy())
- return false; // Cannot transform this return value.
- }
-
- if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
- if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
- return false; // Attribute not compatible with transformed value.
- }
-
- // If the callbase is an invoke/callbr instruction, and the return value is
- // used by a PHI node in a successor, we cannot change the return type of
- // the call because there is no place to put the cast instruction (without
- // breaking the critical edge). Bail out in this case.
- if (!Caller->use_empty()) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
- for (User *U : II->users())
- if (PHINode *PN = dyn_cast<PHINode>(U))
- if (PN->getParent() == II->getNormalDest() ||
- PN->getParent() == II->getUnwindDest())
- return false;
- // FIXME: Be conservative for callbr to avoid a quadratic search.
- if (isa<CallBrInst>(Caller))
- return false;
- }
- }
-
- unsigned NumActualArgs = Call.arg_size();
- unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
-
- // Prevent us turning:
- // declare void @takes_i32_inalloca(i32* inalloca)
- // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
- //
- // into:
- // call void @takes_i32_inalloca(i32* null)
- //
- // Similarly, avoid folding away bitcasts of byval calls.
- if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
- Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) ||
- Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
- return false;
-
- auto AI = Call.arg_begin();
- for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
- Type *ParamTy = FT->getParamType(i);
- Type *ActTy = (*AI)->getType();
-
- if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
- return false; // Cannot transform this parameter value.
-
- if (AttrBuilder(CallerPAL.getParamAttributes(i))
- .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
- return false; // Attribute not compatible with transformed value.
-
- if (Call.isInAllocaArgument(i))
- return false; // Cannot transform to and from inalloca.
-
+ auto *Callee =
+ dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
+ if (!Callee)
+ return false;
+
+ // If this is a call to a thunk function, don't remove the cast. Thunks are
+ // used to transparently forward all incoming parameters and outgoing return
+ // values, so it's important to leave the cast in place.
+ if (Callee->hasFnAttribute("thunk"))
+ return false;
+
+ // If this is a musttail call, the callee's prototype must match the caller's
+ // prototype with the exception of pointee types. The code below doesn't
+ // implement that, so we can't do this transform.
+ // TODO: Do the transform if it only requires adding pointer casts.
+ if (Call.isMustTailCall())
+ return false;
+
+ Instruction *Caller = &Call;
+ const AttributeList &CallerPAL = Call.getAttributes();
+
+ // Okay, this is a cast from a function to a different type. Unless doing so
+ // would cause a type conversion of one of our arguments, change this call to
+ // be a direct call with arguments casted to the appropriate types.
+ FunctionType *FT = Callee->getFunctionType();
+ Type *OldRetTy = Caller->getType();
+ Type *NewRetTy = FT->getReturnType();
+
+ // Check to see if we are changing the return type...
+ if (OldRetTy != NewRetTy) {
+
+ if (NewRetTy->isStructTy())
+ return false; // TODO: Handle multiple return values.
+
+ if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
+ if (Callee->isDeclaration())
+ return false; // Cannot transform this return value.
+
+ if (!Caller->use_empty() &&
+ // void -> non-void is handled specially
+ !NewRetTy->isVoidTy())
+ return false; // Cannot transform this return value.
+ }
+
+ if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
+ return false; // Attribute not compatible with transformed value.
+ }
+
+ // If the callbase is an invoke/callbr instruction, and the return value is
+ // used by a PHI node in a successor, we cannot change the return type of
+ // the call because there is no place to put the cast instruction (without
+ // breaking the critical edge). Bail out in this case.
+ if (!Caller->use_empty()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
+ for (User *U : II->users())
+ if (PHINode *PN = dyn_cast<PHINode>(U))
+ if (PN->getParent() == II->getNormalDest() ||
+ PN->getParent() == II->getUnwindDest())
+ return false;
+ // FIXME: Be conservative for callbr to avoid a quadratic search.
+ if (isa<CallBrInst>(Caller))
+ return false;
+ }
+ }
+
+ unsigned NumActualArgs = Call.arg_size();
+ unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
+
+ // Prevent us turning:
+ // declare void @takes_i32_inalloca(i32* inalloca)
+ // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
+ //
+ // into:
+ // call void @takes_i32_inalloca(i32* null)
+ //
+ // Similarly, avoid folding away bitcasts of byval calls.
+ if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) ||
+ Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
+ return false;
+
+ auto AI = Call.arg_begin();
+ for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
+ Type *ParamTy = FT->getParamType(i);
+ Type *ActTy = (*AI)->getType();
+
+ if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
+ return false; // Cannot transform this parameter value.
+
+ if (AttrBuilder(CallerPAL.getParamAttributes(i))
+ .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
+ return false; // Attribute not compatible with transformed value.
+
+ if (Call.isInAllocaArgument(i))
+ return false; // Cannot transform to and from inalloca.
+
if (CallerPAL.hasParamAttribute(i, Attribute::SwiftError))
return false;
- // If the parameter is passed as a byval argument, then we have to have a
- // sized type and the sized type has to have the same size as the old type.
- if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
- PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
- if (!ParamPTy || !ParamPTy->getElementType()->isSized())
- return false;
-
- Type *CurElTy = Call.getParamByValType(i);
- if (DL.getTypeAllocSize(CurElTy) !=
- DL.getTypeAllocSize(ParamPTy->getElementType()))
- return false;
- }
- }
-
- if (Callee->isDeclaration()) {
- // Do not delete arguments unless we have a function body.
- if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
- return false;
-
- // If the callee is just a declaration, don't change the varargsness of the
- // call. We don't want to introduce a varargs call where one doesn't
- // already exist.
- PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
- if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
- return false;
-
- // If both the callee and the cast type are varargs, we still have to make
- // sure the number of fixed parameters are the same or we have the same
- // ABI issues as if we introduce a varargs call.
- if (FT->isVarArg() &&
- cast<FunctionType>(APTy->getElementType())->isVarArg() &&
- FT->getNumParams() !=
- cast<FunctionType>(APTy->getElementType())->getNumParams())
- return false;
- }
-
- if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
- !CallerPAL.isEmpty()) {
- // In this case we have more arguments than the new function type, but we
- // won't be dropping them. Check that these extra arguments have attributes
- // that are compatible with being a vararg call argument.
- unsigned SRetIdx;
- if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
- SRetIdx > FT->getNumParams())
- return false;
- }
-
- // Okay, we decided that this is a safe thing to do: go ahead and start
- // inserting cast instructions as necessary.
- SmallVector<Value *, 8> Args;
- SmallVector<AttributeSet, 8> ArgAttrs;
- Args.reserve(NumActualArgs);
- ArgAttrs.reserve(NumActualArgs);
-
- // Get any return attributes.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
-
- // If the return value is not being used, the type may not be compatible
- // with the existing attributes. Wipe out any problematic attributes.
- RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
-
- LLVMContext &Ctx = Call.getContext();
- AI = Call.arg_begin();
- for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
- Type *ParamTy = FT->getParamType(i);
-
- Value *NewArg = *AI;
- if ((*AI)->getType() != ParamTy)
- NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
- Args.push_back(NewArg);
-
- // Add any parameter attributes.
- if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
- AttrBuilder AB(CallerPAL.getParamAttributes(i));
- AB.addByValAttr(NewArg->getType()->getPointerElementType());
- ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
- } else
- ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
- }
-
- // If the function takes more arguments than the call was taking, add them
- // now.
- for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
- Args.push_back(Constant::getNullValue(FT->getParamType(i)));
- ArgAttrs.push_back(AttributeSet());
- }
-
- // If we are removing arguments to the function, emit an obnoxious warning.
- if (FT->getNumParams() < NumActualArgs) {
- // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
- if (FT->isVarArg()) {
- // Add all of the arguments in their promoted form to the arg list.
- for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
- Type *PTy = getPromotedType((*AI)->getType());
- Value *NewArg = *AI;
- if (PTy != (*AI)->getType()) {
- // Must promote to pass through va_arg area!
- Instruction::CastOps opcode =
- CastInst::getCastOpcode(*AI, false, PTy, false);
- NewArg = Builder.CreateCast(opcode, *AI, PTy);
- }
- Args.push_back(NewArg);
-
- // Add any parameter attributes.
- ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
- }
- }
- }
-
- AttributeSet FnAttrs = CallerPAL.getFnAttributes();
-
- if (NewRetTy->isVoidTy())
- Caller->setName(""); // Void type should not have a name.
-
- assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
- "missing argument attributes");
- AttributeList NewCallerPAL = AttributeList::get(
- Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- Call.getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCall;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
- NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
- II->getUnwindDest(), Args, OpBundles);
- } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
- NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
- CBI->getIndirectDests(), Args, OpBundles);
- } else {
- NewCall = Builder.CreateCall(Callee, Args, OpBundles);
- cast<CallInst>(NewCall)->setTailCallKind(
- cast<CallInst>(Caller)->getTailCallKind());
- }
- NewCall->takeName(Caller);
- NewCall->setCallingConv(Call.getCallingConv());
- NewCall->setAttributes(NewCallerPAL);
-
- // Preserve prof metadata if any.
- NewCall->copyMetadata(*Caller, {LLVMContext::MD_prof});
-
- // Insert a cast of the return type as necessary.
- Instruction *NC = NewCall;
- Value *NV = NC;
- if (OldRetTy != NV->getType() && !Caller->use_empty()) {
- if (!NV->getType()->isVoidTy()) {
- NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
- NC->setDebugLoc(Caller->getDebugLoc());
-
- // If this is an invoke/callbr instruction, we should insert it after the
- // first non-phi instruction in the normal successor block.
- if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
- BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
- InsertNewInstBefore(NC, *I);
- } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
- BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
- InsertNewInstBefore(NC, *I);
- } else {
- // Otherwise, it's a call, just insert cast right after the call.
- InsertNewInstBefore(NC, *Caller);
- }
- Worklist.pushUsersToWorkList(*Caller);
- } else {
- NV = UndefValue::get(Caller->getType());
- }
- }
-
- if (!Caller->use_empty())
- replaceInstUsesWith(*Caller, NV);
- else if (Caller->hasValueHandle()) {
- if (OldRetTy == NV->getType())
- ValueHandleBase::ValueIsRAUWd(Caller, NV);
- else
- // We cannot call ValueIsRAUWd with a different type, and the
- // actual tracked value will disappear.
- ValueHandleBase::ValueIsDeleted(Caller);
- }
-
- eraseInstFromFunction(*Caller);
- return true;
-}
-
-/// Turn a call to a function created by init_trampoline / adjust_trampoline
-/// intrinsic pair into a direct call to the underlying function.
-Instruction *
+ // If the parameter is passed as a byval argument, then we have to have a
+ // sized type and the sized type has to have the same size as the old type.
+ if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
+ PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
+ if (!ParamPTy || !ParamPTy->getElementType()->isSized())
+ return false;
+
+ Type *CurElTy = Call.getParamByValType(i);
+ if (DL.getTypeAllocSize(CurElTy) !=
+ DL.getTypeAllocSize(ParamPTy->getElementType()))
+ return false;
+ }
+ }
+
+ if (Callee->isDeclaration()) {
+ // Do not delete arguments unless we have a function body.
+ if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
+ return false;
+
+ // If the callee is just a declaration, don't change the varargsness of the
+ // call. We don't want to introduce a varargs call where one doesn't
+ // already exist.
+ PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
+ if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
+ return false;
+
+ // If both the callee and the cast type are varargs, we still have to make
+ // sure the number of fixed parameters are the same or we have the same
+ // ABI issues as if we introduce a varargs call.
+ if (FT->isVarArg() &&
+ cast<FunctionType>(APTy->getElementType())->isVarArg() &&
+ FT->getNumParams() !=
+ cast<FunctionType>(APTy->getElementType())->getNumParams())
+ return false;
+ }
+
+ if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
+ !CallerPAL.isEmpty()) {
+ // In this case we have more arguments than the new function type, but we
+ // won't be dropping them. Check that these extra arguments have attributes
+ // that are compatible with being a vararg call argument.
+ unsigned SRetIdx;
+ if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
+ SRetIdx > FT->getNumParams())
+ return false;
+ }
+
+ // Okay, we decided that this is a safe thing to do: go ahead and start
+ // inserting cast instructions as necessary.
+ SmallVector<Value *, 8> Args;
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ Args.reserve(NumActualArgs);
+ ArgAttrs.reserve(NumActualArgs);
+
+ // Get any return attributes.
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+
+ // If the return value is not being used, the type may not be compatible
+ // with the existing attributes. Wipe out any problematic attributes.
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
+
+ LLVMContext &Ctx = Call.getContext();
+ AI = Call.arg_begin();
+ for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
+ Type *ParamTy = FT->getParamType(i);
+
+ Value *NewArg = *AI;
+ if ((*AI)->getType() != ParamTy)
+ NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
+ Args.push_back(NewArg);
+
+ // Add any parameter attributes.
+ if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
+ AttrBuilder AB(CallerPAL.getParamAttributes(i));
+ AB.addByValAttr(NewArg->getType()->getPointerElementType());
+ ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
+ } else
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ }
+
+ // If the function takes more arguments than the call was taking, add them
+ // now.
+ for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
+ Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+ ArgAttrs.push_back(AttributeSet());
+ }
+
+ // If we are removing arguments to the function, emit an obnoxious warning.
+ if (FT->getNumParams() < NumActualArgs) {
+ // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
+ if (FT->isVarArg()) {
+ // Add all of the arguments in their promoted form to the arg list.
+ for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
+ Type *PTy = getPromotedType((*AI)->getType());
+ Value *NewArg = *AI;
+ if (PTy != (*AI)->getType()) {
+ // Must promote to pass through va_arg area!
+ Instruction::CastOps opcode =
+ CastInst::getCastOpcode(*AI, false, PTy, false);
+ NewArg = Builder.CreateCast(opcode, *AI, PTy);
+ }
+ Args.push_back(NewArg);
+
+ // Add any parameter attributes.
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ }
+ }
+ }
+
+ AttributeSet FnAttrs = CallerPAL.getFnAttributes();
+
+ if (NewRetTy->isVoidTy())
+ Caller->setName(""); // Void type should not have a name.
+
+ assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
+ "missing argument attributes");
+ AttributeList NewCallerPAL = AttributeList::get(
+ Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Call.getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCall;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
+ II->getUnwindDest(), Args, OpBundles);
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
+ NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
+ CBI->getIndirectDests(), Args, OpBundles);
+ } else {
+ NewCall = Builder.CreateCall(Callee, Args, OpBundles);
+ cast<CallInst>(NewCall)->setTailCallKind(
+ cast<CallInst>(Caller)->getTailCallKind());
+ }
+ NewCall->takeName(Caller);
+ NewCall->setCallingConv(Call.getCallingConv());
+ NewCall->setAttributes(NewCallerPAL);
+
+ // Preserve prof metadata if any.
+ NewCall->copyMetadata(*Caller, {LLVMContext::MD_prof});
+
+ // Insert a cast of the return type as necessary.
+ Instruction *NC = NewCall;
+ Value *NV = NC;
+ if (OldRetTy != NV->getType() && !Caller->use_empty()) {
+ if (!NV->getType()->isVoidTy()) {
+ NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
+ NC->setDebugLoc(Caller->getDebugLoc());
+
+ // If this is an invoke/callbr instruction, we should insert it after the
+ // first non-phi instruction in the normal successor block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
+ InsertNewInstBefore(NC, *I);
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
+ BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
+ InsertNewInstBefore(NC, *I);
+ } else {
+ // Otherwise, it's a call, just insert cast right after the call.
+ InsertNewInstBefore(NC, *Caller);
+ }
+ Worklist.pushUsersToWorkList(*Caller);
+ } else {
+ NV = UndefValue::get(Caller->getType());
+ }
+ }
+
+ if (!Caller->use_empty())
+ replaceInstUsesWith(*Caller, NV);
+ else if (Caller->hasValueHandle()) {
+ if (OldRetTy == NV->getType())
+ ValueHandleBase::ValueIsRAUWd(Caller, NV);
+ else
+ // We cannot call ValueIsRAUWd with a different type, and the
+ // actual tracked value will disappear.
+ ValueHandleBase::ValueIsDeleted(Caller);
+ }
+
+ eraseInstFromFunction(*Caller);
+ return true;
+}
+
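transformConstExprCastCall above undoes a call made through a bitcast of the function itself by casting the arguments (and, if needed, the return value) instead, and it only fires when those casts are bit- or no-op pointer casts. A rough plain-C++ analogy of the pointer case, with a made-up callee byteLen; this sketches the shape of the rewrite, not the IR-level legality checks:

#include <cassert>
#include <cstddef>

// Made-up callee: the "real" function, whose parameter type differs from the
// pointer type the caller holds only by a no-op pointer cast.
static std::size_t byteLen(const unsigned char *p) {
  std::size_t n = 0;
  while (p[n] != 0) ++n;
  return n;
}

int main() {
  const char text[] = "abc";
  // Instead of calling through a function pointer of the "wrong" type
  // (the pattern the transform removes), cast the argument to the real
  // parameter type and call the function directly.
  std::size_t n = byteLen(reinterpret_cast<const unsigned char *>(text));
  assert(n == 3);
  return 0;
}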
+/// Turn a call to a function created by init_trampoline / adjust_trampoline
+/// intrinsic pair into a direct call to the underlying function.
+Instruction *
InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
IntrinsicInst &Tramp) {
- Value *Callee = Call.getCalledOperand();
- Type *CalleeTy = Callee->getType();
- FunctionType *FTy = Call.getFunctionType();
- AttributeList Attrs = Call.getAttributes();
-
- // If the call already has the 'nest' attribute somewhere then give up -
- // otherwise 'nest' would occur twice after splicing in the chain.
- if (Attrs.hasAttrSomewhere(Attribute::Nest))
- return nullptr;
-
- Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
- FunctionType *NestFTy = NestF->getFunctionType();
-
- AttributeList NestAttrs = NestF->getAttributes();
- if (!NestAttrs.isEmpty()) {
- unsigned NestArgNo = 0;
- Type *NestTy = nullptr;
- AttributeSet NestAttr;
-
- // Look for a parameter marked with the 'nest' attribute.
- for (FunctionType::param_iterator I = NestFTy->param_begin(),
- E = NestFTy->param_end();
- I != E; ++NestArgNo, ++I) {
- AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
- if (AS.hasAttribute(Attribute::Nest)) {
- // Record the parameter type and any other attributes.
- NestTy = *I;
- NestAttr = AS;
- break;
- }
- }
-
- if (NestTy) {
- std::vector<Value*> NewArgs;
- std::vector<AttributeSet> NewArgAttrs;
- NewArgs.reserve(Call.arg_size() + 1);
- NewArgAttrs.reserve(Call.arg_size());
-
- // Insert the nest argument into the call argument list, which may
- // mean appending it. Likewise for attributes.
-
- {
- unsigned ArgNo = 0;
- auto I = Call.arg_begin(), E = Call.arg_end();
- do {
- if (ArgNo == NestArgNo) {
- // Add the chain argument and attributes.
- Value *NestVal = Tramp.getArgOperand(2);
- if (NestVal->getType() != NestTy)
- NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
- NewArgs.push_back(NestVal);
- NewArgAttrs.push_back(NestAttr);
- }
-
- if (I == E)
- break;
-
- // Add the original argument and attributes.
- NewArgs.push_back(*I);
- NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
-
- ++ArgNo;
- ++I;
- } while (true);
- }
-
- // The trampoline may have been bitcast to a bogus type (FTy).
- // Handle this by synthesizing a new function type, equal to FTy
- // with the chain parameter inserted.
-
- std::vector<Type*> NewTypes;
- NewTypes.reserve(FTy->getNumParams()+1);
-
- // Insert the chain's type into the list of parameter types, which may
- // mean appending it.
- {
- unsigned ArgNo = 0;
- FunctionType::param_iterator I = FTy->param_begin(),
- E = FTy->param_end();
-
- do {
- if (ArgNo == NestArgNo)
- // Add the chain's type.
- NewTypes.push_back(NestTy);
-
- if (I == E)
- break;
-
- // Add the original type.
- NewTypes.push_back(*I);
-
- ++ArgNo;
- ++I;
- } while (true);
- }
-
- // Replace the trampoline call with a direct call. Let the generic
- // code sort out any function type mismatches.
- FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
- FTy->isVarArg());
- Constant *NewCallee =
- NestF->getType() == PointerType::getUnqual(NewFTy) ?
- NestF : ConstantExpr::getBitCast(NestF,
- PointerType::getUnqual(NewFTy));
- AttributeList NewPAL =
- AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), NewArgAttrs);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- Call.getOperandBundlesAsDefs(OpBundles);
-
- Instruction *NewCaller;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
- NewCaller = InvokeInst::Create(NewFTy, NewCallee,
- II->getNormalDest(), II->getUnwindDest(),
- NewArgs, OpBundles);
- cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
- cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
- } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
- NewCaller =
- CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
- CBI->getIndirectDests(), NewArgs, OpBundles);
- cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
- cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
- } else {
- NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
- cast<CallInst>(NewCaller)->setTailCallKind(
- cast<CallInst>(Call).getTailCallKind());
- cast<CallInst>(NewCaller)->setCallingConv(
- cast<CallInst>(Call).getCallingConv());
- cast<CallInst>(NewCaller)->setAttributes(NewPAL);
- }
- NewCaller->setDebugLoc(Call.getDebugLoc());
-
- return NewCaller;
- }
- }
-
- // Replace the trampoline call with a direct call. Since there is no 'nest'
- // parameter, there is no need to adjust the argument list. Let the generic
- // code sort out any function type mismatches.
- Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
- Call.setCalledFunction(FTy, NewCallee);
- return &Call;
-}
+ Value *Callee = Call.getCalledOperand();
+ Type *CalleeTy = Callee->getType();
+ FunctionType *FTy = Call.getFunctionType();
+ AttributeList Attrs = Call.getAttributes();
+
+ // If the call already has the 'nest' attribute somewhere then give up -
+ // otherwise 'nest' would occur twice after splicing in the chain.
+ if (Attrs.hasAttrSomewhere(Attribute::Nest))
+ return nullptr;
+
+ Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
+ FunctionType *NestFTy = NestF->getFunctionType();
+
+ AttributeList NestAttrs = NestF->getAttributes();
+ if (!NestAttrs.isEmpty()) {
+ unsigned NestArgNo = 0;
+ Type *NestTy = nullptr;
+ AttributeSet NestAttr;
+
+ // Look for a parameter marked with the 'nest' attribute.
+ for (FunctionType::param_iterator I = NestFTy->param_begin(),
+ E = NestFTy->param_end();
+ I != E; ++NestArgNo, ++I) {
+ AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
+ if (AS.hasAttribute(Attribute::Nest)) {
+ // Record the parameter type and any other attributes.
+ NestTy = *I;
+ NestAttr = AS;
+ break;
+ }
+ }
+
+ if (NestTy) {
+ std::vector<Value*> NewArgs;
+ std::vector<AttributeSet> NewArgAttrs;
+ NewArgs.reserve(Call.arg_size() + 1);
+ NewArgAttrs.reserve(Call.arg_size());
+
+ // Insert the nest argument into the call argument list, which may
+ // mean appending it. Likewise for attributes.
+
+ {
+ unsigned ArgNo = 0;
+ auto I = Call.arg_begin(), E = Call.arg_end();
+ do {
+ if (ArgNo == NestArgNo) {
+ // Add the chain argument and attributes.
+ Value *NestVal = Tramp.getArgOperand(2);
+ if (NestVal->getType() != NestTy)
+ NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
+ NewArgs.push_back(NestVal);
+ NewArgAttrs.push_back(NestAttr);
+ }
+
+ if (I == E)
+ break;
+
+ // Add the original argument and attributes.
+ NewArgs.push_back(*I);
+ NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+
+ ++ArgNo;
+ ++I;
+ } while (true);
+ }
+
+ // The trampoline may have been bitcast to a bogus type (FTy).
+ // Handle this by synthesizing a new function type, equal to FTy
+ // with the chain parameter inserted.
+
+ std::vector<Type*> NewTypes;
+ NewTypes.reserve(FTy->getNumParams()+1);
+
+ // Insert the chain's type into the list of parameter types, which may
+ // mean appending it.
+ {
+ unsigned ArgNo = 0;
+ FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end();
+
+ do {
+ if (ArgNo == NestArgNo)
+ // Add the chain's type.
+ NewTypes.push_back(NestTy);
+
+ if (I == E)
+ break;
+
+ // Add the original type.
+ NewTypes.push_back(*I);
+
+ ++ArgNo;
+ ++I;
+ } while (true);
+ }
+
+ // Replace the trampoline call with a direct call. Let the generic
+ // code sort out any function type mismatches.
+ FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
+ FTy->isVarArg());
+ Constant *NewCallee =
+ NestF->getType() == PointerType::getUnqual(NewFTy) ?
+ NestF : ConstantExpr::getBitCast(NestF,
+ PointerType::getUnqual(NewFTy));
+ AttributeList NewPAL =
+ AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Call.getOperandBundlesAsDefs(OpBundles);
+
+ Instruction *NewCaller;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
+ NewCaller = InvokeInst::Create(NewFTy, NewCallee,
+ II->getNormalDest(), II->getUnwindDest(),
+ NewArgs, OpBundles);
+ cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
+ cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
+ NewCaller =
+ CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
+ CBI->getIndirectDests(), NewArgs, OpBundles);
+ cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
+ cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
+ } else {
+ NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
+ cast<CallInst>(NewCaller)->setTailCallKind(
+ cast<CallInst>(Call).getTailCallKind());
+ cast<CallInst>(NewCaller)->setCallingConv(
+ cast<CallInst>(Call).getCallingConv());
+ cast<CallInst>(NewCaller)->setAttributes(NewPAL);
+ }
+ NewCaller->setDebugLoc(Call.getDebugLoc());
+
+ return NewCaller;
+ }
+ }
+
+ // Replace the trampoline call with a direct call. Since there is no 'nest'
+ // parameter, there is no need to adjust the argument list. Let the generic
+ // code sort out any function type mismatches.
+ Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
+ Call.setCalledFunction(FTy, NewCallee);
+ return &Call;
+}
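Before the diff moves on to InstCombineCasts.cpp: the argument-splicing loop in transformCallThroughTrampoline inserts the trampoline's chain value at the position of the callee's 'nest' parameter, appending it when that position is one past the last original argument. A small standard-C++ model of just that splice (the helper spliceNestArg and its string arguments are invented for illustration; nothing here is LLVM API):

#include <cassert>
#include <string>
#include <vector>

// Model of splicing the trampoline's chain value into the call's argument
// list at the index of the callee's 'nest' parameter, mirroring the
// do/while loop in the function above.
static std::vector<std::string> spliceNestArg(std::vector<std::string> args,
                                              unsigned nestArgNo,
                                              const std::string &chain) {
  std::vector<std::string> out;
  out.reserve(args.size() + 1);
  unsigned argNo = 0;
  auto it = args.begin();
  while (true) {
    if (argNo == nestArgNo)
      out.push_back(chain);          // add the chain argument here
    if (it == args.end())
      break;
    out.push_back(*it);              // add the original argument
    ++argNo;
    ++it;
  }
  return out;
}

int main() {
  std::vector<std::string> args = {"a", "b"};
  assert((spliceNestArg(args, 0, "nest") ==
          std::vector<std::string>{"nest", "a", "b"}));
  assert((spliceNestArg(args, 2, "nest") ==
          std::vector<std::string>{"a", "b", "nest"}));
  return 0;
}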
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp
index d0bb02568d..07e68c4441 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1,99 +1,99 @@
-//===- InstCombineCasts.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for cast operations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- InstCombineCasts.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for cast operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/KnownBits.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <numeric>
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// Analyze 'Val', seeing if it is a simple linear expression.
-/// If so, decompose it, returning some value X, such that Val is
-/// X*Scale+Offset.
-///
-static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
- uint64_t &Offset) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
- Offset = CI->getZExtValue();
- Scale = 0;
- return ConstantInt::get(Val->getType(), 0);
- }
-
- if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
- // Cannot look past anything that might overflow.
- OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
- if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
- Scale = 1;
- Offset = 0;
- return Val;
- }
-
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
- if (I->getOpcode() == Instruction::Shl) {
- // This is a value scaled by '1 << the shift amt'.
- Scale = UINT64_C(1) << RHS->getZExtValue();
- Offset = 0;
- return I->getOperand(0);
- }
-
- if (I->getOpcode() == Instruction::Mul) {
- // This value is scaled by 'RHS'.
- Scale = RHS->getZExtValue();
- Offset = 0;
- return I->getOperand(0);
- }
-
- if (I->getOpcode() == Instruction::Add) {
- // We have X+C. Check to see if we really have (X*C2)+C1,
- // where C1 is divisible by C2.
- unsigned SubScale;
- Value *SubVal =
- decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
- Offset += RHS->getZExtValue();
- Scale = SubScale;
- return SubVal;
- }
- }
- }
-
- // Otherwise, we can't look past this.
- Scale = 1;
- Offset = 0;
- return Val;
-}
-
-/// If we find a cast of an allocation instruction, try to eliminate the cast by
-/// moving the type information into the alloc.
+#include <numeric>
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Analyze 'Val', seeing if it is a simple linear expression.
+/// If so, decompose it, returning some value X, such that Val is
+/// X*Scale+Offset.
+///
+static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+ uint64_t &Offset) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ Offset = CI->getZExtValue();
+ Scale = 0;
+ return ConstantInt::get(Val->getType(), 0);
+ }
+
+ if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
+ // Cannot look past anything that might overflow.
+ OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
+ if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
+ Scale = 1;
+ Offset = 0;
+ return Val;
+ }
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (I->getOpcode() == Instruction::Shl) {
+ // This is a value scaled by '1 << the shift amt'.
+ Scale = UINT64_C(1) << RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ }
+
+ if (I->getOpcode() == Instruction::Mul) {
+ // This value is scaled by 'RHS'.
+ Scale = RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ }
+
+ if (I->getOpcode() == Instruction::Add) {
+ // We have X+C. Check to see if we really have (X*C2)+C1,
+ // where C1 is divisible by C2.
+ unsigned SubScale;
+ Value *SubVal =
+ decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+ Offset += RHS->getZExtValue();
+ Scale = SubScale;
+ return SubVal;
+ }
+ }
+ }
+
+ // Otherwise, we can't look past this.
+ Scale = 1;
+ Offset = 0;
+ return Val;
+}
+
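decomposeSimpleLinearExpr above rewrites a value as X*Scale+Offset by peeling shl-by-constant, mul-by-constant and add-of-constant nodes. The toy model below mirrors those cases on a hand-rolled Expr tree; it is a sketch only (no LLVM types, and it skips the overflow-flag and divisibility caveats the real code handles):

#include <cassert>
#include <cstdint>

// Toy expression tree standing in for LLVM Value/BinaryOperator.
struct Expr {
  enum Kind { Leaf, Const, Shl, Mul, Add } kind;
  std::uint64_t c;        // constant operand (or the constant itself)
  const Expr *sub;        // non-constant operand
};

// Returns the "X" of Val == X*Scale + Offset, filling in Scale and Offset.
static const Expr *decompose(const Expr *v, std::uint64_t &scale,
                             std::uint64_t &offset) {
  switch (v->kind) {
  case Expr::Const:
    scale = 0; offset = v->c; return nullptr;          // pure constant
  case Expr::Shl:
    scale = std::uint64_t(1) << v->c; offset = 0; return v->sub;
  case Expr::Mul:
    scale = v->c; offset = 0; return v->sub;
  case Expr::Add: {                                    // X*C2 + C1
    const Expr *x = decompose(v->sub, scale, offset);
    offset += v->c;
    return x;
  }
  default:
    scale = 1; offset = 0; return v;                   // opaque leaf
  }
}

int main() {
  Expr n{Expr::Leaf, 0, nullptr};
  Expr shl{Expr::Shl, 2, &n};      // n << 2
  Expr add{Expr::Add, 8, &shl};    // (n << 2) + 8
  std::uint64_t scale = 0, offset = 0;
  const Expr *x = decompose(&add, scale, offset);
  assert(x == &n && scale == 4 && offset == 8);
  return 0;
}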
+/// If we find a cast of an allocation instruction, try to eliminate the cast by
+/// moving the type information into the alloc.
Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
- PointerType *PTy = cast<PointerType>(CI.getType());
-
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(&AI);
-
- // Get the type really allocated and the type casted to.
- Type *AllocElTy = AI.getAllocatedType();
- Type *CastElTy = PTy->getElementType();
- if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
-
+ PointerType *PTy = cast<PointerType>(CI.getType());
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(&AI);
+
+ // Get the type really allocated and the type casted to.
+ Type *AllocElTy = AI.getAllocatedType();
+ Type *CastElTy = PTy->getElementType();
+ if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
+
// This optimisation does not work for cases where the cast type
// is scalable and the allocated type is not. This because we need to
// know how many times the casted type fits into the allocated type.
@@ -106,441 +106,441 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
bool CastIsScalable = isa<ScalableVectorType>(CastElTy);
if (AllocIsScalable != CastIsScalable) return nullptr;
- Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
- Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
- if (CastElTyAlign < AllocElTyAlign) return nullptr;
-
- // If the allocation has multiple uses, only promote it if we are strictly
- // increasing the alignment of the resultant allocation. If we keep it the
- // same, we open the door to infinite loops of various kinds.
- if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
-
+ Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
+ Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
+ if (CastElTyAlign < AllocElTyAlign) return nullptr;
+
+ // If the allocation has multiple uses, only promote it if we are strictly
+ // increasing the alignment of the resultant allocation. If we keep it the
+ // same, we open the door to infinite loops of various kinds.
+ if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
+
// The alloc and cast types should be either both fixed or both scalable.
uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getKnownMinSize();
uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getKnownMinSize();
- if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
-
- // If the allocation has multiple uses, only promote it if we're not
- // shrinking the amount of memory being allocated.
+ if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
+
+ // If the allocation has multiple uses, only promote it if we're not
+ // shrinking the amount of memory being allocated.
uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy).getKnownMinSize();
uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getKnownMinSize();
- if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
-
- // See if we can satisfy the modulus by pulling a scale out of the array
- // size argument.
- unsigned ArraySizeScale;
- uint64_t ArrayOffset;
- Value *NumElements = // See if the array size is a decomposable linear expr.
- decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
-
- // If we can now satisfy the modulus, by using a non-1 scale, we really can
- // do the xform.
- if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
- (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr;
-
+ if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
+
+ // See if we can satisfy the modulus by pulling a scale out of the array
+ // size argument.
+ unsigned ArraySizeScale;
+ uint64_t ArrayOffset;
+ Value *NumElements = // See if the array size is a decomposable linear expr.
+ decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+
+ // If we can now satisfy the modulus, by using a non-1 scale, we really can
+ // do the xform.
+ if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+ (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr;
+
// We don't currently support arrays of scalable types.
assert(!AllocIsScalable || (ArrayOffset == 1 && ArraySizeScale == 0));
- unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
- Value *Amt = nullptr;
- if (Scale == 1) {
- Amt = NumElements;
- } else {
- Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
- // Insert before the alloca, not before the cast.
- Amt = Builder.CreateMul(Amt, NumElements);
- }
-
- if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
- Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
- Offset, true);
- Amt = Builder.CreateAdd(Amt, Off);
- }
-
- AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt);
- New->setAlignment(AI.getAlign());
- New->takeName(&AI);
- New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
-
- // If the allocation has multiple real uses, insert a cast and change all
- // things that used it to use the new cast. This will also hack on CI, but it
- // will die soon.
- if (!AI.hasOneUse()) {
- // New is the allocation instruction, pointer typed. AI is the original
- // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
- Value *NewCast = Builder.CreateBitCast(New, AI.getType(), "tmpcast");
- replaceInstUsesWith(AI, NewCast);
- eraseInstFromFunction(AI);
- }
- return replaceInstUsesWith(CI, New);
-}
-
-/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
-/// true for, actually insert the code to evaluate the expression.
+ unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+ Value *Amt = nullptr;
+ if (Scale == 1) {
+ Amt = NumElements;
+ } else {
+ Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
+ // Insert before the alloca, not before the cast.
+ Amt = Builder.CreateMul(Amt, NumElements);
+ }
+
+ if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+ Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
+ Offset, true);
+ Amt = Builder.CreateAdd(Amt, Off);
+ }
+
+ AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt);
+ New->setAlignment(AI.getAlign());
+ New->takeName(&AI);
+ New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
+
+ // If the allocation has multiple real uses, insert a cast and change all
+ // things that used it to use the new cast. This will also hack on CI, but it
+ // will die soon.
+ if (!AI.hasOneUse()) {
+ // New is the allocation instruction, pointer typed. AI is the original
+ // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+ Value *NewCast = Builder.CreateBitCast(New, AI.getType(), "tmpcast");
+ replaceInstUsesWith(AI, NewCast);
+ eraseInstFromFunction(AI);
+ }
+ return replaceInstUsesWith(CI, New);
+}
+
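The rescaling arithmetic in PromoteCastOfAllocation only fires when both products divide evenly by the cast element size; with those checks passing, an i8 allocation of 4*n+8 bytes viewed through an i32* becomes an i32 allocation of n+2 elements. A plain-integer check of that arithmetic, with the element sizes for this hypothetical i8/i32 case written out by hand:

#include <cassert>
#include <cstdint>

int main() {
  // alloca i8, i32 (4*n + 8) bitcast to i32*: element sizes in bytes.
  const std::uint64_t allocElSize = 1;     // size of i8
  const std::uint64_t castElSize = 4;      // size of i32
  const std::uint64_t arraySizeScale = 4, arrayOffset = 8; // from the decomposition
  // The transform bails unless both products divide evenly.
  assert((allocElSize * arraySizeScale) % castElSize == 0);
  assert((allocElSize * arrayOffset) % castElSize == 0);
  const std::uint64_t scale = (allocElSize * arraySizeScale) / castElSize;  // 1
  const std::uint64_t offset = (allocElSize * arrayOffset) / castElSize;    // 2
  // New allocation: alloca i32, i32 (scale*n + offset), i.e. n + 2 elements,
  // which covers exactly the same number of bytes for every n.
  for (std::uint64_t n = 0; n < 16; ++n)
    assert((scale * n + offset) * castElSize ==
           (arraySizeScale * n + arrayOffset) * allocElSize);
  return 0;
}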
+/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
+/// true for, actually insert the code to evaluate the expression.
Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty,
bool isSigned) {
- if (Constant *C = dyn_cast<Constant>(V)) {
- C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
- // If we got a constantexpr back, try to simplify it with DL info.
- return ConstantFoldConstant(C, DL, &TLI);
- }
-
- // Otherwise, it must be an instruction.
- Instruction *I = cast<Instruction>(V);
- Instruction *Res = nullptr;
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::Shl:
- case Instruction::UDiv:
- case Instruction::URem: {
- Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
- Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
- Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // If the source type of the cast is the type we're trying for then we can
- // just return the source. There's no need to insert it because it is not
- // new.
- if (I->getOperand(0)->getType() == Ty)
- return I->getOperand(0);
-
- // Otherwise, must be the same type of cast, so just reinsert a new one.
- // This also handles the case of zext(trunc(x)) -> zext(x).
- Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
- Opc == Instruction::SExt);
- break;
- case Instruction::Select: {
- Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
- Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
- Res = SelectInst::Create(I->getOperand(0), True, False);
- break;
- }
- case Instruction::PHI: {
- PHINode *OPN = cast<PHINode>(I);
- PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues());
- for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
- Value *V =
- EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
- NPN->addIncoming(V, OPN->getIncomingBlock(i));
- }
- Res = NPN;
- break;
- }
- default:
- // TODO: Can handle more cases here.
- llvm_unreachable("Unreachable!");
- }
-
- Res->takeName(I);
- return InsertNewInstWith(Res, *I);
-}
-
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+ // If we got a constantexpr back, try to simplify it with DL info.
+ return ConstantFoldConstant(C, DL, &TLI);
+ }
+
+ // Otherwise, it must be an instruction.
+ Instruction *I = cast<Instruction>(V);
+ Instruction *Res = nullptr;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::UDiv:
+ case Instruction::URem: {
+ Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
+ Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty)
+ return I->getOperand(0);
+
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ // This also handles the case of zext(trunc(x)) -> zext(x).
+ Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
+ Opc == Instruction::SExt);
+ break;
+ case Instruction::Select: {
+ Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
+ Res = SelectInst::Create(I->getOperand(0), True, False);
+ break;
+ }
+ case Instruction::PHI: {
+ PHINode *OPN = cast<PHINode>(I);
+ PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues());
+ for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
+ Value *V =
+ EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+ NPN->addIncoming(V, OPN->getIncomingBlock(i));
+ }
+ Res = NPN;
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ llvm_unreachable("Unreachable!");
+ }
+
+ Res->takeName(I);
+ return InsertNewInstWith(Res, *I);
+}
+
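EvaluateInDifferentType rebuilds the whole expression directly in the narrow type; for the wrapping operators this is sound because truncation commutes with them. A quick plain-integer check of that property, with uint32_t standing in for the wide type and uint16_t for the narrow one:

#include <cassert>
#include <cstdint>

int main() {
  for (std::uint32_t x = 0; x < 200000; x += 977) {
    for (std::uint32_t y = 0; y < 200000; y += 1009) {
      // Evaluate wide, then truncate the final result...
      std::uint16_t wideThenTrunc = static_cast<std::uint16_t>((x + y) * y);
      // ...equals evaluating every step on already-truncated operands,
      // which is what the narrow rebuild produces for Add/Mul and friends.
      std::uint16_t nx = static_cast<std::uint16_t>(x);
      std::uint16_t ny = static_cast<std::uint16_t>(y);
      std::uint16_t sum = static_cast<std::uint16_t>(nx + ny);
      std::uint16_t narrow =
          static_cast<std::uint16_t>(static_cast<std::uint32_t>(sum) * ny);
      assert(wideThenTrunc == narrow);
    }
  }
  return 0;
}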
Instruction::CastOps
InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
const CastInst *CI2) {
- Type *SrcTy = CI1->getSrcTy();
- Type *MidTy = CI1->getDestTy();
- Type *DstTy = CI2->getDestTy();
-
- Instruction::CastOps firstOp = CI1->getOpcode();
- Instruction::CastOps secondOp = CI2->getOpcode();
- Type *SrcIntPtrTy =
- SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
- Type *MidIntPtrTy =
- MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
- Type *DstIntPtrTy =
- DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
- unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
- DstTy, SrcIntPtrTy, MidIntPtrTy,
- DstIntPtrTy);
-
- // We don't want to form an inttoptr or ptrtoint that converts to an integer
- // type that differs from the pointer size.
- if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
- (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
- Res = 0;
-
- return Instruction::CastOps(Res);
-}
-
-/// Implement the transforms common to all CastInst visitors.
+ Type *SrcTy = CI1->getSrcTy();
+ Type *MidTy = CI1->getDestTy();
+ Type *DstTy = CI2->getDestTy();
+
+ Instruction::CastOps firstOp = CI1->getOpcode();
+ Instruction::CastOps secondOp = CI2->getOpcode();
+ Type *SrcIntPtrTy =
+ SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
+ Type *MidIntPtrTy =
+ MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
+ Type *DstIntPtrTy =
+ DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
+ unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
+ DstTy, SrcIntPtrTy, MidIntPtrTy,
+ DstIntPtrTy);
+
+ // We don't want to form an inttoptr or ptrtoint that converts to an integer
+ // type that differs from the pointer size.
+ if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
+ (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
+ Res = 0;
+
+ return Instruction::CastOps(Res);
+}
+
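isEliminableCastPair asks CastInst's pairing table whether two stacked casts collapse into one, and then vetoes inttoptr/ptrtoint results whose integer width differs from the pointer size (a guard a plain-integer example cannot show). Two of the classic eliminable pairs, checked with ordinary integer conversions:

#include <cassert>
#include <cstdint>

int main() {
  for (std::uint32_t x = 0; x < 100000; x += 97) {
    // zext(zext(v)): i8 -> i16 -> i32 folds to a single i8 -> i32 zext.
    std::uint8_t v = static_cast<std::uint8_t>(x);
    assert(static_cast<std::uint32_t>(static_cast<std::uint16_t>(v)) ==
           static_cast<std::uint32_t>(v));
    // trunc(trunc(x)): i32 -> i16 -> i8 folds to a single i32 -> i8 trunc.
    assert(static_cast<std::uint8_t>(static_cast<std::uint16_t>(x)) ==
           static_cast<std::uint8_t>(x));
  }
  return 0;
}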
+/// Implement the transforms common to all CastInst visitors.
Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
- Value *Src = CI.getOperand(0);
-
- // Try to eliminate a cast of a cast.
- if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
- if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
- // The first cast (CSrc) is eliminable so we need to fix up or replace
- // the second cast (CI). CSrc will then have a good chance of being dead.
- auto *Ty = CI.getType();
- auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
- // Point debug users of the dying cast to the new one.
- if (CSrc->hasOneUse())
- replaceAllDbgUsesWith(*CSrc, *Res, CI, DT);
- return Res;
- }
- }
-
- if (auto *Sel = dyn_cast<SelectInst>(Src)) {
- // We are casting a select. Try to fold the cast into the select if the
- // select does not have a compare instruction with matching operand types
- // or the select is likely better done in a narrow type.
- // Creating a select with operands that are different sizes than its
- // condition may inhibit other folds and lead to worse codegen.
- auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition());
- if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType() ||
- (CI.getOpcode() == Instruction::Trunc &&
- shouldChangeType(CI.getSrcTy(), CI.getType()))) {
- if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) {
- replaceAllDbgUsesWith(*Sel, *NV, CI, DT);
- return NV;
- }
- }
- }
-
- // If we are casting a PHI, then fold the cast into the PHI.
- if (auto *PN = dyn_cast<PHINode>(Src)) {
- // Don't do this if it would create a PHI node with an illegal type from a
- // legal type.
- if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
- shouldChangeType(CI.getSrcTy(), CI.getType()))
- if (Instruction *NV = foldOpIntoPhi(CI, PN))
- return NV;
- }
-
- return nullptr;
-}
-
-/// Constants and extensions/truncates from the destination type are always
-/// free to be evaluated in that type. This is a helper for canEvaluate*.
-static bool canAlwaysEvaluateInType(Value *V, Type *Ty) {
- if (isa<Constant>(V))
- return true;
- Value *X;
- if ((match(V, m_ZExtOrSExt(m_Value(X))) || match(V, m_Trunc(m_Value(X)))) &&
- X->getType() == Ty)
- return true;
-
- return false;
-}
-
-/// Filter out values that we can not evaluate in the destination type for free.
-/// This is a helper for canEvaluate*.
-static bool canNotEvaluateInType(Value *V, Type *Ty) {
- assert(!isa<Constant>(V) && "Constant should already be handled.");
- if (!isa<Instruction>(V))
- return true;
- // We don't extend or shrink something that has multiple uses -- doing so
- // would require duplicating the instruction which isn't profitable.
- if (!V->hasOneUse())
- return true;
-
- return false;
-}
-
-/// Return true if we can evaluate the specified expression tree as type Ty
-/// instead of its larger type, and arrive with the same value.
-/// This is used by code that tries to eliminate truncates.
-///
-/// Ty will always be a type smaller than V. We should return true if trunc(V)
-/// can be computed by computing V in the smaller type. If V is an instruction,
-/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
-/// makes sense if x and y can be efficiently truncated.
-///
-/// This function works on both vectors and scalars.
-///
+ Value *Src = CI.getOperand(0);
+
+ // Try to eliminate a cast of a cast.
+ if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
+ if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
+ // The first cast (CSrc) is eliminable so we need to fix up or replace
+ // the second cast (CI). CSrc will then have a good chance of being dead.
+ auto *Ty = CI.getType();
+ auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
+ // Point debug users of the dying cast to the new one.
+ if (CSrc->hasOneUse())
+ replaceAllDbgUsesWith(*CSrc, *Res, CI, DT);
+ return Res;
+ }
+ }
+
+ if (auto *Sel = dyn_cast<SelectInst>(Src)) {
+ // We are casting a select. Try to fold the cast into the select if the
+ // select does not have a compare instruction with matching operand types
+ // or the select is likely better done in a narrow type.
+ // Creating a select with operands that are different sizes than its
+ // condition may inhibit other folds and lead to worse codegen.
+ auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition());
+ if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType() ||
+ (CI.getOpcode() == Instruction::Trunc &&
+ shouldChangeType(CI.getSrcTy(), CI.getType()))) {
+ if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) {
+ replaceAllDbgUsesWith(*Sel, *NV, CI, DT);
+ return NV;
+ }
+ }
+ }
+
+ // If we are casting a PHI, then fold the cast into the PHI.
+ if (auto *PN = dyn_cast<PHINode>(Src)) {
+ // Don't do this if it would create a PHI node with an illegal type from a
+ // legal type.
+ if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
+ shouldChangeType(CI.getSrcTy(), CI.getType()))
+ if (Instruction *NV = foldOpIntoPhi(CI, PN))
+ return NV;
+ }
+
+ return nullptr;
+}
+
+/// Constants and extensions/truncates from the destination type are always
+/// free to be evaluated in that type. This is a helper for canEvaluate*.
+static bool canAlwaysEvaluateInType(Value *V, Type *Ty) {
+ if (isa<Constant>(V))
+ return true;
+ Value *X;
+ if ((match(V, m_ZExtOrSExt(m_Value(X))) || match(V, m_Trunc(m_Value(X)))) &&
+ X->getType() == Ty)
+ return true;
+
+ return false;
+}
+
+/// Filter out values that we can not evaluate in the destination type for free.
+/// This is a helper for canEvaluate*.
+static bool canNotEvaluateInType(Value *V, Type *Ty) {
+ assert(!isa<Constant>(V) && "Constant should already be handled.");
+ if (!isa<Instruction>(V))
+ return true;
+ // We don't extend or shrink something that has multiple uses -- doing so
+ // would require duplicating the instruction which isn't profitable.
+ if (!V->hasOneUse())
+ return true;
+
+ return false;
+}
+
+/// Return true if we can evaluate the specified expression tree as type Ty
+/// instead of its larger type, and arrive with the same value.
+/// This is used by code that tries to eliminate truncates.
+///
+/// Ty will always be a type smaller than V. We should return true if trunc(V)
+/// can be computed by computing V in the smaller type. If V is an instruction,
+/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
+/// makes sense if x and y can be efficiently truncated.
+///
+/// This function works on both vectors and scalars.
+///
static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC,
- Instruction *CxtI) {
- if (canAlwaysEvaluateInType(V, Ty))
- return true;
- if (canNotEvaluateInType(V, Ty))
- return false;
-
- auto *I = cast<Instruction>(V);
- Type *OrigTy = V->getType();
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- // These operators can all arbitrarily be extended or truncated.
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
-
- case Instruction::UDiv:
- case Instruction::URem: {
- // UDiv and URem can be truncated if all the truncated bits are zero.
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!");
- APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
- IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- }
- break;
- }
- case Instruction::Shl: {
- // If we are truncating the result of this SHL, and if it's a shift of an
-    // in-range amount, we can always perform a SHL in a smaller type.
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- KnownBits AmtKnownBits =
- llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
- if (AmtKnownBits.getMaxValue().ult(BitWidth))
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- break;
- }
- case Instruction::LShr: {
- // If this is a truncate of a logical shr, we can truncate it to a smaller
- // lshr iff we know that the bits we would otherwise be shifting in are
- // already zeros.
- // TODO: It is enough to check that the bits we would be shifting in are
- // zero - use AmtKnownBits.getMaxValue().
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- KnownBits AmtKnownBits =
- llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
- IC.MaskedValueIsZero(I->getOperand(0), ShiftedBits, 0, CxtI)) {
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- }
- break;
- }
- case Instruction::AShr: {
- // If this is a truncate of an arithmetic shr, we can truncate it to a
-    // smaller ashr iff we know that all the bits between the sign bit of the
-    // original type and the sign bit of the truncate type are the same.
-    // TODO: It is enough to check that the bits we would be shifting in are
-    // similar to the sign bit of the truncate type.
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- KnownBits AmtKnownBits =
- llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
- unsigned ShiftedBits = OrigBitWidth - BitWidth;
- if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
- ShiftedBits < IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- break;
- }
- case Instruction::Trunc:
- // trunc(trunc(x)) -> trunc(x)
- return true;
- case Instruction::ZExt:
- case Instruction::SExt:
- // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
- // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
- return true;
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) &&
- canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI);
- }
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI))
- return false;
- return true;
- }
- default:
- // TODO: Can handle more cases here.
- break;
- }
-
- return false;
-}
-
-/// Given a vector that is bitcast to an integer, optionally logically
-/// right-shifted, and truncated, convert it to an extractelement.
-/// Example (big endian):
-/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
-/// --->
-/// extractelement <4 x i32> %X, 1
+ Instruction *CxtI) {
+ if (canAlwaysEvaluateInType(V, Ty))
+ return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
+
+ auto *I = cast<Instruction>(V);
+ Type *OrigTy = V->getType();
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // These operators can all arbitrarily be extended or truncated.
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+
+ case Instruction::UDiv:
+ case Instruction::URem: {
+ // UDiv and URem can be truncated if all the truncated bits are zero.
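+    // E.g. (sketch): 'udiv i32 %x, %y' can be evaluated in i16 only if bits
+    // 16..31 of both operands are known zero; Mask below selects those bits.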
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!");
+ APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
+ IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ }
+ break;
+ }
+ case Instruction::Shl: {
+ // If we are truncating the result of this SHL, and if it's a shift of an
+    // in-range amount, we can always perform a SHL in a smaller type.
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ if (AmtKnownBits.getMaxValue().ult(BitWidth))
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ break;
+ }
+ case Instruction::LShr: {
+ // If this is a truncate of a logical shr, we can truncate it to a smaller
+ // lshr iff we know that the bits we would otherwise be shifting in are
+ // already zeros.
+ // TODO: It is enough to check that the bits we would be shifting in are
+ // zero - use AmtKnownBits.getMaxValue().
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ IC.MaskedValueIsZero(I->getOperand(0), ShiftedBits, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ }
+ break;
+ }
+ case Instruction::AShr: {
+ // If this is a truncate of an arithmetic shr, we can truncate it to a
+    // smaller ashr iff we know that all the bits between the sign bit of the
+    // original type and the sign bit of the truncate type are the same.
+    // TODO: It is enough to check that the bits we would be shifting in are
+    // similar to the sign bit of the truncate type.
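+    // E.g. (sketch): 'trunc (ashr i32 %x, %c) to i16' can be evaluated as an
+    // i16 ashr when %x has more than 16 sign bits and %c is known to be < 16.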
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ unsigned ShiftedBits = OrigBitWidth - BitWidth;
+ if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ ShiftedBits < IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ break;
+ }
+ case Instruction::Trunc:
+ // trunc(trunc(x)) -> trunc(x)
+ return true;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+ // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
+ return true;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) &&
+ canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI);
+ }
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI))
+ return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ break;
+ }
+
+ return false;
+}
+
+/// Given a vector that is bitcast to an integer, optionally logically
+/// right-shifted, and truncated, convert it to an extractelement.
+/// Example (big endian):
+/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
+/// --->
+/// extractelement <4 x i32> %X, 1
static Instruction *foldVecTruncToExtElt(TruncInst &Trunc,
InstCombinerImpl &IC) {
- Value *TruncOp = Trunc.getOperand(0);
- Type *DestType = Trunc.getType();
- if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
- return nullptr;
-
- Value *VecInput = nullptr;
- ConstantInt *ShiftVal = nullptr;
- if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)),
- m_LShr(m_BitCast(m_Value(VecInput)),
- m_ConstantInt(ShiftVal)))) ||
- !isa<VectorType>(VecInput->getType()))
- return nullptr;
-
- VectorType *VecType = cast<VectorType>(VecInput->getType());
- unsigned VecWidth = VecType->getPrimitiveSizeInBits();
- unsigned DestWidth = DestType->getPrimitiveSizeInBits();
- unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0;
-
- if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0))
- return nullptr;
-
- // If the element type of the vector doesn't match the result type,
- // bitcast it to a vector type that we can extract from.
- unsigned NumVecElts = VecWidth / DestWidth;
- if (VecType->getElementType() != DestType) {
- VecType = FixedVectorType::get(DestType, NumVecElts);
- VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
- }
-
- unsigned Elt = ShiftAmount / DestWidth;
- if (IC.getDataLayout().isBigEndian())
- Elt = NumVecElts - 1 - Elt;
-
- return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt));
-}
-
+ Value *TruncOp = Trunc.getOperand(0);
+ Type *DestType = Trunc.getType();
+ if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
+ return nullptr;
+
+ Value *VecInput = nullptr;
+ ConstantInt *ShiftVal = nullptr;
+ if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)),
+ m_LShr(m_BitCast(m_Value(VecInput)),
+ m_ConstantInt(ShiftVal)))) ||
+ !isa<VectorType>(VecInput->getType()))
+ return nullptr;
+
+ VectorType *VecType = cast<VectorType>(VecInput->getType());
+ unsigned VecWidth = VecType->getPrimitiveSizeInBits();
+ unsigned DestWidth = DestType->getPrimitiveSizeInBits();
+ unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0;
+
+ if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0))
+ return nullptr;
+
+ // If the element type of the vector doesn't match the result type,
+ // bitcast it to a vector type that we can extract from.
+ unsigned NumVecElts = VecWidth / DestWidth;
+ if (VecType->getElementType() != DestType) {
+ VecType = FixedVectorType::get(DestType, NumVecElts);
+ VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
+ }
+
+ unsigned Elt = ShiftAmount / DestWidth;
+ if (IC.getDataLayout().isBigEndian())
+ Elt = NumVecElts - 1 - Elt;
+
+ return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt));
+}
+
/// Funnel/Rotate left/right may occur in a wider type than necessary because of
/// type promotion rules. Try to narrow the inputs and convert to funnel shift.
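/// E.g. (illustrative): with an i8 value zero-extended to i32,
///   %or = or i32 (shl i32 %zx, %amt), (lshr i32 %zx, (sub i32 8, %amt))
///   %r  = trunc i32 %or to i8
/// is a rotate left and becomes a call to llvm.fshl.i8 on the narrowed values.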
Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) {
- assert((isa<VectorType>(Trunc.getSrcTy()) ||
- shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
- "Don't narrow to an illegal scalar type");
-
- // Bail out on strange types. It is possible to handle some of these patterns
- // even with non-power-of-2 sizes, but it is not a likely scenario.
- Type *DestTy = Trunc.getType();
- unsigned NarrowWidth = DestTy->getScalarSizeInBits();
- if (!isPowerOf2_32(NarrowWidth))
- return nullptr;
-
+ assert((isa<VectorType>(Trunc.getSrcTy()) ||
+ shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
+ "Don't narrow to an illegal scalar type");
+
+ // Bail out on strange types. It is possible to handle some of these patterns
+ // even with non-power-of-2 sizes, but it is not a likely scenario.
+ Type *DestTy = Trunc.getType();
+ unsigned NarrowWidth = DestTy->getScalarSizeInBits();
+ if (!isPowerOf2_32(NarrowWidth))
+ return nullptr;
+
// First, find an or'd pair of opposite shifts:
// trunc (or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1))
BinaryOperator *Or0, *Or1;
if (!match(Trunc.getOperand(0), m_OneUse(m_Or(m_BinOp(Or0), m_BinOp(Or1)))))
- return nullptr;
-
+ return nullptr;
+
Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1;
if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) ||
!match(Or1, m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1)))) ||
Or0->getOpcode() == Or1->getOpcode())
- return nullptr;
-
+ return nullptr;
+
// Canonicalize to or(shl(ShVal0, ShAmt0), lshr(ShVal1, ShAmt1)).
if (Or0->getOpcode() == BinaryOperator::LShr) {
std::swap(Or0, Or1);
@@ -550,303 +550,303 @@ Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) {
assert(Or0->getOpcode() == BinaryOperator::Shl &&
Or1->getOpcode() == BinaryOperator::LShr &&
"Illegal or(shift,shift) pair");
-
+
// Match the shift amount operands for a funnel/rotate pattern. This always
// matches a subtraction on the R operand.
auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
- // The shift amounts may add up to the narrow bit width:
+ // The shift amounts may add up to the narrow bit width:
// (shl ShVal0, L) | (lshr ShVal1, Width - L)
- if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L)))))
- return L;
-
+ if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L)))))
+ return L;
+
// The following patterns currently only work for rotation patterns.
// TODO: Add more general funnel-shift compatible patterns.
if (ShVal0 != ShVal1)
return nullptr;
- // The shift amount may be masked with negation:
+ // The shift amount may be masked with negation:
// (shl ShVal0, (X & (Width - 1))) | (lshr ShVal1, ((-X) & (Width - 1)))
- Value *X;
- unsigned Mask = Width - 1;
- if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
- match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
- return X;
-
- // Same as above, but the shift amount may be extended after masking:
- if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
- match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
- return X;
-
- return nullptr;
- };
-
- Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth);
+ Value *X;
+ unsigned Mask = Width - 1;
+ if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
+ match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
+ return X;
+
+ // Same as above, but the shift amount may be extended after masking:
+ if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
+ match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
+ return X;
+
+ return nullptr;
+ };
+
+ Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth);
bool IsFshl = true; // Sub on LSHR.
- if (!ShAmt) {
- ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth);
+ if (!ShAmt) {
+ ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth);
IsFshl = false; // Sub on SHL.
- }
- if (!ShAmt)
- return nullptr;
-
- // The shifted value must have high zeros in the wide type. Typically, this
- // will be a zext, but it could also be the result of an 'and' or 'shift'.
- unsigned WideWidth = Trunc.getSrcTy()->getScalarSizeInBits();
- APInt HiBitMask = APInt::getHighBitsSet(WideWidth, WideWidth - NarrowWidth);
+ }
+ if (!ShAmt)
+ return nullptr;
+
+ // The shifted value must have high zeros in the wide type. Typically, this
+ // will be a zext, but it could also be the result of an 'and' or 'shift'.
+ unsigned WideWidth = Trunc.getSrcTy()->getScalarSizeInBits();
+ APInt HiBitMask = APInt::getHighBitsSet(WideWidth, WideWidth - NarrowWidth);
if (!MaskedValueIsZero(ShVal0, HiBitMask, 0, &Trunc) ||
!MaskedValueIsZero(ShVal1, HiBitMask, 0, &Trunc))
- return nullptr;
-
- // We have an unnecessarily wide rotate!
+ return nullptr;
+
+ // We have an unnecessarily wide rotate!
// trunc (or (lshr ShVal0, ShAmt), (shl ShVal1, BitWidth - ShAmt))
- // Narrow the inputs and convert to funnel shift intrinsic:
- // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt))
- Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy);
+ // Narrow the inputs and convert to funnel shift intrinsic:
+ // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt))
+ Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy);
Value *X, *Y;
X = Y = Builder.CreateTrunc(ShVal0, DestTy);
if (ShVal0 != ShVal1)
Y = Builder.CreateTrunc(ShVal1, DestTy);
- Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
- Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy);
+ Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+ Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy);
return IntrinsicInst::Create(F, {X, Y, NarrowShAmt});
-}
-
-/// Try to narrow the width of math or bitwise logic instructions by pulling a
-/// truncate ahead of binary operators.
-/// TODO: Transforms for truncated shifts should be moved into here.
+}
+
+/// Try to narrow the width of math or bitwise logic instructions by pulling a
+/// truncate ahead of binary operators.
+/// TODO: Transforms for truncated shifts should be moved into here.
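+/// E.g. (illustrative): 'trunc (add i32 %x, 42) to i8' can be rewritten as
+/// 'add i8 (trunc %x), 42', and the original wide add then becomes dead.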
Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) {
- Type *SrcTy = Trunc.getSrcTy();
- Type *DestTy = Trunc.getType();
- if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
- return nullptr;
-
- BinaryOperator *BinOp;
- if (!match(Trunc.getOperand(0), m_OneUse(m_BinOp(BinOp))))
- return nullptr;
-
- Value *BinOp0 = BinOp->getOperand(0);
- Value *BinOp1 = BinOp->getOperand(1);
- switch (BinOp->getOpcode()) {
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul: {
- Constant *C;
- if (match(BinOp0, m_Constant(C))) {
- // trunc (binop C, X) --> binop (trunc C', X)
- Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
- Value *TruncX = Builder.CreateTrunc(BinOp1, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), NarrowC, TruncX);
- }
- if (match(BinOp1, m_Constant(C))) {
- // trunc (binop X, C) --> binop (trunc X, C')
- Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
- Value *TruncX = Builder.CreateTrunc(BinOp0, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), TruncX, NarrowC);
- }
- Value *X;
- if (match(BinOp0, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
- // trunc (binop (ext X), Y) --> binop X, (trunc Y)
- Value *NarrowOp1 = Builder.CreateTrunc(BinOp1, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), X, NarrowOp1);
- }
- if (match(BinOp1, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
- // trunc (binop Y, (ext X)) --> binop (trunc Y), X
- Value *NarrowOp0 = Builder.CreateTrunc(BinOp0, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), NarrowOp0, X);
- }
- break;
- }
-
- default: break;
- }
-
+ Type *SrcTy = Trunc.getSrcTy();
+ Type *DestTy = Trunc.getType();
+ if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
+ return nullptr;
+
+ BinaryOperator *BinOp;
+ if (!match(Trunc.getOperand(0), m_OneUse(m_BinOp(BinOp))))
+ return nullptr;
+
+ Value *BinOp0 = BinOp->getOperand(0);
+ Value *BinOp1 = BinOp->getOperand(1);
+ switch (BinOp->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul: {
+ Constant *C;
+ if (match(BinOp0, m_Constant(C))) {
+ // trunc (binop C, X) --> binop (trunc C', X)
+ Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
+ Value *TruncX = Builder.CreateTrunc(BinOp1, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), NarrowC, TruncX);
+ }
+ if (match(BinOp1, m_Constant(C))) {
+ // trunc (binop X, C) --> binop (trunc X, C')
+ Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
+ Value *TruncX = Builder.CreateTrunc(BinOp0, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), TruncX, NarrowC);
+ }
+ Value *X;
+ if (match(BinOp0, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
+ // trunc (binop (ext X), Y) --> binop X, (trunc Y)
+ Value *NarrowOp1 = Builder.CreateTrunc(BinOp1, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), X, NarrowOp1);
+ }
+ if (match(BinOp1, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
+ // trunc (binop Y, (ext X)) --> binop (trunc Y), X
+ Value *NarrowOp0 = Builder.CreateTrunc(BinOp0, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), NarrowOp0, X);
+ }
+ break;
+ }
+
+ default: break;
+ }
+
if (Instruction *NarrowOr = narrowFunnelShift(Trunc))
- return NarrowOr;
-
- return nullptr;
-}
-
-/// Try to narrow the width of a splat shuffle. This could be generalized to any
-/// shuffle with a constant operand, but we limit the transform to avoid
-/// creating a shuffle type that targets may not be able to lower effectively.
-static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
- InstCombiner::BuilderTy &Builder) {
- auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
- if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
- is_splat(Shuf->getShuffleMask()) &&
- Shuf->getType() == Shuf->getOperand(0)->getType()) {
- // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
- Constant *NarrowUndef = UndefValue::get(Trunc.getType());
- Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
- return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getShuffleMask());
- }
-
- return nullptr;
-}
-
-/// Try to narrow the width of an insert element. This could be generalized for
-/// any vector constant, but we limit the transform to insertion into undef to
-/// avoid potential backend problems from unsupported insertion widths. This
-/// could also be extended to handle the case of inserting a scalar constant
-/// into a vector variable.
-static Instruction *shrinkInsertElt(CastInst &Trunc,
- InstCombiner::BuilderTy &Builder) {
- Instruction::CastOps Opcode = Trunc.getOpcode();
- assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
- "Unexpected instruction for shrinking");
-
- auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
- if (!InsElt || !InsElt->hasOneUse())
- return nullptr;
-
- Type *DestTy = Trunc.getType();
- Type *DestScalarTy = DestTy->getScalarType();
- Value *VecOp = InsElt->getOperand(0);
- Value *ScalarOp = InsElt->getOperand(1);
- Value *Index = InsElt->getOperand(2);
-
- if (isa<UndefValue>(VecOp)) {
- // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
- // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
- UndefValue *NarrowUndef = UndefValue::get(DestTy);
- Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
- return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
- }
-
- return nullptr;
-}
-
+ return NarrowOr;
+
+ return nullptr;
+}
+
+/// Try to narrow the width of a splat shuffle. This could be generalized to any
+/// shuffle with a constant operand, but we limit the transform to avoid
+/// creating a shuffle type that targets may not be able to lower effectively.
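+/// E.g. (illustrative): truncating a splat of <4 x i32> %x to <4 x i8> can
+/// instead splat 'trunc <4 x i32> %x to <4 x i8>'.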
+static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
+ if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
+ is_splat(Shuf->getShuffleMask()) &&
+ Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
+ Constant *NarrowUndef = UndefValue::get(Trunc.getType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getShuffleMask());
+ }
+
+ return nullptr;
+}
+
+/// Try to narrow the width of an insert element. This could be generalized for
+/// any vector constant, but we limit the transform to insertion into undef to
+/// avoid potential backend problems from unsupported insertion widths. This
+/// could also be extended to handle the case of inserting a scalar constant
+/// into a vector variable.
+static Instruction *shrinkInsertElt(CastInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::CastOps Opcode = Trunc.getOpcode();
+ assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
+ "Unexpected instruction for shrinking");
+
+ auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
+ if (!InsElt || !InsElt->hasOneUse())
+ return nullptr;
+
+ Type *DestTy = Trunc.getType();
+ Type *DestScalarTy = DestTy->getScalarType();
+ Value *VecOp = InsElt->getOperand(0);
+ Value *ScalarOp = InsElt->getOperand(1);
+ Value *Index = InsElt->getOperand(2);
+
+ if (isa<UndefValue>(VecOp)) {
+ // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+ // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+ UndefValue *NarrowUndef = UndefValue::get(DestTy);
+ Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
+ return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
- if (Instruction *Result = commonCastTransforms(Trunc))
- return Result;
-
- Value *Src = Trunc.getOperand(0);
- Type *DestTy = Trunc.getType(), *SrcTy = Src->getType();
- unsigned DestWidth = DestTy->getScalarSizeInBits();
- unsigned SrcWidth = SrcTy->getScalarSizeInBits();
-
- // Attempt to truncate the entire input expression tree to the destination
- // type. Only do this if the dest type is a simple type, don't convert the
- // expression tree to something weird like i93 unless the source is also
- // strange.
- if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
- canEvaluateTruncated(Src, DestTy, *this, &Trunc)) {
-
-    // If this cast is a truncate, evaluating in a different type always
- // eliminates the cast, so it is always a win.
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid cast: "
- << Trunc << '\n');
- Value *Res = EvaluateInDifferentType(Src, DestTy, false);
- assert(Res->getType() == DestTy);
- return replaceInstUsesWith(Trunc, Res);
- }
-
- // For integer types, check if we can shorten the entire input expression to
- // DestWidth * 2, which won't allow removing the truncate, but reducing the
- // width may enable further optimizations, e.g. allowing for larger
- // vectorization factors.
- if (auto *DestITy = dyn_cast<IntegerType>(DestTy)) {
- if (DestWidth * 2 < SrcWidth) {
- auto *NewDestTy = DestITy->getExtendedType();
- if (shouldChangeType(SrcTy, NewDestTy) &&
- canEvaluateTruncated(Src, NewDestTy, *this, &Trunc)) {
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to reduce the width of operand of"
- << Trunc << '\n');
- Value *Res = EvaluateInDifferentType(Src, NewDestTy, false);
- return new TruncInst(Res, DestTy);
- }
- }
- }
-
- // Test if the trunc is the user of a select which is part of a
- // minimum or maximum operation. If so, don't do any more simplification.
- // Even simplifying demanded bits can break the canonical form of a
- // min/max.
- Value *LHS, *RHS;
- if (SelectInst *Sel = dyn_cast<SelectInst>(Src))
- if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
- return nullptr;
-
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(Trunc))
- return &Trunc;
-
- if (DestWidth == 1) {
- Value *Zero = Constant::getNullValue(SrcTy);
- if (DestTy->isIntegerTy()) {
- // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
- // TODO: We canonicalize to more instructions here because we are probably
- // lacking equivalent analysis for trunc relative to icmp. There may also
- // be codegen concerns. If those trunc limitations were removed, we could
- // remove this transform.
- Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
- return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
- }
-
- // For vectors, we do not canonicalize all truncs to icmp, so optimize
- // patterns that would be covered within visitICmpInst.
- Value *X;
- Constant *C;
- if (match(Src, m_OneUse(m_LShr(m_Value(X), m_Constant(C))))) {
- // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
- Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
- Constant *MaskC = ConstantExpr::getShl(One, C);
- Value *And = Builder.CreateAnd(X, MaskC);
- return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
- }
- if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_Constant(C)),
- m_Deferred(X))))) {
- // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
- Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
- Constant *MaskC = ConstantExpr::getShl(One, C);
- MaskC = ConstantExpr::getOr(MaskC, One);
- Value *And = Builder.CreateAnd(X, MaskC);
- return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
- }
- }
-
+ if (Instruction *Result = commonCastTransforms(Trunc))
+ return Result;
+
+ Value *Src = Trunc.getOperand(0);
+ Type *DestTy = Trunc.getType(), *SrcTy = Src->getType();
+ unsigned DestWidth = DestTy->getScalarSizeInBits();
+ unsigned SrcWidth = SrcTy->getScalarSizeInBits();
+
+ // Attempt to truncate the entire input expression tree to the destination
+ // type. Only do this if the dest type is a simple type, don't convert the
+ // expression tree to something weird like i93 unless the source is also
+ // strange.
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
+ canEvaluateTruncated(Src, DestTy, *this, &Trunc)) {
+
+    // If this cast is a truncate, evaluating in a different type always
+ // eliminates the cast, so it is always a win.
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid cast: "
+ << Trunc << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+ assert(Res->getType() == DestTy);
+ return replaceInstUsesWith(Trunc, Res);
+ }
+
+ // For integer types, check if we can shorten the entire input expression to
+ // DestWidth * 2, which won't allow removing the truncate, but reducing the
+ // width may enable further optimizations, e.g. allowing for larger
+ // vectorization factors.
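+  // E.g. (sketch): for 'trunc i64 %v to i8' we try to evaluate the feeding
+  // expression in i16; the trunc remains, but the arithmetic gets narrower.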
+ if (auto *DestITy = dyn_cast<IntegerType>(DestTy)) {
+ if (DestWidth * 2 < SrcWidth) {
+ auto *NewDestTy = DestITy->getExtendedType();
+ if (shouldChangeType(SrcTy, NewDestTy) &&
+ canEvaluateTruncated(Src, NewDestTy, *this, &Trunc)) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to reduce the width of operand of"
+ << Trunc << '\n');
+ Value *Res = EvaluateInDifferentType(Src, NewDestTy, false);
+ return new TruncInst(Res, DestTy);
+ }
+ }
+ }
+
+ // Test if the trunc is the user of a select which is part of a
+ // minimum or maximum operation. If so, don't do any more simplification.
+ // Even simplifying demanded bits can break the canonical form of a
+ // min/max.
+ Value *LHS, *RHS;
+ if (SelectInst *Sel = dyn_cast<SelectInst>(Src))
+ if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+
+ // See if we can simplify any instructions used by the input whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(Trunc))
+ return &Trunc;
+
+ if (DestWidth == 1) {
+ Value *Zero = Constant::getNullValue(SrcTy);
+ if (DestTy->isIntegerTy()) {
+ // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+ // TODO: We canonicalize to more instructions here because we are probably
+ // lacking equivalent analysis for trunc relative to icmp. There may also
+ // be codegen concerns. If those trunc limitations were removed, we could
+ // remove this transform.
+ Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+
+ // For vectors, we do not canonicalize all truncs to icmp, so optimize
+ // patterns that would be covered within visitICmpInst.
+ Value *X;
+ Constant *C;
+ if (match(Src, m_OneUse(m_LShr(m_Value(X), m_Constant(C))))) {
+ // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
+ Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+ Constant *MaskC = ConstantExpr::getShl(One, C);
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+ if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_Constant(C)),
+ m_Deferred(X))))) {
+ // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
+ Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+ Constant *MaskC = ConstantExpr::getShl(One, C);
+ MaskC = ConstantExpr::getOr(MaskC, One);
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+ }
+
Value *A;
Constant *C;
if (match(Src, m_LShr(m_SExt(m_Value(A)), m_Constant(C)))) {
- unsigned AWidth = A->getType()->getScalarSizeInBits();
- unsigned MaxShiftAmt = SrcWidth - std::max(DestWidth, AWidth);
+ unsigned AWidth = A->getType()->getScalarSizeInBits();
+ unsigned MaxShiftAmt = SrcWidth - std::max(DestWidth, AWidth);
auto *OldSh = cast<Instruction>(Src);
bool IsExact = OldSh->isExact();
-
- // If the shift is small enough, all zero bits created by the shift are
- // removed by the trunc.
+
+ // If the shift is small enough, all zero bits created by the shift are
+ // removed by the trunc.
if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULE,
APInt(SrcWidth, MaxShiftAmt)))) {
- // trunc (lshr (sext A), C) --> ashr A, C
- if (A->getType() == DestTy) {
+ // trunc (lshr (sext A), C) --> ashr A, C
+ if (A->getType() == DestTy) {
Constant *MaxAmt = ConstantInt::get(SrcTy, DestWidth - 1, false);
Constant *ShAmt = ConstantExpr::getUMin(C, MaxAmt);
ShAmt = ConstantExpr::getTrunc(ShAmt, A->getType());
ShAmt = Constant::mergeUndefsWith(ShAmt, C);
return IsExact ? BinaryOperator::CreateExactAShr(A, ShAmt)
: BinaryOperator::CreateAShr(A, ShAmt);
- }
- // The types are mismatched, so create a cast after shifting:
- // trunc (lshr (sext A), C) --> sext/trunc (ashr A, C)
- if (Src->hasOneUse()) {
+ }
+ // The types are mismatched, so create a cast after shifting:
+ // trunc (lshr (sext A), C) --> sext/trunc (ashr A, C)
+ if (Src->hasOneUse()) {
Constant *MaxAmt = ConstantInt::get(SrcTy, AWidth - 1, false);
Constant *ShAmt = ConstantExpr::getUMin(C, MaxAmt);
ShAmt = ConstantExpr::getTrunc(ShAmt, A->getType());
Value *Shift = Builder.CreateAShr(A, ShAmt, "", IsExact);
- return CastInst::CreateIntegerCast(Shift, DestTy, true);
- }
- }
- // TODO: Mask high bits with 'and'.
- }
-
+ return CastInst::CreateIntegerCast(Shift, DestTy, true);
+ }
+ }
+ // TODO: Mask high bits with 'and'.
+ }
+
// trunc (*shr (trunc A), C) --> trunc(*shr A, C)
if (match(Src, m_OneUse(m_Shr(m_Trunc(m_Value(A)), m_Constant(C))))) {
unsigned MaxShiftAmt = SrcWidth - DestWidth;
@@ -867,661 +867,661 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
}
}
- if (Instruction *I = narrowBinOp(Trunc))
- return I;
-
- if (Instruction *I = shrinkSplatShuffle(Trunc, Builder))
- return I;
-
- if (Instruction *I = shrinkInsertElt(Trunc, Builder))
- return I;
-
+ if (Instruction *I = narrowBinOp(Trunc))
+ return I;
+
+ if (Instruction *I = shrinkSplatShuffle(Trunc, Builder))
+ return I;
+
+ if (Instruction *I = shrinkInsertElt(Trunc, Builder))
+ return I;
+
if (Src->hasOneUse() &&
(isa<VectorType>(SrcTy) || shouldChangeType(SrcTy, DestTy))) {
- // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
- // dest type is native and cst < dest size.
+ // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
+ // dest type is native and cst < dest size.
if (match(Src, m_Shl(m_Value(A), m_Constant(C))) &&
- !match(A, m_Shr(m_Value(), m_Constant()))) {
-      // Skip a shift of a shift by a constant: narrowing would undo a combine
-      // in FoldShiftByConstant, and shl-of-shr is the extend-in-register
-      // pattern.
+ !match(A, m_Shr(m_Value(), m_Constant()))) {
+      // Skip a shift of a shift by a constant: narrowing would undo a combine
+      // in FoldShiftByConstant, and shl-of-shr is the extend-in-register
+      // pattern.
APInt Threshold = APInt(C->getType()->getScalarSizeInBits(), DestWidth);
if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold))) {
- Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
+ Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
return BinaryOperator::Create(Instruction::Shl, NewTrunc,
ConstantExpr::getTrunc(C, DestTy));
- }
- }
- }
-
- if (Instruction *I = foldVecTruncToExtElt(Trunc, *this))
- return I;
-
- // Whenever an element is extracted from a vector, and then truncated,
- // canonicalize by converting it to a bitcast followed by an
- // extractelement.
- //
- // Example (little endian):
- // trunc (extractelement <4 x i64> %X, 0) to i32
- // --->
- // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
- Value *VecOp;
+ }
+ }
+ }
+
+ if (Instruction *I = foldVecTruncToExtElt(Trunc, *this))
+ return I;
+
+ // Whenever an element is extracted from a vector, and then truncated,
+ // canonicalize by converting it to a bitcast followed by an
+ // extractelement.
+ //
+ // Example (little endian):
+ // trunc (extractelement <4 x i64> %X, 0) to i32
+ // --->
+ // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
+ Value *VecOp;
ConstantInt *Cst;
- if (match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst))))) {
- auto *VecOpTy = cast<VectorType>(VecOp->getType());
+ if (match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst))))) {
+ auto *VecOpTy = cast<VectorType>(VecOp->getType());
auto VecElts = VecOpTy->getElementCount();
-
-    // A destination size that does not evenly divide the source width would
-    // result in an invalid cast.
- if (SrcWidth % DestWidth == 0) {
- uint64_t TruncRatio = SrcWidth / DestWidth;
+
+    // A destination size that does not evenly divide the source width would
+    // result in an invalid cast.
+ if (SrcWidth % DestWidth == 0) {
+ uint64_t TruncRatio = SrcWidth / DestWidth;
uint64_t BitCastNumElts = VecElts.getKnownMinValue() * TruncRatio;
- uint64_t VecOpIdx = Cst->getZExtValue();
- uint64_t NewIdx = DL.isBigEndian() ? (VecOpIdx + 1) * TruncRatio - 1
- : VecOpIdx * TruncRatio;
- assert(BitCastNumElts <= std::numeric_limits<uint32_t>::max() &&
- "overflow 32-bits");
-
+ uint64_t VecOpIdx = Cst->getZExtValue();
+ uint64_t NewIdx = DL.isBigEndian() ? (VecOpIdx + 1) * TruncRatio - 1
+ : VecOpIdx * TruncRatio;
+ assert(BitCastNumElts <= std::numeric_limits<uint32_t>::max() &&
+ "overflow 32-bits");
+
auto *BitCastTo =
VectorType::get(DestTy, BitCastNumElts, VecElts.isScalable());
- Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
- return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
- }
- }
-
- return nullptr;
-}
-
+ Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
+ return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
+ }
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
bool DoTransform) {
-  // If we are just checking for an icmp eq of a single bit and zext'ing it
- // to an integer, then shift the bit to the appropriate place and then
- // cast to integer to avoid the comparison.
- const APInt *Op1CV;
- if (match(Cmp->getOperand(1), m_APInt(Op1CV))) {
-
- // zext (x <s 0) to i32 --> x>>u31 true if signbit set.
- // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
- if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) ||
- (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) {
- if (!DoTransform) return Cmp;
-
- Value *In = Cmp->getOperand(0);
- Value *Sh = ConstantInt::get(In->getType(),
- In->getType()->getScalarSizeInBits() - 1);
- In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit");
- if (In->getType() != Zext.getType())
- In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/);
-
- if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) {
- Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder.CreateXor(In, One, In->getName() + ".not");
- }
-
- return replaceInstUsesWith(Zext, In);
- }
-
- // zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
- // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
- // zext (X == 1) to i32 --> X iff X has only the low bit set.
- // zext (X == 2) to i32 --> X>>1 iff X has only the 2nd bit set.
- // zext (X != 0) to i32 --> X iff X has only the low bit set.
- // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
- // zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
- // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
- if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) &&
- // This only works for EQ and NE
- Cmp->isEquality()) {
-      // If Op1C is some other power of two, convert:
- KnownBits Known = computeKnownBits(Cmp->getOperand(0), 0, &Zext);
-
- APInt KnownZeroMask(~Known.Zero);
- if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
- if (!DoTransform) return Cmp;
-
- bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE;
- if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) {
- // (X&4) == 2 --> false
- // (X&4) != 2 --> true
- Constant *Res = ConstantInt::get(Zext.getType(), isNE);
- return replaceInstUsesWith(Zext, Res);
- }
-
- uint32_t ShAmt = KnownZeroMask.logBase2();
- Value *In = Cmp->getOperand(0);
- if (ShAmt) {
- // Perform a logical shr by shiftamt.
- // Insert the shift to put the result in the low bit.
- In = Builder.CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
- In->getName() + ".lobit");
- }
-
- if (!Op1CV->isNullValue() == isNE) { // Toggle the low bit.
- Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder.CreateXor(In, One);
- }
-
- if (Zext.getType() == In->getType())
- return replaceInstUsesWith(Zext, In);
-
- Value *IntCast = Builder.CreateIntCast(In, Zext.getType(), false);
- return replaceInstUsesWith(Zext, IntCast);
- }
- }
- }
-
-  // icmp ne A, B is equivalent to xor A, B when A and B have only a single
-  // bit that can differ.
- // It is also profitable to transform icmp eq into not(xor(A, B)) because that
- // may lead to additional simplifications.
- if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) {
- if (IntegerType *ITy = dyn_cast<IntegerType>(Zext.getType())) {
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
-
- KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext);
- KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext);
-
- if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) {
- APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
- APInt UnknownBit = ~KnownBits;
- if (UnknownBit.countPopulation() == 1) {
- if (!DoTransform) return Cmp;
-
- Value *Result = Builder.CreateXor(LHS, RHS);
-
- // Mask off any bits that are set and won't be shifted away.
- if (KnownLHS.One.uge(UnknownBit))
- Result = Builder.CreateAnd(Result,
- ConstantInt::get(ITy, UnknownBit));
-
- // Shift the bit we're testing down to the lsb.
- Result = Builder.CreateLShr(
- Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
-
- if (Cmp->getPredicate() == ICmpInst::ICMP_EQ)
- Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1));
- Result->takeName(Cmp);
- return replaceInstUsesWith(Zext, Result);
- }
- }
- }
- }
-
- return nullptr;
-}
-
-/// Determine if the specified value can be computed in the specified wider type
-/// and produce the same low bits. If not, return false.
-///
-/// If this function returns true, it can also return a non-zero number of bits
-/// (in BitsToClear) which indicates that the value it computes is correct for
-/// the zero extend, but that the additional BitsToClear bits need to be zeroed
-/// out. For example, to promote something like:
-///
-/// %B = trunc i64 %A to i32
-/// %C = lshr i32 %B, 8
-/// %E = zext i32 %C to i64
-///
-/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
-/// set to 8 to indicate that the promoted value needs to have bits 24-31
-/// cleared in addition to bits 32-63. Since an 'and' will be generated to
-/// clear the top bits anyway, doing this has no extra cost.
-///
-/// This function works on both vectors and scalars.
-static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
+  // If we are just checking for an icmp eq of a single bit and zext'ing it
+ // to an integer, then shift the bit to the appropriate place and then
+ // cast to integer to avoid the comparison.
+ const APInt *Op1CV;
+ if (match(Cmp->getOperand(1), m_APInt(Op1CV))) {
+
+ // zext (x <s 0) to i32 --> x>>u31 true if signbit set.
+ // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
+ if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) ||
+ (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) {
+ if (!DoTransform) return Cmp;
+
+ Value *In = Cmp->getOperand(0);
+ Value *Sh = ConstantInt::get(In->getType(),
+ In->getType()->getScalarSizeInBits() - 1);
+ In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit");
+ if (In->getType() != Zext.getType())
+ In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/);
+
+ if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) {
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = Builder.CreateXor(In, One, In->getName() + ".not");
+ }
+
+ return replaceInstUsesWith(Zext, In);
+ }
+
+ // zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ // zext (X == 1) to i32 --> X iff X has only the low bit set.
+ // zext (X == 2) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 0) to i32 --> X iff X has only the low bit set.
+ // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) &&
+ // This only works for EQ and NE
+ Cmp->isEquality()) {
+      // If Op1C is some other power of two, convert:
+ KnownBits Known = computeKnownBits(Cmp->getOperand(0), 0, &Zext);
+
+ APInt KnownZeroMask(~Known.Zero);
+ if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
+ if (!DoTransform) return Cmp;
+
+ bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE;
+ if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) {
+ // (X&4) == 2 --> false
+ // (X&4) != 2 --> true
+ Constant *Res = ConstantInt::get(Zext.getType(), isNE);
+ return replaceInstUsesWith(Zext, Res);
+ }
+
+ uint32_t ShAmt = KnownZeroMask.logBase2();
+ Value *In = Cmp->getOperand(0);
+ if (ShAmt) {
+ // Perform a logical shr by shiftamt.
+ // Insert the shift to put the result in the low bit.
+ In = Builder.CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
+ In->getName() + ".lobit");
+ }
+
+ if (!Op1CV->isNullValue() == isNE) { // Toggle the low bit.
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = Builder.CreateXor(In, One);
+ }
+
+ if (Zext.getType() == In->getType())
+ return replaceInstUsesWith(Zext, In);
+
+ Value *IntCast = Builder.CreateIntCast(In, Zext.getType(), false);
+ return replaceInstUsesWith(Zext, IntCast);
+ }
+ }
+ }
+
+  // icmp ne A, B is equivalent to xor A, B when A and B have only a single
+  // bit that can differ.
+ // It is also profitable to transform icmp eq into not(xor(A, B)) because that
+ // may lead to additional simplifications.
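+  // E.g. (illustrative): if bit 3 is the only bit where A and B can disagree,
+  // 'zext (icmp ne A, B)' is just that bit of 'xor A, B' shifted down to bit 0.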
+ if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Zext.getType())) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+
+ KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext);
+ KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext);
+
+ if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) {
+ APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
+ APInt UnknownBit = ~KnownBits;
+ if (UnknownBit.countPopulation() == 1) {
+ if (!DoTransform) return Cmp;
+
+ Value *Result = Builder.CreateXor(LHS, RHS);
+
+ // Mask off any bits that are set and won't be shifted away.
+ if (KnownLHS.One.uge(UnknownBit))
+ Result = Builder.CreateAnd(Result,
+ ConstantInt::get(ITy, UnknownBit));
+
+ // Shift the bit we're testing down to the lsb.
+ Result = Builder.CreateLShr(
+ Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
+
+ if (Cmp->getPredicate() == ICmpInst::ICMP_EQ)
+ Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1));
+ Result->takeName(Cmp);
+ return replaceInstUsesWith(Zext, Result);
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// Determine if the specified value can be computed in the specified wider type
+/// and produce the same low bits. If not, return false.
+///
+/// If this function returns true, it can also return a non-zero number of bits
+/// (in BitsToClear) which indicates that the value it computes is correct for
+/// the zero extend, but that the additional BitsToClear bits need to be zeroed
+/// out. For example, to promote something like:
+///
+/// %B = trunc i64 %A to i32
+/// %C = lshr i32 %B, 8
+/// %E = zext i32 %C to i64
+///
+/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
+/// set to 8 to indicate that the promoted value needs to have bits 24-31
+/// cleared in addition to bits 32-63. Since an 'and' will be generated to
+/// clear the top bits anyway, doing this has no extra cost.
+///
+/// This function works on both vectors and scalars.
+static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
InstCombinerImpl &IC, Instruction *CxtI) {
- BitsToClear = 0;
- if (canAlwaysEvaluateInType(V, Ty))
- return true;
- if (canNotEvaluateInType(V, Ty))
- return false;
-
- auto *I = cast<Instruction>(V);
- unsigned Tmp;
- switch (I->getOpcode()) {
- case Instruction::ZExt: // zext(zext(x)) -> zext(x).
- case Instruction::SExt: // zext(sext(x)) -> sext(x).
- case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
- return true;
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) ||
- !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))
- return false;
- // These can all be promoted if neither operand has 'bits to clear'.
- if (BitsToClear == 0 && Tmp == 0)
- return true;
-
- // If the operation is an AND/OR/XOR and the bits to clear are zero in the
- // other side, BitsToClear is ok.
- if (Tmp == 0 && I->isBitwiseLogicOp()) {
- // We use MaskedValueIsZero here for generality, but the case we care
- // about the most is constant RHS.
- unsigned VSize = V->getType()->getScalarSizeInBits();
- if (IC.MaskedValueIsZero(I->getOperand(1),
- APInt::getHighBitsSet(VSize, BitsToClear),
- 0, CxtI)) {
- // If this is an And instruction and all of the BitsToClear are
- // known to be zero we can reset BitsToClear.
- if (I->getOpcode() == Instruction::And)
- BitsToClear = 0;
- return true;
- }
- }
-
- // Otherwise, we don't know how to analyze this BitsToClear case yet.
- return false;
-
- case Instruction::Shl: {
- // We can promote shl(x, cst) if we can promote x. Since shl overwrites the
- // upper bits we can reduce BitsToClear by the shift amount.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
- if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
- return false;
- uint64_t ShiftAmt = Amt->getZExtValue();
- BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
- return true;
- }
- return false;
- }
- case Instruction::LShr: {
-    // We can promote lshr(x, cst) if we can promote x. This requires the
-    // ultimate 'and' to also clear out the high bits that the wide shift
-    // brings in, though.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
- if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
- return false;
- BitsToClear += Amt->getZExtValue();
- if (BitsToClear > V->getType()->getScalarSizeInBits())
- BitsToClear = V->getType()->getScalarSizeInBits();
- return true;
- }
- // Cannot promote variable LSHR.
- return false;
- }
- case Instruction::Select:
- if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) ||
- !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||
- // TODO: If important, we could handle the case when the BitsToClear are
- // known zero in the disagreeing side.
- Tmp != BitsToClear)
- return false;
- return true;
-
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))
- return false;
- for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
- if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||
- // TODO: If important, we could handle the case when the BitsToClear
- // are known zero in the disagreeing input.
- Tmp != BitsToClear)
- return false;
- return true;
- }
- default:
- // TODO: Can handle more cases here.
- return false;
- }
-}
-
+ BitsToClear = 0;
+ if (canAlwaysEvaluateInType(V, Ty))
+ return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
+
+ auto *I = cast<Instruction>(V);
+ unsigned Tmp;
+ switch (I->getOpcode()) {
+ case Instruction::ZExt: // zext(zext(x)) -> zext(x).
+ case Instruction::SExt: // zext(sext(x)) -> sext(x).
+ case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
+ return true;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) ||
+ !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))
+ return false;
+ // These can all be promoted if neither operand has 'bits to clear'.
+ if (BitsToClear == 0 && Tmp == 0)
+ return true;
+
+ // If the operation is an AND/OR/XOR and the bits to clear are zero in the
+ // other side, BitsToClear is ok.
+ if (Tmp == 0 && I->isBitwiseLogicOp()) {
+ // We use MaskedValueIsZero here for generality, but the case we care
+ // about the most is constant RHS.
+ unsigned VSize = V->getType()->getScalarSizeInBits();
+ if (IC.MaskedValueIsZero(I->getOperand(1),
+ APInt::getHighBitsSet(VSize, BitsToClear),
+ 0, CxtI)) {
+ // If this is an And instruction and all of the BitsToClear are
+ // known to be zero we can reset BitsToClear.
+ if (I->getOpcode() == Instruction::And)
+ BitsToClear = 0;
+ return true;
+ }
+ }
+
+ // Otherwise, we don't know how to analyze this BitsToClear case yet.
+ return false;
+
+ case Instruction::Shl: {
+ // We can promote shl(x, cst) if we can promote x. Since shl overwrites the
+ // upper bits we can reduce BitsToClear by the shift amount.
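+    // E.g. (sketch): if x needs its top 8 bits cleared and we shift left by 3,
+    // only 5 of those bits remain to clear; a shift by 8 or more clears all.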
+ const APInt *Amt;
+ if (match(I->getOperand(1), m_APInt(Amt))) {
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ uint64_t ShiftAmt = Amt->getZExtValue();
+ BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
+ return true;
+ }
+ return false;
+ }
+ case Instruction::LShr: {
+    // We can promote lshr(x, cst) if we can promote x. This requires the
+    // ultimate 'and' to also clear out the high bits that the wide shift
+    // brings in, though.
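+    // E.g. (sketch): for 'lshr i32 %x, 8', bits 24..31 of the narrow result
+    // are zero, but after promotion they may not be; BitsToClear grows by 8.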
+ const APInt *Amt;
+ if (match(I->getOperand(1), m_APInt(Amt))) {
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ BitsToClear += Amt->getZExtValue();
+ if (BitsToClear > V->getType()->getScalarSizeInBits())
+ BitsToClear = V->getType()->getScalarSizeInBits();
+ return true;
+ }
+ // Cannot promote variable LSHR.
+ return false;
+ }
+ case Instruction::Select:
+ if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) ||
+ !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||
+ // TODO: If important, we could handle the case when the BitsToClear are
+ // known zero in the disagreeing side.
+ Tmp != BitsToClear)
+ return false;
+ return true;
+
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||
+ // TODO: If important, we could handle the case when the BitsToClear
+ // are known zero in the disagreeing input.
+ Tmp != BitsToClear)
+ return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ return false;
+ }
+}
+
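For illustration only, a minimal standalone C++ sketch of the lshr case above (hypothetical function names, not from this file): the shift amount is added to BitsToClear, and visitZExt below removes those bits with a single mask once the whole tree is evaluated in the wide type.

    #include <cstdint>

    // The pattern the analysis walks: trunc, lshr in the narrow type, zext.
    uint32_t narrow(uint32_t x) {
      uint16_t t = (uint16_t)x;          // trunc i32 -> i16
      uint16_t s = (uint16_t)(t >> 8);   // lshr i16, 8: BitsToClear becomes 8
      return s;                          // zext i16 -> i32
    }

    // After promotion the whole tree runs in i32; the single AND emitted by
    // visitZExt keeps the low 16 - 8 = 8 source bits and clears the rest.
    uint32_t wide(uint32_t x) {
      return (x >> 8) & 0xffu;           // same value as narrow(x) for every x
    }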
Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
- // If this zero extend is only used by a truncate, let the truncate be
- // eliminated before we try to optimize this zext.
- if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
- return nullptr;
-
-  // If one of the common conversions will work, do it.
- if (Instruction *Result = commonCastTransforms(CI))
- return Result;
-
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType(), *DestTy = CI.getType();
-
- // Try to extend the entire expression tree to the wide destination type.
- unsigned BitsToClear;
- if (shouldChangeType(SrcTy, DestTy) &&
- canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
- assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
- "Can't clear more bits than in SrcTy");
-
- // Okay, we can transform this! Insert the new expression now.
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid zero extend: "
- << CI << '\n');
- Value *Res = EvaluateInDifferentType(Src, DestTy, false);
- assert(Res->getType() == DestTy);
-
- // Preserve debug values referring to Src if the zext is its last use.
- if (auto *SrcOp = dyn_cast<Instruction>(Src))
- if (SrcOp->hasOneUse())
- replaceAllDbgUsesWith(*SrcOp, *Res, CI, DT);
-
- uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
- uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
- // If the high bits are already filled with zeros, just replace this
- // cast with the result.
- if (MaskedValueIsZero(Res,
- APInt::getHighBitsSet(DestBitSize,
- DestBitSize-SrcBitsKept),
- 0, &CI))
- return replaceInstUsesWith(CI, Res);
-
- // We need to emit an AND to clear the high bits.
- Constant *C = ConstantInt::get(Res->getType(),
- APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
- return BinaryOperator::CreateAnd(Res, C);
- }
-
- // If this is a TRUNC followed by a ZEXT then we are dealing with integral
- // types and if the sizes are just right we can convert this into a logical
- // 'and' which will be much cheaper than the pair of casts.
- if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
- // TODO: Subsume this into EvaluateInDifferentType.
-
- // Get the sizes of the types involved. We know that the intermediate type
- // will be smaller than A or C, but don't know the relation between A and C.
- Value *A = CSrc->getOperand(0);
- unsigned SrcSize = A->getType()->getScalarSizeInBits();
- unsigned MidSize = CSrc->getType()->getScalarSizeInBits();
- unsigned DstSize = CI.getType()->getScalarSizeInBits();
- // If we're actually extending zero bits, then if
- // SrcSize < DstSize: zext(a & mask)
- // SrcSize == DstSize: a & mask
- // SrcSize > DstSize: trunc(a) & mask
- if (SrcSize < DstSize) {
- APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
- Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
- Value *And = Builder.CreateAnd(A, AndConst, CSrc->getName() + ".mask");
- return new ZExtInst(And, CI.getType());
- }
-
- if (SrcSize == DstSize) {
- APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
- return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
- AndValue));
- }
- if (SrcSize > DstSize) {
- Value *Trunc = Builder.CreateTrunc(A, CI.getType());
- APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
- return BinaryOperator::CreateAnd(Trunc,
- ConstantInt::get(Trunc->getType(),
- AndValue));
- }
- }
-
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src))
- return transformZExtICmp(Cmp, CI);
-
- BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
- if (SrcI && SrcI->getOpcode() == Instruction::Or) {
- // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one
- // of the (zext icmp) can be eliminated. If so, immediately perform the
- // according elimination.
- ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
- ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
- if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+ // If this zero extend is only used by a truncate, let the truncate be
+ // eliminated before we try to optimize this zext.
+ if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
+ return nullptr;
+
+  // If one of the common conversions will work, do it.
+ if (Instruction *Result = commonCastTransforms(CI))
+ return Result;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+ // Try to extend the entire expression tree to the wide destination type.
+ unsigned BitsToClear;
+ if (shouldChangeType(SrcTy, DestTy) &&
+ canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
+ assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
+ "Can't clear more bits than in SrcTy");
+
+ // Okay, we can transform this! Insert the new expression now.
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid zero extend: "
+ << CI << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+ assert(Res->getType() == DestTy);
+
+ // Preserve debug values referring to Src if the zext is its last use.
+ if (auto *SrcOp = dyn_cast<Instruction>(Src))
+ if (SrcOp->hasOneUse())
+ replaceAllDbgUsesWith(*SrcOp, *Res, CI, DT);
+
+ uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
+ uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+ // If the high bits are already filled with zeros, just replace this
+ // cast with the result.
+ if (MaskedValueIsZero(Res,
+ APInt::getHighBitsSet(DestBitSize,
+ DestBitSize-SrcBitsKept),
+ 0, &CI))
+ return replaceInstUsesWith(CI, Res);
+
+ // We need to emit an AND to clear the high bits.
+ Constant *C = ConstantInt::get(Res->getType(),
+ APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
+ return BinaryOperator::CreateAnd(Res, C);
+ }
+
+ // If this is a TRUNC followed by a ZEXT then we are dealing with integral
+ // types and if the sizes are just right we can convert this into a logical
+ // 'and' which will be much cheaper than the pair of casts.
+ if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
+ // TODO: Subsume this into EvaluateInDifferentType.
+
+ // Get the sizes of the types involved. We know that the intermediate type
+ // will be smaller than A or C, but don't know the relation between A and C.
+ Value *A = CSrc->getOperand(0);
+ unsigned SrcSize = A->getType()->getScalarSizeInBits();
+ unsigned MidSize = CSrc->getType()->getScalarSizeInBits();
+ unsigned DstSize = CI.getType()->getScalarSizeInBits();
+ // If we're actually extending zero bits, then if
+ // SrcSize < DstSize: zext(a & mask)
+ // SrcSize == DstSize: a & mask
+ // SrcSize > DstSize: trunc(a) & mask
+ if (SrcSize < DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
+ Value *And = Builder.CreateAnd(A, AndConst, CSrc->getName() + ".mask");
+ return new ZExtInst(And, CI.getType());
+ }
+
+ if (SrcSize == DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
+ AndValue));
+ }
+ if (SrcSize > DstSize) {
+ Value *Trunc = Builder.CreateTrunc(A, CI.getType());
+ APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+ return BinaryOperator::CreateAnd(Trunc,
+ ConstantInt::get(Trunc->getType(),
+ AndValue));
+ }
+ }
+
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src))
+ return transformZExtICmp(Cmp, CI);
+
+ BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
+ if (SrcI && SrcI->getOpcode() == Instruction::Or) {
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one
+ // of the (zext icmp) can be eliminated. If so, immediately perform the
+ // according elimination.
+ ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
+ ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
+ if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType() &&
- (transformZExtICmp(LHS, CI, false) ||
- transformZExtICmp(RHS, CI, false))) {
- // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
- Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName());
- Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName());
- Value *Or = Builder.CreateOr(LCast, RCast, CI.getName());
- if (auto *OrInst = dyn_cast<Instruction>(Or))
- Builder.SetInsertPoint(OrInst);
-
- // Perform the elimination.
- if (auto *LZExt = dyn_cast<ZExtInst>(LCast))
- transformZExtICmp(LHS, *LZExt);
- if (auto *RZExt = dyn_cast<ZExtInst>(RCast))
- transformZExtICmp(RHS, *RZExt);
-
- return replaceInstUsesWith(CI, Or);
- }
- }
-
- // zext(trunc(X) & C) -> (X & zext(C)).
- Constant *C;
- Value *X;
- if (SrcI &&
- match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) &&
- X->getType() == CI.getType())
- return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType()));
-
- // zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)).
- Value *And;
- if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) &&
- match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) &&
- X->getType() == CI.getType()) {
- Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC);
- }
-
- return nullptr;
-}
-
-/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.
+ (transformZExtICmp(LHS, CI, false) ||
+ transformZExtICmp(RHS, CI, false))) {
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
+ Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName());
+ Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName());
+ Value *Or = Builder.CreateOr(LCast, RCast, CI.getName());
+ if (auto *OrInst = dyn_cast<Instruction>(Or))
+ Builder.SetInsertPoint(OrInst);
+
+ // Perform the elimination.
+ if (auto *LZExt = dyn_cast<ZExtInst>(LCast))
+ transformZExtICmp(LHS, *LZExt);
+ if (auto *RZExt = dyn_cast<ZExtInst>(RCast))
+ transformZExtICmp(RHS, *RZExt);
+
+ return replaceInstUsesWith(CI, Or);
+ }
+ }
+
+ // zext(trunc(X) & C) -> (X & zext(C)).
+ Constant *C;
+ Value *X;
+ if (SrcI &&
+ match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) &&
+ X->getType() == CI.getType())
+ return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType()));
+
+ // zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)).
+ Value *And;
+ if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) &&
+ match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) &&
+ X->getType() == CI.getType()) {
+ Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC);
+ }
+
+ return nullptr;
+}
+
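For illustration, the trunc-then-zext case above with SrcSize == DstSize, written as a standalone C++ sketch (hypothetical name): the pair of casts collapses into one mask of the low MidSize bits.

    #include <cstdint>

    uint32_t zext_of_trunc(uint32_t a) {
      uint8_t t = (uint8_t)a;   // trunc i32 -> i8 (MidSize = 8)
      return (uint32_t)t;       // zext i8 -> i32 (SrcSize == DstSize == 32)
    }
    // ... which the SrcSize == DstSize branch rewrites as:  return a & 0xffu;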
+/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.
Instruction *InstCombinerImpl::transformSExtICmp(ICmpInst *ICI,
Instruction &CI) {
- Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
- ICmpInst::Predicate Pred = ICI->getPredicate();
-
- // Don't bother if Op1 isn't of vector or integer type.
- if (!Op1->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- if ((Pred == ICmpInst::ICMP_SLT && match(Op1, m_ZeroInt())) ||
- (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))) {
- // (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative
- // (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive
- Value *Sh = ConstantInt::get(Op0->getType(),
- Op0->getType()->getScalarSizeInBits() - 1);
- Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
- if (In->getType() != CI.getType())
- In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
-
- if (Pred == ICmpInst::ICMP_SGT)
- In = Builder.CreateNot(In, In->getName() + ".not");
- return replaceInstUsesWith(CI, In);
- }
-
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- // If we know that only one bit of the LHS of the icmp can be set and we
- // have an equality comparison with zero or a power of 2, we can transform
- // the icmp and sext into bitwise/integer operations.
- if (ICI->hasOneUse() &&
- ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
- KnownBits Known = computeKnownBits(Op0, 0, &CI);
-
- APInt KnownZeroMask(~Known.Zero);
- if (KnownZeroMask.isPowerOf2()) {
- Value *In = ICI->getOperand(0);
-
- // If the icmp tests for a known zero bit we can constant fold it.
- if (!Op1C->isZero() && Op1C->getValue() != KnownZeroMask) {
- Value *V = Pred == ICmpInst::ICMP_NE ?
- ConstantInt::getAllOnesValue(CI.getType()) :
- ConstantInt::getNullValue(CI.getType());
- return replaceInstUsesWith(CI, V);
- }
-
- if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
- // sext ((x & 2^n) == 0) -> (x >> n) - 1
- // sext ((x & 2^n) != 2^n) -> (x >> n) - 1
- unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
- // Perform a right shift to place the desired bit in the LSB.
- if (ShiftAmt)
- In = Builder.CreateLShr(In,
- ConstantInt::get(In->getType(), ShiftAmt));
-
- // At this point "In" is either 1 or 0. Subtract 1 to turn
- // {1, 0} -> {0, -1}.
- In = Builder.CreateAdd(In,
- ConstantInt::getAllOnesValue(In->getType()),
- "sext");
- } else {
- // sext ((x & 2^n) != 0) -> (x << bitwidth-n) a>> bitwidth-1
- // sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
- unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
- // Perform a left shift to place the desired bit in the MSB.
- if (ShiftAmt)
- In = Builder.CreateShl(In,
- ConstantInt::get(In->getType(), ShiftAmt));
-
- // Distribute the bit over the whole bit width.
- In = Builder.CreateAShr(In, ConstantInt::get(In->getType(),
- KnownZeroMask.getBitWidth() - 1), "sext");
- }
-
- if (CI.getType() == In->getType())
- return replaceInstUsesWith(CI, In);
- return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
- }
- }
- }
-
- return nullptr;
-}
-
-/// Return true if we can take the specified value and return it as type Ty
-/// without inserting any new casts and without changing the value of the common
-/// low bits. This is used by code that tries to promote integer operations to
-/// a wider type, which will allow us to eliminate the extension.
-///
-/// This function works on both vectors and scalars.
-///
-static bool canEvaluateSExtd(Value *V, Type *Ty) {
- assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
- "Can't sign extend type to a smaller type");
- if (canAlwaysEvaluateInType(V, Ty))
- return true;
- if (canNotEvaluateInType(V, Ty))
- return false;
-
- auto *I = cast<Instruction>(V);
- switch (I->getOpcode()) {
- case Instruction::SExt: // sext(sext(x)) -> sext(x)
- case Instruction::ZExt: // sext(zext(x)) -> zext(x)
- case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
- return true;
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- // These operators can all arbitrarily be extended if their inputs can.
- return canEvaluateSExtd(I->getOperand(0), Ty) &&
- canEvaluateSExtd(I->getOperand(1), Ty);
-
- //case Instruction::Shl: TODO
- //case Instruction::LShr: TODO
-
- case Instruction::Select:
- return canEvaluateSExtd(I->getOperand(1), Ty) &&
- canEvaluateSExtd(I->getOperand(2), Ty);
-
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!canEvaluateSExtd(IncValue, Ty)) return false;
- return true;
- }
- default:
- // TODO: Can handle more cases here.
- break;
- }
-
- return false;
-}
-
+ Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+
+ // Don't bother if Op1 isn't of vector or integer type.
+ if (!Op1->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ if ((Pred == ICmpInst::ICMP_SLT && match(Op1, m_ZeroInt())) ||
+ (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))) {
+ // (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative
+ // (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive
+ Value *Sh = ConstantInt::get(Op0->getType(),
+ Op0->getType()->getScalarSizeInBits() - 1);
+ Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
+ if (In->getType() != CI.getType())
+ In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
+
+ if (Pred == ICmpInst::ICMP_SGT)
+ In = Builder.CreateNot(In, In->getName() + ".not");
+ return replaceInstUsesWith(CI, In);
+ }
+
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ // If we know that only one bit of the LHS of the icmp can be set and we
+ // have an equality comparison with zero or a power of 2, we can transform
+ // the icmp and sext into bitwise/integer operations.
+ if (ICI->hasOneUse() &&
+ ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
+ KnownBits Known = computeKnownBits(Op0, 0, &CI);
+
+ APInt KnownZeroMask(~Known.Zero);
+ if (KnownZeroMask.isPowerOf2()) {
+ Value *In = ICI->getOperand(0);
+
+ // If the icmp tests for a known zero bit we can constant fold it.
+ if (!Op1C->isZero() && Op1C->getValue() != KnownZeroMask) {
+ Value *V = Pred == ICmpInst::ICMP_NE ?
+ ConstantInt::getAllOnesValue(CI.getType()) :
+ ConstantInt::getNullValue(CI.getType());
+ return replaceInstUsesWith(CI, V);
+ }
+
+ if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
+ // sext ((x & 2^n) == 0) -> (x >> n) - 1
+ // sext ((x & 2^n) != 2^n) -> (x >> n) - 1
+ unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
+ // Perform a right shift to place the desired bit in the LSB.
+ if (ShiftAmt)
+ In = Builder.CreateLShr(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
+
+ // At this point "In" is either 1 or 0. Subtract 1 to turn
+ // {1, 0} -> {0, -1}.
+ In = Builder.CreateAdd(In,
+ ConstantInt::getAllOnesValue(In->getType()),
+ "sext");
+ } else {
+ // sext ((x & 2^n) != 0) -> (x << bitwidth-n) a>> bitwidth-1
+ // sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
+ unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
+ // Perform a left shift to place the desired bit in the MSB.
+ if (ShiftAmt)
+ In = Builder.CreateShl(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
+
+ // Distribute the bit over the whole bit width.
+ In = Builder.CreateAShr(In, ConstantInt::get(In->getType(),
+ KnownZeroMask.getBitWidth() - 1), "sext");
+ }
+
+ if (CI.getType() == In->getType())
+ return replaceInstUsesWith(CI, In);
+ return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
+ }
+ }
+ }
+
+ return nullptr;
+}
+
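For illustration, the first two rewrites above expressed as standalone C++ (hypothetical names; 32-bit int and two's-complement arithmetic shift assumed):

    #include <cstdint>

    // sext (icmp slt X, 0): all ones iff X is negative, i.e. ashr by bitwidth-1.
    int32_t neg_splat(int32_t x)    { return (x < 0)  ? -1 : 0; }  // -> x >> 31
    // sext (icmp sgt X, -1): the complemented shift.
    int32_t nonneg_splat(int32_t x) { return (x > -1) ? -1 : 0; }  // -> ~(x >> 31)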
+/// Return true if we can take the specified value and return it as type Ty
+/// without inserting any new casts and without changing the value of the common
+/// low bits. This is used by code that tries to promote integer operations to
+/// a wider type, which will allow us to eliminate the extension.
+///
+/// This function works on both vectors and scalars.
+///
+static bool canEvaluateSExtd(Value *V, Type *Ty) {
+ assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
+ "Can't sign extend type to a smaller type");
+ if (canAlwaysEvaluateInType(V, Ty))
+ return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
+
+ auto *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::SExt: // sext(sext(x)) -> sext(x)
+ case Instruction::ZExt: // sext(zext(x)) -> zext(x)
+ case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
+ return true;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ // These operators can all arbitrarily be extended if their inputs can.
+ return canEvaluateSExtd(I->getOperand(0), Ty) &&
+ canEvaluateSExtd(I->getOperand(1), Ty);
+
+ //case Instruction::Shl: TODO
+ //case Instruction::LShr: TODO
+
+ case Instruction::Select:
+ return canEvaluateSExtd(I->getOperand(1), Ty) &&
+ canEvaluateSExtd(I->getOperand(2), Ty);
+
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateSExtd(IncValue, Ty)) return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ break;
+ }
+
+ return false;
+}
+
Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
- // If this sign extend is only used by a truncate, let the truncate be
- // eliminated before we try to optimize this sext.
- if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
- return nullptr;
-
- if (Instruction *I = commonCastTransforms(CI))
- return I;
-
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType(), *DestTy = CI.getType();
-
- // If we know that the value being extended is positive, we can use a zext
- // instead.
- KnownBits Known = computeKnownBits(Src, 0, &CI);
- if (Known.isNonNegative())
- return CastInst::Create(Instruction::ZExt, Src, DestTy);
-
- // Try to extend the entire expression tree to the wide destination type.
- if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
- // Okay, we can transform this! Insert the new expression now.
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid sign extend: "
- << CI << '\n');
- Value *Res = EvaluateInDifferentType(Src, DestTy, true);
- assert(Res->getType() == DestTy);
-
- uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
- uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
- // If the high bits are already filled with sign bit, just replace this
- // cast with the result.
- if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize)
- return replaceInstUsesWith(CI, Res);
-
- // We need to emit a shl + ashr to do the sign extend.
- Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
- return BinaryOperator::CreateAShr(Builder.CreateShl(Res, ShAmt, "sext"),
- ShAmt);
- }
-
- // If the input is a trunc from the destination type, then turn sext(trunc(x))
- // into shifts.
- Value *X;
- if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
- // sext(trunc(X)) --> ashr(shl(X, C), C)
- unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
- unsigned DestBitSize = DestTy->getScalarSizeInBits();
- Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
- return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShAmt), ShAmt);
- }
-
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
- return transformSExtICmp(ICI, CI);
-
- // If the input is a shl/ashr pair of a same constant, then this is a sign
- // extension from a smaller value. If we could trust arbitrary bitwidth
- // integers, we could turn this into a truncate to the smaller bit and then
- // use a sext for the whole extension. Since we don't, look deeper and check
- // for a truncate. If the source and dest are the same type, eliminate the
- // trunc and extend and just do shifts. For example, turn:
- // %a = trunc i32 %i to i8
+ // If this sign extend is only used by a truncate, let the truncate be
+ // eliminated before we try to optimize this sext.
+ if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
+ return nullptr;
+
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+ // If we know that the value being extended is positive, we can use a zext
+ // instead.
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative())
+ return CastInst::Create(Instruction::ZExt, Src, DestTy);
+
+ // Try to extend the entire expression tree to the wide destination type.
+ if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
+ // Okay, we can transform this! Insert the new expression now.
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid sign extend: "
+ << CI << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, true);
+ assert(Res->getType() == DestTy);
+
+ uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
+ uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+ // If the high bits are already filled with sign bit, just replace this
+ // cast with the result.
+ if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize)
+ return replaceInstUsesWith(CI, Res);
+
+ // We need to emit a shl + ashr to do the sign extend.
+ Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder.CreateShl(Res, ShAmt, "sext"),
+ ShAmt);
+ }
+
+ // If the input is a trunc from the destination type, then turn sext(trunc(x))
+ // into shifts.
+ Value *X;
+ if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
+ // sext(trunc(X)) --> ashr(shl(X, C), C)
+ unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+ unsigned DestBitSize = DestTy->getScalarSizeInBits();
+ Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShAmt), ShAmt);
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+ return transformSExtICmp(ICI, CI);
+
+ // If the input is a shl/ashr pair of a same constant, then this is a sign
+ // extension from a smaller value. If we could trust arbitrary bitwidth
+ // integers, we could turn this into a truncate to the smaller bit and then
+ // use a sext for the whole extension. Since we don't, look deeper and check
+ // for a truncate. If the source and dest are the same type, eliminate the
+ // trunc and extend and just do shifts. For example, turn:
+ // %a = trunc i32 %i to i8
// %b = shl i8 %a, C
// %c = ashr i8 %b, C
- // %d = sext i8 %c to i32
- // into:
+ // %d = sext i8 %c to i32
+ // into:
// %a = shl i32 %i, 32-(8-C)
// %d = ashr i32 %a, 32-(8-C)
- Value *A = nullptr;
- // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
- Constant *BA = nullptr, *CA = nullptr;
- if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_Constant(BA)),
- m_Constant(CA))) &&
+ Value *A = nullptr;
+ // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
+ Constant *BA = nullptr, *CA = nullptr;
+ if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_Constant(BA)),
+ m_Constant(CA))) &&
BA->isElementWiseEqual(CA) && A->getType() == DestTy) {
Constant *WideCurrShAmt = ConstantExpr::getSExt(CA, DestTy);
Constant *NumLowbitsLeft = ConstantExpr::getSub(
@@ -1533,445 +1533,445 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
Constant::mergeUndefsWith(Constant::mergeUndefsWith(NewShAmt, BA), CA);
A = Builder.CreateShl(A, NewShAmt, CI.getName());
return BinaryOperator::CreateAShr(A, NewShAmt);
- }
-
- return nullptr;
-}
-
-/// Return true if the specified floating-point constant fits in the specified
-/// FP type without changing its value.
-static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
- bool losesInfo;
- APFloat F = CFP->getValueAPF();
- (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
- return !losesInfo;
-}
-
-static Type *shrinkFPConstant(ConstantFP *CFP) {
- if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
- return nullptr; // No constant folding of this.
- // See if the value can be truncated to half and then reextended.
- if (fitsInFPType(CFP, APFloat::IEEEhalf()))
- return Type::getHalfTy(CFP->getContext());
- // See if the value can be truncated to float and then reextended.
- if (fitsInFPType(CFP, APFloat::IEEEsingle()))
- return Type::getFloatTy(CFP->getContext());
- if (CFP->getType()->isDoubleTy())
- return nullptr; // Won't shrink.
- if (fitsInFPType(CFP, APFloat::IEEEdouble()))
- return Type::getDoubleTy(CFP->getContext());
- // Don't try to shrink to various long double types.
- return nullptr;
-}
-
-// Determine if this is a vector of ConstantFPs and if so, return the minimal
-// type we can safely truncate all elements to.
-// TODO: Make these support undef elements.
-static Type *shrinkFPConstantVector(Value *V) {
- auto *CV = dyn_cast<Constant>(V);
- auto *CVVTy = dyn_cast<VectorType>(V->getType());
- if (!CV || !CVVTy)
- return nullptr;
-
- Type *MinType = nullptr;
-
+ }
+
+ return nullptr;
+}
+
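For illustration, the trunc-feeding-sext case above as a standalone C++ sketch (hypothetical name; two's complement assumed): when source and destination types match, the cast pair becomes the shl/ashr pair from the comment.

    #include <cstdint>

    int32_t sext_of_trunc(int32_t i) {
      int8_t a = (int8_t)i;   // trunc i32 -> i8
      return (int32_t)a;      // sext i8 -> i32
    }
    // ... which the fold rewrites in IR as:
    //   %s = shl i32 %i, 24
    //   %d = ashr i32 %s, 24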
+/// Return true if the specified floating-point constant fits in the specified
+/// FP type without changing its value.
+static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+ bool losesInfo;
+ APFloat F = CFP->getValueAPF();
+ (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
+ return !losesInfo;
+}
+
+static Type *shrinkFPConstant(ConstantFP *CFP) {
+ if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
+ return nullptr; // No constant folding of this.
+ // See if the value can be truncated to half and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEhalf()))
+ return Type::getHalfTy(CFP->getContext());
+ // See if the value can be truncated to float and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEsingle()))
+ return Type::getFloatTy(CFP->getContext());
+ if (CFP->getType()->isDoubleTy())
+ return nullptr; // Won't shrink.
+ if (fitsInFPType(CFP, APFloat::IEEEdouble()))
+ return Type::getDoubleTy(CFP->getContext());
+ // Don't try to shrink to various long double types.
+ return nullptr;
+}
+
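As a plain C++ analogy for the round-trip test these helpers perform (not the APFloat code itself; NaN payloads ignored):

    bool fits_in_float(double d) {
      // Truncate to float and re-extend; the value "fits" if it is unchanged.
      return (double)(float)d == d;
    }
    // fits_in_float(2.0) -> true   (2.0 can be shrunk to 2.0f)
    // fits_in_float(0.1) -> false  (0.1 has no exact float representation)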
+// Determine if this is a vector of ConstantFPs and if so, return the minimal
+// type we can safely truncate all elements to.
+// TODO: Make these support undef elements.
+static Type *shrinkFPConstantVector(Value *V) {
+ auto *CV = dyn_cast<Constant>(V);
+ auto *CVVTy = dyn_cast<VectorType>(V->getType());
+ if (!CV || !CVVTy)
+ return nullptr;
+
+ Type *MinType = nullptr;
+
unsigned NumElts = cast<FixedVectorType>(CVVTy)->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
- if (!CFP)
- return nullptr;
-
- Type *T = shrinkFPConstant(CFP);
- if (!T)
- return nullptr;
-
- // If we haven't found a type yet or this type has a larger mantissa than
- // our previous type, this is our new minimal type.
- if (!MinType || T->getFPMantissaWidth() > MinType->getFPMantissaWidth())
- MinType = T;
- }
-
- // Make a vector type from the minimal type.
- return FixedVectorType::get(MinType, NumElts);
-}
-
-/// Find the minimum FP type we can safely truncate to.
-static Type *getMinimumFPType(Value *V) {
- if (auto *FPExt = dyn_cast<FPExtInst>(V))
- return FPExt->getOperand(0)->getType();
-
- // If this value is a constant, return the constant in the smallest FP type
- // that can accurately represent it. This allows us to turn
- // (float)((double)X+2.0) into x+2.0f.
- if (auto *CFP = dyn_cast<ConstantFP>(V))
- if (Type *T = shrinkFPConstant(CFP))
- return T;
-
- // Try to shrink a vector of FP constants.
- if (Type *T = shrinkFPConstantVector(V))
- return T;
-
- return V->getType();
-}
-
-/// Return true if the cast from integer to FP can be proven to be exact for all
-/// possible inputs (the conversion does not lose any precision).
-static bool isKnownExactCastIntToFP(CastInst &I) {
- CastInst::CastOps Opcode = I.getOpcode();
- assert((Opcode == CastInst::SIToFP || Opcode == CastInst::UIToFP) &&
- "Unexpected cast");
- Value *Src = I.getOperand(0);
- Type *SrcTy = Src->getType();
- Type *FPTy = I.getType();
- bool IsSigned = Opcode == Instruction::SIToFP;
- int SrcSize = (int)SrcTy->getScalarSizeInBits() - IsSigned;
-
- // Easy case - if the source integer type has less bits than the FP mantissa,
- // then the cast must be exact.
- int DestNumSigBits = FPTy->getFPMantissaWidth();
- if (SrcSize <= DestNumSigBits)
- return true;
-
- // Cast from FP to integer and back to FP is independent of the intermediate
- // integer width because of poison on overflow.
- Value *F;
- if (match(Src, m_FPToSI(m_Value(F))) || match(Src, m_FPToUI(m_Value(F)))) {
- // If this is uitofp (fptosi F), the source needs an extra bit to avoid
- // potential rounding of negative FP input values.
- int SrcNumSigBits = F->getType()->getFPMantissaWidth();
- if (!IsSigned && match(Src, m_FPToSI(m_Value())))
- SrcNumSigBits++;
-
- // [su]itofp (fpto[su]i F) --> exact if the source type has less or equal
- // significant bits than the destination (and make sure neither type is
- // weird -- ppc_fp128).
- if (SrcNumSigBits > 0 && DestNumSigBits > 0 &&
- SrcNumSigBits <= DestNumSigBits)
- return true;
- }
-
- // TODO:
- // Try harder to find if the source integer type has less significant bits.
- // For example, compute number of sign bits or compute low bit mask.
- return false;
-}
-
+ for (unsigned i = 0; i != NumElts; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
+ if (!CFP)
+ return nullptr;
+
+ Type *T = shrinkFPConstant(CFP);
+ if (!T)
+ return nullptr;
+
+ // If we haven't found a type yet or this type has a larger mantissa than
+ // our previous type, this is our new minimal type.
+ if (!MinType || T->getFPMantissaWidth() > MinType->getFPMantissaWidth())
+ MinType = T;
+ }
+
+ // Make a vector type from the minimal type.
+ return FixedVectorType::get(MinType, NumElts);
+}
+
+/// Find the minimum FP type we can safely truncate to.
+static Type *getMinimumFPType(Value *V) {
+ if (auto *FPExt = dyn_cast<FPExtInst>(V))
+ return FPExt->getOperand(0)->getType();
+
+ // If this value is a constant, return the constant in the smallest FP type
+ // that can accurately represent it. This allows us to turn
+ // (float)((double)X+2.0) into x+2.0f.
+ if (auto *CFP = dyn_cast<ConstantFP>(V))
+ if (Type *T = shrinkFPConstant(CFP))
+ return T;
+
+ // Try to shrink a vector of FP constants.
+ if (Type *T = shrinkFPConstantVector(V))
+ return T;
+
+ return V->getType();
+}
+
+/// Return true if the cast from integer to FP can be proven to be exact for all
+/// possible inputs (the conversion does not lose any precision).
+static bool isKnownExactCastIntToFP(CastInst &I) {
+ CastInst::CastOps Opcode = I.getOpcode();
+ assert((Opcode == CastInst::SIToFP || Opcode == CastInst::UIToFP) &&
+ "Unexpected cast");
+ Value *Src = I.getOperand(0);
+ Type *SrcTy = Src->getType();
+ Type *FPTy = I.getType();
+ bool IsSigned = Opcode == Instruction::SIToFP;
+ int SrcSize = (int)SrcTy->getScalarSizeInBits() - IsSigned;
+
+ // Easy case - if the source integer type has less bits than the FP mantissa,
+ // then the cast must be exact.
+ int DestNumSigBits = FPTy->getFPMantissaWidth();
+ if (SrcSize <= DestNumSigBits)
+ return true;
+
+ // Cast from FP to integer and back to FP is independent of the intermediate
+ // integer width because of poison on overflow.
+ Value *F;
+ if (match(Src, m_FPToSI(m_Value(F))) || match(Src, m_FPToUI(m_Value(F)))) {
+ // If this is uitofp (fptosi F), the source needs an extra bit to avoid
+ // potential rounding of negative FP input values.
+ int SrcNumSigBits = F->getType()->getFPMantissaWidth();
+ if (!IsSigned && match(Src, m_FPToSI(m_Value())))
+ SrcNumSigBits++;
+
+ // [su]itofp (fpto[su]i F) --> exact if the source type has less or equal
+ // significant bits than the destination (and make sure neither type is
+ // weird -- ppc_fp128).
+ if (SrcNumSigBits > 0 && DestNumSigBits > 0 &&
+ SrcNumSigBits <= DestNumSigBits)
+ return true;
+ }
+
+ // TODO:
+ // Try harder to find if the source integer type has less significant bits.
+ // For example, compute number of sign bits or compute low bit mask.
+ return false;
+}
+
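For illustration, the "easy case" bound above in standalone C++ (hypothetical names): float has a 24-bit mantissa, so i16 sources convert exactly while i32 sources may round.

    #include <cstdint>

    float always_exact(int16_t x) { return (float)x; }  // 15 value bits <= 24: exact
    float may_round(int32_t x)    { return (float)x; }  // e.g. 16777217 -> 16777216.0f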
Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
- if (Instruction *I = commonCastTransforms(FPT))
- return I;
-
- // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
- // simplify this expression to avoid one or more of the trunc/extend
- // operations if we can do so without changing the numerical results.
- //
- // The exact manner in which the widths of the operands interact to limit
- // what we can and cannot do safely varies from operation to operation, and
- // is explained below in the various case statements.
- Type *Ty = FPT.getType();
- auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
- if (BO && BO->hasOneUse()) {
- Type *LHSMinType = getMinimumFPType(BO->getOperand(0));
- Type *RHSMinType = getMinimumFPType(BO->getOperand(1));
- unsigned OpWidth = BO->getType()->getFPMantissaWidth();
- unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
- unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
- unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
- unsigned DstWidth = Ty->getFPMantissaWidth();
- switch (BO->getOpcode()) {
- default: break;
- case Instruction::FAdd:
- case Instruction::FSub:
- // For addition and subtraction, the infinitely precise result can
- // essentially be arbitrarily wide; proving that double rounding
- // will not occur because the result of OpI is exact (as we will for
- // FMul, for example) is hopeless. However, we *can* nonetheless
- // frequently know that double rounding cannot occur (or that it is
- // innocuous) by taking advantage of the specific structure of
- // infinitely-precise results that admit double rounding.
- //
-      // Specifically, if OpWidth >= 2*DstWidth+1 and DstWidth is sufficient
- // to represent both sources, we can guarantee that the double
- // rounding is innocuous (See p50 of Figueroa's 2000 PhD thesis,
- // "A Rigorous Framework for Fully Supporting the IEEE Standard ..."
- // for proof of this fact).
- //
- // Note: Figueroa does not consider the case where DstFormat !=
- // SrcFormat. It's possible (likely even!) that this analysis
- // could be tightened for those cases, but they are rare (the main
- // case of interest here is (float)((double)float + float)).
- if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
- Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
- Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
- Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS);
- RI->copyFastMathFlags(BO);
- return RI;
- }
- break;
- case Instruction::FMul:
- // For multiplication, the infinitely precise result has at most
- // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient
- // that such a value can be exactly represented, then no double
- // rounding can possibly occur; we can safely perform the operation
- // in the destination format if it can represent both sources.
- if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
- Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
- Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
- return BinaryOperator::CreateFMulFMF(LHS, RHS, BO);
- }
- break;
- case Instruction::FDiv:
-      // For division, we again use the bound from Figueroa's
- // dissertation. I am entirely certain that this bound can be
- // tightened in the unbalanced operand case by an analysis based on
- // the diophantine rational approximation bound, but the well-known
- // condition used here is a good conservative first pass.
- // TODO: Tighten bound via rigorous analysis of the unbalanced case.
- if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
- Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
- Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
- return BinaryOperator::CreateFDivFMF(LHS, RHS, BO);
- }
- break;
- case Instruction::FRem: {
- // Remainder is straightforward. Remainder is always exact, so the
- // type of OpI doesn't enter into things at all. We simply evaluate
- // in whichever source type is larger, then convert to the
- // destination type.
- if (SrcWidth == OpWidth)
- break;
- Value *LHS, *RHS;
- if (LHSWidth == SrcWidth) {
- LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType);
- RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType);
- } else {
- LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType);
- RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType);
- }
-
- Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO);
- return CastInst::CreateFPCast(ExactResult, Ty);
- }
- }
- }
-
- // (fptrunc (fneg x)) -> (fneg (fptrunc x))
- Value *X;
- Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0));
- if (Op && Op->hasOneUse()) {
- // FIXME: The FMF should propagate from the fptrunc, not the source op.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (isa<FPMathOperator>(Op))
- Builder.setFastMathFlags(Op->getFastMathFlags());
-
- if (match(Op, m_FNeg(m_Value(X)))) {
- Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
-
- return UnaryOperator::CreateFNegFMF(InnerTrunc, Op);
- }
-
- // If we are truncating a select that has an extended operand, we can
- // narrow the other operand and do the select as a narrow op.
- Value *Cond, *X, *Y;
- if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) &&
- X->getType() == Ty) {
-      // fptrunc (select Cond, (fpext X), Y) --> select Cond, X, (fptrunc Y)
- Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
- Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op);
- return replaceInstUsesWith(FPT, Sel);
- }
- if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) &&
- X->getType() == Ty) {
-      // fptrunc (select Cond, Y, (fpext X)) --> select Cond, (fptrunc Y), X
- Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
- Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op);
- return replaceInstUsesWith(FPT, Sel);
- }
- }
-
- if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) {
- switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::ceil:
- case Intrinsic::fabs:
- case Intrinsic::floor:
- case Intrinsic::nearbyint:
- case Intrinsic::rint:
- case Intrinsic::round:
- case Intrinsic::roundeven:
- case Intrinsic::trunc: {
- Value *Src = II->getArgOperand(0);
- if (!Src->hasOneUse())
- break;
-
- // Except for fabs, this transformation requires the input of the unary FP
- // operation to be itself an fpext from the type to which we're
- // truncating.
- if (II->getIntrinsicID() != Intrinsic::fabs) {
- FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
- if (!FPExtSrc || FPExtSrc->getSrcTy() != Ty)
- break;
- }
-
- // Do unary FP operation on smaller type.
- // (fptrunc (fabs x)) -> (fabs (fptrunc x))
- Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty);
- Function *Overload = Intrinsic::getDeclaration(FPT.getModule(),
- II->getIntrinsicID(), Ty);
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
- CallInst *NewCI =
- CallInst::Create(Overload, {InnerTrunc}, OpBundles, II->getName());
- NewCI->copyFastMathFlags(II);
- return NewCI;
- }
- }
- }
-
- if (Instruction *I = shrinkInsertElt(FPT, Builder))
- return I;
-
- Value *Src = FPT.getOperand(0);
- if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
- auto *FPCast = cast<CastInst>(Src);
- if (isKnownExactCastIntToFP(*FPCast))
- return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
- }
-
- return nullptr;
-}
-
+ if (Instruction *I = commonCastTransforms(FPT))
+ return I;
+
+ // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
+ // simplify this expression to avoid one or more of the trunc/extend
+ // operations if we can do so without changing the numerical results.
+ //
+ // The exact manner in which the widths of the operands interact to limit
+ // what we can and cannot do safely varies from operation to operation, and
+ // is explained below in the various case statements.
+ Type *Ty = FPT.getType();
+ auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
+ if (BO && BO->hasOneUse()) {
+ Type *LHSMinType = getMinimumFPType(BO->getOperand(0));
+ Type *RHSMinType = getMinimumFPType(BO->getOperand(1));
+ unsigned OpWidth = BO->getType()->getFPMantissaWidth();
+ unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
+ unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
+ unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
+ unsigned DstWidth = Ty->getFPMantissaWidth();
+ switch (BO->getOpcode()) {
+ default: break;
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ // For addition and subtraction, the infinitely precise result can
+ // essentially be arbitrarily wide; proving that double rounding
+ // will not occur because the result of OpI is exact (as we will for
+ // FMul, for example) is hopeless. However, we *can* nonetheless
+ // frequently know that double rounding cannot occur (or that it is
+ // innocuous) by taking advantage of the specific structure of
+ // infinitely-precise results that admit double rounding.
+ //
+      // Specifically, if OpWidth >= 2*DstWidth+1 and DstWidth is sufficient
+ // to represent both sources, we can guarantee that the double
+ // rounding is innocuous (See p50 of Figueroa's 2000 PhD thesis,
+ // "A Rigorous Framework for Fully Supporting the IEEE Standard ..."
+ // for proof of this fact).
+ //
+ // Note: Figueroa does not consider the case where DstFormat !=
+ // SrcFormat. It's possible (likely even!) that this analysis
+ // could be tightened for those cases, but they are rare (the main
+ // case of interest here is (float)((double)float + float)).
+ if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
+ Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
+ Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS);
+ RI->copyFastMathFlags(BO);
+ return RI;
+ }
+ break;
+ case Instruction::FMul:
+ // For multiplication, the infinitely precise result has at most
+ // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient
+ // that such a value can be exactly represented, then no double
+ // rounding can possibly occur; we can safely perform the operation
+ // in the destination format if it can represent both sources.
+ if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
+ Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
+ return BinaryOperator::CreateFMulFMF(LHS, RHS, BO);
+ }
+ break;
+ case Instruction::FDiv:
+      // For division, we again use the bound from Figueroa's
+ // dissertation. I am entirely certain that this bound can be
+ // tightened in the unbalanced operand case by an analysis based on
+ // the diophantine rational approximation bound, but the well-known
+ // condition used here is a good conservative first pass.
+ // TODO: Tighten bound via rigorous analysis of the unbalanced case.
+ if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
+ Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
+ return BinaryOperator::CreateFDivFMF(LHS, RHS, BO);
+ }
+ break;
+ case Instruction::FRem: {
+ // Remainder is straightforward. Remainder is always exact, so the
+ // type of OpI doesn't enter into things at all. We simply evaluate
+ // in whichever source type is larger, then convert to the
+ // destination type.
+ if (SrcWidth == OpWidth)
+ break;
+ Value *LHS, *RHS;
+ if (LHSWidth == SrcWidth) {
+ LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType);
+ RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType);
+ } else {
+ LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType);
+ RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType);
+ }
+
+ Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO);
+ return CastInst::CreateFPCast(ExactResult, Ty);
+ }
+ }
+ }
+
+ // (fptrunc (fneg x)) -> (fneg (fptrunc x))
+ Value *X;
+ Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0));
+ if (Op && Op->hasOneUse()) {
+ // FIXME: The FMF should propagate from the fptrunc, not the source op.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ if (isa<FPMathOperator>(Op))
+ Builder.setFastMathFlags(Op->getFastMathFlags());
+
+ if (match(Op, m_FNeg(m_Value(X)))) {
+ Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
+
+ return UnaryOperator::CreateFNegFMF(InnerTrunc, Op);
+ }
+
+ // If we are truncating a select that has an extended operand, we can
+ // narrow the other operand and do the select as a narrow op.
+ Value *Cond, *X, *Y;
+ if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) &&
+ X->getType() == Ty) {
+      // fptrunc (select Cond, (fpext X), Y) --> select Cond, X, (fptrunc Y)
+ Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
+ Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op);
+ return replaceInstUsesWith(FPT, Sel);
+ }
+ if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) &&
+ X->getType() == Ty) {
+      // fptrunc (select Cond, Y, (fpext X)) --> select Cond, (fptrunc Y), X
+ Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
+ Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op);
+ return replaceInstUsesWith(FPT, Sel);
+ }
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::ceil:
+ case Intrinsic::fabs:
+ case Intrinsic::floor:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::trunc: {
+ Value *Src = II->getArgOperand(0);
+ if (!Src->hasOneUse())
+ break;
+
+ // Except for fabs, this transformation requires the input of the unary FP
+ // operation to be itself an fpext from the type to which we're
+ // truncating.
+ if (II->getIntrinsicID() != Intrinsic::fabs) {
+ FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
+ if (!FPExtSrc || FPExtSrc->getSrcTy() != Ty)
+ break;
+ }
+
+ // Do unary FP operation on smaller type.
+ // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+ Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty);
+ Function *Overload = Intrinsic::getDeclaration(FPT.getModule(),
+ II->getIntrinsicID(), Ty);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ CallInst *NewCI =
+ CallInst::Create(Overload, {InnerTrunc}, OpBundles, II->getName());
+ NewCI->copyFastMathFlags(II);
+ return NewCI;
+ }
+ }
+ }
+
+ if (Instruction *I = shrinkInsertElt(FPT, Builder))
+ return I;
+
+ Value *Src = FPT.getOperand(0);
+ if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
+ auto *FPCast = cast<CastInst>(Src);
+ if (isKnownExactCastIntToFP(*FPCast))
+ return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
+ }
+
+ return nullptr;
+}
+
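For illustration, the FAdd case above in standalone C++ (hypothetical name): OpWidth for double is 53, which satisfies 2*24+1 = 49, and float can represent both sources, so the double rounding is harmless and the whole computation narrows.

    float narrow_add(float a, float b) {
      return (float)((double)a + (double)b);   // folds to:  return a + b;
    }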
Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) {
- // If the source operand is a cast from integer to FP and known exact, then
- // cast the integer operand directly to the destination type.
- Type *Ty = FPExt.getType();
- Value *Src = FPExt.getOperand(0);
- if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
- auto *FPCast = cast<CastInst>(Src);
- if (isKnownExactCastIntToFP(*FPCast))
- return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
- }
-
- return commonCastTransforms(FPExt);
-}
-
-/// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
-/// This is safe if the intermediate type has enough bits in its mantissa to
-/// accurately represent all values of X. For example, this won't work with
-/// i64 -> float -> i64.
+ // If the source operand is a cast from integer to FP and known exact, then
+ // cast the integer operand directly to the destination type.
+ Type *Ty = FPExt.getType();
+ Value *Src = FPExt.getOperand(0);
+ if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
+ auto *FPCast = cast<CastInst>(Src);
+ if (isKnownExactCastIntToFP(*FPCast))
+ return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
+ }
+
+ return commonCastTransforms(FPExt);
+}
+
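For illustration (hypothetical name): when the inner int-to-FP cast is known exact, the fpext collapses and the integer converts straight to the wide type.

    #include <cstdint>

    double widen(int16_t x) {
      float f = (float)x;   // sitofp i16 -> float, exact (15 value bits <= 24)
      return (double)f;     // fpext; folds to a direct (double)x
    }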
+/// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
+/// This is safe if the intermediate type has enough bits in its mantissa to
+/// accurately represent all values of X. For example, this won't work with
+/// i64 -> float -> i64.
Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
- if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
- return nullptr;
-
- auto *OpI = cast<CastInst>(FI.getOperand(0));
- Value *X = OpI->getOperand(0);
- Type *XType = X->getType();
- Type *DestType = FI.getType();
- bool IsOutputSigned = isa<FPToSIInst>(FI);
-
- // Since we can assume the conversion won't overflow, our decision as to
- // whether the input will fit in the float should depend on the minimum
- // of the input range and output range.
-
- // This means this is also safe for a signed input and unsigned output, since
- // a negative input would lead to undefined behavior.
- if (!isKnownExactCastIntToFP(*OpI)) {
- // The first cast may not round exactly based on the source integer width
- // and FP width, but the overflow UB rules can still allow this to fold.
- // If the destination type is narrow, that means the intermediate FP value
- // must be large enough to hold the source value exactly.
-    // For example, (uint8_t)((float)(uint32_t)16777217) is undefined behavior.
- int OutputSize = (int)DestType->getScalarSizeInBits() - IsOutputSigned;
- if (OutputSize > OpI->getType()->getFPMantissaWidth())
- return nullptr;
- }
-
- if (DestType->getScalarSizeInBits() > XType->getScalarSizeInBits()) {
- bool IsInputSigned = isa<SIToFPInst>(OpI);
- if (IsInputSigned && IsOutputSigned)
- return new SExtInst(X, DestType);
- return new ZExtInst(X, DestType);
- }
- if (DestType->getScalarSizeInBits() < XType->getScalarSizeInBits())
- return new TruncInst(X, DestType);
-
- assert(XType == DestType && "Unexpected types for int to FP to int casts");
- return replaceInstUsesWith(FI, X);
-}
-
+ if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
+ return nullptr;
+
+ auto *OpI = cast<CastInst>(FI.getOperand(0));
+ Value *X = OpI->getOperand(0);
+ Type *XType = X->getType();
+ Type *DestType = FI.getType();
+ bool IsOutputSigned = isa<FPToSIInst>(FI);
+
+ // Since we can assume the conversion won't overflow, our decision as to
+ // whether the input will fit in the float should depend on the minimum
+ // of the input range and output range.
+
+ // This means this is also safe for a signed input and unsigned output, since
+ // a negative input would lead to undefined behavior.
+ if (!isKnownExactCastIntToFP(*OpI)) {
+ // The first cast may not round exactly based on the source integer width
+ // and FP width, but the overflow UB rules can still allow this to fold.
+ // If the destination type is narrow, that means the intermediate FP value
+ // must be large enough to hold the source value exactly.
+    // For example, (uint8_t)((float)(uint32_t)16777217) is undefined behavior.
+ int OutputSize = (int)DestType->getScalarSizeInBits() - IsOutputSigned;
+ if (OutputSize > OpI->getType()->getFPMantissaWidth())
+ return nullptr;
+ }
+
+ if (DestType->getScalarSizeInBits() > XType->getScalarSizeInBits()) {
+ bool IsInputSigned = isa<SIToFPInst>(OpI);
+ if (IsInputSigned && IsOutputSigned)
+ return new SExtInst(X, DestType);
+ return new ZExtInst(X, DestType);
+ }
+ if (DestType->getScalarSizeInBits() < XType->getScalarSizeInBits())
+ return new TruncInst(X, DestType);
+
+ assert(XType == DestType && "Unexpected types for int to FP to int casts");
+ return replaceInstUsesWith(FI, X);
+}
+
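For illustration, the exact-intermediate case in standalone C++ (hypothetical name): double holds every int16_t exactly, so the int -> FP -> int round trip reduces to a plain sign extension.

    #include <cstdint>

    int32_t roundtrip(int16_t x) {
      double d = (double)x;   // sitofp i16 -> double, always exact
      return (int32_t)d;      // fptosi; folds to:  return (int32_t)x;  (a sext)
    }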
Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) {
- if (Instruction *I = foldItoFPtoI(FI))
- return I;
-
- return commonCastTransforms(FI);
-}
-
+ if (Instruction *I = foldItoFPtoI(FI))
+ return I;
+
+ return commonCastTransforms(FI);
+}
+
Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) {
- if (Instruction *I = foldItoFPtoI(FI))
- return I;
-
- return commonCastTransforms(FI);
-}
-
+ if (Instruction *I = foldItoFPtoI(FI))
+ return I;
+
+ return commonCastTransforms(FI);
+}
+
Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) {
- return commonCastTransforms(CI);
-}
-
+ return commonCastTransforms(CI);
+}
+
Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) {
- return commonCastTransforms(CI);
-}
-
+ return commonCastTransforms(CI);
+}
+
Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
- // If the source integer type is not the intptr_t type for this target, do a
- // trunc or zext to the intptr_t type, then inttoptr of it. This allows the
- // cast to be exposed to other transforms.
- unsigned AS = CI.getAddressSpace();
- if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
- DL.getPointerSizeInBits(AS)) {
- Type *Ty = DL.getIntPtrType(CI.getContext(), AS);
- // Handle vectors of pointers.
- if (auto *CIVTy = dyn_cast<VectorType>(CI.getType()))
- Ty = VectorType::get(Ty, CIVTy->getElementCount());
-
- Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty);
- return new IntToPtrInst(P, CI.getType());
- }
-
- if (Instruction *I = commonCastTransforms(CI))
- return I;
-
- return nullptr;
-}
-
-/// Implement the transforms for cast of pointer (bitcast/ptrtoint)
+ // If the source integer type is not the intptr_t type for this target, do a
+ // trunc or zext to the intptr_t type, then inttoptr of it. This allows the
+ // cast to be exposed to other transforms.
+ unsigned AS = CI.getAddressSpace();
+ if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+ DL.getPointerSizeInBits(AS)) {
+ Type *Ty = DL.getIntPtrType(CI.getContext(), AS);
+ // Handle vectors of pointers.
+ if (auto *CIVTy = dyn_cast<VectorType>(CI.getType()))
+ Ty = VectorType::get(Ty, CIVTy->getElementCount());
+
+ Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty);
+ return new IntToPtrInst(P, CI.getType());
+ }
+
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ return nullptr;
+}
+
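A hypothetical example of the intptr_t widening performed just above, assuming a target with 64-bit pointers:

  ; sketch only, assumes 64-bit pointers in the datalayout
  define i8* @int2ptr(i32 %x) {
    %p = inttoptr i32 %x to i8*
    ret i8* %p
    ; becomes: %w = zext i32 %x to i64
    ;          %p = inttoptr i64 %w to i8*
  }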
+/// Implement the transforms for cast of pointer (bitcast/ptrtoint)
Instruction *InstCombinerImpl::commonPointerCastTransforms(CastInst &CI) {
- Value *Src = CI.getOperand(0);
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
- // If casting the result of a getelementptr instruction with no offset, turn
- // this into a cast of the original pointer!
- if (GEP->hasAllZeroIndices() &&
-        // If CI is an addrspacecast and GEP changes the pointer type, merging
- // GEP into CI would undo canonicalizing addrspacecast with different
- // pointer types, causing infinite loops.
- (!isa<AddrSpaceCastInst>(CI) ||
- GEP->getType() == GEP->getPointerOperandType())) {
- // Changing the cast operand is usually not a good idea but it is safe
- // here because the pointer operand is being replaced with another
- // pointer operand so the opcode doesn't need to change.
- return replaceOperand(CI, 0, GEP->getOperand(0));
- }
- }
-
- return commonCastTransforms(CI);
-}
-
+ Value *Src = CI.getOperand(0);
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
+ // If casting the result of a getelementptr instruction with no offset, turn
+ // this into a cast of the original pointer!
+ if (GEP->hasAllZeroIndices() &&
+        // If CI is an addrspacecast and GEP changes the pointer type, merging
+ // GEP into CI would undo canonicalizing addrspacecast with different
+ // pointer types, causing infinite loops.
+ (!isa<AddrSpaceCastInst>(CI) ||
+ GEP->getType() == GEP->getPointerOperandType())) {
+ // Changing the cast operand is usually not a good idea but it is safe
+ // here because the pointer operand is being replaced with another
+ // pointer operand so the opcode doesn't need to change.
+ return replaceOperand(CI, 0, GEP->getOperand(0));
+ }
+ }
+
+ return commonCastTransforms(CI);
+}
+
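A small illustration (hypothetical IR, assuming 64-bit pointers) of looking through an all-zero-index GEP, here under a ptrtoint:

  ; sketch only
  define i64 @p2i_gep([4 x i32]* %p) {
    %g = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 0
    %i = ptrtoint i32* %g to i64
    ret i64 %i
    ; becomes: %i = ptrtoint [4 x i32]* %p to i64
  }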
Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
- // If the destination integer type is not the intptr_t type for this target,
- // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
- // to be exposed to other transforms.
+ // If the destination integer type is not the intptr_t type for this target,
+ // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
+ // to be exposed to other transforms.
Value *SrcOp = CI.getPointerOperand();
- Type *Ty = CI.getType();
- unsigned AS = CI.getPointerAddressSpace();
+ Type *Ty = CI.getType();
+ unsigned AS = CI.getPointerAddressSpace();
unsigned TySize = Ty->getScalarSizeInBits();
unsigned PtrSize = DL.getPointerSizeInBits(AS);
if (TySize != PtrSize) {
@@ -1979,11 +1979,11 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
// Handle vectors of pointers.
if (auto *VecTy = dyn_cast<VectorType>(Ty))
IntPtrTy = VectorType::get(IntPtrTy, VecTy->getElementCount());
-
+
Value *P = Builder.CreatePtrToInt(SrcOp, IntPtrTy);
return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
}
-
+
Value *Vec, *Scalar, *Index;
if (match(SrcOp, m_OneUse(m_InsertElt(m_IntToPtr(m_Value(Vec)),
m_Value(Scalar), m_Value(Index)))) &&
@@ -1993,745 +1993,745 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
    // p2i (ins (i2p Vec), Scalar, Index) --> ins Vec, (p2i Scalar), Index
Value *NewCast = Builder.CreatePtrToInt(Scalar, Ty->getScalarType());
return InsertElementInst::Create(Vec, NewCast, Index);
- }
-
+ }
+
return commonPointerCastTransforms(CI);
-}
-
-/// This input value (which is known to have vector type) is being zero extended
-/// or truncated to the specified vector type. Since the zext/trunc is done
-/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, so
-/// endianness will impact which end of the vector is extended or
-/// truncated.
-///
-/// A vector is always stored with index 0 at the lowest address, which
-/// corresponds to the most significant bits for a big endian stored integer and
-/// the least significant bits for little endian. A trunc/zext of an integer
-/// impacts the big end of the integer. Thus, we need to add/remove elements at
-/// the front of the vector for big endian targets, and the back of the vector
-/// for little endian targets.
-///
-/// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
-///
-/// The source and destination vector types may have different element types.
+}
+
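A hypothetical example of the width adjustment in visitPtrToInt above, again assuming 64-bit pointers:

  ; sketch only, assumes 64-bit pointers in the datalayout
  define i32 @p2i_narrow(i8* %p) {
    %i = ptrtoint i8* %p to i32
    ret i32 %i
    ; becomes: %w = ptrtoint i8* %p to i64
    ;          %i = trunc i64 %w to i32
  }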
+/// This input value (which is known to have vector type) is being zero extended
+/// or truncated to the specified vector type. Since the zext/trunc is done
+/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, so
+/// endianness will impact which end of the vector is extended or
+/// truncated.
+///
+/// A vector is always stored with index 0 at the lowest address, which
+/// corresponds to the most significant bits for a big endian stored integer and
+/// the least significant bits for little endian. A trunc/zext of an integer
+/// impacts the big end of the integer. Thus, we need to add/remove elements at
+/// the front of the vector for big endian targets, and the back of the vector
+/// for little endian targets.
+///
+/// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
+///
+/// The source and destination vector types may have different element types.
static Instruction *
optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy,
InstCombinerImpl &IC) {
- // We can only do this optimization if the output is a multiple of the input
- // element size, or the input is a multiple of the output element size.
- // Convert the input type to have the same element type as the output.
- VectorType *SrcTy = cast<VectorType>(InVal->getType());
-
- if (SrcTy->getElementType() != DestTy->getElementType()) {
- // The input types don't need to be identical, but for now they must be the
- // same size. There is no specific reason we couldn't handle things like
- // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
- // there yet.
- if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
- DestTy->getElementType()->getPrimitiveSizeInBits())
- return nullptr;
-
- SrcTy =
+ // We can only do this optimization if the output is a multiple of the input
+ // element size, or the input is a multiple of the output element size.
+ // Convert the input type to have the same element type as the output.
+ VectorType *SrcTy = cast<VectorType>(InVal->getType());
+
+ if (SrcTy->getElementType() != DestTy->getElementType()) {
+ // The input types don't need to be identical, but for now they must be the
+ // same size. There is no specific reason we couldn't handle things like
+ // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
+ // there yet.
+ if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
+ DestTy->getElementType()->getPrimitiveSizeInBits())
+ return nullptr;
+
+ SrcTy =
FixedVectorType::get(DestTy->getElementType(),
cast<FixedVectorType>(SrcTy)->getNumElements());
- InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
- }
-
- bool IsBigEndian = IC.getDataLayout().isBigEndian();
+ InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
+ }
+
+ bool IsBigEndian = IC.getDataLayout().isBigEndian();
unsigned SrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
unsigned DestElts = cast<FixedVectorType>(DestTy)->getNumElements();
-
- assert(SrcElts != DestElts && "Element counts should be different.");
-
- // Now that the element types match, get the shuffle mask and RHS of the
- // shuffle to use, which depends on whether we're increasing or decreasing the
- // size of the input.
- SmallVector<int, 16> ShuffleMaskStorage;
- ArrayRef<int> ShuffleMask;
- Value *V2;
-
-  // Produce an identity shuffle mask for the src vector.
- ShuffleMaskStorage.resize(SrcElts);
- std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0);
-
- if (SrcElts > DestElts) {
- // If we're shrinking the number of elements (rewriting an integer
- // truncate), just shuffle in the elements corresponding to the least
- // significant bits from the input and use undef as the second shuffle
- // input.
- V2 = UndefValue::get(SrcTy);
- // Make sure the shuffle mask selects the "least significant bits" by
- // keeping elements from back of the src vector for big endian, and from the
- // front for little endian.
- ShuffleMask = ShuffleMaskStorage;
- if (IsBigEndian)
- ShuffleMask = ShuffleMask.take_back(DestElts);
- else
- ShuffleMask = ShuffleMask.take_front(DestElts);
- } else {
- // If we're increasing the number of elements (rewriting an integer zext),
- // shuffle in all of the elements from InVal. Fill the rest of the result
- // elements with zeros from a constant zero.
- V2 = Constant::getNullValue(SrcTy);
- // Use first elt from V2 when indicating zero in the shuffle mask.
- uint32_t NullElt = SrcElts;
- // Extend with null values in the "most significant bits" by adding elements
- // in front of the src vector for big endian, and at the back for little
- // endian.
- unsigned DeltaElts = DestElts - SrcElts;
- if (IsBigEndian)
- ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt);
- else
- ShuffleMaskStorage.append(DeltaElts, NullElt);
- ShuffleMask = ShuffleMaskStorage;
- }
-
- return new ShuffleVectorInst(InVal, V2, ShuffleMask);
-}
-
-static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
- return Value % Ty->getPrimitiveSizeInBits() == 0;
-}
-
-static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
- return Value / Ty->getPrimitiveSizeInBits();
-}
-
-/// V is a value which is inserted into a vector of VecEltTy.
-/// Look through the value to see if we can decompose it into
-/// insertions into the vector. See the example in the comment for
-/// OptimizeIntegerToVectorInsertions for the pattern this handles.
-/// The type of V is always a non-zero multiple of VecEltTy's size.
-/// Shift is the number of bits between the lsb of V and the lsb of
-/// the vector.
-///
-/// This returns false if the pattern can't be matched or true if it can,
-/// filling in Elements with the elements found here.
-static bool collectInsertionElements(Value *V, unsigned Shift,
- SmallVectorImpl<Value *> &Elements,
- Type *VecEltTy, bool isBigEndian) {
- assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
- "Shift should be a multiple of the element type size");
-
- // Undef values never contribute useful bits to the result.
- if (isa<UndefValue>(V)) return true;
-
-  // If we got down to a value of the right type, we win; try inserting it
-  // into the right element.
- if (V->getType() == VecEltTy) {
- // Inserting null doesn't actually insert any elements.
- if (Constant *C = dyn_cast<Constant>(V))
- if (C->isNullValue())
- return true;
-
- unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
- if (isBigEndian)
- ElementIndex = Elements.size() - ElementIndex - 1;
-
- // Fail if multiple elements are inserted into this slot.
- if (Elements[ElementIndex])
- return false;
-
- Elements[ElementIndex] = V;
- return true;
- }
-
- if (Constant *C = dyn_cast<Constant>(V)) {
- // Figure out the # elements this provides, and bitcast it or slice it up
- // as required.
- unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
- VecEltTy);
- // If the constant is the size of a vector element, we just need to bitcast
- // it to the right type so it gets properly inserted.
- if (NumElts == 1)
- return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
- Shift, Elements, VecEltTy, isBigEndian);
-
- // Okay, this is a constant that covers multiple elements. Slice it up into
- // pieces and insert each element-sized piece into the vector.
- if (!isa<IntegerType>(C->getType()))
- C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
- C->getType()->getPrimitiveSizeInBits()));
- unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
- Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
-
- for (unsigned i = 0; i != NumElts; ++i) {
- unsigned ShiftI = Shift+i*ElementSize;
- Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
- ShiftI));
- Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
- if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy,
- isBigEndian))
- return false;
- }
- return true;
- }
-
- if (!V->hasOneUse()) return false;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
- switch (I->getOpcode()) {
- default: return false; // Unhandled case.
- case Instruction::BitCast:
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian);
- case Instruction::ZExt:
- if (!isMultipleOfTypeSize(
- I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
- VecEltTy))
- return false;
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian);
- case Instruction::Or:
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian) &&
- collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy,
- isBigEndian);
- case Instruction::Shl: {
- // Must be shifting by a constant that is a multiple of the element size.
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
- if (!CI) return false;
- Shift += CI->getZExtValue();
- if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian);
- }
-
- }
-}
-
-
-/// If the input is an 'or' instruction, we may be doing shifts and ors to
-/// assemble the elements of the vector manually.
-/// Try to rip the code out and replace it with insertelements. This is to
-/// optimize code like this:
-///
-/// %tmp37 = bitcast float %inc to i32
-/// %tmp38 = zext i32 %tmp37 to i64
-/// %tmp31 = bitcast float %inc5 to i32
-/// %tmp32 = zext i32 %tmp31 to i64
-/// %tmp33 = shl i64 %tmp32, 32
-/// %ins35 = or i64 %tmp33, %tmp38
-/// %tmp43 = bitcast i64 %ins35 to <2 x float>
-///
-/// Into two insertelements that do "buildvector{%inc, %inc5}".
-static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
+
+ assert(SrcElts != DestElts && "Element counts should be different.");
+
+ // Now that the element types match, get the shuffle mask and RHS of the
+ // shuffle to use, which depends on whether we're increasing or decreasing the
+ // size of the input.
+ SmallVector<int, 16> ShuffleMaskStorage;
+ ArrayRef<int> ShuffleMask;
+ Value *V2;
+
+  // Produce an identity shuffle mask for the src vector.
+ ShuffleMaskStorage.resize(SrcElts);
+ std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0);
+
+ if (SrcElts > DestElts) {
+ // If we're shrinking the number of elements (rewriting an integer
+ // truncate), just shuffle in the elements corresponding to the least
+ // significant bits from the input and use undef as the second shuffle
+ // input.
+ V2 = UndefValue::get(SrcTy);
+ // Make sure the shuffle mask selects the "least significant bits" by
+ // keeping elements from back of the src vector for big endian, and from the
+ // front for little endian.
+ ShuffleMask = ShuffleMaskStorage;
+ if (IsBigEndian)
+ ShuffleMask = ShuffleMask.take_back(DestElts);
+ else
+ ShuffleMask = ShuffleMask.take_front(DestElts);
+ } else {
+ // If we're increasing the number of elements (rewriting an integer zext),
+ // shuffle in all of the elements from InVal. Fill the rest of the result
+ // elements with zeros from a constant zero.
+ V2 = Constant::getNullValue(SrcTy);
+ // Use first elt from V2 when indicating zero in the shuffle mask.
+ uint32_t NullElt = SrcElts;
+ // Extend with null values in the "most significant bits" by adding elements
+ // in front of the src vector for big endian, and at the back for little
+ // endian.
+ unsigned DeltaElts = DestElts - SrcElts;
+ if (IsBigEndian)
+ ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt);
+ else
+ ShuffleMaskStorage.append(DeltaElts, NullElt);
+ ShuffleMask = ShuffleMaskStorage;
+ }
+
+ return new ShuffleVectorInst(InVal, V2, ShuffleMask);
+}
+
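An illustrative sketch of the resize-as-shuffle rewrite, assuming a little-endian target (a big-endian target would keep the high elements instead):

  ; sketch only, little-endian layout assumed
  define <2 x i32> @resize(<4 x i32> %v) {
    %b = bitcast <4 x i32> %v to i128
    %t = trunc i128 %b to i64
    %r = bitcast i64 %t to <2 x i32>
    ret <2 x i32> %r
    ; becomes: %r = shufflevector <4 x i32> %v, <4 x i32> undef,
    ;                             <2 x i32> <i32 0, i32 1>
  }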
+static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
+ return Value % Ty->getPrimitiveSizeInBits() == 0;
+}
+
+static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
+ return Value / Ty->getPrimitiveSizeInBits();
+}
+
+/// V is a value which is inserted into a vector of VecEltTy.
+/// Look through the value to see if we can decompose it into
+/// insertions into the vector. See the example in the comment for
+/// OptimizeIntegerToVectorInsertions for the pattern this handles.
+/// The type of V is always a non-zero multiple of VecEltTy's size.
+/// Shift is the number of bits between the lsb of V and the lsb of
+/// the vector.
+///
+/// This returns false if the pattern can't be matched or true if it can,
+/// filling in Elements with the elements found here.
+static bool collectInsertionElements(Value *V, unsigned Shift,
+ SmallVectorImpl<Value *> &Elements,
+ Type *VecEltTy, bool isBigEndian) {
+ assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
+ "Shift should be a multiple of the element type size");
+
+ // Undef values never contribute useful bits to the result.
+ if (isa<UndefValue>(V)) return true;
+
+  // If we got down to a value of the right type, we win; try inserting it
+  // into the right element.
+ if (V->getType() == VecEltTy) {
+ // Inserting null doesn't actually insert any elements.
+ if (Constant *C = dyn_cast<Constant>(V))
+ if (C->isNullValue())
+ return true;
+
+ unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
+ if (isBigEndian)
+ ElementIndex = Elements.size() - ElementIndex - 1;
+
+ // Fail if multiple elements are inserted into this slot.
+ if (Elements[ElementIndex])
+ return false;
+
+ Elements[ElementIndex] = V;
+ return true;
+ }
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ // Figure out the # elements this provides, and bitcast it or slice it up
+ // as required.
+ unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
+ VecEltTy);
+ // If the constant is the size of a vector element, we just need to bitcast
+ // it to the right type so it gets properly inserted.
+ if (NumElts == 1)
+ return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
+ Shift, Elements, VecEltTy, isBigEndian);
+
+ // Okay, this is a constant that covers multiple elements. Slice it up into
+ // pieces and insert each element-sized piece into the vector.
+ if (!isa<IntegerType>(C->getType()))
+ C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
+ C->getType()->getPrimitiveSizeInBits()));
+ unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
+ Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ unsigned ShiftI = Shift+i*ElementSize;
+ Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
+ ShiftI));
+ Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
+ if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy,
+ isBigEndian))
+ return false;
+ }
+ return true;
+ }
+
+ if (!V->hasOneUse()) return false;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+ switch (I->getOpcode()) {
+ default: return false; // Unhandled case.
+ case Instruction::BitCast:
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::ZExt:
+ if (!isMultipleOfTypeSize(
+ I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
+ VecEltTy))
+ return false;
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::Or:
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian) &&
+ collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::Shl: {
+ // Must be shifting by a constant that is a multiple of the element size.
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!CI) return false;
+ Shift += CI->getZExtValue();
+ if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ }
+
+ }
+}
+
+
+/// If the input is an 'or' instruction, we may be doing shifts and ors to
+/// assemble the elements of the vector manually.
+/// Try to rip the code out and replace it with insertelements. This is to
+/// optimize code like this:
+///
+/// %tmp37 = bitcast float %inc to i32
+/// %tmp38 = zext i32 %tmp37 to i64
+/// %tmp31 = bitcast float %inc5 to i32
+/// %tmp32 = zext i32 %tmp31 to i64
+/// %tmp33 = shl i64 %tmp32, 32
+/// %ins35 = or i64 %tmp33, %tmp38
+/// %tmp43 = bitcast i64 %ins35 to <2 x float>
+///
+/// Into two insertelements that do "buildvector{%inc, %inc5}".
+static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
InstCombinerImpl &IC) {
auto *DestVecTy = cast<FixedVectorType>(CI.getType());
- Value *IntInput = CI.getOperand(0);
-
- SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
- if (!collectInsertionElements(IntInput, 0, Elements,
- DestVecTy->getElementType(),
- IC.getDataLayout().isBigEndian()))
- return nullptr;
-
-  // If we succeeded, we know that all of the elements are specified by Elements
- // or are zero if Elements has a null entry. Recast this as a set of
- // insertions.
- Value *Result = Constant::getNullValue(CI.getType());
- for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
- if (!Elements[i]) continue; // Unset element.
-
- Result = IC.Builder.CreateInsertElement(Result, Elements[i],
- IC.Builder.getInt32(i));
- }
-
- return Result;
-}
-
-/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the
-/// vector followed by extract element. The backend tends to handle bitcasts of
-/// vectors better than bitcasts of scalars because vector registers are
-/// usually not type-specific like scalar integer or scalar floating-point.
-static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
+ Value *IntInput = CI.getOperand(0);
+
+ SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
+ if (!collectInsertionElements(IntInput, 0, Elements,
+ DestVecTy->getElementType(),
+ IC.getDataLayout().isBigEndian()))
+ return nullptr;
+
+  // If we succeeded, we know that all of the elements are specified by Elements
+ // or are zero if Elements has a null entry. Recast this as a set of
+ // insertions.
+ Value *Result = Constant::getNullValue(CI.getType());
+ for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+ if (!Elements[i]) continue; // Unset element.
+
+ Result = IC.Builder.CreateInsertElement(Result, Elements[i],
+ IC.Builder.getInt32(i));
+ }
+
+ return Result;
+}
+
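For the i64-assembled-from-two-floats pattern shown in the function's doc comment above, the end result on a little-endian target would look roughly like this (sketch only, using the comment's %inc and %inc5 names):

  ; zeroinitializer seeds the result; unset slots stay zero
  %r0 = insertelement <2 x float> zeroinitializer, float %inc, i32 0
  %r  = insertelement <2 x float> %r0, float %inc5, i32 1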
+/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the
+/// vector followed by extract element. The backend tends to handle bitcasts of
+/// vectors better than bitcasts of scalars because vector registers are
+/// usually not type-specific like scalar integer or scalar floating-point.
+static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
InstCombinerImpl &IC) {
- // TODO: Create and use a pattern matcher for ExtractElementInst.
- auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
- if (!ExtElt || !ExtElt->hasOneUse())
- return nullptr;
-
- // The bitcast must be to a vectorizable type, otherwise we can't make a new
- // type to extract from.
- Type *DestType = BitCast.getType();
- if (!VectorType::isValidElementType(DestType))
- return nullptr;
-
+ // TODO: Create and use a pattern matcher for ExtractElementInst.
+ auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
+ if (!ExtElt || !ExtElt->hasOneUse())
+ return nullptr;
+
+ // The bitcast must be to a vectorizable type, otherwise we can't make a new
+ // type to extract from.
+ Type *DestType = BitCast.getType();
+ if (!VectorType::isValidElementType(DestType))
+ return nullptr;
+
auto *NewVecType = VectorType::get(DestType, ExtElt->getVectorOperandType());
- auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
- NewVecType, "bc");
- return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
-}
-
-/// Change the type of a bitwise logic operation if we can eliminate a bitcast.
-static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
- InstCombiner::BuilderTy &Builder) {
- Type *DestTy = BitCast.getType();
- BinaryOperator *BO;
- if (!DestTy->isIntOrIntVectorTy() ||
- !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
- !BO->isBitwiseLogicOp())
- return nullptr;
-
- // FIXME: This transform is restricted to vector types to avoid backend
- // problems caused by creating potentially illegal operations. If a fix-up is
- // added to handle that situation, we can remove this check.
- if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
- return nullptr;
-
- Value *X;
- if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
- X->getType() == DestTy && !isa<Constant>(X)) {
- // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
- Value *CastedOp1 = Builder.CreateBitCast(BO->getOperand(1), DestTy);
- return BinaryOperator::Create(BO->getOpcode(), X, CastedOp1);
- }
-
- if (match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(X)))) &&
- X->getType() == DestTy && !isa<Constant>(X)) {
- // bitcast(logic(Y, bitcast(X))) --> logic'(bitcast(Y), X)
- Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
- return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);
- }
-
- // Canonicalize vector bitcasts to come before vector bitwise logic with a
- // constant. This eases recognition of special constants for later ops.
- // Example:
- // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
- Constant *C;
- if (match(BO->getOperand(1), m_Constant(C))) {
- // bitcast (logic X, C) --> logic (bitcast X, C')
- Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
- Value *CastedC = Builder.CreateBitCast(C, DestTy);
- return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC);
- }
-
- return nullptr;
-}
-
-/// Change the type of a select if we can eliminate a bitcast.
-static Instruction *foldBitCastSelect(BitCastInst &BitCast,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond, *TVal, *FVal;
- if (!match(BitCast.getOperand(0),
- m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
- return nullptr;
-
- // A vector select must maintain the same number of elements in its operands.
- Type *CondTy = Cond->getType();
- Type *DestTy = BitCast.getType();
+ auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
+ NewVecType, "bc");
+ return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
+}
+
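A hypothetical before/after for the canonicalization above:

  ; sketch only
  define float @bc_extelt(<4 x i32> %v) {
    %e = extractelement <4 x i32> %v, i32 1
    %f = bitcast i32 %e to float
    ret float %f
    ; becomes: %bc = bitcast <4 x i32> %v to <4 x float>
    ;          %f  = extractelement <4 x float> %bc, i32 1
  }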
+/// Change the type of a bitwise logic operation if we can eliminate a bitcast.
+static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Type *DestTy = BitCast.getType();
+ BinaryOperator *BO;
+ if (!DestTy->isIntOrIntVectorTy() ||
+ !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
+ !BO->isBitwiseLogicOp())
+ return nullptr;
+
+ // FIXME: This transform is restricted to vector types to avoid backend
+ // problems caused by creating potentially illegal operations. If a fix-up is
+ // added to handle that situation, we can remove this check.
+ if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
+ return nullptr;
+
+ Value *X;
+ if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+ Value *CastedOp1 = Builder.CreateBitCast(BO->getOperand(1), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), X, CastedOp1);
+ }
+
+ if (match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(Y, bitcast(X))) --> logic'(bitcast(Y), X)
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);
+ }
+
+ // Canonicalize vector bitcasts to come before vector bitwise logic with a
+ // constant. This eases recognition of special constants for later ops.
+ // Example:
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ Constant *C;
+ if (match(BO->getOperand(1), m_Constant(C))) {
+ // bitcast (logic X, C) --> logic (bitcast X, C')
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ Value *CastedC = Builder.CreateBitCast(C, DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC);
+ }
+
+ return nullptr;
+}
+
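A hypothetical instance of the bitcast-through-logic rewrite (vector types only, per the FIXME above):

  ; sketch only
  define <2 x i64> @bc_logic(<2 x i64> %x, <4 x i32> %y) {
    %bx = bitcast <2 x i64> %x to <4 x i32>
    %a  = and <4 x i32> %bx, %y
    %r  = bitcast <4 x i32> %a to <2 x i64>
    ret <2 x i64> %r
    ; becomes: %by = bitcast <4 x i32> %y to <2 x i64>
    ;          %r  = and <2 x i64> %x, %by
  }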
+/// Change the type of a select if we can eliminate a bitcast.
+static Instruction *foldBitCastSelect(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond, *TVal, *FVal;
+ if (!match(BitCast.getOperand(0),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return nullptr;
+
+ // A vector select must maintain the same number of elements in its operands.
+ Type *CondTy = Cond->getType();
+ Type *DestTy = BitCast.getType();
if (auto *CondVTy = dyn_cast<VectorType>(CondTy))
if (!DestTy->isVectorTy() ||
CondVTy->getElementCount() !=
cast<VectorType>(DestTy)->getElementCount())
- return nullptr;
-
- // FIXME: This transform is restricted from changing the select between
- // scalars and vectors to avoid backend problems caused by creating
- // potentially illegal operations. If a fix-up is added to handle that
- // situation, we can remove this check.
- if (DestTy->isVectorTy() != TVal->getType()->isVectorTy())
- return nullptr;
-
- auto *Sel = cast<Instruction>(BitCast.getOperand(0));
- Value *X;
- if (match(TVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
- !isa<Constant>(X)) {
- // bitcast(select(Cond, bitcast(X), Y)) --> select'(Cond, X, bitcast(Y))
- Value *CastedVal = Builder.CreateBitCast(FVal, DestTy);
- return SelectInst::Create(Cond, X, CastedVal, "", nullptr, Sel);
- }
-
- if (match(FVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
- !isa<Constant>(X)) {
- // bitcast(select(Cond, Y, bitcast(X))) --> select'(Cond, bitcast(Y), X)
- Value *CastedVal = Builder.CreateBitCast(TVal, DestTy);
- return SelectInst::Create(Cond, CastedVal, X, "", nullptr, Sel);
- }
-
- return nullptr;
-}
-
-/// Check if all users of CI are StoreInsts.
-static bool hasStoreUsersOnly(CastInst &CI) {
- for (User *U : CI.users()) {
- if (!isa<StoreInst>(U))
- return false;
- }
- return true;
-}
-
-/// This function handles the following case
-///
-/// A -> B cast
-/// PHI
-/// B -> A cast
-///
-/// All the related PHI nodes can be replaced by new PHI nodes with type A.
-/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
+ return nullptr;
+
+ // FIXME: This transform is restricted from changing the select between
+ // scalars and vectors to avoid backend problems caused by creating
+ // potentially illegal operations. If a fix-up is added to handle that
+ // situation, we can remove this check.
+ if (DestTy->isVectorTy() != TVal->getType()->isVectorTy())
+ return nullptr;
+
+ auto *Sel = cast<Instruction>(BitCast.getOperand(0));
+ Value *X;
+ if (match(TVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, bitcast(X), Y)) --> select'(Cond, X, bitcast(Y))
+ Value *CastedVal = Builder.CreateBitCast(FVal, DestTy);
+ return SelectInst::Create(Cond, X, CastedVal, "", nullptr, Sel);
+ }
+
+ if (match(FVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, Y, bitcast(X))) --> select'(Cond, bitcast(Y), X)
+ Value *CastedVal = Builder.CreateBitCast(TVal, DestTy);
+ return SelectInst::Create(Cond, CastedVal, X, "", nullptr, Sel);
+ }
+
+ return nullptr;
+}
+
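A hypothetical instance of the select retyping above:

  ; sketch only
  define <2 x i64> @bc_select(i1 %c, <2 x i64> %x, <4 x i32> %y) {
    %bx = bitcast <2 x i64> %x to <4 x i32>
    %s  = select i1 %c, <4 x i32> %bx, <4 x i32> %y
    %r  = bitcast <4 x i32> %s to <2 x i64>
    ret <2 x i64> %r
    ; becomes: %by = bitcast <4 x i32> %y to <2 x i64>
    ;          %r  = select i1 %c, <2 x i64> %x, <2 x i64> %by
  }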
+/// Check if all users of CI are StoreInsts.
+static bool hasStoreUsersOnly(CastInst &CI) {
+ for (User *U : CI.users()) {
+ if (!isa<StoreInst>(U))
+ return false;
+ }
+ return true;
+}
+
+/// This function handles the following case
+///
+/// A -> B cast
+/// PHI
+/// B -> A cast
+///
+/// All the related PHI nodes can be replaced by new PHI nodes with type A.
+/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
PHINode *PN) {
- // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
- if (hasStoreUsersOnly(CI))
- return nullptr;
-
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType(); // Type B
- Type *DestTy = CI.getType(); // Type A
-
- SmallVector<PHINode *, 4> PhiWorklist;
- SmallSetVector<PHINode *, 4> OldPhiNodes;
-
- // Find all of the A->B casts and PHI nodes.
- // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
- // OldPhiNodes is used to track all known PHI nodes, before adding a new
- // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
- PhiWorklist.push_back(PN);
- OldPhiNodes.insert(PN);
- while (!PhiWorklist.empty()) {
- auto *OldPN = PhiWorklist.pop_back_val();
- for (Value *IncValue : OldPN->incoming_values()) {
- if (isa<Constant>(IncValue))
- continue;
-
- if (auto *LI = dyn_cast<LoadInst>(IncValue)) {
-        // If there is a sequence of one or more load instructions, where each
-        // loaded value is used as the address of a later load, a bitcast is
-        // necessary to change the value type, so don't optimize it. For
-        // simplicity we give up if the load address comes from another load.
- Value *Addr = LI->getOperand(0);
- if (Addr == &CI || isa<LoadInst>(Addr))
- return nullptr;
- if (LI->hasOneUse() && LI->isSimple())
- continue;
- // If a LoadInst has more than one use, changing the type of loaded
- // value may create another bitcast.
- return nullptr;
- }
-
- if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
- if (OldPhiNodes.insert(PNode))
- PhiWorklist.push_back(PNode);
- continue;
- }
-
- auto *BCI = dyn_cast<BitCastInst>(IncValue);
- // We can't handle other instructions.
- if (!BCI)
- return nullptr;
-
-      // Verify it's an A->B cast.
- Type *TyA = BCI->getOperand(0)->getType();
- Type *TyB = BCI->getType();
- if (TyA != DestTy || TyB != SrcTy)
- return nullptr;
- }
- }
-
- // Check that each user of each old PHI node is something that we can
- // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
- for (auto *OldPN : OldPhiNodes) {
- for (User *V : OldPN->users()) {
- if (auto *SI = dyn_cast<StoreInst>(V)) {
- if (!SI->isSimple() || SI->getOperand(0) != OldPN)
- return nullptr;
- } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
- // Verify it's a B->A cast.
- Type *TyB = BCI->getOperand(0)->getType();
- Type *TyA = BCI->getType();
- if (TyA != DestTy || TyB != SrcTy)
- return nullptr;
- } else if (auto *PHI = dyn_cast<PHINode>(V)) {
- // As long as the user is another old PHI node, then even if we don't
- // rewrite it, the PHI web we're considering won't have any users
- // outside itself, so it'll be dead.
- if (OldPhiNodes.count(PHI) == 0)
- return nullptr;
- } else {
- return nullptr;
- }
- }
- }
-
- // For each old PHI node, create a corresponding new PHI node with a type A.
- SmallDenseMap<PHINode *, PHINode *> NewPNodes;
- for (auto *OldPN : OldPhiNodes) {
- Builder.SetInsertPoint(OldPN);
- PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
- NewPNodes[OldPN] = NewPN;
- }
-
- // Fill in the operands of new PHI nodes.
- for (auto *OldPN : OldPhiNodes) {
- PHINode *NewPN = NewPNodes[OldPN];
- for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
- Value *V = OldPN->getOperand(j);
- Value *NewV = nullptr;
- if (auto *C = dyn_cast<Constant>(V)) {
- NewV = ConstantExpr::getBitCast(C, DestTy);
- } else if (auto *LI = dyn_cast<LoadInst>(V)) {
- // Explicitly perform load combine to make sure no opposing transform
- // can remove the bitcast in the meantime and trigger an infinite loop.
- Builder.SetInsertPoint(LI);
- NewV = combineLoadToNewType(*LI, DestTy);
- // Remove the old load and its use in the old phi, which itself becomes
- // dead once the whole transform finishes.
- replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
- eraseInstFromFunction(*LI);
- } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
- NewV = BCI->getOperand(0);
- } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
- NewV = NewPNodes[PrevPN];
- }
- assert(NewV);
- NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
- }
- }
-
-  // Traverse all accumulated PHI nodes and process their users,
-  // which are Stores and BitCasts. Without this processing,
- // NewPHI nodes could be replicated and could lead to extra
- // moves generated after DeSSA.
- // If there is a store with type B, change it to type A.
-
-
- // Replace users of BitCast B->A with NewPHI. These will help
-  // later to get rid of a closure formed by the OldPHI nodes.
- Instruction *RetVal = nullptr;
- for (auto *OldPN : OldPhiNodes) {
- PHINode *NewPN = NewPNodes[OldPN];
+ // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
+ if (hasStoreUsersOnly(CI))
+ return nullptr;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(); // Type B
+ Type *DestTy = CI.getType(); // Type A
+
+ SmallVector<PHINode *, 4> PhiWorklist;
+ SmallSetVector<PHINode *, 4> OldPhiNodes;
+
+ // Find all of the A->B casts and PHI nodes.
+ // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
+ // OldPhiNodes is used to track all known PHI nodes, before adding a new
+ // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
+ PhiWorklist.push_back(PN);
+ OldPhiNodes.insert(PN);
+ while (!PhiWorklist.empty()) {
+ auto *OldPN = PhiWorklist.pop_back_val();
+ for (Value *IncValue : OldPN->incoming_values()) {
+ if (isa<Constant>(IncValue))
+ continue;
+
+ if (auto *LI = dyn_cast<LoadInst>(IncValue)) {
+        // If there is a sequence of one or more load instructions, where each
+        // loaded value is used as the address of a later load, a bitcast is
+        // necessary to change the value type, so don't optimize it. For
+        // simplicity we give up if the load address comes from another load.
+ Value *Addr = LI->getOperand(0);
+ if (Addr == &CI || isa<LoadInst>(Addr))
+ return nullptr;
+ if (LI->hasOneUse() && LI->isSimple())
+ continue;
+ // If a LoadInst has more than one use, changing the type of loaded
+ // value may create another bitcast.
+ return nullptr;
+ }
+
+ if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
+ if (OldPhiNodes.insert(PNode))
+ PhiWorklist.push_back(PNode);
+ continue;
+ }
+
+ auto *BCI = dyn_cast<BitCastInst>(IncValue);
+ // We can't handle other instructions.
+ if (!BCI)
+ return nullptr;
+
+      // Verify it's an A->B cast.
+ Type *TyA = BCI->getOperand(0)->getType();
+ Type *TyB = BCI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return nullptr;
+ }
+ }
+
+ // Check that each user of each old PHI node is something that we can
+ // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
+ for (auto *OldPN : OldPhiNodes) {
+ for (User *V : OldPN->users()) {
+ if (auto *SI = dyn_cast<StoreInst>(V)) {
+ if (!SI->isSimple() || SI->getOperand(0) != OldPN)
+ return nullptr;
+ } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ // Verify it's a B->A cast.
+ Type *TyB = BCI->getOperand(0)->getType();
+ Type *TyA = BCI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return nullptr;
+ } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+ // As long as the user is another old PHI node, then even if we don't
+ // rewrite it, the PHI web we're considering won't have any users
+ // outside itself, so it'll be dead.
+ if (OldPhiNodes.count(PHI) == 0)
+ return nullptr;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ // For each old PHI node, create a corresponding new PHI node with a type A.
+ SmallDenseMap<PHINode *, PHINode *> NewPNodes;
+ for (auto *OldPN : OldPhiNodes) {
+ Builder.SetInsertPoint(OldPN);
+ PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
+ NewPNodes[OldPN] = NewPN;
+ }
+
+ // Fill in the operands of new PHI nodes.
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
+ for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
+ Value *V = OldPN->getOperand(j);
+ Value *NewV = nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ NewV = ConstantExpr::getBitCast(C, DestTy);
+ } else if (auto *LI = dyn_cast<LoadInst>(V)) {
+ // Explicitly perform load combine to make sure no opposing transform
+ // can remove the bitcast in the meantime and trigger an infinite loop.
+ Builder.SetInsertPoint(LI);
+ NewV = combineLoadToNewType(*LI, DestTy);
+ // Remove the old load and its use in the old phi, which itself becomes
+ // dead once the whole transform finishes.
+ replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
+ eraseInstFromFunction(*LI);
+ } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ NewV = BCI->getOperand(0);
+ } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
+ NewV = NewPNodes[PrevPN];
+ }
+ assert(NewV);
+ NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
+ }
+ }
+
+  // Traverse all accumulated PHI nodes and process their users,
+  // which are Stores and BitCasts. Without this processing,
+ // NewPHI nodes could be replicated and could lead to extra
+ // moves generated after DeSSA.
+ // If there is a store with type B, change it to type A.
+
+
+ // Replace users of BitCast B->A with NewPHI. These will help
+  // later to get rid of a closure formed by the OldPHI nodes.
+ Instruction *RetVal = nullptr;
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
for (User *V : make_early_inc_range(OldPN->users())) {
- if (auto *SI = dyn_cast<StoreInst>(V)) {
- assert(SI->isSimple() && SI->getOperand(0) == OldPN);
- Builder.SetInsertPoint(SI);
- auto *NewBC =
- cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy));
- SI->setOperand(0, NewBC);
- Worklist.push(SI);
- assert(hasStoreUsersOnly(*NewBC));
- }
- else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
- Type *TyB = BCI->getOperand(0)->getType();
- Type *TyA = BCI->getType();
- assert(TyA == DestTy && TyB == SrcTy);
- (void) TyA;
- (void) TyB;
- Instruction *I = replaceInstUsesWith(*BCI, NewPN);
- if (BCI == &CI)
- RetVal = I;
- } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+ if (auto *SI = dyn_cast<StoreInst>(V)) {
+ assert(SI->isSimple() && SI->getOperand(0) == OldPN);
+ Builder.SetInsertPoint(SI);
+ auto *NewBC =
+ cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy));
+ SI->setOperand(0, NewBC);
+ Worklist.push(SI);
+ assert(hasStoreUsersOnly(*NewBC));
+ }
+ else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ Type *TyB = BCI->getOperand(0)->getType();
+ Type *TyA = BCI->getType();
+ assert(TyA == DestTy && TyB == SrcTy);
+ (void) TyA;
+ (void) TyB;
+ Instruction *I = replaceInstUsesWith(*BCI, NewPN);
+ if (BCI == &CI)
+ RetVal = I;
+ } else if (auto *PHI = dyn_cast<PHINode>(V)) {
assert(OldPhiNodes.contains(PHI));
- (void) PHI;
- } else {
- llvm_unreachable("all uses should be handled");
- }
- }
- }
-
- return RetVal;
-}
-
+ (void) PHI;
+ } else {
+ llvm_unreachable("all uses should be handled");
+ }
+ }
+ }
+
+ return RetVal;
+}
+
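A small hypothetical A = float / B = i32 case of the PHI retyping above:

  ; sketch only
  define float @bc_phi(i1 %c, float %a, float %b) {
  entry:
    br i1 %c, label %bb1, label %bb2
  bb1:
    %a.i = bitcast float %a to i32
    br label %join
  bb2:
    %b.i = bitcast float %b to i32
    br label %join
  join:
    %p = phi i32 [ %a.i, %bb1 ], [ %b.i, %bb2 ]
    %r = bitcast i32 %p to float
    ret float %r
    ; becomes: %p.f = phi float [ %a, %bb1 ], [ %b, %bb2 ],
    ; with uses of %r replaced by %p.f
  }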
Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
- // If the operands are integer typed then apply the integer transforms,
- // otherwise just apply the common ones.
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType();
- Type *DestTy = CI.getType();
-
- // Get rid of casts from one type to the same type. These are useless and can
- // be replaced by the operand.
- if (DestTy == Src->getType())
- return replaceInstUsesWith(CI, Src);
-
- if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy)) {
- PointerType *SrcPTy = cast<PointerType>(SrcTy);
- PointerType *DstPTy = cast<PointerType>(DestTy);
- Type *DstElTy = DstPTy->getElementType();
- Type *SrcElTy = SrcPTy->getElementType();
-
- // Casting pointers between the same type, but with different address spaces
- // is an addrspace cast rather than a bitcast.
- if ((DstElTy == SrcElTy) &&
- (DstPTy->getAddressSpace() != SrcPTy->getAddressSpace()))
- return new AddrSpaceCastInst(Src, DestTy);
-
-    // If we are casting an alloca to a pointer to a type of the same
- // size, rewrite the allocation instruction to allocate the "right" type.
- // There is no need to modify malloc calls because it is their bitcast that
- // needs to be cleaned up.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
- if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
- return V;
-
- // When the type pointed to is not sized the cast cannot be
- // turned into a gep.
- Type *PointeeType =
- cast<PointerType>(Src->getType()->getScalarType())->getElementType();
- if (!PointeeType->isSized())
- return nullptr;
-
- // If the source and destination are pointers, and this cast is equivalent
- // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
- // This can enhance SROA and other transforms that want type-safe pointers.
- unsigned NumZeros = 0;
- while (SrcElTy && SrcElTy != DstElTy) {
- SrcElTy = GetElementPtrInst::getTypeAtIndex(SrcElTy, (uint64_t)0);
- ++NumZeros;
- }
-
- // If we found a path from the src to dest, create the getelementptr now.
- if (SrcElTy == DstElTy) {
- SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
- GetElementPtrInst *GEP =
- GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs);
-
- // If the source pointer is dereferenceable, then assume it points to an
- // allocated object and apply "inbounds" to the GEP.
- bool CanBeNull;
- if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) {
- // In a non-default address space (not 0), a null pointer can not be
- // assumed inbounds, so ignore that case (dereferenceable_or_null).
- // The reason is that 'null' is not treated differently in these address
- // spaces, and we consequently ignore the 'gep inbounds' special case
- // for 'null' which allows 'inbounds' on 'null' if the indices are
- // zeros.
- if (SrcPTy->getAddressSpace() == 0 || !CanBeNull)
- GEP->setIsInBounds();
- }
- return GEP;
- }
- }
-
- if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
- // Beware: messing with this target-specific oddity may cause trouble.
- if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
- Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
- return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
- Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
- }
-
- if (isa<IntegerType>(SrcTy)) {
- // If this is a cast from an integer to vector, check to see if the input
- // is a trunc or zext of a bitcast from vector. If so, we can replace all
- // the casts with a shuffle and (potentially) a bitcast.
- if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
- CastInst *SrcCast = cast<CastInst>(Src);
- if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
- if (isa<VectorType>(BCIn->getOperand(0)->getType()))
- if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts(
- BCIn->getOperand(0), cast<VectorType>(DestTy), *this))
- return I;
- }
-
- // If the input is an 'or' instruction, we may be doing shifts and ors to
- // assemble the elements of the vector manually. Try to rip the code out
- // and replace it with insertelements.
- if (Value *V = optimizeIntegerToVectorInsertions(CI, *this))
- return replaceInstUsesWith(CI, V);
- }
- }
-
- if (FixedVectorType *SrcVTy = dyn_cast<FixedVectorType>(SrcTy)) {
- if (SrcVTy->getNumElements() == 1) {
- // If our destination is not a vector, then make this a straight
- // scalar-scalar cast.
- if (!DestTy->isVectorTy()) {
- Value *Elem =
- Builder.CreateExtractElement(Src,
- Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
- return CastInst::Create(Instruction::BitCast, Elem, DestTy);
- }
-
- // Otherwise, see if our source is an insert. If so, then use the scalar
- // component directly:
- // bitcast (inselt <1 x elt> V, X, 0) to <n x m> --> bitcast X to <n x m>
- if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
- return new BitCastInst(InsElt->getOperand(1), DestTy);
- }
- }
-
- if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {
- // Okay, we have (bitcast (shuffle ..)). Check to see if this is
- // a bitcast to a vector with the same # elts.
- Value *ShufOp0 = Shuf->getOperand(0);
- Value *ShufOp1 = Shuf->getOperand(1);
+ // If the operands are integer typed then apply the integer transforms,
+ // otherwise just apply the common ones.
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType();
+ Type *DestTy = CI.getType();
+
+ // Get rid of casts from one type to the same type. These are useless and can
+ // be replaced by the operand.
+ if (DestTy == Src->getType())
+ return replaceInstUsesWith(CI, Src);
+
+ if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy)) {
+ PointerType *SrcPTy = cast<PointerType>(SrcTy);
+ PointerType *DstPTy = cast<PointerType>(DestTy);
+ Type *DstElTy = DstPTy->getElementType();
+ Type *SrcElTy = SrcPTy->getElementType();
+
+ // Casting pointers between the same type, but with different address spaces
+ // is an addrspace cast rather than a bitcast.
+ if ((DstElTy == SrcElTy) &&
+ (DstPTy->getAddressSpace() != SrcPTy->getAddressSpace()))
+ return new AddrSpaceCastInst(Src, DestTy);
+
+    // If we are casting an alloca to a pointer to a type of the same
+ // size, rewrite the allocation instruction to allocate the "right" type.
+ // There is no need to modify malloc calls because it is their bitcast that
+ // needs to be cleaned up.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
+ if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
+ return V;
+
+ // When the type pointed to is not sized the cast cannot be
+ // turned into a gep.
+ Type *PointeeType =
+ cast<PointerType>(Src->getType()->getScalarType())->getElementType();
+ if (!PointeeType->isSized())
+ return nullptr;
+
+ // If the source and destination are pointers, and this cast is equivalent
+ // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
+ // This can enhance SROA and other transforms that want type-safe pointers.
+ unsigned NumZeros = 0;
+ while (SrcElTy && SrcElTy != DstElTy) {
+ SrcElTy = GetElementPtrInst::getTypeAtIndex(SrcElTy, (uint64_t)0);
+ ++NumZeros;
+ }
+
+ // If we found a path from the src to dest, create the getelementptr now.
+ if (SrcElTy == DstElTy) {
+ SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
+ GetElementPtrInst *GEP =
+ GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs);
+
+ // If the source pointer is dereferenceable, then assume it points to an
+ // allocated object and apply "inbounds" to the GEP.
+ bool CanBeNull;
+ if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) {
+ // In a non-default address space (not 0), a null pointer can not be
+ // assumed inbounds, so ignore that case (dereferenceable_or_null).
+ // The reason is that 'null' is not treated differently in these address
+ // spaces, and we consequently ignore the 'gep inbounds' special case
+ // for 'null' which allows 'inbounds' on 'null' if the indices are
+ // zeros.
+ if (SrcPTy->getAddressSpace() == 0 || !CanBeNull)
+ GEP->setIsInBounds();
+ }
+ return GEP;
+ }
+ }
+
+ if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
+ // Beware: messing with this target-specific oddity may cause trouble.
+ if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
+ Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
+ return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
+ Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+ }
+
+ if (isa<IntegerType>(SrcTy)) {
+ // If this is a cast from an integer to vector, check to see if the input
+ // is a trunc or zext of a bitcast from vector. If so, we can replace all
+ // the casts with a shuffle and (potentially) a bitcast.
+ if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
+ CastInst *SrcCast = cast<CastInst>(Src);
+ if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
+ if (isa<VectorType>(BCIn->getOperand(0)->getType()))
+ if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts(
+ BCIn->getOperand(0), cast<VectorType>(DestTy), *this))
+ return I;
+ }
+
+ // If the input is an 'or' instruction, we may be doing shifts and ors to
+ // assemble the elements of the vector manually. Try to rip the code out
+ // and replace it with insertelements.
+ if (Value *V = optimizeIntegerToVectorInsertions(CI, *this))
+ return replaceInstUsesWith(CI, V);
+ }
+ }
+
+ if (FixedVectorType *SrcVTy = dyn_cast<FixedVectorType>(SrcTy)) {
+ if (SrcVTy->getNumElements() == 1) {
+ // If our destination is not a vector, then make this a straight
+ // scalar-scalar cast.
+ if (!DestTy->isVectorTy()) {
+ Value *Elem =
+ Builder.CreateExtractElement(Src,
+ Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+ return CastInst::Create(Instruction::BitCast, Elem, DestTy);
+ }
+
+ // Otherwise, see if our source is an insert. If so, then use the scalar
+ // component directly:
+ // bitcast (inselt <1 x elt> V, X, 0) to <n x m> --> bitcast X to <n x m>
+ if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
+ return new BitCastInst(InsElt->getOperand(1), DestTy);
+ }
+ }
+
+ if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {
+ // Okay, we have (bitcast (shuffle ..)). Check to see if this is
+ // a bitcast to a vector with the same # elts.
+ Value *ShufOp0 = Shuf->getOperand(0);
+ Value *ShufOp1 = Shuf->getOperand(1);
auto ShufElts = cast<VectorType>(Shuf->getType())->getElementCount();
auto SrcVecElts = cast<VectorType>(ShufOp0->getType())->getElementCount();
- if (Shuf->hasOneUse() && DestTy->isVectorTy() &&
+ if (Shuf->hasOneUse() && DestTy->isVectorTy() &&
cast<VectorType>(DestTy)->getElementCount() == ShufElts &&
ShufElts == SrcVecElts) {
- BitCastInst *Tmp;
- // If either of the operands is a cast from CI.getType(), then
- // evaluating the shuffle in the casted destination's type will allow
- // us to eliminate at least one cast.
- if (((Tmp = dyn_cast<BitCastInst>(ShufOp0)) &&
- Tmp->getOperand(0)->getType() == DestTy) ||
- ((Tmp = dyn_cast<BitCastInst>(ShufOp1)) &&
- Tmp->getOperand(0)->getType() == DestTy)) {
- Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy);
- Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy);
- // Return a new shuffle vector. Use the same element ID's, as we
- // know the vector types match #elts.
- return new ShuffleVectorInst(LHS, RHS, Shuf->getShuffleMask());
- }
- }
-
- // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as
- // a byte-swap:
- // bitcast <N x i8> (shuf X, undef, <N, N-1,...0>) --> bswap (bitcast X)
- // TODO: We should match the related pattern for bitreverse.
- if (DestTy->isIntegerTy() &&
- DL.isLegalInteger(DestTy->getScalarSizeInBits()) &&
+ BitCastInst *Tmp;
+ // If either of the operands is a cast from CI.getType(), then
+ // evaluating the shuffle in the casted destination's type will allow
+ // us to eliminate at least one cast.
+ if (((Tmp = dyn_cast<BitCastInst>(ShufOp0)) &&
+ Tmp->getOperand(0)->getType() == DestTy) ||
+ ((Tmp = dyn_cast<BitCastInst>(ShufOp1)) &&
+ Tmp->getOperand(0)->getType() == DestTy)) {
+ Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy);
+ Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy);
+ // Return a new shuffle vector. Use the same element ID's, as we
+ // know the vector types match #elts.
+ return new ShuffleVectorInst(LHS, RHS, Shuf->getShuffleMask());
+ }
+ }
+
+ // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as
+ // a byte-swap:
+ // bitcast <N x i8> (shuf X, undef, <N, N-1,...0>) --> bswap (bitcast X)
+ // TODO: We should match the related pattern for bitreverse.
+ if (DestTy->isIntegerTy() &&
+ DL.isLegalInteger(DestTy->getScalarSizeInBits()) &&
SrcTy->getScalarSizeInBits() == 8 &&
ShufElts.getKnownMinValue() % 2 == 0 && Shuf->hasOneUse() &&
Shuf->isReverse()) {
- assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask");
- assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op");
- Function *Bswap =
- Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy);
- Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy);
- return IntrinsicInst::Create(Bswap, { ScalarX });
- }
- }
-
- // Handle the A->B->A cast when there is an intervening PHI node.
- if (PHINode *PN = dyn_cast<PHINode>(Src))
- if (Instruction *I = optimizeBitCastFromPhi(CI, PN))
- return I;
-
- if (Instruction *I = canonicalizeBitCastExtElt(CI, *this))
- return I;
-
- if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder))
- return I;
-
- if (Instruction *I = foldBitCastSelect(CI, Builder))
- return I;
-
- if (SrcTy->isPointerTy())
- return commonPointerCastTransforms(CI);
- return commonCastTransforms(CI);
-}
-
+ assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask");
+ assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op");
+ Function *Bswap =
+ Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy);
+ Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy);
+ return IntrinsicInst::Create(Bswap, { ScalarX });
+ }
+ }
+
+ // Handle the A->B->A cast when there is an intervening PHI node.
+ if (PHINode *PN = dyn_cast<PHINode>(Src))
+ if (Instruction *I = optimizeBitCastFromPhi(CI, PN))
+ return I;
+
+ if (Instruction *I = canonicalizeBitCastExtElt(CI, *this))
+ return I;
+
+ if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder))
+ return I;
+
+ if (Instruction *I = foldBitCastSelect(CI, Builder))
+ return I;
+
+ if (SrcTy->isPointerTy())
+ return commonPointerCastTransforms(CI);
+ return commonCastTransforms(CI);
+}
+
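
The byte-reversing-shuffle fold above turns a bitcast of a reversed <N x i8> shuffle into a single bswap. A minimal standalone C++ sketch of the underlying equivalence (illustrative helper names, not code from this file): reversing the bytes in memory and then reinterpreting them as an integer gives the same result as reinterpreting first and byte-swapping.

#include <cstdint>
#include <cstring>

// Portable byte swap for a 32-bit value.
static uint32_t bswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0x0000FF00u) | ((V << 8) & 0x00FF0000u) |
         (V << 24);
}

// Models "bitcast <4 x i8> (shuf X, undef, <3,2,1,0>) to i32": reverse the
// bytes in memory, then reinterpret them as an integer.
static uint32_t reverseThenBitcast(const uint8_t (&X)[4]) {
  uint8_t Rev[4] = {X[3], X[2], X[1], X[0]};
  uint32_t V;
  std::memcpy(&V, Rev, sizeof(V));
  return V;
}

// Models "bswap (bitcast X to i32)": reinterpret first, then byte-swap.
static uint32_t bitcastThenBswap(const uint8_t (&X)[4]) {
  uint32_t V;
  std::memcpy(&V, X, sizeof(V));
  return bswap32(V);
}
// The two functions agree for every input, on either endianness, which is why
// the shuffle+bitcast pair can be rewritten as one bswap intrinsic call.
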
Instruction *InstCombinerImpl::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
- // If the destination pointer element type is not the same as the source's,
- // first do a bitcast to the destination type, and then the addrspacecast.
- // This allows the cast to be exposed to other transforms.
- Value *Src = CI.getOperand(0);
- PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
- PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());
-
- Type *DestElemTy = DestTy->getElementType();
- if (SrcTy->getElementType() != DestElemTy) {
- Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
+ // If the destination pointer element type is not the same as the source's,
+ // first do a bitcast to the destination type, and then the addrspacecast.
+ // This allows the cast to be exposed to other transforms.
+ Value *Src = CI.getOperand(0);
+ PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
+ PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());
+
+ Type *DestElemTy = DestTy->getElementType();
+ if (SrcTy->getElementType() != DestElemTy) {
+ Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
// Handle vectors of pointers.
if (VectorType *VT = dyn_cast<VectorType>(CI.getType()))
MidTy = VectorType::get(MidTy, VT->getElementCount());
-
- Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
- return new AddrSpaceCastInst(NewBitCast, CI.getType());
- }
-
- return commonPointerCastTransforms(CI);
-}
+
+ Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
+ return new AddrSpaceCastInst(NewBitCast, CI.getType());
+ }
+
+ return commonPointerCastTransforms(CI);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp
index ff56c39d78..cd9a036179 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1,1946 +1,1946 @@
-//===- InstCombineCompares.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitICmp and visitFCmp functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
+//===- InstCombineCompares.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitICmp and visitFCmp functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-// How many times is a select replaced by one of its operands?
-STATISTIC(NumSel, "Number of select opts");
-
-
-/// Compute Result = In1+In2, returning true if the result overflowed for this
-/// type.
-static bool addWithOverflow(APInt &Result, const APInt &In1,
- const APInt &In2, bool IsSigned = false) {
- bool Overflow;
- if (IsSigned)
- Result = In1.sadd_ov(In2, Overflow);
- else
- Result = In1.uadd_ov(In2, Overflow);
-
- return Overflow;
-}
-
-/// Compute Result = In1-In2, returning true if the result overflowed for this
-/// type.
-static bool subWithOverflow(APInt &Result, const APInt &In1,
- const APInt &In2, bool IsSigned = false) {
- bool Overflow;
- if (IsSigned)
- Result = In1.ssub_ov(In2, Overflow);
- else
- Result = In1.usub_ov(In2, Overflow);
-
- return Overflow;
-}
-
-/// Given an icmp instruction, return true if any use of this comparison is a
-/// branch on sign bit comparison.
-static bool hasBranchUse(ICmpInst &I) {
- for (auto *U : I.users())
- if (isa<BranchInst>(U))
- return true;
- return false;
-}
-
-/// Returns true if the exploded icmp can be expressed as a signed comparison
-/// to zero and updates the predicate accordingly.
-/// The signedness of the comparison is preserved.
-/// TODO: Refactor with decomposeBitTestICmp()?
-static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
- if (!ICmpInst::isSigned(Pred))
- return false;
-
- if (C.isNullValue())
- return ICmpInst::isRelational(Pred);
-
- if (C.isOneValue()) {
- if (Pred == ICmpInst::ICMP_SLT) {
- Pred = ICmpInst::ICMP_SLE;
- return true;
- }
- } else if (C.isAllOnesValue()) {
- if (Pred == ICmpInst::ICMP_SGT) {
- Pred = ICmpInst::ICMP_SGE;
- return true;
- }
- }
-
- return false;
-}
-
-/// This is called when we see this pattern:
-/// cmp pred (load (gep GV, ...)), cmpcst
-/// where GV is a global variable with a constant initializer. Try to simplify
-/// this into some simple computation that does not need the load. For example
-/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
-///
-/// If AndCst is non-null, then the loaded value is masked with that constant
-/// before doing the comparison. This handles cases like "A[i]&4 == 0".
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+// How many times is a select replaced by one of its operands?
+STATISTIC(NumSel, "Number of select opts");
+
+
+/// Compute Result = In1+In2, returning true if the result overflowed for this
+/// type.
+static bool addWithOverflow(APInt &Result, const APInt &In1,
+ const APInt &In2, bool IsSigned = false) {
+ bool Overflow;
+ if (IsSigned)
+ Result = In1.sadd_ov(In2, Overflow);
+ else
+ Result = In1.uadd_ov(In2, Overflow);
+
+ return Overflow;
+}
+
+/// Compute Result = In1-In2, returning true if the result overflowed for this
+/// type.
+static bool subWithOverflow(APInt &Result, const APInt &In1,
+ const APInt &In2, bool IsSigned = false) {
+ bool Overflow;
+ if (IsSigned)
+ Result = In1.ssub_ov(In2, Overflow);
+ else
+ Result = In1.usub_ov(In2, Overflow);
+
+ return Overflow;
+}
+
+/// Given an icmp instruction, return true if any use of this comparison is a
+/// branch on sign bit comparison.
+static bool hasBranchUse(ICmpInst &I) {
+ for (auto *U : I.users())
+ if (isa<BranchInst>(U))
+ return true;
+ return false;
+}
+
+/// Returns true if the exploded icmp can be expressed as a signed comparison
+/// to zero and updates the predicate accordingly.
+/// The signedness of the comparison is preserved.
+/// TODO: Refactor with decomposeBitTestICmp()?
+static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
+ if (!ICmpInst::isSigned(Pred))
+ return false;
+
+ if (C.isNullValue())
+ return ICmpInst::isRelational(Pred);
+
+ if (C.isOneValue()) {
+ if (Pred == ICmpInst::ICMP_SLT) {
+ Pred = ICmpInst::ICMP_SLE;
+ return true;
+ }
+ } else if (C.isAllOnesValue()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ Pred = ICmpInst::ICMP_SGE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
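
A quick sketch of the predicate rewrite performed by isSignTest (illustrative functions, not part of the patched file): for signed integers, the strict compare against an adjacent constant is the same test as the non-strict compare against zero.

#include <cstdint>

static bool sltOne(int32_t X) { return X < 1; }       // icmp slt X, 1
static bool sleZero(int32_t X) { return X <= 0; }     // icmp sle X, 0

static bool sgtMinusOne(int32_t X) { return X > -1; } // icmp sgt X, -1
static bool sgeZero(int32_t X) { return X >= 0; }     // icmp sge X, 0
// sltOne/sleZero agree on every int32_t value, as do sgtMinusOne/sgeZero, so
// both comparisons can be expressed as a sign test against zero.
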
+/// This is called when we see this pattern:
+/// cmp pred (load (gep GV, ...)), cmpcst
+/// where GV is a global variable with a constant initializer. Try to simplify
+/// this into some simple computation that does not need the load. For example
+/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
+///
+/// If AndCst is non-null, then the loaded value is masked with that constant
+/// before doing the comparison. This handles cases like "A[i]&4 == 0".
Instruction *
InstCombinerImpl::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
GlobalVariable *GV, CmpInst &ICI,
ConstantInt *AndCst) {
- Constant *Init = GV->getInitializer();
- if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
- return nullptr;
-
- uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
- // Don't blow up on huge arrays.
- if (ArrayElementCount > MaxArraySizeForCombine)
- return nullptr;
-
- // There are many forms of this optimization we can handle, for now, just do
- // the simple index into a single-dimensional array.
- //
- // Require: GEP GV, 0, i {{, constant indices}}
- if (GEP->getNumOperands() < 3 ||
- !isa<ConstantInt>(GEP->getOperand(1)) ||
- !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
- isa<Constant>(GEP->getOperand(2)))
- return nullptr;
-
- // Check that indices after the variable are constants and in-range for the
- // type they index. Collect the indices. This is typically for arrays of
- // structs.
- SmallVector<unsigned, 4> LaterIndices;
-
- Type *EltTy = Init->getType()->getArrayElementType();
- for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
- ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!Idx) return nullptr; // Variable index.
-
- uint64_t IdxVal = Idx->getZExtValue();
- if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index.
-
- if (StructType *STy = dyn_cast<StructType>(EltTy))
- EltTy = STy->getElementType(IdxVal);
- else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
- if (IdxVal >= ATy->getNumElements()) return nullptr;
- EltTy = ATy->getElementType();
- } else {
- return nullptr; // Unknown type.
- }
-
- LaterIndices.push_back(IdxVal);
- }
-
- enum { Overdefined = -3, Undefined = -2 };
-
- // Variables for our state machines.
-
- // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
- // "i == 47 | i == 87", where 47 is the first index the condition is true for,
- // and 87 is the second (and last) index. FirstTrueElement is -2 when
- // undefined, otherwise set to the first true element. SecondTrueElement is
- // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
- int FirstTrueElement = Undefined, SecondTrueElement = Undefined;
-
- // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
- // form "i != 47 & i != 87". Same state transitions as for true elements.
- int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
-
- /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
- /// define a state machine that triggers for ranges of values that the index
- /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'.
- /// This is -2 when undefined, -3 when overdefined, and otherwise the last
- /// index in the range (inclusive). We use -2 for undefined here because we
- /// use relative comparisons and don't want 0-1 to match -1.
- int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
-
- // MagicBitvector - This is a magic bitvector where we set a bit if the
- // comparison is true for element 'i'. If there are 64 elements or less in
- // the array, this will fully represent all the comparison results.
- uint64_t MagicBitvector = 0;
-
- // Scan the array and see if one of our patterns matches.
- Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
- for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
- Constant *Elt = Init->getAggregateElement(i);
- if (!Elt) return nullptr;
-
- // If this is indexing an array of structures, get the structure element.
- if (!LaterIndices.empty())
- Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
-
- // If the element is masked, handle it.
- if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
-
- // Find out if the comparison would be true or false for the i'th element.
- Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
- CompareRHS, DL, &TLI);
- // If the result is undef for this element, ignore it.
- if (isa<UndefValue>(C)) {
- // Extend range state machines to cover this element in case there is an
- // undef in the middle of the range.
- if (TrueRangeEnd == (int)i-1)
- TrueRangeEnd = i;
- if (FalseRangeEnd == (int)i-1)
- FalseRangeEnd = i;
- continue;
- }
-
- // If we can't compute the result for any of the elements, we have to give
- // up evaluating the entire conditional.
- if (!isa<ConstantInt>(C)) return nullptr;
-
- // Otherwise, we know if the comparison is true or false for this element,
- // update our state machines.
- bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
-
- // State machine for single/double/range index comparison.
- if (IsTrueForElt) {
- // Update the TrueElement state machine.
- if (FirstTrueElement == Undefined)
- FirstTrueElement = TrueRangeEnd = i; // First true element.
- else {
- // Update double-compare state machine.
- if (SecondTrueElement == Undefined)
- SecondTrueElement = i;
- else
- SecondTrueElement = Overdefined;
-
- // Update range state machine.
- if (TrueRangeEnd == (int)i-1)
- TrueRangeEnd = i;
- else
- TrueRangeEnd = Overdefined;
- }
- } else {
- // Update the FalseElement state machine.
- if (FirstFalseElement == Undefined)
- FirstFalseElement = FalseRangeEnd = i; // First false element.
- else {
- // Update double-compare state machine.
- if (SecondFalseElement == Undefined)
- SecondFalseElement = i;
- else
- SecondFalseElement = Overdefined;
-
- // Update range state machine.
- if (FalseRangeEnd == (int)i-1)
- FalseRangeEnd = i;
- else
- FalseRangeEnd = Overdefined;
- }
- }
-
- // If this element is in range, update our magic bitvector.
- if (i < 64 && IsTrueForElt)
- MagicBitvector |= 1ULL << i;
-
- // If all of our states become overdefined, bail out early. Since the
- // predicate is expensive, only check it every 8 elements. This is only
- // really useful for really huge arrays.
- if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
- SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
- FalseRangeEnd == Overdefined)
- return nullptr;
- }
-
- // Now that we've scanned the entire array, emit our new comparison(s). We
- // order the state machines in complexity of the generated code.
- Value *Idx = GEP->getOperand(2);
-
- // If the index is larger than the pointer size of the target, truncate the
- // index down like the GEP would do implicitly. We don't have to do this for
- // an inbounds GEP because the index can't be out of range.
- if (!GEP->isInBounds()) {
- Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
- unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
+ Constant *Init = GV->getInitializer();
+ if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
+ return nullptr;
+
+ uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+ // Don't blow up on huge arrays.
+ if (ArrayElementCount > MaxArraySizeForCombine)
+ return nullptr;
+
+ // There are many forms of this optimization we can handle, for now, just do
+ // the simple index into a single-dimensional array.
+ //
+ // Require: GEP GV, 0, i {{, constant indices}}
+ if (GEP->getNumOperands() < 3 ||
+ !isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+ isa<Constant>(GEP->getOperand(2)))
+ return nullptr;
+
+ // Check that indices after the variable are constants and in-range for the
+ // type they index. Collect the indices. This is typically for arrays of
+ // structs.
+ SmallVector<unsigned, 4> LaterIndices;
+
+ Type *EltTy = Init->getType()->getArrayElementType();
+ for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
+ ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!Idx) return nullptr; // Variable index.
+
+ uint64_t IdxVal = Idx->getZExtValue();
+ if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index.
+
+ if (StructType *STy = dyn_cast<StructType>(EltTy))
+ EltTy = STy->getElementType(IdxVal);
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
+ if (IdxVal >= ATy->getNumElements()) return nullptr;
+ EltTy = ATy->getElementType();
+ } else {
+ return nullptr; // Unknown type.
+ }
+
+ LaterIndices.push_back(IdxVal);
+ }
+
+ enum { Overdefined = -3, Undefined = -2 };
+
+ // Variables for our state machines.
+
+ // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
+ // "i == 47 | i == 87", where 47 is the first index the condition is true for,
+ // and 87 is the second (and last) index. FirstTrueElement is -2 when
+ // undefined, otherwise set to the first true element. SecondTrueElement is
+ // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
+ int FirstTrueElement = Undefined, SecondTrueElement = Undefined;
+
+ // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
+ // form "i != 47 & i != 87". Same state transitions as for true elements.
+ int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
+
+ /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
+ /// define a state machine that triggers for ranges of values that the index
+ /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'.
+ /// This is -2 when undefined, -3 when overdefined, and otherwise the last
+ /// index in the range (inclusive). We use -2 for undefined here because we
+ /// use relative comparisons and don't want 0-1 to match -1.
+ int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
+
+ // MagicBitvector - This is a magic bitvector where we set a bit if the
+ // comparison is true for element 'i'. If there are 64 elements or less in
+ // the array, this will fully represent all the comparison results.
+ uint64_t MagicBitvector = 0;
+
+ // Scan the array and see if one of our patterns matches.
+ Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
+ for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
+ Constant *Elt = Init->getAggregateElement(i);
+ if (!Elt) return nullptr;
+
+ // If this is indexing an array of structures, get the structure element.
+ if (!LaterIndices.empty())
+ Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
+
+ // If the element is masked, handle it.
+ if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
+
+ // Find out if the comparison would be true or false for the i'th element.
+ Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
+ CompareRHS, DL, &TLI);
+ // If the result is undef for this element, ignore it.
+ if (isa<UndefValue>(C)) {
+ // Extend range state machines to cover this element in case there is an
+ // undef in the middle of the range.
+ if (TrueRangeEnd == (int)i-1)
+ TrueRangeEnd = i;
+ if (FalseRangeEnd == (int)i-1)
+ FalseRangeEnd = i;
+ continue;
+ }
+
+ // If we can't compute the result for any of the elements, we have to give
+ // up evaluating the entire conditional.
+ if (!isa<ConstantInt>(C)) return nullptr;
+
+ // Otherwise, we know if the comparison is true or false for this element,
+ // update our state machines.
+ bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
+
+ // State machine for single/double/range index comparison.
+ if (IsTrueForElt) {
+ // Update the TrueElement state machine.
+ if (FirstTrueElement == Undefined)
+ FirstTrueElement = TrueRangeEnd = i; // First true element.
+ else {
+ // Update double-compare state machine.
+ if (SecondTrueElement == Undefined)
+ SecondTrueElement = i;
+ else
+ SecondTrueElement = Overdefined;
+
+ // Update range state machine.
+ if (TrueRangeEnd == (int)i-1)
+ TrueRangeEnd = i;
+ else
+ TrueRangeEnd = Overdefined;
+ }
+ } else {
+ // Update the FalseElement state machine.
+ if (FirstFalseElement == Undefined)
+ FirstFalseElement = FalseRangeEnd = i; // First false element.
+ else {
+ // Update double-compare state machine.
+ if (SecondFalseElement == Undefined)
+ SecondFalseElement = i;
+ else
+ SecondFalseElement = Overdefined;
+
+ // Update range state machine.
+ if (FalseRangeEnd == (int)i-1)
+ FalseRangeEnd = i;
+ else
+ FalseRangeEnd = Overdefined;
+ }
+ }
+
+ // If this element is in range, update our magic bitvector.
+ if (i < 64 && IsTrueForElt)
+ MagicBitvector |= 1ULL << i;
+
+ // If all of our states become overdefined, bail out early. Since the
+ // predicate is expensive, only check it every 8 elements. This is only
+ // really useful for really huge arrays.
+ if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
+ SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
+ FalseRangeEnd == Overdefined)
+ return nullptr;
+ }
+
+ // Now that we've scanned the entire array, emit our new comparison(s). We
+ // order the state machines in complexity of the generated code.
+ Value *Idx = GEP->getOperand(2);
+
+ // If the index is larger than the pointer size of the target, truncate the
+ // index down like the GEP would do implicitly. We don't have to do this for
+ // an inbounds GEP because the index can't be out of range.
+ if (!GEP->isInBounds()) {
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
+ unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
if (Idx->getType()->getPrimitiveSizeInBits().getFixedSize() > PtrSize)
- Idx = Builder.CreateTrunc(Idx, IntPtrTy);
- }
-
- // If the comparison is only true for one or two elements, emit direct
- // comparisons.
- if (SecondTrueElement != Overdefined) {
- // None true -> false.
- if (FirstTrueElement == Undefined)
- return replaceInstUsesWith(ICI, Builder.getFalse());
-
- Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
-
- // True for one element -> 'i == 47'.
- if (SecondTrueElement == Undefined)
- return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
-
- // True for two elements -> 'i == 47 | i == 72'.
- Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
- Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
- Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
- return BinaryOperator::CreateOr(C1, C2);
- }
-
- // If the comparison is only false for one or two elements, emit direct
- // comparisons.
- if (SecondFalseElement != Overdefined) {
- // None false -> true.
- if (FirstFalseElement == Undefined)
- return replaceInstUsesWith(ICI, Builder.getTrue());
-
- Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
-
- // False for one element -> 'i != 47'.
- if (SecondFalseElement == Undefined)
- return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
-
- // False for two elements -> 'i != 47 & i != 72'.
- Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
- Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
- Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
- return BinaryOperator::CreateAnd(C1, C2);
- }
-
- // If the comparison can be replaced with a range comparison for the elements
- // where it is true, emit the range check.
- if (TrueRangeEnd != Overdefined) {
- assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
-
- // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
- if (FirstTrueElement) {
- Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
- Idx = Builder.CreateAdd(Idx, Offs);
- }
-
- Value *End = ConstantInt::get(Idx->getType(),
- TrueRangeEnd-FirstTrueElement+1);
- return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
- }
-
- // False range check.
- if (FalseRangeEnd != Overdefined) {
- assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
- // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
- if (FirstFalseElement) {
- Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
- Idx = Builder.CreateAdd(Idx, Offs);
- }
-
- Value *End = ConstantInt::get(Idx->getType(),
- FalseRangeEnd-FirstFalseElement);
- return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
- }
-
- // If a magic bitvector captures the entire comparison state
- // of this load, replace it with computation that does:
- // ((magic_cst >> i) & 1) != 0
- {
- Type *Ty = nullptr;
-
- // Look for an appropriate type:
- // - The type of Idx if the magic fits
- // - The smallest fitting legal type
- if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
- Ty = Idx->getType();
- else
- Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
-
- if (Ty) {
- Value *V = Builder.CreateIntCast(Idx, Ty, false);
- V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
- V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
- return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
- }
- }
-
- return nullptr;
-}
-
-/// Return a value that can be used to compare the *offset* implied by a GEP to
-/// zero. For example, if we have &A[i], we want to return 'i' for
-/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
-/// are involved. The above expression would also be legal to codegen as
-/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
-/// This latter form is less amenable to optimization though, and we are allowed
-/// to generate the first by knowing that pointer arithmetic doesn't overflow.
-///
-/// If we can't emit an optimized form for this expression, this returns null.
-///
+ Idx = Builder.CreateTrunc(Idx, IntPtrTy);
+ }
+
+ // If the comparison is only true for one or two elements, emit direct
+ // comparisons.
+ if (SecondTrueElement != Overdefined) {
+ // None true -> false.
+ if (FirstTrueElement == Undefined)
+ return replaceInstUsesWith(ICI, Builder.getFalse());
+
+ Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
+
+ // True for one element -> 'i == 47'.
+ if (SecondTrueElement == Undefined)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
+
+ // True for two elements -> 'i == 47 | i == 72'.
+ Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
+ Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
+ Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
+ return BinaryOperator::CreateOr(C1, C2);
+ }
+
+ // If the comparison is only false for one or two elements, emit direct
+ // comparisons.
+ if (SecondFalseElement != Overdefined) {
+ // None false -> true.
+ if (FirstFalseElement == Undefined)
+ return replaceInstUsesWith(ICI, Builder.getTrue());
+
+ Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
+
+ // False for one element -> 'i != 47'.
+ if (SecondFalseElement == Undefined)
+ return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
+
+ // False for two elements -> 'i != 47 & i != 72'.
+ Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
+ Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
+ Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
+ return BinaryOperator::CreateAnd(C1, C2);
+ }
+
+ // If the comparison can be replaced with a range comparison for the elements
+ // where it is true, emit the range check.
+ if (TrueRangeEnd != Overdefined) {
+ assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
+
+ // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
+ if (FirstTrueElement) {
+ Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
+ Idx = Builder.CreateAdd(Idx, Offs);
+ }
+
+ Value *End = ConstantInt::get(Idx->getType(),
+ TrueRangeEnd-FirstTrueElement+1);
+ return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
+ }
+
+ // False range check.
+ if (FalseRangeEnd != Overdefined) {
+ assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
+ // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
+ if (FirstFalseElement) {
+ Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
+ Idx = Builder.CreateAdd(Idx, Offs);
+ }
+
+ Value *End = ConstantInt::get(Idx->getType(),
+ FalseRangeEnd-FirstFalseElement);
+ return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
+ }
+
+ // If a magic bitvector captures the entire comparison state
+ // of this load, replace it with computation that does:
+ // ((magic_cst >> i) & 1) != 0
+ {
+ Type *Ty = nullptr;
+
+ // Look for an appropriate type:
+ // - The type of Idx if the magic fits
+ // - The smallest fitting legal type
+ if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
+ Ty = Idx->getType();
+ else
+ Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
+
+ if (Ty) {
+ Value *V = Builder.CreateIntCast(Idx, Ty, false);
+ V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+ V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
+ return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+ }
+ }
+
+ return nullptr;
+}
+
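
The closing "magic bitvector" case above replaces the load+compare with ((magic_cst >> i) & 1) != 0. A small standalone C++ sketch of that encoding (helper names are illustrative, and it assumes at most 64 array elements, as the code above does):

#include <cstddef>
#include <cstdint>

// Precompute one bit per element: bit I is set iff "Init[I] == CmpRHS".
static uint64_t buildMagicBitvector(const int *Init, size_t N, int CmpRHS) {
  uint64_t Magic = 0;
  for (size_t I = 0; I < N && I < 64; ++I)
    if (Init[I] == CmpRHS)
      Magic |= uint64_t(1) << I;
  return Magic;
}

// Answers "Init[Idx] == CmpRHS" as "((magic_cst >> Idx) & 1) != 0".
static bool foldedCompare(uint64_t Magic, uint64_t Idx) {
  return ((Magic >> Idx) & 1) != 0;
}
// e.g. for Init = {1, 9, 1, 1} and CmpRHS = 1 the mask is 0b1101, so
// foldedCompare(0b1101, 2) is true while foldedCompare(0b1101, 1) is false.
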
+/// Return a value that can be used to compare the *offset* implied by a GEP to
+/// zero. For example, if we have &A[i], we want to return 'i' for
+/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
+/// are involved. The above expression would also be legal to codegen as
+/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
+/// This latter form is less amenable to optimization though, and we are allowed
+/// to generate the first by knowing that pointer arithmetic doesn't overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+///
static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC,
- const DataLayout &DL) {
- gep_type_iterator GTI = gep_type_begin(GEP);
-
- // Check to see if this gep only has a single variable index. If so, and if
- // any constant indices are a multiple of its scale, then we can compute this
- // in terms of the scale of the variable index. For example, if the GEP
- // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
- // because the expression will cross zero at the same point.
- unsigned i, e = GEP->getNumOperands();
- int64_t Offset = 0;
- for (i = 1; i != e; ++i, ++GTI) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
- // Compute the aggregate offset of constant indices.
- if (CI->isZero()) continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
- } else {
- uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
- Offset += Size*CI->getSExtValue();
- }
- } else {
- // Found our variable index.
- break;
- }
- }
-
- // If there are no variable indices, we must have a constant offset, just
- // evaluate it the general way.
- if (i == e) return nullptr;
-
- Value *VariableIdx = GEP->getOperand(i);
- // Determine the scale factor of the variable element. For example, this is
- // 4 if the variable index is into an array of i32.
- uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType());
-
- // Verify that there are no other variable indices. If there are, emit the hard way.
- for (++i, ++GTI; i != e; ++i, ++GTI) {
- ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!CI) return nullptr;
-
- // Compute the aggregate offset of constant indices.
- if (CI->isZero()) continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
- } else {
- uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
- Offset += Size*CI->getSExtValue();
- }
- }
-
- // Okay, we know we have a single variable index, which must be a
- // pointer/array/vector index. If there is no offset, life is simple, return
- // the index.
- Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType());
- unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
- if (Offset == 0) {
- // Cast to intptrty in case a truncation occurs. If an extension is needed,
- // we don't need to bother extending: the extension won't affect where the
- // computation crosses zero.
+ const DataLayout &DL) {
+ gep_type_iterator GTI = gep_type_begin(GEP);
+
+ // Check to see if this gep only has a single variable index. If so, and if
+ // any constant indices are a multiple of its scale, then we can compute this
+ // in terms of the scale of the variable index. For example, if the GEP
+ // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+ // because the expression will cross zero at the same point.
+ unsigned i, e = GEP->getNumOperands();
+ int64_t Offset = 0;
+ for (i = 1; i != e; ++i, ++GTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ } else {
+ // Found our variable index.
+ break;
+ }
+ }
+
+ // If there are no variable indices, we must have a constant offset, just
+ // evaluate it the general way.
+ if (i == e) return nullptr;
+
+ Value *VariableIdx = GEP->getOperand(i);
+ // Determine the scale factor of the variable element. For example, this is
+ // 4 if the variable index is into an array of i32.
+ uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType());
+
+ // Verify that there are no other variable indices. If there are, emit the hard way.
+ for (++i, ++GTI; i != e; ++i, ++GTI) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!CI) return nullptr;
+
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ }
+
+ // Okay, we know we have a single variable index, which must be a
+ // pointer/array/vector index. If there is no offset, life is simple, return
+ // the index.
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType());
+ unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
+ if (Offset == 0) {
+ // Cast to intptrty in case a truncation occurs. If an extension is needed,
+ // we don't need to bother extending: the extension won't affect where the
+ // computation crosses zero.
if (VariableIdx->getType()->getPrimitiveSizeInBits().getFixedSize() >
IntPtrWidth) {
- VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
- }
- return VariableIdx;
- }
-
- // Otherwise, there is an index. The computation we will do will be modulo
- // the pointer size.
- Offset = SignExtend64(Offset, IntPtrWidth);
- VariableScale = SignExtend64(VariableScale, IntPtrWidth);
-
- // To do this transformation, any constant index must be a multiple of the
- // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
- // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
- // multiple of the variable scale.
- int64_t NewOffs = Offset / (int64_t)VariableScale;
- if (Offset != NewOffs*(int64_t)VariableScale)
- return nullptr;
-
- // Okay, we can do this evaluation. Start by converting the index to intptr.
- if (VariableIdx->getType() != IntPtrTy)
- VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
- true /*Signed*/);
- Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
- return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
-}
-
-/// Returns true if we can rewrite Start as a GEP with pointer Base
-/// and some integer offset. The nodes that need to be re-written
-/// for this transformation will be added to Explored.
-static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
- const DataLayout &DL,
- SetVector<Value *> &Explored) {
- SmallVector<Value *, 16> WorkList(1, Start);
- Explored.insert(Base);
-
- // The following traversal gives us an order which can be used
- // when doing the final transformation. Since in the final
- // transformation we create the PHI replacement instructions first,
- // we don't have to get them in any particular order.
- //
- // However, for other instructions we will have to traverse the
- // operands of an instruction first, which means that we have to
- // do a post-order traversal.
- while (!WorkList.empty()) {
- SetVector<PHINode *> PHIs;
-
- while (!WorkList.empty()) {
- if (Explored.size() >= 100)
- return false;
-
- Value *V = WorkList.back();
-
+ VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
+ }
+ return VariableIdx;
+ }
+
+ // Otherwise, there is an index. The computation we will do will be modulo
+ // the pointer size.
+ Offset = SignExtend64(Offset, IntPtrWidth);
+ VariableScale = SignExtend64(VariableScale, IntPtrWidth);
+
+ // To do this transformation, any constant index must be a multiple of the
+ // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
+ // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
+ // multiple of the variable scale.
+ int64_t NewOffs = Offset / (int64_t)VariableScale;
+ if (Offset != NewOffs*(int64_t)VariableScale)
+ return nullptr;
+
+ // Okay, we can do this evaluation. Start by converting the index to intptr.
+ if (VariableIdx->getType() != IntPtrTy)
+ VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
+ true /*Signed*/);
+ Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+ return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
+}
+
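
The scale argument used by evaluateGEPOffsetExpression can be checked with a tiny sketch (plain-integer stand-ins, illustrative names): an offset of the form Offset + Scale*i crosses zero exactly where Offset/Scale + i does, provided Offset is a multiple of Scale.

#include <cassert>
#include <cstdint>

static bool gepOffsetIsZero(int64_t Offset, int64_t Scale, int64_t I) {
  return Offset + Scale * I == 0;       // e.g. "12 + i*4"
}

static bool reducedOffsetIsZero(int64_t Offset, int64_t Scale, int64_t I) {
  assert(Scale != 0 && Offset % Scale == 0 && "offset must divide evenly");
  return Offset / Scale + I == 0;       // e.g. "3 + i"
}
// gepOffsetIsZero(12, 4, -3) and reducedOffsetIsZero(12, 4, -3) are both
// true; no such reduction exists for "10 + 3*i", which is why the code above
// checks divisibility before folding.
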
+/// Returns true if we can rewrite Start as a GEP with pointer Base
+/// and some integer offset. The nodes that need to be re-written
+/// for this transformation will be added to Explored.
+static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ SmallVector<Value *, 16> WorkList(1, Start);
+ Explored.insert(Base);
+
+ // The following traversal gives us an order which can be used
+ // when doing the final transformation. Since in the final
+ // transformation we create the PHI replacement instructions first,
+ // we don't have to get them in any particular order.
+ //
+ // However, for other instructions we will have to traverse the
+ // operands of an instruction first, which means that we have to
+ // do a post-order traversal.
+ while (!WorkList.empty()) {
+ SetVector<PHINode *> PHIs;
+
+ while (!WorkList.empty()) {
+ if (Explored.size() >= 100)
+ return false;
+
+ Value *V = WorkList.back();
+
if (Explored.contains(V)) {
- WorkList.pop_back();
- continue;
- }
-
- if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
- !isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
- // We've found some value that we can't explore which is different from
- // the base. Therefore we can't do this transformation.
- return false;
-
- if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) {
+ WorkList.pop_back();
+ continue;
+ }
+
+ if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
+ !isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
+ // We've found some value that we can't explore which is different from
+ // the base. Therefore we can't do this transformation.
+ return false;
+
+ if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) {
auto *CI = cast<CastInst>(V);
- if (!CI->isNoopCast(DL))
- return false;
-
- if (Explored.count(CI->getOperand(0)) == 0)
- WorkList.push_back(CI->getOperand(0));
- }
-
- if (auto *GEP = dyn_cast<GEPOperator>(V)) {
- // We're limiting the GEP to having one index. This will preserve
- // the original pointer type. We could handle more cases in the
- // future.
- if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
- GEP->getType() != Start->getType())
- return false;
-
- if (Explored.count(GEP->getOperand(0)) == 0)
- WorkList.push_back(GEP->getOperand(0));
- }
-
- if (WorkList.back() == V) {
- WorkList.pop_back();
- // We've finished visiting this node, mark it as such.
- Explored.insert(V);
- }
-
- if (auto *PN = dyn_cast<PHINode>(V)) {
- // We cannot transform PHIs on unsplittable basic blocks.
- if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
- return false;
- Explored.insert(PN);
- PHIs.insert(PN);
- }
- }
-
- // Explore the PHI nodes further.
- for (auto *PN : PHIs)
- for (Value *Op : PN->incoming_values())
- if (Explored.count(Op) == 0)
- WorkList.push_back(Op);
- }
-
- // Make sure that we can do this. Since we can't insert GEPs in a basic
- // block before a PHI node, we can't easily do this transformation if
- // we have PHI node users of transformed instructions.
- for (Value *Val : Explored) {
- for (Value *Use : Val->uses()) {
-
- auto *PHI = dyn_cast<PHINode>(Use);
- auto *Inst = dyn_cast<Instruction>(Val);
-
- if (Inst == Base || Inst == PHI || !Inst || !PHI ||
- Explored.count(PHI) == 0)
- continue;
-
- if (PHI->getParent() == Inst->getParent())
- return false;
- }
- }
- return true;
-}
-
-// Sets the appropriate insert point on Builder where we can add
-// a replacement Instruction for V (if that is possible).
-static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
- bool Before = true) {
- if (auto *PHI = dyn_cast<PHINode>(V)) {
- Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt());
- return;
- }
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (!Before)
- I = &*std::next(I->getIterator());
- Builder.SetInsertPoint(I);
- return;
- }
- if (auto *A = dyn_cast<Argument>(V)) {
- // Set the insertion point in the entry block.
- BasicBlock &Entry = A->getParent()->getEntryBlock();
- Builder.SetInsertPoint(&*Entry.getFirstInsertionPt());
- return;
- }
- // Otherwise, this is a constant and we don't need to set a new
- // insertion point.
- assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
-}
-
-/// Returns a re-written value of Start as an indexed GEP using Base as a
-/// pointer.
-static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
- const DataLayout &DL,
- SetVector<Value *> &Explored) {
- // Perform all the substitutions. This is a bit tricky because we can
- // have cycles in our use-def chains.
- // 1. Create the PHI nodes without any incoming values.
- // 2. Create all the other values.
- // 3. Add the edges for the PHI nodes.
- // 4. Emit GEPs to get the original pointers.
- // 5. Remove the original instructions.
- Type *IndexType = IntegerType::get(
- Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));
-
- DenseMap<Value *, Value *> NewInsts;
- NewInsts[Base] = ConstantInt::getNullValue(IndexType);
-
- // Create the new PHI nodes, without adding any incoming values.
- for (Value *Val : Explored) {
- if (Val == Base)
- continue;
- // Create empty phi nodes. This avoids cyclic dependencies when creating
- // the remaining instructions.
- if (auto *PHI = dyn_cast<PHINode>(Val))
- NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
- PHI->getName() + ".idx", PHI);
- }
- IRBuilder<> Builder(Base->getContext());
-
- // Create all the other instructions.
- for (Value *Val : Explored) {
-
- if (NewInsts.find(Val) != NewInsts.end())
- continue;
-
- if (auto *CI = dyn_cast<CastInst>(Val)) {
- // Don't get rid of the intermediate variable here; the store can grow
- // the map which will invalidate the reference to the input value.
- Value *V = NewInsts[CI->getOperand(0)];
- NewInsts[CI] = V;
- continue;
- }
- if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
- Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
- : GEP->getOperand(1);
- setInsertionPoint(Builder, GEP);
- // Indices might need to be sign extended. GEPs will magically do
- // this, but we need to do it ourselves here.
- if (Index->getType()->getScalarSizeInBits() !=
- NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
- Index = Builder.CreateSExtOrTrunc(
- Index, NewInsts[GEP->getOperand(0)]->getType(),
- GEP->getOperand(0)->getName() + ".sext");
- }
-
- auto *Op = NewInsts[GEP->getOperand(0)];
- if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
- NewInsts[GEP] = Index;
- else
- NewInsts[GEP] = Builder.CreateNSWAdd(
- Op, Index, GEP->getOperand(0)->getName() + ".add");
- continue;
- }
- if (isa<PHINode>(Val))
- continue;
-
- llvm_unreachable("Unexpected instruction type");
- }
-
- // Add the incoming values to the PHI nodes.
- for (Value *Val : Explored) {
- if (Val == Base)
- continue;
- // All the instructions have been created, we can now add edges to the
- // phi nodes.
- if (auto *PHI = dyn_cast<PHINode>(Val)) {
- PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]);
- for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
- Value *NewIncoming = PHI->getIncomingValue(I);
-
- if (NewInsts.find(NewIncoming) != NewInsts.end())
- NewIncoming = NewInsts[NewIncoming];
-
- NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
- }
- }
- }
-
- for (Value *Val : Explored) {
- if (Val == Base)
- continue;
-
- // Depending on the type, for external users we have to emit
- // a GEP or a GEP + ptrtoint.
- setInsertionPoint(Builder, Val, false);
-
- // If required, create an inttoptr instruction for Base.
- Value *NewBase = Base;
- if (!Base->getType()->isPointerTy())
- NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
- Start->getName() + "to.ptr");
-
- Value *GEP = Builder.CreateInBoundsGEP(
- Start->getType()->getPointerElementType(), NewBase,
- makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
-
- if (!Val->getType()->isPointerTy()) {
- Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
- Val->getName() + ".conv");
- GEP = Cast;
- }
- Val->replaceAllUsesWith(GEP);
- }
-
- return NewInsts[Start];
-}
-
-/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express
-/// the input Value as a constant indexed GEP. Returns a pair containing
- /// the GEP's Pointer and Index.
-static std::pair<Value *, Value *>
-getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
- Type *IndexType = IntegerType::get(V->getContext(),
- DL.getIndexTypeSizeInBits(V->getType()));
-
- Constant *Index = ConstantInt::getNullValue(IndexType);
- while (true) {
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
- // We accept only inbounds GEPs here to exclude the possibility of
- // overflow.
- if (!GEP->isInBounds())
- break;
- if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
- GEP->getType() == V->getType()) {
- V = GEP->getOperand(0);
- Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
- Index = ConstantExpr::getAdd(
- Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
- continue;
- }
- break;
- }
- if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
- if (!CI->isNoopCast(DL))
- break;
- V = CI->getOperand(0);
- continue;
- }
- if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
- if (!CI->isNoopCast(DL))
- break;
- V = CI->getOperand(0);
- continue;
- }
- break;
- }
- return {V, Index};
-}
-
-/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
-/// We can look through PHIs, GEPs and casts in order to determine a common base
-/// between GEPLHS and RHS.
-static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
- ICmpInst::Predicate Cond,
- const DataLayout &DL) {
- // FIXME: Support vector of pointers.
- if (GEPLHS->getType()->isVectorTy())
- return nullptr;
-
- if (!GEPLHS->hasAllConstantIndices())
- return nullptr;
-
- // Make sure the pointers have the same type.
- if (GEPLHS->getType() != RHS->getType())
- return nullptr;
-
- Value *PtrBase, *Index;
- std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
-
- // The set of nodes that will take part in this transformation.
- SetVector<Value *> Nodes;
-
- if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
- return nullptr;
-
- // We know we can re-write this as
- // (gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)
- // Since we've only looked through inbounds GEPs we know that we
- // can't have overflow on either side. We can therefore re-write
- // this as:
- // OFFSET1 cmp OFFSET2
- Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
-
- // RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
- // GEP having PtrBase as the pointer base, and has returned in NewRHS the
- // offset. Since Index is the offset of LHS to the base pointer, we will now
- // compare the offsets instead of comparing the pointers.
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
-}
-
-/// Fold comparisons between a GEP instruction and something else. At this point
-/// we know that the GEP is on the LHS of the comparison.
+ if (!CI->isNoopCast(DL))
+ return false;
+
+ if (Explored.count(CI->getOperand(0)) == 0)
+ WorkList.push_back(CI->getOperand(0));
+ }
+
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+ // We're limiting the GEP to having one index. This will preserve
+ // the original pointer type. We could handle more cases in the
+ // future.
+ if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
+ GEP->getType() != Start->getType())
+ return false;
+
+ if (Explored.count(GEP->getOperand(0)) == 0)
+ WorkList.push_back(GEP->getOperand(0));
+ }
+
+ if (WorkList.back() == V) {
+ WorkList.pop_back();
+ // We've finished visiting this node, mark it as such.
+ Explored.insert(V);
+ }
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ // We cannot transform PHIs on unsplittable basic blocks.
+ if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
+ return false;
+ Explored.insert(PN);
+ PHIs.insert(PN);
+ }
+ }
+
+ // Explore the PHI nodes further.
+ for (auto *PN : PHIs)
+ for (Value *Op : PN->incoming_values())
+ if (Explored.count(Op) == 0)
+ WorkList.push_back(Op);
+ }
+
+ // Make sure that we can do this. Since we can't insert GEPs in a basic
+ // block before a PHI node, we can't easily do this transformation if
+ // we have PHI node users of transformed instructions.
+ for (Value *Val : Explored) {
+ for (Value *Use : Val->uses()) {
+
+ auto *PHI = dyn_cast<PHINode>(Use);
+ auto *Inst = dyn_cast<Instruction>(Val);
+
+ if (Inst == Base || Inst == PHI || !Inst || !PHI ||
+ Explored.count(PHI) == 0)
+ continue;
+
+ if (PHI->getParent() == Inst->getParent())
+ return false;
+ }
+ }
+ return true;
+}
+
+// Sets the appropriate insert point on Builder where we can add
+// a replacement Instruction for V (if that is possible).
+static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
+ bool Before = true) {
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt());
+ return;
+ }
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (!Before)
+ I = &*std::next(I->getIterator());
+ Builder.SetInsertPoint(I);
+ return;
+ }
+ if (auto *A = dyn_cast<Argument>(V)) {
+ // Set the insertion point in the entry block.
+ BasicBlock &Entry = A->getParent()->getEntryBlock();
+ Builder.SetInsertPoint(&*Entry.getFirstInsertionPt());
+ return;
+ }
+ // Otherwise, this is a constant and we don't need to set a new
+ // insertion point.
+ assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
+}
+
+/// Returns a re-written value of Start as an indexed GEP using Base as a
+/// pointer.
+static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ // Perform all the substitutions. This is a bit tricky because we can
+ // have cycles in our use-def chains.
+ // 1. Create the PHI nodes without any incoming values.
+ // 2. Create all the other values.
+ // 3. Add the edges for the PHI nodes.
+ // 4. Emit GEPs to get the original pointers.
+ // 5. Remove the original instructions.
+ Type *IndexType = IntegerType::get(
+ Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));
+
+ DenseMap<Value *, Value *> NewInsts;
+ NewInsts[Base] = ConstantInt::getNullValue(IndexType);
+
+ // Create the new PHI nodes, without adding any incoming values.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // Create empty phi nodes. This avoids cyclic dependencies when creating
+ // the remaining instructions.
+ if (auto *PHI = dyn_cast<PHINode>(Val))
+ NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
+ PHI->getName() + ".idx", PHI);
+ }
+ IRBuilder<> Builder(Base->getContext());
+
+ // Create all the other instructions.
+ for (Value *Val : Explored) {
+
+ if (NewInsts.find(Val) != NewInsts.end())
+ continue;
+
+ if (auto *CI = dyn_cast<CastInst>(Val)) {
+ // Don't get rid of the intermediate variable here; the store can grow
+ // the map which will invalidate the reference to the input value.
+ Value *V = NewInsts[CI->getOperand(0)];
+ NewInsts[CI] = V;
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
+ Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
+ : GEP->getOperand(1);
+ setInsertionPoint(Builder, GEP);
+ // Indices might need to be sign extended. GEPs will magically do
+ // this, but we need to do it ourselves here.
+ if (Index->getType()->getScalarSizeInBits() !=
+ NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
+ Index = Builder.CreateSExtOrTrunc(
+ Index, NewInsts[GEP->getOperand(0)]->getType(),
+ GEP->getOperand(0)->getName() + ".sext");
+ }
+
+ auto *Op = NewInsts[GEP->getOperand(0)];
+ if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
+ NewInsts[GEP] = Index;
+ else
+ NewInsts[GEP] = Builder.CreateNSWAdd(
+ Op, Index, GEP->getOperand(0)->getName() + ".add");
+ continue;
+ }
+ if (isa<PHINode>(Val))
+ continue;
+
+ llvm_unreachable("Unexpected instruction type");
+ }
+
+ // Add the incoming values to the PHI nodes.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // All the instructions have been created, we can now add edges to the
+ // phi nodes.
+ if (auto *PHI = dyn_cast<PHINode>(Val)) {
+ PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]);
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
+ Value *NewIncoming = PHI->getIncomingValue(I);
+
+ if (NewInsts.find(NewIncoming) != NewInsts.end())
+ NewIncoming = NewInsts[NewIncoming];
+
+ NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
+ }
+ }
+ }
+
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+
+ // Depending on the type, for external users we have to emit
+ // a GEP or a GEP + ptrtoint.
+ setInsertionPoint(Builder, Val, false);
+
+ // If required, create an inttoptr instruction for Base.
+ Value *NewBase = Base;
+ if (!Base->getType()->isPointerTy())
+ NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
+ Start->getName() + "to.ptr");
+
+ Value *GEP = Builder.CreateInBoundsGEP(
+ Start->getType()->getPointerElementType(), NewBase,
+ makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
+
+ if (!Val->getType()->isPointerTy()) {
+ Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
+ Val->getName() + ".conv");
+ GEP = Cast;
+ }
+ Val->replaceAllUsesWith(GEP);
+ }
+
+ return NewInsts[Start];
+}
+
+/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express
+/// the input Value as a constant indexed GEP. Returns a pair containing
+/// the GEP's Pointer and Index.

+static std::pair<Value *, Value *>
+getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
+ Type *IndexType = IntegerType::get(V->getContext(),
+ DL.getIndexTypeSizeInBits(V->getType()));
+
+ Constant *Index = ConstantInt::getNullValue(IndexType);
+ while (true) {
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ // We accept only inbounds GEPs here to exclude the possibility of
+ // overflow.
+ if (!GEP->isInBounds())
+ break;
+ if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
+ GEP->getType() == V->getType()) {
+ V = GEP->getOperand(0);
+ Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
+ Index = ConstantExpr::getAdd(
+ Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
+ continue;
+ }
+ break;
+ }
+ if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ break;
+ }
+ return {V, Index};
+}
+
+/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
+/// We can look through PHIs, GEPs and casts in order to determine a common base
+/// between GEPLHS and RHS.
+static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond,
+ const DataLayout &DL) {
+ // FIXME: Support vector of pointers.
+ if (GEPLHS->getType()->isVectorTy())
+ return nullptr;
+
+ if (!GEPLHS->hasAllConstantIndices())
+ return nullptr;
+
+ // Make sure the pointers have the same type.
+ if (GEPLHS->getType() != RHS->getType())
+ return nullptr;
+
+ Value *PtrBase, *Index;
+ std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
+
+ // The set of nodes that will take part in this transformation.
+ SetVector<Value *> Nodes;
+
+ if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
+ return nullptr;
+
+ // We know we can re-write this as
+ // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2))
+ // Since we've only looked through inbounds GEPs we know that we
+ // can't have overflow on either side. We can therefore re-write
+ // this as:
+ // OFFSET1 cmp OFFSET2
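+ // Hypothetical example: if RHS is reachable from the same base %p as GEPLHS,
+ //   icmp ult (gep inbounds i8, i8* %p, i64 8), %rhs
+ // becomes a compare of the two offsets,
+ //   icmp slt i64 8, %rhs.idx
+ // where %rhs.idx stands for the rewritten offset of %rhs produced above.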
+ Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
+
+ // rewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
+ // GEP having PtrBase as the pointer base, and has returned in NewRHS the
+ // offset. Since Index is the offset of LHS to the base pointer, we will now
+ // compare the offsets instead of comparing the pointers.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
+}
+
+/// Fold comparisons between a GEP instruction and something else. At this point
+/// we know that the GEP is on the LHS of the comparison.
Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
ICmpInst::Predicate Cond,
Instruction &I) {
- // Don't transform signed compares of GEPs into index compares. Even if the
- // GEP is inbounds, the final add of the base pointer can have signed overflow
- // and would change the result of the icmp.
- // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
- // the maximum signed value for the pointer type.
- if (ICmpInst::isSigned(Cond))
- return nullptr;
-
- // Look through bitcasts and addrspacecasts. We do not however want to remove
- // 0 GEPs.
- if (!isa<GetElementPtrInst>(RHS))
- RHS = RHS->stripPointerCasts();
-
- Value *PtrBase = GEPLHS->getOperand(0);
- // FIXME: Support vector pointer GEPs.
- if (PtrBase == RHS && GEPLHS->isInBounds() &&
- !GEPLHS->getType()->isVectorTy()) {
- // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
- // This transformation (ignoring the base and scales) is valid because we
- // know pointers can't overflow since the gep is inbounds. See if we can
- // output an optimized form.
- Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL);
-
- // If not, synthesize the offset the hard way.
- if (!Offset)
- Offset = EmitGEPOffset(GEPLHS);
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
- Constant::getNullValue(Offset->getType()));
- }
-
- if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) &&
- isa<Constant>(RHS) && cast<Constant>(RHS)->isNullValue() &&
- !NullPointerIsDefined(I.getFunction(),
- RHS->getType()->getPointerAddressSpace())) {
- // For most address spaces, an allocation can't be placed at null, but null
- // itself is treated as a 0 size allocation in the in bounds rules. Thus,
- // the only valid inbounds address derived from null is null itself.
- // Thus, we have four cases to consider:
- // 1) Base == nullptr, Offset == 0 -> inbounds, null
- // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds
- // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations)
- // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison)
- //
- // (Note if we're indexing a type of size 0, that simply collapses into one
- // of the buckets above.)
- //
- // In general, we're allowed to make values less poison (i.e. remove
- // sources of full UB), so in this case, we just select between the two
- // non-poison cases (1 and 4 above).
- //
- // For vectors, we apply the same reasoning on a per-lane basis.
- auto *Base = GEPLHS->getPointerOperand();
- if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) {
+ // Don't transform signed compares of GEPs into index compares. Even if the
+ // GEP is inbounds, the final add of the base pointer can have signed overflow
+ // and would change the result of the icmp.
+ // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
+ // the maximum signed value for the pointer type.
+ if (ICmpInst::isSigned(Cond))
+ return nullptr;
+
+ // Look through bitcasts and addrspacecasts. We do not however want to remove
+ // 0 GEPs.
+ if (!isa<GetElementPtrInst>(RHS))
+ RHS = RHS->stripPointerCasts();
+
+ Value *PtrBase = GEPLHS->getOperand(0);
+ // FIXME: Support vector pointer GEPs.
+ if (PtrBase == RHS && GEPLHS->isInBounds() &&
+ !GEPLHS->getType()->isVectorTy()) {
+ // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
+ // This transformation (ignoring the base and scales) is valid because we
+ // know pointers can't overflow since the gep is inbounds. See if we can
+ // output an optimized form.
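+ // Hypothetical example:
+ //   icmp ugt (gep inbounds i8, i8* %p, i64 %n), %p  -->  icmp sgt i64 %n, 0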
+ Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL);
+
+ // If not, synthesize the offset the hard way.
+ if (!Offset)
+ Offset = EmitGEPOffset(GEPLHS);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+ Constant::getNullValue(Offset->getType()));
+ }
+
+ if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) &&
+ isa<Constant>(RHS) && cast<Constant>(RHS)->isNullValue() &&
+ !NullPointerIsDefined(I.getFunction(),
+ RHS->getType()->getPointerAddressSpace())) {
+ // For most address spaces, an allocation can't be placed at null, but null
+ // itself is treated as a 0 size allocation in the in bounds rules. Thus,
+ // the only valid inbounds address derived from null is null itself.
+ // Thus, we have four cases to consider:
+ // 1) Base == nullptr, Offset == 0 -> inbounds, null
+ // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds
+ // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations)
+ // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison)
+ //
+ // (Note if we're indexing a type of size 0, that simply collapses into one
+ // of the buckets above.)
+ //
+ // In general, we're allowed to make values less poison (i.e. remove
+ // sources of full UB), so in this case, we just select between the two
+ // non-poison cases (1 and 4 above).
+ //
+ // For vectors, we apply the same reasoning on a per-lane basis.
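+ // Hypothetical example (default address space):
+ //   icmp eq (gep inbounds i8, i8* %base, i64 %n), null
+ //     -->  icmp eq i8* %base, null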
+ auto *Base = GEPLHS->getPointerOperand();
+ if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) {
auto EC = cast<VectorType>(GEPLHS->getType())->getElementCount();
Base = Builder.CreateVectorSplat(EC, Base);
- }
- return new ICmpInst(Cond, Base,
- ConstantExpr::getPointerBitCastOrAddrSpaceCast(
- cast<Constant>(RHS), Base->getType()));
- } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
- // If the base pointers are different, but the indices are the same, just
- // compare the base pointer.
- if (PtrBase != GEPRHS->getOperand(0)) {
- bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
- IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
- GEPRHS->getOperand(0)->getType();
- if (IndicesTheSame)
- for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
- if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
- IndicesTheSame = false;
- break;
- }
-
- // If all indices are the same, just compare the base pointers.
- Type *BaseType = GEPLHS->getOperand(0)->getType();
- if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
- return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
-
- // If we're comparing GEPs with two base pointers that only differ in type
- // and both GEPs have only constant indices or just one use, then fold
- // the compare with the adjusted indices.
- // FIXME: Support vector of pointers.
- if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
- (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
- (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
- PtrBase->stripPointerCasts() ==
- GEPRHS->getOperand(0)->stripPointerCasts() &&
- !GEPLHS->getType()->isVectorTy()) {
- Value *LOffset = EmitGEPOffset(GEPLHS);
- Value *ROffset = EmitGEPOffset(GEPRHS);
-
- // If we looked through an addrspacecast between different sized address
- // spaces, the LHS and RHS pointers are different sized
- // integers. Truncate to the smaller one.
- Type *LHSIndexTy = LOffset->getType();
- Type *RHSIndexTy = ROffset->getType();
- if (LHSIndexTy != RHSIndexTy) {
+ }
+ return new ICmpInst(Cond, Base,
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(RHS), Base->getType()));
+ } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
+ // If the base pointers are different, but the indices are the same, just
+ // compare the base pointer.
+ if (PtrBase != GEPRHS->getOperand(0)) {
+ bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+ IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+ GEPRHS->getOperand(0)->getType();
+ if (IndicesTheSame)
+ for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ IndicesTheSame = false;
+ break;
+ }
+
+ // If all indices are the same, just compare the base pointers.
+ Type *BaseType = GEPLHS->getOperand(0)->getType();
+ if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
+ return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+ // If we're comparing GEPs with two base pointers that only differ in type
+ // and both GEPs have only constant indices or just one use, then fold
+ // the compare with the adjusted indices.
+ // FIXME: Support vector of pointers.
+ if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
+ (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
+ (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
+ PtrBase->stripPointerCasts() ==
+ GEPRHS->getOperand(0)->stripPointerCasts() &&
+ !GEPLHS->getType()->isVectorTy()) {
+ Value *LOffset = EmitGEPOffset(GEPLHS);
+ Value *ROffset = EmitGEPOffset(GEPRHS);
+
+ // If we looked through an addrspacecast between different sized address
+ // spaces, the LHS and RHS pointers are different sized
+ // integers. Truncate to the smaller one.
+ Type *LHSIndexTy = LOffset->getType();
+ Type *RHSIndexTy = ROffset->getType();
+ if (LHSIndexTy != RHSIndexTy) {
if (LHSIndexTy->getPrimitiveSizeInBits().getFixedSize() <
RHSIndexTy->getPrimitiveSizeInBits().getFixedSize()) {
- ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
- } else
- LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
- }
-
- Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
- LOffset, ROffset);
- return replaceInstUsesWith(I, Cmp);
- }
-
- // Otherwise, the base pointers are different and the indices are
- // different. Try to convert this to an indexed compare by looking through
- // PHIs/casts.
- return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
- }
-
- // If one of the GEPs has all zero indices, recurse.
- // FIXME: Handle vector of pointers.
- if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices())
- return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
- ICmpInst::getSwappedPredicate(Cond), I);
-
- // If the other GEP has all zero indices, recurse.
- // FIXME: Handle vector of pointers.
- if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices())
- return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
-
- bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
- if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
- // If the GEPs only differ by one index, compare it.
- unsigned NumDifferences = 0; // Keep track of # differences.
- unsigned DiffOperand = 0; // The operand that differs.
- for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
- if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
- Type *LHSType = GEPLHS->getOperand(i)->getType();
- Type *RHSType = GEPRHS->getOperand(i)->getType();
- // FIXME: Better support for vector of pointers.
- if (LHSType->getPrimitiveSizeInBits() !=
- RHSType->getPrimitiveSizeInBits() ||
- (GEPLHS->getType()->isVectorTy() &&
- (!LHSType->isVectorTy() || !RHSType->isVectorTy()))) {
- // Irreconcilable differences.
- NumDifferences = 2;
- break;
- }
-
- if (NumDifferences++) break;
- DiffOperand = i;
- }
-
- if (NumDifferences == 0) // SAME GEP?
- return replaceInstUsesWith(I, // No comparison is needed here.
- ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));
-
- else if (NumDifferences == 1 && GEPsInBounds) {
- Value *LHSV = GEPLHS->getOperand(DiffOperand);
- Value *RHSV = GEPRHS->getOperand(DiffOperand);
- // Make sure we do a signed comparison here.
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
- }
- }
-
- // Only lower this if the icmp is the only user of the GEP or if we expect
- // the result to fold to a constant!
- if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
- (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
- // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)) ---> (OFFSET1 cmp OFFSET2)
- Value *L = EmitGEPOffset(GEPLHS);
- Value *R = EmitGEPOffset(GEPRHS);
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
- }
- }
-
- // Try to convert this to an indexed compare by looking through PHIs/casts as a
- // last resort.
- return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
-}
-
+ ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
+ } else
+ LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
+ }
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
+ LOffset, ROffset);
+ return replaceInstUsesWith(I, Cmp);
+ }
+
+ // Otherwise, the base pointers are different and the indices are
+ // different. Try to convert this to an indexed compare by looking through
+ // PHIs/casts.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
+ }
+
+ // If one of the GEPs has all zero indices, recurse.
+ // FIXME: Handle vector of pointers.
+ if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices())
+ return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+ ICmpInst::getSwappedPredicate(Cond), I);
+
+ // If the other GEP has all zero indices, recurse.
+ // FIXME: Handle vector of pointers.
+ if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices())
+ return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+ bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
+ if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+ // If the GEPs only differ by one index, compare it.
+ unsigned NumDifferences = 0; // Keep track of # differences.
+ unsigned DiffOperand = 0; // The operand that differs.
+ for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ Type *LHSType = GEPLHS->getOperand(i)->getType();
+ Type *RHSType = GEPRHS->getOperand(i)->getType();
+ // FIXME: Better support for vector of pointers.
+ if (LHSType->getPrimitiveSizeInBits() !=
+ RHSType->getPrimitiveSizeInBits() ||
+ (GEPLHS->getType()->isVectorTy() &&
+ (!LHSType->isVectorTy() || !RHSType->isVectorTy()))) {
+ // Irreconcilable differences.
+ NumDifferences = 2;
+ break;
+ }
+
+ if (NumDifferences++) break;
+ DiffOperand = i;
+ }
+
+ if (NumDifferences == 0) // SAME GEP?
+ return replaceInstUsesWith(I, // No comparison is needed here.
+ ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));
+
+ else if (NumDifferences == 1 && GEPsInBounds) {
+ Value *LHSV = GEPLHS->getOperand(DiffOperand);
+ Value *RHSV = GEPRHS->getOperand(DiffOperand);
+ // Make sure we do a signed comparison here.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+ }
+ }
+
+ // Only lower this if the icmp is the only user of the GEP or if we expect
+ // the result to fold to a constant!
+ if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+ (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+ // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)) ---> (OFFSET1 cmp OFFSET2)
+ Value *L = EmitGEPOffset(GEPLHS);
+ Value *R = EmitGEPOffset(GEPRHS);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+ }
+ }
+
+ // Try to convert this to an indexed compare by looking through PHIs/casts as a
+ // last resort.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
+}
+
Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI,
const AllocaInst *Alloca,
const Value *Other) {
- assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
-
- // It would be tempting to fold away comparisons between allocas and any
- // pointer not based on that alloca (e.g. an argument). However, even
- // though such pointers cannot alias, they can still compare equal.
- //
- // But LLVM doesn't specify where allocas get their memory, so if the alloca
- // doesn't escape we can argue that it's impossible to guess its value, and we
- // can therefore act as if any such guesses are wrong.
- //
- // The code below checks that the alloca doesn't escape, and that it's only
- // used in a comparison once (the current instruction). The
- // single-comparison-use condition ensures that we're trivially folding all
- // comparisons against the alloca consistently, and avoids the risk of
- // erroneously folding a comparison of the pointer with itself.
-
- unsigned MaxIter = 32; // Break cycles and bound to constant-time.
-
- SmallVector<const Use *, 32> Worklist;
- for (const Use &U : Alloca->uses()) {
- if (Worklist.size() >= MaxIter)
- return nullptr;
- Worklist.push_back(&U);
- }
-
- unsigned NumCmps = 0;
- while (!Worklist.empty()) {
- assert(Worklist.size() <= MaxIter);
- const Use *U = Worklist.pop_back_val();
- const Value *V = U->getUser();
- --MaxIter;
-
- if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) ||
- isa<SelectInst>(V)) {
- // Track the uses.
- } else if (isa<LoadInst>(V)) {
- // Loading from the pointer doesn't escape it.
- continue;
- } else if (const auto *SI = dyn_cast<StoreInst>(V)) {
- // Storing *to* the pointer is fine, but storing the pointer escapes it.
- if (SI->getValueOperand() == U->get())
- return nullptr;
- continue;
- } else if (isa<ICmpInst>(V)) {
- if (NumCmps++)
- return nullptr; // Found more than one cmp.
- continue;
- } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
- switch (Intrin->getIntrinsicID()) {
- // These intrinsics don't escape or compare the pointer. Memset is safe
- // because we don't allow ptrtoint. Memcpy and memmove are safe because
- // we don't allow stores, so src cannot point to V.
- case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
- case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset:
- continue;
- default:
- return nullptr;
- }
- } else {
- return nullptr;
- }
- for (const Use &U : V->uses()) {
- if (Worklist.size() >= MaxIter)
- return nullptr;
- Worklist.push_back(&U);
- }
- }
-
- Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
- return replaceInstUsesWith(
- ICI,
- ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
-}
-
-/// Fold "icmp pred (X+C), X".
+ assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
+
+ // It would be tempting to fold away comparisons between allocas and any
+ // pointer not based on that alloca (e.g. an argument). However, even
+ // though such pointers cannot alias, they can still compare equal.
+ //
+ // But LLVM doesn't specify where allocas get their memory, so if the alloca
+ // doesn't escape we can argue that it's impossible to guess its value, and we
+ // can therefore act as if any such guesses are wrong.
+ //
+ // The code below checks that the alloca doesn't escape, and that it's only
+ // used in a comparison once (the current instruction). The
+ // single-comparison-use condition ensures that we're trivially folding all
+ // comparisons against the alloca consistently, and avoids the risk of
+ // erroneously folding a comparison of the pointer with itself.
+
+ unsigned MaxIter = 32; // Break cycles and bound to constant-time.
+
+ SmallVector<const Use *, 32> Worklist;
+ for (const Use &U : Alloca->uses()) {
+ if (Worklist.size() >= MaxIter)
+ return nullptr;
+ Worklist.push_back(&U);
+ }
+
+ unsigned NumCmps = 0;
+ while (!Worklist.empty()) {
+ assert(Worklist.size() <= MaxIter);
+ const Use *U = Worklist.pop_back_val();
+ const Value *V = U->getUser();
+ --MaxIter;
+
+ if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) ||
+ isa<SelectInst>(V)) {
+ // Track the uses.
+ } else if (isa<LoadInst>(V)) {
+ // Loading from the pointer doesn't escape it.
+ continue;
+ } else if (const auto *SI = dyn_cast<StoreInst>(V)) {
+ // Storing *to* the pointer is fine, but storing the pointer escapes it.
+ if (SI->getValueOperand() == U->get())
+ return nullptr;
+ continue;
+ } else if (isa<ICmpInst>(V)) {
+ if (NumCmps++)
+ return nullptr; // Found more than one cmp.
+ continue;
+ } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrin->getIntrinsicID()) {
+ // These intrinsics don't escape or compare the pointer. Memset is safe
+ // because we don't allow ptrtoint. Memcpy and memmove are safe because
+ // we don't allow stores, so src cannot point to V.
+ case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
+ case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset:
+ continue;
+ default:
+ return nullptr;
+ }
+ } else {
+ return nullptr;
+ }
+ for (const Use &U : V->uses()) {
+ if (Worklist.size() >= MaxIter)
+ return nullptr;
+ Worklist.push_back(&U);
+ }
+ }
+
+ Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
+ return replaceInstUsesWith(
+ ICI,
+ ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
+}
+
+/// Fold "icmp pred (X+C), X".
Instruction *InstCombinerImpl::foldICmpAddOpConst(Value *X, const APInt &C,
ICmpInst::Predicate Pred) {
- // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
- // so the values can never be equal. Similarly for all other "or equals"
- // operators.
- assert(!!C && "C should not be zero!");
-
- // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255
- // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253
- // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0
- if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
- Constant *R = ConstantInt::get(X->getType(),
- APInt::getMaxValue(C.getBitWidth()) - C);
- return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
- }
-
- // (X+1) >u X --> X <u (0-1) --> X != 255
- // (X+2) >u X --> X <u (0-2) --> X <u 254
- // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0
- if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
- return new ICmpInst(ICmpInst::ICMP_ULT, X,
- ConstantInt::get(X->getType(), -C));
-
- APInt SMax = APInt::getSignedMaxValue(C.getBitWidth());
-
- // (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127
- // (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125
- // (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0
- // (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1
- // (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s 126
- // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127
- if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_SGT, X,
- ConstantInt::get(X->getType(), SMax - C));
-
- // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127
- // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126
- // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
- // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
- // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126
- // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128
-
- assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
- return new ICmpInst(ICmpInst::ICMP_SLT, X,
- ConstantInt::get(X->getType(), SMax - (C - 1)));
-}
-
-/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
-/// (icmp eq/ne A, Log2(AP2/AP1)) ->
-/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
+ // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
+ // so the values can never be equal. Similarly for all other "or equals"
+ // operators.
+ assert(!!C && "C should not be zero!");
+
+ // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255
+ // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253
+ // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0
+ if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
+ Constant *R = ConstantInt::get(X->getType(),
+ APInt::getMaxValue(C.getBitWidth()) - C);
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
+ }
+
+ // (X+1) >u X --> X <u (0-1) --> X != 255
+ // (X+2) >u X --> X <u (0-2) --> X <u 254
+ // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
+ return new ICmpInst(ICmpInst::ICMP_ULT, X,
+ ConstantInt::get(X->getType(), -C));
+
+ APInt SMax = APInt::getSignedMaxValue(C.getBitWidth());
+
+ // (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127
+ // (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125
+ // (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0
+ // (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1
+ // (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s 126
+ // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X,
+ ConstantInt::get(X->getType(), SMax - C));
+
+ // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127
+ // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126
+ // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
+ // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
+ // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126
+ // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128
+
+ assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
+ return new ICmpInst(ICmpInst::ICMP_SLT, X,
+ ConstantInt::get(X->getType(), SMax - (C - 1)));
+}
+
+/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
+/// (icmp eq/ne A, Log2(AP2/AP1)) ->
+/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
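+/// A hypothetical instance: icmp eq (lshr i32 64, %a), 8 --> icmp eq i32 %a, 3,
+/// since 64 >> 3 == 8.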
Instruction *InstCombinerImpl::foldICmpShrConstConst(ICmpInst &I, Value *A,
const APInt &AP1,
const APInt &AP2) {
- assert(I.isEquality() && "Cannot fold icmp gt/lt");
-
- auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
- if (I.getPredicate() == I.ICMP_NE)
- Pred = CmpInst::getInversePredicate(Pred);
- return new ICmpInst(Pred, LHS, RHS);
- };
-
- // Don't bother doing any work for cases which InstSimplify handles.
- if (AP2.isNullValue())
- return nullptr;
-
- bool IsAShr = isa<AShrOperator>(I.getOperand(0));
- if (IsAShr) {
- if (AP2.isAllOnesValue())
- return nullptr;
- if (AP2.isNegative() != AP1.isNegative())
- return nullptr;
- if (AP2.sgt(AP1))
- return nullptr;
- }
-
- if (!AP1)
- // 'A' must be large enough to shift out the highest set bit.
- return getICmp(I.ICMP_UGT, A,
- ConstantInt::get(A->getType(), AP2.logBase2()));
-
- if (AP1 == AP2)
- return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
-
- int Shift;
- if (IsAShr && AP1.isNegative())
- Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
- else
- Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();
-
- if (Shift > 0) {
- if (IsAShr && AP1 == AP2.ashr(Shift)) {
- // There are multiple solutions if we are comparing against -1 and the LHS
- // of the ashr is not a power of two.
- if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
- return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
- } else if (AP1 == AP2.lshr(Shift)) {
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
- }
- }
-
- // Shifting const2 will never be equal to const1.
- // FIXME: This should always be handled by InstSimplify?
- auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
- return replaceInstUsesWith(I, TorF);
-}
-
-/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
-/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2.isNullValue())
+ return nullptr;
+
+ bool IsAShr = isa<AShrOperator>(I.getOperand(0));
+ if (IsAShr) {
+ if (AP2.isAllOnesValue())
+ return nullptr;
+ if (AP2.isNegative() != AP1.isNegative())
+ return nullptr;
+ if (AP2.sgt(AP1))
+ return nullptr;
+ }
+
+ if (!AP1)
+ // 'A' must be large enough to shift out the highest set bit.
+ return getICmp(I.ICMP_UGT, A,
+ ConstantInt::get(A->getType(), AP2.logBase2()));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ int Shift;
+ if (IsAShr && AP1.isNegative())
+ Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
+ else
+ Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();
+
+ if (Shift > 0) {
+ if (IsAShr && AP1 == AP2.ashr(Shift)) {
+ // There are multiple solutions if we are comparing against -1 and the LHS
+ // of the ashr is not a power of two.
+ if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
+ return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ } else if (AP1 == AP2.lshr(Shift)) {
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ }
+ }
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
+/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
+/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
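+/// A hypothetical instance: icmp eq (shl i32 4, %a), 32 --> icmp eq i32 %a, 3,
+/// since 4 << 3 == 32.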
Instruction *InstCombinerImpl::foldICmpShlConstConst(ICmpInst &I, Value *A,
const APInt &AP1,
const APInt &AP2) {
- assert(I.isEquality() && "Cannot fold icmp gt/lt");
-
- auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
- if (I.getPredicate() == I.ICMP_NE)
- Pred = CmpInst::getInversePredicate(Pred);
- return new ICmpInst(Pred, LHS, RHS);
- };
-
- // Don't bother doing any work for cases which InstSimplify handles.
- if (AP2.isNullValue())
- return nullptr;
-
- unsigned AP2TrailingZeros = AP2.countTrailingZeros();
-
- if (!AP1 && AP2TrailingZeros != 0)
- return getICmp(
- I.ICMP_UGE, A,
- ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));
-
- if (AP1 == AP2)
- return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
-
- // Get the distance between the lowest bits that are set.
- int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;
-
- if (Shift > 0 && AP2.shl(Shift) == AP1)
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
-
- // Shifting const2 will never be equal to const1.
- // FIXME: This should always be handled by InstSimplify?
- auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
- return replaceInstUsesWith(I, TorF);
-}
-
-/// The caller has matched a pattern of the form:
-/// I = icmp ugt (add (add A, B), CI2), CI1
-/// If this is of the form:
-/// sum = a + b
-/// if (sum+128 >u 255)
-/// Then replace it with llvm.sadd.with.overflow.i8.
-///
-static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
- ConstantInt *CI2, ConstantInt *CI1,
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2.isNullValue())
+ return nullptr;
+
+ unsigned AP2TrailingZeros = AP2.countTrailingZeros();
+
+ if (!AP1 && AP2TrailingZeros != 0)
+ return getICmp(
+ I.ICMP_UGE, A,
+ ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ // Get the distance between the lowest bits that are set.
+ int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;
+
+ if (Shift > 0 && AP2.shl(Shift) == AP1)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
+/// The caller has matched a pattern of the form:
+/// I = icmp ugt (add (add A, B), CI2), CI1
+/// If this is of the form:
+/// sum = a + b
+/// if (sum+128 >u 255)
+/// Then replace it with llvm.sadd.with.overflow.i8.
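+/// A hypothetical IR sketch (%a and %b assumed to be i32 values sign-extended
+/// from i8):
+///   %sum = add i32 %a, %b
+///   %off = add i32 %sum, 128
+///   %cmp = icmp ugt i32 %off, 255
+/// becomes, roughly,
+///   %sadd = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %a.trunc, i8 %b.trunc)
+///   %cmp  = extractvalue { i8, i1 } %sadd, 1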
+///
+static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
+ ConstantInt *CI2, ConstantInt *CI1,
InstCombinerImpl &IC) {
- // The goal here is to transform this pattern into an
- // llvm.sadd.with.overflow. To do this, we have to replace the original add
- // with a narrower add, and discard the add-with-constant that is part of the
- // range check (if we can't eliminate it, this isn't profitable).
-
- // In order to eliminate the add-with-constant, the compare must be its only
- // use.
- Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
- if (!AddWithCst->hasOneUse())
- return nullptr;
-
- // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
- if (!CI2->getValue().isPowerOf2())
- return nullptr;
- unsigned NewWidth = CI2->getValue().countTrailingZeros();
- if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
- return nullptr;
-
- // The width of the new add formed is 1 more than the bias.
- ++NewWidth;
-
- // Check to see that CI1 is an all-ones value with NewWidth bits.
- if (CI1->getBitWidth() == NewWidth ||
- CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
- return nullptr;
-
- // This is only really a signed overflow check if the inputs have been
- // sign-extended; check for that condition. For example, if CI2 is 2^31 and
- // the operands of the add are 64 bits wide, we need at least 33 sign bits.
- unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
- if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
- IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
- return nullptr;
-
- // In order to replace the original add with a narrower
- // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
- // and truncates that discard the high bits of the add. Verify that this is
- // the case.
- Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
- for (User *U : OrigAdd->users()) {
- if (U == AddWithCst)
- continue;
-
- // Only accept truncates for now. We would really like a nice recursive
- // predicate like SimplifyDemandedBits, but one that goes down the use-def
- // chain to see which bits of a value are actually demanded. If the
- // original add had another add which was then immediately truncated, we
- // could still do the transformation.
- TruncInst *TI = dyn_cast<TruncInst>(U);
- if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
- return nullptr;
- }
-
- // If the pattern matches, truncate the inputs to the narrower type and
- // use the sadd_with_overflow intrinsic to efficiently compute both the
- // result and the overflow bit.
- Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
- Function *F = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::sadd_with_overflow, NewType);
-
- InstCombiner::BuilderTy &Builder = IC.Builder;
-
- // Put the new code above the original add, in case there are any uses of the
- // add between the add and the compare.
- Builder.SetInsertPoint(OrigAdd);
-
- Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc");
- Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc");
- CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd");
- Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result");
- Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType());
-
- // The inner add was the result of the narrow add, zero extended to the
- // wider type. Replace it with the result computed by the intrinsic.
- IC.replaceInstUsesWith(*OrigAdd, ZExt);
- IC.eraseInstFromFunction(*OrigAdd);
-
- // The original icmp gets replaced with the overflow value.
- return ExtractValueInst::Create(Call, 1, "sadd.overflow");
-}
-
-/// If we have:
-/// icmp eq/ne (urem/srem %x, %y), 0
-/// iff %y is a power-of-two, we can replace this with a bit test:
-/// icmp eq/ne (and %x, (add %y, -1)), 0
+ // The goal here is to transform this pattern into an
+ // llvm.sadd.with.overflow. To do this, we have to replace the original add
+ // with a narrower add, and discard the add-with-constant that is part of the
+ // range check (if we can't eliminate it, this isn't profitable).
+
+ // In order to eliminate the add-with-constant, the compare must be its only
+ // use.
+ Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
+ if (!AddWithCst->hasOneUse())
+ return nullptr;
+
+ // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
+ if (!CI2->getValue().isPowerOf2())
+ return nullptr;
+ unsigned NewWidth = CI2->getValue().countTrailingZeros();
+ if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
+ return nullptr;
+
+ // The width of the new add formed is 1 more than the bias.
+ ++NewWidth;
+
+ // Check to see that CI1 is an all-ones value with NewWidth bits.
+ if (CI1->getBitWidth() == NewWidth ||
+ CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
+ return nullptr;
+
+ // This is only really a signed overflow check if the inputs have been
+ // sign-extended; check for that condition. For example, if CI2 is 2^31 and
+ // the operands of the add are 64 bits wide, we need at least 33 sign bits.
+ unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
+ if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
+ IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
+ return nullptr;
+
+ // In order to replace the original add with a narrower
+ // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
+ // and truncates that discard the high bits of the add. Verify that this is
+ // the case.
+ Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
+ for (User *U : OrigAdd->users()) {
+ if (U == AddWithCst)
+ continue;
+
+ // Only accept truncates for now. We would really like a nice recursive
+ // predicate like SimplifyDemandedBits, but one that goes down the use-def
+ // chain to see which bits of a value are actually demanded. If the
+ // original add had another add which was then immediately truncated, we
+ // could still do the transformation.
+ TruncInst *TI = dyn_cast<TruncInst>(U);
+ if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
+ return nullptr;
+ }
+
+ // If the pattern matches, truncate the inputs to the narrower type and
+ // use the sadd_with_overflow intrinsic to efficiently compute both the
+ // result and the overflow bit.
+ Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
+ Function *F = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::sadd_with_overflow, NewType);
+
+ InstCombiner::BuilderTy &Builder = IC.Builder;
+
+ // Put the new code above the original add, in case there are any uses of the
+ // add between the add and the compare.
+ Builder.SetInsertPoint(OrigAdd);
+
+ Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc");
+ Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc");
+ CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd");
+ Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result");
+ Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType());
+
+ // The inner add was the result of the narrow add, zero extended to the
+ // wider type. Replace it with the result computed by the intrinsic.
+ IC.replaceInstUsesWith(*OrigAdd, ZExt);
+ IC.eraseInstFromFunction(*OrigAdd);
+
+ // The original icmp gets replaced with the overflow value.
+ return ExtractValueInst::Create(Call, 1, "sadd.overflow");
+}
+
+/// If we have:
+/// icmp eq/ne (urem/srem %x, %y), 0
+/// iff %y is a power-of-two, we can replace this with a bit test:
+/// icmp eq/ne (and %x, (add %y, -1)), 0
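+/// E.g. (hypothetical): icmp eq (urem i32 %x, 8), 0 --> icmp eq (and i32 %x, 7), 0.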
Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) {
- // This fold is only valid for equality predicates.
- if (!I.isEquality())
- return nullptr;
- ICmpInst::Predicate Pred;
- Value *X, *Y, *Zero;
- if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))),
- m_CombineAnd(m_Zero(), m_Value(Zero)))))
- return nullptr;
- if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I))
- return nullptr;
- // This may increase the instruction count; we don't require Y to be a constant.
- Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType()));
- Value *Masked = Builder.CreateAnd(X, Mask);
- return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero);
-}
-
-/// Fold equality-comparison between zero and any (maybe truncated) right-shift
-/// by one-less-than-bitwidth into a sign test on the original value.
+ // This fold is only valid for equality predicates.
+ if (!I.isEquality())
+ return nullptr;
+ ICmpInst::Predicate Pred;
+ Value *X, *Y, *Zero;
+ if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))),
+ m_CombineAnd(m_Zero(), m_Value(Zero)))))
+ return nullptr;
+ if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I))
+ return nullptr;
+ // This may increase the instruction count; we don't require Y to be a constant.
+ Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType()));
+ Value *Masked = Builder.CreateAnd(X, Mask);
+ return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero);
+}
+
+/// Fold equality-comparison between zero and any (maybe truncated) right-shift
+/// by one-less-than-bitwidth into a sign test on the original value.
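+/// E.g. (hypothetical): icmp eq (lshr i32 %x, 31), 0 --> icmp sge i32 %x, 0,
+/// and icmp ne (lshr i32 %x, 31), 0 --> icmp slt i32 %x, 0.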
Instruction *InstCombinerImpl::foldSignBitTest(ICmpInst &I) {
- Instruction *Val;
- ICmpInst::Predicate Pred;
- if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero())))
- return nullptr;
-
- Value *X;
- Type *XTy;
-
- Constant *C;
- if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) {
- XTy = X->getType();
- unsigned XBitWidth = XTy->getScalarSizeInBits();
- if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(XBitWidth, XBitWidth - 1))))
- return nullptr;
- } else if (isa<BinaryOperator>(Val) &&
- (X = reassociateShiftAmtsOfTwoSameDirectionShifts(
- cast<BinaryOperator>(Val), SQ.getWithInstruction(Val),
- /*AnalyzeForSignBitExtraction=*/true))) {
- XTy = X->getType();
- } else
- return nullptr;
-
- return ICmpInst::Create(Instruction::ICmp,
- Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_SGE
- : ICmpInst::ICMP_SLT,
- X, ConstantInt::getNullValue(XTy));
-}
-
-// Handle icmp pred X, 0
+ Instruction *Val;
+ ICmpInst::Predicate Pred;
+ if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero())))
+ return nullptr;
+
+ Value *X;
+ Type *XTy;
+
+ Constant *C;
+ if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) {
+ XTy = X->getType();
+ unsigned XBitWidth = XTy->getScalarSizeInBits();
+ if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(XBitWidth, XBitWidth - 1))))
+ return nullptr;
+ } else if (isa<BinaryOperator>(Val) &&
+ (X = reassociateShiftAmtsOfTwoSameDirectionShifts(
+ cast<BinaryOperator>(Val), SQ.getWithInstruction(Val),
+ /*AnalyzeForSignBitExtraction=*/true))) {
+ XTy = X->getType();
+ } else
+ return nullptr;
+
+ return ICmpInst::Create(Instruction::ICmp,
+ Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_SGE
+ : ICmpInst::ICMP_SLT,
+ X, ConstantInt::getNullValue(XTy));
+}
+
+// Handle icmp pred X, 0
Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) {
- CmpInst::Predicate Pred = Cmp.getPredicate();
- if (!match(Cmp.getOperand(1), m_Zero()))
- return nullptr;
-
- // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
- if (Pred == ICmpInst::ICMP_SGT) {
- Value *A, *B;
- SelectPatternResult SPR = matchSelectPattern(Cmp.getOperand(0), A, B);
- if (SPR.Flavor == SPF_SMIN) {
- if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
- return new ICmpInst(Pred, B, Cmp.getOperand(1));
- if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
- return new ICmpInst(Pred, A, Cmp.getOperand(1));
- }
- }
-
- if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp))
- return New;
-
- // Given:
- // icmp eq/ne (urem %x, %y), 0
- // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
- // icmp eq/ne %x, 0
- Value *X, *Y;
- if (match(Cmp.getOperand(0), m_URem(m_Value(X), m_Value(Y))) &&
- ICmpInst::isEquality(Pred)) {
- KnownBits XKnown = computeKnownBits(X, 0, &Cmp);
- KnownBits YKnown = computeKnownBits(Y, 0, &Cmp);
- if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
- return new ICmpInst(Pred, X, Cmp.getOperand(1));
- }
-
- return nullptr;
-}
-
-/// Fold icmp Pred X, C.
-/// TODO: This code structure does not make sense. The saturating add fold
-/// should be moved to some other helper and extended as noted below (it is also
-/// possible that code has been made unnecessary - do we canonicalize IR to
-/// overflow/saturating intrinsics or not?).
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ if (!match(Cmp.getOperand(1), m_Zero()))
+ return nullptr;
+
+ // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
+ if (Pred == ICmpInst::ICMP_SGT) {
+ Value *A, *B;
+ SelectPatternResult SPR = matchSelectPattern(Cmp.getOperand(0), A, B);
+ if (SPR.Flavor == SPF_SMIN) {
+ if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
+ return new ICmpInst(Pred, B, Cmp.getOperand(1));
+ if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
+ return new ICmpInst(Pred, A, Cmp.getOperand(1));
+ }
+ }
+
+ if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp))
+ return New;
+
+ // Given:
+ // icmp eq/ne (urem %x, %y), 0
+ // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
+ // icmp eq/ne %x, 0
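+ // (Sketch of why this holds: a %y with two or more bits set is not a power of
+ // two, so it cannot evenly divide the power-of-two value %x; the remainder is
+ // therefore zero only when %x itself is zero.)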
+ Value *X, *Y;
+ if (match(Cmp.getOperand(0), m_URem(m_Value(X), m_Value(Y))) &&
+ ICmpInst::isEquality(Pred)) {
+ KnownBits XKnown = computeKnownBits(X, 0, &Cmp);
+ KnownBits YKnown = computeKnownBits(Y, 0, &Cmp);
+ if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
+ return new ICmpInst(Pred, X, Cmp.getOperand(1));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp Pred X, C.
+/// TODO: This code structure does not make sense. The saturating add fold
+/// should be moved to some other helper and extended as noted below (it is also
+/// possible that code has been made unnecessary - do we canonicalize IR to
+/// overflow/saturating intrinsics or not?).
Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) {
- // Match the following pattern, which is a common idiom when writing
- // overflow-safe integer arithmetic functions. The source performs an addition
- // in a wider type and explicitly checks for overflow using comparisons against
- // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic.
- //
- // TODO: This could probably be generalized to handle other overflow-safe
- // operations if we worked out the formulas to compute the appropriate magic
- // constants.
- //
- // sum = a + b
- // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
- CmpInst::Predicate Pred = Cmp.getPredicate();
- Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1);
- Value *A, *B;
- ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI
- if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) &&
- match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
- if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this))
- return Res;
-
- // icmp(phi(C1, C2, ...), C) -> phi(icmp(C1, C), icmp(C2, C), ...).
- Constant *C = dyn_cast<Constant>(Op1);
- if (!C)
- return nullptr;
-
- if (auto *Phi = dyn_cast<PHINode>(Op0))
- if (all_of(Phi->operands(), [](Value *V) { return isa<Constant>(V); })) {
- Type *Ty = Cmp.getType();
- Builder.SetInsertPoint(Phi);
- PHINode *NewPhi =
- Builder.CreatePHI(Ty, Phi->getNumOperands());
- for (BasicBlock *Predecessor : predecessors(Phi->getParent())) {
- auto *Input =
- cast<Constant>(Phi->getIncomingValueForBlock(Predecessor));
- auto *BoolInput = ConstantExpr::getCompare(Pred, Input, C);
- NewPhi->addIncoming(BoolInput, Predecessor);
- }
- NewPhi->takeName(&Cmp);
- return replaceInstUsesWith(Cmp, NewPhi);
- }
-
- return nullptr;
-}
-
-/// Canonicalize icmp instructions based on dominating conditions.
+ // Match the following pattern, which is a common idiom when writing
+ // overflow-safe integer arithmetic functions. The source performs an addition
+ // in a wider type and explicitly checks for overflow using comparisons against
+ // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic.
+ //
+ // TODO: This could probably be generalized to handle other overflow-safe
+ // operations if we worked out the formulas to compute the appropriate magic
+ // constants.
+ //
+ // sum = a + b
+ // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1);
+ Value *A, *B;
+ ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI
+ if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) &&
+ match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
+ if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this))
+ return Res;
+
+ // icmp(phi(C1, C2, ...), C) -> phi(icmp(C1, C), icmp(C2, C), ...).
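+ // E.g. (hypothetical): icmp ult (phi i32 [ 2, %a ], [ 7, %b ]), 5
+ //   -->  phi i1 [ true, %a ], [ false, %b ]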
+ Constant *C = dyn_cast<Constant>(Op1);
+ if (!C)
+ return nullptr;
+
+ if (auto *Phi = dyn_cast<PHINode>(Op0))
+ if (all_of(Phi->operands(), [](Value *V) { return isa<Constant>(V); })) {
+ Type *Ty = Cmp.getType();
+ Builder.SetInsertPoint(Phi);
+ PHINode *NewPhi =
+ Builder.CreatePHI(Ty, Phi->getNumOperands());
+ for (BasicBlock *Predecessor : predecessors(Phi->getParent())) {
+ auto *Input =
+ cast<Constant>(Phi->getIncomingValueForBlock(Predecessor));
+ auto *BoolInput = ConstantExpr::getCompare(Pred, Input, C);
+ NewPhi->addIncoming(BoolInput, Predecessor);
+ }
+ NewPhi->takeName(&Cmp);
+ return replaceInstUsesWith(Cmp, NewPhi);
+ }
+
+ return nullptr;
+}
+
+/// Canonicalize icmp instructions based on dominating conditions.
Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
- // This is a cheap/incomplete check for dominance - just match a single
- // predecessor with a conditional branch.
- BasicBlock *CmpBB = Cmp.getParent();
- BasicBlock *DomBB = CmpBB->getSinglePredecessor();
- if (!DomBB)
- return nullptr;
-
- Value *DomCond;
- BasicBlock *TrueBB, *FalseBB;
- if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
- return nullptr;
-
- assert((TrueBB == CmpBB || FalseBB == CmpBB) &&
- "Predecessor block does not point to successor?");
-
- // The branch should get simplified. Don't bother simplifying this condition.
- if (TrueBB == FalseBB)
- return nullptr;
-
- // Try to simplify this compare to T/F based on the dominating condition.
- Optional<bool> Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB);
- if (Imp)
- return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp));
-
- CmpInst::Predicate Pred = Cmp.getPredicate();
- Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1);
- ICmpInst::Predicate DomPred;
- const APInt *C, *DomC;
- if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) &&
- match(Y, m_APInt(C))) {
- // We have 2 compares of a variable with constants. Calculate the constant
- // ranges of those compares to see if we can transform the 2nd compare:
- // DomBB:
- // DomCond = icmp DomPred X, DomC
- // br DomCond, CmpBB, FalseBB
- // CmpBB:
- // Cmp = icmp Pred X, C
- ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C);
- ConstantRange DominatingCR =
- (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC)
- : ConstantRange::makeExactICmpRegion(
- CmpInst::getInversePredicate(DomPred), *DomC);
- ConstantRange Intersection = DominatingCR.intersectWith(CR);
- ConstantRange Difference = DominatingCR.difference(CR);
- if (Intersection.isEmptySet())
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- if (Difference.isEmptySet())
- return replaceInstUsesWith(Cmp, Builder.getTrue());
-
- // Canonicalizing a sign bit comparison that gets used in a branch
- // pessimizes codegen by generating a branch-on-zero instruction instead
- // of a test and branch. So we avoid canonicalizing in such situations
- // because a test-and-branch instruction has better branch displacement
- // than a compare-and-branch instruction.
- bool UnusedBit;
- bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit);
- if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
- return nullptr;
-
- if (const APInt *EqC = Intersection.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC));
- if (const APInt *NeC = Difference.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC));
- }
-
- return nullptr;
-}
-
-/// Fold icmp (trunc X, Y), C.
+ // This is a cheap/incomplete check for dominance - just match a single
+ // predecessor with a conditional branch.
+ BasicBlock *CmpBB = Cmp.getParent();
+ BasicBlock *DomBB = CmpBB->getSinglePredecessor();
+ if (!DomBB)
+ return nullptr;
+
+ Value *DomCond;
+ BasicBlock *TrueBB, *FalseBB;
+ if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
+ return nullptr;
+
+ assert((TrueBB == CmpBB || FalseBB == CmpBB) &&
+ "Predecessor block does not point to successor?");
+
+ // The branch should get simplified. Don't bother simplifying this condition.
+ if (TrueBB == FalseBB)
+ return nullptr;
+
+ // Try to simplify this compare to T/F based on the dominating condition.
+ Optional<bool> Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB);
+ if (Imp)
+ return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp));
+
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1);
+ ICmpInst::Predicate DomPred;
+ const APInt *C, *DomC;
+ if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) &&
+ match(Y, m_APInt(C))) {
+ // We have 2 compares of a variable with constants. Calculate the constant
+ // ranges of those compares to see if we can transform the 2nd compare:
+ // DomBB:
+ // DomCond = icmp DomPred X, DomC
+ // br DomCond, CmpBB, FalseBB
+ // CmpBB:
+ // Cmp = icmp Pred X, C
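+ // Hypothetical instance: if DomCond is (icmp ult X, 2) and CmpBB is its true
+ // successor, then (icmp ugt X, 0) here can only be true when X == 1, so it is
+ // canonicalized to (icmp eq X, 1).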
+ ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C);
+ ConstantRange DominatingCR =
+ (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC)
+ : ConstantRange::makeExactICmpRegion(
+ CmpInst::getInversePredicate(DomPred), *DomC);
+ ConstantRange Intersection = DominatingCR.intersectWith(CR);
+ ConstantRange Difference = DominatingCR.difference(CR);
+ if (Intersection.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (Difference.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+
+    // Canonicalizing a sign-bit comparison that gets used in a branch
+    // pessimizes codegen by generating a branch-on-zero instruction instead
+    // of a test-and-branch. So we avoid canonicalizing in such situations,
+    // because a test-and-branch instruction has better branch displacement
+    // than a compare-and-branch instruction.
+ bool UnusedBit;
+ bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit);
+ if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
+ return nullptr;
+
+ if (const APInt *EqC = Intersection.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC));
+ if (const APInt *NeC = Difference.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC));
+ }
+
+ return nullptr;
+}
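// Illustrative sketch (not part of LLVM): a brute-force check of the range
// reasoning above, using uint8_t in place of APInt. If the dominating branch
// guarantees X u< 2 on the taken edge, then inside that block "X u> 0" holds
// exactly when X == 1, the single element of the intersection of the two
// constant ranges, so the compare folds to an equality.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    if (X < 2)                     // dominating condition: X u< 2
      assert((X > 0) == (X == 1)); // dominated compare folds to icmp eq X, 1
  }
  return 0;
}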
+
+/// Fold icmp (trunc X, Y), C.
Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
TruncInst *Trunc,
const APInt &C) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *X = Trunc->getOperand(0);
- if (C.isOneValue() && C.getBitWidth() > 1) {
- // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
- Value *V = nullptr;
- if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
- return new ICmpInst(ICmpInst::ICMP_SLT, V,
- ConstantInt::get(V->getType(), 1));
- }
-
- if (Cmp.isEquality() && Trunc->hasOneUse()) {
- // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
- // of the high bits truncated out of x are known.
- unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
- SrcBits = X->getType()->getScalarSizeInBits();
- KnownBits Known = computeKnownBits(X, 0, &Cmp);
-
- // If all the high bits are known, we can do this xform.
- if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) {
- // Pull in the high bits from known-ones set.
- APInt NewRHS = C.zext(SrcBits);
- NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
- }
- }
-
- return nullptr;
-}
-
-/// Fold icmp (xor X, Y), C.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Trunc->getOperand(0);
+ if (C.isOneValue() && C.getBitWidth() > 1) {
+ // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ if (Cmp.isEquality() && Trunc->hasOneUse()) {
+ // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+ // of the high bits truncated out of x are known.
+ unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
+ SrcBits = X->getType()->getScalarSizeInBits();
+ KnownBits Known = computeKnownBits(X, 0, &Cmp);
+
+ // If all the high bits are known, we can do this xform.
+ if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) {
+ // Pull in the high bits from known-ones set.
+ APInt NewRHS = C.zext(SrcBits);
+ NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
+ }
+ }
+
+ return nullptr;
+}
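// Illustrative sketch (not part of LLVM): the trunc+icmp-eq fold above with
// concrete numbers chosen for illustration. Assume a 16-bit X whose
// truncated-out high byte is known to be 0x12; then comparing the low 8 bits
// against 42 is the same as comparing the full value against 0x122A (42 with
// the known high bits pulled in).
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Low = 0; Low <= 255; ++Low) {
    uint16_t X = static_cast<uint16_t>(0x1200u | Low); // high bits known
    bool Narrow = static_cast<uint8_t>(X) == 42;       // icmp eq (trunc X to i8), 42
    bool Wide = X == 0x122A;                           // icmp eq X, 42|highbits
    assert(Narrow == Wide);
  }
  return 0;
}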
+
+/// Fold icmp (xor X, Y), C.
Instruction *InstCombinerImpl::foldICmpXorConstant(ICmpInst &Cmp,
BinaryOperator *Xor,
const APInt &C) {
- Value *X = Xor->getOperand(0);
- Value *Y = Xor->getOperand(1);
- const APInt *XorC;
- if (!match(Y, m_APInt(XorC)))
- return nullptr;
-
- // If this is a comparison that tests the signbit (X < 0) or (x > -1),
- // fold the xor.
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- bool TrueIfSigned = false;
- if (isSignBitCheck(Cmp.getPredicate(), C, TrueIfSigned)) {
-
- // If the sign bit of the XorCst is not set, there is no change to
- // the operation, just stop using the Xor.
- if (!XorC->isNegative())
- return replaceOperand(Cmp, 0, X);
-
- // Emit the opposite comparison.
- if (TrueIfSigned)
- return new ICmpInst(ICmpInst::ICMP_SGT, X,
- ConstantInt::getAllOnesValue(X->getType()));
- else
- return new ICmpInst(ICmpInst::ICMP_SLT, X,
- ConstantInt::getNullValue(X->getType()));
- }
-
- if (Xor->hasOneUse()) {
- // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
- if (!Cmp.isEquality() && XorC->isSignMask()) {
+ Value *X = Xor->getOperand(0);
+ Value *Y = Xor->getOperand(1);
+ const APInt *XorC;
+ if (!match(Y, m_APInt(XorC)))
+ return nullptr;
+
+ // If this is a comparison that tests the signbit (X < 0) or (x > -1),
+ // fold the xor.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ bool TrueIfSigned = false;
+ if (isSignBitCheck(Cmp.getPredicate(), C, TrueIfSigned)) {
+
+ // If the sign bit of the XorCst is not set, there is no change to
+ // the operation, just stop using the Xor.
+ if (!XorC->isNegative())
+ return replaceOperand(Cmp, 0, X);
+
+ // Emit the opposite comparison.
+ if (TrueIfSigned)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X,
+ ConstantInt::getAllOnesValue(X->getType()));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SLT, X,
+ ConstantInt::getNullValue(X->getType()));
+ }
+
+ if (Xor->hasOneUse()) {
+ // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
+ if (!Cmp.isEquality() && XorC->isSignMask()) {
Pred = Cmp.getFlippedSignednessPredicate();
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
- }
-
- // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
- if (!Cmp.isEquality() && XorC->isMaxSignedValue()) {
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
+ }
+
+ // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
+ if (!Cmp.isEquality() && XorC->isMaxSignedValue()) {
Pred = Cmp.getFlippedSignednessPredicate();
- Pred = Cmp.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
- }
- }
-
- // Mask constant magic can eliminate an 'xor' with unsigned compares.
- if (Pred == ICmpInst::ICMP_UGT) {
- // (xor X, ~C) >u C --> X <u ~C (when C+1 is a power of 2)
- if (*XorC == ~C && (C + 1).isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
- // (xor X, C) >u C --> X >u C (when C+1 is a power of 2)
- if (*XorC == C && (C + 1).isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
- }
- if (Pred == ICmpInst::ICMP_ULT) {
- // (xor X, -C) <u C --> X >u ~C (when C is a power of 2)
- if (*XorC == -C && C.isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_UGT, X,
- ConstantInt::get(X->getType(), ~C));
- // (xor X, C) <u C --> X >u ~C (when -C is a power of 2)
- if (*XorC == C && (-C).isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_UGT, X,
- ConstantInt::get(X->getType(), ~C));
- }
- return nullptr;
-}
-
-/// Fold icmp (and (sh X, Y), C2), C1.
+ Pred = Cmp.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
+ }
+ }
+
+ // Mask constant magic can eliminate an 'xor' with unsigned compares.
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // (xor X, ~C) >u C --> X <u ~C (when C+1 is a power of 2)
+ if (*XorC == ~C && (C + 1).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+ // (xor X, C) >u C --> X >u C (when C+1 is a power of 2)
+ if (*XorC == C && (C + 1).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
+ }
+ if (Pred == ICmpInst::ICMP_ULT) {
+ // (xor X, -C) <u C --> X >u ~C (when C is a power of 2)
+ if (*XorC == -C && C.isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGT, X,
+ ConstantInt::get(X->getType(), ~C));
+ // (xor X, C) <u C --> X >u ~C (when -C is a power of 2)
+ if (*XorC == C && (-C).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGT, X,
+ ConstantInt::get(X->getType(), ~C));
+ }
+ return nullptr;
+}
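// Illustrative sketch (not part of LLVM): the sign-mask xor fold above on
// 8-bit values. XOR-ing with the sign mask translates between unsigned and
// signed ordering, so the xor can be removed by flipping the signedness of
// the predicate and adjusting the constant.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned XV = 0; XV <= 255; ++XV) {
    for (unsigned CV = 0; CV <= 255; ++CV) {
      uint8_t X = static_cast<uint8_t>(XV), C = static_cast<uint8_t>(CV);
      bool Before = static_cast<uint8_t>(X ^ 0x80) < C; // (xor X, SignMask) u< C
      bool After = static_cast<int8_t>(X) <
                   static_cast<int8_t>(static_cast<uint8_t>(C ^ 0x80)); // X s< (C ^ SignMask)
      assert(Before == After);
    }
  }
  return 0;
}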
+
+/// Fold icmp (and (sh X, Y), C2), C1.
Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C1,
const APInt &C2) {
- BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
- if (!Shift || !Shift->isShift())
- return nullptr;
-
- // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
- // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
- // code produced by the clang front-end, for bitfield access.
- // This seemingly simple opportunity to fold away a shift turns out to be
- // rather complicated. See PR17827 for details.
- unsigned ShiftOpcode = Shift->getOpcode();
- bool IsShl = ShiftOpcode == Instruction::Shl;
- const APInt *C3;
- if (match(Shift->getOperand(1), m_APInt(C3))) {
- APInt NewAndCst, NewCmpCst;
- bool AnyCmpCstBitsShiftedOut;
- if (ShiftOpcode == Instruction::Shl) {
- // For a left shift, we can fold if the comparison is not signed. We can
- // also fold a signed comparison if the mask value and comparison value
- // are not negative. These constraints may not be obvious, but we can
- // prove that they are correct using an SMT solver.
- if (Cmp.isSigned() && (C2.isNegative() || C1.isNegative()))
- return nullptr;
-
- NewCmpCst = C1.lshr(*C3);
- NewAndCst = C2.lshr(*C3);
- AnyCmpCstBitsShiftedOut = NewCmpCst.shl(*C3) != C1;
- } else if (ShiftOpcode == Instruction::LShr) {
- // For a logical right shift, we can fold if the comparison is not signed.
- // We can also fold a signed comparison if the shifted mask value and the
- // shifted comparison value are not negative. These constraints may not be
- // obvious, but we can prove that they are correct using an SMT solver.
- NewCmpCst = C1.shl(*C3);
- NewAndCst = C2.shl(*C3);
- AnyCmpCstBitsShiftedOut = NewCmpCst.lshr(*C3) != C1;
- if (Cmp.isSigned() && (NewAndCst.isNegative() || NewCmpCst.isNegative()))
- return nullptr;
- } else {
- // For an arithmetic shift, check that both constants don't use (in a
- // signed sense) the top bits being shifted out.
- assert(ShiftOpcode == Instruction::AShr && "Unknown shift opcode");
- NewCmpCst = C1.shl(*C3);
- NewAndCst = C2.shl(*C3);
- AnyCmpCstBitsShiftedOut = NewCmpCst.ashr(*C3) != C1;
- if (NewAndCst.ashr(*C3) != C2)
- return nullptr;
- }
-
- if (AnyCmpCstBitsShiftedOut) {
- // If we shifted bits out, the fold is not going to work out. As a
- // special case, check to see if this means that the result is always
- // true or false now.
- if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
- return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
- if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
- return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
- } else {
- Value *NewAnd = Builder.CreateAnd(
- Shift->getOperand(0), ConstantInt::get(And->getType(), NewAndCst));
- return new ICmpInst(Cmp.getPredicate(),
- NewAnd, ConstantInt::get(And->getType(), NewCmpCst));
- }
- }
-
- // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
- // preferable because it allows the C2 << Y expression to be hoisted out of a
- // loop if Y is invariant and X is not.
- if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() &&
- !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
- // Compute C2 << Y.
- Value *NewShift =
- IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
- : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));
-
- // Compute X & (C2 << Y).
- Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
- return replaceOperand(Cmp, 0, NewAnd);
- }
-
- return nullptr;
-}
-
-/// Fold icmp (and X, C2), C1.
+ BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
+ if (!Shift || !Shift->isShift())
+ return nullptr;
+
+ // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
+ // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
+ // code produced by the clang front-end, for bitfield access.
+ // This seemingly simple opportunity to fold away a shift turns out to be
+ // rather complicated. See PR17827 for details.
+ unsigned ShiftOpcode = Shift->getOpcode();
+ bool IsShl = ShiftOpcode == Instruction::Shl;
+ const APInt *C3;
+ if (match(Shift->getOperand(1), m_APInt(C3))) {
+ APInt NewAndCst, NewCmpCst;
+ bool AnyCmpCstBitsShiftedOut;
+ if (ShiftOpcode == Instruction::Shl) {
+ // For a left shift, we can fold if the comparison is not signed. We can
+ // also fold a signed comparison if the mask value and comparison value
+ // are not negative. These constraints may not be obvious, but we can
+ // prove that they are correct using an SMT solver.
+ if (Cmp.isSigned() && (C2.isNegative() || C1.isNegative()))
+ return nullptr;
+
+ NewCmpCst = C1.lshr(*C3);
+ NewAndCst = C2.lshr(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.shl(*C3) != C1;
+ } else if (ShiftOpcode == Instruction::LShr) {
+ // For a logical right shift, we can fold if the comparison is not signed.
+ // We can also fold a signed comparison if the shifted mask value and the
+ // shifted comparison value are not negative. These constraints may not be
+ // obvious, but we can prove that they are correct using an SMT solver.
+ NewCmpCst = C1.shl(*C3);
+ NewAndCst = C2.shl(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.lshr(*C3) != C1;
+ if (Cmp.isSigned() && (NewAndCst.isNegative() || NewCmpCst.isNegative()))
+ return nullptr;
+ } else {
+ // For an arithmetic shift, check that both constants don't use (in a
+ // signed sense) the top bits being shifted out.
+ assert(ShiftOpcode == Instruction::AShr && "Unknown shift opcode");
+ NewCmpCst = C1.shl(*C3);
+ NewAndCst = C2.shl(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.ashr(*C3) != C1;
+ if (NewAndCst.ashr(*C3) != C2)
+ return nullptr;
+ }
+
+ if (AnyCmpCstBitsShiftedOut) {
+ // If we shifted bits out, the fold is not going to work out. As a
+ // special case, check to see if this means that the result is always
+ // true or false now.
+ if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
+ return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
+ if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
+ return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
+ } else {
+ Value *NewAnd = Builder.CreateAnd(
+ Shift->getOperand(0), ConstantInt::get(And->getType(), NewAndCst));
+ return new ICmpInst(Cmp.getPredicate(),
+ NewAnd, ConstantInt::get(And->getType(), NewCmpCst));
+ }
+ }
+
+ // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
+ // preferable because it allows the C2 << Y expression to be hoisted out of a
+ // loop if Y is invariant and X is not.
+ if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() &&
+ !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
+ // Compute C2 << Y.
+ Value *NewShift =
+ IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
+ : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));
+
+ // Compute X & (C2 << Y).
+ Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
+ return replaceOperand(Cmp, 0, NewAnd);
+ }
+
+ return nullptr;
+}
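// Illustrative sketch (not part of LLVM): the shift-under-mask fold above
// with concrete constants (a typical bitfield access). With C3 = 2, C2 = 0x3
// and C1 = 0x1, no bits of C1 are lost by the shift, so the shift can be
// folded into the mask and the comparison constant.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool Before = ((X >> 2) & 0x3) == 0x1;       // icmp eq (and (lshr X, 2), 3), 1
    bool After = (X & (0x3 << 2)) == (0x1 << 2); // icmp eq (and X, 12), 4
    assert(Before == After);
  }
  return 0;
}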
+
+/// Fold icmp (and X, C2), C1.
Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C1) {
- bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE;
-
- // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
- // TODO: We canonicalize to the longer form for scalars because we have
- // better analysis/folds for icmp, and codegen may be better with icmp.
- if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() &&
- match(And->getOperand(1), m_One()))
- return new TruncInst(And->getOperand(0), Cmp.getType());
-
- const APInt *C2;
- Value *X;
- if (!match(And, m_And(m_Value(X), m_APInt(C2))))
- return nullptr;
-
- // Don't perform the following transforms if the AND has multiple uses
- if (!And->hasOneUse())
- return nullptr;
-
- if (Cmp.isEquality() && C1.isNullValue()) {
- // Restrict this fold to single-use 'and' (PR10267).
- // Replace (and X, (1 << size(X)-1) != 0) with X s< 0
- if (C2->isSignMask()) {
- Constant *Zero = Constant::getNullValue(X->getType());
- auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
- return new ICmpInst(NewPred, X, Zero);
- }
-
-    // Restrict this fold to a single-use 'and' (PR10267).
- // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two.
- if ((~(*C2) + 1).isPowerOf2()) {
- Constant *NegBOC =
- ConstantExpr::getNeg(cast<Constant>(And->getOperand(1)));
- auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
- return new ICmpInst(NewPred, X, NegBOC);
- }
- }
-
- // If the LHS is an 'and' of a truncate and we can widen the and/compare to
- // the input width without changing the value produced, eliminate the cast:
- //
- // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
- //
- // We can do this transformation if the constants do not have their sign bits
- // set or if it is an equality comparison. Extending a relational comparison
- // when we're checking the sign bit would not work.
- Value *W;
- if (match(And->getOperand(0), m_OneUse(m_Trunc(m_Value(W)))) &&
- (Cmp.isEquality() || (!C1.isNegative() && !C2->isNegative()))) {
- // TODO: Is this a good transform for vectors? Wider types may reduce
- // throughput. Should this transform be limited (even for scalars) by using
- // shouldChangeType()?
- if (!Cmp.getType()->isVectorTy()) {
- Type *WideType = W->getType();
- unsigned WideScalarBits = WideType->getScalarSizeInBits();
- Constant *ZextC1 = ConstantInt::get(WideType, C1.zext(WideScalarBits));
- Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
- Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
- return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
- }
- }
-
- if (Instruction *I = foldICmpAndShift(Cmp, And, C1, *C2))
- return I;
-
- // (icmp pred (and (or (lshr A, B), A), 1), 0) -->
-  //      (icmp pred (and A, (or (shl 1, B), 1)), 0)
- //
- // iff pred isn't signed
- if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() &&
- match(And->getOperand(1), m_One())) {
- Constant *One = cast<Constant>(And->getOperand(1));
- Value *Or = And->getOperand(0);
- Value *A, *B, *LShr;
- if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
- match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
- unsigned UsesRemoved = 0;
- if (And->hasOneUse())
- ++UsesRemoved;
- if (Or->hasOneUse())
- ++UsesRemoved;
- if (LShr->hasOneUse())
- ++UsesRemoved;
-
- // Compute A & ((1 << B) | 1)
- Value *NewOr = nullptr;
- if (auto *C = dyn_cast<Constant>(B)) {
- if (UsesRemoved >= 1)
- NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
- } else {
- if (UsesRemoved >= 3)
- NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
- /*HasNUW=*/true),
- One, Or->getName());
- }
- if (NewOr) {
- Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
- return replaceOperand(Cmp, 0, NewAnd);
- }
- }
- }
-
- return nullptr;
-}
-
-/// Fold icmp (and X, Y), C.
+ bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE;
+
+ // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
+ // TODO: We canonicalize to the longer form for scalars because we have
+ // better analysis/folds for icmp, and codegen may be better with icmp.
+ if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() &&
+ match(And->getOperand(1), m_One()))
+ return new TruncInst(And->getOperand(0), Cmp.getType());
+
+ const APInt *C2;
+ Value *X;
+ if (!match(And, m_And(m_Value(X), m_APInt(C2))))
+ return nullptr;
+
+ // Don't perform the following transforms if the AND has multiple uses
+ if (!And->hasOneUse())
+ return nullptr;
+
+ if (Cmp.isEquality() && C1.isNullValue()) {
+ // Restrict this fold to single-use 'and' (PR10267).
+ // Replace (and X, (1 << size(X)-1) != 0) with X s< 0
+ if (C2->isSignMask()) {
+ Constant *Zero = Constant::getNullValue(X->getType());
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+ return new ICmpInst(NewPred, X, Zero);
+ }
+
+    // Restrict this fold to a single-use 'and' (PR10267).
+ // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two.
+ if ((~(*C2) + 1).isPowerOf2()) {
+ Constant *NegBOC =
+ ConstantExpr::getNeg(cast<Constant>(And->getOperand(1)));
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, X, NegBOC);
+ }
+ }
+
+ // If the LHS is an 'and' of a truncate and we can widen the and/compare to
+ // the input width without changing the value produced, eliminate the cast:
+ //
+ // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
+ //
+ // We can do this transformation if the constants do not have their sign bits
+ // set or if it is an equality comparison. Extending a relational comparison
+ // when we're checking the sign bit would not work.
+ Value *W;
+ if (match(And->getOperand(0), m_OneUse(m_Trunc(m_Value(W)))) &&
+ (Cmp.isEquality() || (!C1.isNegative() && !C2->isNegative()))) {
+ // TODO: Is this a good transform for vectors? Wider types may reduce
+ // throughput. Should this transform be limited (even for scalars) by using
+ // shouldChangeType()?
+ if (!Cmp.getType()->isVectorTy()) {
+ Type *WideType = W->getType();
+ unsigned WideScalarBits = WideType->getScalarSizeInBits();
+ Constant *ZextC1 = ConstantInt::get(WideType, C1.zext(WideScalarBits));
+ Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
+ Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
+ return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
+ }
+ }
+
+ if (Instruction *I = foldICmpAndShift(Cmp, And, C1, *C2))
+ return I;
+
+ // (icmp pred (and (or (lshr A, B), A), 1), 0) -->
+  //      (icmp pred (and A, (or (shl 1, B), 1)), 0)
+ //
+ // iff pred isn't signed
+ if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() &&
+ match(And->getOperand(1), m_One())) {
+ Constant *One = cast<Constant>(And->getOperand(1));
+ Value *Or = And->getOperand(0);
+ Value *A, *B, *LShr;
+ if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
+ match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
+ unsigned UsesRemoved = 0;
+ if (And->hasOneUse())
+ ++UsesRemoved;
+ if (Or->hasOneUse())
+ ++UsesRemoved;
+ if (LShr->hasOneUse())
+ ++UsesRemoved;
+
+ // Compute A & ((1 << B) | 1)
+ Value *NewOr = nullptr;
+ if (auto *C = dyn_cast<Constant>(B)) {
+ if (UsesRemoved >= 1)
+ NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
+ } else {
+ if (UsesRemoved >= 3)
+ NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
+ /*HasNUW=*/true),
+ One, Or->getName());
+ }
+ if (NewOr) {
+ Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
+ return replaceOperand(Cmp, 0, NewAnd);
+ }
+ }
+ }
+
+ return nullptr;
+}
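// Illustrative sketch (not part of LLVM): the "mask of high bits" fold above
// on 8-bit values. With C2 = 0xF0, the i8 value -C2 is the power of two 0x10,
// so testing (X & 0xF0) == 0 is the same as the unsigned range check X u< 0x10.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool Before = (X & 0xF0) == 0; // icmp eq (and X, 0xF0), 0
    bool After = X < 0x10;         // icmp ult X, -0xF0 (== 0x10 in i8)
    assert(Before == After);
  }
  return 0;
}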
+
+/// Fold icmp (and X, Y), C.
Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C) {
- if (Instruction *I = foldICmpAndConstConst(Cmp, And, C))
- return I;
-
- // TODO: These all require that Y is constant too, so refactor with the above.
-
- // Try to optimize things like "A[i] & 42 == 0" to index computations.
- Value *X = And->getOperand(0);
- Value *Y = And->getOperand(1);
- if (auto *LI = dyn_cast<LoadInst>(X))
- if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
- if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !LI->isVolatile() && isa<ConstantInt>(Y)) {
- ConstantInt *C2 = cast<ConstantInt>(Y);
- if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
- return Res;
- }
-
- if (!Cmp.isEquality())
- return nullptr;
-
-  // X & -C == -C -> X >u ~C
-  // X & -C != -C -> X <=u ~C
- // iff C is a power of 2
- if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
- auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT
- : CmpInst::ICMP_ULE;
- return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
- }
-
- // (X & C2) == 0 -> (trunc X) >= 0
- // (X & C2) != 0 -> (trunc X) < 0
- // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
- const APInt *C2;
- if (And->hasOneUse() && C.isNullValue() && match(Y, m_APInt(C2))) {
- int32_t ExactLogBase2 = C2->exactLogBase2();
- if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
- Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
- if (auto *AndVTy = dyn_cast<VectorType>(And->getType()))
+ if (Instruction *I = foldICmpAndConstConst(Cmp, And, C))
+ return I;
+
+ // TODO: These all require that Y is constant too, so refactor with the above.
+
+ // Try to optimize things like "A[i] & 42 == 0" to index computations.
+ Value *X = And->getOperand(0);
+ Value *Y = And->getOperand(1);
+ if (auto *LI = dyn_cast<LoadInst>(X))
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
+ if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !LI->isVolatile() && isa<ConstantInt>(Y)) {
+ ConstantInt *C2 = cast<ConstantInt>(Y);
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
+ return Res;
+ }
+
+ if (!Cmp.isEquality())
+ return nullptr;
+
+  // X & -C == -C -> X >u ~C
+  // X & -C != -C -> X <=u ~C
+ // iff C is a power of 2
+ if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT
+ : CmpInst::ICMP_ULE;
+ return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
+ }
+
+ // (X & C2) == 0 -> (trunc X) >= 0
+ // (X & C2) != 0 -> (trunc X) < 0
+ // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
+ const APInt *C2;
+ if (And->hasOneUse() && C.isNullValue() && match(Y, m_APInt(C2))) {
+ int32_t ExactLogBase2 = C2->exactLogBase2();
+ if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
+ Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
+ if (auto *AndVTy = dyn_cast<VectorType>(And->getType()))
NTy = VectorType::get(NTy, AndVTy->getElementCount());
- Value *Trunc = Builder.CreateTrunc(X, NTy);
- auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
- : CmpInst::ICMP_SLT;
- return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
- }
- }
-
- return nullptr;
-}
-
-/// Fold icmp (or X, Y), C.
+ Value *Trunc = Builder.CreateTrunc(X, NTy);
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
+ : CmpInst::ICMP_SLT;
+ return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
+ }
+ }
+
+ return nullptr;
+}
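// Illustrative sketch (not part of LLVM): the sign-bit-mask fold above,
// assuming i8 is a legal integer type for the target. With a 16-bit X and
// C2 = 0x80 (the sign bit of i8), testing (X & 0x80) == 0 is the same as
// truncating X to 8 bits and asking whether the result is non-negative as a
// signed value.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 0xFFFF; ++V) {
    uint16_t X = static_cast<uint16_t>(V);
    bool Before = (X & 0x80) == 0;                                   // icmp eq (and X, 128), 0
    bool After = static_cast<int8_t>(static_cast<uint8_t>(X)) >= 0;  // icmp sge (trunc X to i8), 0
    assert(Before == After);
  }
  return 0;
}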
+
+/// Fold icmp (or X, Y), C.
Instruction *InstCombinerImpl::foldICmpOrConstant(ICmpInst &Cmp,
BinaryOperator *Or,
const APInt &C) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (C.isOneValue()) {
- // icmp slt signum(V) 1 --> icmp slt V, 1
- Value *V = nullptr;
- if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
- return new ICmpInst(ICmpInst::ICMP_SLT, V,
- ConstantInt::get(V->getType(), 1));
- }
-
- Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1);
- const APInt *MaskC;
- if (match(OrOp1, m_APInt(MaskC)) && Cmp.isEquality()) {
- if (*MaskC == C && (C + 1).isPowerOf2()) {
- // X | C == C --> X <=u C
- // X | C != C --> X >u C
- // iff C+1 is a power of 2 (C is a bitmask of the low bits)
- Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
- return new ICmpInst(Pred, OrOp0, OrOp1);
- }
-
- // More general: canonicalize 'equality with set bits mask' to
- // 'equality with clear bits mask'.
- // (X | MaskC) == C --> (X & ~MaskC) == C ^ MaskC
- // (X | MaskC) != C --> (X & ~MaskC) != C ^ MaskC
- if (Or->hasOneUse()) {
- Value *And = Builder.CreateAnd(OrOp0, ~(*MaskC));
- Constant *NewC = ConstantInt::get(Or->getType(), C ^ (*MaskC));
- return new ICmpInst(Pred, And, NewC);
- }
- }
-
- if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse())
- return nullptr;
-
- Value *P, *Q;
- if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
- // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
- // -> and (icmp eq P, null), (icmp eq Q, null).
- Value *CmpP =
- Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
- Value *CmpQ =
- Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
- auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
- return BinaryOperator::Create(BOpc, CmpP, CmpQ);
- }
-
- // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
- // a shorter form that has more potential to be folded even further.
- Value *X1, *X2, *X3, *X4;
- if (match(OrOp0, m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
- match(OrOp1, m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
- // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
- // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
- Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
- Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
- auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
- return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
- }
-
- return nullptr;
-}
-
-/// Fold icmp (mul X, Y), C.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (C.isOneValue()) {
+ // icmp slt signum(V) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1);
+ const APInt *MaskC;
+ if (match(OrOp1, m_APInt(MaskC)) && Cmp.isEquality()) {
+ if (*MaskC == C && (C + 1).isPowerOf2()) {
+ // X | C == C --> X <=u C
+ // X | C != C --> X >u C
+ // iff C+1 is a power of 2 (C is a bitmask of the low bits)
+ Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ return new ICmpInst(Pred, OrOp0, OrOp1);
+ }
+
+ // More general: canonicalize 'equality with set bits mask' to
+ // 'equality with clear bits mask'.
+ // (X | MaskC) == C --> (X & ~MaskC) == C ^ MaskC
+ // (X | MaskC) != C --> (X & ~MaskC) != C ^ MaskC
+ if (Or->hasOneUse()) {
+ Value *And = Builder.CreateAnd(OrOp0, ~(*MaskC));
+ Constant *NewC = ConstantInt::get(Or->getType(), C ^ (*MaskC));
+ return new ICmpInst(Pred, And, NewC);
+ }
+ }
+
+ if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse())
+ return nullptr;
+
+ Value *P, *Q;
+ if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
+ // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
+ // -> and (icmp eq P, null), (icmp eq Q, null).
+ Value *CmpP =
+ Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
+ Value *CmpQ =
+ Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
+ auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+ return BinaryOperator::Create(BOpc, CmpP, CmpQ);
+ }
+
+ // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
+ // a shorter form that has more potential to be folded even further.
+ Value *X1, *X2, *X3, *X4;
+ if (match(OrOp0, m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
+ match(OrOp1, m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
+ // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
+ // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
+ Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
+ Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
+ auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+ return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
+ }
+
+ return nullptr;
+}
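// Illustrative sketch (not part of LLVM): the low-bit-mask 'or' fold above.
// With C = 7 (so C + 1 is a power of two), OR-ing X with C can only leave the
// result equal to C when X has no bits above the mask, i.e. X u<= 7.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool Before = (X | 7) == 7; // icmp eq (or X, 7), 7
    bool After = X <= 7;        // icmp ule X, 7
    assert(Before == After);
  }
  return 0;
}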
+
+/// Fold icmp (mul X, Y), C.
Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
BinaryOperator *Mul,
const APInt &C) {
- const APInt *MulC;
- if (!match(Mul->getOperand(1), m_APInt(MulC)))
- return nullptr;
-
- // If this is a test of the sign bit and the multiply is sign-preserving with
- // a constant operand, use the multiply LHS operand instead.
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (isSignTest(Pred, C) && Mul->hasNoSignedWrap()) {
- if (MulC->isNegative())
- Pred = ICmpInst::getSwappedPredicate(Pred);
- return new ICmpInst(Pred, Mul->getOperand(0),
- Constant::getNullValue(Mul->getType()));
- }
-
+ const APInt *MulC;
+ if (!match(Mul->getOperand(1), m_APInt(MulC)))
+ return nullptr;
+
+ // If this is a test of the sign bit and the multiply is sign-preserving with
+ // a constant operand, use the multiply LHS operand instead.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (isSignTest(Pred, C) && Mul->hasNoSignedWrap()) {
+ if (MulC->isNegative())
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, Mul->getOperand(0),
+ Constant::getNullValue(Mul->getType()));
+ }
+
// If the multiply does not wrap, try to divide the compare constant by the
// multiplication factor.
if (Cmp.isEquality() && !MulC->isNullValue()) {
@@ -1956,260 +1956,260 @@ Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
}
}
- return nullptr;
-}
-
-/// Fold icmp (shl 1, Y), C.
-static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
- const APInt &C) {
- Value *Y;
- if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
- return nullptr;
-
- Type *ShiftType = Shl->getType();
- unsigned TypeBits = C.getBitWidth();
- bool CIsPowerOf2 = C.isPowerOf2();
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isUnsigned()) {
- // (1 << Y) pred C -> Y pred Log2(C)
- if (!CIsPowerOf2) {
- // (1 << Y) < 30 -> Y <= 4
- // (1 << Y) <= 30 -> Y <= 4
- // (1 << Y) >= 30 -> Y > 4
- // (1 << Y) > 30 -> Y > 4
- if (Pred == ICmpInst::ICMP_ULT)
- Pred = ICmpInst::ICMP_ULE;
- else if (Pred == ICmpInst::ICMP_UGE)
- Pred = ICmpInst::ICMP_UGT;
- }
-
- // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
- // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31
- unsigned CLog2 = C.logBase2();
- if (CLog2 == TypeBits - 1) {
- if (Pred == ICmpInst::ICMP_UGE)
- Pred = ICmpInst::ICMP_EQ;
- else if (Pred == ICmpInst::ICMP_ULT)
- Pred = ICmpInst::ICMP_NE;
- }
- return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
- } else if (Cmp.isSigned()) {
- Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
- if (C.isAllOnesValue()) {
- // (1 << Y) <= -1 -> Y == 31
- if (Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
-
- // (1 << Y) > -1 -> Y != 31
- if (Pred == ICmpInst::ICMP_SGT)
- return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
- } else if (!C) {
- // (1 << Y) < 0 -> Y == 31
- // (1 << Y) <= 0 -> Y == 31
- if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
-
- // (1 << Y) >= 0 -> Y != 31
- // (1 << Y) > 0 -> Y != 31
- if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
- return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
- }
- } else if (Cmp.isEquality() && CIsPowerOf2) {
- return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C.logBase2()));
- }
-
- return nullptr;
-}
-
-/// Fold icmp (shl X, Y), C.
+ return nullptr;
+}
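// Illustrative sketch (not part of LLVM): the sign-test fold above for a
// non-wrapping multiply. Restricting X to values where X * 5 stays inside
// int8_t models the 'nsw' flag; in that case the product is negative exactly
// when X is negative, so the compare can look at X directly.
#include <cassert>
#include <cstdint>

int main() {
  for (int X = -25; X <= 25; ++X) { // range where (mul nsw X, 5) cannot overflow i8
    int8_t Prod = static_cast<int8_t>(X * 5);
    bool Before = Prod < 0; // icmp slt (mul nsw X, 5), 0
    bool After = X < 0;     // icmp slt X, 0
    assert(Before == After);
  }
  return 0;
}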
+
+/// Fold icmp (shl 1, Y), C.
+static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
+ const APInt &C) {
+ Value *Y;
+ if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
+ return nullptr;
+
+ Type *ShiftType = Shl->getType();
+ unsigned TypeBits = C.getBitWidth();
+ bool CIsPowerOf2 = C.isPowerOf2();
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isUnsigned()) {
+ // (1 << Y) pred C -> Y pred Log2(C)
+ if (!CIsPowerOf2) {
+ // (1 << Y) < 30 -> Y <= 4
+ // (1 << Y) <= 30 -> Y <= 4
+ // (1 << Y) >= 30 -> Y > 4
+ // (1 << Y) > 30 -> Y > 4
+ if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_ULE;
+ else if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_UGT;
+ }
+
+ // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
+ // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31
+ unsigned CLog2 = C.logBase2();
+ if (CLog2 == TypeBits - 1) {
+ if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_EQ;
+ else if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_NE;
+ }
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
+ } else if (Cmp.isSigned()) {
+ Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
+ if (C.isAllOnesValue()) {
+ // (1 << Y) <= -1 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) > -1 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ } else if (!C) {
+ // (1 << Y) < 0 -> Y == 31
+ // (1 << Y) <= 0 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) >= 0 -> Y != 31
+ // (1 << Y) > 0 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ }
+ } else if (Cmp.isEquality() && CIsPowerOf2) {
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C.logBase2()));
+ }
+
+ return nullptr;
+}
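// Illustrative sketch (not part of LLVM): the (1 << Y) fold above for an
// unsigned predicate and a non-power-of-two constant. With C = 30 on i32,
// comparing the shifted one against C reduces to comparing Y against
// Log2(C) = 4, with the predicate relaxed from ult to ule.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t Y = 0; Y < 32; ++Y) {      // in-range shift amounts only
    bool Before = (UINT32_C(1) << Y) < 30; // icmp ult (shl 1, Y), 30
    bool After = Y <= 4;                   // icmp ule Y, 4
    assert(Before == After);
  }
  return 0;
}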
+
+/// Fold icmp (shl X, Y), C.
Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp,
BinaryOperator *Shl,
const APInt &C) {
- const APInt *ShiftVal;
- if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
- return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal);
-
- const APInt *ShiftAmt;
- if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
- return foldICmpShlOne(Cmp, Shl, C);
-
- // Check that the shift amount is in range. If not, don't perform undefined
- // shifts. When the shift is visited, it will be simplified.
- unsigned TypeBits = C.getBitWidth();
- if (ShiftAmt->uge(TypeBits))
- return nullptr;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *X = Shl->getOperand(0);
- Type *ShType = Shl->getType();
-
- // NSW guarantees that we are only shifting out sign bits from the high bits,
- // so we can ASHR the compare constant without needing a mask and eliminate
- // the shift.
- if (Shl->hasNoSignedWrap()) {
- if (Pred == ICmpInst::ICMP_SGT) {
- // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
- APInt ShiftedC = C.ashr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
- C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
- APInt ShiftedC = C.ashr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if (Pred == ICmpInst::ICMP_SLT) {
- // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
- // (X << S) <=s C is equiv to X <=s (C >> S) for all C
- // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
- // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
- assert(!C.isMinSignedValue() && "Unexpected icmp slt");
- APInt ShiftedC = (C - 1).ashr(*ShiftAmt) + 1;
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- // If this is a signed comparison to 0 and the shift is sign preserving,
- // use the shift LHS operand instead; isSignTest may change 'Pred', so only
- // do that if we're sure to not continue on in this function.
- if (isSignTest(Pred, C))
- return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
- }
-
- // NUW guarantees that we are only shifting out zero bits from the high bits,
- // so we can LSHR the compare constant without needing a mask and eliminate
- // the shift.
- if (Shl->hasNoUnsignedWrap()) {
- if (Pred == ICmpInst::ICMP_UGT) {
- // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
- APInt ShiftedC = C.lshr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
- C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
- APInt ShiftedC = C.lshr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if (Pred == ICmpInst::ICMP_ULT) {
- // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
- // (X << S) <=u C is equiv to X <=u (C >> S) for all C
- // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
- // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
- assert(C.ugt(0) && "ult 0 should have been eliminated");
- APInt ShiftedC = (C - 1).lshr(*ShiftAmt) + 1;
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- }
-
- if (Cmp.isEquality() && Shl->hasOneUse()) {
- // Strength-reduce the shift into an 'and'.
- Constant *Mask = ConstantInt::get(
- ShType,
- APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
- Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
- Constant *LShrC = ConstantInt::get(ShType, C.lshr(*ShiftAmt));
- return new ICmpInst(Pred, And, LShrC);
- }
-
- // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
- bool TrueIfSigned = false;
- if (Shl->hasOneUse() && isSignBitCheck(Pred, C, TrueIfSigned)) {
- // (X << 31) <s 0 --> (X & 1) != 0
- Constant *Mask = ConstantInt::get(
- ShType,
- APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
- Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
- return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
- And, Constant::getNullValue(ShType));
- }
-
- // Simplify 'shl' inequality test into 'and' equality test.
- if (Cmp.isUnsigned() && Shl->hasOneUse()) {
- // (X l<< C2) u<=/u> C1 iff C1+1 is power of two -> X & (~C1 l>> C2) ==/!= 0
- if ((C + 1).isPowerOf2() &&
- (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)) {
- Value *And = Builder.CreateAnd(X, (~C).lshr(ShiftAmt->getZExtValue()));
- return new ICmpInst(Pred == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_EQ
- : ICmpInst::ICMP_NE,
- And, Constant::getNullValue(ShType));
- }
- // (X l<< C2) u</u>= C1 iff C1 is power of two -> X & (-C1 l>> C2) ==/!= 0
- if (C.isPowerOf2() &&
- (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) {
- Value *And =
- Builder.CreateAnd(X, (~(C - 1)).lshr(ShiftAmt->getZExtValue()));
- return new ICmpInst(Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_EQ
- : ICmpInst::ICMP_NE,
- And, Constant::getNullValue(ShType));
- }
- }
-
- // Transform (icmp pred iM (shl iM %v, N), C)
-  // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N)))
-  // Transform the shl to a trunc if (trunc (C>>N)) loses no bits and M-N is
-  // a legal integer width.
- // This enables us to get rid of the shift in favor of a trunc that may be
- // free on the target. It has the additional benefit of comparing to a
- // smaller constant that may be more target-friendly.
- unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
- if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt &&
- DL.isLegalInteger(TypeBits - Amt)) {
- Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
- if (auto *ShVTy = dyn_cast<VectorType>(ShType))
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
+ return foldICmpShlOne(Cmp, Shl, C);
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited, it will be simplified.
+ unsigned TypeBits = C.getBitWidth();
+ if (ShiftAmt->uge(TypeBits))
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Shl->getOperand(0);
+ Type *ShType = Shl->getType();
+
+ // NSW guarantees that we are only shifting out sign bits from the high bits,
+ // so we can ASHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoSignedWrap()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
+ APInt ShiftedC = C.ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
+ APInt ShiftedC = C.ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_SLT) {
+ // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
+ // (X << S) <=s C is equiv to X <=s (C >> S) for all C
+ // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
+ // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
+ assert(!C.isMinSignedValue() && "Unexpected icmp slt");
+ APInt ShiftedC = (C - 1).ashr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ // If this is a signed comparison to 0 and the shift is sign preserving,
+ // use the shift LHS operand instead; isSignTest may change 'Pred', so only
+ // do that if we're sure to not continue on in this function.
+ if (isSignTest(Pred, C))
+ return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
+ }
+
+ // NUW guarantees that we are only shifting out zero bits from the high bits,
+ // so we can LSHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoUnsignedWrap()) {
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
+ APInt ShiftedC = C.lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
+ APInt ShiftedC = C.lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_ULT) {
+ // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
+ // (X << S) <=u C is equiv to X <=u (C >> S) for all C
+ // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
+ // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
+ assert(C.ugt(0) && "ult 0 should have been eliminated");
+ APInt ShiftedC = (C - 1).lshr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ }
+
+ if (Cmp.isEquality() && Shl->hasOneUse()) {
+ // Strength-reduce the shift into an 'and'.
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
+ Constant *LShrC = ConstantInt::get(ShType, C.lshr(*ShiftAmt));
+ return new ICmpInst(Pred, And, LShrC);
+ }
+
+ // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+ bool TrueIfSigned = false;
+ if (Shl->hasOneUse() && isSignBitCheck(Pred, C, TrueIfSigned)) {
+ // (X << 31) <s 0 --> (X & 1) != 0
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
+ return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+ And, Constant::getNullValue(ShType));
+ }
+
+ // Simplify 'shl' inequality test into 'and' equality test.
+ if (Cmp.isUnsigned() && Shl->hasOneUse()) {
+ // (X l<< C2) u<=/u> C1 iff C1+1 is power of two -> X & (~C1 l>> C2) ==/!= 0
+ if ((C + 1).isPowerOf2() &&
+ (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)) {
+ Value *And = Builder.CreateAnd(X, (~C).lshr(ShiftAmt->getZExtValue()));
+ return new ICmpInst(Pred == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_EQ
+ : ICmpInst::ICMP_NE,
+ And, Constant::getNullValue(ShType));
+ }
+ // (X l<< C2) u</u>= C1 iff C1 is power of two -> X & (-C1 l>> C2) ==/!= 0
+ if (C.isPowerOf2() &&
+ (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) {
+ Value *And =
+ Builder.CreateAnd(X, (~(C - 1)).lshr(ShiftAmt->getZExtValue()));
+ return new ICmpInst(Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_EQ
+ : ICmpInst::ICMP_NE,
+ And, Constant::getNullValue(ShType));
+ }
+ }
+
+ // Transform (icmp pred iM (shl iM %v, N), C)
+  // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N)))
+  // Transform the shl to a trunc if (trunc (C>>N)) loses no bits and M-N is
+  // a legal integer width.
+ // This enables us to get rid of the shift in favor of a trunc that may be
+ // free on the target. It has the additional benefit of comparing to a
+ // smaller constant that may be more target-friendly.
+ unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
+ if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt &&
+ DL.isLegalInteger(TypeBits - Amt)) {
+ Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
+ if (auto *ShVTy = dyn_cast<VectorType>(ShType))
TruncTy = VectorType::get(TruncTy, ShVTy->getElementCount());
- Constant *NewC =
- ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
- return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
- }
-
- return nullptr;
-}
-
-/// Fold icmp ({al}shr X, Y), C.
+ Constant *NewC =
+ ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
+ return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
+ }
+
+ return nullptr;
+}
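// Illustrative sketch (not part of LLVM): the nuw left-shift fold above. The
// bound X <= 63 models 'shl nuw X, 2' on i8 (no bits are shifted out), and in
// that case the compare constant can simply be logically shifted right by the
// same amount.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned XV = 0; XV <= 63; ++XV) { // values where the nuw shift by 2 holds on i8
    for (unsigned CV = 0; CV <= 255; ++CV) {
      uint8_t X = static_cast<uint8_t>(XV), C = static_cast<uint8_t>(CV);
      bool Before = static_cast<uint8_t>(X << 2) > C; // icmp ugt (shl nuw X, 2), C
      bool After = X > (C >> 2);                      // icmp ugt X, (C >>u 2)
      assert(Before == After);
    }
  }
  return 0;
}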
+
+/// Fold icmp ({al}shr X, Y), C.
Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
BinaryOperator *Shr,
const APInt &C) {
- // An exact shr only shifts out zero bits, so:
- // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
- Value *X = Shr->getOperand(0);
- CmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
- C.isNullValue())
- return new ICmpInst(Pred, X, Cmp.getOperand(1));
-
- const APInt *ShiftVal;
- if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
- return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal);
-
- const APInt *ShiftAmt;
- if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
- return nullptr;
-
- // Check that the shift amount is in range. If not, don't perform undefined
- // shifts. When the shift is visited it will be simplified.
- unsigned TypeBits = C.getBitWidth();
- unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
- if (ShAmtVal >= TypeBits || ShAmtVal == 0)
- return nullptr;
-
- bool IsAShr = Shr->getOpcode() == Instruction::AShr;
- bool IsExact = Shr->isExact();
- Type *ShrTy = Shr->getType();
- // TODO: If we could guarantee that InstSimplify would handle all of the
- // constant-value-based preconditions in the folds below, then we could assert
- // those conditions rather than checking them. This is difficult because of
- // undef/poison (PR34838).
- if (IsAShr) {
- if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
- // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
- // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
- APInt ShiftedC = C.shl(ShAmtVal);
- if (ShiftedC.ashr(ShAmtVal) == C)
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
- if (Pred == CmpInst::ICMP_SGT) {
- // icmp sgt (ashr X, ShAmtC), C --> icmp sgt X, ((C + 1) << ShAmtC) - 1
- APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
- if (!C.isMaxSignedValue() && !(C + 1).shl(ShAmtVal).isMinSignedValue() &&
- (ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
+ // An exact shr only shifts out zero bits, so:
+ // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
+ Value *X = Shr->getOperand(0);
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
+ C.isNullValue())
+ return new ICmpInst(Pred, X, Cmp.getOperand(1));
+
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
+ return nullptr;
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited it will be simplified.
+ unsigned TypeBits = C.getBitWidth();
+ unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
+ if (ShAmtVal >= TypeBits || ShAmtVal == 0)
+ return nullptr;
+
+ bool IsAShr = Shr->getOpcode() == Instruction::AShr;
+ bool IsExact = Shr->isExact();
+ Type *ShrTy = Shr->getType();
+ // TODO: If we could guarantee that InstSimplify would handle all of the
+ // constant-value-based preconditions in the folds below, then we could assert
+ // those conditions rather than checking them. This is difficult because of
+ // undef/poison (PR34838).
+ if (IsAShr) {
+ if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
+ // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
+ // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
+ APInt ShiftedC = C.shl(ShAmtVal);
+ if (ShiftedC.ashr(ShAmtVal) == C)
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
+ if (Pred == CmpInst::ICMP_SGT) {
+ // icmp sgt (ashr X, ShAmtC), C --> icmp sgt X, ((C + 1) << ShAmtC) - 1
+ APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
+ if (!C.isMaxSignedValue() && !(C + 1).shl(ShAmtVal).isMinSignedValue() &&
+ (ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
// If the compare constant has significant bits above the lowest sign-bit,
// then convert an unsigned cmp to a test of the sign-bit:
@@ -2225,841 +2225,841 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
ConstantInt::getAllOnesValue(ShrTy));
}
}
- } else {
- if (Pred == CmpInst::ICMP_ULT || (Pred == CmpInst::ICMP_UGT && IsExact)) {
- // icmp ult (lshr X, ShAmtC), C --> icmp ult X, (C << ShAmtC)
- // icmp ugt (lshr exact X, ShAmtC), C --> icmp ugt X, (C << ShAmtC)
- APInt ShiftedC = C.shl(ShAmtVal);
- if (ShiftedC.lshr(ShAmtVal) == C)
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
- if (Pred == CmpInst::ICMP_UGT) {
- // icmp ugt (lshr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
- APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
- if ((ShiftedC + 1).lshr(ShAmtVal) == (C + 1))
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
- }
-
- if (!Cmp.isEquality())
- return nullptr;
-
- // Handle equality comparisons of shift-by-constant.
-
- // If the comparison constant changes with the shift, the comparison cannot
- // succeed (bits of the comparison constant cannot match the shifted value).
- // This should be known by InstSimplify and already be folded to true/false.
- assert(((IsAShr && C.shl(ShAmtVal).ashr(ShAmtVal) == C) ||
- (!IsAShr && C.shl(ShAmtVal).lshr(ShAmtVal) == C)) &&
- "Expected icmp+shr simplify did not occur.");
-
- // If the bits shifted out are known zero, compare the unshifted value:
- // (X & 4) >> 1 == 2 --> (X & 4) == 4.
- if (Shr->isExact())
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal));
-
- if (Shr->hasOneUse()) {
- // Canonicalize the shift into an 'and':
- // icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt)
- APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
- Constant *Mask = ConstantInt::get(ShrTy, Val);
- Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
- return new ICmpInst(Pred, And, ConstantInt::get(ShrTy, C << ShAmtVal));
- }
-
- return nullptr;
-}
-
+ } else {
+ if (Pred == CmpInst::ICMP_ULT || (Pred == CmpInst::ICMP_UGT && IsExact)) {
+ // icmp ult (lshr X, ShAmtC), C --> icmp ult X, (C << ShAmtC)
+ // icmp ugt (lshr exact X, ShAmtC), C --> icmp ugt X, (C << ShAmtC)
+ APInt ShiftedC = C.shl(ShAmtVal);
+ if (ShiftedC.lshr(ShAmtVal) == C)
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
+ if (Pred == CmpInst::ICMP_UGT) {
+ // icmp ugt (lshr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
+ APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
+ if ((ShiftedC + 1).lshr(ShAmtVal) == (C + 1))
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
+ }
+
+ if (!Cmp.isEquality())
+ return nullptr;
+
+ // Handle equality comparisons of shift-by-constant.
+
+ // If the comparison constant changes with the shift, the comparison cannot
+ // succeed (bits of the comparison constant cannot match the shifted value).
+ // This should be known by InstSimplify and already be folded to true/false.
+ assert(((IsAShr && C.shl(ShAmtVal).ashr(ShAmtVal) == C) ||
+ (!IsAShr && C.shl(ShAmtVal).lshr(ShAmtVal) == C)) &&
+ "Expected icmp+shr simplify did not occur.");
+
+ // If the bits shifted out are known zero, compare the unshifted value:
+ // (X & 4) >> 1 == 2 --> (X & 4) == 4.
+ if (Shr->isExact())
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal));
+
+ if (Shr->hasOneUse()) {
+ // Canonicalize the shift into an 'and':
+ // icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt)
+ APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+ Constant *Mask = ConstantInt::get(ShrTy, Val);
+ Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
+ return new ICmpInst(Pred, And, ConstantInt::get(ShrTy, C << ShAmtVal));
+ }
+
+ return nullptr;
+}
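
The unsigned branch above rewrites icmp ult (lshr X, ShAmtC), C into icmp ult X, (C << ShAmtC), and the strict ugt form into a compare against ((C + 1) << ShAmtC) - 1, in both cases only when the shifted constant survives the round-trip guard. A minimal standalone sketch of those two identities at a width of 8 bits; plain C++ arithmetic standing in for the APInt math, not LLVM code, and the names UltRHS/UgtRHS are local to the example:

// Exhaustive 8-bit check of the two unsigned lshr rewrites above (illustrative).
#include <cassert>

int main() {
  for (unsigned Sh = 0; Sh < 8; ++Sh) {
    for (unsigned C = 0; C < 256; ++C) {
      unsigned UltRHS = (C << Sh) & 0xff;             // C << ShAmtC (8-bit wrap)
      bool UltOK = (UltRHS >> Sh) == C;               // ShiftedC.lshr(ShAmt) == C
      unsigned UgtRHS = (((C + 1) << Sh) - 1) & 0xff; // ((C + 1) << ShAmtC) - 1
      bool UgtOK = (((UgtRHS + 1) & 0xff) >> Sh) == ((C + 1) & 0xff);
      for (unsigned X = 0; X < 256; ++X) {
        if (UltOK)
          assert(((X >> Sh) < C) == (X < UltRHS));    // ult form
        if (UgtOK)
          assert(((X >> Sh) > C) == (X > UgtRHS));    // ugt form
      }
    }
  }
  return 0;
}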
+
Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
BinaryOperator *SRem,
const APInt &C) {
- // Match an 'is positive' or 'is negative' comparison of remainder by a
- // constant power-of-2 value:
- // (X % pow2C) sgt/slt 0
- const ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT)
- return nullptr;
-
- // TODO: The one-use check is standard because we do not typically want to
- // create longer instruction sequences, but this might be a special-case
- // because srem is not good for analysis or codegen.
- if (!SRem->hasOneUse())
- return nullptr;
-
- const APInt *DivisorC;
- if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC)))
- return nullptr;
-
- // Mask off the sign bit and the modulo bits (low-bits).
- Type *Ty = SRem->getType();
- APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits());
- Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1));
- Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC);
-
- // For 'is positive?' check that the sign-bit is clear and at least 1 masked
- // bit is set. Example:
- // (i8 X % 32) s> 0 --> (X & 159) s> 0
- if (Pred == ICmpInst::ICMP_SGT)
- return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty));
-
- // For 'is negative?' check that the sign-bit is set and at least 1 masked
- // bit is set. Example:
- // (i16 X % 4) s< 0 --> (X & 32771) u> 32768
- return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask));
-}
-
-/// Fold icmp (udiv X, Y), C.
+ // Match an 'is positive' or 'is negative' comparison of remainder by a
+ // constant power-of-2 value:
+ // (X % pow2C) sgt/slt 0
+ const ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT)
+ return nullptr;
+
+ // TODO: The one-use check is standard because we do not typically want to
+ // create longer instruction sequences, but this might be a special-case
+ // because srem is not good for analysis or codegen.
+ if (!SRem->hasOneUse())
+ return nullptr;
+
+ const APInt *DivisorC;
+ if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC)))
+ return nullptr;
+
+ // Mask off the sign bit and the modulo bits (low-bits).
+ Type *Ty = SRem->getType();
+ APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits());
+ Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1));
+ Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC);
+
+ // For 'is positive?' check that the sign-bit is clear and at least 1 masked
+ // bit is set. Example:
+ // (i8 X % 32) s> 0 --> (X & 159) s> 0
+ if (Pred == ICmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty));
+
+ // For 'is negative?' check that the sign-bit is set and at least 1 masked
+ // bit is set. Example:
+ // (i16 X % 4) s< 0 --> (X & 32771) u> 32768
+ return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask));
+}
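
The srem fold above replaces a sign test of X % pow2C with a single mask-and-compare against SignMask | (pow2C - 1). A small exhaustive sketch of the two stated equivalences at i8; plain C++ (its signed % has the same semantics as srem for these operands), not LLVM code:

// Exhaustive i8 check of the srem sign-test rewrites above (illustrative).
#include <cassert>
#include <cstdint>

int main() {
  for (int D = 2; D <= 64; D <<= 1) {                 // power-of-2 divisors > 1
    unsigned Mask = 0x80u | unsigned(D - 1);          // SignMask | (DivisorC - 1)
    for (int X = -128; X <= 127; ++X) {
      unsigned And = unsigned(uint8_t(X)) & Mask;     // X & MaskC, as an 8-bit value
      // (X % pow2C) s> 0  <=>  (X & MaskC) s> 0
      assert((X % D > 0) == (int8_t(And) > 0));
      // (X % pow2C) s< 0  <=>  (X & MaskC) u> SignMask
      assert((X % D < 0) == (And > 0x80u));
    }
  }
  return 0;
}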
+
+/// Fold icmp (udiv X, Y), C.
Instruction *InstCombinerImpl::foldICmpUDivConstant(ICmpInst &Cmp,
BinaryOperator *UDiv,
const APInt &C) {
- const APInt *C2;
- if (!match(UDiv->getOperand(0), m_APInt(C2)))
- return nullptr;
-
- assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
-
- // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
- Value *Y = UDiv->getOperand(1);
- if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) {
- assert(!C.isMaxValue() &&
- "icmp ugt X, UINT_MAX should have been simplified already.");
- return new ICmpInst(ICmpInst::ICMP_ULE, Y,
- ConstantInt::get(Y->getType(), C2->udiv(C + 1)));
- }
-
- // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
- if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
- assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
- return new ICmpInst(ICmpInst::ICMP_UGT, Y,
- ConstantInt::get(Y->getType(), C2->udiv(C)));
- }
-
- return nullptr;
-}
-
-/// Fold icmp ({su}div X, Y), C.
+ const APInt *C2;
+ if (!match(UDiv->getOperand(0), m_APInt(C2)))
+ return nullptr;
+
+ assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
+
+ // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
+ Value *Y = UDiv->getOperand(1);
+ if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) {
+ assert(!C.isMaxValue() &&
+ "icmp ugt X, UINT_MAX should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_ULE, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(C + 1)));
+ }
+
+ // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
+ if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
+ assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_UGT, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(C)));
+ }
+
+ return nullptr;
+}
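
With a constant numerator, udiv C2, Y is non-increasing in Y, which is what lets the compare move onto Y as above. A brute-force sketch of the two rewrites over 8-bit operands; plain C++, not LLVM code, with Y == 0 skipped just as the original udiv excludes it and the two guards mirroring the asserts in the function:

// Exhaustive 8-bit check of the udiv-by-variable rewrites above (illustrative).
#include <cassert>

int main() {
  for (unsigned C2 = 1; C2 < 256; ++C2) {     // constant numerator, non-zero
    for (unsigned Y = 1; Y < 256; ++Y) {      // variable divisor, non-zero
      for (unsigned C = 0; C < 256; ++C) {
        // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1)), C != UINT_MAX
        if (C != 255)
          assert((C2 / Y > C) == (Y <= C2 / (C + 1)));
        // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C), C != 0
        if (C != 0)
          assert((C2 / Y < C) == (Y > C2 / C));
      }
    }
  }
  return 0;
}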
+
+/// Fold icmp ({su}div X, Y), C.
Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
BinaryOperator *Div,
const APInt &C) {
- // Fold: icmp pred ([us]div X, C2), C -> range test
- // Fold this div into the comparison, producing a range check.
- // Determine, based on the divide type, what the range is being
- // checked. If there is an overflow on the low or high side, remember
- // it, otherwise compute the range [low, hi) bounding the new value.
- // See: InsertRangeTest above for the kinds of replacements possible.
- const APInt *C2;
- if (!match(Div->getOperand(1), m_APInt(C2)))
- return nullptr;
-
- // FIXME: If the operand types don't match the type of the divide
- // then don't attempt this transform. The code below doesn't have the
- // logic to deal with a signed divide and an unsigned compare (and
- // vice versa). This is because (x /s C2) <s C produces different
- // results than (x /s C2) <u C or (x /u C2) <s C or even
- // (x /u C2) <u C. Simply casting the operands and result won't
- // work. :( The if statement below tests that condition and bails
- // if it finds it.
- bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
- if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
- return nullptr;
-
- // The ProdOV computation fails on divide by 0 and divide by -1. Cases with
- // INT_MIN will also fail if the divisor is 1. Although folds of all these
- // division-by-constant cases should be present, we cannot assert that they
- // have happened before we reach this icmp instruction.
- if (C2->isNullValue() || C2->isOneValue() ||
- (DivIsSigned && C2->isAllOnesValue()))
- return nullptr;
-
- // Compute Prod = C * C2. We are essentially solving an equation of
- // form X / C2 = C. We solve for X by multiplying C2 and C.
- // By solving for X, we can turn this into a range check instead of computing
- // a divide.
- APInt Prod = C * *C2;
-
- // Determine if the product overflows by seeing if the product is not equal to
- // the divide. Make sure we do the same kind of divide as in the LHS
- // instruction that we're folding.
- bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
-
- // If the division is known to be exact, then there is no remainder from the
- // divide, so the covered range size is unit, otherwise it is the divisor.
- APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2;
-
- // Figure out the interval that is being checked. For example, a comparison
- // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
- // Compute this interval based on the constants involved and the signedness of
- // the compare/divide. This computes a half-open interval, keeping track of
- // whether either value in the interval overflows. After analysis, each
- // overflow variable is set to 0 if its corresponding bound variable is valid,
- // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
- int LoOverflow = 0, HiOverflow = 0;
- APInt LoBound, HiBound;
-
- if (!DivIsSigned) { // udiv
- // e.g. X/5 op 3 --> [15, 20)
- LoBound = Prod;
- HiOverflow = LoOverflow = ProdOV;
- if (!HiOverflow) {
- // If this is not an exact divide, then many values in the range collapse
- // to the same result value.
- HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
- }
- } else if (C2->isStrictlyPositive()) { // Divisor is > 0.
- if (C.isNullValue()) { // (X / pos) op 0
- // Can't overflow. e.g. X/2 op 0 --> [-1, 2)
- LoBound = -(RangeSize - 1);
- HiBound = RangeSize;
- } else if (C.isStrictlyPositive()) { // (X / pos) op pos
- LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
- HiOverflow = LoOverflow = ProdOV;
- if (!HiOverflow)
- HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
- } else { // (X / pos) op neg
- // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
- HiBound = Prod + 1;
- LoOverflow = HiOverflow = ProdOV ? -1 : 0;
- if (!LoOverflow) {
- APInt DivNeg = -RangeSize;
- LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
- }
- }
- } else if (C2->isNegative()) { // Divisor is < 0.
- if (Div->isExact())
- RangeSize.negate();
- if (C.isNullValue()) { // (X / neg) op 0
- // e.g. X/-5 op 0 --> [-4, 5)
- LoBound = RangeSize + 1;
- HiBound = -RangeSize;
- if (HiBound == *C2) { // -INTMIN = INTMIN
- HiOverflow = 1; // [INTMIN+1, overflow)
- HiBound = APInt(); // e.g. X/INTMIN = 0 --> X > INTMIN
- }
- } else if (C.isStrictlyPositive()) { // (X / neg) op pos
- // e.g. X/-5 op 3 --> [-19, -14)
- HiBound = Prod + 1;
- HiOverflow = LoOverflow = ProdOV ? -1 : 0;
- if (!LoOverflow)
- LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
- } else { // (X / neg) op neg
- LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
- LoOverflow = HiOverflow = ProdOV;
- if (!HiOverflow)
- HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
- }
-
- // Dividing by a negative swaps the condition. LT <-> GT
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- Value *X = Div->getOperand(0);
- switch (Pred) {
- default: llvm_unreachable("Unhandled icmp opcode!");
- case ICmpInst::ICMP_EQ:
- if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- if (HiOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
- ICmpInst::ICMP_UGE, X,
- ConstantInt::get(Div->getType(), LoBound));
- if (LoOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
- ICmpInst::ICMP_ULT, X,
- ConstantInt::get(Div->getType(), HiBound));
- return replaceInstUsesWith(
- Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
- case ICmpInst::ICMP_NE:
- if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(Cmp, Builder.getTrue());
- if (HiOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
- ICmpInst::ICMP_ULT, X,
- ConstantInt::get(Div->getType(), LoBound));
- if (LoOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
- ICmpInst::ICMP_UGE, X,
- ConstantInt::get(Div->getType(), HiBound));
- return replaceInstUsesWith(Cmp,
- insertRangeTest(X, LoBound, HiBound,
- DivIsSigned, false));
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT:
- if (LoOverflow == +1) // Low bound is greater than input range.
- return replaceInstUsesWith(Cmp, Builder.getTrue());
- if (LoOverflow == -1) // Low bound is less than input range.
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound));
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT:
- if (HiOverflow == +1) // High bound greater than input range.
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- if (HiOverflow == -1) // High bound less than input range.
- return replaceInstUsesWith(Cmp, Builder.getTrue());
- if (Pred == ICmpInst::ICMP_UGT)
- return new ICmpInst(ICmpInst::ICMP_UGE, X,
- ConstantInt::get(Div->getType(), HiBound));
- return new ICmpInst(ICmpInst::ICMP_SGE, X,
- ConstantInt::get(Div->getType(), HiBound));
- }
-
- return nullptr;
-}
-
-/// Fold icmp (sub X, Y), C.
+ // Fold: icmp pred ([us]div X, C2), C -> range test
+ // Fold this div into the comparison, producing a range check.
+ // Determine, based on the divide type, what the range is being
+ // checked. If there is an overflow on the low or high side, remember
+ // it, otherwise compute the range [low, hi) bounding the new value.
+ // See: InsertRangeTest above for the kinds of replacements possible.
+ const APInt *C2;
+ if (!match(Div->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // FIXME: If the operand types don't match the type of the divide
+ // then don't attempt this transform. The code below doesn't have the
+ // logic to deal with a signed divide and an unsigned compare (and
+ // vice versa). This is because (x /s C2) <s C produces different
+ // results than (x /s C2) <u C or (x /u C2) <s C or even
+ // (x /u C2) <u C. Simply casting the operands and result won't
+ // work. :( The if statement below tests that condition and bails
+ // if it finds it.
+ bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
+ if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
+ return nullptr;
+
+ // The ProdOV computation fails on divide by 0 and divide by -1. Cases with
+ // INT_MIN will also fail if the divisor is 1. Although folds of all these
+ // division-by-constant cases should be present, we cannot assert that they
+ // have happened before we reach this icmp instruction.
+ if (C2->isNullValue() || C2->isOneValue() ||
+ (DivIsSigned && C2->isAllOnesValue()))
+ return nullptr;
+
+ // Compute Prod = C * C2. We are essentially solving an equation of
+ // form X / C2 = C. We solve for X by multiplying C2 and C.
+ // By solving for X, we can turn this into a range check instead of computing
+ // a divide.
+ APInt Prod = C * *C2;
+
+ // Determine if the product overflows by seeing if the product is not equal to
+ // the divide. Make sure we do the same kind of divide as in the LHS
+ // instruction that we're folding.
+ bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the division is known to be exact, then there is no remainder from the
+ // divide, so the covered range size is unit, otherwise it is the divisor.
+ APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2;
+
+ // Figure out the interval that is being checked. For example, a comparison
+ // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
+ // Compute this interval based on the constants involved and the signedness of
+ // the compare/divide. This computes a half-open interval, keeping track of
+ // whether either value in the interval overflows. After analysis, each
+ // overflow variable is set to 0 if its corresponding bound variable is valid,
+ // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
+ int LoOverflow = 0, HiOverflow = 0;
+ APInt LoBound, HiBound;
+
+ if (!DivIsSigned) { // udiv
+ // e.g. X/5 op 3 --> [15, 20)
+ LoBound = Prod;
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow) {
+ // If this is not an exact divide, then many values in the range collapse
+ // to the same result value.
+ HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
+ }
+ } else if (C2->isStrictlyPositive()) { // Divisor is > 0.
+ if (C.isNullValue()) { // (X / pos) op 0
+ // Can't overflow. e.g. X/2 op 0 --> [-1, 2)
+ LoBound = -(RangeSize - 1);
+ HiBound = RangeSize;
+ } else if (C.isStrictlyPositive()) { // (X / pos) op pos
+ LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
+ } else { // (X / pos) op neg
+ // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
+ HiBound = Prod + 1;
+ LoOverflow = HiOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow) {
+ APInt DivNeg = -RangeSize;
+ LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
+ }
+ }
+ } else if (C2->isNegative()) { // Divisor is < 0.
+ if (Div->isExact())
+ RangeSize.negate();
+ if (C.isNullValue()) { // (X / neg) op 0
+ // e.g. X/-5 op 0 --> [-4, 5)
+ LoBound = RangeSize + 1;
+ HiBound = -RangeSize;
+ if (HiBound == *C2) { // -INTMIN = INTMIN
+ HiOverflow = 1; // [INTMIN+1, overflow)
+ HiBound = APInt(); // e.g. X/INTMIN = 0 --> X > INTMIN
+ }
+ } else if (C.isStrictlyPositive()) { // (X / neg) op pos
+ // e.g. X/-5 op 3 --> [-19, -14)
+ HiBound = Prod + 1;
+ HiOverflow = LoOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow)
+ LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
+ } else { // (X / neg) op neg
+ LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
+ LoOverflow = HiOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
+ }
+
+ // Dividing by a negative swaps the condition. LT <-> GT
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ Value *X = Div->getOperand(0);
+ switch (Pred) {
+ default: llvm_unreachable("Unhandled icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X,
+ ConstantInt::get(Div->getType(), LoBound));
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ return replaceInstUsesWith(
+ Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
+ case ICmpInst::ICMP_NE:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X,
+ ConstantInt::get(Div->getType(), LoBound));
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ return replaceInstUsesWith(Cmp,
+ insertRangeTest(X, LoBound, HiBound,
+ DivIsSigned, false));
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT:
+ if (LoOverflow == +1) // Low bound is greater than input range.
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (LoOverflow == -1) // Low bound is less than input range.
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound));
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ if (HiOverflow == +1) // High bound greater than input range.
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (HiOverflow == -1) // High bound less than input range.
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ return new ICmpInst(ICmpInst::ICMP_SGE, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ }
+
+ return nullptr;
+}
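
All of the case analysis above boils down to replacing a compare of X /[us] C2 against C with a range test on X around Prod = C * C2, with LoOverflow/HiOverflow recording when a bound falls off the end of the type. The unsigned shape from the comment (X /u 5 == 0 iff X in [0, 5)) is easy to confirm exhaustively; a plain C++ sketch, not LLVM code, with the bounds computed in a wider type so an out-of-range interval simply becomes empty:

// Exhaustive 8-bit check of the udiv-to-range-test rewrite (illustrative).
#include <cassert>

int main() {
  for (unsigned C2 = 2; C2 < 256; ++C2) {     // divisor; 0 and 1 bail out earlier
    for (unsigned C = 0; C < 256; ++C) {
      unsigned Lo = C * C2;                   // Prod
      unsigned Hi = Lo + C2;                  // Prod + RangeSize
      for (unsigned X = 0; X < 256; ++X)
        // X /u C2 == C  <=>  X in [Lo, Hi)
        assert((X / C2 == C) == (X >= Lo && X < Hi));
    }
  }
  return 0;
}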
+
+/// Fold icmp (sub X, Y), C.
Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
BinaryOperator *Sub,
const APInt &C) {
- Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- const APInt *C2;
- APInt SubResult;
-
- // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0
- if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality())
- return new ICmpInst(Cmp.getPredicate(), Y,
- ConstantInt::get(Y->getType(), 0));
-
- // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C)
- if (match(X, m_APInt(C2)) &&
- ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) ||
- (Cmp.isSigned() && Sub->hasNoSignedWrap())) &&
- !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
- return new ICmpInst(Cmp.getSwappedPredicate(), Y,
- ConstantInt::get(Y->getType(), SubResult));
-
- // The following transforms are only worth it if the only user of the subtract
- // is the icmp.
- if (!Sub->hasOneUse())
- return nullptr;
-
- if (Sub->hasNoSignedWrap()) {
- // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
- if (Pred == ICmpInst::ICMP_SGT && C.isAllOnesValue())
- return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
-
- // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
- if (Pred == ICmpInst::ICMP_SGT && C.isNullValue())
- return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
-
- // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
- if (Pred == ICmpInst::ICMP_SLT && C.isNullValue())
- return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
-
- // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
- if (Pred == ICmpInst::ICMP_SLT && C.isOneValue())
- return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
- }
-
- if (!match(X, m_APInt(C2)))
- return nullptr;
-
- // C2 - Y <u C -> (Y | (C - 1)) == C2
- // iff (C2 & (C - 1)) == C - 1 and C is a power of 2
- if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() &&
- (*C2 & (C - 1)) == (C - 1))
- return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, C - 1), X);
-
- // C2 - Y >u C -> (Y | C) != C2
- // iff C2 & C == C and C + 1 is a power of 2
- if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C)
- return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X);
-
- return nullptr;
-}
-
-/// Fold icmp (add X, Y), C.
+ Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ const APInt *C2;
+ APInt SubResult;
+
+ // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0
+ if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality())
+ return new ICmpInst(Cmp.getPredicate(), Y,
+ ConstantInt::get(Y->getType(), 0));
+
+ // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C)
+ if (match(X, m_APInt(C2)) &&
+ ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) ||
+ (Cmp.isSigned() && Sub->hasNoSignedWrap())) &&
+ !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
+ return new ICmpInst(Cmp.getSwappedPredicate(), Y,
+ ConstantInt::get(Y->getType(), SubResult));
+
+ // The following transforms are only worth it if the only user of the subtract
+ // is the icmp.
+ if (!Sub->hasOneUse())
+ return nullptr;
+
+ if (Sub->hasNoSignedWrap()) {
+ // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && C.isAllOnesValue())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
+
+ // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && C.isNullValue())
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
+
+ // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && C.isNullValue())
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
+
+ // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && C.isOneValue())
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
+ }
+
+ if (!match(X, m_APInt(C2)))
+ return nullptr;
+
+ // C2 - Y <u C -> (Y | (C - 1)) == C2
+ // iff (C2 & (C - 1)) == C - 1 and C is a power of 2
+ if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() &&
+ (*C2 & (C - 1)) == (C - 1))
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, C - 1), X);
+
+ // C2 - Y >u C -> (Y | C) != C2
+ // iff C2 & C == C and C + 1 is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X);
+
+ return nullptr;
+}
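
The last two folds above turn a compare of the wrapping difference C2 - Y into a bit test on Y alone. An exhaustive 8-bit sketch of both, including their side conditions; plain C++ with the subtraction done modulo 256 to match the wrapping sub, not LLVM code:

// Exhaustive 8-bit check of the two (sub C2, Y) bit-test rewrites (illustrative).
#include <cassert>

static bool isPow2(unsigned V) { return V && (V & (V - 1)) == 0; }

int main() {
  for (unsigned C2 = 0; C2 < 256; ++C2) {
    for (unsigned C = 0; C < 256; ++C) {
      for (unsigned Y = 0; Y < 256; ++Y) {
        unsigned Sub = (C2 - Y) & 0xff;       // wrapping C2 - Y
        // C2 - Y <u C -> (Y | (C-1)) == C2, iff (C2 & (C-1)) == C-1, C a power of 2
        if (isPow2(C) && (C2 & (C - 1)) == (C - 1))
          assert((Sub < C) == ((Y | (C - 1)) == C2));
        // C2 - Y >u C -> (Y | C) != C2, iff (C2 & C) == C, C+1 a power of 2
        if (isPow2((C + 1) & 0xff) && (C2 & C) == C)
          assert((Sub > C) == ((Y | C) != C2));
      }
    }
  }
  return 0;
}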
+
+/// Fold icmp (add X, Y), C.
Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
BinaryOperator *Add,
const APInt &C) {
- Value *Y = Add->getOperand(1);
- const APInt *C2;
- if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
- return nullptr;
-
- // Fold icmp pred (add X, C2), C.
- Value *X = Add->getOperand(0);
- Type *Ty = Add->getType();
- CmpInst::Predicate Pred = Cmp.getPredicate();
-
- // If the add does not wrap, we can always adjust the compare by subtracting
- // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE
- // are canonicalized to SGT/SLT/UGT/ULT.
- if ((Add->hasNoSignedWrap() &&
- (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) ||
- (Add->hasNoUnsignedWrap() &&
- (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT))) {
- bool Overflow;
- APInt NewC =
- Cmp.isSigned() ? C.ssub_ov(*C2, Overflow) : C.usub_ov(*C2, Overflow);
- // If there is overflow, the result must be true or false.
- // TODO: Can we assert there is no overflow because InstSimplify always
- // handles those cases?
- if (!Overflow)
- // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
- return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
- }
-
- auto CR = ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2);
- const APInt &Upper = CR.getUpper();
- const APInt &Lower = CR.getLower();
- if (Cmp.isSigned()) {
- if (Lower.isSignMask())
- return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
- if (Upper.isSignMask())
- return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
- } else {
- if (Lower.isMinValue())
- return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
- if (Upper.isMinValue())
- return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
- }
-
- if (!Add->hasOneUse())
- return nullptr;
-
- // X+C <u C2 -> (X & -C2) == -C
- // iff C & (C2-1) == 0
- // C2 is a power of 2
- if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && (*C2 & (C - 1)) == 0)
- return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -C),
- ConstantExpr::getNeg(cast<Constant>(Y)));
-
- // X+C >u C2 -> (X & ~C2) != -C
- // iff C & C2 == 0
- // C2+1 is a power of 2
- if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == 0)
- return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~C),
- ConstantExpr::getNeg(cast<Constant>(Y)));
-
- return nullptr;
-}
-
+ Value *Y = Add->getOperand(1);
+ const APInt *C2;
+ if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
+ return nullptr;
+
+ // Fold icmp pred (add X, C2), C.
+ Value *X = Add->getOperand(0);
+ Type *Ty = Add->getType();
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the add does not wrap, we can always adjust the compare by subtracting
+ // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE
+ // are canonicalized to SGT/SLT/UGT/ULT.
+ if ((Add->hasNoSignedWrap() &&
+ (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) ||
+ (Add->hasNoUnsignedWrap() &&
+ (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT))) {
+ bool Overflow;
+ APInt NewC =
+ Cmp.isSigned() ? C.ssub_ov(*C2, Overflow) : C.usub_ov(*C2, Overflow);
+ // If there is overflow, the result must be true or false.
+ // TODO: Can we assert there is no overflow because InstSimplify always
+ // handles those cases?
+ if (!Overflow)
+ // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
+ return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
+ }
+
+ auto CR = ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2);
+ const APInt &Upper = CR.getUpper();
+ const APInt &Lower = CR.getLower();
+ if (Cmp.isSigned()) {
+ if (Lower.isSignMask())
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isSignMask())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
+ } else {
+ if (Lower.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
+ }
+
+ if (!Add->hasOneUse())
+ return nullptr;
+
+ // X+C <u C2 -> (X & -C2) == -C
+ // iff C & (C2-1) == 0
+ // C2 is a power of 2
+ if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && (*C2 & (C - 1)) == 0)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -C),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
+
+ // X+C >u C2 -> (X & ~C2) != -C
+ // iff C & C2 == 0
+ // C2+1 is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == 0)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~C),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
+
+ return nullptr;
+}
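
Two pieces above are worth seeing concretely. The no-wrap case simply moves the add constant across the compare; the two power-of-2 cases at the end replace the range test with a mask compare. A sketch of the masked forms at 8 bits, written with the function's own variable names (C is the compare constant, C2 the add operand) and with all arithmetic modulo 256 to match the wrapping add; plain C++, not LLVM code:

// Exhaustive 8-bit check of the two masked (add X, C2) rewrites above (illustrative).
#include <cassert>

static bool isPow2(unsigned V) { return V && (V & (V - 1)) == 0; }

int main() {
  for (unsigned C2 = 0; C2 < 256; ++C2) {
    unsigned NegC2 = (256 - C2) & 0xff;                 // -C2 (mod 256)
    for (unsigned C = 0; C < 256; ++C) {
      for (unsigned X = 0; X < 256; ++X) {
        unsigned Add = (X + C2) & 0xff;                 // wrapping X + C2
        // X + C2 <u C  ->  (X & -C) == -C2, iff C is a power of 2, (C2 & (C-1)) == 0
        if (isPow2(C) && (C2 & (C - 1)) == 0)
          assert((Add < C) == ((X & ((256 - C) & 0xff)) == NegC2));
        // X + C2 >u C  ->  (X & ~C) != -C2, iff C+1 is a power of 2, (C2 & C) == 0
        if (isPow2((C + 1) & 0xff) && (C2 & C) == 0)
          assert((Add > C) == ((X & (~C & 0xffu)) != NegC2));
      }
    }
  }
  return 0;
}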
+
bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
Value *&RHS, ConstantInt *&Less,
ConstantInt *&Equal,
ConstantInt *&Greater) {
- // TODO: Generalize this to work with other comparison idioms or ensure
- // they get canonicalized into this form.
-
- // select i1 (a == b),
- // i32 Equal,
- // i32 (select i1 (a < b), i32 Less, i32 Greater)
- // where Equal, Less and Greater are placeholders for any three constants.
- ICmpInst::Predicate PredA;
- if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) ||
- !ICmpInst::isEquality(PredA))
- return false;
- Value *EqualVal = SI->getTrueValue();
- Value *UnequalVal = SI->getFalseValue();
- // We still can get non-canonical predicate here, so canonicalize.
- if (PredA == ICmpInst::ICMP_NE)
- std::swap(EqualVal, UnequalVal);
- if (!match(EqualVal, m_ConstantInt(Equal)))
- return false;
- ICmpInst::Predicate PredB;
- Value *LHS2, *RHS2;
- if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)),
- m_ConstantInt(Less), m_ConstantInt(Greater))))
- return false;
- // We can get predicate mismatch here, so canonicalize if possible:
- // First, ensure that the 'LHS' operands match.
- if (LHS2 != LHS) {
- // x sgt y <--> y slt x
- std::swap(LHS2, RHS2);
- PredB = ICmpInst::getSwappedPredicate(PredB);
- }
- if (LHS2 != LHS)
- return false;
- // We also need to canonicalize 'RHS'.
- if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) {
- // x sgt C-1 <--> x sge C <--> not(x slt C)
- auto FlippedStrictness =
+ // TODO: Generalize this to work with other comparison idioms or ensure
+ // they get canonicalized into this form.
+
+ // select i1 (a == b),
+ // i32 Equal,
+ // i32 (select i1 (a < b), i32 Less, i32 Greater)
+ // where Equal, Less and Greater are placeholders for any three constants.
+ ICmpInst::Predicate PredA;
+ if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) ||
+ !ICmpInst::isEquality(PredA))
+ return false;
+ Value *EqualVal = SI->getTrueValue();
+ Value *UnequalVal = SI->getFalseValue();
+ // We still can get non-canonical predicate here, so canonicalize.
+ if (PredA == ICmpInst::ICMP_NE)
+ std::swap(EqualVal, UnequalVal);
+ if (!match(EqualVal, m_ConstantInt(Equal)))
+ return false;
+ ICmpInst::Predicate PredB;
+ Value *LHS2, *RHS2;
+ if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)),
+ m_ConstantInt(Less), m_ConstantInt(Greater))))
+ return false;
+ // We can get predicate mismatch here, so canonicalize if possible:
+ // First, ensure that the 'LHS' operands match.
+ if (LHS2 != LHS) {
+ // x sgt y <--> y slt x
+ std::swap(LHS2, RHS2);
+ PredB = ICmpInst::getSwappedPredicate(PredB);
+ }
+ if (LHS2 != LHS)
+ return false;
+ // We also need to canonicalize 'RHS'.
+ if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) {
+ // x sgt C-1 <--> x sge C <--> not(x slt C)
+ auto FlippedStrictness =
InstCombiner::getFlippedStrictnessPredicateAndConstant(
PredB, cast<Constant>(RHS2));
- if (!FlippedStrictness)
- return false;
- assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check");
- RHS2 = FlippedStrictness->second;
- // And kind-of perform the result swap.
- std::swap(Less, Greater);
- PredB = ICmpInst::ICMP_SLT;
- }
- return PredB == ICmpInst::ICMP_SLT && RHS == RHS2;
-}
-
+ if (!FlippedStrictness)
+ return false;
+ assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check");
+ RHS2 = FlippedStrictness->second;
+ // And kind-of perform the result swap.
+ std::swap(Less, Greater);
+ PredB = ICmpInst::ICMP_SLT;
+ }
+ return PredB == ICmpInst::ICMP_SLT && RHS == RHS2;
+}
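
For reference, the select-of-select shape matched above is the canonical lowering of the ordinary three-way compare idiom; a sketch of that source-level idiom (the exact IR still depends on the frontend and on earlier canonicalization passes):

// The idiom whose canonical IR matchThreeWayIntCompare recognizes:
// select(a == b, Equal, select(a slt b, Less, Greater)).
int threeWayCompare(int A, int B, int Less, int Equal, int Greater) {
  return A == B ? Equal : (A < B ? Less : Greater);
}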
+
Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp,
SelectInst *Select,
ConstantInt *C) {
-
- assert(C && "Cmp RHS should be a constant int!");
- // If we're testing a constant value against the result of a three way
- // comparison, the result can be expressed directly in terms of the
- // original values being compared. Note: We could possibly be more
- // aggressive here and remove the hasOneUse test. The original select is
- // really likely to simplify or sink when we remove a test of the result.
- Value *OrigLHS, *OrigRHS;
- ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
- if (Cmp.hasOneUse() &&
- matchThreeWayIntCompare(Select, OrigLHS, OrigRHS, C1LessThan, C2Equal,
- C3GreaterThan)) {
- assert(C1LessThan && C2Equal && C3GreaterThan);
-
- bool TrueWhenLessThan =
- ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
- ->isAllOnesValue();
- bool TrueWhenEqual =
- ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
- ->isAllOnesValue();
- bool TrueWhenGreaterThan =
- ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
- ->isAllOnesValue();
-
- // This generates the new instruction that will replace the original Cmp
- // Instruction. Instead of enumerating the various combinations when
- // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
- // false, we rely on chaining of ORs and future passes of InstCombine to
- // simplify the OR further (i.e. a s< b || a == b becomes a s<= b).
-
- // When none of the three constants satisfy the predicate for the RHS (C),
- // the entire original Cmp can be simplified to a false.
- Value *Cond = Builder.getFalse();
- if (TrueWhenLessThan)
- Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT,
- OrigLHS, OrigRHS));
- if (TrueWhenEqual)
- Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ,
- OrigLHS, OrigRHS));
- if (TrueWhenGreaterThan)
- Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT,
- OrigLHS, OrigRHS));
-
- return replaceInstUsesWith(Cmp, Cond);
- }
- return nullptr;
-}
-
-static Instruction *foldICmpBitCast(ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
- auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0));
- if (!Bitcast)
- return nullptr;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *Op1 = Cmp.getOperand(1);
- Value *BCSrcOp = Bitcast->getOperand(0);
-
- // Make sure the bitcast doesn't change the number of vector elements.
- if (Bitcast->getSrcTy()->getScalarSizeInBits() ==
- Bitcast->getDestTy()->getScalarSizeInBits()) {
- // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
- Value *X;
- if (match(BCSrcOp, m_SIToFP(m_Value(X)))) {
- // icmp eq (bitcast (sitofp X)), 0 --> icmp eq X, 0
- // icmp ne (bitcast (sitofp X)), 0 --> icmp ne X, 0
- // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
- // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
- if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT ||
- Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) &&
- match(Op1, m_Zero()))
- return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
-
- // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
- if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));
-
- // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
- if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
- return new ICmpInst(Pred, X,
- ConstantInt::getAllOnesValue(X->getType()));
- }
-
- // Zero-equality checks are preserved through unsigned floating-point casts:
- // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
- // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
- if (match(BCSrcOp, m_UIToFP(m_Value(X))))
- if (Cmp.isEquality() && match(Op1, m_Zero()))
- return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
-
- // If this is a sign-bit test of a bitcast of a casted FP value, eliminate
- // the FP extend/truncate because that cast does not change the sign-bit.
- // This is true for all standard IEEE-754 types and the X86 80-bit type.
- // The sign-bit is always the most significant bit in those types.
- const APInt *C;
- bool TrueIfSigned;
- if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
+
+ assert(C && "Cmp RHS should be a constant int!");
+ // If we're testing a constant value against the result of a three way
+ // comparison, the result can be expressed directly in terms of the
+ // original values being compared. Note: We could possibly be more
+ // aggressive here and remove the hasOneUse test. The original select is
+ // really likely to simplify or sink when we remove a test of the result.
+ Value *OrigLHS, *OrigRHS;
+ ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
+ if (Cmp.hasOneUse() &&
+ matchThreeWayIntCompare(Select, OrigLHS, OrigRHS, C1LessThan, C2Equal,
+ C3GreaterThan)) {
+ assert(C1LessThan && C2Equal && C3GreaterThan);
+
+ bool TrueWhenLessThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
+ ->isAllOnesValue();
+ bool TrueWhenEqual =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
+ ->isAllOnesValue();
+ bool TrueWhenGreaterThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
+ ->isAllOnesValue();
+
+ // This generates the new instruction that will replace the original Cmp
+ // Instruction. Instead of enumerating the various combinations when
+ // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
+ // false, we rely on chaining of ORs and future passes of InstCombine to
+ // simplify the OR further (i.e. a s< b || a == b becomes a s<= b).
+
+ // When none of the three constants satisfy the predicate for the RHS (C),
+ // the entire original Cmp can be simplified to a false.
+ Value *Cond = Builder.getFalse();
+ if (TrueWhenLessThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT,
+ OrigLHS, OrigRHS));
+ if (TrueWhenEqual)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ,
+ OrigLHS, OrigRHS));
+ if (TrueWhenGreaterThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT,
+ OrigLHS, OrigRHS));
+
+ return replaceInstUsesWith(Cmp, Cond);
+ }
+ return nullptr;
+}
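
The fold above evaluates the compare predicate against each of the three constants and then ORs together the corresponding slt/eq/sgt tests on the original operands. A small exhaustive sketch that this recombination is exact, using -1/0/+1 as the Less/Equal/Greater constants and signed less-than as the predicate; plain C++, not LLVM code, and other predicates and constants work the same way:

// Exhaustive i8 check of rebuilding icmp slt (three-way cmp), C as an OR chain.
#include <cassert>

int main() {
  for (int C = -2; C <= 2; ++C) {               // the compare constant
    bool TrueWhenLessThan = -1 < C;
    bool TrueWhenEqual = 0 < C;
    bool TrueWhenGreaterThan = 1 < C;
    for (int A = -128; A <= 127; ++A) {
      for (int B = -128; B <= 127; ++B) {
        int ThreeWay = A == B ? 0 : (A < B ? -1 : 1);
        bool Orig = ThreeWay < C;
        bool Folded = (TrueWhenLessThan && A < B) || (TrueWhenEqual && A == B) ||
                      (TrueWhenGreaterThan && A > B);
        assert(Orig == Folded);
      }
    }
  }
  return 0;
}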
+
+static Instruction *foldICmpBitCast(ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0));
+ if (!Bitcast)
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op1 = Cmp.getOperand(1);
+ Value *BCSrcOp = Bitcast->getOperand(0);
+
+ // Make sure the bitcast doesn't change the number of vector elements.
+ if (Bitcast->getSrcTy()->getScalarSizeInBits() ==
+ Bitcast->getDestTy()->getScalarSizeInBits()) {
+ // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
+ Value *X;
+ if (match(BCSrcOp, m_SIToFP(m_Value(X)))) {
+ // icmp eq (bitcast (sitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (sitofp X)), 0 --> icmp ne X, 0
+ // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
+ // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) &&
+ match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
+ // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
+ if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));
+
+ // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
+ if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
+ return new ICmpInst(Pred, X,
+ ConstantInt::getAllOnesValue(X->getType()));
+ }
+
+ // Zero-equality checks are preserved through unsigned floating-point casts:
+ // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
+ if (match(BCSrcOp, m_UIToFP(m_Value(X))))
+ if (Cmp.isEquality() && match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
+ // If this is a sign-bit test of a bitcast of a casted FP value, eliminate
+ // the FP extend/truncate because that cast does not change the sign-bit.
+ // This is true for all standard IEEE-754 types and the X86 80-bit type.
+ // The sign-bit is always the most significant bit in those types.
+ const APInt *C;
+ bool TrueIfSigned;
+ if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
InstCombiner::isSignBitCheck(Pred, *C, TrueIfSigned)) {
- if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
- match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
- // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
- // (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1
- Type *XType = X->getType();
-
- // We can't currently handle Power style floating point operations here.
- if (!(XType->isPPC_FP128Ty() || BCSrcOp->getType()->isPPC_FP128Ty())) {
-
- Type *NewType = Builder.getIntNTy(XType->getScalarSizeInBits());
- if (auto *XVTy = dyn_cast<VectorType>(XType))
+ if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
+ match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
+ // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
+ // (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1
+ Type *XType = X->getType();
+
+ // We can't currently handle Power style floating point operations here.
+ if (!(XType->isPPC_FP128Ty() || BCSrcOp->getType()->isPPC_FP128Ty())) {
+
+ Type *NewType = Builder.getIntNTy(XType->getScalarSizeInBits());
+ if (auto *XVTy = dyn_cast<VectorType>(XType))
NewType = VectorType::get(NewType, XVTy->getElementCount());
- Value *NewBitcast = Builder.CreateBitCast(X, NewType);
- if (TrueIfSigned)
- return new ICmpInst(ICmpInst::ICMP_SLT, NewBitcast,
- ConstantInt::getNullValue(NewType));
- else
- return new ICmpInst(ICmpInst::ICMP_SGT, NewBitcast,
- ConstantInt::getAllOnesValue(NewType));
- }
- }
- }
- }
-
- // Test to see if the operands of the icmp are casted versions of other
- // values. If the ptr->ptr cast can be stripped off both arguments, do so.
- if (Bitcast->getType()->isPointerTy() &&
- (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
- // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
- // so eliminate it as well.
- if (auto *BC2 = dyn_cast<BitCastInst>(Op1))
- Op1 = BC2->getOperand(0);
-
- Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType());
- return new ICmpInst(Pred, BCSrcOp, Op1);
- }
-
- // Folding: icmp <pred> iN X, C
- // where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
- // and C is a splat of a K-bit pattern
- // and SC is a constant vector = <C', C', C', ..., C'>
- // Into:
- // %E = extractelement <M x iK> %vec, i32 C'
- // icmp <pred> iK %E, trunc(C)
- const APInt *C;
- if (!match(Cmp.getOperand(1), m_APInt(C)) ||
- !Bitcast->getType()->isIntegerTy() ||
- !Bitcast->getSrcTy()->isIntOrIntVectorTy())
- return nullptr;
-
- Value *Vec;
- ArrayRef<int> Mask;
- if (match(BCSrcOp, m_Shuffle(m_Value(Vec), m_Undef(), m_Mask(Mask)))) {
- // Check whether every element of Mask is the same constant
- if (is_splat(Mask)) {
- auto *VecTy = cast<VectorType>(BCSrcOp->getType());
- auto *EltTy = cast<IntegerType>(VecTy->getElementType());
- if (C->isSplat(EltTy->getBitWidth())) {
- // Fold the icmp based on the value of C
- // If C is M copies of an iK sized bit pattern,
- // then:
- // => %E = extractelement <N x iK> %vec, i32 Elem
- // icmp <pred> iK %SplatVal, <pattern>
- Value *Elem = Builder.getInt32(Mask[0]);
- Value *Extract = Builder.CreateExtractElement(Vec, Elem);
- Value *NewC = ConstantInt::get(EltTy, C->trunc(EltTy->getBitWidth()));
- return new ICmpInst(Pred, Extract, NewC);
- }
- }
- }
- return nullptr;
-}
-
-/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
-/// where X is some kind of instruction.
+ Value *NewBitcast = Builder.CreateBitCast(X, NewType);
+ if (TrueIfSigned)
+ return new ICmpInst(ICmpInst::ICMP_SLT, NewBitcast,
+ ConstantInt::getNullValue(NewType));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SGT, NewBitcast,
+ ConstantInt::getAllOnesValue(NewType));
+ }
+ }
+ }
+ }
+
+ // Test to see if the operands of the icmp are casted versions of other
+ // values. If the ptr->ptr cast can be stripped off both arguments, do so.
+ if (Bitcast->getType()->isPointerTy() &&
+ (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
+ // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
+ // so eliminate it as well.
+ if (auto *BC2 = dyn_cast<BitCastInst>(Op1))
+ Op1 = BC2->getOperand(0);
+
+ Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType());
+ return new ICmpInst(Pred, BCSrcOp, Op1);
+ }
+
+ // Folding: icmp <pred> iN X, C
+ // where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
+ // and C is a splat of a K-bit pattern
+ // and SC is a constant vector = <C', C', C', ..., C'>
+ // Into:
+ // %E = extractelement <M x iK> %vec, i32 C'
+ // icmp <pred> iK %E, trunc(C)
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)) ||
+ !Bitcast->getType()->isIntegerTy() ||
+ !Bitcast->getSrcTy()->isIntOrIntVectorTy())
+ return nullptr;
+
+ Value *Vec;
+ ArrayRef<int> Mask;
+ if (match(BCSrcOp, m_Shuffle(m_Value(Vec), m_Undef(), m_Mask(Mask)))) {
+ // Check whether every element of Mask is the same constant
+ if (is_splat(Mask)) {
+ auto *VecTy = cast<VectorType>(BCSrcOp->getType());
+ auto *EltTy = cast<IntegerType>(VecTy->getElementType());
+ if (C->isSplat(EltTy->getBitWidth())) {
+ // Fold the icmp based on the value of C
+ // If C is M copies of an iK sized bit pattern,
+ // then:
+ // => %E = extractelement <N x iK> %vec, i32 Elem
+ // icmp <pred> iK %SplatVal, <pattern>
+ Value *Elem = Builder.getInt32(Mask[0]);
+ Value *Extract = Builder.CreateExtractElement(Vec, Elem);
+ Value *NewC = ConstantInt::get(EltTy, C->trunc(EltTy->getBitWidth()));
+ return new ICmpInst(Pred, Extract, NewC);
+ }
+ }
+ }
+ return nullptr;
+}
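
The first group of folds above relies on two bit-pattern facts: sitofp/uitofp map zero, and only zero, to an all-zero bit pattern, and sitofp carries the integer sign into the IEEE sign bit. A spot-check sketch over a range of inputs; plain C++, not LLVM code, assuming the usual 32-bit IEEE-754 float layout and using memcpy as the stand-in for the bitcast:

// Spot-check of the sitofp/uitofp bit-pattern facts used above (illustrative).
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));         // the bitcast analogue
  return Bits;
}

int main() {
  for (int32_t X = -100000; X <= 100000; ++X) {
    uint32_t SBits = bitsOf(float(X));          // bitcast (sitofp X)
    assert((SBits == 0) == (X == 0));           // eq/ne 0 is preserved
    assert(((SBits >> 31) != 0) == (X < 0));    // sign-bit test is preserved

    uint32_t U = uint32_t(X + 100000);          // a non-negative input
    uint32_t UBits = bitsOf(float(U));          // bitcast (uitofp U)
    assert((UBits == 0) == (U == 0));           // eq/ne 0 is preserved
  }
  return 0;
}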
+
+/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
+/// where X is some kind of instruction.
Instruction *InstCombinerImpl::foldICmpInstWithConstant(ICmpInst &Cmp) {
- const APInt *C;
- if (!match(Cmp.getOperand(1), m_APInt(C)))
- return nullptr;
-
- if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) {
- switch (BO->getOpcode()) {
- case Instruction::Xor:
- if (Instruction *I = foldICmpXorConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::And:
- if (Instruction *I = foldICmpAndConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Or:
- if (Instruction *I = foldICmpOrConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Mul:
- if (Instruction *I = foldICmpMulConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Shl:
- if (Instruction *I = foldICmpShlConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::LShr:
- case Instruction::AShr:
- if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::SRem:
- if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::UDiv:
- if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C))
- return I;
- LLVM_FALLTHROUGH;
- case Instruction::SDiv:
- if (Instruction *I = foldICmpDivConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Sub:
- if (Instruction *I = foldICmpSubConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Add:
- if (Instruction *I = foldICmpAddConstant(Cmp, BO, *C))
- return I;
- break;
- default:
- break;
- }
- // TODO: These folds could be refactored to be part of the above calls.
- if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, *C))
- return I;
- }
-
- // Match against CmpInst LHS being instructions other than binary operators.
-
- if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0))) {
- // For now, we only support constant integers while folding the
- // ICMP(SELECT)) pattern. We can extend this to support vector of integers
- // similar to the cases handled by binary ops above.
- if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
- if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS))
- return I;
- }
-
- if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0))) {
- if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C))
- return I;
- }
-
- if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
- if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
- return I;
-
- return nullptr;
-}
-
-/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
-/// icmp eq/ne BO, C.
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)))
+ return nullptr;
+
+ if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) {
+ switch (BO->getOpcode()) {
+ case Instruction::Xor:
+ if (Instruction *I = foldICmpXorConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::And:
+ if (Instruction *I = foldICmpAndConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Or:
+ if (Instruction *I = foldICmpOrConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Mul:
+ if (Instruction *I = foldICmpMulConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Shl:
+ if (Instruction *I = foldICmpShlConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::SRem:
+ if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::UDiv:
+ if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C))
+ return I;
+ LLVM_FALLTHROUGH;
+ case Instruction::SDiv:
+ if (Instruction *I = foldICmpDivConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Sub:
+ if (Instruction *I = foldICmpSubConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Add:
+ if (Instruction *I = foldICmpAddConstant(Cmp, BO, *C))
+ return I;
+ break;
+ default:
+ break;
+ }
+ // TODO: These folds could be refactored to be part of the above calls.
+ if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, *C))
+ return I;
+ }
+
+ // Match against CmpInst LHS being instructions other than binary operators.
+
+ if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0))) {
+ // For now, we only support constant integers while folding the
+ // ICMP(SELECT)) pattern. We can extend this to support vector of integers
+ // similar to the cases handled by binary ops above.
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
+ if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS))
+ return I;
+ }
+
+ if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0))) {
+ if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C))
+ return I;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
+ if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
+ return I;
+
+ return nullptr;
+}
+
+/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
+/// icmp eq/ne BO, C.
Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
ICmpInst &Cmp, BinaryOperator *BO, const APInt &C) {
- // TODO: Some of these folds could work with arbitrary constants, but this
- // function is limited to scalar and vector splat constants.
- if (!Cmp.isEquality())
- return nullptr;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- bool isICMP_NE = Pred == ICmpInst::ICMP_NE;
- Constant *RHS = cast<Constant>(Cmp.getOperand(1));
- Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
-
- switch (BO->getOpcode()) {
- case Instruction::SRem:
- // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
- if (C.isNullValue() && BO->hasOneUse()) {
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
- Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
- return new ICmpInst(Pred, NewRem,
- Constant::getNullValue(BO->getType()));
- }
- }
- break;
- case Instruction::Add: {
- // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
- if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
- if (BO->hasOneUse())
- return new ICmpInst(Pred, BOp0, ConstantExpr::getSub(RHS, BOC));
- } else if (C.isNullValue()) {
- // Replace ((add A, B) != 0) with (A != -B) if A or B is
- // efficiently invertible, or if the add has just this one use.
- if (Value *NegVal = dyn_castNegVal(BOp1))
- return new ICmpInst(Pred, BOp0, NegVal);
- if (Value *NegVal = dyn_castNegVal(BOp0))
- return new ICmpInst(Pred, NegVal, BOp1);
- if (BO->hasOneUse()) {
- Value *Neg = Builder.CreateNeg(BOp1);
- Neg->takeName(BO);
- return new ICmpInst(Pred, BOp0, Neg);
- }
- }
- break;
- }
- case Instruction::Xor:
- if (BO->hasOneUse()) {
- if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
- // For the xor case, we can xor two constants together, eliminating
- // the explicit xor.
- return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
- } else if (C.isNullValue()) {
- // Replace ((xor A, B) != 0) with (A != B)
- return new ICmpInst(Pred, BOp0, BOp1);
- }
- }
- break;
- case Instruction::Sub:
- if (BO->hasOneUse()) {
- // Only check for constant LHS here, as constant RHS will be canonicalized
- // to add and use the fold above.
- if (Constant *BOC = dyn_cast<Constant>(BOp0)) {
- // Replace ((sub BOC, B) != C) with (B != BOC-C).
- return new ICmpInst(Pred, BOp1, ConstantExpr::getSub(BOC, RHS));
- } else if (C.isNullValue()) {
- // Replace ((sub A, B) != 0) with (A != B).
- return new ICmpInst(Pred, BOp0, BOp1);
- }
- }
- break;
- case Instruction::Or: {
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
- // Comparing if all bits outside of a constant mask are set?
- // Replace (X | C) == -1 with (X & ~C) == ~C.
- // This removes the -1 constant.
- Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
- Value *And = Builder.CreateAnd(BOp0, NotBOC);
- return new ICmpInst(Pred, And, NotBOC);
- }
- break;
- }
- case Instruction::And: {
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC))) {
- // If we have ((X & C) == C), turn it into ((X & C) != 0).
- if (C == *BOC && C.isPowerOf2())
- return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- BO, Constant::getNullValue(RHS->getType()));
- }
- break;
- }
- case Instruction::UDiv:
- if (C.isNullValue()) {
- // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
- auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
- return new ICmpInst(NewPred, BOp1, BOp0);
- }
- break;
- default:
- break;
- }
- return nullptr;
-}
-
-/// Fold an equality icmp with LLVM intrinsic and constant operand.
+ // TODO: Some of these folds could work with arbitrary constants, but this
+ // function is limited to scalar and vector splat constants.
+ if (!Cmp.isEquality())
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ bool isICMP_NE = Pred == ICmpInst::ICMP_NE;
+ Constant *RHS = cast<Constant>(Cmp.getOperand(1));
+ Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+
+ switch (BO->getOpcode()) {
+ case Instruction::SRem:
+ // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
+ if (C.isNullValue() && BO->hasOneUse()) {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
+ Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
+ return new ICmpInst(Pred, NewRem,
+ Constant::getNullValue(BO->getType()));
+ }
+ }
+ break;
+ case Instruction::Add: {
+ // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
+ if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
+ if (BO->hasOneUse())
+ return new ICmpInst(Pred, BOp0, ConstantExpr::getSub(RHS, BOC));
+ } else if (C.isNullValue()) {
+ // Replace ((add A, B) != 0) with (A != -B) if A or B is
+ // efficiently invertible, or if the add has just this one use.
+ if (Value *NegVal = dyn_castNegVal(BOp1))
+ return new ICmpInst(Pred, BOp0, NegVal);
+ if (Value *NegVal = dyn_castNegVal(BOp0))
+ return new ICmpInst(Pred, NegVal, BOp1);
+ if (BO->hasOneUse()) {
+ Value *Neg = Builder.CreateNeg(BOp1);
+ Neg->takeName(BO);
+ return new ICmpInst(Pred, BOp0, Neg);
+ }
+ }
+ break;
+ }
+ case Instruction::Xor:
+ if (BO->hasOneUse()) {
+ if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
+ // For the xor case, we can xor two constants together, eliminating
+ // the explicit xor.
+ return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
+ } else if (C.isNullValue()) {
+ // Replace ((xor A, B) != 0) with (A != B)
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Sub:
+ if (BO->hasOneUse()) {
+ // Only check for constant LHS here, as constant RHS will be canonicalized
+ // to add and use the fold above.
+ if (Constant *BOC = dyn_cast<Constant>(BOp0)) {
+ // Replace ((sub BOC, B) != C) with (B != BOC-C).
+ return new ICmpInst(Pred, BOp1, ConstantExpr::getSub(BOC, RHS));
+ } else if (C.isNullValue()) {
+ // Replace ((sub A, B) != 0) with (A != B).
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Or: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
+ // Comparing if all bits outside of a constant mask are set?
+ // Replace (X | C) == -1 with (X & ~C) == ~C.
+ // This removes the -1 constant.
+ Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
+ Value *And = Builder.CreateAnd(BOp0, NotBOC);
+ return new ICmpInst(Pred, And, NotBOC);
+ }
+ break;
+ }
+ case Instruction::And: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC))) {
+ // If we have ((X & C) == C), turn it into ((X & C) != 0).
+ if (C == *BOC && C.isPowerOf2())
+ return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ BO, Constant::getNullValue(RHS->getType()));
+ }
+ break;
+ }
+ case Instruction::UDiv:
+ if (C.isNullValue()) {
+      // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule B, A)
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
+ return new ICmpInst(NewPred, BOp1, BOp0);
+ }
+ break;
+ default:
+ break;
+ }
+ return nullptr;
+}
+
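The equality folds above are plain bit-level identities on the underlying integers. As an illustration only (a standalone sketch, not part of this patch; the constants are chosen arbitrarily), the `and` and `xor` cases can be checked exhaustively for 8-bit values:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t PowerOfTwoC = 0x10;       // arbitrary power-of-two constant C
  const uint8_t XorC = 0x5a, RhsC = 0x3c; // arbitrary constants for the xor fold
  for (unsigned X = 0; X <= 0xff; ++X) {
    uint8_t x = static_cast<uint8_t>(X);
    // ((X & C) == C)  <=>  ((X & C) != 0), because C has a single bit set.
    assert(((x & PowerOfTwoC) == PowerOfTwoC) == ((x & PowerOfTwoC) != 0));
    // ((X ^ C1) == C2)  <=>  (X == (C1 ^ C2)), so the explicit xor disappears.
    assert(((x ^ XorC) == RhsC) == (x == (XorC ^ RhsC)));
  }
  return 0;
}
```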
+/// Fold an equality icmp with LLVM intrinsic and constant operand.
Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
ICmpInst &Cmp, IntrinsicInst *II, const APInt &C) {
- Type *Ty = II->getType();
- unsigned BitWidth = C.getBitWidth();
- switch (II->getIntrinsicID()) {
+ Type *Ty = II->getType();
+ unsigned BitWidth = C.getBitWidth();
+ switch (II->getIntrinsicID()) {
case Intrinsic::abs:
// abs(A) == 0 -> A == 0
// abs(A) == INT_MIN -> A == INT_MIN
@@ -3068,83 +3068,83 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
ConstantInt::get(Ty, C));
break;
- case Intrinsic::bswap:
- // bswap(A) == C -> A == bswap(C)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- ConstantInt::get(Ty, C.byteSwap()));
-
- case Intrinsic::ctlz:
- case Intrinsic::cttz: {
- // ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
- if (C == BitWidth)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- ConstantInt::getNullValue(Ty));
-
- // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
-    // and Mask1 has bits 0..C set. Similar for ctlz, but for high bits.
- // Limit to one use to ensure we don't increase instruction count.
- unsigned Num = C.getLimitedValue(BitWidth);
- if (Num != BitWidth && II->hasOneUse()) {
- bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz;
- APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1)
- : APInt::getHighBitsSet(BitWidth, Num + 1);
- APInt Mask2 = IsTrailing
- ? APInt::getOneBitSet(BitWidth, Num)
- : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
- return new ICmpInst(Cmp.getPredicate(),
- Builder.CreateAnd(II->getArgOperand(0), Mask1),
- ConstantInt::get(Ty, Mask2));
- }
- break;
- }
-
- case Intrinsic::ctpop: {
- // popcount(A) == 0 -> A == 0 and likewise for !=
- // popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
- bool IsZero = C.isNullValue();
- if (IsZero || C == BitWidth)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty));
-
- break;
- }
-
- case Intrinsic::uadd_sat: {
- // uadd.sat(a, b) == 0 -> (a | b) == 0
- if (C.isNullValue()) {
- Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1));
- return new ICmpInst(Cmp.getPredicate(), Or, Constant::getNullValue(Ty));
- }
- break;
- }
-
- case Intrinsic::usub_sat: {
- // usub.sat(a, b) == 0 -> a <= b
- if (C.isNullValue()) {
- ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ
- ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
- return new ICmpInst(NewPred, II->getArgOperand(0), II->getArgOperand(1));
- }
- break;
- }
- default:
- break;
- }
-
- return nullptr;
-}
-
-/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
+ case Intrinsic::bswap:
+ // bswap(A) == C -> A == bswap(C)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ ConstantInt::get(Ty, C.byteSwap()));
+
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz: {
+ // ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
+ if (C == BitWidth)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ ConstantInt::getNullValue(Ty));
+
+ // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
+    // and Mask1 has bits 0..C set. Similar for ctlz, but for high bits.
+ // Limit to one use to ensure we don't increase instruction count.
+ unsigned Num = C.getLimitedValue(BitWidth);
+ if (Num != BitWidth && II->hasOneUse()) {
+ bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz;
+ APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1)
+ : APInt::getHighBitsSet(BitWidth, Num + 1);
+ APInt Mask2 = IsTrailing
+ ? APInt::getOneBitSet(BitWidth, Num)
+ : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
+ return new ICmpInst(Cmp.getPredicate(),
+ Builder.CreateAnd(II->getArgOperand(0), Mask1),
+ ConstantInt::get(Ty, Mask2));
+ }
+ break;
+ }
+
+ case Intrinsic::ctpop: {
+ // popcount(A) == 0 -> A == 0 and likewise for !=
+ // popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
+ bool IsZero = C.isNullValue();
+ if (IsZero || C == BitWidth)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty));
+
+ break;
+ }
+
+ case Intrinsic::uadd_sat: {
+ // uadd.sat(a, b) == 0 -> (a | b) == 0
+ if (C.isNullValue()) {
+ Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1));
+ return new ICmpInst(Cmp.getPredicate(), Or, Constant::getNullValue(Ty));
+ }
+ break;
+ }
+
+ case Intrinsic::usub_sat: {
+ // usub.sat(a, b) == 0 -> a <= b
+ if (C.isNullValue()) {
+ ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ
+ ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
+ return new ICmpInst(NewPred, II->getArgOperand(0), II->getArgOperand(1));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return nullptr;
+}
+
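The saturating-arithmetic folds follow from simple range reasoning: a saturating unsigned add is zero only when both inputs are zero, and a saturating unsigned subtract is zero exactly when the first operand is not greater than the second. A small standalone check (the helpers below are hand-rolled stand-ins for llvm.uadd.sat/llvm.usub.sat, not the intrinsics themselves) covers all 8-bit input pairs:

```cpp
#include <cassert>
#include <cstdint>

// Hand-rolled i8 stand-ins for llvm.uadd.sat / llvm.usub.sat.
static uint8_t uadd_sat8(uint8_t a, uint8_t b) {
  unsigned s = unsigned(a) + unsigned(b);
  return s > 0xff ? 0xff : static_cast<uint8_t>(s);
}
static uint8_t usub_sat8(uint8_t a, uint8_t b) {
  return a > b ? static_cast<uint8_t>(a - b) : 0;
}

int main() {
  for (unsigned A = 0; A <= 0xff; ++A)
    for (unsigned B = 0; B <= 0xff; ++B) {
      uint8_t a = static_cast<uint8_t>(A), b = static_cast<uint8_t>(B);
      // uadd.sat(a, b) == 0  <=>  (a | b) == 0
      assert((uadd_sat8(a, b) == 0) == ((a | b) == 0));
      // usub.sat(a, b) == 0  <=>  a u<= b
      assert((usub_sat8(a, b) == 0) == (a <= b));
    }
  return 0;
}
```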
+/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
IntrinsicInst *II,
const APInt &C) {
- if (Cmp.isEquality())
- return foldICmpEqIntrinsicWithConstant(Cmp, II, C);
-
- Type *Ty = II->getType();
- unsigned BitWidth = C.getBitWidth();
+ if (Cmp.isEquality())
+ return foldICmpEqIntrinsicWithConstant(Cmp, II, C);
+
+ Type *Ty = II->getType();
+ unsigned BitWidth = C.getBitWidth();
ICmpInst::Predicate Pred = Cmp.getPredicate();
- switch (II->getIntrinsicID()) {
+ switch (II->getIntrinsicID()) {
case Intrinsic::ctpop: {
// (ctpop X > BitWidth - 1) --> X == -1
Value *X = II->getArgOperand(0);
@@ -3157,562 +3157,562 @@ Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
ConstantInt::getAllOnesValue(Ty));
break;
}
- case Intrinsic::ctlz: {
- // ctlz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX < 0b00010000
+ case Intrinsic::ctlz: {
+ // ctlz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX < 0b00010000
if (Pred == ICmpInst::ICMP_UGT && C.ult(BitWidth)) {
- unsigned Num = C.getLimitedValue();
- APInt Limit = APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_ULT,
- II->getArgOperand(0), ConstantInt::get(Ty, Limit));
- }
-
- // ctlz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX > 0b00011111
+ unsigned Num = C.getLimitedValue();
+ APInt Limit = APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_ULT,
+ II->getArgOperand(0), ConstantInt::get(Ty, Limit));
+ }
+
+ // ctlz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX > 0b00011111
if (Pred == ICmpInst::ICMP_ULT && C.uge(1) && C.ule(BitWidth)) {
- unsigned Num = C.getLimitedValue();
- APInt Limit = APInt::getLowBitsSet(BitWidth, BitWidth - Num);
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_UGT,
- II->getArgOperand(0), ConstantInt::get(Ty, Limit));
- }
- break;
- }
- case Intrinsic::cttz: {
- // Limit to one use to ensure we don't increase instruction count.
- if (!II->hasOneUse())
- return nullptr;
-
- // cttz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX & 0b00001111 == 0
+ unsigned Num = C.getLimitedValue();
+ APInt Limit = APInt::getLowBitsSet(BitWidth, BitWidth - Num);
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_UGT,
+ II->getArgOperand(0), ConstantInt::get(Ty, Limit));
+ }
+ break;
+ }
+ case Intrinsic::cttz: {
+ // Limit to one use to ensure we don't increase instruction count.
+ if (!II->hasOneUse())
+ return nullptr;
+
+ // cttz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX & 0b00001111 == 0
if (Pred == ICmpInst::ICMP_UGT && C.ult(BitWidth)) {
- APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue() + 1);
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ,
- Builder.CreateAnd(II->getArgOperand(0), Mask),
- ConstantInt::getNullValue(Ty));
- }
-
- // cttz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX & 0b00000111 != 0
+ APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue() + 1);
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ,
+ Builder.CreateAnd(II->getArgOperand(0), Mask),
+ ConstantInt::getNullValue(Ty));
+ }
+
+ // cttz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX & 0b00000111 != 0
if (Pred == ICmpInst::ICMP_ULT && C.uge(1) && C.ule(BitWidth)) {
- APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue());
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE,
- Builder.CreateAnd(II->getArgOperand(0), Mask),
- ConstantInt::getNullValue(Ty));
- }
- break;
- }
- default:
- break;
- }
-
- return nullptr;
-}
-
-/// Handle icmp with constant (but not simple integer constant) RHS.
+ APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue());
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE,
+ Builder.CreateAnd(II->getArgOperand(0), Mask),
+ ConstantInt::getNullValue(Ty));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return nullptr;
+}
+
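The ctlz/cttz cases turn a comparison on the count into a direct range or mask test on the argument. The sketch below verifies the two `ugt` rewrites from the comments for i8 with the arbitrary constant 3; the counters are hand-written with count(0) == 8, matching the zero-is-defined form of the intrinsics:

```cpp
#include <cassert>
#include <cstdint>

// 8-bit leading/trailing zero counts with clz8(0) == ctz8(0) == 8.
static unsigned clz8(uint8_t x) {
  unsigned n = 0;
  for (int bit = 7; bit >= 0 && !((x >> bit) & 1); --bit)
    ++n;
  return n;
}
static unsigned ctz8(uint8_t x) {
  unsigned n = 0;
  for (int bit = 0; bit <= 7 && !((x >> bit) & 1); ++bit)
    ++n;
  return n;
}

int main() {
  const unsigned C = 3; // arbitrary constant from the comparison
  for (unsigned X = 0; X <= 0xff; ++X) {
    uint8_t x = static_cast<uint8_t>(X);
    // ctlz(x) u> 3  <=>  x u< 0b00010000
    assert((clz8(x) > C) == (x < 0x10));
    // cttz(x) u> 3  <=>  (x & 0b00001111) == 0
    assert((ctz8(x) > C) == ((x & 0x0f) == 0));
  }
  return 0;
}
```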
+/// Handle icmp with constant (but not simple integer constant) RHS.
Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Constant *RHSC = dyn_cast<Constant>(Op1);
- Instruction *LHSI = dyn_cast<Instruction>(Op0);
- if (!RHSC || !LHSI)
- return nullptr;
-
- switch (LHSI->getOpcode()) {
- case Instruction::GetElementPtr:
- // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
- if (RHSC->isNullValue() &&
- cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
- return new ICmpInst(
- I.getPredicate(), LHSI->getOperand(0),
- Constant::getNullValue(LHSI->getOperand(0)->getType()));
- break;
- case Instruction::PHI:
- // Only fold icmp into the PHI if the phi and icmp are in the same
- // block. If in the same block, we're encouraging jump threading. If
- // not, we are just pessimizing the code by making an i1 phi.
- if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
- return NV;
- break;
- case Instruction::Select: {
- // If either operand of the select is a constant, we can fold the
- // comparison into the select arms, which will cause one to be
- // constant folded and the select turned into a bitwise or.
- Value *Op1 = nullptr, *Op2 = nullptr;
- ConstantInt *CI = nullptr;
- if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
- Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
- CI = dyn_cast<ConstantInt>(Op1);
- }
- if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
- Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
- CI = dyn_cast<ConstantInt>(Op2);
- }
-
- // We only want to perform this transformation if it will not lead to
- // additional code. This is true if either both sides of the select
- // fold to a constant (in which case the icmp is replaced with a select
- // which will usually simplify) or this is the only user of the
- // select (in which case we are trading a select+icmp for a simpler
- // select+icmp) or all uses of the select can be replaced based on
- // dominance information ("Global cases").
- bool Transform = false;
- if (Op1 && Op2)
- Transform = true;
- else if (Op1 || Op2) {
- // Local case
- if (LHSI->hasOneUse())
- Transform = true;
- // Global cases
- else if (CI && !CI->isZero())
- // When Op1 is constant try replacing select with second operand.
- // Otherwise Op2 is constant and try replacing select with first
- // operand.
- Transform =
- replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
- }
- if (Transform) {
- if (!Op1)
- Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
- I.getName());
- if (!Op2)
- Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
- I.getName());
- return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
- }
- break;
- }
- case Instruction::IntToPtr:
- // icmp pred inttoptr(X), null -> icmp pred X, 0
- if (RHSC->isNullValue() &&
- DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
- return new ICmpInst(
- I.getPredicate(), LHSI->getOperand(0),
- Constant::getNullValue(LHSI->getOperand(0)->getType()));
- break;
-
- case Instruction::Load:
- // Try to optimize things like "A[i] > 4" to index computations.
- if (GetElementPtrInst *GEP =
- dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !cast<LoadInst>(LHSI)->isVolatile())
- if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
- return Res;
- }
- break;
- }
-
- return nullptr;
-}
-
-/// Some comparisons can be simplified.
-/// In this case, we are looking for comparisons that look like
-/// a check for a lossy truncation.
-/// Folds:
-/// icmp SrcPred (x & Mask), x to icmp DstPred x, Mask
-/// Where Mask is some pattern that produces all-ones in low bits:
-/// (-1 >> y)
-/// ((-1 << y) >> y) <- non-canonical, has extra uses
-/// ~(-1 << y)
-/// ((1 << y) + (-1)) <- non-canonical, has extra uses
-/// The Mask can be a constant, too.
-/// For some predicates, the operands are commutative.
-/// For others, x can only be on a specific side.
-static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate SrcPred;
- Value *X, *M, *Y;
- auto m_VariableMask = m_CombineOr(
- m_CombineOr(m_Not(m_Shl(m_AllOnes(), m_Value())),
- m_Add(m_Shl(m_One(), m_Value()), m_AllOnes())),
- m_CombineOr(m_LShr(m_AllOnes(), m_Value()),
- m_LShr(m_Shl(m_AllOnes(), m_Value(Y)), m_Deferred(Y))));
- auto m_Mask = m_CombineOr(m_VariableMask, m_LowBitMask());
- if (!match(&I, m_c_ICmp(SrcPred,
- m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
- m_Deferred(X))))
- return nullptr;
-
- ICmpInst::Predicate DstPred;
- switch (SrcPred) {
- case ICmpInst::Predicate::ICMP_EQ:
- // x & (-1 >> y) == x -> x u<= (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_ULE;
- break;
- case ICmpInst::Predicate::ICMP_NE:
- // x & (-1 >> y) != x -> x u> (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_UGT;
- break;
- case ICmpInst::Predicate::ICMP_ULT:
- // x & (-1 >> y) u< x -> x u> (-1 >> y)
- // x u> x & (-1 >> y) -> x u> (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_UGT;
- break;
- case ICmpInst::Predicate::ICMP_UGE:
- // x & (-1 >> y) u>= x -> x u<= (-1 >> y)
- // x u<= x & (-1 >> y) -> x u<= (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_ULE;
- break;
- case ICmpInst::Predicate::ICMP_SLT:
- // x & (-1 >> y) s< x -> x s> (-1 >> y)
- // x s> x & (-1 >> y) -> x s> (-1 >> y)
-    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
- return nullptr;
- if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
- return nullptr;
- DstPred = ICmpInst::Predicate::ICMP_SGT;
- break;
- case ICmpInst::Predicate::ICMP_SGE:
- // x & (-1 >> y) s>= x -> x s<= (-1 >> y)
- // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
-    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
- return nullptr;
- if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
- return nullptr;
- DstPred = ICmpInst::Predicate::ICMP_SLE;
- break;
- case ICmpInst::Predicate::ICMP_SGT:
- case ICmpInst::Predicate::ICMP_SLE:
- return nullptr;
- case ICmpInst::Predicate::ICMP_UGT:
- case ICmpInst::Predicate::ICMP_ULE:
- llvm_unreachable("Instsimplify took care of commut. variant");
- break;
- default:
- llvm_unreachable("All possible folds are handled.");
- }
-
- // The mask value may be a vector constant that has undefined elements. But it
- // may not be safe to propagate those undefs into the new compare, so replace
- // those elements by copying an existing, defined, and safe scalar constant.
- Type *OpTy = M->getType();
- auto *VecC = dyn_cast<Constant>(M);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Constant *RHSC = dyn_cast<Constant>(Op1);
+ Instruction *LHSI = dyn_cast<Instruction>(Op0);
+ if (!RHSC || !LHSI)
+ return nullptr;
+
+ switch (LHSI->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+ if (RHSC->isNullValue() &&
+ cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
+ case Instruction::PHI:
+ // Only fold icmp into the PHI if the phi and icmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
+ return NV;
+ break;
+ case Instruction::Select: {
+ // If either operand of the select is a constant, we can fold the
+ // comparison into the select arms, which will cause one to be
+ // constant folded and the select turned into a bitwise or.
+ Value *Op1 = nullptr, *Op2 = nullptr;
+ ConstantInt *CI = nullptr;
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+ Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op1);
+ }
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+ Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op2);
+ }
+
+ // We only want to perform this transformation if it will not lead to
+ // additional code. This is true if either both sides of the select
+ // fold to a constant (in which case the icmp is replaced with a select
+ // which will usually simplify) or this is the only user of the
+ // select (in which case we are trading a select+icmp for a simpler
+ // select+icmp) or all uses of the select can be replaced based on
+ // dominance information ("Global cases").
+ bool Transform = false;
+ if (Op1 && Op2)
+ Transform = true;
+ else if (Op1 || Op2) {
+ // Local case
+ if (LHSI->hasOneUse())
+ Transform = true;
+ // Global cases
+ else if (CI && !CI->isZero())
+ // When Op1 is constant try replacing select with second operand.
+ // Otherwise Op2 is constant and try replacing select with first
+ // operand.
+ Transform =
+ replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
+ }
+ if (Transform) {
+ if (!Op1)
+ Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
+ I.getName());
+ if (!Op2)
+ Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
+ I.getName());
+ return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+ }
+ break;
+ }
+ case Instruction::IntToPtr:
+ // icmp pred inttoptr(X), null -> icmp pred X, 0
+ if (RHSC->isNullValue() &&
+ DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
+
+ case Instruction::Load:
+ // Try to optimize things like "A[i] > 4" to index computations.
+ if (GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !cast<LoadInst>(LHSI)->isVolatile())
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ return Res;
+ }
+ break;
+ }
+
+ return nullptr;
+}
+
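The Select case rests on the compare distributing over the select arms: `(cond ? C1 : C2) pred RHSC` is the same as `cond ? (C1 pred RHSC) : (C2 pred RHSC)`. A minimal illustration with arbitrarily chosen constants:

```cpp
#include <cassert>

int main() {
  const int C1 = 7, C2 = -3, RHSC = 2; // arbitrary constants
  for (int cond = 0; cond <= 1; ++cond) {
    bool original = (cond ? C1 : C2) < RHSC;
    bool distributed = cond ? (C1 < RHSC) : (C2 < RHSC);
    assert(original == distributed);
  }
  return 0;
}
```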
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy truncation.
+/// Folds:
+/// icmp SrcPred (x & Mask), x to icmp DstPred x, Mask
+/// Where Mask is some pattern that produces all-ones in low bits:
+/// (-1 >> y)
+/// ((-1 << y) >> y) <- non-canonical, has extra uses
+/// ~(-1 << y)
+/// ((1 << y) + (-1)) <- non-canonical, has extra uses
+/// The Mask can be a constant, too.
+/// For some predicates, the operands are commutative.
+/// For others, x can only be on a specific side.
+static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X, *M, *Y;
+ auto m_VariableMask = m_CombineOr(
+ m_CombineOr(m_Not(m_Shl(m_AllOnes(), m_Value())),
+ m_Add(m_Shl(m_One(), m_Value()), m_AllOnes())),
+ m_CombineOr(m_LShr(m_AllOnes(), m_Value()),
+ m_LShr(m_Shl(m_AllOnes(), m_Value(Y)), m_Deferred(Y))));
+ auto m_Mask = m_CombineOr(m_VariableMask, m_LowBitMask());
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
+ m_Deferred(X))))
+ return nullptr;
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // x & (-1 >> y) == x -> x u<= (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // x & (-1 >> y) != x -> x u> (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_ULT:
+ // x & (-1 >> y) u< x -> x u> (-1 >> y)
+ // x u> x & (-1 >> y) -> x u> (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_UGE:
+ // x & (-1 >> y) u>= x -> x u<= (-1 >> y)
+ // x u<= x & (-1 >> y) -> x u<= (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_SLT:
+ // x & (-1 >> y) s< x -> x s> (-1 >> y)
+ // x s> x & (-1 >> y) -> x s> (-1 >> y)
+    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
+ return nullptr;
+ if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
+ return nullptr;
+ DstPred = ICmpInst::Predicate::ICMP_SGT;
+ break;
+ case ICmpInst::Predicate::ICMP_SGE:
+ // x & (-1 >> y) s>= x -> x s<= (-1 >> y)
+ // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
+    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
+ return nullptr;
+ if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
+ return nullptr;
+ DstPred = ICmpInst::Predicate::ICMP_SLE;
+ break;
+ case ICmpInst::Predicate::ICMP_SGT:
+ case ICmpInst::Predicate::ICMP_SLE:
+ return nullptr;
+ case ICmpInst::Predicate::ICMP_UGT:
+ case ICmpInst::Predicate::ICMP_ULE:
+ llvm_unreachable("Instsimplify took care of commut. variant");
+ break;
+ default:
+ llvm_unreachable("All possible folds are handled.");
+ }
+
+ // The mask value may be a vector constant that has undefined elements. But it
+ // may not be safe to propagate those undefs into the new compare, so replace
+ // those elements by copying an existing, defined, and safe scalar constant.
+ Type *OpTy = M->getType();
+ auto *VecC = dyn_cast<Constant>(M);
auto *OpVTy = dyn_cast<FixedVectorType>(OpTy);
if (OpVTy && VecC && VecC->containsUndefOrPoisonElement()) {
- Constant *SafeReplacementConstant = nullptr;
- for (unsigned i = 0, e = OpVTy->getNumElements(); i != e; ++i) {
- if (!isa<UndefValue>(VecC->getAggregateElement(i))) {
- SafeReplacementConstant = VecC->getAggregateElement(i);
- break;
- }
- }
- assert(SafeReplacementConstant && "Failed to find undef replacement");
- M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant);
- }
-
- return Builder.CreateICmp(DstPred, X, M);
-}
-
-/// Some comparisons can be simplified.
-/// In this case, we are looking for comparisons that look like
-/// a check for a lossy signed truncation.
-/// Folds: (MaskedBits is a constant.)
-/// ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
-/// Into:
-/// (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
-/// Where KeptBits = bitwidth(%x) - MaskedBits
-static Value *
-foldICmpWithTruncSignExtendedVal(ICmpInst &I,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate SrcPred;
- Value *X;
- const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
- // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
- if (!match(&I, m_c_ICmp(SrcPred,
- m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)),
- m_APInt(C1))),
- m_Deferred(X))))
- return nullptr;
-
- // Potential handling of non-splats: for each element:
- // * if both are undef, replace with constant 0.
- // Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
-  // * if both are not undef, and are different, bail out.
- // * else, only one is undef, then pick the non-undef one.
-
- // The shift amount must be equal.
- if (*C0 != *C1)
- return nullptr;
- const APInt &MaskedBits = *C0;
- assert(MaskedBits != 0 && "shift by zero should be folded away already.");
-
- ICmpInst::Predicate DstPred;
- switch (SrcPred) {
- case ICmpInst::Predicate::ICMP_EQ:
- // ((%x << MaskedBits) a>> MaskedBits) == %x
- // =>
- // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
- DstPred = ICmpInst::Predicate::ICMP_ULT;
- break;
- case ICmpInst::Predicate::ICMP_NE:
- // ((%x << MaskedBits) a>> MaskedBits) != %x
- // =>
- // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
- DstPred = ICmpInst::Predicate::ICMP_UGE;
- break;
- // FIXME: are more folds possible?
- default:
- return nullptr;
- }
-
- auto *XType = X->getType();
- const unsigned XBitWidth = XType->getScalarSizeInBits();
- const APInt BitWidth = APInt(XBitWidth, XBitWidth);
- assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");
-
- // KeptBits = bitwidth(%x) - MaskedBits
- const APInt KeptBits = BitWidth - MaskedBits;
- assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");
- // ICmpCst = (1 << KeptBits)
- const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
- assert(ICmpCst.isPowerOf2());
- // AddCst = (1 << (KeptBits-1))
- const APInt AddCst = ICmpCst.lshr(1);
- assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());
-
- // T0 = add %x, AddCst
- Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
- // T1 = T0 DstPred ICmpCst
- Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst));
-
- return T1;
-}
-
-// Given pattern:
-// icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
-// we should move shifts to the same hand of 'and', i.e. rewrite as
-// icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
-// We are only interested in opposite logical shifts here.
-// One of the shifts can be truncated.
-// If we can, we want to end up creating 'lshr' shift.
-static Value *
-foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
- InstCombiner::BuilderTy &Builder) {
- if (!I.isEquality() || !match(I.getOperand(1), m_Zero()) ||
- !I.getOperand(0)->hasOneUse())
- return nullptr;
-
- auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value());
-
- // Look for an 'and' of two logical shifts, one of which may be truncated.
- // We use m_TruncOrSelf() on the RHS to correctly handle commutative case.
- Instruction *XShift, *MaybeTruncation, *YShift;
- if (!match(
- I.getOperand(0),
- m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)),
- m_CombineAnd(m_TruncOrSelf(m_CombineAnd(
- m_AnyLogicalShift, m_Instruction(YShift))),
- m_Instruction(MaybeTruncation)))))
- return nullptr;
-
- // We potentially looked past 'trunc', but only when matching YShift,
- // therefore YShift must have the widest type.
- Instruction *WidestShift = YShift;
- // Therefore XShift must have the shallowest type.
- // Or they both have identical types if there was no truncation.
- Instruction *NarrowestShift = XShift;
-
- Type *WidestTy = WidestShift->getType();
- Type *NarrowestTy = NarrowestShift->getType();
- assert(NarrowestTy == I.getOperand(0)->getType() &&
- "We did not look past any shifts while matching XShift though.");
- bool HadTrunc = WidestTy != I.getOperand(0)->getType();
-
- // If YShift is a 'lshr', swap the shifts around.
- if (match(YShift, m_LShr(m_Value(), m_Value())))
- std::swap(XShift, YShift);
-
- // The shifts must be in opposite directions.
- auto XShiftOpcode = XShift->getOpcode();
- if (XShiftOpcode == YShift->getOpcode())
- return nullptr; // Do not care about same-direction shifts here.
-
- Value *X, *XShAmt, *Y, *YShAmt;
- match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt))));
- match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt))));
-
- // If one of the values being shifted is a constant, then we will end with
- // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not,
- // however, we will need to ensure that we won't increase instruction count.
- if (!isa<Constant>(X) && !isa<Constant>(Y)) {
- // At least one of the hands of the 'and' should be one-use shift.
- if (!match(I.getOperand(0),
- m_c_And(m_OneUse(m_AnyLogicalShift), m_Value())))
- return nullptr;
- if (HadTrunc) {
- // Due to the 'trunc', we will need to widen X. For that either the old
- // 'trunc' or the shift amt in the non-truncated shift should be one-use.
- if (!MaybeTruncation->hasOneUse() &&
- !NarrowestShift->getOperand(1)->hasOneUse())
- return nullptr;
- }
- }
-
- // We have two shift amounts from two different shifts. The types of those
- // shift amounts may not match. If that's the case let's bailout now.
-  // shift amounts may not match. If that's the case, let's bail out now.
- return nullptr;
-
- // As input, we have the following pattern:
- // icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
- // We want to rewrite that as:
- // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
- // While we know that originally (Q+K) would not overflow
- // (because 2 * (N-1) u<= iN -1), we have looked past extensions of
-  // shift amounts, so it may now overflow in the smaller bitwidth.
- // To ensure that does not happen, we need to ensure that the total maximal
- // shift amount is still representable in that smaller bit width.
- unsigned MaximalPossibleTotalShiftAmount =
- (WidestTy->getScalarSizeInBits() - 1) +
- (NarrowestTy->getScalarSizeInBits() - 1);
- APInt MaximalRepresentableShiftAmount =
- APInt::getAllOnesValue(XShAmt->getType()->getScalarSizeInBits());
- if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
- return nullptr;
-
- // Can we fold (XShAmt+YShAmt) ?
- auto *NewShAmt = dyn_cast_or_null<Constant>(
- SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false,
- /*isNUW=*/false, SQ.getWithInstruction(&I)));
- if (!NewShAmt)
- return nullptr;
- NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy);
- unsigned WidestBitWidth = WidestTy->getScalarSizeInBits();
-
- // Is the new shift amount smaller than the bit width?
- // FIXME: could also rely on ConstantRange.
- if (!match(NewShAmt,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
- APInt(WidestBitWidth, WidestBitWidth))))
- return nullptr;
-
- // An extra legality check is needed if we had trunc-of-lshr.
- if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) {
- auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ,
- WidestShift]() {
- // It isn't obvious whether it's worth it to analyze non-constants here.
- // Also, let's basically give up on non-splat cases, pessimizing vectors.
- // If *any* of these preconditions matches we can perform the fold.
- Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy()
- ? NewShAmt->getSplatValue()
- : NewShAmt;
- // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold.
- if (NewShAmtSplat &&
- (NewShAmtSplat->isNullValue() ||
- NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1))
- return true;
- // We consider *min* leading zeros so a single outlier
- // blocks the transform as opposed to allowing it.
- if (auto *C = dyn_cast<Constant>(NarrowestShift->getOperand(0))) {
- KnownBits Known = computeKnownBits(C, SQ.DL);
- unsigned MinLeadZero = Known.countMinLeadingZeros();
- // If the value being shifted has at most lowest bit set we can fold.
- unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
- if (MaxActiveBits <= 1)
- return true;
- // Precondition: NewShAmt u<= countLeadingZeros(C)
- if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero))
- return true;
- }
- if (auto *C = dyn_cast<Constant>(WidestShift->getOperand(0))) {
- KnownBits Known = computeKnownBits(C, SQ.DL);
- unsigned MinLeadZero = Known.countMinLeadingZeros();
- // If the value being shifted has at most lowest bit set we can fold.
- unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
- if (MaxActiveBits <= 1)
- return true;
- // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C)
- if (NewShAmtSplat) {
- APInt AdjNewShAmt =
- (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger();
- if (AdjNewShAmt.ule(MinLeadZero))
- return true;
- }
- }
- return false; // Can't tell if it's ok.
- };
- if (!CanFold())
- return nullptr;
- }
-
- // All good, we can do this fold.
- X = Builder.CreateZExt(X, WidestTy);
- Y = Builder.CreateZExt(Y, WidestTy);
- // The shift is the same that was for X.
- Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr
- ? Builder.CreateLShr(X, NewShAmt)
- : Builder.CreateShl(X, NewShAmt);
- Value *T1 = Builder.CreateAnd(T0, Y);
- return Builder.CreateICmp(I.getPredicate(), T1,
- Constant::getNullValue(WidestTy));
-}
-
-/// Fold
-/// (-1 u/ x) u< y
-/// ((x * y) u/ x) != y
-/// to
-/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit
-/// Note that the comparison is commutative, while inverted (u>=, ==) predicate
-/// will mean that we are looking for the opposite answer.
+ Constant *SafeReplacementConstant = nullptr;
+ for (unsigned i = 0, e = OpVTy->getNumElements(); i != e; ++i) {
+ if (!isa<UndefValue>(VecC->getAggregateElement(i))) {
+ SafeReplacementConstant = VecC->getAggregateElement(i);
+ break;
+ }
+ }
+ assert(SafeReplacementConstant && "Failed to find undef replacement");
+ M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant);
+ }
+
+ return Builder.CreateICmp(DstPred, X, M);
+}
+
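Intuitively, AND-ing with a mask that is all-ones in the low bits can only keep the value or clear its high bits, so the result equals x exactly when x already fits under the mask, and is strictly smaller exactly when it does not. A standalone exhaustive check of the eq and ult cases on i8, with masks generated as (-1 >> y):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Y = 0; Y < 8; ++Y) {
    uint8_t m = static_cast<uint8_t>(0xffu >> Y); // all-ones in the low bits
    for (unsigned X = 0; X <= 0xff; ++X) {
      uint8_t x = static_cast<uint8_t>(X);
      // x & (-1 >> y) == x   <=>   x u<= (-1 >> y)
      assert(((x & m) == x) == (x <= m));
      // x & (-1 >> y) u< x   <=>   x u> (-1 >> y)
      assert(((x & m) < x) == (x > m));
    }
  }
  return 0;
}
```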
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy signed truncation.
+/// Folds: (MaskedBits is a constant.)
+/// ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
+/// Into:
+/// (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
+/// Where KeptBits = bitwidth(%x) - MaskedBits
+static Value *
+foldICmpWithTruncSignExtendedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X;
+ const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
+ // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)),
+ m_APInt(C1))),
+ m_Deferred(X))))
+ return nullptr;
+
+ // Potential handling of non-splats: for each element:
+ // * if both are undef, replace with constant 0.
+ // Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
+  // * if both are not undef, and are different, bail out.
+ // * else, only one is undef, then pick the non-undef one.
+
+ // The shift amount must be equal.
+ if (*C0 != *C1)
+ return nullptr;
+ const APInt &MaskedBits = *C0;
+ assert(MaskedBits != 0 && "shift by zero should be folded away already.");
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // ((%x << MaskedBits) a>> MaskedBits) == %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_ULT;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // ((%x << MaskedBits) a>> MaskedBits) != %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_UGE;
+ break;
+ // FIXME: are more folds possible?
+ default:
+ return nullptr;
+ }
+
+ auto *XType = X->getType();
+ const unsigned XBitWidth = XType->getScalarSizeInBits();
+ const APInt BitWidth = APInt(XBitWidth, XBitWidth);
+ assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");
+
+ // KeptBits = bitwidth(%x) - MaskedBits
+ const APInt KeptBits = BitWidth - MaskedBits;
+ assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");
+ // ICmpCst = (1 << KeptBits)
+ const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
+ assert(ICmpCst.isPowerOf2());
+ // AddCst = (1 << (KeptBits-1))
+ const APInt AddCst = ICmpCst.lshr(1);
+ assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());
+
+ // T0 = add %x, AddCst
+ Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
+ // T1 = T0 DstPred ICmpCst
+ Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst));
+
+ return T1;
+}
+
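The rewrite checks whether x survives the narrowing round-trip by shifting its range: x sign-extends back to itself exactly when it lies in [-2^(KeptBits-1), 2^(KeptBits-1)), and adding 2^(KeptBits-1) maps that interval onto [0, 2^KeptBits), which a single unsigned compare can test. A standalone i8 sketch with MaskedBits = 3 (an arbitrary choice), emulating the shl/ashr pair by keeping the low KeptBits and sign-extending:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const unsigned MaskedBits = 3, KeptBits = 8 - MaskedBits; // KeptBits == 5
  for (unsigned X = 0; X <= 0xff; ++X) {
    uint8_t x = static_cast<uint8_t>(X);
    // Emulate ((x << 3) a>> 3) on i8: keep the low KeptBits, then sign-extend.
    uint8_t low = x & ((1u << KeptBits) - 1);
    uint8_t sext = (low & (1u << (KeptBits - 1)))
                       ? static_cast<uint8_t>(low | ~((1u << KeptBits) - 1))
                       : low;
    bool original = (sext == x);
    // (add x, (1 << (KeptBits-1))) u< (1 << KeptBits)
    bool folded =
        static_cast<uint8_t>(x + (1u << (KeptBits - 1))) < (1u << KeptBits);
    assert(original == folded);
  }
  return 0;
}
```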
+// Given pattern:
+// icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
+// we should move shifts to the same hand of 'and', i.e. rewrite as
+// icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
+// We are only interested in opposite logical shifts here.
+// One of the shifts can be truncated.
+// If we can, we want to end up creating 'lshr' shift.
+static Value *
+foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
+ InstCombiner::BuilderTy &Builder) {
+ if (!I.isEquality() || !match(I.getOperand(1), m_Zero()) ||
+ !I.getOperand(0)->hasOneUse())
+ return nullptr;
+
+ auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value());
+
+ // Look for an 'and' of two logical shifts, one of which may be truncated.
+ // We use m_TruncOrSelf() on the RHS to correctly handle commutative case.
+ Instruction *XShift, *MaybeTruncation, *YShift;
+ if (!match(
+ I.getOperand(0),
+ m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)),
+ m_CombineAnd(m_TruncOrSelf(m_CombineAnd(
+ m_AnyLogicalShift, m_Instruction(YShift))),
+ m_Instruction(MaybeTruncation)))))
+ return nullptr;
+
+ // We potentially looked past 'trunc', but only when matching YShift,
+ // therefore YShift must have the widest type.
+ Instruction *WidestShift = YShift;
+ // Therefore XShift must have the shallowest type.
+ // Or they both have identical types if there was no truncation.
+ Instruction *NarrowestShift = XShift;
+
+ Type *WidestTy = WidestShift->getType();
+ Type *NarrowestTy = NarrowestShift->getType();
+ assert(NarrowestTy == I.getOperand(0)->getType() &&
+ "We did not look past any shifts while matching XShift though.");
+ bool HadTrunc = WidestTy != I.getOperand(0)->getType();
+
+ // If YShift is a 'lshr', swap the shifts around.
+ if (match(YShift, m_LShr(m_Value(), m_Value())))
+ std::swap(XShift, YShift);
+
+ // The shifts must be in opposite directions.
+ auto XShiftOpcode = XShift->getOpcode();
+ if (XShiftOpcode == YShift->getOpcode())
+ return nullptr; // Do not care about same-direction shifts here.
+
+ Value *X, *XShAmt, *Y, *YShAmt;
+ match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt))));
+ match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt))));
+
+ // If one of the values being shifted is a constant, then we will end with
+ // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not,
+ // however, we will need to ensure that we won't increase instruction count.
+ if (!isa<Constant>(X) && !isa<Constant>(Y)) {
+ // At least one of the hands of the 'and' should be one-use shift.
+ if (!match(I.getOperand(0),
+ m_c_And(m_OneUse(m_AnyLogicalShift), m_Value())))
+ return nullptr;
+ if (HadTrunc) {
+ // Due to the 'trunc', we will need to widen X. For that either the old
+ // 'trunc' or the shift amt in the non-truncated shift should be one-use.
+ if (!MaybeTruncation->hasOneUse() &&
+ !NarrowestShift->getOperand(1)->hasOneUse())
+ return nullptr;
+ }
+ }
+
+ // We have two shift amounts from two different shifts. The types of those
+ // shift amounts may not match. If that's the case let's bailout now.
+  // shift amounts may not match. If that's the case, let's bail out now.
+ return nullptr;
+
+ // As input, we have the following pattern:
+ // icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
+ // We want to rewrite that as:
+ // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
+ // While we know that originally (Q+K) would not overflow
+ // (because 2 * (N-1) u<= iN -1), we have looked past extensions of
+  // shift amounts, so it may now overflow in the smaller bitwidth.
+ // To ensure that does not happen, we need to ensure that the total maximal
+ // shift amount is still representable in that smaller bit width.
+ unsigned MaximalPossibleTotalShiftAmount =
+ (WidestTy->getScalarSizeInBits() - 1) +
+ (NarrowestTy->getScalarSizeInBits() - 1);
+ APInt MaximalRepresentableShiftAmount =
+ APInt::getAllOnesValue(XShAmt->getType()->getScalarSizeInBits());
+ if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
+ return nullptr;
+
+ // Can we fold (XShAmt+YShAmt) ?
+ auto *NewShAmt = dyn_cast_or_null<Constant>(
+ SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false,
+ /*isNUW=*/false, SQ.getWithInstruction(&I)));
+ if (!NewShAmt)
+ return nullptr;
+ NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy);
+ unsigned WidestBitWidth = WidestTy->getScalarSizeInBits();
+
+ // Is the new shift amount smaller than the bit width?
+ // FIXME: could also rely on ConstantRange.
+ if (!match(NewShAmt,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
+ APInt(WidestBitWidth, WidestBitWidth))))
+ return nullptr;
+
+ // An extra legality check is needed if we had trunc-of-lshr.
+ if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) {
+ auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ,
+ WidestShift]() {
+ // It isn't obvious whether it's worth it to analyze non-constants here.
+ // Also, let's basically give up on non-splat cases, pessimizing vectors.
+ // If *any* of these preconditions matches we can perform the fold.
+ Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy()
+ ? NewShAmt->getSplatValue()
+ : NewShAmt;
+ // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold.
+ if (NewShAmtSplat &&
+ (NewShAmtSplat->isNullValue() ||
+ NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1))
+ return true;
+ // We consider *min* leading zeros so a single outlier
+ // blocks the transform as opposed to allowing it.
+ if (auto *C = dyn_cast<Constant>(NarrowestShift->getOperand(0))) {
+ KnownBits Known = computeKnownBits(C, SQ.DL);
+ unsigned MinLeadZero = Known.countMinLeadingZeros();
+ // If the value being shifted has at most lowest bit set we can fold.
+ unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
+ if (MaxActiveBits <= 1)
+ return true;
+ // Precondition: NewShAmt u<= countLeadingZeros(C)
+ if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero))
+ return true;
+ }
+ if (auto *C = dyn_cast<Constant>(WidestShift->getOperand(0))) {
+ KnownBits Known = computeKnownBits(C, SQ.DL);
+ unsigned MinLeadZero = Known.countMinLeadingZeros();
+ // If the value being shifted has at most lowest bit set we can fold.
+ unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
+ if (MaxActiveBits <= 1)
+ return true;
+ // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C)
+ if (NewShAmtSplat) {
+ APInt AdjNewShAmt =
+ (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger();
+ if (AdjNewShAmt.ule(MinLeadZero))
+ return true;
+ }
+ }
+ return false; // Can't tell if it's ok.
+ };
+ if (!CanFold())
+ return nullptr;
+ }
+
+ // All good, we can do this fold.
+ X = Builder.CreateZExt(X, WidestTy);
+ Y = Builder.CreateZExt(Y, WidestTy);
+ // The shift is the same that was for X.
+ Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr
+ ? Builder.CreateLShr(X, NewShAmt)
+ : Builder.CreateShl(X, NewShAmt);
+ Value *T1 = Builder.CreateAnd(T0, Y);
+ return Builder.CreateICmp(I.getPredicate(), T1,
+ Constant::getNullValue(WidestTy));
+}
+
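The core identity is that the and is nonzero exactly when some bit j of y lines up with bit j+Q+K of x, so the pair of opposite shifts can be collapsed onto one operand as long as Q+K still fits below the bit width. A brute-force standalone check on i8 for all shift amounts with Q+K u< 8 (shl is emulated by truncating back to 8 bits):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Q = 0; Q < 8; ++Q)
    for (unsigned K = 0; Q + K < 8; ++K)
      for (unsigned X = 0; X <= 0xff; ++X)
        for (unsigned Y = 0; Y <= 0xff; ++Y) {
          uint8_t x = static_cast<uint8_t>(X), y = static_cast<uint8_t>(Y);
          // ((x u>> Q) & (y << K)) == 0  <=>  ((x u>> (Q+K)) & y) == 0
          bool original = (static_cast<uint8_t>(x >> Q) &
                           static_cast<uint8_t>(y << K)) == 0;
          bool folded = (static_cast<uint8_t>(x >> (Q + K)) & y) == 0;
          assert(original == folded);
        }
  return 0;
}
```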
+/// Fold
+/// (-1 u/ x) u< y
+/// ((x * y) u/ x) != y
+/// to
+/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit
+/// Note that the comparison is commutative, while inverted (u>=, ==) predicate
+/// will mean that we are looking for the opposite answer.
Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
- ICmpInst::Predicate Pred;
- Value *X, *Y;
- Instruction *Mul;
- bool NeedNegation;
- // Look for: (-1 u/ x) u</u>= y
- if (!I.isEquality() &&
- match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))),
- m_Value(Y)))) {
- Mul = nullptr;
-
- // Are we checking that overflow does not happen, or does happen?
- switch (Pred) {
- case ICmpInst::Predicate::ICMP_ULT:
- NeedNegation = false;
- break; // OK
- case ICmpInst::Predicate::ICMP_UGE:
- NeedNegation = true;
- break; // OK
- default:
- return nullptr; // Wrong predicate.
- }
- } else // Look for: ((x * y) u/ x) !=/== y
- if (I.isEquality() &&
- match(&I, m_c_ICmp(Pred, m_Value(Y),
- m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y),
- m_Value(X)),
- m_Instruction(Mul)),
- m_Deferred(X)))))) {
- NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ;
- } else
- return nullptr;
-
- BuilderTy::InsertPointGuard Guard(Builder);
- // If the pattern included (x * y), we'll want to insert new instructions
- // right before that original multiplication so that we can replace it.
- bool MulHadOtherUses = Mul && !Mul->hasOneUse();
- if (MulHadOtherUses)
- Builder.SetInsertPoint(Mul);
-
- Function *F = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::umul_with_overflow, X->getType());
- CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul");
-
- // If the multiplication was used elsewhere, to ensure that we don't leave
- // "duplicate" instructions, replace uses of that original multiplication
- // with the multiplication result from the with.overflow intrinsic.
- if (MulHadOtherUses)
- replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val"));
-
- Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov");
- if (NeedNegation) // This technically increases instruction count.
- Res = Builder.CreateNot(Res, "umul.not.ov");
-
- // If we replaced the mul, erase it. Do this after all uses of Builder,
- // as the mul is used as insertion point.
- if (MulHadOtherUses)
- eraseInstFromFunction(*Mul);
-
- return Res;
-}
-
+ ICmpInst::Predicate Pred;
+ Value *X, *Y;
+ Instruction *Mul;
+ bool NeedNegation;
+ // Look for: (-1 u/ x) u</u>= y
+ if (!I.isEquality() &&
+ match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))),
+ m_Value(Y)))) {
+ Mul = nullptr;
+
+ // Are we checking that overflow does not happen, or does happen?
+ switch (Pred) {
+ case ICmpInst::Predicate::ICMP_ULT:
+ NeedNegation = false;
+ break; // OK
+ case ICmpInst::Predicate::ICMP_UGE:
+ NeedNegation = true;
+ break; // OK
+ default:
+ return nullptr; // Wrong predicate.
+ }
+ } else // Look for: ((x * y) u/ x) !=/== y
+ if (I.isEquality() &&
+ match(&I, m_c_ICmp(Pred, m_Value(Y),
+ m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y),
+ m_Value(X)),
+ m_Instruction(Mul)),
+ m_Deferred(X)))))) {
+ NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ;
+ } else
+ return nullptr;
+
+ BuilderTy::InsertPointGuard Guard(Builder);
+ // If the pattern included (x * y), we'll want to insert new instructions
+ // right before that original multiplication so that we can replace it.
+ bool MulHadOtherUses = Mul && !Mul->hasOneUse();
+ if (MulHadOtherUses)
+ Builder.SetInsertPoint(Mul);
+
+ Function *F = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::umul_with_overflow, X->getType());
+ CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul");
+
+ // If the multiplication was used elsewhere, to ensure that we don't leave
+ // "duplicate" instructions, replace uses of that original multiplication
+ // with the multiplication result from the with.overflow intrinsic.
+ if (MulHadOtherUses)
+ replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val"));
+
+ Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov");
+ if (NeedNegation) // This technically increases instruction count.
+ Res = Builder.CreateNot(Res, "umul.not.ov");
+
+ // If we replaced the mul, erase it. Do this after all uses of Builder,
+ // as the mul is used as insertion point.
+ if (MulHadOtherUses)
+ eraseInstFromFunction(*Mul);
+
+ return Res;
+}
+
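Both source patterns are just different spellings of "x * y overflows unsigned", which is exactly what the extracted overflow bit of umul.with.overflow reports. A standalone exhaustive i8 check (x == 0 is excluded, since both original patterns already divide by x):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 1; x <= 0xff; ++x)
    for (unsigned y = 0; y <= 0xff; ++y) {
      bool overflows = x * y > 0xff; // would i8 x * y wrap?
      // (-1 u/ x) u< y  <=>  overflow
      assert(((0xffu / x) < y) == overflows);
      // ((x * y) u/ x) != y  <=>  overflow  (the udiv sees the wrapped product)
      assert((static_cast<uint8_t>(x * y) / x != y) == overflows);
    }
  return 0;
}
```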
static Instruction *foldICmpXNegX(ICmpInst &I) {
CmpInst::Predicate Pred;
Value *X;
@@ -3729,244 +3729,244 @@ static Instruction *foldICmpXNegX(ICmpInst &I) {
Constant::getNullValue(X->getType()), I.getName());
}
-/// Try to fold icmp (binop), X or icmp X, (binop).
-/// TODO: A large part of this logic is duplicated in InstSimplify's
-/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
-/// duplication.
+/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
const SimplifyQuery &SQ) {
- const SimplifyQuery Q = SQ.getWithInstruction(&I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- // Special logic for binary operators.
- BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
- BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
- if (!BO0 && !BO1)
- return nullptr;
-
+ const SimplifyQuery Q = SQ.getWithInstruction(&I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Special logic for binary operators.
+ BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
+ BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
+ if (!BO0 && !BO1)
+ return nullptr;
+
if (Instruction *NewICmp = foldICmpXNegX(I))
return NewICmp;
- const CmpInst::Predicate Pred = I.getPredicate();
- Value *X;
-
- // Convert add-with-unsigned-overflow comparisons into a 'not' with compare.
- // (Op1 + X) u</u>= Op1 --> ~Op1 u</u>= X
- if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) &&
- (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
- return new ICmpInst(Pred, Builder.CreateNot(Op1), X);
- // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0
- if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) &&
- (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
- return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
-
- bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
- if (BO0 && isa<OverflowingBinaryOperator>(BO0))
- NoOp0WrapProblem =
- ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
- if (BO1 && isa<OverflowingBinaryOperator>(BO1))
- NoOp1WrapProblem =
- ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
-
- // Analyze the case when either Op0 or Op1 is an add instruction.
- // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Add) {
- A = BO0->getOperand(0);
- B = BO0->getOperand(1);
- }
- if (BO1 && BO1->getOpcode() == Instruction::Add) {
- C = BO1->getOperand(0);
- D = BO1->getOperand(1);
- }
-
- // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow.
- // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow.
- if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
- return new ICmpInst(Pred, A == Op1 ? B : A,
- Constant::getNullValue(Op1->getType()));
-
- // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow.
- // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow.
- if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
- return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
- C == Op0 ? D : C);
-
- // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow.
- if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
- NoOp1WrapProblem) {
- // Determine Y and Z in the form icmp (X+Y), (X+Z).
- Value *Y, *Z;
- if (A == C) {
- // C + B == C + D -> B == D
- Y = B;
- Z = D;
- } else if (A == D) {
- // D + B == C + D -> B == C
- Y = B;
- Z = C;
- } else if (B == C) {
- // A + C == C + D -> A == D
- Y = A;
- Z = D;
- } else {
- assert(B == D);
- // A + D == C + D -> A == C
- Y = A;
- Z = C;
- }
- return new ICmpInst(Pred, Y, Z);
- }
-
- // icmp slt (A + -1), Op1 -> icmp sle A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
- match(B, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
-
- // icmp sge (A + -1), Op1 -> icmp sgt A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
- match(B, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
-
- // icmp sle (A + 1), Op1 -> icmp slt A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
-
- // icmp sgt (A + 1), Op1 -> icmp sge A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
-
- // icmp sgt Op0, (C + -1) -> icmp sge Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
- match(D, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
-
- // icmp sle Op0, (C + -1) -> icmp slt Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
- match(D, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
-
- // icmp sge Op0, (C + 1) -> icmp sgt Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
-
- // icmp slt Op0, (C + 1) -> icmp sle Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
-
- // TODO: The subtraction-related identities shown below also hold, but
- // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
- // wouldn't happen even if they were implemented.
- //
- // icmp ult (A - 1), Op1 -> icmp ule A, Op1
- // icmp uge (A - 1), Op1 -> icmp ugt A, Op1
- // icmp ugt Op0, (C - 1) -> icmp uge Op0, C
- // icmp ule Op0, (C - 1) -> icmp ult Op0, C
-
-  // icmp ule (A + 1), Op1 -> icmp ult A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
-
-  // icmp ugt (A + 1), Op1 -> icmp uge A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
-
- // icmp uge Op0, (C + 1) -> icmp ugt Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
-
- // icmp ult Op0, (C + 1) -> icmp ule Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
-
- // if C1 has greater magnitude than C2:
- // icmp (A + C1), (C + C2) -> icmp (A + C3), C
- // s.t. C3 = C1 - C2
- //
- // if C2 has greater magnitude than C1:
- // icmp (A + C1), (C + C2) -> icmp A, (C + C3)
- // s.t. C3 = C2 - C1
- if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
- (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
- if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
- if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
- const APInt &AP1 = C1->getValue();
- const APInt &AP2 = C2->getValue();
- if (AP1.isNegative() == AP2.isNegative()) {
- APInt AP1Abs = C1->getValue().abs();
- APInt AP2Abs = C2->getValue().abs();
- if (AP1Abs.uge(AP2Abs)) {
- ConstantInt *C3 = Builder.getInt(AP1 - AP2);
- Value *NewAdd = Builder.CreateNSWAdd(A, C3);
- return new ICmpInst(Pred, NewAdd, C);
- } else {
- ConstantInt *C3 = Builder.getInt(AP2 - AP1);
- Value *NewAdd = Builder.CreateNSWAdd(C, C3);
- return new ICmpInst(Pred, A, NewAdd);
- }
- }
- }
-
- // Analyze the case when either Op0 or Op1 is a sub instruction.
- // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
- A = nullptr;
- B = nullptr;
- C = nullptr;
- D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Sub) {
- A = BO0->getOperand(0);
- B = BO0->getOperand(1);
- }
- if (BO1 && BO1->getOpcode() == Instruction::Sub) {
- C = BO1->getOperand(0);
- D = BO1->getOperand(1);
- }
-
- // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow.
- if (A == Op1 && NoOp0WrapProblem)
- return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
- // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow.
- if (C == Op0 && NoOp1WrapProblem)
- return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
-
- // Convert sub-with-unsigned-overflow comparisons into a comparison of args.
- // (A - B) u>/u<= A --> B u>/u<= A
- if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
- return new ICmpInst(Pred, B, A);
- // C u</u>= (C - D) --> C u</u>= D
- if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
- return new ICmpInst(Pred, C, D);
- // (A - B) u>=/u< A --> B u>/u<= A iff B != 0
- if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) &&
- isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
- return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A);
- // C u<=/u> (C - D) --> C u</u>= D iff D != 0
- if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) &&
- isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
- return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D);
-
- // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow.
- if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem)
- return new ICmpInst(Pred, A, C);
-
- // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow.
- if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem)
- return new ICmpInst(Pred, D, B);
-
- // icmp (0-X) < cst --> X > -cst
- if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
- Value *X;
- if (match(BO0, m_Neg(m_Value(X))))
- if (Constant *RHSC = dyn_cast<Constant>(Op1))
- if (RHSC->isNotMinSignedValue())
- return new ICmpInst(I.getSwappedPredicate(), X,
- ConstantExpr::getNeg(RHSC));
- }
-
+ const CmpInst::Predicate Pred = I.getPredicate();
+ Value *X;
+
+ // Convert add-with-unsigned-overflow comparisons into a 'not' with compare.
+ // (Op1 + X) u</u>= Op1 --> ~Op1 u</u>= X
+ if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) &&
+ (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
+ return new ICmpInst(Pred, Builder.CreateNot(Op1), X);
+ // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0
+ if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) &&
+ (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
+ return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
+
+ bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
+ if (BO0 && isa<OverflowingBinaryOperator>(BO0))
+ NoOp0WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
+ if (BO1 && isa<OverflowingBinaryOperator>(BO1))
+ NoOp1WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
+
+ // Analyze the case when either Op0 or Op1 is an add instruction.
+ // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Add) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Add) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
+
+ // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow.
+ // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow.
+ if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
+ return new ICmpInst(Pred, A == Op1 ? B : A,
+ Constant::getNullValue(Op1->getType()));
+
+ // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow.
+ // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow.
+ if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
+ C == Op0 ? D : C);
+
+ // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow.
+ if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
+ NoOp1WrapProblem) {
+ // Determine Y and Z in the form icmp (X+Y), (X+Z).
+ Value *Y, *Z;
+ if (A == C) {
+ // C + B == C + D -> B == D
+ Y = B;
+ Z = D;
+ } else if (A == D) {
+ // D + B == C + D -> B == C
+ Y = B;
+ Z = C;
+ } else if (B == C) {
+ // A + C == C + D -> A == D
+ Y = A;
+ Z = D;
+ } else {
+ assert(B == D);
+ // A + D == C + D -> A == C
+ Y = A;
+ Z = C;
+ }
+ return new ICmpInst(Pred, Y, Z);
+ }
+
+ // icmp slt (A + -1), Op1 -> icmp sle A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
+
+ // icmp sge (A + -1), Op1 -> icmp sgt A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
+
+ // icmp sle (A + 1), Op1 -> icmp slt A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
+
+ // icmp sgt (A + 1), Op1 -> icmp sge A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
+
+ // icmp sgt Op0, (C + -1) -> icmp sge Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
+
+ // icmp sle Op0, (C + -1) -> icmp slt Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
+
+ // icmp sge Op0, (C + 1) -> icmp sgt Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
+
+ // icmp slt Op0, (C + 1) -> icmp sle Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
+
+ // TODO: The subtraction-related identities shown below also hold, but
+ // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
+ // wouldn't happen even if they were implemented.
+ //
+ // icmp ult (A - 1), Op1 -> icmp ule A, Op1
+ // icmp uge (A - 1), Op1 -> icmp ugt A, Op1
+ // icmp ugt Op0, (C - 1) -> icmp uge Op0, C
+ // icmp ule Op0, (C - 1) -> icmp ult Op0, C
+
+ // icmp ule (A + 1), Op1 -> icmp ult A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
+
+ // icmp ugt (A + 1), Op1 -> icmp uge A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
+
+ // icmp uge Op0, (C + 1) -> icmp ugt Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
+
+ // icmp ult Op0, (C + 1) -> icmp ule Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
+
+ // if C1 has greater magnitude than C2:
+ // icmp (A + C1), (C + C2) -> icmp (A + C3), C
+ // s.t. C3 = C1 - C2
+ //
+ // if C2 has greater magnitude than C1:
+ // icmp (A + C1), (C + C2) -> icmp A, (C + C3)
+ // s.t. C3 = C2 - C1
+ if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
+ (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
+ if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
+ if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
+ const APInt &AP1 = C1->getValue();
+ const APInt &AP2 = C2->getValue();
+ if (AP1.isNegative() == AP2.isNegative()) {
+ APInt AP1Abs = C1->getValue().abs();
+ APInt AP2Abs = C2->getValue().abs();
+ if (AP1Abs.uge(AP2Abs)) {
+ ConstantInt *C3 = Builder.getInt(AP1 - AP2);
+ Value *NewAdd = Builder.CreateNSWAdd(A, C3);
+ return new ICmpInst(Pred, NewAdd, C);
+ } else {
+ ConstantInt *C3 = Builder.getInt(AP2 - AP1);
+ Value *NewAdd = Builder.CreateNSWAdd(C, C3);
+ return new ICmpInst(Pred, A, NewAdd);
+ }
+ }
+ }
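+
+ // Illustrative sketch of the constant-magnitude fold above (hypothetical
+ // IR, not taken from this patch; assumes at least one of the adds has a
+ // single use):
+ //   %l = add nsw i32 %a, 5
+ //   %r = add nsw i32 %c, 2
+ //   %cmp = icmp slt i32 %l, %r
+ // may become
+ //   %l2 = add nsw i32 %a, 3
+ //   %cmp = icmp slt i32 %l2, %c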
+
+ // Analyze the case when either Op0 or Op1 is a sub instruction.
+ // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
+ A = nullptr;
+ B = nullptr;
+ C = nullptr;
+ D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Sub) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Sub) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
+
+ // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow.
+ if (A == Op1 && NoOp0WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
+ // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow.
+ if (C == Op0 && NoOp1WrapProblem)
+ return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+
+ // Convert sub-with-unsigned-overflow comparisons into a comparison of args.
+ // (A - B) u>/u<= A --> B u>/u<= A
+ if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
+ return new ICmpInst(Pred, B, A);
+ // C u</u>= (C - D) --> C u</u>= D
+ if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
+ return new ICmpInst(Pred, C, D);
+ // (A - B) u>=/u< A --> B u>/u<= A iff B != 0
+ if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) &&
+ isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
+ return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A);
+ // C u<=/u> (C - D) --> C u</u>= D iff D != 0
+ if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) &&
+ isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
+ return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D);
+
+ // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow.
+ if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem)
+ return new ICmpInst(Pred, A, C);
+
+ // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow.
+ if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem)
+ return new ICmpInst(Pred, D, B);
+
+ // icmp (0-X) < cst --> X > -cst
+ if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
+ Value *X;
+ if (match(BO0, m_Neg(m_Value(X))))
+ if (Constant *RHSC = dyn_cast<Constant>(Op1))
+ if (RHSC->isNotMinSignedValue())
+ return new ICmpInst(I.getSwappedPredicate(), X,
+ ConstantExpr::getNeg(RHSC));
+ }
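+
+ // Illustrative sketch of the negation fold above (hypothetical IR, not
+ // taken from this patch):
+ //   %n = sub nsw i32 0, %x
+ //   %cmp = icmp slt i32 %n, 7
+ // may become
+ //   %cmp = icmp sgt i32 %x, -7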
+
{
// Try to remove shared constant multiplier from equality comparison:
// X * C == Y * C (with no overflowing/aliasing) --> X == Y
@@ -3980,2296 +3980,2296 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
return new ICmpInst(Pred, X, Y);
}
- BinaryOperator *SRem = nullptr;
- // icmp (srem X, Y), Y
- if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
- SRem = BO0;
- // icmp Y, (srem X, Y)
- else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
- Op0 == BO1->getOperand(1))
- SRem = BO1;
- if (SRem) {
- // We don't check hasOneUse to avoid increasing register pressure because
- // the value we use is the same value this instruction was already using.
- switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
- default:
- break;
- case ICmpInst::ICMP_EQ:
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- case ICmpInst::ICMP_NE:
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
- Constant::getAllOnesValue(SRem->getType()));
- case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
- Constant::getNullValue(SRem->getType()));
- }
- }
-
- if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
- BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
- switch (BO0->getOpcode()) {
- default:
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Xor: {
- if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- const APInt *C;
- if (match(BO0->getOperand(1), m_APInt(C))) {
- // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
- if (C->isSignMask()) {
+ BinaryOperator *SRem = nullptr;
+ // icmp (srem X, Y), Y
+ if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
+ SRem = BO0;
+ // icmp Y, (srem X, Y)
+ else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
+ Op0 == BO1->getOperand(1))
+ SRem = BO1;
+ if (SRem) {
+ // We don't check hasOneUse to avoid increasing register pressure because
+ // the value we use is the same value this instruction was already using.
+ switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_EQ:
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ case ICmpInst::ICMP_NE:
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_SGE:
+ return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
+ Constant::getAllOnesValue(SRem->getType()));
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_SLE:
+ return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
+ Constant::getNullValue(SRem->getType()));
+ }
+ }
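+
+ // Illustrative sketch (hypothetical IR, not taken from this patch): the
+ // remainder's magnitude is strictly smaller than |Y| (and Y == 0 is UB),
+ // so, for example,
+ //   %r = srem i32 %x, %y
+ //   %cmp = icmp eq i32 %r, %y
+ // folds to false.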
+
+ if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
+ BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
+ switch (BO0->getOpcode()) {
+ default:
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Xor: {
+ if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C))) {
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ if (C->isSignMask()) {
ICmpInst::Predicate NewPred = I.getFlippedSignednessPredicate();
- return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
- }
-
- // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
- if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+
+ // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
+ if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
ICmpInst::Predicate NewPred = I.getFlippedSignednessPredicate();
- NewPred = I.getSwappedPredicate(NewPred);
- return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
- }
- }
- break;
- }
- case Instruction::Mul: {
- if (!I.isEquality())
- break;
-
- const APInt *C;
- if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
- !C->isOneValue()) {
- // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
- // Mask = -1 >> count-trailing-zeros(C).
- if (unsigned TZs = C->countTrailingZeros()) {
- Constant *Mask = ConstantInt::get(
- BO0->getType(),
- APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
- Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
- Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(Pred, And1, And2);
- }
- }
- break;
- }
- case Instruction::UDiv:
- case Instruction::LShr:
- if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- case Instruction::SDiv:
- if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- case Instruction::AShr:
- if (!BO0->isExact() || !BO1->isExact())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- case Instruction::Shl: {
- bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
- bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
- if (!NUW && !NSW)
- break;
- if (!NSW && I.isSigned())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
- }
- }
- }
-
- if (BO0) {
- // Transform A & (L - 1) `ult` L --> L != 0
- auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
- auto BitwiseAnd = m_c_And(m_Value(), LSubOne);
-
- if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
- auto *Zero = Constant::getNullValue(BO0->getType());
- return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
- }
- }
-
- if (Value *V = foldUnsignedMultiplicationOverflowCheck(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = foldShiftIntoShiftInAnotherHandOfAndInICmp(I, SQ, Builder))
- return replaceInstUsesWith(I, V);
-
- return nullptr;
-}
-
-/// Fold icmp Pred min|max(X, Y), X.
-static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *Op0 = Cmp.getOperand(0);
- Value *X = Cmp.getOperand(1);
-
- // Canonicalize minimum or maximum operand to LHS of the icmp.
- if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
- match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
- match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
- match(X, m_c_UMax(m_Specific(Op0), m_Value()))) {
- std::swap(Op0, X);
- Pred = Cmp.getSwappedPredicate();
- }
-
- Value *Y;
- if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
- // smin(X, Y) == X --> X s<= Y
- // smin(X, Y) s>= X --> X s<= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE)
- return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
-
- // smin(X, Y) != X --> X s> Y
- // smin(X, Y) s< X --> X s> Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT)
- return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // smin(X, Y) s<= X --> true
- // smin(X, Y) s> X --> false
- return nullptr;
- }
-
- if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
- // smax(X, Y) == X --> X s>= Y
- // smax(X, Y) s<= X --> X s>= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
-
- // smax(X, Y) != X --> X s< Y
- // smax(X, Y) s> X --> X s< Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT)
- return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // smax(X, Y) s>= X --> true
- // smax(X, Y) s< X --> false
- return nullptr;
- }
-
- if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
- // umin(X, Y) == X --> X u<= Y
- // umin(X, Y) u>= X --> X u<= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE)
- return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);
-
- // umin(X, Y) != X --> X u> Y
- // umin(X, Y) u< X --> X u> Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT)
- return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // umin(X, Y) u<= X --> true
- // umin(X, Y) u> X --> false
- return nullptr;
- }
-
- if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
- // umax(X, Y) == X --> X u>= Y
- // umax(X, Y) u<= X --> X u>= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE)
- return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
-
- // umax(X, Y) != X --> X u< Y
- // umax(X, Y) u> X --> X u< Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT)
- return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // umax(X, Y) u>= X --> true
- // umax(X, Y) u< X --> false
- return nullptr;
- }
-
- return nullptr;
-}
-
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+ }
+ break;
+ }
+ case Instruction::Mul: {
+ if (!I.isEquality())
+ break;
+
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
+ !C->isOneValue()) {
+ // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
+ // Mask = -1 >> count-trailing-zeros(C).
+ if (unsigned TZs = C->countTrailingZeros()) {
+ Constant *Mask = ConstantInt::get(
+ BO0->getType(),
+ APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
+ Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
+ Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
+ return new ICmpInst(Pred, And1, And2);
+ }
+ }
+ break;
+ }
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::AShr:
+ if (!BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::Shl: {
+ bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
+ bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
+ if (!NUW && !NSW)
+ break;
+ if (!NSW && I.isSigned())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+ }
+ }
+
+ if (BO0) {
+ // Transform A & (L - 1) `ult` L --> L != 0
+ auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
+ auto BitwiseAnd = m_c_And(m_Value(), LSubOne);
+
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
+ auto *Zero = Constant::getNullValue(BO0->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
+ }
+ }
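+
+ // Illustrative sketch (hypothetical IR, not taken from this patch):
+ //   %m = add i32 %l, -1
+ //   %a = and i32 %x, %m
+ //   %cmp = icmp ult i32 %a, %l
+ // may become
+ //   %cmp = icmp ne i32 %l, 0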
+
+ if (Value *V = foldUnsignedMultiplicationOverflowCheck(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldShiftIntoShiftInAnotherHandOfAndInICmp(I, SQ, Builder))
+ return replaceInstUsesWith(I, V);
+
+ return nullptr;
+}
+
+/// Fold icmp Pred min|max(X, Y), X.
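+///
+/// Illustrative sketch (hypothetical IR, not taken from this patch): with the
+/// select form of smin,
+///   %c = icmp slt i32 %x, %y
+///   %m = select i1 %c, i32 %x, i32 %y
+///   %cmp = icmp eq i32 %m, %x
+/// may fold to
+///   %cmp = icmp sle i32 %x, %y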
+static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op0 = Cmp.getOperand(0);
+ Value *X = Cmp.getOperand(1);
+
+ // Canonicalize minimum or maximum operand to LHS of the icmp.
+ if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMax(m_Specific(Op0), m_Value()))) {
+ std::swap(Op0, X);
+ Pred = Cmp.getSwappedPredicate();
+ }
+
+ Value *Y;
+ if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
+ // smin(X, Y) == X --> X s<= Y
+ // smin(X, Y) s>= X --> X s<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
+
+ // smin(X, Y) != X --> X s> Y
+ // smin(X, Y) s< X --> X s> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // smin(X, Y) s<= X --> true
+ // smin(X, Y) s> X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
+ // smax(X, Y) == X --> X s>= Y
+ // smax(X, Y) s<= X --> X s>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
+
+ // smax(X, Y) != X --> X s< Y
+ // smax(X, Y) s> X --> X s< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // smax(X, Y) s>= X --> true
+ // smax(X, Y) s< X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
+ // umin(X, Y) == X --> X u<= Y
+ // umin(X, Y) u>= X --> X u<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE)
+ return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);
+
+ // umin(X, Y) != X --> X u> Y
+ // umin(X, Y) u< X --> X u> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT)
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // umin(X, Y) u<= X --> true
+ // umin(X, Y) u> X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
+ // umax(X, Y) == X --> X u>= Y
+ // umax(X, Y) u<= X --> X u>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
+
+ // umax(X, Y) != X --> X u< Y
+ // umax(X, Y) u> X --> X u< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // umax(X, Y) u>= X --> true
+ // umax(X, Y) u< X --> false
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
- if (!I.isEquality())
- return nullptr;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- const CmpInst::Predicate Pred = I.getPredicate();
- Value *A, *B, *C, *D;
- if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
- if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
- Value *OtherVal = A == Op1 ? B : A;
- return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
- }
-
- if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
- // A^c1 == C^c2 --> A == C^(c1^c2)
- ConstantInt *C1, *C2;
- if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
- Op1->hasOneUse()) {
- Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
- Value *Xor = Builder.CreateXor(C, NC);
- return new ICmpInst(Pred, A, Xor);
- }
-
- // A^B == A^D -> B == D
- if (A == C)
- return new ICmpInst(Pred, B, D);
- if (A == D)
- return new ICmpInst(Pred, B, C);
- if (B == C)
- return new ICmpInst(Pred, A, D);
- if (B == D)
- return new ICmpInst(Pred, A, C);
- }
- }
-
- if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
- // A == (A^B) -> B == 0
- Value *OtherVal = A == Op0 ? B : A;
- return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
- }
-
- // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
- if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
- match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
- Value *X = nullptr, *Y = nullptr, *Z = nullptr;
-
- if (A == C) {
- X = B;
- Y = D;
- Z = A;
- } else if (A == D) {
- X = B;
- Y = C;
- Z = A;
- } else if (B == C) {
- X = A;
- Y = D;
- Z = B;
- } else if (B == D) {
- X = A;
- Y = C;
- Z = B;
- }
-
- if (X) { // Build (X^Y) & Z
- Op1 = Builder.CreateXor(X, Y);
- Op1 = Builder.CreateAnd(Op1, Z);
- return new ICmpInst(Pred, Op1, Constant::getNullValue(Op1->getType()));
- }
- }
-
- // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
- // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
- ConstantInt *Cst1;
- if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
- match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
- (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
- match(Op1, m_ZExt(m_Value(A))))) {
- APInt Pow2 = Cst1->getValue() + 1;
- if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
- Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
- return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
- }
-
- // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
- // For lshr and ashr pairs.
- if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
- (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
- unsigned TypeBits = Cst1->getBitWidth();
- unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
- if (ShAmt < TypeBits && ShAmt != 0) {
- ICmpInst::Predicate NewPred =
- Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
- Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
- APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
- return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
- }
- }
-
- // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
- if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
- unsigned TypeBits = Cst1->getBitWidth();
- unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
- if (ShAmt < TypeBits && ShAmt != 0) {
- Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
- APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
- Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
- I.getName() + ".mask");
- return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
- }
- }
-
- // Transform "icmp eq (trunc (lshr(X, cst1))), cst" to
- // "icmp (and X, mask), cst"
- uint64_t ShAmt = 0;
- if (Op0->hasOneUse() &&
- match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
- match(Op1, m_ConstantInt(Cst1)) &&
- // Only do this when A has multiple uses. This is most important to do
- // when it exposes other optimizations.
- !A->hasOneUse()) {
- unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
-
- if (ShAmt < ASize) {
- APInt MaskV =
- APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
- MaskV <<= ShAmt;
-
- APInt CmpV = Cst1->getValue().zext(ASize);
- CmpV <<= ShAmt;
-
- Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
- return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
- }
- }
-
- // If both operands are byte-swapped or bit-reversed, just compare the
- // original values.
- // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
- // and handle more intrinsics.
- if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
- (match(Op0, m_BitReverse(m_Value(A))) &&
- match(Op1, m_BitReverse(m_Value(B)))))
- return new ICmpInst(Pred, A, B);
-
- // Canonicalize checking for a power-of-2-or-zero value:
- // (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants)
- // ((A-1) & A) != 0 --> ctpop(A) > 1 (two commuted variants)
- if (!match(Op0, m_OneUse(m_c_And(m_Add(m_Value(A), m_AllOnes()),
- m_Deferred(A)))) ||
- !match(Op1, m_ZeroInt()))
- A = nullptr;
-
- // (A & -A) == A --> ctpop(A) < 2 (four commuted variants)
- // (-A & A) != A --> ctpop(A) > 1 (four commuted variants)
- if (match(Op0, m_OneUse(m_c_And(m_Neg(m_Specific(Op1)), m_Specific(Op1)))))
- A = Op1;
- else if (match(Op1,
- m_OneUse(m_c_And(m_Neg(m_Specific(Op0)), m_Specific(Op0)))))
- A = Op0;
-
- if (A) {
- Type *Ty = A->getType();
- CallInst *CtPop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, A);
- return Pred == ICmpInst::ICMP_EQ
- ? new ICmpInst(ICmpInst::ICMP_ULT, CtPop, ConstantInt::get(Ty, 2))
- : new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1));
- }
-
- return nullptr;
-}
-
-static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
- InstCombiner::BuilderTy &Builder) {
- assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0");
- auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0));
- Value *X;
- if (!match(CastOp0, m_ZExtOrSExt(m_Value(X))))
- return nullptr;
-
- bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt;
- bool IsSignedCmp = ICmp.isSigned();
- if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) {
- // If the signedness of the two casts doesn't agree (i.e. one is a sext
- // and the other is a zext), then we can't handle this.
- // TODO: This is too strict. We can handle some predicates (equality?).
- if (CastOp0->getOpcode() != CastOp1->getOpcode())
- return nullptr;
-
- // Not an extension from the same type?
- Value *Y = CastOp1->getOperand(0);
- Type *XTy = X->getType(), *YTy = Y->getType();
- if (XTy != YTy) {
- // One of the casts must have one use because we are creating a new cast.
- if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse())
- return nullptr;
- // Extend the narrower operand to the type of the wider operand.
- if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits())
- X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy);
- else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits())
- Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy);
- else
- return nullptr;
- }
-
- // (zext X) == (zext Y) --> X == Y
- // (sext X) == (sext Y) --> X == Y
- if (ICmp.isEquality())
- return new ICmpInst(ICmp.getPredicate(), X, Y);
-
- // A signed comparison of sign extended values simplifies into a
- // signed comparison.
- if (IsSignedCmp && IsSignedExt)
- return new ICmpInst(ICmp.getPredicate(), X, Y);
-
- // The other three cases all fold into an unsigned comparison.
- return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y);
- }
-
- // Below here, we are only folding a compare with constant.
- auto *C = dyn_cast<Constant>(ICmp.getOperand(1));
- if (!C)
- return nullptr;
-
- // Compute the constant that would happen if we truncated to SrcTy then
- // re-extended to DestTy.
- Type *SrcTy = CastOp0->getSrcTy();
- Type *DestTy = CastOp0->getDestTy();
- Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy);
- Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy);
-
- // If the re-extended constant didn't change...
- if (Res2 == C) {
- if (ICmp.isEquality())
- return new ICmpInst(ICmp.getPredicate(), X, Res1);
-
- // A signed comparison of sign extended values simplifies into a
- // signed comparison.
- if (IsSignedExt && IsSignedCmp)
- return new ICmpInst(ICmp.getPredicate(), X, Res1);
-
- // The other three cases all fold into an unsigned comparison.
- return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1);
- }
-
- // The re-extended constant changed, partly changed (in the case of a vector),
- // or could not be determined to be equal (in the case of a constant
- // expression), so the constant cannot be represented in the shorter type.
- // All the cases that fold to true or false will have already been handled
- // by SimplifyICmpInst, so only deal with the tricky case.
- if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C))
- return nullptr;
-
- // Is source op positive?
- // icmp ult (sext X), C --> icmp sgt X, -1
- if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
- return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy));
-
- // Is source op negative?
- // icmp ugt (sext X), C --> icmp slt X, 0
- assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
- return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy));
-}
-
-/// Handle icmp (cast x), (cast or constant).
+ if (!I.isEquality())
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ const CmpInst::Predicate Pred = I.getPredicate();
+ Value *A, *B, *C, *D;
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
+ Value *OtherVal = A == Op1 ? B : A;
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
+ }
+
+ if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+ // A^c1 == C^c2 --> A == C^(c1^c2)
+ ConstantInt *C1, *C2;
+ if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
+ Op1->hasOneUse()) {
+ Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
+ Value *Xor = Builder.CreateXor(C, NC);
+ return new ICmpInst(Pred, A, Xor);
+ }
+
+ // A^B == A^D -> B == D
+ if (A == C)
+ return new ICmpInst(Pred, B, D);
+ if (A == D)
+ return new ICmpInst(Pred, B, C);
+ if (B == C)
+ return new ICmpInst(Pred, A, D);
+ if (B == D)
+ return new ICmpInst(Pred, A, C);
+ }
+ }
+
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
+ // A == (A^B) -> B == 0
+ Value *OtherVal = A == Op0 ? B : A;
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
+ }
+
+ // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+ if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
+ match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+
+ if (A == C) {
+ X = B;
+ Y = D;
+ Z = A;
+ } else if (A == D) {
+ X = B;
+ Y = C;
+ Z = A;
+ } else if (B == C) {
+ X = A;
+ Y = D;
+ Z = B;
+ } else if (B == D) {
+ X = A;
+ Y = C;
+ Z = B;
+ }
+
+ if (X) { // Build (X^Y) & Z
+ Op1 = Builder.CreateXor(X, Y);
+ Op1 = Builder.CreateAnd(Op1, Z);
+ return new ICmpInst(Pred, Op1, Constant::getNullValue(Op1->getType()));
+ }
+ }
+
+ // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
+ // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
+ ConstantInt *Cst1;
+ if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
+ match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
+ (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
+ match(Op1, m_ZExt(m_Value(A))))) {
+ APInt Pow2 = Cst1->getValue() + 1;
+ if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
+ Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
+ return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
+ }
+
+ // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+ // For lshr and ashr pairs.
+ if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+ (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ ICmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
+ APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+ return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
+ }
+ }
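+
+ // Illustrative sketch (hypothetical IR, not taken from this patch; assumes
+ // both shifts have a single use):
+ //   %sa = lshr i32 %a, 4
+ //   %sb = lshr i32 %b, 4
+ //   %cmp = icmp eq i32 %sa, %sb
+ // may become
+ //   %x = xor i32 %a, %b
+ //   %cmp = icmp ult i32 %x, 16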
+
+ // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
+ if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
+ APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
+ Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
+ I.getName() + ".mask");
+ return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
+ }
+ }
+
+ // Transform "icmp eq (trunc (lshr(X, cst1))), cst" to
+ // "icmp (and X, mask), cst"
+ uint64_t ShAmt = 0;
+ if (Op0->hasOneUse() &&
+ match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
+ match(Op1, m_ConstantInt(Cst1)) &&
+ // Only do this when A has multiple uses. This is most important to do
+ // when it exposes other optimizations.
+ !A->hasOneUse()) {
+ unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
+
+ if (ShAmt < ASize) {
+ APInt MaskV =
+ APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
+ MaskV <<= ShAmt;
+
+ APInt CmpV = Cst1->getValue().zext(ASize);
+ CmpV <<= ShAmt;
+
+ Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
+ return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
+ }
+ }
+
+ // If both operands are byte-swapped or bit-reversed, just compare the
+ // original values.
+ // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
+ // and handle more intrinsics.
+ if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
+ (match(Op0, m_BitReverse(m_Value(A))) &&
+ match(Op1, m_BitReverse(m_Value(B)))))
+ return new ICmpInst(Pred, A, B);
+
+ // Canonicalize checking for a power-of-2-or-zero value:
+ // (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants)
+ // ((A-1) & A) != 0 --> ctpop(A) > 1 (two commuted variants)
+ if (!match(Op0, m_OneUse(m_c_And(m_Add(m_Value(A), m_AllOnes()),
+ m_Deferred(A)))) ||
+ !match(Op1, m_ZeroInt()))
+ A = nullptr;
+
+ // (A & -A) == A --> ctpop(A) < 2 (four commuted variants)
+ // (-A & A) != A --> ctpop(A) > 1 (four commuted variants)
+ if (match(Op0, m_OneUse(m_c_And(m_Neg(m_Specific(Op1)), m_Specific(Op1)))))
+ A = Op1;
+ else if (match(Op1,
+ m_OneUse(m_c_And(m_Neg(m_Specific(Op0)), m_Specific(Op0)))))
+ A = Op0;
+
+ if (A) {
+ Type *Ty = A->getType();
+ CallInst *CtPop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, A);
+ return Pred == ICmpInst::ICMP_EQ
+ ? new ICmpInst(ICmpInst::ICMP_ULT, CtPop, ConstantInt::get(Ty, 2))
+ : new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1));
+ }
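+
+ // Illustrative sketch (hypothetical IR, not taken from this patch):
+ //   %m = add i32 %a, -1
+ //   %t = and i32 %a, %m
+ //   %cmp = icmp eq i32 %t, 0
+ // may become
+ //   %p = call i32 @llvm.ctpop.i32(i32 %a)
+ //   %cmp = icmp ult i32 %p, 2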
+
+ return nullptr;
+}
+
+static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
+ InstCombiner::BuilderTy &Builder) {
+ assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0");
+ auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0));
+ Value *X;
+ if (!match(CastOp0, m_ZExtOrSExt(m_Value(X))))
+ return nullptr;
+
+ bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt;
+ bool IsSignedCmp = ICmp.isSigned();
+ if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) {
+ // If the signedness of the two casts doesn't agree (i.e. one is a sext
+ // and the other is a zext), then we can't handle this.
+ // TODO: This is too strict. We can handle some predicates (equality?).
+ if (CastOp0->getOpcode() != CastOp1->getOpcode())
+ return nullptr;
+
+ // Not an extension from the same type?
+ Value *Y = CastOp1->getOperand(0);
+ Type *XTy = X->getType(), *YTy = Y->getType();
+ if (XTy != YTy) {
+ // One of the casts must have one use because we are creating a new cast.
+ if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse())
+ return nullptr;
+ // Extend the narrower operand to the type of the wider operand.
+ if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits())
+ X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy);
+ else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits())
+ Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy);
+ else
+ return nullptr;
+ }
+
+ // (zext X) == (zext Y) --> X == Y
+ // (sext X) == (sext Y) --> X == Y
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), X, Y);
+
+ // A signed comparison of sign extended values simplifies into a
+ // signed comparison.
+ if (IsSignedCmp && IsSignedExt)
+ return new ICmpInst(ICmp.getPredicate(), X, Y);
+
+ // The other three cases all fold into an unsigned comparison.
+ return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y);
+ }
+
+ // Below here, we are only folding a compare with constant.
+ auto *C = dyn_cast<Constant>(ICmp.getOperand(1));
+ if (!C)
+ return nullptr;
+
+ // Compute the constant that would happen if we truncated to SrcTy then
+ // re-extended to DestTy.
+ Type *SrcTy = CastOp0->getSrcTy();
+ Type *DestTy = CastOp0->getDestTy();
+ Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy);
+
+ // If the re-extended constant didn't change...
+ if (Res2 == C) {
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), X, Res1);
+
+ // A signed comparison of sign extended values simplifies into a
+ // signed comparison.
+ if (IsSignedExt && IsSignedCmp)
+ return new ICmpInst(ICmp.getPredicate(), X, Res1);
+
+ // The other three cases all fold into an unsigned comparison.
+ return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1);
+ }
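+
+ // Illustrative sketch (hypothetical IR, not taken from this patch): 100
+ // survives the trunc-to-i8 / sext-back round trip, so
+ //   %e = sext i8 %x to i32
+ //   %cmp = icmp slt i32 %e, 100
+ // may become
+ //   %cmp = icmp slt i8 %x, 100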
+
+ // The re-extended constant changed, partly changed (in the case of a vector),
+ // or could not be determined to be equal (in the case of a constant
+ // expression), so the constant cannot be represented in the shorter type.
+ // All the cases that fold to true or false will have already been handled
+ // by SimplifyICmpInst, so only deal with the tricky case.
+ if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C))
+ return nullptr;
+
+ // Is source op positive?
+ // icmp ult (sext X), C --> icmp sgt X, -1
+ if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
+ return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy));
+
+ // Is source op negative?
+ // icmp ugt (sext X), C --> icmp slt X, 0
+ assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
+ return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy));
+}
+
+/// Handle icmp (cast x), (cast or constant).
Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) {
- auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0));
- if (!CastOp0)
- return nullptr;
- if (!isa<Constant>(ICmp.getOperand(1)) && !isa<CastInst>(ICmp.getOperand(1)))
- return nullptr;
-
- Value *Op0Src = CastOp0->getOperand(0);
- Type *SrcTy = CastOp0->getSrcTy();
- Type *DestTy = CastOp0->getDestTy();
-
- // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
- // integer type is the same size as the pointer type.
- auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) {
- if (isa<VectorType>(SrcTy)) {
- SrcTy = cast<VectorType>(SrcTy)->getElementType();
- DestTy = cast<VectorType>(DestTy)->getElementType();
- }
- return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth();
- };
- if (CastOp0->getOpcode() == Instruction::PtrToInt &&
- CompatibleSizes(SrcTy, DestTy)) {
- Value *NewOp1 = nullptr;
- if (auto *PtrToIntOp1 = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
- Value *PtrSrc = PtrToIntOp1->getOperand(0);
- if (PtrSrc->getType()->getPointerAddressSpace() ==
- Op0Src->getType()->getPointerAddressSpace()) {
- NewOp1 = PtrToIntOp1->getOperand(0);
- // If the pointer types don't match, insert a bitcast.
- if (Op0Src->getType() != NewOp1->getType())
- NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType());
- }
- } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
- NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy);
- }
-
- if (NewOp1)
- return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1);
- }
-
- return foldICmpWithZextOrSext(ICmp, Builder);
-}
-
-static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
- switch (BinaryOp) {
- default:
- llvm_unreachable("Unsupported binary op");
- case Instruction::Add:
- case Instruction::Sub:
- return match(RHS, m_Zero());
- case Instruction::Mul:
- return match(RHS, m_One());
- }
-}
-
+ auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0));
+ if (!CastOp0)
+ return nullptr;
+ if (!isa<Constant>(ICmp.getOperand(1)) && !isa<CastInst>(ICmp.getOperand(1)))
+ return nullptr;
+
+ Value *Op0Src = CastOp0->getOperand(0);
+ Type *SrcTy = CastOp0->getSrcTy();
+ Type *DestTy = CastOp0->getDestTy();
+
+ // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
+ // integer type is the same size as the pointer type.
+ auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) {
+ if (isa<VectorType>(SrcTy)) {
+ SrcTy = cast<VectorType>(SrcTy)->getElementType();
+ DestTy = cast<VectorType>(DestTy)->getElementType();
+ }
+ return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth();
+ };
+ if (CastOp0->getOpcode() == Instruction::PtrToInt &&
+ CompatibleSizes(SrcTy, DestTy)) {
+ Value *NewOp1 = nullptr;
+ if (auto *PtrToIntOp1 = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
+ Value *PtrSrc = PtrToIntOp1->getOperand(0);
+ if (PtrSrc->getType()->getPointerAddressSpace() ==
+ Op0Src->getType()->getPointerAddressSpace()) {
+ NewOp1 = PtrToIntOp1->getOperand(0);
+ // If the pointer types don't match, insert a bitcast.
+ if (Op0Src->getType() != NewOp1->getType())
+ NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType());
+ }
+ } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
+ NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+ }
+
+ if (NewOp1)
+ return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1);
+ }
+
+ return foldICmpWithZextOrSext(ICmp, Builder);
+}
+
+static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
+ switch (BinaryOp) {
+ default:
+ llvm_unreachable("Unsupported binary op");
+ case Instruction::Add:
+ case Instruction::Sub:
+ return match(RHS, m_Zero());
+ case Instruction::Mul:
+ return match(RHS, m_One());
+ }
+}
+
OverflowResult
InstCombinerImpl::computeOverflow(Instruction::BinaryOps BinaryOp,
bool IsSigned, Value *LHS, Value *RHS,
Instruction *CxtI) const {
- switch (BinaryOp) {
- default:
- llvm_unreachable("Unsupported binary op");
- case Instruction::Add:
- if (IsSigned)
- return computeOverflowForSignedAdd(LHS, RHS, CxtI);
- else
- return computeOverflowForUnsignedAdd(LHS, RHS, CxtI);
- case Instruction::Sub:
- if (IsSigned)
- return computeOverflowForSignedSub(LHS, RHS, CxtI);
- else
- return computeOverflowForUnsignedSub(LHS, RHS, CxtI);
- case Instruction::Mul:
- if (IsSigned)
- return computeOverflowForSignedMul(LHS, RHS, CxtI);
- else
- return computeOverflowForUnsignedMul(LHS, RHS, CxtI);
- }
-}
-
+ switch (BinaryOp) {
+ default:
+ llvm_unreachable("Unsupported binary op");
+ case Instruction::Add:
+ if (IsSigned)
+ return computeOverflowForSignedAdd(LHS, RHS, CxtI);
+ else
+ return computeOverflowForUnsignedAdd(LHS, RHS, CxtI);
+ case Instruction::Sub:
+ if (IsSigned)
+ return computeOverflowForSignedSub(LHS, RHS, CxtI);
+ else
+ return computeOverflowForUnsignedSub(LHS, RHS, CxtI);
+ case Instruction::Mul:
+ if (IsSigned)
+ return computeOverflowForSignedMul(LHS, RHS, CxtI);
+ else
+ return computeOverflowForUnsignedMul(LHS, RHS, CxtI);
+ }
+}
+
bool InstCombinerImpl::OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp,
bool IsSigned, Value *LHS,
Value *RHS, Instruction &OrigI,
Value *&Result,
Constant *&Overflow) {
- if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
- std::swap(LHS, RHS);
-
- // If the overflow check was an add followed by a compare, the insertion point
- // may be pointing to the compare. We want to insert the new instructions
- // before the add in case there are uses of the add between the add and the
- // compare.
- Builder.SetInsertPoint(&OrigI);
-
+ if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
+ std::swap(LHS, RHS);
+
+ // If the overflow check was an add followed by a compare, the insertion point
+ // may be pointing to the compare. We want to insert the new instructions
+ // before the add in case there are uses of the add between the add and the
+ // compare.
+ Builder.SetInsertPoint(&OrigI);
+
Type *OverflowTy = Type::getInt1Ty(LHS->getContext());
if (auto *LHSTy = dyn_cast<VectorType>(LHS->getType()))
OverflowTy = VectorType::get(OverflowTy, LHSTy->getElementCount());
- if (isNeutralValue(BinaryOp, RHS)) {
- Result = LHS;
+ if (isNeutralValue(BinaryOp, RHS)) {
+ Result = LHS;
Overflow = ConstantInt::getFalse(OverflowTy);
- return true;
- }
-
- switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) {
- case OverflowResult::MayOverflow:
- return false;
- case OverflowResult::AlwaysOverflowsLow:
- case OverflowResult::AlwaysOverflowsHigh:
- Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
- Result->takeName(&OrigI);
+ return true;
+ }
+
+ switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) {
+ case OverflowResult::MayOverflow:
+ return false;
+ case OverflowResult::AlwaysOverflowsLow:
+ case OverflowResult::AlwaysOverflowsHigh:
+ Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
+ Result->takeName(&OrigI);
Overflow = ConstantInt::getTrue(OverflowTy);
- return true;
- case OverflowResult::NeverOverflows:
- Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
- Result->takeName(&OrigI);
+ return true;
+ case OverflowResult::NeverOverflows:
+ Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
+ Result->takeName(&OrigI);
Overflow = ConstantInt::getFalse(OverflowTy);
- if (auto *Inst = dyn_cast<Instruction>(Result)) {
- if (IsSigned)
- Inst->setHasNoSignedWrap();
- else
- Inst->setHasNoUnsignedWrap();
- }
- return true;
- }
-
- llvm_unreachable("Unexpected overflow result");
-}
-
-/// Recognize and process idiom involving test for multiplication
-/// overflow.
-///
-/// The caller has matched a pattern of the form:
-/// I = cmp u (mul(zext A, zext B)), V
-/// The function checks if this is a test for overflow and if so replaces
-/// multiplication with call to 'mul.with.overflow' intrinsic.
-///
-/// \param I Compare instruction.
-/// \param MulVal Result of 'mult' instruction. It is one of the arguments of
-/// the compare instruction. Must be of integer type.
-/// \param OtherVal The other argument of compare instruction.
-/// \returns Instruction which must replace the compare instruction, NULL if no
-/// replacement required.
-static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
+ if (auto *Inst = dyn_cast<Instruction>(Result)) {
+ if (IsSigned)
+ Inst->setHasNoSignedWrap();
+ else
+ Inst->setHasNoUnsignedWrap();
+ }
+ return true;
+ }
+
+ llvm_unreachable("Unexpected overflow result");
+}
+
+/// Recognize and process idiom involving test for multiplication
+/// overflow.
+///
+/// The caller has matched a pattern of the form:
+/// I = cmp u (mul(zext A, zext B)), V
+/// The function checks if this is a test for overflow and if so replaces
+/// multiplication with call to 'mul.with.overflow' intrinsic.
+///
+/// \param I Compare instruction.
+/// \param MulVal Result of 'mult' instruction. It is one of the arguments of
+/// the compare instruction. Must be of integer type.
+/// \param OtherVal The other argument of compare instruction.
+/// \returns Instruction which must replace the compare instruction, NULL if no
+/// replacement required.
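+///
+/// Illustrative sketch (hypothetical IR, not taken from this patch; assumes
+/// any other users of the product only read its low 32 bits):
+///   %za = zext i32 %a to i64
+///   %zb = zext i32 %b to i64
+///   %m  = mul i64 %za, %zb
+///   %ov = icmp ugt i64 %m, 4294967295
+/// may be rewritten around a call to llvm.umul.with.overflow.i32(%a, %b).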
+static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
Value *OtherVal,
InstCombinerImpl &IC) {
- // Don't bother doing this transformation for pointers, don't do it for
- // vectors.
- if (!isa<IntegerType>(MulVal->getType()))
- return nullptr;
-
- assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
- assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
- auto *MulInstr = dyn_cast<Instruction>(MulVal);
- if (!MulInstr)
- return nullptr;
- assert(MulInstr->getOpcode() == Instruction::Mul);
-
- auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
- *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
- assert(LHS->getOpcode() == Instruction::ZExt);
- assert(RHS->getOpcode() == Instruction::ZExt);
- Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
-
- // Calculate type and width of the result produced by mul.with.overflow.
- Type *TyA = A->getType(), *TyB = B->getType();
- unsigned WidthA = TyA->getPrimitiveSizeInBits(),
- WidthB = TyB->getPrimitiveSizeInBits();
- unsigned MulWidth;
- Type *MulType;
- if (WidthB > WidthA) {
- MulWidth = WidthB;
- MulType = TyB;
- } else {
- MulWidth = WidthA;
- MulType = TyA;
- }
-
- // In order to replace the original mul with a narrower mul.with.overflow,
- // all uses must ignore upper bits of the product. The number of used low
- // bits must not be greater than the width of mul.with.overflow.
- if (MulVal->hasNUsesOrMore(2))
- for (User *U : MulVal->users()) {
- if (U == &I)
- continue;
- if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
- // Check if truncation ignores bits above MulWidth.
- unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
- if (TruncWidth > MulWidth)
- return nullptr;
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
- // Check if AND ignores bits above MulWidth.
- if (BO->getOpcode() != Instruction::And)
- return nullptr;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- const APInt &CVal = CI->getValue();
- if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
- return nullptr;
- } else {
- // In this case we could have the operand of the binary operation
- // being defined in another block, and performing the replacement
- // could break the dominance relation.
- return nullptr;
- }
- } else {
- // Other uses prohibit this transformation.
- return nullptr;
- }
- }
-
- // Recognize patterns
- switch (I.getPredicate()) {
- case ICmpInst::ICMP_EQ:
- case ICmpInst::ICMP_NE:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits.
- ConstantInt *CI;
- Value *ValToMask;
- if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) {
- if (ValToMask != MulVal)
- return nullptr;
- const APInt &CVal = CI->getValue() + 1;
- if (CVal.isPowerOf2()) {
- unsigned MaskWidth = CVal.logBase2();
- if (MaskWidth == MulWidth)
- break; // Recognized
- }
- }
- return nullptr;
-
- case ICmpInst::ICMP_UGT:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp ugt mulval, max
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getMaxValue(MulWidth);
- MaxVal = MaxVal.zext(CI->getBitWidth());
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- case ICmpInst::ICMP_UGE:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp uge mulval, max+1
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- case ICmpInst::ICMP_ULE:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp ule mulval, max
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getMaxValue(MulWidth);
- MaxVal = MaxVal.zext(CI->getBitWidth());
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- case ICmpInst::ICMP_ULT:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp ult mulval, max + 1
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- default:
- return nullptr;
- }
-
- InstCombiner::BuilderTy &Builder = IC.Builder;
- Builder.SetInsertPoint(MulInstr);
-
- // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
- Value *MulA = A, *MulB = B;
- if (WidthA < MulWidth)
- MulA = Builder.CreateZExt(A, MulType);
- if (WidthB < MulWidth)
- MulB = Builder.CreateZExt(B, MulType);
- Function *F = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::umul_with_overflow, MulType);
- CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
+  // Don't bother doing this transformation for pointers; don't do it for
+  // vectors either.
+ if (!isa<IntegerType>(MulVal->getType()))
+ return nullptr;
+
+ assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
+ assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
+ auto *MulInstr = dyn_cast<Instruction>(MulVal);
+ if (!MulInstr)
+ return nullptr;
+ assert(MulInstr->getOpcode() == Instruction::Mul);
+
+ auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
+ *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
+ assert(LHS->getOpcode() == Instruction::ZExt);
+ assert(RHS->getOpcode() == Instruction::ZExt);
+ Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
+
+ // Calculate type and width of the result produced by mul.with.overflow.
+ Type *TyA = A->getType(), *TyB = B->getType();
+ unsigned WidthA = TyA->getPrimitiveSizeInBits(),
+ WidthB = TyB->getPrimitiveSizeInBits();
+ unsigned MulWidth;
+ Type *MulType;
+ if (WidthB > WidthA) {
+ MulWidth = WidthB;
+ MulType = TyB;
+ } else {
+ MulWidth = WidthA;
+ MulType = TyA;
+ }
+
+ // In order to replace the original mul with a narrower mul.with.overflow,
+ // all uses must ignore upper bits of the product. The number of used low
+  // bits must not be greater than the width of mul.with.overflow.
+ if (MulVal->hasNUsesOrMore(2))
+ for (User *U : MulVal->users()) {
+ if (U == &I)
+ continue;
+ if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+ // Check if truncation ignores bits above MulWidth.
+ unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
+ if (TruncWidth > MulWidth)
+ return nullptr;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
+ // Check if AND ignores bits above MulWidth.
+ if (BO->getOpcode() != Instruction::And)
+ return nullptr;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ const APInt &CVal = CI->getValue();
+ if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
+ return nullptr;
+ } else {
+          // The operand of the binary operation could be defined in another
+          // block, in which case performing the replacement could break the
+          // dominance relation.
+ return nullptr;
+ }
+ } else {
+ // Other uses prohibit this transformation.
+ return nullptr;
+ }
+ }
+
+ // Recognize patterns
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+    //   cmp eq/ne mulval, and(mulval, mask),
+    //   where mask selects the low MulWidth bits.
+ ConstantInt *CI;
+ Value *ValToMask;
+ if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) {
+ if (ValToMask != MulVal)
+ return nullptr;
+ const APInt &CVal = CI->getValue() + 1;
+ if (CVal.isPowerOf2()) {
+ unsigned MaskWidth = CVal.logBase2();
+ if (MaskWidth == MulWidth)
+ break; // Recognized
+ }
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_UGT:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp ugt mulval, max
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getMaxValue(MulWidth);
+ MaxVal = MaxVal.zext(CI->getBitWidth());
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_UGE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp uge mulval, max+1
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_ULE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp ule mulval, max
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getMaxValue(MulWidth);
+ MaxVal = MaxVal.zext(CI->getBitWidth());
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_ULT:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+    //   cmp ult mulval, max + 1
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ default:
+ return nullptr;
+ }
+
+ InstCombiner::BuilderTy &Builder = IC.Builder;
+ Builder.SetInsertPoint(MulInstr);
+
+ // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
+ Value *MulA = A, *MulB = B;
+ if (WidthA < MulWidth)
+ MulA = Builder.CreateZExt(A, MulType);
+ if (WidthB < MulWidth)
+ MulB = Builder.CreateZExt(B, MulType);
+ Function *F = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::umul_with_overflow, MulType);
+ CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
IC.addToWorklist(MulInstr);
-
- // If there are uses of mul result other than the comparison, we know that
- // they are truncation or binary AND. Change them to use result of
- // mul.with.overflow and adjust properly mask/size.
- if (MulVal->hasNUsesOrMore(2)) {
- Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
+
+  // If there are uses of the mul result other than the comparison, we know
+  // that they are truncation or binary AND. Change them to use the result of
+  // mul.with.overflow and adjust the mask/size accordingly.
+ if (MulVal->hasNUsesOrMore(2)) {
+ Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
for (User *U : make_early_inc_range(MulVal->users())) {
- if (U == &I || U == OtherVal)
- continue;
- if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
- if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
- IC.replaceInstUsesWith(*TI, Mul);
- else
- TI->setOperand(0, Mul);
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
- assert(BO->getOpcode() == Instruction::And);
- // Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
- ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
- APInt ShortMask = CI->getValue().trunc(MulWidth);
- Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
- Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType());
- IC.replaceInstUsesWith(*BO, Zext);
- } else {
- llvm_unreachable("Unexpected Binary operation");
- }
+ if (U == &I || U == OtherVal)
+ continue;
+ if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+ if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
+ IC.replaceInstUsesWith(*TI, Mul);
+ else
+ TI->setOperand(0, Mul);
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
+ assert(BO->getOpcode() == Instruction::And);
+ // Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
+ ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
+ APInt ShortMask = CI->getValue().trunc(MulWidth);
+ Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
+ Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType());
+ IC.replaceInstUsesWith(*BO, Zext);
+ } else {
+ llvm_unreachable("Unexpected Binary operation");
+ }
IC.addToWorklist(cast<Instruction>(U));
- }
- }
- if (isa<Instruction>(OtherVal))
+ }
+ }
+ if (isa<Instruction>(OtherVal))
IC.addToWorklist(cast<Instruction>(OtherVal));
-
- // The original icmp gets replaced with the overflow value, maybe inverted
- // depending on predicate.
- bool Inverse = false;
- switch (I.getPredicate()) {
- case ICmpInst::ICMP_NE:
- break;
- case ICmpInst::ICMP_EQ:
- Inverse = true;
- break;
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_UGE:
- if (I.getOperand(0) == MulVal)
- break;
- Inverse = true;
- break;
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_ULE:
- if (I.getOperand(1) == MulVal)
- break;
- Inverse = true;
- break;
- default:
- llvm_unreachable("Unexpected predicate");
- }
- if (Inverse) {
- Value *Res = Builder.CreateExtractValue(Call, 1);
- return BinaryOperator::CreateNot(Res);
- }
-
- return ExtractValueInst::Create(Call, 1);
-}
-
-/// When performing a comparison against a constant, it is possible that not all
-/// the bits in the LHS are demanded. This helper method computes the mask that
-/// IS demanded.
-static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
- const APInt *RHS;
- if (!match(I.getOperand(1), m_APInt(RHS)))
- return APInt::getAllOnesValue(BitWidth);
-
- // If this is a normal comparison, it demands all bits. If it is a sign bit
- // comparison, it only demands the sign bit.
- bool UnusedBit;
+
+ // The original icmp gets replaced with the overflow value, maybe inverted
+ // depending on predicate.
+ bool Inverse = false;
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_NE:
+ break;
+ case ICmpInst::ICMP_EQ:
+ Inverse = true;
+ break;
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_UGE:
+ if (I.getOperand(0) == MulVal)
+ break;
+ Inverse = true;
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_ULE:
+ if (I.getOperand(1) == MulVal)
+ break;
+ Inverse = true;
+ break;
+ default:
+ llvm_unreachable("Unexpected predicate");
+ }
+ if (Inverse) {
+ Value *Res = Builder.CreateExtractValue(Call, 1);
+ return BinaryOperator::CreateNot(Res);
+ }
+
+ return ExtractValueInst::Create(Call, 1);
+}
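As a rough illustration of the idiom this fold targets (a minimal sketch; the function name is invented for the example): a widening multiply whose result is compared against the narrow type's maximum lowers to mul(zext A, zext B) plus an unsigned compare, which the code above can rewrite into a single llvm.umul.with.overflow call.

    #include <cstdint>

    // The 64-bit product of two zero-extended 32-bit values, compared against
    // UINT32_MAX, matches the ICMP_UGT pattern recognized above.
    bool mul_overflows_u32(uint32_t a, uint32_t b) {
      return (uint64_t)a * (uint64_t)b > 0xFFFFFFFFull;
    }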
+
+/// When performing a comparison against a constant, it is possible that not all
+/// the bits in the LHS are demanded. This helper method computes the mask that
+/// IS demanded.
+static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
+ const APInt *RHS;
+ if (!match(I.getOperand(1), m_APInt(RHS)))
+ return APInt::getAllOnesValue(BitWidth);
+
+ // If this is a normal comparison, it demands all bits. If it is a sign bit
+ // comparison, it only demands the sign bit.
+ bool UnusedBit;
if (InstCombiner::isSignBitCheck(I.getPredicate(), *RHS, UnusedBit))
- return APInt::getSignMask(BitWidth);
-
- switch (I.getPredicate()) {
- // For a UGT comparison, we don't care about any bits that
- // correspond to the trailing ones of the comparand. The value of these
- // bits doesn't impact the outcome of the comparison, because any value
- // greater than the RHS must differ in a bit higher than these due to carry.
- case ICmpInst::ICMP_UGT:
- return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingOnes());
-
- // Similarly, for a ULT comparison, we don't care about the trailing zeros.
- // Any value less than the RHS must differ in a higher bit because of carries.
- case ICmpInst::ICMP_ULT:
- return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingZeros());
-
- default:
- return APInt::getAllOnesValue(BitWidth);
- }
-}
-
-/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
-/// should be swapped.
-/// The decision is based on how many times these two operands are reused
-/// as subtract operands and their positions in those instructions.
-/// The rationale is that several architectures use the same instruction for
-/// both subtract and cmp. Thus, it is better if the order of those operands
-/// match.
-/// \return true if Op0 and Op1 should be swapped.
-static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
- // Filter out pointer values as those cannot appear directly in subtract.
- // FIXME: we may want to go through inttoptrs or bitcasts.
- if (Op0->getType()->isPointerTy())
- return false;
- // If a subtract already has the same operands as a compare, swapping would be
- // bad. If a subtract has the same operands as a compare but in reverse order,
- // then swapping is good.
- int GoodToSwap = 0;
- for (const User *U : Op0->users()) {
- if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
- GoodToSwap++;
- else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
- GoodToSwap--;
- }
- return GoodToSwap > 0;
-}
-
-/// Check that one use is in the same block as the definition and all
-/// other uses are in blocks dominated by a given block.
-///
-/// \param DI Definition
-/// \param UI Use
-/// \param DB Block that must dominate all uses of \p DI outside
-/// the parent block
-/// \return true when \p UI is the only use of \p DI in the parent block
-/// and all other uses of \p DI are in blocks dominated by \p DB.
-///
+ return APInt::getSignMask(BitWidth);
+
+ switch (I.getPredicate()) {
+ // For a UGT comparison, we don't care about any bits that
+ // correspond to the trailing ones of the comparand. The value of these
+ // bits doesn't impact the outcome of the comparison, because any value
+ // greater than the RHS must differ in a bit higher than these due to carry.
+ case ICmpInst::ICMP_UGT:
+ return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingOnes());
+
+ // Similarly, for a ULT comparison, we don't care about the trailing zeros.
+ // Any value less than the RHS must differ in a higher bit because of carries.
+ case ICmpInst::ICMP_ULT:
+ return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingZeros());
+
+ default:
+ return APInt::getAllOnesValue(BitWidth);
+ }
+}
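A small worked example of the UGT rule above (function name invented): comparing against 7, which has three trailing ones, means only bits 3 and up of the LHS can affect the result, which is exactly the mask getDemandedBitsLHSMask returns.

    #include <cstdint>

    // x u> 7 holds exactly when some bit at position 3 or higher is set, so
    // the three low bits of x are not demanded by this compare.
    bool above_seven(uint32_t x) {
      return x > 7;   // same result as (x & ~7u) != 0
    }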
+
+/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
+/// should be swapped.
+/// The decision is based on how many times these two operands are reused
+/// as subtract operands and their positions in those instructions.
+/// The rationale is that several architectures use the same instruction for
+/// both subtract and cmp. Thus, it is better if the order of those operands
+/// matches.
+/// \return true if Op0 and Op1 should be swapped.
+static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
+ // Filter out pointer values as those cannot appear directly in subtract.
+ // FIXME: we may want to go through inttoptrs or bitcasts.
+ if (Op0->getType()->isPointerTy())
+ return false;
+ // If a subtract already has the same operands as a compare, swapping would be
+ // bad. If a subtract has the same operands as a compare but in reverse order,
+ // then swapping is good.
+ int GoodToSwap = 0;
+ for (const User *U : Op0->users()) {
+ if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
+ GoodToSwap++;
+ else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
+ GoodToSwap--;
+ }
+ return GoodToSwap > 0;
+}
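A sketch of the situation this heuristic cares about (names invented): when a subtract and the compare use their operands in the same order, targets that implement sub and cmp with one flags-setting instruction can reuse it, so the compare operands are left in that order.

    #include <cstdint>

    // The compare (a, b) and the subtract (a, b) share operand order, so
    // swapMayExposeCSEOpportunities sees no benefit in swapping them.
    uint32_t clamped_diff(uint32_t a, uint32_t b) {
      return a > b ? a - b : 0;
    }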
+
+/// Check that one use is in the same block as the definition and all
+/// other uses are in blocks dominated by a given block.
+///
+/// \param DI Definition
+/// \param UI Use
+/// \param DB Block that must dominate all uses of \p DI outside
+/// the parent block
+/// \return true when \p UI is the only use of \p DI in the parent block
+/// and all other uses of \p DI are in blocks dominated by \p DB.
+///
bool InstCombinerImpl::dominatesAllUses(const Instruction *DI,
const Instruction *UI,
const BasicBlock *DB) const {
- assert(DI && UI && "Instruction not defined\n");
- // Ignore incomplete definitions.
- if (!DI->getParent())
- return false;
- // DI and UI must be in the same block.
- if (DI->getParent() != UI->getParent())
- return false;
- // Protect from self-referencing blocks.
- if (DI->getParent() == DB)
- return false;
- for (const User *U : DI->users()) {
- auto *Usr = cast<Instruction>(U);
- if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
- return false;
- }
- return true;
-}
-
-/// Return true when the instruction sequence within a block is select-cmp-br.
-static bool isChainSelectCmpBranch(const SelectInst *SI) {
- const BasicBlock *BB = SI->getParent();
- if (!BB)
- return false;
- auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
- if (!BI || BI->getNumSuccessors() != 2)
- return false;
- auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
- if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
- return false;
- return true;
-}
-
-/// True when a select result is replaced by one of its operands
-/// in select-icmp sequence. This will eventually result in the elimination
-/// of the select.
-///
-/// \param SI Select instruction
-/// \param Icmp Compare instruction
-/// \param SIOpd Operand that replaces the select
-///
-/// Notes:
-/// - The replacement is global and requires dominator information
-/// - The caller is responsible for the actual replacement
-///
-/// Example:
-///
-/// entry:
-/// %4 = select i1 %3, %C* %0, %C* null
-/// %5 = icmp eq %C* %4, null
-/// br i1 %5, label %9, label %7
-/// ...
-/// ; <label>:7 ; preds = %entry
-/// %8 = getelementptr inbounds %C* %4, i64 0, i32 0
-/// ...
-///
-/// can be transformed to
-///
-/// %5 = icmp eq %C* %0, null
-/// %6 = select i1 %3, i1 %5, i1 true
-/// br i1 %6, label %9, label %7
-/// ...
-/// ; <label>:7 ; preds = %entry
-/// %8 = getelementptr inbounds %C* %0, i64 0, i32 0 // replace by %0!
-///
-/// Similar when the first operand of the select is a constant or/and
-/// the compare is for not equal rather than equal.
-///
-/// NOTE: The function is only called when the select and compare constants
-/// are equal, the optimization can work only for EQ predicates. This is not a
-/// major restriction since a NE compare should be 'normalized' to an equal
-/// compare, which usually happens in the combiner and test case
-/// select-cmp-br.ll checks for it.
+ assert(DI && UI && "Instruction not defined\n");
+ // Ignore incomplete definitions.
+ if (!DI->getParent())
+ return false;
+ // DI and UI must be in the same block.
+ if (DI->getParent() != UI->getParent())
+ return false;
+ // Protect from self-referencing blocks.
+ if (DI->getParent() == DB)
+ return false;
+ for (const User *U : DI->users()) {
+ auto *Usr = cast<Instruction>(U);
+ if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
+ return false;
+ }
+ return true;
+}
+
+/// Return true when the instruction sequence within a block is select-cmp-br.
+static bool isChainSelectCmpBranch(const SelectInst *SI) {
+ const BasicBlock *BB = SI->getParent();
+ if (!BB)
+ return false;
+ auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
+ if (!BI || BI->getNumSuccessors() != 2)
+ return false;
+ auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
+ return false;
+ return true;
+}
+
+/// True when a select result is replaced by one of its operands
+/// in select-icmp sequence. This will eventually result in the elimination
+/// of the select.
+///
+/// \param SI Select instruction
+/// \param Icmp Compare instruction
+/// \param SIOpd Operand that replaces the select
+///
+/// Notes:
+/// - The replacement is global and requires dominator information
+/// - The caller is responsible for the actual replacement
+///
+/// Example:
+///
+/// entry:
+/// %4 = select i1 %3, %C* %0, %C* null
+/// %5 = icmp eq %C* %4, null
+/// br i1 %5, label %9, label %7
+/// ...
+/// ; <label>:7 ; preds = %entry
+/// %8 = getelementptr inbounds %C* %4, i64 0, i32 0
+/// ...
+///
+/// can be transformed to
+///
+/// %5 = icmp eq %C* %0, null
+/// %6 = select i1 %3, i1 %5, i1 true
+/// br i1 %6, label %9, label %7
+/// ...
+/// ; <label>:7 ; preds = %entry
+/// %8 = getelementptr inbounds %C* %0, i64 0, i32 0 // replace by %0!
+///
+/// The same applies when the first operand of the select is a constant and/or
+/// the compare is for not-equal rather than equal.
+///
+/// NOTE: The function is only called when the select and compare constants
+/// are equal, so the optimization can work only for EQ predicates. This is
+/// not a major restriction since a NE compare should be 'normalized' to an
+/// equal compare, which usually happens in the combiner; the test case
+/// select-cmp-br.ll checks for it.
bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
const ICmpInst *Icmp,
const unsigned SIOpd) {
- assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
- if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
- BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
- // The check for the single predecessor is not the best that can be
- // done. But it protects efficiently against cases like when SI's
- // home block has two successors, Succ and Succ1, and Succ1 predecessor
- // of Succ. Then SI can't be replaced by SIOpd because the use that gets
- // replaced can be reached on either path. So the uniqueness check
- // guarantees that the path all uses of SI (outside SI's parent) are on
- // is disjoint from all other paths out of SI. But that information
- // is more expensive to compute, and the trade-off here is in favor
- // of compile-time. It should also be noticed that we check for a single
- // predecessor and not only uniqueness. This to handle the situation when
- // Succ and Succ1 points to the same basic block.
- if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
- NumSel++;
- SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
- return true;
- }
- }
- return false;
-}
-
-/// Try to fold the comparison based on range information we can get by checking
-/// whether bits are known to be zero or one in the inputs.
+ assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
+ if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
+ BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
+ // The check for the single predecessor is not the best that can be
+ // done. But it protects efficiently against cases like when SI's
+    // home block has two successors, Succ and Succ1, and Succ1 is a
+    // predecessor of Succ. Then SI can't be replaced by SIOpd because the use
+    // that gets
+ // replaced can be reached on either path. So the uniqueness check
+ // guarantees that the path all uses of SI (outside SI's parent) are on
+ // is disjoint from all other paths out of SI. But that information
+ // is more expensive to compute, and the trade-off here is in favor
+ // of compile-time. It should also be noticed that we check for a single
+    // predecessor and not only uniqueness. This is to handle the situation
+    // when Succ and Succ1 point to the same basic block.
+ if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
+ NumSel++;
+ SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Try to fold the comparison based on range information we can get by checking
+/// whether bits are known to be zero or one in the inputs.
Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = Op0->getType();
- ICmpInst::Predicate Pred = I.getPredicate();
-
- // Get scalar or pointer size.
- unsigned BitWidth = Ty->isIntOrIntVectorTy()
- ? Ty->getScalarSizeInBits()
- : DL.getPointerTypeSizeInBits(Ty->getScalarType());
-
- if (!BitWidth)
- return nullptr;
-
- KnownBits Op0Known(BitWidth);
- KnownBits Op1Known(BitWidth);
-
- if (SimplifyDemandedBits(&I, 0,
- getDemandedBitsLHSMask(I, BitWidth),
- Op0Known, 0))
- return &I;
-
- if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
- Op1Known, 0))
- return &I;
-
- // Given the known and unknown bits, compute a range that the LHS could be
- // in. Compute the Min, Max and RHS values based on the known bits. For the
- // EQ and NE we use unsigned values.
- APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
- APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
- if (I.isSigned()) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = Op0->getType();
+ ICmpInst::Predicate Pred = I.getPredicate();
+
+ // Get scalar or pointer size.
+ unsigned BitWidth = Ty->isIntOrIntVectorTy()
+ ? Ty->getScalarSizeInBits()
+ : DL.getPointerTypeSizeInBits(Ty->getScalarType());
+
+ if (!BitWidth)
+ return nullptr;
+
+ KnownBits Op0Known(BitWidth);
+ KnownBits Op1Known(BitWidth);
+
+ if (SimplifyDemandedBits(&I, 0,
+ getDemandedBitsLHSMask(I, BitWidth),
+ Op0Known, 0))
+ return &I;
+
+ if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
+ Op1Known, 0))
+ return &I;
+
+ // Given the known and unknown bits, compute a range that the LHS could be
+ // in. Compute the Min, Max and RHS values based on the known bits. For the
+ // EQ and NE we use unsigned values.
+ APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
+ APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
+ if (I.isSigned()) {
Op0Min = Op0Known.getSignedMinValue();
Op0Max = Op0Known.getSignedMaxValue();
Op1Min = Op1Known.getSignedMinValue();
Op1Max = Op1Known.getSignedMaxValue();
- } else {
+ } else {
Op0Min = Op0Known.getMinValue();
Op0Max = Op0Known.getMaxValue();
Op1Min = Op1Known.getMinValue();
Op1Max = Op1Known.getMaxValue();
- }
-
- // If Min and Max are known to be the same, then SimplifyDemandedBits figured
- // out that the LHS or RHS is a constant. Constant fold this now, so that
- // code below can assume that Min != Max.
- if (!isa<Constant>(Op0) && Op0Min == Op0Max)
- return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
- if (!isa<Constant>(Op1) && Op1Min == Op1Max)
- return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
-
- // Based on the range information we know about the LHS, see if we can
- // simplify this comparison. For example, (x&4) < 8 is always true.
- switch (Pred) {
- default:
- llvm_unreachable("Unknown icmp opcode!");
- case ICmpInst::ICMP_EQ:
- case ICmpInst::ICMP_NE: {
+ }
+
+ // If Min and Max are known to be the same, then SimplifyDemandedBits figured
+ // out that the LHS or RHS is a constant. Constant fold this now, so that
+ // code below can assume that Min != Max.
+ if (!isa<Constant>(Op0) && Op0Min == Op0Max)
+ return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
+ if (!isa<Constant>(Op1) && Op1Min == Op1Max)
+ return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
+
+ // Based on the range information we know about the LHS, see if we can
+ // simplify this comparison. For example, (x&4) < 8 is always true.
+ switch (Pred) {
+ default:
+ llvm_unreachable("Unknown icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE: {
if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
return replaceInstUsesWith(
I, ConstantInt::getBool(I.getType(), Pred == CmpInst::ICMP_NE));
-
- // If all bits are known zero except for one, then we know at most one bit
- // is set. If the comparison is against zero, then this is a check to see if
- // *that* bit is set.
- APInt Op0KnownZeroInverted = ~Op0Known.Zero;
- if (Op1Known.isZero()) {
- // If the LHS is an AND with the same constant, look through it.
- Value *LHS = nullptr;
- const APInt *LHSC;
- if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
- *LHSC != Op0KnownZeroInverted)
- LHS = Op0;
-
- Value *X;
- if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
- APInt ValToCheck = Op0KnownZeroInverted;
- Type *XTy = X->getType();
- if (ValToCheck.isPowerOf2()) {
- // ((1 << X) & 8) == 0 -> X != 3
- // ((1 << X) & 8) != 0 -> X == 3
- auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
- auto NewPred = ICmpInst::getInversePredicate(Pred);
- return new ICmpInst(NewPred, X, CmpC);
- } else if ((++ValToCheck).isPowerOf2()) {
- // ((1 << X) & 7) == 0 -> X >= 3
- // ((1 << X) & 7) != 0 -> X < 3
- auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
- auto NewPred =
- Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
- return new ICmpInst(NewPred, X, CmpC);
- }
- }
-
- // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
- const APInt *CI;
- if (Op0KnownZeroInverted.isOneValue() &&
- match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
- // ((8 >>u X) & 1) == 0 -> X != 3
- // ((8 >>u X) & 1) != 0 -> X == 3
- unsigned CmpVal = CI->countTrailingZeros();
- auto NewPred = ICmpInst::getInversePredicate(Pred);
- return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
- }
- }
- break;
- }
- case ICmpInst::ICMP_ULT: {
- if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- // A <u C -> A == C-1 if min(A)+1 == C
- if (*CmpC == Op0Min + 1)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC - 1));
- // X <u C --> X == 0, if the number of zero bits in the bottom of X
- // exceeds the log2 of C.
- if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Constant::getNullValue(Op1->getType()));
- }
- break;
- }
- case ICmpInst::ICMP_UGT: {
- if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- // A >u C -> A == C+1 if max(a)-1 == C
- if (*CmpC == Op0Max - 1)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC + 1));
- // X >u C --> X != 0, if the number of zero bits in the bottom of X
- // exceeds the log2 of C.
- if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
- return new ICmpInst(ICmpInst::ICMP_NE, Op0,
- Constant::getNullValue(Op1->getType()));
- }
- break;
- }
- case ICmpInst::ICMP_SLT: {
- if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(C)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(C)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC - 1));
- }
- break;
- }
- case ICmpInst::ICMP_SGT: {
- if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC + 1));
- }
- break;
- }
- case ICmpInst::ICMP_SGE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
- if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A >=s B -> A == B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- case ICmpInst::ICMP_SLE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
- if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A <=s B -> A == B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- case ICmpInst::ICMP_UGE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
- if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A >=u B -> A == B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- case ICmpInst::ICMP_ULE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
- if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A <=u B -> A == B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- }
-
- // Turn a signed comparison into an unsigned one if both operands are known to
- // have the same sign.
- if (I.isSigned() &&
- ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
- (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
- return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
-
- return nullptr;
-}
-
-llvm::Optional<std::pair<CmpInst::Predicate, Constant *>>
+
+ // If all bits are known zero except for one, then we know at most one bit
+ // is set. If the comparison is against zero, then this is a check to see if
+ // *that* bit is set.
+ APInt Op0KnownZeroInverted = ~Op0Known.Zero;
+ if (Op1Known.isZero()) {
+ // If the LHS is an AND with the same constant, look through it.
+ Value *LHS = nullptr;
+ const APInt *LHSC;
+ if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
+ *LHSC != Op0KnownZeroInverted)
+ LHS = Op0;
+
+ Value *X;
+ if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
+ APInt ValToCheck = Op0KnownZeroInverted;
+ Type *XTy = X->getType();
+ if (ValToCheck.isPowerOf2()) {
+ // ((1 << X) & 8) == 0 -> X != 3
+ // ((1 << X) & 8) != 0 -> X == 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, CmpC);
+ } else if ((++ValToCheck).isPowerOf2()) {
+ // ((1 << X) & 7) == 0 -> X >= 3
+ // ((1 << X) & 7) != 0 -> X < 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred =
+ Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, X, CmpC);
+ }
+ }
+
+ // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
+ const APInt *CI;
+ if (Op0KnownZeroInverted.isOneValue() &&
+ match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
+ // ((8 >>u X) & 1) == 0 -> X != 3
+ // ((8 >>u X) & 1) != 0 -> X == 3
+ unsigned CmpVal = CI->countTrailingZeros();
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
+ }
+ }
+ break;
+ }
+ case ICmpInst::ICMP_ULT: {
+ if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ // A <u C -> A == C-1 if min(A)+1 == C
+ if (*CmpC == Op0Min + 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC - 1));
+ // X <u C --> X == 0, if the number of zero bits in the bottom of X
+ // exceeds the log2 of C.
+ if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ Constant::getNullValue(Op1->getType()));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_UGT: {
+ if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+    if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+      // A >u C -> A == C+1 if max(A)-1 == C
+ if (*CmpC == Op0Max - 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ // X >u C --> X != 0, if the number of zero bits in the bottom of X
+ // exceeds the log2 of C.
+ if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0,
+ Constant::getNullValue(Op1->getType()));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SLT: {
+    if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+    if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC - 1));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SGT: {
+ if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
+ if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A >=s B -> A == B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ case ICmpInst::ICMP_SLE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
+ if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A <=s B -> A == B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ case ICmpInst::ICMP_UGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
+ if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A >=u B -> A == B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ case ICmpInst::ICMP_ULE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
+ if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A <=u B -> A == B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ }
+
+ // Turn a signed comparison into an unsigned one if both operands are known to
+ // have the same sign.
+ if (I.isSigned() &&
+ ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
+ (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
+ return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+
+ return nullptr;
+}
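A concrete instance of the range reasoning above, matching the "(x&4) < 8 is always true" comment (function name invented, sketch only):

    #include <cstdint>

    // Known bits of (x & 4): every bit except bit 2 is known zero, so the
    // unsigned maximum of the LHS is 4. Since max(LHS) < min(RHS), the
    // ICMP_ULT case folds the whole compare to 'true'.
    bool always_true(uint32_t x) {
      return (x & 4) < 8;
    }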
+
+llvm::Optional<std::pair<CmpInst::Predicate, Constant *>>
InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
Constant *C) {
- assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
- "Only for relational integer predicates.");
-
- Type *Type = C->getType();
- bool IsSigned = ICmpInst::isSigned(Pred);
-
- CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
- bool WillIncrement =
- UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
-
- // Check if the constant operand can be safely incremented/decremented
- // without overflowing/underflowing.
- auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
- return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
- };
-
- Constant *SafeReplacementConstant = nullptr;
- if (auto *CI = dyn_cast<ConstantInt>(C)) {
- // Bail out if the constant can't be safely incremented/decremented.
- if (!ConstantIsOk(CI))
- return llvm::None;
+ assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
+ "Only for relational integer predicates.");
+
+ Type *Type = C->getType();
+ bool IsSigned = ICmpInst::isSigned(Pred);
+
+ CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
+ bool WillIncrement =
+ UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
+
+ // Check if the constant operand can be safely incremented/decremented
+ // without overflowing/underflowing.
+ auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
+ return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
+ };
+
+ Constant *SafeReplacementConstant = nullptr;
+ if (auto *CI = dyn_cast<ConstantInt>(C)) {
+ // Bail out if the constant can't be safely incremented/decremented.
+ if (!ConstantIsOk(CI))
+ return llvm::None;
} else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) {
unsigned NumElts = FVTy->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = C->getAggregateElement(i);
- if (!Elt)
- return llvm::None;
-
- if (isa<UndefValue>(Elt))
- continue;
-
- // Bail out if we can't determine if this constant is min/max or if we
- // know that this constant is min/max.
- auto *CI = dyn_cast<ConstantInt>(Elt);
- if (!CI || !ConstantIsOk(CI))
- return llvm::None;
-
- if (!SafeReplacementConstant)
- SafeReplacementConstant = CI;
- }
- } else {
- // ConstantExpr?
- return llvm::None;
- }
-
- // It may not be safe to change a compare predicate in the presence of
- // undefined elements, so replace those elements with the first safe constant
- // that we found.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt)
+ return llvm::None;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ // Bail out if we can't determine if this constant is min/max or if we
+ // know that this constant is min/max.
+ auto *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || !ConstantIsOk(CI))
+ return llvm::None;
+
+ if (!SafeReplacementConstant)
+ SafeReplacementConstant = CI;
+ }
+ } else {
+ // ConstantExpr?
+ return llvm::None;
+ }
+
+ // It may not be safe to change a compare predicate in the presence of
+ // undefined elements, so replace those elements with the first safe constant
+ // that we found.
// TODO: in case of poison, it is safe; let's replace undefs only.
if (C->containsUndefOrPoisonElement()) {
- assert(SafeReplacementConstant && "Replacement constant not set");
- C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
- }
-
- CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
-
- // Increment or decrement the constant.
- Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
- Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
-
- return std::make_pair(NewPred, NewC);
-}
-
-/// If we have an icmp le or icmp ge instruction with a constant operand, turn
-/// it into the appropriate icmp lt or icmp gt instruction. This transform
-/// allows them to be folded in visitICmpInst.
-static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
- ICmpInst::Predicate Pred = I.getPredicate();
- if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) ||
+ assert(SafeReplacementConstant && "Replacement constant not set");
+ C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
+ }
+
+ CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
+
+ // Increment or decrement the constant.
+ Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
+ Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
+
+ return std::make_pair(NewPred, NewC);
+}
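A tiny example of the strictness flip, assuming the source-level compare below (names invented): a non-strict unsigned compare against a constant that can be incremented without wrapping becomes the strict form against the adjusted constant.

    #include <cstdint>

    // 'x <= 7' (icmp ule) can be rewritten as 'x < 8' (icmp ult); 7 is not
    // the unsigned maximum, so adding one to it cannot overflow.
    bool at_most_seven(uint32_t x) {
      return x <= 7;   // canonical form: x < 8
    }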
+
+/// If we have an icmp le or icmp ge instruction with a constant operand, turn
+/// it into the appropriate icmp lt or icmp gt instruction. This transform
+/// allows them to be folded in visitICmpInst.
+static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
+ ICmpInst::Predicate Pred = I.getPredicate();
+ if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) ||
InstCombiner::isCanonicalPredicate(Pred))
- return nullptr;
-
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- auto *Op1C = dyn_cast<Constant>(Op1);
- if (!Op1C)
- return nullptr;
-
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ auto *Op1C = dyn_cast<Constant>(Op1);
+ if (!Op1C)
+ return nullptr;
+
auto FlippedStrictness =
InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
- if (!FlippedStrictness)
- return nullptr;
-
- return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second);
-}
-
-/// If we have a comparison with a non-canonical predicate, if we can update
-/// all the users, invert the predicate and adjust all the users.
+ if (!FlippedStrictness)
+ return nullptr;
+
+ return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second);
+}
+
+/// If we have a comparison with a non-canonical predicate, if we can update
+/// all the users, invert the predicate and adjust all the users.
CmpInst *InstCombinerImpl::canonicalizeICmpPredicate(CmpInst &I) {
- // Is the predicate already canonical?
- CmpInst::Predicate Pred = I.getPredicate();
+ // Is the predicate already canonical?
+ CmpInst::Predicate Pred = I.getPredicate();
if (InstCombiner::isCanonicalPredicate(Pred))
- return nullptr;
-
- // Can all users be adjusted to predicate inversion?
+ return nullptr;
+
+ // Can all users be adjusted to predicate inversion?
if (!InstCombiner::canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr))
- return nullptr;
-
- // Ok, we can canonicalize comparison!
- // Let's first invert the comparison's predicate.
- I.setPredicate(CmpInst::getInversePredicate(Pred));
- I.setName(I.getName() + ".not");
-
+ return nullptr;
+
+  // OK, we can canonicalize the comparison!
+ // Let's first invert the comparison's predicate.
+ I.setPredicate(CmpInst::getInversePredicate(Pred));
+ I.setName(I.getName() + ".not");
+
// And, adapt users.
freelyInvertAllUsersOf(&I);
-
- return &I;
-}
-
-/// Integer compare with boolean values can always be turned into bitwise ops.
-static Instruction *canonicalizeICmpBool(ICmpInst &I,
- InstCombiner::BuilderTy &Builder) {
- Value *A = I.getOperand(0), *B = I.getOperand(1);
- assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");
-
- // A boolean compared to true/false can be simplified to Op0/true/false in
- // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
- // Cases not handled by InstSimplify are always 'not' of Op0.
- if (match(B, m_Zero())) {
- switch (I.getPredicate()) {
- case CmpInst::ICMP_EQ: // A == 0 -> !A
- case CmpInst::ICMP_ULE: // A <=u 0 -> !A
- case CmpInst::ICMP_SGE: // A >=s 0 -> !A
- return BinaryOperator::CreateNot(A);
- default:
- llvm_unreachable("ICmp i1 X, C not simplified as expected.");
- }
- } else if (match(B, m_One())) {
- switch (I.getPredicate()) {
- case CmpInst::ICMP_NE: // A != 1 -> !A
- case CmpInst::ICMP_ULT: // A <u 1 -> !A
- case CmpInst::ICMP_SGT: // A >s -1 -> !A
- return BinaryOperator::CreateNot(A);
- default:
- llvm_unreachable("ICmp i1 X, C not simplified as expected.");
- }
- }
-
- switch (I.getPredicate()) {
- default:
- llvm_unreachable("Invalid icmp instruction!");
- case ICmpInst::ICMP_EQ:
- // icmp eq i1 A, B -> ~(A ^ B)
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
- case ICmpInst::ICMP_NE:
- // icmp ne i1 A, B -> A ^ B
- return BinaryOperator::CreateXor(A, B);
-
- case ICmpInst::ICMP_UGT:
- // icmp ugt -> icmp ult
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULT:
- // icmp ult i1 A, B -> ~A & B
- return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
-
- case ICmpInst::ICMP_SGT:
- // icmp sgt -> icmp slt
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLT:
- // icmp slt i1 A, B -> A & ~B
- return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
-
- case ICmpInst::ICMP_UGE:
- // icmp uge -> icmp ule
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULE:
- // icmp ule i1 A, B -> ~A | B
- return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
-
- case ICmpInst::ICMP_SGE:
- // icmp sge -> icmp sle
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLE:
- // icmp sle i1 A, B -> A | ~B
- return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
- }
-}
-
-// Transform pattern like:
-// (1 << Y) u<= X or ~(-1 << Y) u< X or ((1 << Y)+(-1)) u< X
-// (1 << Y) u> X or ~(-1 << Y) u>= X or ((1 << Y)+(-1)) u>= X
-// Into:
-// (X l>> Y) != 0
-// (X l>> Y) == 0
-static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred, NewPred;
- Value *X, *Y;
- if (match(&Cmp,
- m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
- switch (Pred) {
- case ICmpInst::ICMP_ULE:
- NewPred = ICmpInst::ICMP_NE;
- break;
- case ICmpInst::ICMP_UGT:
- NewPred = ICmpInst::ICMP_EQ;
- break;
- default:
- return nullptr;
- }
- } else if (match(&Cmp, m_c_ICmp(Pred,
- m_OneUse(m_CombineOr(
- m_Not(m_Shl(m_AllOnes(), m_Value(Y))),
- m_Add(m_Shl(m_One(), m_Value(Y)),
- m_AllOnes()))),
- m_Value(X)))) {
- // The variant with 'add' is not canonical, (the variant with 'not' is)
- // we only get it because it has extra uses, and can't be canonicalized,
-
- switch (Pred) {
- case ICmpInst::ICMP_ULT:
- NewPred = ICmpInst::ICMP_NE;
- break;
- case ICmpInst::ICMP_UGE:
- NewPred = ICmpInst::ICMP_EQ;
- break;
- default:
- return nullptr;
- }
- } else
- return nullptr;
-
- Value *NewX = Builder.CreateLShr(X, Y, X->getName() + ".highbits");
- Constant *Zero = Constant::getNullValue(NewX->getType());
- return CmpInst::Create(Instruction::ICmp, NewPred, NewX, Zero);
-}
-
-static Instruction *foldVectorCmp(CmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
- const CmpInst::Predicate Pred = Cmp.getPredicate();
- Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1);
- Value *V1, *V2;
- ArrayRef<int> M;
- if (!match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(M))))
- return nullptr;
-
- // If both arguments of the cmp are shuffles that use the same mask and
- // shuffle within a single vector, move the shuffle after the cmp:
- // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
- Type *V1Ty = V1->getType();
- if (match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(M))) &&
- V1Ty == V2->getType() && (LHS->hasOneUse() || RHS->hasOneUse())) {
- Value *NewCmp = Builder.CreateCmp(Pred, V1, V2);
- return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
- }
-
- // Try to canonicalize compare with splatted operand and splat constant.
- // TODO: We could generalize this for more than splats. See/use the code in
- // InstCombiner::foldVectorBinop().
- Constant *C;
- if (!LHS->hasOneUse() || !match(RHS, m_Constant(C)))
- return nullptr;
-
- // Length-changing splats are ok, so adjust the constants as needed:
- // cmp (shuffle V1, M), C --> shuffle (cmp V1, C'), M
- Constant *ScalarC = C->getSplatValue(/* AllowUndefs */ true);
- int MaskSplatIndex;
- if (ScalarC && match(M, m_SplatOrUndefMask(MaskSplatIndex))) {
- // We allow undefs in matching, but this transform removes those for safety.
- // Demanded elements analysis should be able to recover some/all of that.
- C = ConstantVector::getSplat(cast<VectorType>(V1Ty)->getElementCount(),
- ScalarC);
- SmallVector<int, 8> NewM(M.size(), MaskSplatIndex);
- Value *NewCmp = Builder.CreateCmp(Pred, V1, C);
- return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()),
- NewM);
- }
-
- return nullptr;
-}
-
-// extract(uadd.with.overflow(A, B), 0) ult A
-// -> extract(uadd.with.overflow(A, B), 1)
-static Instruction *foldICmpOfUAddOv(ICmpInst &I) {
- CmpInst::Predicate Pred = I.getPredicate();
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- Value *UAddOv;
- Value *A, *B;
- auto UAddOvResultPat = m_ExtractValue<0>(
- m_Intrinsic<Intrinsic::uadd_with_overflow>(m_Value(A), m_Value(B)));
- if (match(Op0, UAddOvResultPat) &&
- ((Pred == ICmpInst::ICMP_ULT && (Op1 == A || Op1 == B)) ||
- (Pred == ICmpInst::ICMP_EQ && match(Op1, m_ZeroInt()) &&
- (match(A, m_One()) || match(B, m_One()))) ||
- (Pred == ICmpInst::ICMP_NE && match(Op1, m_AllOnes()) &&
- (match(A, m_AllOnes()) || match(B, m_AllOnes())))))
- // extract(uadd.with.overflow(A, B), 0) < A
- // extract(uadd.with.overflow(A, 1), 0) == 0
- // extract(uadd.with.overflow(A, -1), 0) != -1
- UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand();
- else if (match(Op1, UAddOvResultPat) &&
- Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B))
- // A > extract(uadd.with.overflow(A, B), 0)
- UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand();
- else
- return nullptr;
-
- return ExtractValueInst::Create(UAddOv, 1);
-}
-
+
+ return &I;
+}
+
+/// Integer compare with boolean values can always be turned into bitwise ops.
+static Instruction *canonicalizeICmpBool(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *A = I.getOperand(0), *B = I.getOperand(1);
+ assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");
+
+ // A boolean compared to true/false can be simplified to Op0/true/false in
+ // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
+ // Cases not handled by InstSimplify are always 'not' of Op0.
+ if (match(B, m_Zero())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_EQ: // A == 0 -> !A
+ case CmpInst::ICMP_ULE: // A <=u 0 -> !A
+ case CmpInst::ICMP_SGE: // A >=s 0 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ } else if (match(B, m_One())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_NE: // A != 1 -> !A
+ case CmpInst::ICMP_ULT: // A <u 1 -> !A
+ case CmpInst::ICMP_SGT: // A >s -1 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ }
+
+ switch (I.getPredicate()) {
+ default:
+ llvm_unreachable("Invalid icmp instruction!");
+ case ICmpInst::ICMP_EQ:
+ // icmp eq i1 A, B -> ~(A ^ B)
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ case ICmpInst::ICMP_NE:
+ // icmp ne i1 A, B -> A ^ B
+ return BinaryOperator::CreateXor(A, B);
+
+ case ICmpInst::ICMP_UGT:
+ // icmp ugt -> icmp ult
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULT:
+ // icmp ult i1 A, B -> ~A & B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGT:
+ // icmp sgt -> icmp slt
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLT:
+ // icmp slt i1 A, B -> A & ~B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
+
+ case ICmpInst::ICMP_UGE:
+ // icmp uge -> icmp ule
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULE:
+ // icmp ule i1 A, B -> ~A | B
+ return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGE:
+ // icmp sge -> icmp sle
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLE:
+ // icmp sle i1 A, B -> A | ~B
+ return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
+ }
+}
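A source-level counterpart of the i1 rewrites above (names invented, minimal sketch): equality of two booleans becomes pure bit logic.

    // 'a == b' on i1 values is ~(a ^ b); 'a != b' is simply a ^ b.
    bool bools_equal(bool a, bool b) {
      return !(a ^ b);   // what an equality compare of two bools becomes
    }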
+
+// Transform pattern like:
+// (1 << Y) u<= X or ~(-1 << Y) u< X or ((1 << Y)+(-1)) u< X
+// (1 << Y) u> X or ~(-1 << Y) u>= X or ((1 << Y)+(-1)) u>= X
+// Into:
+// (X l>> Y) != 0
+// (X l>> Y) == 0
+static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred, NewPred;
+ Value *X, *Y;
+ if (match(&Cmp,
+ m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
+ switch (Pred) {
+ case ICmpInst::ICMP_ULE:
+ NewPred = ICmpInst::ICMP_NE;
+ break;
+ case ICmpInst::ICMP_UGT:
+ NewPred = ICmpInst::ICMP_EQ;
+ break;
+ default:
+ return nullptr;
+ }
+ } else if (match(&Cmp, m_c_ICmp(Pred,
+ m_OneUse(m_CombineOr(
+ m_Not(m_Shl(m_AllOnes(), m_Value(Y))),
+ m_Add(m_Shl(m_One(), m_Value(Y)),
+ m_AllOnes()))),
+ m_Value(X)))) {
+    // The variant with 'add' is not canonical (the variant with 'not' is);
+    // we only get it here because it has extra uses and can't be canonicalized.
+
+ switch (Pred) {
+ case ICmpInst::ICMP_ULT:
+ NewPred = ICmpInst::ICMP_NE;
+ break;
+ case ICmpInst::ICMP_UGE:
+ NewPred = ICmpInst::ICMP_EQ;
+ break;
+ default:
+ return nullptr;
+ }
+ } else
+ return nullptr;
+
+ Value *NewX = Builder.CreateLShr(X, Y, X->getName() + ".highbits");
+ Constant *Zero = Constant::getNullValue(NewX->getType());
+ return CmpInst::Create(Instruction::ICmp, NewPred, NewX, Zero);
+}
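+// A hedged example of the transform above (IR names are invented):
+//   %m = shl i32 1, %y
+//   %c = icmp ule i32 %m, %x          ; (1 << Y) u<= X
+// becomes
+//   %x.highbits = lshr i32 %x, %y
+//   %c = icmp ne i32 %x.highbits, 0   ; X u>= (1 << Y)  <=>  (X l>> Y) != 0
+// because X u>= 2^Y holds exactly when some bit of X at position >= Y is set.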
+
+static Instruction *foldVectorCmp(CmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ const CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1);
+ Value *V1, *V2;
+ ArrayRef<int> M;
+ if (!match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(M))))
+ return nullptr;
+
+ // If both arguments of the cmp are shuffles that use the same mask and
+ // shuffle within a single vector, move the shuffle after the cmp:
+ // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
+ Type *V1Ty = V1->getType();
+ if (match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(M))) &&
+ V1Ty == V2->getType() && (LHS->hasOneUse() || RHS->hasOneUse())) {
+ Value *NewCmp = Builder.CreateCmp(Pred, V1, V2);
+ return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
+ }
+
+ // Try to canonicalize compare with splatted operand and splat constant.
+ // TODO: We could generalize this for more than splats. See/use the code in
+ // InstCombiner::foldVectorBinop().
+ Constant *C;
+ if (!LHS->hasOneUse() || !match(RHS, m_Constant(C)))
+ return nullptr;
+
+ // Length-changing splats are ok, so adjust the constants as needed:
+ // cmp (shuffle V1, M), C --> shuffle (cmp V1, C'), M
+ Constant *ScalarC = C->getSplatValue(/* AllowUndefs */ true);
+ int MaskSplatIndex;
+ if (ScalarC && match(M, m_SplatOrUndefMask(MaskSplatIndex))) {
+ // We allow undefs in matching, but this transform removes those for safety.
+ // Demanded elements analysis should be able to recover some/all of that.
+ C = ConstantVector::getSplat(cast<VectorType>(V1Ty)->getElementCount(),
+ ScalarC);
+ SmallVector<int, 8> NewM(M.size(), MaskSplatIndex);
+ Value *NewCmp = Builder.CreateCmp(Pred, V1, C);
+ return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()),
+ NewM);
+ }
+
+ return nullptr;
+}
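+// Rough IR sketch of the first fold above (illustrative only):
+//   %s1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+//   %s2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+//   %c  = icmp eq <4 x i32> %s1, %s2
+// is rewritten to compare the unshuffled operands and shuffle the result:
+//   %c0 = icmp eq <4 x i32> %v1, %v2
+//   %c  = shufflevector <4 x i1> %c0, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// This is valid because the identical mask selects the same lanes on both sides.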
+
+// extract(uadd.with.overflow(A, B), 0) ult A
+// -> extract(uadd.with.overflow(A, B), 1)
+static Instruction *foldICmpOfUAddOv(ICmpInst &I) {
+ CmpInst::Predicate Pred = I.getPredicate();
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ Value *UAddOv;
+ Value *A, *B;
+ auto UAddOvResultPat = m_ExtractValue<0>(
+ m_Intrinsic<Intrinsic::uadd_with_overflow>(m_Value(A), m_Value(B)));
+ if (match(Op0, UAddOvResultPat) &&
+ ((Pred == ICmpInst::ICMP_ULT && (Op1 == A || Op1 == B)) ||
+ (Pred == ICmpInst::ICMP_EQ && match(Op1, m_ZeroInt()) &&
+ (match(A, m_One()) || match(B, m_One()))) ||
+ (Pred == ICmpInst::ICMP_NE && match(Op1, m_AllOnes()) &&
+ (match(A, m_AllOnes()) || match(B, m_AllOnes())))))
+ // extract(uadd.with.overflow(A, B), 0) < A
+ // extract(uadd.with.overflow(A, 1), 0) == 0
+ // extract(uadd.with.overflow(A, -1), 0) != -1
+ UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand();
+ else if (match(Op1, UAddOvResultPat) &&
+ Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B))
+ // A > extract(uadd.with.overflow(A, B), 0)
+ UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand();
+ else
+ return nullptr;
+
+ return ExtractValueInst::Create(UAddOv, 1);
+}
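+// Sketch of the first pattern above (not part of the original file):
+//   %agg = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+//   %sum = extractvalue { i32, i1 } %agg, 0
+//   %cmp = icmp ult i32 %sum, %a      ; a wrapped sum is u< either operand
+// -->
+//   %cmp = extractvalue { i32, i1 } %agg, 1
+// because an unsigned add overflows exactly when the truncated sum is u< A.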
+
Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
- bool Changed = false;
- const SimplifyQuery Q = SQ.getWithInstruction(&I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- unsigned Op0Cplxity = getComplexity(Op0);
- unsigned Op1Cplxity = getComplexity(Op1);
-
- /// Orders the operands of the compare so that they are listed from most
- /// complex to least complex. This puts constants before unary operators,
- /// before binary operators.
- if (Op0Cplxity < Op1Cplxity ||
- (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
- I.swapOperands();
- std::swap(Op0, Op1);
- Changed = true;
- }
-
- if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q))
- return replaceInstUsesWith(I, V);
-
- // Comparing -val or val with non-zero is the same as just comparing val
- // ie, abs(val) != 0 -> val != 0
- if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
- Value *Cond, *SelectTrue, *SelectFalse;
- if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
- m_Value(SelectFalse)))) {
- if (Value *V = dyn_castNegVal(SelectTrue)) {
- if (V == SelectFalse)
- return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
- }
- else if (Value *V = dyn_castNegVal(SelectFalse)) {
- if (V == SelectTrue)
- return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
- }
- }
- }
-
- if (Op0->getType()->isIntOrIntVectorTy(1))
- if (Instruction *Res = canonicalizeICmpBool(I, Builder))
- return Res;
-
- if (Instruction *Res = canonicalizeCmpWithConstant(I))
- return Res;
-
- if (Instruction *Res = canonicalizeICmpPredicate(I))
- return Res;
-
- if (Instruction *Res = foldICmpWithConstant(I))
- return Res;
-
- if (Instruction *Res = foldICmpWithDominatingICmp(I))
- return Res;
-
- if (Instruction *Res = foldICmpBinOp(I, Q))
- return Res;
-
- if (Instruction *Res = foldICmpUsingKnownBits(I))
- return Res;
-
- // Test if the ICmpInst instruction is used exclusively by a select as
- // part of a minimum or maximum operation. If so, refrain from doing
- // any other folding. This helps out other analyses which understand
- // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
- // and CodeGen. And in this case, at least one of the comparison
- // operands has at least one user besides the compare (the select),
- // which would often largely negate the benefit of folding anyway.
- //
- // Do the same for the other patterns recognized by matchSelectPattern.
- if (I.hasOneUse())
- if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
- Value *A, *B;
- SelectPatternResult SPR = matchSelectPattern(SI, A, B);
- if (SPR.Flavor != SPF_UNKNOWN)
- return nullptr;
- }
-
- // Do this after checking for min/max to prevent infinite looping.
- if (Instruction *Res = foldICmpWithZero(I))
- return Res;
-
- // FIXME: We only do this after checking for min/max to prevent infinite
- // looping caused by a reverse canonicalization of these patterns for min/max.
- // FIXME: The organization of folds is a mess. These would naturally go into
- // canonicalizeCmpWithConstant(), but we can't move all of the above folds
- // down here after the min/max restriction.
- ICmpInst::Predicate Pred = I.getPredicate();
- const APInt *C;
- if (match(Op1, m_APInt(C))) {
- // For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set
- if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
- Constant *Zero = Constant::getNullValue(Op0->getType());
- return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
- }
-
- // For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear
- if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
- Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
- return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
- }
- }
-
- if (Instruction *Res = foldICmpInstWithConstant(I))
- return Res;
-
- // Try to match comparison as a sign bit test. Intentionally do this after
- // foldICmpInstWithConstant() to potentially let other folds to happen first.
- if (Instruction *New = foldSignBitTest(I))
- return New;
-
- if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
- return Res;
-
- // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
- if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
- return NI;
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
- if (Instruction *NI = foldGEPICmp(GEP, Op0,
- ICmpInst::getSwappedPredicate(I.getPredicate()), I))
- return NI;
-
- // Try to optimize equality comparisons against alloca-based pointers.
- if (Op0->getType()->isPointerTy() && I.isEquality()) {
- assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
+ bool Changed = false;
+ const SimplifyQuery Q = SQ.getWithInstruction(&I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ unsigned Op0Cplxity = getComplexity(Op0);
+ unsigned Op1Cplxity = getComplexity(Op1);
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. This puts constants before unary operators,
+ /// before binary operators.
+ if (Op0Cplxity < Op1Cplxity ||
+ (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
+ I.swapOperands();
+ std::swap(Op0, Op1);
+ Changed = true;
+ }
+
+ if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q))
+ return replaceInstUsesWith(I, V);
+
+  // Comparing -val or val with non-zero is the same as just comparing val,
+  // i.e., abs(val) != 0 -> val != 0
+ if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
+ Value *Cond, *SelectTrue, *SelectFalse;
+ if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
+ m_Value(SelectFalse)))) {
+ if (Value *V = dyn_castNegVal(SelectTrue)) {
+ if (V == SelectFalse)
+ return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+ }
+ else if (Value *V = dyn_castNegVal(SelectFalse)) {
+ if (V == SelectTrue)
+ return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+ }
+ }
+ }
+
+ if (Op0->getType()->isIntOrIntVectorTy(1))
+ if (Instruction *Res = canonicalizeICmpBool(I, Builder))
+ return Res;
+
+ if (Instruction *Res = canonicalizeCmpWithConstant(I))
+ return Res;
+
+ if (Instruction *Res = canonicalizeICmpPredicate(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpWithConstant(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpWithDominatingICmp(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpBinOp(I, Q))
+ return Res;
+
+ if (Instruction *Res = foldICmpUsingKnownBits(I))
+ return Res;
+
+ // Test if the ICmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ //
+ // Do the same for the other patterns recognized by matchSelectPattern.
+ if (I.hasOneUse())
+ if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
+ Value *A, *B;
+ SelectPatternResult SPR = matchSelectPattern(SI, A, B);
+ if (SPR.Flavor != SPF_UNKNOWN)
+ return nullptr;
+ }
+
+ // Do this after checking for min/max to prevent infinite looping.
+ if (Instruction *Res = foldICmpWithZero(I))
+ return Res;
+
+ // FIXME: We only do this after checking for min/max to prevent infinite
+ // looping caused by a reverse canonicalization of these patterns for min/max.
+ // FIXME: The organization of folds is a mess. These would naturally go into
+ // canonicalizeCmpWithConstant(), but we can't move all of the above folds
+ // down here after the min/max restriction.
+ ICmpInst::Predicate Pred = I.getPredicate();
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ // For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set
+ if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
+ Constant *Zero = Constant::getNullValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
+ }
+
+ // For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear
+ if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
+ Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
+ }
+ }
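+  // (Illustration, not from the upstream source: with i8 operands the fold
+  // above turns x u> 127 into x s< 0, since the values 128..255 are exactly
+  // those with the sign bit set; x u< 128 likewise becomes x s> -1.)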
+
+ if (Instruction *Res = foldICmpInstWithConstant(I))
+ return Res;
+
+ // Try to match comparison as a sign bit test. Intentionally do this after
+  // foldICmpInstWithConstant() to potentially let other folds happen first.
+ if (Instruction *New = foldSignBitTest(I))
+ return New;
+
+ if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
+ return Res;
+
+ // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
+ if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
+ return NI;
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
+ if (Instruction *NI = foldGEPICmp(GEP, Op0,
+ ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+ return NI;
+
+ // Try to optimize equality comparisons against alloca-based pointers.
+ if (Op0->getType()->isPointerTy() && I.isEquality()) {
+ assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(Op0)))
- if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
- return New;
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
+ return New;
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(Op1)))
- if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
- return New;
- }
-
- if (Instruction *Res = foldICmpBitCast(I, Builder))
- return Res;
-
- // TODO: Hoist this above the min/max bailout.
- if (Instruction *R = foldICmpWithCastOp(I))
- return R;
-
- if (Instruction *Res = foldICmpWithMinMax(I))
- return Res;
-
- {
- Value *A, *B;
- // Transform (A & ~B) == 0 --> (A & B) != 0
- // and (A & ~B) != 0 --> (A & B) == 0
- // if A is a power of 2.
- if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_Zero()) &&
- isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
- return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
- Op1);
-
- // ~X < ~Y --> Y < X
- // ~X < C --> X > ~C
- if (match(Op0, m_Not(m_Value(A)))) {
- if (match(Op1, m_Not(m_Value(B))))
- return new ICmpInst(I.getPredicate(), B, A);
-
- const APInt *C;
- if (match(Op1, m_APInt(C)))
- return new ICmpInst(I.getSwappedPredicate(), A,
- ConstantInt::get(Op1->getType(), ~(*C)));
- }
-
- Instruction *AddI = nullptr;
- if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B),
- m_Instruction(AddI))) &&
- isa<IntegerType>(A->getType())) {
- Value *Result;
- Constant *Overflow;
- // m_UAddWithOverflow can match patterns that do not include an explicit
- // "add" instruction, so check the opcode of the matched op.
- if (AddI->getOpcode() == Instruction::Add &&
- OptimizeOverflowCheck(Instruction::Add, /*Signed*/ false, A, B, *AddI,
- Result, Overflow)) {
- replaceInstUsesWith(*AddI, Result);
- eraseInstFromFunction(*AddI);
- return replaceInstUsesWith(I, Overflow);
- }
- }
-
- // (zext a) * (zext b) --> llvm.umul.with.overflow.
- if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
- if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
- return R;
- }
- if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
- if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
- return R;
- }
- }
-
- if (Instruction *Res = foldICmpEquality(I))
- return Res;
-
- if (Instruction *Res = foldICmpOfUAddOv(I))
- return Res;
-
- // The 'cmpxchg' instruction returns an aggregate containing the old value and
- // an i1 which indicates whether or not we successfully did the swap.
- //
- // Replace comparisons between the old value and the expected value with the
- // indicator that 'cmpxchg' returns.
- //
- // N.B. This transform is only valid when the 'cmpxchg' is not permitted to
- // spuriously fail. In those cases, the old value may equal the expected
- // value but it is possible for the swap to not occur.
- if (I.getPredicate() == ICmpInst::ICMP_EQ)
- if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
- if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
- if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
- !ACXI->isWeak())
- return ExtractValueInst::Create(ACXI, 1);
-
- {
- Value *X;
- const APInt *C;
- // icmp X+Cst, X
- if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
- return foldICmpAddOpConst(X, *C, I.getPredicate());
-
- // icmp X, X+Cst
- if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X)
- return foldICmpAddOpConst(X, *C, I.getSwappedPredicate());
- }
-
- if (Instruction *Res = foldICmpWithHighBitMask(I, Builder))
- return Res;
-
- if (I.getType()->isVectorTy())
- if (Instruction *Res = foldVectorCmp(I, Builder))
- return Res;
-
- return Changed ? &I : nullptr;
-}
-
-/// Fold fcmp ([us]itofp x, cst) if possible.
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
+ return New;
+ }
+
+ if (Instruction *Res = foldICmpBitCast(I, Builder))
+ return Res;
+
+ // TODO: Hoist this above the min/max bailout.
+ if (Instruction *R = foldICmpWithCastOp(I))
+ return R;
+
+ if (Instruction *Res = foldICmpWithMinMax(I))
+ return Res;
+
+ {
+ Value *A, *B;
+ // Transform (A & ~B) == 0 --> (A & B) != 0
+ // and (A & ~B) != 0 --> (A & B) == 0
+ // if A is a power of 2.
+ if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_Zero()) &&
+ isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
+ return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
+ Op1);
+
+ // ~X < ~Y --> Y < X
+ // ~X < C --> X > ~C
+ if (match(Op0, m_Not(m_Value(A)))) {
+ if (match(Op1, m_Not(m_Value(B))))
+ return new ICmpInst(I.getPredicate(), B, A);
+
+ const APInt *C;
+ if (match(Op1, m_APInt(C)))
+ return new ICmpInst(I.getSwappedPredicate(), A,
+ ConstantInt::get(Op1->getType(), ~(*C)));
+ }
+
+ Instruction *AddI = nullptr;
+ if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B),
+ m_Instruction(AddI))) &&
+ isa<IntegerType>(A->getType())) {
+ Value *Result;
+ Constant *Overflow;
+ // m_UAddWithOverflow can match patterns that do not include an explicit
+ // "add" instruction, so check the opcode of the matched op.
+ if (AddI->getOpcode() == Instruction::Add &&
+ OptimizeOverflowCheck(Instruction::Add, /*Signed*/ false, A, B, *AddI,
+ Result, Overflow)) {
+ replaceInstUsesWith(*AddI, Result);
+ eraseInstFromFunction(*AddI);
+ return replaceInstUsesWith(I, Overflow);
+ }
+ }
+
+ // (zext a) * (zext b) --> llvm.umul.with.overflow.
+ if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
+ if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
+ return R;
+ }
+ if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
+ if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
+ return R;
+ }
+ }
+
+ if (Instruction *Res = foldICmpEquality(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpOfUAddOv(I))
+ return Res;
+
+ // The 'cmpxchg' instruction returns an aggregate containing the old value and
+ // an i1 which indicates whether or not we successfully did the swap.
+ //
+ // Replace comparisons between the old value and the expected value with the
+ // indicator that 'cmpxchg' returns.
+ //
+ // N.B. This transform is only valid when the 'cmpxchg' is not permitted to
+ // spuriously fail. In those cases, the old value may equal the expected
+ // value but it is possible for the swap to not occur.
+ if (I.getPredicate() == ICmpInst::ICMP_EQ)
+ if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
+ if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
+ if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
+ !ACXI->isWeak())
+ return ExtractValueInst::Create(ACXI, 1);
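+  // An IR sketch of that cmpxchg fold (hypothetical values; strong cmpxchg only):
+  //   %pair = cmpxchg i32* %p, i32 %expected, i32 %new seq_cst seq_cst
+  //   %old  = extractvalue { i32, i1 } %pair, 0
+  //   %eq   = icmp eq i32 %old, %expected
+  // -->
+  //   %eq   = extractvalue { i32, i1 } %pair, 1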
+
+ {
+ Value *X;
+ const APInt *C;
+ // icmp X+Cst, X
+ if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
+ return foldICmpAddOpConst(X, *C, I.getPredicate());
+
+ // icmp X, X+Cst
+ if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X)
+ return foldICmpAddOpConst(X, *C, I.getSwappedPredicate());
+ }
+
+ if (Instruction *Res = foldICmpWithHighBitMask(I, Builder))
+ return Res;
+
+ if (I.getType()->isVectorTy())
+ if (Instruction *Res = foldVectorCmp(I, Builder))
+ return Res;
+
+ return Changed ? &I : nullptr;
+}
+
+/// Fold fcmp ([us]itofp x, cst) if possible.
Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I,
Instruction *LHSI,
Constant *RHSC) {
- if (!isa<ConstantFP>(RHSC)) return nullptr;
- const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
-
- // Get the width of the mantissa. We don't want to hack on conversions that
- // might lose information from the integer, e.g. "i64 -> float"
- int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
- if (MantissaWidth == -1) return nullptr; // Unknown.
-
- IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
-
- bool LHSUnsigned = isa<UIToFPInst>(LHSI);
-
- if (I.isEquality()) {
- FCmpInst::Predicate P = I.getPredicate();
- bool IsExact = false;
- APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
- RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);
-
- // If the floating point constant isn't an integer value, we know if we will
- // ever compare equal / not equal to it.
- if (!IsExact) {
- // TODO: Can never be -0.0 and other non-representable values
- APFloat RHSRoundInt(RHS);
- RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
- if (RHS != RHSRoundInt) {
- if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
- return replaceInstUsesWith(I, Builder.getFalse());
-
- assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
- return replaceInstUsesWith(I, Builder.getTrue());
- }
- }
-
- // TODO: If the constant is exactly representable, is it always OK to do
- // equality compares as integer?
- }
-
- // Check to see that the input is converted from an integer type that is small
- // enough that preserves all bits. TODO: check here for "known" sign bits.
- // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
- unsigned InputSize = IntTy->getScalarSizeInBits();
-
- // Following test does NOT adjust InputSize downwards for signed inputs,
- // because the most negative value still requires all the mantissa bits
- // to distinguish it from one less than that value.
- if ((int)InputSize > MantissaWidth) {
- // Conversion would lose accuracy. Check if loss can impact comparison.
- int Exp = ilogb(RHS);
- if (Exp == APFloat::IEK_Inf) {
- int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
- if (MaxExponent < (int)InputSize - !LHSUnsigned)
- // Conversion could create infinity.
- return nullptr;
- } else {
- // Note that if RHS is zero or NaN, then Exp is negative
- // and first condition is trivially false.
- if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
- // Conversion could affect comparison.
- return nullptr;
- }
- }
-
- // Otherwise, we can potentially simplify the comparison. We know that it
- // will always come through as an integer value and we know the constant is
- // not a NAN (it would have been previously simplified).
- assert(!RHS.isNaN() && "NaN comparison not already folded!");
-
- ICmpInst::Predicate Pred;
- switch (I.getPredicate()) {
- default: llvm_unreachable("Unexpected predicate!");
- case FCmpInst::FCMP_UEQ:
- case FCmpInst::FCMP_OEQ:
- Pred = ICmpInst::ICMP_EQ;
- break;
- case FCmpInst::FCMP_UGT:
- case FCmpInst::FCMP_OGT:
- Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
- break;
- case FCmpInst::FCMP_UGE:
- case FCmpInst::FCMP_OGE:
- Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
- break;
- case FCmpInst::FCMP_ULT:
- case FCmpInst::FCMP_OLT:
- Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
- break;
- case FCmpInst::FCMP_ULE:
- case FCmpInst::FCMP_OLE:
- Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
- break;
- case FCmpInst::FCMP_UNE:
- case FCmpInst::FCMP_ONE:
- Pred = ICmpInst::ICMP_NE;
- break;
- case FCmpInst::FCMP_ORD:
- return replaceInstUsesWith(I, Builder.getTrue());
- case FCmpInst::FCMP_UNO:
- return replaceInstUsesWith(I, Builder.getFalse());
- }
-
- // Now we know that the APFloat is a normal number, zero or inf.
-
- // See if the FP constant is too large for the integer. For example,
- // comparing an i8 to 300.0.
- unsigned IntWidth = IntTy->getScalarSizeInBits();
-
- if (!LHSUnsigned) {
- // If the RHS value is > SignedMax, fold the comparison. This handles +INF
- // and large values.
- APFloat SMax(RHS.getSemantics());
- SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
- APFloat::rmNearestTiesToEven);
- if (SMax < RHS) { // smax < 13123.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
- Pred == ICmpInst::ICMP_SLE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- } else {
- // If the RHS value is > UnsignedMax, fold the comparison. This handles
- // +INF and large values.
- APFloat UMax(RHS.getSemantics());
- UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
- APFloat::rmNearestTiesToEven);
- if (UMax < RHS) { // umax < 13123.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
- Pred == ICmpInst::ICMP_ULE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- }
-
- if (!LHSUnsigned) {
- // See if the RHS value is < SignedMin.
- APFloat SMin(RHS.getSemantics());
- SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
- APFloat::rmNearestTiesToEven);
- if (SMin > RHS) { // smin > 12312.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
- Pred == ICmpInst::ICMP_SGE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- } else {
- // See if the RHS value is < UnsignedMin.
- APFloat UMin(RHS.getSemantics());
- UMin.convertFromAPInt(APInt::getMinValue(IntWidth), false,
- APFloat::rmNearestTiesToEven);
- if (UMin > RHS) { // umin > 12312.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
- Pred == ICmpInst::ICMP_UGE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- }
-
- // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
- // [0, UMAX], but it may still be fractional. See if it is fractional by
- // casting the FP value to the integer value and back, checking for equality.
- // Don't do this for zero, because -0.0 is not fractional.
- Constant *RHSInt = LHSUnsigned
- ? ConstantExpr::getFPToUI(RHSC, IntTy)
- : ConstantExpr::getFPToSI(RHSC, IntTy);
- if (!RHS.isZero()) {
- bool Equal = LHSUnsigned
- ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
- : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
- if (!Equal) {
- // If we had a comparison against a fractional value, we have to adjust
- // the compare predicate and sometimes the value. RHSC is rounded towards
- // zero at this point.
- switch (Pred) {
- default: llvm_unreachable("Unexpected integer comparison!");
- case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
- return replaceInstUsesWith(I, Builder.getTrue());
- case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
- return replaceInstUsesWith(I, Builder.getFalse());
- case ICmpInst::ICMP_ULE:
- // (float)int <= 4.4 --> int <= 4
- // (float)int <= -4.4 --> false
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getFalse());
- break;
- case ICmpInst::ICMP_SLE:
- // (float)int <= 4.4 --> int <= 4
- // (float)int <= -4.4 --> int < -4
- if (RHS.isNegative())
- Pred = ICmpInst::ICMP_SLT;
- break;
- case ICmpInst::ICMP_ULT:
- // (float)int < -4.4 --> false
- // (float)int < 4.4 --> int <= 4
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getFalse());
- Pred = ICmpInst::ICMP_ULE;
- break;
- case ICmpInst::ICMP_SLT:
- // (float)int < -4.4 --> int < -4
- // (float)int < 4.4 --> int <= 4
- if (!RHS.isNegative())
- Pred = ICmpInst::ICMP_SLE;
- break;
- case ICmpInst::ICMP_UGT:
- // (float)int > 4.4 --> int > 4
- // (float)int > -4.4 --> true
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getTrue());
- break;
- case ICmpInst::ICMP_SGT:
- // (float)int > 4.4 --> int > 4
- // (float)int > -4.4 --> int >= -4
- if (RHS.isNegative())
- Pred = ICmpInst::ICMP_SGE;
- break;
- case ICmpInst::ICMP_UGE:
- // (float)int >= -4.4 --> true
- // (float)int >= 4.4 --> int > 4
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getTrue());
- Pred = ICmpInst::ICMP_UGT;
- break;
- case ICmpInst::ICMP_SGE:
- // (float)int >= -4.4 --> int >= -4
- // (float)int >= 4.4 --> int > 4
- if (!RHS.isNegative())
- Pred = ICmpInst::ICMP_SGT;
- break;
- }
- }
- }
-
- // Lower this FP comparison into an appropriate integer version of the
- // comparison.
- return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
-}
-
-/// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary.
-static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
- Constant *RHSC) {
- // When C is not 0.0 and infinities are not allowed:
- // (C / X) < 0.0 is a sign-bit test of X
- // (C / X) < 0.0 --> X < 0.0 (if C is positive)
- // (C / X) < 0.0 --> X > 0.0 (if C is negative, swap the predicate)
- //
- // Proof:
- // Multiply (C / X) < 0.0 by X * X / C.
- // - X is non zero, if it is the flag 'ninf' is violated.
- // - C defines the sign of X * X * C. Thus it also defines whether to swap
- // the predicate. C is also non zero by definition.
- //
- // Thus X * X / C is non zero and the transformation is valid. [qed]
-
- FCmpInst::Predicate Pred = I.getPredicate();
-
- // Check that predicates are valid.
- if ((Pred != FCmpInst::FCMP_OGT) && (Pred != FCmpInst::FCMP_OLT) &&
- (Pred != FCmpInst::FCMP_OGE) && (Pred != FCmpInst::FCMP_OLE))
- return nullptr;
-
- // Check that RHS operand is zero.
- if (!match(RHSC, m_AnyZeroFP()))
- return nullptr;
-
- // Check fastmath flags ('ninf').
- if (!LHSI->hasNoInfs() || !I.hasNoInfs())
- return nullptr;
-
- // Check the properties of the dividend. It must not be zero to avoid a
- // division by zero (see Proof).
- const APFloat *C;
- if (!match(LHSI->getOperand(0), m_APFloat(C)))
- return nullptr;
-
- if (C->isZero())
- return nullptr;
-
- // Get swapped predicate if necessary.
- if (C->isNegative())
- Pred = I.getSwappedPredicate();
-
- return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
-}
-
-/// Optimize fabs(X) compared with zero.
+ if (!isa<ConstantFP>(RHSC)) return nullptr;
+ const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
+
+ // Get the width of the mantissa. We don't want to hack on conversions that
+ // might lose information from the integer, e.g. "i64 -> float"
+ int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
+ if (MantissaWidth == -1) return nullptr; // Unknown.
+
+ IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+
+ bool LHSUnsigned = isa<UIToFPInst>(LHSI);
+
+ if (I.isEquality()) {
+ FCmpInst::Predicate P = I.getPredicate();
+ bool IsExact = false;
+ APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
+ RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);
+
+    // If the floating point constant isn't an integer value, we already know
+    // the result of any equality / inequality comparison against it.
+ if (!IsExact) {
+ // TODO: Can never be -0.0 and other non-representable values
+ APFloat RHSRoundInt(RHS);
+ RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (RHS != RHSRoundInt) {
+ if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
+ return replaceInstUsesWith(I, Builder.getFalse());
+
+ assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
+ return replaceInstUsesWith(I, Builder.getTrue());
+ }
+ }
+
+ // TODO: If the constant is exactly representable, is it always OK to do
+ // equality compares as integer?
+ }
+
+  // Check that the input is converted from an integer type small enough that
+  // the conversion preserves all bits. TODO: check here for "known" sign bits.
+  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64,
+  // for example.
+ unsigned InputSize = IntTy->getScalarSizeInBits();
+
+ // Following test does NOT adjust InputSize downwards for signed inputs,
+ // because the most negative value still requires all the mantissa bits
+ // to distinguish it from one less than that value.
+ if ((int)InputSize > MantissaWidth) {
+ // Conversion would lose accuracy. Check if loss can impact comparison.
+ int Exp = ilogb(RHS);
+ if (Exp == APFloat::IEK_Inf) {
+ int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
+ if (MaxExponent < (int)InputSize - !LHSUnsigned)
+ // Conversion could create infinity.
+ return nullptr;
+ } else {
+      // Note that if RHS is zero or NaN, then Exp is negative
+      // and the first condition is trivially false.
+ if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
+ // Conversion could affect comparison.
+ return nullptr;
+ }
+ }
+
+ // Otherwise, we can potentially simplify the comparison. We know that it
+ // will always come through as an integer value and we know the constant is
+ // not a NAN (it would have been previously simplified).
+ assert(!RHS.isNaN() && "NaN comparison not already folded!");
+
+ ICmpInst::Predicate Pred;
+ switch (I.getPredicate()) {
+ default: llvm_unreachable("Unexpected predicate!");
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_OEQ:
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case FCmpInst::FCMP_UGT:
+ case FCmpInst::FCMP_OGT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
+ break;
+ case FCmpInst::FCMP_UGE:
+ case FCmpInst::FCMP_OGE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
+ break;
+ case FCmpInst::FCMP_ULT:
+ case FCmpInst::FCMP_OLT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
+ break;
+ case FCmpInst::FCMP_ULE:
+ case FCmpInst::FCMP_OLE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
+ break;
+ case FCmpInst::FCMP_UNE:
+ case FCmpInst::FCMP_ONE:
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ case FCmpInst::FCMP_ORD:
+ return replaceInstUsesWith(I, Builder.getTrue());
+ case FCmpInst::FCMP_UNO:
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+
+ // Now we know that the APFloat is a normal number, zero or inf.
+
+ // See if the FP constant is too large for the integer. For example,
+ // comparing an i8 to 300.0.
+ unsigned IntWidth = IntTy->getScalarSizeInBits();
+
+ if (!LHSUnsigned) {
+ // If the RHS value is > SignedMax, fold the comparison. This handles +INF
+ // and large values.
+ APFloat SMax(RHS.getSemantics());
+ SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMax < RHS) { // smax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_SLE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ } else {
+ // If the RHS value is > UnsignedMax, fold the comparison. This handles
+ // +INF and large values.
+ APFloat UMax(RHS.getSemantics());
+ UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
+ APFloat::rmNearestTiesToEven);
+ if (UMax < RHS) { // umax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
+ Pred == ICmpInst::ICMP_ULE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ }
+
+ if (!LHSUnsigned) {
+ // See if the RHS value is < SignedMin.
+ APFloat SMin(RHS.getSemantics());
+ SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMin > RHS) { // smin > 12312.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
+ Pred == ICmpInst::ICMP_SGE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ } else {
+ // See if the RHS value is < UnsignedMin.
+ APFloat UMin(RHS.getSemantics());
+ UMin.convertFromAPInt(APInt::getMinValue(IntWidth), false,
+ APFloat::rmNearestTiesToEven);
+ if (UMin > RHS) { // umin > 12312.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
+ Pred == ICmpInst::ICMP_UGE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ }
+
+ // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
+ // [0, UMAX], but it may still be fractional. See if it is fractional by
+ // casting the FP value to the integer value and back, checking for equality.
+ // Don't do this for zero, because -0.0 is not fractional.
+ Constant *RHSInt = LHSUnsigned
+ ? ConstantExpr::getFPToUI(RHSC, IntTy)
+ : ConstantExpr::getFPToSI(RHSC, IntTy);
+ if (!RHS.isZero()) {
+ bool Equal = LHSUnsigned
+ ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
+ : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
+ if (!Equal) {
+ // If we had a comparison against a fractional value, we have to adjust
+ // the compare predicate and sometimes the value. RHSC is rounded towards
+ // zero at this point.
+ switch (Pred) {
+ default: llvm_unreachable("Unexpected integer comparison!");
+ case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
+ return replaceInstUsesWith(I, Builder.getTrue());
+ case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
+ return replaceInstUsesWith(I, Builder.getFalse());
+ case ICmpInst::ICMP_ULE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> false
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getFalse());
+ break;
+ case ICmpInst::ICMP_SLE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> int < -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLT;
+ break;
+ case ICmpInst::ICMP_ULT:
+ // (float)int < -4.4 --> false
+ // (float)int < 4.4 --> int <= 4
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getFalse());
+ Pred = ICmpInst::ICMP_ULE;
+ break;
+ case ICmpInst::ICMP_SLT:
+ // (float)int < -4.4 --> int < -4
+ // (float)int < 4.4 --> int <= 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLE;
+ break;
+ case ICmpInst::ICMP_UGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> true
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getTrue());
+ break;
+ case ICmpInst::ICMP_SGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> int >= -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGE;
+ break;
+ case ICmpInst::ICMP_UGE:
+ // (float)int >= -4.4 --> true
+ // (float)int >= 4.4 --> int > 4
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getTrue());
+ Pred = ICmpInst::ICMP_UGT;
+ break;
+ case ICmpInst::ICMP_SGE:
+ // (float)int >= -4.4 --> int >= -4
+ // (float)int >= 4.4 --> int > 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGT;
+ break;
+ }
+ }
+ }
+
+ // Lower this FP comparison into an appropriate integer version of the
+ // comparison.
+ return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
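+// A worked example of the folds above (illustrative, not from the upstream
+// source): for `fcmp olt (sitofp i8 %x to float), 3.0e2` the signed i8 range
+// is [-128, 127], so SMax (127.0) < 300.0 and the compare folds to `true`;
+// a fractional constant instead becomes an integer compare, e.g.
+// `fcmp olt (sitofp i8 %x to float), 4.4` --> `icmp sle i8 %x, 4`.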
+
+/// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary.
+static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC) {
+ // When C is not 0.0 and infinities are not allowed:
+ // (C / X) < 0.0 is a sign-bit test of X
+ // (C / X) < 0.0 --> X < 0.0 (if C is positive)
+ // (C / X) < 0.0 --> X > 0.0 (if C is negative, swap the predicate)
+ //
+ // Proof:
+ // Multiply (C / X) < 0.0 by X * X / C.
+  //  - X is non-zero; if it were zero, the 'ninf' flag would be violated.
+ // - C defines the sign of X * X * C. Thus it also defines whether to swap
+ // the predicate. C is also non zero by definition.
+ //
+ // Thus X * X / C is non zero and the transformation is valid. [qed]
+
+ FCmpInst::Predicate Pred = I.getPredicate();
+
+ // Check that predicates are valid.
+ if ((Pred != FCmpInst::FCMP_OGT) && (Pred != FCmpInst::FCMP_OLT) &&
+ (Pred != FCmpInst::FCMP_OGE) && (Pred != FCmpInst::FCMP_OLE))
+ return nullptr;
+
+ // Check that RHS operand is zero.
+ if (!match(RHSC, m_AnyZeroFP()))
+ return nullptr;
+
+ // Check fastmath flags ('ninf').
+ if (!LHSI->hasNoInfs() || !I.hasNoInfs())
+ return nullptr;
+
+ // Check the properties of the dividend. It must not be zero to avoid a
+ // division by zero (see Proof).
+ const APFloat *C;
+ if (!match(LHSI->getOperand(0), m_APFloat(C)))
+ return nullptr;
+
+ if (C->isZero())
+ return nullptr;
+
+ // Get swapped predicate if necessary.
+ if (C->isNegative())
+ Pred = I.getSwappedPredicate();
+
+ return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
+}
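+// Sketch (assumes 'ninf' on both the fdiv and the fcmp; values invented):
+//   %d = fdiv ninf float 2.0, %x
+//   %c = fcmp ninf olt float %d, 0.0   ; (2.0 / X) < 0.0
+// --> %c = fcmp olt float %x, 0.0      ; same sign test, directly on X
+// With a negative dividend such as -2.0 the predicate is swapped to ogt.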
+
+/// Optimize fabs(X) compared with zero.
static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) {
- Value *X;
+ Value *X;
if (!match(I.getOperand(0), m_FAbs(m_Value(X))) ||
- !match(I.getOperand(1), m_PosZeroFP()))
- return nullptr;
-
- auto replacePredAndOp0 = [&IC](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
- I->setPredicate(P);
- return IC.replaceOperand(*I, 0, X);
- };
-
- switch (I.getPredicate()) {
- case FCmpInst::FCMP_UGE:
- case FCmpInst::FCMP_OLT:
- // fabs(X) >= 0.0 --> true
- // fabs(X) < 0.0 --> false
- llvm_unreachable("fcmp should have simplified");
-
- case FCmpInst::FCMP_OGT:
- // fabs(X) > 0.0 --> X != 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X);
-
- case FCmpInst::FCMP_UGT:
- // fabs(X) u> 0.0 --> X u!= 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X);
-
- case FCmpInst::FCMP_OLE:
- // fabs(X) <= 0.0 --> X == 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X);
-
- case FCmpInst::FCMP_ULE:
- // fabs(X) u<= 0.0 --> X u== 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X);
-
- case FCmpInst::FCMP_OGE:
- // fabs(X) >= 0.0 --> !isnan(X)
- assert(!I.hasNoNaNs() && "fcmp should have simplified");
- return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X);
-
- case FCmpInst::FCMP_ULT:
- // fabs(X) u< 0.0 --> isnan(X)
- assert(!I.hasNoNaNs() && "fcmp should have simplified");
- return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X);
-
- case FCmpInst::FCMP_OEQ:
- case FCmpInst::FCMP_UEQ:
- case FCmpInst::FCMP_ONE:
- case FCmpInst::FCMP_UNE:
- case FCmpInst::FCMP_ORD:
- case FCmpInst::FCMP_UNO:
- // Look through the fabs() because it doesn't change anything but the sign.
- // fabs(X) == 0.0 --> X == 0.0,
- // fabs(X) != 0.0 --> X != 0.0
- // isnan(fabs(X)) --> isnan(X)
- // !isnan(fabs(X) --> !isnan(X)
- return replacePredAndOp0(&I, I.getPredicate(), X);
-
- default:
- return nullptr;
- }
-}
-
+ !match(I.getOperand(1), m_PosZeroFP()))
+ return nullptr;
+
+ auto replacePredAndOp0 = [&IC](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
+ I->setPredicate(P);
+ return IC.replaceOperand(*I, 0, X);
+ };
+
+ switch (I.getPredicate()) {
+ case FCmpInst::FCMP_UGE:
+ case FCmpInst::FCMP_OLT:
+ // fabs(X) >= 0.0 --> true
+ // fabs(X) < 0.0 --> false
+ llvm_unreachable("fcmp should have simplified");
+
+ case FCmpInst::FCMP_OGT:
+ // fabs(X) > 0.0 --> X != 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X);
+
+ case FCmpInst::FCMP_UGT:
+ // fabs(X) u> 0.0 --> X u!= 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X);
+
+ case FCmpInst::FCMP_OLE:
+ // fabs(X) <= 0.0 --> X == 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X);
+
+ case FCmpInst::FCMP_ULE:
+ // fabs(X) u<= 0.0 --> X u== 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X);
+
+ case FCmpInst::FCMP_OGE:
+ // fabs(X) >= 0.0 --> !isnan(X)
+ assert(!I.hasNoNaNs() && "fcmp should have simplified");
+ return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X);
+
+ case FCmpInst::FCMP_ULT:
+ // fabs(X) u< 0.0 --> isnan(X)
+ assert(!I.hasNoNaNs() && "fcmp should have simplified");
+ return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X);
+
+ case FCmpInst::FCMP_OEQ:
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_ONE:
+ case FCmpInst::FCMP_UNE:
+ case FCmpInst::FCMP_ORD:
+ case FCmpInst::FCMP_UNO:
+ // Look through the fabs() because it doesn't change anything but the sign.
+ // fabs(X) == 0.0 --> X == 0.0,
+ // fabs(X) != 0.0 --> X != 0.0
+ // isnan(fabs(X)) --> isnan(X)
+ // !isnan(fabs(X) --> !isnan(X)
+    // !isnan(fabs(X)) --> !isnan(X)
+
+ default:
+ return nullptr;
+ }
+}
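+// Example of the fabs folds above (sketch only):
+//   %a = call float @llvm.fabs.f32(float %x)
+//   %c = fcmp ogt float %a, 0.0        ; fabs(X) > +0.0
+// -->
+//   %c = fcmp one float %x, 0.0        ; X != 0.0, ordered
+// fabs only clears the sign bit, so only the zero/NaN distinction remains.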
+
Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
- bool Changed = false;
-
- /// Orders the operands of the compare so that they are listed from most
- /// complex to least complex. This puts constants before unary operators,
- /// before binary operators.
- if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
- I.swapOperands();
- Changed = true;
- }
-
- const CmpInst::Predicate Pred = I.getPredicate();
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- // Simplify 'fcmp pred X, X'
- Type *OpType = Op0->getType();
- assert(OpType == Op1->getType() && "fcmp with different-typed operands?");
- if (Op0 == Op1) {
- switch (Pred) {
- default: break;
- case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
- case FCmpInst::FCMP_ULT: // True if unordered or less than
- case FCmpInst::FCMP_UGT: // True if unordered or greater than
- case FCmpInst::FCMP_UNE: // True if unordered or not equal
- // Canonicalize these to be 'fcmp uno %X, 0.0'.
- I.setPredicate(FCmpInst::FCMP_UNO);
- I.setOperand(1, Constant::getNullValue(OpType));
- return &I;
-
- case FCmpInst::FCMP_ORD: // True if ordered (no nans)
- case FCmpInst::FCMP_OEQ: // True if ordered and equal
- case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
- case FCmpInst::FCMP_OLE: // True if ordered and less than or equal
- // Canonicalize these to be 'fcmp ord %X, 0.0'.
- I.setPredicate(FCmpInst::FCMP_ORD);
- I.setOperand(1, Constant::getNullValue(OpType));
- return &I;
- }
- }
-
- // If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
- // then canonicalize the operand to 0.0.
- if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
- if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI))
- return replaceOperand(I, 0, ConstantFP::getNullValue(OpType));
-
- if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI))
- return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
- }
-
- // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
- Value *X, *Y;
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
- return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
-
- // Test if the FCmpInst instruction is used exclusively by a select as
- // part of a minimum or maximum operation. If so, refrain from doing
- // any other folding. This helps out other analyses which understand
- // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
- // and CodeGen. And in this case, at least one of the comparison
- // operands has at least one user besides the compare (the select),
- // which would often largely negate the benefit of folding anyway.
- if (I.hasOneUse())
- if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
- Value *A, *B;
- SelectPatternResult SPR = matchSelectPattern(SI, A, B);
- if (SPR.Flavor != SPF_UNKNOWN)
- return nullptr;
- }
-
- // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
- // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
- if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP()))
- return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
-
- // Handle fcmp with instruction LHS and constant RHS.
- Instruction *LHSI;
- Constant *RHSC;
- if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
- switch (LHSI->getOpcode()) {
- case Instruction::PHI:
- // Only fold fcmp into the PHI if the phi and fcmp are in the same
- // block. If in the same block, we're encouraging jump threading. If
- // not, we are just pessimizing the code by making an i1 phi.
- if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
- return NV;
- break;
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
- return NV;
- break;
- case Instruction::FDiv:
- if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
- return NV;
- break;
- case Instruction::Load:
- if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
- if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !cast<LoadInst>(LHSI)->isVolatile())
- if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
- return Res;
- break;
- }
- }
-
- if (Instruction *R = foldFabsWithFcmpZero(I, *this))
- return R;
-
- if (match(Op0, m_FNeg(m_Value(X)))) {
- // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
- Constant *C;
- if (match(Op1, m_Constant(C))) {
- Constant *NegC = ConstantExpr::getFNeg(C);
- return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
- }
- }
-
- if (match(Op0, m_FPExt(m_Value(X)))) {
- // fcmp (fpext X), (fpext Y) -> fcmp X, Y
- if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
- return new FCmpInst(Pred, X, Y, "", &I);
-
- // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
- const APFloat *C;
- if (match(Op1, m_APFloat(C))) {
- const fltSemantics &FPSem =
- X->getType()->getScalarType()->getFltSemantics();
- bool Lossy;
- APFloat TruncC = *C;
- TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
-
- // Avoid lossy conversions and denormals.
- // Zero is a special case that's OK to convert.
- APFloat Fabs = TruncC;
- Fabs.clearSign();
- if (!Lossy &&
- (!(Fabs < APFloat::getSmallestNormalized(FPSem)) || Fabs.isZero())) {
- Constant *NewC = ConstantFP::get(X->getType(), TruncC);
- return new FCmpInst(Pred, X, NewC, "", &I);
- }
- }
- }
-
- if (I.getType()->isVectorTy())
- if (Instruction *Res = foldVectorCmp(I, Builder))
- return Res;
-
- return Changed ? &I : nullptr;
-}
+ bool Changed = false;
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. This puts constants before unary operators,
+ /// before binary operators.
+ if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
+ I.swapOperands();
+ Changed = true;
+ }
+
+ const CmpInst::Predicate Pred = I.getPredicate();
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Simplify 'fcmp pred X, X'
+ Type *OpType = Op0->getType();
+ assert(OpType == Op1->getType() && "fcmp with different-typed operands?");
+ if (Op0 == Op1) {
+ switch (Pred) {
+ default: break;
+ case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
+ case FCmpInst::FCMP_ULT: // True if unordered or less than
+ case FCmpInst::FCMP_UGT: // True if unordered or greater than
+ case FCmpInst::FCMP_UNE: // True if unordered or not equal
+ // Canonicalize these to be 'fcmp uno %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_UNO);
+ I.setOperand(1, Constant::getNullValue(OpType));
+ return &I;
+
+ case FCmpInst::FCMP_ORD: // True if ordered (no nans)
+ case FCmpInst::FCMP_OEQ: // True if ordered and equal
+ case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
+ case FCmpInst::FCMP_OLE: // True if ordered and less than or equal
+ // Canonicalize these to be 'fcmp ord %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_ORD);
+ I.setOperand(1, Constant::getNullValue(OpType));
+ return &I;
+ }
+ }
+
+ // If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
+ // then canonicalize the operand to 0.0.
+ if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
+ if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI))
+ return replaceOperand(I, 0, ConstantFP::getNullValue(OpType));
+
+ if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI))
+ return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
+ }
+
+ // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
+ Value *X, *Y;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
+
+ // Test if the FCmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (I.hasOneUse())
+ if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
+ Value *A, *B;
+ SelectPatternResult SPR = matchSelectPattern(SI, A, B);
+ if (SPR.Flavor != SPF_UNKNOWN)
+ return nullptr;
+ }
+
+ // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
+ // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
+ if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP()))
+ return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
+
+ // Handle fcmp with instruction LHS and constant RHS.
+ Instruction *LHSI;
+ Constant *RHSC;
+ if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
+ switch (LHSI->getOpcode()) {
+ case Instruction::PHI:
+ // Only fold fcmp into the PHI if the phi and fcmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
+ return NV;
+ break;
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
+ return NV;
+ break;
+ case Instruction::FDiv:
+ if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
+ return NV;
+ break;
+ case Instruction::Load:
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
+ if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !cast<LoadInst>(LHSI)->isVolatile())
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ return Res;
+ break;
+ }
+ }
+
+ if (Instruction *R = foldFabsWithFcmpZero(I, *this))
+ return R;
+
+ if (match(Op0, m_FNeg(m_Value(X)))) {
+ // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
+ Constant *C;
+ if (match(Op1, m_Constant(C))) {
+ Constant *NegC = ConstantExpr::getFNeg(C);
+ return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
+ }
+ }
+
+ if (match(Op0, m_FPExt(m_Value(X)))) {
+ // fcmp (fpext X), (fpext Y) -> fcmp X, Y
+ if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
+ return new FCmpInst(Pred, X, Y, "", &I);
+
+ // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
+ const APFloat *C;
+ if (match(Op1, m_APFloat(C))) {
+ const fltSemantics &FPSem =
+ X->getType()->getScalarType()->getFltSemantics();
+ bool Lossy;
+ APFloat TruncC = *C;
+ TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
+
+ // Avoid lossy conversions and denormals.
+ // Zero is a special case that's OK to convert.
+ APFloat Fabs = TruncC;
+ Fabs.clearSign();
+ if (!Lossy &&
+ (!(Fabs < APFloat::getSmallestNormalized(FPSem)) || Fabs.isZero())) {
+ Constant *NewC = ConstantFP::get(X->getType(), TruncC);
+ return new FCmpInst(Pred, X, NewC, "", &I);
+ }
+ }
+ }
+
+ if (I.getType()->isVectorTy())
+ if (Instruction *Res = foldVectorCmp(I, Builder))
+ return Res;
+
+ return Changed ? &I : nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h
index 68d36a72db..79e9d5c46c 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -1,38 +1,38 @@
-//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This file provides internal interfaces used to implement the InstCombine.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
-#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
-
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file provides internal interfaces used to implement the InstCombine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-
-#define DEBUG_TYPE "instcombine"
-
-using namespace llvm::PatternMatch;
-
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+
+#define DEBUG_TYPE "instcombine"
+
+using namespace llvm::PatternMatch;
+
// As a default, let's assume that we want to be aggressive,
// and attempt to traverse with no limits in attempt to sink negation.
static constexpr unsigned NegatorDefaultMaxDepth = ~0U;
@@ -41,26 +41,26 @@ static constexpr unsigned NegatorDefaultMaxDepth = ~0U;
// fairly small number of new instructions.
static constexpr unsigned NegatorMaxNodesSSO = 16;
-namespace llvm {
-
-class AAResults;
-class APInt;
-class AssumptionCache;
-class BlockFrequencyInfo;
-class DataLayout;
-class DominatorTree;
-class GEPOperator;
-class GlobalVariable;
-class LoopInfo;
-class OptimizationRemarkEmitter;
-class ProfileSummaryInfo;
-class TargetLibraryInfo;
-class User;
-
+namespace llvm {
+
+class AAResults;
+class APInt;
+class AssumptionCache;
+class BlockFrequencyInfo;
+class DataLayout;
+class DominatorTree;
+class GEPOperator;
+class GlobalVariable;
+class LoopInfo;
+class OptimizationRemarkEmitter;
+class ProfileSummaryInfo;
+class TargetLibraryInfo;
+class User;
+
class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
: public InstCombiner,
public InstVisitor<InstCombinerImpl, Instruction *> {
-public:
+public:
InstCombinerImpl(InstCombineWorklist &Worklist, BuilderTy &Builder,
bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
@@ -69,551 +69,551 @@ public:
const DataLayout &DL, LoopInfo *LI)
: InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, DL, LI) {}
-
+
virtual ~InstCombinerImpl() {}
-
- /// Run the combiner over the entire worklist until it is empty.
- ///
- /// \returns true if the IR is changed.
- bool run();
-
- // Visitation implementation - Implement instruction combining for different
- // instruction types. The semantics are as follows:
- // Return Value:
- // null - No change was made
- // I - Change was made, I is still valid, I may be dead though
- // otherwise - Change was made, replace I with returned instruction
- //
- Instruction *visitFNeg(UnaryOperator &I);
- Instruction *visitAdd(BinaryOperator &I);
- Instruction *visitFAdd(BinaryOperator &I);
- Value *OptimizePointerDifference(
- Value *LHS, Value *RHS, Type *Ty, bool isNUW);
- Instruction *visitSub(BinaryOperator &I);
- Instruction *visitFSub(BinaryOperator &I);
- Instruction *visitMul(BinaryOperator &I);
- Instruction *visitFMul(BinaryOperator &I);
- Instruction *visitURem(BinaryOperator &I);
- Instruction *visitSRem(BinaryOperator &I);
- Instruction *visitFRem(BinaryOperator &I);
- bool simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I);
- Instruction *commonIRemTransforms(BinaryOperator &I);
- Instruction *commonIDivTransforms(BinaryOperator &I);
- Instruction *visitUDiv(BinaryOperator &I);
- Instruction *visitSDiv(BinaryOperator &I);
- Instruction *visitFDiv(BinaryOperator &I);
- Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
- Instruction *visitAnd(BinaryOperator &I);
- Instruction *visitOr(BinaryOperator &I);
+
+ /// Run the combiner over the entire worklist until it is empty.
+ ///
+ /// \returns true if the IR is changed.
+ bool run();
+
+ // Visitation implementation - Implement instruction combining for different
+ // instruction types. The semantics are as follows:
+ // Return Value:
+ // null - No change was made
+ // I - Change was made, I is still valid, I may be dead though
+ // otherwise - Change was made, replace I with returned instruction
+ //
+ Instruction *visitFNeg(UnaryOperator &I);
+ Instruction *visitAdd(BinaryOperator &I);
+ Instruction *visitFAdd(BinaryOperator &I);
+ Value *OptimizePointerDifference(
+ Value *LHS, Value *RHS, Type *Ty, bool isNUW);
+ Instruction *visitSub(BinaryOperator &I);
+ Instruction *visitFSub(BinaryOperator &I);
+ Instruction *visitMul(BinaryOperator &I);
+ Instruction *visitFMul(BinaryOperator &I);
+ Instruction *visitURem(BinaryOperator &I);
+ Instruction *visitSRem(BinaryOperator &I);
+ Instruction *visitFRem(BinaryOperator &I);
+ bool simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I);
+ Instruction *commonIRemTransforms(BinaryOperator &I);
+ Instruction *commonIDivTransforms(BinaryOperator &I);
+ Instruction *visitUDiv(BinaryOperator &I);
+ Instruction *visitSDiv(BinaryOperator &I);
+ Instruction *visitFDiv(BinaryOperator &I);
+ Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
+ Instruction *visitAnd(BinaryOperator &I);
+ Instruction *visitOr(BinaryOperator &I);
bool sinkNotIntoOtherHandOfAndOrOr(BinaryOperator &I);
- Instruction *visitXor(BinaryOperator &I);
- Instruction *visitShl(BinaryOperator &I);
- Value *reassociateShiftAmtsOfTwoSameDirectionShifts(
- BinaryOperator *Sh0, const SimplifyQuery &SQ,
- bool AnalyzeForSignBitExtraction = false);
- Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
- BinaryOperator &I);
- Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
- BinaryOperator &OldAShr);
- Instruction *visitAShr(BinaryOperator &I);
- Instruction *visitLShr(BinaryOperator &I);
- Instruction *commonShiftTransforms(BinaryOperator &I);
- Instruction *visitFCmpInst(FCmpInst &I);
+ Instruction *visitXor(BinaryOperator &I);
+ Instruction *visitShl(BinaryOperator &I);
+ Value *reassociateShiftAmtsOfTwoSameDirectionShifts(
+ BinaryOperator *Sh0, const SimplifyQuery &SQ,
+ bool AnalyzeForSignBitExtraction = false);
+ Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
+ BinaryOperator &I);
+ Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
+ BinaryOperator &OldAShr);
+ Instruction *visitAShr(BinaryOperator &I);
+ Instruction *visitLShr(BinaryOperator &I);
+ Instruction *commonShiftTransforms(BinaryOperator &I);
+ Instruction *visitFCmpInst(FCmpInst &I);
CmpInst *canonicalizeICmpPredicate(CmpInst &I);
- Instruction *visitICmpInst(ICmpInst &I);
- Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
- BinaryOperator &I);
- Instruction *commonCastTransforms(CastInst &CI);
- Instruction *commonPointerCastTransforms(CastInst &CI);
- Instruction *visitTrunc(TruncInst &CI);
- Instruction *visitZExt(ZExtInst &CI);
- Instruction *visitSExt(SExtInst &CI);
- Instruction *visitFPTrunc(FPTruncInst &CI);
- Instruction *visitFPExt(CastInst &CI);
- Instruction *visitFPToUI(FPToUIInst &FI);
- Instruction *visitFPToSI(FPToSIInst &FI);
- Instruction *visitUIToFP(CastInst &CI);
- Instruction *visitSIToFP(CastInst &CI);
- Instruction *visitPtrToInt(PtrToIntInst &CI);
- Instruction *visitIntToPtr(IntToPtrInst &CI);
- Instruction *visitBitCast(BitCastInst &CI);
- Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
- Instruction *foldItoFPtoI(CastInst &FI);
- Instruction *visitSelectInst(SelectInst &SI);
- Instruction *visitCallInst(CallInst &CI);
- Instruction *visitInvokeInst(InvokeInst &II);
- Instruction *visitCallBrInst(CallBrInst &CBI);
-
- Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
- Instruction *visitPHINode(PHINode &PN);
- Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
- Instruction *visitAllocaInst(AllocaInst &AI);
- Instruction *visitAllocSite(Instruction &FI);
- Instruction *visitFree(CallInst &FI);
- Instruction *visitLoadInst(LoadInst &LI);
- Instruction *visitStoreInst(StoreInst &SI);
- Instruction *visitAtomicRMWInst(AtomicRMWInst &SI);
- Instruction *visitUnconditionalBranchInst(BranchInst &BI);
- Instruction *visitBranchInst(BranchInst &BI);
- Instruction *visitFenceInst(FenceInst &FI);
- Instruction *visitSwitchInst(SwitchInst &SI);
- Instruction *visitReturnInst(ReturnInst &RI);
+ Instruction *visitICmpInst(ICmpInst &I);
+ Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
+ BinaryOperator &I);
+ Instruction *commonCastTransforms(CastInst &CI);
+ Instruction *commonPointerCastTransforms(CastInst &CI);
+ Instruction *visitTrunc(TruncInst &CI);
+ Instruction *visitZExt(ZExtInst &CI);
+ Instruction *visitSExt(SExtInst &CI);
+ Instruction *visitFPTrunc(FPTruncInst &CI);
+ Instruction *visitFPExt(CastInst &CI);
+ Instruction *visitFPToUI(FPToUIInst &FI);
+ Instruction *visitFPToSI(FPToSIInst &FI);
+ Instruction *visitUIToFP(CastInst &CI);
+ Instruction *visitSIToFP(CastInst &CI);
+ Instruction *visitPtrToInt(PtrToIntInst &CI);
+ Instruction *visitIntToPtr(IntToPtrInst &CI);
+ Instruction *visitBitCast(BitCastInst &CI);
+ Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
+ Instruction *foldItoFPtoI(CastInst &FI);
+ Instruction *visitSelectInst(SelectInst &SI);
+ Instruction *visitCallInst(CallInst &CI);
+ Instruction *visitInvokeInst(InvokeInst &II);
+ Instruction *visitCallBrInst(CallBrInst &CBI);
+
+ Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
+ Instruction *visitPHINode(PHINode &PN);
+ Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+ Instruction *visitAllocaInst(AllocaInst &AI);
+ Instruction *visitAllocSite(Instruction &FI);
+ Instruction *visitFree(CallInst &FI);
+ Instruction *visitLoadInst(LoadInst &LI);
+ Instruction *visitStoreInst(StoreInst &SI);
+ Instruction *visitAtomicRMWInst(AtomicRMWInst &SI);
+ Instruction *visitUnconditionalBranchInst(BranchInst &BI);
+ Instruction *visitBranchInst(BranchInst &BI);
+ Instruction *visitFenceInst(FenceInst &FI);
+ Instruction *visitSwitchInst(SwitchInst &SI);
+ Instruction *visitReturnInst(ReturnInst &RI);
Instruction *visitUnreachableInst(UnreachableInst &I);
Instruction *
foldAggregateConstructionIntoAggregateReuse(InsertValueInst &OrigIVI);
- Instruction *visitInsertValueInst(InsertValueInst &IV);
- Instruction *visitInsertElementInst(InsertElementInst &IE);
- Instruction *visitExtractElementInst(ExtractElementInst &EI);
- Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
- Instruction *visitExtractValueInst(ExtractValueInst &EV);
- Instruction *visitLandingPadInst(LandingPadInst &LI);
- Instruction *visitVAEndInst(VAEndInst &I);
- Instruction *visitFreeze(FreezeInst &I);
-
- /// Specify what to return for unhandled instructions.
- Instruction *visitInstruction(Instruction &I) { return nullptr; }
-
- /// True when DB dominates all uses of DI except UI.
- /// UI must be in the same block as DI.
- /// The routine checks that the DI parent and DB are different.
- bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
- const BasicBlock *DB) const;
-
- /// Try to replace select with select operand SIOpd in SI-ICmp sequence.
- bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
- const unsigned SIOpd);
-
- LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy,
- const Twine &Suffix = "");
-
-private:
- bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
- bool shouldChangeType(Type *From, Type *To) const;
- Value *dyn_castNegVal(Value *V) const;
- Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
- SmallVectorImpl<Value *> &NewIndices);
-
- /// Classify whether a cast is worth optimizing.
- ///
- /// This is a helper to decide whether the simplification of
- /// logic(cast(A), cast(B)) to cast(logic(A, B)) should be performed.
- ///
- /// \param CI The cast we are interested in.
- ///
- /// \return true if this cast actually results in any code being generated and
- /// if it cannot already be eliminated by some other transformation.
- bool shouldOptimizeCast(CastInst *CI);
-
- /// Try to optimize a sequence of instructions checking if an operation
- /// on LHS and RHS overflows.
- ///
- /// If this overflow check is done via one of the overflow check intrinsics,
- /// then CtxI has to be the call instruction calling that intrinsic. If this
- /// overflow check is done by arithmetic followed by a compare, then CtxI has
- /// to be the arithmetic instruction.
- ///
- /// If a simplification is possible, stores the simplified result of the
- /// operation in OperationResult and the result of the overflow check in
- /// OverflowResult, and returns true. If no simplification is possible,
- /// returns false.
- bool OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, bool IsSigned,
- Value *LHS, Value *RHS,
- Instruction &CtxI, Value *&OperationResult,
- Constant *&OverflowResult);
-
- Instruction *visitCallBase(CallBase &Call);
- Instruction *tryOptimizeCall(CallInst *CI);
- bool transformConstExprCastCall(CallBase &Call);
- Instruction *transformCallThroughTrampoline(CallBase &Call,
- IntrinsicInst &Tramp);
-
- Value *simplifyMaskedLoad(IntrinsicInst &II);
- Instruction *simplifyMaskedStore(IntrinsicInst &II);
- Instruction *simplifyMaskedGather(IntrinsicInst &II);
- Instruction *simplifyMaskedScatter(IntrinsicInst &II);
-
- /// Transform (zext icmp) to bitwise / integer operations in order to
- /// eliminate it.
- ///
- /// \param ICI The icmp of the (zext icmp) pair we are interested in.
- /// \param CI The zext of the (zext icmp) pair we are interested in.
- /// \param DoTransform Pass false to just test whether the given (zext icmp)
- /// would be transformed. Pass true to actually perform the transformation.
- ///
- /// \return null if the transformation cannot be performed. If the
- /// transformation can be performed the new instruction that replaces the
- /// (zext icmp) pair will be returned (if \p DoTransform is false the
- /// unmodified \p ICI will be returned in this case).
- Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
- bool DoTransform = true);
-
- Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
-
- bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowAdd(const Value *LHS, const Value *RHS,
- const Instruction &CxtI, bool IsSigned) const {
- return IsSigned ? willNotOverflowSignedAdd(LHS, RHS, CxtI)
- : willNotOverflowUnsignedAdd(LHS, RHS, CxtI);
- }
-
- bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForSignedSub(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedSub(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI, bool IsSigned) const {
- return IsSigned ? willNotOverflowSignedSub(LHS, RHS, CxtI)
- : willNotOverflowUnsignedSub(LHS, RHS, CxtI);
- }
-
- bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForSignedMul(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI, bool IsSigned) const {
- return IsSigned ? willNotOverflowSignedMul(LHS, RHS, CxtI)
- : willNotOverflowUnsignedMul(LHS, RHS, CxtI);
- }
-
- bool willNotOverflow(BinaryOperator::BinaryOps Opcode, const Value *LHS,
- const Value *RHS, const Instruction &CxtI,
- bool IsSigned) const {
- switch (Opcode) {
- case Instruction::Add: return willNotOverflowAdd(LHS, RHS, CxtI, IsSigned);
- case Instruction::Sub: return willNotOverflowSub(LHS, RHS, CxtI, IsSigned);
- case Instruction::Mul: return willNotOverflowMul(LHS, RHS, CxtI, IsSigned);
- default: llvm_unreachable("Unexpected opcode for overflow query");
- }
- }
-
- Value *EmitGEPOffset(User *GEP);
- Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
- Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
- Instruction *narrowBinOp(TruncInst &Trunc);
- Instruction *narrowMaskedBinOp(BinaryOperator &And);
- Instruction *narrowMathIfNoOverflow(BinaryOperator &I);
+ Instruction *visitInsertValueInst(InsertValueInst &IV);
+ Instruction *visitInsertElementInst(InsertElementInst &IE);
+ Instruction *visitExtractElementInst(ExtractElementInst &EI);
+ Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ Instruction *visitExtractValueInst(ExtractValueInst &EV);
+ Instruction *visitLandingPadInst(LandingPadInst &LI);
+ Instruction *visitVAEndInst(VAEndInst &I);
+ Instruction *visitFreeze(FreezeInst &I);
+
+ /// Specify what to return for unhandled instructions.
+ Instruction *visitInstruction(Instruction &I) { return nullptr; }
+
+ /// True when DB dominates all uses of DI except UI.
+ /// UI must be in the same block as DI.
+ /// The routine checks that the DI parent and DB are different.
+ bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
+ const BasicBlock *DB) const;
+
+ /// Try to replace select with select operand SIOpd in SI-ICmp sequence.
+ bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
+ const unsigned SIOpd);
+
+ LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy,
+ const Twine &Suffix = "");
+
+private:
+ bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
+ bool shouldChangeType(Type *From, Type *To) const;
+ Value *dyn_castNegVal(Value *V) const;
+ Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
+ SmallVectorImpl<Value *> &NewIndices);
+
+ /// Classify whether a cast is worth optimizing.
+ ///
+ /// This is a helper to decide whether the simplification of
+ /// logic(cast(A), cast(B)) to cast(logic(A, B)) should be performed.
+ ///
+ /// \param CI The cast we are interested in.
+ ///
+ /// \return true if this cast actually results in any code being generated and
+ /// if it cannot already be eliminated by some other transformation.
+ bool shouldOptimizeCast(CastInst *CI);
+
+ /// Try to optimize a sequence of instructions checking if an operation
+ /// on LHS and RHS overflows.
+ ///
+ /// If this overflow check is done via one of the overflow check intrinsics,
+ /// then CtxI has to be the call instruction calling that intrinsic. If this
+ /// overflow check is done by arithmetic followed by a compare, then CtxI has
+ /// to be the arithmetic instruction.
+ ///
+ /// If a simplification is possible, stores the simplified result of the
+ /// operation in OperationResult and the result of the overflow check in
+ /// OverflowResult, and returns true. If no simplification is possible,
+ /// returns false.
+ bool OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, bool IsSigned,
+ Value *LHS, Value *RHS,
+ Instruction &CtxI, Value *&OperationResult,
+ Constant *&OverflowResult);
+
+ Instruction *visitCallBase(CallBase &Call);
+ Instruction *tryOptimizeCall(CallInst *CI);
+ bool transformConstExprCastCall(CallBase &Call);
+ Instruction *transformCallThroughTrampoline(CallBase &Call,
+ IntrinsicInst &Tramp);
+
+ Value *simplifyMaskedLoad(IntrinsicInst &II);
+ Instruction *simplifyMaskedStore(IntrinsicInst &II);
+ Instruction *simplifyMaskedGather(IntrinsicInst &II);
+ Instruction *simplifyMaskedScatter(IntrinsicInst &II);
+
+ /// Transform (zext icmp) to bitwise / integer operations in order to
+ /// eliminate it.
+ ///
+ /// \param ICI The icmp of the (zext icmp) pair we are interested in.
+ /// \param CI The zext of the (zext icmp) pair we are interested in.
+ /// \param DoTransform Pass false to just test whether the given (zext icmp)
+ /// would be transformed. Pass true to actually perform the transformation.
+ ///
+ /// \return null if the transformation cannot be performed. If the
+ /// transformation can be performed the new instruction that replaces the
+ /// (zext icmp) pair will be returned (if \p DoTransform is false the
+ /// unmodified \p ICI will be returned in this case).
+ Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
+ bool DoTransform = true);
+
+ Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
+
+ bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI, bool IsSigned) const {
+ return IsSigned ? willNotOverflowSignedAdd(LHS, RHS, CxtI)
+ : willNotOverflowUnsignedAdd(LHS, RHS, CxtI);
+ }
+
+ bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI, bool IsSigned) const {
+ return IsSigned ? willNotOverflowSignedSub(LHS, RHS, CxtI)
+ : willNotOverflowUnsignedSub(LHS, RHS, CxtI);
+ }
+
+ bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI, bool IsSigned) const {
+ return IsSigned ? willNotOverflowSignedMul(LHS, RHS, CxtI)
+ : willNotOverflowUnsignedMul(LHS, RHS, CxtI);
+ }
+
+ bool willNotOverflow(BinaryOperator::BinaryOps Opcode, const Value *LHS,
+ const Value *RHS, const Instruction &CxtI,
+ bool IsSigned) const {
+ switch (Opcode) {
+ case Instruction::Add: return willNotOverflowAdd(LHS, RHS, CxtI, IsSigned);
+ case Instruction::Sub: return willNotOverflowSub(LHS, RHS, CxtI, IsSigned);
+ case Instruction::Mul: return willNotOverflowMul(LHS, RHS, CxtI, IsSigned);
+ default: llvm_unreachable("Unexpected opcode for overflow query");
+ }
+ }
+
+ Value *EmitGEPOffset(User *GEP);
+ Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
+ Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
+ Instruction *narrowBinOp(TruncInst &Trunc);
+ Instruction *narrowMaskedBinOp(BinaryOperator &And);
+ Instruction *narrowMathIfNoOverflow(BinaryOperator &I);
Instruction *narrowFunnelShift(TruncInst &Trunc);
- Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
- Instruction *matchSAddSubSat(SelectInst &MinMax1);
-
+ Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
+ Instruction *matchSAddSubSat(SelectInst &MinMax1);
+
void freelyInvertAllUsersOf(Value *V);
- /// Determine if a pair of casts can be replaced by a single cast.
- ///
- /// \param CI1 The first of a pair of casts.
- /// \param CI2 The second of a pair of casts.
- ///
- /// \return 0 if the cast pair cannot be eliminated, otherwise returns an
- /// Instruction::CastOps value for a cast that can replace the pair, casting
- /// CI1->getSrcTy() to CI2->getDstTy().
- ///
- /// \see CastInst::isEliminableCastPair
- Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
- const CastInst *CI2);
-
- Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &And);
- Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Or);
- Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor);
-
- /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp).
- /// NOTE: Unlike most of instcombine, this returns a Value which should
- /// already be inserted into the function.
- Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd);
-
- Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
- BinaryOperator &Logic);
- Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
- Value *getSelectCondition(Value *A, Value *B);
-
- Instruction *foldIntrinsicWithOverflowCommon(IntrinsicInst *II);
- Instruction *foldFPSignBitOps(BinaryOperator &I);
-
-public:
- /// Inserts an instruction \p New before instruction \p Old
- ///
- /// Also adds the new instruction to the worklist and returns \p New so that
- /// it is suitable for use as the return from the visitation patterns.
- Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
- assert(New && !New->getParent() &&
- "New instruction already inserted into a basic block!");
- BasicBlock *BB = Old.getParent();
- BB->getInstList().insert(Old.getIterator(), New); // Insert inst
- Worklist.add(New);
- return New;
- }
-
- /// Same as InsertNewInstBefore, but also sets the debug loc.
- Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
- New->setDebugLoc(Old.getDebugLoc());
- return InsertNewInstBefore(New, Old);
- }
-
- /// A combiner-aware RAUW-like routine.
- ///
- /// This method is to be used when an instruction is found to be dead,
- /// replaceable with another preexisting expression. Here we add all uses of
- /// I to the worklist, replace all uses of I with the new value, then return
- /// I, so that the inst combiner will know that I was modified.
- Instruction *replaceInstUsesWith(Instruction &I, Value *V) {
- // If there are no uses to replace, then we return nullptr to indicate that
- // no changes were made to the program.
- if (I.use_empty()) return nullptr;
-
- Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist.
-
- // If we are replacing the instruction with itself, this must be in a
- // segment of unreachable code, so just clobber the instruction.
- if (&I == V)
- V = UndefValue::get(I.getType());
-
- LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n"
- << " with " << *V << '\n');
-
- I.replaceAllUsesWith(V);
+ /// Determine if a pair of casts can be replaced by a single cast.
+ ///
+ /// \param CI1 The first of a pair of casts.
+ /// \param CI2 The second of a pair of casts.
+ ///
+ /// \return 0 if the cast pair cannot be eliminated, otherwise returns an
+ /// Instruction::CastOps value for a cast that can replace the pair, casting
+ /// CI1->getSrcTy() to CI2->getDstTy().
+ ///
+ /// \see CastInst::isEliminableCastPair
+ Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
+ const CastInst *CI2);
+
+ Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &And);
+ Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Or);
+ Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor);
+
+ /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp).
+ /// NOTE: Unlike most of instcombine, this returns a Value which should
+ /// already be inserted into the function.
+ Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd);
+
+ Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ BinaryOperator &Logic);
+ Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
+ Value *getSelectCondition(Value *A, Value *B);
+
+ Instruction *foldIntrinsicWithOverflowCommon(IntrinsicInst *II);
+ Instruction *foldFPSignBitOps(BinaryOperator &I);
+
+public:
+ /// Inserts an instruction \p New before instruction \p Old
+ ///
+ /// Also adds the new instruction to the worklist and returns \p New so that
+ /// it is suitable for use as the return from the visitation patterns.
+ Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+ assert(New && !New->getParent() &&
+ "New instruction already inserted into a basic block!");
+ BasicBlock *BB = Old.getParent();
+ BB->getInstList().insert(Old.getIterator(), New); // Insert inst
+ Worklist.add(New);
+ return New;
+ }
+
+ /// Same as InsertNewInstBefore, but also sets the debug loc.
+ Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
+ New->setDebugLoc(Old.getDebugLoc());
+ return InsertNewInstBefore(New, Old);
+ }
+
+ /// A combiner-aware RAUW-like routine.
+ ///
+ /// This method is to be used when an instruction is found to be dead,
+ /// replaceable with another preexisting expression. Here we add all uses of
+ /// I to the worklist, replace all uses of I with the new value, then return
+ /// I, so that the inst combiner will know that I was modified.
+ Instruction *replaceInstUsesWith(Instruction &I, Value *V) {
+ // If there are no uses to replace, then we return nullptr to indicate that
+ // no changes were made to the program.
+ if (I.use_empty()) return nullptr;
+
+ Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist.
+
+ // If we are replacing the instruction with itself, this must be in a
+ // segment of unreachable code, so just clobber the instruction.
+ if (&I == V)
+ V = UndefValue::get(I.getType());
+
+ LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n"
+ << " with " << *V << '\n');
+
+ I.replaceAllUsesWith(V);
MadeIRChange = true;
- return &I;
- }
-
- /// Replace operand of instruction and add old operand to the worklist.
- Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) {
- Worklist.addValue(I.getOperand(OpNum));
- I.setOperand(OpNum, V);
- return &I;
- }
-
- /// Replace use and add the previously used value to the worklist.
- void replaceUse(Use &U, Value *NewValue) {
- Worklist.addValue(U);
- U = NewValue;
- }
-
- /// Creates a result tuple for an overflow intrinsic \p II with a given
- /// \p Result and a constant \p Overflow value.
- Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
- Constant *Overflow) {
- Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
- StructType *ST = cast<StructType>(II->getType());
- Constant *Struct = ConstantStruct::get(ST, V);
- return InsertValueInst::Create(Struct, Result, 0);
- }
-
- /// Create and insert the idiom we use to indicate a block is unreachable
- /// without having to rewrite the CFG from within InstCombine.
- void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
- auto &Ctx = InsertAt->getContext();
- new StoreInst(ConstantInt::getTrue(Ctx),
- UndefValue::get(Type::getInt1PtrTy(Ctx)),
- InsertAt);
- }
-
-
- /// Combiner aware instruction erasure.
- ///
- /// When dealing with an instruction that has side effects or produces a void
- /// value, we can't rely on DCE to delete the instruction. Instead, visit
- /// methods should return the value returned by this function.
+ return &I;
+ }
+
+ /// Replace operand of instruction and add old operand to the worklist.
+ Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) {
+ Worklist.addValue(I.getOperand(OpNum));
+ I.setOperand(OpNum, V);
+ return &I;
+ }
+
+ /// Replace use and add the previously used value to the worklist.
+ void replaceUse(Use &U, Value *NewValue) {
+ Worklist.addValue(U);
+ U = NewValue;
+ }
+
+ /// Creates a result tuple for an overflow intrinsic \p II with a given
+ /// \p Result and a constant \p Overflow value.
+ Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
+ Constant *Overflow) {
+ Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
+ StructType *ST = cast<StructType>(II->getType());
+ Constant *Struct = ConstantStruct::get(ST, V);
+ return InsertValueInst::Create(Struct, Result, 0);
+ }
+
+ /// Create and insert the idiom we use to indicate a block is unreachable
+ /// without having to rewrite the CFG from within InstCombine.
+ void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
+ auto &Ctx = InsertAt->getContext();
+ new StoreInst(ConstantInt::getTrue(Ctx),
+ UndefValue::get(Type::getInt1PtrTy(Ctx)),
+ InsertAt);
+ }
+
+
+ /// Combiner aware instruction erasure.
+ ///
+ /// When dealing with an instruction that has side effects or produces a void
+ /// value, we can't rely on DCE to delete the instruction. Instead, visit
+ /// methods should return the value returned by this function.
Instruction *eraseInstFromFunction(Instruction &I) override {
- LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
- assert(I.use_empty() && "Cannot erase instruction that is used!");
- salvageDebugInfo(I);
-
- // Make sure that we reprocess all operands now that we reduced their
- // use counts.
- for (Use &Operand : I.operands())
- if (auto *Inst = dyn_cast<Instruction>(Operand))
- Worklist.add(Inst);
-
- Worklist.remove(&I);
- I.eraseFromParent();
- MadeIRChange = true;
- return nullptr; // Don't do anything with FI
- }
-
- void computeKnownBits(const Value *V, KnownBits &Known,
- unsigned Depth, const Instruction *CxtI) const {
- llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
- }
-
- KnownBits computeKnownBits(const Value *V, unsigned Depth,
- const Instruction *CxtI) const {
- return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
- }
-
- bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false,
- unsigned Depth = 0,
- const Instruction *CxtI = nullptr) {
- return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT);
- }
-
- bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
- const Instruction *CxtI = nullptr) const {
- return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
- }
-
- unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
- const Instruction *CxtI = nullptr) const {
- return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForSignedMul(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForSignedAdd(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForUnsignedSub(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflow(
- Instruction::BinaryOps BinaryOp, bool IsSigned,
- Value *LHS, Value *RHS, Instruction *CxtI) const;
-
- /// Performs a few simplifications for operators which are associative
- /// or commutative.
- bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
-
- /// Tries to simplify binary operations which some other binary
- /// operation distributes over.
- ///
- /// It does this by either factorizing out common terms (eg "(A*B)+(A*C)"
- /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A
- /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified
- /// value, or null if it didn't simplify.
- Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
-
- /// Tries to simplify add operations using the definition of remainder.
- ///
- /// The definition of remainder is X % C = X - (X / C) * C. The add
- /// expression X % C0 + ((X / C0) % C1) * C0 can be simplified to
- /// X % (C0 * C1).
- Value *SimplifyAddWithRemainder(BinaryOperator &I);
-
- // Binary Op helper for select operations where the expression can be
- // efficiently reorganized.
- Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
- Value *RHS);
-
- /// This tries to simplify binary operations by factorizing out common terms
- /// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
- Value *tryFactorization(BinaryOperator &, Instruction::BinaryOps, Value *,
- Value *, Value *, Value *);
-
- /// Match a select chain which produces one of three values based on whether
- /// the LHS is less than, equal to, or greater than RHS respectively.
- /// Return true if we matched a three way compare idiom. The LHS, RHS, Less,
- /// Equal and Greater values are saved in the matching process and returned to
- /// the caller.
- bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS,
- ConstantInt *&Less, ConstantInt *&Equal,
- ConstantInt *&Greater);
-
- /// Attempts to replace V with a simpler value based on the demanded
- /// bits.
- Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
- unsigned Depth, Instruction *CxtI);
- bool SimplifyDemandedBits(Instruction *I, unsigned Op,
- const APInt &DemandedMask, KnownBits &Known,
+ LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
+ assert(I.use_empty() && "Cannot erase instruction that is used!");
+ salvageDebugInfo(I);
+
+ // Make sure that we reprocess all operands now that we reduced their
+ // use counts.
+ for (Use &Operand : I.operands())
+ if (auto *Inst = dyn_cast<Instruction>(Operand))
+ Worklist.add(Inst);
+
+ Worklist.remove(&I);
+ I.eraseFromParent();
+ MadeIRChange = true;
+ return nullptr; // Don't do anything with FI
+ }
+
+ void computeKnownBits(const Value *V, KnownBits &Known,
+ unsigned Depth, const Instruction *CxtI) const {
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false,
+ unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) {
+ return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT);
+ }
+
+ bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
+ return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
+ return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflow(
+ Instruction::BinaryOps BinaryOp, bool IsSigned,
+ Value *LHS, Value *RHS, Instruction *CxtI) const;
+
+ /// Performs a few simplifications for operators which are associative
+ /// or commutative.
+ bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
+
+ /// Tries to simplify binary operations which some other binary
+ /// operation distributes over.
+ ///
+ /// It does this by either factorizing out common terms (eg "(A*B)+(A*C)"
+ /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A
+ /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified
+ /// value, or null if it didn't simplify.
+ Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+
+ /// Tries to simplify add operations using the definition of remainder.
+ ///
+ /// The definition of remainder is X % C = X - (X / C) * C. The add
+ /// expression X % C0 + ((X / C0) % C1) * C0 can be simplified to
+ /// X % (C0 * C1).
+ Value *SimplifyAddWithRemainder(BinaryOperator &I);
+
+ // Binary Op helper for select operations where the expression can be
+ // efficiently reorganized.
+ Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
+ Value *RHS);
+
+ /// This tries to simplify binary operations by factorizing out common terms
+ /// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
+ Value *tryFactorization(BinaryOperator &, Instruction::BinaryOps, Value *,
+ Value *, Value *, Value *);
+
+ /// Match a select chain which produces one of three values based on whether
+ /// the LHS is less than, equal to, or greater than RHS respectively.
+ /// Return true if we matched a three way compare idiom. The LHS, RHS, Less,
+ /// Equal and Greater values are saved in the matching process and returned to
+ /// the caller.
+ bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS,
+ ConstantInt *&Less, ConstantInt *&Equal,
+ ConstantInt *&Greater);
+
+ /// Attempts to replace V with a simpler value based on the demanded
+ /// bits.
+ Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
+ bool SimplifyDemandedBits(Instruction *I, unsigned Op,
+ const APInt &DemandedMask, KnownBits &Known,
unsigned Depth = 0) override;
-
- /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
- /// bits. It also tries to handle simplifications that can be done based on
- /// DemandedMask, but without modifying the Instruction.
- Value *SimplifyMultipleUseDemandedBits(Instruction *I,
- const APInt &DemandedMask,
- KnownBits &Known,
- unsigned Depth, Instruction *CxtI);
-
- /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
- /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
- Value *simplifyShrShlDemandedBits(
- Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
- const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
-
- /// Tries to simplify operands to an integer instruction based on its
- /// demanded bits.
- bool SimplifyDemandedInstructionBits(Instruction &Inst);
-
+
+ /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
+ /// bits. It also tries to handle simplifications that can be done based on
+ /// DemandedMask, but without modifying the Instruction.
+ Value *SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
+
+ /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
+ /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
+ Value *simplifyShrShlDemandedBits(
+ Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
+ const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
+
+ /// Tries to simplify operands to an integer instruction based on its
+ /// demanded bits.
+ bool SimplifyDemandedInstructionBits(Instruction &Inst);
+
virtual Value *
SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts,
unsigned Depth = 0,
bool AllowMultipleUsers = false) override;
-
- /// Canonicalize the position of binops relative to shufflevector.
- Instruction *foldVectorBinop(BinaryOperator &Inst);
- Instruction *foldVectorSelect(SelectInst &Sel);
-
- /// Given a binary operator, cast instruction, or select which has a PHI node
- /// as operand #0, see if we can fold the instruction into the PHI (which is
- /// only possible if all operands to the PHI are constants).
- Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
-
- /// Given an instruction with a select as one operand and a constant as the
- /// other operand, try to fold the binary operator into the select arguments.
- /// This also works for Cast instructions, which obviously do not have a
- /// second operand.
- Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
-
- /// This is a convenience wrapper function for the above two functions.
- Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
-
- Instruction *foldAddWithConstant(BinaryOperator &Add);
-
- /// Try to rotate an operation below a PHI node, using PHI nodes for
- /// its operands.
+
+ /// Canonicalize the position of binops relative to shufflevector.
+ Instruction *foldVectorBinop(BinaryOperator &Inst);
+ Instruction *foldVectorSelect(SelectInst &Sel);
+
+ /// Given a binary operator, cast instruction, or select which has a PHI node
+ /// as operand #0, see if we can fold the instruction into the PHI (which is
+ /// only possible if all operands to the PHI are constants).
+ Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
+
+ /// Given an instruction with a select as one operand and a constant as the
+ /// other operand, try to fold the binary operator into the select arguments.
+ /// This also works for Cast instructions, which obviously do not have a
+ /// second operand.
+ Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
+
+ /// This is a convenience wrapper function for the above two functions.
+ Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
+
+ Instruction *foldAddWithConstant(BinaryOperator &Add);
+
+ /// Try to rotate an operation below a PHI node, using PHI nodes for
+ /// its operands.
Instruction *foldPHIArgOpIntoPHI(PHINode &PN);
Instruction *foldPHIArgBinOpIntoPHI(PHINode &PN);
Instruction *foldPHIArgInsertValueInstructionIntoPHI(PHINode &PN);
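
SimplifyAddWithRemainder, declared in the hunk above, depends on an arithmetic identity that is easy to verify in isolation: for unsigned values, X % C0 + ((X / C0) % C1) * C0 equals X % (C0 * C1). A brute-force standalone check over small ranges (plain C++, illustrative bounds only):

// Exhaustive small-range verification of the identity used by
// SimplifyAddWithRemainder, in unsigned arithmetic:
//   X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1)
#include <cassert>

int main() {
  for (unsigned X = 0; X < 2000; ++X)
    for (unsigned C0 = 1; C0 <= 16; ++C0)
      for (unsigned C1 = 1; C1 <= 16; ++C1)
        assert(X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1));
  return 0;
}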
@@ -621,167 +621,167 @@ public:
Instruction *foldPHIArgGEPIntoPHI(PHINode &PN);
Instruction *foldPHIArgLoadIntoPHI(PHINode &PN);
Instruction *foldPHIArgZextsIntoPHI(PHINode &PN);
-
- /// If an integer typed PHI has only one use which is an IntToPtr operation,
- /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise
- /// insert a new pointer typed PHI and replace the original one.
+
+ /// If an integer typed PHI has only one use which is an IntToPtr operation,
+ /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise
+ /// insert a new pointer typed PHI and replace the original one.
Instruction *foldIntegerTypedPHI(PHINode &PN);
-
- /// Helper function for FoldPHIArgXIntoPHI() to set debug location for the
- /// folded operation.
- void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
-
- Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
- ICmpInst::Predicate Cond, Instruction &I);
- Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca,
- const Value *Other);
- Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
- GlobalVariable *GV, CmpInst &ICI,
- ConstantInt *AndCst = nullptr);
- Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
- Constant *RHSC);
- Instruction *foldICmpAddOpConst(Value *X, const APInt &C,
- ICmpInst::Predicate Pred);
- Instruction *foldICmpWithCastOp(ICmpInst &ICI);
-
- Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
- Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp);
- Instruction *foldICmpWithConstant(ICmpInst &Cmp);
- Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
- Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
- Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ);
- Instruction *foldICmpEquality(ICmpInst &Cmp);
- Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I);
- Instruction *foldSignBitTest(ICmpInst &I);
- Instruction *foldICmpWithZero(ICmpInst &Cmp);
-
- Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp);
-
- Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select,
- ConstantInt *C);
- Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc,
- const APInt &C);
- Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
- const APInt &C);
- Instruction *foldICmpXorConstant(ICmpInst &Cmp, BinaryOperator *Xor,
- const APInt &C);
- Instruction *foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
- const APInt &C);
- Instruction *foldICmpMulConstant(ICmpInst &Cmp, BinaryOperator *Mul,
- const APInt &C);
- Instruction *foldICmpShlConstant(ICmpInst &Cmp, BinaryOperator *Shl,
- const APInt &C);
- Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr,
- const APInt &C);
- Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
- const APInt &C);
- Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
- const APInt &C);
- Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div,
- const APInt &C);
- Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub,
- const APInt &C);
- Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add,
- const APInt &C);
- Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And,
- const APInt &C1);
- Instruction *foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
- const APInt &C1, const APInt &C2);
- Instruction *foldICmpShrConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
- const APInt &C2);
- Instruction *foldICmpShlConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
- const APInt &C2);
-
- Instruction *foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
- BinaryOperator *BO,
- const APInt &C);
- Instruction *foldICmpIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
- const APInt &C);
- Instruction *foldICmpEqIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
- const APInt &C);
-
- // Helpers of visitSelectInst().
- Instruction *foldSelectExtConst(SelectInst &Sel);
- Instruction *foldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
- Instruction *foldSelectIntoOp(SelectInst &SI, Value *, Value *);
- Instruction *foldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
- Value *A, Value *B, Instruction &Outer,
- SelectPatternFlavor SPF2, Value *C);
- Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+
+ /// Helper function for FoldPHIArgXIntoPHI() to set debug location for the
+ /// folded operation.
+ void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
+
+ Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond, Instruction &I);
+ Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca,
+ const Value *Other);
+ Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+ GlobalVariable *GV, CmpInst &ICI,
+ ConstantInt *AndCst = nullptr);
+ Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC);
+ Instruction *foldICmpAddOpConst(Value *X, const APInt &C,
+ ICmpInst::Predicate Pred);
+ Instruction *foldICmpWithCastOp(ICmpInst &ICI);
+
+ Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
+ Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp);
+ Instruction *foldICmpWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
+ Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ);
+ Instruction *foldICmpEquality(ICmpInst &Cmp);
+ Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I);
+ Instruction *foldSignBitTest(ICmpInst &I);
+ Instruction *foldICmpWithZero(ICmpInst &Cmp);
+
+ Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp);
+
+ Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select,
+ ConstantInt *C);
+ Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc,
+ const APInt &C);
+ Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt &C);
+ Instruction *foldICmpXorConstant(ICmpInst &Cmp, BinaryOperator *Xor,
+ const APInt &C);
+ Instruction *foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
+ const APInt &C);
+ Instruction *foldICmpMulConstant(ICmpInst &Cmp, BinaryOperator *Mul,
+ const APInt &C);
+ Instruction *foldICmpShlConstant(ICmpInst &Cmp, BinaryOperator *Shl,
+ const APInt &C);
+ Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr,
+ const APInt &C);
+ Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
+ const APInt &C);
+ Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
+ const APInt &C);
+ Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div,
+ const APInt &C);
+ Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub,
+ const APInt &C);
+ Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add,
+ const APInt &C);
+ Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt &C1);
+ Instruction *foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt &C1, const APInt &C2);
+ Instruction *foldICmpShrConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+ Instruction *foldICmpShlConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+
+ Instruction *foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
+ BinaryOperator *BO,
+ const APInt &C);
+ Instruction *foldICmpIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
+ const APInt &C);
+ Instruction *foldICmpEqIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
+ const APInt &C);
+
+ // Helpers of visitSelectInst().
+ Instruction *foldSelectExtConst(SelectInst &Sel);
+ Instruction *foldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
+ Instruction *foldSelectIntoOp(SelectInst &SI, Value *, Value *);
+ Instruction *foldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
+ Value *A, Value *B, Instruction &Outer,
+ SelectPatternFlavor SPF2, Value *C);
+ Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
Instruction *foldSelectValueEquivalence(SelectInst &SI, ICmpInst &ICI);
-
- Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
- bool isSigned, bool Inside);
- Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
- bool mergeStoreIntoSuccessor(StoreInst &SI);
-
+
+ Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
+ bool isSigned, bool Inside);
+ Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
+ bool mergeStoreIntoSuccessor(StoreInst &SI);
+
/// Given an 'or' instruction, check to see if it is part of a
/// bswap/bitreverse idiom. If so, return the equivalent bswap/bitreverse
/// intrinsic.
Instruction *matchBSwapOrBitReverse(BinaryOperator &Or, bool MatchBSwaps,
bool MatchBitReversals);
-
- Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
- Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI);
-
- Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
-
- /// Returns a value X such that Val = X * Scale, or null if none.
- ///
- /// If the multiplication is known not to overflow then NoSignedWrap is set.
- Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
-};
-
-class Negator final {
- /// Top-to-bottom, def-to-use negated instruction tree we produced.
- SmallVector<Instruction *, NegatorMaxNodesSSO> NewInstructions;
-
- using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
- BuilderTy Builder;
-
- const DataLayout &DL;
- AssumptionCache &AC;
- const DominatorTree &DT;
-
- const bool IsTrulyNegation;
-
- SmallDenseMap<Value *, Value *> NegationsCache;
-
- Negator(LLVMContext &C, const DataLayout &DL, AssumptionCache &AC,
- const DominatorTree &DT, bool IsTrulyNegation);
-
-#if LLVM_ENABLE_STATS
- unsigned NumValuesVisitedInThisNegator = 0;
- ~Negator();
-#endif
-
- using Result = std::pair<ArrayRef<Instruction *> /*NewInstructions*/,
- Value * /*NegatedRoot*/>;
-
+
+ Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
+ Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI);
+
+ Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
+
+ /// Returns a value X such that Val = X * Scale, or null if none.
+ ///
+ /// If the multiplication is known not to overflow then NoSignedWrap is set.
+ Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+};
+
+class Negator final {
+ /// Top-to-bottom, def-to-use negated instruction tree we produced.
+ SmallVector<Instruction *, NegatorMaxNodesSSO> NewInstructions;
+
+ using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
+ BuilderTy Builder;
+
+ const DataLayout &DL;
+ AssumptionCache &AC;
+ const DominatorTree &DT;
+
+ const bool IsTrulyNegation;
+
+ SmallDenseMap<Value *, Value *> NegationsCache;
+
+ Negator(LLVMContext &C, const DataLayout &DL, AssumptionCache &AC,
+ const DominatorTree &DT, bool IsTrulyNegation);
+
+#if LLVM_ENABLE_STATS
+ unsigned NumValuesVisitedInThisNegator = 0;
+ ~Negator();
+#endif
+
+ using Result = std::pair<ArrayRef<Instruction *> /*NewInstructions*/,
+ Value * /*NegatedRoot*/>;
+
std::array<Value *, 2> getSortedOperandsOfBinOp(Instruction *I);
- LLVM_NODISCARD Value *visitImpl(Value *V, unsigned Depth);
-
- LLVM_NODISCARD Value *negate(Value *V, unsigned Depth);
-
- /// Recurse depth-first and attempt to sink the negation.
- /// FIXME: use worklist?
- LLVM_NODISCARD Optional<Result> run(Value *Root);
-
- Negator(const Negator &) = delete;
- Negator(Negator &&) = delete;
- Negator &operator=(const Negator &) = delete;
- Negator &operator=(Negator &&) = delete;
-
-public:
- /// Attempt to negate \p Root. Returns nullptr if negation can't be performed,
- /// otherwise returns negated value.
- LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root,
+ LLVM_NODISCARD Value *visitImpl(Value *V, unsigned Depth);
+
+ LLVM_NODISCARD Value *negate(Value *V, unsigned Depth);
+
+ /// Recurse depth-first and attempt to sink the negation.
+ /// FIXME: use worklist?
+ LLVM_NODISCARD Optional<Result> run(Value *Root);
+
+ Negator(const Negator &) = delete;
+ Negator(Negator &&) = delete;
+ Negator &operator=(const Negator &) = delete;
+ Negator &operator=(Negator &&) = delete;
+
+public:
+ /// Attempt to negate \p Root. Returns nullptr if negation can't be performed,
+ /// otherwise returns negated value.
+ LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root,
InstCombinerImpl &IC);
-};
-
-} // end namespace llvm
-
-#undef DEBUG_TYPE
-
-#endif // LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+};
+
+} // end namespace llvm
+
+#undef DEBUG_TYPE
+
+#endif // LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
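
For readers of the header above: callers are expected to reach the Negator through its static Negate() entry point, since the constructors and run() are private. Below is a minimal hypothetical sketch of such a call site, folding `sub X, Y` into `X + (-Y)`. The method name visitSubExample and the operand positions are illustrative assumptions, not part of this patch; only the Negate() signature declared above and standard llvm::BinaryOperator APIs are relied on.

  // Hypothetical call site (sketch only, not from this diff).
  Instruction *InstCombinerImpl::visitSubExample(BinaryOperator &I) {
    // Ask the Negator to sink a negation into the RHS; on success, rewrite
    // `sub X, Y` as `add X, NegY`, where NegY is the freshly built negation.
    if (Value *NegY = Negator::Negate(/*LHSIsZero=*/false, I.getOperand(1), *this))
      return BinaryOperator::CreateAdd(I.getOperand(0), NegY);
    return nullptr; // Negation was not possible or not profitable; keep the sub.
  }
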
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index b5a97a4e26..c7b5f6f780 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1,315 +1,315 @@
-//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for load, store and alloca.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
+//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for load, store and alloca.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-STATISTIC(NumDeadStore, "Number of dead stores eliminated");
-STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
-
-/// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
-/// pointer to an alloca. Ignore any reads of the pointer, return false if we
-/// see any stores or other unknown uses. If we see pointer arithmetic, keep
-/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
-/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
-/// the alloca, and if the source pointer is a pointer to a constant global, we
-/// can optimize this.
-static bool
-isOnlyCopiedFromConstantMemory(AAResults *AA,
- Value *V, MemTransferInst *&TheCopy,
- SmallVectorImpl<Instruction *> &ToDelete) {
- // We track lifetime intrinsics as we encounter them. If we decide to go
- // ahead and replace the value with the global, this lets the caller quickly
- // eliminate the markers.
-
- SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
- ValuesToInspect.emplace_back(V, false);
- while (!ValuesToInspect.empty()) {
- auto ValuePair = ValuesToInspect.pop_back_val();
- const bool IsOffset = ValuePair.second;
- for (auto &U : ValuePair.first->uses()) {
- auto *I = cast<Instruction>(U.getUser());
-
- if (auto *LI = dyn_cast<LoadInst>(I)) {
- // Ignore non-volatile loads, they are always ok.
- if (!LI->isSimple()) return false;
- continue;
- }
-
- if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
- // If uses of the bitcast are ok, we are ok.
- ValuesToInspect.emplace_back(I, IsOffset);
- continue;
- }
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- // If the GEP has all zero indices, it doesn't offset the pointer. If it
- // doesn't, it does.
- ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
- continue;
- }
-
- if (auto *Call = dyn_cast<CallBase>(I)) {
- // If this is the function being called then we treat it like a load and
- // ignore it.
- if (Call->isCallee(&U))
- continue;
-
- unsigned DataOpNo = Call->getDataOperandNo(&U);
- bool IsArgOperand = Call->isArgOperand(&U);
-
- // Inalloca arguments are clobbered by the call.
- if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
- return false;
-
- // If this is a readonly/readnone call site, then we know it is just a
- // load (but one that potentially returns the value itself), so we can
- // ignore it if we know that the value isn't captured.
- if (Call->onlyReadsMemory() &&
- (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
- continue;
-
- // If this is being passed as a byval argument, the caller is making a
- // copy, so it is only a read of the alloca.
- if (IsArgOperand && Call->isByValArgument(DataOpNo))
- continue;
- }
-
- // Lifetime intrinsics can be handled by the caller.
- if (I->isLifetimeStartOrEnd()) {
- assert(I->use_empty() && "Lifetime markers have no result to use!");
- ToDelete.push_back(I);
- continue;
- }
-
- // If this isn't our memcpy/memmove, reject it as something we can't
- // handle.
- MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
- if (!MI)
- return false;
-
- // If the transfer is using the alloca as a source of the transfer, then
- // ignore it since it is a load (unless the transfer is volatile).
- if (U.getOperandNo() == 1) {
- if (MI->isVolatile()) return false;
- continue;
- }
-
- // If we already have seen a copy, reject the second one.
- if (TheCopy) return false;
-
- // If the pointer has been offset from the start of the alloca, we can't
- // safely handle this.
- if (IsOffset) return false;
-
- // If the memintrinsic isn't using the alloca as the dest, reject it.
- if (U.getOperandNo() != 0) return false;
-
- // If the source of the memcpy/move is not a constant global, reject it.
- if (!AA->pointsToConstantMemory(MI->getSource()))
- return false;
-
- // Otherwise, the transform is safe. Remember the copy instruction.
- TheCopy = MI;
- }
- }
- return true;
-}
-
-/// isOnlyCopiedFromConstantMemory - Return the memcpy/memmove instruction if the
-/// specified alloca is only modified by a copy from a constant global, or null
-/// otherwise. If we can prove this, we can replace any uses of the alloca with
-/// uses of the global directly.
-static MemTransferInst *
-isOnlyCopiedFromConstantMemory(AAResults *AA,
- AllocaInst *AI,
- SmallVectorImpl<Instruction *> &ToDelete) {
- MemTransferInst *TheCopy = nullptr;
- if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
- return TheCopy;
- return nullptr;
-}
-
-/// Returns true if V is dereferenceable for size of alloca.
-static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
- const DataLayout &DL) {
- if (AI->isArrayAllocation())
- return false;
- uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
- if (!AllocaSize)
- return false;
- return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
- APInt(64, AllocaSize), DL);
-}
-
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
+
+/// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
+/// pointer to an alloca. Ignore any reads of the pointer, return false if we
+/// see any stores or other unknown uses. If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
+/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant global, we
+/// can optimize this.
+static bool
+isOnlyCopiedFromConstantMemory(AAResults *AA,
+ Value *V, MemTransferInst *&TheCopy,
+ SmallVectorImpl<Instruction *> &ToDelete) {
+ // We track lifetime intrinsics as we encounter them. If we decide to go
+ // ahead and replace the value with the global, this lets the caller quickly
+ // eliminate the markers.
+
+ SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
+ ValuesToInspect.emplace_back(V, false);
+ while (!ValuesToInspect.empty()) {
+ auto ValuePair = ValuesToInspect.pop_back_val();
+ const bool IsOffset = ValuePair.second;
+ for (auto &U : ValuePair.first->uses()) {
+ auto *I = cast<Instruction>(U.getUser());
+
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore non-volatile loads, they are always ok.
+ if (!LI->isSimple()) return false;
+ continue;
+ }
+
+ if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
+ // If uses of the bitcast are ok, we are ok.
+ ValuesToInspect.emplace_back(I, IsOffset);
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ // If the GEP has all zero indices, it doesn't offset the pointer. If it
+ // doesn't, it does.
+ ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
+ continue;
+ }
+
+ if (auto *Call = dyn_cast<CallBase>(I)) {
+ // If this is the function being called then we treat it like a load and
+ // ignore it.
+ if (Call->isCallee(&U))
+ continue;
+
+ unsigned DataOpNo = Call->getDataOperandNo(&U);
+ bool IsArgOperand = Call->isArgOperand(&U);
+
+ // Inalloca arguments are clobbered by the call.
+ if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
+ return false;
+
+ // If this is a readonly/readnone call site, then we know it is just a
+ // load (but one that potentially returns the value itself), so we can
+ // ignore it if we know that the value isn't captured.
+ if (Call->onlyReadsMemory() &&
+ (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
+ continue;
+
+ // If this is being passed as a byval argument, the caller is making a
+ // copy, so it is only a read of the alloca.
+ if (IsArgOperand && Call->isByValArgument(DataOpNo))
+ continue;
+ }
+
+ // Lifetime intrinsics can be handled by the caller.
+ if (I->isLifetimeStartOrEnd()) {
+ assert(I->use_empty() && "Lifetime markers have no result to use!");
+ ToDelete.push_back(I);
+ continue;
+ }
+
+ // If this isn't our memcpy/memmove, reject it as something we can't
+ // handle.
+ MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
+ if (!MI)
+ return false;
+
+ // If the transfer is using the alloca as a source of the transfer, then
+ // ignore it since it is a load (unless the transfer is volatile).
+ if (U.getOperandNo() == 1) {
+ if (MI->isVolatile()) return false;
+ continue;
+ }
+
+ // If we already have seen a copy, reject the second one.
+ if (TheCopy) return false;
+
+ // If the pointer has been offset from the start of the alloca, we can't
+ // safely handle this.
+ if (IsOffset) return false;
+
+ // If the memintrinsic isn't using the alloca as the dest, reject it.
+ if (U.getOperandNo() != 0) return false;
+
+ // If the source of the memcpy/move is not a constant global, reject it.
+ if (!AA->pointsToConstantMemory(MI->getSource()))
+ return false;
+
+ // Otherwise, the transform is safe. Remember the copy instruction.
+ TheCopy = MI;
+ }
+ }
+ return true;
+}
+
+/// isOnlyCopiedFromConstantMemory - Return the memcpy/memmove instruction if the
+/// specified alloca is only modified by a copy from a constant global, or null
+/// otherwise. If we can prove this, we can replace any uses of the alloca with
+/// uses of the global directly.
+static MemTransferInst *
+isOnlyCopiedFromConstantMemory(AAResults *AA,
+ AllocaInst *AI,
+ SmallVectorImpl<Instruction *> &ToDelete) {
+ MemTransferInst *TheCopy = nullptr;
+ if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
+ return TheCopy;
+ return nullptr;
+}
+
+/// Returns true if V is dereferenceable for size of alloca.
+static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
+ const DataLayout &DL) {
+ if (AI->isArrayAllocation())
+ return false;
+ uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
+ if (!AllocaSize)
+ return false;
+ return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
+ APInt(64, AllocaSize), DL);
+}
+
static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
AllocaInst &AI) {
- // Check for array size of 1 (scalar allocation).
- if (!AI.isArrayAllocation()) {
- // i32 1 is the canonical array size for scalar allocations.
- if (AI.getArraySize()->getType()->isIntegerTy(32))
- return nullptr;
-
- // Canonicalize it.
- return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
- }
-
- // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
- if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
- if (C->getValue().getActiveBits() <= 64) {
- Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
- AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
- New->setAlignment(AI.getAlign());
-
- // Scan to the end of the allocation instructions, to skip over a block of
- // allocas if possible...also skip interleaved debug info
- //
- BasicBlock::iterator It(New);
- while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
- ++It;
-
- // Now that I is pointing to the first non-allocation-inst in the block,
- // insert our getelementptr instruction...
- //
- Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
- Value *NullIdx = Constant::getNullValue(IdxTy);
- Value *Idx[2] = {NullIdx, NullIdx};
- Instruction *GEP = GetElementPtrInst::CreateInBounds(
- NewTy, New, Idx, New->getName() + ".sub");
- IC.InsertNewInstBefore(GEP, *It);
-
- // Now make everything use the getelementptr instead of the original
- // allocation.
- return IC.replaceInstUsesWith(AI, GEP);
- }
- }
-
- if (isa<UndefValue>(AI.getArraySize()))
- return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
-
- // Ensure that the alloca array size argument has type intptr_t, so that
- // any casting is exposed early.
- Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
- if (AI.getArraySize()->getType() != IntPtrTy) {
- Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
- return IC.replaceOperand(AI, 0, V);
- }
-
- return nullptr;
-}
-
-namespace {
-// If I and V are pointers in different address spaces, it is not allowed to
-// use replaceAllUsesWith since I and V have different types. A
-// non-target-specific transformation should not use addrspacecast on V since
-// the two address spaces may be disjoint depending on the target.
-//
-// This class chases down uses of the old pointer until reaching the load
-// instructions, then replaces the old pointer in the load instructions with
-// the new pointer. If during the chasing it sees bitcast or GEP, it will
-// create new bitcast or GEP with the new pointer and use them in the load
-// instruction.
-class PointerReplacer {
-public:
+ // Check for array size of 1 (scalar allocation).
+ if (!AI.isArrayAllocation()) {
+ // i32 1 is the canonical array size for scalar allocations.
+ if (AI.getArraySize()->getType()->isIntegerTy(32))
+ return nullptr;
+
+ // Canonicalize it.
+ return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
+ }
+
+ // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+ if (C->getValue().getActiveBits() <= 64) {
+ Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+ AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
+ New->setAlignment(AI.getAlign());
+
+ // Scan to the end of the allocation instructions, to skip over a block of
+ // allocas if possible...also skip interleaved debug info
+ //
+ BasicBlock::iterator It(New);
+ while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
+ ++It;
+
+ // Now that I is pointing to the first non-allocation-inst in the block,
+ // insert our getelementptr instruction...
+ //
+ Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ Value *NullIdx = Constant::getNullValue(IdxTy);
+ Value *Idx[2] = {NullIdx, NullIdx};
+ Instruction *GEP = GetElementPtrInst::CreateInBounds(
+ NewTy, New, Idx, New->getName() + ".sub");
+ IC.InsertNewInstBefore(GEP, *It);
+
+ // Now make everything use the getelementptr instead of the original
+ // allocation.
+ return IC.replaceInstUsesWith(AI, GEP);
+ }
+ }
+
+ if (isa<UndefValue>(AI.getArraySize()))
+ return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+ // Ensure that the alloca array size argument has type intptr_t, so that
+ // any casting is exposed early.
+ Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ if (AI.getArraySize()->getType() != IntPtrTy) {
+ Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
+ return IC.replaceOperand(AI, 0, V);
+ }
+
+ return nullptr;
+}
+
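
To make the canonicalization in simplifyAllocaArraySize() above concrete, here is the rough shape of the rewrite for a constant array size, shown as an illustrative comment block. The value names and the element count 4 are assumptions, not taken from this patch.

  // Sketch of the IR-level effect (illustrative only):
  //
  //   before:  %buf  = alloca i32, i32 4, align 4
  //   after:   %buf1 = alloca [4 x i32], align 4
  //            %buf1.sub = getelementptr inbounds [4 x i32], [4 x i32]* %buf1, i64 0, i64 0
  //
  // Every former user of %buf is pointed at %buf1.sub instead, which still has
  // type i32*, so the rewrite is transparent to the rest of the function.
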
+namespace {
+// If I and V are pointers in different address spaces, it is not allowed to
+// use replaceAllUsesWith since I and V have different types. A
+// non-target-specific transformation should not use addrspacecast on V since
+// the two address spaces may be disjoint depending on the target.
+//
+// This class chases down uses of the old pointer until reaching the load
+// instructions, then replaces the old pointer in the load instructions with
+// the new pointer. If during the chasing it sees bitcast or GEP, it will
+// create new bitcast or GEP with the new pointer and use them in the load
+// instruction.
+class PointerReplacer {
+public:
PointerReplacer(InstCombinerImpl &IC) : IC(IC) {}
bool collectUsers(Instruction &I);
- void replacePointer(Instruction &I, Value *V);
-
-private:
- void replace(Instruction *I);
- Value *getReplacement(Value *I);
-
+ void replacePointer(Instruction &I, Value *V);
+
+private:
+ void replace(Instruction *I);
+ Value *getReplacement(Value *I);
+
SmallSetVector<Instruction *, 4> Worklist;
- MapVector<Value *, Value *> WorkMap;
+ MapVector<Value *, Value *> WorkMap;
InstCombinerImpl &IC;
-};
-} // end anonymous namespace
-
+};
+} // end anonymous namespace
+
bool PointerReplacer::collectUsers(Instruction &I) {
- for (auto U : I.users()) {
+ for (auto U : I.users()) {
Instruction *Inst = cast<Instruction>(&*U);
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
if (Load->isVolatile())
return false;
Worklist.insert(Load);
- } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
+ } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
Worklist.insert(Inst);
if (!collectUsers(*Inst))
return false;
} else if (isa<MemTransferInst>(Inst)) {
Worklist.insert(Inst);
- } else {
+ } else {
LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
return false;
- }
- }
-
+ }
+ }
+
return true;
-}
-
+}
+
Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
-void PointerReplacer::replace(Instruction *I) {
- if (getReplacement(I))
- return;
-
- if (auto *LT = dyn_cast<LoadInst>(I)) {
- auto *V = getReplacement(LT->getPointerOperand());
- assert(V && "Operand not replaced");
+void PointerReplacer::replace(Instruction *I) {
+ if (getReplacement(I))
+ return;
+
+ if (auto *LT = dyn_cast<LoadInst>(I)) {
+ auto *V = getReplacement(LT->getPointerOperand());
+ assert(V && "Operand not replaced");
auto *NewI = new LoadInst(LT->getType(), V, "", LT->isVolatile(),
LT->getAlign(), LT->getOrdering(),
LT->getSyncScopeID());
- NewI->takeName(LT);
+ NewI->takeName(LT);
copyMetadataForLoad(*NewI, *LT);
- IC.InsertNewInstWith(NewI, *LT);
- IC.replaceInstUsesWith(*LT, NewI);
- WorkMap[LT] = NewI;
- } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- auto *V = getReplacement(GEP->getPointerOperand());
- assert(V && "Operand not replaced");
- SmallVector<Value *, 8> Indices;
- Indices.append(GEP->idx_begin(), GEP->idx_end());
- auto *NewI = GetElementPtrInst::Create(
- V->getType()->getPointerElementType(), V, Indices);
- IC.InsertNewInstWith(NewI, *GEP);
- NewI->takeName(GEP);
- WorkMap[GEP] = NewI;
- } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
- auto *V = getReplacement(BC->getOperand(0));
- assert(V && "Operand not replaced");
- auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
- V->getType()->getPointerAddressSpace());
- auto *NewI = new BitCastInst(V, NewT);
- IC.InsertNewInstWith(NewI, *BC);
- NewI->takeName(BC);
- WorkMap[BC] = NewI;
+ IC.InsertNewInstWith(NewI, *LT);
+ IC.replaceInstUsesWith(*LT, NewI);
+ WorkMap[LT] = NewI;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ auto *V = getReplacement(GEP->getPointerOperand());
+ assert(V && "Operand not replaced");
+ SmallVector<Value *, 8> Indices;
+ Indices.append(GEP->idx_begin(), GEP->idx_end());
+ auto *NewI = GetElementPtrInst::Create(
+ V->getType()->getPointerElementType(), V, Indices);
+ IC.InsertNewInstWith(NewI, *GEP);
+ NewI->takeName(GEP);
+ WorkMap[GEP] = NewI;
+ } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
+ auto *V = getReplacement(BC->getOperand(0));
+ assert(V && "Operand not replaced");
+ auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
+ V->getType()->getPointerAddressSpace());
+ auto *NewI = new BitCastInst(V, NewT);
+ IC.InsertNewInstWith(NewI, *BC);
+ NewI->takeName(BC);
+ WorkMap[BC] = NewI;
} else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
auto *SrcV = getReplacement(MemCpy->getRawSource());
// The pointer may appear in the destination of a copy, but we don't want to
@@ -332,83 +332,83 @@ void PointerReplacer::replace(Instruction *I) {
IC.eraseInstFromFunction(*MemCpy);
WorkMap[MemCpy] = NewI;
- } else {
- llvm_unreachable("should never reach here");
- }
-}
-
-void PointerReplacer::replacePointer(Instruction &I, Value *V) {
-#ifndef NDEBUG
- auto *PT = cast<PointerType>(I.getType());
- auto *NT = cast<PointerType>(V->getType());
- assert(PT != NT && PT->getElementType() == NT->getElementType() &&
- "Invalid usage");
-#endif
- WorkMap[&I] = V;
+ } else {
+ llvm_unreachable("should never reach here");
+ }
+}
+
+void PointerReplacer::replacePointer(Instruction &I, Value *V) {
+#ifndef NDEBUG
+ auto *PT = cast<PointerType>(I.getType());
+ auto *NT = cast<PointerType>(V->getType());
+ assert(PT != NT && PT->getElementType() == NT->getElementType() &&
+ "Invalid usage");
+#endif
+ WorkMap[&I] = V;
for (Instruction *Workitem : Worklist)
replace(Workitem);
-}
-
+}
+
Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
- if (auto *I = simplifyAllocaArraySize(*this, AI))
- return I;
-
- if (AI.getAllocatedType()->isSized()) {
- // Move all alloca's of zero byte objects to the entry block and merge them
- // together. Note that we only do this for alloca's, because malloc should
- // allocate and return a unique pointer, even for a zero byte allocation.
- if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
- // For a zero sized alloca there is no point in doing an array allocation.
- // This is helpful if the array size is a complicated expression not used
- // elsewhere.
- if (AI.isArrayAllocation())
- return replaceOperand(AI, 0,
- ConstantInt::get(AI.getArraySize()->getType(), 1));
-
- // Get the first instruction in the entry block.
- BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
- Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
- if (FirstInst != &AI) {
- // If the entry block doesn't start with a zero-size alloca then move
- // this one to the start of the entry block. There is no problem with
- // dominance as the array size was forced to a constant earlier already.
- AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
- if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
- DL.getTypeAllocSize(EntryAI->getAllocatedType())
- .getKnownMinSize() != 0) {
- AI.moveBefore(FirstInst);
- return &AI;
- }
-
- // Replace this zero-sized alloca with the one at the start of the entry
- // block after ensuring that the address will be aligned enough for both
- // types.
- const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
- EntryAI->setAlignment(MaxAlign);
- if (AI.getType() != EntryAI->getType())
- return new BitCastInst(EntryAI, AI.getType());
- return replaceInstUsesWith(AI, EntryAI);
- }
- }
- }
-
- // Check to see if this allocation is only modified by a memcpy/memmove from
- // a constant whose alignment is equal to or exceeds that of the allocation.
- // If this is the case, we can change all users to use the constant global
- // instead. This is commonly produced by the CFE by constructs like "void
- // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
- // read.
- SmallVector<Instruction *, 4> ToDelete;
- if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
+ if (auto *I = simplifyAllocaArraySize(*this, AI))
+ return I;
+
+ if (AI.getAllocatedType()->isSized()) {
+ // Move all alloca's of zero byte objects to the entry block and merge them
+ // together. Note that we only do this for alloca's, because malloc should
+ // allocate and return a unique pointer, even for a zero byte allocation.
+ if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
+ // For a zero sized alloca there is no point in doing an array allocation.
+ // This is helpful if the array size is a complicated expression not used
+ // elsewhere.
+ if (AI.isArrayAllocation())
+ return replaceOperand(AI, 0,
+ ConstantInt::get(AI.getArraySize()->getType(), 1));
+
+ // Get the first instruction in the entry block.
+ BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
+ Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
+ if (FirstInst != &AI) {
+ // If the entry block doesn't start with a zero-size alloca then move
+ // this one to the start of the entry block. There is no problem with
+ // dominance as the array size was forced to a constant earlier already.
+ AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
+ if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
+ DL.getTypeAllocSize(EntryAI->getAllocatedType())
+ .getKnownMinSize() != 0) {
+ AI.moveBefore(FirstInst);
+ return &AI;
+ }
+
+ // Replace this zero-sized alloca with the one at the start of the entry
+ // block after ensuring that the address will be aligned enough for both
+ // types.
+ const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
+ EntryAI->setAlignment(MaxAlign);
+ if (AI.getType() != EntryAI->getType())
+ return new BitCastInst(EntryAI, AI.getType());
+ return replaceInstUsesWith(AI, EntryAI);
+ }
+ }
+ }
+
+ // Check to see if this allocation is only modified by a memcpy/memmove from
+ // a constant whose alignment is equal to or exceeds that of the allocation.
+ // If this is the case, we can change all users to use the constant global
+ // instead. This is commonly produced by the CFE by constructs like "void
+ // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
+ // read.
+ SmallVector<Instruction *, 4> ToDelete;
+ if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
Value *TheSrc = Copy->getSource();
- Align AllocaAlign = AI.getAlign();
- Align SourceAlign = getOrEnforceKnownAlignment(
+ Align AllocaAlign = AI.getAlign();
+ Align SourceAlign = getOrEnforceKnownAlignment(
TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
- if (AllocaAlign <= SourceAlign &&
+ if (AllocaAlign <= SourceAlign &&
isDereferenceableForAllocaSize(TheSrc, &AI, DL)) {
- LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
- LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+ LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace);
if (AI.getType()->getAddressSpace() == SrcAddrSpace) {
@@ -416,13 +416,13 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
eraseInstFromFunction(*Delete);
Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
- Instruction *NewI = replaceInstUsesWith(AI, Cast);
- eraseInstFromFunction(*Copy);
- ++NumGlobalCopies;
- return NewI;
- }
-
- PointerReplacer PtrReplacer(*this);
+ Instruction *NewI = replaceInstUsesWith(AI, Cast);
+ eraseInstFromFunction(*Copy);
+ ++NumGlobalCopies;
+ return NewI;
+ }
+
+ PointerReplacer PtrReplacer(*this);
if (PtrReplacer.collectUsers(AI)) {
for (Instruction *Delete : ToDelete)
eraseInstFromFunction(*Delete);
@@ -431,161 +431,161 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
PtrReplacer.replacePointer(AI, Cast);
++NumGlobalCopies;
}
- }
- }
-
- // At last, use the generic allocation site handler to aggressively remove
- // unused allocas.
- return visitAllocSite(AI);
-}
-
-// Are we allowed to form an atomic load or store of this type?
-static bool isSupportedAtomicType(Type *Ty) {
- return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
-}
-
-/// Helper to combine a load to a new type.
-///
-/// This just does the work of combining a load to a new type. It handles
-/// metadata, etc., and returns the new instruction. The \c NewTy should be the
-/// loaded *value* type. This will convert it to a pointer, cast the operand to
-/// that pointer type, load it, etc.
-///
-/// Note that this will create all of the instructions with whatever insert
+ }
+ }
+
+ // At last, use the generic allocation site handler to aggressively remove
+ // unused allocas.
+ return visitAllocSite(AI);
+}
+
+// Are we allowed to form an atomic load or store of this type?
+static bool isSupportedAtomicType(Type *Ty) {
+ return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
+}
+
+/// Helper to combine a load to a new type.
+///
+/// This just does the work of combining a load to a new type. It handles
+/// metadata, etc., and returns the new instruction. The \c NewTy should be the
+/// loaded *value* type. This will convert it to a pointer, cast the operand to
+/// that pointer type, load it, etc.
+///
+/// Note that this will create all of the instructions with whatever insert
/// point the \c InstCombinerImpl currently is using.
LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
const Twine &Suffix) {
- assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
- "can't fold an atomic load to requested type");
-
- Value *Ptr = LI.getPointerOperand();
- unsigned AS = LI.getPointerAddressSpace();
- Value *NewPtr = nullptr;
- if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
- NewPtr->getType()->getPointerElementType() == NewTy &&
- NewPtr->getType()->getPointerAddressSpace() == AS))
- NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
-
- LoadInst *NewLoad = Builder.CreateAlignedLoad(
- NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
- NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- copyMetadataForLoad(*NewLoad, LI);
- return NewLoad;
-}
-
-/// Combine a store to a new type.
-///
-/// Returns the newly created store instruction.
+ assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
+ "can't fold an atomic load to requested type");
+
+ Value *Ptr = LI.getPointerOperand();
+ unsigned AS = LI.getPointerAddressSpace();
+ Value *NewPtr = nullptr;
+ if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
+ NewPtr->getType()->getPointerElementType() == NewTy &&
+ NewPtr->getType()->getPointerAddressSpace() == AS))
+ NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
+
+ LoadInst *NewLoad = Builder.CreateAlignedLoad(
+ NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
+ NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ copyMetadataForLoad(*NewLoad, LI);
+ return NewLoad;
+}
+
+/// Combine a store to a new type.
+///
+/// Returns the newly created store instruction.
static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
Value *V) {
- assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
- "can't fold an atomic store of requested type");
-
- Value *Ptr = SI.getPointerOperand();
- unsigned AS = SI.getPointerAddressSpace();
- SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- SI.getAllMetadata(MD);
-
- StoreInst *NewStore = IC.Builder.CreateAlignedStore(
- V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
- SI.getAlign(), SI.isVolatile());
- NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
- for (const auto &MDPair : MD) {
- unsigned ID = MDPair.first;
- MDNode *N = MDPair.second;
- // Note, essentially every kind of metadata should be preserved here! This
- // routine is supposed to clone a store instruction changing *only its
- // type*. The only metadata it makes sense to drop is metadata which is
- // invalidated when the pointer type changes. This should essentially
- // never be the case in LLVM, but we explicitly switch over only known
- // metadata to be conservatively correct. If you are adding metadata to
- // LLVM which pertains to stores, you almost certainly want to add it
- // here.
- switch (ID) {
- case LLVMContext::MD_dbg:
- case LLVMContext::MD_tbaa:
- case LLVMContext::MD_prof:
- case LLVMContext::MD_fpmath:
- case LLVMContext::MD_tbaa_struct:
- case LLVMContext::MD_alias_scope:
- case LLVMContext::MD_noalias:
- case LLVMContext::MD_nontemporal:
- case LLVMContext::MD_mem_parallel_loop_access:
- case LLVMContext::MD_access_group:
- // All of these directly apply.
- NewStore->setMetadata(ID, N);
- break;
- case LLVMContext::MD_invariant_load:
- case LLVMContext::MD_nonnull:
+ assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
+ "can't fold an atomic store of requested type");
+
+ Value *Ptr = SI.getPointerOperand();
+ unsigned AS = SI.getPointerAddressSpace();
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ SI.getAllMetadata(MD);
+
+ StoreInst *NewStore = IC.Builder.CreateAlignedStore(
+ V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
+ SI.getAlign(), SI.isVolatile());
+ NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ for (const auto &MDPair : MD) {
+ unsigned ID = MDPair.first;
+ MDNode *N = MDPair.second;
+ // Note, essentially every kind of metadata should be preserved here! This
+ // routine is supposed to clone a store instruction changing *only its
+ // type*. The only metadata it makes sense to drop is metadata which is
+ // invalidated when the pointer type changes. This should essentially
+ // never be the case in LLVM, but we explicitly switch over only known
+ // metadata to be conservatively correct. If you are adding metadata to
+ // LLVM which pertains to stores, you almost certainly want to add it
+ // here.
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_prof:
+ case LLVMContext::MD_fpmath:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ case LLVMContext::MD_access_group:
+ // All of these directly apply.
+ NewStore->setMetadata(ID, N);
+ break;
+ case LLVMContext::MD_invariant_load:
+ case LLVMContext::MD_nonnull:
case LLVMContext::MD_noundef:
- case LLVMContext::MD_range:
- case LLVMContext::MD_align:
- case LLVMContext::MD_dereferenceable:
- case LLVMContext::MD_dereferenceable_or_null:
- // These don't apply for stores.
- break;
- }
- }
-
- return NewStore;
-}
-
-/// Returns true if the instruction represents a minmax pattern like:
-/// select ((cmp load V1, load V2), V1, V2).
-static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
- assert(V->getType()->isPointerTy() && "Expected pointer type.");
- // Ignore possible ty* to ixx* bitcast.
+ case LLVMContext::MD_range:
+ case LLVMContext::MD_align:
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ // These don't apply for stores.
+ break;
+ }
+ }
+
+ return NewStore;
+}
+
+/// Returns true if the instruction represents a minmax pattern like:
+/// select ((cmp load V1, load V2), V1, V2).
+static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
+ assert(V->getType()->isPointerTy() && "Expected pointer type.");
+ // Ignore possible ty* to ixx* bitcast.
V = InstCombiner::peekThroughBitcast(V);
- // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
- // pattern.
- CmpInst::Predicate Pred;
- Instruction *L1;
- Instruction *L2;
- Value *LHS;
- Value *RHS;
- if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
- m_Value(LHS), m_Value(RHS))))
- return false;
- LoadTy = L1->getType();
- return (match(L1, m_Load(m_Specific(LHS))) &&
- match(L2, m_Load(m_Specific(RHS)))) ||
- (match(L1, m_Load(m_Specific(RHS))) &&
- match(L2, m_Load(m_Specific(LHS))));
-}
-
-/// Combine loads to match the type of their uses' value after looking
-/// through intervening bitcasts.
-///
-/// The core idea here is that if the result of a load is used in an operation,
-/// we should load the type most conducive to that operation. For example, when
-/// loading an integer and converting that immediately to a pointer, we should
-/// instead directly load a pointer.
-///
-/// However, this routine must never change the width of a load or the number of
-/// loads as that would introduce a semantic change. This combine is expected to
-/// be a semantic no-op which just allows loads to more closely model the types
-/// of their consuming operations.
-///
-/// Currently, we also refuse to change the precise type used for an atomic load
-/// or a volatile load. This is debatable, and might be reasonable to change
-/// later. However, it is risky in case some backend or other part of LLVM is
-/// relying on the exact type loaded to select appropriate atomic operations.
+ // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
+ // pattern.
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+ Value *LHS;
+ Value *RHS;
+ if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
+ m_Value(LHS), m_Value(RHS))))
+ return false;
+ LoadTy = L1->getType();
+ return (match(L1, m_Load(m_Specific(LHS))) &&
+ match(L2, m_Load(m_Specific(RHS)))) ||
+ (match(L1, m_Load(m_Specific(RHS))) &&
+ match(L2, m_Load(m_Specific(LHS))));
+}
+
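
As a reading aid for isMinMaxWithLoads() above, the IR shape it accepts looks roughly like the following; the value names are assumptions, not taken from this patch.

  // Illustrative match for isMinMaxWithLoads() (sketch only):
  //
  //   %a   = load i32, i32* %p
  //   %b   = load i32, i32* %q
  //   %cmp = icmp slt i32 %a, %b
  //   %min = select i1 %cmp, i32* %p, i32* %q
  //
  // The select chooses between the two pointers based on a comparison of the
  // values loaded through them; LoadTy is reported as i32 in this example.
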
+/// Combine loads to match the type of their uses' value after looking
+/// through intervening bitcasts.
+///
+/// The core idea here is that if the result of a load is used in an operation,
+/// we should load the type most conducive to that operation. For example, when
+/// loading an integer and converting that immediately to a pointer, we should
+/// instead directly load a pointer.
+///
+/// However, this routine must never change the width of a load or the number of
+/// loads as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows loads to more closely model the types
+/// of their consuming operations.
+///
+/// Currently, we also refuse to change the precise type used for an atomic load
+/// or a volatile load. This is debatable, and might be reasonable to change
+/// later. However, it is risky in case some backend or other part of LLVM is
+/// relying on the exact type loaded to select appropriate atomic operations.
static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
LoadInst &LI) {
- // FIXME: We could probably with some care handle both volatile and ordered
- // atomic loads here but it isn't clear that this is important.
- if (!LI.isUnordered())
- return nullptr;
-
- if (LI.use_empty())
- return nullptr;
-
- // swifterror values can't be bitcasted.
- if (LI.getPointerOperand()->isSwiftError())
- return nullptr;
-
- const DataLayout &DL = IC.getDataLayout();
-
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic loads here but it isn't clear that this is important.
+ if (!LI.isUnordered())
+ return nullptr;
+
+ if (LI.use_empty())
+ return nullptr;
+
+ // swifterror values can't be bitcasted.
+ if (LI.getPointerOperand()->isSwiftError())
+ return nullptr;
+
+ const DataLayout &DL = IC.getDataLayout();
+
// Fold away bit casts of the loaded value by loading the desired type.
// Note that we should not do this for pointer<->integer casts,
// because that would result in type punning.
@@ -597,253 +597,253 @@ static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
"load from x86_amx* should not happen!");
if (BC->getType()->isX86_AMXTy())
return nullptr;
- }
-
- if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
+ }
+
+ if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
if (CI->isNoopCast(DL) && LI.getType()->isPtrOrPtrVectorTy() ==
CI->getDestTy()->isPtrOrPtrVectorTy())
- if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
- LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
- CI->replaceAllUsesWith(NewLoad);
- IC.eraseInstFromFunction(*CI);
- return &LI;
- }
+ if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
+ LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
+ CI->replaceAllUsesWith(NewLoad);
+ IC.eraseInstFromFunction(*CI);
+ return &LI;
+ }
}
-
- // FIXME: We should also canonicalize loads of vectors when their elements are
- // cast to other types.
- return nullptr;
-}
-
+
+ // FIXME: We should also canonicalize loads of vectors when their elements are
+ // cast to other types.
+ return nullptr;
+}
+
static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
- // FIXME: We could probably with some care handle both volatile and atomic
- // loads here but it isn't clear that this is important.
- if (!LI.isSimple())
- return nullptr;
-
- Type *T = LI.getType();
- if (!T->isAggregateType())
- return nullptr;
-
- StringRef Name = LI.getName();
- assert(LI.getAlignment() && "Alignment must be set at this point");
-
- if (auto *ST = dyn_cast<StructType>(T)) {
- // If the struct only has one element, we unpack.
- auto NumElements = ST->getNumElements();
- if (NumElements == 1) {
- LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
- ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- NewLoad->setAAMetadata(AAMD);
- return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
- UndefValue::get(T), NewLoad, 0, Name));
- }
-
- // We don't want to break loads with padding here as we'd lose
- // the knowledge that padding exists for the rest of the pipeline.
- const DataLayout &DL = IC.getDataLayout();
- auto *SL = DL.getStructLayout(ST);
- if (SL->hasPadding())
- return nullptr;
-
- const auto Align = LI.getAlign();
- auto *Addr = LI.getPointerOperand();
- auto *IdxType = Type::getInt32Ty(T->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
-
- Value *V = UndefValue::get(T);
- for (unsigned i = 0; i < NumElements; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
- Name + ".elt");
- auto *L = IC.Builder.CreateAlignedLoad(
- ST->getElementType(i), Ptr,
- commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
- // Propagate AA metadata. It'll still be valid on the narrowed load.
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- L->setAAMetadata(AAMD);
- V = IC.Builder.CreateInsertValue(V, L, i);
- }
-
- V->setName(Name);
- return IC.replaceInstUsesWith(LI, V);
- }
-
- if (auto *AT = dyn_cast<ArrayType>(T)) {
- auto *ET = AT->getElementType();
- auto NumElements = AT->getNumElements();
- if (NumElements == 1) {
- LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- NewLoad->setAAMetadata(AAMD);
- return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
- UndefValue::get(T), NewLoad, 0, Name));
- }
-
- // Bail out if the array is too large. Ideally we would like to optimize
- // arrays of arbitrary size but this has a terrible impact on compile time.
- // The threshold here is chosen arbitrarily, maybe needs a little bit of
- // tuning.
- if (NumElements > IC.MaxArraySizeForCombine)
- return nullptr;
-
- const DataLayout &DL = IC.getDataLayout();
- auto EltSize = DL.getTypeAllocSize(ET);
- const auto Align = LI.getAlign();
-
- auto *Addr = LI.getPointerOperand();
- auto *IdxType = Type::getInt64Ty(T->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
-
- Value *V = UndefValue::get(T);
- uint64_t Offset = 0;
- for (uint64_t i = 0; i < NumElements; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
- Name + ".elt");
- auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
- commonAlignment(Align, Offset),
- Name + ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- L->setAAMetadata(AAMD);
- V = IC.Builder.CreateInsertValue(V, L, i);
- Offset += EltSize;
- }
-
- V->setName(Name);
- return IC.replaceInstUsesWith(LI, V);
- }
-
- return nullptr;
-}
-
-// If we can determine that all possible objects pointed to by the provided
-// pointer value are not only dereferenceable but also definitively less than
-// or equal to the provided maximum size, then return true. Otherwise, return
-// false (constant global values and allocas fall into this category).
-//
-// FIXME: This should probably live in ValueTracking (or similar).
-static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
- const DataLayout &DL) {
- SmallPtrSet<Value *, 4> Visited;
- SmallVector<Value *, 4> Worklist(1, V);
-
- do {
- Value *P = Worklist.pop_back_val();
- P = P->stripPointerCasts();
-
- if (!Visited.insert(P).second)
- continue;
-
- if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
- Worklist.push_back(SI->getTrueValue());
- Worklist.push_back(SI->getFalseValue());
- continue;
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(P)) {
+ // FIXME: We could probably with some care handle both volatile and atomic
+ // loads here but it isn't clear that this is important.
+ if (!LI.isSimple())
+ return nullptr;
+
+ Type *T = LI.getType();
+ if (!T->isAggregateType())
+ return nullptr;
+
+ StringRef Name = LI.getName();
+ assert(LI.getAlignment() && "Alignment must be set at this point");
+
+ if (auto *ST = dyn_cast<StructType>(T)) {
+ // If the struct only has one element, we unpack.
+ auto NumElements = ST->getNumElements();
+ if (NumElements == 1) {
+ LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
+ ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
+ }
+
+ // We don't want to break loads with padding here as we'd lose
+ // the knowledge that padding exists for the rest of the pipeline.
+ const DataLayout &DL = IC.getDataLayout();
+ auto *SL = DL.getStructLayout(ST);
+ if (SL->hasPadding())
+ return nullptr;
+
+ const auto Align = LI.getAlign();
+ auto *Addr = LI.getPointerOperand();
+ auto *IdxType = Type::getInt32Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ Value *V = UndefValue::get(T);
+ for (unsigned i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto *L = IC.Builder.CreateAlignedLoad(
+ ST->getElementType(i), Ptr,
+ commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
+ // Propagate AA metadata. It'll still be valid on the narrowed load.
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
+ }
+
+ V->setName(Name);
+ return IC.replaceInstUsesWith(LI, V);
+ }
+
+ if (auto *AT = dyn_cast<ArrayType>(T)) {
+ auto *ET = AT->getElementType();
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
+ LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
+ }
+
+ // Bail out if the array is too large. Ideally we would like to optimize
+ // arrays of arbitrary size but this has a terrible impact on compile time.
+ // The threshold here is chosen arbitrarily, maybe needs a little bit of
+ // tuning.
+ if (NumElements > IC.MaxArraySizeForCombine)
+ return nullptr;
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(ET);
+ const auto Align = LI.getAlign();
+
+ auto *Addr = LI.getPointerOperand();
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ Value *V = UndefValue::get(T);
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
+ commonAlignment(Align, Offset),
+ Name + ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
+ Offset += EltSize;
+ }
+
+ V->setName(Name);
+ return IC.replaceInstUsesWith(LI, V);
+ }
+
+ return nullptr;
+}
+
+// If we can determine that all possible objects pointed to by the provided
+// pointer value are not only dereferenceable but also definitively less than
+// or equal to the provided maximum size, then return true. Otherwise, return
+// false (constant global values and allocas fall into this category).
+//
+// FIXME: This should probably live in ValueTracking (or similar).
+static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
+ const DataLayout &DL) {
+ SmallPtrSet<Value *, 4> Visited;
+ SmallVector<Value *, 4> Worklist(1, V);
+
+ do {
+ Value *P = Worklist.pop_back_val();
+ P = P->stripPointerCasts();
+
+ if (!Visited.insert(P).second)
+ continue;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
+ Worklist.push_back(SI->getTrueValue());
+ Worklist.push_back(SI->getFalseValue());
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(P)) {
append_range(Worklist, PN->incoming_values());
- continue;
- }
-
- if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
- if (GA->isInterposable())
- return false;
- Worklist.push_back(GA->getAliasee());
- continue;
- }
-
- // If we know how big this object is, and it is less than MaxSize, continue
- // searching. Otherwise, return false.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
- if (!AI->getAllocatedType()->isSized())
- return false;
-
- ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
- if (!CS)
- return false;
-
- uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
- // Make sure that, even if the multiplication below would wrap as a
- // uint64_t, we still do the right thing.
- if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
- return false;
- continue;
- }
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
- if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
- return false;
-
- uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
- if (InitSize > MaxSize)
- return false;
- continue;
- }
-
- return false;
- } while (!Worklist.empty());
-
- return true;
-}
-
-// If we're indexing into an object of a known size, and the outer index is
-// not a constant, but having any value but zero would lead to undefined
-// behavior, replace it with zero.
-//
-// For example, if we have:
-// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
-// ...
-// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
-// ... = load i32* %arrayidx, align 4
-// Then we know that we can replace %x in the GEP with i64 0.
-//
-// FIXME: We could fold any GEP index to zero that would cause UB if it were
-// not zero. Currently, we only handle the first such index. Also, we could
-// also search through non-zero constant indices if we kept track of the
-// offsets those indices implied.
+ continue;
+ }
+
+ if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
+ if (GA->isInterposable())
+ return false;
+ Worklist.push_back(GA->getAliasee());
+ continue;
+ }
+
+ // If we know how big this object is, and it is less than MaxSize, continue
+ // searching. Otherwise, return false.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
+ if (!AI->getAllocatedType()->isSized())
+ return false;
+
+ ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
+ if (!CS)
+ return false;
+
+ uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
+ // Make sure that, even if the multiplication below would wrap as a
+ // uint64_t, we still do the right thing.
+ if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
+ return false;
+ continue;
+ }
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
+ return false;
+
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
+ if (InitSize > MaxSize)
+ return false;
+ continue;
+ }
+
+ return false;
+ } while (!Worklist.empty());
+
+ return true;
+}
+
+// If we're indexing into an object of a known size, and the outer index is
+// not a constant, but having any value but zero would lead to undefined
+// behavior, replace it with zero.
+//
+// For example, if we have:
+// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
+// ...
+// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
+// ... = load i32* %arrayidx, align 4
+// Then we know that we can replace %x in the GEP with i64 0.
+//
+// FIXME: We could fold any GEP index to zero that would cause UB if it were
+// not zero. Currently, we only handle the first such index. We could also
+// search through non-zero constant indices if we kept track of the
+// offsets those indices implied.
static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
GetElementPtrInst *GEPI, Instruction *MemI,
unsigned &Idx) {
- if (GEPI->getNumOperands() < 2)
- return false;
-
- // Find the first non-zero index of a GEP. If all indices are zero, return
- // one past the last index.
- auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
- unsigned I = 1;
- for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
- Value *V = GEPI->getOperand(I);
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
- if (CI->isZero())
- continue;
-
- break;
- }
-
- return I;
- };
-
- // Skip through initial 'zero' indices, and find the corresponding pointer
- // type. See if the next index is not a constant.
- Idx = FirstNZIdx(GEPI);
- if (Idx == GEPI->getNumOperands())
- return false;
- if (isa<Constant>(GEPI->getOperand(Idx)))
- return false;
-
- SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
+ if (GEPI->getNumOperands() < 2)
+ return false;
+
+ // Find the first non-zero index of a GEP. If all indices are zero, return
+ // one past the last index.
+ auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
+ unsigned I = 1;
+ for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
+ Value *V = GEPI->getOperand(I);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ if (CI->isZero())
+ continue;
+
+ break;
+ }
+
+ return I;
+ };
+
+ // Skip through initial 'zero' indices, and find the corresponding pointer
+ // type. See if the next index is not a constant.
+ Idx = FirstNZIdx(GEPI);
+ if (Idx == GEPI->getNumOperands())
+ return false;
+ if (isa<Constant>(GEPI->getOperand(Idx)))
+ return false;
+
+ SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
Type *SourceElementType = GEPI->getSourceElementType();
// Size information about scalable vectors is not available, so we cannot
// deduce whether indexing at n is undefined behaviour or not. Bail out.
@@ -851,720 +851,720 @@ static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
return false;
Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
- if (!AllocTy || !AllocTy->isSized())
- return false;
- const DataLayout &DL = IC.getDataLayout();
+ if (!AllocTy || !AllocTy->isSized())
+ return false;
+ const DataLayout &DL = IC.getDataLayout();
uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy).getFixedSize();
-
- // If there are more indices after the one we might replace with a zero, make
- // sure they're all non-negative. If any of them are negative, the overall
- // address being computed might be before the base address determined by the
- // first non-zero index.
- auto IsAllNonNegative = [&]() {
- for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
- KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
- if (Known.isNonNegative())
- continue;
- return false;
- }
-
- return true;
- };
-
- // FIXME: If the GEP is not inbounds, and there are extra indices after the
- // one we'll replace, those could cause the address computation to wrap
- // (rendering the IsAllNonNegative() check below insufficient). We can do
- // better, ignoring zero indices (and other indices we can prove small
- // enough not to wrap).
- if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
- return false;
-
- // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
- // also known to be dereferenceable.
- return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
- IsAllNonNegative();
-}
-
-// If we're indexing into an object with a variable index for the memory
-// access, but the object has only one element, we can assume that the index
-// will always be zero. If we replace the GEP, return it.
-template <typename T>
+
+ // If there are more indices after the one we might replace with a zero, make
+ // sure they're all non-negative. If any of them are negative, the overall
+ // address being computed might be before the base address determined by the
+ // first non-zero index.
+ auto IsAllNonNegative = [&]() {
+ for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
+ continue;
+ return false;
+ }
+
+ return true;
+ };
+
+ // FIXME: If the GEP is not inbounds, and there are extra indices after the
+ // one we'll replace, those could cause the address computation to wrap
+ // (rendering the IsAllNonNegative() check below insufficient). We can do
+ // better, ignoring zero indices (and other indices we can prove small
+ // enough not to wrap).
+ if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
+ return false;
+
+ // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
+ // also known to be dereferenceable.
+ return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
+ IsAllNonNegative();
+}
+
+// If we're indexing into an object with a variable index for the memory
+// access, but the object has only one element, we can assume that the index
+// will always be zero. If we replace the GEP, return it.
+template <typename T>
static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
- T &MemI) {
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
- unsigned Idx;
- if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
- Instruction *NewGEPI = GEPI->clone();
- NewGEPI->setOperand(Idx,
- ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
- NewGEPI->insertBefore(GEPI);
- MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
- return NewGEPI;
- }
- }
-
- return nullptr;
-}
-
-static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
- if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
- return false;
-
- auto *Ptr = SI.getPointerOperand();
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
- Ptr = GEPI->getOperand(0);
- return (isa<ConstantPointerNull>(Ptr) &&
- !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
-}
-
-static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
- const Value *GEPI0 = GEPI->getOperand(0);
- if (isa<ConstantPointerNull>(GEPI0) &&
- !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
- return true;
- }
- if (isa<UndefValue>(Op) ||
- (isa<ConstantPointerNull>(Op) &&
- !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
- return true;
- return false;
-}
-
+ T &MemI) {
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
+ unsigned Idx;
+ if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
+ Instruction *NewGEPI = GEPI->clone();
+ NewGEPI->setOperand(Idx,
+ ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
+ NewGEPI->insertBefore(GEPI);
+ MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
+ return NewGEPI;
+ }
+ }
+
+ return nullptr;
+}
+
+static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
+ if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
+ return false;
+
+ auto *Ptr = SI.getPointerOperand();
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
+ Ptr = GEPI->getOperand(0);
+ return (isa<ConstantPointerNull>(Ptr) &&
+ !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
+}
+
+static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+ const Value *GEPI0 = GEPI->getOperand(0);
+ if (isa<ConstantPointerNull>(GEPI0) &&
+ !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
+ return true;
+ }
+ if (isa<UndefValue>(Op) ||
+ (isa<ConstantPointerNull>(Op) &&
+ !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
+ return true;
+ return false;
+}
+
Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
- Value *Op = LI.getOperand(0);
-
- // Try to canonicalize the loaded type.
- if (Instruction *Res = combineLoadToOperationType(*this, LI))
- return Res;
-
- // Attempt to improve the alignment.
- Align KnownAlign = getOrEnforceKnownAlignment(
- Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
- if (KnownAlign > LI.getAlign())
- LI.setAlignment(KnownAlign);
-
- // Replace GEP indices if possible.
- if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
- Worklist.push(NewGEPI);
- return &LI;
- }
-
- if (Instruction *Res = unpackLoadToAggregate(*this, LI))
- return Res;
-
- // Do really simple store-to-load forwarding and load CSE, to catch cases
- // where there are several consecutive memory accesses to the same location,
- // separated by a few arithmetic operations.
- BasicBlock::iterator BBI(LI);
- bool IsLoadCSE = false;
- if (Value *AvailableVal = FindAvailableLoadedValue(
- &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
- if (IsLoadCSE)
- combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
-
- return replaceInstUsesWith(
- LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
- LI.getName() + ".cast"));
- }
-
- // None of the following transforms are legal for volatile/ordered atomic
- // loads. Most of them do apply for unordered atomics.
- if (!LI.isUnordered()) return nullptr;
-
- // load(gep null, ...) -> unreachable
- // load null/undef -> unreachable
-  // TODO: Consider a target hook for valid address spaces for these transforms.
- if (canSimplifyNullLoadOrGEP(LI, Op)) {
- // Insert a new store to null instruction before the load to indicate
- // that this code is not reachable. We do this instead of inserting
- // an unreachable instruction directly because we cannot modify the
- // CFG.
- StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
- Constant::getNullValue(Op->getType()), &LI);
- SI->setDebugLoc(LI.getDebugLoc());
- return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
- }
-
- if (Op->hasOneUse()) {
- // Change select and PHI nodes to select values instead of addresses: this
-    // helps alias analysis out a lot, allows many other simplifications, and
- // exposes redundancy in the code.
- //
- // Note that we cannot do the transformation unless we know that the
- // introduced loads cannot trap! Something like this is valid as long as
- // the condition is always false: load (select bool %C, int* null, int* %G),
- // but it would not be valid if we transformed it to load from null
- // unconditionally.
- //
- if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
- // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
- Align Alignment = LI.getAlign();
- if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
- Alignment, DL, SI) &&
- isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
- Alignment, DL, SI)) {
- LoadInst *V1 =
- Builder.CreateLoad(LI.getType(), SI->getOperand(1),
- SI->getOperand(1)->getName() + ".val");
- LoadInst *V2 =
- Builder.CreateLoad(LI.getType(), SI->getOperand(2),
- SI->getOperand(2)->getName() + ".val");
- assert(LI.isUnordered() && "implied by above");
- V1->setAlignment(Alignment);
- V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- V2->setAlignment(Alignment);
- V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- return SelectInst::Create(SI->getCondition(), V1, V2);
- }
-
- // load (select (cond, null, P)) -> load P
- if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
- !NullPointerIsDefined(SI->getFunction(),
- LI.getPointerAddressSpace()))
- return replaceOperand(LI, 0, SI->getOperand(2));
-
- // load (select (cond, P, null)) -> load P
- if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
- !NullPointerIsDefined(SI->getFunction(),
- LI.getPointerAddressSpace()))
- return replaceOperand(LI, 0, SI->getOperand(1));
- }
- }
- return nullptr;
-}
-
-/// Look for extractelement/insertvalue sequence that acts like a bitcast.
-///
-/// \returns underlying value that was "cast", or nullptr otherwise.
-///
-/// For example, if we have:
-///
-/// %E0 = extractelement <2 x double> %U, i32 0
-/// %V0 = insertvalue [2 x double] undef, double %E0, 0
-/// %E1 = extractelement <2 x double> %U, i32 1
-/// %V1 = insertvalue [2 x double] %V0, double %E1, 1
-///
-/// and the layout of a <2 x double> is isomorphic to a [2 x double],
-/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
-/// Note that %U may contain non-undef values where %V1 has undef.
+ Value *Op = LI.getOperand(0);
+
+ // Try to canonicalize the loaded type.
+ if (Instruction *Res = combineLoadToOperationType(*this, LI))
+ return Res;
+
+ // Attempt to improve the alignment.
+ Align KnownAlign = getOrEnforceKnownAlignment(
+ Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
+ if (KnownAlign > LI.getAlign())
+ LI.setAlignment(KnownAlign);
+
+ // Replace GEP indices if possible.
+ if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
+ Worklist.push(NewGEPI);
+ return &LI;
+ }
+
+ if (Instruction *Res = unpackLoadToAggregate(*this, LI))
+ return Res;
+
+ // Do really simple store-to-load forwarding and load CSE, to catch cases
+ // where there are several consecutive memory accesses to the same location,
+ // separated by a few arithmetic operations.
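+  //
+  // A minimal sketch of the kind of pattern caught here (illustrative only):
+  //   store i32 %x, i32* %p
+  //   %t = add i32 %y, 1        ; unrelated arithmetic, no memory access
+  //   %v = load i32, i32* %p    ; %v can be replaced by %x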
+ BasicBlock::iterator BBI(LI);
+ bool IsLoadCSE = false;
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ if (IsLoadCSE)
+ combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
+
+ return replaceInstUsesWith(
+ LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+ LI.getName() + ".cast"));
+ }
+
+ // None of the following transforms are legal for volatile/ordered atomic
+ // loads. Most of them do apply for unordered atomics.
+ if (!LI.isUnordered()) return nullptr;
+
+ // load(gep null, ...) -> unreachable
+ // load null/undef -> unreachable
+  // TODO: Consider a target hook for valid address spaces for these transforms.
+ if (canSimplifyNullLoadOrGEP(LI, Op)) {
+ // Insert a new store to null instruction before the load to indicate
+ // that this code is not reachable. We do this instead of inserting
+ // an unreachable instruction directly because we cannot modify the
+ // CFG.
+ StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
+ Constant::getNullValue(Op->getType()), &LI);
+ SI->setDebugLoc(LI.getDebugLoc());
+ return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ }
+
+ if (Op->hasOneUse()) {
+ // Change select and PHI nodes to select values instead of addresses: this
+    // helps alias analysis out a lot, allows many other simplifications, and
+ // exposes redundancy in the code.
+ //
+ // Note that we cannot do the transformation unless we know that the
+ // introduced loads cannot trap! Something like this is valid as long as
+ // the condition is always false: load (select bool %C, int* null, int* %G),
+ // but it would not be valid if we transformed it to load from null
+ // unconditionally.
+ //
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
+ // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
+ Align Alignment = LI.getAlign();
+ if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
+ Alignment, DL, SI) &&
+ isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
+ Alignment, DL, SI)) {
+ LoadInst *V1 =
+ Builder.CreateLoad(LI.getType(), SI->getOperand(1),
+ SI->getOperand(1)->getName() + ".val");
+ LoadInst *V2 =
+ Builder.CreateLoad(LI.getType(), SI->getOperand(2),
+ SI->getOperand(2)->getName() + ".val");
+ assert(LI.isUnordered() && "implied by above");
+ V1->setAlignment(Alignment);
+ V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ V2->setAlignment(Alignment);
+ V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ return SelectInst::Create(SI->getCondition(), V1, V2);
+ }
+
+ // load (select (cond, null, P)) -> load P
+ if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace()))
+ return replaceOperand(LI, 0, SI->getOperand(2));
+
+ // load (select (cond, P, null)) -> load P
+ if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace()))
+ return replaceOperand(LI, 0, SI->getOperand(1));
+ }
+ }
+ return nullptr;
+}
+
+/// Look for extractelement/insertvalue sequence that acts like a bitcast.
+///
+/// \returns underlying value that was "cast", or nullptr otherwise.
+///
+/// For example, if we have:
+///
+/// %E0 = extractelement <2 x double> %U, i32 0
+/// %V0 = insertvalue [2 x double] undef, double %E0, 0
+/// %E1 = extractelement <2 x double> %U, i32 1
+/// %V1 = insertvalue [2 x double] %V0, double %E1, 1
+///
+/// and the layout of a <2 x double> is isomorphic to a [2 x double],
+/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
+/// Note that %U may contain non-undef values where %V1 has undef.
static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
- Value *U = nullptr;
- while (auto *IV = dyn_cast<InsertValueInst>(V)) {
- auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
- if (!E)
- return nullptr;
- auto *W = E->getVectorOperand();
- if (!U)
- U = W;
- else if (U != W)
- return nullptr;
- auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
- if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
- return nullptr;
- V = IV->getAggregateOperand();
- }
- if (!isa<UndefValue>(V) ||!U)
- return nullptr;
-
- auto *UT = cast<VectorType>(U->getType());
- auto *VT = V->getType();
- // Check that types UT and VT are bitwise isomorphic.
- const auto &DL = IC.getDataLayout();
- if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
- return nullptr;
- }
- if (auto *AT = dyn_cast<ArrayType>(VT)) {
+ Value *U = nullptr;
+ while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+ auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+ if (!E)
+ return nullptr;
+ auto *W = E->getVectorOperand();
+ if (!U)
+ U = W;
+ else if (U != W)
+ return nullptr;
+ auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
+ if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+ return nullptr;
+ V = IV->getAggregateOperand();
+ }
+ if (!isa<UndefValue>(V) ||!U)
+ return nullptr;
+
+ auto *UT = cast<VectorType>(U->getType());
+ auto *VT = V->getType();
+ // Check that types UT and VT are bitwise isomorphic.
+ const auto &DL = IC.getDataLayout();
+ if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
+ return nullptr;
+ }
+ if (auto *AT = dyn_cast<ArrayType>(VT)) {
if (AT->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
- return nullptr;
- } else {
- auto *ST = cast<StructType>(VT);
+ return nullptr;
+ } else {
+ auto *ST = cast<StructType>(VT);
if (ST->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
- return nullptr;
- for (const auto *EltT : ST->elements()) {
- if (EltT != UT->getElementType())
- return nullptr;
- }
- }
- return U;
-}
-
-/// Combine stores to match the type of value being stored.
-///
-/// The core idea here is that the memory does not have any intrinsic type and
-/// where we can we should match the type of a store to the type of value being
-/// stored.
-///
-/// However, this routine must never change the width of a store or the number of
-/// stores as that would introduce a semantic change. This combine is expected to
-/// be a semantic no-op which just allows stores to more closely model the types
-/// of their incoming values.
-///
-/// Currently, we also refuse to change the precise type used for an atomic or
-/// volatile store. This is debatable, and might be reasonable to change later.
-/// However, it is risky in case some backend or other part of LLVM is relying
-/// on the exact type stored to select appropriate atomic operations.
-///
-/// \returns true if the store was successfully combined away. This indicates
-/// the caller must erase the store instruction. We have to let the caller erase
-/// the store instruction as otherwise there is no way to signal whether it was
-/// combined or not: IC.EraseInstFromFunction returns a null pointer.
+ return nullptr;
+ for (const auto *EltT : ST->elements()) {
+ if (EltT != UT->getElementType())
+ return nullptr;
+ }
+ }
+ return U;
+}
+
+/// Combine stores to match the type of value being stored.
+///
+/// The core idea here is that the memory does not have any intrinsic type and
+/// where we can we should match the type of a store to the type of value being
+/// stored.
+///
+/// However, this routine must never change the width of a store or the number of
+/// stores as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows stores to more closely model the types
+/// of their incoming values.
+///
+/// Currently, we also refuse to change the precise type used for an atomic or
+/// volatile store. This is debatable, and might be reasonable to change later.
+/// However, it is risky in case some backend or other part of LLVM is relying
+/// on the exact type stored to select appropriate atomic operations.
+///
+/// \returns true if the store was successfully combined away. This indicates
+/// the caller must erase the store instruction. We have to let the caller erase
+/// the store instruction as otherwise there is no way to signal whether it was
+/// combined or not: IC.EraseInstFromFunction returns a null pointer.
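+///
+/// A minimal sketch (illustrative only): a store of "bitcast i32 %x to float"
+/// through a float* pointer is rewritten to store the original i32 %x through
+/// a correspondingly cast pointer, leaving width and store count unchanged.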
static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
- // FIXME: We could probably with some care handle both volatile and ordered
- // atomic stores here but it isn't clear that this is important.
- if (!SI.isUnordered())
- return false;
-
- // swifterror values can't be bitcasted.
- if (SI.getPointerOperand()->isSwiftError())
- return false;
-
- Value *V = SI.getValueOperand();
-
- // Fold away bit casts of the stored value by storing the original type.
- if (auto *BC = dyn_cast<BitCastInst>(V)) {
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic stores here but it isn't clear that this is important.
+ if (!SI.isUnordered())
+ return false;
+
+ // swifterror values can't be bitcasted.
+ if (SI.getPointerOperand()->isSwiftError())
+ return false;
+
+ Value *V = SI.getValueOperand();
+
+ // Fold away bit casts of the stored value by storing the original type.
+ if (auto *BC = dyn_cast<BitCastInst>(V)) {
assert(!BC->getType()->isX86_AMXTy() &&
"store to x86_amx* should not happen!");
- V = BC->getOperand(0);
+ V = BC->getOperand(0);
// Don't transform when the type is x86_amx, it makes the pass that lower
// x86_amx type happy.
if (V->getType()->isX86_AMXTy())
return false;
- if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
- combineStoreToNewValue(IC, SI, V);
- return true;
- }
- }
-
- if (Value *U = likeBitCastFromVector(IC, V))
- if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
- combineStoreToNewValue(IC, SI, U);
- return true;
- }
-
- // FIXME: We should also canonicalize stores of vectors when their elements
- // are cast to other types.
- return false;
-}
-
+ if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+ }
+
+ if (Value *U = likeBitCastFromVector(IC, V))
+ if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
+ combineStoreToNewValue(IC, SI, U);
+ return true;
+ }
+
+ // FIXME: We should also canonicalize stores of vectors when their elements
+ // are cast to other types.
+ return false;
+}
+
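+/// Break a store of an aggregate value into per-element stores. This handles
+/// single-element structs and arrays directly, padding-free structs, and
+/// arrays of at most MaxArraySizeForCombine elements; anything else is left
+/// unchanged.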
static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
- // FIXME: We could probably with some care handle both volatile and atomic
- // stores here but it isn't clear that this is important.
- if (!SI.isSimple())
- return false;
-
- Value *V = SI.getValueOperand();
- Type *T = V->getType();
-
- if (!T->isAggregateType())
- return false;
-
- if (auto *ST = dyn_cast<StructType>(T)) {
-    // If the struct has only one element, we unpack.
- unsigned Count = ST->getNumElements();
- if (Count == 1) {
- V = IC.Builder.CreateExtractValue(V, 0);
- combineStoreToNewValue(IC, SI, V);
- return true;
- }
-
-    // We don't want to break up stores with padding here, as we'd lose
- // the knowledge that padding exists for the rest of the pipeline.
- const DataLayout &DL = IC.getDataLayout();
- auto *SL = DL.getStructLayout(ST);
- if (SL->hasPadding())
- return false;
-
- const auto Align = SI.getAlign();
-
- SmallString<16> EltName = V->getName();
- EltName += ".elt";
- auto *Addr = SI.getPointerOperand();
- SmallString<16> AddrName = Addr->getName();
- AddrName += ".repack";
-
- auto *IdxType = Type::getInt32Ty(ST->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
- for (unsigned i = 0; i < Count; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
- AddrName);
- auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
- auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
- llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
- AAMDNodes AAMD;
- SI.getAAMetadata(AAMD);
- NS->setAAMetadata(AAMD);
- }
-
- return true;
- }
-
- if (auto *AT = dyn_cast<ArrayType>(T)) {
-    // If the array has only one element, we unpack.
- auto NumElements = AT->getNumElements();
- if (NumElements == 1) {
- V = IC.Builder.CreateExtractValue(V, 0);
- combineStoreToNewValue(IC, SI, V);
- return true;
- }
-
- // Bail out if the array is too large. Ideally we would like to optimize
- // arrays of arbitrary size but this has a terrible impact on compile time.
- // The threshold here is chosen arbitrarily, maybe needs a little bit of
- // tuning.
- if (NumElements > IC.MaxArraySizeForCombine)
- return false;
-
- const DataLayout &DL = IC.getDataLayout();
- auto EltSize = DL.getTypeAllocSize(AT->getElementType());
- const auto Align = SI.getAlign();
-
- SmallString<16> EltName = V->getName();
- EltName += ".elt";
- auto *Addr = SI.getPointerOperand();
- SmallString<16> AddrName = Addr->getName();
- AddrName += ".repack";
-
- auto *IdxType = Type::getInt64Ty(T->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
-
- uint64_t Offset = 0;
- for (uint64_t i = 0; i < NumElements; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
- AddrName);
- auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
- auto EltAlign = commonAlignment(Align, Offset);
- Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
- AAMDNodes AAMD;
- SI.getAAMetadata(AAMD);
- NS->setAAMetadata(AAMD);
- Offset += EltSize;
- }
-
- return true;
- }
-
- return false;
-}
-
-/// equivalentAddressValues - Test if A and B will obviously have the same
-/// value. This includes recognizing that %t0 and %t1 will have the same
-/// value in code like this:
-/// %t0 = getelementptr \@a, 0, 3
-/// store i32 0, i32* %t0
-/// %t1 = getelementptr \@a, 0, 3
-/// %t2 = load i32* %t1
-///
-static bool equivalentAddressValues(Value *A, Value *B) {
- // Test if the values are trivially equivalent.
- if (A == B) return true;
-
-  // Test if the values come from identical arithmetic instructions.
-  // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
-  // it's only used to compare two uses within the same basic block, which
- // means that they'll always either have the same value or one of them
- // will have an undefined value.
- if (isa<BinaryOperator>(A) ||
- isa<CastInst>(A) ||
- isa<PHINode>(A) ||
- isa<GetElementPtrInst>(A))
- if (Instruction *BI = dyn_cast<Instruction>(B))
- if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
- return true;
-
- // Otherwise they may not be equivalent.
- return false;
-}
-
-/// Converts store (bitcast (load (bitcast (select ...)))) to
-/// store (load (select ...)), where select is minmax:
-/// select ((cmp load V1, load V2), V1, V2).
+ // FIXME: We could probably with some care handle both volatile and atomic
+ // stores here but it isn't clear that this is important.
+ if (!SI.isSimple())
+ return false;
+
+ Value *V = SI.getValueOperand();
+ Type *T = V->getType();
+
+ if (!T->isAggregateType())
+ return false;
+
+ if (auto *ST = dyn_cast<StructType>(T)) {
+    // If the struct has only one element, we unpack.
+ unsigned Count = ST->getNumElements();
+ if (Count == 1) {
+ V = IC.Builder.CreateExtractValue(V, 0);
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+
+    // We don't want to break up stores with padding here, as we'd lose
+ // the knowledge that padding exists for the rest of the pipeline.
+ const DataLayout &DL = IC.getDataLayout();
+ auto *SL = DL.getStructLayout(ST);
+ if (SL->hasPadding())
+ return false;
+
+ const auto Align = SI.getAlign();
+
+ SmallString<16> EltName = V->getName();
+ EltName += ".elt";
+ auto *Addr = SI.getPointerOperand();
+ SmallString<16> AddrName = Addr->getName();
+ AddrName += ".repack";
+
+ auto *IdxType = Type::getInt32Ty(ST->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+ for (unsigned i = 0; i < Count; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+ auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
+ llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
+ }
+
+ return true;
+ }
+
+ if (auto *AT = dyn_cast<ArrayType>(T)) {
+    // If the array has only one element, we unpack.
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
+ V = IC.Builder.CreateExtractValue(V, 0);
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+
+ // Bail out if the array is too large. Ideally we would like to optimize
+ // arrays of arbitrary size but this has a terrible impact on compile time.
+ // The threshold here is chosen arbitrarily, maybe needs a little bit of
+ // tuning.
+ if (NumElements > IC.MaxArraySizeForCombine)
+ return false;
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(AT->getElementType());
+ const auto Align = SI.getAlign();
+
+ SmallString<16> EltName = V->getName();
+ EltName += ".elt";
+ auto *Addr = SI.getPointerOperand();
+ SmallString<16> AddrName = Addr->getName();
+ AddrName += ".repack";
+
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+ auto EltAlign = commonAlignment(Align, Offset);
+ Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
+ Offset += EltSize;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+/// %t0 = getelementptr \@a, 0, 3
+/// store i32 0, i32* %t0
+/// %t1 = getelementptr \@a, 0, 3
+/// %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+ // Test if the values are trivially equivalent.
+ if (A == B) return true;
+
+  // Test if the values come from identical arithmetic instructions.
+  // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
+  // it's only used to compare two uses within the same basic block, which
+ // means that they'll always either have the same value or one of them
+ // will have an undefined value.
+ if (isa<BinaryOperator>(A) ||
+ isa<CastInst>(A) ||
+ isa<PHINode>(A) ||
+ isa<GetElementPtrInst>(A))
+ if (Instruction *BI = dyn_cast<Instruction>(B))
+ if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
+ return true;
+
+ // Otherwise they may not be equivalent.
+ return false;
+}
+
+/// Converts store (bitcast (load (bitcast (select ...)))) to
+/// store (load (select ...)), where select is minmax:
+/// select ((cmp load V1, load V2), V1, V2).
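+///
+/// A rough sketch (illustrative only): a copy performed through i64-typed
+/// bitcasts of a pointer that actually feeds a float min/max is rewritten to
+/// load and store the float type directly, dropping the intermediate bitcasts.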
static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC,
- StoreInst &SI) {
- // bitcast?
- if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
- return false;
- // load? integer?
- Value *LoadAddr;
- if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
- return false;
- auto *LI = cast<LoadInst>(SI.getValueOperand());
- if (!LI->getType()->isIntegerTy())
- return false;
- Type *CmpLoadTy;
- if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
- return false;
-
- // Make sure the type would actually change.
- // This condition can be hit with chains of bitcasts.
- if (LI->getType() == CmpLoadTy)
- return false;
-
- // Make sure we're not changing the size of the load/store.
- const auto &DL = IC.getDataLayout();
- if (DL.getTypeStoreSizeInBits(LI->getType()) !=
- DL.getTypeStoreSizeInBits(CmpLoadTy))
- return false;
-
- if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
- auto *SI = dyn_cast<StoreInst>(U);
- return SI && SI->getPointerOperand() != LI &&
+ StoreInst &SI) {
+ // bitcast?
+ if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
+ return false;
+ // load? integer?
+ Value *LoadAddr;
+ if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
+ return false;
+ auto *LI = cast<LoadInst>(SI.getValueOperand());
+ if (!LI->getType()->isIntegerTy())
+ return false;
+ Type *CmpLoadTy;
+ if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
+ return false;
+
+ // Make sure the type would actually change.
+ // This condition can be hit with chains of bitcasts.
+ if (LI->getType() == CmpLoadTy)
+ return false;
+
+ // Make sure we're not changing the size of the load/store.
+ const auto &DL = IC.getDataLayout();
+ if (DL.getTypeStoreSizeInBits(LI->getType()) !=
+ DL.getTypeStoreSizeInBits(CmpLoadTy))
+ return false;
+
+ if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ return SI && SI->getPointerOperand() != LI &&
InstCombiner::peekThroughBitcast(SI->getPointerOperand()) !=
LoadAddr &&
- !SI->getPointerOperand()->isSwiftError();
- }))
- return false;
-
- IC.Builder.SetInsertPoint(LI);
- LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
- // Replace all the stores with stores of the newly loaded value.
- for (auto *UI : LI->users()) {
- auto *USI = cast<StoreInst>(UI);
- IC.Builder.SetInsertPoint(USI);
- combineStoreToNewValue(IC, *USI, NewLI);
- }
- IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
- IC.eraseInstFromFunction(*LI);
- return true;
-}
-
+ !SI->getPointerOperand()->isSwiftError();
+ }))
+ return false;
+
+ IC.Builder.SetInsertPoint(LI);
+ LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
+ // Replace all the stores with stores of the newly loaded value.
+ for (auto *UI : LI->users()) {
+ auto *USI = cast<StoreInst>(UI);
+ IC.Builder.SetInsertPoint(USI);
+ combineStoreToNewValue(IC, *USI, NewLI);
+ }
+ IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
+ IC.eraseInstFromFunction(*LI);
+ return true;
+}
+
Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
- Value *Val = SI.getOperand(0);
- Value *Ptr = SI.getOperand(1);
-
- // Try to canonicalize the stored type.
- if (combineStoreToValueType(*this, SI))
- return eraseInstFromFunction(SI);
-
- // Attempt to improve the alignment.
- const Align KnownAlign = getOrEnforceKnownAlignment(
- Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
- if (KnownAlign > SI.getAlign())
- SI.setAlignment(KnownAlign);
-
- // Try to canonicalize the stored type.
- if (unpackStoreToAggregate(*this, SI))
- return eraseInstFromFunction(SI);
-
- if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
- return eraseInstFromFunction(SI);
-
- // Replace GEP indices if possible.
- if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
- Worklist.push(NewGEPI);
- return &SI;
- }
-
- // Don't hack volatile/ordered stores.
- // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
- if (!SI.isUnordered()) return nullptr;
-
- // If the RHS is an alloca with a single use, zapify the store, making the
- // alloca dead.
- if (Ptr->hasOneUse()) {
- if (isa<AllocaInst>(Ptr))
- return eraseInstFromFunction(SI);
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
- if (isa<AllocaInst>(GEP->getOperand(0))) {
- if (GEP->getOperand(0)->hasOneUse())
- return eraseInstFromFunction(SI);
- }
- }
- }
-
- // If we have a store to a location which is known constant, we can conclude
- // that the store must be storing the constant value (else the memory
- // wouldn't be constant), and this must be a noop.
- if (AA->pointsToConstantMemory(Ptr))
- return eraseInstFromFunction(SI);
-
- // Do really simple DSE, to catch cases where there are several consecutive
- // stores to the same location, separated by a few arithmetic operations. This
- // situation often occurs with bitfield accesses.
- BasicBlock::iterator BBI(SI);
- for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
- --ScanInsts) {
- --BBI;
- // Don't count debug info directives, lest they affect codegen,
- // and we skip pointer-to-pointer bitcasts, which are NOPs.
- if (isa<DbgInfoIntrinsic>(BBI) ||
- (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
- ScanInsts++;
- continue;
- }
-
- if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
- // Prev store isn't volatile, and stores to the same location?
- if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
- SI.getOperand(1))) {
- ++NumDeadStore;
- // Manually add back the original store to the worklist now, so it will
- // be processed after the operands of the removed store, as this may
- // expose additional DSE opportunities.
- Worklist.push(&SI);
- eraseInstFromFunction(*PrevSI);
- return nullptr;
- }
- break;
- }
-
- // If this is a load, we have to stop. However, if the loaded value is from
- // the pointer we're loading and is producing the pointer we're storing,
- // then *this* store is dead (X = load P; store X -> P).
- if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
- if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
- assert(SI.isUnordered() && "can't eliminate ordering operation");
- return eraseInstFromFunction(SI);
- }
-
- // Otherwise, this is a load from some other location. Stores before it
- // may not be dead.
- break;
- }
-
- // Don't skip over loads, throws or things that can modify memory.
- if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
- break;
- }
-
- // store X, null -> turns into 'unreachable' in SimplifyCFG
- // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
- if (canSimplifyNullStoreOrGEP(SI)) {
- if (!isa<UndefValue>(Val))
- return replaceOperand(SI, 0, UndefValue::get(Val->getType()));
- return nullptr; // Do not modify these!
- }
-
- // store undef, Ptr -> noop
- if (isa<UndefValue>(Val))
- return eraseInstFromFunction(SI);
-
- return nullptr;
-}
-
-/// Try to transform:
-/// if () { *P = v1; } else { *P = v2 }
-/// or:
-/// *P = v1; if () { *P = v2; }
-/// into a phi node with a store in the successor.
+ Value *Val = SI.getOperand(0);
+ Value *Ptr = SI.getOperand(1);
+
+ // Try to canonicalize the stored type.
+ if (combineStoreToValueType(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ // Attempt to improve the alignment.
+ const Align KnownAlign = getOrEnforceKnownAlignment(
+ Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
+ if (KnownAlign > SI.getAlign())
+ SI.setAlignment(KnownAlign);
+
+ // Try to canonicalize the stored type.
+ if (unpackStoreToAggregate(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ // Replace GEP indices if possible.
+ if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
+ Worklist.push(NewGEPI);
+ return &SI;
+ }
+
+ // Don't hack volatile/ordered stores.
+ // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
+ if (!SI.isUnordered()) return nullptr;
+
+ // If the RHS is an alloca with a single use, zapify the store, making the
+ // alloca dead.
+ if (Ptr->hasOneUse()) {
+ if (isa<AllocaInst>(Ptr))
+ return eraseInstFromFunction(SI);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (isa<AllocaInst>(GEP->getOperand(0))) {
+ if (GEP->getOperand(0)->hasOneUse())
+ return eraseInstFromFunction(SI);
+ }
+ }
+ }
+
+ // If we have a store to a location which is known constant, we can conclude
+ // that the store must be storing the constant value (else the memory
+ // wouldn't be constant), and this must be a noop.
+ if (AA->pointsToConstantMemory(Ptr))
+ return eraseInstFromFunction(SI);
+
+ // Do really simple DSE, to catch cases where there are several consecutive
+ // stores to the same location, separated by a few arithmetic operations. This
+ // situation often occurs with bitfield accesses.
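+  //
+  // A minimal sketch (illustrative only):
+  //   store i32 %a, i32* %p
+  //   %b = or i32 %a, 255       ; bitfield-style update, no memory access
+  //   store i32 %b, i32* %p     ; the earlier store is dead and gets erased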
+ BasicBlock::iterator BBI(SI);
+ for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
+ --ScanInsts) {
+ --BBI;
+ // Don't count debug info directives, lest they affect codegen,
+ // and we skip pointer-to-pointer bitcasts, which are NOPs.
+ if (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+ ScanInsts++;
+ continue;
+ }
+
+ if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
+ // Prev store isn't volatile, and stores to the same location?
+ if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
+ SI.getOperand(1))) {
+ ++NumDeadStore;
+ // Manually add back the original store to the worklist now, so it will
+ // be processed after the operands of the removed store, as this may
+ // expose additional DSE opportunities.
+ Worklist.push(&SI);
+ eraseInstFromFunction(*PrevSI);
+ return nullptr;
+ }
+ break;
+ }
+
+ // If this is a load, we have to stop. However, if the loaded value is from
+ // the pointer we're loading and is producing the pointer we're storing,
+ // then *this* store is dead (X = load P; store X -> P).
+ if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
+ assert(SI.isUnordered() && "can't eliminate ordering operation");
+ return eraseInstFromFunction(SI);
+ }
+
+ // Otherwise, this is a load from some other location. Stores before it
+ // may not be dead.
+ break;
+ }
+
+ // Don't skip over loads, throws or things that can modify memory.
+ if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
+ break;
+ }
+
+ // store X, null -> turns into 'unreachable' in SimplifyCFG
+ // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
+ if (canSimplifyNullStoreOrGEP(SI)) {
+ if (!isa<UndefValue>(Val))
+ return replaceOperand(SI, 0, UndefValue::get(Val->getType()));
+ return nullptr; // Do not modify these!
+ }
+
+ // store undef, Ptr -> noop
+ if (isa<UndefValue>(Val))
+ return eraseInstFromFunction(SI);
+
+ return nullptr;
+}
+
+/// Try to transform:
+/// if () { *P = v1; } else { *P = v2 }
+/// or:
+/// *P = v1; if () { *P = v2; }
+/// into a phi node with a store in the successor.
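+/// The merged result looks roughly like (sketch, not taken from real output):
+///   %storemerge = phi i32 [ %v1, %ThenBB ], [ %v2, %ElseBB ]
+///   store i32 %storemerge, i32* %P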
bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
- if (!SI.isUnordered())
- return false; // This code has not been audited for volatile/ordered case.
-
- // Check if the successor block has exactly 2 incoming edges.
- BasicBlock *StoreBB = SI.getParent();
- BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
- if (!DestBB->hasNPredecessors(2))
- return false;
-
- // Capture the other block (the block that doesn't contain our store).
- pred_iterator PredIter = pred_begin(DestBB);
- if (*PredIter == StoreBB)
- ++PredIter;
- BasicBlock *OtherBB = *PredIter;
-
-  // Bail out if the relevant blocks aren't all distinct. This can happen,
- // for example, if SI is in an infinite loop.
- if (StoreBB == DestBB || OtherBB == DestBB)
- return false;
-
- // Verify that the other block ends in a branch and is not otherwise empty.
- BasicBlock::iterator BBI(OtherBB->getTerminator());
- BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
- if (!OtherBr || BBI == OtherBB->begin())
- return false;
-
- // If the other block ends in an unconditional branch, check for the 'if then
- // else' case. There is an instruction before the branch.
- StoreInst *OtherStore = nullptr;
- if (OtherBr->isUnconditional()) {
- --BBI;
- // Skip over debugging info.
- while (isa<DbgInfoIntrinsic>(BBI) ||
- (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
- if (BBI==OtherBB->begin())
- return false;
- --BBI;
- }
- // If this isn't a store, isn't a store to the same location, or is not the
- // right kind of store, bail out.
- OtherStore = dyn_cast<StoreInst>(BBI);
- if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
- !SI.isSameOperationAs(OtherStore))
- return false;
- } else {
- // Otherwise, the other block ended with a conditional branch. If one of the
- // destinations is StoreBB, then we have the if/then case.
- if (OtherBr->getSuccessor(0) != StoreBB &&
- OtherBr->getSuccessor(1) != StoreBB)
- return false;
-
- // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
- // if/then triangle. See if there is a store to the same ptr as SI that
- // lives in OtherBB.
- for (;; --BBI) {
- // Check to see if we find the matching store.
- if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
- if (OtherStore->getOperand(1) != SI.getOperand(1) ||
- !SI.isSameOperationAs(OtherStore))
- return false;
- break;
- }
- // If we find something that may be using or overwriting the stored
- // value, or if we run out of instructions, we can't do the transform.
- if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
- BBI->mayWriteToMemory() || BBI == OtherBB->begin())
- return false;
- }
-
- // In order to eliminate the store in OtherBr, we have to make sure nothing
- // reads or overwrites the stored value in StoreBB.
- for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
- // FIXME: This should really be AA driven.
- if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
- return false;
- }
- }
-
- // Insert a PHI node now if we need it.
- Value *MergedVal = OtherStore->getOperand(0);
- // The debug locations of the original instructions might differ. Merge them.
- DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
- OtherStore->getDebugLoc());
- if (MergedVal != SI.getOperand(0)) {
- PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
- PN->addIncoming(SI.getOperand(0), SI.getParent());
- PN->addIncoming(OtherStore->getOperand(0), OtherBB);
- MergedVal = InsertNewInstBefore(PN, DestBB->front());
- PN->setDebugLoc(MergedLoc);
- }
-
- // Advance to a place where it is safe to insert the new store and insert it.
- BBI = DestBB->getFirstInsertionPt();
- StoreInst *NewSI =
- new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
- SI.getOrdering(), SI.getSyncScopeID());
- InsertNewInstBefore(NewSI, *BBI);
- NewSI->setDebugLoc(MergedLoc);
-
- // If the two stores had AA tags, merge them.
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
- if (AATags) {
- OtherStore->getAAMetadata(AATags, /* Merge = */ true);
- NewSI->setAAMetadata(AATags);
- }
-
- // Nuke the old stores.
- eraseInstFromFunction(SI);
- eraseInstFromFunction(*OtherStore);
- return true;
-}
+ if (!SI.isUnordered())
+ return false; // This code has not been audited for volatile/ordered case.
+
+ // Check if the successor block has exactly 2 incoming edges.
+ BasicBlock *StoreBB = SI.getParent();
+ BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
+ if (!DestBB->hasNPredecessors(2))
+ return false;
+
+ // Capture the other block (the block that doesn't contain our store).
+ pred_iterator PredIter = pred_begin(DestBB);
+ if (*PredIter == StoreBB)
+ ++PredIter;
+ BasicBlock *OtherBB = *PredIter;
+
+  // Bail out if the relevant blocks aren't all distinct. This can happen,
+ // for example, if SI is in an infinite loop.
+ if (StoreBB == DestBB || OtherBB == DestBB)
+ return false;
+
+ // Verify that the other block ends in a branch and is not otherwise empty.
+ BasicBlock::iterator BBI(OtherBB->getTerminator());
+ BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
+ if (!OtherBr || BBI == OtherBB->begin())
+ return false;
+
+ // If the other block ends in an unconditional branch, check for the 'if then
+ // else' case. There is an instruction before the branch.
+ StoreInst *OtherStore = nullptr;
+ if (OtherBr->isUnconditional()) {
+ --BBI;
+ // Skip over debugging info.
+ while (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+ if (BBI==OtherBB->begin())
+ return false;
+ --BBI;
+ }
+ // If this isn't a store, isn't a store to the same location, or is not the
+ // right kind of store, bail out.
+ OtherStore = dyn_cast<StoreInst>(BBI);
+ if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
+ !SI.isSameOperationAs(OtherStore))
+ return false;
+ } else {
+ // Otherwise, the other block ended with a conditional branch. If one of the
+ // destinations is StoreBB, then we have the if/then case.
+ if (OtherBr->getSuccessor(0) != StoreBB &&
+ OtherBr->getSuccessor(1) != StoreBB)
+ return false;
+
+ // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
+ // if/then triangle. See if there is a store to the same ptr as SI that
+ // lives in OtherBB.
+ for (;; --BBI) {
+ // Check to see if we find the matching store.
+ if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
+ if (OtherStore->getOperand(1) != SI.getOperand(1) ||
+ !SI.isSameOperationAs(OtherStore))
+ return false;
+ break;
+ }
+ // If we find something that may be using or overwriting the stored
+ // value, or if we run out of instructions, we can't do the transform.
+ if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
+ BBI->mayWriteToMemory() || BBI == OtherBB->begin())
+ return false;
+ }
+
+ // In order to eliminate the store in OtherBr, we have to make sure nothing
+ // reads or overwrites the stored value in StoreBB.
+ for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
+ // FIXME: This should really be AA driven.
+ if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
+ return false;
+ }
+ }
+
+ // Insert a PHI node now if we need it.
+ Value *MergedVal = OtherStore->getOperand(0);
+ // The debug locations of the original instructions might differ. Merge them.
+ DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
+ OtherStore->getDebugLoc());
+ if (MergedVal != SI.getOperand(0)) {
+ PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
+ PN->addIncoming(SI.getOperand(0), SI.getParent());
+ PN->addIncoming(OtherStore->getOperand(0), OtherBB);
+ MergedVal = InsertNewInstBefore(PN, DestBB->front());
+ PN->setDebugLoc(MergedLoc);
+ }
+
+ // Advance to a place where it is safe to insert the new store and insert it.
+ BBI = DestBB->getFirstInsertionPt();
+ StoreInst *NewSI =
+ new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
+ SI.getOrdering(), SI.getSyncScopeID());
+ InsertNewInstBefore(NewSI, *BBI);
+ NewSI->setDebugLoc(MergedLoc);
+
+ // If the two stores had AA tags, merge them.
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ if (AATags) {
+ OtherStore->getAAMetadata(AATags, /* Merge = */ true);
+ NewSI->setAAMetadata(AATags);
+ }
+
+ // Nuke the old stores.
+ eraseInstFromFunction(SI);
+ eraseInstFromFunction(*OtherStore);
+ return true;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 7987d53b03..4b485a0ad8 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1,367 +1,367 @@
-//===- InstCombineMulDivRem.cpp -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
-// srem, urem, frem.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+//===- InstCombineMulDivRem.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
+// srem, urem, frem.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// The specific integer value is used in a context where it is known to be
-/// non-zero. If this allows us to simplify the computation, do so and return
-/// the new operand, otherwise return null.
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// The specific integer value is used in a context where it is known to be
+/// non-zero. If this allows us to simplify the computation, do so and return
+/// the new operand, otherwise return null.
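+///
+/// For instance (illustrative only), a divisor of the form ((1 << A) >>u B)
+/// that is known to be non-zero implies B < A, so it can be rebuilt as
+/// (1 << (A - B)).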
static Value *simplifyValueKnownNonZero(Value *V, InstCombinerImpl &IC,
- Instruction &CxtI) {
- // If V has multiple uses, then we would have to do more analysis to determine
- // if this is safe. For example, the use could be in dynamically unreached
- // code.
- if (!V->hasOneUse()) return nullptr;
-
- bool MadeChange = false;
-
- // ((1 << A) >>u B) --> (1 << (A-B))
- // Because V cannot be zero, we know that B is less than A.
- Value *A = nullptr, *B = nullptr, *One = nullptr;
- if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
- match(One, m_One())) {
- A = IC.Builder.CreateSub(A, B);
- return IC.Builder.CreateShl(One, A);
- }
-
- // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
- // inexact. Similarly for <<.
- BinaryOperator *I = dyn_cast<BinaryOperator>(V);
- if (I && I->isLogicalShift() &&
- IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) {
- // We know that this is an exact/nuw shift and that the input is a
- // non-zero context as well.
- if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
- IC.replaceOperand(*I, 0, V2);
- MadeChange = true;
- }
-
- if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
- I->setIsExact();
- MadeChange = true;
- }
-
- if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
- I->setHasNoUnsignedWrap();
- MadeChange = true;
- }
- }
-
- // TODO: Lots more we could do here:
- // If V is a phi node, we can call this on each of its operands.
- // "select cond, X, 0" can simplify to "X".
-
- return MadeChange ? V : nullptr;
-}
-
-// TODO: This is a specific form of a much more general pattern.
-// We could detect a select with any binop identity constant, or we
-// could use SimplifyBinOp to see if either arm of the select reduces.
-// But that needs to be done carefully and/or while removing potential
-// reverse canonicalizations as in InstCombiner::foldSelectIntoOp().
-static Value *foldMulSelectToNegate(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond, *OtherOp;
-
- // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp
- // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp
- if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())),
- m_Value(OtherOp))))
- return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp));
-
- // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp
- // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp
- if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())),
- m_Value(OtherOp))))
- return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp);
-
- // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp
- // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp
- if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0),
- m_SpecificFP(-1.0))),
- m_Value(OtherOp)))) {
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp));
- }
-
- // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp
- // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp
- if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0),
- m_SpecificFP(1.0))),
- m_Value(OtherOp)))) {
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp);
- }
-
- return nullptr;
-}
-
+ Instruction &CxtI) {
+ // If V has multiple uses, then we would have to do more analysis to determine
+ // if this is safe. For example, the use could be in dynamically unreached
+ // code.
+ if (!V->hasOneUse()) return nullptr;
+
+ bool MadeChange = false;
+
+ // ((1 << A) >>u B) --> (1 << (A-B))
+ // Because V cannot be zero, we know that B is less than A.
+ Value *A = nullptr, *B = nullptr, *One = nullptr;
+ if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
+ match(One, m_One())) {
+ A = IC.Builder.CreateSub(A, B);
+ return IC.Builder.CreateShl(One, A);
+ }
+
+ // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
+ // inexact. Similarly for <<.
+ BinaryOperator *I = dyn_cast<BinaryOperator>(V);
+ if (I && I->isLogicalShift() &&
+ IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) {
+ // We know that this is an exact/nuw shift and that the input is a
+ // non-zero context as well.
+ if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
+ IC.replaceOperand(*I, 0, V2);
+ MadeChange = true;
+ }
+
+ if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
+ I->setIsExact();
+ MadeChange = true;
+ }
+
+ if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
+ I->setHasNoUnsignedWrap();
+ MadeChange = true;
+ }
+ }
+
+ // TODO: Lots more we could do here:
+ // If V is a phi node, we can call this on each of its operands.
+ // "select cond, X, 0" can simplify to "X".
+
+ return MadeChange ? V : nullptr;
+}
+
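A minimal standalone sketch of the arithmetic behind the ((1 << A) >>u B) --> (1 << (A-B)) fold above, assuming 32-bit unsigned values (this is a plain check of the identity, not LLVM code): whenever the shifted value is non-zero, B cannot exceed A, and the result is exactly the single bit at position A - B.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 32; ++A) {
    for (uint32_t B = 0; B < 32; ++B) {
      uint32_t V = (uint32_t{1} << A) >> B;
      // The fold only fires when V is known non-zero; in that case B <= A
      // and the value is the single bit at position A - B.
      if (V != 0)
        assert(V == uint32_t{1} << (A - B));
    }
  }
  return 0;
}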
+// TODO: This is a specific form of a much more general pattern.
+// We could detect a select with any binop identity constant, or we
+// could use SimplifyBinOp to see if either arm of the select reduces.
+// But that needs to be done carefully and/or while removing potential
+// reverse canonicalizations as in InstCombiner::foldSelectIntoOp().
+static Value *foldMulSelectToNegate(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond, *OtherOp;
+
+ // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp
+ // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp
+ if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())),
+ m_Value(OtherOp))))
+ return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp));
+
+ // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp
+ // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp
+ if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())),
+ m_Value(OtherOp))))
+ return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp);
+
+ // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp
+ // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp
+ if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0),
+ m_SpecificFP(-1.0))),
+ m_Value(OtherOp)))) {
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp));
+ }
+
+ // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp
+ // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp
+ if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0),
+ m_SpecificFP(1.0))),
+ m_Value(OtherOp)))) {
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp);
+ }
+
+ return nullptr;
+}
+
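foldMulSelectToNegate rests on a simple identity: multiplying by a select of 1/-1 is the same as conditionally negating the other operand. A minimal standalone check of the integer form, on values small enough that negation cannot overflow:

#include <cassert>

int main() {
  for (int X = -100; X <= 100; ++X) {
    for (int Cond = 0; Cond <= 1; ++Cond) {
      // mul (select Cond, 1, -1), X --> select Cond, X, -X
      assert((Cond ? 1 : -1) * X == (Cond ? X : -X));
      // mul (select Cond, -1, 1), X --> select Cond, -X, X
      assert((Cond ? -1 : 1) * X == (Cond ? -X : X));
    }
  }
  return 0;
}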
Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
- if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
+ if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
unsigned BitWidth = I.getType()->getScalarSizeInBits();
- // X * -1 == 0 - X
- if (match(Op1, m_AllOnes())) {
- BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
- if (I.hasNoSignedWrap())
- BO->setHasNoSignedWrap();
- return BO;
- }
-
- // Also allow combining multiply instructions on vectors.
- {
- Value *NewOp;
- Constant *C1, *C2;
- const APInt *IVal;
- if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
- m_Constant(C1))) &&
- match(C1, m_APInt(IVal))) {
- // ((X << C2)*C1) == (X * (C1 << C2))
- Constant *Shl = ConstantExpr::getShl(C1, C2);
- BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0));
- BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl);
- if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap())
- BO->setHasNoUnsignedWrap();
- if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() &&
- Shl->isNotMinSignedValue())
- BO->setHasNoSignedWrap();
- return BO;
- }
-
- if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
- // Replace X*(2^C) with X << C, where C is either a scalar or a vector.
+ // X * -1 == 0 - X
+ if (match(Op1, m_AllOnes())) {
+ BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
+ if (I.hasNoSignedWrap())
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+
+ // Also allow combining multiply instructions on vectors.
+ {
+ Value *NewOp;
+ Constant *C1, *C2;
+ const APInt *IVal;
+ if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
+ m_Constant(C1))) &&
+ match(C1, m_APInt(IVal))) {
+ // ((X << C2)*C1) == (X * (C1 << C2))
+ Constant *Shl = ConstantExpr::getShl(C1, C2);
+ BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0));
+ BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl);
+ if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() &&
+ Shl->isNotMinSignedValue())
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+
+ if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
+ // Replace X*(2^C) with X << C, where C is either a scalar or a vector.
if (Constant *NewCst = ConstantExpr::getExactLogBase2(C1)) {
- BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
-
- if (I.hasNoUnsignedWrap())
- Shl->setHasNoUnsignedWrap();
- if (I.hasNoSignedWrap()) {
- const APInt *V;
- if (match(NewCst, m_APInt(V)) && *V != V->getBitWidth() - 1)
- Shl->setHasNoSignedWrap();
- }
-
- return Shl;
- }
- }
- }
-
+ BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
+
+ if (I.hasNoUnsignedWrap())
+ Shl->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap()) {
+ const APInt *V;
+ if (match(NewCst, m_APInt(V)) && *V != V->getBitWidth() - 1)
+ Shl->setHasNoSignedWrap();
+ }
+
+ return Shl;
+ }
+ }
+ }
+
if (Op0->hasOneUse() && match(Op1, m_NegatedPower2())) {
// Interpret X * (-1<<C) as (-X) * (1<<C) and try to sink the negation.
// The "* (1<<C)" thus becomes a potential shifting opportunity.
if (Value *NegOp0 = Negator::Negate(/*IsNegation*/ true, Op0, *this))
return BinaryOperator::CreateMul(
NegOp0, ConstantExpr::getNeg(cast<Constant>(Op1)), I.getName());
- }
-
- if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
- return FoldedMul;
-
- if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
- return replaceInstUsesWith(I, FoldedMul);
-
- // Simplify mul instructions with a constant RHS.
- if (isa<Constant>(Op1)) {
- // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
- Value *X;
- Constant *C1;
- if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
- Value *Mul = Builder.CreateMul(C1, Op1);
- // Only go forward with the transform if C1*CI simplifies to a tidier
- // constant.
- if (!match(Mul, m_Mul(m_Value(), m_Value())))
- return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
- }
- }
-
- // abs(X) * abs(X) -> X * X
- // nabs(X) * nabs(X) -> X * X
- if (Op0 == Op1) {
- Value *X, *Y;
- SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
- if (SPF == SPF_ABS || SPF == SPF_NABS)
- return BinaryOperator::CreateMul(X, X);
+ }
+
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
+
+ if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
+ return replaceInstUsesWith(I, FoldedMul);
+
+ // Simplify mul instructions with a constant RHS.
+ if (isa<Constant>(Op1)) {
+ // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
+ Value *X;
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
+ Value *Mul = Builder.CreateMul(C1, Op1);
+ // Only go forward with the transform if C1*CI simplifies to a tidier
+ // constant.
+ if (!match(Mul, m_Mul(m_Value(), m_Value())))
+ return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
+ }
+ }
+
+ // abs(X) * abs(X) -> X * X
+ // nabs(X) * nabs(X) -> X * X
+ if (Op0 == Op1) {
+ Value *X, *Y;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS)
+ return BinaryOperator::CreateMul(X, X);
if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
return BinaryOperator::CreateMul(X, X);
- }
-
- // -X * C --> X * -C
- Value *X, *Y;
- Constant *Op1C;
- if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
- return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));
-
- // -X * -Y --> X * Y
- if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) {
- auto *NewMul = BinaryOperator::CreateMul(X, Y);
- if (I.hasNoSignedWrap() &&
- cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() &&
- cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap())
- NewMul->setHasNoSignedWrap();
- return NewMul;
- }
-
- // -X * Y --> -(X * Y)
- // X * -Y --> -(X * Y)
- if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y))))
- return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y));
-
- // (X / Y) * Y = X - (X % Y)
- // (X / Y) * -Y = (X % Y) - X
- {
- Value *Y = Op1;
- BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
- if (!Div || (Div->getOpcode() != Instruction::UDiv &&
- Div->getOpcode() != Instruction::SDiv)) {
- Y = Op0;
- Div = dyn_cast<BinaryOperator>(Op1);
- }
- Value *Neg = dyn_castNegVal(Y);
- if (Div && Div->hasOneUse() &&
- (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
- (Div->getOpcode() == Instruction::UDiv ||
- Div->getOpcode() == Instruction::SDiv)) {
- Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
-
- // If the division is exact, X % Y is zero, so we end up with X or -X.
- if (Div->isExact()) {
- if (DivOp1 == Y)
- return replaceInstUsesWith(I, X);
- return BinaryOperator::CreateNeg(X);
- }
-
- auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
- : Instruction::SRem;
- Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1);
- if (DivOp1 == Y)
- return BinaryOperator::CreateSub(X, Rem);
- return BinaryOperator::CreateSub(Rem, X);
- }
- }
-
- /// i1 mul -> i1 and.
- if (I.getType()->isIntOrIntVectorTy(1))
- return BinaryOperator::CreateAnd(Op0, Op1);
-
- // X*(1 << Y) --> X << Y
- // (1 << Y)*X --> X << Y
- {
- Value *Y;
- BinaryOperator *BO = nullptr;
- bool ShlNSW = false;
- if (match(Op0, m_Shl(m_One(), m_Value(Y)))) {
- BO = BinaryOperator::CreateShl(Op1, Y);
- ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap();
- } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) {
- BO = BinaryOperator::CreateShl(Op0, Y);
- ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap();
- }
- if (BO) {
- if (I.hasNoUnsignedWrap())
- BO->setHasNoUnsignedWrap();
- if (I.hasNoSignedWrap() && ShlNSW)
- BO->setHasNoSignedWrap();
- return BO;
- }
- }
-
- // (zext bool X) * (zext bool Y) --> zext (and X, Y)
- // (sext bool X) * (sext bool Y) --> zext (and X, Y)
- // Note: -1 * -1 == 1 * 1 == 1 (if the extends match, the result is the same)
- if (((match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
- (match(Op0, m_SExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *And = Builder.CreateAnd(X, Y, "mulbool");
- return CastInst::Create(Instruction::ZExt, And, I.getType());
- }
- // (sext bool X) * (zext bool Y) --> sext (and X, Y)
- // (zext bool X) * (sext bool Y) --> sext (and X, Y)
- // Note: -1 * 1 == 1 * -1 == -1
- if (((match(Op0, m_SExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
- (match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *And = Builder.CreateAnd(X, Y, "mulbool");
- return CastInst::Create(Instruction::SExt, And, I.getType());
- }
-
- // (bool X) * Y --> X ? Y : 0
- // Y * (bool X) --> X ? Y : 0
- if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
- if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
-
- // (lshr X, 31) * Y --> (ashr X, 31) & Y
- // Y * (lshr X, 31) --> (ashr X, 31) & Y
- // TODO: We are not checking one-use because the elimination of the multiply
- // is better for analysis?
- // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be
- // more similar to what we're doing above.
- const APInt *C;
- if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
- return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1);
- if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
- return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0);
-
+ }
+
+ // -X * C --> X * -C
+ Value *X, *Y;
+ Constant *Op1C;
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));
+
+ // -X * -Y --> X * Y
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) {
+ auto *NewMul = BinaryOperator::CreateMul(X, Y);
+ if (I.hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap())
+ NewMul->setHasNoSignedWrap();
+ return NewMul;
+ }
+
+ // -X * Y --> -(X * Y)
+ // X * -Y --> -(X * Y)
+ if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y));
+
+ // (X / Y) * Y = X - (X % Y)
+ // (X / Y) * -Y = (X % Y) - X
+ {
+ Value *Y = Op1;
+ BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
+ if (!Div || (Div->getOpcode() != Instruction::UDiv &&
+ Div->getOpcode() != Instruction::SDiv)) {
+ Y = Op0;
+ Div = dyn_cast<BinaryOperator>(Op1);
+ }
+ Value *Neg = dyn_castNegVal(Y);
+ if (Div && Div->hasOneUse() &&
+ (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
+ (Div->getOpcode() == Instruction::UDiv ||
+ Div->getOpcode() == Instruction::SDiv)) {
+ Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
+
+ // If the division is exact, X % Y is zero, so we end up with X or -X.
+ if (Div->isExact()) {
+ if (DivOp1 == Y)
+ return replaceInstUsesWith(I, X);
+ return BinaryOperator::CreateNeg(X);
+ }
+
+ auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
+ : Instruction::SRem;
+ Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1);
+ if (DivOp1 == Y)
+ return BinaryOperator::CreateSub(X, Rem);
+ return BinaryOperator::CreateSub(Rem, X);
+ }
+ }
+
+ /// i1 mul -> i1 and.
+ if (I.getType()->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateAnd(Op0, Op1);
+
+ // X*(1 << Y) --> X << Y
+ // (1 << Y)*X --> X << Y
+ {
+ Value *Y;
+ BinaryOperator *BO = nullptr;
+ bool ShlNSW = false;
+ if (match(Op0, m_Shl(m_One(), m_Value(Y)))) {
+ BO = BinaryOperator::CreateShl(Op1, Y);
+ ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap();
+ } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) {
+ BO = BinaryOperator::CreateShl(Op0, Y);
+ ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap();
+ }
+ if (BO) {
+ if (I.hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap() && ShlNSW)
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+ }
+
+ // (zext bool X) * (zext bool Y) --> zext (and X, Y)
+ // (sext bool X) * (sext bool Y) --> zext (and X, Y)
+ // Note: -1 * -1 == 1 * 1 == 1 (if the extends match, the result is the same)
+ if (((match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
+ (match(Op0, m_SExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *And = Builder.CreateAnd(X, Y, "mulbool");
+ return CastInst::Create(Instruction::ZExt, And, I.getType());
+ }
+ // (sext bool X) * (zext bool Y) --> sext (and X, Y)
+ // (zext bool X) * (sext bool Y) --> sext (and X, Y)
+ // Note: -1 * 1 == 1 * -1 == -1
+ if (((match(Op0, m_SExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
+ (match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *And = Builder.CreateAnd(X, Y, "mulbool");
+ return CastInst::Create(Instruction::SExt, And, I.getType());
+ }
+
+ // (bool X) * Y --> X ? Y : 0
+ // Y * (bool X) --> X ? Y : 0
+ if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
+
+ // (lshr X, 31) * Y --> (ashr X, 31) & Y
+ // Y * (lshr X, 31) --> (ashr X, 31) & Y
+ // TODO: We are not checking one-use because the elimination of the multiply
+ // is better for analysis?
+ // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be
+ // more similar to what we're doing above.
+ const APInt *C;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1);
+ if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0);
+
// ((ashr X, 31) | 1) * X --> abs(X)
// X * ((ashr X, 31) | 1) --> abs(X)
if (match(&I, m_c_BinOp(m_Or(m_AShr(m_Value(X),
@@ -375,152 +375,152 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
return replaceInstUsesWith(I, Abs);
}
- if (Instruction *Ext = narrowMathIfNoOverflow(I))
- return Ext;
-
- bool Changed = false;
- if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoSignedWrap(true);
- }
-
- if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoUnsignedWrap(true);
- }
-
- return Changed ? &I : nullptr;
-}
-
+ if (Instruction *Ext = narrowMathIfNoOverflow(I))
+ return Ext;
+
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+}
+
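Two of the integer identities visitMul uses, checked directly in a standalone sketch (unsigned values, so there are no overflow concerns): the div/mul cancellation (X / Y) * Y == X - (X % Y), and the fact that an i1 multiply behaves like a logical AND.

#include <cassert>
#include <cstdint>

int main() {
  // (X / Y) * Y == X - (X % Y) for unsigned division.
  for (uint32_t X = 0; X < 64; ++X)
    for (uint32_t Y = 1; Y < 64; ++Y)
      assert((X / Y) * Y == X - (X % Y));

  // On 1-bit values, multiplication and AND coincide.
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B)
      assert((A * B) == (A & B));
  return 0;
}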
Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) {
- BinaryOperator::BinaryOps Opcode = I.getOpcode();
- assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) &&
- "Expected fmul or fdiv");
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *X, *Y;
-
- // -X * -Y --> X * Y
- // -X / -Y --> X / Y
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
- return BinaryOperator::CreateWithCopiedFlags(Opcode, X, Y, &I);
-
- // fabs(X) * fabs(X) -> X * X
- // fabs(X) / fabs(X) -> X / X
+ BinaryOperator::BinaryOps Opcode = I.getOpcode();
+ assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) &&
+ "Expected fmul or fdiv");
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X, *Y;
+
+ // -X * -Y --> X * Y
+ // -X / -Y --> X / Y
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateWithCopiedFlags(Opcode, X, Y, &I);
+
+ // fabs(X) * fabs(X) -> X * X
+ // fabs(X) / fabs(X) -> X / X
if (Op0 == Op1 && match(Op0, m_FAbs(m_Value(X))))
- return BinaryOperator::CreateWithCopiedFlags(Opcode, X, X, &I);
-
- // fabs(X) * fabs(Y) --> fabs(X * Y)
- // fabs(X) / fabs(Y) --> fabs(X / Y)
+ return BinaryOperator::CreateWithCopiedFlags(Opcode, X, X, &I);
+
+ // fabs(X) * fabs(Y) --> fabs(X * Y)
+ // fabs(X) / fabs(Y) --> fabs(X / Y)
if (match(Op0, m_FAbs(m_Value(X))) && match(Op1, m_FAbs(m_Value(Y))) &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- Value *XY = Builder.CreateBinOp(Opcode, X, Y);
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY);
- Fabs->takeName(&I);
- return replaceInstUsesWith(I, Fabs);
- }
-
- return nullptr;
-}
-
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ Value *XY = Builder.CreateBinOp(Opcode, X, Y);
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY);
+ Fabs->takeName(&I);
+ return replaceInstUsesWith(I, Fabs);
+ }
+
+ return nullptr;
+}
+
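The folds in foldFPSignBitOps only touch sign bits, so they are exact in IEEE-754 even without fast-math. A minimal standalone check on a handful of finite doubles:

#include <cassert>
#include <cmath>

int main() {
  const double Vals[] = {-3.5, -1.0, -0.0, 0.0, 0.25, 2.0, 1e10};
  for (double X : Vals) {
    for (double Y : Vals) {
      // -X * -Y --> X * Y
      assert(-X * -Y == X * Y);
      // fabs(X) * fabs(Y) --> fabs(X * Y)
      assert(std::fabs(X) * std::fabs(Y) == std::fabs(X * Y));
    }
  }
  return 0;
}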
Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
- if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
- return FoldedMul;
-
- if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
- return replaceInstUsesWith(I, FoldedMul);
-
- if (Instruction *R = foldFPSignBitOps(I))
- return R;
-
- // X * -1.0 --> -X
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (match(Op1, m_SpecificFP(-1.0)))
- return UnaryOperator::CreateFNegFMF(Op0, &I);
-
- // -X * C --> X * -C
- Value *X, *Y;
- Constant *C;
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
- return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
-
- // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
- return replaceInstUsesWith(I, V);
-
- if (I.hasAllowReassoc()) {
- // Reassociate constant RHS with another constant to form constant
- // expression.
- if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
- Constant *C1;
- if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
- // (C1 / X) * C --> (C * C1) / X
- Constant *CC1 = ConstantExpr::getFMul(C, C1);
- if (CC1->isNormalFP())
- return BinaryOperator::CreateFDivFMF(CC1, X, &I);
- }
- if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
- // (X / C1) * C --> X * (C / C1)
- Constant *CDivC1 = ConstantExpr::getFDiv(C, C1);
- if (CDivC1->isNormalFP())
- return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
-
- // If the constant was a denormal, try reassociating differently.
- // (X / C1) * C --> X / (C1 / C)
- Constant *C1DivC = ConstantExpr::getFDiv(C1, C);
- if (Op0->hasOneUse() && C1DivC->isNormalFP())
- return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
- }
-
- // We do not need to match 'fadd C, X' and 'fsub X, C' because they are
- // canonicalized to 'fadd X, C'. Distributing the multiply may allow
- // further folds and (X * C) + C2 is 'fma'.
- if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) {
- // (X + C1) * C --> (X * C) + (C * C1)
- Constant *CC1 = ConstantExpr::getFMul(C, C1);
- Value *XC = Builder.CreateFMulFMF(X, C, &I);
- return BinaryOperator::CreateFAddFMF(XC, CC1, &I);
- }
- if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) {
- // (C1 - X) * C --> (C * C1) - (X * C)
- Constant *CC1 = ConstantExpr::getFMul(C, C1);
- Value *XC = Builder.CreateFMulFMF(X, C, &I);
- return BinaryOperator::CreateFSubFMF(CC1, XC, &I);
- }
- }
-
- Value *Z;
- if (match(&I, m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))),
- m_Value(Z)))) {
- // Sink division: (X / Y) * Z --> (X * Z) / Y
- Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I);
- return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I);
- }
-
- // sqrt(X) * sqrt(Y) -> sqrt(X * Y)
- // nnan disallows the possibility of returning a number if both operands are
- // negative (in that case, we should return NaN).
- if (I.hasNoNaNs() &&
- match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) &&
- match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
- Value *XY = Builder.CreateFMulFMF(X, Y, &I);
- Value *Sqrt = Builder.CreateUnaryIntrinsic(Intrinsic::sqrt, XY, &I);
- return replaceInstUsesWith(I, Sqrt);
- }
-
+ if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
+
+ if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
+ return replaceInstUsesWith(I, FoldedMul);
+
+ if (Instruction *R = foldFPSignBitOps(I))
+ return R;
+
+ // X * -1.0 --> -X
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (match(Op1, m_SpecificFP(-1.0)))
+ return UnaryOperator::CreateFNegFMF(Op0, &I);
+
+ // -X * C --> X * -C
+ Value *X, *Y;
+ Constant *C;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
+ return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
+
+ // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
+ return replaceInstUsesWith(I, V);
+
+ if (I.hasAllowReassoc()) {
+ // Reassociate constant RHS with another constant to form constant
+ // expression.
+ if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
+ // (C1 / X) * C --> (C * C1) / X
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ if (CC1->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(CC1, X, &I);
+ }
+ if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
+ // (X / C1) * C --> X * (C / C1)
+ Constant *CDivC1 = ConstantExpr::getFDiv(C, C1);
+ if (CDivC1->isNormalFP())
+ return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
+
+ // If the constant was a denormal, try reassociating differently.
+ // (X / C1) * C --> X / (C1 / C)
+ Constant *C1DivC = ConstantExpr::getFDiv(C1, C);
+ if (Op0->hasOneUse() && C1DivC->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
+ }
+
+ // We do not need to match 'fadd C, X' and 'fsub X, C' because they are
+ // canonicalized to 'fadd X, C'. Distributing the multiply may allow
+ // further folds and (X * C) + C2 is 'fma'.
+ if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) {
+ // (X + C1) * C --> (X * C) + (C * C1)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFAddFMF(XC, CC1, &I);
+ }
+ if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) {
+ // (C1 - X) * C --> (C * C1) - (X * C)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFSubFMF(CC1, XC, &I);
+ }
+ }
+
+ Value *Z;
+ if (match(&I, m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))),
+ m_Value(Z)))) {
+ // Sink division: (X / Y) * Z --> (X * Z) / Y
+ Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I);
+ return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I);
+ }
+
+ // sqrt(X) * sqrt(Y) -> sqrt(X * Y)
+ // nnan disallows the possibility of returning a number if both operands are
+ // negative (in that case, we should return NaN).
+ if (I.hasNoNaNs() &&
+ match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) &&
+ match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
+ Value *XY = Builder.CreateFMulFMF(X, Y, &I);
+ Value *Sqrt = Builder.CreateUnaryIntrinsic(Intrinsic::sqrt, XY, &I);
+ return replaceInstUsesWith(I, Sqrt);
+ }
+
// The following transforms are done irrespective of the number of uses
// for the expression "1.0/sqrt(X)".
// 1) 1.0/sqrt(X) * X -> X/sqrt(X)
@@ -536,588 +536,588 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
match(Y, m_Intrinsic<Intrinsic::sqrt>(m_Value(X))) && Op0 == X)
return BinaryOperator::CreateFDivFMF(X, Y, &I);
- // Like the similar transform in instsimplify, this requires 'nsz' because
- // sqrt(-0.0) = -0.0, and -0.0 * -0.0 does not simplify to -0.0.
- if (I.hasNoNaNs() && I.hasNoSignedZeros() && Op0 == Op1 &&
- Op0->hasNUses(2)) {
- // Peek through fdiv to find squaring of square root:
- // (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y
- if (match(Op0, m_FDiv(m_Value(X),
- m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
- Value *XX = Builder.CreateFMulFMF(X, X, &I);
- return BinaryOperator::CreateFDivFMF(XX, Y, &I);
- }
- // (sqrt(Y) / X) * (sqrt(Y) / X) --> Y / (X * X)
- if (match(Op0, m_FDiv(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y)),
- m_Value(X)))) {
- Value *XX = Builder.CreateFMulFMF(X, X, &I);
- return BinaryOperator::CreateFDivFMF(Y, XX, &I);
- }
- }
-
- // exp(X) * exp(Y) -> exp(X + Y)
- // Match as long as at least one of exp has only one use.
- if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y))) &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *XY = Builder.CreateFAddFMF(X, Y, &I);
- Value *Exp = Builder.CreateUnaryIntrinsic(Intrinsic::exp, XY, &I);
- return replaceInstUsesWith(I, Exp);
- }
-
- // exp2(X) * exp2(Y) -> exp2(X + Y)
- // Match as long as at least one of exp2 has only one use.
- if (match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::exp2>(m_Value(Y))) &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *XY = Builder.CreateFAddFMF(X, Y, &I);
- Value *Exp2 = Builder.CreateUnaryIntrinsic(Intrinsic::exp2, XY, &I);
- return replaceInstUsesWith(I, Exp2);
- }
-
- // (X*Y) * X => (X*X) * Y where Y != X
- // The purpose is two-fold:
- // 1) to form a power expression (of X).
- // 2) potentially shorten the critical path: After transformation, the
- // latency of the instruction Y is amortized by the expression of X*X,
- // and therefore Y is in a "less critical" position compared to what it
- // was before the transformation.
- if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) &&
- Op1 != Y) {
- Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I);
- return BinaryOperator::CreateFMulFMF(XX, Y, &I);
- }
- if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) &&
- Op0 != Y) {
- Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I);
- return BinaryOperator::CreateFMulFMF(XX, Y, &I);
- }
- }
-
- // log2(X * 0.5) * Y = log2(X) * Y - Y
- if (I.isFast()) {
- IntrinsicInst *Log2 = nullptr;
- if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>(
- m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
- Log2 = cast<IntrinsicInst>(Op0);
- Y = Op1;
- }
- if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>(
- m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
- Log2 = cast<IntrinsicInst>(Op1);
- Y = Op0;
- }
- if (Log2) {
- Value *Log2 = Builder.CreateUnaryIntrinsic(Intrinsic::log2, X, &I);
- Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I);
- return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
- }
- }
-
- return nullptr;
-}
-
-/// Fold a divide or remainder with a select instruction divisor when one of the
-/// select operands is zero. In that case, we can use the other select operand
-/// because div/rem by zero is undefined.
+ // Like the similar transform in instsimplify, this requires 'nsz' because
+ // sqrt(-0.0) = -0.0, and -0.0 * -0.0 does not simplify to -0.0.
+ if (I.hasNoNaNs() && I.hasNoSignedZeros() && Op0 == Op1 &&
+ Op0->hasNUses(2)) {
+ // Peek through fdiv to find squaring of square root:
+ // (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y
+ if (match(Op0, m_FDiv(m_Value(X),
+ m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
+ Value *XX = Builder.CreateFMulFMF(X, X, &I);
+ return BinaryOperator::CreateFDivFMF(XX, Y, &I);
+ }
+ // (sqrt(Y) / X) * (sqrt(Y) / X) --> Y / (X * X)
+ if (match(Op0, m_FDiv(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y)),
+ m_Value(X)))) {
+ Value *XX = Builder.CreateFMulFMF(X, X, &I);
+ return BinaryOperator::CreateFDivFMF(Y, XX, &I);
+ }
+ }
+
+ // exp(X) * exp(Y) -> exp(X + Y)
+ // Match as long as at least one of exp has only one use.
+ if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *XY = Builder.CreateFAddFMF(X, Y, &I);
+ Value *Exp = Builder.CreateUnaryIntrinsic(Intrinsic::exp, XY, &I);
+ return replaceInstUsesWith(I, Exp);
+ }
+
+ // exp2(X) * exp2(Y) -> exp2(X + Y)
+ // Match as long as at least one of exp2 has only one use.
+ if (match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::exp2>(m_Value(Y))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *XY = Builder.CreateFAddFMF(X, Y, &I);
+ Value *Exp2 = Builder.CreateUnaryIntrinsic(Intrinsic::exp2, XY, &I);
+ return replaceInstUsesWith(I, Exp2);
+ }
+
+ // (X*Y) * X => (X*X) * Y where Y != X
+ // The purpose is two-fold:
+ // 1) to form a power expression (of X).
+ // 2) potentially shorten the critical path: After transformation, the
+ // latency of the instruction Y is amortized by the expression of X*X,
+ // and therefore Y is in a "less critical" position compared to what it
+ // was before the transformation.
+ if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) &&
+ Op1 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
+ }
+ if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) &&
+ Op0 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
+ }
+ }
+
+ // log2(X * 0.5) * Y = log2(X) * Y - Y
+ if (I.isFast()) {
+ IntrinsicInst *Log2 = nullptr;
+ if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op0);
+ Y = Op1;
+ }
+ if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op1);
+ Y = Op0;
+ }
+ if (Log2) {
+ Value *Log2 = Builder.CreateUnaryIntrinsic(Intrinsic::log2, X, &I);
+ Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I);
+ return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
+ }
+ }
+
+ return nullptr;
+}
+
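The exp/exp2 and sqrt folds in visitFMul are gated on fast-math flags precisely because they can change rounding, so a standalone check has to compare approximately rather than bit-for-bit. A sketch with an ad-hoc relative tolerance (the helper name and tolerance are ours, chosen only for illustration):

#include <cassert>
#include <cmath>

static bool roughlyEqual(double A, double B) {
  return std::fabs(A - B) <= 1e-12 * std::fmax(std::fabs(A), std::fabs(B));
}

int main() {
  const double Vals[] = {0.5, 1.0, 2.25, 3.0, 10.0};
  for (double X : Vals) {
    for (double Y : Vals) {
      // exp(X) * exp(Y) ~= exp(X + Y)
      assert(roughlyEqual(std::exp(X) * std::exp(Y), std::exp(X + Y)));
      // sqrt(X) * sqrt(Y) ~= sqrt(X * Y) for non-negative operands
      assert(roughlyEqual(std::sqrt(X) * std::sqrt(Y), std::sqrt(X * Y)));
    }
  }
  return 0;
}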
+/// Fold a divide or remainder with a select instruction divisor when one of the
+/// select operands is zero. In that case, we can use the other select operand
+/// because div/rem by zero is undefined.
bool InstCombinerImpl::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
- SelectInst *SI = dyn_cast<SelectInst>(I.getOperand(1));
- if (!SI)
- return false;
-
- int NonNullOperand;
- if (match(SI->getTrueValue(), m_Zero()))
- // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
- NonNullOperand = 2;
- else if (match(SI->getFalseValue(), m_Zero()))
- // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
- NonNullOperand = 1;
- else
- return false;
-
- // Change the div/rem to use 'Y' instead of the select.
- replaceOperand(I, 1, SI->getOperand(NonNullOperand));
-
- // Okay, we know we replace the operand of the div/rem with 'Y' with no
- // problem. However, the select, or the condition of the select may have
- // multiple uses. Based on our knowledge that the operand must be non-zero,
- // propagate the known value for the select into other uses of it, and
- // propagate a known value of the condition into its other users.
-
-  // If the select and condition only have a single use, don't bother with
-  // this; exit early.
- Value *SelectCond = SI->getCondition();
- if (SI->use_empty() && SelectCond->hasOneUse())
- return true;
-
- // Scan the current block backward, looking for other uses of SI.
- BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin();
- Type *CondTy = SelectCond->getType();
- while (BBI != BBFront) {
- --BBI;
-    // If we find an instruction that we can't assume will return, then
-    // information from below it cannot be propagated above it.
- if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
- break;
-
- // Replace uses of the select or its condition with the known values.
- for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
- I != E; ++I) {
- if (*I == SI) {
- replaceUse(*I, SI->getOperand(NonNullOperand));
- Worklist.push(&*BBI);
- } else if (*I == SelectCond) {
- replaceUse(*I, NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
- : ConstantInt::getFalse(CondTy));
- Worklist.push(&*BBI);
- }
- }
-
-    // If we are past the instruction, quit looking for it.
- if (&*BBI == SI)
- SI = nullptr;
- if (&*BBI == SelectCond)
- SelectCond = nullptr;
-
- // If we ran out of things to eliminate, break out of the loop.
- if (!SelectCond && !SI)
- break;
-
- }
- return true;
-}
-
-/// True if the multiply cannot be expressed in an integer of this size.
-static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
- bool IsSigned) {
- bool Overflow;
- Product = IsSigned ? C1.smul_ov(C2, Overflow) : C1.umul_ov(C2, Overflow);
- return Overflow;
-}
-
-/// True if C1 is a multiple of C2. Quotient contains C1/C2.
-static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
- bool IsSigned) {
- assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
-
- // Bail if we will divide by zero.
- if (C2.isNullValue())
- return false;
-
- // Bail if we would divide INT_MIN by -1.
- if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
- return false;
-
- APInt Remainder(C1.getBitWidth(), /*val=*/0ULL, IsSigned);
- if (IsSigned)
- APInt::sdivrem(C1, C2, Quotient, Remainder);
- else
- APInt::udivrem(C1, C2, Quotient, Remainder);
-
- return Remainder.isMinValue();
-}
-
-/// This function implements the transforms common to both integer division
-/// instructions (udiv and sdiv). It is called by the visitors to those
-/// integer division instructions.
+ SelectInst *SI = dyn_cast<SelectInst>(I.getOperand(1));
+ if (!SI)
+ return false;
+
+ int NonNullOperand;
+ if (match(SI->getTrueValue(), m_Zero()))
+ // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+ NonNullOperand = 2;
+ else if (match(SI->getFalseValue(), m_Zero()))
+ // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+ NonNullOperand = 1;
+ else
+ return false;
+
+ // Change the div/rem to use 'Y' instead of the select.
+ replaceOperand(I, 1, SI->getOperand(NonNullOperand));
+
+ // Okay, we know we replace the operand of the div/rem with 'Y' with no
+ // problem. However, the select, or the condition of the select may have
+ // multiple uses. Based on our knowledge that the operand must be non-zero,
+ // propagate the known value for the select into other uses of it, and
+ // propagate a known value of the condition into its other users.
+
+  // If the select and condition only have a single use, don't bother with
+  // this; exit early.
+ Value *SelectCond = SI->getCondition();
+ if (SI->use_empty() && SelectCond->hasOneUse())
+ return true;
+
+ // Scan the current block backward, looking for other uses of SI.
+ BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin();
+ Type *CondTy = SelectCond->getType();
+ while (BBI != BBFront) {
+ --BBI;
+    // If we find an instruction that we can't assume will return, then
+    // information from below it cannot be propagated above it.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
+ break;
+
+ // Replace uses of the select or its condition with the known values.
+ for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+ I != E; ++I) {
+ if (*I == SI) {
+ replaceUse(*I, SI->getOperand(NonNullOperand));
+ Worklist.push(&*BBI);
+ } else if (*I == SelectCond) {
+ replaceUse(*I, NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
+ : ConstantInt::getFalse(CondTy));
+ Worklist.push(&*BBI);
+ }
+ }
+
+    // If we are past the instruction, quit looking for it.
+ if (&*BBI == SI)
+ SI = nullptr;
+ if (&*BBI == SelectCond)
+ SelectCond = nullptr;
+
+ // If we ran out of things to eliminate, break out of the loop.
+ if (!SelectCond && !SI)
+ break;
+
+ }
+ return true;
+}
+
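The reasoning above is that a divisor of the form (Cond ? 0 : Y) can only feed a well-defined division when Cond is false, so the div/rem may assume the divisor is Y. A minimal standalone illustration (the values are arbitrary):

#include <cassert>

int main() {
  int X = 42, Y = 5;
  // Cond must be false here, or the divisions below would divide by zero.
  bool Cond = false;
  // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
  assert(X / (Cond ? 0 : Y) == X / Y);
  assert(X % (Cond ? 0 : Y) == X % Y);
  return 0;
}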
+/// True if the multiply cannot be expressed in an integer of this size.
+static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
+ bool IsSigned) {
+ bool Overflow;
+ Product = IsSigned ? C1.smul_ov(C2, Overflow) : C1.umul_ov(C2, Overflow);
+ return Overflow;
+}
+
+/// True if C1 is a multiple of C2. Quotient contains C1/C2.
+static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
+ bool IsSigned) {
+ assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
+
+ // Bail if we will divide by zero.
+ if (C2.isNullValue())
+ return false;
+
+ // Bail if we would divide INT_MIN by -1.
+ if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
+ return false;
+
+ APInt Remainder(C1.getBitWidth(), /*val=*/0ULL, IsSigned);
+ if (IsSigned)
+ APInt::sdivrem(C1, C2, Quotient, Remainder);
+ else
+ APInt::udivrem(C1, C2, Quotient, Remainder);
+
+ return Remainder.isMinValue();
+}
+
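Plain 32-bit analogues of the two helpers above, written without APInt so they can be compiled and tested standalone (the names and the widening trick are ours):

#include <cassert>
#include <cstdint>

// Returns true if C1 * C2 does not fit in 32 bits; Product gets the low bits.
static bool mulOverflows32(uint32_t C1, uint32_t C2, uint32_t &Product) {
  uint64_t Wide = uint64_t(C1) * uint64_t(C2);
  Product = uint32_t(Wide);
  return Wide > UINT32_MAX;
}

// Returns true if C1 is an exact multiple of C2; Quotient gets C1 / C2.
static bool isMultiple32(uint32_t C1, uint32_t C2, uint32_t &Quotient) {
  if (C2 == 0)
    return false;
  Quotient = C1 / C2;
  return C1 % C2 == 0;
}

int main() {
  uint32_t P, Q;
  assert(!mulOverflows32(1000, 1000, P) && P == 1000000);
  assert(mulOverflows32(1u << 31, 2, P)); // 2^32 does not fit in 32 bits
  assert(isMultiple32(12, 4, Q) && Q == 3);
  assert(!isMultiple32(10, 4, Q));
  return 0;
}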
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those
+/// integer division instructions.
Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- bool IsSigned = I.getOpcode() == Instruction::SDiv;
- Type *Ty = I.getType();
-
- // The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
- return replaceOperand(I, 1, V);
-
- // Handle cases involving: [su]div X, (select Cond, Y, Z)
- // This does not apply for fdiv.
- if (simplifyDivRemOfSelectWithZeroOp(I))
- return &I;
-
- const APInt *C2;
- if (match(Op1, m_APInt(C2))) {
- Value *X;
- const APInt *C1;
-
- // (X / C1) / C2 -> X / (C1*C2)
- if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) {
- APInt Product(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
- if (!multiplyOverflows(*C1, *C2, Product, IsSigned))
- return BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, Product));
- }
-
- if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
-
- // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
- if (isMultiple(*C2, *C1, Quotient, IsSigned)) {
- auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, Quotient));
- NewDiv->setIsExact(I.isExact());
- return NewDiv;
- }
-
- // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
- if (isMultiple(*C1, *C2, Quotient, IsSigned)) {
- auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
- ConstantInt::get(Ty, Quotient));
- auto *OBO = cast<OverflowingBinaryOperator>(Op0);
- Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
- Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
- return Mul;
- }
- }
-
- if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
- *C1 != C1->getBitWidth() - 1) ||
- (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
- APInt C1Shifted = APInt::getOneBitSet(
- C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
-
- // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
- if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
- auto *BO = BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, Quotient));
- BO->setIsExact(I.isExact());
- return BO;
- }
-
- // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2.
- if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
- auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
- ConstantInt::get(Ty, Quotient));
- auto *OBO = cast<OverflowingBinaryOperator>(Op0);
- Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
- Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
- return Mul;
- }
- }
-
- if (!C2->isNullValue()) // avoid X udiv 0
- if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
- return FoldedDiv;
- }
-
- if (match(Op0, m_One())) {
- assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?");
- if (IsSigned) {
- // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
- // result is one, if Op1 is -1 then the result is minus one, otherwise
- // it's zero.
- Value *Inc = Builder.CreateAdd(Op1, Op0);
- Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3));
- return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0));
- } else {
- // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
- // result is one, otherwise it's zero.
- return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty);
- }
- }
-
- // See if we can fold away this div instruction.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
- Value *X, *Z;
- if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1
- if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
- (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
- return BinaryOperator::Create(I.getOpcode(), X, Op1);
-
- // (X << Y) / X -> 1 << Y
- Value *Y;
- if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y))))
- return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y);
- if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y))))
- return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y);
-
- // X / (X * Y) -> 1 / Y if the multiplication does not overflow.
- if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) {
- bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
- bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
- if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
- replaceOperand(I, 0, ConstantInt::get(Ty, 1));
- replaceOperand(I, 1, Y);
- return &I;
- }
- }
-
- return nullptr;
-}
-
-static const unsigned MaxDepth = 6;
-
-namespace {
-
-using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
- const BinaryOperator &I,
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ bool IsSigned = I.getOpcode() == Instruction::SDiv;
+ Type *Ty = I.getType();
+
+ // The RHS is known non-zero.
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
+ return replaceOperand(I, 1, V);
+
+ // Handle cases involving: [su]div X, (select Cond, Y, Z)
+ // This does not apply for fdiv.
+ if (simplifyDivRemOfSelectWithZeroOp(I))
+ return &I;
+
+ const APInt *C2;
+ if (match(Op1, m_APInt(C2))) {
+ Value *X;
+ const APInt *C1;
+
+ // (X / C1) / C2 -> X / (C1*C2)
+ if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) {
+ APInt Product(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
+ if (!multiplyOverflows(*C1, *C2, Product, IsSigned))
+ return BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Product));
+ }
+
+ if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
+
+ // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
+ if (isMultiple(*C2, *C1, Quotient, IsSigned)) {
+ auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ NewDiv->setIsExact(I.isExact());
+ return NewDiv;
+ }
+
+ // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
+ if (isMultiple(*C1, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
+ }
+
+ if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
+ *C1 != C1->getBitWidth() - 1) ||
+ (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
+ APInt C1Shifted = APInt::getOneBitSet(
+ C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
+
+ // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
+ if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
+ auto *BO = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+
+ // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2.
+ if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
+ }
+
+ if (!C2->isNullValue()) // avoid X udiv 0
+ if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
+ return FoldedDiv;
+ }
+
+ if (match(Op0, m_One())) {
+ assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?");
+ if (IsSigned) {
+ // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
+ // result is one, if Op1 is -1 then the result is minus one, otherwise
+ // it's zero.
+ Value *Inc = Builder.CreateAdd(Op1, Op0);
+ Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3));
+ return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0));
+ } else {
+ // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
+ // result is one, otherwise it's zero.
+ return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty);
+ }
+ }
+
+ // See if we can fold away this div instruction.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
+ Value *X, *Z;
+ if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1
+ if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
+ (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
+ return BinaryOperator::Create(I.getOpcode(), X, Op1);
+
+ // (X << Y) / X -> 1 << Y
+ Value *Y;
+ if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y);
+ if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y);
+
+ // X / (X * Y) -> 1 / Y if the multiplication does not overflow.
+ if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) {
+ bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
+ bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
+ if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
+ replaceOperand(I, 0, ConstantInt::get(Ty, 1));
+ replaceOperand(I, 1, Y);
+ return &I;
+ }
+ }
+
+ return nullptr;
+}
+
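Two of the constant-folding identities from commonIDivTransforms, checked in a standalone sketch on unsigned values small enough that no intermediate product wraps:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 1000; ++X) {
    // (X / C1) / C2 --> X / (C1 * C2), here with C1 = 3 and C2 = 5.
    assert((X / 3u) / 5u == X / 15u);
    // (X * C1) / C2 --> X * (C1 / C2) when C1 is a multiple of C2,
    // here with C1 = 12 and C2 = 4.
    assert((X * 12u) / 4u == X * 3u);
  }
  return 0;
}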
+static const unsigned MaxDepth = 6;
+
+namespace {
+
+using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
+ const BinaryOperator &I,
InstCombinerImpl &IC);
-
-/// Used to maintain state for visitUDivOperand().
-struct UDivFoldAction {
- /// Informs visitUDiv() how to fold this operand. This can be zero if this
- /// action joins two actions together.
- FoldUDivOperandCb FoldAction;
-
- /// Which operand to fold.
- Value *OperandToFold;
-
- union {
- /// The instruction returned when FoldAction is invoked.
- Instruction *FoldResult;
-
- /// Stores the LHS action index if this action joins two actions together.
- size_t SelectLHSIdx;
- };
-
- UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
- : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {}
- UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
- : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
-};
-
-} // end anonymous namespace
-
-// X udiv 2^C -> X >> C
-static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
+
+/// Used to maintain state for visitUDivOperand().
+struct UDivFoldAction {
+ /// Informs visitUDiv() how to fold this operand. This can be zero if this
+ /// action joins two actions together.
+ FoldUDivOperandCb FoldAction;
+
+ /// Which operand to fold.
+ Value *OperandToFold;
+
+ union {
+ /// The instruction returned when FoldAction is invoked.
+ Instruction *FoldResult;
+
+ /// Stores the LHS action index if this action joins two actions together.
+ size_t SelectLHSIdx;
+ };
+
+ UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
+ : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {}
+ UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
+ : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
+};
+
+} // end anonymous namespace
+
+// X udiv 2^C -> X >> C
+static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
const BinaryOperator &I,
InstCombinerImpl &IC) {
Constant *C1 = ConstantExpr::getExactLogBase2(cast<Constant>(Op1));
- if (!C1)
- llvm_unreachable("Failed to constant fold udiv -> logbase2");
- BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
- if (I.isExact())
- LShr->setIsExact();
- return LShr;
-}
-
-// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
-// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
-static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
+ if (!C1)
+ llvm_unreachable("Failed to constant fold udiv -> logbase2");
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
+ if (I.isExact())
+ LShr->setIsExact();
+ return LShr;
+}
+
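A small standalone check (assumed example, arbitrary test values) of the arithmetic behind foldUDivPow2Cst and foldUDivShl above: an unsigned divide by 2^c, or by a power of two shifted further left by n, is a right shift by c or by n + c.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 5u, 1000u, 0xFFFFFFFFu}) {
    assert(x / 16u == (x >> 4));               // X udiv 2^4  -->  X >> 4
    for (uint32_t n = 0; n <= 7; ++n)
      assert(x / (8u << n) == (x >> (n + 3))); // X udiv (8 << N)  -->  X >> (N + 3)
  }
  return 0;
}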
+// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
+static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
InstCombinerImpl &IC) {
- Value *ShiftLeft;
- if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
- ShiftLeft = Op1;
-
- Constant *CI;
- Value *N;
- if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
- llvm_unreachable("match should never fail here!");
+ Value *ShiftLeft;
+ if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
+ ShiftLeft = Op1;
+
+ Constant *CI;
+ Value *N;
+ if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
+ llvm_unreachable("match should never fail here!");
Constant *Log2Base = ConstantExpr::getExactLogBase2(CI);
- if (!Log2Base)
- llvm_unreachable("getLogBase2 should never fail here!");
- N = IC.Builder.CreateAdd(N, Log2Base);
- if (Op1 != ShiftLeft)
- N = IC.Builder.CreateZExt(N, Op1->getType());
- BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
- if (I.isExact())
- LShr->setIsExact();
- return LShr;
-}
-
-// Recursively visits the possible right hand operands of a udiv
-// instruction, seeing through select instructions, to determine if we can
-// replace the udiv with something simpler. If we find that an operand is not
-// able to simplify the udiv, we abort the entire transformation.
-static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
- SmallVectorImpl<UDivFoldAction> &Actions,
- unsigned Depth = 0) {
+ if (!Log2Base)
+ llvm_unreachable("getLogBase2 should never fail here!");
+ N = IC.Builder.CreateAdd(N, Log2Base);
+ if (Op1 != ShiftLeft)
+ N = IC.Builder.CreateZExt(N, Op1->getType());
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
+ if (I.isExact())
+ LShr->setIsExact();
+ return LShr;
+}
+
+// Recursively visits the possible right hand operands of a udiv
+// instruction, seeing through select instructions, to determine if we can
+// replace the udiv with something simpler. If we find that an operand is not
+// able to simplify the udiv, we abort the entire transformation.
+static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
+ SmallVectorImpl<UDivFoldAction> &Actions,
+ unsigned Depth = 0) {
// FIXME: assert that Op1 isn't/doesn't contain undef.
- // Check to see if this is an unsigned division with an exact power of 2,
- // if so, convert to a right shift.
- if (match(Op1, m_Power2())) {
- Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
- return Actions.size();
- }
-
- // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
- if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
- match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
- Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
- return Actions.size();
- }
-
- // The remaining tests are all recursive, so bail out if we hit the limit.
- if (Depth++ == MaxDepth)
- return 0;
-
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ // Check to see if this is an unsigned division with an exact power of 2,
+ // if so, convert to a right shift.
+ if (match(Op1, m_Power2())) {
+ Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
+ return Actions.size();
+ }
+
+ // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+ if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
+ match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
+ Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
+ return Actions.size();
+ }
+
+ // The remaining tests are all recursive, so bail out if we hit the limit.
+ if (Depth++ == MaxDepth)
+ return 0;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
// FIXME: missed optimization: if one of the hands of select is/contains
// undef, just directly pick the other one.
// FIXME: can both hands contain undef?
- if (size_t LHSIdx =
- visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
- if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
- Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
- return Actions.size();
- }
-
- return 0;
-}
-
-/// If we have zero-extended operands of an unsigned div or rem, we may be able
-/// to narrow the operation (sink the zext below the math).
-static Instruction *narrowUDivURem(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Instruction::BinaryOps Opcode = I.getOpcode();
- Value *N = I.getOperand(0);
- Value *D = I.getOperand(1);
- Type *Ty = I.getType();
- Value *X, *Y;
- if (match(N, m_ZExt(m_Value(X))) && match(D, m_ZExt(m_Value(Y))) &&
- X->getType() == Y->getType() && (N->hasOneUse() || D->hasOneUse())) {
- // udiv (zext X), (zext Y) --> zext (udiv X, Y)
- // urem (zext X), (zext Y) --> zext (urem X, Y)
- Value *NarrowOp = Builder.CreateBinOp(Opcode, X, Y);
- return new ZExtInst(NarrowOp, Ty);
- }
-
- Constant *C;
- if ((match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) ||
- (match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C)))) {
- // If the constant is the same in the smaller type, use the narrow version.
- Constant *TruncC = ConstantExpr::getTrunc(C, X->getType());
- if (ConstantExpr::getZExt(TruncC, Ty) != C)
- return nullptr;
-
- // udiv (zext X), C --> zext (udiv X, C')
- // urem (zext X), C --> zext (urem X, C')
- // udiv C, (zext X) --> zext (udiv C', X)
- // urem C, (zext X) --> zext (urem C', X)
- Value *NarrowOp = isa<Constant>(D) ? Builder.CreateBinOp(Opcode, X, TruncC)
- : Builder.CreateBinOp(Opcode, TruncC, X);
- return new ZExtInst(NarrowOp, Ty);
- }
-
- return nullptr;
-}
-
+ if (size_t LHSIdx =
+ visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
+ if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
+ Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
+ return Actions.size();
+ }
+
+ return 0;
+}
+
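For illustration (an assumed sketch; the helper names are made up), the select threading that visitUDivOperand enables: when the divisor is a select of two powers of two, each arm folds to a shift and the arms are rejoined with a select, mirroring the joining UDivFoldAction.

#include <cassert>
#include <cstdint>

static uint32_t divBySelect(uint32_t x, bool c) { return x / (c ? 8u : 16u); }

static uint32_t shiftBySelect(uint32_t x, bool c) {
  uint32_t viaEight = x >> 3;       // fold action for the true arm
  uint32_t viaSixteen = x >> 4;     // fold action for the false arm
  return c ? viaEight : viaSixteen; // the joining SelectInst
}

int main() {
  for (uint32_t x : {0u, 7u, 8u, 123u, 0xFFFFFFFFu})
    for (bool c : {false, true})
      assert(divBySelect(x, c) == shiftBySelect(x, c));
  return 0;
}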
+/// If we have zero-extended operands of an unsigned div or rem, we may be able
+/// to narrow the operation (sink the zext below the math).
+static Instruction *narrowUDivURem(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ Value *N = I.getOperand(0);
+ Value *D = I.getOperand(1);
+ Type *Ty = I.getType();
+ Value *X, *Y;
+ if (match(N, m_ZExt(m_Value(X))) && match(D, m_ZExt(m_Value(Y))) &&
+ X->getType() == Y->getType() && (N->hasOneUse() || D->hasOneUse())) {
+ // udiv (zext X), (zext Y) --> zext (udiv X, Y)
+ // urem (zext X), (zext Y) --> zext (urem X, Y)
+ Value *NarrowOp = Builder.CreateBinOp(Opcode, X, Y);
+ return new ZExtInst(NarrowOp, Ty);
+ }
+
+ Constant *C;
+ if ((match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) ||
+ (match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C)))) {
+ // If the constant is the same in the smaller type, use the narrow version.
+ Constant *TruncC = ConstantExpr::getTrunc(C, X->getType());
+ if (ConstantExpr::getZExt(TruncC, Ty) != C)
+ return nullptr;
+
+ // udiv (zext X), C --> zext (udiv X, C')
+ // urem (zext X), C --> zext (urem X, C')
+ // udiv C, (zext X) --> zext (udiv C', X)
+ // urem C, (zext X) --> zext (urem C', X)
+ Value *NarrowOp = isa<Constant>(D) ? Builder.CreateBinOp(Opcode, X, TruncC)
+ : Builder.CreateBinOp(Opcode, TruncC, X);
+ return new ZExtInst(NarrowOp, Ty);
+ }
+
+ return nullptr;
+}
+
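An exhaustive 8-bit check (assumed example, not from the commit) of the narrowing rule in narrowUDivURem: when both operands are zero-extended from the same narrow type, the udiv or urem can be done in the narrow type and the result zero-extended afterwards.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t xi = 0; xi < 256; ++xi)
    for (uint32_t yi = 1; yi < 256; ++yi) {
      uint8_t x = static_cast<uint8_t>(xi), y = static_cast<uint8_t>(yi);
      // udiv (zext X), (zext Y) --> zext (udiv X, Y), and likewise for urem.
      assert(uint32_t(x) / uint32_t(y) == uint32_t(uint8_t(x / y)));
      assert(uint32_t(x) % uint32_t(y) == uint32_t(uint8_t(x % y)));
    }
  return 0;
}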
Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
- if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Handle the integer div common cases
- if (Instruction *Common = commonIDivTransforms(I))
- return Common;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *X;
- const APInt *C1, *C2;
- if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) {
- // (X lshr C1) udiv C2 --> X udiv (C2 << C1)
- bool Overflow;
- APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
- if (!Overflow) {
- bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
- BinaryOperator *BO = BinaryOperator::CreateUDiv(
- X, ConstantInt::get(X->getType(), C2ShlC1));
- if (IsExact)
- BO->setIsExact();
- return BO;
- }
- }
-
- // Op0 / C where C is large (negative) --> zext (Op0 >= C)
- // TODO: Could use isKnownNegative() to handle non-constant values.
- Type *Ty = I.getType();
- if (match(Op1, m_Negative())) {
- Value *Cmp = Builder.CreateICmpUGE(Op0, Op1);
- return CastInst::CreateZExtOrBitCast(Cmp, Ty);
- }
- // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined)
- if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
- return CastInst::CreateZExtOrBitCast(Cmp, Ty);
- }
-
- if (Instruction *NarrowDiv = narrowUDivURem(I, Builder))
- return NarrowDiv;
-
- // If the udiv operands are non-overflowing multiplies with a common operand,
- // then eliminate the common factor:
- // (A * B) / (A * X) --> B / X (and commuted variants)
- // TODO: The code would be reduced if we had m_c_NUWMul pattern matching.
- // TODO: If -reassociation handled this generally, we could remove this.
- Value *A, *B;
- if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) {
- if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) ||
- match(Op1, m_NUWMul(m_Value(X), m_Specific(A))))
- return BinaryOperator::CreateUDiv(B, X);
- if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) ||
- match(Op1, m_NUWMul(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateUDiv(A, X);
- }
-
- // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
- SmallVector<UDivFoldAction, 6> UDivActions;
- if (visitUDivOperand(Op0, Op1, I, UDivActions))
- for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
- FoldUDivOperandCb Action = UDivActions[i].FoldAction;
- Value *ActionOp1 = UDivActions[i].OperandToFold;
- Instruction *Inst;
- if (Action)
- Inst = Action(Op0, ActionOp1, I, *this);
- else {
- // This action joins two actions together. The RHS of this action is
- // simply the last action we processed, we saved the LHS action index in
- // the joining action.
- size_t SelectRHSIdx = i - 1;
- Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
- size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
- Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
- Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(),
- SelectLHS, SelectRHS);
- }
-
- // If this is the last action to process, return it to the InstCombiner.
- // Otherwise, we insert it before the UDiv and record it so that we may
- // use it as part of a joining action (i.e., a SelectInst).
- if (e - i != 1) {
- Inst->insertBefore(&I);
- UDivActions[i].FoldResult = Inst;
- } else
- return Inst;
- }
-
- return nullptr;
-}
-
+ if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X;
+ const APInt *C1, *C2;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) {
+ // (X lshr C1) udiv C2 --> X udiv (C2 << C1)
+ bool Overflow;
+ APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
+ if (!Overflow) {
+ bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
+ BinaryOperator *BO = BinaryOperator::CreateUDiv(
+ X, ConstantInt::get(X->getType(), C2ShlC1));
+ if (IsExact)
+ BO->setIsExact();
+ return BO;
+ }
+ }
+
+ // Op0 / C where C is large (negative) --> zext (Op0 >= C)
+ // TODO: Could use isKnownNegative() to handle non-constant values.
+ Type *Ty = I.getType();
+ if (match(Op1, m_Negative())) {
+ Value *Cmp = Builder.CreateICmpUGE(Op0, Op1);
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+ // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined)
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+
+ if (Instruction *NarrowDiv = narrowUDivURem(I, Builder))
+ return NarrowDiv;
+
+ // If the udiv operands are non-overflowing multiplies with a common operand,
+ // then eliminate the common factor:
+ // (A * B) / (A * X) --> B / X (and commuted variants)
+ // TODO: The code would be reduced if we had m_c_NUWMul pattern matching.
+ // TODO: If -reassociation handled this generally, we could remove this.
+ Value *A, *B;
+ if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) {
+ if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(A))))
+ return BinaryOperator::CreateUDiv(B, X);
+ if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateUDiv(A, X);
+ }
+
+ // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
+ SmallVector<UDivFoldAction, 6> UDivActions;
+ if (visitUDivOperand(Op0, Op1, I, UDivActions))
+ for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
+ FoldUDivOperandCb Action = UDivActions[i].FoldAction;
+ Value *ActionOp1 = UDivActions[i].OperandToFold;
+ Instruction *Inst;
+ if (Action)
+ Inst = Action(Op0, ActionOp1, I, *this);
+ else {
+ // This action joins two actions together. The RHS of this action is
+ // simply the last action we processed, we saved the LHS action index in
+ // the joining action.
+ size_t SelectRHSIdx = i - 1;
+ Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
+ size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
+ Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
+ Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(),
+ SelectLHS, SelectRHS);
+ }
+
+ // If this is the last action to process, return it to the InstCombiner.
+ // Otherwise, we insert it before the UDiv and record it so that we may
+ // use it as part of a joining action (i.e., a SelectInst).
+ if (e - i != 1) {
+ Inst->insertBefore(&I);
+ UDivActions[i].FoldResult = Inst;
+ } else
+ return Inst;
+ }
+
+ return nullptr;
+}
+
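Two of the visitUDiv folds above, restated as an assumed C++ check (the constants are arbitrary): a right-shifted dividend folds into the divisor constant, and dividing by a constant with the sign bit set can only yield 0 or 1, so it becomes a compare.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t c = 0x80000005u; // divisor with the sign bit set
  for (uint32_t x : {0u, 1u, 0x80000004u, 0x80000005u, 0xFFFFFFFFu}) {
    assert((x >> 2) / 5u == x / 20u);    // (X lshr 2) udiv 5  -->  X udiv 20
    assert(x / c == (x >= c ? 1u : 0u)); // X udiv C  -->  zext (X uge C)
  }
  return 0;
}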
Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
- if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Handle the integer div common cases
- if (Instruction *Common = commonIDivTransforms(I))
- return Common;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
- Value *X;
- // sdiv Op0, -1 --> -Op0
- // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined)
- if (match(Op1, m_AllOnes()) ||
- (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
- return BinaryOperator::CreateNeg(Op0);
-
- // X / INT_MIN --> X == INT_MIN
- if (match(Op1, m_SignMask()))
+ Value *X;
+ // sdiv Op0, -1 --> -Op0
+ // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined)
+ if (match(Op1, m_AllOnes()) ||
+ (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
+ return BinaryOperator::CreateNeg(Op0);
+
+ // X / INT_MIN --> X == INT_MIN
+ if (match(Op1, m_SignMask()))
return new ZExtInst(Builder.CreateICmpEQ(Op0, Op1), Ty);
-
+
// sdiv exact X, 1<<C --> ashr exact X, C iff 1<<C is non-negative
// sdiv exact X, -1<<C --> -(ashr exact X, C)
if (I.isExact() && ((match(Op1, m_Power2()) && match(Op1, m_NonNegative())) ||
@@ -1134,43 +1134,43 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
return BinaryOperator::CreateNeg(AShr, I.getName());
}
- const APInt *Op1C;
- if (match(Op1, m_APInt(Op1C))) {
- // If the dividend is sign-extended and the constant divisor is small enough
- // to fit in the source type, shrink the division to the narrower type:
- // (sext X) sdiv C --> sext (X sdiv C)
- Value *Op0Src;
- if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) &&
- Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) {
-
- // In the general case, we need to make sure that the dividend is not the
- // minimum signed value because dividing that by -1 is UB. But here, we
- // know that the -1 divisor case is already handled above.
-
- Constant *NarrowDivisor =
- ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
- Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor);
+ const APInt *Op1C;
+ if (match(Op1, m_APInt(Op1C))) {
+ // If the dividend is sign-extended and the constant divisor is small enough
+ // to fit in the source type, shrink the division to the narrower type:
+ // (sext X) sdiv C --> sext (X sdiv C)
+ Value *Op0Src;
+ if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) &&
+ Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) {
+
+ // In the general case, we need to make sure that the dividend is not the
+ // minimum signed value because dividing that by -1 is UB. But here, we
+ // know that the -1 divisor case is already handled above.
+
+ Constant *NarrowDivisor =
+ ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
+ Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor);
return new SExtInst(NarrowOp, Ty);
- }
-
- // -X / C --> X / -C (if the negation doesn't overflow).
- // TODO: This could be enhanced to handle arbitrary vector constants by
- // checking if all elements are not the min-signed-val.
- if (!Op1C->isMinSignedValue() &&
- match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) {
+ }
+
+ // -X / C --> X / -C (if the negation doesn't overflow).
+ // TODO: This could be enhanced to handle arbitrary vector constants by
+ // checking if all elements are not the min-signed-val.
+ if (!Op1C->isMinSignedValue() &&
+ match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) {
Constant *NegC = ConstantInt::get(Ty, -(*Op1C));
- Instruction *BO = BinaryOperator::CreateSDiv(X, NegC);
- BO->setIsExact(I.isExact());
- return BO;
- }
- }
-
- // -X / Y --> -(X / Y)
- Value *Y;
- if (match(&I, m_SDiv(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
- return BinaryOperator::CreateNSWNeg(
- Builder.CreateSDiv(X, Y, I.getName(), I.isExact()));
-
+ Instruction *BO = BinaryOperator::CreateSDiv(X, NegC);
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+ }
+
+ // -X / Y --> -(X / Y)
+ Value *Y;
+ if (match(&I, m_SDiv(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
+ return BinaryOperator::CreateNSWNeg(
+ Builder.CreateSDiv(X, Y, I.getName(), I.isExact()));
+
// abs(X) / X --> X > -1 ? 1 : -1
// X / abs(X) --> X > -1 ? 1 : -1
if (match(&I, m_c_BinOp(
@@ -1181,17 +1181,17 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
return SelectInst::Create(Cond, ConstantInt::get(Ty, 1), NegOne);
}
- // If the sign bits of both operands are zero (i.e. we can prove they are
- // unsigned inputs), turn this into a udiv.
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a udiv.
APInt Mask(APInt::getSignMask(Ty->getScalarSizeInBits()));
- if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
- if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
- // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
- auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
- BO->setIsExact(I.isExact());
- return BO;
- }
-
+ if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
+ // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+
if (match(Op1, m_NegatedPower2())) {
// X sdiv (-(1 << C)) -> -(X sdiv (1 << C)) ->
// -> -(X udiv (1 << C)) -> -(X u>> C)
@@ -1199,356 +1199,356 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
Op0, ConstantExpr::getNeg(cast<Constant>(Op1)), I, *this)));
}
- if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
- // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
- // Safe because the only negative value (1 << Y) can take on is
- // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
- // the sign bit set.
- auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
- BO->setIsExact(I.isExact());
- return BO;
- }
- }
-
- return nullptr;
-}
-
-/// Remove negation and try to convert division into multiplication.
-static Instruction *foldFDivConstantDivisor(BinaryOperator &I) {
- Constant *C;
- if (!match(I.getOperand(1), m_Constant(C)))
- return nullptr;
-
- // -X / C --> X / -C
- Value *X;
- if (match(I.getOperand(0), m_FNeg(m_Value(X))))
- return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
-
- // If the constant divisor has an exact inverse, this is always safe. If not,
- // then we can still create a reciprocal if fast-math-flags allow it and the
- // constant is a regular number (not zero, infinite, or denormal).
- if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP())))
- return nullptr;
-
- // Disallow denormal constants because we don't know what would happen
- // on all targets.
- // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
- // denorms are flushed?
- auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C);
- if (!RecipC->isNormalFP())
- return nullptr;
-
- // X / C --> X * (1 / C)
- return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I);
-}
-
-/// Remove negation and try to reassociate constant math.
-static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
- Constant *C;
- if (!match(I.getOperand(0), m_Constant(C)))
- return nullptr;
-
- // C / -X --> -C / X
- Value *X;
- if (match(I.getOperand(1), m_FNeg(m_Value(X))))
- return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
-
- if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
- return nullptr;
-
- // Try to reassociate C / X expressions where X includes another constant.
- Constant *C2, *NewC = nullptr;
- if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) {
- // C / (X * C2) --> (C / C2) / X
- NewC = ConstantExpr::getFDiv(C, C2);
- } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) {
- // C / (X / C2) --> (C * C2) / X
- NewC = ConstantExpr::getFMul(C, C2);
- }
- // Disallow denormal constants because we don't know what would happen
- // on all targets.
- // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
- // denorms are flushed?
- if (!NewC || !NewC->isNormalFP())
- return nullptr;
-
- return BinaryOperator::CreateFDivFMF(NewC, X, &I);
-}
-
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
+ // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
+ // Safe because the only negative value (1 << Y) can take on is
+ // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
+ // the sign bit set.
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+ }
+
+ return nullptr;
+}
+
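An assumed C++ check of two visitSDiv cases above: dividing by INT_MIN can only produce 0 or 1, and once both sign bits are known to be zero, sdiv and udiv agree.

#include <cassert>
#include <cstdint>

int main() {
  // X / INT_MIN --> zext (X == INT_MIN); only INT_MIN itself reaches 1.
  for (int32_t x : {0, 1, -1, 12345, INT32_MAX, INT32_MIN})
    assert(x / INT32_MIN == (x == INT32_MIN ? 1 : 0));

  // X sdiv Y --> X udiv Y when neither operand has its sign bit set.
  for (int32_t x : {0, 7, 1000, INT32_MAX})
    for (int32_t y : {1, 3, INT32_MAX})
      assert(x / y == int32_t(uint32_t(x) / uint32_t(y)));
  return 0;
}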
+/// Remove negation and try to convert division into multiplication.
+static Instruction *foldFDivConstantDivisor(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(1), m_Constant(C)))
+ return nullptr;
+
+ // -X / C --> X / -C
+ Value *X;
+ if (match(I.getOperand(0), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
+
+ // If the constant divisor has an exact inverse, this is always safe. If not,
+ // then we can still create a reciprocal if fast-math-flags allow it and the
+ // constant is a regular number (not zero, infinite, or denormal).
+ if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP())))
+ return nullptr;
+
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C);
+ if (!RecipC->isNormalFP())
+ return nullptr;
+
+ // X / C --> X * (1 / C)
+ return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I);
+}
+
+/// Remove negation and try to reassociate constant math.
+static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(0), m_Constant(C)))
+ return nullptr;
+
+ // C / -X --> -C / X
+ Value *X;
+ if (match(I.getOperand(1), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
+ return nullptr;
+
+ // Try to reassociate C / X expressions where X includes another constant.
+ Constant *C2, *NewC = nullptr;
+ if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) {
+ // C / (X * C2) --> (C / C2) / X
+ NewC = ConstantExpr::getFDiv(C, C2);
+ } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) {
+ // C / (X / C2) --> (C * C2) / X
+ NewC = ConstantExpr::getFMul(C, C2);
+ }
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ if (!NewC || !NewC->isNormalFP())
+ return nullptr;
+
+ return BinaryOperator::CreateFDivFMF(NewC, X, &I);
+}
+
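A numeric check (assumed example) of the two constant fdiv folds above, restricted to cases where the rewrite is exact: multiplying by an exact reciprocal such as 0.25, and moving an fneg across the division, which only touches the sign.

#include <cassert>

int main() {
  for (double x : {0.0, 1.5, -3.25, 1e300, -1e-300}) {
    assert(x / 4.0 == x * 0.25);  // X / C --> X * (1/C), C with an exact inverse
    assert(-x / 4.0 == x / -4.0); // -X / C --> X / -C (sign handling is exact)
  }
  return 0;
}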
Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
- if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *R = foldFDivConstantDivisor(I))
- return R;
-
- if (Instruction *R = foldFDivConstantDividend(I))
- return R;
-
- if (Instruction *R = foldFPSignBitOps(I))
- return R;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (isa<Constant>(Op0))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (isa<Constant>(Op1))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (I.hasAllowReassoc() && I.hasAllowReciprocal()) {
- Value *X, *Y;
- if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
- (!isa<Constant>(Y) || !isa<Constant>(Op1))) {
- // (X / Y) / Z => X / (Y * Z)
- Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I);
- return BinaryOperator::CreateFDivFMF(X, YZ, &I);
- }
- if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
- (!isa<Constant>(Y) || !isa<Constant>(Op0))) {
- // Z / (X / Y) => (Y * Z) / X
- Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I);
- return BinaryOperator::CreateFDivFMF(YZ, X, &I);
- }
- // Z / (1.0 / Y) => (Y * Z)
- //
- // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The
- // m_OneUse check is avoided because even in the case of the multiple uses
- // for 1.0/Y, the number of instructions remain the same and a division is
- // replaced by a multiplication.
- if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y))))
- return BinaryOperator::CreateFMulFMF(Y, Op0, &I);
- }
-
- if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) {
- // sin(X) / cos(X) -> tan(X)
- // cos(X) / sin(X) -> 1/tan(X) (cotangent)
- Value *X;
- bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X)));
- bool IsCot =
- !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));
-
- if ((IsTan || IsCot) &&
- hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
- IRBuilder<> B(&I);
- IRBuilder<>::FastMathFlagGuard FMFGuard(B);
- B.setFastMathFlags(I.getFastMathFlags());
- AttributeList Attrs =
- cast<CallBase>(Op0)->getCalledFunction()->getAttributes();
- Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
- LibFunc_tanl, B, Attrs);
- if (IsCot)
- Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
- return replaceInstUsesWith(I, Res);
- }
- }
-
- // X / (X * Y) --> 1.0 / Y
- // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
- // We can ignore the possibility that X is infinity because INF/INF is NaN.
- Value *X, *Y;
- if (I.hasNoNaNs() && I.hasAllowReassoc() &&
- match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
- replaceOperand(I, 0, ConstantFP::get(I.getType(), 1.0));
- replaceOperand(I, 1, Y);
- return &I;
- }
-
- // X / fabs(X) -> copysign(1.0, X)
- // fabs(X) / X -> copysign(1.0, X)
- if (I.hasNoNaNs() && I.hasNoInfs() &&
+ if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *R = foldFDivConstantDivisor(I))
+ return R;
+
+ if (Instruction *R = foldFDivConstantDividend(I))
+ return R;
+
+ if (Instruction *R = foldFPSignBitOps(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (isa<Constant>(Op1))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (I.hasAllowReassoc() && I.hasAllowReciprocal()) {
+ Value *X, *Y;
+ if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op1))) {
+ // (X / Y) / Z => X / (Y * Z)
+ Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFDivFMF(X, YZ, &I);
+ }
+ if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op0))) {
+ // Z / (X / Y) => (Y * Z) / X
+ Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I);
+ return BinaryOperator::CreateFDivFMF(YZ, X, &I);
+ }
+ // Z / (1.0 / Y) => (Y * Z)
+ //
+ // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The
+ // m_OneUse check is avoided because even in the case of the multiple uses
+ // for 1.0/Y, the number of instructions remain the same and a division is
+ // replaced by a multiplication.
+ if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y))))
+ return BinaryOperator::CreateFMulFMF(Y, Op0, &I);
+ }
+
+ if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) {
+ // sin(X) / cos(X) -> tan(X)
+ // cos(X) / sin(X) -> 1/tan(X) (cotangent)
+ Value *X;
+ bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X)));
+ bool IsCot =
+ !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));
+
+ if ((IsTan || IsCot) &&
+ hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
+ IRBuilder<> B(&I);
+ IRBuilder<>::FastMathFlagGuard FMFGuard(B);
+ B.setFastMathFlags(I.getFastMathFlags());
+ AttributeList Attrs =
+ cast<CallBase>(Op0)->getCalledFunction()->getAttributes();
+ Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
+ LibFunc_tanl, B, Attrs);
+ if (IsCot)
+ Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
+ return replaceInstUsesWith(I, Res);
+ }
+ }
+
+ // X / (X * Y) --> 1.0 / Y
+ // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
+ // We can ignore the possibility that X is infinity because INF/INF is NaN.
+ Value *X, *Y;
+ if (I.hasNoNaNs() && I.hasAllowReassoc() &&
+ match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
+ replaceOperand(I, 0, ConstantFP::get(I.getType(), 1.0));
+ replaceOperand(I, 1, Y);
+ return &I;
+ }
+
+ // X / fabs(X) -> copysign(1.0, X)
+ // fabs(X) / X -> copysign(1.0, X)
+ if (I.hasNoNaNs() && I.hasNoInfs() &&
(match(&I, m_FDiv(m_Value(X), m_FAbs(m_Deferred(X)))) ||
match(&I, m_FDiv(m_FAbs(m_Value(X)), m_Deferred(X))))) {
- Value *V = Builder.CreateBinaryIntrinsic(
- Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I);
- return replaceInstUsesWith(I, V);
- }
- return nullptr;
-}
-
-/// This function implements the transforms common to both integer remainder
-/// instructions (urem and srem). It is called by the visitors to those integer
-/// remainder instructions.
-/// Common integer remainder transforms
+ Value *V = Builder.CreateBinaryIntrinsic(
+ Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I);
+ return replaceInstUsesWith(I, V);
+ }
+ return nullptr;
+}
+
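An assumed sketch of the X / fabs(X) fold above; the values are finite and nonzero, matching the no-NaNs/no-infs preconditions. The sin/cos to tan and X / (X * Y) rewrites above are only done under reassociation flags because they are not exact in general.

#include <cassert>
#include <cmath>

int main() {
  for (double x : {2.5, -2.5, 1e-10, -1e300}) {
    // X / fabs(X) and fabs(X) / X --> copysign(1.0, X)
    assert(x / std::fabs(x) == std::copysign(1.0, x));
    assert(std::fabs(x) / x == std::copysign(1.0, x));
  }
  return 0;
}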
+/// This function implements the transforms common to both integer remainder
+/// instructions (urem and srem). It is called by the visitors to those integer
+/// remainder instructions.
+/// Common integer remainder transforms
Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- // The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
- return replaceOperand(I, 1, V);
-
- // Handle cases involving: rem X, (select Cond, Y, Z)
- if (simplifyDivRemOfSelectWithZeroOp(I))
- return &I;
-
- if (isa<Constant>(Op1)) {
- if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
- } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
- const APInt *Op1Int;
- if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
- (I.getOpcode() == Instruction::URem ||
- !Op1Int->isMinSignedValue())) {
- // foldOpIntoPhi will speculate instructions to the end of the PHI's
- // predecessor blocks, so do this only if we know the srem or urem
- // will not fault.
- if (Instruction *NV = foldOpIntoPhi(I, PN))
- return NV;
- }
- }
-
- // See if we can fold away this rem instruction.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
- }
- }
-
- return nullptr;
-}
-
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // The RHS is known non-zero.
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
+ return replaceOperand(I, 1, V);
+
+ // Handle cases involving: rem X, (select Cond, Y, Z)
+ if (simplifyDivRemOfSelectWithZeroOp(I))
+ return &I;
+
+ if (isa<Constant>(Op1)) {
+ if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+ } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
+ const APInt *Op1Int;
+ if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
+ (I.getOpcode() == Instruction::URem ||
+ !Op1Int->isMinSignedValue())) {
+ // foldOpIntoPhi will speculate instructions to the end of the PHI's
+ // predecessor blocks, so do this only if we know the srem or urem
+ // will not fault.
+ if (Instruction *NV = foldOpIntoPhi(I, PN))
+ return NV;
+ }
+ }
+
+ // See if we can fold away this rem instruction.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+ }
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) {
- if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *common = commonIRemTransforms(I))
- return common;
-
- if (Instruction *NarrowRem = narrowUDivURem(I, Builder))
- return NarrowRem;
-
- // X urem Y -> X and Y-1, where Y is a power of 2,
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
- // This may increase instruction count, we don't enforce that Y is a
- // constant.
- Constant *N1 = Constant::getAllOnesValue(Ty);
- Value *Add = Builder.CreateAdd(Op1, N1);
- return BinaryOperator::CreateAnd(Op0, Add);
- }
-
- // 1 urem X -> zext(X != 1)
- if (match(Op0, m_One())) {
- Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1));
- return CastInst::CreateZExtOrBitCast(Cmp, Ty);
- }
-
- // X urem C -> X < C ? X : X - C, where C >= signbit.
- if (match(Op1, m_Negative())) {
- Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
- Value *Sub = Builder.CreateSub(Op0, Op1);
- return SelectInst::Create(Cmp, Op0, Sub);
- }
-
- // If the divisor is a sext of a boolean, then the divisor must be max
- // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also
- // max unsigned value. In that case, the remainder is 0:
- // urem Op0, (sext i1 X) --> (Op0 == -1) ? 0 : Op0
- Value *X;
- if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
- return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0);
- }
-
- return nullptr;
-}
-
+ if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *common = commonIRemTransforms(I))
+ return common;
+
+ if (Instruction *NarrowRem = narrowUDivURem(I, Builder))
+ return NarrowRem;
+
+ // X urem Y -> X and Y-1, where Y is a power of 2,
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
+ // This may increase instruction count, we don't enforce that Y is a
+ // constant.
+ Constant *N1 = Constant::getAllOnesValue(Ty);
+ Value *Add = Builder.CreateAdd(Op1, N1);
+ return BinaryOperator::CreateAnd(Op0, Add);
+ }
+
+ // 1 urem X -> zext(X != 1)
+ if (match(Op0, m_One())) {
+ Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1));
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+
+ // X urem C -> X < C ? X : X - C, where C >= signbit.
+ if (match(Op1, m_Negative())) {
+ Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
+ Value *Sub = Builder.CreateSub(Op0, Op1);
+ return SelectInst::Create(Cmp, Op0, Sub);
+ }
+
+ // If the divisor is a sext of a boolean, then the divisor must be max
+ // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also
+ // max unsigned value. In that case, the remainder is 0:
+ // urem Op0, (sext i1 X) --> (Op0 == -1) ? 0 : Op0
+ Value *X;
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0);
+ }
+
+ return nullptr;
+}
+
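An assumed, standalone check of the visitURem folds above: masking for power-of-two divisors, the 1 urem X compare, and the compare-and-subtract form for divisors with the sign bit set (the quotient there is at most 1).

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t big = 0x80000003u; // divisor with the sign bit set
  for (uint32_t x : {0u, 1u, 7u, 8u, 1000u, 0x80000002u, 0xFFFFFFFFu}) {
    assert(x % 8u == (x & 7u));                 // X urem 2^k --> X & (2^k - 1)
    assert(x % big == (x < big ? x : x - big)); // X urem C, C >= signbit
    if (x != 0)
      assert(1u % x == (x != 1u ? 1u : 0u));    // 1 urem X --> zext(X != 1)
  }
  return 0;
}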
Instruction *InstCombinerImpl::visitSRem(BinaryOperator &I) {
- if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Handle the integer rem common cases
- if (Instruction *Common = commonIRemTransforms(I))
- return Common;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- {
- const APInt *Y;
- // X % -Y -> X % Y
- if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue())
- return replaceOperand(I, 1, ConstantInt::get(I.getType(), -*Y));
- }
-
- // -X srem Y --> -(X srem Y)
- Value *X, *Y;
- if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
+ if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Handle the integer rem common cases
+ if (Instruction *Common = commonIRemTransforms(I))
+ return Common;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ {
+ const APInt *Y;
+ // X % -Y -> X % Y
+ if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue())
+ return replaceOperand(I, 1, ConstantInt::get(I.getType(), -*Y));
+ }
+
+ // -X srem Y --> -(X srem Y)
+ Value *X, *Y;
+ if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y));
-
- // If the sign bits of both operands are zero (i.e. we can prove they are
- // unsigned inputs), turn this into a urem.
- APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
- if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
- MaskedValueIsZero(Op0, Mask, 0, &I)) {
- // X srem Y -> X urem Y, iff X and Y don't have sign bit set
- return BinaryOperator::CreateURem(Op0, Op1, I.getName());
- }
-
- // If it's a constant vector, flip any negative values positive.
- if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
- Constant *C = cast<Constant>(Op1);
+
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a urem.
+ APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
+ if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
+ MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+ return BinaryOperator::CreateURem(Op0, Op1, I.getName());
+ }
+
+ // If it's a constant vector, flip any negative values positive.
+ if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
+ Constant *C = cast<Constant>(Op1);
unsigned VWidth = cast<FixedVectorType>(C->getType())->getNumElements();
-
- bool hasNegative = false;
- bool hasMissing = false;
- for (unsigned i = 0; i != VWidth; ++i) {
- Constant *Elt = C->getAggregateElement(i);
- if (!Elt) {
- hasMissing = true;
- break;
- }
-
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt))
- if (RHS->isNegative())
- hasNegative = true;
- }
-
- if (hasNegative && !hasMissing) {
- SmallVector<Constant *, 16> Elts(VWidth);
- for (unsigned i = 0; i != VWidth; ++i) {
- Elts[i] = C->getAggregateElement(i); // Handle undef, etc.
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) {
- if (RHS->isNegative())
- Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
- }
- }
-
- Constant *NewRHSV = ConstantVector::get(Elts);
- if (NewRHSV != C) // Don't loop on -MININT
- return replaceOperand(I, 1, NewRHSV);
- }
- }
-
- return nullptr;
-}
-
+
+ bool hasNegative = false;
+ bool hasMissing = false;
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt) {
+ hasMissing = true;
+ break;
+ }
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt))
+ if (RHS->isNegative())
+ hasNegative = true;
+ }
+
+ if (hasNegative && !hasMissing) {
+ SmallVector<Constant *, 16> Elts(VWidth);
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Elts[i] = C->getAggregateElement(i); // Handle undef, etc.
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) {
+ if (RHS->isNegative())
+ Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
+ }
+ }
+
+ Constant *NewRHSV = ConstantVector::get(Elts);
+ if (NewRHSV != C) // Don't loop on -MININT
+ return replaceOperand(I, 1, NewRHSV);
+ }
+ }
+
+ return nullptr;
+}
+
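A short assumed check of the X % -Y --> X % Y rewrite above: truncating signed remainder takes its sign from the dividend only, so negating the divisor never changes the result (the INT_MIN divisor is excluded, as in the code).

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {7, -7, 0, 100, -100})
    for (int32_t y : {3, 5, 16})
      assert(x % -y == x % y);
  return 0;
}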
Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) {
- if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- return nullptr;
-}
+ if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp
index c6d3604de8..7718c8b0ee 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp
@@ -1,120 +1,120 @@
-//===- InstCombineNegator.cpp -----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements sinking of negation into expression trees,
-// as long as that can be done without increasing instruction count.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+//===- InstCombineNegator.cpp -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sinking of negation into expression trees,
+// as long as that can be done without increasing instruction count.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
#include <cstdint>
-#include <functional>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-
-namespace llvm {
-class AssumptionCache;
-class DataLayout;
-class DominatorTree;
-class LLVMContext;
-} // namespace llvm
-
-using namespace llvm;
-
-#define DEBUG_TYPE "instcombine"
-
-STATISTIC(NegatorTotalNegationsAttempted,
- "Negator: Number of negations attempted to be sinked");
-STATISTIC(NegatorNumTreesNegated,
- "Negator: Number of negations successfully sinked");
-STATISTIC(NegatorMaxDepthVisited, "Negator: Maximal traversal depth ever "
- "reached while attempting to sink negation");
-STATISTIC(NegatorTimesDepthLimitReached,
- "Negator: How many times did the traversal depth limit was reached "
- "during sinking");
-STATISTIC(
- NegatorNumValuesVisited,
- "Negator: Total number of values visited during attempts to sink negation");
-STATISTIC(NegatorNumNegationsFoundInCache,
- "Negator: How many negations did we retrieve/reuse from cache");
-STATISTIC(NegatorMaxTotalValuesVisited,
- "Negator: Maximal number of values ever visited while attempting to "
- "sink negation");
-STATISTIC(NegatorNumInstructionsCreatedTotal,
- "Negator: Number of new negated instructions created, total");
-STATISTIC(NegatorMaxInstructionsCreated,
- "Negator: Maximal number of new instructions created during negation "
- "attempt");
-STATISTIC(NegatorNumInstructionsNegatedSuccess,
- "Negator: Number of new negated instructions created in successful "
- "negation sinking attempts");
-
-DEBUG_COUNTER(NegatorCounter, "instcombine-negator",
- "Controls Negator transformations in InstCombine pass");
-
-static cl::opt<bool>
- NegatorEnabled("instcombine-negator-enabled", cl::init(true),
- cl::desc("Should we attempt to sink negations?"));
-
-static cl::opt<unsigned>
- NegatorMaxDepth("instcombine-negator-max-depth",
- cl::init(NegatorDefaultMaxDepth),
- cl::desc("What is the maximal lookup depth when trying to "
- "check for viability of negation sinking."));
-
-Negator::Negator(LLVMContext &C, const DataLayout &DL_, AssumptionCache &AC_,
- const DominatorTree &DT_, bool IsTrulyNegation_)
- : Builder(C, TargetFolder(DL_),
- IRBuilderCallbackInserter([&](Instruction *I) {
- ++NegatorNumInstructionsCreatedTotal;
- NewInstructions.push_back(I);
- })),
- DL(DL_), AC(AC_), DT(DT_), IsTrulyNegation(IsTrulyNegation_) {}
-
-#if LLVM_ENABLE_STATS
-Negator::~Negator() {
- NegatorMaxTotalValuesVisited.updateMax(NumValuesVisitedInThisNegator);
-}
-#endif
-
+#include <functional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace llvm {
+class AssumptionCache;
+class DataLayout;
+class DominatorTree;
+class LLVMContext;
+} // namespace llvm
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NegatorTotalNegationsAttempted,
+ "Negator: Number of negations attempted to be sinked");
+STATISTIC(NegatorNumTreesNegated,
+ "Negator: Number of negations successfully sinked");
+STATISTIC(NegatorMaxDepthVisited, "Negator: Maximal traversal depth ever "
+ "reached while attempting to sink negation");
+STATISTIC(NegatorTimesDepthLimitReached,
+ "Negator: How many times did the traversal depth limit was reached "
+ "during sinking");
+STATISTIC(
+ NegatorNumValuesVisited,
+ "Negator: Total number of values visited during attempts to sink negation");
+STATISTIC(NegatorNumNegationsFoundInCache,
+ "Negator: How many negations did we retrieve/reuse from cache");
+STATISTIC(NegatorMaxTotalValuesVisited,
+ "Negator: Maximal number of values ever visited while attempting to "
+ "sink negation");
+STATISTIC(NegatorNumInstructionsCreatedTotal,
+ "Negator: Number of new negated instructions created, total");
+STATISTIC(NegatorMaxInstructionsCreated,
+ "Negator: Maximal number of new instructions created during negation "
+ "attempt");
+STATISTIC(NegatorNumInstructionsNegatedSuccess,
+ "Negator: Number of new negated instructions created in successful "
+ "negation sinking attempts");
+
+DEBUG_COUNTER(NegatorCounter, "instcombine-negator",
+ "Controls Negator transformations in InstCombine pass");
+
+static cl::opt<bool>
+ NegatorEnabled("instcombine-negator-enabled", cl::init(true),
+ cl::desc("Should we attempt to sink negations?"));
+
+static cl::opt<unsigned>
+ NegatorMaxDepth("instcombine-negator-max-depth",
+ cl::init(NegatorDefaultMaxDepth),
+ cl::desc("What is the maximal lookup depth when trying to "
+ "check for viability of negation sinking."));
+
+Negator::Negator(LLVMContext &C, const DataLayout &DL_, AssumptionCache &AC_,
+ const DominatorTree &DT_, bool IsTrulyNegation_)
+ : Builder(C, TargetFolder(DL_),
+ IRBuilderCallbackInserter([&](Instruction *I) {
+ ++NegatorNumInstructionsCreatedTotal;
+ NewInstructions.push_back(I);
+ })),
+ DL(DL_), AC(AC_), DT(DT_), IsTrulyNegation(IsTrulyNegation_) {}
+
+#if LLVM_ENABLE_STATS
+Negator::~Negator() {
+ NegatorMaxTotalValuesVisited.updateMax(NumValuesVisitedInThisNegator);
+}
+#endif
+
// Due to the InstCombine's worklist management, there are no guarantees that
// each instruction we'll encounter has been visited by InstCombine already.
// In particular, most importantly for us, that means we have to canonicalize
@@ -128,97 +128,97 @@ std::array<Value *, 2> Negator::getSortedOperandsOfBinOp(Instruction *I) {
return Ops;
}
-// FIXME: can this be reworked into a worklist-based algorithm while preserving
-// the depth-first, early bailout traversal?
-LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
- // -(undef) -> undef.
- if (match(V, m_Undef()))
- return V;
-
- // In i1, negation can simply be ignored.
- if (V->getType()->isIntOrIntVectorTy(1))
- return V;
-
- Value *X;
-
- // -(-(X)) -> X.
- if (match(V, m_Neg(m_Value(X))))
- return X;
-
- // Integral constants can be freely negated.
- if (match(V, m_AnyIntegralConstant()))
- return ConstantExpr::getNeg(cast<Constant>(V), /*HasNUW=*/false,
- /*HasNSW=*/false);
-
- // If we have a non-instruction, then give up.
- if (!isa<Instruction>(V))
- return nullptr;
-
- // If we have started with a true negation (i.e. `sub 0, %y`), then if we've
- // got instruction that does not require recursive reasoning, we can still
- // negate it even if it has other uses, without increasing instruction count.
- if (!V->hasOneUse() && !IsTrulyNegation)
- return nullptr;
-
- auto *I = cast<Instruction>(V);
- unsigned BitWidth = I->getType()->getScalarSizeInBits();
-
- // We must preserve the insertion point and debug info that is set in the
- // builder at the time this function is called.
- InstCombiner::BuilderTy::InsertPointGuard Guard(Builder);
- // And since we are trying to negate instruction I, that tells us about the
- // insertion point and the debug info that we need to keep.
- Builder.SetInsertPoint(I);
-
- // In some cases we can give the answer without further recursion.
- switch (I->getOpcode()) {
+// FIXME: can this be reworked into a worklist-based algorithm while preserving
+// the depth-first, early bailout traversal?
+LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
+ // -(undef) -> undef.
+ if (match(V, m_Undef()))
+ return V;
+
+ // In i1, negation can simply be ignored.
+ if (V->getType()->isIntOrIntVectorTy(1))
+ return V;
+
+ Value *X;
+
+ // -(-(X)) -> X.
+ if (match(V, m_Neg(m_Value(X))))
+ return X;
+
+ // Integral constants can be freely negated.
+ if (match(V, m_AnyIntegralConstant()))
+ return ConstantExpr::getNeg(cast<Constant>(V), /*HasNUW=*/false,
+ /*HasNSW=*/false);
+
+ // If we have a non-instruction, then give up.
+ if (!isa<Instruction>(V))
+ return nullptr;
+
+ // If we have started with a true negation (i.e. `sub 0, %y`), then if we've
+ // got instruction that does not require recursive reasoning, we can still
+ // negate it even if it has other uses, without increasing instruction count.
+ if (!V->hasOneUse() && !IsTrulyNegation)
+ return nullptr;
+
+ auto *I = cast<Instruction>(V);
+ unsigned BitWidth = I->getType()->getScalarSizeInBits();
+
+ // We must preserve the insertion point and debug info that is set in the
+ // builder at the time this function is called.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(Builder);
+ // And since we are trying to negate instruction I, that tells us about the
+ // insertion point and the debug info that we need to keep.
+ Builder.SetInsertPoint(I);
+
+ // In some cases we can give the answer without further recursion.
+ switch (I->getOpcode()) {
case Instruction::Add: {
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `inc` is always negatible.
+ // `inc` is always negatible.
if (match(Ops[1], m_One()))
return Builder.CreateNot(Ops[0], I->getName() + ".neg");
- break;
+ break;
}
- case Instruction::Xor:
- // `not` is always negatible.
- if (match(I, m_Not(m_Value(X))))
- return Builder.CreateAdd(X, ConstantInt::get(X->getType(), 1),
- I->getName() + ".neg");
- break;
- case Instruction::AShr:
- case Instruction::LShr: {
- // Right-shift sign bit smear is negatible.
- const APInt *Op1Val;
- if (match(I->getOperand(1), m_APInt(Op1Val)) && *Op1Val == BitWidth - 1) {
- Value *BO = I->getOpcode() == Instruction::AShr
- ? Builder.CreateLShr(I->getOperand(0), I->getOperand(1))
- : Builder.CreateAShr(I->getOperand(0), I->getOperand(1));
- if (auto *NewInstr = dyn_cast<Instruction>(BO)) {
- NewInstr->copyIRFlags(I);
- NewInstr->setName(I->getName() + ".neg");
- }
- return BO;
- }
+ case Instruction::Xor:
+ // `not` is always negatible.
+ if (match(I, m_Not(m_Value(X))))
+ return Builder.CreateAdd(X, ConstantInt::get(X->getType(), 1),
+ I->getName() + ".neg");
+ break;
+ case Instruction::AShr:
+ case Instruction::LShr: {
+ // Right-shift sign bit smear is negatible.
+ const APInt *Op1Val;
+ if (match(I->getOperand(1), m_APInt(Op1Val)) && *Op1Val == BitWidth - 1) {
+ Value *BO = I->getOpcode() == Instruction::AShr
+ ? Builder.CreateLShr(I->getOperand(0), I->getOperand(1))
+ : Builder.CreateAShr(I->getOperand(0), I->getOperand(1));
+ if (auto *NewInstr = dyn_cast<Instruction>(BO)) {
+ NewInstr->copyIRFlags(I);
+ NewInstr->setName(I->getName() + ".neg");
+ }
+ return BO;
+ }
// While we could negate exact arithmetic shift:
// ashr exact %x, C --> sdiv exact i8 %x, -1<<C
// iff C != 0 and C u< bitwidth(%x), we don't want to,
// because division is *THAT* much worse than a shift.
- break;
- }
- case Instruction::SExt:
- case Instruction::ZExt:
- // `*ext` of i1 is always negatible
- if (I->getOperand(0)->getType()->isIntOrIntVectorTy(1))
- return I->getOpcode() == Instruction::SExt
- ? Builder.CreateZExt(I->getOperand(0), I->getType(),
- I->getName() + ".neg")
- : Builder.CreateSExt(I->getOperand(0), I->getType(),
- I->getName() + ".neg");
- break;
- default:
- break; // Other instructions require recursive reasoning.
- }
-
+ break;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ // `*ext` of i1 is always negatible
+ if (I->getOperand(0)->getType()->isIntOrIntVectorTy(1))
+ return I->getOpcode() == Instruction::SExt
+ ? Builder.CreateZExt(I->getOperand(0), I->getType(),
+ I->getName() + ".neg")
+ : Builder.CreateSExt(I->getOperand(0), I->getType(),
+ I->getName() + ".neg");
+ break;
+ default:
+ break; // Other instructions require recursive reasoning.
+ }
+
if (I->getOpcode() == Instruction::Sub &&
(I->hasOneUse() || match(I->getOperand(0), m_ImmConstant()))) {
// `sub` is always negatible.
@@ -228,39 +228,39 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
I->getName() + ".neg");
}
- // Some other cases, while still not requiring recursion,
- // are restricted to the one-use case.
- if (!V->hasOneUse())
- return nullptr;
-
- switch (I->getOpcode()) {
- case Instruction::SDiv:
- // `sdiv` is negatible if divisor is not undef/INT_MIN/1.
- // While this is normally not behind a use-check,
- // let's consider division to be special since it's costly.
- if (auto *Op1C = dyn_cast<Constant>(I->getOperand(1))) {
+ // Some other cases, while still not requiring recursion,
+ // are restricted to the one-use case.
+ if (!V->hasOneUse())
+ return nullptr;
+
+ switch (I->getOpcode()) {
+ case Instruction::SDiv:
+ // `sdiv` is negatible if divisor is not undef/INT_MIN/1.
+ // While this is normally not behind a use-check,
+ // let's consider division to be special since it's costly.
+ if (auto *Op1C = dyn_cast<Constant>(I->getOperand(1))) {
if (!Op1C->containsUndefOrPoisonElement() &&
Op1C->isNotMinSignedValue() && Op1C->isNotOneValue()) {
- Value *BO =
- Builder.CreateSDiv(I->getOperand(0), ConstantExpr::getNeg(Op1C),
- I->getName() + ".neg");
- if (auto *NewInstr = dyn_cast<Instruction>(BO))
- NewInstr->setIsExact(I->isExact());
- return BO;
- }
- }
- break;
- }
-
- // Rest of the logic is recursive, so if it's time to give up then it's time.
- if (Depth > NegatorMaxDepth) {
- LLVM_DEBUG(dbgs() << "Negator: reached maximal allowed traversal depth in "
- << *V << ". Giving up.\n");
- ++NegatorTimesDepthLimitReached;
- return nullptr;
- }
-
- switch (I->getOpcode()) {
+ Value *BO =
+ Builder.CreateSDiv(I->getOperand(0), ConstantExpr::getNeg(Op1C),
+ I->getName() + ".neg");
+ if (auto *NewInstr = dyn_cast<Instruction>(BO))
+ NewInstr->setIsExact(I->isExact());
+ return BO;
+ }
+ }
+ break;
+ }
+
+ // Rest of the logic is recursive, so if it's time to give up then it's time.
+ if (Depth > NegatorMaxDepth) {
+ LLVM_DEBUG(dbgs() << "Negator: reached maximal allowed traversal depth in "
+ << *V << ". Giving up.\n");
+ ++NegatorTimesDepthLimitReached;
+ return nullptr;
+ }
+
+ switch (I->getOpcode()) {
case Instruction::Freeze: {
// `freeze` is negatible if its operand is negatible.
Value *NegOp = negate(I->getOperand(0), Depth + 1);
@@ -268,23 +268,23 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
return nullptr;
return Builder.CreateFreeze(NegOp, I->getName() + ".neg");
}
- case Instruction::PHI: {
- // `phi` is negatible if all the incoming values are negatible.
- auto *PHI = cast<PHINode>(I);
- SmallVector<Value *, 4> NegatedIncomingValues(PHI->getNumOperands());
- for (auto I : zip(PHI->incoming_values(), NegatedIncomingValues)) {
- if (!(std::get<1>(I) =
- negate(std::get<0>(I), Depth + 1))) // Early return.
- return nullptr;
- }
- // All incoming values are indeed negatible. Create negated PHI node.
- PHINode *NegatedPHI = Builder.CreatePHI(
- PHI->getType(), PHI->getNumOperands(), PHI->getName() + ".neg");
- for (auto I : zip(NegatedIncomingValues, PHI->blocks()))
- NegatedPHI->addIncoming(std::get<0>(I), std::get<1>(I));
- return NegatedPHI;
- }
- case Instruction::Select: {
+ case Instruction::PHI: {
+ // `phi` is negatible if all the incoming values are negatible.
+ auto *PHI = cast<PHINode>(I);
+ SmallVector<Value *, 4> NegatedIncomingValues(PHI->getNumOperands());
+ for (auto I : zip(PHI->incoming_values(), NegatedIncomingValues)) {
+ if (!(std::get<1>(I) =
+ negate(std::get<0>(I), Depth + 1))) // Early return.
+ return nullptr;
+ }
+ // All incoming values are indeed negatible. Create negated PHI node.
+ PHINode *NegatedPHI = Builder.CreatePHI(
+ PHI->getType(), PHI->getNumOperands(), PHI->getName() + ".neg");
+ for (auto I : zip(NegatedIncomingValues, PHI->blocks()))
+ NegatedPHI->addIncoming(std::get<0>(I), std::get<1>(I));
+ return NegatedPHI;
+ }
+ case Instruction::Select: {
if (isKnownNegation(I->getOperand(1), I->getOperand(2))) {
    // If one hand of the select is known to be the negation of the other hand,
// just swap the hands around.
@@ -295,86 +295,86 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
NewSelect->setName(I->getName() + ".neg");
Builder.Insert(NewSelect);
return NewSelect;
- }
- // `select` is negatible if both hands of `select` are negatible.
- Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
- if (!NegOp1) // Early return.
- return nullptr;
- Value *NegOp2 = negate(I->getOperand(2), Depth + 1);
- if (!NegOp2)
- return nullptr;
- // Do preserve the metadata!
- return Builder.CreateSelect(I->getOperand(0), NegOp1, NegOp2,
- I->getName() + ".neg", /*MDFrom=*/I);
- }
- case Instruction::ShuffleVector: {
- // `shufflevector` is negatible if both operands are negatible.
- auto *Shuf = cast<ShuffleVectorInst>(I);
- Value *NegOp0 = negate(I->getOperand(0), Depth + 1);
- if (!NegOp0) // Early return.
- return nullptr;
- Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
- if (!NegOp1)
- return nullptr;
- return Builder.CreateShuffleVector(NegOp0, NegOp1, Shuf->getShuffleMask(),
- I->getName() + ".neg");
- }
- case Instruction::ExtractElement: {
- // `extractelement` is negatible if source operand is negatible.
- auto *EEI = cast<ExtractElementInst>(I);
- Value *NegVector = negate(EEI->getVectorOperand(), Depth + 1);
- if (!NegVector) // Early return.
- return nullptr;
- return Builder.CreateExtractElement(NegVector, EEI->getIndexOperand(),
- I->getName() + ".neg");
- }
- case Instruction::InsertElement: {
- // `insertelement` is negatible if both the source vector and
- // element-to-be-inserted are negatible.
- auto *IEI = cast<InsertElementInst>(I);
- Value *NegVector = negate(IEI->getOperand(0), Depth + 1);
- if (!NegVector) // Early return.
- return nullptr;
- Value *NegNewElt = negate(IEI->getOperand(1), Depth + 1);
- if (!NegNewElt) // Early return.
- return nullptr;
- return Builder.CreateInsertElement(NegVector, NegNewElt, IEI->getOperand(2),
- I->getName() + ".neg");
- }
- case Instruction::Trunc: {
- // `trunc` is negatible if its operand is negatible.
- Value *NegOp = negate(I->getOperand(0), Depth + 1);
- if (!NegOp) // Early return.
- return nullptr;
- return Builder.CreateTrunc(NegOp, I->getType(), I->getName() + ".neg");
- }
- case Instruction::Shl: {
- // `shl` is negatible if the first operand is negatible.
+ }
+ // `select` is negatible if both hands of `select` are negatible.
+ Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
+ if (!NegOp1) // Early return.
+ return nullptr;
+ Value *NegOp2 = negate(I->getOperand(2), Depth + 1);
+ if (!NegOp2)
+ return nullptr;
+ // Do preserve the metadata!
+ return Builder.CreateSelect(I->getOperand(0), NegOp1, NegOp2,
+ I->getName() + ".neg", /*MDFrom=*/I);
+ }
+ case Instruction::ShuffleVector: {
+ // `shufflevector` is negatible if both operands are negatible.
+ auto *Shuf = cast<ShuffleVectorInst>(I);
+ Value *NegOp0 = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp0) // Early return.
+ return nullptr;
+ Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
+ if (!NegOp1)
+ return nullptr;
+ return Builder.CreateShuffleVector(NegOp0, NegOp1, Shuf->getShuffleMask(),
+ I->getName() + ".neg");
+ }
+ case Instruction::ExtractElement: {
+ // `extractelement` is negatible if source operand is negatible.
+ auto *EEI = cast<ExtractElementInst>(I);
+ Value *NegVector = negate(EEI->getVectorOperand(), Depth + 1);
+ if (!NegVector) // Early return.
+ return nullptr;
+ return Builder.CreateExtractElement(NegVector, EEI->getIndexOperand(),
+ I->getName() + ".neg");
+ }
+ case Instruction::InsertElement: {
+ // `insertelement` is negatible if both the source vector and
+ // element-to-be-inserted are negatible.
+ auto *IEI = cast<InsertElementInst>(I);
+ Value *NegVector = negate(IEI->getOperand(0), Depth + 1);
+ if (!NegVector) // Early return.
+ return nullptr;
+ Value *NegNewElt = negate(IEI->getOperand(1), Depth + 1);
+ if (!NegNewElt) // Early return.
+ return nullptr;
+ return Builder.CreateInsertElement(NegVector, NegNewElt, IEI->getOperand(2),
+ I->getName() + ".neg");
+ }
+ case Instruction::Trunc: {
+ // `trunc` is negatible if its operand is negatible.
+ Value *NegOp = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp) // Early return.
+ return nullptr;
+ return Builder.CreateTrunc(NegOp, I->getType(), I->getName() + ".neg");
+ }
+ case Instruction::Shl: {
+ // `shl` is negatible if the first operand is negatible.
if (Value *NegOp0 = negate(I->getOperand(0), Depth + 1))
return Builder.CreateShl(NegOp0, I->getOperand(1), I->getName() + ".neg");
// Otherwise, `shl %x, C` can be interpreted as `mul %x, 1<<C`.
auto *Op1C = dyn_cast<Constant>(I->getOperand(1));
if (!Op1C) // Early return.
- return nullptr;
+ return nullptr;
return Builder.CreateMul(
I->getOperand(0),
ConstantExpr::getShl(Constant::getAllOnesValue(Op1C->getType()), Op1C),
I->getName() + ".neg");
- }
+ }
case Instruction::Or: {
- if (!haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL, &AC, I,
- &DT))
- return nullptr; // Don't know how to handle `or` in general.
+ if (!haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL, &AC, I,
+ &DT))
+ return nullptr; // Don't know how to handle `or` in general.
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `or`/`add` are interchangeable when operands have no common bits set.
- // `inc` is always negatible.
+ // `or`/`add` are interchangeable when operands have no common bits set.
+ // `inc` is always negatible.
if (match(Ops[1], m_One()))
return Builder.CreateNot(Ops[0], I->getName() + ".neg");
- // Else, just defer to Instruction::Add handling.
- LLVM_FALLTHROUGH;
+ // Else, just defer to Instruction::Add handling.
+ LLVM_FALLTHROUGH;
}
- case Instruction::Add: {
- // `add` is negatible if both of its operands are negatible.
+ case Instruction::Add: {
+ // `add` is negatible if both of its operands are negatible.
SmallVector<Value *, 2> NegatedOps, NonNegatedOps;
for (Value *Op : I->operands()) {
// Can we sink the negation into this operand?
@@ -397,135 +397,135 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
assert(IsTrulyNegation && "We should have early-exited then.");
// Completely failed to sink negation?
if (NonNegatedOps.size() == 2)
- return nullptr;
+ return nullptr;
// 0-(a+b) --> (-a)-b
return Builder.CreateSub(NegatedOps[0], NonNegatedOps[0],
I->getName() + ".neg");
- }
+ }
case Instruction::Xor: {
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `xor` is negatible if one of its operands is invertible.
- // FIXME: InstCombineInverter? But how to connect Inverter and Negator?
+ // `xor` is negatible if one of its operands is invertible.
+ // FIXME: InstCombineInverter? But how to connect Inverter and Negator?
if (auto *C = dyn_cast<Constant>(Ops[1])) {
Value *Xor = Builder.CreateXor(Ops[0], ConstantExpr::getNot(C));
- return Builder.CreateAdd(Xor, ConstantInt::get(Xor->getType(), 1),
- I->getName() + ".neg");
- }
- return nullptr;
+ return Builder.CreateAdd(Xor, ConstantInt::get(Xor->getType(), 1),
+ I->getName() + ".neg");
+ }
+ return nullptr;
}
- case Instruction::Mul: {
+ case Instruction::Mul: {
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `mul` is negatible if one of its operands is negatible.
- Value *NegatedOp, *OtherOp;
- // First try the second operand: in case it's a constant, it will be best to
- // just invert it instead of sinking the `neg` deeper.
+ // `mul` is negatible if one of its operands is negatible.
+ Value *NegatedOp, *OtherOp;
+ // First try the second operand: in case it's a constant, it will be best to
+ // just invert it instead of sinking the `neg` deeper.
if (Value *NegOp1 = negate(Ops[1], Depth + 1)) {
- NegatedOp = NegOp1;
+ NegatedOp = NegOp1;
OtherOp = Ops[0];
} else if (Value *NegOp0 = negate(Ops[0], Depth + 1)) {
- NegatedOp = NegOp0;
+ NegatedOp = NegOp0;
OtherOp = Ops[1];
- } else
- // Can't negate either of them.
- return nullptr;
- return Builder.CreateMul(NegatedOp, OtherOp, I->getName() + ".neg");
- }
- default:
- return nullptr; // Don't know, likely not negatible for free.
- }
-
- llvm_unreachable("Can't get here. We always return from switch.");
-}
-
-LLVM_NODISCARD Value *Negator::negate(Value *V, unsigned Depth) {
- NegatorMaxDepthVisited.updateMax(Depth);
- ++NegatorNumValuesVisited;
-
-#if LLVM_ENABLE_STATS
- ++NumValuesVisitedInThisNegator;
-#endif
-
-#ifndef NDEBUG
- // We can't ever have a Value with such an address.
- Value *Placeholder = reinterpret_cast<Value *>(static_cast<uintptr_t>(-1));
-#endif
-
- // Did we already try to negate this value?
- auto NegationsCacheIterator = NegationsCache.find(V);
- if (NegationsCacheIterator != NegationsCache.end()) {
- ++NegatorNumNegationsFoundInCache;
- Value *NegatedV = NegationsCacheIterator->second;
- assert(NegatedV != Placeholder && "Encountered a cycle during negation.");
- return NegatedV;
- }
-
-#ifndef NDEBUG
- // We did not find a cached result for negation of V. While there,
- // let's temporarily cache a placeholder value, with the idea that if later
- // during negation we fetch it from cache, we'll know we're in a cycle.
- NegationsCache[V] = Placeholder;
-#endif
-
- // No luck. Try negating it for real.
- Value *NegatedV = visitImpl(V, Depth);
- // And cache the (real) result for the future.
- NegationsCache[V] = NegatedV;
-
- return NegatedV;
-}
-
-LLVM_NODISCARD Optional<Negator::Result> Negator::run(Value *Root) {
- Value *Negated = negate(Root, /*Depth=*/0);
- if (!Negated) {
- // We must clean up newly-inserted instructions, to avoid any potential
- // endless combine looping.
- llvm::for_each(llvm::reverse(NewInstructions),
- [&](Instruction *I) { I->eraseFromParent(); });
- return llvm::None;
- }
- return std::make_pair(ArrayRef<Instruction *>(NewInstructions), Negated);
-}
-
-LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root,
+ } else
+ // Can't negate either of them.
+ return nullptr;
+ return Builder.CreateMul(NegatedOp, OtherOp, I->getName() + ".neg");
+ }
+ default:
+ return nullptr; // Don't know, likely not negatible for free.
+ }
+
+ llvm_unreachable("Can't get here. We always return from switch.");
+}
+
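The case analysis above rests on a handful of two's-complement identities. As a minimal standalone sketch (plain C++ on int32_t values standing in for IR values; not part of LLVM), they can be checked directly:

#include <cassert>
#include <cstdint>

int main() {
  const int32_t Vals[] = {0, 1, -1, 42, -42, 123456789, -123456789};
  const int32_t C = 0x55;
  for (int32_t x : Vals) {
    assert(-(~x) == x + 1);                       // `not` is always negatible.
    assert(-(x ^ C) == (x ^ ~C) + 1);             // xor with a constant.
    // Right-shift sign-bit smear: negate by flipping ashr <-> lshr. (Arithmetic
    // shift of a negative signed value is only guaranteed since C++20.)
    assert(-(x >> 31) == (int32_t)((uint32_t)x >> 31));
    for (int32_t y : Vals) {
      assert(-(x - y) == y - x);                  // `sub` negates by swapping.
      assert(0 - (x + y) == (-x) - y);            // 0-(a+b) --> (-a)-b.
      if ((x & y) == 0)
        assert((x | y) == x + y);                 // `or` acts as `add` here.
    }
  }
  return 0;
}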
+LLVM_NODISCARD Value *Negator::negate(Value *V, unsigned Depth) {
+ NegatorMaxDepthVisited.updateMax(Depth);
+ ++NegatorNumValuesVisited;
+
+#if LLVM_ENABLE_STATS
+ ++NumValuesVisitedInThisNegator;
+#endif
+
+#ifndef NDEBUG
+ // We can't ever have a Value with such an address.
+ Value *Placeholder = reinterpret_cast<Value *>(static_cast<uintptr_t>(-1));
+#endif
+
+ // Did we already try to negate this value?
+ auto NegationsCacheIterator = NegationsCache.find(V);
+ if (NegationsCacheIterator != NegationsCache.end()) {
+ ++NegatorNumNegationsFoundInCache;
+ Value *NegatedV = NegationsCacheIterator->second;
+ assert(NegatedV != Placeholder && "Encountered a cycle during negation.");
+ return NegatedV;
+ }
+
+#ifndef NDEBUG
+ // We did not find a cached result for negation of V. While there,
+ // let's temporarily cache a placeholder value, with the idea that if later
+ // during negation we fetch it from cache, we'll know we're in a cycle.
+ NegationsCache[V] = Placeholder;
+#endif
+
+ // No luck. Try negating it for real.
+ Value *NegatedV = visitImpl(V, Depth);
+ // And cache the (real) result for the future.
+ NegationsCache[V] = NegatedV;
+
+ return NegatedV;
+}
+
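The caching in negate() above (memoize both successful negations and failures, and park a sentinel in the cache while a value is being processed so that cycles are detected) can be sketched standalone; Expr is a stand-in for llvm::Value, and the sentinel check is debug-only in the LLVM code:

#include <cassert>
#include <cstdint>
#include <unordered_map>

struct Expr { int id; };                           // stand-in for llvm::Value
using Cache = std::unordered_map<const Expr *, const Expr *>;

// Placeholder for the real traversal; here it always fails (nullptr), which is
// itself worth caching so the same subtree is never re-walked.
static const Expr *negateImpl(const Expr *, unsigned) { return nullptr; }

static const Expr *negateCached(Cache &C, const Expr *E, unsigned Depth) {
  static const Expr *const Sentinel =
      reinterpret_cast<const Expr *>(static_cast<std::uintptr_t>(-1));
  auto It = C.find(E);
  if (It != C.end()) {
    assert(It->second != Sentinel && "cycle while negating");
    return It->second;                             // cached success or failure
  }
  C[E] = Sentinel;                                 // mark "in progress"
  const Expr *Neg = negateImpl(E, Depth);
  C[E] = Neg;                                      // cache the real result
  return Neg;
}

int main() {
  Expr A{0};
  Cache C;
  negateCached(C, &A, 0);
  assert(C.count(&A) == 1 && C[&A] == nullptr);    // the failure was memoized
  return 0;
}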
+LLVM_NODISCARD Optional<Negator::Result> Negator::run(Value *Root) {
+ Value *Negated = negate(Root, /*Depth=*/0);
+ if (!Negated) {
+ // We must clean up newly-inserted instructions, to avoid any potential
+ // endless combine looping.
+ llvm::for_each(llvm::reverse(NewInstructions),
+ [&](Instruction *I) { I->eraseFromParent(); });
+ return llvm::None;
+ }
+ return std::make_pair(ArrayRef<Instruction *>(NewInstructions), Negated);
+}
+
+LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root,
InstCombinerImpl &IC) {
- ++NegatorTotalNegationsAttempted;
- LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root
- << "\n");
-
- if (!NegatorEnabled || !DebugCounter::shouldExecute(NegatorCounter))
- return nullptr;
-
- Negator N(Root->getContext(), IC.getDataLayout(), IC.getAssumptionCache(),
- IC.getDominatorTree(), LHSIsZero);
- Optional<Result> Res = N.run(Root);
- if (!Res) { // Negation failed.
- LLVM_DEBUG(dbgs() << "Negator: failed to sink negation into " << *Root
- << "\n");
- return nullptr;
- }
-
- LLVM_DEBUG(dbgs() << "Negator: successfully sunk negation into " << *Root
- << "\n NEW: " << *Res->second << "\n");
- ++NegatorNumTreesNegated;
-
- // We must temporarily unset the 'current' insertion point and DebugLoc of the
- // InstCombine's IRBuilder so that it won't interfere with the ones we have
- // already specified when producing negated instructions.
- InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
- IC.Builder.ClearInsertionPoint();
- IC.Builder.SetCurrentDebugLocation(DebugLoc());
-
- // And finally, we must add newly-created instructions into the InstCombine's
- // worklist (in a proper order!) so it can attempt to combine them.
- LLVM_DEBUG(dbgs() << "Negator: Propagating " << Res->first.size()
- << " instrs to InstCombine\n");
- NegatorMaxInstructionsCreated.updateMax(Res->first.size());
- NegatorNumInstructionsNegatedSuccess += Res->first.size();
-
- // They are in def-use order, so nothing fancy, just insert them in order.
- llvm::for_each(Res->first,
- [&](Instruction *I) { IC.Builder.Insert(I, I->getName()); });
-
- // And return the new root.
- return Res->second;
-}
+ ++NegatorTotalNegationsAttempted;
+ LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root
+ << "\n");
+
+ if (!NegatorEnabled || !DebugCounter::shouldExecute(NegatorCounter))
+ return nullptr;
+
+ Negator N(Root->getContext(), IC.getDataLayout(), IC.getAssumptionCache(),
+ IC.getDominatorTree(), LHSIsZero);
+ Optional<Result> Res = N.run(Root);
+ if (!Res) { // Negation failed.
+ LLVM_DEBUG(dbgs() << "Negator: failed to sink negation into " << *Root
+ << "\n");
+ return nullptr;
+ }
+
+ LLVM_DEBUG(dbgs() << "Negator: successfully sunk negation into " << *Root
+ << "\n NEW: " << *Res->second << "\n");
+ ++NegatorNumTreesNegated;
+
+ // We must temporarily unset the 'current' insertion point and DebugLoc of the
+ // InstCombine's IRBuilder so that it won't interfere with the ones we have
+ // already specified when producing negated instructions.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.ClearInsertionPoint();
+ IC.Builder.SetCurrentDebugLocation(DebugLoc());
+
+ // And finally, we must add newly-created instructions into the InstCombine's
+ // worklist (in a proper order!) so it can attempt to combine them.
+ LLVM_DEBUG(dbgs() << "Negator: Propagating " << Res->first.size()
+ << " instrs to InstCombine\n");
+ NegatorMaxInstructionsCreated.updateMax(Res->first.size());
+ NegatorNumInstructionsNegatedSuccess += Res->first.size();
+
+ // They are in def-use order, so nothing fancy, just insert them in order.
+ llvm::for_each(Res->first,
+ [&](Instruction *I) { IC.Builder.Insert(I, I->getName()); });
+
+ // And return the new root.
+ return Res->second;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp
index e4ba78e459..b211b08136 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -1,304 +1,304 @@
-//===- InstCombinePHI.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitPHINode function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
+//===- InstCombinePHI.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitPHINode function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-static cl::opt<unsigned>
-MaxNumPhis("instcombine-max-num-phis", cl::init(512),
- cl::desc("Maximum number phis to handle in intptr/ptrint folding"));
-
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+static cl::opt<unsigned>
+MaxNumPhis("instcombine-max-num-phis", cl::init(512),
+ cl::desc("Maximum number phis to handle in intptr/ptrint folding"));
+
STATISTIC(NumPHIsOfInsertValues,
"Number of phi-of-insertvalue turned into insertvalue-of-phis");
STATISTIC(NumPHIsOfExtractValues,
"Number of phi-of-extractvalue turned into extractvalue-of-phi");
STATISTIC(NumPHICSEs, "Number of PHI's that got CSE'd");
-/// The PHI arguments will be folded into a single operation with a PHI node
-/// as input. The debug location of the single operation will be the merged
-/// locations of the original PHI node arguments.
+/// The PHI arguments will be folded into a single operation with a PHI node
+/// as input. The debug location of the single operation will be the merged
+/// locations of the original PHI node arguments.
void InstCombinerImpl::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) {
- auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
- Inst->setDebugLoc(FirstInst->getDebugLoc());
- // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc
- // will be inefficient.
- assert(!isa<CallInst>(Inst));
-
- for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
- auto *I = cast<Instruction>(PN.getIncomingValue(i));
- Inst->applyMergedLocation(Inst->getDebugLoc(), I->getDebugLoc());
- }
-}
-
-// Replace Integer typed PHI PN if the PHI's value is used as a pointer value.
-// If there is an existing pointer typed PHI that produces the same value as PN,
-// replace PN and the IntToPtr operation with it. Otherwise, synthesize a new
-// PHI node:
-//
-// Case-1:
-// bb1:
-// int_init = PtrToInt(ptr_init)
-// br label %bb2
-// bb2:
-// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
-// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
-// ptr_val2 = IntToPtr(int_val)
-// ...
-// use(ptr_val2)
-// ptr_val_inc = ...
-// inc_val_inc = PtrToInt(ptr_val_inc)
-//
-// ==>
-// bb1:
-// br label %bb2
-// bb2:
-// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
-// ...
-// use(ptr_val)
-// ptr_val_inc = ...
-//
-// Case-2:
-// bb1:
-// int_ptr = BitCast(ptr_ptr)
-// int_init = Load(int_ptr)
-// br label %bb2
-// bb2:
-// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
-// ptr_val2 = IntToPtr(int_val)
-// ...
-// use(ptr_val2)
-// ptr_val_inc = ...
-// inc_val_inc = PtrToInt(ptr_val_inc)
-// ==>
-// bb1:
-// ptr_init = Load(ptr_ptr)
-// br label %bb2
-// bb2:
-// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
-// ...
-// use(ptr_val)
-// ptr_val_inc = ...
-// ...
-//
+ auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ Inst->setDebugLoc(FirstInst->getDebugLoc());
+ // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc
+ // will be inefficient.
+ assert(!isa<CallInst>(Inst));
+
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ auto *I = cast<Instruction>(PN.getIncomingValue(i));
+ Inst->applyMergedLocation(Inst->getDebugLoc(), I->getDebugLoc());
+ }
+}
+
+// Replace Integer typed PHI PN if the PHI's value is used as a pointer value.
+// If there is an existing pointer typed PHI that produces the same value as PN,
+// replace PN and the IntToPtr operation with it. Otherwise, synthesize a new
+// PHI node:
+//
+// Case-1:
+// bb1:
+// int_init = PtrToInt(ptr_init)
+// br label %bb2
+// bb2:
+// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
+// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
+// ptr_val2 = IntToPtr(int_val)
+// ...
+// use(ptr_val2)
+// ptr_val_inc = ...
+// inc_val_inc = PtrToInt(ptr_val_inc)
+//
+// ==>
+// bb1:
+// br label %bb2
+// bb2:
+// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
+// ...
+// use(ptr_val)
+// ptr_val_inc = ...
+//
+// Case-2:
+// bb1:
+// int_ptr = BitCast(ptr_ptr)
+// int_init = Load(int_ptr)
+// br label %bb2
+// bb2:
+// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
+// ptr_val2 = IntToPtr(int_val)
+// ...
+// use(ptr_val2)
+// ptr_val_inc = ...
+// inc_val_inc = PtrToInt(ptr_val_inc)
+// ==>
+// bb1:
+// ptr_init = Load(ptr_ptr)
+// br label %bb2
+// bb2:
+// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
+// ...
+// use(ptr_val)
+// ptr_val_inc = ...
+// ...
+//
Instruction *InstCombinerImpl::foldIntegerTypedPHI(PHINode &PN) {
- if (!PN.getType()->isIntegerTy())
- return nullptr;
- if (!PN.hasOneUse())
- return nullptr;
-
- auto *IntToPtr = dyn_cast<IntToPtrInst>(PN.user_back());
- if (!IntToPtr)
- return nullptr;
-
- // Check if the pointer is actually used as pointer:
- auto HasPointerUse = [](Instruction *IIP) {
- for (User *U : IIP->users()) {
- Value *Ptr = nullptr;
- if (LoadInst *LoadI = dyn_cast<LoadInst>(U)) {
- Ptr = LoadI->getPointerOperand();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- Ptr = SI->getPointerOperand();
- } else if (GetElementPtrInst *GI = dyn_cast<GetElementPtrInst>(U)) {
- Ptr = GI->getPointerOperand();
- }
-
- if (Ptr && Ptr == IIP)
- return true;
- }
- return false;
- };
-
- if (!HasPointerUse(IntToPtr))
- return nullptr;
-
- if (DL.getPointerSizeInBits(IntToPtr->getAddressSpace()) !=
- DL.getTypeSizeInBits(IntToPtr->getOperand(0)->getType()))
- return nullptr;
-
- SmallVector<Value *, 4> AvailablePtrVals;
- for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
- Value *Arg = PN.getIncomingValue(i);
-
- // First look backward:
- if (auto *PI = dyn_cast<PtrToIntInst>(Arg)) {
- AvailablePtrVals.emplace_back(PI->getOperand(0));
- continue;
- }
-
- // Next look forward:
- Value *ArgIntToPtr = nullptr;
- for (User *U : Arg->users()) {
- if (isa<IntToPtrInst>(U) && U->getType() == IntToPtr->getType() &&
- (DT.dominates(cast<Instruction>(U), PN.getIncomingBlock(i)) ||
- cast<Instruction>(U)->getParent() == PN.getIncomingBlock(i))) {
- ArgIntToPtr = U;
- break;
- }
- }
-
- if (ArgIntToPtr) {
- AvailablePtrVals.emplace_back(ArgIntToPtr);
- continue;
- }
-
- // If Arg is defined by a PHI, allow it. This will also create
- // more opportunities iteratively.
- if (isa<PHINode>(Arg)) {
- AvailablePtrVals.emplace_back(Arg);
- continue;
- }
-
- // For a single use integer load:
- auto *LoadI = dyn_cast<LoadInst>(Arg);
- if (!LoadI)
- return nullptr;
-
- if (!LoadI->hasOneUse())
- return nullptr;
-
- // Push the integer typed Load instruction into the available
- // value set, and fix it up later when the pointer typed PHI
- // is synthesized.
- AvailablePtrVals.emplace_back(LoadI);
- }
-
- // Now search for a matching PHI
- auto *BB = PN.getParent();
- assert(AvailablePtrVals.size() == PN.getNumIncomingValues() &&
- "Not enough available ptr typed incoming values");
- PHINode *MatchingPtrPHI = nullptr;
- unsigned NumPhis = 0;
- for (auto II = BB->begin(); II != BB->end(); II++, NumPhis++) {
- // FIXME: consider handling this in AggressiveInstCombine
- PHINode *PtrPHI = dyn_cast<PHINode>(II);
- if (!PtrPHI)
- break;
- if (NumPhis > MaxNumPhis)
- return nullptr;
- if (PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType())
- continue;
- MatchingPtrPHI = PtrPHI;
- for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) {
- if (AvailablePtrVals[i] !=
- PtrPHI->getIncomingValueForBlock(PN.getIncomingBlock(i))) {
- MatchingPtrPHI = nullptr;
- break;
- }
- }
-
- if (MatchingPtrPHI)
- break;
- }
-
- if (MatchingPtrPHI) {
- assert(MatchingPtrPHI->getType() == IntToPtr->getType() &&
- "Phi's Type does not match with IntToPtr");
- // The PtrToCast + IntToPtr will be simplified later
- return CastInst::CreateBitOrPointerCast(MatchingPtrPHI,
- IntToPtr->getOperand(0)->getType());
- }
-
- // If it requires a conversion for every PHI operand, do not do it.
- if (all_of(AvailablePtrVals, [&](Value *V) {
- return (V->getType() != IntToPtr->getType()) || isa<IntToPtrInst>(V);
- }))
- return nullptr;
-
- // If any of the operands that require casting is a terminator
- // instruction, do not do it. Similarly, do not do the transform if the value
- // is a PHI in a block with no insertion point, for example, a catchswitch
- // block, since we will not be able to insert a cast after the PHI.
- if (any_of(AvailablePtrVals, [&](Value *V) {
- if (V->getType() == IntToPtr->getType())
- return false;
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst)
- return false;
- if (Inst->isTerminator())
- return true;
- auto *BB = Inst->getParent();
- if (isa<PHINode>(Inst) && BB->getFirstInsertionPt() == BB->end())
- return true;
- return false;
- }))
- return nullptr;
-
- PHINode *NewPtrPHI = PHINode::Create(
- IntToPtr->getType(), PN.getNumIncomingValues(), PN.getName() + ".ptr");
-
- InsertNewInstBefore(NewPtrPHI, PN);
- SmallDenseMap<Value *, Instruction *> Casts;
- for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
- auto *IncomingBB = PN.getIncomingBlock(i);
- auto *IncomingVal = AvailablePtrVals[i];
-
- if (IncomingVal->getType() == IntToPtr->getType()) {
- NewPtrPHI->addIncoming(IncomingVal, IncomingBB);
- continue;
- }
-
-#ifndef NDEBUG
- LoadInst *LoadI = dyn_cast<LoadInst>(IncomingVal);
- assert((isa<PHINode>(IncomingVal) ||
- IncomingVal->getType()->isPointerTy() ||
- (LoadI && LoadI->hasOneUse())) &&
- "Can not replace LoadInst with multiple uses");
-#endif
- // Need to insert a BitCast.
- // For an integer Load instruction with a single use, the load + IntToPtr
- // cast will be simplified into a pointer load:
- // %v = load i64, i64* %a.ip, align 8
- // %v.cast = inttoptr i64 %v to float **
- // ==>
- // %v.ptrp = bitcast i64 * %a.ip to float **
- // %v.cast = load float *, float ** %v.ptrp, align 8
- Instruction *&CI = Casts[IncomingVal];
- if (!CI) {
- CI = CastInst::CreateBitOrPointerCast(IncomingVal, IntToPtr->getType(),
- IncomingVal->getName() + ".ptr");
- if (auto *IncomingI = dyn_cast<Instruction>(IncomingVal)) {
- BasicBlock::iterator InsertPos(IncomingI);
- InsertPos++;
- BasicBlock *BB = IncomingI->getParent();
- if (isa<PHINode>(IncomingI))
- InsertPos = BB->getFirstInsertionPt();
- assert(InsertPos != BB->end() && "should have checked above");
- InsertNewInstBefore(CI, *InsertPos);
- } else {
- auto *InsertBB = &IncomingBB->getParent()->getEntryBlock();
- InsertNewInstBefore(CI, *InsertBB->getFirstInsertionPt());
- }
- }
- NewPtrPHI->addIncoming(CI, IncomingBB);
- }
-
- // The PtrToCast + IntToPtr will be simplified later
- return CastInst::CreateBitOrPointerCast(NewPtrPHI,
- IntToPtr->getOperand(0)->getType());
-}
-
+ if (!PN.getType()->isIntegerTy())
+ return nullptr;
+ if (!PN.hasOneUse())
+ return nullptr;
+
+ auto *IntToPtr = dyn_cast<IntToPtrInst>(PN.user_back());
+ if (!IntToPtr)
+ return nullptr;
+
+ // Check if the pointer is actually used as pointer:
+ auto HasPointerUse = [](Instruction *IIP) {
+ for (User *U : IIP->users()) {
+ Value *Ptr = nullptr;
+ if (LoadInst *LoadI = dyn_cast<LoadInst>(U)) {
+ Ptr = LoadI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ Ptr = SI->getPointerOperand();
+ } else if (GetElementPtrInst *GI = dyn_cast<GetElementPtrInst>(U)) {
+ Ptr = GI->getPointerOperand();
+ }
+
+ if (Ptr && Ptr == IIP)
+ return true;
+ }
+ return false;
+ };
+
+ if (!HasPointerUse(IntToPtr))
+ return nullptr;
+
+ if (DL.getPointerSizeInBits(IntToPtr->getAddressSpace()) !=
+ DL.getTypeSizeInBits(IntToPtr->getOperand(0)->getType()))
+ return nullptr;
+
+ SmallVector<Value *, 4> AvailablePtrVals;
+ for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
+ Value *Arg = PN.getIncomingValue(i);
+
+ // First look backward:
+ if (auto *PI = dyn_cast<PtrToIntInst>(Arg)) {
+ AvailablePtrVals.emplace_back(PI->getOperand(0));
+ continue;
+ }
+
+ // Next look forward:
+ Value *ArgIntToPtr = nullptr;
+ for (User *U : Arg->users()) {
+ if (isa<IntToPtrInst>(U) && U->getType() == IntToPtr->getType() &&
+ (DT.dominates(cast<Instruction>(U), PN.getIncomingBlock(i)) ||
+ cast<Instruction>(U)->getParent() == PN.getIncomingBlock(i))) {
+ ArgIntToPtr = U;
+ break;
+ }
+ }
+
+ if (ArgIntToPtr) {
+ AvailablePtrVals.emplace_back(ArgIntToPtr);
+ continue;
+ }
+
+ // If Arg is defined by a PHI, allow it. This will also create
+ // more opportunities iteratively.
+ if (isa<PHINode>(Arg)) {
+ AvailablePtrVals.emplace_back(Arg);
+ continue;
+ }
+
+ // For a single use integer load:
+ auto *LoadI = dyn_cast<LoadInst>(Arg);
+ if (!LoadI)
+ return nullptr;
+
+ if (!LoadI->hasOneUse())
+ return nullptr;
+
+ // Push the integer typed Load instruction into the available
+ // value set, and fix it up later when the pointer typed PHI
+ // is synthesized.
+ AvailablePtrVals.emplace_back(LoadI);
+ }
+
+ // Now search for a matching PHI
+ auto *BB = PN.getParent();
+ assert(AvailablePtrVals.size() == PN.getNumIncomingValues() &&
+ "Not enough available ptr typed incoming values");
+ PHINode *MatchingPtrPHI = nullptr;
+ unsigned NumPhis = 0;
+ for (auto II = BB->begin(); II != BB->end(); II++, NumPhis++) {
+ // FIXME: consider handling this in AggressiveInstCombine
+ PHINode *PtrPHI = dyn_cast<PHINode>(II);
+ if (!PtrPHI)
+ break;
+ if (NumPhis > MaxNumPhis)
+ return nullptr;
+ if (PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType())
+ continue;
+ MatchingPtrPHI = PtrPHI;
+ for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) {
+ if (AvailablePtrVals[i] !=
+ PtrPHI->getIncomingValueForBlock(PN.getIncomingBlock(i))) {
+ MatchingPtrPHI = nullptr;
+ break;
+ }
+ }
+
+ if (MatchingPtrPHI)
+ break;
+ }
+
+ if (MatchingPtrPHI) {
+ assert(MatchingPtrPHI->getType() == IntToPtr->getType() &&
+ "Phi's Type does not match with IntToPtr");
+ // The PtrToCast + IntToPtr will be simplified later
+ return CastInst::CreateBitOrPointerCast(MatchingPtrPHI,
+ IntToPtr->getOperand(0)->getType());
+ }
+
+ // If it requires a conversion for every PHI operand, do not do it.
+ if (all_of(AvailablePtrVals, [&](Value *V) {
+ return (V->getType() != IntToPtr->getType()) || isa<IntToPtrInst>(V);
+ }))
+ return nullptr;
+
+ // If any of the operands that require casting is a terminator
+ // instruction, do not do it. Similarly, do not do the transform if the value
+ // is a PHI in a block with no insertion point, for example, a catchswitch
+ // block, since we will not be able to insert a cast after the PHI.
+ if (any_of(AvailablePtrVals, [&](Value *V) {
+ if (V->getType() == IntToPtr->getType())
+ return false;
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return false;
+ if (Inst->isTerminator())
+ return true;
+ auto *BB = Inst->getParent();
+ if (isa<PHINode>(Inst) && BB->getFirstInsertionPt() == BB->end())
+ return true;
+ return false;
+ }))
+ return nullptr;
+
+ PHINode *NewPtrPHI = PHINode::Create(
+ IntToPtr->getType(), PN.getNumIncomingValues(), PN.getName() + ".ptr");
+
+ InsertNewInstBefore(NewPtrPHI, PN);
+ SmallDenseMap<Value *, Instruction *> Casts;
+ for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
+ auto *IncomingBB = PN.getIncomingBlock(i);
+ auto *IncomingVal = AvailablePtrVals[i];
+
+ if (IncomingVal->getType() == IntToPtr->getType()) {
+ NewPtrPHI->addIncoming(IncomingVal, IncomingBB);
+ continue;
+ }
+
+#ifndef NDEBUG
+ LoadInst *LoadI = dyn_cast<LoadInst>(IncomingVal);
+ assert((isa<PHINode>(IncomingVal) ||
+ IncomingVal->getType()->isPointerTy() ||
+ (LoadI && LoadI->hasOneUse())) &&
+ "Can not replace LoadInst with multiple uses");
+#endif
+ // Need to insert a BitCast.
+ // For an integer Load instruction with a single use, the load + IntToPtr
+ // cast will be simplified into a pointer load:
+ // %v = load i64, i64* %a.ip, align 8
+ // %v.cast = inttoptr i64 %v to float **
+ // ==>
+ // %v.ptrp = bitcast i64 * %a.ip to float **
+ // %v.cast = load float *, float ** %v.ptrp, align 8
+ Instruction *&CI = Casts[IncomingVal];
+ if (!CI) {
+ CI = CastInst::CreateBitOrPointerCast(IncomingVal, IntToPtr->getType(),
+ IncomingVal->getName() + ".ptr");
+ if (auto *IncomingI = dyn_cast<Instruction>(IncomingVal)) {
+ BasicBlock::iterator InsertPos(IncomingI);
+ InsertPos++;
+ BasicBlock *BB = IncomingI->getParent();
+ if (isa<PHINode>(IncomingI))
+ InsertPos = BB->getFirstInsertionPt();
+ assert(InsertPos != BB->end() && "should have checked above");
+ InsertNewInstBefore(CI, *InsertPos);
+ } else {
+ auto *InsertBB = &IncomingBB->getParent()->getEntryBlock();
+ InsertNewInstBefore(CI, *InsertBB->getFirstInsertionPt());
+ }
+ }
+ NewPtrPHI->addIncoming(CI, IncomingBB);
+ }
+
+ // The PtrToCast + IntToPtr will be simplified later
+ return CastInst::CreateBitOrPointerCast(NewPtrPHI,
+ IntToPtr->getOperand(0)->getType());
+}
+
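As a standalone illustration of the assumption behind this fold (not LLVM code): a pointer carried as an integer of at least pointer width and converted back compares equal to the original, which is why the integer-typed PHI can be replaced by a pointer-typed PHI once the width check against the DataLayout above passes:

#include <cassert>
#include <cstdint>

int main() {
  int X = 0;
  int *P = &X;
  // Same width requirement that foldIntegerTypedPHI checks via the DataLayout.
  static_assert(sizeof(std::uintptr_t) >= sizeof(int *), "width check");
  std::uintptr_t I = reinterpret_cast<std::uintptr_t>(P);  // ptrtoint
  int *Q = reinterpret_cast<int *>(I);                     // inttoptr
  assert(Q == P && *Q == 0);
  return 0;
}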
/// If we have something like phi [insertvalue(a,b,0), insertvalue(c,d,0)],
/// turn this into a phi[a,c] and phi[b,d] and a single insertvalue.
Instruction *
@@ -376,855 +376,855 @@ InstCombinerImpl::foldPHIArgExtractValueInstructionIntoPHI(PHINode &PN) {
return NewEVI;
}
-/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the
+/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the
/// adds all have a single user, turn this into a phi and a single binop.
Instruction *InstCombinerImpl::foldPHIArgBinOpIntoPHI(PHINode &PN) {
- Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
- assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
- unsigned Opc = FirstInst->getOpcode();
- Value *LHSVal = FirstInst->getOperand(0);
- Value *RHSVal = FirstInst->getOperand(1);
-
- Type *LHSType = LHSVal->getType();
- Type *RHSType = RHSVal->getType();
-
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
+ unsigned Opc = FirstInst->getOpcode();
+ Value *LHSVal = FirstInst->getOperand(0);
+ Value *RHSVal = FirstInst->getOperand(1);
+
+ Type *LHSType = LHSVal->getType();
+ Type *RHSType = RHSVal->getType();
+
// Scan to see if all operands are the same opcode, and all have one user.
- for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
- Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
if (!I || I->getOpcode() != Opc || !I->hasOneUser() ||
- // Verify type of the LHS matches so we don't fold cmp's of different
- // types.
- I->getOperand(0)->getType() != LHSType ||
- I->getOperand(1)->getType() != RHSType)
- return nullptr;
-
- // If they are CmpInst instructions, check their predicates
- if (CmpInst *CI = dyn_cast<CmpInst>(I))
- if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
- return nullptr;
-
- // Keep track of which operand needs a phi node.
- if (I->getOperand(0) != LHSVal) LHSVal = nullptr;
- if (I->getOperand(1) != RHSVal) RHSVal = nullptr;
- }
-
- // If both LHS and RHS would need a PHI, don't do this transformation,
- // because it would increase the number of PHIs entering the block,
- // which leads to higher register pressure. This is especially
- // bad when the PHIs are in the header of a loop.
- if (!LHSVal && !RHSVal)
- return nullptr;
-
- // Otherwise, this is safe to transform!
-
- Value *InLHS = FirstInst->getOperand(0);
- Value *InRHS = FirstInst->getOperand(1);
- PHINode *NewLHS = nullptr, *NewRHS = nullptr;
- if (!LHSVal) {
- NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(),
- FirstInst->getOperand(0)->getName() + ".pn");
- NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
- InsertNewInstBefore(NewLHS, PN);
- LHSVal = NewLHS;
- }
-
- if (!RHSVal) {
- NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(),
- FirstInst->getOperand(1)->getName() + ".pn");
- NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
- InsertNewInstBefore(NewRHS, PN);
- RHSVal = NewRHS;
- }
-
- // Add all operands to the new PHIs.
- if (NewLHS || NewRHS) {
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
- if (NewLHS) {
- Value *NewInLHS = InInst->getOperand(0);
- NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
- }
- if (NewRHS) {
- Value *NewInRHS = InInst->getOperand(1);
- NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
- }
- }
- }
-
- if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
- CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
- LHSVal, RHSVal);
- PHIArgMergedDebugLoc(NewCI, PN);
- return NewCI;
- }
-
- BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
- BinaryOperator *NewBinOp =
- BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
-
- NewBinOp->copyIRFlags(PN.getIncomingValue(0));
-
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
- NewBinOp->andIRFlags(PN.getIncomingValue(i));
-
- PHIArgMergedDebugLoc(NewBinOp, PN);
- return NewBinOp;
-}
-
+ // Verify type of the LHS matches so we don't fold cmp's of different
+ // types.
+ I->getOperand(0)->getType() != LHSType ||
+ I->getOperand(1)->getType() != RHSType)
+ return nullptr;
+
+ // If they are CmpInst instructions, check their predicates
+ if (CmpInst *CI = dyn_cast<CmpInst>(I))
+ if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
+ return nullptr;
+
+ // Keep track of which operand needs a phi node.
+ if (I->getOperand(0) != LHSVal) LHSVal = nullptr;
+ if (I->getOperand(1) != RHSVal) RHSVal = nullptr;
+ }
+
+ // If both LHS and RHS would need a PHI, don't do this transformation,
+ // because it would increase the number of PHIs entering the block,
+ // which leads to higher register pressure. This is especially
+ // bad when the PHIs are in the header of a loop.
+ if (!LHSVal && !RHSVal)
+ return nullptr;
+
+ // Otherwise, this is safe to transform!
+
+ Value *InLHS = FirstInst->getOperand(0);
+ Value *InRHS = FirstInst->getOperand(1);
+ PHINode *NewLHS = nullptr, *NewRHS = nullptr;
+ if (!LHSVal) {
+ NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(),
+ FirstInst->getOperand(0)->getName() + ".pn");
+ NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewLHS, PN);
+ LHSVal = NewLHS;
+ }
+
+ if (!RHSVal) {
+ NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(),
+ FirstInst->getOperand(1)->getName() + ".pn");
+ NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewRHS, PN);
+ RHSVal = NewRHS;
+ }
+
+ // Add all operands to the new PHIs.
+ if (NewLHS || NewRHS) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
+ if (NewLHS) {
+ Value *NewInLHS = InInst->getOperand(0);
+ NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
+ }
+ if (NewRHS) {
+ Value *NewInRHS = InInst->getOperand(1);
+ NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+ }
+ }
+ }
+
+ if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
+ CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+ LHSVal, RHSVal);
+ PHIArgMergedDebugLoc(NewCI, PN);
+ return NewCI;
+ }
+
+ BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
+ BinaryOperator *NewBinOp =
+ BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
+
+ NewBinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ NewBinOp->andIRFlags(PN.getIncomingValue(i));
+
+ PHIArgMergedDebugLoc(NewBinOp, PN);
+ return NewBinOp;
+}
+
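The fold is sound because a binary operation with a shared operand distributes over a two-way choice, which is exactly the phi [add(a,b), add(a,c)] --> add(a, phi[b,c]) rewrite described above. A standalone sketch on plain values (not LLVM code):

#include <cassert>

int main() {
  int a = 7, b = 3, c = -5;
  for (bool p : {false, true}) {
    // phi of two adds with a shared operand == one add of a phi.
    assert((p ? a + b : a + c) == a + (p ? b : c));
    // The same reasoning covers the cmp case handled above.
    assert((p ? (a < b) : (a < c)) == (a < (p ? b : c)));
  }
  return 0;
}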
Instruction *InstCombinerImpl::foldPHIArgGEPIntoPHI(PHINode &PN) {
- GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
-
- SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
- FirstInst->op_end());
- // This is true if all GEP bases are allocas and if all indices into them are
- // constants.
- bool AllBasePointersAreAllocas = true;
-
- // We don't want to replace this phi if the replacement would require
- // more than one phi, which leads to higher register pressure. This is
- // especially bad when the PHIs are in the header of a loop.
- bool NeededPhi = false;
-
- bool AllInBounds = true;
-
+ GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
+
+ SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
+ FirstInst->op_end());
+ // This is true if all GEP bases are allocas and if all indices into them are
+ // constants.
+ bool AllBasePointersAreAllocas = true;
+
+ // We don't want to replace this phi if the replacement would require
+ // more than one phi, which leads to higher register pressure. This is
+ // especially bad when the PHIs are in the header of a loop.
+ bool NeededPhi = false;
+
+ bool AllInBounds = true;
+
// Scan to see if all operands are the same opcode, and all have one user.
- for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
GetElementPtrInst *GEP =
dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
if (!GEP || !GEP->hasOneUser() || GEP->getType() != FirstInst->getType() ||
GEP->getNumOperands() != FirstInst->getNumOperands())
- return nullptr;
-
- AllInBounds &= GEP->isInBounds();
-
- // Keep track of whether or not all GEPs are of alloca pointers.
- if (AllBasePointersAreAllocas &&
- (!isa<AllocaInst>(GEP->getOperand(0)) ||
- !GEP->hasAllConstantIndices()))
- AllBasePointersAreAllocas = false;
-
- // Compare the operand lists.
- for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
- if (FirstInst->getOperand(op) == GEP->getOperand(op))
- continue;
-
- // Don't merge two GEPs when two operands differ (introducing phi nodes)
- // if one of the PHIs has a constant for the index. The index may be
- // substantially cheaper to compute for the constants, so making it a
- // variable index could pessimize the path. This also handles the case
- // for struct indices, which must always be constant.
- if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
- isa<ConstantInt>(GEP->getOperand(op)))
- return nullptr;
-
- if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
- return nullptr;
-
- // If we already needed a PHI for an earlier operand, and another operand
- // also requires a PHI, we'd be introducing more PHIs than we're
- // eliminating, which increases register pressure on entry to the PHI's
- // block.
- if (NeededPhi)
- return nullptr;
-
- FixedOperands[op] = nullptr; // Needs a PHI.
- NeededPhi = true;
- }
- }
-
- // If all of the base pointers of the PHI'd GEPs are from allocas, don't
- // bother doing this transformation. At best, this will just save a bit of
- // offset calculation, but all the predecessors will have to materialize the
- // stack address into a register anyway. We'd actually rather *clone* the
- // load up into the predecessors so that we have a load of a gep of an alloca,
- // which can usually all be folded into the load.
- if (AllBasePointersAreAllocas)
- return nullptr;
-
- // Otherwise, this is safe to transform. Insert PHI nodes for each operand
- // that is variable.
- SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
-
- bool HasAnyPHIs = false;
- for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
- if (FixedOperands[i]) continue; // operand doesn't need a phi.
- Value *FirstOp = FirstInst->getOperand(i);
- PHINode *NewPN = PHINode::Create(FirstOp->getType(), e,
- FirstOp->getName()+".pn");
- InsertNewInstBefore(NewPN, PN);
-
- NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
- OperandPhis[i] = NewPN;
- FixedOperands[i] = NewPN;
- HasAnyPHIs = true;
- }
-
-
- // Add all operands to the new PHIs.
- if (HasAnyPHIs) {
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
- BasicBlock *InBB = PN.getIncomingBlock(i);
-
- for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
- if (PHINode *OpPhi = OperandPhis[op])
- OpPhi->addIncoming(InGEP->getOperand(op), InBB);
- }
- }
-
- Value *Base = FixedOperands[0];
- GetElementPtrInst *NewGEP =
- GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base,
- makeArrayRef(FixedOperands).slice(1));
- if (AllInBounds) NewGEP->setIsInBounds();
- PHIArgMergedDebugLoc(NewGEP, PN);
- return NewGEP;
-}
-
-/// Return true if we know that it is safe to sink the load out of the block
-/// that defines it. This means that it must be obvious the value of the load is
-/// not changed from the point of the load to the end of the block it is in.
-///
-/// Finally, it is safe, but not profitable, to sink a load targeting a
-/// non-address-taken alloca. Doing so will cause us to not promote the alloca
-/// to a register.
-static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
- BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
-
- for (++BBI; BBI != E; ++BBI)
+ return nullptr;
+
+ AllInBounds &= GEP->isInBounds();
+
+ // Keep track of whether or not all GEPs are of alloca pointers.
+ if (AllBasePointersAreAllocas &&
+ (!isa<AllocaInst>(GEP->getOperand(0)) ||
+ !GEP->hasAllConstantIndices()))
+ AllBasePointersAreAllocas = false;
+
+ // Compare the operand lists.
+ for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
+ if (FirstInst->getOperand(op) == GEP->getOperand(op))
+ continue;
+
+ // Don't merge two GEPs when two operands differ (introducing phi nodes)
+ // if one of the PHIs has a constant for the index. The index may be
+ // substantially cheaper to compute for the constants, so making it a
+ // variable index could pessimize the path. This also handles the case
+ // for struct indices, which must always be constant.
+ if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
+ isa<ConstantInt>(GEP->getOperand(op)))
+ return nullptr;
+
+ if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
+ return nullptr;
+
+ // If we already needed a PHI for an earlier operand, and another operand
+ // also requires a PHI, we'd be introducing more PHIs than we're
+ // eliminating, which increases register pressure on entry to the PHI's
+ // block.
+ if (NeededPhi)
+ return nullptr;
+
+ FixedOperands[op] = nullptr; // Needs a PHI.
+ NeededPhi = true;
+ }
+ }
+
+ // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+ // bother doing this transformation. At best, this will just save a bit of
+ // offset calculation, but all the predecessors will have to materialize the
+ // stack address into a register anyway. We'd actually rather *clone* the
+ // load up into the predecessors so that we have a load of a gep of an alloca,
+ // which can usually all be folded into the load.
+ if (AllBasePointersAreAllocas)
+ return nullptr;
+
+ // Otherwise, this is safe to transform. Insert PHI nodes for each operand
+ // that is variable.
+ SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+
+ bool HasAnyPHIs = false;
+ for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+ if (FixedOperands[i]) continue; // operand doesn't need a phi.
+ Value *FirstOp = FirstInst->getOperand(i);
+ PHINode *NewPN = PHINode::Create(FirstOp->getType(), e,
+ FirstOp->getName()+".pn");
+ InsertNewInstBefore(NewPN, PN);
+
+ NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+ OperandPhis[i] = NewPN;
+ FixedOperands[i] = NewPN;
+ HasAnyPHIs = true;
+ }
+
+
+ // Add all operands to the new PHIs.
+ if (HasAnyPHIs) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ BasicBlock *InBB = PN.getIncomingBlock(i);
+
+ for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+ if (PHINode *OpPhi = OperandPhis[op])
+ OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+ }
+ }
+
+ Value *Base = FixedOperands[0];
+ GetElementPtrInst *NewGEP =
+ GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base,
+ makeArrayRef(FixedOperands).slice(1));
+ if (AllInBounds) NewGEP->setIsInBounds();
+ PHIArgMergedDebugLoc(NewGEP, PN);
+ return NewGEP;
+}
+
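The same reasoning applies to GEPs off a common base: choosing between two element addresses is the same as computing one address from the chosen index. A standalone C++ analogue (ordinary array indexing standing in for GEP; not LLVM code):

#include <cassert>

int main() {
  int arr[8] = {0};
  int i = 2, j = 5;
  for (bool p : {false, true})
    // phi of two GEPs off the same base == one GEP whose index is a phi.
    assert((p ? &arr[i] : &arr[j]) == &arr[p ? i : j]);
  return 0;
}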
+/// Return true if we know that it is safe to sink the load out of the block
+/// that defines it. This means that it must be obvious the value of the load is
+/// not changed from the point of the load to the end of the block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targeting a
+/// non-address-taken alloca. Doing so will cause us to not promote the alloca
+/// to a register.
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
+ BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
+
+ for (++BBI; BBI != E; ++BBI)
if (BBI->mayWriteToMemory()) {
// Calls that only access inaccessible memory do not block sinking the
// load.
if (auto *CB = dyn_cast<CallBase>(BBI))
if (CB->onlyAccessesInaccessibleMemory())
continue;
- return false;
+ return false;
}
-
- // Check for non-address taken alloca. If not address-taken already, it isn't
- // profitable to do this xform.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
- bool isAddressTaken = false;
- for (User *U : AI->users()) {
- if (isa<LoadInst>(U)) continue;
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- // If storing TO the alloca, then the address isn't taken.
- if (SI->getOperand(1) == AI) continue;
- }
- isAddressTaken = true;
- break;
- }
-
- if (!isAddressTaken && AI->isStaticAlloca())
- return false;
- }
-
- // If this load is a load from a GEP with a constant offset from an alloca,
- // then we don't want to sink it. In its present form, it will be
- // load [constant stack offset]. Sinking it will cause us to have to
- // materialize the stack addresses in each predecessor in a register only to
- // do a shared load from register in the successor.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
- if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
- if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
- return false;
-
- return true;
-}
-
+
+ // Check for non-address taken alloca. If not address-taken already, it isn't
+ // profitable to do this xform.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
+ bool isAddressTaken = false;
+ for (User *U : AI->users()) {
+ if (isa<LoadInst>(U)) continue;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // If storing TO the alloca, then the address isn't taken.
+ if (SI->getOperand(1) == AI) continue;
+ }
+ isAddressTaken = true;
+ break;
+ }
+
+ if (!isAddressTaken && AI->isStaticAlloca())
+ return false;
+ }
+
+ // If this load is a load from a GEP with a constant offset from an alloca,
+ // then we don't want to sink it. In its present form, it will be
+ // load [constant stack offset]. Sinking it will cause us to have to
+ // materialize the stack addresses in each predecessor in a register only to
+ // do a shared load from register in the successor.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+ if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+ return false;
+
+ return true;
+}
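A minimal sketch of the safety half of this check (hypothetical code; may_write is an invented external): any later instruction in the load's block that may write memory defeats sinking.

// may_write could store to *p between the load and the end of the block, so
// the loaded value is not obviously unchanged and the helper returns false.
extern void may_write(int *p);
int sample(int *p) {
  int v = *p;    // candidate load
  may_write(p);  // BBI->mayWriteToMemory() is true here
  return v;
}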
+
Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
- LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
-
- // FIXME: This is overconservative; this transform is allowed in some cases
- // for atomic operations.
- if (FirstLI->isAtomic())
- return nullptr;
-
- // When processing loads, we need to propagate two bits of information to the
- // sunk load: whether it is volatile, and what its alignment is. We currently
- // don't sink loads when some have their alignment specified and some don't.
- // visitLoadInst will propagate an alignment onto the load when TD is around,
- // and if TD isn't around, we can't handle the mixed case.
- bool isVolatile = FirstLI->isVolatile();
- Align LoadAlignment = FirstLI->getAlign();
- unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
-
- // We can't sink the load if the loaded value could be modified between the
- // load and the PHI.
- if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
- !isSafeAndProfitableToSinkLoad(FirstLI))
- return nullptr;
-
- // If the PHI is of volatile loads and the load block has multiple
- // successors, sinking it would remove a load of the volatile value from
- // the path through the other successor.
- if (isVolatile &&
- FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
- return nullptr;
-
- // Check to see if all arguments are the same operation.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
+ LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
+
+ // FIXME: This is overconservative; this transform is allowed in some cases
+ // for atomic operations.
+ if (FirstLI->isAtomic())
+ return nullptr;
+
+ // When processing loads, we need to propagate two bits of information to the
+ // sunk load: whether it is volatile, and what its alignment is. We currently
+ // don't sink loads when some have their alignment specified and some don't.
+ // visitLoadInst will propagate an alignment onto the load when TD is around,
+ // and if TD isn't around, we can't handle the mixed case.
+ bool isVolatile = FirstLI->isVolatile();
+ Align LoadAlignment = FirstLI->getAlign();
+ unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
+
+ // We can't sink the load if the loaded value could be modified between the
+ // load and the PHI.
+ if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
+ !isSafeAndProfitableToSinkLoad(FirstLI))
+ return nullptr;
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return nullptr;
+
+ // Check to see if all arguments are the same operation.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
if (!LI || !LI->hasOneUser())
- return nullptr;
-
- // We can't sink the load if the loaded value could be modified between
- // the load and the PHI.
- if (LI->isVolatile() != isVolatile ||
- LI->getParent() != PN.getIncomingBlock(i) ||
- LI->getPointerAddressSpace() != LoadAddrSpace ||
- !isSafeAndProfitableToSinkLoad(LI))
- return nullptr;
-
- LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
-
- // If the PHI is of volatile loads and the load block has multiple
- // successors, sinking it would remove a load of the volatile value from
- // the path through the other successor.
- if (isVolatile &&
- LI->getParent()->getTerminator()->getNumSuccessors() != 1)
- return nullptr;
- }
-
- // Okay, they are all the same operation. Create a new PHI node of the
- // correct type, and PHI together all of the LHS's of the instructions.
- PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(),
- PN.getNumIncomingValues(),
- PN.getName()+".in");
-
- Value *InVal = FirstLI->getOperand(0);
- NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
- LoadInst *NewLI =
- new LoadInst(FirstLI->getType(), NewPN, "", isVolatile, LoadAlignment);
-
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_invariant_load,
- LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_access_group,
- };
-
- for (unsigned ID : KnownIDs)
- NewLI->setMetadata(ID, FirstLI->getMetadata(ID));
-
- // Add all operands to the new PHI and combine TBAA metadata.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i));
- combineMetadata(NewLI, LI, KnownIDs, true);
- Value *NewInVal = LI->getOperand(0);
- if (NewInVal != InVal)
- InVal = nullptr;
- NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
- }
-
- if (InVal) {
- // The new PHI unions all of the same values together. This is really
- // common, so we handle it intelligently here for compile-time speed.
- NewLI->setOperand(0, InVal);
- delete NewPN;
- } else {
- InsertNewInstBefore(NewPN, PN);
- }
-
- // If this was a volatile load that we are merging, make sure to loop through
- // and mark all the input loads as non-volatile. If we don't do this, we will
- // insert a new volatile load and the old ones will not be deletable.
- if (isVolatile)
- for (Value *IncValue : PN.incoming_values())
- cast<LoadInst>(IncValue)->setVolatile(false);
-
- PHIArgMergedDebugLoc(NewLI, PN);
- return NewLI;
-}
-
-/// TODO: This function could handle other cast types, but then it might
-/// require special-casing a cast from the 'i1' type. See the comment in
-/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types.
+ return nullptr;
+
+ // We can't sink the load if the loaded value could be modified between
+ // the load and the PHI.
+ if (LI->isVolatile() != isVolatile ||
+ LI->getParent() != PN.getIncomingBlock(i) ||
+ LI->getPointerAddressSpace() != LoadAddrSpace ||
+ !isSafeAndProfitableToSinkLoad(LI))
+ return nullptr;
+
+ LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return nullptr;
+ }
+
+ // Okay, they are all the same operation. Create a new PHI node of the
+ // correct type, and PHI together all of the LHS's of the instructions.
+ PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(),
+ PN.getNumIncomingValues(),
+ PN.getName()+".in");
+
+ Value *InVal = FirstLI->getOperand(0);
+ NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+ LoadInst *NewLI =
+ new LoadInst(FirstLI->getType(), NewPN, "", isVolatile, LoadAlignment);
+
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_access_group,
+ };
+
+ for (unsigned ID : KnownIDs)
+ NewLI->setMetadata(ID, FirstLI->getMetadata(ID));
+
+ // Add all operands to the new PHI and combine TBAA metadata.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i));
+ combineMetadata(NewLI, LI, KnownIDs, true);
+ Value *NewInVal = LI->getOperand(0);
+ if (NewInVal != InVal)
+ InVal = nullptr;
+ NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+ }
+
+ if (InVal) {
+ // The new PHI unions all of the same values together. This is really
+ // common, so we handle it intelligently here for compile-time speed.
+ NewLI->setOperand(0, InVal);
+ delete NewPN;
+ } else {
+ InsertNewInstBefore(NewPN, PN);
+ }
+
+ // If this was a volatile load that we are merging, make sure to loop through
+ // and mark all the input loads as non-volatile. If we don't do this, we will
+ // insert a new volatile load and the old ones will not be deletable.
+ if (isVolatile)
+ for (Value *IncValue : PN.incoming_values())
+ cast<LoadInst>(IncValue)->setVolatile(false);
+
+ PHIArgMergedDebugLoc(NewLI, PN);
+ return NewLI;
+}
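Hypothetical source (names invented) showing the shape this fold rewrites: each predecessor loads through its own pointer and the PHI merges the loaded values; after the fold the pointers are PHI'd and one load, using the minimum of the incoming alignments, follows the PHI.

int choose(bool c, int *a, int *b) {
  int v;
  if (c)
    v = *a;   // load in predecessor 1
  else
    v = *b;   // load in predecessor 2
  return v;   // PHI of the two loaded values -> may become a load of phi(a, b)
}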
+
+/// TODO: This function could handle other cast types, but then it might
+/// require special-casing a cast from the 'i1' type. See the comment in
+/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types.
Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) {
- // We cannot create a new instruction after the PHI if the terminator is an
- // EHPad because there is no valid insertion point.
- if (Instruction *TI = Phi.getParent()->getTerminator())
- if (TI->isEHPad())
- return nullptr;
-
- // Early exit for the common case of a phi with two operands. These are
- // handled elsewhere. See the comment below where we check the count of zexts
- // and constants for more details.
- unsigned NumIncomingValues = Phi.getNumIncomingValues();
- if (NumIncomingValues < 3)
- return nullptr;
-
- // Find the narrower type specified by the first zext.
- Type *NarrowType = nullptr;
- for (Value *V : Phi.incoming_values()) {
- if (auto *Zext = dyn_cast<ZExtInst>(V)) {
- NarrowType = Zext->getSrcTy();
- break;
- }
- }
- if (!NarrowType)
- return nullptr;
-
- // Walk the phi operands checking that we only have zexts or constants that
- // we can shrink for free. Store the new operands for the new phi.
- SmallVector<Value *, 4> NewIncoming;
- unsigned NumZexts = 0;
- unsigned NumConsts = 0;
- for (Value *V : Phi.incoming_values()) {
- if (auto *Zext = dyn_cast<ZExtInst>(V)) {
+ // We cannot create a new instruction after the PHI if the terminator is an
+ // EHPad because there is no valid insertion point.
+ if (Instruction *TI = Phi.getParent()->getTerminator())
+ if (TI->isEHPad())
+ return nullptr;
+
+ // Early exit for the common case of a phi with two operands. These are
+ // handled elsewhere. See the comment below where we check the count of zexts
+ // and constants for more details.
+ unsigned NumIncomingValues = Phi.getNumIncomingValues();
+ if (NumIncomingValues < 3)
+ return nullptr;
+
+ // Find the narrower type specified by the first zext.
+ Type *NarrowType = nullptr;
+ for (Value *V : Phi.incoming_values()) {
+ if (auto *Zext = dyn_cast<ZExtInst>(V)) {
+ NarrowType = Zext->getSrcTy();
+ break;
+ }
+ }
+ if (!NarrowType)
+ return nullptr;
+
+ // Walk the phi operands checking that we only have zexts or constants that
+ // we can shrink for free. Store the new operands for the new phi.
+ SmallVector<Value *, 4> NewIncoming;
+ unsigned NumZexts = 0;
+ unsigned NumConsts = 0;
+ for (Value *V : Phi.incoming_values()) {
+ if (auto *Zext = dyn_cast<ZExtInst>(V)) {
// All zexts must be identical and have one user.
if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUser())
- return nullptr;
- NewIncoming.push_back(Zext->getOperand(0));
- NumZexts++;
- } else if (auto *C = dyn_cast<Constant>(V)) {
- // Make sure that constants can fit in the new type.
- Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType);
- if (ConstantExpr::getZExt(Trunc, C->getType()) != C)
- return nullptr;
- NewIncoming.push_back(Trunc);
- NumConsts++;
- } else {
- // If it's not a cast or a constant, bail out.
- return nullptr;
- }
- }
-
- // The more common cases of a phi with no constant operands or just one
- // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi()
- // respectively. foldOpIntoPhi() wants to do the opposite transform that is
- // performed here. It tries to replicate a cast in the phi operand's basic
- // block to expose other folding opportunities. Thus, InstCombine will
- // infinite loop without this check.
- if (NumConsts == 0 || NumZexts < 2)
- return nullptr;
-
- // All incoming values are zexts or constants that are safe to truncate.
- // Create a new phi node of the narrow type, phi together all of the new
- // operands, and zext the result back to the original type.
- PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues,
- Phi.getName() + ".shrunk");
- for (unsigned i = 0; i != NumIncomingValues; ++i)
- NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i));
-
- InsertNewInstBefore(NewPhi, Phi);
- return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
-}
-
-/// If all operands to a PHI node are the same "unary" operator and they all are
-/// only used by the PHI, PHI together their inputs, and do the operation once,
-/// to the result of the PHI.
+ return nullptr;
+ NewIncoming.push_back(Zext->getOperand(0));
+ NumZexts++;
+ } else if (auto *C = dyn_cast<Constant>(V)) {
+ // Make sure that constants can fit in the new type.
+ Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType);
+ if (ConstantExpr::getZExt(Trunc, C->getType()) != C)
+ return nullptr;
+ NewIncoming.push_back(Trunc);
+ NumConsts++;
+ } else {
+ // If it's not a cast or a constant, bail out.
+ return nullptr;
+ }
+ }
+
+ // The more common cases of a phi with no constant operands or just one
+ // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi()
+ // respectively. foldOpIntoPhi() wants to do the opposite transform that is
+ // performed here. It tries to replicate a cast in the phi operand's basic
+ // block to expose other folding opportunities. Thus, InstCombine will
+ // infinite loop without this check.
+ if (NumConsts == 0 || NumZexts < 2)
+ return nullptr;
+
+ // All incoming values are zexts or constants that are safe to truncate.
+ // Create a new phi node of the narrow type, phi together all of the new
+ // operands, and zext the result back to the original type.
+ PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues,
+ Phi.getName() + ".shrunk");
+ for (unsigned i = 0; i != NumIncomingValues; ++i)
+ NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i));
+
+ InsertNewInstBefore(NewPhi, Phi);
+ return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
+}
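A sketch of the case this fold handles (hypothetical function; three incoming values, so the early exit above does not fire): the zexts share a narrow source type and the constant survives the trunc/zext round trip, so a narrow PHI plus a single zext can replace the wide PHI.

unsigned widen(int sel, unsigned char a, unsigned char b) {
  unsigned x;
  if (sel == 0)
    x = a;    // zext i8 -> i32
  else if (sel == 1)
    x = b;    // zext i8 -> i32
  else
    x = 7;    // constant representable in i8
  return x;   // phi(i32) may become zext(phi(i8))
}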
+
+/// If all operands to a PHI node are the same "unary" operator and they all are
+/// only used by the PHI, PHI together their inputs, and do the operation once,
+/// to the result of the PHI.
Instruction *InstCombinerImpl::foldPHIArgOpIntoPHI(PHINode &PN) {
- // We cannot create a new instruction after the PHI if the terminator is an
- // EHPad because there is no valid insertion point.
- if (Instruction *TI = PN.getParent()->getTerminator())
- if (TI->isEHPad())
- return nullptr;
-
- Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
-
- if (isa<GetElementPtrInst>(FirstInst))
+ // We cannot create a new instruction after the PHI if the terminator is an
+ // EHPad because there is no valid insertion point.
+ if (Instruction *TI = PN.getParent()->getTerminator())
+ if (TI->isEHPad())
+ return nullptr;
+
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+
+ if (isa<GetElementPtrInst>(FirstInst))
return foldPHIArgGEPIntoPHI(PN);
- if (isa<LoadInst>(FirstInst))
+ if (isa<LoadInst>(FirstInst))
return foldPHIArgLoadIntoPHI(PN);
if (isa<InsertValueInst>(FirstInst))
return foldPHIArgInsertValueInstructionIntoPHI(PN);
if (isa<ExtractValueInst>(FirstInst))
return foldPHIArgExtractValueInstructionIntoPHI(PN);
-
- // Scan the instruction, looking for input operations that can be folded away.
- // If all input operands to the phi are the same instruction (e.g. a cast from
- // the same type or "+42") we can pull the operation through the PHI, reducing
- // code size and simplifying code.
- Constant *ConstantOp = nullptr;
- Type *CastSrcTy = nullptr;
-
- if (isa<CastInst>(FirstInst)) {
- CastSrcTy = FirstInst->getOperand(0)->getType();
-
- // Be careful about transforming integer PHIs. We don't want to pessimize
- // the code by turning an i32 into an i1293.
- if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
- if (!shouldChangeType(PN.getType(), CastSrcTy))
- return nullptr;
- }
- } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
- // Can fold binop, compare or shift here if the RHS is a constant,
- // otherwise call FoldPHIArgBinOpIntoPHI.
- ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
- if (!ConstantOp)
+
+ // Scan the instruction, looking for input operations that can be folded away.
+ // If all input operands to the phi are the same instruction (e.g. a cast from
+ // the same type or "+42") we can pull the operation through the PHI, reducing
+ // code size and simplifying code.
+ Constant *ConstantOp = nullptr;
+ Type *CastSrcTy = nullptr;
+
+ if (isa<CastInst>(FirstInst)) {
+ CastSrcTy = FirstInst->getOperand(0)->getType();
+
+ // Be careful about transforming integer PHIs. We don't want to pessimize
+ // the code by turning an i32 into an i1293.
+ if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
+ if (!shouldChangeType(PN.getType(), CastSrcTy))
+ return nullptr;
+ }
+ } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
+ // Can fold binop, compare or shift here if the RHS is a constant,
+ // otherwise call FoldPHIArgBinOpIntoPHI.
+ ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
+ if (!ConstantOp)
return foldPHIArgBinOpIntoPHI(PN);
- } else {
- return nullptr; // Cannot fold this operation.
- }
-
- // Check to see if all arguments are the same operation.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+ } else {
+ return nullptr; // Cannot fold this operation.
+ }
+
+ // Check to see if all arguments are the same operation.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
if (!I || !I->hasOneUser() || !I->isSameOperationAs(FirstInst))
- return nullptr;
- if (CastSrcTy) {
- if (I->getOperand(0)->getType() != CastSrcTy)
- return nullptr; // Cast operation must match.
- } else if (I->getOperand(1) != ConstantOp) {
- return nullptr;
- }
- }
-
- // Okay, they are all the same operation. Create a new PHI node of the
- // correct type, and PHI together all of the LHS's of the instructions.
- PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
- PN.getNumIncomingValues(),
- PN.getName()+".in");
-
- Value *InVal = FirstInst->getOperand(0);
- NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
-
- // Add all operands to the new PHI.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
- if (NewInVal != InVal)
- InVal = nullptr;
- NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
- }
-
- Value *PhiVal;
- if (InVal) {
- // The new PHI unions all of the same values together. This is really
- // common, so we handle it intelligently here for compile-time speed.
- PhiVal = InVal;
- delete NewPN;
- } else {
- InsertNewInstBefore(NewPN, PN);
- PhiVal = NewPN;
- }
-
- // Insert and return the new operation.
- if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
- CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
- PN.getType());
- PHIArgMergedDebugLoc(NewCI, PN);
- return NewCI;
- }
-
- if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
- BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
- BinOp->copyIRFlags(PN.getIncomingValue(0));
-
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
- BinOp->andIRFlags(PN.getIncomingValue(i));
-
- PHIArgMergedDebugLoc(BinOp, PN);
- return BinOp;
- }
-
- CmpInst *CIOp = cast<CmpInst>(FirstInst);
- CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
- PhiVal, ConstantOp);
- PHIArgMergedDebugLoc(NewCI, PN);
- return NewCI;
-}
-
-/// Return true if this PHI node is only used by a PHI node cycle that is dead.
-static bool DeadPHICycle(PHINode *PN,
- SmallPtrSetImpl<PHINode*> &PotentiallyDeadPHIs) {
- if (PN->use_empty()) return true;
- if (!PN->hasOneUse()) return false;
-
- // Remember this node, and if we find the cycle, return.
- if (!PotentiallyDeadPHIs.insert(PN).second)
- return true;
-
- // Don't scan crazily complex things.
- if (PotentiallyDeadPHIs.size() == 16)
- return false;
-
- if (PHINode *PU = dyn_cast<PHINode>(PN->user_back()))
- return DeadPHICycle(PU, PotentiallyDeadPHIs);
-
- return false;
-}
-
-/// Return true if this phi node is always equal to NonPhiInVal.
-/// This happens with mutually cyclic phi nodes like:
-/// z = some value; x = phi (y, z); y = phi (x, z)
-static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
- SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) {
- // See if we already saw this PHI node.
- if (!ValueEqualPHIs.insert(PN).second)
- return true;
-
- // Don't scan crazily complex things.
- if (ValueEqualPHIs.size() == 16)
- return false;
-
- // Scan the operands to see if they are either phi nodes or are equal to
- // the value.
- for (Value *Op : PN->incoming_values()) {
- if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
- if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
- return false;
- } else if (Op != NonPhiInVal)
- return false;
- }
-
- return true;
-}
-
-/// Return an existing non-zero constant if this phi node has one, otherwise
-/// return constant 1.
-static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
- assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
- for (Value *V : PN.operands())
- if (auto *ConstVA = dyn_cast<ConstantInt>(V))
- if (!ConstVA->isZero())
- return ConstVA;
- return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
-}
-
-namespace {
-struct PHIUsageRecord {
- unsigned PHIId; // The ID # of the PHI (something deterministic to sort on)
- unsigned Shift; // The amount shifted.
- Instruction *Inst; // The trunc instruction.
-
- PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
- : PHIId(pn), Shift(Sh), Inst(User) {}
-
- bool operator<(const PHIUsageRecord &RHS) const {
- if (PHIId < RHS.PHIId) return true;
- if (PHIId > RHS.PHIId) return false;
- if (Shift < RHS.Shift) return true;
- if (Shift > RHS.Shift) return false;
- return Inst->getType()->getPrimitiveSizeInBits() <
- RHS.Inst->getType()->getPrimitiveSizeInBits();
- }
-};
-
-struct LoweredPHIRecord {
- PHINode *PN; // The PHI that was lowered.
- unsigned Shift; // The amount shifted.
- unsigned Width; // The width extracted.
-
- LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
- : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
-
- // Ctor form used by DenseMap.
- LoweredPHIRecord(PHINode *pn, unsigned Sh)
- : PN(pn), Shift(Sh), Width(0) {}
-};
+ return nullptr;
+ if (CastSrcTy) {
+ if (I->getOperand(0)->getType() != CastSrcTy)
+ return nullptr; // Cast operation must match.
+ } else if (I->getOperand(1) != ConstantOp) {
+ return nullptr;
+ }
+ }
+
+ // Okay, they are all the same operation. Create a new PHI node of the
+ // correct type, and PHI together all of the LHS's of the instructions.
+ PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
+ PN.getNumIncomingValues(),
+ PN.getName()+".in");
+
+ Value *InVal = FirstInst->getOperand(0);
+ NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+
+ // Add all operands to the new PHI.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
+ if (NewInVal != InVal)
+ InVal = nullptr;
+ NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+ }
+
+ Value *PhiVal;
+ if (InVal) {
+ // The new PHI unions all of the same values together. This is really
+ // common, so we handle it intelligently here for compile-time speed.
+ PhiVal = InVal;
+ delete NewPN;
+ } else {
+ InsertNewInstBefore(NewPN, PN);
+ PhiVal = NewPN;
+ }
+
+ // Insert and return the new operation.
+ if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
+ CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
+ PN.getType());
+ PHIArgMergedDebugLoc(NewCI, PN);
+ return NewCI;
+ }
+
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
+ BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
+ BinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ BinOp->andIRFlags(PN.getIncomingValue(i));
+
+ PHIArgMergedDebugLoc(BinOp, PN);
+ return BinOp;
+ }
+
+ CmpInst *CIOp = cast<CmpInst>(FirstInst);
+ CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+ PhiVal, ConstantOp);
+ PHIArgMergedDebugLoc(NewCI, PN);
+ return NewCI;
+}
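Hypothetical example (names invented) of the binop-with-constant case: every incoming value is the same opcode with the same constant RHS, so the operation is done once on a PHI of the left-hand sides and the IR flags of the originals are intersected.

int addBoth(bool c, int x, int y) {
  int r;
  if (c)
    r = x + 42;   // same opcode, same constant RHS
  else
    r = y + 42;
  return r;       // PHI of the two adds -> may become phi(x, y) + 42
}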
+
+/// Return true if this PHI node is only used by a PHI node cycle that is dead.
+static bool DeadPHICycle(PHINode *PN,
+ SmallPtrSetImpl<PHINode*> &PotentiallyDeadPHIs) {
+ if (PN->use_empty()) return true;
+ if (!PN->hasOneUse()) return false;
+
+ // Remember this node, and if we find the cycle, return.
+ if (!PotentiallyDeadPHIs.insert(PN).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PotentiallyDeadPHIs.size() == 16)
+ return false;
+
+ if (PHINode *PU = dyn_cast<PHINode>(PN->user_back()))
+ return DeadPHICycle(PU, PotentiallyDeadPHIs);
+
+ return false;
+}
+
+/// Return true if this phi node is always equal to NonPhiInVal.
+/// This happens with mutually cyclic phi nodes like:
+/// z = some value; x = phi (y, z); y = phi (x, z)
+static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
+ SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) {
+ // See if we already saw this PHI node.
+ if (!ValueEqualPHIs.insert(PN).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (ValueEqualPHIs.size() == 16)
+ return false;
+
+ // Scan the operands to see if they are either phi nodes or are equal to
+ // the value.
+ for (Value *Op : PN->incoming_values()) {
+ if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
+ if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
+ return false;
+ } else if (Op != NonPhiInVal)
+ return false;
+ }
+
+ return true;
+}
+
+/// Return an existing non-zero constant if this phi node has one, otherwise
+/// return constant 1.
+static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
+ assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
+ for (Value *V : PN.operands())
+ if (auto *ConstVA = dyn_cast<ConstantInt>(V))
+ if (!ConstVA->isZero())
+ return ConstVA;
+ return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
+}
+
+namespace {
+struct PHIUsageRecord {
+ unsigned PHIId; // The ID # of the PHI (something deterministic to sort on)
+ unsigned Shift; // The amount shifted.
+ Instruction *Inst; // The trunc instruction.
+
+ PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
+ : PHIId(pn), Shift(Sh), Inst(User) {}
+
+ bool operator<(const PHIUsageRecord &RHS) const {
+ if (PHIId < RHS.PHIId) return true;
+ if (PHIId > RHS.PHIId) return false;
+ if (Shift < RHS.Shift) return true;
+ if (Shift > RHS.Shift) return false;
+ return Inst->getType()->getPrimitiveSizeInBits() <
+ RHS.Inst->getType()->getPrimitiveSizeInBits();
+ }
+};
+
+struct LoweredPHIRecord {
+ PHINode *PN; // The PHI that was lowered.
+ unsigned Shift; // The amount shifted.
+ unsigned Width; // The width extracted.
+
+ LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
+ : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
+
+ // Ctor form used by DenseMap.
+ LoweredPHIRecord(PHINode *pn, unsigned Sh)
+ : PN(pn), Shift(Sh), Width(0) {}
+};
} // namespace
-
-namespace llvm {
- template<>
- struct DenseMapInfo<LoweredPHIRecord> {
- static inline LoweredPHIRecord getEmptyKey() {
- return LoweredPHIRecord(nullptr, 0);
- }
- static inline LoweredPHIRecord getTombstoneKey() {
- return LoweredPHIRecord(nullptr, 1);
- }
- static unsigned getHashValue(const LoweredPHIRecord &Val) {
- return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
- (Val.Width>>3);
- }
- static bool isEqual(const LoweredPHIRecord &LHS,
- const LoweredPHIRecord &RHS) {
- return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
- LHS.Width == RHS.Width;
- }
- };
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<LoweredPHIRecord> {
+ static inline LoweredPHIRecord getEmptyKey() {
+ return LoweredPHIRecord(nullptr, 0);
+ }
+ static inline LoweredPHIRecord getTombstoneKey() {
+ return LoweredPHIRecord(nullptr, 1);
+ }
+ static unsigned getHashValue(const LoweredPHIRecord &Val) {
+ return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
+ (Val.Width>>3);
+ }
+ static bool isEqual(const LoweredPHIRecord &LHS,
+ const LoweredPHIRecord &RHS) {
+ return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
+ LHS.Width == RHS.Width;
+ }
+ };
} // namespace llvm
-
-
-/// This is an integer PHI and we know that it has an illegal type: see if it is
-/// only used by trunc or trunc(lshr) operations. If so, we split the PHI into
-/// the various pieces being extracted. This sort of thing is introduced when
-/// SROA promotes an aggregate to large integer values.
-///
-/// TODO: The user of the trunc may be an bitcast to float/double/vector or an
-/// inttoptr. We should produce new PHIs in the right type.
-///
+
+
+/// This is an integer PHI and we know that it has an illegal type: see if it is
+/// only used by trunc or trunc(lshr) operations. If so, we split the PHI into
+/// the various pieces being extracted. This sort of thing is introduced when
+/// SROA promotes an aggregate to large integer values.
+///
+/// TODO: The user of the trunc may be an bitcast to float/double/vector or an
+/// inttoptr. We should produce new PHIs in the right type.
+///
Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
- // PHIUsers - Keep track of all of the truncated values extracted from a set
- // of PHIs, along with their offset. These are the things we want to rewrite.
- SmallVector<PHIUsageRecord, 16> PHIUsers;
-
- // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
- // nodes which are extracted from. PHIsInspected is the set we use to avoid
- // revisiting PHIs; PHIsToSlice is an ordered list of PHIs whose uses we still
- // need to check (to ensure they are all extracts).
- SmallVector<PHINode*, 8> PHIsToSlice;
- SmallPtrSet<PHINode*, 8> PHIsInspected;
-
- PHIsToSlice.push_back(&FirstPhi);
- PHIsInspected.insert(&FirstPhi);
-
- for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
- PHINode *PN = PHIsToSlice[PHIId];
-
- // Scan the input list of the PHI. If any input is an invoke, and if the
- // input is defined in the predecessor, then we can't split the critical
- // edge which is required to insert a truncate. Because of this, we have to
- // bail out.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i));
- if (!II) continue;
- if (II->getParent() != PN->getIncomingBlock(i))
- continue;
-
- // If the value comes from an invoke directly in the predecessor, then we have
- // a critical edge where we need to put the truncate. Since we can't
- // split the edge in instcombine, we have to bail out.
- return nullptr;
- }
-
- for (User *U : PN->users()) {
- Instruction *UserI = cast<Instruction>(U);
-
- // If the user is a PHI, inspect its uses recursively.
- if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) {
- if (PHIsInspected.insert(UserPN).second)
- PHIsToSlice.push_back(UserPN);
- continue;
- }
-
- // Truncates are always ok.
- if (isa<TruncInst>(UserI)) {
- PHIUsers.push_back(PHIUsageRecord(PHIId, 0, UserI));
- continue;
- }
-
- // Otherwise it must be a lshr which can only be used by one trunc.
- if (UserI->getOpcode() != Instruction::LShr ||
- !UserI->hasOneUse() || !isa<TruncInst>(UserI->user_back()) ||
- !isa<ConstantInt>(UserI->getOperand(1)))
- return nullptr;
-
- // Bail on out of range shifts.
- unsigned SizeInBits = UserI->getType()->getScalarSizeInBits();
- if (cast<ConstantInt>(UserI->getOperand(1))->getValue().uge(SizeInBits))
- return nullptr;
-
- unsigned Shift = cast<ConstantInt>(UserI->getOperand(1))->getZExtValue();
- PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back()));
- }
- }
-
- // If we have no users, they must be all self uses, just nuke the PHI.
- if (PHIUsers.empty())
- return replaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
-
- // If this phi node is transformable, create new PHIs for all the pieces
- // extracted out of it. First, sort the users by their offset and size.
- array_pod_sort(PHIUsers.begin(), PHIUsers.end());
-
- LLVM_DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
- for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) dbgs()
- << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';);
-
- // PredValues - This is a temporary used when rewriting PHI nodes. It is
- // hoisted out here to avoid construction/destruction thrashing.
- DenseMap<BasicBlock*, Value*> PredValues;
-
- // ExtractedVals - Each new PHI we introduce is saved here so we don't
- // introduce redundant PHIs.
- DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
-
- for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
- unsigned PHIId = PHIUsers[UserI].PHIId;
- PHINode *PN = PHIsToSlice[PHIId];
- unsigned Offset = PHIUsers[UserI].Shift;
- Type *Ty = PHIUsers[UserI].Inst->getType();
-
- PHINode *EltPHI;
-
- // If we've already lowered a user like this, reuse the previously lowered
- // value.
- if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) {
-
- // Otherwise, create the new PHI node for this user.
- EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
- PN->getName()+".off"+Twine(Offset), PN);
- assert(EltPHI->getType() != PN->getType() &&
- "Truncate didn't shrink phi?");
-
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *Pred = PN->getIncomingBlock(i);
- Value *&PredVal = PredValues[Pred];
-
- // If we already have a value for this predecessor, reuse it.
- if (PredVal) {
- EltPHI->addIncoming(PredVal, Pred);
- continue;
- }
-
- // Handle the PHI self-reuse case.
- Value *InVal = PN->getIncomingValue(i);
- if (InVal == PN) {
- PredVal = EltPHI;
- EltPHI->addIncoming(PredVal, Pred);
- continue;
- }
-
- if (PHINode *InPHI = dyn_cast<PHINode>(PN)) {
- // If the incoming value was a PHI, and if it was one of the PHIs we
- // already rewrote, just use the lowered value.
- if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
- PredVal = Res;
- EltPHI->addIncoming(PredVal, Pred);
- continue;
- }
- }
-
- // Otherwise, do an extract in the predecessor.
- Builder.SetInsertPoint(Pred->getTerminator());
- Value *Res = InVal;
- if (Offset)
- Res = Builder.CreateLShr(Res, ConstantInt::get(InVal->getType(),
- Offset), "extract");
- Res = Builder.CreateTrunc(Res, Ty, "extract.t");
- PredVal = Res;
- EltPHI->addIncoming(Res, Pred);
-
- // If the incoming value was a PHI, and if it was one of the PHIs we are
- // rewriting, we will ultimately delete the code we inserted. This
- // means we need to revisit that PHI to make sure we extract out the
- // needed piece.
- if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
- if (PHIsInspected.count(OldInVal)) {
- unsigned RefPHIId =
- find(PHIsToSlice, OldInVal) - PHIsToSlice.begin();
- PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
- cast<Instruction>(Res)));
- ++UserE;
- }
- }
- PredValues.clear();
-
- LLVM_DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
- << *EltPHI << '\n');
- ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
- }
-
- // Replace the use of this piece with the PHI node.
- replaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
- }
-
- // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
- // with undefs.
- Value *Undef = UndefValue::get(FirstPhi.getType());
- for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
- replaceInstUsesWith(*PHIsToSlice[i], Undef);
- return replaceInstUsesWith(FirstPhi, Undef);
-}
-
+ // PHIUsers - Keep track of all of the truncated values extracted from a set
+ // of PHIs, along with their offset. These are the things we want to rewrite.
+ SmallVector<PHIUsageRecord, 16> PHIUsers;
+
+ // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
+ // nodes which are extracted from. PHIsInspected is the set we use to avoid
+ // revisiting PHIs; PHIsToSlice is an ordered list of PHIs whose uses we still
+ // need to check (to ensure they are all extracts).
+ SmallVector<PHINode*, 8> PHIsToSlice;
+ SmallPtrSet<PHINode*, 8> PHIsInspected;
+
+ PHIsToSlice.push_back(&FirstPhi);
+ PHIsInspected.insert(&FirstPhi);
+
+ for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
+ PHINode *PN = PHIsToSlice[PHIId];
+
+ // Scan the input list of the PHI. If any input is an invoke, and if the
+ // input is defined in the predecessor, then we can't split the critical
+ // edge which is required to insert a truncate. Because of this, we have to
+ // bail out.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i));
+ if (!II) continue;
+ if (II->getParent() != PN->getIncomingBlock(i))
+ continue;
+
+ // If the value comes from an invoke directly in the predecessor, then we have
+ // a critical edge where we need to put the truncate. Since we can't
+ // split the edge in instcombine, we have to bail out.
+ return nullptr;
+ }
+
+ for (User *U : PN->users()) {
+ Instruction *UserI = cast<Instruction>(U);
+
+ // If the user is a PHI, inspect its uses recursively.
+ if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) {
+ if (PHIsInspected.insert(UserPN).second)
+ PHIsToSlice.push_back(UserPN);
+ continue;
+ }
+
+ // Truncates are always ok.
+ if (isa<TruncInst>(UserI)) {
+ PHIUsers.push_back(PHIUsageRecord(PHIId, 0, UserI));
+ continue;
+ }
+
+ // Otherwise it must be a lshr which can only be used by one trunc.
+ if (UserI->getOpcode() != Instruction::LShr ||
+ !UserI->hasOneUse() || !isa<TruncInst>(UserI->user_back()) ||
+ !isa<ConstantInt>(UserI->getOperand(1)))
+ return nullptr;
+
+ // Bail on out of range shifts.
+ unsigned SizeInBits = UserI->getType()->getScalarSizeInBits();
+ if (cast<ConstantInt>(UserI->getOperand(1))->getValue().uge(SizeInBits))
+ return nullptr;
+
+ unsigned Shift = cast<ConstantInt>(UserI->getOperand(1))->getZExtValue();
+ PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back()));
+ }
+ }
+
+ // If we have no users, they must be all self uses, just nuke the PHI.
+ if (PHIUsers.empty())
+ return replaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
+
+ // If this phi node is transformable, create new PHIs for all the pieces
+ // extracted out of it. First, sort the users by their offset and size.
+ array_pod_sort(PHIUsers.begin(), PHIUsers.end());
+
+ LLVM_DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) dbgs()
+ << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';);
+
+ // PredValues - This is a temporary used when rewriting PHI nodes. It is
+ // hoisted out here to avoid construction/destruction thrashing.
+ DenseMap<BasicBlock*, Value*> PredValues;
+
+ // ExtractedVals - Each new PHI we introduce is saved here so we don't
+ // introduce redundant PHIs.
+ DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
+
+ for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
+ unsigned PHIId = PHIUsers[UserI].PHIId;
+ PHINode *PN = PHIsToSlice[PHIId];
+ unsigned Offset = PHIUsers[UserI].Shift;
+ Type *Ty = PHIUsers[UserI].Inst->getType();
+
+ PHINode *EltPHI;
+
+ // If we've already lowered a user like this, reuse the previously lowered
+ // value.
+ if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) {
+
+ // Otherwise, create the new PHI node for this user.
+ EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
+ PN->getName()+".off"+Twine(Offset), PN);
+ assert(EltPHI->getType() != PN->getType() &&
+ "Truncate didn't shrink phi?");
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *Pred = PN->getIncomingBlock(i);
+ Value *&PredVal = PredValues[Pred];
+
+ // If we already have a value for this predecessor, reuse it.
+ if (PredVal) {
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+
+ // Handle the PHI self-reuse case.
+ Value *InVal = PN->getIncomingValue(i);
+ if (InVal == PN) {
+ PredVal = EltPHI;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+
+ if (PHINode *InPHI = dyn_cast<PHINode>(PN)) {
+ // If the incoming value was a PHI, and if it was one of the PHIs we
+ // already rewrote, just use the lowered value.
+ if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
+ PredVal = Res;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+ }
+
+ // Otherwise, do an extract in the predecessor.
+ Builder.SetInsertPoint(Pred->getTerminator());
+ Value *Res = InVal;
+ if (Offset)
+ Res = Builder.CreateLShr(Res, ConstantInt::get(InVal->getType(),
+ Offset), "extract");
+ Res = Builder.CreateTrunc(Res, Ty, "extract.t");
+ PredVal = Res;
+ EltPHI->addIncoming(Res, Pred);
+
+ // If the incoming value was a PHI, and if it was one of the PHIs we are
+ // rewriting, we will ultimately delete the code we inserted. This
+ // means we need to revisit that PHI to make sure we extract out the
+ // needed piece.
+ if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
+ if (PHIsInspected.count(OldInVal)) {
+ unsigned RefPHIId =
+ find(PHIsToSlice, OldInVal) - PHIsToSlice.begin();
+ PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
+ cast<Instruction>(Res)));
+ ++UserE;
+ }
+ }
+ PredValues.clear();
+
+ LLVM_DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
+ << *EltPHI << '\n');
+ ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
+ }
+
+ // Replace the use of this piece with the PHI node.
+ replaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
+ }
+
+ // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
+ // with undefs.
+ Value *Undef = UndefValue::get(FirstPhi.getType());
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+ replaceInstUsesWith(*PHIsToSlice[i], Undef);
+ return replaceInstUsesWith(FirstPhi, Undef);
+}
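A rough sketch of how such a PHI tends to appear (hypothetical struct; assumes i128 is not a legal integer type for the target, which is typical): SROA may promote the aggregate to one wide integer, leaving a PHI that is only read back through trunc and lshr+trunc, which is exactly what this function slices into per-piece PHIs.

struct Pair { long long lo; long long hi; };
long long pickLo(bool c, Pair a, Pair b) {
  Pair p = c ? a : b;   // may become a PHI of an i128 after SROA
  return p.lo ^ p.hi;   // read back as trunc and (lshr 64)+trunc of that i128
}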
+
static Value *SimplifyUsingControlFlow(InstCombiner &Self, PHINode &PN,
const DominatorTree &DT) {
// Simplify the following patterns:
@@ -1297,142 +1297,142 @@ static Value *SimplifyUsingControlFlow(InstCombiner &Self, PHINode &PN,
return nullptr;
}
-// PHINode simplification
-//
+// PHINode simplification
+//
Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
- if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
- return replaceInstUsesWith(PN, V);
-
+ if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
+ return replaceInstUsesWith(PN, V);
+
if (Instruction *Result = foldPHIArgZextsIntoPHI(PN))
- return Result;
-
- // If all PHI operands are the same operation, pull them through the PHI,
- // reducing code size.
- if (isa<Instruction>(PN.getIncomingValue(0)) &&
- isa<Instruction>(PN.getIncomingValue(1)) &&
- cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+ return Result;
+
+ // If all PHI operands are the same operation, pull them through the PHI,
+ // reducing code size.
+ if (isa<Instruction>(PN.getIncomingValue(0)) &&
+ isa<Instruction>(PN.getIncomingValue(1)) &&
+ cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
PN.getIncomingValue(0)->hasOneUser())
if (Instruction *Result = foldPHIArgOpIntoPHI(PN))
- return Result;
-
- // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
- // this PHI only has a single use (a PHI), and if that PHI only has one use (a
- // PHI)... break the cycle.
- if (PN.hasOneUse()) {
+ return Result;
+
+ // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
+ // this PHI only has a single use (a PHI), and if that PHI only has one use (a
+ // PHI)... break the cycle.
+ if (PN.hasOneUse()) {
if (Instruction *Result = foldIntegerTypedPHI(PN))
- return Result;
-
- Instruction *PHIUser = cast<Instruction>(PN.user_back());
- if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
- SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
- PotentiallyDeadPHIs.insert(&PN);
- if (DeadPHICycle(PU, PotentiallyDeadPHIs))
- return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
- }
-
- // If this phi has a single use, and if that use just computes a value for
- // the next iteration of a loop, delete the phi. This occurs with unused
- // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this
- // common case here is good because the only other things that catch this
- // are induction variable analysis (sometimes) and ADCE, which is only run
- // late.
- if (PHIUser->hasOneUse() &&
- (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
- PHIUser->user_back() == &PN) {
- return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
- }
- // When a PHI is used only to be compared with zero, it is safe to replace
- // an incoming value proved as known nonzero with any non-zero constant.
- // For example, in the code below, the incoming value %v can be replaced
- // with any non-zero constant based on the fact that the PHI is only used to
- // be compared with zero and %v is a known non-zero value:
- // %v = select %cond, 1, 2
- // %p = phi [%v, BB] ...
- // icmp eq, %p, 0
- auto *CmpInst = dyn_cast<ICmpInst>(PHIUser);
- // FIXME: To be simple, handle only integer type for now.
- if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() &&
- match(CmpInst->getOperand(1), m_Zero())) {
- ConstantInt *NonZeroConst = nullptr;
- bool MadeChange = false;
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
- Value *VA = PN.getIncomingValue(i);
- if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) {
- if (!NonZeroConst)
- NonZeroConst = GetAnyNonZeroConstInt(PN);
-
- if (NonZeroConst != VA) {
- replaceOperand(PN, i, NonZeroConst);
- MadeChange = true;
- }
- }
- }
- if (MadeChange)
- return &PN;
- }
- }
-
- // We sometimes end up with phi cycles that non-obviously end up being the
- // same value, for example:
- // z = some value; x = phi (y, z); y = phi (x, z)
- // where the phi nodes don't necessarily need to be in the same block. Do a
- // quick check to see if the PHI node only contains a single non-phi value, if
- // so, scan to see if the phi cycle is actually equal to that value.
- {
- unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
- // Scan for the first non-phi operand.
- while (InValNo != NumIncomingVals &&
- isa<PHINode>(PN.getIncomingValue(InValNo)))
- ++InValNo;
-
- if (InValNo != NumIncomingVals) {
- Value *NonPhiInVal = PN.getIncomingValue(InValNo);
-
- // Scan the rest of the operands to see if there are any conflicts, if so
- // there is no need to recursively scan other phis.
- for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
- Value *OpVal = PN.getIncomingValue(InValNo);
- if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
- break;
- }
-
- // If we scanned over all operands, then we have one unique value plus
- // phi values. Scan PHI nodes to see if they all merge in each other or
- // the value.
- if (InValNo == NumIncomingVals) {
- SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
- if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
- return replaceInstUsesWith(PN, NonPhiInVal);
- }
- }
- }
-
- // If there are multiple PHIs, sort their operands so that they all list
- // the blocks in the same order. This will help identical PHIs be eliminated
- // by other passes. Other passes shouldn't depend on this for correctness
- // however.
- PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin());
- if (&PN != FirstPN)
- for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *BBA = PN.getIncomingBlock(i);
- BasicBlock *BBB = FirstPN->getIncomingBlock(i);
- if (BBA != BBB) {
- Value *VA = PN.getIncomingValue(i);
- unsigned j = PN.getBasicBlockIndex(BBB);
- Value *VB = PN.getIncomingValue(j);
- PN.setIncomingBlock(i, BBB);
- PN.setIncomingValue(i, VB);
- PN.setIncomingBlock(j, BBA);
- PN.setIncomingValue(j, VA);
- // NOTE: Instcombine normally would want us to "return &PN" if we
- // modified any of the operands of an instruction. However, since we
- // aren't adding or removing uses (just rearranging them) we don't do
- // this in this case.
- }
- }
-
+ return Result;
+
+ Instruction *PHIUser = cast<Instruction>(PN.user_back());
+ if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
+ SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
+ PotentiallyDeadPHIs.insert(&PN);
+ if (DeadPHICycle(PU, PotentiallyDeadPHIs))
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+
+ // If this phi has a single use, and if that use just computes a value for
+ // the next iteration of a loop, delete the phi. This occurs with unused
+ // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this
+ // common case here is good because the only other things that catch this
+ // are induction variable analysis (sometimes) and ADCE, which is only run
+ // late.
+ if (PHIUser->hasOneUse() &&
+ (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
+ PHIUser->user_back() == &PN) {
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+ // When a PHI is used only to be compared with zero, it is safe to replace
+ // an incoming value proved as known nonzero with any non-zero constant.
+ // For example, in the code below, the incoming value %v can be replaced
+ // with any non-zero constant based on the fact that the PHI is only used to
+ // be compared with zero and %v is a known non-zero value:
+ // %v = select %cond, 1, 2
+ // %p = phi [%v, BB] ...
+ // icmp eq, %p, 0
+ auto *CmpInst = dyn_cast<ICmpInst>(PHIUser);
+ // FIXME: To be simple, handle only integer type for now.
+ if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() &&
+ match(CmpInst->getOperand(1), m_Zero())) {
+ ConstantInt *NonZeroConst = nullptr;
+ bool MadeChange = false;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
+ Value *VA = PN.getIncomingValue(i);
+ if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) {
+ if (!NonZeroConst)
+ NonZeroConst = GetAnyNonZeroConstInt(PN);
+
+ if (NonZeroConst != VA) {
+ replaceOperand(PN, i, NonZeroConst);
+ MadeChange = true;
+ }
+ }
+ }
+ if (MadeChange)
+ return &PN;
+ }
+ }
+
+ // We sometimes end up with phi cycles that non-obviously end up being the
+ // same value, for example:
+ // z = some value; x = phi (y, z); y = phi (x, z)
+ // where the phi nodes don't necessarily need to be in the same block. Do a
+ // quick check to see if the PHI node only contains a single non-phi value, if
+ // so, scan to see if the phi cycle is actually equal to that value.
+ {
+ unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
+ // Scan for the first non-phi operand.
+ while (InValNo != NumIncomingVals &&
+ isa<PHINode>(PN.getIncomingValue(InValNo)))
+ ++InValNo;
+
+ if (InValNo != NumIncomingVals) {
+ Value *NonPhiInVal = PN.getIncomingValue(InValNo);
+
+ // Scan the rest of the operands to see if there are any conflicts, if so
+ // there is no need to recursively scan other phis.
+ for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
+ Value *OpVal = PN.getIncomingValue(InValNo);
+ if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
+ break;
+ }
+
+ // If we scanned over all operands, then we have one unique value plus
+ // phi values. Scan PHI nodes to see if they all merge in each other or
+ // the value.
+ if (InValNo == NumIncomingVals) {
+ SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
+ if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
+ return replaceInstUsesWith(PN, NonPhiInVal);
+ }
+ }
+ }
+
+ // If there are multiple PHIs, sort their operands so that they all list
+ // the blocks in the same order. This will help identical PHIs be eliminated
+ // by other passes. Other passes shouldn't depend on this for correctness
+ // however.
+ PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin());
+ if (&PN != FirstPN)
+ for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *BBA = PN.getIncomingBlock(i);
+ BasicBlock *BBB = FirstPN->getIncomingBlock(i);
+ if (BBA != BBB) {
+ Value *VA = PN.getIncomingValue(i);
+ unsigned j = PN.getBasicBlockIndex(BBB);
+ Value *VB = PN.getIncomingValue(j);
+ PN.setIncomingBlock(i, BBB);
+ PN.setIncomingValue(i, VB);
+ PN.setIncomingBlock(j, BBA);
+ PN.setIncomingValue(j, VA);
+ // NOTE: Instcombine normally would want us to "return &PN" if we
+ // modified any of the operands of an instruction. However, since we
+ // aren't adding or removing uses (just rearranging them) we don't do
+ // this in this case.
+ }
+ }
+
// Is there an identical PHI node in this basic block?
for (PHINode &IdenticalPN : PN.getParent()->phis()) {
// Ignore the PHI node itself.
@@ -1448,18 +1448,18 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
return replaceInstUsesWith(PN, &IdenticalPN);
}
- // If this is an integer PHI and we know that it has an illegal type, see if
- // it is only used by trunc or trunc(lshr) operations. If so, we split the
- // PHI into the various pieces being extracted. This sort of thing is
- // introduced when SROA promotes an aggregate to a single large integer type.
- if (PN.getType()->isIntegerTy() &&
- !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
- if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
- return Res;
-
+ // If this is an integer PHI and we know that it has an illegal type, see if
+ // it is only used by trunc or trunc(lshr) operations. If so, we split the
+ // PHI into the various pieces being extracted. This sort of thing is
+ // introduced when SROA promotes an aggregate to a single large integer type.
+ if (PN.getType()->isIntegerTy() &&
+ !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
+ if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
+ return Res;
+
// Ultimately, try to replace this Phi with a dominating condition.
if (auto *V = SimplifyUsingControlFlow(*this, PN, DT))
return replaceInstUsesWith(PN, V);
- return nullptr;
-}
+ return nullptr;
+}
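Hypothetical example (names invented) of the dead phi cycle / unused induction variable case handled above: the PHI's only user is the increment, and the increment's only user is the PHI, so both can be removed.

void spin(int n) {
  for (int j = 0; n > 0; --n)
    ++j;   // j is never read elsewhere; the phi/add cycle is dead
}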
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 4197c03672..5f174aae09 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1,1068 +1,1068 @@
-//===- InstCombineSelect.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitSelect function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CmpInstAnalysis.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+//===- InstCombineSelect.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitSelect function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
/// FIXME: Enabled by default until the pattern is supported well.
static cl::opt<bool> EnableUnsafeSelectTransform(
"instcombine-unsafe-select-transform", cl::init(true),
cl::desc("Enable poison-unsafe select to and/or transform"));
-static Value *createMinMax(InstCombiner::BuilderTy &Builder,
- SelectPatternFlavor SPF, Value *A, Value *B) {
- CmpInst::Predicate Pred = getMinMaxPred(SPF);
- assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate");
- return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
-}
-
-/// Replace a select operand based on an equality comparison with the identity
-/// constant of a binop.
-static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
- const TargetLibraryInfo &TLI,
+static Value *createMinMax(InstCombiner::BuilderTy &Builder,
+ SelectPatternFlavor SPF, Value *A, Value *B) {
+ CmpInst::Predicate Pred = getMinMaxPred(SPF);
+ assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate");
+ return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
+}
+
+/// Replace a select operand based on an equality comparison with the identity
+/// constant of a binop.
+static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
+ const TargetLibraryInfo &TLI,
InstCombinerImpl &IC) {
- // The select condition must be an equality compare with a constant operand.
- Value *X;
- Constant *C;
- CmpInst::Predicate Pred;
- if (!match(Sel.getCondition(), m_Cmp(Pred, m_Value(X), m_Constant(C))))
- return nullptr;
-
- bool IsEq;
- if (ICmpInst::isEquality(Pred))
- IsEq = Pred == ICmpInst::ICMP_EQ;
- else if (Pred == FCmpInst::FCMP_OEQ)
- IsEq = true;
- else if (Pred == FCmpInst::FCMP_UNE)
- IsEq = false;
- else
- return nullptr;
-
- // A select operand must be a binop.
- BinaryOperator *BO;
- if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)))
- return nullptr;
-
- // The compare constant must be the identity constant for that binop.
-  // If this is a floating-point compare with 0.0, any zero constant will do.
- Type *Ty = BO->getType();
- Constant *IdC = ConstantExpr::getBinOpIdentity(BO->getOpcode(), Ty, true);
- if (IdC != C) {
- if (!IdC || !CmpInst::isFPPredicate(Pred))
- return nullptr;
- if (!match(IdC, m_AnyZeroFP()) || !match(C, m_AnyZeroFP()))
- return nullptr;
- }
-
- // Last, match the compare variable operand with a binop operand.
- Value *Y;
- if (!BO->isCommutative() && !match(BO, m_BinOp(m_Value(Y), m_Specific(X))))
- return nullptr;
- if (!match(BO, m_c_BinOp(m_Value(Y), m_Specific(X))))
- return nullptr;
-
- // +0.0 compares equal to -0.0, and so it does not behave as required for this
- // transform. Bail out if we can not exclude that possibility.
- if (isa<FPMathOperator>(BO))
- if (!BO->hasNoSignedZeros() && !CannotBeNegativeZero(Y, &TLI))
- return nullptr;
-
- // BO = binop Y, X
- // S = { select (cmp eq X, C), BO, ? } or { select (cmp ne X, C), ?, BO }
- // =>
- // S = { select (cmp eq X, C), Y, ? } or { select (cmp ne X, C), ?, Y }
- return IC.replaceOperand(Sel, IsEq ? 1 : 2, Y);
-}
-
-/// This folds:
-/// select (icmp eq (and X, C1)), TC, FC
-/// iff C1 is a power of 2 and the difference between TC and FC is a power of 2.
-/// To something like:
-/// (shr (and (X, C1)), (log2(C1) - log2(TC-FC))) + FC
-/// Or:
-/// (shl (and (X, C1)), (log2(TC-FC) - log2(C1))) + FC
-/// With some variations depending on whether FC is larger than TC, the shift
-/// isn't needed, or the bit widths don't match.
-static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
- InstCombiner::BuilderTy &Builder) {
- const APInt *SelTC, *SelFC;
- if (!match(Sel.getTrueValue(), m_APInt(SelTC)) ||
- !match(Sel.getFalseValue(), m_APInt(SelFC)))
- return nullptr;
-
- // If this is a vector select, we need a vector compare.
- Type *SelType = Sel.getType();
- if (SelType->isVectorTy() != Cmp->getType()->isVectorTy())
- return nullptr;
-
- Value *V;
- APInt AndMask;
- bool CreateAnd = false;
- ICmpInst::Predicate Pred = Cmp->getPredicate();
- if (ICmpInst::isEquality(Pred)) {
- if (!match(Cmp->getOperand(1), m_Zero()))
- return nullptr;
-
- V = Cmp->getOperand(0);
- const APInt *AndRHS;
- if (!match(V, m_And(m_Value(), m_Power2(AndRHS))))
- return nullptr;
-
- AndMask = *AndRHS;
- } else if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
- Pred, V, AndMask)) {
- assert(ICmpInst::isEquality(Pred) && "Not equality test?");
- if (!AndMask.isPowerOf2())
- return nullptr;
-
- CreateAnd = true;
- } else {
- return nullptr;
- }
-
- // In general, when both constants are non-zero, we would need an offset to
- // replace the select. This would require more instructions than we started
- // with. But there's one special-case that we handle here because it can
- // simplify/reduce the instructions.
- APInt TC = *SelTC;
- APInt FC = *SelFC;
- if (!TC.isNullValue() && !FC.isNullValue()) {
- // If the select constants differ by exactly one bit and that's the same
- // bit that is masked and checked by the select condition, the select can
- // be replaced by bitwise logic to set/clear one bit of the constant result.
- if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask)
- return nullptr;
- if (CreateAnd) {
- // If we have to create an 'and', then we must kill the cmp to not
- // increase the instruction count.
- if (!Cmp->hasOneUse())
- return nullptr;
- V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask));
- }
- bool ExtraBitInTC = TC.ugt(FC);
- if (Pred == ICmpInst::ICMP_EQ) {
- // If the masked bit in V is clear, clear or set the bit in the result:
- // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC
- // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC
- Constant *C = ConstantInt::get(SelType, TC);
- return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C);
- }
- if (Pred == ICmpInst::ICMP_NE) {
- // If the masked bit in V is set, set or clear the bit in the result:
- // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC
- // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC
- Constant *C = ConstantInt::get(SelType, FC);
- return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C);
- }
- llvm_unreachable("Only expecting equality predicates");
- }
-
- // Make sure one of the select arms is a power-of-2.
- if (!TC.isPowerOf2() && !FC.isPowerOf2())
- return nullptr;
-
- // Determine which shift is needed to transform result of the 'and' into the
- // desired result.
- const APInt &ValC = !TC.isNullValue() ? TC : FC;
- unsigned ValZeros = ValC.logBase2();
- unsigned AndZeros = AndMask.logBase2();
-
- // Insert the 'and' instruction on the input to the truncate.
- if (CreateAnd)
- V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), AndMask));
-
- // If types don't match, we can still convert the select by introducing a zext
- // or a trunc of the 'and'.
- if (ValZeros > AndZeros) {
- V = Builder.CreateZExtOrTrunc(V, SelType);
- V = Builder.CreateShl(V, ValZeros - AndZeros);
- } else if (ValZeros < AndZeros) {
- V = Builder.CreateLShr(V, AndZeros - ValZeros);
- V = Builder.CreateZExtOrTrunc(V, SelType);
- } else {
- V = Builder.CreateZExtOrTrunc(V, SelType);
- }
-
-  // Okay, now we know that everything is set up; we just don't know whether we
-  // have an icmp_ne or icmp_eq and whether the true or false val is the zero.
- bool ShouldNotVal = !TC.isNullValue();
- ShouldNotVal ^= Pred == ICmpInst::ICMP_NE;
- if (ShouldNotVal)
- V = Builder.CreateXor(V, ValC);
-
- return V;
-}
-
-/// We want to turn code that looks like this:
-/// %C = or %A, %B
-/// %D = select %cond, %C, %A
-/// into:
-/// %C = select %cond, %B, 0
-/// %D = or %A, %C
-///
-/// Assuming that the specified instruction is an operand to the select, return
-/// a bitmask indicating which operands of this instruction are foldable if they
-/// equal the other incoming value of the select.
-static unsigned getSelectFoldableOperands(BinaryOperator *I) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- return 3; // Can fold through either operand.
- case Instruction::Sub: // Can only fold on the amount subtracted.
- case Instruction::Shl: // Can only fold on the shift amount.
- case Instruction::LShr:
- case Instruction::AShr:
- return 1;
- default:
- return 0; // Cannot fold
- }
-}
-
-/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
+ // The select condition must be an equality compare with a constant operand.
+ Value *X;
+ Constant *C;
+ CmpInst::Predicate Pred;
+ if (!match(Sel.getCondition(), m_Cmp(Pred, m_Value(X), m_Constant(C))))
+ return nullptr;
+
+ bool IsEq;
+ if (ICmpInst::isEquality(Pred))
+ IsEq = Pred == ICmpInst::ICMP_EQ;
+ else if (Pred == FCmpInst::FCMP_OEQ)
+ IsEq = true;
+ else if (Pred == FCmpInst::FCMP_UNE)
+ IsEq = false;
+ else
+ return nullptr;
+
+ // A select operand must be a binop.
+ BinaryOperator *BO;
+ if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)))
+ return nullptr;
+
+ // The compare constant must be the identity constant for that binop.
+  // If this is a floating-point compare with 0.0, any zero constant will do.
+ Type *Ty = BO->getType();
+ Constant *IdC = ConstantExpr::getBinOpIdentity(BO->getOpcode(), Ty, true);
+ if (IdC != C) {
+ if (!IdC || !CmpInst::isFPPredicate(Pred))
+ return nullptr;
+ if (!match(IdC, m_AnyZeroFP()) || !match(C, m_AnyZeroFP()))
+ return nullptr;
+ }
+
+ // Last, match the compare variable operand with a binop operand.
+ Value *Y;
+ if (!BO->isCommutative() && !match(BO, m_BinOp(m_Value(Y), m_Specific(X))))
+ return nullptr;
+ if (!match(BO, m_c_BinOp(m_Value(Y), m_Specific(X))))
+ return nullptr;
+
+ // +0.0 compares equal to -0.0, and so it does not behave as required for this
+ // transform. Bail out if we can not exclude that possibility.
+ if (isa<FPMathOperator>(BO))
+ if (!BO->hasNoSignedZeros() && !CannotBeNegativeZero(Y, &TLI))
+ return nullptr;
+
+ // BO = binop Y, X
+ // S = { select (cmp eq X, C), BO, ? } or { select (cmp ne X, C), ?, BO }
+ // =>
+ // S = { select (cmp eq X, C), Y, ? } or { select (cmp ne X, C), ?, Y }
+ return IC.replaceOperand(Sel, IsEq ? 1 : 2, Y);
+}
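The identity behind the fold above: once the compare pins X to the binop's identity constant, 'binop Y, X' in the selected arm is just Y. A small standalone check for integer add (identity 0); the names are illustrative only:

#include <cassert>
#include <cstdint>

// select (x == 0), (y + x), z   ==   select (x == 0), y, z
int main() {
  for (int32_t x = -4; x <= 4; ++x)
    for (int32_t y = -4; y <= 4; ++y)
      for (int32_t z = -4; z <= 4; ++z)
        assert((x == 0 ? y + x : z) == (x == 0 ? y : z));
  return 0;
}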
+
+/// This folds:
+/// select (icmp eq (and X, C1)), TC, FC
+/// iff C1 is a power of 2 and the difference between TC and FC is a power of 2.
+/// To something like:
+/// (shr (and (X, C1)), (log2(C1) - log2(TC-FC))) + FC
+/// Or:
+/// (shl (and (X, C1)), (log2(TC-FC) - log2(C1))) + FC
+/// With some variations depending on whether FC is larger than TC, the shift
+/// isn't needed, or the bit widths don't match.
+static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ const APInt *SelTC, *SelFC;
+ if (!match(Sel.getTrueValue(), m_APInt(SelTC)) ||
+ !match(Sel.getFalseValue(), m_APInt(SelFC)))
+ return nullptr;
+
+ // If this is a vector select, we need a vector compare.
+ Type *SelType = Sel.getType();
+ if (SelType->isVectorTy() != Cmp->getType()->isVectorTy())
+ return nullptr;
+
+ Value *V;
+ APInt AndMask;
+ bool CreateAnd = false;
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
+ if (ICmpInst::isEquality(Pred)) {
+ if (!match(Cmp->getOperand(1), m_Zero()))
+ return nullptr;
+
+ V = Cmp->getOperand(0);
+ const APInt *AndRHS;
+ if (!match(V, m_And(m_Value(), m_Power2(AndRHS))))
+ return nullptr;
+
+ AndMask = *AndRHS;
+ } else if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
+ Pred, V, AndMask)) {
+ assert(ICmpInst::isEquality(Pred) && "Not equality test?");
+ if (!AndMask.isPowerOf2())
+ return nullptr;
+
+ CreateAnd = true;
+ } else {
+ return nullptr;
+ }
+
+ // In general, when both constants are non-zero, we would need an offset to
+ // replace the select. This would require more instructions than we started
+ // with. But there's one special-case that we handle here because it can
+ // simplify/reduce the instructions.
+ APInt TC = *SelTC;
+ APInt FC = *SelFC;
+ if (!TC.isNullValue() && !FC.isNullValue()) {
+ // If the select constants differ by exactly one bit and that's the same
+ // bit that is masked and checked by the select condition, the select can
+ // be replaced by bitwise logic to set/clear one bit of the constant result.
+ if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask)
+ return nullptr;
+ if (CreateAnd) {
+ // If we have to create an 'and', then we must kill the cmp to not
+ // increase the instruction count.
+ if (!Cmp->hasOneUse())
+ return nullptr;
+ V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask));
+ }
+ bool ExtraBitInTC = TC.ugt(FC);
+ if (Pred == ICmpInst::ICMP_EQ) {
+ // If the masked bit in V is clear, clear or set the bit in the result:
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC
+ Constant *C = ConstantInt::get(SelType, TC);
+ return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C);
+ }
+ if (Pred == ICmpInst::ICMP_NE) {
+ // If the masked bit in V is set, set or clear the bit in the result:
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC
+ Constant *C = ConstantInt::get(SelType, FC);
+ return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C);
+ }
+ llvm_unreachable("Only expecting equality predicates");
+ }
+
+ // Make sure one of the select arms is a power-of-2.
+ if (!TC.isPowerOf2() && !FC.isPowerOf2())
+ return nullptr;
+
+ // Determine which shift is needed to transform result of the 'and' into the
+ // desired result.
+ const APInt &ValC = !TC.isNullValue() ? TC : FC;
+ unsigned ValZeros = ValC.logBase2();
+ unsigned AndZeros = AndMask.logBase2();
+
+ // Insert the 'and' instruction on the input to the truncate.
+ if (CreateAnd)
+ V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), AndMask));
+
+ // If types don't match, we can still convert the select by introducing a zext
+ // or a trunc of the 'and'.
+ if (ValZeros > AndZeros) {
+ V = Builder.CreateZExtOrTrunc(V, SelType);
+ V = Builder.CreateShl(V, ValZeros - AndZeros);
+ } else if (ValZeros < AndZeros) {
+ V = Builder.CreateLShr(V, AndZeros - ValZeros);
+ V = Builder.CreateZExtOrTrunc(V, SelType);
+ } else {
+ V = Builder.CreateZExtOrTrunc(V, SelType);
+ }
+
+  // Okay, now we know that everything is set up; we just don't know whether we
+  // have an icmp_ne or icmp_eq and whether the true or false val is the zero.
+ bool ShouldNotVal = !TC.isNullValue();
+ ShouldNotVal ^= Pred == ICmpInst::ICMP_NE;
+ if (ShouldNotVal)
+ V = Builder.CreateXor(V, ValC);
+
+ return V;
+}
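Two concrete instances of the fold above, checked exhaustively over a small range: the shift form with C1 = 4, TC = 0, FC = 8, and the one-bit-difference form with C1 = 4, TC = 12, FC = 8. This is only an illustration of the arithmetic, not a test of the pass itself:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 64; ++x) {
    // Power-of-2 arm case: (x & 4) == 0 ? 0 : 8   -->   (x & 4) << 1
    assert(((x & 4u) == 0 ? 0u : 8u) == ((x & 4u) << 1));
    // Constants differing by the masked bit: (x & 4) == 0 ? 12 : 8   -->   (x & 4) ^ 12
    assert(((x & 4u) == 0 ? 12u : 8u) == ((x & 4u) ^ 12u));
  }
  return 0;
}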
+
+/// We want to turn code that looks like this:
+/// %C = or %A, %B
+/// %D = select %cond, %C, %A
+/// into:
+/// %C = select %cond, %B, 0
+/// %D = or %A, %C
+///
+/// Assuming that the specified instruction is an operand to the select, return
+/// a bitmask indicating which operands of this instruction are foldable if they
+/// equal the other incoming value of the select.
+static unsigned getSelectFoldableOperands(BinaryOperator *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return 3; // Can fold through either operand.
+ case Instruction::Sub: // Can only fold on the amount subtracted.
+ case Instruction::Shl: // Can only fold on the shift amount.
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return 1;
+ default:
+ return 0; // Cannot fold
+ }
+}
+
+/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI) {
- // Don't break up min/max patterns. The hasOneUse checks below prevent that
- // for most cases, but vector min/max with bitcasts can be transformed. If the
- // one-use restrictions are eased for other patterns, we still don't want to
- // obfuscate min/max.
- if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
- match(&SI, m_SMax(m_Value(), m_Value())) ||
- match(&SI, m_UMin(m_Value(), m_Value())) ||
- match(&SI, m_UMax(m_Value(), m_Value()))))
- return nullptr;
-
- // If this is a cast from the same type, merge.
- Value *Cond = SI.getCondition();
- Type *CondTy = Cond->getType();
- if (TI->getNumOperands() == 1 && TI->isCast()) {
- Type *FIOpndTy = FI->getOperand(0)->getType();
- if (TI->getOperand(0)->getType() != FIOpndTy)
- return nullptr;
-
- // The select condition may be a vector. We may only change the operand
- // type if the vector width remains the same (and matches the condition).
- if (auto *CondVTy = dyn_cast<VectorType>(CondTy)) {
+ // Don't break up min/max patterns. The hasOneUse checks below prevent that
+ // for most cases, but vector min/max with bitcasts can be transformed. If the
+ // one-use restrictions are eased for other patterns, we still don't want to
+ // obfuscate min/max.
+ if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
+ match(&SI, m_SMax(m_Value(), m_Value())) ||
+ match(&SI, m_UMin(m_Value(), m_Value())) ||
+ match(&SI, m_UMax(m_Value(), m_Value()))))
+ return nullptr;
+
+ // If this is a cast from the same type, merge.
+ Value *Cond = SI.getCondition();
+ Type *CondTy = Cond->getType();
+ if (TI->getNumOperands() == 1 && TI->isCast()) {
+ Type *FIOpndTy = FI->getOperand(0)->getType();
+ if (TI->getOperand(0)->getType() != FIOpndTy)
+ return nullptr;
+
+ // The select condition may be a vector. We may only change the operand
+ // type if the vector width remains the same (and matches the condition).
+ if (auto *CondVTy = dyn_cast<VectorType>(CondTy)) {
if (!FIOpndTy->isVectorTy() ||
CondVTy->getElementCount() !=
cast<VectorType>(FIOpndTy)->getElementCount())
- return nullptr;
-
- // TODO: If the backend knew how to deal with casts better, we could
- // remove this limitation. For now, there's too much potential to create
- // worse codegen by promoting the select ahead of size-altering casts
- // (PR28160).
- //
- // Note that ValueTracking's matchSelectPattern() looks through casts
- // without checking 'hasOneUse' when it matches min/max patterns, so this
- // transform may end up happening anyway.
- if (TI->getOpcode() != Instruction::BitCast &&
- (!TI->hasOneUse() || !FI->hasOneUse()))
- return nullptr;
- } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
- // TODO: The one-use restrictions for a scalar select could be eased if
- // the fold of a select in visitLoadInst() was enhanced to match a pattern
- // that includes a cast.
- return nullptr;
- }
-
- // Fold this by inserting a select from the input values.
- Value *NewSI =
- Builder.CreateSelect(Cond, TI->getOperand(0), FI->getOperand(0),
- SI.getName() + ".v", &SI);
- return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
- TI->getType());
- }
-
- // Cond ? -X : -Y --> -(Cond ? X : Y)
- Value *X, *Y;
- if (match(TI, m_FNeg(m_Value(X))) && match(FI, m_FNeg(m_Value(Y))) &&
- (TI->hasOneUse() || FI->hasOneUse())) {
- Value *NewSel = Builder.CreateSelect(Cond, X, Y, SI.getName() + ".v", &SI);
- return UnaryOperator::CreateFNegFMF(NewSel, TI);
- }
-
- // Only handle binary operators (including two-operand getelementptr) with
- // one-use here. As with the cast case above, it may be possible to relax the
-  // one-use constraint, but that needs to be examined carefully since it may not
- // reduce the total number of instructions.
- if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 ||
- (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) ||
- !TI->hasOneUse() || !FI->hasOneUse())
- return nullptr;
-
- // Figure out if the operations have any operands in common.
- Value *MatchOp, *OtherOpT, *OtherOpF;
- bool MatchIsOpZero;
- if (TI->getOperand(0) == FI->getOperand(0)) {
- MatchOp = TI->getOperand(0);
- OtherOpT = TI->getOperand(1);
- OtherOpF = FI->getOperand(1);
- MatchIsOpZero = true;
- } else if (TI->getOperand(1) == FI->getOperand(1)) {
- MatchOp = TI->getOperand(1);
- OtherOpT = TI->getOperand(0);
- OtherOpF = FI->getOperand(0);
- MatchIsOpZero = false;
- } else if (!TI->isCommutative()) {
- return nullptr;
- } else if (TI->getOperand(0) == FI->getOperand(1)) {
- MatchOp = TI->getOperand(0);
- OtherOpT = TI->getOperand(1);
- OtherOpF = FI->getOperand(0);
- MatchIsOpZero = true;
- } else if (TI->getOperand(1) == FI->getOperand(0)) {
- MatchOp = TI->getOperand(1);
- OtherOpT = TI->getOperand(0);
- OtherOpF = FI->getOperand(1);
- MatchIsOpZero = true;
- } else {
- return nullptr;
- }
-
- // If the select condition is a vector, the operands of the original select's
- // operands also must be vectors. This may not be the case for getelementptr
- // for example.
- if (CondTy->isVectorTy() && (!OtherOpT->getType()->isVectorTy() ||
- !OtherOpF->getType()->isVectorTy()))
- return nullptr;
-
- // If we reach here, they do have operations in common.
- Value *NewSI = Builder.CreateSelect(Cond, OtherOpT, OtherOpF,
- SI.getName() + ".v", &SI);
- Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
- Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
- if (auto *BO = dyn_cast<BinaryOperator>(TI)) {
- BinaryOperator *NewBO = BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
- NewBO->copyIRFlags(TI);
- NewBO->andIRFlags(FI);
- return NewBO;
- }
- if (auto *TGEP = dyn_cast<GetElementPtrInst>(TI)) {
- auto *FGEP = cast<GetElementPtrInst>(FI);
- Type *ElementType = TGEP->getResultElementType();
- return TGEP->isInBounds() && FGEP->isInBounds()
- ? GetElementPtrInst::CreateInBounds(ElementType, Op0, {Op1})
- : GetElementPtrInst::Create(ElementType, Op0, {Op1});
- }
- llvm_unreachable("Expected BinaryOperator or GEP");
- return nullptr;
-}
-
-static bool isSelect01(const APInt &C1I, const APInt &C2I) {
- if (!C1I.isNullValue() && !C2I.isNullValue()) // One side must be zero.
- return false;
- return C1I.isOneValue() || C1I.isAllOnesValue() ||
- C2I.isOneValue() || C2I.isAllOnesValue();
-}
-
-/// Try to fold the select into one of the operands to allow further
-/// optimization.
+ return nullptr;
+
+ // TODO: If the backend knew how to deal with casts better, we could
+ // remove this limitation. For now, there's too much potential to create
+ // worse codegen by promoting the select ahead of size-altering casts
+ // (PR28160).
+ //
+ // Note that ValueTracking's matchSelectPattern() looks through casts
+ // without checking 'hasOneUse' when it matches min/max patterns, so this
+ // transform may end up happening anyway.
+ if (TI->getOpcode() != Instruction::BitCast &&
+ (!TI->hasOneUse() || !FI->hasOneUse()))
+ return nullptr;
+ } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
+ // TODO: The one-use restrictions for a scalar select could be eased if
+ // the fold of a select in visitLoadInst() was enhanced to match a pattern
+ // that includes a cast.
+ return nullptr;
+ }
+
+ // Fold this by inserting a select from the input values.
+ Value *NewSI =
+ Builder.CreateSelect(Cond, TI->getOperand(0), FI->getOperand(0),
+ SI.getName() + ".v", &SI);
+ return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
+ TI->getType());
+ }
+
+ // Cond ? -X : -Y --> -(Cond ? X : Y)
+ Value *X, *Y;
+ if (match(TI, m_FNeg(m_Value(X))) && match(FI, m_FNeg(m_Value(Y))) &&
+ (TI->hasOneUse() || FI->hasOneUse())) {
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, SI.getName() + ".v", &SI);
+ return UnaryOperator::CreateFNegFMF(NewSel, TI);
+ }
+
+ // Only handle binary operators (including two-operand getelementptr) with
+ // one-use here. As with the cast case above, it may be possible to relax the
+  // one-use constraint, but that needs to be examined carefully since it may not
+ // reduce the total number of instructions.
+ if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 ||
+ (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) ||
+ !TI->hasOneUse() || !FI->hasOneUse())
+ return nullptr;
+
+ // Figure out if the operations have any operands in common.
+ Value *MatchOp, *OtherOpT, *OtherOpF;
+ bool MatchIsOpZero;
+ if (TI->getOperand(0) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = false;
+ } else if (!TI->isCommutative()) {
+ return nullptr;
+ } else if (TI->getOperand(0) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else {
+ return nullptr;
+ }
+
+ // If the select condition is a vector, the operands of the original select's
+ // operands also must be vectors. This may not be the case for getelementptr
+ // for example.
+ if (CondTy->isVectorTy() && (!OtherOpT->getType()->isVectorTy() ||
+ !OtherOpF->getType()->isVectorTy()))
+ return nullptr;
+
+ // If we reach here, they do have operations in common.
+ Value *NewSI = Builder.CreateSelect(Cond, OtherOpT, OtherOpF,
+ SI.getName() + ".v", &SI);
+ Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
+ Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
+ if (auto *BO = dyn_cast<BinaryOperator>(TI)) {
+ BinaryOperator *NewBO = BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
+ NewBO->copyIRFlags(TI);
+ NewBO->andIRFlags(FI);
+ return NewBO;
+ }
+ if (auto *TGEP = dyn_cast<GetElementPtrInst>(TI)) {
+ auto *FGEP = cast<GetElementPtrInst>(FI);
+ Type *ElementType = TGEP->getResultElementType();
+ return TGEP->isInBounds() && FGEP->isInBounds()
+ ? GetElementPtrInst::CreateInBounds(ElementType, Op0, {Op1})
+ : GetElementPtrInst::Create(ElementType, Op0, {Op1});
+ }
+ llvm_unreachable("Expected BinaryOperator or GEP");
+ return nullptr;
+}
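The common-operand case above is just distribution of the select over the shared operand. A standalone check for add (names illustrative):

#include <cassert>
#include <cstdint>

// select c, (x + a), (x + b)   ==   x + (select c, a, b)
int main() {
  for (int c = 0; c <= 1; ++c)
    for (int32_t x = -3; x <= 3; ++x)
      for (int32_t a = -3; a <= 3; ++a)
        for (int32_t b = -3; b <= 3; ++b)
          assert((c ? x + a : x + b) == x + (c ? a : b));
  return 0;
}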
+
+static bool isSelect01(const APInt &C1I, const APInt &C2I) {
+ if (!C1I.isNullValue() && !C2I.isNullValue()) // One side must be zero.
+ return false;
+ return C1I.isOneValue() || C1I.isAllOnesValue() ||
+ C2I.isOneValue() || C2I.isAllOnesValue();
+}
+
+/// Try to fold the select into one of the operands to allow further
+/// optimization.
Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
Value *FalseVal) {
- // See the comment above GetSelectFoldableOperands for a description of the
- // transformation we are doing here.
- if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) {
- if (TVI->hasOneUse() && !isa<Constant>(FalseVal)) {
- if (unsigned SFO = getSelectFoldableOperands(TVI)) {
- unsigned OpToFold = 0;
- if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
- OpToFold = 1;
- } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
- OpToFold = 2;
- }
-
- if (OpToFold) {
+ // See the comment above GetSelectFoldableOperands for a description of the
+ // transformation we are doing here.
+ if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) {
+ if (TVI->hasOneUse() && !isa<Constant>(FalseVal)) {
+ if (unsigned SFO = getSelectFoldableOperands(TVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
Constant *C = ConstantExpr::getBinOpIdentity(TVI->getOpcode(),
TVI->getType(), true);
- Value *OOp = TVI->getOperand(2-OpToFold);
- // Avoid creating select between 2 constants unless it's selecting
- // between 0, 1 and -1.
- const APInt *OOpC;
- bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
+ Value *OOp = TVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0, 1 and -1.
+ const APInt *OOpC;
+ bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
if (!isa<Constant>(OOp) ||
(OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) {
- Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C);
- NewSel->takeName(TVI);
- BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(),
- FalseVal, NewSel);
- BO->copyIRFlags(TVI);
- return BO;
- }
- }
- }
- }
- }
-
- if (auto *FVI = dyn_cast<BinaryOperator>(FalseVal)) {
- if (FVI->hasOneUse() && !isa<Constant>(TrueVal)) {
- if (unsigned SFO = getSelectFoldableOperands(FVI)) {
- unsigned OpToFold = 0;
- if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
- OpToFold = 1;
- } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
- OpToFold = 2;
- }
-
- if (OpToFold) {
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C);
+ NewSel->takeName(TVI);
+ BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(),
+ FalseVal, NewSel);
+ BO->copyIRFlags(TVI);
+ return BO;
+ }
+ }
+ }
+ }
+ }
+
+ if (auto *FVI = dyn_cast<BinaryOperator>(FalseVal)) {
+ if (FVI->hasOneUse() && !isa<Constant>(TrueVal)) {
+ if (unsigned SFO = getSelectFoldableOperands(FVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
Constant *C = ConstantExpr::getBinOpIdentity(FVI->getOpcode(),
FVI->getType(), true);
- Value *OOp = FVI->getOperand(2-OpToFold);
- // Avoid creating select between 2 constants unless it's selecting
- // between 0, 1 and -1.
- const APInt *OOpC;
- bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
+ Value *OOp = FVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0, 1 and -1.
+ const APInt *OOpC;
+ bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
if (!isa<Constant>(OOp) ||
(OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) {
- Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp);
- NewSel->takeName(FVI);
- BinaryOperator *BO = BinaryOperator::Create(FVI->getOpcode(),
- TrueVal, NewSel);
- BO->copyIRFlags(FVI);
- return BO;
- }
- }
- }
- }
- }
-
- return nullptr;
-}
-
-/// We want to turn:
-/// (select (icmp eq (and X, Y), 0), (and (lshr X, Z), 1), 1)
-/// into:
-/// zext (icmp ne i32 (and X, (or Y, (shl 1, Z))), 0)
-/// Note:
-/// Z may be 0 if lshr is missing.
-/// The worst-case scenario is that we will replace 5 instructions with 5
-/// different instructions, but we get rid of the select.
-static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp,
- Value *TVal, Value *FVal,
- InstCombiner::BuilderTy &Builder) {
- if (!(Cmp->hasOneUse() && Cmp->getOperand(0)->hasOneUse() &&
- Cmp->getPredicate() == ICmpInst::ICMP_EQ &&
- match(Cmp->getOperand(1), m_Zero()) && match(FVal, m_One())))
- return nullptr;
-
-  // The TrueVal has the general form: and %B, 1
- Value *B;
- if (!match(TVal, m_OneUse(m_And(m_Value(B), m_One()))))
- return nullptr;
-
- // Where %B may be optionally shifted: lshr %X, %Z.
- Value *X, *Z;
- const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z))));
- if (!HasShift)
- X = B;
-
- Value *Y;
- if (!match(Cmp->getOperand(0), m_c_And(m_Specific(X), m_Value(Y))))
- return nullptr;
-
- // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 --> (X & (Y | (1 << Z))) != 0
- // ((X & Y) == 0) ? (X & 1) : 1 --> (X & (Y | 1)) != 0
- Constant *One = ConstantInt::get(SelType, 1);
- Value *MaskB = HasShift ? Builder.CreateShl(One, Z) : One;
- Value *FullMask = Builder.CreateOr(Y, MaskB);
- Value *MaskedX = Builder.CreateAnd(X, FullMask);
- Value *ICmpNeZero = Builder.CreateIsNotNull(MaskedX);
- return new ZExtInst(ICmpNeZero, SelType);
-}
-
-/// We want to turn:
-/// (select (icmp sgt x, C), lshr (X, Y), ashr (X, Y)); iff C s>= -1
-/// (select (icmp slt x, C), ashr (X, Y), lshr (X, Y)); iff C s>= 0
-/// into:
-/// ashr (X, Y)
-static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal,
- Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred = IC->getPredicate();
- Value *CmpLHS = IC->getOperand(0);
- Value *CmpRHS = IC->getOperand(1);
- if (!CmpRHS->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- Value *X, *Y;
- unsigned Bitwidth = CmpRHS->getType()->getScalarSizeInBits();
- if ((Pred != ICmpInst::ICMP_SGT ||
- !match(CmpRHS,
- m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, -1)))) &&
- (Pred != ICmpInst::ICMP_SLT ||
- !match(CmpRHS,
- m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, 0)))))
- return nullptr;
-
- // Canonicalize so that ashr is in FalseVal.
- if (Pred == ICmpInst::ICMP_SLT)
- std::swap(TrueVal, FalseVal);
-
- if (match(TrueVal, m_LShr(m_Value(X), m_Value(Y))) &&
- match(FalseVal, m_AShr(m_Specific(X), m_Specific(Y))) &&
- match(CmpLHS, m_Specific(X))) {
- const auto *Ashr = cast<Instruction>(FalseVal);
- // if lshr is not exact and ashr is, this new ashr must not be exact.
- bool IsExact = Ashr->isExact() && cast<Instruction>(TrueVal)->isExact();
- return Builder.CreateAShr(X, Y, IC->getName(), IsExact);
- }
-
- return nullptr;
-}
-
-/// We want to turn:
-/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
-/// into:
-/// (or (shl (and X, C1), C3), Y)
-/// iff:
-/// C1 and C2 are both powers of 2
-/// where:
-/// C3 = Log(C2) - Log(C1)
-///
-/// This transform handles cases where:
-/// 1. The icmp predicate is inverted
-/// 2. The select operands are reversed
-/// 3. The magnitude of C2 and C1 are flipped
-static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal,
- Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- // Only handle integer compares. Also, if this is a vector select, we need a
- // vector compare.
- if (!TrueVal->getType()->isIntOrIntVectorTy() ||
- TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy())
- return nullptr;
-
- Value *CmpLHS = IC->getOperand(0);
- Value *CmpRHS = IC->getOperand(1);
-
- Value *V;
- unsigned C1Log;
- bool IsEqualZero;
- bool NeedAnd = false;
- if (IC->isEquality()) {
- if (!match(CmpRHS, m_Zero()))
- return nullptr;
-
- const APInt *C1;
- if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
- return nullptr;
-
- V = CmpLHS;
- C1Log = C1->logBase2();
- IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ;
- } else if (IC->getPredicate() == ICmpInst::ICMP_SLT ||
- IC->getPredicate() == ICmpInst::ICMP_SGT) {
- // We also need to recognize (icmp slt (trunc (X)), 0) and
- // (icmp sgt (trunc (X)), -1).
- IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT;
- if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) ||
- (!IsEqualZero && !match(CmpRHS, m_Zero())))
- return nullptr;
-
- if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V)))))
- return nullptr;
-
- C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1;
- NeedAnd = true;
- } else {
- return nullptr;
- }
-
- const APInt *C2;
- bool OrOnTrueVal = false;
- bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
- if (!OrOnFalseVal)
- OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2)));
-
- if (!OrOnFalseVal && !OrOnTrueVal)
- return nullptr;
-
- Value *Y = OrOnFalseVal ? TrueVal : FalseVal;
-
- unsigned C2Log = C2->logBase2();
-
- bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal);
- bool NeedShift = C1Log != C2Log;
- bool NeedZExtTrunc = Y->getType()->getScalarSizeInBits() !=
- V->getType()->getScalarSizeInBits();
-
- // Make sure we don't create more instructions than we save.
- Value *Or = OrOnFalseVal ? FalseVal : TrueVal;
- if ((NeedShift + NeedXor + NeedZExtTrunc) >
- (IC->hasOneUse() + Or->hasOneUse()))
- return nullptr;
-
- if (NeedAnd) {
- // Insert the AND instruction on the input to the truncate.
- APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log);
- V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), C1));
- }
-
- if (C2Log > C1Log) {
- V = Builder.CreateZExtOrTrunc(V, Y->getType());
- V = Builder.CreateShl(V, C2Log - C1Log);
- } else if (C1Log > C2Log) {
- V = Builder.CreateLShr(V, C1Log - C2Log);
- V = Builder.CreateZExtOrTrunc(V, Y->getType());
- } else
- V = Builder.CreateZExtOrTrunc(V, Y->getType());
-
- if (NeedXor)
- V = Builder.CreateXor(V, *C2);
-
- return Builder.CreateOr(V, Y);
-}
-
-/// Canonicalize a set or clear of a masked set of constant bits to
-/// select-of-constants form.
-static Instruction *foldSetClearBits(SelectInst &Sel,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond = Sel.getCondition();
- Value *T = Sel.getTrueValue();
- Value *F = Sel.getFalseValue();
- Type *Ty = Sel.getType();
- Value *X;
- const APInt *NotC, *C;
-
- // Cond ? (X & ~C) : (X | C) --> (X & ~C) | (Cond ? 0 : C)
- if (match(T, m_And(m_Value(X), m_APInt(NotC))) &&
- match(F, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
- Constant *Zero = ConstantInt::getNullValue(Ty);
- Constant *OrC = ConstantInt::get(Ty, *C);
- Value *NewSel = Builder.CreateSelect(Cond, Zero, OrC, "masksel", &Sel);
- return BinaryOperator::CreateOr(T, NewSel);
- }
-
- // Cond ? (X | C) : (X & ~C) --> (X & ~C) | (Cond ? C : 0)
- if (match(F, m_And(m_Value(X), m_APInt(NotC))) &&
- match(T, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
- Constant *Zero = ConstantInt::getNullValue(Ty);
- Constant *OrC = ConstantInt::get(Ty, *C);
- Value *NewSel = Builder.CreateSelect(Cond, OrC, Zero, "masksel", &Sel);
- return BinaryOperator::CreateOr(F, NewSel);
- }
-
- return nullptr;
-}
-
-/// Transform patterns such as (a > b) ? a - b : 0 into usub.sat(a, b).
-/// There are 8 commuted/swapped variants of this pattern.
-/// TODO: Also support a - UMIN(a,b) patterns.
-static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI,
- const Value *TrueVal,
- const Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
- if (!ICmpInst::isUnsigned(Pred))
- return nullptr;
-
- // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0
- if (match(TrueVal, m_Zero())) {
- Pred = ICmpInst::getInversePredicate(Pred);
- std::swap(TrueVal, FalseVal);
- }
- if (!match(FalseVal, m_Zero()))
- return nullptr;
-
- Value *A = ICI->getOperand(0);
- Value *B = ICI->getOperand(1);
- if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) {
- // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0
- std::swap(A, B);
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) &&
- "Unexpected isUnsigned predicate!");
-
- // Ensure the sub is of the form:
- // (a > b) ? a - b : 0 -> usub.sat(a, b)
- // (a > b) ? b - a : 0 -> -usub.sat(a, b)
- // Checking for both a-b and a+(-b) as a constant.
- bool IsNegative = false;
- const APInt *C;
- if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) ||
- (match(A, m_APInt(C)) &&
- match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C)))))
- IsNegative = true;
- else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) &&
- !(match(B, m_APInt(C)) &&
- match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C)))))
- return nullptr;
-
- // If we are adding a negate and the sub and icmp are used anywhere else, we
- // would end up with more instructions.
- if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse())
- return nullptr;
-
- // (a > b) ? a - b : 0 -> usub.sat(a, b)
- // (a > b) ? b - a : 0 -> -usub.sat(a, b)
- Value *Result = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, B);
- if (IsNegative)
- Result = Builder.CreateNeg(Result);
- return Result;
-}
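The pattern recognized above is exactly unsigned saturating subtraction. As a sanity check, the select form agrees with a wrap-detecting formulation of usub.sat on 32-bit values (illustrative sketch only):

#include <cassert>
#include <cstdint>

static uint32_t selForm(uint32_t a, uint32_t b) { return a > b ? a - b : 0; }
static uint32_t satForm(uint32_t a, uint32_t b) {
  uint32_t d = a - b;          // wraps on underflow
  return d > a ? 0 : d;        // a wrapped result is detected and clamped to 0
}

int main() {
  const uint32_t vals[] = {0u, 1u, 2u, 3u, 0x7fffffffu, 0xfffffffeu, 0xffffffffu};
  for (uint32_t a : vals)
    for (uint32_t b : vals)
      assert(selForm(a, b) == satForm(a, b));
  return 0;
}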
-
-static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal,
- InstCombiner::BuilderTy &Builder) {
- if (!Cmp->hasOneUse())
- return nullptr;
-
- // Match unsigned saturated add with constant.
- Value *Cmp0 = Cmp->getOperand(0);
- Value *Cmp1 = Cmp->getOperand(1);
- ICmpInst::Predicate Pred = Cmp->getPredicate();
- Value *X;
- const APInt *C, *CmpC;
- if (Pred == ICmpInst::ICMP_ULT &&
- match(TVal, m_Add(m_Value(X), m_APInt(C))) && X == Cmp0 &&
- match(FVal, m_AllOnes()) && match(Cmp1, m_APInt(CmpC)) && *CmpC == ~*C) {
- // (X u< ~C) ? (X + C) : -1 --> uadd.sat(X, C)
- return Builder.CreateBinaryIntrinsic(
- Intrinsic::uadd_sat, X, ConstantInt::get(X->getType(), *C));
- }
-
- // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
- // There are 8 commuted variants.
- // Canonicalize -1 (saturated result) to true value of the select.
- if (match(FVal, m_AllOnes())) {
- std::swap(TVal, FVal);
- Pred = CmpInst::getInversePredicate(Pred);
- }
- if (!match(TVal, m_AllOnes()))
- return nullptr;
-
- // Canonicalize predicate to less-than or less-or-equal-than.
- if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) {
- std::swap(Cmp0, Cmp1);
- Pred = CmpInst::getSwappedPredicate(Pred);
- }
- if (Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_ULE)
- return nullptr;
-
- // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
- // Strictness of the comparison is irrelevant.
- Value *Y;
- if (match(Cmp0, m_Not(m_Value(X))) &&
- match(FVal, m_c_Add(m_Specific(X), m_Value(Y))) && Y == Cmp1) {
- // (~X u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
- // (~X u< Y) ? -1 : (Y + X) --> uadd.sat(X, Y)
- return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, X, Y);
- }
- // The 'not' op may be included in the sum but not the compare.
- // Strictness of the comparison is irrelevant.
- X = Cmp0;
- Y = Cmp1;
- if (match(FVal, m_c_Add(m_Not(m_Specific(X)), m_Specific(Y)))) {
- // (X u< Y) ? -1 : (~X + Y) --> uadd.sat(~X, Y)
- // (X u< Y) ? -1 : (Y + ~X) --> uadd.sat(Y, ~X)
- BinaryOperator *BO = cast<BinaryOperator>(FVal);
- return Builder.CreateBinaryIntrinsic(
- Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1));
- }
- // The overflow may be detected via the add wrapping round.
- // This is only valid for strict comparison!
- if (Pred == ICmpInst::ICMP_ULT &&
- match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) &&
- match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) {
- // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y)
- // ((X + Y) u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
- return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, Cmp1, Y);
- }
-
- return nullptr;
-}
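For the constant case above, 'X u< ~C' is precisely the condition under which X + C does not wrap, so the select is a saturating add. A small check against a wrap-detecting uadd.sat with C = 100 (names and constants are illustrative):

#include <cassert>
#include <cstdint>

// uadd.sat via overflow detection on 32-bit values.
static uint32_t uaddSat(uint32_t x, uint32_t c) {
  uint32_t s = x + c;          // wraps on overflow
  return s < x ? 0xffffffffu : s;
}

int main() {
  const uint32_t C = 100;
  const uint32_t vals[] = {0u, 1u, 0xffffff00u, 0xffffff9bu, 0xffffff9cu, 0xffffffffu};
  for (uint32_t x : vals)
    assert((x < ~C ? x + C : 0xffffffffu) == uaddSat(x, C));
  return 0;
}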
-
-/// Fold the following code sequence:
-/// \code
-/// int a = ctlz(x & -x);
-/// x ? 31 - a : a;
-/// \endcode
-///
-/// into:
-/// cttz(x)
-static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal,
- Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits();
- if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero()))
- return nullptr;
-
- if (ICI->getPredicate() == ICmpInst::ICMP_NE)
- std::swap(TrueVal, FalseVal);
-
- if (!match(FalseVal,
- m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1))))
- return nullptr;
-
- if (!match(TrueVal, m_Intrinsic<Intrinsic::ctlz>()))
- return nullptr;
-
- Value *X = ICI->getOperand(0);
- auto *II = cast<IntrinsicInst>(TrueVal);
- if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X)))))
- return nullptr;
-
- Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz,
- II->getType());
- return CallInst::Create(F, {X, II->getArgOperand(1)});
-}
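The fold above rests on the identity that, for nonzero x, counting the leading zeros of the isolated lowest set bit and subtracting from 31 gives the trailing-zero count. A standalone check using the C++20 <bit> helpers (an assumption of this sketch; the pass itself works on the LLVM intrinsics):

#include <bit>       // std::countl_zero / std::countr_zero, C++20
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vals[] = {1u, 2u, 3u, 8u, 0x00010000u, 0x80000000u, 0xdeadbeefu};
  for (uint32_t x : vals) {
    uint32_t lowbit = x & (0u - x);                    // isolate lowest set bit
    unsigned a = (unsigned)std::countl_zero(lowbit);   // ctlz(x & -x)
    assert(31u - a == (unsigned)std::countr_zero(x));  // == cttz(x)
  }
  return 0;
}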
-
-/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
-/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
-///
-/// For example, we can fold the following code sequence:
-/// \code
-/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
-/// %1 = icmp ne i32 %x, 0
-/// %2 = select i1 %1, i32 %0, i32 32
-/// \endcode
-///
-/// into:
-/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *CmpLHS = ICI->getOperand(0);
- Value *CmpRHS = ICI->getOperand(1);
-
- // Check if the condition value compares a value for equality against zero.
- if (!ICI->isEquality() || !match(CmpRHS, m_Zero()))
- return nullptr;
-
- Value *SelectArg = FalseVal;
- Value *ValueOnZero = TrueVal;
- if (Pred == ICmpInst::ICMP_NE)
- std::swap(SelectArg, ValueOnZero);
-
- // Skip zero extend/truncate.
- Value *Count = nullptr;
- if (!match(SelectArg, m_ZExt(m_Value(Count))) &&
- !match(SelectArg, m_Trunc(m_Value(Count))))
- Count = SelectArg;
-
- // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the
- // input to the cttz/ctlz is used as LHS for the compare instruction.
- if (!match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) &&
- !match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS))))
- return nullptr;
-
- IntrinsicInst *II = cast<IntrinsicInst>(Count);
-
- // Check if the value propagated on zero is a constant number equal to the
- // sizeof in bits of 'Count'.
- unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
- if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) {
- // Explicitly clear the 'undef_on_zero' flag. It's always valid to go from
- // true to false on this flag, so we can replace it for all users.
- II->setArgOperand(1, ConstantInt::getFalse(II->getContext()));
- return SelectArg;
- }
-
- // The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional
- // zext/trunc) have one use (ending at the select), the cttz/ctlz result will
- // not be used if the input is zero. Relax to 'undef_on_zero' for that case.
- if (II->hasOneUse() && SelectArg->hasOneUse() &&
- !match(II->getArgOperand(1), m_One()))
- II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
-
- return nullptr;
-}
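The same shape can be seen with C++20's std::countr_zero, which is already defined to return the bit width for a zero input: the explicit zero guard is redundant once the zero case is well defined, which is what clearing the flag achieves. Illustrative sketch only:

#include <bit>       // std::countr_zero, C++20: returns 32 for a zero uint32_t
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vals[] = {0u, 1u, 6u, 0x40u, 0x80000000u};
  for (uint32_t x : vals) {
    unsigned guarded = (x != 0) ? (unsigned)std::countr_zero(x) : 32u;
    assert(guarded == (unsigned)std::countr_zero(x));
  }
  return 0;
}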
-
-/// Return true if we find and adjust an icmp+select pattern where the compare
-/// is with a constant that can be incremented or decremented to match the
-/// minimum or maximum idiom.
-static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *CmpLHS = Cmp.getOperand(0);
- Value *CmpRHS = Cmp.getOperand(1);
- Value *TrueVal = Sel.getTrueValue();
- Value *FalseVal = Sel.getFalseValue();
-
- // We may move or edit the compare, so make sure the select is the only user.
- const APInt *CmpC;
- if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC)))
- return false;
-
- // These transforms only work for selects of integers or vector selects of
- // integer vectors.
- Type *SelTy = Sel.getType();
- auto *SelEltTy = dyn_cast<IntegerType>(SelTy->getScalarType());
- if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy())
- return false;
-
- Constant *AdjustedRHS;
- if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
- AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1);
- else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
- AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1);
- else
- return false;
-
- // X > C ? X : C+1 --> X < C+1 ? C+1 : X
- // X < C ? X : C-1 --> X > C-1 ? C-1 : X
- if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
- (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
- ; // Nothing to do here. Values match without any sign/zero extension.
- }
- // Types do not match. Instead of calculating this with mixed types, promote
- // all to the larger type. This enables scalar evolution to analyze this
- // expression.
- else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) {
- Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy);
-
- // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
- // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
- // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
- // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
- if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) {
- CmpLHS = TrueVal;
- AdjustedRHS = SextRHS;
- } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
- SextRHS == TrueVal) {
- CmpLHS = FalseVal;
- AdjustedRHS = SextRHS;
- } else if (Cmp.isUnsigned()) {
- Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy);
- // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
- // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
- // zext + signed compare cannot be changed:
- // 0xff <s 0x00, but 0x00ff >s 0x0000
- if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) {
- CmpLHS = TrueVal;
- AdjustedRHS = ZextRHS;
- } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
- ZextRHS == TrueVal) {
- CmpLHS = FalseVal;
- AdjustedRHS = ZextRHS;
- } else {
- return false;
- }
- } else {
- return false;
- }
- } else {
- return false;
- }
-
- Pred = ICmpInst::getSwappedPredicate(Pred);
- CmpRHS = AdjustedRHS;
- std::swap(FalseVal, TrueVal);
- Cmp.setPredicate(Pred);
- Cmp.setOperand(0, CmpLHS);
- Cmp.setOperand(1, CmpRHS);
- Sel.setOperand(1, TrueVal);
- Sel.setOperand(2, FalseVal);
- Sel.swapProfMetadata();
-
- // Move the compare instruction right before the select instruction. Otherwise
- // the sext/zext value may be defined after the compare instruction uses it.
- Cmp.moveBefore(&Sel);
-
- return true;
-}
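The adjustment above relies on both forms computing the same max/min; for example, as long as C + 1 does not wrap, 'X > C ? X : C+1' and 'X < C+1 ? C+1 : X' are both max(X, C+1). A quick standalone check over a small signed range (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t c = -4; c <= 4; ++c)
    for (int32_t x = -8; x <= 8; ++x) {
      // X > C ? X : C+1   <-->   X < C+1 ? C+1 : X
      int32_t lhs = x > c ? x : c + 1;
      int32_t rhs = x < c + 1 ? c + 1 : x;
      assert(lhs == rhs);
    }
  return 0;
}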
-
-/// If this is an integer min/max (icmp + select) with a constant operand,
-/// create the canonical icmp for the min/max operation and canonicalize the
-/// constant to the 'false' operand of the select:
-/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
-/// Note: if C1 != C2, this will change the icmp constant to the existing
-/// constant operand of the select.
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp);
+ NewSel->takeName(FVI);
+ BinaryOperator *BO = BinaryOperator::Create(FVI->getOpcode(),
+ TrueVal, NewSel);
+ BO->copyIRFlags(FVI);
+ return BO;
+ }
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
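The fold above substitutes the binop's identity constant on the arm where the binop disappears; for add that is 0. A small standalone check (illustrative names):

#include <cassert>
#include <cstdint>

// select c, (x + y), x   ==   x + (select c, y, 0)
int main() {
  for (int c = 0; c <= 1; ++c)
    for (int32_t x = -4; x <= 4; ++x)
      for (int32_t y = -4; y <= 4; ++y)
        assert((c ? x + y : x) == x + (c ? y : 0));
  return 0;
}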
+
+/// We want to turn:
+/// (select (icmp eq (and X, Y), 0), (and (lshr X, Z), 1), 1)
+/// into:
+/// zext (icmp ne i32 (and X, (or Y, (shl 1, Z))), 0)
+/// Note:
+/// Z may be 0 if lshr is missing.
+/// The worst-case scenario is that we will replace 5 instructions with 5
+/// different instructions, but we get rid of the select.
+static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp,
+ Value *TVal, Value *FVal,
+ InstCombiner::BuilderTy &Builder) {
+ if (!(Cmp->hasOneUse() && Cmp->getOperand(0)->hasOneUse() &&
+ Cmp->getPredicate() == ICmpInst::ICMP_EQ &&
+ match(Cmp->getOperand(1), m_Zero()) && match(FVal, m_One())))
+ return nullptr;
+
+  // The TrueVal has the general form: and %B, 1
+ Value *B;
+ if (!match(TVal, m_OneUse(m_And(m_Value(B), m_One()))))
+ return nullptr;
+
+ // Where %B may be optionally shifted: lshr %X, %Z.
+ Value *X, *Z;
+ const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z))));
+ if (!HasShift)
+ X = B;
+
+ Value *Y;
+ if (!match(Cmp->getOperand(0), m_c_And(m_Specific(X), m_Value(Y))))
+ return nullptr;
+
+ // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 --> (X & (Y | (1 << Z))) != 0
+ // ((X & Y) == 0) ? (X & 1) : 1 --> (X & (Y | 1)) != 0
+ Constant *One = ConstantInt::get(SelType, 1);
+ Value *MaskB = HasShift ? Builder.CreateShl(One, Z) : One;
+ Value *FullMask = Builder.CreateOr(Y, MaskB);
+ Value *MaskedX = Builder.CreateAnd(X, FullMask);
+ Value *ICmpNeZero = Builder.CreateIsNotNull(MaskedX);
+ return new ZExtInst(ICmpNeZero, SelType);
+}
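The replacement above merges the two masks into one test; a brute-force check of the stated equivalence over small operands (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 16; ++x)
    for (uint32_t y = 0; y < 16; ++y)
      for (uint32_t z = 0; z < 4; ++z) {
        uint32_t sel = ((x & y) == 0) ? ((x >> z) & 1u) : 1u;
        uint32_t folded = ((x & (y | (1u << z))) != 0) ? 1u : 0u;
        assert(sel == folded);
      }
  return 0;
}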
+
+/// We want to turn:
+/// (select (icmp sgt x, C), lshr (X, Y), ashr (X, Y)); iff C s>= -1
+/// (select (icmp slt x, C), ashr (X, Y), lshr (X, Y)); iff C s>= 0
+/// into:
+/// ashr (X, Y)
+static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal,
+ Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = IC->getPredicate();
+ Value *CmpLHS = IC->getOperand(0);
+ Value *CmpRHS = IC->getOperand(1);
+ if (!CmpRHS->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ Value *X, *Y;
+ unsigned Bitwidth = CmpRHS->getType()->getScalarSizeInBits();
+ if ((Pred != ICmpInst::ICMP_SGT ||
+ !match(CmpRHS,
+ m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, -1)))) &&
+ (Pred != ICmpInst::ICMP_SLT ||
+ !match(CmpRHS,
+ m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, 0)))))
+ return nullptr;
+
+ // Canonicalize so that ashr is in FalseVal.
+ if (Pred == ICmpInst::ICMP_SLT)
+ std::swap(TrueVal, FalseVal);
+
+ if (match(TrueVal, m_LShr(m_Value(X), m_Value(Y))) &&
+ match(FalseVal, m_AShr(m_Specific(X), m_Specific(Y))) &&
+ match(CmpLHS, m_Specific(X))) {
+ const auto *Ashr = cast<Instruction>(FalseVal);
+ // if lshr is not exact and ashr is, this new ashr must not be exact.
+ bool IsExact = Ashr->isExact() && cast<Instruction>(TrueVal)->isExact();
+ return Builder.CreateAShr(X, Y, IC->getName(), IsExact);
+ }
+
+ return nullptr;
+}
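The fold is valid because a logical and an arithmetic right shift agree whenever the shifted value is non-negative, so the select always produces the ashr result. A standalone check (note: signed >> is guaranteed arithmetic only from C++20; earlier standards leave it implementation-defined, so treat this as a sketch):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t vals[] = {-7, -1, 0, 1, 5, 1 << 30};
  for (int32_t x : vals)
    for (unsigned y = 0; y < 8; ++y) {
      int32_t lshr = (int32_t)((uint32_t)x >> y);
      int32_t ashr = x >> y;
      int32_t sel = (x > -1) ? lshr : ashr;
      assert(sel == ashr);
    }
  return 0;
}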
+
+/// We want to turn:
+/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
+/// into:
+/// (or (shl (and X, C1), C3), Y)
+/// iff:
+/// C1 and C2 are both powers of 2
+/// where:
+/// C3 = Log(C2) - Log(C1)
+///
+/// This transform handles cases where:
+/// 1. The icmp predicate is inverted
+/// 2. The select operands are reversed
+/// 3. The magnitude of C2 and C1 are flipped
+static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal,
+ Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ // Only handle integer compares. Also, if this is a vector select, we need a
+ // vector compare.
+ if (!TrueVal->getType()->isIntOrIntVectorTy() ||
+ TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy())
+ return nullptr;
+
+ Value *CmpLHS = IC->getOperand(0);
+ Value *CmpRHS = IC->getOperand(1);
+
+ Value *V;
+ unsigned C1Log;
+ bool IsEqualZero;
+ bool NeedAnd = false;
+ if (IC->isEquality()) {
+ if (!match(CmpRHS, m_Zero()))
+ return nullptr;
+
+ const APInt *C1;
+ if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
+ return nullptr;
+
+ V = CmpLHS;
+ C1Log = C1->logBase2();
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ;
+ } else if (IC->getPredicate() == ICmpInst::ICMP_SLT ||
+ IC->getPredicate() == ICmpInst::ICMP_SGT) {
+ // We also need to recognize (icmp slt (trunc (X)), 0) and
+ // (icmp sgt (trunc (X)), -1).
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT;
+ if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) ||
+ (!IsEqualZero && !match(CmpRHS, m_Zero())))
+ return nullptr;
+
+ if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V)))))
+ return nullptr;
+
+ C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1;
+ NeedAnd = true;
+ } else {
+ return nullptr;
+ }
+
+ const APInt *C2;
+ bool OrOnTrueVal = false;
+ bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
+ if (!OrOnFalseVal)
+ OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2)));
+
+ if (!OrOnFalseVal && !OrOnTrueVal)
+ return nullptr;
+
+ Value *Y = OrOnFalseVal ? TrueVal : FalseVal;
+
+ unsigned C2Log = C2->logBase2();
+
+ bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal);
+ bool NeedShift = C1Log != C2Log;
+ bool NeedZExtTrunc = Y->getType()->getScalarSizeInBits() !=
+ V->getType()->getScalarSizeInBits();
+
+ // Make sure we don't create more instructions than we save.
+ Value *Or = OrOnFalseVal ? FalseVal : TrueVal;
+ if ((NeedShift + NeedXor + NeedZExtTrunc) >
+ (IC->hasOneUse() + Or->hasOneUse()))
+ return nullptr;
+
+ if (NeedAnd) {
+ // Insert the AND instruction on the input to the truncate.
+ APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log);
+ V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), C1));
+ }
+
+ if (C2Log > C1Log) {
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+ V = Builder.CreateShl(V, C2Log - C1Log);
+ } else if (C1Log > C2Log) {
+ V = Builder.CreateLShr(V, C1Log - C2Log);
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+ } else
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+
+ if (NeedXor)
+ V = Builder.CreateXor(V, *C2);
+
+ return Builder.CreateOr(V, Y);
+}
+
+/// Canonicalize a set or clear of a masked set of constant bits to
+/// select-of-constants form.
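+///
+/// For example, with C = 8 (one possible instantiation of the folds below):
+///   Cond ? (X & ~8) : (X | 8) --> (X & ~8) | (Cond ? 0 : 8)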
+static Instruction *foldSetClearBits(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *T = Sel.getTrueValue();
+ Value *F = Sel.getFalseValue();
+ Type *Ty = Sel.getType();
+ Value *X;
+ const APInt *NotC, *C;
+
+ // Cond ? (X & ~C) : (X | C) --> (X & ~C) | (Cond ? 0 : C)
+ if (match(T, m_And(m_Value(X), m_APInt(NotC))) &&
+ match(F, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Constant *OrC = ConstantInt::get(Ty, *C);
+ Value *NewSel = Builder.CreateSelect(Cond, Zero, OrC, "masksel", &Sel);
+ return BinaryOperator::CreateOr(T, NewSel);
+ }
+
+ // Cond ? (X | C) : (X & ~C) --> (X & ~C) | (Cond ? C : 0)
+ if (match(F, m_And(m_Value(X), m_APInt(NotC))) &&
+ match(T, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Constant *OrC = ConstantInt::get(Ty, *C);
+ Value *NewSel = Builder.CreateSelect(Cond, OrC, Zero, "masksel", &Sel);
+ return BinaryOperator::CreateOr(F, NewSel);
+ }
+
+ return nullptr;
+}
+
+/// Transform patterns such as (a > b) ? a - b : 0 into usub.sat(a, b).
+/// There are 8 commuted/swapped variants of this pattern.
+/// TODO: Also support a - UMIN(a,b) patterns.
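+///
+/// A minimal sketch of the basic case (%a and %b are placeholder operands):
+///   %cmp = icmp ugt i32 %a, %b
+///   %sub = sub i32 %a, %b
+///   %sel = select i1 %cmp, i32 %sub, i32 0
+/// becomes %sel = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b).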
+static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI,
+ const Value *TrueVal,
+ const Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (!ICmpInst::isUnsigned(Pred))
+ return nullptr;
+
+ // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0
+ if (match(TrueVal, m_Zero())) {
+ Pred = ICmpInst::getInversePredicate(Pred);
+ std::swap(TrueVal, FalseVal);
+ }
+ if (!match(FalseVal, m_Zero()))
+ return nullptr;
+
+ Value *A = ICI->getOperand(0);
+ Value *B = ICI->getOperand(1);
+ if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) {
+ // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0
+ std::swap(A, B);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) &&
+ "Unexpected isUnsigned predicate!");
+
+ // Ensure the sub is of the form:
+ // (a > b) ? a - b : 0 -> usub.sat(a, b)
+ // (a > b) ? b - a : 0 -> -usub.sat(a, b)
+ // Check for both the a-b form and the a+(-b) form (when b is a constant).
+ bool IsNegative = false;
+ const APInt *C;
+ if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) ||
+ (match(A, m_APInt(C)) &&
+ match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C)))))
+ IsNegative = true;
+ else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) &&
+ !(match(B, m_APInt(C)) &&
+ match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C)))))
+ return nullptr;
+
+ // If we are adding a negate and the sub and icmp are used anywhere else, we
+ // would end up with more instructions.
+ if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse())
+ return nullptr;
+
+ // (a > b) ? a - b : 0 -> usub.sat(a, b)
+ // (a > b) ? b - a : 0 -> -usub.sat(a, b)
+ Value *Result = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, B);
+ if (IsNegative)
+ Result = Builder.CreateNeg(Result);
+ return Result;
+}
+
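+/// Try to match one of the unsigned saturated-add idioms handled below, e.g.
+/// (an illustrative i8 instance, since ~240 == 15):
+///   (X u< 15) ? (X + 240) : -1 --> uadd.sat(X, 240)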
+static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal,
+ InstCombiner::BuilderTy &Builder) {
+ if (!Cmp->hasOneUse())
+ return nullptr;
+
+ // Match unsigned saturated add with constant.
+ Value *Cmp0 = Cmp->getOperand(0);
+ Value *Cmp1 = Cmp->getOperand(1);
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
+ Value *X;
+ const APInt *C, *CmpC;
+ if (Pred == ICmpInst::ICMP_ULT &&
+ match(TVal, m_Add(m_Value(X), m_APInt(C))) && X == Cmp0 &&
+ match(FVal, m_AllOnes()) && match(Cmp1, m_APInt(CmpC)) && *CmpC == ~*C) {
+ // (X u< ~C) ? (X + C) : -1 --> uadd.sat(X, C)
+ return Builder.CreateBinaryIntrinsic(
+ Intrinsic::uadd_sat, X, ConstantInt::get(X->getType(), *C));
+ }
+
+ // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
+ // There are 8 commuted variants.
+ // Canonicalize -1 (saturated result) to true value of the select.
+ if (match(FVal, m_AllOnes())) {
+ std::swap(TVal, FVal);
+ Pred = CmpInst::getInversePredicate(Pred);
+ }
+ if (!match(TVal, m_AllOnes()))
+ return nullptr;
+
+ // Canonicalize the predicate to less-than or less-than-or-equal.
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) {
+ std::swap(Cmp0, Cmp1);
+ Pred = CmpInst::getSwappedPredicate(Pred);
+ }
+ if (Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_ULE)
+ return nullptr;
+
+ // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
+ // Strictness of the comparison is irrelevant.
+ Value *Y;
+ if (match(Cmp0, m_Not(m_Value(X))) &&
+ match(FVal, m_c_Add(m_Specific(X), m_Value(Y))) && Y == Cmp1) {
+ // (~X u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
+ // (~X u< Y) ? -1 : (Y + X) --> uadd.sat(X, Y)
+ return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, X, Y);
+ }
+ // The 'not' op may be included in the sum but not the compare.
+ // Strictness of the comparison is irrelevant.
+ X = Cmp0;
+ Y = Cmp1;
+ if (match(FVal, m_c_Add(m_Not(m_Specific(X)), m_Specific(Y)))) {
+ // (X u< Y) ? -1 : (~X + Y) --> uadd.sat(~X, Y)
+ // (X u< Y) ? -1 : (Y + ~X) --> uadd.sat(Y, ~X)
+ BinaryOperator *BO = cast<BinaryOperator>(FVal);
+ return Builder.CreateBinaryIntrinsic(
+ Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1));
+ }
+ // The overflow may be detected via the add wrapping round.
+ // This is only valid for strict comparison!
+ if (Pred == ICmpInst::ICMP_ULT &&
+ match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) &&
+ match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) {
+ // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y)
+ // ((X + Y) u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
+ return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, Cmp1, Y);
+ }
+
+ return nullptr;
+}
+
+/// Fold the following code sequence:
+/// \code
+/// int a = ctlz(x & -x);
+/// x ? 31 - a : a;
+/// \endcode
+///
+/// into:
+/// cttz(x)
+static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal,
+ Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits();
+ if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero()))
+ return nullptr;
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_NE)
+ std::swap(TrueVal, FalseVal);
+
+ if (!match(FalseVal,
+ m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1))))
+ return nullptr;
+
+ if (!match(TrueVal, m_Intrinsic<Intrinsic::ctlz>()))
+ return nullptr;
+
+ Value *X = ICI->getOperand(0);
+ auto *II = cast<IntrinsicInst>(TrueVal);
+ if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X)))))
+ return nullptr;
+
+ Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz,
+ II->getType());
+ return CallInst::Create(F, {X, II->getArgOperand(1)});
+}
+
+/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
+/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
+///
+/// For example, we can fold the following code sequence:
+/// \code
+/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+/// %1 = icmp ne i32 %x, 0
+/// %2 = select i1 %1, i32 %0, i32 32
+/// \endcode
+///
+/// into:
+/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+
+ // Check if the condition value compares a value for equality against zero.
+ if (!ICI->isEquality() || !match(CmpRHS, m_Zero()))
+ return nullptr;
+
+ Value *SelectArg = FalseVal;
+ Value *ValueOnZero = TrueVal;
+ if (Pred == ICmpInst::ICMP_NE)
+ std::swap(SelectArg, ValueOnZero);
+
+ // Skip zero extend/truncate.
+ Value *Count = nullptr;
+ if (!match(SelectArg, m_ZExt(m_Value(Count))) &&
+ !match(SelectArg, m_Trunc(m_Value(Count))))
+ Count = SelectArg;
+
+ // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the
+ // input to the cttz/ctlz is used as LHS for the compare instruction.
+ if (!match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) &&
+ !match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS))))
+ return nullptr;
+
+ IntrinsicInst *II = cast<IntrinsicInst>(Count);
+
+ // Check if the value propagated on zero is a constant equal to the size in
+ // bits of 'Count'.
+ unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
+ if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) {
+ // Explicitly clear the 'is_zero_undef' flag. It's always valid to go from
+ // true to false on this flag, so we can replace it for all users.
+ II->setArgOperand(1, ConstantInt::getFalse(II->getContext()));
+ return SelectArg;
+ }
+
+ // The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional
+ // zext/trunc) have one use (ending at the select), the cttz/ctlz result will
+ // not be used if the input is zero. Relax to 'is_zero_undef' for that case.
+ if (II->hasOneUse() && SelectArg->hasOneUse() &&
+ !match(II->getArgOperand(1), m_One()))
+ II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
+
+ return nullptr;
+}
+
+/// Return true if we find and adjust an icmp+select pattern where the compare
+/// is with a constant that can be incremented or decremented to match the
+/// minimum or maximum idiom.
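+///
+/// For example (a sketch with a concrete constant):
+///   (X u> 5) ? X : 6 is rewritten in place to (X u< 6) ? 6 : X
+/// so the compare constant matches the select arm and the min/max idiom
+/// becomes explicit.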
+static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *CmpLHS = Cmp.getOperand(0);
+ Value *CmpRHS = Cmp.getOperand(1);
+ Value *TrueVal = Sel.getTrueValue();
+ Value *FalseVal = Sel.getFalseValue();
+
+ // We may move or edit the compare, so make sure the select is the only user.
+ const APInt *CmpC;
+ if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC)))
+ return false;
+
+ // These transforms only work for selects of integers or vector selects of
+ // integer vectors.
+ Type *SelTy = Sel.getType();
+ auto *SelEltTy = dyn_cast<IntegerType>(SelTy->getScalarType());
+ if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy())
+ return false;
+
+ Constant *AdjustedRHS;
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1);
+ else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1);
+ else
+ return false;
+
+ // X > C ? X : C+1 --> X < C+1 ? C+1 : X
+ // X < C ? X : C-1 --> X > C-1 ? C-1 : X
+ if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+ (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+ ; // Nothing to do here. Values match without any sign/zero extension.
+ }
+ // Types do not match. Instead of calculating this with mixed types, promote
+ // all to the larger type. This enables scalar evolution to analyze this
+ // expression.
+ else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) {
+ Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy);
+
+ // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
+ // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
+ // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
+ // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
+ if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = SextRHS;
+ } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
+ SextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = SextRHS;
+ } else if (Cmp.isUnsigned()) {
+ Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy);
+ // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
+ // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
+ // zext + signed compare cannot be changed:
+ // 0xff <s 0x00, but 0x00ff >s 0x0000
+ if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = ZextRHS;
+ } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
+ ZextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = ZextRHS;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ CmpRHS = AdjustedRHS;
+ std::swap(FalseVal, TrueVal);
+ Cmp.setPredicate(Pred);
+ Cmp.setOperand(0, CmpLHS);
+ Cmp.setOperand(1, CmpRHS);
+ Sel.setOperand(1, TrueVal);
+ Sel.setOperand(2, FalseVal);
+ Sel.swapProfMetadata();
+
+ // Move the compare instruction right before the select instruction. Otherwise
+ // the sext/zext value may be defined after the compare instruction uses it.
+ Cmp.moveBefore(&Sel);
+
+ return true;
+}
+
+/// If this is an integer min/max (icmp + select) with a constant operand,
+/// create the canonical icmp for the min/max operation and canonicalize the
+/// constant to the 'false' operand of the select:
+/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
+/// Note: if C1 != C2, this will change the icmp constant to the existing
+/// constant operand of the select.
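+///
+/// For example, smax(X, 42) written as
+///   select (icmp slt X, 42), 42, X
+/// is canonicalized to
+///   select (icmp sgt X, 42), X, 42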
static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel,
ICmpInst &Cmp,
InstCombinerImpl &IC) {
- if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
- return nullptr;
-
- // Canonicalize the compare predicate based on whether we have min or max.
- Value *LHS, *RHS;
- SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
- if (!SelectPatternResult::isMinOrMax(SPR.Flavor))
- return nullptr;
-
- // Is this already canonical?
- ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor);
- if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
- Cmp.getPredicate() == CanonicalPred)
- return nullptr;
-
- // Bail out on unsimplified X-0 operand (due to some worklist management bug),
- // as this may cause an infinite combine loop. Let the sub be folded first.
- if (match(LHS, m_Sub(m_Value(), m_Zero())) ||
- match(RHS, m_Sub(m_Value(), m_Zero())))
- return nullptr;
-
- // Create the canonical compare and plug it into the select.
- IC.replaceOperand(Sel, 0, IC.Builder.CreateICmp(CanonicalPred, LHS, RHS));
-
- // If the select operands did not change, we're done.
- if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
- return &Sel;
-
- // If we are swapping the select operands, swap the metadata too.
- assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
- "Unexpected results from matchSelectPattern");
- Sel.swapValues();
- Sel.swapProfMetadata();
- return &Sel;
-}
-
-static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
+ return nullptr;
+
+ // Canonicalize the compare predicate based on whether we have min or max.
+ Value *LHS, *RHS;
+ SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
+ if (!SelectPatternResult::isMinOrMax(SPR.Flavor))
+ return nullptr;
+
+ // Is this already canonical?
+ ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor);
+ if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
+ Cmp.getPredicate() == CanonicalPred)
+ return nullptr;
+
+ // Bail out on unsimplified X-0 operand (due to some worklist management bug),
+ // as this may cause an infinite combine loop. Let the sub be folded first.
+ if (match(LHS, m_Sub(m_Value(), m_Zero())) ||
+ match(RHS, m_Sub(m_Value(), m_Zero())))
+ return nullptr;
+
+ // Create the canonical compare and plug it into the select.
+ IC.replaceOperand(Sel, 0, IC.Builder.CreateICmp(CanonicalPred, LHS, RHS));
+
+ // If the select operands did not change, we're done.
+ if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
+ return &Sel;
+
+ // If we are swapping the select operands, swap the metadata too.
+ assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
+ "Unexpected results from matchSelectPattern");
+ Sel.swapValues();
+ Sel.swapProfMetadata();
+ return &Sel;
+}
+
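+/// Canonicalize abs/nabs select patterns to the llvm.abs intrinsic. As an
+/// illustrative sketch of the plain abs case (without nsw on the negation):
+///   (X s< 0) ? (0 - X) : X --> call @llvm.abs(X, i1 false)
+/// The negated-abs form additionally gets an explicit negation of the abs.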
+static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
InstCombinerImpl &IC) {
- if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
- return nullptr;
-
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
- if (SPF != SelectPatternFlavor::SPF_ABS &&
- SPF != SelectPatternFlavor::SPF_NABS)
- return nullptr;
-
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
+ return nullptr;
+
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
+ if (SPF != SelectPatternFlavor::SPF_ABS &&
+ SPF != SelectPatternFlavor::SPF_NABS)
+ return nullptr;
+
// Note that NSW flag can only be propagated for normal, non-negated abs!
bool IntMinIsPoison = SPF == SelectPatternFlavor::SPF_ABS &&
match(RHS, m_NSWNeg(m_Specific(LHS)));
@@ -1070,45 +1070,45 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison);
Instruction *Abs =
IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC);
-
+
if (SPF == SelectPatternFlavor::SPF_NABS)
return BinaryOperator::CreateNeg(Abs); // Always without NSW flag!
-
+
return IC.replaceInstUsesWith(Sel, Abs);
-}
-
-/// If we have a select with an equality comparison, then we know the value in
-/// one of the arms of the select. See if substituting this value into an arm
-/// and simplifying the result yields the same value as the other arm.
-///
-/// To make this transform safe, we must drop poison-generating flags
-/// (nsw, etc) if we simplified to a binop because the select may be guarding
-/// that poison from propagating. If the existing binop already had no
-/// poison-generating flags, then this transform can be done by instsimplify.
-///
-/// Consider:
-/// %cmp = icmp eq i32 %x, 2147483647
-/// %add = add nsw i32 %x, 1
-/// %sel = select i1 %cmp, i32 -2147483648, i32 %add
-///
-/// We can't replace %sel with %add unless we strip away the flags.
-/// TODO: Wrapping flags could be preserved in some cases with better analysis.
+}
+
+/// If we have a select with an equality comparison, then we know the value in
+/// one of the arms of the select. See if substituting this value into an arm
+/// and simplifying the result yields the same value as the other arm.
+///
+/// To make this transform safe, we must drop poison-generating flags
+/// (nsw, etc) if we simplified to a binop because the select may be guarding
+/// that poison from propagating. If the existing binop already had no
+/// poison-generating flags, then this transform can be done by instsimplify.
+///
+/// Consider:
+/// %cmp = icmp eq i32 %x, 2147483647
+/// %add = add nsw i32 %x, 1
+/// %sel = select i1 %cmp, i32 -2147483648, i32 %add
+///
+/// We can't replace %sel with %add unless we strip away the flags.
+/// TODO: Wrapping flags could be preserved in some cases with better analysis.
Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
ICmpInst &Cmp) {
// Value equivalence substitution requires an all-or-nothing replacement.
// It does not make sense for a vector compare where each lane is chosen
// independently.
if (!Cmp.isEquality() || Cmp.getType()->isVectorTy())
- return nullptr;
-
- // Canonicalize the pattern to ICMP_EQ by swapping the select operands.
- Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue();
+ return nullptr;
+
+ // Canonicalize the pattern to ICMP_EQ by swapping the select operands.
+ Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue();
bool Swapped = false;
if (Cmp.getPredicate() == ICmpInst::ICMP_NE) {
- std::swap(TrueVal, FalseVal);
+ std::swap(TrueVal, FalseVal);
Swapped = true;
}
-
+
// In X == Y ? f(X) : Z, try to evaluate f(Y) and replace the operand.
// Make sure Y cannot be undef though, as we might pick different values for
// undef in the icmp and in f(Y). Additionally, take care to avoid replacing
@@ -1143,1145 +1143,1145 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
/* AllowRefinement */ true))
return replaceOperand(Sel, Swapped ? 2 : 1, V);
- auto *FalseInst = dyn_cast<Instruction>(FalseVal);
- if (!FalseInst)
- return nullptr;
-
- // InstSimplify already performed this fold if it was possible subject to
- // current poison-generating flags. Try the transform again with
- // poison-generating flags temporarily dropped.
+ auto *FalseInst = dyn_cast<Instruction>(FalseVal);
+ if (!FalseInst)
+ return nullptr;
+
+ // InstSimplify already performed this fold if it was possible subject to
+ // current poison-generating flags. Try the transform again with
+ // poison-generating flags temporarily dropped.
bool WasNUW = false, WasNSW = false, WasExact = false, WasInBounds = false;
- if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(FalseVal)) {
- WasNUW = OBO->hasNoUnsignedWrap();
- WasNSW = OBO->hasNoSignedWrap();
- FalseInst->setHasNoUnsignedWrap(false);
- FalseInst->setHasNoSignedWrap(false);
- }
- if (auto *PEO = dyn_cast<PossiblyExactOperator>(FalseVal)) {
- WasExact = PEO->isExact();
- FalseInst->setIsExact(false);
- }
+ if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(FalseVal)) {
+ WasNUW = OBO->hasNoUnsignedWrap();
+ WasNSW = OBO->hasNoSignedWrap();
+ FalseInst->setHasNoUnsignedWrap(false);
+ FalseInst->setHasNoSignedWrap(false);
+ }
+ if (auto *PEO = dyn_cast<PossiblyExactOperator>(FalseVal)) {
+ WasExact = PEO->isExact();
+ FalseInst->setIsExact(false);
+ }
if (auto *GEP = dyn_cast<GetElementPtrInst>(FalseVal)) {
WasInBounds = GEP->isInBounds();
GEP->setIsInBounds(false);
}
-
- // Try each equivalence substitution possibility.
- // We have an 'EQ' comparison, so the select's false value will propagate.
- // Example:
- // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1
+
+ // Try each equivalence substitution possibility.
+ // We have an 'EQ' comparison, so the select's false value will propagate.
+ // Example:
+ // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1
if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ,
- /* AllowRefinement */ false) == TrueVal ||
+ /* AllowRefinement */ false) == TrueVal ||
SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ,
- /* AllowRefinement */ false) == TrueVal) {
+ /* AllowRefinement */ false) == TrueVal) {
return replaceInstUsesWith(Sel, FalseVal);
- }
-
- // Restore poison-generating flags if the transform did not apply.
- if (WasNUW)
- FalseInst->setHasNoUnsignedWrap();
- if (WasNSW)
- FalseInst->setHasNoSignedWrap();
- if (WasExact)
- FalseInst->setIsExact();
+ }
+
+ // Restore poison-generating flags if the transform did not apply.
+ if (WasNUW)
+ FalseInst->setHasNoUnsignedWrap();
+ if (WasNSW)
+ FalseInst->setHasNoSignedWrap();
+ if (WasExact)
+ FalseInst->setIsExact();
if (WasInBounds)
cast<GetElementPtrInst>(FalseInst)->setIsInBounds();
-
- return nullptr;
-}
-
-// See if this is a pattern like:
-// %old_cmp1 = icmp slt i32 %x, C2
-// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high
-// %old_x_offseted = add i32 %x, C1
-// %old_cmp0 = icmp ult i32 %old_x_offseted, C0
-// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement
-// This can be rewritten as more canonical pattern:
-// %new_cmp1 = icmp slt i32 %x, -C1
-// %new_cmp2 = icmp sge i32 %x, C0-C1
-// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x
-// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low
-// Iff -C1 s<= C2 s<= C0-C1
-// Also ULT predicate can also be UGT iff C0 != -1 (+invert result)
-// SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.)
-static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
- InstCombiner::BuilderTy &Builder) {
- Value *X = Sel0.getTrueValue();
- Value *Sel1 = Sel0.getFalseValue();
-
- // First match the condition of the outermost select.
- // Said condition must be one-use.
- if (!Cmp0.hasOneUse())
- return nullptr;
- Value *Cmp00 = Cmp0.getOperand(0);
- Constant *C0;
- if (!match(Cmp0.getOperand(1),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))
- return nullptr;
- // Canonicalize Cmp0 into the form we expect.
- // FIXME: we shouldn't care about lanes that are 'undef' in the end?
- switch (Cmp0.getPredicate()) {
- case ICmpInst::Predicate::ICMP_ULT:
- break; // Great!
- case ICmpInst::Predicate::ICMP_ULE:
- // We'd have to increment C0 by one, and for that it must not have all-ones
- // element, but then it would have been canonicalized to 'ult' before
- // we get here. So we can't do anything useful with 'ule'.
- return nullptr;
- case ICmpInst::Predicate::ICMP_UGT:
- // We want to canonicalize it to 'ult', so we'll need to increment C0,
- // which again means it must not have any all-ones elements.
- if (!match(C0,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
- APInt::getAllOnesValue(
- C0->getType()->getScalarSizeInBits()))))
- return nullptr; // Can't do, have all-ones element[s].
+
+ return nullptr;
+}
+
+// See if this is a pattern like:
+// %old_cmp1 = icmp slt i32 %x, C2
+// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high
+// %old_x_offseted = add i32 %x, C1
+// %old_cmp0 = icmp ult i32 %old_x_offseted, C0
+// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement
+// This can be rewritten as more canonical pattern:
+// %new_cmp1 = icmp slt i32 %x, -C1
+// %new_cmp2 = icmp sge i32 %x, C0-C1
+// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x
+// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low
+// Iff -C1 s<= C2 s<= C0-C1
+// Also ULT predicate can also be UGT iff C0 != -1 (+invert result)
+// SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.)
+static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X = Sel0.getTrueValue();
+ Value *Sel1 = Sel0.getFalseValue();
+
+ // First match the condition of the outermost select.
+ // Said condition must be one-use.
+ if (!Cmp0.hasOneUse())
+ return nullptr;
+ Value *Cmp00 = Cmp0.getOperand(0);
+ Constant *C0;
+ if (!match(Cmp0.getOperand(1),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))
+ return nullptr;
+ // Canonicalize Cmp0 into the form we expect.
+ // FIXME: we shouldn't care about lanes that are 'undef' in the end?
+ switch (Cmp0.getPredicate()) {
+ case ICmpInst::Predicate::ICMP_ULT:
+ break; // Great!
+ case ICmpInst::Predicate::ICMP_ULE:
+ // We'd have to increment C0 by one, and for that it must not have all-ones
+ // element, but then it would have been canonicalized to 'ult' before
+ // we get here. So we can't do anything useful with 'ule'.
+ return nullptr;
+ case ICmpInst::Predicate::ICMP_UGT:
+ // We want to canonicalize it to 'ult', so we'll need to increment C0,
+ // which again means it must not have any all-ones elements.
+ if (!match(C0,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
+ APInt::getAllOnesValue(
+ C0->getType()->getScalarSizeInBits()))))
+ return nullptr; // Can't do, have all-ones element[s].
C0 = InstCombiner::AddOne(C0);
- std::swap(X, Sel1);
- break;
- case ICmpInst::Predicate::ICMP_UGE:
- // The only way we'd get this predicate if this `icmp` has extra uses,
- // but then we won't be able to do this fold.
- return nullptr;
- default:
- return nullptr; // Unknown predicate.
- }
-
- // Now that we've canonicalized the ICmp, we know the X we expect;
- // the select in other hand should be one-use.
- if (!Sel1->hasOneUse())
- return nullptr;
-
- // We now can finish matching the condition of the outermost select:
- // it should either be the X itself, or an addition of some constant to X.
- Constant *C1;
- if (Cmp00 == X)
- C1 = ConstantInt::getNullValue(Sel0.getType());
- else if (!match(Cmp00,
- m_Add(m_Specific(X),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1)))))
- return nullptr;
-
- Value *Cmp1;
- ICmpInst::Predicate Pred1;
- Constant *C2;
- Value *ReplacementLow, *ReplacementHigh;
- if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow),
- m_Value(ReplacementHigh))) ||
- !match(Cmp1,
- m_ICmp(Pred1, m_Specific(X),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2)))))
- return nullptr;
-
- if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse()))
- return nullptr; // Not enough one-use instructions for the fold.
- // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of
- // two comparisons we'll need to build.
-
- // Canonicalize Cmp1 into the form we expect.
- // FIXME: we shouldn't care about lanes that are 'undef' in the end?
- switch (Pred1) {
- case ICmpInst::Predicate::ICMP_SLT:
- break;
- case ICmpInst::Predicate::ICMP_SLE:
- // We'd have to increment C2 by one, and for that it must not have signed
- // max element, but then it would have been canonicalized to 'slt' before
- // we get here. So we can't do anything useful with 'sle'.
- return nullptr;
- case ICmpInst::Predicate::ICMP_SGT:
- // We want to canonicalize it to 'slt', so we'll need to increment C2,
- // which again means it must not have any signed max elements.
- if (!match(C2,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
- APInt::getSignedMaxValue(
- C2->getType()->getScalarSizeInBits()))))
- return nullptr; // Can't do, have signed max element[s].
+ std::swap(X, Sel1);
+ break;
+ case ICmpInst::Predicate::ICMP_UGE:
+ // The only way we'd get this predicate is if this `icmp` has extra uses,
+ // but then we won't be able to do this fold.
+ return nullptr;
+ default:
+ return nullptr; // Unknown predicate.
+ }
+
+ // Now that we've canonicalized the ICmp, we know the X we expect;
+ // the select, on the other hand, should be one-use.
+ if (!Sel1->hasOneUse())
+ return nullptr;
+
+ // We now can finish matching the condition of the outermost select:
+ // it should either be the X itself, or an addition of some constant to X.
+ Constant *C1;
+ if (Cmp00 == X)
+ C1 = ConstantInt::getNullValue(Sel0.getType());
+ else if (!match(Cmp00,
+ m_Add(m_Specific(X),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1)))))
+ return nullptr;
+
+ Value *Cmp1;
+ ICmpInst::Predicate Pred1;
+ Constant *C2;
+ Value *ReplacementLow, *ReplacementHigh;
+ if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow),
+ m_Value(ReplacementHigh))) ||
+ !match(Cmp1,
+ m_ICmp(Pred1, m_Specific(X),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2)))))
+ return nullptr;
+
+ if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse()))
+ return nullptr; // Not enough one-use instructions for the fold.
+ // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of
+ // two comparisons we'll need to build.
+
+ // Canonicalize Cmp1 into the form we expect.
+ // FIXME: we shouldn't care about lanes that are 'undef' in the end?
+ switch (Pred1) {
+ case ICmpInst::Predicate::ICMP_SLT:
+ break;
+ case ICmpInst::Predicate::ICMP_SLE:
+ // We'd have to increment C2 by one, and for that it must not have signed
+ // max element, but then it would have been canonicalized to 'slt' before
+ // we get here. So we can't do anything useful with 'sle'.
+ return nullptr;
+ case ICmpInst::Predicate::ICMP_SGT:
+ // We want to canonicalize it to 'slt', so we'll need to increment C2,
+ // which again means it must not have any signed max elements.
+ if (!match(C2,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
+ APInt::getSignedMaxValue(
+ C2->getType()->getScalarSizeInBits()))))
+ return nullptr; // Can't do, have signed max element[s].
C2 = InstCombiner::AddOne(C2);
- LLVM_FALLTHROUGH;
- case ICmpInst::Predicate::ICMP_SGE:
- // Also non-canonical, but here we don't need to change C2,
- // so we don't have any restrictions on C2, so we can just handle it.
- std::swap(ReplacementLow, ReplacementHigh);
- break;
- default:
- return nullptr; // Unknown predicate.
- }
-
- // The thresholds of this clamp-like pattern.
- auto *ThresholdLowIncl = ConstantExpr::getNeg(C1);
- auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1);
-
- // The fold has a precondition 1: C2 s>= ThresholdLow
- auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2,
- ThresholdLowIncl);
- if (!match(Precond1, m_One()))
- return nullptr;
- // The fold has a precondition 2: C2 s<= ThresholdHigh
- auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2,
- ThresholdHighExcl);
- if (!match(Precond2, m_One()))
- return nullptr;
-
- // All good, finally emit the new pattern.
- Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl);
- Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl);
- Value *MaybeReplacedLow =
- Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X);
- Instruction *MaybeReplacedHigh =
- SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow);
-
- return MaybeReplacedHigh;
-}
-
-// If we have
-// %cmp = icmp [canonical predicate] i32 %x, C0
-// %r = select i1 %cmp, i32 %y, i32 C1
-// Where C0 != C1 and %x may be different from %y, see if the constant that we
-// will have if we flip the strictness of the predicate (i.e. without changing
-// the result) is identical to the C1 in select. If it matches we can change
-// original comparison to one with swapped predicate, reuse the constant,
-// and swap the hands of select.
-static Instruction *
-tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
+ LLVM_FALLTHROUGH;
+ case ICmpInst::Predicate::ICMP_SGE:
+ // Also non-canonical, but here we don't need to change C2,
+ // so we don't have any restrictions on C2, so we can just handle it.
+ std::swap(ReplacementLow, ReplacementHigh);
+ break;
+ default:
+ return nullptr; // Unknown predicate.
+ }
+
+ // The thresholds of this clamp-like pattern.
+ auto *ThresholdLowIncl = ConstantExpr::getNeg(C1);
+ auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1);
+
+ // The fold has a precondition 1: C2 s>= ThresholdLow
+ auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2,
+ ThresholdLowIncl);
+ if (!match(Precond1, m_One()))
+ return nullptr;
+ // The fold has a precondition 2: C2 s<= ThresholdHigh
+ auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2,
+ ThresholdHighExcl);
+ if (!match(Precond2, m_One()))
+ return nullptr;
+
+ // All good, finally emit the new pattern.
+ Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl);
+ Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl);
+ Value *MaybeReplacedLow =
+ Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X);
+ Instruction *MaybeReplacedHigh =
+ SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow);
+
+ return MaybeReplacedHigh;
+}
+
+// If we have
+// %cmp = icmp [canonical predicate] i32 %x, C0
+// %r = select i1 %cmp, i32 %y, i32 C1
+// where C0 != C1 and %x may be different from %y, see if the constant that we
+// would have if we flipped the strictness of the predicate (i.e. without
+// changing the result) is identical to C1 in the select. If it matches, we can
+// change the original comparison to one with the swapped predicate, reuse the
+// constant, and swap the hands of the select.
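+//
+// A concrete sketch (names and constants are illustrative):
+//   %cmp = icmp ult i32 %x, 8
+//   %r = select i1 %cmp, i32 %y, i32 7
+// can become
+//   %cmp.inv = icmp ugt i32 %x, 7
+//   %r = select i1 %cmp.inv, i32 7, i32 %y
+// because 'ult 8' with flipped strictness is 'ule 7', whose constant 7 already
+// appears in the select.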
+static Instruction *
+tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
InstCombinerImpl &IC) {
- ICmpInst::Predicate Pred;
- Value *X;
- Constant *C0;
- if (!match(&Cmp, m_OneUse(m_ICmp(
- Pred, m_Value(X),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))))
- return nullptr;
-
- // If comparison predicate is non-relational, we won't be able to do anything.
- if (ICmpInst::isEquality(Pred))
- return nullptr;
-
- // If comparison predicate is non-canonical, then we certainly won't be able
- // to make it canonical; canonicalizeCmpWithConstant() already tried.
+ ICmpInst::Predicate Pred;
+ Value *X;
+ Constant *C0;
+ if (!match(&Cmp, m_OneUse(m_ICmp(
+ Pred, m_Value(X),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))))
+ return nullptr;
+
+ // If the comparison predicate is non-relational, we can't do anything here.
+ if (ICmpInst::isEquality(Pred))
+ return nullptr;
+
+ // If the comparison predicate is non-canonical, then we certainly won't be
+ // able to make it canonical; canonicalizeCmpWithConstant() already tried.
if (!InstCombiner::isCanonicalPredicate(Pred))
- return nullptr;
-
- // If the [input] type of comparison and select type are different, lets abort
- // for now. We could try to compare constants with trunc/[zs]ext though.
- if (C0->getType() != Sel.getType())
- return nullptr;
-
- // FIXME: are there any magic icmp predicate+constant pairs we must not touch?
-
- Value *SelVal0, *SelVal1; // We do not care which one is from where.
- match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1)));
- // At least one of these values we are selecting between must be a constant
- // else we'll never succeed.
- if (!match(SelVal0, m_AnyIntegralConstant()) &&
- !match(SelVal1, m_AnyIntegralConstant()))
- return nullptr;
-
- // Does this constant C match any of the `select` values?
- auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) {
- return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1);
- };
-
- // If C0 *already* matches true/false value of select, we are done.
- if (MatchesSelectValue(C0))
- return nullptr;
-
- // Check the constant we'd have with flipped-strictness predicate.
+ return nullptr;
+
+ // If the [input] type of the comparison and the select type differ, let's abort
+ // for now. We could try to compare constants with trunc/[zs]ext though.
+ if (C0->getType() != Sel.getType())
+ return nullptr;
+
+ // FIXME: are there any magic icmp predicate+constant pairs we must not touch?
+
+ Value *SelVal0, *SelVal1; // We do not care which one is from where.
+ match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1)));
+ // At least one of these values we are selecting between must be a constant
+ // else we'll never succeed.
+ if (!match(SelVal0, m_AnyIntegralConstant()) &&
+ !match(SelVal1, m_AnyIntegralConstant()))
+ return nullptr;
+
+ // Does this constant C match any of the `select` values?
+ auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) {
+ return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1);
+ };
+
+ // If C0 *already* matches true/false value of select, we are done.
+ if (MatchesSelectValue(C0))
+ return nullptr;
+
+ // Check the constant we'd have with flipped-strictness predicate.
auto FlippedStrictness =
InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0);
- if (!FlippedStrictness)
- return nullptr;
-
- // If said constant doesn't match either, then there is no hope,
- if (!MatchesSelectValue(FlippedStrictness->second))
- return nullptr;
-
- // It matched! Lets insert the new comparison just before select.
- InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
- IC.Builder.SetInsertPoint(&Sel);
-
- Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped.
- Value *NewCmp = IC.Builder.CreateICmp(Pred, X, FlippedStrictness->second,
- Cmp.getName() + ".inv");
- IC.replaceOperand(Sel, 0, NewCmp);
- Sel.swapValues();
- Sel.swapProfMetadata();
-
- return &Sel;
-}
-
-/// Visit a SelectInst that has an ICmpInst as its first operand.
+ if (!FlippedStrictness)
+ return nullptr;
+
+ // If said constant doesn't match either, then there is no hope.
+ if (!MatchesSelectValue(FlippedStrictness->second))
+ return nullptr;
+
+ // It matched! Let's insert the new comparison just before the select.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&Sel);
+
+ Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped.
+ Value *NewCmp = IC.Builder.CreateICmp(Pred, X, FlippedStrictness->second,
+ Cmp.getName() + ".inv");
+ IC.replaceOperand(Sel, 0, NewCmp);
+ Sel.swapValues();
+ Sel.swapProfMetadata();
+
+ return &Sel;
+}
+
+/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
ICmpInst *ICI) {
if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI))
return NewSel;
-
- if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this))
- return NewSel;
-
- if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this))
- return NewAbs;
-
- if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder))
- return NewAbs;
-
- if (Instruction *NewSel =
- tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
- return NewSel;
-
- bool Changed = adjustMinMax(SI, *ICI);
-
- if (Value *V = foldSelectICmpAnd(SI, ICI, Builder))
- return replaceInstUsesWith(SI, V);
-
- // NOTE: if we wanted to, this is where to detect integer MIN/MAX
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *CmpLHS = ICI->getOperand(0);
- Value *CmpRHS = ICI->getOperand(1);
- if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
- if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
- // Transform (X == C) ? X : Y -> (X == C) ? C : Y
- SI.setOperand(1, CmpRHS);
- Changed = true;
- } else if (CmpLHS == FalseVal && Pred == ICmpInst::ICMP_NE) {
- // Transform (X != C) ? Y : X -> (X != C) ? Y : C
- SI.setOperand(2, CmpRHS);
- Changed = true;
- }
- }
-
- // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
- // decomposeBitTestICmp() might help.
- {
- unsigned BitWidth =
- DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
- APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
- Value *X;
- const APInt *Y, *C;
- bool TrueWhenUnset;
- bool IsBitTest = false;
- if (ICmpInst::isEquality(Pred) &&
- match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
- match(CmpRHS, m_Zero())) {
- IsBitTest = true;
- TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
- } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
- X = CmpLHS;
- Y = &MinSignedValue;
- IsBitTest = true;
- TrueWhenUnset = false;
- } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
- X = CmpLHS;
- Y = &MinSignedValue;
- IsBitTest = true;
- TrueWhenUnset = true;
- }
- if (IsBitTest) {
- Value *V = nullptr;
- // (X & Y) == 0 ? X : X ^ Y --> X & ~Y
- if (TrueWhenUnset && TrueVal == X &&
- match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateAnd(X, ~(*Y));
- // (X & Y) != 0 ? X ^ Y : X --> X & ~Y
- else if (!TrueWhenUnset && FalseVal == X &&
- match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateAnd(X, ~(*Y));
- // (X & Y) == 0 ? X ^ Y : X --> X | Y
- else if (TrueWhenUnset && FalseVal == X &&
- match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateOr(X, *Y);
- // (X & Y) != 0 ? X : X ^ Y --> X | Y
- else if (!TrueWhenUnset && TrueVal == X &&
- match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateOr(X, *Y);
-
- if (V)
- return replaceInstUsesWith(SI, V);
- }
- }
-
- if (Instruction *V =
- foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
- return V;
-
- if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder))
- return V;
-
- if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = canonicalizeSaturatedAdd(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- return Changed ? &SI : nullptr;
-}
-
-/// SI is a select whose condition is a PHI node (but the two may be in
-/// different blocks). See if the true/false values (V) are live in all of the
-/// predecessor blocks of the PHI. For example, cases like this can't be mapped:
-///
-/// X = phi [ C1, BB1], [C2, BB2]
-/// Y = add
-/// Z = select X, Y, 0
-///
-/// because Y is not live in BB1/BB2.
-static bool canSelectOperandBeMappingIntoPredBlock(const Value *V,
- const SelectInst &SI) {
- // If the value is a non-instruction value like a constant or argument, it
- // can always be mapped.
- const Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return true;
-
- // If V is a PHI node defined in the same block as the condition PHI, we can
- // map the arguments.
- const PHINode *CondPHI = cast<PHINode>(SI.getCondition());
-
- if (const PHINode *VP = dyn_cast<PHINode>(I))
- if (VP->getParent() == CondPHI->getParent())
- return true;
-
- // Otherwise, if the PHI and select are defined in the same block and if V is
- // defined in a different block, then we can transform it.
- if (SI.getParent() == CondPHI->getParent() &&
- I->getParent() != CondPHI->getParent())
- return true;
-
- // Otherwise we have a 'hard' case and we can't tell without doing more
- // detailed dominator based analysis, punt.
- return false;
-}
-
-/// We have an SPF (e.g. a min or max) of an SPF of the form:
-/// SPF2(SPF1(A, B), C)
+
+ if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this))
+ return NewSel;
+
+ if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this))
+ return NewAbs;
+
+ if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder))
+ return NewAbs;
+
+ if (Instruction *NewSel =
+ tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
+ return NewSel;
+
+ bool Changed = adjustMinMax(SI, *ICI);
+
+ if (Value *V = foldSelectICmpAnd(SI, ICI, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ // NOTE: if we wanted to, this is where to detect integer MIN/MAX
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
+ if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
+ // Transform (X == C) ? X : Y -> (X == C) ? C : Y
+ SI.setOperand(1, CmpRHS);
+ Changed = true;
+ } else if (CmpLHS == FalseVal && Pred == ICmpInst::ICMP_NE) {
+ // Transform (X != C) ? Y : X -> (X != C) ? Y : C
+ SI.setOperand(2, CmpRHS);
+ Changed = true;
+ }
+ }
+
+ // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
+ // decomposeBitTestICmp() might help.
+ {
+ unsigned BitWidth =
+ DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
+ APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
+ Value *X;
+ const APInt *Y, *C;
+ bool TrueWhenUnset;
+ bool IsBitTest = false;
+ if (ICmpInst::isEquality(Pred) &&
+ match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
+ match(CmpRHS, m_Zero())) {
+ IsBitTest = true;
+ TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
+ } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
+ X = CmpLHS;
+ Y = &MinSignedValue;
+ IsBitTest = true;
+ TrueWhenUnset = false;
+ } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
+ X = CmpLHS;
+ Y = &MinSignedValue;
+ IsBitTest = true;
+ TrueWhenUnset = true;
+ }
+ if (IsBitTest) {
+ Value *V = nullptr;
+ // (X & Y) == 0 ? X : X ^ Y --> X & ~Y
+ if (TrueWhenUnset && TrueVal == X &&
+ match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateAnd(X, ~(*Y));
+ // (X & Y) != 0 ? X ^ Y : X --> X & ~Y
+ else if (!TrueWhenUnset && FalseVal == X &&
+ match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateAnd(X, ~(*Y));
+ // (X & Y) == 0 ? X ^ Y : X --> X | Y
+ else if (TrueWhenUnset && FalseVal == X &&
+ match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateOr(X, *Y);
+ // (X & Y) != 0 ? X : X ^ Y --> X | Y
+ else if (!TrueWhenUnset && TrueVal == X &&
+ match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateOr(X, *Y);
+
+ if (V)
+ return replaceInstUsesWith(SI, V);
+ }
+ }
+
+ if (Instruction *V =
+ foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
+ return V;
+
+ if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder))
+ return V;
+
+ if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = canonicalizeSaturatedAdd(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ return Changed ? &SI : nullptr;
+}
+
+/// SI is a select whose condition is a PHI node (but the two may be in
+/// different blocks). See if the true/false values (V) are live in all of the
+/// predecessor blocks of the PHI. For example, cases like this can't be mapped:
+///
+/// X = phi [ C1, BB1], [C2, BB2]
+/// Y = add
+/// Z = select X, Y, 0
+///
+/// because Y is not live in BB1/BB2.
+static bool canSelectOperandBeMappingIntoPredBlock(const Value *V,
+ const SelectInst &SI) {
+ // If the value is a non-instruction value like a constant or argument, it
+ // can always be mapped.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return true;
+
+ // If V is a PHI node defined in the same block as the condition PHI, we can
+ // map the arguments.
+ const PHINode *CondPHI = cast<PHINode>(SI.getCondition());
+
+ if (const PHINode *VP = dyn_cast<PHINode>(I))
+ if (VP->getParent() == CondPHI->getParent())
+ return true;
+
+ // Otherwise, if the PHI and select are defined in the same block and if V is
+ // defined in a different block, then we can transform it.
+ if (SI.getParent() == CondPHI->getParent() &&
+ I->getParent() != CondPHI->getParent())
+ return true;
+
+ // Otherwise we have a 'hard' case and we can't tell without doing more
+ // detailed dominator-based analysis; punt.
+ return false;
+}
+
+/// We have an SPF (e.g. a min or max) of an SPF of the form:
+/// SPF2(SPF1(A, B), C)
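+///
+/// For example (illustrative instances of the folds below):
+///   smax(smax(A, B), B) --> smax(A, B)
+///   smin(smax(A, B), A) --> A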
Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner,
SelectPatternFlavor SPF1, Value *A,
Value *B, Instruction &Outer,
SelectPatternFlavor SPF2,
Value *C) {
- if (Outer.getType() != Inner->getType())
- return nullptr;
-
- if (C == A || C == B) {
- // MAX(MAX(A, B), B) -> MAX(A, B)
- // MIN(MIN(a, b), a) -> MIN(a, b)
- // TODO: This could be done in instsimplify.
- if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1))
- return replaceInstUsesWith(Outer, Inner);
-
- // MAX(MIN(a, b), a) -> a
- // MIN(MAX(a, b), a) -> a
- // TODO: This could be done in instsimplify.
- if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) ||
- (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
- (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
- (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
- return replaceInstUsesWith(Outer, C);
- }
-
- if (SPF1 == SPF2) {
- const APInt *CB, *CC;
- if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) {
- // MIN(MIN(A, 23), 97) -> MIN(A, 23)
- // MAX(MAX(A, 97), 23) -> MAX(A, 97)
- // TODO: This could be done in instsimplify.
- if ((SPF1 == SPF_UMIN && CB->ule(*CC)) ||
- (SPF1 == SPF_SMIN && CB->sle(*CC)) ||
- (SPF1 == SPF_UMAX && CB->uge(*CC)) ||
- (SPF1 == SPF_SMAX && CB->sge(*CC)))
- return replaceInstUsesWith(Outer, Inner);
-
- // MIN(MIN(A, 97), 23) -> MIN(A, 23)
- // MAX(MAX(A, 23), 97) -> MAX(A, 97)
- if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) ||
- (SPF1 == SPF_SMIN && CB->sgt(*CC)) ||
- (SPF1 == SPF_UMAX && CB->ult(*CC)) ||
- (SPF1 == SPF_SMAX && CB->slt(*CC))) {
- Outer.replaceUsesOfWith(Inner, A);
- return &Outer;
- }
- }
- }
-
- // max(max(A, B), min(A, B)) --> max(A, B)
- // min(min(A, B), max(A, B)) --> min(A, B)
- // TODO: This could be done in instsimplify.
- if (SPF1 == SPF2 &&
- ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) ||
- (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) ||
- (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) ||
- (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B))))))
- return replaceInstUsesWith(Outer, Inner);
-
- // ABS(ABS(X)) -> ABS(X)
- // NABS(NABS(X)) -> NABS(X)
- // TODO: This could be done in instsimplify.
- if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) {
- return replaceInstUsesWith(Outer, Inner);
- }
-
- // ABS(NABS(X)) -> ABS(X)
- // NABS(ABS(X)) -> NABS(X)
- if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) ||
- (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
- SelectInst *SI = cast<SelectInst>(Inner);
- Value *NewSI =
- Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(),
- SI->getTrueValue(), SI->getName(), SI);
- return replaceInstUsesWith(Outer, NewSI);
- }
-
- auto IsFreeOrProfitableToInvert =
- [&](Value *V, Value *&NotV, bool &ElidesXor) {
- if (match(V, m_Not(m_Value(NotV)))) {
- // If V has at most 2 uses then we can get rid of the xor operation
- // entirely.
- ElidesXor |= !V->hasNUsesOrMore(3);
- return true;
- }
-
- if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) {
- NotV = nullptr;
- return true;
- }
-
- return false;
- };
-
- Value *NotA, *NotB, *NotC;
- bool ElidesXor = false;
-
- // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C)
- // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C)
- // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C)
- // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C)
- //
- // This transform is performance neutral if we can elide at least one xor from
- // the set of three operands, since we'll be tacking on an xor at the very
- // end.
- if (SelectPatternResult::isMinOrMax(SPF1) &&
- SelectPatternResult::isMinOrMax(SPF2) &&
- IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
- IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
- IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
- if (!NotA)
- NotA = Builder.CreateNot(A);
- if (!NotB)
- NotB = Builder.CreateNot(B);
- if (!NotC)
- NotC = Builder.CreateNot(C);
-
- Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA,
- NotB);
- Value *NewOuter = Builder.CreateNot(
- createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC));
- return replaceInstUsesWith(Outer, NewOuter);
- }
-
- return nullptr;
-}
-
-/// Turn select C, (X + Y), (X - Y) --> (X + (select C, Y, (-Y))).
-/// This is even legal for FP.
-static Instruction *foldAddSubSelect(SelectInst &SI,
- InstCombiner::BuilderTy &Builder) {
- Value *CondVal = SI.getCondition();
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
- auto *TI = dyn_cast<Instruction>(TrueVal);
- auto *FI = dyn_cast<Instruction>(FalseVal);
- if (!TI || !FI || !TI->hasOneUse() || !FI->hasOneUse())
- return nullptr;
-
- Instruction *AddOp = nullptr, *SubOp = nullptr;
- if ((TI->getOpcode() == Instruction::Sub &&
- FI->getOpcode() == Instruction::Add) ||
- (TI->getOpcode() == Instruction::FSub &&
- FI->getOpcode() == Instruction::FAdd)) {
- AddOp = FI;
- SubOp = TI;
- } else if ((FI->getOpcode() == Instruction::Sub &&
- TI->getOpcode() == Instruction::Add) ||
- (FI->getOpcode() == Instruction::FSub &&
- TI->getOpcode() == Instruction::FAdd)) {
- AddOp = TI;
- SubOp = FI;
- }
-
- if (AddOp) {
- Value *OtherAddOp = nullptr;
- if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
- OtherAddOp = AddOp->getOperand(1);
- } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
- OtherAddOp = AddOp->getOperand(0);
- }
-
- if (OtherAddOp) {
- // So at this point we know we have (Y -> OtherAddOp):
- // select C, (add X, Y), (sub X, Z)
- Value *NegVal; // Compute -Z
- if (SI.getType()->isFPOrFPVectorTy()) {
- NegVal = Builder.CreateFNeg(SubOp->getOperand(1));
- if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) {
- FastMathFlags Flags = AddOp->getFastMathFlags();
- Flags &= SubOp->getFastMathFlags();
- NegInst->setFastMathFlags(Flags);
- }
- } else {
- NegVal = Builder.CreateNeg(SubOp->getOperand(1));
- }
-
- Value *NewTrueOp = OtherAddOp;
- Value *NewFalseOp = NegVal;
- if (AddOp != TI)
- std::swap(NewTrueOp, NewFalseOp);
- Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp,
- SI.getName() + ".p", &SI);
-
- if (SI.getType()->isFPOrFPVectorTy()) {
- Instruction *RI =
- BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
-
- FastMathFlags Flags = AddOp->getFastMathFlags();
- Flags &= SubOp->getFastMathFlags();
- RI->setFastMathFlags(Flags);
- return RI;
- } else
- return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
- }
- }
- return nullptr;
-}
-
-/// Turn X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
-/// And X - Y overflows ? 0 : X - Y -> usub_sat X, Y
-/// Along with a number of patterns similar to:
-/// X + Y overflows ? (X < 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
-/// X - Y overflows ? (X > 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
-static Instruction *
-foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) {
- Value *CondVal = SI.getCondition();
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
-
- WithOverflowInst *II;
- if (!match(CondVal, m_ExtractValue<1>(m_WithOverflowInst(II))) ||
- !match(FalseVal, m_ExtractValue<0>(m_Specific(II))))
- return nullptr;
-
- Value *X = II->getLHS();
- Value *Y = II->getRHS();
-
- auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) {
- Type *Ty = Limit->getType();
-
- ICmpInst::Predicate Pred;
- Value *TrueVal, *FalseVal, *Op;
- const APInt *C;
- if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)),
- m_Value(TrueVal), m_Value(FalseVal))))
- return false;
-
- auto IsZeroOrOne = [](const APInt &C) {
- return C.isNullValue() || C.isOneValue();
- };
- auto IsMinMax = [&](Value *Min, Value *Max) {
- APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits());
- APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits());
- return match(Min, m_SpecificInt(MinVal)) &&
- match(Max, m_SpecificInt(MaxVal));
- };
-
- if (Op != X && Op != Y)
- return false;
-
- if (IsAdd) {
- // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- if (Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
- IsMinMax(TrueVal, FalseVal))
- return true;
- // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- if (Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
- IsMinMax(FalseVal, TrueVal))
- return true;
- } else {
- // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- if (Op == X && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C + 1) &&
- IsMinMax(TrueVal, FalseVal))
- return true;
- // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- if (Op == X && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 2) &&
- IsMinMax(FalseVal, TrueVal))
- return true;
- // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- if (Op == Y && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
- IsMinMax(FalseVal, TrueVal))
- return true;
- // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- if (Op == Y && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
- IsMinMax(TrueVal, FalseVal))
- return true;
- }
-
- return false;
- };
-
- Intrinsic::ID NewIntrinsicID;
- if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow &&
- match(TrueVal, m_AllOnes()))
- // X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
- NewIntrinsicID = Intrinsic::uadd_sat;
- else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow &&
- match(TrueVal, m_Zero()))
- // X - Y overflows ? 0 : X - Y -> usub_sat X, Y
- NewIntrinsicID = Intrinsic::usub_sat;
- else if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow &&
- IsSignedSaturateLimit(TrueVal, /*IsAdd=*/true))
- // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- NewIntrinsicID = Intrinsic::sadd_sat;
- else if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow &&
- IsSignedSaturateLimit(TrueVal, /*IsAdd=*/false))
- // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- NewIntrinsicID = Intrinsic::ssub_sat;
- else
- return nullptr;
-
- Function *F =
- Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType());
- return CallInst::Create(F, {X, Y});
-}
-
+ if (Outer.getType() != Inner->getType())
+ return nullptr;
+
+ if (C == A || C == B) {
+ // MAX(MAX(A, B), B) -> MAX(A, B)
+ // MIN(MIN(a, b), a) -> MIN(a, b)
+ // TODO: This could be done in instsimplify.
+ if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // MAX(MIN(a, b), a) -> a
+ // MIN(MAX(a, b), a) -> a
+ // TODO: This could be done in instsimplify.
+ if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) ||
+ (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
+ (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
+ (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
+ return replaceInstUsesWith(Outer, C);
+ }
+
+ if (SPF1 == SPF2) {
+ const APInt *CB, *CC;
+ if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) {
+ // MIN(MIN(A, 23), 97) -> MIN(A, 23)
+ // MAX(MAX(A, 97), 23) -> MAX(A, 97)
+ // TODO: This could be done in instsimplify.
+ if ((SPF1 == SPF_UMIN && CB->ule(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sle(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->uge(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->sge(*CC)))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // MIN(MIN(A, 97), 23) -> MIN(A, 23)
+ // MAX(MAX(A, 23), 97) -> MAX(A, 97)
+ if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sgt(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->ult(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->slt(*CC))) {
+ Outer.replaceUsesOfWith(Inner, A);
+ return &Outer;
+ }
+ }
+ }
+
+ // max(max(A, B), min(A, B)) --> max(A, B)
+ // min(min(A, B), max(A, B)) --> min(A, B)
+ // TODO: This could be done in instsimplify.
+ if (SPF1 == SPF2 &&
+ ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) ||
+ (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) ||
+ (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) ||
+ (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B))))))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // ABS(ABS(X)) -> ABS(X)
+ // NABS(NABS(X)) -> NABS(X)
+ // TODO: This could be done in instsimplify.
+ if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) {
+ return replaceInstUsesWith(Outer, Inner);
+ }
+
+ // ABS(NABS(X)) -> ABS(X)
+ // NABS(ABS(X)) -> NABS(X)
+ if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) ||
+ (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
+ SelectInst *SI = cast<SelectInst>(Inner);
+ Value *NewSI =
+ Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(),
+ SI->getTrueValue(), SI->getName(), SI);
+ return replaceInstUsesWith(Outer, NewSI);
+ }
+
+ auto IsFreeOrProfitableToInvert =
+ [&](Value *V, Value *&NotV, bool &ElidesXor) {
+ if (match(V, m_Not(m_Value(NotV)))) {
+ // If V has at most 2 uses then we can get rid of the xor operation
+ // entirely.
+ ElidesXor |= !V->hasNUsesOrMore(3);
+ return true;
+ }
+
+ if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) {
+ NotV = nullptr;
+ return true;
+ }
+
+ return false;
+ };
+
+ Value *NotA, *NotB, *NotC;
+ bool ElidesXor = false;
+
+ // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C)
+ // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C)
+ // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C)
+ // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C)
+ //
+ // This transform is performance neutral if we can elide at least one xor from
+ // the set of three operands, since we'll be tacking on an xor at the very
+ // end.
+ if (SelectPatternResult::isMinOrMax(SPF1) &&
+ SelectPatternResult::isMinOrMax(SPF2) &&
+ IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
+ IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
+ IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
+ if (!NotA)
+ NotA = Builder.CreateNot(A);
+ if (!NotB)
+ NotB = Builder.CreateNot(B);
+ if (!NotC)
+ NotC = Builder.CreateNot(C);
+
+ Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA,
+ NotB);
+ Value *NewOuter = Builder.CreateNot(
+ createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC));
+ return replaceInstUsesWith(Outer, NewOuter);
+ }
+
+ return nullptr;
+}
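+
+// A minimal standalone sketch (illustration only, not code from this file) of
+// the inversion identity used by the transform above, checked for unsigned
+// 8-bit values. The includes and the 'main' harness are assumptions added just
+// for the example.
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+int main() {
+  for (unsigned a = 0; a < 256; ++a)
+    for (unsigned b = 0; b < 256; b += 3)
+      for (unsigned c = 0; c < 256; c += 7) {
+        // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C)
+        uint8_t L = std::min(std::min(uint8_t(~a), uint8_t(~b)), uint8_t(~c));
+        uint8_t R = uint8_t(~std::max(std::max(uint8_t(a), uint8_t(b)), uint8_t(c)));
+        assert(L == R);
+      }
+  return 0;
+}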
+
+/// Turn select C, (X + Y), (X - Y) --> (X + (select C, Y, (-Y))).
+/// This is even legal for FP.
+static Instruction *foldAddSubSelect(SelectInst &SI,
+ InstCombiner::BuilderTy &Builder) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (!TI || !FI || !TI->hasOneUse() || !FI->hasOneUse())
+ return nullptr;
+
+ Instruction *AddOp = nullptr, *SubOp = nullptr;
+ if ((TI->getOpcode() == Instruction::Sub &&
+ FI->getOpcode() == Instruction::Add) ||
+ (TI->getOpcode() == Instruction::FSub &&
+ FI->getOpcode() == Instruction::FAdd)) {
+ AddOp = FI;
+ SubOp = TI;
+ } else if ((FI->getOpcode() == Instruction::Sub &&
+ TI->getOpcode() == Instruction::Add) ||
+ (FI->getOpcode() == Instruction::FSub &&
+ TI->getOpcode() == Instruction::FAdd)) {
+ AddOp = TI;
+ SubOp = FI;
+ }
+
+ if (AddOp) {
+ Value *OtherAddOp = nullptr;
+ if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+ OtherAddOp = AddOp->getOperand(1);
+ } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+ OtherAddOp = AddOp->getOperand(0);
+ }
+
+ if (OtherAddOp) {
+ // So at this point we know we have (Y -> OtherAddOp):
+ // select C, (add X, Y), (sub X, Z)
+ Value *NegVal; // Compute -Z
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ NegVal = Builder.CreateFNeg(SubOp->getOperand(1));
+ if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) {
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ NegInst->setFastMathFlags(Flags);
+ }
+ } else {
+ NegVal = Builder.CreateNeg(SubOp->getOperand(1));
+ }
+
+ Value *NewTrueOp = OtherAddOp;
+ Value *NewFalseOp = NegVal;
+ if (AddOp != TI)
+ std::swap(NewTrueOp, NewFalseOp);
+ Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp,
+ SI.getName() + ".p", &SI);
+
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ Instruction *RI =
+ BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
+
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ RI->setFastMathFlags(Flags);
+ return RI;
+ } else
+ return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+ }
+ }
+ return nullptr;
+}
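+
+// A small standalone sketch (illustration only, not code from this file) of the
+// integer identity behind foldAddSubSelect: select C, (X + Y), (X - Y) computes
+// the same value as X + (select C, Y, -Y). The helper names and the harness are
+// assumptions added for the example.
+#include <cassert>
+
+static int selectForm(bool c, int x, int y) { return c ? x + y : x - y; }
+static int foldedForm(bool c, int x, int y) { return x + (c ? y : -y); }
+
+int main() {
+  for (int x = -4; x <= 4; ++x)
+    for (int y = -4; y <= 4; ++y)
+      for (bool c : {false, true})
+        assert(selectForm(c, x, y) == foldedForm(c, x, y));
+  return 0;
+}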
+
+/// Turn X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
+/// And X - Y overflows ? 0 : X - Y -> usub_sat X, Y
+/// Along with a number of patterns similar to:
+/// X + Y overflows ? (X < 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+/// X - Y overflows ? (X > 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+static Instruction *
+foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+
+ WithOverflowInst *II;
+ if (!match(CondVal, m_ExtractValue<1>(m_WithOverflowInst(II))) ||
+ !match(FalseVal, m_ExtractValue<0>(m_Specific(II))))
+ return nullptr;
+
+ Value *X = II->getLHS();
+ Value *Y = II->getRHS();
+
+ auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) {
+ Type *Ty = Limit->getType();
+
+ ICmpInst::Predicate Pred;
+ Value *TrueVal, *FalseVal, *Op;
+ const APInt *C;
+ if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)),
+ m_Value(TrueVal), m_Value(FalseVal))))
+ return false;
+
+ auto IsZeroOrOne = [](const APInt &C) {
+ return C.isNullValue() || C.isOneValue();
+ };
+ auto IsMinMax = [&](Value *Min, Value *Max) {
+ APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits());
+ APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits());
+ return match(Min, m_SpecificInt(MinVal)) &&
+ match(Max, m_SpecificInt(MaxVal));
+ };
+
+ if (Op != X && Op != Y)
+ return false;
+
+ if (IsAdd) {
+ // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ if (Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
+ IsMinMax(TrueVal, FalseVal))
+ return true;
+ // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ if (Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
+ IsMinMax(FalseVal, TrueVal))
+ return true;
+ } else {
+ // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ if (Op == X && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C + 1) &&
+ IsMinMax(TrueVal, FalseVal))
+ return true;
+ // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ if (Op == X && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 2) &&
+ IsMinMax(FalseVal, TrueVal))
+ return true;
+ // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ if (Op == Y && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
+ IsMinMax(FalseVal, TrueVal))
+ return true;
+ // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ if (Op == Y && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
+ IsMinMax(TrueVal, FalseVal))
+ return true;
+ }
+
+ return false;
+ };
+
+ Intrinsic::ID NewIntrinsicID;
+ if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow &&
+ match(TrueVal, m_AllOnes()))
+ // X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
+ NewIntrinsicID = Intrinsic::uadd_sat;
+ else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow &&
+ match(TrueVal, m_Zero()))
+ // X - Y overflows ? 0 : X - Y -> usub_sat X, Y
+ NewIntrinsicID = Intrinsic::usub_sat;
+ else if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow &&
+ IsSignedSaturateLimit(TrueVal, /*IsAdd=*/true))
+ // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ NewIntrinsicID = Intrinsic::sadd_sat;
+ else if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow &&
+ IsSignedSaturateLimit(TrueVal, /*IsAdd=*/false))
+ // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ NewIntrinsicID = Intrinsic::ssub_sat;
+ else
+ return nullptr;
+
+ Function *F =
+ Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType());
+ return CallInst::Create(F, {X, Y});
+}
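+
+// A standalone sketch (illustration only) of the unsigned pattern recognized by
+// foldOverflowingAddSubSelect, using the GCC/Clang __builtin_add_overflow
+// builtin as a stand-in for llvm.uadd.with.overflow; the 8-bit helpers and the
+// harness are assumptions added for the example.
+#include <cassert>
+#include <cstdint>
+
+static uint8_t selectForm(uint8_t x, uint8_t y) {
+  uint8_t Sum;
+  bool Ov = __builtin_add_overflow(x, y, &Sum);
+  return Ov ? uint8_t(255) : Sum; // X + Y overflows ? -1 : X + Y
+}
+
+static uint8_t uaddSat(uint8_t x, uint8_t y) {
+  unsigned Wide = unsigned(x) + unsigned(y);
+  return Wide > 255 ? uint8_t(255) : uint8_t(Wide); // uadd_sat X, Y
+}
+
+int main() {
+  for (unsigned x = 0; x < 256; ++x)
+    for (unsigned y = 0; y < 256; ++y)
+      assert(selectForm(uint8_t(x), uint8_t(y)) == uaddSat(uint8_t(x), uint8_t(y)));
+  return 0;
+}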
+
Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) {
- Constant *C;
- if (!match(Sel.getTrueValue(), m_Constant(C)) &&
- !match(Sel.getFalseValue(), m_Constant(C)))
- return nullptr;
-
- Instruction *ExtInst;
- if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
- !match(Sel.getFalseValue(), m_Instruction(ExtInst)))
- return nullptr;
-
- auto ExtOpcode = ExtInst->getOpcode();
- if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
- return nullptr;
-
- // If we are extending from a boolean type or if we can create a select that
- // has the same size operands as its condition, try to narrow the select.
- Value *X = ExtInst->getOperand(0);
- Type *SmallType = X->getType();
- Value *Cond = Sel.getCondition();
- auto *Cmp = dyn_cast<CmpInst>(Cond);
- if (!SmallType->isIntOrIntVectorTy(1) &&
- (!Cmp || Cmp->getOperand(0)->getType() != SmallType))
- return nullptr;
-
- // If the constant is the same after truncation to the smaller type and
- // extension to the original type, we can narrow the select.
- Type *SelType = Sel.getType();
- Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
- Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
- if (ExtC == C && ExtInst->hasOneUse()) {
- Value *TruncCVal = cast<Value>(TruncC);
- if (ExtInst == Sel.getFalseValue())
- std::swap(X, TruncCVal);
-
- // select Cond, (ext X), C --> ext(select Cond, X, C')
- // select Cond, C, (ext X) --> ext(select Cond, C', X)
- Value *NewSel = Builder.CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
- return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType);
- }
-
- // If one arm of the select is the extend of the condition, replace that arm
- // with the extension of the appropriate known bool value.
- if (Cond == X) {
- if (ExtInst == Sel.getTrueValue()) {
- // select X, (sext X), C --> select X, -1, C
- // select X, (zext X), C --> select X, 1, C
- Constant *One = ConstantInt::getTrue(SmallType);
- Constant *AllOnesOrOne = ConstantExpr::getCast(ExtOpcode, One, SelType);
- return SelectInst::Create(Cond, AllOnesOrOne, C, "", nullptr, &Sel);
- } else {
- // select X, C, (sext X) --> select X, C, 0
- // select X, C, (zext X) --> select X, C, 0
- Constant *Zero = ConstantInt::getNullValue(SelType);
- return SelectInst::Create(Cond, C, Zero, "", nullptr, &Sel);
- }
- }
-
- return nullptr;
-}
-
-/// Try to transform a vector select with a constant condition vector into a
-/// shuffle for easier combining with other shuffles and insert/extract.
-static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
- Value *CondVal = SI.getCondition();
- Constant *CondC;
+ Constant *C;
+ if (!match(Sel.getTrueValue(), m_Constant(C)) &&
+ !match(Sel.getFalseValue(), m_Constant(C)))
+ return nullptr;
+
+ Instruction *ExtInst;
+ if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
+ !match(Sel.getFalseValue(), m_Instruction(ExtInst)))
+ return nullptr;
+
+ auto ExtOpcode = ExtInst->getOpcode();
+ if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+ return nullptr;
+
+ // If we are extending from a boolean type or if we can create a select that
+ // has the same size operands as its condition, try to narrow the select.
+ Value *X = ExtInst->getOperand(0);
+ Type *SmallType = X->getType();
+ Value *Cond = Sel.getCondition();
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!SmallType->isIntOrIntVectorTy(1) &&
+ (!Cmp || Cmp->getOperand(0)->getType() != SmallType))
+ return nullptr;
+
+ // If the constant is the same after truncation to the smaller type and
+ // extension to the original type, we can narrow the select.
+ Type *SelType = Sel.getType();
+ Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
+ Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
+ if (ExtC == C && ExtInst->hasOneUse()) {
+ Value *TruncCVal = cast<Value>(TruncC);
+ if (ExtInst == Sel.getFalseValue())
+ std::swap(X, TruncCVal);
+
+ // select Cond, (ext X), C --> ext(select Cond, X, C')
+ // select Cond, C, (ext X) --> ext(select Cond, C', X)
+ Value *NewSel = Builder.CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
+ return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType);
+ }
+
+ // If one arm of the select is the extend of the condition, replace that arm
+ // with the extension of the appropriate known bool value.
+ if (Cond == X) {
+ if (ExtInst == Sel.getTrueValue()) {
+ // select X, (sext X), C --> select X, -1, C
+ // select X, (zext X), C --> select X, 1, C
+ Constant *One = ConstantInt::getTrue(SmallType);
+ Constant *AllOnesOrOne = ConstantExpr::getCast(ExtOpcode, One, SelType);
+ return SelectInst::Create(Cond, AllOnesOrOne, C, "", nullptr, &Sel);
+ } else {
+ // select X, C, (sext X) --> select X, C, 0
+ // select X, C, (zext X) --> select X, C, 0
+ Constant *Zero = ConstantInt::getNullValue(SelType);
+ return SelectInst::Create(Cond, C, Zero, "", nullptr, &Sel);
+ }
+ }
+
+ return nullptr;
+}
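+
+// A standalone sketch (illustration only) of why the narrowing above is safe:
+// when the constant survives the trunc-then-extend round trip, selecting in the
+// narrow type and extending once gives the same result. The constant 200 and
+// the harness are assumptions chosen for the example.
+#include <cassert>
+#include <cstdint>
+
+static uint32_t wideForm(bool Cond, uint8_t X) {
+  return Cond ? uint32_t(X) : uint32_t(200); // select Cond, (zext X), 200
+}
+
+static uint32_t narrowForm(bool Cond, uint8_t X) {
+  uint8_t TruncC = 200;               // zext(trunc(200)) == 200
+  return uint32_t(Cond ? X : TruncC); // zext(select Cond, X, C')
+}
+
+int main() {
+  for (unsigned X = 0; X < 256; ++X)
+    for (bool Cond : {false, true})
+      assert(wideForm(Cond, uint8_t(X)) == narrowForm(Cond, uint8_t(X)));
+  return 0;
+}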
+
+/// Try to transform a vector select with a constant condition vector into a
+/// shuffle for easier combining with other shuffles and insert/extract.
+static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
+ Value *CondVal = SI.getCondition();
+ Constant *CondC;
auto *CondValTy = dyn_cast<FixedVectorType>(CondVal->getType());
if (!CondValTy || !match(CondVal, m_Constant(CondC)))
- return nullptr;
-
+ return nullptr;
+
unsigned NumElts = CondValTy->getNumElements();
- SmallVector<int, 16> Mask;
- Mask.reserve(NumElts);
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = CondC->getAggregateElement(i);
- if (!Elt)
- return nullptr;
-
- if (Elt->isOneValue()) {
- // If the select condition element is true, choose from the 1st vector.
- Mask.push_back(i);
- } else if (Elt->isNullValue()) {
- // If the select condition element is false, choose from the 2nd vector.
- Mask.push_back(i + NumElts);
- } else if (isa<UndefValue>(Elt)) {
- // Undef in a select condition (choose one of the operands) does not mean
- // the same thing as undef in a shuffle mask (any value is acceptable), so
- // give up.
- return nullptr;
- } else {
- // Bail out on a constant expression.
- return nullptr;
- }
- }
-
- return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(), Mask);
-}
-
-/// If we have a select of vectors with a scalar condition, try to convert that
-/// to a vector select by splatting the condition. A splat may get folded with
-/// other operations in IR and having all operands of a select be vector types
-/// is likely better for vector codegen.
+ SmallVector<int, 16> Mask;
+ Mask.reserve(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = CondC->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (Elt->isOneValue()) {
+ // If the select condition element is true, choose from the 1st vector.
+ Mask.push_back(i);
+ } else if (Elt->isNullValue()) {
+ // If the select condition element is false, choose from the 2nd vector.
+ Mask.push_back(i + NumElts);
+ } else if (isa<UndefValue>(Elt)) {
+ // Undef in a select condition (choose one of the operands) does not mean
+ // the same thing as undef in a shuffle mask (any value is acceptable), so
+ // give up.
+ return nullptr;
+ } else {
+ // Bail out on a constant expression.
+ return nullptr;
+ }
+ }
+
+ return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(), Mask);
+}
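+
+// A standalone sketch (illustration only) of the equivalence above: a vector
+// select with a constant condition is a per-lane blend, i.e. a shuffle whose
+// mask picks lane i from the first operand or lane i + NumElts from the second.
+// The 4-lane arrays and the harness are assumptions added for the example.
+#include <array>
+#include <cassert>
+
+int main() {
+  std::array<int, 4> T{10, 11, 12, 13}, F{20, 21, 22, 23};
+  std::array<bool, 4> Cond{true, false, false, true}; // constant condition
+  int Mask[4] = {0, 5, 6, 3};                         // i, or i + NumElts for F
+  for (int i = 0; i < 4; ++i) {
+    int Sel = Cond[i] ? T[i] : F[i];
+    int Shuf = Mask[i] < 4 ? T[Mask[i]] : F[Mask[i] - 4];
+    assert(Sel == Shuf);
+  }
+  return 0;
+}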
+
+/// If we have a select of vectors with a scalar condition, try to convert that
+/// to a vector select by splatting the condition. A splat may get folded with
+/// other operations in IR and having all operands of a select be vector types
+/// is likely better for vector codegen.
static Instruction *canonicalizeScalarSelectOfVecs(SelectInst &Sel,
InstCombinerImpl &IC) {
- auto *Ty = dyn_cast<VectorType>(Sel.getType());
- if (!Ty)
- return nullptr;
-
- // We can replace a single-use extract with constant index.
- Value *Cond = Sel.getCondition();
- if (!match(Cond, m_OneUse(m_ExtractElt(m_Value(), m_ConstantInt()))))
- return nullptr;
-
- // select (extelt V, Index), T, F --> select (splat V, Index), T, F
- // Splatting the extracted condition reduces code (we could directly create a
- // splat shuffle of the source vector to eliminate the intermediate step).
+ auto *Ty = dyn_cast<VectorType>(Sel.getType());
+ if (!Ty)
+ return nullptr;
+
+ // We can replace a single-use extract with constant index.
+ Value *Cond = Sel.getCondition();
+ if (!match(Cond, m_OneUse(m_ExtractElt(m_Value(), m_ConstantInt()))))
+ return nullptr;
+
+ // select (extelt V, Index), T, F --> select (splat V, Index), T, F
+ // Splatting the extracted condition reduces code (we could directly create a
+ // splat shuffle of the source vector to eliminate the intermediate step).
return IC.replaceOperand(
Sel, 0, IC.Builder.CreateVectorSplat(Ty->getElementCount(), Cond));
-}
-
-/// Reuse bitcasted operands between a compare and select:
-/// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
-/// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D))
-static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond = Sel.getCondition();
- Value *TVal = Sel.getTrueValue();
- Value *FVal = Sel.getFalseValue();
-
- CmpInst::Predicate Pred;
- Value *A, *B;
- if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
- return nullptr;
-
- // The select condition is a compare instruction. If the select's true/false
- // values are already the same as the compare operands, there's nothing to do.
- if (TVal == A || TVal == B || FVal == A || FVal == B)
- return nullptr;
-
- Value *C, *D;
- if (!match(A, m_BitCast(m_Value(C))) || !match(B, m_BitCast(m_Value(D))))
- return nullptr;
-
- // select (cmp (bitcast C), (bitcast D)), (bitcast TSrc), (bitcast FSrc)
- Value *TSrc, *FSrc;
- if (!match(TVal, m_BitCast(m_Value(TSrc))) ||
- !match(FVal, m_BitCast(m_Value(FSrc))))
- return nullptr;
-
- // If the select true/false values are *different bitcasts* of the same source
- // operands, make the select operands the same as the compare operands and
- // cast the result. This is the canonical select form for min/max.
- Value *NewSel;
- if (TSrc == C && FSrc == D) {
- // select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
- // bitcast (select (cmp A, B), A, B)
- NewSel = Builder.CreateSelect(Cond, A, B, "", &Sel);
- } else if (TSrc == D && FSrc == C) {
- // select (cmp (bitcast C), (bitcast D)), (bitcast' D), (bitcast' C) -->
- // bitcast (select (cmp A, B), B, A)
- NewSel = Builder.CreateSelect(Cond, B, A, "", &Sel);
- } else {
- return nullptr;
- }
- return CastInst::CreateBitOrPointerCast(NewSel, Sel.getType());
-}
-
-/// Try to eliminate select instructions that test the returned flag of cmpxchg
-/// instructions.
-///
-/// If a select instruction tests the returned flag of a cmpxchg instruction and
-/// selects between the returned value of the cmpxchg instruction its compare
-/// operand, the result of the select will always be equal to its false value.
-/// For example:
-///
-/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
-/// %1 = extractvalue { i64, i1 } %0, 1
-/// %2 = extractvalue { i64, i1 } %0, 0
-/// %3 = select i1 %1, i64 %compare, i64 %2
-/// ret i64 %3
-///
-/// The returned value of the cmpxchg instruction (%2) is the original value
-/// located at %ptr prior to any update. If the cmpxchg operation succeeds, %2
-/// must have been equal to %compare. Thus, the result of the select is always
-/// equal to %2, and the code can be simplified to:
-///
-/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
-/// %1 = extractvalue { i64, i1 } %0, 0
-/// ret i64 %1
-///
-static Value *foldSelectCmpXchg(SelectInst &SI) {
- // A helper that determines if V is an extractvalue instruction whose
- // aggregate operand is a cmpxchg instruction and whose single index is equal
- // to I. If such conditions are true, the helper returns the cmpxchg
- // instruction; otherwise, a nullptr is returned.
- auto isExtractFromCmpXchg = [](Value *V, unsigned I) -> AtomicCmpXchgInst * {
- auto *Extract = dyn_cast<ExtractValueInst>(V);
- if (!Extract)
- return nullptr;
- if (Extract->getIndices()[0] != I)
- return nullptr;
- return dyn_cast<AtomicCmpXchgInst>(Extract->getAggregateOperand());
- };
-
- // If the select has a single user, and this user is a select instruction that
- // we can simplify, skip the cmpxchg simplification for now.
- if (SI.hasOneUse())
- if (auto *Select = dyn_cast<SelectInst>(SI.user_back()))
- if (Select->getCondition() == SI.getCondition())
- if (Select->getFalseValue() == SI.getTrueValue() ||
- Select->getTrueValue() == SI.getFalseValue())
- return nullptr;
-
- // Ensure the select condition is the returned flag of a cmpxchg instruction.
- auto *CmpXchg = isExtractFromCmpXchg(SI.getCondition(), 1);
- if (!CmpXchg)
- return nullptr;
-
- // Check the true value case: The true value of the select is the returned
- // value of the same cmpxchg used by the condition, and the false value is the
- // cmpxchg instruction's compare operand.
- if (auto *X = isExtractFromCmpXchg(SI.getTrueValue(), 0))
- if (X == CmpXchg && X->getCompareOperand() == SI.getFalseValue())
- return SI.getFalseValue();
-
- // Check the false value case: The false value of the select is the returned
- // value of the same cmpxchg used by the condition, and the true value is the
- // cmpxchg instruction's compare operand.
- if (auto *X = isExtractFromCmpXchg(SI.getFalseValue(), 0))
- if (X == CmpXchg && X->getCompareOperand() == SI.getTrueValue())
- return SI.getFalseValue();
-
- return nullptr;
-}
-
-static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X,
- Value *Y,
- InstCombiner::BuilderTy &Builder) {
- assert(SelectPatternResult::isMinOrMax(SPF) && "Expected min/max pattern");
- bool IsUnsigned = SPF == SelectPatternFlavor::SPF_UMIN ||
- SPF == SelectPatternFlavor::SPF_UMAX;
- // TODO: If InstSimplify could fold all cases where C2 <= C1, we could change
- // the constant value check to an assert.
- Value *A;
- const APInt *C1, *C2;
- if (IsUnsigned && match(X, m_NUWAdd(m_Value(A), m_APInt(C1))) &&
- match(Y, m_APInt(C2)) && C2->uge(*C1) && X->hasNUses(2)) {
- // umin (add nuw A, C1), C2 --> add nuw (umin A, C2 - C1), C1
- // umax (add nuw A, C1), C2 --> add nuw (umax A, C2 - C1), C1
- Value *NewMinMax = createMinMax(Builder, SPF, A,
- ConstantInt::get(X->getType(), *C2 - *C1));
- return BinaryOperator::CreateNUW(BinaryOperator::Add, NewMinMax,
- ConstantInt::get(X->getType(), *C1));
- }
-
- if (!IsUnsigned && match(X, m_NSWAdd(m_Value(A), m_APInt(C1))) &&
- match(Y, m_APInt(C2)) && X->hasNUses(2)) {
- bool Overflow;
- APInt Diff = C2->ssub_ov(*C1, Overflow);
- if (!Overflow) {
- // smin (add nsw A, C1), C2 --> add nsw (smin A, C2 - C1), C1
- // smax (add nsw A, C1), C2 --> add nsw (smax A, C2 - C1), C1
- Value *NewMinMax = createMinMax(Builder, SPF, A,
- ConstantInt::get(X->getType(), Diff));
- return BinaryOperator::CreateNSW(BinaryOperator::Add, NewMinMax,
- ConstantInt::get(X->getType(), *C1));
- }
- }
-
- return nullptr;
-}
-
-/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
+}
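+
+// A standalone sketch (illustration only): splatting a scalar condition across
+// all lanes does not change which operand each lane of the select takes, which
+// is why the condition above can be replaced by a vector splat. The harness is
+// an assumption added for the example.
+#include <array>
+#include <cassert>
+
+int main() {
+  std::array<int, 4> T{1, 2, 3, 4}, F{5, 6, 7, 8};
+  for (bool Cond : {false, true}) {
+    std::array<bool, 4> Splat;
+    Splat.fill(Cond); // select (splat V, Index), T, F
+    for (int i = 0; i < 4; ++i)
+      assert((Cond ? T[i] : F[i]) == (Splat[i] ? T[i] : F[i]));
+  }
+  return 0;
+}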
+
+/// Reuse bitcasted operands between a compare and select:
+/// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+/// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D))
+static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+
+ CmpInst::Predicate Pred;
+ Value *A, *B;
+ if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
+ return nullptr;
+
+ // The select condition is a compare instruction. If the select's true/false
+ // values are already the same as the compare operands, there's nothing to do.
+ if (TVal == A || TVal == B || FVal == A || FVal == B)
+ return nullptr;
+
+ Value *C, *D;
+ if (!match(A, m_BitCast(m_Value(C))) || !match(B, m_BitCast(m_Value(D))))
+ return nullptr;
+
+ // select (cmp (bitcast C), (bitcast D)), (bitcast TSrc), (bitcast FSrc)
+ Value *TSrc, *FSrc;
+ if (!match(TVal, m_BitCast(m_Value(TSrc))) ||
+ !match(FVal, m_BitCast(m_Value(FSrc))))
+ return nullptr;
+
+ // If the select true/false values are *different bitcasts* of the same source
+ // operands, make the select operands the same as the compare operands and
+ // cast the result. This is the canonical select form for min/max.
+ Value *NewSel;
+ if (TSrc == C && FSrc == D) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+ // bitcast (select (cmp A, B), A, B)
+ NewSel = Builder.CreateSelect(Cond, A, B, "", &Sel);
+ } else if (TSrc == D && FSrc == C) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' D), (bitcast' C) -->
+ // bitcast (select (cmp A, B), B, A)
+ NewSel = Builder.CreateSelect(Cond, B, A, "", &Sel);
+ } else {
+ return nullptr;
+ }
+ return CastInst::CreateBitOrPointerCast(NewSel, Sel.getType());
+}
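+
+// A standalone sketch (illustration only, assumes C++20 std::bit_cast) of the
+// shape of the rewrite above: when both select arms are casts of the values
+// being compared, it is equivalent to select the original values and cast the
+// result once. The sample bit patterns and the harness are assumptions.
+#include <bit>
+#include <cassert>
+#include <cstdint>
+
+static float selectOfCasts(uint32_t C, uint32_t D) {
+  return C < D ? std::bit_cast<float>(C) : std::bit_cast<float>(D);
+}
+
+static float castOfSelect(uint32_t C, uint32_t D) {
+  return std::bit_cast<float>(C < D ? C : D);
+}
+
+int main() {
+  uint32_t Vals[] = {0x3f800000u, 0x40490fdbu, 0x00000001u}; // 1.0f, ~pi, denormal
+  for (uint32_t C : Vals)
+    for (uint32_t D : Vals)
+      assert(selectOfCasts(C, D) == castOfSelect(C, D));
+  return 0;
+}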
+
+/// Try to eliminate select instructions that test the returned flag of cmpxchg
+/// instructions.
+///
+/// If a select instruction tests the returned flag of a cmpxchg instruction and
+/// selects between the returned value of the cmpxchg instruction its compare
+/// operand, the result of the select will always be equal to its false value.
+/// For example:
+///
+/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
+/// %1 = extractvalue { i64, i1 } %0, 1
+/// %2 = extractvalue { i64, i1 } %0, 0
+/// %3 = select i1 %1, i64 %compare, i64 %2
+/// ret i64 %3
+///
+/// The returned value of the cmpxchg instruction (%2) is the original value
+/// located at %ptr prior to any update. If the cmpxchg operation succeeds, %2
+/// must have been equal to %compare. Thus, the result of the select is always
+/// equal to %2, and the code can be simplified to:
+///
+/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
+/// %1 = extractvalue { i64, i1 } %0, 0
+/// ret i64 %1
+///
+static Value *foldSelectCmpXchg(SelectInst &SI) {
+ // A helper that determines if V is an extractvalue instruction whose
+ // aggregate operand is a cmpxchg instruction and whose single index is equal
+ // to I. If such conditions are true, the helper returns the cmpxchg
+ // instruction; otherwise, a nullptr is returned.
+ auto isExtractFromCmpXchg = [](Value *V, unsigned I) -> AtomicCmpXchgInst * {
+ auto *Extract = dyn_cast<ExtractValueInst>(V);
+ if (!Extract)
+ return nullptr;
+ if (Extract->getIndices()[0] != I)
+ return nullptr;
+ return dyn_cast<AtomicCmpXchgInst>(Extract->getAggregateOperand());
+ };
+
+ // If the select has a single user, and this user is a select instruction that
+ // we can simplify, skip the cmpxchg simplification for now.
+ if (SI.hasOneUse())
+ if (auto *Select = dyn_cast<SelectInst>(SI.user_back()))
+ if (Select->getCondition() == SI.getCondition())
+ if (Select->getFalseValue() == SI.getTrueValue() ||
+ Select->getTrueValue() == SI.getFalseValue())
+ return nullptr;
+
+ // Ensure the select condition is the returned flag of a cmpxchg instruction.
+ auto *CmpXchg = isExtractFromCmpXchg(SI.getCondition(), 1);
+ if (!CmpXchg)
+ return nullptr;
+
+ // Check the true value case: The true value of the select is the returned
+ // value of the same cmpxchg used by the condition, and the false value is the
+ // cmpxchg instruction's compare operand.
+ if (auto *X = isExtractFromCmpXchg(SI.getTrueValue(), 0))
+ if (X == CmpXchg && X->getCompareOperand() == SI.getFalseValue())
+ return SI.getFalseValue();
+
+ // Check the false value case: The false value of the select is the returned
+ // value of the same cmpxchg used by the condition, and the true value is the
+ // cmpxchg instruction's compare operand.
+ if (auto *X = isExtractFromCmpXchg(SI.getFalseValue(), 0))
+ if (X == CmpXchg && X->getCompareOperand() == SI.getTrueValue())
+ return SI.getFalseValue();
+
+ return nullptr;
+}
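+
+// A standalone sketch (illustration only) of the reasoning above, written with
+// std::atomic: when the exchange succeeds the loaded value equals the compare
+// operand, so the select always yields its false arm. The helper and harness
+// are assumptions added for the example.
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+
+static int64_t cmpxchgSelect(std::atomic<int64_t> &Ptr, int64_t Cmp, int64_t New) {
+  int64_t Loaded = Cmp; // updated to the actual value on failure
+  bool Success = Ptr.compare_exchange_strong(Loaded, New);
+  int64_t Sel = Success ? Cmp : Loaded; // select i1 %1, i64 %compare, i64 %2
+  assert(Sel == Loaded);                // always equal to the false value
+  return Sel;
+}
+
+int main() {
+  std::atomic<int64_t> V{42};
+  assert(cmpxchgSelect(V, 42, 7) == 42); // success: old value returned
+  assert(cmpxchgSelect(V, 42, 9) == 7);  // failure: current value returned
+  return 0;
+}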
+
+static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X,
+ Value *Y,
+ InstCombiner::BuilderTy &Builder) {
+ assert(SelectPatternResult::isMinOrMax(SPF) && "Expected min/max pattern");
+ bool IsUnsigned = SPF == SelectPatternFlavor::SPF_UMIN ||
+ SPF == SelectPatternFlavor::SPF_UMAX;
+ // TODO: If InstSimplify could fold all cases where C2 <= C1, we could change
+ // the constant value check to an assert.
+ Value *A;
+ const APInt *C1, *C2;
+ if (IsUnsigned && match(X, m_NUWAdd(m_Value(A), m_APInt(C1))) &&
+ match(Y, m_APInt(C2)) && C2->uge(*C1) && X->hasNUses(2)) {
+ // umin (add nuw A, C1), C2 --> add nuw (umin A, C2 - C1), C1
+ // umax (add nuw A, C1), C2 --> add nuw (umax A, C2 - C1), C1
+ Value *NewMinMax = createMinMax(Builder, SPF, A,
+ ConstantInt::get(X->getType(), *C2 - *C1));
+ return BinaryOperator::CreateNUW(BinaryOperator::Add, NewMinMax,
+ ConstantInt::get(X->getType(), *C1));
+ }
+
+ if (!IsUnsigned && match(X, m_NSWAdd(m_Value(A), m_APInt(C1))) &&
+ match(Y, m_APInt(C2)) && X->hasNUses(2)) {
+ bool Overflow;
+ APInt Diff = C2->ssub_ov(*C1, Overflow);
+ if (!Overflow) {
+ // smin (add nsw A, C1), C2 --> add nsw (smin A, C2 - C1), C1
+ // smax (add nsw A, C1), C2 --> add nsw (smax A, C2 - C1), C1
+ Value *NewMinMax = createMinMax(Builder, SPF, A,
+ ConstantInt::get(X->getType(), Diff));
+ return BinaryOperator::CreateNSW(BinaryOperator::Add, NewMinMax,
+ ConstantInt::get(X->getType(), *C1));
+ }
+ }
+
+ return nullptr;
+}
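+
+// A standalone sketch (illustration only) of the unsigned reassociation above,
+// under the preconditions the code checks: the add cannot wrap and C2 >= C1.
+// The 8-bit constants and the harness are assumptions added for the example.
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+int main() {
+  const uint8_t C1 = 10, C2 = 100;          // C2 >= C1
+  for (unsigned A = 0; A + C1 < 256; ++A) { // 'add nuw' precondition
+    uint8_t Before = std::min<uint8_t>(uint8_t(A + C1), C2);                    // umin (A + C1), C2
+    uint8_t After = uint8_t(std::min<uint8_t>(uint8_t(A), uint8_t(C2 - C1)) + C1); // (umin A, C2 - C1) + C1
+    assert(Before == After);
+  }
+  return 0;
+}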
+
+/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) {
- Type *Ty = MinMax1.getType();
-
- // We are looking for a tree of:
- // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B))))
- // Where the min and max could be reversed
- Instruction *MinMax2;
- BinaryOperator *AddSub;
- const APInt *MinValue, *MaxValue;
- if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) {
- if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue))))
- return nullptr;
- } else if (match(&MinMax1,
- m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) {
- if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue))))
- return nullptr;
- } else
- return nullptr;
-
- // Check that the constants clamp a saturate, and that the new type would be
- // sensible to convert to.
- if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1)
- return nullptr;
-  // In what bitwidth can this be treated as saturating arithmetic?
- unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1;
- // FIXME: This isn't quite right for vectors, but using the scalar type is a
- // good first approximation for what should be done there.
- if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth))
- return nullptr;
-
-  // Also make sure that the number of uses is as expected. The "3"s are for
-  // the two items of min/max (the compare and the select).
- if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3))
- return nullptr;
-
- // Create the new type (which can be a vector type)
- Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth);
- // Match the two extends from the add/sub
- Value *A, *B;
- if(!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B)))))
- return nullptr;
- // And check the incoming values are of a type smaller than or equal to the
- // size of the saturation. Otherwise the higher bits can cause different
- // results.
- if (A->getType()->getScalarSizeInBits() > NewBitWidth ||
- B->getType()->getScalarSizeInBits() > NewBitWidth)
- return nullptr;
-
- Intrinsic::ID IntrinsicID;
- if (AddSub->getOpcode() == Instruction::Add)
- IntrinsicID = Intrinsic::sadd_sat;
- else if (AddSub->getOpcode() == Instruction::Sub)
- IntrinsicID = Intrinsic::ssub_sat;
- else
- return nullptr;
-
- // Finally create and return the sat intrinsic, truncated to the new type
- Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy);
- Value *AT = Builder.CreateSExt(A, NewTy);
- Value *BT = Builder.CreateSExt(B, NewTy);
- Value *Sat = Builder.CreateCall(F, {AT, BT});
- return CastInst::Create(Instruction::SExt, Sat, Ty);
-}
-
-/// Reduce a sequence of min/max with a common operand.
-static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
- Value *RHS,
- InstCombiner::BuilderTy &Builder) {
- assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max");
- // TODO: Allow FP min/max with nnan/nsz.
- if (!LHS->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
- Value *A, *B, *C, *D;
- SelectPatternResult L = matchSelectPattern(LHS, A, B);
- SelectPatternResult R = matchSelectPattern(RHS, C, D);
- if (SPF != L.Flavor || L.Flavor != R.Flavor)
- return nullptr;
-
- // Look for a common operand. The use checks are different than usual because
- // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by
- // the select.
- Value *MinMaxOp = nullptr;
- Value *ThirdOp = nullptr;
- if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) {
- // If the LHS is only used in this chain and the RHS is used outside of it,
- // reuse the RHS min/max because that will eliminate the LHS.
- if (D == A || C == A) {
- // min(min(a, b), min(c, a)) --> min(min(c, a), b)
- // min(min(a, b), min(a, d)) --> min(min(a, d), b)
- MinMaxOp = RHS;
- ThirdOp = B;
- } else if (D == B || C == B) {
- // min(min(a, b), min(c, b)) --> min(min(c, b), a)
- // min(min(a, b), min(b, d)) --> min(min(b, d), a)
- MinMaxOp = RHS;
- ThirdOp = A;
- }
- } else if (!RHS->hasNUsesOrMore(3)) {
- // Reuse the LHS. This will eliminate the RHS.
- if (D == A || D == B) {
- // min(min(a, b), min(c, a)) --> min(min(a, b), c)
- // min(min(a, b), min(c, b)) --> min(min(a, b), c)
- MinMaxOp = LHS;
- ThirdOp = C;
- } else if (C == A || C == B) {
- // min(min(a, b), min(b, d)) --> min(min(a, b), d)
-      // min(min(a, b), min(a, d)) --> min(min(a, b), d)
- MinMaxOp = LHS;
- ThirdOp = D;
- }
- }
- if (!MinMaxOp || !ThirdOp)
- return nullptr;
-
- CmpInst::Predicate P = getMinMaxPred(SPF);
- Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp);
- return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
-}
-
+ Type *Ty = MinMax1.getType();
+
+ // We are looking for a tree of:
+ // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B))))
+ // Where the min and max could be reversed
+ Instruction *MinMax2;
+ BinaryOperator *AddSub;
+ const APInt *MinValue, *MaxValue;
+ if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) {
+ if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue))))
+ return nullptr;
+ } else if (match(&MinMax1,
+ m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) {
+ if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue))))
+ return nullptr;
+ } else
+ return nullptr;
+
+ // Check that the constants clamp a saturate, and that the new type would be
+ // sensible to convert to.
+ if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1)
+ return nullptr;
+  // In what bitwidth can this be treated as saturating arithmetic?
+ unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1;
+ // FIXME: This isn't quite right for vectors, but using the scalar type is a
+ // good first approximation for what should be done there.
+ if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth))
+ return nullptr;
+
+  // Also make sure that the number of uses is as expected. The "3"s are for
+  // the two items of min/max (the compare and the select).
+ if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3))
+ return nullptr;
+
+ // Create the new type (which can be a vector type)
+ Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth);
+ // Match the two extends from the add/sub
+ Value *A, *B;
+ if(!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B)))))
+ return nullptr;
+ // And check the incoming values are of a type smaller than or equal to the
+ // size of the saturation. Otherwise the higher bits can cause different
+ // results.
+ if (A->getType()->getScalarSizeInBits() > NewBitWidth ||
+ B->getType()->getScalarSizeInBits() > NewBitWidth)
+ return nullptr;
+
+ Intrinsic::ID IntrinsicID;
+ if (AddSub->getOpcode() == Instruction::Add)
+ IntrinsicID = Intrinsic::sadd_sat;
+ else if (AddSub->getOpcode() == Instruction::Sub)
+ IntrinsicID = Intrinsic::ssub_sat;
+ else
+ return nullptr;
+
+ // Finally create and return the sat intrinsic, truncated to the new type
+ Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy);
+ Value *AT = Builder.CreateSExt(A, NewTy);
+ Value *BT = Builder.CreateSExt(B, NewTy);
+ Value *Sat = Builder.CreateCall(F, {AT, BT});
+ return CastInst::Create(Instruction::SExt, Sat, Ty);
+}
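+
+// A standalone sketch (illustration only) of the clamp-to-saturate equivalence
+// matched above, scaled down to 8 bits: clamping the widened sum to [-128, 127]
+// is an 8-bit saturating add. The helpers and the harness are assumptions added
+// for the example.
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+static int8_t clampForm(int8_t A, int8_t B) {
+  int Wide = int(A) + int(B);                         // add(sext(A), sext(B))
+  return int8_t(std::max(-128, std::min(127, Wide))); // max(INT_MIN, min(INT_MAX, ...))
+}
+
+static int8_t satForm(int8_t A, int8_t B) {
+  int Wide = int(A) + int(B);
+  return Wide > 127 ? int8_t(127) : Wide < -128 ? int8_t(-128) : int8_t(Wide);
+}
+
+int main() {
+  for (int A = -128; A <= 127; ++A)
+    for (int B = -128; B <= 127; ++B)
+      assert(clampForm(int8_t(A), int8_t(B)) == satForm(int8_t(A), int8_t(B)));
+  return 0;
+}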
+
+/// Reduce a sequence of min/max with a common operand.
+static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
+ Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max");
+ // TODO: Allow FP min/max with nnan/nsz.
+ if (!LHS->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
+ Value *A, *B, *C, *D;
+ SelectPatternResult L = matchSelectPattern(LHS, A, B);
+ SelectPatternResult R = matchSelectPattern(RHS, C, D);
+ if (SPF != L.Flavor || L.Flavor != R.Flavor)
+ return nullptr;
+
+ // Look for a common operand. The use checks are different than usual because
+ // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by
+ // the select.
+ Value *MinMaxOp = nullptr;
+ Value *ThirdOp = nullptr;
+ if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) {
+ // If the LHS is only used in this chain and the RHS is used outside of it,
+ // reuse the RHS min/max because that will eliminate the LHS.
+ if (D == A || C == A) {
+ // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+ // min(min(a, b), min(a, d)) --> min(min(a, d), b)
+ MinMaxOp = RHS;
+ ThirdOp = B;
+ } else if (D == B || C == B) {
+ // min(min(a, b), min(c, b)) --> min(min(c, b), a)
+ // min(min(a, b), min(b, d)) --> min(min(b, d), a)
+ MinMaxOp = RHS;
+ ThirdOp = A;
+ }
+ } else if (!RHS->hasNUsesOrMore(3)) {
+ // Reuse the LHS. This will eliminate the RHS.
+ if (D == A || D == B) {
+ // min(min(a, b), min(c, a)) --> min(min(a, b), c)
+ // min(min(a, b), min(c, b)) --> min(min(a, b), c)
+ MinMaxOp = LHS;
+ ThirdOp = C;
+ } else if (C == A || C == B) {
+ // min(min(a, b), min(b, d)) --> min(min(a, b), d)
+      // min(min(a, b), min(a, d)) --> min(min(a, b), d)
+ MinMaxOp = LHS;
+ ThirdOp = D;
+ }
+ }
+ if (!MinMaxOp || !ThirdOp)
+ return nullptr;
+
+ CmpInst::Predicate P = getMinMaxPred(SPF);
+ Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp);
+ return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
+}
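+
+// A standalone sketch (illustration only): because min is commutative and
+// associative, a min tree with a repeated operand can be re-rooted around the
+// reused min, which is all the factorization above does. The harness is an
+// assumption added for the example.
+#include <algorithm>
+#include <cassert>
+
+int main() {
+  for (int A = 0; A < 8; ++A)
+    for (int B = 0; B < 8; ++B)
+      for (int C = 0; C < 8; ++C) {
+        int Before = std::min(std::min(A, B), std::min(C, A));
+        int After = std::min(std::min(C, A), B); // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+        assert(Before == After);
+      }
+  return 0;
+}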
+
/// Try to reduce a funnel/rotate pattern that includes a compare and select
/// into a funnel shift intrinsic. Example:
-/// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b)))
-/// --> call llvm.fshl.i32(a, a, b)
+/// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b)))
+/// --> call llvm.fshl.i32(a, a, b)
/// fshl32(a, b, c) --> (c == 0 ? a : ((b >> (32 - c)) | (a << c)))
/// --> call llvm.fshl.i32(a, b, c)
/// fshr32(a, b, c) --> (c == 0 ? b : ((a << (32 - c)) | (b >> c)))
@@ -2291,20 +2291,20 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
// This must be a power-of-2 type for a bitmasking transform to be valid.
unsigned Width = Sel.getType()->getScalarSizeInBits();
if (!isPowerOf2_32(Width))
- return nullptr;
-
+ return nullptr;
+
BinaryOperator *Or0, *Or1;
if (!match(Sel.getFalseValue(), m_OneUse(m_Or(m_BinOp(Or0), m_BinOp(Or1)))))
- return nullptr;
-
+ return nullptr;
+
Value *SV0, *SV1, *SA0, *SA1;
if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(SV0),
m_ZExtOrSelf(m_Value(SA0))))) ||
!match(Or1, m_OneUse(m_LogicalShift(m_Value(SV1),
m_ZExtOrSelf(m_Value(SA1))))) ||
Or0->getOpcode() == Or1->getOpcode())
- return nullptr;
-
+ return nullptr;
+
// Canonicalize to or(shl(SV0, SA0), lshr(SV1, SA1)).
if (Or0->getOpcode() == BinaryOperator::LShr) {
std::swap(Or0, Or1);
@@ -2314,16 +2314,16 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
assert(Or0->getOpcode() == BinaryOperator::Shl &&
Or1->getOpcode() == BinaryOperator::LShr &&
"Illegal or(shift,shift) pair");
-
- // Check the shift amounts to see if they are an opposite pair.
- Value *ShAmt;
- if (match(SA1, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA0)))))
- ShAmt = SA0;
- else if (match(SA0, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA1)))))
- ShAmt = SA1;
- else
- return nullptr;
-
+
+ // Check the shift amounts to see if they are an opposite pair.
+ Value *ShAmt;
+ if (match(SA1, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA0)))))
+ ShAmt = SA0;
+ else if (match(SA0, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA1)))))
+ ShAmt = SA1;
+ else
+ return nullptr;
+
// We should now have this pattern:
// select ?, TVal, (or (shl SV0, SA0), (lshr SV1, SA1))
// The false value of the select must be a funnel-shift of the true value:
@@ -2333,13 +2333,13 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
if ((IsFshl && TVal != SV0) || (!IsFshl && TVal != SV1))
return nullptr;
- // Finally, see if the select is filtering out a shift-by-zero.
- Value *Cond = Sel.getCondition();
- ICmpInst::Predicate Pred;
- if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
- Pred != ICmpInst::ICMP_EQ)
- return nullptr;
-
+ // Finally, see if the select is filtering out a shift-by-zero.
+ Value *Cond = Sel.getCondition();
+ ICmpInst::Predicate Pred;
+ if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
+ Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
// If this is not a rotate then the select was blocking poison from the
// 'shift-by-zero' non-TVal, but a funnel shift won't - so freeze it.
if (SV0 != SV1) {
@@ -2350,186 +2350,186 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
}
// This is a funnel/rotate that avoids shift-by-bitwidth UB in a suboptimal way.
- // Convert to funnel shift intrinsic.
- Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
- Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType());
+ // Convert to funnel shift intrinsic.
+ Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+ Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType());
ShAmt = Builder.CreateZExt(ShAmt, Sel.getType());
return IntrinsicInst::Create(F, { SV0, SV1, ShAmt });
-}
-
-static Instruction *foldSelectToCopysign(SelectInst &Sel,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond = Sel.getCondition();
- Value *TVal = Sel.getTrueValue();
- Value *FVal = Sel.getFalseValue();
- Type *SelType = Sel.getType();
-
- // Match select ?, TC, FC where the constants are equal but negated.
- // TODO: Generalize to handle a negated variable operand?
- const APFloat *TC, *FC;
- if (!match(TVal, m_APFloat(TC)) || !match(FVal, m_APFloat(FC)) ||
- !abs(*TC).bitwiseIsEqual(abs(*FC)))
- return nullptr;
-
- assert(TC != FC && "Expected equal select arms to simplify");
-
- Value *X;
- const APInt *C;
- bool IsTrueIfSignSet;
- ICmpInst::Predicate Pred;
- if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) ||
+}
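+
+// A standalone sketch (illustration only) of the rotate case handled above: the
+// guarded form needs the b == 0 select because (a >> (32 - b)) would shift by
+// the bit width, while a rotate by b mod 32 is defined for every b. The helpers
+// and the harness are assumptions added for the example.
+#include <cassert>
+#include <cstdint>
+
+static uint32_t guardedRotl(uint32_t A, uint32_t B) {
+  return B == 0 ? A : ((A >> (32 - B)) | (A << B)); // select filters B == 0
+}
+
+static uint32_t rotlModWidth(uint32_t A, uint32_t B) {
+  // Models fshl(A, A, B): rotate left by B mod 32, well defined for B == 0.
+  return (A << (B & 31)) | (A >> ((32 - B) & 31));
+}
+
+int main() {
+  for (uint32_t B = 0; B < 32; ++B)
+    assert(guardedRotl(0x12345678u, B) == rotlModWidth(0x12345678u, B));
+  return 0;
+}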
+
+static Instruction *foldSelectToCopysign(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ Type *SelType = Sel.getType();
+
+ // Match select ?, TC, FC where the constants are equal but negated.
+ // TODO: Generalize to handle a negated variable operand?
+ const APFloat *TC, *FC;
+ if (!match(TVal, m_APFloat(TC)) || !match(FVal, m_APFloat(FC)) ||
+ !abs(*TC).bitwiseIsEqual(abs(*FC)))
+ return nullptr;
+
+ assert(TC != FC && "Expected equal select arms to simplify");
+
+ Value *X;
+ const APInt *C;
+ bool IsTrueIfSignSet;
+ ICmpInst::Predicate Pred;
+ if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) ||
!InstCombiner::isSignBitCheck(Pred, *C, IsTrueIfSignSet) ||
X->getType() != SelType)
- return nullptr;
-
- // If needed, negate the value that will be the sign argument of the copysign:
- // (bitcast X) < 0 ? -TC : TC --> copysign(TC, X)
- // (bitcast X) < 0 ? TC : -TC --> copysign(TC, -X)
- // (bitcast X) >= 0 ? -TC : TC --> copysign(TC, -X)
- // (bitcast X) >= 0 ? TC : -TC --> copysign(TC, X)
- if (IsTrueIfSignSet ^ TC->isNegative())
- X = Builder.CreateFNegFMF(X, &Sel);
-
- // Canonicalize the magnitude argument as the positive constant since we do
- // not care about its sign.
- Value *MagArg = TC->isNegative() ? FVal : TVal;
- Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign,
- Sel.getType());
- Instruction *CopySign = IntrinsicInst::Create(F, { MagArg, X });
- CopySign->setFastMathFlags(Sel.getFastMathFlags());
- return CopySign;
-}
-
+ return nullptr;
+
+ // If needed, negate the value that will be the sign argument of the copysign:
+ // (bitcast X) < 0 ? -TC : TC --> copysign(TC, X)
+ // (bitcast X) < 0 ? TC : -TC --> copysign(TC, -X)
+ // (bitcast X) >= 0 ? -TC : TC --> copysign(TC, -X)
+ // (bitcast X) >= 0 ? TC : -TC --> copysign(TC, X)
+ if (IsTrueIfSignSet ^ TC->isNegative())
+ X = Builder.CreateFNegFMF(X, &Sel);
+
+ // Canonicalize the magnitude argument as the positive constant since we do
+ // not care about its sign.
+ Value *MagArg = TC->isNegative() ? FVal : TVal;
+ Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign,
+ Sel.getType());
+ Instruction *CopySign = IntrinsicInst::Create(F, { MagArg, X });
+ CopySign->setFastMathFlags(Sel.getFastMathFlags());
+ return CopySign;
+}
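+
+// A standalone sketch (illustration only) of the fold above using <cmath>:
+// choosing between +C and -C on the sign bit of X is exactly copysign(C, X).
+// The constant 2.5f and the harness are assumptions added for the example.
+#include <cassert>
+#include <cmath>
+
+static float signSelect(float X) {
+  return std::signbit(X) ? -2.5f : 2.5f; // (bitcast X) < 0 ? -TC : TC
+}
+
+static float copysignForm(float X) { return std::copysign(2.5f, X); }
+
+int main() {
+  for (float X : {-3.0f, -0.0f, 0.0f, 1.5f})
+    assert(signSelect(X) == copysignForm(X));
+  return 0;
+}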
+
Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) {
- auto *VecTy = dyn_cast<FixedVectorType>(Sel.getType());
- if (!VecTy)
- return nullptr;
-
- unsigned NumElts = VecTy->getNumElements();
- APInt UndefElts(NumElts, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(NumElts));
- if (Value *V = SimplifyDemandedVectorElts(&Sel, AllOnesEltMask, UndefElts)) {
- if (V != &Sel)
- return replaceInstUsesWith(Sel, V);
- return &Sel;
- }
-
- // A select of a "select shuffle" with a common operand can be rearranged
- // to select followed by "select shuffle". Because of poison, this only works
- // in the case of a shuffle with no undefined mask elements.
- Value *Cond = Sel.getCondition();
- Value *TVal = Sel.getTrueValue();
- Value *FVal = Sel.getFalseValue();
- Value *X, *Y;
- ArrayRef<int> Mask;
- if (match(TVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
- !is_contained(Mask, UndefMaskElem) &&
- cast<ShuffleVectorInst>(TVal)->isSelect()) {
- if (X == FVal) {
- // select Cond, (shuf_sel X, Y), X --> shuf_sel X, (select Cond, Y, X)
- Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
- return new ShuffleVectorInst(X, NewSel, Mask);
- }
- if (Y == FVal) {
- // select Cond, (shuf_sel X, Y), Y --> shuf_sel (select Cond, X, Y), Y
- Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
- return new ShuffleVectorInst(NewSel, Y, Mask);
- }
- }
- if (match(FVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
- !is_contained(Mask, UndefMaskElem) &&
- cast<ShuffleVectorInst>(FVal)->isSelect()) {
- if (X == TVal) {
- // select Cond, X, (shuf_sel X, Y) --> shuf_sel X, (select Cond, X, Y)
- Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
- return new ShuffleVectorInst(X, NewSel, Mask);
- }
- if (Y == TVal) {
- // select Cond, Y, (shuf_sel X, Y) --> shuf_sel (select Cond, Y, X), Y
- Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
- return new ShuffleVectorInst(NewSel, Y, Mask);
- }
- }
-
- return nullptr;
-}
-
-static Instruction *foldSelectToPhiImpl(SelectInst &Sel, BasicBlock *BB,
- const DominatorTree &DT,
- InstCombiner::BuilderTy &Builder) {
- // Find the block's immediate dominator that ends with a conditional branch
- // that matches select's condition (maybe inverted).
- auto *IDomNode = DT[BB]->getIDom();
- if (!IDomNode)
- return nullptr;
- BasicBlock *IDom = IDomNode->getBlock();
-
- Value *Cond = Sel.getCondition();
- Value *IfTrue, *IfFalse;
- BasicBlock *TrueSucc, *FalseSucc;
- if (match(IDom->getTerminator(),
- m_Br(m_Specific(Cond), m_BasicBlock(TrueSucc),
- m_BasicBlock(FalseSucc)))) {
- IfTrue = Sel.getTrueValue();
- IfFalse = Sel.getFalseValue();
- } else if (match(IDom->getTerminator(),
- m_Br(m_Not(m_Specific(Cond)), m_BasicBlock(TrueSucc),
- m_BasicBlock(FalseSucc)))) {
- IfTrue = Sel.getFalseValue();
- IfFalse = Sel.getTrueValue();
- } else
- return nullptr;
-
- // Make sure the branches are actually different.
- if (TrueSucc == FalseSucc)
- return nullptr;
-
- // We want to replace select %cond, %a, %b with a phi that takes value %a
- // for all incoming edges that are dominated by condition `%cond == true`,
- // and value %b for edges dominated by condition `%cond == false`. If %a
- // or %b are also phis from the same basic block, we can go further and take
- // their incoming values from the corresponding blocks.
- BasicBlockEdge TrueEdge(IDom, TrueSucc);
- BasicBlockEdge FalseEdge(IDom, FalseSucc);
- DenseMap<BasicBlock *, Value *> Inputs;
- for (auto *Pred : predecessors(BB)) {
- // Check implication.
- BasicBlockEdge Incoming(Pred, BB);
- if (DT.dominates(TrueEdge, Incoming))
- Inputs[Pred] = IfTrue->DoPHITranslation(BB, Pred);
- else if (DT.dominates(FalseEdge, Incoming))
- Inputs[Pred] = IfFalse->DoPHITranslation(BB, Pred);
- else
- return nullptr;
- // Check availability.
- if (auto *Insn = dyn_cast<Instruction>(Inputs[Pred]))
- if (!DT.dominates(Insn, Pred->getTerminator()))
- return nullptr;
- }
-
- Builder.SetInsertPoint(&*BB->begin());
- auto *PN = Builder.CreatePHI(Sel.getType(), Inputs.size());
- for (auto *Pred : predecessors(BB))
- PN->addIncoming(Inputs[Pred], Pred);
- PN->takeName(&Sel);
- return PN;
-}
-
-static Instruction *foldSelectToPhi(SelectInst &Sel, const DominatorTree &DT,
- InstCombiner::BuilderTy &Builder) {
- // Try to replace this select with Phi in one of these blocks.
- SmallSetVector<BasicBlock *, 4> CandidateBlocks;
- CandidateBlocks.insert(Sel.getParent());
- for (Value *V : Sel.operands())
- if (auto *I = dyn_cast<Instruction>(V))
- CandidateBlocks.insert(I->getParent());
-
- for (BasicBlock *BB : CandidateBlocks)
- if (auto *PN = foldSelectToPhiImpl(Sel, BB, DT, Builder))
- return PN;
- return nullptr;
-}
-
+ auto *VecTy = dyn_cast<FixedVectorType>(Sel.getType());
+ if (!VecTy)
+ return nullptr;
+
+ unsigned NumElts = VecTy->getNumElements();
+ APInt UndefElts(NumElts, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(NumElts));
+ if (Value *V = SimplifyDemandedVectorElts(&Sel, AllOnesEltMask, UndefElts)) {
+ if (V != &Sel)
+ return replaceInstUsesWith(Sel, V);
+ return &Sel;
+ }
+
+ // A select of a "select shuffle" with a common operand can be rearranged
+ // to select followed by "select shuffle". Because of poison, this only works
+ // in the case of a shuffle with no undefined mask elements.
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ Value *X, *Y;
+ ArrayRef<int> Mask;
+ if (match(TVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
+ !is_contained(Mask, UndefMaskElem) &&
+ cast<ShuffleVectorInst>(TVal)->isSelect()) {
+ if (X == FVal) {
+ // select Cond, (shuf_sel X, Y), X --> shuf_sel X, (select Cond, Y, X)
+ Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
+ return new ShuffleVectorInst(X, NewSel, Mask);
+ }
+ if (Y == FVal) {
+ // select Cond, (shuf_sel X, Y), Y --> shuf_sel (select Cond, X, Y), Y
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
+ return new ShuffleVectorInst(NewSel, Y, Mask);
+ }
+ }
+ if (match(FVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
+ !is_contained(Mask, UndefMaskElem) &&
+ cast<ShuffleVectorInst>(FVal)->isSelect()) {
+ if (X == TVal) {
+ // select Cond, X, (shuf_sel X, Y) --> shuf_sel X, (select Cond, X, Y)
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
+ return new ShuffleVectorInst(X, NewSel, Mask);
+ }
+ if (Y == TVal) {
+ // select Cond, Y, (shuf_sel X, Y) --> shuf_sel (select Cond, Y, X), Y
+ Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
+ return new ShuffleVectorInst(NewSel, Y, Mask);
+ }
+ }
+
+ return nullptr;
+}
+
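Not part of the LLVM source: a minimal standalone C++ sketch, modeling 4-lane vectors as arrays, of the rearrangement select Cond, (shuf_sel X, Y), X --> shuf_sel X, (select Cond, Y, X) performed in foldVectorSelect, where "shuf_sel" picks each lane from X or Y according to a fixed mask.

#include <array>
#include <cassert>

using Vec4 = std::array<int, 4>;
using Lanes = std::array<bool, 4>;

// A "select shuffle": lane I comes from X if M[I] is true, else from Y.
static Vec4 shufSel(const Vec4 &X, const Vec4 &Y, const Lanes &M) {
  Vec4 R;
  for (int I = 0; I != 4; ++I)
    R[I] = M[I] ? X[I] : Y[I];
  return R;
}

// A per-lane select on a vector condition.
static Vec4 sel(const Lanes &C, const Vec4 &A, const Vec4 &B) {
  Vec4 R;
  for (int I = 0; I != 4; ++I)
    R[I] = C[I] ? A[I] : B[I];
  return R;
}

int main() {
  Vec4 X = {1, 2, 3, 4}, Y = {10, 20, 30, 40};
  Lanes ShufMask = {true, false, true, false};
  Lanes Cond = {false, true, true, false};

  Vec4 Before = sel(Cond, shufSel(X, Y, ShufMask), X);
  Vec4 After = shufSel(X, sel(Cond, Y, X), ShufMask);
  assert(Before == After); // identical lane by lane
  return 0;
}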
+static Instruction *foldSelectToPhiImpl(SelectInst &Sel, BasicBlock *BB,
+ const DominatorTree &DT,
+ InstCombiner::BuilderTy &Builder) {
+ // Find the block's immediate dominator that ends with a conditional branch
+ // that matches select's condition (maybe inverted).
+ auto *IDomNode = DT[BB]->getIDom();
+ if (!IDomNode)
+ return nullptr;
+ BasicBlock *IDom = IDomNode->getBlock();
+
+ Value *Cond = Sel.getCondition();
+ Value *IfTrue, *IfFalse;
+ BasicBlock *TrueSucc, *FalseSucc;
+ if (match(IDom->getTerminator(),
+ m_Br(m_Specific(Cond), m_BasicBlock(TrueSucc),
+ m_BasicBlock(FalseSucc)))) {
+ IfTrue = Sel.getTrueValue();
+ IfFalse = Sel.getFalseValue();
+ } else if (match(IDom->getTerminator(),
+ m_Br(m_Not(m_Specific(Cond)), m_BasicBlock(TrueSucc),
+ m_BasicBlock(FalseSucc)))) {
+ IfTrue = Sel.getFalseValue();
+ IfFalse = Sel.getTrueValue();
+ } else
+ return nullptr;
+
+ // Make sure the branches are actually different.
+ if (TrueSucc == FalseSucc)
+ return nullptr;
+
+ // We want to replace select %cond, %a, %b with a phi that takes value %a
+ // for all incoming edges that are dominated by condition `%cond == true`,
+ // and value %b for edges dominated by condition `%cond == false`. If %a
+ // or %b are also phis from the same basic block, we can go further and take
+ // their incoming values from the corresponding blocks.
+ BasicBlockEdge TrueEdge(IDom, TrueSucc);
+ BasicBlockEdge FalseEdge(IDom, FalseSucc);
+ DenseMap<BasicBlock *, Value *> Inputs;
+ for (auto *Pred : predecessors(BB)) {
+ // Check implication.
+ BasicBlockEdge Incoming(Pred, BB);
+ if (DT.dominates(TrueEdge, Incoming))
+ Inputs[Pred] = IfTrue->DoPHITranslation(BB, Pred);
+ else if (DT.dominates(FalseEdge, Incoming))
+ Inputs[Pred] = IfFalse->DoPHITranslation(BB, Pred);
+ else
+ return nullptr;
+ // Check availability.
+ if (auto *Insn = dyn_cast<Instruction>(Inputs[Pred]))
+ if (!DT.dominates(Insn, Pred->getTerminator()))
+ return nullptr;
+ }
+
+ Builder.SetInsertPoint(&*BB->begin());
+ auto *PN = Builder.CreatePHI(Sel.getType(), Inputs.size());
+ for (auto *Pred : predecessors(BB))
+ PN->addIncoming(Inputs[Pred], Pred);
+ PN->takeName(&Sel);
+ return PN;
+}
+
+static Instruction *foldSelectToPhi(SelectInst &Sel, const DominatorTree &DT,
+ InstCombiner::BuilderTy &Builder) {
+ // Try to replace this select with Phi in one of these blocks.
+ SmallSetVector<BasicBlock *, 4> CandidateBlocks;
+ CandidateBlocks.insert(Sel.getParent());
+ for (Value *V : Sel.operands())
+ if (auto *I = dyn_cast<Instruction>(V))
+ CandidateBlocks.insert(I->getParent());
+
+ for (BasicBlock *BB : CandidateBlocks)
+ if (auto *PN = foldSelectToPhiImpl(Sel, BB, DT, Builder))
+ return PN;
+ return nullptr;
+}
+
static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy &Builder) {
FreezeInst *FI = dyn_cast<FreezeInst>(Sel.getCondition());
if (!FI)
@@ -2557,46 +2557,46 @@ static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy
}
Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
- Value *CondVal = SI.getCondition();
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
- Type *SelType = SI.getType();
-
- // FIXME: Remove this workaround when freeze related patches are done.
- // For select with undef operand which feeds into an equality comparison,
- // don't simplify it so loop unswitch can know the equality comparison
- // may have an undef operand. This is a workaround for PR31652 caused by
-  // discrepancy about branch on undef between LoopUnswitch and GVN.
- if (isa<UndefValue>(TrueVal) || isa<UndefValue>(FalseVal)) {
- if (llvm::any_of(SI.users(), [&](User *U) {
- ICmpInst *CI = dyn_cast<ICmpInst>(U);
- if (CI && CI->isEquality())
- return true;
- return false;
- })) {
- return nullptr;
- }
- }
-
- if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
- SQ.getWithInstruction(&SI)))
- return replaceInstUsesWith(SI, V);
-
- if (Instruction *I = canonicalizeSelectToShuffle(SI))
- return I;
-
- if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, *this))
- return I;
-
- CmpInst::Predicate Pred;
-
- if (SelType->isIntOrIntVectorTy(1) &&
- TrueVal->getType() == CondVal->getType()) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ Type *SelType = SI.getType();
+
+ // FIXME: Remove this workaround when freeze related patches are done.
+ // For select with undef operand which feeds into an equality comparison,
+ // don't simplify it so loop unswitch can know the equality comparison
+ // may have an undef operand. This is a workaround for PR31652 caused by
+  // discrepancy about branch on undef between LoopUnswitch and GVN.
+ if (isa<UndefValue>(TrueVal) || isa<UndefValue>(FalseVal)) {
+ if (llvm::any_of(SI.users(), [&](User *U) {
+ ICmpInst *CI = dyn_cast<ICmpInst>(U);
+ if (CI && CI->isEquality())
+ return true;
+ return false;
+ })) {
+ return nullptr;
+ }
+ }
+
+ if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
+ SQ.getWithInstruction(&SI)))
+ return replaceInstUsesWith(SI, V);
+
+ if (Instruction *I = canonicalizeSelectToShuffle(SI))
+ return I;
+
+ if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, *this))
+ return I;
+
+ CmpInst::Predicate Pred;
+
+ if (SelType->isIntOrIntVectorTy(1) &&
+ TrueVal->getType() == CondVal->getType()) {
if (match(TrueVal, m_One()) &&
(EnableUnsafeSelectTransform || impliesPoison(FalseVal, CondVal))) {
- // Change: A = select B, true, C --> A = or B, C
- return BinaryOperator::CreateOr(CondVal, FalseVal);
- }
+ // Change: A = select B, true, C --> A = or B, C
+ return BinaryOperator::CreateOr(CondVal, FalseVal);
+ }
if (match(FalseVal, m_Zero()) &&
(EnableUnsafeSelectTransform || impliesPoison(TrueVal, CondVal))) {
// Change: A = select B, C, false --> A = and B, C
@@ -2604,422 +2604,422 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
}
// select a, false, b -> select !a, b, false
- if (match(TrueVal, m_Zero())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ if (match(TrueVal, m_Zero())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return SelectInst::Create(NotCond, FalseVal,
ConstantInt::getFalse(SelType));
- }
+ }
// select a, b, true -> select !a, true, b
- if (match(FalseVal, m_One())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ if (match(FalseVal, m_One())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return SelectInst::Create(NotCond, ConstantInt::getTrue(SelType),
TrueVal);
- }
-
+ }
+
// select a, a, b -> select a, true, b
- if (CondVal == TrueVal)
+ if (CondVal == TrueVal)
return replaceOperand(SI, 1, ConstantInt::getTrue(SelType));
// select a, b, a -> select a, b, false
- if (CondVal == FalseVal)
+ if (CondVal == FalseVal)
return replaceOperand(SI, 2, ConstantInt::getFalse(SelType));
-
+
// select a, !a, b -> select !a, b, false
- if (match(TrueVal, m_Not(m_Specific(CondVal))))
+ if (match(TrueVal, m_Not(m_Specific(CondVal))))
return SelectInst::Create(TrueVal, FalseVal,
ConstantInt::getFalse(SelType));
// select a, b, !a -> select !a, true, b
- if (match(FalseVal, m_Not(m_Specific(CondVal))))
+ if (match(FalseVal, m_Not(m_Specific(CondVal))))
return SelectInst::Create(FalseVal, ConstantInt::getTrue(SelType),
TrueVal);
- }
-
- // Selecting between two integer or vector splat integer constants?
- //
- // Note that we don't handle a scalar select of vectors:
- // select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0>
- // because that may need 3 instructions to splat the condition value:
- // extend, insertelement, shufflevector.
+ }
+
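Not part of the LLVM source: a minimal standalone C++ sketch of the i1 select identities used above, with bools standing in for i1 values (the poison caveat checked via impliesPoison has no C++ analogue here).

#include <cassert>

int main() {
  for (bool B : {false, true})
    for (bool C : {false, true}) {
      assert((B ? true : C) == (B || C));   // select B, true, C --> or B, C
      assert((B ? C : false) == (B && C));  // select B, C, false --> and B, C
      assert((B ? false : C) == (!B && C)); // select a, false, b --> select !a, b, false
      assert((B ? C : true) == (!B || C));  // select a, b, true --> select !a, true, b
    }
  return 0;
}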
+ // Selecting between two integer or vector splat integer constants?
+ //
+ // Note that we don't handle a scalar select of vectors:
+ // select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0>
+ // because that may need 3 instructions to splat the condition value:
+ // extend, insertelement, shufflevector.
//
  // Do not handle i1 TrueVal and FalseVal; otherwise this would result in
// zext/sext i1 to i1.
if (SelType->isIntOrIntVectorTy() && !SelType->isIntOrIntVectorTy(1) &&
- CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
- // select C, 1, 0 -> zext C to int
- if (match(TrueVal, m_One()) && match(FalseVal, m_Zero()))
- return new ZExtInst(CondVal, SelType);
-
- // select C, -1, 0 -> sext C to int
- if (match(TrueVal, m_AllOnes()) && match(FalseVal, m_Zero()))
- return new SExtInst(CondVal, SelType);
-
- // select C, 0, 1 -> zext !C to int
- if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
- return new ZExtInst(NotCond, SelType);
- }
-
- // select C, 0, -1 -> sext !C to int
- if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
- return new SExtInst(NotCond, SelType);
- }
- }
-
- // See if we are selecting two values based on a comparison of the two values.
- if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
- Value *Cmp0 = FCI->getOperand(0), *Cmp1 = FCI->getOperand(1);
- if ((Cmp0 == TrueVal && Cmp1 == FalseVal) ||
- (Cmp0 == FalseVal && Cmp1 == TrueVal)) {
- // Canonicalize to use ordered comparisons by swapping the select
- // operands.
- //
- // e.g.
- // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
- if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
- FCmpInst::Predicate InvPred = FCI->getInversePredicate();
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- // FIXME: The FMF should propagate from the select, not the fcmp.
- Builder.setFastMathFlags(FCI->getFastMathFlags());
- Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1,
- FCI->getName() + ".inv");
- Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal);
- return replaceInstUsesWith(SI, NewSel);
- }
-
- // NOTE: if we wanted to, this is where to detect MIN/MAX
- }
- }
-
- // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
- // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
- // also require nnan because we do not want to unintentionally change the
- // sign of a NaN value.
- // FIXME: These folds should test/propagate FMF from the select, not the
- // fsub or fneg.
- // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
- Instruction *FSub;
- if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
- match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(FalseVal))) &&
- match(TrueVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
- (Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FSub);
- return replaceInstUsesWith(SI, Fabs);
- }
- // (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
- if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
- match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(TrueVal))) &&
- match(FalseVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
- (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FSub);
- return replaceInstUsesWith(SI, Fabs);
- }
- // With nnan and nsz:
- // (X < +/-0.0) ? -X : X --> fabs(X)
- // (X <= +/-0.0) ? -X : X --> fabs(X)
- Instruction *FNeg;
- if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
- match(TrueVal, m_FNeg(m_Specific(FalseVal))) &&
- match(TrueVal, m_Instruction(FNeg)) &&
- FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
- (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE ||
- Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FNeg);
- return replaceInstUsesWith(SI, Fabs);
- }
- // With nnan and nsz:
- // (X > +/-0.0) ? X : -X --> fabs(X)
- // (X >= +/-0.0) ? X : -X --> fabs(X)
- if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
- match(FalseVal, m_FNeg(m_Specific(TrueVal))) &&
- match(FalseVal, m_Instruction(FNeg)) &&
- FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
- (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE ||
- Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FNeg);
- return replaceInstUsesWith(SI, Fabs);
- }
-
- // See if we are selecting two values based on a comparison of the two values.
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
- if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
- return Result;
-
- if (Instruction *Add = foldAddSubSelect(SI, Builder))
- return Add;
- if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder))
- return Add;
- if (Instruction *Or = foldSetClearBits(SI, Builder))
- return Or;
-
- // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
- auto *TI = dyn_cast<Instruction>(TrueVal);
- auto *FI = dyn_cast<Instruction>(FalseVal);
- if (TI && FI && TI->getOpcode() == FI->getOpcode())
- if (Instruction *IV = foldSelectOpOp(SI, TI, FI))
- return IV;
-
- if (Instruction *I = foldSelectExtConst(SI))
- return I;
-
- // See if we can fold the select into one of our operands.
- if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) {
- if (Instruction *FoldI = foldSelectIntoOp(SI, TrueVal, FalseVal))
- return FoldI;
-
- Value *LHS, *RHS;
- Instruction::CastOps CastOp;
- SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp);
- auto SPF = SPR.Flavor;
- if (SPF) {
- Value *LHS2, *RHS2;
- if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor)
- if (Instruction *R = foldSPFofSPF(cast<Instruction>(LHS), SPF2, LHS2,
- RHS2, SI, SPF, RHS))
- return R;
- if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor)
- if (Instruction *R = foldSPFofSPF(cast<Instruction>(RHS), SPF2, LHS2,
- RHS2, SI, SPF, LHS))
- return R;
- // TODO.
- // ABS(-X) -> ABS(X)
- }
-
- if (SelectPatternResult::isMinOrMax(SPF)) {
- // Canonicalize so that
- // - type casts are outside select patterns.
- // - float clamp is transformed to min/max pattern
-
- bool IsCastNeeded = LHS->getType() != SelType;
- Value *CmpLHS = cast<CmpInst>(CondVal)->getOperand(0);
- Value *CmpRHS = cast<CmpInst>(CondVal)->getOperand(1);
- if (IsCastNeeded ||
- (LHS->getType()->isFPOrFPVectorTy() &&
- ((CmpLHS != LHS && CmpLHS != RHS) ||
- (CmpRHS != LHS && CmpRHS != RHS)))) {
- CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered);
-
- Value *Cmp;
- if (CmpInst::isIntPredicate(MinMaxPred)) {
- Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS);
- } else {
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- auto FMF =
- cast<FPMathOperator>(SI.getCondition())->getFastMathFlags();
- Builder.setFastMathFlags(FMF);
- Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS);
- }
-
- Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI);
- if (!IsCastNeeded)
- return replaceInstUsesWith(SI, NewSI);
-
- Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType);
- return replaceInstUsesWith(SI, NewCast);
- }
-
- // MAX(~a, ~b) -> ~MIN(a, b)
- // MAX(~a, C) -> ~MIN(a, ~C)
- // MIN(~a, ~b) -> ~MAX(a, b)
- // MIN(~a, C) -> ~MAX(a, ~C)
- auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * {
- Value *A;
- if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) &&
- !isFreeToInvert(A, A->hasOneUse()) &&
- // Passing false to only consider m_Not and constants.
- isFreeToInvert(Y, false)) {
- Value *B = Builder.CreateNot(Y);
- Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF),
- A, B);
- // Copy the profile metadata.
- if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) {
- cast<SelectInst>(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD);
- // Swap the metadata if the operands are swapped.
- if (X == SI.getFalseValue() && Y == SI.getTrueValue())
- cast<SelectInst>(NewMinMax)->swapProfMetadata();
- }
-
- return BinaryOperator::CreateNot(NewMinMax);
- }
-
- return nullptr;
- };
-
- if (Instruction *I = moveNotAfterMinMax(LHS, RHS))
- return I;
- if (Instruction *I = moveNotAfterMinMax(RHS, LHS))
- return I;
-
- if (Instruction *I = moveAddAfterMinMax(SPF, LHS, RHS, Builder))
- return I;
-
- if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder))
- return I;
- if (Instruction *I = matchSAddSubSat(SI))
- return I;
- }
- }
-
- // Canonicalize select of FP values where NaN and -0.0 are not valid as
- // minnum/maxnum intrinsics.
- if (isa<FPMathOperator>(SI) && SI.hasNoNaNs() && SI.hasNoSignedZeros()) {
- Value *X, *Y;
- if (match(&SI, m_OrdFMax(m_Value(X), m_Value(Y))))
- return replaceInstUsesWith(
- SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI));
-
- if (match(&SI, m_OrdFMin(m_Value(X), m_Value(Y))))
- return replaceInstUsesWith(
- SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI));
- }
-
- // See if we can fold the select into a phi node if the condition is a select.
- if (auto *PN = dyn_cast<PHINode>(SI.getCondition()))
- // The true/false values have to be live in the PHI predecessor's blocks.
- if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
- canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
- if (Instruction *NV = foldOpIntoPhi(SI, PN))
- return NV;
-
- if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
- if (TrueSI->getCondition()->getType() == CondVal->getType()) {
- // select(C, select(C, a, b), c) -> select(C, a, c)
- if (TrueSI->getCondition() == CondVal) {
- if (SI.getTrueValue() == TrueSI->getTrueValue())
- return nullptr;
- return replaceOperand(SI, 1, TrueSI->getTrueValue());
- }
- // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
+ CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
+ // select C, 1, 0 -> zext C to int
+ if (match(TrueVal, m_One()) && match(FalseVal, m_Zero()))
+ return new ZExtInst(CondVal, SelType);
+
+ // select C, -1, 0 -> sext C to int
+ if (match(TrueVal, m_AllOnes()) && match(FalseVal, m_Zero()))
+ return new SExtInst(CondVal, SelType);
+
+ // select C, 0, 1 -> zext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return new ZExtInst(NotCond, SelType);
+ }
+
+ // select C, 0, -1 -> sext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return new SExtInst(NotCond, SelType);
+ }
+ }
+
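Not part of the LLVM source: a minimal standalone C++ sketch of the constant-select folds above: select C, 1, 0 is a zero extension of the condition and select C, -1, 0 is a sign extension of it.

#include <cassert>
#include <cstdint>

int main() {
  for (bool C : {false, true}) {
    int32_t ZExt = static_cast<int32_t>(C);  // select C, 1, 0  -> zext C
    int32_t SExt = -static_cast<int32_t>(C); // select C, -1, 0 -> sext C
    assert((C ? 1 : 0) == ZExt);
    assert((C ? -1 : 0) == SExt);
    assert((C ? 0 : 1) == static_cast<int32_t>(!C)); // select C, 0, 1 -> zext !C
  }
  return 0;
}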
+ // See if we are selecting two values based on a comparison of the two values.
+ if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+ Value *Cmp0 = FCI->getOperand(0), *Cmp1 = FCI->getOperand(1);
+ if ((Cmp0 == TrueVal && Cmp1 == FalseVal) ||
+ (Cmp0 == FalseVal && Cmp1 == TrueVal)) {
+ // Canonicalize to use ordered comparisons by swapping the select
+ // operands.
+ //
+ // e.g.
+ // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
+ if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+ FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ // FIXME: The FMF should propagate from the select, not the fcmp.
+ Builder.setFastMathFlags(FCI->getFastMathFlags());
+ Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1,
+ FCI->getName() + ".inv");
+ Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal);
+ return replaceInstUsesWith(SI, NewSel);
+ }
+
+ // NOTE: if we wanted to, this is where to detect MIN/MAX
+ }
+ }
+
+ // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
+ // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
+ // also require nnan because we do not want to unintentionally change the
+ // sign of a NaN value.
+ // FIXME: These folds should test/propagate FMF from the select, not the
+ // fsub or fneg.
+ // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
+ Instruction *FSub;
+ if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
+ match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(FalseVal))) &&
+ match(TrueVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
+ (Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FSub);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
+ if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
+ match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(TrueVal))) &&
+ match(FalseVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
+ (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FSub);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // With nnan and nsz:
+ // (X < +/-0.0) ? -X : X --> fabs(X)
+ // (X <= +/-0.0) ? -X : X --> fabs(X)
+ Instruction *FNeg;
+ if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
+ match(TrueVal, m_FNeg(m_Specific(FalseVal))) &&
+ match(TrueVal, m_Instruction(FNeg)) &&
+ FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
+ (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE ||
+ Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FNeg);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // With nnan and nsz:
+ // (X > +/-0.0) ? X : -X --> fabs(X)
+ // (X >= +/-0.0) ? X : -X --> fabs(X)
+ if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
+ match(FalseVal, m_FNeg(m_Specific(TrueVal))) &&
+ match(FalseVal, m_Instruction(FNeg)) &&
+ FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
+ (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE ||
+ Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FNeg);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+
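Not part of the LLVM source: a minimal standalone C++ sketch of the fabs canonicalization above. With no NaNs in play and ignoring the sign of zero (the nnan/nsz requirements), (X <= 0.0) ? -X : X computes fabs(X).

#include <cassert>
#include <cmath>

int main() {
  for (double X : {-3.5, -0.0, 0.0, 2.25}) {
    double SelectForm = (X <= 0.0) ? -X : X;
    // Equal as values; for X == -0.0 the results may differ only in the sign
    // of zero, which is exactly what the nsz flag allows the fold to ignore.
    assert(SelectForm == std::fabs(X));
  }
  return 0;
}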
+ // See if we are selecting two values based on a comparison of the two values.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
+ if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
+ return Result;
+
+ if (Instruction *Add = foldAddSubSelect(SI, Builder))
+ return Add;
+ if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder))
+ return Add;
+ if (Instruction *Or = foldSetClearBits(SI, Builder))
+ return Or;
+
+ // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (TI && FI && TI->getOpcode() == FI->getOpcode())
+ if (Instruction *IV = foldSelectOpOp(SI, TI, FI))
+ return IV;
+
+ if (Instruction *I = foldSelectExtConst(SI))
+ return I;
+
+ // See if we can fold the select into one of our operands.
+ if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) {
+ if (Instruction *FoldI = foldSelectIntoOp(SI, TrueVal, FalseVal))
+ return FoldI;
+
+ Value *LHS, *RHS;
+ Instruction::CastOps CastOp;
+ SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp);
+ auto SPF = SPR.Flavor;
+ if (SPF) {
+ Value *LHS2, *RHS2;
+ if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor)
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(LHS), SPF2, LHS2,
+ RHS2, SI, SPF, RHS))
+ return R;
+ if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor)
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(RHS), SPF2, LHS2,
+ RHS2, SI, SPF, LHS))
+ return R;
+ // TODO.
+ // ABS(-X) -> ABS(X)
+ }
+
+ if (SelectPatternResult::isMinOrMax(SPF)) {
+ // Canonicalize so that
+ // - type casts are outside select patterns.
+ // - float clamp is transformed to min/max pattern
+
+ bool IsCastNeeded = LHS->getType() != SelType;
+ Value *CmpLHS = cast<CmpInst>(CondVal)->getOperand(0);
+ Value *CmpRHS = cast<CmpInst>(CondVal)->getOperand(1);
+ if (IsCastNeeded ||
+ (LHS->getType()->isFPOrFPVectorTy() &&
+ ((CmpLHS != LHS && CmpLHS != RHS) ||
+ (CmpRHS != LHS && CmpRHS != RHS)))) {
+ CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered);
+
+ Value *Cmp;
+ if (CmpInst::isIntPredicate(MinMaxPred)) {
+ Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS);
+ } else {
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ auto FMF =
+ cast<FPMathOperator>(SI.getCondition())->getFastMathFlags();
+ Builder.setFastMathFlags(FMF);
+ Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS);
+ }
+
+ Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI);
+ if (!IsCastNeeded)
+ return replaceInstUsesWith(SI, NewSI);
+
+ Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType);
+ return replaceInstUsesWith(SI, NewCast);
+ }
+
+ // MAX(~a, ~b) -> ~MIN(a, b)
+ // MAX(~a, C) -> ~MIN(a, ~C)
+ // MIN(~a, ~b) -> ~MAX(a, b)
+ // MIN(~a, C) -> ~MAX(a, ~C)
+ auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * {
+ Value *A;
+ if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) &&
+ !isFreeToInvert(A, A->hasOneUse()) &&
+ // Passing false to only consider m_Not and constants.
+ isFreeToInvert(Y, false)) {
+ Value *B = Builder.CreateNot(Y);
+ Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF),
+ A, B);
+ // Copy the profile metadata.
+ if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) {
+ cast<SelectInst>(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD);
+ // Swap the metadata if the operands are swapped.
+ if (X == SI.getFalseValue() && Y == SI.getTrueValue())
+ cast<SelectInst>(NewMinMax)->swapProfMetadata();
+ }
+
+ return BinaryOperator::CreateNot(NewMinMax);
+ }
+
+ return nullptr;
+ };
+
+ if (Instruction *I = moveNotAfterMinMax(LHS, RHS))
+ return I;
+ if (Instruction *I = moveNotAfterMinMax(RHS, LHS))
+ return I;
+
+ if (Instruction *I = moveAddAfterMinMax(SPF, LHS, RHS, Builder))
+ return I;
+
+ if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder))
+ return I;
+ if (Instruction *I = matchSAddSubSat(SI))
+ return I;
+ }
+ }
+
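Not part of the LLVM source: a minimal standalone C++ sketch of the De Morgan-style identities behind moveNotAfterMinMax above, MAX(~a, ~b) == ~MIN(a, b) and MIN(~a, ~b) == ~MAX(a, b), shown here for unsigned values (bitwise not reverses the unsigned order).

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A : {0u, 17u, 0xFFFFFFFFu})
    for (uint32_t B : {5u, 42u}) {
      assert(std::max(~A, ~B) == ~std::min(A, B)); // MAX(~a, ~b) -> ~MIN(a, b)
      assert(std::min(~A, ~B) == ~std::max(A, B)); // MIN(~a, ~b) -> ~MAX(a, b)
    }
  return 0;
}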
+ // Canonicalize select of FP values where NaN and -0.0 are not valid as
+ // minnum/maxnum intrinsics.
+ if (isa<FPMathOperator>(SI) && SI.hasNoNaNs() && SI.hasNoSignedZeros()) {
+ Value *X, *Y;
+ if (match(&SI, m_OrdFMax(m_Value(X), m_Value(Y))))
+ return replaceInstUsesWith(
+ SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI));
+
+ if (match(&SI, m_OrdFMin(m_Value(X), m_Value(Y))))
+ return replaceInstUsesWith(
+ SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI));
+ }
+
+ // See if we can fold the select into a phi node if the condition is a select.
+ if (auto *PN = dyn_cast<PHINode>(SI.getCondition()))
+ // The true/false values have to be live in the PHI predecessor's blocks.
+ if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
+ canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
+ if (Instruction *NV = foldOpIntoPhi(SI, PN))
+ return NV;
+
+ if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
+ if (TrueSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, select(C, a, b), c) -> select(C, a, c)
+ if (TrueSI->getCondition() == CondVal) {
+ if (SI.getTrueValue() == TrueSI->getTrueValue())
+ return nullptr;
+ return replaceOperand(SI, 1, TrueSI->getTrueValue());
+ }
+ // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
// We choose this as normal form to enable folding on the And and
// shortening paths for the values (this helps getUnderlyingObjects() for
// example).
- if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
- Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition());
- replaceOperand(SI, 0, And);
- replaceOperand(SI, 1, TrueSI->getTrueValue());
- return &SI;
- }
- }
- }
- if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
- if (FalseSI->getCondition()->getType() == CondVal->getType()) {
- // select(C, a, select(C, b, c)) -> select(C, a, c)
- if (FalseSI->getCondition() == CondVal) {
- if (SI.getFalseValue() == FalseSI->getFalseValue())
- return nullptr;
- return replaceOperand(SI, 2, FalseSI->getFalseValue());
- }
- // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
- if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
- Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition());
- replaceOperand(SI, 0, Or);
- replaceOperand(SI, 2, FalseSI->getFalseValue());
- return &SI;
- }
- }
- }
-
- auto canMergeSelectThroughBinop = [](BinaryOperator *BO) {
- // The select might be preventing a division by 0.
- switch (BO->getOpcode()) {
- default:
- return true;
- case Instruction::SRem:
- case Instruction::URem:
- case Instruction::SDiv:
- case Instruction::UDiv:
- return false;
- }
- };
-
- // Try to simplify a binop sandwiched between 2 selects with the same
- // condition.
- // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
- BinaryOperator *TrueBO;
- if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) &&
- canMergeSelectThroughBinop(TrueBO)) {
- if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) {
- if (TrueBOSI->getCondition() == CondVal) {
- replaceOperand(*TrueBO, 0, TrueBOSI->getTrueValue());
- Worklist.push(TrueBO);
- return &SI;
- }
- }
- if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(1))) {
- if (TrueBOSI->getCondition() == CondVal) {
- replaceOperand(*TrueBO, 1, TrueBOSI->getTrueValue());
- Worklist.push(TrueBO);
- return &SI;
- }
- }
- }
-
- // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
- BinaryOperator *FalseBO;
- if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) &&
- canMergeSelectThroughBinop(FalseBO)) {
- if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) {
- if (FalseBOSI->getCondition() == CondVal) {
- replaceOperand(*FalseBO, 0, FalseBOSI->getFalseValue());
- Worklist.push(FalseBO);
- return &SI;
- }
- }
- if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(1))) {
- if (FalseBOSI->getCondition() == CondVal) {
- replaceOperand(*FalseBO, 1, FalseBOSI->getFalseValue());
- Worklist.push(FalseBO);
- return &SI;
- }
- }
- }
-
- Value *NotCond;
+ if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
+ Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition());
+ replaceOperand(SI, 0, And);
+ replaceOperand(SI, 1, TrueSI->getTrueValue());
+ return &SI;
+ }
+ }
+ }
+ if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
+ if (FalseSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, a, select(C, b, c)) -> select(C, a, c)
+ if (FalseSI->getCondition() == CondVal) {
+ if (SI.getFalseValue() == FalseSI->getFalseValue())
+ return nullptr;
+ return replaceOperand(SI, 2, FalseSI->getFalseValue());
+ }
+ // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
+ if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
+ Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition());
+ replaceOperand(SI, 0, Or);
+ replaceOperand(SI, 2, FalseSI->getFalseValue());
+ return &SI;
+ }
+ }
+ }
+
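Not part of the LLVM source: a minimal standalone C++ sketch of the nested-select folds above, written with ternaries: select(C, select(C, a, b), c) == select(C, a, c), and select(C0, select(C1, a, b), b) == select(C0&C1, a, b).

#include <cassert>

int main() {
  int A = 1, B = 2, D = 3;
  // select(C, select(C, a, b), c) -> select(C, a, c)
  for (bool C : {false, true})
    assert((C ? (C ? A : B) : D) == (C ? A : D));
  // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
  for (bool C0 : {false, true})
    for (bool C1 : {false, true})
      assert((C0 ? (C1 ? A : B) : B) == ((C0 && C1) ? A : B));
  return 0;
}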
+ auto canMergeSelectThroughBinop = [](BinaryOperator *BO) {
+ // The select might be preventing a division by 0.
+ switch (BO->getOpcode()) {
+ default:
+ return true;
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ return false;
+ }
+ };
+
+ // Try to simplify a binop sandwiched between 2 selects with the same
+ // condition.
+ // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
+ BinaryOperator *TrueBO;
+ if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) &&
+ canMergeSelectThroughBinop(TrueBO)) {
+ if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) {
+ if (TrueBOSI->getCondition() == CondVal) {
+ replaceOperand(*TrueBO, 0, TrueBOSI->getTrueValue());
+ Worklist.push(TrueBO);
+ return &SI;
+ }
+ }
+ if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(1))) {
+ if (TrueBOSI->getCondition() == CondVal) {
+ replaceOperand(*TrueBO, 1, TrueBOSI->getTrueValue());
+ Worklist.push(TrueBO);
+ return &SI;
+ }
+ }
+ }
+
+ // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
+ BinaryOperator *FalseBO;
+ if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) &&
+ canMergeSelectThroughBinop(FalseBO)) {
+ if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) {
+ if (FalseBOSI->getCondition() == CondVal) {
+ replaceOperand(*FalseBO, 0, FalseBOSI->getFalseValue());
+ Worklist.push(FalseBO);
+ return &SI;
+ }
+ }
+ if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(1))) {
+ if (FalseBOSI->getCondition() == CondVal) {
+ replaceOperand(*FalseBO, 1, FalseBOSI->getFalseValue());
+ Worklist.push(FalseBO);
+ return &SI;
+ }
+ }
+ }
+
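Not part of the LLVM source: a minimal standalone C++ sketch of the select-through-binop fold above, select(C, binop(select(C, X, Y), W), Z) == select(C, binop(X, W), Z). The opcode filter in canMergeSelectThroughBinop keeps the transform away from division-like binops, where the outer select may be guarding a division by zero.

#include <cassert>

int main() {
  int X = 5, Y = 7, W = 11, Z = 13;
  // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
  for (bool C : {false, true})
    assert((C ? (C ? X : Y) + W : Z) == (C ? X + W : Z));
  // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
  for (bool C : {false, true})
    assert((C ? Z : (C ? X : Y) + W) == (C ? Z : Y + W));
  return 0;
}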
+ Value *NotCond;
if (match(CondVal, m_Not(m_Value(NotCond))) &&
!InstCombiner::shouldAvoidAbsorbingNotIntoSelect(SI)) {
- replaceOperand(SI, 0, NotCond);
- SI.swapValues();
- SI.swapProfMetadata();
- return &SI;
- }
-
- if (Instruction *I = foldVectorSelect(SI))
- return I;
-
- // If we can compute the condition, there's no need for a select.
- // Like the above fold, we are attempting to reduce compile-time cost by
- // putting this fold here with limitations rather than in InstSimplify.
- // The motivation for this call into value tracking is to take advantage of
- // the assumption cache, so make sure that is populated.
- if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
- KnownBits Known(1);
- computeKnownBits(CondVal, Known, 0, &SI);
- if (Known.One.isOneValue())
- return replaceInstUsesWith(SI, TrueVal);
- if (Known.Zero.isOneValue())
- return replaceInstUsesWith(SI, FalseVal);
- }
-
- if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, Builder))
- return BitCastSel;
-
- // Simplify selects that test the returned flag of cmpxchg instructions.
- if (Value *V = foldSelectCmpXchg(SI))
- return replaceInstUsesWith(SI, V);
-
- if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI, *this))
- return Select;
-
+ replaceOperand(SI, 0, NotCond);
+ SI.swapValues();
+ SI.swapProfMetadata();
+ return &SI;
+ }
+
+ if (Instruction *I = foldVectorSelect(SI))
+ return I;
+
+ // If we can compute the condition, there's no need for a select.
+ // Like the above fold, we are attempting to reduce compile-time cost by
+ // putting this fold here with limitations rather than in InstSimplify.
+ // The motivation for this call into value tracking is to take advantage of
+ // the assumption cache, so make sure that is populated.
+ if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
+ KnownBits Known(1);
+ computeKnownBits(CondVal, Known, 0, &SI);
+ if (Known.One.isOneValue())
+ return replaceInstUsesWith(SI, TrueVal);
+ if (Known.Zero.isOneValue())
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+
+ if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, Builder))
+ return BitCastSel;
+
+ // Simplify selects that test the returned flag of cmpxchg instructions.
+ if (Value *V = foldSelectCmpXchg(SI))
+ return replaceInstUsesWith(SI, V);
+
+ if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI, *this))
+ return Select;
+
if (Instruction *Funnel = foldSelectFunnelShift(SI, Builder))
return Funnel;
-
- if (Instruction *Copysign = foldSelectToCopysign(SI, Builder))
- return Copysign;
-
- if (Instruction *PN = foldSelectToPhi(SI, DT, Builder))
- return replaceInstUsesWith(SI, PN);
-
+
+ if (Instruction *Copysign = foldSelectToCopysign(SI, Builder))
+ return Copysign;
+
+ if (Instruction *PN = foldSelectToPhi(SI, DT, Builder))
+ return replaceInstUsesWith(SI, PN);
+
if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder))
return replaceInstUsesWith(SI, Fr);
- return nullptr;
-}
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 194c67f595..127bf80809 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1,26 +1,26 @@
-//===- InstCombineShifts.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitShl, visitLShr, and visitAShr functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
+//===- InstCombineShifts.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitShl, visitLShr, and visitAShr functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
bool canTryToConstantAddTwoShiftAmounts(Value *Sh0, Value *ShAmt0, Value *Sh1,
Value *ShAmt1) {
// We have two shift amounts from two different shifts. The types of those
@@ -45,911 +45,911 @@ bool canTryToConstantAddTwoShiftAmounts(Value *Sh0, Value *ShAmt0, Value *Sh1,
return MaximalRepresentableShiftAmount.uge(MaximalPossibleTotalShiftAmount);
}
-// Given pattern:
-// (x shiftopcode Q) shiftopcode K
-// we should rewrite it as
-// x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) and
-//
-// This is valid for any shift, but they must be identical, and we must be
-// careful in case we have (zext(Q)+zext(K)) and look past extensions,
-// (Q+K) must not overflow or else (Q+K) u< bitwidth(x) is bogus.
-//
-// AnalyzeForSignBitExtraction indicates that we will only analyze whether this
-// pattern has any 2 right-shifts that sum to 1 less than original bit width.
+// Given pattern:
+// (x shiftopcode Q) shiftopcode K
+// we should rewrite it as
+// x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) and
+//
+// This is valid for any shift, but they must be identical, and we must be
+// careful in case we have (zext(Q)+zext(K)) and look past extensions,
+// (Q+K) must not overflow or else (Q+K) u< bitwidth(x) is bogus.
+//
+// AnalyzeForSignBitExtraction indicates that we will only analyze whether this
+// pattern has any 2 right-shifts that sum to 1 less than original bit width.
Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts(
- BinaryOperator *Sh0, const SimplifyQuery &SQ,
- bool AnalyzeForSignBitExtraction) {
- // Look for a shift of some instruction, ignore zext of shift amount if any.
- Instruction *Sh0Op0;
- Value *ShAmt0;
- if (!match(Sh0,
- m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0)))))
- return nullptr;
-
- // If there is a truncation between the two shifts, we must make note of it
- // and look through it. The truncation imposes additional constraints on the
- // transform.
- Instruction *Sh1;
- Value *Trunc = nullptr;
- match(Sh0Op0,
- m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)),
- m_Instruction(Sh1)));
-
- // Inner shift: (x shiftopcode ShAmt1)
- // Like with other shift, ignore zext of shift amount if any.
- Value *X, *ShAmt1;
- if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1)))))
- return nullptr;
-
+ BinaryOperator *Sh0, const SimplifyQuery &SQ,
+ bool AnalyzeForSignBitExtraction) {
+ // Look for a shift of some instruction, ignore zext of shift amount if any.
+ Instruction *Sh0Op0;
+ Value *ShAmt0;
+ if (!match(Sh0,
+ m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0)))))
+ return nullptr;
+
+ // If there is a truncation between the two shifts, we must make note of it
+ // and look through it. The truncation imposes additional constraints on the
+ // transform.
+ Instruction *Sh1;
+ Value *Trunc = nullptr;
+ match(Sh0Op0,
+ m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)),
+ m_Instruction(Sh1)));
+
+ // Inner shift: (x shiftopcode ShAmt1)
+ // Like with other shift, ignore zext of shift amount if any.
+ Value *X, *ShAmt1;
+ if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1)))))
+ return nullptr;
+
// Verify that it would be safe to try to add those two shift amounts.
if (!canTryToConstantAddTwoShiftAmounts(Sh0, ShAmt0, Sh1, ShAmt1))
- return nullptr;
-
- // We are only looking for signbit extraction if we have two right shifts.
- bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) &&
- match(Sh1, m_Shr(m_Value(), m_Value()));
- // ... and if it's not two right-shifts, we know the answer already.
- if (AnalyzeForSignBitExtraction && !HadTwoRightShifts)
- return nullptr;
-
- // The shift opcodes must be identical, unless we are just checking whether
- // this pattern can be interpreted as a sign-bit-extraction.
- Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode();
- bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode();
- if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction)
- return nullptr;
-
-  // If we saw truncation, we'll need to produce an extra instruction,
- // and for that one of the operands of the shift must be one-use,
- // unless of course we don't actually plan to produce any instructions here.
- if (Trunc && !AnalyzeForSignBitExtraction &&
- !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
- return nullptr;
-
- // Can we fold (ShAmt0+ShAmt1) ?
- auto *NewShAmt = dyn_cast_or_null<Constant>(
- SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false,
- SQ.getWithInstruction(Sh0)));
- if (!NewShAmt)
- return nullptr; // Did not simplify.
- unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits();
- unsigned XBitWidth = X->getType()->getScalarSizeInBits();
- // Is the new shift amount smaller than the bit width of inner/new shift?
- if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
- APInt(NewShAmtBitWidth, XBitWidth))))
- return nullptr; // FIXME: could perform constant-folding.
-
- // If there was a truncation, and we have a right-shift, we can only fold if
- // we are left with the original sign bit. Likewise, if we were just checking
-  // that this is a sign bit extraction, this is the place to check it.
- // FIXME: zero shift amount is also legal here, but we can't *easily* check
- // more than one predicate so it's not really worth it.
- if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) {
- // If it's not a sign bit extraction, then we're done.
- if (!match(NewShAmt,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(NewShAmtBitWidth, XBitWidth - 1))))
- return nullptr;
- // If it is, and that was the question, return the base value.
- if (AnalyzeForSignBitExtraction)
- return X;
- }
-
- assert(IdenticalShOpcodes && "Should not get here with different shifts.");
-
- // All good, we can do this fold.
- NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType());
-
- BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt);
-
- // The flags can only be propagated if there wasn't a trunc.
- if (!Trunc) {
- // If the pattern did not involve trunc, and both of the original shifts
- // had the same flag set, preserve the flag.
- if (ShiftOpcode == Instruction::BinaryOps::Shl) {
- NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() &&
- Sh1->hasNoUnsignedWrap());
- NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() &&
- Sh1->hasNoSignedWrap());
- } else {
- NewShift->setIsExact(Sh0->isExact() && Sh1->isExact());
- }
- }
-
- Instruction *Ret = NewShift;
- if (Trunc) {
- Builder.Insert(NewShift);
- Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType());
- }
-
- return Ret;
-}
-
-// If we have some pattern that leaves only some low bits set, and then performs
-// left-shift of those bits, if none of the bits that are left after the final
-// shift are modified by the mask, we can omit the mask.
-//
-// There are many variants to this pattern:
-// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt
-// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt
-// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt
-// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt
-// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt
-// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt
-// All these patterns can be simplified to just:
-// x << ShiftShAmt
-// iff:
-// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x)
-// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt)
-static Instruction *
-dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
- const SimplifyQuery &Q,
- InstCombiner::BuilderTy &Builder) {
- assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl &&
- "The input must be 'shl'!");
-
- Value *Masked, *ShiftShAmt;
- match(OuterShift,
- m_Shift(m_Value(Masked), m_ZExtOrSelf(m_Value(ShiftShAmt))));
-
- // *If* there is a truncation between an outer shift and a possibly-mask,
- // then said truncation *must* be one-use, else we can't perform the fold.
- Value *Trunc;
- if (match(Masked, m_CombineAnd(m_Trunc(m_Value(Masked)), m_Value(Trunc))) &&
- !Trunc->hasOneUse())
- return nullptr;
-
- Type *NarrowestTy = OuterShift->getType();
- Type *WidestTy = Masked->getType();
- bool HadTrunc = WidestTy != NarrowestTy;
-
- // The mask must be computed in a type twice as wide to ensure
- // that no bits are lost if the sum-of-shifts is wider than the base type.
- Type *ExtendedTy = WidestTy->getExtendedType();
-
- Value *MaskShAmt;
-
- // ((1 << MaskShAmt) - 1)
- auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes());
-  // (~(-1 << MaskShAmt))
- auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes());
- // (-1 >> MaskShAmt)
- auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt));
- // ((-1 << MaskShAmt) >> MaskShAmt)
- auto MaskD =
- m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt));
-
- Value *X;
- Constant *NewMask;
-
- if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) {
- // Peek through an optional zext of the shift amount.
- match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
-
+ return nullptr;
+
+ // We are only looking for signbit extraction if we have two right shifts.
+ bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) &&
+ match(Sh1, m_Shr(m_Value(), m_Value()));
+ // ... and if it's not two right-shifts, we know the answer already.
+ if (AnalyzeForSignBitExtraction && !HadTwoRightShifts)
+ return nullptr;
+
+ // The shift opcodes must be identical, unless we are just checking whether
+ // this pattern can be interpreted as a sign-bit-extraction.
+ Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode();
+ bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode();
+ if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction)
+ return nullptr;
+
+  // If we saw truncation, we'll need to produce an extra instruction,
+ // and for that one of the operands of the shift must be one-use,
+ // unless of course we don't actually plan to produce any instructions here.
+ if (Trunc && !AnalyzeForSignBitExtraction &&
+ !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+ return nullptr;
+
+ // Can we fold (ShAmt0+ShAmt1) ?
+ auto *NewShAmt = dyn_cast_or_null<Constant>(
+ SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false,
+ SQ.getWithInstruction(Sh0)));
+ if (!NewShAmt)
+ return nullptr; // Did not simplify.
+ unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits();
+ unsigned XBitWidth = X->getType()->getScalarSizeInBits();
+ // Is the new shift amount smaller than the bit width of inner/new shift?
+ if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
+ APInt(NewShAmtBitWidth, XBitWidth))))
+ return nullptr; // FIXME: could perform constant-folding.
+
+ // If there was a truncation, and we have a right-shift, we can only fold if
+ // we are left with the original sign bit. Likewise, if we were just checking
+  // that this is a sign bit extraction, this is the place to check it.
+ // FIXME: zero shift amount is also legal here, but we can't *easily* check
+ // more than one predicate so it's not really worth it.
+ if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) {
+ // If it's not a sign bit extraction, then we're done.
+ if (!match(NewShAmt,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(NewShAmtBitWidth, XBitWidth - 1))))
+ return nullptr;
+ // If it is, and that was the question, return the base value.
+ if (AnalyzeForSignBitExtraction)
+ return X;
+ }
+
+ assert(IdenticalShOpcodes && "Should not get here with different shifts.");
+
+ // All good, we can do this fold.
+ NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType());
+
+ BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt);
+
+ // The flags can only be propagated if there wasn't a trunc.
+ if (!Trunc) {
+ // If the pattern did not involve trunc, and both of the original shifts
+ // had the same flag set, preserve the flag.
+ if (ShiftOpcode == Instruction::BinaryOps::Shl) {
+ NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() &&
+ Sh1->hasNoUnsignedWrap());
+ NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() &&
+ Sh1->hasNoSignedWrap());
+ } else {
+ NewShift->setIsExact(Sh0->isExact() && Sh1->isExact());
+ }
+ }
+
+ Instruction *Ret = NewShift;
+ if (Trunc) {
+ Builder.Insert(NewShift);
+ Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType());
+ }
+
+ return Ret;
+}
+
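Not part of the LLVM source: a minimal standalone C++ sketch of the reassociation above, (x shift q) shift k == x shift (q + k), valid as long as q + k stays below the bit width (otherwise the combined shift would be undefined in C++, which is why the fold checks the summed amount).

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEADBEEFu;
  for (unsigned Q = 0; Q < 16; ++Q)
    for (unsigned K = 0; K < 16; ++K) {
      // Q + K <= 30 here, so the combined shift amount stays in range.
      assert(((X >> Q) >> K) == (X >> (Q + K)));
      assert(((X << Q) << K) == (X << (Q + K)));
    }
  return 0;
}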
+// If we have some pattern that leaves only some low bits set, and then performs
+// left-shift of those bits, if none of the bits that are left after the final
+// shift are modified by the mask, we can omit the mask.
+//
+// There are many variants to this pattern:
+// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt
+// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt
+// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt
+// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt
+// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt
+// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt
+// All these patterns can be simplified to just:
+// x << ShiftShAmt
+// iff:
+// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x)
+// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt)
+static Instruction *
+dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
+ const SimplifyQuery &Q,
+ InstCombiner::BuilderTy &Builder) {
+ assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl &&
+ "The input must be 'shl'!");
+
+ Value *Masked, *ShiftShAmt;
+ match(OuterShift,
+ m_Shift(m_Value(Masked), m_ZExtOrSelf(m_Value(ShiftShAmt))));
+
+ // *If* there is a truncation between an outer shift and a possible mask,
+ // then said truncation *must* be one-use, else we can't perform the fold.
+ Value *Trunc;
+ if (match(Masked, m_CombineAnd(m_Trunc(m_Value(Masked)), m_Value(Trunc))) &&
+ !Trunc->hasOneUse())
+ return nullptr;
+
+ Type *NarrowestTy = OuterShift->getType();
+ Type *WidestTy = Masked->getType();
+ bool HadTrunc = WidestTy != NarrowestTy;
+
+ // The mask must be computed in a type twice as wide to ensure
+ // that no bits are lost if the sum-of-shifts is wider than the base type.
+ Type *ExtendedTy = WidestTy->getExtendedType();
+
+ Value *MaskShAmt;
+
+ // ((1 << MaskShAmt) - 1)
+ auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes());
+ // (~(-1 << maskNbits))
+ auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes());
+ // (-1 >> MaskShAmt)
+ auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt));
+ // ((-1 << MaskShAmt) >> MaskShAmt)
+ auto MaskD =
+ m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt));
+
+ Value *X;
+ Constant *NewMask;
+
+ if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) {
+ // Peek through an optional zext of the shift amount.
+ match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
+
// Verify that it would be safe to try to add those two shift amounts.
if (!canTryToConstantAddTwoShiftAmounts(OuterShift, ShiftShAmt, Masked,
MaskShAmt))
- return nullptr;
-
- // Can we simplify (MaskShAmt+ShiftShAmt) ?
- auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst(
- MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
- if (!SumOfShAmts)
- return nullptr; // Did not simplify.
- // In this pattern SumOfShAmts correlates with the number of low bits
- // that shall remain in the root value (OuterShift).
-
- // An extend of an undef value becomes zero because the high bits are never
- // completely unknown. Replace the `undef` shift amounts with final
- // shift bitwidth to ensure that the value remains undef when creating the
- // subsequent shift op.
- SumOfShAmts = Constant::replaceUndefsWith(
- SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(),
- ExtendedTy->getScalarSizeInBits()));
- auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy);
- // And compute the mask as usual: ~(-1 << (SumOfShAmts))
- auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
- auto *ExtendedInvertedMask =
- ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts);
- NewMask = ConstantExpr::getNot(ExtendedInvertedMask);
- } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) ||
- match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)),
- m_Deferred(MaskShAmt)))) {
- // Peek through an optional zext of the shift amount.
- match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
-
+ return nullptr;
+
+ // Can we simplify (MaskShAmt+ShiftShAmt) ?
+ auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst(
+ MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
+ if (!SumOfShAmts)
+ return nullptr; // Did not simplify.
+ // In this pattern SumOfShAmts correlates with the number of low bits
+ // that shall remain in the root value (OuterShift).
+
+ // An extend of an undef value becomes zero because the high bits are never
+ // completely unknown. Replace the `undef` shift amounts with final
+ // shift bitwidth to ensure that the value remains undef when creating the
+ // subsequent shift op.
+ SumOfShAmts = Constant::replaceUndefsWith(
+ SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(),
+ ExtendedTy->getScalarSizeInBits()));
+ auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy);
+ // And compute the mask as usual: ~(-1 << (SumOfShAmts))
+ auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
+ auto *ExtendedInvertedMask =
+ ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts);
+ NewMask = ConstantExpr::getNot(ExtendedInvertedMask);
+ } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) ||
+ match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)),
+ m_Deferred(MaskShAmt)))) {
+ // Peek through an optional zext of the shift amount.
+ match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
+
// Verify that it would be safe to try to add those two shift amounts.
if (!canTryToConstantAddTwoShiftAmounts(OuterShift, ShiftShAmt, Masked,
MaskShAmt))
- return nullptr;
-
- // Can we simplify (ShiftShAmt-MaskShAmt) ?
- auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst(
- ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
- if (!ShAmtsDiff)
- return nullptr; // Did not simplify.
- // In this pattern ShAmtsDiff correlates with the number of high bits that
- // shall be unset in the root value (OuterShift).
-
- // An extend of an undef value becomes zero because the high bits are never
- // completely unknown. Replace the `undef` shift amounts with negated
- // bitwidth of innermost shift to ensure that the value remains undef when
- // creating the subsequent shift op.
- unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits();
- ShAmtsDiff = Constant::replaceUndefsWith(
- ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(),
- -WidestTyBitWidth));
- auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt(
- ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(),
- WidestTyBitWidth,
- /*isSigned=*/false),
- ShAmtsDiff),
- ExtendedTy);
- // And compute the mask as usual: (-1 l>> (NumHighBitsToClear))
- auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
- NewMask =
- ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear);
- } else
- return nullptr; // Don't know anything about this pattern.
-
- NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy);
-
- // Does this mask have any unset bits? If not, we can simply skip applying it.
- bool NeedMask = !match(NewMask, m_AllOnes());
-
- // If we need to apply a mask, there are several more restrictions we have.
- if (NeedMask) {
- // The old masking instruction must go away.
- if (!Masked->hasOneUse())
- return nullptr;
- // The original "masking" instruction must not have been `ashr`.
- if (match(Masked, m_AShr(m_Value(), m_Value())))
- return nullptr;
- }
-
- // If we need to apply truncation, let's do it first, since we can.
- // We have already ensured that the old truncation will go away.
- if (HadTrunc)
- X = Builder.CreateTrunc(X, NarrowestTy);
-
- // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits.
- // We didn't change the Type of this outermost shift, so we can just do it.
- auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X,
- OuterShift->getOperand(1));
- if (!NeedMask)
- return NewShift;
-
- Builder.Insert(NewShift);
- return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
-}
-
-/// If we have a shift-by-constant of a bitwise logic op that itself has a
-/// shift-by-constant operand with identical opcode, we may be able to convert
-/// that into 2 independent shifts followed by the logic op. This eliminates
-/// a use of an intermediate value (reduces dependency chain).
-static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.isShift() && "Expected a shift as input");
- auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
- if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
- return nullptr;
-
+ return nullptr;
+
+ // Can we simplify (ShiftShAmt-MaskShAmt) ?
+ auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst(
+ ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
+ if (!ShAmtsDiff)
+ return nullptr; // Did not simplify.
+ // In this pattern ShAmtsDiff correlates with the number of high bits that
+ // shall be unset in the root value (OuterShift).
+
+ // An extend of an undef value becomes zero because the high bits are never
+ // completely unknown. Replace the `undef` shift amounts with negated
+ // bitwidth of innermost shift to ensure that the value remains undef when
+ // creating the subsequent shift op.
+ unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits();
+ ShAmtsDiff = Constant::replaceUndefsWith(
+ ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(),
+ -WidestTyBitWidth));
+ auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt(
+ ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(),
+ WidestTyBitWidth,
+ /*isSigned=*/false),
+ ShAmtsDiff),
+ ExtendedTy);
+ // And compute the mask as usual: (-1 l>> (NumHighBitsToClear))
+ auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
+ NewMask =
+ ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear);
+ } else
+ return nullptr; // Don't know anything about this pattern.
+
+ NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy);
+
+ // Does this mask have any unset bits? If not, we can simply skip applying it.
+ bool NeedMask = !match(NewMask, m_AllOnes());
+
+ // If we need to apply a mask, there are several more restrictions we have.
+ if (NeedMask) {
+ // The old masking instruction must go away.
+ if (!Masked->hasOneUse())
+ return nullptr;
+ // The original "masking" instruction must not have been `ashr`.
+ if (match(Masked, m_AShr(m_Value(), m_Value())))
+ return nullptr;
+ }
+
+ // If we need to apply truncation, let's do it first, since we can.
+ // We have already ensured that the old truncation will go away.
+ if (HadTrunc)
+ X = Builder.CreateTrunc(X, NarrowestTy);
+
+ // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits.
+ // We didn't change the Type of this outermost shift, so we can just do it.
+ auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X,
+ OuterShift->getOperand(1));
+ if (!NeedMask)
+ return NewShift;
+
+ Builder.Insert(NewShift);
+ return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
+}
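
A minimal standalone check of two of the mask variants listed above (not LLVM code; the mask and shift amounts are illustrative), confirming that the mask is redundant under the stated conditions:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
        // Variant (a): (x & ((1 << m) - 1)) << s == x << s  iff m + s >= 32.
        const unsigned m1 = 24, s1 = 8;                        // 24 + 8 == 32
        assert(((x & ((1u << m1) - 1u)) << s1) == (x << s1));
        // Variant (c): (x & (-1 u>> m)) << s == x << s  iff s >= m.
        const unsigned m2 = 3, s2 = 5;                         // 5 >= 3
        assert(((x & (~0u >> m2)) << s2) == (x << s2));
      }
      return 0;
    }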
+
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This eliminates
+/// a use of an intermediate value (reduces dependency chain).
+static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.isShift() && "Expected a shift as input");
+ auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
+ if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
+ return nullptr;
+
Constant *C0, *C1;
if (!match(I.getOperand(1), m_Constant(C1)))
- return nullptr;
-
- Instruction::BinaryOps ShiftOpcode = I.getOpcode();
- Type *Ty = I.getType();
-
- // Find a matching one-use shift by constant. The fold is not valid if the sum
- // of the shift values equals or exceeds bitwidth.
- // TODO: Remove the one-use check if the other logic operand (Y) is constant.
- Value *X, *Y;
- auto matchFirstShift = [&](Value *V) {
+ return nullptr;
+
+ Instruction::BinaryOps ShiftOpcode = I.getOpcode();
+ Type *Ty = I.getType();
+
+ // Find a matching one-use shift by constant. The fold is not valid if the sum
+ // of the shift values equals or exceeds bitwidth.
+ // TODO: Remove the one-use check if the other logic operand (Y) is constant.
+ Value *X, *Y;
+ auto matchFirstShift = [&](Value *V) {
BinaryOperator *BO;
APInt Threshold(Ty->getScalarSizeInBits(), Ty->getScalarSizeInBits());
return match(V, m_BinOp(BO)) && BO->getOpcode() == ShiftOpcode &&
match(V, m_OneUse(m_Shift(m_Value(X), m_Constant(C0)))) &&
match(ConstantExpr::getAdd(C0, C1),
m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold));
- };
-
- // Logic ops are commutative, so check each operand for a match.
- if (matchFirstShift(LogicInst->getOperand(0)))
- Y = LogicInst->getOperand(1);
- else if (matchFirstShift(LogicInst->getOperand(1)))
- Y = LogicInst->getOperand(0);
- else
- return nullptr;
-
- // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+ };
+
+ // Logic ops are commutative, so check each operand for a match.
+ if (matchFirstShift(LogicInst->getOperand(0)))
+ Y = LogicInst->getOperand(1);
+ else if (matchFirstShift(LogicInst->getOperand(1)))
+ Y = LogicInst->getOperand(0);
+ else
+ return nullptr;
+
+ // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
Constant *ShiftSumC = ConstantExpr::getAdd(C0, C1);
- Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
- Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
- return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
-}
-
+ Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
+ Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
+ return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
+}
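
The fold works because shifts distribute over the bitwise logic ops. A standalone sketch under illustrative constants (C0 = 3, C1 = 4, so C0 + C1 stays below the bit width):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x : {0u, 0x12345678u, 0xFFFFFFFFu})
        for (uint32_t y : {0u, 0x0F0F0F0Fu, 0xCAFEBABEu}) {
          // shift (logic (shift X, C0), Y), C1
          //   --> logic (shift X, C0 + C1), (shift Y, C1)
          assert((((x << 3) | y) << 4) == ((x << 7) | (y << 4))); // shl over or
          assert((((x >> 3) & y) >> 4) == ((x >> 7) & (y >> 4))); // lshr over and
        }
      return 0;
    }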
+
Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- assert(Op0->getType() == Op1->getType());
-
- // If the shift amount is a one-use `sext`, we can demote it to `zext`.
- Value *Y;
- if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) {
- Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName());
- return BinaryOperator::Create(I.getOpcode(), Op0, NewExt);
- }
-
- // See if we can fold away this shift.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // Try to fold constant and into select arguments.
- if (isa<Constant>(Op0))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (Constant *CUI = dyn_cast<Constant>(Op1))
- if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
- return Res;
-
- if (auto *NewShift = cast_or_null<Instruction>(
- reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ)))
- return NewShift;
-
- // (C1 shift (A add C2)) -> ((C1 shift C2) shift A)
- // iff A and C2 are both positive.
- Value *A;
- Constant *C;
- if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C))))
- if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) &&
- isKnownNonNegative(C, DL, 0, &AC, &I, &DT))
- return BinaryOperator::Create(
- I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A);
-
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ assert(Op0->getType() == Op1->getType());
+
+ // If the shift amount is a one-use `sext`, we can demote it to `zext`.
+ Value *Y;
+ if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) {
+ Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName());
+ return BinaryOperator::Create(I.getOpcode(), Op0, NewExt);
+ }
+
+ // See if we can fold away this shift.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Try to fold constant and into select arguments.
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (Constant *CUI = dyn_cast<Constant>(Op1))
+ if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+ return Res;
+
+ if (auto *NewShift = cast_or_null<Instruction>(
+ reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ)))
+ return NewShift;
+
+ // (C1 shift (A add C2)) -> ((C1 shift C2) shift A)
+ // iff A and C2 are both positive.
+ Value *A;
+ Constant *C;
+ if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C))))
+ if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) &&
+ isKnownNonNegative(C, DL, 0, &AC, &I, &DT))
+ return BinaryOperator::Create(
+ I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A);
+
// X shift (A srem C) -> X shift (A and (C - 1)) iff C is a power of 2.
- // Because shifts by negative values (which could occur if A were negative)
- // are undefined.
+ // Because shifts by negative values (which could occur if A were negative)
+ // are undefined.
if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Constant(C))) &&
match(C, m_Power2())) {
- // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
- // demand the sign bit (and many others) here??
+ // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
+ // demand the sign bit (and many others) here??
Constant *Mask = ConstantExpr::getSub(C, ConstantInt::get(I.getType(), 1));
Value *Rem = Builder.CreateAnd(A, Mask, Op1->getName());
- return replaceOperand(I, 1, Rem);
- }
-
- if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
- return Logic;
-
- return nullptr;
-}
-
-/// Return true if we can simplify two logical (either left or right) shifts
-/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
-static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
+ return replaceOperand(I, 1, Rem);
+ }
+
+ if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
+ return Logic;
+
+ return nullptr;
+}
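
The srem-to-and rewrite in the function above is the usual power-of-two remainder trick; for a non-negative dividend the two forms agree. A small standalone sketch with an assumed divisor of 8:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int32_t C = 8;                       // power of two
      for (int32_t a : {0, 1, 7, 8, 9, 12345, 0x7FFFFFFF}) {
        // a srem C == a & (C - 1) for a >= 0, so the shift amount is unchanged.
        assert((a % C) == (a & (C - 1)));
        assert((1u << (a % C)) == (1u << (a & (C - 1))));
      }
      return 0;
    }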
+
+/// Return true if we can simplify two logical (either left or right) shifts
+/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
+static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
Instruction *InnerShift,
InstCombinerImpl &IC, Instruction *CxtI) {
- assert(InnerShift->isLogicalShift() && "Unexpected instruction type");
-
- // We need constant scalar or constant splat shifts.
- const APInt *InnerShiftConst;
- if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
- return false;
-
- // Two logical shifts in the same direction:
- // shl (shl X, C1), C2 --> shl X, C1 + C2
- // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
- bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
- if (IsInnerShl == IsOuterShl)
- return true;
-
- // Equal shift amounts in opposite directions become bitwise 'and':
- // lshr (shl X, C), C --> and X, C'
- // shl (lshr X, C), C --> and X, C'
- if (*InnerShiftConst == OuterShAmt)
- return true;
-
- // If the 2nd shift is bigger than the 1st, we can fold:
- // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
- // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
- // but it isn't profitable unless we know the and'd out bits are already zero.
- // Also, check that the inner shift is valid (less than the type width) or
- // we'll crash trying to produce the bit mask for the 'and'.
- unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
- if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) {
- unsigned InnerShAmt = InnerShiftConst->getZExtValue();
- unsigned MaskShift =
- IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
- APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
- if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI))
- return true;
- }
-
- return false;
-}
-
-/// See if we can compute the specified value, but shifted logically to the left
-/// or right by some number of bits. This should return true if the expression
-/// can be computed for the same cost as the current expression tree. This is
-/// used to eliminate extraneous shifting from things like:
-/// %C = shl i128 %A, 64
-/// %D = shl i128 %B, 96
-/// %E = or i128 %C, %D
-/// %F = lshr i128 %E, 64
-/// where the client will ask if E can be computed shifted right by 64-bits. If
-/// this succeeds, getShiftedValue() will be called to produce the value.
-static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
+ assert(InnerShift->isLogicalShift() && "Unexpected instruction type");
+
+ // We need constant scalar or constant splat shifts.
+ const APInt *InnerShiftConst;
+ if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
+ return false;
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ if (IsInnerShl == IsOuterShl)
+ return true;
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ if (*InnerShiftConst == OuterShAmt)
+ return true;
+
+ // If the 2nd shift is bigger than the 1st, we can fold:
+ // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
+ // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
+ // but it isn't profitable unless we know the and'd out bits are already zero.
+ // Also, check that the inner shift is valid (less than the type width) or
+ // we'll crash trying to produce the bit mask for the 'and'.
+ unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
+ if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) {
+ unsigned InnerShAmt = InnerShiftConst->getZExtValue();
+ unsigned MaskShift =
+ IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
+ APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
+ if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI))
+ return true;
+ }
+
+ return false;
+}
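
The "equal shift amounts in opposite directions become a mask" case above corresponds to a plain arithmetic identity; a quick standalone check with an illustrative amount of 5:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned c = 5;
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
        assert(((x << c) >> c) == (x & (~0u >> c)));  // lshr (shl X, C), C --> and
        assert(((x >> c) << c) == (x & (~0u << c)));  // shl (lshr X, C), C --> and
      }
      return 0;
    }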
+
+/// See if we can compute the specified value, but shifted logically to the left
+/// or right by some number of bits. This should return true if the expression
+/// can be computed for the same cost as the current expression tree. This is
+/// used to eliminate extraneous shifting from things like:
+/// %C = shl i128 %A, 64
+/// %D = shl i128 %B, 96
+/// %E = or i128 %C, %D
+/// %F = lshr i128 %E, 64
+/// where the client will ask if E can be computed shifted right by 64-bits. If
+/// this succeeds, getShiftedValue() will be called to produce the value.
+static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
InstCombinerImpl &IC, Instruction *CxtI) {
- // We can always evaluate constants shifted.
- if (isa<Constant>(V))
- return true;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // We can't mutate something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
- switch (I->getOpcode()) {
- default: return false;
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- // Bitwise operators can all be arbitrarily evaluated shifted.
- return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
- canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
-
- case Instruction::Shl:
- case Instruction::LShr:
- return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI);
-
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- Value *TrueVal = SI->getTrueValue();
- Value *FalseVal = SI->getFalseValue();
- return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
- canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
- }
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
- return false;
- return true;
- }
- }
-}
-
-/// Fold OuterShift (InnerShift X, C1), C2.
-/// See canEvaluateShiftedShift() for the constraints on these instructions.
-static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
- bool IsOuterShl,
- InstCombiner::BuilderTy &Builder) {
- bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
- Type *ShType = InnerShift->getType();
- unsigned TypeWidth = ShType->getScalarSizeInBits();
-
- // We only accept shifts-by-a-constant in canEvaluateShifted().
- const APInt *C1;
- match(InnerShift->getOperand(1), m_APInt(C1));
- unsigned InnerShAmt = C1->getZExtValue();
-
- // Change the shift amount and clear the appropriate IR flags.
- auto NewInnerShift = [&](unsigned ShAmt) {
- InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt));
- if (IsInnerShl) {
- InnerShift->setHasNoUnsignedWrap(false);
- InnerShift->setHasNoSignedWrap(false);
- } else {
- InnerShift->setIsExact(false);
- }
- return InnerShift;
- };
-
- // Two logical shifts in the same direction:
- // shl (shl X, C1), C2 --> shl X, C1 + C2
- // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
- if (IsInnerShl == IsOuterShl) {
- // If this is an oversized composite shift, then unsigned shifts get 0.
- if (InnerShAmt + OuterShAmt >= TypeWidth)
- return Constant::getNullValue(ShType);
-
- return NewInnerShift(InnerShAmt + OuterShAmt);
- }
-
- // Equal shift amounts in opposite directions become bitwise 'and':
- // lshr (shl X, C), C --> and X, C'
- // shl (lshr X, C), C --> and X, C'
- if (InnerShAmt == OuterShAmt) {
- APInt Mask = IsInnerShl
- ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt)
- : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt);
- Value *And = Builder.CreateAnd(InnerShift->getOperand(0),
- ConstantInt::get(ShType, Mask));
- if (auto *AndI = dyn_cast<Instruction>(And)) {
- AndI->moveBefore(InnerShift);
- AndI->takeName(InnerShift);
- }
- return And;
- }
-
- assert(InnerShAmt > OuterShAmt &&
- "Unexpected opposite direction logical shift pair");
-
- // In general, we would need an 'and' for this transform, but
- // canEvaluateShiftedShift() guarantees that the masked-off bits are not used.
- // lshr (shl X, C1), C2 --> shl X, C1 - C2
- // shl (lshr X, C1), C2 --> lshr X, C1 - C2
- return NewInnerShift(InnerShAmt - OuterShAmt);
-}
-
-/// When canEvaluateShifted() returns true for an expression, this function
-/// inserts the new computation that produces the shifted value.
-static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+ // We can always evaluate constants shifted.
+ if (isa<Constant>(V))
+ return true;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // We can't mutate something that has multiple uses: doing so would
+ // require duplicating the instruction in general, which isn't profitable.
+ if (!I->hasOneUse()) return false;
+
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // Bitwise operators can all be arbitrarily evaluated shifted.
+ return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
+ canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI);
+
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+ return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
+ canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
+ }
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
+ return false;
+ return true;
+ }
+ }
+}
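
A 64-bit analogue of the i128 example in the comment above (not LLVM code; widths and shift amounts are scaled down for illustration): the outer lshr can be pushed into both operands of the 'or', so the intermediate shifted value never needs to exist.

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint64_t a : {0ull, 0x0123456789ABCDEFull, ~0ull})
        for (uint64_t b : {0ull, 0xFEDCBA9876543210ull, ~0ull}) {
          uint64_t e = (a << 32) | (b << 48);
          // Shifting the 'or' equals or-ing the individually shifted operands...
          assert((e >> 32) == (((a << 32) >> 32) | ((b << 48) >> 32)));
          // ...which simplifies to masks and a smaller shift, with no outer lshr.
          assert((e >> 32) == ((a & 0xFFFFFFFFull) | ((b & 0xFFFFull) << 16)));
        }
      return 0;
    }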
+
+/// Fold OuterShift (InnerShift X, C1), C2.
+/// See canEvaluateShiftedShift() for the constraints on these instructions.
+static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
+ bool IsOuterShl,
+ InstCombiner::BuilderTy &Builder) {
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ Type *ShType = InnerShift->getType();
+ unsigned TypeWidth = ShType->getScalarSizeInBits();
+
+ // We only accept shifts-by-a-constant in canEvaluateShifted().
+ const APInt *C1;
+ match(InnerShift->getOperand(1), m_APInt(C1));
+ unsigned InnerShAmt = C1->getZExtValue();
+
+ // Change the shift amount and clear the appropriate IR flags.
+ auto NewInnerShift = [&](unsigned ShAmt) {
+ InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt));
+ if (IsInnerShl) {
+ InnerShift->setHasNoUnsignedWrap(false);
+ InnerShift->setHasNoSignedWrap(false);
+ } else {
+ InnerShift->setIsExact(false);
+ }
+ return InnerShift;
+ };
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ if (IsInnerShl == IsOuterShl) {
+ // If this is an oversized composite shift, then unsigned shifts get 0.
+ if (InnerShAmt + OuterShAmt >= TypeWidth)
+ return Constant::getNullValue(ShType);
+
+ return NewInnerShift(InnerShAmt + OuterShAmt);
+ }
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ if (InnerShAmt == OuterShAmt) {
+ APInt Mask = IsInnerShl
+ ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt)
+ : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt);
+ Value *And = Builder.CreateAnd(InnerShift->getOperand(0),
+ ConstantInt::get(ShType, Mask));
+ if (auto *AndI = dyn_cast<Instruction>(And)) {
+ AndI->moveBefore(InnerShift);
+ AndI->takeName(InnerShift);
+ }
+ return And;
+ }
+
+ assert(InnerShAmt > OuterShAmt &&
+ "Unexpected opposite direction logical shift pair");
+
+ // In general, we would need an 'and' for this transform, but
+ // canEvaluateShiftedShift() guarantees that the masked-off bits are not used.
+ // lshr (shl X, C1), C2 --> shl X, C1 - C2
+ // shl (lshr X, C1), C2 --> lshr X, C1 - C2
+ return NewInnerShift(InnerShAmt - OuterShAmt);
+}
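
For the remaining opposite-direction case (inner amount larger than outer), the general form of the fold keeps a mask; a standalone check with assumed amounts C1 = 8 and C2 = 3:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned c1 = 8, c2 = 3;                            // c1 > c2
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
        // lshr (shl X, C1), C2 == (X << (C1 - C2)) & (-1 u>> C2)
        assert(((x << c1) >> c2) == ((x << (c1 - c2)) & (~0u >> c2)));
        // shl (lshr X, C1), C2 == (X u>> (C1 - C2)) & (-1 << C2)
        assert(((x >> c1) << c2) == ((x >> (c1 - c2)) & (~0u << c2)));
      }
      return 0;
    }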
+
+/// When canEvaluateShifted() returns true for an expression, this function
+/// inserts the new computation that produces the shifted value.
+static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
InstCombinerImpl &IC, const DataLayout &DL) {
- // We can always evaluate constants shifted.
- if (Constant *C = dyn_cast<Constant>(V)) {
- if (isLeftShift)
- return IC.Builder.CreateShl(C, NumBits);
- else
- return IC.Builder.CreateLShr(C, NumBits);
- }
-
- Instruction *I = cast<Instruction>(V);
+ // We can always evaluate constants shifted.
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (isLeftShift)
+ return IC.Builder.CreateShl(C, NumBits);
+ else
+ return IC.Builder.CreateLShr(C, NumBits);
+ }
+
+ Instruction *I = cast<Instruction>(V);
IC.addToWorklist(I);
-
- switch (I->getOpcode()) {
- default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- // Bitwise operators can all be arbitrarily evaluated shifted.
- I->setOperand(
- 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
- I->setOperand(
- 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
- return I;
-
- case Instruction::Shl:
- case Instruction::LShr:
- return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift,
- IC.Builder);
-
- case Instruction::Select:
- I->setOperand(
- 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
- I->setOperand(
- 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
- return I;
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits,
- isLeftShift, IC, DL));
- return PN;
- }
- }
-}
-
-// If this is a bitwise operator or add with a constant RHS we might be able
-// to pull it through a shift.
-static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
- BinaryOperator *BO) {
- switch (BO->getOpcode()) {
- default:
- return false; // Do not perform transform!
- case Instruction::Add:
- return Shift.getOpcode() == Instruction::Shl;
- case Instruction::Or:
- case Instruction::And:
- return true;
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // Bitwise operators can all be arbitrarily evaluated shifted.
+ I->setOperand(
+ 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ return I;
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift,
+ IC.Builder);
+
+ case Instruction::Select:
+ I->setOperand(
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
+ return I;
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits,
+ isLeftShift, IC, DL));
+ return PN;
+ }
+ }
+}
+
+// If this is a bitwise operator or add with a constant RHS we might be able
+// to pull it through a shift.
+static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
+ BinaryOperator *BO) {
+ switch (BO->getOpcode()) {
+ default:
+ return false; // Do not perform transform!
+ case Instruction::Add:
+ return Shift.getOpcode() == Instruction::Shl;
+ case Instruction::Or:
+ case Instruction::And:
+ return true;
case Instruction::Xor:
// Do not change a 'not' of logical shift because that would create a normal
// 'xor'. The 'not' is likely better for analysis, SCEV, and codegen.
return !(Shift.isLogicalShift() && match(BO, m_Not(m_Value())));
- }
-}
-
+ }
+}
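
The legality rules above boil down to which operations distribute over which shift direction. A short standalone sketch with an assumed constant and shift amount:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C = 0x00F0F0F3u;
      const unsigned s = 4;
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
        assert(((x + C) << s) == ((x << s) + (C << s)));  // add only through shl
        assert(((x | C) >> s) == ((x >> s) | (C >> s)));  // or through either shift
        assert(((x & C) >> s) == ((x >> s) & (C >> s)));  // and through either shift
      }
      return 0;
    }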
+
Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I) {
- bool isLeftShift = I.getOpcode() == Instruction::Shl;
-
- const APInt *Op1C;
- if (!match(Op1, m_APInt(Op1C)))
- return nullptr;
-
- // See if we can propagate this shift into the input; this covers the trivial
- // case of lshr(shl(x,c1),c2) as well as other more complex cases.
- if (I.getOpcode() != Instruction::AShr &&
- canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
- LLVM_DEBUG(
- dbgs() << "ICE: GetShiftedValue propagating shift through expression"
- " to eliminate shift:\n IN: "
- << *Op0 << "\n SH: " << I << "\n");
-
- return replaceInstUsesWith(
- I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
- }
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
+ bool isLeftShift = I.getOpcode() == Instruction::Shl;
+
+ const APInt *Op1C;
+ if (!match(Op1, m_APInt(Op1C)))
+ return nullptr;
+
+ // See if we can propagate this shift into the input; this covers the trivial
+ // case of lshr(shl(x,c1),c2) as well as other more complex cases.
+ if (I.getOpcode() != Instruction::AShr &&
+ canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+ " to eliminate shift:\n IN: "
+ << *Op0 << "\n SH: " << I << "\n");
+
+ return replaceInstUsesWith(
+ I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
+ }
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
Type *Ty = I.getType();
unsigned TypeBits = Ty->getScalarSizeInBits();
- assert(!Op1C->uge(TypeBits) &&
- "Shift over the type width should have been removed already");
-
- if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I))
- return FoldedShift;
-
- // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
+ assert(!Op1C->uge(TypeBits) &&
+ "Shift over the type width should have been removed already");
+
+ if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I))
+ return FoldedShift;
+
+ // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
if (auto *TI = dyn_cast<TruncInst>(Op0)) {
- // If 'shift2' is an ashr, we would have to get the sign bit into a funny
- // place. Don't try to do this transformation in this case. Also, we
- // require that the input operand is a shift-by-constant so that we have
- // confidence that the shifts will get folded together. We could do this
- // xform in more cases, but it is unlikely to be profitable.
+ // If 'shift2' is an ashr, we would have to get the sign bit into a funny
+ // place. Don't try to do this transformation in this case. Also, we
+ // require that the input operand is a shift-by-constant so that we have
+ // confidence that the shifts will get folded together. We could do this
+ // xform in more cases, but it is unlikely to be profitable.
const APInt *TrShiftAmt;
if (I.isLogicalShift() &&
match(TI->getOperand(0), m_Shift(m_Value(), m_APInt(TrShiftAmt)))) {
auto *TrOp = cast<Instruction>(TI->getOperand(0));
Type *SrcTy = TrOp->getType();
- // Okay, we'll do this xform. Make the shift of shift.
+ // Okay, we'll do this xform. Make the shift of shift.
Constant *ShAmt = ConstantExpr::getZExt(Op1, SrcTy);
- // (shift2 (shift1 & 0x00FF), c2)
- Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName());
-
- // For logical shifts, the truncation has the effect of making the high
- // part of the register be zeros. Emulate this by inserting an AND to
- // clear the top bits as needed. This 'and' will usually be zapped by
- // other xforms later if dead.
+ // (shift2 (shift1 & 0x00FF), c2)
+ Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName());
+
+ // For logical shifts, the truncation has the effect of making the high
+ // part of the register be zeros. Emulate this by inserting an AND to
+ // clear the top bits as needed. This 'and' will usually be zapped by
+ // other xforms later if dead.
unsigned SrcSize = SrcTy->getScalarSizeInBits();
Constant *MaskV =
ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcSize, TypeBits));
-
- // The mask we constructed says what the trunc would do if occurring
- // between the shifts. We want to know the effect *after* the second
- // shift. We know that it is a logical shift by a constant, so adjust the
- // mask as appropriate.
+
+ // The mask we constructed says what the trunc would do if occurring
+ // between the shifts. We want to know the effect *after* the second
+ // shift. We know that it is a logical shift by a constant, so adjust the
+ // mask as appropriate.
MaskV = ConstantExpr::get(I.getOpcode(), MaskV, ShAmt);
- // shift1 & 0x00FF
+ // shift1 & 0x00FF
Value *And = Builder.CreateAnd(NSh, MaskV, TI->getName());
- // Return the value truncated to the interesting size.
+ // Return the value truncated to the interesting size.
return new TruncInst(And, Ty);
- }
- }
-
- if (Op0->hasOneUse()) {
- if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
- // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ }
+ }
+
+ if (Op0->hasOneUse()) {
+ if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
Value *V1;
const APInt *CC;
- switch (Op0BO->getOpcode()) {
- default: break;
- case Instruction::Add:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // These operators commute.
- // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
- if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
- match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
- m_Specific(Op1)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
- // (X + (Y << C))
- Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1,
- Op0BO->getOperand(1)->getName());
- unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
- APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
+ switch (Op0BO->getOpcode()) {
+ default: break;
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // These operators commute.
+ // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
+ match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
+ m_Specific(Op1)))) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
+ // (X + (Y << C))
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1,
+ Op0BO->getOperand(1)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
+ APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(Ty, Bits);
- return BinaryOperator::CreateAnd(X, Mask);
- }
-
- // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
- Value *Op0BOOp1 = Op0BO->getOperand(1);
- if (isLeftShift && Op0BOOp1->hasOneUse() &&
+ return BinaryOperator::CreateAnd(X, Mask);
+ }
+
+ // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
+ Value *Op0BOOp1 = Op0BO->getOperand(1);
+ if (isLeftShift && Op0BOOp1->hasOneUse() &&
match(Op0BOOp1, m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
m_APInt(CC)))) {
Value *YS = // (Y << C)
Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
- // X & (CC << C)
+ // X & (CC << C)
Value *XM = Builder.CreateAnd(
V1, ConstantExpr::getShl(ConstantInt::get(Ty, *CC), Op1),
V1->getName() + ".mask");
- return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
- }
- LLVM_FALLTHROUGH;
- }
-
- case Instruction::Sub: {
- // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
- if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
- match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
- m_Specific(Op1)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
- // (X + (Y << C))
- Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS,
- Op0BO->getOperand(0)->getName());
- unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
- APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
+ return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
+ }
+ LLVM_FALLTHROUGH;
+ }
+
+ case Instruction::Sub: {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
+ m_Specific(Op1)))) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+ // (X + (Y << C))
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS,
+ Op0BO->getOperand(0)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
+ APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(Ty, Bits);
- return BinaryOperator::CreateAnd(X, Mask);
- }
-
- // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
- if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
- match(Op0BO->getOperand(0),
+ return BinaryOperator::CreateAnd(X, Mask);
+ }
+
+ // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0),
m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
m_APInt(CC)))) {
- Value *YS = // (Y << C)
+ Value *YS = // (Y << C)
Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
- // X & (CC << C)
+ // X & (CC << C)
Value *XM = Builder.CreateAnd(
V1, ConstantExpr::getShl(ConstantInt::get(Ty, *CC), Op1),
V1->getName() + ".mask");
- return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
- }
-
- break;
- }
- }
-
- // If the operand is a bitwise operator with a constant RHS, and the
- // shift is the only use, we can pull it out of the shift.
- const APInt *Op0C;
- if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
- if (canShiftBinOpWithConstantRHS(I, Op0BO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(Op0BO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
- NewShift->takeName(Op0BO);
-
- return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
- NewRHS);
- }
- }
-
- // If the operand is a subtract with a constant LHS, and the shift
- // is the only use, we can pull it out of the shift.
- // This folds (shl (sub C1, X), C2) -> (sub (C1 << C2), (shl X, C2))
- if (isLeftShift && Op0BO->getOpcode() == Instruction::Sub &&
- match(Op0BO->getOperand(0), m_APInt(Op0C))) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(Op0BO->getOperand(0)), Op1);
-
- Value *NewShift = Builder.CreateShl(Op0BO->getOperand(1), Op1);
- NewShift->takeName(Op0BO);
-
- return BinaryOperator::CreateSub(NewRHS, NewShift);
- }
- }
-
- // If we have a select that conditionally executes some binary operator,
- // see if we can pull the select and the operator through the shift.
- //
- // For example, turning:
- // shl (select C, (add X, C1), X), C2
- // Into:
- // Y = shl X, C2
- // select C, (add Y, C1 << C2), Y
- Value *Cond;
- BinaryOperator *TBO;
- Value *FalseVal;
- if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
- m_Value(FalseVal)))) {
- const APInt *C;
- if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
- match(TBO->getOperand(1), m_APInt(C)) &&
- canShiftBinOpWithConstantRHS(I, TBO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(TBO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
- Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift,
- NewRHS);
- return SelectInst::Create(Cond, NewOp, NewShift);
- }
- }
-
- BinaryOperator *FBO;
- Value *TrueVal;
- if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
- m_OneUse(m_BinOp(FBO))))) {
- const APInt *C;
- if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
- match(FBO->getOperand(1), m_APInt(C)) &&
- canShiftBinOpWithConstantRHS(I, FBO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(FBO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
- Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift,
- NewRHS);
- return SelectInst::Create(Cond, NewShift, NewOp);
- }
- }
- }
-
- return nullptr;
-}
-
+ return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
+ }
+
+ break;
+ }
+ }
+
+ // If the operand is a bitwise operator with a constant RHS, and the
+ // shift is the only use, we can pull it out of the shift.
+ const APInt *Op0C;
+ if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
+ if (canShiftBinOpWithConstantRHS(I, Op0BO)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(Op0BO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
+ NewShift->takeName(Op0BO);
+
+ return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
+ NewRHS);
+ }
+ }
+
+ // If the operand is a subtract with a constant LHS, and the shift
+ // is the only use, we can pull it out of the shift.
+ // This folds (shl (sub C1, X), C2) -> (sub (C1 << C2), (shl X, C2))
+ if (isLeftShift && Op0BO->getOpcode() == Instruction::Sub &&
+ match(Op0BO->getOperand(0), m_APInt(Op0C))) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(Op0BO->getOperand(0)), Op1);
+
+ Value *NewShift = Builder.CreateShl(Op0BO->getOperand(1), Op1);
+ NewShift->takeName(Op0BO);
+
+ return BinaryOperator::CreateSub(NewRHS, NewShift);
+ }
+ }
+
+ // If we have a select that conditionally executes some binary operator,
+ // see if we can pull the select and the operator through the shift.
+ //
+ // For example, turning:
+ // shl (select C, (add X, C1), X), C2
+ // Into:
+ // Y = shl X, C2
+ // select C, (add Y, C1 << C2), Y
+ Value *Cond;
+ BinaryOperator *TBO;
+ Value *FalseVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
+ m_Value(FalseVal)))) {
+ const APInt *C;
+ if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
+ match(TBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, TBO)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(TBO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift,
+ NewRHS);
+ return SelectInst::Create(Cond, NewOp, NewShift);
+ }
+ }
+
+ BinaryOperator *FBO;
+ Value *TrueVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
+ m_OneUse(m_BinOp(FBO))))) {
+ const APInt *C;
+ if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
+ match(FBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, FBO)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(FBO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift,
+ NewRHS);
+ return SelectInst::Create(Cond, NewShift, NewOp);
+ }
+ }
+ }
+
+ return nullptr;
+}
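
Two of the rewrites performed above, restated as plain unsigned identities and checked in a standalone sketch (shift amounts and constants are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned c = 5;
      const uint32_t c1 = 1000;
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu})
        for (uint32_t y : {0u, 7u, 0xCAFEBABEu}) {
          // ((X >> C) + Y) << C == (X + (Y << C)) & (~0 << C)
          assert((((x >> c) + y) << c) == ((x + (y << c)) & (~0u << c)));
          // (C1 - X) << C2 == (C1 << C2) - (X << C2)   (sub with constant LHS)
          assert(((c1 - x) << 3) == ((c1 << 3) - (x << 3)));
        }
      return 0;
    }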
+
Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
- const SimplifyQuery Q = SQ.getWithInstruction(&I);
-
- if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
- I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *V = commonShiftTransforms(I))
- return V;
-
- if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder))
- return V;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
-
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
-
- // shl (zext X), ShAmt --> zext (shl X, ShAmt)
- // This is only valid if X would have zeros shifted out.
- Value *X;
- if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) {
- unsigned SrcWidth = X->getType()->getScalarSizeInBits();
- if (ShAmt < SrcWidth &&
- MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
- return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty);
- }
-
- // (X >> C) << C --> X & (-1 << C)
- if (match(Op0, m_Shr(m_Value(X), m_Specific(Op1)))) {
- APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
- }
-
- const APInt *ShOp1;
+ const SimplifyQuery Q = SQ.getWithInstruction(&I);
+
+ if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *V = commonShiftTransforms(I))
+ return V;
+
+ if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder))
+ return V;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+
+ // shl (zext X), ShAmt --> zext (shl X, ShAmt)
+ // This is only valid if X would have zeros shifted out.
+ Value *X;
+ if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) {
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt < SrcWidth &&
+ MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
+ return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty);
+ }
+
+ // (X >> C) << C --> X & (-1 << C)
+ if (match(Op0, m_Shr(m_Value(X), m_Specific(Op1)))) {
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ const APInt *ShOp1;
if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1)))) &&
ShOp1->ult(BitWidth)) {
- unsigned ShrAmt = ShOp1->getZExtValue();
- if (ShrAmt < ShAmt) {
- // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
- auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
- NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
- return NewShl;
- }
- if (ShrAmt > ShAmt) {
- // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
- auto *NewShr = BinaryOperator::Create(
- cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
- NewShr->setIsExact(true);
- return NewShr;
- }
- }
-
+ unsigned ShrAmt = ShOp1->getZExtValue();
+ if (ShrAmt < ShAmt) {
+ // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return NewShl;
+ }
+ if (ShrAmt > ShAmt) {
+ // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
+ auto *NewShr = BinaryOperator::Create(
+ cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
+ NewShr->setIsExact(true);
+ return NewShr;
+ }
+ }
+
if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_APInt(ShOp1)))) &&
ShOp1->ult(BitWidth)) {
unsigned ShrAmt = ShOp1->getZExtValue();
@@ -977,354 +977,354 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
}
if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
- // Oversized shifts are simplified to zero in InstSimplify.
- if (AmtSum < BitWidth)
- // (X << C1) << C2 --> X << (C1 + C2)
- return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
- }
-
- // If the shifted-out value is known-zero, then this is a NUW shift.
- if (!I.hasNoUnsignedWrap() &&
- MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
- I.setHasNoUnsignedWrap();
- return &I;
- }
-
- // If the shifted-out value is all signbits, then this is a NSW shift.
- if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
- I.setHasNoSignedWrap();
- return &I;
- }
- }
-
- // Transform (x >> y) << y to x & (-1 << y)
- // Valid for any type of right-shift.
- Value *X;
- if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) {
- Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
- Value *Mask = Builder.CreateShl(AllOnes, Op1);
- return BinaryOperator::CreateAnd(Mask, X);
- }
-
- Constant *C1;
- if (match(Op1, m_Constant(C1))) {
- Constant *C2;
- Value *X;
- // (C2 << X) << C1 --> (C2 << C1) << X
- if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
- return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
-
- // (X * C2) << C1 --> X * (C2 << C1)
- if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
- return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
-
- // shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
- if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1);
- return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
- }
- }
-
- // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1
- if (match(Op0, m_One()) &&
- match(Op1, m_Sub(m_SpecificInt(BitWidth - 1), m_Value(X))))
- return BinaryOperator::CreateLShr(
- ConstantInt::get(Ty, APInt::getSignMask(BitWidth)), X);
-
- return nullptr;
-}
-
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X << C1) << C2 --> X << (C1 + C2)
+ return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ // If the shifted-out value is known-zero, then this is a NUW shift.
+ if (!I.hasNoUnsignedWrap() &&
+ MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setHasNoUnsignedWrap();
+ return &I;
+ }
+
+ // If the shifted-out value is all signbits, then this is a NSW shift.
+ if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
+ I.setHasNoSignedWrap();
+ return &I;
+ }
+ }
+
+ // Transform (x >> y) << y to x & (-1 << y)
+ // Valid for any type of right-shift.
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateShl(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
+ Constant *C1;
+ if (match(Op1, m_Constant(C1))) {
+ Constant *C2;
+ Value *X;
+ // (C2 << X) << C1 --> (C2 << C1) << X
+ if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
+ return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
+
+ // (X * C2) << C1 --> X * (C2 << C1)
+ if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
+
+ // shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
+ if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1);
+ return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
+ }
+ }
+
+ // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1
+ if (match(Op0, m_One()) &&
+ match(Op1, m_Sub(m_SpecificInt(BitWidth - 1), m_Value(X))))
+ return BinaryOperator::CreateLShr(
+ ConstantInt::get(Ty, APInt::getSignMask(BitWidth)), X);
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
- if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *R = commonShiftTransforms(I))
- return R;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- auto *II = dyn_cast<IntrinsicInst>(Op0);
- if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
- (II->getIntrinsicID() == Intrinsic::ctlz ||
- II->getIntrinsicID() == Intrinsic::cttz ||
- II->getIntrinsicID() == Intrinsic::ctpop)) {
- // ctlz.i32(x)>>5 --> zext(x == 0)
- // cttz.i32(x)>>5 --> zext(x == 0)
- // ctpop.i32(x)>>5 --> zext(x == -1)
- bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop;
- Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? -1 : 0);
- Value *Cmp = Builder.CreateICmpEQ(II->getArgOperand(0), RHS);
- return new ZExtInst(Cmp, Ty);
- }
-
- Value *X;
- const APInt *ShOp1;
- if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
- if (ShOp1->ult(ShAmt)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
- if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
- auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
- NewLShr->setIsExact(I.isExact());
- return NewLShr;
- }
- // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
- Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
- }
- if (ShOp1->ugt(ShAmt)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
- if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
- auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
- NewShl->setHasNoUnsignedWrap(true);
- return NewShl;
- }
- // (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2)
- Value *NewShl = Builder.CreateShl(X, ShiftDiff);
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
- }
- assert(*ShOp1 == ShAmt);
- // (X << C) >>u C --> X & (-1 >>u C)
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
- }
-
- if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
- (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
- assert(ShAmt < X->getType()->getScalarSizeInBits() &&
- "Big shift not simplified to zero?");
- // lshr (zext iM X to iN), C --> zext (lshr X, C) to iN
- Value *NewLShr = Builder.CreateLShr(X, ShAmt);
- return new ZExtInst(NewLShr, Ty);
- }
-
- if (match(Op0, m_SExt(m_Value(X))) &&
- (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
- // Are we moving the sign bit to the low bit and widening with high zeros?
- unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
- if (ShAmt == BitWidth - 1) {
- // lshr (sext i1 X to iN), N-1 --> zext X to iN
- if (SrcTyBitWidth == 1)
- return new ZExtInst(X, Ty);
-
- // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
- if (Op0->hasOneUse()) {
- Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1);
- return new ZExtInst(NewLShr, Ty);
- }
- }
-
- // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
- if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
- // The new shift amount can't be more than the narrow source type.
- unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
- Value *AShr = Builder.CreateAShr(X, NewShAmt);
- return new ZExtInst(AShr, Ty);
- }
- }
-
+ if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *R = commonShiftTransforms(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ auto *II = dyn_cast<IntrinsicInst>(Op0);
+ if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
+ (II->getIntrinsicID() == Intrinsic::ctlz ||
+ II->getIntrinsicID() == Intrinsic::cttz ||
+ II->getIntrinsicID() == Intrinsic::ctpop)) {
+ // ctlz.i32(x)>>5 --> zext(x == 0)
+ // cttz.i32(x)>>5 --> zext(x == 0)
+ // ctpop.i32(x)>>5 --> zext(x == -1)
+ bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop;
+ Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? -1 : 0);
+ Value *Cmp = Builder.CreateICmpEQ(II->getArgOperand(0), RHS);
+ return new ZExtInst(Cmp, Ty);
+ }
+
+ Value *X;
+ const APInt *ShOp1;
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
+ if (ShOp1->ult(ShAmt)) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
+ auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
+ NewLShr->setIsExact(I.isExact());
+ return NewLShr;
+ }
+ // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
+ Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
+ }
+ if (ShOp1->ugt(ShAmt)) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(true);
+ return NewShl;
+ }
+ // (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2)
+ Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ }
+ assert(*ShOp1 == ShAmt);
+ // (X << C) >>u C --> X & (-1 >>u C)
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
+ (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
+ assert(ShAmt < X->getType()->getScalarSizeInBits() &&
+ "Big shift not simplified to zero?");
+ // lshr (zext iM X to iN), C --> zext (lshr X, C) to iN
+ Value *NewLShr = Builder.CreateLShr(X, ShAmt);
+ return new ZExtInst(NewLShr, Ty);
+ }
+
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
+ // Are we moving the sign bit to the low bit and widening with high zeros?
+ unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt == BitWidth - 1) {
+ // lshr (sext i1 X to iN), N-1 --> zext X to iN
+ if (SrcTyBitWidth == 1)
+ return new ZExtInst(X, Ty);
+
+ // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
+ if (Op0->hasOneUse()) {
+ Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1);
+ return new ZExtInst(NewLShr, Ty);
+ }
+ }
+
+ // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
+ if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
+ // The new shift amount can't be more than the narrow source type.
+ unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
+ Value *AShr = Builder.CreateAShr(X, NewShAmt);
+ return new ZExtInst(AShr, Ty);
+ }
+ }
+
// lshr i32 (X -nsw Y), 31 --> zext (X < Y)
Value *Y;
if (ShAmt == BitWidth - 1 &&
match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y)))))
return new ZExtInst(Builder.CreateICmpSLT(X, Y), Ty);
- if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
- // Oversized shifts are simplified to zero in InstSimplify.
- if (AmtSum < BitWidth)
- // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
- return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
- }
-
- // If the shifted-out value is known-zero, then this is an exact shift.
- if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
- I.setIsExact();
- return &I;
- }
- }
-
- // Transform (x << y) >> y to x & (-1 >> y)
- Value *X;
- if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))))) {
- Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
- Value *Mask = Builder.CreateLShr(AllOnes, Op1);
- return BinaryOperator::CreateAnd(Mask, X);
- }
-
- return nullptr;
-}
-
-Instruction *
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
+ return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ // If the shifted-out value is known-zero, then this is an exact shift.
+ if (!I.isExact() &&
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setIsExact();
+ return &I;
+ }
+ }
+
+ // Transform (x << y) >> y to x & (-1 >> y)
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateLShr(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
+ return nullptr;
+}
+
+Instruction *
InstCombinerImpl::foldVariableSignZeroExtensionOfVariableHighBitExtract(
- BinaryOperator &OldAShr) {
- assert(OldAShr.getOpcode() == Instruction::AShr &&
- "Must be called with arithmetic right-shift instruction only.");
-
- // Check that constant C is a splat of the element-wise bitwidth of V.
- auto BitWidthSplat = [](Constant *C, Value *V) {
- return match(
- C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(C->getType()->getScalarSizeInBits(),
- V->getType()->getScalarSizeInBits())));
- };
-
- // It should look like variable-length sign-extension on the outside:
- // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits)
- Value *NBits;
- Instruction *MaybeTrunc;
- Constant *C1, *C2;
- if (!match(&OldAShr,
- m_AShr(m_Shl(m_Instruction(MaybeTrunc),
- m_ZExtOrSelf(m_Sub(m_Constant(C1),
- m_ZExtOrSelf(m_Value(NBits))))),
- m_ZExtOrSelf(m_Sub(m_Constant(C2),
- m_ZExtOrSelf(m_Deferred(NBits)))))) ||
- !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr))
- return nullptr;
-
- // There may or may not be a truncation after outer two shifts.
- Instruction *HighBitExtract;
- match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract)));
- bool HadTrunc = MaybeTrunc != HighBitExtract;
-
- // And finally, the innermost part of the pattern must be a right-shift.
- Value *X, *NumLowBitsToSkip;
- if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip))))
- return nullptr;
-
- // Said right-shift must extract high NBits bits - C0 must be its bitwidth.
- Constant *C0;
- if (!match(NumLowBitsToSkip,
- m_ZExtOrSelf(
- m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) ||
- !BitWidthSplat(C0, HighBitExtract))
- return nullptr;
-
- // Since the NBits is identical for all shifts, if the outermost and
- // innermost shifts are identical, then outermost shifts are redundant.
- // If we had truncation, do keep it though.
- if (HighBitExtract->getOpcode() == OldAShr.getOpcode())
- return replaceInstUsesWith(OldAShr, MaybeTrunc);
-
- // Else, if there was a truncation, then we need to ensure that one
- // instruction will go away.
- if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
- return nullptr;
-
- // Finally, bypass two innermost shifts, and perform the outermost shift on
- // the operands of the innermost shift.
- Instruction *NewAShr =
- BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip);
- NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness.
- if (!HadTrunc)
- return NewAShr;
-
- Builder.Insert(NewAShr);
- return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType());
-}
-
+ BinaryOperator &OldAShr) {
+ assert(OldAShr.getOpcode() == Instruction::AShr &&
+ "Must be called with arithmetic right-shift instruction only.");
+
+ // Check that constant C is a splat of the element-wise bitwidth of V.
+ auto BitWidthSplat = [](Constant *C, Value *V) {
+ return match(
+ C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(C->getType()->getScalarSizeInBits(),
+ V->getType()->getScalarSizeInBits())));
+ };
+
+ // It should look like variable-length sign-extension on the outside:
+ // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits)
+ Value *NBits;
+ Instruction *MaybeTrunc;
+ Constant *C1, *C2;
+ if (!match(&OldAShr,
+ m_AShr(m_Shl(m_Instruction(MaybeTrunc),
+ m_ZExtOrSelf(m_Sub(m_Constant(C1),
+ m_ZExtOrSelf(m_Value(NBits))))),
+ m_ZExtOrSelf(m_Sub(m_Constant(C2),
+ m_ZExtOrSelf(m_Deferred(NBits)))))) ||
+ !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr))
+ return nullptr;
+
+ // There may or may not be a truncation after outer two shifts.
+ Instruction *HighBitExtract;
+ match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract)));
+ bool HadTrunc = MaybeTrunc != HighBitExtract;
+
+ // And finally, the innermost part of the pattern must be a right-shift.
+ Value *X, *NumLowBitsToSkip;
+ if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip))))
+ return nullptr;
+
+ // Said right-shift must extract high NBits bits - C0 must be its bitwidth.
+ Constant *C0;
+ if (!match(NumLowBitsToSkip,
+ m_ZExtOrSelf(
+ m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) ||
+ !BitWidthSplat(C0, HighBitExtract))
+ return nullptr;
+
+ // Since the NBits is identical for all shifts, if the outermost and
+ // innermost shifts are identical, then outermost shifts are redundant.
+ // If we had truncation, do keep it though.
+ if (HighBitExtract->getOpcode() == OldAShr.getOpcode())
+ return replaceInstUsesWith(OldAShr, MaybeTrunc);
+
+ // Else, if there was a truncation, then we need to ensure that one
+ // instruction will go away.
+ if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+ return nullptr;
+
+ // Finally, bypass two innermost shifts, and perform the outermost shift on
+ // the operands of the innermost shift.
+ Instruction *NewAShr =
+ BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip);
+ NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness.
+ if (!HadTrunc)
+ return NewAShr;
+
+ Builder.Insert(NewAShr);
+ return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType());
+}
+
Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
- if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *R = commonShiftTransforms(I))
- return R;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt)) && ShAmtAPInt->ult(BitWidth)) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
-
- // If the shift amount equals the difference in width of the destination
- // and source scalar types:
- // ashr (shl (zext X), C), C --> sext X
- Value *X;
- if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) &&
- ShAmt == BitWidth - X->getType()->getScalarSizeInBits())
- return new SExtInst(X, Ty);
-
- // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- const APInt *ShOp1;
- if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1))) &&
- ShOp1->ult(BitWidth)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- if (ShlAmt < ShAmt) {
- // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
- auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff);
- NewAShr->setIsExact(I.isExact());
- return NewAShr;
- }
- if (ShlAmt > ShAmt) {
- // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
- auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff);
- NewShl->setHasNoSignedWrap(true);
- return NewShl;
- }
- }
-
- if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1))) &&
- ShOp1->ult(BitWidth)) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
- // Oversized arithmetic shifts replicate the sign bit.
- AmtSum = std::min(AmtSum, BitWidth - 1);
- // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
- return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
- }
-
- if (match(Op0, m_OneUse(m_SExt(m_Value(X)))) &&
- (Ty->isVectorTy() || shouldChangeType(Ty, X->getType()))) {
- // ashr (sext X), C --> sext (ashr X, C')
- Type *SrcTy = X->getType();
- ShAmt = std::min(ShAmt, SrcTy->getScalarSizeInBits() - 1);
- Value *NewSh = Builder.CreateAShr(X, ConstantInt::get(SrcTy, ShAmt));
- return new SExtInst(NewSh, Ty);
- }
-
+ if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *R = commonShiftTransforms(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt)) && ShAmtAPInt->ult(BitWidth)) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+
+ // If the shift amount equals the difference in width of the destination
+ // and source scalar types:
+ // ashr (shl (zext X), C), C --> sext X
+ Value *X;
+ if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) &&
+ ShAmt == BitWidth - X->getType()->getScalarSizeInBits())
+ return new SExtInst(X, Ty);
+
+ // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
+ // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+ const APInt *ShOp1;
+ if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff);
+ NewAShr->setIsExact(I.isExact());
+ return NewAShr;
+ }
+ if (ShlAmt > ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff);
+ NewShl->setHasNoSignedWrap(true);
+ return NewShl;
+ }
+ }
+
+ if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized arithmetic shifts replicate the sign bit.
+ AmtSum = std::min(AmtSum, BitWidth - 1);
+ // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
+ return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ if (match(Op0, m_OneUse(m_SExt(m_Value(X)))) &&
+ (Ty->isVectorTy() || shouldChangeType(Ty, X->getType()))) {
+ // ashr (sext X), C --> sext (ashr X, C')
+ Type *SrcTy = X->getType();
+ ShAmt = std::min(ShAmt, SrcTy->getScalarSizeInBits() - 1);
+ Value *NewSh = Builder.CreateAShr(X, ConstantInt::get(SrcTy, ShAmt));
+ return new SExtInst(NewSh, Ty);
+ }
+
// ashr i32 (X -nsw Y), 31 --> sext (X < Y)
Value *Y;
if (ShAmt == BitWidth - 1 &&
match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y)))))
return new SExtInst(Builder.CreateICmpSLT(X, Y), Ty);
- // If the shifted-out value is known-zero, then this is an exact shift.
- if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
- I.setIsExact();
- return &I;
- }
- }
-
- if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I))
- return R;
-
- // See if we can turn a signed shr into an unsigned shr.
- if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I))
- return BinaryOperator::CreateLShr(Op0, Op1);
-
- return nullptr;
-}
+ // If the shifted-out value is known-zero, then this is an exact shift.
+ if (!I.isExact() &&
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setIsExact();
+ return &I;
+ }
+ }
+
+ if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I))
+ return R;
+
+ // See if we can turn a signed shr into an unsigned shr.
+ if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I))
+ return BinaryOperator::CreateLShr(Op0, Op1);
+
+ return nullptr;
+}
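Two of the right-shift identities used in visitLShr above, checked with plain
integers. This is a standalone illustration under the same assumptions the
comments state (shift amounts below the bit width, and a subtraction that does
not wrap), not code from the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    // (X >>u C1) >>u C2  ==  X >>u (C1 + C2)   while C1 + C2 < bit width.
    uint32_t U = 0xdeadbeefu;
    assert(((U >> 5) >> 7) == (U >> 12));

    // lshr (X -nsw Y), 31  -->  zext(X < Y): for a 32-bit subtraction that
    // does not overflow, the sign bit of X - Y is exactly the result of X < Y.
    int32_t X = 17, Y = 42;
    assert(((uint32_t)(X - Y) >> 31) == (uint32_t)(X < Y));
    return 0;
  }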
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7aaa36f730..16efe86377 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1,264 +1,264 @@
-//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains logic for simplifying instructions based on information
-// about how they are used.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
+//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains logic for simplifying instructions based on information
+// about how they are used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/KnownBits.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// Check to see if the specified operand of the specified instruction is a
-/// constant integer. If so, check to see if there are any bits set in the
-/// constant that are not demanded. If so, shrink the constant and return true.
-static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
- const APInt &Demanded) {
- assert(I && "No instruction?");
- assert(OpNo < I->getNumOperands() && "Operand index too large");
-
- // The operand must be a constant integer or splat integer.
- Value *Op = I->getOperand(OpNo);
- const APInt *C;
- if (!match(Op, m_APInt(C)))
- return false;
-
- // If there are no bits set that aren't demanded, nothing to do.
- if (C->isSubsetOf(Demanded))
- return false;
-
- // This instruction is producing bits that are not demanded. Shrink the RHS.
- I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
-
- return true;
-}
-
-
-
-/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
-/// the instruction has any properties that allow us to simplify its operands.
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Check to see if the specified operand of the specified instruction is a
+/// constant integer. If so, check to see if there are any bits set in the
+/// constant that are not demanded. If so, shrink the constant and return true.
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
+ const APInt &Demanded) {
+ assert(I && "No instruction?");
+ assert(OpNo < I->getNumOperands() && "Operand index too large");
+
+ // The operand must be a constant integer or splat integer.
+ Value *Op = I->getOperand(OpNo);
+ const APInt *C;
+ if (!match(Op, m_APInt(C)))
+ return false;
+
+ // If there are no bits set that aren't demanded, nothing to do.
+ if (C->isSubsetOf(Demanded))
+ return false;
+
+ // This instruction is producing bits that are not demanded. Shrink the RHS.
+ I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
+
+ return true;
+}
+
+
+
+/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
+/// the instruction has any properties that allow us to simplify its operands.
bool InstCombinerImpl::SimplifyDemandedInstructionBits(Instruction &Inst) {
- unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
- KnownBits Known(BitWidth);
- APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
-
- Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
- 0, &Inst);
- if (!V) return false;
- if (V == &Inst) return true;
- replaceInstUsesWith(Inst, V);
- return true;
-}
-
-/// This form of SimplifyDemandedBits simplifies the specified instruction
-/// operand if possible, updating it in place. It returns true if it made any
-/// change and false otherwise.
+ unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
+ KnownBits Known(BitWidth);
+ APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
+ 0, &Inst);
+ if (!V) return false;
+ if (V == &Inst) return true;
+ replaceInstUsesWith(Inst, V);
+ return true;
+}
+
+/// This form of SimplifyDemandedBits simplifies the specified instruction
+/// operand if possible, updating it in place. It returns true if it made any
+/// change and false otherwise.
bool InstCombinerImpl::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
const APInt &DemandedMask,
KnownBits &Known, unsigned Depth) {
- Use &U = I->getOperandUse(OpNo);
- Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
- Depth, I);
- if (!NewVal) return false;
- if (Instruction* OpInst = dyn_cast<Instruction>(U))
- salvageDebugInfo(*OpInst);
-
- replaceUse(U, NewVal);
- return true;
-}
-
-/// This function attempts to replace V with a simpler value based on the
-/// demanded bits. When this function is called, it is known that only the bits
-/// set in DemandedMask of the result of V are ever used downstream.
-/// Consequently, depending on the mask and V, it may be possible to replace V
-/// with a constant or one of its operands. In such cases, this function does
-/// the replacement and returns true. In all other cases, it returns false after
-/// analyzing the expression and setting KnownOne and known to be one in the
-/// expression. Known.Zero contains all the bits that are known to be zero in
-/// the expression. These are provided to potentially allow the caller (which
-/// might recursively be SimplifyDemandedBits itself) to simplify the
-/// expression.
-/// Known.One and Known.Zero always follow the invariant that:
-/// Known.One & Known.Zero == 0.
-/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
-/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
-/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
-/// be the same.
-///
-/// This returns null if it did not change anything and it permits no
-/// simplification. This returns V itself if it did some simplification of V's
-/// operands based on the information about what bits are demanded. This returns
-/// some other non-null value if it found out that V is equal to another value
-/// in the context where the specified bits are demanded, but not for all users.
+ Use &U = I->getOperandUse(OpNo);
+ Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
+ Depth, I);
+ if (!NewVal) return false;
+ if (Instruction* OpInst = dyn_cast<Instruction>(U))
+ salvageDebugInfo(*OpInst);
+
+ replaceUse(U, NewVal);
+ return true;
+}
+
+/// This function attempts to replace V with a simpler value based on the
+/// demanded bits. When this function is called, it is known that only the bits
+/// set in DemandedMask of the result of V are ever used downstream.
+/// Consequently, depending on the mask and V, it may be possible to replace V
+/// with a constant or one of its operands. In such cases, this function does
+/// the replacement and returns true. In all other cases, it returns false after
+/// analyzing the expression and setting KnownOne and known to be one in the
+/// expression. Known.Zero contains all the bits that are known to be zero in
+/// the expression. These are provided to potentially allow the caller (which
+/// might recursively be SimplifyDemandedBits itself) to simplify the
+/// expression.
+/// Known.One and Known.Zero always follow the invariant that:
+/// Known.One & Known.Zero == 0.
+/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
+/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
+/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
+/// be the same.
+///
+/// This returns null if it did not change anything and it permits no
+/// simplification. This returns V itself if it did some simplification of V's
+/// operands based on the information about what bits are demanded. This returns
+/// some other non-null value if it found out that V is equal to another value
+/// in the context where the specified bits are demanded, but not for all users.
Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownBits &Known,
unsigned Depth,
Instruction *CxtI) {
- assert(V != nullptr && "Null pointer of Value???");
+ assert(V != nullptr && "Null pointer of Value???");
assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
- uint32_t BitWidth = DemandedMask.getBitWidth();
- Type *VTy = V->getType();
- assert(
- (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
- Known.getBitWidth() == BitWidth &&
- "Value *V, DemandedMask and Known must have same BitWidth");
-
- if (isa<Constant>(V)) {
- computeKnownBits(V, Known, Depth, CxtI);
- return nullptr;
- }
-
- Known.resetAll();
- if (DemandedMask.isNullValue()) // Not demanding any bits from V.
- return UndefValue::get(VTy);
-
+ uint32_t BitWidth = DemandedMask.getBitWidth();
+ Type *VTy = V->getType();
+ assert(
+ (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
+ Known.getBitWidth() == BitWidth &&
+ "Value *V, DemandedMask and Known must have same BitWidth");
+
+ if (isa<Constant>(V)) {
+ computeKnownBits(V, Known, Depth, CxtI);
+ return nullptr;
+ }
+
+ Known.resetAll();
+ if (DemandedMask.isNullValue()) // Not demanding any bits from V.
+ return UndefValue::get(VTy);
+
if (Depth == MaxAnalysisRecursionDepth)
- return nullptr;
-
+ return nullptr;
+
if (isa<ScalableVectorType>(VTy))
return nullptr;
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) {
- computeKnownBits(V, Known, Depth, CxtI);
- return nullptr; // Only analyze instructions.
- }
-
- // If there are multiple uses of this value and we aren't at the root, then
- // we can't do any simplifications of the operands, because DemandedMask
- // only reflects the bits demanded by *one* of the users.
- if (Depth != 0 && !I->hasOneUse())
- return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
-
- KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
-
- // If this is the root being simplified, allow it to have multiple uses,
- // just set the DemandedMask to all bits so that we can try to simplify the
- // operands. This allows visitTruncInst (for example) to simplify the
- // operand of a trunc without duplicating all the logic below.
- if (Depth == 0 && !V->hasOneUse())
- DemandedMask.setAllBits();
-
- switch (I->getOpcode()) {
- default:
- computeKnownBits(I, Known, Depth, CxtI);
- break;
- case Instruction::And: {
- // If either the LHS or the RHS are Zero, the result is zero.
- if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
- Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- Known = LHSKnown & RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
-
- // If all of the demanded bits are known 1 on one side, return the other.
- // These bits cannot contribute to the result of the 'and'.
- if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
- return I->getOperand(1);
-
- // If the RHS is a constant, see if we can simplify it.
- if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
- return I;
-
- break;
- }
- case Instruction::Or: {
- // If either the LHS or the RHS are One, the result is One.
- if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
- Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- Known = LHSKnown | RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the other.
- // These bits cannot contribute to the result of the 'or'.
- if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
- return I->getOperand(1);
-
- // If the RHS is a constant, see if we can simplify it.
- if (ShrinkDemandedConstant(I, 1, DemandedMask))
- return I;
-
- break;
- }
- case Instruction::Xor: {
- if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- Known = LHSKnown ^ RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the other.
- // These bits cannot contribute to the result of the 'xor'.
- if (DemandedMask.isSubsetOf(RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(LHSKnown.Zero))
- return I->getOperand(1);
-
- // If all of the demanded bits are known to be zero on one side or the
- // other, turn this into an *inclusive* or.
- // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
- if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
- Instruction *Or =
- BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
- I->getName());
- return InsertNewInstWith(Or, *I);
- }
-
- // If all of the demanded bits on one side are known, and all of the set
- // bits on that side are also known to be set on the other side, turn this
- // into an AND, as we know the bits will be cleared.
- // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
- if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
- RHSKnown.One.isSubsetOf(LHSKnown.One)) {
- Constant *AndC = Constant::getIntegerValue(VTy,
- ~RHSKnown.One & DemandedMask);
- Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
- return InsertNewInstWith(And, *I);
- }
-
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ computeKnownBits(V, Known, Depth, CxtI);
+ return nullptr; // Only analyze instructions.
+ }
+
+ // If there are multiple uses of this value and we aren't at the root, then
+ // we can't do any simplifications of the operands, because DemandedMask
+ // only reflects the bits demanded by *one* of the users.
+ if (Depth != 0 && !I->hasOneUse())
+ return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
+
+ KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
+
+ // If this is the root being simplified, allow it to have multiple uses,
+ // just set the DemandedMask to all bits so that we can try to simplify the
+ // operands. This allows visitTruncInst (for example) to simplify the
+ // operand of a trunc without duplicating all the logic below.
+ if (Depth == 0 && !V->hasOneUse())
+ DemandedMask.setAllBits();
+
+ switch (I->getOpcode()) {
+ default:
+ computeKnownBits(I, Known, Depth, CxtI);
+ break;
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
+ Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ Known = LHSKnown & RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return I->getOperand(1);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
+ return I;
+
+ break;
+ }
+ case Instruction::Or: {
+ // If either the LHS or the RHS are One, the result is One.
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
+ Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ Known = LHSKnown | RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return I;
+
+ break;
+ }
+ case Instruction::Xor: {
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ Known = LHSKnown ^ RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // If all of the demanded bits are known to be zero on one side or the
+ // other, turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
+ Instruction *Or =
+ BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+ I->getName());
+ return InsertNewInstWith(Or, *I);
+ }
+
+ // If all of the demanded bits on one side are known, and all of the set
+ // bits on that side are also known to be set on the other side, turn this
+ // into an AND, as we know the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
+ RHSKnown.One.isSubsetOf(LHSKnown.One)) {
+ Constant *AndC = Constant::getIntegerValue(VTy,
+ ~RHSKnown.One & DemandedMask);
+ Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+ return InsertNewInstWith(And, *I);
+ }
+
// If the RHS is a constant, see if we can change it. Don't alter a -1
// constant because that's a canonical 'not' op, and that is better for
// combining, SCEV, and codegen.
@@ -273,636 +273,636 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
}
-
- // If our LHS is an 'and' and if it has one use, and if any of the bits we
- // are flipping are known to be set, then the xor is just resetting those
- // bits to zero. We can just knock out bits from the 'and' and the 'xor',
- // simplifying both of them.
+
+ // If our LHS is an 'and' and if it has one use, and if any of the bits we
+ // are flipping are known to be set, then the xor is just resetting those
+ // bits to zero. We can just knock out bits from the 'and' and the 'xor',
+ // simplifying both of them.
if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0))) {
ConstantInt *AndRHS, *XorRHS;
- if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
+ if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
match(I->getOperand(1), m_ConstantInt(XorRHS)) &&
match(LHSInst->getOperand(1), m_ConstantInt(AndRHS)) &&
- (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
- APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
-
- Constant *AndC =
+ (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
+ APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
+
+ Constant *AndC =
ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
- Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
- InsertNewInstWith(NewAnd, *I);
-
- Constant *XorC =
+ Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+ InsertNewInstWith(NewAnd, *I);
+
+ Constant *XorC =
ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
- Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
- return InsertNewInstWith(NewXor, *I);
- }
+ Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
+ return InsertNewInstWith(NewXor, *I);
+ }
}
- break;
- }
- case Instruction::Select: {
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
- if (SPF == SPF_UMAX) {
- // UMax(A, C) == A if ...
- // The lowest non-zero bit of DemandMask is higher than the highest
- // non-zero bit of C.
- const APInt *C;
- unsigned CTZ = DemandedMask.countTrailingZeros();
- if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
- return LHS;
- } else if (SPF == SPF_UMIN) {
- // UMin(A, C) == A if ...
- // The lowest non-zero bit of DemandMask is higher than the highest
- // non-one bit of C.
- // This comes from using DeMorgans on the above umax example.
- const APInt *C;
- unsigned CTZ = DemandedMask.countTrailingZeros();
- if (match(RHS, m_APInt(C)) &&
- CTZ >= C->getBitWidth() - C->countLeadingOnes())
- return LHS;
- }
-
- // If this is a select as part of any other min/max pattern, don't simplify
- // any further in case we break the structure.
- if (SPF != SPF_UNKNOWN)
- return nullptr;
-
- if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- // If the operands are constants, see if we can simplify them.
- // This is similar to ShrinkDemandedConstant, but for a select we want to
- // try to keep the selected constants the same as icmp value constants, if
- // we can. This helps not break apart (or helps put back together)
- // canonical patterns like min and max.
- auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo,
+ break;
+ }
+ case Instruction::Select: {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
+ if (SPF == SPF_UMAX) {
+ // UMax(A, C) == A if ...
+ // The lowest non-zero bit of DemandMask is higher than the highest
+ // non-zero bit of C.
+ const APInt *C;
+ unsigned CTZ = DemandedMask.countTrailingZeros();
+ if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
+ return LHS;
+ } else if (SPF == SPF_UMIN) {
+ // UMin(A, C) == A if ...
+ // The lowest non-zero bit of DemandMask is higher than the highest
+ // non-one bit of C.
+ // This comes from using DeMorgans on the above umax example.
+ const APInt *C;
+ unsigned CTZ = DemandedMask.countTrailingZeros();
+ if (match(RHS, m_APInt(C)) &&
+ CTZ >= C->getBitWidth() - C->countLeadingOnes())
+ return LHS;
+ }
+
+ // If this is a select as part of any other min/max pattern, don't simplify
+ // any further in case we break the structure.
+ if (SPF != SPF_UNKNOWN)
+ return nullptr;
+
+ if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ // This is similar to ShrinkDemandedConstant, but for a select we want to
+ // try to keep the selected constants the same as icmp value constants, if
+ // we can. This helps not break apart (or helps put back together)
+ // canonical patterns like min and max.
+ auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo,
const APInt &DemandedMask) {
- const APInt *SelC;
- if (!match(I->getOperand(OpNo), m_APInt(SelC)))
- return false;
-
- // Get the constant out of the ICmp, if there is one.
+ const APInt *SelC;
+ if (!match(I->getOperand(OpNo), m_APInt(SelC)))
+ return false;
+
+ // Get the constant out of the ICmp, if there is one.
// Only try this when exactly 1 operand is a constant (if both operands
// are constant, the icmp should eventually simplify). Otherwise, we may
// invert the transform that reduces set bits and infinite-loop.
Value *X;
- const APInt *CmpC;
- ICmpInst::Predicate Pred;
+ const APInt *CmpC;
+ ICmpInst::Predicate Pred;
if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) ||
isa<Constant>(X) || CmpC->getBitWidth() != SelC->getBitWidth())
- return ShrinkDemandedConstant(I, OpNo, DemandedMask);
-
- // If the constant is already the same as the ICmp, leave it as-is.
- if (*CmpC == *SelC)
- return false;
- // If the constants are not already the same, but can be with the demand
- // mask, use the constant value from the ICmp.
- if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) {
- I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC));
- return true;
- }
- return ShrinkDemandedConstant(I, OpNo, DemandedMask);
- };
- if (CanonicalizeSelectConstant(I, 1, DemandedMask) ||
- CanonicalizeSelectConstant(I, 2, DemandedMask))
- return I;
-
- // Only known if known in both the LHS and RHS.
+ return ShrinkDemandedConstant(I, OpNo, DemandedMask);
+
+ // If the constant is already the same as the ICmp, leave it as-is.
+ if (*CmpC == *SelC)
+ return false;
+ // If the constants are not already the same, but can be with the demand
+ // mask, use the constant value from the ICmp.
+ if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) {
+ I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC));
+ return true;
+ }
+ return ShrinkDemandedConstant(I, OpNo, DemandedMask);
+ };
+ if (CanonicalizeSelectConstant(I, 1, DemandedMask) ||
+ CanonicalizeSelectConstant(I, 2, DemandedMask))
+ return I;
+
+ // Only known if known in both the LHS and RHS.
Known = KnownBits::commonBits(LHSKnown, RHSKnown);
- break;
- }
- case Instruction::ZExt:
- case Instruction::Trunc: {
- unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
-
- APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
- KnownBits InputKnown(SrcBitWidth);
- if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
- return I;
- assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
- Known = InputKnown.zextOrTrunc(BitWidth);
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- }
- case Instruction::BitCast:
- if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
- return nullptr; // vector->int or fp->int?
-
- if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
- if (VectorType *SrcVTy =
- dyn_cast<VectorType>(I->getOperand(0)->getType())) {
+ break;
+ }
+ case Instruction::ZExt:
+ case Instruction::Trunc: {
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+ APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
+ return I;
+ assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
+ Known = InputKnown.zextOrTrunc(BitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ case Instruction::BitCast:
+ if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
+ return nullptr; // vector->int or fp->int?
+
+ if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
+ if (VectorType *SrcVTy =
+ dyn_cast<VectorType>(I->getOperand(0)->getType())) {
if (cast<FixedVectorType>(DstVTy)->getNumElements() !=
cast<FixedVectorType>(SrcVTy)->getNumElements())
- // Don't touch a bitcast between vectors of different element counts.
- return nullptr;
- } else
- // Don't touch a scalar-to-vector bitcast.
- return nullptr;
- } else if (I->getOperand(0)->getType()->isVectorTy())
- // Don't touch a vector-to-scalar bitcast.
- return nullptr;
-
- if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
- return I;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- case Instruction::SExt: {
- // Compute the bits in the result that are not present in the input.
- unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
-
- APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
-
- // If any of the sign extended bits are demanded, we know that the sign
- // bit is demanded.
- if (DemandedMask.getActiveBits() > SrcBitWidth)
- InputDemandedBits.setBit(SrcBitWidth-1);
-
- KnownBits InputKnown(SrcBitWidth);
- if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
- return I;
-
- // If the input sign bit is known zero, or if the NewBits are not demanded
- // convert this into a zero extension.
- if (InputKnown.isNonNegative() ||
- DemandedMask.getActiveBits() <= SrcBitWidth) {
- // Convert to ZExt cast.
- CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
- return InsertNewInstWith(NewCast, *I);
- }
-
- // If the sign bit of the input is known set or clear, then we know the
- // top bits of the result.
- Known = InputKnown.sext(BitWidth);
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- }
- case Instruction::Add:
- if ((DemandedMask & 1) == 0) {
- // If we do not need the low bit, try to convert bool math to logic:
- // add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN
- Value *X, *Y;
- if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))),
- m_OneUse(m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
- // Truth table for inputs and output signbits:
- // X:0 | X:1
- // ----------
- // Y:0 | 0 | 0 |
- // Y:1 | -1 | 0 |
- // ----------
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(I);
- Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y);
- return Builder.CreateSExt(AndNot, VTy);
- }
-
- // add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN
- // TODO: Relax the one-use checks because we are removing an instruction?
- if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))),
- m_OneUse(m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
- // Truth table for inputs and output signbits:
- // X:0 | X:1
- // -----------
- // Y:0 | -1 | -1 |
- // Y:1 | -1 | 0 |
- // -----------
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(I);
- Value *Or = Builder.CreateOr(X, Y);
- return Builder.CreateSExt(Or, VTy);
- }
- }
- LLVM_FALLTHROUGH;
- case Instruction::Sub: {
- /// If the high-bits of an ADD/SUB are not demanded, then we do not care
- /// about the high bits of the operands.
- unsigned NLZ = DemandedMask.countLeadingZeros();
- // Right fill the mask of bits for this ADD/SUB to demand the most
- // significant bit and all those below it.
- APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
- if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
- SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
- ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
- SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
- if (NLZ > 0) {
- // Disable the nsw and nuw flags here: We can no longer guarantee that
- // we won't wrap after simplification. Removing the nsw/nuw flags is
- // legal here because the top bit is not demanded.
- BinaryOperator &BinOP = *cast<BinaryOperator>(I);
- BinOP.setHasNoSignedWrap(false);
- BinOP.setHasNoUnsignedWrap(false);
- }
- return I;
- }
-
- // If we are known to be adding/subtracting zeros to every bit below
- // the highest demanded bit, we just return the other side.
- if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
- return I->getOperand(0);
- // We can't do this with the LHS for subtraction, unless we are only
- // demanding the LSB.
- if ((I->getOpcode() == Instruction::Add ||
- DemandedFromOps.isOneValue()) &&
- DemandedFromOps.isSubsetOf(LHSKnown.Zero))
- return I->getOperand(1);
-
- // Otherwise just compute the known bits of the result.
- bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
- Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add,
- NSW, LHSKnown, RHSKnown);
- break;
- }
- case Instruction::Shl: {
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
- const APInt *ShrAmt;
- if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt))))
- if (Instruction *Shr = dyn_cast<Instruction>(I->getOperand(0)))
- if (Value *R = simplifyShrShlDemandedBits(Shr, *ShrAmt, I, *SA,
- DemandedMask, Known))
- return R;
-
- uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
- APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
-
- // If the shift is NUW/NSW, then it does demand the high bits.
- ShlOperator *IOp = cast<ShlOperator>(I);
- if (IOp->hasNoSignedWrap())
- DemandedMaskIn.setHighBits(ShiftAmt+1);
- else if (IOp->hasNoUnsignedWrap())
- DemandedMaskIn.setHighBits(ShiftAmt);
-
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
- return I;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
-
- bool SignBitZero = Known.Zero.isSignBitSet();
- bool SignBitOne = Known.One.isSignBitSet();
- Known.Zero <<= ShiftAmt;
- Known.One <<= ShiftAmt;
- // low bits known zero.
- if (ShiftAmt)
- Known.Zero.setLowBits(ShiftAmt);
-
-      // If this shift has the "nsw" keyword, then the result is either a poison
- // value or has the same sign bit as the first operand.
- if (IOp->hasNoSignedWrap()) {
- if (SignBitZero)
- Known.Zero.setSignBit();
- else if (SignBitOne)
- Known.One.setSignBit();
- if (Known.hasConflict())
- return UndefValue::get(I->getType());
- }
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
- case Instruction::LShr: {
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
- uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
-
- // Unsigned shift right.
- APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
-
- // If the shift is exact, then it does demand the low bits (and knows that
- // they are zero).
- if (cast<LShrOperator>(I)->isExact())
- DemandedMaskIn.setLowBits(ShiftAmt);
-
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
- return I;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShiftAmt);
- Known.One.lshrInPlace(ShiftAmt);
- if (ShiftAmt)
- Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
- case Instruction::AShr: {
- // If this is an arithmetic shift right and only the low-bit is set, we can
- // always convert this into a logical shr, even if the shift amount is
- // variable. The low bit of the shift cannot be an input sign bit unless
- // the shift amount is >= the size of the datatype, which is undefined.
- if (DemandedMask.isOneValue()) {
- // Perform the logical shift right.
- Instruction *NewVal = BinaryOperator::CreateLShr(
- I->getOperand(0), I->getOperand(1), I->getName());
- return InsertNewInstWith(NewVal, *I);
- }
-
- // If the sign bit is the only bit demanded by this ashr, then there is no
- // need to do it, the shift doesn't change the high bit.
- if (DemandedMask.isSignMask())
- return I->getOperand(0);
-
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
- uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
-
- // Signed shift right.
- APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
- // If any of the high bits are demanded, we should set the sign bit as
- // demanded.
- if (DemandedMask.countLeadingZeros() <= ShiftAmt)
- DemandedMaskIn.setSignBit();
-
- // If the shift is exact, then it does demand the low bits (and knows that
- // they are zero).
- if (cast<AShrOperator>(I)->isExact())
- DemandedMaskIn.setLowBits(ShiftAmt);
-
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
- return I;
-
- unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI);
-
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- // Compute the new bits that are at the top now plus sign bits.
- APInt HighBits(APInt::getHighBitsSet(
- BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth)));
- Known.Zero.lshrInPlace(ShiftAmt);
- Known.One.lshrInPlace(ShiftAmt);
-
- // If the input sign bit is known to be zero, or if none of the top bits
- // are demanded, turn this into an unsigned shift right.
- assert(BitWidth > ShiftAmt && "Shift amount not saturated?");
- if (Known.Zero[BitWidth-ShiftAmt-1] ||
- !DemandedMask.intersects(HighBits)) {
- BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
- I->getOperand(1));
- LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
- return InsertNewInstWith(LShr, *I);
- } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
- Known.One |= HighBits;
- }
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
- case Instruction::UDiv: {
- // UDiv doesn't demand low bits that are zero in the divisor.
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
-      // If the division is exact, then it does demand the low bits.
- if (cast<UDivOperator>(I)->isExact())
- break;
-
- // FIXME: Take the demanded mask of the result into account.
- unsigned RHSTrailingZeros = SA->countTrailingZeros();
- APInt DemandedMaskIn =
- APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
- return I;
-
- // Propagate zero bits from the input.
- Known.Zero.setHighBits(std::min(
- BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
+ // Don't touch a bitcast between vectors of different element counts.
+ return nullptr;
+ } else
+ // Don't touch a scalar-to-vector bitcast.
+ return nullptr;
+ } else if (I->getOperand(0)->getType()->isVectorTy())
+ // Don't touch a vector-to-scalar bitcast.
+ return nullptr;
+
+ if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ case Instruction::SExt: {
+ // Compute the bits in the result that are not present in the input.
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+ APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
+
+ // If any of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ if (DemandedMask.getActiveBits() > SrcBitWidth)
+ InputDemandedBits.setBit(SrcBitWidth-1);
+
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
+ return I;
+
+ // If the input sign bit is known zero, or if the NewBits are not demanded
+ // convert this into a zero extension.
+ if (InputKnown.isNonNegative() ||
+ DemandedMask.getActiveBits() <= SrcBitWidth) {
+ // Convert to ZExt cast.
+ CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
+ return InsertNewInstWith(NewCast, *I);
+ }
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ Known = InputKnown.sext(BitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
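For illustration, a minimal standalone C++ check (separate from the LLVM sources above) of why the sext can be rewritten as a zext once the input sign bit is known zero: sign extension and zero extension agree on every value whose sign bit is clear.

#include <cassert>
#include <cstdint>

int main() {
  for (int v = 0; v < 128; ++v) {            // every i8 value with a clear sign bit
    int8_t X = static_cast<int8_t>(v);
    int32_t SExt = X;                                 // sign extension to i32
    uint32_t ZExt = static_cast<uint8_t>(X);          // zero extension to i32
    assert(static_cast<uint32_t>(SExt) == ZExt);      // the two extensions agree
  }
  return 0;
}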
+ case Instruction::Add:
+ if ((DemandedMask & 1) == 0) {
+ // If we do not need the low bit, try to convert bool math to logic:
+ // add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN
+ Value *X, *Y;
+ if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))),
+ m_OneUse(m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
+ // Truth table for inputs and output signbits:
+ // X:0 | X:1
+ // ----------
+ // Y:0 | 0 | 0 |
+ // Y:1 | -1 | 0 |
+ // ----------
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(I);
+ Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y);
+ return Builder.CreateSExt(AndNot, VTy);
+ }
+
+ // add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN
+ // TODO: Relax the one-use checks because we are removing an instruction?
+ if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))),
+ m_OneUse(m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
+ // Truth table for inputs and output signbits:
+ // X:0 | X:1
+ // -----------
+ // Y:0 | -1 | -1 |
+ // Y:1 | -1 | 0 |
+ // -----------
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(I);
+ Value *Or = Builder.CreateOr(X, Y);
+ return Builder.CreateSExt(Or, VTy);
+ }
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Sub: {
+ /// If the high-bits of an ADD/SUB are not demanded, then we do not care
+ /// about the high bits of the operands.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+ // Right fill the mask of bits for this ADD/SUB to demand the most
+ // significant bit and all those below it.
+ APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+ if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
+ ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
+ if (NLZ > 0) {
+ // Disable the nsw and nuw flags here: We can no longer guarantee that
+ // we won't wrap after simplification. Removing the nsw/nuw flags is
+ // legal here because the top bit is not demanded.
+ BinaryOperator &BinOP = *cast<BinaryOperator>(I);
+ BinOP.setHasNoSignedWrap(false);
+ BinOP.setHasNoUnsignedWrap(false);
+ }
+ return I;
+ }
+
+ // If we are known to be adding/subtracting zeros to every bit below
+ // the highest demanded bit, we just return the other side.
+ if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ // We can't do this with the LHS for subtraction, unless we are only
+ // demanding the LSB.
+ if ((I->getOpcode() == Instruction::Add ||
+ DemandedFromOps.isOneValue()) &&
+ DemandedFromOps.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // Otherwise just compute the known bits of the result.
+ bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
+ Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add,
+ NSW, LHSKnown, RHSKnown);
+ break;
+ }
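A small standalone self-check of the bool-math fold in the Add case above, written in plain C++ rather than IR: when the low bit is not demanded, add (zext i1 X), (sext i1 Y) and sext (~X & Y) agree on all remaining bits, exactly as the truth table claims.

#include <cassert>
#include <cstdint>

int main() {
  for (int X = 0; X <= 1; ++X) {
    for (int Y = 0; Y <= 1; ++Y) {
      uint8_t ZextX = static_cast<uint8_t>(X);          // zext i1 X to i8
      uint8_t SextY = Y ? 0xFF : 0x00;                  // sext i1 Y to i8
      uint8_t Add   = static_cast<uint8_t>(ZextX + SextY);
      uint8_t Fold  = (!X && Y) ? 0xFF : 0x00;          // sext (~X & Y) to i8
      // All bits except the (undemanded) low bit must match.
      assert((Add & 0xFE) == (Fold & 0xFE));
    }
  }
  return 0;
}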
+ case Instruction::Shl: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ const APInt *ShrAmt;
+ if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt))))
+ if (Instruction *Shr = dyn_cast<Instruction>(I->getOperand(0)))
+ if (Value *R = simplifyShrShlDemandedBits(Shr, *ShrAmt, I, *SA,
+ DemandedMask, Known))
+ return R;
+
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+ APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
+
+ // If the shift is NUW/NSW, then it does demand the high bits.
+ ShlOperator *IOp = cast<ShlOperator>(I);
+ if (IOp->hasNoSignedWrap())
+ DemandedMaskIn.setHighBits(ShiftAmt+1);
+ else if (IOp->hasNoUnsignedWrap())
+ DemandedMaskIn.setHighBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
+ bool SignBitZero = Known.Zero.isSignBitSet();
+ bool SignBitOne = Known.One.isSignBitSet();
+ Known.Zero <<= ShiftAmt;
+ Known.One <<= ShiftAmt;
+ // low bits known zero.
+ if (ShiftAmt)
+ Known.Zero.setLowBits(ShiftAmt);
+
+      // If this shift has the "nsw" keyword, then the result is either a poison
+ // value or has the same sign bit as the first operand.
+ if (IOp->hasNoSignedWrap()) {
+ if (SignBitZero)
+ Known.Zero.setSignBit();
+ else if (SignBitOne)
+ Known.One.setSignBit();
+ if (Known.hasConflict())
+ return UndefValue::get(I->getType());
+ }
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
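The DemandedMaskIn = DemandedMask.lshr(ShiftAmt) step above can be sanity-checked with a small standalone C++ snippet: if only bits 4..7 of x << 2 are demanded, then only bits 2..5 of x can influence the result.

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t DemandedMask = 0xF0;                    // only bits 4..7 of the shl
  const unsigned ShiftAmt = 2;
  const uint8_t DemandedIn = DemandedMask >> ShiftAmt;  // bits 2..5 of the operand
  for (unsigned x = 0; x < 256; ++x) {
    uint8_t Clean = static_cast<uint8_t>(x) & DemandedIn;       // drop non-demanded bits
    uint8_t A = static_cast<uint8_t>(x << ShiftAmt) & DemandedMask;
    uint8_t B = static_cast<uint8_t>(Clean << ShiftAmt) & DemandedMask;
    assert(A == B);                                     // demanded result bits unchanged
  }
  return 0;
}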
+ case Instruction::LShr: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+ // Unsigned shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<LShrOperator>(I)->isExact())
+ DemandedMaskIn.setLowBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
+ if (ShiftAmt)
+ Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
+ case Instruction::AShr: {
+ // If this is an arithmetic shift right and only the low-bit is set, we can
+ // always convert this into a logical shr, even if the shift amount is
+ // variable. The low bit of the shift cannot be an input sign bit unless
+ // the shift amount is >= the size of the datatype, which is undefined.
+ if (DemandedMask.isOneValue()) {
+ // Perform the logical shift right.
+ Instruction *NewVal = BinaryOperator::CreateLShr(
+ I->getOperand(0), I->getOperand(1), I->getName());
+ return InsertNewInstWith(NewVal, *I);
+ }
+
+ // If the sign bit is the only bit demanded by this ashr, then there is no
+ // need to do it, the shift doesn't change the high bit.
+ if (DemandedMask.isSignMask())
+ return I->getOperand(0);
+
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+ // Signed shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+ // If any of the high bits are demanded, we should set the sign bit as
+ // demanded.
+ if (DemandedMask.countLeadingZeros() <= ShiftAmt)
+ DemandedMaskIn.setSignBit();
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<AShrOperator>(I)->isExact())
+ DemandedMaskIn.setLowBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+
+ unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI);
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ // Compute the new bits that are at the top now plus sign bits.
+ APInt HighBits(APInt::getHighBitsSet(
+ BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth)));
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ assert(BitWidth > ShiftAmt && "Shift amount not saturated?");
+ if (Known.Zero[BitWidth-ShiftAmt-1] ||
+ !DemandedMask.intersects(HighBits)) {
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
+ I->getOperand(1));
+ LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
+ return InsertNewInstWith(LShr, *I);
+ } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
+ Known.One |= HighBits;
+ }
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
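A standalone C++ check of the first AShr rewrite above (only the low bit demanded implies ashr and lshr are interchangeable), assuming the usual two's-complement arithmetic right shift for signed values.

#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v < 128; ++v) {
    for (unsigned s = 0; s < 8; ++s) {
      int8_t  X  = static_cast<int8_t>(v);
      uint8_t UX = static_cast<uint8_t>(v);
      uint8_t AShr = static_cast<uint8_t>(X >> s);      // arithmetic shift (sign-filling)
      uint8_t LShr = static_cast<uint8_t>(UX >> s);     // logical shift (zero-filling)
      assert((AShr & 1) == (LShr & 1));                 // the low bit always agrees
    }
  }
  return 0;
}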
+ case Instruction::UDiv: {
+ // UDiv doesn't demand low bits that are zero in the divisor.
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+      // If the division is exact, then it does demand the low bits.
+ if (cast<UDivOperator>(I)->isExact())
+ break;
+
+ // FIXME: Take the demanded mask of the result into account.
+ unsigned RHSTrailingZeros = SA->countTrailingZeros();
+ APInt DemandedMaskIn =
+ APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
+ return I;
+
+ // Propagate zero bits from the input.
+ Known.Zero.setHighBits(std::min(
+ BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
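The UDiv rule above ("low bits that are zero in the divisor are not demanded") can be exercised with a small standalone C++ loop; with a divisor of 12, which has two trailing zero bits, the low two bits of the dividend never change the quotient.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Divisor = 12;                 // two trailing zero bits
  const uint32_t LowMask = 0x3;                // the bits udiv does not demand
  for (uint32_t x = 0; x < 4096; ++x)
    assert(x / Divisor == (x & ~LowMask) / Divisor);
  return 0;
}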
case Instruction::SRem: {
ConstantInt *Rem;
if (match(I->getOperand(1), m_ConstantInt(Rem))) {
- // X % -1 demands all the bits because we don't want to introduce
- // INT_MIN % -1 (== undef) by accident.
- if (Rem->isMinusOne())
- break;
- APInt RA = Rem->getValue().abs();
- if (RA.isPowerOf2()) {
- if (DemandedMask.ult(RA)) // srem won't affect demanded bits
- return I->getOperand(0);
-
- APInt LowBits = RA - 1;
- APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
- if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
- return I;
-
- // The low bits of LHS are unchanged by the srem.
- Known.Zero = LHSKnown.Zero & LowBits;
- Known.One = LHSKnown.One & LowBits;
-
- // If LHS is non-negative or has all low bits zero, then the upper bits
- // are all zero.
- if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
- Known.Zero |= ~LowBits;
-
- // If LHS is negative and not all low bits are zero, then the upper bits
- // are all one.
- if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
- Known.One |= ~LowBits;
-
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- }
- }
-
- // The sign bit is the LHS's sign bit, except when the result of the
- // remainder is zero.
- if (DemandedMask.isSignBitSet()) {
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
- // If it's known zero, our sign bit is also zero.
- if (LHSKnown.isNonNegative())
- Known.makeNonNegative();
- }
- break;
+ // X % -1 demands all the bits because we don't want to introduce
+ // INT_MIN % -1 (== undef) by accident.
+ if (Rem->isMinusOne())
+ break;
+ APInt RA = Rem->getValue().abs();
+ if (RA.isPowerOf2()) {
+ if (DemandedMask.ult(RA)) // srem won't affect demanded bits
+ return I->getOperand(0);
+
+ APInt LowBits = RA - 1;
+ APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
+ if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
+ return I;
+
+ // The low bits of LHS are unchanged by the srem.
+ Known.Zero = LHSKnown.Zero & LowBits;
+ Known.One = LHSKnown.One & LowBits;
+
+ // If LHS is non-negative or has all low bits zero, then the upper bits
+ // are all zero.
+ if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
+ Known.Zero |= ~LowBits;
+
+ // If LHS is negative and not all low bits are zero, then the upper bits
+ // are all one.
+ if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
+ Known.One |= ~LowBits;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ }
+
+ // The sign bit is the LHS's sign bit, except when the result of the
+ // remainder is zero.
+ if (DemandedMask.isSignBitSet()) {
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
+ // If it's known zero, our sign bit is also zero.
+ if (LHSKnown.isNonNegative())
+ Known.makeNonNegative();
+ }
+ break;
}
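A standalone C++ check of the power-of-two SRem case above: for |divisor| == 4 the low two bits of the LHS pass through the srem unchanged, which is what lets the code copy LHSKnown into Known for those bits.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t Rem = 4;                       // |divisor| is a power of two
  const uint32_t LowBits = Rem - 1;            // the bits srem leaves untouched
  for (int32_t x = -100; x <= 100; ++x) {
    uint32_t R = static_cast<uint32_t>(x % Rem);   // C++ % truncates like srem
    uint32_t L = static_cast<uint32_t>(x);
    assert((R & LowBits) == (L & LowBits));    // low bits pass through unchanged
  }
  return 0;
}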
- case Instruction::URem: {
- KnownBits Known2(BitWidth);
- APInt AllOnes = APInt::getAllOnesValue(BitWidth);
- if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
- SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
- return I;
-
- unsigned Leaders = Known2.countMinLeadingZeros();
- Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
- break;
- }
- case Instruction::Call: {
- bool KnownBitsComputed = false;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::bswap: {
- // If the only bits demanded come from one byte of the bswap result,
- // just shift the input byte into position to eliminate the bswap.
- unsigned NLZ = DemandedMask.countLeadingZeros();
- unsigned NTZ = DemandedMask.countTrailingZeros();
-
- // Round NTZ down to the next byte. If we have 11 trailing zeros, then
- // we need all the bits down to bit 8. Likewise, round NLZ. If we
- // have 14 leading zeros, round to 8.
- NLZ &= ~7;
- NTZ &= ~7;
- // If we need exactly one byte, we can do this transformation.
- if (BitWidth-NLZ-NTZ == 8) {
- unsigned ResultBit = NTZ;
- unsigned InputBit = BitWidth-NTZ-8;
-
- // Replace this with either a left or right shift to get the byte into
- // the right place.
- Instruction *NewVal;
- if (InputBit > ResultBit)
- NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
- ConstantInt::get(I->getType(), InputBit-ResultBit));
- else
- NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
- ConstantInt::get(I->getType(), ResultBit-InputBit));
- NewVal->takeName(I);
- return InsertNewInstWith(NewVal, *I);
- }
- break;
- }
- case Intrinsic::fshr:
- case Intrinsic::fshl: {
- const APInt *SA;
- if (!match(I->getOperand(2), m_APInt(SA)))
- break;
-
- // Normalize to funnel shift left. APInt shifts of BitWidth are well-
- // defined, so no need to special-case zero shifts here.
- uint64_t ShiftAmt = SA->urem(BitWidth);
- if (II->getIntrinsicID() == Intrinsic::fshr)
- ShiftAmt = BitWidth - ShiftAmt;
-
- APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
- APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
- if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
- return I;
-
- Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
- RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
- Known.One = LHSKnown.One.shl(ShiftAmt) |
- RHSKnown.One.lshr(BitWidth - ShiftAmt);
- KnownBitsComputed = true;
- break;
- }
+ case Instruction::URem: {
+ KnownBits Known2(BitWidth);
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
+ return I;
+
+ unsigned Leaders = Known2.countMinLeadingZeros();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+ break;
+ }
+ case Instruction::Call: {
+ bool KnownBitsComputed = false;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::bswap: {
+ // If the only bits demanded come from one byte of the bswap result,
+ // just shift the input byte into position to eliminate the bswap.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+ unsigned NTZ = DemandedMask.countTrailingZeros();
+
+ // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+ // we need all the bits down to bit 8. Likewise, round NLZ. If we
+ // have 14 leading zeros, round to 8.
+ NLZ &= ~7;
+ NTZ &= ~7;
+ // If we need exactly one byte, we can do this transformation.
+ if (BitWidth-NLZ-NTZ == 8) {
+ unsigned ResultBit = NTZ;
+ unsigned InputBit = BitWidth-NTZ-8;
+
+ // Replace this with either a left or right shift to get the byte into
+ // the right place.
+ Instruction *NewVal;
+ if (InputBit > ResultBit)
+ NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
+ ConstantInt::get(I->getType(), InputBit-ResultBit));
+ else
+ NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
+ ConstantInt::get(I->getType(), ResultBit-InputBit));
+ NewVal->takeName(I);
+ return InsertNewInstWith(NewVal, *I);
+ }
+ break;
+ }
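The bswap fold above can be illustrated with a small standalone C++ check, assuming a compiler that provides __builtin_bswap32 (GCC/Clang): if only the low byte of the bswap result is demanded, that byte is just the top byte of the input shifted down by 24.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t DemandedMask = 0xFF;          // only the low byte of the bswap
  uint32_t Samples[] = {0x12345678u, 0xDEADBEEFu, 0u, 0xFFFFFFFFu};
  for (uint32_t x : Samples) {
    uint32_t ViaBswap = __builtin_bswap32(x) & DemandedMask;
    uint32_t ViaShift = (x >> 24) & DemandedMask;   // the input byte shifted into place
    assert(ViaBswap == ViaShift);
  }
  return 0;
}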
+ case Intrinsic::fshr:
+ case Intrinsic::fshl: {
+ const APInt *SA;
+ if (!match(I->getOperand(2), m_APInt(SA)))
+ break;
+
+ // Normalize to funnel shift left. APInt shifts of BitWidth are well-
+ // defined, so no need to special-case zero shifts here.
+ uint64_t ShiftAmt = SA->urem(BitWidth);
+ if (II->getIntrinsicID() == Intrinsic::fshr)
+ ShiftAmt = BitWidth - ShiftAmt;
+
+ APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
+ APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
+ if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
+ return I;
+
+ Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
+ RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
+ Known.One = LHSKnown.One.shl(ShiftAmt) |
+ RHSKnown.One.lshr(BitWidth - ShiftAmt);
+ KnownBitsComputed = true;
+ break;
+ }
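A standalone C++ model of an i8 funnel shift left (a sketch of the semantics only, not the LLVM intrinsic itself) showing where the DemandedMaskLHS / DemandedMaskRHS split comes from: with a shift of 3, result bits 3..7 depend only on the first operand and bits 0..2 only on the second.

#include <cassert>
#include <cstdint>

// Plain-C++ model of an i8 funnel shift left.
static uint8_t fshl8(uint8_t A, uint8_t B, unsigned S) {
  S %= 8;
  return S ? static_cast<uint8_t>((A << S) | (B >> (8 - S))) : A;
}

int main() {
  const unsigned S = 3;
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t R = fshl8(static_cast<uint8_t>(a), static_cast<uint8_t>(b), S);
      // Result bits [3..7] come only from A, bits [0..2] only from B,
      // mirroring the DemandedMaskLHS / DemandedMaskRHS split above.
      assert((R >> S) == (a & 0x1F));
      assert((R & 0x07) == (b >> 5));
    }
  return 0;
}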
default: {
// Handle target specific intrinsics
Optional<Value *> V = targetSimplifyDemandedUseBitsIntrinsic(
*II, DemandedMask, Known, KnownBitsComputed);
if (V.hasValue())
return V.getValue();
- break;
- }
- }
- }
-
- if (!KnownBitsComputed)
- computeKnownBits(V, Known, Depth, CxtI);
- break;
- }
- }
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
- return nullptr;
-}
-
-/// Helper routine of SimplifyDemandedUseBits. It computes Known
-/// bits. It also tries to handle simplifications that can be done based on
-/// DemandedMask, but without modifying the Instruction.
+ break;
+ }
+ }
+ }
+
+ if (!KnownBitsComputed)
+ computeKnownBits(V, Known, Depth, CxtI);
+ break;
+ }
+ }
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+ return nullptr;
+}
+
+/// Helper routine of SimplifyDemandedUseBits. It computes Known
+/// bits. It also tries to handle simplifications that can be done based on
+/// DemandedMask, but without modifying the Instruction.
Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
Instruction *I, const APInt &DemandedMask, KnownBits &Known, unsigned Depth,
Instruction *CxtI) {
- unsigned BitWidth = DemandedMask.getBitWidth();
- Type *ITy = I->getType();
-
- KnownBits LHSKnown(BitWidth);
- KnownBits RHSKnown(BitWidth);
-
-  // Despite the fact that we can't simplify this instruction in every user's
-  // context, we can at least compute the known bits, and we can
- // do simplifications that apply to *just* the one user if we know that
- // this instruction has a simpler value in that context.
- switch (I->getOpcode()) {
- case Instruction::And: {
- // If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
- CxtI);
-
- Known = LHSKnown & RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- // If all of the demanded bits are known 1 on one side, return the other.
- // These bits cannot contribute to the result of the 'and' in this
- // context.
- if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
- return I->getOperand(1);
-
- break;
- }
- case Instruction::Or: {
- // We can simplify (X|Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- // If either the LHS or the RHS are One, the result is One.
- computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
- CxtI);
-
- Known = LHSKnown | RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the
- // other. These bits cannot contribute to the result of the 'or' in this
- // context.
- if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
- return I->getOperand(1);
-
- break;
- }
- case Instruction::Xor: {
- // We can simplify (X^Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
- CxtI);
-
- Known = LHSKnown ^ RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the
- // other.
- if (DemandedMask.isSubsetOf(RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(LHSKnown.Zero))
- return I->getOperand(1);
-
- break;
- }
+ unsigned BitWidth = DemandedMask.getBitWidth();
+ Type *ITy = I->getType();
+
+ KnownBits LHSKnown(BitWidth);
+ KnownBits RHSKnown(BitWidth);
+
+  // Despite the fact that we can't simplify this instruction in every user's
+  // context, we can at least compute the known bits, and we can
+ // do simplifications that apply to *just* the one user if we know that
+ // this instruction has a simpler value in that context.
+ switch (I->getOpcode()) {
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ Known = LHSKnown & RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return I->getOperand(1);
+
+ break;
+ }
+ case Instruction::Or: {
+ // We can simplify (X|Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ // If either the LHS or the RHS are One, the result is One.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ Known = LHSKnown | RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return I->getOperand(1);
+
+ break;
+ }
+ case Instruction::Xor: {
+ // We can simplify (X^Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ Known = LHSKnown ^ RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ break;
+ }
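The "known zero on one side" shortcuts above boil down to ordinary bit arithmetic; here is a standalone C++ check for the Xor case: if every demanded bit of y is known zero, then x ^ y equals x on the demanded bits, so the user can read x directly.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t DemandedMask = 0xFF;                  // the user only reads the low byte
  for (uint32_t x = 0; x < 1024; ++x) {
    uint32_t y = (x * 2654435761u) & ~DemandedMask;    // low byte of y is known zero
    assert(((x ^ y) & DemandedMask) == (x & DemandedMask));  // the xor can be dropped
  }
  return 0;
}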
case Instruction::AShr: {
// Compute the Known bits to simplify things downstream.
computeKnownBits(I, Known, Depth, CxtI);
@@ -930,260 +930,260 @@ Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
break;
}
- default:
- // Compute the Known bits to simplify things downstream.
- computeKnownBits(I, Known, Depth, CxtI);
-
- // If this user is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- break;
- }
-
- return nullptr;
-}
-
-/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
-/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
-/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
-/// of "C2-C1".
-///
-/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
-/// ..., bn}, without considering the specific value X is holding.
-/// This transformation is legal iff one of the following conditions holds:
-/// 1) All the bits in S are 0, in which case E1 == E2.
-/// 2) We don't care about those bits in S, per the input DemandedMask.
-/// 3) Combination of 1) and 2): some bits in S are 0, and we don't care about
-/// the rest of the bits.
-///
-/// Currently we only test condition 2).
-///
-/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
-/// not successful.
+ default:
+ // Compute the Known bits to simplify things downstream.
+ computeKnownBits(I, Known, Depth, CxtI);
+
+ // If this user is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ break;
+ }
+
+ return nullptr;
+}
+
+/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
+/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
+/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
+/// of "C2-C1".
+///
+/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
+/// ..., bn}, without considering the specific value X is holding.
+/// This transformation is legal iff one of the following conditions holds:
+/// 1) All the bits in S are 0, in which case E1 == E2.
+/// 2) We don't care about those bits in S, per the input DemandedMask.
+/// 3) Combination of 1) and 2): some bits in S are 0, and we don't care about
+/// the rest of the bits.
+///
+/// Currently we only test condition 2).
+///
+/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
+/// not successful.
Value *InstCombinerImpl::simplifyShrShlDemandedBits(
Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known) {
- if (!ShlOp1 || !ShrOp1)
- return nullptr; // No-op.
-
- Value *VarX = Shr->getOperand(0);
- Type *Ty = VarX->getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
- return nullptr; // Undef.
-
- unsigned ShlAmt = ShlOp1.getZExtValue();
- unsigned ShrAmt = ShrOp1.getZExtValue();
-
- Known.One.clearAllBits();
- Known.Zero.setLowBits(ShlAmt - 1);
- Known.Zero &= DemandedMask;
-
- APInt BitMask1(APInt::getAllOnesValue(BitWidth));
- APInt BitMask2(APInt::getAllOnesValue(BitWidth));
-
- bool isLshr = (Shr->getOpcode() == Instruction::LShr);
- BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
- (BitMask1.ashr(ShrAmt) << ShlAmt);
-
- if (ShrAmt <= ShlAmt) {
- BitMask2 <<= (ShlAmt - ShrAmt);
- } else {
- BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
- BitMask2.ashr(ShrAmt - ShlAmt);
- }
-
-  // Check if condition-2 (see the comment to this function) is satisfied.
- if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
- if (ShrAmt == ShlAmt)
- return VarX;
-
- if (!Shr->hasOneUse())
- return nullptr;
-
- BinaryOperator *New;
- if (ShrAmt < ShlAmt) {
- Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
- New = BinaryOperator::CreateShl(VarX, Amt);
- BinaryOperator *Orig = cast<BinaryOperator>(Shl);
- New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
- New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
- } else {
- Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
- New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
- BinaryOperator::CreateAShr(VarX, Amt);
- if (cast<BinaryOperator>(Shr)->isExact())
- New->setIsExact(true);
- }
-
- return InsertNewInstWith(New, *Shl);
- }
-
- return nullptr;
-}
-
-/// The specified value produces a vector with any number of elements.
+ if (!ShlOp1 || !ShrOp1)
+ return nullptr; // No-op.
+
+ Value *VarX = Shr->getOperand(0);
+ Type *Ty = VarX->getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
+ return nullptr; // Undef.
+
+ unsigned ShlAmt = ShlOp1.getZExtValue();
+ unsigned ShrAmt = ShrOp1.getZExtValue();
+
+ Known.One.clearAllBits();
+ Known.Zero.setLowBits(ShlAmt - 1);
+ Known.Zero &= DemandedMask;
+
+ APInt BitMask1(APInt::getAllOnesValue(BitWidth));
+ APInt BitMask2(APInt::getAllOnesValue(BitWidth));
+
+ bool isLshr = (Shr->getOpcode() == Instruction::LShr);
+ BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
+ (BitMask1.ashr(ShrAmt) << ShlAmt);
+
+ if (ShrAmt <= ShlAmt) {
+ BitMask2 <<= (ShlAmt - ShrAmt);
+ } else {
+ BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
+ BitMask2.ashr(ShrAmt - ShlAmt);
+ }
+
+  // Check if condition-2 (see the comment to this function) is satisfied.
+ if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
+ if (ShrAmt == ShlAmt)
+ return VarX;
+
+ if (!Shr->hasOneUse())
+ return nullptr;
+
+ BinaryOperator *New;
+ if (ShrAmt < ShlAmt) {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
+ New = BinaryOperator::CreateShl(VarX, Amt);
+ BinaryOperator *Orig = cast<BinaryOperator>(Shl);
+ New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
+ New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
+ } else {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
+ New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
+ BinaryOperator::CreateAShr(VarX, Amt);
+ if (cast<BinaryOperator>(Shr)->isExact())
+ New->setIsExact(true);
+ }
+
+ return InsertNewInstWith(New, *Shl);
+ }
+
+ return nullptr;
+}
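A standalone C++ check of condition 2) from the comment above, for the concrete case C1 = 2, C2 = 4 on i8: (X lshr 2) shl 4 and X shl 2 differ only in bits 2..3, so the two expressions are interchangeable whenever those bits are not demanded.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned C1 = 2, C2 = 4;               // (X lshr 2) shl 4  vs  X shl (4 - 2)
  const uint8_t DemandedMask = 0xF0;           // bits 2..3, where they differ, are unused
  for (unsigned x = 0; x < 256; ++x) {
    uint8_t E1 = static_cast<uint8_t>((static_cast<uint8_t>(x) >> C1) << C2);
    uint8_t E2 = static_cast<uint8_t>(x << (C2 - C1));
    assert((E1 & DemandedMask) == (E2 & DemandedMask));
  }
  return 0;
}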
+
+/// The specified value produces a vector with any number of elements.
/// This method analyzes which elements of the operand are undef or poison and
/// returns that information in UndefElts.
-///
-/// DemandedElts contains the set of elements that are actually used by the
-/// caller, and by default (AllowMultipleUsers equals false) the value is
-/// simplified only if it has a single caller. If AllowMultipleUsers is set
-/// to true, DemandedElts refers to the union of sets of elements that are
-/// used by all callers.
-///
-/// If the information about demanded elements can be used to simplify the
-/// operation, the operation is simplified, then the resultant value is
-/// returned. This returns null if no change was made.
+///
+/// DemandedElts contains the set of elements that are actually used by the
+/// caller, and by default (AllowMultipleUsers equals false) the value is
+/// simplified only if it has a single caller. If AllowMultipleUsers is set
+/// to true, DemandedElts refers to the union of sets of elements that are
+/// used by all callers.
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified, then the resultant value is
+/// returned. This returns null if no change was made.
Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
APInt DemandedElts,
APInt &UndefElts,
unsigned Depth,
bool AllowMultipleUsers) {
- // Cannot analyze scalable type. The number of vector elements is not a
- // compile-time constant.
- if (isa<ScalableVectorType>(V->getType()))
- return nullptr;
-
- unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
- APInt EltMask(APInt::getAllOnesValue(VWidth));
- assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
-
- if (isa<UndefValue>(V)) {
+ // Cannot analyze scalable type. The number of vector elements is not a
+ // compile-time constant.
+ if (isa<ScalableVectorType>(V->getType()))
+ return nullptr;
+
+ unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
+ APInt EltMask(APInt::getAllOnesValue(VWidth));
+ assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
+
+ if (isa<UndefValue>(V)) {
// If the entire vector is undef or poison, just return this info.
- UndefElts = EltMask;
- return nullptr;
- }
-
+ UndefElts = EltMask;
+ return nullptr;
+ }
+
if (DemandedElts.isNullValue()) { // If nothing is demanded, provide poison.
- UndefElts = EltMask;
+ UndefElts = EltMask;
return PoisonValue::get(V->getType());
- }
-
- UndefElts = 0;
-
- if (auto *C = dyn_cast<Constant>(V)) {
- // Check if this is identity. If so, return 0 since we are not simplifying
- // anything.
- if (DemandedElts.isAllOnesValue())
- return nullptr;
-
- Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+ }
+
+ UndefElts = 0;
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ // Check if this is identity. If so, return 0 since we are not simplifying
+ // anything.
+ if (DemandedElts.isAllOnesValue())
+ return nullptr;
+
+ Type *EltTy = cast<VectorType>(V->getType())->getElementType();
Constant *Poison = PoisonValue::get(EltTy);
- SmallVector<Constant*, 16> Elts;
- for (unsigned i = 0; i != VWidth; ++i) {
+ SmallVector<Constant*, 16> Elts;
+ for (unsigned i = 0; i != VWidth; ++i) {
if (!DemandedElts[i]) { // If not demanded, set to poison.
Elts.push_back(Poison);
- UndefElts.setBit(i);
- continue;
- }
-
- Constant *Elt = C->getAggregateElement(i);
- if (!Elt) return nullptr;
-
+ UndefElts.setBit(i);
+ continue;
+ }
+
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt) return nullptr;
+
Elts.push_back(Elt);
if (isa<UndefValue>(Elt)) // Already undef or poison.
- UndefElts.setBit(i);
- }
-
- // If we changed the constant, return it.
- Constant *NewCV = ConstantVector::get(Elts);
- return NewCV != C ? NewCV : nullptr;
- }
-
- // Limit search depth.
- if (Depth == 10)
- return nullptr;
-
- if (!AllowMultipleUsers) {
- // If multiple users are using the root value, proceed with
- // simplification conservatively assuming that all elements
- // are needed.
- if (!V->hasOneUse()) {
- // Quit if we find multiple users of a non-root value though.
- // They'll be handled when it's their turn to be visited by
- // the main instcombine process.
- if (Depth != 0)
- // TODO: Just compute the UndefElts information recursively.
- return nullptr;
-
- // Conservatively assume that all elements are needed.
- DemandedElts = EltMask;
- }
- }
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return nullptr; // Only analyze instructions.
-
- bool MadeChange = false;
- auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum,
- APInt Demanded, APInt &Undef) {
- auto *II = dyn_cast<IntrinsicInst>(Inst);
- Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);
- if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) {
- replaceOperand(*Inst, OpNum, V);
- MadeChange = true;
- }
- };
-
- APInt UndefElts2(VWidth, 0);
- APInt UndefElts3(VWidth, 0);
- switch (I->getOpcode()) {
- default: break;
-
- case Instruction::GetElementPtr: {
- // The LangRef requires that struct geps have all constant indices. As
- // such, we can't convert any operand to partial undef.
- auto mayIndexStructType = [](GetElementPtrInst &GEP) {
- for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP);
- I != E; I++)
- if (I.isStruct())
-          return true;
- return false;
- };
- if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
- break;
-
- // Conservatively track the demanded elements back through any vector
- // operands we may have. We know there must be at least one, or we
- // wouldn't have a vector result to get here. Note that we intentionally
- // merge the undef bits here since gepping with either an undef base or
+ UndefElts.setBit(i);
+ }
+
+ // If we changed the constant, return it.
+ Constant *NewCV = ConstantVector::get(Elts);
+ return NewCV != C ? NewCV : nullptr;
+ }
+
+ // Limit search depth.
+ if (Depth == 10)
+ return nullptr;
+
+ if (!AllowMultipleUsers) {
+ // If multiple users are using the root value, proceed with
+ // simplification conservatively assuming that all elements
+ // are needed.
+ if (!V->hasOneUse()) {
+ // Quit if we find multiple users of a non-root value though.
+ // They'll be handled when it's their turn to be visited by
+ // the main instcombine process.
+ if (Depth != 0)
+ // TODO: Just compute the UndefElts information recursively.
+ return nullptr;
+
+ // Conservatively assume that all elements are needed.
+ DemandedElts = EltMask;
+ }
+ }
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return nullptr; // Only analyze instructions.
+
+ bool MadeChange = false;
+ auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum,
+ APInt Demanded, APInt &Undef) {
+ auto *II = dyn_cast<IntrinsicInst>(Inst);
+ Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);
+ if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) {
+ replaceOperand(*Inst, OpNum, V);
+ MadeChange = true;
+ }
+ };
+
+ APInt UndefElts2(VWidth, 0);
+ APInt UndefElts3(VWidth, 0);
+ switch (I->getOpcode()) {
+ default: break;
+
+ case Instruction::GetElementPtr: {
+ // The LangRef requires that struct geps have all constant indices. As
+ // such, we can't convert any operand to partial undef.
+ auto mayIndexStructType = [](GetElementPtrInst &GEP) {
+ for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP);
+ I != E; I++)
+ if (I.isStruct())
+          return true;
+ return false;
+ };
+ if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
+ break;
+
+ // Conservatively track the demanded elements back through any vector
+ // operands we may have. We know there must be at least one, or we
+ // wouldn't have a vector result to get here. Note that we intentionally
+ // merge the undef bits here since gepping with either an undef base or
// index results in undef.
- for (unsigned i = 0; i < I->getNumOperands(); i++) {
- if (isa<UndefValue>(I->getOperand(i))) {
- // If the entire vector is undefined, just return this info.
- UndefElts = EltMask;
- return nullptr;
- }
- if (I->getOperand(i)->getType()->isVectorTy()) {
- APInt UndefEltsOp(VWidth, 0);
- simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp);
- UndefElts |= UndefEltsOp;
- }
- }
-
- break;
- }
- case Instruction::InsertElement: {
-    // If this is a variable index, we don't know which element it overwrites,
-    // so demand exactly the same input as we produce.
- ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
- if (!Idx) {
- // Note that we can't propagate undef elt info, because we don't know
- // which elt is getting updated.
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts2);
- break;
- }
-
- // The element inserted overwrites whatever was there, so the input demanded
- // set is simpler than the output set.
- unsigned IdxNo = Idx->getZExtValue();
- APInt PreInsertDemandedElts = DemandedElts;
- if (IdxNo < VWidth)
- PreInsertDemandedElts.clearBit(IdxNo);
-
+ for (unsigned i = 0; i < I->getNumOperands(); i++) {
+ if (isa<UndefValue>(I->getOperand(i))) {
+ // If the entire vector is undefined, just return this info.
+ UndefElts = EltMask;
+ return nullptr;
+ }
+ if (I->getOperand(i)->getType()->isVectorTy()) {
+ APInt UndefEltsOp(VWidth, 0);
+ simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp);
+ UndefElts |= UndefEltsOp;
+ }
+ }
+
+ break;
+ }
+ case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites,
+    // so demand exactly the same input as we produce.
+ ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
+ if (!Idx) {
+ // Note that we can't propagate undef elt info, because we don't know
+ // which elt is getting updated.
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts2);
+ break;
+ }
+
+ // The element inserted overwrites whatever was there, so the input demanded
+ // set is simpler than the output set.
+ unsigned IdxNo = Idx->getZExtValue();
+ APInt PreInsertDemandedElts = DemandedElts;
+ if (IdxNo < VWidth)
+ PreInsertDemandedElts.clearBit(IdxNo);
+
// If we only demand the element that is being inserted and that element
// was extracted from the same index in another vector with the same type,
// replace this insert with that other vector.
@@ -1197,339 +1197,339 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
return Vec;
}
- simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
-
- // If this is inserting an element that isn't demanded, remove this
- // insertelement.
- if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
- Worklist.push(I);
- return I->getOperand(0);
- }
-
- // The inserted element is defined.
- UndefElts.clearBit(IdxNo);
- break;
- }
- case Instruction::ShuffleVector: {
- auto *Shuffle = cast<ShuffleVectorInst>(I);
- assert(Shuffle->getOperand(0)->getType() ==
- Shuffle->getOperand(1)->getType() &&
- "Expected shuffle operands to have same type");
+ simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
+
+ // If this is inserting an element that isn't demanded, remove this
+ // insertelement.
+ if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
+ Worklist.push(I);
+ return I->getOperand(0);
+ }
+
+ // The inserted element is defined.
+ UndefElts.clearBit(IdxNo);
+ break;
+ }
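The "not demanded" shortcut for insertelement can be emulated with plain arrays (a sketch of the semantics only, using std::array in place of an IR vector): if the written lane is never read, dropping the insert and using the original vector gives the same demanded lanes.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int32_t, 4> Vec = {1, 2, 3, 4};
  const unsigned IdxNo = 2;                                   // lane written by the insert
  std::array<bool, 4> Demanded = {true, true, false, true};   // lane 2 is never read

  std::array<int32_t, 4> Inserted = Vec;
  Inserted[IdxNo] = 42;                                       // the insertelement

  // Every demanded lane is identical with or without the insert, so the
  // insert can be dropped and the original vector used instead.
  for (unsigned i = 0; i < 4; ++i)
    if (Demanded[i])
      assert(Inserted[i] == Vec[i]);
  return 0;
}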
+ case Instruction::ShuffleVector: {
+ auto *Shuffle = cast<ShuffleVectorInst>(I);
+ assert(Shuffle->getOperand(0)->getType() ==
+ Shuffle->getOperand(1)->getType() &&
+ "Expected shuffle operands to have same type");
unsigned OpWidth = cast<FixedVectorType>(Shuffle->getOperand(0)->getType())
->getNumElements();
- // Handle trivial case of a splat. Only check the first element of LHS
- // operand.
- if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
- DemandedElts.isAllOnesValue()) {
- if (!isa<UndefValue>(I->getOperand(1))) {
- I->setOperand(1, UndefValue::get(I->getOperand(1)->getType()));
- MadeChange = true;
- }
- APInt LeftDemanded(OpWidth, 1);
- APInt LHSUndefElts(OpWidth, 0);
- simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
- if (LHSUndefElts[0])
- UndefElts = EltMask;
- else
- UndefElts.clearAllBits();
- break;
- }
-
- APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
- for (unsigned i = 0; i < VWidth; i++) {
- if (DemandedElts[i]) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (MaskVal != -1u) {
- assert(MaskVal < OpWidth * 2 &&
- "shufflevector mask index out of range!");
- if (MaskVal < OpWidth)
- LeftDemanded.setBit(MaskVal);
- else
- RightDemanded.setBit(MaskVal - OpWidth);
- }
- }
- }
-
- APInt LHSUndefElts(OpWidth, 0);
- simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
-
- APInt RHSUndefElts(OpWidth, 0);
- simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts);
-
- // If this shuffle does not change the vector length and the elements
- // demanded by this shuffle are an identity mask, then this shuffle is
- // unnecessary.
- //
- // We are assuming canonical form for the mask, so the source vector is
- // operand 0 and operand 1 is not used.
- //
- // Note that if an element is demanded and this shuffle mask is undefined
- // for that element, then the shuffle is not considered an identity
- // operation. The shuffle prevents poison from the operand vector from
- // leaking to the result by replacing poison with an undefined value.
- if (VWidth == OpWidth) {
- bool IsIdentityShuffle = true;
- for (unsigned i = 0; i < VWidth; i++) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (DemandedElts[i] && i != MaskVal) {
- IsIdentityShuffle = false;
- break;
- }
- }
- if (IsIdentityShuffle)
- return Shuffle->getOperand(0);
- }
-
- bool NewUndefElts = false;
- unsigned LHSIdx = -1u, LHSValIdx = -1u;
- unsigned RHSIdx = -1u, RHSValIdx = -1u;
- bool LHSUniform = true;
- bool RHSUniform = true;
- for (unsigned i = 0; i < VWidth; i++) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (MaskVal == -1u) {
- UndefElts.setBit(i);
- } else if (!DemandedElts[i]) {
- NewUndefElts = true;
- UndefElts.setBit(i);
- } else if (MaskVal < OpWidth) {
- if (LHSUndefElts[MaskVal]) {
- NewUndefElts = true;
- UndefElts.setBit(i);
- } else {
- LHSIdx = LHSIdx == -1u ? i : OpWidth;
- LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth;
- LHSUniform = LHSUniform && (MaskVal == i);
- }
- } else {
- if (RHSUndefElts[MaskVal - OpWidth]) {
- NewUndefElts = true;
- UndefElts.setBit(i);
- } else {
- RHSIdx = RHSIdx == -1u ? i : OpWidth;
- RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth;
- RHSUniform = RHSUniform && (MaskVal - OpWidth == i);
- }
- }
- }
-
- // Try to transform shuffle with constant vector and single element from
- // this constant vector to single insertelement instruction.
- // shufflevector V, C, <v1, v2, .., ci, .., vm> ->
- // insertelement V, C[ci], ci-n
+ // Handle trivial case of a splat. Only check the first element of LHS
+ // operand.
+ if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
+ DemandedElts.isAllOnesValue()) {
+ if (!isa<UndefValue>(I->getOperand(1))) {
+ I->setOperand(1, UndefValue::get(I->getOperand(1)->getType()));
+ MadeChange = true;
+ }
+ APInt LeftDemanded(OpWidth, 1);
+ APInt LHSUndefElts(OpWidth, 0);
+ simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
+ if (LHSUndefElts[0])
+ UndefElts = EltMask;
+ else
+ UndefElts.clearAllBits();
+ break;
+ }
+
+ APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
+ for (unsigned i = 0; i < VWidth; i++) {
+ if (DemandedElts[i]) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal != -1u) {
+ assert(MaskVal < OpWidth * 2 &&
+ "shufflevector mask index out of range!");
+ if (MaskVal < OpWidth)
+ LeftDemanded.setBit(MaskVal);
+ else
+ RightDemanded.setBit(MaskVal - OpWidth);
+ }
+ }
+ }
+
+ APInt LHSUndefElts(OpWidth, 0);
+ simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
+
+ APInt RHSUndefElts(OpWidth, 0);
+ simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts);
+
+ // If this shuffle does not change the vector length and the elements
+ // demanded by this shuffle are an identity mask, then this shuffle is
+ // unnecessary.
+ //
+ // We are assuming canonical form for the mask, so the source vector is
+ // operand 0 and operand 1 is not used.
+ //
+ // Note that if an element is demanded and this shuffle mask is undefined
+ // for that element, then the shuffle is not considered an identity
+ // operation. The shuffle prevents poison from the operand vector from
+ // leaking to the result by replacing poison with an undefined value.
+ if (VWidth == OpWidth) {
+ bool IsIdentityShuffle = true;
+ for (unsigned i = 0; i < VWidth; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (DemandedElts[i] && i != MaskVal) {
+ IsIdentityShuffle = false;
+ break;
+ }
+ }
+ if (IsIdentityShuffle)
+ return Shuffle->getOperand(0);
+ }
+
+ bool NewUndefElts = false;
+ unsigned LHSIdx = -1u, LHSValIdx = -1u;
+ unsigned RHSIdx = -1u, RHSValIdx = -1u;
+ bool LHSUniform = true;
+ bool RHSUniform = true;
+ for (unsigned i = 0; i < VWidth; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal == -1u) {
+ UndefElts.setBit(i);
+ } else if (!DemandedElts[i]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else if (MaskVal < OpWidth) {
+ if (LHSUndefElts[MaskVal]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else {
+ LHSIdx = LHSIdx == -1u ? i : OpWidth;
+ LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth;
+ LHSUniform = LHSUniform && (MaskVal == i);
+ }
+ } else {
+ if (RHSUndefElts[MaskVal - OpWidth]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else {
+ RHSIdx = RHSIdx == -1u ? i : OpWidth;
+ RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth;
+ RHSUniform = RHSUniform && (MaskVal - OpWidth == i);
+ }
+ }
+ }
+
+ // Try to transform shuffle with constant vector and single element from
+ // this constant vector to single insertelement instruction.
+ // shufflevector V, C, <v1, v2, .., ci, .., vm> ->
+ // insertelement V, C[ci], ci-n
if (OpWidth ==
cast<FixedVectorType>(Shuffle->getType())->getNumElements()) {
- Value *Op = nullptr;
- Constant *Value = nullptr;
- unsigned Idx = -1u;
-
- // Find constant vector with the single element in shuffle (LHS or RHS).
- if (LHSIdx < OpWidth && RHSUniform) {
- if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
- Op = Shuffle->getOperand(1);
- Value = CV->getOperand(LHSValIdx);
- Idx = LHSIdx;
- }
- }
- if (RHSIdx < OpWidth && LHSUniform) {
- if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
- Op = Shuffle->getOperand(0);
- Value = CV->getOperand(RHSValIdx);
- Idx = RHSIdx;
- }
- }
- // Found constant vector with single element - convert to insertelement.
- if (Op && Value) {
- Instruction *New = InsertElementInst::Create(
- Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
- Shuffle->getName());
- InsertNewInstWith(New, *Shuffle);
- return New;
- }
- }
- if (NewUndefElts) {
- // Add additional discovered undefs.
- SmallVector<int, 16> Elts;
- for (unsigned i = 0; i < VWidth; ++i) {
- if (UndefElts[i])
- Elts.push_back(UndefMaskElem);
- else
- Elts.push_back(Shuffle->getMaskValue(i));
- }
- Shuffle->setShuffleMask(Elts);
- MadeChange = true;
- }
- break;
- }
- case Instruction::Select: {
- // If this is a vector select, try to transform the select condition based
- // on the current demanded elements.
- SelectInst *Sel = cast<SelectInst>(I);
- if (Sel->getCondition()->getType()->isVectorTy()) {
- // TODO: We are not doing anything with UndefElts based on this call.
- // It is overwritten below based on the other select operands. If an
- // element of the select condition is known undef, then we are free to
- // choose the output value from either arm of the select. If we know that
- // one of those values is undef, then the output can be undef.
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
- }
-
- // Next, see if we can transform the arms of the select.
- APInt DemandedLHS(DemandedElts), DemandedRHS(DemandedElts);
- if (auto *CV = dyn_cast<ConstantVector>(Sel->getCondition())) {
- for (unsigned i = 0; i < VWidth; i++) {
- // isNullValue() always returns false when called on a ConstantExpr.
- // Skip constant expressions to avoid propagating incorrect information.
- Constant *CElt = CV->getAggregateElement(i);
- if (isa<ConstantExpr>(CElt))
- continue;
- // TODO: If a select condition element is undef, we can demand from
- // either side. If one side is known undef, choosing that side would
- // propagate undef.
- if (CElt->isNullValue())
- DemandedLHS.clearBit(i);
- else
- DemandedRHS.clearBit(i);
- }
- }
-
- simplifyAndSetOp(I, 1, DemandedLHS, UndefElts2);
- simplifyAndSetOp(I, 2, DemandedRHS, UndefElts3);
-
- // Output elements are undefined if the element from each arm is undefined.
- // TODO: This can be improved. See comment in select condition handling.
- UndefElts = UndefElts2 & UndefElts3;
- break;
- }
- case Instruction::BitCast: {
- // Vector->vector casts only.
- VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
- if (!VTy) break;
+ Value *Op = nullptr;
+ Constant *Value = nullptr;
+ unsigned Idx = -1u;
+
+ // Find constant vector with the single element in shuffle (LHS or RHS).
+ if (LHSIdx < OpWidth && RHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
+ Op = Shuffle->getOperand(1);
+ Value = CV->getOperand(LHSValIdx);
+ Idx = LHSIdx;
+ }
+ }
+ if (RHSIdx < OpWidth && LHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
+ Op = Shuffle->getOperand(0);
+ Value = CV->getOperand(RHSValIdx);
+ Idx = RHSIdx;
+ }
+ }
+ // Found constant vector with single element - convert to insertelement.
+ if (Op && Value) {
+ Instruction *New = InsertElementInst::Create(
+ Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
+ Shuffle->getName());
+ InsertNewInstWith(New, *Shuffle);
+ return New;
+ }
+ }
+ if (NewUndefElts) {
+ // Add additional discovered undefs.
+ SmallVector<int, 16> Elts;
+ for (unsigned i = 0; i < VWidth; ++i) {
+ if (UndefElts[i])
+ Elts.push_back(UndefMaskElem);
+ else
+ Elts.push_back(Shuffle->getMaskValue(i));
+ }
+ Shuffle->setShuffleMask(Elts);
+ MadeChange = true;
+ }
+ break;
+ }
+ case Instruction::Select: {
+ // If this is a vector select, try to transform the select condition based
+ // on the current demanded elements.
+ SelectInst *Sel = cast<SelectInst>(I);
+ if (Sel->getCondition()->getType()->isVectorTy()) {
+ // TODO: We are not doing anything with UndefElts based on this call.
+ // It is overwritten below based on the other select operands. If an
+ // element of the select condition is known undef, then we are free to
+ // choose the output value from either arm of the select. If we know that
+ // one of those values is undef, then the output can be undef.
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
+ }
+
+ // Next, see if we can transform the arms of the select.
+ APInt DemandedLHS(DemandedElts), DemandedRHS(DemandedElts);
+ if (auto *CV = dyn_cast<ConstantVector>(Sel->getCondition())) {
+ for (unsigned i = 0; i < VWidth; i++) {
+ // isNullValue() always returns false when called on a ConstantExpr.
+ // Skip constant expressions to avoid propagating incorrect information.
+ Constant *CElt = CV->getAggregateElement(i);
+ if (isa<ConstantExpr>(CElt))
+ continue;
+ // TODO: If a select condition element is undef, we can demand from
+ // either side. If one side is known undef, choosing that side would
+ // propagate undef.
+ if (CElt->isNullValue())
+ DemandedLHS.clearBit(i);
+ else
+ DemandedRHS.clearBit(i);
+ }
+ }
+
+ simplifyAndSetOp(I, 1, DemandedLHS, UndefElts2);
+ simplifyAndSetOp(I, 2, DemandedRHS, UndefElts3);
+
+ // Output elements are undefined if the element from each arm is undefined.
+ // TODO: This can be improved. See comment in select condition handling.
+ UndefElts = UndefElts2 & UndefElts3;
+ break;
+ }
+ case Instruction::BitCast: {
+ // Vector->vector casts only.
+ VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+ if (!VTy) break;
unsigned InVWidth = cast<FixedVectorType>(VTy)->getNumElements();
- APInt InputDemandedElts(InVWidth, 0);
- UndefElts2 = APInt(InVWidth, 0);
- unsigned Ratio;
-
- if (VWidth == InVWidth) {
- // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
- // elements as are demanded of us.
- Ratio = 1;
- InputDemandedElts = DemandedElts;
- } else if ((VWidth % InVWidth) == 0) {
- // If the number of elements in the output is a multiple of the number of
- // elements in the input then an input element is live if any of the
- // corresponding output elements are live.
- Ratio = VWidth / InVWidth;
- for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
- if (DemandedElts[OutIdx])
- InputDemandedElts.setBit(OutIdx / Ratio);
- } else if ((InVWidth % VWidth) == 0) {
- // If the number of elements in the input is a multiple of the number of
- // elements in the output then an input element is live if the
- // corresponding output element is live.
- Ratio = InVWidth / VWidth;
- for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
- if (DemandedElts[InIdx / Ratio])
- InputDemandedElts.setBit(InIdx);
- } else {
- // Unsupported so far.
- break;
- }
-
- simplifyAndSetOp(I, 0, InputDemandedElts, UndefElts2);
-
- if (VWidth == InVWidth) {
- UndefElts = UndefElts2;
- } else if ((VWidth % InVWidth) == 0) {
- // If the number of elements in the output is a multiple of the number of
- // elements in the input then an output element is undef if the
- // corresponding input element is undef.
- for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
- if (UndefElts2[OutIdx / Ratio])
- UndefElts.setBit(OutIdx);
- } else if ((InVWidth % VWidth) == 0) {
- // If the number of elements in the input is a multiple of the number of
- // elements in the output then an output element is undef if all of the
- // corresponding input elements are undef.
- for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
- APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
- if (SubUndef.countPopulation() == Ratio)
- UndefElts.setBit(OutIdx);
- }
- } else {
- llvm_unreachable("Unimp");
- }
- break;
- }
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
- break;
-
- case Instruction::Call: {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- if (!II) break;
- switch (II->getIntrinsicID()) {
- case Intrinsic::masked_gather: // fallthrough
- case Intrinsic::masked_load: {
- // Subtlety: If we load from a pointer, the pointer must be valid
- // regardless of whether the element is demanded. Doing otherwise risks
- // segfaults which didn't exist in the original program.
- APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
- DemandedPassThrough(DemandedElts);
- if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
- for (unsigned i = 0; i < VWidth; i++) {
- Constant *CElt = CV->getAggregateElement(i);
- if (CElt->isNullValue())
- DemandedPtrs.clearBit(i);
- else if (CElt->isAllOnesValue())
- DemandedPassThrough.clearBit(i);
- }
- if (II->getIntrinsicID() == Intrinsic::masked_gather)
- simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
- simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
-
- // Output elements are undefined if the elements from both sources are.
- // TODO: can strengthen via mask as well.
- UndefElts = UndefElts2 & UndefElts3;
- break;
- }
- default: {
+ APInt InputDemandedElts(InVWidth, 0);
+ UndefElts2 = APInt(InVWidth, 0);
+ unsigned Ratio;
+
+ if (VWidth == InVWidth) {
+ // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
+ // elements as are demanded of us.
+ Ratio = 1;
+ InputDemandedElts = DemandedElts;
+ } else if ((VWidth % InVWidth) == 0) {
+ // If the number of elements in the output is a multiple of the number of
+ // elements in the input then an input element is live if any of the
+ // corresponding output elements are live.
+ Ratio = VWidth / InVWidth;
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+ if (DemandedElts[OutIdx])
+ InputDemandedElts.setBit(OutIdx / Ratio);
+ } else if ((InVWidth % VWidth) == 0) {
+ // If the number of elements in the input is a multiple of the number of
+ // elements in the output then an input element is live if the
+ // corresponding output element is live.
+ Ratio = InVWidth / VWidth;
+ for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+ if (DemandedElts[InIdx / Ratio])
+ InputDemandedElts.setBit(InIdx);
+ } else {
+ // Unsupported so far.
+ break;
+ }
+
+ simplifyAndSetOp(I, 0, InputDemandedElts, UndefElts2);
+
+ if (VWidth == InVWidth) {
+ UndefElts = UndefElts2;
+ } else if ((VWidth % InVWidth) == 0) {
+ // If the number of elements in the output is a multiple of the number of
+ // elements in the input then an output element is undef if the
+ // corresponding input element is undef.
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+ if (UndefElts2[OutIdx / Ratio])
+ UndefElts.setBit(OutIdx);
+ } else if ((InVWidth % VWidth) == 0) {
+ // If the number of elements in the input is a multiple of the number of
+ // elements in the output then an output element is undef if all of the
+ // corresponding input elements are undef.
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+ APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
+ if (SubUndef.countPopulation() == Ratio)
+ UndefElts.setBit(OutIdx);
+ }
+ } else {
+ llvm_unreachable("Unimp");
+ }
+ break;
+ }
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
+ break;
+
+ case Instruction::Call: {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ if (!II) break;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_gather: // fallthrough
+ case Intrinsic::masked_load: {
+ // Subtlety: If we load from a pointer, the pointer must be valid
+ // regardless of whether the element is demanded. Doing otherwise risks
+ // segfaults which didn't exist in the original program.
+ APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
+ DemandedPassThrough(DemandedElts);
+ if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
+ for (unsigned i = 0; i < VWidth; i++) {
+ Constant *CElt = CV->getAggregateElement(i);
+ if (CElt->isNullValue())
+ DemandedPtrs.clearBit(i);
+ else if (CElt->isAllOnesValue())
+ DemandedPassThrough.clearBit(i);
+ }
+ if (II->getIntrinsicID() == Intrinsic::masked_gather)
+ simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
+ simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
+
+ // Output elements are undefined if the elements from both sources are.
+ // TODO: can strengthen via mask as well.
+ UndefElts = UndefElts2 & UndefElts3;
+ break;
+ }
+ default: {
// Handle target specific intrinsics
Optional<Value *> V = targetSimplifyDemandedVectorEltsIntrinsic(
*II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
simplifyAndSetOp);
if (V.hasValue())
return V.getValue();
- break;
- }
- } // switch on IntrinsicID
- break;
- } // case Call
- } // switch on Opcode
-
- // TODO: We bail completely on integer div/rem and shifts because they have
- // UB/poison potential, but that should be refined.
- BinaryOperator *BO;
- if (match(I, m_BinOp(BO)) && !BO->isIntDivRem() && !BO->isShift()) {
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
- simplifyAndSetOp(I, 1, DemandedElts, UndefElts2);
-
- // Any change to an instruction with potential poison must clear those flags
- // because we can not guarantee those constraints now. Other analysis may
- // determine that it is safe to re-apply the flags.
- if (MadeChange)
- BO->dropPoisonGeneratingFlags();
-
- // Output elements are undefined if both are undefined. Consider things
- // like undef & 0. The result is known zero, not undef.
- UndefElts &= UndefElts2;
- }
-
- // If we've proven all of the lanes undef, return an undef value.
- // TODO: Intersect w/demanded lanes
- if (UndefElts.isAllOnesValue())
- return UndefValue::get(I->getType());
-
- return MadeChange ? I : nullptr;
-}
+ break;
+ }
+ } // switch on IntrinsicID
+ break;
+ } // case Call
+ } // switch on Opcode
+
+ // TODO: We bail completely on integer div/rem and shifts because they have
+ // UB/poison potential, but that should be refined.
+ BinaryOperator *BO;
+ if (match(I, m_BinOp(BO)) && !BO->isIntDivRem() && !BO->isShift()) {
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
+ simplifyAndSetOp(I, 1, DemandedElts, UndefElts2);
+
+ // Any change to an instruction with potential poison must clear those flags
+ // because we can not guarantee those constraints now. Other analysis may
+ // determine that it is safe to re-apply the flags.
+ if (MadeChange)
+ BO->dropPoisonGeneratingFlags();
+
+ // Output elements are undefined if both are undefined. Consider things
+ // like undef & 0. The result is known zero, not undef.
+ UndefElts &= UndefElts2;
+ }
+
+ // If we've proven all of the lanes undef, return an undef value.
+ // TODO: Intersect w/demanded lanes
+ if (UndefElts.isAllOnesValue())
+ return UndefValue::get(I->getType());
+
+ return MadeChange ? I : nullptr;
+}
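A hypothetical textual-IR sketch of the shufflevector-to-insertelement fold implemented above; the names %v and %r are invented for illustration and the constant operand is a simple splat chosen for brevity:

  ; before: lane 2 is the only lane taken from the constant operand,
  ; every other lane keeps its position in %v
  %r = shufflevector <4 x i32> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>,
                     <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ; after: the single used constant element C[2] (= 7) is inserted at index 2
  %r = insertelement <4 x i32> %v, i32 7, i32 2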
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 42c981566c..06f22cdfb6 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1,710 +1,710 @@
-//===- InstCombineVectorOps.cpp -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements instcombine for ExtractElement, InsertElement and
-// ShuffleVector.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallVector.h"
+//===- InstCombineVectorOps.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements instcombine for ExtractElement, InsertElement and
+// ShuffleVector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
STATISTIC(NumAggregateReconstructionsSimplified,
"Number of aggregate reconstructions turned into reuse of the "
"original aggregate");
-/// Return true if the value is cheaper to scalarize than it is to leave as a
-/// vector operation. IsConstantExtractIndex indicates whether we are extracting
-/// one known element from a vector constant.
-///
-/// FIXME: It's possible to create more instructions than previously existed.
-static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
- // If we can pick a scalar constant value out of a vector, that is free.
- if (auto *C = dyn_cast<Constant>(V))
- return IsConstantExtractIndex || C->getSplatValue();
-
- // An insertelement to the same constant index as our extract will simplify
- // to the scalar inserted element. An insertelement to a different constant
- // index is irrelevant to our extract.
- if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
- return IsConstantExtractIndex;
-
- if (match(V, m_OneUse(m_Load(m_Value()))))
- return true;
-
- if (match(V, m_OneUse(m_UnOp())))
- return true;
-
- Value *V0, *V1;
- if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
- if (cheapToScalarize(V0, IsConstantExtractIndex) ||
- cheapToScalarize(V1, IsConstantExtractIndex))
- return true;
-
- CmpInst::Predicate UnusedPred;
- if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
- if (cheapToScalarize(V0, IsConstantExtractIndex) ||
- cheapToScalarize(V1, IsConstantExtractIndex))
- return true;
-
- return false;
-}
-
-// If we have a PHI node with a vector type that is only used to feed
-// itself and be an operand of extractelement at a constant location,
-// try to replace the PHI of the vector type with a PHI of a scalar type.
+/// Return true if the value is cheaper to scalarize than it is to leave as a
+/// vector operation. IsConstantExtractIndex indicates whether we are extracting
+/// one known element from a vector constant.
+///
+/// FIXME: It's possible to create more instructions than previously existed.
+static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
+ // If we can pick a scalar constant value out of a vector, that is free.
+ if (auto *C = dyn_cast<Constant>(V))
+ return IsConstantExtractIndex || C->getSplatValue();
+
+ // An insertelement to the same constant index as our extract will simplify
+ // to the scalar inserted element. An insertelement to a different constant
+ // index is irrelevant to our extract.
+ if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
+ return IsConstantExtractIndex;
+
+ if (match(V, m_OneUse(m_Load(m_Value()))))
+ return true;
+
+ if (match(V, m_OneUse(m_UnOp())))
+ return true;
+
+ Value *V0, *V1;
+ if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
+ if (cheapToScalarize(V0, IsConstantExtractIndex) ||
+ cheapToScalarize(V1, IsConstantExtractIndex))
+ return true;
+
+ CmpInst::Predicate UnusedPred;
+ if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
+ if (cheapToScalarize(V0, IsConstantExtractIndex) ||
+ cheapToScalarize(V1, IsConstantExtractIndex))
+ return true;
+
+ return false;
+}
+
+// If we have a PHI node with a vector type that is only used to feed
+// itself and be an operand of extractelement at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type.
Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,
PHINode *PN) {
- SmallVector<Instruction *, 2> Extracts;
- // The users we want the PHI to have are:
- // 1) The EI ExtractElement (we already know this)
- // 2) Possibly more ExtractElements with the same index.
- // 3) Another operand, which will feed back into the PHI.
- Instruction *PHIUser = nullptr;
- for (auto U : PN->users()) {
- if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
- if (EI.getIndexOperand() == EU->getIndexOperand())
- Extracts.push_back(EU);
- else
- return nullptr;
- } else if (!PHIUser) {
- PHIUser = cast<Instruction>(U);
- } else {
- return nullptr;
- }
- }
-
- if (!PHIUser)
- return nullptr;
-
- // Verify that this PHI user has one use, which is the PHI itself,
- // and that it is a binary operation which is cheap to scalarize.
- // Otherwise, return nullptr.
- if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
- !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
- return nullptr;
-
- // Create a scalar PHI node that will replace the vector PHI node
- // just before the current PHI node.
- PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
- PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
- // Scalarize each PHI operand.
- for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
- Value *PHIInVal = PN->getIncomingValue(i);
- BasicBlock *inBB = PN->getIncomingBlock(i);
- Value *Elt = EI.getIndexOperand();
- // If the operand is the PHI induction variable:
- if (PHIInVal == PHIUser) {
- // Scalarize the binary operation. Its first operand is the
- // scalar PHI, and the second operand is extracted from the other
- // vector operand.
- BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
- unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
- Value *Op = InsertNewInstWith(
- ExtractElementInst::Create(B0->getOperand(opId), Elt,
- B0->getOperand(opId)->getName() + ".Elt"),
- *B0);
- Value *newPHIUser = InsertNewInstWith(
- BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
- scalarPHI, Op, B0), *B0);
- scalarPHI->addIncoming(newPHIUser, inBB);
- } else {
- // Scalarize PHI input:
- Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
- // Insert the new instruction into the predecessor basic block.
- Instruction *pos = dyn_cast<Instruction>(PHIInVal);
- BasicBlock::iterator InsertPos;
- if (pos && !isa<PHINode>(pos)) {
- InsertPos = ++pos->getIterator();
- } else {
- InsertPos = inBB->getFirstInsertionPt();
- }
-
- InsertNewInstWith(newEI, *InsertPos);
-
- scalarPHI->addIncoming(newEI, inBB);
- }
- }
-
- for (auto E : Extracts)
- replaceInstUsesWith(*E, scalarPHI);
-
- return &EI;
-}
-
-static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
- InstCombiner::BuilderTy &Builder,
- bool IsBigEndian) {
- Value *X;
- uint64_t ExtIndexC;
- if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||
- !X->getType()->isVectorTy() ||
- !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))
- return nullptr;
-
- // If this extractelement is using a bitcast from a vector of the same number
- // of elements, see if we can find the source element from the source vector:
- // extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
- auto *SrcTy = cast<VectorType>(X->getType());
- Type *DestTy = Ext.getType();
+ SmallVector<Instruction *, 2> Extracts;
+ // The users we want the PHI to have are:
+ // 1) The EI ExtractElement (we already know this)
+ // 2) Possibly more ExtractElements with the same index.
+ // 3) Another operand, which will feed back into the PHI.
+ Instruction *PHIUser = nullptr;
+ for (auto U : PN->users()) {
+ if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
+ if (EI.getIndexOperand() == EU->getIndexOperand())
+ Extracts.push_back(EU);
+ else
+ return nullptr;
+ } else if (!PHIUser) {
+ PHIUser = cast<Instruction>(U);
+ } else {
+ return nullptr;
+ }
+ }
+
+ if (!PHIUser)
+ return nullptr;
+
+ // Verify that this PHI user has one use, which is the PHI itself,
+ // and that it is a binary operation which is cheap to scalarize.
+ // Otherwise, return nullptr.
+ if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
+ !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
+ return nullptr;
+
+ // Create a scalar PHI node that will replace the vector PHI node
+ // just before the current PHI node.
+ PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
+ PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
+ // Scalarize each PHI operand.
+ for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+ Value *PHIInVal = PN->getIncomingValue(i);
+ BasicBlock *inBB = PN->getIncomingBlock(i);
+ Value *Elt = EI.getIndexOperand();
+ // If the operand is the PHI induction variable:
+ if (PHIInVal == PHIUser) {
+ // Scalarize the binary operation. Its first operand is the
+ // scalar PHI, and the second operand is extracted from the other
+ // vector operand.
+ BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
+ unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
+ Value *Op = InsertNewInstWith(
+ ExtractElementInst::Create(B0->getOperand(opId), Elt,
+ B0->getOperand(opId)->getName() + ".Elt"),
+ *B0);
+ Value *newPHIUser = InsertNewInstWith(
+ BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
+ scalarPHI, Op, B0), *B0);
+ scalarPHI->addIncoming(newPHIUser, inBB);
+ } else {
+ // Scalarize PHI input:
+ Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
+ // Insert the new instruction into the predecessor basic block.
+ Instruction *pos = dyn_cast<Instruction>(PHIInVal);
+ BasicBlock::iterator InsertPos;
+ if (pos && !isa<PHINode>(pos)) {
+ InsertPos = ++pos->getIterator();
+ } else {
+ InsertPos = inBB->getFirstInsertionPt();
+ }
+
+ InsertNewInstWith(newEI, *InsertPos);
+
+ scalarPHI->addIncoming(newEI, inBB);
+ }
+ }
+
+ for (auto E : Extracts)
+ replaceInstUsesWith(*E, scalarPHI);
+
+ return &EI;
+}
+
+static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
+ InstCombiner::BuilderTy &Builder,
+ bool IsBigEndian) {
+ Value *X;
+ uint64_t ExtIndexC;
+ if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||
+ !X->getType()->isVectorTy() ||
+ !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))
+ return nullptr;
+
+ // If this extractelement is using a bitcast from a vector of the same number
+ // of elements, see if we can find the source element from the source vector:
+ // extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
+ auto *SrcTy = cast<VectorType>(X->getType());
+ Type *DestTy = Ext.getType();
ElementCount NumSrcElts = SrcTy->getElementCount();
ElementCount NumElts =
cast<VectorType>(Ext.getVectorOperandType())->getElementCount();
- if (NumSrcElts == NumElts)
- if (Value *Elt = findScalarElement(X, ExtIndexC))
- return new BitCastInst(Elt, DestTy);
-
+ if (NumSrcElts == NumElts)
+ if (Value *Elt = findScalarElement(X, ExtIndexC))
+ return new BitCastInst(Elt, DestTy);
+
assert(NumSrcElts.isScalable() == NumElts.isScalable() &&
"Src and Dst must be the same sort of vector type");
- // If the source elements are wider than the destination, try to shift and
- // truncate a subset of scalar bits of an insert op.
+ // If the source elements are wider than the destination, try to shift and
+ // truncate a subset of scalar bits of an insert op.
if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) {
- Value *Scalar;
- uint64_t InsIndexC;
- if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar),
- m_ConstantInt(InsIndexC))))
- return nullptr;
-
- // The extract must be from the subset of vector elements that we inserted
- // into. Example: if we inserted element 1 of a <2 x i64> and we are
- // extracting an i16 (narrowing ratio = 4), then this extract must be from 1
- // of elements 4-7 of the bitcasted vector.
+ Value *Scalar;
+ uint64_t InsIndexC;
+ if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar),
+ m_ConstantInt(InsIndexC))))
+ return nullptr;
+
+ // The extract must be from the subset of vector elements that we inserted
+ // into. Example: if we inserted element 1 of a <2 x i64> and we are
+ // extracting an i16 (narrowing ratio = 4), then this extract must be from 1
+ // of elements 4-7 of the bitcasted vector.
unsigned NarrowingRatio =
NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue();
- if (ExtIndexC / NarrowingRatio != InsIndexC)
- return nullptr;
-
- // We are extracting part of the original scalar. How that scalar is
- // inserted into the vector depends on the endian-ness. Example:
- // Vector Byte Elt Index: 0 1 2 3 4 5 6 7
- // +--+--+--+--+--+--+--+--+
- // inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3|
- // extelt <4 x i16> V', 3: | |S2|S3|
- // +--+--+--+--+--+--+--+--+
- // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value.
- // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value.
- // In this example, we must right-shift little-endian. Big-endian is just a
- // truncate.
- unsigned Chunk = ExtIndexC % NarrowingRatio;
- if (IsBigEndian)
- Chunk = NarrowingRatio - 1 - Chunk;
-
- // Bail out if this is an FP vector to FP vector sequence. That would take
- // more instructions than we started with unless there is no shift, and it
- // may not be handled as well in the backend.
- bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();
- bool NeedDestBitcast = DestTy->isFloatingPointTy();
- if (NeedSrcBitcast && NeedDestBitcast)
- return nullptr;
-
- unsigned SrcWidth = SrcTy->getScalarSizeInBits();
- unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
- unsigned ShAmt = Chunk * DestWidth;
-
- // TODO: This limitation is more strict than necessary. We could sum the
- // number of new instructions and subtract the number eliminated to know if
- // we can proceed.
- if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())
- if (NeedSrcBitcast || NeedDestBitcast)
- return nullptr;
-
- if (NeedSrcBitcast) {
- Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);
- Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);
- }
-
- if (ShAmt) {
- // Bail out if we could end with more instructions than we started with.
- if (!Ext.getVectorOperand()->hasOneUse())
- return nullptr;
- Scalar = Builder.CreateLShr(Scalar, ShAmt);
- }
-
- if (NeedDestBitcast) {
- Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);
- return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);
- }
- return new TruncInst(Scalar, DestTy);
- }
-
- return nullptr;
-}
-
-/// Find elements of V demanded by UserInstr.
-static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
+ if (ExtIndexC / NarrowingRatio != InsIndexC)
+ return nullptr;
+
+ // We are extracting part of the original scalar. How that scalar is
+ // inserted into the vector depends on the endian-ness. Example:
+ // Vector Byte Elt Index: 0 1 2 3 4 5 6 7
+ // +--+--+--+--+--+--+--+--+
+ // inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3|
+ // extelt <4 x i16> V', 3: | |S2|S3|
+ // +--+--+--+--+--+--+--+--+
+ // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value.
+ // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value.
+ // In this example, we must right-shift little-endian. Big-endian is just a
+ // truncate.
+ unsigned Chunk = ExtIndexC % NarrowingRatio;
+ if (IsBigEndian)
+ Chunk = NarrowingRatio - 1 - Chunk;
+
+ // Bail out if this is an FP vector to FP vector sequence. That would take
+ // more instructions than we started with unless there is no shift, and it
+ // may not be handled as well in the backend.
+ bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();
+ bool NeedDestBitcast = DestTy->isFloatingPointTy();
+ if (NeedSrcBitcast && NeedDestBitcast)
+ return nullptr;
+
+ unsigned SrcWidth = SrcTy->getScalarSizeInBits();
+ unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+ unsigned ShAmt = Chunk * DestWidth;
+
+ // TODO: This limitation is more strict than necessary. We could sum the
+ // number of new instructions and subtract the number eliminated to know if
+ // we can proceed.
+ if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())
+ if (NeedSrcBitcast || NeedDestBitcast)
+ return nullptr;
+
+ if (NeedSrcBitcast) {
+ Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);
+ Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);
+ }
+
+ if (ShAmt) {
+ // Bail out if we could end with more instructions than we started with.
+ if (!Ext.getVectorOperand()->hasOneUse())
+ return nullptr;
+ Scalar = Builder.CreateLShr(Scalar, ShAmt);
+ }
+
+ if (NeedDestBitcast) {
+ Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);
+ return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);
+ }
+ return new TruncInst(Scalar, DestTy);
+ }
+
+ return nullptr;
+}
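A hypothetical little-endian sketch of the insert/bitcast/extract narrowing handled above (value names invented for illustration). Extracting i16 lane 3 of the bitcast reads the high half of the freshly inserted i32, so the extract reduces to a shift plus truncate of that scalar:

  ; before
  %x = insertelement <2 x i32> %v, i32 %s, i32 1
  %b = bitcast <2 x i32> %x to <4 x i16>
  %e = extractelement <4 x i16> %b, i32 3
  ; after (little-endian: ExtIndexC / NarrowingRatio == InsIndexC, Chunk == 1)
  %hi = lshr i32 %s, 16
  %e  = trunc i32 %hi to i16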
+
+/// Find elements of V demanded by UserInstr.
+static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
-
- // Conservatively assume that all elements are needed.
- APInt UsedElts(APInt::getAllOnesValue(VWidth));
-
- switch (UserInstr->getOpcode()) {
- case Instruction::ExtractElement: {
- ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
- assert(EEI->getVectorOperand() == V);
- ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
- if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
- UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
- }
- break;
- }
- case Instruction::ShuffleVector: {
- ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
- unsigned MaskNumElts =
+
+ // Conservatively assume that all elements are needed.
+ APInt UsedElts(APInt::getAllOnesValue(VWidth));
+
+ switch (UserInstr->getOpcode()) {
+ case Instruction::ExtractElement: {
+ ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
+ assert(EEI->getVectorOperand() == V);
+ ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
+ if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
+ UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
+ }
+ break;
+ }
+ case Instruction::ShuffleVector: {
+ ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
+ unsigned MaskNumElts =
cast<FixedVectorType>(UserInstr->getType())->getNumElements();
-
- UsedElts = APInt(VWidth, 0);
- for (unsigned i = 0; i < MaskNumElts; i++) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (MaskVal == -1u || MaskVal >= 2 * VWidth)
- continue;
- if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
- UsedElts.setBit(MaskVal);
- if (Shuffle->getOperand(1) == V &&
- ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
- UsedElts.setBit(MaskVal - VWidth);
- }
- break;
- }
- default:
- break;
- }
- return UsedElts;
-}
-
-/// Find union of elements of V demanded by all its users.
-/// If it is known by querying findDemandedEltsBySingleUser that
-/// no user demands an element of V, then the corresponding bit
-/// remains unset in the returned value.
-static APInt findDemandedEltsByAllUsers(Value *V) {
+
+ UsedElts = APInt(VWidth, 0);
+ for (unsigned i = 0; i < MaskNumElts; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal == -1u || MaskVal >= 2 * VWidth)
+ continue;
+ if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
+ UsedElts.setBit(MaskVal);
+ if (Shuffle->getOperand(1) == V &&
+ ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
+ UsedElts.setBit(MaskVal - VWidth);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return UsedElts;
+}
+
+/// Find union of elements of V demanded by all its users.
+/// If it is known by querying findDemandedEltsBySingleUser that
+/// no user demands an element of V, then the corresponding bit
+/// remains unset in the returned value.
+static APInt findDemandedEltsByAllUsers(Value *V) {
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
-
- APInt UnionUsedElts(VWidth, 0);
- for (const Use &U : V->uses()) {
- if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
- UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
- } else {
- UnionUsedElts = APInt::getAllOnesValue(VWidth);
- break;
- }
-
- if (UnionUsedElts.isAllOnesValue())
- break;
- }
-
- return UnionUsedElts;
-}
-
+
+ APInt UnionUsedElts(VWidth, 0);
+ for (const Use &U : V->uses()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
+ UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
+ } else {
+ UnionUsedElts = APInt::getAllOnesValue(VWidth);
+ break;
+ }
+
+ if (UnionUsedElts.isAllOnesValue())
+ break;
+ }
+
+ return UnionUsedElts;
+}
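A hypothetical sketch of how the demanded-element union above is used (names invented). If every user of %v extracts only lane 0 or lane 2, the union is {0, 2}, so a trailing insert into lane 3 is not demanded and would be expected to be removed via SimplifyDemandedVectorElts:

  %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 2
  %v  = insertelement <4 x i32> %v1, i32 %c, i32 3   ; lane 3 is never read
  %e0 = extractelement <4 x i32> %v, i32 0
  %e2 = extractelement <4 x i32> %v, i32 2
  ; expected: uses of %v are rewritten to %v1 and the lane-3 insert becomes dead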
+
Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
- Value *SrcVec = EI.getVectorOperand();
- Value *Index = EI.getIndexOperand();
- if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
- SQ.getWithInstruction(&EI)))
- return replaceInstUsesWith(EI, V);
-
- // If extracting a specified index from the vector, see if we can recursively
- // find a previously computed scalar that was inserted into the vector.
- auto *IndexC = dyn_cast<ConstantInt>(Index);
- if (IndexC) {
- ElementCount EC = EI.getVectorOperandType()->getElementCount();
+ Value *SrcVec = EI.getVectorOperand();
+ Value *Index = EI.getIndexOperand();
+ if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
+ SQ.getWithInstruction(&EI)))
+ return replaceInstUsesWith(EI, V);
+
+ // If extracting a specified index from the vector, see if we can recursively
+ // find a previously computed scalar that was inserted into the vector.
+ auto *IndexC = dyn_cast<ConstantInt>(Index);
+ if (IndexC) {
+ ElementCount EC = EI.getVectorOperandType()->getElementCount();
unsigned NumElts = EC.getKnownMinValue();
-
- // InstSimplify should handle cases where the index is invalid.
- // For a fixed-length vector, it's invalid to extract an out-of-range element.
+
+ // InstSimplify should handle cases where the index is invalid.
+ // For a fixed-length vector, it's invalid to extract an out-of-range element.
if (!EC.isScalable() && IndexC->getValue().uge(NumElts))
- return nullptr;
-
- // This instruction only demands the single element from the input vector.
- // Skip scalable types, where the number of elements is unknown at
- // compile time.
+ return nullptr;
+
+ // This instruction only demands the single element from the input vector.
+ // Skip scalable types, where the number of elements is unknown at
+ // compile time.
if (!EC.isScalable() && NumElts != 1) {
- // If the input vector has a single use, simplify it based on this use
- // property.
- if (SrcVec->hasOneUse()) {
- APInt UndefElts(NumElts, 0);
- APInt DemandedElts(NumElts, 0);
- DemandedElts.setBit(IndexC->getZExtValue());
- if (Value *V =
- SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
- return replaceOperand(EI, 0, V);
- } else {
- // If the input vector has multiple uses, simplify it based on a union
- // of all elements used.
- APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
- if (!DemandedElts.isAllOnesValue()) {
- APInt UndefElts(NumElts, 0);
- if (Value *V = SimplifyDemandedVectorElts(
- SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
- true /* AllowMultipleUsers */)) {
- if (V != SrcVec) {
- SrcVec->replaceAllUsesWith(V);
- return &EI;
- }
- }
- }
- }
- }
- if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
- return I;
-
- // If there's a vector PHI feeding a scalar use through this extractelement
- // instruction, try to scalarize the PHI.
- if (auto *Phi = dyn_cast<PHINode>(SrcVec))
- if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))
- return ScalarPHI;
- }
-
- // TODO: come up with an n-ary matcher that subsumes both unary and
- // binary matchers.
- UnaryOperator *UO;
- if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
- // extelt (unop X), Index --> unop (extelt X, Index)
- Value *X = UO->getOperand(0);
- Value *E = Builder.CreateExtractElement(X, Index);
- return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);
- }
-
- BinaryOperator *BO;
- if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
- // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
- Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
- Value *E0 = Builder.CreateExtractElement(X, Index);
- Value *E1 = Builder.CreateExtractElement(Y, Index);
- return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);
- }
-
- Value *X, *Y;
- CmpInst::Predicate Pred;
- if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
- cheapToScalarize(SrcVec, IndexC)) {
- // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
- Value *E0 = Builder.CreateExtractElement(X, Index);
- Value *E1 = Builder.CreateExtractElement(Y, Index);
- return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
- }
-
- if (auto *I = dyn_cast<Instruction>(SrcVec)) {
- if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- // Extracting the inserted element?
- if (IE->getOperand(2) == Index)
- return replaceInstUsesWith(EI, IE->getOperand(1));
- // If the inserted and extracted elements are constants, they must not
- // be the same value; extract from the pre-inserted value instead.
- if (isa<Constant>(IE->getOperand(2)) && IndexC)
- return replaceOperand(EI, 0, IE->getOperand(0));
- } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
- // If this is extracting an element from a shufflevector, figure out where
- // it came from and extract from the appropriate input element instead.
- // Restrict the following transformation to fixed-length vector.
- if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {
- int SrcIdx =
- SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());
- Value *Src;
- unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())
- ->getNumElements();
-
- if (SrcIdx < 0)
- return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
- if (SrcIdx < (int)LHSWidth)
- Src = SVI->getOperand(0);
- else {
- SrcIdx -= LHSWidth;
- Src = SVI->getOperand(1);
- }
- Type *Int32Ty = Type::getInt32Ty(EI.getContext());
- return ExtractElementInst::Create(
- Src, ConstantInt::get(Int32Ty, SrcIdx, false));
- }
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- // Canonicalize extractelement(cast) -> cast(extractelement).
- // Bitcasts can change the number of vector elements, and they cost
- // nothing.
- if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
- Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
- return CastInst::Create(CI->getOpcode(), EE, EI.getType());
- }
- }
- }
- return nullptr;
-}
-
-/// If V is a shuffle of values that ONLY returns elements from either LHS or
-/// RHS, return the shuffle mask and true. Otherwise, return false.
-static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
- SmallVectorImpl<int> &Mask) {
- assert(LHS->getType() == RHS->getType() &&
- "Invalid CollectSingleShuffleElements");
+ // If the input vector has a single use, simplify it based on this use
+ // property.
+ if (SrcVec->hasOneUse()) {
+ APInt UndefElts(NumElts, 0);
+ APInt DemandedElts(NumElts, 0);
+ DemandedElts.setBit(IndexC->getZExtValue());
+ if (Value *V =
+ SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
+ return replaceOperand(EI, 0, V);
+ } else {
+ // If the input vector has multiple uses, simplify it based on a union
+ // of all elements used.
+ APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
+ if (!DemandedElts.isAllOnesValue()) {
+ APInt UndefElts(NumElts, 0);
+ if (Value *V = SimplifyDemandedVectorElts(
+ SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
+ true /* AllowMultipleUsers */)) {
+ if (V != SrcVec) {
+ SrcVec->replaceAllUsesWith(V);
+ return &EI;
+ }
+ }
+ }
+ }
+ }
+ if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
+ return I;
+
+ // If there's a vector PHI feeding a scalar use through this extractelement
+ // instruction, try to scalarize the PHI.
+ if (auto *Phi = dyn_cast<PHINode>(SrcVec))
+ if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))
+ return ScalarPHI;
+ }
+
+ // TODO: come up with an n-ary matcher that subsumes both unary and
+ // binary matchers.
+ UnaryOperator *UO;
+ if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
+ // extelt (unop X), Index --> unop (extelt X, Index)
+ Value *X = UO->getOperand(0);
+ Value *E = Builder.CreateExtractElement(X, Index);
+ return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);
+ }
+
+ BinaryOperator *BO;
+ if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
+ // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
+ Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
+ Value *E0 = Builder.CreateExtractElement(X, Index);
+ Value *E1 = Builder.CreateExtractElement(Y, Index);
+ return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);
+ }
+
+ Value *X, *Y;
+ CmpInst::Predicate Pred;
+ if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
+ cheapToScalarize(SrcVec, IndexC)) {
+ // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
+ Value *E0 = Builder.CreateExtractElement(X, Index);
+ Value *E1 = Builder.CreateExtractElement(Y, Index);
+ return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
+ }
+
+ if (auto *I = dyn_cast<Instruction>(SrcVec)) {
+ if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ // Extracting the inserted element?
+ if (IE->getOperand(2) == Index)
+ return replaceInstUsesWith(EI, IE->getOperand(1));
+ // If the inserted and extracted elements are constants, they must not
+ // be the same value; extract from the pre-inserted value instead.
+ if (isa<Constant>(IE->getOperand(2)) && IndexC)
+ return replaceOperand(EI, 0, IE->getOperand(0));
+ } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+ // If this is extracting an element from a shufflevector, figure out where
+ // it came from and extract from the appropriate input element instead.
+ // Restrict the following transformation to fixed-length vector.
+ if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {
+ int SrcIdx =
+ SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());
+ Value *Src;
+ unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())
+ ->getNumElements();
+
+ if (SrcIdx < 0)
+ return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+ if (SrcIdx < (int)LHSWidth)
+ Src = SVI->getOperand(0);
+ else {
+ SrcIdx -= LHSWidth;
+ Src = SVI->getOperand(1);
+ }
+ Type *Int32Ty = Type::getInt32Ty(EI.getContext());
+ return ExtractElementInst::Create(
+ Src, ConstantInt::get(Int32Ty, SrcIdx, false));
+ }
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ // Canonicalize extractelement(cast) -> cast(extractelement).
+ // Bitcasts can change the number of vector elements, and they cost
+ // nothing.
+ if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
+ Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
+ return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+ }
+ }
+ }
+ return nullptr;
+}
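A hypothetical sketch of the extract-of-binop scalarization performed by the visitor above (names invented; one operand is a constant splat so that cheapToScalarize accepts the binary operator):

  ; before
  %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %e   = extractelement <4 x i32> %add, i32 0
  ; after (the extract of the constant operand folds to 1)
  %x0 = extractelement <4 x i32> %x, i32 0
  %e  = add i32 %x0, 1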
+
+/// If V is a shuffle of values that ONLY returns elements from either LHS or
+/// RHS, return the shuffle mask and true. Otherwise, return false.
+static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+ SmallVectorImpl<int> &Mask) {
+ assert(LHS->getType() == RHS->getType() &&
+ "Invalid CollectSingleShuffleElements");
unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
-
- if (isa<UndefValue>(V)) {
- Mask.assign(NumElts, -1);
- return true;
- }
-
- if (V == LHS) {
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i);
- return true;
- }
-
- if (V == RHS) {
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i + NumElts);
- return true;
- }
-
- if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
- // If this is an insert of an extract from some other vector, include it.
- Value *VecOp = IEI->getOperand(0);
- Value *ScalarOp = IEI->getOperand(1);
- Value *IdxOp = IEI->getOperand(2);
-
- if (!isa<ConstantInt>(IdxOp))
- return false;
- unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
-
- if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
- // We can handle this if the vector we are inserting into is
- // transitively ok.
- if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
- // If so, update the mask to reflect the inserted undef.
- Mask[InsertedIdx] = -1;
- return true;
- }
- } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
- if (isa<ConstantInt>(EI->getOperand(1))) {
- unsigned ExtractedIdx =
- cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
- unsigned NumLHSElts =
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, -1);
+ return true;
+ }
+
+ if (V == LHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i);
+ return true;
+ }
+
+ if (V == RHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i + NumElts);
+ return true;
+ }
+
+ if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (!isa<ConstantInt>(IdxOp))
+ return false;
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
+ // We can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted undef.
+ Mask[InsertedIdx] = -1;
+ return true;
+ }
+ } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+ if (isa<ConstantInt>(EI->getOperand(1))) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned NumLHSElts =
cast<FixedVectorType>(LHS->getType())->getNumElements();
-
- // This must be extracting from either LHS or RHS.
- if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
- // We can handle this if the vector we are inserting into is
- // transitively ok.
- if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
- // If so, update the mask to reflect the inserted value.
- if (EI->getOperand(0) == LHS) {
- Mask[InsertedIdx % NumElts] = ExtractedIdx;
- } else {
- assert(EI->getOperand(0) == RHS);
- Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;
- }
- return true;
- }
- }
- }
- }
- }
-
- return false;
-}
-
-/// If we have insertion into a vector that is wider than the vector that we
-/// are extracting from, try to widen the source vector to allow a single
-/// shufflevector to replace one or more insert/extract pairs.
-static void replaceExtractElements(InsertElementInst *InsElt,
- ExtractElementInst *ExtElt,
+
+ // This must be extracting from either LHS or RHS.
+ if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+ // We can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted value.
+ if (EI->getOperand(0) == LHS) {
+ Mask[InsertedIdx % NumElts] = ExtractedIdx;
+ } else {
+ assert(EI->getOperand(0) == RHS);
+ Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;
+ }
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// If we have insertion into a vector that is wider than the vector that we
+/// are extracting from, try to widen the source vector to allow a single
+/// shufflevector to replace one or more insert/extract pairs.
+static void replaceExtractElements(InsertElementInst *InsElt,
+ ExtractElementInst *ExtElt,
InstCombinerImpl &IC) {
auto *InsVecType = cast<FixedVectorType>(InsElt->getType());
auto *ExtVecType = cast<FixedVectorType>(ExtElt->getVectorOperandType());
- unsigned NumInsElts = InsVecType->getNumElements();
- unsigned NumExtElts = ExtVecType->getNumElements();
-
- // The inserted-to vector must be wider than the extracted-from vector.
- if (InsVecType->getElementType() != ExtVecType->getElementType() ||
- NumExtElts >= NumInsElts)
- return;
-
- // Create a shuffle mask to widen the extracted-from vector using undefined
- // values. The mask selects all of the values of the original vector followed
- // by as many undefined values as needed to create a vector of the same length
- // as the inserted-to vector.
- SmallVector<int, 16> ExtendMask;
- for (unsigned i = 0; i < NumExtElts; ++i)
- ExtendMask.push_back(i);
- for (unsigned i = NumExtElts; i < NumInsElts; ++i)
- ExtendMask.push_back(-1);
-
- Value *ExtVecOp = ExtElt->getVectorOperand();
- auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
- BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
- ? ExtVecOpInst->getParent()
- : ExtElt->getParent();
-
- // TODO: This restriction matches the basic block check below when creating
- // new extractelement instructions. If that limitation is removed, this one
- // could also be removed. But for now, we just bail out to ensure that we
- // will replace the extractelement instruction that is feeding our
- // insertelement instruction. This allows the insertelement to then be
- // replaced by a shufflevector. If the insertelement is not replaced, we can
- // induce infinite looping because there's an optimization for extractelement
- // that will delete our widening shuffle. This would trigger another attempt
- // here to create that shuffle, and we spin forever.
- if (InsertionBlock != InsElt->getParent())
- return;
-
- // TODO: This restriction matches the check in visitInsertElementInst() and
- // prevents an infinite loop caused by not turning the extract/insert pair
- // into a shuffle. We really should not need either check, but we're lacking
- // folds for shufflevectors because we're afraid to generate shuffle masks
- // that the backend can't handle.
- if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
- return;
-
- auto *WideVec =
- new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), ExtendMask);
-
- // Insert the new shuffle after the vector operand of the extract is defined
- // (as long as it's not a PHI) or at the start of the basic block of the
- // extract, so any subsequent extracts in the same basic block can use it.
- // TODO: Insert before the earliest ExtractElementInst that is replaced.
- if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
- WideVec->insertAfter(ExtVecOpInst);
- else
- IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());
-
- // Replace extracts from the original narrow vector with extracts from the new
- // wide vector.
- for (User *U : ExtVecOp->users()) {
- ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
- if (!OldExt || OldExt->getParent() != WideVec->getParent())
- continue;
- auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
- NewExt->insertAfter(OldExt);
- IC.replaceInstUsesWith(*OldExt, NewExt);
- }
-}
-
-/// We are building a shuffle to create V, which is a sequence of insertelement,
-/// extractelement pairs. If PermittedRHS is set, then we must either use it or
-/// not rely on the second vector source. Return a std::pair containing the
-/// left and right vectors of the proposed shuffle (or 0), and set the Mask
-/// parameter as required.
-///
-/// Note: we intentionally don't try to fold earlier shuffles since they have
-/// often been chosen carefully to be efficiently implementable on the target.
-using ShuffleOps = std::pair<Value *, Value *>;
-
-static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
- Value *PermittedRHS,
+ unsigned NumInsElts = InsVecType->getNumElements();
+ unsigned NumExtElts = ExtVecType->getNumElements();
+
+ // The inserted-to vector must be wider than the extracted-from vector.
+ if (InsVecType->getElementType() != ExtVecType->getElementType() ||
+ NumExtElts >= NumInsElts)
+ return;
+
+ // Create a shuffle mask to widen the extracted-from vector using undefined
+ // values. The mask selects all of the values of the original vector followed
+ // by as many undefined values as needed to create a vector of the same length
+ // as the inserted-to vector.
+ SmallVector<int, 16> ExtendMask;
+ for (unsigned i = 0; i < NumExtElts; ++i)
+ ExtendMask.push_back(i);
+ for (unsigned i = NumExtElts; i < NumInsElts; ++i)
+ ExtendMask.push_back(-1);
+
+ Value *ExtVecOp = ExtElt->getVectorOperand();
+ auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
+ BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+ ? ExtVecOpInst->getParent()
+ : ExtElt->getParent();
+
+ // TODO: This restriction matches the basic block check below when creating
+ // new extractelement instructions. If that limitation is removed, this one
+ // could also be removed. But for now, we just bail out to ensure that we
+ // will replace the extractelement instruction that is feeding our
+ // insertelement instruction. This allows the insertelement to then be
+ // replaced by a shufflevector. If the insertelement is not replaced, we can
+ // induce infinite looping because there's an optimization for extractelement
+ // that will delete our widening shuffle. This would trigger another attempt
+ // here to create that shuffle, and we spin forever.
+ if (InsertionBlock != InsElt->getParent())
+ return;
+
+ // TODO: This restriction matches the check in visitInsertElementInst() and
+ // prevents an infinite loop caused by not turning the extract/insert pair
+ // into a shuffle. We really should not need either check, but we're lacking
+ // folds for shufflevectors because we're afraid to generate shuffle masks
+ // that the backend can't handle.
+ if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
+ return;
+
+ auto *WideVec =
+ new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), ExtendMask);
+
+ // Insert the new shuffle after the vector operand of the extract is defined
+ // (as long as it's not a PHI) or at the start of the basic block of the
+ // extract, so any subsequent extracts in the same basic block can use it.
+ // TODO: Insert before the earliest ExtractElementInst that is replaced.
+ if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+ WideVec->insertAfter(ExtVecOpInst);
+ else
+ IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());
+
+ // Replace extracts from the original narrow vector with extracts from the new
+ // wide vector.
+ for (User *U : ExtVecOp->users()) {
+ ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
+ if (!OldExt || OldExt->getParent() != WideVec->getParent())
+ continue;
+ auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
+ NewExt->insertAfter(OldExt);
+ IC.replaceInstUsesWith(*OldExt, NewExt);
+ }
+}
+
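For reference, the extend mask built in replaceExtractElements above is just an identity prefix padded with undef (-1) lanes. A minimal standalone sketch of that computation in plain C++ (illustrative names, no LLVM dependency):

#include <cassert>
#include <vector>

static std::vector<int> makeExtendMask(unsigned NumExtElts, unsigned NumInsElts) {
  assert(NumExtElts < NumInsElts && "only widening is handled");
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumExtElts; ++i)
    Mask.push_back(i);   // keep the original narrow lanes in place
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    Mask.push_back(-1);  // pad with undef lanes up to the wide length
  return Mask;
}

int main() {
  // Widening a <4 x T> source to match an <8 x T> destination.
  std::vector<int> M = makeExtendMask(4, 8); // {0,1,2,3,-1,-1,-1,-1}
  return M.size() == 8 ? 0 : 1;
}
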
+/// We are building a shuffle to create V, which is a sequence of insertelement,
+/// extractelement pairs. If PermittedRHS is set, then we must either use it or
+/// not rely on the second vector source. Return a std::pair containing the
+/// left and right vectors of the proposed shuffle (or 0), and set the Mask
+/// parameter as required.
+///
+/// Note: we intentionally don't try to fold earlier shuffles since they have
+/// often been chosen carefully to be efficiently implementable on the target.
+using ShuffleOps = std::pair<Value *, Value *>;
+
+static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
+ Value *PermittedRHS,
InstCombinerImpl &IC) {
- assert(V->getType()->isVectorTy() && "Invalid shuffle!");
- unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
-
- if (isa<UndefValue>(V)) {
- Mask.assign(NumElts, -1);
- return std::make_pair(
- PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
- }
-
- if (isa<ConstantAggregateZero>(V)) {
- Mask.assign(NumElts, 0);
- return std::make_pair(V, nullptr);
- }
-
- if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
- // If this is an insert of an extract from some other vector, include it.
- Value *VecOp = IEI->getOperand(0);
- Value *ScalarOp = IEI->getOperand(1);
- Value *IdxOp = IEI->getOperand(2);
-
- if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
- if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
- unsigned ExtractedIdx =
- cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
- unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
-
- // Either the extracted from or inserted into vector must be RHSVec,
- // otherwise we'd end up with a shuffle of three inputs.
- if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
- Value *RHS = EI->getOperand(0);
- ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
- assert(LR.second == nullptr || LR.second == RHS);
-
- if (LR.first->getType() != RHS->getType()) {
- // Although we are giving up for now, see if we can create extracts
- // that match the inserts for another round of combining.
- replaceExtractElements(IEI, EI, IC);
-
- // We tried our best, but we can't find anything compatible with RHS
- // further up the chain. Return a trivial shuffle.
- for (unsigned i = 0; i < NumElts; ++i)
- Mask[i] = i;
- return std::make_pair(V, nullptr);
- }
-
- unsigned NumLHSElts =
+ assert(V->getType()->isVectorTy() && "Invalid shuffle!");
+ unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, -1);
+ return std::make_pair(
+ PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
+ }
+
+ if (isa<ConstantAggregateZero>(V)) {
+ Mask.assign(NumElts, 0);
+ return std::make_pair(V, nullptr);
+ }
+
+ if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ // Either the extracted from or inserted into vector must be RHSVec,
+ // otherwise we'd end up with a shuffle of three inputs.
+ if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
+ Value *RHS = EI->getOperand(0);
+ ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
+ assert(LR.second == nullptr || LR.second == RHS);
+
+ if (LR.first->getType() != RHS->getType()) {
+ // Although we are giving up for now, see if we can create extracts
+ // that match the inserts for another round of combining.
+ replaceExtractElements(IEI, EI, IC);
+
+ // We tried our best, but we can't find anything compatible with RHS
+ // further up the chain. Return a trivial shuffle.
+ for (unsigned i = 0; i < NumElts; ++i)
+ Mask[i] = i;
+ return std::make_pair(V, nullptr);
+ }
+
+ unsigned NumLHSElts =
cast<FixedVectorType>(RHS->getType())->getNumElements();
- Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;
- return std::make_pair(LR.first, RHS);
- }
-
- if (VecOp == PermittedRHS) {
- // We've gone as far as we can: anything on the other side of the
- // extractelement will already have been converted into a shuffle.
- unsigned NumLHSElts =
+ Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;
+ return std::make_pair(LR.first, RHS);
+ }
+
+ if (VecOp == PermittedRHS) {
+ // We've gone as far as we can: anything on the other side of the
+ // extractelement will already have been converted into a shuffle.
+ unsigned NumLHSElts =
cast<FixedVectorType>(EI->getOperand(0)->getType())
->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);
- return std::make_pair(EI->getOperand(0), PermittedRHS);
- }
-
- // If this insertelement is a chain that comes from exactly these two
- // vectors, return the vector and the effective shuffle.
- if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
- collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
- Mask))
- return std::make_pair(EI->getOperand(0), PermittedRHS);
- }
- }
- }
-
- // Otherwise, we can't do anything fancy. Return an identity vector.
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i);
- return std::make_pair(V, nullptr);
-}
-
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);
+ return std::make_pair(EI->getOperand(0), PermittedRHS);
+ }
+
+ // If this insertelement is a chain that comes from exactly these two
+ // vectors, return the vector and the effective shuffle.
+ if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
+ collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
+ Mask))
+ return std::make_pair(EI->getOperand(0), PermittedRHS);
+ }
+ }
+ }
+
+ // Otherwise, we can't do anything fancy. Return an identity vector.
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i);
+ return std::make_pair(V, nullptr);
+}
+
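For a single insert-of-extract step, the mask bookkeeping in collectShuffleElements above reduces to one lane update. A compact plain-C++ restatement (illustrative names, not the LLVM data structures):

#include <vector>

static void recordInsertOfExtract(std::vector<int> &Mask, unsigned InsertedIdx,
                                  unsigned ExtractedIdx, unsigned NumLHSElts) {
  // Lane InsertedIdx of the result reads element ExtractedIdx of the RHS,
  // which lives at offset NumLHSElts in the combined shuffle index space.
  Mask[InsertedIdx % Mask.size()] = NumLHSElts + ExtractedIdx;
}

int main() {
  std::vector<int> Mask = {0, 1, 2, 3};  // identity over the LHS so far
  recordInsertOfExtract(Mask, 2, 1, 4);  // insert (extract RHS, 1) at lane 2
  return Mask[2] == 5 ? 0 : 1;           // lane 2 now selects RHS element 1 (4 + 1)
}
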
/// Look for chain of insertvalue's that fully define an aggregate, and trace
/// back the values inserted, see if they all were extractvalue'd from
/// the same source aggregate from the exact same element indexes.
@@ -984,1661 +984,1661 @@ Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse(
return replaceInstUsesWith(OrigIVI, PHI);
}
-/// Try to find redundant insertvalue instructions, like the following ones:
-/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0
-/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0
-/// Here the second instruction inserts values at the same indices as the
-/// first one, making the first one redundant.
-/// It should be transformed to:
-/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0
+/// Try to find redundant insertvalue instructions, like the following ones:
+/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0
+/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0
+/// Here the second instruction inserts values at the same indices as the
+/// first one, making the first one redundant.
+/// It should be transformed to:
+/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0
Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) {
- bool IsRedundant = false;
- ArrayRef<unsigned int> FirstIndices = I.getIndices();
-
- // If there is a chain of insertvalue instructions (each of them except the
- // last one has only one use and it's another insertvalue insn from this
- // chain), check if any of the 'children' uses the same indices as the first
- // instruction. In this case, the first one is redundant.
- Value *V = &I;
- unsigned Depth = 0;
- while (V->hasOneUse() && Depth < 10) {
- User *U = V->user_back();
- auto UserInsInst = dyn_cast<InsertValueInst>(U);
- if (!UserInsInst || U->getOperand(0) != V)
- break;
- if (UserInsInst->getIndices() == FirstIndices) {
- IsRedundant = true;
- break;
- }
- V = UserInsInst;
- Depth++;
- }
-
- if (IsRedundant)
- return replaceInstUsesWith(I, I.getOperand(0));
+ bool IsRedundant = false;
+ ArrayRef<unsigned int> FirstIndices = I.getIndices();
+
+ // If there is a chain of insertvalue instructions (each of them except the
+ // last one has only one use and it's another insertvalue insn from this
+ // chain), check if any of the 'children' uses the same indices as the first
+ // instruction. In this case, the first one is redundant.
+ Value *V = &I;
+ unsigned Depth = 0;
+ while (V->hasOneUse() && Depth < 10) {
+ User *U = V->user_back();
+ auto UserInsInst = dyn_cast<InsertValueInst>(U);
+ if (!UserInsInst || U->getOperand(0) != V)
+ break;
+ if (UserInsInst->getIndices() == FirstIndices) {
+ IsRedundant = true;
+ break;
+ }
+ V = UserInsInst;
+ Depth++;
+ }
+
+ if (IsRedundant)
+ return replaceInstUsesWith(I, I.getOperand(0));
if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I))
return NewI;
- return nullptr;
-}
-
-static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
- // Can not analyze scalable type, the number of elements is not a compile-time
- // constant.
- if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))
- return false;
-
- int MaskSize = Shuf.getShuffleMask().size();
- int VecSize =
- cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();
-
- // A vector select does not change the size of the operands.
- if (MaskSize != VecSize)
- return false;
-
- // Each mask element must be undefined or choose a vector element from one of
- // the source operands without crossing vector lanes.
- for (int i = 0; i != MaskSize; ++i) {
- int Elt = Shuf.getMaskValue(i);
- if (Elt != -1 && Elt != i && Elt != i + VecSize)
- return false;
- }
-
- return true;
-}
-
-/// Turn a chain of inserts that splats a value into an insert + shuffle:
-/// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
-/// shufflevector(insertelt(X, %k, 0), undef, zero)
-static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
- // We are interested in the last insert in a chain. So if this insert has a
- // single user and that user is an insert, bail.
- if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
- return nullptr;
-
- VectorType *VecTy = InsElt.getType();
- // Can not handle scalable type, the number of elements is not a compile-time
- // constant.
- if (isa<ScalableVectorType>(VecTy))
- return nullptr;
- unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
-
- // Do not try to do this for a one-element vector, since that's a nop,
- // and will cause an inf-loop.
- if (NumElements == 1)
- return nullptr;
-
- Value *SplatVal = InsElt.getOperand(1);
- InsertElementInst *CurrIE = &InsElt;
- SmallBitVector ElementPresent(NumElements, false);
- InsertElementInst *FirstIE = nullptr;
-
- // Walk the chain backwards, keeping track of which indices we inserted into,
- // until we hit something that isn't an insert of the splatted value.
- while (CurrIE) {
- auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
- if (!Idx || CurrIE->getOperand(1) != SplatVal)
- return nullptr;
-
- auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
- // Check none of the intermediate steps have any additional uses, except
- // for the root insertelement instruction, which can be re-used, if it
- // inserts at position 0.
- if (CurrIE != &InsElt &&
- (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero())))
- return nullptr;
-
- ElementPresent[Idx->getZExtValue()] = true;
- FirstIE = CurrIE;
- CurrIE = NextIE;
- }
-
- // If this is just a single insertelement (not a sequence), we are done.
- if (FirstIE == &InsElt)
- return nullptr;
-
- // If we are not inserting into an undef vector, make sure we've seen an
- // insert into every element.
- // TODO: If the base vector is not undef, it might be better to create a splat
- // and then a select-shuffle (blend) with the base vector.
- if (!isa<UndefValue>(FirstIE->getOperand(0)))
- if (!ElementPresent.all())
- return nullptr;
-
- // Create the insert + shuffle.
- Type *Int32Ty = Type::getInt32Ty(InsElt.getContext());
- UndefValue *UndefVec = UndefValue::get(VecTy);
- Constant *Zero = ConstantInt::get(Int32Ty, 0);
- if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero())
- FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt);
-
- // Splat from element 0, but replace absent elements with undef in the mask.
- SmallVector<int, 16> Mask(NumElements, 0);
- for (unsigned i = 0; i != NumElements; ++i)
- if (!ElementPresent[i])
- Mask[i] = -1;
-
- return new ShuffleVectorInst(FirstIE, UndefVec, Mask);
-}
-
-/// Try to fold an insert element into an existing splat shuffle by changing
-/// the shuffle's mask to include the index of this insert element.
-static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
- // Check if the vector operand of this insert is a canonical splat shuffle.
- auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
- if (!Shuf || !Shuf->isZeroEltSplat())
- return nullptr;
-
- // Bail out early if shuffle is scalable type. The number of elements in
- // shuffle mask is unknown at compile-time.
- if (isa<ScalableVectorType>(Shuf->getType()))
- return nullptr;
-
- // Check for a constant insertion index.
- uint64_t IdxC;
- if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
- return nullptr;
-
- // Check if the splat shuffle's input is the same as this insert's scalar op.
- Value *X = InsElt.getOperand(1);
- Value *Op0 = Shuf->getOperand(0);
- if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))
- return nullptr;
-
- // Replace the shuffle mask element at the index of this insert with a zero.
- // For example:
- // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
- // --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
+ return nullptr;
+}
+
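The chain walk in visitInsertValueInst above amounts to asking whether any later single-use insertvalue repeats the first one's exact index list. A small plain-C++ sketch of that test over bare index lists rather than LLVM IR (illustrative only):

#include <vector>

// Each entry of Chain is the index list of one insertvalue in a single-use
// chain, oldest first. If a later entry repeats the first one, the first
// insertvalue is dead.
static bool firstInsertIsShadowed(const std::vector<std::vector<unsigned>> &Chain) {
  if (Chain.size() < 2)
    return false;
  for (size_t i = 1; i < Chain.size() && i <= 10; ++i) // mirrors the depth cap
    if (Chain[i] == Chain.front())
      return true;
  return false;
}

int main() {
  // %0 = insertvalue {i8,i32} undef, i8 %x, 0
  // %1 = insertvalue {i8,i32} %0,    i8 %y, 0  ; same index list -> %0 is redundant
  return firstInsertIsShadowed({{0}, {0}}) ? 0 : 1;
}
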
+static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
+ // Can not analyze scalable type, the number of elements is not a compile-time
+ // constant.
+ if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))
+ return false;
+
+ int MaskSize = Shuf.getShuffleMask().size();
+ int VecSize =
+ cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();
+
+ // A vector select does not change the size of the operands.
+ if (MaskSize != VecSize)
+ return false;
+
+ // Each mask element must be undefined or choose a vector element from one of
+ // the source operands without crossing vector lanes.
+ for (int i = 0; i != MaskSize; ++i) {
+ int Elt = Shuf.getMaskValue(i);
+ if (Elt != -1 && Elt != i && Elt != i + VecSize)
+ return false;
+ }
+
+ return true;
+}
+
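The mask test in isShuffleEquivalentToSelect above can be stated on its own: every lane must be undef or pick its own position from one of the two sources. A standalone plain-C++ version (illustrative only):

#include <vector>

static bool maskIsSelectLike(const std::vector<int> &Mask, int VecSize) {
  if ((int)Mask.size() != VecSize)
    return false; // a vector select never changes the operand width
  for (int i = 0; i != (int)Mask.size(); ++i) {
    int Elt = Mask[i];
    if (Elt != -1 && Elt != i && Elt != i + VecSize)
      return false; // lane crosses positions: not expressible as a select
  }
  return true;
}

int main() {
  return maskIsSelectLike({0, 5, 2, 7}, 4) ? 0 : 1; // <0,5,2,7> picks per-lane
}
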
+/// Turn a chain of inserts that splats a value into an insert + shuffle:
+/// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
+/// shufflevector(insertelt(X, %k, 0), undef, zero)
+static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
+ // We are interested in the last insert in a chain. So if this insert has a
+ // single user and that user is an insert, bail.
+ if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
+ return nullptr;
+
+ VectorType *VecTy = InsElt.getType();
+ // Can not handle scalable type, the number of elements is not a compile-time
+ // constant.
+ if (isa<ScalableVectorType>(VecTy))
+ return nullptr;
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+
+ // Do not try to do this for a one-element vector, since that's a nop,
+ // and will cause an inf-loop.
+ if (NumElements == 1)
+ return nullptr;
+
+ Value *SplatVal = InsElt.getOperand(1);
+ InsertElementInst *CurrIE = &InsElt;
+ SmallBitVector ElementPresent(NumElements, false);
+ InsertElementInst *FirstIE = nullptr;
+
+ // Walk the chain backwards, keeping track of which indices we inserted into,
+ // until we hit something that isn't an insert of the splatted value.
+ while (CurrIE) {
+ auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
+ if (!Idx || CurrIE->getOperand(1) != SplatVal)
+ return nullptr;
+
+ auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
+ // Check none of the intermediate steps have any additional uses, except
+ // for the root insertelement instruction, which can be re-used, if it
+ // inserts at position 0.
+ if (CurrIE != &InsElt &&
+ (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero())))
+ return nullptr;
+
+ ElementPresent[Idx->getZExtValue()] = true;
+ FirstIE = CurrIE;
+ CurrIE = NextIE;
+ }
+
+ // If this is just a single insertelement (not a sequence), we are done.
+ if (FirstIE == &InsElt)
+ return nullptr;
+
+ // If we are not inserting into an undef vector, make sure we've seen an
+ // insert into every element.
+ // TODO: If the base vector is not undef, it might be better to create a splat
+ // and then a select-shuffle (blend) with the base vector.
+ if (!isa<UndefValue>(FirstIE->getOperand(0)))
+ if (!ElementPresent.all())
+ return nullptr;
+
+ // Create the insert + shuffle.
+ Type *Int32Ty = Type::getInt32Ty(InsElt.getContext());
+ UndefValue *UndefVec = UndefValue::get(VecTy);
+ Constant *Zero = ConstantInt::get(Int32Ty, 0);
+ if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero())
+ FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt);
+
+ // Splat from element 0, but replace absent elements with undef in the mask.
+ SmallVector<int, 16> Mask(NumElements, 0);
+ for (unsigned i = 0; i != NumElements; ++i)
+ if (!ElementPresent[i])
+ Mask[i] = -1;
+
+ return new ShuffleVectorInst(FirstIE, UndefVec, Mask);
+}
+
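The splat mask produced by foldInsSequenceIntoSplat above broadcasts lane 0 and marks never-written lanes as undef. A plain-C++ sketch of just that mask construction (illustrative names, no LLVM types):

#include <vector>

static std::vector<int> makeSplatMask(const std::vector<bool> &ElementPresent) {
  std::vector<int> Mask(ElementPresent.size(), 0); // splat from element 0
  for (size_t i = 0; i != ElementPresent.size(); ++i)
    if (!ElementPresent[i])
      Mask[i] = -1; // lane was never inserted into: leave it undef
  return Mask;
}

int main() {
  std::vector<int> M = makeSplatMask({true, true, false, true}); // {0,0,-1,0}
  return M[2] == -1 ? 0 : 1;
}
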
+/// Try to fold an insert element into an existing splat shuffle by changing
+/// the shuffle's mask to include the index of this insert element.
+static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
+ // Check if the vector operand of this insert is a canonical splat shuffle.
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
+ if (!Shuf || !Shuf->isZeroEltSplat())
+ return nullptr;
+
+ // Bail out early if shuffle is scalable type. The number of elements in
+ // shuffle mask is unknown at compile-time.
+ if (isa<ScalableVectorType>(Shuf->getType()))
+ return nullptr;
+
+ // Check for a constant insertion index.
+ uint64_t IdxC;
+ if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
+ return nullptr;
+
+ // Check if the splat shuffle's input is the same as this insert's scalar op.
+ Value *X = InsElt.getOperand(1);
+ Value *Op0 = Shuf->getOperand(0);
+ if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))
+ return nullptr;
+
+ // Replace the shuffle mask element at the index of this insert with a zero.
+ // For example:
+ // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
+ // --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf->getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumMaskElts);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);
-
- return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
-}
-
-/// Try to fold an extract+insert element into an existing identity shuffle by
-/// changing the shuffle's mask to include the index of this insert element.
-static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
- // Check if the vector operand of this insert is an identity shuffle.
- auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
- if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) ||
- !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))
- return nullptr;
-
- // Bail out early if shuffle is scalable type. The number of elements in
- // shuffle mask is unknown at compile-time.
- if (isa<ScalableVectorType>(Shuf->getType()))
- return nullptr;
-
- // Check for a constant insertion index.
- uint64_t IdxC;
- if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
- return nullptr;
-
- // Check if this insert's scalar op is extracted from the identity shuffle's
- // input vector.
- Value *Scalar = InsElt.getOperand(1);
- Value *X = Shuf->getOperand(0);
- if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))
- return nullptr;
-
- // Replace the shuffle mask element at the index of this extract+insert with
- // that same index value.
- // For example:
- // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'
+ SmallVector<int, 16> NewMask(NumMaskElts);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);
+
+ return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
+}
+
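The rewrite in foldInsEltIntoSplat above touches a single mask lane. A plain-C++ restatement of that update (illustrative, no LLVM types):

#include <vector>

static std::vector<int> widenSplatMask(const std::vector<int> &OldMask, size_t IdxC) {
  std::vector<int> NewMask(OldMask.size());
  for (size_t i = 0; i != OldMask.size(); ++i)
    NewMask[i] = (i == IdxC) ? 0 : OldMask[i]; // lane IdxC now also reads the splatted element 0
  return NewMask;
}

int main() {
  // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
  std::vector<int> M = widenSplatMask({0, -1, 0, -1}, 1); // --> <0,0,0,undef>
  return M[1] == 0 ? 0 : 1;
}
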
+/// Try to fold an extract+insert element into an existing identity shuffle by
+/// changing the shuffle's mask to include the index of this insert element.
+static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
+ // Check if the vector operand of this insert is an identity shuffle.
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
+ if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) ||
+ !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))
+ return nullptr;
+
+ // Bail out early if shuffle is scalable type. The number of elements in
+ // shuffle mask is unknown at compile-time.
+ if (isa<ScalableVectorType>(Shuf->getType()))
+ return nullptr;
+
+ // Check for a constant insertion index.
+ uint64_t IdxC;
+ if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
+ return nullptr;
+
+ // Check if this insert's scalar op is extracted from the identity shuffle's
+ // input vector.
+ Value *Scalar = InsElt.getOperand(1);
+ Value *X = Shuf->getOperand(0);
+ if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))
+ return nullptr;
+
+ // Replace the shuffle mask element at the index of this extract+insert with
+ // that same index value.
+ // For example:
+ // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf->getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumMaskElts);
- ArrayRef<int> OldMask = Shuf->getShuffleMask();
- for (unsigned i = 0; i != NumMaskElts; ++i) {
- if (i != IdxC) {
- // All mask elements besides the inserted element remain the same.
- NewMask[i] = OldMask[i];
- } else if (OldMask[i] == (int)IdxC) {
- // If the mask element was already set, there's nothing to do
- // (demanded elements analysis may unset it later).
- return nullptr;
- } else {
- assert(OldMask[i] == UndefMaskElem &&
- "Unexpected shuffle mask element for identity shuffle");
- NewMask[i] = IdxC;
- }
- }
-
- return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);
-}
-
-/// If we have an insertelement instruction feeding into another insertelement
-/// and the 2nd is inserting a constant into the vector, canonicalize that
-/// constant insertion before the insertion of a variable:
-///
-/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
-/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
-///
-/// This has the potential of eliminating the 2nd insertelement instruction
-/// via constant folding of the scalar constant into a vector constant.
-static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
- InstCombiner::BuilderTy &Builder) {
- auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
- if (!InsElt1 || !InsElt1->hasOneUse())
- return nullptr;
-
- Value *X, *Y;
- Constant *ScalarC;
- ConstantInt *IdxC1, *IdxC2;
- if (match(InsElt1->getOperand(0), m_Value(X)) &&
- match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
- match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
- match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
- match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
- Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
- return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
- }
-
- return nullptr;
-}
-
-/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
-/// --> shufflevector X, CVec', Mask'
-static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
- auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
- // Bail out if the parent has more than one use. In that case, we'd be
- // replacing the insertelt with a shuffle, and that's not a clear win.
- if (!Inst || !Inst->hasOneUse())
- return nullptr;
- if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
- // The shuffle must have a constant vector operand. The insertelt must have
- // a constant scalar being inserted at a constant position in the vector.
- Constant *ShufConstVec, *InsEltScalar;
- uint64_t InsEltIndex;
- if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
- !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
- !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
- return nullptr;
-
- // Adding an element to an arbitrary shuffle could be expensive, but a
- // shuffle that selects elements from vectors without crossing lanes is
- // assumed cheap.
- // If we're just adding a constant into that shuffle, it will still be
- // cheap.
- if (!isShuffleEquivalentToSelect(*Shuf))
- return nullptr;
-
- // From the above 'select' check, we know that the mask has the same number
- // of elements as the vector input operands. We also know that each constant
- // input element is used in its lane and can not be used more than once by
- // the shuffle. Therefore, replace the constant in the shuffle's constant
- // vector with the insertelt constant. Replace the constant in the shuffle's
- // mask vector with the insertelt index plus the length of the vector
- // (because the constant vector operand of a shuffle is always the 2nd
- // operand).
- ArrayRef<int> Mask = Shuf->getShuffleMask();
- unsigned NumElts = Mask.size();
- SmallVector<Constant *, 16> NewShufElts(NumElts);
- SmallVector<int, 16> NewMaskElts(NumElts);
- for (unsigned I = 0; I != NumElts; ++I) {
- if (I == InsEltIndex) {
- NewShufElts[I] = InsEltScalar;
- NewMaskElts[I] = InsEltIndex + NumElts;
- } else {
- // Copy over the existing values.
- NewShufElts[I] = ShufConstVec->getAggregateElement(I);
- NewMaskElts[I] = Mask[I];
- }
- }
-
- // Create new operands for a shuffle that includes the constant of the
- // original insertelt. The old shuffle will be dead now.
- return new ShuffleVectorInst(Shuf->getOperand(0),
- ConstantVector::get(NewShufElts), NewMaskElts);
- } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
-    // Transform sequences of insertelement ops with constant data/indexes into
- // a single shuffle op.
- // Can not handle scalable type, the number of elements needed to create
- // shuffle mask is not a compile-time constant.
- if (isa<ScalableVectorType>(InsElt.getType()))
- return nullptr;
- unsigned NumElts =
- cast<FixedVectorType>(InsElt.getType())->getNumElements();
-
- uint64_t InsertIdx[2];
- Constant *Val[2];
- if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
- !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
- !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
- !match(IEI->getOperand(1), m_Constant(Val[1])))
- return nullptr;
- SmallVector<Constant *, 16> Values(NumElts);
- SmallVector<int, 16> Mask(NumElts);
- auto ValI = std::begin(Val);
- // Generate new constant vector and mask.
- // We have 2 values/masks from the insertelements instructions. Insert them
- // into new value/mask vectors.
- for (uint64_t I : InsertIdx) {
- if (!Values[I]) {
- Values[I] = *ValI;
- Mask[I] = NumElts + I;
- }
- ++ValI;
- }
- // Remaining values are filled with 'undef' values.
- for (unsigned I = 0; I < NumElts; ++I) {
- if (!Values[I]) {
- Values[I] = UndefValue::get(InsElt.getType()->getElementType());
- Mask[I] = I;
- }
- }
- // Create new operands for a shuffle that includes the constant of the
- // original insertelt.
- return new ShuffleVectorInst(IEI->getOperand(0),
- ConstantVector::get(Values), Mask);
- }
- return nullptr;
-}
-
+ SmallVector<int, 16> NewMask(NumMaskElts);
+ ArrayRef<int> OldMask = Shuf->getShuffleMask();
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ if (i != IdxC) {
+ // All mask elements besides the inserted element remain the same.
+ NewMask[i] = OldMask[i];
+ } else if (OldMask[i] == (int)IdxC) {
+ // If the mask element was already set, there's nothing to do
+ // (demanded elements analysis may unset it later).
+ return nullptr;
+ } else {
+ assert(OldMask[i] == UndefMaskElem &&
+ "Unexpected shuffle mask element for identity shuffle");
+ NewMask[i] = IdxC;
+ }
+ }
+
+ return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);
+}
+
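foldInsEltIntoIdentityShuffle above differs from the splat case only in what it writes: the lane gets its own index back, and an already-set lane means there is nothing to do. A plain-C++ sketch (C++17 for std::optional, illustrative only):

#include <optional>
#include <vector>

static std::optional<std::vector<int>> fillIdentityLane(std::vector<int> OldMask, size_t IdxC) {
  if (OldMask[IdxC] == (int)IdxC)
    return std::nullopt;     // lane already reads X[IdxC]; nothing to do
  OldMask[IdxC] = (int)IdxC; // previously-undef lane now reads X[IdxC]
  return OldMask;
}

int main() {
  auto R = fillIdentityLane({0, -1, 2, -1}, 1); // identity-with-padding mask
  return (R && (*R)[1] == 1) ? 0 : 1;
}
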
+/// If we have an insertelement instruction feeding into another insertelement
+/// and the 2nd is inserting a constant into the vector, canonicalize that
+/// constant insertion before the insertion of a variable:
+///
+/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
+/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
+///
+/// This has the potential of eliminating the 2nd insertelement instruction
+/// via constant folding of the scalar constant into a vector constant.
+static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
+ InstCombiner::BuilderTy &Builder) {
+ auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
+ if (!InsElt1 || !InsElt1->hasOneUse())
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *ScalarC;
+ ConstantInt *IdxC1, *IdxC2;
+ if (match(InsElt1->getOperand(0), m_Value(X)) &&
+ match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
+ match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
+ match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
+ match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
+ Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
+ return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
+ }
+
+ return nullptr;
+}
+
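The canonicalization in hoistInsEltConst above is only legal because the two inserts hit different lanes (IdxC1 != IdxC2). A tiny value-level check of that commutation in plain C++ (a model, not LLVM IR):

#include <array>
#include <cstddef>

template <std::size_t N>
static std::array<int, N> insertAt(std::array<int, N> V, int Val, std::size_t Idx) {
  V[Idx] = Val; // value-level model of insertelement
  return V;
}

int main() {
  std::array<int, 4> X = {9, 9, 9, 9};
  int Y = 5;                                  // variable scalar
  int C = 7;                                  // constant scalar
  auto A = insertAt(insertAt(X, Y, 1), C, 2); // original order
  auto B = insertAt(insertAt(X, C, 2), Y, 1); // canonicalized order
  return A == B ? 0 : 1;                      // equal because the indices differ
}
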
+/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
+/// --> shufflevector X, CVec', Mask'
+static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
+ auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
+ // Bail out if the parent has more than one use. In that case, we'd be
+ // replacing the insertelt with a shuffle, and that's not a clear win.
+ if (!Inst || !Inst->hasOneUse())
+ return nullptr;
+ if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
+ // The shuffle must have a constant vector operand. The insertelt must have
+ // a constant scalar being inserted at a constant position in the vector.
+ Constant *ShufConstVec, *InsEltScalar;
+ uint64_t InsEltIndex;
+ if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
+ !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
+ !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
+ return nullptr;
+
+ // Adding an element to an arbitrary shuffle could be expensive, but a
+ // shuffle that selects elements from vectors without crossing lanes is
+ // assumed cheap.
+ // If we're just adding a constant into that shuffle, it will still be
+ // cheap.
+ if (!isShuffleEquivalentToSelect(*Shuf))
+ return nullptr;
+
+ // From the above 'select' check, we know that the mask has the same number
+ // of elements as the vector input operands. We also know that each constant
+ // input element is used in its lane and can not be used more than once by
+ // the shuffle. Therefore, replace the constant in the shuffle's constant
+ // vector with the insertelt constant. Replace the constant in the shuffle's
+ // mask vector with the insertelt index plus the length of the vector
+ // (because the constant vector operand of a shuffle is always the 2nd
+ // operand).
+ ArrayRef<int> Mask = Shuf->getShuffleMask();
+ unsigned NumElts = Mask.size();
+ SmallVector<Constant *, 16> NewShufElts(NumElts);
+ SmallVector<int, 16> NewMaskElts(NumElts);
+ for (unsigned I = 0; I != NumElts; ++I) {
+ if (I == InsEltIndex) {
+ NewShufElts[I] = InsEltScalar;
+ NewMaskElts[I] = InsEltIndex + NumElts;
+ } else {
+ // Copy over the existing values.
+ NewShufElts[I] = ShufConstVec->getAggregateElement(I);
+ NewMaskElts[I] = Mask[I];
+ }
+ }
+
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt. The old shuffle will be dead now.
+ return new ShuffleVectorInst(Shuf->getOperand(0),
+ ConstantVector::get(NewShufElts), NewMaskElts);
+ } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
+    // Transform sequences of insertelement ops with constant data/indexes into
+ // a single shuffle op.
+ // Can not handle scalable type, the number of elements needed to create
+ // shuffle mask is not a compile-time constant.
+ if (isa<ScalableVectorType>(InsElt.getType()))
+ return nullptr;
+ unsigned NumElts =
+ cast<FixedVectorType>(InsElt.getType())->getNumElements();
+
+ uint64_t InsertIdx[2];
+ Constant *Val[2];
+ if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
+ !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
+ !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
+ !match(IEI->getOperand(1), m_Constant(Val[1])))
+ return nullptr;
+ SmallVector<Constant *, 16> Values(NumElts);
+ SmallVector<int, 16> Mask(NumElts);
+ auto ValI = std::begin(Val);
+ // Generate new constant vector and mask.
+ // We have 2 values/masks from the insertelements instructions. Insert them
+ // into new value/mask vectors.
+ for (uint64_t I : InsertIdx) {
+ if (!Values[I]) {
+ Values[I] = *ValI;
+ Mask[I] = NumElts + I;
+ }
+ ++ValI;
+ }
+ // Remaining values are filled with 'undef' values.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (!Values[I]) {
+ Values[I] = UndefValue::get(InsElt.getType()->getElementType());
+ Mask[I] = I;
+ }
+ }
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt.
+ return new ShuffleVectorInst(IEI->getOperand(0),
+ ConstantVector::get(Values), Mask);
+ }
+ return nullptr;
+}
+
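The merge performed by foldConstantInsEltIntoShuffle above splices the inserted constant into the shuffle's constant operand and redirects the mask lane to it. A plain-C++ sketch of that bookkeeping (illustrative struct, not LLVM types):

#include <vector>

struct ShufParts {
  std::vector<int> ConstElts; // stand-in for the shuffle's constant vector operand
  std::vector<int> Mask;
};

static ShufParts foldConstantInsert(ShufParts S, int InsC, size_t InsIdx) {
  size_t NumElts = S.Mask.size();
  S.ConstElts[InsIdx] = InsC;               // splice the constant into operand 1
  S.Mask[InsIdx] = (int)(InsIdx + NumElts); // operand-1 lanes start at offset NumElts
  return S;
}

int main() {
  ShufParts S{{10, 20, 30, 40}, {0, 5, 2, 7}}; // select-like shuffle of two v4 inputs
  ShufParts R = foldConstantInsert(S, 99, 0);  // insertelt ..., 99, 0
  return (R.ConstElts[0] == 99 && R.Mask[0] == 4) ? 0 : 1;
}
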
Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
- Value *VecOp = IE.getOperand(0);
- Value *ScalarOp = IE.getOperand(1);
- Value *IdxOp = IE.getOperand(2);
-
- if (auto *V = SimplifyInsertElementInst(
- VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
- return replaceInstUsesWith(IE, V);
-
- // If the scalar is bitcast and inserted into undef, do the insert in the
- // source type followed by bitcast.
- // TODO: Generalize for insert into any constant, not just undef?
- Value *ScalarSrc;
- if (match(VecOp, m_Undef()) &&
- match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&
- (ScalarSrc->getType()->isIntegerTy() ||
- ScalarSrc->getType()->isFloatingPointTy())) {
- // inselt undef, (bitcast ScalarSrc), IdxOp -->
- // bitcast (inselt undef, ScalarSrc, IdxOp)
- Type *ScalarTy = ScalarSrc->getType();
- Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());
- UndefValue *NewUndef = UndefValue::get(VecTy);
- Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);
- return new BitCastInst(NewInsElt, IE.getType());
- }
-
- // If the vector and scalar are both bitcast from the same element type, do
- // the insert in that source type followed by bitcast.
- Value *VecSrc;
- if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&
- match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&
- (VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&
- VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&
- cast<VectorType>(VecSrc->getType())->getElementType() ==
- ScalarSrc->getType()) {
- // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->
- // bitcast (inselt VecSrc, ScalarSrc, IdxOp)
- Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);
- return new BitCastInst(NewInsElt, IE.getType());
- }
-
- // If the inserted element was extracted from some other fixed-length vector
- // and both indexes are valid constants, try to turn this into a shuffle.
- // Can not handle scalable vector type, the number of elements needed to
- // create shuffle mask is not a compile-time constant.
- uint64_t InsertedIdx, ExtractedIdx;
- Value *ExtVecOp;
- if (isa<FixedVectorType>(IE.getType()) &&
- match(IdxOp, m_ConstantInt(InsertedIdx)) &&
- match(ScalarOp,
- m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&
- isa<FixedVectorType>(ExtVecOp->getType()) &&
- ExtractedIdx <
- cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {
- // TODO: Looking at the user(s) to determine if this insert is a
- // fold-to-shuffle opportunity does not match the usual instcombine
- // constraints. We should decide if the transform is worthy based only
- // on this instruction and its operands, but that may not work currently.
- //
- // Here, we are trying to avoid creating shuffles before reaching
- // the end of a chain of extract-insert pairs. This is complicated because
- // we do not generally form arbitrary shuffle masks in instcombine
- // (because those may codegen poorly), but collectShuffleElements() does
- // exactly that.
- //
- // The rules for determining what is an acceptable target-independent
- // shuffle mask are fuzzy because they evolve based on the backend's
- // capabilities and real-world impact.
- auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
- if (!Insert.hasOneUse())
- return true;
- auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
- if (!InsertUser)
- return true;
- return false;
- };
-
- // Try to form a shuffle from a chain of extract-insert ops.
- if (isShuffleRootCandidate(IE)) {
- SmallVector<int, 16> Mask;
- ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
-
- // The proposed shuffle may be trivial, in which case we shouldn't
- // perform the combine.
- if (LR.first != &IE && LR.second != &IE) {
- // We now have a shuffle of LHS, RHS, Mask.
- if (LR.second == nullptr)
- LR.second = UndefValue::get(LR.first->getType());
- return new ShuffleVectorInst(LR.first, LR.second, Mask);
- }
- }
- }
-
- if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
- unsigned VWidth = VecTy->getNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
- if (V != &IE)
- return replaceInstUsesWith(IE, V);
- return &IE;
- }
- }
-
- if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
- return Shuf;
-
- if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
- return NewInsElt;
-
- if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE))
- return Broadcast;
-
- if (Instruction *Splat = foldInsEltIntoSplat(IE))
- return Splat;
-
- if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))
- return IdentityShuf;
-
- return nullptr;
-}
-
-/// Return true if we can evaluate the specified expression tree if the vector
-/// elements were shuffled in a different order.
-static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,
- unsigned Depth = 5) {
- // We can always reorder the elements of a constant.
- if (isa<Constant>(V))
- return true;
-
- // We won't reorder vector arguments. No IPO here.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // Two users may expect different orders of the elements. Don't try it.
- if (!I->hasOneUse())
- return false;
-
- if (Depth == 0) return false;
-
- switch (I->getOpcode()) {
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- // Propagating an undefined shuffle mask element to integer div/rem is not
- // allowed because those opcodes can create immediate undefined behavior
- // from an undefined element in an operand.
+ Value *VecOp = IE.getOperand(0);
+ Value *ScalarOp = IE.getOperand(1);
+ Value *IdxOp = IE.getOperand(2);
+
+ if (auto *V = SimplifyInsertElementInst(
+ VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
+ return replaceInstUsesWith(IE, V);
+
+ // If the scalar is bitcast and inserted into undef, do the insert in the
+ // source type followed by bitcast.
+ // TODO: Generalize for insert into any constant, not just undef?
+ Value *ScalarSrc;
+ if (match(VecOp, m_Undef()) &&
+ match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&
+ (ScalarSrc->getType()->isIntegerTy() ||
+ ScalarSrc->getType()->isFloatingPointTy())) {
+ // inselt undef, (bitcast ScalarSrc), IdxOp -->
+ // bitcast (inselt undef, ScalarSrc, IdxOp)
+ Type *ScalarTy = ScalarSrc->getType();
+ Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());
+ UndefValue *NewUndef = UndefValue::get(VecTy);
+ Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);
+ return new BitCastInst(NewInsElt, IE.getType());
+ }
+
+ // If the vector and scalar are both bitcast from the same element type, do
+ // the insert in that source type followed by bitcast.
+ Value *VecSrc;
+ if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&
+ match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&
+ (VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&
+ VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&
+ cast<VectorType>(VecSrc->getType())->getElementType() ==
+ ScalarSrc->getType()) {
+ // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->
+ // bitcast (inselt VecSrc, ScalarSrc, IdxOp)
+ Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);
+ return new BitCastInst(NewInsElt, IE.getType());
+ }
+
+ // If the inserted element was extracted from some other fixed-length vector
+ // and both indexes are valid constants, try to turn this into a shuffle.
+ // Can not handle scalable vector type, the number of elements needed to
+ // create shuffle mask is not a compile-time constant.
+ uint64_t InsertedIdx, ExtractedIdx;
+ Value *ExtVecOp;
+ if (isa<FixedVectorType>(IE.getType()) &&
+ match(IdxOp, m_ConstantInt(InsertedIdx)) &&
+ match(ScalarOp,
+ m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&
+ isa<FixedVectorType>(ExtVecOp->getType()) &&
+ ExtractedIdx <
+ cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {
+ // TODO: Looking at the user(s) to determine if this insert is a
+ // fold-to-shuffle opportunity does not match the usual instcombine
+ // constraints. We should decide if the transform is worthy based only
+ // on this instruction and its operands, but that may not work currently.
+ //
+ // Here, we are trying to avoid creating shuffles before reaching
+ // the end of a chain of extract-insert pairs. This is complicated because
+ // we do not generally form arbitrary shuffle masks in instcombine
+ // (because those may codegen poorly), but collectShuffleElements() does
+ // exactly that.
+ //
+ // The rules for determining what is an acceptable target-independent
+ // shuffle mask are fuzzy because they evolve based on the backend's
+ // capabilities and real-world impact.
+ auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
+ if (!Insert.hasOneUse())
+ return true;
+ auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
+ if (!InsertUser)
+ return true;
+ return false;
+ };
+
+ // Try to form a shuffle from a chain of extract-insert ops.
+ if (isShuffleRootCandidate(IE)) {
+ SmallVector<int, 16> Mask;
+ ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
+
+ // The proposed shuffle may be trivial, in which case we shouldn't
+ // perform the combine.
+ if (LR.first != &IE && LR.second != &IE) {
+ // We now have a shuffle of LHS, RHS, Mask.
+ if (LR.second == nullptr)
+ LR.second = UndefValue::get(LR.first->getType());
+ return new ShuffleVectorInst(LR.first, LR.second, Mask);
+ }
+ }
+ }
+
+ if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
+ unsigned VWidth = VecTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
+ if (V != &IE)
+ return replaceInstUsesWith(IE, V);
+ return &IE;
+ }
+ }
+
+ if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
+ return Shuf;
+
+ if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
+ return NewInsElt;
+
+ if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE))
+ return Broadcast;
+
+ if (Instruction *Splat = foldInsEltIntoSplat(IE))
+ return Splat;
+
+ if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))
+ return IdentityShuf;
+
+ return nullptr;
+}
+
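The bitcast hoist near the top of visitInsertElementInst above relies on insert-then-bitcast and bitcast-then-insert producing the same bits lane by lane. A plain-C++ spot check of that equivalence, modeling the undef base vector as zeroed storage (illustrative only):

#include <array>
#include <cstdint>
#include <cstring>

static std::uint32_t bitsOf(float F) {
  std::uint32_t U;
  std::memcpy(&U, &F, sizeof U); // reinterpret the float's bits, like a bitcast
  return U;
}

int main() {
  float Scalar = 3.5f;
  // Path 1: bitcast the scalar first, then insert into a zeroed <4 x i32>.
  std::array<std::uint32_t, 4> A{};
  A[2] = bitsOf(Scalar);
  // Path 2: insert into a zeroed <4 x float>, then bitcast the whole vector.
  std::array<float, 4> VF{};
  VF[2] = Scalar;
  std::array<std::uint32_t, 4> B{};
  std::memcpy(B.data(), VF.data(), sizeof B);
  return A == B ? 0 : 1; // both paths produce the same bit pattern
}
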
+/// Return true if we can evaluate the specified expression tree if the vector
+/// elements were shuffled in a different order.
+static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,
+ unsigned Depth = 5) {
+ // We can always reorder the elements of a constant.
+ if (isa<Constant>(V))
+ return true;
+
+ // We won't reorder vector arguments. No IPO here.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Two users may expect different orders of the elements. Don't try it.
+ if (!I->hasOneUse())
+ return false;
+
+ if (Depth == 0) return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // Propagating an undefined shuffle mask element to integer div/rem is not
+ // allowed because those opcodes can create immediate undefined behavior
+ // from an undefined element in an operand.
if (llvm::is_contained(Mask, -1))
- return false;
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::GetElementPtr: {
- // Bail out if we would create longer vector ops. We could allow creating
- // longer vector ops, but that may result in more expensive codegen.
- Type *ITy = I->getType();
- if (ITy->isVectorTy() &&
+ return false;
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::GetElementPtr: {
+ // Bail out if we would create longer vector ops. We could allow creating
+ // longer vector ops, but that may result in more expensive codegen.
+ Type *ITy = I->getType();
+ if (ITy->isVectorTy() &&
Mask.size() > cast<FixedVectorType>(ITy)->getNumElements())
- return false;
- for (Value *Operand : I->operands()) {
- if (!canEvaluateShuffled(Operand, Mask, Depth - 1))
- return false;
- }
- return true;
- }
- case Instruction::InsertElement: {
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
- if (!CI) return false;
- int ElementNumber = CI->getLimitedValue();
-
- // Verify that 'CI' does not occur twice in Mask. A single 'insertelement'
- // can't put an element into multiple indices.
- bool SeenOnce = false;
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- if (Mask[i] == ElementNumber) {
- if (SeenOnce)
- return false;
- SeenOnce = true;
- }
- }
- return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1);
- }
- }
- return false;
-}
-
-/// Rebuild a new instruction just like 'I' but with the new operands given.
-/// In the event of a type mismatch, the rebuilt instruction follows the operands' types.
-static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
- // We don't want to use the IRBuilder here because we want the replacement
- // instructions to appear next to 'I', not the builder's insertion point.
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- BinaryOperator *BO = cast<BinaryOperator>(I);
- assert(NewOps.size() == 2 && "binary operator with #ops != 2");
- BinaryOperator *New =
- BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
- NewOps[0], NewOps[1], "", BO);
- if (isa<OverflowingBinaryOperator>(BO)) {
- New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
- New->setHasNoSignedWrap(BO->hasNoSignedWrap());
- }
- if (isa<PossiblyExactOperator>(BO)) {
- New->setIsExact(BO->isExact());
- }
- if (isa<FPMathOperator>(BO))
- New->copyFastMathFlags(I);
- return New;
- }
- case Instruction::ICmp:
- assert(NewOps.size() == 2 && "icmp with #ops != 2");
- return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
- NewOps[0], NewOps[1]);
- case Instruction::FCmp:
- assert(NewOps.size() == 2 && "fcmp with #ops != 2");
- return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
- NewOps[0], NewOps[1]);
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt: {
- // It's possible that the mask has a different number of elements from
- // the original cast. We recompute the destination type to match the mask.
- Type *DestTy = VectorType::get(
- I->getType()->getScalarType(),
- cast<VectorType>(NewOps[0]->getType())->getElementCount());
- assert(NewOps.size() == 1 && "cast with #ops != 1");
- return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
- "", I);
- }
- case Instruction::GetElementPtr: {
- Value *Ptr = NewOps[0];
- ArrayRef<Value*> Idx = NewOps.slice(1);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
- GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
- return GEP;
- }
- }
- llvm_unreachable("failed to rebuild vector instructions");
-}
-
-static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
- // Mask.size() does not need to be equal to the number of vector elements.
-
- assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
- Type *EltTy = V->getType()->getScalarType();
- Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
- if (isa<UndefValue>(V))
- return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
-
- if (isa<ConstantAggregateZero>(V))
- return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
-
- if (Constant *C = dyn_cast<Constant>(V))
- return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
- Mask);
-
- Instruction *I = cast<Instruction>(V);
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::Select:
- case Instruction::GetElementPtr: {
- SmallVector<Value*, 8> NewOps;
- bool NeedsRebuild =
+ return false;
+ for (Value *Operand : I->operands()) {
+ if (!canEvaluateShuffled(Operand, Mask, Depth - 1))
+ return false;
+ }
+ return true;
+ }
+ case Instruction::InsertElement: {
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
+ if (!CI) return false;
+ int ElementNumber = CI->getLimitedValue();
+
+ // Verify that 'CI' does not occur twice in Mask. A single 'insertelement'
+ // can't put an element into multiple indices.
+ bool SeenOnce = false;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == ElementNumber) {
+ if (SeenOnce)
+ return false;
+ SeenOnce = true;
+ }
+ }
+ return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1);
+ }
+ }
+ return false;
+}
+
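The InsertElement case of canEvaluateShuffled above rejects masks that request the inserted lane more than once. A standalone plain-C++ version of that check (illustrative only):

#include <vector>

static bool insertedLaneUsedAtMostOnce(const std::vector<int> &Mask, int ElementNumber) {
  bool SeenOnce = false;
  for (int M : Mask) {
    if (M == ElementNumber) {
      if (SeenOnce)
        return false; // one insertelement cannot feed two result lanes
      SeenOnce = true;
    }
  }
  return true;
}

int main() {
  // Lane 2 is requested twice by the mask, so the insert cannot be reordered.
  return insertedLaneUsedAtMostOnce({2, 0, 1, 2}, 2) ? 1 : 0;
}
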
+/// Rebuild a new instruction just like 'I' but with the new operands given.
+/// In the event of a type mismatch, the rebuilt instruction follows the operands' types.
+static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
+ // We don't want to use the IRBuilder here because we want the replacement
+ // instructions to appear next to 'I', not the builder's insertion point.
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+ assert(NewOps.size() == 2 && "binary operator with #ops != 2");
+ BinaryOperator *New =
+ BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
+ NewOps[0], NewOps[1], "", BO);
+ if (isa<OverflowingBinaryOperator>(BO)) {
+ New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
+ New->setHasNoSignedWrap(BO->hasNoSignedWrap());
+ }
+ if (isa<PossiblyExactOperator>(BO)) {
+ New->setIsExact(BO->isExact());
+ }
+ if (isa<FPMathOperator>(BO))
+ New->copyFastMathFlags(I);
+ return New;
+ }
+ case Instruction::ICmp:
+ assert(NewOps.size() == 2 && "icmp with #ops != 2");
+ return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
+ NewOps[0], NewOps[1]);
+ case Instruction::FCmp:
+ assert(NewOps.size() == 2 && "fcmp with #ops != 2");
+ return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
+ NewOps[0], NewOps[1]);
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt: {
+ // It's possible that the mask has a different number of elements from
+ // the original cast. We recompute the destination type to match the mask.
+ Type *DestTy = VectorType::get(
+ I->getType()->getScalarType(),
+ cast<VectorType>(NewOps[0]->getType())->getElementCount());
+ assert(NewOps.size() == 1 && "cast with #ops != 1");
+ return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
+ "", I);
+ }
+ case Instruction::GetElementPtr: {
+ Value *Ptr = NewOps[0];
+ ArrayRef<Value*> Idx = NewOps.slice(1);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
+ GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
+ return GEP;
+ }
+ }
+ llvm_unreachable("failed to rebuild vector instructions");
+}
+
+static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
+ // Mask.size() does not need to be equal to the number of vector elements.
+
+ assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
+ Type *EltTy = V->getType()->getScalarType();
+ Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
+ if (isa<UndefValue>(V))
+ return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
+
+ if (isa<ConstantAggregateZero>(V))
+ return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
+
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
+ Mask);
+
+ Instruction *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::Select:
+ case Instruction::GetElementPtr: {
+ SmallVector<Value*, 8> NewOps;
+ bool NeedsRebuild =
(Mask.size() !=
cast<FixedVectorType>(I->getType())->getNumElements());
- for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *V;
- // Recursively call evaluateInDifferentElementOrder on vector arguments
- // as well. E.g. GetElementPtr may have scalar operands even if the
- // return value is a vector, so we need to examine the operand type.
- if (I->getOperand(i)->getType()->isVectorTy())
- V = evaluateInDifferentElementOrder(I->getOperand(i), Mask);
- else
- V = I->getOperand(i);
- NewOps.push_back(V);
- NeedsRebuild |= (V != I->getOperand(i));
- }
- if (NeedsRebuild) {
- return buildNew(I, NewOps);
- }
- return I;
- }
- case Instruction::InsertElement: {
- int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
-
- // The insertelement was inserting at Element. Figure out which element
- // that becomes after shuffling. The answer is guaranteed to be unique
- // by CanEvaluateShuffled.
- bool Found = false;
- int Index = 0;
- for (int e = Mask.size(); Index != e; ++Index) {
- if (Mask[Index] == Element) {
- Found = true;
- break;
- }
- }
-
-    // If the element is not in Mask, operand 1 (the element to be inserted)
-    // can be ignored. Just evaluate values in operand 0 according to Mask.
- if (!Found)
- return evaluateInDifferentElementOrder(I->getOperand(0), Mask);
-
- Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask);
- return InsertElementInst::Create(V, I->getOperand(1),
- ConstantInt::get(I32Ty, Index), "", I);
- }
- }
- llvm_unreachable("failed to reorder elements of vector instruction!");
-}
-
-// Returns true if the shuffle is extracting a contiguous range of values from
-// LHS, for example:
-// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
-// Shuffles to: |EE|FF|GG|HH|
-// +--+--+--+--+
-static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
- ArrayRef<int> Mask) {
- unsigned LHSElems =
+ for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *V;
+ // Recursively call evaluateInDifferentElementOrder on vector arguments
+ // as well. E.g. GetElementPtr may have scalar operands even if the
+ // return value is a vector, so we need to examine the operand type.
+ if (I->getOperand(i)->getType()->isVectorTy())
+ V = evaluateInDifferentElementOrder(I->getOperand(i), Mask);
+ else
+ V = I->getOperand(i);
+ NewOps.push_back(V);
+ NeedsRebuild |= (V != I->getOperand(i));
+ }
+ if (NeedsRebuild) {
+ return buildNew(I, NewOps);
+ }
+ return I;
+ }
+ case Instruction::InsertElement: {
+ int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
+
+ // The insertelement was inserting at Element. Figure out which element
+ // that becomes after shuffling. The answer is guaranteed to be unique
+ // by CanEvaluateShuffled.
+ bool Found = false;
+ int Index = 0;
+ for (int e = Mask.size(); Index != e; ++Index) {
+ if (Mask[Index] == Element) {
+ Found = true;
+ break;
+ }
+ }
+
+    // If the element is not in Mask, operand 1 (the element to be inserted)
+    // can be ignored. Just evaluate values in operand 0 according to Mask.
+ if (!Found)
+ return evaluateInDifferentElementOrder(I->getOperand(0), Mask);
+
+ Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask);
+ return InsertElementInst::Create(V, I->getOperand(1),
+ ConstantInt::get(I32Ty, Index), "", I);
+ }
+ }
+ llvm_unreachable("failed to reorder elements of vector instruction!");
+}
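A minimal standalone sketch of the InsertElement case above: the fold reduces to one scan of the mask for the original insert index. Plain std::vector<int> masks and illustrative names are used here instead of the LLVM types.

    #include <cassert>
    #include <vector>

    // Returns the lane an element originally inserted at OldIndex lands in
    // after applying Mask, or -1 if the mask never reads that lane (in which
    // case the insertelement is irrelevant and only operand 0 matters).
    int remapInsertIndex(const std::vector<int> &Mask, int OldIndex) {
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (Mask[i] == OldIndex)
          return i; // canEvaluateShuffled guarantees at most one match.
      return -1;
    }

    int main() {
      // An element inserted at lane 2, shuffled by mask <3,2,1,0>, lands in lane 1.
      assert(remapInsertIndex({3, 2, 1, 0}, 2) == 1);
      // Mask <0,1,3,undef(-1)> never reads lane 2, so the insert can be skipped.
      assert(remapInsertIndex({0, 1, 3, -1}, 2) == -1);
      return 0;
    }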
+
+// Returns true if the shuffle is extracting a contiguous range of values from
+// LHS, for example:
+// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
+// Shuffles to: |EE|FF|GG|HH|
+// +--+--+--+--+
+static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
+ ArrayRef<int> Mask) {
+ unsigned LHSElems =
cast<FixedVectorType>(SVI.getOperand(0)->getType())->getNumElements();
- unsigned MaskElems = Mask.size();
- unsigned BegIdx = Mask.front();
- unsigned EndIdx = Mask.back();
- if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
- return false;
- for (unsigned I = 0; I != MaskElems; ++I)
- if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
- return false;
- return true;
-}
-
-/// These are the ingredients in an alternate form binary operator as described
-/// below.
-struct BinopElts {
- BinaryOperator::BinaryOps Opcode;
- Value *Op0;
- Value *Op1;
- BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
- Value *V0 = nullptr, Value *V1 = nullptr) :
- Opcode(Opc), Op0(V0), Op1(V1) {}
- operator bool() const { return Opcode != 0; }
-};
-
-/// Binops may be transformed into binops with different opcodes and operands.
-/// Reverse the usual canonicalization to enable folds with the non-canonical
-/// form of the binop. If a transform is possible, return the elements of the
-/// new binop. If not, return invalid elements.
-static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
- Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
- Type *Ty = BO->getType();
- switch (BO->getOpcode()) {
- case Instruction::Shl: {
- // shl X, C --> mul X, (1 << C)
- Constant *C;
- if (match(BO1, m_Constant(C))) {
- Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
- return { Instruction::Mul, BO0, ShlOne };
- }
- break;
- }
- case Instruction::Or: {
- // or X, C --> add X, C (when X and C have no common bits set)
- const APInt *C;
- if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
- return { Instruction::Add, BO0, BO1 };
- break;
- }
- default:
- break;
- }
- return {};
-}
-
-static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
- assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
-
- // Are we shuffling together some value and that same value after it has been
- // modified by a binop with a constant?
- Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
- Constant *C;
- bool Op0IsBinop;
- if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
- Op0IsBinop = true;
- else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
- Op0IsBinop = false;
- else
- return nullptr;
-
- // The identity constant for a binop leaves a variable operand unchanged. For
- // a vector, this is a splat of something like 0, -1, or 1.
- // If there's no identity constant for this binop, we're done.
- auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
- BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
- Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
- if (!IdC)
- return nullptr;
-
- // Shuffle identity constants into the lanes that return the original value.
- // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
- // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
- // The existing binop constant vector remains in the same operand position.
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
- ConstantExpr::getShuffleVector(IdC, C, Mask);
-
- bool MightCreatePoisonOrUB =
- is_contained(Mask, UndefMaskElem) &&
- (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
- if (MightCreatePoisonOrUB)
+ unsigned MaskElems = Mask.size();
+ unsigned BegIdx = Mask.front();
+ unsigned EndIdx = Mask.back();
+ if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
+ return false;
+ for (unsigned I = 0; I != MaskElems; ++I)
+ if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
+ return false;
+ return true;
+}
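The contiguity test above can be exercised in isolation. A small self-contained sketch over plain integer masks (illustrative names, not the LLVM helpers):

    #include <cassert>
    #include <vector>

    // True if Mask picks the contiguous run BegIdx..BegIdx+Mask.size()-1
    // entirely from a LHS vector of LHSElems elements.
    bool extractsContiguousRange(const std::vector<int> &Mask,
                                 unsigned LHSElems) {
      if (Mask.empty())
        return false;
      unsigned BegIdx = Mask.front();
      unsigned EndIdx = Mask.back();
      if (BegIdx > EndIdx || EndIdx >= LHSElems ||
          EndIdx - BegIdx != Mask.size() - 1)
        return false;
      for (unsigned I = 0; I != Mask.size(); ++I)
        if ((unsigned)Mask[I] != BegIdx + I)
          return false;
      return true;
    }

    int main() {
      // Extracting elements 4..7 of a 16-wide vector is contiguous.
      assert(extractsContiguousRange({4, 5, 6, 7}, 16));
      // A permuted or gapped mask is not.
      assert(!extractsContiguousRange({4, 6, 5, 7}, 16));
      return 0;
    }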
+
+/// These are the ingredients in an alternate form binary operator as described
+/// below.
+struct BinopElts {
+ BinaryOperator::BinaryOps Opcode;
+ Value *Op0;
+ Value *Op1;
+ BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
+ Value *V0 = nullptr, Value *V1 = nullptr) :
+ Opcode(Opc), Op0(V0), Op1(V1) {}
+ operator bool() const { return Opcode != 0; }
+};
+
+/// Binops may be transformed into binops with different opcodes and operands.
+/// Reverse the usual canonicalization to enable folds with the non-canonical
+/// form of the binop. If a transform is possible, return the elements of the
+/// new binop. If not, return invalid elements.
+static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
+ Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
+ Type *Ty = BO->getType();
+ switch (BO->getOpcode()) {
+ case Instruction::Shl: {
+ // shl X, C --> mul X, (1 << C)
+ Constant *C;
+ if (match(BO1, m_Constant(C))) {
+ Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
+ return { Instruction::Mul, BO0, ShlOne };
+ }
+ break;
+ }
+ case Instruction::Or: {
+ // or X, C --> add X, C (when X and C have no common bits set)
+ const APInt *C;
+ if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
+ return { Instruction::Add, BO0, BO1 };
+ break;
+ }
+ default:
+ break;
+ }
+ return {};
+}
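Both rewrites recognized here rest on simple integer identities; a short standalone check with ordinary unsigned arithmetic (illustrative values) is:

    #include <cassert>
    #include <cstdint>

    int main() {
      // shl X, C  ==  mul X, (1 << C): both scale X by the same power of two.
      uint32_t X = 0x1234u, C = 3;
      assert((X << C) == X * (1u << C));

      // or Y, K  ==  add Y, K when Y and K share no set bits: there is no
      // carry, so each result bit comes from exactly one operand.
      uint32_t Y = 0xFF0Fu, K = 0x00F0u;
      assert((Y & K) == 0u);
      assert((Y | K) == Y + K);
      return 0;
    }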
+
+static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
+ assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
+
+ // Are we shuffling together some value and that same value after it has been
+ // modified by a binop with a constant?
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ Constant *C;
+ bool Op0IsBinop;
+ if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
+ Op0IsBinop = true;
+ else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
+ Op0IsBinop = false;
+ else
+ return nullptr;
+
+ // The identity constant for a binop leaves a variable operand unchanged. For
+ // a vector, this is a splat of something like 0, -1, or 1.
+ // If there's no identity constant for this binop, we're done.
+ auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
+ BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
+ Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
+ if (!IdC)
+ return nullptr;
+
+ // Shuffle identity constants into the lanes that return the original value.
+ // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
+ // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
+ // The existing binop constant vector remains in the same operand position.
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
+ ConstantExpr::getShuffleVector(IdC, C, Mask);
+
+ bool MightCreatePoisonOrUB =
+ is_contained(Mask, UndefMaskElem) &&
+ (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
+ if (MightCreatePoisonOrUB)
NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true);
-
- // shuf (bop X, C), X, M --> bop X, C'
- // shuf X, (bop X, C), M --> bop X, C'
- Value *X = Op0IsBinop ? Op1 : Op0;
- Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
- NewBO->copyIRFlags(BO);
-
- // An undef shuffle mask element may propagate as an undef constant element in
- // the new binop. That would produce poison where the original code might not.
- // If we already made a safe constant, then there's no danger.
- if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
- NewBO->dropPoisonGeneratingFlags();
- return NewBO;
-}
-
-/// If we have an insert of a scalar to a non-zero element of an undefined
-/// vector and then shuffle that value, that's the same as inserting to the zero
-/// element and shuffling. Splatting from the zero element is recognized as the
-/// canonical form of splat.
-static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
- InstCombiner::BuilderTy &Builder) {
- Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- Value *X;
- uint64_t IndexC;
-
- // Match a shuffle that is a splat to a non-zero element.
- if (!match(Op0, m_OneUse(m_InsertElt(m_Undef(), m_Value(X),
- m_ConstantInt(IndexC)))) ||
- !match(Op1, m_Undef()) || match(Mask, m_ZeroMask()) || IndexC == 0)
- return nullptr;
-
- // Insert into element 0 of an undef vector.
- UndefValue *UndefVec = UndefValue::get(Shuf.getType());
- Constant *Zero = Builder.getInt32(0);
- Value *NewIns = Builder.CreateInsertElement(UndefVec, X, Zero);
-
- // Splat from element 0. Any mask element that is undefined remains undefined.
- // For example:
- // shuf (inselt undef, X, 2), undef, <2,2,undef>
- // --> shuf (inselt undef, X, 0), undef, <0,0,undef>
+
+ // shuf (bop X, C), X, M --> bop X, C'
+ // shuf X, (bop X, C), M --> bop X, C'
+ Value *X = Op0IsBinop ? Op1 : Op0;
+ Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
+ NewBO->copyIRFlags(BO);
+
+ // An undef shuffle mask element may propagate as an undef constant element in
+ // the new binop. That would produce poison where the original code might not.
+ // If we already made a safe constant, then there's no danger.
+ if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
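The second example in the comment above can be verified lane by lane with a plain integer model. This is only a sketch of the reasoning, not the ConstantExpr machinery; the helper names and values are illustrative.

    #include <cassert>
    #include <vector>

    // Lane model of: shuffle A, B, Mask. Index i < A.size() reads A[i];
    // larger indices read B[i - A.size()] (no undef lanes used here).
    std::vector<int> shuffleLanes(const std::vector<int> &A,
                                  const std::vector<int> &B,
                                  const std::vector<int> &Mask) {
      std::vector<int> R;
      for (int M : Mask)
        R.push_back(M < (int)A.size() ? A[M] : B[M - A.size()]);
      return R;
    }

    std::vector<int> addLanes(const std::vector<int> &A,
                              const std::vector<int> &B) {
      std::vector<int> R;
      for (size_t i = 0; i != A.size(); ++i)
        R.push_back(A[i] + B[i]);
      return R;
    }

    int main() {
      // From the comment: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7}
      // equals add X, {0,0,-3,-4} (0 is the additive identity).
      std::vector<int> X = {10, 20, 30, 40};
      std::vector<int> C = {-1, -2, -3, -4};
      std::vector<int> Mask = {0, 1, 6, 7};
      std::vector<int> Lhs = shuffleLanes(X, addLanes(X, C), Mask);
      std::vector<int> Rhs = addLanes(X, {0, 0, -3, -4});
      assert(Lhs == Rhs);
      return 0;
    }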
+
+/// If we have an insert of a scalar to a non-zero element of an undefined
+/// vector and then shuffle that value, that's the same as inserting to the zero
+/// element and shuffling. Splatting from the zero element is recognized as the
+/// canonical form of splat.
+static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ Value *X;
+ uint64_t IndexC;
+
+ // Match a shuffle that is a splat to a non-zero element.
+ if (!match(Op0, m_OneUse(m_InsertElt(m_Undef(), m_Value(X),
+ m_ConstantInt(IndexC)))) ||
+ !match(Op1, m_Undef()) || match(Mask, m_ZeroMask()) || IndexC == 0)
+ return nullptr;
+
+ // Insert into element 0 of an undef vector.
+ UndefValue *UndefVec = UndefValue::get(Shuf.getType());
+ Constant *Zero = Builder.getInt32(0);
+ Value *NewIns = Builder.CreateInsertElement(UndefVec, X, Zero);
+
+ // Splat from element 0. Any mask element that is undefined remains undefined.
+ // For example:
+ // shuf (inselt undef, X, 2), undef, <2,2,undef>
+ // --> shuf (inselt undef, X, 0), undef, <0,0,undef>
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf.getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumMaskElts, 0);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (Mask[i] == UndefMaskElem)
- NewMask[i] = Mask[i];
-
- return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
-}
-
-/// Try to fold shuffles that are the equivalent of a vector select.
-static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
- InstCombiner::BuilderTy &Builder,
- const DataLayout &DL) {
- if (!Shuf.isSelect())
- return nullptr;
-
- // Canonicalize to choose from operand 0 first unless operand 1 is undefined.
- // Commuting undef to operand 0 conflicts with another canonicalization.
+ SmallVector<int, 16> NewMask(NumMaskElts, 0);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] == UndefMaskElem)
+ NewMask[i] = Mask[i];
+
+ return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
+}
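The mask rewrite performed here is small enough to model directly: every lane becomes 0 except that undef (-1) lanes are preserved. A standalone sketch with illustrative names:

    #include <cassert>
    #include <vector>

    // A splat that read lane IndexC becomes a splat from lane 0; undef lanes
    // (-1) stay undef.
    std::vector<int> canonicalSplatMask(const std::vector<int> &Mask) {
      std::vector<int> NewMask(Mask.size(), 0);
      for (size_t i = 0; i != Mask.size(); ++i)
        if (Mask[i] == -1)
          NewMask[i] = -1;
      return NewMask;
    }

    int main() {
      // shuf (inselt undef, X, 2), undef, <2,2,undef>
      //   --> shuf (inselt undef, X, 0), undef, <0,0,undef>
      std::vector<int> NewMask = canonicalSplatMask({2, 2, -1});
      assert((NewMask == std::vector<int>{0, 0, -1}));
      return 0;
    }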
+
+/// Try to fold shuffles that are the equivalent of a vector select.
+static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder,
+ const DataLayout &DL) {
+ if (!Shuf.isSelect())
+ return nullptr;
+
+ // Canonicalize to choose from operand 0 first unless operand 1 is undefined.
+ // Commuting undef to operand 0 conflicts with another canonicalization.
unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
- if (!isa<UndefValue>(Shuf.getOperand(1)) &&
- Shuf.getMaskValue(0) >= (int)NumElts) {
- // TODO: Can we assert that both operands of a shuffle-select are not undef
-    // (otherwise, it would have been folded by instsimplify)?
- Shuf.commute();
- return &Shuf;
- }
-
- if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
- return I;
-
- BinaryOperator *B0, *B1;
- if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
- !match(Shuf.getOperand(1), m_BinOp(B1)))
- return nullptr;
-
- Value *X, *Y;
- Constant *C0, *C1;
- bool ConstantsAreOp1;
- if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
- match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
- ConstantsAreOp1 = true;
- else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
- match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
- ConstantsAreOp1 = false;
- else
- return nullptr;
-
- // We need matching binops to fold the lanes together.
- BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
- BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
- bool DropNSW = false;
- if (ConstantsAreOp1 && Opc0 != Opc1) {
- // TODO: We drop "nsw" if shift is converted into multiply because it may
- // not be correct when the shift amount is BitWidth - 1. We could examine
- // each vector element to determine if it is safe to keep that flag.
- if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
- DropNSW = true;
- if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
- assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
- Opc0 = AltB0.Opcode;
- C0 = cast<Constant>(AltB0.Op1);
- } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
- assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
- Opc1 = AltB1.Opcode;
- C1 = cast<Constant>(AltB1.Op1);
- }
- }
-
- if (Opc0 != Opc1)
- return nullptr;
-
- // The opcodes must be the same. Use a new name to make that clear.
- BinaryOperator::BinaryOps BOpc = Opc0;
-
- // Select the constant elements needed for the single binop.
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
-
- // We are moving a binop after a shuffle. When a shuffle has an undefined
- // mask element, the result is undefined, but it is not poison or undefined
- // behavior. That is not necessarily true for div/rem/shift.
- bool MightCreatePoisonOrUB =
- is_contained(Mask, UndefMaskElem) &&
- (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
- if (MightCreatePoisonOrUB)
+ if (!isa<UndefValue>(Shuf.getOperand(1)) &&
+ Shuf.getMaskValue(0) >= (int)NumElts) {
+ // TODO: Can we assert that both operands of a shuffle-select are not undef
+    // (otherwise, it would have been folded by instsimplify)?
+ Shuf.commute();
+ return &Shuf;
+ }
+
+ if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
+ return I;
+
+ BinaryOperator *B0, *B1;
+ if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
+ !match(Shuf.getOperand(1), m_BinOp(B1)))
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *C0, *C1;
+ bool ConstantsAreOp1;
+ if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
+ match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
+ ConstantsAreOp1 = true;
+ else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
+ match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
+ ConstantsAreOp1 = false;
+ else
+ return nullptr;
+
+ // We need matching binops to fold the lanes together.
+ BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
+ BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
+ bool DropNSW = false;
+ if (ConstantsAreOp1 && Opc0 != Opc1) {
+ // TODO: We drop "nsw" if shift is converted into multiply because it may
+ // not be correct when the shift amount is BitWidth - 1. We could examine
+ // each vector element to determine if it is safe to keep that flag.
+ if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
+ DropNSW = true;
+ if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
+ assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
+ Opc0 = AltB0.Opcode;
+ C0 = cast<Constant>(AltB0.Op1);
+ } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
+ assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
+ Opc1 = AltB1.Opcode;
+ C1 = cast<Constant>(AltB1.Op1);
+ }
+ }
+
+ if (Opc0 != Opc1)
+ return nullptr;
+
+ // The opcodes must be the same. Use a new name to make that clear.
+ BinaryOperator::BinaryOps BOpc = Opc0;
+
+ // Select the constant elements needed for the single binop.
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
+
+ // We are moving a binop after a shuffle. When a shuffle has an undefined
+ // mask element, the result is undefined, but it is not poison or undefined
+ // behavior. That is not necessarily true for div/rem/shift.
+ bool MightCreatePoisonOrUB =
+ is_contained(Mask, UndefMaskElem) &&
+ (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
+ if (MightCreatePoisonOrUB)
NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC,
ConstantsAreOp1);
-
- Value *V;
- if (X == Y) {
- // Remove a binop and the shuffle by rearranging the constant:
- // shuffle (op V, C0), (op V, C1), M --> op V, C'
- // shuffle (op C0, V), (op C1, V), M --> op C', V
- V = X;
- } else {
- // If there are 2 different variable operands, we must create a new shuffle
- // (select) first, so check uses to ensure that we don't end up with more
- // instructions than we started with.
- if (!B0->hasOneUse() && !B1->hasOneUse())
- return nullptr;
-
- // If we use the original shuffle mask and op1 is *variable*, we would be
- // putting an undef into operand 1 of div/rem/shift. This is either UB or
- // poison. We do not have to guard against UB when *constants* are op1
- // because safe constants guarantee that we do not overflow sdiv/srem (and
- // there's no danger for other opcodes).
- // TODO: To allow this case, create a new shuffle mask with no undefs.
- if (MightCreatePoisonOrUB && !ConstantsAreOp1)
- return nullptr;
-
- // Note: In general, we do not create new shuffles in InstCombine because we
- // do not know if a target can lower an arbitrary shuffle optimally. In this
- // case, the shuffle uses the existing mask, so there is no additional risk.
-
- // Select the variable vectors first, then perform the binop:
- // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
- // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
- V = Builder.CreateShuffleVector(X, Y, Mask);
- }
-
- Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
- BinaryOperator::Create(BOpc, NewC, V);
-
- // Flags are intersected from the 2 source binops. But there are 2 exceptions:
- // 1. If we changed an opcode, poison conditions might have changed.
- // 2. If the shuffle had undef mask elements, the new binop might have undefs
- // where the original code did not. But if we already made a safe constant,
- // then there's no danger.
- NewBO->copyIRFlags(B0);
- NewBO->andIRFlags(B1);
- if (DropNSW)
- NewBO->setHasNoSignedWrap(false);
- if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
- NewBO->dropPoisonGeneratingFlags();
- return NewBO;
-}
-
-/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
-/// Example (little endian):
-/// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
-static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
- bool IsBigEndian) {
- // This must be a bitcasted shuffle of 1 vector integer operand.
- Type *DestType = Shuf.getType();
- Value *X;
- if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
- !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
- return nullptr;
-
- // The source type must have the same number of elements as the shuffle,
- // and the source element type must be larger than the shuffle element type.
- Type *SrcType = X->getType();
- if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
+
+ Value *V;
+ if (X == Y) {
+ // Remove a binop and the shuffle by rearranging the constant:
+ // shuffle (op V, C0), (op V, C1), M --> op V, C'
+ // shuffle (op C0, V), (op C1, V), M --> op C', V
+ V = X;
+ } else {
+ // If there are 2 different variable operands, we must create a new shuffle
+ // (select) first, so check uses to ensure that we don't end up with more
+ // instructions than we started with.
+ if (!B0->hasOneUse() && !B1->hasOneUse())
+ return nullptr;
+
+ // If we use the original shuffle mask and op1 is *variable*, we would be
+ // putting an undef into operand 1 of div/rem/shift. This is either UB or
+ // poison. We do not have to guard against UB when *constants* are op1
+ // because safe constants guarantee that we do not overflow sdiv/srem (and
+ // there's no danger for other opcodes).
+ // TODO: To allow this case, create a new shuffle mask with no undefs.
+ if (MightCreatePoisonOrUB && !ConstantsAreOp1)
+ return nullptr;
+
+ // Note: In general, we do not create new shuffles in InstCombine because we
+ // do not know if a target can lower an arbitrary shuffle optimally. In this
+ // case, the shuffle uses the existing mask, so there is no additional risk.
+
+ // Select the variable vectors first, then perform the binop:
+ // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
+ // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
+ V = Builder.CreateShuffleVector(X, Y, Mask);
+ }
+
+ Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
+ BinaryOperator::Create(BOpc, NewC, V);
+
+ // Flags are intersected from the 2 source binops. But there are 2 exceptions:
+ // 1. If we changed an opcode, poison conditions might have changed.
+ // 2. If the shuffle had undef mask elements, the new binop might have undefs
+ // where the original code did not. But if we already made a safe constant,
+ // then there's no danger.
+ NewBO->copyIRFlags(B0);
+ NewBO->andIRFlags(B1);
+ if (DropNSW)
+ NewBO->setHasNoSignedWrap(false);
+ if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
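The core claim of this fold is that the binop commutes with the lane selection. A lane-wise sanity check for add with an arbitrary select mask (plain C++, illustrative values):

    #include <cassert>
    #include <vector>

    // shuffle (add X, C0), (add Y, C1), M  ==
    //   add (shuffle X, Y, M), (shuffle C0, C1, M), checked per lane.
    int main() {
      std::vector<int> X  = {1, 2, 3, 4},     Y  = {5, 6, 7, 8};
      std::vector<int> C0 = {10, 20, 30, 40}, C1 = {50, 60, 70, 80};
      std::vector<int> M  = {0, 5, 2, 7};  // each lane selects from op0 or op1
      const int N = 4;
      for (int i = 0; i != N; ++i) {
        int m = M[i];
        int lhs    = m < N ? X[m] + C0[m] : Y[m - N] + C1[m - N];
        int shufXY = m < N ? X[m]  : Y[m - N];
        int shufC  = m < N ? C0[m] : C1[m - N];
        assert(lhs == shufXY + shufC);
      }
      return 0;
    }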
+
+/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
+/// Example (little endian):
+/// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
+static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
+ bool IsBigEndian) {
+ // This must be a bitcasted shuffle of 1 vector integer operand.
+ Type *DestType = Shuf.getType();
+ Value *X;
+ if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
+ !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
+ return nullptr;
+
+ // The source type must have the same number of elements as the shuffle,
+ // and the source element type must be larger than the shuffle element type.
+ Type *SrcType = X->getType();
+ if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
cast<FixedVectorType>(SrcType)->getNumElements() !=
cast<FixedVectorType>(DestType)->getNumElements() ||
- SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
- return nullptr;
-
- assert(Shuf.changesLength() && !Shuf.increasesLength() &&
- "Expected a shuffle that decreases length");
-
- // Last, check that the mask chooses the correct low bits for each narrow
- // element in the result.
- uint64_t TruncRatio =
- SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
- if (Mask[i] == UndefMaskElem)
- continue;
- uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
+ SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
+ return nullptr;
+
+ assert(Shuf.changesLength() && !Shuf.increasesLength() &&
+ "Expected a shuffle that decreases length");
+
+ // Last, check that the mask chooses the correct low bits for each narrow
+ // element in the result.
+ uint64_t TruncRatio =
+ SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == UndefMaskElem)
+ continue;
+ uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
assert(LSBIndex <= INT32_MAX && "Overflowed 32-bits");
- if (Mask[i] != (int)LSBIndex)
- return nullptr;
- }
-
- return new TruncInst(X, DestType);
-}
-
-/// Match a shuffle-select-shuffle pattern where the shuffles are widening and
-/// narrowing (concatenating with undef and extracting back to the original
-/// length). This allows replacing the wide select with a narrow select.
-static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
- InstCombiner::BuilderTy &Builder) {
- // This must be a narrowing identity shuffle. It extracts the 1st N elements
- // of the 1st vector operand of a shuffle.
- if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract())
- return nullptr;
-
- // The vector being shuffled must be a vector select that we can eliminate.
- // TODO: The one-use requirement could be eased if X and/or Y are constants.
- Value *Cond, *X, *Y;
- if (!match(Shuf.getOperand(0),
- m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y)))))
- return nullptr;
-
- // We need a narrow condition value. It must be extended with undef elements
- // and have the same number of elements as this shuffle.
+ if (Mask[i] != (int)LSBIndex)
+ return nullptr;
+ }
+
+ return new TruncInst(X, DestType);
+}
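The endian-dependent index check above matches the <4 x i16> to <8 x i8> example from the function comment. A standalone sketch that recomputes the expected mask for both byte orders (illustrative names):

    #include <cassert>
    #include <vector>

    // Which narrow (i8) element holds the low byte of each wide (i16) element
    // after a bitcast. TruncRatio = 16 / 8 = 2.
    std::vector<int> lowBitsMask(unsigned NumWideElts, unsigned TruncRatio,
                                 bool IsBigEndian) {
      std::vector<int> Mask;
      for (unsigned i = 0; i != NumWideElts; ++i)
        Mask.push_back(IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio);
      return Mask;
    }

    int main() {
      // Little endian: the low byte of each i16 is the first byte: 0,2,4,6.
      assert((lowBitsMask(4, 2, false) == std::vector<int>{0, 2, 4, 6}));
      // Big endian: the low byte of each i16 is the second byte: 1,3,5,7.
      assert((lowBitsMask(4, 2, true) == std::vector<int>{1, 3, 5, 7}));
      return 0;
    }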
+
+/// Match a shuffle-select-shuffle pattern where the shuffles are widening and
+/// narrowing (concatenating with undef and extracting back to the original
+/// length). This allows replacing the wide select with a narrow select.
+static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder) {
+ // This must be a narrowing identity shuffle. It extracts the 1st N elements
+ // of the 1st vector operand of a shuffle.
+ if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract())
+ return nullptr;
+
+ // The vector being shuffled must be a vector select that we can eliminate.
+ // TODO: The one-use requirement could be eased if X and/or Y are constants.
+ Value *Cond, *X, *Y;
+ if (!match(Shuf.getOperand(0),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y)))))
+ return nullptr;
+
+ // We need a narrow condition value. It must be extended with undef elements
+ // and have the same number of elements as this shuffle.
unsigned NarrowNumElts =
cast<FixedVectorType>(Shuf.getType())->getNumElements();
- Value *NarrowCond;
- if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Undef()))) ||
+ Value *NarrowCond;
+ if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Undef()))) ||
cast<FixedVectorType>(NarrowCond->getType())->getNumElements() !=
- NarrowNumElts ||
- !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())
- return nullptr;
-
-  // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask -->
- // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask)
+ NarrowNumElts ||
+ !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())
+ return nullptr;
+
+  // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask -->
+ // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask)
Value *NarrowX = Builder.CreateShuffleVector(X, Shuf.getShuffleMask());
Value *NarrowY = Builder.CreateShuffleVector(Y, Shuf.getShuffleMask());
- return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
-}
-
-/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
-static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
- Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
- if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
- return nullptr;
-
- Value *X, *Y;
- ArrayRef<int> Mask;
- if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
- return nullptr;
-
- // Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
- // then combining may result in worse codegen.
- if (!Op0->hasOneUse())
- return nullptr;
-
- // We are extracting a subvector from a shuffle. Remove excess elements from
- // the 1st shuffle mask to eliminate the extract.
- //
- // This transform is conservatively limited to identity extracts because we do
- // not allow arbitrary shuffle mask creation as a target-independent transform
- // (because we can't guarantee that will lower efficiently).
- //
- // If the extracting shuffle has an undef mask element, it transfers to the
- // new shuffle mask. Otherwise, copy the original mask element. Example:
- // shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
- // shuf X, Y, <C0, undef, C2, undef>
+ return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
+}
+
+/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
+static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
+ return nullptr;
+
+ Value *X, *Y;
+ ArrayRef<int> Mask;
+ if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
+ return nullptr;
+
+ // Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
+ // then combining may result in worse codegen.
+ if (!Op0->hasOneUse())
+ return nullptr;
+
+ // We are extracting a subvector from a shuffle. Remove excess elements from
+ // the 1st shuffle mask to eliminate the extract.
+ //
+ // This transform is conservatively limited to identity extracts because we do
+ // not allow arbitrary shuffle mask creation as a target-independent transform
+ // (because we can't guarantee that will lower efficiently).
+ //
+ // If the extracting shuffle has an undef mask element, it transfers to the
+ // new shuffle mask. Otherwise, copy the original mask element. Example:
+ // shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
+ // shuf X, Y, <C0, undef, C2, undef>
unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumElts);
- assert(NumElts < Mask.size() &&
- "Identity with extract must have less elements than its inputs");
-
- for (unsigned i = 0; i != NumElts; ++i) {
- int ExtractMaskElt = Shuf.getMaskValue(i);
- int MaskElt = Mask[i];
- NewMask[i] = ExtractMaskElt == UndefMaskElem ? ExtractMaskElt : MaskElt;
- }
- return new ShuffleVectorInst(X, Y, NewMask);
-}
-
-/// Try to replace a shuffle with an insertelement or try to replace a shuffle
-/// operand with the operand of an insertelement.
-static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
+ SmallVector<int, 16> NewMask(NumElts);
+ assert(NumElts < Mask.size() &&
+ "Identity with extract must have less elements than its inputs");
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int ExtractMaskElt = Shuf.getMaskValue(i);
+ int MaskElt = Mask[i];
+ NewMask[i] = ExtractMaskElt == UndefMaskElem ? ExtractMaskElt : MaskElt;
+ }
+ return new ShuffleVectorInst(X, Y, NewMask);
+}
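The mask merge above only has two cases per lane: keep the inner mask element, or keep the outer undef. A minimal model of that merge, mirroring the comment's example with concrete indices (illustrative names):

    #include <cassert>
    #include <vector>

    // The outer (extracting) shuffle is an identity-with-extract, so lane i of
    // the result is inner-mask element i, except that an undef (-1) in the
    // outer mask stays undef.
    std::vector<int> mergeExtractMask(const std::vector<int> &OuterMask,
                                      const std::vector<int> &InnerMask) {
      std::vector<int> NewMask(OuterMask.size());
      for (size_t i = 0; i != OuterMask.size(); ++i)
        NewMask[i] = OuterMask[i] == -1 ? -1 : InnerMask[i];
      return NewMask;
    }

    int main() {
      // shuf (shuf X, Y, <6, 5, 4, -1, 2>), undef, <0, -1, 2, 3>
      //   --> shuf X, Y, <6, -1, 4, -1>
      std::vector<int> NewMask =
          mergeExtractMask({0, -1, 2, 3}, {6, 5, 4, -1, 2});
      assert((NewMask == std::vector<int>{6, -1, 4, -1}));
      return 0;
    }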
+
+/// Try to replace a shuffle with an insertelement or try to replace a shuffle
+/// operand with the operand of an insertelement.
+static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
InstCombinerImpl &IC) {
- Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
- SmallVector<int, 16> Mask;
- Shuf.getShuffleMask(Mask);
-
- // The shuffle must not change vector sizes.
- // TODO: This restriction could be removed if the insert has only one use
- // (because the transform would require a new length-changing shuffle).
- int NumElts = Mask.size();
+ Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
+ SmallVector<int, 16> Mask;
+ Shuf.getShuffleMask(Mask);
+
+ // The shuffle must not change vector sizes.
+ // TODO: This restriction could be removed if the insert has only one use
+ // (because the transform would require a new length-changing shuffle).
+ int NumElts = Mask.size();
if (NumElts != (int)(cast<FixedVectorType>(V0->getType())->getNumElements()))
- return nullptr;
-
- // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
- // not be able to handle it there if the insertelement has >1 use.
- // If the shuffle has an insertelement operand but does not choose the
- // inserted scalar element from that value, then we can replace that shuffle
- // operand with the source vector of the insertelement.
- Value *X;
- uint64_t IdxC;
- if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
- // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
+ return nullptr;
+
+ // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
+ // not be able to handle it there if the insertelement has >1 use.
+ // If the shuffle has an insertelement operand but does not choose the
+ // inserted scalar element from that value, then we can replace that shuffle
+ // operand with the source vector of the insertelement.
+ Value *X;
+ uint64_t IdxC;
+ if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+ // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
if (!is_contained(Mask, (int)IdxC))
- return IC.replaceOperand(Shuf, 0, X);
- }
- if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
- // Offset the index constant by the vector width because we are checking for
- // accesses to the 2nd vector input of the shuffle.
- IdxC += NumElts;
- // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
+ return IC.replaceOperand(Shuf, 0, X);
+ }
+ if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+ // Offset the index constant by the vector width because we are checking for
+ // accesses to the 2nd vector input of the shuffle.
+ IdxC += NumElts;
+ // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
if (!is_contained(Mask, (int)IdxC))
- return IC.replaceOperand(Shuf, 1, X);
- }
-
- // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
- auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
- // We need an insertelement with a constant index.
- if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),
- m_ConstantInt(IndexC))))
- return false;
-
- // Test the shuffle mask to see if it splices the inserted scalar into the
- // operand 1 vector of the shuffle.
- int NewInsIndex = -1;
- for (int i = 0; i != NumElts; ++i) {
- // Ignore undef mask elements.
- if (Mask[i] == -1)
- continue;
-
- // The shuffle takes elements of operand 1 without lane changes.
- if (Mask[i] == NumElts + i)
- continue;
-
- // The shuffle must choose the inserted scalar exactly once.
- if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
- return false;
-
- // The shuffle is placing the inserted scalar into element i.
- NewInsIndex = i;
- }
-
- assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
-
- // Index is updated to the potentially translated insertion lane.
- IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
- return true;
- };
-
- // If the shuffle is unnecessary, insert the scalar operand directly into
- // operand 1 of the shuffle. Example:
- // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
- Value *Scalar;
- ConstantInt *IndexC;
- if (isShufflingScalarIntoOp1(Scalar, IndexC))
- return InsertElementInst::Create(V1, Scalar, IndexC);
-
- // Try again after commuting shuffle. Example:
- // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
- // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
- std::swap(V0, V1);
- ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
- if (isShufflingScalarIntoOp1(Scalar, IndexC))
- return InsertElementInst::Create(V1, Scalar, IndexC);
-
- return nullptr;
-}
-
-static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
- // Match the operands as identity with padding (also known as concatenation
- // with undef) shuffles of the same source type. The backend is expected to
- // recreate these concatenations from a shuffle of narrow operands.
- auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));
- auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));
- if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||
- !Shuffle1 || !Shuffle1->isIdentityWithPadding())
- return nullptr;
-
- // We limit this transform to power-of-2 types because we expect that the
- // backend can convert the simplified IR patterns to identical nodes as the
- // original IR.
- // TODO: If we can verify the same behavior for arbitrary types, the
- // power-of-2 checks can be removed.
- Value *X = Shuffle0->getOperand(0);
- Value *Y = Shuffle1->getOperand(0);
- if (X->getType() != Y->getType() ||
+ return IC.replaceOperand(Shuf, 1, X);
+ }
+
+ // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
+ auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
+ // We need an insertelement with a constant index.
+ if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),
+ m_ConstantInt(IndexC))))
+ return false;
+
+ // Test the shuffle mask to see if it splices the inserted scalar into the
+ // operand 1 vector of the shuffle.
+ int NewInsIndex = -1;
+ for (int i = 0; i != NumElts; ++i) {
+ // Ignore undef mask elements.
+ if (Mask[i] == -1)
+ continue;
+
+ // The shuffle takes elements of operand 1 without lane changes.
+ if (Mask[i] == NumElts + i)
+ continue;
+
+ // The shuffle must choose the inserted scalar exactly once.
+ if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
+ return false;
+
+ // The shuffle is placing the inserted scalar into element i.
+ NewInsIndex = i;
+ }
+
+ assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
+
+ // Index is updated to the potentially translated insertion lane.
+ IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
+ return true;
+ };
+
+ // If the shuffle is unnecessary, insert the scalar operand directly into
+ // operand 1 of the shuffle. Example:
+ // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
+ Value *Scalar;
+ ConstantInt *IndexC;
+ if (isShufflingScalarIntoOp1(Scalar, IndexC))
+ return InsertElementInst::Create(V1, Scalar, IndexC);
+
+ // Try again after commuting shuffle. Example:
+ // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
+ // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
+ std::swap(V0, V1);
+ ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+ if (isShufflingScalarIntoOp1(Scalar, IndexC))
+ return InsertElementInst::Create(V1, Scalar, IndexC);
+
+ return nullptr;
+}
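The isShufflingScalarIntoOp1 check reduces to a pure mask scan. A standalone model of that scan over plain integer masks (illustrative names), reproducing the example from the comment above:

    #include <cassert>
    #include <vector>

    // The mask must take operand-1 lanes unchanged (NumElts + i) or be undef
    // (-1), except for exactly one lane that reads the scalar inserted at
    // InsertIdx in operand 0. Returns that destination lane, or -1 if the
    // pattern does not match.
    int spliceLane(const std::vector<int> &Mask, int NumElts, int InsertIdx) {
      int NewInsIndex = -1;
      for (int i = 0; i != NumElts; ++i) {
        if (Mask[i] == -1 || Mask[i] == NumElts + i)
          continue;                  // undef, or identity from operand 1
        if (NewInsIndex != -1 || Mask[i] != InsertIdx)
          return -1;                 // a second special lane, or wrong index
        NewInsIndex = i;             // the inserted scalar lands here
      }
      return NewInsIndex;
    }

    int main() {
      // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
      assert(spliceLane({1, 5, 6, 7}, 4, 1) == 0);
      // Mask <1, 5, 1, 7> would need the scalar in two lanes: no single insert.
      assert(spliceLane({1, 5, 1, 7}, 4, 1) == -1);
      return 0;
    }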
+
+static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
+ // Match the operands as identity with padding (also known as concatenation
+ // with undef) shuffles of the same source type. The backend is expected to
+ // recreate these concatenations from a shuffle of narrow operands.
+ auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));
+ auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));
+ if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||
+ !Shuffle1 || !Shuffle1->isIdentityWithPadding())
+ return nullptr;
+
+ // We limit this transform to power-of-2 types because we expect that the
+ // backend can convert the simplified IR patterns to identical nodes as the
+ // original IR.
+ // TODO: If we can verify the same behavior for arbitrary types, the
+ // power-of-2 checks can be removed.
+ Value *X = Shuffle0->getOperand(0);
+ Value *Y = Shuffle1->getOperand(0);
+ if (X->getType() != Y->getType() ||
!isPowerOf2_32(cast<FixedVectorType>(Shuf.getType())->getNumElements()) ||
!isPowerOf2_32(
cast<FixedVectorType>(Shuffle0->getType())->getNumElements()) ||
!isPowerOf2_32(cast<FixedVectorType>(X->getType())->getNumElements()) ||
- isa<UndefValue>(X) || isa<UndefValue>(Y))
- return nullptr;
- assert(isa<UndefValue>(Shuffle0->getOperand(1)) &&
- isa<UndefValue>(Shuffle1->getOperand(1)) &&
- "Unexpected operand for identity shuffle");
-
- // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source
- // operands directly by adjusting the shuffle mask to account for the narrower
- // types:
- // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
+ isa<UndefValue>(X) || isa<UndefValue>(Y))
+ return nullptr;
+ assert(isa<UndefValue>(Shuffle0->getOperand(1)) &&
+ isa<UndefValue>(Shuffle1->getOperand(1)) &&
+ "Unexpected operand for identity shuffle");
+
+ // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source
+ // operands directly by adjusting the shuffle mask to account for the narrower
+ // types:
+ // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
int NarrowElts = cast<FixedVectorType>(X->getType())->getNumElements();
int WideElts = cast<FixedVectorType>(Shuffle0->getType())->getNumElements();
- assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
-
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- SmallVector<int, 16> NewMask(Mask.size(), -1);
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- if (Mask[i] == -1)
- continue;
-
- // If this shuffle is choosing an undef element from 1 of the sources, that
- // element is undef.
- if (Mask[i] < WideElts) {
- if (Shuffle0->getMaskValue(Mask[i]) == -1)
- continue;
- } else {
- if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1)
- continue;
- }
-
- // If this shuffle is choosing from the 1st narrow op, the mask element is
- // the same. If this shuffle is choosing from the 2nd narrow op, the mask
- // element is offset down to adjust for the narrow vector widths.
- if (Mask[i] < WideElts) {
- assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");
- NewMask[i] = Mask[i];
- } else {
- assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");
- NewMask[i] = Mask[i] - (WideElts - NarrowElts);
- }
- }
- return new ShuffleVectorInst(X, Y, NewMask);
-}
-
+ assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
+
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ SmallVector<int, 16> NewMask(Mask.size(), -1);
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == -1)
+ continue;
+
+ // If this shuffle is choosing an undef element from 1 of the sources, that
+ // element is undef.
+ if (Mask[i] < WideElts) {
+ if (Shuffle0->getMaskValue(Mask[i]) == -1)
+ continue;
+ } else {
+ if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1)
+ continue;
+ }
+
+ // If this shuffle is choosing from the 1st narrow op, the mask element is
+ // the same. If this shuffle is choosing from the 2nd narrow op, the mask
+ // element is offset down to adjust for the narrow vector widths.
+ if (Mask[i] < WideElts) {
+ assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");
+ NewMask[i] = Mask[i];
+ } else {
+ assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");
+ NewMask[i] = Mask[i] - (WideElts - NarrowElts);
+ }
+ }
+ return new ShuffleVectorInst(X, Y, NewMask);
+}
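The index remapping above can be modeled without any IR: wide indices either stay as-is, shift down by the padding amount, or turn into undef when they read padding. A standalone sketch with illustrative names:

    #include <cassert>
    #include <vector>

    // The two operands are narrow vectors of NarrowElts elements widened
    // (padded with undef) to WideElts. Wide indices are remapped onto the
    // concatenation of the two narrow operands; lanes that read padding (or
    // are already undef) become undef (-1).
    std::vector<int> remapToNarrow(const std::vector<int> &Mask,
                                   int NarrowElts, int WideElts) {
      std::vector<int> NewMask(Mask.size(), -1);
      for (size_t i = 0; i != Mask.size(); ++i) {
        int M = Mask[i];
        if (M == -1)
          continue;
        if (M < WideElts)                               // from widened X
          NewMask[i] = M < NarrowElts ? M : -1;
        else                                            // from widened Y
          NewMask[i] = (M - WideElts) < NarrowElts
                           ? M - (WideElts - NarrowElts)
                           : -1;
      }
      return NewMask;
    }

    int main() {
      // X, Y are 2-wide, widened to 4-wide; wide mask <0, 4, 1, 6> reads
      // X[0], Y[0], X[1], and a padded (undef) lane of Y.
      std::vector<int> NewMask = remapToNarrow({0, 4, 1, 6}, 2, 4);
      assert((NewMask == std::vector<int>{0, 2, 1, -1}));
      return 0;
    }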
+
Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
- Value *LHS = SVI.getOperand(0);
- Value *RHS = SVI.getOperand(1);
- SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
- if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
- SVI.getType(), ShufQuery))
- return replaceInstUsesWith(SVI, V);
-
+ Value *LHS = SVI.getOperand(0);
+ Value *RHS = SVI.getOperand(1);
+ SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
+ if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
+ SVI.getType(), ShufQuery))
+ return replaceInstUsesWith(SVI, V);
+
// Bail out for scalable vectors
if (isa<ScalableVectorType>(LHS->getType()))
return nullptr;
- // shuffle x, x, mask --> shuffle x, undef, mask'
+ // shuffle x, x, mask --> shuffle x, undef, mask'
unsigned VWidth = cast<FixedVectorType>(SVI.getType())->getNumElements();
unsigned LHSWidth = cast<FixedVectorType>(LHS->getType())->getNumElements();
- ArrayRef<int> Mask = SVI.getShuffleMask();
- Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
-
- // Peek through a bitcasted shuffle operand by scaling the mask. If the
- // simulated shuffle can simplify, then this shuffle is unnecessary:
- // shuf (bitcast X), undef, Mask --> bitcast X'
- // TODO: This could be extended to allow length-changing shuffles.
- // The transform might also be obsoleted if we allowed canonicalization
- // of bitcasted shuffles.
- Value *X;
- if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
- X->getType()->isVectorTy() && VWidth == LHSWidth) {
- // Try to create a scaled mask constant.
+ ArrayRef<int> Mask = SVI.getShuffleMask();
+ Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
+
+ // Peek through a bitcasted shuffle operand by scaling the mask. If the
+ // simulated shuffle can simplify, then this shuffle is unnecessary:
+ // shuf (bitcast X), undef, Mask --> bitcast X'
+ // TODO: This could be extended to allow length-changing shuffles.
+ // The transform might also be obsoleted if we allowed canonicalization
+ // of bitcasted shuffles.
+ Value *X;
+ if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
+ X->getType()->isVectorTy() && VWidth == LHSWidth) {
+ // Try to create a scaled mask constant.
auto *XType = cast<FixedVectorType>(X->getType());
- unsigned XNumElts = XType->getNumElements();
- SmallVector<int, 16> ScaledMask;
- if (XNumElts >= VWidth) {
- assert(XNumElts % VWidth == 0 && "Unexpected vector bitcast");
- narrowShuffleMaskElts(XNumElts / VWidth, Mask, ScaledMask);
- } else {
- assert(VWidth % XNumElts == 0 && "Unexpected vector bitcast");
- if (!widenShuffleMaskElts(VWidth / XNumElts, Mask, ScaledMask))
- ScaledMask.clear();
- }
- if (!ScaledMask.empty()) {
- // If the shuffled source vector simplifies, cast that value to this
- // shuffle's type.
- if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
- ScaledMask, XType, ShufQuery))
- return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
- }
- }
-
- if (LHS == RHS) {
- assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?");
- // Remap any references to RHS to use LHS.
- SmallVector<int, 16> Elts;
- for (unsigned i = 0; i != VWidth; ++i) {
- // Propagate undef elements or force mask to LHS.
- if (Mask[i] < 0)
- Elts.push_back(UndefMaskElem);
- else
- Elts.push_back(Mask[i] % LHSWidth);
- }
- return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
- }
-
- // shuffle undef, x, mask --> shuffle x, undef, mask'
- if (isa<UndefValue>(LHS)) {
- SVI.commute();
- return &SVI;
- }
-
- if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))
- return I;
-
- if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
- return I;
-
- if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
- return I;
-
- if (Instruction *I = narrowVectorSelect(SVI, Builder))
- return I;
-
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
- if (V != &SVI)
- return replaceInstUsesWith(SVI, V);
- return &SVI;
- }
-
- if (Instruction *I = foldIdentityExtractShuffle(SVI))
- return I;
-
- // These transforms have the potential to lose undef knowledge, so they are
- // intentionally placed after SimplifyDemandedVectorElts().
- if (Instruction *I = foldShuffleWithInsert(SVI, *this))
- return I;
- if (Instruction *I = foldIdentityPaddedShuffles(SVI))
- return I;
-
- if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) {
- Value *V = evaluateInDifferentElementOrder(LHS, Mask);
- return replaceInstUsesWith(SVI, V);
- }
-
- // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
- // a non-vector type. We can instead bitcast the original vector followed by
- // an extract of the desired element:
- //
- // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
- // <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // %1 = bitcast <4 x i8> %sroa to i32
- // Becomes:
- // %bc = bitcast <16 x i8> %in to <4 x i32>
- // %ext = extractelement <4 x i32> %bc, i32 0
- //
- // If the shuffle is extracting a contiguous range of values from the input
- // vector then each use which is a bitcast of the extracted size can be
- // replaced. This will work if the vector types are compatible, and the begin
- // index is aligned to a value in the casted vector type. If the begin index
- // isn't aligned then we can shuffle the original vector (keeping the same
- // vector type) before extracting.
- //
- // This code will bail out if the target type is fundamentally incompatible
- // with vectors of the source type.
- //
- // Example of <16 x i8>, target type i32:
- // Index range [4,8): v-----------v Will work.
- // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
- // <16 x i8>: | | | | | | | | | | | | | | | | |
- // <4 x i32>: | | | | |
- // +-----------+-----------+-----------+-----------+
- // Index range [6,10): ^-----------^ Needs an extra shuffle.
- // Target type i40: ^--------------^ Won't work, bail.
- bool MadeChange = false;
- if (isShuffleExtractingFromLHS(SVI, Mask)) {
- Value *V = LHS;
- unsigned MaskElems = Mask.size();
+ unsigned XNumElts = XType->getNumElements();
+ SmallVector<int, 16> ScaledMask;
+ if (XNumElts >= VWidth) {
+ assert(XNumElts % VWidth == 0 && "Unexpected vector bitcast");
+ narrowShuffleMaskElts(XNumElts / VWidth, Mask, ScaledMask);
+ } else {
+ assert(VWidth % XNumElts == 0 && "Unexpected vector bitcast");
+ if (!widenShuffleMaskElts(VWidth / XNumElts, Mask, ScaledMask))
+ ScaledMask.clear();
+ }
+ if (!ScaledMask.empty()) {
+ // If the shuffled source vector simplifies, cast that value to this
+ // shuffle's type.
+ if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
+ ScaledMask, XType, ShufQuery))
+ return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
+ }
+ }
+
+ if (LHS == RHS) {
+ assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?");
+ // Remap any references to RHS to use LHS.
+ SmallVector<int, 16> Elts;
+ for (unsigned i = 0; i != VWidth; ++i) {
+ // Propagate undef elements or force mask to LHS.
+ if (Mask[i] < 0)
+ Elts.push_back(UndefMaskElem);
+ else
+ Elts.push_back(Mask[i] % LHSWidth);
+ }
+ return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
+ }
+
+ // shuffle undef, x, mask --> shuffle x, undef, mask'
+ if (isa<UndefValue>(LHS)) {
+ SVI.commute();
+ return &SVI;
+ }
+
+ if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))
+ return I;
+
+ if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
+ return I;
+
+ if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
+ return I;
+
+ if (Instruction *I = narrowVectorSelect(SVI, Builder))
+ return I;
+
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+ if (V != &SVI)
+ return replaceInstUsesWith(SVI, V);
+ return &SVI;
+ }
+
+ if (Instruction *I = foldIdentityExtractShuffle(SVI))
+ return I;
+
+ // These transforms have the potential to lose undef knowledge, so they are
+ // intentionally placed after SimplifyDemandedVectorElts().
+ if (Instruction *I = foldShuffleWithInsert(SVI, *this))
+ return I;
+ if (Instruction *I = foldIdentityPaddedShuffles(SVI))
+ return I;
+
+ if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) {
+ Value *V = evaluateInDifferentElementOrder(LHS, Mask);
+ return replaceInstUsesWith(SVI, V);
+ }
+
+ // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
+ // a non-vector type. We can instead bitcast the original vector followed by
+ // an extract of the desired element:
+ //
+ // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
+ // <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // %1 = bitcast <4 x i8> %sroa to i32
+ // Becomes:
+ // %bc = bitcast <16 x i8> %in to <4 x i32>
+ // %ext = extractelement <4 x i32> %bc, i32 0
+ //
+ // If the shuffle is extracting a contiguous range of values from the input
+ // vector then each use which is a bitcast of the extracted size can be
+ // replaced. This will work if the vector types are compatible, and the begin
+ // index is aligned to a value in the casted vector type. If the begin index
+ // isn't aligned then we can shuffle the original vector (keeping the same
+ // vector type) before extracting.
+ //
+ // This code will bail out if the target type is fundamentally incompatible
+ // with vectors of the source type.
+ //
+ // Example of <16 x i8>, target type i32:
+ // Index range [4,8): v-----------v Will work.
+ // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ // <16 x i8>: | | | | | | | | | | | | | | | | |
+ // <4 x i32>: | | | | |
+ // +-----------+-----------+-----------+-----------+
+ // Index range [6,10): ^-----------^ Needs an extra shuffle.
+ // Target type i40: ^--------------^ Won't work, bail.
+ bool MadeChange = false;
+ if (isShuffleExtractingFromLHS(SVI, Mask)) {
+ Value *V = LHS;
+ unsigned MaskElems = Mask.size();
auto *SrcTy = cast<FixedVectorType>(V->getType());
- unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedSize();
- unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
- assert(SrcElemBitWidth && "vector elements must have a bitwidth");
- unsigned SrcNumElems = SrcTy->getNumElements();
- SmallVector<BitCastInst *, 8> BCs;
- DenseMap<Type *, Value *> NewBCs;
- for (User *U : SVI.users())
- if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
- if (!BC->use_empty())
- // Only visit bitcasts that weren't previously handled.
- BCs.push_back(BC);
- for (BitCastInst *BC : BCs) {
- unsigned BegIdx = Mask.front();
- Type *TgtTy = BC->getDestTy();
- unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
- if (!TgtElemBitWidth)
- continue;
- unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
- bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
- bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
- if (!VecBitWidthsEqual)
- continue;
- if (!VectorType::isValidElementType(TgtTy))
- continue;
- auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
- if (!BegIsAligned) {
- // Shuffle the input so [0,NumElements) contains the output, and
- // [NumElems,SrcNumElems) is undef.
- SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);
- for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
- ShuffleMask[I] = Idx;
+ unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedSize();
+ unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
+ assert(SrcElemBitWidth && "vector elements must have a bitwidth");
+ unsigned SrcNumElems = SrcTy->getNumElements();
+ SmallVector<BitCastInst *, 8> BCs;
+ DenseMap<Type *, Value *> NewBCs;
+ for (User *U : SVI.users())
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
+ if (!BC->use_empty())
+ // Only visit bitcasts that weren't previously handled.
+ BCs.push_back(BC);
+ for (BitCastInst *BC : BCs) {
+ unsigned BegIdx = Mask.front();
+ Type *TgtTy = BC->getDestTy();
+ unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
+ if (!TgtElemBitWidth)
+ continue;
+ unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
+ bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
+ bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
+ if (!VecBitWidthsEqual)
+ continue;
+ if (!VectorType::isValidElementType(TgtTy))
+ continue;
+ auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
+ if (!BegIsAligned) {
+ // Shuffle the input so [0,NumElements) contains the output, and
+ // [NumElems,SrcNumElems) is undef.
+ SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);
+ for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
+ ShuffleMask[I] = Idx;
V = Builder.CreateShuffleVector(V, ShuffleMask,
- SVI.getName() + ".extract");
- BegIdx = 0;
- }
- unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
- assert(SrcElemsPerTgtElem);
- BegIdx /= SrcElemsPerTgtElem;
- bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
- auto *NewBC =
- BCAlreadyExists
- ? NewBCs[CastSrcTy]
- : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
- if (!BCAlreadyExists)
- NewBCs[CastSrcTy] = NewBC;
- auto *Ext = Builder.CreateExtractElement(
- NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
- // The shufflevector isn't being replaced: the bitcast that used it
- // is. InstCombine will visit the newly-created instructions.
- replaceInstUsesWith(*BC, Ext);
- MadeChange = true;
- }
- }
-
- // If the LHS is a shufflevector itself, see if we can combine it with this
- // one without producing an unusual shuffle.
- // Cases that might be simplified:
- // 1.
- // x1=shuffle(v1,v2,mask1)
- // x=shuffle(x1,undef,mask)
- // ==>
- // x=shuffle(v1,undef,newMask)
- // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
- // 2.
- // x1=shuffle(v1,undef,mask1)
- // x=shuffle(x1,x2,mask)
- // where v1.size() == mask1.size()
- // ==>
- // x=shuffle(v1,x2,newMask)
- // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
- // 3.
- // x2=shuffle(v2,undef,mask2)
- // x=shuffle(x1,x2,mask)
- // where v2.size() == mask2.size()
- // ==>
- // x=shuffle(x1,v2,newMask)
- // newMask[i] = (mask[i] < x1.size())
- // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
- // 4.
- // x1=shuffle(v1,undef,mask1)
- // x2=shuffle(v2,undef,mask2)
- // x=shuffle(x1,x2,mask)
- // where v1.size() == v2.size()
- // ==>
- // x=shuffle(v1,v2,newMask)
- // newMask[i] = (mask[i] < x1.size())
- // ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
- //
- // Here we are really conservative:
- // we are absolutely afraid of producing a shuffle mask not in the input
- // program, because the code gen may not be smart enough to turn a merged
- // shuffle into two specific shuffles: it may produce worse code. As such,
- // we only merge two shuffles if the result is either a splat or one of the
- // input shuffle masks. In this case, merging the shuffles just removes
- // one instruction, which we know is safe. This is good for things like
- // turning: (splat(splat)) -> splat, or
- // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
- ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
- ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
- if (LHSShuffle)
- if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
- LHSShuffle = nullptr;
- if (RHSShuffle)
- if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
- RHSShuffle = nullptr;
- if (!LHSShuffle && !RHSShuffle)
- return MadeChange ? &SVI : nullptr;
-
- Value* LHSOp0 = nullptr;
- Value* LHSOp1 = nullptr;
- Value* RHSOp0 = nullptr;
- unsigned LHSOp0Width = 0;
- unsigned RHSOp0Width = 0;
- if (LHSShuffle) {
- LHSOp0 = LHSShuffle->getOperand(0);
- LHSOp1 = LHSShuffle->getOperand(1);
+ SVI.getName() + ".extract");
+ BegIdx = 0;
+ }
+ unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
+ assert(SrcElemsPerTgtElem);
+ BegIdx /= SrcElemsPerTgtElem;
+ bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
+ auto *NewBC =
+ BCAlreadyExists
+ ? NewBCs[CastSrcTy]
+ : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+ if (!BCAlreadyExists)
+ NewBCs[CastSrcTy] = NewBC;
+ auto *Ext = Builder.CreateExtractElement(
+ NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
+ // The shufflevector isn't being replaced: the bitcast that used it
+ // is. InstCombine will visit the newly-created instructions.
+ replaceInstUsesWith(*BC, Ext);
+ MadeChange = true;
+ }
+ }
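// The index arithmetic behind the shuffle+bitcast -> bitcast+extractelement
// rewrite above, reduced to plain integers and matching the <16 x i8> / i32
// picture in the comment. ExtractPlan and planExtract are illustrative names
// only, not part of the LLVM code.
#include <cassert>

struct ExtractPlan {
  bool Feasible;       // target type tiles the source vector exactly
  bool NeedsShuffle;   // begin index is not aligned to a target element
  unsigned ExtractIdx; // element index into the bitcast vector
};

ExtractPlan planExtract(unsigned VecBits, unsigned SrcElemBits,
                        unsigned TgtElemBits, unsigned BegIdx) {
  ExtractPlan P{false, false, 0};
  unsigned TgtNumElems = VecBits / TgtElemBits;
  if (TgtNumElems * TgtElemBits != VecBits) // e.g. i40 over 128 bits: bail
    return P;
  P.Feasible = true;
  P.NeedsShuffle = (SrcElemBits * BegIdx) % TgtElemBits != 0;
  if (P.NeedsShuffle)
    BegIdx = 0; // after re-shuffling the payload down to element 0
  P.ExtractIdx = BegIdx / (TgtElemBits / SrcElemBits);
  return P;
}

int main() {
  ExtractPlan A = planExtract(128, 8, 32, 4); // bytes [4,8) as i32
  assert(A.Feasible && !A.NeedsShuffle && A.ExtractIdx == 1);
  ExtractPlan B = planExtract(128, 8, 32, 6); // bytes [6,10): extra shuffle
  assert(B.Feasible && B.NeedsShuffle && B.ExtractIdx == 0);
  ExtractPlan C = planExtract(128, 8, 40, 4); // i40 target: incompatible
  assert(!C.Feasible);
}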
+
+ // If the LHS is a shufflevector itself, see if we can combine it with this
+ // one without producing an unusual shuffle.
+ // Cases that might be simplified:
+ // 1.
+ // x1=shuffle(v1,v2,mask1)
+ // x=shuffle(x1,undef,mask)
+ // ==>
+ // x=shuffle(v1,undef,newMask)
+ // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
+ // 2.
+ // x1=shuffle(v1,undef,mask1)
+ // x=shuffle(x1,x2,mask)
+ // where v1.size() == mask1.size()
+ // ==>
+ // x=shuffle(v1,x2,newMask)
+ // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
+ // 3.
+ // x2=shuffle(v2,undef,mask2)
+ // x=shuffle(x1,x2,mask)
+ // where v2.size() == mask2.size()
+ // ==>
+ // x=shuffle(x1,v2,newMask)
+ // newMask[i] = (mask[i] < x1.size())
+ // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
+ // 4.
+ // x1=shuffle(v1,undef,mask1)
+ // x2=shuffle(v2,undef,mask2)
+ // x=shuffle(x1,x2,mask)
+ // where v1.size() == v2.size()
+ // ==>
+ // x=shuffle(v1,v2,newMask)
+ // newMask[i] = (mask[i] < x1.size())
+ // ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
+ //
+ // Here we are really conservative:
+ // we are absolutely afraid of producing a shuffle mask not in the input
+ // program, because the code gen may not be smart enough to turn a merged
+ // shuffle into two specific shuffles: it may produce worse code. As such,
+ // we only merge two shuffles if the result is either a splat or one of the
+ // input shuffle masks. In this case, merging the shuffles just removes
+ // one instruction, which we know is safe. This is good for things like
+ // turning: (splat(splat)) -> splat, or
+ // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
+ ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
+ ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
+ if (LHSShuffle)
+ if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
+ LHSShuffle = nullptr;
+ if (RHSShuffle)
+ if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
+ RHSShuffle = nullptr;
+ if (!LHSShuffle && !RHSShuffle)
+ return MadeChange ? &SVI : nullptr;
+
+ Value* LHSOp0 = nullptr;
+ Value* LHSOp1 = nullptr;
+ Value* RHSOp0 = nullptr;
+ unsigned LHSOp0Width = 0;
+ unsigned RHSOp0Width = 0;
+ if (LHSShuffle) {
+ LHSOp0 = LHSShuffle->getOperand(0);
+ LHSOp1 = LHSShuffle->getOperand(1);
LHSOp0Width = cast<FixedVectorType>(LHSOp0->getType())->getNumElements();
- }
- if (RHSShuffle) {
- RHSOp0 = RHSShuffle->getOperand(0);
+ }
+ if (RHSShuffle) {
+ RHSOp0 = RHSShuffle->getOperand(0);
RHSOp0Width = cast<FixedVectorType>(RHSOp0->getType())->getNumElements();
- }
- Value* newLHS = LHS;
- Value* newRHS = RHS;
- if (LHSShuffle) {
- // case 1
- if (isa<UndefValue>(RHS)) {
- newLHS = LHSOp0;
- newRHS = LHSOp1;
- }
- // case 2 or 4
- else if (LHSOp0Width == LHSWidth) {
- newLHS = LHSOp0;
- }
- }
- // case 3 or 4
- if (RHSShuffle && RHSOp0Width == LHSWidth) {
- newRHS = RHSOp0;
- }
- // case 4
- if (LHSOp0 == RHSOp0) {
- newLHS = LHSOp0;
- newRHS = nullptr;
- }
-
- if (newLHS == LHS && newRHS == RHS)
- return MadeChange ? &SVI : nullptr;
-
- ArrayRef<int> LHSMask;
- ArrayRef<int> RHSMask;
- if (newLHS != LHS)
- LHSMask = LHSShuffle->getShuffleMask();
- if (RHSShuffle && newRHS != RHS)
- RHSMask = RHSShuffle->getShuffleMask();
-
- unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
- SmallVector<int, 16> newMask;
- bool isSplat = true;
- int SplatElt = -1;
- // Create a new mask for the new ShuffleVectorInst so that the new
- // ShuffleVectorInst is equivalent to the original one.
- for (unsigned i = 0; i < VWidth; ++i) {
- int eltMask;
- if (Mask[i] < 0) {
- // This element is an undef value.
- eltMask = -1;
- } else if (Mask[i] < (int)LHSWidth) {
- // This element is from left hand side vector operand.
- //
- // If LHS is going to be replaced (case 1, 2, or 4), calculate the
- // new mask value for the element.
- if (newLHS != LHS) {
- eltMask = LHSMask[Mask[i]];
- // If the value selected is an undef value, explicitly specify it
- // with a -1 mask value.
- if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
- eltMask = -1;
- } else
- eltMask = Mask[i];
- } else {
- // This element is from right hand side vector operand
- //
- // If the value selected is an undef value, explicitly specify it
- // with a -1 mask value. (case 1)
- if (isa<UndefValue>(RHS))
- eltMask = -1;
- // If RHS is going to be replaced (case 3 or 4), calculate the
- // new mask value for the element.
- else if (newRHS != RHS) {
- eltMask = RHSMask[Mask[i]-LHSWidth];
- // If the value selected is an undef value, explicitly specify it
- // with a -1 mask value.
- if (eltMask >= (int)RHSOp0Width) {
- assert(isa<UndefValue>(RHSShuffle->getOperand(1))
-                 && "should have been checked above");
- eltMask = -1;
- }
- } else
- eltMask = Mask[i]-LHSWidth;
-
- // If LHS's width is changed, shift the mask value accordingly.
- // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any
- // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
- // If newRHS == newLHS, we want to remap any references from newRHS to
- // newLHS so that we can properly identify splats that may occur due to
- // obfuscation across the two vectors.
- if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
- eltMask += newLHSWidth;
- }
-
- // Check if this could still be a splat.
- if (eltMask >= 0) {
- if (SplatElt >= 0 && SplatElt != eltMask)
- isSplat = false;
- SplatElt = eltMask;
- }
-
- newMask.push_back(eltMask);
- }
-
- // If the result mask is equal to one of the original shuffle masks,
- // or is a splat, do the replacement.
- if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
- if (!newRHS)
- newRHS = UndefValue::get(newLHS->getType());
+ }
+ Value* newLHS = LHS;
+ Value* newRHS = RHS;
+ if (LHSShuffle) {
+ // case 1
+ if (isa<UndefValue>(RHS)) {
+ newLHS = LHSOp0;
+ newRHS = LHSOp1;
+ }
+ // case 2 or 4
+ else if (LHSOp0Width == LHSWidth) {
+ newLHS = LHSOp0;
+ }
+ }
+ // case 3 or 4
+ if (RHSShuffle && RHSOp0Width == LHSWidth) {
+ newRHS = RHSOp0;
+ }
+ // case 4
+ if (LHSOp0 == RHSOp0) {
+ newLHS = LHSOp0;
+ newRHS = nullptr;
+ }
+
+ if (newLHS == LHS && newRHS == RHS)
+ return MadeChange ? &SVI : nullptr;
+
+ ArrayRef<int> LHSMask;
+ ArrayRef<int> RHSMask;
+ if (newLHS != LHS)
+ LHSMask = LHSShuffle->getShuffleMask();
+ if (RHSShuffle && newRHS != RHS)
+ RHSMask = RHSShuffle->getShuffleMask();
+
+ unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
+ SmallVector<int, 16> newMask;
+ bool isSplat = true;
+ int SplatElt = -1;
+ // Create a new mask for the new ShuffleVectorInst so that the new
+ // ShuffleVectorInst is equivalent to the original one.
+ for (unsigned i = 0; i < VWidth; ++i) {
+ int eltMask;
+ if (Mask[i] < 0) {
+ // This element is an undef value.
+ eltMask = -1;
+ } else if (Mask[i] < (int)LHSWidth) {
+ // This element is from left hand side vector operand.
+ //
+ // If LHS is going to be replaced (case 1, 2, or 4), calculate the
+ // new mask value for the element.
+ if (newLHS != LHS) {
+ eltMask = LHSMask[Mask[i]];
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value.
+ if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
+ eltMask = -1;
+ } else
+ eltMask = Mask[i];
+ } else {
+ // This element is from right hand side vector operand
+ //
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value. (case 1)
+ if (isa<UndefValue>(RHS))
+ eltMask = -1;
+ // If RHS is going to be replaced (case 3 or 4), calculate the
+ // new mask value for the element.
+ else if (newRHS != RHS) {
+ eltMask = RHSMask[Mask[i]-LHSWidth];
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value.
+ if (eltMask >= (int)RHSOp0Width) {
+ assert(isa<UndefValue>(RHSShuffle->getOperand(1))
+                 && "should have been checked above");
+ eltMask = -1;
+ }
+ } else
+ eltMask = Mask[i]-LHSWidth;
+
+ // If LHS's width is changed, shift the mask value accordingly.
+ // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any
+ // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
+ // If newRHS == newLHS, we want to remap any references from newRHS to
+ // newLHS so that we can properly identify splats that may occur due to
+ // obfuscation across the two vectors.
+ if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
+ eltMask += newLHSWidth;
+ }
+
+ // Check if this could still be a splat.
+ if (eltMask >= 0) {
+ if (SplatElt >= 0 && SplatElt != eltMask)
+ isSplat = false;
+ SplatElt = eltMask;
+ }
+
+ newMask.push_back(eltMask);
+ }
+
+ // If the result mask is equal to one of the original shuffle masks,
+ // or is a splat, do the replacement.
+ if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
+ if (!newRHS)
+ newRHS = UndefValue::get(newLHS->getType());
return new ShuffleVectorInst(newLHS, newRHS, newMask);
- }
-
- return MadeChange ? &SVI : nullptr;
-}
+ }
+
+ return MadeChange ? &SVI : nullptr;
+}
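// Illustrative mask folding for the first case handled above: with
// x1 = shuffle(v1, v2, Mask1) and x = shuffle(x1, undef, Mask), the two
// shuffles can merge into one shuffle of v1/v2 with a remapped mask. The
// guard at the end mirrors the conservatism note: only fold when the merged
// mask is a splat or equals one of the existing masks. Sketch only; the
// function below is not the LLVM implementation.
#include <vector>

static bool isSplatMask(const std::vector<int> &M) {
  int Elt = -1;
  for (int V : M) {
    if (V < 0)
      continue;
    if (Elt >= 0 && Elt != V)
      return false;
    Elt = V;
  }
  return true;
}

// Returns true (and fills NewMask) when the merge is considered profitable.
bool mergeShuffleOfShuffle(const std::vector<int> &Mask1,
                           const std::vector<int> &Mask,
                           std::vector<int> &NewMask) {
  for (int M : Mask) {
    // Elements picking undef (negative) or the undef RHS map to -1.
    if (M < 0 || M >= (int)Mask1.size())
      NewMask.push_back(-1);
    else
      NewMask.push_back(Mask1[M]);
  }
  return isSplatMask(NewMask) || NewMask == Mask1 || NewMask == Mask;
}

int main() {
  // splat(splat): shuffle(shuffle(v, undef, {1,1,1,1}), undef, {2,0,3,1})
  // merges to a single splat of element 1, so the fold is allowed.
  std::vector<int> New;
  return mergeShuffleOfShuffle({1, 1, 1, 1}, {2, 0, 3, 1}, New) ? 0 : 1;
}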
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp
index 98006215ef..828fd49524 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1,169 +1,169 @@
-//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// InstructionCombining - Combine instructions to form fewer, simple
-// instructions. This pass does not modify the CFG. This pass is where
-// algebraic simplification happens.
-//
-// This pass combines things like:
-// %Y = add i32 %X, 1
-// %Z = add i32 %Y, 1
-// into:
-// %Z = add i32 %X, 2
-//
-// This is a simple worklist driven algorithm.
-//
-// This pass guarantees that the following canonicalizations are performed on
-// the program:
-// 1. If a binary operator has a constant operand, it is moved to the RHS
-// 2. Bitwise operators with constant operands are always grouped so that
-// shifts are performed first, then or's, then and's, then xor's.
-// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
-// 4. All cmp instructions on boolean values are replaced with logical ops
-// 5. add X, X is represented as (X*2) => (X << 1)
-// 6. Multiplies with a power-of-two constant argument are transformed into
-// shifts.
-// ... etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/InstCombine.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions. This pass does not modify the CFG. This pass is where
+// algebraic simplification happens.
+//
+// This pass combines things like:
+// %Y = add i32 %X, 1
+// %Z = add i32 %Y, 1
+// into:
+// %Z = add i32 %X, 2
+//
+// This is a simple worklist driven algorithm.
+//
+// This pass guarantees that the following canonicalizations are performed on
+// the program:
+// 1. If a binary operator has a constant operand, it is moved to the RHS
+// 2. Bitwise operators with constant operands are always grouped so that
+// shifts are performed first, then or's, then and's, then xor's.
+// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
+// 4. All cmp instructions on boolean values are replaced with logical ops
+// 5. add X, X is represented as (X*2) => (X << 1)
+// 6. Multiplies with a power-of-two constant argument are transformed into
+// shifts.
+// ... etc.
+//
+//===----------------------------------------------------------------------===//
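// Quick numeric check of canonicalizations #5 and #6 from the list above:
// "add X, X" is the same value as "X << 1", and a multiply by a power of two
// is the same as a shift. Unsigned arithmetic keeps wrap-around well defined.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
    assert(X + X == (X << 1));
    assert(X * 8u == (X << 3));
  }
}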
+
+#include "InstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/InstCombine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CBindingWrapping.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
STATISTIC(NumWorklistIterations,
"Number of instruction combining iterations performed");
-STATISTIC(NumCombined , "Number of insts combined");
-STATISTIC(NumConstProp, "Number of constant folds");
-STATISTIC(NumDeadInst , "Number of dead inst eliminated");
-STATISTIC(NumSunkInst , "Number of instructions sunk");
-STATISTIC(NumExpand, "Number of expansions");
-STATISTIC(NumFactor , "Number of factorizations");
-STATISTIC(NumReassoc , "Number of reassociations");
-DEBUG_COUNTER(VisitCounter, "instcombine-visit",
- "Controls which instructions are visited");
-
+STATISTIC(NumCombined , "Number of insts combined");
+STATISTIC(NumConstProp, "Number of constant folds");
+STATISTIC(NumDeadInst , "Number of dead inst eliminated");
+STATISTIC(NumSunkInst , "Number of instructions sunk");
+STATISTIC(NumExpand, "Number of expansions");
+STATISTIC(NumFactor , "Number of factorizations");
+STATISTIC(NumReassoc , "Number of reassociations");
+DEBUG_COUNTER(VisitCounter, "instcombine-visit",
+ "Controls which instructions are visited");
+
// FIXME: these limits eventually should be as low as 2.
-static constexpr unsigned InstCombineDefaultMaxIterations = 1000;
+static constexpr unsigned InstCombineDefaultMaxIterations = 1000;
#ifndef NDEBUG
static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 100;
#else
-static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000;
+static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000;
#endif
-
-static cl::opt<bool>
-EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
- cl::init(true));
-
-static cl::opt<unsigned> LimitMaxIterations(
- "instcombine-max-iterations",
- cl::desc("Limit the maximum number of instruction combining iterations"),
- cl::init(InstCombineDefaultMaxIterations));
-
-static cl::opt<unsigned> InfiniteLoopDetectionThreshold(
- "instcombine-infinite-loop-threshold",
- cl::desc("Number of instruction combining iterations considered an "
- "infinite loop"),
- cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden);
-
-static cl::opt<unsigned>
-MaxArraySize("instcombine-maxarray-size", cl::init(1024),
- cl::desc("Maximum array size considered when doing a combine"));
-
-// FIXME: Remove this flag when it is no longer necessary to convert
-// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
-// increases variable availability at the cost of accuracy. Variables that
-// cannot be promoted by mem2reg or SROA will be described as living in memory
-// for their entire lifetime. However, passes like DSE and instcombine can
-// delete stores to the alloca, leading to misleading and inaccurate debug
-// information. This flag can be removed when those passes are fixed.
-static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
- cl::Hidden, cl::init(true));
-
+
+static cl::opt<bool>
+EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
+ cl::init(true));
+
+static cl::opt<unsigned> LimitMaxIterations(
+ "instcombine-max-iterations",
+ cl::desc("Limit the maximum number of instruction combining iterations"),
+ cl::init(InstCombineDefaultMaxIterations));
+
+static cl::opt<unsigned> InfiniteLoopDetectionThreshold(
+ "instcombine-infinite-loop-threshold",
+ cl::desc("Number of instruction combining iterations considered an "
+ "infinite loop"),
+ cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden);
+
+static cl::opt<unsigned>
+MaxArraySize("instcombine-maxarray-size", cl::init(1024),
+ cl::desc("Maximum array size considered when doing a combine"));
+
+// FIXME: Remove this flag when it is no longer necessary to convert
+// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
+// increases variable availability at the cost of accuracy. Variables that
+// cannot be promoted by mem2reg or SROA will be described as living in memory
+// for their entire lifetime. However, passes like DSE and instcombine can
+// delete stores to the alloca, leading to misleading and inaccurate debug
+// information. This flag can be removed when those passes are fixed.
+static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
+ cl::Hidden, cl::init(true));
+
Optional<Instruction *>
InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) {
// Handle target specific intrinsics
@@ -199,677 +199,677 @@ Optional<Value *> InstCombiner::targetSimplifyDemandedVectorEltsIntrinsic(
}
Value *InstCombinerImpl::EmitGEPOffset(User *GEP) {
- return llvm::EmitGEPOffset(&Builder, DL, GEP);
-}
-
-/// Return true if it is desirable to convert an integer computation from a
-/// given bit width to a new bit width.
-/// We don't want to convert from a legal to an illegal type or from a smaller
-/// to a larger illegal type. A width of '1' is always treated as a legal type
-/// because i1 is a fundamental type in IR, and there are many specialized
-/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
-/// legal to convert to, in order to open up more combining opportunities.
-/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
-/// from frontend languages.
+ return llvm::EmitGEPOffset(&Builder, DL, GEP);
+}
+
+/// Return true if it is desirable to convert an integer computation from a
+/// given bit width to a new bit width.
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. A width of '1' is always treated as a legal type
+/// because i1 is a fundamental type in IR, and there are many specialized
+/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
+/// legal to convert to, in order to open up more combining opportunities.
+/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
+/// from frontend languages.
bool InstCombinerImpl::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
- bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
- bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
-
- // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
- // shrink types, to prevent infinite loops.
- if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
- return true;
-
- // If this is a legal integer from type, and the result would be an illegal
- // type, don't do the transformation.
- if (FromLegal && !ToLegal)
- return false;
-
- // Otherwise, if both are illegal, do not increase the size of the result. We
- // do allow things like i160 -> i64, but not i64 -> i160.
- if (!FromLegal && !ToLegal && ToWidth > FromWidth)
- return false;
-
- return true;
-}
-
-/// Return true if it is desirable to convert a computation from 'From' to 'To'.
-/// We don't want to convert from a legal to an illegal type or from a smaller
-/// to a larger illegal type. i1 is always treated as a legal type because it is
-/// a fundamental type in IR, and there are many specialized optimizations for
-/// i1 types.
+ bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
+ bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
+
+ // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
+ // shrink types, to prevent infinite loops.
+ if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
+ return true;
+
+ // If this is a legal integer from type, and the result would be an illegal
+ // type, don't do the transformation.
+ if (FromLegal && !ToLegal)
+ return false;
+
+ // Otherwise, if both are illegal, do not increase the size of the result. We
+ // do allow things like i160 -> i64, but not i64 -> i160.
+ if (!FromLegal && !ToLegal && ToWidth > FromWidth)
+ return false;
+
+ return true;
+}
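// The width rules above restated as a standalone predicate. isLegal() stands
// in for DataLayout::isLegalInteger() and assumes a typical 64-bit target's
// legal widths; it is an illustration, not the real query.
#include <cassert>

static bool isLegal(unsigned W) {
  return W == 8 || W == 16 || W == 32 || W == 64;
}

bool shouldChangeWidth(unsigned FromW, unsigned ToW) {
  bool FromLegal = FromW == 1 || isLegal(FromW);
  bool ToLegal = ToW == 1 || isLegal(ToW);
  // Always allow shrinking to the common widths 8/16/32.
  if (ToW < FromW && (ToW == 8 || ToW == 16 || ToW == 32))
    return true;
  // Never turn a legal type into an illegal one.
  if (FromLegal && !ToLegal)
    return false;
  // Between two illegal types, never grow.
  if (!FromLegal && !ToLegal && ToW > FromW)
    return false;
  return true;
}

int main() {
  assert(shouldChangeWidth(33, 32));   // shrink an odd width to a common one
  assert(!shouldChangeWidth(32, 33));  // legal -> illegal: refuse
  assert(shouldChangeWidth(160, 64));  // i160 -> i64: shrinking, allowed
  assert(!shouldChangeWidth(64, 160)); // i64 -> i160: refuse
}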
+
+/// Return true if it is desirable to convert a computation from 'From' to 'To'.
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. i1 is always treated as a legal type because it is
+/// a fundamental type in IR, and there are many specialized optimizations for
+/// i1 types.
bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const {
- // TODO: This could be extended to allow vectors. Datalayout changes might be
- // needed to properly support that.
- if (!From->isIntegerTy() || !To->isIntegerTy())
- return false;
-
- unsigned FromWidth = From->getPrimitiveSizeInBits();
- unsigned ToWidth = To->getPrimitiveSizeInBits();
- return shouldChangeType(FromWidth, ToWidth);
-}
-
-// Return true if No Signed Wrap should be maintained for I.
-// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
-// where both B and C should be ConstantInts, results in a constant that does
-// not overflow. This function only handles the Add and Sub opcodes. For
-// all other opcodes, the function conservatively returns false.
-static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
- auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
- if (!OBO || !OBO->hasNoSignedWrap())
- return false;
-
-  // We reason about Add and Sub only.
- Instruction::BinaryOps Opcode = I.getOpcode();
- if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
- return false;
-
- const APInt *BVal, *CVal;
- if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
- return false;
-
- bool Overflow = false;
- if (Opcode == Instruction::Add)
- (void)BVal->sadd_ov(*CVal, Overflow);
- else
- (void)BVal->ssub_ov(*CVal, Overflow);
-
- return !Overflow;
-}
-
-static bool hasNoUnsignedWrap(BinaryOperator &I) {
- auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
- return OBO && OBO->hasNoUnsignedWrap();
-}
-
-static bool hasNoSignedWrap(BinaryOperator &I) {
- auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
- return OBO && OBO->hasNoSignedWrap();
-}
-
-/// Conservatively clears subclassOptionalData after a reassociation or
-/// commutation. We preserve fast-math flags when applicable as they can be
-/// preserved.
-static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
- FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
- if (!FPMO) {
- I.clearSubclassOptionalData();
- return;
- }
-
- FastMathFlags FMF = I.getFastMathFlags();
- I.clearSubclassOptionalData();
- I.setFastMathFlags(FMF);
-}
-
-/// Combine constant operands of associative operations either before or after a
-/// cast to eliminate one of the associative operations:
-/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
-/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
+ // TODO: This could be extended to allow vectors. Datalayout changes might be
+ // needed to properly support that.
+ if (!From->isIntegerTy() || !To->isIntegerTy())
+ return false;
+
+ unsigned FromWidth = From->getPrimitiveSizeInBits();
+ unsigned ToWidth = To->getPrimitiveSizeInBits();
+ return shouldChangeType(FromWidth, ToWidth);
+}
+
+// Return true if No Signed Wrap should be maintained for I.
+// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
+// where both B and C should be ConstantInts, results in a constant that does
+// not overflow. This function only handles the Add and Sub opcodes. For
+// all other opcodes, the function conservatively returns false.
+static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
+ auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ if (!OBO || !OBO->hasNoSignedWrap())
+ return false;
+
+  // We reason about Add and Sub only.
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
+ return false;
+
+ const APInt *BVal, *CVal;
+ if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
+ return false;
+
+ bool Overflow = false;
+ if (Opcode == Instruction::Add)
+ (void)BVal->sadd_ov(*CVal, Overflow);
+ else
+ (void)BVal->ssub_ov(*CVal, Overflow);
+
+ return !Overflow;
+}
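// A reduced model of the nsw-preservation test above: the flag survives only
// if folding the two constants cannot itself overflow. int64_t stands in for
// APInt here; the Sub case would use the analogous checked subtraction
// (APInt::ssub_ov in the real code).
#include <cassert>
#include <cstdint>
#include <limits>

bool keepNSWOnAdd(int64_t B, int64_t C) {
  // Equivalent to APInt::sadd_ov reporting "no overflow".
  if (C > 0 && B > std::numeric_limits<int64_t>::max() - C)
    return false;
  if (C < 0 && B < std::numeric_limits<int64_t>::min() - C)
    return false;
  return true;
}

int main() {
  assert(keepNSWOnAdd(40, 2));                                   // fine
  assert(!keepNSWOnAdd(std::numeric_limits<int64_t>::max(), 1)); // would wrap
}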
+
+static bool hasNoUnsignedWrap(BinaryOperator &I) {
+ auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ return OBO && OBO->hasNoUnsignedWrap();
+}
+
+static bool hasNoSignedWrap(BinaryOperator &I) {
+ auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ return OBO && OBO->hasNoSignedWrap();
+}
+
+/// Conservatively clears subclassOptionalData after a reassociation or
+/// commutation. We preserve fast-math flags when applicable as they can be
+/// preserved.
+static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
+ FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+ if (!FPMO) {
+ I.clearSubclassOptionalData();
+ return;
+ }
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ I.clearSubclassOptionalData();
+ I.setFastMathFlags(FMF);
+}
+
+/// Combine constant operands of associative operations either before or after a
+/// cast to eliminate one of the associative operations:
+/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
+/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1,
InstCombinerImpl &IC) {
- auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
- if (!Cast || !Cast->hasOneUse())
- return false;
-
- // TODO: Enhance logic for other casts and remove this check.
- auto CastOpcode = Cast->getOpcode();
- if (CastOpcode != Instruction::ZExt)
- return false;
-
- // TODO: Enhance logic for other BinOps and remove this check.
- if (!BinOp1->isBitwiseLogicOp())
- return false;
-
- auto AssocOpcode = BinOp1->getOpcode();
- auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
- if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
- return false;
-
- Constant *C1, *C2;
- if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
- !match(BinOp2->getOperand(1), m_Constant(C2)))
- return false;
-
- // TODO: This assumes a zext cast.
- // Eg, if it was a trunc, we'd cast C1 to the source type because casting C2
- // to the destination type might lose bits.
-
- // Fold the constants together in the destination type:
- // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
- Type *DestTy = C1->getType();
- Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
- Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
- IC.replaceOperand(*Cast, 0, BinOp2->getOperand(0));
- IC.replaceOperand(*BinOp1, 1, FoldedC);
- return true;
-}
-
-/// This performs a few simplifications for operators that are associative or
-/// commutative:
-///
-/// Commutative operators:
-///
-/// 1. Order operands such that they are listed from right (least complex) to
-/// left (most complex). This puts constants before unary operators before
-/// binary operators.
-///
-/// Associative operators:
-///
-/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
-/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
-///
-/// Associative and commutative operators:
-///
-/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
-/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
-/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
-/// if C1 and C2 are constants.
+ auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
+ if (!Cast || !Cast->hasOneUse())
+ return false;
+
+ // TODO: Enhance logic for other casts and remove this check.
+ auto CastOpcode = Cast->getOpcode();
+ if (CastOpcode != Instruction::ZExt)
+ return false;
+
+ // TODO: Enhance logic for other BinOps and remove this check.
+ if (!BinOp1->isBitwiseLogicOp())
+ return false;
+
+ auto AssocOpcode = BinOp1->getOpcode();
+ auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
+ if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
+ return false;
+
+ Constant *C1, *C2;
+ if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
+ !match(BinOp2->getOperand(1), m_Constant(C2)))
+ return false;
+
+ // TODO: This assumes a zext cast.
+ // Eg, if it was a trunc, we'd cast C1 to the source type because casting C2
+ // to the destination type might lose bits.
+
+ // Fold the constants together in the destination type:
+ // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
+ Type *DestTy = C1->getType();
+ Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
+ Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
+ IC.replaceOperand(*Cast, 0, BinOp2->getOperand(0));
+ IC.replaceOperand(*BinOp1, 1, FoldedC);
+ return true;
+}
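// Scalar illustration of the fold above for the zext + 'and' case:
// (zext(X & C2)) & C1 equals (zext X) & (C1 & zext C2), so the two constants
// can be merged in the wider type and one 'and' disappears. Plain integer
// widths model zext here; Or and Xor behave the same way under zext.
#include <cassert>
#include <cstdint>

int main() {
  uint8_t X = 0xAB;
  uint8_t C2 = 0x0F;            // inner constant, narrow type
  uint32_t C1 = 0xFF00FF0F;     // outer constant, wide type
  uint32_t Before = (uint32_t)(X & C2) & C1;  // (and (zext (and X, C2)), C1)
  uint32_t FoldedC = C1 & (uint32_t)C2;       // constants folded in the wide type
  uint32_t After = (uint32_t)X & FoldedC;     // (and (zext X), FoldedC)
  assert(Before == After);
}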
+
+/// This performs a few simplifications for operators that are associative or
+/// commutative:
+///
+/// Commutative operators:
+///
+/// 1. Order operands such that they are listed from right (least complex) to
+/// left (most complex). This puts constants before unary operators before
+/// binary operators.
+///
+/// Associative operators:
+///
+/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+///
+/// Associative and commutative operators:
+///
+/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+/// if C1 and C2 are constants.
bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
- Instruction::BinaryOps Opcode = I.getOpcode();
- bool Changed = false;
-
- do {
- // Order operands such that they are listed from right (least complex) to
- // left (most complex). This puts constants before unary operators before
- // binary operators.
- if (I.isCommutative() && getComplexity(I.getOperand(0)) <
- getComplexity(I.getOperand(1)))
- Changed = !I.swapOperands();
-
- BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
- BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
-
- if (I.isAssociative()) {
- // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
- if (Op0 && Op0->getOpcode() == Opcode) {
- Value *A = Op0->getOperand(0);
- Value *B = Op0->getOperand(1);
- Value *C = I.getOperand(1);
-
- // Does "B op C" simplify?
- if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "A op V".
- replaceOperand(I, 0, A);
- replaceOperand(I, 1, V);
- bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0);
- bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0);
-
- // Conservatively clear all optional flags since they may not be
- // preserved by the reassociation. Reset nsw/nuw based on the above
- // analysis.
- ClearSubclassDataAfterReassociation(I);
-
- // Note: this is only valid because SimplifyBinOp doesn't look at
- // the operands to Op0.
- if (IsNUW)
- I.setHasNoUnsignedWrap(true);
-
- if (IsNSW)
- I.setHasNoSignedWrap(true);
-
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
-
- // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
- if (Op1 && Op1->getOpcode() == Opcode) {
- Value *A = I.getOperand(0);
- Value *B = Op1->getOperand(0);
- Value *C = Op1->getOperand(1);
-
- // Does "A op B" simplify?
- if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "V op C".
- replaceOperand(I, 0, V);
- replaceOperand(I, 1, C);
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
- }
-
- if (I.isAssociative() && I.isCommutative()) {
- if (simplifyAssocCastAssoc(&I, *this)) {
- Changed = true;
- ++NumReassoc;
- continue;
- }
-
- // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
- if (Op0 && Op0->getOpcode() == Opcode) {
- Value *A = Op0->getOperand(0);
- Value *B = Op0->getOperand(1);
- Value *C = I.getOperand(1);
-
- // Does "C op A" simplify?
- if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "V op B".
- replaceOperand(I, 0, V);
- replaceOperand(I, 1, B);
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
-
- // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
- if (Op1 && Op1->getOpcode() == Opcode) {
- Value *A = I.getOperand(0);
- Value *B = Op1->getOperand(0);
- Value *C = Op1->getOperand(1);
-
- // Does "C op A" simplify?
- if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "B op V".
- replaceOperand(I, 0, B);
- replaceOperand(I, 1, V);
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
-
- // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
- // if C1 and C2 are constants.
- Value *A, *B;
- Constant *C1, *C2;
- if (Op0 && Op1 &&
- Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
- match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
- match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
- bool IsNUW = hasNoUnsignedWrap(I) &&
- hasNoUnsignedWrap(*Op0) &&
- hasNoUnsignedWrap(*Op1);
- BinaryOperator *NewBO = (IsNUW && Opcode == Instruction::Add) ?
- BinaryOperator::CreateNUW(Opcode, A, B) :
- BinaryOperator::Create(Opcode, A, B);
-
- if (isa<FPMathOperator>(NewBO)) {
- FastMathFlags Flags = I.getFastMathFlags();
- Flags &= Op0->getFastMathFlags();
- Flags &= Op1->getFastMathFlags();
- NewBO->setFastMathFlags(Flags);
- }
- InsertNewInstWith(NewBO, I);
- NewBO->takeName(Op1);
- replaceOperand(I, 0, NewBO);
- replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2));
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- if (IsNUW)
- I.setHasNoUnsignedWrap(true);
-
- Changed = true;
- continue;
- }
- }
-
- // No further simplifications.
- return Changed;
- } while (true);
-}
-
-/// Return whether "X LOp (Y ROp Z)" is always equal to
-/// "(X LOp Y) ROp (X LOp Z)".
-static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
- Instruction::BinaryOps ROp) {
- // X & (Y | Z) <--> (X & Y) | (X & Z)
- // X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
- if (LOp == Instruction::And)
- return ROp == Instruction::Or || ROp == Instruction::Xor;
-
- // X | (Y & Z) <--> (X | Y) & (X | Z)
- if (LOp == Instruction::Or)
- return ROp == Instruction::And;
-
- // X * (Y + Z) <--> (X * Y) + (X * Z)
- // X * (Y - Z) <--> (X * Y) - (X * Z)
- if (LOp == Instruction::Mul)
- return ROp == Instruction::Add || ROp == Instruction::Sub;
-
- return false;
-}
-
-/// Return whether "(X LOp Y) ROp Z" is always equal to
-/// "(X ROp Z) LOp (Y ROp Z)".
-static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
- Instruction::BinaryOps ROp) {
- if (Instruction::isCommutative(ROp))
- return leftDistributesOverRight(ROp, LOp);
-
- // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
- return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);
-
- // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
- // but this requires knowing that the addition does not overflow and other
- // such subtleties.
-}
-
-/// Return the identity value for the given opcode, which can be used to
-/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
-static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
- if (isa<Constant>(V))
- return nullptr;
-
- return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
-}
-
-/// This function predicates factorization using distributive laws. By default,
-/// it just returns the 'Op' inputs. But for special-cases like
-/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
-/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
-/// allow more factorization opportunities.
-static Instruction::BinaryOps
-getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
- Value *&LHS, Value *&RHS) {
- assert(Op && "Expected a binary operator");
- LHS = Op->getOperand(0);
- RHS = Op->getOperand(1);
- if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
- Constant *C;
- if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
- // X << C --> X * (1 << C)
- RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
- return Instruction::Mul;
- }
- // TODO: We can add other conversions e.g. shr => div etc.
- }
- return Op->getOpcode();
-}
-
-/// This tries to simplify binary operations by factorizing out common terms
-/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ bool Changed = false;
+
+ do {
+ // Order operands such that they are listed from right (least complex) to
+ // left (most complex). This puts constants before unary operators before
+ // binary operators.
+ if (I.isCommutative() && getComplexity(I.getOperand(0)) <
+ getComplexity(I.getOperand(1)))
+ Changed = !I.swapOperands();
+
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+ BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
+
+ if (I.isAssociative()) {
+ // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *A = Op0->getOperand(0);
+ Value *B = Op0->getOperand(1);
+ Value *C = I.getOperand(1);
+
+ // Does "B op C" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "A op V".
+ replaceOperand(I, 0, A);
+ replaceOperand(I, 1, V);
+ bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0);
+ bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0);
+
+ // Conservatively clear all optional flags since they may not be
+ // preserved by the reassociation. Reset nsw/nuw based on the above
+ // analysis.
+ ClearSubclassDataAfterReassociation(I);
+
+ // Note: this is only valid because SimplifyBinOp doesn't look at
+ // the operands to Op0.
+ if (IsNUW)
+ I.setHasNoUnsignedWrap(true);
+
+ if (IsNSW)
+ I.setHasNoSignedWrap(true);
+
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+ if (Op1 && Op1->getOpcode() == Opcode) {
+ Value *A = I.getOperand(0);
+ Value *B = Op1->getOperand(0);
+ Value *C = Op1->getOperand(1);
+
+ // Does "A op B" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "V op C".
+ replaceOperand(I, 0, V);
+ replaceOperand(I, 1, C);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+ }
+
+ if (I.isAssociative() && I.isCommutative()) {
+ if (simplifyAssocCastAssoc(&I, *this)) {
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+
+ // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *A = Op0->getOperand(0);
+ Value *B = Op0->getOperand(1);
+ Value *C = I.getOperand(1);
+
+ // Does "C op A" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "V op B".
+ replaceOperand(I, 0, V);
+ replaceOperand(I, 1, B);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+ if (Op1 && Op1->getOpcode() == Opcode) {
+ Value *A = I.getOperand(0);
+ Value *B = Op1->getOperand(0);
+ Value *C = Op1->getOperand(1);
+
+ // Does "C op A" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "B op V".
+ replaceOperand(I, 0, B);
+ replaceOperand(I, 1, V);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+ // if C1 and C2 are constants.
+ Value *A, *B;
+ Constant *C1, *C2;
+ if (Op0 && Op1 &&
+ Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
+ match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
+ match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
+ bool IsNUW = hasNoUnsignedWrap(I) &&
+ hasNoUnsignedWrap(*Op0) &&
+ hasNoUnsignedWrap(*Op1);
+ BinaryOperator *NewBO = (IsNUW && Opcode == Instruction::Add) ?
+ BinaryOperator::CreateNUW(Opcode, A, B) :
+ BinaryOperator::Create(Opcode, A, B);
+
+ if (isa<FPMathOperator>(NewBO)) {
+ FastMathFlags Flags = I.getFastMathFlags();
+ Flags &= Op0->getFastMathFlags();
+ Flags &= Op1->getFastMathFlags();
+ NewBO->setFastMathFlags(Flags);
+ }
+ InsertNewInstWith(NewBO, I);
+ NewBO->takeName(Op1);
+ replaceOperand(I, 0, NewBO);
+ replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2));
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ if (IsNUW)
+ I.setHasNoUnsignedWrap(true);
+
+ Changed = true;
+ continue;
+ }
+ }
+
+ // No further simplifications.
+ return Changed;
+ } while (true);
+}
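// Worked instance of transform #6 above, "(A op C1) op (B op C2)" ==>
// "(A op B) op (C1 op C2)", with op = and and ignoring the one-use and
// wrap-flag bookkeeping: folding the constants first lets the whole
// expression collapse to zero here.
#include <cassert>

unsigned before(unsigned A, unsigned B) { return (A & 0xF0u) & (B & 0x0Fu); }
unsigned after(unsigned A, unsigned B)  { return (A & B) & (0xF0u & 0x0Fu); }

int main() {
  for (unsigned A : {0u, 0xABu, 0xFFu})
    for (unsigned B : {0u, 0xCDu, 0xFFu})
      assert(before(A, B) == after(A, B) && after(A, B) == 0u);
}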
+
+/// Return whether "X LOp (Y ROp Z)" is always equal to
+/// "(X LOp Y) ROp (X LOp Z)".
+static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
+ Instruction::BinaryOps ROp) {
+ // X & (Y | Z) <--> (X & Y) | (X & Z)
+ // X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
+ if (LOp == Instruction::And)
+ return ROp == Instruction::Or || ROp == Instruction::Xor;
+
+ // X | (Y & Z) <--> (X | Y) & (X | Z)
+ if (LOp == Instruction::Or)
+ return ROp == Instruction::And;
+
+ // X * (Y + Z) <--> (X * Y) + (X * Z)
+ // X * (Y - Z) <--> (X * Y) - (X * Z)
+ if (LOp == Instruction::Mul)
+ return ROp == Instruction::Add || ROp == Instruction::Sub;
+
+ return false;
+}
+
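// A standalone spot-check (plain C++, not LLVM code) of the left-distribution
// identities encoded above, on arbitrary sample bit patterns.
static_assert((12u & (10u | 6u)) == ((12u & 10u) | (12u & 6u)),
              "and distributes over or");
static_assert((12u & (10u ^ 6u)) == ((12u & 10u) ^ (12u & 6u)),
              "and distributes over xor");
static_assert((12u | (10u & 6u)) == ((12u | 10u) & (12u | 6u)),
              "or distributes over and");
static_assert(12u * (10u + 6u) == 12u * 10u + 12u * 6u,
              "mul distributes over add");
static_assert(12u * (10u - 6u) == 12u * 10u - 12u * 6u,
              "mul distributes over sub");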
+/// Return whether "(X LOp Y) ROp Z" is always equal to
+/// "(X ROp Z) LOp (Y ROp Z)".
+static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
+ Instruction::BinaryOps ROp) {
+ if (Instruction::isCommutative(ROp))
+ return leftDistributesOverRight(ROp, LOp);
+
+ // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
+ return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);
+
+ // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
+ // but this requires knowing that the addition does not overflow and other
+ // such subtleties.
+}
+
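// A standalone spot-check (plain C++, not LLVM code) of the right-distribution
// case handled above: bitwise logic distributes over a common shift amount.
static_assert(((0xF0u & 0xCCu) >> 2) == ((0xF0u >> 2) & (0xCCu >> 2)),
              "and distributes over lshr");
static_assert(((0xF0u | 0xCCu) >> 2) == ((0xF0u >> 2) | (0xCCu >> 2)),
              "or distributes over lshr");
static_assert(((0xF0u ^ 0xCCu) >> 2) == ((0xF0u >> 2) ^ (0xCCu >> 2)),
              "xor distributes over lshr");
// The TODO above is real: division does not distribute without extra knowledge.
static_assert((1u + 1u) / 2u == 1u && (1u / 2u + 1u / 2u) == 0u,
              "(X + Y)/Z and X/Z + Y/Z can differ");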
+/// This function returns the identity value for the given opcode, which can be
+/// used to factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
+static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
+ if (isa<Constant>(V))
+ return nullptr;
+
+ return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
+}
+
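// A standalone spot-check (plain C++, not LLVM code) of the identity values
// this helper hands back, and of the factoring pattern named in the comment.
static_assert((0x1234u + 0u) == 0x1234u && (0x1234u * 1u) == 0x1234u,
              "0 is the identity of add, 1 the identity of mul");
static_assert((0x1234u | 0u) == 0x1234u && (0x1234u ^ 0u) == 0x1234u,
              "0 is the identity of or and xor");
static_assert((0x1234u & ~0u) == 0x1234u, "all-ones is the identity of and");
static_assert((7u * 2u) + 7u == 7u * (2u + 1u),
              "(X * 2) + X factors as X * (2 + 1) once X is viewed as X * 1");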
+/// This function predicates factorization using distributive laws. By default,
+/// it just returns the 'Op' inputs. But for special cases like
+/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
+/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
+/// allow more factorization opportunities.
+static Instruction::BinaryOps
+getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
+ Value *&LHS, Value *&RHS) {
+ assert(Op && "Expected a binary operator");
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+ if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
+ Constant *C;
+ if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
+ // X << C --> X * (1 << C)
+ RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
+ return Instruction::Mul;
+ }
+ // TODO: We can add other conversions e.g. shr => div etc.
+ }
+ return Op->getOpcode();
+}
+
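// A standalone spot-check (plain C++, not LLVM code) of the shl-to-mul view
// taken above: under add/sub, "X << C" is treated as "X * (1 << C)".
static_assert((3u << 5) == 3u * (1u << 5), "X << 5 equals X * 32");
static_assert((3u << 5) + 3u == 3u * ((1u << 5) + 1u),
              "so add(shl(X, 5), X) can be factored as X * 33");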
+/// This tries to simplify binary operations by factorizing out common terms
+/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
Value *InstCombinerImpl::tryFactorization(BinaryOperator &I,
Instruction::BinaryOps InnerOpcode,
Value *A, Value *B, Value *C,
Value *D) {
- assert(A && B && C && D && "All values must be provided");
-
- Value *V = nullptr;
- Value *SimplifiedInst = nullptr;
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
-
- // Does "X op' Y" always equal "Y op' X"?
- bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
-
- // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
- if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode))
- // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
- // commutative case, "(A op' B) op (C op' A)"?
- if (A == C || (InnerCommutative && A == D)) {
- if (A != C)
- std::swap(C, D);
- // Consider forming "A op' (B op D)".
- // If "B op D" simplifies then it can be formed with no cost.
- V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
- // If "B op D" doesn't simplify then only go on if both of the existing
- // operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && LHS->hasOneUse() && RHS->hasOneUse())
- V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
- if (V) {
- SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
- }
- }
-
- // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
- if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
- // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
- // commutative case, "(A op' B) op (B op' D)"?
- if (B == D || (InnerCommutative && B == C)) {
- if (B != D)
- std::swap(C, D);
- // Consider forming "(A op C) op' B".
- // If "A op C" simplifies then it can be formed with no cost.
- V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
-
- // If "A op C" doesn't simplify then only go on if both of the existing
- // operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && LHS->hasOneUse() && RHS->hasOneUse())
- V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
- if (V) {
- SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
- }
- }
-
- if (SimplifiedInst) {
- ++NumFactor;
- SimplifiedInst->takeName(&I);
-
- // Check if we can add NSW/NUW flags to SimplifiedInst. If so, set them.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
- if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
- bool HasNSW = false;
- bool HasNUW = false;
- if (isa<OverflowingBinaryOperator>(&I)) {
- HasNSW = I.hasNoSignedWrap();
- HasNUW = I.hasNoUnsignedWrap();
- }
-
- if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS)) {
- HasNSW &= LOBO->hasNoSignedWrap();
- HasNUW &= LOBO->hasNoUnsignedWrap();
- }
-
- if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS)) {
- HasNSW &= ROBO->hasNoSignedWrap();
- HasNUW &= ROBO->hasNoUnsignedWrap();
- }
-
- if (TopLevelOpcode == Instruction::Add &&
- InnerOpcode == Instruction::Mul) {
- // We can propagate 'nsw' if we know that
- // %Y = mul nsw i16 %X, C
- // %Z = add nsw i16 %Y, %X
- // =>
- // %Z = mul nsw i16 %X, C+1
- //
- // iff C+1 isn't INT_MIN
- const APInt *CInt;
- if (match(V, m_APInt(CInt))) {
- if (!CInt->isMinSignedValue())
- BO->setHasNoSignedWrap(HasNSW);
- }
-
- // nuw can be propagated with any constant or nuw value.
- BO->setHasNoUnsignedWrap(HasNUW);
- }
- }
- }
- }
- return SimplifiedInst;
-}
-
-/// This tries to simplify binary operations which some other binary operation
-/// distributes over either by factorizing out common terms
-/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in
-/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win).
-/// Returns the simplified value, or null if it didn't simplify.
+ assert(A && B && C && D && "All values must be provided");
+
+ Value *V = nullptr;
+ Value *SimplifiedInst = nullptr;
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
+
+ // Does "X op' Y" always equal "Y op' X"?
+ bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
+
+ // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
+ if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode))
+ // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
+ // commutative case, "(A op' B) op (C op' A)"?
+ if (A == C || (InnerCommutative && A == D)) {
+ if (A != C)
+ std::swap(C, D);
+ // Consider forming "A op' (B op D)".
+ // If "B op D" simplifies then it can be formed with no cost.
+ V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
+ // If "B op D" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
+ if (V) {
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
+ }
+ }
+
+ // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
+ if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+ // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
+ // commutative case, "(A op' B) op (B op' D)"?
+ if (B == D || (InnerCommutative && B == C)) {
+ if (B != D)
+ std::swap(C, D);
+ // Consider forming "(A op C) op' B".
+ // If "A op C" simplifies then it can be formed with no cost.
+ V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
+
+ // If "A op C" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
+ if (V) {
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
+ }
+ }
+
+ if (SimplifiedInst) {
+ ++NumFactor;
+ SimplifiedInst->takeName(&I);
+
+ // Check if we can add NSW/NUW flags to SimplifiedInst. If so, set them.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
+ if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
+ bool HasNSW = false;
+ bool HasNUW = false;
+ if (isa<OverflowingBinaryOperator>(&I)) {
+ HasNSW = I.hasNoSignedWrap();
+ HasNUW = I.hasNoUnsignedWrap();
+ }
+
+ if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS)) {
+ HasNSW &= LOBO->hasNoSignedWrap();
+ HasNUW &= LOBO->hasNoUnsignedWrap();
+ }
+
+ if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS)) {
+ HasNSW &= ROBO->hasNoSignedWrap();
+ HasNUW &= ROBO->hasNoUnsignedWrap();
+ }
+
+ if (TopLevelOpcode == Instruction::Add &&
+ InnerOpcode == Instruction::Mul) {
+ // We can propagate 'nsw' if we know that
+ // %Y = mul nsw i16 %X, C
+ // %Z = add nsw i16 %Y, %X
+ // =>
+ // %Z = mul nsw i16 %X, C+1
+ //
+ // iff C+1 isn't INT_MIN
+ const APInt *CInt;
+ if (match(V, m_APInt(CInt))) {
+ if (!CInt->isMinSignedValue())
+ BO->setHasNoSignedWrap(HasNSW);
+ }
+
+ // nuw can be propagated with any constant or nuw value.
+ BO->setHasNoUnsignedWrap(HasNUW);
+ }
+ }
+ }
+ }
+ return SimplifiedInst;
+}
+
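// A standalone spot-check (plain C++, not LLVM code) of the factorizations
// tryFactorization aims for, on arbitrary sample constants; the nsw/nuw flag
// handling above has no counterpart in plain wrapping arithmetic.
static_assert(6u * 5u + 6u * 9u == 6u * (5u + 9u),
              "common left factor: (A*B)+(A*C) == A*(B+C)");
static_assert(5u * 6u + 9u * 6u == (5u + 9u) * 6u,
              "common right factor: (A*B)+(C*B) == (A+C)*B");
static_assert(4u * 5u + 4u == 4u * (5u + 1u),
              "a lone A joins in via the mul identity: (A*C)+A == A*(C+1)");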
+/// This tries to simplify binary operations which some other binary operation
+/// distributes over either by factorizing out common terms
+/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in
+/// simplifications (e.g. "A & (B | C) -> (A&B) | (A&C)" if this is a win).
+/// Returns the simplified value, or null if it didn't simplify.
Value *InstCombinerImpl::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
- BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
- Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
-
- {
- // Factorization.
- Value *A, *B, *C, *D;
- Instruction::BinaryOps LHSOpcode, RHSOpcode;
- if (Op0)
- LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
- if (Op1)
- RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
-
- // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
- // a common term.
- if (Op0 && Op1 && LHSOpcode == RHSOpcode)
- if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
- return V;
-
- // The instruction has the form "(A op' B) op (C)". Try to factorize common
- // term.
- if (Op0)
- if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
- if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
- return V;
-
- // The instruction has the form "(B) op (C op' D)". Try to factorize common
- // term.
- if (Op1)
- if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
- if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
- return V;
- }
-
- // Expansion.
- if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
- // The instruction has the form "(A op' B) op C". See if expanding it out
- // to "(A op C) op' (B op C)" results in simplifications.
- Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
- Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
-
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
+ BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
+
+ {
+ // Factorization.
+ Value *A, *B, *C, *D;
+ Instruction::BinaryOps LHSOpcode, RHSOpcode;
+ if (Op0)
+ LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
+ if (Op1)
+ RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
+
+ // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
+ // a common term.
+ if (Op0 && Op1 && LHSOpcode == RHSOpcode)
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
+ return V;
+
+ // The instruction has the form "(A op' B) op (C)". Try to factorize common
+ // term.
+ if (Op0)
+ if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
+ return V;
+
+ // The instruction has the form "(B) op (C op' D)". Try to factorize common
+ // term.
+ if (Op1)
+ if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
+ if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
+ return V;
+ }
+
+ // Expansion.
+ if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
+ // The instruction has the form "(A op' B) op C". See if expanding it out
+ // to "(A op C) op' (B op C)" results in simplifications.
+ Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
+ Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+
// Disable the use of undef because it's not safe to distribute undef.
auto SQDistributive = SQ.getWithInstruction(&I).getWithoutUndef();
Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQDistributive);
Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQDistributive);
-
- // Do "A op C" and "B op C" both simplify?
- if (L && R) {
- // They do! Return "L op' R".
- ++NumExpand;
- C = Builder.CreateBinOp(InnerOpcode, L, R);
- C->takeName(&I);
- return C;
- }
-
- // Does "A op C" simplify to the identity value for the inner opcode?
- if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
- // They do! Return "B op C".
- ++NumExpand;
- C = Builder.CreateBinOp(TopLevelOpcode, B, C);
- C->takeName(&I);
- return C;
- }
-
- // Does "B op C" simplify to the identity value for the inner opcode?
- if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
- // They do! Return "A op C".
- ++NumExpand;
- C = Builder.CreateBinOp(TopLevelOpcode, A, C);
- C->takeName(&I);
- return C;
- }
- }
-
- if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
- // The instruction has the form "A op (B op' C)". See if expanding it out
- // to "(A op B) op' (A op C)" results in simplifications.
- Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
- Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
-
+
+ // Do "A op C" and "B op C" both simplify?
+ if (L && R) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ C = Builder.CreateBinOp(InnerOpcode, L, R);
+ C->takeName(&I);
+ return C;
+ }
+
+ // Does "A op C" simplify to the identity value for the inner opcode?
+ if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+ // They do! Return "B op C".
+ ++NumExpand;
+ C = Builder.CreateBinOp(TopLevelOpcode, B, C);
+ C->takeName(&I);
+ return C;
+ }
+
+ // Does "B op C" simplify to the identity value for the inner opcode?
+ if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+ // They do! Return "A op C".
+ ++NumExpand;
+ C = Builder.CreateBinOp(TopLevelOpcode, A, C);
+ C->takeName(&I);
+ return C;
+ }
+ }
+
+ if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
+ // The instruction has the form "A op (B op' C)". See if expanding it out
+ // to "(A op B) op' (A op C)" results in simplifications.
+ Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
+ Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
+
// Disable the use of undef because it's not safe to distribute undef.
auto SQDistributive = SQ.getWithInstruction(&I).getWithoutUndef();
Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQDistributive);
Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQDistributive);
-
- // Do "A op B" and "A op C" both simplify?
- if (L && R) {
- // They do! Return "L op' R".
- ++NumExpand;
- A = Builder.CreateBinOp(InnerOpcode, L, R);
- A->takeName(&I);
- return A;
- }
-
- // Does "A op B" simplify to the identity value for the inner opcode?
- if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
- // They do! Return "A op C".
- ++NumExpand;
- A = Builder.CreateBinOp(TopLevelOpcode, A, C);
- A->takeName(&I);
- return A;
- }
-
- // Does "A op C" simplify to the identity value for the inner opcode?
- if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
- // They do! Return "A op B".
- ++NumExpand;
- A = Builder.CreateBinOp(TopLevelOpcode, A, B);
- A->takeName(&I);
- return A;
- }
- }
-
- return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
-}
-
+
+ // Do "A op B" and "A op C" both simplify?
+ if (L && R) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ A = Builder.CreateBinOp(InnerOpcode, L, R);
+ A->takeName(&I);
+ return A;
+ }
+
+ // Does "A op B" simplify to the identity value for the inner opcode?
+ if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+ // They do! Return "A op C".
+ ++NumExpand;
+ A = Builder.CreateBinOp(TopLevelOpcode, A, C);
+ A->takeName(&I);
+ return A;
+ }
+
+ // Does "A op C" simplify to the identity value for the inner opcode?
+ if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+ // They do! Return "A op B".
+ ++NumExpand;
+ A = Builder.CreateBinOp(TopLevelOpcode, A, B);
+ A->takeName(&I);
+ return A;
+ }
+ }
+
+ return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
+}
+
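// A standalone spot-check (plain C++, not LLVM code) of the expansion cases
// above, on arbitrary sample bit patterns.
static_assert((0xA5u & (0xA5u | 0x0Fu)) == 0xA5u,
              "A & (A | C): both expanded pieces simplify, leaving just A");
static_assert((0xF0u & (0x0Fu | 0x3Cu)) == (0xF0u & 0x3Cu),
              "when A & B folds to 0 (the identity of or), only A & C remains");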
Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
Value *LHS,
Value *RHS) {
- Value *A, *B, *C, *D, *E, *F;
- bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C)));
- bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F)));
- if (!LHSIsSelect && !RHSIsSelect)
- return nullptr;
-
- FastMathFlags FMF;
- BuilderTy::FastMathFlagGuard Guard(Builder);
- if (isa<FPMathOperator>(&I)) {
- FMF = I.getFastMathFlags();
- Builder.setFastMathFlags(FMF);
- }
-
- Instruction::BinaryOps Opcode = I.getOpcode();
- SimplifyQuery Q = SQ.getWithInstruction(&I);
-
- Value *Cond, *True = nullptr, *False = nullptr;
- if (LHSIsSelect && RHSIsSelect && A == D) {
- // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
- Cond = A;
- True = SimplifyBinOp(Opcode, B, E, FMF, Q);
- False = SimplifyBinOp(Opcode, C, F, FMF, Q);
-
- if (LHS->hasOneUse() && RHS->hasOneUse()) {
- if (False && !True)
- True = Builder.CreateBinOp(Opcode, B, E);
- else if (True && !False)
- False = Builder.CreateBinOp(Opcode, C, F);
- }
- } else if (LHSIsSelect && LHS->hasOneUse()) {
- // (A ? B : C) op Y -> A ? (B op Y) : (C op Y)
- Cond = A;
- True = SimplifyBinOp(Opcode, B, RHS, FMF, Q);
- False = SimplifyBinOp(Opcode, C, RHS, FMF, Q);
- } else if (RHSIsSelect && RHS->hasOneUse()) {
- // X op (D ? E : F) -> D ? (X op E) : (X op F)
- Cond = D;
- True = SimplifyBinOp(Opcode, LHS, E, FMF, Q);
- False = SimplifyBinOp(Opcode, LHS, F, FMF, Q);
- }
-
- if (!True || !False)
- return nullptr;
-
- Value *SI = Builder.CreateSelect(Cond, True, False);
- SI->takeName(&I);
- return SI;
-}
-
+ Value *A, *B, *C, *D, *E, *F;
+ bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C)));
+ bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F)));
+ if (!LHSIsSelect && !RHSIsSelect)
+ return nullptr;
+
+ FastMathFlags FMF;
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ if (isa<FPMathOperator>(&I)) {
+ FMF = I.getFastMathFlags();
+ Builder.setFastMathFlags(FMF);
+ }
+
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ SimplifyQuery Q = SQ.getWithInstruction(&I);
+
+ Value *Cond, *True = nullptr, *False = nullptr;
+ if (LHSIsSelect && RHSIsSelect && A == D) {
+ // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
+ Cond = A;
+ True = SimplifyBinOp(Opcode, B, E, FMF, Q);
+ False = SimplifyBinOp(Opcode, C, F, FMF, Q);
+
+ if (LHS->hasOneUse() && RHS->hasOneUse()) {
+ if (False && !True)
+ True = Builder.CreateBinOp(Opcode, B, E);
+ else if (True && !False)
+ False = Builder.CreateBinOp(Opcode, C, F);
+ }
+ } else if (LHSIsSelect && LHS->hasOneUse()) {
+ // (A ? B : C) op Y -> A ? (B op Y) : (C op Y)
+ Cond = A;
+ True = SimplifyBinOp(Opcode, B, RHS, FMF, Q);
+ False = SimplifyBinOp(Opcode, C, RHS, FMF, Q);
+ } else if (RHSIsSelect && RHS->hasOneUse()) {
+ // X op (D ? E : F) -> D ? (X op E) : (X op F)
+ Cond = D;
+ True = SimplifyBinOp(Opcode, LHS, E, FMF, Q);
+ False = SimplifyBinOp(Opcode, LHS, F, FMF, Q);
+ }
+
+ if (!True || !False)
+ return nullptr;
+
+ Value *SI = Builder.CreateSelect(Cond, True, False);
+ SI->takeName(&I);
+ return SI;
+}
+
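// A standalone sketch (plain C++, not LLVM code) of the select-feeding-binop
// rewrite: the op is pushed into both arms of the select. The function names
// and sample values below are made up for illustration only.
constexpr unsigned selThenAdd(bool Cond, unsigned B, unsigned C, unsigned Y) {
  return (Cond ? B : C) + Y;            // "(A ? B : C) op Y"
}
constexpr unsigned addIntoSel(bool Cond, unsigned B, unsigned C, unsigned Y) {
  return Cond ? (B + Y) : (C + Y);      // "A ? (B op Y) : (C op Y)"
}
static_assert(selThenAdd(true, 7u, 9u, 5u) == addIntoSel(true, 7u, 9u, 5u),
              "both forms agree when the condition is true");
static_assert(selThenAdd(false, 7u, 9u, 5u) == addIntoSel(false, 7u, 9u, 5u),
              "and when it is false");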
/// Freely adapt every user of V as-if V was changed to !V.
/// WARNING: only if canFreelyInvertAllUsersOf() said this can be done.
void InstCombinerImpl::freelyInvertAllUsersOf(Value *I) {
@@ -894,288 +894,288 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I) {
}
}
-/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
-/// constant zero (which is the 'negate' form).
+/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
+/// constant zero (which is the 'negate' form).
Value *InstCombinerImpl::dyn_castNegVal(Value *V) const {
- Value *NegV;
- if (match(V, m_Neg(m_Value(NegV))))
- return NegV;
-
- // Constants can be considered to be negated values if they can be folded.
- if (ConstantInt *C = dyn_cast<ConstantInt>(V))
- return ConstantExpr::getNeg(C);
-
- if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
- if (C->getType()->getElementType()->isIntegerTy())
- return ConstantExpr::getNeg(C);
-
- if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
- for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
- Constant *Elt = CV->getAggregateElement(i);
- if (!Elt)
- return nullptr;
-
- if (isa<UndefValue>(Elt))
- continue;
-
- if (!isa<ConstantInt>(Elt))
- return nullptr;
- }
- return ConstantExpr::getNeg(CV);
- }
-
- return nullptr;
-}
-
-static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
- InstCombiner::BuilderTy &Builder) {
- if (auto *Cast = dyn_cast<CastInst>(&I))
- return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
-
- assert(I.isBinaryOp() && "Unexpected opcode for select folding");
-
- // Figure out if the constant is the left or the right argument.
- bool ConstIsRHS = isa<Constant>(I.getOperand(1));
- Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
-
- if (auto *SOC = dyn_cast<Constant>(SO)) {
- if (ConstIsRHS)
- return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
- return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
- }
-
- Value *Op0 = SO, *Op1 = ConstOperand;
- if (!ConstIsRHS)
- std::swap(Op0, Op1);
-
- auto *BO = cast<BinaryOperator>(&I);
- Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
- SO->getName() + ".op");
- auto *FPInst = dyn_cast<Instruction>(RI);
- if (FPInst && isa<FPMathOperator>(FPInst))
- FPInst->copyFastMathFlags(BO);
- return RI;
-}
-
+ Value *NegV;
+ if (match(V, m_Neg(m_Value(NegV))))
+ return NegV;
+
+ // Constants can be considered to be negated values if they can be folded.
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+ return ConstantExpr::getNeg(C);
+
+ if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
+ if (C->getType()->getElementType()->isIntegerTy())
+ return ConstantExpr::getNeg(C);
+
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+ for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+ Constant *Elt = CV->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ if (!isa<ConstantInt>(Elt))
+ return nullptr;
+ }
+ return ConstantExpr::getNeg(CV);
+ }
+
+ return nullptr;
+}
+
+static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
+ InstCombiner::BuilderTy &Builder) {
+ if (auto *Cast = dyn_cast<CastInst>(&I))
+ return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
+
+ assert(I.isBinaryOp() && "Unexpected opcode for select folding");
+
+ // Figure out if the constant is the left or the right argument.
+ bool ConstIsRHS = isa<Constant>(I.getOperand(1));
+ Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
+
+ if (auto *SOC = dyn_cast<Constant>(SO)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
+ return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
+ }
+
+ Value *Op0 = SO, *Op1 = ConstOperand;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ auto *BO = cast<BinaryOperator>(&I);
+ Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
+ SO->getName() + ".op");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(BO);
+ return RI;
+}
+
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op,
SelectInst *SI) {
- // Don't modify shared select instructions.
- if (!SI->hasOneUse())
- return nullptr;
-
- Value *TV = SI->getTrueValue();
- Value *FV = SI->getFalseValue();
- if (!(isa<Constant>(TV) || isa<Constant>(FV)))
- return nullptr;
-
- // Bool selects with constant operands can be folded to logical ops.
- if (SI->getType()->isIntOrIntVectorTy(1))
- return nullptr;
-
- // If it's a bitcast involving vectors, make sure it has the same number of
- // elements on both sides.
- if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
- VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
- VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
-
- // Verify that either both or neither are vectors.
- if ((SrcTy == nullptr) != (DestTy == nullptr))
- return nullptr;
-
- // If vectors, verify that they have the same number of elements.
+ // Don't modify shared select instructions.
+ if (!SI->hasOneUse())
+ return nullptr;
+
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ if (!(isa<Constant>(TV) || isa<Constant>(FV)))
+ return nullptr;
+
+ // Bool selects with constant operands can be folded to logical ops.
+ if (SI->getType()->isIntOrIntVectorTy(1))
+ return nullptr;
+
+ // If it's a bitcast involving vectors, make sure it has the same number of
+ // elements on both sides.
+ if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
+ VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
+ VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+
+ // Verify that either both or neither are vectors.
+ if ((SrcTy == nullptr) != (DestTy == nullptr))
+ return nullptr;
+
+ // If vectors, verify that they have the same number of elements.
if (SrcTy && SrcTy->getElementCount() != DestTy->getElementCount())
- return nullptr;
- }
-
- // Test if a CmpInst instruction is used exclusively by a select as
- // part of a minimum or maximum operation. If so, refrain from doing
- // any other folding. This helps out other analyses which understand
- // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
- // and CodeGen. And in this case, at least one of the comparison
- // operands has at least one user besides the compare (the select),
- // which would often largely negate the benefit of folding anyway.
- if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
- if (CI->hasOneUse()) {
- Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
-
- // FIXME: This is a hack to avoid infinite looping with min/max patterns.
- // We have to ensure that vector constants that only differ with
- // undef elements are treated as equivalent.
- auto areLooselyEqual = [](Value *A, Value *B) {
- if (A == B)
- return true;
-
- // Test for vector constants.
- Constant *ConstA, *ConstB;
- if (!match(A, m_Constant(ConstA)) || !match(B, m_Constant(ConstB)))
- return false;
-
- // TODO: Deal with FP constants?
- if (!A->getType()->isIntOrIntVectorTy() || A->getType() != B->getType())
- return false;
-
- // Compare for equality including undefs as equal.
- auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
- const APInt *C;
- return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
- };
-
- if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
- (areLooselyEqual(FV, Op0) && areLooselyEqual(TV, Op1)))
- return nullptr;
- }
- }
-
- Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
- Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
- return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
-}
-
-static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
- InstCombiner::BuilderTy &Builder) {
- bool ConstIsRHS = isa<Constant>(I->getOperand(1));
- Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
-
- if (auto *InC = dyn_cast<Constant>(InV)) {
- if (ConstIsRHS)
- return ConstantExpr::get(I->getOpcode(), InC, C);
- return ConstantExpr::get(I->getOpcode(), C, InC);
- }
-
- Value *Op0 = InV, *Op1 = C;
- if (!ConstIsRHS)
- std::swap(Op0, Op1);
-
- Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phi.bo");
- auto *FPInst = dyn_cast<Instruction>(RI);
- if (FPInst && isa<FPMathOperator>(FPInst))
- FPInst->copyFastMathFlags(I);
- return RI;
-}
-
+ return nullptr;
+ }
+
+ // Test if a CmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
+ if (CI->hasOneUse()) {
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+
+ // FIXME: This is a hack to avoid infinite looping with min/max patterns.
+      // We have to ensure that vector constants that differ only in
+      // undef elements are treated as equivalent.
+ auto areLooselyEqual = [](Value *A, Value *B) {
+ if (A == B)
+ return true;
+
+ // Test for vector constants.
+ Constant *ConstA, *ConstB;
+ if (!match(A, m_Constant(ConstA)) || !match(B, m_Constant(ConstB)))
+ return false;
+
+ // TODO: Deal with FP constants?
+ if (!A->getType()->isIntOrIntVectorTy() || A->getType() != B->getType())
+ return false;
+
+ // Compare for equality including undefs as equal.
+ auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
+ const APInt *C;
+ return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
+ };
+
+ if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
+ (areLooselyEqual(FV, Op0) && areLooselyEqual(TV, Op1)))
+ return nullptr;
+ }
+ }
+
+ Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
+ Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
+ return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
+}
+
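// A standalone sketch (plain C++, not LLVM code) of the payoff FoldOpIntoSelect
// is after: when a select arm is a constant, the folded arm becomes a constant
// too. Names and sample values are made up for illustration only.
constexpr unsigned mulAfterSel(bool Cond, unsigned X) {
  return (Cond ? 7u : X) * 3u;          // op applied to the whole select
}
constexpr unsigned mulIntoSel(bool Cond, unsigned X) {
  return Cond ? 21u : X * 3u;           // constant arm folds to 7 * 3 == 21
}
static_assert(mulAfterSel(true, 11u) == mulIntoSel(true, 11u),
              "constant arm: both forms give 21");
static_assert(mulAfterSel(false, 11u) == mulIntoSel(false, 11u),
              "non-constant arm: both forms give X * 3");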
+static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
+ InstCombiner::BuilderTy &Builder) {
+ bool ConstIsRHS = isa<Constant>(I->getOperand(1));
+ Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
+
+ if (auto *InC = dyn_cast<Constant>(InV)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I->getOpcode(), InC, C);
+ return ConstantExpr::get(I->getOpcode(), C, InC);
+ }
+
+ Value *Op0 = InV, *Op1 = C;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phi.bo");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(I);
+ return RI;
+}
+
Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
- unsigned NumPHIValues = PN->getNumIncomingValues();
- if (NumPHIValues == 0)
- return nullptr;
-
- // We normally only transform phis with a single use. However, if a PHI has
- // multiple uses and they are all the same operation, we can fold *all* of the
- // uses into the PHI.
- if (!PN->hasOneUse()) {
- // Walk the use list for the instruction, comparing them to I.
- for (User *U : PN->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (UI != &I && !I.isIdenticalTo(UI))
- return nullptr;
- }
- // Otherwise, we can replace *all* users with the new PHI we form.
- }
-
- // Check to see if all of the operands of the PHI are simple constants
- // (constantint/constantfp/undef). If there is one non-constant value,
- // remember the BB it is in. If there is more than one or if *it* is a PHI,
- // bail out. We don't do arbitrary constant expressions here because moving
- // their computation can be expensive without a cost model.
- BasicBlock *NonConstBB = nullptr;
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InVal = PN->getIncomingValue(i);
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ if (NumPHIValues == 0)
+ return nullptr;
+
+ // We normally only transform phis with a single use. However, if a PHI has
+ // multiple uses and they are all the same operation, we can fold *all* of the
+ // uses into the PHI.
+ if (!PN->hasOneUse()) {
+ // Walk the use list for the instruction, comparing them to I.
+ for (User *U : PN->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI != &I && !I.isIdenticalTo(UI))
+ return nullptr;
+ }
+ // Otherwise, we can replace *all* users with the new PHI we form.
+ }
+
+ // Check to see if all of the operands of the PHI are simple constants
+ // (constantint/constantfp/undef). If there is one non-constant value,
+ // remember the BB it is in. If there is more than one or if *it* is a PHI,
+ // bail out. We don't do arbitrary constant expressions here because moving
+ // their computation can be expensive without a cost model.
+ BasicBlock *NonConstBB = nullptr;
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
// If I is a freeze instruction, count undef as a non-constant.
if (match(InVal, m_ImmConstant()) &&
(!isa<FreezeInst>(I) || isGuaranteedNotToBeUndefOrPoison(InVal)))
- continue;
-
- if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
- if (NonConstBB) return nullptr; // More than one non-const value.
-
- NonConstBB = PN->getIncomingBlock(i);
-
- // If the InVal is an invoke at the end of the pred block, then we can't
- // insert a computation after it without breaking the edge.
- if (isa<InvokeInst>(InVal))
- if (cast<Instruction>(InVal)->getParent() == NonConstBB)
- return nullptr;
-
- // If the incoming non-constant value is in I's block, we will remove one
- // instruction, but insert another equivalent one, leading to infinite
- // instcombine.
- if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
- return nullptr;
- }
-
- // If there is exactly one non-constant value, we can insert a copy of the
- // operation in that block. However, if this is a critical edge, we would be
- // inserting the computation on some other paths (e.g. inside a loop). Only
- // do this if the pred block is unconditionally branching into the phi block.
+ continue;
+
+ if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
+ if (NonConstBB) return nullptr; // More than one non-const value.
+
+ NonConstBB = PN->getIncomingBlock(i);
+
+ // If the InVal is an invoke at the end of the pred block, then we can't
+ // insert a computation after it without breaking the edge.
+ if (isa<InvokeInst>(InVal))
+ if (cast<Instruction>(InVal)->getParent() == NonConstBB)
+ return nullptr;
+
+ // If the incoming non-constant value is in I's block, we will remove one
+ // instruction, but insert another equivalent one, leading to infinite
+ // instcombine.
+ if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
+ return nullptr;
+ }
+
+ // If there is exactly one non-constant value, we can insert a copy of the
+ // operation in that block. However, if this is a critical edge, we would be
+ // inserting the computation on some other paths (e.g. inside a loop). Only
+ // do this if the pred block is unconditionally branching into the phi block.
// Also, make sure that the pred block is not dead code.
- if (NonConstBB != nullptr) {
- BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+ if (NonConstBB != nullptr) {
+ BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
if (!BI || !BI->isUnconditional() || !DT.isReachableFromEntry(NonConstBB))
return nullptr;
- }
-
- // Okay, we can do the transformation: create the new PHI node.
- PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
- InsertNewInstBefore(NewPN, *PN);
- NewPN->takeName(PN);
-
- // If we are going to have to insert a new computation, do so right before the
- // predecessor's terminator.
- if (NonConstBB)
- Builder.SetInsertPoint(NonConstBB->getTerminator());
-
- // Next, add all of the operands to the PHI.
- if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
- // We only currently try to fold the condition of a select when it is a phi,
- // not the true/false values.
- Value *TrueV = SI->getTrueValue();
- Value *FalseV = SI->getFalseValue();
- BasicBlock *PhiTransBB = PN->getParent();
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- BasicBlock *ThisBB = PN->getIncomingBlock(i);
- Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
- Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
- Value *InV = nullptr;
- // Beware of ConstantExpr: it may eventually evaluate to getNullValue,
- // even if currently isNullValue gives false.
- Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
- // For vector constants, we cannot use isNullValue to fold into
- // FalseVInPred versus TrueVInPred. When we have individual nonzero
- // elements in the vector, we will incorrectly fold InC to
- // `TrueVInPred`.
+ }
+
+ // Okay, we can do the transformation: create the new PHI node.
+ PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
+ InsertNewInstBefore(NewPN, *PN);
+ NewPN->takeName(PN);
+
+ // If we are going to have to insert a new computation, do so right before the
+ // predecessor's terminator.
+ if (NonConstBB)
+ Builder.SetInsertPoint(NonConstBB->getTerminator());
+
+ // Next, add all of the operands to the PHI.
+ if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
+ // We only currently try to fold the condition of a select when it is a phi,
+ // not the true/false values.
+ Value *TrueV = SI->getTrueValue();
+ Value *FalseV = SI->getFalseValue();
+ BasicBlock *PhiTransBB = PN->getParent();
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ BasicBlock *ThisBB = PN->getIncomingBlock(i);
+ Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
+ Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
+ Value *InV = nullptr;
+ // Beware of ConstantExpr: it may eventually evaluate to getNullValue,
+ // even if currently isNullValue gives false.
+ Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
+ // For vector constants, we cannot use isNullValue to fold into
+ // FalseVInPred versus TrueVInPred. When we have individual nonzero
+ // elements in the vector, we will incorrectly fold InC to
+ // `TrueVInPred`.
if (InC && isa<ConstantInt>(InC))
- InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
- else {
- // Generate the select in the same block as PN's current incoming block.
- // Note: ThisBB need not be the NonConstBB because vector constants
- // which are constants by definition are handled here.
- // FIXME: This can lead to an increase in IR generation because we might
- // generate selects for vector constant phi operand, that could not be
- // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For
- // non-vector phis, this transformation was always profitable because
- // the select would be generated exactly once in the NonConstBB.
- Builder.SetInsertPoint(ThisBB->getTerminator());
- InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
- FalseVInPred, "phi.sel");
- }
- NewPN->addIncoming(InV, ThisBB);
- }
- } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
- Constant *C = cast<Constant>(I.getOperand(1));
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV = nullptr;
- if (auto *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
- InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
- else
- InV = Builder.CreateCmp(CI->getPredicate(), PN->getIncomingValue(i),
- C, "phi.cmp");
- NewPN->addIncoming(InV, PN->getIncomingBlock(i));
- }
- } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
- Builder);
- NewPN->addIncoming(InV, PN->getIncomingBlock(i));
- }
+ InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
+ else {
+ // Generate the select in the same block as PN's current incoming block.
+ // Note: ThisBB need not be the NonConstBB because vector constants
+ // which are constants by definition are handled here.
+ // FIXME: This can lead to an increase in IR generation because we might
+        // generate selects for vector constant phi operands, which could not be
+ // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For
+ // non-vector phis, this transformation was always profitable because
+ // the select would be generated exactly once in the NonConstBB.
+ Builder.SetInsertPoint(ThisBB->getTerminator());
+ InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
+ FalseVInPred, "phi.sel");
+ }
+ NewPN->addIncoming(InV, ThisBB);
+ }
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
+ Constant *C = cast<Constant>(I.getOperand(1));
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV = nullptr;
+ if (auto *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+ InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+ else
+ InV = Builder.CreateCmp(CI->getPredicate(), PN->getIncomingValue(i),
+ C, "phi.cmp");
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
+ Builder);
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
} else if (isa<FreezeInst>(&I)) {
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV;
@@ -1185,1493 +1185,1493 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
InV = PN->getIncomingValue(i);
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
- } else {
- CastInst *CI = cast<CastInst>(&I);
- Type *RetTy = CI->getType();
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV;
- if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
- InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
- else
- InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
- I.getType(), "phi.cast");
- NewPN->addIncoming(InV, PN->getIncomingBlock(i));
- }
- }
-
+ } else {
+ CastInst *CI = cast<CastInst>(&I);
+ Type *RetTy = CI->getType();
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV;
+ if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+ InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+ else
+ InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
+ I.getType(), "phi.cast");
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ }
+
for (User *U : make_early_inc_range(PN->users())) {
Instruction *User = cast<Instruction>(U);
- if (User == &I) continue;
- replaceInstUsesWith(*User, NewPN);
- eraseInstFromFunction(*User);
- }
- return replaceInstUsesWith(I, NewPN);
-}
-
+ if (User == &I) continue;
+ replaceInstUsesWith(*User, NewPN);
+ eraseInstFromFunction(*User);
+ }
+ return replaceInstUsesWith(I, NewPN);
+}
+
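// A standalone sketch (plain C++, not LLVM code) of folding an operation into a
// phi: the op is recomputed per incoming value, so constant incoming values
// fold away. The two "predecessors" are emulated with a boolean; names and
// sample values are made up for illustration only.
constexpr int addAfterPhi(bool FromLeft, int A) {
  return (FromLeft ? A : 5) + 2;        // add applied after the merge point
}
constexpr int addIntoPhi(bool FromLeft, int A) {
  return FromLeft ? A + 2 : 7;          // add folded into each incoming value
}
static_assert(addAfterPhi(true, 40) == addIntoPhi(true, 40),
              "non-constant incoming value: both forms give A + 2");
static_assert(addAfterPhi(false, 40) == addIntoPhi(false, 40),
              "constant incoming value: 5 + 2 folds to 7 in the phi");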
Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
- if (!isa<Constant>(I.getOperand(1)))
- return nullptr;
-
- if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
- if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
- return NewSel;
- } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
- if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
- return NewPhi;
- }
- return nullptr;
-}
-
-/// Given a pointer type and a constant offset, determine whether or not there
-/// is a sequence of GEP indices into the pointed type that will land us at the
-/// specified offset. If so, fill them into NewIndices and return the resultant
-/// element type, otherwise return null.
+ if (!isa<Constant>(I.getOperand(1)))
+ return nullptr;
+
+ if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
+ if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
+ return NewSel;
+ } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
+ if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
+ return NewPhi;
+ }
+ return nullptr;
+}
+
+/// Given a pointer type and a constant offset, determine whether or not there
+/// is a sequence of GEP indices into the pointed type that will land us at the
+/// specified offset. If so, fill them into NewIndices and return the resultant
+/// element type, otherwise return null.
Type *
InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices) {
- Type *Ty = PtrTy->getElementType();
- if (!Ty->isSized())
- return nullptr;
-
- // Start with the index over the outer type. Note that the type size
- // might be zero (even if the offset isn't zero) if the indexed type
- // is something like [0 x {int, int}]
- Type *IndexTy = DL.getIndexType(PtrTy);
- int64_t FirstIdx = 0;
- if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
- FirstIdx = Offset/TySize;
- Offset -= FirstIdx*TySize;
-
- // Handle hosts where % returns negative instead of values [0..TySize).
- if (Offset < 0) {
- --FirstIdx;
- Offset += TySize;
- assert(Offset >= 0);
- }
- assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
- }
-
- NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
-
- // Index into the types. If we fail, set OrigBase to null.
- while (Offset) {
- // Indexing into tail padding between struct/array elements.
- if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
- return nullptr;
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- assert(Offset < (int64_t)SL->getSizeInBytes() &&
- "Offset must stay within the indexed type");
-
- unsigned Elt = SL->getElementContainingOffset(Offset);
- NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
- Elt));
-
- Offset -= SL->getElementOffset(Elt);
- Ty = STy->getElementType(Elt);
- } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
- uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
- assert(EltSize && "Cannot index into a zero-sized array");
- NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
- Offset %= EltSize;
- Ty = AT->getElementType();
- } else {
- // Otherwise, we can't index into the middle of this atomic type, bail.
- return nullptr;
- }
- }
-
- return Ty;
-}
-
-static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
- // If this GEP has only 0 indices, it is the same pointer as
- // Src. If Src is not a trivial GEP too, don't combine
- // the indices.
- if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
- !Src.hasOneUse())
- return false;
- return true;
-}
-
-/// Return a value X such that Val = X * Scale, or null if none.
-/// If the multiplication is known not to overflow, then NoSignedWrap is set.
+ Type *Ty = PtrTy->getElementType();
+ if (!Ty->isSized())
+ return nullptr;
+
+ // Start with the index over the outer type. Note that the type size
+ // might be zero (even if the offset isn't zero) if the indexed type
+ // is something like [0 x {int, int}]
+ Type *IndexTy = DL.getIndexType(PtrTy);
+ int64_t FirstIdx = 0;
+ if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
+ FirstIdx = Offset/TySize;
+ Offset -= FirstIdx*TySize;
+
+ // Handle hosts where % returns negative instead of values [0..TySize).
+ if (Offset < 0) {
+ --FirstIdx;
+ Offset += TySize;
+ assert(Offset >= 0);
+ }
+ assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+ }
+
+ NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
+
+ // Index into the types. If we fail, set OrigBase to null.
+ while (Offset) {
+ // Indexing into tail padding between struct/array elements.
+ if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
+ return nullptr;
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ assert(Offset < (int64_t)SL->getSizeInBytes() &&
+ "Offset must stay within the indexed type");
+
+ unsigned Elt = SL->getElementContainingOffset(Offset);
+ NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
+ Elt));
+
+ Offset -= SL->getElementOffset(Elt);
+ Ty = STy->getElementType(Elt);
+ } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
+ assert(EltSize && "Cannot index into a zero-sized array");
+ NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
+ Offset %= EltSize;
+ Ty = AT->getElementType();
+ } else {
+ // Otherwise, we can't index into the middle of this atomic type, bail.
+ return nullptr;
+ }
+ }
+
+ return Ty;
+}
+
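// A standalone sketch (plain C++, not LLVM code) of the index walk performed by
// FindElementAtOffset: byte offset 12 into the struct below corresponds to the
// GEP-style index list {0, 1, 2} (outer index, field Tail, element 2). The
// struct is made up for illustration and assumed to have no padding, which the
// checks themselves verify.
#include <cstddef>
#include <cstdint>

struct ExampleTy {
  int32_t Head;      // bytes [0, 4)
  int32_t Tail[4];   // bytes [4, 20), one element every 4 bytes
};

static_assert(sizeof(ExampleTy) == 20, "no padding assumed");
static_assert(12 / sizeof(ExampleTy) == 0, "outer index (FirstIdx) is 0");
static_assert(offsetof(ExampleTy, Tail) == 4, "offset 12 falls inside Tail");
static_assert(offsetof(ExampleTy, Tail) + 2 * sizeof(int32_t) == 12,
              "remaining 8 bytes select Tail[2]");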
+static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
+ // If this GEP has only 0 indices, it is the same pointer as
+ // Src. If Src is not a trivial GEP too, don't combine
+ // the indices.
+ if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
+ !Src.hasOneUse())
+ return false;
+ return true;
+}
+
+/// Return a value X such that Val = X * Scale, or null if none.
+/// If the multiplication is known not to overflow, then NoSignedWrap is set.
Value *InstCombinerImpl::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
- assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
- assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
- Scale.getBitWidth() && "Scale not compatible with value!");
-
- // If Val is zero or Scale is one then Val = Val * Scale.
- if (match(Val, m_Zero()) || Scale == 1) {
- NoSignedWrap = true;
- return Val;
- }
-
- // If Scale is zero then it does not divide Val.
- if (Scale.isMinValue())
- return nullptr;
-
- // Look through chains of multiplications, searching for a constant that is
- // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
- // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
- // a factor of 4 will produce X*(Y*2). The principle of operation is to bore
- // down from Val:
- //
- // Val = M1 * X || Analysis starts here and works down
- // M1 = M2 * Y || Doesn't descend into terms with more
- // M2 = Z * 4 \/ than one use
- //
- // Then to modify a term at the bottom:
- //
- // Val = M1 * X
- // M1 = Z * Y || Replaced M2 with Z
- //
- // Then to work back up correcting nsw flags.
-
- // Op - the term we are currently analyzing. Starts at Val then drills down.
- // Replaced with its descaled value before exiting from the drill down loop.
- Value *Op = Val;
-
- // Parent - initially null, but after drilling down notes where Op came from.
- // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
- // 0'th operand of Val.
- std::pair<Instruction *, unsigned> Parent;
-
- // Set if the transform requires a descaling at deeper levels that doesn't
- // overflow.
- bool RequireNoSignedWrap = false;
-
- // Log base 2 of the scale. Negative if not a power of 2.
- int32_t logScale = Scale.exactLogBase2();
-
- for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
- // If Op is a constant divisible by Scale then descale to the quotient.
- APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
- APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
- if (!Remainder.isMinValue())
- // Not divisible by Scale.
- return nullptr;
- // Replace with the quotient in the parent.
- Op = ConstantInt::get(CI->getType(), Quotient);
- NoSignedWrap = true;
- break;
- }
-
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
- if (BO->getOpcode() == Instruction::Mul) {
- // Multiplication.
- NoSignedWrap = BO->hasNoSignedWrap();
- if (RequireNoSignedWrap && !NoSignedWrap)
- return nullptr;
-
- // There are three cases for multiplication: multiplication by exactly
- // the scale, multiplication by a constant different to the scale, and
- // multiplication by something else.
- Value *LHS = BO->getOperand(0);
- Value *RHS = BO->getOperand(1);
-
- if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
- // Multiplication by a constant.
- if (CI->getValue() == Scale) {
- // Multiplication by exactly the scale, replace the multiplication
- // by its left-hand side in the parent.
- Op = LHS;
- break;
- }
-
- // Otherwise drill down into the constant.
- if (!Op->hasOneUse())
- return nullptr;
-
- Parent = std::make_pair(BO, 1);
- continue;
- }
-
- // Multiplication by something else. Drill down into the left-hand side
- // since that's where the reassociate pass puts the good stuff.
- if (!Op->hasOneUse())
- return nullptr;
-
- Parent = std::make_pair(BO, 0);
- continue;
- }
-
- if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
- isa<ConstantInt>(BO->getOperand(1))) {
- // Multiplication by a power of 2.
- NoSignedWrap = BO->hasNoSignedWrap();
- if (RequireNoSignedWrap && !NoSignedWrap)
- return nullptr;
-
- Value *LHS = BO->getOperand(0);
- int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
- getLimitedValue(Scale.getBitWidth());
- // Op = LHS << Amt.
-
- if (Amt == logScale) {
- // Multiplication by exactly the scale, replace the multiplication
- // by its left-hand side in the parent.
- Op = LHS;
- break;
- }
- if (Amt < logScale || !Op->hasOneUse())
- return nullptr;
-
- // Multiplication by more than the scale. Reduce the multiplying amount
- // by the scale in the parent.
- Parent = std::make_pair(BO, 1);
- Op = ConstantInt::get(BO->getType(), Amt - logScale);
- break;
- }
- }
-
- if (!Op->hasOneUse())
- return nullptr;
-
- if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
- if (Cast->getOpcode() == Instruction::SExt) {
- // Op is sign-extended from a smaller type, descale in the smaller type.
- unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
- APInt SmallScale = Scale.trunc(SmallSize);
- // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
- // descale Op as (sext Y) * Scale. In order to have
- // sext (Y * SmallScale) = (sext Y) * Scale
- // some conditions need to hold however: SmallScale must sign-extend to
- // Scale and the multiplication Y * SmallScale should not overflow.
- if (SmallScale.sext(Scale.getBitWidth()) != Scale)
- // SmallScale does not sign-extend to Scale.
- return nullptr;
- assert(SmallScale.exactLogBase2() == logScale);
- // Require that Y * SmallScale must not overflow.
- RequireNoSignedWrap = true;
-
- // Drill down through the cast.
- Parent = std::make_pair(Cast, 0);
- Scale = SmallScale;
- continue;
- }
-
- if (Cast->getOpcode() == Instruction::Trunc) {
- // Op is truncated from a larger type, descale in the larger type.
- // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
- // trunc (Y * sext Scale) = (trunc Y) * Scale
- // always holds. However (trunc Y) * Scale may overflow even if
- // trunc (Y * sext Scale) does not, so nsw flags need to be cleared
- // from this point up in the expression (see later).
- if (RequireNoSignedWrap)
- return nullptr;
-
- // Drill down through the cast.
- unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
- Parent = std::make_pair(Cast, 0);
- Scale = Scale.sext(LargeSize);
- if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
- logScale = -1;
- assert(Scale.exactLogBase2() == logScale);
- continue;
- }
- }
-
- // Unsupported expression, bail out.
- return nullptr;
- }
-
- // If Op is zero then Val = Op * Scale.
- if (match(Op, m_Zero())) {
- NoSignedWrap = true;
- return Op;
- }
-
- // We know that we can successfully descale, so from here on we can safely
- // modify the IR. Op holds the descaled version of the deepest term in the
- // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
- // not to overflow.
-
- if (!Parent.first)
- // The expression only had one term.
- return Op;
-
- // Rewrite the parent using the descaled version of its operand.
- assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
- assert(Op != Parent.first->getOperand(Parent.second) &&
- "Descaling was a no-op?");
- replaceOperand(*Parent.first, Parent.second, Op);
- Worklist.push(Parent.first);
-
- // Now work back up the expression correcting nsw flags. The logic is based
- // on the following observation: if X * Y is known not to overflow as a signed
- // multiplication, and Y is replaced by a value Z with smaller absolute value,
- // then X * Z will not overflow as a signed multiplication either. As we work
- // our way up, having NoSignedWrap 'true' means that the descaled value at the
- // current level has strictly smaller absolute value than the original.
- Instruction *Ancestor = Parent.first;
- do {
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
- // If the multiplication wasn't nsw then we can't say anything about the
- // value of the descaled multiplication, and we have to clear nsw flags
- // from this point on up.
- bool OpNoSignedWrap = BO->hasNoSignedWrap();
- NoSignedWrap &= OpNoSignedWrap;
- if (NoSignedWrap != OpNoSignedWrap) {
- BO->setHasNoSignedWrap(NoSignedWrap);
- Worklist.push(Ancestor);
- }
- } else if (Ancestor->getOpcode() == Instruction::Trunc) {
- // The fact that the descaled input to the trunc has smaller absolute
- // value than the original input doesn't tell us anything useful about
- // the absolute values of the truncations.
- NoSignedWrap = false;
- }
- assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
- "Failed to keep proper track of nsw flags while drilling down?");
-
- if (Ancestor == Val)
- // Got to the top, all done!
- return Val;
-
- // Move up one level in the expression.
- assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
- Ancestor = Ancestor->user_back();
- } while (true);
-}
-
+ assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
+ assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
+ Scale.getBitWidth() && "Scale not compatible with value!");
+
+ // If Val is zero or Scale is one then Val = Val * Scale.
+ if (match(Val, m_Zero()) || Scale == 1) {
+ NoSignedWrap = true;
+ return Val;
+ }
+
+ // If Scale is zero then it does not divide Val.
+ if (Scale.isMinValue())
+ return nullptr;
+
+ // Look through chains of multiplications, searching for a constant that is
+ // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
+ // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
+ // a factor of 4 will produce X*(Y*2). The principle of operation is to bore
+ // down from Val:
+ //
+ // Val = M1 * X || Analysis starts here and works down
+ // M1 = M2 * Y || Doesn't descend into terms with more
+ // M2 = Z * 4 \/ than one use
+ //
+ // Then to modify a term at the bottom:
+ //
+ // Val = M1 * X
+ // M1 = Z * Y || Replaced M2 with Z
+ //
+ // Then to work back up correcting nsw flags.
+
+ // Op - the term we are currently analyzing. Starts at Val then drills down.
+ // Replaced with its descaled value before exiting from the drill down loop.
+ Value *Op = Val;
+
+ // Parent - initially null, but after drilling down notes where Op came from.
+ // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
+ // 0'th operand of Val.
+ std::pair<Instruction *, unsigned> Parent;
+
+ // Set if the transform requires a descaling at deeper levels that doesn't
+ // overflow.
+ bool RequireNoSignedWrap = false;
+
+ // Log base 2 of the scale. Negative if not a power of 2.
+ int32_t logScale = Scale.exactLogBase2();
+
+ for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // If Op is a constant divisible by Scale then descale to the quotient.
+ APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
+ APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
+ if (!Remainder.isMinValue())
+ // Not divisible by Scale.
+ return nullptr;
+ // Replace with the quotient in the parent.
+ Op = ConstantInt::get(CI->getType(), Quotient);
+ NoSignedWrap = true;
+ break;
+ }
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
+ if (BO->getOpcode() == Instruction::Mul) {
+ // Multiplication.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return nullptr;
+
+ // There are three cases for multiplication: multiplication by exactly
+ // the scale, multiplication by a constant different to the scale, and
+ // multiplication by something else.
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ // Multiplication by a constant.
+ if (CI->getValue() == Scale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+
+ // Otherwise drill down into the constant.
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ Parent = std::make_pair(BO, 1);
+ continue;
+ }
+
+ // Multiplication by something else. Drill down into the left-hand side
+ // since that's where the reassociate pass puts the good stuff.
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ Parent = std::make_pair(BO, 0);
+ continue;
+ }
+
+ if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(BO->getOperand(1))) {
+ // Multiplication by a power of 2.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return nullptr;
+
+ Value *LHS = BO->getOperand(0);
+ int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
+ getLimitedValue(Scale.getBitWidth());
+ // Op = LHS << Amt.
+
+ if (Amt == logScale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+ if (Amt < logScale || !Op->hasOneUse())
+ return nullptr;
+
+ // Multiplication by more than the scale. Reduce the multiplying amount
+ // by the scale in the parent.
+ Parent = std::make_pair(BO, 1);
+ Op = ConstantInt::get(BO->getType(), Amt - logScale);
+ break;
+ }
+ }
+
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getOpcode() == Instruction::SExt) {
+ // Op is sign-extended from a smaller type, descale in the smaller type.
+ unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ APInt SmallScale = Scale.trunc(SmallSize);
+ // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
+ // descale Op as (sext Y) * Scale. In order to have
+ // sext (Y * SmallScale) = (sext Y) * Scale
+ // some conditions need to hold however: SmallScale must sign-extend to
+ // Scale and the multiplication Y * SmallScale should not overflow.
+ if (SmallScale.sext(Scale.getBitWidth()) != Scale)
+ // SmallScale does not sign-extend to Scale.
+ return nullptr;
+ assert(SmallScale.exactLogBase2() == logScale);
+ // Require that Y * SmallScale must not overflow.
+ RequireNoSignedWrap = true;
+
+ // Drill down through the cast.
+ Parent = std::make_pair(Cast, 0);
+ Scale = SmallScale;
+ continue;
+ }
+
+ if (Cast->getOpcode() == Instruction::Trunc) {
+ // Op is truncated from a larger type, descale in the larger type.
+ // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
+ // trunc (Y * sext Scale) = (trunc Y) * Scale
+ // always holds. However (trunc Y) * Scale may overflow even if
+ // trunc (Y * sext Scale) does not, so nsw flags need to be cleared
+ // from this point up in the expression (see later).
+ if (RequireNoSignedWrap)
+ return nullptr;
+
+ // Drill down through the cast.
+ unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ Parent = std::make_pair(Cast, 0);
+ Scale = Scale.sext(LargeSize);
+ if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
+ logScale = -1;
+ assert(Scale.exactLogBase2() == logScale);
+ continue;
+ }
+ }
+
+ // Unsupported expression, bail out.
+ return nullptr;
+ }
+
+ // If Op is zero then Val = Op * Scale.
+ if (match(Op, m_Zero())) {
+ NoSignedWrap = true;
+ return Op;
+ }
+
+ // We know that we can successfully descale, so from here on we can safely
+ // modify the IR. Op holds the descaled version of the deepest term in the
+ // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
+ // not to overflow.
+
+ if (!Parent.first)
+ // The expression only had one term.
+ return Op;
+
+ // Rewrite the parent using the descaled version of its operand.
+ assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
+ assert(Op != Parent.first->getOperand(Parent.second) &&
+ "Descaling was a no-op?");
+ replaceOperand(*Parent.first, Parent.second, Op);
+ Worklist.push(Parent.first);
+
+ // Now work back up the expression correcting nsw flags. The logic is based
+ // on the following observation: if X * Y is known not to overflow as a signed
+ // multiplication, and Y is replaced by a value Z with smaller absolute value,
+ // then X * Z will not overflow as a signed multiplication either. As we work
+ // our way up, having NoSignedWrap 'true' means that the descaled value at the
+ // current level has strictly smaller absolute value than the original.
+ Instruction *Ancestor = Parent.first;
+ do {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
+ // If the multiplication wasn't nsw then we can't say anything about the
+ // value of the descaled multiplication, and we have to clear nsw flags
+ // from this point on up.
+ bool OpNoSignedWrap = BO->hasNoSignedWrap();
+ NoSignedWrap &= OpNoSignedWrap;
+ if (NoSignedWrap != OpNoSignedWrap) {
+ BO->setHasNoSignedWrap(NoSignedWrap);
+ Worklist.push(Ancestor);
+ }
+ } else if (Ancestor->getOpcode() == Instruction::Trunc) {
+ // The fact that the descaled input to the trunc has smaller absolute
+ // value than the original input doesn't tell us anything useful about
+ // the absolute values of the truncations.
+ NoSignedWrap = false;
+ }
+ assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
+ "Failed to keep proper track of nsw flags while drilling down?");
+
+ if (Ancestor == Val)
+ // Got to the top, all done!
+ return Val;
+
+ // Move up one level in the expression.
+ assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
+ Ancestor = Ancestor->user_back();
+ } while (true);
+}
+
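The drill-down above bottoms out either at a constant the scale divides exactly or at a multiplication/shift by the scale itself. Below is a minimal standalone C++ sketch of that constant case only; it is illustrative, not the InstCombine API, and descaleConstant is a made-up helper name: a constant term can be descaled precisely when the signed remainder is zero, and the descaled term is then the quotient.

// Standalone sketch (not LLVM code) of the constant-divisibility check.
#include <cstdint>
#include <iostream>
#include <optional>

std::optional<int64_t> descaleConstant(int64_t C, int64_t Scale) {
  if (Scale == 0)        // a zero scale divides nothing
    return std::nullopt;
  if (C % Scale != 0)    // not divisible: give up, as the pass does
    return std::nullopt;
  return C / Scale;      // divisible: the descaled term is the quotient
}

int main() {
  // Mirrors the example in the comments above: descaling X*(Y*8) by a
  // factor of 4 rewrites the inner constant 8 to 2.
  if (auto Q = descaleConstant(8, 4))
    std::cout << "descaled constant: " << *Q << '\n'; // prints 2
  return 0;
}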
Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
if (!isa<VectorType>(Inst.getType()))
- return nullptr;
-
- BinaryOperator::BinaryOps Opcode = Inst.getOpcode();
- Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
- assert(cast<VectorType>(LHS->getType())->getElementCount() ==
- cast<VectorType>(Inst.getType())->getElementCount());
- assert(cast<VectorType>(RHS->getType())->getElementCount() ==
- cast<VectorType>(Inst.getType())->getElementCount());
-
- // If both operands of the binop are vector concatenations, then perform the
- // narrow binop on each pair of the source operands followed by concatenation
- // of the results.
- Value *L0, *L1, *R0, *R1;
- ArrayRef<int> Mask;
- if (match(LHS, m_Shuffle(m_Value(L0), m_Value(L1), m_Mask(Mask))) &&
- match(RHS, m_Shuffle(m_Value(R0), m_Value(R1), m_SpecificMask(Mask))) &&
- LHS->hasOneUse() && RHS->hasOneUse() &&
- cast<ShuffleVectorInst>(LHS)->isConcat() &&
- cast<ShuffleVectorInst>(RHS)->isConcat()) {
- // This transform does not have the speculative execution constraint as
- // below because the shuffle is a concatenation. The new binops are
- // operating on exactly the same elements as the existing binop.
- // TODO: We could ease the mask requirement to allow different undef lanes,
- // but that requires an analysis of the binop-with-undef output value.
- Value *NewBO0 = Builder.CreateBinOp(Opcode, L0, R0);
- if (auto *BO = dyn_cast<BinaryOperator>(NewBO0))
- BO->copyIRFlags(&Inst);
- Value *NewBO1 = Builder.CreateBinOp(Opcode, L1, R1);
- if (auto *BO = dyn_cast<BinaryOperator>(NewBO1))
- BO->copyIRFlags(&Inst);
- return new ShuffleVectorInst(NewBO0, NewBO1, Mask);
- }
-
- // It may not be safe to reorder shuffles and things like div, urem, etc.
- // because we may trap when executing those ops on unknown vector elements.
- // See PR20059.
- if (!isSafeToSpeculativelyExecute(&Inst))
- return nullptr;
-
- auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef<int> M) {
- Value *XY = Builder.CreateBinOp(Opcode, X, Y);
- if (auto *BO = dyn_cast<BinaryOperator>(XY))
- BO->copyIRFlags(&Inst);
- return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
- };
-
- // If both arguments of the binary operation are shuffles that use the same
- // mask and shuffle within a single vector, move the shuffle after the binop.
- Value *V1, *V2;
- if (match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))) &&
- match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(Mask))) &&
- V1->getType() == V2->getType() &&
- (LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
- // Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
- return createBinOpShuffle(V1, V2, Mask);
- }
-
- // If both arguments of a commutative binop are select-shuffles that use the
- // same mask with commuted operands, the shuffles are unnecessary.
- if (Inst.isCommutative() &&
- match(LHS, m_Shuffle(m_Value(V1), m_Value(V2), m_Mask(Mask))) &&
- match(RHS,
- m_Shuffle(m_Specific(V2), m_Specific(V1), m_SpecificMask(Mask)))) {
- auto *LShuf = cast<ShuffleVectorInst>(LHS);
- auto *RShuf = cast<ShuffleVectorInst>(RHS);
- // TODO: Allow shuffles that contain undefs in the mask?
- // That is legal, but it reduces undef knowledge.
- // TODO: Allow arbitrary shuffles by shuffling after binop?
- // That might be legal, but we have to deal with poison.
- if (LShuf->isSelect() &&
- !is_contained(LShuf->getShuffleMask(), UndefMaskElem) &&
- RShuf->isSelect() &&
- !is_contained(RShuf->getShuffleMask(), UndefMaskElem)) {
- // Example:
- // LHS = shuffle V1, V2, <0, 5, 6, 3>
- // RHS = shuffle V2, V1, <0, 5, 6, 3>
- // LHS + RHS --> (V10+V20, V21+V11, V22+V12, V13+V23) --> V1 + V2
- Instruction *NewBO = BinaryOperator::Create(Opcode, V1, V2);
- NewBO->copyIRFlags(&Inst);
- return NewBO;
- }
- }
-
- // If one argument is a shuffle within one vector and the other is a constant,
- // try moving the shuffle after the binary operation. This canonicalization
- // intends to move shuffles closer to other shuffles and binops closer to
- // other binops, so they can be folded. It may also enable demanded elements
- // transforms.
- Constant *C;
+ return nullptr;
+
+ BinaryOperator::BinaryOps Opcode = Inst.getOpcode();
+ Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
+ assert(cast<VectorType>(LHS->getType())->getElementCount() ==
+ cast<VectorType>(Inst.getType())->getElementCount());
+ assert(cast<VectorType>(RHS->getType())->getElementCount() ==
+ cast<VectorType>(Inst.getType())->getElementCount());
+
+ // If both operands of the binop are vector concatenations, then perform the
+ // narrow binop on each pair of the source operands followed by concatenation
+ // of the results.
+ Value *L0, *L1, *R0, *R1;
+ ArrayRef<int> Mask;
+ if (match(LHS, m_Shuffle(m_Value(L0), m_Value(L1), m_Mask(Mask))) &&
+ match(RHS, m_Shuffle(m_Value(R0), m_Value(R1), m_SpecificMask(Mask))) &&
+ LHS->hasOneUse() && RHS->hasOneUse() &&
+ cast<ShuffleVectorInst>(LHS)->isConcat() &&
+ cast<ShuffleVectorInst>(RHS)->isConcat()) {
+ // This transform does not have the speculative execution constraint as
+ // below because the shuffle is a concatenation. The new binops are
+ // operating on exactly the same elements as the existing binop.
+ // TODO: We could ease the mask requirement to allow different undef lanes,
+ // but that requires an analysis of the binop-with-undef output value.
+ Value *NewBO0 = Builder.CreateBinOp(Opcode, L0, R0);
+ if (auto *BO = dyn_cast<BinaryOperator>(NewBO0))
+ BO->copyIRFlags(&Inst);
+ Value *NewBO1 = Builder.CreateBinOp(Opcode, L1, R1);
+ if (auto *BO = dyn_cast<BinaryOperator>(NewBO1))
+ BO->copyIRFlags(&Inst);
+ return new ShuffleVectorInst(NewBO0, NewBO1, Mask);
+ }
+
+ // It may not be safe to reorder shuffles and things like div, urem, etc.
+ // because we may trap when executing those ops on unknown vector elements.
+ // See PR20059.
+ if (!isSafeToSpeculativelyExecute(&Inst))
+ return nullptr;
+
+ auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef<int> M) {
+ Value *XY = Builder.CreateBinOp(Opcode, X, Y);
+ if (auto *BO = dyn_cast<BinaryOperator>(XY))
+ BO->copyIRFlags(&Inst);
+ return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
+ };
+
+ // If both arguments of the binary operation are shuffles that use the same
+ // mask and shuffle within a single vector, move the shuffle after the binop.
+ Value *V1, *V2;
+ if (match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))) &&
+ match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(Mask))) &&
+ V1->getType() == V2->getType() &&
+ (LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
+ // Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
+ return createBinOpShuffle(V1, V2, Mask);
+ }
+
+ // If both arguments of a commutative binop are select-shuffles that use the
+ // same mask with commuted operands, the shuffles are unnecessary.
+ if (Inst.isCommutative() &&
+ match(LHS, m_Shuffle(m_Value(V1), m_Value(V2), m_Mask(Mask))) &&
+ match(RHS,
+ m_Shuffle(m_Specific(V2), m_Specific(V1), m_SpecificMask(Mask)))) {
+ auto *LShuf = cast<ShuffleVectorInst>(LHS);
+ auto *RShuf = cast<ShuffleVectorInst>(RHS);
+ // TODO: Allow shuffles that contain undefs in the mask?
+ // That is legal, but it reduces undef knowledge.
+ // TODO: Allow arbitrary shuffles by shuffling after binop?
+ // That might be legal, but we have to deal with poison.
+ if (LShuf->isSelect() &&
+ !is_contained(LShuf->getShuffleMask(), UndefMaskElem) &&
+ RShuf->isSelect() &&
+ !is_contained(RShuf->getShuffleMask(), UndefMaskElem)) {
+ // Example:
+ // LHS = shuffle V1, V2, <0, 5, 6, 3>
+ // RHS = shuffle V2, V1, <0, 5, 6, 3>
+ // LHS + RHS --> (V10+V20, V21+V11, V22+V12, V13+V23) --> V1 + V2
+ Instruction *NewBO = BinaryOperator::Create(Opcode, V1, V2);
+ NewBO->copyIRFlags(&Inst);
+ return NewBO;
+ }
+ }
+
+ // If one argument is a shuffle within one vector and the other is a constant,
+ // try moving the shuffle after the binary operation. This canonicalization
+ // intends to move shuffles closer to other shuffles and binops closer to
+ // other binops, so they can be folded. It may also enable demanded elements
+ // transforms.
+ Constant *C;
auto *InstVTy = dyn_cast<FixedVectorType>(Inst.getType());
if (InstVTy &&
match(&Inst,
- m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))),
+ m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))),
m_ImmConstant(C))) &&
cast<FixedVectorType>(V1->getType())->getNumElements() <=
InstVTy->getNumElements()) {
assert(InstVTy->getScalarType() == V1->getType()->getScalarType() &&
- "Shuffle should not change scalar type");
-
- // Find constant NewC that has property:
- // shuffle(NewC, ShMask) = C
-    // If such a constant does not exist (example: ShMask=<0,0> and C=<1,2>),
-    // the reorder is not possible. A 1-to-1 mapping is not required. Example:
- // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
- bool ConstOp1 = isa<Constant>(RHS);
- ArrayRef<int> ShMask = Mask;
- unsigned SrcVecNumElts =
- cast<FixedVectorType>(V1->getType())->getNumElements();
- UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
- SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
- bool MayChange = true;
+ "Shuffle should not change scalar type");
+
+ // Find constant NewC that has property:
+ // shuffle(NewC, ShMask) = C
+    // If such a constant does not exist (example: ShMask=<0,0> and C=<1,2>),
+    // the reorder is not possible. A 1-to-1 mapping is not required. Example:
+ // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
+ bool ConstOp1 = isa<Constant>(RHS);
+ ArrayRef<int> ShMask = Mask;
+ unsigned SrcVecNumElts =
+ cast<FixedVectorType>(V1->getType())->getNumElements();
+ UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
+ SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
+ bool MayChange = true;
unsigned NumElts = InstVTy->getNumElements();
- for (unsigned I = 0; I < NumElts; ++I) {
- Constant *CElt = C->getAggregateElement(I);
- if (ShMask[I] >= 0) {
- assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle");
- Constant *NewCElt = NewVecC[ShMask[I]];
- // Bail out if:
- // 1. The constant vector contains a constant expression.
- // 2. The shuffle needs an element of the constant vector that can't
- // be mapped to a new constant vector.
- // 3. This is a widening shuffle that copies elements of V1 into the
- // extended elements (extending with undef is allowed).
- if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) ||
- I >= SrcVecNumElts) {
- MayChange = false;
- break;
- }
- NewVecC[ShMask[I]] = CElt;
- }
- // If this is a widening shuffle, we must be able to extend with undef
- // elements. If the original binop does not produce an undef in the high
- // lanes, then this transform is not safe.
- // Similarly for undef lanes due to the shuffle mask, we can only
- // transform binops that preserve undef.
- // TODO: We could shuffle those non-undef constant values into the
- // result by using a constant vector (rather than an undef vector)
- // as operand 1 of the new binop, but that might be too aggressive
- // for target-independent shuffle creation.
- if (I >= SrcVecNumElts || ShMask[I] < 0) {
- Constant *MaybeUndef =
- ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt)
- : ConstantExpr::get(Opcode, CElt, UndefScalar);
- if (!isa<UndefValue>(MaybeUndef)) {
- MayChange = false;
- break;
- }
- }
- }
- if (MayChange) {
- Constant *NewC = ConstantVector::get(NewVecC);
- // It may not be safe to execute a binop on a vector with undef elements
- // because the entire instruction can be folded to undef or create poison
- // that did not exist in the original code.
- if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
- NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
-
- // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
- // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
- Value *NewLHS = ConstOp1 ? V1 : NewC;
- Value *NewRHS = ConstOp1 ? NewC : V1;
- return createBinOpShuffle(NewLHS, NewRHS, Mask);
- }
- }
-
- // Try to reassociate to sink a splat shuffle after a binary operation.
- if (Inst.isAssociative() && Inst.isCommutative()) {
- // Canonicalize shuffle operand as LHS.
- if (isa<ShuffleVectorInst>(RHS))
- std::swap(LHS, RHS);
-
- Value *X;
- ArrayRef<int> MaskC;
- int SplatIndex;
- BinaryOperator *BO;
- if (!match(LHS,
- m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
- !match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
- X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
- BO->getOpcode() != Opcode)
- return nullptr;
-
- // FIXME: This may not be safe if the analysis allows undef elements. By
- // moving 'Y' before the splat shuffle, we are implicitly assuming
- // that it is not undef/poison at the splat index.
- Value *Y, *OtherOp;
- if (isSplatValue(BO->getOperand(0), SplatIndex)) {
- Y = BO->getOperand(0);
- OtherOp = BO->getOperand(1);
- } else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
- Y = BO->getOperand(1);
- OtherOp = BO->getOperand(0);
- } else {
- return nullptr;
- }
-
- // X and Y are splatted values, so perform the binary operation on those
- // values followed by a splat followed by the 2nd binary operation:
- // bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp
- Value *NewBO = Builder.CreateBinOp(Opcode, X, Y);
- SmallVector<int, 8> NewMask(MaskC.size(), SplatIndex);
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *CElt = C->getAggregateElement(I);
+ if (ShMask[I] >= 0) {
+ assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle");
+ Constant *NewCElt = NewVecC[ShMask[I]];
+ // Bail out if:
+ // 1. The constant vector contains a constant expression.
+ // 2. The shuffle needs an element of the constant vector that can't
+ // be mapped to a new constant vector.
+ // 3. This is a widening shuffle that copies elements of V1 into the
+ // extended elements (extending with undef is allowed).
+ if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) ||
+ I >= SrcVecNumElts) {
+ MayChange = false;
+ break;
+ }
+ NewVecC[ShMask[I]] = CElt;
+ }
+ // If this is a widening shuffle, we must be able to extend with undef
+ // elements. If the original binop does not produce an undef in the high
+ // lanes, then this transform is not safe.
+ // Similarly for undef lanes due to the shuffle mask, we can only
+ // transform binops that preserve undef.
+ // TODO: We could shuffle those non-undef constant values into the
+ // result by using a constant vector (rather than an undef vector)
+ // as operand 1 of the new binop, but that might be too aggressive
+ // for target-independent shuffle creation.
+ if (I >= SrcVecNumElts || ShMask[I] < 0) {
+ Constant *MaybeUndef =
+ ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt)
+ : ConstantExpr::get(Opcode, CElt, UndefScalar);
+ if (!isa<UndefValue>(MaybeUndef)) {
+ MayChange = false;
+ break;
+ }
+ }
+ }
+ if (MayChange) {
+ Constant *NewC = ConstantVector::get(NewVecC);
+ // It may not be safe to execute a binop on a vector with undef elements
+ // because the entire instruction can be folded to undef or create poison
+ // that did not exist in the original code.
+ if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
+ NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
+
+ // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
+ // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
+ Value *NewLHS = ConstOp1 ? V1 : NewC;
+ Value *NewRHS = ConstOp1 ? NewC : V1;
+ return createBinOpShuffle(NewLHS, NewRHS, Mask);
+ }
+ }
+
+ // Try to reassociate to sink a splat shuffle after a binary operation.
+ if (Inst.isAssociative() && Inst.isCommutative()) {
+ // Canonicalize shuffle operand as LHS.
+ if (isa<ShuffleVectorInst>(RHS))
+ std::swap(LHS, RHS);
+
+ Value *X;
+ ArrayRef<int> MaskC;
+ int SplatIndex;
+ BinaryOperator *BO;
+ if (!match(LHS,
+ m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
+ !match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
+ X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
+ BO->getOpcode() != Opcode)
+ return nullptr;
+
+ // FIXME: This may not be safe if the analysis allows undef elements. By
+ // moving 'Y' before the splat shuffle, we are implicitly assuming
+ // that it is not undef/poison at the splat index.
+ Value *Y, *OtherOp;
+ if (isSplatValue(BO->getOperand(0), SplatIndex)) {
+ Y = BO->getOperand(0);
+ OtherOp = BO->getOperand(1);
+ } else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
+ Y = BO->getOperand(1);
+ OtherOp = BO->getOperand(0);
+ } else {
+ return nullptr;
+ }
+
+ // X and Y are splatted values, so perform the binary operation on those
+ // values followed by a splat followed by the 2nd binary operation:
+ // bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp
+ Value *NewBO = Builder.CreateBinOp(Opcode, X, Y);
+ SmallVector<int, 8> NewMask(MaskC.size(), SplatIndex);
Value *NewSplat = Builder.CreateShuffleVector(NewBO, NewMask);
- Instruction *R = BinaryOperator::Create(Opcode, NewSplat, OtherOp);
-
- // Intersect FMF on both new binops. Other (poison-generating) flags are
- // dropped to be safe.
- if (isa<FPMathOperator>(R)) {
- R->copyFastMathFlags(&Inst);
- R->andIRFlags(BO);
- }
- if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
- NewInstBO->copyIRFlags(R);
- return R;
- }
-
- return nullptr;
-}
-
-/// Try to narrow the width of a binop if at least 1 operand is an extend of
-/// a value. This requires a potentially expensive known bits check to make
-/// sure the narrow op does not overflow.
+ Instruction *R = BinaryOperator::Create(Opcode, NewSplat, OtherOp);
+
+ // Intersect FMF on both new binops. Other (poison-generating) flags are
+ // dropped to be safe.
+ if (isa<FPMathOperator>(R)) {
+ R->copyFastMathFlags(&Inst);
+ R->andIRFlags(BO);
+ }
+ if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
+ NewInstBO->copyIRFlags(R);
+ return R;
+ }
+
+ return nullptr;
+}
+
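The first shuffle folds above rest on the fact that a lane permutation commutes with a lane-wise binary operation when both operands are shuffled by the same mask. The following is a small standalone C++ model of that identity using plain arrays rather than LLVM IR; all names and values are illustrative only.

// Standalone model: add(shuf(V1,M), shuf(V2,M)) == shuf(add(V1,V2), M).
#include <array>
#include <cassert>
#include <cstddef>

int main() {
  const std::array<int, 4> V1{1, 2, 3, 4}, V2{10, 20, 30, 40};
  const std::array<int, 4> Mask{3, 1, 2, 0}; // single-vector lane permutation

  std::array<int, 4> ShuffleThenAdd{}, AddThenShuffle{}, Sum{};
  for (std::size_t I = 0; I < 4; ++I) {
    ShuffleThenAdd[I] = V1[Mask[I]] + V2[Mask[I]]; // shuffle both, then add
    Sum[I] = V1[I] + V2[I];                        // add first
  }
  for (std::size_t I = 0; I < 4; ++I)
    AddThenShuffle[I] = Sum[Mask[I]];              // then shuffle the result

  assert(ShuffleThenAdd == AddThenShuffle); // the shuffle can be sunk below the binop
  return 0;
}

The extra isSafeToSpeculativelyExecute check in the pass exists because, per the PR20059 note above, the reordered op would execute on lanes the shuffle might otherwise discard, which is unsafe for potentially trapping operations such as division.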
+/// Try to narrow the width of a binop if at least 1 operand is an extend of
+/// a value. This requires a potentially expensive known bits check to make
+/// sure the narrow op does not overflow.
Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) {
- // We need at least one extended operand.
- Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);
-
- // If this is a sub, we swap the operands since we always want an extension
- // on the RHS. The LHS can be an extension or a constant.
- if (BO.getOpcode() == Instruction::Sub)
- std::swap(Op0, Op1);
-
- Value *X;
- bool IsSext = match(Op0, m_SExt(m_Value(X)));
- if (!IsSext && !match(Op0, m_ZExt(m_Value(X))))
- return nullptr;
-
- // If both operands are the same extension from the same source type and we
- // can eliminate at least one (hasOneUse), this might work.
- CastInst::CastOps CastOpc = IsSext ? Instruction::SExt : Instruction::ZExt;
- Value *Y;
- if (!(match(Op1, m_ZExtOrSExt(m_Value(Y))) && X->getType() == Y->getType() &&
- cast<Operator>(Op1)->getOpcode() == CastOpc &&
- (Op0->hasOneUse() || Op1->hasOneUse()))) {
- // If that did not match, see if we have a suitable constant operand.
- // Truncating and extending must produce the same constant.
- Constant *WideC;
- if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
- return nullptr;
- Constant *NarrowC = ConstantExpr::getTrunc(WideC, X->getType());
- if (ConstantExpr::getCast(CastOpc, NarrowC, BO.getType()) != WideC)
- return nullptr;
- Y = NarrowC;
- }
-
- // Swap back now that we found our operands.
- if (BO.getOpcode() == Instruction::Sub)
- std::swap(X, Y);
-
- // Both operands have narrow versions. Last step: the math must not overflow
- // in the narrow width.
- if (!willNotOverflow(BO.getOpcode(), X, Y, BO, IsSext))
- return nullptr;
-
- // bo (ext X), (ext Y) --> ext (bo X, Y)
- // bo (ext X), C --> ext (bo X, C')
- Value *NarrowBO = Builder.CreateBinOp(BO.getOpcode(), X, Y, "narrow");
- if (auto *NewBinOp = dyn_cast<BinaryOperator>(NarrowBO)) {
- if (IsSext)
- NewBinOp->setHasNoSignedWrap();
- else
- NewBinOp->setHasNoUnsignedWrap();
- }
- return CastInst::Create(CastOpc, NarrowBO, BO.getType());
-}
-
-static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) {
- // At least one GEP must be inbounds.
- if (!GEP1.isInBounds() && !GEP2.isInBounds())
- return false;
-
- return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) &&
- (GEP2.isInBounds() || GEP2.hasAllZeroIndices());
-}
-
-/// Thread a GEP operation with constant indices through the constant true/false
-/// arms of a select.
-static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
- InstCombiner::BuilderTy &Builder) {
- if (!GEP.hasAllConstantIndices())
- return nullptr;
-
- Instruction *Sel;
- Value *Cond;
- Constant *TrueC, *FalseC;
- if (!match(GEP.getPointerOperand(), m_Instruction(Sel)) ||
- !match(Sel,
- m_Select(m_Value(Cond), m_Constant(TrueC), m_Constant(FalseC))))
- return nullptr;
-
- // gep (select Cond, TrueC, FalseC), IndexC --> select Cond, TrueC', FalseC'
- // Propagate 'inbounds' and metadata from existing instructions.
- // Note: using IRBuilder to create the constants for efficiency.
+ // We need at least one extended operand.
+ Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);
+
+ // If this is a sub, we swap the operands since we always want an extension
+ // on the RHS. The LHS can be an extension or a constant.
+ if (BO.getOpcode() == Instruction::Sub)
+ std::swap(Op0, Op1);
+
+ Value *X;
+ bool IsSext = match(Op0, m_SExt(m_Value(X)));
+ if (!IsSext && !match(Op0, m_ZExt(m_Value(X))))
+ return nullptr;
+
+ // If both operands are the same extension from the same source type and we
+ // can eliminate at least one (hasOneUse), this might work.
+ CastInst::CastOps CastOpc = IsSext ? Instruction::SExt : Instruction::ZExt;
+ Value *Y;
+ if (!(match(Op1, m_ZExtOrSExt(m_Value(Y))) && X->getType() == Y->getType() &&
+ cast<Operator>(Op1)->getOpcode() == CastOpc &&
+ (Op0->hasOneUse() || Op1->hasOneUse()))) {
+ // If that did not match, see if we have a suitable constant operand.
+ // Truncating and extending must produce the same constant.
+ Constant *WideC;
+ if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
+ return nullptr;
+ Constant *NarrowC = ConstantExpr::getTrunc(WideC, X->getType());
+ if (ConstantExpr::getCast(CastOpc, NarrowC, BO.getType()) != WideC)
+ return nullptr;
+ Y = NarrowC;
+ }
+
+ // Swap back now that we found our operands.
+ if (BO.getOpcode() == Instruction::Sub)
+ std::swap(X, Y);
+
+ // Both operands have narrow versions. Last step: the math must not overflow
+ // in the narrow width.
+ if (!willNotOverflow(BO.getOpcode(), X, Y, BO, IsSext))
+ return nullptr;
+
+ // bo (ext X), (ext Y) --> ext (bo X, Y)
+ // bo (ext X), C --> ext (bo X, C')
+ Value *NarrowBO = Builder.CreateBinOp(BO.getOpcode(), X, Y, "narrow");
+ if (auto *NewBinOp = dyn_cast<BinaryOperator>(NarrowBO)) {
+ if (IsSext)
+ NewBinOp->setHasNoSignedWrap();
+ else
+ NewBinOp->setHasNoUnsignedWrap();
+ }
+ return CastInst::Create(CastOpc, NarrowBO, BO.getType());
+}
+
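narrowMathIfNoOverflow hinges on one arithmetic fact: if the operation provably cannot wrap in the narrow type, extending before or after the math gives the same result. A tiny standalone C++ illustration follows, using plain integers rather than the InstCombine API; the particular values are arbitrary.

// Standalone model of the narrowing rule for a zero-extended add.
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t X = 100, Y = 27;

  // bo (zext X), (zext Y): the wide computation.
  const uint32_t Wide = uint32_t(X) + uint32_t(Y);

  // 100 + 27 = 127 fits in 8 bits, so the narrow add does not wrap and
  // zext (bo X, Y) produces the same value.
  const uint8_t Narrow = uint8_t(X + Y);
  assert(uint32_t(Narrow) == Wide);
  return 0;
}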
+static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) {
+ // At least one GEP must be inbounds.
+ if (!GEP1.isInBounds() && !GEP2.isInBounds())
+ return false;
+
+ return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) &&
+ (GEP2.isInBounds() || GEP2.hasAllZeroIndices());
+}
+
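isMergedGEPInBounds can be read as a plain boolean predicate over the two inputs: at least one GEP must be inbounds, and each GEP must be either inbounds or a no-op with all-zero indices. A standalone restatement is sketched below; the helper name and flags are hypothetical and only restate the condition above.

// Standalone restatement of the merged-inbounds condition.
#include <cassert>

bool mergedInBounds(bool InBounds1, bool AllZero1, bool InBounds2, bool AllZero2) {
  if (!InBounds1 && !InBounds2) // at least one GEP must be inbounds
    return false;
  return (InBounds1 || AllZero1) && (InBounds2 || AllZero2);
}

int main() {
  // An inbounds GEP merged with a zero-offset non-inbounds GEP stays inbounds.
  assert(mergedInBounds(/*InBounds1=*/true, /*AllZero1=*/false,
                        /*InBounds2=*/false, /*AllZero2=*/true));
  // Two non-inbounds GEPs never produce an inbounds result.
  assert(!mergedInBounds(false, true, false, true));
  return 0;
}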
+/// Thread a GEP operation with constant indices through the constant true/false
+/// arms of a select.
+static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
+ InstCombiner::BuilderTy &Builder) {
+ if (!GEP.hasAllConstantIndices())
+ return nullptr;
+
+ Instruction *Sel;
+ Value *Cond;
+ Constant *TrueC, *FalseC;
+ if (!match(GEP.getPointerOperand(), m_Instruction(Sel)) ||
+ !match(Sel,
+ m_Select(m_Value(Cond), m_Constant(TrueC), m_Constant(FalseC))))
+ return nullptr;
+
+ // gep (select Cond, TrueC, FalseC), IndexC --> select Cond, TrueC', FalseC'
+ // Propagate 'inbounds' and metadata from existing instructions.
+ // Note: using IRBuilder to create the constants for efficiency.
SmallVector<Value *, 4> IndexC(GEP.indices());
- bool IsInBounds = GEP.isInBounds();
- Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(TrueC, IndexC)
- : Builder.CreateGEP(TrueC, IndexC);
- Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(FalseC, IndexC)
- : Builder.CreateGEP(FalseC, IndexC);
- return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
-}
-
+ bool IsInBounds = GEP.isInBounds();
+ Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(TrueC, IndexC)
+ : Builder.CreateGEP(TrueC, IndexC);
+ Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(FalseC, IndexC)
+ : Builder.CreateGEP(FalseC, IndexC);
+ return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
+}
+
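foldSelectGEP pushes a constant-index GEP into both arms of a select over constant pointers. The pointer identity it relies on can be checked with ordinary C++ pointer arithmetic; the sketch below is standalone and illustrative, with made-up arrays standing in for the constant arms.

// Standalone model: gep(select(C, A, B), I) == select(C, gep(A, I), gep(B, I)).
#include <cassert>

int main() {
  static const int TrueC[4]  = {1, 2, 3, 4};
  static const int FalseC[4] = {5, 6, 7, 8};
  const bool Cond = true;
  const int IndexC = 2;

  // gep (select Cond, TrueC, FalseC), IndexC
  const int *Original = (Cond ? TrueC : FalseC) + IndexC;
  // select Cond, (gep TrueC, IndexC), (gep FalseC, IndexC)
  const int *Folded = Cond ? TrueC + IndexC : FalseC + IndexC;

  assert(Original == Folded);
  return 0;
}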
Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value *, 8> Ops(GEP.operands());
- Type *GEPType = GEP.getType();
- Type *GEPEltType = GEP.getSourceElementType();
- bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
- if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
- return replaceInstUsesWith(GEP, V);
-
- // For vector geps, use the generic demanded vector support.
- // Skip if GEP return type is scalable. The number of elements is unknown at
- // compile-time.
- if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
- auto VWidth = GEPFVTy->getNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
- UndefElts)) {
- if (V != &GEP)
- return replaceInstUsesWith(GEP, V);
- return &GEP;
- }
-
- // TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if
- // possible (decide on canonical form for pointer broadcast), 3) exploit
- // undef elements to decrease demanded bits
- }
-
- Value *PtrOp = GEP.getOperand(0);
-
- // Eliminate unneeded casts for indices, and replace indices which displace
- // by multiples of a zero size type with zero.
- bool MadeChange = false;
-
- // Index width may not be the same width as pointer width.
- // Data layout chooses the right type based on supported integer types.
- Type *NewScalarIndexTy =
- DL.getIndexType(GEP.getPointerOperandType()->getScalarType());
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
- ++I, ++GTI) {
- // Skip indices into struct types.
- if (GTI.isStruct())
- continue;
-
- Type *IndexTy = (*I)->getType();
- Type *NewIndexType =
- IndexTy->isVectorTy()
- ? VectorType::get(NewScalarIndexTy,
- cast<VectorType>(IndexTy)->getElementCount())
- : NewScalarIndexTy;
-
- // If the element type has zero size then any index over it is equivalent
- // to an index of zero, so replace it with zero if it is not zero already.
- Type *EltTy = GTI.getIndexedType();
- if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
- if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
- *I = Constant::getNullValue(NewIndexType);
- MadeChange = true;
- }
-
- if (IndexTy != NewIndexType) {
- // If we are using a wider index than needed for this platform, shrink
- // it to what we need. If narrower, sign-extend it to what we need.
- // This explicit cast can make subsequent optimizations more obvious.
- *I = Builder.CreateIntCast(*I, NewIndexType, true);
- MadeChange = true;
- }
- }
- if (MadeChange)
- return &GEP;
-
- // Check to see if the inputs to the PHI node are getelementptr instructions.
- if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
- auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
- if (!Op1)
- return nullptr;
-
- // Don't fold a GEP into itself through a PHI node. This can only happen
- // through the back-edge of a loop. Folding a GEP into itself means that
- // the value of the previous iteration needs to be stored in the meantime,
- // thus requiring an additional register variable to be live, but not
- // actually achieving anything (the GEP still needs to be executed once per
- // loop iteration).
- if (Op1 == &GEP)
- return nullptr;
-
- int DI = -1;
-
- for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
- auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
- if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
- return nullptr;
-
- // As for Op1 above, don't try to fold a GEP into itself.
- if (Op2 == &GEP)
- return nullptr;
-
- // Keep track of the type as we walk the GEP.
- Type *CurTy = nullptr;
-
- for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
- if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
- return nullptr;
-
- if (Op1->getOperand(J) != Op2->getOperand(J)) {
- if (DI == -1) {
-            // We have not seen any differences in the GEPs feeding the
- // PHI yet, so we record this one if it is allowed to be a
- // variable.
-
- // The first two arguments can vary for any GEP, the rest have to be
- // static for struct slots
- if (J > 1) {
- assert(CurTy && "No current type?");
- if (CurTy->isStructTy())
- return nullptr;
- }
-
- DI = J;
- } else {
- // The GEP is different by more than one input. While this could be
- // extended to support GEPs that vary by more than one variable it
- // doesn't make sense since it greatly increases the complexity and
- // would result in an R+R+R addressing mode which no backend
- // directly supports and would need to be broken into several
- // simpler instructions anyway.
- return nullptr;
- }
- }
-
- // Sink down a layer of the type for the next iteration.
- if (J > 0) {
- if (J == 1) {
- CurTy = Op1->getSourceElementType();
- } else {
- CurTy =
- GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
- }
- }
- }
- }
-
- // If not all GEPs are identical we'll have to create a new PHI node.
- // Check that the old PHI node has only one use so that it will get
- // removed.
- if (DI != -1 && !PN->hasOneUse())
- return nullptr;
-
- auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
- if (DI == -1) {
- // All the GEPs feeding the PHI are identical. Clone one down into our
- // BB so that it can be merged with the current GEP.
- } else {
- // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
- // into the current block so it can be merged, and create a new PHI to
- // set that index.
- PHINode *NewPN;
- {
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(PN);
- NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
- PN->getNumOperands());
- }
-
- for (auto &I : PN->operands())
- NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
- PN->getIncomingBlock(I));
-
- NewGEP->setOperand(DI, NewPN);
- }
-
- GEP.getParent()->getInstList().insert(
- GEP.getParent()->getFirstInsertionPt(), NewGEP);
- replaceOperand(GEP, 0, NewGEP);
- PtrOp = NewGEP;
- }
-
- // Combine Indices - If the source pointer to this getelementptr instruction
- // is a getelementptr instruction, combine the indices of the two
- // getelementptr instructions into a single instruction.
- if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
- if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
- return nullptr;
-
- // Try to reassociate loop invariant GEP chains to enable LICM.
- if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
- Src->hasOneUse()) {
- if (Loop *L = LI->getLoopFor(GEP.getParent())) {
- Value *GO1 = GEP.getOperand(1);
- Value *SO1 = Src->getOperand(1);
- // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
- // invariant: this breaks the dependence between GEPs and allows LICM
- // to hoist the invariant part out of the loop.
- if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
- // We have to be careful here.
- // We have something like:
- // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
- // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
-          // If we just swap idx & idx2 then we could inadvertently
- // change %src from a vector to a scalar, or vice versa.
- // Cases:
- // 1) %base a scalar & idx a scalar & idx2 a vector
- // => Swapping idx & idx2 turns %src into a vector type.
- // 2) %base a scalar & idx a vector & idx2 a scalar
-          //     => Swapping idx & idx2 turns %src into a scalar type.
- // 3) %base, %idx, and %idx2 are scalars
- // => %src & %gep are scalars
- // => swapping idx & idx2 is safe
- // 4) %base a vector
- // => %src is a vector
- // => swapping idx & idx2 is safe.
- auto *SO0 = Src->getOperand(0);
- auto *SO0Ty = SO0->getType();
- if (!isa<VectorType>(GEPType) || // case 3
- isa<VectorType>(SO0Ty)) { // case 4
- Src->setOperand(1, GO1);
- GEP.setOperand(1, SO1);
- return &GEP;
- } else {
- // Case 1 or 2
- // -- have to recreate %src & %gep
- // put NewSrc at same location as %src
- Builder.SetInsertPoint(cast<Instruction>(PtrOp));
- auto *NewSrc = cast<GetElementPtrInst>(
- Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
- NewSrc->setIsInBounds(Src->isInBounds());
- auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
- NewGEP->setIsInBounds(GEP.isInBounds());
- return NewGEP;
- }
- }
- }
- }
-
- // Note that if our source is a gep chain itself then we wait for that
- // chain to be resolved before we perform this transformation. This
-    // avoids creating a TON of code in some cases.
- if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
- if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
- return nullptr; // Wait until our source is folded to completion.
-
- SmallVector<Value*, 8> Indices;
-
- // Find out whether the last index in the source GEP is a sequential idx.
- bool EndsWithSequential = false;
- for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
- I != E; ++I)
- EndsWithSequential = I.isSequential();
-
- // Can we combine the two pointer arithmetics offsets?
- if (EndsWithSequential) {
- // Replace: gep (gep %P, long B), long A, ...
- // With: T = long A+B; gep %P, T, ...
- Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
- Value *GO1 = GEP.getOperand(1);
-
- // If they aren't the same type, then the input hasn't been processed
- // by the loop above yet (which canonicalizes sequential index types to
- // intptr_t). Just avoid transforming this until the input has been
- // normalized.
- if (SO1->getType() != GO1->getType())
- return nullptr;
-
- Value *Sum =
- SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
- // Only do the combine when we are sure the cost after the
- // merge is never more than that before the merge.
- if (Sum == nullptr)
- return nullptr;
-
- // Update the GEP in place if possible.
- if (Src->getNumOperands() == 2) {
- GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
- replaceOperand(GEP, 0, Src->getOperand(0));
- replaceOperand(GEP, 1, Sum);
- return &GEP;
- }
- Indices.append(Src->op_begin()+1, Src->op_end()-1);
- Indices.push_back(Sum);
- Indices.append(GEP.op_begin()+2, GEP.op_end());
- } else if (isa<Constant>(*GEP.idx_begin()) &&
- cast<Constant>(*GEP.idx_begin())->isNullValue() &&
- Src->getNumOperands() != 1) {
- // Otherwise we can do the fold if the first index of the GEP is a zero
- Indices.append(Src->op_begin()+1, Src->op_end());
- Indices.append(GEP.idx_begin()+1, GEP.idx_end());
- }
-
- if (!Indices.empty())
- return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
- ? GetElementPtrInst::CreateInBounds(
- Src->getSourceElementType(), Src->getOperand(0), Indices,
- GEP.getName())
- : GetElementPtrInst::Create(Src->getSourceElementType(),
- Src->getOperand(0), Indices,
- GEP.getName());
- }
-
- // Skip if GEP source element type is scalable. The type alloc size is unknown
- // at compile-time.
- if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
- unsigned AS = GEP.getPointerAddressSpace();
- if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
- DL.getIndexSizeInBits(AS)) {
- uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
-
- bool Matched = false;
- uint64_t C;
- Value *V = nullptr;
- if (TyAllocSize == 1) {
- V = GEP.getOperand(1);
- Matched = true;
- } else if (match(GEP.getOperand(1),
- m_AShr(m_Value(V), m_ConstantInt(C)))) {
- if (TyAllocSize == 1ULL << C)
- Matched = true;
- } else if (match(GEP.getOperand(1),
- m_SDiv(m_Value(V), m_ConstantInt(C)))) {
- if (TyAllocSize == C)
- Matched = true;
- }
-
- if (Matched) {
- // Canonicalize (gep i8* X, -(ptrtoint Y))
- // to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
- // The GEP pattern is emitted by the SCEV expander for certain kinds of
- // pointer arithmetic.
- if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
- Operator *Index = cast<Operator>(V);
- Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
- Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
- return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
- }
- // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
- // to (bitcast Y)
- Value *Y;
- if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
- m_PtrToInt(m_Specific(GEP.getOperand(0))))))
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
- }
- }
- }
-
- // We do not handle pointer-vector geps here.
- if (GEPType->isVectorTy())
- return nullptr;
-
- // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
- Value *StrippedPtr = PtrOp->stripPointerCasts();
- PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
-
- if (StrippedPtr != PtrOp) {
- bool HasZeroPointerIndex = false;
- Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
-
- if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
- HasZeroPointerIndex = C->isZero();
-
- // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
- // into : GEP [10 x i8]* X, i32 0, ...
- //
- // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
- // into : GEP i8* X, ...
- //
- // This occurs when the program declares an array extern like "int X[];"
- if (HasZeroPointerIndex) {
- if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
- // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
- if (CATy->getElementType() == StrippedPtrEltTy) {
- // -> GEP i8* X, ...
+ Type *GEPType = GEP.getType();
+ Type *GEPEltType = GEP.getSourceElementType();
+ bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
+ if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
+ return replaceInstUsesWith(GEP, V);
+
+ // For vector geps, use the generic demanded vector support.
+ // Skip if GEP return type is scalable. The number of elements is unknown at
+ // compile-time.
+ if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
+ auto VWidth = GEPFVTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
+ UndefElts)) {
+ if (V != &GEP)
+ return replaceInstUsesWith(GEP, V);
+ return &GEP;
+ }
+
+ // TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if
+ // possible (decide on canonical form for pointer broadcast), 3) exploit
+ // undef elements to decrease demanded bits
+ }
+
+ Value *PtrOp = GEP.getOperand(0);
+
+ // Eliminate unneeded casts for indices, and replace indices which displace
+ // by multiples of a zero size type with zero.
+ bool MadeChange = false;
+
+ // Index width may not be the same width as pointer width.
+ // Data layout chooses the right type based on supported integer types.
+ Type *NewScalarIndexTy =
+ DL.getIndexType(GEP.getPointerOperandType()->getScalarType());
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
+ ++I, ++GTI) {
+ // Skip indices into struct types.
+ if (GTI.isStruct())
+ continue;
+
+ Type *IndexTy = (*I)->getType();
+ Type *NewIndexType =
+ IndexTy->isVectorTy()
+ ? VectorType::get(NewScalarIndexTy,
+ cast<VectorType>(IndexTy)->getElementCount())
+ : NewScalarIndexTy;
+
+ // If the element type has zero size then any index over it is equivalent
+ // to an index of zero, so replace it with zero if it is not zero already.
+ Type *EltTy = GTI.getIndexedType();
+ if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
+ if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
+ *I = Constant::getNullValue(NewIndexType);
+ MadeChange = true;
+ }
+
+ if (IndexTy != NewIndexType) {
+ // If we are using a wider index than needed for this platform, shrink
+ // it to what we need. If narrower, sign-extend it to what we need.
+ // This explicit cast can make subsequent optimizations more obvious.
+ *I = Builder.CreateIntCast(*I, NewIndexType, true);
+ MadeChange = true;
+ }
+ }
+ if (MadeChange)
+ return &GEP;
+
+ // Check to see if the inputs to the PHI node are getelementptr instructions.
+ if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
+ auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+ if (!Op1)
+ return nullptr;
+
+ // Don't fold a GEP into itself through a PHI node. This can only happen
+ // through the back-edge of a loop. Folding a GEP into itself means that
+ // the value of the previous iteration needs to be stored in the meantime,
+ // thus requiring an additional register variable to be live, but not
+ // actually achieving anything (the GEP still needs to be executed once per
+ // loop iteration).
+ if (Op1 == &GEP)
+ return nullptr;
+
+ int DI = -1;
+
+ for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
+ auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
+ if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
+ return nullptr;
+
+ // As for Op1 above, don't try to fold a GEP into itself.
+ if (Op2 == &GEP)
+ return nullptr;
+
+ // Keep track of the type as we walk the GEP.
+ Type *CurTy = nullptr;
+
+ for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
+ if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
+ return nullptr;
+
+ if (Op1->getOperand(J) != Op2->getOperand(J)) {
+ if (DI == -1) {
+            // We have not seen any differences in the GEPs feeding the
+ // PHI yet, so we record this one if it is allowed to be a
+ // variable.
+
+ // The first two arguments can vary for any GEP, the rest have to be
+ // static for struct slots
+ if (J > 1) {
+ assert(CurTy && "No current type?");
+ if (CurTy->isStructTy())
+ return nullptr;
+ }
+
+ DI = J;
+ } else {
+ // The GEP is different by more than one input. While this could be
+ // extended to support GEPs that vary by more than one variable it
+ // doesn't make sense since it greatly increases the complexity and
+ // would result in an R+R+R addressing mode which no backend
+ // directly supports and would need to be broken into several
+ // simpler instructions anyway.
+ return nullptr;
+ }
+ }
+
+ // Sink down a layer of the type for the next iteration.
+ if (J > 0) {
+ if (J == 1) {
+ CurTy = Op1->getSourceElementType();
+ } else {
+ CurTy =
+ GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
+ }
+ }
+ }
+ }
+
+ // If not all GEPs are identical we'll have to create a new PHI node.
+ // Check that the old PHI node has only one use so that it will get
+ // removed.
+ if (DI != -1 && !PN->hasOneUse())
+ return nullptr;
+
+ auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+ if (DI == -1) {
+ // All the GEPs feeding the PHI are identical. Clone one down into our
+ // BB so that it can be merged with the current GEP.
+ } else {
+ // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
+ // into the current block so it can be merged, and create a new PHI to
+ // set that index.
+ PHINode *NewPN;
+ {
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(PN);
+ NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
+ PN->getNumOperands());
+ }
+
+ for (auto &I : PN->operands())
+ NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
+ PN->getIncomingBlock(I));
+
+ NewGEP->setOperand(DI, NewPN);
+ }
+
+ GEP.getParent()->getInstList().insert(
+ GEP.getParent()->getFirstInsertionPt(), NewGEP);
+ replaceOperand(GEP, 0, NewGEP);
+ PtrOp = NewGEP;
+ }
+
+ // Combine Indices - If the source pointer to this getelementptr instruction
+ // is a getelementptr instruction, combine the indices of the two
+ // getelementptr instructions into a single instruction.
+ if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
+ if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
+ return nullptr;
+
+ // Try to reassociate loop invariant GEP chains to enable LICM.
+ if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
+ Src->hasOneUse()) {
+ if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+ Value *GO1 = GEP.getOperand(1);
+ Value *SO1 = Src->getOperand(1);
+ // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+ // invariant: this breaks the dependence between GEPs and allows LICM
+ // to hoist the invariant part out of the loop.
+ if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+ // We have to be careful here.
+ // We have something like:
+ // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
+ // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
+          // If we just swap idx & idx2 then we could inadvertently
+ // change %src from a vector to a scalar, or vice versa.
+ // Cases:
+ // 1) %base a scalar & idx a scalar & idx2 a vector
+ // => Swapping idx & idx2 turns %src into a vector type.
+ // 2) %base a scalar & idx a vector & idx2 a scalar
+          //     => Swapping idx & idx2 turns %src into a scalar type.
+ // 3) %base, %idx, and %idx2 are scalars
+ // => %src & %gep are scalars
+ // => swapping idx & idx2 is safe
+ // 4) %base a vector
+ // => %src is a vector
+ // => swapping idx & idx2 is safe.
+ auto *SO0 = Src->getOperand(0);
+ auto *SO0Ty = SO0->getType();
+ if (!isa<VectorType>(GEPType) || // case 3
+ isa<VectorType>(SO0Ty)) { // case 4
+ Src->setOperand(1, GO1);
+ GEP.setOperand(1, SO1);
+ return &GEP;
+ } else {
+ // Case 1 or 2
+ // -- have to recreate %src & %gep
+ // put NewSrc at same location as %src
+ Builder.SetInsertPoint(cast<Instruction>(PtrOp));
+ auto *NewSrc = cast<GetElementPtrInst>(
+ Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
+ NewSrc->setIsInBounds(Src->isInBounds());
+ auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
+ NewGEP->setIsInBounds(GEP.isInBounds());
+ return NewGEP;
+ }
+ }
+ }
+ }
+
+ // Note that if our source is a gep chain itself then we wait for that
+ // chain to be resolved before we perform this transformation. This
+    // avoids creating a TON of code in some cases.
+ if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
+ return nullptr; // Wait until our source is folded to completion.
+
+ SmallVector<Value*, 8> Indices;
+
+ // Find out whether the last index in the source GEP is a sequential idx.
+ bool EndsWithSequential = false;
+ for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
+ I != E; ++I)
+ EndsWithSequential = I.isSequential();
+
+ // Can we combine the two pointer arithmetics offsets?
+ if (EndsWithSequential) {
+ // Replace: gep (gep %P, long B), long A, ...
+ // With: T = long A+B; gep %P, T, ...
+ Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
+ Value *GO1 = GEP.getOperand(1);
+
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
+
+ Value *Sum =
+ SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
+ return nullptr;
+
+ // Update the GEP in place if possible.
+ if (Src->getNumOperands() == 2) {
+ GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
+ replaceOperand(GEP, 0, Src->getOperand(0));
+ replaceOperand(GEP, 1, Sum);
+ return &GEP;
+ }
+ Indices.append(Src->op_begin()+1, Src->op_end()-1);
+ Indices.push_back(Sum);
+ Indices.append(GEP.op_begin()+2, GEP.op_end());
+ } else if (isa<Constant>(*GEP.idx_begin()) &&
+ cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+ Src->getNumOperands() != 1) {
+ // Otherwise we can do the fold if the first index of the GEP is a zero
+ Indices.append(Src->op_begin()+1, Src->op_end());
+ Indices.append(GEP.idx_begin()+1, GEP.idx_end());
+ }
+
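+ // Illustrative sketch (hypothetical IR) of the ends-with-sequential case:
+ //   %s = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 1
+ //   %g = getelementptr i32, i32* %s, i64 2
+ // merges into a single
+ //   %m = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 3
+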
+ if (!Indices.empty())
+ return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
+ ? GetElementPtrInst::CreateInBounds(
+ Src->getSourceElementType(), Src->getOperand(0), Indices,
+ GEP.getName())
+ : GetElementPtrInst::Create(Src->getSourceElementType(),
+ Src->getOperand(0), Indices,
+ GEP.getName());
+ }
+
+ // Skip if GEP source element type is scalable. The type alloc size is unknown
+ // at compile-time.
+ if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
+ unsigned AS = GEP.getPointerAddressSpace();
+ if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
+ DL.getIndexSizeInBits(AS)) {
+ uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
+
+ bool Matched = false;
+ uint64_t C;
+ Value *V = nullptr;
+ if (TyAllocSize == 1) {
+ V = GEP.getOperand(1);
+ Matched = true;
+ } else if (match(GEP.getOperand(1),
+ m_AShr(m_Value(V), m_ConstantInt(C)))) {
+ if (TyAllocSize == 1ULL << C)
+ Matched = true;
+ } else if (match(GEP.getOperand(1),
+ m_SDiv(m_Value(V), m_ConstantInt(C)))) {
+ if (TyAllocSize == C)
+ Matched = true;
+ }
+
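+ // Illustrative sketch (hypothetical IR, assuming a 64-bit index type): for an
+ // i32 element (TyAllocSize == 4),
+ //   %iX  = ptrtoint i32* %X to i64
+ //   %iY  = ptrtoint i32* %Y to i64
+ //   %d   = sub i64 %iY, %iX
+ //   %idx = ashr exact i64 %d, 2
+ //   %g   = getelementptr i32, i32* %X, i64 %idx
+ // matches with V == %d and C == 2 (4 == 1 << 2), and the sub-of-ptrtoints
+ // pattern below then lets %g be rewritten as a bitcast of %Y.
+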
+ if (Matched) {
+ // Canonicalize (gep i8* X, -(ptrtoint Y))
+ // to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
+ // The GEP pattern is emitted by the SCEV expander for certain kinds of
+ // pointer arithmetic.
+ if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
+ Operator *Index = cast<Operator>(V);
+ Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
+ Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
+ return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
+ }
+ // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
+ // to (bitcast Y)
+ Value *Y;
+ if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
+ m_PtrToInt(m_Specific(GEP.getOperand(0))))))
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
+ }
+ }
+ }
+
+ // We do not handle pointer-vector geps here.
+ if (GEPType->isVectorTy())
+ return nullptr;
+
+ // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
+ Value *StrippedPtr = PtrOp->stripPointerCasts();
+ PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
+
+ if (StrippedPtr != PtrOp) {
+ bool HasZeroPointerIndex = false;
+ Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
+
+ if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
+ HasZeroPointerIndex = C->isZero();
+
+ // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
+ // into : GEP [10 x i8]* X, i32 0, ...
+ //
+ // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
+ // into : GEP i8* X, ...
+ //
+ // This occurs when the program declares an array extern like "int X[];"
+ if (HasZeroPointerIndex) {
+ if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
+ // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == StrippedPtrEltTy) {
+ // -> GEP i8* X, ...
SmallVector<Value *, 8> Idx(drop_begin(GEP.indices()));
- GetElementPtrInst *Res = GetElementPtrInst::Create(
- StrippedPtrEltTy, StrippedPtr, Idx, GEP.getName());
- Res->setIsInBounds(GEP.isInBounds());
- if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
- return Res;
- // Insert Res, and create an addrspacecast.
- // e.g.,
- // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ...
- // ->
- // %0 = GEP i8 addrspace(1)* X, ...
- // addrspacecast i8 addrspace(1)* %0 to i8*
- return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
- }
-
- if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrEltTy)) {
- // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
- if (CATy->getElementType() == XATy->getElementType()) {
- // -> GEP [10 x i8]* X, i32 0, ...
- // At this point, we know that the cast source type is a pointer
- // to an array of the same type as the destination pointer
- // array. Because the array type is never stepped over (there
- // is a leading zero) we can fold the cast into this GEP.
- if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
- GEP.setSourceElementType(XATy);
- return replaceOperand(GEP, 0, StrippedPtr);
- }
- // Cannot replace the base pointer directly because StrippedPtr's
- // address space is different. Instead, create a new GEP followed by
- // an addrspacecast.
- // e.g.,
- // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*),
- // i32 0, ...
- // ->
- // %0 = GEP [10 x i8] addrspace(1)* X, ...
- // addrspacecast i8 addrspace(1)* %0 to i8*
+ GetElementPtrInst *Res = GetElementPtrInst::Create(
+ StrippedPtrEltTy, StrippedPtr, Idx, GEP.getName());
+ Res->setIsInBounds(GEP.isInBounds());
+ if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
+ return Res;
+ // Insert Res, and create an addrspacecast.
+ // e.g.,
+ // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ...
+ // ->
+ // %0 = GEP i8 addrspace(1)* X, ...
+ // addrspacecast i8 addrspace(1)* %0 to i8*
+ return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
+ }
+
+ if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrEltTy)) {
+ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == XATy->getElementType()) {
+ // -> GEP [10 x i8]* X, i32 0, ...
+ // At this point, we know that the cast source type is a pointer
+ // to an array of the same type as the destination pointer
+ // array. Because the array type is never stepped over (there
+ // is a leading zero) we can fold the cast into this GEP.
+ if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
+ GEP.setSourceElementType(XATy);
+ return replaceOperand(GEP, 0, StrippedPtr);
+ }
+ // Cannot replace the base pointer directly because StrippedPtr's
+ // address space is different. Instead, create a new GEP followed by
+ // an addrspacecast.
+ // e.g.,
+ // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*),
+ // i32 0, ...
+ // ->
+ // %0 = GEP [10 x i8] addrspace(1)* X, ...
+ // addrspacecast i8 addrspace(1)* %0 to i8*
SmallVector<Value *, 8> Idx(GEP.indices());
- Value *NewGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
- Idx, GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
- GEP.getName());
- return new AddrSpaceCastInst(NewGEP, GEPType);
- }
- }
- }
- } else if (GEP.getNumOperands() == 2 && !IsGEPSrcEleScalable) {
- // Skip if GEP source element type is scalable. The type alloc size is
- // unknown at compile-time.
- // Transform things like: %t = getelementptr i32*
- // bitcast ([2 x i32]* %str to i32*), i32 %V into: %t1 = getelementptr [2
- // x i32]* %str, i32 0, i32 %V; bitcast
- if (StrippedPtrEltTy->isArrayTy() &&
- DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) ==
- DL.getTypeAllocSize(GEPEltType)) {
- Type *IdxType = DL.getIndexType(GEPType);
- Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
- Value *NewGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx,
- GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
- GEP.getName());
-
- // V and GEP are both pointer types --> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
- }
-
- // Transform things like:
- // %V = mul i64 %N, 4
- // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
- // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
- if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) {
- // Check that changing the type amounts to dividing the index by a scale
- // factor.
- uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
- uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy).getFixedSize();
- if (ResSize && SrcSize % ResSize == 0) {
- Value *Idx = GEP.getOperand(1);
- unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
- uint64_t Scale = SrcSize / ResSize;
-
- // Earlier transforms ensure that the index has the right type
- // according to Data Layout, which considerably simplifies the
- // logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIndexType(GEPType) &&
- "Index type does not match the Data Layout preferences");
-
- bool NSW;
- if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
- // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
- // If the multiplication NewIdx * Scale may overflow then the new
- // GEP may not be "inbounds".
- Value *NewGEP =
- GEP.isInBounds() && NSW
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
- NewIdx, GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx,
- GEP.getName());
-
- // The NewGEP must be pointer typed, so must the old one -> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEPType);
- }
- }
- }
-
- // Similarly, transform things like:
- // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
- // (where tmp = 8*tmp2) into:
- // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
- if (GEPEltType->isSized() && StrippedPtrEltTy->isSized() &&
- StrippedPtrEltTy->isArrayTy()) {
- // Check that changing to the array element type amounts to dividing the
- // index by a scale factor.
- uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
- uint64_t ArrayEltSize =
- DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType())
- .getFixedSize();
- if (ResSize && ArrayEltSize % ResSize == 0) {
- Value *Idx = GEP.getOperand(1);
- unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
- uint64_t Scale = ArrayEltSize / ResSize;
-
- // Earlier transforms ensure that the index has the right type
- // according to the Data Layout, which considerably simplifies
- // the logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIndexType(GEPType) &&
- "Index type does not match the Data Layout preferences");
-
- bool NSW;
- if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
- // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
- // If the multiplication NewIdx * Scale may overflow then the new
- // GEP may not be "inbounds".
- Type *IndTy = DL.getIndexType(GEPType);
- Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
-
- Value *NewGEP =
- GEP.isInBounds() && NSW
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
- Off, GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off,
- GEP.getName());
- // The NewGEP must be pointer typed, so must the old one -> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEPType);
- }
- }
- }
- }
- }
-
- // addrspacecast between types is canonicalized as a bitcast, then an
- // addrspacecast. To take advantage of the below bitcast + struct GEP, look
- // through the addrspacecast.
- Value *ASCStrippedPtrOp = PtrOp;
- if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
- // X = bitcast A addrspace(1)* to B addrspace(1)*
- // Y = addrspacecast A addrspace(1)* to B addrspace(2)*
- // Z = gep Y, <...constant indices...>
- // Into an addrspacecasted GEP of the struct.
- if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
- ASCStrippedPtrOp = BC;
- }
-
- if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
- Value *SrcOp = BCI->getOperand(0);
- PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
- Type *SrcEltType = SrcType->getElementType();
-
- // GEP directly using the source operand if this GEP is accessing an element
- // of a bitcasted pointer to vector or array of the same dimensions:
- // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
- // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
- auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
- const DataLayout &DL) {
+ Value *NewGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
+ Idx, GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
+ GEP.getName());
+ return new AddrSpaceCastInst(NewGEP, GEPType);
+ }
+ }
+ }
+ } else if (GEP.getNumOperands() == 2 && !IsGEPSrcEleScalable) {
+ // Skip if GEP source element type is scalable. The type alloc size is
+ // unknown at compile-time.
+ // Transform things like: %t = getelementptr i32*
+ // bitcast ([2 x i32]* %str to i32*), i32 %V into: %t1 = getelementptr [2
+ // x i32]* %str, i32 0, i32 %V; bitcast
+ if (StrippedPtrEltTy->isArrayTy() &&
+ DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) ==
+ DL.getTypeAllocSize(GEPEltType)) {
+ Type *IdxType = DL.getIndexType(GEPType);
+ Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
+ Value *NewGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx,
+ GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
+ GEP.getName());
+
+ // V and GEP are both pointer types --> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
+ }
+
+ // Transform things like:
+ // %V = mul i64 %N, 4
+ // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
+ // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
+ if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) {
+ // Check that changing the type amounts to dividing the index by a scale
+ // factor.
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
+ uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy).getFixedSize();
+ if (ResSize && SrcSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = SrcSize / ResSize;
+
+ // Earlier transforms ensure that the index has the right type
+ // according to Data Layout, which considerably simplifies the
+ // logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Value *NewGEP =
+ GEP.isInBounds() && NSW
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
+ NewIdx, GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx,
+ GEP.getName());
+
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+ GEPType);
+ }
+ }
+ }
+
+ // Similarly, transform things like:
+ // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
+ // (where tmp = 8*tmp2) into:
+ // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
+ if (GEPEltType->isSized() && StrippedPtrEltTy->isSized() &&
+ StrippedPtrEltTy->isArrayTy()) {
+ // Check that changing to the array element type amounts to dividing the
+ // index by a scale factor.
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
+ uint64_t ArrayEltSize =
+ DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType())
+ .getFixedSize();
+ if (ResSize && ArrayEltSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = ArrayEltSize / ResSize;
+
+ // Earlier transforms ensure that the index has the right type
+ // according to the Data Layout, which considerably simplifies
+ // the logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Type *IndTy = DL.getIndexType(GEPType);
+ Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
+
+ Value *NewGEP =
+ GEP.isInBounds() && NSW
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
+ Off, GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off,
+ GEP.getName());
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+ GEPType);
+ }
+ }
+ }
+ }
+ }
+
+ // addrspacecast between types is canonicalized as a bitcast, then an
+ // addrspacecast. To take advantage of the below bitcast + struct GEP, look
+ // through the addrspacecast.
+ Value *ASCStrippedPtrOp = PtrOp;
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
+ // X = bitcast A addrspace(1)* to B addrspace(1)*
+ // Y = addrspacecast A addrspace(1)* to B addrspace(2)*
+ // Z = gep Y, <...constant indices...>
+ // Into an addrspacecasted GEP of the struct.
+ if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
+ ASCStrippedPtrOp = BC;
+ }
+
+ if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
+ Value *SrcOp = BCI->getOperand(0);
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
+ Type *SrcEltType = SrcType->getElementType();
+
+ // GEP directly using the source operand if this GEP is accessing an element
+ // of a bitcasted pointer to vector or array of the same dimensions:
+ // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
+ // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
+ auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
+ const DataLayout &DL) {
auto *VecVTy = cast<FixedVectorType>(VecTy);
- return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
- ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
- DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
- };
- if (GEP.getNumOperands() == 3 &&
+ return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
+ ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
+ DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
+ };
+ if (GEP.getNumOperands() == 3 &&
((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) &&
- areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
+ areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
(isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() &&
- areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
-
- // Create a new GEP here, as using `setOperand()` followed by
- // `setSourceElementType()` won't actually update the type of the
- // existing GEP Value, causing issues if this Value is accessed when
- // constructing an AddrSpaceCastInst.
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
- : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
- NGEP->takeName(&GEP);
-
- // Preserve GEP address space to satisfy users
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
-
- return replaceInstUsesWith(GEP, NGEP);
- }
-
- // See if we can simplify:
- // X = bitcast A* to B*
- // Y = gep X, <...constant indices...>
- // into a gep of the original struct. This is important for SROA and alias
- // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
- unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
- APInt Offset(OffsetBits, 0);
- if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
- // If this GEP instruction doesn't move the pointer, just replace the GEP
- // with a bitcast of the real input to the dest type.
- if (!Offset) {
- // If the bitcast is of an allocation, and the allocation will be
- // converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
- // See if the bitcast simplifies, if so, don't nuke this GEP yet.
- if (Instruction *I = visitBitCast(*BCI)) {
- if (I != BCI) {
- I->takeName(BCI);
- BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
- replaceInstUsesWith(*BCI, I);
- }
- return &GEP;
- }
- }
-
- if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(SrcOp, GEPType);
- return new BitCastInst(SrcOp, GEPType);
- }
-
- // Otherwise, if the offset is non-zero, we need to find out if there is a
- // field at Offset in 'A's type. If so, we can pull the cast through the
- // GEP.
- SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
- : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
-
- if (NGEP->getType() == GEPType)
- return replaceInstUsesWith(GEP, NGEP);
- NGEP->takeName(&GEP);
-
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
- return new BitCastInst(NGEP, GEPType);
- }
- }
- }
-
- if (!GEP.isInBounds()) {
- unsigned IdxWidth =
- DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace());
- APInt BasePtrOffset(IdxWidth, 0);
- Value *UnderlyingPtrOp =
- PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
- BasePtrOffset);
- if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
- if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
- BasePtrOffset.isNonNegative()) {
- APInt AllocSize(
- IdxWidth,
- DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
- if (BasePtrOffset.ule(AllocSize)) {
- return GetElementPtrInst::CreateInBounds(
- GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
- GEP.getName());
- }
- }
- }
- }
-
- if (Instruction *R = foldSelectGEP(GEP, Builder))
- return R;
-
- return nullptr;
-}
-
-static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
- Instruction *AI) {
- if (isa<ConstantPointerNull>(V))
- return true;
- if (auto *LI = dyn_cast<LoadInst>(V))
- return isa<GlobalVariable>(LI->getPointerOperand());
- // Two distinct allocations will never be equal.
- // We rely on LookThroughBitCast in isAllocLikeFn being false: if it looked
- // through bitcasts of V, the return below could be true even when AI and V
- // (e.g., i8* -> i32* -> i8* of AI) are the same allocation.
- return isAllocLikeFn(V, TLI) && V != AI;
-}
-
-static bool isAllocSiteRemovable(Instruction *AI,
- SmallVectorImpl<WeakTrackingVH> &Users,
- const TargetLibraryInfo *TLI) {
- SmallVector<Instruction*, 4> Worklist;
- Worklist.push_back(AI);
-
- do {
- Instruction *PI = Worklist.pop_back_val();
- for (User *U : PI->users()) {
- Instruction *I = cast<Instruction>(U);
- switch (I->getOpcode()) {
- default:
- // Give up the moment we see something we can't handle.
- return false;
-
- case Instruction::AddrSpaceCast:
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- Users.emplace_back(I);
- Worklist.push_back(I);
- continue;
-
- case Instruction::ICmp: {
- ICmpInst *ICI = cast<ICmpInst>(I);
- // We can fold eq/ne comparisons with null to false/true, respectively.
- // We also fold comparisons in some conditions provided the alloc has
- // not escaped (see isNeverEqualToUnescapedAlloc).
- if (!ICI->isEquality())
- return false;
- unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
- if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
- return false;
- Users.emplace_back(I);
- continue;
- }
-
- case Instruction::Call:
- // Ignore no-op and store intrinsics.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- return false;
-
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- case Intrinsic::memset: {
- MemIntrinsic *MI = cast<MemIntrinsic>(II);
- if (MI->isVolatile() || MI->getRawDest() != PI)
- return false;
- LLVM_FALLTHROUGH;
- }
- case Intrinsic::assume:
- case Intrinsic::invariant_start:
- case Intrinsic::invariant_end:
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- case Intrinsic::objectsize:
- Users.emplace_back(I);
- continue;
- }
- }
-
- if (isFreeCall(I, TLI)) {
- Users.emplace_back(I);
- continue;
- }
- return false;
-
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(I);
- if (SI->isVolatile() || SI->getPointerOperand() != PI)
- return false;
- Users.emplace_back(I);
- continue;
- }
- }
- llvm_unreachable("missing a return?");
- }
- } while (!Worklist.empty());
- return true;
-}
-
+ areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
+
+ // Create a new GEP here, as using `setOperand()` followed by
+ // `setSourceElementType()` won't actually update the type of the
+ // existing GEP Value, causing issues if this Value is accessed when
+ // constructing an AddrSpaceCastInst.
+ Value *NGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
+ : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
+ NGEP->takeName(&GEP);
+
+ // Preserve GEP address space to satisfy users
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEPType);
+
+ return replaceInstUsesWith(GEP, NGEP);
+ }
+
+ // See if we can simplify:
+ // X = bitcast A* to B*
+ // Y = gep X, <...constant indices...>
+ // into a gep of the original struct. This is important for SROA and alias
+ // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
+ APInt Offset(OffsetBits, 0);
+ if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
+ // If this GEP instruction doesn't move the pointer, just replace the GEP
+ // with a bitcast of the real input to the dest type.
+ if (!Offset) {
+ // If the bitcast is of an allocation, and the allocation will be
+ // converted to match the type of the cast, don't touch this.
+ if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
+ // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+ if (Instruction *I = visitBitCast(*BCI)) {
+ if (I != BCI) {
+ I->takeName(BCI);
+ BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
+ replaceInstUsesWith(*BCI, I);
+ }
+ return &GEP;
+ }
+ }
+
+ if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(SrcOp, GEPType);
+ return new BitCastInst(SrcOp, GEPType);
+ }
+
+ // Otherwise, if the offset is non-zero, we need to find out if there is a
+ // field at Offset in 'A's type. If so, we can pull the cast through the
+ // GEP.
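+ // Illustrative sketch (hypothetical IR, assuming a typical data layout) with
+ // %struct.S = type { i32, i32 }:
+ //   %b = bitcast %struct.S* %a to i8*
+ //   %g = getelementptr i8, i8* %b, i64 4
+ // can become
+ //   %f = getelementptr %struct.S, %struct.S* %a, i64 0, i32 1
+ //   %g = bitcast i32* %f to i8*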
+ SmallVector<Value*, 8> NewIndices;
+ if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
+ Value *NGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
+ : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
+
+ if (NGEP->getType() == GEPType)
+ return replaceInstUsesWith(GEP, NGEP);
+ NGEP->takeName(&GEP);
+
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEPType);
+ return new BitCastInst(NGEP, GEPType);
+ }
+ }
+ }
+
+ if (!GEP.isInBounds()) {
+ unsigned IdxWidth =
+ DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace());
+ APInt BasePtrOffset(IdxWidth, 0);
+ Value *UnderlyingPtrOp =
+ PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
+ BasePtrOffset);
+ if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
+ if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
+ BasePtrOffset.isNonNegative()) {
+ APInt AllocSize(
+ IdxWidth,
+ DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
+ if (BasePtrOffset.ule(AllocSize)) {
+ return GetElementPtrInst::CreateInBounds(
+ GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
+ GEP.getName());
+ }
+ }
+ }
+ }
+
+ if (Instruction *R = foldSelectGEP(GEP, Builder))
+ return R;
+
+ return nullptr;
+}
+
+static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
+ Instruction *AI) {
+ if (isa<ConstantPointerNull>(V))
+ return true;
+ if (auto *LI = dyn_cast<LoadInst>(V))
+ return isa<GlobalVariable>(LI->getPointerOperand());
+ // Two distinct allocations will never be equal.
+ // We rely on LookThroughBitCast in isAllocLikeFn being false: if it looked
+ // through bitcasts of V, the return below could be true even when AI and V
+ // (e.g., i8* -> i32* -> i8* of AI) are the same allocation.
+ return isAllocLikeFn(V, TLI) && V != AI;
+}
+
+static bool isAllocSiteRemovable(Instruction *AI,
+ SmallVectorImpl<WeakTrackingVH> &Users,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<Instruction*, 4> Worklist;
+ Worklist.push_back(AI);
+
+ do {
+ Instruction *PI = Worklist.pop_back_val();
+ for (User *U : PI->users()) {
+ Instruction *I = cast<Instruction>(U);
+ switch (I->getOpcode()) {
+ default:
+ // Give up the moment we see something we can't handle.
+ return false;
+
+ case Instruction::AddrSpaceCast:
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ Users.emplace_back(I);
+ Worklist.push_back(I);
+ continue;
+
+ case Instruction::ICmp: {
+ ICmpInst *ICI = cast<ICmpInst>(I);
+ // We can fold eq/ne comparisons with null to false/true, respectively.
+ // We also fold comparisons in some conditions provided the alloc has
+ // not escaped (see isNeverEqualToUnescapedAlloc).
+ if (!ICI->isEquality())
+ return false;
+ unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
+ if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
+ return false;
+ Users.emplace_back(I);
+ continue;
+ }
+
+ case Instruction::Call:
+ // Ignore no-op and store intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ case Intrinsic::memset: {
+ MemIntrinsic *MI = cast<MemIntrinsic>(II);
+ if (MI->isVolatile() || MI->getRawDest() != PI)
+ return false;
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::assume:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::objectsize:
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+
+ if (isFreeCall(I, TLI)) {
+ Users.emplace_back(I);
+ continue;
+ }
+ return false;
+
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ if (SI->isVolatile() || SI->getPointerOperand() != PI)
+ return false;
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+ llvm_unreachable("missing a return?");
+ }
+ } while (!Worklist.empty());
+ return true;
+}
+
Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
- // If we have a malloc call that is used only in comparisons to null and in
- // free calls, delete the calls and replace the comparisons with true or
- // false as appropriate.
-
- // This is based on the principle that we can substitute our own allocation
- // function (which will never return null) rather than knowledge of the
- // specific function being called. In some sense this can change the permitted
- // outputs of a program (when we convert a malloc to an alloca, the fact that
- // the allocation is now on the stack is potentially visible, for example),
- // but we believe it does so in a permissible manner.
- SmallVector<WeakTrackingVH, 64> Users;
-
- // If we are removing an alloca with a dbg.declare, insert dbg.value calls
- // before each store.
+ // If we have a malloc call that is used only in comparisons to null and in
+ // free calls, delete the calls and replace the comparisons with true or
+ // false as appropriate.
+
+ // This is based on the principle that we can substitute our own allocation
+ // function (which will never return null) rather than knowledge of the
+ // specific function being called. In some sense this can change the permitted
+ // outputs of a program (when we convert a malloc to an alloca, the fact that
+ // the allocation is now on the stack is potentially visible, for example),
+ // but we believe it does so in a permissible manner.
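+ //
+ // Illustrative sketch (hypothetical IR) of a removable allocation site:
+ //   %p = call i8* @malloc(i64 16)
+ //   %c = icmp eq i8* %p, null
+ //   call void @free(i8* %p)
+ // Both calls are erased and %c is replaced by 'false', per the substitution
+ // argument above.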
+ SmallVector<WeakTrackingVH, 64> Users;
+
+ // If we are removing an alloca with a dbg.declare, insert dbg.value calls
+ // before each store.
SmallVector<DbgVariableIntrinsic *, 8> DVIs;
- std::unique_ptr<DIBuilder> DIB;
- if (isa<AllocaInst>(MI)) {
+ std::unique_ptr<DIBuilder> DIB;
+ if (isa<AllocaInst>(MI)) {
findDbgUsers(DVIs, &MI);
- DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
- }
-
- if (isAllocSiteRemovable(&MI, Users, &TLI)) {
- for (unsigned i = 0, e = Users.size(); i != e; ++i) {
- // Lowering all @llvm.objectsize calls first because they may
- // use a bitcast/GEP of the alloca we are removing.
- if (!Users[i])
- continue;
-
- Instruction *I = cast<Instruction>(&*Users[i]);
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- if (II->getIntrinsicID() == Intrinsic::objectsize) {
- Value *Result =
- lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true);
- replaceInstUsesWith(*I, Result);
- eraseInstFromFunction(*I);
- Users[i] = nullptr; // Skip examining in the next loop.
- }
- }
- }
- for (unsigned i = 0, e = Users.size(); i != e; ++i) {
- if (!Users[i])
- continue;
-
- Instruction *I = cast<Instruction>(&*Users[i]);
-
- if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
- replaceInstUsesWith(*C,
- ConstantInt::get(Type::getInt1Ty(C->getContext()),
- C->isFalseWhenEqual()));
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
+ }
+
+ if (isAllocSiteRemovable(&MI, Users, &TLI)) {
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ // Lowering all @llvm.objectsize calls first because they may
+ // use a bitcast/GEP of the alloca we are removing.
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::objectsize) {
+ Value *Result =
+ lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true);
+ replaceInstUsesWith(*I, Result);
+ eraseInstFromFunction(*I);
+ Users[i] = nullptr; // Skip examining in the next loop.
+ }
+ }
+ }
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
+ replaceInstUsesWith(*C,
+ ConstantInt::get(Type::getInt1Ty(C->getContext()),
+ C->isFalseWhenEqual()));
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
for (auto *DVI : DVIs)
if (DVI->isAddressOfVariable())
ConvertDebugDeclareToDebugValue(DVI, SI, *DIB);
- } else {
- // Casts, GEP, or anything else: we're about to delete this instruction,
- // so it can not have any valid uses.
- replaceInstUsesWith(*I, UndefValue::get(I->getType()));
- }
- eraseInstFromFunction(*I);
- }
-
- if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
- // Replace invoke with a NOP intrinsic to maintain the original CFG
- Module *M = II->getModule();
- Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
- InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
- None, "", II->getParent());
- }
-
+ } else {
+ // Casts, GEP, or anything else: we're about to delete this instruction,
+ // so it can not have any valid uses.
+ replaceInstUsesWith(*I, UndefValue::get(I->getType()));
+ }
+ eraseInstFromFunction(*I);
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
+ // Replace invoke with a NOP intrinsic to maintain the original CFG
+ Module *M = II->getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
+ InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
+ None, "", II->getParent());
+ }
+
// Remove debug intrinsics which describe the value contained within the
// alloca. In addition to removing dbg.{declare,addr} which simply point to
// the alloca, remove dbg.value(<alloca>, ..., DW_OP_deref)'s as well, e.g.:
@@ -2697,157 +2697,157 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
for (auto *DVI : DVIs)
if (DVI->isAddressOfVariable() || DVI->getExpression()->startsWithDeref())
DVI->eraseFromParent();
-
- return eraseInstFromFunction(MI);
- }
- return nullptr;
-}
-
-/// Move the call to free before a NULL test.
-///
-/// Check whether this free is reached only after its argument has been tested
-/// against NULL (property 0).
-/// If so, it is legal to move this call into its predecessor block.
-///
-/// The move is performed only if the block containing the call to free
-/// will be removed, i.e.:
-/// 1. it has only one predecessor P, and P has two successors
-/// 2. it contains the call, noops, and an unconditional branch
-/// 3. its successor is the same as its predecessor's successor
-///
-/// Profitability is not a concern here; this function should be called only
-/// if the caller knows this transformation would be profitable (e.g., for
-/// code size).
-static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
- const DataLayout &DL) {
- Value *Op = FI.getArgOperand(0);
- BasicBlock *FreeInstrBB = FI.getParent();
- BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
-
- // Validate part of constraint #1: Only one predecessor
- // FIXME: We can extend the number of predecessors, but in that case, we
- // would duplicate the call to free in each predecessor and it may
- // not be profitable even for code size.
- if (!PredBB)
- return nullptr;
-
- // Validate constraint #2: Does this block contain only the call to
- // free, noops, and an unconditional branch?
- BasicBlock *SuccBB;
- Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
- if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
- return nullptr;
-
- // If there are only 2 instructions in the block, then at this point they
- // are the call to free and the unconditional branch.
- // If there are more than 2 instructions, check that they are noops
- // i.e., they won't hurt the performance of the generated code.
- if (FreeInstrBB->size() != 2) {
- for (const Instruction &Inst : FreeInstrBB->instructionsWithoutDebug()) {
- if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
- continue;
- auto *Cast = dyn_cast<CastInst>(&Inst);
- if (!Cast || !Cast->isNoopCast(DL))
- return nullptr;
- }
- }
- // Validate the rest of constraint #1 by matching on the pred branch.
- Instruction *TI = PredBB->getTerminator();
- BasicBlock *TrueBB, *FalseBB;
- ICmpInst::Predicate Pred;
- if (!match(TI, m_Br(m_ICmp(Pred,
- m_CombineOr(m_Specific(Op),
- m_Specific(Op->stripPointerCasts())),
- m_Zero()),
- TrueBB, FalseBB)))
- return nullptr;
- if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
- return nullptr;
-
- // Validate constraint #3: Ensure the null case just falls through.
- if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
- return nullptr;
- assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
- "Broken CFG: missing edge from predecessor to successor");
-
- // At this point, we know that everything in FreeInstrBB can be moved
- // before TI.
- for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
- It != End;) {
- Instruction &Instr = *It++;
- if (&Instr == FreeInstrBBTerminator)
- break;
- Instr.moveBefore(TI);
- }
- assert(FreeInstrBB->size() == 1 &&
- "Only the branch instruction should remain");
- return &FI;
-}
-
+
+ return eraseInstFromFunction(MI);
+ }
+ return nullptr;
+}
+
+/// Move the call to free before a NULL test.
+///
+/// Check whether this free is reached only after its argument has been tested
+/// against NULL (property 0).
+/// If so, it is legal to move this call into its predecessor block.
+///
+/// The move is performed only if the block containing the call to free
+/// will be removed, i.e.:
+/// 1. it has only one predecessor P, and P has two successors
+/// 2. it contains the call, noops, and an unconditional branch
+/// 3. its successor is the same as its predecessor's successor
+///
+/// Profitability is not a concern here; this function should be called only
+/// if the caller knows this transformation would be profitable (e.g., for
+/// code size).
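+///
+/// A hypothetical shape that qualifies (illustrative only):
+///   pred:    %c = icmp eq i8* %p, null
+///            br i1 %c, label %cont, label %free.bb
+///   free.bb: call void @free(i8* %p)
+///            br label %cont
+/// After the move, the call to free lives in 'pred' and 'free.bb' becomes
+/// trivially removable.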
+static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
+ const DataLayout &DL) {
+ Value *Op = FI.getArgOperand(0);
+ BasicBlock *FreeInstrBB = FI.getParent();
+ BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
+
+ // Validate part of constraint #1: Only one predecessor
+ // FIXME: We can extend the number of predecessors, but in that case, we
+ // would duplicate the call to free in each predecessor and it may
+ // not be profitable even for code size.
+ if (!PredBB)
+ return nullptr;
+
+ // Validate constraint #2: Does this block contain only the call to
+ // free, noops, and an unconditional branch?
+ BasicBlock *SuccBB;
+ Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
+ if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
+ return nullptr;
+
+ // If there are only 2 instructions in the block, then at this point they
+ // are the call to free and the unconditional branch.
+ // If there are more than 2 instructions, check that they are noops
+ // i.e., they won't hurt the performance of the generated code.
+ if (FreeInstrBB->size() != 2) {
+ for (const Instruction &Inst : FreeInstrBB->instructionsWithoutDebug()) {
+ if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
+ continue;
+ auto *Cast = dyn_cast<CastInst>(&Inst);
+ if (!Cast || !Cast->isNoopCast(DL))
+ return nullptr;
+ }
+ }
+ // Validate the rest of constraint #1 by matching on the pred branch.
+ Instruction *TI = PredBB->getTerminator();
+ BasicBlock *TrueBB, *FalseBB;
+ ICmpInst::Predicate Pred;
+ if (!match(TI, m_Br(m_ICmp(Pred,
+ m_CombineOr(m_Specific(Op),
+ m_Specific(Op->stripPointerCasts())),
+ m_Zero()),
+ TrueBB, FalseBB)))
+ return nullptr;
+ if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+
+ // Validate constraint #3: Ensure the null case just falls through.
+ if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
+ return nullptr;
+ assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
+ "Broken CFG: missing edge from predecessor to successor");
+
+ // At this point, we know that everything in FreeInstrBB can be moved
+ // before TI.
+ for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
+ It != End;) {
+ Instruction &Instr = *It++;
+ if (&Instr == FreeInstrBBTerminator)
+ break;
+ Instr.moveBefore(TI);
+ }
+ assert(FreeInstrBB->size() == 1 &&
+ "Only the branch instruction should remain");
+ return &FI;
+}
+
Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
- Value *Op = FI.getArgOperand(0);
-
- // free undef -> unreachable.
- if (isa<UndefValue>(Op)) {
- // Leave a marker since we can't modify the CFG here.
- CreateNonTerminatorUnreachable(&FI);
- return eraseInstFromFunction(FI);
- }
-
- // If we have 'free null' delete the instruction. This can happen in stl code
- // when lots of inlining happens.
- if (isa<ConstantPointerNull>(Op))
- return eraseInstFromFunction(FI);
-
- // If we optimize for code size, try to move the call to free before the null
- // test so that SimplifyCFG can remove the empty block and dead code
- // elimination can remove the branch. I.e., this helps to turn something like:
- // if (foo) free(foo);
- // into
- // free(foo);
- //
- // Note that we can only do this for 'free' and not for any flavor of
- // 'operator delete'; there is no 'operator delete' symbol for which we are
- // permitted to invent a call, even if we're passing in a null pointer.
- if (MinimizeSize) {
- LibFunc Func;
- if (TLI.getLibFunc(FI, Func) && TLI.has(Func) && Func == LibFunc_free)
- if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
- return I;
- }
-
- return nullptr;
-}
-
-static bool isMustTailCall(Value *V) {
- if (auto *CI = dyn_cast<CallInst>(V))
- return CI->isMustTailCall();
- return false;
-}
-
+ Value *Op = FI.getArgOperand(0);
+
+ // free undef -> unreachable.
+ if (isa<UndefValue>(Op)) {
+ // Leave a marker since we can't modify the CFG here.
+ CreateNonTerminatorUnreachable(&FI);
+ return eraseInstFromFunction(FI);
+ }
+
+ // If we have 'free null' delete the instruction. This can happen in stl code
+ // when lots of inlining happens.
+ if (isa<ConstantPointerNull>(Op))
+ return eraseInstFromFunction(FI);
+
+ // If we optimize for code size, try to move the call to free before the null
+ // test so that SimplifyCFG can remove the empty block and dead code
+ // elimination can remove the branch. I.e., this helps to turn something like:
+ // if (foo) free(foo);
+ // into
+ // free(foo);
+ //
+ // Note that we can only do this for 'free' and not for any flavor of
+ // 'operator delete'; there is no 'operator delete' symbol for which we are
+ // permitted to invent a call, even if we're passing in a null pointer.
+ if (MinimizeSize) {
+ LibFunc Func;
+ if (TLI.getLibFunc(FI, Func) && TLI.has(Func) && Func == LibFunc_free)
+ if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
+ return I;
+ }
+
+ return nullptr;
+}
+
+static bool isMustTailCall(Value *V) {
+ if (auto *CI = dyn_cast<CallInst>(V))
+ return CI->isMustTailCall();
+ return false;
+}
+
Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) {
- if (RI.getNumOperands() == 0) // ret void
- return nullptr;
-
- Value *ResultOp = RI.getOperand(0);
- Type *VTy = ResultOp->getType();
- if (!VTy->isIntegerTy() || isa<Constant>(ResultOp))
- return nullptr;
-
- // Don't replace result of musttail calls.
- if (isMustTailCall(ResultOp))
- return nullptr;
-
- // There might be assume intrinsics dominating this return that completely
- // determine the value. If so, constant fold it.
- KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
- if (Known.isConstant())
- return replaceOperand(RI, 0,
- Constant::getIntegerValue(VTy, Known.getConstant()));
-
- return nullptr;
-}
-
+ if (RI.getNumOperands() == 0) // ret void
+ return nullptr;
+
+ Value *ResultOp = RI.getOperand(0);
+ Type *VTy = ResultOp->getType();
+ if (!VTy->isIntegerTy() || isa<Constant>(ResultOp))
+ return nullptr;
+
+ // Don't replace result of musttail calls.
+ if (isMustTailCall(ResultOp))
+ return nullptr;
+
+ // There might be assume intrinsics dominating this return that completely
+ // determine the value. If so, constant fold it.
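+ // Illustrative sketch (hypothetical IR): given a dominating
+ //   %c = icmp eq i32 %x, 42
+ //   call void @llvm.assume(i1 %c)
+ // computeKnownBits can prove %x == 42, so 'ret i32 %x' becomes 'ret i32 42'.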
+ KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
+ if (Known.isConstant())
+ return replaceOperand(RI, 0,
+ Constant::getIntegerValue(VTy, Known.getConstant()));
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitUnreachableInst(UnreachableInst &I) {
// Try to remove the previous instruction if it must lead to unreachable.
// This includes instructions like stores and "llvm.assume" that may not get
@@ -2873,597 +2873,597 @@ Instruction *InstCombinerImpl::visitUnreachableInst(UnreachableInst &I) {
}
Instruction *InstCombinerImpl::visitUnconditionalBranchInst(BranchInst &BI) {
- assert(BI.isUnconditional() && "Only for unconditional branches.");
-
- // If this store is the second-to-last instruction in the basic block
- // (excluding debug info and bitcasts of pointers) and if the block ends with
- // an unconditional branch, try to move the store to the successor block.
-
- auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
- auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
- return isa<DbgInfoIntrinsic>(BBI) ||
- (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
- };
-
- BasicBlock::iterator FirstInstr = BBI->getParent()->begin();
- do {
- if (BBI != FirstInstr)
- --BBI;
- } while (BBI != FirstInstr && IsNoopInstrForStoreMerging(BBI));
-
- return dyn_cast<StoreInst>(BBI);
- };
-
- if (StoreInst *SI = GetLastSinkableStore(BasicBlock::iterator(BI)))
- if (mergeStoreIntoSuccessor(*SI))
- return &BI;
-
- return nullptr;
-}
-
+ assert(BI.isUnconditional() && "Only for unconditional branches.");
+
+ // If this store is the second-to-last instruction in the basic block
+ // (excluding debug info and bitcasts of pointers) and if the block ends with
+ // an unconditional branch, try to move the store to the successor block.
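+ //
+ // Illustrative sketch (hypothetical IR): with
+ //   bb1: store i32 1, i32* %p
+ //        br label %merge
+ //   bb2: store i32 2, i32* %p
+ //        br label %merge
+ // mergeStoreIntoSuccessor can replace both stores with a phi of the stored
+ // values and a single store in %merge, when its conditions hold.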
+
+ auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
+ auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
+ return isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
+ };
+
+ BasicBlock::iterator FirstInstr = BBI->getParent()->begin();
+ do {
+ if (BBI != FirstInstr)
+ --BBI;
+ } while (BBI != FirstInstr && IsNoopInstrForStoreMerging(BBI));
+
+ return dyn_cast<StoreInst>(BBI);
+ };
+
+ if (StoreInst *SI = GetLastSinkableStore(BasicBlock::iterator(BI)))
+ if (mergeStoreIntoSuccessor(*SI))
+ return &BI;
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) {
- if (BI.isUnconditional())
- return visitUnconditionalBranchInst(BI);
-
- // Change br (not X), label True, label False to: br X, label False, True
- Value *X = nullptr;
- if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
- !isa<Constant>(X)) {
- // Swap Destinations and condition...
- BI.swapSuccessors();
- return replaceOperand(BI, 0, X);
- }
-
- // If the condition is irrelevant, remove the use so that other
- // transforms on the condition become more effective.
- if (!isa<ConstantInt>(BI.getCondition()) &&
- BI.getSuccessor(0) == BI.getSuccessor(1))
- return replaceOperand(
- BI, 0, ConstantInt::getFalse(BI.getCondition()->getType()));
-
- // Canonicalize, for example, fcmp_one -> fcmp_oeq.
- CmpInst::Predicate Pred;
- if (match(&BI, m_Br(m_OneUse(m_FCmp(Pred, m_Value(), m_Value())),
- m_BasicBlock(), m_BasicBlock())) &&
- !isCanonicalPredicate(Pred)) {
- // Swap destinations and condition.
- CmpInst *Cond = cast<CmpInst>(BI.getCondition());
- Cond->setPredicate(CmpInst::getInversePredicate(Pred));
- BI.swapSuccessors();
- Worklist.push(Cond);
- return &BI;
- }
-
- return nullptr;
-}
-
+ if (BI.isUnconditional())
+ return visitUnconditionalBranchInst(BI);
+
+ // Change br (not X), label True, label False to: br X, label False, True
+ Value *X = nullptr;
+ if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
+ !isa<Constant>(X)) {
+ // Swap Destinations and condition...
+ BI.swapSuccessors();
+ return replaceOperand(BI, 0, X);
+ }
+
+ // If the condition is irrelevant, remove the use so that other
+ // transforms on the condition become more effective.
+ if (!isa<ConstantInt>(BI.getCondition()) &&
+ BI.getSuccessor(0) == BI.getSuccessor(1))
+ return replaceOperand(
+ BI, 0, ConstantInt::getFalse(BI.getCondition()->getType()));
+
+ // Canonicalize, for example, fcmp_one -> fcmp_oeq.
+ CmpInst::Predicate Pred;
+ if (match(&BI, m_Br(m_OneUse(m_FCmp(Pred, m_Value(), m_Value())),
+ m_BasicBlock(), m_BasicBlock())) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap destinations and condition.
+ CmpInst *Cond = cast<CmpInst>(BI.getCondition());
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ BI.swapSuccessors();
+ Worklist.push(Cond);
+ return &BI;
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitSwitchInst(SwitchInst &SI) {
- Value *Cond = SI.getCondition();
- Value *Op0;
- ConstantInt *AddRHS;
- if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
- // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
- for (auto Case : SI.cases()) {
- Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
- assert(isa<ConstantInt>(NewCase) &&
- "Result of expression should be constant");
- Case.setValue(cast<ConstantInt>(NewCase));
- }
- return replaceOperand(SI, 0, Op0);
- }
-
- KnownBits Known = computeKnownBits(Cond, 0, &SI);
- unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
- unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
-
- // Compute the number of leading bits we can ignore.
- // TODO: A better way to determine this would use ComputeNumSignBits().
- for (auto &C : SI.cases()) {
- LeadingKnownZeros = std::min(
- LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
- LeadingKnownOnes = std::min(
- LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
- }
-
- unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
-
- // Shrink the condition operand if the new type is smaller than the old type.
+ Value *Cond = SI.getCondition();
+ Value *Op0;
+ ConstantInt *AddRHS;
+ if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
+ // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
+ for (auto Case : SI.cases()) {
+ Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
+ assert(isa<ConstantInt>(NewCase) &&
+ "Result of expression should be constant");
+ Case.setValue(cast<ConstantInt>(NewCase));
+ }
+ return replaceOperand(SI, 0, Op0);
+ }
+
+ KnownBits Known = computeKnownBits(Cond, 0, &SI);
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
+
+ // Compute the number of leading bits we can ignore.
+ // TODO: A better way to determine this would use ComputeNumSignBits().
+ for (auto &C : SI.cases()) {
+ LeadingKnownZeros = std::min(
+ LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
+ LeadingKnownOnes = std::min(
+ LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
+ }
+
+ unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
+
+ // Shrink the condition operand if the new type is smaller than the old type.
// But do not shrink to a non-standard type, because backend can't generate
- // good code for that yet.
- // TODO: We can make it aggressive again after fixing PR39569.
- if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
- shouldChangeType(Known.getBitWidth(), NewWidth)) {
- IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
- Builder.SetInsertPoint(&SI);
- Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
-
- for (auto Case : SI.cases()) {
- APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
- Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
- }
- return replaceOperand(SI, 0, NewCond);
- }
-
- return nullptr;
-}
-
+ // good code for that yet.
+ // TODO: We can make it aggressive again after fixing PR39569.
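+ // Illustrative sketch (hypothetical IR): for a switch on i64 %x where the
+ // known bits and all case values leave the top 32 bits zero, the condition
+ // is truncated:
+ //   %t = trunc i64 %x to i32
+ //   switch i32 %t, ...   ; each case value truncated to i32 as well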
+ if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
+ shouldChangeType(Known.getBitWidth(), NewWidth)) {
+ IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
+ Builder.SetInsertPoint(&SI);
+ Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
+
+ for (auto Case : SI.cases()) {
+ APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
+ Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
+ }
+ return replaceOperand(SI, 0, NewCond);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
- Value *Agg = EV.getAggregateOperand();
-
- if (!EV.hasIndices())
- return replaceInstUsesWith(EV, Agg);
-
- if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
- SQ.getWithInstruction(&EV)))
- return replaceInstUsesWith(EV, V);
-
- if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
- // We're extracting from an insertvalue instruction, compare the indices
- const unsigned *exti, *exte, *insi, *inse;
- for (exti = EV.idx_begin(), insi = IV->idx_begin(),
- exte = EV.idx_end(), inse = IV->idx_end();
- exti != exte && insi != inse;
- ++exti, ++insi) {
- if (*insi != *exti)
- // The insert and extract both reference distinctly different elements.
- // This means the extract is not influenced by the insert, and we can
- // replace the aggregate operand of the extract with the aggregate
- // operand of the insert. i.e., replace
- // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
- // %E = extractvalue { i32, { i32 } } %I, 0
- // with
- // %E = extractvalue { i32, { i32 } } %A, 0
- return ExtractValueInst::Create(IV->getAggregateOperand(),
- EV.getIndices());
- }
- if (exti == exte && insi == inse)
- // Both iterators are at the end: Index lists are identical. Replace
- // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
- // %C = extractvalue { i32, { i32 } } %B, 1, 0
- // with "i32 42"
- return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
- if (exti == exte) {
- // The extract list is a prefix of the insert list. i.e. replace
- // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
- // %E = extractvalue { i32, { i32 } } %I, 1
- // with
- // %X = extractvalue { i32, { i32 } } %A, 1
- // %E = insertvalue { i32 } %X, i32 42, 0
- // by switching the order of the insert and extract (though the
- // insertvalue should be left in, since it may have other uses).
- Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
- EV.getIndices());
- return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
- makeArrayRef(insi, inse));
- }
- if (insi == inse)
- // The insert list is a prefix of the extract list
- // We can simply remove the common indices from the extract and make it
- // operate on the inserted value instead of the insertvalue result.
- // i.e., replace
- // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
- // %E = extractvalue { i32, { i32 } } %I, 1, 0
- // with
-      //                         %E = extractvalue { i32 } { i32 42 }, 0
- return ExtractValueInst::Create(IV->getInsertedValueOperand(),
- makeArrayRef(exti, exte));
- }
- if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) {
- // We're extracting from an overflow intrinsic, see if we're the only user,
- // which allows us to simplify multiple result intrinsics to simpler
- // things that just get one value.
- if (WO->hasOneUse()) {
- // Check if we're grabbing only the result of a 'with overflow' intrinsic
- // and replace it with a traditional binary instruction.
- if (*EV.idx_begin() == 0) {
- Instruction::BinaryOps BinOp = WO->getBinaryOp();
- Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
- replaceInstUsesWith(*WO, UndefValue::get(WO->getType()));
- eraseInstFromFunction(*WO);
- return BinaryOperator::Create(BinOp, LHS, RHS);
- }
-
- // If the normal result of the add is dead, and the RHS is a constant,
- // we can transform this into a range comparison.
- // overflow = uadd a, -4 --> overflow = icmp ugt a, 3
- if (WO->getIntrinsicID() == Intrinsic::uadd_with_overflow)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(WO->getRHS()))
- return new ICmpInst(ICmpInst::ICMP_UGT, WO->getLHS(),
- ConstantExpr::getNot(CI));
- }
- }
- if (LoadInst *L = dyn_cast<LoadInst>(Agg))
- // If the (non-volatile) load only has one use, we can rewrite this to a
- // load from a GEP. This reduces the size of the load. If a load is used
- // only by extractvalue instructions then this either must have been
- // optimized before, or it is a struct with padding, in which case we
- // don't want to do the transformation as it loses padding knowledge.
- if (L->isSimple() && L->hasOneUse()) {
- // extractvalue has integer indices, getelementptr has Value*s. Convert.
- SmallVector<Value*, 4> Indices;
- // Prefix an i32 0 since we need the first element.
- Indices.push_back(Builder.getInt32(0));
- for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
- I != E; ++I)
- Indices.push_back(Builder.getInt32(*I));
-
- // We need to insert these at the location of the old load, not at that of
- // the extractvalue.
- Builder.SetInsertPoint(L);
- Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
- L->getPointerOperand(), Indices);
- Instruction *NL = Builder.CreateLoad(EV.getType(), GEP);
-      // Whatever aliasing information we had for the original load must also
- // hold for the smaller load, so propagate the annotations.
- AAMDNodes Nodes;
- L->getAAMetadata(Nodes);
- NL->setAAMetadata(Nodes);
- // Returning the load directly will cause the main loop to insert it in
- // the wrong spot, so use replaceInstUsesWith().
- return replaceInstUsesWith(EV, NL);
- }
- // We could simplify extracts from other values. Note that nested extracts may
- // already be simplified implicitly by the above: extract (extract (insert) )
- // will be translated into extract ( insert ( extract ) ) first and then just
- // the value inserted, if appropriate. Similarly for extracts from single-use
- // loads: extract (extract (load)) will be translated to extract (load (gep))
- // and if again single-use then via load (gep (gep)) to load (gep).
- // However, double extracts from e.g. function arguments or return values
- // aren't handled yet.
- return nullptr;
-}
-
-/// Return 'true' if the given typeinfo will match anything.
-static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
- switch (Personality) {
- case EHPersonality::GNU_C:
- case EHPersonality::GNU_C_SjLj:
- case EHPersonality::Rust:
-    // The GCC C EH and Rust personalities only exist to support cleanups, so
- // it's not clear what the semantics of catch clauses are.
- return false;
- case EHPersonality::Unknown:
- return false;
- case EHPersonality::GNU_Ada:
- // While __gnat_all_others_value will match any Ada exception, it doesn't
- // match foreign exceptions (or didn't, before gcc-4.7).
- return false;
- case EHPersonality::GNU_CXX:
- case EHPersonality::GNU_CXX_SjLj:
- case EHPersonality::GNU_ObjC:
- case EHPersonality::MSVC_X86SEH:
+ Value *Agg = EV.getAggregateOperand();
+
+ if (!EV.hasIndices())
+ return replaceInstUsesWith(EV, Agg);
+
+ if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
+ SQ.getWithInstruction(&EV)))
+ return replaceInstUsesWith(EV, V);
+
+ if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
+ // We're extracting from an insertvalue instruction, compare the indices
+ const unsigned *exti, *exte, *insi, *inse;
+ for (exti = EV.idx_begin(), insi = IV->idx_begin(),
+ exte = EV.idx_end(), inse = IV->idx_end();
+ exti != exte && insi != inse;
+ ++exti, ++insi) {
+ if (*insi != *exti)
+ // The insert and extract both reference distinctly different elements.
+ // This means the extract is not influenced by the insert, and we can
+ // replace the aggregate operand of the extract with the aggregate
+ // operand of the insert. i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 0
+ // with
+ // %E = extractvalue { i32, { i32 } } %A, 0
+ return ExtractValueInst::Create(IV->getAggregateOperand(),
+ EV.getIndices());
+ }
+ if (exti == exte && insi == inse)
+ // Both iterators are at the end: Index lists are identical. Replace
+ // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %C = extractvalue { i32, { i32 } } %B, 1, 0
+ // with "i32 42"
+ return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
+ if (exti == exte) {
+ // The extract list is a prefix of the insert list. i.e. replace
+ // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %E = extractvalue { i32, { i32 } } %I, 1
+ // with
+ // %X = extractvalue { i32, { i32 } } %A, 1
+ // %E = insertvalue { i32 } %X, i32 42, 0
+ // by switching the order of the insert and extract (though the
+ // insertvalue should be left in, since it may have other uses).
+ Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
+ EV.getIndices());
+ return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
+ makeArrayRef(insi, inse));
+ }
+ if (insi == inse)
+ // The insert list is a prefix of the extract list
+ // We can simply remove the common indices from the extract and make it
+ // operate on the inserted value instead of the insertvalue result.
+ // i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 1, 0
+ // with
+      //                         %E = extractvalue { i32 } { i32 42 }, 0
+ return ExtractValueInst::Create(IV->getInsertedValueOperand(),
+ makeArrayRef(exti, exte));
+ }
+ if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) {
+ // We're extracting from an overflow intrinsic, see if we're the only user,
+ // which allows us to simplify multiple result intrinsics to simpler
+ // things that just get one value.
+ if (WO->hasOneUse()) {
+ // Check if we're grabbing only the result of a 'with overflow' intrinsic
+ // and replace it with a traditional binary instruction.
+ if (*EV.idx_begin() == 0) {
+ Instruction::BinaryOps BinOp = WO->getBinaryOp();
+ Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
+ replaceInstUsesWith(*WO, UndefValue::get(WO->getType()));
+ eraseInstFromFunction(*WO);
+ return BinaryOperator::Create(BinOp, LHS, RHS);
+ }
+
+ // If the normal result of the add is dead, and the RHS is a constant,
+ // we can transform this into a range comparison.
+ // overflow = uadd a, -4 --> overflow = icmp ugt a, 3
+ if (WO->getIntrinsicID() == Intrinsic::uadd_with_overflow)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(WO->getRHS()))
+ return new ICmpInst(ICmpInst::ICMP_UGT, WO->getLHS(),
+ ConstantExpr::getNot(CI));
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Agg))
+ // If the (non-volatile) load only has one use, we can rewrite this to a
+ // load from a GEP. This reduces the size of the load. If a load is used
+ // only by extractvalue instructions then this either must have been
+ // optimized before, or it is a struct with padding, in which case we
+ // don't want to do the transformation as it loses padding knowledge.
+ if (L->isSimple() && L->hasOneUse()) {
+ // extractvalue has integer indices, getelementptr has Value*s. Convert.
+ SmallVector<Value*, 4> Indices;
+ // Prefix an i32 0 since we need the first element.
+ Indices.push_back(Builder.getInt32(0));
+ for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
+ I != E; ++I)
+ Indices.push_back(Builder.getInt32(*I));
+
+ // We need to insert these at the location of the old load, not at that of
+ // the extractvalue.
+ Builder.SetInsertPoint(L);
+ Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
+ L->getPointerOperand(), Indices);
+ Instruction *NL = Builder.CreateLoad(EV.getType(), GEP);
+      // Whatever aliasing information we had for the original load must also
+ // hold for the smaller load, so propagate the annotations.
+ AAMDNodes Nodes;
+ L->getAAMetadata(Nodes);
+ NL->setAAMetadata(Nodes);
+ // Returning the load directly will cause the main loop to insert it in
+ // the wrong spot, so use replaceInstUsesWith().
+ return replaceInstUsesWith(EV, NL);
+ }
+ // We could simplify extracts from other values. Note that nested extracts may
+ // already be simplified implicitly by the above: extract (extract (insert) )
+ // will be translated into extract ( insert ( extract ) ) first and then just
+ // the value inserted, if appropriate. Similarly for extracts from single-use
+ // loads: extract (extract (load)) will be translated to extract (load (gep))
+ // and if again single-use then via load (gep (gep)) to load (gep).
+ // However, double extracts from e.g. function arguments or return values
+ // aren't handled yet.
+ return nullptr;
+}
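The load-narrowing case above converts extractvalue indices into GEP indices by prefixing a zero index. A minimal plain-C++ sketch of that conversion (hypothetical helper; the pass builds i32 ConstantInts for the GEP instead of raw integers):

    #include <cstdint>
    #include <vector>

    // Sketch: an extractvalue's integer indices become GEP indices by
    // prefixing a 0 that selects the pointed-to aggregate itself.
    std::vector<uint64_t>
    extractValueIndicesToGEPIndices(const std::vector<uint64_t> &EVIndices) {
      std::vector<uint64_t> GEPIndices;
      GEPIndices.reserve(EVIndices.size() + 1);
      GEPIndices.push_back(0); // step through the pointer first
      GEPIndices.insert(GEPIndices.end(), EVIndices.begin(), EVIndices.end());
      return GEPIndices;
    }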
+
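The uadd.with.overflow fold above (overflow = uadd a, C --> icmp ugt a, ~C) rests on a simple unsigned identity: a + C wraps exactly when a > ~C, i.e. a exceeds the largest value that still fits. A small self-contained check of that identity at 8 bits (plain C++, not LLVM code):

    #include <cassert>

    // Exhaustively verify: for 8-bit unsigned A and C, (A + C) wraps modulo
    // 256 exactly when A > ~C (that is, A > 255 - C). This is the identity
    // behind replacing the overflow bit of uadd.with.overflow(a, C) with
    // icmp ugt a, ~C.
    int main() {
      for (unsigned A = 0; A <= 255; ++A)
        for (unsigned C = 0; C <= 255; ++C) {
          bool Wraps = ((A + C) & 0xFF) != A + C;
          bool Predicted = A > (0xFF ^ C); // A > ~C in 8 bits
          assert(Wraps == Predicted);
        }
      return 0;
    }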
+/// Return 'true' if the given typeinfo will match anything.
+static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
+ switch (Personality) {
+ case EHPersonality::GNU_C:
+ case EHPersonality::GNU_C_SjLj:
+ case EHPersonality::Rust:
+    // The GCC C EH and Rust personalities only exist to support cleanups, so
+ // it's not clear what the semantics of catch clauses are.
+ return false;
+ case EHPersonality::Unknown:
+ return false;
+ case EHPersonality::GNU_Ada:
+ // While __gnat_all_others_value will match any Ada exception, it doesn't
+ // match foreign exceptions (or didn't, before gcc-4.7).
+ return false;
+ case EHPersonality::GNU_CXX:
+ case EHPersonality::GNU_CXX_SjLj:
+ case EHPersonality::GNU_ObjC:
+ case EHPersonality::MSVC_X86SEH:
case EHPersonality::MSVC_TableSEH:
- case EHPersonality::MSVC_CXX:
- case EHPersonality::CoreCLR:
- case EHPersonality::Wasm_CXX:
+ case EHPersonality::MSVC_CXX:
+ case EHPersonality::CoreCLR:
+ case EHPersonality::Wasm_CXX:
case EHPersonality::XL_CXX:
- return TypeInfo->isNullValue();
- }
- llvm_unreachable("invalid enum");
-}
-
-static bool shorter_filter(const Value *LHS, const Value *RHS) {
- return
- cast<ArrayType>(LHS->getType())->getNumElements()
- <
- cast<ArrayType>(RHS->getType())->getNumElements();
-}
-
+ return TypeInfo->isNullValue();
+ }
+ llvm_unreachable("invalid enum");
+}
+
+static bool shorter_filter(const Value *LHS, const Value *RHS) {
+ return
+ cast<ArrayType>(LHS->getType())->getNumElements()
+ <
+ cast<ArrayType>(RHS->getType())->getNumElements();
+}
+
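shorter_filter above is only a length comparator; visitLandingPadInst below stable-sorts each consecutive run of filter clauses with it so shorter, more-likely-to-match filters come first. A minimal plain-vector sketch of that step (hypothetical types; the real code compares ArrayType element counts):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    using Filter = std::vector<int>; // stand-in for a filter clause

    // Stable-sort the run of filters in [Begin, End) by element count, keeping
    // the relative order of equally sized filters to avoid pointless churn.
    void sortFilterRun(std::vector<Filter> &Clauses, std::size_t Begin,
                       std::size_t End) {
      std::stable_sort(Clauses.begin() + Begin, Clauses.begin() + End,
                       [](const Filter &L, const Filter &R) {
                         return L.size() < R.size();
                       });
    }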
Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) {
- // The logic here should be correct for any real-world personality function.
- // However if that turns out not to be true, the offending logic can always
- // be conditioned on the personality function, like the catch-all logic is.
- EHPersonality Personality =
- classifyEHPersonality(LI.getParent()->getParent()->getPersonalityFn());
-
-  // Simplify the list of clauses, e.g. by removing repeated catch clauses
- // (these are often created by inlining).
- bool MakeNewInstruction = false; // If true, recreate using the following:
- SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
- bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
-
- SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
- for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
- bool isLastClause = i + 1 == e;
- if (LI.isCatch(i)) {
- // A catch clause.
- Constant *CatchClause = LI.getClause(i);
- Constant *TypeInfo = CatchClause->stripPointerCasts();
-
- // If we already saw this clause, there is no point in having a second
- // copy of it.
- if (AlreadyCaught.insert(TypeInfo).second) {
- // This catch clause was not already seen.
- NewClauses.push_back(CatchClause);
- } else {
- // Repeated catch clause - drop the redundant copy.
- MakeNewInstruction = true;
- }
-
- // If this is a catch-all then there is no point in keeping any following
- // clauses or marking the landingpad as having a cleanup.
- if (isCatchAll(Personality, TypeInfo)) {
- if (!isLastClause)
- MakeNewInstruction = true;
- CleanupFlag = false;
- break;
- }
- } else {
- // A filter clause. If any of the filter elements were already caught
- // then they can be dropped from the filter. It is tempting to try to
- // exploit the filter further by saying that any typeinfo that does not
- // occur in the filter can't be caught later (and thus can be dropped).
- // However this would be wrong, since typeinfos can match without being
- // equal (for example if one represents a C++ class, and the other some
- // class derived from it).
- assert(LI.isFilter(i) && "Unsupported landingpad clause!");
- Constant *FilterClause = LI.getClause(i);
- ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
- unsigned NumTypeInfos = FilterType->getNumElements();
-
- // An empty filter catches everything, so there is no point in keeping any
- // following clauses or marking the landingpad as having a cleanup. By
- // dealing with this case here the following code is made a bit simpler.
- if (!NumTypeInfos) {
- NewClauses.push_back(FilterClause);
- if (!isLastClause)
- MakeNewInstruction = true;
- CleanupFlag = false;
- break;
- }
-
- bool MakeNewFilter = false; // If true, make a new filter.
- SmallVector<Constant *, 16> NewFilterElts; // New elements.
- if (isa<ConstantAggregateZero>(FilterClause)) {
- // Not an empty filter - it contains at least one null typeinfo.
- assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
- Constant *TypeInfo =
- Constant::getNullValue(FilterType->getElementType());
- // If this typeinfo is a catch-all then the filter can never match.
- if (isCatchAll(Personality, TypeInfo)) {
- // Throw the filter away.
- MakeNewInstruction = true;
- continue;
- }
-
- // There is no point in having multiple copies of this typeinfo, so
- // discard all but the first copy if there is more than one.
- NewFilterElts.push_back(TypeInfo);
- if (NumTypeInfos > 1)
- MakeNewFilter = true;
- } else {
- ConstantArray *Filter = cast<ConstantArray>(FilterClause);
- SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
- NewFilterElts.reserve(NumTypeInfos);
-
- // Remove any filter elements that were already caught or that already
- // occurred in the filter. While there, see if any of the elements are
- // catch-alls. If so, the filter can be discarded.
- bool SawCatchAll = false;
- for (unsigned j = 0; j != NumTypeInfos; ++j) {
- Constant *Elt = Filter->getOperand(j);
- Constant *TypeInfo = Elt->stripPointerCasts();
- if (isCatchAll(Personality, TypeInfo)) {
- // This element is a catch-all. Bail out, noting this fact.
- SawCatchAll = true;
- break;
- }
-
- // Even if we've seen a type in a catch clause, we don't want to
- // remove it from the filter. An unexpected type handler may be
- // set up for a call site which throws an exception of the same
- // type caught. In order for the exception thrown by the unexpected
- // handler to propagate correctly, the filter must be correctly
- // described for the call site.
- //
- // Example:
- //
- // void unexpected() { throw 1;}
- // void foo() throw (int) {
- // std::set_unexpected(unexpected);
- // try {
- // throw 2.0;
- // } catch (int i) {}
- // }
-
- // There is no point in having multiple copies of the same typeinfo in
- // a filter, so only add it if we didn't already.
- if (SeenInFilter.insert(TypeInfo).second)
- NewFilterElts.push_back(cast<Constant>(Elt));
- }
- // A filter containing a catch-all cannot match anything by definition.
- if (SawCatchAll) {
- // Throw the filter away.
- MakeNewInstruction = true;
- continue;
- }
-
- // If we dropped something from the filter, make a new one.
- if (NewFilterElts.size() < NumTypeInfos)
- MakeNewFilter = true;
- }
- if (MakeNewFilter) {
- FilterType = ArrayType::get(FilterType->getElementType(),
- NewFilterElts.size());
- FilterClause = ConstantArray::get(FilterType, NewFilterElts);
- MakeNewInstruction = true;
- }
-
- NewClauses.push_back(FilterClause);
-
- // If the new filter is empty then it will catch everything so there is
- // no point in keeping any following clauses or marking the landingpad
- // as having a cleanup. The case of the original filter being empty was
- // already handled above.
- if (MakeNewFilter && !NewFilterElts.size()) {
- assert(MakeNewInstruction && "New filter but not a new instruction!");
- CleanupFlag = false;
- break;
- }
- }
- }
-
- // If several filters occur in a row then reorder them so that the shortest
- // filters come first (those with the smallest number of elements). This is
- // advantageous because shorter filters are more likely to match, speeding up
- // unwinding, but mostly because it increases the effectiveness of the other
- // filter optimizations below.
- for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
- unsigned j;
- // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
- for (j = i; j != e; ++j)
- if (!isa<ArrayType>(NewClauses[j]->getType()))
- break;
-
- // Check whether the filters are already sorted by length. We need to know
- // if sorting them is actually going to do anything so that we only make a
- // new landingpad instruction if it does.
- for (unsigned k = i; k + 1 < j; ++k)
- if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
- // Not sorted, so sort the filters now. Doing an unstable sort would be
- // correct too but reordering filters pointlessly might confuse users.
- std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
- shorter_filter);
- MakeNewInstruction = true;
- break;
- }
-
- // Look for the next batch of filters.
- i = j + 1;
- }
-
- // If typeinfos matched if and only if equal, then the elements of a filter L
- // that occurs later than a filter F could be replaced by the intersection of
- // the elements of F and L. In reality two typeinfos can match without being
- // equal (for example if one represents a C++ class, and the other some class
- // derived from it) so it would be wrong to perform this transform in general.
- // However the transform is correct and useful if F is a subset of L. In that
- // case L can be replaced by F, and thus removed altogether since repeating a
- // filter is pointless. So here we look at all pairs of filters F and L where
- // L follows F in the list of clauses, and remove L if every element of F is
- // an element of L. This can occur when inlining C++ functions with exception
- // specifications.
- for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
- // Examine each filter in turn.
- Value *Filter = NewClauses[i];
- ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
- if (!FTy)
- // Not a filter - skip it.
- continue;
- unsigned FElts = FTy->getNumElements();
- // Examine each filter following this one. Doing this backwards means that
- // we don't have to worry about filters disappearing under us when removed.
- for (unsigned j = NewClauses.size() - 1; j != i; --j) {
- Value *LFilter = NewClauses[j];
- ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
- if (!LTy)
- // Not a filter - skip it.
- continue;
- // If Filter is a subset of LFilter, i.e. every element of Filter is also
- // an element of LFilter, then discard LFilter.
- SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
- // If Filter is empty then it is a subset of LFilter.
- if (!FElts) {
- // Discard LFilter.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- // Move on to the next filter.
- continue;
- }
- unsigned LElts = LTy->getNumElements();
- // If Filter is longer than LFilter then it cannot be a subset of it.
- if (FElts > LElts)
- // Move on to the next filter.
- continue;
- // At this point we know that LFilter has at least one element.
- if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
- // Filter is a subset of LFilter iff Filter contains only zeros (as we
- // already know that Filter is not longer than LFilter).
- if (isa<ConstantAggregateZero>(Filter)) {
- assert(FElts <= LElts && "Should have handled this case earlier!");
- // Discard LFilter.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- }
- // Move on to the next filter.
- continue;
- }
- ConstantArray *LArray = cast<ConstantArray>(LFilter);
- if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
- // Since Filter is non-empty and contains only zeros, it is a subset of
- // LFilter iff LFilter contains a zero.
- assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
- for (unsigned l = 0; l != LElts; ++l)
- if (LArray->getOperand(l)->isNullValue()) {
- // LFilter contains a zero - discard it.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- break;
- }
- // Move on to the next filter.
- continue;
- }
- // At this point we know that both filters are ConstantArrays. Loop over
- // operands to see whether every element of Filter is also an element of
- // LFilter. Since filters tend to be short this is probably faster than
- // using a method that scales nicely.
- ConstantArray *FArray = cast<ConstantArray>(Filter);
- bool AllFound = true;
- for (unsigned f = 0; f != FElts; ++f) {
- Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
- AllFound = false;
- for (unsigned l = 0; l != LElts; ++l) {
- Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
- if (LTypeInfo == FTypeInfo) {
- AllFound = true;
- break;
- }
- }
- if (!AllFound)
- break;
- }
- if (AllFound) {
- // Discard LFilter.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- }
- // Move on to the next filter.
- }
- }
-
- // If we changed any of the clauses, replace the old landingpad instruction
- // with a new one.
- if (MakeNewInstruction) {
- LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
- NewClauses.size());
- for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
- NLI->addClause(NewClauses[i]);
- // A landing pad with no clauses must have the cleanup flag set. It is
- // theoretically possible, though highly unlikely, that we eliminated all
- // clauses. If so, force the cleanup flag to true.
- if (NewClauses.empty())
- CleanupFlag = true;
- NLI->setCleanup(CleanupFlag);
- return NLI;
- }
-
- // Even if none of the clauses changed, we may nonetheless have understood
- // that the cleanup flag is pointless. Clear it if so.
- if (LI.isCleanup() != CleanupFlag) {
- assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
- LI.setCleanup(CleanupFlag);
- return &LI;
- }
-
- return nullptr;
-}
-
+ // The logic here should be correct for any real-world personality function.
+ // However if that turns out not to be true, the offending logic can always
+ // be conditioned on the personality function, like the catch-all logic is.
+ EHPersonality Personality =
+ classifyEHPersonality(LI.getParent()->getParent()->getPersonalityFn());
+
+  // Simplify the list of clauses, e.g. by removing repeated catch clauses
+ // (these are often created by inlining).
+ bool MakeNewInstruction = false; // If true, recreate using the following:
+ SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
+ bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
+
+ SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
+ for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
+ bool isLastClause = i + 1 == e;
+ if (LI.isCatch(i)) {
+ // A catch clause.
+ Constant *CatchClause = LI.getClause(i);
+ Constant *TypeInfo = CatchClause->stripPointerCasts();
+
+ // If we already saw this clause, there is no point in having a second
+ // copy of it.
+ if (AlreadyCaught.insert(TypeInfo).second) {
+ // This catch clause was not already seen.
+ NewClauses.push_back(CatchClause);
+ } else {
+ // Repeated catch clause - drop the redundant copy.
+ MakeNewInstruction = true;
+ }
+
+ // If this is a catch-all then there is no point in keeping any following
+ // clauses or marking the landingpad as having a cleanup.
+ if (isCatchAll(Personality, TypeInfo)) {
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+ } else {
+ // A filter clause. If any of the filter elements were already caught
+ // then they can be dropped from the filter. It is tempting to try to
+ // exploit the filter further by saying that any typeinfo that does not
+ // occur in the filter can't be caught later (and thus can be dropped).
+ // However this would be wrong, since typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some
+ // class derived from it).
+ assert(LI.isFilter(i) && "Unsupported landingpad clause!");
+ Constant *FilterClause = LI.getClause(i);
+ ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
+ unsigned NumTypeInfos = FilterType->getNumElements();
+
+ // An empty filter catches everything, so there is no point in keeping any
+ // following clauses or marking the landingpad as having a cleanup. By
+ // dealing with this case here the following code is made a bit simpler.
+ if (!NumTypeInfos) {
+ NewClauses.push_back(FilterClause);
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+
+ bool MakeNewFilter = false; // If true, make a new filter.
+ SmallVector<Constant *, 16> NewFilterElts; // New elements.
+ if (isa<ConstantAggregateZero>(FilterClause)) {
+ // Not an empty filter - it contains at least one null typeinfo.
+ assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
+ Constant *TypeInfo =
+ Constant::getNullValue(FilterType->getElementType());
+ // If this typeinfo is a catch-all then the filter can never match.
+ if (isCatchAll(Personality, TypeInfo)) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // There is no point in having multiple copies of this typeinfo, so
+ // discard all but the first copy if there is more than one.
+ NewFilterElts.push_back(TypeInfo);
+ if (NumTypeInfos > 1)
+ MakeNewFilter = true;
+ } else {
+ ConstantArray *Filter = cast<ConstantArray>(FilterClause);
+ SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
+ NewFilterElts.reserve(NumTypeInfos);
+
+ // Remove any filter elements that were already caught or that already
+ // occurred in the filter. While there, see if any of the elements are
+ // catch-alls. If so, the filter can be discarded.
+ bool SawCatchAll = false;
+ for (unsigned j = 0; j != NumTypeInfos; ++j) {
+ Constant *Elt = Filter->getOperand(j);
+ Constant *TypeInfo = Elt->stripPointerCasts();
+ if (isCatchAll(Personality, TypeInfo)) {
+ // This element is a catch-all. Bail out, noting this fact.
+ SawCatchAll = true;
+ break;
+ }
+
+ // Even if we've seen a type in a catch clause, we don't want to
+ // remove it from the filter. An unexpected type handler may be
+ // set up for a call site which throws an exception of the same
+ // type caught. In order for the exception thrown by the unexpected
+ // handler to propagate correctly, the filter must be correctly
+ // described for the call site.
+ //
+ // Example:
+ //
+ // void unexpected() { throw 1;}
+ // void foo() throw (int) {
+ // std::set_unexpected(unexpected);
+ // try {
+ // throw 2.0;
+ // } catch (int i) {}
+ // }
+
+ // There is no point in having multiple copies of the same typeinfo in
+ // a filter, so only add it if we didn't already.
+ if (SeenInFilter.insert(TypeInfo).second)
+ NewFilterElts.push_back(cast<Constant>(Elt));
+ }
+ // A filter containing a catch-all cannot match anything by definition.
+ if (SawCatchAll) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // If we dropped something from the filter, make a new one.
+ if (NewFilterElts.size() < NumTypeInfos)
+ MakeNewFilter = true;
+ }
+ if (MakeNewFilter) {
+ FilterType = ArrayType::get(FilterType->getElementType(),
+ NewFilterElts.size());
+ FilterClause = ConstantArray::get(FilterType, NewFilterElts);
+ MakeNewInstruction = true;
+ }
+
+ NewClauses.push_back(FilterClause);
+
+ // If the new filter is empty then it will catch everything so there is
+ // no point in keeping any following clauses or marking the landingpad
+ // as having a cleanup. The case of the original filter being empty was
+ // already handled above.
+ if (MakeNewFilter && !NewFilterElts.size()) {
+ assert(MakeNewInstruction && "New filter but not a new instruction!");
+ CleanupFlag = false;
+ break;
+ }
+ }
+ }
+
+ // If several filters occur in a row then reorder them so that the shortest
+ // filters come first (those with the smallest number of elements). This is
+ // advantageous because shorter filters are more likely to match, speeding up
+ // unwinding, but mostly because it increases the effectiveness of the other
+ // filter optimizations below.
+ for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
+ unsigned j;
+ // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
+ for (j = i; j != e; ++j)
+ if (!isa<ArrayType>(NewClauses[j]->getType()))
+ break;
+
+ // Check whether the filters are already sorted by length. We need to know
+ // if sorting them is actually going to do anything so that we only make a
+ // new landingpad instruction if it does.
+ for (unsigned k = i; k + 1 < j; ++k)
+ if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
+ // Not sorted, so sort the filters now. Doing an unstable sort would be
+ // correct too but reordering filters pointlessly might confuse users.
+ std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
+ shorter_filter);
+ MakeNewInstruction = true;
+ break;
+ }
+
+ // Look for the next batch of filters.
+ i = j + 1;
+ }
+
+ // If typeinfos matched if and only if equal, then the elements of a filter L
+ // that occurs later than a filter F could be replaced by the intersection of
+ // the elements of F and L. In reality two typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some class
+ // derived from it) so it would be wrong to perform this transform in general.
+ // However the transform is correct and useful if F is a subset of L. In that
+ // case L can be replaced by F, and thus removed altogether since repeating a
+ // filter is pointless. So here we look at all pairs of filters F and L where
+ // L follows F in the list of clauses, and remove L if every element of F is
+ // an element of L. This can occur when inlining C++ functions with exception
+ // specifications.
+ for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
+ // Examine each filter in turn.
+ Value *Filter = NewClauses[i];
+ ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
+ if (!FTy)
+ // Not a filter - skip it.
+ continue;
+ unsigned FElts = FTy->getNumElements();
+ // Examine each filter following this one. Doing this backwards means that
+ // we don't have to worry about filters disappearing under us when removed.
+ for (unsigned j = NewClauses.size() - 1; j != i; --j) {
+ Value *LFilter = NewClauses[j];
+ ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
+ if (!LTy)
+ // Not a filter - skip it.
+ continue;
+ // If Filter is a subset of LFilter, i.e. every element of Filter is also
+ // an element of LFilter, then discard LFilter.
+ SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
+ // If Filter is empty then it is a subset of LFilter.
+ if (!FElts) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ // Move on to the next filter.
+ continue;
+ }
+ unsigned LElts = LTy->getNumElements();
+ // If Filter is longer than LFilter then it cannot be a subset of it.
+ if (FElts > LElts)
+ // Move on to the next filter.
+ continue;
+ // At this point we know that LFilter has at least one element.
+ if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
+ // Filter is a subset of LFilter iff Filter contains only zeros (as we
+ // already know that Filter is not longer than LFilter).
+ if (isa<ConstantAggregateZero>(Filter)) {
+ assert(FElts <= LElts && "Should have handled this case earlier!");
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ ConstantArray *LArray = cast<ConstantArray>(LFilter);
+ if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
+ // Since Filter is non-empty and contains only zeros, it is a subset of
+ // LFilter iff LFilter contains a zero.
+ assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
+ for (unsigned l = 0; l != LElts; ++l)
+ if (LArray->getOperand(l)->isNullValue()) {
+ // LFilter contains a zero - discard it.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ break;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ // At this point we know that both filters are ConstantArrays. Loop over
+ // operands to see whether every element of Filter is also an element of
+ // LFilter. Since filters tend to be short this is probably faster than
+ // using a method that scales nicely.
+ ConstantArray *FArray = cast<ConstantArray>(Filter);
+ bool AllFound = true;
+ for (unsigned f = 0; f != FElts; ++f) {
+ Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
+ AllFound = false;
+ for (unsigned l = 0; l != LElts; ++l) {
+ Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
+ if (LTypeInfo == FTypeInfo) {
+ AllFound = true;
+ break;
+ }
+ }
+ if (!AllFound)
+ break;
+ }
+ if (AllFound) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ }
+ }
+
+ // If we changed any of the clauses, replace the old landingpad instruction
+ // with a new one.
+ if (MakeNewInstruction) {
+ LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
+ NewClauses.size());
+ for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
+ NLI->addClause(NewClauses[i]);
+ // A landing pad with no clauses must have the cleanup flag set. It is
+ // theoretically possible, though highly unlikely, that we eliminated all
+ // clauses. If so, force the cleanup flag to true.
+ if (NewClauses.empty())
+ CleanupFlag = true;
+ NLI->setCleanup(CleanupFlag);
+ return NLI;
+ }
+
+ // Even if none of the clauses changed, we may nonetheless have understood
+ // that the cleanup flag is pointless. Clear it if so.
+ if (LI.isCleanup() != CleanupFlag) {
+ assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
+ LI.setCleanup(CleanupFlag);
+ return &LI;
+ }
+
+ return nullptr;
+}
+
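The pairwise filter cleanup in visitLandingPadInst above drops a later filter L whenever an earlier filter F is a subset of it, since landing on F already covers anything L could add. A minimal plain-C++ sketch of that rule (hypothetical types; the real code walks ConstantArray operands and treats the all-zero filters specially):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    using TypeInfoFilter = std::vector<int>; // stand-in for stripped typeinfos

    static bool isSubsetOf(const TypeInfoFilter &F, const TypeInfoFilter &L) {
      if (F.size() > L.size())
        return false; // mirrors the FElts > LElts quick reject
      return std::all_of(F.begin(), F.end(), [&](int TI) {
        return std::find(L.begin(), L.end(), TI) != L.end();
      });
    }

    // Walk later filters backwards (as the pass does) so that erasing an entry
    // does not disturb the indices still to be visited.
    void dropRedundantFilters(std::vector<TypeInfoFilter> &Filters) {
      for (std::size_t I = 0; I + 1 < Filters.size(); ++I)
        for (std::size_t J = Filters.size() - 1; J > I; --J)
          if (isSubsetOf(Filters[I], Filters[J]))
            Filters.erase(Filters.begin() + J);
    }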
Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
- Value *Op0 = I.getOperand(0);
-
- if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
+ Value *Op0 = I.getOperand(0);
+
+ if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
// freeze (phi const, x) --> phi const, (freeze x)
if (auto *PN = dyn_cast<PHINode>(Op0)) {
if (Instruction *NV = foldOpIntoPhi(I, PN))
@@ -3498,237 +3498,237 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
return replaceInstUsesWith(I, BestValue);
}
- return nullptr;
-}
-
-/// Try to move the specified instruction from its current block into the
-/// beginning of DestBlock, which can only happen if it's safe to move the
-/// instruction past all of the instructions between it and the end of its
-/// block.
-static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
- assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
- BasicBlock *SrcBlock = I->getParent();
-
-  // Cannot move control-flow-involving instructions, volatile loads, vaarg, etc.
- if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
- I->isTerminator())
- return false;
-
- // Do not sink static or dynamic alloca instructions. Static allocas must
- // remain in the entry block, and dynamic allocas must not be sunk in between
- // a stacksave / stackrestore pair, which would incorrectly shorten its
- // lifetime.
- if (isa<AllocaInst>(I))
- return false;
-
- // Do not sink into catchswitch blocks.
- if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
- return false;
-
- // Do not sink convergent call instructions.
- if (auto *CI = dyn_cast<CallInst>(I)) {
- if (CI->isConvergent())
- return false;
- }
- // We can only sink load instructions if there is nothing between the load and
- // the end of block that could change the value.
- if (I->mayReadFromMemory()) {
- // We don't want to do any sophisticated alias analysis, so we only check
- // the instructions after I in I's parent block if we try to sink to its
- // successor block.
- if (DestBlock->getUniquePredecessor() != I->getParent())
- return false;
- for (BasicBlock::iterator Scan = I->getIterator(),
- E = I->getParent()->end();
- Scan != E; ++Scan)
- if (Scan->mayWriteToMemory())
- return false;
- }
-
- I->dropDroppableUses([DestBlock](const Use *U) {
- if (auto *I = dyn_cast<Instruction>(U->getUser()))
- return I->getParent() != DestBlock;
- return true;
- });
- /// FIXME: We could remove droppable uses that are not dominated by
- /// the new position.
-
- BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
- I->moveBefore(&*InsertPos);
- ++NumSunkInst;
-
-  // Also sink all related debug uses from the source basic block. Otherwise we
-  // get a debug use before the def. Attempt to salvage debug uses first, to
-  // maximise the range over which variables have a location. If we cannot
-  // salvage a use, mark its location undef: we know it was supposed to receive
-  // a new location here, but that computation has been sunk.
- SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
- findDbgUsers(DbgUsers, I);
-
- // Update the arguments of a dbg.declare instruction, so that it
- // does not point into a sunk instruction.
- auto updateDbgDeclare = [&I](DbgVariableIntrinsic *DII) {
- if (!isa<DbgDeclareInst>(DII))
- return false;
-
- if (isa<CastInst>(I))
- DII->setOperand(
- 0, MetadataAsValue::get(I->getContext(),
- ValueAsMetadata::get(I->getOperand(0))));
- return true;
- };
-
- SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
- for (auto User : DbgUsers) {
- // A dbg.declare instruction should not be cloned, since there can only be
- // one per variable fragment. It should be left in the original place
- // because the sunk instruction is not an alloca (otherwise we could not be
- // here).
- if (User->getParent() != SrcBlock || updateDbgDeclare(User))
- continue;
-
- DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
- LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
- }
-
- // Perform salvaging without the clones, then sink the clones.
- if (!DIIClones.empty()) {
- salvageDebugInfoForDbgValues(*I, DbgUsers);
- for (auto &DIIClone : DIIClones) {
- DIIClone->insertBefore(&*InsertPos);
- LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
- }
- }
-
- return true;
-}
-
+ return nullptr;
+}
+
+/// Try to move the specified instruction from its current block into the
+/// beginning of DestBlock, which can only happen if it's safe to move the
+/// instruction past all of the instructions between it and the end of its
+/// block.
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+ assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
+ BasicBlock *SrcBlock = I->getParent();
+
+  // Cannot move control-flow-involving instructions, volatile loads, vaarg, etc.
+ if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
+ I->isTerminator())
+ return false;
+
+ // Do not sink static or dynamic alloca instructions. Static allocas must
+ // remain in the entry block, and dynamic allocas must not be sunk in between
+ // a stacksave / stackrestore pair, which would incorrectly shorten its
+ // lifetime.
+ if (isa<AllocaInst>(I))
+ return false;
+
+ // Do not sink into catchswitch blocks.
+ if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
+ return false;
+
+ // Do not sink convergent call instructions.
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ if (CI->isConvergent())
+ return false;
+ }
+ // We can only sink load instructions if there is nothing between the load and
+ // the end of block that could change the value.
+ if (I->mayReadFromMemory()) {
+ // We don't want to do any sophisticated alias analysis, so we only check
+ // the instructions after I in I's parent block if we try to sink to its
+ // successor block.
+ if (DestBlock->getUniquePredecessor() != I->getParent())
+ return false;
+ for (BasicBlock::iterator Scan = I->getIterator(),
+ E = I->getParent()->end();
+ Scan != E; ++Scan)
+ if (Scan->mayWriteToMemory())
+ return false;
+ }
+
+ I->dropDroppableUses([DestBlock](const Use *U) {
+ if (auto *I = dyn_cast<Instruction>(U->getUser()))
+ return I->getParent() != DestBlock;
+ return true;
+ });
+ /// FIXME: We could remove droppable uses that are not dominated by
+ /// the new position.
+
+ BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
+ I->moveBefore(&*InsertPos);
+ ++NumSunkInst;
+
+  // Also sink all related debug uses from the source basic block. Otherwise we
+  // get a debug use before the def. Attempt to salvage debug uses first, to
+  // maximise the range over which variables have a location. If we cannot
+  // salvage a use, mark its location undef: we know it was supposed to receive
+  // a new location here, but that computation has been sunk.
+ SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
+ findDbgUsers(DbgUsers, I);
+
+ // Update the arguments of a dbg.declare instruction, so that it
+ // does not point into a sunk instruction.
+ auto updateDbgDeclare = [&I](DbgVariableIntrinsic *DII) {
+ if (!isa<DbgDeclareInst>(DII))
+ return false;
+
+ if (isa<CastInst>(I))
+ DII->setOperand(
+ 0, MetadataAsValue::get(I->getContext(),
+ ValueAsMetadata::get(I->getOperand(0))));
+ return true;
+ };
+
+ SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
+ for (auto User : DbgUsers) {
+ // A dbg.declare instruction should not be cloned, since there can only be
+ // one per variable fragment. It should be left in the original place
+ // because the sunk instruction is not an alloca (otherwise we could not be
+ // here).
+ if (User->getParent() != SrcBlock || updateDbgDeclare(User))
+ continue;
+
+ DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
+ LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
+ }
+
+ // Perform salvaging without the clones, then sink the clones.
+ if (!DIIClones.empty()) {
+ salvageDebugInfoForDbgValues(*I, DbgUsers);
+ for (auto &DIIClone : DIIClones) {
+ DIIClone->insertBefore(&*InsertPos);
+ LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
+ }
+ }
+
+ return true;
+}
+
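The load case in TryToSinkInstruction above only sinks when the destination's unique predecessor is the load's own block and nothing after the load in that block may write memory. A minimal sketch of that test over a toy instruction model (all names hypothetical, not LLVM's API):

    #include <cstddef>
    #include <vector>

    struct MiniInst {
      bool MayWriteToMemory = false; // toy stand-in for mayWriteToMemory()
    };

    // Returns true if a load at Block[LoadIdx] may be sunk into a successor
    // whose unique predecessor is this block: no later instruction in the block
    // may write memory, so the loaded value cannot change before the branch.
    bool canSinkLoad(const std::vector<MiniInst> &Block, std::size_t LoadIdx,
                     bool DestUniquePredIsThisBlock) {
      if (!DestUniquePredIsThisBlock)
        return false;
      for (std::size_t I = LoadIdx; I < Block.size(); ++I)
        if (Block[I].MayWriteToMemory)
          return false;
      return true;
    }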
bool InstCombinerImpl::run() {
- while (!Worklist.isEmpty()) {
- // Walk deferred instructions in reverse order, and push them to the
- // worklist, which means they'll end up popped from the worklist in-order.
- while (Instruction *I = Worklist.popDeferred()) {
- // Check to see if we can DCE the instruction. We do this already here to
- // reduce the number of uses and thus allow other folds to trigger.
- // Note that eraseInstFromFunction() may push additional instructions on
- // the deferred worklist, so this will DCE whole instruction chains.
- if (isInstructionTriviallyDead(I, &TLI)) {
- eraseInstFromFunction(*I);
- ++NumDeadInst;
- continue;
- }
-
- Worklist.push(I);
- }
-
- Instruction *I = Worklist.removeOne();
- if (I == nullptr) continue; // skip null values.
-
- // Check to see if we can DCE the instruction.
- if (isInstructionTriviallyDead(I, &TLI)) {
- eraseInstFromFunction(*I);
- ++NumDeadInst;
- continue;
- }
-
- if (!DebugCounter::shouldExecute(VisitCounter))
- continue;
-
- // Instruction isn't dead, see if we can constant propagate it.
- if (!I->use_empty() &&
- (I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
- if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
- << '\n');
-
- // Add operands to the worklist.
- replaceInstUsesWith(*I, C);
- ++NumConstProp;
- if (isInstructionTriviallyDead(I, &TLI))
- eraseInstFromFunction(*I);
- MadeIRChange = true;
- continue;
- }
- }
-
- // See if we can trivially sink this instruction to its user if we can
- // prove that the successor is not executed more frequently than our block.
- if (EnableCodeSinking)
- if (Use *SingleUse = I->getSingleUndroppableUse()) {
- BasicBlock *BB = I->getParent();
- Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
- BasicBlock *UserParent;
-
- // Get the block the use occurs in.
- if (PHINode *PN = dyn_cast<PHINode>(UserInst))
- UserParent = PN->getIncomingBlock(*SingleUse);
- else
- UserParent = UserInst->getParent();
-
+ while (!Worklist.isEmpty()) {
+ // Walk deferred instructions in reverse order, and push them to the
+ // worklist, which means they'll end up popped from the worklist in-order.
+ while (Instruction *I = Worklist.popDeferred()) {
+ // Check to see if we can DCE the instruction. We do this already here to
+ // reduce the number of uses and thus allow other folds to trigger.
+ // Note that eraseInstFromFunction() may push additional instructions on
+ // the deferred worklist, so this will DCE whole instruction chains.
+ if (isInstructionTriviallyDead(I, &TLI)) {
+ eraseInstFromFunction(*I);
+ ++NumDeadInst;
+ continue;
+ }
+
+ Worklist.push(I);
+ }
+
+ Instruction *I = Worklist.removeOne();
+ if (I == nullptr) continue; // skip null values.
+
+ // Check to see if we can DCE the instruction.
+ if (isInstructionTriviallyDead(I, &TLI)) {
+ eraseInstFromFunction(*I);
+ ++NumDeadInst;
+ continue;
+ }
+
+ if (!DebugCounter::shouldExecute(VisitCounter))
+ continue;
+
+ // Instruction isn't dead, see if we can constant propagate it.
+ if (!I->use_empty() &&
+ (I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
+ if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
+ << '\n');
+
+ // Add operands to the worklist.
+ replaceInstUsesWith(*I, C);
+ ++NumConstProp;
+ if (isInstructionTriviallyDead(I, &TLI))
+ eraseInstFromFunction(*I);
+ MadeIRChange = true;
+ continue;
+ }
+ }
+
+ // See if we can trivially sink this instruction to its user if we can
+ // prove that the successor is not executed more frequently than our block.
+ if (EnableCodeSinking)
+ if (Use *SingleUse = I->getSingleUndroppableUse()) {
+ BasicBlock *BB = I->getParent();
+ Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
+ BasicBlock *UserParent;
+
+ // Get the block the use occurs in.
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+ UserParent = PN->getIncomingBlock(*SingleUse);
+ else
+ UserParent = UserInst->getParent();
+
// Try sinking to another block. If that block is unreachable, then do
// not bother. SimplifyCFG should handle it.
if (UserParent != BB && DT.isReachableFromEntry(UserParent)) {
- // See if the user is one of our successors that has only one
- // predecessor, so that we don't have to split the critical edge.
- bool ShouldSink = UserParent->getUniquePredecessor() == BB;
- // Another option where we can sink is a block that ends with a
-          // terminator that does not pass control to another block (such as
- // return or unreachable). In this case:
- // - I dominates the User (by SSA form);
- // - the User will be executed at most once.
- // So sinking I down to User is always profitable or neutral.
- if (!ShouldSink) {
- auto *Term = UserParent->getTerminator();
- ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
- }
- if (ShouldSink) {
- assert(DT.dominates(BB, UserParent) &&
- "Dominance relation broken?");
- // Okay, the CFG is simple enough, try to sink this instruction.
- if (TryToSinkInstruction(I, UserParent)) {
- LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
- MadeIRChange = true;
-              // We'll add uses of the sunk instruction below, but since sinking
-              // can expose opportunities for its *operands*, add them to the
-              // worklist.
- for (Use &U : I->operands())
- if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
- Worklist.push(OpI);
- }
- }
- }
- }
-
- // Now that we have an instruction, try combining it to simplify it.
- Builder.SetInsertPoint(I);
+ // See if the user is one of our successors that has only one
+ // predecessor, so that we don't have to split the critical edge.
+ bool ShouldSink = UserParent->getUniquePredecessor() == BB;
+ // Another option where we can sink is a block that ends with a
+          // terminator that does not pass control to another block (such as
+ // return or unreachable). In this case:
+ // - I dominates the User (by SSA form);
+ // - the User will be executed at most once.
+ // So sinking I down to User is always profitable or neutral.
+ if (!ShouldSink) {
+ auto *Term = UserParent->getTerminator();
+ ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
+ }
+ if (ShouldSink) {
+ assert(DT.dominates(BB, UserParent) &&
+ "Dominance relation broken?");
+ // Okay, the CFG is simple enough, try to sink this instruction.
+ if (TryToSinkInstruction(I, UserParent)) {
+ LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
+ MadeIRChange = true;
+ // We'll add uses of the sunk instruction below, but since sinking
+              // We'll add uses of the sunk instruction below, but since sinking
+              // can expose opportunities for its *operands*, add them to the
+              // worklist.
+ if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+ Worklist.push(OpI);
+ }
+ }
+ }
+ }
+
+ // Now that we have an instruction, try combining it to simplify it.
+ Builder.SetInsertPoint(I);
Builder.CollectMetadataToCopy(
I, {LLVMContext::MD_dbg, LLVMContext::MD_annotation});
-
-#ifndef NDEBUG
- std::string OrigI;
-#endif
- LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
- LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
-
- if (Instruction *Result = visit(*I)) {
- ++NumCombined;
- // Should we replace the old instruction with a new one?
- if (Result != I) {
- LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
- << " New = " << *Result << '\n');
-
+
+#ifndef NDEBUG
+ std::string OrigI;
+#endif
+ LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
+ LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
+
+ if (Instruction *Result = visit(*I)) {
+ ++NumCombined;
+ // Should we replace the old instruction with a new one?
+ if (Result != I) {
+ LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
+ << " New = " << *Result << '\n');
+
Result->copyMetadata(*I,
{LLVMContext::MD_dbg, LLVMContext::MD_annotation});
- // Everything uses the new instruction now.
- I->replaceAllUsesWith(Result);
-
- // Move the name to the new instruction first.
- Result->takeName(I);
-
- // Insert the new instruction into the basic block...
- BasicBlock *InstParent = I->getParent();
- BasicBlock::iterator InsertPos = I->getIterator();
-
+ // Everything uses the new instruction now.
+ I->replaceAllUsesWith(Result);
+
+ // Move the name to the new instruction first.
+ Result->takeName(I);
+
+ // Insert the new instruction into the basic block...
+ BasicBlock *InstParent = I->getParent();
+ BasicBlock::iterator InsertPos = I->getIterator();
+
         // Are we replacing a PHI with something that isn't a PHI, or vice versa?
if (isa<PHINode>(Result) != isa<PHINode>(I)) {
// We need to fix up the insertion point.
@@ -3737,35 +3737,35 @@ bool InstCombinerImpl::run() {
else // Non-PHI -> PHI
InsertPos = InstParent->getFirstNonPHI()->getIterator();
}
-
- InstParent->getInstList().insert(InsertPos, Result);
-
- // Push the new instruction and any users onto the worklist.
- Worklist.pushUsersToWorkList(*Result);
- Worklist.push(Result);
-
- eraseInstFromFunction(*I);
- } else {
- LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
- << " New = " << *I << '\n');
-
- // If the instruction was modified, it's possible that it is now dead.
- // if so, remove it.
-        // If so, remove it.
- eraseInstFromFunction(*I);
- } else {
- Worklist.pushUsersToWorkList(*I);
- Worklist.push(I);
- }
- }
- MadeIRChange = true;
- }
- }
-
- Worklist.zap();
- return MadeIRChange;
-}
-
+
+ InstParent->getInstList().insert(InsertPos, Result);
+
+ // Push the new instruction and any users onto the worklist.
+ Worklist.pushUsersToWorkList(*Result);
+ Worklist.push(Result);
+
+ eraseInstFromFunction(*I);
+ } else {
+ LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
+ << " New = " << *I << '\n');
+
+ // If the instruction was modified, it's possible that it is now dead.
+ // if so, remove it.
+        // If so, remove it.
+ eraseInstFromFunction(*I);
+ } else {
+ Worklist.pushUsersToWorkList(*I);
+ Worklist.push(I);
+ }
+ }
+ MadeIRChange = true;
+ }
+ }
+
+ Worklist.zap();
+ return MadeIRChange;
+}
+
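Stripped of LLVM types, run() above follows a standard worklist discipline: pop an item, drop it if dead, try to fold it to a constant, otherwise visit it, pushing any rewrite and its users back for another look. A generic sketch of that loop (hypothetical Item type and callbacks, not the real driver):

    #include <functional>
    #include <vector>

    // Generic worklist loop: dead items are dropped, foldable items are
    // rewritten, and Visit may push replacements and their users back onto the
    // worklist until a fixpoint is reached.
    template <typename Item>
    void runToFixpoint(std::vector<Item> &Worklist,
                       const std::function<bool(Item &)> &IsTriviallyDead,
                       const std::function<bool(Item &)> &TryConstantFold,
                       const std::function<void(Item &, std::vector<Item> &)> &Visit) {
      while (!Worklist.empty()) {
        Item I = Worklist.back();
        Worklist.pop_back();
        if (IsTriviallyDead(I))
          continue; // the real pass also erases the instruction here
        if (TryConstantFold(I))
          continue; // users of the folded value go back on the worklist
        Visit(I, Worklist);
      }
    }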
// Track the scopes used by !alias.scope and !noalias. In a function, a
// @llvm.experimental.noalias.scope.decl is only useful if that scope is used
// by both sets. If not, the declaration of the scope can be safely omitted.
@@ -3815,321 +3815,321 @@ public:
}
};
-/// Populate the IC worklist from a function, by walking it in depth-first
-/// order and adding all reachable code to the worklist.
-///
-/// This has a couple of tricks to make the code faster and more powerful. In
-/// particular, we constant fold and DCE instructions as we go, to avoid adding
-/// them to the worklist (this significantly speeds up instcombine on code where
-/// many instructions are dead or constant). Additionally, if we find a branch
-/// whose condition is a known constant, we only visit the reachable successors.
-static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- InstCombineWorklist &ICWorklist) {
- bool MadeIRChange = false;
- SmallPtrSet<BasicBlock *, 32> Visited;
- SmallVector<BasicBlock*, 256> Worklist;
- Worklist.push_back(&F.front());
-
- SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
- DenseMap<Constant *, Constant *> FoldedConstants;
+/// Populate the IC worklist from a function, by walking it in depth-first
+/// order and adding all reachable code to the worklist.
+///
+/// This has a couple of tricks to make the code faster and more powerful. In
+/// particular, we constant fold and DCE instructions as we go, to avoid adding
+/// them to the worklist (this significantly speeds up instcombine on code where
+/// many instructions are dead or constant). Additionally, if we find a branch
+/// whose condition is a known constant, we only visit the reachable successors.
+static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ InstCombineWorklist &ICWorklist) {
+ bool MadeIRChange = false;
+ SmallPtrSet<BasicBlock *, 32> Visited;
+ SmallVector<BasicBlock*, 256> Worklist;
+ Worklist.push_back(&F.front());
+
+ SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
+ DenseMap<Constant *, Constant *> FoldedConstants;
AliasScopeTracker SeenAliasScopes;
-
- do {
- BasicBlock *BB = Worklist.pop_back_val();
-
- // We have now visited this block! If we've already been here, ignore it.
- if (!Visited.insert(BB).second)
- continue;
-
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
- Instruction *Inst = &*BBI++;
-
- // ConstantProp instruction if trivially constant.
- if (!Inst->use_empty() &&
- (Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
- if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
- << '\n');
- Inst->replaceAllUsesWith(C);
- ++NumConstProp;
- if (isInstructionTriviallyDead(Inst, TLI))
- Inst->eraseFromParent();
- MadeIRChange = true;
- continue;
- }
-
- // See if we can constant fold its operands.
- for (Use &U : Inst->operands()) {
- if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
- continue;
-
- auto *C = cast<Constant>(U);
- Constant *&FoldRes = FoldedConstants[C];
- if (!FoldRes)
- FoldRes = ConstantFoldConstant(C, DL, TLI);
-
- if (FoldRes != C) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
- << "\n Old = " << *C
- << "\n New = " << *FoldRes << '\n');
- U = FoldRes;
- MadeIRChange = true;
- }
- }
-
+
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+
+ // We have now visited this block! If we've already been here, ignore it.
+ if (!Visited.insert(BB).second)
+ continue;
+
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+ Instruction *Inst = &*BBI++;
+
+ // ConstantProp instruction if trivially constant.
+ if (!Inst->use_empty() &&
+ (Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
+ if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
+ << '\n');
+ Inst->replaceAllUsesWith(C);
+ ++NumConstProp;
+ if (isInstructionTriviallyDead(Inst, TLI))
+ Inst->eraseFromParent();
+ MadeIRChange = true;
+ continue;
+ }
+
+ // See if we can constant fold its operands.
+ for (Use &U : Inst->operands()) {
+ if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
+ continue;
+
+ auto *C = cast<Constant>(U);
+ Constant *&FoldRes = FoldedConstants[C];
+ if (!FoldRes)
+ FoldRes = ConstantFoldConstant(C, DL, TLI);
+
+ if (FoldRes != C) {
+ LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ << "\n Old = " << *C
+ << "\n New = " << *FoldRes << '\n');
+ U = FoldRes;
+ MadeIRChange = true;
+ }
+ }
+
// Skip processing debug and pseudo intrinsics in InstCombine. Processing
// these call instructions consumes non-trivial amount of time and
// provides no value for the optimization.
if (!Inst->isDebugOrPseudoInst()) {
- InstrsForInstCombineWorklist.push_back(Inst);
+ InstrsForInstCombineWorklist.push_back(Inst);
SeenAliasScopes.analyse(Inst);
}
- }
-
- // Recursively visit successors. If this is a branch or switch on a
- // constant, only visit the reachable successor.
- Instruction *TI = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
- bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
- BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
- Worklist.push_back(ReachableBB);
- continue;
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
- Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
- continue;
- }
- }
-
+ }
+
+ // Recursively visit successors. If this is a branch or switch on a
+ // constant, only visit the reachable successor.
+ Instruction *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
+ bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
+ BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
+ Worklist.push_back(ReachableBB);
+ continue;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
+ Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
+ continue;
+ }
+ }
+
append_range(Worklist, successors(TI));
- } while (!Worklist.empty());
-
- // Remove instructions inside unreachable blocks. This prevents the
- // instcombine code from having to deal with some bad special cases, and
- // reduces use counts of instructions.
- for (BasicBlock &BB : F) {
- if (Visited.count(&BB))
- continue;
-
+ } while (!Worklist.empty());
+
+ // Remove instructions inside unreachable blocks. This prevents the
+ // instcombine code from having to deal with some bad special cases, and
+ // reduces use counts of instructions.
+ for (BasicBlock &BB : F) {
+ if (Visited.count(&BB))
+ continue;
+
unsigned NumDeadInstInBB;
unsigned NumDeadDbgInstInBB;
std::tie(NumDeadInstInBB, NumDeadDbgInstInBB) =
removeAllNonTerminatorAndEHPadInstructions(&BB);
MadeIRChange |= NumDeadInstInBB + NumDeadDbgInstInBB > 0;
- NumDeadInst += NumDeadInstInBB;
- }
-
- // Once we've found all of the instructions to add to instcombine's worklist,
- // add them in reverse order. This way instcombine will visit from the top
- // of the function down. This jives well with the way that it adds all uses
- // of instructions to the worklist after doing a transformation, thus avoiding
- // some N^2 behavior in pathological cases.
- ICWorklist.reserve(InstrsForInstCombineWorklist.size());
- for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
- // DCE instruction if trivially dead. As we iterate in reverse program
- // order here, we will clean up whole chains of dead instructions.
+ NumDeadInst += NumDeadInstInBB;
+ }
+
+ // Once we've found all of the instructions to add to instcombine's worklist,
+ // add them in reverse order. This way instcombine will visit from the top
+ // of the function down. This jives well with the way that it adds all uses
+ // of instructions to the worklist after doing a transformation, thus avoiding
+ // some N^2 behavior in pathological cases.
+ ICWorklist.reserve(InstrsForInstCombineWorklist.size());
+ for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
+ // DCE instruction if trivially dead. As we iterate in reverse program
+ // order here, we will clean up whole chains of dead instructions.
if (isInstructionTriviallyDead(Inst, TLI) ||
SeenAliasScopes.isNoAliasScopeDeclDead(Inst)) {
- ++NumDeadInst;
- LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
- salvageDebugInfo(*Inst);
- Inst->eraseFromParent();
- MadeIRChange = true;
- continue;
- }
-
- ICWorklist.push(Inst);
- }
-
- return MadeIRChange;
-}
-
-static bool combineInstructionsOverFunction(
- Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
+ ++NumDeadInst;
+ LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
+ salvageDebugInfo(*Inst);
+ Inst->eraseFromParent();
+ MadeIRChange = true;
+ continue;
+ }
+
+ ICWorklist.push(Inst);
+ }
+
+ return MadeIRChange;
+}
+
+static bool combineInstructionsOverFunction(
+ Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
- auto &DL = F.getParent()->getDataLayout();
- MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
-
- /// Builder - This is an IRBuilder that automatically inserts new
- /// instructions into the worklist when they are created.
- IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
- F.getContext(), TargetFolder(DL),
- IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
- Worklist.add(I);
- if (match(I, m_Intrinsic<Intrinsic::assume>()))
- AC.registerAssumption(cast<CallInst>(I));
- }));
-
- // Lower dbg.declare intrinsics, otherwise their value may be clobbered
- // by instcombiner.
- bool MadeIRChange = false;
- if (ShouldLowerDbgDeclare)
- MadeIRChange = LowerDbgDeclare(F);
-
- // Iterate while there is work to do.
- unsigned Iteration = 0;
- while (true) {
+ ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
+ auto &DL = F.getParent()->getDataLayout();
+ MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
+
+ /// Builder - This is an IRBuilder that automatically inserts new
+ /// instructions into the worklist when they are created.
+ IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
+ F.getContext(), TargetFolder(DL),
+ IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
+ Worklist.add(I);
+ if (match(I, m_Intrinsic<Intrinsic::assume>()))
+ AC.registerAssumption(cast<CallInst>(I));
+ }));
+
+ // Lower dbg.declare intrinsics, otherwise their value may be clobbered
+ // by instcombiner.
+ bool MadeIRChange = false;
+ if (ShouldLowerDbgDeclare)
+ MadeIRChange = LowerDbgDeclare(F);
+
+ // Iterate while there is work to do.
+ unsigned Iteration = 0;
+ while (true) {
++NumWorklistIterations;
- ++Iteration;
-
- if (Iteration > InfiniteLoopDetectionThreshold) {
- report_fatal_error(
- "Instruction Combining seems stuck in an infinite loop after " +
- Twine(InfiniteLoopDetectionThreshold) + " iterations.");
- }
-
- if (Iteration > MaxIterations) {
- LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations
- << " on " << F.getName()
- << " reached; stopping before reaching a fixpoint\n");
- break;
- }
-
- LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
- << F.getName() << "\n");
-
- MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
-
+ ++Iteration;
+
+ if (Iteration > InfiniteLoopDetectionThreshold) {
+ report_fatal_error(
+ "Instruction Combining seems stuck in an infinite loop after " +
+ Twine(InfiniteLoopDetectionThreshold) + " iterations.");
+ }
+
+ if (Iteration > MaxIterations) {
+ LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations
+ << " on " << F.getName()
+ << " reached; stopping before reaching a fixpoint\n");
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+ << F.getName() << "\n");
+
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+
InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
ORE, BFI, PSI, DL, LI);
- IC.MaxArraySizeForCombine = MaxArraySize;
-
- if (!IC.run())
- break;
-
- MadeIRChange = true;
- }
-
- return MadeIRChange;
-}
-
-InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {}
-
-InstCombinePass::InstCombinePass(unsigned MaxIterations)
- : MaxIterations(MaxIterations) {}
-
-PreservedAnalyses InstCombinePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ IC.MaxArraySizeForCombine = MaxArraySize;
+
+ if (!IC.run())
+ break;
+
+ MadeIRChange = true;
+ }
+
+ return MadeIRChange;
+}
+
+InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {}
+
+InstCombinePass::InstCombinePass(unsigned MaxIterations)
+ : MaxIterations(MaxIterations) {}
+
+PreservedAnalyses InstCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
-
- auto *LI = AM.getCachedResult<LoopAnalysis>(F);
-
- auto *AA = &AM.getResult<AAManager>(F);
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
-
+
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
+
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, MaxIterations, LI))
- // No changes, all analyses are preserved.
- return PreservedAnalyses::all();
-
- // Mark all the analyses that instcombine updates as preserved.
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
+ // No changes, all analyses are preserved.
+ return PreservedAnalyses::all();
+
+ // Mark all the analyses that instcombine updates as preserved.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
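As a rough illustration of how the InstCombinePass::run entry point above gets invoked, the following standalone sketch drives the pass through the new pass manager. The analysis-registration boilerplate and header paths are assumptions for LLVM 12 and are not part of this diff.

  #include "llvm/Analysis/CGSCCPassManager.h"
  #include "llvm/Analysis/LoopAnalysisManager.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/InstCombine/InstCombine.h"

  // Sketch: run instcombine on a single function with the new pass manager.
  void runInstCombineOnce(llvm::Function &F) {
    llvm::PassBuilder PB;
    llvm::LoopAnalysisManager LAM;
    llvm::FunctionAnalysisManager FAM;
    llvm::CGSCCAnalysisManager CGAM;
    llvm::ModuleAnalysisManager MAM;
    // InstCombinePass::run pulls AssumptionAnalysis, DominatorTreeAnalysis,
    // TargetLibraryAnalysis, TargetIRAnalysis, AAManager and the ORE analysis
    // out of the FunctionAnalysisManager, so all of them must be registered.
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    llvm::FunctionPassManager FPM;
    FPM.addPass(llvm::InstCombinePass());     // default LimitMaxIterations cap
    // FPM.addPass(llvm::InstCombinePass(2)); // or cap the fixpoint loop explicitly
    FPM.run(F, FAM);
  }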
+void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
-}
-
-bool InstructionCombiningPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- // Required analyses.
- auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+}
+
+bool InstructionCombiningPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Required analyses.
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- // Optional analyses.
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
- auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- BlockFrequencyInfo *BFI =
- (PSI && PSI->hasProfileSummary()) ?
- &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
- nullptr;
-
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ // Optional analyses.
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ BlockFrequencyInfo *BFI =
+ (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, MaxIterations, LI);
-}
-
-char InstructionCombiningPass::ID = 0;
-
-InstructionCombiningPass::InstructionCombiningPass()
- : FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) {
- initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
-}
-
-InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations)
- : FunctionPass(ID), MaxIterations(MaxIterations) {
- initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
-}
-
-INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
- "Combine redundant instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+}
+
+char InstructionCombiningPass::ID = 0;
+
+InstructionCombiningPass::InstructionCombiningPass()
+ : FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) {
+ initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
+}
+
+InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations)
+ : FunctionPass(ID), MaxIterations(MaxIterations) {
+ initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
+ "Combine redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
- "Combine redundant instructions", false, false)
-
-// Initialization Routines
-void llvm::initializeInstCombine(PassRegistry &Registry) {
- initializeInstructionCombiningPassPass(Registry);
-}
-
-void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
- initializeInstructionCombiningPassPass(*unwrap(R));
-}
-
-FunctionPass *llvm::createInstructionCombiningPass() {
- return new InstructionCombiningPass();
-}
-
-FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) {
- return new InstructionCombiningPass(MaxIterations);
-}
-
-void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createInstructionCombiningPass());
-}
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
+ "Combine redundant instructions", false, false)
+
+// Initialization Routines
+void llvm::initializeInstCombine(PassRegistry &Registry) {
+ initializeInstructionCombiningPassPass(Registry);
+}
+
+void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
+ initializeInstructionCombiningPassPass(*unwrap(R));
+}
+
+FunctionPass *llvm::createInstructionCombiningPass() {
+ return new InstructionCombiningPass();
+}
+
+FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) {
+ return new InstructionCombiningPass(MaxIterations);
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstructionCombiningPass());
+}
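The legacy-pass-manager and C-API entry points restored in this hunk can be exercised with a similarly small driver. This is only a sketch; the include paths and setup are assumptions rather than something this diff establishes.

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/InstCombine/InstCombine.h"
  #include "llvm-c/Core.h"
  #include "llvm-c/Transforms/InstCombine.h"

  // C++ side: createInstructionCombiningPass() feeds the legacy PassManager.
  void runLegacyInstCombine(llvm::Module &M) {
    llvm::legacy::PassManager PM;
    PM.add(llvm::createInstructionCombiningPass());
    PM.run(M);
  }

  // C side: LLVMAddInstructionCombiningPass() is the binding defined above.
  void runLegacyInstCombineC(LLVMModuleRef M) {
    LLVMPassManagerRef PM = LLVMCreatePassManager();
    LLVMAddInstructionCombiningPass(PM);
    LLVMRunPassManager(PM, M);
    LLVMDisposePassManager(PM);
  }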
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make b/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make
index 69d2077a71..3f74e68d16 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make
@@ -1,49 +1,49 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/InstCombine
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- InstCombineAddSub.cpp
- InstCombineAndOrXor.cpp
- InstCombineAtomicRMW.cpp
- InstCombineCalls.cpp
- InstCombineCasts.cpp
- InstCombineCompares.cpp
- InstCombineLoadStoreAlloca.cpp
- InstCombineMulDivRem.cpp
- InstCombineNegator.cpp
- InstCombinePHI.cpp
- InstCombineSelect.cpp
- InstCombineShifts.cpp
- InstCombineSimplifyDemanded.cpp
- InstCombineVectorOps.cpp
- InstructionCombining.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ InstCombineAddSub.cpp
+ InstCombineAndOrXor.cpp
+ InstCombineAtomicRMW.cpp
+ InstCombineCalls.cpp
+ InstCombineCasts.cpp
+ InstCombineCompares.cpp
+ InstCombineLoadStoreAlloca.cpp
+ InstCombineMulDivRem.cpp
+ InstCombineNegator.cpp
+ InstCombinePHI.cpp
+ InstCombineSelect.cpp
+ InstCombineShifts.cpp
+ InstCombineSimplifyDemanded.cpp
+ InstCombineVectorOps.cpp
+ InstructionCombining.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 7212096f1b..f4e471706d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1,140 +1,140 @@
-//===- AddressSanitizer.cpp - memory error detector -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of AddressSanitizer, an address sanity checker.
-// Details of the algorithm:
-// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
-//
-// FIXME: This sanitizer does not yet handle scalable vectors
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/BinaryFormat/MachO.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
-#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iomanip>
-#include <limits>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <tuple>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asan"
-
-static const uint64_t kDefaultShadowScale = 3;
-static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
-static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
-static const uint64_t kDynamicShadowSentinel =
- std::numeric_limits<uint64_t>::max();
-static const uint64_t kSmallX86_64ShadowOffsetBase = 0x7FFFFFFF; // < 2G.
-static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL;
-static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000;
-static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 44;
-static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52;
-static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
-static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
-static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
+//===- AddressSanitizer.cpp - memory error detector -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+// Details of the algorithm:
+// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
+//
+// FIXME: This sanitizer does not yet handle scalable vectors
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iomanip>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asan"
+
+static const uint64_t kDefaultShadowScale = 3;
+static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
+static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
+static const uint64_t kDynamicShadowSentinel =
+ std::numeric_limits<uint64_t>::max();
+static const uint64_t kSmallX86_64ShadowOffsetBase = 0x7FFFFFFF; // < 2G.
+static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL;
+static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000;
+static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 44;
+static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52;
+static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
+static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
+static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kRISCV64_ShadowOffset64 = 0x20000000;
-static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
-static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
-static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
-static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
-static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000;
-static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
-static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
-static const uint64_t kEmscriptenShadowOffset = 0;
-
-static const uint64_t kMyriadShadowScale = 5;
-static const uint64_t kMyriadMemoryOffset32 = 0x80000000ULL;
-static const uint64_t kMyriadMemorySize32 = 0x20000000ULL;
-static const uint64_t kMyriadTagShift = 29;
-static const uint64_t kMyriadDDRTag = 4;
-static const uint64_t kMyriadCacheBitMask32 = 0x40000000ULL;
-
-// The shadow memory space is dynamically allocated.
-static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
-
-static const size_t kMinStackMallocSize = 1 << 6; // 64B
-static const size_t kMaxStackMallocSize = 1 << 16; // 64K
-static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
-static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
-
+static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
+static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
+static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000;
+static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
+static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
+static const uint64_t kEmscriptenShadowOffset = 0;
+
+static const uint64_t kMyriadShadowScale = 5;
+static const uint64_t kMyriadMemoryOffset32 = 0x80000000ULL;
+static const uint64_t kMyriadMemorySize32 = 0x20000000ULL;
+static const uint64_t kMyriadTagShift = 29;
+static const uint64_t kMyriadDDRTag = 4;
+static const uint64_t kMyriadCacheBitMask32 = 0x40000000ULL;
+
+// The shadow memory space is dynamically allocated.
+static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
+
+static const size_t kMinStackMallocSize = 1 << 6; // 64B
+static const size_t kMaxStackMallocSize = 1 << 16; // 64K
+static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
+static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
+
const char kAsanModuleCtorName[] = "asan.module_ctor";
const char kAsanModuleDtorName[] = "asan.module_dtor";
-static const uint64_t kAsanCtorAndDtorPriority = 1;
-// On Emscripten, the system needs more than one priority for constructors.
-static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50;
+static const uint64_t kAsanCtorAndDtorPriority = 1;
+// On Emscripten, the system needs more than one priority for constructors.
+static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50;
const char kAsanReportErrorTemplate[] = "__asan_report_";
const char kAsanRegisterGlobalsName[] = "__asan_register_globals";
const char kAsanUnregisterGlobalsName[] = "__asan_unregister_globals";
@@ -150,7 +150,7 @@ const char kAsanVersionCheckNamePrefix[] = "__asan_version_mismatch_check_v";
const char kAsanPtrCmp[] = "__sanitizer_ptr_cmp";
const char kAsanPtrSub[] = "__sanitizer_ptr_sub";
const char kAsanHandleNoReturnName[] = "__asan_handle_no_return";
-static const int kMaxAsanStackMallocSizeClass = 10;
+static const int kMaxAsanStackMallocSizeClass = 10;
const char kAsanStackMallocNameTemplate[] = "__asan_stack_malloc_";
const char kAsanStackFreeNameTemplate[] = "__asan_stack_free_";
const char kAsanGenPrefix[] = "___asan_gen_";
@@ -159,808 +159,808 @@ const char kSanCovGenPrefix[] = "__sancov_gen_";
const char kAsanSetShadowPrefix[] = "__asan_set_shadow_";
const char kAsanPoisonStackMemoryName[] = "__asan_poison_stack_memory";
const char kAsanUnpoisonStackMemoryName[] = "__asan_unpoison_stack_memory";
-
-// ASan version script has __asan_* wildcard. Triple underscore prevents a
-// linker (gold) warning about attempting to export a local symbol.
+
+// ASan version script has __asan_* wildcard. Triple underscore prevents a
+// linker (gold) warning about attempting to export a local symbol.
const char kAsanGlobalsRegisteredFlagName[] = "___asan_globals_registered";
-
+
const char kAsanOptionDetectUseAfterReturn[] =
- "__asan_option_detect_stack_use_after_return";
-
+ "__asan_option_detect_stack_use_after_return";
+
const char kAsanShadowMemoryDynamicAddress[] =
- "__asan_shadow_memory_dynamic_address";
-
+ "__asan_shadow_memory_dynamic_address";
+
const char kAsanAllocaPoison[] = "__asan_alloca_poison";
const char kAsanAllocasUnpoison[] = "__asan_allocas_unpoison";
-
-// Access sizes are powers of two: 1, 2, 4, 8, 16.
-static const size_t kNumberOfAccessSizes = 5;
-
-static const unsigned kAllocaRzSize = 32;
-
-// Command-line flags.
-
-static cl::opt<bool> ClEnableKasan(
- "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClRecover(
- "asan-recover",
- cl::desc("Enable recovery mode (continue-after-error)."),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClInsertVersionCheck(
- "asan-guard-against-version-mismatch",
- cl::desc("Guard against compiler/runtime version mismatch."),
- cl::Hidden, cl::init(true));
-
-// This flag may need to be replaced with -f[no-]asan-reads.
-static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
- cl::desc("instrument read instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentWrites(
- "asan-instrument-writes", cl::desc("instrument write instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentAtomics(
- "asan-instrument-atomics",
- cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool>
- ClInstrumentByval("asan-instrument-byval",
- cl::desc("instrument byval call arguments"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClAlwaysSlowPath(
- "asan-always-slow-path",
- cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClForceDynamicShadow(
- "asan-force-dynamic-shadow",
- cl::desc("Load shadow address into a local variable for each function"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClWithIfunc("asan-with-ifunc",
- cl::desc("Access dynamic shadow through an ifunc global on "
- "platforms that support this"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClWithIfuncSuppressRemat(
- "asan-with-ifunc-suppress-remat",
- cl::desc("Suppress rematerialization of dynamic shadow address by passing "
- "it through inline asm in prologue."),
- cl::Hidden, cl::init(true));
-
-// This flag limits the number of instructions to be instrumented
-// in any given BB. Normally, this should be set to unlimited (INT_MAX),
-// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
-// set it to 10000.
-static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
- "asan-max-ins-per-bb", cl::init(10000),
- cl::desc("maximal number of instructions to instrument in any given BB"),
- cl::Hidden);
-
-// This flag may need to be replaced with -f[no]asan-stack.
-static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
- cl::Hidden, cl::init(true));
-static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
- "asan-max-inline-poisoning-size",
- cl::desc(
- "Inline shadow poisoning for blocks up to the given size in bytes."),
- cl::Hidden, cl::init(64));
-
-static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
- cl::desc("Check stack-use-after-return"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
- cl::desc("Create redzones for byval "
- "arguments (extra copy "
- "required)"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
- cl::desc("Check stack-use-after-scope"),
- cl::Hidden, cl::init(false));
-
-// This flag may need to be replaced with -f[no]asan-globals.
-static cl::opt<bool> ClGlobals("asan-globals",
- cl::desc("Handle global objects"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClInitializers("asan-initialization-order",
- cl::desc("Handle C++ initializer order"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInvalidPointerPairs(
- "asan-detect-invalid-pointer-pair",
- cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClInvalidPointerCmp(
- "asan-detect-invalid-pointer-cmp",
- cl::desc("Instrument <, <=, >, >= with pointer operands"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClInvalidPointerSub(
- "asan-detect-invalid-pointer-sub",
- cl::desc("Instrument - operations with pointer operands"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<unsigned> ClRealignStack(
- "asan-realign-stack",
- cl::desc("Realign stack to the value of this flag (power of two)"),
- cl::Hidden, cl::init(32));
-
-static cl::opt<int> ClInstrumentationWithCallsThreshold(
- "asan-instrumentation-with-call-threshold",
- cl::desc(
- "If the function being instrumented contains more than "
- "this number of memory accesses, use callbacks instead of "
- "inline checks (-1 means never use callbacks)."),
- cl::Hidden, cl::init(7000));
-
-static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
- "asan-memory-access-callback-prefix",
- cl::desc("Prefix for memory access callbacks"), cl::Hidden,
- cl::init("__asan_"));
-
-static cl::opt<bool>
- ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas",
- cl::desc("instrument dynamic allocas"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClSkipPromotableAllocas(
- "asan-skip-promotable-allocas",
- cl::desc("Do not instrument promotable allocas"), cl::Hidden,
- cl::init(true));
-
-// These flags allow changing the shadow mapping.
-// The shadow mapping looks like
-// Shadow = (Mem >> scale) + offset
-
-static cl::opt<int> ClMappingScale("asan-mapping-scale",
- cl::desc("scale of asan shadow mapping"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t>
- ClMappingOffset("asan-mapping-offset",
- cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"),
- cl::Hidden, cl::init(0));
-
-// Optimization flags. Not user visible, used mostly for testing
-// and benchmarking the tool.
-
-static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClOptSameTemp(
- "asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClOptGlobals("asan-opt-globals",
- cl::desc("Don't instrument scalar globals"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClOptStack(
- "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDynamicAllocaStack(
- "asan-stack-dynamic-alloca",
- cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<uint32_t> ClForceExperiment(
- "asan-force-experiment",
- cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
- cl::init(0));
-
-static cl::opt<bool>
- ClUsePrivateAlias("asan-use-private-alias",
- cl::desc("Use private aliases for global variables"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClUseOdrIndicator("asan-use-odr-indicator",
- cl::desc("Use odr indicators to improve ODR reporting"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClUseGlobalsGC("asan-globals-live-support",
- cl::desc("Use linker features to support dead "
- "code stripping of globals"),
- cl::Hidden, cl::init(true));
-
-// This is on by default even though there is a bug in gold:
-// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
-static cl::opt<bool>
- ClWithComdat("asan-with-comdat",
- cl::desc("Place ASan constructors in comdat sections"),
- cl::Hidden, cl::init(true));
-
-// Debug flags.
-
-static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
- cl::init(0));
-
-static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
- cl::desc("Debug func"));
-
-static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
- cl::Hidden, cl::init(-1));
-
-static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
- cl::Hidden, cl::init(-1));
-
-STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
-STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
-STATISTIC(NumOptimizedAccessesToGlobalVar,
- "Number of optimized accesses to global vars");
-STATISTIC(NumOptimizedAccessesToStackVar,
- "Number of optimized accesses to stack vars");
-
-namespace {
-
-/// This struct defines the shadow mapping using the rule:
-/// shadow = (mem >> Scale) ADD-or-OR Offset.
-/// If InGlobal is true, then
-/// extern char __asan_shadow[];
-/// shadow = (mem >> Scale) + &__asan_shadow
-struct ShadowMapping {
- int Scale;
- uint64_t Offset;
- bool OrShadowOffset;
- bool InGlobal;
-};
-
-} // end anonymous namespace
-
-static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
- bool IsKasan) {
- bool IsAndroid = TargetTriple.isAndroid();
- bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
+
+// Access sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
+static const unsigned kAllocaRzSize = 32;
+
+// Command-line flags.
+
+static cl::opt<bool> ClEnableKasan(
+ "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClRecover(
+ "asan-recover",
+ cl::desc("Enable recovery mode (continue-after-error)."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInsertVersionCheck(
+ "asan-guard-against-version-mismatch",
+ cl::desc("Guard against compiler/runtime version mismatch."),
+ cl::Hidden, cl::init(true));
+
+// This flag may need to be replaced with -f[no-]asan-reads.
+static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
+ cl::desc("instrument read instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentWrites(
+ "asan-instrument-writes", cl::desc("instrument write instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentAtomics(
+ "asan-instrument-atomics",
+ cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool>
+ ClInstrumentByval("asan-instrument-byval",
+ cl::desc("instrument byval call arguments"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClAlwaysSlowPath(
+ "asan-always-slow-path",
+ cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClForceDynamicShadow(
+ "asan-force-dynamic-shadow",
+ cl::desc("Load shadow address into a local variable for each function"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClWithIfunc("asan-with-ifunc",
+ cl::desc("Access dynamic shadow through an ifunc global on "
+ "platforms that support this"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClWithIfuncSuppressRemat(
+ "asan-with-ifunc-suppress-remat",
+ cl::desc("Suppress rematerialization of dynamic shadow address by passing "
+ "it through inline asm in prologue."),
+ cl::Hidden, cl::init(true));
+
+// This flag limits the number of instructions to be instrumented
+// in any given BB. Normally, this should be set to unlimited (INT_MAX),
+// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
+// set it to 10000.
+static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
+ "asan-max-ins-per-bb", cl::init(10000),
+ cl::desc("maximal number of instructions to instrument in any given BB"),
+ cl::Hidden);
+
+// This flag may need to be replaced with -f[no]asan-stack.
+static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
+ cl::Hidden, cl::init(true));
+static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
+ "asan-max-inline-poisoning-size",
+ cl::desc(
+ "Inline shadow poisoning for blocks up to the given size in bytes."),
+ cl::Hidden, cl::init(64));
+
+static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
+ cl::desc("Check stack-use-after-return"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
+ cl::desc("Create redzones for byval "
+ "arguments (extra copy "
+ "required)"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
+ cl::desc("Check stack-use-after-scope"),
+ cl::Hidden, cl::init(false));
+
+// This flag may need to be replaced with -f[no]asan-globals.
+static cl::opt<bool> ClGlobals("asan-globals",
+ cl::desc("Handle global objects"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClInitializers("asan-initialization-order",
+ cl::desc("Handle C++ initializer order"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInvalidPointerPairs(
+ "asan-detect-invalid-pointer-pair",
+ cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClInvalidPointerCmp(
+ "asan-detect-invalid-pointer-cmp",
+ cl::desc("Instrument <, <=, >, >= with pointer operands"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClInvalidPointerSub(
+ "asan-detect-invalid-pointer-sub",
+ cl::desc("Instrument - operations with pointer operands"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<unsigned> ClRealignStack(
+ "asan-realign-stack",
+ cl::desc("Realign stack to the value of this flag (power of two)"),
+ cl::Hidden, cl::init(32));
+
+static cl::opt<int> ClInstrumentationWithCallsThreshold(
+ "asan-instrumentation-with-call-threshold",
+ cl::desc(
+ "If the function being instrumented contains more than "
+ "this number of memory accesses, use callbacks instead of "
+ "inline checks (-1 means never use callbacks)."),
+ cl::Hidden, cl::init(7000));
+
+static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
+ "asan-memory-access-callback-prefix",
+ cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+ cl::init("__asan_"));
+
+static cl::opt<bool>
+ ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas",
+ cl::desc("instrument dynamic allocas"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClSkipPromotableAllocas(
+ "asan-skip-promotable-allocas",
+ cl::desc("Do not instrument promotable allocas"), cl::Hidden,
+ cl::init(true));
+
+// These flags allow changing the shadow mapping.
+// The shadow mapping looks like
+// Shadow = (Mem >> scale) + offset
+
+static cl::opt<int> ClMappingScale("asan-mapping-scale",
+ cl::desc("scale of asan shadow mapping"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t>
+ ClMappingOffset("asan-mapping-offset",
+ cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"),
+ cl::Hidden, cl::init(0));
+
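For a concrete feel of the mapping these two flags override, here is a standalone sketch of the formula quoted in the comment above (Shadow = (Mem >> scale) + offset). The scale and offset values used below are only the Linux x86-64 defaults implied by kDefaultShadowScale and kSmallX86_64ShadowOffsetBase further up, not values asan hard-codes for every target.

  #include <cstdint>
  #include <cstdio>

  // Shadow = (Mem >> Scale) + Offset; some targets OR the offset in instead of
  // adding it (see Mapping.OrShadowOffset below).
  static uint64_t shadowAddress(uint64_t Mem, int Scale, uint64_t Offset) {
    return (Mem >> Scale) + Offset;
  }

  int main() {
    const int Scale = 3;                // kDefaultShadowScale: 1 shadow byte per 8 bytes
    const uint64_t Offset = 0x7fff8000; // kSmallX86_64ShadowOffsetBase & (AlignMask << 3)
    std::printf("shadow of 0x602000000010 = 0x%llx\n",
                (unsigned long long)shadowAddress(0x602000000010ULL, Scale, Offset));
    return 0;
  }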
+// Optimization flags. Not user visible, used mostly for testing
+// and benchmarking the tool.
+
+static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClOptSameTemp(
+ "asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClOptGlobals("asan-opt-globals",
+ cl::desc("Don't instrument scalar globals"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClOptStack(
+ "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDynamicAllocaStack(
+ "asan-stack-dynamic-alloca",
+ cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<uint32_t> ClForceExperiment(
+ "asan-force-experiment",
+ cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
+ cl::init(0));
+
+static cl::opt<bool>
+ ClUsePrivateAlias("asan-use-private-alias",
+ cl::desc("Use private aliases for global variables"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClUseOdrIndicator("asan-use-odr-indicator",
+ cl::desc("Use odr indicators to improve ODR reporting"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClUseGlobalsGC("asan-globals-live-support",
+ cl::desc("Use linker features to support dead "
+ "code stripping of globals"),
+ cl::Hidden, cl::init(true));
+
+// This is on by default even though there is a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool>
+ ClWithComdat("asan-with-comdat",
+ cl::desc("Place ASan constructors in comdat sections"),
+ cl::Hidden, cl::init(true));
+
+// Debug flags.
+
+static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
+ cl::init(0));
+
+static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
+ cl::desc("Debug func"));
+
+static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
+ cl::Hidden, cl::init(-1));
+
+static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
+ cl::Hidden, cl::init(-1));
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOptimizedAccessesToGlobalVar,
+ "Number of optimized accesses to global vars");
+STATISTIC(NumOptimizedAccessesToStackVar,
+ "Number of optimized accesses to stack vars");
+
+namespace {
+
+/// This struct defines the shadow mapping using the rule:
+/// shadow = (mem >> Scale) ADD-or-OR Offset.
+/// If InGlobal is true, then
+/// extern char __asan_shadow[];
+/// shadow = (mem >> Scale) + &__asan_shadow
+struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool OrShadowOffset;
+ bool InGlobal;
+};
+
+} // end anonymous namespace
+
+static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
+ bool IsKasan) {
+ bool IsAndroid = TargetTriple.isAndroid();
+ bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
bool IsMacOS = TargetTriple.isMacOSX();
- bool IsFreeBSD = TargetTriple.isOSFreeBSD();
- bool IsNetBSD = TargetTriple.isOSNetBSD();
- bool IsPS4CPU = TargetTriple.isPS4CPU();
- bool IsLinux = TargetTriple.isOSLinux();
- bool IsPPC64 = TargetTriple.getArch() == Triple::ppc64 ||
- TargetTriple.getArch() == Triple::ppc64le;
- bool IsSystemZ = TargetTriple.getArch() == Triple::systemz;
- bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS32 = TargetTriple.isMIPS32();
- bool IsMIPS64 = TargetTriple.isMIPS64();
- bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb();
- bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64;
+ bool IsFreeBSD = TargetTriple.isOSFreeBSD();
+ bool IsNetBSD = TargetTriple.isOSNetBSD();
+ bool IsPS4CPU = TargetTriple.isPS4CPU();
+ bool IsLinux = TargetTriple.isOSLinux();
+ bool IsPPC64 = TargetTriple.getArch() == Triple::ppc64 ||
+ TargetTriple.getArch() == Triple::ppc64le;
+ bool IsSystemZ = TargetTriple.getArch() == Triple::systemz;
+ bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
+ bool IsMIPS32 = TargetTriple.isMIPS32();
+ bool IsMIPS64 = TargetTriple.isMIPS64();
+ bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb();
+ bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64;
bool IsRISCV64 = TargetTriple.getArch() == Triple::riscv64;
- bool IsWindows = TargetTriple.isOSWindows();
- bool IsFuchsia = TargetTriple.isOSFuchsia();
- bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
- bool IsEmscripten = TargetTriple.isOSEmscripten();
-
- ShadowMapping Mapping;
-
- Mapping.Scale = IsMyriad ? kMyriadShadowScale : kDefaultShadowScale;
- if (ClMappingScale.getNumOccurrences() > 0) {
- Mapping.Scale = ClMappingScale;
- }
-
- if (LongSize == 32) {
- if (IsAndroid)
- Mapping.Offset = kDynamicShadowSentinel;
- else if (IsMIPS32)
- Mapping.Offset = kMIPS32_ShadowOffset32;
- else if (IsFreeBSD)
- Mapping.Offset = kFreeBSD_ShadowOffset32;
- else if (IsNetBSD)
- Mapping.Offset = kNetBSD_ShadowOffset32;
- else if (IsIOS)
- Mapping.Offset = kDynamicShadowSentinel;
- else if (IsWindows)
- Mapping.Offset = kWindowsShadowOffset32;
- else if (IsEmscripten)
- Mapping.Offset = kEmscriptenShadowOffset;
- else if (IsMyriad) {
- uint64_t ShadowOffset = (kMyriadMemoryOffset32 + kMyriadMemorySize32 -
- (kMyriadMemorySize32 >> Mapping.Scale));
- Mapping.Offset = ShadowOffset - (kMyriadMemoryOffset32 >> Mapping.Scale);
- }
- else
- Mapping.Offset = kDefaultShadowOffset32;
- } else { // LongSize == 64
- // Fuchsia is always PIE, which means that the beginning of the address
- // space is always available.
- if (IsFuchsia)
- Mapping.Offset = 0;
- else if (IsPPC64)
- Mapping.Offset = kPPC64_ShadowOffset64;
- else if (IsSystemZ)
- Mapping.Offset = kSystemZ_ShadowOffset64;
- else if (IsFreeBSD && !IsMIPS64)
- Mapping.Offset = kFreeBSD_ShadowOffset64;
- else if (IsNetBSD) {
- if (IsKasan)
- Mapping.Offset = kNetBSDKasan_ShadowOffset64;
- else
- Mapping.Offset = kNetBSD_ShadowOffset64;
- } else if (IsPS4CPU)
- Mapping.Offset = kPS4CPU_ShadowOffset64;
- else if (IsLinux && IsX86_64) {
- if (IsKasan)
- Mapping.Offset = kLinuxKasan_ShadowOffset64;
- else
- Mapping.Offset = (kSmallX86_64ShadowOffsetBase &
- (kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale));
- } else if (IsWindows && IsX86_64) {
- Mapping.Offset = kWindowsShadowOffset64;
- } else if (IsMIPS64)
- Mapping.Offset = kMIPS64_ShadowOffset64;
- else if (IsIOS)
- Mapping.Offset = kDynamicShadowSentinel;
+ bool IsWindows = TargetTriple.isOSWindows();
+ bool IsFuchsia = TargetTriple.isOSFuchsia();
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
+ bool IsEmscripten = TargetTriple.isOSEmscripten();
+
+ ShadowMapping Mapping;
+
+ Mapping.Scale = IsMyriad ? kMyriadShadowScale : kDefaultShadowScale;
+ if (ClMappingScale.getNumOccurrences() > 0) {
+ Mapping.Scale = ClMappingScale;
+ }
+
+ if (LongSize == 32) {
+ if (IsAndroid)
+ Mapping.Offset = kDynamicShadowSentinel;
+ else if (IsMIPS32)
+ Mapping.Offset = kMIPS32_ShadowOffset32;
+ else if (IsFreeBSD)
+ Mapping.Offset = kFreeBSD_ShadowOffset32;
+ else if (IsNetBSD)
+ Mapping.Offset = kNetBSD_ShadowOffset32;
+ else if (IsIOS)
+ Mapping.Offset = kDynamicShadowSentinel;
+ else if (IsWindows)
+ Mapping.Offset = kWindowsShadowOffset32;
+ else if (IsEmscripten)
+ Mapping.Offset = kEmscriptenShadowOffset;
+ else if (IsMyriad) {
+ uint64_t ShadowOffset = (kMyriadMemoryOffset32 + kMyriadMemorySize32 -
+ (kMyriadMemorySize32 >> Mapping.Scale));
+ Mapping.Offset = ShadowOffset - (kMyriadMemoryOffset32 >> Mapping.Scale);
+ }
+ else
+ Mapping.Offset = kDefaultShadowOffset32;
+ } else { // LongSize == 64
+ // Fuchsia is always PIE, which means that the beginning of the address
+ // space is always available.
+ if (IsFuchsia)
+ Mapping.Offset = 0;
+ else if (IsPPC64)
+ Mapping.Offset = kPPC64_ShadowOffset64;
+ else if (IsSystemZ)
+ Mapping.Offset = kSystemZ_ShadowOffset64;
+ else if (IsFreeBSD && !IsMIPS64)
+ Mapping.Offset = kFreeBSD_ShadowOffset64;
+ else if (IsNetBSD) {
+ if (IsKasan)
+ Mapping.Offset = kNetBSDKasan_ShadowOffset64;
+ else
+ Mapping.Offset = kNetBSD_ShadowOffset64;
+ } else if (IsPS4CPU)
+ Mapping.Offset = kPS4CPU_ShadowOffset64;
+ else if (IsLinux && IsX86_64) {
+ if (IsKasan)
+ Mapping.Offset = kLinuxKasan_ShadowOffset64;
+ else
+ Mapping.Offset = (kSmallX86_64ShadowOffsetBase &
+ (kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale));
+ } else if (IsWindows && IsX86_64) {
+ Mapping.Offset = kWindowsShadowOffset64;
+ } else if (IsMIPS64)
+ Mapping.Offset = kMIPS64_ShadowOffset64;
+ else if (IsIOS)
+ Mapping.Offset = kDynamicShadowSentinel;
else if (IsMacOS && IsAArch64)
Mapping.Offset = kDynamicShadowSentinel;
- else if (IsAArch64)
- Mapping.Offset = kAArch64_ShadowOffset64;
+ else if (IsAArch64)
+ Mapping.Offset = kAArch64_ShadowOffset64;
else if (IsRISCV64)
Mapping.Offset = kRISCV64_ShadowOffset64;
- else
- Mapping.Offset = kDefaultShadowOffset64;
- }
-
- if (ClForceDynamicShadow) {
- Mapping.Offset = kDynamicShadowSentinel;
- }
-
- if (ClMappingOffset.getNumOccurrences() > 0) {
- Mapping.Offset = ClMappingOffset;
- }
-
- // OR-ing the shadow offset is more efficient (at least on x86) if the
- // offset is a power of two, but on ppc64 we have to use add since the
- // shadow offset is not necessarily 1/8-th of the address space. On SystemZ,
- // we could OR the constant in a single instruction, but it's more
- // efficient to load it once and use indexed addressing.
- Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
+ else
+ Mapping.Offset = kDefaultShadowOffset64;
+ }
+
+ if (ClForceDynamicShadow) {
+ Mapping.Offset = kDynamicShadowSentinel;
+ }
+
+ if (ClMappingOffset.getNumOccurrences() > 0) {
+ Mapping.Offset = ClMappingOffset;
+ }
+
+ // OR-ing the shadow offset is more efficient (at least on x86) if the
+ // offset is a power of two, but on ppc64 we have to use add since the
+ // shadow offset is not necessarily 1/8-th of the address space. On SystemZ,
+ // we could OR the constant in a single instruction, but it's more
+ // efficient to load it once and use indexed addressing.
+ Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
!IsRISCV64 &&
- !(Mapping.Offset & (Mapping.Offset - 1)) &&
- Mapping.Offset != kDynamicShadowSentinel;
- bool IsAndroidWithIfuncSupport =
- IsAndroid && !TargetTriple.isAndroidVersionLT(21);
- Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb;
-
- return Mapping;
-}
-
-static uint64_t getRedzoneSizeForScale(int MappingScale) {
- // Redzone used for stack and globals is at least 32 bytes.
- // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
- return std::max(32U, 1U << MappingScale);
-}
-
-static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) {
- if (TargetTriple.isOSEmscripten()) {
- return kAsanEmscriptenCtorAndDtorPriority;
- } else {
- return kAsanCtorAndDtorPriority;
- }
-}
-
-namespace {
-
-/// Module analysis for getting various metadata about the module.
-class ASanGlobalsMetadataWrapperPass : public ModulePass {
-public:
- static char ID;
-
- ASanGlobalsMetadataWrapperPass() : ModulePass(ID) {
- initializeASanGlobalsMetadataWrapperPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- GlobalsMD = GlobalsMetadata(M);
- return false;
- }
-
- StringRef getPassName() const override {
- return "ASanGlobalsMetadataWrapperPass";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- GlobalsMetadata &getGlobalsMD() { return GlobalsMD; }
-
-private:
- GlobalsMetadata GlobalsMD;
-};
-
-char ASanGlobalsMetadataWrapperPass::ID = 0;
-
-/// AddressSanitizer: instrument the code in module to find memory bugs.
-struct AddressSanitizer {
- AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
- bool CompileKernel = false, bool Recover = false,
- bool UseAfterScope = false)
- : CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
- : CompileKernel),
- Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
- UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) {
- C = &(M.getContext());
- LongSize = M.getDataLayout().getPointerSizeInBits();
- IntptrTy = Type::getIntNTy(*C, LongSize);
- TargetTriple = Triple(M.getTargetTriple());
-
- Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
- }
-
- uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
- uint64_t ArraySize = 1;
- if (AI.isArrayAllocation()) {
- const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
- assert(CI && "non-constant array size");
- ArraySize = CI->getZExtValue();
- }
- Type *Ty = AI.getAllocatedType();
- uint64_t SizeInBytes =
- AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
- return SizeInBytes * ArraySize;
- }
-
- /// Check if we want (and can) handle this alloca.
- bool isInterestingAlloca(const AllocaInst &AI);
-
- bool ignoreAccess(Value *Ptr);
- void getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
-
- void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
- InterestingMemoryOperand &O, bool UseCalls,
- const DataLayout &DL);
- void instrumentPointerComparisonOrSubtraction(Instruction *I);
- void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
- Value *Addr, uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls, uint32_t Exp);
- void instrumentUnusualSizeOrAlignment(Instruction *I,
- Instruction *InsertBefore, Value *Addr,
- uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls,
- uint32_t Exp);
- Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
- Value *ShadowValue, uint32_t TypeSize);
- Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
- bool IsWrite, size_t AccessSizeIndex,
- Value *SizeArgument, uint32_t Exp);
- void instrumentMemIntrinsic(MemIntrinsic *MI);
- Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
- bool suppressInstrumentationSiteForDebug(int &Instrumented);
- bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI);
- bool maybeInsertAsanInitAtFunctionEntry(Function &F);
- bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
- void markEscapedLocalAllocas(Function &F);
-
-private:
- friend struct FunctionStackPoisoner;
-
- void initializeCallbacks(Module &M);
-
- bool LooksLikeCodeInBug11395(Instruction *I);
- bool GlobalIsLinkerInitialized(GlobalVariable *G);
- bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,
- uint64_t TypeSize) const;
-
- /// Helper to clean up per-function state.
- struct FunctionStateRAII {
- AddressSanitizer *Pass;
-
- FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) {
- assert(Pass->ProcessedAllocas.empty() &&
- "last pass forgot to clear cache");
- assert(!Pass->LocalDynamicShadow);
- }
-
- ~FunctionStateRAII() {
- Pass->LocalDynamicShadow = nullptr;
- Pass->ProcessedAllocas.clear();
- }
- };
-
- LLVMContext *C;
- Triple TargetTriple;
- int LongSize;
- bool CompileKernel;
- bool Recover;
- bool UseAfterScope;
- Type *IntptrTy;
- ShadowMapping Mapping;
- FunctionCallee AsanHandleNoReturnFunc;
- FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction;
- Constant *AsanShadowGlobal;
-
- // These arrays are indexed by AccessIsWrite, Experiment and log2(AccessSize).
- FunctionCallee AsanErrorCallback[2][2][kNumberOfAccessSizes];
- FunctionCallee AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes];
-
- // These arrays are indexed by AccessIsWrite and Experiment.
- FunctionCallee AsanErrorCallbackSized[2][2];
- FunctionCallee AsanMemoryAccessCallbackSized[2][2];
-
- FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset;
- Value *LocalDynamicShadow = nullptr;
- const GlobalsMetadata &GlobalsMD;
- DenseMap<const AllocaInst *, bool> ProcessedAllocas;
-};
-
-class AddressSanitizerLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- explicit AddressSanitizerLegacyPass(bool CompileKernel = false,
- bool Recover = false,
- bool UseAfterScope = false)
- : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover),
- UseAfterScope(UseAfterScope) {
- initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override {
- return "AddressSanitizerFunctionPass";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ASanGlobalsMetadataWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- GlobalsMetadata &GlobalsMD =
- getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover,
- UseAfterScope);
- return ASan.instrumentFunction(F, TLI);
- }
-
-private:
- bool CompileKernel;
- bool Recover;
- bool UseAfterScope;
-};
-
-class ModuleAddressSanitizer {
-public:
- ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
- bool CompileKernel = false, bool Recover = false,
- bool UseGlobalsGC = true, bool UseOdrIndicator = false)
- : GlobalsMD(*GlobalsMD),
- CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
- : CompileKernel),
- Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
- UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC && !this->CompileKernel),
- // Enable aliases as they should have no downside with ODR indicators.
- UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias),
- UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator),
- // Not a typo: ClWithComdat is almost completely pointless without
- // ClUseGlobalsGC (because then it only works on modules without
- // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
- // and both suffer from gold PR19002 for which UseGlobalsGC constructor
- // argument is designed as a workaround. Therefore, disable both
- // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
- // do globals-gc.
- UseCtorComdat(UseGlobalsGC && ClWithComdat && !this->CompileKernel) {
- C = &(M.getContext());
- int LongSize = M.getDataLayout().getPointerSizeInBits();
- IntptrTy = Type::getIntNTy(*C, LongSize);
- TargetTriple = Triple(M.getTargetTriple());
- Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
- }
-
- bool instrumentModule(Module &);
-
-private:
- void initializeCallbacks(Module &M);
-
- bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
- void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers);
- void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers,
- const std::string &UniqueModuleId);
- void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers);
- void
- InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers);
-
- GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer,
- StringRef OriginalName);
- void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata,
- StringRef InternalSuffix);
- Instruction *CreateAsanModuleDtor(Module &M);
-
- const GlobalVariable *getExcludedAliasedGlobal(const GlobalAlias &GA) const;
- bool shouldInstrumentGlobal(GlobalVariable *G) const;
- bool ShouldUseMachOGlobalsSection() const;
- StringRef getGlobalMetadataSection() const;
- void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
- void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
- uint64_t getMinRedzoneSizeForGlobal() const {
- return getRedzoneSizeForScale(Mapping.Scale);
- }
- uint64_t getRedzoneSizeForGlobal(uint64_t SizeInBytes) const;
- int GetAsanVersion(const Module &M) const;
-
- const GlobalsMetadata &GlobalsMD;
- bool CompileKernel;
- bool Recover;
- bool UseGlobalsGC;
- bool UsePrivateAlias;
- bool UseOdrIndicator;
- bool UseCtorComdat;
- Type *IntptrTy;
- LLVMContext *C;
- Triple TargetTriple;
- ShadowMapping Mapping;
- FunctionCallee AsanPoisonGlobals;
- FunctionCallee AsanUnpoisonGlobals;
- FunctionCallee AsanRegisterGlobals;
- FunctionCallee AsanUnregisterGlobals;
- FunctionCallee AsanRegisterImageGlobals;
- FunctionCallee AsanUnregisterImageGlobals;
- FunctionCallee AsanRegisterElfGlobals;
- FunctionCallee AsanUnregisterElfGlobals;
-
- Function *AsanCtorFunction = nullptr;
- Function *AsanDtorFunction = nullptr;
-};
-
-class ModuleAddressSanitizerLegacyPass : public ModulePass {
-public:
- static char ID;
-
- explicit ModuleAddressSanitizerLegacyPass(bool CompileKernel = false,
- bool Recover = false,
- bool UseGlobalGC = true,
- bool UseOdrIndicator = false)
- : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover),
- UseGlobalGC(UseGlobalGC), UseOdrIndicator(UseOdrIndicator) {
- initializeModuleAddressSanitizerLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "ModuleAddressSanitizer"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ASanGlobalsMetadataWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- GlobalsMetadata &GlobalsMD =
- getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
- ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover,
- UseGlobalGC, UseOdrIndicator);
- return ASanModule.instrumentModule(M);
- }
-
-private:
- bool CompileKernel;
- bool Recover;
- bool UseGlobalGC;
- bool UseOdrIndicator;
-};
-
-// Stack poisoning does not play well with exception handling.
-// When an exception is thrown, we essentially bypass the code
- // that unpoisons the stack. This is why the run-time library has
- // to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
- // stack in the interceptor. This, however, does not work inside the
- // function that catches the exception, most likely because the
- // compiler hoists the load of the shadow value somewhere too high.
- // This causes asan to report a non-existent bug on 453.povray.
-// It sounds like an LLVM bug.
-struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
- Function &F;
- AddressSanitizer &ASan;
- DIBuilder DIB;
- LLVMContext *C;
- Type *IntptrTy;
- Type *IntptrPtrTy;
- ShadowMapping Mapping;
-
- SmallVector<AllocaInst *, 16> AllocaVec;
- SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp;
- SmallVector<Instruction *, 8> RetVec;
- unsigned StackAlignment;
-
- FunctionCallee AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
- AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
- FunctionCallee AsanSetShadowFunc[0x100] = {};
- FunctionCallee AsanPoisonStackMemoryFunc, AsanUnpoisonStackMemoryFunc;
- FunctionCallee AsanAllocaPoisonFunc, AsanAllocasUnpoisonFunc;
-
- // Stores the location and arguments of a poisoning/unpoisoning call for an alloca.
- struct AllocaPoisonCall {
- IntrinsicInst *InsBefore;
- AllocaInst *AI;
- uint64_t Size;
- bool DoPoison;
- };
- SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
- SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;
- bool HasUntracedLifetimeIntrinsic = false;
-
- SmallVector<AllocaInst *, 1> DynamicAllocaVec;
- SmallVector<IntrinsicInst *, 1> StackRestoreVec;
- AllocaInst *DynamicAllocaLayout = nullptr;
- IntrinsicInst *LocalEscapeCall = nullptr;
-
- bool HasInlineAsm = false;
- bool HasReturnsTwiceCall = false;
-
- FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
- : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false),
- C(ASan.C), IntptrTy(ASan.IntptrTy),
- IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping),
- StackAlignment(1 << Mapping.Scale) {}
-
- bool runOnFunction() {
- if (!ClStack) return false;
-
- if (ClRedzoneByvalArgs)
- copyArgsPassedByValToAllocas();
-
- // Collect alloca, ret, lifetime instructions etc.
- for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
-
- if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;
-
- initializeCallbacks(*F.getParent());
-
- if (HasUntracedLifetimeIntrinsic) {
- // If there are lifetime intrinsics which couldn't be traced back to an
- // alloca, we may not know exactly when a variable enters scope, and
- // therefore should "fail safe" by not poisoning them.
- StaticAllocaPoisonCallVec.clear();
- DynamicAllocaPoisonCallVec.clear();
- }
-
- processDynamicAllocas();
- processStaticAllocas();
-
- if (ClDebugStack) {
- LLVM_DEBUG(dbgs() << F);
- }
- return true;
- }
-
- // Arguments marked with the "byval" attribute are implicitly copied without
- // using an alloca instruction. To produce redzones for those arguments, we
- // copy them a second time into memory allocated with an alloca instruction.
- void copyArgsPassedByValToAllocas();
-
- // Finds all Alloca instructions and puts
- // poisoned red zones around all of them.
- // Then unpoison everything back before the function returns.
- void processStaticAllocas();
- void processDynamicAllocas();
-
- void createDynamicAllocasInitStorage();
-
- // ----------------------- Visitors.
+ !(Mapping.Offset & (Mapping.Offset - 1)) &&
+ Mapping.Offset != kDynamicShadowSentinel;
+ bool IsAndroidWithIfuncSupport =
+ IsAndroid && !TargetTriple.isAndroidVersionLT(21);
+ Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb;
+
+ return Mapping;
+}
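As an illustrative aside (not code from this file): the mapping computed above is what memToShadow later turns into IR, shifting the application address right by Mapping.Scale and then adding or OR-ing Mapping.Offset. A minimal standalone sketch of that arithmetic, assuming the default userspace x86-64 Linux values (Scale = 3, Offset = 0x7fff8000):

#include <cstdint>
#include <cstdio>

// shadow = (addr >> Scale) op Offset, where op is OR when the offset is a
// power of two (cheaper on x86) and ADD otherwise.
static uint64_t applyShadowMapping(uint64_t Addr, int Scale, uint64_t Offset,
                                   bool OrShadowOffset) {
  uint64_t Shifted = Addr >> Scale;
  return OrShadowOffset ? (Shifted | Offset) : (Shifted + Offset);
}

int main() {
  const int Scale = 3;                      // assumed default shadow scale
  const uint64_t Offset = 0x7fff8000ULL;    // assumed x86-64 Linux userspace offset
  const uint64_t Addr = 0x602000000010ULL;  // hypothetical heap address
  std::printf("shadow byte at 0x%llx\n",
              (unsigned long long)applyShadowMapping(Addr, Scale, Offset,
                                                     /*OrShadowOffset=*/false));
  return 0;
}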
+
+static uint64_t getRedzoneSizeForScale(int MappingScale) {
+ // Redzone used for stack and globals is at least 32 bytes.
+ // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
+ return std::max(32U, 1U << MappingScale);
+}
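A quick sanity check of the formula above (illustrative only, not from this file): 1 << 3 is 8, so the default scale still gets the 32-byte minimum, while scales 6 and 7 cross it and yield 64 and 128 bytes, as the comment states.

#include <algorithm>
#include <cassert>

// Stand-in for getRedzoneSizeForScale, using the same formula as above.
static unsigned redzoneForScale(int MappingScale) {
  return std::max(32U, 1U << MappingScale);
}

int main() {
  assert(redzoneForScale(3) == 32);  // default scale: the 32-byte minimum wins
  assert(redzoneForScale(6) == 64);
  assert(redzoneForScale(7) == 128);
  return 0;
}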
+
+static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) {
+ if (TargetTriple.isOSEmscripten()) {
+ return kAsanEmscriptenCtorAndDtorPriority;
+ } else {
+ return kAsanCtorAndDtorPriority;
+ }
+}
+
+namespace {
+
+/// Module analysis for getting various metadata about the module.
+class ASanGlobalsMetadataWrapperPass : public ModulePass {
+public:
+ static char ID;
+
+ ASanGlobalsMetadataWrapperPass() : ModulePass(ID) {
+ initializeASanGlobalsMetadataWrapperPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ GlobalsMD = GlobalsMetadata(M);
+ return false;
+ }
+
+ StringRef getPassName() const override {
+ return "ASanGlobalsMetadataWrapperPass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ GlobalsMetadata &getGlobalsMD() { return GlobalsMD; }
+
+private:
+ GlobalsMetadata GlobalsMD;
+};
+
+char ASanGlobalsMetadataWrapperPass::ID = 0;
+
+/// AddressSanitizer: instrument the code in module to find memory bugs.
+struct AddressSanitizer {
+ AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
+ bool CompileKernel = false, bool Recover = false,
+ bool UseAfterScope = false)
+ : CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
+ : CompileKernel),
+ Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
+ UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) {
+ C = &(M.getContext());
+ LongSize = M.getDataLayout().getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ TargetTriple = Triple(M.getTargetTriple());
+
+ Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
+ }
+
+ uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
+ uint64_t ArraySize = 1;
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
+ Type *Ty = AI.getAllocatedType();
+ uint64_t SizeInBytes =
+ AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes * ArraySize;
+ }
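Illustrative aside (not from this file): the value above is the alloc size of the allocated type times the constant array operand; for a hypothetical alloca of [4 x i32] with array size 2 that is 16 * 2 = 32 bytes.

#include <cassert>
#include <cstdint>

// Mirrors getAllocaSizeInBytes for a constant array size operand.
static uint64_t allocaSizeInBytes(uint64_t TypeAllocSize, uint64_t ArraySize) {
  return TypeAllocSize * ArraySize;
}

int main() {
  assert(allocaSizeInBytes(16, 2) == 32);  // [4 x i32], array size 2
  return 0;
}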
+
+ /// Check if we want (and can) handle this alloca.
+ bool isInterestingAlloca(const AllocaInst &AI);
+
+ bool ignoreAccess(Value *Ptr);
+ void getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
+
+ void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
+ InterestingMemoryOperand &O, bool UseCalls,
+ const DataLayout &DL);
+ void instrumentPointerComparisonOrSubtraction(Instruction *I);
+ void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
+ Value *Addr, uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls, uint32_t Exp);
+ void instrumentUnusualSizeOrAlignment(Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp);
+ Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue, uint32_t TypeSize);
+ Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
+ bool IsWrite, size_t AccessSizeIndex,
+ Value *SizeArgument, uint32_t Exp);
+ void instrumentMemIntrinsic(MemIntrinsic *MI);
+ Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+ bool suppressInstrumentationSiteForDebug(int &Instrumented);
+ bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI);
+ bool maybeInsertAsanInitAtFunctionEntry(Function &F);
+ bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
+ void markEscapedLocalAllocas(Function &F);
+
+private:
+ friend struct FunctionStackPoisoner;
+
+ void initializeCallbacks(Module &M);
+
+ bool LooksLikeCodeInBug11395(Instruction *I);
+ bool GlobalIsLinkerInitialized(GlobalVariable *G);
+ bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,
+ uint64_t TypeSize) const;
+
+ /// Helper to clean up per-function state.
+ struct FunctionStateRAII {
+ AddressSanitizer *Pass;
+
+ FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) {
+ assert(Pass->ProcessedAllocas.empty() &&
+ "last pass forgot to clear cache");
+ assert(!Pass->LocalDynamicShadow);
+ }
+
+ ~FunctionStateRAII() {
+ Pass->LocalDynamicShadow = nullptr;
+ Pass->ProcessedAllocas.clear();
+ }
+ };
+
+ LLVMContext *C;
+ Triple TargetTriple;
+ int LongSize;
+ bool CompileKernel;
+ bool Recover;
+ bool UseAfterScope;
+ Type *IntptrTy;
+ ShadowMapping Mapping;
+ FunctionCallee AsanHandleNoReturnFunc;
+ FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction;
+ Constant *AsanShadowGlobal;
+
+ // These arrays are indexed by AccessIsWrite, Experiment and log2(AccessSize).
+ FunctionCallee AsanErrorCallback[2][2][kNumberOfAccessSizes];
+ FunctionCallee AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes];
+
+ // These arrays are indexed by AccessIsWrite and Experiment.
+ FunctionCallee AsanErrorCallbackSized[2][2];
+ FunctionCallee AsanMemoryAccessCallbackSized[2][2];
+
+ FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset;
+ Value *LocalDynamicShadow = nullptr;
+ const GlobalsMetadata &GlobalsMD;
+ DenseMap<const AllocaInst *, bool> ProcessedAllocas;
+};
+
+class AddressSanitizerLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ explicit AddressSanitizerLegacyPass(bool CompileKernel = false,
+ bool Recover = false,
+ bool UseAfterScope = false)
+ : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover),
+ UseAfterScope(UseAfterScope) {
+ initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "AddressSanitizerFunctionPass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ASanGlobalsMetadataWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ GlobalsMetadata &GlobalsMD =
+ getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover,
+ UseAfterScope);
+ return ASan.instrumentFunction(F, TLI);
+ }
+
+private:
+ bool CompileKernel;
+ bool Recover;
+ bool UseAfterScope;
+};
+
+class ModuleAddressSanitizer {
+public:
+ ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
+ bool CompileKernel = false, bool Recover = false,
+ bool UseGlobalsGC = true, bool UseOdrIndicator = false)
+ : GlobalsMD(*GlobalsMD),
+ CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
+ : CompileKernel),
+ Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC && !this->CompileKernel),
+ // Enable aliases as they should have no downside with ODR indicators.
+ UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias),
+ UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+ // and both suffer from gold PR19002 for which UseGlobalsGC constructor
+ // argument is designed as a workaround. Therefore, disable both
+ // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
+ // do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat && !this->CompileKernel) {
+ C = &(M.getContext());
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ TargetTriple = Triple(M.getTargetTriple());
+ Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
+ }
+
+ bool instrumentModule(Module &);
+
+private:
+ void initializeCallbacks(Module &M);
+
+ bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
+ void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId);
+ void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void
+ InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+
+ GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName);
+ void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata,
+ StringRef InternalSuffix);
+ Instruction *CreateAsanModuleDtor(Module &M);
+
+ const GlobalVariable *getExcludedAliasedGlobal(const GlobalAlias &GA) const;
+ bool shouldInstrumentGlobal(GlobalVariable *G) const;
+ bool ShouldUseMachOGlobalsSection() const;
+ StringRef getGlobalMetadataSection() const;
+ void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
+ void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
+ uint64_t getMinRedzoneSizeForGlobal() const {
+ return getRedzoneSizeForScale(Mapping.Scale);
+ }
+ uint64_t getRedzoneSizeForGlobal(uint64_t SizeInBytes) const;
+ int GetAsanVersion(const Module &M) const;
+
+ const GlobalsMetadata &GlobalsMD;
+ bool CompileKernel;
+ bool Recover;
+ bool UseGlobalsGC;
+ bool UsePrivateAlias;
+ bool UseOdrIndicator;
+ bool UseCtorComdat;
+ Type *IntptrTy;
+ LLVMContext *C;
+ Triple TargetTriple;
+ ShadowMapping Mapping;
+ FunctionCallee AsanPoisonGlobals;
+ FunctionCallee AsanUnpoisonGlobals;
+ FunctionCallee AsanRegisterGlobals;
+ FunctionCallee AsanUnregisterGlobals;
+ FunctionCallee AsanRegisterImageGlobals;
+ FunctionCallee AsanUnregisterImageGlobals;
+ FunctionCallee AsanRegisterElfGlobals;
+ FunctionCallee AsanUnregisterElfGlobals;
+
+ Function *AsanCtorFunction = nullptr;
+ Function *AsanDtorFunction = nullptr;
+};
+
+class ModuleAddressSanitizerLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ explicit ModuleAddressSanitizerLegacyPass(bool CompileKernel = false,
+ bool Recover = false,
+ bool UseGlobalGC = true,
+ bool UseOdrIndicator = false)
+ : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover),
+ UseGlobalGC(UseGlobalGC), UseOdrIndicator(UseOdrIndicator) {
+ initializeModuleAddressSanitizerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "ModuleAddressSanitizer"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ASanGlobalsMetadataWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ GlobalsMetadata &GlobalsMD =
+ getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
+ ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover,
+ UseGlobalGC, UseOdrIndicator);
+ return ASanModule.instrumentModule(M);
+ }
+
+private:
+ bool CompileKernel;
+ bool Recover;
+ bool UseGlobalGC;
+ bool UseOdrIndicator;
+};
+
+// Stack poisoning does not play well with exception handling.
+// When an exception is thrown, we essentially bypass the code
+ // that unpoisons the stack. This is why the run-time library has
+ // to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
+ // stack in the interceptor. This, however, does not work inside the
+ // function that catches the exception, most likely because the
+ // compiler hoists the load of the shadow value somewhere too high.
+ // This causes asan to report a non-existent bug on 453.povray.
+// It sounds like an LLVM bug.
+struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
+ Function &F;
+ AddressSanitizer &ASan;
+ DIBuilder DIB;
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *IntptrPtrTy;
+ ShadowMapping Mapping;
+
+ SmallVector<AllocaInst *, 16> AllocaVec;
+ SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp;
+ SmallVector<Instruction *, 8> RetVec;
+ unsigned StackAlignment;
+
+ FunctionCallee AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
+ AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
+ FunctionCallee AsanSetShadowFunc[0x100] = {};
+ FunctionCallee AsanPoisonStackMemoryFunc, AsanUnpoisonStackMemoryFunc;
+ FunctionCallee AsanAllocaPoisonFunc, AsanAllocasUnpoisonFunc;
+
+ // Stores the location and arguments of a poisoning/unpoisoning call for an alloca.
+ struct AllocaPoisonCall {
+ IntrinsicInst *InsBefore;
+ AllocaInst *AI;
+ uint64_t Size;
+ bool DoPoison;
+ };
+ SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
+ SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;
+ bool HasUntracedLifetimeIntrinsic = false;
+
+ SmallVector<AllocaInst *, 1> DynamicAllocaVec;
+ SmallVector<IntrinsicInst *, 1> StackRestoreVec;
+ AllocaInst *DynamicAllocaLayout = nullptr;
+ IntrinsicInst *LocalEscapeCall = nullptr;
+
+ bool HasInlineAsm = false;
+ bool HasReturnsTwiceCall = false;
+
+ FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
+ : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false),
+ C(ASan.C), IntptrTy(ASan.IntptrTy),
+ IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping),
+ StackAlignment(1 << Mapping.Scale) {}
+
+ bool runOnFunction() {
+ if (!ClStack) return false;
+
+ if (ClRedzoneByvalArgs)
+ copyArgsPassedByValToAllocas();
+
+ // Collect alloca, ret, lifetime instructions etc.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
+
+ if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;
+
+ initializeCallbacks(*F.getParent());
+
+ if (HasUntracedLifetimeIntrinsic) {
+ // If there are lifetime intrinsics which couldn't be traced back to an
+ // alloca, we may not know exactly when a variable enters scope, and
+ // therefore should "fail safe" by not poisoning them.
+ StaticAllocaPoisonCallVec.clear();
+ DynamicAllocaPoisonCallVec.clear();
+ }
+
+ processDynamicAllocas();
+ processStaticAllocas();
+
+ if (ClDebugStack) {
+ LLVM_DEBUG(dbgs() << F);
+ }
+ return true;
+ }
+
+ // Arguments marked with the "byval" attribute are implicitly copied without
+ // using an alloca instruction. To produce redzones for those arguments, we
+ // copy them a second time into memory allocated with an alloca instruction.
+ void copyArgsPassedByValToAllocas();
+
+ // Finds all Alloca instructions and puts
+ // poisoned red zones around all of them.
+ // Then unpoison everything back before the function returns.
+ void processStaticAllocas();
+ void processDynamicAllocas();
+
+ void createDynamicAllocasInitStorage();
+
+ // ----------------------- Visitors.
/// Collect all Ret instructions, or the musttail call instruction if it
/// precedes the return instruction.
void visitReturnInst(ReturnInst &RI) {
@@ -969,910 +969,910 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
else
RetVec.push_back(&RI);
}
-
- /// Collect all Resume instructions.
- void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
-
- /// Collect all CleanupReturnInst instructions.
- void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
-
- void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
- Value *SavedStack) {
- IRBuilder<> IRB(InstBefore);
- Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy);
- // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we
- // need to adjust extracted SP to compute the address of the most recent
- // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for
- // this purpose.
- if (!isa<ReturnInst>(InstBefore)) {
- Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration(
- InstBefore->getModule(), Intrinsic::get_dynamic_area_offset,
- {IntptrTy});
-
- Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {});
-
- DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy),
- DynamicAreaOffset);
- }
-
- IRB.CreateCall(
- AsanAllocasUnpoisonFunc,
- {IRB.CreateLoad(IntptrTy, DynamicAllocaLayout), DynamicAreaPtr});
- }
-
- // Unpoison dynamic allocas redzones.
- void unpoisonDynamicAllocas() {
+
+ /// Collect all Resume instructions.
+ void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
+
+ /// Collect all CleanupReturnInst instructions.
+ void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
+
+ void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
+ Value *SavedStack) {
+ IRBuilder<> IRB(InstBefore);
+ Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy);
+ // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we
+ // need to adjust extracted SP to compute the address of the most recent
+ // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for
+ // this purpose.
+ if (!isa<ReturnInst>(InstBefore)) {
+ Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration(
+ InstBefore->getModule(), Intrinsic::get_dynamic_area_offset,
+ {IntptrTy});
+
+ Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {});
+
+ DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy),
+ DynamicAreaOffset);
+ }
+
+ IRB.CreateCall(
+ AsanAllocasUnpoisonFunc,
+ {IRB.CreateLoad(IntptrTy, DynamicAllocaLayout), DynamicAreaPtr});
+ }
+
+ // Unpoison dynamic allocas redzones.
+ void unpoisonDynamicAllocas() {
for (Instruction *Ret : RetVec)
- unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout);
-
+ unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout);
+
for (Instruction *StackRestoreInst : StackRestoreVec)
- unpoisonDynamicAllocasBeforeInst(StackRestoreInst,
- StackRestoreInst->getOperand(0));
- }
-
- // Deploy and poison redzones around dynamic alloca call. To do this, we
- // should replace this call with another one with changed parameters and
- // replace all its uses with new address, so
- // addr = alloca type, old_size, align
- // is replaced by
- // new_size = (old_size + additional_size) * sizeof(type)
- // tmp = alloca i8, new_size, max(align, 32)
- // addr = tmp + 32 (first 32 bytes are for the left redzone).
- // Additional_size is added so that the new allocation contains not only the
- // requested memory, but also the left, partial and right redzones.
- void handleDynamicAllocaCall(AllocaInst *AI);
-
- /// Collect Alloca instructions we want (and can) handle.
- void visitAllocaInst(AllocaInst &AI) {
- if (!ASan.isInterestingAlloca(AI)) {
- if (AI.isStaticAlloca()) {
- // Skip over allocas that are present *before* the first instrumented
- // alloca, we don't want to move those around.
- if (AllocaVec.empty())
- return;
-
- StaticAllocasToMoveUp.push_back(&AI);
- }
- return;
- }
-
- StackAlignment = std::max(StackAlignment, AI.getAlignment());
- if (!AI.isStaticAlloca())
- DynamicAllocaVec.push_back(&AI);
- else
- AllocaVec.push_back(&AI);
- }
-
- /// Collect lifetime intrinsic calls to check for use-after-scope
- /// errors.
- void visitIntrinsicInst(IntrinsicInst &II) {
- Intrinsic::ID ID = II.getIntrinsicID();
- if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II);
- if (ID == Intrinsic::localescape) LocalEscapeCall = &II;
- if (!ASan.UseAfterScope)
- return;
- if (!II.isLifetimeStartOrEnd())
- return;
- // Found lifetime intrinsic, add ASan instrumentation if necessary.
- auto *Size = cast<ConstantInt>(II.getArgOperand(0));
- // If size argument is undefined, don't do anything.
- if (Size->isMinusOne()) return;
- // Check that size doesn't saturate uint64_t and can
- // be stored in IntptrTy.
- const uint64_t SizeValue = Size->getValue().getLimitedValue();
- if (SizeValue == ~0ULL ||
- !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
- return;
- // Find alloca instruction that corresponds to llvm.lifetime argument.
+ unpoisonDynamicAllocasBeforeInst(StackRestoreInst,
+ StackRestoreInst->getOperand(0));
+ }
+
+ // Deploy and poison redzones around dynamic alloca call. To do this, we
+ // should replace this call with another one with changed parameters and
+ // replace all its uses with new address, so
+ // addr = alloca type, old_size, align
+ // is replaced by
+ // new_size = (old_size + additional_size) * sizeof(type)
+ // tmp = alloca i8, new_size, max(align, 32)
+ // addr = tmp + 32 (first 32 bytes are for the left redzone).
+ // Additional_size is added so that the new allocation contains not only the
+ // requested memory, but also the left, partial and right redzones.
+ void handleDynamicAllocaCall(AllocaInst *AI);
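Illustrative aside (not from this file): in plain numbers, the rewrite described above reserves a 32-byte left redzone in front of the user data and a right redzone behind it, then hands out the raw pointer plus 32. The sketch below only models that layout; the real handleDynamicAllocaCall builds the equivalent computation as LLVM IR and also deals with alignment and partial-redzone rounding, which are omitted here.

#include <cstdint>

struct DynAllocaLayout {
  uint64_t TotalSize;   // size given to the replacement "alloca i8"
  uint64_t UserOffset;  // user pointer = raw allocation + UserOffset
};

// Assumed layout: [ 32-byte left redzone | payload | right redzone ].
static DynAllocaLayout sketchDynAllocaLayout(uint64_t PayloadBytes,
                                             uint64_t RightRedzoneBytes) {
  const uint64_t LeftRedzoneBytes = 32;  // "first 32 bytes are for the left redzone"
  return {LeftRedzoneBytes + PayloadBytes + RightRedzoneBytes, LeftRedzoneBytes};
}

int main() {
  DynAllocaLayout L = sketchDynAllocaLayout(/*PayloadBytes=*/100,
                                            /*RightRedzoneBytes=*/32);
  return (L.TotalSize == 164 && L.UserOffset == 32) ? 0 : 1;
}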
+
+ /// Collect Alloca instructions we want (and can) handle.
+ void visitAllocaInst(AllocaInst &AI) {
+ if (!ASan.isInterestingAlloca(AI)) {
+ if (AI.isStaticAlloca()) {
+ // Skip over allocas that are present *before* the first instrumented
+ // alloca, we don't want to move those around.
+ if (AllocaVec.empty())
+ return;
+
+ StaticAllocasToMoveUp.push_back(&AI);
+ }
+ return;
+ }
+
+ StackAlignment = std::max(StackAlignment, AI.getAlignment());
+ if (!AI.isStaticAlloca())
+ DynamicAllocaVec.push_back(&AI);
+ else
+ AllocaVec.push_back(&AI);
+ }
+
+ /// Collect lifetime intrinsic calls to check for use-after-scope
+ /// errors.
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ Intrinsic::ID ID = II.getIntrinsicID();
+ if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II);
+ if (ID == Intrinsic::localescape) LocalEscapeCall = &II;
+ if (!ASan.UseAfterScope)
+ return;
+ if (!II.isLifetimeStartOrEnd())
+ return;
+ // Found lifetime intrinsic, add ASan instrumentation if necessary.
+ auto *Size = cast<ConstantInt>(II.getArgOperand(0));
+ // If size argument is undefined, don't do anything.
+ if (Size->isMinusOne()) return;
+ // Check that size doesn't saturate uint64_t and can
+ // be stored in IntptrTy.
+ const uint64_t SizeValue = Size->getValue().getLimitedValue();
+ if (SizeValue == ~0ULL ||
+ !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
+ return;
+ // Find alloca instruction that corresponds to llvm.lifetime argument.
// Currently we can only handle lifetime markers pointing to the
// beginning of the alloca.
AllocaInst *AI = findAllocaForValue(II.getArgOperand(1), true);
- if (!AI) {
- HasUntracedLifetimeIntrinsic = true;
- return;
- }
- // We're interested only in allocas we can handle.
- if (!ASan.isInterestingAlloca(*AI))
- return;
- bool DoPoison = (ID == Intrinsic::lifetime_end);
- AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
- if (AI->isStaticAlloca())
- StaticAllocaPoisonCallVec.push_back(APC);
- else if (ClInstrumentDynamicAllocas)
- DynamicAllocaPoisonCallVec.push_back(APC);
- }
-
- void visitCallBase(CallBase &CB) {
- if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
- HasInlineAsm |= CI->isInlineAsm() && &CB != ASan.LocalDynamicShadow;
- HasReturnsTwiceCall |= CI->canReturnTwice();
- }
- }
-
- // ---------------------- Helpers.
- void initializeCallbacks(Module &M);
-
- // Copies bytes from ShadowBytes into shadow memory for indexes where
- // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that
- // ShadowBytes[i] is constantly zero and doesn't need to be overwritten.
- void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
- IRBuilder<> &IRB, Value *ShadowBase);
- void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
- size_t Begin, size_t End, IRBuilder<> &IRB,
- Value *ShadowBase);
- void copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes, size_t Begin,
- size_t End, IRBuilder<> &IRB, Value *ShadowBase);
-
- void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
-
- Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
- bool Dynamic);
- PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue,
- Instruction *ThenTerm, Value *ValueIfFalse);
-};
-
-} // end anonymous namespace
-
-void LocationMetadata::parse(MDNode *MDN) {
- assert(MDN->getNumOperands() == 3);
- MDString *DIFilename = cast<MDString>(MDN->getOperand(0));
- Filename = DIFilename->getString();
- LineNo = mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
- ColumnNo =
- mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
-}
-
-// FIXME: It would be cleaner to attach the relevant metadata to the globals we
-// want to sanitize and read it on each pass over a function, instead of
-// reading module-level metadata up front.
-GlobalsMetadata::GlobalsMetadata(Module &M) {
- NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
- if (!Globals)
- return;
- for (auto MDN : Globals->operands()) {
- // Metadata node contains the global and the fields of "Entry".
- assert(MDN->getNumOperands() == 5);
- auto *V = mdconst::extract_or_null<Constant>(MDN->getOperand(0));
- // The optimizer may optimize away a global entirely.
- if (!V)
- continue;
- auto *StrippedV = V->stripPointerCasts();
- auto *GV = dyn_cast<GlobalVariable>(StrippedV);
- if (!GV)
- continue;
- // We can already have an entry for GV if it was merged with another
- // global.
- Entry &E = Entries[GV];
- if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1)))
- E.SourceLoc.parse(Loc);
- if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2)))
- E.Name = Name->getString();
- ConstantInt *IsDynInit = mdconst::extract<ConstantInt>(MDN->getOperand(3));
- E.IsDynInit |= IsDynInit->isOne();
- ConstantInt *IsExcluded =
- mdconst::extract<ConstantInt>(MDN->getOperand(4));
- E.IsExcluded |= IsExcluded->isOne();
- }
-}
-
-AnalysisKey ASanGlobalsMetadataAnalysis::Key;
-
-GlobalsMetadata ASanGlobalsMetadataAnalysis::run(Module &M,
- ModuleAnalysisManager &AM) {
- return GlobalsMetadata(M);
-}
-
-AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover,
- bool UseAfterScope)
- : CompileKernel(CompileKernel), Recover(Recover),
- UseAfterScope(UseAfterScope) {}
-
-PreservedAnalyses AddressSanitizerPass::run(Function &F,
- AnalysisManager<Function> &AM) {
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- Module &M = *F.getParent();
- if (auto *R = MAMProxy.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) {
- const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
- AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope);
- if (Sanitizer.instrumentFunction(F, TLI))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
- }
-
- report_fatal_error(
- "The ASanGlobalsMetadataAnalysis is required to run before "
- "AddressSanitizer can run");
- return PreservedAnalyses::all();
-}
-
-ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel,
- bool Recover,
- bool UseGlobalGC,
- bool UseOdrIndicator)
- : CompileKernel(CompileKernel), Recover(Recover), UseGlobalGC(UseGlobalGC),
- UseOdrIndicator(UseOdrIndicator) {}
-
-PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M,
- AnalysisManager<Module> &AM) {
- GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M);
- ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover,
- UseGlobalGC, UseOdrIndicator);
- if (Sanitizer.instrumentModule(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-INITIALIZE_PASS(ASanGlobalsMetadataWrapperPass, "asan-globals-md",
- "Read metadata to mark which globals should be instrumented "
- "when running ASan.",
- false, true)
-
-char AddressSanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(
- AddressSanitizerLegacyPass, "asan",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(ASanGlobalsMetadataWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
- AddressSanitizerLegacyPass, "asan",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
- false)
-
-FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
- bool Recover,
- bool UseAfterScope) {
- assert(!CompileKernel || Recover);
- return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope);
-}
-
-char ModuleAddressSanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS(
- ModuleAddressSanitizerLegacyPass, "asan-module",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
- "ModulePass",
- false, false)
-
-ModulePass *llvm::createModuleAddressSanitizerLegacyPassPass(
- bool CompileKernel, bool Recover, bool UseGlobalsGC, bool UseOdrIndicator) {
- assert(!CompileKernel || Recover);
- return new ModuleAddressSanitizerLegacyPass(CompileKernel, Recover,
- UseGlobalsGC, UseOdrIndicator);
-}
-
-static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
- size_t Res = countTrailingZeros(TypeSize / 8);
- assert(Res < kNumberOfAccessSizes);
- return Res;
-}
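Illustrative aside (not from this file): the index above is log2 of the access size in bytes, which selects a slot in the callback arrays indexed by log2(AccessSize) (1-byte accesses map to slot 0, 16-byte accesses to slot 4). A portable stand-in:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Count trailing zero bits of the access size in bytes (all interesting sizes
// are powers of two), mirroring countTrailingZeros(TypeSize / 8).
static size_t typeSizeToSizeIndex(uint32_t TypeSizeInBits) {
  uint32_t Bytes = TypeSizeInBits / 8;
  size_t Index = 0;
  while ((Bytes & 1) == 0) {
    Bytes >>= 1;
    ++Index;
  }
  return Index;
}

int main() {
  assert(typeSizeToSizeIndex(8) == 0);    // 1-byte access
  assert(typeSizeToSizeIndex(32) == 2);   // 4-byte access
  assert(typeSizeToSizeIndex(128) == 4);  // 16-byte access
  return 0;
}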
-
-/// Create a global describing a source location.
-static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
- LocationMetadata MD) {
- Constant *LocData[] = {
- createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix),
- ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
- ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
- };
- auto LocStruct = ConstantStruct::getAnon(LocData);
- auto GV = new GlobalVariable(M, LocStruct->getType(), true,
- GlobalValue::PrivateLinkage, LocStruct,
- kAsanGenPrefix);
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- return GV;
-}
-
-/// Check if \p G has been created by a trusted compiler pass.
-static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
- // Do not instrument @llvm.global_ctors, @llvm.used, etc.
- if (G->getName().startswith("llvm."))
- return true;
-
- // Do not instrument asan globals.
- if (G->getName().startswith(kAsanGenPrefix) ||
- G->getName().startswith(kSanCovGenPrefix) ||
- G->getName().startswith(kODRGenPrefix))
- return true;
-
- // Do not instrument gcov counter arrays.
- if (G->getName() == "__llvm_gcov_ctr")
- return true;
-
- return false;
-}
-
-Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
- // Shadow >> scale
- Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
- if (Mapping.Offset == 0) return Shadow;
- // (Shadow >> scale) | offset
- Value *ShadowBase;
- if (LocalDynamicShadow)
- ShadowBase = LocalDynamicShadow;
- else
- ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset);
- if (Mapping.OrShadowOffset)
- return IRB.CreateOr(Shadow, ShadowBase);
- else
- return IRB.CreateAdd(Shadow, ShadowBase);
-}
-
-// Instrument memset/memmove/memcpy
-void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
- IRBuilder<> IRB(MI);
- if (isa<MemTransferInst>(MI)) {
- IRB.CreateCall(
- isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- } else if (isa<MemSetInst>(MI)) {
- IRB.CreateCall(
- AsanMemset,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- }
- MI->eraseFromParent();
-}
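Illustrative aside (not from this file): conceptually the rewrite above swaps the plain intrinsics for their sanitizer-runtime counterparts, which check the whole accessed range against shadow memory before doing the work, so memcpy(dst, src, n) becomes __asan_memcpy(dst, src, n), and likewise for memmove and memset. The declarations below only show the rough shape of that runtime interface; the exact parameter types are an assumption here, not something this file defines.

#include <cstddef>

// Runtime entry points targeted by the instrumented calls (types assumed).
extern "C" void *__asan_memcpy(void *dst, const void *src, size_t size);
extern "C" void *__asan_memmove(void *dst, const void *src, size_t size);
extern "C" void *__asan_memset(void *dst, int value, size_t size);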
-
-/// Check if we want (and can) handle this alloca.
-bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
- auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI);
-
- if (PreviouslySeenAllocaInfo != ProcessedAllocas.end())
- return PreviouslySeenAllocaInfo->getSecond();
-
- bool IsInteresting =
- (AI.getAllocatedType()->isSized() &&
- // alloca() may be called with 0 size, ignore it.
- ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(AI) > 0) &&
- // We are only interested in allocas not promotable to registers.
- // Promotable allocas are common under -O0.
- (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
- // inalloca allocas are not treated as static, and we don't want
- // dynamic alloca instrumentation for them as well.
- !AI.isUsedWithInAlloca() &&
- // swifterror allocas are register promoted by ISel
- !AI.isSwiftError());
-
- ProcessedAllocas[&AI] = IsInteresting;
- return IsInteresting;
-}
-
-bool AddressSanitizer::ignoreAccess(Value *Ptr) {
- // Do not instrument accesses from different address spaces; we cannot deal
- // with them.
- Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return true;
-
- // Ignore swifterror addresses.
- // swifterror memory addresses are mem2reg promoted by instruction
- // selection. As such they cannot have regular uses like an instrumentation
- // function and it makes no sense to track them as memory.
- if (Ptr->isSwiftError())
- return true;
-
- // Treat memory accesses to promotable allocas as non-interesting since they
- // will not cause memory violations. This greatly speeds up the instrumented
- // executable at -O0.
- if (auto AI = dyn_cast_or_null<AllocaInst>(Ptr))
- if (ClSkipPromotableAllocas && !isInterestingAlloca(*AI))
- return true;
-
- return false;
-}
-
-void AddressSanitizer::getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
- // Skip memory accesses inserted by another instrumentation.
- if (I->hasMetadata("nosanitize"))
- return;
-
- // Do not instrument the load fetching the dynamic shadow address.
- if (LocalDynamicShadow == I)
- return;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
- LI->getType(), LI->getAlign());
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
- SI->getValueOperand()->getType(), SI->getAlign());
- } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
- return;
- Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
- RMW->getValOperand()->getType(), None);
- } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
- return;
- Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
- XCHG->getCompareOperand()->getType(), None);
- } else if (auto CI = dyn_cast<CallInst>(I)) {
- auto *F = CI->getCalledFunction();
- if (F && (F->getName().startswith("llvm.masked.load.") ||
- F->getName().startswith("llvm.masked.store."))) {
- bool IsWrite = F->getName().startswith("llvm.masked.store.");
- // Masked store has an initial operand for the value.
- unsigned OpOffset = IsWrite ? 1 : 0;
- if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads)
- return;
-
- auto BasePtr = CI->getOperand(OpOffset);
- if (ignoreAccess(BasePtr))
- return;
- auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
- MaybeAlign Alignment = Align(1);
- // Otherwise no alignment guarantees. We probably got Undef.
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
- Alignment = Op->getMaybeAlignValue();
- Value *Mask = CI->getOperand(2 + OpOffset);
- Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
- } else {
- for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
- if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
- ignoreAccess(CI->getArgOperand(ArgNo)))
- continue;
- Type *Ty = CI->getParamByValType(ArgNo);
- Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
- }
- }
- }
-}
-
-static bool isPointerOperand(Value *V) {
- return V->getType()->isPointerTy() || isa<PtrToIntInst>(V);
-}
-
-// This is a rough heuristic; it may cause both false positives and
-// false negatives. The proper implementation requires cooperation with
-// the frontend.
-static bool isInterestingPointerComparison(Instruction *I) {
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) {
- if (!Cmp->isRelational())
- return false;
- } else {
- return false;
- }
- return isPointerOperand(I->getOperand(0)) &&
- isPointerOperand(I->getOperand(1));
-}
-
-// This is a rough heuristic; it may cause both false positives and
-// false negatives. The proper implementation requires cooperation with
-// the frontend.
-static bool isInterestingPointerSubtraction(Instruction *I) {
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- if (BO->getOpcode() != Instruction::Sub)
- return false;
- } else {
- return false;
- }
- return isPointerOperand(I->getOperand(0)) &&
- isPointerOperand(I->getOperand(1));
-}
-
-bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
- // If a global variable does not have dynamic initialization we don't
- // have to instrument it. However, if a global does not have an initializer
- // at all, we assume it has a dynamic initializer (in another TU).
- //
- // FIXME: Metadata should be attached directly to the global instead
- // of being added to llvm.asan.globals.
- return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
-}
-
-void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
- Instruction *I) {
- IRBuilder<> IRB(I);
- FunctionCallee F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
- Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
- for (Value *&i : Param) {
- if (i->getType()->isPointerTy())
- i = IRB.CreatePointerCast(i, IntptrTy);
- }
- IRB.CreateCall(F, Param);
-}
-
-static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
- Instruction *InsertBefore, Value *Addr,
- MaybeAlign Alignment, unsigned Granularity,
- uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls,
- uint32_t Exp) {
- // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check
- // if the data is properly aligned.
- if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
- TypeSize == 128) &&
- (!Alignment || *Alignment >= Granularity || *Alignment >= TypeSize / 8))
- return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
- nullptr, UseCalls, Exp);
- Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
- IsWrite, nullptr, UseCalls, Exp);
-}
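Illustrative aside (not from this file): the dispatch above, restated as a standalone predicate, assuming Granularity is the shadow granularity (1 << Mapping.Scale): only 1-, 2-, 4-, 8- or 16-byte accesses whose alignment is unspecified, at least the granularity, or at least the access size take the single-shadow-byte fast path; everything else goes through instrumentUnusualSizeOrAlignment.

#include <cstdint>

// Standalone restatement of the fast-path condition above; an alignment of 0
// means "no explicit alignment was recorded".
static bool usesSingleShadowCheck(uint32_t TypeSizeInBits,
                                  uint64_t AlignmentOrZero,
                                  unsigned Granularity) {
  bool PowerOfTwoSize = TypeSizeInBits == 8 || TypeSizeInBits == 16 ||
                        TypeSizeInBits == 32 || TypeSizeInBits == 64 ||
                        TypeSizeInBits == 128;
  bool AlignedEnough = AlignmentOrZero == 0 ||
                       AlignmentOrZero >= Granularity ||
                       AlignmentOrZero >= TypeSizeInBits / 8;
  return PowerOfTwoSize && AlignedEnough;
}

int main() {
  bool Fast = usesSingleShadowCheck(64, 8, 8);  // naturally aligned 8-byte load
  bool Slow = usesSingleShadowCheck(24, 1, 8);  // odd-sized access
  return (Fast && !Slow) ? 0 : 1;
}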
-
-static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
- const DataLayout &DL, Type *IntptrTy,
- Value *Mask, Instruction *I,
- Value *Addr, MaybeAlign Alignment,
- unsigned Granularity, uint32_t TypeSize,
- bool IsWrite, Value *SizeArgument,
- bool UseCalls, uint32_t Exp) {
- auto *VTy = cast<FixedVectorType>(
- cast<PointerType>(Addr->getType())->getElementType());
- uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
- unsigned Num = VTy->getNumElements();
- auto Zero = ConstantInt::get(IntptrTy, 0);
- for (unsigned Idx = 0; Idx < Num; ++Idx) {
- Value *InstrumentedAddress = nullptr;
- Instruction *InsertBefore = I;
- if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
- // dyn_cast as we might get UndefValue
- if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
- if (Masked->isZero())
- // Mask is constant false, so no instrumentation needed.
- continue;
- // If we have a true or undef value, fall through to doInstrumentAddress
- // with InsertBefore == I
- }
- } else {
- IRBuilder<> IRB(I);
- Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
- Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
- InsertBefore = ThenTerm;
- }
-
- IRBuilder<> IRB(InsertBefore);
- InstrumentedAddress =
- IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
- doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment,
- Granularity, ElemTypeSize, IsWrite, SizeArgument,
- UseCalls, Exp);
- }
-}
-
-void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
- InterestingMemoryOperand &O, bool UseCalls,
- const DataLayout &DL) {
- Value *Addr = O.getPtr();
-
- // Optimization experiments.
- // The experiments can be used to evaluate potential optimizations that remove
- // instrumentation (assess false negatives). Instead of completely removing
- // some instrumentation, you set Exp to a non-zero value (mask of optimization
- // experiments that want to remove instrumentation of this instruction).
- // If Exp is non-zero, this pass will emit special calls into runtime
- // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls
- // make the runtime terminate the program in a special way (with a different
- // exit status). Then you run the new compiler on a buggy corpus, collect
- // the special terminations (ideally, you don't see them at all -- no false
- // negatives) and make the decision on the optimization.
- uint32_t Exp = ClForceExperiment;
-
- if (ClOpt && ClOptGlobals) {
- // If initialization order checking is disabled, a simple access to a
- // dynamically initialized global is always valid.
+ if (!AI) {
+ HasUntracedLifetimeIntrinsic = true;
+ return;
+ }
+ // We're interested only in allocas we can handle.
+ if (!ASan.isInterestingAlloca(*AI))
+ return;
+ bool DoPoison = (ID == Intrinsic::lifetime_end);
+ AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
+ if (AI->isStaticAlloca())
+ StaticAllocaPoisonCallVec.push_back(APC);
+ else if (ClInstrumentDynamicAllocas)
+ DynamicAllocaPoisonCallVec.push_back(APC);
+ }
+
+ void visitCallBase(CallBase &CB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
+ HasInlineAsm |= CI->isInlineAsm() && &CB != ASan.LocalDynamicShadow;
+ HasReturnsTwiceCall |= CI->canReturnTwice();
+ }
+ }
+
+ // ---------------------- Helpers.
+ void initializeCallbacks(Module &M);
+
+ // Copies bytes from ShadowBytes into shadow memory for indexes where
+ // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that
+ // ShadowBytes[i] is constantly zero and doesn't need to be overwritten.
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase);
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End, IRBuilder<> &IRB,
+ Value *ShadowBase);
+ void copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes, size_t Begin,
+ size_t End, IRBuilder<> &IRB, Value *ShadowBase);
+
+ void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
+
+ Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
+ bool Dynamic);
+ PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue,
+ Instruction *ThenTerm, Value *ValueIfFalse);
+};
+
+} // end anonymous namespace
+
+void LocationMetadata::parse(MDNode *MDN) {
+ assert(MDN->getNumOperands() == 3);
+ MDString *DIFilename = cast<MDString>(MDN->getOperand(0));
+ Filename = DIFilename->getString();
+ LineNo = mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
+ ColumnNo =
+ mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
+}
+
+// FIXME: It would be cleaner to attach the relevant metadata directly to the
+// globals we want to sanitize and read it on each pass over a function,
+// instead of reading module-level metadata up front.
+GlobalsMetadata::GlobalsMetadata(Module &M) {
+ NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
+ if (!Globals)
+ return;
+ for (auto MDN : Globals->operands()) {
+ // Metadata node contains the global and the fields of "Entry".
+ assert(MDN->getNumOperands() == 5);
+ auto *V = mdconst::extract_or_null<Constant>(MDN->getOperand(0));
+ // The optimizer may optimize away a global entirely.
+ if (!V)
+ continue;
+ auto *StrippedV = V->stripPointerCasts();
+ auto *GV = dyn_cast<GlobalVariable>(StrippedV);
+ if (!GV)
+ continue;
+ // We can already have an entry for GV if it was merged with another
+ // global.
+ Entry &E = Entries[GV];
+ if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1)))
+ E.SourceLoc.parse(Loc);
+ if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2)))
+ E.Name = Name->getString();
+ ConstantInt *IsDynInit = mdconst::extract<ConstantInt>(MDN->getOperand(3));
+ E.IsDynInit |= IsDynInit->isOne();
+ ConstantInt *IsExcluded =
+ mdconst::extract<ConstantInt>(MDN->getOperand(4));
+ E.IsExcluded |= IsExcluded->isOne();
+ }
+}
+
+AnalysisKey ASanGlobalsMetadataAnalysis::Key;
+
+GlobalsMetadata ASanGlobalsMetadataAnalysis::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return GlobalsMetadata(M);
+}
+
+AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover,
+ bool UseAfterScope)
+ : CompileKernel(CompileKernel), Recover(Recover),
+ UseAfterScope(UseAfterScope) {}
+
+PreservedAnalyses AddressSanitizerPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ Module &M = *F.getParent();
+ if (auto *R = MAMProxy.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) {
+ const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope);
+ if (Sanitizer.instrumentFunction(F, TLI))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+ }
+
+ report_fatal_error(
+ "The ASanGlobalsMetadataAnalysis is required to run before "
+ "AddressSanitizer can run");
+ return PreservedAnalyses::all();
+}
+
+ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel,
+ bool Recover,
+ bool UseGlobalGC,
+ bool UseOdrIndicator)
+ : CompileKernel(CompileKernel), Recover(Recover), UseGlobalGC(UseGlobalGC),
+ UseOdrIndicator(UseOdrIndicator) {}
+
+PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M,
+ AnalysisManager<Module> &AM) {
+ GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M);
+ ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover,
+ UseGlobalGC, UseOdrIndicator);
+ if (Sanitizer.instrumentModule(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+INITIALIZE_PASS(ASanGlobalsMetadataWrapperPass, "asan-globals-md",
+ "Read metadata to mark which globals should be instrumented "
+ "when running ASan.",
+ false, true)
+
+char AddressSanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+ AddressSanitizerLegacyPass, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(ASanGlobalsMetadataWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ AddressSanitizerLegacyPass, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
+
+FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
+ bool Recover,
+ bool UseAfterScope) {
+ assert(!CompileKernel || Recover);
+ return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope);
+}
+
+char ModuleAddressSanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS(
+ ModuleAddressSanitizerLegacyPass, "asan-module",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
+ "ModulePass",
+ false, false)
+
+ModulePass *llvm::createModuleAddressSanitizerLegacyPassPass(
+ bool CompileKernel, bool Recover, bool UseGlobalsGC, bool UseOdrIndicator) {
+ assert(!CompileKernel || Recover);
+ return new ModuleAddressSanitizerLegacyPass(CompileKernel, Recover,
+ UseGlobalsGC, UseOdrIndicator);
+}
+
+static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
+ size_t Res = countTrailingZeros(TypeSize / 8);
+ assert(Res < kNumberOfAccessSizes);
+ return Res;
+}
+
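
TypeSizeToSizeIndex maps an access size in bits to the index used to select among the size-specific ASan callbacks (1-, 2-, 4-, 8-, and 16-byte variants). A minimal standalone sketch of that mapping, assuming the usual 1- through 16-byte access sizes and using __builtin_ctz as a stand-in for llvm::countTrailingZeros:

// Illustrative sketch only (not part of the pass).
#include <cassert>
#include <cstdint>

static size_t sizeIndexForBits(uint32_t TypeSizeInBits) {
  return __builtin_ctz(TypeSizeInBits / 8); // bytes 1,2,4,8,16 -> 0,1,2,3,4
}

int main() {
  assert(sizeIndexForBits(8) == 0);   // 1-byte access -> *1 callback
  assert(sizeIndexForBits(32) == 2);  // 4-byte access -> *4 callback
  assert(sizeIndexForBits(128) == 4); // 16-byte access -> *16 callback
  return 0;
}
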
+/// Create a global describing a source location.
+static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
+ LocationMetadata MD) {
+ Constant *LocData[] = {
+ createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix),
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
+ };
+ auto LocStruct = ConstantStruct::getAnon(LocData);
+ auto GV = new GlobalVariable(M, LocStruct->getType(), true,
+ GlobalValue::PrivateLinkage, LocStruct,
+ kAsanGenPrefix);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ return GV;
+}
+
+/// Check if \p G has been created by a trusted compiler pass.
+static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
+ // Do not instrument @llvm.global_ctors, @llvm.used, etc.
+ if (G->getName().startswith("llvm."))
+ return true;
+
+ // Do not instrument asan globals.
+ if (G->getName().startswith(kAsanGenPrefix) ||
+ G->getName().startswith(kSanCovGenPrefix) ||
+ G->getName().startswith(kODRGenPrefix))
+ return true;
+
+ // Do not instrument gcov counter arrays.
+ if (G->getName() == "__llvm_gcov_ctr")
+ return true;
+
+ return false;
+}
+
+Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
+ // Shadow >> scale
+ Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+ if (Mapping.Offset == 0) return Shadow;
+ // (Shadow >> scale) | offset
+ Value *ShadowBase;
+ if (LocalDynamicShadow)
+ ShadowBase = LocalDynamicShadow;
+ else
+ ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset);
+ if (Mapping.OrShadowOffset)
+ return IRB.CreateOr(Shadow, ShadowBase);
+ else
+ return IRB.CreateAdd(Shadow, ShadowBase);
+}
+
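
memToShadow computes the classic ASan shadow address: shift the application address right by Mapping.Scale, then add (or OR in) Mapping.Offset, or the dynamically loaded shadow base when one is used. A minimal numeric sketch, assuming the common x86-64 Linux defaults of Scale = 3 and Offset = 0x7fff8000 (assumed values, not read from this file):

// Minimal model of the mapping emitted above; the pass may emit OR instead
// of ADD when Mapping.OrShadowOffset is set.
#include <cstdint>
#include <cstdio>

static uint64_t memToShadowModel(uint64_t Addr, unsigned Scale, uint64_t Offset) {
  return (Addr >> Scale) + Offset;
}

int main() {
  uint64_t Addr = 0x602000000010ULL;
  printf("shadow(0x%llx) = 0x%llx\n", (unsigned long long)Addr,
         (unsigned long long)memToShadowModel(Addr, 3, 0x7fff8000ULL));
  return 0;
}
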
+// Instrument memset/memmove/memcpy
+void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> IRB(MI);
+ if (isa<MemTransferInst>(MI)) {
+ IRB.CreateCall(
+ isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ } else if (isa<MemSetInst>(MI)) {
+ IRB.CreateCall(
+ AsanMemset,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ }
+ MI->eraseFromParent();
+}
+
+/// Check if we want (and can) handle this alloca.
+bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+ auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI);
+
+ if (PreviouslySeenAllocaInfo != ProcessedAllocas.end())
+ return PreviouslySeenAllocaInfo->getSecond();
+
+ bool IsInteresting =
+ (AI.getAllocatedType()->isSized() &&
+ // alloca() may be called with 0 size, ignore it.
+ ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(AI) > 0) &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
+
+ ProcessedAllocas[&AI] = IsInteresting;
+ return IsInteresting;
+}
+
+bool AddressSanitizer::ignoreAccess(Value *Ptr) {
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return true;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (Ptr->isSwiftError())
+ return true;
+
+ // Treat memory accesses to promotable allocas as non-interesting since they
+ // will not cause memory violations. This greatly speeds up the instrumented
+ // executable at -O0.
+ if (auto AI = dyn_cast_or_null<AllocaInst>(Ptr))
+ if (ClSkipPromotableAllocas && !isInterestingAlloca(*AI))
+ return true;
+
+ return false;
+}
+
+void AddressSanitizer::getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
+ // Skip memory accesses inserted by another instrumentation.
+ if (I->hasMetadata("nosanitize"))
+ return;
+
+ // Do not instrument the load fetching the dynamic shadow address.
+ if (LocalDynamicShadow == I)
+ return;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
+ LI->getType(), LI->getAlign());
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
+ SI->getValueOperand()->getType(), SI->getAlign());
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
+ RMW->getValOperand()->getType(), None);
+ } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+ XCHG->getCompareOperand()->getType(), None);
+ } else if (auto CI = dyn_cast<CallInst>(I)) {
+ auto *F = CI->getCalledFunction();
+ if (F && (F->getName().startswith("llvm.masked.load.") ||
+ F->getName().startswith("llvm.masked.store."))) {
+ bool IsWrite = F->getName().startswith("llvm.masked.store.");
+ // Masked store has an initial operand for the value.
+ unsigned OpOffset = IsWrite ? 1 : 0;
+ if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads)
+ return;
+
+ auto BasePtr = CI->getOperand(OpOffset);
+ if (ignoreAccess(BasePtr))
+ return;
+ auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+ MaybeAlign Alignment = Align(1);
+ // Otherwise no alignment guarantees. We probably got Undef.
+ if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+ Alignment = Op->getMaybeAlignValue();
+ Value *Mask = CI->getOperand(2 + OpOffset);
+ Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
+ } else {
+ for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
+ ignoreAccess(CI->getArgOperand(ArgNo)))
+ continue;
+ Type *Ty = CI->getParamByValType(ArgNo);
+ Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+ }
+ }
+ }
+}
+
+static bool isPointerOperand(Value *V) {
+ return V->getType()->isPointerTy() || isa<PtrToIntInst>(V);
+}
+
+// This is a rough heuristic; it may cause both false positives and
+// false negatives. The proper implementation requires cooperation with
+// the frontend.
+static bool isInterestingPointerComparison(Instruction *I) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) {
+ if (!Cmp->isRelational())
+ return false;
+ } else {
+ return false;
+ }
+ return isPointerOperand(I->getOperand(0)) &&
+ isPointerOperand(I->getOperand(1));
+}
+
+// This is a rough heuristic; it may cause both false positives and
+// false negatives. The proper implementation requires cooperation with
+// the frontend.
+static bool isInterestingPointerSubtraction(Instruction *I) {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() != Instruction::Sub)
+ return false;
+ } else {
+ return false;
+ }
+ return isPointerOperand(I->getOperand(0)) &&
+ isPointerOperand(I->getOperand(1));
+}
+
+bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
+ // If a global variable does not have dynamic initialization we don't
+ // have to instrument it. However, if a global does not have an initializer
+ // at all, we assume it has a dynamic initializer (in another TU).
+ //
+ // FIXME: Metadata should be attached directly to the global instead
+ // of being added to llvm.asan.globals.
+ return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
+}
+
+void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
+ Instruction *I) {
+ IRBuilder<> IRB(I);
+ FunctionCallee F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
+ Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
+ for (Value *&i : Param) {
+ if (i->getType()->isPointerTy())
+ i = IRB.CreatePointerCast(i, IntptrTy);
+ }
+ IRB.CreateCall(F, Param);
+}
+
+static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
+ MaybeAlign Alignment, unsigned Granularity,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
+ // Instrument a 1-, 2-, 4-, 8-, or 16-byte access with one check
+ // if the data is properly aligned.
+ if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
+ TypeSize == 128) &&
+ (!Alignment || *Alignment >= Granularity || *Alignment >= TypeSize / 8))
+ return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
+ nullptr, UseCalls, Exp);
+ Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
+ IsWrite, nullptr, UseCalls, Exp);
+}
+
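
doInstrumentAddress takes the single-check fast path only for 1-, 2-, 4-, 8-, or 16-byte accesses whose known alignment rules out straddling a shadow granule; anything else goes through instrumentUnusualSizeOrAlignment. A hedged sketch of that predicate, with Alignment == 0 standing in for an unknown MaybeAlign:

// Sketch of the condition used above, not the pass itself.
#include <cassert>
#include <cstdint>

static bool singleCheckSuffices(uint32_t TypeSizeBits, uint64_t Alignment,
                                unsigned Granularity) {
  bool SupportedSize = TypeSizeBits == 8 || TypeSizeBits == 16 ||
                       TypeSizeBits == 32 || TypeSizeBits == 64 ||
                       TypeSizeBits == 128;
  return SupportedSize && (Alignment == 0 || Alignment >= Granularity ||
                           Alignment >= TypeSizeBits / 8);
}

int main() {
  assert(singleCheckSuffices(32, 4, 8));  // aligned 4-byte access: one check
  assert(!singleCheckSuffices(80, 8, 8)); // 10-byte access: unusual-size path
  return 0;
}
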
+static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
+ const DataLayout &DL, Type *IntptrTy,
+ Value *Mask, Instruction *I,
+ Value *Addr, MaybeAlign Alignment,
+ unsigned Granularity, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument,
+ bool UseCalls, uint32_t Exp) {
+ auto *VTy = cast<FixedVectorType>(
+ cast<PointerType>(Addr->getType())->getElementType());
+ uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
+ unsigned Num = VTy->getNumElements();
+ auto Zero = ConstantInt::get(IntptrTy, 0);
+ for (unsigned Idx = 0; Idx < Num; ++Idx) {
+ Value *InstrumentedAddress = nullptr;
+ Instruction *InsertBefore = I;
+ if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
+ // dyn_cast as we might get UndefValue
+ if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
+ if (Masked->isZero())
+ // Mask is constant false, so no instrumentation needed.
+ continue;
+ // If we have a true or undef value, fall through to doInstrumentAddress
+ // with InsertBefore == I
+ }
+ } else {
+ IRBuilder<> IRB(I);
+ Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
+ Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+ InsertBefore = ThenTerm;
+ }
+
+ IRBuilder<> IRB(InsertBefore);
+ InstrumentedAddress =
+ IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
+ doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment,
+ Granularity, ElemTypeSize, IsWrite, SizeArgument,
+ UseCalls, Exp);
+ }
+}
+
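
instrumentMaskedLoadOrStore checks each vector lane on its own: lanes whose mask bit is constant false are skipped, and for non-constant masks the check is placed behind a branch on the extracted mask element. A scalar sketch of that per-lane logic, where checkLane is a hypothetical stand-in for the shadow check emitted via doInstrumentAddress:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the per-element shadow check; here it only logs.
static void checkLane(const void *Addr, size_t Bytes) {
  printf("check %zu byte(s) at 0x%llx\n", Bytes,
         (unsigned long long)(uintptr_t)Addr);
}

template <size_t N>
static void checkMaskedAccess(const bool (&Mask)[N], const int (&Vec)[N]) {
  for (size_t i = 0; i < N; ++i)
    if (Mask[i])                       // constant-false lanes are never checked
      checkLane(&Vec[i], sizeof(int));
}

int main() {
  bool Mask[4] = {true, false, true, true};
  int Vec[4] = {1, 2, 3, 4};
  checkMaskedAccess(Mask, Vec); // lane 1 is skipped, lanes 0, 2, 3 are checked
  return 0;
}
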
+void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
+ InterestingMemoryOperand &O, bool UseCalls,
+ const DataLayout &DL) {
+ Value *Addr = O.getPtr();
+
+ // Optimization experiments.
+ // The experiments can be used to evaluate potential optimizations that remove
+ // instrumentation (assess false negatives). Instead of completely removing
+ // some instrumentation, you set Exp to a non-zero value (mask of optimization
+ // experiments that want to remove instrumentation of this instruction).
+ // If Exp is non-zero, this pass will emit special calls into runtime
+ // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls
+ // make the runtime terminate the program in a special way (with a different
+ // exit status). Then you run the new compiler on a buggy corpus, collect
+ // the special terminations (ideally, you don't see them at all -- no false
+ // negatives) and make the decision on the optimization.
+ uint32_t Exp = ClForceExperiment;
+
+ if (ClOpt && ClOptGlobals) {
+ // If initialization order checking is disabled, a simple access to a
+ // dynamically initialized global is always valid.
GlobalVariable *G = dyn_cast<GlobalVariable>(getUnderlyingObject(Addr));
- if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
- isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
- NumOptimizedAccessesToGlobalVar++;
- return;
- }
- }
-
- if (ClOpt && ClOptStack) {
- // A direct inbounds access to a stack variable is always valid.
+ if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
+ isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
+ NumOptimizedAccessesToGlobalVar++;
+ return;
+ }
+ }
+
+ if (ClOpt && ClOptStack) {
+ // A direct inbounds access to a stack variable is always valid.
if (isa<AllocaInst>(getUnderlyingObject(Addr)) &&
- isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
- NumOptimizedAccessesToStackVar++;
- return;
- }
- }
-
- if (O.IsWrite)
- NumInstrumentedWrites++;
- else
- NumInstrumentedReads++;
-
- unsigned Granularity = 1 << Mapping.Scale;
- if (O.MaybeMask) {
- instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(),
- Addr, O.Alignment, Granularity, O.TypeSize,
- O.IsWrite, nullptr, UseCalls, Exp);
- } else {
- doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment,
- Granularity, O.TypeSize, O.IsWrite, nullptr, UseCalls,
- Exp);
- }
-}
-
-Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
- Value *Addr, bool IsWrite,
- size_t AccessSizeIndex,
- Value *SizeArgument,
- uint32_t Exp) {
- IRBuilder<> IRB(InsertBefore);
- Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
- CallInst *Call = nullptr;
- if (SizeArgument) {
- if (Exp == 0)
- Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0],
- {Addr, SizeArgument});
- else
- Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1],
- {Addr, SizeArgument, ExpVal});
- } else {
- if (Exp == 0)
- Call =
- IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
- else
- Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
- {Addr, ExpVal});
- }
-
- Call->setCannotMerge();
- return Call;
-}
-
-Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
- Value *ShadowValue,
- uint32_t TypeSize) {
- size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
- // Addr & (Granularity - 1)
- Value *LastAccessedByte =
- IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
- // (Addr & (Granularity - 1)) + size - 1
- if (TypeSize / 8 > 1)
- LastAccessedByte = IRB.CreateAdd(
- LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
- // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
- LastAccessedByte =
- IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
- // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
- return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
-}
-
-void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
- Instruction *InsertBefore, Value *Addr,
- uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls,
- uint32_t Exp) {
- bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
-
- IRBuilder<> IRB(InsertBefore);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
-
- if (UseCalls) {
- if (Exp == 0)
- IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
- AddrLong);
- else
- IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
- {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
- return;
- }
-
- if (IsMyriad) {
- // Strip the cache bit and do range check.
- // AddrLong &= ~kMyriadCacheBitMask32
- AddrLong = IRB.CreateAnd(AddrLong, ~kMyriadCacheBitMask32);
- // Tag = AddrLong >> kMyriadTagShift
- Value *Tag = IRB.CreateLShr(AddrLong, kMyriadTagShift);
- // Tag == kMyriadDDRTag
- Value *TagCheck =
- IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
-
- Instruction *TagCheckTerm =
- SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false,
- MDBuilder(*C).createBranchWeights(1, 100000));
- assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
- IRB.SetInsertPoint(TagCheckTerm);
- InsertBefore = TagCheckTerm;
- }
-
- Type *ShadowTy =
- IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
- Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
- Value *ShadowPtr = memToShadow(AddrLong, IRB);
- Value *CmpVal = Constant::getNullValue(ShadowTy);
- Value *ShadowValue =
- IRB.CreateLoad(ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
-
- Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
- size_t Granularity = 1ULL << Mapping.Scale;
- Instruction *CrashTerm = nullptr;
-
- if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
- // We use branch weights for the slow path check, to indicate that the slow
- // path is rarely taken. This seems to be the case for SPEC benchmarks.
- Instruction *CheckTerm = SplitBlockAndInsertIfThen(
- Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
- assert(cast<BranchInst>(CheckTerm)->isUnconditional());
- BasicBlock *NextBB = CheckTerm->getSuccessor(0);
- IRB.SetInsertPoint(CheckTerm);
- Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
- if (Recover) {
- CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false);
- } else {
- BasicBlock *CrashBlock =
- BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
- CrashTerm = new UnreachableInst(*C, CrashBlock);
- BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
- ReplaceInstWithInst(CheckTerm, NewTerm);
- }
- } else {
- CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);
- }
-
- Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
- AccessSizeIndex, SizeArgument, Exp);
- Crash->setDebugLoc(OrigIns->getDebugLoc());
-}
-
-// Instrument unusual size or unusual alignment.
-// We cannot do it with a single check, so we do a 1-byte check for the first
-// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
-// to report the actual access size.
-void AddressSanitizer::instrumentUnusualSizeOrAlignment(
- Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize,
- bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
- IRBuilder<> IRB(InsertBefore);
- Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- if (UseCalls) {
- if (Exp == 0)
- IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
- {AddrLong, Size});
- else
- IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
- {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
- } else {
- Value *LastByte = IRB.CreateIntToPtr(
- IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
- Addr->getType());
- instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
- instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
- }
-}
-
-void ModuleAddressSanitizer::poisonOneInitializer(Function &GlobalInit,
- GlobalValue *ModuleName) {
- // Set up the arguments to our poison/unpoison functions.
- IRBuilder<> IRB(&GlobalInit.front(),
- GlobalInit.front().getFirstInsertionPt());
-
- // Add a call to poison all external globals before the given function starts.
- Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
- IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
-
- // Add calls to unpoison all globals before each return instruction.
- for (auto &BB : GlobalInit.getBasicBlockList())
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
- CallInst::Create(AsanUnpoisonGlobals, "", RI);
-}
-
-void ModuleAddressSanitizer::createInitializerPoisonCalls(
- Module &M, GlobalValue *ModuleName) {
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return;
-
- ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
- if (!CA)
- return;
-
- for (Use &OP : CA->operands()) {
- if (isa<ConstantAggregateZero>(OP)) continue;
- ConstantStruct *CS = cast<ConstantStruct>(OP);
-
- // Must have a function or null ptr.
- if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
- if (F->getName() == kAsanModuleCtorName) continue;
- auto *Priority = cast<ConstantInt>(CS->getOperand(0));
- // Don't instrument CTORs that will run before asan.module_ctor.
- if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple))
- continue;
- poisonOneInitializer(*F, ModuleName);
- }
- }
-}
-
-const GlobalVariable *
-ModuleAddressSanitizer::getExcludedAliasedGlobal(const GlobalAlias &GA) const {
- // In case this function should be expanded to include rules that do not just
- // apply when CompileKernel is true, either guard all existing rules with an
- // 'if (CompileKernel) { ... }' or be absolutely sure that all these rules
- // should also apply to user space.
- assert(CompileKernel && "Only expecting to be called when compiling kernel");
-
- const Constant *C = GA.getAliasee();
-
- // When compiling the kernel, globals that are aliased by symbols prefixed
- // by "__" are special and cannot be padded with a redzone.
- if (GA.getName().startswith("__"))
- return dyn_cast<GlobalVariable>(C->stripPointerCastsAndAliases());
-
- return nullptr;
-}
-
-bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
- Type *Ty = G->getValueType();
- LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
-
- // FIXME: Metadata should be attached directly to the global instead
- // of being added to llvm.asan.globals.
- if (GlobalsMD.get(G).IsExcluded) return false;
- if (!Ty->isSized()) return false;
- if (!G->hasInitializer()) return false;
- // Only instrument globals of default address spaces
- if (G->getAddressSpace()) return false;
- if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
- // Two problems with thread-locals:
- // - The address of the main thread's copy can't be computed at link-time.
- // - Need to poison all copies, not just the main thread's one.
- if (G->isThreadLocal()) return false;
- // For now, just ignore this Global if the alignment is large.
- if (G->getAlignment() > getMinRedzoneSizeForGlobal()) return false;
-
- // For non-COFF targets, only instrument globals known to be defined by this
- // TU.
- // FIXME: We can instrument comdat globals on ELF if we are using the
- // GC-friendly metadata scheme.
- if (!TargetTriple.isOSBinFormatCOFF()) {
- if (!G->hasExactDefinition() || G->hasComdat())
- return false;
- } else {
- // On COFF, don't instrument non-ODR linkages.
- if (G->isInterposable())
- return false;
- }
-
- // If a comdat is present, it must have a selection kind that implies ODR
- // semantics: no duplicates, any, or exact match.
- if (Comdat *C = G->getComdat()) {
- switch (C->getSelectionKind()) {
- case Comdat::Any:
- case Comdat::ExactMatch:
- case Comdat::NoDuplicates:
- break;
- case Comdat::Largest:
- case Comdat::SameSize:
- return false;
- }
- }
-
- if (G->hasSection()) {
- // The kernel uses explicit sections mostly for special global variables
- // that we should not instrument. E.g. the kernel may rely on their layout
- // without redzones, or remove them at link time ("discard.*"), etc.
- if (CompileKernel)
- return false;
-
- StringRef Section = G->getSection();
-
- // Globals from llvm.metadata aren't emitted, do not instrument them.
- if (Section == "llvm.metadata") return false;
- // Do not instrument globals from special LLVM sections.
- if (Section.find("__llvm") != StringRef::npos || Section.find("__LLVM") != StringRef::npos) return false;
-
- // Do not instrument function pointers to initialization and termination
- // routines: dynamic linker will not properly handle redzones.
- if (Section.startswith(".preinit_array") ||
- Section.startswith(".init_array") ||
- Section.startswith(".fini_array")) {
- return false;
- }
-
+ isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
+ NumOptimizedAccessesToStackVar++;
+ return;
+ }
+ }
+
+ if (O.IsWrite)
+ NumInstrumentedWrites++;
+ else
+ NumInstrumentedReads++;
+
+ unsigned Granularity = 1 << Mapping.Scale;
+ if (O.MaybeMask) {
+ instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(),
+ Addr, O.Alignment, Granularity, O.TypeSize,
+ O.IsWrite, nullptr, UseCalls, Exp);
+ } else {
+ doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment,
+ Granularity, O.TypeSize, O.IsWrite, nullptr, UseCalls,
+ Exp);
+ }
+}
+
+Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
+ Value *Addr, bool IsWrite,
+ size_t AccessSizeIndex,
+ Value *SizeArgument,
+ uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
+ Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
+ CallInst *Call = nullptr;
+ if (SizeArgument) {
+ if (Exp == 0)
+ Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0],
+ {Addr, SizeArgument});
+ else
+ Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1],
+ {Addr, SizeArgument, ExpVal});
+ } else {
+ if (Exp == 0)
+ Call =
+ IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
+ else
+ Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
+ {Addr, ExpVal});
+ }
+
+ Call->setCannotMerge();
+ return Call;
+}
+
+Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue,
+ uint32_t TypeSize) {
+ size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
+ // Addr & (Granularity - 1)
+ Value *LastAccessedByte =
+ IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+ // (Addr & (Granularity - 1)) + size - 1
+ if (TypeSize / 8 > 1)
+ LastAccessedByte = IRB.CreateAdd(
+ LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
+ // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+ LastAccessedByte =
+ IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
+ // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+ return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
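
In the slow path, a shadow byte k in 1..Granularity-1 means only the first k bytes of that granule are addressable, so the access is reported when the in-granule offset of its last byte is greater than or equal to the shadow value. A scalar model of the comparison built above, assuming the default 8-byte shadow granularity:

// Scalar model of the IR comparison above, assuming Granularity == 8.
#include <cassert>
#include <cstdint>

static bool slowPathFaults(uint64_t Addr, uint32_t TypeSizeBits,
                           int8_t ShadowValue) {
  uint64_t LastAccessedByte = (Addr & 7) + TypeSizeBits / 8 - 1;
  return (int8_t)LastAccessedByte >= ShadowValue; // true -> report the access
}

int main() {
  // Shadow value 4: only the first 4 bytes of the granule are addressable.
  assert(!slowPathFaults(0x1000, 16, 4)); // 2-byte access at offset 0: fine
  assert(slowPathFaults(0x1003, 16, 4));  // 2-byte access reaching offset 4: bad
  return 0;
}
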
+void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
+ Instruction *InsertBefore, Value *Addr,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
+
+ IRBuilder<> IRB(InsertBefore);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+
+ if (UseCalls) {
+ if (Exp == 0)
+ IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
+ AddrLong);
+ else
+ IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
+ {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+ return;
+ }
+
+ if (IsMyriad) {
+ // Strip the cache bit and do range check.
+ // AddrLong &= ~kMyriadCacheBitMask32
+ AddrLong = IRB.CreateAnd(AddrLong, ~kMyriadCacheBitMask32);
+ // Tag = AddrLong >> kMyriadTagShift
+ Value *Tag = IRB.CreateLShr(AddrLong, kMyriadTagShift);
+ // Tag == kMyriadDDRTag
+ Value *TagCheck =
+ IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
+
+ Instruction *TagCheckTerm =
+ SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false,
+ MDBuilder(*C).createBranchWeights(1, 100000));
+ assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
+ IRB.SetInsertPoint(TagCheckTerm);
+ InsertBefore = TagCheckTerm;
+ }
+
+ Type *ShadowTy =
+ IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
+ Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+ Value *ShadowPtr = memToShadow(AddrLong, IRB);
+ Value *CmpVal = Constant::getNullValue(ShadowTy);
+ Value *ShadowValue =
+ IRB.CreateLoad(ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+
+ Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
+ size_t Granularity = 1ULL << Mapping.Scale;
+ Instruction *CrashTerm = nullptr;
+
+ if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
+ // We use branch weights for the slow path check, to indicate that the slow
+ // path is rarely taken. This seems to be the case for SPEC benchmarks.
+ Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+ assert(cast<BranchInst>(CheckTerm)->isUnconditional());
+ BasicBlock *NextBB = CheckTerm->getSuccessor(0);
+ IRB.SetInsertPoint(CheckTerm);
+ Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
+ if (Recover) {
+ CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false);
+ } else {
+ BasicBlock *CrashBlock =
+ BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
+ CrashTerm = new UnreachableInst(*C, CrashBlock);
+ BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
+ ReplaceInstWithInst(CheckTerm, NewTerm);
+ }
+ } else {
+ CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);
+ }
+
+ Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
+ AccessSizeIndex, SizeArgument, Exp);
+ Crash->setDebugLoc(OrigIns->getDebugLoc());
+}
+
+// Instrument unusual size or unusual alignment.
+// We cannot do it with a single check, so we do a 1-byte check for the first
+// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
+// to report the actual access size.
+void AddressSanitizer::instrumentUnusualSizeOrAlignment(
+ Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
+ Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ if (UseCalls) {
+ if (Exp == 0)
+ IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
+ {AddrLong, Size});
+ else
+ IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
+ {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+ } else {
+ Value *LastByte = IRB.CreateIntToPtr(
+ IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+ Addr->getType());
+ instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
+ instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
+ }
+}
+
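
For sizes the fast path cannot express, only the first and the last byte of the access are checked, and the real size is passed along so the __asan_report_*_n callbacks can report it. A minimal sketch of that strategy, with checkByte as a hypothetical stand-in for the 1-byte instrumentAddress call:

#include <cstdint>
#include <cstdio>

// Hypothetical 1-byte shadow check, standing in for instrumentAddress(..., 8, ...).
static void checkByte(uint64_t Addr) {
  printf("check byte at 0x%llx\n", (unsigned long long)Addr);
}

// Two 1-byte checks bracket an access of unusual size, mirroring the code above.
static void checkUnusualAccess(uint64_t Addr, uint64_t SizeInBytes) {
  checkByte(Addr);                   // first byte
  checkByte(Addr + SizeInBytes - 1); // last byte
}

int main() {
  checkUnusualAccess(0x1000, 10); // e.g. an 80-bit (10-byte) access
  return 0;
}
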
+void ModuleAddressSanitizer::poisonOneInitializer(Function &GlobalInit,
+ GlobalValue *ModuleName) {
+ // Set up the arguments to our poison/unpoison functions.
+ IRBuilder<> IRB(&GlobalInit.front(),
+ GlobalInit.front().getFirstInsertionPt());
+
+ // Add a call to poison all external globals before the given function starts.
+ Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
+ IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
+
+ // Add calls to unpoison all globals before each return instruction.
+ for (auto &BB : GlobalInit.getBasicBlockList())
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ CallInst::Create(AsanUnpoisonGlobals, "", RI);
+}
+
+void ModuleAddressSanitizer::createInitializerPoisonCalls(
+ Module &M, GlobalValue *ModuleName) {
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return;
+
+ ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!CA)
+ return;
+
+ for (Use &OP : CA->operands()) {
+ if (isa<ConstantAggregateZero>(OP)) continue;
+ ConstantStruct *CS = cast<ConstantStruct>(OP);
+
+ // Must have a function or null ptr.
+ if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
+ if (F->getName() == kAsanModuleCtorName) continue;
+ auto *Priority = cast<ConstantInt>(CS->getOperand(0));
+ // Don't instrument CTORs that will run before asan.module_ctor.
+ if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple))
+ continue;
+ poisonOneInitializer(*F, ModuleName);
+ }
+ }
+}
+
+const GlobalVariable *
+ModuleAddressSanitizer::getExcludedAliasedGlobal(const GlobalAlias &GA) const {
+ // In case this function should be expanded to include rules that do not just
+ // apply when CompileKernel is true, either guard all existing rules with an
+ // 'if (CompileKernel) { ... }' or be absolutely sure that all these rules
+ // should also apply to user space.
+ assert(CompileKernel && "Only expecting to be called when compiling kernel");
+
+ const Constant *C = GA.getAliasee();
+
+ // When compiling the kernel, globals that are aliased by symbols prefixed
+ // by "__" are special and cannot be padded with a redzone.
+ if (GA.getName().startswith("__"))
+ return dyn_cast<GlobalVariable>(C->stripPointerCastsAndAliases());
+
+ return nullptr;
+}
+
+bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
+ Type *Ty = G->getValueType();
+ LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+
+ // FIXME: Metadata should be attached directly to the global instead
+ // of being added to llvm.asan.globals.
+ if (GlobalsMD.get(G).IsExcluded) return false;
+ if (!Ty->isSized()) return false;
+ if (!G->hasInitializer()) return false;
+ // Only instrument globals of default address spaces
+ if (G->getAddressSpace()) return false;
+ if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
+ // Two problems with thread-locals:
+ // - The address of the main thread's copy can't be computed at link-time.
+ // - Need to poison all copies, not just the main thread's one.
+ if (G->isThreadLocal()) return false;
+ // For now, just ignore this Global if the alignment is large.
+ if (G->getAlignment() > getMinRedzoneSizeForGlobal()) return false;
+
+ // For non-COFF targets, only instrument globals known to be defined by this
+ // TU.
+ // FIXME: We can instrument comdat globals on ELF if we are using the
+ // GC-friendly metadata scheme.
+ if (!TargetTriple.isOSBinFormatCOFF()) {
+ if (!G->hasExactDefinition() || G->hasComdat())
+ return false;
+ } else {
+ // On COFF, don't instrument non-ODR linkages.
+ if (G->isInterposable())
+ return false;
+ }
+
+ // If a comdat is present, it must have a selection kind that implies ODR
+ // semantics: no duplicates, any, or exact match.
+ if (Comdat *C = G->getComdat()) {
+ switch (C->getSelectionKind()) {
+ case Comdat::Any:
+ case Comdat::ExactMatch:
+ case Comdat::NoDuplicates:
+ break;
+ case Comdat::Largest:
+ case Comdat::SameSize:
+ return false;
+ }
+ }
+
+ if (G->hasSection()) {
+ // The kernel uses explicit sections mostly for special global variables
+ // that we should not instrument. E.g. the kernel may rely on their layout
+ // without redzones, or remove them at link time ("discard.*"), etc.
+ if (CompileKernel)
+ return false;
+
+ StringRef Section = G->getSection();
+
+ // Globals from llvm.metadata aren't emitted, do not instrument them.
+ if (Section == "llvm.metadata") return false;
+ // Do not instrument globals from special LLVM sections.
+ if (Section.find("__llvm") != StringRef::npos || Section.find("__LLVM") != StringRef::npos) return false;
+
+ // Do not instrument function pointers to initialization and termination
+ // routines: dynamic linker will not properly handle redzones.
+ if (Section.startswith(".preinit_array") ||
+ Section.startswith(".init_array") ||
+ Section.startswith(".fini_array")) {
+ return false;
+ }
+
// Do not instrument user-defined sections (with names resembling
// valid C identifiers)
if (TargetTriple.isOSBinFormatELF()) {
@@ -1881,258 +1881,258 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
return false;
}
- // On COFF, if the section name contains '$', it is highly likely that the
- // user is using section sorting to create an array of globals similar to
- // the way initialization callbacks are registered in .init_array and
- // .CRT$XCU. The ATL also registers things in .ATL$__[azm]. Adding redzones
- // to such globals is counterproductive, because the intent is that they
- // will form an array, and out-of-bounds accesses are expected.
- // See https://github.com/google/sanitizers/issues/305
- // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
- if (TargetTriple.isOSBinFormatCOFF() && Section.contains('$')) {
- LLVM_DEBUG(dbgs() << "Ignoring global in sorted section (contains '$'): "
- << *G << "\n");
- return false;
- }
-
- if (TargetTriple.isOSBinFormatMachO()) {
- StringRef ParsedSegment, ParsedSection;
- unsigned TAA = 0, StubSize = 0;
- bool TAAParsed;
- std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(
- Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);
- assert(ErrorCode.empty() && "Invalid section specifier.");
-
- // Ignore the globals from the __OBJC section. The ObjC runtime assumes
- // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
- // them.
- if (ParsedSegment == "__OBJC" ||
- (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
- LLVM_DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
- return false;
- }
- // See https://github.com/google/sanitizers/issues/32
- // Constant CFString instances are compiled in the following way:
- // -- the string buffer is emitted into
- // __TEXT,__cstring,cstring_literals
- // -- the constant NSConstantString structure referencing that buffer
- // is placed into __DATA,__cfstring
- // Therefore there's no point in placing redzones into __DATA,__cfstring.
- // Moreover, it causes the linker to crash on OS X 10.7
- if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
- LLVM_DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
- return false;
- }
- // The linker merges the contents of cstring_literals and removes the
- // trailing zeroes.
- if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
- LLVM_DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
- return false;
- }
- }
- }
-
- if (CompileKernel) {
- // Globals that are prefixed by "__" are special and cannot be padded with a
- // redzone.
- if (G->getName().startswith("__"))
- return false;
- }
-
- return true;
-}
-
-// On Mach-O platforms, we emit global metadata in a separate section of the
-// binary in order to allow the linker to properly dead strip. This is only
-// supported on recent versions of ld64.
-bool ModuleAddressSanitizer::ShouldUseMachOGlobalsSection() const {
- if (!TargetTriple.isOSBinFormatMachO())
- return false;
-
- if (TargetTriple.isMacOSX() && !TargetTriple.isMacOSXVersionLT(10, 11))
- return true;
- if (TargetTriple.isiOS() /* or tvOS */ && !TargetTriple.isOSVersionLT(9))
- return true;
- if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2))
- return true;
-
- return false;
-}
-
-StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const {
- switch (TargetTriple.getObjectFormat()) {
- case Triple::COFF: return ".ASAN$GL";
- case Triple::ELF: return "asan_globals";
- case Triple::MachO: return "__DATA,__asan_globals,regular";
- case Triple::Wasm:
+ // On COFF, if the section name contains '$', it is highly likely that the
+ // user is using section sorting to create an array of globals similar to
+ // the way initialization callbacks are registered in .init_array and
+ // .CRT$XCU. The ATL also registers things in .ATL$__[azm]. Adding redzones
+ // to such globals is counterproductive, because the intent is that they
+ // will form an array, and out-of-bounds accesses are expected.
+ // See https://github.com/google/sanitizers/issues/305
+ // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
+ if (TargetTriple.isOSBinFormatCOFF() && Section.contains('$')) {
+ LLVM_DEBUG(dbgs() << "Ignoring global in sorted section (contains '$'): "
+ << *G << "\n");
+ return false;
+ }
+
+ if (TargetTriple.isOSBinFormatMachO()) {
+ StringRef ParsedSegment, ParsedSection;
+ unsigned TAA = 0, StubSize = 0;
+ bool TAAParsed;
+ std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(
+ Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);
+ assert(ErrorCode.empty() && "Invalid section specifier.");
+
+ // Ignore the globals from the __OBJC section. The ObjC runtime assumes
+ // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
+ // them.
+ if (ParsedSegment == "__OBJC" ||
+ (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
+ LLVM_DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
+ return false;
+ }
+ // See https://github.com/google/sanitizers/issues/32
+ // Constant CFString instances are compiled in the following way:
+ // -- the string buffer is emitted into
+ // __TEXT,__cstring,cstring_literals
+ // -- the constant NSConstantString structure referencing that buffer
+ // is placed into __DATA,__cfstring
+ // Therefore there's no point in placing redzones into __DATA,__cfstring.
+ // Moreover, it causes the linker to crash on OS X 10.7
+ if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
+ LLVM_DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
+ return false;
+ }
+ // The linker merges the contents of cstring_literals and removes the
+ // trailing zeroes.
+ if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
+ LLVM_DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
+ return false;
+ }
+ }
+ }
+
+ if (CompileKernel) {
+ // Globals that are prefixed by "__" are special and cannot be padded with a
+ // redzone.
+ if (G->getName().startswith("__"))
+ return false;
+ }
+
+ return true;
+}
+
+// On Mach-O platforms, we emit global metadata in a separate section of the
+// binary in order to allow the linker to properly dead strip. This is only
+// supported on recent versions of ld64.
+bool ModuleAddressSanitizer::ShouldUseMachOGlobalsSection() const {
+ if (!TargetTriple.isOSBinFormatMachO())
+ return false;
+
+ if (TargetTriple.isMacOSX() && !TargetTriple.isMacOSXVersionLT(10, 11))
+ return true;
+ if (TargetTriple.isiOS() /* or tvOS */ && !TargetTriple.isOSVersionLT(9))
+ return true;
+ if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2))
+ return true;
+
+ return false;
+}
+
+StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const {
+ switch (TargetTriple.getObjectFormat()) {
+ case Triple::COFF: return ".ASAN$GL";
+ case Triple::ELF: return "asan_globals";
+ case Triple::MachO: return "__DATA,__asan_globals,regular";
+ case Triple::Wasm:
case Triple::GOFF:
- case Triple::XCOFF:
- report_fatal_error(
+ case Triple::XCOFF:
+ report_fatal_error(
"ModuleAddressSanitizer not implemented for object file format");
- case Triple::UnknownObjectFormat:
- break;
- }
- llvm_unreachable("unsupported object format");
-}
-
-void ModuleAddressSanitizer::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
-
- // Declare our poisoning and unpoisoning functions.
- AsanPoisonGlobals =
- M.getOrInsertFunction(kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy);
- AsanUnpoisonGlobals =
- M.getOrInsertFunction(kAsanUnpoisonGlobalsName, IRB.getVoidTy());
-
- // Declare functions that register/unregister globals.
- AsanRegisterGlobals = M.getOrInsertFunction(
- kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanUnregisterGlobals = M.getOrInsertFunction(
- kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
-
- // Declare the functions that find globals in a shared object and then invoke
- // the (un)register function on them.
- AsanRegisterImageGlobals = M.getOrInsertFunction(
- kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
- AsanUnregisterImageGlobals = M.getOrInsertFunction(
- kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
-
- AsanRegisterElfGlobals =
- M.getOrInsertFunction(kAsanRegisterElfGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, IntptrTy);
- AsanUnregisterElfGlobals =
- M.getOrInsertFunction(kAsanUnregisterElfGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, IntptrTy);
-}
-
-// Put the metadata and the instrumented global in the same group. This ensures
-// that the metadata is discarded if the instrumented global is discarded.
-void ModuleAddressSanitizer::SetComdatForGlobalMetadata(
- GlobalVariable *G, GlobalVariable *Metadata, StringRef InternalSuffix) {
- Module &M = *G->getParent();
- Comdat *C = G->getComdat();
- if (!C) {
- if (!G->hasName()) {
- // If G is unnamed, it must be internal. Give it an artificial name
- // so we can put it in a comdat.
- assert(G->hasLocalLinkage());
- G->setName(Twine(kAsanGenPrefix) + "_anon_global");
- }
-
- if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
- std::string Name = std::string(G->getName());
- Name += InternalSuffix;
- C = M.getOrInsertComdat(Name);
- } else {
- C = M.getOrInsertComdat(G->getName());
- }
-
- // Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF. Also upgrade private
- // linkage to internal linkage so that a symbol table entry is emitted. This
- // is necessary in order to create the comdat group.
- if (TargetTriple.isOSBinFormatCOFF()) {
- C->setSelectionKind(Comdat::NoDuplicates);
- if (G->hasPrivateLinkage())
- G->setLinkage(GlobalValue::InternalLinkage);
- }
- G->setComdat(C);
- }
-
- assert(G->hasComdat());
- Metadata->setComdat(G->getComdat());
-}
-
-// Create a separate metadata global and put it in the appropriate ASan
-// global registration section.
-GlobalVariable *
-ModuleAddressSanitizer::CreateMetadataGlobal(Module &M, Constant *Initializer,
- StringRef OriginalName) {
- auto Linkage = TargetTriple.isOSBinFormatMachO()
- ? GlobalVariable::InternalLinkage
- : GlobalVariable::PrivateLinkage;
- GlobalVariable *Metadata = new GlobalVariable(
- M, Initializer->getType(), false, Linkage, Initializer,
- Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
- Metadata->setSection(getGlobalMetadataSection());
- return Metadata;
-}
-
-Instruction *ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) {
- AsanDtorFunction =
- Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
- GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
- BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
-
- return ReturnInst::Create(*C, AsanDtorBB);
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
- auto &DL = M.getDataLayout();
-
- SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
- for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
- Constant *Initializer = MetadataInitializers[i];
- GlobalVariable *G = ExtendedGlobals[i];
- GlobalVariable *Metadata =
- CreateMetadataGlobal(M, Initializer, G->getName());
- MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
- Metadata->setMetadata(LLVMContext::MD_associated, MD);
- MetadataGlobals[i] = Metadata;
-
- // The MSVC linker always inserts padding when linking incrementally. We
- // cope with that by aligning each struct to its size, which must be a power
- // of two.
- unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType());
- assert(isPowerOf2_32(SizeOfGlobalStruct) &&
- "global metadata will not be padded appropriately");
- Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct));
-
- SetComdatForGlobalMetadata(G, Metadata, "");
- }
-
- // Update llvm.compiler.used, adding the new metadata globals. This is
- // needed so that during LTO these variables stay alive.
- if (!MetadataGlobals.empty())
- appendToCompilerUsed(M, MetadataGlobals);
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsELF(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers,
- const std::string &UniqueModuleId) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
-
- SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
- for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
- GlobalVariable *G = ExtendedGlobals[i];
- GlobalVariable *Metadata =
- CreateMetadataGlobal(M, MetadataInitializers[i], G->getName());
- MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
- Metadata->setMetadata(LLVMContext::MD_associated, MD);
- MetadataGlobals[i] = Metadata;
-
- SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId);
- }
-
- // Update llvm.compiler.used, adding the new metadata globals. This is
- // needed so that during LTO these variables stay alive.
+ case Triple::UnknownObjectFormat:
+ break;
+ }
+ llvm_unreachable("unsupported object format");
+}
+
+void ModuleAddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+
+ // Declare our poisoning and unpoisoning functions.
+ AsanPoisonGlobals =
+ M.getOrInsertFunction(kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy);
+ AsanUnpoisonGlobals =
+ M.getOrInsertFunction(kAsanUnpoisonGlobalsName, IRB.getVoidTy());
+
+ // Declare functions that register/unregister globals.
+ AsanRegisterGlobals = M.getOrInsertFunction(
+ kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanUnregisterGlobals = M.getOrInsertFunction(
+ kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+
+ // Declare the functions that find globals in a shared object and then invoke
+ // the (un)register function on them.
+ AsanRegisterImageGlobals = M.getOrInsertFunction(
+ kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
+ AsanUnregisterImageGlobals = M.getOrInsertFunction(
+ kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
+
+ AsanRegisterElfGlobals =
+ M.getOrInsertFunction(kAsanRegisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy);
+ AsanUnregisterElfGlobals =
+ M.getOrInsertFunction(kAsanUnregisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy);
+}
+
+// Put the metadata and the instrumented global in the same group. This ensures
+// that the metadata is discarded if the instrumented global is discarded.
+void ModuleAddressSanitizer::SetComdatForGlobalMetadata(
+ GlobalVariable *G, GlobalVariable *Metadata, StringRef InternalSuffix) {
+ Module &M = *G->getParent();
+ Comdat *C = G->getComdat();
+ if (!C) {
+ if (!G->hasName()) {
+ // If G is unnamed, it must be internal. Give it an artificial name
+ // so we can put it in a comdat.
+ assert(G->hasLocalLinkage());
+ G->setName(Twine(kAsanGenPrefix) + "_anon_global");
+ }
+
+ if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
+ std::string Name = std::string(G->getName());
+ Name += InternalSuffix;
+ C = M.getOrInsertComdat(Name);
+ } else {
+ C = M.getOrInsertComdat(G->getName());
+ }
+
+ // Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF. Also upgrade private
+ // linkage to internal linkage so that a symbol table entry is emitted. This
+ // is necessary in order to create the comdat group.
+ if (TargetTriple.isOSBinFormatCOFF()) {
+ C->setSelectionKind(Comdat::NoDuplicates);
+ if (G->hasPrivateLinkage())
+ G->setLinkage(GlobalValue::InternalLinkage);
+ }
+ G->setComdat(C);
+ }
+
+ assert(G->hasComdat());
+ Metadata->setComdat(G->getComdat());
+}
+
+// Create a separate metadata global and put it in the appropriate ASan
+// global registration section.
+GlobalVariable *
+ModuleAddressSanitizer::CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName) {
+ auto Linkage = TargetTriple.isOSBinFormatMachO()
+ ? GlobalVariable::InternalLinkage
+ : GlobalVariable::PrivateLinkage;
+ GlobalVariable *Metadata = new GlobalVariable(
+ M, Initializer->getType(), false, Linkage, Initializer,
+ Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
+ Metadata->setSection(getGlobalMetadataSection());
+ return Metadata;
+}
+
+Instruction *ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) {
+ AsanDtorFunction =
+ Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
+ GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
+ BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
+
+ return ReturnInst::Create(*C, AsanDtorBB);
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ auto &DL = M.getDataLayout();
+
+ SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+ MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
+ Metadata->setMetadata(LLVMContext::MD_associated, MD);
+ MetadataGlobals[i] = Metadata;
+
+ // The MSVC linker always inserts padding when linking incrementally. We
+ // cope with that by aligning each struct to its size, which must be a power
+ // of two.
+ unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType());
+ assert(isPowerOf2_32(SizeOfGlobalStruct) &&
+ "global metadata will not be padded appropriately");
+ Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct));
+
+ SetComdatForGlobalMetadata(G, Metadata, "");
+ }
+
+ // Update llvm.compiler.used, adding the new metadata globals. This is
+ // needed so that during LTO these variables stay alive.
if (!MetadataGlobals.empty())
appendToCompilerUsed(M, MetadataGlobals);
-
- // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
- // to look up the loaded image that contains it. Second, we can store in it
- // whether registration has already occurred, to prevent duplicate
- // registration.
- //
- // Common linkage ensures that there is only one global per shared library.
- GlobalVariable *RegisteredFlag = new GlobalVariable(
- M, IntptrTy, false, GlobalVariable::CommonLinkage,
- ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
- RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
-
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsELF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+ SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, MetadataInitializers[i], G->getName());
+ MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
+ Metadata->setMetadata(LLVMContext::MD_associated, MD);
+ MetadataGlobals[i] = Metadata;
+
+ SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId);
+ }
+
+ // Update llvm.compiler.used, adding the new metadata globals. This is
+ // needed so that during LTO these variables stay alive.
+ if (!MetadataGlobals.empty())
+ appendToCompilerUsed(M, MetadataGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // Common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
// Create start and stop symbols.
GlobalVariable *StartELFMetadata = new GlobalVariable(
M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
@@ -2142,1326 +2142,1326 @@ void ModuleAddressSanitizer::InstrumentGlobalsELF(
M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
"__stop_" + getGlobalMetadataSection());
StopELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
-
- // Create a call to register the globals with the runtime.
- IRB.CreateCall(AsanRegisterElfGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
- IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
- IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
-
- // We also need to unregister globals at the end, e.g., when a shared library
- // gets closed.
- IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
- IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
- IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
- IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsMachO(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
-
- // On recent Mach-O platforms, use a structure which binds the liveness of
- // the global variable to the metadata struct. Keep the list of "Liveness" GV
- // created to be added to llvm.compiler.used
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
- SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
-
- for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
- Constant *Initializer = MetadataInitializers[i];
- GlobalVariable *G = ExtendedGlobals[i];
- GlobalVariable *Metadata =
- CreateMetadataGlobal(M, Initializer, G->getName());
-
- // On recent Mach-O platforms, we emit the global metadata in a way that
- // allows the linker to properly strip dead globals.
- auto LivenessBinder =
- ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy));
- GlobalVariable *Liveness = new GlobalVariable(
- M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
- Twine("__asan_binder_") + G->getName());
- Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
- LivenessGlobals[i] = Liveness;
- }
-
- // Update llvm.compiler.used, adding the new liveness globals. This is
- // needed so that during LTO these variables stay alive. The alternative
-  // would be to have the linker handle the LTO symbols, but libLTO's
-  // current API does not expose access to the section for each symbol.
- if (!LivenessGlobals.empty())
- appendToCompilerUsed(M, LivenessGlobals);
-
- // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
- // to look up the loaded image that contains it. Second, we can store in it
- // whether registration has already occurred, to prevent duplicate
- // registration.
- //
- // common linkage ensures that there is only one global per shared library.
- GlobalVariable *RegisteredFlag = new GlobalVariable(
- M, IntptrTy, false, GlobalVariable::CommonLinkage,
- ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
- RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
-
- IRB.CreateCall(AsanRegisterImageGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
-
- // We also need to unregister globals at the end, e.g., when a shared library
- // gets closed.
- IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
- IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
- unsigned N = ExtendedGlobals.size();
- assert(N > 0);
-
- // On platforms that don't have a custom metadata section, we emit an array
- // of global metadata structures.
- ArrayType *ArrayOfGlobalStructTy =
- ArrayType::get(MetadataInitializers[0]->getType(), N);
- auto AllGlobals = new GlobalVariable(
- M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
- ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), "");
- if (Mapping.Scale > 3)
- AllGlobals->setAlignment(Align(1ULL << Mapping.Scale));
-
- IRB.CreateCall(AsanRegisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, N)});
-
- // We also need to unregister globals at the end, e.g., when a shared library
- // gets closed.
- IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
- IRB_Dtor.CreateCall(AsanUnregisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, N)});
-}
-
-// This function replaces all global variables with new variables that have
-// trailing redzones. It also creates a function that poisons
-// redzones and inserts this function into llvm.global_ctors.
-// Sets *CtorComdat to true if the global registration code emitted into the
-// asan constructor is comdat-compatible.
-bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
- bool *CtorComdat) {
- *CtorComdat = false;
-
- // Build set of globals that are aliased by some GA, where
- // getExcludedAliasedGlobal(GA) returns the relevant GlobalVariable.
- SmallPtrSet<const GlobalVariable *, 16> AliasedGlobalExclusions;
- if (CompileKernel) {
- for (auto &GA : M.aliases()) {
- if (const GlobalVariable *GV = getExcludedAliasedGlobal(GA))
- AliasedGlobalExclusions.insert(GV);
- }
- }
-
- SmallVector<GlobalVariable *, 16> GlobalsToChange;
- for (auto &G : M.globals()) {
- if (!AliasedGlobalExclusions.count(&G) && shouldInstrumentGlobal(&G))
- GlobalsToChange.push_back(&G);
- }
-
- size_t n = GlobalsToChange.size();
- if (n == 0) {
- *CtorComdat = true;
- return false;
- }
-
- auto &DL = M.getDataLayout();
-
- // A global is described by a structure
- // size_t beg;
- // size_t size;
- // size_t size_with_redzone;
- // const char *name;
- // const char *module_name;
- // size_t has_dynamic_init;
- // void *source_location;
- // size_t odr_indicator;
- // We initialize an array of such structures and pass it to a run-time call.
- StructType *GlobalStructTy =
- StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, IntptrTy);
- SmallVector<GlobalVariable *, 16> NewGlobals(n);
- SmallVector<Constant *, 16> Initializers(n);
-
- bool HasDynamicallyInitializedGlobals = false;
-
-  // We shouldn't merge same module names, as this string serves as the
-  // unique module ID at runtime.
- GlobalVariable *ModuleName = createPrivateGlobalForString(
- M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
-
- for (size_t i = 0; i < n; i++) {
- GlobalVariable *G = GlobalsToChange[i];
-
-    // FIXME: Metadata should be attached directly to the global instead of
-    // being added to llvm.asan.globals.
- auto MD = GlobalsMD.get(G);
- StringRef NameForGlobal = G->getName();
- // Create string holding the global name (use global name from metadata
- // if it's available, otherwise just write the name of global variable).
- GlobalVariable *Name = createPrivateGlobalForString(
- M, MD.Name.empty() ? NameForGlobal : MD.Name,
- /*AllowMerging*/ true, kAsanGenPrefix);
-
- Type *Ty = G->getValueType();
- const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
- const uint64_t RightRedzoneSize = getRedzoneSizeForGlobal(SizeInBytes);
- Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
-
- StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
- Constant *NewInitializer = ConstantStruct::get(
- NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
-
- // Create a new global variable with enough space for a redzone.
- GlobalValue::LinkageTypes Linkage = G->getLinkage();
- if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
- Linkage = GlobalValue::InternalLinkage;
- GlobalVariable *NewGlobal =
- new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
- "", G, G->getThreadLocalMode());
- NewGlobal->copyAttributesFrom(G);
- NewGlobal->setComdat(G->getComdat());
- NewGlobal->setAlignment(MaybeAlign(getMinRedzoneSizeForGlobal()));
-    // Don't fold globals with redzones. The ODR violation detector and redzone
-    // poisoning implicitly create a dependence on the global's address, so it
-    // is no longer valid for it to be marked unnamed_addr.
- NewGlobal->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
-
- // Move null-terminated C strings to "__asan_cstring" section on Darwin.
- if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
- G->isConstant()) {
- auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
- if (Seq && Seq->isCString())
- NewGlobal->setSection("__TEXT,__asan_cstring,regular");
- }
-
+
+ // Create a call to register the globals with the runtime.
+ IRB.CreateCall(AsanRegisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
+ IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+}
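
The registration above relies on a standard ELF linker feature: for a section whose name is a valid C identifier, the linker synthesizes __start_<section> and __stop_<section> symbols bounding it, and the AsanRegisterElfGlobals call receives exactly such a [start, stop) pair together with RegisteredFlag. Below is a minimal standalone sketch of that mechanism, not LLVM code; the section name "asan_demo" and the entry layout are purely illustrative, and an ELF target with GCC or Clang is assumed.

#include <cstdio>

struct Entry { long Value; };

// Two entries placed in a custom, identifier-named section.
__attribute__((used, section("asan_demo"))) static Entry A = {1};
__attribute__((used, section("asan_demo"))) static Entry B = {2};

// The linker defines these bounds automatically for the "asan_demo" section.
extern Entry __start_asan_demo[];
extern Entry __stop_asan_demo[];

int main() {
  for (Entry *E = __start_asan_demo; E != __stop_asan_demo; ++E)
    printf("entry: %ld\n", E->Value);
  return 0;
}
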
+
+void ModuleAddressSanitizer::InstrumentGlobalsMachO(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+ // On recent Mach-O platforms, use a structure which binds the liveness of
+ // the global variable to the metadata struct. Keep the list of "Liveness" GV
+ // created to be added to llvm.compiler.used
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
+ SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
+
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+
+ // On recent Mach-O platforms, we emit the global metadata in a way that
+ // allows the linker to properly strip dead globals.
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
+ GlobalVariable *Liveness = new GlobalVariable(
+ M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
+ Twine("__asan_binder_") + G->getName());
+ Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
+ LivenessGlobals[i] = Liveness;
+ }
+
+ // Update llvm.compiler.used, adding the new liveness globals. This is
+ // needed so that during LTO these variables stay alive. The alternative
+  // would be to have the linker handle the LTO symbols, but libLTO's
+  // current API does not expose access to the section for each symbol.
+ if (!LivenessGlobals.empty())
+ appendToCompilerUsed(M, LivenessGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
+ IRB.CreateCall(AsanRegisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
+ IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ unsigned N = ExtendedGlobals.size();
+ assert(N > 0);
+
+ // On platforms that don't have a custom metadata section, we emit an array
+ // of global metadata structures.
+ ArrayType *ArrayOfGlobalStructTy =
+ ArrayType::get(MetadataInitializers[0]->getType(), N);
+ auto AllGlobals = new GlobalVariable(
+ M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), "");
+ if (Mapping.Scale > 3)
+ AllGlobals->setAlignment(Align(1ULL << Mapping.Scale));
+
+ IRB.CreateCall(AsanRegisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
+ IRB_Dtor.CreateCall(AsanUnregisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+}
+
+// This function replaces all global variables with new variables that have
+// trailing redzones. It also creates a function that poisons
+// redzones and inserts this function into llvm.global_ctors.
+// Sets *CtorComdat to true if the global registration code emitted into the
+// asan constructor is comdat-compatible.
+bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
+ bool *CtorComdat) {
+ *CtorComdat = false;
+
+ // Build set of globals that are aliased by some GA, where
+ // getExcludedAliasedGlobal(GA) returns the relevant GlobalVariable.
+ SmallPtrSet<const GlobalVariable *, 16> AliasedGlobalExclusions;
+ if (CompileKernel) {
+ for (auto &GA : M.aliases()) {
+ if (const GlobalVariable *GV = getExcludedAliasedGlobal(GA))
+ AliasedGlobalExclusions.insert(GV);
+ }
+ }
+
+ SmallVector<GlobalVariable *, 16> GlobalsToChange;
+ for (auto &G : M.globals()) {
+ if (!AliasedGlobalExclusions.count(&G) && shouldInstrumentGlobal(&G))
+ GlobalsToChange.push_back(&G);
+ }
+
+ size_t n = GlobalsToChange.size();
+ if (n == 0) {
+ *CtorComdat = true;
+ return false;
+ }
+
+ auto &DL = M.getDataLayout();
+
+ // A global is described by a structure
+ // size_t beg;
+ // size_t size;
+ // size_t size_with_redzone;
+ // const char *name;
+ // const char *module_name;
+ // size_t has_dynamic_init;
+ // void *source_location;
+ // size_t odr_indicator;
+ // We initialize an array of such structures and pass it to a run-time call.
+ StructType *GlobalStructTy =
+ StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
+ IntptrTy, IntptrTy, IntptrTy);
+ SmallVector<GlobalVariable *, 16> NewGlobals(n);
+ SmallVector<Constant *, 16> Initializers(n);
+
+ bool HasDynamicallyInitializedGlobals = false;
+
+  // We shouldn't merge same module names, as this string serves as the
+  // unique module ID at runtime.
+ GlobalVariable *ModuleName = createPrivateGlobalForString(
+ M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
+
+ for (size_t i = 0; i < n; i++) {
+ GlobalVariable *G = GlobalsToChange[i];
+
+    // FIXME: Metadata should be attached directly to the global instead of
+    // being added to llvm.asan.globals.
+ auto MD = GlobalsMD.get(G);
+ StringRef NameForGlobal = G->getName();
+ // Create string holding the global name (use global name from metadata
+ // if it's available, otherwise just write the name of global variable).
+ GlobalVariable *Name = createPrivateGlobalForString(
+ M, MD.Name.empty() ? NameForGlobal : MD.Name,
+ /*AllowMerging*/ true, kAsanGenPrefix);
+
+ Type *Ty = G->getValueType();
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ const uint64_t RightRedzoneSize = getRedzoneSizeForGlobal(SizeInBytes);
+ Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
+
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
+
+ // Create a new global variable with enough space for a redzone.
+ GlobalValue::LinkageTypes Linkage = G->getLinkage();
+ if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
+ Linkage = GlobalValue::InternalLinkage;
+ GlobalVariable *NewGlobal =
+ new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
+ "", G, G->getThreadLocalMode());
+ NewGlobal->copyAttributesFrom(G);
+ NewGlobal->setComdat(G->getComdat());
+ NewGlobal->setAlignment(MaybeAlign(getMinRedzoneSizeForGlobal()));
+    // Don't fold globals with redzones. The ODR violation detector and redzone
+    // poisoning implicitly create a dependence on the global's address, so it
+    // is no longer valid for it to be marked unnamed_addr.
+ NewGlobal->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+
+ // Move null-terminated C strings to "__asan_cstring" section on Darwin.
+ if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
+ G->isConstant()) {
+ auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
+ if (Seq && Seq->isCString())
+ NewGlobal->setSection("__TEXT,__asan_cstring,regular");
+ }
+
// Transfer the debug info and type metadata. The payload starts at offset
// zero so we can copy the metadata over as is.
NewGlobal->copyMetadata(G, 0);
-
- Value *Indices2[2];
- Indices2[0] = IRB.getInt32(0);
- Indices2[1] = IRB.getInt32(0);
-
- G->replaceAllUsesWith(
- ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
- NewGlobal->takeName(G);
- G->eraseFromParent();
- NewGlobals[i] = NewGlobal;
-
- Constant *SourceLoc;
- if (!MD.SourceLoc.empty()) {
- auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
- SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
- } else {
- SourceLoc = ConstantInt::get(IntptrTy, 0);
- }
-
- Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
- GlobalValue *InstrumentedGlobal = NewGlobal;
-
- bool CanUsePrivateAliases =
- TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
- TargetTriple.isOSBinFormatWasm();
- if (CanUsePrivateAliases && UsePrivateAlias) {
- // Create local alias for NewGlobal to avoid crash on ODR between
- // instrumented and non-instrumented libraries.
- InstrumentedGlobal =
- GlobalAlias::create(GlobalValue::PrivateLinkage, "", NewGlobal);
- }
-
- // ODR should not happen for local linkage.
- if (NewGlobal->hasLocalLinkage()) {
- ODRIndicator = ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1),
- IRB.getInt8PtrTy());
- } else if (UseOdrIndicator) {
- // With local aliases, we need to provide another externally visible
- // symbol __odr_asan_XXX to detect ODR violation.
- auto *ODRIndicatorSym =
- new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
- Constant::getNullValue(IRB.getInt8Ty()),
- kODRGenPrefix + NameForGlobal, nullptr,
- NewGlobal->getThreadLocalMode());
-
- // Set meaningful attributes for indicator symbol.
- ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
- ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
- ODRIndicatorSym->setAlignment(Align(1));
- ODRIndicator = ODRIndicatorSym;
- }
-
- Constant *Initializer = ConstantStruct::get(
- GlobalStructTy,
- ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
- ConstantInt::get(IntptrTy, SizeInBytes),
- ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
- ConstantExpr::getPointerCast(Name, IntptrTy),
- ConstantExpr::getPointerCast(ModuleName, IntptrTy),
- ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
-
- if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
-
- LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
-
- Initializers[i] = Initializer;
- }
-
-  // Add instrumented globals to the llvm.compiler.used list to prevent LTO
-  // from ConstantMerge'ing them.
- SmallVector<GlobalValue *, 16> GlobalsToAddToUsedList;
- for (size_t i = 0; i < n; i++) {
- GlobalVariable *G = NewGlobals[i];
- if (G->getName().empty()) continue;
- GlobalsToAddToUsedList.push_back(G);
- }
- appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
-
- std::string ELFUniqueModuleId =
- (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
- : "";
-
- if (!ELFUniqueModuleId.empty()) {
- InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
- *CtorComdat = true;
- } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
- InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
- } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
- InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
- } else {
- InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
- }
-
- // Create calls for poisoning before initializers run and unpoisoning after.
- if (HasDynamicallyInitializedGlobals)
- createInitializerPoisonCalls(M, ModuleName);
-
- LLVM_DEBUG(dbgs() << M);
- return true;
-}
-
-uint64_t
-ModuleAddressSanitizer::getRedzoneSizeForGlobal(uint64_t SizeInBytes) const {
- constexpr uint64_t kMaxRZ = 1 << 18;
- const uint64_t MinRZ = getMinRedzoneSizeForGlobal();
-
- // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
- uint64_t RZ =
- std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
-
- // Round up to multiple of MinRZ.
- if (SizeInBytes % MinRZ)
- RZ += MinRZ - (SizeInBytes % MinRZ);
- assert((RZ + SizeInBytes) % MinRZ == 0);
-
- return RZ;
-}
-
-int ModuleAddressSanitizer::GetAsanVersion(const Module &M) const {
- int LongSize = M.getDataLayout().getPointerSizeInBits();
- bool isAndroid = Triple(M.getTargetTriple()).isAndroid();
- int Version = 8;
- // 32-bit Android is one version ahead because of the switch to dynamic
- // shadow.
- Version += (LongSize == 32 && isAndroid);
- return Version;
-}
-
-bool ModuleAddressSanitizer::instrumentModule(Module &M) {
- initializeCallbacks(M);
-
- // Create a module constructor. A destructor is created lazily because not all
-  // platforms and not all modules need it.
- if (CompileKernel) {
- // The kernel always builds with its own runtime, and therefore does not
- // need the init and version check calls.
- AsanCtorFunction = createSanitizerCtor(M, kAsanModuleCtorName);
- } else {
- std::string AsanVersion = std::to_string(GetAsanVersion(M));
- std::string VersionCheckName =
- ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : "";
- std::tie(AsanCtorFunction, std::ignore) =
- createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName,
- kAsanInitName, /*InitArgTypes=*/{},
- /*InitArgs=*/{}, VersionCheckName);
- }
-
- bool CtorComdat = true;
- if (ClGlobals) {
- IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
- InstrumentGlobals(IRB, M, &CtorComdat);
- }
-
- const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple);
-
- // Put the constructor and destructor in comdat if both
- // (1) global instrumentation is not TU-specific
- // (2) target is ELF.
- if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
- AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
- appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction);
- if (AsanDtorFunction) {
- AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
- appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction);
- }
- } else {
- appendToGlobalCtors(M, AsanCtorFunction, Priority);
- if (AsanDtorFunction)
- appendToGlobalDtors(M, AsanDtorFunction, Priority);
- }
-
- return true;
-}
-
-void AddressSanitizer::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
- // Create __asan_report* callbacks.
- // IsWrite, TypeSize and Exp are encoded in the function name.
- for (int Exp = 0; Exp < 2; Exp++) {
- for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
- const std::string TypeStr = AccessIsWrite ? "store" : "load";
- const std::string ExpStr = Exp ? "exp_" : "";
- const std::string EndingStr = Recover ? "_noabort" : "";
-
- SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
- SmallVector<Type *, 2> Args1{1, IntptrTy};
- if (Exp) {
- Type *ExpType = Type::getInt32Ty(*C);
- Args2.push_back(ExpType);
- Args1.push_back(ExpType);
- }
- AsanErrorCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args2, false));
-
- AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args2, false));
-
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
- AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args1, false));
-
- AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args1, false));
- }
- }
- }
-
- const std::string MemIntrinCallbackPrefix =
- CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
- AsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- AsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- AsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy);
-
- AsanHandleNoReturnFunc =
- M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy());
-
- AsanPtrCmpFunction =
- M.getOrInsertFunction(kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanPtrSubFunction =
- M.getOrInsertFunction(kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy);
- if (Mapping.InGlobal)
- AsanShadowGlobal = M.getOrInsertGlobal("__asan_shadow",
- ArrayType::get(IRB.getInt8Ty(), 0));
-}
-
-bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
- // For each NSObject descendant having a +load method, this method is invoked
- // by the ObjC runtime before any of the static constructors is called.
- // Therefore we need to instrument such methods with a call to __asan_init
- // at the beginning in order to initialize our runtime before any access to
- // the shadow memory.
- // We cannot just ignore these methods, because they may call other
- // instrumented functions.
- if (F.getName().find(" load]") != std::string::npos) {
- FunctionCallee AsanInitFunction =
- declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
- IRBuilder<> IRB(&F.front(), F.front().begin());
- IRB.CreateCall(AsanInitFunction, {});
- return true;
- }
- return false;
-}
-
-bool AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
- // Generate code only when dynamic addressing is needed.
- if (Mapping.Offset != kDynamicShadowSentinel)
- return false;
-
- IRBuilder<> IRB(&F.front().front());
- if (Mapping.InGlobal) {
- if (ClWithIfuncSuppressRemat) {
- // An empty inline asm with input reg == output reg.
- // An opaque pointer-to-int cast, basically.
- InlineAsm *Asm = InlineAsm::get(
- FunctionType::get(IntptrTy, {AsanShadowGlobal->getType()}, false),
- StringRef(""), StringRef("=r,0"),
- /*hasSideEffects=*/false);
- LocalDynamicShadow =
- IRB.CreateCall(Asm, {AsanShadowGlobal}, ".asan.shadow");
- } else {
- LocalDynamicShadow =
- IRB.CreatePointerCast(AsanShadowGlobal, IntptrTy, ".asan.shadow");
- }
- } else {
- Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
- kAsanShadowMemoryDynamicAddress, IntptrTy);
- LocalDynamicShadow = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
- }
- return true;
-}
-
-void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
- // Find the one possible call to llvm.localescape and pre-mark allocas passed
- // to it as uninteresting. This assumes we haven't started processing allocas
- // yet. This check is done up front because iterating the use list in
- // isInterestingAlloca would be algorithmically slower.
- assert(ProcessedAllocas.empty() && "must process localescape before allocas");
-
- // Try to get the declaration of llvm.localescape. If it's not in the module,
- // we can exit early.
- if (!F.getParent()->getFunction("llvm.localescape")) return;
-
-  // Look for a call to llvm.localescape in the entry block. It can't be in
- // any other block.
- for (Instruction &I : F.getEntryBlock()) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::localescape) {
- // We found a call. Mark all the allocas passed in as uninteresting.
- for (Value *Arg : II->arg_operands()) {
- AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
- assert(AI && AI->isStaticAlloca() &&
- "non-static alloca arg to localescape");
- ProcessedAllocas[AI] = false;
- }
- break;
- }
- }
-}
-
-bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
- bool ShouldInstrument =
- ClDebugMin < 0 || ClDebugMax < 0 ||
- (Instrumented >= ClDebugMin && Instrumented <= ClDebugMax);
- Instrumented++;
- return !ShouldInstrument;
-}
-
-bool AddressSanitizer::instrumentFunction(Function &F,
- const TargetLibraryInfo *TLI) {
- if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
- if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
- if (F.getName().startswith("__asan_")) return false;
-
- bool FunctionModified = false;
-
- // If needed, insert __asan_init before checking for SanitizeAddress attr.
- // This function needs to be called even if the function body is not
- // instrumented.
- if (maybeInsertAsanInitAtFunctionEntry(F))
- FunctionModified = true;
-
- // Leave if the function doesn't need instrumentation.
- if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
-
- LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
-
- initializeCallbacks(*F.getParent());
-
- FunctionStateRAII CleanupObj(this);
-
- FunctionModified |= maybeInsertDynamicShadowAtFunctionEntry(F);
-
- // We can't instrument allocas used with llvm.localescape. Only static allocas
- // can be passed to that intrinsic.
- markEscapedLocalAllocas(F);
-
- // We want to instrument every address only once per basic block (unless there
- // are calls between uses).
- SmallPtrSet<Value *, 16> TempsToInstrument;
- SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
- SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
- SmallVector<Instruction *, 8> NoReturnCalls;
- SmallVector<BasicBlock *, 16> AllBlocks;
- SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
- int NumAllocas = 0;
-
- // Fill the set of memory operations to instrument.
- for (auto &BB : F) {
- AllBlocks.push_back(&BB);
- TempsToInstrument.clear();
- int NumInsnsPerBB = 0;
- for (auto &Inst : BB) {
- if (LooksLikeCodeInBug11395(&Inst)) return false;
- SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
- getInterestingMemoryOperands(&Inst, InterestingOperands);
-
- if (!InterestingOperands.empty()) {
- for (auto &Operand : InterestingOperands) {
- if (ClOpt && ClOptSameTemp) {
- Value *Ptr = Operand.getPtr();
- // If we have a mask, skip instrumentation if we've already
- // instrumented the full object. But don't add to TempsToInstrument
- // because we might get another load/store with a different mask.
- if (Operand.MaybeMask) {
- if (TempsToInstrument.count(Ptr))
- continue; // We've seen this (whole) temp in the current BB.
- } else {
- if (!TempsToInstrument.insert(Ptr).second)
- continue; // We've seen this temp in the current BB.
- }
- }
- OperandsToInstrument.push_back(Operand);
- NumInsnsPerBB++;
- }
- } else if (((ClInvalidPointerPairs || ClInvalidPointerCmp) &&
- isInterestingPointerComparison(&Inst)) ||
- ((ClInvalidPointerPairs || ClInvalidPointerSub) &&
- isInterestingPointerSubtraction(&Inst))) {
- PointerComparisonsOrSubtracts.push_back(&Inst);
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) {
- // ok, take it.
- IntrinToInstrument.push_back(MI);
- NumInsnsPerBB++;
- } else {
- if (isa<AllocaInst>(Inst)) NumAllocas++;
- if (auto *CB = dyn_cast<CallBase>(&Inst)) {
- // A call inside BB.
- TempsToInstrument.clear();
- if (CB->doesNotReturn() && !CB->hasMetadata("nosanitize"))
- NoReturnCalls.push_back(CB);
- }
- if (CallInst *CI = dyn_cast<CallInst>(&Inst))
- maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
- }
- if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
- }
- }
-
- bool UseCalls = (ClInstrumentationWithCallsThreshold >= 0 &&
- OperandsToInstrument.size() + IntrinToInstrument.size() >
- (unsigned)ClInstrumentationWithCallsThreshold);
- const DataLayout &DL = F.getParent()->getDataLayout();
- ObjectSizeOpts ObjSizeOpts;
- ObjSizeOpts.RoundToAlign = true;
- ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
-
- // Instrument.
- int NumInstrumented = 0;
- for (auto &Operand : OperandsToInstrument) {
- if (!suppressInstrumentationSiteForDebug(NumInstrumented))
- instrumentMop(ObjSizeVis, Operand, UseCalls,
- F.getParent()->getDataLayout());
- FunctionModified = true;
- }
- for (auto Inst : IntrinToInstrument) {
- if (!suppressInstrumentationSiteForDebug(NumInstrumented))
- instrumentMemIntrinsic(Inst);
- FunctionModified = true;
- }
-
- FunctionStackPoisoner FSP(F, *this);
- bool ChangedStack = FSP.runOnFunction();
-
- // We must unpoison the stack before NoReturn calls (throw, _exit, etc).
- // See e.g. https://github.com/google/sanitizers/issues/37
- for (auto CI : NoReturnCalls) {
- IRBuilder<> IRB(CI);
- IRB.CreateCall(AsanHandleNoReturnFunc, {});
- }
-
- for (auto Inst : PointerComparisonsOrSubtracts) {
- instrumentPointerComparisonOrSubtraction(Inst);
- FunctionModified = true;
- }
-
- if (ChangedStack || !NoReturnCalls.empty())
- FunctionModified = true;
-
- LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
- << F << "\n");
-
- return FunctionModified;
-}
-
-// Workaround for bug 11395: we don't want to instrument stack in functions
-// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
-// FIXME: remove once the bug 11395 is fixed.
-bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
- if (LongSize != 32) return false;
- CallInst *CI = dyn_cast<CallInst>(I);
- if (!CI || !CI->isInlineAsm()) return false;
- if (CI->getNumArgOperands() <= 5) return false;
- // We have inline assembly with quite a few arguments.
- return true;
-}
-
-void FunctionStackPoisoner::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
- for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
- std::string Suffix = itostr(i);
- AsanStackMallocFunc[i] = M.getOrInsertFunction(
- kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy);
- AsanStackFreeFunc[i] =
- M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
- IRB.getVoidTy(), IntptrTy, IntptrTy);
- }
- if (ASan.UseAfterScope) {
- AsanPoisonStackMemoryFunc = M.getOrInsertFunction(
- kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanUnpoisonStackMemoryFunc = M.getOrInsertFunction(
- kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
- }
-
- for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
- std::ostringstream Name;
- Name << kAsanSetShadowPrefix;
- Name << std::setw(2) << std::setfill('0') << std::hex << Val;
- AsanSetShadowFunc[Val] =
- M.getOrInsertFunction(Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy);
- }
-
- AsanAllocaPoisonFunc = M.getOrInsertFunction(
- kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanAllocasUnpoisonFunc = M.getOrInsertFunction(
- kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
-}
-
-void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes,
- size_t Begin, size_t End,
- IRBuilder<> &IRB,
- Value *ShadowBase) {
- if (Begin >= End)
- return;
-
- const size_t LargestStoreSizeInBytes =
- std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);
-
- const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();
-
-  // Poison the given range in shadow using the largest store size without
-  // leading and trailing zeros in ShadowMask. Zeros never change, so they need
-  // neither poisoning nor up-poisoning. Still, we don't mind if some of them
-  // get into the middle of a store.
- for (size_t i = Begin; i < End;) {
- if (!ShadowMask[i]) {
- assert(!ShadowBytes[i]);
- ++i;
- continue;
- }
-
- size_t StoreSizeInBytes = LargestStoreSizeInBytes;
- // Fit store size into the range.
- while (StoreSizeInBytes > End - i)
- StoreSizeInBytes /= 2;
-
- // Minimize store size by trimming trailing zeros.
- for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
- while (j <= StoreSizeInBytes / 2)
- StoreSizeInBytes /= 2;
- }
-
- uint64_t Val = 0;
- for (size_t j = 0; j < StoreSizeInBytes; j++) {
- if (IsLittleEndian)
- Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
- else
- Val = (Val << 8) | ShadowBytes[i + j];
- }
-
- Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
- Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
- IRB.CreateAlignedStore(
- Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()),
- Align(1));
-
- i += StoreSizeInBytes;
- }
-}
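
A minimal standalone sketch of the byte-packing step above, not LLVM code and assuming a little-endian host (the common branch): consecutive shadow bytes are folded into one integer so a single wide store poisons several shadow cells at once. The byte values 0xf1 and 0xf3 are merely illustrative shadow contents.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Fold N shadow bytes into one little-endian value, as the inline path does
// before emitting a single aligned store to the shadow.
static uint64_t packShadowLE(const uint8_t *Bytes, size_t N) {
  uint64_t Val = 0;
  for (size_t J = 0; J < N; ++J)
    Val |= (uint64_t)Bytes[J] << (8 * J);
  return Val;
}

int main() {
  const uint8_t Shadow[4] = {0xf1, 0xf1, 0x00, 0xf3};
  printf("0x%08llx\n", (unsigned long long)packShadowLE(Shadow, 4)); // 0xf300f1f1
  return 0;
}
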
-
-void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes,
- IRBuilder<> &IRB, Value *ShadowBase) {
- copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase);
-}
-
-void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes,
- size_t Begin, size_t End,
- IRBuilder<> &IRB, Value *ShadowBase) {
- assert(ShadowMask.size() == ShadowBytes.size());
- size_t Done = Begin;
- for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
- if (!ShadowMask[i]) {
- assert(!ShadowBytes[i]);
- continue;
- }
- uint8_t Val = ShadowBytes[i];
- if (!AsanSetShadowFunc[Val])
- continue;
-
- // Skip same values.
- for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) {
- }
-
- if (j - i >= ClMaxInlinePoisoningSize) {
- copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase);
- IRB.CreateCall(AsanSetShadowFunc[Val],
- {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)),
- ConstantInt::get(IntptrTy, j - i)});
- Done = j;
- }
- }
-
- copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase);
-}
-
-// Fake stack allocator (asan_fake_stack.h) has 11 size classes
-// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
-static int StackMallocSizeClass(uint64_t LocalStackSize) {
- assert(LocalStackSize <= kMaxStackMallocSize);
- uint64_t MaxSize = kMinStackMallocSize;
- for (int i = 0;; i++, MaxSize *= 2)
- if (LocalStackSize <= MaxSize) return i;
- llvm_unreachable("impossible LocalStackSize");
-}
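
As a quick illustration of the size-class search above, a standalone sketch rather than LLVM code; the minimum size of 64 bytes is an assumption, not the actual kMinStackMallocSize constant:

#include <cstdint>
#include <cstdio>

// Mirror of StackMallocSizeClass with the minimum class size made explicit.
static int sizeClassFor(uint64_t LocalStackSize, uint64_t MinSize) {
  uint64_t MaxSize = MinSize;
  for (int i = 0;; i++, MaxSize *= 2)
    if (LocalStackSize <= MaxSize)
      return i;
}

int main() {
  printf("%d\n", sizeClassFor(64, 64));   // class 0
  printf("%d\n", sizeClassFor(1000, 64)); // class 4: 64, 128, 256, 512, 1024
  return 0;
}
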
-
-void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
- Instruction *CopyInsertPoint = &F.front().front();
- if (CopyInsertPoint == ASan.LocalDynamicShadow) {
- // Insert after the dynamic shadow location is determined
- CopyInsertPoint = CopyInsertPoint->getNextNode();
- assert(CopyInsertPoint);
- }
- IRBuilder<> IRB(CopyInsertPoint);
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (Argument &Arg : F.args()) {
- if (Arg.hasByValAttr()) {
- Type *Ty = Arg.getParamByValType();
- const Align Alignment =
- DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty);
-
- AllocaInst *AI = IRB.CreateAlloca(
- Ty, nullptr,
- (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) +
- ".byval");
- AI->setAlignment(Alignment);
- Arg.replaceAllUsesWith(AI);
-
- uint64_t AllocSize = DL.getTypeAllocSize(Ty);
- IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize);
- }
- }
-}
-
-PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
- Value *ValueIfTrue,
- Instruction *ThenTerm,
- Value *ValueIfFalse) {
- PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
- BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
- PHI->addIncoming(ValueIfFalse, CondBlock);
- BasicBlock *ThenBlock = ThenTerm->getParent();
- PHI->addIncoming(ValueIfTrue, ThenBlock);
- return PHI;
-}
-
-Value *FunctionStackPoisoner::createAllocaForLayout(
- IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) {
- AllocaInst *Alloca;
- if (Dynamic) {
- Alloca = IRB.CreateAlloca(IRB.getInt8Ty(),
- ConstantInt::get(IRB.getInt64Ty(), L.FrameSize),
- "MyAlloca");
- } else {
- Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize),
- nullptr, "MyAlloca");
- assert(Alloca->isStaticAlloca());
- }
- assert((ClRealignStack & (ClRealignStack - 1)) == 0);
- size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
- Alloca->setAlignment(Align(FrameAlignment));
- return IRB.CreatePointerCast(Alloca, IntptrTy);
-}
-
-void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
- BasicBlock &FirstBB = *F.begin();
- IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin()));
- DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr);
- IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout);
- DynamicAllocaLayout->setAlignment(Align(32));
-}
-
-void FunctionStackPoisoner::processDynamicAllocas() {
- if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) {
- assert(DynamicAllocaPoisonCallVec.empty());
- return;
- }
-
- // Insert poison calls for lifetime intrinsics for dynamic allocas.
- for (const auto &APC : DynamicAllocaPoisonCallVec) {
- assert(APC.InsBefore);
- assert(APC.AI);
- assert(ASan.isInterestingAlloca(*APC.AI));
- assert(!APC.AI->isStaticAlloca());
-
- IRBuilder<> IRB(APC.InsBefore);
- poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
- // Dynamic allocas will be unpoisoned unconditionally below in
- // unpoisonDynamicAllocas.
-    // Flag that we need to unpoison static allocas.
- }
-
- // Handle dynamic allocas.
- createDynamicAllocasInitStorage();
- for (auto &AI : DynamicAllocaVec)
- handleDynamicAllocaCall(AI);
- unpoisonDynamicAllocas();
-}
-
-/// Collect instructions in the entry block after \p InsBefore which initialize
-/// permanent storage for a function argument. These instructions must remain in
-/// the entry block so that uninitialized values do not appear in backtraces. An
-/// added benefit is that this conserves spill slots. This does not move stores
-/// before instrumented / "interesting" allocas.
-static void findStoresToUninstrumentedArgAllocas(
- AddressSanitizer &ASan, Instruction &InsBefore,
- SmallVectorImpl<Instruction *> &InitInsts) {
- Instruction *Start = InsBefore.getNextNonDebugInstruction();
- for (Instruction *It = Start; It; It = It->getNextNonDebugInstruction()) {
- // Argument initialization looks like:
- // 1) store <Argument>, <Alloca> OR
- // 2) <CastArgument> = cast <Argument> to ...
- // store <CastArgument> to <Alloca>
- // Do not consider any other kind of instruction.
- //
- // Note: This covers all known cases, but may not be exhaustive. An
- // alternative to pattern-matching stores is to DFS over all Argument uses:
- // this might be more general, but is probably much more complicated.
- if (isa<AllocaInst>(It) || isa<CastInst>(It))
- continue;
- if (auto *Store = dyn_cast<StoreInst>(It)) {
- // The store destination must be an alloca that isn't interesting for
- // ASan to instrument. These are moved up before InsBefore, and they're
- // not interesting because allocas for arguments can be mem2reg'd.
- auto *Alloca = dyn_cast<AllocaInst>(Store->getPointerOperand());
- if (!Alloca || ASan.isInterestingAlloca(*Alloca))
- continue;
-
- Value *Val = Store->getValueOperand();
- bool IsDirectArgInit = isa<Argument>(Val);
- bool IsArgInitViaCast =
- isa<CastInst>(Val) &&
- isa<Argument>(cast<CastInst>(Val)->getOperand(0)) &&
- // Check that the cast appears directly before the store. Otherwise
- // moving the cast before InsBefore may break the IR.
- Val == It->getPrevNonDebugInstruction();
- bool IsArgInit = IsDirectArgInit || IsArgInitViaCast;
- if (!IsArgInit)
- continue;
-
- if (IsArgInitViaCast)
- InitInsts.push_back(cast<Instruction>(Val));
- InitInsts.push_back(Store);
- continue;
- }
-
- // Do not reorder past unknown instructions: argument initialization should
- // only involve casts and stores.
- return;
- }
-}
-
-void FunctionStackPoisoner::processStaticAllocas() {
- if (AllocaVec.empty()) {
- assert(StaticAllocaPoisonCallVec.empty());
- return;
- }
-
- int StackMallocIdx = -1;
- DebugLoc EntryDebugLocation;
- if (auto SP = F.getSubprogram())
+
+ Value *Indices2[2];
+ Indices2[0] = IRB.getInt32(0);
+ Indices2[1] = IRB.getInt32(0);
+
+ G->replaceAllUsesWith(
+ ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
+ NewGlobal->takeName(G);
+ G->eraseFromParent();
+ NewGlobals[i] = NewGlobal;
+
+ Constant *SourceLoc;
+ if (!MD.SourceLoc.empty()) {
+ auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
+ SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
+ } else {
+ SourceLoc = ConstantInt::get(IntptrTy, 0);
+ }
+
+ Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
+ GlobalValue *InstrumentedGlobal = NewGlobal;
+
+ bool CanUsePrivateAliases =
+ TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
+ TargetTriple.isOSBinFormatWasm();
+ if (CanUsePrivateAliases && UsePrivateAlias) {
+ // Create local alias for NewGlobal to avoid crash on ODR between
+ // instrumented and non-instrumented libraries.
+ InstrumentedGlobal =
+ GlobalAlias::create(GlobalValue::PrivateLinkage, "", NewGlobal);
+ }
+
+ // ODR should not happen for local linkage.
+ if (NewGlobal->hasLocalLinkage()) {
+ ODRIndicator = ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1),
+ IRB.getInt8PtrTy());
+ } else if (UseOdrIndicator) {
+ // With local aliases, we need to provide another externally visible
+ // symbol __odr_asan_XXX to detect ODR violation.
+ auto *ODRIndicatorSym =
+ new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
+ Constant::getNullValue(IRB.getInt8Ty()),
+ kODRGenPrefix + NameForGlobal, nullptr,
+ NewGlobal->getThreadLocalMode());
+
+ // Set meaningful attributes for indicator symbol.
+ ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
+ ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
+ ODRIndicatorSym->setAlignment(Align(1));
+ ODRIndicator = ODRIndicatorSym;
+ }
+
+ Constant *Initializer = ConstantStruct::get(
+ GlobalStructTy,
+ ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
+ ConstantInt::get(IntptrTy, SizeInBytes),
+ ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
+ ConstantExpr::getPointerCast(Name, IntptrTy),
+ ConstantExpr::getPointerCast(ModuleName, IntptrTy),
+ ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
+
+ if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
+
+ LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
+
+ Initializers[i] = Initializer;
+ }
+
+  // Add instrumented globals to the llvm.compiler.used list to prevent LTO
+  // from ConstantMerge'ing them.
+ SmallVector<GlobalValue *, 16> GlobalsToAddToUsedList;
+ for (size_t i = 0; i < n; i++) {
+ GlobalVariable *G = NewGlobals[i];
+ if (G->getName().empty()) continue;
+ GlobalsToAddToUsedList.push_back(G);
+ }
+ appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
+
+ std::string ELFUniqueModuleId =
+ (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
+ : "";
+
+ if (!ELFUniqueModuleId.empty()) {
+ InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
+ *CtorComdat = true;
+ } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
+ InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
+ } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
+ InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
+ } else {
+ InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
+ }
+
+ // Create calls for poisoning before initializers run and unpoisoning after.
+ if (HasDynamicallyInitializedGlobals)
+ createInitializerPoisonCalls(M, ModuleName);
+
+ LLVM_DEBUG(dbgs() << M);
+ return true;
+}
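
For reference, the eight pointer-sized fields that each Initializer packs into GlobalStructTy correspond to the layout sketched below. This is an illustrative C++ rendering of the descriptor comment in InstrumentGlobals, not a definition taken from the runtime.

#include <cstddef>

struct AsanGlobalDescriptor {
  size_t beg;               // address of the instrumented (redzoned) global
  size_t size;              // size of the original payload in bytes
  size_t size_with_redzone; // payload size plus the trailing redzone
  const char *name;         // name string created for the global
  const char *module_name;  // unique module identifier string
  size_t has_dynamic_init;  // nonzero for dynamically initialized globals
  void *source_location;    // optional source-location descriptor, or null
  size_t odr_indicator;     // ODR indicator address, or 0 / -1 for local linkage
};

static_assert(sizeof(AsanGlobalDescriptor) == 8 * sizeof(size_t),
              "all fields are pointer-sized, matching GlobalStructTy");
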
+
+uint64_t
+ModuleAddressSanitizer::getRedzoneSizeForGlobal(uint64_t SizeInBytes) const {
+ constexpr uint64_t kMaxRZ = 1 << 18;
+ const uint64_t MinRZ = getMinRedzoneSizeForGlobal();
+
+ // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
+ uint64_t RZ =
+ std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
+
+ // Round up to multiple of MinRZ.
+ if (SizeInBytes % MinRZ)
+ RZ += MinRZ - (SizeInBytes % MinRZ);
+ assert((RZ + SizeInBytes) % MinRZ == 0);
+
+ return RZ;
+}
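
Worked numbers for the sizing rule above, as a standalone sketch rather than LLVM code; MinRZ is fixed at 32 here purely for illustration, whereas getMinRedzoneSizeForGlobal depends on configuration:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t redzoneFor(uint64_t SizeInBytes, uint64_t MinRZ) {
  constexpr uint64_t kMaxRZ = 1 << 18;
  // Roughly SizeInBytes / 4, clamped to [MinRZ, kMaxRZ] in steps of MinRZ.
  uint64_t RZ =
      std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
  // Pad so that payload plus redzone is a multiple of MinRZ.
  if (SizeInBytes % MinRZ)
    RZ += MinRZ - (SizeInBytes % MinRZ);
  assert((RZ + SizeInBytes) % MinRZ == 0);
  return RZ;
}

int main() {
  printf("%llu\n", (unsigned long long)redzoneFor(100, 32));  // 60: 100 + 60 = 160
  printf("%llu\n", (unsigned long long)redzoneFor(4096, 32)); // 1024, about a quarter of the size
  return 0;
}
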
+
+int ModuleAddressSanitizer::GetAsanVersion(const Module &M) const {
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ bool isAndroid = Triple(M.getTargetTriple()).isAndroid();
+ int Version = 8;
+ // 32-bit Android is one version ahead because of the switch to dynamic
+ // shadow.
+ Version += (LongSize == 32 && isAndroid);
+ return Version;
+}
+
+bool ModuleAddressSanitizer::instrumentModule(Module &M) {
+ initializeCallbacks(M);
+
+ // Create a module constructor. A destructor is created lazily because not all
+  // platforms and not all modules need it.
+ if (CompileKernel) {
+ // The kernel always builds with its own runtime, and therefore does not
+ // need the init and version check calls.
+ AsanCtorFunction = createSanitizerCtor(M, kAsanModuleCtorName);
+ } else {
+ std::string AsanVersion = std::to_string(GetAsanVersion(M));
+ std::string VersionCheckName =
+ ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : "";
+ std::tie(AsanCtorFunction, std::ignore) =
+ createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName,
+ kAsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{}, VersionCheckName);
+ }
+
+ bool CtorComdat = true;
+ if (ClGlobals) {
+ IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
+ InstrumentGlobals(IRB, M, &CtorComdat);
+ }
+
+ const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple);
+
+ // Put the constructor and destructor in comdat if both
+ // (1) global instrumentation is not TU-specific and
+ // (2) the target is ELF.
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
+ appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction);
+ if (AsanDtorFunction) {
+ AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
+ appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction);
+ }
+ } else {
+ appendToGlobalCtors(M, AsanCtorFunction, Priority);
+ if (AsanDtorFunction)
+ appendToGlobalDtors(M, AsanDtorFunction, Priority);
+ }
+
+ return true;
+}
+
+void AddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ // Create __asan_report* callbacks.
+ // IsWrite, TypeSize and Exp are encoded in the function name.
+ for (int Exp = 0; Exp < 2; Exp++) {
+ for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+ const std::string TypeStr = AccessIsWrite ? "store" : "load";
+ const std::string ExpStr = Exp ? "exp_" : "";
+ const std::string EndingStr = Recover ? "_noabort" : "";
+
+ SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+ SmallVector<Type *, 2> Args1{1, IntptrTy};
+ if (Exp) {
+ Type *ExpType = Type::getInt32Ty(*C);
+ Args2.push_back(ExpType);
+ Args1.push_back(ExpType);
+ }
+ AsanErrorCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false));
+
+ AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false));
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
+ AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false));
+
+ AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false));
+ }
+ }
+ }
+
+ const std::string MemIntrinCallbackPrefix =
+ CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
+ AsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ AsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ AsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty(), IntptrTy);
+
+ AsanHandleNoReturnFunc =
+ M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy());
+
+ AsanPtrCmpFunction =
+ M.getOrInsertFunction(kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanPtrSubFunction =
+ M.getOrInsertFunction(kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ if (Mapping.InGlobal)
+ AsanShadowGlobal = M.getOrInsertGlobal("__asan_shadow",
+ ArrayType::get(IRB.getInt8Ty(), 0));
+}
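+
+// Editorial note: assuming the usual "__asan_report_" error template and the
+// default "__asan_" memory-access callback prefix, the loops above produce
+// names such as
+//   __asan_report_load4, __asan_report_exp_store8, __asan_report_load_n_noabort
+// for the reporting callbacks and
+//   __asan_load1 ... __asan_store16, __asan_loadN / __asan_storeN
+// for the non-reporting access callbacks.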
+
+bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
+ // For each NSObject descendant having a +load method, this method is invoked
+ // by the ObjC runtime before any of the static constructors is called.
+ // Therefore we need to instrument such methods with a call to __asan_init
+ // at the beginning in order to initialize our runtime before any access to
+ // the shadow memory.
+ // We cannot just ignore these methods, because they may call other
+ // instrumented functions.
+ if (F.getName().find(" load]") != std::string::npos) {
+ FunctionCallee AsanInitFunction =
+ declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
+ IRBuilder<> IRB(&F.front(), F.front().begin());
+ IRB.CreateCall(AsanInitFunction, {});
+ return true;
+ }
+ return false;
+}
+
+bool AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
+ // Generate code only when dynamic addressing is needed.
+ if (Mapping.Offset != kDynamicShadowSentinel)
+ return false;
+
+ IRBuilder<> IRB(&F.front().front());
+ if (Mapping.InGlobal) {
+ if (ClWithIfuncSuppressRemat) {
+ // An empty inline asm with input reg == output reg.
+ // An opaque pointer-to-int cast, basically.
+ InlineAsm *Asm = InlineAsm::get(
+ FunctionType::get(IntptrTy, {AsanShadowGlobal->getType()}, false),
+ StringRef(""), StringRef("=r,0"),
+ /*hasSideEffects=*/false);
+ LocalDynamicShadow =
+ IRB.CreateCall(Asm, {AsanShadowGlobal}, ".asan.shadow");
+ } else {
+ LocalDynamicShadow =
+ IRB.CreatePointerCast(AsanShadowGlobal, IntptrTy, ".asan.shadow");
+ }
+ } else {
+ Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+ kAsanShadowMemoryDynamicAddress, IntptrTy);
+ LocalDynamicShadow = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
+ }
+ return true;
+}
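+
+// Rough shape of the emitted IR (editorial sketch, 64-bit target, names
+// illustrative): in the non-InGlobal case the shadow base is a plain load
+//   %shadow.base = load i64, i64* @<kAsanShadowMemoryDynamicAddress>
+// while in the Mapping.InGlobal case with ClWithIfuncSuppressRemat it is an
+// "=r,0" inline-asm pass-through of @__asan_shadow, which keeps the cast
+// opaque so the value is not rematerialized.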
+
+void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
+ // Find the one possible call to llvm.localescape and pre-mark allocas passed
+ // to it as uninteresting. This assumes we haven't started processing allocas
+ // yet. This check is done up front because iterating the use list in
+ // isInterestingAlloca would be algorithmically slower.
+ assert(ProcessedAllocas.empty() && "must process localescape before allocas");
+
+ // Try to get the declaration of llvm.localescape. If it's not in the module,
+ // we can exit early.
+ if (!F.getParent()->getFunction("llvm.localescape")) return;
+
+ // Look for a call to llvm.localescape in the entry block. It can't be in
+ // any other block.
+ for (Instruction &I : F.getEntryBlock()) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (II && II->getIntrinsicID() == Intrinsic::localescape) {
+ // We found a call. Mark all the allocas passed in as uninteresting.
+ for (Value *Arg : II->arg_operands()) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
+ assert(AI && AI->isStaticAlloca() &&
+ "non-static alloca arg to localescape");
+ ProcessedAllocas[AI] = false;
+ }
+ break;
+ }
+ }
+}
+
+bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
+ bool ShouldInstrument =
+ ClDebugMin < 0 || ClDebugMax < 0 ||
+ (Instrumented >= ClDebugMin && Instrumented <= ClDebugMax);
+ Instrumented++;
+ return !ShouldInstrument;
+}
+
+bool AddressSanitizer::instrumentFunction(Function &F,
+ const TargetLibraryInfo *TLI) {
+ if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
+ if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
+ if (F.getName().startswith("__asan_")) return false;
+
+ bool FunctionModified = false;
+
+ // If needed, insert __asan_init before checking for SanitizeAddress attr.
+ // This function needs to be called even if the function body is not
+ // instrumented.
+ if (maybeInsertAsanInitAtFunctionEntry(F))
+ FunctionModified = true;
+
+ // Leave if the function doesn't need instrumentation.
+ if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
+
+ LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+
+ initializeCallbacks(*F.getParent());
+
+ FunctionStateRAII CleanupObj(this);
+
+ FunctionModified |= maybeInsertDynamicShadowAtFunctionEntry(F);
+
+ // We can't instrument allocas used with llvm.localescape. Only static allocas
+ // can be passed to that intrinsic.
+ markEscapedLocalAllocas(F);
+
+ // We want to instrument every address only once per basic block (unless there
+ // are calls between uses).
+ SmallPtrSet<Value *, 16> TempsToInstrument;
+ SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
+ SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
+ SmallVector<Instruction *, 8> NoReturnCalls;
+ SmallVector<BasicBlock *, 16> AllBlocks;
+ SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
+ int NumAllocas = 0;
+
+ // Fill the set of memory operations to instrument.
+ for (auto &BB : F) {
+ AllBlocks.push_back(&BB);
+ TempsToInstrument.clear();
+ int NumInsnsPerBB = 0;
+ for (auto &Inst : BB) {
+ if (LooksLikeCodeInBug11395(&Inst)) return false;
+ SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
+ getInterestingMemoryOperands(&Inst, InterestingOperands);
+
+ if (!InterestingOperands.empty()) {
+ for (auto &Operand : InterestingOperands) {
+ if (ClOpt && ClOptSameTemp) {
+ Value *Ptr = Operand.getPtr();
+ // If we have a mask, skip instrumentation if we've already
+ // instrumented the full object. But don't add to TempsToInstrument
+ // because we might get another load/store with a different mask.
+ if (Operand.MaybeMask) {
+ if (TempsToInstrument.count(Ptr))
+ continue; // We've seen this (whole) temp in the current BB.
+ } else {
+ if (!TempsToInstrument.insert(Ptr).second)
+ continue; // We've seen this temp in the current BB.
+ }
+ }
+ OperandsToInstrument.push_back(Operand);
+ NumInsnsPerBB++;
+ }
+ } else if (((ClInvalidPointerPairs || ClInvalidPointerCmp) &&
+ isInterestingPointerComparison(&Inst)) ||
+ ((ClInvalidPointerPairs || ClInvalidPointerSub) &&
+ isInterestingPointerSubtraction(&Inst))) {
+ PointerComparisonsOrSubtracts.push_back(&Inst);
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) {
+ // ok, take it.
+ IntrinToInstrument.push_back(MI);
+ NumInsnsPerBB++;
+ } else {
+ if (isa<AllocaInst>(Inst)) NumAllocas++;
+ if (auto *CB = dyn_cast<CallBase>(&Inst)) {
+ // A call inside BB.
+ TempsToInstrument.clear();
+ if (CB->doesNotReturn() && !CB->hasMetadata("nosanitize"))
+ NoReturnCalls.push_back(CB);
+ }
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+ }
+ if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
+ }
+ }
+
+ bool UseCalls = (ClInstrumentationWithCallsThreshold >= 0 &&
+ OperandsToInstrument.size() + IntrinToInstrument.size() >
+ (unsigned)ClInstrumentationWithCallsThreshold);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ ObjectSizeOpts ObjSizeOpts;
+ ObjSizeOpts.RoundToAlign = true;
+ ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
+
+ // Instrument.
+ int NumInstrumented = 0;
+ for (auto &Operand : OperandsToInstrument) {
+ if (!suppressInstrumentationSiteForDebug(NumInstrumented))
+ instrumentMop(ObjSizeVis, Operand, UseCalls,
+ F.getParent()->getDataLayout());
+ FunctionModified = true;
+ }
+ for (auto Inst : IntrinToInstrument) {
+ if (!suppressInstrumentationSiteForDebug(NumInstrumented))
+ instrumentMemIntrinsic(Inst);
+ FunctionModified = true;
+ }
+
+ FunctionStackPoisoner FSP(F, *this);
+ bool ChangedStack = FSP.runOnFunction();
+
+ // We must unpoison the stack before NoReturn calls (throw, _exit, etc).
+ // See e.g. https://github.com/google/sanitizers/issues/37
+ for (auto CI : NoReturnCalls) {
+ IRBuilder<> IRB(CI);
+ IRB.CreateCall(AsanHandleNoReturnFunc, {});
+ }
+
+ for (auto Inst : PointerComparisonsOrSubtracts) {
+ instrumentPointerComparisonOrSubtraction(Inst);
+ FunctionModified = true;
+ }
+
+ if (ChangedStack || !NoReturnCalls.empty())
+ FunctionModified = true;
+
+ LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
+ << F << "\n");
+
+ return FunctionModified;
+}
+
+// Workaround for bug 11395: we don't want to instrument the stack in functions
+// with large assembly blobs (32-bit only); otherwise the register allocator may
+// crash. FIXME: remove once bug 11395 is fixed.
+bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
+ if (LongSize != 32) return false;
+ CallInst *CI = dyn_cast<CallInst>(I);
+ if (!CI || !CI->isInlineAsm()) return false;
+ if (CI->getNumArgOperands() <= 5) return false;
+ // We have inline assembly with quite a few arguments.
+ return true;
+}
+
+void FunctionStackPoisoner::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+ std::string Suffix = itostr(i);
+ AsanStackMallocFunc[i] = M.getOrInsertFunction(
+ kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy);
+ AsanStackFreeFunc[i] =
+ M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
+ IRB.getVoidTy(), IntptrTy, IntptrTy);
+ }
+ if (ASan.UseAfterScope) {
+ AsanPoisonStackMemoryFunc = M.getOrInsertFunction(
+ kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanUnpoisonStackMemoryFunc = M.getOrInsertFunction(
+ kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ }
+
+ for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
+ std::ostringstream Name;
+ Name << kAsanSetShadowPrefix;
+ Name << std::setw(2) << std::setfill('0') << std::hex << Val;
+ AsanSetShadowFunc[Val] =
+ M.getOrInsertFunction(Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy);
+ }
+
+ AsanAllocaPoisonFunc = M.getOrInsertFunction(
+ kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanAllocasUnpoisonFunc = M.getOrInsertFunction(
+ kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
+}
+
+void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB,
+ Value *ShadowBase) {
+ if (Begin >= End)
+ return;
+
+ const size_t LargestStoreSizeInBytes =
+ std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);
+
+ const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();
+
+ // Poison the given range in shadow using the largest store size, skipping the
+ // leading and trailing zeros in ShadowMask. Zeros never change, so they need
+ // neither poisoning nor unpoisoning. Still, we don't mind if some of them end
+ // up in the middle of a store.
+ for (size_t i = Begin; i < End;) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ ++i;
+ continue;
+ }
+
+ size_t StoreSizeInBytes = LargestStoreSizeInBytes;
+ // Fit store size into the range.
+ while (StoreSizeInBytes > End - i)
+ StoreSizeInBytes /= 2;
+
+ // Minimize store size by trimming trailing zeros.
+ for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
+ while (j <= StoreSizeInBytes / 2)
+ StoreSizeInBytes /= 2;
+ }
+
+ uint64_t Val = 0;
+ for (size_t j = 0; j < StoreSizeInBytes; j++) {
+ if (IsLittleEndian)
+ Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
+ else
+ Val = (Val << 8) | ShadowBytes[i + j];
+ }
+
+ Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+ Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
+ IRB.CreateAlignedStore(
+ Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()),
+ Align(1));
+
+ i += StoreSizeInBytes;
+ }
+}
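+
+// Worked example (editorial sketch): with ShadowMask = {1,1,0,1} and
+// ShadowBytes = {0xf1, 0xf1, 0x00, 0xf3} over the range [0, 4) on a
+// little-endian 64-bit target, the loop emits a single unaligned 4-byte store
+// of 0xf300f1f1 at ShadowBase + 0; the zero byte in the middle is harmlessly
+// rewritten with the same value.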
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase);
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ assert(ShadowMask.size() == ShadowBytes.size());
+ size_t Done = Begin;
+ for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ continue;
+ }
+ uint8_t Val = ShadowBytes[i];
+ if (!AsanSetShadowFunc[Val])
+ continue;
+
+ // Skip same values.
+ for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) {
+ }
+
+ if (j - i >= ClMaxInlinePoisoningSize) {
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase);
+ IRB.CreateCall(AsanSetShadowFunc[Val],
+ {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)),
+ ConstantInt::get(IntptrTy, j - i)});
+ Done = j;
+ }
+ }
+
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase);
+}
+
+// Fake stack allocator (asan_fake_stack.h) has 11 size classes
+// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
+static int StackMallocSizeClass(uint64_t LocalStackSize) {
+ assert(LocalStackSize <= kMaxStackMallocSize);
+ uint64_t MaxSize = kMinStackMallocSize;
+ for (int i = 0;; i++, MaxSize *= 2)
+ if (LocalStackSize <= MaxSize) return i;
+ llvm_unreachable("impossible LocalStackSize");
+}
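+
+// Worked example (editorial sketch, assuming kMinStackMallocSize is 64 bytes):
+//   LocalStackSize =   64 -> class 0  (MaxSize   64)
+//   LocalStackSize =   96 -> class 1  (MaxSize  128)
+//   LocalStackSize = 4096 -> class 6  (MaxSize 4096)
+// i.e. the class index is the number of doublings of the minimum size needed
+// to cover the requested frame.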
+
+void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
+ Instruction *CopyInsertPoint = &F.front().front();
+ if (CopyInsertPoint == ASan.LocalDynamicShadow) {
+ // Insert after the dynamic shadow location is determined
+ CopyInsertPoint = CopyInsertPoint->getNextNode();
+ assert(CopyInsertPoint);
+ }
+ IRBuilder<> IRB(CopyInsertPoint);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (Argument &Arg : F.args()) {
+ if (Arg.hasByValAttr()) {
+ Type *Ty = Arg.getParamByValType();
+ const Align Alignment =
+ DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty);
+
+ AllocaInst *AI = IRB.CreateAlloca(
+ Ty, nullptr,
+ (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) +
+ ".byval");
+ AI->setAlignment(Alignment);
+ Arg.replaceAllUsesWith(AI);
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+ IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize);
+ }
+ }
+}
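+
+// Rough before/after of the rewrite above (editorial sketch; %p.byval is an
+// illustrative name):
+//   before:  define void @f(%struct.S* byval(%struct.S) align 8 %p) - uses %p
+//   after:   %p.byval = alloca %struct.S, align 8
+//            memcpy(%p.byval, %p, sizeof(%struct.S))   ; via IRB.CreateMemCpy
+//            all former uses of %p now refer to %p.byval
+// so the argument's bytes live in an alloca that the stack poisoner can wrap
+// in redzones.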
+
+PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
+ Value *ValueIfTrue,
+ Instruction *ThenTerm,
+ Value *ValueIfFalse) {
+ PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
+ BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
+ PHI->addIncoming(ValueIfFalse, CondBlock);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ PHI->addIncoming(ValueIfTrue, ThenBlock);
+ return PHI;
+}
+
+Value *FunctionStackPoisoner::createAllocaForLayout(
+ IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) {
+ AllocaInst *Alloca;
+ if (Dynamic) {
+ Alloca = IRB.CreateAlloca(IRB.getInt8Ty(),
+ ConstantInt::get(IRB.getInt64Ty(), L.FrameSize),
+ "MyAlloca");
+ } else {
+ Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize),
+ nullptr, "MyAlloca");
+ assert(Alloca->isStaticAlloca());
+ }
+ assert((ClRealignStack & (ClRealignStack - 1)) == 0);
+ size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
+ Alloca->setAlignment(Align(FrameAlignment));
+ return IRB.CreatePointerCast(Alloca, IntptrTy);
+}
+
+void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
+ BasicBlock &FirstBB = *F.begin();
+ IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin()));
+ DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr);
+ IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout);
+ DynamicAllocaLayout->setAlignment(Align(32));
+}
+
+void FunctionStackPoisoner::processDynamicAllocas() {
+ if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) {
+ assert(DynamicAllocaPoisonCallVec.empty());
+ return;
+ }
+
+ // Insert poison calls for lifetime intrinsics for dynamic allocas.
+ for (const auto &APC : DynamicAllocaPoisonCallVec) {
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ assert(!APC.AI->isStaticAlloca());
+
+ IRBuilder<> IRB(APC.InsBefore);
+ poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
+ // Dynamic allocas will be unpoisoned unconditionally below in
+ // unpoisonDynamicAllocas.
+ // Flag that we need to unpoison static allocas.
+ }
+
+ // Handle dynamic allocas.
+ createDynamicAllocasInitStorage();
+ for (auto &AI : DynamicAllocaVec)
+ handleDynamicAllocaCall(AI);
+ unpoisonDynamicAllocas();
+}
+
+/// Collect instructions in the entry block after \p InsBefore which initialize
+/// permanent storage for a function argument. These instructions must remain in
+/// the entry block so that uninitialized values do not appear in backtraces. An
+/// added benefit is that this conserves spill slots. This does not move stores
+/// before instrumented / "interesting" allocas.
+static void findStoresToUninstrumentedArgAllocas(
+ AddressSanitizer &ASan, Instruction &InsBefore,
+ SmallVectorImpl<Instruction *> &InitInsts) {
+ Instruction *Start = InsBefore.getNextNonDebugInstruction();
+ for (Instruction *It = Start; It; It = It->getNextNonDebugInstruction()) {
+ // Argument initialization looks like:
+ // 1) store <Argument>, <Alloca> OR
+ // 2) <CastArgument> = cast <Argument> to ...
+ // store <CastArgument> to <Alloca>
+ // Do not consider any other kind of instruction.
+ //
+ // Note: This covers all known cases, but may not be exhaustive. An
+ // alternative to pattern-matching stores is to DFS over all Argument uses:
+ // this might be more general, but is probably much more complicated.
+ if (isa<AllocaInst>(It) || isa<CastInst>(It))
+ continue;
+ if (auto *Store = dyn_cast<StoreInst>(It)) {
+ // The store destination must be an alloca that isn't interesting for
+ // ASan to instrument. These are moved up before InsBefore, and they're
+ // not interesting because allocas for arguments can be mem2reg'd.
+ auto *Alloca = dyn_cast<AllocaInst>(Store->getPointerOperand());
+ if (!Alloca || ASan.isInterestingAlloca(*Alloca))
+ continue;
+
+ Value *Val = Store->getValueOperand();
+ bool IsDirectArgInit = isa<Argument>(Val);
+ bool IsArgInitViaCast =
+ isa<CastInst>(Val) &&
+ isa<Argument>(cast<CastInst>(Val)->getOperand(0)) &&
+ // Check that the cast appears directly before the store. Otherwise
+ // moving the cast before InsBefore may break the IR.
+ Val == It->getPrevNonDebugInstruction();
+ bool IsArgInit = IsDirectArgInit || IsArgInitViaCast;
+ if (!IsArgInit)
+ continue;
+
+ if (IsArgInitViaCast)
+ InitInsts.push_back(cast<Instruction>(Val));
+ InitInsts.push_back(Store);
+ continue;
+ }
+
+ // Do not reorder past unknown instructions: argument initialization should
+ // only involve casts and stores.
+ return;
+ }
+}
+
+void FunctionStackPoisoner::processStaticAllocas() {
+ if (AllocaVec.empty()) {
+ assert(StaticAllocaPoisonCallVec.empty());
+ return;
+ }
+
+ int StackMallocIdx = -1;
+ DebugLoc EntryDebugLocation;
+ if (auto SP = F.getSubprogram())
EntryDebugLocation =
DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
-
- Instruction *InsBefore = AllocaVec[0];
- IRBuilder<> IRB(InsBefore);
-
- // Make sure non-instrumented allocas stay in the entry block. Otherwise,
- // debug info is broken, because only entry-block allocas are treated as
- // regular stack slots.
- auto InsBeforeB = InsBefore->getParent();
- assert(InsBeforeB == &F.getEntryBlock());
- for (auto *AI : StaticAllocasToMoveUp)
- if (AI->getParent() == InsBeforeB)
- AI->moveBefore(InsBefore);
-
- // Move stores of arguments into entry-block allocas as well. This prevents
- // extra stack slots from being generated (to house the argument values until
- // they can be stored into the allocas). This also prevents uninitialized
- // values from being shown in backtraces.
- SmallVector<Instruction *, 8> ArgInitInsts;
- findStoresToUninstrumentedArgAllocas(ASan, *InsBefore, ArgInitInsts);
- for (Instruction *ArgInitInst : ArgInitInsts)
- ArgInitInst->moveBefore(InsBefore);
-
- // If we have a call to llvm.localescape, keep it in the entry block.
- if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
-
- SmallVector<ASanStackVariableDescription, 16> SVD;
- SVD.reserve(AllocaVec.size());
- for (AllocaInst *AI : AllocaVec) {
- ASanStackVariableDescription D = {AI->getName().data(),
- ASan.getAllocaSizeInBytes(*AI),
- 0,
- AI->getAlignment(),
- AI,
- 0,
- 0};
- SVD.push_back(D);
- }
-
- // Minimal header size (left redzone) is 4 pointers,
- // i.e. 32 bytes on 64-bit platforms and 16 bytes on 32-bit platforms.
- size_t Granularity = 1ULL << Mapping.Scale;
- size_t MinHeaderSize = std::max((size_t)ASan.LongSize / 2, Granularity);
- const ASanStackFrameLayout &L =
- ComputeASanStackFrameLayout(SVD, Granularity, MinHeaderSize);
-
- // Build AllocaToSVDMap for ASanStackVariableDescription lookup.
- DenseMap<const AllocaInst *, ASanStackVariableDescription *> AllocaToSVDMap;
- for (auto &Desc : SVD)
- AllocaToSVDMap[Desc.AI] = &Desc;
-
- // Update SVD with information from lifetime intrinsics.
- for (const auto &APC : StaticAllocaPoisonCallVec) {
- assert(APC.InsBefore);
- assert(APC.AI);
- assert(ASan.isInterestingAlloca(*APC.AI));
- assert(APC.AI->isStaticAlloca());
-
- ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
- Desc.LifetimeSize = Desc.Size;
- if (const DILocation *FnLoc = EntryDebugLocation.get()) {
- if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
- if (LifetimeLoc->getFile() == FnLoc->getFile())
- if (unsigned Line = LifetimeLoc->getLine())
- Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
- }
- }
- }
-
- auto DescriptionString = ComputeASanStackFrameDescription(SVD);
- LLVM_DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
- uint64_t LocalStackSize = L.FrameSize;
- bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
- LocalStackSize <= kMaxStackMallocSize;
- bool DoDynamicAlloca = ClDynamicAllocaStack;
- // Don't do dynamic alloca or stack malloc if:
- // 1) There is inline asm: too often it makes assumptions on which registers
- // are available.
- // 2) There is a returns_twice call (typically setjmp), which is
- // optimization-hostile, and doesn't play well with introduced indirect
- // register-relative calculation of local variable addresses.
- DoDynamicAlloca &= !HasInlineAsm && !HasReturnsTwiceCall;
- DoStackMalloc &= !HasInlineAsm && !HasReturnsTwiceCall;
-
- Value *StaticAlloca =
- DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
-
- Value *FakeStack;
- Value *LocalStackBase;
- Value *LocalStackBaseAlloca;
- uint8_t DIExprFlags = DIExpression::ApplyOffset;
-
- if (DoStackMalloc) {
- LocalStackBaseAlloca =
- IRB.CreateAlloca(IntptrTy, nullptr, "asan_local_stack_base");
- // void *FakeStack = __asan_option_detect_stack_use_after_return
- // ? __asan_stack_malloc_N(LocalStackSize)
- // : nullptr;
- // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
- Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
- kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
- Value *UseAfterReturnIsEnabled = IRB.CreateICmpNE(
- IRB.CreateLoad(IRB.getInt32Ty(), OptionDetectUseAfterReturn),
- Constant::getNullValue(IRB.getInt32Ty()));
- Instruction *Term =
- SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
- IRBuilder<> IRBIf(Term);
- StackMallocIdx = StackMallocSizeClass(LocalStackSize);
- assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
- Value *FakeStackValue =
- IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
- ConstantInt::get(IntptrTy, LocalStackSize));
- IRB.SetInsertPoint(InsBefore);
- FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
- ConstantInt::get(IntptrTy, 0));
-
- Value *NoFakeStack =
- IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
- Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
- IRBIf.SetInsertPoint(Term);
- Value *AllocaValue =
- DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
-
- IRB.SetInsertPoint(InsBefore);
- LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
- IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca);
- DIExprFlags |= DIExpression::DerefBefore;
- } else {
- // void *FakeStack = nullptr;
- // void *LocalStackBase = alloca(LocalStackSize);
- FakeStack = ConstantInt::get(IntptrTy, 0);
- LocalStackBase =
- DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
- LocalStackBaseAlloca = LocalStackBase;
- }
-
- // It shouldn't matter whether we pass an `alloca` or a `ptrtoint` as the
- // dbg.declare address operand, but passing a `ptrtoint` seems to confuse
- // later passes and can result in dropped variable coverage in debug info.
- Value *LocalStackBaseAllocaPtr =
- isa<PtrToIntInst>(LocalStackBaseAlloca)
- ? cast<PtrToIntInst>(LocalStackBaseAlloca)->getPointerOperand()
- : LocalStackBaseAlloca;
- assert(isa<AllocaInst>(LocalStackBaseAllocaPtr) &&
- "Variable descriptions relative to ASan stack base will be dropped");
-
- // Replace Alloca instructions with base+offset.
- for (const auto &Desc : SVD) {
- AllocaInst *AI = Desc.AI;
- replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
- Desc.Offset);
- Value *NewAllocaPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
- AI->getType());
- AI->replaceAllUsesWith(NewAllocaPtr);
- }
-
- // The left-most redzone has enough space for at least 4 pointers.
- // Write the Magic value to redzone[0].
- Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
- IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
- BasePlus0);
- // Write the frame description constant to redzone[1].
- Value *BasePlus1 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
- IntptrPtrTy);
- GlobalVariable *StackDescriptionGlobal =
- createPrivateGlobalForString(*F.getParent(), DescriptionString,
- /*AllowMerging*/ true, kAsanGenPrefix);
- Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
- IRB.CreateStore(Description, BasePlus1);
- // Write the PC to redzone[2].
- Value *BasePlus2 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
- IntptrPtrTy);
- IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
-
- const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);
-
- // Poison the stack red zones at the entry.
- Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
- // As the mask we must use the most-poisoned case: red zones plus after-scope
- // poisoning. As the bytes we can use either the same or just the red zones.
- copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);
-
- if (!StaticAllocaPoisonCallVec.empty()) {
- const auto &ShadowInScope = GetShadowBytes(SVD, L);
-
- // Poison static allocas near lifetime intrinsics.
- for (const auto &APC : StaticAllocaPoisonCallVec) {
- const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
- assert(Desc.Offset % L.Granularity == 0);
- size_t Begin = Desc.Offset / L.Granularity;
- size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity;
-
- IRBuilder<> IRB(APC.InsBefore);
- copyToShadow(ShadowAfterScope,
- APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End,
- IRB, ShadowBase);
- }
- }
-
- SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
- SmallVector<uint8_t, 64> ShadowAfterReturn;
-
- // (Un)poison the stack before all ret instructions.
+
+ Instruction *InsBefore = AllocaVec[0];
+ IRBuilder<> IRB(InsBefore);
+
+ // Make sure non-instrumented allocas stay in the entry block. Otherwise,
+ // debug info is broken, because only entry-block allocas are treated as
+ // regular stack slots.
+ auto InsBeforeB = InsBefore->getParent();
+ assert(InsBeforeB == &F.getEntryBlock());
+ for (auto *AI : StaticAllocasToMoveUp)
+ if (AI->getParent() == InsBeforeB)
+ AI->moveBefore(InsBefore);
+
+ // Move stores of arguments into entry-block allocas as well. This prevents
+ // extra stack slots from being generated (to house the argument values until
+ // they can be stored into the allocas). This also prevents uninitialized
+ // values from being shown in backtraces.
+ SmallVector<Instruction *, 8> ArgInitInsts;
+ findStoresToUninstrumentedArgAllocas(ASan, *InsBefore, ArgInitInsts);
+ for (Instruction *ArgInitInst : ArgInitInsts)
+ ArgInitInst->moveBefore(InsBefore);
+
+ // If we have a call to llvm.localescape, keep it in the entry block.
+ if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
+
+ SmallVector<ASanStackVariableDescription, 16> SVD;
+ SVD.reserve(AllocaVec.size());
+ for (AllocaInst *AI : AllocaVec) {
+ ASanStackVariableDescription D = {AI->getName().data(),
+ ASan.getAllocaSizeInBytes(*AI),
+ 0,
+ AI->getAlignment(),
+ AI,
+ 0,
+ 0};
+ SVD.push_back(D);
+ }
+
+ // Minimal header size (left redzone) is 4 pointers,
+ // i.e. 32 bytes on 64-bit platforms and 16 bytes on 32-bit platforms.
+ size_t Granularity = 1ULL << Mapping.Scale;
+ size_t MinHeaderSize = std::max((size_t)ASan.LongSize / 2, Granularity);
+ const ASanStackFrameLayout &L =
+ ComputeASanStackFrameLayout(SVD, Granularity, MinHeaderSize);
+
+ // Build AllocaToSVDMap for ASanStackVariableDescription lookup.
+ DenseMap<const AllocaInst *, ASanStackVariableDescription *> AllocaToSVDMap;
+ for (auto &Desc : SVD)
+ AllocaToSVDMap[Desc.AI] = &Desc;
+
+ // Update SVD with information from lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ assert(APC.AI->isStaticAlloca());
+
+ ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ Desc.LifetimeSize = Desc.Size;
+ if (const DILocation *FnLoc = EntryDebugLocation.get()) {
+ if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
+ if (LifetimeLoc->getFile() == FnLoc->getFile())
+ if (unsigned Line = LifetimeLoc->getLine())
+ Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
+ }
+ }
+ }
+
+ auto DescriptionString = ComputeASanStackFrameDescription(SVD);
+ LLVM_DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
+ uint64_t LocalStackSize = L.FrameSize;
+ bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
+ LocalStackSize <= kMaxStackMallocSize;
+ bool DoDynamicAlloca = ClDynamicAllocaStack;
+ // Don't do dynamic alloca or stack malloc if:
+ // 1) There is inline asm: too often it makes assumptions on which registers
+ // are available.
+ // 2) There is a returns_twice call (typically setjmp), which is
+ // optimization-hostile, and doesn't play well with introduced indirect
+ // register-relative calculation of local variable addresses.
+ DoDynamicAlloca &= !HasInlineAsm && !HasReturnsTwiceCall;
+ DoStackMalloc &= !HasInlineAsm && !HasReturnsTwiceCall;
+
+ Value *StaticAlloca =
+ DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
+
+ Value *FakeStack;
+ Value *LocalStackBase;
+ Value *LocalStackBaseAlloca;
+ uint8_t DIExprFlags = DIExpression::ApplyOffset;
+
+ if (DoStackMalloc) {
+ LocalStackBaseAlloca =
+ IRB.CreateAlloca(IntptrTy, nullptr, "asan_local_stack_base");
+ // void *FakeStack = __asan_option_detect_stack_use_after_return
+ // ? __asan_stack_malloc_N(LocalStackSize)
+ // : nullptr;
+ // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
+ Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
+ kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
+ Value *UseAfterReturnIsEnabled = IRB.CreateICmpNE(
+ IRB.CreateLoad(IRB.getInt32Ty(), OptionDetectUseAfterReturn),
+ Constant::getNullValue(IRB.getInt32Ty()));
+ Instruction *Term =
+ SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
+ IRBuilder<> IRBIf(Term);
+ StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+ assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+ Value *FakeStackValue =
+ IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
+ ConstantInt::get(IntptrTy, LocalStackSize));
+ IRB.SetInsertPoint(InsBefore);
+ FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
+ ConstantInt::get(IntptrTy, 0));
+
+ Value *NoFakeStack =
+ IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
+ Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
+ IRBIf.SetInsertPoint(Term);
+ Value *AllocaValue =
+ DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
+
+ IRB.SetInsertPoint(InsBefore);
+ LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
+ IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca);
+ DIExprFlags |= DIExpression::DerefBefore;
+ } else {
+ // void *FakeStack = nullptr;
+ // void *LocalStackBase = alloca(LocalStackSize);
+ FakeStack = ConstantInt::get(IntptrTy, 0);
+ LocalStackBase =
+ DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
+ LocalStackBaseAlloca = LocalStackBase;
+ }
+
+ // It shouldn't matter whether we pass an `alloca` or a `ptrtoint` as the
+ // dbg.declare address operand, but passing a `ptrtoint` seems to confuse
+ // later passes and can result in dropped variable coverage in debug info.
+ Value *LocalStackBaseAllocaPtr =
+ isa<PtrToIntInst>(LocalStackBaseAlloca)
+ ? cast<PtrToIntInst>(LocalStackBaseAlloca)->getPointerOperand()
+ : LocalStackBaseAlloca;
+ assert(isa<AllocaInst>(LocalStackBaseAllocaPtr) &&
+ "Variable descriptions relative to ASan stack base will be dropped");
+
+ // Replace Alloca instructions with base+offset.
+ for (const auto &Desc : SVD) {
+ AllocaInst *AI = Desc.AI;
+ replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
+ Desc.Offset);
+ Value *NewAllocaPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
+ AI->getType());
+ AI->replaceAllUsesWith(NewAllocaPtr);
+ }
+
+ // The left-most redzone has enough space for at least 4 pointers.
+ // Write the Magic value to redzone[0].
+ Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
+ IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
+ BasePlus0);
+ // Write the frame description constant to redzone[1].
+ Value *BasePlus1 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
+ IntptrPtrTy);
+ GlobalVariable *StackDescriptionGlobal =
+ createPrivateGlobalForString(*F.getParent(), DescriptionString,
+ /*AllowMerging*/ true, kAsanGenPrefix);
+ Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
+ IRB.CreateStore(Description, BasePlus1);
+ // Write the PC to redzone[2].
+ Value *BasePlus2 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
+ IntptrPtrTy);
+ IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
+
+ const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);
+
+ // Poison the stack red zones at the entry.
+ Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
+ // As the mask we must use the most-poisoned case: red zones plus after-scope
+ // poisoning. As the bytes we can use either the same or just the red zones.
+ copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);
+
+ if (!StaticAllocaPoisonCallVec.empty()) {
+ const auto &ShadowInScope = GetShadowBytes(SVD, L);
+
+ // Poison static allocas near lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ assert(Desc.Offset % L.Granularity == 0);
+ size_t Begin = Desc.Offset / L.Granularity;
+ size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity;
+
+ IRBuilder<> IRB(APC.InsBefore);
+ copyToShadow(ShadowAfterScope,
+ APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End,
+ IRB, ShadowBase);
+ }
+ }
+
+ SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
+ SmallVector<uint8_t, 64> ShadowAfterReturn;
+
+ // (Un)poison the stack before all ret instructions.
for (Instruction *Ret : RetVec) {
- IRBuilder<> IRBRet(Ret);
- // Mark the current frame as retired.
- IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
- BasePlus0);
- if (DoStackMalloc) {
- assert(StackMallocIdx >= 0);
- // if FakeStack != 0 // LocalStackBase == FakeStack
- // // In use-after-return mode, poison the whole stack frame.
- // if StackMallocIdx <= 4
- // // For small sizes inline the whole thing:
- // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
- // **SavedFlagPtr(FakeStack) = 0
- // else
- // __asan_stack_free_N(FakeStack, LocalStackSize)
- // else
- // <This is not a fake stack; unpoison the redzones>
- Value *Cmp =
- IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
- Instruction *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
-
- IRBuilder<> IRBPoison(ThenTerm);
- if (StackMallocIdx <= 4) {
- int ClassSize = kMinStackMallocSize << StackMallocIdx;
- ShadowAfterReturn.resize(ClassSize / L.Granularity,
- kAsanStackUseAfterReturnMagic);
- copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
- ShadowBase);
- Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
- FakeStack,
- ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
- Value *SavedFlagPtr = IRBPoison.CreateLoad(
- IntptrTy, IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
- IRBPoison.CreateStore(
- Constant::getNullValue(IRBPoison.getInt8Ty()),
- IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
- } else {
- // For larger frames call __asan_stack_free_*.
- IRBPoison.CreateCall(
- AsanStackFreeFunc[StackMallocIdx],
- {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)});
- }
-
- IRBuilder<> IRBElse(ElseTerm);
- copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase);
- } else {
- copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase);
- }
- }
-
- // We are done. Remove the old unused alloca instructions.
- for (auto AI : AllocaVec) AI->eraseFromParent();
-}
-
-void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
- IRBuilder<> &IRB, bool DoPoison) {
- // For now just insert the call to ASan runtime.
- Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
- Value *SizeArg = ConstantInt::get(IntptrTy, Size);
- IRB.CreateCall(
- DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc,
- {AddrArg, SizeArg});
-}
-
-// Handling llvm.lifetime intrinsics for a given %alloca:
-// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
-// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
-// invalid accesses) and unpoison it for llvm.lifetime.start (the memory
-// could be poisoned by previous llvm.lifetime.end instruction, as the
-// variable may go in and out of scope several times, e.g. in loops).
-// (3) if we poisoned at least one %alloca in a function,
-// unpoison the whole stack frame at function exit.
-void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
- IRBuilder<> IRB(AI);
-
- const unsigned Alignment = std::max(kAllocaRzSize, AI->getAlignment());
- const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
-
- Value *Zero = Constant::getNullValue(IntptrTy);
- Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);
- Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
-
- // Since we need to extend the alloca with additional memory to place the
- // redzones, and OldSize is the number of allocated elements of ElementSize
- // bytes each, compute the allocated memory size in bytes as
- // OldSize * ElementSize.
- const unsigned ElementSize =
- F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
- Value *OldSize =
- IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false),
- ConstantInt::get(IntptrTy, ElementSize));
-
- // PartialSize = OldSize % 32
- Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask);
-
- // Misalign = kAllocaRzSize - PartialSize;
- Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize);
-
- // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0;
- Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
- Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
-
- // AdditionalChunkSize = Alignment + PartialPadding + kAllocaRzSize
- // Alignment is added for the left redzone, PartialPadding for a possible
- // partial redzone, and kAllocaRzSize for the right redzone.
- Value *AdditionalChunkSize = IRB.CreateAdd(
- ConstantInt::get(IntptrTy, Alignment + kAllocaRzSize), PartialPadding);
-
- Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
-
- // Insert new alloca with new NewSize and Alignment params.
- AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
- NewAlloca->setAlignment(Align(Alignment));
-
- // NewAddress = Address + Alignment
- Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
- ConstantInt::get(IntptrTy, Alignment));
-
- // Insert an __asan_alloca_poison call for the newly created alloca.
- IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});
-
- // Store the last alloca's address to DynamicAllocaLayout. We'll need it
- // later to unpoison the dynamic allocas.
- IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy), DynamicAllocaLayout);
-
- Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
-
- // Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
- AI->replaceAllUsesWith(NewAddressPtr);
-
- // We are done. Erase old alloca from parent.
- AI->eraseFromParent();
-}
-
-// isSafeAccess returns true if Addr is always inbounds with respect to its
-// base object. For example, it is a field access or an array access with
-// constant inbounds index.
-bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
- Value *Addr, uint64_t TypeSize) const {
- SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
- if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
- uint64_t Size = SizeOffset.first.getZExtValue();
- int64_t Offset = SizeOffset.second.getSExtValue();
- // Three checks are required to ensure safety:
- // . Offset >= 0 (since the offset is given from the base ptr)
- // . Size >= Offset (unsigned)
- // . Size - Offset >= NeededSize (unsigned)
- return Offset >= 0 && Size >= uint64_t(Offset) &&
- Size - uint64_t(Offset) >= TypeSize / 8;
-}
+ IRBuilder<> IRBRet(Ret);
+ // Mark the current frame as retired.
+ IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
+ BasePlus0);
+ if (DoStackMalloc) {
+ assert(StackMallocIdx >= 0);
+ // if FakeStack != 0 // LocalStackBase == FakeStack
+ // // In use-after-return mode, poison the whole stack frame.
+ // if StackMallocIdx <= 4
+ // // For small sizes inline the whole thing:
+ // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
+ // **SavedFlagPtr(FakeStack) = 0
+ // else
+ // __asan_stack_free_N(FakeStack, LocalStackSize)
+ // else
+ // <This is not a fake stack; unpoison the redzones>
+ Value *Cmp =
+ IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
+
+ IRBuilder<> IRBPoison(ThenTerm);
+ if (StackMallocIdx <= 4) {
+ int ClassSize = kMinStackMallocSize << StackMallocIdx;
+ ShadowAfterReturn.resize(ClassSize / L.Granularity,
+ kAsanStackUseAfterReturnMagic);
+ copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
+ ShadowBase);
+ Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
+ FakeStack,
+ ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
+ Value *SavedFlagPtr = IRBPoison.CreateLoad(
+ IntptrTy, IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
+ IRBPoison.CreateStore(
+ Constant::getNullValue(IRBPoison.getInt8Ty()),
+ IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
+ } else {
+ // For larger frames call __asan_stack_free_*.
+ IRBPoison.CreateCall(
+ AsanStackFreeFunc[StackMallocIdx],
+ {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)});
+ }
+
+ IRBuilder<> IRBElse(ElseTerm);
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase);
+ } else {
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase);
+ }
+ }
+
+ // We are done. Remove the old unused alloca instructions.
+ for (auto AI : AllocaVec) AI->eraseFromParent();
+}
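+
+// Resulting frame layout (editorial sketch): starting at LocalStackBase the
+// left redzone holds at least four pointer-sized slots,
+//   slot 0: kCurrentStackFrameMagic (rewritten to kRetiredStackFrameMagic on
+//           every return)
+//   slot 1: pointer to the frame description string
+//   slot 2: address of the enclosing function
+// followed by the redzone-separated locals at the offsets recorded in SVD.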
+
+void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
+ IRBuilder<> &IRB, bool DoPoison) {
+ // For now just insert the call to ASan runtime.
+ Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
+ Value *SizeArg = ConstantInt::get(IntptrTy, Size);
+ IRB.CreateCall(
+ DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc,
+ {AddrArg, SizeArg});
+}
+
+// Handling llvm.lifetime intrinsics for a given %alloca:
+// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
+// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
+// invalid accesses) and unpoison it for llvm.lifetime.start (the memory
+// could be poisoned by previous llvm.lifetime.end instruction, as the
+// variable may go in and out of scope several times, e.g. in loops).
+// (3) if we poisoned at least one %alloca in a function,
+// unpoison the whole stack frame at function exit.
+void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
+ IRBuilder<> IRB(AI);
+
+ const unsigned Alignment = std::max(kAllocaRzSize, AI->getAlignment());
+ const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
+
+ Value *Zero = Constant::getNullValue(IntptrTy);
+ Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);
+ Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
+
+ // Since we need to extend the alloca with additional memory to place the
+ // redzones, and OldSize is the number of allocated elements of ElementSize
+ // bytes each, compute the allocated memory size in bytes as
+ // OldSize * ElementSize.
+ const unsigned ElementSize =
+ F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
+ Value *OldSize =
+ IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false),
+ ConstantInt::get(IntptrTy, ElementSize));
+
+ // PartialSize = OldSize % 32
+ Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask);
+
+ // Misalign = kAllocaRzSize - PartialSize;
+ Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize);
+
+ // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0;
+ Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
+ Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
+
+ // AdditionalChunkSize = Alignment + PartialPadding + kAllocaRzSize
+ // Alignment is added for the left redzone, PartialPadding for a possible
+ // partial redzone, and kAllocaRzSize for the right redzone.
+ Value *AdditionalChunkSize = IRB.CreateAdd(
+ ConstantInt::get(IntptrTy, Alignment + kAllocaRzSize), PartialPadding);
+
+ Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
+
+ // Insert new alloca with new NewSize and Alignment params.
+ AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
+ NewAlloca->setAlignment(Align(Alignment));
+
+ // NewAddress = Address + Alignment
+ Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
+ ConstantInt::get(IntptrTy, Alignment));
+
+ // Insert an __asan_alloca_poison call for the newly created alloca.
+ IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});
+
+ // Store the last alloca's address to DynamicAllocaLayout. We'll need it
+ // later to unpoison the dynamic allocas.
+ IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy), DynamicAllocaLayout);
+
+ Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
+
+ // Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
+ AI->replaceAllUsesWith(NewAddressPtr);
+
+ // We are done. Erase old alloca from parent.
+ AI->eraseFromParent();
+}
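+
+// Worked example (editorial sketch, assuming kAllocaRzSize is 32): for a
+// dynamic alloca of 40 bytes with Alignment = 32,
+//   PartialSize = 40 & 31 = 8, Misalign = 32 - 8 = 24, PartialPadding = 24,
+//   AdditionalChunkSize = 32 + 32 + 24 = 88, NewSize = 128,
+// and the returned address is NewAlloca + 32, leaving a 32-byte left redzone,
+// a 24-byte partial redzone and a 32-byte right redzone around the 40 bytes.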
+
+// isSafeAccess returns true if Addr is always inbounds with respect to its
+// base object. For example, it is a field access or an array access with
+// constant inbounds index.
+bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
+ Value *Addr, uint64_t TypeSize) const {
+ SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
+ if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
+ uint64_t Size = SizeOffset.first.getZExtValue();
+ int64_t Offset = SizeOffset.second.getSExtValue();
+ // Three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+ // . Size - Offset >= NeededSize (unsigned)
+ return Offset >= 0 && Size >= uint64_t(Offset) &&
+ Size - uint64_t(Offset) >= TypeSize / 8;
+}
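+
+// Worked example (editorial sketch): for a field access where the object size
+// is known to be 16 bytes, the constant offset is 4 and TypeSize is 64 bits
+// (so NeededSize = 8), the three checks become 4 >= 0, 16 >= 4 and
+// 16 - 4 >= 8, all true, so the access is statically in bounds and needs no
+// runtime check.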
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp
index c2d9964ecc..efb11b68a1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -1,254 +1,254 @@
-//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "bounds-checking"
-
-static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
- cl::desc("Use one trap block per function"));
-
-STATISTIC(ChecksAdded, "Bounds checks added");
-STATISTIC(ChecksSkipped, "Bounds checks skipped");
-STATISTIC(ChecksUnable, "Bounds checks unable to add");
-
-using BuilderTy = IRBuilder<TargetFolder>;
-
-/// Gets the conditions under which memory accessing instructions will overflow.
-///
-/// \p Ptr is the pointer that will be read/written, and \p InstVal is either
-/// the result from the load or the value being stored. It is used to determine
-/// the size of memory block that is touched.
-///
-/// Returns the condition under which the access will overflow.
-static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
- const DataLayout &DL, TargetLibraryInfo &TLI,
- ObjectSizeOffsetEvaluator &ObjSizeEval,
- BuilderTy &IRB, ScalarEvolution &SE) {
- uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
- LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
- << " bytes\n");
-
- SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
-
- if (!ObjSizeEval.bothKnown(SizeOffset)) {
- ++ChecksUnable;
- return nullptr;
- }
-
- Value *Size = SizeOffset.first;
- Value *Offset = SizeOffset.second;
- ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
-
- Type *IntTy = DL.getIntPtrType(Ptr->getType());
- Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
-
- auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
- auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
- auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));
-
- // three checks are required to ensure safety:
- // . Offset >= 0 (since the offset is given from the base ptr)
- // . Size >= Offset (unsigned)
- // . Size - Offset >= NeededSize (unsigned)
- //
- // optimization: if Size >= 0 (signed), skip 1st check
- // FIXME: add NSW/NUW here? -- we dont care if the subtraction overflows
- Value *ObjSize = IRB.CreateSub(Size, Offset);
- Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
- ? ConstantInt::getFalse(Ptr->getContext())
- : IRB.CreateICmpULT(Size, Offset);
- Value *Cmp3 = SizeRange.sub(OffsetRange)
- .getUnsignedMin()
- .uge(NeededSizeRange.getUnsignedMax())
- ? ConstantInt::getFalse(Ptr->getContext())
- : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
- Value *Or = IRB.CreateOr(Cmp2, Cmp3);
- if ((!SizeCI || SizeCI->getValue().slt(0)) &&
- !SizeRange.getSignedMin().isNonNegative()) {
- Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
- Or = IRB.CreateOr(Cmp1, Or);
- }
-
- return Or;
-}
-
-/// Adds run-time bounds checks to memory accessing instructions.
-///
-/// \p Or is the condition that should guard the trap.
-///
-/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
-template <typename GetTrapBBT>
-static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
- // check if the comparison is always false
- ConstantInt *C = dyn_cast_or_null<ConstantInt>(Or);
- if (C) {
- ++ChecksSkipped;
- // If the condition folds to constant false (zero), the check can never fire; nothing to do.
- if (!C->getZExtValue())
- return;
- }
- ++ChecksAdded;
-
- BasicBlock::iterator SplitI = IRB.GetInsertPoint();
- BasicBlock *OldBB = SplitI->getParent();
- BasicBlock *Cont = OldBB->splitBasicBlock(SplitI);
- OldBB->getTerminator()->eraseFromParent();
-
- if (C) {
- // If we have a constant zero, unconditionally branch.
- // FIXME: We should really handle this differently to bypass splitting
- // the block.
- BranchInst::Create(GetTrapBB(IRB), OldBB);
- return;
- }
-
- // Create the conditional branch.
- BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB);
-}
-
-static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
- ScalarEvolution &SE) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- ObjectSizeOpts EvalOpts;
- EvalOpts.RoundToAlign = true;
- ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(), EvalOpts);
-
- // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
- // touching instructions
- SmallVector<std::pair<Instruction *, Value *>, 4> TrapInfo;
- for (Instruction &I : instructions(F)) {
- Value *Or = nullptr;
- BuilderTy IRB(I.getParent(), BasicBlock::iterator(&I), TargetFolder(DL));
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isVolatile())
- Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI,
- ObjSizeEval, IRB, SE);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isVolatile())
- Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
- } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
- if (!AI->isVolatile())
- Or =
- getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
- } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
- if (!AI->isVolatile())
- Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
- }
- if (Or)
- TrapInfo.push_back(std::make_pair(&I, Or));
- }
-
- // Create a trapping basic block on demand using a callback. Depending on
- // flags, this will either create a single block for the entire function or
- // will create a fresh block every time it is called.
- BasicBlock *TrapBB = nullptr;
- auto GetTrapBB = [&TrapBB](BuilderTy &IRB) {
- if (TrapBB && SingleTrapBB)
- return TrapBB;
-
- Function *Fn = IRB.GetInsertBlock()->getParent();
- // FIXME: This debug location doesn't make a lot of sense in the
- // `SingleTrapBB` case.
- auto DebugLoc = IRB.getCurrentDebugLocation();
- IRBuilder<>::InsertPointGuard Guard(IRB);
- TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
- IRB.SetInsertPoint(TrapBB);
-
- auto *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
- CallInst *TrapCall = IRB.CreateCall(F, {});
- TrapCall->setDoesNotReturn();
- TrapCall->setDoesNotThrow();
- TrapCall->setDebugLoc(DebugLoc);
- IRB.CreateUnreachable();
-
- return TrapBB;
- };
-
- // Add the checks.
- for (const auto &Entry : TrapInfo) {
- Instruction *Inst = Entry.first;
- BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
- insertBoundsCheck(Entry.second, IRB, GetTrapBB);
- }
-
- return !TrapInfo.empty();
-}
-
-PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
-
- if (!addBoundsChecking(F, TLI, SE))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-namespace {
-struct BoundsCheckingLegacyPass : public FunctionPass {
- static char ID;
-
- BoundsCheckingLegacyPass() : FunctionPass(ID) {
- initializeBoundsCheckingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- return addBoundsChecking(F, TLI, SE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-};
-} // namespace
-
-char BoundsCheckingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(BoundsCheckingLegacyPass, "bounds-checking",
- "Run-time bounds checking", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(BoundsCheckingLegacyPass, "bounds-checking",
- "Run-time bounds checking", false, false)
-
-FunctionPass *llvm::createBoundsCheckingLegacyPass() {
- return new BoundsCheckingLegacyPass();
-}
+//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bounds-checking"
+
+static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
+ cl::desc("Use one trap block per function"));
+
+STATISTIC(ChecksAdded, "Bounds checks added");
+STATISTIC(ChecksSkipped, "Bounds checks skipped");
+STATISTIC(ChecksUnable, "Bounds checks unable to add");
+
+using BuilderTy = IRBuilder<TargetFolder>;
+
+/// Gets the conditions under which memory accessing instructions will overflow.
+///
+/// \p Ptr is the pointer that will be read/written, and \p InstVal is either
+/// the result from the load or the value being stored. It is used to determine
+/// the size of memory block that is touched.
+///
+/// Returns the condition under which the access will overflow.
+static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
+ const DataLayout &DL, TargetLibraryInfo &TLI,
+ ObjectSizeOffsetEvaluator &ObjSizeEval,
+ BuilderTy &IRB, ScalarEvolution &SE) {
+ uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
+ LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+ << " bytes\n");
+
+ SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
+
+ if (!ObjSizeEval.bothKnown(SizeOffset)) {
+ ++ChecksUnable;
+ return nullptr;
+ }
+
+ Value *Size = SizeOffset.first;
+ Value *Offset = SizeOffset.second;
+ ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
+
+ Type *IntTy = DL.getIntPtrType(Ptr->getType());
+ Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+
+ auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
+ auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
+ auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));
+
+ // three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+ // . Size - Offset >= NeededSize (unsigned)
+ //
+ // optimization: if Size >= 0 (signed), skip 1st check
+ // FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows
+ Value *ObjSize = IRB.CreateSub(Size, Offset);
+ Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(Size, Offset);
+ Value *Cmp3 = SizeRange.sub(OffsetRange)
+ .getUnsignedMin()
+ .uge(NeededSizeRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
+ Value *Or = IRB.CreateOr(Cmp2, Cmp3);
+ if ((!SizeCI || SizeCI->getValue().slt(0)) &&
+ !SizeRange.getSignedMin().isNonNegative()) {
+ Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
+ Or = IRB.CreateOr(Cmp1, Or);
+ }
+
+ return Or;
+}
+
+/// Adds run-time bounds checks to memory accessing instructions.
+///
+/// \p Or is the condition that should guard the trap.
+///
+/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
+template <typename GetTrapBBT>
+static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
+ // check if the comparison is always false
+ ConstantInt *C = dyn_cast_or_null<ConstantInt>(Or);
+ if (C) {
+ ++ChecksSkipped;
+ // If the condition folds to constant false (zero), the check can never fire; nothing to do.
+ if (!C->getZExtValue())
+ return;
+ }
+ ++ChecksAdded;
+
+ BasicBlock::iterator SplitI = IRB.GetInsertPoint();
+ BasicBlock *OldBB = SplitI->getParent();
+ BasicBlock *Cont = OldBB->splitBasicBlock(SplitI);
+ OldBB->getTerminator()->eraseFromParent();
+
+ if (C) {
+ // If we have a constant zero, unconditionally branch.
+ // FIXME: We should really handle this differently to bypass splitting
+ // the block.
+ BranchInst::Create(GetTrapBB(IRB), OldBB);
+ return;
+ }
+
+ // Create the conditional branch.
+ BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB);
+}
+
+static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
+ ScalarEvolution &SE) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ ObjectSizeOpts EvalOpts;
+ EvalOpts.RoundToAlign = true;
+ ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(), EvalOpts);
+
+ // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
+ // touching instructions
+ SmallVector<std::pair<Instruction *, Value *>, 4> TrapInfo;
+ for (Instruction &I : instructions(F)) {
+ Value *Or = nullptr;
+ BuilderTy IRB(I.getParent(), BasicBlock::iterator(&I), TargetFolder(DL));
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isVolatile())
+ Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI,
+ ObjSizeEval, IRB, SE);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isVolatile())
+ Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
+ } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+ if (!AI->isVolatile())
+ Or =
+ getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
+ } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
+ if (!AI->isVolatile())
+ Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
+ }
+ if (Or)
+ TrapInfo.push_back(std::make_pair(&I, Or));
+ }
+
+ // Create a trapping basic block on demand using a callback. Depending on
+ // flags, this will either create a single block for the entire function or
+ // will create a fresh block every time it is called.
+ BasicBlock *TrapBB = nullptr;
+ auto GetTrapBB = [&TrapBB](BuilderTy &IRB) {
+ if (TrapBB && SingleTrapBB)
+ return TrapBB;
+
+ Function *Fn = IRB.GetInsertBlock()->getParent();
+ // FIXME: This debug location doesn't make a lot of sense in the
+ // `SingleTrapBB` case.
+ auto DebugLoc = IRB.getCurrentDebugLocation();
+ IRBuilder<>::InsertPointGuard Guard(IRB);
+ TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
+ IRB.SetInsertPoint(TrapBB);
+
+ auto *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
+ CallInst *TrapCall = IRB.CreateCall(F, {});
+ TrapCall->setDoesNotReturn();
+ TrapCall->setDoesNotThrow();
+ TrapCall->setDebugLoc(DebugLoc);
+ IRB.CreateUnreachable();
+
+ return TrapBB;
+ };
+
+ // Add the checks.
+ for (const auto &Entry : TrapInfo) {
+ Instruction *Inst = Entry.first;
+ BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
+ insertBoundsCheck(Entry.second, IRB, GetTrapBB);
+ }
+
+ return !TrapInfo.empty();
+}
+
+PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+
+ if (!addBoundsChecking(F, TLI, SE))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct BoundsCheckingLegacyPass : public FunctionPass {
+ static char ID;
+
+ BoundsCheckingLegacyPass() : FunctionPass(ID) {
+ initializeBoundsCheckingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return addBoundsChecking(F, TLI, SE);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+};
+} // namespace
+
+char BoundsCheckingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BoundsCheckingLegacyPass, "bounds-checking",
+ "Run-time bounds checking", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(BoundsCheckingLegacyPass, "bounds-checking",
+ "Run-time bounds checking", false, false)
+
+FunctionPass *llvm::createBoundsCheckingLegacyPass() {
+ return new BoundsCheckingLegacyPass();
+}
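The overflow condition assembled in getBoundsCheckCond above is easiest to read as plain integer arithmetic. The standalone C++ sketch below is illustrative only: it is not part of the pass, the function and variable names are invented, and it simply mirrors the three comparisons the pass emits as IR (the pass additionally uses ScalarEvolution ranges to fold comparisons it can prove are always false).

#include <cstdint>
#include <iostream>

// True when an access of NeededSize bytes at byte offset Offset into an
// object of Size bytes is out of bounds -- the condition guarded by a branch
// to the trap block.
static bool wouldTrap(int64_t Size, int64_t Offset, uint64_t NeededSize) {
  bool Cmp1 = Offset < 0;                                 // violates Offset >= 0
  bool Cmp2 = static_cast<uint64_t>(Size) <
              static_cast<uint64_t>(Offset);              // violates Size >= Offset (unsigned)
  bool Cmp3 = static_cast<uint64_t>(Size) -
              static_cast<uint64_t>(Offset) < NeededSize; // violates Size - Offset >= NeededSize
  return Cmp1 || Cmp2 || Cmp3;
}

int main() {
  std::cout << wouldTrap(16, 8, 8) << '\n';  // 0: an 8-byte access at offset 8 of 16 bytes fits
  std::cout << wouldTrap(16, 12, 8) << '\n'; // 1: 12 + 8 > 16, the pass would branch to "trap"
  std::cout << wouldTrap(16, -4, 4) << '\n'; // 1: negative offset from the base pointer
}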
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h
index 9de6edaadf..6580b6d7d7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h
@@ -1,303 +1,303 @@
-//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a Union-find algorithm to compute Minimum Spanning Tree
-// for a given CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
-#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "cfgmst"
-
-using namespace llvm;
-
-namespace llvm {
-
- /// A union-find based Minimum Spanning Tree for CFG
-///
-/// Implements a Union-find algorithm to compute Minimum Spanning Tree
-/// for a given CFG.
-template <class Edge, class BBInfo> class CFGMST {
-public:
- Function &F;
-
- // Store all the edges in CFG. It may contain some stale edges
- // when Removed is set.
- std::vector<std::unique_ptr<Edge>> AllEdges;
-
- // This map records the auxiliary information for each BB.
- DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos;
-
- // Whether the function has an exit block with no successors.
- // (For a function with an infinite loop, this block may be absent.)
- bool ExitBlockFound = false;
-
- // Find the root group of G and compress the path from G to the root.
- BBInfo *findAndCompressGroup(BBInfo *G) {
- if (G->Group != G)
- G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group));
- return static_cast<BBInfo *>(G->Group);
- }
-
- // Union BB1 and BB2 into the same group and return true.
- // Returns false if BB1 and BB2 are already in the same group.
- bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
- BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
- BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
-
- if (BB1G == BB2G)
- return false;
-
- // Make the smaller-rank tree a direct child of the root of the higher-rank tree.
- if (BB1G->Rank < BB2G->Rank)
- BB1G->Group = BB2G;
- else {
- BB2G->Group = BB1G;
- // If the ranks are the same, increment the rank of the surviving root by one.
- if (BB1G->Rank == BB2G->Rank)
- BB1G->Rank++;
- }
- return true;
- }
-
- // Given a BB, return the auxiliary information.
- BBInfo &getBBInfo(const BasicBlock *BB) const {
- auto It = BBInfos.find(BB);
- assert(It->second.get() != nullptr);
- return *It->second.get();
- }
-
- // Given a BB, return the auxiliary information if it's available.
- BBInfo *findBBInfo(const BasicBlock *BB) const {
- auto It = BBInfos.find(BB);
- if (It == BBInfos.end())
- return nullptr;
- return It->second.get();
- }
-
- // Traverse the CFG using a stack. Find all the edges and assign weights.
- // Edges with large weights will be put into the MST first so they are less
- // likely to be instrumented.
- void buildEdges() {
- LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
-
- const BasicBlock *Entry = &(F.getEntryBlock());
- uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
- // If we want to instrument the entry count, lower the weight to 0.
+//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Union-find algorithm to compute Minimum Spanning Tree
+// for a given CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "cfgmst"
+
+using namespace llvm;
+
+namespace llvm {
+
+ /// A union-find based Minimum Spanning Tree for CFG
+///
+/// Implements a Union-find algorithm to compute Minimum Spanning Tree
+/// for a given CFG.
+template <class Edge, class BBInfo> class CFGMST {
+public:
+ Function &F;
+
+ // Store all the edges in CFG. It may contain some stale edges
+ // when Removed is set.
+ std::vector<std::unique_ptr<Edge>> AllEdges;
+
+ // This map records the auxiliary information for each BB.
+ DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos;
+
+ // Whether the function has an exit block with no successors.
+ // (For a function with an infinite loop, this block may be absent.)
+ bool ExitBlockFound = false;
+
+ // Find the root group of G and compress the path from G to the root.
+ BBInfo *findAndCompressGroup(BBInfo *G) {
+ if (G->Group != G)
+ G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group));
+ return static_cast<BBInfo *>(G->Group);
+ }
+
+ // Union BB1 and BB2 into the same group and return true.
+ // Returns false if BB1 and BB2 are already in the same group.
+ bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
+ BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
+ BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
+
+ if (BB1G == BB2G)
+ return false;
+
+ // Make the smaller-rank tree a direct child of the root of the higher-rank tree.
+ if (BB1G->Rank < BB2G->Rank)
+ BB1G->Group = BB2G;
+ else {
+ BB2G->Group = BB1G;
+ // If the ranks are the same, increment the rank of the surviving root by one.
+ if (BB1G->Rank == BB2G->Rank)
+ BB1G->Rank++;
+ }
+ return true;
+ }
+
+ // Given a BB, return the auxiliary information.
+ BBInfo &getBBInfo(const BasicBlock *BB) const {
+ auto It = BBInfos.find(BB);
+ assert(It->second.get() != nullptr);
+ return *It->second.get();
+ }
+
+ // Given a BB, return the auxiliary information if it's available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const {
+ auto It = BBInfos.find(BB);
+ if (It == BBInfos.end())
+ return nullptr;
+ return It->second.get();
+ }
+
+ // Traverse the CFG using a stack. Find all the edges and assign weights.
+ // Edges with large weights will be put into the MST first so they are less
+ // likely to be instrumented.
+ void buildEdges() {
+ LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+
+ const BasicBlock *Entry = &(F.getEntryBlock());
+ uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+ // If we want to instrument the entry count, lower the weight to 0.
if (InstrumentFuncEntry)
- EntryWeight = 0;
- Edge *EntryIncoming = nullptr, *EntryOutgoing = nullptr,
- *ExitOutgoing = nullptr, *ExitIncoming = nullptr;
- uint64_t MaxEntryOutWeight = 0, MaxExitOutWeight = 0, MaxExitInWeight = 0;
-
- // Add a fake edge to the entry.
- EntryIncoming = &addEdge(nullptr, Entry, EntryWeight);
- LLVM_DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
- << " w = " << EntryWeight << "\n");
-
- // Special handling for single BB functions.
- if (succ_empty(Entry)) {
- addEdge(Entry, nullptr, EntryWeight);
- return;
- }
-
- static const uint32_t CriticalEdgeMultiplier = 1000;
-
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- Instruction *TI = BB->getTerminator();
- uint64_t BBWeight =
- (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
- uint64_t Weight = 2;
- if (int successors = TI->getNumSuccessors()) {
- for (int i = 0; i != successors; ++i) {
- BasicBlock *TargetBB = TI->getSuccessor(i);
- bool Critical = isCriticalEdge(TI, i);
- uint64_t scaleFactor = BBWeight;
- if (Critical) {
- if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
- scaleFactor *= CriticalEdgeMultiplier;
- else
- scaleFactor = UINT64_MAX;
- }
- if (BPI != nullptr)
- Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
- if (Weight == 0)
- Weight++;
- auto *E = &addEdge(&*BB, TargetBB, Weight);
- E->IsCritical = Critical;
- LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
- << TargetBB->getName() << " w=" << Weight << "\n");
-
- // Keep track of entry/exit edges:
- if (&*BB == Entry) {
- if (Weight > MaxEntryOutWeight) {
- MaxEntryOutWeight = Weight;
- EntryOutgoing = E;
- }
- }
-
- auto *TargetTI = TargetBB->getTerminator();
- if (TargetTI && !TargetTI->getNumSuccessors()) {
- if (Weight > MaxExitInWeight) {
- MaxExitInWeight = Weight;
- ExitIncoming = E;
- }
- }
- }
- } else {
- ExitBlockFound = true;
- Edge *ExitO = &addEdge(&*BB, nullptr, BBWeight);
- if (BBWeight > MaxExitOutWeight) {
- MaxExitOutWeight = BBWeight;
- ExitOutgoing = ExitO;
- }
- LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
- << " w = " << BBWeight << "\n");
- }
- }
-
- // Entry/exit edge adjustment heuristic:
- // prefer instrumenting the entry edge over the exit edge if possible.
- // Those exit edges may never get a chance to execute (for instance when
- // the program is an event-handling loop) before the profile is
- // asynchronously dumped.
- //
- // If EntryIncoming and ExitOutgoing have similar weights, make sure
- // ExitOutgoing is selected as the min-edge. Similarly, if EntryOutgoing
- // and ExitIncoming have similar weights, make sure ExitIncoming becomes
- // the min-edge.
- uint64_t EntryInWeight = EntryWeight;
-
- if (EntryInWeight >= MaxExitOutWeight &&
- EntryInWeight * 2 < MaxExitOutWeight * 3) {
- EntryIncoming->Weight = MaxExitOutWeight;
- ExitOutgoing->Weight = EntryInWeight + 1;
- }
-
- if (MaxEntryOutWeight >= MaxExitInWeight &&
- MaxEntryOutWeight * 2 < MaxExitInWeight * 3) {
- EntryOutgoing->Weight = MaxExitInWeight;
- ExitIncoming->Weight = MaxEntryOutWeight + 1;
- }
- }
-
- // Sort CFG edges by weight.
- void sortEdgesByWeight() {
- llvm::stable_sort(AllEdges, [](const std::unique_ptr<Edge> &Edge1,
- const std::unique_ptr<Edge> &Edge2) {
- return Edge1->Weight > Edge2->Weight;
- });
- }
-
- // Traverse all the edges and compute the Minimum Weight Spanning Tree
- // using the union-find algorithm.
- void computeMinimumSpanningTree() {
- // First, put all critical edges whose destination is a landing pad into
- // the MST. This works around the insufficient support for splitting
- // critical edges when the destination BB is a landing pad.
- for (auto &Ei : AllEdges) {
- if (Ei->Removed)
- continue;
- if (Ei->IsCritical) {
- if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
- if (unionGroups(Ei->SrcBB, Ei->DestBB))
- Ei->InMST = true;
- }
- }
- }
-
- for (auto &Ei : AllEdges) {
- if (Ei->Removed)
- continue;
- // If we detect infinite loops, force
- // instrumenting the entry edge:
- if (!ExitBlockFound && Ei->SrcBB == nullptr)
- continue;
- if (unionGroups(Ei->SrcBB, Ei->DestBB))
- Ei->InMST = true;
- }
- }
-
- // Dump the Debug information about the instrumentation.
- void dumpEdges(raw_ostream &OS, const Twine &Message) const {
- if (!Message.str().empty())
- OS << Message << "\n";
- OS << " Number of Basic Blocks: " << BBInfos.size() << "\n";
- for (auto &BI : BBInfos) {
- const BasicBlock *BB = BI.first;
- OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " "
- << BI.second->infoString() << "\n";
- }
-
- OS << " Number of Edges: " << AllEdges.size()
- << " (*: Instrument, C: CriticalEdge, -: Removed)\n";
- uint32_t Count = 0;
- for (auto &EI : AllEdges)
- OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->"
- << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n";
- }
-
- // Add an edge to AllEdges with weight W.
- Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) {
- uint32_t Index = BBInfos.size();
- auto Iter = BBInfos.end();
- bool Inserted;
- std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr));
- if (Inserted) {
- // Newly inserted, update the real info.
- Iter->second = std::move(std::make_unique<BBInfo>(Index));
- Index++;
- }
- std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr));
- if (Inserted)
- // Newly inserted, update the real info.
- Iter->second = std::move(std::make_unique<BBInfo>(Index));
- AllEdges.emplace_back(new Edge(Src, Dest, W));
- return *AllEdges.back();
- }
-
- BranchProbabilityInfo *BPI;
- BlockFrequencyInfo *BFI;
-
+ EntryWeight = 0;
+ Edge *EntryIncoming = nullptr, *EntryOutgoing = nullptr,
+ *ExitOutgoing = nullptr, *ExitIncoming = nullptr;
+ uint64_t MaxEntryOutWeight = 0, MaxExitOutWeight = 0, MaxExitInWeight = 0;
+
+ // Add a fake edge to the entry.
+ EntryIncoming = &addEdge(nullptr, Entry, EntryWeight);
+ LLVM_DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
+ << " w = " << EntryWeight << "\n");
+
+ // Special handling for single BB functions.
+ if (succ_empty(Entry)) {
+ addEdge(Entry, nullptr, EntryWeight);
+ return;
+ }
+
+ static const uint32_t CriticalEdgeMultiplier = 1000;
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ Instruction *TI = BB->getTerminator();
+ uint64_t BBWeight =
+ (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
+ uint64_t Weight = 2;
+ if (int successors = TI->getNumSuccessors()) {
+ for (int i = 0; i != successors; ++i) {
+ BasicBlock *TargetBB = TI->getSuccessor(i);
+ bool Critical = isCriticalEdge(TI, i);
+ uint64_t scaleFactor = BBWeight;
+ if (Critical) {
+ if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+ scaleFactor *= CriticalEdgeMultiplier;
+ else
+ scaleFactor = UINT64_MAX;
+ }
+ if (BPI != nullptr)
+ Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
+ if (Weight == 0)
+ Weight++;
+ auto *E = &addEdge(&*BB, TargetBB, Weight);
+ E->IsCritical = Critical;
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
+ << TargetBB->getName() << " w=" << Weight << "\n");
+
+ // Keep track of entry/exit edges:
+ if (&*BB == Entry) {
+ if (Weight > MaxEntryOutWeight) {
+ MaxEntryOutWeight = Weight;
+ EntryOutgoing = E;
+ }
+ }
+
+ auto *TargetTI = TargetBB->getTerminator();
+ if (TargetTI && !TargetTI->getNumSuccessors()) {
+ if (Weight > MaxExitInWeight) {
+ MaxExitInWeight = Weight;
+ ExitIncoming = E;
+ }
+ }
+ }
+ } else {
+ ExitBlockFound = true;
+ Edge *ExitO = &addEdge(&*BB, nullptr, BBWeight);
+ if (BBWeight > MaxExitOutWeight) {
+ MaxExitOutWeight = BBWeight;
+ ExitOutgoing = ExitO;
+ }
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
+ << " w = " << BBWeight << "\n");
+ }
+ }
+
+ // Entry/exit edge adjustment heuristic:
+ // prefer instrumenting the entry edge over the exit edge if possible.
+ // Those exit edges may never get a chance to execute (for instance when
+ // the program is an event-handling loop) before the profile is
+ // asynchronously dumped.
+ //
+ // If EntryIncoming and ExitOutgoing have similar weights, make sure
+ // ExitOutgoing is selected as the min-edge. Similarly, if EntryOutgoing
+ // and ExitIncoming have similar weights, make sure ExitIncoming becomes
+ // the min-edge.
+ uint64_t EntryInWeight = EntryWeight;
+
+ if (EntryInWeight >= MaxExitOutWeight &&
+ EntryInWeight * 2 < MaxExitOutWeight * 3) {
+ EntryIncoming->Weight = MaxExitOutWeight;
+ ExitOutgoing->Weight = EntryInWeight + 1;
+ }
+
+ if (MaxEntryOutWeight >= MaxExitInWeight &&
+ MaxEntryOutWeight * 2 < MaxExitInWeight * 3) {
+ EntryOutgoing->Weight = MaxExitInWeight;
+ ExitIncoming->Weight = MaxEntryOutWeight + 1;
+ }
+ }
+
+ // Sort CFG edges by weight.
+ void sortEdgesByWeight() {
+ llvm::stable_sort(AllEdges, [](const std::unique_ptr<Edge> &Edge1,
+ const std::unique_ptr<Edge> &Edge2) {
+ return Edge1->Weight > Edge2->Weight;
+ });
+ }
+
+ // Traverse all the edges and compute the Minimum Weight Spanning Tree
+ // using the union-find algorithm.
+ void computeMinimumSpanningTree() {
+ // First, put all critical edges whose destination is a landing pad into
+ // the MST. This works around the insufficient support for splitting
+ // critical edges when the destination BB is a landing pad.
+ for (auto &Ei : AllEdges) {
+ if (Ei->Removed)
+ continue;
+ if (Ei->IsCritical) {
+ if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
+ if (unionGroups(Ei->SrcBB, Ei->DestBB))
+ Ei->InMST = true;
+ }
+ }
+ }
+
+ for (auto &Ei : AllEdges) {
+ if (Ei->Removed)
+ continue;
+ // If we detect infinite loops, force
+ // instrumenting the entry edge:
+ if (!ExitBlockFound && Ei->SrcBB == nullptr)
+ continue;
+ if (unionGroups(Ei->SrcBB, Ei->DestBB))
+ Ei->InMST = true;
+ }
+ }
+
+ // Dump the Debug information about the instrumentation.
+ void dumpEdges(raw_ostream &OS, const Twine &Message) const {
+ if (!Message.str().empty())
+ OS << Message << "\n";
+ OS << " Number of Basic Blocks: " << BBInfos.size() << "\n";
+ for (auto &BI : BBInfos) {
+ const BasicBlock *BB = BI.first;
+ OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " "
+ << BI.second->infoString() << "\n";
+ }
+
+ OS << " Number of Edges: " << AllEdges.size()
+ << " (*: Instrument, C: CriticalEdge, -: Removed)\n";
+ uint32_t Count = 0;
+ for (auto &EI : AllEdges)
+ OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->"
+ << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n";
+ }
+
+ // Add an edge to AllEdges with weight W.
+ Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) {
+ uint32_t Index = BBInfos.size();
+ auto Iter = BBInfos.end();
+ bool Inserted;
+ std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr));
+ if (Inserted) {
+ // Newly inserted, update the real info.
+ Iter->second = std::move(std::make_unique<BBInfo>(Index));
+ Index++;
+ }
+ std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr));
+ if (Inserted)
+ // Newly inserted, update the real info.
+ Iter->second = std::move(std::make_unique<BBInfo>(Index));
+ AllEdges.emplace_back(new Edge(Src, Dest, W));
+ return *AllEdges.back();
+ }
+
+ BranchProbabilityInfo *BPI;
+ BlockFrequencyInfo *BFI;
+
// If function entry will be always instrumented.
bool InstrumentFuncEntry;
-public:
+public:
CFGMST(Function &Func, bool InstrumentFuncEntry_,
BranchProbabilityInfo *BPI_ = nullptr,
- BlockFrequencyInfo *BFI_ = nullptr)
+ BlockFrequencyInfo *BFI_ = nullptr)
: F(Func), BPI(BPI_), BFI(BFI_),
InstrumentFuncEntry(InstrumentFuncEntry_) {
- buildEdges();
- sortEdgesByWeight();
- computeMinimumSpanningTree();
+ buildEdges();
+ sortEdgesByWeight();
+ computeMinimumSpanningTree();
if (AllEdges.size() > 1 && InstrumentFuncEntry)
- std::iter_swap(std::move(AllEdges.begin()),
- std::move(AllEdges.begin() + AllEdges.size() - 1));
- }
-};
-
-} // end namespace llvm
-
-#undef DEBUG_TYPE // "cfgmst"
-
-#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+ std::iter_swap(std::move(AllEdges.begin()),
+ std::move(AllEdges.begin() + AllEdges.size() - 1));
+ }
+};
+
+} // end namespace llvm
+
+#undef DEBUG_TYPE // "cfgmst"
+
+#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
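findAndCompressGroup and unionGroups above are a textbook union-find with path compression and union by rank, stored in the per-block BBInfo records, and computeMinimumSpanningTree uses it Kruskal-style: an edge joins the spanning tree only when it connects two previously separate groups. A minimal standalone sketch under those assumptions (illustrative names, not the CFGMST types):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

struct UnionFind {
  std::vector<uint32_t> Parent, Rank;
  explicit UnionFind(uint32_t N) : Parent(N), Rank(N, 0) {
    std::iota(Parent.begin(), Parent.end(), 0u); // each node starts as its own root
  }
  // Find the root of X and compress the path from X to the root.
  uint32_t find(uint32_t X) {
    if (Parent[X] != X)
      Parent[X] = find(Parent[X]);
    return Parent[X];
  }
  // Union the groups of A and B; returns false if they are already one group.
  bool unite(uint32_t A, uint32_t B) {
    uint32_t RA = find(A), RB = find(B);
    if (RA == RB)
      return false;
    if (Rank[RA] < Rank[RB])
      std::swap(RA, RB); // keep the higher-rank root as the surviving root
    Parent[RB] = RA;
    if (Rank[RA] == Rank[RB])
      ++Rank[RA];        // equal ranks: the surviving root's rank grows by one
    return true;
  }
};

int main() {
  UnionFind UF(4);
  // Visiting edges in decreasing weight order (as sortEdgesByWeight does)
  // keeps heavy edges inside the tree, so the edges left outside the tree --
  // the ones that get instrumented -- are the cheap ones.
  std::cout << UF.unite(0, 1) << UF.unite(1, 2) << UF.unite(0, 2) << '\n'; // prints 110
}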
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp
index 7f658fa68f..9acd82c005 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -1,153 +1,153 @@
-//===-- CGProfile.cpp -----------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/CGProfile.h"
-
-#include "llvm/ADT/MapVector.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Transforms/Instrumentation.h"
-
-#include <array>
-
-using namespace llvm;
-
-static bool
-addModuleFlags(Module &M,
- MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) {
- if (Counts.empty())
- return false;
-
- LLVMContext &Context = M.getContext();
- MDBuilder MDB(Context);
- std::vector<Metadata *> Nodes;
-
- for (auto E : Counts) {
- Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
- ValueAsMetadata::get(E.first.second),
- MDB.createConstant(ConstantInt::get(
- Type::getInt64Ty(Context), E.second))};
- Nodes.push_back(MDNode::get(Context, Vals));
- }
-
- M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
- return true;
-}
-
-static bool runCGProfilePass(
- Module &M, function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LazyBFI) {
- MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
- InstrProfSymtab Symtab;
- auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
- Function *CalledF, uint64_t NewCount) {
+//===-- CGProfile.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Transforms/Instrumentation.h"
+
+#include <array>
+
+using namespace llvm;
+
+static bool
+addModuleFlags(Module &M,
+ MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) {
+ if (Counts.empty())
+ return false;
+
+ LLVMContext &Context = M.getContext();
+ MDBuilder MDB(Context);
+ std::vector<Metadata *> Nodes;
+
+ for (auto E : Counts) {
+ Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
+ ValueAsMetadata::get(E.first.second),
+ MDB.createConstant(ConstantInt::get(
+ Type::getInt64Ty(Context), E.second))};
+ Nodes.push_back(MDNode::get(Context, Vals));
+ }
+
+ M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
+ return true;
+}
+
+static bool runCGProfilePass(
+ Module &M, function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LazyBFI) {
+ MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
+ InstrProfSymtab Symtab;
+ auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
+ Function *CalledF, uint64_t NewCount) {
if (!CalledF || !TTI.isLoweredToCall(CalledF) ||
CalledF->hasDLLImportStorageClass())
- return;
- uint64_t &Count = Counts[std::make_pair(F, CalledF)];
- Count = SaturatingAdd(Count, NewCount);
- };
- // Ignore error here. Indirect calls are ignored if this fails.
- (void)(bool) Symtab.create(M);
- for (auto &F : M) {
- // Avoid the extra cost of running passes for BFI when the function doesn't
- // have an entry count. Since LazyBlockFrequencyInfoPass only exists in LPM, check
- // if using LazyBlockFrequencyInfoPass.
- // TODO: Remove LazyBFI when LazyBlockFrequencyInfoPass is available in NPM.
- if (F.isDeclaration() || (LazyBFI && !F.getEntryCount()))
- continue;
- auto &BFI = GetBFI(F);
- if (BFI.getEntryFreq() == 0)
- continue;
- TargetTransformInfo &TTI = GetTTI(F);
- for (auto &BB : F) {
- Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB);
- if (!BBCount)
- continue;
- for (auto &I : BB) {
- CallBase *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- continue;
- if (CB->isIndirectCall()) {
- InstrProfValueData ValueData[8];
- uint32_t ActualNumValueData;
- uint64_t TotalC;
- if (!getValueProfDataFromInst(*CB, IPVK_IndirectCallTarget, 8,
- ValueData, ActualNumValueData, TotalC))
- continue;
- for (const auto &VD :
- ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) {
- UpdateCounts(TTI, &F, Symtab.getFunction(VD.Value), VD.Count);
- }
- continue;
- }
- UpdateCounts(TTI, &F, CB->getCalledFunction(), *BBCount);
- }
- }
- }
-
- return addModuleFlags(M, Counts);
-}
-
-namespace {
-struct CGProfileLegacyPass final : public ModulePass {
- static char ID;
- CGProfileLegacyPass() : ModulePass(ID) {
- initializeCGProfileLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<LazyBlockFrequencyInfoPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
- return this->getAnalysis<LazyBlockFrequencyInfoPass>(F).getBFI();
- };
- auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
- return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- };
-
- return runCGProfilePass(M, GetBFI, GetTTI, true);
- }
-};
-
-} // namespace
-
-char CGProfileLegacyPass::ID = 0;
-
-INITIALIZE_PASS(CGProfileLegacyPass, "cg-profile", "Call Graph Profile", false,
- false)
-
-ModulePass *llvm::createCGProfileLegacyPass() {
- return new CGProfileLegacyPass();
-}
-
-PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
- FunctionAnalysisManager &FAM =
- MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
- auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- runCGProfilePass(M, GetBFI, GetTTI, false);
-
- return PreservedAnalyses::all();
-}
+ return;
+ uint64_t &Count = Counts[std::make_pair(F, CalledF)];
+ Count = SaturatingAdd(Count, NewCount);
+ };
+ // Ignore error here. Indirect calls are ignored if this fails.
+ (void)(bool) Symtab.create(M);
+ for (auto &F : M) {
+ // Avoid the extra cost of running passes for BFI when the function doesn't
+ // have an entry count. Since LazyBlockFrequencyInfoPass only exists in LPM, check
+ // if using LazyBlockFrequencyInfoPass.
+ // TODO: Remove LazyBFI when LazyBlockFrequencyInfoPass is available in NPM.
+ if (F.isDeclaration() || (LazyBFI && !F.getEntryCount()))
+ continue;
+ auto &BFI = GetBFI(F);
+ if (BFI.getEntryFreq() == 0)
+ continue;
+ TargetTransformInfo &TTI = GetTTI(F);
+ for (auto &BB : F) {
+ Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB);
+ if (!BBCount)
+ continue;
+ for (auto &I : BB) {
+ CallBase *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (CB->isIndirectCall()) {
+ InstrProfValueData ValueData[8];
+ uint32_t ActualNumValueData;
+ uint64_t TotalC;
+ if (!getValueProfDataFromInst(*CB, IPVK_IndirectCallTarget, 8,
+ ValueData, ActualNumValueData, TotalC))
+ continue;
+ for (const auto &VD :
+ ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) {
+ UpdateCounts(TTI, &F, Symtab.getFunction(VD.Value), VD.Count);
+ }
+ continue;
+ }
+ UpdateCounts(TTI, &F, CB->getCalledFunction(), *BBCount);
+ }
+ }
+ }
+
+ return addModuleFlags(M, Counts);
+}
+
+namespace {
+struct CGProfileLegacyPass final : public ModulePass {
+ static char ID;
+ CGProfileLegacyPass() : ModulePass(ID) {
+ initializeCGProfileLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LazyBlockFrequencyInfoPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
+ return this->getAnalysis<LazyBlockFrequencyInfoPass>(F).getBFI();
+ };
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ return runCGProfilePass(M, GetBFI, GetTTI, true);
+ }
+};
+
+} // namespace
+
+char CGProfileLegacyPass::ID = 0;
+
+INITIALIZE_PASS(CGProfileLegacyPass, "cg-profile", "Call Graph Profile", false,
+ false)
+
+ModulePass *llvm::createCGProfileLegacyPass() {
+ return new CGProfileLegacyPass();
+}
+
+PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ runCGProfilePass(M, GetBFI, GetTTI, false);
+
+ return PreservedAnalyses::all();
+}
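The core of runCGProfilePass above is accumulating a (caller, callee) -> estimated call count map with a saturating add and then serializing it as "CG Profile" module metadata. A standalone sketch of just that accumulation step, with made-up key types and helper names (the real pass keys on Function pointers and uses llvm::SaturatingAdd):

#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <string>
#include <utility>

using EdgeKey = std::pair<std::string, std::string>; // (caller, callee)

// Add without wrapping past UINT64_MAX, in the spirit of llvm::SaturatingAdd.
static uint64_t saturatingAdd(uint64_t A, uint64_t B) {
  return A > std::numeric_limits<uint64_t>::max() - B
             ? std::numeric_limits<uint64_t>::max()
             : A + B;
}

int main() {
  std::map<EdgeKey, uint64_t> Counts;
  auto Update = [&](const std::string &Caller, const std::string &Callee,
                    uint64_t NewCount) {
    uint64_t &Count = Counts[{Caller, Callee}];
    Count = saturatingAdd(Count, NewCount);
  };

  // Direct call sites contribute their block's profile count; indirect call
  // sites contribute one entry per value-profiled target (up to 8 above).
  Update("main", "foo", 100);
  Update("main", "foo", 50);
  Update("main", "bar", std::numeric_limits<uint64_t>::max());
  Update("main", "bar", 1); // saturates instead of wrapping to zero

  for (const auto &E : Counts)
    std::cout << E.first.first << " -> " << E.first.second << ": "
              << E.second << '\n';
}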
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 6fdeb88658..927c34180d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -1,2103 +1,2103 @@
-//===-- ControlHeightReduction.cpp - Control Height Reduction -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass merges conditional blocks of code and reduces the number of
-// conditional branches in the hot paths based on profiles.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/RegionInfo.h"
-#include "llvm/Analysis/RegionIterator.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-
-#include <set>
-#include <sstream>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "chr"
-
-#define CHR_DEBUG(X) LLVM_DEBUG(X)
-
-static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden,
- cl::desc("Apply CHR for all functions"));
-
-static cl::opt<double> CHRBiasThreshold(
- "chr-bias-threshold", cl::init(0.99), cl::Hidden,
- cl::desc("CHR considers a branch bias greater than this ratio as biased"));
-
-static cl::opt<unsigned> CHRMergeThreshold(
- "chr-merge-threshold", cl::init(2), cl::Hidden,
- cl::desc("CHR merges a group of N branches/selects where N >= this value"));
-
-static cl::opt<std::string> CHRModuleList(
- "chr-module-list", cl::init(""), cl::Hidden,
- cl::desc("Specify file to retrieve the list of modules to apply CHR to"));
-
-static cl::opt<std::string> CHRFunctionList(
- "chr-function-list", cl::init(""), cl::Hidden,
- cl::desc("Specify file to retrieve the list of functions to apply CHR to"));
-
-static StringSet<> CHRModules;
-static StringSet<> CHRFunctions;
-
-static void parseCHRFilterFiles() {
- if (!CHRModuleList.empty()) {
- auto FileOrErr = MemoryBuffer::getFile(CHRModuleList);
- if (!FileOrErr) {
- errs() << "Error: Couldn't read the chr-module-list file " << CHRModuleList << "\n";
- std::exit(1);
- }
- StringRef Buf = FileOrErr->get()->getBuffer();
- SmallVector<StringRef, 0> Lines;
- Buf.split(Lines, '\n');
- for (StringRef Line : Lines) {
- Line = Line.trim();
- if (!Line.empty())
- CHRModules.insert(Line);
- }
- }
- if (!CHRFunctionList.empty()) {
- auto FileOrErr = MemoryBuffer::getFile(CHRFunctionList);
- if (!FileOrErr) {
- errs() << "Error: Couldn't read the chr-function-list file " << CHRFunctionList << "\n";
- std::exit(1);
- }
- StringRef Buf = FileOrErr->get()->getBuffer();
- SmallVector<StringRef, 0> Lines;
- Buf.split(Lines, '\n');
- for (StringRef Line : Lines) {
- Line = Line.trim();
- if (!Line.empty())
- CHRFunctions.insert(Line);
- }
- }
-}
-
-namespace {
-class ControlHeightReductionLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- ControlHeightReductionLegacyPass() : FunctionPass(ID) {
- initializeControlHeightReductionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- parseCHRFilterFiles();
- }
-
- bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<RegionInfoPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-} // end anonymous namespace
-
-char ControlHeightReductionLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ControlHeightReductionLegacyPass,
- "chr",
- "Reduce control height in the hot paths",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
-INITIALIZE_PASS_END(ControlHeightReductionLegacyPass,
- "chr",
- "Reduce control height in the hot paths",
- false, false)
-
-FunctionPass *llvm::createControlHeightReductionLegacyPass() {
- return new ControlHeightReductionLegacyPass();
-}
-
-namespace {
-
-struct CHRStats {
- CHRStats() : NumBranches(0), NumBranchesDelta(0),
- WeightedNumBranchesDelta(0) {}
- void print(raw_ostream &OS) const {
- OS << "CHRStats: NumBranches " << NumBranches
- << " NumBranchesDelta " << NumBranchesDelta
- << " WeightedNumBranchesDelta " << WeightedNumBranchesDelta;
- }
- uint64_t NumBranches; // The original number of conditional branches /
- // selects
- uint64_t NumBranchesDelta; // The decrease of the number of conditional
- // branches / selects in the hot paths due to CHR.
- uint64_t WeightedNumBranchesDelta; // NumBranchesDelta weighted by the profile
- // count at the scope entry.
-};
-
-// RegInfo - some properties of a Region.
-struct RegInfo {
- RegInfo() : R(nullptr), HasBranch(false) {}
- RegInfo(Region *RegionIn) : R(RegionIn), HasBranch(false) {}
- Region *R;
- bool HasBranch;
- SmallVector<SelectInst *, 8> Selects;
-};
-
-typedef DenseMap<Region *, DenseSet<Instruction *>> HoistStopMapTy;
-
-// CHRScope - a sequence of regions to CHR together. It corresponds to a
-// sequence of conditional blocks. It can have subscopes which correspond to
-// nested conditional blocks. Nested CHRScopes form a tree.
-class CHRScope {
- public:
- CHRScope(RegInfo RI) : BranchInsertPoint(nullptr) {
- assert(RI.R && "Null RegionIn");
- RegInfos.push_back(RI);
- }
-
- Region *getParentRegion() {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- Region *Parent = RegInfos[0].R->getParent();
- assert(Parent && "Unexpected to call this on the top-level region");
- return Parent;
- }
-
- BasicBlock *getEntryBlock() {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- return RegInfos.front().R->getEntry();
- }
-
- BasicBlock *getExitBlock() {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- return RegInfos.back().R->getExit();
- }
-
- bool appendable(CHRScope *Next) {
- // The next scope is appendable only if this scope is directly connected to
- // it (which implies it post-dominates this scope) and this scope dominates
- // it (no edge to the next scope outside this scope).
- BasicBlock *NextEntry = Next->getEntryBlock();
- if (getExitBlock() != NextEntry)
- // Not directly connected.
- return false;
- Region *LastRegion = RegInfos.back().R;
- for (BasicBlock *Pred : predecessors(NextEntry))
- if (!LastRegion->contains(Pred))
- // There's an edge going into the entry of the next scope from outside
- // of this scope.
- return false;
- return true;
- }
-
- void append(CHRScope *Next) {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- assert(Next->RegInfos.size() > 0 && "Empty CHRScope");
- assert(getParentRegion() == Next->getParentRegion() &&
- "Must be siblings");
- assert(getExitBlock() == Next->getEntryBlock() &&
- "Must be adjacent");
- RegInfos.append(Next->RegInfos.begin(), Next->RegInfos.end());
- Subs.append(Next->Subs.begin(), Next->Subs.end());
- }
-
- void addSub(CHRScope *SubIn) {
-#ifndef NDEBUG
- bool IsChild = false;
- for (RegInfo &RI : RegInfos)
- if (RI.R == SubIn->getParentRegion()) {
- IsChild = true;
- break;
- }
- assert(IsChild && "Must be a child");
-#endif
- Subs.push_back(SubIn);
- }
-
- // Split this scope at the boundary region into two scopes; the regions from
- // the boundary onward form the tail, which is returned.
- CHRScope *split(Region *Boundary) {
- assert(Boundary && "Boundary null");
- assert(RegInfos.begin()->R != Boundary &&
- "Can't be split at beginning");
- auto BoundaryIt = llvm::find_if(
- RegInfos, [&Boundary](const RegInfo &RI) { return Boundary == RI.R; });
- if (BoundaryIt == RegInfos.end())
- return nullptr;
- ArrayRef<RegInfo> TailRegInfos(BoundaryIt, RegInfos.end());
- DenseSet<Region *> TailRegionSet;
- for (const RegInfo &RI : TailRegInfos)
- TailRegionSet.insert(RI.R);
-
- auto TailIt =
- std::stable_partition(Subs.begin(), Subs.end(), [&](CHRScope *Sub) {
- assert(Sub && "null Sub");
- Region *Parent = Sub->getParentRegion();
- if (TailRegionSet.count(Parent))
- return false;
-
+//===-- ControlHeightReduction.cpp - Control Height Reduction -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass merges conditional blocks of code and reduces the number of
+// conditional branches in the hot paths based on profiles.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#include <set>
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "chr"
+
+#define CHR_DEBUG(X) LLVM_DEBUG(X)
+
+static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden,
+ cl::desc("Apply CHR for all functions"));
+
+static cl::opt<double> CHRBiasThreshold(
+ "chr-bias-threshold", cl::init(0.99), cl::Hidden,
+ cl::desc("CHR considers a branch bias greater than this ratio as biased"));
+
+static cl::opt<unsigned> CHRMergeThreshold(
+ "chr-merge-threshold", cl::init(2), cl::Hidden,
+ cl::desc("CHR merges a group of N branches/selects where N >= this value"));
+
+static cl::opt<std::string> CHRModuleList(
+ "chr-module-list", cl::init(""), cl::Hidden,
+ cl::desc("Specify file to retrieve the list of modules to apply CHR to"));
+
+static cl::opt<std::string> CHRFunctionList(
+ "chr-function-list", cl::init(""), cl::Hidden,
+ cl::desc("Specify file to retrieve the list of functions to apply CHR to"));
+
+static StringSet<> CHRModules;
+static StringSet<> CHRFunctions;
+
+static void parseCHRFilterFiles() {
+ if (!CHRModuleList.empty()) {
+ auto FileOrErr = MemoryBuffer::getFile(CHRModuleList);
+ if (!FileOrErr) {
+ errs() << "Error: Couldn't read the chr-module-list file " << CHRModuleList << "\n";
+ std::exit(1);
+ }
+ StringRef Buf = FileOrErr->get()->getBuffer();
+ SmallVector<StringRef, 0> Lines;
+ Buf.split(Lines, '\n');
+ for (StringRef Line : Lines) {
+ Line = Line.trim();
+ if (!Line.empty())
+ CHRModules.insert(Line);
+ }
+ }
+ if (!CHRFunctionList.empty()) {
+ auto FileOrErr = MemoryBuffer::getFile(CHRFunctionList);
+ if (!FileOrErr) {
+ errs() << "Error: Couldn't read the chr-function-list file " << CHRFunctionList << "\n";
+ std::exit(1);
+ }
+ StringRef Buf = FileOrErr->get()->getBuffer();
+ SmallVector<StringRef, 0> Lines;
+ Buf.split(Lines, '\n');
+ for (StringRef Line : Lines) {
+ Line = Line.trim();
+ if (!Line.empty())
+ CHRFunctions.insert(Line);
+ }
+ }
+}
+
+namespace {
+class ControlHeightReductionLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ ControlHeightReductionLegacyPass() : FunctionPass(ID) {
+ initializeControlHeightReductionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ parseCHRFilterFiles();
+ }
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<RegionInfoPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char ControlHeightReductionLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ControlHeightReductionLegacyPass,
+ "chr",
+ "Reduce control height in the hot paths",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_END(ControlHeightReductionLegacyPass,
+ "chr",
+ "Reduce control height in the hot paths",
+ false, false)
+
+FunctionPass *llvm::createControlHeightReductionLegacyPass() {
+ return new ControlHeightReductionLegacyPass();
+}
+
+namespace {
+
+struct CHRStats {
+ CHRStats() : NumBranches(0), NumBranchesDelta(0),
+ WeightedNumBranchesDelta(0) {}
+ void print(raw_ostream &OS) const {
+ OS << "CHRStats: NumBranches " << NumBranches
+ << " NumBranchesDelta " << NumBranchesDelta
+ << " WeightedNumBranchesDelta " << WeightedNumBranchesDelta;
+ }
+ uint64_t NumBranches; // The original number of conditional branches /
+ // selects
+ uint64_t NumBranchesDelta; // The decrease of the number of conditional
+ // branches / selects in the hot paths due to CHR.
+ uint64_t WeightedNumBranchesDelta; // NumBranchesDelta weighted by the profile
+ // count at the scope entry.
+};
+
+// RegInfo - some properties of a Region.
+struct RegInfo {
+ RegInfo() : R(nullptr), HasBranch(false) {}
+ RegInfo(Region *RegionIn) : R(RegionIn), HasBranch(false) {}
+ Region *R;
+ bool HasBranch;
+ SmallVector<SelectInst *, 8> Selects;
+};
+
+typedef DenseMap<Region *, DenseSet<Instruction *>> HoistStopMapTy;
+
+// CHRScope - a sequence of regions to CHR together. It corresponds to a
+// sequence of conditional blocks. It can have subscopes which correspond to
+// nested conditional blocks. Nested CHRScopes form a tree.
+class CHRScope {
+ public:
+ CHRScope(RegInfo RI) : BranchInsertPoint(nullptr) {
+ assert(RI.R && "Null RegionIn");
+ RegInfos.push_back(RI);
+ }
+
+ Region *getParentRegion() {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ Region *Parent = RegInfos[0].R->getParent();
+ assert(Parent && "Unexpected to call this on the top-level region");
+ return Parent;
+ }
+
+ BasicBlock *getEntryBlock() {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ return RegInfos.front().R->getEntry();
+ }
+
+ BasicBlock *getExitBlock() {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ return RegInfos.back().R->getExit();
+ }
+
+ bool appendable(CHRScope *Next) {
+ // The next scope is appendable only if this scope is directly connected to
+ // it (which implies it post-dominates this scope) and this scope dominates
+    // it (no edge into the next scope from outside this scope).
+ BasicBlock *NextEntry = Next->getEntryBlock();
+ if (getExitBlock() != NextEntry)
+ // Not directly connected.
+ return false;
+ Region *LastRegion = RegInfos.back().R;
+ for (BasicBlock *Pred : predecessors(NextEntry))
+ if (!LastRegion->contains(Pred))
+ // There's an edge going into the entry of the next scope from outside
+ // of this scope.
+ return false;
+ return true;
+ }
+
+ void append(CHRScope *Next) {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ assert(Next->RegInfos.size() > 0 && "Empty CHRScope");
+ assert(getParentRegion() == Next->getParentRegion() &&
+ "Must be siblings");
+ assert(getExitBlock() == Next->getEntryBlock() &&
+ "Must be adjacent");
+ RegInfos.append(Next->RegInfos.begin(), Next->RegInfos.end());
+ Subs.append(Next->Subs.begin(), Next->Subs.end());
+ }
+
+ void addSub(CHRScope *SubIn) {
+#ifndef NDEBUG
+ bool IsChild = false;
+ for (RegInfo &RI : RegInfos)
+ if (RI.R == SubIn->getParentRegion()) {
+ IsChild = true;
+ break;
+ }
+ assert(IsChild && "Must be a child");
+#endif
+ Subs.push_back(SubIn);
+ }
+
+  // Split this scope at the boundary region into a head and a tail; the
+  // boundary region and everything after it form the tail, which is returned.
+ CHRScope *split(Region *Boundary) {
+ assert(Boundary && "Boundary null");
+ assert(RegInfos.begin()->R != Boundary &&
+ "Can't be split at beginning");
+ auto BoundaryIt = llvm::find_if(
+ RegInfos, [&Boundary](const RegInfo &RI) { return Boundary == RI.R; });
+ if (BoundaryIt == RegInfos.end())
+ return nullptr;
+ ArrayRef<RegInfo> TailRegInfos(BoundaryIt, RegInfos.end());
+ DenseSet<Region *> TailRegionSet;
+ for (const RegInfo &RI : TailRegInfos)
+ TailRegionSet.insert(RI.R);
+
+ auto TailIt =
+ std::stable_partition(Subs.begin(), Subs.end(), [&](CHRScope *Sub) {
+ assert(Sub && "null Sub");
+ Region *Parent = Sub->getParentRegion();
+ if (TailRegionSet.count(Parent))
+ return false;
+
assert(llvm::any_of(
RegInfos,
[&Parent](const RegInfo &RI) { return Parent == RI.R; }) &&
- "Must be in head");
- return true;
- });
- ArrayRef<CHRScope *> TailSubs(TailIt, Subs.end());
-
- assert(HoistStopMap.empty() && "MapHoistStops must be empty");
- auto *Scope = new CHRScope(TailRegInfos, TailSubs);
- RegInfos.erase(BoundaryIt, RegInfos.end());
- Subs.erase(TailIt, Subs.end());
- return Scope;
- }
-
- bool contains(Instruction *I) const {
- BasicBlock *Parent = I->getParent();
- for (const RegInfo &RI : RegInfos)
- if (RI.R->contains(Parent))
- return true;
- return false;
- }
-
- void print(raw_ostream &OS) const;
-
- SmallVector<RegInfo, 8> RegInfos; // Regions that belong to this scope
- SmallVector<CHRScope *, 8> Subs; // Subscopes.
-
- // The instruction at which to insert the CHR conditional branch (and hoist
- // the dependent condition values).
- Instruction *BranchInsertPoint;
-
- // True-biased and false-biased regions (conditional blocks),
- // respectively. Used only for the outermost scope and includes regions in
- // subscopes. The rest are unbiased.
- DenseSet<Region *> TrueBiasedRegions;
- DenseSet<Region *> FalseBiasedRegions;
- // Among the biased regions, the regions that get CHRed.
- SmallVector<RegInfo, 8> CHRRegions;
-
- // True-biased and false-biased selects, respectively. Used only for the
- // outermost scope and includes ones in subscopes.
- DenseSet<SelectInst *> TrueBiasedSelects;
- DenseSet<SelectInst *> FalseBiasedSelects;
-
- // Map from one of the above regions to the instructions to stop
- // hoisting instructions at through use-def chains.
- HoistStopMapTy HoistStopMap;
-
- private:
- CHRScope(ArrayRef<RegInfo> RegInfosIn, ArrayRef<CHRScope *> SubsIn)
- : RegInfos(RegInfosIn.begin(), RegInfosIn.end()),
- Subs(SubsIn.begin(), SubsIn.end()), BranchInsertPoint(nullptr) {}
-};
-
-class CHR {
- public:
- CHR(Function &Fin, BlockFrequencyInfo &BFIin, DominatorTree &DTin,
- ProfileSummaryInfo &PSIin, RegionInfo &RIin,
- OptimizationRemarkEmitter &OREin)
- : F(Fin), BFI(BFIin), DT(DTin), PSI(PSIin), RI(RIin), ORE(OREin) {}
-
- ~CHR() {
- for (CHRScope *Scope : Scopes) {
- delete Scope;
- }
- }
-
- bool run();
-
- private:
- // See the comments in CHR::run() for the high level flow of the algorithm and
- // what the following functions do.
-
- void findScopes(SmallVectorImpl<CHRScope *> &Output) {
- Region *R = RI.getTopLevelRegion();
- if (CHRScope *Scope = findScopes(R, nullptr, nullptr, Output)) {
- Output.push_back(Scope);
- }
- }
- CHRScope *findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
- SmallVectorImpl<CHRScope *> &Scopes);
- CHRScope *findScope(Region *R);
- void checkScopeHoistable(CHRScope *Scope);
-
- void splitScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
- SmallVector<CHRScope *, 8> splitScope(CHRScope *Scope,
- CHRScope *Outer,
- DenseSet<Value *> *OuterConditionValues,
- Instruction *OuterInsertPoint,
- SmallVectorImpl<CHRScope *> &Output,
- DenseSet<Instruction *> &Unhoistables);
-
- void classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes);
- void classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope);
-
- void filterScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
-
- void setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
- void setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope);
-
- void sortScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
-
- void transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes);
- void transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs);
- void cloneScopeBlocks(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BasicBlock *ExitBlock,
- Region *LastRegion,
- ValueToValueMapTy &VMap);
- BranchInst *createMergedBranch(BasicBlock *PreEntryBlock,
- BasicBlock *EntryBlock,
- BasicBlock *NewEntryBlock,
- ValueToValueMapTy &VMap);
- void fixupBranchesAndSelects(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BranchInst *MergedBR,
- uint64_t ProfileCount);
- void fixupBranch(Region *R,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition, BranchProbability &CHRBranchBias);
- void fixupSelect(SelectInst* SI,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition, BranchProbability &CHRBranchBias);
- void addToMergedCondition(bool IsTrueBiased, Value *Cond,
- Instruction *BranchOrSelect,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition);
-
- Function &F;
- BlockFrequencyInfo &BFI;
- DominatorTree &DT;
- ProfileSummaryInfo &PSI;
- RegionInfo &RI;
- OptimizationRemarkEmitter &ORE;
- CHRStats Stats;
-
- // All the true-biased regions in the function
- DenseSet<Region *> TrueBiasedRegionsGlobal;
- // All the false-biased regions in the function
- DenseSet<Region *> FalseBiasedRegionsGlobal;
- // All the true-biased selects in the function
- DenseSet<SelectInst *> TrueBiasedSelectsGlobal;
- // All the false-biased selects in the function
- DenseSet<SelectInst *> FalseBiasedSelectsGlobal;
- // A map from biased regions to their branch bias
- DenseMap<Region *, BranchProbability> BranchBiasMap;
- // A map from biased selects to their branch bias
- DenseMap<SelectInst *, BranchProbability> SelectBiasMap;
- // All the scopes.
- DenseSet<CHRScope *> Scopes;
-};
-
-} // end anonymous namespace
-
-static inline
-raw_ostream LLVM_ATTRIBUTE_UNUSED &operator<<(raw_ostream &OS,
- const CHRStats &Stats) {
- Stats.print(OS);
- return OS;
-}
-
-static inline
-raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) {
- Scope.print(OS);
- return OS;
-}
-
-static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) {
- if (ForceCHR)
- return true;
-
- if (!CHRModuleList.empty() || !CHRFunctionList.empty()) {
- if (CHRModules.count(F.getParent()->getName()))
- return true;
- return CHRFunctions.count(F.getName());
- }
-
- assert(PSI.hasProfileSummary() && "Empty PSI?");
- return PSI.isFunctionEntryHot(&F);
-}
-
-static void LLVM_ATTRIBUTE_UNUSED dumpIR(Function &F, const char *Label,
- CHRStats *Stats) {
- StringRef FuncName = F.getName();
- StringRef ModuleName = F.getParent()->getName();
- (void)(FuncName); // Unused in release build.
- (void)(ModuleName); // Unused in release build.
- CHR_DEBUG(dbgs() << "CHR IR dump " << Label << " " << ModuleName << " "
- << FuncName);
- if (Stats)
- CHR_DEBUG(dbgs() << " " << *Stats);
- CHR_DEBUG(dbgs() << "\n");
- CHR_DEBUG(F.dump());
-}
-
-void CHRScope::print(raw_ostream &OS) const {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- OS << "CHRScope[";
- OS << RegInfos.size() << ", Regions[";
- for (const RegInfo &RI : RegInfos) {
- OS << RI.R->getNameStr();
- if (RI.HasBranch)
- OS << " B";
- if (RI.Selects.size() > 0)
- OS << " S" << RI.Selects.size();
- OS << ", ";
- }
- if (RegInfos[0].R->getParent()) {
- OS << "], Parent " << RegInfos[0].R->getParent()->getNameStr();
- } else {
- // top level region
- OS << "]";
- }
- OS << ", Subs[";
- for (CHRScope *Sub : Subs) {
- OS << *Sub << ", ";
- }
- OS << "]]";
-}
-
-// Return true if the given instruction type can be hoisted by CHR.
-static bool isHoistableInstructionType(Instruction *I) {
- return isa<BinaryOperator>(I) || isa<CastInst>(I) || isa<SelectInst>(I) ||
- isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
- isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
- isa<InsertValueInst>(I);
-}
-
-// Return true if the given instruction can be hoisted by CHR.
-static bool isHoistable(Instruction *I, DominatorTree &DT) {
- if (!isHoistableInstructionType(I))
- return false;
- return isSafeToSpeculativelyExecute(I, nullptr, &DT);
-}
-
-// Recursively traverse the use-def chains of the given value and return a set
-// of the unhoistable base values defined within the scope (excluding the
-// first-region entry block) or the (hoistable or unhoistable) base values that
-// are defined outside (including the first-region entry block) of the
-// scope. The returned set doesn't include constants.
-static const std::set<Value *> &
-getBaseValues(Value *V, DominatorTree &DT,
- DenseMap<Value *, std::set<Value *>> &Visited) {
- auto It = Visited.find(V);
- if (It != Visited.end()) {
- return It->second;
- }
- std::set<Value *> Result;
- if (auto *I = dyn_cast<Instruction>(V)) {
- // We don't stop at a block that's not in the Scope because we would miss
- // some instructions that are based on the same base values if we stop
- // there.
- if (!isHoistable(I, DT)) {
- Result.insert(I);
- return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
- }
- // I is hoistable above the Scope.
- for (Value *Op : I->operands()) {
- const std::set<Value *> &OpResult = getBaseValues(Op, DT, Visited);
- Result.insert(OpResult.begin(), OpResult.end());
- }
- return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
- }
- if (isa<Argument>(V)) {
- Result.insert(V);
- }
-  // We don't include other values, such as constants, because they won't lead
-  // to any chance of folding conditions (e.g. two bit checks merged into one
-  // check) after CHR.
- return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
-}
-
-// Return true if V is already hoisted or can be hoisted (along with its
-// operands) above the insert point. When it returns true and HoistStops is
-// non-null, the instructions to stop hoisting at through the use-def chains are
-// inserted into HoistStops.
-static bool
-checkHoistValue(Value *V, Instruction *InsertPoint, DominatorTree &DT,
- DenseSet<Instruction *> &Unhoistables,
- DenseSet<Instruction *> *HoistStops,
- DenseMap<Instruction *, bool> &Visited) {
- assert(InsertPoint && "Null InsertPoint");
- if (auto *I = dyn_cast<Instruction>(V)) {
- auto It = Visited.find(I);
- if (It != Visited.end()) {
- return It->second;
- }
- assert(DT.getNode(I->getParent()) && "DT must contain I's parent block");
- assert(DT.getNode(InsertPoint->getParent()) && "DT must contain Destination");
- if (Unhoistables.count(I)) {
- // Don't hoist if they are not to be hoisted.
- Visited[I] = false;
- return false;
- }
- if (DT.dominates(I, InsertPoint)) {
- // We are already above the insert point. Stop here.
- if (HoistStops)
- HoistStops->insert(I);
- Visited[I] = true;
- return true;
- }
-    // We aren't above the insert point yet; check if we can hoist it above the
-    // insert point.
- if (isHoistable(I, DT)) {
- // Check operands first.
- DenseSet<Instruction *> OpsHoistStops;
- bool AllOpsHoisted = true;
- for (Value *Op : I->operands()) {
- if (!checkHoistValue(Op, InsertPoint, DT, Unhoistables, &OpsHoistStops,
- Visited)) {
- AllOpsHoisted = false;
- break;
- }
- }
- if (AllOpsHoisted) {
- CHR_DEBUG(dbgs() << "checkHoistValue " << *I << "\n");
- if (HoistStops)
- HoistStops->insert(OpsHoistStops.begin(), OpsHoistStops.end());
- Visited[I] = true;
- return true;
- }
- }
- Visited[I] = false;
- return false;
- }
- // Non-instructions are considered hoistable.
- return true;
-}
-
-// Returns true and sets the true probability and false probability of an
-// MD_prof metadata if it's well-formed.
-static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb,
- BranchProbability &FalseProb) {
- if (!MD) return false;
- MDString *MDName = cast<MDString>(MD->getOperand(0));
- if (MDName->getString() != "branch_weights" ||
- MD->getNumOperands() != 3)
- return false;
- ConstantInt *TrueWeight = mdconst::extract<ConstantInt>(MD->getOperand(1));
- ConstantInt *FalseWeight = mdconst::extract<ConstantInt>(MD->getOperand(2));
- if (!TrueWeight || !FalseWeight)
- return false;
- uint64_t TrueWt = TrueWeight->getValue().getZExtValue();
- uint64_t FalseWt = FalseWeight->getValue().getZExtValue();
- uint64_t SumWt = TrueWt + FalseWt;
-
- assert(SumWt >= TrueWt && SumWt >= FalseWt &&
- "Overflow calculating branch probabilities.");
-
- // Guard against 0-to-0 branch weights to avoid a division-by-zero crash.
- if (SumWt == 0)
- return false;
-
- TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt);
- FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt);
- return true;
-}
-
-static BranchProbability getCHRBiasThreshold() {
- return BranchProbability::getBranchProbability(
- static_cast<uint64_t>(CHRBiasThreshold * 1000000), 1000000);
-}
-
-// A helper for checkBiasedBranch and checkBiasedSelect. If TrueProb >=
-// CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >=
-// CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return
-// false.
-template <typename K, typename S, typename M>
-static bool checkBias(K *Key, BranchProbability TrueProb,
- BranchProbability FalseProb, S &TrueSet, S &FalseSet,
- M &BiasMap) {
- BranchProbability Threshold = getCHRBiasThreshold();
- if (TrueProb >= Threshold) {
- TrueSet.insert(Key);
- BiasMap[Key] = TrueProb;
- return true;
- } else if (FalseProb >= Threshold) {
- FalseSet.insert(Key);
- BiasMap[Key] = FalseProb;
- return true;
- }
- return false;
-}
-
-// Returns true and inserts the region into the right biased set and the map
-// if the branch of the region is biased.
-static bool checkBiasedBranch(BranchInst *BI, Region *R,
- DenseSet<Region *> &TrueBiasedRegionsGlobal,
- DenseSet<Region *> &FalseBiasedRegionsGlobal,
- DenseMap<Region *, BranchProbability> &BranchBiasMap) {
- if (!BI->isConditional())
- return false;
- BranchProbability ThenProb, ElseProb;
- if (!checkMDProf(BI->getMetadata(LLVMContext::MD_prof),
- ThenProb, ElseProb))
- return false;
- BasicBlock *IfThen = BI->getSuccessor(0);
- BasicBlock *IfElse = BI->getSuccessor(1);
- assert((IfThen == R->getExit() || IfElse == R->getExit()) &&
- IfThen != IfElse &&
- "Invariant from findScopes");
- if (IfThen == R->getExit()) {
- // Swap them so that IfThen/ThenProb means going into the conditional code
- // and IfElse/ElseProb means skipping it.
- std::swap(IfThen, IfElse);
- std::swap(ThenProb, ElseProb);
- }
- CHR_DEBUG(dbgs() << "BI " << *BI << " ");
- CHR_DEBUG(dbgs() << "ThenProb " << ThenProb << " ");
- CHR_DEBUG(dbgs() << "ElseProb " << ElseProb << "\n");
- return checkBias(R, ThenProb, ElseProb,
- TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
- BranchBiasMap);
-}
-
-// Returns true and inserts the select into the right biased set and the map
-// if the select is biased.
-static bool checkBiasedSelect(
- SelectInst *SI, Region *R,
- DenseSet<SelectInst *> &TrueBiasedSelectsGlobal,
- DenseSet<SelectInst *> &FalseBiasedSelectsGlobal,
- DenseMap<SelectInst *, BranchProbability> &SelectBiasMap) {
- BranchProbability TrueProb, FalseProb;
- if (!checkMDProf(SI->getMetadata(LLVMContext::MD_prof),
- TrueProb, FalseProb))
- return false;
- CHR_DEBUG(dbgs() << "SI " << *SI << " ");
- CHR_DEBUG(dbgs() << "TrueProb " << TrueProb << " ");
- CHR_DEBUG(dbgs() << "FalseProb " << FalseProb << "\n");
- return checkBias(SI, TrueProb, FalseProb,
- TrueBiasedSelectsGlobal, FalseBiasedSelectsGlobal,
- SelectBiasMap);
-}
-
-// Returns the instruction at which to hoist the dependent condition values and
-// insert the CHR branch for a region. This is the terminator branch in the
-// entry block or the first select in the entry block, if any.
-static Instruction* getBranchInsertPoint(RegInfo &RI) {
- Region *R = RI.R;
- BasicBlock *EntryBB = R->getEntry();
- // The hoist point is by default the terminator of the entry block, which is
- // the same as the branch instruction if RI.HasBranch is true.
- Instruction *HoistPoint = EntryBB->getTerminator();
- for (SelectInst *SI : RI.Selects) {
- if (SI->getParent() == EntryBB) {
- // Pick the first select in Selects in the entry block. Note Selects is
- // sorted in the instruction order within a block (asserted below).
- HoistPoint = SI;
- break;
- }
- }
- assert(HoistPoint && "Null HoistPoint");
-#ifndef NDEBUG
- // Check that HoistPoint is the first one in Selects in the entry block,
- // if any.
- DenseSet<Instruction *> EntryBlockSelectSet;
- for (SelectInst *SI : RI.Selects) {
- if (SI->getParent() == EntryBB) {
- EntryBlockSelectSet.insert(SI);
- }
- }
- for (Instruction &I : *EntryBB) {
+ "Must be in head");
+ return true;
+ });
+ ArrayRef<CHRScope *> TailSubs(TailIt, Subs.end());
+
+ assert(HoistStopMap.empty() && "MapHoistStops must be empty");
+ auto *Scope = new CHRScope(TailRegInfos, TailSubs);
+ RegInfos.erase(BoundaryIt, RegInfos.end());
+ Subs.erase(TailIt, Subs.end());
+ return Scope;
+ }
+
+ bool contains(Instruction *I) const {
+ BasicBlock *Parent = I->getParent();
+ for (const RegInfo &RI : RegInfos)
+ if (RI.R->contains(Parent))
+ return true;
+ return false;
+ }
+
+ void print(raw_ostream &OS) const;
+
+ SmallVector<RegInfo, 8> RegInfos; // Regions that belong to this scope
+ SmallVector<CHRScope *, 8> Subs; // Subscopes.
+
+ // The instruction at which to insert the CHR conditional branch (and hoist
+ // the dependent condition values).
+ Instruction *BranchInsertPoint;
+
+ // True-biased and false-biased regions (conditional blocks),
+ // respectively. Used only for the outermost scope and includes regions in
+ // subscopes. The rest are unbiased.
+ DenseSet<Region *> TrueBiasedRegions;
+ DenseSet<Region *> FalseBiasedRegions;
+ // Among the biased regions, the regions that get CHRed.
+ SmallVector<RegInfo, 8> CHRRegions;
+
+ // True-biased and false-biased selects, respectively. Used only for the
+ // outermost scope and includes ones in subscopes.
+ DenseSet<SelectInst *> TrueBiasedSelects;
+ DenseSet<SelectInst *> FalseBiasedSelects;
+
+ // Map from one of the above regions to the instructions to stop
+ // hoisting instructions at through use-def chains.
+ HoistStopMapTy HoistStopMap;
+
+ private:
+ CHRScope(ArrayRef<RegInfo> RegInfosIn, ArrayRef<CHRScope *> SubsIn)
+ : RegInfos(RegInfosIn.begin(), RegInfosIn.end()),
+ Subs(SubsIn.begin(), SubsIn.end()), BranchInsertPoint(nullptr) {}
+};
+
+class CHR {
+ public:
+ CHR(Function &Fin, BlockFrequencyInfo &BFIin, DominatorTree &DTin,
+ ProfileSummaryInfo &PSIin, RegionInfo &RIin,
+ OptimizationRemarkEmitter &OREin)
+ : F(Fin), BFI(BFIin), DT(DTin), PSI(PSIin), RI(RIin), ORE(OREin) {}
+
+ ~CHR() {
+ for (CHRScope *Scope : Scopes) {
+ delete Scope;
+ }
+ }
+
+ bool run();
+
+ private:
+ // See the comments in CHR::run() for the high level flow of the algorithm and
+ // what the following functions do.
+
+ void findScopes(SmallVectorImpl<CHRScope *> &Output) {
+ Region *R = RI.getTopLevelRegion();
+ if (CHRScope *Scope = findScopes(R, nullptr, nullptr, Output)) {
+ Output.push_back(Scope);
+ }
+ }
+ CHRScope *findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
+ SmallVectorImpl<CHRScope *> &Scopes);
+ CHRScope *findScope(Region *R);
+ void checkScopeHoistable(CHRScope *Scope);
+
+ void splitScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+ SmallVector<CHRScope *, 8> splitScope(CHRScope *Scope,
+ CHRScope *Outer,
+ DenseSet<Value *> *OuterConditionValues,
+ Instruction *OuterInsertPoint,
+ SmallVectorImpl<CHRScope *> &Output,
+ DenseSet<Instruction *> &Unhoistables);
+
+ void classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes);
+ void classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope);
+
+ void filterScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+
+ void setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+ void setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope);
+
+ void sortScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+
+ void transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes);
+ void transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs);
+ void cloneScopeBlocks(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BasicBlock *ExitBlock,
+ Region *LastRegion,
+ ValueToValueMapTy &VMap);
+ BranchInst *createMergedBranch(BasicBlock *PreEntryBlock,
+ BasicBlock *EntryBlock,
+ BasicBlock *NewEntryBlock,
+ ValueToValueMapTy &VMap);
+ void fixupBranchesAndSelects(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BranchInst *MergedBR,
+ uint64_t ProfileCount);
+ void fixupBranch(Region *R,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition, BranchProbability &CHRBranchBias);
+ void fixupSelect(SelectInst* SI,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition, BranchProbability &CHRBranchBias);
+ void addToMergedCondition(bool IsTrueBiased, Value *Cond,
+ Instruction *BranchOrSelect,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition);
+
+ Function &F;
+ BlockFrequencyInfo &BFI;
+ DominatorTree &DT;
+ ProfileSummaryInfo &PSI;
+ RegionInfo &RI;
+ OptimizationRemarkEmitter &ORE;
+ CHRStats Stats;
+
+ // All the true-biased regions in the function
+ DenseSet<Region *> TrueBiasedRegionsGlobal;
+ // All the false-biased regions in the function
+ DenseSet<Region *> FalseBiasedRegionsGlobal;
+ // All the true-biased selects in the function
+ DenseSet<SelectInst *> TrueBiasedSelectsGlobal;
+ // All the false-biased selects in the function
+ DenseSet<SelectInst *> FalseBiasedSelectsGlobal;
+ // A map from biased regions to their branch bias
+ DenseMap<Region *, BranchProbability> BranchBiasMap;
+ // A map from biased selects to their branch bias
+ DenseMap<SelectInst *, BranchProbability> SelectBiasMap;
+ // All the scopes.
+ DenseSet<CHRScope *> Scopes;
+};
+
+} // end anonymous namespace
+
+static inline
+raw_ostream LLVM_ATTRIBUTE_UNUSED &operator<<(raw_ostream &OS,
+ const CHRStats &Stats) {
+ Stats.print(OS);
+ return OS;
+}
+
+static inline
+raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) {
+ Scope.print(OS);
+ return OS;
+}
+
+static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) {
+ if (ForceCHR)
+ return true;
+
+ if (!CHRModuleList.empty() || !CHRFunctionList.empty()) {
+ if (CHRModules.count(F.getParent()->getName()))
+ return true;
+ return CHRFunctions.count(F.getName());
+ }
+
+ assert(PSI.hasProfileSummary() && "Empty PSI?");
+ return PSI.isFunctionEntryHot(&F);
+}
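+
+// Usage sketch (hypothetical invocation): by default CHR only applies to
+// functions whose entry is hot per the profile summary; something like
+//
+//   opt -passes=chr -force-chr input.ll -S -o out.ll
+//
+// bypasses that check via -force-chr, while -chr-module-list and
+// -chr-function-list restrict CHR to the listed names instead.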
+
+static void LLVM_ATTRIBUTE_UNUSED dumpIR(Function &F, const char *Label,
+ CHRStats *Stats) {
+ StringRef FuncName = F.getName();
+ StringRef ModuleName = F.getParent()->getName();
+ (void)(FuncName); // Unused in release build.
+ (void)(ModuleName); // Unused in release build.
+ CHR_DEBUG(dbgs() << "CHR IR dump " << Label << " " << ModuleName << " "
+ << FuncName);
+ if (Stats)
+ CHR_DEBUG(dbgs() << " " << *Stats);
+ CHR_DEBUG(dbgs() << "\n");
+ CHR_DEBUG(F.dump());
+}
+
+void CHRScope::print(raw_ostream &OS) const {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ OS << "CHRScope[";
+ OS << RegInfos.size() << ", Regions[";
+ for (const RegInfo &RI : RegInfos) {
+ OS << RI.R->getNameStr();
+ if (RI.HasBranch)
+ OS << " B";
+ if (RI.Selects.size() > 0)
+ OS << " S" << RI.Selects.size();
+ OS << ", ";
+ }
+ if (RegInfos[0].R->getParent()) {
+ OS << "], Parent " << RegInfos[0].R->getParent()->getNameStr();
+ } else {
+ // top level region
+ OS << "]";
+ }
+ OS << ", Subs[";
+ for (CHRScope *Sub : Subs) {
+ OS << *Sub << ", ";
+ }
+ OS << "]]";
+}
+
+// Return true if the given instruction type can be hoisted by CHR.
+static bool isHoistableInstructionType(Instruction *I) {
+ return isa<BinaryOperator>(I) || isa<CastInst>(I) || isa<SelectInst>(I) ||
+ isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+ isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
+ isa<InsertValueInst>(I);
+}
+
+// Return true if the given instruction can be hoisted by CHR.
+static bool isHoistable(Instruction *I, DominatorTree &DT) {
+ if (!isHoistableInstructionType(I))
+ return false;
+ return isSafeToSpeculativelyExecute(I, nullptr, &DT);
+}
+
+// Recursively traverse the use-def chains of the given value and return a set
+// of the unhoistable base values defined within the scope (excluding the
+// first-region entry block) or the (hoistable or unhoistable) base values that
+// are defined outside (including the first-region entry block) of the
+// scope. The returned set doesn't include constants.
+static const std::set<Value *> &
+getBaseValues(Value *V, DominatorTree &DT,
+ DenseMap<Value *, std::set<Value *>> &Visited) {
+ auto It = Visited.find(V);
+ if (It != Visited.end()) {
+ return It->second;
+ }
+ std::set<Value *> Result;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // We don't stop at a block that's not in the Scope because we would miss
+ // some instructions that are based on the same base values if we stop
+ // there.
+ if (!isHoistable(I, DT)) {
+ Result.insert(I);
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
+ }
+ // I is hoistable above the Scope.
+ for (Value *Op : I->operands()) {
+ const std::set<Value *> &OpResult = getBaseValues(Op, DT, Visited);
+ Result.insert(OpResult.begin(), OpResult.end());
+ }
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
+ }
+ if (isa<Argument>(V)) {
+ Result.insert(V);
+ }
+  // We don't include other values, such as constants, because they won't lead
+  // to any chance of folding conditions (e.g. two bit checks merged into one
+  // check) after CHR.
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
+}
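+
+// Illustrative sketch with hypothetical IR: for a condition such as
+//
+//   %t = and i32 %a, 2        ; %a is a function argument
+//   %c = icmp ne i32 %t, 0
+//
+// both the 'and' and the 'icmp' are hoistable, so the recursion bottoms out at
+// the argument and the returned base-value set is { %a }. Two conditions that
+// test different bits of %a thus share a base value, which is what
+// shouldSplit() later checks when intersecting base-value sets.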
+
+// Return true if V is already hoisted or can be hoisted (along with its
+// operands) above the insert point. When it returns true and HoistStops is
+// non-null, the instructions to stop hoisting at through the use-def chains are
+// inserted into HoistStops.
+static bool
+checkHoistValue(Value *V, Instruction *InsertPoint, DominatorTree &DT,
+ DenseSet<Instruction *> &Unhoistables,
+ DenseSet<Instruction *> *HoistStops,
+ DenseMap<Instruction *, bool> &Visited) {
+ assert(InsertPoint && "Null InsertPoint");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto It = Visited.find(I);
+ if (It != Visited.end()) {
+ return It->second;
+ }
+ assert(DT.getNode(I->getParent()) && "DT must contain I's parent block");
+ assert(DT.getNode(InsertPoint->getParent()) && "DT must contain Destination");
+ if (Unhoistables.count(I)) {
+ // Don't hoist if they are not to be hoisted.
+ Visited[I] = false;
+ return false;
+ }
+ if (DT.dominates(I, InsertPoint)) {
+ // We are already above the insert point. Stop here.
+ if (HoistStops)
+ HoistStops->insert(I);
+ Visited[I] = true;
+ return true;
+ }
+    // We aren't above the insert point yet; check if we can hoist it above the
+    // insert point.
+ if (isHoistable(I, DT)) {
+ // Check operands first.
+ DenseSet<Instruction *> OpsHoistStops;
+ bool AllOpsHoisted = true;
+ for (Value *Op : I->operands()) {
+ if (!checkHoistValue(Op, InsertPoint, DT, Unhoistables, &OpsHoistStops,
+ Visited)) {
+ AllOpsHoisted = false;
+ break;
+ }
+ }
+ if (AllOpsHoisted) {
+ CHR_DEBUG(dbgs() << "checkHoistValue " << *I << "\n");
+ if (HoistStops)
+ HoistStops->insert(OpsHoistStops.begin(), OpsHoistStops.end());
+ Visited[I] = true;
+ return true;
+ }
+ }
+ Visited[I] = false;
+ return false;
+ }
+ // Non-instructions are considered hoistable.
+ return true;
+}
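+
+// Illustrative walk-through with hypothetical IR:
+//
+//   %x = load i32, i32* %p     ; defined above the insert point
+//   ...                        ; <- insert point
+//   %c = icmp eq i32 %x, 0     ; the condition being checked
+//
+// checkHoistValue(%c, ...) sees a hoistable icmp, recurses into %x, finds that
+// %x already dominates the insert point, records %x as a hoist stop, and
+// returns true. Had %c been in Unhoistables, it would have returned false.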
+
+// Returns true and sets the true probability and false probability of an
+// MD_prof metadata if it's well-formed.
+static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb,
+ BranchProbability &FalseProb) {
+ if (!MD) return false;
+ MDString *MDName = cast<MDString>(MD->getOperand(0));
+ if (MDName->getString() != "branch_weights" ||
+ MD->getNumOperands() != 3)
+ return false;
+ ConstantInt *TrueWeight = mdconst::extract<ConstantInt>(MD->getOperand(1));
+ ConstantInt *FalseWeight = mdconst::extract<ConstantInt>(MD->getOperand(2));
+ if (!TrueWeight || !FalseWeight)
+ return false;
+ uint64_t TrueWt = TrueWeight->getValue().getZExtValue();
+ uint64_t FalseWt = FalseWeight->getValue().getZExtValue();
+ uint64_t SumWt = TrueWt + FalseWt;
+
+ assert(SumWt >= TrueWt && SumWt >= FalseWt &&
+ "Overflow calculating branch probabilities.");
+
+ // Guard against 0-to-0 branch weights to avoid a division-by-zero crash.
+ if (SumWt == 0)
+ return false;
+
+ TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt);
+ FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt);
+ return true;
+}
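+
+// For reference, the MD_prof metadata read here is the standard branch_weights
+// form, e.g. (hypothetical IR):
+//
+//   br i1 %cond, label %then, label %else, !prof !0
+//   !0 = !{!"branch_weights", i32 99, i32 1}
+//
+// which yields TrueProb = 99/100 and FalseProb = 1/100.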
+
+static BranchProbability getCHRBiasThreshold() {
+ return BranchProbability::getBranchProbability(
+ static_cast<uint64_t>(CHRBiasThreshold * 1000000), 1000000);
+}
+
+// A helper for checkBiasedBranch and checkBiasedSelect. If TrueProb >=
+// CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >=
+// CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return
+// false.
+template <typename K, typename S, typename M>
+static bool checkBias(K *Key, BranchProbability TrueProb,
+ BranchProbability FalseProb, S &TrueSet, S &FalseSet,
+ M &BiasMap) {
+ BranchProbability Threshold = getCHRBiasThreshold();
+ if (TrueProb >= Threshold) {
+ TrueSet.insert(Key);
+ BiasMap[Key] = TrueProb;
+ return true;
+ } else if (FalseProb >= Threshold) {
+ FalseSet.insert(Key);
+ BiasMap[Key] = FalseProb;
+ return true;
+ }
+ return false;
+}
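+
+// Worked example (illustrative numbers): with the default -chr-bias-threshold
+// of 0.99, branch weights of 99:1 give TrueProb = 0.99 >= threshold, so the
+// key goes into TrueSet and checkBias returns true; weights of 90:10 give
+// probabilities of 0.9 and 0.1, neither reaches the threshold, and checkBias
+// returns false.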
+
+// Returns true and inserts the region into the right biased set and the map
+// if the branch of the region is biased.
+static bool checkBiasedBranch(BranchInst *BI, Region *R,
+ DenseSet<Region *> &TrueBiasedRegionsGlobal,
+ DenseSet<Region *> &FalseBiasedRegionsGlobal,
+ DenseMap<Region *, BranchProbability> &BranchBiasMap) {
+ if (!BI->isConditional())
+ return false;
+ BranchProbability ThenProb, ElseProb;
+ if (!checkMDProf(BI->getMetadata(LLVMContext::MD_prof),
+ ThenProb, ElseProb))
+ return false;
+ BasicBlock *IfThen = BI->getSuccessor(0);
+ BasicBlock *IfElse = BI->getSuccessor(1);
+ assert((IfThen == R->getExit() || IfElse == R->getExit()) &&
+ IfThen != IfElse &&
+ "Invariant from findScopes");
+ if (IfThen == R->getExit()) {
+ // Swap them so that IfThen/ThenProb means going into the conditional code
+ // and IfElse/ElseProb means skipping it.
+ std::swap(IfThen, IfElse);
+ std::swap(ThenProb, ElseProb);
+ }
+ CHR_DEBUG(dbgs() << "BI " << *BI << " ");
+ CHR_DEBUG(dbgs() << "ThenProb " << ThenProb << " ");
+ CHR_DEBUG(dbgs() << "ElseProb " << ElseProb << "\n");
+ return checkBias(R, ThenProb, ElseProb,
+ TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
+ BranchBiasMap);
+}
+
+// Returns true and inserts the select into the right biased set and the map
+// if the select is biased.
+static bool checkBiasedSelect(
+ SelectInst *SI, Region *R,
+ DenseSet<SelectInst *> &TrueBiasedSelectsGlobal,
+ DenseSet<SelectInst *> &FalseBiasedSelectsGlobal,
+ DenseMap<SelectInst *, BranchProbability> &SelectBiasMap) {
+ BranchProbability TrueProb, FalseProb;
+ if (!checkMDProf(SI->getMetadata(LLVMContext::MD_prof),
+ TrueProb, FalseProb))
+ return false;
+ CHR_DEBUG(dbgs() << "SI " << *SI << " ");
+ CHR_DEBUG(dbgs() << "TrueProb " << TrueProb << " ");
+ CHR_DEBUG(dbgs() << "FalseProb " << FalseProb << "\n");
+ return checkBias(SI, TrueProb, FalseProb,
+ TrueBiasedSelectsGlobal, FalseBiasedSelectsGlobal,
+ SelectBiasMap);
+}
+
+// Returns the instruction at which to hoist the dependent condition values and
+// insert the CHR branch for a region. This is the terminator branch in the
+// entry block or the first select in the entry block, if any.
+static Instruction* getBranchInsertPoint(RegInfo &RI) {
+ Region *R = RI.R;
+ BasicBlock *EntryBB = R->getEntry();
+ // The hoist point is by default the terminator of the entry block, which is
+ // the same as the branch instruction if RI.HasBranch is true.
+ Instruction *HoistPoint = EntryBB->getTerminator();
+ for (SelectInst *SI : RI.Selects) {
+ if (SI->getParent() == EntryBB) {
+ // Pick the first select in Selects in the entry block. Note Selects is
+ // sorted in the instruction order within a block (asserted below).
+ HoistPoint = SI;
+ break;
+ }
+ }
+ assert(HoistPoint && "Null HoistPoint");
+#ifndef NDEBUG
+ // Check that HoistPoint is the first one in Selects in the entry block,
+ // if any.
+ DenseSet<Instruction *> EntryBlockSelectSet;
+ for (SelectInst *SI : RI.Selects) {
+ if (SI->getParent() == EntryBB) {
+ EntryBlockSelectSet.insert(SI);
+ }
+ }
+ for (Instruction &I : *EntryBB) {
if (EntryBlockSelectSet.contains(&I)) {
- assert(&I == HoistPoint &&
- "HoistPoint must be the first one in Selects");
- break;
- }
- }
-#endif
- return HoistPoint;
-}
-
-// Find a CHR scope in the given region.
-CHRScope * CHR::findScope(Region *R) {
- CHRScope *Result = nullptr;
- BasicBlock *Entry = R->getEntry();
- BasicBlock *Exit = R->getExit(); // null if top level.
- assert(Entry && "Entry must not be null");
- assert((Exit == nullptr) == (R->isTopLevelRegion()) &&
- "Only top level region has a null exit");
- if (Entry)
- CHR_DEBUG(dbgs() << "Entry " << Entry->getName() << "\n");
- else
- CHR_DEBUG(dbgs() << "Entry null\n");
- if (Exit)
- CHR_DEBUG(dbgs() << "Exit " << Exit->getName() << "\n");
- else
- CHR_DEBUG(dbgs() << "Exit null\n");
- // Exclude cases where Entry is part of a subregion (hence it doesn't belong
- // to this region).
- bool EntryInSubregion = RI.getRegionFor(Entry) != R;
- if (EntryInSubregion)
- return nullptr;
- // Exclude loops
- for (BasicBlock *Pred : predecessors(Entry))
- if (R->contains(Pred))
- return nullptr;
- if (Exit) {
- // Try to find an if-then block (check if R is an if-then).
- // if (cond) {
- // ...
- // }
- auto *BI = dyn_cast<BranchInst>(Entry->getTerminator());
- if (BI)
- CHR_DEBUG(dbgs() << "BI.isConditional " << BI->isConditional() << "\n");
- else
- CHR_DEBUG(dbgs() << "BI null\n");
- if (BI && BI->isConditional()) {
- BasicBlock *S0 = BI->getSuccessor(0);
- BasicBlock *S1 = BI->getSuccessor(1);
- CHR_DEBUG(dbgs() << "S0 " << S0->getName() << "\n");
- CHR_DEBUG(dbgs() << "S1 " << S1->getName() << "\n");
- if (S0 != S1 && (S0 == Exit || S1 == Exit)) {
- RegInfo RI(R);
- RI.HasBranch = checkBiasedBranch(
- BI, R, TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
- BranchBiasMap);
- Result = new CHRScope(RI);
- Scopes.insert(Result);
- CHR_DEBUG(dbgs() << "Found a region with a branch\n");
- ++Stats.NumBranches;
- if (!RI.HasBranch) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "BranchNotBiased", BI)
- << "Branch not biased";
- });
- }
- }
- }
- }
- {
- // Try to look for selects in the direct child blocks (as opposed to in
- // subregions) of R.
- // ...
- // if (..) { // Some subregion
- // ...
- // }
- // if (..) { // Some subregion
- // ...
- // }
- // ...
- // a = cond ? b : c;
- // ...
- SmallVector<SelectInst *, 8> Selects;
- for (RegionNode *E : R->elements()) {
- if (E->isSubRegion())
- continue;
- // This returns the basic block of E if E is a direct child of R (not a
-      // subregion).
- BasicBlock *BB = E->getEntry();
-      // Push them in the order they appear in the block to make it easier to
-      // find the first Select later.
- for (Instruction &I : *BB) {
- if (auto *SI = dyn_cast<SelectInst>(&I)) {
- Selects.push_back(SI);
- ++Stats.NumBranches;
- }
- }
- }
- if (Selects.size() > 0) {
- auto AddSelects = [&](RegInfo &RI) {
- for (auto *SI : Selects)
- if (checkBiasedSelect(SI, RI.R,
- TrueBiasedSelectsGlobal,
- FalseBiasedSelectsGlobal,
- SelectBiasMap))
- RI.Selects.push_back(SI);
- else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "SelectNotBiased", SI)
- << "Select not biased";
- });
- };
- if (!Result) {
- CHR_DEBUG(dbgs() << "Found a select-only region\n");
- RegInfo RI(R);
- AddSelects(RI);
- Result = new CHRScope(RI);
- Scopes.insert(Result);
- } else {
- CHR_DEBUG(dbgs() << "Found select(s) in a region with a branch\n");
- AddSelects(Result->RegInfos[0]);
- }
- }
- }
-
- if (Result) {
- checkScopeHoistable(Result);
- }
- return Result;
-}
-
-// Check whether the branch and the selects in the region can be hoisted above
-// the CHR branch insert point (the most dominating of them, either the branch
-// at the end of the first block or the first select in the first block). If
-// the branch can't be hoisted, drop the selects in the first block.
-//
-// For example, for the following scope/region with selects, we want to insert
-// the merged branch right before the first select in the first/entry block by
-// hoisting c1, c2, c3, and c4.
-//
-// // Branch insert point here.
-// a = c1 ? b : c; // Select 1
-// d = c2 ? e : f; // Select 2
-// if (c3) { // Branch
-// ...
-// c4 = foo() // A call.
-// g = c4 ? h : i; // Select 3
-// }
-//
-// But suppose we can't hoist c4 because it's dependent on the preceding
-// call. Then, we drop Select 3. Furthermore, if we can't hoist c2, we also drop
-// Select 2. If we can't hoist c3, we drop Selects 1 & 2.
-void CHR::checkScopeHoistable(CHRScope *Scope) {
- RegInfo &RI = Scope->RegInfos[0];
- Region *R = RI.R;
- BasicBlock *EntryBB = R->getEntry();
- auto *Branch = RI.HasBranch ?
- cast<BranchInst>(EntryBB->getTerminator()) : nullptr;
- SmallVector<SelectInst *, 8> &Selects = RI.Selects;
- if (RI.HasBranch || !Selects.empty()) {
- Instruction *InsertPoint = getBranchInsertPoint(RI);
- CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
- // Avoid a data dependence from a select or a branch to a(nother)
-    // select. Note that no instruction can data-depend on a branch (a branch
- // instruction doesn't produce a value).
- DenseSet<Instruction *> Unhoistables;
- // Initialize Unhoistables with the selects.
- for (SelectInst *SI : Selects) {
- Unhoistables.insert(SI);
- }
- // Remove Selects that can't be hoisted.
- for (auto it = Selects.begin(); it != Selects.end(); ) {
- SelectInst *SI = *it;
- if (SI == InsertPoint) {
- ++it;
- continue;
- }
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint,
- DT, Unhoistables, nullptr, Visited);
- if (!IsHoistable) {
- CHR_DEBUG(dbgs() << "Dropping select " << *SI << "\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "DropUnhoistableSelect", SI)
- << "Dropped unhoistable select";
- });
- it = Selects.erase(it);
- // Since we are dropping the select here, we also drop it from
- // Unhoistables.
- Unhoistables.erase(SI);
- } else
- ++it;
- }
- // Update InsertPoint after potentially removing selects.
- InsertPoint = getBranchInsertPoint(RI);
- CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
- if (RI.HasBranch && InsertPoint != Branch) {
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(Branch->getCondition(), InsertPoint,
- DT, Unhoistables, nullptr, Visited);
- if (!IsHoistable) {
- // If the branch isn't hoistable, drop the selects in the entry
- // block, preferring the branch, which makes the branch the hoist
- // point.
- assert(InsertPoint != Branch && "Branch must not be the hoist point");
- CHR_DEBUG(dbgs() << "Dropping selects in entry block \n");
- CHR_DEBUG(
- for (SelectInst *SI : Selects) {
- dbgs() << "SI " << *SI << "\n";
- });
- for (SelectInst *SI : Selects) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "DropSelectUnhoistableBranch", SI)
- << "Dropped select due to unhoistable branch";
- });
- }
+ assert(&I == HoistPoint &&
+ "HoistPoint must be the first one in Selects");
+ break;
+ }
+ }
+#endif
+ return HoistPoint;
+}
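+
+// Illustrative example (hypothetical entry block):
+//
+//   entry:
+//     %v = select i1 %c1, i32 %a, i32 %b, !prof !1
+//     br i1 %c2, label %then, label %exit, !prof !2
+//
+// Here the branch insert / hoist point is the select, since it is the first
+// select in the entry block; with no selects in the entry block it would be
+// the terminating branch.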
+
+// Find a CHR scope in the given region.
+CHRScope * CHR::findScope(Region *R) {
+ CHRScope *Result = nullptr;
+ BasicBlock *Entry = R->getEntry();
+ BasicBlock *Exit = R->getExit(); // null if top level.
+ assert(Entry && "Entry must not be null");
+ assert((Exit == nullptr) == (R->isTopLevelRegion()) &&
+ "Only top level region has a null exit");
+ if (Entry)
+ CHR_DEBUG(dbgs() << "Entry " << Entry->getName() << "\n");
+ else
+ CHR_DEBUG(dbgs() << "Entry null\n");
+ if (Exit)
+ CHR_DEBUG(dbgs() << "Exit " << Exit->getName() << "\n");
+ else
+ CHR_DEBUG(dbgs() << "Exit null\n");
+ // Exclude cases where Entry is part of a subregion (hence it doesn't belong
+ // to this region).
+ bool EntryInSubregion = RI.getRegionFor(Entry) != R;
+ if (EntryInSubregion)
+ return nullptr;
+ // Exclude loops
+ for (BasicBlock *Pred : predecessors(Entry))
+ if (R->contains(Pred))
+ return nullptr;
+ if (Exit) {
+ // Try to find an if-then block (check if R is an if-then).
+ // if (cond) {
+ // ...
+ // }
+ auto *BI = dyn_cast<BranchInst>(Entry->getTerminator());
+ if (BI)
+ CHR_DEBUG(dbgs() << "BI.isConditional " << BI->isConditional() << "\n");
+ else
+ CHR_DEBUG(dbgs() << "BI null\n");
+ if (BI && BI->isConditional()) {
+ BasicBlock *S0 = BI->getSuccessor(0);
+ BasicBlock *S1 = BI->getSuccessor(1);
+ CHR_DEBUG(dbgs() << "S0 " << S0->getName() << "\n");
+ CHR_DEBUG(dbgs() << "S1 " << S1->getName() << "\n");
+ if (S0 != S1 && (S0 == Exit || S1 == Exit)) {
+ RegInfo RI(R);
+ RI.HasBranch = checkBiasedBranch(
+ BI, R, TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
+ BranchBiasMap);
+ Result = new CHRScope(RI);
+ Scopes.insert(Result);
+ CHR_DEBUG(dbgs() << "Found a region with a branch\n");
+ ++Stats.NumBranches;
+ if (!RI.HasBranch) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "BranchNotBiased", BI)
+ << "Branch not biased";
+ });
+ }
+ }
+ }
+ }
+ {
+ // Try to look for selects in the direct child blocks (as opposed to in
+ // subregions) of R.
+ // ...
+ // if (..) { // Some subregion
+ // ...
+ // }
+ // if (..) { // Some subregion
+ // ...
+ // }
+ // ...
+ // a = cond ? b : c;
+ // ...
+ SmallVector<SelectInst *, 8> Selects;
+ for (RegionNode *E : R->elements()) {
+ if (E->isSubRegion())
+ continue;
+ // This returns the basic block of E if E is a direct child of R (not a
+      // subregion).
+ BasicBlock *BB = E->getEntry();
+      // Push them in the order they appear in the block to make it easier to
+      // find the first Select later.
+ for (Instruction &I : *BB) {
+ if (auto *SI = dyn_cast<SelectInst>(&I)) {
+ Selects.push_back(SI);
+ ++Stats.NumBranches;
+ }
+ }
+ }
+ if (Selects.size() > 0) {
+ auto AddSelects = [&](RegInfo &RI) {
+ for (auto *SI : Selects)
+ if (checkBiasedSelect(SI, RI.R,
+ TrueBiasedSelectsGlobal,
+ FalseBiasedSelectsGlobal,
+ SelectBiasMap))
+ RI.Selects.push_back(SI);
+ else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "SelectNotBiased", SI)
+ << "Select not biased";
+ });
+ };
+ if (!Result) {
+ CHR_DEBUG(dbgs() << "Found a select-only region\n");
+ RegInfo RI(R);
+ AddSelects(RI);
+ Result = new CHRScope(RI);
+ Scopes.insert(Result);
+ } else {
+ CHR_DEBUG(dbgs() << "Found select(s) in a region with a branch\n");
+ AddSelects(Result->RegInfos[0]);
+ }
+ }
+ }
+
+ if (Result) {
+ checkScopeHoistable(Result);
+ }
+ return Result;
+}
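+
+// Shape of a region findScope accepts as an if-then (illustrative sketch):
+//
+//   Entry:                                  ; region entry
+//     br i1 %cond, label %Then, label %Exit
+//   Then:                                   ; conditional body
+//     br label %Exit
+//   Exit:                                   ; region exit
+//
+// i.e. the entry's conditional branch has the region exit as one successor.
+// Biased selects in the region's direct child blocks are collected as well,
+// with or without such a branch.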
+
+// Check whether the branch and the selects in the region can be hoisted above
+// the CHR branch insert point (the most dominating of them, either the branch
+// at the end of the first block or the first select in the first block). If
+// the branch can't be hoisted, drop the selects in the first block.
+//
+// For example, for the following scope/region with selects, we want to insert
+// the merged branch right before the first select in the first/entry block by
+// hoisting c1, c2, c3, and c4.
+//
+// // Branch insert point here.
+// a = c1 ? b : c; // Select 1
+// d = c2 ? e : f; // Select 2
+// if (c3) { // Branch
+// ...
+// c4 = foo() // A call.
+// g = c4 ? h : i; // Select 3
+// }
+//
+// But suppose we can't hoist c4 because it's dependent on the preceding
+// call. Then, we drop Select 3. Furthermore, if we can't hoist c2, we also drop
+// Select 2. If we can't hoist c3, we drop Selects 1 & 2.
+void CHR::checkScopeHoistable(CHRScope *Scope) {
+ RegInfo &RI = Scope->RegInfos[0];
+ Region *R = RI.R;
+ BasicBlock *EntryBB = R->getEntry();
+ auto *Branch = RI.HasBranch ?
+ cast<BranchInst>(EntryBB->getTerminator()) : nullptr;
+ SmallVector<SelectInst *, 8> &Selects = RI.Selects;
+ if (RI.HasBranch || !Selects.empty()) {
+ Instruction *InsertPoint = getBranchInsertPoint(RI);
+ CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+ // Avoid a data dependence from a select or a branch to a(nother)
+    // select. Note that no instruction can data-depend on a branch (a branch
+ // instruction doesn't produce a value).
+ DenseSet<Instruction *> Unhoistables;
+ // Initialize Unhoistables with the selects.
+ for (SelectInst *SI : Selects) {
+ Unhoistables.insert(SI);
+ }
+ // Remove Selects that can't be hoisted.
+ for (auto it = Selects.begin(); it != Selects.end(); ) {
+ SelectInst *SI = *it;
+ if (SI == InsertPoint) {
+ ++it;
+ continue;
+ }
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint,
+ DT, Unhoistables, nullptr, Visited);
+ if (!IsHoistable) {
+ CHR_DEBUG(dbgs() << "Dropping select " << *SI << "\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DropUnhoistableSelect", SI)
+ << "Dropped unhoistable select";
+ });
+ it = Selects.erase(it);
+ // Since we are dropping the select here, we also drop it from
+ // Unhoistables.
+ Unhoistables.erase(SI);
+ } else
+ ++it;
+ }
+ // Update InsertPoint after potentially removing selects.
+ InsertPoint = getBranchInsertPoint(RI);
+ CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+ if (RI.HasBranch && InsertPoint != Branch) {
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(Branch->getCondition(), InsertPoint,
+ DT, Unhoistables, nullptr, Visited);
+ if (!IsHoistable) {
+ // If the branch isn't hoistable, drop the selects in the entry
+ // block, preferring the branch, which makes the branch the hoist
+ // point.
+ assert(InsertPoint != Branch && "Branch must not be the hoist point");
+ CHR_DEBUG(dbgs() << "Dropping selects in entry block \n");
+ CHR_DEBUG(
+ for (SelectInst *SI : Selects) {
+ dbgs() << "SI " << *SI << "\n";
+ });
+ for (SelectInst *SI : Selects) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DropSelectUnhoistableBranch", SI)
+ << "Dropped select due to unhoistable branch";
+ });
+ }
llvm::erase_if(Selects, [EntryBB](SelectInst *SI) {
return SI->getParent() == EntryBB;
});
- Unhoistables.clear();
- InsertPoint = Branch;
- }
- }
- CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
-#ifndef NDEBUG
- if (RI.HasBranch) {
- assert(!DT.dominates(Branch, InsertPoint) &&
- "Branch can't be already above the hoist point");
- DenseMap<Instruction *, bool> Visited;
- assert(checkHoistValue(Branch->getCondition(), InsertPoint,
- DT, Unhoistables, nullptr, Visited) &&
- "checkHoistValue for branch");
- }
- for (auto *SI : Selects) {
- assert(!DT.dominates(SI, InsertPoint) &&
- "SI can't be already above the hoist point");
- DenseMap<Instruction *, bool> Visited;
- assert(checkHoistValue(SI->getCondition(), InsertPoint, DT,
- Unhoistables, nullptr, Visited) &&
- "checkHoistValue for selects");
- }
- CHR_DEBUG(dbgs() << "Result\n");
- if (RI.HasBranch) {
- CHR_DEBUG(dbgs() << "BI " << *Branch << "\n");
- }
- for (auto *SI : Selects) {
- CHR_DEBUG(dbgs() << "SI " << *SI << "\n");
- }
-#endif
- }
-}
-
-// Traverse the region tree, find all nested scopes and merge them if possible.
-CHRScope * CHR::findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
- SmallVectorImpl<CHRScope *> &Scopes) {
- CHR_DEBUG(dbgs() << "findScopes " << R->getNameStr() << "\n");
- CHRScope *Result = findScope(R);
- // Visit subscopes.
- CHRScope *ConsecutiveSubscope = nullptr;
- SmallVector<CHRScope *, 8> Subscopes;
- for (auto It = R->begin(); It != R->end(); ++It) {
- const std::unique_ptr<Region> &SubR = *It;
- auto NextIt = std::next(It);
- Region *NextSubR = NextIt != R->end() ? NextIt->get() : nullptr;
- CHR_DEBUG(dbgs() << "Looking at subregion " << SubR.get()->getNameStr()
- << "\n");
- CHRScope *SubCHRScope = findScopes(SubR.get(), NextSubR, R, Scopes);
- if (SubCHRScope) {
- CHR_DEBUG(dbgs() << "Subregion Scope " << *SubCHRScope << "\n");
- } else {
- CHR_DEBUG(dbgs() << "Subregion Scope null\n");
- }
- if (SubCHRScope) {
- if (!ConsecutiveSubscope)
- ConsecutiveSubscope = SubCHRScope;
- else if (!ConsecutiveSubscope->appendable(SubCHRScope)) {
- Subscopes.push_back(ConsecutiveSubscope);
- ConsecutiveSubscope = SubCHRScope;
- } else
- ConsecutiveSubscope->append(SubCHRScope);
- } else {
- if (ConsecutiveSubscope) {
- Subscopes.push_back(ConsecutiveSubscope);
- }
- ConsecutiveSubscope = nullptr;
- }
- }
- if (ConsecutiveSubscope) {
- Subscopes.push_back(ConsecutiveSubscope);
- }
- for (CHRScope *Sub : Subscopes) {
- if (Result) {
- // Combine it with the parent.
- Result->addSub(Sub);
- } else {
- // Push Subscopes as they won't be combined with the parent.
- Scopes.push_back(Sub);
- }
- }
- return Result;
-}
-
-static DenseSet<Value *> getCHRConditionValuesForRegion(RegInfo &RI) {
- DenseSet<Value *> ConditionValues;
- if (RI.HasBranch) {
- auto *BI = cast<BranchInst>(RI.R->getEntry()->getTerminator());
- ConditionValues.insert(BI->getCondition());
- }
- for (SelectInst *SI : RI.Selects) {
- ConditionValues.insert(SI->getCondition());
- }
- return ConditionValues;
-}
-
-// Determine whether to split a scope depending on the sets of the branch
-// condition values of the previous region and the current region. We split
-// it (return true) if 1) the condition values of the inner/lower scope can't
-// be hoisted up to the outer/upper scope, or 2) the two sets of the condition
-// values have an empty intersection (because the combined branch conditions
-// probably won't lead to a simpler combined condition).
-static bool shouldSplit(Instruction *InsertPoint,
- DenseSet<Value *> &PrevConditionValues,
- DenseSet<Value *> &ConditionValues,
- DominatorTree &DT,
- DenseSet<Instruction *> &Unhoistables) {
- assert(InsertPoint && "Null InsertPoint");
- CHR_DEBUG(
- dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues ";
- for (Value *V : PrevConditionValues) {
- dbgs() << *V << ", ";
- }
- dbgs() << " ConditionValues ";
- for (Value *V : ConditionValues) {
- dbgs() << *V << ", ";
- }
- dbgs() << "\n");
- // If any of Bases isn't hoistable to the hoist point, split.
- for (Value *V : ConditionValues) {
- DenseMap<Instruction *, bool> Visited;
- if (!checkHoistValue(V, InsertPoint, DT, Unhoistables, nullptr, Visited)) {
- CHR_DEBUG(dbgs() << "Split. checkHoistValue false " << *V << "\n");
- return true; // Not hoistable, split.
- }
- }
- // If PrevConditionValues or ConditionValues is empty, don't split to avoid
- // unnecessary splits at scopes with no branch/selects. If
- // PrevConditionValues and ConditionValues don't intersect at all, split.
- if (!PrevConditionValues.empty() && !ConditionValues.empty()) {
- // Use std::set as DenseSet doesn't work with set_intersection.
- std::set<Value *> PrevBases, Bases;
- DenseMap<Value *, std::set<Value *>> Visited;
- for (Value *V : PrevConditionValues) {
- const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
- PrevBases.insert(BaseValues.begin(), BaseValues.end());
- }
- for (Value *V : ConditionValues) {
- const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
- Bases.insert(BaseValues.begin(), BaseValues.end());
- }
- CHR_DEBUG(
- dbgs() << "PrevBases ";
- for (Value *V : PrevBases) {
- dbgs() << *V << ", ";
- }
- dbgs() << " Bases ";
- for (Value *V : Bases) {
- dbgs() << *V << ", ";
- }
- dbgs() << "\n");
- std::vector<Value *> Intersection;
- std::set_intersection(PrevBases.begin(), PrevBases.end(), Bases.begin(),
- Bases.end(), std::back_inserter(Intersection));
- if (Intersection.empty()) {
- // Empty intersection, split.
- CHR_DEBUG(dbgs() << "Split. Intersection empty\n");
- return true;
- }
- }
- CHR_DEBUG(dbgs() << "No split\n");
- return false; // Don't split.
-}
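-
-// Illustrative example (hypothetical values): if the previous region's
-// condition is (%a & 1) != 0 and the current region's condition is
-// (%a & 2) != 0, both base-value sets reduce to { %a }, the intersection is
-// non-empty, and the scopes stay merged. If the current condition instead
-// depended only on a different value %b, the intersection would be empty and
-// shouldSplit would return true.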
-
-static void getSelectsInScope(CHRScope *Scope,
- DenseSet<Instruction *> &Output) {
- for (RegInfo &RI : Scope->RegInfos)
- for (SelectInst *SI : RI.Selects)
- Output.insert(SI);
- for (CHRScope *Sub : Scope->Subs)
- getSelectsInScope(Sub, Output);
-}
-
-void CHR::splitScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- for (CHRScope *Scope : Input) {
- assert(!Scope->BranchInsertPoint &&
- "BranchInsertPoint must not be set");
- DenseSet<Instruction *> Unhoistables;
- getSelectsInScope(Scope, Unhoistables);
- splitScope(Scope, nullptr, nullptr, nullptr, Output, Unhoistables);
- }
-#ifndef NDEBUG
- for (CHRScope *Scope : Output) {
- assert(Scope->BranchInsertPoint && "BranchInsertPoint must be set");
- }
-#endif
-}
-
-SmallVector<CHRScope *, 8> CHR::splitScope(
- CHRScope *Scope,
- CHRScope *Outer,
- DenseSet<Value *> *OuterConditionValues,
- Instruction *OuterInsertPoint,
- SmallVectorImpl<CHRScope *> &Output,
- DenseSet<Instruction *> &Unhoistables) {
- if (Outer) {
- assert(OuterConditionValues && "Null OuterConditionValues");
- assert(OuterInsertPoint && "Null OuterInsertPoint");
- }
- bool PrevSplitFromOuter = true;
- DenseSet<Value *> PrevConditionValues;
- Instruction *PrevInsertPoint = nullptr;
- SmallVector<CHRScope *, 8> Splits;
- SmallVector<bool, 8> SplitsSplitFromOuter;
- SmallVector<DenseSet<Value *>, 8> SplitsConditionValues;
- SmallVector<Instruction *, 8> SplitsInsertPoints;
- SmallVector<RegInfo, 8> RegInfos(Scope->RegInfos); // Copy
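- // Walk the regions in order; whenever shouldSplit() fires, cut the scope at
- // the current region and start accumulating a new split.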
- for (RegInfo &RI : RegInfos) {
- Instruction *InsertPoint = getBranchInsertPoint(RI);
- DenseSet<Value *> ConditionValues = getCHRConditionValuesForRegion(RI);
- CHR_DEBUG(
- dbgs() << "ConditionValues ";
- for (Value *V : ConditionValues) {
- dbgs() << *V << ", ";
- }
- dbgs() << "\n");
- if (RI.R == RegInfos[0].R) {
- // First iteration. Check to see if we should split from the outer.
- if (Outer) {
- CHR_DEBUG(dbgs() << "Outer " << *Outer << "\n");
- CHR_DEBUG(dbgs() << "Should split from outer at "
- << RI.R->getNameStr() << "\n");
- if (shouldSplit(OuterInsertPoint, *OuterConditionValues,
- ConditionValues, DT, Unhoistables)) {
- PrevConditionValues = ConditionValues;
- PrevInsertPoint = InsertPoint;
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "SplitScopeFromOuter",
- RI.R->getEntry()->getTerminator())
- << "Split scope from outer due to unhoistable branch/select "
- << "and/or lack of common condition values";
- });
- } else {
- // Not splitting from the outer. Use the outer bases and insert
- // point. Union the bases.
- PrevSplitFromOuter = false;
- PrevConditionValues = *OuterConditionValues;
- PrevConditionValues.insert(ConditionValues.begin(),
- ConditionValues.end());
- PrevInsertPoint = OuterInsertPoint;
- }
- } else {
- CHR_DEBUG(dbgs() << "Outer null\n");
- PrevConditionValues = ConditionValues;
- PrevInsertPoint = InsertPoint;
- }
- } else {
- CHR_DEBUG(dbgs() << "Should split from prev at "
- << RI.R->getNameStr() << "\n");
- if (shouldSplit(PrevInsertPoint, PrevConditionValues, ConditionValues,
- DT, Unhoistables)) {
- CHRScope *Tail = Scope->split(RI.R);
- Scopes.insert(Tail);
- Splits.push_back(Scope);
- SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
- SplitsConditionValues.push_back(PrevConditionValues);
- SplitsInsertPoints.push_back(PrevInsertPoint);
- Scope = Tail;
- PrevConditionValues = ConditionValues;
- PrevInsertPoint = InsertPoint;
- PrevSplitFromOuter = true;
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "SplitScopeFromPrev",
- RI.R->getEntry()->getTerminator())
- << "Split scope from previous due to unhoistable branch/select "
- << "and/or lack of common condition values";
- });
- } else {
- // Not splitting. Union the bases. Keep the hoist point.
- PrevConditionValues.insert(ConditionValues.begin(), ConditionValues.end());
- }
- }
- }
- Splits.push_back(Scope);
- SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
- SplitsConditionValues.push_back(PrevConditionValues);
- assert(PrevInsertPoint && "Null PrevInsertPoint");
- SplitsInsertPoints.push_back(PrevInsertPoint);
- assert(Splits.size() == SplitsConditionValues.size() &&
- Splits.size() == SplitsSplitFromOuter.size() &&
- Splits.size() == SplitsInsertPoints.size() && "Mismatching sizes");
- for (size_t I = 0; I < Splits.size(); ++I) {
- CHRScope *Split = Splits[I];
- DenseSet<Value *> &SplitConditionValues = SplitsConditionValues[I];
- Instruction *SplitInsertPoint = SplitsInsertPoints[I];
- SmallVector<CHRScope *, 8> NewSubs;
- DenseSet<Instruction *> SplitUnhoistables;
- getSelectsInScope(Split, SplitUnhoistables);
- for (CHRScope *Sub : Split->Subs) {
- SmallVector<CHRScope *, 8> SubSplits = splitScope(
- Sub, Split, &SplitConditionValues, SplitInsertPoint, Output,
- SplitUnhoistables);
+ Unhoistables.clear();
+ InsertPoint = Branch;
+ }
+ }
+ CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+#ifndef NDEBUG
+ if (RI.HasBranch) {
+ assert(!DT.dominates(Branch, InsertPoint) &&
+ "Branch can't be already above the hoist point");
+ DenseMap<Instruction *, bool> Visited;
+ assert(checkHoistValue(Branch->getCondition(), InsertPoint,
+ DT, Unhoistables, nullptr, Visited) &&
+ "checkHoistValue for branch");
+ }
+ for (auto *SI : Selects) {
+ assert(!DT.dominates(SI, InsertPoint) &&
+ "SI can't be already above the hoist point");
+ DenseMap<Instruction *, bool> Visited;
+ assert(checkHoistValue(SI->getCondition(), InsertPoint, DT,
+ Unhoistables, nullptr, Visited) &&
+ "checkHoistValue for selects");
+ }
+ CHR_DEBUG(dbgs() << "Result\n");
+ if (RI.HasBranch) {
+ CHR_DEBUG(dbgs() << "BI " << *Branch << "\n");
+ }
+ for (auto *SI : Selects) {
+ CHR_DEBUG(dbgs() << "SI " << *SI << "\n");
+ }
+#endif
+ }
+}
+
+// Traverse the region tree, find all nested scopes and merge them if possible.
+CHRScope * CHR::findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
+ SmallVectorImpl<CHRScope *> &Scopes) {
+ CHR_DEBUG(dbgs() << "findScopes " << R->getNameStr() << "\n");
+ CHRScope *Result = findScope(R);
+ // Visit subscopes.
+ CHRScope *ConsecutiveSubscope = nullptr;
+ SmallVector<CHRScope *, 8> Subscopes;
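+ // Greedily merge runs of consecutive subscopes: keep appending to
+ // ConsecutiveSubscope while appendable() allows it; a gap or an unappendable
+ // subscope flushes the current run into Subscopes and starts a new one.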
+ for (auto It = R->begin(); It != R->end(); ++It) {
+ const std::unique_ptr<Region> &SubR = *It;
+ auto NextIt = std::next(It);
+ Region *NextSubR = NextIt != R->end() ? NextIt->get() : nullptr;
+ CHR_DEBUG(dbgs() << "Looking at subregion " << SubR.get()->getNameStr()
+ << "\n");
+ CHRScope *SubCHRScope = findScopes(SubR.get(), NextSubR, R, Scopes);
+ if (SubCHRScope) {
+ CHR_DEBUG(dbgs() << "Subregion Scope " << *SubCHRScope << "\n");
+ } else {
+ CHR_DEBUG(dbgs() << "Subregion Scope null\n");
+ }
+ if (SubCHRScope) {
+ if (!ConsecutiveSubscope)
+ ConsecutiveSubscope = SubCHRScope;
+ else if (!ConsecutiveSubscope->appendable(SubCHRScope)) {
+ Subscopes.push_back(ConsecutiveSubscope);
+ ConsecutiveSubscope = SubCHRScope;
+ } else
+ ConsecutiveSubscope->append(SubCHRScope);
+ } else {
+ if (ConsecutiveSubscope) {
+ Subscopes.push_back(ConsecutiveSubscope);
+ }
+ ConsecutiveSubscope = nullptr;
+ }
+ }
+ if (ConsecutiveSubscope) {
+ Subscopes.push_back(ConsecutiveSubscope);
+ }
+ for (CHRScope *Sub : Subscopes) {
+ if (Result) {
+ // Combine it with the parent.
+ Result->addSub(Sub);
+ } else {
+ // Push Subscopes as they won't be combined with the parent.
+ Scopes.push_back(Sub);
+ }
+ }
+ return Result;
+}
+
+static DenseSet<Value *> getCHRConditionValuesForRegion(RegInfo &RI) {
+ DenseSet<Value *> ConditionValues;
+ if (RI.HasBranch) {
+ auto *BI = cast<BranchInst>(RI.R->getEntry()->getTerminator());
+ ConditionValues.insert(BI->getCondition());
+ }
+ for (SelectInst *SI : RI.Selects) {
+ ConditionValues.insert(SI->getCondition());
+ }
+ return ConditionValues;
+}
+
+
+// Determine whether to split a scope depending on the sets of the branch
+// condition values of the previous region and the current region. We split
+// (return true) if 1) the condition values of the inner/lower scope can't be
+// hoisted up to the outer/upper scope, or 2) the two sets of the condition
+// values have an empty intersection (because the combined branch conditions
+// probably won't lead to a simpler combined condition).
+static bool shouldSplit(Instruction *InsertPoint,
+ DenseSet<Value *> &PrevConditionValues,
+ DenseSet<Value *> &ConditionValues,
+ DominatorTree &DT,
+ DenseSet<Instruction *> &Unhoistables) {
+ assert(InsertPoint && "Null InsertPoint");
+ CHR_DEBUG(
+ dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues ";
+ for (Value *V : PrevConditionValues) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << " ConditionValues ";
+ for (Value *V : ConditionValues) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << "\n");
+ // If any of Bases isn't hoistable to the hoist point, split.
+ for (Value *V : ConditionValues) {
+ DenseMap<Instruction *, bool> Visited;
+ if (!checkHoistValue(V, InsertPoint, DT, Unhoistables, nullptr, Visited)) {
+ CHR_DEBUG(dbgs() << "Split. checkHoistValue false " << *V << "\n");
+ return true; // Not hoistable, split.
+ }
+ }
+ // If PrevConditionValues or ConditionValues is empty, don't split to avoid
+ // unnecessary splits at scopes with no branch/selects. If
+ // PrevConditionValues and ConditionValues don't intersect at all, split.
+ if (!PrevConditionValues.empty() && !ConditionValues.empty()) {
+ // Use std::set as DenseSet doesn't work with set_intersection.
+ std::set<Value *> PrevBases, Bases;
+ DenseMap<Value *, std::set<Value *>> Visited;
+ for (Value *V : PrevConditionValues) {
+ const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
+ PrevBases.insert(BaseValues.begin(), BaseValues.end());
+ }
+ for (Value *V : ConditionValues) {
+ const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
+ Bases.insert(BaseValues.begin(), BaseValues.end());
+ }
+ CHR_DEBUG(
+ dbgs() << "PrevBases ";
+ for (Value *V : PrevBases) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << " Bases ";
+ for (Value *V : Bases) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << "\n");
+ std::vector<Value *> Intersection;
+ std::set_intersection(PrevBases.begin(), PrevBases.end(), Bases.begin(),
+ Bases.end(), std::back_inserter(Intersection));
+ if (Intersection.empty()) {
+ // Empty intersection, split.
+ CHR_DEBUG(dbgs() << "Split. Intersection empty\n");
+ return true;
+ }
+ }
+ CHR_DEBUG(dbgs() << "No split\n");
+ return false; // Don't split.
+}
+
+static void getSelectsInScope(CHRScope *Scope,
+ DenseSet<Instruction *> &Output) {
+ for (RegInfo &RI : Scope->RegInfos)
+ for (SelectInst *SI : RI.Selects)
+ Output.insert(SI);
+ for (CHRScope *Sub : Scope->Subs)
+ getSelectsInScope(Sub, Output);
+}
+
+void CHR::splitScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ for (CHRScope *Scope : Input) {
+ assert(!Scope->BranchInsertPoint &&
+ "BranchInsertPoint must not be set");
+ DenseSet<Instruction *> Unhoistables;
+ getSelectsInScope(Scope, Unhoistables);
+ splitScope(Scope, nullptr, nullptr, nullptr, Output, Unhoistables);
+ }
+#ifndef NDEBUG
+ for (CHRScope *Scope : Output) {
+ assert(Scope->BranchInsertPoint && "BranchInsertPoint must be set");
+ }
+#endif
+}
+
+SmallVector<CHRScope *, 8> CHR::splitScope(
+ CHRScope *Scope,
+ CHRScope *Outer,
+ DenseSet<Value *> *OuterConditionValues,
+ Instruction *OuterInsertPoint,
+ SmallVectorImpl<CHRScope *> &Output,
+ DenseSet<Instruction *> &Unhoistables) {
+ if (Outer) {
+ assert(OuterConditionValues && "Null OuterConditionValues");
+ assert(OuterInsertPoint && "Null OuterInsertPoint");
+ }
+ bool PrevSplitFromOuter = true;
+ DenseSet<Value *> PrevConditionValues;
+ Instruction *PrevInsertPoint = nullptr;
+ SmallVector<CHRScope *, 8> Splits;
+ SmallVector<bool, 8> SplitsSplitFromOuter;
+ SmallVector<DenseSet<Value *>, 8> SplitsConditionValues;
+ SmallVector<Instruction *, 8> SplitsInsertPoints;
+ SmallVector<RegInfo, 8> RegInfos(Scope->RegInfos); // Copy
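+ // Walk the regions in order; whenever shouldSplit() fires, cut the scope at
+ // the current region and start accumulating a new split.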
+ for (RegInfo &RI : RegInfos) {
+ Instruction *InsertPoint = getBranchInsertPoint(RI);
+ DenseSet<Value *> ConditionValues = getCHRConditionValuesForRegion(RI);
+ CHR_DEBUG(
+ dbgs() << "ConditionValues ";
+ for (Value *V : ConditionValues) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << "\n");
+ if (RI.R == RegInfos[0].R) {
+ // First iteration. Check to see if we should split from the outer.
+ if (Outer) {
+ CHR_DEBUG(dbgs() << "Outer " << *Outer << "\n");
+ CHR_DEBUG(dbgs() << "Should split from outer at "
+ << RI.R->getNameStr() << "\n");
+ if (shouldSplit(OuterInsertPoint, *OuterConditionValues,
+ ConditionValues, DT, Unhoistables)) {
+ PrevConditionValues = ConditionValues;
+ PrevInsertPoint = InsertPoint;
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "SplitScopeFromOuter",
+ RI.R->getEntry()->getTerminator())
+ << "Split scope from outer due to unhoistable branch/select "
+ << "and/or lack of common condition values";
+ });
+ } else {
+ // Not splitting from the outer. Use the outer bases and insert
+ // point. Union the bases.
+ PrevSplitFromOuter = false;
+ PrevConditionValues = *OuterConditionValues;
+ PrevConditionValues.insert(ConditionValues.begin(),
+ ConditionValues.end());
+ PrevInsertPoint = OuterInsertPoint;
+ }
+ } else {
+ CHR_DEBUG(dbgs() << "Outer null\n");
+ PrevConditionValues = ConditionValues;
+ PrevInsertPoint = InsertPoint;
+ }
+ } else {
+ CHR_DEBUG(dbgs() << "Should split from prev at "
+ << RI.R->getNameStr() << "\n");
+ if (shouldSplit(PrevInsertPoint, PrevConditionValues, ConditionValues,
+ DT, Unhoistables)) {
+ CHRScope *Tail = Scope->split(RI.R);
+ Scopes.insert(Tail);
+ Splits.push_back(Scope);
+ SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
+ SplitsConditionValues.push_back(PrevConditionValues);
+ SplitsInsertPoints.push_back(PrevInsertPoint);
+ Scope = Tail;
+ PrevConditionValues = ConditionValues;
+ PrevInsertPoint = InsertPoint;
+ PrevSplitFromOuter = true;
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "SplitScopeFromPrev",
+ RI.R->getEntry()->getTerminator())
+ << "Split scope from previous due to unhoistable branch/select "
+ << "and/or lack of common condition values";
+ });
+ } else {
+ // Not splitting. Union the bases. Keep the hoist point.
+ PrevConditionValues.insert(ConditionValues.begin(), ConditionValues.end());
+ }
+ }
+ }
+ Splits.push_back(Scope);
+ SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
+ SplitsConditionValues.push_back(PrevConditionValues);
+ assert(PrevInsertPoint && "Null PrevInsertPoint");
+ SplitsInsertPoints.push_back(PrevInsertPoint);
+ assert(Splits.size() == SplitsConditionValues.size() &&
+ Splits.size() == SplitsSplitFromOuter.size() &&
+ Splits.size() == SplitsInsertPoints.size() && "Mismatching sizes");
+ for (size_t I = 0; I < Splits.size(); ++I) {
+ CHRScope *Split = Splits[I];
+ DenseSet<Value *> &SplitConditionValues = SplitsConditionValues[I];
+ Instruction *SplitInsertPoint = SplitsInsertPoints[I];
+ SmallVector<CHRScope *, 8> NewSubs;
+ DenseSet<Instruction *> SplitUnhoistables;
+ getSelectsInScope(Split, SplitUnhoistables);
+ for (CHRScope *Sub : Split->Subs) {
+ SmallVector<CHRScope *, 8> SubSplits = splitScope(
+ Sub, Split, &SplitConditionValues, SplitInsertPoint, Output,
+ SplitUnhoistables);
llvm::append_range(NewSubs, SubSplits);
- }
- Split->Subs = NewSubs;
- }
- SmallVector<CHRScope *, 8> Result;
- for (size_t I = 0; I < Splits.size(); ++I) {
- CHRScope *Split = Splits[I];
- if (SplitsSplitFromOuter[I]) {
- // Split from the outer.
- Output.push_back(Split);
- Split->BranchInsertPoint = SplitsInsertPoints[I];
- CHR_DEBUG(dbgs() << "BranchInsertPoint " << *SplitsInsertPoints[I]
- << "\n");
- } else {
- // Connected to the outer.
- Result.push_back(Split);
- }
- }
- if (!Outer)
- assert(Result.empty() &&
- "If no outer (top-level), must return no nested ones");
- return Result;
-}
-
-void CHR::classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes) {
- for (CHRScope *Scope : Scopes) {
- assert(Scope->TrueBiasedRegions.empty() && Scope->FalseBiasedRegions.empty() && "Empty");
- classifyBiasedScopes(Scope, Scope);
- CHR_DEBUG(
- dbgs() << "classifyBiasedScopes " << *Scope << "\n";
- dbgs() << "TrueBiasedRegions ";
- for (Region *R : Scope->TrueBiasedRegions) {
- dbgs() << R->getNameStr() << ", ";
- }
- dbgs() << "\n";
- dbgs() << "FalseBiasedRegions ";
- for (Region *R : Scope->FalseBiasedRegions) {
- dbgs() << R->getNameStr() << ", ";
- }
- dbgs() << "\n";
- dbgs() << "TrueBiasedSelects ";
- for (SelectInst *SI : Scope->TrueBiasedSelects) {
- dbgs() << *SI << ", ";
- }
- dbgs() << "\n";
- dbgs() << "FalseBiasedSelects ";
- for (SelectInst *SI : Scope->FalseBiasedSelects) {
- dbgs() << *SI << ", ";
- }
- dbgs() << "\n";);
- }
-}
-
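-// Record the biased regions and selects of this scope and of its subscopes on
-// the outermost scope so that the scope tree is classified as a whole.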
-void CHR::classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope) {
- for (RegInfo &RI : Scope->RegInfos) {
- if (RI.HasBranch) {
- Region *R = RI.R;
+ }
+ Split->Subs = NewSubs;
+ }
+ SmallVector<CHRScope *, 8> Result;
+ for (size_t I = 0; I < Splits.size(); ++I) {
+ CHRScope *Split = Splits[I];
+ if (SplitsSplitFromOuter[I]) {
+ // Split from the outer.
+ Output.push_back(Split);
+ Split->BranchInsertPoint = SplitsInsertPoints[I];
+ CHR_DEBUG(dbgs() << "BranchInsertPoint " << *SplitsInsertPoints[I]
+ << "\n");
+ } else {
+ // Connected to the outer.
+ Result.push_back(Split);
+ }
+ }
+ if (!Outer)
+ assert(Result.empty() &&
+ "If no outer (top-level), must return no nested ones");
+ return Result;
+}
+
+void CHR::classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes) {
+ for (CHRScope *Scope : Scopes) {
+ assert(Scope->TrueBiasedRegions.empty() && Scope->FalseBiasedRegions.empty() && "Empty");
+ classifyBiasedScopes(Scope, Scope);
+ CHR_DEBUG(
+ dbgs() << "classifyBiasedScopes " << *Scope << "\n";
+ dbgs() << "TrueBiasedRegions ";
+ for (Region *R : Scope->TrueBiasedRegions) {
+ dbgs() << R->getNameStr() << ", ";
+ }
+ dbgs() << "\n";
+ dbgs() << "FalseBiasedRegions ";
+ for (Region *R : Scope->FalseBiasedRegions) {
+ dbgs() << R->getNameStr() << ", ";
+ }
+ dbgs() << "\n";
+ dbgs() << "TrueBiasedSelects ";
+ for (SelectInst *SI : Scope->TrueBiasedSelects) {
+ dbgs() << *SI << ", ";
+ }
+ dbgs() << "\n";
+ dbgs() << "FalseBiasedSelects ";
+ for (SelectInst *SI : Scope->FalseBiasedSelects) {
+ dbgs() << *SI << ", ";
+ }
+ dbgs() << "\n";);
+ }
+}
+
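+// Record the biased regions and selects of this scope and of its subscopes on
+// the outermost scope so that the scope tree is classified as a whole.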
+void CHR::classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope) {
+ for (RegInfo &RI : Scope->RegInfos) {
+ if (RI.HasBranch) {
+ Region *R = RI.R;
if (TrueBiasedRegionsGlobal.contains(R))
- OutermostScope->TrueBiasedRegions.insert(R);
+ OutermostScope->TrueBiasedRegions.insert(R);
else if (FalseBiasedRegionsGlobal.contains(R))
- OutermostScope->FalseBiasedRegions.insert(R);
- else
- llvm_unreachable("Must be biased");
- }
- for (SelectInst *SI : RI.Selects) {
+ OutermostScope->FalseBiasedRegions.insert(R);
+ else
+ llvm_unreachable("Must be biased");
+ }
+ for (SelectInst *SI : RI.Selects) {
if (TrueBiasedSelectsGlobal.contains(SI))
- OutermostScope->TrueBiasedSelects.insert(SI);
+ OutermostScope->TrueBiasedSelects.insert(SI);
else if (FalseBiasedSelectsGlobal.contains(SI))
- OutermostScope->FalseBiasedSelects.insert(SI);
- else
- llvm_unreachable("Must be biased");
- }
- }
- for (CHRScope *Sub : Scope->Subs) {
- classifyBiasedScopes(Sub, OutermostScope);
- }
-}
-
-static bool hasAtLeastTwoBiasedBranches(CHRScope *Scope) {
- unsigned NumBiased = Scope->TrueBiasedRegions.size() +
- Scope->FalseBiasedRegions.size() +
- Scope->TrueBiasedSelects.size() +
- Scope->FalseBiasedSelects.size();
- return NumBiased >= CHRMergeThreshold;
-}
-
-void CHR::filterScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- for (CHRScope *Scope : Input) {
- // Filter out the scopes with fewer than CHRMergeThreshold biased
- // branches or selects.
- if (!hasAtLeastTwoBiasedBranches(Scope)) {
- CHR_DEBUG(dbgs() << "Filtered out by biased branches truthy-regions "
- << Scope->TrueBiasedRegions.size()
- << " falsy-regions " << Scope->FalseBiasedRegions.size()
- << " true-selects " << Scope->TrueBiasedSelects.size()
- << " false-selects " << Scope->FalseBiasedSelects.size() << "\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE,
- "DropScopeWithOneBranchOrSelect",
- Scope->RegInfos[0].R->getEntry()->getTerminator())
- << "Drop scope with < "
- << ore::NV("CHRMergeThreshold", CHRMergeThreshold)
- << " biased branch(es) or select(s)";
- });
- continue;
- }
- Output.push_back(Scope);
- }
-}
-
-void CHR::setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- for (CHRScope *Scope : Input) {
- assert(Scope->HoistStopMap.empty() && Scope->CHRRegions.empty() &&
- "Empty");
- setCHRRegions(Scope, Scope);
- Output.push_back(Scope);
- CHR_DEBUG(
- dbgs() << "setCHRRegions HoistStopMap " << *Scope << "\n";
- for (auto pair : Scope->HoistStopMap) {
- Region *R = pair.first;
- dbgs() << "Region " << R->getNameStr() << "\n";
- for (Instruction *I : pair.second) {
- dbgs() << "HoistStop " << *I << "\n";
- }
- }
- dbgs() << "CHRRegions" << "\n";
- for (RegInfo &RI : Scope->CHRRegions) {
- dbgs() << RI.R->getNameStr() << "\n";
- });
- }
-}
-
-void CHR::setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope) {
- DenseSet<Instruction *> Unhoistables;
- // Put the biased selects in Unhoistables because they should stay where they
- // are and be constant-folded after CHR (in case one biased select or a branch
- // can depend on another biased select).
- for (RegInfo &RI : Scope->RegInfos) {
- for (SelectInst *SI : RI.Selects) {
- Unhoistables.insert(SI);
- }
- }
- Instruction *InsertPoint = OutermostScope->BranchInsertPoint;
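- // For each region, checkHoistValue() records in HoistStops the instructions at
- // which hoisting of its condition must stop; the region and its stops are then
- // registered on the outermost scope.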
- for (RegInfo &RI : Scope->RegInfos) {
- Region *R = RI.R;
- DenseSet<Instruction *> HoistStops;
- bool IsHoisted = false;
- if (RI.HasBranch) {
+ OutermostScope->FalseBiasedSelects.insert(SI);
+ else
+ llvm_unreachable("Must be biased");
+ }
+ }
+ for (CHRScope *Sub : Scope->Subs) {
+ classifyBiasedScopes(Sub, OutermostScope);
+ }
+}
+
+static bool hasAtLeastTwoBiasedBranches(CHRScope *Scope) {
+ unsigned NumBiased = Scope->TrueBiasedRegions.size() +
+ Scope->FalseBiasedRegions.size() +
+ Scope->TrueBiasedSelects.size() +
+ Scope->FalseBiasedSelects.size();
+ return NumBiased >= CHRMergeThreshold;
+}
+
+void CHR::filterScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ for (CHRScope *Scope : Input) {
+ // Filter out the scopes with fewer than CHRMergeThreshold biased
+ // branches or selects.
+ if (!hasAtLeastTwoBiasedBranches(Scope)) {
+ CHR_DEBUG(dbgs() << "Filtered out by biased branches truthy-regions "
+ << Scope->TrueBiasedRegions.size()
+ << " falsy-regions " << Scope->FalseBiasedRegions.size()
+ << " true-selects " << Scope->TrueBiasedSelects.size()
+ << " false-selects " << Scope->FalseBiasedSelects.size() << "\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE,
+ "DropScopeWithOneBranchOrSelect",
+ Scope->RegInfos[0].R->getEntry()->getTerminator())
+ << "Drop scope with < "
+ << ore::NV("CHRMergeThreshold", CHRMergeThreshold)
+ << " biased branch(es) or select(s)";
+ });
+ continue;
+ }
+ Output.push_back(Scope);
+ }
+}
+
+void CHR::setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ for (CHRScope *Scope : Input) {
+ assert(Scope->HoistStopMap.empty() && Scope->CHRRegions.empty() &&
+ "Empty");
+ setCHRRegions(Scope, Scope);
+ Output.push_back(Scope);
+ CHR_DEBUG(
+ dbgs() << "setCHRRegions HoistStopMap " << *Scope << "\n";
+ for (auto pair : Scope->HoistStopMap) {
+ Region *R = pair.first;
+ dbgs() << "Region " << R->getNameStr() << "\n";
+ for (Instruction *I : pair.second) {
+ dbgs() << "HoistStop " << *I << "\n";
+ }
+ }
+ dbgs() << "CHRRegions" << "\n";
+ for (RegInfo &RI : Scope->CHRRegions) {
+ dbgs() << RI.R->getNameStr() << "\n";
+ });
+ }
+}
+
+void CHR::setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope) {
+ DenseSet<Instruction *> Unhoistables;
+ // Put the biased selects in Unhoistables because they should stay where they
+ // are and be constant-folded after CHR (in case one biased select or a branch
+ // can depend on another biased select).
+ for (RegInfo &RI : Scope->RegInfos) {
+ for (SelectInst *SI : RI.Selects) {
+ Unhoistables.insert(SI);
+ }
+ }
+ Instruction *InsertPoint = OutermostScope->BranchInsertPoint;
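+ // For each region, checkHoistValue() records in HoistStops the instructions at
+ // which hoisting of its condition must stop; the region and its stops are then
+ // registered on the outermost scope.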
+ for (RegInfo &RI : Scope->RegInfos) {
+ Region *R = RI.R;
+ DenseSet<Instruction *> HoistStops;
+ bool IsHoisted = false;
+ if (RI.HasBranch) {
assert((OutermostScope->TrueBiasedRegions.contains(R) ||
OutermostScope->FalseBiasedRegions.contains(R)) &&
- "Must be truthy or falsy");
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- // Note checkHoistValue fills in HoistStops.
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(BI->getCondition(), InsertPoint, DT,
- Unhoistables, &HoistStops, Visited);
- assert(IsHoistable && "Must be hoistable");
- (void)(IsHoistable); // Unused in release build
- IsHoisted = true;
- }
- for (SelectInst *SI : RI.Selects) {
+ "Must be truthy or falsy");
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ // Note checkHoistValue fills in HoistStops.
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(BI->getCondition(), InsertPoint, DT,
+ Unhoistables, &HoistStops, Visited);
+ assert(IsHoistable && "Must be hoistable");
+ (void)(IsHoistable); // Unused in release build
+ IsHoisted = true;
+ }
+ for (SelectInst *SI : RI.Selects) {
assert((OutermostScope->TrueBiasedSelects.contains(SI) ||
OutermostScope->FalseBiasedSelects.contains(SI)) &&
- "Must be true or false biased");
- // Note checkHoistValue fills in HoistStops.
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint, DT,
- Unhoistables, &HoistStops, Visited);
- assert(IsHoistable && "Must be hoistable");
- (void)(IsHoistable); // Unused in release build
- IsHoisted = true;
- }
- if (IsHoisted) {
- OutermostScope->CHRRegions.push_back(RI);
- OutermostScope->HoistStopMap[R] = HoistStops;
- }
- }
- for (CHRScope *Sub : Scope->Subs)
- setCHRRegions(Sub, OutermostScope);
-}
-
-static bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
- return Scope1->RegInfos[0].R->getDepth() < Scope2->RegInfos[0].R->getDepth();
-}
-
-void CHR::sortScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- Output.resize(Input.size());
- llvm::copy(Input, Output.begin());
- llvm::stable_sort(Output, CHRScopeSorter);
-}
-
-// Hoist V (along with its operands) to the insert point, unless it has already
-// been hoisted, is a hoist stop, or already dominates the insert point.
-static void hoistValue(Value *V, Instruction *HoistPoint, Region *R,
- HoistStopMapTy &HoistStopMap,
- DenseSet<Instruction *> &HoistedSet,
- DenseSet<PHINode *> &TrivialPHIs,
- DominatorTree &DT) {
- auto IT = HoistStopMap.find(R);
- assert(IT != HoistStopMap.end() && "Region must be in hoist stop map");
- DenseSet<Instruction *> &HoistStops = IT->second;
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (I == HoistPoint)
- return;
- if (HoistStops.count(I))
- return;
- if (auto *PN = dyn_cast<PHINode>(I))
- if (TrivialPHIs.count(PN))
- // The trivial phi inserted by the previous CHR scope could replace a
- // non-phi in HoistStops. Note that since this phi is at the exit of a
- // previous CHR scope, which dominates this scope, it's safe to stop
- // hoisting there.
- return;
- if (HoistedSet.count(I))
- // Already hoisted, return.
- return;
- assert(isHoistableInstructionType(I) && "Unhoistable instruction type");
- assert(DT.getNode(I->getParent()) && "DT must contain I's block");
- assert(DT.getNode(HoistPoint->getParent()) &&
- "DT must contain HoistPoint block");
- if (DT.dominates(I, HoistPoint))
- // We are already above the hoist point. Stop here. This may be necessary
- // when multiple scopes would independently hoist the same
- // instruction. Since an outer (dominating) scope hoists it to its entry
- // before an inner (dominated) scope would hoist it to its own entry, the
- // inner scope may see the instruction already hoisted. In that case it is
- // potentially wrong for the inner scope to hoist it again (it could create a
- // non-dominating def and thus bad IR), but it is safe to simply skip the
- // hoist because the instruction is already in a block that dominates the
- // inner scope.
- return;
- for (Value *Op : I->operands()) {
- hoistValue(Op, HoistPoint, R, HoistStopMap, HoistedSet, TrivialPHIs, DT);
- }
- I->moveBefore(HoistPoint);
- HoistedSet.insert(I);
- CHR_DEBUG(dbgs() << "hoistValue " << *I << "\n");
- }
-}
-
-// Hoist the dependent condition values of the branches and the selects in the
-// scope to the insert point.
-static void hoistScopeConditions(CHRScope *Scope, Instruction *HoistPoint,
- DenseSet<PHINode *> &TrivialPHIs,
- DominatorTree &DT) {
- DenseSet<Instruction *> HoistedSet;
- for (const RegInfo &RI : Scope->CHRRegions) {
- Region *R = RI.R;
- bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
- bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
- if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- hoistValue(BI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
- HoistedSet, TrivialPHIs, DT);
- }
- for (SelectInst *SI : RI.Selects) {
- bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
- bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
- if (!(IsTrueBiased || IsFalseBiased))
- continue;
- hoistValue(SI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
- HoistedSet, TrivialPHIs, DT);
- }
- }
-}
-
-// Negate the predicate of an ICmp if it's used only by branches or selects, by
-// swapping the operands of the branches or the selects. Returns true on success.
-static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
- Instruction *ExcludedUser,
- CHRScope *Scope) {
- for (User *U : ICmp->users()) {
- if (U == ExcludedUser)
- continue;
- if (isa<BranchInst>(U) && cast<BranchInst>(U)->isConditional())
- continue;
- if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == ICmp)
- continue;
- return false;
- }
- for (User *U : ICmp->users()) {
- if (U == ExcludedUser)
- continue;
- if (auto *BI = dyn_cast<BranchInst>(U)) {
- assert(BI->isConditional() && "Must be conditional");
- BI->swapSuccessors();
- // We don't need to swap this in terms of
- // TrueBiasedRegions/FalseBiasedRegions because true-biased/false-biased
- // mean whether the branch is likely to go into the if-then rather than
- // successor0/successor1, and because we can tell which edge is the then or
- // the else one by comparing the destination to the region exit block.
- continue;
- }
- if (auto *SI = dyn_cast<SelectInst>(U)) {
- // Swap operands
- SI->swapValues();
- SI->swapProfMetadata();
- if (Scope->TrueBiasedSelects.count(SI)) {
- assert(Scope->FalseBiasedSelects.count(SI) == 0 &&
- "Must not be already in");
- Scope->FalseBiasedSelects.insert(SI);
- } else if (Scope->FalseBiasedSelects.count(SI)) {
- assert(Scope->TrueBiasedSelects.count(SI) == 0 &&
- "Must not be already in");
- Scope->TrueBiasedSelects.insert(SI);
- }
- continue;
- }
- llvm_unreachable("Must be a branch or a select");
- }
- ICmp->setPredicate(CmpInst::getInversePredicate(ICmp->getPredicate()));
- return true;
-}
-
-// A helper for transformScopes. Insert a trivial phi at the scope exit block
-// for a value that's defined in the scope but used outside it (meaning it's
-// alive at the exit block).
-static void insertTrivialPHIs(CHRScope *Scope,
- BasicBlock *EntryBlock, BasicBlock *ExitBlock,
- DenseSet<PHINode *> &TrivialPHIs) {
- SmallSetVector<BasicBlock *, 8> BlocksInScope;
- for (RegInfo &RI : Scope->RegInfos) {
- for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
- // sub-Scopes.
- BlocksInScope.insert(BB);
- }
- }
- CHR_DEBUG({
- dbgs() << "Inserting redundant phis\n";
- for (BasicBlock *BB : BlocksInScope)
- dbgs() << "BlockInScope " << BB->getName() << "\n";
- });
- for (BasicBlock *BB : BlocksInScope) {
- for (Instruction &I : *BB) {
- SmallVector<Instruction *, 8> Users;
- for (User *U : I.users()) {
- if (auto *UI = dyn_cast<Instruction>(U)) {
- if (BlocksInScope.count(UI->getParent()) == 0 &&
- // Unless there's already a phi for I at the exit block.
- !(isa<PHINode>(UI) && UI->getParent() == ExitBlock)) {
- CHR_DEBUG(dbgs() << "V " << I << "\n");
- CHR_DEBUG(dbgs() << "Used outside scope by user " << *UI << "\n");
- Users.push_back(UI);
- } else if (UI->getParent() == EntryBlock && isa<PHINode>(UI)) {
- // There's a loop backedge from a block that's dominated by this
- // scope to the entry block.
- CHR_DEBUG(dbgs() << "V " << I << "\n");
- CHR_DEBUG(dbgs()
- << "Used at entry block (for a back edge) by a phi user "
- << *UI << "\n");
- Users.push_back(UI);
- }
- }
- }
- if (Users.size() > 0) {
- // Insert a trivial phi for I (phi [&I, P0], [&I, P1], ...) at
- // ExitBlock. Replace I with the new phi in UI unless UI is another
- // phi at ExitBlock.
+ "Must be true or false biased");
+ // Note checkHoistValue fills in HoistStops.
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint, DT,
+ Unhoistables, &HoistStops, Visited);
+ assert(IsHoistable && "Must be hoistable");
+ (void)(IsHoistable); // Unused in release build
+ IsHoisted = true;
+ }
+ if (IsHoisted) {
+ OutermostScope->CHRRegions.push_back(RI);
+ OutermostScope->HoistStopMap[R] = HoistStops;
+ }
+ }
+ for (CHRScope *Sub : Scope->Subs)
+ setCHRRegions(Sub, OutermostScope);
+}
+
+static bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
+ return Scope1->RegInfos[0].R->getDepth() < Scope2->RegInfos[0].R->getDepth();
+}
+
+void CHR::sortScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ Output.resize(Input.size());
+ llvm::copy(Input, Output.begin());
+ llvm::stable_sort(Output, CHRScopeSorter);
+}
+
+// Hoist V (along with its operands) to the insert point, unless it has already
+// been hoisted, is a hoist stop, or already dominates the insert point.
+static void hoistValue(Value *V, Instruction *HoistPoint, Region *R,
+ HoistStopMapTy &HoistStopMap,
+ DenseSet<Instruction *> &HoistedSet,
+ DenseSet<PHINode *> &TrivialPHIs,
+ DominatorTree &DT) {
+ auto IT = HoistStopMap.find(R);
+ assert(IT != HoistStopMap.end() && "Region must be in hoist stop map");
+ DenseSet<Instruction *> &HoistStops = IT->second;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I == HoistPoint)
+ return;
+ if (HoistStops.count(I))
+ return;
+ if (auto *PN = dyn_cast<PHINode>(I))
+ if (TrivialPHIs.count(PN))
+ // The trivial phi inserted by the previous CHR scope could replace a
+ // non-phi in HoistStops. Note that since this phi is at the exit of a
+ // previous CHR scope, which dominates this scope, it's safe to stop
+ // hoisting there.
+ return;
+ if (HoistedSet.count(I))
+ // Already hoisted, return.
+ return;
+ assert(isHoistableInstructionType(I) && "Unhoistable instruction type");
+ assert(DT.getNode(I->getParent()) && "DT must contain I's block");
+ assert(DT.getNode(HoistPoint->getParent()) &&
+ "DT must contain HoistPoint block");
+ if (DT.dominates(I, HoistPoint))
+ // We are already above the hoist point. Stop here. This may be necessary
+ // when multiple scopes would independently hoist the same
+ // instruction. Since an outer (dominating) scope hoists it to its entry
+ // before an inner (dominated) scope would hoist it to its own entry, the
+ // inner scope may see the instruction already hoisted. In that case it is
+ // potentially wrong for the inner scope to hoist it again (it could create a
+ // non-dominating def and thus bad IR), but it is safe to simply skip the
+ // hoist because the instruction is already in a block that dominates the
+ // inner scope.
+ return;
+ for (Value *Op : I->operands()) {
+ hoistValue(Op, HoistPoint, R, HoistStopMap, HoistedSet, TrivialPHIs, DT);
+ }
+ I->moveBefore(HoistPoint);
+ HoistedSet.insert(I);
+ CHR_DEBUG(dbgs() << "hoistValue " << *I << "\n");
+ }
+}
+
+// Hoist the dependent condition values of the branches and the selects in the
+// scope to the insert point.
+static void hoistScopeConditions(CHRScope *Scope, Instruction *HoistPoint,
+ DenseSet<PHINode *> &TrivialPHIs,
+ DominatorTree &DT) {
+ DenseSet<Instruction *> HoistedSet;
+ for (const RegInfo &RI : Scope->CHRRegions) {
+ Region *R = RI.R;
+ bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+ bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
+ if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ hoistValue(BI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
+ HoistedSet, TrivialPHIs, DT);
+ }
+ for (SelectInst *SI : RI.Selects) {
+ bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+ bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
+ if (!(IsTrueBiased || IsFalseBiased))
+ continue;
+ hoistValue(SI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
+ HoistedSet, TrivialPHIs, DT);
+ }
+ }
+}
+
+// Negate the predicate of an ICmp if it's used only by branches or selects, by
+// swapping the operands of the branches or the selects. Returns true on success.
+static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
+ Instruction *ExcludedUser,
+ CHRScope *Scope) {
+ for (User *U : ICmp->users()) {
+ if (U == ExcludedUser)
+ continue;
+ if (isa<BranchInst>(U) && cast<BranchInst>(U)->isConditional())
+ continue;
+ if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == ICmp)
+ continue;
+ return false;
+ }
+ for (User *U : ICmp->users()) {
+ if (U == ExcludedUser)
+ continue;
+ if (auto *BI = dyn_cast<BranchInst>(U)) {
+ assert(BI->isConditional() && "Must be conditional");
+ BI->swapSuccessors();
+ // We don't need to swap this in terms of
+ // TrueBiasedRegions/FalseBiasedRegions because true-biased/false-biased
+ // mean whether the branch is likely to go into the if-then rather than
+ // successor0/successor1, and because we can tell which edge is the then or
+ // the else one by comparing the destination to the region exit block.
+ continue;
+ }
+ if (auto *SI = dyn_cast<SelectInst>(U)) {
+ // Swap operands
+ SI->swapValues();
+ SI->swapProfMetadata();
+ if (Scope->TrueBiasedSelects.count(SI)) {
+ assert(Scope->FalseBiasedSelects.count(SI) == 0 &&
+ "Must not be already in");
+ Scope->FalseBiasedSelects.insert(SI);
+ } else if (Scope->FalseBiasedSelects.count(SI)) {
+ assert(Scope->TrueBiasedSelects.count(SI) == 0 &&
+ "Must not be already in");
+ Scope->TrueBiasedSelects.insert(SI);
+ }
+ continue;
+ }
+ llvm_unreachable("Must be a branch or a select");
+ }
+ ICmp->setPredicate(CmpInst::getInversePredicate(ICmp->getPredicate()));
+ return true;
+}
+
+// A helper for transformScopes. Insert a trivial phi at the scope exit block
+// for a value that's defined in the scope but used outside it (meaning it's
+// alive at the exit block).
+static void insertTrivialPHIs(CHRScope *Scope,
+ BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+ DenseSet<PHINode *> &TrivialPHIs) {
+ SmallSetVector<BasicBlock *, 8> BlocksInScope;
+ for (RegInfo &RI : Scope->RegInfos) {
+ for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
+ // sub-Scopes.
+ BlocksInScope.insert(BB);
+ }
+ }
+ CHR_DEBUG({
+ dbgs() << "Inserting redundant phis\n";
+ for (BasicBlock *BB : BlocksInScope)
+ dbgs() << "BlockInScope " << BB->getName() << "\n";
+ });
+ for (BasicBlock *BB : BlocksInScope) {
+ for (Instruction &I : *BB) {
+ SmallVector<Instruction *, 8> Users;
+ for (User *U : I.users()) {
+ if (auto *UI = dyn_cast<Instruction>(U)) {
+ if (BlocksInScope.count(UI->getParent()) == 0 &&
+ // Unless there's already a phi for I at the exit block.
+ !(isa<PHINode>(UI) && UI->getParent() == ExitBlock)) {
+ CHR_DEBUG(dbgs() << "V " << I << "\n");
+ CHR_DEBUG(dbgs() << "Used outside scope by user " << *UI << "\n");
+ Users.push_back(UI);
+ } else if (UI->getParent() == EntryBlock && isa<PHINode>(UI)) {
+ // There's a loop backedge from a block that's dominated by this
+ // scope to the entry block.
+ CHR_DEBUG(dbgs() << "V " << I << "\n");
+ CHR_DEBUG(dbgs()
+ << "Used at entry block (for a back edge) by a phi user "
+ << *UI << "\n");
+ Users.push_back(UI);
+ }
+ }
+ }
+ if (Users.size() > 0) {
+ // Insert a trivial phi for I (phi [&I, P0], [&I, P1], ...) at
+ // ExitBlock. Replace I with the new phi in UI unless UI is another
+ // phi at ExitBlock.
PHINode *PN = PHINode::Create(I.getType(), pred_size(ExitBlock), "",
- &ExitBlock->front());
- for (BasicBlock *Pred : predecessors(ExitBlock)) {
- PN->addIncoming(&I, Pred);
- }
- TrivialPHIs.insert(PN);
- CHR_DEBUG(dbgs() << "Insert phi " << *PN << "\n");
- for (Instruction *UI : Users) {
- for (unsigned J = 0, NumOps = UI->getNumOperands(); J < NumOps; ++J) {
- if (UI->getOperand(J) == &I) {
- UI->setOperand(J, PN);
- }
- }
- CHR_DEBUG(dbgs() << "Updated user " << *UI << "\n");
- }
- }
- }
- }
-}
-
-// Assert that all the CHR regions of the scope have a biased branch or select.
-static void LLVM_ATTRIBUTE_UNUSED
-assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) {
-#ifndef NDEBUG
- auto HasBiasedBranchOrSelect = [](RegInfo &RI, CHRScope *Scope) {
- if (Scope->TrueBiasedRegions.count(RI.R) ||
- Scope->FalseBiasedRegions.count(RI.R))
- return true;
- for (SelectInst *SI : RI.Selects)
- if (Scope->TrueBiasedSelects.count(SI) ||
- Scope->FalseBiasedSelects.count(SI))
- return true;
- return false;
- };
- for (RegInfo &RI : Scope->CHRRegions) {
- assert(HasBiasedBranchOrSelect(RI, Scope) &&
- "Must have biased branch or select");
- }
-#endif
-}
-
-// Assert that all the condition values of the biased branches and selects have
-// been hoisted to the pre-entry block or outside of the scope.
-static void LLVM_ATTRIBUTE_UNUSED assertBranchOrSelectConditionHoisted(
- CHRScope *Scope, BasicBlock *PreEntryBlock) {
- CHR_DEBUG(dbgs() << "Biased regions condition values \n");
- for (RegInfo &RI : Scope->CHRRegions) {
- Region *R = RI.R;
- bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
- bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
- if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- Value *V = BI->getCondition();
- CHR_DEBUG(dbgs() << *V << "\n");
- if (auto *I = dyn_cast<Instruction>(V)) {
- (void)(I); // Unused in release build.
- assert((I->getParent() == PreEntryBlock ||
- !Scope->contains(I)) &&
- "Must have been hoisted to PreEntryBlock or outside the scope");
- }
- }
- for (SelectInst *SI : RI.Selects) {
- bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
- bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
- if (!(IsTrueBiased || IsFalseBiased))
- continue;
- Value *V = SI->getCondition();
- CHR_DEBUG(dbgs() << *V << "\n");
- if (auto *I = dyn_cast<Instruction>(V)) {
- (void)(I); // Unused in release build.
- assert((I->getParent() == PreEntryBlock ||
- !Scope->contains(I)) &&
- "Must have been hoisted to PreEntryBlock or outside the scope");
- }
- }
- }
-}
-
-void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
- CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");
-
- assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");
- Region *FirstRegion = Scope->RegInfos[0].R;
- BasicBlock *EntryBlock = FirstRegion->getEntry();
- Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;
- BasicBlock *ExitBlock = LastRegion->getExit();
- Optional<uint64_t> ProfileCount = BFI.getBlockProfileCount(EntryBlock);
-
- if (ExitBlock) {
- // Insert a trivial phi at the exit block (where the CHR hot path and the
- // cold path merges) for a value that's defined in the scope but used
- // outside it (meaning it's alive at the exit block). We will add the
- // incoming values for the CHR cold paths to it below. Without this, we'd
- // miss updating phi's for such values unless there happens to already be a
- // phi for that value there.
- insertTrivialPHIs(Scope, EntryBlock, ExitBlock, TrivialPHIs);
- }
-
- // Split the entry block of the first region. The new block becomes the new
- // entry block of the first region. The old entry block becomes the block to
- // insert the CHR branch into. Note that DT gets updated through the split.
- // Because we update the entry of the first region after the split, and because
- // Region only points to the entry and the exit blocks rather than keeping
- // everything in a list or set, the block membership and the entry/exit blocks
- // of the region are still valid after the split.
- CHR_DEBUG(dbgs() << "Splitting entry block " << EntryBlock->getName()
- << " at " << *Scope->BranchInsertPoint << "\n");
- BasicBlock *NewEntryBlock =
- SplitBlock(EntryBlock, Scope->BranchInsertPoint, &DT);
- assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
- "NewEntryBlock's only pred must be EntryBlock");
- FirstRegion->replaceEntryRecursive(NewEntryBlock);
- BasicBlock *PreEntryBlock = EntryBlock;
-
- ValueToValueMapTy VMap;
- // Clone the blocks in the scope (excluding the PreEntryBlock) to split into a
- // hot path (originals) and a cold path (clones) and update the PHIs at the
- // exit block.
- cloneScopeBlocks(Scope, PreEntryBlock, ExitBlock, LastRegion, VMap);
-
- // Replace the old (placeholder) branch with the new (merged) conditional
- // branch.
- BranchInst *MergedBr = createMergedBranch(PreEntryBlock, EntryBlock,
- NewEntryBlock, VMap);
-
-#ifndef NDEBUG
- assertCHRRegionsHaveBiasedBranchOrSelect(Scope);
-#endif
-
- // Hoist the conditional values of the branches/selects.
- hoistScopeConditions(Scope, PreEntryBlock->getTerminator(), TrivialPHIs, DT);
-
-#ifndef NDEBUG
- assertBranchOrSelectConditionHoisted(Scope, PreEntryBlock);
-#endif
-
- // Create the combined branch condition and constant-fold the branches/selects
- // in the hot path.
- fixupBranchesAndSelects(Scope, PreEntryBlock, MergedBr,
- ProfileCount ? ProfileCount.getValue() : 0);
-}
-
-// A helper for transformScopes. Clone the blocks in the scope (excluding the
-// PreEntryBlock) to split into a hot path and a cold path and update the PHIs
-// at the exit block.
-void CHR::cloneScopeBlocks(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BasicBlock *ExitBlock,
- Region *LastRegion,
- ValueToValueMapTy &VMap) {
- // Clone all the blocks. The original blocks will be the hot-path
- // CHR-optimized code and the cloned blocks will be the original unoptimized
- // code. This is so that the block pointers from the
- // CHRScope/Region/RegionInfo stay valid, pointing to the hot-path code to
- // which CHR should apply.
- SmallVector<BasicBlock*, 8> NewBlocks;
- for (RegInfo &RI : Scope->RegInfos)
- for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
- // sub-Scopes.
- assert(BB != PreEntryBlock && "Don't copy the pre-entry block");
- BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F);
- NewBlocks.push_back(NewBB);
- VMap[BB] = NewBB;
- }
-
- // Place the cloned blocks right after the original blocks (right before the
- // exit block).
- if (ExitBlock)
- F.getBasicBlockList().splice(ExitBlock->getIterator(),
- F.getBasicBlockList(),
- NewBlocks[0]->getIterator(), F.end());
-
- // Update the cloned blocks/instructions to refer to themselves.
- for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
- for (Instruction &I : *NewBlocks[i])
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Add the cloned blocks to the PHIs of the exit blocks. ExitBlock is null for
- // the top-level region but we don't need to add PHIs. The trivial PHIs
- // inserted above will be updated here.
- if (ExitBlock)
- for (PHINode &PN : ExitBlock->phis())
- for (unsigned I = 0, NumOps = PN.getNumIncomingValues(); I < NumOps;
- ++I) {
- BasicBlock *Pred = PN.getIncomingBlock(I);
- if (LastRegion->contains(Pred)) {
- Value *V = PN.getIncomingValue(I);
- auto It = VMap.find(V);
- if (It != VMap.end()) V = It->second;
- assert(VMap.find(Pred) != VMap.end() && "Pred must have been cloned");
- PN.addIncoming(V, cast<BasicBlock>(VMap[Pred]));
- }
- }
-}
-
-// A helper for transformScope. Replace the old (placeholder) branch with the
-// new (merged) conditional branch.
-BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
- BasicBlock *EntryBlock,
- BasicBlock *NewEntryBlock,
- ValueToValueMapTy &VMap) {
- BranchInst *OldBR = cast<BranchInst>(PreEntryBlock->getTerminator());
- assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == NewEntryBlock &&
- "SplitBlock did not work correctly!");
- assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
- "NewEntryBlock's only pred must be EntryBlock");
- assert(VMap.find(NewEntryBlock) != VMap.end() &&
- "NewEntryBlock must have been copied");
- OldBR->dropAllReferences();
- OldBR->eraseFromParent();
- // The true predicate is a placeholder. It will be replaced later in
- // fixupBranchesAndSelects().
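- // Successor 0 is the hot (original) path and successor 1 is the cloned cold
- // path.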
- BranchInst *NewBR = BranchInst::Create(NewEntryBlock,
- cast<BasicBlock>(VMap[NewEntryBlock]),
- ConstantInt::getTrue(F.getContext()));
- PreEntryBlock->getInstList().push_back(NewBR);
- assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
- "NewEntryBlock's only pred must be EntryBlock");
- return NewBR;
-}
-
-// A helper for transformScopes. Create the combined branch condition and
-// constant-fold the branches/selects in the hot path.
-void CHR::fixupBranchesAndSelects(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BranchInst *MergedBR,
- uint64_t ProfileCount) {
- Value *MergedCondition = ConstantInt::getTrue(F.getContext());
- BranchProbability CHRBranchBias(1, 1);
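- // CHRBranchBias starts at 100% and fixupBranch/fixupSelect lower it to the
- // minimum bias among the merged branches/selects; that minimum becomes the
- // weight of the merged CHR branch.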
- uint64_t NumCHRedBranches = 0;
- IRBuilder<> IRB(PreEntryBlock->getTerminator());
- for (RegInfo &RI : Scope->CHRRegions) {
- Region *R = RI.R;
- if (RI.HasBranch) {
- fixupBranch(R, Scope, IRB, MergedCondition, CHRBranchBias);
- ++NumCHRedBranches;
- }
- for (SelectInst *SI : RI.Selects) {
- fixupSelect(SI, Scope, IRB, MergedCondition, CHRBranchBias);
- ++NumCHRedBranches;
- }
- }
- Stats.NumBranchesDelta += NumCHRedBranches - 1;
- Stats.WeightedNumBranchesDelta += (NumCHRedBranches - 1) * ProfileCount;
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE,
- "CHR",
- // Refer to the hot (original) path
- MergedBR->getSuccessor(0)->getTerminator())
- << "Merged " << ore::NV("NumCHRedBranches", NumCHRedBranches)
- << " branches or selects";
- });
- MergedBR->setCondition(MergedCondition);
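- // Scale the combined bias to weights out of 1000 for the branch_weights
- // metadata on the merged branch.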
- uint32_t Weights[] = {
- static_cast<uint32_t>(CHRBranchBias.scale(1000)),
- static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)),
- };
- MDBuilder MDB(F.getContext());
- MergedBR->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
- CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1]
- << "\n");
-}
-
-// A helper for fixupBranchesAndSelects. Add to the combined branch condition
-// and constant-fold a branch in the hot path.
-void CHR::fixupBranch(Region *R, CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition,
- BranchProbability &CHRBranchBias) {
- bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
- assert((IsTrueBiased || Scope->FalseBiasedRegions.count(R)) &&
- "Must be truthy or falsy");
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- assert(BranchBiasMap.find(R) != BranchBiasMap.end() &&
- "Must be in the bias map");
- BranchProbability Bias = BranchBiasMap[R];
- assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
- // Take the min.
- if (CHRBranchBias > Bias)
- CHRBranchBias = Bias;
- BasicBlock *IfThen = BI->getSuccessor(1);
- BasicBlock *IfElse = BI->getSuccessor(0);
- BasicBlock *RegionExitBlock = R->getExit();
- assert(RegionExitBlock && "Null ExitBlock");
- assert((IfThen == RegionExitBlock || IfElse == RegionExitBlock) &&
- IfThen != IfElse && "Invariant from findScopes");
- if (IfThen == RegionExitBlock) {
- // Swap them so that IfThen means going into it and IfElse means skipping
- // it.
- std::swap(IfThen, IfElse);
- }
- CHR_DEBUG(dbgs() << "IfThen " << IfThen->getName()
- << " IfElse " << IfElse->getName() << "\n");
- Value *Cond = BI->getCondition();
- BasicBlock *HotTarget = IsTrueBiased ? IfThen : IfElse;
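- // ConditionTrue records whether reaching the hot target corresponds to the
- // original branch condition being true; it decides how Cond enters the merged
- // condition and which constant the branch is folded to.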
- bool ConditionTrue = HotTarget == BI->getSuccessor(0);
- addToMergedCondition(ConditionTrue, Cond, BI, Scope, IRB,
- MergedCondition);
- // Constant-fold the branch at ClonedEntryBlock.
- assert(ConditionTrue == (HotTarget == BI->getSuccessor(0)) &&
- "The successor shouldn't change");
- Value *NewCondition = ConditionTrue ?
- ConstantInt::getTrue(F.getContext()) :
- ConstantInt::getFalse(F.getContext());
- BI->setCondition(NewCondition);
-}
-
-// A helper for fixupBranchesAndSelects. Add to the combined branch condition
-// and constant-fold a select in the hot path.
-void CHR::fixupSelect(SelectInst *SI, CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition,
- BranchProbability &CHRBranchBias) {
- bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
- assert((IsTrueBiased ||
- Scope->FalseBiasedSelects.count(SI)) && "Must be biased");
- assert(SelectBiasMap.find(SI) != SelectBiasMap.end() &&
- "Must be in the bias map");
- BranchProbability Bias = SelectBiasMap[SI];
- assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
- // Take the min.
- if (CHRBranchBias > Bias)
- CHRBranchBias = Bias;
- Value *Cond = SI->getCondition();
- addToMergedCondition(IsTrueBiased, Cond, SI, Scope, IRB,
- MergedCondition);
- Value *NewCondition = IsTrueBiased ?
- ConstantInt::getTrue(F.getContext()) :
- ConstantInt::getFalse(F.getContext());
- SI->setCondition(NewCondition);
-}
-
-// A helper for fixupBranch/fixupSelect. Add a branch condition to the merged
-// condition.
-void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond,
- Instruction *BranchOrSelect,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition) {
- if (IsTrueBiased) {
- MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
- } else {
- // If Cond is an icmp and all of its users except for BranchOrSelect are
- // branches or selects, negate the icmp predicate and swap the branch/select
- // targets, which avoids inserting an xor to negate Cond.
- bool Done = false;
- if (auto *ICmp = dyn_cast<ICmpInst>(Cond))
- if (negateICmpIfUsedByBranchOrSelectOnly(ICmp, BranchOrSelect, Scope)) {
- MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
- Done = true;
- }
- if (!Done) {
- Value *Negate = IRB.CreateXor(
- ConstantInt::getTrue(F.getContext()), Cond);
- MergedCondition = IRB.CreateAnd(MergedCondition, Negate);
- }
- }
-}
-
-void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) {
- unsigned I = 0;
- DenseSet<PHINode *> TrivialPHIs;
- for (CHRScope *Scope : CHRScopes) {
- transformScopes(Scope, TrivialPHIs);
- CHR_DEBUG(
- std::ostringstream oss;
- oss << " after transformScopes " << I++;
- dumpIR(F, oss.str().c_str(), nullptr));
- (void)I;
- }
-}
-
-static void LLVM_ATTRIBUTE_UNUSED
-dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, const char *Label) {
- dbgs() << Label << " " << Scopes.size() << "\n";
- for (CHRScope *Scope : Scopes) {
- dbgs() << *Scope << "\n";
- }
-}
-
-bool CHR::run() {
- if (!shouldApply(F, PSI))
- return false;
-
- CHR_DEBUG(dumpIR(F, "before", nullptr));
-
- bool Changed = false;
- {
- CHR_DEBUG(
- dbgs() << "RegionInfo:\n";
- RI.print(dbgs()));
-
- // Recursively traverse the region tree and find regions that have biased
- // branches and/or selects and create scopes.
- SmallVector<CHRScope *, 8> AllScopes;
- findScopes(AllScopes);
- CHR_DEBUG(dumpScopes(AllScopes, "All scopes"));
-
- // Split the scopes if 1) the condition values of the biased
- // branches/selects of the inner/lower scope can't be hoisted up to the
- // outermost/uppermost scope entry, or 2) the condition values of the biased
- // branches/selects in a scope (including subscopes) don't share at least
- // one common value.
- SmallVector<CHRScope *, 8> SplitScopes;
- splitScopes(AllScopes, SplitScopes);
- CHR_DEBUG(dumpScopes(SplitScopes, "Split scopes"));
-
- // After splitting, set the biased regions and selects of a scope (a tree
- // root) that include those of the subscopes.
- classifyBiasedScopes(SplitScopes);
- CHR_DEBUG(dbgs() << "Set per-scope bias " << SplitScopes.size() << "\n");
-
- // Filter out the scopes that have only one biased region or select (CHR
- // isn't useful in such a case).
- SmallVector<CHRScope *, 8> FilteredScopes;
- filterScopes(SplitScopes, FilteredScopes);
- CHR_DEBUG(dumpScopes(FilteredScopes, "Filtered scopes"));
-
- // Set the regions to be CHR'ed and their hoist stops for each scope.
- SmallVector<CHRScope *, 8> SetScopes;
- setCHRRegions(FilteredScopes, SetScopes);
- CHR_DEBUG(dumpScopes(SetScopes, "Set CHR regions"));
-
- // Sort CHRScopes by depth so that outer CHRScopes come before inner
- // ones. We need to apply CHR from outer to inner so that we apply CHR only
- // to the hot path, rather than both hot and cold paths.
- SmallVector<CHRScope *, 8> SortedScopes;
- sortScopes(SetScopes, SortedScopes);
- CHR_DEBUG(dumpScopes(SortedScopes, "Sorted scopes"));
-
- CHR_DEBUG(
- dbgs() << "RegionInfo:\n";
- RI.print(dbgs()));
-
- // Apply the CHR transformation.
- if (!SortedScopes.empty()) {
- transformScopes(SortedScopes);
- Changed = true;
- }
- }
-
- if (Changed) {
- CHR_DEBUG(dumpIR(F, "after", &Stats));
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Stats", &F)
- << ore::NV("Function", &F) << " "
- << "Reduced the number of branches in hot paths by "
- << ore::NV("NumBranchesDelta", Stats.NumBranchesDelta)
- << " (static) and "
- << ore::NV("WeightedNumBranchesDelta", Stats.WeightedNumBranchesDelta)
- << " (weighted by PGO count)";
- });
- }
-
- return Changed;
-}
-
-bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) {
- BlockFrequencyInfo &BFI =
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ProfileSummaryInfo &PSI =
- getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE =
- std::make_unique<OptimizationRemarkEmitter>(&F);
- return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run();
-}
-
-namespace llvm {
-
-ControlHeightReductionPass::ControlHeightReductionPass() {
- parseCHRFilterFiles();
-}
-
-PreservedAnalyses ControlHeightReductionPass::run(
- Function &F,
- FunctionAnalysisManager &FAM) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto &PSI = *MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto &RI = FAM.getResult<RegionInfoAnalysis>(F);
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = CHR(F, BFI, DT, PSI, RI, ORE).run();
- if (!Changed)
- return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-} // namespace llvm
+ &ExitBlock->front());
+ for (BasicBlock *Pred : predecessors(ExitBlock)) {
+ PN->addIncoming(&I, Pred);
+ }
+ TrivialPHIs.insert(PN);
+ CHR_DEBUG(dbgs() << "Insert phi " << *PN << "\n");
+ for (Instruction *UI : Users) {
+ for (unsigned J = 0, NumOps = UI->getNumOperands(); J < NumOps; ++J) {
+ if (UI->getOperand(J) == &I) {
+ UI->setOperand(J, PN);
+ }
+ }
+ CHR_DEBUG(dbgs() << "Updated user " << *UI << "\n");
+ }
+ }
+ }
+ }
+}
+
+// Assert that all the CHR regions of the scope have a biased branch or select.
+static void LLVM_ATTRIBUTE_UNUSED
+assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) {
+#ifndef NDEBUG
+ auto HasBiasedBranchOrSelect = [](RegInfo &RI, CHRScope *Scope) {
+ if (Scope->TrueBiasedRegions.count(RI.R) ||
+ Scope->FalseBiasedRegions.count(RI.R))
+ return true;
+ for (SelectInst *SI : RI.Selects)
+ if (Scope->TrueBiasedSelects.count(SI) ||
+ Scope->FalseBiasedSelects.count(SI))
+ return true;
+ return false;
+ };
+ for (RegInfo &RI : Scope->CHRRegions) {
+ assert(HasBiasedBranchOrSelect(RI, Scope) &&
+ "Must have biased branch or select");
+ }
+#endif
+}
+
+// Assert that all the condition values of the biased branches and selects have
+// been hoisted to the pre-entry block or outside of the scope.
+static void LLVM_ATTRIBUTE_UNUSED assertBranchOrSelectConditionHoisted(
+ CHRScope *Scope, BasicBlock *PreEntryBlock) {
+ CHR_DEBUG(dbgs() << "Biased regions condition values \n");
+ for (RegInfo &RI : Scope->CHRRegions) {
+ Region *R = RI.R;
+ bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+ bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
+ if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ Value *V = BI->getCondition();
+ CHR_DEBUG(dbgs() << *V << "\n");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ (void)(I); // Unused in release build.
+ assert((I->getParent() == PreEntryBlock ||
+ !Scope->contains(I)) &&
+ "Must have been hoisted to PreEntryBlock or outside the scope");
+ }
+ }
+ for (SelectInst *SI : RI.Selects) {
+ bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+ bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
+ if (!(IsTrueBiased || IsFalseBiased))
+ continue;
+ Value *V = SI->getCondition();
+ CHR_DEBUG(dbgs() << *V << "\n");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ (void)(I); // Unused in release build.
+ assert((I->getParent() == PreEntryBlock ||
+ !Scope->contains(I)) &&
+ "Must have been hoisted to PreEntryBlock or outside the scope");
+ }
+ }
+ }
+}
+
+void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
+ CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");
+
+ assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");
+ Region *FirstRegion = Scope->RegInfos[0].R;
+ BasicBlock *EntryBlock = FirstRegion->getEntry();
+ Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;
+ BasicBlock *ExitBlock = LastRegion->getExit();
+ Optional<uint64_t> ProfileCount = BFI.getBlockProfileCount(EntryBlock);
+
+ if (ExitBlock) {
+    // Insert a trivial phi at the exit block (where the CHR hot path and the
+    // cold path merge) for each value that is defined in the scope but used
+    // outside of it (i.e. live at the exit block). The incoming values for the
+    // CHR cold paths are added to it below. Without this, we would miss
+    // updating the PHIs for such values unless a phi for that value already
+    // happens to be there.
+ insertTrivialPHIs(Scope, EntryBlock, ExitBlock, TrivialPHIs);
+ }
+
+  // Split the entry block of the first region. The new block becomes the new
+  // entry block of the first region, and the old entry block becomes the block
+  // into which the CHR branch is inserted. Note that DT is updated by the
+  // split and that we update the entry of the first region right after the
+  // split. Because a Region only points to its entry and exit blocks, rather
+  // than keeping every block in a list or set, the block membership and the
+  // entry/exit blocks of the region remain valid after the split.
+ CHR_DEBUG(dbgs() << "Splitting entry block " << EntryBlock->getName()
+ << " at " << *Scope->BranchInsertPoint << "\n");
+ BasicBlock *NewEntryBlock =
+ SplitBlock(EntryBlock, Scope->BranchInsertPoint, &DT);
+ assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+ "NewEntryBlock's only pred must be EntryBlock");
+ FirstRegion->replaceEntryRecursive(NewEntryBlock);
+ BasicBlock *PreEntryBlock = EntryBlock;
+
+ ValueToValueMapTy VMap;
+ // Clone the blocks in the scope (excluding the PreEntryBlock) to split into a
+ // hot path (originals) and a cold path (clones) and update the PHIs at the
+ // exit block.
+ cloneScopeBlocks(Scope, PreEntryBlock, ExitBlock, LastRegion, VMap);
+
+ // Replace the old (placeholder) branch with the new (merged) conditional
+ // branch.
+ BranchInst *MergedBr = createMergedBranch(PreEntryBlock, EntryBlock,
+ NewEntryBlock, VMap);
+
+#ifndef NDEBUG
+ assertCHRRegionsHaveBiasedBranchOrSelect(Scope);
+#endif
+
+ // Hoist the conditional values of the branches/selects.
+ hoistScopeConditions(Scope, PreEntryBlock->getTerminator(), TrivialPHIs, DT);
+
+#ifndef NDEBUG
+ assertBranchOrSelectConditionHoisted(Scope, PreEntryBlock);
+#endif
+
+ // Create the combined branch condition and constant-fold the branches/selects
+ // in the hot path.
+ fixupBranchesAndSelects(Scope, PreEntryBlock, MergedBr,
+ ProfileCount ? ProfileCount.getValue() : 0);
+}
+
+// A helper for transformScopes. Clone the blocks in the scope (excluding the
+// PreEntryBlock) to split into a hot path and a cold path and update the PHIs
+// at the exit block.
+void CHR::cloneScopeBlocks(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BasicBlock *ExitBlock,
+ Region *LastRegion,
+ ValueToValueMapTy &VMap) {
+ // Clone all the blocks. The original blocks will be the hot-path
+ // CHR-optimized code and the cloned blocks will be the original unoptimized
+  // code. This is so that the block pointers held by the
+  // CHRScope/Region/RegionInfo stay valid and keep pointing at the hot-path
+  // code to which CHR applies.
+ SmallVector<BasicBlock*, 8> NewBlocks;
+ for (RegInfo &RI : Scope->RegInfos)
+ for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
+ // sub-Scopes.
+      assert(BB != PreEntryBlock && "Don't copy the pre-entry block");
+ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F);
+ NewBlocks.push_back(NewBB);
+ VMap[BB] = NewBB;
+ }
+
+ // Place the cloned blocks right after the original blocks (right before the
+  // exit block).
+ if (ExitBlock)
+ F.getBasicBlockList().splice(ExitBlock->getIterator(),
+ F.getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F.end());
+
+ // Update the cloned blocks/instructions to refer to themselves.
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+ for (Instruction &I : *NewBlocks[i])
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+  // Add the cloned blocks as incoming blocks to the PHIs of the exit block.
+  // ExitBlock is null only for the top-level region, in which case no PHIs need
+  // updating. The trivial PHIs inserted above are updated here as well.
+ if (ExitBlock)
+ for (PHINode &PN : ExitBlock->phis())
+ for (unsigned I = 0, NumOps = PN.getNumIncomingValues(); I < NumOps;
+ ++I) {
+ BasicBlock *Pred = PN.getIncomingBlock(I);
+ if (LastRegion->contains(Pred)) {
+ Value *V = PN.getIncomingValue(I);
+ auto It = VMap.find(V);
+ if (It != VMap.end()) V = It->second;
+ assert(VMap.find(Pred) != VMap.end() && "Pred must have been cloned");
+ PN.addIncoming(V, cast<BasicBlock>(VMap[Pred]));
+ }
+ }
+}
+
+// A helper for transformScope. Replace the old (placeholder) branch with the
+// new (merged) conditional branch.
+BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
+ BasicBlock *EntryBlock,
+ BasicBlock *NewEntryBlock,
+ ValueToValueMapTy &VMap) {
+ BranchInst *OldBR = cast<BranchInst>(PreEntryBlock->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == NewEntryBlock &&
+ "SplitBlock did not work correctly!");
+ assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+ "NewEntryBlock's only pred must be EntryBlock");
+ assert(VMap.find(NewEntryBlock) != VMap.end() &&
+ "NewEntryBlock must have been copied");
+ OldBR->dropAllReferences();
+ OldBR->eraseFromParent();
+ // The true predicate is a placeholder. It will be replaced later in
+ // fixupBranchesAndSelects().
+ BranchInst *NewBR = BranchInst::Create(NewEntryBlock,
+ cast<BasicBlock>(VMap[NewEntryBlock]),
+ ConstantInt::getTrue(F.getContext()));
+ PreEntryBlock->getInstList().push_back(NewBR);
+ assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+ "NewEntryBlock's only pred must be EntryBlock");
+ return NewBR;
+}
+
+// A helper for transformScopes. Create the combined branch condition and
+// constant-fold the branches/selects in the hot path.
+void CHR::fixupBranchesAndSelects(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BranchInst *MergedBR,
+ uint64_t ProfileCount) {
+ Value *MergedCondition = ConstantInt::getTrue(F.getContext());
+ BranchProbability CHRBranchBias(1, 1);
+ uint64_t NumCHRedBranches = 0;
+ IRBuilder<> IRB(PreEntryBlock->getTerminator());
+ for (RegInfo &RI : Scope->CHRRegions) {
+ Region *R = RI.R;
+ if (RI.HasBranch) {
+ fixupBranch(R, Scope, IRB, MergedCondition, CHRBranchBias);
+ ++NumCHRedBranches;
+ }
+ for (SelectInst *SI : RI.Selects) {
+ fixupSelect(SI, Scope, IRB, MergedCondition, CHRBranchBias);
+ ++NumCHRedBranches;
+ }
+ }
+ Stats.NumBranchesDelta += NumCHRedBranches - 1;
+ Stats.WeightedNumBranchesDelta += (NumCHRedBranches - 1) * ProfileCount;
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE,
+ "CHR",
+ // Refer to the hot (original) path
+ MergedBR->getSuccessor(0)->getTerminator())
+ << "Merged " << ore::NV("NumCHRedBranches", NumCHRedBranches)
+ << " branches or selects";
+ });
+ MergedBR->setCondition(MergedCondition);
+ uint32_t Weights[] = {
+ static_cast<uint32_t>(CHRBranchBias.scale(1000)),
+ static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)),
+ };
+ MDBuilder MDB(F.getContext());
+ MergedBR->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1]
+ << "\n");
+}
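
As a side note on the !prof weights attached just above: they are the merged
branch bias and its complement, each scaled into a 0-1000 range. The standalone
sketch below mirrors that computation with a hand-rolled stand-in for
llvm::BranchProbability; the SimpleProb type and the 99/100 bias are
illustrative assumptions, not LLVM API.

    #include <cstdint>
    #include <cstdio>

    // Minimal stand-in for llvm::BranchProbability, only to show how the
    // branch weights are derived from CHRBranchBias.
    struct SimpleProb {
      uint64_t Num, Den;
      uint64_t scale(uint64_t N) const { return N * Num / Den; }
      SimpleProb getCompl() const { return {Den - Num, Den}; }
    };

    int main() {
      SimpleProb CHRBranchBias{99, 100}; // e.g. a 99%-biased merged branch
      uint32_t Weights[] = {
          static_cast<uint32_t>(CHRBranchBias.scale(1000)),            // 990
          static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)), // 10
      };
      std::printf("CHR branch bias %u:%u\n", Weights[0], Weights[1]);
      return 0;
    }
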
+
+// A helper for fixupBranchesAndSelects. Add to the combined branch condition
+// and constant-fold a branch in the hot path.
+void CHR::fixupBranch(Region *R, CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition,
+ BranchProbability &CHRBranchBias) {
+ bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+ assert((IsTrueBiased || Scope->FalseBiasedRegions.count(R)) &&
+ "Must be truthy or falsy");
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ assert(BranchBiasMap.find(R) != BranchBiasMap.end() &&
+ "Must be in the bias map");
+ BranchProbability Bias = BranchBiasMap[R];
+ assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
+ // Take the min.
+ if (CHRBranchBias > Bias)
+ CHRBranchBias = Bias;
+ BasicBlock *IfThen = BI->getSuccessor(1);
+ BasicBlock *IfElse = BI->getSuccessor(0);
+ BasicBlock *RegionExitBlock = R->getExit();
+ assert(RegionExitBlock && "Null ExitBlock");
+ assert((IfThen == RegionExitBlock || IfElse == RegionExitBlock) &&
+ IfThen != IfElse && "Invariant from findScopes");
+ if (IfThen == RegionExitBlock) {
+ // Swap them so that IfThen means going into it and IfElse means skipping
+ // it.
+ std::swap(IfThen, IfElse);
+ }
+ CHR_DEBUG(dbgs() << "IfThen " << IfThen->getName()
+ << " IfElse " << IfElse->getName() << "\n");
+ Value *Cond = BI->getCondition();
+ BasicBlock *HotTarget = IsTrueBiased ? IfThen : IfElse;
+ bool ConditionTrue = HotTarget == BI->getSuccessor(0);
+ addToMergedCondition(ConditionTrue, Cond, BI, Scope, IRB,
+ MergedCondition);
+ // Constant-fold the branch at ClonedEntryBlock.
+ assert(ConditionTrue == (HotTarget == BI->getSuccessor(0)) &&
+ "The successor shouldn't change");
+ Value *NewCondition = ConditionTrue ?
+ ConstantInt::getTrue(F.getContext()) :
+ ConstantInt::getFalse(F.getContext());
+ BI->setCondition(NewCondition);
+}
+
+// A helper for fixupBranchesAndSelects. Add to the combined branch condition
+// and constant-fold a select in the hot path.
+void CHR::fixupSelect(SelectInst *SI, CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition,
+ BranchProbability &CHRBranchBias) {
+ bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+ assert((IsTrueBiased ||
+ Scope->FalseBiasedSelects.count(SI)) && "Must be biased");
+ assert(SelectBiasMap.find(SI) != SelectBiasMap.end() &&
+ "Must be in the bias map");
+ BranchProbability Bias = SelectBiasMap[SI];
+ assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
+ // Take the min.
+ if (CHRBranchBias > Bias)
+ CHRBranchBias = Bias;
+ Value *Cond = SI->getCondition();
+ addToMergedCondition(IsTrueBiased, Cond, SI, Scope, IRB,
+ MergedCondition);
+ Value *NewCondition = IsTrueBiased ?
+ ConstantInt::getTrue(F.getContext()) :
+ ConstantInt::getFalse(F.getContext());
+ SI->setCondition(NewCondition);
+}
+
+// A helper for fixupBranch/fixupSelect. Add a branch condition to the merged
+// condition.
+void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond,
+ Instruction *BranchOrSelect,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition) {
+ if (IsTrueBiased) {
+ MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
+ } else {
+    // If Cond is an icmp and all of its users other than BranchOrSelect are
+    // branches, negate the icmp predicate and swap the branch targets, which
+    // avoids inserting an xor to negate Cond.
+ bool Done = false;
+ if (auto *ICmp = dyn_cast<ICmpInst>(Cond))
+ if (negateICmpIfUsedByBranchOrSelectOnly(ICmp, BranchOrSelect, Scope)) {
+ MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
+ Done = true;
+ }
+ if (!Done) {
+ Value *Negate = IRB.CreateXor(
+ ConstantInt::getTrue(F.getContext()), Cond);
+ MergedCondition = IRB.CreateAnd(MergedCondition, Negate);
+ }
+ }
+}
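
To make the merging rule concrete: a true-biased condition is ANDed into the
running value directly, while a false-biased condition that cannot be negated
in place is first inverted with an xor against true. The self-contained sketch
below evaluates the same boolean algebra on plain bools; it only illustrates
the semantics, not the IR that the IRBuilder calls emit.

    #include <cassert>

    // Plain bools stand in for the i1 values built by the IRBuilder calls above.
    static bool mergedCondition(bool TrueBiasedCond, bool FalseBiasedCond) {
      bool Merged = true;                   // ConstantInt::getTrue placeholder
      Merged = Merged && TrueBiasedCond;    // IRB.CreateAnd(Merged, Cond)
      bool Negate = true ^ FalseBiasedCond; // IRB.CreateXor(true, Cond)
      return Merged && Negate;              // IRB.CreateAnd(Merged, Negate)
    }

    int main() {
      // The hot path is taken only when the true-biased condition holds and
      // the false-biased condition does not.
      assert(mergedCondition(true, false));
      assert(!mergedCondition(true, true));
      assert(!mergedCondition(false, false));
      return 0;
    }
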
+
+void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) {
+ unsigned I = 0;
+ DenseSet<PHINode *> TrivialPHIs;
+ for (CHRScope *Scope : CHRScopes) {
+ transformScopes(Scope, TrivialPHIs);
+ CHR_DEBUG(
+ std::ostringstream oss;
+ oss << " after transformScopes " << I++;
+ dumpIR(F, oss.str().c_str(), nullptr));
+ (void)I;
+ }
+}
+
+static void LLVM_ATTRIBUTE_UNUSED
+dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, const char *Label) {
+ dbgs() << Label << " " << Scopes.size() << "\n";
+ for (CHRScope *Scope : Scopes) {
+ dbgs() << *Scope << "\n";
+ }
+}
+
+bool CHR::run() {
+ if (!shouldApply(F, PSI))
+ return false;
+
+ CHR_DEBUG(dumpIR(F, "before", nullptr));
+
+ bool Changed = false;
+ {
+ CHR_DEBUG(
+ dbgs() << "RegionInfo:\n";
+ RI.print(dbgs()));
+
+ // Recursively traverse the region tree and find regions that have biased
+ // branches and/or selects and create scopes.
+ SmallVector<CHRScope *, 8> AllScopes;
+ findScopes(AllScopes);
+ CHR_DEBUG(dumpScopes(AllScopes, "All scopes"));
+
+    // Split the scopes if 1) the condition values of the biased
+ // branches/selects of the inner/lower scope can't be hoisted up to the
+ // outermost/uppermost scope entry, or 2) the condition values of the biased
+ // branches/selects in a scope (including subscopes) don't share at least
+ // one common value.
+ SmallVector<CHRScope *, 8> SplitScopes;
+ splitScopes(AllScopes, SplitScopes);
+ CHR_DEBUG(dumpScopes(SplitScopes, "Split scopes"));
+
+    // After splitting, for each scope (a tree root), set its biased regions and
+    // selects to include those of its subscopes.
+ classifyBiasedScopes(SplitScopes);
+ CHR_DEBUG(dbgs() << "Set per-scope bias " << SplitScopes.size() << "\n");
+
+    // Filter out the scopes that have only one biased region or select (CHR
+ // isn't useful in such a case).
+ SmallVector<CHRScope *, 8> FilteredScopes;
+ filterScopes(SplitScopes, FilteredScopes);
+ CHR_DEBUG(dumpScopes(FilteredScopes, "Filtered scopes"));
+
+ // Set the regions to be CHR'ed and their hoist stops for each scope.
+ SmallVector<CHRScope *, 8> SetScopes;
+ setCHRRegions(FilteredScopes, SetScopes);
+ CHR_DEBUG(dumpScopes(SetScopes, "Set CHR regions"));
+
+    // Sort CHRScopes by depth so that outer CHRScopes come before inner
+ // ones. We need to apply CHR from outer to inner so that we apply CHR only
+ // to the hot path, rather than both hot and cold paths.
+ SmallVector<CHRScope *, 8> SortedScopes;
+ sortScopes(SetScopes, SortedScopes);
+ CHR_DEBUG(dumpScopes(SortedScopes, "Sorted scopes"));
+
+ CHR_DEBUG(
+ dbgs() << "RegionInfo:\n";
+ RI.print(dbgs()));
+
+ // Apply the CHR transformation.
+ if (!SortedScopes.empty()) {
+ transformScopes(SortedScopes);
+ Changed = true;
+ }
+ }
+
+ if (Changed) {
+ CHR_DEBUG(dumpIR(F, "after", &Stats));
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Stats", &F)
+ << ore::NV("Function", &F) << " "
+ << "Reduced the number of branches in hot paths by "
+ << ore::NV("NumBranchesDelta", Stats.NumBranchesDelta)
+ << " (static) and "
+ << ore::NV("WeightedNumBranchesDelta", Stats.WeightedNumBranchesDelta)
+ << " (weighted by PGO count)";
+ });
+ }
+
+ return Changed;
+}
+
+bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ProfileSummaryInfo &PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
+ std::unique_ptr<OptimizationRemarkEmitter> OwnedORE =
+ std::make_unique<OptimizationRemarkEmitter>(&F);
+ return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run();
+}
+
+namespace llvm {
+
+ControlHeightReductionPass::ControlHeightReductionPass() {
+ parseCHRFilterFiles();
+}
+
+PreservedAnalyses ControlHeightReductionPass::run(
+ Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto &PSI = *MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto &RI = FAM.getResult<RegionInfoAnalysis>(F);
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ bool Changed = CHR(F, BFI, DT, PSI, RI, ORE).run();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index ebd7a997dd..1b14b8d569 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1,111 +1,111 @@
-//===- DataFlowSanitizer.cpp - dynamic data flow analysis -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
-/// analysis.
-///
-/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
-/// class of bugs on its own. Instead, it provides a generic dynamic data flow
-/// analysis framework to be used by clients to help detect application-specific
-/// issues within their own code.
-///
-/// The analysis is based on automatic propagation of data flow labels (also
-/// known as taint labels) through a program as it performs computation. Each
-/// byte of application memory is backed by two bytes of shadow memory which
-/// hold the label. On Linux/x86_64, memory is laid out as follows:
-///
-/// +--------------------+ 0x800000000000 (top of memory)
-/// | application memory |
-/// +--------------------+ 0x700000008000 (kAppAddr)
-/// | |
-/// | unused |
-/// | |
-/// +--------------------+ 0x200200000000 (kUnusedAddr)
-/// | union table |
-/// +--------------------+ 0x200000000000 (kUnionTableAddr)
-/// | shadow memory |
-/// +--------------------+ 0x000000010000 (kShadowAddr)
-/// | reserved by kernel |
-/// +--------------------+ 0x000000000000
-///
-/// To derive a shadow memory address from an application memory address,
-/// bits 44-46 are cleared to bring the address into the range
-/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to
-/// account for the double byte representation of shadow labels and move the
-/// address into the shadow memory range. See the function
-/// DataFlowSanitizer::getShadowAddress below.
-///
-/// For more information, please refer to the design document:
-/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
-//
-//===----------------------------------------------------------------------===//
-
+//===- DataFlowSanitizer.cpp - dynamic data flow analysis -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
+/// analysis.
+///
+/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
+/// class of bugs on its own. Instead, it provides a generic dynamic data flow
+/// analysis framework to be used by clients to help detect application-specific
+/// issues within their own code.
+///
+/// The analysis is based on automatic propagation of data flow labels (also
+/// known as taint labels) through a program as it performs computation. Each
+/// byte of application memory is backed by two bytes of shadow memory which
+/// hold the label. On Linux/x86_64, memory is laid out as follows:
+///
+/// +--------------------+ 0x800000000000 (top of memory)
+/// | application memory |
+/// +--------------------+ 0x700000008000 (kAppAddr)
+/// | |
+/// | unused |
+/// | |
+/// +--------------------+ 0x200200000000 (kUnusedAddr)
+/// | union table |
+/// +--------------------+ 0x200000000000 (kUnionTableAddr)
+/// | shadow memory |
+/// +--------------------+ 0x000000010000 (kShadowAddr)
+/// | reserved by kernel |
+/// +--------------------+ 0x000000000000
+///
+/// To derive a shadow memory address from an application memory address,
+/// bits 44-46 are cleared to bring the address into the range
+/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to
+/// account for the double byte representation of shadow labels and move the
+/// address into the shadow memory range. See the function
+/// DataFlowSanitizer::getShadowAddress below.
+///
+/// For more information, please refer to the design document:
+/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
+//
+//===----------------------------------------------------------------------===//
+
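
As a concrete illustration of the address arithmetic described in the comment
above (clear bits 44-46, then shift left by one for the two-byte labels), here
is a self-contained sketch using plain integer operations. The function name
and the check in main are made up for this example; the pass itself emits this
arithmetic as IR in DataFlowSanitizer::getShadowAddress using the ShadowPtrMask
and ShadowPtrMul fields defined further down.

    #include <cassert>
    #include <cstdint>

    // Mirrors the Linux/x86_64 mapping from the layout comment: mask off bits
    // 44-46, then multiply by two shadow bytes per application byte.
    static uint64_t exampleShadowAddress(uint64_t AppAddr) {
      const uint64_t Mask = ~0x700000000000ULL; // clears bits 44-46
      return (AppAddr & Mask) << 1;             // ShadowWidthBytes == 2
    }

    int main() {
      // kAppAddr (0x700000008000) maps to kShadowAddr (0x000000010000).
      assert(exampleShadowAddress(0x700000008000ULL) == 0x000000010000ULL);
      return 0;
    }
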
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/SpecialCaseList.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
// This must be consistent with ShadowWidthBits.
static const Align kShadowTLSAlignment = Align(2);
@@ -114,78 +114,78 @@ static const Align kShadowTLSAlignment = Align(2);
static const unsigned kArgTLSSize = 800;
static const unsigned kRetvalTLSSize = 800;
-// External symbol to be used when generating the shadow address for
-// architectures with multiple VMAs. Instead of using a constant integer
-// the runtime will set the external mask based on the VMA range.
+// External symbol to be used when generating the shadow address for
+// architectures with multiple VMAs. Instead of using a constant integer
+// the runtime will set the external mask based on the VMA range.
const char kDFSanExternShadowPtrMask[] = "__dfsan_shadow_ptr_mask";
-
-// The -dfsan-preserve-alignment flag controls whether this pass assumes that
-// alignment requirements provided by the input IR are correct. For example,
-// if the input IR contains a load with alignment 8, this flag will cause
-// the shadow load to have alignment 16. This flag is disabled by default as
-// we have unfortunately encountered too much code (including Clang itself;
-// see PR14291) which performs misaligned access.
-static cl::opt<bool> ClPreserveAlignment(
- "dfsan-preserve-alignment",
- cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
- cl::init(false));
-
-// The ABI list files control how shadow parameters are passed. The pass treats
-// every function labelled "uninstrumented" in the ABI list file as conforming
-// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
-// additional annotations for those functions, a call to one of those functions
-// will produce a warning message, as the labelling behaviour of the function is
-// unknown. The other supported annotations are "functional" and "discard",
-// which are described below under DataFlowSanitizer::WrapperKind.
-static cl::list<std::string> ClABIListFiles(
- "dfsan-abilist",
- cl::desc("File listing native ABI functions and how the pass treats them"),
- cl::Hidden);
-
-// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
-// functions (see DataFlowSanitizer::InstrumentedABI below).
-static cl::opt<bool> ClArgsABI(
- "dfsan-args-abi",
- cl::desc("Use the argument ABI rather than the TLS ABI"),
- cl::Hidden);
-
-// Controls whether the pass includes or ignores the labels of pointers in load
-// instructions.
-static cl::opt<bool> ClCombinePointerLabelsOnLoad(
- "dfsan-combine-pointer-labels-on-load",
- cl::desc("Combine the label of the pointer with the label of the data when "
- "loading from memory."),
- cl::Hidden, cl::init(true));
-
-// Controls whether the pass includes or ignores the labels of pointers in
-// store instructions.
-static cl::opt<bool> ClCombinePointerLabelsOnStore(
- "dfsan-combine-pointer-labels-on-store",
- cl::desc("Combine the label of the pointer with the label of the data when "
- "storing in memory."),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDebugNonzeroLabels(
- "dfsan-debug-nonzero-labels",
- cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
- "load or return with a nonzero label"),
- cl::Hidden);
-
-// Experimental feature that inserts callbacks for certain data events.
-// Currently callbacks are only inserted for loads, stores, memory transfers
-// (i.e. memcpy and memmove), and comparisons.
-//
-// If this flag is set to true, the user must provide definitions for the
-// following callback functions:
+
+// The -dfsan-preserve-alignment flag controls whether this pass assumes that
+// alignment requirements provided by the input IR are correct. For example,
+// if the input IR contains a load with alignment 8, this flag will cause
+// the shadow load to have alignment 16. This flag is disabled by default as
+// we have unfortunately encountered too much code (including Clang itself;
+// see PR14291) which performs misaligned access.
+static cl::opt<bool> ClPreserveAlignment(
+ "dfsan-preserve-alignment",
+ cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
+ cl::init(false));
+
+// The ABI list files control how shadow parameters are passed. The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function is
+// unknown. The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::list<std::string> ClABIListFiles(
+ "dfsan-abilist",
+ cl::desc("File listing native ABI functions and how the pass treats them"),
+ cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+ "dfsan-args-abi",
+ cl::desc("Use the argument ABI rather than the TLS ABI"),
+ cl::Hidden);
+
+// Controls whether the pass includes or ignores the labels of pointers in load
+// instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnLoad(
+ "dfsan-combine-pointer-labels-on-load",
+ cl::desc("Combine the label of the pointer with the label of the data when "
+ "loading from memory."),
+ cl::Hidden, cl::init(true));
+
+// Controls whether the pass includes or ignores the labels of pointers in
+// store instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnStore(
+ "dfsan-combine-pointer-labels-on-store",
+ cl::desc("Combine the label of the pointer with the label of the data when "
+ "storing in memory."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+ "dfsan-debug-nonzero-labels",
+ cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+ "load or return with a nonzero label"),
+ cl::Hidden);
+
+// Experimental feature that inserts callbacks for certain data events.
+// Currently callbacks are only inserted for loads, stores, memory transfers
+// (i.e. memcpy and memmove), and comparisons.
+//
+// If this flag is set to true, the user must provide definitions for the
+// following callback functions:
// void __dfsan_load_callback(dfsan_label Label, void* addr);
// void __dfsan_store_callback(dfsan_label Label, void* addr);
-// void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len);
-// void __dfsan_cmp_callback(dfsan_label CombinedLabel);
-static cl::opt<bool> ClEventCallbacks(
- "dfsan-event-callbacks",
- cl::desc("Insert calls to __dfsan_*_callback functions on data events."),
- cl::Hidden, cl::init(false));
-
+// void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len);
+// void __dfsan_cmp_callback(dfsan_label CombinedLabel);
+static cl::opt<bool> ClEventCallbacks(
+ "dfsan-event-callbacks",
+ cl::desc("Insert calls to __dfsan_*_callback functions on data events."),
+ cl::Hidden, cl::init(false));
+
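
For readers wiring up -dfsan-event-callbacks, a minimal sketch of user-provided
callbacks matching the signatures listed in the comment above could look like
the following. Treating dfsan_label as a 16-bit integer is an assumption based
on ShadowWidthBits being 16 in this file; the authoritative typedef lives in
the sanitizer runtime headers.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Assumed 16-bit label type (matches ShadowWidthBits = 16 in this version).
    typedef uint16_t dfsan_label;

    extern "C" void __dfsan_load_callback(dfsan_label Label, void *Addr) {
      if (Label)
        std::fprintf(stderr, "tainted load at %p (label %u)\n", Addr,
                     static_cast<unsigned>(Label));
    }

    extern "C" void __dfsan_store_callback(dfsan_label Label, void *Addr) {
      if (Label)
        std::fprintf(stderr, "tainted store at %p (label %u)\n", Addr,
                     static_cast<unsigned>(Label));
    }

    extern "C" void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len) {
      (void)Start; // shadow range being transferred; no-op in this sketch
      (void)Len;
    }

    extern "C" void __dfsan_cmp_callback(dfsan_label CombinedLabel) {
      (void)CombinedLabel; // combined label of the compared operands; no-op here
    }
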
// Use a distinct bit for each base label, enabling faster unions with less
// instrumentation. Limits the max number of base labels to 16.
static cl::opt<bool> ClFast16Labels(
@@ -201,220 +201,220 @@ static cl::opt<bool> ClTrackSelectControlFlow(
"to results."),
cl::Hidden, cl::init(true));
-static StringRef GetGlobalTypeString(const GlobalValue &G) {
- // Types of GlobalVariables are always pointer types.
- Type *GType = G.getValueType();
- // For now we support excluding struct types only.
- if (StructType *SGType = dyn_cast<StructType>(GType)) {
- if (!SGType->isLiteral())
- return SGType->getName();
- }
- return "<unknown type>";
-}
-
-namespace {
-
-class DFSanABIList {
- std::unique_ptr<SpecialCaseList> SCL;
-
- public:
- DFSanABIList() = default;
-
- void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
-
- /// Returns whether either this function or its source file are listed in the
- /// given category.
- bool isIn(const Function &F, StringRef Category) const {
- return isIn(*F.getParent(), Category) ||
- SCL->inSection("dataflow", "fun", F.getName(), Category);
- }
-
- /// Returns whether this global alias is listed in the given category.
- ///
- /// If GA aliases a function, the alias's name is matched as a function name
- /// would be. Similarly, aliases of globals are matched like globals.
- bool isIn(const GlobalAlias &GA, StringRef Category) const {
- if (isIn(*GA.getParent(), Category))
- return true;
-
- if (isa<FunctionType>(GA.getValueType()))
- return SCL->inSection("dataflow", "fun", GA.getName(), Category);
-
- return SCL->inSection("dataflow", "global", GA.getName(), Category) ||
- SCL->inSection("dataflow", "type", GetGlobalTypeString(GA),
- Category);
- }
-
- /// Returns whether this module is listed in the given category.
- bool isIn(const Module &M, StringRef Category) const {
- return SCL->inSection("dataflow", "src", M.getModuleIdentifier(), Category);
- }
-};
-
-/// TransformedFunction is used to express the result of transforming one
-/// function type into another. This struct is immutable. It holds metadata
-/// useful for updating calls of the old function to the new type.
-struct TransformedFunction {
- TransformedFunction(FunctionType* OriginalType,
- FunctionType* TransformedType,
- std::vector<unsigned> ArgumentIndexMapping)
- : OriginalType(OriginalType),
- TransformedType(TransformedType),
- ArgumentIndexMapping(ArgumentIndexMapping) {}
-
- // Disallow copies.
- TransformedFunction(const TransformedFunction&) = delete;
- TransformedFunction& operator=(const TransformedFunction&) = delete;
-
- // Allow moves.
- TransformedFunction(TransformedFunction&&) = default;
- TransformedFunction& operator=(TransformedFunction&&) = default;
-
- /// Type of the function before the transformation.
- FunctionType *OriginalType;
-
- /// Type of the function after the transformation.
- FunctionType *TransformedType;
-
- /// Transforming a function may change the position of arguments. This
- /// member records the mapping from each argument's old position to its new
- /// position. Argument positions are zero-indexed. If the transformation
- /// from F to F' made the first argument of F into the third argument of F',
- /// then ArgumentIndexMapping[0] will equal 2.
- std::vector<unsigned> ArgumentIndexMapping;
-};
-
-/// Given function attributes from a call site for the original function,
-/// return function attributes appropriate for a call to the transformed
-/// function.
-AttributeList TransformFunctionAttributes(
- const TransformedFunction& TransformedFunction,
- LLVMContext& Ctx, AttributeList CallSiteAttrs) {
-
- // Construct a vector of AttributeSet for each function argument.
- std::vector<llvm::AttributeSet> ArgumentAttributes(
- TransformedFunction.TransformedType->getNumParams());
-
- // Copy attributes from the parameter of the original function to the
- // transformed version. 'ArgumentIndexMapping' holds the mapping from
- // old argument position to new.
- for (unsigned i=0, ie = TransformedFunction.ArgumentIndexMapping.size();
- i < ie; ++i) {
- unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[i];
- ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(i);
- }
-
- // Copy annotations on varargs arguments.
- for (unsigned i = TransformedFunction.OriginalType->getNumParams(),
- ie = CallSiteAttrs.getNumAttrSets(); i<ie; ++i) {
- ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(i));
- }
-
- return AttributeList::get(
- Ctx,
- CallSiteAttrs.getFnAttributes(),
- CallSiteAttrs.getRetAttributes(),
- llvm::makeArrayRef(ArgumentAttributes));
-}
-
+static StringRef GetGlobalTypeString(const GlobalValue &G) {
+ // Types of GlobalVariables are always pointer types.
+ Type *GType = G.getValueType();
+ // For now we support excluding struct types only.
+ if (StructType *SGType = dyn_cast<StructType>(GType)) {
+ if (!SGType->isLiteral())
+ return SGType->getName();
+ }
+ return "<unknown type>";
+}
+
+namespace {
+
+class DFSanABIList {
+ std::unique_ptr<SpecialCaseList> SCL;
+
+ public:
+ DFSanABIList() = default;
+
+ void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
+
+ /// Returns whether either this function or its source file are listed in the
+ /// given category.
+ bool isIn(const Function &F, StringRef Category) const {
+ return isIn(*F.getParent(), Category) ||
+ SCL->inSection("dataflow", "fun", F.getName(), Category);
+ }
+
+ /// Returns whether this global alias is listed in the given category.
+ ///
+ /// If GA aliases a function, the alias's name is matched as a function name
+ /// would be. Similarly, aliases of globals are matched like globals.
+ bool isIn(const GlobalAlias &GA, StringRef Category) const {
+ if (isIn(*GA.getParent(), Category))
+ return true;
+
+ if (isa<FunctionType>(GA.getValueType()))
+ return SCL->inSection("dataflow", "fun", GA.getName(), Category);
+
+ return SCL->inSection("dataflow", "global", GA.getName(), Category) ||
+ SCL->inSection("dataflow", "type", GetGlobalTypeString(GA),
+ Category);
+ }
+
+ /// Returns whether this module is listed in the given category.
+ bool isIn(const Module &M, StringRef Category) const {
+ return SCL->inSection("dataflow", "src", M.getModuleIdentifier(), Category);
+ }
+};
+
+/// TransformedFunction is used to express the result of transforming one
+/// function type into another. This struct is immutable. It holds metadata
+/// useful for updating calls of the old function to the new type.
+struct TransformedFunction {
+ TransformedFunction(FunctionType* OriginalType,
+ FunctionType* TransformedType,
+ std::vector<unsigned> ArgumentIndexMapping)
+ : OriginalType(OriginalType),
+ TransformedType(TransformedType),
+ ArgumentIndexMapping(ArgumentIndexMapping) {}
+
+ // Disallow copies.
+ TransformedFunction(const TransformedFunction&) = delete;
+ TransformedFunction& operator=(const TransformedFunction&) = delete;
+
+ // Allow moves.
+ TransformedFunction(TransformedFunction&&) = default;
+ TransformedFunction& operator=(TransformedFunction&&) = default;
+
+ /// Type of the function before the transformation.
+ FunctionType *OriginalType;
+
+ /// Type of the function after the transformation.
+ FunctionType *TransformedType;
+
+ /// Transforming a function may change the position of arguments. This
+ /// member records the mapping from each argument's old position to its new
+ /// position. Argument positions are zero-indexed. If the transformation
+ /// from F to F' made the first argument of F into the third argument of F',
+ /// then ArgumentIndexMapping[0] will equal 2.
+ std::vector<unsigned> ArgumentIndexMapping;
+};
+
+/// Given function attributes from a call site for the original function,
+/// return function attributes appropriate for a call to the transformed
+/// function.
+AttributeList TransformFunctionAttributes(
+ const TransformedFunction& TransformedFunction,
+ LLVMContext& Ctx, AttributeList CallSiteAttrs) {
+
+ // Construct a vector of AttributeSet for each function argument.
+ std::vector<llvm::AttributeSet> ArgumentAttributes(
+ TransformedFunction.TransformedType->getNumParams());
+
+ // Copy attributes from the parameter of the original function to the
+ // transformed version. 'ArgumentIndexMapping' holds the mapping from
+ // old argument position to new.
+ for (unsigned i=0, ie = TransformedFunction.ArgumentIndexMapping.size();
+ i < ie; ++i) {
+ unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[i];
+ ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(i);
+ }
+
+ // Copy annotations on varargs arguments.
+ for (unsigned i = TransformedFunction.OriginalType->getNumParams(),
+ ie = CallSiteAttrs.getNumAttrSets(); i<ie; ++i) {
+ ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(i));
+ }
+
+ return AttributeList::get(
+ Ctx,
+ CallSiteAttrs.getFnAttributes(),
+ CallSiteAttrs.getRetAttributes(),
+ llvm::makeArrayRef(ArgumentAttributes));
+}
+
class DataFlowSanitizer {
- friend struct DFSanFunction;
- friend class DFSanVisitor;
-
- enum { ShadowWidthBits = 16, ShadowWidthBytes = ShadowWidthBits / 8 };
-
- /// Which ABI should be used for instrumented functions?
- enum InstrumentedABI {
- /// Argument and return value labels are passed through additional
- /// arguments and by modifying the return type.
- IA_Args,
-
- /// Argument and return value labels are passed through TLS variables
- /// __dfsan_arg_tls and __dfsan_retval_tls.
- IA_TLS
- };
-
- /// How should calls to uninstrumented functions be handled?
- enum WrapperKind {
- /// This function is present in an uninstrumented form but we don't know
- /// how it should be handled. Print a warning and call the function anyway.
- /// Don't label the return value.
- WK_Warning,
-
- /// This function does not write to (user-accessible) memory, and its return
- /// value is unlabelled.
- WK_Discard,
-
- /// This function does not write to (user-accessible) memory, and the label
- /// of its return value is the union of the label of its arguments.
- WK_Functional,
-
- /// Instead of calling the function, a custom wrapper __dfsw_F is called,
- /// where F is the name of the function. This function may wrap the
- /// original function or provide its own implementation. This is similar to
- /// the IA_Args ABI, except that IA_Args uses a struct return type to
- /// pass the return value shadow in a register, while WK_Custom uses an
- /// extra pointer argument to return the shadow. This allows the wrapped
- /// form of the function type to be expressed in C.
- WK_Custom
- };
-
- Module *Mod;
- LLVMContext *Ctx;
+ friend struct DFSanFunction;
+ friend class DFSanVisitor;
+
+ enum { ShadowWidthBits = 16, ShadowWidthBytes = ShadowWidthBits / 8 };
+
+ /// Which ABI should be used for instrumented functions?
+ enum InstrumentedABI {
+ /// Argument and return value labels are passed through additional
+ /// arguments and by modifying the return type.
+ IA_Args,
+
+ /// Argument and return value labels are passed through TLS variables
+ /// __dfsan_arg_tls and __dfsan_retval_tls.
+ IA_TLS
+ };
+
+ /// How should calls to uninstrumented functions be handled?
+ enum WrapperKind {
+ /// This function is present in an uninstrumented form but we don't know
+ /// how it should be handled. Print a warning and call the function anyway.
+ /// Don't label the return value.
+ WK_Warning,
+
+ /// This function does not write to (user-accessible) memory, and its return
+ /// value is unlabelled.
+ WK_Discard,
+
+ /// This function does not write to (user-accessible) memory, and the label
+ /// of its return value is the union of the label of its arguments.
+ WK_Functional,
+
+ /// Instead of calling the function, a custom wrapper __dfsw_F is called,
+ /// where F is the name of the function. This function may wrap the
+ /// original function or provide its own implementation. This is similar to
+ /// the IA_Args ABI, except that IA_Args uses a struct return type to
+ /// pass the return value shadow in a register, while WK_Custom uses an
+ /// extra pointer argument to return the shadow. This allows the wrapped
+ /// form of the function type to be expressed in C.
+ WK_Custom
+ };
+
+ Module *Mod;
+ LLVMContext *Ctx;
Type *Int8Ptr;
/// The shadow type for all primitive types and vector types.
IntegerType *PrimitiveShadowTy;
PointerType *PrimitiveShadowPtrTy;
- IntegerType *IntptrTy;
+ IntegerType *IntptrTy;
ConstantInt *ZeroPrimitiveShadow;
- ConstantInt *ShadowPtrMask;
- ConstantInt *ShadowPtrMul;
- Constant *ArgTLS;
- Constant *RetvalTLS;
- Constant *ExternalShadowMask;
- FunctionType *DFSanUnionFnTy;
- FunctionType *DFSanUnionLoadFnTy;
- FunctionType *DFSanUnimplementedFnTy;
- FunctionType *DFSanSetLabelFnTy;
- FunctionType *DFSanNonzeroLabelFnTy;
- FunctionType *DFSanVarargWrapperFnTy;
+ ConstantInt *ShadowPtrMask;
+ ConstantInt *ShadowPtrMul;
+ Constant *ArgTLS;
+ Constant *RetvalTLS;
+ Constant *ExternalShadowMask;
+ FunctionType *DFSanUnionFnTy;
+ FunctionType *DFSanUnionLoadFnTy;
+ FunctionType *DFSanUnimplementedFnTy;
+ FunctionType *DFSanSetLabelFnTy;
+ FunctionType *DFSanNonzeroLabelFnTy;
+ FunctionType *DFSanVarargWrapperFnTy;
FunctionType *DFSanCmpCallbackFnTy;
FunctionType *DFSanLoadStoreCallbackFnTy;
- FunctionType *DFSanMemTransferCallbackFnTy;
- FunctionCallee DFSanUnionFn;
- FunctionCallee DFSanCheckedUnionFn;
- FunctionCallee DFSanUnionLoadFn;
+ FunctionType *DFSanMemTransferCallbackFnTy;
+ FunctionCallee DFSanUnionFn;
+ FunctionCallee DFSanCheckedUnionFn;
+ FunctionCallee DFSanUnionLoadFn;
FunctionCallee DFSanUnionLoadFast16LabelsFn;
- FunctionCallee DFSanUnimplementedFn;
- FunctionCallee DFSanSetLabelFn;
- FunctionCallee DFSanNonzeroLabelFn;
- FunctionCallee DFSanVarargWrapperFn;
- FunctionCallee DFSanLoadCallbackFn;
- FunctionCallee DFSanStoreCallbackFn;
- FunctionCallee DFSanMemTransferCallbackFn;
- FunctionCallee DFSanCmpCallbackFn;
- MDNode *ColdCallWeights;
- DFSanABIList ABIList;
- DenseMap<Value *, Function *> UnwrappedFnMap;
- AttrBuilder ReadOnlyNoneAttrs;
- bool DFSanRuntimeShadowMask = false;
-
- Value *getShadowAddress(Value *Addr, Instruction *Pos);
- bool isInstrumented(const Function *F);
- bool isInstrumented(const GlobalAlias *GA);
- FunctionType *getArgsFunctionType(FunctionType *T);
- FunctionType *getTrampolineFunctionType(FunctionType *T);
- TransformedFunction getCustomFunctionType(FunctionType *T);
- InstrumentedABI getInstrumentedABI();
- WrapperKind getWrapperKind(Function *F);
- void addGlobalNamePrefix(GlobalValue *GV);
- Function *buildWrapperFunction(Function *F, StringRef NewFName,
- GlobalValue::LinkageTypes NewFLink,
- FunctionType *NewFT);
- Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
- void initializeCallbackFunctions(Module &M);
- void initializeRuntimeFunctions(Module &M);
-
+ FunctionCallee DFSanUnimplementedFn;
+ FunctionCallee DFSanSetLabelFn;
+ FunctionCallee DFSanNonzeroLabelFn;
+ FunctionCallee DFSanVarargWrapperFn;
+ FunctionCallee DFSanLoadCallbackFn;
+ FunctionCallee DFSanStoreCallbackFn;
+ FunctionCallee DFSanMemTransferCallbackFn;
+ FunctionCallee DFSanCmpCallbackFn;
+ MDNode *ColdCallWeights;
+ DFSanABIList ABIList;
+ DenseMap<Value *, Function *> UnwrappedFnMap;
+ AttrBuilder ReadOnlyNoneAttrs;
+ bool DFSanRuntimeShadowMask = false;
+
+ Value *getShadowAddress(Value *Addr, Instruction *Pos);
+ bool isInstrumented(const Function *F);
+ bool isInstrumented(const GlobalAlias *GA);
+ FunctionType *getArgsFunctionType(FunctionType *T);
+ FunctionType *getTrampolineFunctionType(FunctionType *T);
+ TransformedFunction getCustomFunctionType(FunctionType *T);
+ InstrumentedABI getInstrumentedABI();
+ WrapperKind getWrapperKind(Function *F);
+ void addGlobalNamePrefix(GlobalValue *GV);
+ Function *buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT);
+ Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
+ void initializeCallbackFunctions(Module &M);
+ void initializeRuntimeFunctions(Module &M);
+
bool init(Module &M);
/// Returns whether the pass tracks labels for struct fields and array
@@ -448,30 +448,30 @@ class DataFlowSanitizer {
   /// Returns the shadow type of V's type.
Type *getShadowTy(Value *V);
-public:
+public:
DataFlowSanitizer(const std::vector<std::string> &ABIListFiles);
-
+
bool runImpl(Module &M);
-};
-
-struct DFSanFunction {
- DataFlowSanitizer &DFS;
- Function *F;
- DominatorTree DT;
- DataFlowSanitizer::InstrumentedABI IA;
- bool IsNativeABI;
- AllocaInst *LabelReturnAlloca = nullptr;
- DenseMap<Value *, Value *> ValShadowMap;
- DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
- std::vector<std::pair<PHINode *, PHINode *>> PHIFixups;
- DenseSet<Instruction *> SkipInsts;
- std::vector<Value *> NonZeroChecks;
- bool AvoidNewBlocks;
-
+};
+
+struct DFSanFunction {
+ DataFlowSanitizer &DFS;
+ Function *F;
+ DominatorTree DT;
+ DataFlowSanitizer::InstrumentedABI IA;
+ bool IsNativeABI;
+ AllocaInst *LabelReturnAlloca = nullptr;
+ DenseMap<Value *, Value *> ValShadowMap;
+ DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
+ std::vector<std::pair<PHINode *, PHINode *>> PHIFixups;
+ DenseSet<Instruction *> SkipInsts;
+ std::vector<Value *> NonZeroChecks;
+ bool AvoidNewBlocks;
+
struct CachedShadow {
BasicBlock *Block; // The block where Shadow is defined.
- Value *Shadow;
- };
+ Value *Shadow;
+ };
/// Maps a value to its latest shadow value in terms of domination tree.
DenseMap<std::pair<Value *, Value *>, CachedShadow> CachedShadows;
/// Maps a value to its latest collapsed shadow value it was converted to in
@@ -479,16 +479,16 @@ struct DFSanFunction {
/// used at a post process where CFG blocks are split. So it does not cache
/// BasicBlock like CachedShadows, but uses domination between values.
DenseMap<Value *, Value *> CachedCollapsedShadows;
- DenseMap<Value *, std::set<Value *>> ShadowElements;
-
- DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
- : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), IsNativeABI(IsNativeABI) {
- DT.recalculate(*F);
- // FIXME: Need to track down the register allocator issue which causes poor
- // performance in pathological cases with large numbers of basic blocks.
- AvoidNewBlocks = F->size() > 1000;
- }
-
+ DenseMap<Value *, std::set<Value *>> ShadowElements;
+
+ DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
+ : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), IsNativeABI(IsNativeABI) {
+ DT.recalculate(*F);
+ // FIXME: Need to track down the register allocator issue which causes poor
+ // performance in pathological cases with large numbers of basic blocks.
+ AvoidNewBlocks = F->size() > 1000;
+ }
+
/// Computes the shadow address for a given function argument.
///
/// Shadow = ArgTLS+ArgOffset.
@@ -497,18 +497,18 @@ struct DFSanFunction {
/// Computes the shadow address for a retval.
Value *getRetvalTLS(Type *T, IRBuilder<> &IRB);
- Value *getShadow(Value *V);
- void setShadow(Instruction *I, Value *Shadow);
+ Value *getShadow(Value *V);
+ void setShadow(Instruction *I, Value *Shadow);
/// Generates IR to compute the union of the two given shadows, inserting it
/// before Pos. The combined value is with primitive type.
- Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
+ Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
/// Combines the shadow values of V1 and V2, then converts the combined value
/// with primitive type into a shadow value with the original type T.
Value *combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
Instruction *Pos);
- Value *combineOperandShadows(Instruction *Inst);
- Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
- Instruction *Pos);
+ Value *combineOperandShadows(Instruction *Inst);
+ Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
+ Instruction *Pos);
void storePrimitiveShadow(Value *Addr, uint64_t Size, Align Alignment,
Value *PrimitiveShadow, Instruction *Pos);
/// Applies PrimitiveShadow to all primitive subtypes of T, returning
@@ -539,110 +539,110 @@ private:
/// Returns the shadow value of an argument A.
Value *getShadowForTLSArgument(Argument *A);
-};
-
-class DFSanVisitor : public InstVisitor<DFSanVisitor> {
-public:
- DFSanFunction &DFSF;
-
- DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
-
- const DataLayout &getDataLayout() const {
- return DFSF.F->getParent()->getDataLayout();
- }
-
- // Combines shadow values for all of I's operands. Returns the combined shadow
- // value.
- Value *visitOperandShadowInst(Instruction &I);
-
- void visitUnaryOperator(UnaryOperator &UO);
- void visitBinaryOperator(BinaryOperator &BO);
- void visitCastInst(CastInst &CI);
- void visitCmpInst(CmpInst &CI);
- void visitGetElementPtrInst(GetElementPtrInst &GEPI);
- void visitLoadInst(LoadInst &LI);
- void visitStoreInst(StoreInst &SI);
- void visitReturnInst(ReturnInst &RI);
- void visitCallBase(CallBase &CB);
- void visitPHINode(PHINode &PN);
- void visitExtractElementInst(ExtractElementInst &I);
- void visitInsertElementInst(InsertElementInst &I);
- void visitShuffleVectorInst(ShuffleVectorInst &I);
- void visitExtractValueInst(ExtractValueInst &I);
- void visitInsertValueInst(InsertValueInst &I);
- void visitAllocaInst(AllocaInst &I);
- void visitSelectInst(SelectInst &I);
- void visitMemSetInst(MemSetInst &I);
- void visitMemTransferInst(MemTransferInst &I);
-};
-
-} // end anonymous namespace
-
-DataFlowSanitizer::DataFlowSanitizer(
+};
+
+class DFSanVisitor : public InstVisitor<DFSanVisitor> {
+public:
+ DFSanFunction &DFSF;
+
+ DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+
+ const DataLayout &getDataLayout() const {
+ return DFSF.F->getParent()->getDataLayout();
+ }
+
+ // Combines shadow values for all of I's operands. Returns the combined shadow
+ // value.
+ Value *visitOperandShadowInst(Instruction &I);
+
+ void visitUnaryOperator(UnaryOperator &UO);
+ void visitBinaryOperator(BinaryOperator &BO);
+ void visitCastInst(CastInst &CI);
+ void visitCmpInst(CmpInst &CI);
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ void visitLoadInst(LoadInst &LI);
+ void visitStoreInst(StoreInst &SI);
+ void visitReturnInst(ReturnInst &RI);
+ void visitCallBase(CallBase &CB);
+ void visitPHINode(PHINode &PN);
+ void visitExtractElementInst(ExtractElementInst &I);
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &I);
+ void visitExtractValueInst(ExtractValueInst &I);
+ void visitInsertValueInst(InsertValueInst &I);
+ void visitAllocaInst(AllocaInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitMemSetInst(MemSetInst &I);
+ void visitMemTransferInst(MemTransferInst &I);
+};
+
+} // end anonymous namespace
+
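DFSanVisitor derives from InstVisitor, LLVM's CRTP visitor, so each visitFoo declared above is dispatched to the handler matching the visited instruction's kind. A minimal plain-C++ sketch of that dispatch pattern, using made-up node types rather than LLVM's classes:

#include <iostream>

struct Load {};
struct Store {};

template <typename Derived> struct VisitorBase {
  void visit(const Load &L)  { static_cast<Derived *>(this)->visitLoad(L); }
  void visit(const Store &S) { static_cast<Derived *>(this)->visitStore(S); }
  // Default handlers; a derived visitor overrides the ones it cares about.
  void visitLoad(const Load &)   {}
  void visitStore(const Store &) {}
};

struct ShadowVisitor : VisitorBase<ShadowVisitor> {
  void visitLoad(const Load &)   { std::cout << "propagate shadow of load\n"; }
  void visitStore(const Store &) { std::cout << "store shadow for store\n"; }
};

int main() {
  ShadowVisitor V;
  V.visit(Load{});
  V.visit(Store{});
  return 0;
}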
+DataFlowSanitizer::DataFlowSanitizer(
const std::vector<std::string> &ABIListFiles) {
- std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
+ std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
llvm::append_range(AllABIListFiles, ClABIListFiles);
- // FIXME: should we propagate vfs::FileSystem to this constructor?
- ABIList.set(
- SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem()));
-}
-
-FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
- SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
+ // FIXME: should we propagate vfs::FileSystem to this constructor?
+ ABIList.set(
+ SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem()));
+}
+
+FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
+ SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
ArgTypes.append(T->getNumParams(), PrimitiveShadowTy);
- if (T->isVarArg())
+ if (T->isVarArg())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- Type *RetType = T->getReturnType();
- if (!RetType->isVoidTy())
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
RetType = StructType::get(RetType, PrimitiveShadowTy);
- return FunctionType::get(RetType, ArgTypes, T->isVarArg());
-}
-
-FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
- assert(!T->isVarArg());
- SmallVector<Type *, 4> ArgTypes;
- ArgTypes.push_back(T->getPointerTo());
- ArgTypes.append(T->param_begin(), T->param_end());
+ return FunctionType::get(RetType, ArgTypes, T->isVarArg());
+}
+
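For a concrete picture of the IA_Args rewrite performed by getArgsFunctionType: every original parameter is followed by one shadow parameter, and a non-void return is widened to a pair of the original value and its shadow (the instrumented symbol additionally gets the dfs$ prefix, as runImpl shows further down). The C++ declarations below only illustrate that shape; dfsan_label stands in for PrimitiveShadowTy and its 16-bit width is an assumption.

using dfsan_label = unsigned short;         // assumed shadow width

int original(int a, float b);               // signature before instrumentation

struct IntWithLabel { int Value; dfsan_label Label; };
IntWithLabel instrumented(int a, float b,   // original parameters first,
                          dfsan_label a_label,
                          dfsan_label b_label); // then one shadow per parameter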
+FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
+ assert(!T->isVarArg());
+ SmallVector<Type *, 4> ArgTypes;
+ ArgTypes.push_back(T->getPointerTo());
+ ArgTypes.append(T->param_begin(), T->param_end());
ArgTypes.append(T->getNumParams(), PrimitiveShadowTy);
- Type *RetType = T->getReturnType();
- if (!RetType->isVoidTy())
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- return FunctionType::get(T->getReturnType(), ArgTypes, false);
-}
-
-TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
- SmallVector<Type *, 4> ArgTypes;
-
- // Some parameters of the custom function being constructed are
- // parameters of T. Record the mapping from parameters of T to
- // parameters of the custom function, so that parameter attributes
- // at call sites can be updated.
- std::vector<unsigned> ArgumentIndexMapping;
- for (unsigned i = 0, ie = T->getNumParams(); i != ie; ++i) {
- Type* param_type = T->getParamType(i);
- FunctionType *FT;
- if (isa<PointerType>(param_type) && (FT = dyn_cast<FunctionType>(
- cast<PointerType>(param_type)->getElementType()))) {
- ArgumentIndexMapping.push_back(ArgTypes.size());
- ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
- ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
- } else {
- ArgumentIndexMapping.push_back(ArgTypes.size());
- ArgTypes.push_back(param_type);
- }
- }
- for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+ return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+ SmallVector<Type *, 4> ArgTypes;
+
+ // Some parameters of the custom function being constructed are
+ // parameters of T. Record the mapping from parameters of T to
+ // parameters of the custom function, so that parameter attributes
+ // at call sites can be updated.
+ std::vector<unsigned> ArgumentIndexMapping;
+ for (unsigned i = 0, ie = T->getNumParams(); i != ie; ++i) {
+ Type* param_type = T->getParamType(i);
+ FunctionType *FT;
+ if (isa<PointerType>(param_type) && (FT = dyn_cast<FunctionType>(
+ cast<PointerType>(param_type)->getElementType()))) {
+ ArgumentIndexMapping.push_back(ArgTypes.size());
+ ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
+ ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
+ } else {
+ ArgumentIndexMapping.push_back(ArgTypes.size());
+ ArgTypes.push_back(param_type);
+ }
+ }
+ for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
ArgTypes.push_back(PrimitiveShadowTy);
- if (T->isVarArg())
+ if (T->isVarArg())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- Type *RetType = T->getReturnType();
- if (!RetType->isVoidTy())
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- return TransformedFunction(
- T, FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()),
- ArgumentIndexMapping);
-}
-
+ return TransformedFunction(
+ T, FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()),
+ ArgumentIndexMapping);
+}
+
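The custom ABI built by getCustomFunctionType keeps the original return type and parameters, appends one label per original parameter, and adds a trailing label pointer for a non-void return (plus a label-array pointer for varargs and trampoline/environment pairs for function-pointer parameters). As a hedged illustration, a custom wrapper for int myfn(int, char *) would look roughly like the declaration below; the __dfsw_ naming follows DFSan's documented convention for custom wrappers, and the 16-bit label width is an assumption.

using dfsan_label = unsigned short;

int __dfsw_myfn(int a, char *p,
                dfsan_label a_label, dfsan_label p_label,
                dfsan_label *ret_label);   // receives the return value's label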
bool DataFlowSanitizer::isZeroShadow(Value *V) {
if (!shouldTrackFieldsAndIndices())
return ZeroPrimitiveShadow == V;
@@ -800,48 +800,48 @@ Type *DataFlowSanitizer::getShadowTy(Value *V) {
}
bool DataFlowSanitizer::init(Module &M) {
- Triple TargetTriple(M.getTargetTriple());
- bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS64 = TargetTriple.isMIPS64();
- bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64 ||
- TargetTriple.getArch() == Triple::aarch64_be;
-
- const DataLayout &DL = M.getDataLayout();
-
- Mod = &M;
- Ctx = &M.getContext();
+ Triple TargetTriple(M.getTargetTriple());
+ bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
+ bool IsMIPS64 = TargetTriple.isMIPS64();
+ bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64 ||
+ TargetTriple.getArch() == Triple::aarch64_be;
+
+ const DataLayout &DL = M.getDataLayout();
+
+ Mod = &M;
+ Ctx = &M.getContext();
Int8Ptr = Type::getInt8PtrTy(*Ctx);
PrimitiveShadowTy = IntegerType::get(*Ctx, ShadowWidthBits);
PrimitiveShadowPtrTy = PointerType::getUnqual(PrimitiveShadowTy);
- IntptrTy = DL.getIntPtrType(*Ctx);
+ IntptrTy = DL.getIntPtrType(*Ctx);
ZeroPrimitiveShadow = ConstantInt::getSigned(PrimitiveShadowTy, 0);
- ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidthBytes);
- if (IsX86_64)
- ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
- else if (IsMIPS64)
- ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
- // AArch64 supports multiple VMAs and the shadow mask is set at runtime.
- else if (IsAArch64)
- DFSanRuntimeShadowMask = true;
- else
- report_fatal_error("unsupported triple");
-
+ ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidthBytes);
+ if (IsX86_64)
+ ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+ else if (IsMIPS64)
+ ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
+ // AArch64 supports multiple VMAs and the shadow mask is set at runtime.
+ else if (IsAArch64)
+ DFSanRuntimeShadowMask = true;
+ else
+ report_fatal_error("unsupported triple");
+
Type *DFSanUnionArgs[2] = {PrimitiveShadowTy, PrimitiveShadowTy};
- DFSanUnionFnTy =
+ DFSanUnionFnTy =
FunctionType::get(PrimitiveShadowTy, DFSanUnionArgs, /*isVarArg=*/false);
Type *DFSanUnionLoadArgs[2] = {PrimitiveShadowPtrTy, IntptrTy};
DFSanUnionLoadFnTy = FunctionType::get(PrimitiveShadowTy, DFSanUnionLoadArgs,
/*isVarArg=*/false);
- DFSanUnimplementedFnTy = FunctionType::get(
- Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ DFSanUnimplementedFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
Type *DFSanSetLabelArgs[3] = {PrimitiveShadowTy, Type::getInt8PtrTy(*Ctx),
IntptrTy};
- DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
- DFSanSetLabelArgs, /*isVarArg=*/false);
+ DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ DFSanSetLabelArgs, /*isVarArg=*/false);
DFSanNonzeroLabelFnTy =
FunctionType::get(Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
- DFSanVarargWrapperFnTy = FunctionType::get(
- Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ DFSanVarargWrapperFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
DFSanCmpCallbackFnTy =
FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy,
/*isVarArg=*/false);
@@ -850,169 +850,169 @@ bool DataFlowSanitizer::init(Module &M) {
FunctionType::get(Type::getVoidTy(*Ctx), DFSanLoadStoreCallbackArgs,
/*isVarArg=*/false);
Type *DFSanMemTransferCallbackArgs[2] = {PrimitiveShadowPtrTy, IntptrTy};
- DFSanMemTransferCallbackFnTy =
- FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs,
- /*isVarArg=*/false);
-
- ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
- return true;
-}
-
-bool DataFlowSanitizer::isInstrumented(const Function *F) {
- return !ABIList.isIn(*F, "uninstrumented");
-}
-
-bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
- return !ABIList.isIn(*GA, "uninstrumented");
-}
-
-DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
- return ClArgsABI ? IA_Args : IA_TLS;
-}
-
-DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
- if (ABIList.isIn(*F, "functional"))
- return WK_Functional;
- if (ABIList.isIn(*F, "discard"))
- return WK_Discard;
- if (ABIList.isIn(*F, "custom"))
- return WK_Custom;
-
- return WK_Warning;
-}
-
-void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
- std::string GVName = std::string(GV->getName()), Prefix = "dfs$";
- GV->setName(Prefix + GVName);
-
- // Try to change the name of the function in module inline asm. We only do
- // this for specific asm directives, currently only ".symver", to try to avoid
- // corrupting asm which happens to contain the symbol name as a substring.
- // Note that the substitution for .symver assumes that the versioned symbol
- // also has an instrumented name.
- std::string Asm = GV->getParent()->getModuleInlineAsm();
- std::string SearchStr = ".symver " + GVName + ",";
- size_t Pos = Asm.find(SearchStr);
- if (Pos != std::string::npos) {
- Asm.replace(Pos, SearchStr.size(),
- ".symver " + Prefix + GVName + "," + Prefix);
- GV->getParent()->setModuleInlineAsm(Asm);
- }
-}
-
-Function *
-DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
- GlobalValue::LinkageTypes NewFLink,
- FunctionType *NewFT) {
- FunctionType *FT = F->getFunctionType();
- Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(),
- NewFName, F->getParent());
- NewF->copyAttributesFrom(F);
- NewF->removeAttributes(
- AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
-
- BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
- if (F->isVarArg()) {
- NewF->removeAttributes(AttributeList::FunctionIndex,
- AttrBuilder().addAttribute("split-stack"));
- CallInst::Create(DFSanVarargWrapperFn,
- IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
- BB);
- new UnreachableInst(*Ctx, BB);
- } else {
- std::vector<Value *> Args;
- unsigned n = FT->getNumParams();
- for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
- Args.push_back(&*ai);
- CallInst *CI = CallInst::Create(F, Args, "", BB);
- if (FT->getReturnType()->isVoidTy())
- ReturnInst::Create(*Ctx, BB);
- else
- ReturnInst::Create(*Ctx, CI, BB);
- }
-
- return NewF;
-}
-
-Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
- StringRef FName) {
- FunctionType *FTT = getTrampolineFunctionType(FT);
- FunctionCallee C = Mod->getOrInsertFunction(FName, FTT);
- Function *F = dyn_cast<Function>(C.getCallee());
- if (F && F->isDeclaration()) {
- F->setLinkage(GlobalValue::LinkOnceODRLinkage);
- BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
- std::vector<Value *> Args;
- Function::arg_iterator AI = F->arg_begin(); ++AI;
- for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
- Args.push_back(&*AI);
- CallInst *CI = CallInst::Create(FT, &*F->arg_begin(), Args, "", BB);
- ReturnInst *RI;
- if (FT->getReturnType()->isVoidTy())
- RI = ReturnInst::Create(*Ctx, BB);
- else
- RI = ReturnInst::Create(*Ctx, CI, BB);
-
+ DFSanMemTransferCallbackFnTy =
+ FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs,
+ /*isVarArg=*/false);
+
+ ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
+ return true;
+}
+
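With the constants chosen in init() for x86-64, getShadowAddress (further down) maps an application address to its shadow address by masking off the bits in 0x700000000000 and scaling by the shadow width in bytes. A self-contained sketch of that arithmetic, assuming 2-byte (16-bit) labels:

#include <cstdint>

constexpr uint64_t kShadowPtrMask    = ~0x700000000000ULL; // x86-64 mask from init()
constexpr uint64_t kShadowWidthBytes = 2;                  // assumed label width

uint64_t shadowAddressFor(uint64_t AppAddr) {
  return (AppAddr & kShadowPtrMask) * kShadowWidthBytes;
}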
+bool DataFlowSanitizer::isInstrumented(const Function *F) {
+ return !ABIList.isIn(*F, "uninstrumented");
+}
+
+bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
+ return !ABIList.isIn(*GA, "uninstrumented");
+}
+
+DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
+ return ClArgsABI ? IA_Args : IA_TLS;
+}
+
+DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
+ if (ABIList.isIn(*F, "functional"))
+ return WK_Functional;
+ if (ABIList.isIn(*F, "discard"))
+ return WK_Discard;
+ if (ABIList.isIn(*F, "custom"))
+ return WK_Custom;
+
+ return WK_Warning;
+}
+
+void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
+ std::string GVName = std::string(GV->getName()), Prefix = "dfs$";
+ GV->setName(Prefix + GVName);
+
+ // Try to change the name of the function in module inline asm. We only do
+ // this for specific asm directives, currently only ".symver", to try to avoid
+ // corrupting asm which happens to contain the symbol name as a substring.
+ // Note that the substitution for .symver assumes that the versioned symbol
+ // also has an instrumented name.
+ std::string Asm = GV->getParent()->getModuleInlineAsm();
+ std::string SearchStr = ".symver " + GVName + ",";
+ size_t Pos = Asm.find(SearchStr);
+ if (Pos != std::string::npos) {
+ Asm.replace(Pos, SearchStr.size(),
+ ".symver " + Prefix + GVName + "," + Prefix);
+ GV->getParent()->setModuleInlineAsm(Asm);
+ }
+}
+
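The .symver rewrite in addGlobalNamePrefix is a plain substring replacement on the module inline asm. A standalone reproduction with a hypothetical symbol and version string, showing the before/after text:

#include <iostream>
#include <string>

int main() {
  std::string GVName = "my_func", Prefix = "dfs$";
  std::string Asm = ".symver my_func,my_func@@LIB_1.0\n";
  std::string SearchStr = ".symver " + GVName + ",";
  size_t Pos = Asm.find(SearchStr);
  if (Pos != std::string::npos)
    Asm.replace(Pos, SearchStr.size(),
                ".symver " + Prefix + GVName + "," + Prefix);
  std::cout << Asm;  // ".symver dfs$my_func,dfs$my_func@@LIB_1.0"
  return 0;
}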
+Function *
+DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT) {
+ FunctionType *FT = F->getFunctionType();
+ Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(),
+ NewFName, F->getParent());
+ NewF->copyAttributesFrom(F);
+ NewF->removeAttributes(
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
+ if (F->isVarArg()) {
+ NewF->removeAttributes(AttributeList::FunctionIndex,
+ AttrBuilder().addAttribute("split-stack"));
+ CallInst::Create(DFSanVarargWrapperFn,
+ IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
+ BB);
+ new UnreachableInst(*Ctx, BB);
+ } else {
+ std::vector<Value *> Args;
+ unsigned n = FT->getNumParams();
+ for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
+ Args.push_back(&*ai);
+ CallInst *CI = CallInst::Create(F, Args, "", BB);
+ if (FT->getReturnType()->isVoidTy())
+ ReturnInst::Create(*Ctx, BB);
+ else
+ ReturnInst::Create(*Ctx, CI, BB);
+ }
+
+ return NewF;
+}
+
+Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
+ StringRef FName) {
+ FunctionType *FTT = getTrampolineFunctionType(FT);
+ FunctionCallee C = Mod->getOrInsertFunction(FName, FTT);
+ Function *F = dyn_cast<Function>(C.getCallee());
+ if (F && F->isDeclaration()) {
+ F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+ std::vector<Value *> Args;
+ Function::arg_iterator AI = F->arg_begin(); ++AI;
+ for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
+ Args.push_back(&*AI);
+ CallInst *CI = CallInst::Create(FT, &*F->arg_begin(), Args, "", BB);
+ ReturnInst *RI;
+ if (FT->getReturnType()->isVoidTy())
+ RI = ReturnInst::Create(*Ctx, BB);
+ else
+ RI = ReturnInst::Create(*Ctx, CI, BB);
+
// F is called by a wrapped custom function with primitive shadows. So
// its arguments and return value need conversion.
- DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
- Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
+ DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
+ Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) {
Value *Shadow =
DFSF.expandFromPrimitiveShadow(ValAI->getType(), &*ShadowAI, CI);
DFSF.ValShadowMap[&*ValAI] = Shadow;
}
- DFSanVisitor(DFSF).visitCallInst(*CI);
+ DFSanVisitor(DFSF).visitCallInst(*CI);
if (!FT->getReturnType()->isVoidTy()) {
Value *PrimitiveShadow = DFSF.collapseToPrimitiveShadow(
DFSF.getShadow(RI->getReturnValue()), RI);
new StoreInst(PrimitiveShadow, &*std::prev(F->arg_end()), RI);
}
- }
-
- return cast<Constant>(C.getCallee());
-}
-
-// Initialize DataFlowSanitizer runtime functions and declare them in the module
-void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
- {
- AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadNone);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
- DFSanUnionFn =
- Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy, AL);
- }
- {
- AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadNone);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
- DFSanCheckedUnionFn =
- Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy, AL);
- }
- {
- AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadOnly);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
- DFSanUnionLoadFn =
- Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy, AL);
- }
+ }
+
+ return cast<Constant>(C.getCallee());
+}
+
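Per getTrampolineFunctionType and the body generated above, a trampoline receives the target function pointer, the original arguments, one label per argument, and, for a non-void return, a pointer through which the return label is stored; it forwards the call and, being instrumented itself, propagates the callee's return shadow. A hedged C++ sketch for a target of type int(int, int); the placeholder stores label 0 where the generated code would store the real return shadow.

using dfsan_label = unsigned short;   // assumed shadow width

int trampoline_int_int(int (*Fn)(int, int), int A, int B,
                       dfsan_label ALabel, dfsan_label BLabel,
                       dfsan_label *RetLabel) {
  int R = Fn(A, B);   // forward the call to the real target
  *RetLabel = 0;      // placeholder: the generated trampoline stores the callee's shadow here
  return R;
}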
+// Initialize DataFlowSanitizer runtime functions and declare them in the module
+void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
+ {
+ AttributeList AL;
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::ReadNone);
+ AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
+ Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
+ DFSanUnionFn =
+ Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy, AL);
+ }
+ {
+ AttributeList AL;
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::ReadNone);
+ AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
+ Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
+ DFSanCheckedUnionFn =
+ Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy, AL);
+ }
+ {
+ AttributeList AL;
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::ReadOnly);
+ AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
+ Attribute::ZExt);
+ DFSanUnionLoadFn =
+ Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy, AL);
+ }
{
AttributeList AL;
AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
@@ -1024,285 +1024,285 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
DFSanUnionLoadFast16LabelsFn = Mod->getOrInsertFunction(
"__dfsan_union_load_fast16labels", DFSanUnionLoadFnTy, AL);
}
- DFSanUnimplementedFn =
- Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
- {
- AttributeList AL;
- AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
- DFSanSetLabelFn =
- Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy, AL);
- }
- DFSanNonzeroLabelFn =
- Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
- DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
- DFSanVarargWrapperFnTy);
-}
-
-// Initializes event callback functions and declares them in the module
-void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
- DFSanLoadCallbackFn = Mod->getOrInsertFunction("__dfsan_load_callback",
+ DFSanUnimplementedFn =
+ Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
+ {
+ AttributeList AL;
+ AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
+ DFSanSetLabelFn =
+ Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy, AL);
+ }
+ DFSanNonzeroLabelFn =
+ Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
+ DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
+ DFSanVarargWrapperFnTy);
+}
+
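Read as C declarations, the runtime entry points registered above mirror the FunctionTypes built in init(). The rendering below is only a sketch: the 16-bit label width and the use of char */void */uintptr_t for the IR's i8* and IntptrTy are assumptions.

#include <cstdint>
using dfsan_label = unsigned short;

extern "C" {
dfsan_label __dfsan_union(dfsan_label L1, dfsan_label L2);
dfsan_label dfsan_union(dfsan_label L1, dfsan_label L2);   // checked variant
dfsan_label __dfsan_union_load(const dfsan_label *Shadow, uintptr_t N);
dfsan_label __dfsan_union_load_fast16labels(const dfsan_label *Shadow, uintptr_t N);
void __dfsan_unimplemented(char *FnName);
void __dfsan_set_label(dfsan_label Label, void *Addr, uintptr_t Size);
void __dfsan_nonzero_label(void);
void __dfsan_vararg_wrapper(char *FnName);
}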
+// Initializes event callback functions and declares them in the module
+void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
+ DFSanLoadCallbackFn = Mod->getOrInsertFunction("__dfsan_load_callback",
DFSanLoadStoreCallbackFnTy);
DFSanStoreCallbackFn = Mod->getOrInsertFunction("__dfsan_store_callback",
DFSanLoadStoreCallbackFnTy);
- DFSanMemTransferCallbackFn = Mod->getOrInsertFunction(
- "__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy);
+ DFSanMemTransferCallbackFn = Mod->getOrInsertFunction(
+ "__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy);
DFSanCmpCallbackFn =
Mod->getOrInsertFunction("__dfsan_cmp_callback", DFSanCmpCallbackFnTy);
-}
-
+}
+
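The two callback types fully visible in this hunk translate, under the same assumptions as above, to roughly the C declarations below; the load/store callback argument list is defined outside this hunk and is omitted here.

#include <cstdint>
using dfsan_label = unsigned short;

extern "C" {
void __dfsan_cmp_callback(dfsan_label CombinedLabel);
void __dfsan_mem_transfer_callback(dfsan_label *Start, uintptr_t Len);
}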
bool DataFlowSanitizer::runImpl(Module &M) {
init(M);
- if (ABIList.isIn(M, "skip"))
- return false;
-
- const unsigned InitialGlobalSize = M.global_size();
- const unsigned InitialModuleSize = M.size();
-
- bool Changed = false;
-
+ if (ABIList.isIn(M, "skip"))
+ return false;
+
+ const unsigned InitialGlobalSize = M.global_size();
+ const unsigned InitialModuleSize = M.size();
+
+ bool Changed = false;
+
Type *ArgTLSTy = ArrayType::get(Type::getInt64Ty(*Ctx), kArgTLSSize / 8);
ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) {
Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
- }
+ }
Type *RetvalTLSTy =
ArrayType::get(Type::getInt64Ty(*Ctx), kRetvalTLSSize / 8);
RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", RetvalTLSTy);
if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) {
Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
- }
-
- ExternalShadowMask =
- Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
-
- initializeCallbackFunctions(M);
- initializeRuntimeFunctions(M);
-
- std::vector<Function *> FnsToInstrument;
- SmallPtrSet<Function *, 2> FnsWithNativeABI;
- for (Function &i : M) {
- if (!i.isIntrinsic() &&
- &i != DFSanUnionFn.getCallee()->stripPointerCasts() &&
- &i != DFSanCheckedUnionFn.getCallee()->stripPointerCasts() &&
- &i != DFSanUnionLoadFn.getCallee()->stripPointerCasts() &&
+ }
+
+ ExternalShadowMask =
+ Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
+
+ initializeCallbackFunctions(M);
+ initializeRuntimeFunctions(M);
+
+ std::vector<Function *> FnsToInstrument;
+ SmallPtrSet<Function *, 2> FnsWithNativeABI;
+ for (Function &i : M) {
+ if (!i.isIntrinsic() &&
+ &i != DFSanUnionFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanCheckedUnionFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanUnionLoadFn.getCallee()->stripPointerCasts() &&
&i != DFSanUnionLoadFast16LabelsFn.getCallee()->stripPointerCasts() &&
- &i != DFSanUnimplementedFn.getCallee()->stripPointerCasts() &&
- &i != DFSanSetLabelFn.getCallee()->stripPointerCasts() &&
- &i != DFSanNonzeroLabelFn.getCallee()->stripPointerCasts() &&
- &i != DFSanVarargWrapperFn.getCallee()->stripPointerCasts() &&
- &i != DFSanLoadCallbackFn.getCallee()->stripPointerCasts() &&
- &i != DFSanStoreCallbackFn.getCallee()->stripPointerCasts() &&
- &i != DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts() &&
- &i != DFSanCmpCallbackFn.getCallee()->stripPointerCasts())
- FnsToInstrument.push_back(&i);
- }
-
- // Give function aliases prefixes when necessary, and build wrappers where the
- // instrumentedness is inconsistent.
- for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
- GlobalAlias *GA = &*i;
- ++i;
- // Don't stop on weak. We assume people aren't playing games with the
- // instrumentedness of overridden weak aliases.
- if (auto F = dyn_cast<Function>(GA->getBaseObject())) {
- bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
- if (GAInst && FInst) {
- addGlobalNamePrefix(GA);
- } else if (GAInst != FInst) {
- // Non-instrumented alias of an instrumented function, or vice versa.
- // Replace the alias with a native-ABI wrapper of the aliasee. The pass
- // below will take care of instrumenting it.
- Function *NewF =
- buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
- GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
- NewF->takeName(GA);
- GA->eraseFromParent();
- FnsToInstrument.push_back(NewF);
- }
- }
- }
-
- ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly)
- .addAttribute(Attribute::ReadNone);
-
- // First, change the ABI of every function in the module. ABI-listed
- // functions keep their original ABI and get a wrapper function.
- for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
- e = FnsToInstrument.end();
- i != e; ++i) {
- Function &F = **i;
- FunctionType *FT = F.getFunctionType();
-
- bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
- FT->getReturnType()->isVoidTy());
-
- if (isInstrumented(&F)) {
- // Instrumented functions get a 'dfs$' prefix. This allows us to more
- // easily identify cases of mismatching ABIs.
- if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
- FunctionType *NewFT = getArgsFunctionType(FT);
- Function *NewF = Function::Create(NewFT, F.getLinkage(),
- F.getAddressSpace(), "", &M);
- NewF->copyAttributesFrom(&F);
- NewF->removeAttributes(
- AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
- for (Function::arg_iterator FArg = F.arg_begin(),
- NewFArg = NewF->arg_begin(),
- FArgEnd = F.arg_end();
- FArg != FArgEnd; ++FArg, ++NewFArg) {
- FArg->replaceAllUsesWith(&*NewFArg);
- }
- NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
-
- for (Function::user_iterator UI = F.user_begin(), UE = F.user_end();
- UI != UE;) {
- BlockAddress *BA = dyn_cast<BlockAddress>(*UI);
- ++UI;
- if (BA) {
- BA->replaceAllUsesWith(
- BlockAddress::get(NewF, BA->getBasicBlock()));
- delete BA;
- }
- }
- F.replaceAllUsesWith(
- ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
- NewF->takeName(&F);
- F.eraseFromParent();
- *i = NewF;
- addGlobalNamePrefix(NewF);
- } else {
- addGlobalNamePrefix(&F);
- }
- } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
- // Build a wrapper function for F. The wrapper simply calls F, and is
- // added to FnsToInstrument so that any instrumentation according to its
- // WrapperKind is done in the second pass below.
- FunctionType *NewFT = getInstrumentedABI() == IA_Args
- ? getArgsFunctionType(FT)
- : FT;
-
- // If the function being wrapped has local linkage, then preserve the
- // function's linkage in the wrapper function.
- GlobalValue::LinkageTypes wrapperLinkage =
- F.hasLocalLinkage()
- ? F.getLinkage()
- : GlobalValue::LinkOnceODRLinkage;
-
- Function *NewF = buildWrapperFunction(
- &F, std::string("dfsw$") + std::string(F.getName()),
- wrapperLinkage, NewFT);
- if (getInstrumentedABI() == IA_TLS)
- NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
-
- Value *WrappedFnCst =
- ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
- F.replaceAllUsesWith(WrappedFnCst);
-
- UnwrappedFnMap[WrappedFnCst] = &F;
- *i = NewF;
-
- if (!F.isDeclaration()) {
- // This function is probably defining an interposition of an
- // uninstrumented function and hence needs to keep the original ABI.
- // But any functions it may call need to use the instrumented ABI, so
- // we instrument it in a mode which preserves the original ABI.
- FnsWithNativeABI.insert(&F);
-
- // This code needs to rebuild the iterators, as they may be invalidated
- // by the push_back, taking care that the new range does not include
- // any functions added by this code.
- size_t N = i - FnsToInstrument.begin(),
- Count = e - FnsToInstrument.begin();
- FnsToInstrument.push_back(&F);
- i = FnsToInstrument.begin() + N;
- e = FnsToInstrument.begin() + Count;
- }
- // Hopefully, nobody will try to indirectly call a vararg
- // function... yet.
- } else if (FT->isVarArg()) {
- UnwrappedFnMap[&F] = &F;
- *i = nullptr;
- }
- }
-
- for (Function *i : FnsToInstrument) {
- if (!i || i->isDeclaration())
- continue;
-
- removeUnreachableBlocks(*i);
-
- DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i));
-
- // DFSanVisitor may create new basic blocks, which confuses df_iterator.
- // Build a copy of the list before iterating over it.
- SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock()));
-
- for (BasicBlock *i : BBList) {
- Instruction *Inst = &i->front();
- while (true) {
- // DFSanVisitor may split the current basic block, changing the current
- // instruction's next pointer and moving the next instruction to the
- // tail block from which we should continue.
- Instruction *Next = Inst->getNextNode();
- // DFSanVisitor may delete Inst, so keep track of whether it was a
- // terminator.
- bool IsTerminator = Inst->isTerminator();
- if (!DFSF.SkipInsts.count(Inst))
- DFSanVisitor(DFSF).visit(Inst);
- if (IsTerminator)
- break;
- Inst = Next;
- }
- }
-
- // We will not necessarily be able to compute the shadow for every phi node
- // until we have visited every block. Therefore, the code that handles phi
- // nodes adds them to the PHIFixups list so that they can be properly
- // handled here.
- for (std::vector<std::pair<PHINode *, PHINode *>>::iterator
- i = DFSF.PHIFixups.begin(),
- e = DFSF.PHIFixups.end();
- i != e; ++i) {
- for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
- ++val) {
- i->second->setIncomingValue(
- val, DFSF.getShadow(i->first->getIncomingValue(val)));
- }
- }
-
- // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
- // places (i.e. instructions in basic blocks we haven't even begun visiting
- // yet). To make our life easier, do this work in a pass after the main
- // instrumentation.
- if (ClDebugNonzeroLabels) {
- for (Value *V : DFSF.NonZeroChecks) {
- Instruction *Pos;
- if (Instruction *I = dyn_cast<Instruction>(V))
- Pos = I->getNextNode();
- else
- Pos = &DFSF.F->getEntryBlock().front();
- while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
- Pos = Pos->getNextNode();
- IRBuilder<> IRB(Pos);
+ &i != DFSanUnimplementedFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanSetLabelFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanNonzeroLabelFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanVarargWrapperFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanLoadCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanStoreCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanCmpCallbackFn.getCallee()->stripPointerCasts())
+ FnsToInstrument.push_back(&i);
+ }
+
+ // Give function aliases prefixes when necessary, and build wrappers where the
+ // instrumentedness is inconsistent.
+ for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
+ GlobalAlias *GA = &*i;
+ ++i;
+ // Don't stop on weak. We assume people aren't playing games with the
+ // instrumentedness of overridden weak aliases.
+ if (auto F = dyn_cast<Function>(GA->getBaseObject())) {
+ bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
+ if (GAInst && FInst) {
+ addGlobalNamePrefix(GA);
+ } else if (GAInst != FInst) {
+ // Non-instrumented alias of an instrumented function, or vice versa.
+ // Replace the alias with a native-ABI wrapper of the aliasee. The pass
+ // below will take care of instrumenting it.
+ Function *NewF =
+ buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
+ GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
+ NewF->takeName(GA);
+ GA->eraseFromParent();
+ FnsToInstrument.push_back(NewF);
+ }
+ }
+ }
+
+ ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+
+ // First, change the ABI of every function in the module. ABI-listed
+ // functions keep their original ABI and get a wrapper function.
+ for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+ e = FnsToInstrument.end();
+ i != e; ++i) {
+ Function &F = **i;
+ FunctionType *FT = F.getFunctionType();
+
+ bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
+ FT->getReturnType()->isVoidTy());
+
+ if (isInstrumented(&F)) {
+ // Instrumented functions get a 'dfs$' prefix. This allows us to more
+ // easily identify cases of mismatching ABIs.
+ if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
+ FunctionType *NewFT = getArgsFunctionType(FT);
+ Function *NewF = Function::Create(NewFT, F.getLinkage(),
+ F.getAddressSpace(), "", &M);
+ NewF->copyAttributesFrom(&F);
+ NewF->removeAttributes(
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
+ for (Function::arg_iterator FArg = F.arg_begin(),
+ NewFArg = NewF->arg_begin(),
+ FArgEnd = F.arg_end();
+ FArg != FArgEnd; ++FArg, ++NewFArg) {
+ FArg->replaceAllUsesWith(&*NewFArg);
+ }
+ NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
+
+ for (Function::user_iterator UI = F.user_begin(), UE = F.user_end();
+ UI != UE;) {
+ BlockAddress *BA = dyn_cast<BlockAddress>(*UI);
+ ++UI;
+ if (BA) {
+ BA->replaceAllUsesWith(
+ BlockAddress::get(NewF, BA->getBasicBlock()));
+ delete BA;
+ }
+ }
+ F.replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
+ NewF->takeName(&F);
+ F.eraseFromParent();
+ *i = NewF;
+ addGlobalNamePrefix(NewF);
+ } else {
+ addGlobalNamePrefix(&F);
+ }
+ } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
+ // Build a wrapper function for F. The wrapper simply calls F, and is
+ // added to FnsToInstrument so that any instrumentation according to its
+ // WrapperKind is done in the second pass below.
+ FunctionType *NewFT = getInstrumentedABI() == IA_Args
+ ? getArgsFunctionType(FT)
+ : FT;
+
+ // If the function being wrapped has local linkage, then preserve the
+ // function's linkage in the wrapper function.
+ GlobalValue::LinkageTypes wrapperLinkage =
+ F.hasLocalLinkage()
+ ? F.getLinkage()
+ : GlobalValue::LinkOnceODRLinkage;
+
+ Function *NewF = buildWrapperFunction(
+ &F, std::string("dfsw$") + std::string(F.getName()),
+ wrapperLinkage, NewFT);
+ if (getInstrumentedABI() == IA_TLS)
+ NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
+
+ Value *WrappedFnCst =
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
+ F.replaceAllUsesWith(WrappedFnCst);
+
+ UnwrappedFnMap[WrappedFnCst] = &F;
+ *i = NewF;
+
+ if (!F.isDeclaration()) {
+ // This function is probably defining an interposition of an
+ // uninstrumented function and hence needs to keep the original ABI.
+ // But any functions it may call need to use the instrumented ABI, so
+ // we instrument it in a mode which preserves the original ABI.
+ FnsWithNativeABI.insert(&F);
+
+ // This code needs to rebuild the iterators, as they may be invalidated
+ // by the push_back, taking care that the new range does not include
+ // any functions added by this code.
+ size_t N = i - FnsToInstrument.begin(),
+ Count = e - FnsToInstrument.begin();
+ FnsToInstrument.push_back(&F);
+ i = FnsToInstrument.begin() + N;
+ e = FnsToInstrument.begin() + Count;
+ }
+ // Hopefully, nobody will try to indirectly call a vararg
+ // function... yet.
+ } else if (FT->isVarArg()) {
+ UnwrappedFnMap[&F] = &F;
+ *i = nullptr;
+ }
+ }
+
+ for (Function *i : FnsToInstrument) {
+ if (!i || i->isDeclaration())
+ continue;
+
+ removeUnreachableBlocks(*i);
+
+ DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i));
+
+ // DFSanVisitor may create new basic blocks, which confuses df_iterator.
+ // Build a copy of the list before iterating over it.
+ SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock()));
+
+ for (BasicBlock *i : BBList) {
+ Instruction *Inst = &i->front();
+ while (true) {
+ // DFSanVisitor may split the current basic block, changing the current
+ // instruction's next pointer and moving the next instruction to the
+ // tail block from which we should continue.
+ Instruction *Next = Inst->getNextNode();
+ // DFSanVisitor may delete Inst, so keep track of whether it was a
+ // terminator.
+ bool IsTerminator = Inst->isTerminator();
+ if (!DFSF.SkipInsts.count(Inst))
+ DFSanVisitor(DFSF).visit(Inst);
+ if (IsTerminator)
+ break;
+ Inst = Next;
+ }
+ }
+
+ // We will not necessarily be able to compute the shadow for every phi node
+ // until we have visited every block. Therefore, the code that handles phi
+ // nodes adds them to the PHIFixups list so that they can be properly
+ // handled here.
+ for (std::vector<std::pair<PHINode *, PHINode *>>::iterator
+ i = DFSF.PHIFixups.begin(),
+ e = DFSF.PHIFixups.end();
+ i != e; ++i) {
+ for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
+ ++val) {
+ i->second->setIncomingValue(
+ val, DFSF.getShadow(i->first->getIncomingValue(val)));
+ }
+ }
+
+ // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
+ // places (i.e. instructions in basic blocks we haven't even begun visiting
+ // yet). To make our life easier, do this work in a pass after the main
+ // instrumentation.
+ if (ClDebugNonzeroLabels) {
+ for (Value *V : DFSF.NonZeroChecks) {
+ Instruction *Pos;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Pos = I->getNextNode();
+ else
+ Pos = &DFSF.F->getEntryBlock().front();
+ while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
+ Pos = Pos->getNextNode();
+ IRBuilder<> IRB(Pos);
Value *PrimitiveShadow = DFSF.collapseToPrimitiveShadow(V, Pos);
Value *Ne =
IRB.CreateICmpNE(PrimitiveShadow, DFSF.DFS.ZeroPrimitiveShadow);
- BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
- Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
- IRBuilder<> ThenIRB(BI);
- ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {});
- }
- }
- }
-
- return Changed || !FnsToInstrument.empty() ||
- M.global_size() != InitialGlobalSize || M.size() != InitialModuleSize;
-}
-
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
+ IRBuilder<> ThenIRB(BI);
+ ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {});
+ }
+ }
+ }
+
+ return Changed || !FnsToInstrument.empty() ||
+ M.global_size() != InitialGlobalSize || M.size() != InitialModuleSize;
+}
+
Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) {
Value *Base = IRB.CreatePointerCast(DFS.ArgTLS, DFS.IntptrTy);
if (ArgOffset)
@@ -1310,12 +1310,12 @@ Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) {
return IRB.CreateIntToPtr(Base, PointerType::get(DFS.getShadowTy(T), 0),
"_dfsarg");
}
-
+
Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) {
return IRB.CreatePointerCast(
DFS.RetvalTLS, PointerType::get(DFS.getShadowTy(T), 0), "_dfsret");
-}
-
+}
+
Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
unsigned ArgOffset = 0;
const DataLayout &DL = F->getParent()->getDataLayout();
@@ -1325,7 +1325,7 @@ Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
break;
continue;
}
-
+
unsigned Size = DL.getTypeAllocSize(DFS.getShadowTy(&FArg));
if (A != &FArg) {
ArgOffset += alignTo(Size, kShadowTLSAlignment);
@@ -1333,7 +1333,7 @@ Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
break; // ArgTLS overflows, uses a zero shadow.
continue;
}
-
+
if (ArgOffset + Size > kArgTLSSize)
break; // ArgTLS overflows, uses a zero shadow.
@@ -1345,224 +1345,224 @@ Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
}
return DFS.getZeroShadow(A);
-}
-
-Value *DFSanFunction::getShadow(Value *V) {
- if (!isa<Argument>(V) && !isa<Instruction>(V))
+}
+
+Value *DFSanFunction::getShadow(Value *V) {
+ if (!isa<Argument>(V) && !isa<Instruction>(V))
return DFS.getZeroShadow(V);
- Value *&Shadow = ValShadowMap[V];
- if (!Shadow) {
- if (Argument *A = dyn_cast<Argument>(V)) {
- if (IsNativeABI)
+ Value *&Shadow = ValShadowMap[V];
+ if (!Shadow) {
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ if (IsNativeABI)
return DFS.getZeroShadow(V);
- switch (IA) {
- case DataFlowSanitizer::IA_TLS: {
+ switch (IA) {
+ case DataFlowSanitizer::IA_TLS: {
Shadow = getShadowForTLSArgument(A);
- break;
- }
- case DataFlowSanitizer::IA_Args: {
- unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
- Function::arg_iterator i = F->arg_begin();
- while (ArgIdx--)
- ++i;
- Shadow = &*i;
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
+ Function::arg_iterator i = F->arg_begin();
+ while (ArgIdx--)
+ ++i;
+ Shadow = &*i;
assert(Shadow->getType() == DFS.PrimitiveShadowTy);
- break;
- }
- }
- NonZeroChecks.push_back(Shadow);
- } else {
+ break;
+ }
+ }
+ NonZeroChecks.push_back(Shadow);
+ } else {
Shadow = DFS.getZeroShadow(V);
- }
- }
- return Shadow;
-}
-
-void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
- assert(!ValShadowMap.count(I));
+ }
+ }
+ return Shadow;
+}
+
+void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
+ assert(!ValShadowMap.count(I));
assert(DFS.shouldTrackFieldsAndIndices() ||
Shadow->getType() == DFS.PrimitiveShadowTy);
- ValShadowMap[I] = Shadow;
-}
-
-Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
- assert(Addr != RetvalTLS && "Reinstrumenting?");
- IRBuilder<> IRB(Pos);
- Value *ShadowPtrMaskValue;
- if (DFSanRuntimeShadowMask)
- ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask);
- else
- ShadowPtrMaskValue = ShadowPtrMask;
- return IRB.CreateIntToPtr(
- IRB.CreateMul(
- IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy),
- IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)),
- ShadowPtrMul),
+ ValShadowMap[I] = Shadow;
+}
+
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
+ assert(Addr != RetvalTLS && "Reinstrumenting?");
+ IRBuilder<> IRB(Pos);
+ Value *ShadowPtrMaskValue;
+ if (DFSanRuntimeShadowMask)
+ ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask);
+ else
+ ShadowPtrMaskValue = ShadowPtrMask;
+ return IRB.CreateIntToPtr(
+ IRB.CreateMul(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy),
+ IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)),
+ ShadowPtrMul),
PrimitiveShadowPtrTy);
-}
-
+}
+
Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
Instruction *Pos) {
Value *PrimitiveValue = combineShadows(V1, V2, Pos);
return expandFromPrimitiveShadow(T, PrimitiveValue, Pos);
}
-// Generates IR to compute the union of the two given shadows, inserting it
+// Generates IR to compute the union of the two given shadows, inserting it
 // before Pos. The combined value has primitive type.
-Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
+Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
if (DFS.isZeroShadow(V1))
return collapseToPrimitiveShadow(V2, Pos);
if (DFS.isZeroShadow(V2))
return collapseToPrimitiveShadow(V1, Pos);
- if (V1 == V2)
+ if (V1 == V2)
return collapseToPrimitiveShadow(V1, Pos);
-
- auto V1Elems = ShadowElements.find(V1);
- auto V2Elems = ShadowElements.find(V2);
- if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
- if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
- V2Elems->second.begin(), V2Elems->second.end())) {
+
+ auto V1Elems = ShadowElements.find(V1);
+ auto V2Elems = ShadowElements.find(V2);
+ if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
+ if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
+ V2Elems->second.begin(), V2Elems->second.end())) {
return collapseToPrimitiveShadow(V1, Pos);
- } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
- V1Elems->second.begin(), V1Elems->second.end())) {
+ } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
+ V1Elems->second.begin(), V1Elems->second.end())) {
return collapseToPrimitiveShadow(V2, Pos);
- }
- } else if (V1Elems != ShadowElements.end()) {
- if (V1Elems->second.count(V2))
+ }
+ } else if (V1Elems != ShadowElements.end()) {
+ if (V1Elems->second.count(V2))
return collapseToPrimitiveShadow(V1, Pos);
- } else if (V2Elems != ShadowElements.end()) {
- if (V2Elems->second.count(V1))
+ } else if (V2Elems != ShadowElements.end()) {
+ if (V2Elems->second.count(V1))
return collapseToPrimitiveShadow(V2, Pos);
- }
-
- auto Key = std::make_pair(V1, V2);
- if (V1 > V2)
- std::swap(Key.first, Key.second);
+ }
+
+ auto Key = std::make_pair(V1, V2);
+ if (V1 > V2)
+ std::swap(Key.first, Key.second);
CachedShadow &CCS = CachedShadows[Key];
- if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent()))
- return CCS.Shadow;
-
+ if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent()))
+ return CCS.Shadow;
+
  // Converts input shadows to shadows with primitive types.
Value *PV1 = collapseToPrimitiveShadow(V1, Pos);
Value *PV2 = collapseToPrimitiveShadow(V2, Pos);
- IRBuilder<> IRB(Pos);
+ IRBuilder<> IRB(Pos);
if (ClFast16Labels) {
CCS.Block = Pos->getParent();
CCS.Shadow = IRB.CreateOr(PV1, PV2);
} else if (AvoidNewBlocks) {
CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {PV1, PV2});
- Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
- Call->addParamAttr(0, Attribute::ZExt);
- Call->addParamAttr(1, Attribute::ZExt);
-
- CCS.Block = Pos->getParent();
- CCS.Shadow = Call;
- } else {
- BasicBlock *Head = Pos->getParent();
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
+
+ CCS.Block = Pos->getParent();
+ CCS.Shadow = Call;
+ } else {
+ BasicBlock *Head = Pos->getParent();
Value *Ne = IRB.CreateICmpNE(PV1, PV2);
- BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
- Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
- IRBuilder<> ThenIRB(BI);
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
+ IRBuilder<> ThenIRB(BI);
CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {PV1, PV2});
- Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
- Call->addParamAttr(0, Attribute::ZExt);
- Call->addParamAttr(1, Attribute::ZExt);
-
- BasicBlock *Tail = BI->getSuccessor(0);
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
+
+ BasicBlock *Tail = BI->getSuccessor(0);
PHINode *Phi =
PHINode::Create(DFS.PrimitiveShadowTy, 2, "", &Tail->front());
- Phi->addIncoming(Call, Call->getParent());
+ Phi->addIncoming(Call, Call->getParent());
Phi->addIncoming(PV1, Head);
-
- CCS.Block = Tail;
- CCS.Shadow = Phi;
- }
-
- std::set<Value *> UnionElems;
- if (V1Elems != ShadowElements.end()) {
- UnionElems = V1Elems->second;
- } else {
- UnionElems.insert(V1);
- }
- if (V2Elems != ShadowElements.end()) {
- UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end());
- } else {
- UnionElems.insert(V2);
- }
- ShadowElements[CCS.Shadow] = std::move(UnionElems);
-
- return CCS.Shadow;
-}
-
-// A convenience function which folds the shadows of each of the operands
-// of the provided instruction Inst, inserting the IR before Inst. Returns
-// the computed union Value.
-Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
- if (Inst->getNumOperands() == 0)
+
+ CCS.Block = Tail;
+ CCS.Shadow = Phi;
+ }
+
+ std::set<Value *> UnionElems;
+ if (V1Elems != ShadowElements.end()) {
+ UnionElems = V1Elems->second;
+ } else {
+ UnionElems.insert(V1);
+ }
+ if (V2Elems != ShadowElements.end()) {
+ UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end());
+ } else {
+ UnionElems.insert(V2);
+ }
+ ShadowElements[CCS.Shadow] = std::move(UnionElems);
+
+ return CCS.Shadow;
+}
+
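Besides the dominator-tree cache, combineShadows keeps ShadowElements, a map from each computed union to the set of original shadows it already covers, so that a redundant union (one operand's set containing the other's) can return an existing value instead of emitting new IR. A small standalone sketch of that subset test, with illustrative container and element types:

#include <algorithm>
#include <set>
#include <string>

using ShadowSet = std::set<std::string>;

// True when every element of B is already covered by A; in that case
// "A union B" can simply reuse the shadow that produced A.
bool unionIsRedundant(const ShadowSet &A, const ShadowSet &B) {
  return std::includes(A.begin(), A.end(), B.begin(), B.end());
}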
+// A convenience function which folds the shadows of each of the operands
+// of the provided instruction Inst, inserting the IR before Inst. Returns
+// the computed union Value.
+Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
+ if (Inst->getNumOperands() == 0)
return DFS.getZeroShadow(Inst);
-
- Value *Shadow = getShadow(Inst->getOperand(0));
- for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
- Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
- }
+
+ Value *Shadow = getShadow(Inst->getOperand(0));
+ for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
+ Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
+ }
return expandFromPrimitiveShadow(Inst->getType(), Shadow, Inst);
-}
-
-Value *DFSanVisitor::visitOperandShadowInst(Instruction &I) {
- Value *CombinedShadow = DFSF.combineOperandShadows(&I);
- DFSF.setShadow(&I, CombinedShadow);
- return CombinedShadow;
-}
-
-// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
+}
+
+Value *DFSanVisitor::visitOperandShadowInst(Instruction &I) {
+ Value *CombinedShadow = DFSF.combineOperandShadows(&I);
+ DFSF.setShadow(&I, CombinedShadow);
+ return CombinedShadow;
+}
+
+// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
// Addr has alignment Align, and take the union of each of those shadows. The
// returned shadow always has primitive type.
-Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
- Instruction *Pos) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
- const auto i = AllocaShadowMap.find(AI);
- if (i != AllocaShadowMap.end()) {
- IRBuilder<> IRB(Pos);
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
+ Instruction *Pos) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ const auto i = AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
return IRB.CreateLoad(DFS.PrimitiveShadowTy, i->second);
- }
- }
-
- const llvm::Align ShadowAlign(Align * DFS.ShadowWidthBytes);
- SmallVector<const Value *, 2> Objs;
+ }
+ }
+
+ const llvm::Align ShadowAlign(Align * DFS.ShadowWidthBytes);
+ SmallVector<const Value *, 2> Objs;
getUnderlyingObjects(Addr, Objs);
- bool AllConstants = true;
- for (const Value *Obj : Objs) {
- if (isa<Function>(Obj) || isa<BlockAddress>(Obj))
- continue;
- if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant())
- continue;
-
- AllConstants = false;
- break;
- }
- if (AllConstants)
+ bool AllConstants = true;
+ for (const Value *Obj : Objs) {
+ if (isa<Function>(Obj) || isa<BlockAddress>(Obj))
+ continue;
+ if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant())
+ continue;
+
+ AllConstants = false;
+ break;
+ }
+ if (AllConstants)
return DFS.ZeroPrimitiveShadow;
-
- Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
- switch (Size) {
- case 0:
+
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ switch (Size) {
+ case 0:
return DFS.ZeroPrimitiveShadow;
- case 1: {
+ case 1: {
LoadInst *LI = new LoadInst(DFS.PrimitiveShadowTy, ShadowAddr, "", Pos);
- LI->setAlignment(ShadowAlign);
- return LI;
- }
- case 2: {
- IRBuilder<> IRB(Pos);
+ LI->setAlignment(ShadowAlign);
+ return LI;
+ }
+ case 2: {
+ IRBuilder<> IRB(Pos);
Value *ShadowAddr1 = IRB.CreateGEP(DFS.PrimitiveShadowTy, ShadowAddr,
- ConstantInt::get(DFS.IntptrTy, 1));
- return combineShadows(
+ ConstantInt::get(DFS.IntptrTy, 1));
+ return combineShadows(
IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr, ShadowAlign),
IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr1, ShadowAlign),
Pos);
- }
- }
+ }
+ }
if (ClFast16Labels && Size % (64 / DFS.ShadowWidthBits) == 0) {
// First OR all the WideShadows, then OR individual shadows within the
@@ -1587,226 +1587,226 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
}
return IRB.CreateTrunc(CombinedWideShadow, DFS.PrimitiveShadowTy);
}
- if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0) {
- // Fast path for the common case where each byte has identical shadow: load
- // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
- // shadow is non-equal.
- BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
- IRBuilder<> FallbackIRB(FallbackBB);
- CallInst *FallbackCall = FallbackIRB.CreateCall(
- DFS.DFSanUnionLoadFn,
- {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
-
- // Compare each of the shadows stored in the loaded 64 bits to each other,
- // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
- IRBuilder<> IRB(Pos);
- Value *WideAddr =
- IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
- Value *WideShadow =
- IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+ if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0) {
+ // Fast path for the common case where each byte has identical shadow: load
+ // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
+ // shadow is non-equal.
+ BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ IRBuilder<> FallbackIRB(FallbackBB);
+ CallInst *FallbackCall = FallbackIRB.CreateCall(
+ DFS.DFSanUnionLoadFn,
+ {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+
+ // Compare each of the shadows stored in the loaded 64 bits to each other,
+ // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
+ IRBuilder<> IRB(Pos);
+ Value *WideAddr =
+ IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+ Value *WideShadow =
+ IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.PrimitiveShadowTy);
- Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
- Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
- Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
- Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
-
- BasicBlock *Head = Pos->getParent();
- BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator());
-
- if (DomTreeNode *OldNode = DT.getNode(Head)) {
- std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
-
- DomTreeNode *NewNode = DT.addNewBlock(Tail, Head);
- for (auto Child : Children)
- DT.changeImmediateDominator(Child, NewNode);
- }
-
- // In the following code LastBr will refer to the previous basic block's
- // conditional branch instruction, whose true successor is fixed up to point
- // to the next block during the loop below or to the tail after the final
- // iteration.
- BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
- ReplaceInstWithInst(Head->getTerminator(), LastBr);
- DT.addNewBlock(FallbackBB, Head);
-
- for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
- Ofs += 64 / DFS.ShadowWidthBits) {
- BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
- DT.addNewBlock(NextBB, LastBr->getParent());
- IRBuilder<> NextIRB(NextBB);
- WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
- ConstantInt::get(DFS.IntptrTy, 1));
- Value *NextWideShadow = NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(),
- WideAddr, ShadowAlign);
- ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
- LastBr->setSuccessor(0, NextBB);
- LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
- }
-
- LastBr->setSuccessor(0, Tail);
- FallbackIRB.CreateBr(Tail);
+ Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
+ Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
+ Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
+ Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
+
+ BasicBlock *Head = Pos->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator());
+
+ if (DomTreeNode *OldNode = DT.getNode(Head)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT.addNewBlock(Tail, Head);
+ for (auto Child : Children)
+ DT.changeImmediateDominator(Child, NewNode);
+ }
+
+ // In the following code LastBr will refer to the previous basic block's
+ // conditional branch instruction, whose true successor is fixed up to point
+ // to the next block during the loop below or to the tail after the final
+ // iteration.
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
+ ReplaceInstWithInst(Head->getTerminator(), LastBr);
+ DT.addNewBlock(FallbackBB, Head);
+
+ for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
+ Ofs += 64 / DFS.ShadowWidthBits) {
+ BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ DT.addNewBlock(NextBB, LastBr->getParent());
+ IRBuilder<> NextIRB(NextBB);
+ WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+ ConstantInt::get(DFS.IntptrTy, 1));
+ Value *NextWideShadow = NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(),
+ WideAddr, ShadowAlign);
+ ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
+ LastBr->setSuccessor(0, NextBB);
+ LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
+ }
+
+ LastBr->setSuccessor(0, Tail);
+ FallbackIRB.CreateBr(Tail);
PHINode *Shadow =
PHINode::Create(DFS.PrimitiveShadowTy, 2, "", &Tail->front());
- Shadow->addIncoming(FallbackCall, FallbackBB);
- Shadow->addIncoming(TruncShadow, LastBr->getParent());
- return Shadow;
- }
-
- IRBuilder<> IRB(Pos);
+ Shadow->addIncoming(FallbackCall, FallbackBB);
+ Shadow->addIncoming(TruncShadow, LastBr->getParent());
+ return Shadow;
+ }
+
+ IRBuilder<> IRB(Pos);
FunctionCallee &UnionLoadFn =
ClFast16Labels ? DFS.DFSanUnionLoadFast16LabelsFn : DFS.DFSanUnionLoadFn;
- CallInst *FallbackCall = IRB.CreateCall(
+ CallInst *FallbackCall = IRB.CreateCall(
UnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
- return FallbackCall;
-}
-
-void DFSanVisitor::visitLoadInst(LoadInst &LI) {
- auto &DL = LI.getModule()->getDataLayout();
- uint64_t Size = DL.getTypeStoreSize(LI.getType());
- if (Size == 0) {
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ return FallbackCall;
+}
+
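A standalone sketch (not part of the pass) of the lane-equality trick used in the fast path above: all ShadowWidthBits-wide shadows packed into a 64-bit word are identical exactly when the word equals its rotation by the lane width, mirroring the (WideShadow rotl ShadowWidthBits) == WideShadow comparison. The 16-bit width is an assumption for illustration.

    #include <cassert>
    #include <cstdint>

    // Valid for 0 < ShadowWidthBits < 64.
    static bool allLanesEqual(uint64_t WideShadow, unsigned ShadowWidthBits) {
      uint64_t Rot = (WideShadow << ShadowWidthBits) |
                     (WideShadow >> (64 - ShadowWidthBits));
      return Rot == WideShadow;
    }

    int main() {
      assert(allLanesEqual(0x0001000100010001ULL, 16));   // four equal 16-bit labels
      assert(!allLanesEqual(0x0001000100010002ULL, 16));  // one lane differs
      return 0;
    }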
+void DFSanVisitor::visitLoadInst(LoadInst &LI) {
+ auto &DL = LI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ if (Size == 0) {
DFSF.setShadow(&LI, DFSF.DFS.getZeroShadow(&LI));
- return;
- }
-
- Align Alignment = ClPreserveAlignment ? LI.getAlign() : Align(1);
+ return;
+ }
+
+ Align Alignment = ClPreserveAlignment ? LI.getAlign() : Align(1);
Value *PrimitiveShadow =
- DFSF.loadShadow(LI.getPointerOperand(), Size, Alignment.value(), &LI);
- if (ClCombinePointerLabelsOnLoad) {
- Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
+ DFSF.loadShadow(LI.getPointerOperand(), Size, Alignment.value(), &LI);
+ if (ClCombinePointerLabelsOnLoad) {
+ Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
PrimitiveShadow = DFSF.combineShadows(PrimitiveShadow, PtrShadow, &LI);
- }
+ }
if (!DFSF.DFS.isZeroShadow(PrimitiveShadow))
DFSF.NonZeroChecks.push_back(PrimitiveShadow);
-
+
Value *Shadow =
DFSF.expandFromPrimitiveShadow(LI.getType(), PrimitiveShadow, &LI);
- DFSF.setShadow(&LI, Shadow);
- if (ClEventCallbacks) {
- IRBuilder<> IRB(&LI);
+ DFSF.setShadow(&LI, Shadow);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&LI);
Value *Addr8 = IRB.CreateBitCast(LI.getPointerOperand(), DFSF.DFS.Int8Ptr);
IRB.CreateCall(DFSF.DFS.DFSanLoadCallbackFn, {PrimitiveShadow, Addr8});
- }
-}
-
+ }
+}
+
void DFSanFunction::storePrimitiveShadow(Value *Addr, uint64_t Size,
Align Alignment,
Value *PrimitiveShadow,
Instruction *Pos) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
- const auto i = AllocaShadowMap.find(AI);
- if (i != AllocaShadowMap.end()) {
- IRBuilder<> IRB(Pos);
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ const auto i = AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
IRB.CreateStore(PrimitiveShadow, i->second);
- return;
- }
- }
-
- const Align ShadowAlign(Alignment.value() * DFS.ShadowWidthBytes);
- IRBuilder<> IRB(Pos);
- Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ return;
+ }
+ }
+
+ const Align ShadowAlign(Alignment.value() * DFS.ShadowWidthBytes);
+ IRBuilder<> IRB(Pos);
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
if (DFS.isZeroShadow(PrimitiveShadow)) {
- IntegerType *ShadowTy =
- IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidthBits);
- Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
- Value *ExtShadowAddr =
- IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
- IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
- return;
- }
-
- const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
- uint64_t Offset = 0;
- if (Size >= ShadowVecSize) {
+ IntegerType *ShadowTy =
+ IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidthBits);
+ Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
+ Value *ExtShadowAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
+ IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
+ return;
+ }
+
+ const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
+ uint64_t Offset = 0;
+ if (Size >= ShadowVecSize) {
auto *ShadowVecTy =
FixedVectorType::get(DFS.PrimitiveShadowTy, ShadowVecSize);
- Value *ShadowVec = UndefValue::get(ShadowVecTy);
- for (unsigned i = 0; i != ShadowVecSize; ++i) {
- ShadowVec = IRB.CreateInsertElement(
+ Value *ShadowVec = UndefValue::get(ShadowVecTy);
+ for (unsigned i = 0; i != ShadowVecSize; ++i) {
+ ShadowVec = IRB.CreateInsertElement(
ShadowVec, PrimitiveShadow,
ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i));
- }
- Value *ShadowVecAddr =
- IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
- do {
- Value *CurShadowVecAddr =
- IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset);
- IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
- Size -= ShadowVecSize;
- ++Offset;
- } while (Size >= ShadowVecSize);
- Offset *= ShadowVecSize;
- }
- while (Size > 0) {
- Value *CurShadowAddr =
+ }
+ Value *ShadowVecAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
+ do {
+ Value *CurShadowVecAddr =
+ IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset);
+ IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
+ Size -= ShadowVecSize;
+ ++Offset;
+ } while (Size >= ShadowVecSize);
+ Offset *= ShadowVecSize;
+ }
+ while (Size > 0) {
+ Value *CurShadowAddr =
IRB.CreateConstGEP1_32(DFS.PrimitiveShadowTy, ShadowAddr, Offset);
IRB.CreateAlignedStore(PrimitiveShadow, CurShadowAddr, ShadowAlign);
- --Size;
- ++Offset;
- }
-}
-
-void DFSanVisitor::visitStoreInst(StoreInst &SI) {
- auto &DL = SI.getModule()->getDataLayout();
- uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
- if (Size == 0)
- return;
-
- const Align Alignment = ClPreserveAlignment ? SI.getAlign() : Align(1);
-
- Value* Shadow = DFSF.getShadow(SI.getValueOperand());
+ --Size;
+ ++Offset;
+ }
+}
+
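A standalone sketch of the chunking arithmetic in storePrimitiveShadow above: with an assumed 16-bit shadow width, ShadowVecSize is 128 / 16 == 8, so Size bytes of application memory are covered by Size / 8 vector stores followed by Size % 8 scalar stores of the same primitive shadow.

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned ShadowWidthBits = 16;                   // assumed label width
      const unsigned ShadowVecSize = 128 / ShadowWidthBits;  // 8 shadows per vector store
      uint64_t Size = 19, Offset = 0, VecStores = 0, ScalarStores = 0;
      while (Size >= ShadowVecSize) {  // vectorized part (the do/while above)
        ++VecStores;
        Size -= ShadowVecSize;
        ++Offset;
      }
      Offset *= ShadowVecSize;         // convert chunk count to an element index
      while (Size > 0) {               // scalar tail
        ++ScalarStores;
        --Size;
        ++Offset;
      }
      assert(VecStores == 2 && ScalarStores == 3 && Offset == 19);
      return 0;
    }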
+void DFSanVisitor::visitStoreInst(StoreInst &SI) {
+ auto &DL = SI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
+ if (Size == 0)
+ return;
+
+ const Align Alignment = ClPreserveAlignment ? SI.getAlign() : Align(1);
+
+ Value* Shadow = DFSF.getShadow(SI.getValueOperand());
Value *PrimitiveShadow;
- if (ClCombinePointerLabelsOnStore) {
- Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
+ if (ClCombinePointerLabelsOnStore) {
+ Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
PrimitiveShadow = DFSF.combineShadows(Shadow, PtrShadow, &SI);
} else {
PrimitiveShadow = DFSF.collapseToPrimitiveShadow(Shadow, &SI);
- }
+ }
DFSF.storePrimitiveShadow(SI.getPointerOperand(), Size, Alignment,
PrimitiveShadow, &SI);
- if (ClEventCallbacks) {
- IRBuilder<> IRB(&SI);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&SI);
Value *Addr8 = IRB.CreateBitCast(SI.getPointerOperand(), DFSF.DFS.Int8Ptr);
IRB.CreateCall(DFSF.DFS.DFSanStoreCallbackFn, {PrimitiveShadow, Addr8});
- }
-}
-
-void DFSanVisitor::visitUnaryOperator(UnaryOperator &UO) {
- visitOperandShadowInst(UO);
-}
-
-void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
- visitOperandShadowInst(BO);
-}
-
-void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
-
-void DFSanVisitor::visitCmpInst(CmpInst &CI) {
- Value *CombinedShadow = visitOperandShadowInst(CI);
- if (ClEventCallbacks) {
- IRBuilder<> IRB(&CI);
- IRB.CreateCall(DFSF.DFS.DFSanCmpCallbackFn, CombinedShadow);
- }
-}
-
-void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- visitOperandShadowInst(GEPI);
-}
-
-void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
- visitOperandShadowInst(I);
-}
-
-void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
- visitOperandShadowInst(I);
-}
-
-void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
- visitOperandShadowInst(I);
-}
-
-void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
+ }
+}
+
+void DFSanVisitor::visitUnaryOperator(UnaryOperator &UO) {
+ visitOperandShadowInst(UO);
+}
+
+void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
+ visitOperandShadowInst(BO);
+}
+
+void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitCmpInst(CmpInst &CI) {
+ Value *CombinedShadow = visitOperandShadowInst(CI);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&CI);
+ IRB.CreateCall(DFSF.DFS.DFSanCmpCallbackFn, CombinedShadow);
+ }
+}
+
+void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ visitOperandShadowInst(GEPI);
+}
+
+void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
if (!DFSF.DFS.shouldTrackFieldsAndIndices()) {
visitOperandShadowInst(I);
return;
@@ -1817,9 +1817,9 @@ void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
Value *AggShadow = DFSF.getShadow(Agg);
Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
DFSF.setShadow(&I, ResShadow);
-}
-
-void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
+}
+
+void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
if (!DFSF.DFS.shouldTrackFieldsAndIndices()) {
visitOperandShadowInst(I);
return;
@@ -1830,93 +1830,93 @@ void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
Value *InsShadow = DFSF.getShadow(I.getInsertedValueOperand());
Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
DFSF.setShadow(&I, Res);
-}
-
-void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
- bool AllLoadsStores = true;
- for (User *U : I.users()) {
- if (isa<LoadInst>(U))
- continue;
-
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getPointerOperand() == &I)
- continue;
- }
-
- AllLoadsStores = false;
- break;
- }
- if (AllLoadsStores) {
- IRBuilder<> IRB(&I);
+}
+
+void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
+ bool AllLoadsStores = true;
+ for (User *U : I.users()) {
+ if (isa<LoadInst>(U))
+ continue;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getPointerOperand() == &I)
+ continue;
+ }
+
+ AllLoadsStores = false;
+ break;
+ }
+ if (AllLoadsStores) {
+ IRBuilder<> IRB(&I);
DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.PrimitiveShadowTy);
- }
+ }
DFSF.setShadow(&I, DFSF.DFS.ZeroPrimitiveShadow);
-}
-
-void DFSanVisitor::visitSelectInst(SelectInst &I) {
- Value *CondShadow = DFSF.getShadow(I.getCondition());
- Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
- Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
+}
+
+void DFSanVisitor::visitSelectInst(SelectInst &I) {
+ Value *CondShadow = DFSF.getShadow(I.getCondition());
+ Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
+ Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
Value *ShadowSel = nullptr;
-
- if (isa<VectorType>(I.getCondition()->getType())) {
+
+ if (isa<VectorType>(I.getCondition()->getType())) {
ShadowSel = DFSF.combineShadowsThenConvert(I.getType(), TrueShadow,
FalseShadow, &I);
- } else {
- if (TrueShadow == FalseShadow) {
- ShadowSel = TrueShadow;
- } else {
- ShadowSel =
- SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
- }
- }
+ } else {
+ if (TrueShadow == FalseShadow) {
+ ShadowSel = TrueShadow;
+ } else {
+ ShadowSel =
+ SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
+ }
+ }
DFSF.setShadow(&I, ClTrackSelectControlFlow
? DFSF.combineShadowsThenConvert(
I.getType(), CondShadow, ShadowSel, &I)
: ShadowSel);
-}
-
-void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
- IRBuilder<> IRB(&I);
- Value *ValShadow = DFSF.getShadow(I.getValue());
- IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn,
- {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(
- *DFSF.DFS.Ctx)),
- IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)});
-}
-
-void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
- IRBuilder<> IRB(&I);
- Value *RawDestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
- Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
- Value *LenShadow =
- IRB.CreateMul(I.getLength(), ConstantInt::get(I.getLength()->getType(),
- DFSF.DFS.ShadowWidthBytes));
- Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
- Value *DestShadow = IRB.CreateBitCast(RawDestShadow, Int8Ptr);
- SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
- auto *MTI = cast<MemTransferInst>(
- IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
- {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()}));
- if (ClPreserveAlignment) {
- MTI->setDestAlignment(I.getDestAlign() * DFSF.DFS.ShadowWidthBytes);
- MTI->setSourceAlignment(I.getSourceAlign() * DFSF.DFS.ShadowWidthBytes);
- } else {
- MTI->setDestAlignment(Align(DFSF.DFS.ShadowWidthBytes));
- MTI->setSourceAlignment(Align(DFSF.DFS.ShadowWidthBytes));
- }
- if (ClEventCallbacks) {
- IRB.CreateCall(DFSF.DFS.DFSanMemTransferCallbackFn,
- {RawDestShadow, I.getLength()});
- }
-}
-
-void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
- if (!DFSF.IsNativeABI && RI.getReturnValue()) {
- switch (DFSF.IA) {
- case DataFlowSanitizer::IA_TLS: {
- Value *S = DFSF.getShadow(RI.getReturnValue());
- IRBuilder<> IRB(&RI);
+}
+
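A toy model of the select-shadow policy in visitSelectInst above, assuming for illustration that labels are small bitmasks whose union is bitwise OR (as in the ClFast16Labels mode referenced earlier); the generic mode unions labels through the runtime instead. The select's shadow is the chosen arm's shadow, unioned with the condition's shadow only when ClTrackSelectControlFlow is enabled.

    #include <cassert>
    #include <cstdint>

    static uint16_t selectShadow(bool Cond, uint16_t CondShadow, uint16_t TrueShadow,
                                 uint16_t FalseShadow, bool TrackSelectControlFlow) {
      uint16_t Sel = (TrueShadow == FalseShadow) ? TrueShadow
                                                 : (Cond ? TrueShadow : FalseShadow);
      return TrackSelectControlFlow ? uint16_t(CondShadow | Sel) : Sel;
    }

    int main() {
      assert(selectShadow(true, 0x4, 0x1, 0x2, true) == 0x5);   // condition label folded in
      assert(selectShadow(true, 0x4, 0x1, 0x2, false) == 0x1);  // condition label ignored
      return 0;
    }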
+void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *ValShadow = DFSF.getShadow(I.getValue());
+ IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn,
+ {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(
+ *DFSF.DFS.Ctx)),
+ IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)});
+}
+
+void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *RawDestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
+ Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
+ Value *LenShadow =
+ IRB.CreateMul(I.getLength(), ConstantInt::get(I.getLength()->getType(),
+ DFSF.DFS.ShadowWidthBytes));
+ Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
+ Value *DestShadow = IRB.CreateBitCast(RawDestShadow, Int8Ptr);
+ SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
+ auto *MTI = cast<MemTransferInst>(
+ IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
+ {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()}));
+ if (ClPreserveAlignment) {
+ MTI->setDestAlignment(I.getDestAlign() * DFSF.DFS.ShadowWidthBytes);
+ MTI->setSourceAlignment(I.getSourceAlign() * DFSF.DFS.ShadowWidthBytes);
+ } else {
+ MTI->setDestAlignment(Align(DFSF.DFS.ShadowWidthBytes));
+ MTI->setSourceAlignment(Align(DFSF.DFS.ShadowWidthBytes));
+ }
+ if (ClEventCallbacks) {
+ IRB.CreateCall(DFSF.DFS.DFSanMemTransferCallbackFn,
+ {RawDestShadow, I.getLength()});
+ }
+}
+
+void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
+ if (!DFSF.IsNativeABI && RI.getReturnValue()) {
+ switch (DFSF.IA) {
+ case DataFlowSanitizer::IA_TLS: {
+ Value *S = DFSF.getShadow(RI.getReturnValue());
+ IRBuilder<> IRB(&RI);
Type *RT = DFSF.F->getFunctionType()->getReturnType();
unsigned Size =
getDataLayout().getTypeAllocSize(DFSF.DFS.getShadowTy(RT));
@@ -1926,166 +1926,166 @@ void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
IRB.CreateAlignedStore(S, DFSF.getRetvalTLS(RT, IRB),
kShadowTLSAlignment);
}
- break;
- }
- case DataFlowSanitizer::IA_Args: {
- IRBuilder<> IRB(&RI);
- Type *RT = DFSF.F->getFunctionType()->getReturnType();
- Value *InsVal =
- IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
- Value *InsShadow =
- IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
- RI.setOperand(0, InsShadow);
- break;
- }
- }
- }
-}
-
-void DFSanVisitor::visitCallBase(CallBase &CB) {
- Function *F = CB.getCalledFunction();
- if ((F && F->isIntrinsic()) || CB.isInlineAsm()) {
- visitOperandShadowInst(CB);
- return;
- }
-
- // Calls to this function are synthesized in wrappers, and we shouldn't
- // instrument them.
- if (F == DFSF.DFS.DFSanVarargWrapperFn.getCallee()->stripPointerCasts())
- return;
-
- IRBuilder<> IRB(&CB);
-
- DenseMap<Value *, Function *>::iterator i =
- DFSF.DFS.UnwrappedFnMap.find(CB.getCalledOperand());
- if (i != DFSF.DFS.UnwrappedFnMap.end()) {
- Function *F = i->second;
- switch (DFSF.DFS.getWrapperKind(F)) {
- case DataFlowSanitizer::WK_Warning:
- CB.setCalledFunction(F);
- IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
- IRB.CreateGlobalStringPtr(F->getName()));
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ IRBuilder<> IRB(&RI);
+ Type *RT = DFSF.F->getFunctionType()->getReturnType();
+ Value *InsVal =
+ IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
+ Value *InsShadow =
+ IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
+ RI.setOperand(0, InsShadow);
+ break;
+ }
+ }
+ }
+}
+
+void DFSanVisitor::visitCallBase(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+ if ((F && F->isIntrinsic()) || CB.isInlineAsm()) {
+ visitOperandShadowInst(CB);
+ return;
+ }
+
+ // Calls to this function are synthesized in wrappers, and we shouldn't
+ // instrument them.
+ if (F == DFSF.DFS.DFSanVarargWrapperFn.getCallee()->stripPointerCasts())
+ return;
+
+ IRBuilder<> IRB(&CB);
+
+ DenseMap<Value *, Function *>::iterator i =
+ DFSF.DFS.UnwrappedFnMap.find(CB.getCalledOperand());
+ if (i != DFSF.DFS.UnwrappedFnMap.end()) {
+ Function *F = i->second;
+ switch (DFSF.DFS.getWrapperKind(F)) {
+ case DataFlowSanitizer::WK_Warning:
+ CB.setCalledFunction(F);
+ IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
+ IRB.CreateGlobalStringPtr(F->getName()));
DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB));
- return;
- case DataFlowSanitizer::WK_Discard:
- CB.setCalledFunction(F);
+ return;
+ case DataFlowSanitizer::WK_Discard:
+ CB.setCalledFunction(F);
DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB));
- return;
- case DataFlowSanitizer::WK_Functional:
- CB.setCalledFunction(F);
- visitOperandShadowInst(CB);
- return;
- case DataFlowSanitizer::WK_Custom:
- // Don't try to handle invokes of custom functions, it's too complicated.
- // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
- // wrapper.
- if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
- FunctionType *FT = F->getFunctionType();
- TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT);
- std::string CustomFName = "__dfsw_";
- CustomFName += F->getName();
- FunctionCallee CustomF = DFSF.DFS.Mod->getOrInsertFunction(
- CustomFName, CustomFn.TransformedType);
- if (Function *CustomFn = dyn_cast<Function>(CustomF.getCallee())) {
- CustomFn->copyAttributesFrom(F);
-
- // Custom functions returning non-void will write to the return label.
- if (!FT->getReturnType()->isVoidTy()) {
- CustomFn->removeAttributes(AttributeList::FunctionIndex,
- DFSF.DFS.ReadOnlyNoneAttrs);
- }
- }
-
- std::vector<Value *> Args;
-
- auto i = CB.arg_begin();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
- Type *T = (*i)->getType();
- FunctionType *ParamFT;
- if (isa<PointerType>(T) &&
- (ParamFT = dyn_cast<FunctionType>(
- cast<PointerType>(T)->getElementType()))) {
- std::string TName = "dfst";
- TName += utostr(FT->getNumParams() - n);
- TName += "$";
- TName += F->getName();
- Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
- Args.push_back(T);
- Args.push_back(
- IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
- } else {
- Args.push_back(*i);
- }
- }
-
- i = CB.arg_begin();
- const unsigned ShadowArgStart = Args.size();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ return;
+ case DataFlowSanitizer::WK_Functional:
+ CB.setCalledFunction(F);
+ visitOperandShadowInst(CB);
+ return;
+ case DataFlowSanitizer::WK_Custom:
+ // Don't try to handle invokes of custom functions, it's too complicated.
+ // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
+ // wrapper.
+ if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
+ FunctionType *FT = F->getFunctionType();
+ TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT);
+ std::string CustomFName = "__dfsw_";
+ CustomFName += F->getName();
+ FunctionCallee CustomF = DFSF.DFS.Mod->getOrInsertFunction(
+ CustomFName, CustomFn.TransformedType);
+ if (Function *CustomFn = dyn_cast<Function>(CustomF.getCallee())) {
+ CustomFn->copyAttributesFrom(F);
+
+ // Custom functions returning non-void will write to the return label.
+ if (!FT->getReturnType()->isVoidTy()) {
+ CustomFn->removeAttributes(AttributeList::FunctionIndex,
+ DFSF.DFS.ReadOnlyNoneAttrs);
+ }
+ }
+
+ std::vector<Value *> Args;
+
+ auto i = CB.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
+ Type *T = (*i)->getType();
+ FunctionType *ParamFT;
+ if (isa<PointerType>(T) &&
+ (ParamFT = dyn_cast<FunctionType>(
+ cast<PointerType>(T)->getElementType()))) {
+ std::string TName = "dfst";
+ TName += utostr(FT->getNumParams() - n);
+ TName += "$";
+ TName += F->getName();
+ Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
+ Args.push_back(T);
+ Args.push_back(
+ IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
+ } else {
+ Args.push_back(*i);
+ }
+ }
+
+ i = CB.arg_begin();
+ const unsigned ShadowArgStart = Args.size();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(
DFSF.collapseToPrimitiveShadow(DFSF.getShadow(*i), &CB));
-
- if (FT->isVarArg()) {
+
+ if (FT->isVarArg()) {
auto *LabelVATy = ArrayType::get(DFSF.DFS.PrimitiveShadowTy,
- CB.arg_size() - FT->getNumParams());
- auto *LabelVAAlloca = new AllocaInst(
- LabelVATy, getDataLayout().getAllocaAddrSpace(),
- "labelva", &DFSF.F->getEntryBlock().front());
-
- for (unsigned n = 0; i != CB.arg_end(); ++i, ++n) {
- auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
+ CB.arg_size() - FT->getNumParams());
+ auto *LabelVAAlloca = new AllocaInst(
+ LabelVATy, getDataLayout().getAllocaAddrSpace(),
+ "labelva", &DFSF.F->getEntryBlock().front());
+
+ for (unsigned n = 0; i != CB.arg_end(); ++i, ++n) {
+ auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
IRB.CreateStore(
DFSF.collapseToPrimitiveShadow(DFSF.getShadow(*i), &CB),
LabelVAPtr);
- }
-
- Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0));
- }
-
- if (!FT->getReturnType()->isVoidTy()) {
- if (!DFSF.LabelReturnAlloca) {
- DFSF.LabelReturnAlloca =
+ }
+
+ Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0));
+ }
+
+ if (!FT->getReturnType()->isVoidTy()) {
+ if (!DFSF.LabelReturnAlloca) {
+ DFSF.LabelReturnAlloca =
new AllocaInst(DFSF.DFS.PrimitiveShadowTy,
getDataLayout().getAllocaAddrSpace(),
"labelreturn", &DFSF.F->getEntryBlock().front());
- }
- Args.push_back(DFSF.LabelReturnAlloca);
- }
-
- for (i = CB.arg_begin() + FT->getNumParams(); i != CB.arg_end(); ++i)
- Args.push_back(*i);
-
- CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
- CustomCI->setCallingConv(CI->getCallingConv());
- CustomCI->setAttributes(TransformFunctionAttributes(CustomFn,
- CI->getContext(), CI->getAttributes()));
-
- // Update the parameter attributes of the custom call instruction to
- // zero extend the shadow parameters. This is required for targets
+ }
+ Args.push_back(DFSF.LabelReturnAlloca);
+ }
+
+ for (i = CB.arg_begin() + FT->getNumParams(); i != CB.arg_end(); ++i)
+ Args.push_back(*i);
+
+ CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
+ CustomCI->setCallingConv(CI->getCallingConv());
+ CustomCI->setAttributes(TransformFunctionAttributes(CustomFn,
+ CI->getContext(), CI->getAttributes()));
+
+ // Update the parameter attributes of the custom call instruction to
+ // zero extend the shadow parameters. This is required for targets
// which consider PrimitiveShadowTy an illegal type.
- for (unsigned n = 0; n < FT->getNumParams(); n++) {
- const unsigned ArgNo = ShadowArgStart + n;
+ for (unsigned n = 0; n < FT->getNumParams(); n++) {
+ const unsigned ArgNo = ShadowArgStart + n;
if (CustomCI->getArgOperand(ArgNo)->getType() ==
DFSF.DFS.PrimitiveShadowTy)
- CustomCI->addParamAttr(ArgNo, Attribute::ZExt);
- }
-
- if (!FT->getReturnType()->isVoidTy()) {
+ CustomCI->addParamAttr(ArgNo, Attribute::ZExt);
+ }
+
+ if (!FT->getReturnType()->isVoidTy()) {
LoadInst *LabelLoad = IRB.CreateLoad(DFSF.DFS.PrimitiveShadowTy,
DFSF.LabelReturnAlloca);
DFSF.setShadow(CustomCI, DFSF.expandFromPrimitiveShadow(
FT->getReturnType(), LabelLoad, &CB));
- }
-
- CI->replaceAllUsesWith(CustomCI);
- CI->eraseFromParent();
- return;
- }
- break;
- }
- }
-
- FunctionType *FT = CB.getFunctionType();
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ }
+
+ CI->replaceAllUsesWith(CustomCI);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ }
+
+ FunctionType *FT = CB.getFunctionType();
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
unsigned ArgOffset = 0;
const DataLayout &DL = getDataLayout();
for (unsigned I = 0, N = FT->getNumParams(); I != N; ++I) {
@@ -2100,26 +2100,26 @@ void DFSanVisitor::visitCallBase(CallBase &CB) {
DFSF.getArgTLS(FT->getParamType(I), ArgOffset, IRB),
kShadowTLSAlignment);
ArgOffset += alignTo(Size, kShadowTLSAlignment);
- }
- }
-
- Instruction *Next = nullptr;
- if (!CB.getType()->isVoidTy()) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- if (II->getNormalDest()->getSinglePredecessor()) {
- Next = &II->getNormalDest()->front();
- } else {
- BasicBlock *NewBB =
- SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT);
- Next = &NewBB->front();
- }
- } else {
- assert(CB.getIterator() != CB.getParent()->end());
- Next = CB.getNextNode();
- }
-
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
- IRBuilder<> NextIRB(Next);
+ }
+ }
+
+ Instruction *Next = nullptr;
+ if (!CB.getType()->isVoidTy()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ if (II->getNormalDest()->getSinglePredecessor()) {
+ Next = &II->getNormalDest()->front();
+ } else {
+ BasicBlock *NewBB =
+ SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT);
+ Next = &NewBB->front();
+ }
+ } else {
+ assert(CB.getIterator() != CB.getParent()->end());
+ Next = CB.getNextNode();
+ }
+
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ IRBuilder<> NextIRB(Next);
const DataLayout &DL = getDataLayout();
unsigned Size = DL.getTypeAllocSize(DFSF.DFS.getShadowTy(&CB));
if (Size > kRetvalTLSSize) {
@@ -2133,83 +2133,83 @@ void DFSanVisitor::visitCallBase(CallBase &CB) {
DFSF.setShadow(&CB, LI);
DFSF.NonZeroChecks.push_back(LI);
}
- }
- }
-
- // Do all instrumentation for IA_Args down here to defer tampering with the
- // CFG in a way that SplitEdge may be able to detect.
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
- FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
- Value *Func =
- IRB.CreateBitCast(CB.getCalledOperand(), PointerType::getUnqual(NewFT));
- std::vector<Value *> Args;
-
- auto i = CB.arg_begin(), E = CB.arg_end();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
- Args.push_back(*i);
-
- i = CB.arg_begin();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
- Args.push_back(DFSF.getShadow(*i));
-
- if (FT->isVarArg()) {
- unsigned VarArgSize = CB.arg_size() - FT->getNumParams();
+ }
+ }
+
+ // Do all instrumentation for IA_Args down here to defer tampering with the
+ // CFG in a way that SplitEdge may be able to detect.
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
+ FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
+ Value *Func =
+ IRB.CreateBitCast(CB.getCalledOperand(), PointerType::getUnqual(NewFT));
+ std::vector<Value *> Args;
+
+ auto i = CB.arg_begin(), E = CB.arg_end();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(*i);
+
+ i = CB.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(DFSF.getShadow(*i));
+
+ if (FT->isVarArg()) {
+ unsigned VarArgSize = CB.arg_size() - FT->getNumParams();
ArrayType *VarArgArrayTy =
ArrayType::get(DFSF.DFS.PrimitiveShadowTy, VarArgSize);
- AllocaInst *VarArgShadow =
- new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
- "", &DFSF.F->getEntryBlock().front());
- Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
- for (unsigned n = 0; i != E; ++i, ++n) {
- IRB.CreateStore(
- DFSF.getShadow(*i),
- IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
- Args.push_back(*i);
- }
- }
-
- CallBase *NewCB;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- NewCB = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(),
- II->getUnwindDest(), Args);
- } else {
- NewCB = IRB.CreateCall(NewFT, Func, Args);
- }
- NewCB->setCallingConv(CB.getCallingConv());
- NewCB->setAttributes(CB.getAttributes().removeAttributes(
- *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCB->getType())));
-
- if (Next) {
- ExtractValueInst *ExVal = ExtractValueInst::Create(NewCB, 0, "", Next);
- DFSF.SkipInsts.insert(ExVal);
- ExtractValueInst *ExShadow = ExtractValueInst::Create(NewCB, 1, "", Next);
- DFSF.SkipInsts.insert(ExShadow);
- DFSF.setShadow(ExVal, ExShadow);
- DFSF.NonZeroChecks.push_back(ExShadow);
-
- CB.replaceAllUsesWith(ExVal);
- }
-
- CB.eraseFromParent();
- }
-}
-
-void DFSanVisitor::visitPHINode(PHINode &PN) {
+ AllocaInst *VarArgShadow =
+ new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
+ "", &DFSF.F->getEntryBlock().front());
+ Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
+ for (unsigned n = 0; i != E; ++i, ++n) {
+ IRB.CreateStore(
+ DFSF.getShadow(*i),
+ IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
+ Args.push_back(*i);
+ }
+ }
+
+ CallBase *NewCB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCB = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(),
+ II->getUnwindDest(), Args);
+ } else {
+ NewCB = IRB.CreateCall(NewFT, Func, Args);
+ }
+ NewCB->setCallingConv(CB.getCallingConv());
+ NewCB->setAttributes(CB.getAttributes().removeAttributes(
+ *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewCB->getType())));
+
+ if (Next) {
+ ExtractValueInst *ExVal = ExtractValueInst::Create(NewCB, 0, "", Next);
+ DFSF.SkipInsts.insert(ExVal);
+ ExtractValueInst *ExShadow = ExtractValueInst::Create(NewCB, 1, "", Next);
+ DFSF.SkipInsts.insert(ExShadow);
+ DFSF.setShadow(ExVal, ExShadow);
+ DFSF.NonZeroChecks.push_back(ExShadow);
+
+ CB.replaceAllUsesWith(ExVal);
+ }
+
+ CB.eraseFromParent();
+ }
+}
+
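The IA_TLS path inside visitCallBase above advances ArgOffset by alignTo(Size, kShadowTLSAlignment) for every argument shadow it spills to the argument TLS buffer. Below is a standalone sketch of that offset arithmetic; the alignment and shadow sizes are assumptions for illustration, not the pass's actual constants.

    #include <cassert>
    #include <cstdint>

    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;  // round up to a multiple of Align
    }

    int main() {
      const uint64_t ShadowTLSAlign = 2;         // assumed kShadowTLSAlignment
      const uint64_t ShadowSizes[] = {2, 3, 6};  // hypothetical per-argument shadow sizes
      uint64_t ArgOffset = 0;
      for (uint64_t Size : ShadowSizes)
        ArgOffset += alignTo(Size, ShadowTLSAlign);
      assert(ArgOffset == 2 + 4 + 6);            // the 3-byte shadow is padded to 4
      return 0;
    }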
+void DFSanVisitor::visitPHINode(PHINode &PN) {
Type *ShadowTy = DFSF.DFS.getShadowTy(&PN);
- PHINode *ShadowPN =
+ PHINode *ShadowPN =
PHINode::Create(ShadowTy, PN.getNumIncomingValues(), "", &PN);
-
- // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+
+ // Give the shadow phi node valid predecessors to fool SplitEdge into working.
Value *UndefShadow = UndefValue::get(ShadowTy);
- for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
- ++i) {
- ShadowPN->addIncoming(UndefShadow, *i);
- }
-
- DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
- DFSF.setShadow(&PN, ShadowPN);
-}
+ for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+ ++i) {
+ ShadowPN->addIncoming(UndefShadow, *i);
+ }
+
+ DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+ DFSF.setShadow(&PN, ShadowPN);
+}
namespace {
class DataFlowSanitizerLegacyPass : public ModulePass {
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 8d53a5d27f..527644a69d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -1,185 +1,185 @@
-//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements GCOV-style profiling. When this pass is run it emits
-// "gcno" files next to the existing source, and instruments the code that runs
-// to record the edges between blocks that run and emit a complementary "gcda"
-// file on exit.
-//
-//===----------------------------------------------------------------------===//
-
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that runs
+// to record the edges between blocks that run and emit a complementary "gcda"
+// file on exit.
+//
+//===----------------------------------------------------------------------===//
+
#include "CFGMST.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CRC.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <utility>
-
-using namespace llvm;
-namespace endian = llvm::support::endian;
-
-#define DEBUG_TYPE "insert-gcov-profiling"
-
-enum : uint32_t {
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+namespace endian = llvm::support::endian;
+
+#define DEBUG_TYPE "insert-gcov-profiling"
+
+enum : uint32_t {
GCOV_ARC_ON_TREE = 1 << 0,
- GCOV_TAG_FUNCTION = 0x01000000,
- GCOV_TAG_BLOCKS = 0x01410000,
- GCOV_TAG_ARCS = 0x01430000,
- GCOV_TAG_LINES = 0x01450000,
-};
-
-static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version",
- cl::init("408*"), cl::Hidden,
- cl::ValueRequired);
-
+ GCOV_TAG_FUNCTION = 0x01000000,
+ GCOV_TAG_BLOCKS = 0x01410000,
+ GCOV_TAG_ARCS = 0x01430000,
+ GCOV_TAG_LINES = 0x01450000,
+};
+
+static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version",
+ cl::init("408*"), cl::Hidden,
+ cl::ValueRequired);
+
static cl::opt<bool> AtomicCounter("gcov-atomic-counter", cl::Hidden,
cl::desc("Make counter updates atomic"));
-// Returns the number of words which will be used to represent this string.
-static unsigned wordsOfString(StringRef s) {
- // Length + NUL-terminated string + 0~3 padding NULs.
- return (s.size() / 4) + 2;
-}
-
-GCOVOptions GCOVOptions::getDefault() {
- GCOVOptions Options;
- Options.EmitNotes = true;
- Options.EmitData = true;
- Options.NoRedZone = false;
+// Returns the number of words which will be used to represent this string.
+static unsigned wordsOfString(StringRef s) {
+ // Length + NUL-terminated string + 0~3 padding NULs.
+ return (s.size() / 4) + 2;
+}
+
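A worked example for wordsOfString above: the encoding is a 4-byte length word followed by the NUL-terminated string padded with NULs to a 4-byte boundary, so "abc" occupies 2 words (length + "abc\0") and "abcd" occupies 3 (length + "abcd" + four padding NULs, matching write_zeros(4 - s.size() % 4) in writeString).

    #include <cassert>
    #include <string>

    static unsigned wordsOfString(const std::string &s) {
      return (s.size() / 4) + 2;  // same formula as above
    }

    int main() {
      assert(wordsOfString("abc") == 2);
      assert(wordsOfString("abcd") == 3);
      assert(wordsOfString("") == 2);  // even an empty string needs length + one NUL word
      return 0;
    }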
+GCOVOptions GCOVOptions::getDefault() {
+ GCOVOptions Options;
+ Options.EmitNotes = true;
+ Options.EmitData = true;
+ Options.NoRedZone = false;
Options.Atomic = AtomicCounter;
-
- if (DefaultGCOVVersion.size() != 4) {
- llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
- DefaultGCOVVersion);
- }
- memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
- return Options;
-}
-
-namespace {
-class GCOVFunction;
-
-class GCOVProfiler {
-public:
- GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
- GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
- bool
+
+ if (DefaultGCOVVersion.size() != 4) {
+ llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
+ DefaultGCOVVersion);
+ }
+ memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
+ return Options;
+}
+
+namespace {
+class GCOVFunction;
+
+class GCOVProfiler {
+public:
+ GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
+ GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
+ bool
runOnModule(Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
- std::function<const TargetLibraryInfo &(Function &F)> GetTLI);
-
- void write(uint32_t i) {
- char Bytes[4];
- endian::write32(Bytes, i, Endian);
- os->write(Bytes, 4);
- }
- void writeString(StringRef s) {
- write(wordsOfString(s) - 1);
- os->write(s.data(), s.size());
- os->write_zeros(4 - s.size() % 4);
- }
- void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
-
-private:
- // Create the .gcno files for the Module based on DebugInfo.
+ std::function<const TargetLibraryInfo &(Function &F)> GetTLI);
+
+ void write(uint32_t i) {
+ char Bytes[4];
+ endian::write32(Bytes, i, Endian);
+ os->write(Bytes, 4);
+ }
+ void writeString(StringRef s) {
+ write(wordsOfString(s) - 1);
+ os->write(s.data(), s.size());
+ os->write_zeros(4 - s.size() % 4);
+ }
+ void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
+
+private:
+ // Create the .gcno files for the Module based on DebugInfo.
bool
emitProfileNotes(NamedMDNode *CUNode, bool HasExecOrFork,
function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
function_ref<const TargetLibraryInfo &(Function &F)> GetTLI);
-
+
void emitGlobalConstructor(
SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP);
-
- bool isFunctionInstrumented(const Function &F);
- std::vector<Regex> createRegexesFromString(StringRef RegexesStr);
- static bool doesFilenameMatchARegex(StringRef Filename,
- std::vector<Regex> &Regexes);
-
- // Get pointers to the functions in the runtime library.
- FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI);
- FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI);
- FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI);
- FunctionCallee getSummaryInfoFunc();
- FunctionCallee getEndFileFunc();
-
- // Add the function to write out all our counters to the global destructor
- // list.
- Function *
- insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
- Function *insertReset(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
-
- bool AddFlushBeforeForkAndExec();
-
- enum class GCovFileType { GCNO, GCDA };
- std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
-
- GCOVOptions Options;
- support::endianness Endian;
- raw_ostream *os;
-
- // Checksum, produced by hash of EdgeDestinations
- SmallVector<uint32_t, 4> FileChecksums;
-
- Module *M = nullptr;
- std::function<const TargetLibraryInfo &(Function &F)> GetTLI;
- LLVMContext *Ctx = nullptr;
- SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
- std::vector<Regex> FilterRe;
- std::vector<Regex> ExcludeRe;
+
+ bool isFunctionInstrumented(const Function &F);
+ std::vector<Regex> createRegexesFromString(StringRef RegexesStr);
+ static bool doesFilenameMatchARegex(StringRef Filename,
+ std::vector<Regex> &Regexes);
+
+ // Get pointers to the functions in the runtime library.
+ FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI);
+ FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI);
+ FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI);
+ FunctionCallee getSummaryInfoFunc();
+ FunctionCallee getEndFileFunc();
+
+ // Add the function to write out all our counters to the global destructor
+ // list.
+ Function *
+ insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ Function *insertReset(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+
+ bool AddFlushBeforeForkAndExec();
+
+ enum class GCovFileType { GCNO, GCDA };
+ std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
+
+ GCOVOptions Options;
+ support::endianness Endian;
+ raw_ostream *os;
+
+ // Checksum, produced by hash of EdgeDestinations
+ SmallVector<uint32_t, 4> FileChecksums;
+
+ Module *M = nullptr;
+ std::function<const TargetLibraryInfo &(Function &F)> GetTLI;
+ LLVMContext *Ctx = nullptr;
+ SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
+ std::vector<Regex> FilterRe;
+ std::vector<Regex> ExcludeRe;
DenseSet<const BasicBlock *> ExecBlocks;
- StringMap<bool> InstrumentedFiles;
-};
-
-class GCOVProfilerLegacyPass : public ModulePass {
-public:
- static char ID;
- GCOVProfilerLegacyPass()
- : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {}
- GCOVProfilerLegacyPass(const GCOVOptions &Opts)
- : ModulePass(ID), Profiler(Opts) {
- initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- StringRef getPassName() const override { return "GCOV Profiler"; }
-
- bool runOnModule(Module &M) override {
+ StringMap<bool> InstrumentedFiles;
+};
+
+class GCOVProfilerLegacyPass : public ModulePass {
+public:
+ static char ID;
+ GCOVProfilerLegacyPass()
+ : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {}
+ GCOVProfilerLegacyPass(const GCOVOptions &Opts)
+ : ModulePass(ID), Profiler(Opts) {
+ initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override { return "GCOV Profiler"; }
+
+ bool runOnModule(Module &M) override {
auto GetBFI = [this](Function &F) {
return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
};
@@ -190,16 +190,16 @@ public:
return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
};
return Profiler.runOnModule(M, GetBFI, GetBPI, GetTLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
-private:
- GCOVProfiler Profiler;
-};
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+private:
+ GCOVProfiler Profiler;
+};
struct BBInfo {
BBInfo *Group;
@@ -234,225 +234,225 @@ struct Edge {
.str();
}
};
-}
-
-char GCOVProfilerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(
- GCOVProfilerLegacyPass, "insert-gcov-profiling",
- "Insert instrumentation for GCOV profiling", false, false)
+}
+
+char GCOVProfilerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
- GCOVProfilerLegacyPass, "insert-gcov-profiling",
- "Insert instrumentation for GCOV profiling", false, false)
-
-ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
- return new GCOVProfilerLegacyPass(Options);
-}
-
-static StringRef getFunctionName(const DISubprogram *SP) {
- if (!SP->getLinkageName().empty())
- return SP->getLinkageName();
- return SP->getName();
-}
-
-/// Extract a filename for a DISubprogram.
-///
-/// Prefer relative paths in the coverage notes. Clang also may split
-/// up absolute paths into a directory and filename component. When
-/// the relative path doesn't exist, reconstruct the absolute path.
-static SmallString<128> getFilename(const DISubprogram *SP) {
- SmallString<128> Path;
- StringRef RelPath = SP->getFilename();
- if (sys::fs::exists(RelPath))
- Path = RelPath;
- else
- sys::path::append(Path, SP->getDirectory(), SP->getFilename());
- return Path;
-}
-
-namespace {
- class GCOVRecord {
- protected:
- GCOVProfiler *P;
-
- GCOVRecord(GCOVProfiler *P) : P(P) {}
-
- void write(uint32_t i) { P->write(i); }
- void writeString(StringRef s) { P->writeString(s); }
- void writeBytes(const char *Bytes, int Size) { P->writeBytes(Bytes, Size); }
- };
-
- class GCOVFunction;
- class GCOVBlock;
-
- // Constructed only by requesting it from a GCOVBlock, this object stores a
- // list of line numbers and a single filename, representing lines that belong
- // to the block.
- class GCOVLines : public GCOVRecord {
- public:
- void addLine(uint32_t Line) {
- assert(Line != 0 && "Line zero is not a valid real line number.");
- Lines.push_back(Line);
- }
-
- uint32_t length() const {
- return 1 + wordsOfString(Filename) + Lines.size();
- }
-
- void writeOut() {
- write(0);
- writeString(Filename);
- for (int i = 0, e = Lines.size(); i != e; ++i)
- write(Lines[i]);
- }
-
- GCOVLines(GCOVProfiler *P, StringRef F)
- : GCOVRecord(P), Filename(std::string(F)) {}
-
- private:
- std::string Filename;
- SmallVector<uint32_t, 32> Lines;
- };
-
-
- // Represent a basic block in GCOV. Each block has a unique number in the
- // function, number of lines belonging to each block, and a set of edges to
- // other blocks.
- class GCOVBlock : public GCOVRecord {
- public:
- GCOVLines &getFile(StringRef Filename) {
- return LinesByFile.try_emplace(Filename, P, Filename).first->second;
- }
-
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
+
+ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
+ return new GCOVProfilerLegacyPass(Options);
+}
+
+static StringRef getFunctionName(const DISubprogram *SP) {
+ if (!SP->getLinkageName().empty())
+ return SP->getLinkageName();
+ return SP->getName();
+}
+
+/// Extract a filename for a DISubprogram.
+///
+/// Prefer relative paths in the coverage notes. Clang also may split
+/// up absolute paths into a directory and filename component. When
+/// the relative path doesn't exist, reconstruct the absolute path.
+static SmallString<128> getFilename(const DISubprogram *SP) {
+ SmallString<128> Path;
+ StringRef RelPath = SP->getFilename();
+ if (sys::fs::exists(RelPath))
+ Path = RelPath;
+ else
+ sys::path::append(Path, SP->getDirectory(), SP->getFilename());
+ return Path;
+}
+
+namespace {
+ class GCOVRecord {
+ protected:
+ GCOVProfiler *P;
+
+ GCOVRecord(GCOVProfiler *P) : P(P) {}
+
+ void write(uint32_t i) { P->write(i); }
+ void writeString(StringRef s) { P->writeString(s); }
+ void writeBytes(const char *Bytes, int Size) { P->writeBytes(Bytes, Size); }
+ };
+
+ class GCOVFunction;
+ class GCOVBlock;
+
+ // Constructed only by requesting it from a GCOVBlock, this object stores a
+ // list of line numbers and a single filename, representing lines that belong
+ // to the block.
+ class GCOVLines : public GCOVRecord {
+ public:
+ void addLine(uint32_t Line) {
+ assert(Line != 0 && "Line zero is not a valid real line number.");
+ Lines.push_back(Line);
+ }
+
+ uint32_t length() const {
+ return 1 + wordsOfString(Filename) + Lines.size();
+ }
+
+ void writeOut() {
+ write(0);
+ writeString(Filename);
+ for (int i = 0, e = Lines.size(); i != e; ++i)
+ write(Lines[i]);
+ }
+
+ GCOVLines(GCOVProfiler *P, StringRef F)
+ : GCOVRecord(P), Filename(std::string(F)) {}
+
+ private:
+ std::string Filename;
+ SmallVector<uint32_t, 32> Lines;
+ };
+
+
+ // Represent a basic block in GCOV. Each block has a unique number in the
+ // function, number of lines belonging to each block, and a set of edges to
+ // other blocks.
+ class GCOVBlock : public GCOVRecord {
+ public:
+ GCOVLines &getFile(StringRef Filename) {
+ return LinesByFile.try_emplace(Filename, P, Filename).first->second;
+ }
+
void addEdge(GCOVBlock &Successor, uint32_t Flags) {
OutEdges.emplace_back(&Successor, Flags);
- }
-
- void writeOut() {
- uint32_t Len = 3;
- SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile;
- for (auto &I : LinesByFile) {
- Len += I.second.length();
- SortedLinesByFile.push_back(&I);
- }
-
- write(GCOV_TAG_LINES);
- write(Len);
- write(Number);
-
- llvm::sort(SortedLinesByFile, [](StringMapEntry<GCOVLines> *LHS,
- StringMapEntry<GCOVLines> *RHS) {
- return LHS->getKey() < RHS->getKey();
- });
- for (auto &I : SortedLinesByFile)
- I->getValue().writeOut();
- write(0);
- write(0);
- }
-
- GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) {
- // Only allow copy before edges and lines have been added. After that,
- // there are inter-block pointers (eg: edges) that won't take kindly to
- // blocks being copied or moved around.
- assert(LinesByFile.empty());
- assert(OutEdges.empty());
- }
-
+ }
+
+ void writeOut() {
+ uint32_t Len = 3;
+ SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile;
+ for (auto &I : LinesByFile) {
+ Len += I.second.length();
+ SortedLinesByFile.push_back(&I);
+ }
+
+ write(GCOV_TAG_LINES);
+ write(Len);
+ write(Number);
+
+ llvm::sort(SortedLinesByFile, [](StringMapEntry<GCOVLines> *LHS,
+ StringMapEntry<GCOVLines> *RHS) {
+ return LHS->getKey() < RHS->getKey();
+ });
+ for (auto &I : SortedLinesByFile)
+ I->getValue().writeOut();
+ write(0);
+ write(0);
+ }
+
+ GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) {
+ // Only allow copy before edges and lines have been added. After that,
+ // there are inter-block pointers (eg: edges) that won't take kindly to
+ // blocks being copied or moved around.
+ assert(LinesByFile.empty());
+ assert(OutEdges.empty());
+ }
+
uint32_t Number;
SmallVector<std::pair<GCOVBlock *, uint32_t>, 4> OutEdges;
private:
- friend class GCOVFunction;
-
- GCOVBlock(GCOVProfiler *P, uint32_t Number)
- : GCOVRecord(P), Number(Number) {}
-
- StringMap<GCOVLines> LinesByFile;
- };
-
- // A function has a unique identifier, a checksum (we leave as zero) and a
- // set of blocks and a map of edges between blocks. This is the only GCOV
- // object users can construct, the blocks and lines will be rooted here.
- class GCOVFunction : public GCOVRecord {
- public:
- GCOVFunction(GCOVProfiler *P, Function *F, const DISubprogram *SP,
- unsigned EndLine, uint32_t Ident, int Version)
- : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident),
+ friend class GCOVFunction;
+
+ GCOVBlock(GCOVProfiler *P, uint32_t Number)
+ : GCOVRecord(P), Number(Number) {}
+
+ StringMap<GCOVLines> LinesByFile;
+ };
+
+ // A function has a unique identifier, a checksum (we leave as zero) and a
+ // set of blocks and a map of edges between blocks. This is the only GCOV
+ // object users can construct, the blocks and lines will be rooted here.
+ class GCOVFunction : public GCOVRecord {
+ public:
+ GCOVFunction(GCOVProfiler *P, Function *F, const DISubprogram *SP,
+ unsigned EndLine, uint32_t Ident, int Version)
+ : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident),
Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) {
- LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
- bool ExitBlockBeforeBody = Version >= 48;
+ LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
+ bool ExitBlockBeforeBody = Version >= 48;
uint32_t i = ExitBlockBeforeBody ? 2 : 1;
for (BasicBlock &BB : *F)
- Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++)));
- if (!ExitBlockBeforeBody)
- ReturnBlock.Number = i;
-
- std::string FunctionNameAndLine;
- raw_string_ostream FNLOS(FunctionNameAndLine);
- FNLOS << getFunctionName(SP) << SP->getLine();
- FNLOS.flush();
- FuncChecksum = hash_value(FunctionNameAndLine);
- }
-
+ Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++)));
+ if (!ExitBlockBeforeBody)
+ ReturnBlock.Number = i;
+
+ std::string FunctionNameAndLine;
+ raw_string_ostream FNLOS(FunctionNameAndLine);
+ FNLOS << getFunctionName(SP) << SP->getLine();
+ FNLOS.flush();
+ FuncChecksum = hash_value(FunctionNameAndLine);
+ }
+
GCOVBlock &getBlock(const BasicBlock *BB) {
return Blocks.find(const_cast<BasicBlock *>(BB))->second;
- }
-
+ }
+
GCOVBlock &getEntryBlock() { return EntryBlock; }
- GCOVBlock &getReturnBlock() {
- return ReturnBlock;
- }
-
- uint32_t getFuncChecksum() const {
- return FuncChecksum;
- }
-
- void writeOut(uint32_t CfgChecksum) {
- write(GCOV_TAG_FUNCTION);
- SmallString<128> Filename = getFilename(SP);
- uint32_t BlockLen =
- 2 + (Version >= 47) + wordsOfString(getFunctionName(SP));
- if (Version < 80)
- BlockLen += wordsOfString(Filename) + 1;
- else
- BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90);
-
- write(BlockLen);
- write(Ident);
- write(FuncChecksum);
- if (Version >= 47)
- write(CfgChecksum);
- writeString(getFunctionName(SP));
- if (Version < 80) {
- writeString(Filename);
- write(SP->getLine());
- } else {
- write(SP->isArtificial()); // artificial
- writeString(Filename);
- write(SP->getLine()); // start_line
- write(0); // start_column
- // EndLine is the last line with !dbg. It is not the } line as in GCC,
- // but good enough.
- write(EndLine);
- if (Version >= 90)
- write(0); // end_column
- }
-
- // Emit count of blocks.
- write(GCOV_TAG_BLOCKS);
- if (Version < 80) {
+ GCOVBlock &getReturnBlock() {
+ return ReturnBlock;
+ }
+
+ uint32_t getFuncChecksum() const {
+ return FuncChecksum;
+ }
+
+ void writeOut(uint32_t CfgChecksum) {
+ write(GCOV_TAG_FUNCTION);
+ SmallString<128> Filename = getFilename(SP);
+ uint32_t BlockLen =
+ 2 + (Version >= 47) + wordsOfString(getFunctionName(SP));
+ if (Version < 80)
+ BlockLen += wordsOfString(Filename) + 1;
+ else
+ BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90);
+
+ write(BlockLen);
+ write(Ident);
+ write(FuncChecksum);
+ if (Version >= 47)
+ write(CfgChecksum);
+ writeString(getFunctionName(SP));
+ if (Version < 80) {
+ writeString(Filename);
+ write(SP->getLine());
+ } else {
+ write(SP->isArtificial()); // artificial
+ writeString(Filename);
+ write(SP->getLine()); // start_line
+ write(0); // start_column
+ // EndLine is the last line with !dbg. It is not the } line as in GCC,
+ // but good enough.
+ write(EndLine);
+ if (Version >= 90)
+ write(0); // end_column
+ }
+
+ // Emit count of blocks.
+ write(GCOV_TAG_BLOCKS);
+ if (Version < 80) {
write(Blocks.size() + 2);
for (int i = Blocks.size() + 2; i; --i)
- write(0);
- } else {
- write(1);
+ write(0);
+ } else {
+ write(1);
write(Blocks.size() + 2);
- }
- LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n");
-
- // Emit edges between blocks.
+ }
+ LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n");
+
+ // Emit edges between blocks.
const uint32_t Outgoing = EntryBlock.OutEdges.size();
if (Outgoing) {
write(GCOV_TAG_ARCS);
@@ -465,169 +465,169 @@ namespace {
}
for (auto &It : Blocks) {
const GCOVBlock &Block = It.second;
- if (Block.OutEdges.empty()) continue;
-
- write(GCOV_TAG_ARCS);
- write(Block.OutEdges.size() * 2 + 1);
- write(Block.Number);
+ if (Block.OutEdges.empty()) continue;
+
+ write(GCOV_TAG_ARCS);
+ write(Block.OutEdges.size() * 2 + 1);
+ write(Block.Number);
for (const auto &E : Block.OutEdges) {
write(E.first->Number);
write(E.second);
- }
- }
-
- // Emit lines for each block.
+ }
+ }
+
+ // Emit lines for each block.
for (auto &It : Blocks)
It.second.writeOut();
- }
-
+ }
+
public:
- const DISubprogram *SP;
- unsigned EndLine;
- uint32_t Ident;
- uint32_t FuncChecksum;
- int Version;
+ const DISubprogram *SP;
+ unsigned EndLine;
+ uint32_t Ident;
+ uint32_t FuncChecksum;
+ int Version;
MapVector<BasicBlock *, GCOVBlock> Blocks;
GCOVBlock EntryBlock;
- GCOVBlock ReturnBlock;
- };
-}
-
-// RegexesStr is a string containing different regexes separated by semicolons.
-// For example "foo\..*$;bar\..*$".
-std::vector<Regex> GCOVProfiler::createRegexesFromString(StringRef RegexesStr) {
- std::vector<Regex> Regexes;
- while (!RegexesStr.empty()) {
- std::pair<StringRef, StringRef> HeadTail = RegexesStr.split(';');
- if (!HeadTail.first.empty()) {
- Regex Re(HeadTail.first);
- std::string Err;
- if (!Re.isValid(Err)) {
- Ctx->emitError(Twine("Regex ") + HeadTail.first +
- " is not valid: " + Err);
- }
- Regexes.emplace_back(std::move(Re));
- }
- RegexesStr = HeadTail.second;
- }
- return Regexes;
-}
-
-bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename,
- std::vector<Regex> &Regexes) {
- for (Regex &Re : Regexes)
- if (Re.match(Filename))
- return true;
- return false;
-}
-
-bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
- if (FilterRe.empty() && ExcludeRe.empty()) {
- return true;
- }
- SmallString<128> Filename = getFilename(F.getSubprogram());
- auto It = InstrumentedFiles.find(Filename);
- if (It != InstrumentedFiles.end()) {
- return It->second;
- }
-
- SmallString<256> RealPath;
- StringRef RealFilename;
-
- // Path can be
- // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
- // such a case we must get the real_path.
- if (sys::fs::real_path(Filename, RealPath)) {
- // real_path can fail with a path like "foo.c".
- RealFilename = Filename;
- } else {
- RealFilename = RealPath;
- }
-
- bool ShouldInstrument;
- if (FilterRe.empty()) {
- ShouldInstrument = !doesFilenameMatchARegex(RealFilename, ExcludeRe);
- } else if (ExcludeRe.empty()) {
- ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe);
- } else {
- ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe) &&
- !doesFilenameMatchARegex(RealFilename, ExcludeRe);
- }
- InstrumentedFiles[Filename] = ShouldInstrument;
- return ShouldInstrument;
-}
-
-std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
- GCovFileType OutputType) {
- bool Notes = OutputType == GCovFileType::GCNO;
-
- if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
- for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
- MDNode *N = GCov->getOperand(i);
- bool ThreeElement = N->getNumOperands() == 3;
- if (!ThreeElement && N->getNumOperands() != 2)
- continue;
- if (dyn_cast<MDNode>(N->getOperand(ThreeElement ? 2 : 1)) != CU)
- continue;
-
- if (ThreeElement) {
- // These nodes have no mangling to apply; the names are stored already
- // mangled in the bitcode.
- MDString *NotesFile = dyn_cast<MDString>(N->getOperand(0));
- MDString *DataFile = dyn_cast<MDString>(N->getOperand(1));
- if (!NotesFile || !DataFile)
- continue;
- return std::string(Notes ? NotesFile->getString()
- : DataFile->getString());
- }
-
- MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
- if (!GCovFile)
- continue;
-
- SmallString<128> Filename = GCovFile->getString();
- sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
- return std::string(Filename.str());
- }
- }
-
- SmallString<128> Filename = CU->getFilename();
- sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
- StringRef FName = sys::path::filename(Filename);
- SmallString<128> CurPath;
- if (sys::fs::current_path(CurPath))
- return std::string(FName);
- sys::path::append(CurPath, FName);
- return std::string(CurPath.str());
-}
-
-bool GCOVProfiler::runOnModule(
+ GCOVBlock ReturnBlock;
+ };
+}
+
+// RegexesStr is a string containing different regexes separated by semicolons.
+// For example "foo\..*$;bar\..*$".
+std::vector<Regex> GCOVProfiler::createRegexesFromString(StringRef RegexesStr) {
+ std::vector<Regex> Regexes;
+ while (!RegexesStr.empty()) {
+ std::pair<StringRef, StringRef> HeadTail = RegexesStr.split(';');
+ if (!HeadTail.first.empty()) {
+ Regex Re(HeadTail.first);
+ std::string Err;
+ if (!Re.isValid(Err)) {
+ Ctx->emitError(Twine("Regex ") + HeadTail.first +
+ " is not valid: " + Err);
+ }
+ Regexes.emplace_back(std::move(Re));
+ }
+ RegexesStr = HeadTail.second;
+ }
+ return Regexes;
+}
+
+bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename,
+ std::vector<Regex> &Regexes) {
+ for (Regex &Re : Regexes)
+ if (Re.match(Filename))
+ return true;
+ return false;
+}
+
+bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
+ if (FilterRe.empty() && ExcludeRe.empty()) {
+ return true;
+ }
+ SmallString<128> Filename = getFilename(F.getSubprogram());
+ auto It = InstrumentedFiles.find(Filename);
+ if (It != InstrumentedFiles.end()) {
+ return It->second;
+ }
+
+ SmallString<256> RealPath;
+ StringRef RealFilename;
+
+ // Path can be
+ // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
+ // such a case we must get the real_path.
+ if (sys::fs::real_path(Filename, RealPath)) {
+ // real_path can fail with a path like "foo.c".
+ RealFilename = Filename;
+ } else {
+ RealFilename = RealPath;
+ }
+
+ bool ShouldInstrument;
+ if (FilterRe.empty()) {
+ ShouldInstrument = !doesFilenameMatchARegex(RealFilename, ExcludeRe);
+ } else if (ExcludeRe.empty()) {
+ ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe);
+ } else {
+ ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe) &&
+ !doesFilenameMatchARegex(RealFilename, ExcludeRe);
+ }
+ InstrumentedFiles[Filename] = ShouldInstrument;
+ return ShouldInstrument;
+}
+
+std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
+ GCovFileType OutputType) {
+ bool Notes = OutputType == GCovFileType::GCNO;
+
+ if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
+ for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
+ MDNode *N = GCov->getOperand(i);
+ bool ThreeElement = N->getNumOperands() == 3;
+ if (!ThreeElement && N->getNumOperands() != 2)
+ continue;
+ if (dyn_cast<MDNode>(N->getOperand(ThreeElement ? 2 : 1)) != CU)
+ continue;
+
+ if (ThreeElement) {
+ // These nodes have no mangling to apply; the names are stored already
+ // mangled in the bitcode.
+ MDString *NotesFile = dyn_cast<MDString>(N->getOperand(0));
+ MDString *DataFile = dyn_cast<MDString>(N->getOperand(1));
+ if (!NotesFile || !DataFile)
+ continue;
+ return std::string(Notes ? NotesFile->getString()
+ : DataFile->getString());
+ }
+
+ MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
+ if (!GCovFile)
+ continue;
+
+ SmallString<128> Filename = GCovFile->getString();
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
+ return std::string(Filename.str());
+ }
+ }
+
+ SmallString<128> Filename = CU->getFilename();
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
+ StringRef FName = sys::path::filename(Filename);
+ SmallString<128> CurPath;
+ if (sys::fs::current_path(CurPath))
+ return std::string(FName);
+ sys::path::append(CurPath, FName);
+ return std::string(CurPath.str());
+}
+
+bool GCOVProfiler::runOnModule(
Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
std::function<const TargetLibraryInfo &(Function &F)> GetTLI) {
- this->M = &M;
- this->GetTLI = std::move(GetTLI);
- Ctx = &M.getContext();
-
+ this->M = &M;
+ this->GetTLI = std::move(GetTLI);
+ Ctx = &M.getContext();
+
NamedMDNode *CUNode = M.getNamedMetadata("llvm.dbg.cu");
if (!CUNode || (!Options.EmitNotes && !Options.EmitData))
return false;
-
+
bool HasExecOrFork = AddFlushBeforeForkAndExec();
- FilterRe = createRegexesFromString(Options.Filter);
- ExcludeRe = createRegexesFromString(Options.Exclude);
+ FilterRe = createRegexesFromString(Options.Filter);
+ ExcludeRe = createRegexesFromString(Options.Exclude);
emitProfileNotes(CUNode, HasExecOrFork, GetBFI, GetBPI, this->GetTLI);
return true;
-}
-
-PreservedAnalyses GCOVProfilerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
-
- GCOVProfiler Profiler(GCOVOpts);
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
+}
+
+PreservedAnalyses GCOVProfilerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ GCOVProfiler Profiler(GCOVOpts);
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
auto GetBFI = [&FAM](Function &F) {
return &FAM.getResult<BlockFrequencyAnalysis>(F);
};
@@ -639,124 +639,124 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M,
};
if (!Profiler.runOnModule(M, GetBFI, GetBPI, GetTLI))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-static bool functionHasLines(const Function &F, unsigned &EndLine) {
- // Check whether this function actually has any source lines. Functions
- // without them not only waste space, they can also crash gcov.
- EndLine = 0;
- for (auto &BB : F) {
- for (auto &I : BB) {
- // Debug intrinsic locations correspond to the location of the
- // declaration, not necessarily any statements or expressions.
- if (isa<DbgInfoIntrinsic>(&I)) continue;
-
- const DebugLoc &Loc = I.getDebugLoc();
- if (!Loc)
- continue;
-
- // Artificial lines such as calls to the global constructors.
- if (Loc.getLine() == 0) continue;
- EndLine = std::max(EndLine, Loc.getLine());
-
- return true;
- }
- }
- return false;
-}
-
-static bool isUsingScopeBasedEH(Function &F) {
- if (!F.hasPersonalityFn()) return false;
-
- EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
- return isScopedEHPersonality(Personality);
-}
-
-bool GCOVProfiler::AddFlushBeforeForkAndExec() {
- SmallVector<CallInst *, 2> Forks;
- SmallVector<CallInst *, 2> Execs;
- for (auto &F : M->functions()) {
- auto *TLI = &GetTLI(F);
- for (auto &I : instructions(F)) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (Function *Callee = CI->getCalledFunction()) {
- LibFunc LF;
- if (TLI->getLibFunc(*Callee, LF)) {
- if (LF == LibFunc_fork) {
-#if !defined(_WIN32)
- Forks.push_back(CI);
-#endif
- } else if (LF == LibFunc_execl || LF == LibFunc_execle ||
- LF == LibFunc_execlp || LF == LibFunc_execv ||
- LF == LibFunc_execvp || LF == LibFunc_execve ||
- LF == LibFunc_execvpe || LF == LibFunc_execvP) {
- Execs.push_back(CI);
- }
- }
- }
- }
- }
- }
-
- for (auto F : Forks) {
- IRBuilder<> Builder(F);
- BasicBlock *Parent = F->getParent();
- auto NextInst = ++F->getIterator();
-
- // We have a fork, so just reset the counters in the child process.
- FunctionType *FTy = FunctionType::get(Builder.getInt32Ty(), {}, false);
- FunctionCallee GCOVFork = M->getOrInsertFunction("__gcov_fork", FTy);
- F->setCalledFunction(GCOVFork);
-
- // We split just after the fork so that the lines after it get their own
- // counter. There is still a bug, though:
- //   void foo() { fork(); }
- //   void bar() { foo(); blah(); }
- // "blah();" will be executed twice but reported as executed once, because
- // "blah()" belongs to the same block as "foo();".
- Parent->splitBasicBlock(NextInst);
-
- // back() is a br instruction whose debug location equals the one from
- // NextAfterFork, so to avoid having two debug locations on two blocks,
- // just change it.
- DebugLoc Loc = F->getDebugLoc();
- Parent->back().setDebugLoc(Loc);
- }
-
- for (auto E : Execs) {
- IRBuilder<> Builder(E);
- BasicBlock *Parent = E->getParent();
- auto NextInst = ++E->getIterator();
-
- // Since the process is replaced by a new one, we need to write out the gcda
- // files. No need to reset the counters since they'll be lost after the exec**.
- FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
- FunctionCallee WriteoutF =
- M->getOrInsertFunction("llvm_writeout_files", FTy);
- Builder.CreateCall(WriteoutF);
-
- DebugLoc Loc = E->getDebugLoc();
- Builder.SetInsertPoint(&*NextInst);
- // If the exec** fails, we must reset the counters since they've already
- // been dumped.
- FunctionCallee ResetF = M->getOrInsertFunction("llvm_reset_counters", FTy);
- Builder.CreateCall(ResetF)->setDebugLoc(Loc);
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+static bool functionHasLines(const Function &F, unsigned &EndLine) {
+ // Check whether this function actually has any source lines. Functions
+ // without them not only waste space, they can also crash gcov.
+ EndLine = 0;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
+
+ const DebugLoc &Loc = I.getDebugLoc();
+ if (!Loc)
+ continue;
+
+ // Artificial lines such as calls to the global constructors.
+ if (Loc.getLine() == 0) continue;
+ EndLine = std::max(EndLine, Loc.getLine());
+
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isUsingScopeBasedEH(Function &F) {
+ if (!F.hasPersonalityFn()) return false;
+
+ EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
+ return isScopedEHPersonality(Personality);
+}
+
+bool GCOVProfiler::AddFlushBeforeForkAndExec() {
+ SmallVector<CallInst *, 2> Forks;
+ SmallVector<CallInst *, 2> Execs;
+ for (auto &F : M->functions()) {
+ auto *TLI = &GetTLI(F);
+ for (auto &I : instructions(F)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ LibFunc LF;
+ if (TLI->getLibFunc(*Callee, LF)) {
+ if (LF == LibFunc_fork) {
+#if !defined(_WIN32)
+ Forks.push_back(CI);
+#endif
+ } else if (LF == LibFunc_execl || LF == LibFunc_execle ||
+ LF == LibFunc_execlp || LF == LibFunc_execv ||
+ LF == LibFunc_execvp || LF == LibFunc_execve ||
+ LF == LibFunc_execvpe || LF == LibFunc_execvP) {
+ Execs.push_back(CI);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ for (auto F : Forks) {
+ IRBuilder<> Builder(F);
+ BasicBlock *Parent = F->getParent();
+ auto NextInst = ++F->getIterator();
+
+ // We have a fork, so just reset the counters in the child process.
+ FunctionType *FTy = FunctionType::get(Builder.getInt32Ty(), {}, false);
+ FunctionCallee GCOVFork = M->getOrInsertFunction("__gcov_fork", FTy);
+ F->setCalledFunction(GCOVFork);
+
+ // We split just after the fork so that the lines after it get their own
+ // counter. There is still a bug, though:
+ //   void foo() { fork(); }
+ //   void bar() { foo(); blah(); }
+ // "blah();" will be executed twice but reported as executed once, because
+ // "blah()" belongs to the same block as "foo();".
+ Parent->splitBasicBlock(NextInst);
+
+ // back() is a br instruction whose debug location equals the one from
+ // NextAfterFork, so to avoid having two debug locations on two blocks,
+ // just change it.
+ DebugLoc Loc = F->getDebugLoc();
+ Parent->back().setDebugLoc(Loc);
+ }
+
+ for (auto E : Execs) {
+ IRBuilder<> Builder(E);
+ BasicBlock *Parent = E->getParent();
+ auto NextInst = ++E->getIterator();
+
+ // Since the process is replaced by a new one, we need to write out the gcda
+ // files. No need to reset the counters since they'll be lost after the exec**.
+ FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
+ FunctionCallee WriteoutF =
+ M->getOrInsertFunction("llvm_writeout_files", FTy);
+ Builder.CreateCall(WriteoutF);
+
+ DebugLoc Loc = E->getDebugLoc();
+ Builder.SetInsertPoint(&*NextInst);
+ // If the exec** fails, we must reset the counters since they've already
+ // been dumped.
+ FunctionCallee ResetF = M->getOrInsertFunction("llvm_reset_counters", FTy);
+ Builder.CreateCall(ResetF)->setDebugLoc(Loc);
ExecBlocks.insert(Parent);
- Parent->splitBasicBlock(NextInst);
- Parent->back().setDebugLoc(Loc);
- }
-
- return !Forks.empty() || !Execs.empty();
-}
-
+ Parent->splitBasicBlock(NextInst);
+ Parent->back().setDebugLoc(Loc);
+ }
+
+ return !Forks.empty() || !Execs.empty();
+}
+
static BasicBlock *getInstrBB(CFGMST<Edge, BBInfo> &MST, Edge &E,
const DenseSet<const BasicBlock *> &ExecBlocks) {
if (E.InMST || E.Removed)
return nullptr;
-
+
BasicBlock *SrcBB = const_cast<BasicBlock *>(E.SrcBB);
BasicBlock *DestBB = const_cast<BasicBlock *>(E.DestBB);
// For a fake edge, instrument the real BB.
@@ -813,42 +813,42 @@ bool GCOVProfiler::emitProfileNotes(
function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
function_ref<const TargetLibraryInfo &(Function &F)> GetTLI) {
- int Version;
- {
- uint8_t c3 = Options.Version[0];
- uint8_t c2 = Options.Version[1];
- uint8_t c1 = Options.Version[2];
- Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
- : (c3 - '0') * 10 + c1 - '0';
- }
-
+ int Version;
+ {
+ uint8_t c3 = Options.Version[0];
+ uint8_t c2 = Options.Version[1];
+ uint8_t c1 = Options.Version[2];
+ Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
+ : (c3 - '0') * 10 + c1 - '0';
+ }
+
bool EmitGCDA = Options.EmitData;
for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) {
- // Each compile unit gets its own .gcno file. This means that whether we run
- // this pass over the original .o's as they're produced, or run it after
- // LTO, we'll generate the same .gcno files.
-
+ // Each compile unit gets its own .gcno file. This means that whether we run
+ // this pass over the original .o's as they're produced, or run it after
+ // LTO, we'll generate the same .gcno files.
+
auto *CU = cast<DICompileUnit>(CUNode->getOperand(i));
-
- // Skip module skeleton (and module) CUs.
- if (CU->getDWOId())
- continue;
-
+
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
std::vector<uint8_t> EdgeDestinations;
SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
-
- Endian = M->getDataLayout().isLittleEndian() ? support::endianness::little
- : support::endianness::big;
- unsigned FunctionIdent = 0;
- for (auto &F : M->functions()) {
- DISubprogram *SP = F.getSubprogram();
- unsigned EndLine;
- if (!SP) continue;
- if (!functionHasLines(F, EndLine) || !isFunctionInstrumented(F))
- continue;
- // TODO: Functions using scope-based EH are currently not supported.
- if (isUsingScopeBasedEH(F)) continue;
-
+
+ Endian = M->getDataLayout().isLittleEndian() ? support::endianness::little
+ : support::endianness::big;
+ unsigned FunctionIdent = 0;
+ for (auto &F : M->functions()) {
+ DISubprogram *SP = F.getSubprogram();
+ unsigned EndLine;
+ if (!SP) continue;
+ if (!functionHasLines(F, EndLine) || !isFunctionInstrumented(F))
+ continue;
+ // TODO: Functions using scope-based EH are currently not supported.
+ if (isUsingScopeBasedEH(F)) continue;
+
// Add the function line number to the lines of the entry block
// to have a counter for the function definition.
uint32_t Line = SP->getLine();
@@ -873,11 +873,11 @@ bool GCOVProfiler::emitProfileNotes(
E.Place = getInstrBB(MST, E, ExecBlocks);
}
// Basic blocks in F are finalized at this point.
- BasicBlock &EntryBlock = F.getEntryBlock();
- Funcs.push_back(std::make_unique<GCOVFunction>(this, &F, SP, EndLine,
- FunctionIdent++, Version));
- GCOVFunction &Func = *Funcs.back();
-
+ BasicBlock &EntryBlock = F.getEntryBlock();
+ Funcs.push_back(std::make_unique<GCOVFunction>(this, &F, SP, EndLine,
+ FunctionIdent++, Version));
+ GCOVFunction &Func = *Funcs.back();
+
// Some non-tree edges are IndirectBr which cannot be split. Ignore them
// as well.
llvm::erase_if(MST.AllEdges, [](std::unique_ptr<Edge> &E) {
@@ -903,7 +903,7 @@ bool GCOVProfiler::emitProfileNotes(
return L->SrcNumber != R->SrcNumber ? L->SrcNumber < R->SrcNumber
: L->DstNumber < R->DstNumber;
});
-
+
for (const Edge &E : make_pointee_range(MST.AllEdges)) {
GCOVBlock &Src =
E.SrcBB ? Func.getBlock(E.SrcBB) : Func.getEntryBlock();
@@ -912,10 +912,10 @@ bool GCOVProfiler::emitProfileNotes(
Src.addEdge(Dst, E.Place ? 0 : uint32_t(GCOV_ARC_ON_TREE));
}
- // Artificial functions such as global initializers
- if (!SP->isArtificial())
- Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line);
-
+ // Artificial functions such as global initializers
+ if (!SP->isArtificial())
+ Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line);
+
LLVM_DEBUG(dumpEdges(MST, Func));
for (auto &GB : Func.Blocks) {
@@ -925,31 +925,31 @@ bool GCOVProfiler::emitProfileNotes(
uint32_t Idx = Succ.first->Number;
do EdgeDestinations.push_back(Idx & 255);
while ((Idx >>= 8) > 0);
- }
-
- for (auto &I : BB) {
- // Debug intrinsic locations correspond to the location of the
- // declaration, not necessarily any statements or expressions.
- if (isa<DbgInfoIntrinsic>(&I)) continue;
-
- const DebugLoc &Loc = I.getDebugLoc();
- if (!Loc)
- continue;
-
- // Artificial lines such as calls to the global constructors.
- if (Loc.getLine() == 0 || Loc.isImplicitCode())
- continue;
-
- if (Line == Loc.getLine()) continue;
- Line = Loc.getLine();
- if (SP != getDISubprogram(Loc.getScope()))
- continue;
-
- GCOVLines &Lines = Block.getFile(Filename);
- Lines.addLine(Loc.getLine());
- }
- Line = 0;
- }
+ }
+
+ for (auto &I : BB) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
+
+ const DebugLoc &Loc = I.getDebugLoc();
+ if (!Loc)
+ continue;
+
+ // Artificial lines such as calls to the global constructors.
+ if (Loc.getLine() == 0 || Loc.isImplicitCode())
+ continue;
+
+ if (Line == Loc.getLine()) continue;
+ Line = Loc.getLine();
+ if (SP != getDISubprogram(Loc.getScope()))
+ continue;
+
+ GCOVLines &Lines = Block.getFile(Filename);
+ Lines.addLine(Loc.getLine());
+ }
+ Line = 0;
+ }
if (EmitGCDA) {
DISubprogram *SP = F.getSubprogram();
ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(*Ctx), Measured);
@@ -974,14 +974,14 @@ bool GCOVProfiler::emitProfileNotes(
}
}
}
- }
-
- char Tmp[4];
+ }
+
+ char Tmp[4];
JamCRC JC;
JC.update(EdgeDestinations);
uint32_t Stamp = JC.getCRC();
- FileChecksums.push_back(Stamp);
-
+ FileChecksums.push_back(Stamp);
+
if (Options.EmitNotes) {
std::error_code EC;
raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC,
@@ -990,8 +990,8 @@ bool GCOVProfiler::emitProfileNotes(
Ctx->emitError(
Twine("failed to open coverage notes file for writing: ") +
EC.message());
- continue;
- }
+ continue;
+ }
os = &out;
if (Endian == support::endianness::big) {
out.write("gcno", 4);
@@ -1006,28 +1006,28 @@ bool GCOVProfiler::emitProfileNotes(
writeString(""); // unused current_working_directory
if (Version >= 80)
write(0); // unused has_unexecuted_blocks
-
+
for (auto &Func : Funcs)
Func->writeOut(Stamp);
-
+
write(0);
write(0);
out.close();
}
-
+
if (EmitGCDA) {
emitGlobalConstructor(CountersBySP);
EmitGCDA = false;
- }
+ }
}
return true;
}
-
+
void GCOVProfiler::emitGlobalConstructor(
SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP) {
Function *WriteoutF = insertCounterWriteout(CountersBySP);
Function *ResetF = insertReset(CountersBySP);
-
+
// Create a small bit of code that registers the "__llvm_gcov_writeout" to
// be executed at exit and the "__llvm_gcov_flush" function to be executed
// when "__gcov_flush" is called.
@@ -1039,355 +1039,355 @@ void GCOVProfiler::emitGlobalConstructor(
F->addFnAttr(Attribute::NoInline);
if (Options.NoRedZone)
F->addFnAttr(Attribute::NoRedZone);
-
+
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
IRBuilder<> Builder(BB);
-
+
FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
auto *PFTy = PointerType::get(FTy, 0);
FTy = FunctionType::get(Builder.getVoidTy(), {PFTy, PFTy}, false);
-
+
// Initialize the environment and register the local writeout, flush and
// reset functions.
FunctionCallee GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
Builder.CreateCall(GCOVInit, {WriteoutF, ResetF});
Builder.CreateRetVoid();
-
+
appendToGlobalCtors(*M, F, 0);
-}
-
-FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) {
- Type *Args[] = {
- Type::getInt8PtrTy(*Ctx), // const char *orig_filename
- Type::getInt32Ty(*Ctx), // uint32_t version
- Type::getInt32Ty(*Ctx), // uint32_t checksum
- };
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- AttributeList AL;
- if (auto AK = TLI->getExtAttrForI32Param(false))
- AL = AL.addParamAttribute(*Ctx, 2, AK);
- FunctionCallee Res = M->getOrInsertFunction("llvm_gcda_start_file", FTy, AL);
- return Res;
-}
-
-FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) {
- Type *Args[] = {
- Type::getInt32Ty(*Ctx), // uint32_t ident
- Type::getInt32Ty(*Ctx), // uint32_t func_checksum
- Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
- };
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- AttributeList AL;
- if (auto AK = TLI->getExtAttrForI32Param(false)) {
- AL = AL.addParamAttribute(*Ctx, 0, AK);
- AL = AL.addParamAttribute(*Ctx, 1, AK);
- AL = AL.addParamAttribute(*Ctx, 2, AK);
- }
- return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
-}
-
-FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) {
- Type *Args[] = {
- Type::getInt32Ty(*Ctx), // uint32_t num_counters
- Type::getInt64PtrTy(*Ctx), // uint64_t *counters
- };
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- AttributeList AL;
- if (auto AK = TLI->getExtAttrForI32Param(false))
- AL = AL.addParamAttribute(*Ctx, 0, AK);
- return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy, AL);
-}
-
-FunctionCallee GCOVProfiler::getSummaryInfoFunc() {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
-}
-
-FunctionCallee GCOVProfiler::getEndFileFunc() {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
-}
-
-Function *GCOVProfiler::insertCounterWriteout(
- ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
- FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
- if (!WriteoutF)
- WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
- "__llvm_gcov_writeout", M);
- WriteoutF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- WriteoutF->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- WriteoutF->addFnAttr(Attribute::NoRedZone);
-
- BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
- IRBuilder<> Builder(BB);
-
- auto *TLI = &GetTLI(*WriteoutF);
-
- FunctionCallee StartFile = getStartFileFunc(TLI);
- FunctionCallee EmitFunction = getEmitFunctionFunc(TLI);
- FunctionCallee EmitArcs = getEmitArcsFunc(TLI);
- FunctionCallee SummaryInfo = getSummaryInfoFunc();
- FunctionCallee EndFile = getEndFileFunc();
-
- NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu");
- if (!CUNodes) {
- Builder.CreateRetVoid();
- return WriteoutF;
- }
-
- // Collect the relevant data into a large constant data structure that we can
- // walk to write out everything.
- StructType *StartFileCallArgsTy = StructType::create(
+}
+
+FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) {
+ Type *Args[] = {
+ Type::getInt8PtrTy(*Ctx), // const char *orig_filename
+ Type::getInt32Ty(*Ctx), // uint32_t version
+ Type::getInt32Ty(*Ctx), // uint32_t checksum
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ AttributeList AL;
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ AL = AL.addParamAttribute(*Ctx, 2, AK);
+ FunctionCallee Res = M->getOrInsertFunction("llvm_gcda_start_file", FTy, AL);
+ return Res;
+}
+
+FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) {
+ Type *Args[] = {
+ Type::getInt32Ty(*Ctx), // uint32_t ident
+ Type::getInt32Ty(*Ctx), // uint32_t func_checksum
+ Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ AttributeList AL;
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ AL = AL.addParamAttribute(*Ctx, 0, AK);
+ AL = AL.addParamAttribute(*Ctx, 1, AK);
+ AL = AL.addParamAttribute(*Ctx, 2, AK);
+ }
+ return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+}
+
+FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) {
+ Type *Args[] = {
+ Type::getInt32Ty(*Ctx), // uint32_t num_counters
+ Type::getInt64PtrTy(*Ctx), // uint64_t *counters
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ AttributeList AL;
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ AL = AL.addParamAttribute(*Ctx, 0, AK);
+ return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy, AL);
+}
+
+FunctionCallee GCOVProfiler::getSummaryInfoFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
+FunctionCallee GCOVProfiler::getEndFileFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
+}
+
+Function *GCOVProfiler::insertCounterWriteout(
+ ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
+ FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ if (!WriteoutF)
+ WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_writeout", M);
+ WriteoutF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ WriteoutF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ WriteoutF->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
+ IRBuilder<> Builder(BB);
+
+ auto *TLI = &GetTLI(*WriteoutF);
+
+ FunctionCallee StartFile = getStartFileFunc(TLI);
+ FunctionCallee EmitFunction = getEmitFunctionFunc(TLI);
+ FunctionCallee EmitArcs = getEmitArcsFunc(TLI);
+ FunctionCallee SummaryInfo = getSummaryInfoFunc();
+ FunctionCallee EndFile = getEndFileFunc();
+
+ NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (!CUNodes) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
+
+ // Collect the relevant data into a large constant data structure that we can
+ // walk to write out everything.
+ StructType *StartFileCallArgsTy = StructType::create(
{Builder.getInt8PtrTy(), Builder.getInt32Ty(), Builder.getInt32Ty()},
"start_file_args_ty");
- StructType *EmitFunctionCallArgsTy = StructType::create(
+ StructType *EmitFunctionCallArgsTy = StructType::create(
{Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
"emit_function_args_ty");
- StructType *EmitArcsCallArgsTy = StructType::create(
+ StructType *EmitArcsCallArgsTy = StructType::create(
{Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()},
"emit_arcs_args_ty");
- StructType *FileInfoTy =
- StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(),
- EmitFunctionCallArgsTy->getPointerTo(),
+ StructType *FileInfoTy =
+ StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(),
+ EmitFunctionCallArgsTy->getPointerTo(),
EmitArcsCallArgsTy->getPointerTo()},
"file_info");
-
- Constant *Zero32 = Builder.getInt32(0);
- // Build an explicit array of two zeros for use in ConstantExpr GEP building.
- Constant *TwoZero32s[] = {Zero32, Zero32};
-
- SmallVector<Constant *, 8> FileInfos;
- for (int i : llvm::seq<int>(0, CUNodes->getNumOperands())) {
- auto *CU = cast<DICompileUnit>(CUNodes->getOperand(i));
-
- // Skip module skeleton (and module) CUs.
- if (CU->getDWOId())
- continue;
-
- std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
- uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
- auto *StartFileCallArgs = ConstantStruct::get(
- StartFileCallArgsTy,
- {Builder.CreateGlobalStringPtr(FilenameGcda),
- Builder.getInt32(endian::read32be(Options.Version)),
- Builder.getInt32(CfgChecksum)});
-
- SmallVector<Constant *, 8> EmitFunctionCallArgsArray;
- SmallVector<Constant *, 8> EmitArcsCallArgsArray;
- for (int j : llvm::seq<int>(0, CountersBySP.size())) {
- uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
- EmitFunctionCallArgsArray.push_back(ConstantStruct::get(
- EmitFunctionCallArgsTy,
- {Builder.getInt32(j),
- Builder.getInt32(FuncChecksum),
- Builder.getInt32(CfgChecksum)}));
-
- GlobalVariable *GV = CountersBySP[j].first;
- unsigned Arcs = cast<ArrayType>(GV->getValueType())->getNumElements();
- EmitArcsCallArgsArray.push_back(ConstantStruct::get(
- EmitArcsCallArgsTy,
- {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr(
- GV->getValueType(), GV, TwoZero32s)}));
- }
- // Create global arrays for the two emit calls.
- int CountersSize = CountersBySP.size();
- assert(CountersSize == (int)EmitFunctionCallArgsArray.size() &&
- "Mismatched array size!");
- assert(CountersSize == (int)EmitArcsCallArgsArray.size() &&
- "Mismatched array size!");
- auto *EmitFunctionCallArgsArrayTy =
- ArrayType::get(EmitFunctionCallArgsTy, CountersSize);
- auto *EmitFunctionCallArgsArrayGV = new GlobalVariable(
- *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true,
- GlobalValue::InternalLinkage,
- ConstantArray::get(EmitFunctionCallArgsArrayTy,
- EmitFunctionCallArgsArray),
- Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i));
- auto *EmitArcsCallArgsArrayTy =
- ArrayType::get(EmitArcsCallArgsTy, CountersSize);
- EmitFunctionCallArgsArrayGV->setUnnamedAddr(
- GlobalValue::UnnamedAddr::Global);
- auto *EmitArcsCallArgsArrayGV = new GlobalVariable(
- *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true,
- GlobalValue::InternalLinkage,
- ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray),
- Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i));
- EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-
- FileInfos.push_back(ConstantStruct::get(
- FileInfoTy,
- {StartFileCallArgs, Builder.getInt32(CountersSize),
- ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy,
- EmitFunctionCallArgsArrayGV,
- TwoZero32s),
- ConstantExpr::getInBoundsGetElementPtr(
- EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)}));
- }
-
- // If we didn't find anything to actually emit, bail on out.
- if (FileInfos.empty()) {
- Builder.CreateRetVoid();
- return WriteoutF;
- }
-
- // To simplify code, we cap the number of file infos we write out to fit
- // easily in a 32-bit signed integer. This gives consistent behavior between
- // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit
- // operations on 32-bit systems. It also seems unreasonable to try to handle
- // more than 2 billion files.
- if ((int64_t)FileInfos.size() > (int64_t)INT_MAX)
- FileInfos.resize(INT_MAX);
-
- // Create a global for the entire data structure so we can walk it more
- // easily.
- auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size());
- auto *FileInfoArrayGV = new GlobalVariable(
- *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage,
- ConstantArray::get(FileInfoArrayTy, FileInfos),
- "__llvm_internal_gcov_emit_file_info");
- FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-
- // Create the CFG for walking this data structure.
- auto *FileLoopHeader =
- BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF);
- auto *CounterLoopHeader =
- BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF);
- auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF);
- auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF);
-
- // We always have at least one file, so just branch to the header.
- Builder.CreateBr(FileLoopHeader);
-
- // The index into the files structure is our loop induction variable.
- Builder.SetInsertPoint(FileLoopHeader);
+
+ Constant *Zero32 = Builder.getInt32(0);
+ // Build an explicit array of two zeros for use in ConstantExpr GEP building.
+ Constant *TwoZero32s[] = {Zero32, Zero32};
+
+ SmallVector<Constant *, 8> FileInfos;
+ for (int i : llvm::seq<int>(0, CUNodes->getNumOperands())) {
+ auto *CU = cast<DICompileUnit>(CUNodes->getOperand(i));
+
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
+ std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
+ uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
+ auto *StartFileCallArgs = ConstantStruct::get(
+ StartFileCallArgsTy,
+ {Builder.CreateGlobalStringPtr(FilenameGcda),
+ Builder.getInt32(endian::read32be(Options.Version)),
+ Builder.getInt32(CfgChecksum)});
+
+ SmallVector<Constant *, 8> EmitFunctionCallArgsArray;
+ SmallVector<Constant *, 8> EmitArcsCallArgsArray;
+ for (int j : llvm::seq<int>(0, CountersBySP.size())) {
+ uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
+ EmitFunctionCallArgsArray.push_back(ConstantStruct::get(
+ EmitFunctionCallArgsTy,
+ {Builder.getInt32(j),
+ Builder.getInt32(FuncChecksum),
+ Builder.getInt32(CfgChecksum)}));
+
+ GlobalVariable *GV = CountersBySP[j].first;
+ unsigned Arcs = cast<ArrayType>(GV->getValueType())->getNumElements();
+ EmitArcsCallArgsArray.push_back(ConstantStruct::get(
+ EmitArcsCallArgsTy,
+ {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr(
+ GV->getValueType(), GV, TwoZero32s)}));
+ }
+ // Create global arrays for the two emit calls.
+ int CountersSize = CountersBySP.size();
+ assert(CountersSize == (int)EmitFunctionCallArgsArray.size() &&
+ "Mismatched array size!");
+ assert(CountersSize == (int)EmitArcsCallArgsArray.size() &&
+ "Mismatched array size!");
+ auto *EmitFunctionCallArgsArrayTy =
+ ArrayType::get(EmitFunctionCallArgsTy, CountersSize);
+ auto *EmitFunctionCallArgsArrayGV = new GlobalVariable(
+ *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i));
+ auto *EmitArcsCallArgsArrayTy =
+ ArrayType::get(EmitArcsCallArgsTy, CountersSize);
+ EmitFunctionCallArgsArrayGV->setUnnamedAddr(
+ GlobalValue::UnnamedAddr::Global);
+ auto *EmitArcsCallArgsArrayGV = new GlobalVariable(
+ *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i));
+ EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ FileInfos.push_back(ConstantStruct::get(
+ FileInfoTy,
+ {StartFileCallArgs, Builder.getInt32(CountersSize),
+ ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArrayGV,
+ TwoZero32s),
+ ConstantExpr::getInBoundsGetElementPtr(
+ EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)}));
+ }
+
+ // If we didn't find anything to actually emit, bail on out.
+ if (FileInfos.empty()) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
+
+ // To simplify code, we cap the number of file infos we write out to fit
+ // easily in a 32-bit signed integer. This gives consistent behavior between
+ // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit
+ // operations on 32-bit systems. It also seems unreasonable to try to handle
+ // more than 2 billion files.
+ if ((int64_t)FileInfos.size() > (int64_t)INT_MAX)
+ FileInfos.resize(INT_MAX);
+
+ // Create a global for the entire data structure so we can walk it more
+ // easily.
+ auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size());
+ auto *FileInfoArrayGV = new GlobalVariable(
+ *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage,
+ ConstantArray::get(FileInfoArrayTy, FileInfos),
+ "__llvm_internal_gcov_emit_file_info");
+ FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ // Create the CFG for walking this data structure.
+ auto *FileLoopHeader =
+ BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF);
+ auto *CounterLoopHeader =
+ BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF);
+ auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF);
+ auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF);
+
+ // We always have at least one file, so just branch to the header.
+ Builder.CreateBr(FileLoopHeader);
+
+ // The index into the files structure is our loop induction variable.
+ Builder.SetInsertPoint(FileLoopHeader);
PHINode *IV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2,
"file_idx");
- IV->addIncoming(Builder.getInt32(0), BB);
- auto *FileInfoPtr = Builder.CreateInBoundsGEP(
- FileInfoArrayTy, FileInfoArrayGV, {Builder.getInt32(0), IV});
- auto *StartFileCallArgsPtr =
+ IV->addIncoming(Builder.getInt32(0), BB);
+ auto *FileInfoPtr = Builder.CreateInBoundsGEP(
+ FileInfoArrayTy, FileInfoArrayGV, {Builder.getInt32(0), IV});
+ auto *StartFileCallArgsPtr =
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 0, "start_file_args");
- auto *StartFileCall = Builder.CreateCall(
- StartFile,
- {Builder.CreateLoad(StartFileCallArgsTy->getElementType(0),
- Builder.CreateStructGEP(StartFileCallArgsTy,
+ auto *StartFileCall = Builder.CreateCall(
+ StartFile,
+ {Builder.CreateLoad(StartFileCallArgsTy->getElementType(0),
+ Builder.CreateStructGEP(StartFileCallArgsTy,
StartFileCallArgsPtr, 0),
"filename"),
- Builder.CreateLoad(StartFileCallArgsTy->getElementType(1),
- Builder.CreateStructGEP(StartFileCallArgsTy,
+ Builder.CreateLoad(StartFileCallArgsTy->getElementType(1),
+ Builder.CreateStructGEP(StartFileCallArgsTy,
StartFileCallArgsPtr, 1),
"version"),
- Builder.CreateLoad(StartFileCallArgsTy->getElementType(2),
- Builder.CreateStructGEP(StartFileCallArgsTy,
+ Builder.CreateLoad(StartFileCallArgsTy->getElementType(2),
+ Builder.CreateStructGEP(StartFileCallArgsTy,
StartFileCallArgsPtr, 2),
"stamp")});
- if (auto AK = TLI->getExtAttrForI32Param(false))
- StartFileCall->addParamAttr(2, AK);
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ StartFileCall->addParamAttr(2, AK);
auto *NumCounters = Builder.CreateLoad(
FileInfoTy->getElementType(1),
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 1), "num_ctrs");
- auto *EmitFunctionCallArgsArray =
- Builder.CreateLoad(FileInfoTy->getElementType(2),
+ auto *EmitFunctionCallArgsArray =
+ Builder.CreateLoad(FileInfoTy->getElementType(2),
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 2),
"emit_function_args");
auto *EmitArcsCallArgsArray = Builder.CreateLoad(
FileInfoTy->getElementType(3),
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 3), "emit_arcs_args");
- auto *EnterCounterLoopCond =
- Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters);
- Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch);
-
- Builder.SetInsertPoint(CounterLoopHeader);
+ auto *EnterCounterLoopCond =
+ Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters);
+ Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch);
+
+ Builder.SetInsertPoint(CounterLoopHeader);
auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2,
"ctr_idx");
- JV->addIncoming(Builder.getInt32(0), FileLoopHeader);
- auto *EmitFunctionCallArgsPtr = Builder.CreateInBoundsGEP(
- EmitFunctionCallArgsTy, EmitFunctionCallArgsArray, JV);
- auto *EmitFunctionCall = Builder.CreateCall(
- EmitFunction,
- {Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(0),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
+ JV->addIncoming(Builder.getInt32(0), FileLoopHeader);
+ auto *EmitFunctionCallArgsPtr = Builder.CreateInBoundsGEP(
+ EmitFunctionCallArgsTy, EmitFunctionCallArgsArray, JV);
+ auto *EmitFunctionCall = Builder.CreateCall(
+ EmitFunction,
+ {Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(0),
+ Builder.CreateStructGEP(EmitFunctionCallArgsTy,
EmitFunctionCallArgsPtr, 0),
"ident"),
- Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(1),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
+ Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(1),
+ Builder.CreateStructGEP(EmitFunctionCallArgsTy,
EmitFunctionCallArgsPtr, 1),
"func_checksum"),
- Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(2),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
+ Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(2),
+ Builder.CreateStructGEP(EmitFunctionCallArgsTy,
EmitFunctionCallArgsPtr, 2),
"cfg_checksum")});
- if (auto AK = TLI->getExtAttrForI32Param(false)) {
- EmitFunctionCall->addParamAttr(0, AK);
- EmitFunctionCall->addParamAttr(1, AK);
- EmitFunctionCall->addParamAttr(2, AK);
- }
- auto *EmitArcsCallArgsPtr =
- Builder.CreateInBoundsGEP(EmitArcsCallArgsTy, EmitArcsCallArgsArray, JV);
- auto *EmitArcsCall = Builder.CreateCall(
- EmitArcs,
- {Builder.CreateLoad(
- EmitArcsCallArgsTy->getElementType(0),
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ EmitFunctionCall->addParamAttr(0, AK);
+ EmitFunctionCall->addParamAttr(1, AK);
+ EmitFunctionCall->addParamAttr(2, AK);
+ }
+ auto *EmitArcsCallArgsPtr =
+ Builder.CreateInBoundsGEP(EmitArcsCallArgsTy, EmitArcsCallArgsArray, JV);
+ auto *EmitArcsCall = Builder.CreateCall(
+ EmitArcs,
+ {Builder.CreateLoad(
+ EmitArcsCallArgsTy->getElementType(0),
Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 0),
"num_counters"),
Builder.CreateLoad(
EmitArcsCallArgsTy->getElementType(1),
Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 1),
"counters")});
- if (auto AK = TLI->getExtAttrForI32Param(false))
- EmitArcsCall->addParamAttr(0, AK);
- auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1));
- auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters);
- Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, FileLoopLatch);
- JV->addIncoming(NextJV, CounterLoopHeader);
-
- Builder.SetInsertPoint(FileLoopLatch);
- Builder.CreateCall(SummaryInfo, {});
- Builder.CreateCall(EndFile, {});
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ EmitArcsCall->addParamAttr(0, AK);
+ auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1));
+ auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters);
+ Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, FileLoopLatch);
+ JV->addIncoming(NextJV, CounterLoopHeader);
+
+ Builder.SetInsertPoint(FileLoopLatch);
+ Builder.CreateCall(SummaryInfo, {});
+ Builder.CreateCall(EndFile, {});
auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1), "next_file_idx");
- auto *FileLoopCond =
- Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size()));
- Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB);
- IV->addIncoming(NextIV, FileLoopLatch);
-
- Builder.SetInsertPoint(ExitBB);
- Builder.CreateRetVoid();
-
- return WriteoutF;
-}
-
-Function *GCOVProfiler::insertReset(
- ArrayRef<std::pair<GlobalVariable *, MDNode *>> CountersBySP) {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *ResetF = M->getFunction("__llvm_gcov_reset");
- if (!ResetF)
- ResetF = Function::Create(FTy, GlobalValue::InternalLinkage,
- "__llvm_gcov_reset", M);
- ResetF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- ResetF->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- ResetF->addFnAttr(Attribute::NoRedZone);
-
- BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", ResetF);
- IRBuilder<> Builder(Entry);
-
- // Zero out the counters.
- for (const auto &I : CountersBySP) {
- GlobalVariable *GV = I.first;
- Constant *Null = Constant::getNullValue(GV->getValueType());
- Builder.CreateStore(Null, GV);
- }
-
- Type *RetTy = ResetF->getReturnType();
- if (RetTy->isVoidTy())
- Builder.CreateRetVoid();
- else if (RetTy->isIntegerTy())
- // Used if __llvm_gcov_reset was implicitly declared.
- Builder.CreateRet(ConstantInt::get(RetTy, 0));
- else
- report_fatal_error("invalid return type for __llvm_gcov_reset");
-
- return ResetF;
-}
+ auto *FileLoopCond =
+ Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size()));
+ Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB);
+ IV->addIncoming(NextIV, FileLoopLatch);
+
+ Builder.SetInsertPoint(ExitBB);
+ Builder.CreateRetVoid();
+
+ return WriteoutF;
+}
+
+Function *GCOVProfiler::insertReset(
+ ArrayRef<std::pair<GlobalVariable *, MDNode *>> CountersBySP) {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *ResetF = M->getFunction("__llvm_gcov_reset");
+ if (!ResetF)
+ ResetF = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_reset", M);
+ ResetF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ ResetF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ ResetF->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", ResetF);
+ IRBuilder<> Builder(Entry);
+
+ // Zero out the counters.
+ for (const auto &I : CountersBySP) {
+ GlobalVariable *GV = I.first;
+ Constant *Null = Constant::getNullValue(GV->getValueType());
+ Builder.CreateStore(Null, GV);
+ }
+
+ Type *RetTy = ResetF->getReturnType();
+ if (RetTy->isVoidTy())
+ Builder.CreateRetVoid();
+ else if (RetTy->isIntegerTy())
+ // Used if __llvm_gcov_reset was implicitly declared.
+ Builder.CreateRet(ConstantInt::get(RetTy, 0));
+ else
+ report_fatal_error("invalid return type for __llvm_gcov_reset");
+
+ return ResetF;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 1dffdacc3a..fedd9bfc97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1,375 +1,375 @@
-//===- HWAddressSanitizer.cpp - detector of uninitialized reads -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file is a part of HWAddressSanitizer, an address sanity checker
-/// based on tagged addressing.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <sstream>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "hwasan"
-
+//===- HWAddressSanitizer.cpp - detector of uninitialized reads -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of HWAddressSanitizer, an address sanity checker
+/// based on tagged addressing.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hwasan"
+
const char kHwasanModuleCtorName[] = "hwasan.module_ctor";
const char kHwasanNoteName[] = "hwasan.note";
const char kHwasanInitName[] = "__hwasan_init";
const char kHwasanPersonalityThunkName[] = "__hwasan_personality_thunk";
-
+
const char kHwasanShadowMemoryDynamicAddress[] =
- "__hwasan_shadow_memory_dynamic_address";
-
-// Access sizes are powers of two: 1, 2, 4, 8, 16.
-static const size_t kNumberOfAccessSizes = 5;
-
-static const size_t kDefaultShadowScale = 4;
-static const uint64_t kDynamicShadowSentinel =
- std::numeric_limits<uint64_t>::max();
-static const unsigned kPointerTagShift = 56;
-
-static const unsigned kShadowBaseAlignment = 32;
-
-static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
- "hwasan-memory-access-callback-prefix",
- cl::desc("Prefix for memory access callbacks"), cl::Hidden,
- cl::init("__hwasan_"));
-
-static cl::opt<bool>
- ClInstrumentWithCalls("hwasan-instrument-with-calls",
- cl::desc("instrument reads and writes with callbacks"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClInstrumentReads("hwasan-instrument-reads",
- cl::desc("instrument read instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentWrites(
- "hwasan-instrument-writes", cl::desc("instrument write instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentAtomics(
- "hwasan-instrument-atomics",
- cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClInstrumentByval("hwasan-instrument-byval",
- cl::desc("instrument byval arguments"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClRecover(
- "hwasan-recover",
- cl::desc("Enable recovery mode (continue-after-error)."),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
- cl::desc("instrument stack (allocas)"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClUARRetagToZero(
- "hwasan-uar-retag-to-zero",
- cl::desc("Clear alloca tags before returning from the function to allow "
- "non-instrumented and instrumented function calls mix. When set "
- "to false, allocas are retagged before returning from the "
- "function to detect use after return."),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClGenerateTagsWithCalls(
- "hwasan-generate-tags-with-calls",
- cl::desc("generate new tags with runtime library calls"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"),
- cl::Hidden, cl::init(false), cl::ZeroOrMore);
-
-static cl::opt<int> ClMatchAllTag(
- "hwasan-match-all-tag",
- cl::desc("don't report bad accesses via pointers with this tag"),
- cl::Hidden, cl::init(-1));
-
-static cl::opt<bool> ClEnableKhwasan(
- "hwasan-kernel",
- cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
- cl::Hidden, cl::init(false));
-
-// These flags allow changing the shadow mapping and control how shadow memory
-// is accessed. The shadow mapping looks like:
-// Shadow = (Mem >> scale) + offset
-
-static cl::opt<uint64_t>
- ClMappingOffset("hwasan-mapping-offset",
- cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<bool>
- ClWithIfunc("hwasan-with-ifunc",
- cl::desc("Access dynamic shadow through an ifunc global on "
- "platforms that support this"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClWithTls(
- "hwasan-with-tls",
- cl::desc("Access dynamic shadow through an thread-local pointer on "
- "platforms that support this"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool>
- ClRecordStackHistory("hwasan-record-stack-history",
- cl::desc("Record stack frames with tagged allocations "
- "in a thread-local ring buffer"),
- cl::Hidden, cl::init(true));
-static cl::opt<bool>
- ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics",
- cl::desc("instrument memory intrinsics"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool>
- ClInstrumentLandingPads("hwasan-instrument-landing-pads",
- cl::desc("instrument landing pads"), cl::Hidden,
- cl::init(false), cl::ZeroOrMore);
-
-static cl::opt<bool> ClUseShortGranules(
- "hwasan-use-short-granules",
- cl::desc("use short granules in allocas and outlined checks"), cl::Hidden,
- cl::init(false), cl::ZeroOrMore);
-
-static cl::opt<bool> ClInstrumentPersonalityFunctions(
- "hwasan-instrument-personality-functions",
- cl::desc("instrument personality functions"), cl::Hidden, cl::init(false),
- cl::ZeroOrMore);
-
-static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks",
- cl::desc("inline all checks"),
- cl::Hidden, cl::init(false));
-
-namespace {
-
-/// An instrumentation pass implementing detection of addressability bugs
-/// using tagged pointers.
-class HWAddressSanitizer {
-public:
- explicit HWAddressSanitizer(Module &M, bool CompileKernel = false,
- bool Recover = false) : M(M) {
- this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
- this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ?
- ClEnableKhwasan : CompileKernel;
-
- initializeModule();
- }
-
- bool sanitizeFunction(Function &F);
- void initializeModule();
+ "__hwasan_shadow_memory_dynamic_address";
+
+// Access sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
+static const size_t kDefaultShadowScale = 4;
+static const uint64_t kDynamicShadowSentinel =
+ std::numeric_limits<uint64_t>::max();
+static const unsigned kPointerTagShift = 56;
+
+static const unsigned kShadowBaseAlignment = 32;
+
+static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
+ "hwasan-memory-access-callback-prefix",
+ cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+ cl::init("__hwasan_"));
+
+static cl::opt<bool>
+ ClInstrumentWithCalls("hwasan-instrument-with-calls",
+ cl::desc("instrument reads and writes with callbacks"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInstrumentReads("hwasan-instrument-reads",
+ cl::desc("instrument read instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentWrites(
+ "hwasan-instrument-writes", cl::desc("instrument write instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentAtomics(
+ "hwasan-instrument-atomics",
+ cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClInstrumentByval("hwasan-instrument-byval",
+ cl::desc("instrument byval arguments"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClRecover(
+ "hwasan-recover",
+ cl::desc("Enable recovery mode (continue-after-error)."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
+ cl::desc("instrument stack (allocas)"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClUARRetagToZero(
+ "hwasan-uar-retag-to-zero",
+ cl::desc("Clear alloca tags before returning from the function to allow "
+ "non-instrumented and instrumented function calls mix. When set "
+ "to false, allocas are retagged before returning from the "
+ "function to detect use after return."),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClGenerateTagsWithCalls(
+ "hwasan-generate-tags-with-calls",
+ cl::desc("generate new tags with runtime library calls"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<int> ClMatchAllTag(
+ "hwasan-match-all-tag",
+ cl::desc("don't report bad accesses via pointers with this tag"),
+ cl::Hidden, cl::init(-1));
+
+static cl::opt<bool> ClEnableKhwasan(
+ "hwasan-kernel",
+ cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+// These flags allow changing the shadow mapping and control how shadow memory
+// is accessed. The shadow mapping looks like:
+// Shadow = (Mem >> scale) + offset
+
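+// The mapping described above is plain integer arithmetic. A minimal
+// standalone sketch, assuming the default scale of 4 (one shadow byte per
+// 16-byte granule) and an offset picked purely for illustration; the pass
+// itself emits this computation as IR in memToShadow() further down.
+#include <cstdint>
+
+constexpr uint64_t sketchMemToShadow(uint64_t Mem, unsigned Scale,
+                                     uint64_t Offset) {
+  return (Mem >> Scale) + Offset; // Shadow = (Mem >> scale) + offset
+}
+
+// All 16 bytes of a granule share one shadow byte.
+static_assert(sketchMemToShadow(0x2000, 4, 0x100000000ULL) ==
+                  sketchMemToShadow(0x200f, 4, 0x100000000ULL),
+              "one shadow byte per 16-byte granule");
+static_assert(sketchMemToShadow(0x2010, 4, 0x100000000ULL) ==
+                  sketchMemToShadow(0x2000, 4, 0x100000000ULL) + 1,
+              "the next granule maps to the next shadow byte");
+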
+static cl::opt<uint64_t>
+ ClMappingOffset("hwasan-mapping-offset",
+ cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<bool>
+ ClWithIfunc("hwasan-with-ifunc",
+ cl::desc("Access dynamic shadow through an ifunc global on "
+ "platforms that support this"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClWithTls(
+ "hwasan-with-tls",
+ cl::desc("Access dynamic shadow through an thread-local pointer on "
+ "platforms that support this"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ ClRecordStackHistory("hwasan-record-stack-history",
+ cl::desc("Record stack frames with tagged allocations "
+ "in a thread-local ring buffer"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool>
+ ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics",
+ cl::desc("instrument memory intrinsics"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ ClInstrumentLandingPads("hwasan-instrument-landing-pads",
+ cl::desc("instrument landing pads"), cl::Hidden,
+ cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> ClUseShortGranules(
+ "hwasan-use-short-granules",
+ cl::desc("use short granules in allocas and outlined checks"), cl::Hidden,
+ cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> ClInstrumentPersonalityFunctions(
+ "hwasan-instrument-personality-functions",
+ cl::desc("instrument personality functions"), cl::Hidden, cl::init(false),
+ cl::ZeroOrMore);
+
+static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks",
+ cl::desc("inline all checks"),
+ cl::Hidden, cl::init(false));
+
+namespace {
+
+/// An instrumentation pass implementing detection of addressability bugs
+/// using tagged pointers.
+class HWAddressSanitizer {
+public:
+ explicit HWAddressSanitizer(Module &M, bool CompileKernel = false,
+ bool Recover = false) : M(M) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+ this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ?
+ ClEnableKhwasan : CompileKernel;
+
+ initializeModule();
+ }
+
+ bool sanitizeFunction(Function &F);
+ void initializeModule();
void createHwasanCtorComdat();
-
- void initializeCallbacks(Module &M);
-
+
+ void initializeCallbacks(Module &M);
+
Value *getOpaqueNoopCast(IRBuilder<> &IRB, Value *Val);
- Value *getDynamicShadowIfunc(IRBuilder<> &IRB);
+ Value *getDynamicShadowIfunc(IRBuilder<> &IRB);
Value *getShadowNonTls(IRBuilder<> &IRB);
-
- void untagPointerOperand(Instruction *I, Value *Addr);
- Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
- void instrumentMemAccessInline(Value *Ptr, bool IsWrite,
- unsigned AccessSizeIndex,
- Instruction *InsertBefore);
- void instrumentMemIntrinsic(MemIntrinsic *MI);
- bool instrumentMemAccess(InterestingMemoryOperand &O);
- bool ignoreAccess(Value *Ptr);
- void getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
-
- bool isInterestingAlloca(const AllocaInst &AI);
- bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
- Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
- Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
- bool instrumentStack(
- SmallVectorImpl<AllocaInst *> &Allocas,
- DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
- SmallVectorImpl<Instruction *> &RetVec, Value *StackTag);
- Value *readRegister(IRBuilder<> &IRB, StringRef Name);
- bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
- Value *getNextTagWithCall(IRBuilder<> &IRB);
- Value *getStackBaseTag(IRBuilder<> &IRB);
- Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI,
- unsigned AllocaNo);
- Value *getUARTag(IRBuilder<> &IRB, Value *StackTag);
-
- Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty);
- void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord);
-
- void instrumentGlobal(GlobalVariable *GV, uint8_t Tag);
- void instrumentGlobals();
-
- void instrumentPersonalityFunctions();
-
-private:
- LLVMContext *C;
- Module &M;
- Triple TargetTriple;
- FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset;
- FunctionCallee HWAsanHandleVfork;
-
- /// This struct defines the shadow mapping using the rule:
- /// shadow = (mem >> Scale) + Offset.
- /// If InGlobal is true, then
- /// extern char __hwasan_shadow[];
- /// shadow = (mem >> Scale) + &__hwasan_shadow
- /// If InTls is true, then
- /// extern char *__hwasan_tls;
- /// shadow = (mem>>Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
- struct ShadowMapping {
- int Scale;
- uint64_t Offset;
- bool InGlobal;
- bool InTls;
-
- void init(Triple &TargetTriple);
- unsigned getObjectAlignment() const { return 1U << Scale; }
- };
- ShadowMapping Mapping;
-
- Type *VoidTy = Type::getVoidTy(M.getContext());
- Type *IntptrTy;
- Type *Int8PtrTy;
- Type *Int8Ty;
- Type *Int32Ty;
- Type *Int64Ty = Type::getInt64Ty(M.getContext());
-
- bool CompileKernel;
- bool Recover;
+
+ void untagPointerOperand(Instruction *I, Value *Addr);
+ Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+ void instrumentMemAccessInline(Value *Ptr, bool IsWrite,
+ unsigned AccessSizeIndex,
+ Instruction *InsertBefore);
+ void instrumentMemIntrinsic(MemIntrinsic *MI);
+ bool instrumentMemAccess(InterestingMemoryOperand &O);
+ bool ignoreAccess(Value *Ptr);
+ void getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
+
+ bool isInterestingAlloca(const AllocaInst &AI);
+ bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
+ Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
+ Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
+ bool instrumentStack(
+ SmallVectorImpl<AllocaInst *> &Allocas,
+ DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
+ SmallVectorImpl<Instruction *> &RetVec, Value *StackTag);
+ Value *readRegister(IRBuilder<> &IRB, StringRef Name);
+ bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
+ Value *getNextTagWithCall(IRBuilder<> &IRB);
+ Value *getStackBaseTag(IRBuilder<> &IRB);
+ Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI,
+ unsigned AllocaNo);
+ Value *getUARTag(IRBuilder<> &IRB, Value *StackTag);
+
+ Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty);
+ void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord);
+
+ void instrumentGlobal(GlobalVariable *GV, uint8_t Tag);
+ void instrumentGlobals();
+
+ void instrumentPersonalityFunctions();
+
+private:
+ LLVMContext *C;
+ Module &M;
+ Triple TargetTriple;
+ FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset;
+ FunctionCallee HWAsanHandleVfork;
+
+ /// This struct defines the shadow mapping using the rule:
+ /// shadow = (mem >> Scale) + Offset.
+ /// If InGlobal is true, then
+ /// extern char __hwasan_shadow[];
+ /// shadow = (mem >> Scale) + &__hwasan_shadow
+ /// If InTls is true, then
+ /// extern char *__hwasan_tls;
+ /// shadow = (mem>>Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
+ struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool InGlobal;
+ bool InTls;
+
+ void init(Triple &TargetTriple);
+ unsigned getObjectAlignment() const { return 1U << Scale; }
+ };
+ ShadowMapping Mapping;
+
+ Type *VoidTy = Type::getVoidTy(M.getContext());
+ Type *IntptrTy;
+ Type *Int8PtrTy;
+ Type *Int8Ty;
+ Type *Int32Ty;
+ Type *Int64Ty = Type::getInt64Ty(M.getContext());
+
+ bool CompileKernel;
+ bool Recover;
bool OutlinedChecks;
- bool UseShortGranules;
- bool InstrumentLandingPads;
-
+ bool UseShortGranules;
+ bool InstrumentLandingPads;
+
bool HasMatchAllTag = false;
uint8_t MatchAllTag = 0;
- Function *HwasanCtorFunction;
-
- FunctionCallee HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
- FunctionCallee HwasanMemoryAccessCallbackSized[2];
-
- FunctionCallee HwasanTagMemoryFunc;
- FunctionCallee HwasanGenerateTagFunc;
-
- Constant *ShadowGlobal;
-
+ Function *HwasanCtorFunction;
+
+ FunctionCallee HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
+ FunctionCallee HwasanMemoryAccessCallbackSized[2];
+
+ FunctionCallee HwasanTagMemoryFunc;
+ FunctionCallee HwasanGenerateTagFunc;
+
+ Constant *ShadowGlobal;
+
Value *ShadowBase = nullptr;
- Value *StackBaseTag = nullptr;
- GlobalValue *ThreadPtrGlobal = nullptr;
-};
-
-class HWAddressSanitizerLegacyPass : public FunctionPass {
-public:
- // Pass identification, replacement for typeid.
- static char ID;
-
- explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false,
- bool Recover = false)
- : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {
- initializeHWAddressSanitizerLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "HWAddressSanitizer"; }
-
- bool doInitialization(Module &M) override {
- HWASan = std::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover);
- return true;
- }
-
- bool runOnFunction(Function &F) override {
- return HWASan->sanitizeFunction(F);
- }
-
- bool doFinalization(Module &M) override {
- HWASan.reset();
- return false;
- }
-
-private:
- std::unique_ptr<HWAddressSanitizer> HWASan;
- bool CompileKernel;
- bool Recover;
-};
-
-} // end anonymous namespace
-
-char HWAddressSanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(
- HWAddressSanitizerLegacyPass, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
- false)
-INITIALIZE_PASS_END(
- HWAddressSanitizerLegacyPass, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
- false)
-
-FunctionPass *llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel,
- bool Recover) {
- assert(!CompileKernel || Recover);
- return new HWAddressSanitizerLegacyPass(CompileKernel, Recover);
-}
-
-HWAddressSanitizerPass::HWAddressSanitizerPass(bool CompileKernel, bool Recover)
- : CompileKernel(CompileKernel), Recover(Recover) {}
-
-PreservedAnalyses HWAddressSanitizerPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- HWAddressSanitizer HWASan(M, CompileKernel, Recover);
- bool Modified = false;
- for (Function &F : M)
- Modified |= HWASan.sanitizeFunction(F);
- if (Modified)
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
+ Value *StackBaseTag = nullptr;
+ GlobalValue *ThreadPtrGlobal = nullptr;
+};
+
+class HWAddressSanitizerLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid.
+ static char ID;
+
+ explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false,
+ bool Recover = false)
+ : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {
+ initializeHWAddressSanitizerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "HWAddressSanitizer"; }
+
+ bool doInitialization(Module &M) override {
+ HWASan = std::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover);
+ return true;
+ }
+
+ bool runOnFunction(Function &F) override {
+ return HWASan->sanitizeFunction(F);
+ }
+
+ bool doFinalization(Module &M) override {
+ HWASan.reset();
+ return false;
+ }
+
+private:
+ std::unique_ptr<HWAddressSanitizer> HWASan;
+ bool CompileKernel;
+ bool Recover;
+};
+
+} // end anonymous namespace
+
+char HWAddressSanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+ HWAddressSanitizerLegacyPass, "hwasan",
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
+INITIALIZE_PASS_END(
+ HWAddressSanitizerLegacyPass, "hwasan",
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
+
+FunctionPass *llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel,
+ bool Recover) {
+ assert(!CompileKernel || Recover);
+ return new HWAddressSanitizerLegacyPass(CompileKernel, Recover);
+}
+
+HWAddressSanitizerPass::HWAddressSanitizerPass(bool CompileKernel, bool Recover)
+ : CompileKernel(CompileKernel), Recover(Recover) {}
+
+PreservedAnalyses HWAddressSanitizerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ HWAddressSanitizer HWASan(M, CompileKernel, Recover);
+ bool Modified = false;
+ for (Function &F : M)
+ Modified |= HWASan.sanitizeFunction(F);
+ if (Modified)
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
void HWAddressSanitizer::createHwasanCtorComdat() {
std::tie(HwasanCtorFunction, std::ignore) =
getOrCreateSanitizerCtorAndInitFunctions(
@@ -470,38 +470,38 @@ void HWAddressSanitizer::createHwasanCtorComdat() {
appendToCompilerUsed(M, Dummy);
}
-/// Module-level initialization.
-///
-/// Inserts a call to __hwasan_init into the module's constructor list.
-void HWAddressSanitizer::initializeModule() {
- LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
- auto &DL = M.getDataLayout();
-
- TargetTriple = Triple(M.getTargetTriple());
-
- Mapping.init(TargetTriple);
-
- C = &(M.getContext());
- IRBuilder<> IRB(*C);
- IntptrTy = IRB.getIntPtrTy(DL);
- Int8PtrTy = IRB.getInt8PtrTy();
- Int8Ty = IRB.getInt8Ty();
- Int32Ty = IRB.getInt32Ty();
-
- HwasanCtorFunction = nullptr;
-
- // Older versions of Android do not have the required runtime support for
- // short granules, global or personality function instrumentation. On other
- // platforms we currently require using the latest version of the runtime.
- bool NewRuntime =
- !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30);
-
- UseShortGranules =
- ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime;
+/// Module-level initialization.
+///
+/// Inserts a call to __hwasan_init into the module's constructor list.
+void HWAddressSanitizer::initializeModule() {
+ LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
+ auto &DL = M.getDataLayout();
+
+ TargetTriple = Triple(M.getTargetTriple());
+
+ Mapping.init(TargetTriple);
+
+ C = &(M.getContext());
+ IRBuilder<> IRB(*C);
+ IntptrTy = IRB.getIntPtrTy(DL);
+ Int8PtrTy = IRB.getInt8PtrTy();
+ Int8Ty = IRB.getInt8Ty();
+ Int32Ty = IRB.getInt32Ty();
+
+ HwasanCtorFunction = nullptr;
+
+ // Older versions of Android do not have the required runtime support for
+ // short granules, global or personality function instrumentation. On other
+ // platforms we currently require using the latest version of the runtime.
+ bool NewRuntime =
+ !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30);
+
+ UseShortGranules =
+ ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime;
OutlinedChecks =
TargetTriple.isAArch64() && TargetTriple.isOSBinFormatELF() &&
(ClInlineAllChecks.getNumOccurrences() ? !ClInlineAllChecks : !Recover);
-
+
if (ClMatchAllTag.getNumOccurrences()) {
if (ClMatchAllTag != -1) {
HasMatchAllTag = true;
@@ -512,86 +512,86 @@ void HWAddressSanitizer::initializeModule() {
MatchAllTag = 0xFF;
}
- // If we don't have personality function support, fall back to landing pads.
- InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences()
- ? ClInstrumentLandingPads
- : !NewRuntime;
-
- if (!CompileKernel) {
+ // If we don't have personality function support, fall back to landing pads.
+ InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences()
+ ? ClInstrumentLandingPads
+ : !NewRuntime;
+
+ if (!CompileKernel) {
createHwasanCtorComdat();
- bool InstrumentGlobals =
- ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime;
- if (InstrumentGlobals)
- instrumentGlobals();
-
- bool InstrumentPersonalityFunctions =
- ClInstrumentPersonalityFunctions.getNumOccurrences()
- ? ClInstrumentPersonalityFunctions
- : NewRuntime;
- if (InstrumentPersonalityFunctions)
- instrumentPersonalityFunctions();
- }
-
- if (!TargetTriple.isAndroid()) {
- Constant *C = M.getOrInsertGlobal("__hwasan_tls", IntptrTy, [&] {
- auto *GV = new GlobalVariable(M, IntptrTy, /*isConstant=*/false,
- GlobalValue::ExternalLinkage, nullptr,
- "__hwasan_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- appendToCompilerUsed(M, GV);
- return GV;
- });
- ThreadPtrGlobal = cast<GlobalVariable>(C);
- }
-}
-
-void HWAddressSanitizer::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
- for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
- const std::string TypeStr = AccessIsWrite ? "store" : "load";
- const std::string EndingStr = Recover ? "_noabort" : "";
-
- HwasanMemoryAccessCallbackSized[AccessIsWrite] = M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr,
- FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false));
-
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
- M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + TypeStr +
- itostr(1ULL << AccessSizeIndex) + EndingStr,
- FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false));
- }
- }
-
- HwasanTagMemoryFunc = M.getOrInsertFunction(
- "__hwasan_tag_memory", IRB.getVoidTy(), Int8PtrTy, Int8Ty, IntptrTy);
- HwasanGenerateTagFunc =
- M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty);
-
- ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow",
- ArrayType::get(IRB.getInt8Ty(), 0));
-
- const std::string MemIntrinCallbackPrefix =
- CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
- HWAsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- HWAsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- HWAsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy);
-
- HWAsanHandleVfork =
- M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy);
-}
-
+ bool InstrumentGlobals =
+ ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime;
+ if (InstrumentGlobals)
+ instrumentGlobals();
+
+ bool InstrumentPersonalityFunctions =
+ ClInstrumentPersonalityFunctions.getNumOccurrences()
+ ? ClInstrumentPersonalityFunctions
+ : NewRuntime;
+ if (InstrumentPersonalityFunctions)
+ instrumentPersonalityFunctions();
+ }
+
+ if (!TargetTriple.isAndroid()) {
+ Constant *C = M.getOrInsertGlobal("__hwasan_tls", IntptrTy, [&] {
+ auto *GV = new GlobalVariable(M, IntptrTy, /*isConstant=*/false,
+ GlobalValue::ExternalLinkage, nullptr,
+ "__hwasan_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ appendToCompilerUsed(M, GV);
+ return GV;
+ });
+ ThreadPtrGlobal = cast<GlobalVariable>(C);
+ }
+}
+
+void HWAddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+ const std::string TypeStr = AccessIsWrite ? "store" : "load";
+ const std::string EndingStr = Recover ? "_noabort" : "";
+
+ HwasanMemoryAccessCallbackSized[AccessIsWrite] = M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false));
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
+ M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + TypeStr +
+ itostr(1ULL << AccessSizeIndex) + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false));
+ }
+ }
+
+ HwasanTagMemoryFunc = M.getOrInsertFunction(
+ "__hwasan_tag_memory", IRB.getVoidTy(), Int8PtrTy, Int8Ty, IntptrTy);
+ HwasanGenerateTagFunc =
+ M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty);
+
+ ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow",
+ ArrayType::get(IRB.getInt8Ty(), 0));
+
+ const std::string MemIntrinCallbackPrefix =
+ CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
+ HWAsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ HWAsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ HWAsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty(), IntptrTy);
+
+ HWAsanHandleVfork =
+ M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy);
+}
+
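+// The loop above composes the runtime callback names from the prefix, the
+// access kind, the access size (or "N" for the sized variant) and an
+// optional "_noabort" suffix in recover mode. A standalone sketch with the
+// default "__hwasan_" prefix; sketchCallbackName is illustrative and not
+// part of the pass.
+#include <string>
+
+inline std::string sketchCallbackName(bool IsWrite, unsigned AccessSizeIndex,
+                                      bool Recover) {
+  return std::string("__hwasan_") + (IsWrite ? "store" : "load") +
+         std::to_string(1u << AccessSizeIndex) + (Recover ? "_noabort" : "");
+}
+// sketchCallbackName(true, 2, false) == "__hwasan_store4"
+// sketchCallbackName(false, 4, true) == "__hwasan_load16_noabort"
+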
Value *HWAddressSanitizer::getOpaqueNoopCast(IRBuilder<> &IRB, Value *Val) {
- // An empty inline asm with input reg == output reg.
- // An opaque no-op cast, basically.
+ // An empty inline asm with input reg == output reg.
+ // An opaque no-op cast, basically.
// This prevents code bloat as a result of rematerializing trivial definitions
// such as constants or global addresses at every load and store.
InlineAsm *Asm =
@@ -599,128 +599,128 @@ Value *HWAddressSanitizer::getOpaqueNoopCast(IRBuilder<> &IRB, Value *Val) {
StringRef(""), StringRef("=r,0"),
/*hasSideEffects=*/false);
return IRB.CreateCall(Asm, {Val}, ".hwasan.shadow");
-}
-
+}
+
Value *HWAddressSanitizer::getDynamicShadowIfunc(IRBuilder<> &IRB) {
return getOpaqueNoopCast(IRB, ShadowGlobal);
}
Value *HWAddressSanitizer::getShadowNonTls(IRBuilder<> &IRB) {
- if (Mapping.Offset != kDynamicShadowSentinel)
+ if (Mapping.Offset != kDynamicShadowSentinel)
return getOpaqueNoopCast(
IRB, ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, Mapping.Offset), Int8PtrTy));
-
- if (Mapping.InGlobal) {
- return getDynamicShadowIfunc(IRB);
- } else {
- Value *GlobalDynamicAddress =
- IRB.GetInsertBlock()->getParent()->getParent()->getOrInsertGlobal(
- kHwasanShadowMemoryDynamicAddress, Int8PtrTy);
- return IRB.CreateLoad(Int8PtrTy, GlobalDynamicAddress);
- }
-}
-
-bool HWAddressSanitizer::ignoreAccess(Value *Ptr) {
- // Do not instrument accesses from different address spaces; we cannot deal
- // with them.
- Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return true;
-
- // Ignore swifterror addresses.
- // swifterror memory addresses are mem2reg promoted by instruction
- // selection. As such they cannot have regular uses like an instrumentation
- // function and it makes no sense to track them as memory.
- if (Ptr->isSwiftError())
- return true;
-
- return false;
-}
-
-void HWAddressSanitizer::getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
- // Skip memory accesses inserted by another instrumentation.
- if (I->hasMetadata("nosanitize"))
- return;
-
- // Do not instrument the load fetching the dynamic shadow address.
+
+ if (Mapping.InGlobal) {
+ return getDynamicShadowIfunc(IRB);
+ } else {
+ Value *GlobalDynamicAddress =
+ IRB.GetInsertBlock()->getParent()->getParent()->getOrInsertGlobal(
+ kHwasanShadowMemoryDynamicAddress, Int8PtrTy);
+ return IRB.CreateLoad(Int8PtrTy, GlobalDynamicAddress);
+ }
+}
+
+bool HWAddressSanitizer::ignoreAccess(Value *Ptr) {
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return true;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (Ptr->isSwiftError())
+ return true;
+
+ return false;
+}
+
+void HWAddressSanitizer::getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
+ // Skip memory accesses inserted by another instrumentation.
+ if (I->hasMetadata("nosanitize"))
+ return;
+
+ // Do not instrument the load fetching the dynamic shadow address.
if (ShadowBase == I)
- return;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
- LI->getType(), LI->getAlign());
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
- SI->getValueOperand()->getType(), SI->getAlign());
- } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
- return;
- Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
- RMW->getValOperand()->getType(), None);
- } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
- return;
- Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
- XCHG->getCompareOperand()->getType(), None);
- } else if (auto CI = dyn_cast<CallInst>(I)) {
- for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
- if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
- ignoreAccess(CI->getArgOperand(ArgNo)))
- continue;
- Type *Ty = CI->getParamByValType(ArgNo);
- Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
- }
- }
-}
-
-static unsigned getPointerOperandIndex(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperandIndex();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperandIndex();
- if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I))
- return RMW->getPointerOperandIndex();
- if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I))
- return XCHG->getPointerOperandIndex();
- report_fatal_error("Unexpected instruction");
- return -1;
-}
-
-static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
- size_t Res = countTrailingZeros(TypeSize / 8);
- assert(Res < kNumberOfAccessSizes);
- return Res;
-}
-
-void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) {
- if (TargetTriple.isAArch64())
- return;
-
- IRBuilder<> IRB(I);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- Value *UntaggedPtr =
- IRB.CreateIntToPtr(untagPointer(IRB, AddrLong), Addr->getType());
- I->setOperand(getPointerOperandIndex(I), UntaggedPtr);
-}
-
-Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) {
- // Mem >> Scale
- Value *Shadow = IRB.CreateLShr(Mem, Mapping.Scale);
- if (Mapping.Offset == 0)
- return IRB.CreateIntToPtr(Shadow, Int8PtrTy);
- // (Mem >> Scale) + Offset
+ return;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
+ LI->getType(), LI->getAlign());
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
+ SI->getValueOperand()->getType(), SI->getAlign());
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
+ RMW->getValOperand()->getType(), None);
+ } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+ XCHG->getCompareOperand()->getType(), None);
+ } else if (auto CI = dyn_cast<CallInst>(I)) {
+ for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
+ ignoreAccess(CI->getArgOperand(ArgNo)))
+ continue;
+ Type *Ty = CI->getParamByValType(ArgNo);
+ Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+ }
+ }
+}
+
+static unsigned getPointerOperandIndex(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperandIndex();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperandIndex();
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I))
+ return RMW->getPointerOperandIndex();
+ if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I))
+ return XCHG->getPointerOperandIndex();
+ report_fatal_error("Unexpected instruction");
+ return -1;
+}
+
+static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
+ size_t Res = countTrailingZeros(TypeSize / 8);
+ assert(Res < kNumberOfAccessSizes);
+ return Res;
+}
+
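+// TypeSizeToSizeIndex() above maps the supported access sizes of 1, 2, 4, 8
+// and 16 bytes onto callback indices 0..4 by counting trailing zero bits. A
+// standalone, recursion-based sketch of the same mapping for power-of-two
+// sizes from 8 to 128 bits (illustrative only):
+#include <cstdint>
+
+constexpr unsigned sketchSizeIndex(uint32_t TypeSizeInBits) {
+  return TypeSizeInBits == 8 ? 0 : 1 + sketchSizeIndex(TypeSizeInBits / 2);
+}
+static_assert(sketchSizeIndex(8) == 0 && sketchSizeIndex(32) == 2 &&
+                  sketchSizeIndex(128) == 4,
+              "i8 uses __hwasan_{load,store}1, i128 uses ...16");
+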
+void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) {
+ if (TargetTriple.isAArch64())
+ return;
+
+ IRBuilder<> IRB(I);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *UntaggedPtr =
+ IRB.CreateIntToPtr(untagPointer(IRB, AddrLong), Addr->getType());
+ I->setOperand(getPointerOperandIndex(I), UntaggedPtr);
+}
+
+Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) {
+ // Mem >> Scale
+ Value *Shadow = IRB.CreateLShr(Mem, Mapping.Scale);
+ if (Mapping.Offset == 0)
+ return IRB.CreateIntToPtr(Shadow, Int8PtrTy);
+ // (Mem >> Scale) + Offset
return IRB.CreateGEP(Int8Ty, ShadowBase, Shadow);
-}
-
-void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
- unsigned AccessSizeIndex,
- Instruction *InsertBefore) {
+}
+
+void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
+ unsigned AccessSizeIndex,
+ Instruction *InsertBefore) {
const int64_t AccessInfo =
(CompileKernel << HWASanAccessInfo::CompileKernelShift) +
(HasMatchAllTag << HWASanAccessInfo::HasMatchAllShift) +
@@ -728,809 +728,809 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
(Recover << HWASanAccessInfo::RecoverShift) +
(IsWrite << HWASanAccessInfo::IsWriteShift) +
(AccessSizeIndex << HWASanAccessInfo::AccessSizeShift);
- IRBuilder<> IRB(InsertBefore);
-
+ IRBuilder<> IRB(InsertBefore);
+
if (OutlinedChecks) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy);
- IRB.CreateCall(Intrinsic::getDeclaration(
- M, UseShortGranules
- ? Intrinsic::hwasan_check_memaccess_shortgranules
- : Intrinsic::hwasan_check_memaccess),
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy);
+ IRB.CreateCall(Intrinsic::getDeclaration(
+ M, UseShortGranules
+ ? Intrinsic::hwasan_check_memaccess_shortgranules
+ : Intrinsic::hwasan_check_memaccess),
{ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
- return;
- }
-
- Value *PtrLong = IRB.CreatePointerCast(Ptr, IntptrTy);
- Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift),
- IRB.getInt8Ty());
- Value *AddrLong = untagPointer(IRB, PtrLong);
- Value *Shadow = memToShadow(AddrLong, IRB);
- Value *MemTag = IRB.CreateLoad(Int8Ty, Shadow);
- Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
-
+ return;
+ }
+
+ Value *PtrLong = IRB.CreatePointerCast(Ptr, IntptrTy);
+ Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift),
+ IRB.getInt8Ty());
+ Value *AddrLong = untagPointer(IRB, PtrLong);
+ Value *Shadow = memToShadow(AddrLong, IRB);
+ Value *MemTag = IRB.CreateLoad(Int8Ty, Shadow);
+ Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
+
if (HasMatchAllTag) {
Value *TagNotIgnored = IRB.CreateICmpNE(
PtrTag, ConstantInt::get(PtrTag->getType(), MatchAllTag));
- TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
- }
-
- Instruction *CheckTerm =
- SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
- MDBuilder(*C).createBranchWeights(1, 100000));
-
- IRB.SetInsertPoint(CheckTerm);
- Value *OutOfShortGranuleTagRange =
- IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15));
- Instruction *CheckFailTerm =
- SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover,
- MDBuilder(*C).createBranchWeights(1, 100000));
-
- IRB.SetInsertPoint(CheckTerm);
- Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty);
- PtrLowBits = IRB.CreateAdd(
- PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1));
- Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag);
- SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false,
- MDBuilder(*C).createBranchWeights(1, 100000),
+ TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
+ }
+
+ Instruction *CheckTerm =
+ SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
+ MDBuilder(*C).createBranchWeights(1, 100000));
+
+ IRB.SetInsertPoint(CheckTerm);
+ Value *OutOfShortGranuleTagRange =
+ IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15));
+ Instruction *CheckFailTerm =
+ SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover,
+ MDBuilder(*C).createBranchWeights(1, 100000));
+
+ IRB.SetInsertPoint(CheckTerm);
+ Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty);
+ PtrLowBits = IRB.CreateAdd(
+ PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1));
+ Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag);
+ SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false,
+ MDBuilder(*C).createBranchWeights(1, 100000),
(DomTreeUpdater *)nullptr, nullptr,
CheckFailTerm->getParent());
-
- IRB.SetInsertPoint(CheckTerm);
- Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15);
- InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy);
- Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr);
- Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag);
- SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false,
- MDBuilder(*C).createBranchWeights(1, 100000),
+
+ IRB.SetInsertPoint(CheckTerm);
+ Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15);
+ InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy);
+ Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr);
+ Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag);
+ SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false,
+ MDBuilder(*C).createBranchWeights(1, 100000),
(DomTreeUpdater *)nullptr, nullptr,
CheckFailTerm->getParent());
-
- IRB.SetInsertPoint(CheckFailTerm);
- InlineAsm *Asm;
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- // The signal handler will find the data address in rdi.
- Asm = InlineAsm::get(
- FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+
+ IRB.SetInsertPoint(CheckFailTerm);
+ InlineAsm *Asm;
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ // The signal handler will find the data address in rdi.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
"int3\nnopl " +
itostr(0x40 + (AccessInfo & HWASanAccessInfo::RuntimeMask)) +
"(%rax)",
- "{rdi}",
- /*hasSideEffects=*/true);
- break;
- case Triple::aarch64:
- case Triple::aarch64_be:
- // The signal handler will find the data address in x0.
- Asm = InlineAsm::get(
- FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+ "{rdi}",
+ /*hasSideEffects=*/true);
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ // The signal handler will find the data address in x0.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
"brk #" +
itostr(0x900 + (AccessInfo & HWASanAccessInfo::RuntimeMask)),
- "{x0}",
- /*hasSideEffects=*/true);
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- IRB.CreateCall(Asm, PtrLong);
- if (Recover)
- cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
-}
-
-void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
- IRBuilder<> IRB(MI);
- if (isa<MemTransferInst>(MI)) {
- IRB.CreateCall(
- isa<MemMoveInst>(MI) ? HWAsanMemmove : HWAsanMemcpy,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- } else if (isa<MemSetInst>(MI)) {
- IRB.CreateCall(
- HWAsanMemset,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- }
- MI->eraseFromParent();
-}
-
-bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) {
- Value *Addr = O.getPtr();
-
- LLVM_DEBUG(dbgs() << "Instrumenting: " << O.getInsn() << "\n");
-
- if (O.MaybeMask)
- return false; //FIXME
-
- IRBuilder<> IRB(O.getInsn());
- if (isPowerOf2_64(O.TypeSize) &&
- (O.TypeSize / 8 <= (1ULL << (kNumberOfAccessSizes - 1))) &&
- (!O.Alignment || *O.Alignment >= (1ULL << Mapping.Scale) ||
- *O.Alignment >= O.TypeSize / 8)) {
- size_t AccessSizeIndex = TypeSizeToSizeIndex(O.TypeSize);
- if (ClInstrumentWithCalls) {
- IRB.CreateCall(HwasanMemoryAccessCallback[O.IsWrite][AccessSizeIndex],
- IRB.CreatePointerCast(Addr, IntptrTy));
- } else {
- instrumentMemAccessInline(Addr, O.IsWrite, AccessSizeIndex, O.getInsn());
- }
- } else {
- IRB.CreateCall(HwasanMemoryAccessCallbackSized[O.IsWrite],
- {IRB.CreatePointerCast(Addr, IntptrTy),
- ConstantInt::get(IntptrTy, O.TypeSize / 8)});
- }
- untagPointerOperand(O.getInsn(), Addr);
-
- return true;
-}
-
-static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
- uint64_t ArraySize = 1;
- if (AI.isArrayAllocation()) {
- const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
- assert(CI && "non-constant array size");
- ArraySize = CI->getZExtValue();
- }
- Type *Ty = AI.getAllocatedType();
- uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
- return SizeInBytes * ArraySize;
-}
-
-bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
- Value *Tag, size_t Size) {
- size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- if (!UseShortGranules)
- Size = AlignedSize;
-
- Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
- if (ClInstrumentWithCalls) {
- IRB.CreateCall(HwasanTagMemoryFunc,
- {IRB.CreatePointerCast(AI, Int8PtrTy), JustTag,
- ConstantInt::get(IntptrTy, AlignedSize)});
- } else {
- size_t ShadowSize = Size >> Mapping.Scale;
- Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB);
- // If this memset is not inlined, it will be intercepted in the hwasan
- // runtime library. That's OK, because the interceptor skips the checks if
- // the address is in the shadow region.
- // FIXME: the interceptor is not as fast as real memset. Consider lowering
- // llvm.memset right here into either a sequence of stores, or a call to
- // hwasan_tag_memory.
- if (ShadowSize)
- IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align(1));
- if (Size != AlignedSize) {
- IRB.CreateStore(
- ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()),
- IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize));
- IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32(
- Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy),
- AlignedSize - 1));
- }
- }
- return true;
-}
-
-static unsigned RetagMask(unsigned AllocaNo) {
- // A list of 8-bit numbers that have at most one run of non-zero bits.
- // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for these
- // masks.
- // The list does not include the value 255, which is used for UAR.
- //
- // Because we are more likely to use earlier elements of this list than later
- // ones, it is sorted in increasing order of probability of collision with a
- // mask allocated (temporally) nearby. The program that generated this list
- // can be found at:
- // https://github.com/google/sanitizers/blob/master/hwaddress-sanitizer/sort_masks.py
- static unsigned FastMasks[] = {0, 128, 64, 192, 32, 96, 224, 112, 240,
- 48, 16, 120, 248, 56, 24, 8, 124, 252,
- 60, 28, 12, 4, 126, 254, 62, 30, 14,
- 6, 2, 127, 63, 31, 15, 7, 3, 1};
- return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))];
-}
-
-Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) {
- return IRB.CreateZExt(IRB.CreateCall(HwasanGenerateTagFunc), IntptrTy);
-}
-
-Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
- if (ClGenerateTagsWithCalls)
- return getNextTagWithCall(IRB);
- if (StackBaseTag)
- return StackBaseTag;
- // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
- // first).
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- auto GetStackPointerFn = Intrinsic::getDeclaration(
- M, Intrinsic::frameaddress,
- IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
- Value *StackPointer = IRB.CreateCall(
- GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())});
-
- // Extract some entropy from the stack pointer for the tags.
- // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
- // between functions).
- Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy);
- Value *StackTag =
- IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20),
- "hwasan.stack.base.tag");
- return StackTag;
-}
-
-Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
- AllocaInst *AI, unsigned AllocaNo) {
- if (ClGenerateTagsWithCalls)
- return getNextTagWithCall(IRB);
- return IRB.CreateXor(StackTag,
- ConstantInt::get(IntptrTy, RetagMask(AllocaNo)));
-}
-
-Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) {
- if (ClUARRetagToZero)
- return ConstantInt::get(IntptrTy, 0);
- if (ClGenerateTagsWithCalls)
- return getNextTagWithCall(IRB);
- return IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
-}
-
-// Add a tag to an address.
-Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty,
- Value *PtrLong, Value *Tag) {
- Value *TaggedPtrLong;
- if (CompileKernel) {
- // Kernel addresses have 0xFF in the most significant byte.
- Value *ShiftedTag = IRB.CreateOr(
- IRB.CreateShl(Tag, kPointerTagShift),
- ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1));
- TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag);
- } else {
- // Userspace can simply do OR (tag << 56);
- Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift);
- TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag);
- }
- return IRB.CreateIntToPtr(TaggedPtrLong, Ty);
-}
-
-// Remove tag from an address.
-Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
- Value *UntaggedPtrLong;
- if (CompileKernel) {
- // Kernel addresses have 0xFF in the most significant byte.
- UntaggedPtrLong = IRB.CreateOr(PtrLong,
- ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift));
- } else {
- // Userspace addresses have 0x00.
- UntaggedPtrLong = IRB.CreateAnd(PtrLong,
- ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift)));
- }
- return UntaggedPtrLong;
-}
-
-Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- if (TargetTriple.isAArch64() && TargetTriple.isAndroid()) {
- // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER
- // in Bionic's libc/private/bionic_tls.h.
- Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
- Value *SlotPtr = IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.getInt8Ty(),
- IRB.CreateCall(ThreadPointerFunc), 0x30),
- Ty->getPointerTo(0));
- return SlotPtr;
- }
- if (ThreadPtrGlobal)
- return ThreadPtrGlobal;
-
- return nullptr;
-}
-
-void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
- if (!Mapping.InTls) {
+ "{x0}",
+ /*hasSideEffects=*/true);
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ IRB.CreateCall(Asm, PtrLong);
+ if (Recover)
+ cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
+}
+
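+// When the shadow (memory) tag is in the range 1..15, the check above treats
+// the granule as a "short granule": only the first MemTag bytes of the
+// 16-byte granule are addressable, and the real pointer tag is stored in the
+// granule's last byte. A standalone sketch of that decision on plain
+// integers (names and sample values are illustrative, not part of the pass):
+#include <cstdint>
+
+constexpr bool sketchShortGranuleAccessOK(uint64_t Ptr, uint64_t AccessSize,
+                                          uint8_t MemTag /* 1..15 */,
+                                          uint8_t PtrTag, uint8_t InlineTag) {
+  // Mirrors the PtrLowBitsOOB and InlineTagMismatch branches above.
+  return (Ptr & 15) + (AccessSize - 1) < MemTag && PtrTag == InlineTag;
+}
+// A 4-byte access at offset 2 of a granule with 8 addressable bytes is fine;
+// the same access at offset 6 runs past the addressable part.
+static_assert(sketchShortGranuleAccessOK(0x1002, 4, 8, 0x2A, 0x2A), "");
+static_assert(!sketchShortGranuleAccessOK(0x1006, 4, 8, 0x2A, 0x2A), "");
+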
+void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> IRB(MI);
+ if (isa<MemTransferInst>(MI)) {
+ IRB.CreateCall(
+ isa<MemMoveInst>(MI) ? HWAsanMemmove : HWAsanMemcpy,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ } else if (isa<MemSetInst>(MI)) {
+ IRB.CreateCall(
+ HWAsanMemset,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ }
+ MI->eraseFromParent();
+}
+
+bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) {
+ Value *Addr = O.getPtr();
+
+ LLVM_DEBUG(dbgs() << "Instrumenting: " << O.getInsn() << "\n");
+
+ if (O.MaybeMask)
+ return false; //FIXME
+
+ IRBuilder<> IRB(O.getInsn());
+ if (isPowerOf2_64(O.TypeSize) &&
+ (O.TypeSize / 8 <= (1ULL << (kNumberOfAccessSizes - 1))) &&
+ (!O.Alignment || *O.Alignment >= (1ULL << Mapping.Scale) ||
+ *O.Alignment >= O.TypeSize / 8)) {
+ size_t AccessSizeIndex = TypeSizeToSizeIndex(O.TypeSize);
+ if (ClInstrumentWithCalls) {
+ IRB.CreateCall(HwasanMemoryAccessCallback[O.IsWrite][AccessSizeIndex],
+ IRB.CreatePointerCast(Addr, IntptrTy));
+ } else {
+ instrumentMemAccessInline(Addr, O.IsWrite, AccessSizeIndex, O.getInsn());
+ }
+ } else {
+ IRB.CreateCall(HwasanMemoryAccessCallbackSized[O.IsWrite],
+ {IRB.CreatePointerCast(Addr, IntptrTy),
+ ConstantInt::get(IntptrTy, O.TypeSize / 8)});
+ }
+ untagPointerOperand(O.getInsn(), Addr);
+
+ return true;
+}
+
+static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
+ uint64_t ArraySize = 1;
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
+ Type *Ty = AI.getAllocatedType();
+ uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes * ArraySize;
+}
+
+bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
+ Value *Tag, size_t Size) {
+ size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ if (!UseShortGranules)
+ Size = AlignedSize;
+
+ Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
+ if (ClInstrumentWithCalls) {
+ IRB.CreateCall(HwasanTagMemoryFunc,
+ {IRB.CreatePointerCast(AI, Int8PtrTy), JustTag,
+ ConstantInt::get(IntptrTy, AlignedSize)});
+ } else {
+ size_t ShadowSize = Size >> Mapping.Scale;
+ Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB);
+ // If this memset is not inlined, it will be intercepted in the hwasan
+ // runtime library. That's OK, because the interceptor skips the checks if
+ // the address is in the shadow region.
+ // FIXME: the interceptor is not as fast as real memset. Consider lowering
+ // llvm.memset right here into either a sequence of stores, or a call to
+ // hwasan_tag_memory.
+ if (ShadowSize)
+ IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align(1));
+ if (Size != AlignedSize) {
+ IRB.CreateStore(
+ ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()),
+ IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize));
+ IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32(
+ Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy),
+ AlignedSize - 1));
+ }
+ }
+ return true;
+}
+
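+// For an alloca whose size is not a granule multiple, tagAlloca() above
+// writes the tag into the shadow of every full granule, stores the number of
+// addressable bytes into the shadow byte of the trailing short granule, and
+// puts the real tag into the last byte of that (padded) granule. Sketched
+// for a 20-byte alloca with 16-byte granules; values are illustrative:
+//
+//   Shadow[0] = Tag          // first granule is fully addressable
+//   Shadow[1] = 20 % 16 = 4  // short granule: 4 addressable bytes
+//   byte 31 of the padded allocation = Tag
+#include <cstdint>
+
+constexpr uint8_t sketchShortGranuleShadowByte(uint64_t Size) {
+  return Size % 16; // only written when Size is not a multiple of 16
+}
+static_assert(sketchShortGranuleShadowByte(20) == 4, "");
+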
+static unsigned RetagMask(unsigned AllocaNo) {
+ // A list of 8-bit numbers that have at most one run of non-zero bits.
+ // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for these
+ // masks.
+ // The list does not include the value 255, which is used for UAR.
+ //
+ // Because we are more likely to use earlier elements of this list than later
+ // ones, it is sorted in increasing order of probability of collision with a
+ // mask allocated (temporally) nearby. The program that generated this list
+ // can be found at:
+ // https://github.com/google/sanitizers/blob/master/hwaddress-sanitizer/sort_masks.py
+ static unsigned FastMasks[] = {0, 128, 64, 192, 32, 96, 224, 112, 240,
+ 48, 16, 120, 248, 56, 24, 8, 124, 252,
+ 60, 28, 12, 4, 126, 254, 62, 30, 14,
+ 6, 2, 127, 63, 31, 15, 7, 3, 1};
+ return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))];
+}
+
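+// A standalone check (illustrative only) that every FastMasks entry above
+// really has at most one contiguous run of set bits, which is what lets
+// x ^ (mask << 56) be encoded as a single armv8 instruction:
+#include <cstdint>
+
+constexpr bool sketchHasAtMostOneRun(uint8_t M) {
+  // For a single run, M | (M - 1) is a low-bit mask, so adding 1 clears it.
+  return M == 0 ||
+         (((uint32_t)M | (M - 1)) & (((uint32_t)M | (M - 1)) + 1)) == 0;
+}
+static_assert(sketchHasAtMostOneRun(0) && sketchHasAtMostOneRun(192) &&
+                  sketchHasAtMostOneRun(126) && sketchHasAtMostOneRun(1),
+              "");
+static_assert(!sketchHasAtMostOneRun(0x41), "0b01000001 has two runs");
+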
+Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) {
+ return IRB.CreateZExt(IRB.CreateCall(HwasanGenerateTagFunc), IntptrTy);
+}
+
+Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ if (StackBaseTag)
+ return StackBaseTag;
+ // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
+ // first).
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ auto GetStackPointerFn = Intrinsic::getDeclaration(
+ M, Intrinsic::frameaddress,
+ IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
+ Value *StackPointer = IRB.CreateCall(
+ GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())});
+
+ // Extract some entropy from the stack pointer for the tags.
+ // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
+ // between functions).
+ Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy);
+ Value *StackTag =
+ IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20),
+ "hwasan.stack.base.tag");
+ return StackTag;
+}
+
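+// The XOR above mixes ASLR entropy (bits 20..28 of the frame address) into
+// the low bits that later become the 8-bit base tag. A standalone sketch on
+// plain integers; the frame addresses below are made up for illustration:
+#include <cstdint>
+
+constexpr uint64_t sketchStackBaseTag(uint64_t FrameAddr) {
+  return FrameAddr ^ (FrameAddr >> 20);
+}
+// Two frames that differ only in bits 20 and above still get distinct tags
+// once truncated to 8 bits.
+static_assert((sketchStackBaseTag(0x7ff012300040ULL) & 0xFF) !=
+                  (sketchStackBaseTag(0x7ff098700040ULL) & 0xFF),
+              "");
+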
+Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
+ AllocaInst *AI, unsigned AllocaNo) {
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag,
+ ConstantInt::get(IntptrTy, RetagMask(AllocaNo)));
+}
+
+Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) {
+ if (ClUARRetagToZero)
+ return ConstantInt::get(IntptrTy, 0);
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
+}
+
+// Add a tag to an address.
+Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty,
+ Value *PtrLong, Value *Tag) {
+ Value *TaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ Value *ShiftedTag = IRB.CreateOr(
+ IRB.CreateShl(Tag, kPointerTagShift),
+ ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1));
+ TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag);
+ } else {
+ // Userspace can simply do OR (tag << 56);
+ Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift);
+ TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag);
+ }
+ return IRB.CreateIntToPtr(TaggedPtrLong, Ty);
+}
+
+// Remove tag from an address.
+Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
+ Value *UntaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ UntaggedPtrLong = IRB.CreateOr(PtrLong,
+ ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift));
+ } else {
+ // Userspace addresses have 0x00.
+ UntaggedPtrLong = IRB.CreateAnd(PtrLong,
+ ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift)));
+ }
+ return UntaggedPtrLong;
+}
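// Editorial aside (not part of this patch): tagPointer/untagPointer reduced to
// plain integer arithmetic, with kPointerTagShift == 56 and hypothetical
// addresses, to show that untagging recovers the original pointer in both the
// userspace (top byte 0x00) and kernel (top byte 0xFF) conventions.
#include <cassert>
#include <cstdint>

constexpr unsigned kTagShift = 56;

static uint64_t tagUser(uint64_t Ptr, uint8_t Tag) {
  return Ptr | (uint64_t(Tag) << kTagShift);
}
static uint64_t tagKernel(uint64_t Ptr, uint8_t Tag) {
  return Ptr & ((uint64_t(Tag) << kTagShift) | ((1ULL << kTagShift) - 1));
}
static uint64_t untagUser(uint64_t Ptr) { return Ptr & ~(0xFFULL << kTagShift); }
static uint64_t untagKernel(uint64_t Ptr) { return Ptr | (0xFFULL << kTagShift); }

int main() {
  const uint64_t UserPtr = 0x00007ffd12345df0ULL;
  const uint64_t KernelPtr = 0xffff8000deadbee0ULL;
  assert(untagUser(tagUser(UserPtr, 0x2d)) == UserPtr);
  assert(untagKernel(tagKernel(KernelPtr, 0x2d)) == KernelPtr);
  return 0;
}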
+
+Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ if (TargetTriple.isAArch64() && TargetTriple.isAndroid()) {
+ // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER
+ // in Bionic's libc/private/bionic_tls.h.
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+ Value *SlotPtr = IRB.CreatePointerCast(
+ IRB.CreateConstGEP1_32(IRB.getInt8Ty(),
+ IRB.CreateCall(ThreadPointerFunc), 0x30),
+ Ty->getPointerTo(0));
+ return SlotPtr;
+ }
+ if (ThreadPtrGlobal)
+ return ThreadPtrGlobal;
+

+ return nullptr;
+}
+
+void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
+ if (!Mapping.InTls) {
ShadowBase = getShadowNonTls(IRB);
- return;
- }
-
- if (!WithFrameRecord && TargetTriple.isAndroid()) {
+ return;
+ }
+
+ if (!WithFrameRecord && TargetTriple.isAndroid()) {
ShadowBase = getDynamicShadowIfunc(IRB);
- return;
- }
-
- Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy);
- assert(SlotPtr);
-
- Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
- // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI.
- Value *ThreadLongMaybeUntagged =
- TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong);
-
- if (WithFrameRecord) {
- Function *F = IRB.GetInsertBlock()->getParent();
- StackBaseTag = IRB.CreateAShr(ThreadLong, 3);
-
- // Prepare ring buffer data.
- Value *PC;
- if (TargetTriple.getArch() == Triple::aarch64)
- PC = readRegister(IRB, "pc");
- else
- PC = IRB.CreatePtrToInt(F, IntptrTy);
- Module *M = F->getParent();
- auto GetStackPointerFn = Intrinsic::getDeclaration(
- M, Intrinsic::frameaddress,
- IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
- Value *SP = IRB.CreatePtrToInt(
- IRB.CreateCall(GetStackPointerFn,
- {Constant::getNullValue(IRB.getInt32Ty())}),
- IntptrTy);
- // Mix SP and PC.
- // Assumptions:
- // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero)
- // SP is 0xsssssssssssSSSS0 (4 lower bits are zero)
- // We only really need ~20 lower non-zero bits (SSSS), so we mix like this:
- // 0xSSSSPPPPPPPPPPPP
- SP = IRB.CreateShl(SP, 44);
-
- // Store data to ring buffer.
- Value *RecordPtr =
- IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0));
- IRB.CreateStore(IRB.CreateOr(PC, SP), RecordPtr);
-
- // Update the ring buffer. Top byte of ThreadLong defines the size of the
- // buffer in pages, it must be a power of two, and the start of the buffer
- // must be aligned by twice that much. Therefore wrap around of the ring
- // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
- // The use of AShr instead of LShr is due to
- // https://bugs.llvm.org/show_bug.cgi?id=39030
- // Runtime library makes sure not to use the highest bit.
- Value *WrapMask = IRB.CreateXor(
- IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true),
- ConstantInt::get(IntptrTy, (uint64_t)-1));
- Value *ThreadLongNew = IRB.CreateAnd(
- IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask);
- IRB.CreateStore(ThreadLongNew, SlotPtr);
- }
-
- // Get shadow base address by aligning RecordPtr up.
- // Note: this is not correct if the pointer is already aligned.
- // Runtime library will make sure this never happens.
+ return;
+ }
+
+ Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy);
+ assert(SlotPtr);
+
+ Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
+ // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI.
+ Value *ThreadLongMaybeUntagged =
+ TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong);
+
+ if (WithFrameRecord) {
+ Function *F = IRB.GetInsertBlock()->getParent();
+ StackBaseTag = IRB.CreateAShr(ThreadLong, 3);
+
+ // Prepare ring buffer data.
+ Value *PC;
+ if (TargetTriple.getArch() == Triple::aarch64)
+ PC = readRegister(IRB, "pc");
+ else
+ PC = IRB.CreatePtrToInt(F, IntptrTy);
+ Module *M = F->getParent();
+ auto GetStackPointerFn = Intrinsic::getDeclaration(
+ M, Intrinsic::frameaddress,
+ IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
+ Value *SP = IRB.CreatePtrToInt(
+ IRB.CreateCall(GetStackPointerFn,
+ {Constant::getNullValue(IRB.getInt32Ty())}),
+ IntptrTy);
+ // Mix SP and PC.
+ // Assumptions:
+ // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero)
+ // SP is 0xsssssssssssSSSS0 (4 lower bits are zero)
+ // We only really need ~20 lower non-zero bits (SSSS), so we mix like this:
+ // 0xSSSSPPPPPPPPPPPP
+ SP = IRB.CreateShl(SP, 44);
+
+ // Store data to ring buffer.
+ Value *RecordPtr =
+ IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0));
+ IRB.CreateStore(IRB.CreateOr(PC, SP), RecordPtr);
+
+ // Update the ring buffer. Top byte of ThreadLong defines the size of the
+ // buffer in pages, it must be a power of two, and the start of the buffer
+ // must be aligned by twice that much. Therefore wrap around of the ring
+ // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
+ // The use of AShr instead of LShr is due to
+ // https://bugs.llvm.org/show_bug.cgi?id=39030
+ // Runtime library makes sure not to use the highest bit.
+ Value *WrapMask = IRB.CreateXor(
+ IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true),
+ ConstantInt::get(IntptrTy, (uint64_t)-1));
+ Value *ThreadLongNew = IRB.CreateAnd(
+ IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask);
+ IRB.CreateStore(ThreadLongNew, SlotPtr);
+ }
+
+ // Get shadow base address by aligning RecordPtr up.
+ // Note: this is not correct if the pointer is already aligned.
+ // Runtime library will make sure this never happens.
ShadowBase = IRB.CreateAdd(
- IRB.CreateOr(
- ThreadLongMaybeUntagged,
- ConstantInt::get(IntptrTy, (1ULL << kShadowBaseAlignment) - 1)),
- ConstantInt::get(IntptrTy, 1), "hwasan.shadow");
+ IRB.CreateOr(
+ ThreadLongMaybeUntagged,
+ ConstantInt::get(IntptrTy, (1ULL << kShadowBaseAlignment) - 1)),
+ ConstantInt::get(IntptrTy, 1), "hwasan.shadow");
ShadowBase = IRB.CreateIntToPtr(ShadowBase, Int8PtrTy);
-}
-
-Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Function *ReadRegister =
- Intrinsic::getDeclaration(M, Intrinsic::read_register, IntptrTy);
- MDNode *MD = MDNode::get(*C, {MDString::get(*C, Name)});
- Value *Args[] = {MetadataAsValue::get(*C, MD)};
- return IRB.CreateCall(ReadRegister, Args);
-}
-
-bool HWAddressSanitizer::instrumentLandingPads(
- SmallVectorImpl<Instruction *> &LandingPadVec) {
- for (auto *LP : LandingPadVec) {
- IRBuilder<> IRB(LP->getNextNode());
- IRB.CreateCall(
- HWAsanHandleVfork,
- {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp"
- : "sp")});
- }
- return true;
-}
-
-bool HWAddressSanitizer::instrumentStack(
- SmallVectorImpl<AllocaInst *> &Allocas,
- DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
- SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) {
- // Ideally, we want to calculate tagged stack base pointer, and rewrite all
- // alloca addresses using that. Unfortunately, offsets are not known yet
- // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
- // temp, shift-OR it into each alloca address and xor with the retag mask.
- // This generates one extra instruction per alloca use.
- for (unsigned N = 0; N < Allocas.size(); ++N) {
- auto *AI = Allocas[N];
- IRBuilder<> IRB(AI->getNextNode());
-
- // Replace uses of the alloca with tagged address.
- Value *Tag = getAllocaTag(IRB, StackTag, AI, N);
- Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
- Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag);
- std::string Name =
- AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
- Replacement->setName(Name + ".hwasan");
-
- AI->replaceUsesWithIf(Replacement,
- [AILong](Use &U) { return U.getUser() != AILong; });
-
- for (auto *DDI : AllocaDbgMap.lookup(AI)) {
- // Prepend "tag_offset, N" to the dwarf expression.
- // Tag offset logically applies to the alloca pointer, and it makes sense
- // to put it at the beginning of the expression.
- SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset,
- RetagMask(N)};
- DDI->setArgOperand(
- 2, MetadataAsValue::get(*C, DIExpression::prependOpcodes(
- DDI->getExpression(), NewOps)));
- }
-
- size_t Size = getAllocaSizeInBytes(*AI);
- tagAlloca(IRB, AI, Tag, Size);
-
- for (auto RI : RetVec) {
- IRB.SetInsertPoint(RI);
-
- // Re-tag alloca memory with the special UAR tag.
- Value *Tag = getUARTag(IRB, StackTag);
- tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment()));
- }
- }
-
- return true;
-}
-
-bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
- return (AI.getAllocatedType()->isSized() &&
- // FIXME: instrument dynamic allocas, too
- AI.isStaticAlloca() &&
- // alloca() may be called with 0 size, ignore it.
- getAllocaSizeInBytes(AI) > 0 &&
- // We are only interested in allocas not promotable to registers.
- // Promotable allocas are common under -O0.
- !isAllocaPromotable(&AI) &&
- // inalloca allocas are not treated as static, and we don't want
- // dynamic alloca instrumentation for them as well.
- !AI.isUsedWithInAlloca() &&
- // swifterror allocas are register promoted by ISel
- !AI.isSwiftError());
-}
-
-bool HWAddressSanitizer::sanitizeFunction(Function &F) {
- if (&F == HwasanCtorFunction)
- return false;
-
- if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
- return false;
-
- LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
-
- SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
- SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
- SmallVector<AllocaInst*, 8> AllocasToInstrument;
- SmallVector<Instruction*, 8> RetVec;
- SmallVector<Instruction*, 8> LandingPadVec;
- DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap;
- for (auto &BB : F) {
- for (auto &Inst : BB) {
- if (ClInstrumentStack)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
- if (isInterestingAlloca(*AI))
- AllocasToInstrument.push_back(AI);
- continue;
- }
-
- if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
- isa<CleanupReturnInst>(Inst))
- RetVec.push_back(&Inst);
-
- if (auto *DDI = dyn_cast<DbgVariableIntrinsic>(&Inst))
- if (auto *Alloca =
- dyn_cast_or_null<AllocaInst>(DDI->getVariableLocation()))
- AllocaDbgMap[Alloca].push_back(DDI);
-
- if (InstrumentLandingPads && isa<LandingPadInst>(Inst))
- LandingPadVec.push_back(&Inst);
-
- getInterestingMemoryOperands(&Inst, OperandsToInstrument);
-
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
- IntrinToInstrument.push_back(MI);
- }
- }
-
- initializeCallbacks(*F.getParent());
-
- bool Changed = false;
-
- if (!LandingPadVec.empty())
- Changed |= instrumentLandingPads(LandingPadVec);
-
- if (AllocasToInstrument.empty() && F.hasPersonalityFn() &&
- F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) {
- // __hwasan_personality_thunk is a no-op for functions without an
- // instrumented stack, so we can drop it.
- F.setPersonalityFn(nullptr);
- Changed = true;
- }
-
- if (AllocasToInstrument.empty() && OperandsToInstrument.empty() &&
- IntrinToInstrument.empty())
- return Changed;
-
+}
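// Editorial aside (not part of this patch): the frame-record arithmetic from
// emitPrologue on plain integers. All addresses here are hypothetical; the
// runtime owns the real ring buffer and its alignment guarantees.
#include <cassert>
#include <cstdint>

int main() {
  // Record layout 0xSSSSPPPPPPPPPPPP: PC keeps its 48 meaningful bits, the
  // interesting low bits of SP land in the top 16 bits (SP's low 4 bits are 0).
  uint64_t PC = 0x0000aaaabbbbccccULL;
  uint64_t SP = 0x00007ffd1234bee0ULL;
  uint64_t Record = PC | (SP << 44);
  assert((Record & 0x0000ffffffffffffULL) == PC);

  // Ring-buffer advance: the top byte of ThreadLong is the buffer size in
  // pages; the buffer start is aligned to twice that size, so wrap-around is a
  // single mask. Here a 4-page buffer starts at ...8000, the slot at ...bff8 is
  // the last one, and the next store wraps back to the start.
  uint64_t ThreadLong = (4ULL << 56) | 0x000070000000bff8ULL;
  uint64_t WrapMask = ~((ThreadLong >> 56) << 12);
  uint64_t Next = (ThreadLong + 8) & WrapMask;
  assert((Next & 0x00ffffffffffffffULL) == 0x0000700000008000ULL);
  return 0;
}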
+
+Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) {
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Function *ReadRegister =
+ Intrinsic::getDeclaration(M, Intrinsic::read_register, IntptrTy);
+ MDNode *MD = MDNode::get(*C, {MDString::get(*C, Name)});
+ Value *Args[] = {MetadataAsValue::get(*C, MD)};
+ return IRB.CreateCall(ReadRegister, Args);
+}
+
+bool HWAddressSanitizer::instrumentLandingPads(
+ SmallVectorImpl<Instruction *> &LandingPadVec) {
+ for (auto *LP : LandingPadVec) {
+ IRBuilder<> IRB(LP->getNextNode());
+ IRB.CreateCall(
+ HWAsanHandleVfork,
+ {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp"
+ : "sp")});
+ }
+ return true;
+}
+
+bool HWAddressSanitizer::instrumentStack(
+ SmallVectorImpl<AllocaInst *> &Allocas,
+ DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
+ SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) {
+ // Ideally, we want to calculate tagged stack base pointer, and rewrite all
+ // alloca addresses using that. Unfortunately, offsets are not known yet
+ // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
+ // temp, shift-OR it into each alloca address and xor with the retag mask.
+ // This generates one extra instruction per alloca use.
+ for (unsigned N = 0; N < Allocas.size(); ++N) {
+ auto *AI = Allocas[N];
+ IRBuilder<> IRB(AI->getNextNode());
+
+ // Replace uses of the alloca with tagged address.
+ Value *Tag = getAllocaTag(IRB, StackTag, AI, N);
+ Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
+ Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag);
+ std::string Name =
+ AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
+ Replacement->setName(Name + ".hwasan");
+
+ AI->replaceUsesWithIf(Replacement,
+ [AILong](Use &U) { return U.getUser() != AILong; });
+
+ for (auto *DDI : AllocaDbgMap.lookup(AI)) {
+ // Prepend "tag_offset, N" to the dwarf expression.
+ // Tag offset logically applies to the alloca pointer, and it makes sense
+ // to put it at the beginning of the expression.
+ SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset,
+ RetagMask(N)};
+ DDI->setArgOperand(
+ 2, MetadataAsValue::get(*C, DIExpression::prependOpcodes(
+ DDI->getExpression(), NewOps)));
+ }
+
+ size_t Size = getAllocaSizeInBytes(*AI);
+ tagAlloca(IRB, AI, Tag, Size);
+
+ for (auto RI : RetVec) {
+ IRB.SetInsertPoint(RI);
+
+ // Re-tag alloca memory with the special UAR tag.
+ Value *Tag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment()));
+ }
+ }
+
+ return true;
+}
+
+bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+ return (AI.getAllocatedType()->isSized() &&
+ // FIXME: instrument dynamic allocas, too
+ AI.isStaticAlloca() &&
+ // alloca() may be called with 0 size, ignore it.
+ getAllocaSizeInBytes(AI) > 0 &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ !isAllocaPromotable(&AI) &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
+}
+
+bool HWAddressSanitizer::sanitizeFunction(Function &F) {
+ if (&F == HwasanCtorFunction)
+ return false;
+
+ if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+
+ SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
+ SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
+ SmallVector<AllocaInst*, 8> AllocasToInstrument;
+ SmallVector<Instruction*, 8> RetVec;
+ SmallVector<Instruction*, 8> LandingPadVec;
+ DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap;
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if (ClInstrumentStack)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+ if (isInterestingAlloca(*AI))
+ AllocasToInstrument.push_back(AI);
+ continue;
+ }
+
+ if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
+ isa<CleanupReturnInst>(Inst))
+ RetVec.push_back(&Inst);
+
+ if (auto *DDI = dyn_cast<DbgVariableIntrinsic>(&Inst))
+ if (auto *Alloca =
+ dyn_cast_or_null<AllocaInst>(DDI->getVariableLocation()))
+ AllocaDbgMap[Alloca].push_back(DDI);
+
+ if (InstrumentLandingPads && isa<LandingPadInst>(Inst))
+ LandingPadVec.push_back(&Inst);
+
+ getInterestingMemoryOperands(&Inst, OperandsToInstrument);
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
+ IntrinToInstrument.push_back(MI);
+ }
+ }
+
+ initializeCallbacks(*F.getParent());
+
+ bool Changed = false;
+
+ if (!LandingPadVec.empty())
+ Changed |= instrumentLandingPads(LandingPadVec);
+
+ if (AllocasToInstrument.empty() && F.hasPersonalityFn() &&
+ F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) {
+ // __hwasan_personality_thunk is a no-op for functions without an
+ // instrumented stack, so we can drop it.
+ F.setPersonalityFn(nullptr);
+ Changed = true;
+ }
+
+ if (AllocasToInstrument.empty() && OperandsToInstrument.empty() &&
+ IntrinToInstrument.empty())
+ return Changed;
+
assert(!ShadowBase);
-
- Instruction *InsertPt = &*F.getEntryBlock().begin();
- IRBuilder<> EntryIRB(InsertPt);
- emitPrologue(EntryIRB,
- /*WithFrameRecord*/ ClRecordStackHistory &&
- !AllocasToInstrument.empty());
-
- if (!AllocasToInstrument.empty()) {
- Value *StackTag =
- ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
- instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag);
- }
- // Pad and align each of the allocas that we instrumented to stop small
- // uninteresting allocas from hiding in instrumented alloca's padding and so
- // that we have enough space to store real tags for short granules.
- DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
- for (AllocaInst *AI : AllocasToInstrument) {
- uint64_t Size = getAllocaSizeInBytes(*AI);
- uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- AI->setAlignment(
- Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
- if (Size != AlignedSize) {
- Type *AllocatedType = AI->getAllocatedType();
- if (AI->isArrayAllocation()) {
- uint64_t ArraySize =
- cast<ConstantInt>(AI->getArraySize())->getZExtValue();
- AllocatedType = ArrayType::get(AllocatedType, ArraySize);
- }
- Type *TypeWithPadding = StructType::get(
- AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size));
- auto *NewAI = new AllocaInst(
- TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
- NewAI->takeName(AI);
- NewAI->setAlignment(AI->getAlign());
- NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
- NewAI->setSwiftError(AI->isSwiftError());
- NewAI->copyMetadata(*AI);
- auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI);
- AI->replaceAllUsesWith(Bitcast);
- AllocaToPaddedAllocaMap[AI] = NewAI;
- }
- }
-
- if (!AllocaToPaddedAllocaMap.empty()) {
- for (auto &BB : F)
- for (auto &Inst : BB)
- if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst))
- if (auto *AI =
- dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation()))
- if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI))
- DVI->setArgOperand(
- 0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI)));
- for (auto &P : AllocaToPaddedAllocaMap)
- P.first->eraseFromParent();
- }
-
- // If we split the entry block, move any allocas that were originally in the
- // entry block back into the entry block so that they aren't treated as
- // dynamic allocas.
- if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) {
- InsertPt = &*F.getEntryBlock().begin();
- for (auto II = EntryIRB.GetInsertBlock()->begin(),
- IE = EntryIRB.GetInsertBlock()->end();
- II != IE;) {
- Instruction *I = &*II++;
- if (auto *AI = dyn_cast<AllocaInst>(I))
- if (isa<ConstantInt>(AI->getArraySize()))
- I->moveBefore(InsertPt);
- }
- }
-
- for (auto &Operand : OperandsToInstrument)
- instrumentMemAccess(Operand);
-
- if (ClInstrumentMemIntrinsics && !IntrinToInstrument.empty()) {
- for (auto Inst : IntrinToInstrument)
- instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
- }
-
+
+ Instruction *InsertPt = &*F.getEntryBlock().begin();
+ IRBuilder<> EntryIRB(InsertPt);
+ emitPrologue(EntryIRB,
+ /*WithFrameRecord*/ ClRecordStackHistory &&
+ !AllocasToInstrument.empty());
+
+ if (!AllocasToInstrument.empty()) {
+ Value *StackTag =
+ ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
+ instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag);
+ }
+ // Pad and align each of the allocas that we instrumented to stop small
+ // uninteresting allocas from hiding in instrumented alloca's padding and so
+ // that we have enough space to store real tags for short granules.
+ DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
+ for (AllocaInst *AI : AllocasToInstrument) {
+ uint64_t Size = getAllocaSizeInBytes(*AI);
+ uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ AI->setAlignment(
+ Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
+ if (Size != AlignedSize) {
+ Type *AllocatedType = AI->getAllocatedType();
+ if (AI->isArrayAllocation()) {
+ uint64_t ArraySize =
+ cast<ConstantInt>(AI->getArraySize())->getZExtValue();
+ AllocatedType = ArrayType::get(AllocatedType, ArraySize);
+ }
+ Type *TypeWithPadding = StructType::get(
+ AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size));
+ auto *NewAI = new AllocaInst(
+ TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
+ NewAI->takeName(AI);
+ NewAI->setAlignment(AI->getAlign());
+ NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
+ NewAI->setSwiftError(AI->isSwiftError());
+ NewAI->copyMetadata(*AI);
+ auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI);
+ AI->replaceAllUsesWith(Bitcast);
+ AllocaToPaddedAllocaMap[AI] = NewAI;
+ }
+ }
+
+ if (!AllocaToPaddedAllocaMap.empty()) {
+ for (auto &BB : F)
+ for (auto &Inst : BB)
+ if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst))
+ if (auto *AI =
+ dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation()))
+ if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI))
+ DVI->setArgOperand(
+ 0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI)));
+ for (auto &P : AllocaToPaddedAllocaMap)
+ P.first->eraseFromParent();
+ }
+
+ // If we split the entry block, move any allocas that were originally in the
+ // entry block back into the entry block so that they aren't treated as
+ // dynamic allocas.
+ if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) {
+ InsertPt = &*F.getEntryBlock().begin();
+ for (auto II = EntryIRB.GetInsertBlock()->begin(),
+ IE = EntryIRB.GetInsertBlock()->end();
+ II != IE;) {
+ Instruction *I = &*II++;
+ if (auto *AI = dyn_cast<AllocaInst>(I))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ I->moveBefore(InsertPt);
+ }
+ }
+
+ for (auto &Operand : OperandsToInstrument)
+ instrumentMemAccess(Operand);
+
+ if (ClInstrumentMemIntrinsics && !IntrinToInstrument.empty()) {
+ for (auto Inst : IntrinToInstrument)
+ instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+ }
+
ShadowBase = nullptr;
- StackBaseTag = nullptr;
-
- return true;
-}
-
-void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
- Constant *Initializer = GV->getInitializer();
- uint64_t SizeInBytes =
- M.getDataLayout().getTypeAllocSize(Initializer->getType());
- uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment());
- if (SizeInBytes != NewSize) {
- // Pad the initializer out to the next multiple of 16 bytes and add the
- // required short granule tag.
- std::vector<uint8_t> Init(NewSize - SizeInBytes, 0);
- Init.back() = Tag;
- Constant *Padding = ConstantDataArray::get(*C, Init);
- Initializer = ConstantStruct::getAnon({Initializer, Padding});
- }
-
- auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(),
- GlobalValue::ExternalLinkage, Initializer,
- GV->getName() + ".hwasan");
- NewGV->copyAttributesFrom(GV);
- NewGV->setLinkage(GlobalValue::PrivateLinkage);
- NewGV->copyMetadata(GV, 0);
- NewGV->setAlignment(
- MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment())));
-
- // It is invalid to ICF two globals that have different tags. In the case
- // where the size of the global is a multiple of the tag granularity the
- // contents of the globals may be the same but the tags (i.e. symbol values)
- // may be different, and the symbols are not considered during ICF. In the
- // case where the size is not a multiple of the granularity, the short granule
- // tags would discriminate two globals with different tags, but there would
- // otherwise be nothing stopping such a global from being incorrectly ICF'd
- // with an uninstrumented (i.e. tag 0) global that happened to have the short
- // granule tag in the last byte.
- NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
-
- // Descriptor format (assuming little-endian):
- // bytes 0-3: relative address of global
- // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case
- // it isn't, we create multiple descriptors)
- // byte 7: tag
- auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty);
- const uint64_t MaxDescriptorSize = 0xfffff0;
- for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes;
- DescriptorPos += MaxDescriptorSize) {
- auto *Descriptor =
- new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage,
- nullptr, GV->getName() + ".hwasan.descriptor");
- auto *GVRelPtr = ConstantExpr::getTrunc(
- ConstantExpr::getAdd(
- ConstantExpr::getSub(
- ConstantExpr::getPtrToInt(NewGV, Int64Ty),
- ConstantExpr::getPtrToInt(Descriptor, Int64Ty)),
- ConstantInt::get(Int64Ty, DescriptorPos)),
- Int32Ty);
- uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize);
- auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24));
- Descriptor->setComdat(NewGV->getComdat());
- Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag}));
- Descriptor->setSection("hwasan_globals");
- Descriptor->setMetadata(LLVMContext::MD_associated,
- MDNode::get(*C, ValueAsMetadata::get(NewGV)));
- appendToCompilerUsed(M, Descriptor);
- }
-
- Constant *Aliasee = ConstantExpr::getIntToPtr(
- ConstantExpr::getAdd(
- ConstantExpr::getPtrToInt(NewGV, Int64Ty),
- ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)),
- GV->getType());
- auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(),
- GV->getLinkage(), "", Aliasee, &M);
- Alias->setVisibility(GV->getVisibility());
- Alias->takeName(GV);
- GV->replaceAllUsesWith(Alias);
- GV->eraseFromParent();
-}
-
-void HWAddressSanitizer::instrumentGlobals() {
- std::vector<GlobalVariable *> Globals;
- for (GlobalVariable &GV : M.globals()) {
- if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") ||
- GV.isThreadLocal())
- continue;
-
- // Common symbols can't have aliases point to them, so they can't be tagged.
- if (GV.hasCommonLinkage())
- continue;
-
- // Globals with custom sections may be used in __start_/__stop_ enumeration,
- // which would be broken both by adding tags and potentially by the extra
- // padding/alignment that we insert.
- if (GV.hasSection())
- continue;
-
- Globals.push_back(&GV);
- }
-
- MD5 Hasher;
- Hasher.update(M.getSourceFileName());
- MD5::MD5Result Hash;
- Hasher.final(Hash);
- uint8_t Tag = Hash[0];
-
- for (GlobalVariable *GV : Globals) {
- // Skip tag 0 in order to avoid collisions with untagged memory.
- if (Tag == 0)
- Tag = 1;
- instrumentGlobal(GV, Tag++);
- }
-}
-
-void HWAddressSanitizer::instrumentPersonalityFunctions() {
- // We need to untag stack frames as we unwind past them. That is the job of
- // the personality function wrapper, which either wraps an existing
- // personality function or acts as a personality function on its own. Each
- // function that has a personality function or that can be unwound past has
- // its personality function changed to a thunk that calls the personality
- // function wrapper in the runtime.
- MapVector<Constant *, std::vector<Function *>> PersonalityFns;
- for (Function &F : M) {
- if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress))
- continue;
-
- if (F.hasPersonalityFn()) {
- PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F);
- } else if (!F.hasFnAttribute(Attribute::NoUnwind)) {
- PersonalityFns[nullptr].push_back(&F);
- }
- }
-
- if (PersonalityFns.empty())
- return;
-
- FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction(
- "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty,
- Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy);
- FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy);
- FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy);
-
- for (auto &P : PersonalityFns) {
- std::string ThunkName = kHwasanPersonalityThunkName;
- if (P.first)
- ThunkName += ("." + P.first->getName()).str();
- FunctionType *ThunkFnTy = FunctionType::get(
- Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false);
- bool IsLocal = P.first && (!isa<GlobalValue>(P.first) ||
- cast<GlobalValue>(P.first)->hasLocalLinkage());
- auto *ThunkFn = Function::Create(ThunkFnTy,
- IsLocal ? GlobalValue::InternalLinkage
- : GlobalValue::LinkOnceODRLinkage,
- ThunkName, &M);
- if (!IsLocal) {
- ThunkFn->setVisibility(GlobalValue::HiddenVisibility);
- ThunkFn->setComdat(M.getOrInsertComdat(ThunkName));
- }
-
- auto *BB = BasicBlock::Create(*C, "entry", ThunkFn);
- IRBuilder<> IRB(BB);
- CallInst *WrapperCall = IRB.CreateCall(
- HwasanPersonalityWrapper,
- {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2),
- ThunkFn->getArg(3), ThunkFn->getArg(4),
- P.first ? IRB.CreateBitCast(P.first, Int8PtrTy)
- : Constant::getNullValue(Int8PtrTy),
- IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy),
- IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)});
- WrapperCall->setTailCall();
- IRB.CreateRet(WrapperCall);
-
- for (Function *F : P.second)
- F->setPersonalityFn(ThunkFn);
- }
-}
-
-void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) {
- Scale = kDefaultShadowScale;
- if (ClMappingOffset.getNumOccurrences() > 0) {
- InGlobal = false;
- InTls = false;
- Offset = ClMappingOffset;
- } else if (ClEnableKhwasan || ClInstrumentWithCalls) {
- InGlobal = false;
- InTls = false;
- Offset = 0;
- } else if (ClWithIfunc) {
- InGlobal = true;
- InTls = false;
- Offset = kDynamicShadowSentinel;
- } else if (ClWithTls) {
- InGlobal = false;
- InTls = true;
- Offset = kDynamicShadowSentinel;
- } else {
- InGlobal = false;
- InTls = false;
- Offset = kDynamicShadowSentinel;
- }
-}
+ StackBaseTag = nullptr;
+
+ return true;
+}
+
+void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
+ Constant *Initializer = GV->getInitializer();
+ uint64_t SizeInBytes =
+ M.getDataLayout().getTypeAllocSize(Initializer->getType());
+ uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment());
+ if (SizeInBytes != NewSize) {
+ // Pad the initializer out to the next multiple of 16 bytes and add the
+ // required short granule tag.
+ std::vector<uint8_t> Init(NewSize - SizeInBytes, 0);
+ Init.back() = Tag;
+ Constant *Padding = ConstantDataArray::get(*C, Init);
+ Initializer = ConstantStruct::getAnon({Initializer, Padding});
+ }
+
+ auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(),
+ GlobalValue::ExternalLinkage, Initializer,
+ GV->getName() + ".hwasan");
+ NewGV->copyAttributesFrom(GV);
+ NewGV->setLinkage(GlobalValue::PrivateLinkage);
+ NewGV->copyMetadata(GV, 0);
+ NewGV->setAlignment(
+ MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment())));
+
+ // It is invalid to ICF two globals that have different tags. In the case
+ // where the size of the global is a multiple of the tag granularity the
+ // contents of the globals may be the same but the tags (i.e. symbol values)
+ // may be different, and the symbols are not considered during ICF. In the
+ // case where the size is not a multiple of the granularity, the short granule
+ // tags would discriminate two globals with different tags, but there would
+ // otherwise be nothing stopping such a global from being incorrectly ICF'd
+ // with an uninstrumented (i.e. tag 0) global that happened to have the short
+ // granule tag in the last byte.
+ NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+
+ // Descriptor format (assuming little-endian):
+ // bytes 0-3: relative address of global
+ // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case
+ // it isn't, we create multiple descriptors)
+ // byte 7: tag
+ auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty);
+ const uint64_t MaxDescriptorSize = 0xfffff0;
+ for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes;
+ DescriptorPos += MaxDescriptorSize) {
+ auto *Descriptor =
+ new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage,
+ nullptr, GV->getName() + ".hwasan.descriptor");
+ auto *GVRelPtr = ConstantExpr::getTrunc(
+ ConstantExpr::getAdd(
+ ConstantExpr::getSub(
+ ConstantExpr::getPtrToInt(NewGV, Int64Ty),
+ ConstantExpr::getPtrToInt(Descriptor, Int64Ty)),
+ ConstantInt::get(Int64Ty, DescriptorPos)),
+ Int32Ty);
+ uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize);
+ auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24));
+ Descriptor->setComdat(NewGV->getComdat());
+ Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag}));
+ Descriptor->setSection("hwasan_globals");
+ Descriptor->setMetadata(LLVMContext::MD_associated,
+ MDNode::get(*C, ValueAsMetadata::get(NewGV)));
+ appendToCompilerUsed(M, Descriptor);
+ }
+
+ Constant *Aliasee = ConstantExpr::getIntToPtr(
+ ConstantExpr::getAdd(
+ ConstantExpr::getPtrToInt(NewGV, Int64Ty),
+ ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)),
+ GV->getType());
+ auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(),
+ GV->getLinkage(), "", Aliasee, &M);
+ Alias->setVisibility(GV->getVisibility());
+ Alias->takeName(GV);
+ GV->replaceAllUsesWith(Alias);
+ GV->eraseFromParent();
+}
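// Editorial aside (not part of this patch): how the second 32-bit word of a
// hwasan_globals descriptor packs the size and tag described above. The size
// value is hypothetical.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Size = 0x000123; // size of the global, at most 0xfffff0 per descriptor
  uint8_t Tag = 0x2d;
  uint32_t SizeAndTag = Size | (uint32_t(Tag) << 24);
  assert((SizeAndTag >> 24) == Tag);         // byte 7: tag
  assert((SizeAndTag & 0x00ffffff) == Size); // bytes 4-6: size
  return 0;
}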
+
+void HWAddressSanitizer::instrumentGlobals() {
+ std::vector<GlobalVariable *> Globals;
+ for (GlobalVariable &GV : M.globals()) {
+ if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") ||
+ GV.isThreadLocal())
+ continue;
+
+ // Common symbols can't have aliases point to them, so they can't be tagged.
+ if (GV.hasCommonLinkage())
+ continue;
+
+ // Globals with custom sections may be used in __start_/__stop_ enumeration,
+ // which would be broken both by adding tags and potentially by the extra
+ // padding/alignment that we insert.
+ if (GV.hasSection())
+ continue;
+
+ Globals.push_back(&GV);
+ }
+
+ MD5 Hasher;
+ Hasher.update(M.getSourceFileName());
+ MD5::MD5Result Hash;
+ Hasher.final(Hash);
+ uint8_t Tag = Hash[0];
+
+ for (GlobalVariable *GV : Globals) {
+ // Skip tag 0 in order to avoid collisions with untagged memory.
+ if (Tag == 0)
+ Tag = 1;
+ instrumentGlobal(GV, Tag++);
+ }
+}
+
+void HWAddressSanitizer::instrumentPersonalityFunctions() {
+ // We need to untag stack frames as we unwind past them. That is the job of
+ // the personality function wrapper, which either wraps an existing
+ // personality function or acts as a personality function on its own. Each
+ // function that has a personality function or that can be unwound past has
+ // its personality function changed to a thunk that calls the personality
+ // function wrapper in the runtime.
+ MapVector<Constant *, std::vector<Function *>> PersonalityFns;
+ for (Function &F : M) {
+ if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress))
+ continue;
+
+ if (F.hasPersonalityFn()) {
+ PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F);
+ } else if (!F.hasFnAttribute(Attribute::NoUnwind)) {
+ PersonalityFns[nullptr].push_back(&F);
+ }
+ }
+
+ if (PersonalityFns.empty())
+ return;
+
+ FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction(
+ "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty,
+ Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy);
+ FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy);
+ FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy);
+
+ for (auto &P : PersonalityFns) {
+ std::string ThunkName = kHwasanPersonalityThunkName;
+ if (P.first)
+ ThunkName += ("." + P.first->getName()).str();
+ FunctionType *ThunkFnTy = FunctionType::get(
+ Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false);
+ bool IsLocal = P.first && (!isa<GlobalValue>(P.first) ||
+ cast<GlobalValue>(P.first)->hasLocalLinkage());
+ auto *ThunkFn = Function::Create(ThunkFnTy,
+ IsLocal ? GlobalValue::InternalLinkage
+ : GlobalValue::LinkOnceODRLinkage,
+ ThunkName, &M);
+ if (!IsLocal) {
+ ThunkFn->setVisibility(GlobalValue::HiddenVisibility);
+ ThunkFn->setComdat(M.getOrInsertComdat(ThunkName));
+ }
+
+ auto *BB = BasicBlock::Create(*C, "entry", ThunkFn);
+ IRBuilder<> IRB(BB);
+ CallInst *WrapperCall = IRB.CreateCall(
+ HwasanPersonalityWrapper,
+ {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2),
+ ThunkFn->getArg(3), ThunkFn->getArg(4),
+ P.first ? IRB.CreateBitCast(P.first, Int8PtrTy)
+ : Constant::getNullValue(Int8PtrTy),
+ IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy),
+ IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)});
+ WrapperCall->setTailCall();
+ IRB.CreateRet(WrapperCall);
+
+ for (Function *F : P.second)
+ F->setPersonalityFn(ThunkFn);
+ }
+}
+
+void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) {
+ Scale = kDefaultShadowScale;
+ if (ClMappingOffset.getNumOccurrences() > 0) {
+ InGlobal = false;
+ InTls = false;
+ Offset = ClMappingOffset;
+ } else if (ClEnableKhwasan || ClInstrumentWithCalls) {
+ InGlobal = false;
+ InTls = false;
+ Offset = 0;
+ } else if (ClWithIfunc) {
+ InGlobal = true;
+ InTls = false;
+ Offset = kDynamicShadowSentinel;
+ } else if (ClWithTls) {
+ InGlobal = false;
+ InTls = true;
+ Offset = kDynamicShadowSentinel;
+ } else {
+ InGlobal = false;
+ InTls = false;
+ Offset = kDynamicShadowSentinel;
+ }
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 6baf7e7dae..5b9557a9b3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -1,268 +1,268 @@
-//===- IndirectCallPromotion.cpp - Optimizations based on value profiling -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the transformation that promotes indirect calls to
-// conditional direct calls when the indirect-call value profile metadata is
-// available.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
-#include "llvm/Analysis/IndirectCallVisitor.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "pgo-icall-prom"
-
-STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
-STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
-
-// Command line option to disable indirect-call promotion with the default as
-// false. This is for debug purposes.
-static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
- cl::desc("Disable indirect call promotion"));
-
-// Set the cutoff value for the promotion. If the value is other than 0, we
-// stop the transformation once the total number of promotions equals the cutoff
-// value.
-// For debug use only.
-static cl::opt<unsigned>
- ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of promotions for this compilation"));
-
-// If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped.
-// For debug use only.
-static cl::opt<unsigned>
- ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Skip Callsite up to this number for this compilation"));
-
-// Set if the pass is called in LTO optimization. The difference for LTO mode
-// is the pass won't prefix the source module name to the internal linkage
-// symbols.
-static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
- cl::desc("Run indirect-call promotion in LTO "
- "mode"));
-
-// Set if the pass is called in SamplePGO mode. The difference for SamplePGO
-// mode is it will add prof metadata to the created direct call.
-static cl::opt<bool>
- ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
- cl::desc("Run indirect-call promotion in SamplePGO mode"));
-
-// If the option is set to true, only call instructions will be considered for
-// transformation -- invoke instructions will be ignored.
-static cl::opt<bool>
- ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
- cl::desc("Run indirect-call promotion for call instructions "
- "only"));
-
-// If the option is set to true, only invoke instructions will be considered for
-// transformation -- call instructions will be ignored.
-static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
- cl::Hidden,
- cl::desc("Run indirect-call promotion for "
- "invoke instruction only"));
-
-// Dump the function level IR if the transformation happened in this
-// function. For debug use only.
-static cl::opt<bool>
- ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
- cl::desc("Dump IR after transformation happens"));
-
-namespace {
-
-class PGOIndirectCallPromotionLegacyPass : public ModulePass {
-public:
- static char ID;
-
- PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
- : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
- initializePGOIndirectCallPromotionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-
- StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
-
-private:
- bool runOnModule(Module &M) override;
-
-  // If this pass is called in LTO, we need special handling of the PGOFuncName
- // for the static variables due to LTO's internalization.
- bool InLTO;
-
-  // If this pass is called in SamplePGO, we need to add the prof metadata to
- // the promoted direct call.
- bool SamplePGO;
-};
-
-} // end anonymous namespace
-
-char PGOIndirectCallPromotionLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
- "Use PGO instrumentation profile to promote indirect "
- "calls to direct calls.",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
- "Use PGO instrumentation profile to promote indirect "
- "calls to direct calls.",
- false, false)
-
-ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
- bool SamplePGO) {
- return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
-}
-
-namespace {
-
-// The class for main data structure to promote indirect calls to conditional
-// direct calls.
-class ICallPromotionFunc {
-private:
- Function &F;
- Module *M;
-
- // Symtab that maps indirect call profile values to function names and
- // defines.
- InstrProfSymtab *Symtab;
-
- bool SamplePGO;
-
- OptimizationRemarkEmitter &ORE;
-
-  // A struct that records the direct target and its call count.
- struct PromotionCandidate {
- Function *TargetFunction;
- uint64_t Count;
-
- PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
- };
-
- // Check if the indirect-call call site should be promoted. Return the number
- // of promotions. Inst is the candidate indirect call, ValueDataRef
- // contains the array of value profile data for profiled targets,
- // TotalCount is the total profiled count of call executions, and
- // NumCandidates is the number of candidate entries in ValueDataRef.
- std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
- const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
- uint64_t TotalCount, uint32_t NumCandidates);
-
- // Promote a list of targets for one indirect-call callsite. Return
- // the number of promotions.
- uint32_t tryToPromote(CallBase &CB,
- const std::vector<PromotionCandidate> &Candidates,
- uint64_t &TotalCount);
-
-public:
- ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
- bool SamplePGO, OptimizationRemarkEmitter &ORE)
- : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO), ORE(ORE) {}
- ICallPromotionFunc(const ICallPromotionFunc &) = delete;
- ICallPromotionFunc &operator=(const ICallPromotionFunc &) = delete;
-
- bool processFunction(ProfileSummaryInfo *PSI);
-};
-
-} // end anonymous namespace
-
-// Indirect-call promotion heuristic. The direct targets are sorted based on
-// the count. Stop at the first target that is not promoted.
-std::vector<ICallPromotionFunc::PromotionCandidate>
-ICallPromotionFunc::getPromotionCandidatesForCallSite(
- const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
- uint64_t TotalCount, uint32_t NumCandidates) {
- std::vector<PromotionCandidate> Ret;
-
- LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << CB
- << " Num_targets: " << ValueDataRef.size()
- << " Num_candidates: " << NumCandidates << "\n");
- NumOfPGOICallsites++;
- if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
- LLVM_DEBUG(dbgs() << " Skip: User options.\n");
- return Ret;
- }
-
- for (uint32_t I = 0; I < NumCandidates; I++) {
- uint64_t Count = ValueDataRef[I].Count;
- assert(Count <= TotalCount);
- uint64_t Target = ValueDataRef[I].Value;
- LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
- << " Target_func: " << Target << "\n");
-
- if (ICPInvokeOnly && isa<CallInst>(CB)) {
- LLVM_DEBUG(dbgs() << " Not promote: User options.\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
- << " Not promote: User options";
- });
- break;
- }
- if (ICPCallOnly && isa<InvokeInst>(CB)) {
- LLVM_DEBUG(dbgs() << " Not promote: User option.\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
- << " Not promote: User options";
- });
- break;
- }
- if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", &CB)
- << " Not promote: Cutoff reached";
- });
- break;
- }
-
+//===- IndirectCallPromotion.cpp - Optimizations based on value profiling -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that promotes indirect calls to
+// conditional direct calls when the indirect-call value profile metadata is
+// available.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-icall-prom"
+
+STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
+STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+
+// Command line option to disable indirect-call promotion with the default as
+// false. This is for debug purposes.
+static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
+ cl::desc("Disable indirect call promotion"));
+
+// Set the cutoff value for the promotion. If the value is other than 0, we
+// stop the transformation once the total number of promotions equals the cutoff
+// value.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of promotions for this compilation"));
+
+// If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Skip Callsite up to this number for this compilation"));
+
+// Set if the pass is called in LTO optimization. The difference for LTO mode
+// is the pass won't prefix the source module name to the internal linkage
+// symbols.
+static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in LTO "
+ "mode"));
+
+// Set if the pass is called in SamplePGO mode. The difference for SamplePGO
+// mode is it will add prof metadata to the created direct call.
+static cl::opt<bool>
+ ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in SamplePGO mode"));
+
+// If the option is set to true, only call instructions will be considered for
+// transformation -- invoke instructions will be ignored.
+static cl::opt<bool>
+ ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion for call instructions "
+ "only"));
+
+// If the option is set to true, only invoke instructions will be considered for
+// transformation -- call instructions will be ignored.
+static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
+ cl::Hidden,
+ cl::desc("Run indirect-call promotion for "
+ "invoke instruction only"));
+
+// Dump the function level IR if the transformation happened in this
+// function. For debug use only.
+static cl::opt<bool>
+ ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
+ cl::desc("Dump IR after transformation happens"));
+
+namespace {
+
+class PGOIndirectCallPromotionLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
+ : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
+ initializePGOIndirectCallPromotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
+
+ StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
+
+private:
+ bool runOnModule(Module &M) override;
+
+  // If this pass is called in LTO, we need special handling of the PGOFuncName
+ // for the static variables due to LTO's internalization.
+ bool InLTO;
+
+  // If this pass is called in SamplePGO, we need to add the prof metadata to
+ // the promoted direct call.
+ bool SamplePGO;
+};
+
+} // end anonymous namespace
+
+char PGOIndirectCallPromotionLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+ "Use PGO instrumentation profile to promote indirect "
+ "calls to direct calls.",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+ "Use PGO instrumentation profile to promote indirect "
+ "calls to direct calls.",
+ false, false)
+
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
+ bool SamplePGO) {
+ return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
+}
+
+namespace {
+
+// The class for main data structure to promote indirect calls to conditional
+// direct calls.
+class ICallPromotionFunc {
+private:
+ Function &F;
+ Module *M;
+
+ // Symtab that maps indirect call profile values to function names and
+ // defines.
+ InstrProfSymtab *Symtab;
+
+ bool SamplePGO;
+
+ OptimizationRemarkEmitter &ORE;
+
+  // A struct that records the direct target and its call count.
+ struct PromotionCandidate {
+ Function *TargetFunction;
+ uint64_t Count;
+
+ PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
+ };
+
+ // Check if the indirect-call call site should be promoted. Return the number
+ // of promotions. Inst is the candidate indirect call, ValueDataRef
+ // contains the array of value profile data for profiled targets,
+ // TotalCount is the total profiled count of call executions, and
+ // NumCandidates is the number of candidate entries in ValueDataRef.
+ std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
+ const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates);
+
+ // Promote a list of targets for one indirect-call callsite. Return
+ // the number of promotions.
+ uint32_t tryToPromote(CallBase &CB,
+ const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount);
+
+public:
+ ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
+ bool SamplePGO, OptimizationRemarkEmitter &ORE)
+ : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO), ORE(ORE) {}
+ ICallPromotionFunc(const ICallPromotionFunc &) = delete;
+ ICallPromotionFunc &operator=(const ICallPromotionFunc &) = delete;
+
+ bool processFunction(ProfileSummaryInfo *PSI);
+};
+
+} // end anonymous namespace
+
+// Indirect-call promotion heuristic. The direct targets are sorted based on
+// the count. Stop at the first target that is not promoted.
+std::vector<ICallPromotionFunc::PromotionCandidate>
+ICallPromotionFunc::getPromotionCandidatesForCallSite(
+ const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates) {
+ std::vector<PromotionCandidate> Ret;
+
+ LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << CB
+ << " Num_targets: " << ValueDataRef.size()
+ << " Num_candidates: " << NumCandidates << "\n");
+ NumOfPGOICallsites++;
+ if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
+ LLVM_DEBUG(dbgs() << " Skip: User options.\n");
+ return Ret;
+ }
+
+ for (uint32_t I = 0; I < NumCandidates; I++) {
+ uint64_t Count = ValueDataRef[I].Count;
+ assert(Count <= TotalCount);
+ uint64_t Target = ValueDataRef[I].Value;
+ LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << Target << "\n");
+
+ if (ICPInvokeOnly && isa<CallInst>(CB)) {
+ LLVM_DEBUG(dbgs() << " Not promote: User options.\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
+ << " Not promote: User options";
+ });
+ break;
+ }
+ if (ICPCallOnly && isa<InvokeInst>(CB)) {
+ LLVM_DEBUG(dbgs() << " Not promote: User option.\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
+ << " Not promote: User options";
+ });
+ break;
+ }
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", &CB)
+ << " Not promote: Cutoff reached";
+ });
+ break;
+ }
+
// Don't promote if the symbol is not defined in the module. This avoids
// creating a reference to a symbol that doesn't exist in the module
// This can happen when we compile with a sample profile collected from
@@ -270,180 +270,180 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
// aren't used in the new binary. We might have a declaration initially in
// the case where the symbol is globally dead in the binary and removed by
// ThinLTO.
- Function *TargetFunction = Symtab->getFunction(Target);
+ Function *TargetFunction = Symtab->getFunction(Target);
if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
- LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
- << "Cannot promote indirect call: target with md5sum "
- << ore::NV("target md5sum", Target) << " not found";
- });
- break;
- }
-
- const char *Reason = nullptr;
- if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
- using namespace ore;
-
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
- << "Cannot promote indirect call to "
- << NV("TargetFunction", TargetFunction) << " with count of "
- << NV("Count", Count) << ": " << Reason;
- });
- break;
- }
-
- Ret.push_back(PromotionCandidate(TargetFunction, Count));
- TotalCount -= Count;
- }
- return Ret;
-}
-
-CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
- uint64_t Count, uint64_t TotalCount,
- bool AttachProfToDirectCall,
- OptimizationRemarkEmitter *ORE) {
-
- uint64_t ElseCount = TotalCount - Count;
- uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
- uint64_t Scale = calculateCountScale(MaxCount);
- MDBuilder MDB(CB.getContext());
- MDNode *BranchWeights = MDB.createBranchWeights(
- scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
-
- CallBase &NewInst =
- promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
-
- if (AttachProfToDirectCall) {
- MDBuilder MDB(NewInst.getContext());
- NewInst.setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights({static_cast<uint32_t>(Count)}));
- }
-
- using namespace ore;
-
- if (ORE)
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
- << "Promote indirect call to " << NV("DirectCallee", DirectCallee)
- << " with count " << NV("Count", Count) << " out of "
- << NV("TotalCount", TotalCount);
- });
- return NewInst;
-}
-
-// Promote indirect-call to conditional direct-call for one callsite.
-uint32_t ICallPromotionFunc::tryToPromote(
- CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
- uint64_t &TotalCount) {
- uint32_t NumPromoted = 0;
-
- for (auto &C : Candidates) {
- uint64_t Count = C.Count;
- pgo::promoteIndirectCall(CB, C.TargetFunction, Count, TotalCount, SamplePGO,
- &ORE);
- assert(TotalCount >= Count);
- TotalCount -= Count;
- NumOfPGOICallPromotion++;
- NumPromoted++;
- }
- return NumPromoted;
-}
-
-// Traverse all the indirect-call callsites and get the value profile
-// annotation to perform indirect-call promotion.
-bool ICallPromotionFunc::processFunction(ProfileSummaryInfo *PSI) {
- bool Changed = false;
- ICallPromotionAnalysis ICallAnalysis;
- for (auto *CB : findIndirectCalls(F)) {
- uint32_t NumVals, NumCandidates;
- uint64_t TotalCount;
- auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
- CB, NumVals, TotalCount, NumCandidates);
- if (!NumCandidates ||
- (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
- continue;
- auto PromotionCandidates = getPromotionCandidatesForCallSite(
- *CB, ICallProfDataRef, TotalCount, NumCandidates);
- uint32_t NumPromoted = tryToPromote(*CB, PromotionCandidates, TotalCount);
- if (NumPromoted == 0)
- continue;
-
- Changed = true;
- // Adjust the MD.prof metadata. First delete the old one.
- CB->setMetadata(LLVMContext::MD_prof, nullptr);
- // If all promoted, we don't need the MD.prof metadata.
- if (TotalCount == 0 || NumPromoted == NumVals)
- continue;
- // Otherwise we need to write the un-promoted records back.
- annotateValueSite(*M, *CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
- IPVK_IndirectCallTarget, NumCandidates);
- }
- return Changed;
-}
-
-// A wrapper function that does the actual work.
-static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
- bool InLTO, bool SamplePGO,
- ModuleAnalysisManager *AM = nullptr) {
- if (DisableICP)
- return false;
- InstrProfSymtab Symtab;
- if (Error E = Symtab.create(M, InLTO)) {
- std::string SymtabFailure = toString(std::move(E));
- LLVM_DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
- (void)SymtabFailure;
- return false;
- }
- bool Changed = false;
- for (auto &F : M) {
- if (F.isDeclaration() || F.hasOptNone())
- continue;
-
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- OptimizationRemarkEmitter *ORE;
- if (AM) {
- auto &FAM =
- AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- } else {
- OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
- ORE = OwnedORE.get();
- }
-
- ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO, *ORE);
- bool FuncChanged = ICallPromotion.processFunction(PSI);
- if (ICPDUMPAFTER && FuncChanged) {
- LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- Changed |= FuncChanged;
- if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- LLVM_DEBUG(dbgs() << " Stop: Cutoff reached.\n");
- break;
- }
- }
- return Changed;
-}
-
-bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- // The command-line option takes priority for InLTO.
- return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
- SamplePGO | ICPSamplePGOMode);
-}
-
-PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
- ModuleAnalysisManager &AM) {
- ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (!promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
- SamplePGO | ICPSamplePGOMode, &AM))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
+ LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
+ << "Cannot promote indirect call: target with md5sum "
+ << ore::NV("target md5sum", Target) << " not found";
+ });
+ break;
+ }
+
+ const char *Reason = nullptr;
+ if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
+ << "Cannot promote indirect call to "
+ << NV("TargetFunction", TargetFunction) << " with count of "
+ << NV("Count", Count) << ": " << Reason;
+ });
+ break;
+ }
+
+ Ret.push_back(PromotionCandidate(TargetFunction, Count));
+ TotalCount -= Count;
+ }
+ return Ret;
+}
+
+CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
+ uint64_t Count, uint64_t TotalCount,
+ bool AttachProfToDirectCall,
+ OptimizationRemarkEmitter *ORE) {
+
+ uint64_t ElseCount = TotalCount - Count;
+ uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
+ uint64_t Scale = calculateCountScale(MaxCount);
+ MDBuilder MDB(CB.getContext());
+ MDNode *BranchWeights = MDB.createBranchWeights(
+ scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
+
+ CallBase &NewInst =
+ promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
+
+ if (AttachProfToDirectCall) {
+ MDBuilder MDB(NewInst.getContext());
+ NewInst.setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights({static_cast<uint32_t>(Count)}));
+ }
+
+ using namespace ore;
+
+ if (ORE)
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
+ << "Promote indirect call to " << NV("DirectCallee", DirectCallee)
+ << " with count " << NV("Count", Count) << " out of "
+ << NV("TotalCount", TotalCount);
+ });
+ return NewInst;
+}
+
+// Promote indirect-call to conditional direct-call for one callsite.
+uint32_t ICallPromotionFunc::tryToPromote(
+ CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount) {
+ uint32_t NumPromoted = 0;
+
+ for (auto &C : Candidates) {
+ uint64_t Count = C.Count;
+ pgo::promoteIndirectCall(CB, C.TargetFunction, Count, TotalCount, SamplePGO,
+ &ORE);
+ assert(TotalCount >= Count);
+ TotalCount -= Count;
+ NumOfPGOICallPromotion++;
+ NumPromoted++;
+ }
+ return NumPromoted;
+}
+
+// Traverse all the indirect-call callsites and get the value profile
+// annotation to perform indirect-call promotion.
+bool ICallPromotionFunc::processFunction(ProfileSummaryInfo *PSI) {
+ bool Changed = false;
+ ICallPromotionAnalysis ICallAnalysis;
+ for (auto *CB : findIndirectCalls(F)) {
+ uint32_t NumVals, NumCandidates;
+ uint64_t TotalCount;
+ auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
+ CB, NumVals, TotalCount, NumCandidates);
+ if (!NumCandidates ||
+ (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
+ continue;
+ auto PromotionCandidates = getPromotionCandidatesForCallSite(
+ *CB, ICallProfDataRef, TotalCount, NumCandidates);
+ uint32_t NumPromoted = tryToPromote(*CB, PromotionCandidates, TotalCount);
+ if (NumPromoted == 0)
+ continue;
+
+ Changed = true;
+ // Adjust the MD.prof metadata. First delete the old one.
+ CB->setMetadata(LLVMContext::MD_prof, nullptr);
+ // If all promoted, we don't need the MD.prof metadata.
+ if (TotalCount == 0 || NumPromoted == NumVals)
+ continue;
+ // Otherwise we need to write the un-promoted records back.
+ annotateValueSite(*M, *CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
+ IPVK_IndirectCallTarget, NumCandidates);
+ }
+ return Changed;
+}
+
+// A wrapper function that does the actual work.
+static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
+ bool InLTO, bool SamplePGO,
+ ModuleAnalysisManager *AM = nullptr) {
+ if (DisableICP)
+ return false;
+ InstrProfSymtab Symtab;
+ if (Error E = Symtab.create(M, InLTO)) {
+ std::string SymtabFailure = toString(std::move(E));
+ LLVM_DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
+ (void)SymtabFailure;
+ return false;
+ }
+ bool Changed = false;
+ for (auto &F : M) {
+ if (F.isDeclaration() || F.hasOptNone())
+ continue;
+
+ std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
+ OptimizationRemarkEmitter *ORE;
+ if (AM) {
+ auto &FAM =
+ AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ } else {
+ OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
+ ORE = OwnedORE.get();
+ }
+
+ ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO, *ORE);
+ bool FuncChanged = ICallPromotion.processFunction(PSI);
+ if (ICPDUMPAFTER && FuncChanged) {
+ LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ Changed |= FuncChanged;
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ LLVM_DEBUG(dbgs() << " Stop: Cutoff reached.\n");
+ break;
+ }
+ }
+ return Changed;
+}
+
+bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ // The command-line option takes priority for InLTO.
+ return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode);
+}
+
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (!promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode, &AM))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
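
The net effect of the pass above is to rewrite each sufficiently hot indirect call into an if-then-else whose "then" arm calls the profiled target directly, with branch weights derived from the 64-bit profile counts scaled into the 32-bit range that MD_prof metadata expects. The standalone C++ sketch below is an editorial illustration, not code from this commit: countScale() and scaleCount() are simplified stand-ins for the calculateCountScale() and scaleBranchCount() helpers used in promoteIndirectCall(), and the sample counts are made up.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>

// Pick a divisor so the larger of the two counts fits into a uint32_t weight.
static uint64_t countScale(uint64_t MaxCount) {
  const uint64_t Limit = std::numeric_limits<uint32_t>::max();
  return MaxCount < Limit ? 1 : MaxCount / Limit + 1;
}

static uint32_t scaleCount(uint64_t Count, uint64_t Scale) {
  return static_cast<uint32_t>(Count / Scale);
}

int main() {
  uint64_t Count = 6000000000ULL;      // profiled hits of the promoted target
  uint64_t TotalCount = 6500000000ULL; // all profiled executions of the call site
  uint64_t ElseCount = TotalCount - Count;
  uint64_t Scale = countScale(std::max(Count, ElseCount));
  // These two values become the "then"/"else" branch weights of the guard
  // around the new direct call.
  std::cout << scaleCount(Count, Scale) << " " << scaleCount(ElseCount, Scale)
            << "\n";
  return 0;
}
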
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp
index 0addfb46b2..853385fbf8 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp
@@ -1,212 +1,212 @@
-//===- InstrOrderFile.cpp ---- Late IR instrumentation for order file ----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include <fstream>
-#include <map>
-#include <mutex>
-#include <set>
-#include <sstream>
-
-using namespace llvm;
-#define DEBUG_TYPE "instrorderfile"
-
-static cl::opt<std::string> ClOrderFileWriteMapping(
- "orderfile-write-mapping", cl::init(""),
- cl::desc(
- "Dump functions and their MD5 hash to deobfuscate profile data"),
- cl::Hidden);
-
-namespace {
-
-// We need a global bitmap to tell if a function is executed. We also
-// need a global variable to save the order of functions. We can use a
-// fixed-size buffer that saves the MD5 hash of the function. We need
-// a global variable to save the index into the buffer.
-
-std::mutex MappingMutex;
-
-struct InstrOrderFile {
-private:
- GlobalVariable *OrderFileBuffer;
- GlobalVariable *BufferIdx;
- GlobalVariable *BitMap;
- ArrayType *BufferTy;
- ArrayType *MapTy;
-
-public:
- InstrOrderFile() {}
-
- void createOrderFileData(Module &M) {
- LLVMContext &Ctx = M.getContext();
- int NumFunctions = 0;
- for (Function &F : M) {
- if (!F.isDeclaration())
- NumFunctions++;
- }
-
- BufferTy =
- ArrayType::get(Type::getInt64Ty(Ctx), INSTR_ORDER_FILE_BUFFER_SIZE);
- Type *IdxTy = Type::getInt32Ty(Ctx);
- MapTy = ArrayType::get(Type::getInt8Ty(Ctx), NumFunctions);
-
- // Create the global variables.
- std::string SymbolName = INSTR_PROF_ORDERFILE_BUFFER_NAME_STR;
- OrderFileBuffer = new GlobalVariable(M, BufferTy, false, GlobalValue::LinkOnceODRLinkage,
- Constant::getNullValue(BufferTy), SymbolName);
- Triple TT = Triple(M.getTargetTriple());
- OrderFileBuffer->setSection(
- getInstrProfSectionName(IPSK_orderfile, TT.getObjectFormat()));
-
- std::string IndexName = INSTR_PROF_ORDERFILE_BUFFER_IDX_NAME_STR;
- BufferIdx = new GlobalVariable(M, IdxTy, false, GlobalValue::LinkOnceODRLinkage,
- Constant::getNullValue(IdxTy), IndexName);
-
- std::string BitMapName = "bitmap_0";
- BitMap = new GlobalVariable(M, MapTy, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(MapTy), BitMapName);
- }
-
- // Generate the code sequence in the entry block of each function to
- // update the buffer.
- void generateCodeSequence(Module &M, Function &F, int FuncId) {
- if (!ClOrderFileWriteMapping.empty()) {
- std::lock_guard<std::mutex> LogLock(MappingMutex);
- std::error_code EC;
- llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC,
- llvm::sys::fs::OF_Append);
- if (EC) {
- report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping +
- " to save mapping file for order file instrumentation\n");
- } else {
- std::stringstream stream;
- stream << std::hex << MD5Hash(F.getName());
- std::string singleLine = "MD5 " + stream.str() + " " +
- std::string(F.getName()) + '\n';
- OS << singleLine;
- }
- }
-
- BasicBlock *OrigEntry = &F.getEntryBlock();
-
- LLVMContext &Ctx = M.getContext();
- IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
- IntegerType *Int8Ty = Type::getInt8Ty(Ctx);
-
- // Create a new entry block for instrumentation. We will check the bitmap
- // in this basic block.
- BasicBlock *NewEntry =
- BasicBlock::Create(M.getContext(), "order_file_entry", &F, OrigEntry);
- IRBuilder<> entryB(NewEntry);
- // Create a basic block for updating the circular buffer.
- BasicBlock *UpdateOrderFileBB =
- BasicBlock::Create(M.getContext(), "order_file_set", &F, OrigEntry);
- IRBuilder<> updateB(UpdateOrderFileBB);
-
- // Check the bitmap; if it is already 1, do nothing.
- // Otherwise, set the bit, grab the index, update the buffer.
- Value *IdxFlags[] = {ConstantInt::get(Int32Ty, 0),
- ConstantInt::get(Int32Ty, FuncId)};
- Value *MapAddr = entryB.CreateGEP(MapTy, BitMap, IdxFlags, "");
- LoadInst *loadBitMap = entryB.CreateLoad(Int8Ty, MapAddr, "");
- entryB.CreateStore(ConstantInt::get(Int8Ty, 1), MapAddr);
- Value *IsNotExecuted =
- entryB.CreateICmpEQ(loadBitMap, ConstantInt::get(Int8Ty, 0));
- entryB.CreateCondBr(IsNotExecuted, UpdateOrderFileBB, OrigEntry);
-
- // Fill up UpdateOrderFileBB: grab the index, update the buffer!
- Value *IdxVal = updateB.CreateAtomicRMW(
- AtomicRMWInst::Add, BufferIdx, ConstantInt::get(Int32Ty, 1),
- AtomicOrdering::SequentiallyConsistent);
- // We need to wrap around the index to fit it inside the buffer.
- Value *WrappedIdx = updateB.CreateAnd(
- IdxVal, ConstantInt::get(Int32Ty, INSTR_ORDER_FILE_BUFFER_MASK));
- Value *BufferGEPIdx[] = {ConstantInt::get(Int32Ty, 0), WrappedIdx};
- Value *BufferAddr =
- updateB.CreateGEP(BufferTy, OrderFileBuffer, BufferGEPIdx, "");
- updateB.CreateStore(ConstantInt::get(Type::getInt64Ty(Ctx), MD5Hash(F.getName())),
- BufferAddr);
- updateB.CreateBr(OrigEntry);
- }
-
- bool run(Module &M) {
- createOrderFileData(M);
-
- int FuncId = 0;
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
- generateCodeSequence(M, F, FuncId);
- ++FuncId;
- }
-
- return true;
- }
-
-}; // End of InstrOrderFile struct
-
-class InstrOrderFileLegacyPass : public ModulePass {
-public:
- static char ID;
-
- InstrOrderFileLegacyPass() : ModulePass(ID) {
- initializeInstrOrderFileLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-};
-
-} // End anonymous namespace
-
-bool InstrOrderFileLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- return InstrOrderFile().run(M);
-}
-
-PreservedAnalyses
-InstrOrderFilePass::run(Module &M, ModuleAnalysisManager &AM) {
- if (InstrOrderFile().run(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-INITIALIZE_PASS_BEGIN(InstrOrderFileLegacyPass, "instrorderfile",
- "Instrumentation for Order File", false, false)
-INITIALIZE_PASS_END(InstrOrderFileLegacyPass, "instrorderfile",
- "Instrumentation for Order File", false, false)
-
-char InstrOrderFileLegacyPass::ID = 0;
-
-ModulePass *llvm::createInstrOrderFilePass() {
- return new InstrOrderFileLegacyPass();
-}
+//===- InstrOrderFile.cpp ---- Late IR instrumentation for order file ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include <fstream>
+#include <map>
+#include <mutex>
+#include <set>
+#include <sstream>
+
+using namespace llvm;
+#define DEBUG_TYPE "instrorderfile"
+
+static cl::opt<std::string> ClOrderFileWriteMapping(
+ "orderfile-write-mapping", cl::init(""),
+ cl::desc(
+ "Dump functions and their MD5 hash to deobfuscate profile data"),
+ cl::Hidden);
+
+namespace {
+
+// We need a global bitmap to tell if a function is executed. We also
+// need a global variable to save the order of functions. We can use a
+// fixed-size buffer that saves the MD5 hash of the function. We need
+// a global variable to save the index into the buffer.
+
+std::mutex MappingMutex;
+
+struct InstrOrderFile {
+private:
+ GlobalVariable *OrderFileBuffer;
+ GlobalVariable *BufferIdx;
+ GlobalVariable *BitMap;
+ ArrayType *BufferTy;
+ ArrayType *MapTy;
+
+public:
+ InstrOrderFile() {}
+
+ void createOrderFileData(Module &M) {
+ LLVMContext &Ctx = M.getContext();
+ int NumFunctions = 0;
+ for (Function &F : M) {
+ if (!F.isDeclaration())
+ NumFunctions++;
+ }
+
+ BufferTy =
+ ArrayType::get(Type::getInt64Ty(Ctx), INSTR_ORDER_FILE_BUFFER_SIZE);
+ Type *IdxTy = Type::getInt32Ty(Ctx);
+ MapTy = ArrayType::get(Type::getInt8Ty(Ctx), NumFunctions);
+
+ // Create the global variables.
+ std::string SymbolName = INSTR_PROF_ORDERFILE_BUFFER_NAME_STR;
+ OrderFileBuffer = new GlobalVariable(M, BufferTy, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(BufferTy), SymbolName);
+ Triple TT = Triple(M.getTargetTriple());
+ OrderFileBuffer->setSection(
+ getInstrProfSectionName(IPSK_orderfile, TT.getObjectFormat()));
+
+ std::string IndexName = INSTR_PROF_ORDERFILE_BUFFER_IDX_NAME_STR;
+ BufferIdx = new GlobalVariable(M, IdxTy, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(IdxTy), IndexName);
+
+ std::string BitMapName = "bitmap_0";
+ BitMap = new GlobalVariable(M, MapTy, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(MapTy), BitMapName);
+ }
+
+ // Generate the code sequence in the entry block of each function to
+ // update the buffer.
+ void generateCodeSequence(Module &M, Function &F, int FuncId) {
+ if (!ClOrderFileWriteMapping.empty()) {
+ std::lock_guard<std::mutex> LogLock(MappingMutex);
+ std::error_code EC;
+ llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC,
+ llvm::sys::fs::OF_Append);
+ if (EC) {
+ report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping +
+ " to save mapping file for order file instrumentation\n");
+ } else {
+ std::stringstream stream;
+ stream << std::hex << MD5Hash(F.getName());
+ std::string singleLine = "MD5 " + stream.str() + " " +
+ std::string(F.getName()) + '\n';
+ OS << singleLine;
+ }
+ }
+
+ BasicBlock *OrigEntry = &F.getEntryBlock();
+
+ LLVMContext &Ctx = M.getContext();
+ IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
+ IntegerType *Int8Ty = Type::getInt8Ty(Ctx);
+
+ // Create a new entry block for instrumentation. We will check the bitmap
+ // in this basic block.
+ BasicBlock *NewEntry =
+ BasicBlock::Create(M.getContext(), "order_file_entry", &F, OrigEntry);
+ IRBuilder<> entryB(NewEntry);
+ // Create a basic block for updating the circular buffer.
+ BasicBlock *UpdateOrderFileBB =
+ BasicBlock::Create(M.getContext(), "order_file_set", &F, OrigEntry);
+ IRBuilder<> updateB(UpdateOrderFileBB);
+
+ // Check the bitmap; if it is already 1, do nothing.
+ // Otherwise, set the bit, grab the index, update the buffer.
+ Value *IdxFlags[] = {ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, FuncId)};
+ Value *MapAddr = entryB.CreateGEP(MapTy, BitMap, IdxFlags, "");
+ LoadInst *loadBitMap = entryB.CreateLoad(Int8Ty, MapAddr, "");
+ entryB.CreateStore(ConstantInt::get(Int8Ty, 1), MapAddr);
+ Value *IsNotExecuted =
+ entryB.CreateICmpEQ(loadBitMap, ConstantInt::get(Int8Ty, 0));
+ entryB.CreateCondBr(IsNotExecuted, UpdateOrderFileBB, OrigEntry);
+
+ // Fill up UpdateOrderFileBB: grab the index, update the buffer!
+ Value *IdxVal = updateB.CreateAtomicRMW(
+ AtomicRMWInst::Add, BufferIdx, ConstantInt::get(Int32Ty, 1),
+ AtomicOrdering::SequentiallyConsistent);
+ // We need to wrap around the index to fit it inside the buffer.
+ Value *WrappedIdx = updateB.CreateAnd(
+ IdxVal, ConstantInt::get(Int32Ty, INSTR_ORDER_FILE_BUFFER_MASK));
+ Value *BufferGEPIdx[] = {ConstantInt::get(Int32Ty, 0), WrappedIdx};
+ Value *BufferAddr =
+ updateB.CreateGEP(BufferTy, OrderFileBuffer, BufferGEPIdx, "");
+ updateB.CreateStore(ConstantInt::get(Type::getInt64Ty(Ctx), MD5Hash(F.getName())),
+ BufferAddr);
+ updateB.CreateBr(OrigEntry);
+ }
+
+ bool run(Module &M) {
+ createOrderFileData(M);
+
+ int FuncId = 0;
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ generateCodeSequence(M, F, FuncId);
+ ++FuncId;
+ }
+
+ return true;
+ }
+
+}; // End of InstrOrderFile struct
+
+class InstrOrderFileLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ InstrOrderFileLegacyPass() : ModulePass(ID) {
+ initializeInstrOrderFileLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+};
+
+} // End anonymous namespace
+
+bool InstrOrderFileLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return InstrOrderFile().run(M);
+}
+
+PreservedAnalyses
+InstrOrderFilePass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (InstrOrderFile().run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+INITIALIZE_PASS_BEGIN(InstrOrderFileLegacyPass, "instrorderfile",
+ "Instrumentation for Order File", false, false)
+INITIALIZE_PASS_END(InstrOrderFileLegacyPass, "instrorderfile",
+ "Instrumentation for Order File", false, false)
+
+char InstrOrderFileLegacyPass::ID = 0;
+
+ModulePass *llvm::createInstrOrderFilePass() {
+ return new InstrOrderFileLegacyPass();
+}
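
For orientation, the IR that generateCodeSequence() builds in each instrumented function's entry block behaves like the C++ sketch below (a standalone illustration, not code from the LLVM sources). The buffer size, mask, array sizes and the orderFileEntryHook() name are assumptions made for the example; in the pass the real values come from INSTR_ORDER_FILE_BUFFER_SIZE, INSTR_ORDER_FILE_BUFFER_MASK and the globals created in createOrderFileData().

#include <atomic>
#include <cstdint>

// Illustrative stand-ins for the per-module globals the pass creates.
constexpr uint32_t kBufferSize = 1u << 17;  // power of two so masking wraps
constexpr uint32_t kBufferMask = kBufferSize - 1;

uint64_t OrderFileBuffer[kBufferSize];      // MD5 hashes in first-execution order
std::atomic<uint32_t> BufferIdx{0};         // next slot of the circular buffer
uint8_t BitMap[1024];                       // one byte per instrumented function

// Shape of the code inserted at each function entry; FuncId and FuncMD5 are
// constants baked in at instrumentation time.
inline void orderFileEntryHook(int FuncId, uint64_t FuncMD5) {
  uint8_t AlreadySeen = BitMap[FuncId];
  BitMap[FuncId] = 1;                       // mark unconditionally, as the IR does
  if (!AlreadySeen) {
    uint32_t Idx =
        BufferIdx.fetch_add(1, std::memory_order_seq_cst) & kBufferMask;
    OrderFileBuffer[Idx] = FuncMD5;         // record the function's MD5 hash once
  }
}
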
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 925c018135..9efc7d1ac5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1,266 +1,266 @@
-//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling.
-// It also builds the data structures and initialization code needed for
-// updating execution counts and emitting the profile at runtime.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <string>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "instrprof"
-
-namespace {
-
-cl::opt<bool> DoHashBasedCounterSplit(
- "hash-based-counter-split",
- cl::desc("Rename counter variable of a comdat function based on cfg hash"),
- cl::init(true));
-
-cl::opt<bool> RuntimeCounterRelocation(
- "runtime-counter-relocation",
- cl::desc("Enable relocating counters at runtime."),
- cl::init(false));
-
-cl::opt<bool> ValueProfileStaticAlloc(
- "vp-static-alloc",
- cl::desc("Do static counter allocation for value profiler"),
- cl::init(true));
-
-cl::opt<double> NumCountersPerValueSite(
- "vp-counters-per-site",
- cl::desc("The average number of profile counters allocated "
- "per value profiling site."),
- // a very small percentage of value sites have non-zero targets, e.g., 1/30.
- // a very small percentage of value sites have non-zero targets, e.g, 1/30.
- // For those sites with non-zero profile, the average number of targets
- // is usually smaller than 2.
- cl::init(1.0));
-
-cl::opt<bool> AtomicCounterUpdateAll(
- "instrprof-atomic-counter-update-all", cl::ZeroOrMore,
- cl::desc("Make all profile counter updates atomic (for testing only)"),
- cl::init(false));
-
-cl::opt<bool> AtomicCounterUpdatePromoted(
- "atomic-counter-update-promoted", cl::ZeroOrMore,
- cl::desc("Do counter update using atomic fetch add "
- " for promoted counters only"),
- cl::init(false));
-
-cl::opt<bool> AtomicFirstCounter(
- "atomic-first-counter", cl::ZeroOrMore,
- cl::desc("Use atomic fetch add for first counter in a function (usually "
- "the entry counter)"),
- cl::init(false));
-
-// If the option is not specified, the default behavior about whether
-// counter promotion is done depends on how the instrumentation lowering
-// pipeline is set up, i.e., the default value of true for this option
-// does not mean the promotion will be done by default. Explicitly
-// setting this option can override the default behavior.
-cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
- cl::desc("Do counter register promotion"),
- cl::init(false));
-cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
- cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
- cl::desc("Max number counter promotions per loop to avoid"
- " increasing register pressure too much"));
-
-// A debug option
-cl::opt<int>
- MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
- cl::desc("Max number of allowed counter promotions"));
-
-cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
- cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
- cl::desc("The max number of exiting blocks of a loop to allow "
- " speculative counter promotion"));
-
-cl::opt<bool> SpeculativeCounterPromotionToLoop(
- cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
- cl::desc("When the option is false, if the target block is in a loop, "
- "the promotion will be disallowed unless the promoted counter "
- " update can be further/iteratively promoted into an acyclic "
- " region."));
-
-cl::opt<bool> IterativeCounterPromotion(
- cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
- cl::desc("Allow counter promotion across the whole loop nest."));
-
+//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling.
+// It also builds the data structures and initialization code needed for
+// updating execution counts and emitting the profile at runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instrprof"
+
+namespace {
+
+cl::opt<bool> DoHashBasedCounterSplit(
+ "hash-based-counter-split",
+ cl::desc("Rename counter variable of a comdat function based on cfg hash"),
+ cl::init(true));
+
+cl::opt<bool> RuntimeCounterRelocation(
+ "runtime-counter-relocation",
+ cl::desc("Enable relocating counters at runtime."),
+ cl::init(false));
+
+cl::opt<bool> ValueProfileStaticAlloc(
+ "vp-static-alloc",
+ cl::desc("Do static counter allocation for value profiler"),
+ cl::init(true));
+
+cl::opt<double> NumCountersPerValueSite(
+ "vp-counters-per-site",
+ cl::desc("The average number of profile counters allocated "
+ "per value profiling site."),
+ // a very small percentage of value sites have non-zero targets, e.g., 1/30.
+ // a very small percentage of value sites have non-zero targets, e.g, 1/30.
+ // For those sites with non-zero profile, the average number of targets
+ // is usually smaller than 2.
+ cl::init(1.0));
+
+cl::opt<bool> AtomicCounterUpdateAll(
+ "instrprof-atomic-counter-update-all", cl::ZeroOrMore,
+ cl::desc("Make all profile counter updates atomic (for testing only)"),
+ cl::init(false));
+
+cl::opt<bool> AtomicCounterUpdatePromoted(
+ "atomic-counter-update-promoted", cl::ZeroOrMore,
+ cl::desc("Do counter update using atomic fetch add "
+ " for promoted counters only"),
+ cl::init(false));
+
+cl::opt<bool> AtomicFirstCounter(
+ "atomic-first-counter", cl::ZeroOrMore,
+ cl::desc("Use atomic fetch add for first counter in a function (usually "
+ "the entry counter)"),
+ cl::init(false));
+
+// If the option is not specified, the default behavior about whether
+// counter promotion is done depends on how the instrumentation lowering
+// pipeline is set up, i.e., the default value of true for this option
+// does not mean the promotion will be done by default. Explicitly
+// setting this option can override the default behavior.
+cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
+ cl::desc("Do counter register promotion"),
+ cl::init(false));
+cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
+ cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
+ cl::desc("Max number counter promotions per loop to avoid"
+ " increasing register pressure too much"));
+
+// A debug option
+cl::opt<int>
+ MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
+ cl::desc("Max number of allowed counter promotions"));
+
+cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
+ cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
+ cl::desc("The max number of exiting blocks of a loop to allow "
+ " speculative counter promotion"));
+
+cl::opt<bool> SpeculativeCounterPromotionToLoop(
+ cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
+ cl::desc("When the option is false, if the target block is in a loop, "
+ "the promotion will be disallowed unless the promoted counter "
+ " update can be further/iteratively promoted into an acyclic "
+ " region."));
+
+cl::opt<bool> IterativeCounterPromotion(
+ cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
+ cl::desc("Allow counter promotion across the whole loop nest."));
+
cl::opt<bool> SkipRetExitBlock(
cl::ZeroOrMore, "skip-ret-exit-block", cl::init(true),
cl::desc("Suppress counter promotion if exit blocks contain ret."));
-class InstrProfilingLegacyPass : public ModulePass {
- InstrProfiling InstrProf;
-
-public:
- static char ID;
-
- InstrProfilingLegacyPass() : ModulePass(ID) {}
- InstrProfilingLegacyPass(const InstrProfOptions &Options, bool IsCS = false)
- : ModulePass(ID), InstrProf(Options, IsCS) {
- initializeInstrProfilingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override {
- return "Frontend instrumentation-based coverage lowering";
- }
-
- bool runOnModule(Module &M) override {
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- return InstrProf.run(M, GetTLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-///
-/// A helper class to promote one counter RMW operation in the loop
-/// into a register update.
-///
-/// The RMW update for the counter will be sunk out of the loop after
-/// the transformation.
-///
-class PGOCounterPromoterHelper : public LoadAndStorePromoter {
-public:
- PGOCounterPromoterHelper(
- Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
- BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
- ArrayRef<Instruction *> InsertPts,
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
- LoopInfo &LI)
- : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
- InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
- assert(isa<LoadInst>(L));
- assert(isa<StoreInst>(S));
- SSA.AddAvailableValue(PH, Init);
- }
-
- void doExtraRewritesBeforeFinalDeletion() override {
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
- BasicBlock *ExitBlock = ExitBlocks[i];
- Instruction *InsertPos = InsertPts[i];
- // Get LiveIn value into the ExitBlock. If there are multiple
- // predecessors, the value is defined by a PHI node in this
- // block.
- Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
- Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
- Type *Ty = LiveInValue->getType();
- IRBuilder<> Builder(InsertPos);
- if (AtomicCounterUpdatePromoted)
- // Atomic updates can currently only be promoted across the current
- // loop, not the whole loop nest.
- Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
- AtomicOrdering::SequentiallyConsistent);
- else {
- LoadInst *OldVal = Builder.CreateLoad(Ty, Addr, "pgocount.promoted");
- auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
- auto *NewStore = Builder.CreateStore(NewVal, Addr);
-
- // Now update the parent loop's candidate list:
- if (IterativeCounterPromotion) {
- auto *TargetLoop = LI.getLoopFor(ExitBlock);
- if (TargetLoop)
- LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
- }
- }
- }
- }
-
-private:
- Instruction *Store;
- ArrayRef<BasicBlock *> ExitBlocks;
- ArrayRef<Instruction *> InsertPts;
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
- LoopInfo &LI;
-};
-
-/// A helper class to do register promotion for all profile counter
-/// updates in a loop.
-///
-class PGOCounterPromoter {
-public:
- PGOCounterPromoter(
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
- Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI)
- : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
- LI(LI), BFI(BFI) {
-
- // Skip collection of ExitBlocks and InsertPts for loops that will not be
- // able to have counters promoted.
- SmallVector<BasicBlock *, 8> LoopExitBlocks;
- SmallPtrSet<BasicBlock *, 8> BlockSet;
-
- L.getExitBlocks(LoopExitBlocks);
- if (!isPromotionPossible(&L, LoopExitBlocks))
- return;
-
- for (BasicBlock *ExitBlock : LoopExitBlocks) {
- if (BlockSet.insert(ExitBlock).second) {
- ExitBlocks.push_back(ExitBlock);
- InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
- }
- }
- }
-
- bool run(int64_t *NumPromoted) {
- // Skip 'infinite' loops:
- if (ExitBlocks.size() == 0)
- return false;
+class InstrProfilingLegacyPass : public ModulePass {
+ InstrProfiling InstrProf;
+
+public:
+ static char ID;
+
+ InstrProfilingLegacyPass() : ModulePass(ID) {}
+ InstrProfilingLegacyPass(const InstrProfOptions &Options, bool IsCS = false)
+ : ModulePass(ID), InstrProf(Options, IsCS) {
+ initializeInstrProfilingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Frontend instrumentation-based coverage lowering";
+ }
+
+ bool runOnModule(Module &M) override {
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ return InstrProf.run(M, GetTLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+///
+/// A helper class to promote one counter RMW operation in the loop
+/// into a register update.
+///
+/// The RMW update for the counter will be sunk out of the loop after
+/// the transformation.
+///
+class PGOCounterPromoterHelper : public LoadAndStorePromoter {
+public:
+ PGOCounterPromoterHelper(
+ Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
+ BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<Instruction *> InsertPts,
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ LoopInfo &LI)
+ : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
+ InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
+ assert(isa<LoadInst>(L));
+ assert(isa<StoreInst>(S));
+ SSA.AddAvailableValue(PH, Init);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() override {
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+ Instruction *InsertPos = InsertPts[i];
+ // Get LiveIn value into the ExitBlock. If there are multiple
+ // predecessors, the value is defined by a PHI node in this
+ // block.
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
+ Type *Ty = LiveInValue->getType();
+ IRBuilder<> Builder(InsertPos);
+ if (AtomicCounterUpdatePromoted)
+ // Atomic updates can currently only be promoted across the current
+ // loop, not the whole loop nest.
+ Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
+ AtomicOrdering::SequentiallyConsistent);
+ else {
+ LoadInst *OldVal = Builder.CreateLoad(Ty, Addr, "pgocount.promoted");
+ auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
+ auto *NewStore = Builder.CreateStore(NewVal, Addr);
+
+ // Now update the parent loop's candidate list:
+ if (IterativeCounterPromotion) {
+ auto *TargetLoop = LI.getLoopFor(ExitBlock);
+ if (TargetLoop)
+ LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
+ }
+ }
+ }
+ }
+
+private:
+ Instruction *Store;
+ ArrayRef<BasicBlock *> ExitBlocks;
+ ArrayRef<Instruction *> InsertPts;
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ LoopInfo &LI;
+};
+
+/// A helper class to do register promotion for all profile counter
+/// updates in a loop.
+///
+class PGOCounterPromoter {
+public:
+ PGOCounterPromoter(
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI)
+ : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
+ LI(LI), BFI(BFI) {
+
+ // Skip collection of ExitBlocks and InsertPts for loops that will not be
+ // able to have counters promoted.
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ SmallPtrSet<BasicBlock *, 8> BlockSet;
+
+ L.getExitBlocks(LoopExitBlocks);
+ if (!isPromotionPossible(&L, LoopExitBlocks))
+ return;
+
+ for (BasicBlock *ExitBlock : LoopExitBlocks) {
+ if (BlockSet.insert(ExitBlock).second) {
+ ExitBlocks.push_back(ExitBlock);
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ }
+ }
+ }
+
+ bool run(int64_t *NumPromoted) {
+ // Skip 'infinite' loops:
+ if (ExitBlocks.size() == 0)
+ return false;
// Skip if any of the ExitBlocks contains a ret instruction.
// This is to prevent dumping of incomplete profile -- if the
@@ -273,129 +273,129 @@ public:
return false;
}
- unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
- if (MaxProm == 0)
- return false;
-
- unsigned Promoted = 0;
- for (auto &Cand : LoopToCandidates[&L]) {
-
- SmallVector<PHINode *, 4> NewPHIs;
- SSAUpdater SSA(&NewPHIs);
- Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
-
- // If BFI is set, we will use it to guide the promotions.
- if (BFI) {
- auto *BB = Cand.first->getParent();
- auto InstrCount = BFI->getBlockProfileCount(BB);
- if (!InstrCount)
- continue;
- auto PreheaderCount = BFI->getBlockProfileCount(L.getLoopPreheader());
- // If the average loop trip count is not greater than 1.5, we skip
- // promotion.
- if (PreheaderCount &&
- (PreheaderCount.getValue() * 3) >= (InstrCount.getValue() * 2))
- continue;
- }
-
- PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
- L.getLoopPreheader(), ExitBlocks,
- InsertPts, LoopToCandidates, LI);
- Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
- Promoted++;
- if (Promoted >= MaxProm)
- break;
-
- (*NumPromoted)++;
- if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
- break;
- }
-
- LLVM_DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
- << L.getLoopDepth() << ")\n");
- return Promoted != 0;
- }
-
-private:
- bool allowSpeculativeCounterPromotion(Loop *LP) {
- SmallVector<BasicBlock *, 8> ExitingBlocks;
- L.getExitingBlocks(ExitingBlocks);
- // Not considered speculative.
- if (ExitingBlocks.size() == 1)
- return true;
- if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
- return false;
- return true;
- }
-
- // Check whether the loop satisfies the basic conditions needed to perform
- // Counter Promotions.
- bool isPromotionPossible(Loop *LP,
- const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) {
- // We can't insert into a catchswitch.
- if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
- return isa<CatchSwitchInst>(Exit->getTerminator());
- }))
- return false;
-
- if (!LP->hasDedicatedExits())
- return false;
-
- BasicBlock *PH = LP->getLoopPreheader();
- if (!PH)
- return false;
-
- return true;
- }
-
- // Returns the max number of Counter Promotions for LP.
- unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
- SmallVector<BasicBlock *, 8> LoopExitBlocks;
- LP->getExitBlocks(LoopExitBlocks);
- if (!isPromotionPossible(LP, LoopExitBlocks))
- return 0;
-
- SmallVector<BasicBlock *, 8> ExitingBlocks;
- LP->getExitingBlocks(ExitingBlocks);
-
- // If BFI is set, we do more aggressive promotions based on BFI.
- if (BFI)
- return (unsigned)-1;
-
- // Not considered speculative.
- if (ExitingBlocks.size() == 1)
- return MaxNumOfPromotionsPerLoop;
-
- if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
- return 0;
-
- // Whether the target block is in a loop does not matter:
- if (SpeculativeCounterPromotionToLoop)
- return MaxNumOfPromotionsPerLoop;
-
- // Now check the target block:
- unsigned MaxProm = MaxNumOfPromotionsPerLoop;
- for (auto *TargetBlock : LoopExitBlocks) {
- auto *TargetLoop = LI.getLoopFor(TargetBlock);
- if (!TargetLoop)
- continue;
- unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
- unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
- MaxProm =
- std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
- PendingCandsInTarget);
- }
- return MaxProm;
- }
-
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
- SmallVector<BasicBlock *, 8> ExitBlocks;
- SmallVector<Instruction *, 8> InsertPts;
- Loop &L;
- LoopInfo &LI;
- BlockFrequencyInfo *BFI;
-};
-
+ unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
+ if (MaxProm == 0)
+ return false;
+
+ unsigned Promoted = 0;
+ for (auto &Cand : LoopToCandidates[&L]) {
+
+ SmallVector<PHINode *, 4> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
+
+ // If BFI is set, we will use it to guide the promotions.
+ if (BFI) {
+ auto *BB = Cand.first->getParent();
+ auto InstrCount = BFI->getBlockProfileCount(BB);
+ if (!InstrCount)
+ continue;
+ auto PreheaderCount = BFI->getBlockProfileCount(L.getLoopPreheader());
+ // If the average loop trip count is not greater than 1.5, we skip
+ // promotion.
+ if (PreheaderCount &&
+ (PreheaderCount.getValue() * 3) >= (InstrCount.getValue() * 2))
+ continue;
+ }
+
+ PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
+ L.getLoopPreheader(), ExitBlocks,
+ InsertPts, LoopToCandidates, LI);
+ Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
+ Promoted++;
+ if (Promoted >= MaxProm)
+ break;
+
+ (*NumPromoted)++;
+ if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
+ << L.getLoopDepth() << ")\n");
+ return Promoted != 0;
+ }
+
+private:
+ bool allowSpeculativeCounterPromotion(Loop *LP) {
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ L.getExitingBlocks(ExitingBlocks);
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return true;
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return false;
+ return true;
+ }
+
+ // Check whether the loop satisfies the basic conditions needed to perform
+ // Counter Promotions.
+ bool isPromotionPossible(Loop *LP,
+ const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) {
+ // We can't insert into a catchswitch.
+ if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ }))
+ return false;
+
+ if (!LP->hasDedicatedExits())
+ return false;
+
+ BasicBlock *PH = LP->getLoopPreheader();
+ if (!PH)
+ return false;
+
+ return true;
+ }
+
+ // Returns the max number of Counter Promotions for LP.
+ unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ LP->getExitBlocks(LoopExitBlocks);
+ if (!isPromotionPossible(LP, LoopExitBlocks))
+ return 0;
+
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ LP->getExitingBlocks(ExitingBlocks);
+
+ // If BFI is set, we do more aggressive promotions based on BFI.
+ if (BFI)
+ return (unsigned)-1;
+
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return MaxNumOfPromotionsPerLoop;
+
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return 0;
+
+ // Whether the target block is in a loop does not matter:
+ if (SpeculativeCounterPromotionToLoop)
+ return MaxNumOfPromotionsPerLoop;
+
+ // Now check the target block:
+ unsigned MaxProm = MaxNumOfPromotionsPerLoop;
+ for (auto *TargetBlock : LoopExitBlocks) {
+ auto *TargetLoop = LI.getLoopFor(TargetBlock);
+ if (!TargetLoop)
+ continue;
+ unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
+ unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
+ MaxProm =
+ std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
+ PendingCandsInTarget);
+ }
+ return MaxProm;
+ }
+
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ SmallVector<Instruction *, 8> InsertPts;
+ Loop &L;
+ LoopInfo &LI;
+ BlockFrequencyInfo *BFI;
+};
+
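
One easy-to-miss detail in PGOCounterPromoter::run() above is the BFI-guided filter: when block frequency info is available, a counter is only promoted out of a loop whose average trip count exceeds 1.5, and the pass tests this without division as PreheaderCount * 3 >= InstrCount * 2 (skip). The short standalone program below is an editorial illustration with made-up counts, not part of the LLVM sources; it only spells out that arithmetic.

#include <cstdint>
#include <iostream>

// Mirrors the skip condition used in PGOCounterPromoter::run(): promotion is
// skipped when PreheaderCount * 3 >= BodyCount * 2, i.e. when the average
// trip count BodyCount / PreheaderCount is not greater than 1.5.
static bool skipPromotion(uint64_t PreheaderCount, uint64_t BodyCount) {
  return PreheaderCount * 3 >= BodyCount * 2;
}

int main() {
  std::cout << skipPromotion(100, 140) << "\n"; // 1: about 1.4 trips, skip
  std::cout << skipPromotion(100, 200) << "\n"; // 0: about 2.0 trips, promote
  return 0;
}
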
enum class ValueProfilingCallType {
// Individual values are tracked. Currently used for indirect call target
// profiling.
@@ -405,204 +405,204 @@ enum class ValueProfilingCallType {
MemOp
};
-} // end anonymous namespace
-
-PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- if (!run(M, GetTLI))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-char InstrProfilingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(
- InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
- InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false, false)
-
-ModulePass *
-llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
- bool IsCS) {
- return new InstrProfilingLegacyPass(Options, IsCS);
-}
-
-static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
- InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
- if (Inc)
- return Inc;
- return dyn_cast<InstrProfIncrementInst>(Instr);
-}
-
-bool InstrProfiling::lowerIntrinsics(Function *F) {
- bool MadeChange = false;
- PromotionCandidates.clear();
- for (BasicBlock &BB : *F) {
- for (auto I = BB.begin(), E = BB.end(); I != E;) {
- auto Instr = I++;
- InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
- if (Inc) {
- lowerIncrement(Inc);
- MadeChange = true;
- } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
- lowerValueProfileInst(Ind);
- MadeChange = true;
- }
- }
- }
-
- if (!MadeChange)
- return false;
-
- promoteCounterLoadStores(F);
- return true;
-}
-
-bool InstrProfiling::isRuntimeCounterRelocationEnabled() const {
- if (RuntimeCounterRelocation.getNumOccurrences() > 0)
- return RuntimeCounterRelocation;
-
- return TT.isOSFuchsia();
-}
-
-bool InstrProfiling::isCounterPromotionEnabled() const {
- if (DoCounterPromotion.getNumOccurrences() > 0)
- return DoCounterPromotion;
-
- return Options.DoCounterPromotion;
-}
-
-void InstrProfiling::promoteCounterLoadStores(Function *F) {
- if (!isCounterPromotionEnabled())
- return;
-
- DominatorTree DT(*F);
- LoopInfo LI(DT);
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates;
-
- std::unique_ptr<BlockFrequencyInfo> BFI;
- if (Options.UseBFIInPromotion) {
- std::unique_ptr<BranchProbabilityInfo> BPI;
- BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F)));
- BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI));
- }
-
- for (const auto &LoadStore : PromotionCandidates) {
- auto *CounterLoad = LoadStore.first;
- auto *CounterStore = LoadStore.second;
- BasicBlock *BB = CounterLoad->getParent();
- Loop *ParentLoop = LI.getLoopFor(BB);
- if (!ParentLoop)
- continue;
- LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore);
- }
-
- SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder();
-
- // Do a post-order traversal of the loops so that counter updates can be
- // iteratively hoisted outside the loop nest.
- for (auto *Loop : llvm::reverse(Loops)) {
- PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI, BFI.get());
- Promoter.run(&TotalCountersPromoted);
- }
-}
-
-/// Check if the module contains uses of any profiling intrinsics.
-static bool containsProfilingIntrinsics(Module &M) {
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
- if (!F->use_empty())
- return true;
- return false;
-}
-
-bool InstrProfiling::run(
- Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) {
- this->M = &M;
- this->GetTLI = std::move(GetTLI);
- NamesVar = nullptr;
- NamesSize = 0;
- ProfileDataMap.clear();
- UsedVars.clear();
- TT = Triple(M.getTargetTriple());
-
- // Emit the runtime hook even if no counters are present.
- bool MadeChange = emitRuntimeHook();
-
- // Improve compile time by avoiding linear scans when there is no work.
- GlobalVariable *CoverageNamesVar =
- M.getNamedGlobal(getCoverageUnusedNamesVarName());
- if (!containsProfilingIntrinsics(M) && !CoverageNamesVar)
- return MadeChange;
-
- // The number of value sites inside the instrumented function is not known
- // in advance. Count the instrumented target value sites here so the total
- // can be recorded as a field in the profile data variable.
- for (Function &F : M) {
- InstrProfIncrementInst *FirstProfIncInst = nullptr;
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(), E = BB.end(); I != E; I++)
- if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
- computeNumValueSiteCounts(Ind);
- else if (FirstProfIncInst == nullptr)
- FirstProfIncInst = dyn_cast<InstrProfIncrementInst>(I);
-
- // Value profiling intrinsic lowering requires per-function profile data
- // variable to be created first.
- if (FirstProfIncInst != nullptr)
- static_cast<void>(getOrCreateRegionCounters(FirstProfIncInst));
- }
-
- for (Function &F : M)
- MadeChange |= lowerIntrinsics(&F);
-
- if (CoverageNamesVar) {
- lowerCoverageData(CoverageNamesVar);
- MadeChange = true;
- }
-
- if (!MadeChange)
- return false;
-
- emitVNodes();
- emitNameData();
- emitRegistration();
- emitUses();
- emitInitialization();
- return true;
-}
-
+} // end anonymous namespace
+
+PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ if (!run(M, GetTLI))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+char InstrProfilingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
+
+ModulePass *
+llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
+ bool IsCS) {
+ return new InstrProfilingLegacyPass(Options, IsCS);
+}
+
+static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
+ InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
+ if (Inc)
+ return Inc;
+ return dyn_cast<InstrProfIncrementInst>(Instr);
+}
+
+bool InstrProfiling::lowerIntrinsics(Function *F) {
+ bool MadeChange = false;
+ PromotionCandidates.clear();
+ for (BasicBlock &BB : *F) {
+ for (auto I = BB.begin(), E = BB.end(); I != E;) {
+ auto Instr = I++;
+ InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
+ if (Inc) {
+ lowerIncrement(Inc);
+ MadeChange = true;
+ } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
+ lowerValueProfileInst(Ind);
+ MadeChange = true;
+ }
+ }
+ }
+
+ if (!MadeChange)
+ return false;
+
+ promoteCounterLoadStores(F);
+ return true;
+}
+
+bool InstrProfiling::isRuntimeCounterRelocationEnabled() const {
+ if (RuntimeCounterRelocation.getNumOccurrences() > 0)
+ return RuntimeCounterRelocation;
+
+ return TT.isOSFuchsia();
+}
+
+bool InstrProfiling::isCounterPromotionEnabled() const {
+ if (DoCounterPromotion.getNumOccurrences() > 0)
+ return DoCounterPromotion;
+
+ return Options.DoCounterPromotion;
+}
+
+void InstrProfiling::promoteCounterLoadStores(Function *F) {
+ if (!isCounterPromotionEnabled())
+ return;
+
+ DominatorTree DT(*F);
+ LoopInfo LI(DT);
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates;
+
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ if (Options.UseBFIInPromotion) {
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F)));
+ BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI));
+ }
+
+ for (const auto &LoadStore : PromotionCandidates) {
+ auto *CounterLoad = LoadStore.first;
+ auto *CounterStore = LoadStore.second;
+ BasicBlock *BB = CounterLoad->getParent();
+ Loop *ParentLoop = LI.getLoopFor(BB);
+ if (!ParentLoop)
+ continue;
+ LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore);
+ }
+
+ SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder();
+
+ // Do a post-order traversal of the loops so that counter updates can be
+ // iteratively hoisted outside the loop nest.
+ for (auto *Loop : llvm::reverse(Loops)) {
+ PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI, BFI.get());
+ Promoter.run(&TotalCountersPromoted);
+ }
+}
+
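For intuition, counter promotion replaces the per-iteration memory update with a register accumulation that is flushed once in the loop exit; handling loops innermost-first lets the flush itself be hoisted again by the enclosing loop. A plain C++ sketch of the effect (the function names and shapes are illustrative, not the actual generated IR):

#include <cstdint>

// Before promotion: every iteration does a load/add/store on the counter.
void hotLoopUnpromoted(uint64_t *Counter, int N) {
  for (int I = 0; I < N; ++I)
    *Counter += 1;
}

// After promotion: the update accumulates in a register and is flushed once
// in the loop exit; for nested loops the flush can be hoisted again.
void hotLoopPromoted(uint64_t *Counter, int N) {
  uint64_t Local = 0;
  for (int I = 0; I < N; ++I)
    Local += 1;
  *Counter += Local;
}
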
+/// Check if the module contains uses of any profiling intrinsics.
+static bool containsProfilingIntrinsics(Module &M) {
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
+ if (!F->use_empty())
+ return true;
+ return false;
+}
+
+bool InstrProfiling::run(
+ Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) {
+ this->M = &M;
+ this->GetTLI = std::move(GetTLI);
+ NamesVar = nullptr;
+ NamesSize = 0;
+ ProfileDataMap.clear();
+ UsedVars.clear();
+ TT = Triple(M.getTargetTriple());
+
+ // Emit the runtime hook even if no counters are present.
+ bool MadeChange = emitRuntimeHook();
+
+ // Improve compile time by avoiding linear scans when there is no work.
+ GlobalVariable *CoverageNamesVar =
+ M.getNamedGlobal(getCoverageUnusedNamesVarName());
+ if (!containsProfilingIntrinsics(M) && !CoverageNamesVar)
+ return MadeChange;
+
+ // The number of value sites inside the instrumented function is not known
+ // in advance. Count the instrumented target value sites here so the total
+ // can be recorded as a field in the profile data variable.
+ for (Function &F : M) {
+ InstrProfIncrementInst *FirstProfIncInst = nullptr;
+ for (BasicBlock &BB : F)
+ for (auto I = BB.begin(), E = BB.end(); I != E; I++)
+ if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
+ computeNumValueSiteCounts(Ind);
+ else if (FirstProfIncInst == nullptr)
+ FirstProfIncInst = dyn_cast<InstrProfIncrementInst>(I);
+
+ // Value profiling intrinsic lowering requires per-function profile data
+ // variable to be created first.
+ if (FirstProfIncInst != nullptr)
+ static_cast<void>(getOrCreateRegionCounters(FirstProfIncInst));
+ }
+
+ for (Function &F : M)
+ MadeChange |= lowerIntrinsics(&F);
+
+ if (CoverageNamesVar) {
+ lowerCoverageData(CoverageNamesVar);
+ MadeChange = true;
+ }
+
+ if (!MadeChange)
+ return false;
+
+ emitVNodes();
+ emitNameData();
+ emitRegistration();
+ emitUses();
+ emitInitialization();
+ return true;
+}
+
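In short, run() emits the runtime hook, sizes the per-function value-site arrays, lowers the increment and value-profile intrinsics, and then emits the name data, registration, and initialization machinery. A minimal sketch of invoking this lowering on an already-materialized module through the legacy pass manager (LLVM 12 headers assumed; the wrapper function name is made up):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Instrumentation.h"

using namespace llvm;

// Lower frontend-emitted llvm.instrprof.* intrinsics in M.
static void lowerProfilingIntrinsics(Module &M) {
  legacy::PassManager PM;
  InstrProfOptions Options; // default options: non-atomic counters, etc.
  PM.add(createInstrProfilingLegacyPass(Options, /*IsCS=*/false));
  PM.run(M);
}
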
static FunctionCallee getOrInsertValueProfilingCall(
Module &M, const TargetLibraryInfo &TLI,
ValueProfilingCallType CallType = ValueProfilingCallType::Default) {
- LLVMContext &Ctx = M.getContext();
- auto *ReturnTy = Type::getVoidTy(M.getContext());
-
- AttributeList AL;
- if (auto AK = TLI.getExtAttrForI32Param(false))
- AL = AL.addParamAttribute(M.getContext(), 2, AK);
-
+ LLVMContext &Ctx = M.getContext();
+ auto *ReturnTy = Type::getVoidTy(M.getContext());
+
+ AttributeList AL;
+ if (auto AK = TLI.getExtAttrForI32Param(false))
+ AL = AL.addParamAttribute(M.getContext(), 2, AK);
+
assert((CallType == ValueProfilingCallType::Default ||
CallType == ValueProfilingCallType::MemOp) &&
"Must be Default or MemOp");
Type *ParamTypes[] = {
-#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
-#include "llvm/ProfileData/InstrProfData.inc"
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
};
auto *ValueProfilingCallTy =
FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
@@ -610,501 +610,501 @@ static FunctionCallee getOrInsertValueProfilingCall(
? getInstrProfValueProfFuncName()
: getInstrProfValueProfMemOpFuncName();
return M.getOrInsertFunction(FuncName, ValueProfilingCallTy, AL);
-}
-
-void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
- GlobalVariable *Name = Ind->getName();
- uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
- uint64_t Index = Ind->getIndex()->getZExtValue();
- auto It = ProfileDataMap.find(Name);
- if (It == ProfileDataMap.end()) {
- PerFunctionProfileData PD;
- PD.NumValueSites[ValueKind] = Index + 1;
- ProfileDataMap[Name] = PD;
- } else if (It->second.NumValueSites[ValueKind] <= Index)
- It->second.NumValueSites[ValueKind] = Index + 1;
-}
-
-void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
- GlobalVariable *Name = Ind->getName();
- auto It = ProfileDataMap.find(Name);
- assert(It != ProfileDataMap.end() && It->second.DataVar &&
- "value profiling detected in function with no counter incerement");
-
- GlobalVariable *DataVar = It->second.DataVar;
- uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
- uint64_t Index = Ind->getIndex()->getZExtValue();
- for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
- Index += It->second.NumValueSites[Kind];
-
- IRBuilder<> Builder(Ind);
+}
+
+void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
+ GlobalVariable *Name = Ind->getName();
+ uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+ uint64_t Index = Ind->getIndex()->getZExtValue();
+ auto It = ProfileDataMap.find(Name);
+ if (It == ProfileDataMap.end()) {
+ PerFunctionProfileData PD;
+ PD.NumValueSites[ValueKind] = Index + 1;
+ ProfileDataMap[Name] = PD;
+ } else if (It->second.NumValueSites[ValueKind] <= Index)
+ It->second.NumValueSites[ValueKind] = Index + 1;
+}
+
+void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
+ GlobalVariable *Name = Ind->getName();
+ auto It = ProfileDataMap.find(Name);
+ assert(It != ProfileDataMap.end() && It->second.DataVar &&
+ "value profiling detected in function with no counter incerement");
+
+ GlobalVariable *DataVar = It->second.DataVar;
+ uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+ uint64_t Index = Ind->getIndex()->getZExtValue();
+ for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
+ Index += It->second.NumValueSites[Kind];
+
+ IRBuilder<> Builder(Ind);
bool IsMemOpSize = (Ind->getValueKind()->getZExtValue() ==
llvm::InstrProfValueKind::IPVK_MemOPSize);
- CallInst *Call = nullptr;
- auto *TLI = &GetTLI(*Ind->getFunction());
-
- // To support value profiling calls within Windows exception handlers, funclet
- // information contained within operand bundles needs to be copied over to
- // the library call. This is required for the IR to be processed by the
- // WinEHPrepare pass.
- SmallVector<OperandBundleDef, 1> OpBundles;
- Ind->getOperandBundlesAsDefs(OpBundles);
+ CallInst *Call = nullptr;
+ auto *TLI = &GetTLI(*Ind->getFunction());
+
+ // To support value profiling calls within Windows exception handlers, funclet
+ // information contained within operand bundles needs to be copied over to
+ // the library call. This is required for the IR to be processed by the
+ // WinEHPrepare pass.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Ind->getOperandBundlesAsDefs(OpBundles);
if (!IsMemOpSize) {
- Value *Args[3] = {Ind->getTargetValue(),
- Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
- Builder.getInt32(Index)};
- Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args,
- OpBundles);
- } else {
+ Value *Args[3] = {Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index)};
+ Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args,
+ OpBundles);
+ } else {
Value *Args[3] = {Ind->getTargetValue(),
Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
Builder.getInt32(Index)};
Call = Builder.CreateCall(
getOrInsertValueProfilingCall(*M, *TLI, ValueProfilingCallType::MemOp),
Args, OpBundles);
- }
- if (auto AK = TLI->getExtAttrForI32Param(false))
- Call->addParamAttr(2, AK);
- Ind->replaceAllUsesWith(Call);
- Ind->eraseFromParent();
-}
-
-void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
- GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
-
- IRBuilder<> Builder(Inc);
- uint64_t Index = Inc->getIndex()->getZExtValue();
- Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters->getValueType(),
- Counters, 0, Index);
-
- if (isRuntimeCounterRelocationEnabled()) {
- Type *Int64Ty = Type::getInt64Ty(M->getContext());
- Type *Int64PtrTy = Type::getInt64PtrTy(M->getContext());
- Function *Fn = Inc->getParent()->getParent();
- Instruction &I = Fn->getEntryBlock().front();
- LoadInst *LI = dyn_cast<LoadInst>(&I);
- if (!LI) {
- IRBuilder<> Builder(&I);
- Type *Int64Ty = Type::getInt64Ty(M->getContext());
- GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName());
- if (!Bias) {
- Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage,
- Constant::getNullValue(Int64Ty),
- getInstrProfCounterBiasVarName());
- Bias->setVisibility(GlobalVariable::HiddenVisibility);
- }
- LI = Builder.CreateLoad(Int64Ty, Bias);
- }
- auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), LI);
- Addr = Builder.CreateIntToPtr(Add, Int64PtrTy);
- }
-
- if (Options.Atomic || AtomicCounterUpdateAll ||
- (Index == 0 && AtomicFirstCounter)) {
- Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
- AtomicOrdering::Monotonic);
- } else {
- Value *IncStep = Inc->getStep();
- Value *Load = Builder.CreateLoad(IncStep->getType(), Addr, "pgocount");
- auto *Count = Builder.CreateAdd(Load, Inc->getStep());
- auto *Store = Builder.CreateStore(Count, Addr);
- if (isCounterPromotionEnabled())
- PromotionCandidates.emplace_back(cast<Instruction>(Load), Store);
- }
- Inc->eraseFromParent();
-}
-
-void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
- ConstantArray *Names =
- cast<ConstantArray>(CoverageNamesVar->getInitializer());
- for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
- Constant *NC = Names->getOperand(I);
- Value *V = NC->stripPointerCasts();
- assert(isa<GlobalVariable>(V) && "Missing reference to function name");
- GlobalVariable *Name = cast<GlobalVariable>(V);
-
- Name->setLinkage(GlobalValue::PrivateLinkage);
- ReferencedNames.push_back(Name);
- NC->dropAllReferences();
- }
- CoverageNamesVar->eraseFromParent();
-}
-
-/// Get the name of a profiling variable for a particular function.
-static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
- StringRef NamePrefix = getInstrProfNameVarPrefix();
- StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
- Function *F = Inc->getParent()->getParent();
- Module *M = F->getParent();
- if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) ||
- !canRenameComdatFunc(*F))
- return (Prefix + Name).str();
- uint64_t FuncHash = Inc->getHash()->getZExtValue();
- SmallVector<char, 24> HashPostfix;
- if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix)))
- return (Prefix + Name).str();
- return (Prefix + Name + "." + Twine(FuncHash)).str();
-}
-
-static inline bool shouldRecordFunctionAddr(Function *F) {
- // Check the linkage
- bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
- if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
- !HasAvailableExternallyLinkage)
- return true;
-
- // A function marked 'alwaysinline' with available_externally linkage can't
- // have its address taken. Doing so would create an undefined external ref to
- // the function, which would fail to link.
- if (HasAvailableExternallyLinkage &&
- F->hasFnAttribute(Attribute::AlwaysInline))
- return false;
-
- // Prohibit function address recording if the function is both internal and
- // COMDAT. This avoids the profile data variable referencing internal symbols
- // in COMDAT.
- if (F->hasLocalLinkage() && F->hasComdat())
- return false;
-
- // Check for uses of this function other than direct calls or invokes to it.
- // Inline virtual functions have linkonce_odr linkage. When a key method
- // exists, the vtable will only be emitted in the TU where the key method
- // is defined. In a TU where the vtable is not available, the function won't
- // be 'address-taken'. If its address is not recorded here, profile data
- // with a missing address may be picked by the linker, leading to missing
- // indirect call target info.
- return F->hasAddressTaken() || F->hasLinkOnceLinkage();
-}
-
-static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
- // Don't do this for Darwin. compiler-rt uses linker magic.
- if (TT.isOSDarwin())
- return false;
- // Use linker script magic to get data/cnts/name start/end.
- if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() ||
- TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() ||
- TT.isOSWindows())
- return false;
-
- return true;
-}
-
-GlobalVariable *
-InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
- GlobalVariable *NamePtr = Inc->getName();
- auto It = ProfileDataMap.find(NamePtr);
- PerFunctionProfileData PD;
- if (It != ProfileDataMap.end()) {
- if (It->second.RegionCounters)
- return It->second.RegionCounters;
- PD = It->second;
- }
-
- // Match the linkage and visibility of the name global. COFF supports using
- // comdats with internal symbols, so do that if we can.
- Function *Fn = Inc->getParent()->getParent();
- GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage();
- GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility();
- if (TT.isOSBinFormatCOFF()) {
- Linkage = GlobalValue::InternalLinkage;
- Visibility = GlobalValue::DefaultVisibility;
- }
-
- // Move the name variable to the right section. Place the counter and data
- // variables in a COMDAT group if the associated function is a COMDAT. This
- // makes sure that only one copy of the COMDAT function's counters is emitted
- // after linking. Keep in mind that this pass may run before the inliner, so
- // we need to create a new comdat group for the counters and profiling data.
- // If we used the comdat of the parent function, that would result in
- // relocations against discarded sections.
- bool NeedComdat = needsComdatForCounter(*Fn, *M);
- if (NeedComdat) {
- if (TT.isOSBinFormatCOFF()) {
- // For COFF, put the counters, data, and values each into their own
- // comdats. We can't use a group because the Visual C++ linker will
- // report duplicate symbol errors if there are multiple external symbols
- // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE.
- Linkage = GlobalValue::LinkOnceODRLinkage;
- Visibility = GlobalValue::HiddenVisibility;
- }
- }
+ }
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ Call->addParamAttr(2, AK);
+ Ind->replaceAllUsesWith(Call);
+ Ind->eraseFromParent();
+}
+
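For reference, the calls built by getOrInsertValueProfilingCall resolve to the profile runtime's value-profiling entry points. A hedged sketch of their shape (the names and signatures follow compiler-rt's profile runtime as understood here and should be treated as assumptions, not normative declarations):

#include <cstdint>

// Records one observed value (e.g. an indirect-call target) at value site
// CounterIndex of the function described by Data (a __profd_* pointer).
extern "C" void __llvm_profile_instrument_target(uint64_t TargetValue,
                                                 void *Data,
                                                 uint32_t CounterIndex);

// Variant used for memory-intrinsic size (memop) profiling.
extern "C" void __llvm_profile_instrument_memop(uint64_t TargetValue,
                                                void *Data,
                                                uint32_t CounterIndex);
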
+void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
+ GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
+
+ IRBuilder<> Builder(Inc);
+ uint64_t Index = Inc->getIndex()->getZExtValue();
+ Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters->getValueType(),
+ Counters, 0, Index);
+
+ if (isRuntimeCounterRelocationEnabled()) {
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ Type *Int64PtrTy = Type::getInt64PtrTy(M->getContext());
+ Function *Fn = Inc->getParent()->getParent();
+ Instruction &I = Fn->getEntryBlock().front();
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
+ if (!LI) {
+ IRBuilder<> Builder(&I);
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName());
+ if (!Bias) {
+ Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(Int64Ty),
+ getInstrProfCounterBiasVarName());
+ Bias->setVisibility(GlobalVariable::HiddenVisibility);
+ }
+ LI = Builder.CreateLoad(Int64Ty, Bias);
+ }
+ auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), LI);
+ Addr = Builder.CreateIntToPtr(Add, Int64PtrTy);
+ }
+
+ if (Options.Atomic || AtomicCounterUpdateAll ||
+ (Index == 0 && AtomicFirstCounter)) {
+ Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
+ AtomicOrdering::Monotonic);
+ } else {
+ Value *IncStep = Inc->getStep();
+ Value *Load = Builder.CreateLoad(IncStep->getType(), Addr, "pgocount");
+ auto *Count = Builder.CreateAdd(Load, Inc->getStep());
+ auto *Store = Builder.CreateStore(Count, Addr);
+ if (isCounterPromotionEnabled())
+ PromotionCandidates.emplace_back(cast<Instruction>(Load), Store);
+ }
+ Inc->eraseFromParent();
+}
+
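The lowered update has two shapes, and the bias indirection only appears when runtime counter relocation is enabled. A C++ sketch of the semantics (the array, the helper, and the zero-initialized bias are illustrative; in a real build the bias is provided by the profile runtime):

#include <cstdint>

uint64_t __profc_example[4];              // per-function counter array
uint64_t __llvm_profile_counter_bias = 0; // normally defined by the runtime

void incrementCounter(uint64_t Index, uint64_t Step, bool Relocate,
                      bool Atomic) {
  uint64_t *Addr = &__profc_example[Index];
  if (Relocate) // counter address = symbol address + bias from the entry block
    Addr = reinterpret_cast<uint64_t *>(
        reinterpret_cast<uintptr_t>(Addr) + __llvm_profile_counter_bias);
  if (Atomic)   // corresponds to: atomicrmw add ... monotonic
    __atomic_fetch_add(Addr, Step, __ATOMIC_RELAXED); // Clang/GCC builtin
  else          // corresponds to: load "pgocount"; add; store (promotable)
    *Addr += Step;
}
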
+void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
+ ConstantArray *Names =
+ cast<ConstantArray>(CoverageNamesVar->getInitializer());
+ for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
+ Constant *NC = Names->getOperand(I);
+ Value *V = NC->stripPointerCasts();
+ assert(isa<GlobalVariable>(V) && "Missing reference to function name");
+ GlobalVariable *Name = cast<GlobalVariable>(V);
+
+ Name->setLinkage(GlobalValue::PrivateLinkage);
+ ReferencedNames.push_back(Name);
+ NC->dropAllReferences();
+ }
+ CoverageNamesVar->eraseFromParent();
+}
+
+/// Get the name of a profiling variable for a particular function.
+static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
+ StringRef NamePrefix = getInstrProfNameVarPrefix();
+ StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
+ Function *F = Inc->getParent()->getParent();
+ Module *M = F->getParent();
+ if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) ||
+ !canRenameComdatFunc(*F))
+ return (Prefix + Name).str();
+ uint64_t FuncHash = Inc->getHash()->getZExtValue();
+ SmallVector<char, 24> HashPostfix;
+ if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix)))
+ return (Prefix + Name).str();
+ return (Prefix + Name + "." + Twine(FuncHash)).str();
+}
+
+static inline bool shouldRecordFunctionAddr(Function *F) {
+ // Check the linkage
+ bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
+ if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
+ !HasAvailableExternallyLinkage)
+ return true;
+
+ // A function marked 'alwaysinline' with available_externally linkage can't
+ // have its address taken. Doing so would create an undefined external ref to
+ // the function, which would fail to link.
+ if (HasAvailableExternallyLinkage &&
+ F->hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
+ // Prohibit function address recording if the function is both internal and
+ // COMDAT. This avoids the profile data variable referencing internal symbols
+ // in COMDAT.
+ if (F->hasLocalLinkage() && F->hasComdat())
+ return false;
+
+ // Check for uses of this function other than direct calls or invokes to it.
+ // Inline virtual functions have linkonce_odr linkage. When a key method
+ // exists, the vtable will only be emitted in the TU where the key method
+ // is defined. In a TU where the vtable is not available, the function won't
+ // be 'address-taken'. If its address is not recorded here, profile data
+ // with a missing address may be picked by the linker, leading to missing
+ // indirect call target info.
+ return F->hasAddressTaken() || F->hasLinkOnceLinkage();
+}
+
+static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
+ // Don't do this for Darwin. compiler-rt uses linker magic.
+ if (TT.isOSDarwin())
+ return false;
+ // Use linker script magic to get data/cnts/name start/end.
+ if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() ||
+ TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() ||
+ TT.isOSWindows())
+ return false;
+
+ return true;
+}
+
+GlobalVariable *
+InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
+ GlobalVariable *NamePtr = Inc->getName();
+ auto It = ProfileDataMap.find(NamePtr);
+ PerFunctionProfileData PD;
+ if (It != ProfileDataMap.end()) {
+ if (It->second.RegionCounters)
+ return It->second.RegionCounters;
+ PD = It->second;
+ }
+
+ // Match the linkage and visibility of the name global. COFF supports using
+ // comdats with internal symbols, so do that if we can.
+ Function *Fn = Inc->getParent()->getParent();
+ GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage();
+ GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility();
+ if (TT.isOSBinFormatCOFF()) {
+ Linkage = GlobalValue::InternalLinkage;
+ Visibility = GlobalValue::DefaultVisibility;
+ }
+
+ // Move the name variable to the right section. Place the counter and data
+ // variables in a COMDAT group if the associated function is a COMDAT. This
+ // makes sure that only one copy of the COMDAT function's counters is emitted
+ // after linking. Keep in mind that this pass may run before the inliner, so
+ // we need to create a new comdat group for the counters and profiling data.
+ // If we used the comdat of the parent function, that would result in
+ // relocations against discarded sections.
+ bool NeedComdat = needsComdatForCounter(*Fn, *M);
+ if (NeedComdat) {
+ if (TT.isOSBinFormatCOFF()) {
+ // For COFF, put the counters, data, and values each into their own
+ // comdats. We can't use a group because the Visual C++ linker will
+ // report duplicate symbol errors if there are multiple external symbols
+ // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE.
+ Linkage = GlobalValue::LinkOnceODRLinkage;
+ Visibility = GlobalValue::HiddenVisibility;
+ }
+ }
std::string DataVarName = getVarName(Inc, getInstrProfDataVarPrefix());
- auto MaybeSetComdat = [=](GlobalVariable *GV) {
- if (NeedComdat)
+ auto MaybeSetComdat = [=](GlobalVariable *GV) {
+ if (NeedComdat)
GV->setComdat(M->getOrInsertComdat(TT.isOSBinFormatCOFF() ? GV->getName()
: DataVarName));
- };
-
- uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
- LLVMContext &Ctx = M->getContext();
- ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
-
- // Create the counters variable.
- auto *CounterPtr =
- new GlobalVariable(*M, CounterTy, false, Linkage,
- Constant::getNullValue(CounterTy),
- getVarName(Inc, getInstrProfCountersVarPrefix()));
- CounterPtr->setVisibility(Visibility);
- CounterPtr->setSection(
- getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat()));
- CounterPtr->setAlignment(Align(8));
- MaybeSetComdat(CounterPtr);
- CounterPtr->setLinkage(Linkage);
-
- auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
- // Allocate statically the array of pointers to value profile nodes for
- // the current function.
- Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
- if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(TT)) {
- uint64_t NS = 0;
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- NS += PD.NumValueSites[Kind];
- if (NS) {
- ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS);
-
- auto *ValuesVar =
- new GlobalVariable(*M, ValuesTy, false, Linkage,
- Constant::getNullValue(ValuesTy),
- getVarName(Inc, getInstrProfValuesVarPrefix()));
- ValuesVar->setVisibility(Visibility);
- ValuesVar->setSection(
- getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
- ValuesVar->setAlignment(Align(8));
- MaybeSetComdat(ValuesVar);
- ValuesPtrExpr =
- ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
- }
- }
-
- // Create data variable.
- auto *Int16Ty = Type::getInt16Ty(Ctx);
- auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
- Type *DataTypes[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
-#include "llvm/ProfileData/InstrProfData.inc"
- };
- auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes));
-
- Constant *FunctionAddr = shouldRecordFunctionAddr(Fn)
- ? ConstantExpr::getBitCast(Fn, Int8PtrTy)
- : ConstantPointerNull::get(Int8PtrTy);
-
- Constant *Int16ArrayVals[IPVK_Last + 1];
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
-
- Constant *DataVals[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
-#include "llvm/ProfileData/InstrProfData.inc"
- };
+ };
+
+ uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+ LLVMContext &Ctx = M->getContext();
+ ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+
+ // Create the counters variable.
+ auto *CounterPtr =
+ new GlobalVariable(*M, CounterTy, false, Linkage,
+ Constant::getNullValue(CounterTy),
+ getVarName(Inc, getInstrProfCountersVarPrefix()));
+ CounterPtr->setVisibility(Visibility);
+ CounterPtr->setSection(
+ getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat()));
+ CounterPtr->setAlignment(Align(8));
+ MaybeSetComdat(CounterPtr);
+ CounterPtr->setLinkage(Linkage);
+
+ auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
+ // Allocate statically the array of pointers to value profile nodes for
+ // the current function.
+ Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
+ if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(TT)) {
+ uint64_t NS = 0;
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ NS += PD.NumValueSites[Kind];
+ if (NS) {
+ ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS);
+
+ auto *ValuesVar =
+ new GlobalVariable(*M, ValuesTy, false, Linkage,
+ Constant::getNullValue(ValuesTy),
+ getVarName(Inc, getInstrProfValuesVarPrefix()));
+ ValuesVar->setVisibility(Visibility);
+ ValuesVar->setSection(
+ getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
+ ValuesVar->setAlignment(Align(8));
+ MaybeSetComdat(ValuesVar);
+ ValuesPtrExpr =
+ ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
+ }
+ }
+
+ // Create data variable.
+ auto *Int16Ty = Type::getInt16Ty(Ctx);
+ auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
+ Type *DataTypes[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes));
+
+ Constant *FunctionAddr = shouldRecordFunctionAddr(Fn)
+ ? ConstantExpr::getBitCast(Fn, Int8PtrTy)
+ : ConstantPointerNull::get(Int8PtrTy);
+
+ Constant *Int16ArrayVals[IPVK_Last + 1];
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
+
+ Constant *DataVals[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
auto *Data =
new GlobalVariable(*M, DataTy, false, Linkage,
ConstantStruct::get(DataTy, DataVals), DataVarName);
- Data->setVisibility(Visibility);
- Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
- Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
- MaybeSetComdat(Data);
- Data->setLinkage(Linkage);
-
- PD.RegionCounters = CounterPtr;
- PD.DataVar = Data;
- ProfileDataMap[NamePtr] = PD;
-
- // Mark the data variable as used so that it isn't stripped out.
- UsedVars.push_back(Data);
- // Now that the linkage set by the FE has been passed to the data and counter
- // variables, reset Name variable's linkage and visibility to private so that
- // it can be removed later by the compiler.
- NamePtr->setLinkage(GlobalValue::PrivateLinkage);
- // Collect the referenced names to be used by emitNameData.
- ReferencedNames.push_back(NamePtr);
-
- return CounterPtr;
-}
-
-void InstrProfiling::emitVNodes() {
- if (!ValueProfileStaticAlloc)
- return;
-
- // For now only support this on platforms that do
- // not require runtime registration to discover
- // named section start/end.
- if (needsRuntimeRegistrationOfSectionRange(TT))
- return;
-
- size_t TotalNS = 0;
- for (auto &PD : ProfileDataMap) {
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- TotalNS += PD.second.NumValueSites[Kind];
- }
-
- if (!TotalNS)
- return;
-
- uint64_t NumCounters = TotalNS * NumCountersPerValueSite;
-// Heuristic for small programs with very few total value sites.
-// The default value of vp-counters-per-site is chosen based on
-// the observation that large apps usually have a low percentage
-// of value sites that actually have any profile data, and thus
-// the average number of counters per site is low. For small
-// apps with very few sites, this may not be true. Bump up the
-// number of counters in this case.
-#define INSTR_PROF_MIN_VAL_COUNTS 10
- if (NumCounters < INSTR_PROF_MIN_VAL_COUNTS)
- NumCounters = std::max(INSTR_PROF_MIN_VAL_COUNTS, (int)NumCounters * 2);
-
- auto &Ctx = M->getContext();
- Type *VNodeTypes[] = {
-#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Init) LLVMType,
-#include "llvm/ProfileData/InstrProfData.inc"
- };
- auto *VNodeTy = StructType::get(Ctx, makeArrayRef(VNodeTypes));
-
- ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
- auto *VNodesVar = new GlobalVariable(
- *M, VNodesTy, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
- VNodesVar->setSection(
- getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat()));
- UsedVars.push_back(VNodesVar);
-}
-
-void InstrProfiling::emitNameData() {
- std::string UncompressedData;
-
- if (ReferencedNames.empty())
- return;
-
- std::string CompressedNameStr;
- if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
- DoInstrProfNameCompression)) {
- report_fatal_error(toString(std::move(E)), false);
- }
-
- auto &Ctx = M->getContext();
- auto *NamesVal = ConstantDataArray::getString(
- Ctx, StringRef(CompressedNameStr), false);
- NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
- GlobalValue::PrivateLinkage, NamesVal,
- getInstrProfNamesVarName());
- NamesSize = CompressedNameStr.size();
- NamesVar->setSection(
- getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
- // On COFF, it's important to reduce the alignment down to 1 to prevent the
- // linker from inserting padding before the start of the names section or
- // between names entries.
- NamesVar->setAlignment(Align(1));
- UsedVars.push_back(NamesVar);
-
- for (auto *NamePtr : ReferencedNames)
- NamePtr->eraseFromParent();
-}
-
-void InstrProfiling::emitRegistration() {
- if (!needsRuntimeRegistrationOfSectionRange(TT))
- return;
-
- // Construct the function.
- auto *VoidTy = Type::getVoidTy(M->getContext());
- auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
- auto *Int64Ty = Type::getInt64Ty(M->getContext());
- auto *RegisterFTy = FunctionType::get(VoidTy, false);
- auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
- getInstrProfRegFuncsName(), M);
- RegisterF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- if (Options.NoRedZone)
- RegisterF->addFnAttr(Attribute::NoRedZone);
-
- auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);
- auto *RuntimeRegisterF =
- Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,
- getInstrProfRegFuncName(), M);
-
- IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
- for (Value *Data : UsedVars)
- if (Data != NamesVar && !isa<Function>(Data))
- IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
-
- if (NamesVar) {
- Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
- auto *NamesRegisterTy =
- FunctionType::get(VoidTy, makeArrayRef(ParamTypes), false);
- auto *NamesRegisterF =
- Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
- getInstrProfNamesRegFuncName(), M);
- IRB.CreateCall(NamesRegisterF, {IRB.CreateBitCast(NamesVar, VoidPtrTy),
- IRB.getInt64(NamesSize)});
- }
-
- IRB.CreateRetVoid();
-}
-
-bool InstrProfiling::emitRuntimeHook() {
- // We expect the linker to be invoked with -u<hook_var> flag for Linux or
- // Fuchsia, in which case there is no need to emit the user function.
- if (TT.isOSLinux() || TT.isOSFuchsia())
- return false;
-
- // If the module's provided its own runtime, we don't need to do anything.
- if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
- return false;
-
- // Declare an external variable that will pull in the runtime initialization.
- auto *Int32Ty = Type::getInt32Ty(M->getContext());
- auto *Var =
- new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
- nullptr, getInstrProfRuntimeHookVarName());
-
- // Make a function that uses it.
- auto *User = Function::Create(FunctionType::get(Int32Ty, false),
- GlobalValue::LinkOnceODRLinkage,
- getInstrProfRuntimeHookVarUseFuncName(), M);
- User->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- User->addFnAttr(Attribute::NoRedZone);
- User->setVisibility(GlobalValue::HiddenVisibility);
- if (TT.supportsCOMDAT())
- User->setComdat(M->getOrInsertComdat(User->getName()));
-
- IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
- auto *Load = IRB.CreateLoad(Int32Ty, Var);
- IRB.CreateRet(Load);
-
- // Mark the user variable as used so that it isn't stripped out.
- UsedVars.push_back(User);
- return true;
-}
-
-void InstrProfiling::emitUses() {
- if (!UsedVars.empty())
- appendToUsed(*M, UsedVars);
-}
-
-void InstrProfiling::emitInitialization() {
- // Create the ProfileFileName variable. Don't do this for the
- // context-sensitive instrumentation lowering: that lowering runs after
- // LTO/ThinLTO linking, and PGOInstrumentationGenCreateVar should
- // have already created the variable before LTO/ThinLTO linking.
- if (!IsCS)
- createProfileFileNameVar(*M, Options.InstrProfileOutput);
- Function *RegisterF = M->getFunction(getInstrProfRegFuncsName());
- if (!RegisterF)
- return;
-
- // Create the initialization function.
- auto *VoidTy = Type::getVoidTy(M->getContext());
- auto *F = Function::Create(FunctionType::get(VoidTy, false),
- GlobalValue::InternalLinkage,
- getInstrProfInitFuncName(), M);
- F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- F->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- F->addFnAttr(Attribute::NoRedZone);
-
- // Add the basic block and the necessary calls.
- IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
- IRB.CreateCall(RegisterF, {});
- IRB.CreateRetVoid();
-
- appendToGlobalCtors(*M, F, 0);
-}
+ Data->setVisibility(Visibility);
+ Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
+ Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
+ MaybeSetComdat(Data);
+ Data->setLinkage(Linkage);
+
+ PD.RegionCounters = CounterPtr;
+ PD.DataVar = Data;
+ ProfileDataMap[NamePtr] = PD;
+
+ // Mark the data variable as used so that it isn't stripped out.
+ UsedVars.push_back(Data);
+ // Now that the linkage set by the FE has been passed to the data and counter
+ // variables, reset Name variable's linkage and visibility to private so that
+ // it can be removed later by the compiler.
+ NamePtr->setLinkage(GlobalValue::PrivateLinkage);
+ // Collect the referenced names to be used by emitNameData.
+ ReferencedNames.push_back(NamePtr);
+
+ return CounterPtr;
+}
+
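The net effect per instrumented function is a small family of globals: a __profc_* counter array, a __profd_* descriptor that points at it, plus the shared name data emitted later. A rough C++ sketch of the shapes involved (the field list paraphrases the INSTR_PROF_DATA entries of InstrProfData.inc for this LLVM version; section names and example values are assumptions for illustration):

#include <cstdint>

// Counters for a hypothetical function 'foo' with 4 counters; the real
// variable is placed in the counters section (__llvm_prf_cnts on ELF).
uint64_t __profc_foo[4];

// Mirrors the __profd_* struct assembled from InstrProfData.inc.
struct ProfileData {
  uint64_t NameRef;            // MD5 hash of the function's PGO name
  uint64_t FuncHash;           // structural hash from the increment intrinsic
  const uint64_t *CounterPtr;  // -> __profc_foo
  const void *FunctionPointer; // recorded only if shouldRecordFunctionAddr()
  const void *Values;          // value-profile node pointers, if any
  uint32_t NumCounters;
  uint16_t NumValueSites[2];   // one entry per value kind
};

// The real __profd_foo is placed in the data section; values are made up.
ProfileData __profd_foo = {0x1234, 0x9abc, __profc_foo, nullptr, nullptr,
                           4, {0, 0}};
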
+void InstrProfiling::emitVNodes() {
+ if (!ValueProfileStaticAlloc)
+ return;
+
+ // For now only support this on platforms that do
+ // not require runtime registration to discover
+ // named section start/end.
+ if (needsRuntimeRegistrationOfSectionRange(TT))
+ return;
+
+ size_t TotalNS = 0;
+ for (auto &PD : ProfileDataMap) {
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ TotalNS += PD.second.NumValueSites[Kind];
+ }
+
+ if (!TotalNS)
+ return;
+
+ uint64_t NumCounters = TotalNS * NumCountersPerValueSite;
+// Heuristic for small programs with very few total value sites.
+// The default value of vp-counters-per-site is chosen based on
+// the observation that large apps usually have a low percentage
+// of value sites that actually have any profile data, and thus
+// the average number of counters per site is low. For small
+// apps with very few sites, this may not be true. Bump up the
+// number of counters in this case.
+#define INSTR_PROF_MIN_VAL_COUNTS 10
+ if (NumCounters < INSTR_PROF_MIN_VAL_COUNTS)
+ NumCounters = std::max(INSTR_PROF_MIN_VAL_COUNTS, (int)NumCounters * 2);
+
+ auto &Ctx = M->getContext();
+ Type *VNodeTypes[] = {
+#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *VNodeTy = StructType::get(Ctx, makeArrayRef(VNodeTypes));
+
+ ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
+ auto *VNodesVar = new GlobalVariable(
+ *M, VNodesTy, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
+ VNodesVar->setSection(
+ getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat()));
+ UsedVars.push_back(VNodesVar);
+}
+
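A worked example of the sizing heuristic above, treating the counters-per-site factor as an illustrative assumption rather than the actual vp-counters-per-site default:

#include <algorithm>
#include <cstdint>

// Mirrors the bump above: very small totals get at least
// INSTR_PROF_MIN_VAL_COUNTS (10) nodes, or double the computed count.
uint64_t sizeVNodeArray(uint64_t TotalValueSites, uint64_t CountersPerSite) {
  uint64_t NumCounters = TotalValueSites * CountersPerSite;
  if (NumCounters < 10)
    NumCounters = std::max<uint64_t>(10, NumCounters * 2);
  return NumCounters;
}
// e.g. sizeVNodeArray(1, 8)  == 16   (8 < 10, so max(10, 2 * 8))
//      sizeVNodeArray(50, 8) == 400  (unchanged)
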
+void InstrProfiling::emitNameData() {
+ std::string UncompressedData;
+
+ if (ReferencedNames.empty())
+ return;
+
+ std::string CompressedNameStr;
+ if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
+ DoInstrProfNameCompression)) {
+ report_fatal_error(toString(std::move(E)), false);
+ }
+
+ auto &Ctx = M->getContext();
+ auto *NamesVal = ConstantDataArray::getString(
+ Ctx, StringRef(CompressedNameStr), false);
+ NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
+ GlobalValue::PrivateLinkage, NamesVal,
+ getInstrProfNamesVarName());
+ NamesSize = CompressedNameStr.size();
+ NamesVar->setSection(
+ getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
+ // On COFF, it's important to reduce the alignment down to 1 to prevent the
+ // linker from inserting padding before the start of the names section or
+ // between names entries.
+ NamesVar->setAlignment(Align(1));
+ UsedVars.push_back(NamesVar);
+
+ for (auto *NamePtr : ReferencedNames)
+ NamePtr->eraseFromParent();
+}
+
+void InstrProfiling::emitRegistration() {
+ if (!needsRuntimeRegistrationOfSectionRange(TT))
+ return;
+
+ // Construct the function.
+ auto *VoidTy = Type::getVoidTy(M->getContext());
+ auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
+ auto *Int64Ty = Type::getInt64Ty(M->getContext());
+ auto *RegisterFTy = FunctionType::get(VoidTy, false);
+ auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
+ getInstrProfRegFuncsName(), M);
+ RegisterF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ if (Options.NoRedZone)
+ RegisterF->addFnAttr(Attribute::NoRedZone);
+
+ auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);
+ auto *RuntimeRegisterF =
+ Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,
+ getInstrProfRegFuncName(), M);
+
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
+ for (Value *Data : UsedVars)
+ if (Data != NamesVar && !isa<Function>(Data))
+ IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
+
+ if (NamesVar) {
+ Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
+ auto *NamesRegisterTy =
+ FunctionType::get(VoidTy, makeArrayRef(ParamTypes), false);
+ auto *NamesRegisterF =
+ Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
+ getInstrProfNamesRegFuncName(), M);
+ IRB.CreateCall(NamesRegisterF, {IRB.CreateBitCast(NamesVar, VoidPtrTy),
+ IRB.getInt64(NamesSize)});
+ }
+
+ IRB.CreateRetVoid();
+}
+
+bool InstrProfiling::emitRuntimeHook() {
+ // We expect the linker to be invoked with -u<hook_var> flag for Linux or
+ // Fuchsia, in which case there is no need to emit the user function.
+ if (TT.isOSLinux() || TT.isOSFuchsia())
+ return false;
+
+ // If the module's provided its own runtime, we don't need to do anything.
+ if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
+ return false;
+
+ // Declare an external variable that will pull in the runtime initialization.
+ auto *Int32Ty = Type::getInt32Ty(M->getContext());
+ auto *Var =
+ new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, getInstrProfRuntimeHookVarName());
+
+ // Make a function that uses it.
+ auto *User = Function::Create(FunctionType::get(Int32Ty, false),
+ GlobalValue::LinkOnceODRLinkage,
+ getInstrProfRuntimeHookVarUseFuncName(), M);
+ User->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ User->addFnAttr(Attribute::NoRedZone);
+ User->setVisibility(GlobalValue::HiddenVisibility);
+ if (TT.supportsCOMDAT())
+ User->setComdat(M->getOrInsertComdat(User->getName()));
+
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
+ auto *Load = IRB.CreateLoad(Int32Ty, Var);
+ IRB.CreateRet(Load);
+
+ // Mark the user variable as used so that it isn't stripped out.
+ UsedVars.push_back(User);
+ return true;
+}
+
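What the hook amounts to, sketched as source-level C++ (the two symbol names follow the __llvm_profile_runtime convention; the snippet illustrates the emitted IR rather than reproducing it):

// External variable defined by the profile runtime; referencing it forces the
// runtime and its initialization to be linked in.
extern "C" int __llvm_profile_runtime;

// Hidden linkonce_odr "user" function that keeps the reference alive on
// targets where the -u<hook_var> linker flag is not used.
extern "C" int __llvm_profile_runtime_user() { return __llvm_profile_runtime; }
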
+void InstrProfiling::emitUses() {
+ if (!UsedVars.empty())
+ appendToUsed(*M, UsedVars);
+}
+
+void InstrProfiling::emitInitialization() {
+ // Create the ProfileFileName variable. Don't do this for the
+ // context-sensitive instrumentation lowering: that lowering runs after
+ // LTO/ThinLTO linking, and PGOInstrumentationGenCreateVar should
+ // have already created the variable before LTO/ThinLTO linking.
+ if (!IsCS)
+ createProfileFileNameVar(*M, Options.InstrProfileOutput);
+ Function *RegisterF = M->getFunction(getInstrProfRegFuncsName());
+ if (!RegisterF)
+ return;
+
+ // Create the initialization function.
+ auto *VoidTy = Type::getVoidTy(M->getContext());
+ auto *F = Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::InternalLinkage,
+ getInstrProfInitFuncName(), M);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ F->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ F->addFnAttr(Attribute::NoRedZone);
+
+ // Add the basic block and the necessary calls.
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
+ IRB.CreateCall(RegisterF, {});
+ IRB.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp
index 08137cf836..cfdf3cad97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -1,131 +1,131 @@
-//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the common initialization infrastructure for the
-// Instrumentation library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm-c/Initialization.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassRegistry.h"
-
-using namespace llvm;
-
-/// Moves I before IP. Returns new insert point.
-static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) {
- // If I is IP, move the insert point down.
- if (I == IP) {
- ++IP;
- } else {
- // Otherwise, move I before IP and return IP.
- I->moveBefore(&*IP);
- }
- return IP;
-}
-
-/// Instrumentation passes often insert conditional checks into entry blocks.
-/// Call this function before splitting the entry block to move instructions
-/// that must remain in the entry block up before the split point. Static
-/// allocas and llvm.localescape calls, for example, must remain in the entry
-/// block.
-BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
- BasicBlock::iterator IP) {
- assert(&BB.getParent()->getEntryBlock() == &BB);
- for (auto I = IP, E = BB.end(); I != E; ++I) {
- bool KeepInEntry = false;
- if (auto *AI = dyn_cast<AllocaInst>(I)) {
- if (AI->isStaticAlloca())
- KeepInEntry = true;
- } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- if (II->getIntrinsicID() == llvm::Intrinsic::localescape)
- KeepInEntry = true;
- }
- if (KeepInEntry)
- IP = moveBeforeInsertPoint(I, IP);
- }
- return IP;
-}
-
-// Create a constant for Str so that we can pass it to the run-time lib.
-GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
- bool AllowMerging,
- const char *NamePrefix) {
- Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
- // We use private linkage for module-local strings. If they can be merged
- // with another one, we set the unnamed_addr attribute.
- GlobalVariable *GV =
- new GlobalVariable(M, StrConst->getType(), true,
- GlobalValue::PrivateLinkage, StrConst, NamePrefix);
- if (AllowMerging)
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(Align(1)); // Strings may not be merged w/o setting
- // alignment explicitly.
- return GV;
-}
-
-Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
- const std::string &ModuleId) {
- if (auto Comdat = F.getComdat()) return Comdat;
- assert(F.hasName());
- Module *M = F.getParent();
- std::string Name = std::string(F.getName());
-
- // Make a unique comdat name for internal linkage things on ELF. On COFF, the
- // name of the comdat group identifies the leader symbol of the comdat group.
- // The linkage of the leader symbol is considered during comdat resolution,
- // and internal symbols with the same name from different objects will not be
- // merged.
- if (T.isOSBinFormatELF() && F.hasLocalLinkage()) {
- if (ModuleId.empty())
- return nullptr;
- Name += ModuleId;
- }
-
- // Make a new comdat for the function. Use the "no duplicates" selection kind
- // for non-weak symbols if the object file format supports it.
- Comdat *C = M->getOrInsertComdat(Name);
- if (T.isOSBinFormatCOFF() && !F.isWeakForLinker())
- C->setSelectionKind(Comdat::NoDuplicates);
- F.setComdat(C);
- return C;
-}
-
-/// initializeInstrumentation - Initialize all passes in the Instrumentation
-/// library.
-void llvm::initializeInstrumentation(PassRegistry &Registry) {
- initializeAddressSanitizerLegacyPassPass(Registry);
- initializeModuleAddressSanitizerLegacyPassPass(Registry);
+//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// Instrumentation library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// Moves I before IP. Returns new insert point.
+static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) {
+ // If I is IP, move the insert point down.
+ if (I == IP) {
+ ++IP;
+ } else {
+ // Otherwise, move I before IP and return IP.
+ I->moveBefore(&*IP);
+ }
+ return IP;
+}
+
+/// Instrumentation passes often insert conditional checks into entry blocks.
+/// Call this function before splitting the entry block to move instructions
+/// that must remain in the entry block up before the split point. Static
+/// allocas and llvm.localescape calls, for example, must remain in the entry
+/// block.
+BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
+ BasicBlock::iterator IP) {
+ assert(&BB.getParent()->getEntryBlock() == &BB);
+ for (auto I = IP, E = BB.end(); I != E; ++I) {
+ bool KeepInEntry = false;
+ if (auto *AI = dyn_cast<AllocaInst>(I)) {
+ if (AI->isStaticAlloca())
+ KeepInEntry = true;
+ } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == llvm::Intrinsic::localescape)
+ KeepInEntry = true;
+ }
+ if (KeepInEntry)
+ IP = moveBeforeInsertPoint(I, IP);
+ }
+ return IP;
+}
+
+// Create a constant for Str so that we can pass it to the run-time lib.
+GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
+ bool AllowMerging,
+ const char *NamePrefix) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ // We use private linkage for module-local strings. If they can be merged
+ // with another one, we set the unnamed_addr attribute.
+ GlobalVariable *GV =
+ new GlobalVariable(M, StrConst->getType(), true,
+ GlobalValue::PrivateLinkage, StrConst, NamePrefix);
+ if (AllowMerging)
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(Align(1)); // Strings may not be merged w/o setting
+ // alignment explicitly.
+ return GV;
+}
+
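A hypothetical use of this helper from an instrumentation pass: embed a module-local string and hand an i8* to it to a runtime call. The wrapper name and prefix are made up, and the Module and IRBuilder are assumed to be set up by the caller:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Instrumentation.h"

using namespace llvm;

// Returns an i8* to a private, mergeable copy of S embedded in M.
static Value *emitStringPtr(Module &M, IRBuilder<> &IRB, StringRef S) {
  GlobalVariable *GV =
      createPrivateGlobalForString(M, S, /*AllowMerging=*/true, "__str");
  return IRB.CreatePointerCast(GV, IRB.getInt8PtrTy());
}
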
+Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
+ const std::string &ModuleId) {
+ if (auto Comdat = F.getComdat()) return Comdat;
+ assert(F.hasName());
+ Module *M = F.getParent();
+ std::string Name = std::string(F.getName());
+
+ // Make a unique comdat name for internal linkage things on ELF. On COFF, the
+ // name of the comdat group identifies the leader symbol of the comdat group.
+ // The linkage of the leader symbol is considered during comdat resolution,
+ // and internal symbols with the same name from different objects will not be
+ // merged.
+ if (T.isOSBinFormatELF() && F.hasLocalLinkage()) {
+ if (ModuleId.empty())
+ return nullptr;
+ Name += ModuleId;
+ }
+
+ // Make a new comdat for the function. Use the "no duplicates" selection kind
+ // for non-weak symbols if the object file format supports it.
+ Comdat *C = M->getOrInsertComdat(Name);
+ if (T.isOSBinFormatCOFF() && !F.isWeakForLinker())
+ C->setSelectionKind(Comdat::NoDuplicates);
+ F.setComdat(C);
+ return C;
+}
+
+/// initializeInstrumentation - Initialize all passes in the Instrumentation
+/// library.
+void llvm::initializeInstrumentation(PassRegistry &Registry) {
+ initializeAddressSanitizerLegacyPassPass(Registry);
+ initializeModuleAddressSanitizerLegacyPassPass(Registry);
initializeMemProfilerLegacyPassPass(Registry);
initializeModuleMemProfilerLegacyPassPass(Registry);
- initializeBoundsCheckingLegacyPassPass(Registry);
- initializeControlHeightReductionLegacyPassPass(Registry);
- initializeGCOVProfilerLegacyPassPass(Registry);
- initializePGOInstrumentationGenLegacyPassPass(Registry);
- initializePGOInstrumentationUseLegacyPassPass(Registry);
- initializePGOIndirectCallPromotionLegacyPassPass(Registry);
- initializePGOMemOPSizeOptLegacyPassPass(Registry);
- initializeCGProfileLegacyPassPass(Registry);
- initializeInstrOrderFileLegacyPassPass(Registry);
- initializeInstrProfilingLegacyPassPass(Registry);
- initializeMemorySanitizerLegacyPassPass(Registry);
- initializeHWAddressSanitizerLegacyPassPass(Registry);
- initializeThreadSanitizerLegacyPassPass(Registry);
- initializeModuleSanitizerCoverageLegacyPassPass(Registry);
+ initializeBoundsCheckingLegacyPassPass(Registry);
+ initializeControlHeightReductionLegacyPassPass(Registry);
+ initializeGCOVProfilerLegacyPassPass(Registry);
+ initializePGOInstrumentationGenLegacyPassPass(Registry);
+ initializePGOInstrumentationUseLegacyPassPass(Registry);
+ initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+ initializePGOMemOPSizeOptLegacyPassPass(Registry);
+ initializeCGProfileLegacyPassPass(Registry);
+ initializeInstrOrderFileLegacyPassPass(Registry);
+ initializeInstrProfilingLegacyPassPass(Registry);
+ initializeMemorySanitizerLegacyPassPass(Registry);
+ initializeHWAddressSanitizerLegacyPassPass(Registry);
+ initializeThreadSanitizerLegacyPassPass(Registry);
+ initializeModuleSanitizerCoverageLegacyPassPass(Registry);
initializeDataFlowSanitizerLegacyPassPass(Registry);
-}
-
-/// LLVMInitializeInstrumentation - C binding for
-/// initializeInstrumentation.
-void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
- initializeInstrumentation(*unwrap(R));
-}
+}
+
+/// LLVMInitializeInstrumentation - C binding for
+/// initializeInstrumentation.
+void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
+ initializeInstrumentation(*unwrap(R));
+}
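For context, a legacy pass-manager host would call initializeInstrumentation (or the C binding above) before scheduling any of these passes. The runLegacyMsan wrapper below is a hedged sketch of such a host, not code from this library.

// Illustrative sketch only.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"

using namespace llvm;

static void runLegacyMsan(Module &M) {
  // Make the instrumentation pass IDs known to the global registry.
  initializeInstrumentation(*PassRegistry::getPassRegistry());

  legacy::PassManager PM;
  PM.add(createMemorySanitizerLegacyPassPass(MemorySanitizerOptions()));
  PM.run(M);
}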
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4159f82db5..7a6874584d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1,1176 +1,1176 @@
-//===- MemorySanitizer.cpp - detector of uninitialized reads --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file is a part of MemorySanitizer, a detector of uninitialized
-/// reads.
-///
-/// The algorithm of the tool is similar to Memcheck
-/// (http://goo.gl/QKbem). We associate a few shadow bits with every
-/// byte of the application memory, poison the shadow of the malloc-ed
-/// or alloca-ed memory, load the shadow bits on every memory read,
-/// propagate the shadow bits through some of the arithmetic
-/// instructions (including MOV), store the shadow bits on every memory
-/// write, report a bug on some other instructions (e.g. JMP) if the
-/// associated shadow is poisoned.
-///
-/// But there are differences too. The first and the major one:
-/// compiler instrumentation instead of binary instrumentation. This
-/// gives us much better register allocation, possible compiler
-/// optimizations and a fast start-up. But this brings the major issue
-/// as well: msan needs to see all program events, including system
-/// calls and reads/writes in system libraries, so we either need to
-/// compile *everything* with msan or use a binary translation
-/// component (e.g. DynamoRIO) to instrument pre-built libraries.
-/// Another difference from Memcheck is that we use 8 shadow bits per
-/// byte of application memory and use a direct shadow mapping. This
-/// greatly simplifies the instrumentation code and avoids races on
-/// shadow updates (Memcheck is single-threaded so races are not a
-/// concern there. Memcheck uses 2 shadow bits per byte with a slow
-/// path storage that uses 8 bits per byte).
-///
-/// The default value of shadow is 0, which means "clean" (not poisoned).
-///
-/// Every module initializer should call __msan_init to ensure that the
-/// shadow memory is ready. On error, __msan_warning is called. Since
-/// parameters and return values may be passed via registers, we have a
-/// specialized thread-local shadow for return values
-/// (__msan_retval_tls) and parameters (__msan_param_tls).
-///
-/// Origin tracking.
-///
-/// MemorySanitizer can track origins (allocation points) of all uninitialized
-/// values. This behavior is controlled with a flag (msan-track-origins) and is
-/// disabled by default.
-///
-/// Origins are 4-byte values created and interpreted by the runtime library.
-/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
-/// of application memory. Propagation of origins is basically a bunch of
-/// "select" instructions that pick the origin of a dirty argument, if an
-/// instruction has one.
-///
-/// Every 4 aligned, consecutive bytes of application memory have one origin
-/// value associated with them. If these bytes contain uninitialized data
-/// coming from 2 different allocations, the last store wins. Because of this,
-/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
-/// practice.
-///
-/// Origins are meaningless for fully initialized values, so MemorySanitizer
-/// avoids storing origin to memory when a fully initialized value is stored.
-/// This way it avoids needless overwriting origin of the 4-byte region on
-/// a short (i.e. 1 byte) clean store, and it is also good for performance.
-///
-/// Atomic handling.
-///
-/// Ideally, every atomic store of application value should update the
-/// corresponding shadow location in an atomic way. Unfortunately, atomic store
-/// of two disjoint locations can not be done without severe slowdown.
-///
-/// Therefore, we implement an approximation that may err on the safe side.
-/// In this implementation, every atomically accessed location in the program
-/// may only change from (partially) uninitialized to fully initialized, but
-/// not the other way around. We load the shadow _after_ the application load,
-/// and we store the shadow _before_ the app store. Also, we always store clean
-/// shadow (if the application store is atomic). This way, if the store-load
-/// pair constitutes a happens-before arc, shadow store and load are correctly
-/// ordered such that the load will get either the value that was stored, or
-/// some later value (which is always clean).
-///
-/// This does not work very well with Compare-And-Swap (CAS) and
-/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
-/// must store the new shadow before the app operation, and load the shadow
-/// after the app operation. Computers don't work this way. Current
-/// implementation ignores the load aspect of CAS/RMW, always returning a clean
-/// value. It implements the store part as a simple atomic store by storing a
-/// clean shadow.
-///
-/// Instrumenting inline assembly.
-///
-/// For inline assembly code LLVM has little idea about which memory locations
-/// become initialized depending on the arguments. It can be possible to figure
-/// out which arguments are meant to point to inputs and outputs, but the
-/// actual semantics can be only visible at runtime. In the Linux kernel it's
-/// also possible that the arguments only indicate the offset for a base taken
-/// from a segment register, so it's dangerous to treat any asm() arguments as
-/// pointers. We take a conservative approach generating calls to
-/// __msan_instrument_asm_store(ptr, size)
-/// , which defers the memory unpoisoning to the runtime library.
-/// The latter can perform more complex address checks to figure out whether
-/// it's safe to touch the shadow memory.
-/// Like with atomic operations, we call __msan_instrument_asm_store() before
-/// the assembly call, so that changes to the shadow memory will be seen by
-/// other threads together with main memory initialization.
-///
-/// KernelMemorySanitizer (KMSAN) implementation.
-///
-/// The major differences between KMSAN and MSan instrumentation are:
-/// - KMSAN always tracks the origins and implies msan-keep-going=true;
-/// - KMSAN allocates shadow and origin memory for each page separately, so
-/// there are no explicit accesses to shadow and origin in the
-/// instrumentation.
-/// Shadow and origin values for a particular X-byte memory location
-/// (X=1,2,4,8) are accessed through pointers obtained via the
-/// __msan_metadata_ptr_for_load_X(ptr)
-/// __msan_metadata_ptr_for_store_X(ptr)
-/// functions. The corresponding functions check that the X-byte accesses
-///    are possible and return the pointers to shadow and origin memory.
-/// Arbitrary sized accesses are handled with:
-/// __msan_metadata_ptr_for_load_n(ptr, size)
-/// __msan_metadata_ptr_for_store_n(ptr, size);
-/// - TLS variables are stored in a single per-task struct. A call to a
-/// function __msan_get_context_state() returning a pointer to that struct
-/// is inserted into every instrumented function before the entry block;
-/// - __msan_warning() takes a 32-bit origin parameter;
-/// - local variables are poisoned with __msan_poison_alloca() upon function
-/// entry and unpoisoned with __msan_unpoison_alloca() before leaving the
-/// function;
-/// - the pass doesn't declare any global variables or add global constructors
-/// to the translation unit.
-///
-/// Also, KMSAN currently ignores uninitialized memory passed into inline asm
-/// calls, making sure we're on the safe side wrt. possible false positives.
-///
-/// KernelMemorySanitizer only supports X86_64 at the moment.
-///
-//
-// FIXME: This sanitizer does not yet handle scalable vectors
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- MemorySanitizer.cpp - detector of uninitialized reads --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instructions (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this brings the major issue
+/// as well: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+///
+/// Origin tracking.
+///
+/// MemorySanitizer can track origins (allocation points) of all uninitialized
+/// values. This behavior is controlled with a flag (msan-track-origins) and is
+/// disabled by default.
+///
+/// Origins are 4-byte values created and interpreted by the runtime library.
+/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
+/// of application memory. Propagation of origins is basically a bunch of
+/// "select" instructions that pick the origin of a dirty argument, if an
+/// instruction has one.
+///
+/// Every 4 aligned, consecutive bytes of application memory have one origin
+/// value associated with them. If these bytes contain uninitialized data
+/// coming from 2 different allocations, the last store wins. Because of this,
+/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
+/// practice.
+///
+/// Origins are meaningless for fully initialized values, so MemorySanitizer
+/// avoids storing origin to memory when a fully initialized value is stored.
+/// This way it avoids needless overwriting origin of the 4-byte region on
+/// a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, atomic store
+/// of two disjoint locations can not be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. Current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store by storing a
+/// clean shadow.
+///
+/// Instrumenting inline assembly.
+///
+/// For inline assembly code LLVM has little idea about which memory locations
+/// become initialized depending on the arguments. It can be possible to figure
+/// out which arguments are meant to point to inputs and outputs, but the
+/// actual semantics can be only visible at runtime. In the Linux kernel it's
+/// also possible that the arguments only indicate the offset for a base taken
+/// from a segment register, so it's dangerous to treat any asm() arguments as
+/// pointers. We take a conservative approach generating calls to
+/// __msan_instrument_asm_store(ptr, size)
+/// , which defers the memory unpoisoning to the runtime library.
+/// The latter can perform more complex address checks to figure out whether
+/// it's safe to touch the shadow memory.
+/// Like with atomic operations, we call __msan_instrument_asm_store() before
+/// the assembly call, so that changes to the shadow memory will be seen by
+/// other threads together with main memory initialization.
+///
+/// KernelMemorySanitizer (KMSAN) implementation.
+///
+/// The major differences between KMSAN and MSan instrumentation are:
+/// - KMSAN always tracks the origins and implies msan-keep-going=true;
+/// - KMSAN allocates shadow and origin memory for each page separately, so
+/// there are no explicit accesses to shadow and origin in the
+/// instrumentation.
+/// Shadow and origin values for a particular X-byte memory location
+/// (X=1,2,4,8) are accessed through pointers obtained via the
+/// __msan_metadata_ptr_for_load_X(ptr)
+/// __msan_metadata_ptr_for_store_X(ptr)
+/// functions. The corresponding functions check that the X-byte accesses
+///    are possible and return the pointers to shadow and origin memory.
+/// Arbitrary sized accesses are handled with:
+/// __msan_metadata_ptr_for_load_n(ptr, size)
+/// __msan_metadata_ptr_for_store_n(ptr, size);
+/// - TLS variables are stored in a single per-task struct. A call to a
+/// function __msan_get_context_state() returning a pointer to that struct
+/// is inserted into every instrumented function before the entry block;
+/// - __msan_warning() takes a 32-bit origin parameter;
+/// - local variables are poisoned with __msan_poison_alloca() upon function
+/// entry and unpoisoned with __msan_unpoison_alloca() before leaving the
+/// function;
+/// - the pass doesn't declare any global variables or add global constructors
+/// to the translation unit.
+///
+/// Also, KMSAN currently ignores uninitialized memory passed into inline asm
+/// calls, making sure we're on the safe side wrt. possible false positives.
+///
+/// KernelMemorySanitizer only supports X86_64 at the moment.
+///
+//
+// FIXME: This sanitizer does not yet handle scalable vectors
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <tuple>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "msan"
-
-static const unsigned kOriginSize = 4;
-static const Align kMinOriginAlignment = Align(4);
-static const Align kShadowTLSAlignment = Align(8);
-
-// These constants must be kept in sync with the ones in msan.h.
-static const unsigned kParamTLSSize = 800;
-static const unsigned kRetvalTLSSize = 800;
-
-// Access sizes are powers of two: 1, 2, 4, 8.
-static const size_t kNumberOfAccessSizes = 4;
-
-/// Track origins of uninitialized values.
-///
-/// Adds a section to MemorySanitizer report that points to the allocation
-/// (stack or heap) the uninitialized bits came from originally.
-static cl::opt<int> ClTrackOrigins("msan-track-origins",
- cl::desc("Track origins (allocation sites) of poisoned memory"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<bool> ClKeepGoing("msan-keep-going",
- cl::desc("keep going after reporting a UMR"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClPoisonStack("msan-poison-stack",
- cl::desc("poison uninitialized stack variables"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
- cl::desc("poison uninitialized stack variables with a call"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
- cl::desc("poison uninitialized stack variables with the given pattern"),
- cl::Hidden, cl::init(0xff));
-
-static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
- cl::desc("poison undef temps"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
- cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
- cl::desc("exact handling of relational integer ICmp"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClHandleLifetimeIntrinsics(
- "msan-handle-lifetime-intrinsics",
- cl::desc(
- "when possible, poison scoped variables at the beginning of the scope "
- "(slower, but more precise)"),
- cl::Hidden, cl::init(true));
-
-// When compiling the Linux kernel, we sometimes see false positives related to
-// MSan being unable to understand that inline assembly calls may initialize
-// local variables.
-// This flag makes the compiler conservatively unpoison every memory location
-// passed into an assembly call. Note that this may cause false positives.
-// Because it's impossible to figure out the array sizes, we can only unpoison
-// the first sizeof(type) bytes for each type* pointer.
-// The instrumentation is only enabled in KMSAN builds, and only if
-// -msan-handle-asm-conservative is on. This is done because we may want to
-// quickly disable assembly instrumentation when it breaks.
-static cl::opt<bool> ClHandleAsmConservative(
- "msan-handle-asm-conservative",
- cl::desc("conservative handling of inline assembly"), cl::Hidden,
- cl::init(true));
-
-// This flag controls whether we check the shadow of the address
-// operand of load or store. Such bugs are very rare, since load from
-// a garbage address typically results in SEGV, but still happen
-// (e.g. only lower bits of address are garbage, or the access happens
-// early at program startup where malloc-ed memory is more likely to
-// be zeroed). As of 2012-08-28 this flag adds 20% slowdown.
-static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
- cl::desc("report accesses through a pointer which has poisoned shadow"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClEagerChecks(
- "msan-eager-checks",
- cl::desc("check arguments and return values at function call boundaries"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
- cl::desc("print out instructions with default strict semantics"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<int> ClInstrumentationWithCallThreshold(
- "msan-instrumentation-with-call-threshold",
- cl::desc(
- "If the function being instrumented requires more than "
- "this number of checks and origin stores, use callbacks instead of "
- "inline checks (-1 means never use callbacks)."),
- cl::Hidden, cl::init(3500));
-
-static cl::opt<bool>
- ClEnableKmsan("msan-kernel",
- cl::desc("Enable KernelMemorySanitizer instrumentation"),
- cl::Hidden, cl::init(false));
-
-// This is an experiment to enable handling of cases where shadow is a non-zero
-// compile-time constant. For some unexplainable reason they were silently
-// ignored in the instrumentation.
-static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
- cl::desc("Insert checks for constant shadow values"),
- cl::Hidden, cl::init(false));
-
-// This is off by default because of a bug in gold:
-// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
-static cl::opt<bool> ClWithComdat("msan-with-comdat",
- cl::desc("Place MSan constructors in comdat sections"),
- cl::Hidden, cl::init(false));
-
-// These options allow specifying custom memory map parameters.
-// See MemoryMapParams for details.
-static cl::opt<uint64_t> ClAndMask("msan-and-mask",
- cl::desc("Define custom MSan AndMask"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t> ClXorMask("msan-xor-mask",
- cl::desc("Define custom MSan XorMask"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t> ClShadowBase("msan-shadow-base",
- cl::desc("Define custom MSan ShadowBase"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t> ClOriginBase("msan-origin-base",
- cl::desc("Define custom MSan OriginBase"),
- cl::Hidden, cl::init(0));
-
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msan"
+
+static const unsigned kOriginSize = 4;
+static const Align kMinOriginAlignment = Align(4);
+static const Align kShadowTLSAlignment = Align(8);
+
+// These constants must be kept in sync with the ones in msan.h.
+static const unsigned kParamTLSSize = 800;
+static const unsigned kRetvalTLSSize = 800;
+
+// Access sizes are powers of two: 1, 2, 4, 8.
+static const size_t kNumberOfAccessSizes = 4;
+
+/// Track origins of uninitialized values.
+///
+/// Adds a section to MemorySanitizer report that points to the allocation
+/// (stack or heap) the uninitialized bits came from originally.
+static cl::opt<int> ClTrackOrigins("msan-track-origins",
+ cl::desc("Track origins (allocation sites) of poisoned memory"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+ cl::desc("keep going after reporting a UMR"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+ cl::desc("poison uninitialized stack variables"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+ cl::desc("poison uninitialized stack variables with a call"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+ cl::desc("poison uninitialized stack variables with the given pattern"),
+ cl::Hidden, cl::init(0xff));
+
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+ cl::desc("poison undef temps"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+ cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
+ cl::desc("exact handling of relational integer ICmp"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClHandleLifetimeIntrinsics(
+ "msan-handle-lifetime-intrinsics",
+ cl::desc(
+ "when possible, poison scoped variables at the beginning of the scope "
+ "(slower, but more precise)"),
+ cl::Hidden, cl::init(true));
+
+// When compiling the Linux kernel, we sometimes see false positives related to
+// MSan being unable to understand that inline assembly calls may initialize
+// local variables.
+// This flag makes the compiler conservatively unpoison every memory location
+// passed into an assembly call. Note that this may cause false positives.
+// Because it's impossible to figure out the array sizes, we can only unpoison
+// the first sizeof(type) bytes for each type* pointer.
+// The instrumentation is only enabled in KMSAN builds, and only if
+// -msan-handle-asm-conservative is on. This is done because we may want to
+// quickly disable assembly instrumentation when it breaks.
+static cl::opt<bool> ClHandleAsmConservative(
+ "msan-handle-asm-conservative",
+ cl::desc("conservative handling of inline assembly"), cl::Hidden,
+ cl::init(true));
+
+// This flag controls whether we check the shadow of the address
+// operand of load or store. Such bugs are very rare, since load from
+// a garbage address typically results in SEGV, but still happen
+// (e.g. only lower bits of address are garbage, or the access happens
+// early at program startup where malloc-ed memory is more likely to
+// be zeroed). As of 2012-08-28 this flag adds 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
+ cl::desc("report accesses through a pointer which has poisoned shadow"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClEagerChecks(
+ "msan-eager-checks",
+ cl::desc("check arguments and return values at function call boundaries"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
+ cl::desc("print out instructions with default strict semantics"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClInstrumentationWithCallThreshold(
+ "msan-instrumentation-with-call-threshold",
+ cl::desc(
+ "If the function being instrumented requires more than "
+ "this number of checks and origin stores, use callbacks instead of "
+ "inline checks (-1 means never use callbacks)."),
+ cl::Hidden, cl::init(3500));
+
+static cl::opt<bool>
+ ClEnableKmsan("msan-kernel",
+ cl::desc("Enable KernelMemorySanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+// This is an experiment to enable handling of cases where shadow is a non-zero
+// compile-time constant. For some unexplainable reason they were silently
+// ignored in the instrumentation.
+static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
+ cl::desc("Insert checks for constant shadow values"),
+ cl::Hidden, cl::init(false));
+
+// This is off by default because of a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool> ClWithComdat("msan-with-comdat",
+ cl::desc("Place MSan constructors in comdat sections"),
+ cl::Hidden, cl::init(false));
+
+// These options allow specifying custom memory map parameters.
+// See MemoryMapParams for details.
+static cl::opt<uint64_t> ClAndMask("msan-and-mask",
+ cl::desc("Define custom MSan AndMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClXorMask("msan-xor-mask",
+ cl::desc("Define custom MSan XorMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClShadowBase("msan-shadow-base",
+ cl::desc("Define custom MSan ShadowBase"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClOriginBase("msan-origin-base",
+ cl::desc("Define custom MSan OriginBase"),
+ cl::Hidden, cl::init(0));
+
const char kMsanModuleCtorName[] = "msan.module_ctor";
const char kMsanInitName[] = "__msan_init";
-
-namespace {
-
-// Memory map parameters used in application-to-shadow address calculation.
-// Offset = (Addr & ~AndMask) ^ XorMask
-// Shadow = ShadowBase + Offset
-// Origin = OriginBase + Offset
-struct MemoryMapParams {
- uint64_t AndMask;
- uint64_t XorMask;
- uint64_t ShadowBase;
- uint64_t OriginBase;
-};
-
-struct PlatformMemoryMapParams {
- const MemoryMapParams *bits32;
- const MemoryMapParams *bits64;
-};
-
-} // end anonymous namespace
-
-// i386 Linux
-static const MemoryMapParams Linux_I386_MemoryMapParams = {
- 0x000080000000, // AndMask
- 0, // XorMask (not used)
- 0, // ShadowBase (not used)
- 0x000040000000, // OriginBase
-};
-
-// x86_64 Linux
-static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
-#ifdef MSAN_LINUX_X86_64_OLD_MAPPING
- 0x400000000000, // AndMask
- 0, // XorMask (not used)
- 0, // ShadowBase (not used)
- 0x200000000000, // OriginBase
-#else
- 0, // AndMask (not used)
- 0x500000000000, // XorMask
- 0, // ShadowBase (not used)
- 0x100000000000, // OriginBase
-#endif
-};
-
-// mips64 Linux
-static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
- 0, // AndMask (not used)
- 0x008000000000, // XorMask
- 0, // ShadowBase (not used)
- 0x002000000000, // OriginBase
-};
-
-// ppc64 Linux
-static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
- 0xE00000000000, // AndMask
- 0x100000000000, // XorMask
- 0x080000000000, // ShadowBase
- 0x1C0000000000, // OriginBase
-};
-
-// s390x Linux
-static const MemoryMapParams Linux_S390X_MemoryMapParams = {
- 0xC00000000000, // AndMask
- 0, // XorMask (not used)
- 0x080000000000, // ShadowBase
- 0x1C0000000000, // OriginBase
-};
-
-// aarch64 Linux
-static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
- 0, // AndMask (not used)
- 0x06000000000, // XorMask
- 0, // ShadowBase (not used)
- 0x01000000000, // OriginBase
-};
-
-// i386 FreeBSD
-static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
- 0x000180000000, // AndMask
- 0x000040000000, // XorMask
- 0x000020000000, // ShadowBase
- 0x000700000000, // OriginBase
-};
-
-// x86_64 FreeBSD
-static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
- 0xc00000000000, // AndMask
- 0x200000000000, // XorMask
- 0x100000000000, // ShadowBase
- 0x380000000000, // OriginBase
-};
-
-// x86_64 NetBSD
-static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = {
- 0, // AndMask
- 0x500000000000, // XorMask
- 0, // ShadowBase
- 0x100000000000, // OriginBase
-};
-
-static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
- &Linux_I386_MemoryMapParams,
- &Linux_X86_64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
- nullptr,
- &Linux_MIPS64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
- nullptr,
- &Linux_PowerPC64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_S390_MemoryMapParams = {
- nullptr,
- &Linux_S390X_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
- nullptr,
- &Linux_AArch64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
- &FreeBSD_I386_MemoryMapParams,
- &FreeBSD_X86_64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = {
- nullptr,
- &NetBSD_X86_64_MemoryMapParams,
-};
-
-namespace {
-
-/// Instrument functions of a module to detect uninitialized reads.
-///
-/// Instantiating MemorySanitizer inserts the msan runtime library API function
-/// declarations into the module if they don't exist already. Instantiating
-/// ensures the __msan_init function is in the list of global constructors for
-/// the module.
-class MemorySanitizer {
-public:
- MemorySanitizer(Module &M, MemorySanitizerOptions Options)
- : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins),
- Recover(Options.Recover) {
- initializeModule(M);
- }
-
- // MSan cannot be moved or copied because of MapParams.
- MemorySanitizer(MemorySanitizer &&) = delete;
- MemorySanitizer &operator=(MemorySanitizer &&) = delete;
- MemorySanitizer(const MemorySanitizer &) = delete;
- MemorySanitizer &operator=(const MemorySanitizer &) = delete;
-
- bool sanitizeFunction(Function &F, TargetLibraryInfo &TLI);
-
-private:
- friend struct MemorySanitizerVisitor;
- friend struct VarArgAMD64Helper;
- friend struct VarArgMIPS64Helper;
- friend struct VarArgAArch64Helper;
- friend struct VarArgPowerPC64Helper;
- friend struct VarArgSystemZHelper;
-
- void initializeModule(Module &M);
- void initializeCallbacks(Module &M);
- void createKernelApi(Module &M);
- void createUserspaceApi(Module &M);
-
- /// True if we're compiling the Linux kernel.
- bool CompileKernel;
- /// Track origins (allocation points) of uninitialized values.
- int TrackOrigins;
- bool Recover;
-
- LLVMContext *C;
- Type *IntptrTy;
- Type *OriginTy;
-
- // XxxTLS variables represent the per-thread state in MSan and per-task state
- // in KMSAN.
- // For the userspace these point to thread-local globals. In the kernel land
- // they point to the members of a per-task struct obtained via a call to
- // __msan_get_context_state().
-
- /// Thread-local shadow storage for function parameters.
- Value *ParamTLS;
-
- /// Thread-local origin storage for function parameters.
- Value *ParamOriginTLS;
-
- /// Thread-local shadow storage for function return value.
- Value *RetvalTLS;
-
- /// Thread-local origin storage for function return value.
- Value *RetvalOriginTLS;
-
- /// Thread-local shadow storage for in-register va_arg function
- /// parameters (x86_64-specific).
- Value *VAArgTLS;
-
- /// Thread-local shadow storage for in-register va_arg function
- /// parameters (x86_64-specific).
- Value *VAArgOriginTLS;
-
- /// Thread-local shadow storage for va_arg overflow area
- /// (x86_64-specific).
- Value *VAArgOverflowSizeTLS;
-
- /// Are the instrumentation callbacks set up?
- bool CallbacksInitialized = false;
-
- /// The run-time callback to print a warning.
- FunctionCallee WarningFn;
-
- // These arrays are indexed by log2(AccessSize).
- FunctionCallee MaybeWarningFn[kNumberOfAccessSizes];
- FunctionCallee MaybeStoreOriginFn[kNumberOfAccessSizes];
-
- /// Run-time helper that generates a new origin value for a stack
- /// allocation.
- FunctionCallee MsanSetAllocaOrigin4Fn;
-
- /// Run-time helper that poisons stack on function entry.
- FunctionCallee MsanPoisonStackFn;
-
- /// Run-time helper that records a store (or any event) of an
- /// uninitialized value and returns an updated origin id encoding this info.
- FunctionCallee MsanChainOriginFn;
-
+
+namespace {
+
+// Memory map parameters used in application-to-shadow address calculation.
+// Offset = (Addr & ~AndMask) ^ XorMask
+// Shadow = ShadowBase + Offset
+// Origin = OriginBase + Offset
+struct MemoryMapParams {
+ uint64_t AndMask;
+ uint64_t XorMask;
+ uint64_t ShadowBase;
+ uint64_t OriginBase;
+};
+
+struct PlatformMemoryMapParams {
+ const MemoryMapParams *bits32;
+ const MemoryMapParams *bits64;
+};
+
+} // end anonymous namespace
+
+// i386 Linux
+static const MemoryMapParams Linux_I386_MemoryMapParams = {
+ 0x000080000000, // AndMask
+ 0, // XorMask (not used)
+ 0, // ShadowBase (not used)
+ 0x000040000000, // OriginBase
+};
+
+// x86_64 Linux
+static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
+#ifdef MSAN_LINUX_X86_64_OLD_MAPPING
+ 0x400000000000, // AndMask
+ 0, // XorMask (not used)
+ 0, // ShadowBase (not used)
+ 0x200000000000, // OriginBase
+#else
+ 0, // AndMask (not used)
+ 0x500000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x100000000000, // OriginBase
+#endif
+};
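To make the mapping above concrete: "Offset = (Addr & ~AndMask) ^ XorMask; Shadow = ShadowBase + Offset; Origin = OriginBase + Offset" can be written as plain arithmetic with the default x86_64 Linux parameters. The pass emits this computation as IR; the standalone helper below exists only to illustrate the address math.

// Illustrative sketch only.
#include <cstdint>

struct ExampleMapping {
  uint64_t AndMask, XorMask, ShadowBase, OriginBase;
};

// Default (non-MSAN_LINUX_X86_64_OLD_MAPPING) x86_64 Linux parameters.
static const ExampleMapping kLinuxX8664 = {0x0, 0x500000000000, 0x0,
                                           0x100000000000};

static uint64_t shadowFor(uint64_t Addr, const ExampleMapping &P) {
  uint64_t Offset = (Addr & ~P.AndMask) ^ P.XorMask;
  return P.ShadowBase + Offset;
}

static uint64_t originFor(uint64_t Addr, const ExampleMapping &P) {
  uint64_t Offset = (Addr & ~P.AndMask) ^ P.XorMask;
  return P.OriginBase + Offset;
}

// Example: application address 0x7fff00001234 maps to shadow
// 0x2fff00001234 and origin 0x3fff00001234 with these parameters.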
+
+// mips64 Linux
+static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
+ 0, // AndMask (not used)
+ 0x008000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x002000000000, // OriginBase
+};
+
+// ppc64 Linux
+static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
+ 0xE00000000000, // AndMask
+ 0x100000000000, // XorMask
+ 0x080000000000, // ShadowBase
+ 0x1C0000000000, // OriginBase
+};
+
+// s390x Linux
+static const MemoryMapParams Linux_S390X_MemoryMapParams = {
+ 0xC00000000000, // AndMask
+ 0, // XorMask (not used)
+ 0x080000000000, // ShadowBase
+ 0x1C0000000000, // OriginBase
+};
+
+// aarch64 Linux
+static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
+ 0, // AndMask (not used)
+ 0x06000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x01000000000, // OriginBase
+};
+
+// i386 FreeBSD
+static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
+ 0x000180000000, // AndMask
+ 0x000040000000, // XorMask
+ 0x000020000000, // ShadowBase
+ 0x000700000000, // OriginBase
+};
+
+// x86_64 FreeBSD
+static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
+ 0xc00000000000, // AndMask
+ 0x200000000000, // XorMask
+ 0x100000000000, // ShadowBase
+ 0x380000000000, // OriginBase
+};
+
+// x86_64 NetBSD
+static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = {
+ 0, // AndMask
+ 0x500000000000, // XorMask
+ 0, // ShadowBase
+ 0x100000000000, // OriginBase
+};
+
+static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
+ &Linux_I386_MemoryMapParams,
+ &Linux_X86_64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
+ nullptr,
+ &Linux_MIPS64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
+ nullptr,
+ &Linux_PowerPC64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_S390_MemoryMapParams = {
+ nullptr,
+ &Linux_S390X_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
+ nullptr,
+ &Linux_AArch64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
+ &FreeBSD_I386_MemoryMapParams,
+ &FreeBSD_X86_64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = {
+ nullptr,
+ &NetBSD_X86_64_MemoryMapParams,
+};
+
+namespace {
+
+/// Instrument functions of a module to detect uninitialized reads.
+///
+/// Instantiating MemorySanitizer inserts the msan runtime library API function
+/// declarations into the module if they don't exist already. Instantiating
+/// ensures the __msan_init function is in the list of global constructors for
+/// the module.
+class MemorySanitizer {
+public:
+ MemorySanitizer(Module &M, MemorySanitizerOptions Options)
+ : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins),
+ Recover(Options.Recover) {
+ initializeModule(M);
+ }
+
+ // MSan cannot be moved or copied because of MapParams.
+ MemorySanitizer(MemorySanitizer &&) = delete;
+ MemorySanitizer &operator=(MemorySanitizer &&) = delete;
+ MemorySanitizer(const MemorySanitizer &) = delete;
+ MemorySanitizer &operator=(const MemorySanitizer &) = delete;
+
+ bool sanitizeFunction(Function &F, TargetLibraryInfo &TLI);
+
+private:
+ friend struct MemorySanitizerVisitor;
+ friend struct VarArgAMD64Helper;
+ friend struct VarArgMIPS64Helper;
+ friend struct VarArgAArch64Helper;
+ friend struct VarArgPowerPC64Helper;
+ friend struct VarArgSystemZHelper;
+
+ void initializeModule(Module &M);
+ void initializeCallbacks(Module &M);
+ void createKernelApi(Module &M);
+ void createUserspaceApi(Module &M);
+
+ /// True if we're compiling the Linux kernel.
+ bool CompileKernel;
+ /// Track origins (allocation points) of uninitialized values.
+ int TrackOrigins;
+ bool Recover;
+
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *OriginTy;
+
+ // XxxTLS variables represent the per-thread state in MSan and per-task state
+ // in KMSAN.
+ // For the userspace these point to thread-local globals. In the kernel land
+ // they point to the members of a per-task struct obtained via a call to
+ // __msan_get_context_state().
+
+ /// Thread-local shadow storage for function parameters.
+ Value *ParamTLS;
+
+ /// Thread-local origin storage for function parameters.
+ Value *ParamOriginTLS;
+
+ /// Thread-local shadow storage for function return value.
+ Value *RetvalTLS;
+
+ /// Thread-local origin storage for function return value.
+ Value *RetvalOriginTLS;
+
+ /// Thread-local shadow storage for in-register va_arg function
+ /// parameters (x86_64-specific).
+ Value *VAArgTLS;
+
+ /// Thread-local shadow storage for in-register va_arg function
+ /// parameters (x86_64-specific).
+ Value *VAArgOriginTLS;
+
+ /// Thread-local shadow storage for va_arg overflow area
+ /// (x86_64-specific).
+ Value *VAArgOverflowSizeTLS;
+
+ /// Are the instrumentation callbacks set up?
+ bool CallbacksInitialized = false;
+
+ /// The run-time callback to print a warning.
+ FunctionCallee WarningFn;
+
+ // These arrays are indexed by log2(AccessSize).
+ FunctionCallee MaybeWarningFn[kNumberOfAccessSizes];
+ FunctionCallee MaybeStoreOriginFn[kNumberOfAccessSizes];
+
+ /// Run-time helper that generates a new origin value for a stack
+ /// allocation.
+ FunctionCallee MsanSetAllocaOrigin4Fn;
+
+ /// Run-time helper that poisons stack on function entry.
+ FunctionCallee MsanPoisonStackFn;
+
+ /// Run-time helper that records a store (or any event) of an
+ /// uninitialized value and returns an updated origin id encoding this info.
+ FunctionCallee MsanChainOriginFn;
+
/// Run-time helper that paints an origin over a region.
FunctionCallee MsanSetOriginFn;
- /// MSan runtime replacements for memmove, memcpy and memset.
- FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
-
- /// KMSAN callback for task-local function argument shadow.
- StructType *MsanContextStateTy;
- FunctionCallee MsanGetContextStateFn;
-
- /// Functions for poisoning/unpoisoning local variables
- FunctionCallee MsanPoisonAllocaFn, MsanUnpoisonAllocaFn;
-
- /// Each of the MsanMetadataPtrXxx functions returns a pair of shadow/origin
- /// pointers.
- FunctionCallee MsanMetadataPtrForLoadN, MsanMetadataPtrForStoreN;
- FunctionCallee MsanMetadataPtrForLoad_1_8[4];
- FunctionCallee MsanMetadataPtrForStore_1_8[4];
- FunctionCallee MsanInstrumentAsmStoreFn;
-
- /// Helper to choose between different MsanMetadataPtrXxx().
- FunctionCallee getKmsanShadowOriginAccessFn(bool isStore, int size);
-
- /// Memory map parameters used in application-to-shadow calculation.
- const MemoryMapParams *MapParams;
-
- /// Custom memory map parameters used when -msan-shadow-base or
- // -msan-origin-base is provided.
- MemoryMapParams CustomMapParams;
-
- MDNode *ColdCallWeights;
-
- /// Branch weights for origin store.
- MDNode *OriginStoreWeights;
-};
-
-void insertModuleCtor(Module &M) {
- getOrCreateSanitizerCtorAndInitFunctions(
- M, kMsanModuleCtorName, kMsanInitName,
- /*InitArgTypes=*/{},
- /*InitArgs=*/{},
- // This callback is invoked when the functions are created the first
- // time. Hook them into the global ctors list in that case:
- [&](Function *Ctor, FunctionCallee) {
- if (!ClWithComdat) {
- appendToGlobalCtors(M, Ctor, 0);
- return;
- }
- Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName);
- Ctor->setComdat(MsanCtorComdat);
- appendToGlobalCtors(M, Ctor, 0, Ctor);
- });
-}
-
-/// A legacy function pass for msan instrumentation.
-///
-/// Instruments functions to detect uninitialized reads.
-struct MemorySanitizerLegacyPass : public FunctionPass {
- // Pass identification, replacement for typeid.
- static char ID;
-
- MemorySanitizerLegacyPass(MemorySanitizerOptions Options = {})
- : FunctionPass(ID), Options(Options) {
- initializeMemorySanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- StringRef getPassName() const override { return "MemorySanitizerLegacyPass"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- return MSan->sanitizeFunction(
- F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F));
- }
- bool doInitialization(Module &M) override;
-
- Optional<MemorySanitizer> MSan;
- MemorySanitizerOptions Options;
-};
-
-template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
- return (Opt.getNumOccurrences() > 0) ? Opt : Default;
-}
-
-} // end anonymous namespace
-
-MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K)
- : Kernel(getOptOrDefault(ClEnableKmsan, K)),
- TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)),
- Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {}
-
-PreservedAnalyses MemorySanitizerPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- MemorySanitizer Msan(*F.getParent(), Options);
- if (Msan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-PreservedAnalyses MemorySanitizerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (Options.Kernel)
- return PreservedAnalyses::all();
- insertModuleCtor(M);
- return PreservedAnalyses::none();
-}
-
-char MemorySanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan",
- "MemorySanitizer: detects uninitialized reads.", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(MemorySanitizerLegacyPass, "msan",
- "MemorySanitizer: detects uninitialized reads.", false,
- false)
-
-FunctionPass *
-llvm::createMemorySanitizerLegacyPassPass(MemorySanitizerOptions Options) {
- return new MemorySanitizerLegacyPass(Options);
-}
-
-/// Create a non-const global initialized with the given string.
-///
-/// Creates a writable global for Str so that we can pass it to the
-/// run-time lib. Runtime uses first 4 bytes of the string to store the
-/// frame ID, so the string needs to be mutable.
-static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
- StringRef Str) {
- Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
- return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
- GlobalValue::PrivateLinkage, StrConst, "");
-}
-
-/// Create KMSAN API callbacks.
-void MemorySanitizer::createKernelApi(Module &M) {
- IRBuilder<> IRB(*C);
-
- // These will be initialized in insertKmsanPrologue().
- RetvalTLS = nullptr;
- RetvalOriginTLS = nullptr;
- ParamTLS = nullptr;
- ParamOriginTLS = nullptr;
- VAArgTLS = nullptr;
- VAArgOriginTLS = nullptr;
- VAArgOverflowSizeTLS = nullptr;
-
- WarningFn = M.getOrInsertFunction("__msan_warning", IRB.getVoidTy(),
- IRB.getInt32Ty());
- // Requests the per-task context state (kmsan_context_state*) from the
- // runtime library.
- MsanContextStateTy = StructType::get(
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
- ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8),
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), /* va_arg_origin */
- IRB.getInt64Ty(), ArrayType::get(OriginTy, kParamTLSSize / 4), OriginTy,
- OriginTy);
- MsanGetContextStateFn = M.getOrInsertFunction(
- "__msan_get_context_state", PointerType::get(MsanContextStateTy, 0));
-
- Type *RetTy = StructType::get(PointerType::get(IRB.getInt8Ty(), 0),
- PointerType::get(IRB.getInt32Ty(), 0));
-
- for (int ind = 0, size = 1; ind < 4; ind++, size <<= 1) {
- std::string name_load =
- "__msan_metadata_ptr_for_load_" + std::to_string(size);
- std::string name_store =
- "__msan_metadata_ptr_for_store_" + std::to_string(size);
- MsanMetadataPtrForLoad_1_8[ind] = M.getOrInsertFunction(
- name_load, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
- MsanMetadataPtrForStore_1_8[ind] = M.getOrInsertFunction(
- name_store, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
- }
-
- MsanMetadataPtrForLoadN = M.getOrInsertFunction(
- "__msan_metadata_ptr_for_load_n", RetTy,
- PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
- MsanMetadataPtrForStoreN = M.getOrInsertFunction(
- "__msan_metadata_ptr_for_store_n", RetTy,
- PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
-
- // Functions for poisoning and unpoisoning memory.
- MsanPoisonAllocaFn =
- M.getOrInsertFunction("__msan_poison_alloca", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, IRB.getInt8PtrTy());
- MsanUnpoisonAllocaFn = M.getOrInsertFunction(
- "__msan_unpoison_alloca", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy);
-}
-
-static Constant *getOrInsertGlobal(Module &M, StringRef Name, Type *Ty) {
- return M.getOrInsertGlobal(Name, Ty, [&] {
- return new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
- nullptr, Name, nullptr,
- GlobalVariable::InitialExecTLSModel);
- });
-}
-
-/// Insert declarations for userspace-specific functions and globals.
-void MemorySanitizer::createUserspaceApi(Module &M) {
- IRBuilder<> IRB(*C);
-
- // Create the callback.
- // FIXME: this function should have "Cold" calling conv,
- // which is not yet implemented.
- StringRef WarningFnName = Recover ? "__msan_warning_with_origin"
- : "__msan_warning_with_origin_noreturn";
- WarningFn =
- M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), IRB.getInt32Ty());
-
- // Create the global TLS variables.
- RetvalTLS =
- getOrInsertGlobal(M, "__msan_retval_tls",
- ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8));
-
- RetvalOriginTLS = getOrInsertGlobal(M, "__msan_retval_origin_tls", OriginTy);
-
- ParamTLS =
- getOrInsertGlobal(M, "__msan_param_tls",
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
-
- ParamOriginTLS =
- getOrInsertGlobal(M, "__msan_param_origin_tls",
- ArrayType::get(OriginTy, kParamTLSSize / 4));
-
- VAArgTLS =
- getOrInsertGlobal(M, "__msan_va_arg_tls",
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
-
- VAArgOriginTLS =
- getOrInsertGlobal(M, "__msan_va_arg_origin_tls",
- ArrayType::get(OriginTy, kParamTLSSize / 4));
-
- VAArgOverflowSizeTLS =
- getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", IRB.getInt64Ty());
-
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- unsigned AccessSize = 1 << AccessSizeIndex;
- std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
- SmallVector<std::pair<unsigned, Attribute>, 2> MaybeWarningFnAttrs;
- MaybeWarningFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
- MaybeWarningFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex + 1, Attribute::get(*C, Attribute::ZExt)));
- MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
- FunctionName, AttributeList::get(*C, MaybeWarningFnAttrs),
- IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt32Ty());
-
- FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
- SmallVector<std::pair<unsigned, Attribute>, 2> MaybeStoreOriginFnAttrs;
- MaybeStoreOriginFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
- MaybeStoreOriginFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex + 2, Attribute::get(*C, Attribute::ZExt)));
- MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
- FunctionName, AttributeList::get(*C, MaybeStoreOriginFnAttrs),
- IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt8PtrTy(),
- IRB.getInt32Ty());
- }
-
- MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
- "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
- IRB.getInt8PtrTy(), IntptrTy);
- MsanPoisonStackFn =
- M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy);
-}
-
-/// Insert extern declaration of runtime-provided functions and globals.
-void MemorySanitizer::initializeCallbacks(Module &M) {
- // Only do this once.
- if (CallbacksInitialized)
- return;
-
- IRBuilder<> IRB(*C);
- // Initialize callbacks that are common for kernel and userspace
- // instrumentation.
- MsanChainOriginFn = M.getOrInsertFunction(
- "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
+ /// MSan runtime replacements for memmove, memcpy and memset.
+ FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
+
+ /// KMSAN callback for task-local function argument shadow.
+ StructType *MsanContextStateTy;
+ FunctionCallee MsanGetContextStateFn;
+
+ /// Functions for poisoning/unpoisoning local variables
+ FunctionCallee MsanPoisonAllocaFn, MsanUnpoisonAllocaFn;
+
+ /// Each of the MsanMetadataPtrXxx functions returns a pair of shadow/origin
+ /// pointers.
+ FunctionCallee MsanMetadataPtrForLoadN, MsanMetadataPtrForStoreN;
+ FunctionCallee MsanMetadataPtrForLoad_1_8[4];
+ FunctionCallee MsanMetadataPtrForStore_1_8[4];
+ FunctionCallee MsanInstrumentAsmStoreFn;
+
+ /// Helper to choose between different MsanMetadataPtrXxx().
+ FunctionCallee getKmsanShadowOriginAccessFn(bool isStore, int size);
+
+ /// Memory map parameters used in application-to-shadow calculation.
+ const MemoryMapParams *MapParams;
+
+ /// Custom memory map parameters used when -msan-shadow-base or
+ // -msan-origin-base is provided.
+ MemoryMapParams CustomMapParams;
+
+ MDNode *ColdCallWeights;
+
+ /// Branch weights for origin store.
+ MDNode *OriginStoreWeights;
+};
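The comment above notes that MaybeWarningFn and MaybeStoreOriginFn are indexed by log2(AccessSize). A small sketch of that convention for the four supported access sizes (1, 2, 4, 8 bytes) follows; the helper name is made up, and the pass computes the equivalent index when it materializes checks.

// Illustrative sketch only.
#include <cassert>
#include <cstddef>

static size_t accessSizeToIndex(unsigned AccessSizeInBytes) {
  assert(AccessSizeInBytes == 1 || AccessSizeInBytes == 2 ||
         AccessSizeInBytes == 4 || AccessSizeInBytes == 8);
  size_t Index = 0;
  while ((1u << Index) < AccessSizeInBytes)
    ++Index;
  return Index; // 1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3 (< kNumberOfAccessSizes)
}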
+
+void insertModuleCtor(Module &M) {
+ getOrCreateSanitizerCtorAndInitFunctions(
+ M, kMsanModuleCtorName, kMsanInitName,
+ /*InitArgTypes=*/{},
+ /*InitArgs=*/{},
+ // This callback is invoked when the functions are created the first
+ // time. Hook them into the global ctors list in that case:
+ [&](Function *Ctor, FunctionCallee) {
+ if (!ClWithComdat) {
+ appendToGlobalCtors(M, Ctor, 0);
+ return;
+ }
+ Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName);
+ Ctor->setComdat(MsanCtorComdat);
+ appendToGlobalCtors(M, Ctor, 0, Ctor);
+ });
+}
+
+/// A legacy function pass for msan instrumentation.
+///
+/// Instruments functions to detect uninitialized reads.
+struct MemorySanitizerLegacyPass : public FunctionPass {
+ // Pass identification, replacement for typeid.
+ static char ID;
+
+ MemorySanitizerLegacyPass(MemorySanitizerOptions Options = {})
+ : FunctionPass(ID), Options(Options) {
+ initializeMemorySanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override { return "MemorySanitizerLegacyPass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ return MSan->sanitizeFunction(
+ F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F));
+ }
+ bool doInitialization(Module &M) override;
+
+ Optional<MemorySanitizer> MSan;
+ MemorySanitizerOptions Options;
+};
+
+template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
+ return (Opt.getNumOccurrences() > 0) ? Opt : Default;
+}
+
+} // end anonymous namespace
+
+MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K)
+ : Kernel(getOptOrDefault(ClEnableKmsan, K)),
+ TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)),
+ Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {}
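// Illustrative sketch (hypothetical values) of how the precedence above plays
// out: an explicitly passed cl::opt always wins over the constructor argument,
// and kernel mode forces origin tracking to level 2 and recovery on.
//
//   MemorySanitizerOptions UserOpts(/*TO=*/1, /*R=*/false, /*K=*/false);
//   // UserOpts.TrackOrigins == 1 and UserOpts.Recover == false unless
//   // ClTrackOrigins / ClKeepGoing were given on the command line.
//
//   MemorySanitizerOptions KernelOpts(/*TO=*/0, /*R=*/false, /*K=*/true);
//   // KernelOpts.TrackOrigins == 2 and KernelOpts.Recover == true by default.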
+
+PreservedAnalyses MemorySanitizerPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ MemorySanitizer Msan(*F.getParent(), Options);
+ if (Msan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses MemorySanitizerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (Options.Kernel)
+ return PreservedAnalyses::all();
+ insertModuleCtor(M);
+ return PreservedAnalyses::none();
+}
+
+char MemorySanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MemorySanitizerLegacyPass, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false,
+ false)
+
+FunctionPass *
+llvm::createMemorySanitizerLegacyPassPass(MemorySanitizerOptions Options) {
+ return new MemorySanitizerLegacyPass(Options);
+}
+
+/// Create a non-const global initialized with the given string.
+///
+/// Creates a writable global for Str so that we can pass it to the
+/// run-time lib. The runtime uses the first 4 bytes of the string to store
+/// the frame ID, so the string needs to be mutable.
+static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
+ StringRef Str) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
+ GlobalValue::PrivateLinkage, StrConst, "");
+}
+
+/// Create KMSAN API callbacks.
+void MemorySanitizer::createKernelApi(Module &M) {
+ IRBuilder<> IRB(*C);
+
+ // These will be initialized in insertKmsanPrologue().
+ RetvalTLS = nullptr;
+ RetvalOriginTLS = nullptr;
+ ParamTLS = nullptr;
+ ParamOriginTLS = nullptr;
+ VAArgTLS = nullptr;
+ VAArgOriginTLS = nullptr;
+ VAArgOverflowSizeTLS = nullptr;
+
+ WarningFn = M.getOrInsertFunction("__msan_warning", IRB.getVoidTy(),
+ IRB.getInt32Ty());
+ // Requests the per-task context state (kmsan_context_state*) from the
+ // runtime library.
+ MsanContextStateTy = StructType::get(
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
+ ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8),
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), /* va_arg_origin */
+ IRB.getInt64Ty(), ArrayType::get(OriginTy, kParamTLSSize / 4), OriginTy,
+ OriginTy);
+ MsanGetContextStateFn = M.getOrInsertFunction(
+ "__msan_get_context_state", PointerType::get(MsanContextStateTy, 0));
+
+ Type *RetTy = StructType::get(PointerType::get(IRB.getInt8Ty(), 0),
+ PointerType::get(IRB.getInt32Ty(), 0));
+
+ for (int ind = 0, size = 1; ind < 4; ind++, size <<= 1) {
+ std::string name_load =
+ "__msan_metadata_ptr_for_load_" + std::to_string(size);
+ std::string name_store =
+ "__msan_metadata_ptr_for_store_" + std::to_string(size);
+ MsanMetadataPtrForLoad_1_8[ind] = M.getOrInsertFunction(
+ name_load, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
+ MsanMetadataPtrForStore_1_8[ind] = M.getOrInsertFunction(
+ name_store, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
+ }
+
+ MsanMetadataPtrForLoadN = M.getOrInsertFunction(
+ "__msan_metadata_ptr_for_load_n", RetTy,
+ PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
+ MsanMetadataPtrForStoreN = M.getOrInsertFunction(
+ "__msan_metadata_ptr_for_store_n", RetTy,
+ PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
+
+ // Functions for poisoning and unpoisoning memory.
+ MsanPoisonAllocaFn =
+ M.getOrInsertFunction("__msan_poison_alloca", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy, IRB.getInt8PtrTy());
+ MsanUnpoisonAllocaFn = M.getOrInsertFunction(
+ "__msan_unpoison_alloca", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy);
+}
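// Illustrative sketch (an assumption about the runtime side, not part of this
// diff): MsanContextStateTy mirrors a per-task structure provided by the KMSAN
// runtime. Field names below follow the GEP names used in
// insertKmsanPrologue(); the exact kernel-side definition may differ.
//
//   struct kmsan_context_state {
//     uint64_t param_tls[kParamTLSSize / 8];
//     uint64_t retval_tls[kRetvalTLSSize / 8];
//     uint64_t va_arg_tls[kParamTLSSize / 8];
//     uint64_t va_arg_origin_tls[kParamTLSSize / 8];
//     uint64_t va_arg_overflow_size_tls;
//     uint32_t param_origin_tls[kParamTLSSize / 4];  // OriginTy == i32
//     uint32_t retval_origin_tls;
//     uint32_t origin_tls;
//   };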
+
+static Constant *getOrInsertGlobal(Module &M, StringRef Name, Type *Ty) {
+ return M.getOrInsertGlobal(Name, Ty, [&] {
+ return new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+ nullptr, Name, nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ });
+}
+
+/// Insert declarations for userspace-specific functions and globals.
+void MemorySanitizer::createUserspaceApi(Module &M) {
+ IRBuilder<> IRB(*C);
+
+ // Create the callback.
+ // FIXME: this function should have "Cold" calling conv,
+ // which is not yet implemented.
+ StringRef WarningFnName = Recover ? "__msan_warning_with_origin"
+ : "__msan_warning_with_origin_noreturn";
+ WarningFn =
+ M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), IRB.getInt32Ty());
+
+ // Create the global TLS variables.
+ RetvalTLS =
+ getOrInsertGlobal(M, "__msan_retval_tls",
+ ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8));
+
+ RetvalOriginTLS = getOrInsertGlobal(M, "__msan_retval_origin_tls", OriginTy);
+
+ ParamTLS =
+ getOrInsertGlobal(M, "__msan_param_tls",
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
+
+ ParamOriginTLS =
+ getOrInsertGlobal(M, "__msan_param_origin_tls",
+ ArrayType::get(OriginTy, kParamTLSSize / 4));
+
+ VAArgTLS =
+ getOrInsertGlobal(M, "__msan_va_arg_tls",
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
+
+ VAArgOriginTLS =
+ getOrInsertGlobal(M, "__msan_va_arg_origin_tls",
+ ArrayType::get(OriginTy, kParamTLSSize / 4));
+
+ VAArgOverflowSizeTLS =
+ getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", IRB.getInt64Ty());
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ unsigned AccessSize = 1 << AccessSizeIndex;
+ std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
+ SmallVector<std::pair<unsigned, Attribute>, 2> MaybeWarningFnAttrs;
+ MaybeWarningFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
+ MaybeWarningFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex + 1, Attribute::get(*C, Attribute::ZExt)));
+ MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
+ FunctionName, AttributeList::get(*C, MaybeWarningFnAttrs),
+ IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt32Ty());
+
+ FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
+ SmallVector<std::pair<unsigned, Attribute>, 2> MaybeStoreOriginFnAttrs;
+ MaybeStoreOriginFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
+ MaybeStoreOriginFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex + 2, Attribute::get(*C, Attribute::ZExt)));
+ MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
+ FunctionName, AttributeList::get(*C, MaybeStoreOriginFnAttrs),
+ IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty());
+ }
+
+ MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
+ "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+ IRB.getInt8PtrTy(), IntptrTy);
+ MsanPoisonStackFn =
+ M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+}
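// Illustrative sketch: the C prototypes implied by the declarations above,
// inferred from the getOrInsertFunction() calls rather than quoted from a
// libmsan header; parameter names are assumptions.
//
//   void __msan_warning_with_origin(uint32_t origin);
//   void __msan_warning_with_origin_noreturn(uint32_t origin);
//   void __msan_maybe_warning_1(uint8_t shadow, uint32_t origin);
//   // ...likewise _2/_4/_8 with uint16_t/uint32_t/uint64_t shadow...
//   void __msan_maybe_store_origin_1(uint8_t shadow, void *addr, uint32_t origin);
//   void __msan_set_alloca_origin4(void *addr, uintptr_t size, void *descr, uintptr_t pc);
//   void __msan_poison_stack(void *addr, uintptr_t size);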
+
+/// Insert extern declaration of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+ // Only do this once.
+ if (CallbacksInitialized)
+ return;
+
+ IRBuilder<> IRB(*C);
+ // Initialize callbacks that are common for kernel and userspace
+ // instrumentation.
+ MsanChainOriginFn = M.getOrInsertFunction(
+ "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
MsanSetOriginFn =
M.getOrInsertFunction("__msan_set_origin", IRB.getVoidTy(),
IRB.getInt8PtrTy(), IntptrTy, IRB.getInt32Ty());
- MemmoveFn = M.getOrInsertFunction(
- "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- MemcpyFn = M.getOrInsertFunction(
- "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IntptrTy);
- MemsetFn = M.getOrInsertFunction(
- "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
- IntptrTy);
-
- MsanInstrumentAsmStoreFn =
- M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
- PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
-
- if (CompileKernel) {
- createKernelApi(M);
- } else {
- createUserspaceApi(M);
- }
- CallbacksInitialized = true;
-}
-
-FunctionCallee MemorySanitizer::getKmsanShadowOriginAccessFn(bool isStore,
- int size) {
- FunctionCallee *Fns =
- isStore ? MsanMetadataPtrForStore_1_8 : MsanMetadataPtrForLoad_1_8;
- switch (size) {
- case 1:
- return Fns[0];
- case 2:
- return Fns[1];
- case 4:
- return Fns[2];
- case 8:
- return Fns[3];
- default:
- return nullptr;
- }
-}
-
-/// Module-level initialization.
-///
-/// Inserts a call to __msan_init into the module's constructor list.
-void MemorySanitizer::initializeModule(Module &M) {
- auto &DL = M.getDataLayout();
-
- bool ShadowPassed = ClShadowBase.getNumOccurrences() > 0;
- bool OriginPassed = ClOriginBase.getNumOccurrences() > 0;
- // Check the overrides first
- if (ShadowPassed || OriginPassed) {
- CustomMapParams.AndMask = ClAndMask;
- CustomMapParams.XorMask = ClXorMask;
- CustomMapParams.ShadowBase = ClShadowBase;
- CustomMapParams.OriginBase = ClOriginBase;
- MapParams = &CustomMapParams;
- } else {
- Triple TargetTriple(M.getTargetTriple());
- switch (TargetTriple.getOS()) {
- case Triple::FreeBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = FreeBSD_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = FreeBSD_X86_MemoryMapParams.bits32;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::NetBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = NetBSD_X86_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::Linux:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = Linux_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = Linux_X86_MemoryMapParams.bits32;
- break;
- case Triple::mips64:
- case Triple::mips64el:
- MapParams = Linux_MIPS_MemoryMapParams.bits64;
- break;
- case Triple::ppc64:
- case Triple::ppc64le:
- MapParams = Linux_PowerPC_MemoryMapParams.bits64;
- break;
- case Triple::systemz:
- MapParams = Linux_S390_MemoryMapParams.bits64;
- break;
- case Triple::aarch64:
- case Triple::aarch64_be:
- MapParams = Linux_ARM_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- default:
- report_fatal_error("unsupported operating system");
- }
- }
-
- C = &(M.getContext());
- IRBuilder<> IRB(*C);
- IntptrTy = IRB.getIntPtrTy(DL);
- OriginTy = IRB.getInt32Ty();
-
- ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
- OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
-
- if (!CompileKernel) {
- if (TrackOrigins)
- M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] {
- return new GlobalVariable(
- M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
- IRB.getInt32(TrackOrigins), "__msan_track_origins");
- });
-
- if (Recover)
- M.getOrInsertGlobal("__msan_keep_going", IRB.getInt32Ty(), [&] {
- return new GlobalVariable(M, IRB.getInt32Ty(), true,
- GlobalValue::WeakODRLinkage,
- IRB.getInt32(Recover), "__msan_keep_going");
- });
-  }
-}
-
-bool MemorySanitizerLegacyPass::doInitialization(Module &M) {
- if (!Options.Kernel)
- insertModuleCtor(M);
- MSan.emplace(M, Options);
- return true;
-}
-
-namespace {
-
-/// A helper class that handles instrumentation of VarArg
-/// functions on a particular platform.
-///
-/// Implementations are expected to insert the instrumentation
-/// necessary to propagate argument shadow through VarArg function
-/// calls. Visit* methods are called during an InstVisitor pass over
-/// the function, and should avoid creating new basic blocks. A new
-/// instance of this class is created for each instrumented function.
-struct VarArgHelper {
- virtual ~VarArgHelper() = default;
-
- /// Visit a CallBase.
- virtual void visitCallBase(CallBase &CB, IRBuilder<> &IRB) = 0;
-
- /// Visit a va_start call.
- virtual void visitVAStartInst(VAStartInst &I) = 0;
-
- /// Visit a va_copy call.
- virtual void visitVACopyInst(VACopyInst &I) = 0;
-
- /// Finalize function instrumentation.
- ///
- /// This method is called after visiting all interesting (see above)
- /// instructions in a function.
- virtual void finalizeInstrumentation() = 0;
-};
-
-struct MemorySanitizerVisitor;
-
-} // end anonymous namespace
-
-static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
- MemorySanitizerVisitor &Visitor);
-
-static unsigned TypeSizeToSizeIndex(unsigned TypeSize) {
- if (TypeSize <= 8) return 0;
- return Log2_32_Ceil((TypeSize + 7) / 8);
-}
-
-namespace {
-
-/// This class does all the work for a given function. Store and Load
-/// instructions store and load corresponding shadow and origin
-/// values. Most instructions propagate shadow from arguments to their
-/// return values. Certain instructions (most importantly, BranchInst)
-/// test their argument shadow and print reports (with a runtime call) if it's
-/// non-zero.
-struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
- Function &F;
- MemorySanitizer &MS;
- SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
- ValueMap<Value*, Value*> ShadowMap, OriginMap;
- std::unique_ptr<VarArgHelper> VAHelper;
- const TargetLibraryInfo *TLI;
+ MemmoveFn = M.getOrInsertFunction(
+ "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ MemcpyFn = M.getOrInsertFunction(
+ "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IntptrTy);
+ MemsetFn = M.getOrInsertFunction(
+ "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+ IntptrTy);
+
+ MsanInstrumentAsmStoreFn =
+ M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
+ PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
+
+ if (CompileKernel) {
+ createKernelApi(M);
+ } else {
+ createUserspaceApi(M);
+ }
+ CallbacksInitialized = true;
+}
+
+FunctionCallee MemorySanitizer::getKmsanShadowOriginAccessFn(bool isStore,
+ int size) {
+ FunctionCallee *Fns =
+ isStore ? MsanMetadataPtrForStore_1_8 : MsanMetadataPtrForLoad_1_8;
+ switch (size) {
+ case 1:
+ return Fns[0];
+ case 2:
+ return Fns[1];
+ case 4:
+ return Fns[2];
+ case 8:
+ return Fns[3];
+ default:
+ return nullptr;
+ }
+}
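// Illustrative usage (hypothetical call sites):
//   FunctionCallee F4 = getKmsanShadowOriginAccessFn(/*isStore=*/true, 4);
//   // F4 is __msan_metadata_ptr_for_store_4 (slot [2] of the _1_8 array).
//   FunctionCallee F3 = getKmsanShadowOriginAccessFn(/*isStore=*/false, 3);
//   // F3 is a null callee; getShadowOriginPtrKernel() then falls back to
//   // __msan_metadata_ptr_for_load_n with an explicit size argument.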
+
+/// Module-level initialization.
+///
+/// Inserts a call to __msan_init into the module's constructor list.
+void MemorySanitizer::initializeModule(Module &M) {
+ auto &DL = M.getDataLayout();
+
+ bool ShadowPassed = ClShadowBase.getNumOccurrences() > 0;
+ bool OriginPassed = ClOriginBase.getNumOccurrences() > 0;
+ // Check the overrides first
+ if (ShadowPassed || OriginPassed) {
+ CustomMapParams.AndMask = ClAndMask;
+ CustomMapParams.XorMask = ClXorMask;
+ CustomMapParams.ShadowBase = ClShadowBase;
+ CustomMapParams.OriginBase = ClOriginBase;
+ MapParams = &CustomMapParams;
+ } else {
+ Triple TargetTriple(M.getTargetTriple());
+ switch (TargetTriple.getOS()) {
+ case Triple::FreeBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits32;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::NetBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = NetBSD_X86_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::Linux:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = Linux_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = Linux_X86_MemoryMapParams.bits32;
+ break;
+ case Triple::mips64:
+ case Triple::mips64el:
+ MapParams = Linux_MIPS_MemoryMapParams.bits64;
+ break;
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ MapParams = Linux_PowerPC_MemoryMapParams.bits64;
+ break;
+ case Triple::systemz:
+ MapParams = Linux_S390_MemoryMapParams.bits64;
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ MapParams = Linux_ARM_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ default:
+ report_fatal_error("unsupported operating system");
+ }
+ }
+
+ C = &(M.getContext());
+ IRBuilder<> IRB(*C);
+ IntptrTy = IRB.getIntPtrTy(DL);
+ OriginTy = IRB.getInt32Ty();
+
+ ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+ OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+
+ if (!CompileKernel) {
+ if (TrackOrigins)
+ M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] {
+ return new GlobalVariable(
+ M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+ IRB.getInt32(TrackOrigins), "__msan_track_origins");
+ });
+
+ if (Recover)
+ M.getOrInsertGlobal("__msan_keep_going", IRB.getInt32Ty(), [&] {
+ return new GlobalVariable(M, IRB.getInt32Ty(), true,
+ GlobalValue::WeakODRLinkage,
+ IRB.getInt32(Recover), "__msan_keep_going");
+ });
+  }
+}
+
+bool MemorySanitizerLegacyPass::doInitialization(Module &M) {
+ if (!Options.Kernel)
+ insertModuleCtor(M);
+ MSan.emplace(M, Options);
+ return true;
+}
+
+namespace {
+
+/// A helper class that handles instrumentation of VarArg
+/// functions on a particular platform.
+///
+/// Implementations are expected to insert the instrumentation
+/// necessary to propagate argument shadow through VarArg function
+/// calls. Visit* methods are called during an InstVisitor pass over
+/// the function, and should avoid creating new basic blocks. A new
+/// instance of this class is created for each instrumented function.
+struct VarArgHelper {
+ virtual ~VarArgHelper() = default;
+
+ /// Visit a CallBase.
+ virtual void visitCallBase(CallBase &CB, IRBuilder<> &IRB) = 0;
+
+ /// Visit a va_start call.
+ virtual void visitVAStartInst(VAStartInst &I) = 0;
+
+ /// Visit a va_copy call.
+ virtual void visitVACopyInst(VACopyInst &I) = 0;
+
+ /// Finalize function instrumentation.
+ ///
+ /// This method is called after visiting all interesting (see above)
+ /// instructions in a function.
+ virtual void finalizeInstrumentation() = 0;
+};
+
+struct MemorySanitizerVisitor;
+
+} // end anonymous namespace
+
+static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor);
+
+static unsigned TypeSizeToSizeIndex(unsigned TypeSize) {
+ if (TypeSize <= 8) return 0;
+ return Log2_32_Ceil((TypeSize + 7) / 8);
+}
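// Sample values of TypeSizeToSizeIndex (bits -> index): 1..8 -> 0, 9..16 -> 1,
// 17..32 -> 2, 33..64 -> 3, 65..128 -> 4. Only indices below
// kNumberOfAccessSizes get the dedicated __msan_maybe_* callbacks declared in
// createUserspaceApi(); larger shadows are checked with an inline branch.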
+
+namespace {
+
+/// This class does all the work for a given function. Store and Load
+/// instructions store and load corresponding shadow and origin
+/// values. Most instructions propagate shadow from arguments to their
+/// return values. Certain instructions (most importantly, BranchInst)
+/// test their argument shadow and print reports (with a runtime call) if it's
+/// non-zero.
+struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
+ Function &F;
+ MemorySanitizer &MS;
+ SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
+ ValueMap<Value*, Value*> ShadowMap, OriginMap;
+ std::unique_ptr<VarArgHelper> VAHelper;
+ const TargetLibraryInfo *TLI;
Instruction *FnPrologueEnd;
-
- // The following flags disable parts of MSan instrumentation based on
- // exclusion list contents and command-line options.
- bool InsertChecks;
- bool PropagateShadow;
- bool PoisonStack;
- bool PoisonUndef;
-
- struct ShadowOriginAndInsertPoint {
- Value *Shadow;
- Value *Origin;
- Instruction *OrigIns;
-
- ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
- : Shadow(S), Origin(O), OrigIns(I) {}
- };
- SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
- bool InstrumentLifetimeStart = ClHandleLifetimeIntrinsics;
- SmallSet<AllocaInst *, 16> AllocaSet;
- SmallVector<std::pair<IntrinsicInst *, AllocaInst *>, 16> LifetimeStartList;
- SmallVector<StoreInst *, 16> StoreList;
-
- MemorySanitizerVisitor(Function &F, MemorySanitizer &MS,
- const TargetLibraryInfo &TLI)
- : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) {
- bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory);
- InsertChecks = SanitizeFunction;
- PropagateShadow = SanitizeFunction;
- PoisonStack = SanitizeFunction && ClPoisonStack;
- PoisonUndef = SanitizeFunction && ClPoisonUndef;
-
+
+ // The following flags disable parts of MSan instrumentation based on
+ // exclusion list contents and command-line options.
+ bool InsertChecks;
+ bool PropagateShadow;
+ bool PoisonStack;
+ bool PoisonUndef;
+
+ struct ShadowOriginAndInsertPoint {
+ Value *Shadow;
+ Value *Origin;
+ Instruction *OrigIns;
+
+ ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
+ : Shadow(S), Origin(O), OrigIns(I) {}
+ };
+ SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
+ bool InstrumentLifetimeStart = ClHandleLifetimeIntrinsics;
+ SmallSet<AllocaInst *, 16> AllocaSet;
+ SmallVector<std::pair<IntrinsicInst *, AllocaInst *>, 16> LifetimeStartList;
+ SmallVector<StoreInst *, 16> StoreList;
+
+ MemorySanitizerVisitor(Function &F, MemorySanitizer &MS,
+ const TargetLibraryInfo &TLI)
+ : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) {
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory);
+ InsertChecks = SanitizeFunction;
+ PropagateShadow = SanitizeFunction;
+ PoisonStack = SanitizeFunction && ClPoisonStack;
+ PoisonUndef = SanitizeFunction && ClPoisonUndef;
+
// In the presence of unreachable blocks, we may see Phi nodes with
// incoming nodes from such blocks. Since InstVisitor skips unreachable
// blocks, such nodes will not have any shadow value associated with them.
// It's easier to remove unreachable blocks than deal with missing shadow.
removeUnreachableBlocks(F);
- MS.initializeCallbacks(*F.getParent());
+ MS.initializeCallbacks(*F.getParent());
FnPrologueEnd = IRBuilder<>(F.getEntryBlock().getFirstNonPHI())
.CreateIntrinsic(Intrinsic::donothing, {}, {});
-
+
if (MS.CompileKernel) {
IRBuilder<> IRB(FnPrologueEnd);
insertKmsanPrologue(IRB);
}
- LLVM_DEBUG(if (!InsertChecks) dbgs()
- << "MemorySanitizer is not inserting checks into '"
- << F.getName() << "'\n");
- }
-
+ LLVM_DEBUG(if (!InsertChecks) dbgs()
+ << "MemorySanitizer is not inserting checks into '"
+ << F.getName() << "'\n");
+ }
+
bool isInPrologue(Instruction &I) {
return I.getParent() == FnPrologueEnd->getParent() &&
(&I == FnPrologueEnd || I.comesBefore(FnPrologueEnd));
}
- Value *updateOrigin(Value *V, IRBuilder<> &IRB) {
- if (MS.TrackOrigins <= 1) return V;
- return IRB.CreateCall(MS.MsanChainOriginFn, V);
- }
-
- Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
- if (IntptrSize == kOriginSize) return Origin;
- assert(IntptrSize == kOriginSize * 2);
- Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false);
- return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
- }
-
- /// Fill memory range with the given origin value.
- void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
- unsigned Size, Align Alignment) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- const Align IntptrAlignment = DL.getABITypeAlign(MS.IntptrTy);
- unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
- assert(IntptrAlignment >= kMinOriginAlignment);
- assert(IntptrSize >= kOriginSize);
-
- unsigned Ofs = 0;
- Align CurrentAlignment = Alignment;
- if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) {
- Value *IntptrOrigin = originToIntptr(IRB, Origin);
- Value *IntptrOriginPtr =
- IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0));
- for (unsigned i = 0; i < Size / IntptrSize; ++i) {
- Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i)
- : IntptrOriginPtr;
- IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment);
- Ofs += IntptrSize / kOriginSize;
- CurrentAlignment = IntptrAlignment;
- }
- }
-
- for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) {
- Value *GEP =
- i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr;
- IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment);
- CurrentAlignment = kMinOriginAlignment;
- }
- }
-
- void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin,
- Value *OriginPtr, Align Alignment, bool AsCall) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ Value *updateOrigin(Value *V, IRBuilder<> &IRB) {
+ if (MS.TrackOrigins <= 1) return V;
+ return IRB.CreateCall(MS.MsanChainOriginFn, V);
+ }
+
+ Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
+ if (IntptrSize == kOriginSize) return Origin;
+ assert(IntptrSize == kOriginSize * 2);
+ Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false);
+ return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
+ }
+
+ /// Fill memory range with the given origin value.
+ void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
+ unsigned Size, Align Alignment) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const Align IntptrAlignment = DL.getABITypeAlign(MS.IntptrTy);
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
+ assert(IntptrAlignment >= kMinOriginAlignment);
+ assert(IntptrSize >= kOriginSize);
+
+ unsigned Ofs = 0;
+ Align CurrentAlignment = Alignment;
+ if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) {
+ Value *IntptrOrigin = originToIntptr(IRB, Origin);
+ Value *IntptrOriginPtr =
+ IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0));
+ for (unsigned i = 0; i < Size / IntptrSize; ++i) {
+ Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i)
+ : IntptrOriginPtr;
+ IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment);
+ Ofs += IntptrSize / kOriginSize;
+ CurrentAlignment = IntptrAlignment;
+ }
+ }
+
+ for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) {
+ Value *GEP =
+ i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr;
+ IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment);
+ CurrentAlignment = kMinOriginAlignment;
+ }
+ }
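  // Worked example (64-bit target, so IntptrSize == 8): painting a 12-byte,
  // 8-aligned origin range emits one 8-byte store of the origin value packed
  // twice (via originToIntptr) followed by one 4-byte store for the tail, so
  // all three 4-byte origin slots are covered with the minimum number of
  // stores.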
+
+ void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin,
+ Value *OriginPtr, Align Alignment, bool AsCall) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB);
if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) {
if (ClCheckConstantShadow && !ConstantShadow->isZeroValue())
paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize,
- OriginAlignment);
+ OriginAlignment);
return;
- }
+ }
unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
@@ -1189,206 +1189,206 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), OriginPtr, StoreSize,
OriginAlignment);
}
- }
-
- void materializeStores(bool InstrumentWithCalls) {
- for (StoreInst *SI : StoreList) {
- IRBuilder<> IRB(SI);
- Value *Val = SI->getValueOperand();
- Value *Addr = SI->getPointerOperand();
- Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val);
- Value *ShadowPtr, *OriginPtr;
- Type *ShadowTy = Shadow->getType();
- const Align Alignment = assumeAligned(SI->getAlignment());
- const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
-
- StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment);
- LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
- (void)NewSI;
-
- if (SI->isAtomic())
- SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
-
- if (MS.TrackOrigins && !SI->isAtomic())
- storeOrigin(IRB, Addr, Shadow, getOrigin(Val), OriginPtr,
- OriginAlignment, InstrumentWithCalls);
- }
- }
-
- /// Helper function to insert a warning at IRB's current insert point.
- void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
- if (!Origin)
- Origin = (Value *)IRB.getInt32(0);
- assert(Origin->getType()->isIntegerTy());
- IRB.CreateCall(MS.WarningFn, Origin)->setCannotMerge();
- // FIXME: Insert UnreachableInst if !MS.Recover?
- // This may invalidate some of the following checks and needs to be done
- // at the very end.
- }
-
- void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin,
- bool AsCall) {
- IRBuilder<> IRB(OrigIns);
- LLVM_DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
+ }
+
+ void materializeStores(bool InstrumentWithCalls) {
+ for (StoreInst *SI : StoreList) {
+ IRBuilder<> IRB(SI);
+ Value *Val = SI->getValueOperand();
+ Value *Addr = SI->getPointerOperand();
+ Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val);
+ Value *ShadowPtr, *OriginPtr;
+ Type *ShadowTy = Shadow->getType();
+ const Align Alignment = assumeAligned(SI->getAlignment());
+ const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
+
+ StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment);
+ LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
+ (void)NewSI;
+
+ if (SI->isAtomic())
+ SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
+
+ if (MS.TrackOrigins && !SI->isAtomic())
+ storeOrigin(IRB, Addr, Shadow, getOrigin(Val), OriginPtr,
+ OriginAlignment, InstrumentWithCalls);
+ }
+ }
+
+ /// Helper function to insert a warning at IRB's current insert point.
+ void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
+ if (!Origin)
+ Origin = (Value *)IRB.getInt32(0);
+ assert(Origin->getType()->isIntegerTy());
+ IRB.CreateCall(MS.WarningFn, Origin)->setCannotMerge();
+ // FIXME: Insert UnreachableInst if !MS.Recover?
+ // This may invalidate some of the following checks and needs to be done
+ // at the very end.
+ }
+
+ void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin,
+ bool AsCall) {
+ IRBuilder<> IRB(OrigIns);
+ LLVM_DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB);
- LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
-
- if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) {
- if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
- insertWarningFn(IRB, Origin);
- }
- return;
- }
-
- const DataLayout &DL = OrigIns->getModule()->getDataLayout();
-
- unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
- unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
- if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) {
- FunctionCallee Fn = MS.MaybeWarningFn[SizeIndex];
- Value *ConvertedShadow2 =
- IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex)));
- IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin
- ? Origin
- : (Value *)IRB.getInt32(0)});
- } else {
+ LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
+
+ if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) {
+ if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
+ insertWarningFn(IRB, Origin);
+ }
+ return;
+ }
+
+ const DataLayout &DL = OrigIns->getModule()->getDataLayout();
+
+ unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
+ unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
+ if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) {
+ FunctionCallee Fn = MS.MaybeWarningFn[SizeIndex];
+ Value *ConvertedShadow2 =
+ IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex)));
+ IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin
+ ? Origin
+ : (Value *)IRB.getInt32(0)});
+ } else {
Value *Cmp = convertToBool(ConvertedShadow, IRB, "_mscmp");
- Instruction *CheckTerm = SplitBlockAndInsertIfThen(
- Cmp, OrigIns,
- /* Unreachable */ !MS.Recover, MS.ColdCallWeights);
-
- IRB.SetInsertPoint(CheckTerm);
- insertWarningFn(IRB, Origin);
- LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
- }
- }
-
- void materializeChecks(bool InstrumentWithCalls) {
- for (const auto &ShadowData : InstrumentationList) {
- Instruction *OrigIns = ShadowData.OrigIns;
- Value *Shadow = ShadowData.Shadow;
- Value *Origin = ShadowData.Origin;
- materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
- }
- LLVM_DEBUG(dbgs() << "DONE:\n" << F);
- }
-
+ Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, OrigIns,
+ /* Unreachable */ !MS.Recover, MS.ColdCallWeights);
+
+ IRB.SetInsertPoint(CheckTerm);
+ insertWarningFn(IRB, Origin);
+ LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
+ }
+ }
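  // Illustrative sketch of the non-call path above (value names assumed; the
  // userspace warning callee is shown, kernel builds call __msan_warning):
  //
  //   %_mscmp = icmp ne iN %converted_shadow, 0
  //   br i1 %_mscmp, label %warn, label %cont, !prof !cold_weights
  // warn:
  //   call void @__msan_warning_with_origin_noreturn(i32 %origin)
  //   unreachable            ; only when !MS.Recover, otherwise branch to %cont
  // cont:
  //   ; original instruction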
+
+ void materializeChecks(bool InstrumentWithCalls) {
+ for (const auto &ShadowData : InstrumentationList) {
+ Instruction *OrigIns = ShadowData.OrigIns;
+ Value *Shadow = ShadowData.Shadow;
+ Value *Origin = ShadowData.Origin;
+ materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
+ }
+ LLVM_DEBUG(dbgs() << "DONE:\n" << F);
+ }
+
  // Set up the KMSAN prologue: fetch the per-task context state and point the
  // TLS members at its fields.
void insertKmsanPrologue(IRBuilder<> &IRB) {
- Value *ContextState = IRB.CreateCall(MS.MsanGetContextStateFn, {});
- Constant *Zero = IRB.getInt32(0);
- MS.ParamTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(0)}, "param_shadow");
- MS.RetvalTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(1)}, "retval_shadow");
- MS.VAArgTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(2)}, "va_arg_shadow");
- MS.VAArgOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(3)}, "va_arg_origin");
- MS.VAArgOverflowSizeTLS =
- IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(4)}, "va_arg_overflow_size");
- MS.ParamOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(5)}, "param_origin");
- MS.RetvalOriginTLS =
- IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(6)}, "retval_origin");
- }
-
- /// Add MemorySanitizer instrumentation to a function.
- bool runOnFunction() {
- // Iterate all BBs in depth-first order and create shadow instructions
- // for all instructions (where applicable).
- // For PHI nodes we create dummy shadow PHIs which will be finalized later.
+ Value *ContextState = IRB.CreateCall(MS.MsanGetContextStateFn, {});
+ Constant *Zero = IRB.getInt32(0);
+ MS.ParamTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(0)}, "param_shadow");
+ MS.RetvalTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(1)}, "retval_shadow");
+ MS.VAArgTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(2)}, "va_arg_shadow");
+ MS.VAArgOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(3)}, "va_arg_origin");
+ MS.VAArgOverflowSizeTLS =
+ IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(4)}, "va_arg_overflow_size");
+ MS.ParamOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(5)}, "param_origin");
+ MS.RetvalOriginTLS =
+ IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(6)}, "retval_origin");
+ }
+
+ /// Add MemorySanitizer instrumentation to a function.
+ bool runOnFunction() {
+ // Iterate all BBs in depth-first order and create shadow instructions
+ // for all instructions (where applicable).
+ // For PHI nodes we create dummy shadow PHIs which will be finalized later.
for (BasicBlock *BB : depth_first(FnPrologueEnd->getParent()))
- visit(*BB);
-
- // Finalize PHI nodes.
- for (PHINode *PN : ShadowPHINodes) {
- PHINode *PNS = cast<PHINode>(getShadow(PN));
- PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr;
- size_t NumValues = PN->getNumIncomingValues();
- for (size_t v = 0; v < NumValues; v++) {
- PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
- if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
- }
- }
-
- VAHelper->finalizeInstrumentation();
-
- // Poison llvm.lifetime.start intrinsics, if we haven't fallen back to
- // instrumenting only allocas.
- if (InstrumentLifetimeStart) {
- for (auto Item : LifetimeStartList) {
- instrumentAlloca(*Item.second, Item.first);
- AllocaSet.erase(Item.second);
- }
- }
- // Poison the allocas for which we didn't instrument the corresponding
- // lifetime intrinsics.
- for (AllocaInst *AI : AllocaSet)
- instrumentAlloca(*AI);
-
- bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 &&
- InstrumentationList.size() + StoreList.size() >
- (unsigned)ClInstrumentationWithCallThreshold;
-
- // Insert shadow value checks.
- materializeChecks(InstrumentWithCalls);
-
- // Delayed instrumentation of StoreInst.
- // This may not add new address checks.
- materializeStores(InstrumentWithCalls);
-
- return true;
- }
-
- /// Compute the shadow type that corresponds to a given Value.
- Type *getShadowTy(Value *V) {
- return getShadowTy(V->getType());
- }
-
- /// Compute the shadow type that corresponds to a given Type.
- Type *getShadowTy(Type *OrigTy) {
- if (!OrigTy->isSized()) {
- return nullptr;
- }
- // For integer type, shadow is the same as the original type.
- // This may return weird-sized types like i1.
- if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
- return IT;
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
- uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
- return FixedVectorType::get(IntegerType::get(*MS.C, EltSize),
- cast<FixedVectorType>(VT)->getNumElements());
- }
- if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
- return ArrayType::get(getShadowTy(AT->getElementType()),
- AT->getNumElements());
- }
- if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
- SmallVector<Type*, 4> Elements;
- for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
- Elements.push_back(getShadowTy(ST->getElementType(i)));
- StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
- LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
- return Res;
- }
- uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
- return IntegerType::get(*MS.C, TypeSize);
- }
-
- /// Flatten a vector type.
- Type *getShadowTyNoVec(Type *ty) {
- if (VectorType *vt = dyn_cast<VectorType>(ty))
- return IntegerType::get(*MS.C,
- vt->getPrimitiveSizeInBits().getFixedSize());
- return ty;
- }
-
+ visit(*BB);
+
+ // Finalize PHI nodes.
+ for (PHINode *PN : ShadowPHINodes) {
+ PHINode *PNS = cast<PHINode>(getShadow(PN));
+ PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr;
+ size_t NumValues = PN->getNumIncomingValues();
+ for (size_t v = 0; v < NumValues; v++) {
+ PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
+ if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
+ }
+ }
+
+ VAHelper->finalizeInstrumentation();
+
+ // Poison llvm.lifetime.start intrinsics, if we haven't fallen back to
+ // instrumenting only allocas.
+ if (InstrumentLifetimeStart) {
+ for (auto Item : LifetimeStartList) {
+ instrumentAlloca(*Item.second, Item.first);
+ AllocaSet.erase(Item.second);
+ }
+ }
+ // Poison the allocas for which we didn't instrument the corresponding
+ // lifetime intrinsics.
+ for (AllocaInst *AI : AllocaSet)
+ instrumentAlloca(*AI);
+
+ bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 &&
+ InstrumentationList.size() + StoreList.size() >
+ (unsigned)ClInstrumentationWithCallThreshold;
+
+ // Insert shadow value checks.
+ materializeChecks(InstrumentWithCalls);
+
+ // Delayed instrumentation of StoreInst.
+ // This may not add new address checks.
+ materializeStores(InstrumentWithCalls);
+
+ return true;
+ }
+
+ /// Compute the shadow type that corresponds to a given Value.
+ Type *getShadowTy(Value *V) {
+ return getShadowTy(V->getType());
+ }
+
+ /// Compute the shadow type that corresponds to a given Type.
+ Type *getShadowTy(Type *OrigTy) {
+ if (!OrigTy->isSized()) {
+ return nullptr;
+ }
+ // For integer type, shadow is the same as the original type.
+ // This may return weird-sized types like i1.
+ if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+ return IT;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
+ uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
+ return FixedVectorType::get(IntegerType::get(*MS.C, EltSize),
+ cast<FixedVectorType>(VT)->getNumElements());
+ }
+ if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
+ return ArrayType::get(getShadowTy(AT->getElementType()),
+ AT->getNumElements());
+ }
+ if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+ SmallVector<Type*, 4> Elements;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Elements.push_back(getShadowTy(ST->getElementType(i)));
+ StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+ LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+ return Res;
+ }
+ uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
+ return IntegerType::get(*MS.C, TypeSize);
+ }
+
+ /// Flatten a vector type.
+ Type *getShadowTyNoVec(Type *ty) {
+ if (VectorType *vt = dyn_cast<VectorType>(ty))
+ return IntegerType::get(*MS.C,
+ vt->getPrimitiveSizeInBits().getFixedSize());
+ return ty;
+ }
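  // Examples of the mapping above on a 64-bit target: i32 -> i32,
  // <4 x float> -> <4 x i32>, [2 x i64] -> [2 x i64],
  // { i8, i16* } -> { i8, i64 }, double -> i64. getShadowTyNoVec() then
  // flattens vectors for comparisons, e.g. <4 x i32> -> i128.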
+
/// Extract combined shadow of struct elements as a bool
Value *collapseStructShadow(StructType *Struct, Value *Shadow,
IRBuilder<> &IRB) {
@@ -1435,12 +1435,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return collapseStructShadow(Struct, V, IRB);
if (ArrayType *Array = dyn_cast<ArrayType>(V->getType()))
return collapseArrayShadow(Array, V, IRB);
- Type *Ty = V->getType();
- Type *NoVecTy = getShadowTyNoVec(Ty);
- if (Ty == NoVecTy) return V;
- return IRB.CreateBitCast(V, NoVecTy);
- }
-
+ Type *Ty = V->getType();
+ Type *NoVecTy = getShadowTyNoVec(Ty);
+ if (Ty == NoVecTy) return V;
+ return IRB.CreateBitCast(V, NoVecTy);
+ }
+
// Convert a scalar value to an i1 by comparing with 0
Value *convertToBool(Value *V, IRBuilder<> &IRB, const Twine &name = "") {
Type *VTy = V->getType();
@@ -1451,386 +1451,386 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateICmpNE(V, ConstantInt::get(VTy, 0), name);
}
- /// Compute the integer shadow offset that corresponds to a given
- /// application address.
- ///
- /// Offset = (Addr & ~AndMask) ^ XorMask
- Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
- Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy);
-
- uint64_t AndMask = MS.MapParams->AndMask;
- if (AndMask)
- OffsetLong =
- IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask));
-
- uint64_t XorMask = MS.MapParams->XorMask;
- if (XorMask)
- OffsetLong =
- IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask));
- return OffsetLong;
- }
-
- /// Compute the shadow and origin addresses corresponding to a given
- /// application address.
- ///
- /// Shadow = ShadowBase + Offset
- /// Origin = (OriginBase + Offset) & ~3ULL
- std::pair<Value *, Value *>
- getShadowOriginPtrUserspace(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy,
- MaybeAlign Alignment) {
- Value *ShadowOffset = getShadowPtrOffset(Addr, IRB);
- Value *ShadowLong = ShadowOffset;
- uint64_t ShadowBase = MS.MapParams->ShadowBase;
- if (ShadowBase != 0) {
- ShadowLong =
- IRB.CreateAdd(ShadowLong,
- ConstantInt::get(MS.IntptrTy, ShadowBase));
- }
- Value *ShadowPtr =
- IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
- Value *OriginPtr = nullptr;
- if (MS.TrackOrigins) {
- Value *OriginLong = ShadowOffset;
- uint64_t OriginBase = MS.MapParams->OriginBase;
- if (OriginBase != 0)
- OriginLong = IRB.CreateAdd(OriginLong,
- ConstantInt::get(MS.IntptrTy, OriginBase));
- if (!Alignment || *Alignment < kMinOriginAlignment) {
- uint64_t Mask = kMinOriginAlignment.value() - 1;
- OriginLong =
- IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask));
- }
- OriginPtr =
- IRB.CreateIntToPtr(OriginLong, PointerType::get(MS.OriginTy, 0));
- }
- return std::make_pair(ShadowPtr, OriginPtr);
- }
-
- std::pair<Value *, Value *> getShadowOriginPtrKernel(Value *Addr,
- IRBuilder<> &IRB,
- Type *ShadowTy,
- bool isStore) {
- Value *ShadowOriginPtrs;
- const DataLayout &DL = F.getParent()->getDataLayout();
- int Size = DL.getTypeStoreSize(ShadowTy);
-
- FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, Size);
- Value *AddrCast =
- IRB.CreatePointerCast(Addr, PointerType::get(IRB.getInt8Ty(), 0));
- if (Getter) {
- ShadowOriginPtrs = IRB.CreateCall(Getter, AddrCast);
- } else {
- Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
- ShadowOriginPtrs = IRB.CreateCall(isStore ? MS.MsanMetadataPtrForStoreN
- : MS.MsanMetadataPtrForLoadN,
- {AddrCast, SizeVal});
- }
- Value *ShadowPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 0);
- ShadowPtr = IRB.CreatePointerCast(ShadowPtr, PointerType::get(ShadowTy, 0));
- Value *OriginPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 1);
-
- return std::make_pair(ShadowPtr, OriginPtr);
- }
-
- std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB,
- Type *ShadowTy,
- MaybeAlign Alignment,
- bool isStore) {
- if (MS.CompileKernel)
- return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore);
- return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment);
- }
-
- /// Compute the shadow address for a given function argument.
- ///
- /// Shadow = ParamTLS+ArgOffset.
- Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
- int ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
- if (ArgOffset)
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
- "_msarg");
- }
-
- /// Compute the origin address for a given function argument.
- Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
- int ArgOffset) {
- if (!MS.TrackOrigins)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
- if (ArgOffset)
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
- "_msarg_o");
- }
-
- /// Compute the shadow address for a retval.
- Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
- return IRB.CreatePointerCast(MS.RetvalTLS,
- PointerType::get(getShadowTy(A), 0),
- "_msret");
- }
-
- /// Compute the origin address for a retval.
- Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
- // We keep a single origin for the entire retval. Might be too optimistic.
- return MS.RetvalOriginTLS;
- }
-
- /// Set SV to be the shadow value for V.
- void setShadow(Value *V, Value *SV) {
- assert(!ShadowMap.count(V) && "Values may only have one shadow");
- ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
- }
-
- /// Set Origin to be the origin value for V.
- void setOrigin(Value *V, Value *Origin) {
- if (!MS.TrackOrigins) return;
- assert(!OriginMap.count(V) && "Values may only have one origin");
- LLVM_DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
- OriginMap[V] = Origin;
- }
-
- Constant *getCleanShadow(Type *OrigTy) {
- Type *ShadowTy = getShadowTy(OrigTy);
- if (!ShadowTy)
- return nullptr;
- return Constant::getNullValue(ShadowTy);
- }
-
- /// Create a clean shadow value for a given value.
- ///
- /// Clean shadow (all zeroes) means all bits of the value are defined
- /// (initialized).
- Constant *getCleanShadow(Value *V) {
- return getCleanShadow(V->getType());
- }
-
- /// Create a dirty shadow of a given shadow type.
- Constant *getPoisonedShadow(Type *ShadowTy) {
- assert(ShadowTy);
- if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
- return Constant::getAllOnesValue(ShadowTy);
- if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
- SmallVector<Constant *, 4> Vals(AT->getNumElements(),
- getPoisonedShadow(AT->getElementType()));
- return ConstantArray::get(AT, Vals);
- }
- if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
- SmallVector<Constant *, 4> Vals;
- for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
- Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
- return ConstantStruct::get(ST, Vals);
- }
- llvm_unreachable("Unexpected shadow type");
- }
-
- /// Create a dirty shadow for a given value.
- Constant *getPoisonedShadow(Value *V) {
- Type *ShadowTy = getShadowTy(V);
- if (!ShadowTy)
- return nullptr;
- return getPoisonedShadow(ShadowTy);
- }
-
- /// Create a clean (zero) origin.
- Value *getCleanOrigin() {
- return Constant::getNullValue(MS.OriginTy);
- }
-
- /// Get the shadow value for a given Value.
- ///
- /// This function either returns the value set earlier with setShadow,
-  /// or extracts it from ParamTLS (for function arguments).
- Value *getShadow(Value *V) {
- if (!PropagateShadow) return getCleanShadow(V);
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->getMetadata("nosanitize"))
- return getCleanShadow(V);
- // For instructions the shadow is already stored in the map.
- Value *Shadow = ShadowMap[V];
- if (!Shadow) {
- LLVM_DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
- (void)I;
- assert(Shadow && "No shadow for a value");
- }
- return Shadow;
- }
- if (UndefValue *U = dyn_cast<UndefValue>(V)) {
- Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
- LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
- (void)U;
- return AllOnes;
- }
- if (Argument *A = dyn_cast<Argument>(V)) {
- // For arguments we compute the shadow on demand and store it in the map.
- Value **ShadowPtr = &ShadowMap[V];
- if (*ShadowPtr)
- return *ShadowPtr;
- Function *F = A->getParent();
+ /// Compute the integer shadow offset that corresponds to a given
+ /// application address.
+ ///
+ /// Offset = (Addr & ~AndMask) ^ XorMask
+ Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
+ Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy);
+
+ uint64_t AndMask = MS.MapParams->AndMask;
+ if (AndMask)
+ OffsetLong =
+ IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask));
+
+ uint64_t XorMask = MS.MapParams->XorMask;
+ if (XorMask)
+ OffsetLong =
+ IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask));
+ return OffsetLong;
+ }
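  // Worked example with hypothetical map parameters (AndMask == 0,
  // XorMask == 0x500000000000; the real values come from the platform tables
  // selected in initializeModule()): an application address 0x7fff80001234
  // maps to Offset = 0x7fff80001234 ^ 0x500000000000 = 0x2fff80001234.
  // ShadowBase / OriginBase are applied on top of this in
  // getShadowOriginPtrUserspace() below.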
+
+ /// Compute the shadow and origin addresses corresponding to a given
+ /// application address.
+ ///
+ /// Shadow = ShadowBase + Offset
+ /// Origin = (OriginBase + Offset) & ~3ULL
+ std::pair<Value *, Value *>
+ getShadowOriginPtrUserspace(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy,
+ MaybeAlign Alignment) {
+ Value *ShadowOffset = getShadowPtrOffset(Addr, IRB);
+ Value *ShadowLong = ShadowOffset;
+ uint64_t ShadowBase = MS.MapParams->ShadowBase;
+ if (ShadowBase != 0) {
+ ShadowLong =
+ IRB.CreateAdd(ShadowLong,
+ ConstantInt::get(MS.IntptrTy, ShadowBase));
+ }
+ Value *ShadowPtr =
+ IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
+ Value *OriginPtr = nullptr;
+ if (MS.TrackOrigins) {
+ Value *OriginLong = ShadowOffset;
+ uint64_t OriginBase = MS.MapParams->OriginBase;
+ if (OriginBase != 0)
+ OriginLong = IRB.CreateAdd(OriginLong,
+ ConstantInt::get(MS.IntptrTy, OriginBase));
+ if (!Alignment || *Alignment < kMinOriginAlignment) {
+ uint64_t Mask = kMinOriginAlignment.value() - 1;
+ OriginLong =
+ IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask));
+ }
+ OriginPtr =
+ IRB.CreateIntToPtr(OriginLong, PointerType::get(MS.OriginTy, 0));
+ }
+ return std::make_pair(ShadowPtr, OriginPtr);
+ }
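  // Continuing the example above: with ShadowBase == 0 the shadow pointer is
  // simply inttoptr(Offset), and with a hypothetical nonzero OriginBase the
  // origin pointer for an under-aligned access is
  // inttoptr((Offset + OriginBase) & ~3ULL), preserving the 4-byte
  // (kMinOriginAlignment) origin granularity.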
+
+ std::pair<Value *, Value *> getShadowOriginPtrKernel(Value *Addr,
+ IRBuilder<> &IRB,
+ Type *ShadowTy,
+ bool isStore) {
+ Value *ShadowOriginPtrs;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ int Size = DL.getTypeStoreSize(ShadowTy);
+
+ FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, Size);
+ Value *AddrCast =
+ IRB.CreatePointerCast(Addr, PointerType::get(IRB.getInt8Ty(), 0));
+ if (Getter) {
+ ShadowOriginPtrs = IRB.CreateCall(Getter, AddrCast);
+ } else {
+ Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
+ ShadowOriginPtrs = IRB.CreateCall(isStore ? MS.MsanMetadataPtrForStoreN
+ : MS.MsanMetadataPtrForLoadN,
+ {AddrCast, SizeVal});
+ }
+ Value *ShadowPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 0);
+ ShadowPtr = IRB.CreatePointerCast(ShadowPtr, PointerType::get(ShadowTy, 0));
+ Value *OriginPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 1);
+
+ return std::make_pair(ShadowPtr, OriginPtr);
+ }
+
+ std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB,
+ Type *ShadowTy,
+ MaybeAlign Alignment,
+ bool isStore) {
+ if (MS.CompileKernel)
+ return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore);
+ return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment);
+ }
+
+ /// Compute the shadow address for a given function argument.
+ ///
+ /// Shadow = ParamTLS+ArgOffset.
+ Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+ "_msarg");
+ }
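  // Sketch of the ParamTLS layout this indexes into: argument shadows are laid
  // out back to back, each slot padded to kShadowTLSAlignment by the code that
  // computes ArgOffset. E.g. for f(i32, double) on a 64-bit target the first
  // shadow lives at __msan_param_tls + 0 and the second at + 8 (assuming the
  // usual 8-byte TLS slot alignment), and the same byte offsets index
  // __msan_param_origin_tls via getOriginPtrForArgument() below.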
+
+ /// Compute the origin address for a given function argument.
+ Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ if (!MS.TrackOrigins)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_o");
+ }
+
+ /// Compute the shadow address for a retval.
+ Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+ return IRB.CreatePointerCast(MS.RetvalTLS,
+ PointerType::get(getShadowTy(A), 0),
+ "_msret");
+ }
+
+ /// Compute the origin address for a retval.
+ Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+ // We keep a single origin for the entire retval. Might be too optimistic.
+ return MS.RetvalOriginTLS;
+ }
+
+ /// Set SV to be the shadow value for V.
+ void setShadow(Value *V, Value *SV) {
+ assert(!ShadowMap.count(V) && "Values may only have one shadow");
+ ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
+ }
+
+ /// Set Origin to be the origin value for V.
+ void setOrigin(Value *V, Value *Origin) {
+ if (!MS.TrackOrigins) return;
+ assert(!OriginMap.count(V) && "Values may only have one origin");
+ LLVM_DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+ OriginMap[V] = Origin;
+ }
+
+ Constant *getCleanShadow(Type *OrigTy) {
+ Type *ShadowTy = getShadowTy(OrigTy);
+ if (!ShadowTy)
+ return nullptr;
+ return Constant::getNullValue(ShadowTy);
+ }
+
+ /// Create a clean shadow value for a given value.
+ ///
+ /// Clean shadow (all zeroes) means all bits of the value are defined
+ /// (initialized).
+ Constant *getCleanShadow(Value *V) {
+ return getCleanShadow(V->getType());
+ }
+
+ /// Create a dirty shadow of a given shadow type.
+ Constant *getPoisonedShadow(Type *ShadowTy) {
+ assert(ShadowTy);
+ if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+ return Constant::getAllOnesValue(ShadowTy);
+ if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
+ SmallVector<Constant *, 4> Vals(AT->getNumElements(),
+ getPoisonedShadow(AT->getElementType()));
+ return ConstantArray::get(AT, Vals);
+ }
+ if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
+ SmallVector<Constant *, 4> Vals;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+ return ConstantStruct::get(ST, Vals);
+ }
+ llvm_unreachable("Unexpected shadow type");
+ }
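+
+ // For example, for a shadow type { i32, [2 x i8] } the recursion above
+ // produces the constant { i32 -1, [2 x i8] [i8 -1, i8 -1] }: every scalar
+ // leaf of the aggregate ends up with all bits set (fully poisoned).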
+
+ /// Create a dirty shadow for a given value.
+ Constant *getPoisonedShadow(Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (!ShadowTy)
+ return nullptr;
+ return getPoisonedShadow(ShadowTy);
+ }
+
+ /// Create a clean (zero) origin.
+ Value *getCleanOrigin() {
+ return Constant::getNullValue(MS.OriginTy);
+ }
+
+ /// Get the shadow value for a given Value.
+ ///
+ /// This function either returns the value set earlier with setShadow,
+ /// or extracts it from ParamTLS (for function arguments).
+ Value *getShadow(Value *V) {
+ if (!PropagateShadow) return getCleanShadow(V);
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getMetadata("nosanitize"))
+ return getCleanShadow(V);
+ // For instructions the shadow is already stored in the map.
+ Value *Shadow = ShadowMap[V];
+ if (!Shadow) {
+ LLVM_DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+ (void)I;
+ assert(Shadow && "No shadow for a value");
+ }
+ return Shadow;
+ }
+ if (UndefValue *U = dyn_cast<UndefValue>(V)) {
+ Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+ LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+ (void)U;
+ return AllOnes;
+ }
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ // For arguments we compute the shadow on demand and store it in the map.
+ Value **ShadowPtr = &ShadowMap[V];
+ if (*ShadowPtr)
+ return *ShadowPtr;
+ Function *F = A->getParent();
IRBuilder<> EntryIRB(FnPrologueEnd);
- unsigned ArgOffset = 0;
- const DataLayout &DL = F->getParent()->getDataLayout();
- for (auto &FArg : F->args()) {
- if (!FArg.getType()->isSized()) {
- LLVM_DEBUG(dbgs() << "Arg is not sized\n");
- continue;
- }
-
- bool FArgByVal = FArg.hasByValAttr();
- bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef);
- bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef;
- unsigned Size =
- FArg.hasByValAttr()
- ? DL.getTypeAllocSize(FArg.getParamByValType())
- : DL.getTypeAllocSize(FArg.getType());
-
- if (A == &FArg) {
- bool Overflow = ArgOffset + Size > kParamTLSSize;
- if (FArgEagerCheck) {
- *ShadowPtr = getCleanShadow(V);
- setOrigin(A, getCleanOrigin());
- continue;
- } else if (FArgByVal) {
- Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
- // ByVal pointer itself has clean shadow. We copy the actual
- // argument shadow to the underlying memory.
- // Figure out maximal valid memcpy alignment.
- const Align ArgAlign = DL.getValueOrABITypeAlignment(
- MaybeAlign(FArg.getParamAlignment()), FArg.getParamByValType());
- Value *CpShadowPtr =
- getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign,
- /*isStore*/ true)
- .first;
- // TODO(glider): need to copy origins.
- if (Overflow) {
- // ParamTLS overflow.
- EntryIRB.CreateMemSet(
- CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()),
- Size, ArgAlign);
- } else {
- const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
- Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
- CopyAlign, Size);
- LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
- (void)Cpy;
- }
- *ShadowPtr = getCleanShadow(V);
- } else {
- // Shadow over TLS
- Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
- if (Overflow) {
- // ParamTLS overflow.
- *ShadowPtr = getCleanShadow(V);
- } else {
- *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
- kShadowTLSAlignment);
- }
- }
- LLVM_DEBUG(dbgs()
- << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
- if (MS.TrackOrigins && !Overflow) {
- Value *OriginPtr =
- getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
- setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
- } else {
- setOrigin(A, getCleanOrigin());
- }
+ unsigned ArgOffset = 0;
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ for (auto &FArg : F->args()) {
+ if (!FArg.getType()->isSized()) {
+ LLVM_DEBUG(dbgs() << "Arg is not sized\n");
+ continue;
+ }
+
+ bool FArgByVal = FArg.hasByValAttr();
+ bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef);
+ bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef;
+ unsigned Size =
+ FArg.hasByValAttr()
+ ? DL.getTypeAllocSize(FArg.getParamByValType())
+ : DL.getTypeAllocSize(FArg.getType());
+
+ if (A == &FArg) {
+ bool Overflow = ArgOffset + Size > kParamTLSSize;
+ if (FArgEagerCheck) {
+ *ShadowPtr = getCleanShadow(V);
+ setOrigin(A, getCleanOrigin());
+ continue;
+ } else if (FArgByVal) {
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ // ByVal pointer itself has clean shadow. We copy the actual
+ // argument shadow to the underlying memory.
+ // Figure out maximal valid memcpy alignment.
+ const Align ArgAlign = DL.getValueOrABITypeAlignment(
+ MaybeAlign(FArg.getParamAlignment()), FArg.getParamByValType());
+ Value *CpShadowPtr =
+ getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign,
+ /*isStore*/ true)
+ .first;
+ // TODO(glider): need to copy origins.
+ if (Overflow) {
+ // ParamTLS overflow.
+ EntryIRB.CreateMemSet(
+ CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()),
+ Size, ArgAlign);
+ } else {
+ const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
+ Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
+ CopyAlign, Size);
+ LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
+ (void)Cpy;
+ }
+ *ShadowPtr = getCleanShadow(V);
+ } else {
+ // Shadow over TLS
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ if (Overflow) {
+ // ParamTLS overflow.
+ *ShadowPtr = getCleanShadow(V);
+ } else {
+ *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
+ kShadowTLSAlignment);
+ }
+ }
+ LLVM_DEBUG(dbgs()
+ << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
+ if (MS.TrackOrigins && !Overflow) {
+ Value *OriginPtr =
+ getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
+ } else {
+ setOrigin(A, getCleanOrigin());
+ }
break;
- }
-
- if (!FArgEagerCheck)
- ArgOffset += alignTo(Size, kShadowTLSAlignment);
- }
- assert(*ShadowPtr && "Could not find shadow for an argument");
- return *ShadowPtr;
- }
- // For everything else the shadow is zero.
- return getCleanShadow(V);
- }
-
- /// Get the shadow for i-th argument of the instruction I.
- Value *getShadow(Instruction *I, int i) {
- return getShadow(I->getOperand(i));
- }
-
- /// Get the origin for a value.
- Value *getOrigin(Value *V) {
- if (!MS.TrackOrigins) return nullptr;
- if (!PropagateShadow) return getCleanOrigin();
- if (isa<Constant>(V)) return getCleanOrigin();
- assert((isa<Instruction>(V) || isa<Argument>(V)) &&
- "Unexpected value type in getOrigin()");
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->getMetadata("nosanitize"))
- return getCleanOrigin();
- }
- Value *Origin = OriginMap[V];
- assert(Origin && "Missing origin");
- return Origin;
- }
-
- /// Get the origin for i-th argument of the instruction I.
- Value *getOrigin(Instruction *I, int i) {
- return getOrigin(I->getOperand(i));
- }
-
- /// Remember the place where a shadow check should be inserted.
- ///
- /// This location will be later instrumented with a check that will print a
- /// UMR warning at runtime if the shadow value is not 0.
- void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
- assert(Shadow);
- if (!InsertChecks) return;
-#ifndef NDEBUG
- Type *ShadowTy = Shadow->getType();
+ }
+
+ if (!FArgEagerCheck)
+ ArgOffset += alignTo(Size, kShadowTLSAlignment);
+ }
+ assert(*ShadowPtr && "Could not find shadow for an argument");
+ return *ShadowPtr;
+ }
+ // For everything else the shadow is zero.
+ return getCleanShadow(V);
+ }
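+
+ // Illustration of the ParamTLS layout walked above, assuming the pass-wide
+ // kShadowTLSAlignment of 8 bytes: for a function f(i32 %a, i64 %b), the
+ // shadow of %a is loaded from ParamTLS+0 and the shadow of %b from
+ // ParamTLS+8, since every argument slot is rounded up to that alignment.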
+
+ /// Get the shadow for i-th argument of the instruction I.
+ Value *getShadow(Instruction *I, int i) {
+ return getShadow(I->getOperand(i));
+ }
+
+ /// Get the origin for a value.
+ Value *getOrigin(Value *V) {
+ if (!MS.TrackOrigins) return nullptr;
+ if (!PropagateShadow) return getCleanOrigin();
+ if (isa<Constant>(V)) return getCleanOrigin();
+ assert((isa<Instruction>(V) || isa<Argument>(V)) &&
+ "Unexpected value type in getOrigin()");
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getMetadata("nosanitize"))
+ return getCleanOrigin();
+ }
+ Value *Origin = OriginMap[V];
+ assert(Origin && "Missing origin");
+ return Origin;
+ }
+
+ /// Get the origin for i-th argument of the instruction I.
+ Value *getOrigin(Instruction *I, int i) {
+ return getOrigin(I->getOperand(i));
+ }
+
+ /// Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will be later instrumented with a check that will print a
+ /// UMR warning at runtime if the shadow value is not 0.
+ void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
+ assert(Shadow);
+ if (!InsertChecks) return;
+#ifndef NDEBUG
+ Type *ShadowTy = Shadow->getType();
assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) ||
isa<StructType>(ShadowTy) || isa<ArrayType>(ShadowTy)) &&
"Can only insert checks for integer, vector, and aggregate shadow "
"types");
-#endif
- InstrumentationList.push_back(
- ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
- }
-
- /// Remember the place where a shadow check should be inserted.
- ///
- /// This location will be later instrumented with a check that will print a
- /// UMR warning at runtime if the value is not fully defined.
- void insertShadowCheck(Value *Val, Instruction *OrigIns) {
- assert(Val);
- Value *Shadow, *Origin;
- if (ClCheckConstantShadow) {
- Shadow = getShadow(Val);
- if (!Shadow) return;
- Origin = getOrigin(Val);
- } else {
- Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
- if (!Shadow) return;
- Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
- }
- insertShadowCheck(Shadow, Origin, OrigIns);
- }
-
- AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
- switch (a) {
- case AtomicOrdering::NotAtomic:
- return AtomicOrdering::NotAtomic;
- case AtomicOrdering::Unordered:
- case AtomicOrdering::Monotonic:
- case AtomicOrdering::Release:
- return AtomicOrdering::Release;
- case AtomicOrdering::Acquire:
- case AtomicOrdering::AcquireRelease:
- return AtomicOrdering::AcquireRelease;
- case AtomicOrdering::SequentiallyConsistent:
- return AtomicOrdering::SequentiallyConsistent;
- }
- llvm_unreachable("Unknown ordering");
- }
-
+#endif
+ InstrumentationList.push_back(
+ ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+ }
+
+ /// Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will be later instrumented with a check that will print a
+ /// UMR warning at runtime if the value is not fully defined.
+ void insertShadowCheck(Value *Val, Instruction *OrigIns) {
+ assert(Val);
+ Value *Shadow, *Origin;
+ if (ClCheckConstantShadow) {
+ Shadow = getShadow(Val);
+ if (!Shadow) return;
+ Origin = getOrigin(Val);
+ } else {
+ Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+ if (!Shadow) return;
+ Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+ }
+ insertShadowCheck(Shadow, Origin, OrigIns);
+ }
+
+ AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
+ switch (a) {
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Release:
+ return AtomicOrdering::Release;
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
+ }
+
Value *makeAddReleaseOrderingTable(IRBuilder<> &IRB) {
constexpr int NumOrderings = (int)AtomicOrderingCABI::seq_cst + 1;
uint32_t OrderingTable[NumOrderings] = {};
@@ -1849,23 +1849,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
makeArrayRef(OrderingTable, NumOrderings));
}
- AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
- switch (a) {
- case AtomicOrdering::NotAtomic:
- return AtomicOrdering::NotAtomic;
- case AtomicOrdering::Unordered:
- case AtomicOrdering::Monotonic:
- case AtomicOrdering::Acquire:
- return AtomicOrdering::Acquire;
- case AtomicOrdering::Release:
- case AtomicOrdering::AcquireRelease:
- return AtomicOrdering::AcquireRelease;
- case AtomicOrdering::SequentiallyConsistent:
- return AtomicOrdering::SequentiallyConsistent;
- }
- llvm_unreachable("Unknown ordering");
- }
-
+ AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
+ switch (a) {
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
+ return AtomicOrdering::Acquire;
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
+ }
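+
+ // For example, addAcquireOrdering(Monotonic) returns Acquire: atomic loads
+ // are strengthened so that the plain (non-atomic) shadow load emitted next
+ // to them is ordered after the shadow written before the matching release
+ // store. Orderings that already imply acquire are returned unchanged.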
+
Value *makeAddAcquireOrderingTable(IRBuilder<> &IRB) {
constexpr int NumOrderings = (int)AtomicOrderingCABI::seq_cst + 1;
uint32_t OrderingTable[NumOrderings] = {};
@@ -1884,1353 +1884,1353 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
makeArrayRef(OrderingTable, NumOrderings));
}
- // ------------------- Visitors.
- using InstVisitor<MemorySanitizerVisitor>::visit;
- void visit(Instruction &I) {
+ // ------------------- Visitors.
+ using InstVisitor<MemorySanitizerVisitor>::visit;
+ void visit(Instruction &I) {
if (I.getMetadata("nosanitize"))
return;
// Don't want to visit if we're in the prologue
if (isInPrologue(I))
return;
InstVisitor<MemorySanitizerVisitor>::visit(I);
- }
-
- /// Instrument LoadInst
- ///
- /// Loads the corresponding shadow and (optionally) origin.
- /// Optionally, checks that the load address is fully defined.
- void visitLoadInst(LoadInst &I) {
- assert(I.getType()->isSized() && "Load type must have size");
- assert(!I.getMetadata("nosanitize"));
- IRBuilder<> IRB(I.getNextNode());
- Type *ShadowTy = getShadowTy(&I);
- Value *Addr = I.getPointerOperand();
- Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
- const Align Alignment = assumeAligned(I.getAlignment());
- if (PropagateShadow) {
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I,
- IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
- } else {
- setShadow(&I, getCleanShadow(&I));
- }
-
- if (ClCheckAccessAddress)
- insertShadowCheck(I.getPointerOperand(), &I);
-
- if (I.isAtomic())
- I.setOrdering(addAcquireOrdering(I.getOrdering()));
-
- if (MS.TrackOrigins) {
- if (PropagateShadow) {
- const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- setOrigin(
- &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment));
- } else {
- setOrigin(&I, getCleanOrigin());
- }
- }
- }
-
- /// Instrument StoreInst
- ///
- /// Stores the corresponding shadow and (optionally) origin.
- /// Optionally, checks that the store address is fully defined.
- void visitStoreInst(StoreInst &I) {
- StoreList.push_back(&I);
- if (ClCheckAccessAddress)
- insertShadowCheck(I.getPointerOperand(), &I);
- }
-
- void handleCASOrRMW(Instruction &I) {
- assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
-
- IRBuilder<> IRB(&I);
- Value *Addr = I.getOperand(0);
- Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align(1),
- /*isStore*/ true)
- .first;
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- // Only test the conditional argument of cmpxchg instruction.
- // The other argument can potentially be uninitialized, but we cannot
- // detect this situation reliably without possible false positives.
- if (isa<AtomicCmpXchgInst>(I))
- insertShadowCheck(I.getOperand(1), &I);
-
- IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
-
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitAtomicRMWInst(AtomicRMWInst &I) {
- handleCASOrRMW(I);
- I.setOrdering(addReleaseOrdering(I.getOrdering()));
- }
-
- void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
- handleCASOrRMW(I);
- I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering()));
- }
-
- // Vector manipulation.
- void visitExtractElementInst(ExtractElementInst &I) {
- insertShadowCheck(I.getOperand(1), &I);
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
- "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitInsertElementInst(InsertElementInst &I) {
- insertShadowCheck(I.getOperand(2), &I);
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
- I.getOperand(2), "_msprop"));
- setOriginForNaryOp(I);
- }
-
- void visitShuffleVectorInst(ShuffleVectorInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
- I.getShuffleMask(), "_msprop"));
- setOriginForNaryOp(I);
- }
-
- // Casts.
- void visitSExtInst(SExtInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitZExtInst(ZExtInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitTruncInst(TruncInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitBitCastInst(BitCastInst &I) {
- // Special case: if this is the bitcast (there is exactly 1 allowed) between
- // a musttail call and a ret, don't instrument. New instructions are not
- // allowed after a musttail call.
- if (auto *CI = dyn_cast<CallInst>(I.getOperand(0)))
- if (CI->isMustTailCall())
- return;
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitPtrToIntInst(PtrToIntInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
- "_msprop_ptrtoint"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitIntToPtrInst(IntToPtrInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
- "_msprop_inttoptr"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
- void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
- void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
- void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
- void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
- void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
-
- /// Propagate shadow for bitwise AND.
- ///
- /// This code is exact, i.e. if, for example, a bit in the left argument
- /// is defined and 0, then neither the value nor the definedness of the
- /// corresponding bit in B affects the resulting shadow.
- void visitAnd(BinaryOperator &I) {
- IRBuilder<> IRB(&I);
- // "And" of 0 and a poisoned value results in unpoisoned value.
- // 1&1 => 1; 0&1 => 0; p&1 => p;
- // 1&0 => 0; 0&0 => 0; p&0 => 0;
- // 1&p => p; 0&p => 0; p&p => p;
- // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *V1 = I.getOperand(0);
- Value *V2 = I.getOperand(1);
- if (V1->getType() != S1->getType()) {
- V1 = IRB.CreateIntCast(V1, S1->getType(), false);
- V2 = IRB.CreateIntCast(V2, S2->getType(), false);
- }
- Value *S1S2 = IRB.CreateAnd(S1, S2);
- Value *V1S2 = IRB.CreateAnd(V1, S2);
- Value *S1V2 = IRB.CreateAnd(S1, V2);
- setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
- setOriginForNaryOp(I);
- }
-
- void visitOr(BinaryOperator &I) {
- IRBuilder<> IRB(&I);
- // "Or" of 1 and a poisoned value results in unpoisoned value.
- // 1|1 => 1; 0|1 => 1; p|1 => 1;
- // 1|0 => 1; 0|0 => 0; p|0 => p;
- // 1|p => 1; 0|p => p; p|p => p;
- // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *V1 = IRB.CreateNot(I.getOperand(0));
- Value *V2 = IRB.CreateNot(I.getOperand(1));
- if (V1->getType() != S1->getType()) {
- V1 = IRB.CreateIntCast(V1, S1->getType(), false);
- V2 = IRB.CreateIntCast(V2, S2->getType(), false);
- }
- Value *S1S2 = IRB.CreateAnd(S1, S2);
- Value *V1S2 = IRB.CreateAnd(V1, S2);
- Value *S1V2 = IRB.CreateAnd(S1, V2);
- setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
- setOriginForNaryOp(I);
- }
-
- /// Default propagation of shadow and/or origin.
- ///
- /// This class implements the general case of shadow propagation, used in all
- /// cases where we don't know and/or don't care about what the operation
- /// actually does. It converts all input shadow values to a common type
- /// (extending or truncating as necessary), and bitwise OR's them.
- ///
- /// This is much cheaper than inserting checks (i.e. requiring inputs to be
- /// fully initialized), and less prone to false positives.
- ///
- /// This class also implements the general case of origin propagation. For a
- /// Nary operation, result origin is set to the origin of an argument that is
- /// not entirely initialized. If there is more than one such argument, the
- /// rightmost of them is picked. It does not matter which one is picked if all
- /// arguments are initialized.
- template <bool CombineShadow>
- class Combiner {
- Value *Shadow = nullptr;
- Value *Origin = nullptr;
- IRBuilder<> &IRB;
- MemorySanitizerVisitor *MSV;
-
- public:
- Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
- : IRB(IRB), MSV(MSV) {}
-
- /// Add a pair of shadow and origin values to the mix.
- Combiner &Add(Value *OpShadow, Value *OpOrigin) {
- if (CombineShadow) {
- assert(OpShadow);
- if (!Shadow)
- Shadow = OpShadow;
- else {
- OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
- Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
- }
- }
-
- if (MSV->MS.TrackOrigins) {
- assert(OpOrigin);
- if (!Origin) {
- Origin = OpOrigin;
- } else {
- Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
- // No point in adding something that might result in 0 origin value.
- if (!ConstOrigin || !ConstOrigin->isNullValue()) {
+ }
+
+ /// Instrument LoadInst
+ ///
+ /// Loads the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the load address is fully defined.
+ void visitLoadInst(LoadInst &I) {
+ assert(I.getType()->isSized() && "Load type must have size");
+ assert(!I.getMetadata("nosanitize"));
+ IRBuilder<> IRB(I.getNextNode());
+ Type *ShadowTy = getShadowTy(&I);
+ Value *Addr = I.getPointerOperand();
+ Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
+ const Align Alignment = assumeAligned(I.getAlignment());
+ if (PropagateShadow) {
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(I.getPointerOperand(), &I);
+
+ if (I.isAtomic())
+ I.setOrdering(addAcquireOrdering(I.getOrdering()));
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow) {
+ const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ setOrigin(
+ &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment));
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ }
+
+ /// Instrument StoreInst
+ ///
+ /// Stores the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the store address is fully defined.
+ void visitStoreInst(StoreInst &I) {
+ StoreList.push_back(&I);
+ if (ClCheckAccessAddress)
+ insertShadowCheck(I.getPointerOperand(), &I);
+ }
+
+ void handleCASOrRMW(Instruction &I) {
+ assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getOperand(0);
+ Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align(1),
+ /*isStore*/ true)
+ .first;
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ // Only test the conditional argument of cmpxchg instruction.
+ // The other argument can potentially be uninitialized, but we cannot
+ // detect this situation reliably without possible false positives.
+ if (isa<AtomicCmpXchgInst>(I))
+ insertShadowCheck(I.getOperand(1), &I);
+
+ IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
+
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitAtomicRMWInst(AtomicRMWInst &I) {
+ handleCASOrRMW(I);
+ I.setOrdering(addReleaseOrdering(I.getOrdering()));
+ }
+
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ handleCASOrRMW(I);
+ I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering()));
+ }
+
+ // Vector manipulation.
+ void visitExtractElementInst(ExtractElementInst &I) {
+ insertShadowCheck(I.getOperand(1), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
+ "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitInsertElementInst(InsertElementInst &I) {
+ insertShadowCheck(I.getOperand(2), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
+ I.getOperand(2), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShuffleVectorInst(ShuffleVectorInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
+ I.getShuffleMask(), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ // Casts.
+ void visitSExtInst(SExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitZExtInst(ZExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitTruncInst(TruncInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitBitCastInst(BitCastInst &I) {
+ // Special case: if this is the bitcast (there is exactly 1 allowed) between
+ // a musttail call and a ret, don't instrument. New instructions are not
+ // allowed after a musttail call.
+ if (auto *CI = dyn_cast<CallInst>(I.getOperand(0)))
+ if (CI->isMustTailCall())
+ return;
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_ptrtoint"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitIntToPtrInst(IntToPtrInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_inttoptr"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
+ void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
+
+ /// Propagate shadow for bitwise AND.
+ ///
+ /// This code is exact, i.e. if, for example, a bit in the left argument
+ /// is defined and 0, then neither the value nor the definedness of the
+ /// corresponding bit in B affects the resulting shadow.
+ void visitAnd(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "And" of 0 and a poisoned value results in unpoisoned value.
+ // 1&1 => 1; 0&1 => 0; p&1 => p;
+ // 1&0 => 0; 0&0 => 0; p&0 => 0;
+ // 1&p => p; 0&p => 0; p&p => p;
+ // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
+ setOriginForNaryOp(I);
+ }
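+
+ // Worked single-bit example of S = (S1 & S2) | (V1 & S2) | (S1 & V2):
+ // a defined 0 (V1=0, S1=0) against a poisoned bit (S2=1) gives S = 0|0|0 = 0,
+ // matching 0&p => 0 above; a defined 1 (V1=1, S1=0) against a poisoned bit
+ // gives S = 0|1|0 = 1, matching 1&p => p.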
+
+ void visitOr(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "Or" of 1 and a poisoned value results in unpoisoned value.
+ // 1|1 => 1; 0|1 => 1; p|1 => 1;
+ // 1|0 => 1; 0|0 => 0; p|0 => p;
+ // 1|p => 1; 0|p => p; p|p => p;
+ // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = IRB.CreateNot(I.getOperand(0));
+ Value *V2 = IRB.CreateNot(I.getOperand(1));
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
+ setOriginForNaryOp(I);
+ }
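+
+ // The OR case is symmetric with the operand values inverted: a defined 1
+ // (so ~V1 = 0) against a poisoned bit yields S = 0|0|0 = 0, matching
+ // 1|p => 1, while a defined 0 keeps the poison, matching 0|p => p.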
+
+ /// Default propagation of shadow and/or origin.
+ ///
+ /// This class implements the general case of shadow propagation, used in all
+ /// cases where we don't know and/or don't care about what the operation
+ /// actually does. It converts all input shadow values to a common type
+ /// (extending or truncating as necessary), and bitwise OR's them.
+ ///
+ /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+ /// fully initialized), and less prone to false positives.
+ ///
+ /// This class also implements the general case of origin propagation. For a
+ /// Nary operation, result origin is set to the origin of an argument that is
+ /// not entirely initialized. If there is more than one such argument, the
+ /// rightmost of them is picked. It does not matter which one is picked if all
+ /// arguments are initialized.
+ template <bool CombineShadow>
+ class Combiner {
+ Value *Shadow = nullptr;
+ Value *Origin = nullptr;
+ IRBuilder<> &IRB;
+ MemorySanitizerVisitor *MSV;
+
+ public:
+ Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
+ : IRB(IRB), MSV(MSV) {}
+
+ /// Add a pair of shadow and origin values to the mix.
+ Combiner &Add(Value *OpShadow, Value *OpOrigin) {
+ if (CombineShadow) {
+ assert(OpShadow);
+ if (!Shadow)
+ Shadow = OpShadow;
+ else {
+ OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
+ Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
+ }
+ }
+
+ if (MSV->MS.TrackOrigins) {
+ assert(OpOrigin);
+ if (!Origin) {
+ Origin = OpOrigin;
+ } else {
+ Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
+ // No point in adding something that might result in 0 origin value.
+ if (!ConstOrigin || !ConstOrigin->isNullValue()) {
Value *FlatShadow = MSV->convertShadowToScalar(OpShadow, IRB);
- Value *Cond =
- IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow));
- Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
- }
- }
- }
- return *this;
- }
-
- /// Add an application value to the mix.
- Combiner &Add(Value *V) {
- Value *OpShadow = MSV->getShadow(V);
- Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
- return Add(OpShadow, OpOrigin);
- }
-
- /// Set the current combined values as the given instruction's shadow
- /// and origin.
- void Done(Instruction *I) {
- if (CombineShadow) {
- assert(Shadow);
- Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
- MSV->setShadow(I, Shadow);
- }
- if (MSV->MS.TrackOrigins) {
- assert(Origin);
- MSV->setOrigin(I, Origin);
- }
- }
- };
-
- using ShadowAndOriginCombiner = Combiner<true>;
- using OriginCombiner = Combiner<false>;
-
- /// Propagate origin for arbitrary operation.
- void setOriginForNaryOp(Instruction &I) {
- if (!MS.TrackOrigins) return;
- IRBuilder<> IRB(&I);
- OriginCombiner OC(this, IRB);
- for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
- OC.Add(OI->get());
- OC.Done(&I);
- }
-
- size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
- assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
- "Vector of pointers is not a valid shadow type");
- return Ty->isVectorTy() ? cast<FixedVectorType>(Ty)->getNumElements() *
- Ty->getScalarSizeInBits()
- : Ty->getPrimitiveSizeInBits();
- }
-
- /// Cast between two shadow types, extending or truncating as
- /// necessary.
- Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
- bool Signed = false) {
- Type *srcTy = V->getType();
- size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
- size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
- if (srcSizeInBits > 1 && dstSizeInBits == 1)
- return IRB.CreateICmpNE(V, getCleanShadow(V));
-
- if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
- return IRB.CreateIntCast(V, dstTy, Signed);
- if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
- cast<FixedVectorType>(dstTy)->getNumElements() ==
- cast<FixedVectorType>(srcTy)->getNumElements())
- return IRB.CreateIntCast(V, dstTy, Signed);
- Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
- Value *V2 =
- IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
- return IRB.CreateBitCast(V2, dstTy);
- // TODO: handle struct types.
- }
-
- /// Cast an application value to the type of its own shadow.
- Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
- Type *ShadowTy = getShadowTy(V);
- if (V->getType() == ShadowTy)
- return V;
- if (V->getType()->isPtrOrPtrVectorTy())
- return IRB.CreatePtrToInt(V, ShadowTy);
- else
- return IRB.CreateBitCast(V, ShadowTy);
- }
-
- /// Propagate shadow for arbitrary operation.
- void handleShadowOr(Instruction &I) {
- IRBuilder<> IRB(&I);
- ShadowAndOriginCombiner SC(this, IRB);
- for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
- SC.Add(OI->get());
- SC.Done(&I);
- }
-
- void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }
-
- // Handle multiplication by constant.
- //
- // Handle a special case of multiplication by constant that may have one or
- // more zeros in the lower bits. This makes the corresponding number of lower bits
- // of the result zero as well. We model it by shifting the other operand
- // shadow left by the required number of bits. Effectively, we transform
- // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B).
- // We use multiplication by 2**N instead of shift to cover the case of
- // multiplication by 0, which may occur in some elements of a vector operand.
- void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
- Value *OtherArg) {
- Constant *ShadowMul;
- Type *Ty = ConstArg->getType();
- if (auto *VTy = dyn_cast<VectorType>(Ty)) {
- unsigned NumElements = cast<FixedVectorType>(VTy)->getNumElements();
- Type *EltTy = VTy->getElementType();
- SmallVector<Constant *, 16> Elements;
- for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
- if (ConstantInt *Elt =
- dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) {
- const APInt &V = Elt->getValue();
- APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
- Elements.push_back(ConstantInt::get(EltTy, V2));
- } else {
- Elements.push_back(ConstantInt::get(EltTy, 1));
- }
- }
- ShadowMul = ConstantVector::get(Elements);
- } else {
- if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) {
- const APInt &V = Elt->getValue();
- APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
- ShadowMul = ConstantInt::get(Ty, V2);
- } else {
- ShadowMul = ConstantInt::get(Ty, 1);
- }
- }
-
- IRBuilder<> IRB(&I);
- setShadow(&I,
- IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst"));
- setOrigin(&I, getOrigin(OtherArg));
- }
-
- void visitMul(BinaryOperator &I) {
- Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
- Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
- if (constOp0 && !constOp1)
- handleMulByConstant(I, constOp0, I.getOperand(1));
- else if (constOp1 && !constOp0)
- handleMulByConstant(I, constOp1, I.getOperand(0));
- else
- handleShadowOr(I);
- }
-
- void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
- void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
- void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
- void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
- void visitSub(BinaryOperator &I) { handleShadowOr(I); }
- void visitXor(BinaryOperator &I) { handleShadowOr(I); }
-
- void handleIntegerDiv(Instruction &I) {
- IRBuilder<> IRB(&I);
- // Strict on the second argument.
- insertShadowCheck(I.getOperand(1), &I);
- setShadow(&I, getShadow(&I, 0));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
- void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
- void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
- void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
-
- // Floating point division is side-effect free. We cannot require that the
- // divisor is fully initialized and must propagate shadow. See PR37523.
- void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
- void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
-
- /// Instrument == and != comparisons.
- ///
- /// Sometimes the comparison result is known even if some of the bits of the
- /// arguments are not.
- void handleEqualityComparison(ICmpInst &I) {
- IRBuilder<> IRB(&I);
- Value *A = I.getOperand(0);
- Value *B = I.getOperand(1);
- Value *Sa = getShadow(A);
- Value *Sb = getShadow(B);
-
- // Get rid of pointers and vectors of pointers.
- // For ints (and vectors of ints), types of A and Sa match,
- // and this is a no-op.
- A = IRB.CreatePointerCast(A, Sa->getType());
- B = IRB.CreatePointerCast(B, Sb->getType());
-
- // A == B <==> (C = A^B) == 0
- // A != B <==> (C = A^B) != 0
- // Sc = Sa | Sb
- Value *C = IRB.CreateXor(A, B);
- Value *Sc = IRB.CreateOr(Sa, Sb);
- // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
- // Result is defined if one of the following is true
- // * there is a defined 1 bit in C
- // * C is fully defined
- // Si = !(C & ~Sc) && Sc
- Value *Zero = Constant::getNullValue(Sc->getType());
- Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
- Value *Si =
- IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
- IRB.CreateICmpEQ(
- IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
- Si->setName("_msprop_icmp");
- setShadow(&I, Si);
- setOriginForNaryOp(I);
- }
-
- /// Build the lowest possible value of V, taking into account V's
- /// uninitialized bits.
- Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
- bool isSigned) {
- if (isSigned) {
- // Split shadow into sign bit and other bits.
- Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
- Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
- // Maximise the undefined shadow bit, minimize other undefined bits.
- return
- IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
- } else {
- // Minimize undefined bits.
- return IRB.CreateAnd(A, IRB.CreateNot(Sa));
- }
- }
-
- /// Build the highest possible value of V, taking into account V's
- /// uninitialized bits.
- Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
- bool isSigned) {
- if (isSigned) {
- // Split shadow into sign bit and other bits.
- Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
- Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
- // Minimise the undefined shadow bit, maximise other undefined bits.
- return
- IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
- } else {
- // Maximize undefined bits.
- return IRB.CreateOr(A, Sa);
- }
- }
-
- /// Instrument relational comparisons.
- ///
- /// This function does exact shadow propagation for all relational
- /// comparisons of integers, pointers and vectors of those.
- /// FIXME: output seems suboptimal when one of the operands is a constant
- void handleRelationalComparisonExact(ICmpInst &I) {
- IRBuilder<> IRB(&I);
- Value *A = I.getOperand(0);
- Value *B = I.getOperand(1);
- Value *Sa = getShadow(A);
- Value *Sb = getShadow(B);
-
- // Get rid of pointers and vectors of pointers.
- // For ints (and vectors of ints), types of A and Sa match,
- // and this is a no-op.
- A = IRB.CreatePointerCast(A, Sa->getType());
- B = IRB.CreatePointerCast(B, Sb->getType());
-
- // Let [a0, a1] be the interval of possible values of A, taking into account
- // its undefined bits. Let [b0, b1] be the interval of possible values of B.
- // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
- bool IsSigned = I.isSigned();
- Value *S1 = IRB.CreateICmp(I.getPredicate(),
- getLowestPossibleValue(IRB, A, Sa, IsSigned),
- getHighestPossibleValue(IRB, B, Sb, IsSigned));
- Value *S2 = IRB.CreateICmp(I.getPredicate(),
- getHighestPossibleValue(IRB, A, Sa, IsSigned),
- getLowestPossibleValue(IRB, B, Sb, IsSigned));
- Value *Si = IRB.CreateXor(S1, S2);
- setShadow(&I, Si);
- setOriginForNaryOp(I);
- }
-
- /// Instrument signed relational comparisons.
- ///
- /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
- /// bit of the shadow. Everything else is delegated to handleShadowOr().
- void handleSignedRelationalComparison(ICmpInst &I) {
- Constant *constOp;
- Value *op = nullptr;
- CmpInst::Predicate pre;
- if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) {
- op = I.getOperand(0);
- pre = I.getPredicate();
- } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) {
- op = I.getOperand(1);
- pre = I.getSwappedPredicate();
- } else {
- handleShadowOr(I);
- return;
- }
-
- if ((constOp->isNullValue() &&
- (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) ||
- (constOp->isAllOnesValue() &&
- (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) {
- IRBuilder<> IRB(&I);
- Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op),
- "_msprop_icmp_s");
- setShadow(&I, Shadow);
- setOrigin(&I, getOrigin(op));
- } else {
- handleShadowOr(I);
- }
- }
-
- void visitICmpInst(ICmpInst &I) {
- if (!ClHandleICmp) {
- handleShadowOr(I);
- return;
- }
- if (I.isEquality()) {
- handleEqualityComparison(I);
- return;
- }
-
- assert(I.isRelational());
- if (ClHandleICmpExact) {
- handleRelationalComparisonExact(I);
- return;
- }
- if (I.isSigned()) {
- handleSignedRelationalComparison(I);
- return;
- }
-
- assert(I.isUnsigned());
- if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
- handleRelationalComparisonExact(I);
- return;
- }
-
- handleShadowOr(I);
- }
-
- void visitFCmpInst(FCmpInst &I) {
- handleShadowOr(I);
- }
-
- void handleShift(BinaryOperator &I) {
- IRBuilder<> IRB(&I);
- // If any of the S2 bits are poisoned, the whole thing is poisoned.
- // Otherwise perform the same shift on S1.
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
- S2->getType());
- Value *V2 = I.getOperand(1);
- Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
- setShadow(&I, IRB.CreateOr(Shift, S2Conv));
- setOriginForNaryOp(I);
- }
-
- void visitShl(BinaryOperator &I) { handleShift(I); }
- void visitAShr(BinaryOperator &I) { handleShift(I); }
- void visitLShr(BinaryOperator &I) { handleShift(I); }
-
- /// Instrument llvm.memmove
- ///
- /// At this point we don't know if llvm.memmove will be inlined or not.
- /// If we don't instrument it and it gets inlined,
- /// our interceptor will not kick in and we will lose the memmove.
- /// If we instrument the call here, but it does not get inlined,
- /// we will memmove the shadow twice, which is bad in case
- /// of overlapping regions. So, we simply lower the intrinsic to a call.
- ///
- /// Similar situation exists for memcpy and memset.
- void visitMemMoveInst(MemMoveInst &I) {
- IRBuilder<> IRB(&I);
- IRB.CreateCall(
- MS.MemmoveFn,
- {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
- I.eraseFromParent();
- }
-
- // Similar to memmove: avoid copying shadow twice.
- // This is somewhat unfortunate as it may slow down small constant memcpys.
- // FIXME: consider doing manual inline for small constant sizes and proper
- // alignment.
- void visitMemCpyInst(MemCpyInst &I) {
- IRBuilder<> IRB(&I);
- IRB.CreateCall(
- MS.MemcpyFn,
- {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
- I.eraseFromParent();
- }
-
- // Same as memcpy.
- void visitMemSetInst(MemSetInst &I) {
- IRBuilder<> IRB(&I);
- IRB.CreateCall(
- MS.MemsetFn,
- {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
- I.eraseFromParent();
- }
-
- void visitVAStartInst(VAStartInst &I) {
- VAHelper->visitVAStartInst(I);
- }
-
- void visitVACopyInst(VACopyInst &I) {
- VAHelper->visitVACopyInst(I);
- }
-
- /// Handle vector store-like intrinsics.
- ///
- /// Instrument intrinsics that look like a simple SIMD store: writes memory,
- /// has 1 pointer argument and 1 vector argument, returns void.
- bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value* Addr = I.getArgOperand(0);
- Value *Shadow = getShadow(&I, 1);
- Value *ShadowPtr, *OriginPtr;
-
- // We don't know the pointer alignment (could be unaligned SSE store!).
- // Have to assume the worst case.
- std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
- Addr, IRB, Shadow->getType(), Align(1), /*isStore*/ true);
- IRB.CreateAlignedStore(Shadow, ShadowPtr, Align(1));
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- // FIXME: factor out common code from materializeStores
- if (MS.TrackOrigins) IRB.CreateStore(getOrigin(&I, 1), OriginPtr);
- return true;
- }
-
- /// Handle vector load-like intrinsics.
- ///
- /// Instrument intrinsics that look like a simple SIMD load: reads memory,
- /// has 1 pointer argument, returns a vector.
- bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *Addr = I.getArgOperand(0);
-
- Type *ShadowTy = getShadowTy(&I);
- Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
- if (PropagateShadow) {
- // We don't know the pointer alignment (could be unaligned SSE load!).
- // Have to assume the worst case.
- const Align Alignment = Align(1);
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I,
- IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
- } else {
- setShadow(&I, getCleanShadow(&I));
- }
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- if (MS.TrackOrigins) {
- if (PropagateShadow)
- setOrigin(&I, IRB.CreateLoad(MS.OriginTy, OriginPtr));
- else
- setOrigin(&I, getCleanOrigin());
- }
- return true;
- }
-
- /// Handle (SIMD arithmetic)-like intrinsics.
- ///
- /// Instrument intrinsics with any number of arguments of the same type,
- /// equal to the return type. The type should be simple (no aggregates or
- /// pointers; vectors are fine).
- /// Caller guarantees that this intrinsic does not access memory.
- bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
- Type *RetTy = I.getType();
- if (!(RetTy->isIntOrIntVectorTy() ||
- RetTy->isFPOrFPVectorTy() ||
- RetTy->isX86_MMXTy()))
- return false;
-
- unsigned NumArgOperands = I.getNumArgOperands();
- for (unsigned i = 0; i < NumArgOperands; ++i) {
- Type *Ty = I.getArgOperand(i)->getType();
- if (Ty != RetTy)
- return false;
- }
-
- IRBuilder<> IRB(&I);
- ShadowAndOriginCombiner SC(this, IRB);
- for (unsigned i = 0; i < NumArgOperands; ++i)
- SC.Add(I.getArgOperand(i));
- SC.Done(&I);
-
- return true;
- }
-
- /// Heuristically instrument unknown intrinsics.
- ///
- /// The main purpose of this code is to do something reasonable with all
- /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
- /// We recognize several classes of intrinsics by their argument types and
- /// ModRefBehaviour and apply special instrumentation when we are reasonably
- /// sure that we know what the intrinsic does.
- ///
- /// We special-case intrinsics where this approach fails. See llvm.bswap
- /// handling as an example of that.
- bool handleUnknownIntrinsic(IntrinsicInst &I) {
- unsigned NumArgOperands = I.getNumArgOperands();
- if (NumArgOperands == 0)
- return false;
-
- if (NumArgOperands == 2 &&
- I.getArgOperand(0)->getType()->isPointerTy() &&
- I.getArgOperand(1)->getType()->isVectorTy() &&
- I.getType()->isVoidTy() &&
- !I.onlyReadsMemory()) {
- // This looks like a vector store.
- return handleVectorStoreIntrinsic(I);
- }
-
- if (NumArgOperands == 1 &&
- I.getArgOperand(0)->getType()->isPointerTy() &&
- I.getType()->isVectorTy() &&
- I.onlyReadsMemory()) {
- // This looks like a vector load.
- return handleVectorLoadIntrinsic(I);
- }
-
- if (I.doesNotAccessMemory())
- if (maybeHandleSimpleNomemIntrinsic(I))
- return true;
-
- // FIXME: detect and handle SSE maskstore/maskload
- return false;
- }
-
- void handleInvariantGroup(IntrinsicInst &I) {
- setShadow(&I, getShadow(&I, 0));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void handleLifetimeStart(IntrinsicInst &I) {
- if (!PoisonStack)
- return;
+ Value *Cond =
+ IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow));
+ Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+ }
+ }
+ }
+ return *this;
+ }
+
+ /// Add an application value to the mix.
+ Combiner &Add(Value *V) {
+ Value *OpShadow = MSV->getShadow(V);
+ Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
+ return Add(OpShadow, OpOrigin);
+ }
+
+ /// Set the current combined values as the given instruction's shadow
+ /// and origin.
+ void Done(Instruction *I) {
+ if (CombineShadow) {
+ assert(Shadow);
+ Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
+ MSV->setShadow(I, Shadow);
+ }
+ if (MSV->MS.TrackOrigins) {
+ assert(Origin);
+ MSV->setOrigin(I, Origin);
+ }
+ }
+ };
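+
+ // Typical use (cf. handleShadowOr() below): construct a combiner, Add() each
+ // operand, then Done() to install the OR-ed shadow and, when origins are
+ // tracked, the origin of the rightmost operand whose shadow is non-zero:
+ // ShadowAndOriginCombiner SC(this, IRB);
+ // SC.Add(I.getOperand(0)).Add(I.getOperand(1));
+ // SC.Done(&I);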
+
+ using ShadowAndOriginCombiner = Combiner<true>;
+ using OriginCombiner = Combiner<false>;
+
+ /// Propagate origin for arbitrary operation.
+ void setOriginForNaryOp(Instruction &I) {
+ if (!MS.TrackOrigins) return;
+ IRBuilder<> IRB(&I);
+ OriginCombiner OC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ OC.Add(OI->get());
+ OC.Done(&I);
+ }
+
+ size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
+ assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
+ "Vector of pointers is not a valid shadow type");
+ return Ty->isVectorTy() ? cast<FixedVectorType>(Ty)->getNumElements() *
+ Ty->getScalarSizeInBits()
+ : Ty->getPrimitiveSizeInBits();
+ }
+
+ /// Cast between two shadow types, extending or truncating as
+ /// necessary.
+ Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+ bool Signed = false) {
+ Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
+ if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
+ return IRB.CreateIntCast(V, dstTy, Signed);
+ if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
+ cast<FixedVectorType>(dstTy)->getNumElements() ==
+ cast<FixedVectorType>(srcTy)->getNumElements())
+ return IRB.CreateIntCast(V, dstTy, Signed);
+ Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
+ Value *V2 =
+ IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
+ return IRB.CreateBitCast(V2, dstTy);
+ // TODO: handle struct types.
+ }
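+
+ // For example, casting a <4 x i32> shadow to an i64 shadow takes the last
+ // path above: bitcast to i128, integer cast (truncate) to i64, then a no-op
+ // bitcast to the destination type. Integer-to-integer casts and casts
+ // between vectors of the same length use a single IntCast instead.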
+
+ /// Cast an application value to the type of its own shadow.
+ Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (V->getType() == ShadowTy)
+ return V;
+ if (V->getType()->isPtrOrPtrVectorTy())
+ return IRB.CreatePtrToInt(V, ShadowTy);
+ else
+ return IRB.CreateBitCast(V, ShadowTy);
+ }
+
+ /// Propagate shadow for arbitrary operation.
+ void handleShadowOr(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ SC.Add(OI->get());
+ SC.Done(&I);
+ }
+
+ void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }
+
+ // Handle multiplication by constant.
+ //
+ // Handle a special case of multiplication by constant that may have one or
+ // more zeros in the lower bits. This makes the corresponding number of lower bits
+ // of the result zero as well. We model it by shifting the other operand
+ // shadow left by the required number of bits. Effectively, we transform
+ // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B).
+ // We use multiplication by 2**N instead of shift to cover the case of
+ // multiplication by 0, which may occur in some elements of a vector operand.
+ void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
+ Value *OtherArg) {
+ Constant *ShadowMul;
+ Type *Ty = ConstArg->getType();
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ unsigned NumElements = cast<FixedVectorType>(VTy)->getNumElements();
+ Type *EltTy = VTy->getElementType();
+ SmallVector<Constant *, 16> Elements;
+ for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+ if (ConstantInt *Elt =
+ dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) {
+ const APInt &V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ Elements.push_back(ConstantInt::get(EltTy, V2));
+ } else {
+ Elements.push_back(ConstantInt::get(EltTy, 1));
+ }
+ }
+ ShadowMul = ConstantVector::get(Elements);
+ } else {
+ if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) {
+ const APInt &V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ ShadowMul = ConstantInt::get(Ty, V2);
+ } else {
+ ShadowMul = ConstantInt::get(Ty, 1);
+ }
+ }
+
+ IRBuilder<> IRB(&I);
+ setShadow(&I,
+ IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst"));
+ setOrigin(&I, getOrigin(OtherArg));
+ }
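+
+ // Worked example: for X * 12, the constant 12 = 3 * 2**2 has two trailing
+ // zero bits, so ShadowMul is 4 and the result shadow is Sx * 4, i.e. Sx << 2.
+ // The two low bits of X * 12 are always zero, hence defined, and the
+ // remaining shadow bits are shifted into their new positions.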
+
+ void visitMul(BinaryOperator &I) {
+ Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+ Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+ if (constOp0 && !constOp1)
+ handleMulByConstant(I, constOp0, I.getOperand(1));
+ else if (constOp1 && !constOp0)
+ handleMulByConstant(I, constOp1, I.getOperand(0));
+ else
+ handleShadowOr(I);
+ }
+
+ void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
+ void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitXor(BinaryOperator &I) { handleShadowOr(I); }
+
+ void handleIntegerDiv(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ // Strict on the second argument.
+ insertShadowCheck(I.getOperand(1), &I);
+ setShadow(&I, getShadow(&I, 0));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
+
+ // Floating point division is side-effect free. We cannot require that the
+ // divisor is fully initialized and must propagate shadow. See PR37523.
+ void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
+
+ /// Instrument == and != comparisons.
+ ///
+ /// Sometimes the comparison result is known even if some of the bits of the
+ /// arguments are not.
+ void handleEqualityComparison(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // A == B <==> (C = A^B) == 0
+ // A != B <==> (C = A^B) != 0
+ // Sc = Sa | Sb
+ Value *C = IRB.CreateXor(A, B);
+ Value *Sc = IRB.CreateOr(Sa, Sb);
+ // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
+ // Result is defined if one of the following is true
+ // * there is a defined 1 bit in C
+ // * C is fully defined
+ // Si = !(C & ~Sc) && Sc
+ Value *Zero = Constant::getNullValue(Sc->getType());
+ Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+ Value *Si =
+ IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+ IRB.CreateICmpEQ(
+ IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+ Si->setName("_msprop_icmp");
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// Build the lowest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Maximise the undefined shadow bit, minimize other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
+ } else {
+ // Minimize undefined bits.
+ return IRB.CreateAnd(A, IRB.CreateNot(Sa));
+ }
+ }
+
+ /// Build the highest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Minimise the undefined shadow bit, maximise other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
+ } else {
+ // Maximize undefined bits.
+ return IRB.CreateOr(A, Sa);
+ }
+ }
+
+ /// Instrument relational comparisons.
+ ///
+ /// This function does exact shadow propagation for all relational
+ /// comparisons of integers, pointers and vectors of those.
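+ ///
+ /// For example, for an unsigned A whose defined bits are 0x30 and whose shadow
+ /// is Sa = 0x0F, the possible values form the interval [0x30, 0x3F]:
+ /// getLowestPossibleValue clears the poisoned bits (A & ~Sa) and
+ /// getHighestPossibleValue sets them (A | Sa).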
+ /// FIXME: output seems suboptimal when one of the operands is a constant
+ void handleRelationalComparisonExact(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // Let [a0, a1] be the interval of possible values of A, taking into account
+ // its undefined bits. Let [b0, b1] be the interval of possible values of B.
+ // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
+ bool IsSigned = I.isSigned();
+ Value *S1 = IRB.CreateICmp(I.getPredicate(),
+ getLowestPossibleValue(IRB, A, Sa, IsSigned),
+ getHighestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *S2 = IRB.CreateICmp(I.getPredicate(),
+ getHighestPossibleValue(IRB, A, Sa, IsSigned),
+ getLowestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *Si = IRB.CreateXor(S1, S2);
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// Instrument signed relational comparisons.
+ ///
+ /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
+ /// bit of the shadow. Everything else is delegated to handleShadowOr().
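+ ///
+ /// For x < 0 the result is exactly the sign bit of x, so its shadow is exactly
+ /// the sign bit of Sx, which is what the CreateICmpSLT(Sx, 0) below computes.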
+ void handleSignedRelationalComparison(ICmpInst &I) {
+ Constant *constOp;
+ Value *op = nullptr;
+ CmpInst::Predicate pre;
+ if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) {
+ op = I.getOperand(0);
+ pre = I.getPredicate();
+ } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) {
+ op = I.getOperand(1);
+ pre = I.getSwappedPredicate();
+ } else {
+ handleShadowOr(I);
+ return;
+ }
+
+ if ((constOp->isNullValue() &&
+ (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) ||
+ (constOp->isAllOnesValue() &&
+ (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) {
+ IRBuilder<> IRB(&I);
+ Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op),
+ "_msprop_icmp_s");
+ setShadow(&I, Shadow);
+ setOrigin(&I, getOrigin(op));
+ } else {
+ handleShadowOr(I);
+ }
+ }
+
+ void visitICmpInst(ICmpInst &I) {
+ if (!ClHandleICmp) {
+ handleShadowOr(I);
+ return;
+ }
+ if (I.isEquality()) {
+ handleEqualityComparison(I);
+ return;
+ }
+
+ assert(I.isRelational());
+ if (ClHandleICmpExact) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+ if (I.isSigned()) {
+ handleSignedRelationalComparison(I);
+ return;
+ }
+
+ assert(I.isUnsigned());
+ if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+
+ handleShadowOr(I);
+ }
+
+ void visitFCmpInst(FCmpInst &I) {
+ handleShadowOr(I);
+ }
+
+ void handleShift(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // If any of the S2 bits are poisoned, the whole thing is poisoned.
+ // Otherwise perform the same shift on S1.
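+ // E.g. for %r = shl i32 %x, %y the shadow is, roughly,
+ //   or (shl i32 %Sx, %y), (sext (icmp ne i32 %Sy, 0) to i32)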
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+ S2->getType());
+ Value *V2 = I.getOperand(1);
+ Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+ setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShl(BinaryOperator &I) { handleShift(I); }
+ void visitAShr(BinaryOperator &I) { handleShift(I); }
+ void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+ /// Instrument llvm.memmove
+ ///
+ /// At this point we don't know if llvm.memmove will be inlined or not.
+ /// If we don't instrument it and it gets inlined,
+ /// our interceptor will not kick in and we will lose the memmove.
+ /// If we instrument the call here, but it does not get inlined,
+ /// we will memmove the shadow twice, which is bad in the case
+ /// of overlapping regions. So, we simply lower the intrinsic to a call.
+ ///
+ /// Similar situation exists for memcpy and memset.
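+ ///
+ /// MS.MemmoveFn below is the MSan runtime's memmove wrapper, which is expected
+ /// to move the bytes together with their shadow (and origin) exactly once.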
+ void visitMemMoveInst(MemMoveInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemmoveFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ // Similar to memmove: avoid copying shadow twice.
+ // This is somewhat unfortunate as it may slow down small constant memcpys.
+ // FIXME: consider doing manual inline for small constant sizes and proper
+ // alignment.
+ void visitMemCpyInst(MemCpyInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemcpyFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ // Same as memcpy.
+ void visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemsetFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ void visitVAStartInst(VAStartInst &I) {
+ VAHelper->visitVAStartInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) {
+ VAHelper->visitVACopyInst(I);
+ }
+
+ /// Handle vector store-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD store: writes memory,
+ /// has 1 pointer argument and 1 vector argument, returns void.
+ bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Value *Shadow = getShadow(&I, 1);
+ Value *ShadowPtr, *OriginPtr;
+
+ // We don't know the pointer alignment (could be unaligned SSE store!).
+ // Have to assume the worst case.
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), Align(1), /*isStore*/ true);
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, Align(1));
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ // FIXME: factor out common code from materializeStores
+ if (MS.TrackOrigins) IRB.CreateStore(getOrigin(&I, 1), OriginPtr);
+ return true;
+ }
+
+ /// Handle vector load-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD load: reads memory,
+ /// has 1 pointer argument, returns a vector.
+ bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+
+ Type *ShadowTy = getShadowTy(&I);
+ Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
+ if (PropagateShadow) {
+ // We don't know the pointer alignment (could be unaligned SSE load!).
+ // Have to assume the worst case.
+ const Align Alignment = Align(1);
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow)
+ setOrigin(&I, IRB.CreateLoad(MS.OriginTy, OriginPtr));
+ else
+ setOrigin(&I, getCleanOrigin());
+ }
+ return true;
+ }
+
+ /// Handle (SIMD arithmetic)-like intrinsics.
+ ///
+ /// Instrument intrinsics with any number of arguments of the same type,
+ /// equal to the return type. The type should be simple (no aggregates or
+ /// pointers; vectors are fine).
+ /// Caller guarantees that this intrinsic does not access memory.
+ bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+ Type *RetTy = I.getType();
+ if (!(RetTy->isIntOrIntVectorTy() ||
+ RetTy->isFPOrFPVectorTy() ||
+ RetTy->isX86_MMXTy()))
+ return false;
+
+ unsigned NumArgOperands = I.getNumArgOperands();
+ for (unsigned i = 0; i < NumArgOperands; ++i) {
+ Type *Ty = I.getArgOperand(i)->getType();
+ if (Ty != RetTy)
+ return false;
+ }
+
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (unsigned i = 0; i < NumArgOperands; ++i)
+ SC.Add(I.getArgOperand(i));
+ SC.Done(&I);
+
+ return true;
+ }
+
+ /// Heuristically instrument unknown intrinsics.
+ ///
+ /// The main purpose of this code is to do something reasonable with all
+ /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
+ /// We recognize several classes of intrinsics by their argument types and
+ /// ModRefBehaviour and apply special instrumentation when we are reasonably
+ /// sure that we know what the intrinsic does.
+ ///
+ /// We special-case intrinsics where this approach fails. See llvm.bswap
+ /// handling as an example of that.
+ bool handleUnknownIntrinsic(IntrinsicInst &I) {
+ unsigned NumArgOperands = I.getNumArgOperands();
+ if (NumArgOperands == 0)
+ return false;
+
+ if (NumArgOperands == 2 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getArgOperand(1)->getType()->isVectorTy() &&
+ I.getType()->isVoidTy() &&
+ !I.onlyReadsMemory()) {
+ // This looks like a vector store.
+ return handleVectorStoreIntrinsic(I);
+ }
+
+ if (NumArgOperands == 1 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getType()->isVectorTy() &&
+ I.onlyReadsMemory()) {
+ // This looks like a vector load.
+ return handleVectorLoadIntrinsic(I);
+ }
+
+ if (I.doesNotAccessMemory())
+ if (maybeHandleSimpleNomemIntrinsic(I))
+ return true;
+
+ // FIXME: detect and handle SSE maskstore/maskload
+ return false;
+ }
+
+ void handleInvariantGroup(IntrinsicInst &I) {
+ setShadow(&I, getShadow(&I, 0));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void handleLifetimeStart(IntrinsicInst &I) {
+ if (!PoisonStack)
+ return;
AllocaInst *AI = llvm::findAllocaForValue(I.getArgOperand(1));
- if (!AI)
- InstrumentLifetimeStart = false;
- LifetimeStartList.push_back(std::make_pair(&I, AI));
- }
-
- void handleBswap(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *Op = I.getArgOperand(0);
- Type *OpType = Op->getType();
- Function *BswapFunc = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
- setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
- setOrigin(&I, getOrigin(Op));
- }
-
- // Instrument vector convert intrinsic.
- //
- // This function instruments intrinsics like cvtsi2ss:
- // %Out = int_xxx_cvtyyy(%ConvertOp)
- // or
- // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
- // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
- // number of \p Out elements, and (if it has 2 arguments) copies the rest of
- // the elements from \p CopyOp.
- // In most cases the conversion involves a floating-point value, which may
- // trigger a hardware exception when not fully initialized. For this reason we
- // require \p ConvertOp[0:NumUsedElements] to be fully initialized and trap
- // otherwise.
- // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
- // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
- // return a fully initialized value.
+ if (!AI)
+ InstrumentLifetimeStart = false;
+ LifetimeStartList.push_back(std::make_pair(&I, AI));
+ }
+
+ void handleBswap(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Op = I.getArgOperand(0);
+ Type *OpType = Op->getType();
+ Function *BswapFunc = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
+ setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+ setOrigin(&I, getOrigin(Op));
+ }
+
+ // Instrument vector convert intrinsic.
+ //
+ // This function instruments intrinsics like cvtsi2ss:
+ // %Out = int_xxx_cvtyyy(%ConvertOp)
+ // or
+ // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+ // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+ // number of \p Out elements, and (if it has 2 arguments) copies the rest of
+ // the elements from \p CopyOp.
+ // In most cases the conversion involves a floating-point value, which may
+ // trigger a hardware exception when not fully initialized. For this reason we
+ // require \p ConvertOp[0:NumUsedElements] to be fully initialized and trap
+ // otherwise.
+ // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+ // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+ // return a fully initialized value.
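+ //
+ // For example, cvtsd2si is handled with NumUsedElements == 1 and no CopyOp:
+ // element 0 of its operand must be fully initialized (a shadow check is
+ // emitted for it), and the scalar result is treated as fully initialized.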
void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements,
bool HasRoundingMode = false) {
- IRBuilder<> IRB(&I);
- Value *CopyOp, *ConvertOp;
-
+ IRBuilder<> IRB(&I);
+ Value *CopyOp, *ConvertOp;
+
assert((!HasRoundingMode ||
isa<ConstantInt>(I.getArgOperand(I.getNumArgOperands() - 1))) &&
"Invalid rounding mode");
switch (I.getNumArgOperands() - HasRoundingMode) {
- case 2:
- CopyOp = I.getArgOperand(0);
- ConvertOp = I.getArgOperand(1);
- break;
- case 1:
- ConvertOp = I.getArgOperand(0);
- CopyOp = nullptr;
- break;
- default:
- llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
- }
-
- // The first *NumUsedElements* elements of ConvertOp are converted to the
- // same number of output elements. The rest of the output is copied from
- // CopyOp, or (if not available) filled with zeroes.
- // Combine shadow for elements of ConvertOp that are used in this operation,
- // and insert a check.
- // FIXME: consider propagating shadow of ConvertOp, at least in the case of
- // int->any conversion.
- Value *ConvertShadow = getShadow(ConvertOp);
- Value *AggShadow = nullptr;
- if (ConvertOp->getType()->isVectorTy()) {
- AggShadow = IRB.CreateExtractElement(
- ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
- for (int i = 1; i < NumUsedElements; ++i) {
- Value *MoreShadow = IRB.CreateExtractElement(
- ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
- AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
- }
- } else {
- AggShadow = ConvertShadow;
- }
- assert(AggShadow->getType()->isIntegerTy());
- insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
-
- // Build result shadow by zero-filling parts of CopyOp shadow that come from
- // ConvertOp.
- if (CopyOp) {
- assert(CopyOp->getType() == I.getType());
- assert(CopyOp->getType()->isVectorTy());
- Value *ResultShadow = getShadow(CopyOp);
- Type *EltTy = cast<VectorType>(ResultShadow->getType())->getElementType();
- for (int i = 0; i < NumUsedElements; ++i) {
- ResultShadow = IRB.CreateInsertElement(
- ResultShadow, ConstantInt::getNullValue(EltTy),
- ConstantInt::get(IRB.getInt32Ty(), i));
- }
- setShadow(&I, ResultShadow);
- setOrigin(&I, getOrigin(CopyOp));
- } else {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
- }
-
- // Given a scalar or vector, extract the lower 64 bits (or fewer), and return
- // all zeroes if that value is zero, and all ones otherwise.
- Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
- if (S->getType()->isVectorTy())
- S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true);
- assert(S->getType()->getPrimitiveSizeInBits() <= 64);
- Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
- return CreateShadowCast(IRB, S2, T, /* Signed */ true);
- }
-
- // Given a vector, extract its first element, and return all
- // zeroes if it is zero, and all ones otherwise.
- Value *LowerElementShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
- Value *S1 = IRB.CreateExtractElement(S, (uint64_t)0);
- Value *S2 = IRB.CreateICmpNE(S1, getCleanShadow(S1));
- return CreateShadowCast(IRB, S2, T, /* Signed */ true);
- }
-
- Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
- Type *T = S->getType();
- assert(T->isVectorTy());
- Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
- return IRB.CreateSExt(S2, T);
- }
-
- // Instrument vector shift intrinsic.
- //
- // This function instruments intrinsics like int_x86_avx2_psll_w.
- // Intrinsic shifts %In by %ShiftSize bits.
- // %ShiftSize may be a vector. In that case the lower 64 bits determine shift
- // size, and the rest is ignored. Behavior is defined even if shift size is
- // greater than register (or field) width.
- void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
- assert(I.getNumArgOperands() == 2);
- IRBuilder<> IRB(&I);
- // If any of the S2 bits are poisoned, the whole thing is poisoned.
- // Otherwise perform the same shift on S1.
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *S2Conv = Variable ? VariableShadowExtend(IRB, S2)
- : Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
- Value *V1 = I.getOperand(0);
- Value *V2 = I.getOperand(1);
- Value *Shift = IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
- {IRB.CreateBitCast(S1, V1->getType()), V2});
- Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
- setShadow(&I, IRB.CreateOr(Shift, S2Conv));
- setOriginForNaryOp(I);
- }
-
- // Get an X86_MMX-sized vector type.
- Type *getMMXVectorTy(unsigned EltSizeInBits) {
- const unsigned X86_MMXSizeInBits = 64;
- assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
- "Illegal MMX vector element size");
- return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
- X86_MMXSizeInBits / EltSizeInBits);
- }
-
- // Returns a signed counterpart for an (un)signed-saturate-and-pack
- // intrinsic.
- Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
- switch (id) {
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_sse2_packuswb_128:
- return Intrinsic::x86_sse2_packsswb_128;
-
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse41_packusdw:
- return Intrinsic::x86_sse2_packssdw_128;
-
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx2_packuswb:
- return Intrinsic::x86_avx2_packsswb;
-
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packusdw:
- return Intrinsic::x86_avx2_packssdw;
-
- case Intrinsic::x86_mmx_packsswb:
- case Intrinsic::x86_mmx_packuswb:
- return Intrinsic::x86_mmx_packsswb;
-
- case Intrinsic::x86_mmx_packssdw:
- return Intrinsic::x86_mmx_packssdw;
- default:
- llvm_unreachable("unexpected intrinsic id");
- }
- }
-
- // Instrument vector pack intrinsic.
- //
- // This function instruments intrinsics like x86_mmx_packsswb, which
- // pack elements of 2 input vectors into half as many bits with saturation.
- // Shadow is propagated with the signed variant of the same intrinsic applied
- // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
- // EltSizeInBits is used only for x86mmx arguments.
- void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
- assert(I.getNumArgOperands() == 2);
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- IRBuilder<> IRB(&I);
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- assert(isX86_MMX || S1->getType()->isVectorTy());
-
- // SExt and ICmpNE below must apply to individual elements of input vectors.
- // In case of x86mmx arguments, cast them to appropriate vector types and
- // back.
- Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
- if (isX86_MMX) {
- S1 = IRB.CreateBitCast(S1, T);
- S2 = IRB.CreateBitCast(S2, T);
- }
- Value *S1_ext = IRB.CreateSExt(
- IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
- Value *S2_ext = IRB.CreateSExt(
- IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
- if (isX86_MMX) {
- Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
- S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
- S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
- }
-
- Function *ShadowFn = Intrinsic::getDeclaration(
- F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
-
- Value *S =
- IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
- if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument sum-of-absolute-differences intrinsic.
- void handleVectorSadIntrinsic(IntrinsicInst &I) {
- const unsigned SignificantBitsPerResultElement = 16;
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
- unsigned ZeroBitsPerResultElement =
- ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
-
- IRBuilder<> IRB(&I);
- Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- S = IRB.CreateBitCast(S, ResTy);
- S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
- ResTy);
- S = IRB.CreateLShr(S, ZeroBitsPerResultElement);
- S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument multiply-add intrinsic.
- void handleVectorPmaddIntrinsic(IntrinsicInst &I,
- unsigned EltSizeInBits = 0) {
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
- IRBuilder<> IRB(&I);
- Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- S = IRB.CreateBitCast(S, ResTy);
- S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
- ResTy);
- S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument compare-packed intrinsic.
- // Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
- // all-ones shadow.
- void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Type *ResTy = getShadowTy(&I);
- Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- Value *S = IRB.CreateSExt(
- IRB.CreateICmpNE(S0, Constant::getNullValue(ResTy)), ResTy);
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument compare-scalar intrinsic.
- // This handles both cmp* intrinsics which return the result in the first
- // element of a vector, and comi* which return the result as i32.
- void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- Value *S = LowerElementShadowExtend(IRB, S0, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument generic vector reduction intrinsics
- // by ORing together all their fields.
- void handleVectorReduceIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *S = IRB.CreateOrReduce(getShadow(&I, 0));
- setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
- }
-
+ case 2:
+ CopyOp = I.getArgOperand(0);
+ ConvertOp = I.getArgOperand(1);
+ break;
+ case 1:
+ ConvertOp = I.getArgOperand(0);
+ CopyOp = nullptr;
+ break;
+ default:
+ llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+ }
+
+ // The first *NumUsedElements* elements of ConvertOp are converted to the
+ // same number of output elements. The rest of the output is copied from
+ // CopyOp, or (if not available) filled with zeroes.
+ // Combine shadow for elements of ConvertOp that are used in this operation,
+ // and insert a check.
+ // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+ // int->any conversion.
+ Value *ConvertShadow = getShadow(ConvertOp);
+ Value *AggShadow = nullptr;
+ if (ConvertOp->getType()->isVectorTy()) {
+ AggShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1; i < NumUsedElements; ++i) {
+ Value *MoreShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+ }
+ } else {
+ AggShadow = ConvertShadow;
+ }
+ assert(AggShadow->getType()->isIntegerTy());
+ insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+ // Build result shadow by zero-filling parts of CopyOp shadow that come from
+ // ConvertOp.
+ if (CopyOp) {
+ assert(CopyOp->getType() == I.getType());
+ assert(CopyOp->getType()->isVectorTy());
+ Value *ResultShadow = getShadow(CopyOp);
+ Type *EltTy = cast<VectorType>(ResultShadow->getType())->getElementType();
+ for (int i = 0; i < NumUsedElements; ++i) {
+ ResultShadow = IRB.CreateInsertElement(
+ ResultShadow, ConstantInt::getNullValue(EltTy),
+ ConstantInt::get(IRB.getInt32Ty(), i));
+ }
+ setShadow(&I, ResultShadow);
+ setOrigin(&I, getOrigin(CopyOp));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+
+ // Given a scalar or vector, extract the lower 64 bits (or fewer), and return
+ // all zeroes if that value is zero, and all ones otherwise.
+ Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+ if (S->getType()->isVectorTy())
+ S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true);
+ assert(S->getType()->getPrimitiveSizeInBits() <= 64);
+ Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+ return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+ }
+
+ // Given a vector, extract its first element, and return all
+ // zeroes if it is zero, and all ones otherwise.
+ Value *LowerElementShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+ Value *S1 = IRB.CreateExtractElement(S, (uint64_t)0);
+ Value *S2 = IRB.CreateICmpNE(S1, getCleanShadow(S1));
+ return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+ }
+
+ Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
+ Type *T = S->getType();
+ assert(T->isVectorTy());
+ Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+ return IRB.CreateSExt(S2, T);
+ }
+
+ // Instrument vector shift intrinsic.
+ //
+ // This function instruments intrinsics like int_x86_avx2_psll_w.
+ // Intrinsic shifts %In by %ShiftSize bits.
+ // %ShiftSize may be a vector. In that case the lower 64 bits determine shift
+ // size, and the rest is ignored. Behavior is defined even if shift size is
+ // greater than register (or field) width.
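+ //
+ // E.g. for psll.w the same intrinsic is applied to the bitcast shadow of the
+ // first operand, and the result is additionally poisoned everywhere if any bit
+ // of the shift amount's shadow is set.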
+ void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
+ assert(I.getNumArgOperands() == 2);
+ IRBuilder<> IRB(&I);
+ // If any of the S2 bits are poisoned, the whole thing is poisoned.
+ // Otherwise perform the same shift on S1.
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *S2Conv = Variable ? VariableShadowExtend(IRB, S2)
+ : Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+ Value *Shift = IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
+ {IRB.CreateBitCast(S1, V1->getType()), V2});
+ Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
+ setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+ setOriginForNaryOp(I);
+ }
+
+ // Get an X86_MMX-sized vector type.
+ Type *getMMXVectorTy(unsigned EltSizeInBits) {
+ const unsigned X86_MMXSizeInBits = 64;
+ assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
+ "Illegal MMX vector element size");
+ return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+ X86_MMXSizeInBits / EltSizeInBits);
+ }
+
+ // Returns a signed counterpart for an (un)signed-saturate-and-pack
+ // intrinsic.
+ Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
+ switch (id) {
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ return Intrinsic::x86_sse2_packsswb_128;
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse41_packusdw:
+ return Intrinsic::x86_sse2_packssdw_128;
+
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packuswb:
+ return Intrinsic::x86_avx2_packsswb;
+
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packusdw:
+ return Intrinsic::x86_avx2_packssdw;
+
+ case Intrinsic::x86_mmx_packsswb:
+ case Intrinsic::x86_mmx_packuswb:
+ return Intrinsic::x86_mmx_packsswb;
+
+ case Intrinsic::x86_mmx_packssdw:
+ return Intrinsic::x86_mmx_packssdw;
+ default:
+ llvm_unreachable("unexpected intrinsic id");
+ }
+ }
+
+ // Instrument vector pack intrinsic.
+ //
+ // This function instruments intrinsics like x86_mmx_packsswb, which
+ // pack elements of 2 input vectors into half as many bits with saturation.
+ // Shadow is propagated with the signed variant of the same intrinsic applied
+ // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
+ // EltSizeInBits is used only for x86mmx arguments.
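+ //
+ // E.g. for packsswb the shadow is packsswb(sext(Sa != 0), sext(Sb != 0)), so an
+ // output byte is poisoned iff any bit of the corresponding input word's shadow
+ // is set.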
+ void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+ assert(I.getNumArgOperands() == 2);
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ IRBuilder<> IRB(&I);
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ assert(isX86_MMX || S1->getType()->isVectorTy());
+
+ // SExt and ICmpNE below must apply to individual elements of input vectors.
+ // In case of x86mmx arguments, cast them to appropriate vector types and
+ // back.
+ Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
+ if (isX86_MMX) {
+ S1 = IRB.CreateBitCast(S1, T);
+ S2 = IRB.CreateBitCast(S2, T);
+ }
+ Value *S1_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
+ Value *S2_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
+ if (isX86_MMX) {
+ Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
+ S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
+ S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+ }
+
+ Function *ShadowFn = Intrinsic::getDeclaration(
+ F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
+
+ Value *S =
+ IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
+ if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument sum-of-absolute-differences intrinsic.
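+ // Each result element of psad.bw carries at most 16 significant bits, so the
+ // combined shadow is sign-extended per result element and then shifted right
+ // to leave the always-zero upper bits of the result unpoisoned.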
+ void handleVectorSadIntrinsic(IntrinsicInst &I) {
+ const unsigned SignificantBitsPerResultElement = 16;
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
+ unsigned ZeroBitsPerResultElement =
+ ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
+
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateLShr(S, ZeroBitsPerResultElement);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument multiply-add intrinsic.
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I,
+ unsigned EltSizeInBits = 0) {
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument compare-packed intrinsic.
+ // Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
+ // all-ones shadow.
+ void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ResTy = getShadowTy(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = IRB.CreateSExt(
+ IRB.CreateICmpNE(S0, Constant::getNullValue(ResTy)), ResTy);
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument compare-scalar intrinsic.
+ // This handles both cmp* intrinsics which return the result in the first
+ // element of a vector, and comi* which return the result as i32.
+ void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = LowerElementShadowExtend(IRB, S0, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument generic vector reduction intrinsics
+ // by ORing together all their fields.
+ void handleVectorReduceIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOrReduce(getShadow(&I, 0));
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
// Instrument vector.reduce.or intrinsic.
- // Valid (non-poisoned) set bits in the operand pull low the
- // corresponding shadow bits.
- void handleVectorReduceOrIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *OperandShadow = getShadow(&I, 0);
- Value *OperandUnsetBits = IRB.CreateNot(I.getOperand(0));
- Value *OperandUnsetOrPoison = IRB.CreateOr(OperandUnsetBits, OperandShadow);
- // Bit N is clean if any field's bit N is 1 and unpoisoned
- Value *OutShadowMask = IRB.CreateAndReduce(OperandUnsetOrPoison);
- // Otherwise, it is clean if every field's bit N is unpoisoned
- Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
- Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
-
- setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
- }
-
+ // Valid (non-poisoned) set bits in the operand pull low the
+ // corresponding shadow bits.
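+ // E.g. an element that is fully initialized and equal to -1 forces every bit
+ // of the OR-reduction to 1, so the result shadow is pulled down to all zeroes
+ // no matter how poisoned the other elements are.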
+ void handleVectorReduceOrIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *OperandShadow = getShadow(&I, 0);
+ Value *OperandUnsetBits = IRB.CreateNot(I.getOperand(0));
+ Value *OperandUnsetOrPoison = IRB.CreateOr(OperandUnsetBits, OperandShadow);
+ // Bit N is clean if any field's bit N is 1 and unpoisoned
+ Value *OutShadowMask = IRB.CreateAndReduce(OperandUnsetOrPoison);
+ // Otherwise, it is clean if every field's bit N is unpoisoned
+ Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
+ Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
+
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
// Instrument vector.reduce.and intrinsic.
- // Valid (non-poisoned) unset bits in the operand pull down the
- // corresponding shadow bits.
- void handleVectorReduceAndIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *OperandShadow = getShadow(&I, 0);
- Value *OperandSetOrPoison = IRB.CreateOr(I.getOperand(0), OperandShadow);
- // Bit N is clean if any field's bit N is 0 and unpoisoned
- Value *OutShadowMask = IRB.CreateAndReduce(OperandSetOrPoison);
- // Otherwise, it is clean if every field's bit N is unpoisoned
- Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
- Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
-
- setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void handleStmxcsr(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value* Addr = I.getArgOperand(0);
- Type *Ty = IRB.getInt32Ty();
- Value *ShadowPtr =
- getShadowOriginPtr(Addr, IRB, Ty, Align(1), /*isStore*/ true).first;
-
- IRB.CreateStore(getCleanShadow(Ty),
- IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
- }
-
- void handleLdmxcsr(IntrinsicInst &I) {
- if (!InsertChecks) return;
-
- IRBuilder<> IRB(&I);
- Value *Addr = I.getArgOperand(0);
- Type *Ty = IRB.getInt32Ty();
- const Align Alignment = Align(1);
- Value *ShadowPtr, *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false);
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr");
- Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr)
- : getCleanOrigin();
- insertShadowCheck(Shadow, Origin, &I);
- }
-
- void handleMaskedStore(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *V = I.getArgOperand(0);
- Value *Addr = I.getArgOperand(1);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
- Value *Mask = I.getArgOperand(3);
- Value *Shadow = getShadow(V);
-
- Value *ShadowPtr;
- Value *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
- Addr, IRB, Shadow->getType(), Alignment, /*isStore*/ true);
-
- if (ClCheckAccessAddress) {
- insertShadowCheck(Addr, &I);
- // Uninitialized mask is kind of like uninitialized address, but not as
- // scary.
- insertShadowCheck(Mask, &I);
- }
-
- IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment, Mask);
-
- if (MS.TrackOrigins) {
- auto &DL = F.getParent()->getDataLayout();
- paintOrigin(IRB, getOrigin(V), OriginPtr,
- DL.getTypeStoreSize(Shadow->getType()),
- std::max(Alignment, kMinOriginAlignment));
- }
- }
-
- bool handleMaskedLoad(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *Addr = I.getArgOperand(0);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
- Value *Mask = I.getArgOperand(2);
- Value *PassThru = I.getArgOperand(3);
-
- Type *ShadowTy = getShadowTy(&I);
- Value *ShadowPtr, *OriginPtr;
- if (PropagateShadow) {
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Alignment, Mask,
- getShadow(PassThru), "_msmaskedld"));
- } else {
- setShadow(&I, getCleanShadow(&I));
- }
-
- if (ClCheckAccessAddress) {
- insertShadowCheck(Addr, &I);
- insertShadowCheck(Mask, &I);
- }
-
- if (MS.TrackOrigins) {
- if (PropagateShadow) {
- // Choose between PassThru's and the loaded value's origins.
- Value *MaskedPassThruShadow = IRB.CreateAnd(
- getShadow(PassThru), IRB.CreateSExt(IRB.CreateNeg(Mask), ShadowTy));
-
- Value *Acc = IRB.CreateExtractElement(
- MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
- for (int i = 1, N = cast<FixedVectorType>(PassThru->getType())
- ->getNumElements();
- i < N; ++i) {
- Value *More = IRB.CreateExtractElement(
- MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i));
- Acc = IRB.CreateOr(Acc, More);
- }
-
- Value *Origin = IRB.CreateSelect(
- IRB.CreateICmpNE(Acc, Constant::getNullValue(Acc->getType())),
- getOrigin(PassThru), IRB.CreateLoad(MS.OriginTy, OriginPtr));
-
- setOrigin(&I, Origin);
- } else {
- setOrigin(&I, getCleanOrigin());
- }
- }
- return true;
- }
-
- // Instrument BMI / BMI2 intrinsics.
- // All of these intrinsics are Z = I(X, Y)
- // where the types of all operands and the result match, and are either i32 or i64.
- // The following instrumentation happens to work for all of them:
- // Sz = I(Sx, Y) | (sext (Sy != 0))
- void handleBmiIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Type *ShadowTy = getShadowTy(&I);
-
- // If any bit of the mask operand is poisoned, then the whole thing is.
- Value *SMask = getShadow(&I, 1);
- SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)),
- ShadowTy);
- // Apply the same intrinsic to the shadow of the first operand.
- Value *S = IRB.CreateCall(I.getCalledFunction(),
- {getShadow(&I, 0), I.getOperand(1)});
- S = IRB.CreateOr(SMask, S);
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- SmallVector<int, 8> getPclmulMask(unsigned Width, bool OddElements) {
- SmallVector<int, 8> Mask;
- for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) {
- Mask.append(2, X);
- }
- return Mask;
- }
-
- // Instrument pclmul intrinsics.
- // These intrinsics operate either on odd or on even elements of the input
- // vectors, depending on the constant in the 3rd argument, ignoring the rest.
- // Replace the unused elements with copies of the used ones, ex:
- // (0, 1, 2, 3) -> (0, 0, 2, 2) (even case)
- // or
- // (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case)
- // and then apply the usual shadow combining logic.
- void handlePclmulIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- unsigned Width =
- cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
- assert(isa<ConstantInt>(I.getArgOperand(2)) &&
- "pclmul 3rd operand must be a constant");
- unsigned Imm = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ // Valid (non-poisoned) unset bits in the operand pull down the
+ // corresponding shadow bits.
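+ // E.g. an element that is fully initialized and equal to 0 forces every bit of
+ // the AND-reduction to 0, so the result shadow becomes clean even if the other
+ // elements are poisoned.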
+ void handleVectorReduceAndIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *OperandShadow = getShadow(&I, 0);
+ Value *OperandSetOrPoison = IRB.CreateOr(I.getOperand(0), OperandShadow);
+ // Bit N is clean if any field's bit N is 0 and unpoisoned
+ Value *OutShadowMask = IRB.CreateAndReduce(OperandSetOrPoison);
+ // Otherwise, it is clean if every field's bit N is unpoisoned
+ Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
+ Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
+
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void handleStmxcsr(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ Value *ShadowPtr =
+ getShadowOriginPtr(Addr, IRB, Ty, Align(1), /*isStore*/ true).first;
+
+ IRB.CreateStore(getCleanShadow(Ty),
+ IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+ }
+
+ void handleLdmxcsr(IntrinsicInst &I) {
+ if (!InsertChecks) return;
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ const Align Alignment = Align(1);
+ Value *ShadowPtr, *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false);
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr");
+ Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr)
+ : getCleanOrigin();
+ insertShadowCheck(Shadow, Origin, &I);
+ }
+
+ void handleMaskedStore(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *V = I.getArgOperand(0);
+ Value *Addr = I.getArgOperand(1);
+ const Align Alignment(
+ cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
+ Value *Mask = I.getArgOperand(3);
+ Value *Shadow = getShadow(V);
+
+ Value *ShadowPtr;
+ Value *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), Alignment, /*isStore*/ true);
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ // Uninitialized mask is kind of like uninitialized address, but not as
+ // scary.
+ insertShadowCheck(Mask, &I);
+ }
+
+ IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment, Mask);
+
+ if (MS.TrackOrigins) {
+ auto &DL = F.getParent()->getDataLayout();
+ paintOrigin(IRB, getOrigin(V), OriginPtr,
+ DL.getTypeStoreSize(Shadow->getType()),
+ std::max(Alignment, kMinOriginAlignment));
+ }
+ }
+
+ bool handleMaskedLoad(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ const Align Alignment(
+ cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
+ Value *Mask = I.getArgOperand(2);
+ Value *PassThru = I.getArgOperand(3);
+
+ Type *ShadowTy = getShadowTy(&I);
+ Value *ShadowPtr, *OriginPtr;
+ if (PropagateShadow) {
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
+ setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Alignment, Mask,
+ getShadow(PassThru), "_msmaskedld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ insertShadowCheck(Mask, &I);
+ }
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow) {
+ // Choose between PassThru's and the loaded value's origins.
+ Value *MaskedPassThruShadow = IRB.CreateAnd(
+ getShadow(PassThru), IRB.CreateSExt(IRB.CreateNeg(Mask), ShadowTy));
+
+ Value *Acc = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1, N = cast<FixedVectorType>(PassThru->getType())
+ ->getNumElements();
+ i < N; ++i) {
+ Value *More = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ Acc = IRB.CreateOr(Acc, More);
+ }
+
+ Value *Origin = IRB.CreateSelect(
+ IRB.CreateICmpNE(Acc, Constant::getNullValue(Acc->getType())),
+ getOrigin(PassThru), IRB.CreateLoad(MS.OriginTy, OriginPtr));
+
+ setOrigin(&I, Origin);
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ return true;
+ }
+
+ // Instrument BMI / BMI2 intrinsics.
+ // All of these intrinsics are Z = I(X, Y)
+ // where the types of all operands and the result match, and are either i32 or i64.
+ // The following instrumentation happens to work for all of them:
+ // Sz = I(Sx, Y) | (sext (Sy != 0))
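+ // E.g. bzhi clears the same high bits of the shadow that it clears in the
+ // result, and pdep/pext move shadow bits exactly where they move data bits, so
+ // reusing the intrinsic on Sx is a good approximation when Y is initialized.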
+ void handleBmiIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ShadowTy = getShadowTy(&I);
+
+ // If any bit of the mask operand is poisoned, then the whole thing is.
+ Value *SMask = getShadow(&I, 1);
+ SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)),
+ ShadowTy);
+ // Apply the same intrinsic to the shadow of the first operand.
+ Value *S = IRB.CreateCall(I.getCalledFunction(),
+ {getShadow(&I, 0), I.getOperand(1)});
+ S = IRB.CreateOr(SMask, S);
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ SmallVector<int, 8> getPclmulMask(unsigned Width, bool OddElements) {
+ SmallVector<int, 8> Mask;
+ for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) {
+ Mask.append(2, X);
+ }
+ return Mask;
+ }
+
+ // Instrument pclmul intrinsics.
+ // These intrinsics operate either on odd or on even elements of the input
+ // vectors, depending on the constant in the 3rd argument, ignoring the rest.
+ // Replace the unused elements with copies of the used ones, ex:
+ // (0, 1, 2, 3) -> (0, 0, 2, 2) (even case)
+ // or
+ // (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case)
+ // and then apply the usual shadow combining logic.
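+ // E.g. with an immediate of 0x00 both inputs use their even (low) halves, so
+ // each shadow is shuffled so that every odd lane is replaced with a copy of
+ // the preceding even lane before the usual OR-combining of shadows and origins.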
+ void handlePclmulIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ unsigned Width =
+ cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
+ assert(isa<ConstantInt>(I.getArgOperand(2)) &&
+ "pclmul 3rd operand must be a constant");
+ unsigned Imm = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
Value *Shuf0 = IRB.CreateShuffleVector(getShadow(&I, 0),
getPclmulMask(Width, Imm & 0x01));
Value *Shuf1 = IRB.CreateShuffleVector(getShadow(&I, 1),
getPclmulMask(Width, Imm & 0x10));
- ShadowAndOriginCombiner SOC(this, IRB);
- SOC.Add(Shuf0, getOrigin(&I, 0));
- SOC.Add(Shuf1, getOrigin(&I, 1));
- SOC.Done(&I);
- }
-
- // Instrument _mm_*_sd intrinsics
- void handleUnarySdIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *First = getShadow(&I, 0);
- Value *Second = getShadow(&I, 1);
- // High word of first operand, low word of second
- Value *Shadow =
- IRB.CreateShuffleVector(First, Second, llvm::makeArrayRef<int>({2, 1}));
-
- setShadow(&I, Shadow);
- setOriginForNaryOp(I);
- }
-
- void handleBinarySdIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *First = getShadow(&I, 0);
- Value *Second = getShadow(&I, 1);
- Value *OrShadow = IRB.CreateOr(First, Second);
- // High word of first operand, low word of both OR'd together
- Value *Shadow = IRB.CreateShuffleVector(First, OrShadow,
- llvm::makeArrayRef<int>({2, 1}));
-
- setShadow(&I, Shadow);
- setOriginForNaryOp(I);
- }
-
+ ShadowAndOriginCombiner SOC(this, IRB);
+ SOC.Add(Shuf0, getOrigin(&I, 0));
+ SOC.Add(Shuf1, getOrigin(&I, 1));
+ SOC.Done(&I);
+ }
+
+ // Instrument _mm_*_sd intrinsics
+ void handleUnarySdIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *First = getShadow(&I, 0);
+ Value *Second = getShadow(&I, 1);
+ // High word of first operand, low word of second
+ Value *Shadow =
+ IRB.CreateShuffleVector(First, Second, llvm::makeArrayRef<int>({2, 1}));
+
+ setShadow(&I, Shadow);
+ setOriginForNaryOp(I);
+ }
+
+ void handleBinarySdIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *First = getShadow(&I, 0);
+ Value *Second = getShadow(&I, 1);
+ Value *OrShadow = IRB.CreateOr(First, Second);
+ // High word of first operand, low word of both OR'd together
+ Value *Shadow = IRB.CreateShuffleVector(First, OrShadow,
+ llvm::makeArrayRef<int>({2, 1}));
+
+ setShadow(&I, Shadow);
+ setOriginForNaryOp(I);
+ }
+
// Instrument abs intrinsic.
// handleUnknownIntrinsic can't handle it because of the last
// is_int_min_poison argument which does not match the result type.
@@ -3244,282 +3244,282 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(&I, 0));
}
- void visitIntrinsicInst(IntrinsicInst &I) {
- switch (I.getIntrinsicID()) {
+ void visitIntrinsicInst(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
case Intrinsic::abs:
handleAbsIntrinsic(I);
break;
- case Intrinsic::lifetime_start:
- handleLifetimeStart(I);
- break;
- case Intrinsic::launder_invariant_group:
- case Intrinsic::strip_invariant_group:
- handleInvariantGroup(I);
- break;
- case Intrinsic::bswap:
- handleBswap(I);
- break;
- case Intrinsic::masked_store:
- handleMaskedStore(I);
- break;
- case Intrinsic::masked_load:
- handleMaskedLoad(I);
- break;
+ case Intrinsic::lifetime_start:
+ handleLifetimeStart(I);
+ break;
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ handleInvariantGroup(I);
+ break;
+ case Intrinsic::bswap:
+ handleBswap(I);
+ break;
+ case Intrinsic::masked_store:
+ handleMaskedStore(I);
+ break;
+ case Intrinsic::masked_load:
+ handleMaskedLoad(I);
+ break;
case Intrinsic::vector_reduce_and:
- handleVectorReduceAndIntrinsic(I);
- break;
+ handleVectorReduceAndIntrinsic(I);
+ break;
case Intrinsic::vector_reduce_or:
- handleVectorReduceOrIntrinsic(I);
- break;
+ handleVectorReduceOrIntrinsic(I);
+ break;
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_xor:
case Intrinsic::vector_reduce_mul:
- handleVectorReduceIntrinsic(I);
- break;
- case Intrinsic::x86_sse_stmxcsr:
- handleStmxcsr(I);
- break;
- case Intrinsic::x86_sse_ldmxcsr:
- handleLdmxcsr(I);
- break;
- case Intrinsic::x86_avx512_vcvtsd2usi64:
- case Intrinsic::x86_avx512_vcvtsd2usi32:
- case Intrinsic::x86_avx512_vcvtss2usi64:
- case Intrinsic::x86_avx512_vcvtss2usi32:
- case Intrinsic::x86_avx512_cvttss2usi64:
- case Intrinsic::x86_avx512_cvttss2usi:
- case Intrinsic::x86_avx512_cvttsd2usi64:
- case Intrinsic::x86_avx512_cvttsd2usi:
- case Intrinsic::x86_avx512_cvtusi2ss:
- case Intrinsic::x86_avx512_cvtusi642sd:
- case Intrinsic::x86_avx512_cvtusi642ss:
+ handleVectorReduceIntrinsic(I);
+ break;
+ case Intrinsic::x86_sse_stmxcsr:
+ handleStmxcsr(I);
+ break;
+ case Intrinsic::x86_sse_ldmxcsr:
+ handleLdmxcsr(I);
+ break;
+ case Intrinsic::x86_avx512_vcvtsd2usi64:
+ case Intrinsic::x86_avx512_vcvtsd2usi32:
+ case Intrinsic::x86_avx512_vcvtss2usi64:
+ case Intrinsic::x86_avx512_vcvtss2usi32:
+ case Intrinsic::x86_avx512_cvttss2usi64:
+ case Intrinsic::x86_avx512_cvttss2usi:
+ case Intrinsic::x86_avx512_cvttsd2usi64:
+ case Intrinsic::x86_avx512_cvttsd2usi:
+ case Intrinsic::x86_avx512_cvtusi2ss:
+ case Intrinsic::x86_avx512_cvtusi642sd:
+ case Intrinsic::x86_avx512_cvtusi642ss:
handleVectorConvertIntrinsic(I, 1, true);
break;
- case Intrinsic::x86_sse2_cvtsd2si64:
- case Intrinsic::x86_sse2_cvtsd2si:
- case Intrinsic::x86_sse2_cvtsd2ss:
- case Intrinsic::x86_sse2_cvttsd2si64:
- case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse_cvtss2si64:
- case Intrinsic::x86_sse_cvtss2si:
- case Intrinsic::x86_sse_cvttss2si64:
- case Intrinsic::x86_sse_cvttss2si:
- handleVectorConvertIntrinsic(I, 1);
- break;
- case Intrinsic::x86_sse_cvtps2pi:
- case Intrinsic::x86_sse_cvttps2pi:
- handleVectorConvertIntrinsic(I, 2);
- break;
-
- case Intrinsic::x86_avx512_psll_w_512:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_pslli_w_512:
- case Intrinsic::x86_avx512_pslli_d_512:
- case Intrinsic::x86_avx512_pslli_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psrli_w_512:
- case Intrinsic::x86_avx512_psrli_d_512:
- case Intrinsic::x86_avx512_psrli_q_512:
- case Intrinsic::x86_avx512_psrai_w_512:
- case Intrinsic::x86_avx512_psrai_d_512:
- case Intrinsic::x86_avx512_psrai_q_512:
- case Intrinsic::x86_avx512_psra_q_256:
- case Intrinsic::x86_avx512_psra_q_128:
- case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx2_psll_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_psrl_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- case Intrinsic::x86_sse2_psll_w:
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_sse2_pslli_d:
- case Intrinsic::x86_sse2_pslli_q:
- case Intrinsic::x86_sse2_psrl_w:
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psra_w:
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_sse2_psrli_d:
- case Intrinsic::x86_sse2_psrli_q:
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_mmx_psll_w:
- case Intrinsic::x86_mmx_psll_d:
- case Intrinsic::x86_mmx_psll_q:
- case Intrinsic::x86_mmx_pslli_w:
- case Intrinsic::x86_mmx_pslli_d:
- case Intrinsic::x86_mmx_pslli_q:
- case Intrinsic::x86_mmx_psrl_w:
- case Intrinsic::x86_mmx_psrl_d:
- case Intrinsic::x86_mmx_psrl_q:
- case Intrinsic::x86_mmx_psra_w:
- case Intrinsic::x86_mmx_psra_d:
- case Intrinsic::x86_mmx_psrli_w:
- case Intrinsic::x86_mmx_psrli_d:
- case Intrinsic::x86_mmx_psrli_q:
- case Intrinsic::x86_mmx_psrai_w:
- case Intrinsic::x86_mmx_psrai_d:
- handleVectorShiftIntrinsic(I, /* Variable */ false);
- break;
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx512_psllv_d_512:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx512_psllv_q_512:
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx512_psrlv_d_512:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx512_psrlv_q_512:
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- case Intrinsic::x86_avx512_psrav_d_512:
- case Intrinsic::x86_avx512_psrav_q_128:
- case Intrinsic::x86_avx512_psrav_q_256:
- case Intrinsic::x86_avx512_psrav_q_512:
- handleVectorShiftIntrinsic(I, /* Variable */ true);
- break;
-
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx2_packusdw:
- handleVectorPackIntrinsic(I);
- break;
-
- case Intrinsic::x86_mmx_packsswb:
- case Intrinsic::x86_mmx_packuswb:
- handleVectorPackIntrinsic(I, 16);
- break;
-
- case Intrinsic::x86_mmx_packssdw:
- handleVectorPackIntrinsic(I, 32);
- break;
-
- case Intrinsic::x86_mmx_psad_bw:
- case Intrinsic::x86_sse2_psad_bw:
- case Intrinsic::x86_avx2_psad_bw:
- handleVectorSadIntrinsic(I);
- break;
-
- case Intrinsic::x86_sse2_pmadd_wd:
- case Intrinsic::x86_avx2_pmadd_wd:
- case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
- case Intrinsic::x86_avx2_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I);
- break;
-
- case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, 8);
- break;
-
- case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, 16);
- break;
-
- case Intrinsic::x86_sse_cmp_ss:
- case Intrinsic::x86_sse2_cmp_sd:
- case Intrinsic::x86_sse_comieq_ss:
- case Intrinsic::x86_sse_comilt_ss:
- case Intrinsic::x86_sse_comile_ss:
- case Intrinsic::x86_sse_comigt_ss:
- case Intrinsic::x86_sse_comige_ss:
- case Intrinsic::x86_sse_comineq_ss:
- case Intrinsic::x86_sse_ucomieq_ss:
- case Intrinsic::x86_sse_ucomilt_ss:
- case Intrinsic::x86_sse_ucomile_ss:
- case Intrinsic::x86_sse_ucomigt_ss:
- case Intrinsic::x86_sse_ucomige_ss:
- case Intrinsic::x86_sse_ucomineq_ss:
- case Intrinsic::x86_sse2_comieq_sd:
- case Intrinsic::x86_sse2_comilt_sd:
- case Intrinsic::x86_sse2_comile_sd:
- case Intrinsic::x86_sse2_comigt_sd:
- case Intrinsic::x86_sse2_comige_sd:
- case Intrinsic::x86_sse2_comineq_sd:
- case Intrinsic::x86_sse2_ucomieq_sd:
- case Intrinsic::x86_sse2_ucomilt_sd:
- case Intrinsic::x86_sse2_ucomile_sd:
- case Intrinsic::x86_sse2_ucomigt_sd:
- case Intrinsic::x86_sse2_ucomige_sd:
- case Intrinsic::x86_sse2_ucomineq_sd:
- handleVectorCompareScalarIntrinsic(I);
- break;
-
- case Intrinsic::x86_sse_cmp_ps:
- case Intrinsic::x86_sse2_cmp_pd:
- // FIXME: For x86_avx_cmp_pd_256 and x86_avx_cmp_ps_256 this function
- // generates reasonably looking IR that fails in the backend with "Do not
- // know how to split the result of this operator!".
- handleVectorComparePackedIntrinsic(I);
- break;
-
- case Intrinsic::x86_bmi_bextr_32:
- case Intrinsic::x86_bmi_bextr_64:
- case Intrinsic::x86_bmi_bzhi_32:
- case Intrinsic::x86_bmi_bzhi_64:
- case Intrinsic::x86_bmi_pdep_32:
- case Intrinsic::x86_bmi_pdep_64:
- case Intrinsic::x86_bmi_pext_32:
- case Intrinsic::x86_bmi_pext_64:
- handleBmiIntrinsic(I);
- break;
-
- case Intrinsic::x86_pclmulqdq:
- case Intrinsic::x86_pclmulqdq_256:
- case Intrinsic::x86_pclmulqdq_512:
- handlePclmulIntrinsic(I);
- break;
-
- case Intrinsic::x86_sse41_round_sd:
- handleUnarySdIntrinsic(I);
- break;
- case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse2_min_sd:
- handleBinarySdIntrinsic(I);
- break;
-
- case Intrinsic::is_constant:
- // The result of llvm.is.constant() is always defined.
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- break;
-
- default:
- if (!handleUnknownIntrinsic(I))
- visitInstruction(I);
- break;
- }
- }
-
+ case Intrinsic::x86_sse2_cvtsd2si64:
+ case Intrinsic::x86_sse2_cvtsd2si:
+ case Intrinsic::x86_sse2_cvtsd2ss:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse_cvtss2si64:
+ case Intrinsic::x86_sse_cvtss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse_cvttss2si:
+ handleVectorConvertIntrinsic(I, 1);
+ break;
+ case Intrinsic::x86_sse_cvtps2pi:
+ case Intrinsic::x86_sse_cvttps2pi:
+ handleVectorConvertIntrinsic(I, 2);
+ break;
+
+ case Intrinsic::x86_avx512_psll_w_512:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_mmx_psll_w:
+ case Intrinsic::x86_mmx_psll_d:
+ case Intrinsic::x86_mmx_psll_q:
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrl_w:
+ case Intrinsic::x86_mmx_psrl_d:
+ case Intrinsic::x86_mmx_psrl_q:
+ case Intrinsic::x86_mmx_psra_w:
+ case Intrinsic::x86_mmx_psra_d:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d:
+ handleVectorShiftIntrinsic(I, /* Variable */ false);
+ break;
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ handleVectorShiftIntrinsic(I, /* Variable */ true);
+ break;
+
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx2_packusdw:
+ handleVectorPackIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_mmx_packsswb:
+ case Intrinsic::x86_mmx_packuswb:
+ handleVectorPackIntrinsic(I, 16);
+ break;
+
+ case Intrinsic::x86_mmx_packssdw:
+ handleVectorPackIntrinsic(I, 32);
+ break;
+
+ case Intrinsic::x86_mmx_psad_bw:
+ case Intrinsic::x86_sse2_psad_bw:
+ case Intrinsic::x86_avx2_psad_bw:
+ handleVectorSadIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_sse2_pmadd_wd:
+ case Intrinsic::x86_avx2_pmadd_wd:
+ case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
+ case Intrinsic::x86_avx2_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_ssse3_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I, 8);
+ break;
+
+ case Intrinsic::x86_mmx_pmadd_wd:
+ handleVectorPmaddIntrinsic(I, 16);
+ break;
+
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_cmp_sd:
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ handleVectorCompareScalarIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_sse_cmp_ps:
+ case Intrinsic::x86_sse2_cmp_pd:
+ // FIXME: For x86_avx_cmp_pd_256 and x86_avx_cmp_ps_256 this function
+ // generates reasonably looking IR that fails in the backend with "Do not
+ // know how to split the result of this operator!".
+ handleVectorComparePackedIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_bmi_bextr_32:
+ case Intrinsic::x86_bmi_bextr_64:
+ case Intrinsic::x86_bmi_bzhi_32:
+ case Intrinsic::x86_bmi_bzhi_64:
+ case Intrinsic::x86_bmi_pdep_32:
+ case Intrinsic::x86_bmi_pdep_64:
+ case Intrinsic::x86_bmi_pext_32:
+ case Intrinsic::x86_bmi_pext_64:
+ handleBmiIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512:
+ handlePclmulIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_sse41_round_sd:
+ handleUnarySdIntrinsic(I);
+ break;
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_min_sd:
+ handleBinarySdIntrinsic(I);
+ break;
+
+ case Intrinsic::is_constant:
+ // The result of llvm.is.constant() is always defined.
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ break;
+
+ default:
+ if (!handleUnknownIntrinsic(I))
+ visitInstruction(I);
+ break;
+ }
+ }
+
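// Illustrative standalone sketch, not part of the pass: why the two case
// groups above pass a different /* Variable */ flag. psll/psrl/psra-style
// intrinsics apply one shift count to every lane, while psllv/psrlv/psrav-
// style intrinsics take a per-lane count. All values below are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Lane[4]    = {1, 2, 3, 4};
  uint32_t Uniform    = 3;                // psll.d-style: single count for all lanes
  uint32_t PerLane[4] = {0, 1, 2, 3};     // psllv.d-style: one count per lane
  for (int i = 0; i < 4; ++i)
    std::printf("lane %d: uniform<<%u = %u, variable<<%u = %u\n", i, Uniform,
                Lane[i] << Uniform, PerLane[i], Lane[i] << PerLane[i]);
  return 0;
}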
void visitLibAtomicLoad(CallBase &CB) {
// Since we use getNextNode here, we can't have CB terminate the BB.
assert(isa<CallInst>(CB));
@@ -3577,19 +3577,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Align(1));
}
- void visitCallBase(CallBase &CB) {
- assert(!CB.getMetadata("nosanitize"));
- if (CB.isInlineAsm()) {
- // For inline asm (either a call to asm function, or callbr instruction),
- // do the usual thing: check argument shadow and mark all outputs as
- // clean. Note that any side effects of the inline asm that are not
- // immediately visible in its constraints are not handled.
- if (ClHandleAsmConservative && MS.CompileKernel)
- visitAsmInstruction(CB);
- else
- visitInstruction(CB);
- return;
- }
+ void visitCallBase(CallBase &CB) {
+ assert(!CB.getMetadata("nosanitize"));
+ if (CB.isInlineAsm()) {
+ // For inline asm (either a call to asm function, or callbr instruction),
+ // do the usual thing: check argument shadow and mark all outputs as
+ // clean. Note that any side effects of the inline asm that are not
+ // immediately visible in its constraints are not handled.
+ if (ClHandleAsmConservative && MS.CompileKernel)
+ visitAsmInstruction(CB);
+ else
+ visitInstruction(CB);
+ return;
+ }
LibFunc LF;
if (TLI->getLibFunc(CB, LF)) {
// libatomic.a functions need to have special handling because there isn't
@@ -3612,13 +3612,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- if (auto *Call = dyn_cast<CallInst>(&CB)) {
- assert(!isa<IntrinsicInst>(Call) && "intrinsics are handled elsewhere");
-
- // We are going to insert code that relies on the fact that the callee
- // will become a non-readonly function after it is instrumented by us. To
- // prevent this code from being optimized out, mark that function
- // non-readonly in advance.
+ if (auto *Call = dyn_cast<CallInst>(&CB)) {
+ assert(!isa<IntrinsicInst>(Call) && "intrinsics are handled elsewhere");
+
+ // We are going to insert code that relies on the fact that the callee
+ // will become a non-readonly function after it is instrumented by us. To
+ // prevent this code from being optimized out, mark that function
+ // non-readonly in advance.
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone)
@@ -3627,1693 +3627,1693 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
.addAttribute(Attribute::Speculatable);
Call->removeAttributes(AttributeList::FunctionIndex, B);
- if (Function *Func = Call->getCalledFunction()) {
- Func->removeAttributes(AttributeList::FunctionIndex, B);
- }
-
- maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
- }
- IRBuilder<> IRB(&CB);
+ if (Function *Func = Call->getCalledFunction()) {
+ Func->removeAttributes(AttributeList::FunctionIndex, B);
+ }
+
+ maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
+ }
+ IRBuilder<> IRB(&CB);
bool MayCheckCall = ClEagerChecks;
if (Function *Func = CB.getCalledFunction()) {
// __sanitizer_unaligned_{load,store} functions may be called by users
// and always expects shadows in the TLS. So don't check them.
MayCheckCall &= !Func->getName().startswith("__sanitizer_unaligned_");
}
-
- unsigned ArgOffset = 0;
- LLVM_DEBUG(dbgs() << " CallSite: " << CB << "\n");
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned i = ArgIt - CB.arg_begin();
- if (!A->getType()->isSized()) {
- LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
- continue;
- }
- unsigned Size = 0;
- Value *Store = nullptr;
- // Compute the Shadow for arg even if it is ByVal, because
- // in that case getShadow() will copy the actual arg shadow to
- // __msan_param_tls.
- Value *ArgShadow = getShadow(A);
- Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
- LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
- << " Shadow: " << *ArgShadow << "\n");
- bool ArgIsInitialized = false;
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- bool ByVal = CB.paramHasAttr(i, Attribute::ByVal);
- bool NoUndef = CB.paramHasAttr(i, Attribute::NoUndef);
+
+ unsigned ArgOffset = 0;
+ LLVM_DEBUG(dbgs() << " CallSite: " << CB << "\n");
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned i = ArgIt - CB.arg_begin();
+ if (!A->getType()->isSized()) {
+ LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
+ continue;
+ }
+ unsigned Size = 0;
+ Value *Store = nullptr;
+ // Compute the Shadow for arg even if it is ByVal, because
+ // in that case getShadow() will copy the actual arg shadow to
+ // __msan_param_tls.
+ Value *ArgShadow = getShadow(A);
+ Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+ LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
+ << " Shadow: " << *ArgShadow << "\n");
+ bool ArgIsInitialized = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ bool ByVal = CB.paramHasAttr(i, Attribute::ByVal);
+ bool NoUndef = CB.paramHasAttr(i, Attribute::NoUndef);
bool EagerCheck = MayCheckCall && !ByVal && NoUndef;
-
- if (EagerCheck) {
- insertShadowCheck(A, &CB);
- continue;
- }
- if (ByVal) {
- // ByVal requires some special handling as it's too big for a single
- // load
- assert(A->getType()->isPointerTy() &&
- "ByVal argument is not a pointer!");
- Size = DL.getTypeAllocSize(CB.getParamByValType(i));
- if (ArgOffset + Size > kParamTLSSize) break;
- const MaybeAlign ParamAlignment(CB.getParamAlign(i));
- MaybeAlign Alignment = llvm::None;
- if (ParamAlignment)
- Alignment = std::min(*ParamAlignment, kShadowTLSAlignment);
- Value *AShadowPtr =
- getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ false)
- .first;
-
- Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
- Alignment, Size);
- // TODO(glider): need to copy origins.
- } else {
- // Any other parameters mean we need bit-grained tracking of uninit data
- Size = DL.getTypeAllocSize(A->getType());
- if (ArgOffset + Size > kParamTLSSize) break;
- Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
- kShadowTLSAlignment);
- Constant *Cst = dyn_cast<Constant>(ArgShadow);
- if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
- }
- if (MS.TrackOrigins && !ArgIsInitialized)
- IRB.CreateStore(getOrigin(A),
- getOriginPtrForArgument(A, IRB, ArgOffset));
- (void)Store;
- assert(Size != 0 && Store != nullptr);
- LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
+
+ if (EagerCheck) {
+ insertShadowCheck(A, &CB);
+ continue;
+ }
+ if (ByVal) {
+ // ByVal requires some special handling as it's too big for a single
+ // load
+ assert(A->getType()->isPointerTy() &&
+ "ByVal argument is not a pointer!");
+ Size = DL.getTypeAllocSize(CB.getParamByValType(i));
+ if (ArgOffset + Size > kParamTLSSize) break;
+ const MaybeAlign ParamAlignment(CB.getParamAlign(i));
+ MaybeAlign Alignment = llvm::None;
+ if (ParamAlignment)
+ Alignment = std::min(*ParamAlignment, kShadowTLSAlignment);
+ Value *AShadowPtr =
+ getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ false)
+ .first;
+
+ Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
+ Alignment, Size);
+ // TODO(glider): need to copy origins.
+ } else {
+ // Any other parameters mean we need bit-grained tracking of uninit data
+ Size = DL.getTypeAllocSize(A->getType());
+ if (ArgOffset + Size > kParamTLSSize) break;
+ Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
+ kShadowTLSAlignment);
+ Constant *Cst = dyn_cast<Constant>(ArgShadow);
+ if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
+ }
+ if (MS.TrackOrigins && !ArgIsInitialized)
+ IRB.CreateStore(getOrigin(A),
+ getOriginPtrForArgument(A, IRB, ArgOffset));
+ (void)Store;
+ assert(Size != 0 && Store != nullptr);
+ LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
ArgOffset += alignTo(Size, kShadowTLSAlignment);
- }
- LLVM_DEBUG(dbgs() << " done with call args\n");
-
- FunctionType *FT = CB.getFunctionType();
- if (FT->isVarArg()) {
- VAHelper->visitCallBase(CB, IRB);
- }
-
- // Now, get the shadow for the RetVal.
- if (!CB.getType()->isSized())
- return;
- // Don't emit the epilogue for musttail call returns.
- if (isa<CallInst>(CB) && cast<CallInst>(CB).isMustTailCall())
- return;
-
+ }
+ LLVM_DEBUG(dbgs() << " done with call args\n");
+
+ FunctionType *FT = CB.getFunctionType();
+ if (FT->isVarArg()) {
+ VAHelper->visitCallBase(CB, IRB);
+ }
+
+ // Now, get the shadow for the RetVal.
+ if (!CB.getType()->isSized())
+ return;
+ // Don't emit the epilogue for musttail call returns.
+ if (isa<CallInst>(CB) && cast<CallInst>(CB).isMustTailCall())
+ return;
+
if (MayCheckCall && CB.hasRetAttr(Attribute::NoUndef)) {
- setShadow(&CB, getCleanShadow(&CB));
- setOrigin(&CB, getCleanOrigin());
- return;
- }
-
- IRBuilder<> IRBBefore(&CB);
- // Until we have full dynamic coverage, make sure the retval shadow is 0.
- Value *Base = getShadowPtrForRetval(&CB, IRBBefore);
- IRBBefore.CreateAlignedStore(getCleanShadow(&CB), Base,
- kShadowTLSAlignment);
- BasicBlock::iterator NextInsn;
- if (isa<CallInst>(CB)) {
- NextInsn = ++CB.getIterator();
- assert(NextInsn != CB.getParent()->end());
- } else {
- BasicBlock *NormalDest = cast<InvokeInst>(CB).getNormalDest();
- if (!NormalDest->getSinglePredecessor()) {
- // FIXME: this case is tricky, so we are just conservative here.
- // Perhaps we need to split the edge between this BB and NormalDest,
- // but a naive attempt to use SplitEdge leads to a crash.
- setShadow(&CB, getCleanShadow(&CB));
- setOrigin(&CB, getCleanOrigin());
- return;
- }
- // FIXME: NextInsn is likely in a basic block that has not been visited yet.
- // Anything inserted there will be instrumented by MSan later!
- NextInsn = NormalDest->getFirstInsertionPt();
- assert(NextInsn != NormalDest->end() &&
- "Could not find insertion point for retval shadow load");
- }
- IRBuilder<> IRBAfter(&*NextInsn);
- Value *RetvalShadow = IRBAfter.CreateAlignedLoad(
- getShadowTy(&CB), getShadowPtrForRetval(&CB, IRBAfter),
- kShadowTLSAlignment, "_msret");
- setShadow(&CB, RetvalShadow);
- if (MS.TrackOrigins)
- setOrigin(&CB, IRBAfter.CreateLoad(MS.OriginTy,
- getOriginPtrForRetval(IRBAfter)));
- }
-
- bool isAMustTailRetVal(Value *RetVal) {
- if (auto *I = dyn_cast<BitCastInst>(RetVal)) {
- RetVal = I->getOperand(0);
- }
- if (auto *I = dyn_cast<CallInst>(RetVal)) {
- return I->isMustTailCall();
- }
- return false;
- }
-
- void visitReturnInst(ReturnInst &I) {
- IRBuilder<> IRB(&I);
- Value *RetVal = I.getReturnValue();
- if (!RetVal) return;
- // Don't emit the epilogue for musttail call returns.
- if (isAMustTailRetVal(RetVal)) return;
- Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
- bool HasNoUndef =
- F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef);
- bool StoreShadow = !(ClEagerChecks && HasNoUndef);
- // FIXME: Consider using SpecialCaseList to specify a list of functions that
- // must always return fully initialized values. For now, we hardcode "main".
- bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main");
-
- Value *Shadow = getShadow(RetVal);
- bool StoreOrigin = true;
- if (EagerCheck) {
- insertShadowCheck(RetVal, &I);
- Shadow = getCleanShadow(RetVal);
- StoreOrigin = false;
- }
-
- // The caller may still expect information passed over TLS if we pass our
- // check
- if (StoreShadow) {
- IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
- if (MS.TrackOrigins && StoreOrigin)
- IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
- }
- }
-
- void visitPHINode(PHINode &I) {
- IRBuilder<> IRB(&I);
- if (!PropagateShadow) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- return;
- }
-
- ShadowPHINodes.push_back(&I);
- setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
- "_msphi_s"));
- if (MS.TrackOrigins)
- setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
- "_msphi_o"));
- }
-
- Value *getLocalVarDescription(AllocaInst &I) {
- SmallString<2048> StackDescriptionStorage;
- raw_svector_ostream StackDescription(StackDescriptionStorage);
- // We create a string with a description of the stack allocation and
- // pass it into __msan_set_alloca_origin.
- // It will be printed by the run-time if stack-originated UMR is found.
- // The first 4 bytes of the string are set to '----' and will be replaced
- // by __msan_va_arg_overflow_size_tls at the first call.
- StackDescription << "----" << I.getName() << "@" << F.getName();
- return createPrivateNonConstGlobalForString(*F.getParent(),
- StackDescription.str());
- }
-
- void poisonAllocaUserspace(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
- if (PoisonStack && ClPoisonStackWithCall) {
- IRB.CreateCall(MS.MsanPoisonStackFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
- } else {
- Value *ShadowBase, *OriginBase;
- std::tie(ShadowBase, OriginBase) = getShadowOriginPtr(
- &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true);
-
- Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
- IRB.CreateMemSet(ShadowBase, PoisonValue, Len,
- MaybeAlign(I.getAlignment()));
- }
-
- if (PoisonStack && MS.TrackOrigins) {
- Value *Descr = getLocalVarDescription(I);
- IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
- IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(&F, MS.IntptrTy)});
- }
- }
-
- void poisonAllocaKmsan(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
- Value *Descr = getLocalVarDescription(I);
- if (PoisonStack) {
- IRB.CreateCall(MS.MsanPoisonAllocaFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
- IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())});
- } else {
- IRB.CreateCall(MS.MsanUnpoisonAllocaFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
- }
- }
-
- void instrumentAlloca(AllocaInst &I, Instruction *InsPoint = nullptr) {
- if (!InsPoint)
- InsPoint = &I;
- IRBuilder<> IRB(InsPoint->getNextNode());
- const DataLayout &DL = F.getParent()->getDataLayout();
- uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
- Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
- if (I.isArrayAllocation())
- Len = IRB.CreateMul(Len, I.getArraySize());
-
- if (MS.CompileKernel)
- poisonAllocaKmsan(I, IRB, Len);
- else
- poisonAllocaUserspace(I, IRB, Len);
- }
-
- void visitAllocaInst(AllocaInst &I) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- // We'll get to this alloca later unless it's poisoned at the corresponding
- // llvm.lifetime.start.
- AllocaSet.insert(&I);
- }
-
- void visitSelectInst(SelectInst& I) {
- IRBuilder<> IRB(&I);
- // a = select b, c, d
- Value *B = I.getCondition();
- Value *C = I.getTrueValue();
- Value *D = I.getFalseValue();
- Value *Sb = getShadow(B);
- Value *Sc = getShadow(C);
- Value *Sd = getShadow(D);
-
- // Result shadow if condition shadow is 0.
- Value *Sa0 = IRB.CreateSelect(B, Sc, Sd);
- Value *Sa1;
- if (I.getType()->isAggregateType()) {
- // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
- // an extra "select". This results in much more compact IR.
- // Sa = select Sb, poisoned, (select b, Sc, Sd)
- Sa1 = getPoisonedShadow(getShadowTy(I.getType()));
- } else {
- // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
- // If Sb (condition is poisoned), look for bits in c and d that are equal
- // and both unpoisoned.
- // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd.
-
- // Cast arguments to shadow-compatible type.
- C = CreateAppToShadowCast(IRB, C);
- D = CreateAppToShadowCast(IRB, D);
-
- // Result shadow if condition shadow is 1.
- Sa1 = IRB.CreateOr({IRB.CreateXor(C, D), Sc, Sd});
- }
- Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select");
- setShadow(&I, Sa);
- if (MS.TrackOrigins) {
- // Origins are always i32, so any vector conditions must be flattened.
- // FIXME: consider tracking vector origins for app vectors?
- if (B->getType()->isVectorTy()) {
- Type *FlatTy = getShadowTyNoVec(B->getType());
- B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy),
- ConstantInt::getNullValue(FlatTy));
- Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy),
- ConstantInt::getNullValue(FlatTy));
- }
- // a = select b, c, d
- // Oa = Sb ? Ob : (b ? Oc : Od)
- setOrigin(
- &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()),
- IRB.CreateSelect(B, getOrigin(I.getTrueValue()),
- getOrigin(I.getFalseValue()))));
- }
- }
-
- void visitLandingPadInst(LandingPadInst &I) {
- // Do nothing.
- // See https://github.com/google/sanitizers/issues/504
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitCatchSwitchInst(CatchSwitchInst &I) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitFuncletPadInst(FuncletPadInst &I) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitGetElementPtrInst(GetElementPtrInst &I) {
- handleShadowOr(I);
- }
-
- void visitExtractValueInst(ExtractValueInst &I) {
- IRBuilder<> IRB(&I);
- Value *Agg = I.getAggregateOperand();
- LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
- Value *AggShadow = getShadow(Agg);
- LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
- Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
- LLVM_DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
- setShadow(&I, ResShadow);
- setOriginForNaryOp(I);
- }
-
- void visitInsertValueInst(InsertValueInst &I) {
- IRBuilder<> IRB(&I);
- LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
- Value *AggShadow = getShadow(I.getAggregateOperand());
- Value *InsShadow = getShadow(I.getInsertedValueOperand());
- LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
- LLVM_DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
- Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
- LLVM_DEBUG(dbgs() << " Res: " << *Res << "\n");
- setShadow(&I, Res);
- setOriginForNaryOp(I);
- }
-
- void dumpInst(Instruction &I) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
- } else {
- errs() << "ZZZ " << I.getOpcodeName() << "\n";
- }
- errs() << "QQQ " << I << "\n";
- }
-
- void visitResumeInst(ResumeInst &I) {
- LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
- // Nothing to do here.
- }
-
- void visitCleanupReturnInst(CleanupReturnInst &CRI) {
- LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
- // Nothing to do here.
- }
-
- void visitCatchReturnInst(CatchReturnInst &CRI) {
- LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
- // Nothing to do here.
- }
-
- void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB,
- const DataLayout &DL, bool isOutput) {
- // For each assembly argument, we check its value for being initialized.
- // If the argument is a pointer, we assume it points to a single element
-    // of the corresponding type (or to an 8-byte word, if the type is unsized).
- // Each such pointer is instrumented with a call to the runtime library.
- Type *OpType = Operand->getType();
- // Check the operand value itself.
- insertShadowCheck(Operand, &I);
- if (!OpType->isPointerTy() || !isOutput) {
- assert(!isOutput);
- return;
- }
- Type *ElType = OpType->getPointerElementType();
- if (!ElType->isSized())
- return;
- int Size = DL.getTypeStoreSize(ElType);
- Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy());
- Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
- IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Ptr, SizeVal});
- }
-
- /// Get the number of output arguments returned by pointers.
- int getNumOutputArgs(InlineAsm *IA, CallBase *CB) {
- int NumRetOutputs = 0;
- int NumOutputs = 0;
- Type *RetTy = cast<Value>(CB)->getType();
- if (!RetTy->isVoidTy()) {
- // Register outputs are returned via the CallInst return value.
- auto *ST = dyn_cast<StructType>(RetTy);
- if (ST)
- NumRetOutputs = ST->getNumElements();
- else
- NumRetOutputs = 1;
- }
- InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
- for (size_t i = 0, n = Constraints.size(); i < n; i++) {
- InlineAsm::ConstraintInfo Info = Constraints[i];
- switch (Info.Type) {
- case InlineAsm::isOutput:
- NumOutputs++;
- break;
- default:
- break;
- }
- }
- return NumOutputs - NumRetOutputs;
- }
-
- void visitAsmInstruction(Instruction &I) {
- // Conservative inline assembly handling: check for poisoned shadow of
- // asm() arguments, then unpoison the result and all the memory locations
- // pointed to by those arguments.
- // An inline asm() statement in C++ contains lists of input and output
- // arguments used by the assembly code. These are mapped to operands of the
- // CallInst as follows:
- // - nR register outputs ("=r) are returned by value in a single structure
- // (SSA value of the CallInst);
-    //  - nR register outputs ("=r") are returned by value in a single structure
- // nO operands of the CallInst;
- // - nI inputs ("r", "m" and others) are passed to CallInst as the
- // remaining nI operands.
- // The total number of asm() arguments in the source is nR+nO+nI, and the
- // corresponding CallInst has nO+nI+1 operands (the last operand is the
- // function to be called).
- const DataLayout &DL = F.getParent()->getDataLayout();
- CallBase *CB = cast<CallBase>(&I);
- IRBuilder<> IRB(&I);
- InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
- int OutputArgs = getNumOutputArgs(IA, CB);
- // The last operand of a CallInst is the function itself.
- int NumOperands = CB->getNumOperands() - 1;
-
- // Check input arguments. Doing so before unpoisoning output arguments, so
- // that we won't overwrite uninit values before checking them.
- for (int i = OutputArgs; i < NumOperands; i++) {
- Value *Operand = CB->getOperand(i);
- instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false);
- }
- // Unpoison output arguments. This must happen before the actual InlineAsm
- // call, so that the shadow for memory published in the asm() statement
- // remains valid.
- for (int i = 0; i < OutputArgs; i++) {
- Value *Operand = CB->getOperand(i);
- instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true);
- }
-
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
+ setShadow(&CB, getCleanShadow(&CB));
+ setOrigin(&CB, getCleanOrigin());
+ return;
+ }
+
+ IRBuilder<> IRBBefore(&CB);
+ // Until we have full dynamic coverage, make sure the retval shadow is 0.
+ Value *Base = getShadowPtrForRetval(&CB, IRBBefore);
+ IRBBefore.CreateAlignedStore(getCleanShadow(&CB), Base,
+ kShadowTLSAlignment);
+ BasicBlock::iterator NextInsn;
+ if (isa<CallInst>(CB)) {
+ NextInsn = ++CB.getIterator();
+ assert(NextInsn != CB.getParent()->end());
+ } else {
+ BasicBlock *NormalDest = cast<InvokeInst>(CB).getNormalDest();
+ if (!NormalDest->getSinglePredecessor()) {
+ // FIXME: this case is tricky, so we are just conservative here.
+ // Perhaps we need to split the edge between this BB and NormalDest,
+ // but a naive attempt to use SplitEdge leads to a crash.
+ setShadow(&CB, getCleanShadow(&CB));
+ setOrigin(&CB, getCleanOrigin());
+ return;
+ }
+ // FIXME: NextInsn is likely in a basic block that has not been visited yet.
+ // Anything inserted there will be instrumented by MSan later!
+ NextInsn = NormalDest->getFirstInsertionPt();
+ assert(NextInsn != NormalDest->end() &&
+ "Could not find insertion point for retval shadow load");
+ }
+ IRBuilder<> IRBAfter(&*NextInsn);
+ Value *RetvalShadow = IRBAfter.CreateAlignedLoad(
+ getShadowTy(&CB), getShadowPtrForRetval(&CB, IRBAfter),
+ kShadowTLSAlignment, "_msret");
+ setShadow(&CB, RetvalShadow);
+ if (MS.TrackOrigins)
+ setOrigin(&CB, IRBAfter.CreateLoad(MS.OriginTy,
+ getOriginPtrForRetval(IRBAfter)));
+ }
+
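// Illustrative standalone sketch, not part of the pass: how ArgOffset advances
// in the argument loop above. Argument shadows are stored into __msan_param_tls
// back to back, each slot rounded up to kShadowTLSAlignment (assumed to be 8
// bytes here), and the loop breaks once kParamTLSSize would be exceeded. The
// call signature below is hypothetical.
#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

int main() {
  const uint64_t kShadowTLSAlignment = 8;
  // DL.getTypeAllocSize() for a call f(int8_t, double, __int128):
  const uint64_t Sizes[] = {1, 8, 16};
  uint64_t ArgOffset = 0;
  for (uint64_t Size : Sizes) {
    std::printf("shadow at __msan_param_tls + %llu (%llu bytes)\n",
                (unsigned long long)ArgOffset, (unsigned long long)Size);
    ArgOffset += alignTo(Size, kShadowTLSAlignment);  // 0 -> 8 -> 16 -> 32
  }
  return 0;
}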
+ bool isAMustTailRetVal(Value *RetVal) {
+ if (auto *I = dyn_cast<BitCastInst>(RetVal)) {
+ RetVal = I->getOperand(0);
+ }
+ if (auto *I = dyn_cast<CallInst>(RetVal)) {
+ return I->isMustTailCall();
+ }
+ return false;
+ }
+
+ void visitReturnInst(ReturnInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *RetVal = I.getReturnValue();
+ if (!RetVal) return;
+ // Don't emit the epilogue for musttail call returns.
+ if (isAMustTailRetVal(RetVal)) return;
+ Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+ bool HasNoUndef =
+ F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef);
+ bool StoreShadow = !(ClEagerChecks && HasNoUndef);
+ // FIXME: Consider using SpecialCaseList to specify a list of functions that
+ // must always return fully initialized values. For now, we hardcode "main".
+ bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main");
+
+ Value *Shadow = getShadow(RetVal);
+ bool StoreOrigin = true;
+ if (EagerCheck) {
+ insertShadowCheck(RetVal, &I);
+ Shadow = getCleanShadow(RetVal);
+ StoreOrigin = false;
+ }
+
+ // The caller may still expect information passed over TLS if we pass our
+ // check
+ if (StoreShadow) {
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ if (MS.TrackOrigins && StoreOrigin)
+ IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
+ }
+ }
+
+ void visitPHINode(PHINode &I) {
+ IRBuilder<> IRB(&I);
+ if (!PropagateShadow) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ return;
+ }
+
+ ShadowPHINodes.push_back(&I);
+ setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
+ "_msphi_s"));
+ if (MS.TrackOrigins)
+ setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
+ "_msphi_o"));
+ }
+
+ Value *getLocalVarDescription(AllocaInst &I) {
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ // We create a string with a description of the stack allocation and
+ // pass it into __msan_set_alloca_origin.
+ // It will be printed by the run-time if stack-originated UMR is found.
+ // The first 4 bytes of the string are set to '----' and will be replaced
+ // by __msan_va_arg_overflow_size_tls at the first call.
+ StackDescription << "----" << I.getName() << "@" << F.getName();
+ return createPrivateNonConstGlobalForString(*F.getParent(),
+ StackDescription.str());
+ }
+
+ void poisonAllocaUserspace(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
+ if (PoisonStack && ClPoisonStackWithCall) {
+ IRB.CreateCall(MS.MsanPoisonStackFn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
+ } else {
+ Value *ShadowBase, *OriginBase;
+ std::tie(ShadowBase, OriginBase) = getShadowOriginPtr(
+ &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true);
+
+ Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
+ IRB.CreateMemSet(ShadowBase, PoisonValue, Len,
+ MaybeAlign(I.getAlignment()));
+ }
+
+ if (PoisonStack && MS.TrackOrigins) {
+ Value *Descr = getLocalVarDescription(I);
+ IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
+ IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(&F, MS.IntptrTy)});
+ }
+ }
+
+ void poisonAllocaKmsan(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
+ Value *Descr = getLocalVarDescription(I);
+ if (PoisonStack) {
+ IRB.CreateCall(MS.MsanPoisonAllocaFn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
+ IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())});
+ } else {
+ IRB.CreateCall(MS.MsanUnpoisonAllocaFn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
+ }
+ }
+
+ void instrumentAlloca(AllocaInst &I, Instruction *InsPoint = nullptr) {
+ if (!InsPoint)
+ InsPoint = &I;
+ IRBuilder<> IRB(InsPoint->getNextNode());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
+ Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
+ if (I.isArrayAllocation())
+ Len = IRB.CreateMul(Len, I.getArraySize());
+
+ if (MS.CompileKernel)
+ poisonAllocaKmsan(I, IRB, Len);
+ else
+ poisonAllocaUserspace(I, IRB, Len);
+ }
+
+ void visitAllocaInst(AllocaInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ // We'll get to this alloca later unless it's poisoned at the corresponding
+ // llvm.lifetime.start.
+ AllocaSet.insert(&I);
+ }
+
+ void visitSelectInst(SelectInst& I) {
+ IRBuilder<> IRB(&I);
+ // a = select b, c, d
+ Value *B = I.getCondition();
+ Value *C = I.getTrueValue();
+ Value *D = I.getFalseValue();
+ Value *Sb = getShadow(B);
+ Value *Sc = getShadow(C);
+ Value *Sd = getShadow(D);
+
+ // Result shadow if condition shadow is 0.
+ Value *Sa0 = IRB.CreateSelect(B, Sc, Sd);
+ Value *Sa1;
+ if (I.getType()->isAggregateType()) {
+ // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
+ // an extra "select". This results in much more compact IR.
+ // Sa = select Sb, poisoned, (select b, Sc, Sd)
+ Sa1 = getPoisonedShadow(getShadowTy(I.getType()));
+ } else {
+ // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
+ // If Sb (condition is poisoned), look for bits in c and d that are equal
+ // and both unpoisoned.
+ // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd.
+
+ // Cast arguments to shadow-compatible type.
+ C = CreateAppToShadowCast(IRB, C);
+ D = CreateAppToShadowCast(IRB, D);
+
+ // Result shadow if condition shadow is 1.
+ Sa1 = IRB.CreateOr({IRB.CreateXor(C, D), Sc, Sd});
+ }
+ Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select");
+ setShadow(&I, Sa);
+ if (MS.TrackOrigins) {
+ // Origins are always i32, so any vector conditions must be flattened.
+ // FIXME: consider tracking vector origins for app vectors?
+ if (B->getType()->isVectorTy()) {
+ Type *FlatTy = getShadowTyNoVec(B->getType());
+ B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy),
+ ConstantInt::getNullValue(FlatTy));
+ Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy),
+ ConstantInt::getNullValue(FlatTy));
+ }
+ // a = select b, c, d
+ // Oa = Sb ? Ob : (b ? Oc : Od)
+ setOrigin(
+ &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()),
+ IRB.CreateSelect(B, getOrigin(I.getTrueValue()),
+ getOrigin(I.getFalseValue()))));
+ }
+ }
+
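// Illustrative standalone sketch, not part of the pass, of the non-aggregate
// rule above,
//   Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
// on plain 8-bit shadow words, where a set shadow bit means "uninitialized".
// All names and values are hypothetical.
#include <cstdint>
#include <cstdio>

uint8_t selectShadow(bool B, uint8_t Sb, uint8_t C, uint8_t Sc, uint8_t D,
                     uint8_t Sd) {
  uint8_t Sa0 = B ? Sc : Sd;                   // condition shadow is clean
  uint8_t Sa1 = (uint8_t)((C ^ D) | Sc | Sd);  // condition shadow is poisoned
  return Sb ? Sa1 : Sa0;
}

int main() {
  // Clean condition: the result simply inherits the chosen operand's shadow.
  std::printf("%02x\n",
              (unsigned)selectShadow(true, 0x00, 0xAB, 0x0F, 0xCD, 0xF0)); // 0f
  // Poisoned condition: only bits that are equal and initialized in both
  // operands stay clean.
  std::printf("%02x\n",
              (unsigned)selectShadow(true, 0xFF, 0xAB, 0x00, 0xA8, 0x00)); // 03
  return 0;
}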
+ void visitLandingPadInst(LandingPadInst &I) {
+ // Do nothing.
+ // See https://github.com/google/sanitizers/issues/504
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitCatchSwitchInst(CatchSwitchInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitFuncletPadInst(FuncletPadInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &I) {
+ handleShadowOr(I);
+ }
+
+ void visitExtractValueInst(ExtractValueInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Agg = I.getAggregateOperand();
+ LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
+ Value *AggShadow = getShadow(Agg);
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
+ LLVM_DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
+ setShadow(&I, ResShadow);
+ setOriginForNaryOp(I);
+ }
+
+ void visitInsertValueInst(InsertValueInst &I) {
+ IRBuilder<> IRB(&I);
+ LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
+ Value *AggShadow = getShadow(I.getAggregateOperand());
+ Value *InsShadow = getShadow(I.getInsertedValueOperand());
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ LLVM_DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
+ Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
+ LLVM_DEBUG(dbgs() << " Res: " << *Res << "\n");
+ setShadow(&I, Res);
+ setOriginForNaryOp(I);
+ }
+
+ void dumpInst(Instruction &I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
+ } else {
+ errs() << "ZZZ " << I.getOpcodeName() << "\n";
+ }
+ errs() << "QQQ " << I << "\n";
+ }
+
+ void visitResumeInst(ResumeInst &I) {
+ LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
+ // Nothing to do here.
+ }
+
+ void visitCleanupReturnInst(CleanupReturnInst &CRI) {
+ LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
+ // Nothing to do here.
+ }
+
+ void visitCatchReturnInst(CatchReturnInst &CRI) {
+ LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
+ // Nothing to do here.
+ }
+
+ void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB,
+ const DataLayout &DL, bool isOutput) {
+ // For each assembly argument, we check its value for being initialized.
+ // If the argument is a pointer, we assume it points to a single element
+    // of the corresponding type (or to an 8-byte word, if the type is unsized).
+ // Each such pointer is instrumented with a call to the runtime library.
+ Type *OpType = Operand->getType();
+ // Check the operand value itself.
+ insertShadowCheck(Operand, &I);
+ if (!OpType->isPointerTy() || !isOutput) {
+ assert(!isOutput);
+ return;
+ }
+ Type *ElType = OpType->getPointerElementType();
+ if (!ElType->isSized())
+ return;
+ int Size = DL.getTypeStoreSize(ElType);
+ Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy());
+ Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
+ IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Ptr, SizeVal});
+ }
+
+ /// Get the number of output arguments returned by pointers.
+ int getNumOutputArgs(InlineAsm *IA, CallBase *CB) {
+ int NumRetOutputs = 0;
+ int NumOutputs = 0;
+ Type *RetTy = cast<Value>(CB)->getType();
+ if (!RetTy->isVoidTy()) {
+ // Register outputs are returned via the CallInst return value.
+ auto *ST = dyn_cast<StructType>(RetTy);
+ if (ST)
+ NumRetOutputs = ST->getNumElements();
+ else
+ NumRetOutputs = 1;
+ }
+ InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+ for (size_t i = 0, n = Constraints.size(); i < n; i++) {
+ InlineAsm::ConstraintInfo Info = Constraints[i];
+ switch (Info.Type) {
+ case InlineAsm::isOutput:
+ NumOutputs++;
+ break;
+ default:
+ break;
+ }
+ }
+ return NumOutputs - NumRetOutputs;
+ }
+
+ void visitAsmInstruction(Instruction &I) {
+ // Conservative inline assembly handling: check for poisoned shadow of
+ // asm() arguments, then unpoison the result and all the memory locations
+ // pointed to by those arguments.
+ // An inline asm() statement in C++ contains lists of input and output
+ // arguments used by the assembly code. These are mapped to operands of the
+ // CallInst as follows:
+ // - nR register outputs ("=r) are returned by value in a single structure
+ // (SSA value of the CallInst);
+    //  - nR register outputs ("=r") are returned by value in a single structure
+ // nO operands of the CallInst;
+ // - nI inputs ("r", "m" and others) are passed to CallInst as the
+ // remaining nI operands.
+ // The total number of asm() arguments in the source is nR+nO+nI, and the
+ // corresponding CallInst has nO+nI+1 operands (the last operand is the
+ // function to be called).
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ CallBase *CB = cast<CallBase>(&I);
+ IRBuilder<> IRB(&I);
+ InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+ int OutputArgs = getNumOutputArgs(IA, CB);
+ // The last operand of a CallInst is the function itself.
+ int NumOperands = CB->getNumOperands() - 1;
+
+ // Check input arguments. Doing so before unpoisoning output arguments, so
+ // that we won't overwrite uninit values before checking them.
+ for (int i = OutputArgs; i < NumOperands; i++) {
+ Value *Operand = CB->getOperand(i);
+ instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false);
+ }
+ // Unpoison output arguments. This must happen before the actual InlineAsm
+ // call, so that the shadow for memory published in the asm() statement
+ // remains valid.
+ for (int i = 0; i < OutputArgs; i++) {
+ Value *Operand = CB->getOperand(i);
+ instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true);
+ }
+
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
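// Illustrative standalone sketch, not part of the pass, of the operand
// bookkeeping described in the comment above: for an asm() with nR register
// outputs, nO other outputs and nI inputs, the CallInst carries nO + nI + 1
// operands and getNumOutputArgs() yields nO. The counts below are hypothetical.
#include <cassert>

int main() {
  int nR = 1, nO = 1, nI = 2;        // e.g. "=r", "=m", and two inputs
  int NumRetOutputs = nR;            // register outputs come back by value
  int NumOutputs = nR + nO;          // every constraint of kind isOutput
  int NumCallOperands = nO + nI + 1; // the last operand is the asm callee
  assert(NumOutputs - NumRetOutputs == nO);
  assert(NumCallOperands == 4);
  return 0;
}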
void visitFreezeInst(FreezeInst &I) {
// Freeze always returns a fully defined value.
setShadow(&I, getCleanShadow(&I));
setOrigin(&I, getCleanOrigin());
}
- void visitInstruction(Instruction &I) {
- // Everything else: stop propagating and check for poisoned shadow.
- if (ClDumpStrictInstructions)
- dumpInst(I);
- LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
- for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
- Value *Operand = I.getOperand(i);
- if (Operand->getType()->isSized())
- insertShadowCheck(Operand, &I);
- }
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-};
-
-/// AMD64-specific implementation of VarArgHelper.
-struct VarArgAMD64Helper : public VarArgHelper {
- // An unfortunate workaround for asymmetric lowering of va_arg stuff.
- // See a comment in visitCallBase for more details.
- static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
- static const unsigned AMD64FpEndOffsetSSE = 176;
- // If SSE is disabled, fp_offset in va_list is zero.
- static const unsigned AMD64FpEndOffsetNoSSE = AMD64GpEndOffset;
-
- unsigned AMD64FpEndOffset;
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgTLSOriginCopy = nullptr;
- Value *VAArgOverflowSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
-
- VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV)
- : F(F), MS(MS), MSV(MSV) {
- AMD64FpEndOffset = AMD64FpEndOffsetSSE;
- for (const auto &Attr : F.getAttributes().getFnAttributes()) {
- if (Attr.isStringAttribute() &&
- (Attr.getKindAsString() == "target-features")) {
- if (Attr.getValueAsString().contains("-sse"))
- AMD64FpEndOffset = AMD64FpEndOffsetNoSSE;
- break;
- }
- }
- }
-
- ArgKind classifyArgument(Value* arg) {
- // A very rough approximation of X86_64 argument classification rules.
- Type *T = arg->getType();
- if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
- return AK_FloatingPoint;
- if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
- return AK_GeneralPurpose;
- if (T->isPointerTy())
- return AK_GeneralPurpose;
- return AK_Memory;
- }
-
- // For VarArg functions, store the argument shadow in an ABI-specific format
- // that corresponds to va_list layout.
- // We do this because Clang lowers va_arg in the frontend, and this pass
- // only sees the low level code that deals with va_list internals.
- // A much easier alternative (provided that Clang emits va_arg instructions)
- // would have been to associate each live instance of va_list with a copy of
- // MSanParamTLS, and extract shadow on va_arg() call in the argument list
- // order.
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- unsigned GpOffset = 0;
- unsigned FpOffset = AMD64GpEndOffset;
- unsigned OverflowOffset = AMD64FpEndOffset;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
- if (IsByVal) {
- // ByVal arguments always go to the overflow area.
- // Fixed arguments passed through the overflow area will be stepped
- // over by va_start, so don't count them towards the offset.
- if (IsFixed)
- continue;
- assert(A->getType()->isPointerTy());
- Type *RealTy = CB.getParamByValType(ArgNo);
- uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
- Value *ShadowBase = getShadowPtrForVAArgument(
- RealTy, IRB, OverflowOffset, alignTo(ArgSize, 8));
- Value *OriginBase = nullptr;
- if (MS.TrackOrigins)
- OriginBase = getOriginPtrForVAArgument(RealTy, IRB, OverflowOffset);
- OverflowOffset += alignTo(ArgSize, 8);
- if (!ShadowBase)
- continue;
- Value *ShadowPtr, *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment,
- /*isStore*/ false);
-
- IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment, ShadowPtr,
- kShadowTLSAlignment, ArgSize);
- if (MS.TrackOrigins)
- IRB.CreateMemCpy(OriginBase, kShadowTLSAlignment, OriginPtr,
- kShadowTLSAlignment, ArgSize);
- } else {
- ArgKind AK = classifyArgument(A);
- if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
- AK = AK_Memory;
- if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
- AK = AK_Memory;
- Value *ShadowBase, *OriginBase = nullptr;
- switch (AK) {
- case AK_GeneralPurpose:
- ShadowBase =
- getShadowPtrForVAArgument(A->getType(), IRB, GpOffset, 8);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(A->getType(), IRB, GpOffset);
- GpOffset += 8;
- break;
- case AK_FloatingPoint:
- ShadowBase =
- getShadowPtrForVAArgument(A->getType(), IRB, FpOffset, 16);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(A->getType(), IRB, FpOffset);
- FpOffset += 16;
- break;
- case AK_Memory:
- if (IsFixed)
- continue;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- ShadowBase =
- getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset, 8);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(A->getType(), IRB, OverflowOffset);
- OverflowOffset += alignTo(ArgSize, 8);
- }
- // Take fixed arguments into account for GpOffset and FpOffset,
- // but don't actually store shadows for them.
- // TODO(glider): don't call get*PtrForVAArgument() for them.
- if (IsFixed)
- continue;
- if (!ShadowBase)
- continue;
- Value *Shadow = MSV.getShadow(A);
- IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment);
- if (MS.TrackOrigins) {
- Value *Origin = MSV.getOrigin(A);
- unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
- MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
- std::max(kShadowTLSAlignment, kMinOriginAlignment));
- }
- }
- }
- Constant *OverflowSize =
- ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
- IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg_va_s");
- }
-
- /// Compute the origin address for a given va_arg.
- Value *getOriginPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, int ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
- // getOriginPtrForVAArgument() is always called after
- // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never
- // overflow.
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
- "_msarg_va_o");
- }
-
- void unpoisonVAListTagForInst(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ true);
-
- // Unpoison the whole __va_list_tag.
- // FIXME: magic ABI constants.
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 24, Alignment, false);
- // We shouldn't need to zero out the origins, as they're only checked for
- // nonzero shadow.
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- if (F.getCallingConv() == CallingConv::Win64)
- return;
- VAStartInstrumentationList.push_back(&I);
- unpoisonVAListTagForInst(I);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- if (F.getCallingConv() == CallingConv::Win64) return;
- unpoisonVAListTagForInst(I);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgOverflowSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
+ void visitInstruction(Instruction &I) {
+ // Everything else: stop propagating and check for poisoned shadow.
+ if (ClDumpStrictInstructions)
+ dumpInst(I);
+ LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+ for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
+ Value *Operand = I.getOperand(i);
+ if (Operand->getType()->isSized())
+ insertShadowCheck(Operand, &I);
+ }
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+};
+
+/// AMD64-specific implementation of VarArgHelper.
+struct VarArgAMD64Helper : public VarArgHelper {
+ // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+ // See a comment in visitCallBase for more details.
+ static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
+ static const unsigned AMD64FpEndOffsetSSE = 176;
+ // If SSE is disabled, fp_offset in va_list is zero.
+ static const unsigned AMD64FpEndOffsetNoSSE = AMD64GpEndOffset;
+
+ unsigned AMD64FpEndOffset;
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgTLSOriginCopy = nullptr;
+ Value *VAArgOverflowSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+ VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV) {
+ AMD64FpEndOffset = AMD64FpEndOffsetSSE;
+ for (const auto &Attr : F.getAttributes().getFnAttributes()) {
+ if (Attr.isStringAttribute() &&
+ (Attr.getKindAsString() == "target-features")) {
+ if (Attr.getValueAsString().contains("-sse"))
+ AMD64FpEndOffset = AMD64FpEndOffsetNoSSE;
+ break;
+ }
+ }
+ }
+
+ ArgKind classifyArgument(Value* arg) {
+ // A very rough approximation of X86_64 argument classification rules.
+ Type *T = arg->getType();
+ if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+ return AK_FloatingPoint;
+ if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+ return AK_GeneralPurpose;
+ if (T->isPointerTy())
+ return AK_GeneralPurpose;
+ return AK_Memory;
+ }
+
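// Illustrative standalone sketch, not part of the pass: the rough
// classification above applied to a few example C types. The helper mirrors
// the three checks with plain flags instead of llvm::Type queries; everything
// here is hypothetical.
#include <cstdio>

enum Kind { GeneralPurpose, FloatingPoint, Memory };

Kind classify(bool FPOrMMX, bool Integer, unsigned Bits, bool Pointer) {
  if (FPOrMMX) return FloatingPoint;
  if (Integer && Bits <= 64) return GeneralPurpose;
  if (Pointer) return GeneralPurpose;
  return Memory;
}

int main() {
  std::printf("double   -> %d\n", classify(true,  false, 64,  false)); // FloatingPoint
  std::printf("int      -> %d\n", classify(false, true,  32,  false)); // GeneralPurpose
  std::printf("void*    -> %d\n", classify(false, false, 64,  true));  // GeneralPurpose
  std::printf("__int128 -> %d\n", classify(false, true,  128, false)); // Memory
  return 0;
}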
+ // For VarArg functions, store the argument shadow in an ABI-specific format
+ // that corresponds to va_list layout.
+ // We do this because Clang lowers va_arg in the frontend, and this pass
+ // only sees the low level code that deals with va_list internals.
+ // A much easier alternative (provided that Clang emits va_arg instructions)
+ // would have been to associate each live instance of va_list with a copy of
+ // MSanParamTLS, and extract shadow on va_arg() call in the argument list
+ // order.
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ unsigned GpOffset = 0;
+ unsigned FpOffset = AMD64GpEndOffset;
+ unsigned OverflowOffset = AMD64FpEndOffset;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
+ if (IsByVal) {
+ // ByVal arguments always go to the overflow area.
+ // Fixed arguments passed through the overflow area will be stepped
+ // over by va_start, so don't count them towards the offset.
+ if (IsFixed)
+ continue;
+ assert(A->getType()->isPointerTy());
+ Type *RealTy = CB.getParamByValType(ArgNo);
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+ Value *ShadowBase = getShadowPtrForVAArgument(
+ RealTy, IRB, OverflowOffset, alignTo(ArgSize, 8));
+ Value *OriginBase = nullptr;
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(RealTy, IRB, OverflowOffset);
+ OverflowOffset += alignTo(ArgSize, 8);
+ if (!ShadowBase)
+ continue;
+ Value *ShadowPtr, *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment,
+ /*isStore*/ false);
+
+ IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment, ShadowPtr,
+ kShadowTLSAlignment, ArgSize);
+ if (MS.TrackOrigins)
+ IRB.CreateMemCpy(OriginBase, kShadowTLSAlignment, OriginPtr,
+ kShadowTLSAlignment, ArgSize);
+ } else {
+ ArgKind AK = classifyArgument(A);
+ if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
+ AK = AK_Memory;
+ if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
+ AK = AK_Memory;
+ Value *ShadowBase, *OriginBase = nullptr;
+ switch (AK) {
+ case AK_GeneralPurpose:
+ ShadowBase =
+ getShadowPtrForVAArgument(A->getType(), IRB, GpOffset, 8);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(A->getType(), IRB, GpOffset);
+ GpOffset += 8;
+ break;
+ case AK_FloatingPoint:
+ ShadowBase =
+ getShadowPtrForVAArgument(A->getType(), IRB, FpOffset, 16);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(A->getType(), IRB, FpOffset);
+ FpOffset += 16;
+ break;
+ case AK_Memory:
+ if (IsFixed)
+ continue;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ ShadowBase =
+ getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset, 8);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(A->getType(), IRB, OverflowOffset);
+ OverflowOffset += alignTo(ArgSize, 8);
+ }
+ // Take fixed arguments into account for GpOffset and FpOffset,
+ // but don't actually store shadows for them.
+ // TODO(glider): don't call get*PtrForVAArgument() for them.
+ if (IsFixed)
+ continue;
+ if (!ShadowBase)
+ continue;
+ Value *Shadow = MSV.getShadow(A);
+ IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment);
+ if (MS.TrackOrigins) {
+ Value *Origin = MSV.getOrigin(A);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
+ std::max(kShadowTLSAlignment, kMinOriginAlignment));
+ }
+ }
+ }
+ Constant *OverflowSize =
+ ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg_va_s");
+ }
+
+ /// Compute the origin address for a given va_arg.
+ Value *getOriginPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
+ // getOriginPtrForVAArgument() is always called after
+ // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never
+ // overflow.
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_va_o");
+ }
+
+ void unpoisonVAListTagForInst(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
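+    // For reference, the SysV AMD64 va_list tag is assumed to be 24 bytes:
+    //   unsigned gp_offset;        // offset 0
+    //   unsigned fp_offset;        // offset 4
+    //   void *overflow_arg_area;   // offset 8
+    //   void *reg_save_area;       // offset 16
+    // which is also where the offsets 8 and 16 used in
+    // finalizeInstrumentation() below come from.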
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 24, Alignment, false);
+ // We shouldn't need to zero out the origins, as they're only checked for
+ // nonzero shadow.
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ if (F.getCallingConv() == CallingConv::Win64)
+ return;
+ VAStartInstrumentationList.push_back(&I);
+ unpoisonVAListTagForInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ if (F.getCallingConv() == CallingConv::Win64) return;
+ unpoisonVAListTagForInst(I);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgOverflowSize =
- IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize =
- IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
- VAArgOverflowSize);
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- if (MS.TrackOrigins) {
- VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
- Align(8), CopySize);
- }
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
-
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, 16)),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr =
- IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(16);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- AMD64FpEndOffset);
- if (MS.TrackOrigins)
- IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
- Alignment, AMD64FpEndOffset);
- Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, 8)),
- PointerType::get(OverflowArgAreaPtrTy, 0));
- Value *OverflowArgAreaPtr =
- IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
- Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
- std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
- MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
- AMD64FpEndOffset);
- IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- if (MS.TrackOrigins) {
- SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
- AMD64FpEndOffset);
- IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- }
- }
- }
-};
-
-/// MIPS64-specific implementation of VarArgHelper.
-struct VarArgMIPS64Helper : public VarArgHelper {
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- unsigned VAArgOffset = 0;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin() + CB.getFunctionType()->getNumParams(),
- End = CB.arg_end();
- ArgIt != End; ++ArgIt) {
- Triple TargetTriple(F.getParent()->getTargetTriple());
- Value *A = *ArgIt;
- Value *Base;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- if (TargetTriple.getArch() == Triple::mips64) {
-        // Adjust the shadow for arguments with size < 8 to match the
-        // placement of bits in a big-endian system.
- if (ArgSize < 8)
- VAArgOffset += (8 - ArgSize);
- }
- Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset, ArgSize);
- VAArgOffset += ArgSize;
- VAArgOffset = alignTo(VAArgOffset, 8);
- if (!Base)
- continue;
- IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
- }
-
- Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
-    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
-    // new class member, i.e. it holds the total size of all varargs.
- IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg");
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
+ VAArgOverflowSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ if (MS.TrackOrigins) {
+ VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
+ Align(8), CopySize);
+ }
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 16)),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr =
+ IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(16);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ AMD64FpEndOffset);
+ if (MS.TrackOrigins)
+ IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
+ Alignment, AMD64FpEndOffset);
+ Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 8)),
+ PointerType::get(OverflowArgAreaPtrTy, 0));
+ Value *OverflowArgAreaPtr =
+ IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
+ Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
+ std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
+ MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+ AMD64FpEndOffset);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ if (MS.TrackOrigins) {
+ SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
+ AMD64FpEndOffset);
+ IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ }
+ }
+ }
+};
+
+/// MIPS64-specific implementation of VarArgHelper.
+struct VarArgMIPS64Helper : public VarArgHelper {
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ unsigned VAArgOffset = 0;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin() + CB.getFunctionType()->getNumParams(),
+ End = CB.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Triple TargetTriple(F.getParent()->getTargetTriple());
+ Value *A = *ArgIt;
+ Value *Base;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ if (TargetTriple.getArch() == Triple::mips64) {
+        // Adjust the shadow for arguments with size < 8 to match the
+        // placement of bits in a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset, ArgSize);
+ VAArgOffset += ArgSize;
+ VAArgOffset = alignTo(VAArgOffset, 8);
+ if (!Base)
+ continue;
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+
+ Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
+    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
+    // new class member, i.e. it holds the total size of all varargs.
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
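+  // The MIPS64 va_list tag is assumed to be a single pointer into the
+  // argument save area, which is why visitVAStartInst() unpoisons only
+  // 8 bytes and finalizeInstrumentation() reads the save-area pointer at
+  // offset 0 of the tag.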
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
- VAArgSize);
-
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr =
- IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- CopySize);
- }
- }
-};
-
-/// AArch64-specific implementation of VarArgHelper.
-struct VarArgAArch64Helper : public VarArgHelper {
- static const unsigned kAArch64GrArgSize = 64;
- static const unsigned kAArch64VrArgSize = 128;
-
- static const unsigned AArch64GrBegOffset = 0;
- static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
- // Make VR space aligned to 16 bytes.
- static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
- static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
- + kAArch64VrArgSize;
- static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
-
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgOverflowSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
-
- VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
-
- ArgKind classifyArgument(Value* arg) {
- Type *T = arg->getType();
- if (T->isFPOrFPVectorTy())
- return AK_FloatingPoint;
- if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
- || (T->isPointerTy()))
- return AK_GeneralPurpose;
- return AK_Memory;
- }
-
-  // The instrumentation stores the argument shadow in a non-ABI-specific
-  // format because it does not know which arguments are named (since, as in
-  // the x86_64 case, Clang lowers va_arg in the frontend and this pass only
-  // sees the low-level code that deals with va_list internals).
-  // The general-purpose registers are saved in the first kAArch64GrArgSize
-  // bytes of the va_arg TLS array, followed by the FP/SIMD registers, and
-  // then the remaining arguments.
- // Using constant offset within the va_arg TLS array allows fast copy
- // in the finalize instrumentation.
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- unsigned GrOffset = AArch64GrBegOffset;
- unsigned VrOffset = AArch64VrBegOffset;
- unsigned OverflowOffset = AArch64VAEndOffset;
-
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- ArgKind AK = classifyArgument(A);
- if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
- AK = AK_Memory;
- if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset)
- AK = AK_Memory;
- Value *Base;
- switch (AK) {
- case AK_GeneralPurpose:
- Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset, 8);
- GrOffset += 8;
- break;
- case AK_FloatingPoint:
- Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset, 8);
- VrOffset += 16;
- break;
- case AK_Memory:
- // Don't count fixed arguments in the overflow area - va_start will
- // skip right over them.
- if (IsFixed)
- continue;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset,
- alignTo(ArgSize, 8));
- OverflowOffset += alignTo(ArgSize, 8);
- break;
- }
- // Count Gp/Vr fixed arguments to their respective offsets, but don't
- // bother to actually store a shadow.
- if (IsFixed)
- continue;
- if (!Base)
- continue;
- IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
- }
- Constant *OverflowSize =
- ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
- IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg");
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 32, Alignment, false);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 32, Alignment, false);
- }
-
- // Retrieve a va_list field of 'void*' size.
- Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) {
- Value *SaveAreaPtrPtr =
- IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, offset)),
- Type::getInt64PtrTy(*MS.C));
- return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr);
- }
-
- // Retrieve a va_list field of 'int' size.
- Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) {
- Value *SaveAreaPtr =
- IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, offset)),
- Type::getInt32PtrTy(*MS.C));
- Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr);
- return IRB.CreateSExt(SaveArea32, MS.IntptrTy);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgOverflowSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
+ VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+ VAArgSize);
+
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr =
+ IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
+ }
+ }
+};
+
+/// AArch64-specific implementation of VarArgHelper.
+struct VarArgAArch64Helper : public VarArgHelper {
+ static const unsigned kAArch64GrArgSize = 64;
+ static const unsigned kAArch64VrArgSize = 128;
+
+ static const unsigned AArch64GrBegOffset = 0;
+ static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
+ // Make VR space aligned to 16 bytes.
+ static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
+ static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
+ + kAArch64VrArgSize;
+ static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
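+  // These sizes are assumed to follow AAPCS64: eight 64-bit general-purpose
+  // argument registers (x0-x7) give 8 * 8 = 64 bytes, and eight 128-bit
+  // FP/SIMD argument registers (v0-v7) give 8 * 16 = 128 bytes.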
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgOverflowSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+ VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+ ArgKind classifyArgument(Value* arg) {
+ Type *T = arg->getType();
+ if (T->isFPOrFPVectorTy())
+ return AK_FloatingPoint;
+ if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+ || (T->isPointerTy()))
+ return AK_GeneralPurpose;
+ return AK_Memory;
+ }
+
+  // The instrumentation stores the argument shadow in a non-ABI-specific
+  // format because it does not know which arguments are named (since, as in
+  // the x86_64 case, Clang lowers va_arg in the frontend and this pass only
+  // sees the low-level code that deals with va_list internals).
+  // The general-purpose registers are saved in the first kAArch64GrArgSize
+  // bytes of the va_arg TLS array, followed by the FP/SIMD registers, and
+  // then the remaining arguments.
+ // Using constant offset within the va_arg TLS array allows fast copy
+ // in the finalize instrumentation.
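+  // The resulting __msan_va_arg_tls layout is roughly: GR shadow in
+  // [AArch64GrBegOffset, AArch64GrEndOffset), VR shadow in
+  // [AArch64VrBegOffset, AArch64VrEndOffset), and the overflow-area shadow
+  // starting at AArch64VAEndOffset.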
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ unsigned GrOffset = AArch64GrBegOffset;
+ unsigned VrOffset = AArch64VrBegOffset;
+ unsigned OverflowOffset = AArch64VAEndOffset;
+
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ ArgKind AK = classifyArgument(A);
+ if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
+ AK = AK_Memory;
+ if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset)
+ AK = AK_Memory;
+ Value *Base;
+ switch (AK) {
+ case AK_GeneralPurpose:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset, 8);
+ GrOffset += 8;
+ break;
+ case AK_FloatingPoint:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset, 8);
+ VrOffset += 16;
+ break;
+ case AK_Memory:
+ // Don't count fixed arguments in the overflow area - va_start will
+ // skip right over them.
+ if (IsFixed)
+ continue;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset,
+ alignTo(ArgSize, 8));
+ OverflowOffset += alignTo(ArgSize, 8);
+ break;
+ }
+ // Count Gp/Vr fixed arguments to their respective offsets, but don't
+ // bother to actually store a shadow.
+ if (IsFixed)
+ continue;
+ if (!Base)
+ continue;
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ Constant *OverflowSize =
+ ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 32, Alignment, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 32, Alignment, false);
+ }
+
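+  // The accessors below assume the AAPCS64 va_list layout:
+  //   void *__stack;    // offset 0
+  //   void *__gr_top;   // offset 8
+  //   void *__vr_top;   // offset 16
+  //   int   __gr_offs;  // offset 24
+  //   int   __vr_offs;  // offset 28
+  // i.e. 32 bytes in total, matching the memset size in visitVAStartInst().
+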
+ // Retrieve a va_list field of 'void*' size.
+ Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+ Value *SaveAreaPtrPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, offset)),
+ Type::getInt64PtrTy(*MS.C));
+ return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr);
+ }
+
+ // Retrieve a va_list field of 'int' size.
+ Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+ Value *SaveAreaPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, offset)),
+ Type::getInt32PtrTy(*MS.C));
+ Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr);
+ return IRB.CreateSExt(SaveArea32, MS.IntptrTy);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgOverflowSize =
- IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize =
- IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
- VAArgOverflowSize);
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- }
-
- Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
- Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize);
-
- // Instrument va_start, copy va_list shadow from the backup copy of
- // the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
-
- Value *VAListTag = OrigInst->getArgOperand(0);
-
-      // The variadic ABI for AArch64 creates two areas for saving the
-      // incoming argument registers (one for the 64-bit general-purpose
-      // registers x0-x7 and another for the 128-bit FP/SIMD registers v0-v7).
-      // We then need to propagate the argument shadow to both regions,
-      // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
-      // The remaining arguments get their shadow from 'va::stack'.
-      // One caveat is that only the unnamed arguments need to be propagated,
-      // whereas the call site instrumentation saved shadow for 'all' the
-      // arguments. So to copy the shadow values from the va_arg TLS array we
-      // need to adjust the offsets for both the GR and VR regions based on
-      // the __{gr,vr}_offs values (which are derived from the number of
-      // incoming named arguments).
-
- // Read the stack pointer from the va_list.
- Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0);
-
- // Read both the __gr_top and __gr_off and add them up.
- Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8);
- Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24);
-
- Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea);
-
- // Read both the __vr_top and __vr_off and add them up.
- Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16);
- Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28);
-
- Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea);
-
-      // We do not know how many named arguments were used, and at the call
-      // site shadow was saved for all the arguments. Since __gr_offs is
-      // defined as '0 - ((8 - named_gr) * 8)', the idea is to propagate only
-      // the variadic arguments by skipping the bytes of shadow that belong to
-      // the named ones.
- Value *GrRegSaveAreaShadowPtrOff =
- IRB.CreateAdd(GrArgSize, GrOffSaveArea);
-
- Value *GrRegSaveAreaShadowPtr =
- MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Align(8), /*isStore*/ true)
- .first;
-
- Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
- GrRegSaveAreaShadowPtrOff);
- Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
-
- IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8),
- GrCopySize);
-
- // Again, but for FP/SIMD values.
- Value *VrRegSaveAreaShadowPtrOff =
- IRB.CreateAdd(VrArgSize, VrOffSaveArea);
-
- Value *VrRegSaveAreaShadowPtr =
- MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Align(8), /*isStore*/ true)
- .first;
-
- Value *VrSrcPtr = IRB.CreateInBoundsGEP(
- IRB.getInt8Ty(),
- IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
- IRB.getInt32(AArch64VrBegOffset)),
- VrRegSaveAreaShadowPtrOff);
- Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
-
- IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, Align(8), VrSrcPtr, Align(8),
- VrCopySize);
-
- // And finally for remaining arguments.
- Value *StackSaveAreaShadowPtr =
- MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Align(16), /*isStore*/ true)
- .first;
-
- Value *StackSrcPtr =
- IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
- IRB.getInt32(AArch64VAEndOffset));
-
- IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr,
- Align(16), VAArgOverflowSize);
- }
- }
-};
-
-/// PowerPC64-specific implementation of VarArgHelper.
-struct VarArgPowerPC64Helper : public VarArgHelper {
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
-    // For PowerPC, we need to deal with the alignment of stack arguments:
-    // they are mostly aligned to 8 bytes, but vectors and i128 arrays are
-    // aligned to 16 bytes, and byvals can be aligned to 8 or 16 bytes.
+ VAArgOverflowSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ }
+
+ Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
+ Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize);
+
+ // Instrument va_start, copy va_list shadow from the backup copy of
+ // the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+
+ Value *VAListTag = OrigInst->getArgOperand(0);
+
+      // The variadic ABI for AArch64 creates two areas for saving the
+      // incoming argument registers (one for the 64-bit general-purpose
+      // registers x0-x7 and another for the 128-bit FP/SIMD registers v0-v7).
+      // We then need to propagate the argument shadow to both regions,
+      // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
+      // The remaining arguments get their shadow from 'va::stack'.
+      // One caveat is that only the unnamed arguments need to be propagated,
+      // whereas the call site instrumentation saved shadow for 'all' the
+      // arguments. So to copy the shadow values from the va_arg TLS array we
+      // need to adjust the offsets for both the GR and VR regions based on
+      // the __{gr,vr}_offs values (which are derived from the number of
+      // incoming named arguments).
+
+ // Read the stack pointer from the va_list.
+ Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0);
+
+ // Read both the __gr_top and __gr_off and add them up.
+ Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8);
+ Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24);
+
+ Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea);
+
+ // Read both the __vr_top and __vr_off and add them up.
+ Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16);
+ Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28);
+
+ Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea);
+
+      // We do not know how many named arguments were used, and at the call
+      // site shadow was saved for all the arguments. Since __gr_offs is
+      // defined as '0 - ((8 - named_gr) * 8)', the idea is to propagate only
+      // the variadic arguments by skipping the bytes of shadow that belong to
+      // the named ones.
+ Value *GrRegSaveAreaShadowPtrOff =
+ IRB.CreateAdd(GrArgSize, GrOffSaveArea);
+
+ Value *GrRegSaveAreaShadowPtr =
+ MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Align(8), /*isStore*/ true)
+ .first;
+
+ Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ GrRegSaveAreaShadowPtrOff);
+ Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
+
+ IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8),
+ GrCopySize);
+
+ // Again, but for FP/SIMD values.
+ Value *VrRegSaveAreaShadowPtrOff =
+ IRB.CreateAdd(VrArgSize, VrOffSaveArea);
+
+ Value *VrRegSaveAreaShadowPtr =
+ MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Align(8), /*isStore*/ true)
+ .first;
+
+ Value *VrSrcPtr = IRB.CreateInBoundsGEP(
+ IRB.getInt8Ty(),
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ IRB.getInt32(AArch64VrBegOffset)),
+ VrRegSaveAreaShadowPtrOff);
+ Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
+
+ IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, Align(8), VrSrcPtr, Align(8),
+ VrCopySize);
+
+ // And finally for remaining arguments.
+ Value *StackSaveAreaShadowPtr =
+ MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Align(16), /*isStore*/ true)
+ .first;
+
+ Value *StackSrcPtr =
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ IRB.getInt32(AArch64VAEndOffset));
+
+ IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr,
+ Align(16), VAArgOverflowSize);
+ }
+ }
+};
+
+/// PowerPC64-specific implementation of VarArgHelper.
+struct VarArgPowerPC64Helper : public VarArgHelper {
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+    // For PowerPC, we need to deal with the alignment of stack arguments:
+    // they are mostly aligned to 8 bytes, but vectors and i128 arrays are
+    // aligned to 16 bytes, and byvals can be aligned to 8 or 16 bytes.
// For that reason, we compute current offset from stack pointer (which is
// always properly aligned), and offset for the first vararg, then subtract
// them.
- unsigned VAArgBase;
- Triple TargetTriple(F.getParent()->getTargetTriple());
- // Parameter save area starts at 48 bytes from frame pointer for ABIv1,
- // and 32 bytes for ABIv2. This is usually determined by target
- // endianness, but in theory could be overridden by function attribute.
- if (TargetTriple.getArch() == Triple::ppc64)
- VAArgBase = 48;
- else
- VAArgBase = 32;
- unsigned VAArgOffset = VAArgBase;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
- if (IsByVal) {
- assert(A->getType()->isPointerTy());
- Type *RealTy = CB.getParamByValType(ArgNo);
- uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
- MaybeAlign ArgAlign = CB.getParamAlign(ArgNo);
- if (!ArgAlign || *ArgAlign < Align(8))
- ArgAlign = Align(8);
- VAArgOffset = alignTo(VAArgOffset, ArgAlign);
- if (!IsFixed) {
- Value *Base = getShadowPtrForVAArgument(
- RealTy, IRB, VAArgOffset - VAArgBase, ArgSize);
- if (Base) {
- Value *AShadowPtr, *AOriginPtr;
- std::tie(AShadowPtr, AOriginPtr) =
- MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(),
- kShadowTLSAlignment, /*isStore*/ false);
-
- IRB.CreateMemCpy(Base, kShadowTLSAlignment, AShadowPtr,
- kShadowTLSAlignment, ArgSize);
- }
- }
- VAArgOffset += alignTo(ArgSize, 8);
- } else {
- Value *Base;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- uint64_t ArgAlign = 8;
- if (A->getType()->isArrayTy()) {
- // Arrays are aligned to element size, except for long double
- // arrays, which are aligned to 8 bytes.
- Type *ElementTy = A->getType()->getArrayElementType();
- if (!ElementTy->isPPC_FP128Ty())
- ArgAlign = DL.getTypeAllocSize(ElementTy);
- } else if (A->getType()->isVectorTy()) {
- // Vectors are naturally aligned.
- ArgAlign = DL.getTypeAllocSize(A->getType());
- }
- if (ArgAlign < 8)
- ArgAlign = 8;
- VAArgOffset = alignTo(VAArgOffset, ArgAlign);
- if (DL.isBigEndian()) {
-          // Adjust the shadow for arguments with size < 8 to match the
-          // placement of bits in a big-endian system.
- if (ArgSize < 8)
- VAArgOffset += (8 - ArgSize);
- }
- if (!IsFixed) {
- Base = getShadowPtrForVAArgument(A->getType(), IRB,
- VAArgOffset - VAArgBase, ArgSize);
- if (Base)
- IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
- }
- VAArgOffset += ArgSize;
- VAArgOffset = alignTo(VAArgOffset, 8);
- }
- if (IsFixed)
- VAArgBase = VAArgOffset;
- }
-
- Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(),
- VAArgOffset - VAArgBase);
-    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
-    // new class member, i.e. it holds the total size of all varargs.
- IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg");
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- IRBuilder<> IRB(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- // Unpoison the whole __va_list_tag.
- // FIXME: magic ABI constants.
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
+ unsigned VAArgBase;
+ Triple TargetTriple(F.getParent()->getTargetTriple());
+ // Parameter save area starts at 48 bytes from frame pointer for ABIv1,
+ // and 32 bytes for ABIv2. This is usually determined by target
+ // endianness, but in theory could be overridden by function attribute.
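+    // (In practice, big-endian ppc64 uses ELFv1, where the parameter save
+    // area starts at offset 48, and little-endian ppc64le uses ELFv2, where
+    // it starts at offset 32; the Triple check below relies on that.)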
+ if (TargetTriple.getArch() == Triple::ppc64)
+ VAArgBase = 48;
+ else
+ VAArgBase = 32;
+ unsigned VAArgOffset = VAArgBase;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
+ if (IsByVal) {
+ assert(A->getType()->isPointerTy());
+ Type *RealTy = CB.getParamByValType(ArgNo);
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+ MaybeAlign ArgAlign = CB.getParamAlign(ArgNo);
+ if (!ArgAlign || *ArgAlign < Align(8))
+ ArgAlign = Align(8);
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (!IsFixed) {
+ Value *Base = getShadowPtrForVAArgument(
+ RealTy, IRB, VAArgOffset - VAArgBase, ArgSize);
+ if (Base) {
+ Value *AShadowPtr, *AOriginPtr;
+ std::tie(AShadowPtr, AOriginPtr) =
+ MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(),
+ kShadowTLSAlignment, /*isStore*/ false);
+
+ IRB.CreateMemCpy(Base, kShadowTLSAlignment, AShadowPtr,
+ kShadowTLSAlignment, ArgSize);
+ }
+ }
+ VAArgOffset += alignTo(ArgSize, 8);
+ } else {
+ Value *Base;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ uint64_t ArgAlign = 8;
+ if (A->getType()->isArrayTy()) {
+ // Arrays are aligned to element size, except for long double
+ // arrays, which are aligned to 8 bytes.
+ Type *ElementTy = A->getType()->getArrayElementType();
+ if (!ElementTy->isPPC_FP128Ty())
+ ArgAlign = DL.getTypeAllocSize(ElementTy);
+ } else if (A->getType()->isVectorTy()) {
+ // Vectors are naturally aligned.
+ ArgAlign = DL.getTypeAllocSize(A->getType());
+ }
+ if (ArgAlign < 8)
+ ArgAlign = 8;
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (DL.isBigEndian()) {
+          // Adjust the shadow for arguments with size < 8 to match the
+          // placement of bits in a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
+ if (!IsFixed) {
+ Base = getShadowPtrForVAArgument(A->getType(), IRB,
+ VAArgOffset - VAArgBase, ArgSize);
+ if (Base)
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ VAArgOffset += ArgSize;
+ VAArgOffset = alignTo(VAArgOffset, 8);
+ }
+ if (IsFixed)
+ VAArgBase = VAArgOffset;
+ }
+
+ Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(),
+ VAArgOffset - VAArgBase);
+    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
+    // new class member, i.e. it holds the total size of all varargs.
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
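+  // The PPC64 ELF va_list is assumed to be, like on MIPS64, a single pointer
+  // into the parameter save area; hence only 8 bytes are unpoisoned below and
+  // finalizeInstrumentation() reads the save-area pointer at offset 0.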
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
- VAArgSize);
-
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr =
- IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- CopySize);
- }
- }
-};
-
-/// SystemZ-specific implementation of VarArgHelper.
-struct VarArgSystemZHelper : public VarArgHelper {
- static const unsigned SystemZGpOffset = 16;
- static const unsigned SystemZGpEndOffset = 56;
- static const unsigned SystemZFpOffset = 128;
- static const unsigned SystemZFpEndOffset = 160;
- static const unsigned SystemZMaxVrArgs = 8;
- static const unsigned SystemZRegSaveAreaSize = 160;
- static const unsigned SystemZOverflowOffset = 160;
- static const unsigned SystemZVAListTagSize = 32;
- static const unsigned SystemZOverflowArgAreaPtrOffset = 16;
- static const unsigned SystemZRegSaveAreaPtrOffset = 24;
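-  // These constants are assumed to match the s390x ELF ABI: the va_list tag
-  // is 32 bytes,
-  //   long __gpr;                 // offset 0
-  //   long __fpr;                 // offset 8
-  //   void *__overflow_arg_area;  // offset 16
-  //   void *__reg_save_area;      // offset 24
-  // and within the 160-byte register save area the argument GPRs (r2-r6)
-  // occupy offsets [16, 56) and the argument FPRs (f0, f2, f4, f6) occupy
-  // [128, 160).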
-
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgTLSOriginCopy = nullptr;
- Value *VAArgOverflowSize = nullptr;
-
- SmallVector<CallInst *, 16> VAStartInstrumentationList;
-
- enum class ArgKind {
- GeneralPurpose,
- FloatingPoint,
- Vector,
- Memory,
- Indirect,
- };
-
- enum class ShadowExtension { None, Zero, Sign };
-
- VarArgSystemZHelper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV)
- : F(F), MS(MS), MSV(MSV) {}
-
- ArgKind classifyArgument(Type *T, bool IsSoftFloatABI) {
- // T is a SystemZABIInfo::classifyArgumentType() output, and there are
- // only a few possibilities of what it can be. In particular, enums, single
- // element structs and large types have already been taken care of.
-
- // Some i128 and fp128 arguments are converted to pointers only in the
- // back end.
- if (T->isIntegerTy(128) || T->isFP128Ty())
- return ArgKind::Indirect;
- if (T->isFloatingPointTy())
- return IsSoftFloatABI ? ArgKind::GeneralPurpose : ArgKind::FloatingPoint;
- if (T->isIntegerTy() || T->isPointerTy())
- return ArgKind::GeneralPurpose;
- if (T->isVectorTy())
- return ArgKind::Vector;
- return ArgKind::Memory;
- }
-
- ShadowExtension getShadowExtension(const CallBase &CB, unsigned ArgNo) {
- // ABI says: "One of the simple integer types no more than 64 bits wide.
- // ... If such an argument is shorter than 64 bits, replace it by a full
- // 64-bit integer representing the same number, using sign or zero
- // extension". Shadow for an integer argument has the same type as the
- // argument itself, so it can be sign or zero extended as well.
- bool ZExt = CB.paramHasAttr(ArgNo, Attribute::ZExt);
- bool SExt = CB.paramHasAttr(ArgNo, Attribute::SExt);
- if (ZExt) {
- assert(!SExt);
- return ShadowExtension::Zero;
- }
- if (SExt) {
- assert(!ZExt);
- return ShadowExtension::Sign;
- }
- return ShadowExtension::None;
- }
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- bool IsSoftFloatABI = CB.getCalledFunction()
- ->getFnAttribute("use-soft-float")
- .getValueAsString() == "true";
- unsigned GpOffset = SystemZGpOffset;
- unsigned FpOffset = SystemZFpOffset;
- unsigned VrIndex = 0;
- unsigned OverflowOffset = SystemZOverflowOffset;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- // SystemZABIInfo does not produce ByVal parameters.
- assert(!CB.paramHasAttr(ArgNo, Attribute::ByVal));
- Type *T = A->getType();
- ArgKind AK = classifyArgument(T, IsSoftFloatABI);
- if (AK == ArgKind::Indirect) {
- T = PointerType::get(T, 0);
- AK = ArgKind::GeneralPurpose;
- }
- if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset)
- AK = ArgKind::Memory;
- if (AK == ArgKind::FloatingPoint && FpOffset >= SystemZFpEndOffset)
- AK = ArgKind::Memory;
- if (AK == ArgKind::Vector && (VrIndex >= SystemZMaxVrArgs || !IsFixed))
- AK = ArgKind::Memory;
- Value *ShadowBase = nullptr;
- Value *OriginBase = nullptr;
- ShadowExtension SE = ShadowExtension::None;
- switch (AK) {
- case ArgKind::GeneralPurpose: {
- // Always keep track of GpOffset, but store shadow only for varargs.
- uint64_t ArgSize = 8;
- if (GpOffset + ArgSize <= kParamTLSSize) {
- if (!IsFixed) {
- SE = getShadowExtension(CB, ArgNo);
- uint64_t GapSize = 0;
- if (SE == ShadowExtension::None) {
- uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
- assert(ArgAllocSize <= ArgSize);
- GapSize = ArgSize - ArgAllocSize;
- }
- ShadowBase = getShadowAddrForVAArgument(IRB, GpOffset + GapSize);
- if (MS.TrackOrigins)
- OriginBase = getOriginPtrForVAArgument(IRB, GpOffset + GapSize);
- }
- GpOffset += ArgSize;
- } else {
- GpOffset = kParamTLSSize;
- }
- break;
- }
- case ArgKind::FloatingPoint: {
- // Always keep track of FpOffset, but store shadow only for varargs.
- uint64_t ArgSize = 8;
- if (FpOffset + ArgSize <= kParamTLSSize) {
- if (!IsFixed) {
- // PoP says: "A short floating-point datum requires only the
- // left-most 32 bit positions of a floating-point register".
- // Therefore, in contrast to AK_GeneralPurpose and AK_Memory,
- // don't extend shadow and don't mind the gap.
- ShadowBase = getShadowAddrForVAArgument(IRB, FpOffset);
- if (MS.TrackOrigins)
- OriginBase = getOriginPtrForVAArgument(IRB, FpOffset);
- }
- FpOffset += ArgSize;
- } else {
- FpOffset = kParamTLSSize;
- }
- break;
- }
- case ArgKind::Vector: {
- // Keep track of VrIndex. No need to store shadow, since vector varargs
- // go through AK_Memory.
- assert(IsFixed);
- VrIndex++;
- break;
- }
- case ArgKind::Memory: {
- // Keep track of OverflowOffset and store shadow only for varargs.
- // Ignore fixed args, since we need to copy only the vararg portion of
- // the overflow area shadow.
- if (!IsFixed) {
- uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
- uint64_t ArgSize = alignTo(ArgAllocSize, 8);
- if (OverflowOffset + ArgSize <= kParamTLSSize) {
- SE = getShadowExtension(CB, ArgNo);
- uint64_t GapSize =
- SE == ShadowExtension::None ? ArgSize - ArgAllocSize : 0;
- ShadowBase =
- getShadowAddrForVAArgument(IRB, OverflowOffset + GapSize);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(IRB, OverflowOffset + GapSize);
- OverflowOffset += ArgSize;
- } else {
- OverflowOffset = kParamTLSSize;
- }
- }
- break;
- }
- case ArgKind::Indirect:
- llvm_unreachable("Indirect must be converted to GeneralPurpose");
- }
- if (ShadowBase == nullptr)
- continue;
- Value *Shadow = MSV.getShadow(A);
- if (SE != ShadowExtension::None)
- Shadow = MSV.CreateShadowCast(IRB, Shadow, IRB.getInt64Ty(),
- /*Signed*/ SE == ShadowExtension::Sign);
- ShadowBase = IRB.CreateIntToPtr(
- ShadowBase, PointerType::get(Shadow->getType(), 0), "_msarg_va_s");
- IRB.CreateStore(Shadow, ShadowBase);
- if (MS.TrackOrigins) {
- Value *Origin = MSV.getOrigin(A);
- unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
- MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
- kMinOriginAlignment);
- }
- }
- Constant *OverflowSize = ConstantInt::get(
- IRB.getInt64Ty(), OverflowOffset - SystemZOverflowOffset);
- IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
- }
-
- Value *getShadowAddrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- return IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- }
-
- Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
- "_msarg_va_o");
- }
-
- void unpoisonVAListTagForInst(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- SystemZVAListTagSize, Alignment, false);
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- VAStartInstrumentationList.push_back(&I);
- unpoisonVAListTagForInst(I);
- }
-
- void visitVACopyInst(VACopyInst &I) override { unpoisonVAListTagForInst(I); }
-
- void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) {
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(
- IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, SystemZRegSaveAreaPtrOffset)),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ true);
- // TODO(iii): copy only fragments filled by visitCallBase()
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- SystemZRegSaveAreaSize);
- if (MS.TrackOrigins)
- IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
- Alignment, SystemZRegSaveAreaSize);
- }
-
- void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) {
- Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(
- IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, SystemZOverflowArgAreaPtrOffset)),
- PointerType::get(OverflowArgAreaPtrTy, 0));
- Value *OverflowArgAreaPtr =
- IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
- Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
- MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
- SystemZOverflowOffset);
- IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- if (MS.TrackOrigins) {
- SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
- SystemZOverflowOffset);
- IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- }
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgOverflowSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
+ VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+ VAArgSize);
+
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr =
+ IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
+ }
+ }
+};
+
+/// SystemZ-specific implementation of VarArgHelper.
+struct VarArgSystemZHelper : public VarArgHelper {
+ static const unsigned SystemZGpOffset = 16;
+ static const unsigned SystemZGpEndOffset = 56;
+ static const unsigned SystemZFpOffset = 128;
+ static const unsigned SystemZFpEndOffset = 160;
+ static const unsigned SystemZMaxVrArgs = 8;
+ static const unsigned SystemZRegSaveAreaSize = 160;
+ static const unsigned SystemZOverflowOffset = 160;
+ static const unsigned SystemZVAListTagSize = 32;
+ static const unsigned SystemZOverflowArgAreaPtrOffset = 16;
+ static const unsigned SystemZRegSaveAreaPtrOffset = 24;
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgTLSOriginCopy = nullptr;
+ Value *VAArgOverflowSize = nullptr;
+
+ SmallVector<CallInst *, 16> VAStartInstrumentationList;
+
+ enum class ArgKind {
+ GeneralPurpose,
+ FloatingPoint,
+ Vector,
+ Memory,
+ Indirect,
+ };
+
+ enum class ShadowExtension { None, Zero, Sign };
+
+ VarArgSystemZHelper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV) {}
+
+ ArgKind classifyArgument(Type *T, bool IsSoftFloatABI) {
+ // T is a SystemZABIInfo::classifyArgumentType() output, and there are
+ // only a few possibilities of what it can be. In particular, enums, single
+ // element structs and large types have already been taken care of.
+
+ // Some i128 and fp128 arguments are converted to pointers only in the
+ // back end.
+ if (T->isIntegerTy(128) || T->isFP128Ty())
+ return ArgKind::Indirect;
+ if (T->isFloatingPointTy())
+ return IsSoftFloatABI ? ArgKind::GeneralPurpose : ArgKind::FloatingPoint;
+ if (T->isIntegerTy() || T->isPointerTy())
+ return ArgKind::GeneralPurpose;
+ if (T->isVectorTy())
+ return ArgKind::Vector;
+ return ArgKind::Memory;
+ }
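
As a rough illustration of the classification order above, the following standalone sketch mirrors the same rules on plain boolean flags instead of llvm::Type queries; the helper name and flag parameters are made up for this example and are not part of MemorySanitizer.

#include <cstdio>

enum class SketchKind { GeneralPurpose, FloatingPoint, Vector, Memory, Indirect };

// Mirrors classifyArgument(): i128/fp128 -> Indirect, floats -> FPR (or GPR
// under soft-float), ints/pointers -> GPR, vectors -> VR, the rest -> memory.
static SketchKind classifySketch(bool Is128Bit, bool IsFloat, bool IsIntOrPtr,
                                 bool IsVector, bool IsSoftFloatABI) {
  if (Is128Bit)
    return SketchKind::Indirect;
  if (IsFloat)
    return IsSoftFloatABI ? SketchKind::GeneralPurpose
                          : SketchKind::FloatingPoint;
  if (IsIntOrPtr)
    return SketchKind::GeneralPurpose;
  if (IsVector)
    return SketchKind::Vector;
  return SketchKind::Memory;
}

int main() {
  // A double is a FloatingPoint argument under the hard-float ABI...
  std::printf("%d\n", (int)classifySketch(false, true, false, false, false));
  // ...but becomes a GeneralPurpose argument when use-soft-float is set.
  std::printf("%d\n", (int)classifySketch(false, true, false, false, true));
}
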
+
+ ShadowExtension getShadowExtension(const CallBase &CB, unsigned ArgNo) {
+ // ABI says: "One of the simple integer types no more than 64 bits wide.
+ // ... If such an argument is shorter than 64 bits, replace it by a full
+ // 64-bit integer representing the same number, using sign or zero
+ // extension". Shadow for an integer argument has the same type as the
+ // argument itself, so it can be sign or zero extended as well.
+ bool ZExt = CB.paramHasAttr(ArgNo, Attribute::ZExt);
+ bool SExt = CB.paramHasAttr(ArgNo, Attribute::SExt);
+ if (ZExt) {
+ assert(!SExt);
+ return ShadowExtension::Zero;
+ }
+ if (SExt) {
+ assert(!ZExt);
+ return ShadowExtension::Sign;
+ }
+ return ShadowExtension::None;
+ }
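
A minimal sketch of why the extension kind matters for shadow propagation: widening a 32-bit shadow word to 64 bits with sign extension keeps poison in the top bit "sticky", while zero extension marks the upper half as initialized. The shadow value below is arbitrary and only illustrates the bit-level effect.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Shadow32 = 0x80000001u; // hypothetical shadow: top and bottom bits poisoned
  uint64_t SignExt = (uint64_t)(int64_t)(int32_t)Shadow32; // ShadowExtension::Sign
  uint64_t ZeroExt = (uint64_t)Shadow32;                   // ShadowExtension::Zero
  std::printf("sign: %016llx\nzero: %016llx\n",
              (unsigned long long)SignExt, (unsigned long long)ZeroExt);
  return 0;
}
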
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ bool IsSoftFloatABI = CB.getCalledFunction()
+ ->getFnAttribute("use-soft-float")
+ .getValueAsString() == "true";
+ unsigned GpOffset = SystemZGpOffset;
+ unsigned FpOffset = SystemZFpOffset;
+ unsigned VrIndex = 0;
+ unsigned OverflowOffset = SystemZOverflowOffset;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ // SystemZABIInfo does not produce ByVal parameters.
+ assert(!CB.paramHasAttr(ArgNo, Attribute::ByVal));
+ Type *T = A->getType();
+ ArgKind AK = classifyArgument(T, IsSoftFloatABI);
+ if (AK == ArgKind::Indirect) {
+ T = PointerType::get(T, 0);
+ AK = ArgKind::GeneralPurpose;
+ }
+ if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset)
+ AK = ArgKind::Memory;
+ if (AK == ArgKind::FloatingPoint && FpOffset >= SystemZFpEndOffset)
+ AK = ArgKind::Memory;
+ if (AK == ArgKind::Vector && (VrIndex >= SystemZMaxVrArgs || !IsFixed))
+ AK = ArgKind::Memory;
+ Value *ShadowBase = nullptr;
+ Value *OriginBase = nullptr;
+ ShadowExtension SE = ShadowExtension::None;
+ switch (AK) {
+ case ArgKind::GeneralPurpose: {
+ // Always keep track of GpOffset, but store shadow only for varargs.
+ uint64_t ArgSize = 8;
+ if (GpOffset + ArgSize <= kParamTLSSize) {
+ if (!IsFixed) {
+ SE = getShadowExtension(CB, ArgNo);
+ uint64_t GapSize = 0;
+ if (SE == ShadowExtension::None) {
+ uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
+ assert(ArgAllocSize <= ArgSize);
+ GapSize = ArgSize - ArgAllocSize;
+ }
+ ShadowBase = getShadowAddrForVAArgument(IRB, GpOffset + GapSize);
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(IRB, GpOffset + GapSize);
+ }
+ GpOffset += ArgSize;
+ } else {
+ GpOffset = kParamTLSSize;
+ }
+ break;
+ }
+ case ArgKind::FloatingPoint: {
+ // Always keep track of FpOffset, but store shadow only for varargs.
+ uint64_t ArgSize = 8;
+ if (FpOffset + ArgSize <= kParamTLSSize) {
+ if (!IsFixed) {
+ // PoP says: "A short floating-point datum requires only the
+ // left-most 32 bit positions of a floating-point register".
+ // Therefore, in contrast to AK_GeneralPurpose and AK_Memory,
+ // don't extend shadow and don't mind the gap.
+ ShadowBase = getShadowAddrForVAArgument(IRB, FpOffset);
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(IRB, FpOffset);
+ }
+ FpOffset += ArgSize;
+ } else {
+ FpOffset = kParamTLSSize;
+ }
+ break;
+ }
+ case ArgKind::Vector: {
+ // Keep track of VrIndex. No need to store shadow, since vector varargs
+ // go through AK_Memory.
+ assert(IsFixed);
+ VrIndex++;
+ break;
+ }
+ case ArgKind::Memory: {
+ // Keep track of OverflowOffset and store shadow only for varargs.
+ // Ignore fixed args, since we need to copy only the vararg portion of
+ // the overflow area shadow.
+ if (!IsFixed) {
+ uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
+ uint64_t ArgSize = alignTo(ArgAllocSize, 8);
+ if (OverflowOffset + ArgSize <= kParamTLSSize) {
+ SE = getShadowExtension(CB, ArgNo);
+ uint64_t GapSize =
+ SE == ShadowExtension::None ? ArgSize - ArgAllocSize : 0;
+ ShadowBase =
+ getShadowAddrForVAArgument(IRB, OverflowOffset + GapSize);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(IRB, OverflowOffset + GapSize);
+ OverflowOffset += ArgSize;
+ } else {
+ OverflowOffset = kParamTLSSize;
+ }
+ }
+ break;
+ }
+ case ArgKind::Indirect:
+ llvm_unreachable("Indirect must be converted to GeneralPurpose");
+ }
+ if (ShadowBase == nullptr)
+ continue;
+ Value *Shadow = MSV.getShadow(A);
+ if (SE != ShadowExtension::None)
+ Shadow = MSV.CreateShadowCast(IRB, Shadow, IRB.getInt64Ty(),
+ /*Signed*/ SE == ShadowExtension::Sign);
+ ShadowBase = IRB.CreateIntToPtr(
+ ShadowBase, PointerType::get(Shadow->getType(), 0), "_msarg_va_s");
+ IRB.CreateStore(Shadow, ShadowBase);
+ if (MS.TrackOrigins) {
+ Value *Origin = MSV.getOrigin(A);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
+ kMinOriginAlignment);
+ }
+ }
+ Constant *OverflowSize = ConstantInt::get(
+ IRB.getInt64Ty(), OverflowOffset - SystemZOverflowOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
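
The loop above is essentially offset bookkeeping against the s390x register save area: general-purpose slots occupy offsets 16..56, floating-point slots 128..160, and everything else spills to the overflow area starting at offset 160. The standalone sketch below reproduces just that arithmetic for a hypothetical call with three integer varargs and one double vararg; it emits no IR.

#include <cstdio>

int main() {
  unsigned GpOffset = 16, FpOffset = 128, OverflowOffset = 160;
  for (int i = 0; i < 3; ++i) {        // three 8-byte integer varargs
    if (GpOffset + 8 <= 56)            // still a free GPR slot (r2..r6)
      GpOffset += 8;
    else
      OverflowOffset += 8;             // would be reclassified as Memory
  }
  if (FpOffset + 8 <= 160)             // one double vararg, FPR slot still free
    FpOffset += 8;
  // Prints "Gp=40 Fp=136 Overflow=160": everything fit into registers, so the
  // overflow size stored to VAArgOverflowSizeTLS would be 160 - 160 = 0.
  std::printf("Gp=%u Fp=%u Overflow=%u\n", GpOffset, FpOffset, OverflowOffset);
}
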
+
+ Value *getShadowAddrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ return IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ }
+
+ Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_va_o");
+ }
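
Both helpers reduce to "TLS base plus per-argument byte offset". A tiny sketch of the same arithmetic over an ordinary byte buffer standing in for __msan_va_arg_tls; the buffer size matches MSan's kParamTLSSize, but the offset and shadow value are arbitrary.

#include <cstdint>
#include <cstring>

int main() {
  unsigned char VAArgTLS[800] = {};      // stand-in for the per-thread TLS array
  unsigned ArgOffset = 24;               // e.g. the second GPR vararg slot
  uint64_t Shadow = ~0ull;               // a fully poisoned 64-bit value
  std::memcpy(VAArgTLS + ArgOffset, &Shadow, sizeof(Shadow)); // store to base+offset
  return VAArgTLS[ArgOffset] == 0xff ? 0 : 1;
}
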
+
+ void unpoisonVAListTagForInst(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ SystemZVAListTagSize, Alignment, false);
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ VAStartInstrumentationList.push_back(&I);
+ unpoisonVAListTagForInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override { unpoisonVAListTagForInst(I); }
+
+ void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) {
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(
+ IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, SystemZRegSaveAreaPtrOffset)),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+ // TODO(iii): copy only fragments filled by visitCallBase()
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ SystemZRegSaveAreaSize);
+ if (MS.TrackOrigins)
+ IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
+ Alignment, SystemZRegSaveAreaSize);
+ }
+
+ void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) {
+ Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(
+ IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, SystemZOverflowArgAreaPtrOffset)),
+ PointerType::get(OverflowArgAreaPtrTy, 0));
+ Value *OverflowArgAreaPtr =
+ IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
+ Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
+ MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+ SystemZOverflowOffset);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ if (MS.TrackOrigins) {
+ SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
+ SystemZOverflowOffset);
+ IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ }
+ }
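
The two copy routines read the overflow_arg_area and reg_save_area pointers out of the va_list tag at byte offsets 16 and 24. A layout sketch inferred from those constants (written here as a plain struct for illustration, not taken from an ABI header):

#include <cstddef>

struct SketchSystemZVAList {
  long __gpr;                 // offset 0: GPR arguments already consumed
  long __fpr;                 // offset 8: FPR arguments already consumed
  void *__overflow_arg_area;  // offset 16 == SystemZOverflowArgAreaPtrOffset
  void *__reg_save_area;      // offset 24 == SystemZRegSaveAreaPtrOffset
};

static_assert(offsetof(SketchSystemZVAList, __overflow_arg_area) == 16, "layout sketch");
static_assert(offsetof(SketchSystemZVAList, __reg_save_area) == 24, "layout sketch");
static_assert(sizeof(SketchSystemZVAList) == 32, "matches SystemZVAListTagSize");
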
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgOverflowSize =
- IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize =
- IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, SystemZOverflowOffset),
- VAArgOverflowSize);
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- if (MS.TrackOrigins) {
- VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
- Align(8), CopySize);
- }
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t VaStartNo = 0, VaStartNum = VAStartInstrumentationList.size();
- VaStartNo < VaStartNum; VaStartNo++) {
- CallInst *OrigInst = VAStartInstrumentationList[VaStartNo];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
- copyRegSaveArea(IRB, VAListTag);
- copyOverflowArea(IRB, VAListTag);
- }
- }
-};
-
-/// A no-op implementation of VarArgHelper.
-struct VarArgNoOpHelper : public VarArgHelper {
- VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) {}
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {}
-
- void visitVAStartInst(VAStartInst &I) override {}
-
- void visitVACopyInst(VACopyInst &I) override {}
-
- void finalizeInstrumentation() override {}
-};
-
-} // end anonymous namespace
-
-static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
- MemorySanitizerVisitor &Visitor) {
-  // VarArg handling is implemented only for the targets listed below (AMD64,
-  // MIPS64, AArch64, PowerPC64 and SystemZ). On other platforms the no-op
-  // helper is used, so false positives are possible.
- Triple TargetTriple(Func.getParent()->getTargetTriple());
- if (TargetTriple.getArch() == Triple::x86_64)
- return new VarArgAMD64Helper(Func, Msan, Visitor);
- else if (TargetTriple.isMIPS64())
- return new VarArgMIPS64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::aarch64)
- return new VarArgAArch64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::ppc64 ||
- TargetTriple.getArch() == Triple::ppc64le)
- return new VarArgPowerPC64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::systemz)
- return new VarArgSystemZHelper(Func, Msan, Visitor);
- else
- return new VarArgNoOpHelper(Func, Msan, Visitor);
-}
-
-bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
- if (!CompileKernel && F.getName() == kMsanModuleCtorName)
- return false;
-
- MemorySanitizerVisitor Visitor(F, *this, TLI);
-
- // Clear out readonly/readnone attributes.
- AttrBuilder B;
- B.addAttribute(Attribute::ReadOnly)
- .addAttribute(Attribute::ReadNone)
- .addAttribute(Attribute::WriteOnly)
- .addAttribute(Attribute::ArgMemOnly)
- .addAttribute(Attribute::Speculatable);
- F.removeAttributes(AttributeList::FunctionIndex, B);
-
- return Visitor.runOnFunction();
-}
+ VAArgOverflowSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, SystemZOverflowOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ if (MS.TrackOrigins) {
+ VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
+ Align(8), CopySize);
+ }
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t VaStartNo = 0, VaStartNum = VAStartInstrumentationList.size();
+ VaStartNo < VaStartNum; VaStartNo++) {
+ CallInst *OrigInst = VAStartInstrumentationList[VaStartNo];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ copyRegSaveArea(IRB, VAListTag);
+ copyOverflowArea(IRB, VAListTag);
+ }
+ }
+};
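
finalizeInstrumentation() sizes the backup copy as the fixed 160-byte register save area plus whatever overflow size the instrumented call sites recorded. A small sketch of that sizing; the overflow value is a made-up placeholder.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t RegSaveAreaBytes = 160;   // SystemZOverflowOffset / SystemZRegSaveAreaSize
  uint64_t VAArgOverflowSize = 24;         // hypothetical: three 8-byte spilled varargs
  uint64_t CopySize = RegSaveAreaBytes + VAArgOverflowSize;
  std::printf("va_arg_tls backup copy: %llu bytes\n", (unsigned long long)CopySize);
}
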
+
+/// A no-op implementation of VarArgHelper.
+struct VarArgNoOpHelper : public VarArgHelper {
+ VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) {}
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {}
+
+ void visitVAStartInst(VAStartInst &I) override {}
+
+ void visitVACopyInst(VACopyInst &I) override {}
+
+ void finalizeInstrumentation() override {}
+};
+
+} // end anonymous namespace
+
+static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor) {
+  // VarArg handling is implemented only for the targets listed below (AMD64,
+  // MIPS64, AArch64, PowerPC64 and SystemZ). On other platforms the no-op
+  // helper is used, so false positives are possible.
+ Triple TargetTriple(Func.getParent()->getTargetTriple());
+ if (TargetTriple.getArch() == Triple::x86_64)
+ return new VarArgAMD64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.isMIPS64())
+ return new VarArgMIPS64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == Triple::aarch64)
+ return new VarArgAArch64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == Triple::ppc64 ||
+ TargetTriple.getArch() == Triple::ppc64le)
+ return new VarArgPowerPC64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == Triple::systemz)
+ return new VarArgSystemZHelper(Func, Msan, Visitor);
+ else
+ return new VarArgNoOpHelper(Func, Msan, Visitor);
+}
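
Which helper a function gets depends only on the module's target triple. A hedged usage sketch of that dispatch using llvm::Triple directly; the triple string is an arbitrary example and the program is assumed to link against LLVMSupport.

#include "llvm/ADT/Triple.h"
#include <cstdio>

int main() {
  llvm::Triple T("s390x-ibm-linux");   // hypothetical module target triple
  const char *Helper = "VarArgNoOpHelper";
  if (T.getArch() == llvm::Triple::x86_64)
    Helper = "VarArgAMD64Helper";
  else if (T.getArch() == llvm::Triple::systemz)
    Helper = "VarArgSystemZHelper";
  std::printf("selected %s\n", Helper);
}
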
+
+bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
+ if (!CompileKernel && F.getName() == kMsanModuleCtorName)
+ return false;
+
+ MemorySanitizerVisitor Visitor(F, *this, TLI);
+
+ // Clear out readonly/readnone attributes.
+ AttrBuilder B;
+ B.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone)
+ .addAttribute(Attribute::WriteOnly)
+ .addAttribute(Attribute::ArgMemOnly)
+ .addAttribute(Attribute::Speculatable);
+ F.removeAttributes(AttributeList::FunctionIndex, B);
+
+ return Visitor.runOnFunction();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 002a03afad..be6c8c6310 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1,253 +1,253 @@
-//===- PGOInstrumentation.cpp - MST-based PGO Instrumentation -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements PGO instrumentation using a minimum spanning tree based
-// on the following paper:
-// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points
-// for program frequency counts. BIT Numerical Mathematics 1973, Volume 13,
-// Issue 3, pp 313-322
-// The idea of the algorithm is based on the fact that for each node (except for
-// the entry and exit), the sum of incoming edge counts equals the sum of
-// outgoing edge counts. The count of an edge on the spanning tree can be derived
-// from those edges not on the spanning tree. Knuth proves this method instruments
-// the minimum number of edges.
-//
-// The minimal spanning tree here is actually a maximum weight tree -- on-tree
-// edges have higher frequencies (more likely to execute). The idea is to
-// instrument those less frequently executed edges to reduce the runtime
-// overhead of instrumented binaries.
-//
-// This file contains two passes:
-// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge
-// count profile, and generates the instrumentation for indirect call
-// profiling.
-// (2) Pass PGOInstrumentationUse which reads the edge count profile and
-// annotates the branch weights. It also reads the indirect call value
-// profiling records and annotates the indirect call instructions.
-//
-// To get the precise counter information, these two passes need to be invoked
-// at the same compilation point (so they see the same IR). For pass
-// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
-// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc and
-// the profile is opened at module level and passed to each PGOUseFunc instance.
-// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put
-// in class FuncPGOInstrumentation.
-//
-// Class PGOEdge represents a CFG edge and some auxiliary information. Class
-// BBInfo contains auxiliary information for each BB. These two classes are used
-// in pass PGOInstrumentationGen. Class PGOUseEdge and UseBBInfo are the derived
-// class of PGOEdge and BBInfo, respectively. They contain extra data structures
-// used in populating profile counters.
-// The MST implementation is in Class CFGMST (CFGMST.h).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
-#include "CFGMST.h"
-#include "ValueProfileCollector.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/ProfileSummary.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/ProfileData/InstrProfReader.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/CRC.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/DOTGraphTraits.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GraphWriter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <numeric>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using ProfileCount = Function::ProfileCount;
-using VPCandidateInfo = ValueProfileCollector::CandidateInfo;
-
-#define DEBUG_TYPE "pgo-instrumentation"
-
-STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
-STATISTIC(NumOfPGOSelectInsts, "Number of select instruction instrumented.");
-STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
-STATISTIC(NumOfPGOEdge, "Number of edges.");
-STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
-STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
-STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
-STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile.");
-STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
-STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations.");
-STATISTIC(NumOfCSPGOInstrument, "Number of edges instrumented in CSPGO.");
-STATISTIC(NumOfCSPGOSelectInsts,
- "Number of select instruction instrumented in CSPGO.");
-STATISTIC(NumOfCSPGOMemIntrinsics,
- "Number of mem intrinsics instrumented in CSPGO.");
-STATISTIC(NumOfCSPGOEdge, "Number of edges in CSPGO.");
-STATISTIC(NumOfCSPGOBB, "Number of basic-blocks in CSPGO.");
-STATISTIC(NumOfCSPGOSplit, "Number of critical edge splits in CSPGO.");
-STATISTIC(NumOfCSPGOFunc,
- "Number of functions having valid profile counts in CSPGO.");
-STATISTIC(NumOfCSPGOMismatch,
- "Number of functions having mismatch profile in CSPGO.");
-STATISTIC(NumOfCSPGOMissing, "Number of functions without profile in CSPGO.");
-
-// Command line option to specify the file to read profile from. This is
-// mainly used for testing.
-static cl::opt<std::string>
- PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
- cl::value_desc("filename"),
-                        cl::desc("Specify the path of profile data file. This is "
- "mainly for test purpose."));
-static cl::opt<std::string> PGOTestProfileRemappingFile(
- "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
- cl::value_desc("filename"),
- cl::desc("Specify the path of profile remapping file. This is mainly for "
- "test purpose."));
-
-// Command line option to disable value profiling. The default is false:
-// i.e. value profiling is enabled by default. This is for debug purpose.
-static cl::opt<bool> DisableValueProfiling("disable-vp", cl::init(false),
- cl::Hidden,
- cl::desc("Disable Value Profiling"));
-
-// Command line option to set the maximum number of VP annotations to write to
-// the metadata for a single indirect call callsite.
-static cl::opt<unsigned> MaxNumAnnotations(
- "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of annotations for a single indirect "
- "call callsite"));
-
-// Command line option to set the maximum number of value annotations
-// to write to the metadata for a single memop intrinsic.
-static cl::opt<unsigned> MaxNumMemOPAnnotations(
- "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of preicise value annotations for a single memop"
- "intrinsic"));
-
-// Command line option to control appending FunctionHash to the name of a COMDAT
-// function. This is to avoid the hash mismatch caused by the preinliner.
-static cl::opt<bool> DoComdatRenaming(
- "do-comdat-renaming", cl::init(false), cl::Hidden,
- cl::desc("Append function hash to the name of COMDAT function to avoid "
- "function hash mismatch due to the preinliner"));
-
-// Command line option to enable/disable the warning about missing profile
-// information.
-static cl::opt<bool>
- PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
- cl::desc("Use this option to turn on/off "
- "warnings about missing profile data for "
- "functions."));
-
-// Command line option to enable/disable the warning about a hash mismatch in
-// the profile data.
-static cl::opt<bool>
- NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
- cl::desc("Use this option to turn off/on "
- "warnings about profile cfg mismatch."));
-
-// Command line option to enable/disable the warning about a hash mismatch in
-// the profile data for Comdat functions, which often turns out to be a false
-// positive due to pre-instrumentation inlining.
-static cl::opt<bool>
- NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
- cl::Hidden,
- cl::desc("The option is used to turn on/off "
- "warnings about hash mismatch for comdat "
- "functions."));
-
-// Command line option to enable/disable select instruction instrumentation.
-static cl::opt<bool>
- PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
- cl::desc("Use this option to turn on/off SELECT "
- "instruction instrumentation. "));
-
-// Command line option to turn on CFG dot or text dump of raw profile counts
-static cl::opt<PGOViewCountsType> PGOViewRawCounts(
- "pgo-view-raw-counts", cl::Hidden,
- cl::desc("A boolean option to show CFG dag or text "
- "with raw profile counts from "
- "profile data. See also option "
- "-pgo-view-counts. To limit graph "
- "display to only one function, use "
- "filtering option -view-bfi-func-name."),
- cl::values(clEnumValN(PGOVCT_None, "none", "do not show."),
- clEnumValN(PGOVCT_Graph, "graph", "show a graph."),
- clEnumValN(PGOVCT_Text, "text", "show in text.")));
-
-// Command line option to enable/disable memop intrinsic call.size profiling.
-static cl::opt<bool>
- PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
- cl::desc("Use this option to turn on/off "
- "memory intrinsic size profiling."));
-
-// Emit branch probability as optimization remarks.
-static cl::opt<bool>
- EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
- cl::desc("When this option is on, the annotated "
- "branch probability will be emitted as "
- "optimization remarks: -{Rpass|"
- "pass-remarks}=pgo-instrumentation"));
-
+//===- PGOInstrumentation.cpp - MST-based PGO Instrumentation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements PGO instrumentation using a minimum spanning tree based
+// on the following paper:
+// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points
+// for program frequency counts. BIT Numerical Mathematics 1973, Volume 13,
+// Issue 3, pp 313-322
+// The idea of the algorithm is based on the fact that for each node (except for
+// the entry and exit), the sum of incoming edge counts equals the sum of
+// outgoing edge counts. The count of an edge on the spanning tree can be derived
+// from those edges not on the spanning tree. Knuth proves this method instruments
+// the minimum number of edges.
+//
+// The minimal spanning tree here is actually a maximum weight tree -- on-tree
+// edges have higher frequencies (more likely to execute). The idea is to
+// instrument those less frequently executed edges to reduce the runtime
+// overhead of instrumented binaries.
+//
+// This file contains two passes:
+// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge
+// count profile, and generates the instrumentation for indirect call
+// profiling.
+// (2) Pass PGOInstrumentationUse which reads the edge count profile and
+// annotates the branch weights. It also reads the indirect call value
+// profiling records and annotates the indirect call instructions.
+//
+// To get the precise counter information, these two passes need to be invoked
+// at the same compilation point (so they see the same IR). For pass
+// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
+// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc and
+// the profile is opened at module level and passed to each PGOUseFunc instance.
+// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put
+// in class FuncPGOInstrumentation.
+//
+// Class PGOEdge represents a CFG edge and some auxiliary information. Class
+// BBInfo contains auxiliary information for each BB. These two classes are used
+// in pass PGOInstrumentationGen. Class PGOUseEdge and UseBBInfo are the derived
+// class of PGOEdge and BBInfo, respectively. They contain extra data structures
+// used in populating profile counters.
+// The MST implementation is in Class CFGMST (CFGMST.h).
+//
+//===----------------------------------------------------------------------===//
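
A worked example of the flow-conservation argument behind the MST approach, for a diamond CFG entry -> {A, B} -> exit: if the edge entry->A is placed on the spanning tree and left uninstrumented, its count is still recoverable because the flow into A must equal the flow out of A. The counter values below are made up.

#include <cstdio>

int main() {
  // Instrumented (off-tree) edges of the diamond.
  unsigned EntryToB = 10;
  unsigned AToExit = 90;
  unsigned BToExit = 10;
  // Edge entry->A is on the MST, so it carries no counter; derive it instead.
  unsigned EntryToA = AToExit;              // flow in(A) == flow out(A)
  unsigned EntryExecutions = EntryToA + EntryToB;
  std::printf("entry->A = %u, entry ran %u times\n", EntryToA, EntryExecutions);
  (void)BToExit;
}
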
+
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "CFGMST.h"
+#include "ValueProfileCollector.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CRC.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using ProfileCount = Function::ProfileCount;
+using VPCandidateInfo = ValueProfileCollector::CandidateInfo;
+
+#define DEBUG_TYPE "pgo-instrumentation"
+
+STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
+STATISTIC(NumOfPGOSelectInsts, "Number of select instruction instrumented.");
+STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
+STATISTIC(NumOfPGOEdge, "Number of edges.");
+STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
+STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
+STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
+STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile.");
+STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
+STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations.");
+STATISTIC(NumOfCSPGOInstrument, "Number of edges instrumented in CSPGO.");
+STATISTIC(NumOfCSPGOSelectInsts,
+ "Number of select instruction instrumented in CSPGO.");
+STATISTIC(NumOfCSPGOMemIntrinsics,
+ "Number of mem intrinsics instrumented in CSPGO.");
+STATISTIC(NumOfCSPGOEdge, "Number of edges in CSPGO.");
+STATISTIC(NumOfCSPGOBB, "Number of basic-blocks in CSPGO.");
+STATISTIC(NumOfCSPGOSplit, "Number of critical edge splits in CSPGO.");
+STATISTIC(NumOfCSPGOFunc,
+ "Number of functions having valid profile counts in CSPGO.");
+STATISTIC(NumOfCSPGOMismatch,
+ "Number of functions having mismatch profile in CSPGO.");
+STATISTIC(NumOfCSPGOMissing, "Number of functions without profile in CSPGO.");
+
+// Command line option to specify the file to read profile from. This is
+// mainly used for testing.
+static cl::opt<std::string>
+ PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
+ cl::value_desc("filename"),
+                        cl::desc("Specify the path of profile data file. This is "
+ "mainly for test purpose."));
+static cl::opt<std::string> PGOTestProfileRemappingFile(
+ "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
+ cl::value_desc("filename"),
+ cl::desc("Specify the path of profile remapping file. This is mainly for "
+ "test purpose."));
+
+// Command line option to disable value profiling. The default is false:
+// i.e. value profiling is enabled by default. This is for debug purpose.
+static cl::opt<bool> DisableValueProfiling("disable-vp", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable Value Profiling"));
+
+// Command line option to set the maximum number of VP annotations to write to
+// the metadata for a single indirect call callsite.
+static cl::opt<unsigned> MaxNumAnnotations(
+ "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of annotations for a single indirect "
+ "call callsite"));
+
+// Command line option to set the maximum number of value annotations
+// to write to the metadata for a single memop intrinsic.
+static cl::opt<unsigned> MaxNumMemOPAnnotations(
+ "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of preicise value annotations for a single memop"
+ "intrinsic"));
+
+// Command line option to control appending FunctionHash to the name of a COMDAT
+// function. This is to avoid the hash mismatch caused by the preinliner.
+static cl::opt<bool> DoComdatRenaming(
+ "do-comdat-renaming", cl::init(false), cl::Hidden,
+ cl::desc("Append function hash to the name of COMDAT function to avoid "
+ "function hash mismatch due to the preinliner"));
+
+// Command line option to enable/disable the warning about missing profile
+// information.
+static cl::opt<bool>
+ PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "warnings about missing profile data for "
+ "functions."));
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data.
+static cl::opt<bool>
+ NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on "
+ "warnings about profile cfg mismatch."));
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data for Comdat functions, which often turns out to be a false
+// positive due to pre-instrumentation inlining.
+static cl::opt<bool>
+ NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
+ cl::Hidden,
+ cl::desc("The option is used to turn on/off "
+ "warnings about hash mismatch for comdat "
+ "functions."));
+
+// Command line option to enable/disable select instruction instrumentation.
+static cl::opt<bool>
+ PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off SELECT "
+ "instruction instrumentation. "));
+
+// Command line option to turn on CFG dot or text dump of raw profile counts
+static cl::opt<PGOViewCountsType> PGOViewRawCounts(
+ "pgo-view-raw-counts", cl::Hidden,
+ cl::desc("A boolean option to show CFG dag or text "
+ "with raw profile counts from "
+ "profile data. See also option "
+ "-pgo-view-counts. To limit graph "
+ "display to only one function, use "
+ "filtering option -view-bfi-func-name."),
+ cl::values(clEnumValN(PGOVCT_None, "none", "do not show."),
+ clEnumValN(PGOVCT_Graph, "graph", "show a graph."),
+ clEnumValN(PGOVCT_Text, "text", "show in text.")));
+
+// Command line option to enable/disable memop intrinsic call.size profiling.
+static cl::opt<bool>
+ PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "memory intrinsic size profiling."));
+
+// Emit branch probability as optimization remarks.
+static cl::opt<bool>
+ EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
+ cl::desc("When this option is on, the annotated "
+ "branch probability will be emitted as "
+ "optimization remarks: -{Rpass|"
+ "pass-remarks}=pgo-instrumentation"));
+
static cl::opt<bool> PGOInstrumentEntry(
"pgo-instrument-entry", cl::init(false), cl::Hidden,
cl::desc("Force to instrument function entry basicblock."));
@@ -280,394 +280,394 @@ static cl::opt<unsigned> PGOVerifyBFICutoff(
cl::desc("Set the threshold for pgo-verify-bfi -- skip the counts whose "
"profile count value is below."));
-// Command line option to turn on CFG dot dump after profile annotation.
-// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
-extern cl::opt<PGOViewCountsType> PGOViewCounts;
-
-// Command line option to specify the name of the function for CFG dump
-// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
-extern cl::opt<std::string> ViewBlockFreqFuncName;
-
+// Command line option to turn on CFG dot dump after profile annotation.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
+extern cl::opt<PGOViewCountsType> PGOViewCounts;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
static cl::opt<bool>
PGOOldCFGHashing("pgo-instr-old-cfg-hashing", cl::init(false), cl::Hidden,
cl::desc("Use the old CFG function hashing"));
-// Return a string describing the branch condition that can be
-// used in static branch probability heuristics:
-static std::string getBranchCondString(Instruction *TI) {
- BranchInst *BI = dyn_cast<BranchInst>(TI);
- if (!BI || !BI->isConditional())
- return std::string();
-
- Value *Cond = BI->getCondition();
- ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
- if (!CI)
- return std::string();
-
- std::string result;
- raw_string_ostream OS(result);
- OS << CmpInst::getPredicateName(CI->getPredicate()) << "_";
- CI->getOperand(0)->getType()->print(OS, true);
-
- Value *RHS = CI->getOperand(1);
- ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
- if (CV) {
- if (CV->isZero())
- OS << "_Zero";
- else if (CV->isOne())
- OS << "_One";
- else if (CV->isMinusOne())
- OS << "_MinusOne";
- else
- OS << "_Const";
- }
- OS.flush();
- return result;
-}
-
-static const char *ValueProfKindDescr[] = {
-#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr,
-#include "llvm/ProfileData/InstrProfData.inc"
-};
-
-namespace {
-
-/// The select instruction visitor plays three roles specified
-/// by the mode. In \c VM_counting mode, it simply counts the number of
-/// select instructions. In \c VM_instrument mode, it inserts code to count
-/// the number of times the TrueValue of a select is taken. In \c VM_annotate
-/// mode, it reads the profile data and annotates the select instruction with
-/// metadata.
-enum VisitMode { VM_counting, VM_instrument, VM_annotate };
-class PGOUseFunc;
-
-/// Instruction Visitor class to visit select instructions.
-struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
- Function &F;
- unsigned NSIs = 0; // Number of select instructions instrumented.
- VisitMode Mode = VM_counting; // Visiting mode.
- unsigned *CurCtrIdx = nullptr; // Pointer to current counter index.
- unsigned TotalNumCtrs = 0; // Total number of counters
- GlobalVariable *FuncNameVar = nullptr;
- uint64_t FuncHash = 0;
- PGOUseFunc *UseFunc = nullptr;
-
- SelectInstVisitor(Function &Func) : F(Func) {}
-
- void countSelects(Function &Func) {
- NSIs = 0;
- Mode = VM_counting;
- visit(Func);
- }
-
- // Visit the IR stream and instrument all select instructions. \p
- // Ind is a pointer to the counter index variable; \p TotalNC
- // is the total number of counters; \p FNV is the pointer to the
- // PGO function name var; \p FHash is the function hash.
- void instrumentSelects(Function &Func, unsigned *Ind, unsigned TotalNC,
- GlobalVariable *FNV, uint64_t FHash) {
- Mode = VM_instrument;
- CurCtrIdx = Ind;
- TotalNumCtrs = TotalNC;
- FuncHash = FHash;
- FuncNameVar = FNV;
- visit(Func);
- }
-
- // Visit the IR stream and annotate all select instructions.
- void annotateSelects(Function &Func, PGOUseFunc *UF, unsigned *Ind) {
- Mode = VM_annotate;
- UseFunc = UF;
- CurCtrIdx = Ind;
- visit(Func);
- }
-
- void instrumentOneSelectInst(SelectInst &SI);
- void annotateOneSelectInst(SelectInst &SI);
-
- // Visit \p SI instruction and perform tasks according to visit mode.
- void visitSelectInst(SelectInst &SI);
-
-  // Return the number of select instructions. This needs to be called after
- // countSelects().
- unsigned getNumOfSelectInsts() const { return NSIs; }
-};
-
-
-class PGOInstrumentationGenLegacyPass : public ModulePass {
-public:
- static char ID;
-
- PGOInstrumentationGenLegacyPass(bool IsCS = false)
- : ModulePass(ID), IsCS(IsCS) {
- initializePGOInstrumentationGenLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "PGOInstrumentationGenPass"; }
-
-private:
-  // Whether this is context-sensitive instrumentation.
- bool IsCS;
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-class PGOInstrumentationUseLegacyPass : public ModulePass {
-public:
- static char ID;
-
- // Provide the profile filename as the parameter.
- PGOInstrumentationUseLegacyPass(std::string Filename = "", bool IsCS = false)
- : ModulePass(ID), ProfileFileName(std::move(Filename)), IsCS(IsCS) {
- if (!PGOTestProfileFile.empty())
- ProfileFileName = PGOTestProfileFile;
- initializePGOInstrumentationUseLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "PGOInstrumentationUsePass"; }
-
-private:
- std::string ProfileFileName;
-  // Whether this is context-sensitive instrumentation use.
- bool IsCS;
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-class PGOInstrumentationGenCreateVarLegacyPass : public ModulePass {
-public:
- static char ID;
- StringRef getPassName() const override {
- return "PGOInstrumentationGenCreateVarPass";
- }
- PGOInstrumentationGenCreateVarLegacyPass(std::string CSInstrName = "")
- : ModulePass(ID), InstrProfileOutput(CSInstrName) {
- initializePGOInstrumentationGenCreateVarLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
-private:
- bool runOnModule(Module &M) override {
- createProfileFileNameVar(M, InstrProfileOutput);
+// Return a string describing the branch condition that can be
+// used in static branch probability heuristics:
+static std::string getBranchCondString(Instruction *TI) {
+ BranchInst *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ return std::string();
+
+ Value *Cond = BI->getCondition();
+ ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+ if (!CI)
+ return std::string();
+
+ std::string result;
+ raw_string_ostream OS(result);
+ OS << CmpInst::getPredicateName(CI->getPredicate()) << "_";
+ CI->getOperand(0)->getType()->print(OS, true);
+
+ Value *RHS = CI->getOperand(1);
+ ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
+ if (CV) {
+ if (CV->isZero())
+ OS << "_Zero";
+ else if (CV->isOne())
+ OS << "_One";
+ else if (CV->isMinusOne())
+ OS << "_MinusOne";
+ else
+ OS << "_Const";
+ }
+ OS.flush();
+ return result;
+}
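
For orientation, the string this produces for a branch on `x == 0` with a 32-bit operand would be along the lines of "eq_i32_Zero"; the snippet below assembles that name by hand, since the real helper is file-local and takes IR objects rather than plain strings.

#include <cstdio>
#include <string>

int main() {
  std::string Pred = "eq";       // CmpInst::getPredicateName(ICMP_EQ)
  std::string Ty = "i32";        // printed type of the compare's LHS
  std::string Suffix = "_Zero";  // RHS is the constant 0
  std::string Name = Pred + "_" + Ty + Suffix;
  std::printf("%s\n", Name.c_str());  // eq_i32_Zero
}
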
+
+static const char *ValueProfKindDescr[] = {
+#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+namespace {
+
+/// The select instruction visitor plays three roles specified
+/// by the mode. In \c VM_counting mode, it simply counts the number of
+/// select instructions. In \c VM_instrument mode, it inserts code to count
+/// the number of times the TrueValue of a select is taken. In \c VM_annotate
+/// mode, it reads the profile data and annotates the select instruction with
+/// metadata.
+enum VisitMode { VM_counting, VM_instrument, VM_annotate };
+class PGOUseFunc;
+
+/// Instruction Visitor class to visit select instructions.
+struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
+ Function &F;
+ unsigned NSIs = 0; // Number of select instructions instrumented.
+ VisitMode Mode = VM_counting; // Visiting mode.
+ unsigned *CurCtrIdx = nullptr; // Pointer to current counter index.
+ unsigned TotalNumCtrs = 0; // Total number of counters
+ GlobalVariable *FuncNameVar = nullptr;
+ uint64_t FuncHash = 0;
+ PGOUseFunc *UseFunc = nullptr;
+
+ SelectInstVisitor(Function &Func) : F(Func) {}
+
+ void countSelects(Function &Func) {
+ NSIs = 0;
+ Mode = VM_counting;
+ visit(Func);
+ }
+
+ // Visit the IR stream and instrument all select instructions. \p
+ // Ind is a pointer to the counter index variable; \p TotalNC
+ // is the total number of counters; \p FNV is the pointer to the
+ // PGO function name var; \p FHash is the function hash.
+ void instrumentSelects(Function &Func, unsigned *Ind, unsigned TotalNC,
+ GlobalVariable *FNV, uint64_t FHash) {
+ Mode = VM_instrument;
+ CurCtrIdx = Ind;
+ TotalNumCtrs = TotalNC;
+ FuncHash = FHash;
+ FuncNameVar = FNV;
+ visit(Func);
+ }
+
+ // Visit the IR stream and annotate all select instructions.
+ void annotateSelects(Function &Func, PGOUseFunc *UF, unsigned *Ind) {
+ Mode = VM_annotate;
+ UseFunc = UF;
+ CurCtrIdx = Ind;
+ visit(Func);
+ }
+
+ void instrumentOneSelectInst(SelectInst &SI);
+ void annotateOneSelectInst(SelectInst &SI);
+
+ // Visit \p SI instruction and perform tasks according to visit mode.
+ void visitSelectInst(SelectInst &SI);
+
+  // Return the number of select instructions. This needs to be called after
+ // countSelects().
+ unsigned getNumOfSelectInsts() const { return NSIs; }
+};
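
A sketch of how the three modes are driven in sequence by the surrounding passes. It assumes the context of this file (SelectInstVisitor is file-local and `using namespace llvm;` is in effect); the counter values are placeholders, and in the real passes this sequencing lives in FuncPGOInstrumentation and PGOUseFunc rather than in a free function.

// Hypothetical driver, placeholder counter bookkeeping only.
static void sketchDriveSelectVisitor(Function &F, GlobalVariable *FuncNameVar,
                                     uint64_t FuncHash) {
  SelectInstVisitor SIVisitor(F);
  SIVisitor.countSelects(F);                      // VM_counting pass
  unsigned NumSelects = SIVisitor.getNumOfSelectInsts();
  unsigned CurCtrIdx = 0;                         // placeholder starting index
  unsigned TotalNumCtrs = NumSelects + 1;         // placeholder counter total
  SIVisitor.instrumentSelects(F, &CurCtrIdx, TotalNumCtrs, FuncNameVar,
                              FuncHash);          // VM_instrument pass
}
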
+
+
+class PGOInstrumentationGenLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PGOInstrumentationGenLegacyPass(bool IsCS = false)
+ : ModulePass(ID), IsCS(IsCS) {
+ initializePGOInstrumentationGenLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOInstrumentationGenPass"; }
+
+private:
+  // Whether this is context-sensitive instrumentation.
+ bool IsCS;
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+class PGOInstrumentationUseLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ // Provide the profile filename as the parameter.
+ PGOInstrumentationUseLegacyPass(std::string Filename = "", bool IsCS = false)
+ : ModulePass(ID), ProfileFileName(std::move(Filename)), IsCS(IsCS) {
+ if (!PGOTestProfileFile.empty())
+ ProfileFileName = PGOTestProfileFile;
+ initializePGOInstrumentationUseLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOInstrumentationUsePass"; }
+
+private:
+ std::string ProfileFileName;
+  // Whether this is context-sensitive instrumentation use.
+ bool IsCS;
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+class PGOInstrumentationGenCreateVarLegacyPass : public ModulePass {
+public:
+ static char ID;
+ StringRef getPassName() const override {
+ return "PGOInstrumentationGenCreateVarPass";
+ }
+ PGOInstrumentationGenCreateVarLegacyPass(std::string CSInstrName = "")
+ : ModulePass(ID), InstrProfileOutput(CSInstrName) {
+ initializePGOInstrumentationGenCreateVarLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+private:
+ bool runOnModule(Module &M) override {
+ createProfileFileNameVar(M, InstrProfileOutput);
createIRLevelProfileFlagVar(M, /* IsCS */ true, PGOInstrumentEntry);
- return false;
- }
- std::string InstrProfileOutput;
-};
-
-} // end anonymous namespace
-
-char PGOInstrumentationGenLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
- "PGO instrumentation.", false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
- "PGO instrumentation.", false, false)
-
-ModulePass *llvm::createPGOInstrumentationGenLegacyPass(bool IsCS) {
- return new PGOInstrumentationGenLegacyPass(IsCS);
-}
-
-char PGOInstrumentationUseLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
- "Read PGO instrumentation profile.", false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
- "Read PGO instrumentation profile.", false, false)
-
-ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename,
- bool IsCS) {
- return new PGOInstrumentationUseLegacyPass(Filename.str(), IsCS);
-}
-
-char PGOInstrumentationGenCreateVarLegacyPass::ID = 0;
-
-INITIALIZE_PASS(PGOInstrumentationGenCreateVarLegacyPass,
- "pgo-instr-gen-create-var",
- "Create PGO instrumentation version variable for CSPGO.", false,
- false)
-
-ModulePass *
-llvm::createPGOInstrumentationGenCreateVarLegacyPass(StringRef CSInstrName) {
- return new PGOInstrumentationGenCreateVarLegacyPass(std::string(CSInstrName));
-}
-
-namespace {
-
-/// An MST based instrumentation for PGO
-///
-/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
-/// at the function level.
-struct PGOEdge {
- // This class implements the CFG edges. Note the CFG can be a multi-graph.
- // So there might be multiple edges with the same SrcBB and DestBB.
- const BasicBlock *SrcBB;
- const BasicBlock *DestBB;
- uint64_t Weight;
- bool InMST = false;
- bool Removed = false;
- bool IsCritical = false;
-
- PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
- : SrcBB(Src), DestBB(Dest), Weight(W) {}
-
- // Return the information string of an edge.
- const std::string infoString() const {
- return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") +
- (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str();
- }
-};
-
-// This class stores the auxiliary information for each BB.
-struct BBInfo {
- BBInfo *Group;
- uint32_t Index;
- uint32_t Rank = 0;
-
- BBInfo(unsigned IX) : Group(this), Index(IX) {}
-
- // Return the information string of this object.
- const std::string infoString() const {
- return (Twine("Index=") + Twine(Index)).str();
- }
-
- // Empty function -- only applicable to UseBBInfo.
- void addOutEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
-
- // Empty function -- only applicable to UseBBInfo.
- void addInEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
-};
-
-// This class implements the MST-based instrumentation for a single function.
-template <class Edge, class BBInfo> class FuncPGOInstrumentation {
-private:
- Function &F;
-
- // Is this context-sensitive instrumentation.
- bool IsCS;
-
- // A map that stores the Comdat group in function F.
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
-
- ValueProfileCollector VPC;
-
- void computeCFGHash();
- void renameComdatFunction();
-
-public:
- std::vector<std::vector<VPCandidateInfo>> ValueSites;
- SelectInstVisitor SIVisitor;
- std::string FuncName;
- GlobalVariable *FuncNameVar;
-
- // CFG hash value for this function.
- uint64_t FunctionHash = 0;
-
- // The Minimum Spanning Tree of function CFG.
- CFGMST<Edge, BBInfo> MST;
-
- // Collect all the BBs that will be instrumented, and store them in
- // InstrumentBBs.
- void getInstrumentBBs(std::vector<BasicBlock *> &InstrumentBBs);
-
- // Given an edge, find the BB that will be instrumented.
- // Return nullptr if there is no BB to be instrumented.
- BasicBlock *getInstrBB(Edge *E);
-
- // Return the auxiliary BB information.
- BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
-
- // Return the auxiliary BB information if available.
- BBInfo *findBBInfo(const BasicBlock *BB) const { return MST.findBBInfo(BB); }
-
- // Dump edges and BB information.
- void dumpInfo(std::string Str = "") const {
- MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
- Twine(FunctionHash) + "\t" + Str);
- }
-
- FuncPGOInstrumentation(
- Function &Func, TargetLibraryInfo &TLI,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
- bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
+ return false;
+ }
+ std::string InstrProfileOutput;
+};
+
+} // end anonymous namespace
+
+char PGOInstrumentationGenLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+ "PGO instrumentation.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+ "PGO instrumentation.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationGenLegacyPass(bool IsCS) {
+ return new PGOInstrumentationGenLegacyPass(IsCS);
+}
+
+char PGOInstrumentationUseLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+ "Read PGO instrumentation profile.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+ "Read PGO instrumentation profile.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename,
+ bool IsCS) {
+ return new PGOInstrumentationUseLegacyPass(Filename.str(), IsCS);
+}
+
+char PGOInstrumentationGenCreateVarLegacyPass::ID = 0;
+
+INITIALIZE_PASS(PGOInstrumentationGenCreateVarLegacyPass,
+ "pgo-instr-gen-create-var",
+ "Create PGO instrumentation version variable for CSPGO.", false,
+ false)
+
+ModulePass *
+llvm::createPGOInstrumentationGenCreateVarLegacyPass(StringRef CSInstrName) {
+ return new PGOInstrumentationGenCreateVarLegacyPass(std::string(CSInstrName));
+}
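+
+// Sketch of scheduling these legacy passes by hand (illustrative only; in
+// practice PassManagerBuilder adds them, and the profile path below is a
+// placeholder):
+//   llvm::legacy::PassManager PM;
+//   PM.add(llvm::createPGOInstrumentationGenLegacyPass(/*IsCS=*/false));
+//   // or, for the use phase:
+//   PM.add(llvm::createPGOInstrumentationUseLegacyPass("default.profdata",
+//                                                      /*IsCS=*/false));
+//   PM.run(M);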
+
+namespace {
+
+/// An MST based instrumentation for PGO
+///
+/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
+/// at the function level.
+struct PGOEdge {
+ // This class implements the CFG edges. Note the CFG can be a multi-graph.
+ // So there might be multiple edges with the same SrcBB and DestBB.
+ const BasicBlock *SrcBB;
+ const BasicBlock *DestBB;
+ uint64_t Weight;
+ bool InMST = false;
+ bool Removed = false;
+ bool IsCritical = false;
+
+ PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
+ : SrcBB(Src), DestBB(Dest), Weight(W) {}
+
+ // Return the information string of an edge.
+ const std::string infoString() const {
+ return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") +
+ (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str();
+ }
+};
+
+// This class stores the auxiliary information for each BB.
+struct BBInfo {
+ BBInfo *Group;
+ uint32_t Index;
+ uint32_t Rank = 0;
+
+ BBInfo(unsigned IX) : Group(this), Index(IX) {}
+
+ // Return the information string of this object.
+ const std::string infoString() const {
+ return (Twine("Index=") + Twine(Index)).str();
+ }
+
+ // Empty function -- only applicable to UseBBInfo.
+ void addOutEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
+
+ // Empty function -- only applicable to UseBBInfo.
+ void addInEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
+};
+
+// This class implements the MST-based instrumentation for a single function.
+template <class Edge, class BBInfo> class FuncPGOInstrumentation {
+private:
+ Function &F;
+
+ // Is this context-sensitive instrumentation.
+ bool IsCS;
+
+ // A map that stores the Comdat group in function F.
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
+
+ ValueProfileCollector VPC;
+
+ void computeCFGHash();
+ void renameComdatFunction();
+
+public:
+ std::vector<std::vector<VPCandidateInfo>> ValueSites;
+ SelectInstVisitor SIVisitor;
+ std::string FuncName;
+ GlobalVariable *FuncNameVar;
+
+ // CFG hash value for this function.
+ uint64_t FunctionHash = 0;
+
+ // The Minimum Spanning Tree of function CFG.
+ CFGMST<Edge, BBInfo> MST;
+
+ // Collect all the BBs that will be instrumented, and store them in
+ // InstrumentBBs.
+ void getInstrumentBBs(std::vector<BasicBlock *> &InstrumentBBs);
+
+ // Given an edge, find the BB that will be instrumented.
+ // Return nullptr if there is no BB to be instrumented.
+ BasicBlock *getInstrBB(Edge *E);
+
+ // Return the auxiliary BB information.
+ BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
+
+ // Return the auxiliary BB information if available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const { return MST.findBBInfo(BB); }
+
+ // Dump edges and BB information.
+ void dumpInfo(std::string Str = "") const {
+ MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
+ Twine(FunctionHash) + "\t" + Str);
+ }
+
+ FuncPGOInstrumentation(
+ Function &Func, TargetLibraryInfo &TLI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
BlockFrequencyInfo *BFI = nullptr, bool IsCS = false,
bool InstrumentFuncEntry = true)
- : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
+ : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
ValueSites(IPVK_Last + 1), SIVisitor(Func),
MST(F, InstrumentFuncEntry, BPI, BFI) {
- // This should be done before CFG hash computation.
- SIVisitor.countSelects(Func);
- ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize);
- if (!IsCS) {
- NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
- NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
- NumOfPGOBB += MST.BBInfos.size();
- ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget);
- } else {
- NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
- NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
- NumOfCSPGOBB += MST.BBInfos.size();
- }
-
- FuncName = getPGOFuncName(F);
- computeCFGHash();
- if (!ComdatMembers.empty())
- renameComdatFunction();
- LLVM_DEBUG(dumpInfo("after CFGMST"));
-
- for (auto &E : MST.AllEdges) {
- if (E->Removed)
- continue;
- IsCS ? NumOfCSPGOEdge++ : NumOfPGOEdge++;
- if (!E->InMST)
- IsCS ? NumOfCSPGOInstrument++ : NumOfPGOInstrument++;
- }
-
- if (CreateGlobalVar)
- FuncNameVar = createPGOFuncNameVar(F, FuncName);
- }
-};
-
-} // end anonymous namespace
-
-// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
+ // This should be done before CFG hash computation.
+ SIVisitor.countSelects(Func);
+ ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize);
+ if (!IsCS) {
+ NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+ NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
+ NumOfPGOBB += MST.BBInfos.size();
+ ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget);
+ } else {
+ NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+ NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
+ NumOfCSPGOBB += MST.BBInfos.size();
+ }
+
+ FuncName = getPGOFuncName(F);
+ computeCFGHash();
+ if (!ComdatMembers.empty())
+ renameComdatFunction();
+ LLVM_DEBUG(dumpInfo("after CFGMST"));
+
+ for (auto &E : MST.AllEdges) {
+ if (E->Removed)
+ continue;
+ IsCS ? NumOfCSPGOEdge++ : NumOfPGOEdge++;
+ if (!E->InMST)
+ IsCS ? NumOfCSPGOInstrument++ : NumOfPGOInstrument++;
+ }
+
+ if (CreateGlobalVar)
+ FuncNameVar = createPGOFuncNameVar(F, FuncName);
+ }
+};
+
+} // end anonymous namespace
+
+// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
// value of each BB in the CFG. The higher 32 bits are the CRC32 of the numbers
// of selects, indirect calls, mem ops and edges.
-template <class Edge, class BBInfo>
-void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
- std::vector<uint8_t> Indexes;
- JamCRC JC;
- for (auto &BB : F) {
- const Instruction *TI = BB.getTerminator();
- for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
- BasicBlock *Succ = TI->getSuccessor(I);
- auto BI = findBBInfo(Succ);
- if (BI == nullptr)
- continue;
- uint32_t Index = BI->Index;
- for (int J = 0; J < 4; J++)
- Indexes.push_back((uint8_t)(Index >> (J * 8)));
- }
- }
- JC.update(Indexes);
-
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
+ std::vector<uint8_t> Indexes;
+ JamCRC JC;
+ for (auto &BB : F) {
+ const Instruction *TI = BB.getTerminator();
+ for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
+ BasicBlock *Succ = TI->getSuccessor(I);
+ auto BI = findBBInfo(Succ);
+ if (BI == nullptr)
+ continue;
+ uint32_t Index = BI->Index;
+ for (int J = 0; J < 4; J++)
+ Indexes.push_back((uint8_t)(Index >> (J * 8)));
+ }
+ }
+ JC.update(Indexes);
+
JamCRC JCH;
if (PGOOldCFGHashing) {
// Hash format for context sensitive profile. Reserve 4 bits for other
@@ -693,956 +693,956 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC();
}
- // Reserve bits 60-63 for other information purposes.
- FunctionHash &= 0x0FFFFFFFFFFFFFFF;
- if (IsCS)
- NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
- LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
- << " CRC = " << JC.getCRC()
- << ", Selects = " << SIVisitor.getNumOfSelectInsts()
- << ", Edges = " << MST.AllEdges.size() << ", ICSites = "
+ // Reserve bits 60-63 for other information purposes.
+ FunctionHash &= 0x0FFFFFFFFFFFFFFF;
+ if (IsCS)
+ NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
+ LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
+ << " CRC = " << JC.getCRC()
+ << ", Selects = " << SIVisitor.getNumOfSelectInsts()
+ << ", Edges = " << MST.AllEdges.size() << ", ICSites = "
<< ValueSites[IPVK_IndirectCallTarget].size());
if (!PGOOldCFGHashing) {
LLVM_DEBUG(dbgs() << ", Memops = " << ValueSites[IPVK_MemOPSize].size()
<< ", High32 CRC = " << JCH.getCRC());
}
LLVM_DEBUG(dbgs() << ", Hash = " << FunctionHash << "\n";);
-}
-
-// Check if we can safely rename this Comdat function.
-static bool canRenameComdat(
- Function &F,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
- if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
- return false;
-
- // FIXME: Currently we only handle Comdat groups that contain only one
+}
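+
+// Worked example (illustrative values only): with the new hashing scheme, if
+// JCH.getCRC() == 0x12345 and JC.getCRC() == 0x678, then
+//   FunctionHash = (0x12345 << 28) + 0x678 = 0x123450000678,
+// and the 0x0FFFFFFFFFFFFFFF mask leaves it unchanged because bits 60-63 are
+// already zero; for CS instrumentation the CS flag is then set on top by
+// NamedInstrProfRecord::setCSFlagInHash.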
+
+// Check if we can safely rename this Comdat function.
+static bool canRenameComdat(
+ Function &F,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
+ return false;
+
+ // FIXME: Currently we only handle Comdat groups that contain only one
// function.
- // (1) For a Comdat group containing multiple functions, we need to have a
- // unique postfix based on the hashes for each function. There is a
- // non-trivial code refactoring to do this efficiently.
- // (2) Variables can not be renamed, so we can not rename Comdat function in a
- // group including global vars.
- Comdat *C = F.getComdat();
- for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
+ // (1) For a Comdat group containing multiple functions, we need to have a
+ // unique postfix based on the hashes for each function. There is a
+ // non-trivial code refactoring to do this efficiently.
+ // (2) Variables can not be renamed, so we can not rename Comdat function in a
+ // group including global vars.
+ Comdat *C = F.getComdat();
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
assert(!isa<GlobalAlias>(CM.second));
- Function *FM = dyn_cast<Function>(CM.second);
- if (FM != &F)
- return false;
- }
- return true;
-}
-
-// Append the CFGHash to the Comdat function name.
-template <class Edge, class BBInfo>
-void FuncPGOInstrumentation<Edge, BBInfo>::renameComdatFunction() {
- if (!canRenameComdat(F, ComdatMembers))
- return;
- std::string OrigName = F.getName().str();
- std::string NewFuncName =
- Twine(F.getName() + "." + Twine(FunctionHash)).str();
- F.setName(Twine(NewFuncName));
- GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigName, &F);
- FuncName = Twine(FuncName + "." + Twine(FunctionHash)).str();
- Comdat *NewComdat;
- Module *M = F.getParent();
- // For AvailableExternallyLinkage functions, change the linkage to
- // LinkOnceODR and put them into comdat. This is because after renaming, there
- // is no backup external copy available for the function.
- if (!F.hasComdat()) {
- assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
- NewComdat = M->getOrInsertComdat(StringRef(NewFuncName));
- F.setLinkage(GlobalValue::LinkOnceODRLinkage);
- F.setComdat(NewComdat);
- return;
- }
-
- // This function belongs to a single function Comdat group.
- Comdat *OrigComdat = F.getComdat();
- std::string NewComdatName =
- Twine(OrigComdat->getName() + "." + Twine(FunctionHash)).str();
- NewComdat = M->getOrInsertComdat(StringRef(NewComdatName));
- NewComdat->setSelectionKind(OrigComdat->getSelectionKind());
-
- for (auto &&CM : make_range(ComdatMembers.equal_range(OrigComdat))) {
- // Must be a function.
+ Function *FM = dyn_cast<Function>(CM.second);
+ if (FM != &F)
+ return false;
+ }
+ return true;
+}
+
+// Append the CFGHash to the Comdat function name.
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::renameComdatFunction() {
+ if (!canRenameComdat(F, ComdatMembers))
+ return;
+ std::string OrigName = F.getName().str();
+ std::string NewFuncName =
+ Twine(F.getName() + "." + Twine(FunctionHash)).str();
+ F.setName(Twine(NewFuncName));
+ GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigName, &F);
+ FuncName = Twine(FuncName + "." + Twine(FunctionHash)).str();
+ Comdat *NewComdat;
+ Module *M = F.getParent();
+ // For AvailableExternallyLinkage functions, change the linkage to
+ // LinkOnceODR and put them into comdat. This is because after renaming, there
+ // is no backup external copy available for the function.
+ if (!F.hasComdat()) {
+ assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
+ NewComdat = M->getOrInsertComdat(StringRef(NewFuncName));
+ F.setLinkage(GlobalValue::LinkOnceODRLinkage);
+ F.setComdat(NewComdat);
+ return;
+ }
+
+ // This function belongs to a single function Comdat group.
+ Comdat *OrigComdat = F.getComdat();
+ std::string NewComdatName =
+ Twine(OrigComdat->getName() + "." + Twine(FunctionHash)).str();
+ NewComdat = M->getOrInsertComdat(StringRef(NewComdatName));
+ NewComdat->setSelectionKind(OrigComdat->getSelectionKind());
+
+ for (auto &&CM : make_range(ComdatMembers.equal_range(OrigComdat))) {
+ // Must be a function.
cast<Function>(CM.second)->setComdat(NewComdat);
- }
-}
-
-// Collect all the BBs that will be instrumented and return them in
-// InstrumentBBs, and set up InEdges/OutEdges for UseBBInfo.
-template <class Edge, class BBInfo>
-void FuncPGOInstrumentation<Edge, BBInfo>::getInstrumentBBs(
- std::vector<BasicBlock *> &InstrumentBBs) {
- // Use a worklist as we will update the vector during the iteration.
- std::vector<Edge *> EdgeList;
- EdgeList.reserve(MST.AllEdges.size());
- for (auto &E : MST.AllEdges)
- EdgeList.push_back(E.get());
-
- for (auto &E : EdgeList) {
- BasicBlock *InstrBB = getInstrBB(E);
- if (InstrBB)
- InstrumentBBs.push_back(InstrBB);
- }
-
- // Set up InEdges/OutEdges for all BBs.
- for (auto &E : MST.AllEdges) {
- if (E->Removed)
- continue;
- const BasicBlock *SrcBB = E->SrcBB;
- const BasicBlock *DestBB = E->DestBB;
- BBInfo &SrcInfo = getBBInfo(SrcBB);
- BBInfo &DestInfo = getBBInfo(DestBB);
- SrcInfo.addOutEdge(E.get());
- DestInfo.addInEdge(E.get());
- }
-}
-
-// Given a CFG edge E to be instrumented, find the BB in which to place the
-// instrumented code. The function will split the critical edge if necessary.
-template <class Edge, class BBInfo>
-BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
- if (E->InMST || E->Removed)
- return nullptr;
-
- BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
- BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
- // For a fake edge, instrument the real BB.
- if (SrcBB == nullptr)
- return DestBB;
- if (DestBB == nullptr)
- return SrcBB;
-
- auto canInstrument = [](BasicBlock *BB) -> BasicBlock * {
- // There are basic blocks (such as catchswitch) that cannot be instrumented.
- // If the returned first insertion point is the end of BB, skip this BB.
- if (BB->getFirstInsertionPt() == BB->end())
- return nullptr;
- return BB;
- };
-
- // Instrument the SrcBB if it has a single successor,
- // otherwise, the DestBB if this is not a critical edge.
- Instruction *TI = SrcBB->getTerminator();
- if (TI->getNumSuccessors() <= 1)
- return canInstrument(SrcBB);
- if (!E->IsCritical)
- return canInstrument(DestBB);
-
+ }
+}
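+
+// Example (hypothetical name/hash): a comdat function "foo" with FunctionHash
+// 123 is renamed to "foo.123", a weak alias "foo" pointing at it is created,
+// and its single-function comdat group is renamed with the same ".123"
+// suffix, keeping the original selection kind.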
+
+// Collect all the BBs that will be instrumented and return them in
+// InstrumentBBs, and set up InEdges/OutEdges for UseBBInfo.
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::getInstrumentBBs(
+ std::vector<BasicBlock *> &InstrumentBBs) {
+ // Use a worklist as we will update the vector during the iteration.
+ std::vector<Edge *> EdgeList;
+ EdgeList.reserve(MST.AllEdges.size());
+ for (auto &E : MST.AllEdges)
+ EdgeList.push_back(E.get());
+
+ for (auto &E : EdgeList) {
+ BasicBlock *InstrBB = getInstrBB(E);
+ if (InstrBB)
+ InstrumentBBs.push_back(InstrBB);
+ }
+
+ // Set up InEdges/OutEdges for all BBs.
+ for (auto &E : MST.AllEdges) {
+ if (E->Removed)
+ continue;
+ const BasicBlock *SrcBB = E->SrcBB;
+ const BasicBlock *DestBB = E->DestBB;
+ BBInfo &SrcInfo = getBBInfo(SrcBB);
+ BBInfo &DestInfo = getBBInfo(DestBB);
+ SrcInfo.addOutEdge(E.get());
+ DestInfo.addInEdge(E.get());
+ }
+}
+
+// Given a CFG edge E to be instrumented, find the BB in which to place the
+// instrumented code. The function will split the critical edge if necessary.
+template <class Edge, class BBInfo>
+BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
+ if (E->InMST || E->Removed)
+ return nullptr;
+
+ BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
+ BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
+ // For a fake edge, instrument the real BB.
+ if (SrcBB == nullptr)
+ return DestBB;
+ if (DestBB == nullptr)
+ return SrcBB;
+
+ auto canInstrument = [](BasicBlock *BB) -> BasicBlock * {
+ // There are basic blocks (such as catchswitch) that cannot be instrumented.
+ // If the returned first insertion point is the end of BB, skip this BB.
+ if (BB->getFirstInsertionPt() == BB->end())
+ return nullptr;
+ return BB;
+ };
+
+ // Instrument the SrcBB if it has a single successor,
+ // otherwise, the DestBB if this is not a critical edge.
+ Instruction *TI = SrcBB->getTerminator();
+ if (TI->getNumSuccessors() <= 1)
+ return canInstrument(SrcBB);
+ if (!E->IsCritical)
+ return canInstrument(DestBB);
+
// Some IndirectBr critical edges cannot be split by the previous
// SplitIndirectBrCriticalEdges call. Bail out.
- unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
+ unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
BasicBlock *InstrBB =
isa<IndirectBrInst>(TI) ? nullptr : SplitCriticalEdge(TI, SuccNum);
- if (!InstrBB) {
- LLVM_DEBUG(
- dbgs() << "Fail to split critical edge: not instrument this edge.\n");
- return nullptr;
- }
- // For a critical edge, we have to split. Instrument the newly
- // created BB.
- IsCS ? NumOfCSPGOSplit++ : NumOfPGOSplit++;
- LLVM_DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index
- << " --> " << getBBInfo(DestBB).Index << "\n");
- // Need to add two new edges. First one: Add new edge of SrcBB->InstrBB.
- MST.addEdge(SrcBB, InstrBB, 0);
- // Second one: Add new edge of InstrBB->DestBB.
- Edge &NewEdge1 = MST.addEdge(InstrBB, DestBB, 0);
- NewEdge1.InMST = true;
- E->Removed = true;
-
- return canInstrument(InstrBB);
-}
-
-// When generating value profiling calls on Windows routines that make use of
-// handler funclets for exception processing, an operand bundle needs to be attached
-// to the called function. This routine will set \p OpBundles to contain the
-// funclet information, if any is needed, that should be placed on the generated
-// value profiling call for the value profile candidate call.
-static void
-populateEHOperandBundle(VPCandidateInfo &Cand,
- DenseMap<BasicBlock *, ColorVector> &BlockColors,
- SmallVectorImpl<OperandBundleDef> &OpBundles) {
- auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst);
- if (OrigCall && !isa<IntrinsicInst>(OrigCall)) {
- // The instrumentation call should belong to the same funclet as a
- // non-intrinsic call, so just copy the operand bundle, if any exists.
- Optional<OperandBundleUse> ParentFunclet =
- OrigCall->getOperandBundle(LLVMContext::OB_funclet);
- if (ParentFunclet)
- OpBundles.emplace_back(OperandBundleDef(*ParentFunclet));
- } else {
- // Intrinsics or other instructions do not get funclet information from the
- // front-end. Need to use the BlockColors that was computed by the routine
- // colorEHFunclets to determine whether a funclet is needed.
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(OrigCall->getParent())->second;
- assert(CV.size() == 1 && "non-unique color for block!");
- Instruction *EHPad = CV.front()->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
- }
-}
-
-// Visit all edges and instrument those not in the MST, and do value profiling.
-// Critical edges will be split.
-static void instrumentOneFunc(
- Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI,
- BlockFrequencyInfo *BFI,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
- bool IsCS) {
- // Split indirectbr critical edges here before computing the MST rather than
- // later in getInstrBB() to avoid invalidating it.
- SplitIndirectBrCriticalEdges(F, BPI, BFI);
-
+ if (!InstrBB) {
+ LLVM_DEBUG(
+ dbgs() << "Fail to split critical edge: not instrument this edge.\n");
+ return nullptr;
+ }
+ // For a critical edge, we have to split. Instrument the newly
+ // created BB.
+ IsCS ? NumOfCSPGOSplit++ : NumOfPGOSplit++;
+ LLVM_DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index
+ << " --> " << getBBInfo(DestBB).Index << "\n");
+ // Need to add two new edges. First one: Add new edge of SrcBB->InstrBB.
+ MST.addEdge(SrcBB, InstrBB, 0);
+ // Second one: Add new edge of InstrBB->DestBB.
+ Edge &NewEdge1 = MST.addEdge(InstrBB, DestBB, 0);
+ NewEdge1.InMST = true;
+ E->Removed = true;
+
+ return canInstrument(InstrBB);
+}
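+
+// In summary, for an edge A->B: if A has a single successor, the counter goes
+// into A; otherwise, if the edge is not critical, it goes into B; otherwise
+// the edge is split and the counter goes into the newly created block (unless
+// splitting fails, in which case the edge is skipped).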
+
+// When generating value profiling calls on Windows routines that make use of
+// handler funclets for exception processing, an operand bundle needs to be attached
+// to the called function. This routine will set \p OpBundles to contain the
+// funclet information, if any is needed, that should be placed on the generated
+// value profiling call for the value profile candidate call.
+static void
+populateEHOperandBundle(VPCandidateInfo &Cand,
+ DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ SmallVectorImpl<OperandBundleDef> &OpBundles) {
+ auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst);
+ if (OrigCall && !isa<IntrinsicInst>(OrigCall)) {
+ // The instrumentation call should belong to the same funclet as a
+ // non-intrinsic call, so just copy the operand bundle, if any exists.
+ Optional<OperandBundleUse> ParentFunclet =
+ OrigCall->getOperandBundle(LLVMContext::OB_funclet);
+ if (ParentFunclet)
+ OpBundles.emplace_back(OperandBundleDef(*ParentFunclet));
+ } else {
+ // Intrinsics or other instructions do not get funclet information from the
+ // front-end. Need to use the BlockColors that was computed by the routine
+ // colorEHFunclets to determine whether a funclet is needed.
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(OrigCall->getParent())->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+ }
+}
+
+// Visit all edges and instrument those not in the MST, and do value profiling.
+// Critical edges will be split.
+static void instrumentOneFunc(
+ Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI,
+ BlockFrequencyInfo *BFI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ bool IsCS) {
+ // Split indirectbr critical edges here before computing the MST rather than
+ // later in getInstrBB() to avoid invalidating it.
+ SplitIndirectBrCriticalEdges(F, BPI, BFI);
+
FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(
F, TLI, ComdatMembers, true, BPI, BFI, IsCS, PGOInstrumentEntry);
- std::vector<BasicBlock *> InstrumentBBs;
- FuncInfo.getInstrumentBBs(InstrumentBBs);
- unsigned NumCounters =
- InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
-
- uint32_t I = 0;
- Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
- for (auto *InstrBB : InstrumentBBs) {
- IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt());
- assert(Builder.GetInsertPoint() != InstrBB->end() &&
- "Cannot get the Instrumentation point");
- Builder.CreateCall(
- Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment),
- {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters),
- Builder.getInt32(I++)});
- }
-
- // Now instrument select instructions:
- FuncInfo.SIVisitor.instrumentSelects(F, &I, NumCounters, FuncInfo.FuncNameVar,
- FuncInfo.FunctionHash);
- assert(I == NumCounters);
-
- if (DisableValueProfiling)
- return;
-
- NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size();
-
- // Intrinsic function calls do not have funclet operand bundles needed for
- // Windows exception handling attached to them. However, if value profiling is
- // inserted for one of these calls, then a funclet value will need to be set
- // on the instrumentation call based on the funclet coloring.
- DenseMap<BasicBlock *, ColorVector> BlockColors;
- if (F.hasPersonalityFn() &&
- isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- BlockColors = colorEHFunclets(F);
-
- // For each VP Kind, walk the VP candidates and instrument each one.
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) {
- unsigned SiteIndex = 0;
- if (Kind == IPVK_MemOPSize && !PGOInstrMemOP)
- continue;
-
- for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) {
- LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind]
- << " site: CallSite Index = " << SiteIndex << "\n");
-
- IRBuilder<> Builder(Cand.InsertPt);
- assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() &&
- "Cannot get the Instrumentation point");
-
- Value *ToProfile = nullptr;
- if (Cand.V->getType()->isIntegerTy())
- ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty());
- else if (Cand.V->getType()->isPointerTy())
- ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
- assert(ToProfile && "value profiling Value is of unexpected type");
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- populateEHOperandBundle(Cand, BlockColors, OpBundles);
- Builder.CreateCall(
- Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
- {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
- Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
- OpBundles);
- }
- } // IPVK_First <= Kind <= IPVK_Last
-}
-
-namespace {
-
-// This class represents a CFG edge in profile use compilation.
-struct PGOUseEdge : public PGOEdge {
- bool CountValid = false;
- uint64_t CountValue = 0;
-
- PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
- : PGOEdge(Src, Dest, W) {}
-
- // Set edge count value
- void setEdgeCount(uint64_t Value) {
- CountValue = Value;
- CountValid = true;
- }
-
- // Return the information string for this object.
- const std::string infoString() const {
- if (!CountValid)
- return PGOEdge::infoString();
- return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue))
- .str();
- }
-};
-
-using DirectEdges = SmallVector<PGOUseEdge *, 2>;
-
-// This class stores the auxiliary information for each BB.
-struct UseBBInfo : public BBInfo {
- uint64_t CountValue = 0;
- bool CountValid;
- int32_t UnknownCountInEdge = 0;
- int32_t UnknownCountOutEdge = 0;
- DirectEdges InEdges;
- DirectEdges OutEdges;
-
- UseBBInfo(unsigned IX) : BBInfo(IX), CountValid(false) {}
-
- UseBBInfo(unsigned IX, uint64_t C)
- : BBInfo(IX), CountValue(C), CountValid(true) {}
-
- // Set the profile count value for this BB.
- void setBBInfoCount(uint64_t Value) {
- CountValue = Value;
- CountValid = true;
- }
-
- // Return the information string of this object.
- const std::string infoString() const {
- if (!CountValid)
- return BBInfo::infoString();
- return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str();
- }
-
- // Add an OutEdge and update the edge count.
- void addOutEdge(PGOUseEdge *E) {
- OutEdges.push_back(E);
- UnknownCountOutEdge++;
- }
-
- // Add an InEdge and update the edge count.
- void addInEdge(PGOUseEdge *E) {
- InEdges.push_back(E);
- UnknownCountInEdge++;
- }
-};
-
-} // end anonymous namespace
-
-// Sum up the count values for all the edges.
-static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
- uint64_t Total = 0;
- for (auto &E : Edges) {
- if (E->Removed)
- continue;
- Total += E->CountValue;
- }
- return Total;
-}
-
-namespace {
-
-class PGOUseFunc {
-public:
- PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
- BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
+ std::vector<BasicBlock *> InstrumentBBs;
+ FuncInfo.getInstrumentBBs(InstrumentBBs);
+ unsigned NumCounters =
+ InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+
+ uint32_t I = 0;
+ Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
+ for (auto *InstrBB : InstrumentBBs) {
+ IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt());
+ assert(Builder.GetInsertPoint() != InstrBB->end() &&
+ "Cannot get the Instrumentation point");
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment),
+ {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters),
+ Builder.getInt32(I++)});
+ }
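+
+ // For reference, each call created above has the documented form of the
+ // increment intrinsic (see LLVM LangRef):
+ //   call void @llvm.instrprof.increment(i8* <name>, i64 <hash>,
+ //                                       i32 <num-counters>, i32 <index>)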
+
+ // Now instrument select instructions:
+ FuncInfo.SIVisitor.instrumentSelects(F, &I, NumCounters, FuncInfo.FuncNameVar,
+ FuncInfo.FunctionHash);
+ assert(I == NumCounters);
+
+ if (DisableValueProfiling)
+ return;
+
+ NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size();
+
+ // Intrinsic function calls do not have funclet operand bundles needed for
+ // Windows exception handling attached to them. However, if value profiling is
+ // inserted for one of these calls, then a funclet value will need to be set
+ // on the instrumentation call based on the funclet coloring.
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ // For each VP Kind, walk the VP candidates and instrument each one.
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) {
+ unsigned SiteIndex = 0;
+ if (Kind == IPVK_MemOPSize && !PGOInstrMemOP)
+ continue;
+
+ for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) {
+ LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind]
+ << " site: CallSite Index = " << SiteIndex << "\n");
+
+ IRBuilder<> Builder(Cand.InsertPt);
+ assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() &&
+ "Cannot get the Instrumentation point");
+
+ Value *ToProfile = nullptr;
+ if (Cand.V->getType()->isIntegerTy())
+ ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty());
+ else if (Cand.V->getType()->isPointerTy())
+ ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
+ assert(ToProfile && "value profiling Value is of unexpected type");
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ populateEHOperandBundle(Cand, BlockColors, OpBundles);
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+ {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
+ Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
+ OpBundles);
+ }
+ } // IPVK_First <= Kind <= IPVK_Last
+}
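+
+// For reference, the value-profiling calls built above have the documented
+// form of the intrinsic (see LLVM LangRef):
+//   call void @llvm.instrprof.value.profile(i8* <name>, i64 <hash>,
+//                                           i64 <value>, i32 <value_kind>,
+//                                           i32 <index>)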
+
+namespace {
+
+// This class represents a CFG edge in profile use compilation.
+struct PGOUseEdge : public PGOEdge {
+ bool CountValid = false;
+ uint64_t CountValue = 0;
+
+ PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
+ : PGOEdge(Src, Dest, W) {}
+
+ // Set edge count value
+ void setEdgeCount(uint64_t Value) {
+ CountValue = Value;
+ CountValid = true;
+ }
+
+ // Return the information string for this object.
+ const std::string infoString() const {
+ if (!CountValid)
+ return PGOEdge::infoString();
+ return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue))
+ .str();
+ }
+};
+
+using DirectEdges = SmallVector<PGOUseEdge *, 2>;
+
+// This class stores the auxiliary information for each BB.
+struct UseBBInfo : public BBInfo {
+ uint64_t CountValue = 0;
+ bool CountValid;
+ int32_t UnknownCountInEdge = 0;
+ int32_t UnknownCountOutEdge = 0;
+ DirectEdges InEdges;
+ DirectEdges OutEdges;
+
+ UseBBInfo(unsigned IX) : BBInfo(IX), CountValid(false) {}
+
+ UseBBInfo(unsigned IX, uint64_t C)
+ : BBInfo(IX), CountValue(C), CountValid(true) {}
+
+ // Set the profile count value for this BB.
+ void setBBInfoCount(uint64_t Value) {
+ CountValue = Value;
+ CountValid = true;
+ }
+
+ // Return the information string of this object.
+ const std::string infoString() const {
+ if (!CountValid)
+ return BBInfo::infoString();
+ return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str();
+ }
+
+ // Add an OutEdge and update the edge count.
+ void addOutEdge(PGOUseEdge *E) {
+ OutEdges.push_back(E);
+ UnknownCountOutEdge++;
+ }
+
+ // Add an InEdge and update the edge count.
+ void addInEdge(PGOUseEdge *E) {
+ InEdges.push_back(E);
+ UnknownCountInEdge++;
+ }
+};
+
+} // end anonymous namespace
+
+// Sum up the count values for all the edges.
+static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
+ uint64_t Total = 0;
+ for (auto &E : Edges) {
+ if (E->Removed)
+ continue;
+ Total += E->CountValue;
+ }
+ return Total;
+}
+
+namespace {
+
+class PGOUseFunc {
+public:
+ PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
ProfileSummaryInfo *PSI, bool IsCS, bool InstrumentFuncEntry)
- : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
+ : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS,
InstrumentFuncEntry),
- FreqAttr(FFA_Normal), IsCS(IsCS) {}
-
- // Read counts for the instrumented BB from profile.
+ FreqAttr(FFA_Normal), IsCS(IsCS) {}
+
+ // Read counts for the instrumented BB from profile.
bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
bool &AllMinusOnes);
-
- // Populate the counts for all BBs.
- void populateCounters();
-
- // Set the branch weights based on the count values.
- void setBranchWeights();
-
- // Annotate the value profile call sites for all value kinds.
- void annotateValueSites();
-
- // Annotate the value profile call sites for one value kind.
- void annotateValueSites(uint32_t Kind);
-
- // Annotate the irreducible loop header weights.
- void annotateIrrLoopHeaderWeights();
-
- // The hotness of the function from the profile count.
- enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
-
- // Return the function hotness from the profile.
- FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
-
- // Return the function hash.
- uint64_t getFuncHash() const { return FuncInfo.FunctionHash; }
-
- // Return the profile record for this function;
- InstrProfRecord &getProfileRecord() { return ProfileRecord; }
-
- // Return the auxiliary BB information.
- UseBBInfo &getBBInfo(const BasicBlock *BB) const {
- return FuncInfo.getBBInfo(BB);
- }
-
- // Return the auxiliary BB information if available.
- UseBBInfo *findBBInfo(const BasicBlock *BB) const {
- return FuncInfo.findBBInfo(BB);
- }
-
- Function &getFunc() const { return F; }
-
- void dumpInfo(std::string Str = "") const {
- FuncInfo.dumpInfo(Str);
- }
-
- uint64_t getProgramMaxCount() const { return ProgramMaxCount; }
-private:
- Function &F;
- Module *M;
- BlockFrequencyInfo *BFI;
- ProfileSummaryInfo *PSI;
-
- // This member stores the shared information with class PGOGenFunc.
- FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
-
- // The maximum count value in the profile. This is only used in PGO use
- // compilation.
- uint64_t ProgramMaxCount;
-
- // Position of counter that remains to be read.
- uint32_t CountPosition = 0;
-
- // Total size of the profile count for this function.
- uint32_t ProfileCountSize = 0;
-
- // ProfileRecord for this function.
- InstrProfRecord ProfileRecord;
-
- // Function hotness info derived from profile.
- FuncFreqAttr FreqAttr;
-
- // Whether to use the context-sensitive profile.
- bool IsCS;
-
- // Find the Instrumented BB and set the value. Return false on error.
- bool setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
-
- // Set the edge counter value for the unknown edge -- there should be only
- // one unknown edge.
- void setEdgeCount(DirectEdges &Edges, uint64_t Value);
-
- // Return FuncName string;
- const std::string getFuncName() const { return FuncInfo.FuncName; }
-
- // Set the hot/cold inline hints based on the count values.
- // FIXME: This function should be removed once the functionality in
- // the inliner is implemented.
- void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
- if (PSI->isHotCount(EntryCount))
- FreqAttr = FFA_Hot;
- else if (PSI->isColdCount(MaxCount))
- FreqAttr = FFA_Cold;
- }
-};
-
-} // end anonymous namespace
-
-// Visit all the edges and assign the count value for the instrumented
-// edges and the BB. Return false on error.
-bool PGOUseFunc::setInstrumentedCounts(
- const std::vector<uint64_t> &CountFromProfile) {
-
- std::vector<BasicBlock *> InstrumentBBs;
- FuncInfo.getInstrumentBBs(InstrumentBBs);
- unsigned NumCounters =
- InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
- // The number of counters here should match the number of counters
- // in the profile. Return false if they mismatch.
- if (NumCounters != CountFromProfile.size()) {
- return false;
- }
+
+ // Populate the counts for all BBs.
+ void populateCounters();
+
+ // Set the branch weights based on the count values.
+ void setBranchWeights();
+
+ // Annotate the value profile call sites for all value kinds.
+ void annotateValueSites();
+
+ // Annotate the value profile call sites for one value kind.
+ void annotateValueSites(uint32_t Kind);
+
+ // Annotate the irreducible loop header weights.
+ void annotateIrrLoopHeaderWeights();
+
+ // The hotness of the function from the profile count.
+ enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
+
+ // Return the function hotness from the profile.
+ FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
+
+ // Return the function hash.
+ uint64_t getFuncHash() const { return FuncInfo.FunctionHash; }
+
+ // Return the profile record for this function;
+ InstrProfRecord &getProfileRecord() { return ProfileRecord; }
+
+ // Return the auxiliary BB information.
+ UseBBInfo &getBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.getBBInfo(BB);
+ }
+
+ // Return the auxiliary BB information if available.
+ UseBBInfo *findBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.findBBInfo(BB);
+ }
+
+ Function &getFunc() const { return F; }
+
+ void dumpInfo(std::string Str = "") const {
+ FuncInfo.dumpInfo(Str);
+ }
+
+ uint64_t getProgramMaxCount() const { return ProgramMaxCount; }
+private:
+ Function &F;
+ Module *M;
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
+
+ // This member stores the shared information with class PGOGenFunc.
+ FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
+
+ // The maximum count value in the profile. This is only used in PGO use
+ // compilation.
+ uint64_t ProgramMaxCount;
+
+ // Position of counter that remains to be read.
+ uint32_t CountPosition = 0;
+
+ // Total size of the profile count for this function.
+ uint32_t ProfileCountSize = 0;
+
+ // ProfileRecord for this function.
+ InstrProfRecord ProfileRecord;
+
+ // Function hotness info derived from profile.
+ FuncFreqAttr FreqAttr;
+
+ // Whether to use the context-sensitive profile.
+ bool IsCS;
+
+ // Find the Instrumented BB and set the value. Return false on error.
+ bool setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
+
+ // Set the edge counter value for the unknown edge -- there should be only
+ // one unknown edge.
+ void setEdgeCount(DirectEdges &Edges, uint64_t Value);
+
+ // Return FuncName string;
+ const std::string getFuncName() const { return FuncInfo.FuncName; }
+
+ // Set the hot/cold inline hints based on the count values.
+ // FIXME: This function should be removed once the functionality in
+ // the inliner is implemented.
+ void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
+ if (PSI->isHotCount(EntryCount))
+ FreqAttr = FFA_Hot;
+ else if (PSI->isColdCount(MaxCount))
+ FreqAttr = FFA_Cold;
+ }
+};
+
+} // end anonymous namespace
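+
+// Typical use-phase flow for PGOUseFunc (sketch; the pass driver follows
+// roughly this order, with error handling and statistics omitted, and all
+// variable names here are placeholders):
+//   PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS,
+//                   InstrumentFuncEntry);
+//   bool AllZeros = false, AllMinusOnes = false;
+//   if (Func.readCounters(PGOReader, AllZeros, AllMinusOnes)) {
+//     Func.populateCounters();
+//     Func.setBranchWeights();
+//     Func.annotateValueSites();
+//     Func.annotateIrrLoopHeaderWeights();
+//   }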
+
+// Visit all the edges and assign the count value for the instrumented
+// edges and the BB. Return false on error.
+bool PGOUseFunc::setInstrumentedCounts(
+ const std::vector<uint64_t> &CountFromProfile) {
+
+ std::vector<BasicBlock *> InstrumentBBs;
+ FuncInfo.getInstrumentBBs(InstrumentBBs);
+ unsigned NumCounters =
+ InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+ // The number of counters here should match the number of counters
+ // in the profile. Return false if they mismatch.
+ if (NumCounters != CountFromProfile.size()) {
+ return false;
+ }
auto *FuncEntry = &*F.begin();
- // Set the profile count to the Instrumented BBs.
- uint32_t I = 0;
- for (BasicBlock *InstrBB : InstrumentBBs) {
- uint64_t CountValue = CountFromProfile[I++];
- UseBBInfo &Info = getBBInfo(InstrBB);
+ // Set the profile count to the Instrumented BBs.
+ uint32_t I = 0;
+ for (BasicBlock *InstrBB : InstrumentBBs) {
+ uint64_t CountValue = CountFromProfile[I++];
+ UseBBInfo &Info = getBBInfo(InstrBB);
// If we reach here, we know that we have some nonzero count
// values in this function. The entry count should not be 0.
// Fix it if necessary.
if (InstrBB == FuncEntry && CountValue == 0)
CountValue = 1;
- Info.setBBInfoCount(CountValue);
- }
- ProfileCountSize = CountFromProfile.size();
- CountPosition = I;
-
- // Set the edge count and update the count of unknown edges for BBs.
- auto setEdgeCount = [this](PGOUseEdge *E, uint64_t Value) -> void {
- E->setEdgeCount(Value);
- this->getBBInfo(E->SrcBB).UnknownCountOutEdge--;
- this->getBBInfo(E->DestBB).UnknownCountInEdge--;
- };
-
- // Set the profile count for the instrumented edges. There are BBs that are
- // not in the MST and are not instrumented. We need to set the edge count
- // values so that we can populate the profile counts later.
- for (auto &E : FuncInfo.MST.AllEdges) {
- if (E->Removed || E->InMST)
- continue;
- const BasicBlock *SrcBB = E->SrcBB;
- UseBBInfo &SrcInfo = getBBInfo(SrcBB);
-
- // If only one out-edge, the edge profile count should be the same as BB
- // profile count.
- if (SrcInfo.CountValid && SrcInfo.OutEdges.size() == 1)
- setEdgeCount(E.get(), SrcInfo.CountValue);
- else {
- const BasicBlock *DestBB = E->DestBB;
- UseBBInfo &DestInfo = getBBInfo(DestBB);
- // If only one in-edge, the edge profile count should be the same as BB
- // profile count.
- if (DestInfo.CountValid && DestInfo.InEdges.size() == 1)
- setEdgeCount(E.get(), DestInfo.CountValue);
- }
- if (E->CountValid)
- continue;
- // E's count should have been set from profile. If not, this means E skips
- // the instrumentation. We set the count to 0.
- setEdgeCount(E.get(), 0);
- }
- return true;
-}
-
-// Set the count value for the unknown edge. There should be one and only one
-// unknown edge in Edges vector.
-void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
- for (auto &E : Edges) {
- if (E->CountValid)
- continue;
- E->setEdgeCount(Value);
-
- getBBInfo(E->SrcBB).UnknownCountOutEdge--;
- getBBInfo(E->DestBB).UnknownCountInEdge--;
- return;
- }
- llvm_unreachable("Cannot find the unknown count edge");
-}
-
-// Read the profile from ProfileFileName and assign the value to the
-// instrumented BB and the edges. This function also updates ProgramMaxCount.
-// Return true if the profile is successfully read, and false on errors.
+ Info.setBBInfoCount(CountValue);
+ }
+ ProfileCountSize = CountFromProfile.size();
+ CountPosition = I;
+
+ // Set the edge count and update the count of unknown edges for BBs.
+ auto setEdgeCount = [this](PGOUseEdge *E, uint64_t Value) -> void {
+ E->setEdgeCount(Value);
+ this->getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+ this->getBBInfo(E->DestBB).UnknownCountInEdge--;
+ };
+
+ // Set the profile count for the instrumented edges. There are BBs that are
+ // not in the MST and are not instrumented. We need to set the edge count
+ // values so that we can populate the profile counts later.
+ for (auto &E : FuncInfo.MST.AllEdges) {
+ if (E->Removed || E->InMST)
+ continue;
+ const BasicBlock *SrcBB = E->SrcBB;
+ UseBBInfo &SrcInfo = getBBInfo(SrcBB);
+
+ // If only one out-edge, the edge profile count should be the same as BB
+ // profile count.
+ if (SrcInfo.CountValid && SrcInfo.OutEdges.size() == 1)
+ setEdgeCount(E.get(), SrcInfo.CountValue);
+ else {
+ const BasicBlock *DestBB = E->DestBB;
+ UseBBInfo &DestInfo = getBBInfo(DestBB);
+ // If only one in-edge, the edge profile count should be the same as BB
+ // profile count.
+ if (DestInfo.CountValid && DestInfo.InEdges.size() == 1)
+ setEdgeCount(E.get(), DestInfo.CountValue);
+ }
+ if (E->CountValid)
+ continue;
+ // E's count should have been set from profile. If not, this means E skips
+ // the instrumentation. We set the count to 0.
+ setEdgeCount(E.get(), 0);
+ }
+ return true;
+}
+
+// Set the count value for the unknown edge. There should be one and only one
+// unknown edge in Edges vector.
+void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
+ for (auto &E : Edges) {
+ if (E->CountValid)
+ continue;
+ E->setEdgeCount(Value);
+
+ getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+ getBBInfo(E->DestBB).UnknownCountInEdge--;
+ return;
+ }
+ llvm_unreachable("Cannot find the unknown count edge");
+}
+
+// Read the profile from ProfileFileName and assign the value to the
+// instrumented BB and the edges. This function also updates ProgramMaxCount.
+// Return true if the profile is successfully read, and false on errors.
bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
bool &AllMinusOnes) {
- auto &Ctx = M->getContext();
- Expected<InstrProfRecord> Result =
- PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
- if (Error E = Result.takeError()) {
- handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
- auto Err = IPE.get();
- bool SkipWarning = false;
- LLVM_DEBUG(dbgs() << "Error in reading profile for Func "
- << FuncInfo.FuncName << ": ");
- if (Err == instrprof_error::unknown_function) {
- IsCS ? NumOfCSPGOMissing++ : NumOfPGOMissing++;
- SkipWarning = !PGOWarnMissing;
- LLVM_DEBUG(dbgs() << "unknown function");
- } else if (Err == instrprof_error::hash_mismatch ||
- Err == instrprof_error::malformed) {
- IsCS ? NumOfCSPGOMismatch++ : NumOfPGOMismatch++;
- SkipWarning =
- NoPGOWarnMismatch ||
- (NoPGOWarnMismatchComdat &&
- (F.hasComdat() ||
- F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
- LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
- }
-
- LLVM_DEBUG(dbgs() << " IsCS=" << IsCS << "\n");
- if (SkipWarning)
- return;
-
- std::string Msg = IPE.message() + std::string(" ") + F.getName().str() +
- std::string(" Hash = ") +
- std::to_string(FuncInfo.FunctionHash);
-
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
- });
- return false;
- }
- ProfileRecord = std::move(Result.get());
- std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
-
- IsCS ? NumOfCSPGOFunc++ : NumOfPGOFunc++;
- LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
+ auto &Ctx = M->getContext();
+ Expected<InstrProfRecord> Result =
+ PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
+ if (Error E = Result.takeError()) {
+ handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+ auto Err = IPE.get();
+ bool SkipWarning = false;
+ LLVM_DEBUG(dbgs() << "Error in reading profile for Func "
+ << FuncInfo.FuncName << ": ");
+ if (Err == instrprof_error::unknown_function) {
+ IsCS ? NumOfCSPGOMissing++ : NumOfPGOMissing++;
+ SkipWarning = !PGOWarnMissing;
+ LLVM_DEBUG(dbgs() << "unknown function");
+ } else if (Err == instrprof_error::hash_mismatch ||
+ Err == instrprof_error::malformed) {
+ IsCS ? NumOfCSPGOMismatch++ : NumOfPGOMismatch++;
+ SkipWarning =
+ NoPGOWarnMismatch ||
+ (NoPGOWarnMismatchComdat &&
+ (F.hasComdat() ||
+ F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
+ LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
+ }
+
+ LLVM_DEBUG(dbgs() << " IsCS=" << IsCS << "\n");
+ if (SkipWarning)
+ return;
+
+ std::string Msg = IPE.message() + std::string(" ") + F.getName().str() +
+ std::string(" Hash = ") +
+ std::to_string(FuncInfo.FunctionHash);
+
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ });
+ return false;
+ }
+ ProfileRecord = std::move(Result.get());
+ std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
+
+ IsCS ? NumOfCSPGOFunc++ : NumOfPGOFunc++;
+ LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
AllMinusOnes = (CountFromProfile.size() > 0);
- uint64_t ValueSum = 0;
- for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
- LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
- ValueSum += CountFromProfile[I];
+ uint64_t ValueSum = 0;
+ for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
+ LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
+ ValueSum += CountFromProfile[I];
if (CountFromProfile[I] != (uint64_t)-1)
AllMinusOnes = false;
- }
- AllZeros = (ValueSum == 0);
-
- LLVM_DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
-
- getBBInfo(nullptr).UnknownCountOutEdge = 2;
- getBBInfo(nullptr).UnknownCountInEdge = 2;
-
- if (!setInstrumentedCounts(CountFromProfile)) {
- LLVM_DEBUG(
- dbgs() << "Inconsistent number of counts, skipping this function");
- Ctx.diagnose(DiagnosticInfoPGOProfile(
- M->getName().data(),
- Twine("Inconsistent number of counts in ") + F.getName().str()
- + Twine(": the profile may be stale or there is a function name collision."),
- DS_Warning));
- return false;
- }
- ProgramMaxCount = PGOReader->getMaximumFunctionCount(IsCS);
- return true;
-}
-
-// Populate the counters from instrumented BBs to all BBs.
-// At the end of this operation, all BBs should have a valid count value.
-void PGOUseFunc::populateCounters() {
- bool Changes = true;
- unsigned NumPasses = 0;
- while (Changes) {
- NumPasses++;
- Changes = false;
-
- // For efficient traversal, it's better to start from the end as most
- // of the instrumented edges are at the end.
- for (auto &BB : reverse(F)) {
- UseBBInfo *Count = findBBInfo(&BB);
- if (Count == nullptr)
- continue;
- if (!Count->CountValid) {
- if (Count->UnknownCountOutEdge == 0) {
- Count->CountValue = sumEdgeCount(Count->OutEdges);
- Count->CountValid = true;
- Changes = true;
- } else if (Count->UnknownCountInEdge == 0) {
- Count->CountValue = sumEdgeCount(Count->InEdges);
- Count->CountValid = true;
- Changes = true;
- }
- }
- if (Count->CountValid) {
- if (Count->UnknownCountOutEdge == 1) {
- uint64_t Total = 0;
- uint64_t OutSum = sumEdgeCount(Count->OutEdges);
-          // If one of the successor blocks can terminate early (no-return),
-          // we can end up with a situation where the out-edge sum count is
-          // larger, as the source BB's count is collected by a post-dominated
-          // block.
- if (Count->CountValue > OutSum)
- Total = Count->CountValue - OutSum;
- setEdgeCount(Count->OutEdges, Total);
- Changes = true;
- }
- if (Count->UnknownCountInEdge == 1) {
- uint64_t Total = 0;
- uint64_t InSum = sumEdgeCount(Count->InEdges);
- if (Count->CountValue > InSum)
- Total = Count->CountValue - InSum;
- setEdgeCount(Count->InEdges, Total);
- Changes = true;
- }
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
-#ifndef NDEBUG
- // Assert every BB has a valid counter.
- for (auto &BB : F) {
- auto BI = findBBInfo(&BB);
- if (BI == nullptr)
- continue;
- assert(BI->CountValid && "BB count is not valid");
- }
-#endif
- uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
- uint64_t FuncMaxCount = FuncEntryCount;
- for (auto &BB : F) {
- auto BI = findBBInfo(&BB);
- if (BI == nullptr)
- continue;
- FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
- }
+ }
+ AllZeros = (ValueSum == 0);
+
+ LLVM_DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
+
+ getBBInfo(nullptr).UnknownCountOutEdge = 2;
+ getBBInfo(nullptr).UnknownCountInEdge = 2;
+
+ if (!setInstrumentedCounts(CountFromProfile)) {
+ LLVM_DEBUG(
+ dbgs() << "Inconsistent number of counts, skipping this function");
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ M->getName().data(),
+ Twine("Inconsistent number of counts in ") + F.getName().str()
+ + Twine(": the profile may be stale or there is a function name collision."),
+ DS_Warning));
+ return false;
+ }
+ ProgramMaxCount = PGOReader->getMaximumFunctionCount(IsCS);
+ return true;
+}
+
+// Populate the counters from instrumented BBs to all BBs.
+// At the end of this operation, all BBs should have a valid count value.
+void PGOUseFunc::populateCounters() {
+ bool Changes = true;
+ unsigned NumPasses = 0;
+ while (Changes) {
+ NumPasses++;
+ Changes = false;
+
+ // For efficient traversal, it's better to start from the end as most
+ // of the instrumented edges are at the end.
+ for (auto &BB : reverse(F)) {
+ UseBBInfo *Count = findBBInfo(&BB);
+ if (Count == nullptr)
+ continue;
+ if (!Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->OutEdges);
+ Count->CountValid = true;
+ Changes = true;
+ } else if (Count->UnknownCountInEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->InEdges);
+ Count->CountValid = true;
+ Changes = true;
+ }
+ }
+ if (Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t OutSum = sumEdgeCount(Count->OutEdges);
+          // If one of the successor blocks can terminate early (no-return),
+          // we can end up with a situation where the out-edge sum count is
+          // larger, as the source BB's count is collected by a post-dominated
+          // block.
+ if (Count->CountValue > OutSum)
+ Total = Count->CountValue - OutSum;
+ setEdgeCount(Count->OutEdges, Total);
+ Changes = true;
+ }
+ if (Count->UnknownCountInEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t InSum = sumEdgeCount(Count->InEdges);
+ if (Count->CountValue > InSum)
+ Total = Count->CountValue - InSum;
+ setEdgeCount(Count->InEdges, Total);
+ Changes = true;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
+#ifndef NDEBUG
+ // Assert every BB has a valid counter.
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ assert(BI->CountValid && "BB count is not valid");
+ }
+#endif
+ uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
+ uint64_t FuncMaxCount = FuncEntryCount;
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
+ }
// Fix the obviously inconsistent entry count.
if (FuncMaxCount > 0 && FuncEntryCount == 0)
FuncEntryCount = 1;
F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
- markFunctionAttributes(FuncEntryCount, FuncMaxCount);
-
- // Now annotate select instructions
- FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
- assert(CountPosition == ProfileCountSize);
-
- LLVM_DEBUG(FuncInfo.dumpInfo("after reading profile."));
-}
-
-// Assign the scaled count values to the BB with multiple out edges.
-void PGOUseFunc::setBranchWeights() {
- // Generate MD_prof metadata for every branch instruction.
- LLVM_DEBUG(dbgs() << "\nSetting branch weights for func " << F.getName()
- << " IsCS=" << IsCS << "\n");
- for (auto &BB : F) {
- Instruction *TI = BB.getTerminator();
- if (TI->getNumSuccessors() < 2)
- continue;
- if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
- isa<IndirectBrInst>(TI) || isa<InvokeInst>(TI)))
- continue;
-
- if (getBBInfo(&BB).CountValue == 0)
- continue;
-
- // We have a non-zero Branch BB.
- const UseBBInfo &BBCountInfo = getBBInfo(&BB);
- unsigned Size = BBCountInfo.OutEdges.size();
- SmallVector<uint64_t, 2> EdgeCounts(Size, 0);
- uint64_t MaxCount = 0;
- for (unsigned s = 0; s < Size; s++) {
- const PGOUseEdge *E = BBCountInfo.OutEdges[s];
- const BasicBlock *SrcBB = E->SrcBB;
- const BasicBlock *DestBB = E->DestBB;
- if (DestBB == nullptr)
- continue;
- unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
- uint64_t EdgeCount = E->CountValue;
- if (EdgeCount > MaxCount)
- MaxCount = EdgeCount;
- EdgeCounts[SuccNum] = EdgeCount;
- }
- setProfMetadata(M, TI, EdgeCounts, MaxCount);
- }
-}
-
-static bool isIndirectBrTarget(BasicBlock *BB) {
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- if (isa<IndirectBrInst>((*PI)->getTerminator()))
- return true;
- }
- return false;
-}
-
-void PGOUseFunc::annotateIrrLoopHeaderWeights() {
- LLVM_DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
- // Find irr loop headers
- for (auto &BB : F) {
-    // As a heuristic, also annotate indirectbr targets, as they have a high
-    // chance of becoming an irreducible loop header after the indirectbr tail
-    // duplication.
- if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
- Instruction *TI = BB.getTerminator();
- const UseBBInfo &BBCountInfo = getBBInfo(&BB);
- setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
- }
- }
-}
-
-void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
- Module *M = F.getParent();
- IRBuilder<> Builder(&SI);
- Type *Int64Ty = Builder.getInt64Ty();
- Type *I8PtrTy = Builder.getInt8PtrTy();
- auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
- Builder.CreateCall(
- Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
- {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
- Builder.getInt32(*CurCtrIdx), Step});
- ++(*CurCtrIdx);
-}
-
-void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
- std::vector<uint64_t> &CountFromProfile = UseFunc->getProfileRecord().Counts;
- assert(*CurCtrIdx < CountFromProfile.size() &&
- "Out of bound access of counters");
- uint64_t SCounts[2];
- SCounts[0] = CountFromProfile[*CurCtrIdx]; // True count
- ++(*CurCtrIdx);
- uint64_t TotalCount = 0;
- auto BI = UseFunc->findBBInfo(SI.getParent());
- if (BI != nullptr)
- TotalCount = BI->CountValue;
- // False Count
- SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
- uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
- if (MaxCount)
- setProfMetadata(F.getParent(), &SI, SCounts, MaxCount);
-}
-
-void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
- if (!PGOInstrSelect)
- return;
-  // FIXME: vector-typed select conditions are not handled yet.
- if (SI.getCondition()->getType()->isVectorTy())
- return;
-
- switch (Mode) {
- case VM_counting:
- NSIs++;
- return;
- case VM_instrument:
- instrumentOneSelectInst(SI);
- return;
- case VM_annotate:
- annotateOneSelectInst(SI);
- return;
- }
-
- llvm_unreachable("Unknown visiting mode");
-}
-
-// Traverse all value sites and annotate the instructions for all value kinds.
-void PGOUseFunc::annotateValueSites() {
- if (DisableValueProfiling)
- return;
-
-  // Create the PGOFuncName metadata.
- createPGOFuncNameMetadata(F, FuncInfo.FuncName);
-
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- annotateValueSites(Kind);
-}
-
-// Annotate the instructions for a specific value kind.
-void PGOUseFunc::annotateValueSites(uint32_t Kind) {
- assert(Kind <= IPVK_Last);
- unsigned ValueSiteIndex = 0;
- auto &ValueSites = FuncInfo.ValueSites[Kind];
- unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
- if (NumValueSites != ValueSites.size()) {
- auto &Ctx = M->getContext();
- Ctx.diagnose(DiagnosticInfoPGOProfile(
- M->getName().data(),
- Twine("Inconsistent number of value sites for ") +
- Twine(ValueProfKindDescr[Kind]) +
- Twine(" profiling in \"") + F.getName().str() +
- Twine("\", possibly due to the use of a stale profile."),
- DS_Warning));
- return;
- }
-
- for (VPCandidateInfo &I : ValueSites) {
- LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
- << "): Index = " << ValueSiteIndex << " out of "
- << NumValueSites << "\n");
- annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord,
- static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
- Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
- : MaxNumAnnotations);
- ValueSiteIndex++;
- }
-}
-
-// Collect the set of members for each Comdat in module M and store
-// in ComdatMembers.
-static void collectComdatMembers(
- Module &M,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
- if (!DoComdatRenaming)
- return;
- for (Function &F : M)
- if (Comdat *C = F.getComdat())
- ComdatMembers.insert(std::make_pair(C, &F));
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GV));
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GA));
-}
-
-static bool InstrumentAllFunctions(
- Module &M, function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
- function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
- function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
-  // For the context-sensitive instrumentation, we should have a separate pass
-  // (before LTO/ThinLTO linking) to create these variables.
- if (!IsCS)
+ markFunctionAttributes(FuncEntryCount, FuncMaxCount);
+
+ // Now annotate select instructions
+ FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
+ assert(CountPosition == ProfileCountSize);
+
+ LLVM_DEBUG(FuncInfo.dumpInfo("after reading profile."));
+}
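// The propagation above is essentially flow conservation on the CFG: once all
// but one of a block's incoming or outgoing edge counts are known, the missing
// value can be solved for, and the loop repeats until a fixed point. Below is
// a minimal, self-contained sketch of that idea using hypothetical BlockInfo
// and Edge types (not the UseBBInfo/PGOUseEdge types used in this file).

#include <cstdint>
#include <vector>

struct Edge {
  uint64_t Count = 0;
  bool Known = false;
};

struct BlockInfo {
  uint64_t Count = 0;
  bool CountValid = false;
  std::vector<Edge *> InEdges, OutEdges;
};

static uint64_t sumKnown(const std::vector<Edge *> &Edges) {
  uint64_t Sum = 0;
  for (const Edge *E : Edges)
    if (E->Known)
      Sum += E->Count;
  return Sum;
}

static unsigned numUnknown(const std::vector<Edge *> &Edges) {
  unsigned N = 0;
  for (const Edge *E : Edges)
    N += !E->Known;
  return N;
}

// Iterate until nothing changes: a block whose in- or out-edges are all known
// takes their sum as its count; a block with a known count and exactly one
// unknown edge assigns that edge the remaining flow (clamped at zero).
static void propagateCounts(std::vector<BlockInfo *> &Blocks) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (BlockInfo *B : Blocks) {
      if (!B->CountValid) {
        if (numUnknown(B->OutEdges) == 0) {
          B->Count = sumKnown(B->OutEdges);
          B->CountValid = Changed = true;
        } else if (numUnknown(B->InEdges) == 0) {
          B->Count = sumKnown(B->InEdges);
          B->CountValid = Changed = true;
        }
      }
      if (!B->CountValid)
        continue;
      for (auto *Edges : {&B->OutEdges, &B->InEdges}) {
        if (numUnknown(*Edges) != 1)
          continue;
        uint64_t KnownSum = sumKnown(*Edges);
        for (Edge *E : *Edges) {
          if (E->Known)
            continue;
          E->Count = B->Count > KnownSum ? B->Count - KnownSum : 0;
          E->Known = true;
          Changed = true;
        }
      }
    }
  }
}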
+
+// Assign the scaled count values to the BB with multiple out edges.
+void PGOUseFunc::setBranchWeights() {
+ // Generate MD_prof metadata for every branch instruction.
+ LLVM_DEBUG(dbgs() << "\nSetting branch weights for func " << F.getName()
+ << " IsCS=" << IsCS << "\n");
+ for (auto &BB : F) {
+ Instruction *TI = BB.getTerminator();
+ if (TI->getNumSuccessors() < 2)
+ continue;
+ if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
+ isa<IndirectBrInst>(TI) || isa<InvokeInst>(TI)))
+ continue;
+
+ if (getBBInfo(&BB).CountValue == 0)
+ continue;
+
+ // We have a non-zero Branch BB.
+ const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+ unsigned Size = BBCountInfo.OutEdges.size();
+ SmallVector<uint64_t, 2> EdgeCounts(Size, 0);
+ uint64_t MaxCount = 0;
+ for (unsigned s = 0; s < Size; s++) {
+ const PGOUseEdge *E = BBCountInfo.OutEdges[s];
+ const BasicBlock *SrcBB = E->SrcBB;
+ const BasicBlock *DestBB = E->DestBB;
+ if (DestBB == nullptr)
+ continue;
+ unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
+ uint64_t EdgeCount = E->CountValue;
+ if (EdgeCount > MaxCount)
+ MaxCount = EdgeCount;
+ EdgeCounts[SuccNum] = EdgeCount;
+ }
+ setProfMetadata(M, TI, EdgeCounts, MaxCount);
+ }
+}
+
+static bool isIndirectBrTarget(BasicBlock *BB) {
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ return true;
+ }
+ return false;
+}
+
+void PGOUseFunc::annotateIrrLoopHeaderWeights() {
+ LLVM_DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
+ // Find irr loop headers
+ for (auto &BB : F) {
+    // As a heuristic, also annotate indirectbr targets, as they have a high
+    // chance of becoming an irreducible loop header after the indirectbr tail
+    // duplication.
+ if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
+ Instruction *TI = BB.getTerminator();
+ const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+ setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
+ }
+ }
+}
+
+void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(&SI);
+ Type *Int64Ty = Builder.getInt64Ty();
+ Type *I8PtrTy = Builder.getInt8PtrTy();
+ auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
+ {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
+ Builder.getInt32(*CurCtrIdx), Step});
+ ++(*CurCtrIdx);
+}
+
+void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
+ std::vector<uint64_t> &CountFromProfile = UseFunc->getProfileRecord().Counts;
+ assert(*CurCtrIdx < CountFromProfile.size() &&
+ "Out of bound access of counters");
+ uint64_t SCounts[2];
+ SCounts[0] = CountFromProfile[*CurCtrIdx]; // True count
+ ++(*CurCtrIdx);
+ uint64_t TotalCount = 0;
+ auto BI = UseFunc->findBBInfo(SI.getParent());
+ if (BI != nullptr)
+ TotalCount = BI->CountValue;
+ // False Count
+ SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
+ uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
+ if (MaxCount)
+ setProfMetadata(F.getParent(), &SI, SCounts, MaxCount);
+}
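// A concrete illustration of the true/false split above, with hypothetical
// numbers: the instrumented step counter adds the zero-extended condition, so
// it records how often the select took its true arm; the false count is the
// remainder of the parent block's count, clamped at zero.

#include <cstdint>
#include <utility>

// Returns {TrueCount, FalseCount} for a select whose parent block ran
// BlockCount times and whose step counter accumulated TrueCounter.
static std::pair<uint64_t, uint64_t> splitSelectCounts(uint64_t BlockCount,
                                                       uint64_t TrueCounter) {
  uint64_t FalseCount =
      BlockCount > TrueCounter ? BlockCount - TrueCounter : 0;
  return {TrueCounter, FalseCount};
}
// e.g. splitSelectCounts(100, 70) yields {70, 30}.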
+
+void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
+ if (!PGOInstrSelect)
+ return;
+  // FIXME: vector-typed select conditions are not handled yet.
+ if (SI.getCondition()->getType()->isVectorTy())
+ return;
+
+ switch (Mode) {
+ case VM_counting:
+ NSIs++;
+ return;
+ case VM_instrument:
+ instrumentOneSelectInst(SI);
+ return;
+ case VM_annotate:
+ annotateOneSelectInst(SI);
+ return;
+ }
+
+ llvm_unreachable("Unknown visiting mode");
+}
+
+// Traverse all value sites and annotate the instructions for all value kinds.
+void PGOUseFunc::annotateValueSites() {
+ if (DisableValueProfiling)
+ return;
+
+  // Create the PGOFuncName metadata.
+ createPGOFuncNameMetadata(F, FuncInfo.FuncName);
+
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ annotateValueSites(Kind);
+}
+
+// Annotate the instructions for a specific value kind.
+void PGOUseFunc::annotateValueSites(uint32_t Kind) {
+ assert(Kind <= IPVK_Last);
+ unsigned ValueSiteIndex = 0;
+ auto &ValueSites = FuncInfo.ValueSites[Kind];
+ unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+ if (NumValueSites != ValueSites.size()) {
+ auto &Ctx = M->getContext();
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ M->getName().data(),
+ Twine("Inconsistent number of value sites for ") +
+ Twine(ValueProfKindDescr[Kind]) +
+ Twine(" profiling in \"") + F.getName().str() +
+ Twine("\", possibly due to the use of a stale profile."),
+ DS_Warning));
+ return;
+ }
+
+ for (VPCandidateInfo &I : ValueSites) {
+ LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+ << "): Index = " << ValueSiteIndex << " out of "
+ << NumValueSites << "\n");
+ annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord,
+ static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+ Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
+ : MaxNumAnnotations);
+ ValueSiteIndex++;
+ }
+}
+
+// Collect the set of members for each Comdat in module M and store
+// in ComdatMembers.
+static void collectComdatMembers(
+ Module &M,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming)
+ return;
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
+}
+
+static bool InstrumentAllFunctions(
+ Module &M, function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
+ function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
+  // For the context-sensitive instrumentation, we should have a separate pass
+  // (before LTO/ThinLTO linking) to create these variables.
+ if (!IsCS)
createIRLevelProfileFlagVar(M, /* IsCS */ false, PGOInstrumentEntry);
- std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
- collectComdatMembers(M, ComdatMembers);
-
- for (auto &F : M) {
- if (F.isDeclaration())
- continue;
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
+
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
if (F.hasFnAttribute(llvm::Attribute::NoProfile))
continue;
- auto &TLI = LookupTLI(F);
- auto *BPI = LookupBPI(F);
- auto *BFI = LookupBFI(F);
- instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS);
- }
- return true;
-}
-
-PreservedAnalyses
-PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) {
- createProfileFileNameVar(M, CSInstrName);
+ auto &TLI = LookupTLI(F);
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
+ instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS);
+ }
+ return true;
+}
+
+PreservedAnalyses
+PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) {
+ createProfileFileNameVar(M, CSInstrName);
createIRLevelProfileFlagVar(M, /* IsCS */ true, PGOInstrumentEntry);
- return PreservedAnalyses::all();
-}
-
-bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto LookupBPI = [this](Function &F) {
- return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
- };
- auto LookupBFI = [this](Function &F) {
- return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
- return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS);
-}
-
-PreservedAnalyses PGOInstrumentationGen::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto LookupBPI = [&FAM](Function &F) {
- return &FAM.getResult<BranchProbabilityAnalysis>(F);
- };
- auto LookupBFI = [&FAM](Function &F) {
- return &FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
+ return PreservedAnalyses::all();
+}
+
+bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+ return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS);
+}
+
+PreservedAnalyses PGOInstrumentationGen::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
// Using the ratio b/w sums of profile count values and BFI count values to
// adjust the func entry count.
static void fixFuncEntryCount(PGOUseFunc &Func, LoopInfo &LI,
@@ -1766,69 +1766,69 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI,
});
}
-static bool annotateAllFunctions(
- Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
- function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
- function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
- function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
- ProfileSummaryInfo *PSI, bool IsCS) {
- LLVM_DEBUG(dbgs() << "Read in profile counters: ");
- auto &Ctx = M.getContext();
- // Read the counter array from file.
- auto ReaderOrErr =
- IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName);
- if (Error E = ReaderOrErr.takeError()) {
- handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(ProfileFileName.data(), EI.message()));
- });
- return false;
- }
-
- std::unique_ptr<IndexedInstrProfReader> PGOReader =
- std::move(ReaderOrErr.get());
- if (!PGOReader) {
- Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(),
- StringRef("Cannot get PGOReader")));
- return false;
- }
- if (!PGOReader->hasCSIRLevelProfile() && IsCS)
- return false;
-
- // TODO: might need to change the warning once the clang option is finalized.
- if (!PGOReader->isIRLevelProfile()) {
- Ctx.diagnose(DiagnosticInfoPGOProfile(
- ProfileFileName.data(), "Not an IR level instrumentation profile"));
- return false;
- }
-
- // Add the profile summary (read from the header of the indexed summary) here
- // so that we can use it below when reading counters (which checks if the
- // function should be marked with a cold or inlinehint attribute).
- M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()),
- IsCS ? ProfileSummary::PSK_CSInstr
- : ProfileSummary::PSK_Instr);
- PSI->refresh();
-
- std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
- collectComdatMembers(M, ComdatMembers);
- std::vector<Function *> HotFunctions;
- std::vector<Function *> ColdFunctions;
+static bool annotateAllFunctions(
+ Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
+ function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
+ function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
+ ProfileSummaryInfo *PSI, bool IsCS) {
+ LLVM_DEBUG(dbgs() << "Read in profile counters: ");
+ auto &Ctx = M.getContext();
+ // Read the counter array from file.
+ auto ReaderOrErr =
+ IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName);
+ if (Error E = ReaderOrErr.takeError()) {
+ handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(ProfileFileName.data(), EI.message()));
+ });
+ return false;
+ }
+
+ std::unique_ptr<IndexedInstrProfReader> PGOReader =
+ std::move(ReaderOrErr.get());
+ if (!PGOReader) {
+ Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(),
+ StringRef("Cannot get PGOReader")));
+ return false;
+ }
+ if (!PGOReader->hasCSIRLevelProfile() && IsCS)
+ return false;
+
+ // TODO: might need to change the warning once the clang option is finalized.
+ if (!PGOReader->isIRLevelProfile()) {
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ ProfileFileName.data(), "Not an IR level instrumentation profile"));
+ return false;
+ }
+
+ // Add the profile summary (read from the header of the indexed summary) here
+ // so that we can use it below when reading counters (which checks if the
+ // function should be marked with a cold or inlinehint attribute).
+ M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()),
+ IsCS ? ProfileSummary::PSK_CSInstr
+ : ProfileSummary::PSK_Instr);
+ PSI->refresh();
+
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
+ std::vector<Function *> HotFunctions;
+ std::vector<Function *> ColdFunctions;
   // If the profile is marked to always instrument the entry BB, do the
   // same. Note this can be overridden by the internal option in CFGMST.h.
bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled();
if (PGOInstrumentEntry.getNumOccurrences() > 0)
InstrumentFuncEntry = PGOInstrumentEntry;
- for (auto &F : M) {
- if (F.isDeclaration())
- continue;
- auto &TLI = LookupTLI(F);
- auto *BPI = LookupBPI(F);
- auto *BFI = LookupBFI(F);
- // Split indirectbr critical edges here before computing the MST rather than
- // later in getInstrBB() to avoid invalidating it.
- SplitIndirectBrCriticalEdges(F, BPI, BFI);
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ auto &TLI = LookupTLI(F);
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
+ // Split indirectbr critical edges here before computing the MST rather than
+ // later in getInstrBB() to avoid invalidating it.
+ SplitIndirectBrCriticalEdges(F, BPI, BFI);
PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS,
InstrumentFuncEntry);
// When AllMinusOnes is true, it means the profile for the function
@@ -1836,15 +1836,15 @@ static bool annotateAllFunctions(
// entry count of the function to be multiple times of hot threshold
// and drop all its internal counters.
bool AllMinusOnes = false;
- bool AllZeros = false;
+ bool AllZeros = false;
if (!Func.readCounters(PGOReader.get(), AllZeros, AllMinusOnes))
- continue;
- if (AllZeros) {
- F.setEntryCount(ProfileCount(0, Function::PCT_Real));
- if (Func.getProgramMaxCount() != 0)
- ColdFunctions.push_back(&F);
- continue;
- }
+ continue;
+ if (AllZeros) {
+ F.setEntryCount(ProfileCount(0, Function::PCT_Real));
+ if (Func.getProgramMaxCount() != 0)
+ ColdFunctions.push_back(&F);
+ continue;
+ }
const unsigned MultiplyFactor = 3;
if (AllMinusOnes) {
uint64_t HotThreshold = PSI->getHotCountThreshold();
@@ -1854,43 +1854,43 @@ static bool annotateAllFunctions(
HotFunctions.push_back(&F);
continue;
}
- Func.populateCounters();
- Func.setBranchWeights();
- Func.annotateValueSites();
- Func.annotateIrrLoopHeaderWeights();
- PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
- if (FreqAttr == PGOUseFunc::FFA_Cold)
- ColdFunctions.push_back(&F);
- else if (FreqAttr == PGOUseFunc::FFA_Hot)
- HotFunctions.push_back(&F);
- if (PGOViewCounts != PGOVCT_None &&
- (ViewBlockFreqFuncName.empty() ||
- F.getName().equals(ViewBlockFreqFuncName))) {
- LoopInfo LI{DominatorTree(F)};
- std::unique_ptr<BranchProbabilityInfo> NewBPI =
- std::make_unique<BranchProbabilityInfo>(F, LI);
- std::unique_ptr<BlockFrequencyInfo> NewBFI =
- std::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
- if (PGOViewCounts == PGOVCT_Graph)
- NewBFI->view();
- else if (PGOViewCounts == PGOVCT_Text) {
- dbgs() << "pgo-view-counts: " << Func.getFunc().getName() << "\n";
- NewBFI->print(dbgs());
- }
- }
- if (PGOViewRawCounts != PGOVCT_None &&
- (ViewBlockFreqFuncName.empty() ||
- F.getName().equals(ViewBlockFreqFuncName))) {
- if (PGOViewRawCounts == PGOVCT_Graph)
- if (ViewBlockFreqFuncName.empty())
- WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
- else
- ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
- else if (PGOViewRawCounts == PGOVCT_Text) {
- dbgs() << "pgo-view-raw-counts: " << Func.getFunc().getName() << "\n";
- Func.dumpInfo();
- }
- }
+ Func.populateCounters();
+ Func.setBranchWeights();
+ Func.annotateValueSites();
+ Func.annotateIrrLoopHeaderWeights();
+ PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
+ if (FreqAttr == PGOUseFunc::FFA_Cold)
+ ColdFunctions.push_back(&F);
+ else if (FreqAttr == PGOUseFunc::FFA_Hot)
+ HotFunctions.push_back(&F);
+ if (PGOViewCounts != PGOVCT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ LoopInfo LI{DominatorTree(F)};
+ std::unique_ptr<BranchProbabilityInfo> NewBPI =
+ std::make_unique<BranchProbabilityInfo>(F, LI);
+ std::unique_ptr<BlockFrequencyInfo> NewBFI =
+ std::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
+ if (PGOViewCounts == PGOVCT_Graph)
+ NewBFI->view();
+ else if (PGOViewCounts == PGOVCT_Text) {
+ dbgs() << "pgo-view-counts: " << Func.getFunc().getName() << "\n";
+ NewBFI->print(dbgs());
+ }
+ }
+ if (PGOViewRawCounts != PGOVCT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ if (PGOViewRawCounts == PGOVCT_Graph)
+ if (ViewBlockFreqFuncName.empty())
+ WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ else
+ ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ else if (PGOViewRawCounts == PGOVCT_Text) {
+ dbgs() << "pgo-view-raw-counts: " << Func.getFunc().getName() << "\n";
+ Func.dumpInfo();
+ }
+ }
if (PGOVerifyBFI || PGOVerifyHotBFI || PGOFixEntryCount) {
LoopInfo LI{DominatorTree(F)};
@@ -1908,18 +1908,18 @@ static bool annotateAllFunctions(
}
verifyFuncBFI(Func, LI, NBPI, HotCountThreshold, ColdCountThreshold);
}
- }
-
- // Set function hotness attribute from the profile.
- // We have to apply these attributes at the end because their presence
- // can affect the BranchProbabilityInfo of any callers, resulting in an
- // inconsistent MST between prof-gen and prof-use.
- for (auto &F : HotFunctions) {
- F->addFnAttr(Attribute::InlineHint);
- LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
- << "\n");
- }
- for (auto &F : ColdFunctions) {
+ }
+
+ // Set function hotness attribute from the profile.
+ // We have to apply these attributes at the end because their presence
+ // can affect the BranchProbabilityInfo of any callers, resulting in an
+ // inconsistent MST between prof-gen and prof-use.
+ for (auto &F : HotFunctions) {
+ F->addFnAttr(Attribute::InlineHint);
+ LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
+ << "\n");
+ }
+ for (auto &F : ColdFunctions) {
     // Only set when there is no Attribute::Hot set by the user. For the Hot
     // attribute, the user's annotation takes precedence over the profile.
if (F->hasFnAttribute(Attribute::Hot)) {
@@ -1931,190 +1931,190 @@ static bool annotateAllFunctions(
DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
continue;
}
- F->addFnAttr(Attribute::Cold);
- LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
- << "\n");
- }
- return true;
-}
-
-PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename,
- std::string RemappingFilename,
- bool IsCS)
- : ProfileFileName(std::move(Filename)),
- ProfileRemappingFileName(std::move(RemappingFilename)), IsCS(IsCS) {
- if (!PGOTestProfileFile.empty())
- ProfileFileName = PGOTestProfileFile;
- if (!PGOTestProfileRemappingFile.empty())
- ProfileRemappingFileName = PGOTestProfileRemappingFile;
-}
-
-PreservedAnalyses PGOInstrumentationUse::run(Module &M,
- ModuleAnalysisManager &AM) {
-
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto LookupBPI = [&FAM](Function &F) {
- return &FAM.getResult<BranchProbabilityAnalysis>(F);
- };
- auto LookupBFI = [&FAM](Function &F) {
- return &FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
- LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto LookupBPI = [this](Function &F) {
- return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
- };
- auto LookupBFI = [this](Function &F) {
- return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
-
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI,
- LookupBFI, PSI, IsCS);
-}
-
-static std::string getSimpleNodeName(const BasicBlock *Node) {
- if (!Node->getName().empty())
- return std::string(Node->getName());
-
- std::string SimpleNodeName;
- raw_string_ostream OS(SimpleNodeName);
- Node->printAsOperand(OS, false);
- return OS.str();
-}
-
-void llvm::setProfMetadata(Module *M, Instruction *TI,
- ArrayRef<uint64_t> EdgeCounts,
- uint64_t MaxCount) {
- MDBuilder MDB(M->getContext());
- assert(MaxCount > 0 && "Bad max count");
- uint64_t Scale = calculateCountScale(MaxCount);
- SmallVector<unsigned, 4> Weights;
- for (const auto &ECI : EdgeCounts)
- Weights.push_back(scaleBranchCount(ECI, Scale));
-
- LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
- : Weights) {
- dbgs() << W << " ";
- } dbgs() << "\n";);
-
- TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
- if (EmitBranchProbability) {
- std::string BrCondStr = getBranchCondString(TI);
- if (BrCondStr.empty())
- return;
-
- uint64_t WSum =
- std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
- [](uint64_t w1, uint64_t w2) { return w1 + w2; });
- uint64_t TotalCount =
- std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
- [](uint64_t c1, uint64_t c2) { return c1 + c2; });
- Scale = calculateCountScale(WSum);
- BranchProbability BP(scaleBranchCount(Weights[0], Scale),
- scaleBranchCount(WSum, Scale));
- std::string BranchProbStr;
- raw_string_ostream OS(BranchProbStr);
- OS << BP;
- OS << " (total count : " << TotalCount << ")";
- OS.flush();
- Function *F = TI->getParent()->getParent();
- OptimizationRemarkEmitter ORE(F);
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "pgo-instrumentation", TI)
- << BrCondStr << " is true with probability : " << BranchProbStr;
- });
- }
-}
-
-namespace llvm {
-
-void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
- MDBuilder MDB(M->getContext());
- TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
- MDB.createIrrLoopHeaderWeight(Count));
-}
-
-template <> struct GraphTraits<PGOUseFunc *> {
- using NodeRef = const BasicBlock *;
- using ChildIteratorType = const_succ_iterator;
- using nodes_iterator = pointer_iterator<Function::const_iterator>;
-
- static NodeRef getEntryNode(const PGOUseFunc *G) {
- return &G->getFunc().front();
- }
-
- static ChildIteratorType child_begin(const NodeRef N) {
- return succ_begin(N);
- }
-
- static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
-
- static nodes_iterator nodes_begin(const PGOUseFunc *G) {
- return nodes_iterator(G->getFunc().begin());
- }
-
- static nodes_iterator nodes_end(const PGOUseFunc *G) {
- return nodes_iterator(G->getFunc().end());
- }
-};
-
-template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
- explicit DOTGraphTraits(bool isSimple = false)
- : DefaultDOTGraphTraits(isSimple) {}
-
- static std::string getGraphName(const PGOUseFunc *G) {
- return std::string(G->getFunc().getName());
- }
-
- std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
- std::string Result;
- raw_string_ostream OS(Result);
-
- OS << getSimpleNodeName(Node) << ":\\l";
- UseBBInfo *BI = Graph->findBBInfo(Node);
- OS << "Count : ";
- if (BI && BI->CountValid)
- OS << BI->CountValue << "\\l";
- else
- OS << "Unknown\\l";
-
- if (!PGOInstrSelect)
- return Result;
-
- for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
- auto *I = &*BI;
- if (!isa<SelectInst>(I))
- continue;
- // Display scaled counts for SELECT instruction:
- OS << "SELECT : { T = ";
- uint64_t TC, FC;
- bool HasProf = I->extractProfMetadata(TC, FC);
- if (!HasProf)
- OS << "Unknown, F = Unknown }\\l";
- else
- OS << TC << ", F = " << FC << " }\\l";
- }
- return Result;
- }
-};
-
-} // end namespace llvm
+ F->addFnAttr(Attribute::Cold);
+ LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
+ << "\n");
+ }
+ return true;
+}
+
+PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename,
+ std::string RemappingFilename,
+ bool IsCS)
+ : ProfileFileName(std::move(Filename)),
+ ProfileRemappingFileName(std::move(RemappingFilename)), IsCS(IsCS) {
+ if (!PGOTestProfileFile.empty())
+ ProfileFileName = PGOTestProfileFile;
+ if (!PGOTestProfileRemappingFile.empty())
+ ProfileRemappingFileName = PGOTestProfileRemappingFile;
+}
+
+PreservedAnalyses PGOInstrumentationUse::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
+ LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI,
+ LookupBFI, PSI, IsCS);
+}
+
+static std::string getSimpleNodeName(const BasicBlock *Node) {
+ if (!Node->getName().empty())
+ return std::string(Node->getName());
+
+ std::string SimpleNodeName;
+ raw_string_ostream OS(SimpleNodeName);
+ Node->printAsOperand(OS, false);
+ return OS.str();
+}
+
+void llvm::setProfMetadata(Module *M, Instruction *TI,
+ ArrayRef<uint64_t> EdgeCounts,
+ uint64_t MaxCount) {
+ MDBuilder MDB(M->getContext());
+ assert(MaxCount > 0 && "Bad max count");
+ uint64_t Scale = calculateCountScale(MaxCount);
+ SmallVector<unsigned, 4> Weights;
+ for (const auto &ECI : EdgeCounts)
+ Weights.push_back(scaleBranchCount(ECI, Scale));
+
+ LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
+ : Weights) {
+ dbgs() << W << " ";
+ } dbgs() << "\n";);
+
+ TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ if (EmitBranchProbability) {
+ std::string BrCondStr = getBranchCondString(TI);
+ if (BrCondStr.empty())
+ return;
+
+ uint64_t WSum =
+ std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
+ [](uint64_t w1, uint64_t w2) { return w1 + w2; });
+ uint64_t TotalCount =
+ std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
+ [](uint64_t c1, uint64_t c2) { return c1 + c2; });
+ Scale = calculateCountScale(WSum);
+ BranchProbability BP(scaleBranchCount(Weights[0], Scale),
+ scaleBranchCount(WSum, Scale));
+ std::string BranchProbStr;
+ raw_string_ostream OS(BranchProbStr);
+ OS << BP;
+ OS << " (total count : " << TotalCount << ")";
+ OS.flush();
+ Function *F = TI->getParent()->getParent();
+ OptimizationRemarkEmitter ORE(F);
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "pgo-instrumentation", TI)
+ << BrCondStr << " is true with probability : " << BranchProbStr;
+ });
+ }
+}
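// The branch weights written into !prof are the 64-bit edge counts scaled so
// the hottest edge fits in 32 bits. A rough, self-contained sketch of that
// scaling follows; it mirrors the intent of calculateCountScale and
// scaleBranchCount (helpers defined elsewhere in the PGO instrumentation code)
// without claiming their exact formulas.

#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<uint32_t>
scaleToBranchWeights(const std::vector<uint64_t> &Counts) {
  uint64_t Max = 0;
  for (uint64_t C : Counts)
    Max = std::max(Max, C);
  // Pick a divisor large enough that Max / Scale fits in a uint32_t.
  uint64_t Scale = Max / UINT32_MAX + 1;
  std::vector<uint32_t> Weights;
  Weights.reserve(Counts.size());
  for (uint64_t C : Counts)
    Weights.push_back(static_cast<uint32_t>(C / Scale));
  return Weights;
}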
+
+namespace llvm {
+
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
+ MDBuilder MDB(M->getContext());
+ TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
+ MDB.createIrrLoopHeaderWeight(Count));
+}
+
+template <> struct GraphTraits<PGOUseFunc *> {
+ using NodeRef = const BasicBlock *;
+ using ChildIteratorType = const_succ_iterator;
+ using nodes_iterator = pointer_iterator<Function::const_iterator>;
+
+ static NodeRef getEntryNode(const PGOUseFunc *G) {
+ return &G->getFunc().front();
+ }
+
+ static ChildIteratorType child_begin(const NodeRef N) {
+ return succ_begin(N);
+ }
+
+ static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
+
+ static nodes_iterator nodes_begin(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().begin());
+ }
+
+ static nodes_iterator nodes_end(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().end());
+ }
+};
+
+template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const PGOUseFunc *G) {
+ return std::string(G->getFunc().getName());
+ }
+
+ std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+
+ OS << getSimpleNodeName(Node) << ":\\l";
+ UseBBInfo *BI = Graph->findBBInfo(Node);
+ OS << "Count : ";
+ if (BI && BI->CountValid)
+ OS << BI->CountValue << "\\l";
+ else
+ OS << "Unknown\\l";
+
+ if (!PGOInstrSelect)
+ return Result;
+
+ for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
+ auto *I = &*BI;
+ if (!isa<SelectInst>(I))
+ continue;
+ // Display scaled counts for SELECT instruction:
+ OS << "SELECT : { T = ";
+ uint64_t TC, FC;
+ bool HasProf = I->extractProfMetadata(TC, FC);
+ if (!HasProf)
+ OS << "Unknown, F = Unknown }\\l";
+ else
+ OS << TC << ", F = " << FC << " }\\l";
+ }
+ return Result;
+ }
+};
+
+} // end namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index edc72d79eb..55a93b6152 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -1,527 +1,527 @@
-//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the transformation that optimizes memory intrinsics
-// such as memcpy using the size value profile. When memory intrinsic size
-// value profile metadata is available, a single memory intrinsic is expanded
-// to a sequence of guarded specialized versions that are called with the
-// hottest size(s), for later expansion into more optimal inline sequences.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/ProfileData/InstrProf.h"
#define INSTR_PROF_VALUE_PROF_MEMOP_API
#include "llvm/ProfileData/InstrProfData.inc"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <cstdint>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "pgo-memop-opt"
-
-STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
-STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
-
-// The minimum call count to optimize memory intrinsic calls.
-static cl::opt<unsigned>
- MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
- cl::init(1000),
- cl::desc("The minimum count to optimize memory "
- "intrinsic calls"));
-
-// Command line option to disable memory intrinsic optimization. The default is
-// false. This is for debugging purposes.
-static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
- cl::Hidden, cl::desc("Disable optimize"));
-
-// The percent threshold to optimize memory intrinsic calls.
-static cl::opt<unsigned>
- MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("The percentage threshold for the "
- "memory intrinsic calls optimization"));
-
-// Maximum number of versions for optimizing memory intrinsic call.
-static cl::opt<unsigned>
- MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
- cl::ZeroOrMore,
- cl::desc("The max version for the optimized memory "
- " intrinsic calls"));
-
-// Scale the counts from the annotation using the BB count value.
-static cl::opt<bool>
- MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
- cl::desc("Scale the memop size counts using the basic "
- " block count value"));
-
-cl::opt<bool>
- MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
- cl::Hidden,
- cl::desc("Size-specialize memcmp and bcmp calls"));
-
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+ cl::init(1000),
+ cl::desc("The minimum count to optimize memory "
+ "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default is
+// false. This is for debugging purposes.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+ cl::Hidden, cl::desc("Disable optimize"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("The percentage threshold for the "
+ "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+ MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("The max version for the optimized memory "
+ " intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+ MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+ cl::desc("Scale the memop size counts using the basic "
+ " block count value"));
+
+cl::opt<bool>
+ MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
+ cl::Hidden,
+ cl::desc("Size-specialize memcmp and bcmp calls"));
+
static cl::opt<unsigned>
MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
cl::desc("Optimize the memop size <= this value"));
-namespace {
-class PGOMemOPSizeOptLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
- initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "PGOMemOPSize"; }
-
-private:
- bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-} // end anonymous namespace
-
-char PGOMemOPSizeOptLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
- "Optimize memory intrinsic using its size value profile",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
- "Optimize memory intrinsic using its size value profile",
- false, false)
-
-FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
- return new PGOMemOPSizeOptLegacyPass();
-}
-
-namespace {
-
-static const char *getMIName(const MemIntrinsic *MI) {
- switch (MI->getIntrinsicID()) {
- case Intrinsic::memcpy:
- return "memcpy";
- case Intrinsic::memmove:
- return "memmove";
- case Intrinsic::memset:
- return "memset";
- default:
- return "unknown";
- }
-}
-
-// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
-struct MemOp {
- Instruction *I;
- MemOp(MemIntrinsic *MI) : I(MI) {}
- MemOp(CallInst *CI) : I(CI) {}
- MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
- CallInst *asCI() { return cast<CallInst>(I); }
- MemOp clone() {
- if (auto MI = asMI())
- return MemOp(cast<MemIntrinsic>(MI->clone()));
- return MemOp(cast<CallInst>(asCI()->clone()));
- }
- Value *getLength() {
- if (auto MI = asMI())
- return MI->getLength();
- return asCI()->getArgOperand(2);
- }
- void setLength(Value *Length) {
- if (auto MI = asMI())
- return MI->setLength(Length);
- asCI()->setArgOperand(2, Length);
- }
- StringRef getFuncName() {
- if (auto MI = asMI())
- return MI->getCalledFunction()->getName();
- return asCI()->getCalledFunction()->getName();
- }
- bool isMemmove() {
- if (auto MI = asMI())
- if (MI->getIntrinsicID() == Intrinsic::memmove)
- return true;
- return false;
- }
- bool isMemcmp(TargetLibraryInfo &TLI) {
- LibFunc Func;
- if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
- Func == LibFunc_memcmp) {
- return true;
- }
- return false;
- }
- bool isBcmp(TargetLibraryInfo &TLI) {
- LibFunc Func;
- if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
- Func == LibFunc_bcmp) {
- return true;
- }
- return false;
- }
- const char *getName(TargetLibraryInfo &TLI) {
- if (auto MI = asMI())
- return getMIName(MI);
- LibFunc Func;
- if (TLI.getLibFunc(*asCI(), Func)) {
- if (Func == LibFunc_memcmp)
- return "memcmp";
- if (Func == LibFunc_bcmp)
- return "bcmp";
- }
- llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
- return nullptr;
- }
-};
-
-class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
-public:
- MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE, DominatorTree *DT,
- TargetLibraryInfo &TLI)
- : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
- ValueDataArray =
- std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
- }
- bool isChanged() const { return Changed; }
- void perform() {
- WorkList.clear();
- visit(Func);
-
- for (auto &MO : WorkList) {
- ++NumOfPGOMemOPAnnotate;
- if (perform(MO)) {
- Changed = true;
- ++NumOfPGOMemOPOpt;
- LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
-                          << " is Transformed.\n");
- }
- }
- }
-
- void visitMemIntrinsic(MemIntrinsic &MI) {
- Value *Length = MI.getLength();
-    // Do not perform the optimization on constant-length calls.
- if (dyn_cast<ConstantInt>(Length))
- return;
- WorkList.push_back(MemOp(&MI));
- }
-
- void visitCallInst(CallInst &CI) {
- LibFunc Func;
- if (TLI.getLibFunc(CI, Func) &&
- (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+ initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+ return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+
+static const char *getMIName(const MemIntrinsic *MI) {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ return "memcpy";
+ case Intrinsic::memmove:
+ return "memmove";
+ case Intrinsic::memset:
+ return "memset";
+ default:
+ return "unknown";
+ }
+}
+
+// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
+struct MemOp {
+ Instruction *I;
+ MemOp(MemIntrinsic *MI) : I(MI) {}
+ MemOp(CallInst *CI) : I(CI) {}
+ MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
+ CallInst *asCI() { return cast<CallInst>(I); }
+ MemOp clone() {
+ if (auto MI = asMI())
+ return MemOp(cast<MemIntrinsic>(MI->clone()));
+ return MemOp(cast<CallInst>(asCI()->clone()));
+ }
+ Value *getLength() {
+ if (auto MI = asMI())
+ return MI->getLength();
+ return asCI()->getArgOperand(2);
+ }
+ void setLength(Value *Length) {
+ if (auto MI = asMI())
+ return MI->setLength(Length);
+ asCI()->setArgOperand(2, Length);
+ }
+ StringRef getFuncName() {
+ if (auto MI = asMI())
+ return MI->getCalledFunction()->getName();
+ return asCI()->getCalledFunction()->getName();
+ }
+ bool isMemmove() {
+ if (auto MI = asMI())
+ if (MI->getIntrinsicID() == Intrinsic::memmove)
+ return true;
+ return false;
+ }
+ bool isMemcmp(TargetLibraryInfo &TLI) {
+ LibFunc Func;
+ if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+ Func == LibFunc_memcmp) {
+ return true;
+ }
+ return false;
+ }
+ bool isBcmp(TargetLibraryInfo &TLI) {
+ LibFunc Func;
+ if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+ Func == LibFunc_bcmp) {
+ return true;
+ }
+ return false;
+ }
+ const char *getName(TargetLibraryInfo &TLI) {
+ if (auto MI = asMI())
+ return getMIName(MI);
+ LibFunc Func;
+ if (TLI.getLibFunc(*asCI(), Func)) {
+ if (Func == LibFunc_memcmp)
+ return "memcmp";
+ if (Func == LibFunc_bcmp)
+ return "bcmp";
+ }
+ llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
+ return nullptr;
+ }
+};
+
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+ MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
+ OptimizationRemarkEmitter &ORE, DominatorTree *DT,
+ TargetLibraryInfo &TLI)
+ : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
+ ValueDataArray =
+ std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+ }
+ bool isChanged() const { return Changed; }
+ void perform() {
+ WorkList.clear();
+ visit(Func);
+
+ for (auto &MO : WorkList) {
+ ++NumOfPGOMemOPAnnotate;
+ if (perform(MO)) {
+ Changed = true;
+ ++NumOfPGOMemOPOpt;
+ LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
+                          << " is Transformed.\n");
+ }
+ }
+ }
+
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+    // Do not perform the optimization on constant-length calls.
+ if (dyn_cast<ConstantInt>(Length))
+ return;
+ WorkList.push_back(MemOp(&MI));
+ }
+
+ void visitCallInst(CallInst &CI) {
+ LibFunc Func;
+ if (TLI.getLibFunc(CI, Func) &&
+ (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
!isa<ConstantInt>(CI.getArgOperand(2))) {
- WorkList.push_back(MemOp(&CI));
- }
- }
-
-private:
- Function &Func;
- BlockFrequencyInfo &BFI;
- OptimizationRemarkEmitter &ORE;
- DominatorTree *DT;
- TargetLibraryInfo &TLI;
- bool Changed;
- std::vector<MemOp> WorkList;
- // The space to read the profile annotation.
- std::unique_ptr<InstrProfValueData[]> ValueDataArray;
- bool perform(MemOp MO);
-};
-
-static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
- assert(Count <= TotalCount);
- if (Count < MemOPCountThreshold)
- return false;
- if (Count < TotalCount * MemOPPercentThreshold / 100)
- return false;
- return true;
-}
-
-static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
- uint64_t Denom) {
- if (!MemOPScaleCount)
- return Count;
- bool Overflowed;
- uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
- return ScaleCount / Denom;
-}
-
-bool MemOPSizeOpt::perform(MemOp MO) {
- assert(MO.I);
- if (MO.isMemmove())
- return false;
- if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
- return false;
-
- uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
- uint64_t TotalCount;
- if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions,
- ValueDataArray.get(), NumVals, TotalCount))
- return false;
-
- uint64_t ActualCount = TotalCount;
- uint64_t SavedTotalCount = TotalCount;
- if (MemOPScaleCount) {
- auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
- if (!BBEdgeCount)
- return false;
- ActualCount = *BBEdgeCount;
- }
-
- ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
- LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
- << ActualCount << "\n");
- LLVM_DEBUG(
- for (auto &VD
- : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
-
- if (ActualCount < MemOPCountThreshold)
- return false;
- // Skip if the total value profiled count is 0, in which case we can't
- // scale up the counts properly (and there is no profitable transformation).
- if (TotalCount == 0)
- return false;
-
- TotalCount = ActualCount;
- if (MemOPScaleCount)
- LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
- << " denominator = " << SavedTotalCount << "\n");
-
- // Keeping track of the count of the default case:
- uint64_t RemainCount = TotalCount;
- uint64_t SavedRemainCount = SavedTotalCount;
- SmallVector<uint64_t, 16> SizeIds;
- SmallVector<uint64_t, 16> CaseCounts;
- uint64_t MaxCount = 0;
- unsigned Version = 0;
- // Default case is in the front -- save the slot here.
- CaseCounts.push_back(0);
- for (auto &VD : VDs) {
- int64_t V = VD.Value;
- uint64_t C = VD.Count;
- if (MemOPScaleCount)
- C = getScaledCount(C, ActualCount, SavedTotalCount);
-
+ WorkList.push_back(MemOp(&CI));
+ }
+ }
+
+private:
+ Function &Func;
+ BlockFrequencyInfo &BFI;
+ OptimizationRemarkEmitter &ORE;
+ DominatorTree *DT;
+ TargetLibraryInfo &TLI;
+ bool Changed;
+ std::vector<MemOp> WorkList;
+ // The space to read the profile annotation.
+ std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+ bool perform(MemOp MO);
+};
+
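+// A profiled size is only worth its own version if its count clears the
+// absolute MemOPCountThreshold and also makes up at least
+// MemOPPercentThreshold percent of TotalCount (the not-yet-versioned
+// remainder at the call site below).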
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+ assert(Count <= TotalCount);
+ if (Count < MemOPCountThreshold)
+ return false;
+ if (Count < TotalCount * MemOPPercentThreshold / 100)
+ return false;
+ return true;
+}
+
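+// Scale a per-size profile count from the instrumented run (Denom is the
+// profiled total) to the current block count from BFI (Num), using a
+// saturating multiply to avoid overflow. This is a no-op unless
+// MemOPScaleCount is set.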
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+ uint64_t Denom) {
+ if (!MemOPScaleCount)
+ return Count;
+ bool Overflowed;
+ uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
+ return ScaleCount / Denom;
+}
+
+bool MemOPSizeOpt::perform(MemOp MO) {
+ assert(MO.I);
+ if (MO.isMemmove())
+ return false;
+ if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
+ return false;
+
+ uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+ uint64_t TotalCount;
+ if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions,
+ ValueDataArray.get(), NumVals, TotalCount))
+ return false;
+
+ uint64_t ActualCount = TotalCount;
+ uint64_t SavedTotalCount = TotalCount;
+ if (MemOPScaleCount) {
+ auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
+ if (!BBEdgeCount)
+ return false;
+ ActualCount = *BBEdgeCount;
+ }
+
+ ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+ LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
+ << ActualCount << "\n");
+ LLVM_DEBUG(
+ for (auto &VD
+ : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
+
+ if (ActualCount < MemOPCountThreshold)
+ return false;
+ // Skip if the total value profiled count is 0, in which case we can't
+ // scale up the counts properly (and there is no profitable transformation).
+ if (TotalCount == 0)
+ return false;
+
+ TotalCount = ActualCount;
+ if (MemOPScaleCount)
+ LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
+
+ // Keeping track of the count of the default case:
+ uint64_t RemainCount = TotalCount;
+ uint64_t SavedRemainCount = SavedTotalCount;
+ SmallVector<uint64_t, 16> SizeIds;
+ SmallVector<uint64_t, 16> CaseCounts;
+ uint64_t MaxCount = 0;
+ unsigned Version = 0;
+ // Default case is in the front -- save the slot here.
+ CaseCounts.push_back(0);
+ for (auto &VD : VDs) {
+ int64_t V = VD.Value;
+ uint64_t C = VD.Count;
+ if (MemOPScaleCount)
+ C = getScaledCount(C, ActualCount, SavedTotalCount);
+
if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize)
- continue;
-
- // ValueCounts are sorted on the count. Break at the first un-profitable
- // value.
- if (!isProfitable(C, RemainCount))
- break;
-
- SizeIds.push_back(V);
- CaseCounts.push_back(C);
- if (C > MaxCount)
- MaxCount = C;
-
- assert(RemainCount >= C);
- RemainCount -= C;
- assert(SavedRemainCount >= VD.Count);
- SavedRemainCount -= VD.Count;
-
- if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
- break;
- }
-
- if (Version == 0)
- return false;
-
- CaseCounts[0] = RemainCount;
- if (RemainCount > MaxCount)
- MaxCount = RemainCount;
-
- uint64_t SumForOpt = TotalCount - RemainCount;
-
- LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
- << " Versions (covering " << SumForOpt << " out of "
- << TotalCount << ")\n");
-
- // mem_op(..., size)
- // ==>
- // switch (size) {
- // case s1:
- // mem_op(..., s1);
- // goto merge_bb;
- // case s2:
- // mem_op(..., s2);
- // goto merge_bb;
- // ...
- // default:
- // mem_op(..., size);
- // goto merge_bb;
- // }
- // merge_bb:
-
- BasicBlock *BB = MO.I->getParent();
- LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
- LLVM_DEBUG(dbgs() << *BB << "\n");
- auto OrigBBFreq = BFI.getBlockFreq(BB);
-
- BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
- BasicBlock::iterator It(*MO.I);
- ++It;
- assert(It != DefaultBB->end());
- BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
- MergeBB->setName("MemOP.Merge");
- BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
- DefaultBB->setName("MemOP.Default");
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- auto &Ctx = Func.getContext();
- IRBuilder<> IRB(BB);
- BB->getTerminator()->eraseFromParent();
- Value *SizeVar = MO.getLength();
- SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
- Type *MemOpTy = MO.I->getType();
- PHINode *PHI = nullptr;
- if (!MemOpTy->isVoidTy()) {
- // Insert a phi for the return values at the merge block.
- IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
- PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
- MO.I->replaceAllUsesWith(PHI);
- PHI->addIncoming(MO.I, DefaultBB);
- }
-
- // Clear the value profile data.
- MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
- // If all promoted, we don't need the MD.prof metadata.
- if (SavedRemainCount > 0 || Version != NumVals)
- // Otherwise we need update with the un-promoted records back.
- annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version),
- SavedRemainCount, IPVK_MemOPSize, NumVals);
-
- LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
-
- std::vector<DominatorTree::UpdateType> Updates;
- if (DT)
- Updates.reserve(2 * SizeIds.size());
-
- for (uint64_t SizeId : SizeIds) {
- BasicBlock *CaseBB = BasicBlock::Create(
- Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
- MemOp NewMO = MO.clone();
- // Fix the argument.
- auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
- assert(SizeType && "Expected integer type size argument.");
- ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
- NewMO.setLength(CaseSizeId);
- CaseBB->getInstList().push_back(NewMO.I);
- IRBuilder<> IRBCase(CaseBB);
- IRBCase.CreateBr(MergeBB);
- SI->addCase(CaseSizeId, CaseBB);
- if (!MemOpTy->isVoidTy())
- PHI->addIncoming(NewMO.I, CaseBB);
- if (DT) {
- Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
- Updates.push_back({DominatorTree::Insert, BB, CaseBB});
- }
- LLVM_DEBUG(dbgs() << *CaseBB << "\n");
- }
- DTU.applyUpdates(Updates);
- Updates.clear();
-
- setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
-
- LLVM_DEBUG(dbgs() << *BB << "\n");
- LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
- LLVM_DEBUG(dbgs() << *MergeBB << "\n");
-
- ORE.emit([&]() {
- using namespace ore;
- return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
- << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
- << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
- << " for " << NV("Versions", Version) << " versions";
- });
-
- return true;
-}
-} // namespace
-
-static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE,
- DominatorTree *DT, TargetLibraryInfo &TLI) {
- if (DisableMemOPOPT)
- return false;
-
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
- return false;
- MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
- MemOPSizeOpt.perform();
- return MemOPSizeOpt.isChanged();
-}
-
-bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
- BlockFrequencyInfo &BFI =
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
-}
-
-namespace llvm {
-char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
-
-PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
- FunctionAnalysisManager &FAM) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
- if (!Changed)
- return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
-} // namespace llvm
+ continue;
+
+    // Value profile records are sorted by count. Break at the first
+    // unprofitable value.
+ if (!isProfitable(C, RemainCount))
+ break;
+
+ SizeIds.push_back(V);
+ CaseCounts.push_back(C);
+ if (C > MaxCount)
+ MaxCount = C;
+
+ assert(RemainCount >= C);
+ RemainCount -= C;
+ assert(SavedRemainCount >= VD.Count);
+ SavedRemainCount -= VD.Count;
+
+ if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+ break;
+ }
+
+ if (Version == 0)
+ return false;
+
+ CaseCounts[0] = RemainCount;
+ if (RemainCount > MaxCount)
+ MaxCount = RemainCount;
+
+ uint64_t SumForOpt = TotalCount - RemainCount;
+
+ LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions (covering " << SumForOpt << " out of "
+ << TotalCount << ")\n");
+
+ // mem_op(..., size)
+ // ==>
+ // switch (size) {
+ // case s1:
+ // mem_op(..., s1);
+ // goto merge_bb;
+ // case s2:
+ // mem_op(..., s2);
+ // goto merge_bb;
+ // ...
+ // default:
+ // mem_op(..., size);
+ // goto merge_bb;
+ // }
+ // merge_bb:
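+  //
+  // Illustrative (hypothetical) instance: if the size profile shows that a
+  // memcpy's length is 8 on most executions, the variable-length call
+  //   memcpy(dst, src, n)
+  // gains a "case 8:" arm containing memcpy(dst, src, 8), which later passes
+  // can lower to a fixed-size copy, while the default arm keeps the original
+  // variable-length call for all other sizes.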
+
+ BasicBlock *BB = MO.I->getParent();
+ LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ LLVM_DEBUG(dbgs() << *BB << "\n");
+ auto OrigBBFreq = BFI.getBlockFreq(BB);
+
+ BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
+ BasicBlock::iterator It(*MO.I);
+ ++It;
+ assert(It != DefaultBB->end());
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
+ MergeBB->setName("MemOP.Merge");
+ BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
+ DefaultBB->setName("MemOP.Default");
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ auto &Ctx = Func.getContext();
+ IRBuilder<> IRB(BB);
+ BB->getTerminator()->eraseFromParent();
+ Value *SizeVar = MO.getLength();
+ SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+ Type *MemOpTy = MO.I->getType();
+ PHINode *PHI = nullptr;
+ if (!MemOpTy->isVoidTy()) {
+ // Insert a phi for the return values at the merge block.
+ IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
+ PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
+ MO.I->replaceAllUsesWith(PHI);
+ PHI->addIncoming(MO.I, DefaultBB);
+ }
+
+ // Clear the value profile data.
+ MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
+  // If all values were promoted, we don't need the MD.prof metadata.
+  if (SavedRemainCount > 0 || Version != NumVals)
+    // Otherwise we need to re-annotate the remaining un-promoted records.
+ annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version),
+ SavedRemainCount, IPVK_MemOPSize, NumVals);
+
+  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After ==\n");
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DT)
+ Updates.reserve(2 * SizeIds.size());
+
+ for (uint64_t SizeId : SizeIds) {
+ BasicBlock *CaseBB = BasicBlock::Create(
+ Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+ MemOp NewMO = MO.clone();
+ // Fix the argument.
+ auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
+ assert(SizeType && "Expected integer type size argument.");
+ ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
+ NewMO.setLength(CaseSizeId);
+ CaseBB->getInstList().push_back(NewMO.I);
+ IRBuilder<> IRBCase(CaseBB);
+ IRBCase.CreateBr(MergeBB);
+ SI->addCase(CaseSizeId, CaseBB);
+ if (!MemOpTy->isVoidTy())
+ PHI->addIncoming(NewMO.I, CaseBB);
+ if (DT) {
+ Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
+ Updates.push_back({DominatorTree::Insert, BB, CaseBB});
+ }
+ LLVM_DEBUG(dbgs() << *CaseBB << "\n");
+ }
+ DTU.applyUpdates(Updates);
+ Updates.clear();
+
+ setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+ LLVM_DEBUG(dbgs() << *BB << "\n");
+ LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
+ LLVM_DEBUG(dbgs() << *MergeBB << "\n");
+
+ ORE.emit([&]() {
+ using namespace ore;
+ return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
+ << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
+ << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
+ << " for " << NV("Versions", Version) << " versions";
+ });
+
+ return true;
+}
+} // namespace
+
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
+ OptimizationRemarkEmitter &ORE,
+ DominatorTree *DT, TargetLibraryInfo &TLI) {
+ if (DisableMemOPOPT)
+ return false;
+
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
+ MemOPSizeOpt.perform();
+ return MemOPSizeOpt.isChanged();
+}
+
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp
index bb822f7b27..fc52672618 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp
@@ -1,359 +1,359 @@
-//===- PoisonChecking.cpp - -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implements a transform pass which instruments IR such that poison semantics
-// are made explicit. That is, it provides a (possibly partial) executable
-// semantics for every instruction w.r.t. poison as specified in the LLVM
-// LangRef. There are obvious parallels to the sanitizer tools, but this pass
-// is focused purely on the semantics of LLVM IR, not any particular source
-// language. If you're looking for something to see if your C/C++ contains
-// UB, this is not it.
-//
-// The rewritten semantics of each instruction will include the following
-// components:
-//
-// 1) The original instruction, unmodified.
-// 2) A propagation rule which translates dynamic information about the poison
-// state of each input to whether the dynamic output of the instruction
-// produces poison.
-// 3) A creation rule which validates any poison producing flags on the
-// instruction itself (e.g. checks for overflow on nsw).
-// 4) A check rule which traps (to a handler function) if this instruction must
-//    execute undefined behavior given the poison state of its inputs.
-//
-// This is a must analysis based transform; that is, the resulting code may
-// produce a false negative result (failing to report UB that actually exists
-// according to the LangRef spec), but should never produce a false positive
-// (report UB where it doesn't exist).
-//
-// Use cases for this pass include:
-// - Understanding (and testing!) the implications of the definition of poison
-// from the LangRef.
-// - Validating the output of an IR fuzzer to ensure that all programs produced
-// are well defined on the specific input used.
-// - Finding/confirming poison specific miscompiles by checking the poison
-// status of an input/IR pair is the same before and after an optimization
-// transform.
-// - Checking that a bugpoint reduction does not introduce UB which didn't
-// exist in the original program being reduced.
-//
-// The major sources of inaccuracy are currently:
-// - Most validation rules not yet implemented for instructions with poison
-//   relevant flags. At the moment, only nsw/nuw on add/sub are supported.
-// - UB which is control dependent on a branch on poison is not yet
-// reported. Currently, only data flow dependence is modeled.
-// - Poison which is propagated through memory is not modeled. As such,
-// storing poison to memory and then reloading it will cause a false negative
-// as we consider the reloaded value to not be poisoned.
-// - Poison propagation across function boundaries is not modeled. At the
-// moment, all arguments and return values are assumed not to be poison.
-// - Undef is not modeled. In particular, the optimizer's freedom to pick
-// concrete values for undef bits so as to maximize potential for producing
-// poison is not modeled.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/PoisonChecking.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "poison-checking"
-
-static cl::opt<bool>
-LocalCheck("poison-checking-function-local",
- cl::init(false),
- cl::desc("Check that returns are non-poison (for testing)"));
-
-
-static bool isConstantFalse(Value* V) {
- assert(V->getType()->isIntegerTy(1));
- if (auto *CI = dyn_cast<ConstantInt>(V))
- return CI->isZero();
- return false;
-}
-
-static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) {
- if (Ops.size() == 0)
- return B.getFalse();
- unsigned i = 0;
- for (; i < Ops.size() && isConstantFalse(Ops[i]); i++) {}
- if (i == Ops.size())
- return B.getFalse();
- Value *Accum = Ops[i++];
- for (; i < Ops.size(); i++)
- if (!isConstantFalse(Ops[i]))
- Accum = B.CreateOr(Accum, Ops[i]);
- return Accum;
-}
-
-static void generateCreationChecksForBinOp(Instruction &I,
- SmallVectorImpl<Value*> &Checks) {
- assert(isa<BinaryOperator>(I));
-
- IRBuilder<> B(&I);
- Value *LHS = I.getOperand(0);
- Value *RHS = I.getOperand(1);
- switch (I.getOpcode()) {
- default:
- return;
- case Instruction::Add: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::uadd_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::Sub: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::ssub_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::Mul: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::smul_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::UDiv: {
- if (I.isExact()) {
- auto *Check =
- B.CreateICmp(ICmpInst::ICMP_NE, B.CreateURem(LHS, RHS),
- ConstantInt::get(LHS->getType(), 0));
- Checks.push_back(Check);
- }
- break;
- }
- case Instruction::SDiv: {
- if (I.isExact()) {
- auto *Check =
- B.CreateICmp(ICmpInst::ICMP_NE, B.CreateSRem(LHS, RHS),
- ConstantInt::get(LHS->getType(), 0));
- Checks.push_back(Check);
- }
- break;
- }
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::Shl: {
- Value *ShiftCheck =
- B.CreateICmp(ICmpInst::ICMP_UGE, RHS,
- ConstantInt::get(RHS->getType(),
- LHS->getType()->getScalarSizeInBits()));
- Checks.push_back(ShiftCheck);
- break;
- }
- };
-}
-
-/// Given an instruction which can produce poison on non-poison inputs
-/// (i.e. canCreatePoison returns true), generate runtime checks to produce
-/// boolean indicators of when poison would result.
-static void generateCreationChecks(Instruction &I,
- SmallVectorImpl<Value*> &Checks) {
- IRBuilder<> B(&I);
- if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy())
- generateCreationChecksForBinOp(I, Checks);
-
- // Handle non-binops separately
- switch (I.getOpcode()) {
- default:
- // Note there are a couple of missing cases here, once implemented, this
- // should become an llvm_unreachable.
- break;
- case Instruction::ExtractElement: {
- Value *Vec = I.getOperand(0);
- auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
- if (!VecVTy)
- break;
- Value *Idx = I.getOperand(1);
- unsigned NumElts = VecVTy->getNumElements();
- Value *Check =
- B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
- ConstantInt::get(Idx->getType(), NumElts));
- Checks.push_back(Check);
- break;
- }
- case Instruction::InsertElement: {
- Value *Vec = I.getOperand(0);
- auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
- if (!VecVTy)
- break;
- Value *Idx = I.getOperand(2);
- unsigned NumElts = VecVTy->getNumElements();
- Value *Check =
- B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
- ConstantInt::get(Idx->getType(), NumElts));
- Checks.push_back(Check);
- break;
- }
- };
-}
-
-static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) {
- auto Itr = ValToPoison.find(V);
- if (Itr != ValToPoison.end())
- return Itr->second;
- if (isa<Constant>(V)) {
- return ConstantInt::getFalse(V->getContext());
- }
-  // Return false for unknown values - this implements a non-strict mode where
- // unhandled IR constructs are simply considered to never produce poison. At
- // some point in the future, we probably want a "strict mode" for testing if
- // nothing else.
- return ConstantInt::getFalse(V->getContext());
-}
-
-static void CreateAssert(IRBuilder<> &B, Value *Cond) {
- assert(Cond->getType()->isIntegerTy(1));
- if (auto *CI = dyn_cast<ConstantInt>(Cond))
- if (CI->isAllOnesValue())
- return;
-
- Module *M = B.GetInsertBlock()->getModule();
- M->getOrInsertFunction("__poison_checker_assert",
- Type::getVoidTy(M->getContext()),
- Type::getInt1Ty(M->getContext()));
- Function *TrapFunc = M->getFunction("__poison_checker_assert");
- B.CreateCall(TrapFunc, Cond);
-}
-
-static void CreateAssertNot(IRBuilder<> &B, Value *Cond) {
- assert(Cond->getType()->isIntegerTy(1));
- CreateAssert(B, B.CreateNot(Cond));
-}
-
-static bool rewrite(Function &F) {
- auto * const Int1Ty = Type::getInt1Ty(F.getContext());
-
- DenseMap<Value *, Value *> ValToPoison;
-
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
- auto *OldPHI = cast<PHINode>(&*I);
- auto *NewPHI = PHINode::Create(Int1Ty, OldPHI->getNumIncomingValues());
- for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++)
- NewPHI->addIncoming(UndefValue::get(Int1Ty),
- OldPHI->getIncomingBlock(i));
- NewPHI->insertBefore(OldPHI);
- ValToPoison[OldPHI] = NewPHI;
- }
-
- for (BasicBlock &BB : F)
- for (Instruction &I : BB) {
- if (isa<PHINode>(I)) continue;
-
- IRBuilder<> B(cast<Instruction>(&I));
-
- // Note: There are many more sources of documented UB, but this pass only
- // attempts to find UB triggered by propagation of poison.
+//===- PoisonChecking.cpp - -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements a transform pass which instruments IR such that poison semantics
+// are made explicit. That is, it provides a (possibly partial) executable
+// semantics for every instruction w.r.t. poison as specified in the LLVM
+// LangRef. There are obvious parallels to the sanitizer tools, but this pass
+// is focused purely on the semantics of LLVM IR, not any particular source
+// language. If you're looking for something to see if your C/C++ contains
+// UB, this is not it.
+//
+// The rewritten semantics of each instruction will include the following
+// components:
+//
+// 1) The original instruction, unmodified.
+// 2) A propagation rule which translates dynamic information about the poison
+// state of each input to whether the dynamic output of the instruction
+// produces poison.
+// 3) A creation rule which validates any poison producing flags on the
+// instruction itself (e.g. checks for overflow on nsw).
+// 4) A check rule which traps (to a handler function) if this instruction must
+//    execute undefined behavior given the poison state of its inputs.
+//
+// This is a must analysis based transform; that is, the resulting code may
+// produce a false negative result (failing to report UB that actually exists
+// according to the LangRef spec), but should never produce a false positive
+// (report UB where it doesn't exist).
+//
+// Use cases for this pass include:
+// - Understanding (and testing!) the implications of the definition of poison
+// from the LangRef.
+// - Validating the output of an IR fuzzer to ensure that all programs produced
+// are well defined on the specific input used.
+// - Finding/confirming poison specific miscompiles by checking the poison
+// status of an input/IR pair is the same before and after an optimization
+// transform.
+// - Checking that a bugpoint reduction does not introduce UB which didn't
+// exist in the original program being reduced.
+//
+// The major sources of inaccuracy are currently:
+// - Most validation rules not yet implemented for instructions with poison
+//   relevant flags. At the moment, only nsw/nuw on add/sub are supported.
+// - UB which is control dependent on a branch on poison is not yet
+// reported. Currently, only data flow dependence is modeled.
+// - Poison which is propagated through memory is not modeled. As such,
+// storing poison to memory and then reloading it will cause a false negative
+// as we consider the reloaded value to not be poisoned.
+// - Poison propagation across function boundaries is not modeled. At the
+// moment, all arguments and return values are assumed not to be poison.
+// - Undef is not modeled. In particular, the optimizer's freedom to pick
+// concrete values for undef bits so as to maximize potential for producing
+// poison is not modeled.
+//
+//===----------------------------------------------------------------------===//
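+//
+// Illustrative sketch only (the %poison.* names are hypothetical; the pass
+// tracks these values in a side map rather than emitting them verbatim):
+// for an instruction such as
+//   %r = add nsw i32 %a, %b
+// the rewritten function conceptually computes the overflow bit of
+// llvm.sadd.with.overflow(%a, %b) as the creation check, ORs it with the
+// operand poison flags %poison.a and %poison.b to form %poison.r, and passes
+// the poison flag of any must-not-be-poison operand to the
+// __poison_checker_assert handler.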
+
+#include "llvm/Transforms/Instrumentation/PoisonChecking.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "poison-checking"
+
+static cl::opt<bool>
+LocalCheck("poison-checking-function-local",
+ cl::init(false),
+ cl::desc("Check that returns are non-poison (for testing)"));
+
+
+static bool isConstantFalse(Value* V) {
+ assert(V->getType()->isIntegerTy(1));
+ if (auto *CI = dyn_cast<ConstantInt>(V))
+ return CI->isZero();
+ return false;
+}
+
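+// OR together all poison indicators that are not constant-false; returns the
+// i1 false constant when every indicator is trivially false.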
+static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) {
+ if (Ops.size() == 0)
+ return B.getFalse();
+ unsigned i = 0;
+ for (; i < Ops.size() && isConstantFalse(Ops[i]); i++) {}
+ if (i == Ops.size())
+ return B.getFalse();
+ Value *Accum = Ops[i++];
+ for (; i < Ops.size(); i++)
+ if (!isConstantFalse(Ops[i]))
+ Accum = B.CreateOr(Accum, Ops[i]);
+ return Accum;
+}
+
+static void generateCreationChecksForBinOp(Instruction &I,
+ SmallVectorImpl<Value*> &Checks) {
+ assert(isa<BinaryOperator>(I));
+
+ IRBuilder<> B(&I);
+ Value *LHS = I.getOperand(0);
+ Value *RHS = I.getOperand(1);
+ switch (I.getOpcode()) {
+ default:
+ return;
+ case Instruction::Add: {
+ if (I.hasNoSignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ if (I.hasNoUnsignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::uadd_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ break;
+ }
+ case Instruction::Sub: {
+ if (I.hasNoSignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::ssub_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ if (I.hasNoUnsignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ break;
+ }
+ case Instruction::Mul: {
+ if (I.hasNoSignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::smul_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ if (I.hasNoUnsignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ break;
+ }
+ case Instruction::UDiv: {
+ if (I.isExact()) {
+ auto *Check =
+ B.CreateICmp(ICmpInst::ICMP_NE, B.CreateURem(LHS, RHS),
+ ConstantInt::get(LHS->getType(), 0));
+ Checks.push_back(Check);
+ }
+ break;
+ }
+ case Instruction::SDiv: {
+ if (I.isExact()) {
+ auto *Check =
+ B.CreateICmp(ICmpInst::ICMP_NE, B.CreateSRem(LHS, RHS),
+ ConstantInt::get(LHS->getType(), 0));
+ Checks.push_back(Check);
+ }
+ break;
+ }
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl: {
+ Value *ShiftCheck =
+ B.CreateICmp(ICmpInst::ICMP_UGE, RHS,
+ ConstantInt::get(RHS->getType(),
+ LHS->getType()->getScalarSizeInBits()));
+ Checks.push_back(ShiftCheck);
+ break;
+ }
+ };
+}
+
+/// Given an instruction which can produce poison on non-poison inputs
+/// (i.e. canCreatePoison returns true), generate runtime checks to produce
+/// boolean indicators of when poison would result.
+static void generateCreationChecks(Instruction &I,
+ SmallVectorImpl<Value*> &Checks) {
+ IRBuilder<> B(&I);
+ if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy())
+ generateCreationChecksForBinOp(I, Checks);
+
+ // Handle non-binops separately
+ switch (I.getOpcode()) {
+ default:
+    // Note: there are a couple of missing cases here; once they are
+    // implemented, this should become an llvm_unreachable.
+ break;
+ case Instruction::ExtractElement: {
+ Value *Vec = I.getOperand(0);
+ auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecVTy)
+ break;
+ Value *Idx = I.getOperand(1);
+ unsigned NumElts = VecVTy->getNumElements();
+ Value *Check =
+ B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
+ ConstantInt::get(Idx->getType(), NumElts));
+ Checks.push_back(Check);
+ break;
+ }
+ case Instruction::InsertElement: {
+ Value *Vec = I.getOperand(0);
+ auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecVTy)
+ break;
+ Value *Idx = I.getOperand(2);
+ unsigned NumElts = VecVTy->getNumElements();
+ Value *Check =
+ B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
+ ConstantInt::get(Idx->getType(), NumElts));
+ Checks.push_back(Check);
+ break;
+ }
+ };
+}
+
+static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) {
+ auto Itr = ValToPoison.find(V);
+ if (Itr != ValToPoison.end())
+ return Itr->second;
+ if (isa<Constant>(V)) {
+ return ConstantInt::getFalse(V->getContext());
+ }
+  // Return false for unknown values - this implements a non-strict mode where
+ // unhandled IR constructs are simply considered to never produce poison. At
+ // some point in the future, we probably want a "strict mode" for testing if
+ // nothing else.
+ return ConstantInt::getFalse(V->getContext());
+}
+
+static void CreateAssert(IRBuilder<> &B, Value *Cond) {
+ assert(Cond->getType()->isIntegerTy(1));
+ if (auto *CI = dyn_cast<ConstantInt>(Cond))
+ if (CI->isAllOnesValue())
+ return;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ M->getOrInsertFunction("__poison_checker_assert",
+ Type::getVoidTy(M->getContext()),
+ Type::getInt1Ty(M->getContext()));
+ Function *TrapFunc = M->getFunction("__poison_checker_assert");
+ B.CreateCall(TrapFunc, Cond);
+}
+
+static void CreateAssertNot(IRBuilder<> &B, Value *Cond) {
+ assert(Cond->getType()->isIntegerTy(1));
+ CreateAssert(B, B.CreateNot(Cond));
+}
+
+static bool rewrite(Function &F) {
+ auto * const Int1Ty = Type::getInt1Ty(F.getContext());
+
+ DenseMap<Value *, Value *> ValToPoison;
+
+ for (BasicBlock &BB : F)
+ for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
+ auto *OldPHI = cast<PHINode>(&*I);
+ auto *NewPHI = PHINode::Create(Int1Ty, OldPHI->getNumIncomingValues());
+ for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++)
+ NewPHI->addIncoming(UndefValue::get(Int1Ty),
+ OldPHI->getIncomingBlock(i));
+ NewPHI->insertBefore(OldPHI);
+ ValToPoison[OldPHI] = NewPHI;
+ }
+
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB) {
+ if (isa<PHINode>(I)) continue;
+
+ IRBuilder<> B(cast<Instruction>(&I));
+
+ // Note: There are many more sources of documented UB, but this pass only
+ // attempts to find UB triggered by propagation of poison.
SmallPtrSet<const Value *, 4> NonPoisonOps;
getGuaranteedNonPoisonOps(&I, NonPoisonOps);
for (const Value *Op : NonPoisonOps)
CreateAssertNot(B, getPoisonFor(ValToPoison, const_cast<Value *>(Op)));
-
- if (LocalCheck)
- if (auto *RI = dyn_cast<ReturnInst>(&I))
- if (RI->getNumOperands() != 0) {
- Value *Op = RI->getOperand(0);
- CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
- }
-
- SmallVector<Value*, 4> Checks;
+
+ if (LocalCheck)
+ if (auto *RI = dyn_cast<ReturnInst>(&I))
+ if (RI->getNumOperands() != 0) {
+ Value *Op = RI->getOperand(0);
+ CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
+ }
+
+ SmallVector<Value*, 4> Checks;
if (propagatesPoison(cast<Operator>(&I)))
- for (Value *V : I.operands())
- Checks.push_back(getPoisonFor(ValToPoison, V));
-
+ for (Value *V : I.operands())
+ Checks.push_back(getPoisonFor(ValToPoison, V));
+
if (canCreatePoison(cast<Operator>(&I)))
- generateCreationChecks(I, Checks);
- ValToPoison[&I] = buildOrChain(B, Checks);
- }
-
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
- auto *OldPHI = cast<PHINode>(&*I);
- if (!ValToPoison.count(OldPHI))
- continue; // skip the newly inserted phis
- auto *NewPHI = cast<PHINode>(ValToPoison[OldPHI]);
- for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) {
- auto *OldVal = OldPHI->getIncomingValue(i);
- NewPHI->setIncomingValue(i, getPoisonFor(ValToPoison, OldVal));
- }
- }
- return true;
-}
-
-
-PreservedAnalyses PoisonCheckingPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- bool Changed = false;
- for (auto &F : M)
- Changed |= rewrite(F);
-
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-PreservedAnalyses PoisonCheckingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-/* Major TODO Items:
- - Control dependent poison UB
- - Strict mode - (i.e. must analyze every operand)
- - Poison through memory
- - Function ABIs
- - Full coverage of intrinsics, etc.. (ouch)
-
- Instructions w/Unclear Semantics:
- - shufflevector - It would seem reasonable for an out of bounds mask element
- to produce poison, but the LangRef does not state.
- - all binary ops w/vector operands - The likely interpretation would be that
- any element overflowing should produce poison for the entire result, but
- the LangRef does not state.
- - Floating point binary ops w/fmf flags other than (nnan, noinfs). It seems
-   strange that only certain flags should be documented as producing poison.
-
- Cases of clear poison semantics not yet implemented:
- - Exact flags on ashr/lshr produce poison
- - NSW/NUW flags on shl produce poison
- - Inbounds flag on getelementptr produce poison
- - fptosi/fptoui (out of bounds input) produce poison
- - Scalable vector types for insertelement/extractelement
- - Floating point binary ops w/fmf nnan/noinfs flags produce poison
- */
+ generateCreationChecks(I, Checks);
+ ValToPoison[&I] = buildOrChain(B, Checks);
+ }
+
+ for (BasicBlock &BB : F)
+ for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
+ auto *OldPHI = cast<PHINode>(&*I);
+ if (!ValToPoison.count(OldPHI))
+ continue; // skip the newly inserted phis
+ auto *NewPHI = cast<PHINode>(ValToPoison[OldPHI]);
+ for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) {
+ auto *OldVal = OldPHI->getIncomingValue(i);
+ NewPHI->setIncomingValue(i, getPoisonFor(ValToPoison, OldVal));
+ }
+ }
+ return true;
+}
+
+
+PreservedAnalyses PoisonCheckingPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ bool Changed = false;
+ for (auto &F : M)
+ Changed |= rewrite(F);
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+PreservedAnalyses PoisonCheckingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+/* Major TODO Items:
+ - Control dependent poison UB
+ - Strict mode - (i.e. must analyze every operand)
+ - Poison through memory
+ - Function ABIs
+ - Full coverage of intrinsics, etc.. (ouch)
+
+ Instructions w/Unclear Semantics:
+ - shufflevector - It would seem reasonable for an out of bounds mask element
+ to produce poison, but the LangRef does not state.
+ - all binary ops w/vector operands - The likely interpretation would be that
+ any element overflowing should produce poison for the entire result, but
+ the LangRef does not state.
+ - Floating point binary ops w/fmf flags other than (nnan, noinfs). It seems
+   strange that only certain flags should be documented as producing poison.
+
+ Cases of clear poison semantics not yet implemented:
+ - Exact flags on ashr/lshr produce poison
+ - NSW/NUW flags on shl produce poison
+ - Inbounds flag on getelementptr produces poison
+ - fptosi/fptoui (out of bounds input) produce poison
+ - Scalable vector types for insertelement/extractelement
+ - Floating point binary ops w/fmf nnan/noinfs flags produce poison
+ */
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 656cf6267b..2d4b079394 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1,50 +1,50 @@
-//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Coverage instrumentation done on LLVM IR level, works with Sanitizers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/SpecialCaseList.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "sancov"
-
+//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coverage instrumentation done on LLVM IR level, works with Sanitizers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sancov"
+
const char SanCovTracePCIndirName[] = "__sanitizer_cov_trace_pc_indir";
const char SanCovTracePCName[] = "__sanitizer_cov_trace_pc";
const char SanCovTraceCmp1[] = "__sanitizer_cov_trace_cmp1";
@@ -60,935 +60,935 @@ const char SanCovTraceDiv8[] = "__sanitizer_cov_trace_div8";
const char SanCovTraceGep[] = "__sanitizer_cov_trace_gep";
const char SanCovTraceSwitchName[] = "__sanitizer_cov_trace_switch";
const char SanCovModuleCtorTracePcGuardName[] =
- "sancov.module_ctor_trace_pc_guard";
+ "sancov.module_ctor_trace_pc_guard";
const char SanCovModuleCtor8bitCountersName[] =
- "sancov.module_ctor_8bit_counters";
+ "sancov.module_ctor_8bit_counters";
const char SanCovModuleCtorBoolFlagName[] = "sancov.module_ctor_bool_flag";
-static const uint64_t SanCtorAndDtorPriority = 2;
-
+static const uint64_t SanCtorAndDtorPriority = 2;
+
const char SanCovTracePCGuardName[] = "__sanitizer_cov_trace_pc_guard";
const char SanCovTracePCGuardInitName[] = "__sanitizer_cov_trace_pc_guard_init";
const char SanCov8bitCountersInitName[] = "__sanitizer_cov_8bit_counters_init";
const char SanCovBoolFlagInitName[] = "__sanitizer_cov_bool_flag_init";
const char SanCovPCsInitName[] = "__sanitizer_cov_pcs_init";
-
+
const char SanCovGuardsSectionName[] = "sancov_guards";
const char SanCovCountersSectionName[] = "sancov_cntrs";
const char SanCovBoolFlagSectionName[] = "sancov_bools";
const char SanCovPCsSectionName[] = "sancov_pcs";
-
+
const char SanCovLowestStackName[] = "__sancov_lowest_stack";
-
-static cl::opt<int> ClCoverageLevel(
- "sanitizer-coverage-level",
- cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
- "3: all blocks and critical edges"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
- cl::desc("Experimental pc tracing"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
- cl::desc("pc tracing with a guard"),
- cl::Hidden, cl::init(false));
-
-// If true, we create a global variable that contains PCs of all instrumented
-// BBs, put this global into a named section, and pass this section's bounds
-// to __sanitizer_cov_pcs_init.
-// This way the coverage instrumentation does not need to acquire the PCs
-// at run-time. Works with trace-pc-guard, inline-8bit-counters, and
-// inline-bool-flag.
-static cl::opt<bool> ClCreatePCTable("sanitizer-coverage-pc-table",
- cl::desc("create a static PC table"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters",
- cl::desc("increments 8-bit counter for every edge"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClInlineBoolFlag("sanitizer-coverage-inline-bool-flag",
- cl::desc("sets a boolean flag for every edge"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool>
- ClCMPTracing("sanitizer-coverage-trace-compares",
- cl::desc("Tracing of CMP and similar instructions"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs",
- cl::desc("Tracing of DIV instructions"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps",
- cl::desc("Tracing of GEP instructions"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClPruneBlocks("sanitizer-coverage-prune-blocks",
- cl::desc("Reduce the number of instrumented blocks"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClStackDepth("sanitizer-coverage-stack-depth",
- cl::desc("max stack depth tracing"),
- cl::Hidden, cl::init(false));
-
-namespace {
-
-SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) {
- SanitizerCoverageOptions Res;
- switch (LegacyCoverageLevel) {
- case 0:
- Res.CoverageType = SanitizerCoverageOptions::SCK_None;
- break;
- case 1:
- Res.CoverageType = SanitizerCoverageOptions::SCK_Function;
- break;
- case 2:
- Res.CoverageType = SanitizerCoverageOptions::SCK_BB;
- break;
- case 3:
- Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
- break;
- case 4:
- Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
- Res.IndirectCalls = true;
- break;
- }
- return Res;
-}
-
-SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
- // Sets CoverageType and IndirectCalls.
- SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel);
- Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType);
- Options.IndirectCalls |= CLOpts.IndirectCalls;
- Options.TraceCmp |= ClCMPTracing;
- Options.TraceDiv |= ClDIVTracing;
- Options.TraceGep |= ClGEPTracing;
- Options.TracePC |= ClTracePC;
- Options.TracePCGuard |= ClTracePCGuard;
- Options.Inline8bitCounters |= ClInline8bitCounters;
- Options.InlineBoolFlag |= ClInlineBoolFlag;
- Options.PCTable |= ClCreatePCTable;
- Options.NoPrune |= !ClPruneBlocks;
- Options.StackDepth |= ClStackDepth;
- if (!Options.TracePCGuard && !Options.TracePC &&
- !Options.Inline8bitCounters && !Options.StackDepth &&
- !Options.InlineBoolFlag)
- Options.TracePCGuard = true; // TracePCGuard is default.
- return Options;
-}
-
-using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>;
-using PostDomTreeCallback =
- function_ref<const PostDominatorTree *(Function &F)>;
-
-class ModuleSanitizerCoverage {
-public:
- ModuleSanitizerCoverage(
- const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
- const SpecialCaseList *Allowlist = nullptr,
- const SpecialCaseList *Blocklist = nullptr)
- : Options(OverrideFromCL(Options)), Allowlist(Allowlist),
- Blocklist(Blocklist) {}
- bool instrumentModule(Module &M, DomTreeCallback DTCallback,
- PostDomTreeCallback PDTCallback);
-
-private:
- void instrumentFunction(Function &F, DomTreeCallback DTCallback,
- PostDomTreeCallback PDTCallback);
- void InjectCoverageForIndirectCalls(Function &F,
- ArrayRef<Instruction *> IndirCalls);
- void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
- void InjectTraceForDiv(Function &F,
- ArrayRef<BinaryOperator *> DivTraceTargets);
- void InjectTraceForGep(Function &F,
- ArrayRef<GetElementPtrInst *> GepTraceTargets);
- void InjectTraceForSwitch(Function &F,
- ArrayRef<Instruction *> SwitchTraceTargets);
- bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
- bool IsLeafFunc = true);
- GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements,
- Function &F, Type *Ty,
- const char *Section);
- GlobalVariable *CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks);
- void CreateFunctionLocalArrays(Function &F, ArrayRef<BasicBlock *> AllBlocks);
- void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx,
- bool IsLeafFunc = true);
- Function *CreateInitCallsForSections(Module &M, const char *CtorName,
- const char *InitFunctionName, Type *Ty,
- const char *Section);
- std::pair<Value *, Value *> CreateSecStartEnd(Module &M, const char *Section,
- Type *Ty);
-
- void SetNoSanitizeMetadata(Instruction *I) {
- I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
- MDNode::get(*C, None));
- }
-
- std::string getSectionName(const std::string &Section) const;
- std::string getSectionStart(const std::string &Section) const;
- std::string getSectionEnd(const std::string &Section) const;
- FunctionCallee SanCovTracePCIndir;
- FunctionCallee SanCovTracePC, SanCovTracePCGuard;
- FunctionCallee SanCovTraceCmpFunction[4];
- FunctionCallee SanCovTraceConstCmpFunction[4];
- FunctionCallee SanCovTraceDivFunction[2];
- FunctionCallee SanCovTraceGepFunction;
- FunctionCallee SanCovTraceSwitchFunction;
- GlobalVariable *SanCovLowestStack;
- Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
- *Int16Ty, *Int8Ty, *Int8PtrTy, *Int1Ty, *Int1PtrTy;
- Module *CurModule;
- std::string CurModuleUniqueId;
- Triple TargetTriple;
- LLVMContext *C;
- const DataLayout *DL;
-
- GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
- GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
- GlobalVariable *FunctionBoolArray; // for inline-bool-flag.
- GlobalVariable *FunctionPCsArray; // for pc-table.
- SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
- SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
-
- SanitizerCoverageOptions Options;
-
- const SpecialCaseList *Allowlist;
- const SpecialCaseList *Blocklist;
-};
-
-class ModuleSanitizerCoverageLegacyPass : public ModulePass {
-public:
- ModuleSanitizerCoverageLegacyPass(
- const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
- const std::vector<std::string> &AllowlistFiles =
- std::vector<std::string>(),
- const std::vector<std::string> &BlocklistFiles =
- std::vector<std::string>())
- : ModulePass(ID), Options(Options) {
- if (AllowlistFiles.size() > 0)
- Allowlist = SpecialCaseList::createOrDie(AllowlistFiles,
- *vfs::getRealFileSystem());
- if (BlocklistFiles.size() > 0)
- Blocklist = SpecialCaseList::createOrDie(BlocklistFiles,
- *vfs::getRealFileSystem());
- initializeModuleSanitizerCoverageLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- bool runOnModule(Module &M) override {
- ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
- Blocklist.get());
- auto DTCallback = [this](Function &F) -> const DominatorTree * {
- return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
- auto PDTCallback = [this](Function &F) -> const PostDominatorTree * {
- return &this->getAnalysis<PostDominatorTreeWrapperPass>(F)
- .getPostDomTree();
- };
- return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback);
- }
-
- static char ID; // Pass identification, replacement for typeid
- StringRef getPassName() const override { return "ModuleSanitizerCoverage"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- }
-
-private:
- SanitizerCoverageOptions Options;
-
- std::unique_ptr<SpecialCaseList> Allowlist;
- std::unique_ptr<SpecialCaseList> Blocklist;
-};
-
-} // namespace
-
-PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
- Blocklist.get());
- auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto DTCallback = [&FAM](Function &F) -> const DominatorTree * {
- return &FAM.getResult<DominatorTreeAnalysis>(F);
- };
- auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * {
- return &FAM.getResult<PostDominatorTreeAnalysis>(F);
- };
- if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-std::pair<Value *, Value *>
-ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section,
- Type *Ty) {
+
+static cl::opt<int> ClCoverageLevel(
+ "sanitizer-coverage-level",
+ cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
+ "3: all blocks and critical edges"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
+ cl::desc("Experimental pc tracing"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
+ cl::desc("pc tracing with a guard"),
+ cl::Hidden, cl::init(false));
+
+// If true, we create a global variable that contains PCs of all instrumented
+// BBs, put this global into a named section, and pass this section's bounds
+// to __sanitizer_cov_pcs_init.
+// This way the coverage instrumentation does not need to acquire the PCs
+// at run-time. Works with trace-pc-guard, inline-8bit-counters, and
+// inline-bool-flag.
+static cl::opt<bool> ClCreatePCTable("sanitizer-coverage-pc-table",
+ cl::desc("create a static PC table"),
+ cl::Hidden, cl::init(false));
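+// For example, a function with an instrumented entry block and two other
+// instrumented blocks contributes six intptr-sized entries to that section,
+// roughly { &F, 0x1, &BB1, 0x0, &BB2, 0x0 }, i.e. {PC, Flags} pairs where
+// bit 0 of Flags marks a function entry (see CreatePCArray below).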
+
+static cl::opt<bool>
+ ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters",
+ cl::desc("increments 8-bit counter for every edge"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClInlineBoolFlag("sanitizer-coverage-inline-bool-flag",
+ cl::desc("sets a boolean flag for every edge"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool>
+ ClCMPTracing("sanitizer-coverage-trace-compares",
+ cl::desc("Tracing of CMP and similar instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs",
+ cl::desc("Tracing of DIV instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps",
+ cl::desc("Tracing of GEP instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClPruneBlocks("sanitizer-coverage-prune-blocks",
+ cl::desc("Reduce the number of instrumented blocks"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClStackDepth("sanitizer-coverage-stack-depth",
+ cl::desc("max stack depth tracing"),
+ cl::Hidden, cl::init(false));
+
+namespace {
+
+SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) {
+ SanitizerCoverageOptions Res;
+ switch (LegacyCoverageLevel) {
+ case 0:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_None;
+ break;
+ case 1:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Function;
+ break;
+ case 2:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_BB;
+ break;
+ case 3:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
+ break;
+ case 4:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
+ Res.IndirectCalls = true;
+ break;
+ }
+ return Res;
+}
+
+SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
+ // Sets CoverageType and IndirectCalls.
+ SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel);
+ Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType);
+ Options.IndirectCalls |= CLOpts.IndirectCalls;
+ Options.TraceCmp |= ClCMPTracing;
+ Options.TraceDiv |= ClDIVTracing;
+ Options.TraceGep |= ClGEPTracing;
+ Options.TracePC |= ClTracePC;
+ Options.TracePCGuard |= ClTracePCGuard;
+ Options.Inline8bitCounters |= ClInline8bitCounters;
+ Options.InlineBoolFlag |= ClInlineBoolFlag;
+ Options.PCTable |= ClCreatePCTable;
+ Options.NoPrune |= !ClPruneBlocks;
+ Options.StackDepth |= ClStackDepth;
+ if (!Options.TracePCGuard && !Options.TracePC &&
+ !Options.Inline8bitCounters && !Options.StackDepth &&
+ !Options.InlineBoolFlag)
+ Options.TracePCGuard = true; // TracePCGuard is default.
+ return Options;
+}
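+// The command-line flags can only strengthen what the front end requested:
+// e.g. a front end asking for SCK_Function coverage combined with
+// -sanitizer-coverage-level=3 (typically passed via -mllvm) yields SCK_Edge,
+// i.e. all blocks plus split critical edges; the flags never lower the
+// coverage level or clear a tracing option.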
+
+using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>;
+using PostDomTreeCallback =
+ function_ref<const PostDominatorTree *(Function &F)>;
+
+class ModuleSanitizerCoverage {
+public:
+ ModuleSanitizerCoverage(
+ const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
+ const SpecialCaseList *Allowlist = nullptr,
+ const SpecialCaseList *Blocklist = nullptr)
+ : Options(OverrideFromCL(Options)), Allowlist(Allowlist),
+ Blocklist(Blocklist) {}
+ bool instrumentModule(Module &M, DomTreeCallback DTCallback,
+ PostDomTreeCallback PDTCallback);
+
+private:
+ void instrumentFunction(Function &F, DomTreeCallback DTCallback,
+ PostDomTreeCallback PDTCallback);
+ void InjectCoverageForIndirectCalls(Function &F,
+ ArrayRef<Instruction *> IndirCalls);
+ void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
+ void InjectTraceForDiv(Function &F,
+ ArrayRef<BinaryOperator *> DivTraceTargets);
+ void InjectTraceForGep(Function &F,
+ ArrayRef<GetElementPtrInst *> GepTraceTargets);
+ void InjectTraceForSwitch(Function &F,
+ ArrayRef<Instruction *> SwitchTraceTargets);
+ bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
+ bool IsLeafFunc = true);
+ GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements,
+ Function &F, Type *Ty,
+ const char *Section);
+ GlobalVariable *CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+ void CreateFunctionLocalArrays(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+ void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx,
+ bool IsLeafFunc = true);
+ Function *CreateInitCallsForSections(Module &M, const char *CtorName,
+ const char *InitFunctionName, Type *Ty,
+ const char *Section);
+ std::pair<Value *, Value *> CreateSecStartEnd(Module &M, const char *Section,
+ Type *Ty);
+
+ void SetNoSanitizeMetadata(Instruction *I) {
+ I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
+ MDNode::get(*C, None));
+ }
+
+ std::string getSectionName(const std::string &Section) const;
+ std::string getSectionStart(const std::string &Section) const;
+ std::string getSectionEnd(const std::string &Section) const;
+ FunctionCallee SanCovTracePCIndir;
+ FunctionCallee SanCovTracePC, SanCovTracePCGuard;
+ FunctionCallee SanCovTraceCmpFunction[4];
+ FunctionCallee SanCovTraceConstCmpFunction[4];
+ FunctionCallee SanCovTraceDivFunction[2];
+ FunctionCallee SanCovTraceGepFunction;
+ FunctionCallee SanCovTraceSwitchFunction;
+ GlobalVariable *SanCovLowestStack;
+ Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
+ *Int16Ty, *Int8Ty, *Int8PtrTy, *Int1Ty, *Int1PtrTy;
+ Module *CurModule;
+ std::string CurModuleUniqueId;
+ Triple TargetTriple;
+ LLVMContext *C;
+ const DataLayout *DL;
+
+ GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
+ GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
+ GlobalVariable *FunctionBoolArray; // for inline-bool-flag.
+ GlobalVariable *FunctionPCsArray; // for pc-table.
+ SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
+ SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
+
+ SanitizerCoverageOptions Options;
+
+ const SpecialCaseList *Allowlist;
+ const SpecialCaseList *Blocklist;
+};
+
+class ModuleSanitizerCoverageLegacyPass : public ModulePass {
+public:
+ ModuleSanitizerCoverageLegacyPass(
+ const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
+ const std::vector<std::string> &AllowlistFiles =
+ std::vector<std::string>(),
+ const std::vector<std::string> &BlocklistFiles =
+ std::vector<std::string>())
+ : ModulePass(ID), Options(Options) {
+ if (AllowlistFiles.size() > 0)
+ Allowlist = SpecialCaseList::createOrDie(AllowlistFiles,
+ *vfs::getRealFileSystem());
+ if (BlocklistFiles.size() > 0)
+ Blocklist = SpecialCaseList::createOrDie(BlocklistFiles,
+ *vfs::getRealFileSystem());
+ initializeModuleSanitizerCoverageLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override {
+ ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
+ Blocklist.get());
+ auto DTCallback = [this](Function &F) -> const DominatorTree * {
+ return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+ auto PDTCallback = [this](Function &F) -> const PostDominatorTree * {
+ return &this->getAnalysis<PostDominatorTreeWrapperPass>(F)
+ .getPostDomTree();
+ };
+ return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback);
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+ StringRef getPassName() const override { return "ModuleSanitizerCoverage"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ }
+
+private:
+ SanitizerCoverageOptions Options;
+
+ std::unique_ptr<SpecialCaseList> Allowlist;
+ std::unique_ptr<SpecialCaseList> Blocklist;
+};
+
+} // namespace
+
+PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
+ Blocklist.get());
+ auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto DTCallback = [&FAM](Function &F) -> const DominatorTree * {
+ return &FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * {
+ return &FAM.getResult<PostDominatorTreeAnalysis>(F);
+ };
+ if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+std::pair<Value *, Value *>
+ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section,
+ Type *Ty) {
GlobalVariable *SecStart = new GlobalVariable(
M, Ty->getPointerElementType(), false, GlobalVariable::ExternalLinkage,
nullptr, getSectionStart(Section));
- SecStart->setVisibility(GlobalValue::HiddenVisibility);
+ SecStart->setVisibility(GlobalValue::HiddenVisibility);
GlobalVariable *SecEnd = new GlobalVariable(
M, Ty->getPointerElementType(), false, GlobalVariable::ExternalLinkage,
nullptr, getSectionEnd(Section));
- SecEnd->setVisibility(GlobalValue::HiddenVisibility);
- IRBuilder<> IRB(M.getContext());
- if (!TargetTriple.isOSBinFormatCOFF())
+ SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+ IRBuilder<> IRB(M.getContext());
+ if (!TargetTriple.isOSBinFormatCOFF())
return std::make_pair(SecStart, SecEnd);
-
- // Account for the fact that on windows-msvc __start_* symbols actually
- // point to a uint64_t before the start of the array.
- auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
- auto GEP = IRB.CreateGEP(Int8Ty, SecStartI8Ptr,
- ConstantInt::get(IntptrTy, sizeof(uint64_t)));
+
+ // Account for the fact that on windows-msvc __start_* symbols actually
+ // point to a uint64_t before the start of the array.
+ auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
+ auto GEP = IRB.CreateGEP(Int8Ty, SecStartI8Ptr,
+ ConstantInt::get(IntptrTy, sizeof(uint64_t)));
return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEnd);
-}
-
-Function *ModuleSanitizerCoverage::CreateInitCallsForSections(
- Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty,
- const char *Section) {
- auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
- auto SecStart = SecStartEnd.first;
- auto SecEnd = SecStartEnd.second;
- Function *CtorFunc;
- std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
- M, CtorName, InitFunctionName, {Ty, Ty}, {SecStart, SecEnd});
- assert(CtorFunc->getName() == CtorName);
-
- if (TargetTriple.supportsCOMDAT()) {
- // Use comdat to dedup CtorFunc.
- CtorFunc->setComdat(M.getOrInsertComdat(CtorName));
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
- } else {
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
- }
-
- if (TargetTriple.isOSBinFormatCOFF()) {
- // In COFF files, if the constructors are set as COMDAT (they are because
- // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced
- // functions and data) is used, the constructors get stripped. To prevent
- // this, give the constructors weak ODR linkage and ensure the linker knows
- // to include the sancov constructor. This way the linker can deduplicate
- // the constructors but always leave one copy.
- CtorFunc->setLinkage(GlobalValue::WeakODRLinkage);
- appendToUsed(M, CtorFunc);
- }
- return CtorFunc;
-}
-
-bool ModuleSanitizerCoverage::instrumentModule(
- Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
- if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
- return false;
- if (Allowlist &&
- !Allowlist->inSection("coverage", "src", M.getSourceFileName()))
- return false;
- if (Blocklist &&
- Blocklist->inSection("coverage", "src", M.getSourceFileName()))
- return false;
- C = &(M.getContext());
- DL = &M.getDataLayout();
- CurModule = &M;
- CurModuleUniqueId = getUniqueModuleId(CurModule);
- TargetTriple = Triple(M.getTargetTriple());
- FunctionGuardArray = nullptr;
- Function8bitCounterArray = nullptr;
- FunctionBoolArray = nullptr;
- FunctionPCsArray = nullptr;
- IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
- IntptrPtrTy = PointerType::getUnqual(IntptrTy);
- Type *VoidTy = Type::getVoidTy(*C);
- IRBuilder<> IRB(*C);
- Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
- Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
- Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
- Int1PtrTy = PointerType::getUnqual(IRB.getInt1Ty());
- Int64Ty = IRB.getInt64Ty();
- Int32Ty = IRB.getInt32Ty();
- Int16Ty = IRB.getInt16Ty();
- Int8Ty = IRB.getInt8Ty();
- Int1Ty = IRB.getInt1Ty();
-
- SanCovTracePCIndir =
- M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy);
+}
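+// On ELF the two globals bind to the linker-synthesized __start_<sec> and
+// __stop_<sec> symbols for the corresponding section (see getSectionStart and
+// getSectionEnd below); on Mach-O the section$start/section$end symbols are
+// used instead. Hidden visibility keeps each linked image resolving its own
+// section bounds rather than another DSO's.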
+
+Function *ModuleSanitizerCoverage::CreateInitCallsForSections(
+ Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty,
+ const char *Section) {
+ auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
+ auto SecStart = SecStartEnd.first;
+ auto SecEnd = SecStartEnd.second;
+ Function *CtorFunc;
+ std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, CtorName, InitFunctionName, {Ty, Ty}, {SecStart, SecEnd});
+ assert(CtorFunc->getName() == CtorName);
+
+ if (TargetTriple.supportsCOMDAT()) {
+ // Use comdat to dedup CtorFunc.
+ CtorFunc->setComdat(M.getOrInsertComdat(CtorName));
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+ } else {
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
+
+ if (TargetTriple.isOSBinFormatCOFF()) {
+ // In COFF files, if the constructors are set as COMDAT (they are because
+ // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced
+ // functions and data) is used, the constructors get stripped. To prevent
+ // this, give the constructors weak ODR linkage and ensure the linker knows
+ // to include the sancov constructor. This way the linker can deduplicate
+ // the constructors but always leave one copy.
+ CtorFunc->setLinkage(GlobalValue::WeakODRLinkage);
+ appendToUsed(M, CtorFunc);
+ }
+ return CtorFunc;
+}
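+// The generated constructor body is a single call
+//   InitFunctionName(SecStart, SecEnd)
+// (e.g. __sanitizer_cov_trace_pc_guard_init for the trace-pc-guard flavour),
+// registered via llvm.global_ctors and deduplicated through COMDAT where the
+// object format supports it.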
+
+bool ModuleSanitizerCoverage::instrumentModule(
+ Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
+ if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
+ return false;
+ if (Allowlist &&
+ !Allowlist->inSection("coverage", "src", M.getSourceFileName()))
+ return false;
+ if (Blocklist &&
+ Blocklist->inSection("coverage", "src", M.getSourceFileName()))
+ return false;
+ C = &(M.getContext());
+ DL = &M.getDataLayout();
+ CurModule = &M;
+ CurModuleUniqueId = getUniqueModuleId(CurModule);
+ TargetTriple = Triple(M.getTargetTriple());
+ FunctionGuardArray = nullptr;
+ Function8bitCounterArray = nullptr;
+ FunctionBoolArray = nullptr;
+ FunctionPCsArray = nullptr;
+ IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
+ IntptrPtrTy = PointerType::getUnqual(IntptrTy);
+ Type *VoidTy = Type::getVoidTy(*C);
+ IRBuilder<> IRB(*C);
+ Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
+ Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+ Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
+ Int1PtrTy = PointerType::getUnqual(IRB.getInt1Ty());
+ Int64Ty = IRB.getInt64Ty();
+ Int32Ty = IRB.getInt32Ty();
+ Int16Ty = IRB.getInt16Ty();
+ Int8Ty = IRB.getInt8Ty();
+ Int1Ty = IRB.getInt1Ty();
+
+ SanCovTracePCIndir =
+ M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy);
// Make sure smaller parameters are zero-extended to i64 if required by the
// target ABI.
- AttributeList SanCovTraceCmpZeroExtAL;
+ AttributeList SanCovTraceCmpZeroExtAL;
SanCovTraceCmpZeroExtAL =
SanCovTraceCmpZeroExtAL.addParamAttribute(*C, 0, Attribute::ZExt);
SanCovTraceCmpZeroExtAL =
SanCovTraceCmpZeroExtAL.addParamAttribute(*C, 1, Attribute::ZExt);
-
- SanCovTraceCmpFunction[0] =
- M.getOrInsertFunction(SanCovTraceCmp1, SanCovTraceCmpZeroExtAL, VoidTy,
- IRB.getInt8Ty(), IRB.getInt8Ty());
- SanCovTraceCmpFunction[1] =
- M.getOrInsertFunction(SanCovTraceCmp2, SanCovTraceCmpZeroExtAL, VoidTy,
- IRB.getInt16Ty(), IRB.getInt16Ty());
- SanCovTraceCmpFunction[2] =
- M.getOrInsertFunction(SanCovTraceCmp4, SanCovTraceCmpZeroExtAL, VoidTy,
- IRB.getInt32Ty(), IRB.getInt32Ty());
- SanCovTraceCmpFunction[3] =
- M.getOrInsertFunction(SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty);
-
- SanCovTraceConstCmpFunction[0] = M.getOrInsertFunction(
- SanCovTraceConstCmp1, SanCovTraceCmpZeroExtAL, VoidTy, Int8Ty, Int8Ty);
- SanCovTraceConstCmpFunction[1] = M.getOrInsertFunction(
- SanCovTraceConstCmp2, SanCovTraceCmpZeroExtAL, VoidTy, Int16Ty, Int16Ty);
- SanCovTraceConstCmpFunction[2] = M.getOrInsertFunction(
- SanCovTraceConstCmp4, SanCovTraceCmpZeroExtAL, VoidTy, Int32Ty, Int32Ty);
- SanCovTraceConstCmpFunction[3] =
- M.getOrInsertFunction(SanCovTraceConstCmp8, VoidTy, Int64Ty, Int64Ty);
-
- {
- AttributeList AL;
+
+ SanCovTraceCmpFunction[0] =
+ M.getOrInsertFunction(SanCovTraceCmp1, SanCovTraceCmpZeroExtAL, VoidTy,
+ IRB.getInt8Ty(), IRB.getInt8Ty());
+ SanCovTraceCmpFunction[1] =
+ M.getOrInsertFunction(SanCovTraceCmp2, SanCovTraceCmpZeroExtAL, VoidTy,
+ IRB.getInt16Ty(), IRB.getInt16Ty());
+ SanCovTraceCmpFunction[2] =
+ M.getOrInsertFunction(SanCovTraceCmp4, SanCovTraceCmpZeroExtAL, VoidTy,
+ IRB.getInt32Ty(), IRB.getInt32Ty());
+ SanCovTraceCmpFunction[3] =
+ M.getOrInsertFunction(SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty);
+
+ SanCovTraceConstCmpFunction[0] = M.getOrInsertFunction(
+ SanCovTraceConstCmp1, SanCovTraceCmpZeroExtAL, VoidTy, Int8Ty, Int8Ty);
+ SanCovTraceConstCmpFunction[1] = M.getOrInsertFunction(
+ SanCovTraceConstCmp2, SanCovTraceCmpZeroExtAL, VoidTy, Int16Ty, Int16Ty);
+ SanCovTraceConstCmpFunction[2] = M.getOrInsertFunction(
+ SanCovTraceConstCmp4, SanCovTraceCmpZeroExtAL, VoidTy, Int32Ty, Int32Ty);
+ SanCovTraceConstCmpFunction[3] =
+ M.getOrInsertFunction(SanCovTraceConstCmp8, VoidTy, Int64Ty, Int64Ty);
+
+ {
+ AttributeList AL;
AL = AL.addParamAttribute(*C, 0, Attribute::ZExt);
- SanCovTraceDivFunction[0] =
- M.getOrInsertFunction(SanCovTraceDiv4, AL, VoidTy, IRB.getInt32Ty());
- }
- SanCovTraceDivFunction[1] =
- M.getOrInsertFunction(SanCovTraceDiv8, VoidTy, Int64Ty);
- SanCovTraceGepFunction =
- M.getOrInsertFunction(SanCovTraceGep, VoidTy, IntptrTy);
- SanCovTraceSwitchFunction =
- M.getOrInsertFunction(SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy);
-
- Constant *SanCovLowestStackConstant =
- M.getOrInsertGlobal(SanCovLowestStackName, IntptrTy);
- SanCovLowestStack = dyn_cast<GlobalVariable>(SanCovLowestStackConstant);
- if (!SanCovLowestStack) {
- C->emitError(StringRef("'") + SanCovLowestStackName +
- "' should not be declared by the user");
- return true;
- }
- SanCovLowestStack->setThreadLocalMode(
- GlobalValue::ThreadLocalMode::InitialExecTLSModel);
- if (Options.StackDepth && !SanCovLowestStack->isDeclaration())
- SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy));
-
- SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy);
- SanCovTracePCGuard =
- M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy);
-
- for (auto &F : M)
- instrumentFunction(F, DTCallback, PDTCallback);
-
- Function *Ctor = nullptr;
-
- if (FunctionGuardArray)
- Ctor = CreateInitCallsForSections(M, SanCovModuleCtorTracePcGuardName,
- SanCovTracePCGuardInitName, Int32PtrTy,
- SanCovGuardsSectionName);
- if (Function8bitCounterArray)
- Ctor = CreateInitCallsForSections(M, SanCovModuleCtor8bitCountersName,
- SanCov8bitCountersInitName, Int8PtrTy,
- SanCovCountersSectionName);
- if (FunctionBoolArray) {
- Ctor = CreateInitCallsForSections(M, SanCovModuleCtorBoolFlagName,
- SanCovBoolFlagInitName, Int1PtrTy,
- SanCovBoolFlagSectionName);
- }
- if (Ctor && Options.PCTable) {
- auto SecStartEnd = CreateSecStartEnd(M, SanCovPCsSectionName, IntptrPtrTy);
- FunctionCallee InitFunction = declareSanitizerInitFunction(
- M, SanCovPCsInitName, {IntptrPtrTy, IntptrPtrTy});
- IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
- IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
- }
- // We don't reference these arrays directly in any of our runtime functions,
- // so we need to prevent them from being dead stripped.
- if (TargetTriple.isOSBinFormatMachO())
- appendToUsed(M, GlobalsToAppendToUsed);
- appendToCompilerUsed(M, GlobalsToAppendToCompilerUsed);
- return true;
-}
-
-// True if block has successors and it dominates all of them.
-static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) {
+ SanCovTraceDivFunction[0] =
+ M.getOrInsertFunction(SanCovTraceDiv4, AL, VoidTy, IRB.getInt32Ty());
+ }
+ SanCovTraceDivFunction[1] =
+ M.getOrInsertFunction(SanCovTraceDiv8, VoidTy, Int64Ty);
+ SanCovTraceGepFunction =
+ M.getOrInsertFunction(SanCovTraceGep, VoidTy, IntptrTy);
+ SanCovTraceSwitchFunction =
+ M.getOrInsertFunction(SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy);
+
+ Constant *SanCovLowestStackConstant =
+ M.getOrInsertGlobal(SanCovLowestStackName, IntptrTy);
+ SanCovLowestStack = dyn_cast<GlobalVariable>(SanCovLowestStackConstant);
+ if (!SanCovLowestStack) {
+ C->emitError(StringRef("'") + SanCovLowestStackName +
+ "' should not be declared by the user");
+ return true;
+ }
+ SanCovLowestStack->setThreadLocalMode(
+ GlobalValue::ThreadLocalMode::InitialExecTLSModel);
+ if (Options.StackDepth && !SanCovLowestStack->isDeclaration())
+ SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy));
+
+ SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy);
+ SanCovTracePCGuard =
+ M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy);
+
+ for (auto &F : M)
+ instrumentFunction(F, DTCallback, PDTCallback);
+
+ Function *Ctor = nullptr;
+
+ if (FunctionGuardArray)
+ Ctor = CreateInitCallsForSections(M, SanCovModuleCtorTracePcGuardName,
+ SanCovTracePCGuardInitName, Int32PtrTy,
+ SanCovGuardsSectionName);
+ if (Function8bitCounterArray)
+ Ctor = CreateInitCallsForSections(M, SanCovModuleCtor8bitCountersName,
+ SanCov8bitCountersInitName, Int8PtrTy,
+ SanCovCountersSectionName);
+ if (FunctionBoolArray) {
+ Ctor = CreateInitCallsForSections(M, SanCovModuleCtorBoolFlagName,
+ SanCovBoolFlagInitName, Int1PtrTy,
+ SanCovBoolFlagSectionName);
+ }
+ if (Ctor && Options.PCTable) {
+ auto SecStartEnd = CreateSecStartEnd(M, SanCovPCsSectionName, IntptrPtrTy);
+ FunctionCallee InitFunction = declareSanitizerInitFunction(
+ M, SanCovPCsInitName, {IntptrPtrTy, IntptrPtrTy});
+ IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
+ IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
+ }
+ // We don't reference these arrays directly in any of our runtime functions,
+ // so we need to prevent them from being dead stripped.
+ if (TargetTriple.isOSBinFormatMachO())
+ appendToUsed(M, GlobalsToAppendToUsed);
+ appendToCompilerUsed(M, GlobalsToAppendToCompilerUsed);
+ return true;
+}
+
+// True if block has successors and it dominates all of them.
+static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) {
if (succ_empty(BB))
- return false;
-
+ return false;
+
return llvm::all_of(successors(BB), [&](const BasicBlock *SUCC) {
return DT->dominates(BB, SUCC);
});
-}
-
-// True if block has predecessors and it postdominates all of them.
-static bool isFullPostDominator(const BasicBlock *BB,
- const PostDominatorTree *PDT) {
+}
+
+// True if block has predecessors and it postdominates all of them.
+static bool isFullPostDominator(const BasicBlock *BB,
+ const PostDominatorTree *PDT) {
if (pred_empty(BB))
- return false;
-
+ return false;
+
return llvm::all_of(predecessors(BB), [&](const BasicBlock *PRED) {
return PDT->dominates(BB, PRED);
});
-}
-
-static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
- const DominatorTree *DT,
- const PostDominatorTree *PDT,
- const SanitizerCoverageOptions &Options) {
- // Don't insert coverage for blocks containing nothing but unreachable: we
- // will never call __sanitizer_cov() for them, so counting them in
- // NumberOfInstrumentedBlocks() might complicate calculation of code coverage
- // percentage. Also, unreachable instructions frequently have no debug
- // locations.
- if (isa<UnreachableInst>(BB->getFirstNonPHIOrDbgOrLifetime()))
- return false;
-
- // Don't insert coverage into blocks without a valid insertion point
- // (catchswitch blocks).
- if (BB->getFirstInsertionPt() == BB->end())
- return false;
-
- if (Options.NoPrune || &F.getEntryBlock() == BB)
- return true;
-
- if (Options.CoverageType == SanitizerCoverageOptions::SCK_Function &&
- &F.getEntryBlock() != BB)
- return false;
-
- // Do not instrument full dominators, or full post-dominators with multiple
- // predecessors.
- return !isFullDominator(BB, DT)
- && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
-}
-
-// Returns true iff From->To is a backedge.
-// A twist here is that we treat From->To as a backedge if
-// * To dominates From or
-// * To->UniqueSuccessor dominates From
-static bool IsBackEdge(BasicBlock *From, BasicBlock *To,
- const DominatorTree *DT) {
- if (DT->dominates(To, From))
- return true;
- if (auto Next = To->getUniqueSuccessor())
- if (DT->dominates(Next, From))
- return true;
- return false;
-}
-
-// Prunes uninteresting Cmp instrumentation:
-// * CMP instructions that feed into loop backedge branch.
-//
-// Note that Cmp pruning is controlled by the same flag as the
-// BB pruning.
-static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT,
- const SanitizerCoverageOptions &Options) {
- if (!Options.NoPrune)
- if (CMP->hasOneUse())
- if (auto BR = dyn_cast<BranchInst>(CMP->user_back()))
- for (BasicBlock *B : BR->successors())
- if (IsBackEdge(BR->getParent(), B, DT))
- return false;
- return true;
-}
-
-void ModuleSanitizerCoverage::instrumentFunction(
- Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
- if (F.empty())
- return;
- if (F.getName().find(".module_ctor") != std::string::npos)
- return; // Should not instrument sanitizer init functions.
- if (F.getName().startswith("__sanitizer_"))
- return; // Don't instrument __sanitizer_* callbacks.
- // Don't touch available_externally functions; their actual body is elsewhere.
- if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
- return;
- // Don't instrument MSVC CRT configuration helpers. They may run before normal
- // initialization.
- if (F.getName() == "__local_stdio_printf_options" ||
- F.getName() == "__local_stdio_scanf_options")
- return;
- if (isa<UnreachableInst>(F.getEntryBlock().getTerminator()))
- return;
- // Don't instrument functions using SEH for now. Splitting basic blocks like
- // we do for coverage breaks WinEHPrepare.
- // FIXME: Remove this when SEH no longer uses landingpad pattern matching.
- if (F.hasPersonalityFn() &&
- isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- return;
- if (Allowlist && !Allowlist->inSection("coverage", "fun", F.getName()))
- return;
- if (Blocklist && Blocklist->inSection("coverage", "fun", F.getName()))
- return;
- if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)
- SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests());
- SmallVector<Instruction *, 8> IndirCalls;
- SmallVector<BasicBlock *, 16> BlocksToInstrument;
- SmallVector<Instruction *, 8> CmpTraceTargets;
- SmallVector<Instruction *, 8> SwitchTraceTargets;
- SmallVector<BinaryOperator *, 8> DivTraceTargets;
- SmallVector<GetElementPtrInst *, 8> GepTraceTargets;
-
- const DominatorTree *DT = DTCallback(F);
- const PostDominatorTree *PDT = PDTCallback(F);
- bool IsLeafFunc = true;
-
- for (auto &BB : F) {
- if (shouldInstrumentBlock(F, &BB, DT, PDT, Options))
- BlocksToInstrument.push_back(&BB);
- for (auto &Inst : BB) {
- if (Options.IndirectCalls) {
- CallBase *CB = dyn_cast<CallBase>(&Inst);
- if (CB && !CB->getCalledFunction())
- IndirCalls.push_back(&Inst);
- }
- if (Options.TraceCmp) {
- if (ICmpInst *CMP = dyn_cast<ICmpInst>(&Inst))
- if (IsInterestingCmp(CMP, DT, Options))
- CmpTraceTargets.push_back(&Inst);
- if (isa<SwitchInst>(&Inst))
- SwitchTraceTargets.push_back(&Inst);
- }
- if (Options.TraceDiv)
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&Inst))
- if (BO->getOpcode() == Instruction::SDiv ||
- BO->getOpcode() == Instruction::UDiv)
- DivTraceTargets.push_back(BO);
- if (Options.TraceGep)
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst))
- GepTraceTargets.push_back(GEP);
- if (Options.StackDepth)
- if (isa<InvokeInst>(Inst) ||
- (isa<CallInst>(Inst) && !isa<IntrinsicInst>(Inst)))
- IsLeafFunc = false;
- }
- }
-
- InjectCoverage(F, BlocksToInstrument, IsLeafFunc);
- InjectCoverageForIndirectCalls(F, IndirCalls);
- InjectTraceForCmp(F, CmpTraceTargets);
- InjectTraceForSwitch(F, SwitchTraceTargets);
- InjectTraceForDiv(F, DivTraceTargets);
- InjectTraceForGep(F, GepTraceTargets);
-}
-
-GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection(
- size_t NumElements, Function &F, Type *Ty, const char *Section) {
- ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
- auto Array = new GlobalVariable(
- *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
- Constant::getNullValue(ArrayTy), "__sancov_gen_");
-
- if (TargetTriple.supportsCOMDAT() && !F.isInterposable())
- if (auto Comdat =
- GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
- Array->setComdat(Comdat);
- Array->setSection(getSectionName(Section));
- Array->setAlignment(Align(DL->getTypeStoreSize(Ty).getFixedSize()));
- GlobalsToAppendToUsed.push_back(Array);
- GlobalsToAppendToCompilerUsed.push_back(Array);
- MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
- Array->addMetadata(LLVMContext::MD_associated, *MD);
-
- return Array;
-}
-
-GlobalVariable *
-ModuleSanitizerCoverage::CreatePCArray(Function &F,
- ArrayRef<BasicBlock *> AllBlocks) {
- size_t N = AllBlocks.size();
- assert(N);
- SmallVector<Constant *, 32> PCs;
- IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt());
- for (size_t i = 0; i < N; i++) {
- if (&F.getEntryBlock() == AllBlocks[i]) {
- PCs.push_back((Constant *)IRB.CreatePointerCast(&F, IntptrPtrTy));
- PCs.push_back((Constant *)IRB.CreateIntToPtr(
- ConstantInt::get(IntptrTy, 1), IntptrPtrTy));
- } else {
- PCs.push_back((Constant *)IRB.CreatePointerCast(
- BlockAddress::get(AllBlocks[i]), IntptrPtrTy));
- PCs.push_back((Constant *)IRB.CreateIntToPtr(
- ConstantInt::get(IntptrTy, 0), IntptrPtrTy));
- }
- }
- auto *PCArray = CreateFunctionLocalArrayInSection(N * 2, F, IntptrPtrTy,
- SanCovPCsSectionName);
- PCArray->setInitializer(
- ConstantArray::get(ArrayType::get(IntptrPtrTy, N * 2), PCs));
- PCArray->setConstant(true);
-
- return PCArray;
-}
-
-void ModuleSanitizerCoverage::CreateFunctionLocalArrays(
- Function &F, ArrayRef<BasicBlock *> AllBlocks) {
- if (Options.TracePCGuard)
- FunctionGuardArray = CreateFunctionLocalArrayInSection(
- AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName);
-
- if (Options.Inline8bitCounters)
- Function8bitCounterArray = CreateFunctionLocalArrayInSection(
- AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
- if (Options.InlineBoolFlag)
- FunctionBoolArray = CreateFunctionLocalArrayInSection(
- AllBlocks.size(), F, Int1Ty, SanCovBoolFlagSectionName);
-
- if (Options.PCTable)
- FunctionPCsArray = CreatePCArray(F, AllBlocks);
-}
-
-bool ModuleSanitizerCoverage::InjectCoverage(Function &F,
- ArrayRef<BasicBlock *> AllBlocks,
- bool IsLeafFunc) {
- if (AllBlocks.empty()) return false;
- CreateFunctionLocalArrays(F, AllBlocks);
- for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
- InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc);
- return true;
-}
-
-// On every indirect call we insert a call to the run-time function
-// __sanitizer_cov_trace_pc_indir, passing the callee address as its only
-// explicit parameter. The address of the caller is passed implicitly via the
-// caller's PC.
-void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls(
- Function &F, ArrayRef<Instruction *> IndirCalls) {
- if (IndirCalls.empty())
- return;
- assert(Options.TracePC || Options.TracePCGuard ||
- Options.Inline8bitCounters || Options.InlineBoolFlag);
- for (auto I : IndirCalls) {
- IRBuilder<> IRB(I);
- CallBase &CB = cast<CallBase>(*I);
- Value *Callee = CB.getCalledOperand();
- if (isa<InlineAsm>(Callee))
- continue;
- IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy));
- }
-}
-
-// For every switch statement we insert a call:
-// __sanitizer_cov_trace_switch(CondValue,
-// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... })
-
-void ModuleSanitizerCoverage::InjectTraceForSwitch(
- Function &, ArrayRef<Instruction *> SwitchTraceTargets) {
- for (auto I : SwitchTraceTargets) {
- if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
- IRBuilder<> IRB(I);
- SmallVector<Constant *, 16> Initializers;
- Value *Cond = SI->getCondition();
- if (Cond->getType()->getScalarSizeInBits() >
- Int64Ty->getScalarSizeInBits())
- continue;
- Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases()));
- Initializers.push_back(
- ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits()));
- if (Cond->getType()->getScalarSizeInBits() <
- Int64Ty->getScalarSizeInBits())
- Cond = IRB.CreateIntCast(Cond, Int64Ty, false);
- for (auto It : SI->cases()) {
- Constant *C = It.getCaseValue();
- if (C->getType()->getScalarSizeInBits() <
- Int64Ty->getScalarSizeInBits())
- C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
- Initializers.push_back(C);
- }
+}
+
+static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
+ const DominatorTree *DT,
+ const PostDominatorTree *PDT,
+ const SanitizerCoverageOptions &Options) {
+ // Don't insert coverage for blocks containing nothing but unreachable: we
+ // will never call __sanitizer_cov() for them, so counting them in
+ // NumberOfInstrumentedBlocks() might complicate calculation of code coverage
+ // percentage. Also, unreachable instructions frequently have no debug
+ // locations.
+ if (isa<UnreachableInst>(BB->getFirstNonPHIOrDbgOrLifetime()))
+ return false;
+
+ // Don't insert coverage into blocks without a valid insertion point
+ // (catchswitch blocks).
+ if (BB->getFirstInsertionPt() == BB->end())
+ return false;
+
+ if (Options.NoPrune || &F.getEntryBlock() == BB)
+ return true;
+
+ if (Options.CoverageType == SanitizerCoverageOptions::SCK_Function &&
+ &F.getEntryBlock() != BB)
+ return false;
+
+ // Do not instrument full dominators, or full post-dominators with multiple
+ // predecessors.
+ return !isFullDominator(BB, DT)
+ && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
+}
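+// Example: in a simple if/else diamond, the entry block is kept (it is always
+// instrumented), the two branch arms are instrumented, and the join block is
+// pruned because it post-dominates both of its predecessors; covering the
+// arms is enough to reconstruct whether the join executed.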
+
+// Returns true iff From->To is a backedge.
+// A twist here is that we treat From->To as a backedge if
+// * To dominates From or
+// * To->UniqueSuccessor dominates From
+static bool IsBackEdge(BasicBlock *From, BasicBlock *To,
+ const DominatorTree *DT) {
+ if (DT->dominates(To, From))
+ return true;
+ if (auto Next = To->getUniqueSuccessor())
+ if (DT->dominates(Next, From))
+ return true;
+ return false;
+}
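+// Example: for a loop whose latch branches back to the header, latch->header
+// is a backedge because the header dominates the latch; the UniqueSuccessor
+// clause additionally treats an edge into a block that merely falls through
+// into such a dominating header (e.g. a split critical edge) as a backedge.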
+
+// Prunes uninteresting Cmp instrumentation:
+// * CMP instructions that feed into loop backedge branch.
+//
+// Note that Cmp pruning is controlled by the same flag as the
+// BB pruning.
+static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT,
+ const SanitizerCoverageOptions &Options) {
+ if (!Options.NoPrune)
+ if (CMP->hasOneUse())
+ if (auto BR = dyn_cast<BranchInst>(CMP->user_back()))
+ for (BasicBlock *B : BR->successors())
+ if (IsBackEdge(BR->getParent(), B, DT))
+ return false;
+ return true;
+}
+
+void ModuleSanitizerCoverage::instrumentFunction(
+ Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
+ if (F.empty())
+ return;
+ if (F.getName().find(".module_ctor") != std::string::npos)
+ return; // Should not instrument sanitizer init functions.
+ if (F.getName().startswith("__sanitizer_"))
+ return; // Don't instrument __sanitizer_* callbacks.
+ // Don't touch available_externally functions; their actual body is elsewhere.
+ if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+ return;
+ // Don't instrument MSVC CRT configuration helpers. They may run before normal
+ // initialization.
+ if (F.getName() == "__local_stdio_printf_options" ||
+ F.getName() == "__local_stdio_scanf_options")
+ return;
+ if (isa<UnreachableInst>(F.getEntryBlock().getTerminator()))
+ return;
+ // Don't instrument functions using SEH for now. Splitting basic blocks like
+ // we do for coverage breaks WinEHPrepare.
+ // FIXME: Remove this when SEH no longer uses landingpad pattern matching.
+ if (F.hasPersonalityFn() &&
+ isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ return;
+ if (Allowlist && !Allowlist->inSection("coverage", "fun", F.getName()))
+ return;
+ if (Blocklist && Blocklist->inSection("coverage", "fun", F.getName()))
+ return;
+ if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)
+ SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests());
+ SmallVector<Instruction *, 8> IndirCalls;
+ SmallVector<BasicBlock *, 16> BlocksToInstrument;
+ SmallVector<Instruction *, 8> CmpTraceTargets;
+ SmallVector<Instruction *, 8> SwitchTraceTargets;
+ SmallVector<BinaryOperator *, 8> DivTraceTargets;
+ SmallVector<GetElementPtrInst *, 8> GepTraceTargets;
+
+ const DominatorTree *DT = DTCallback(F);
+ const PostDominatorTree *PDT = PDTCallback(F);
+ bool IsLeafFunc = true;
+
+ for (auto &BB : F) {
+ if (shouldInstrumentBlock(F, &BB, DT, PDT, Options))
+ BlocksToInstrument.push_back(&BB);
+ for (auto &Inst : BB) {
+ if (Options.IndirectCalls) {
+ CallBase *CB = dyn_cast<CallBase>(&Inst);
+ if (CB && !CB->getCalledFunction())
+ IndirCalls.push_back(&Inst);
+ }
+ if (Options.TraceCmp) {
+ if (ICmpInst *CMP = dyn_cast<ICmpInst>(&Inst))
+ if (IsInterestingCmp(CMP, DT, Options))
+ CmpTraceTargets.push_back(&Inst);
+ if (isa<SwitchInst>(&Inst))
+ SwitchTraceTargets.push_back(&Inst);
+ }
+ if (Options.TraceDiv)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&Inst))
+ if (BO->getOpcode() == Instruction::SDiv ||
+ BO->getOpcode() == Instruction::UDiv)
+ DivTraceTargets.push_back(BO);
+ if (Options.TraceGep)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst))
+ GepTraceTargets.push_back(GEP);
+ if (Options.StackDepth)
+ if (isa<InvokeInst>(Inst) ||
+ (isa<CallInst>(Inst) && !isa<IntrinsicInst>(Inst)))
+ IsLeafFunc = false;
+ }
+ }
+
+ InjectCoverage(F, BlocksToInstrument, IsLeafFunc);
+ InjectCoverageForIndirectCalls(F, IndirCalls);
+ InjectTraceForCmp(F, CmpTraceTargets);
+ InjectTraceForSwitch(F, SwitchTraceTargets);
+ InjectTraceForDiv(F, DivTraceTargets);
+ InjectTraceForGep(F, GepTraceTargets);
+}
+
+GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection(
+ size_t NumElements, Function &F, Type *Ty, const char *Section) {
+ ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
+ auto Array = new GlobalVariable(
+ *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
+ Constant::getNullValue(ArrayTy), "__sancov_gen_");
+
+ if (TargetTriple.supportsCOMDAT() && !F.isInterposable())
+ if (auto Comdat =
+ GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
+ Array->setComdat(Comdat);
+ Array->setSection(getSectionName(Section));
+ Array->setAlignment(Align(DL->getTypeStoreSize(Ty).getFixedSize()));
+ GlobalsToAppendToUsed.push_back(Array);
+ GlobalsToAppendToCompilerUsed.push_back(Array);
+ MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
+ Array->addMetadata(LLVMContext::MD_associated, *MD);
+
+ return Array;
+}
+
+GlobalVariable *
+ModuleSanitizerCoverage::CreatePCArray(Function &F,
+ ArrayRef<BasicBlock *> AllBlocks) {
+ size_t N = AllBlocks.size();
+ assert(N);
+ SmallVector<Constant *, 32> PCs;
+ IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt());
+ for (size_t i = 0; i < N; i++) {
+ if (&F.getEntryBlock() == AllBlocks[i]) {
+ PCs.push_back((Constant *)IRB.CreatePointerCast(&F, IntptrPtrTy));
+ PCs.push_back((Constant *)IRB.CreateIntToPtr(
+ ConstantInt::get(IntptrTy, 1), IntptrPtrTy));
+ } else {
+ PCs.push_back((Constant *)IRB.CreatePointerCast(
+ BlockAddress::get(AllBlocks[i]), IntptrPtrTy));
+ PCs.push_back((Constant *)IRB.CreateIntToPtr(
+ ConstantInt::get(IntptrTy, 0), IntptrPtrTy));
+ }
+ }
+ auto *PCArray = CreateFunctionLocalArrayInSection(N * 2, F, IntptrPtrTy,
+ SanCovPCsSectionName);
+ PCArray->setInitializer(
+ ConstantArray::get(ArrayType::get(IntptrPtrTy, N * 2), PCs));
+ PCArray->setConstant(true);
+
+ return PCArray;
+}
+
+void ModuleSanitizerCoverage::CreateFunctionLocalArrays(
+ Function &F, ArrayRef<BasicBlock *> AllBlocks) {
+ if (Options.TracePCGuard)
+ FunctionGuardArray = CreateFunctionLocalArrayInSection(
+ AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName);
+
+ if (Options.Inline8bitCounters)
+ Function8bitCounterArray = CreateFunctionLocalArrayInSection(
+ AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
+ if (Options.InlineBoolFlag)
+ FunctionBoolArray = CreateFunctionLocalArrayInSection(
+ AllBlocks.size(), F, Int1Ty, SanCovBoolFlagSectionName);
+
+ if (Options.PCTable)
+ FunctionPCsArray = CreatePCArray(F, AllBlocks);
+}
+
+bool ModuleSanitizerCoverage::InjectCoverage(Function &F,
+ ArrayRef<BasicBlock *> AllBlocks,
+ bool IsLeafFunc) {
+ if (AllBlocks.empty()) return false;
+ CreateFunctionLocalArrays(F, AllBlocks);
+ for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
+ InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc);
+ return true;
+}
+
+// On every indirect call we insert a call to the run-time function
+// __sanitizer_cov_trace_pc_indir, passing the callee address as its only
+// explicit parameter. The address of the caller is passed implicitly via the
+// caller's PC.
+void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls(
+ Function &F, ArrayRef<Instruction *> IndirCalls) {
+ if (IndirCalls.empty())
+ return;
+ assert(Options.TracePC || Options.TracePCGuard ||
+ Options.Inline8bitCounters || Options.InlineBoolFlag);
+ for (auto I : IndirCalls) {
+ IRBuilder<> IRB(I);
+ CallBase &CB = cast<CallBase>(*I);
+ Value *Callee = CB.getCalledOperand();
+ if (isa<InlineAsm>(Callee))
+ continue;
+ IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy));
+ }
+}
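+// The injected code for an indirect call through %fp is roughly:
+//   %callee = ptrtoint i8* %fp to iN        ; N = pointer width of the target
+//   call void @__sanitizer_cov_trace_pc_indir(iN %callee)
+// inserted immediately before the original call; inline-asm callees are
+// skipped.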
+
+// For every switch statement we insert a call:
+// __sanitizer_cov_trace_switch(CondValue,
+// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... })
+
+void ModuleSanitizerCoverage::InjectTraceForSwitch(
+ Function &, ArrayRef<Instruction *> SwitchTraceTargets) {
+ for (auto I : SwitchTraceTargets) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ IRBuilder<> IRB(I);
+ SmallVector<Constant *, 16> Initializers;
+ Value *Cond = SI->getCondition();
+ if (Cond->getType()->getScalarSizeInBits() >
+ Int64Ty->getScalarSizeInBits())
+ continue;
+ Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases()));
+ Initializers.push_back(
+ ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits()));
+ if (Cond->getType()->getScalarSizeInBits() <
+ Int64Ty->getScalarSizeInBits())
+ Cond = IRB.CreateIntCast(Cond, Int64Ty, false);
+ for (auto It : SI->cases()) {
+ Constant *C = It.getCaseValue();
+ if (C->getType()->getScalarSizeInBits() <
+ Int64Ty->getScalarSizeInBits())
+ C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
+ Initializers.push_back(C);
+ }
llvm::sort(drop_begin(Initializers, 2),
- [](const Constant *A, const Constant *B) {
- return cast<ConstantInt>(A)->getLimitedValue() <
- cast<ConstantInt>(B)->getLimitedValue();
- });
- ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
- GlobalVariable *GV = new GlobalVariable(
- *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
- ConstantArray::get(ArrayOfInt64Ty, Initializers),
- "__sancov_gen_cov_switch_values");
- IRB.CreateCall(SanCovTraceSwitchFunction,
- {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)});
- }
- }
-}
-
-void ModuleSanitizerCoverage::InjectTraceForDiv(
- Function &, ArrayRef<BinaryOperator *> DivTraceTargets) {
- for (auto BO : DivTraceTargets) {
- IRBuilder<> IRB(BO);
- Value *A1 = BO->getOperand(1);
- if (isa<ConstantInt>(A1)) continue;
- if (!A1->getType()->isIntegerTy())
- continue;
- uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType());
- int CallbackIdx = TypeSize == 32 ? 0 :
- TypeSize == 64 ? 1 : -1;
- if (CallbackIdx < 0) continue;
- auto Ty = Type::getIntNTy(*C, TypeSize);
- IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx],
- {IRB.CreateIntCast(A1, Ty, true)});
- }
-}
-
-void ModuleSanitizerCoverage::InjectTraceForGep(
- Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) {
- for (auto GEP : GepTraceTargets) {
- IRBuilder<> IRB(GEP);
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
- if (!isa<ConstantInt>(*I) && (*I)->getType()->isIntegerTy())
- IRB.CreateCall(SanCovTraceGepFunction,
- {IRB.CreateIntCast(*I, IntptrTy, true)});
- }
-}
-
-void ModuleSanitizerCoverage::InjectTraceForCmp(
- Function &, ArrayRef<Instruction *> CmpTraceTargets) {
- for (auto I : CmpTraceTargets) {
- if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) {
- IRBuilder<> IRB(ICMP);
- Value *A0 = ICMP->getOperand(0);
- Value *A1 = ICMP->getOperand(1);
- if (!A0->getType()->isIntegerTy())
- continue;
- uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
- int CallbackIdx = TypeSize == 8 ? 0 :
- TypeSize == 16 ? 1 :
- TypeSize == 32 ? 2 :
- TypeSize == 64 ? 3 : -1;
- if (CallbackIdx < 0) continue;
- // Emit __sanitizer_cov_trace_cmp{1,2,4,8}(A0, A1) (or the _const_ variant).
- auto CallbackFunc = SanCovTraceCmpFunction[CallbackIdx];
- bool FirstIsConst = isa<ConstantInt>(A0);
- bool SecondIsConst = isa<ConstantInt>(A1);
- // If both are const, then we don't need such a comparison.
- if (FirstIsConst && SecondIsConst) continue;
- // If only one is const, then make it the first callback argument.
- if (FirstIsConst || SecondIsConst) {
- CallbackFunc = SanCovTraceConstCmpFunction[CallbackIdx];
- if (SecondIsConst)
- std::swap(A0, A1);
- }
-
- auto Ty = Type::getIntNTy(*C, TypeSize);
- IRB.CreateCall(CallbackFunc, {IRB.CreateIntCast(A0, Ty, true),
- IRB.CreateIntCast(A1, Ty, true)});
- }
- }
-}
-
-void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
- size_t Idx,
- bool IsLeafFunc) {
- BasicBlock::iterator IP = BB.getFirstInsertionPt();
- bool IsEntryBB = &BB == &F.getEntryBlock();
- DebugLoc EntryLoc;
- if (IsEntryBB) {
- if (auto SP = F.getSubprogram())
+ [](const Constant *A, const Constant *B) {
+ return cast<ConstantInt>(A)->getLimitedValue() <
+ cast<ConstantInt>(B)->getLimitedValue();
+ });
+ ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
+ GlobalVariable *GV = new GlobalVariable(
+ *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfInt64Ty, Initializers),
+ "__sancov_gen_cov_switch_values");
+ IRB.CreateCall(SanCovTraceSwitchFunction,
+ {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)});
+ }
+ }
+}
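+// Example: for `switch i32 %x` with cases 1 and 42 this emits, roughly,
+//   @__sancov_gen_cov_switch_values = internal global [4 x i64]
+//       [i64 2, i64 32, i64 1, i64 42]   ; NumCases, bit width, sorted cases
+//   call void @__sanitizer_cov_trace_switch(i64 %x.zext, i64* <values>)
+// placed immediately before the switch; the condition is zero-extended to 64
+// bits first.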
+
+void ModuleSanitizerCoverage::InjectTraceForDiv(
+ Function &, ArrayRef<BinaryOperator *> DivTraceTargets) {
+ for (auto BO : DivTraceTargets) {
+ IRBuilder<> IRB(BO);
+ Value *A1 = BO->getOperand(1);
+ if (isa<ConstantInt>(A1)) continue;
+ if (!A1->getType()->isIntegerTy())
+ continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType());
+ int CallbackIdx = TypeSize == 32 ? 0 :
+ TypeSize == 64 ? 1 : -1;
+ if (CallbackIdx < 0) continue;
+ auto Ty = Type::getIntNTy(*C, TypeSize);
+ IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx],
+ {IRB.CreateIntCast(A1, Ty, true)});
+ }
+}
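+// Example: `udiv i64 %a, %b` with a non-constant divisor gets, roughly,
+//   call void @__sanitizer_cov_trace_div8(i64 %b)
+// emitted just before the division; constant divisors, non-integer types, and
+// widths other than 32 and 64 bits are skipped.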
+
+void ModuleSanitizerCoverage::InjectTraceForGep(
+ Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) {
+ for (auto GEP : GepTraceTargets) {
+ IRBuilder<> IRB(GEP);
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ if (!isa<ConstantInt>(*I) && (*I)->getType()->isIntegerTy())
+ IRB.CreateCall(SanCovTraceGepFunction,
+ {IRB.CreateIntCast(*I, IntptrTy, true)});
+ }
+}
+
+void ModuleSanitizerCoverage::InjectTraceForCmp(
+ Function &, ArrayRef<Instruction *> CmpTraceTargets) {
+ for (auto I : CmpTraceTargets) {
+ if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) {
+ IRBuilder<> IRB(ICMP);
+ Value *A0 = ICMP->getOperand(0);
+ Value *A1 = ICMP->getOperand(1);
+ if (!A0->getType()->isIntegerTy())
+ continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
+ int CallbackIdx = TypeSize == 8 ? 0 :
+ TypeSize == 16 ? 1 :
+ TypeSize == 32 ? 2 :
+ TypeSize == 64 ? 3 : -1;
+ if (CallbackIdx < 0) continue;
+ // Emit __sanitizer_cov_trace_cmp{1,2,4,8}(A0, A1) (or the _const_ variant).
+ auto CallbackFunc = SanCovTraceCmpFunction[CallbackIdx];
+ bool FirstIsConst = isa<ConstantInt>(A0);
+ bool SecondIsConst = isa<ConstantInt>(A1);
+ // If both are const, then we don't need such a comparison.
+ if (FirstIsConst && SecondIsConst) continue;
+ // If only one is const, then make it the first callback argument.
+ if (FirstIsConst || SecondIsConst) {
+ CallbackFunc = SanCovTraceConstCmpFunction[CallbackIdx];
+ if (SecondIsConst)
+ std::swap(A0, A1);
+ }
+
+ auto Ty = Type::getIntNTy(*C, TypeSize);
+ IRB.CreateCall(CallbackFunc, {IRB.CreateIntCast(A0, Ty, true),
+ IRB.CreateIntCast(A1, Ty, true)});
+ }
+ }
+}
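+// Example: `icmp eq i32 %x, 1234` becomes, roughly,
+//   call void @__sanitizer_cov_trace_const_cmp4(i32 1234, i32 %x)
+// with the constant operand moved into the first argument, while a compare of
+// two non-constant i64 values calls __sanitizer_cov_trace_cmp8 instead.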
+
+void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
+ size_t Idx,
+ bool IsLeafFunc) {
+ BasicBlock::iterator IP = BB.getFirstInsertionPt();
+ bool IsEntryBB = &BB == &F.getEntryBlock();
+ DebugLoc EntryLoc;
+ if (IsEntryBB) {
+ if (auto SP = F.getSubprogram())
EntryLoc = DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
- // Keep static allocas and llvm.localescape calls in the entry block. Even
- // if we aren't splitting the block, it's nice for allocas to be before
- // calls.
- IP = PrepareToSplitEntryBlock(BB, IP);
- } else {
- EntryLoc = IP->getDebugLoc();
- }
-
- IRBuilder<> IRB(&*IP);
- IRB.SetCurrentDebugLocation(EntryLoc);
- if (Options.TracePC) {
- IRB.CreateCall(SanCovTracePC)
- ->setCannotMerge(); // gets the PC using GET_CALLER_PC.
- }
- if (Options.TracePCGuard) {
- auto GuardPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
- ConstantInt::get(IntptrTy, Idx * 4)),
- Int32PtrTy);
- IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
- }
- if (Options.Inline8bitCounters) {
- auto CounterPtr = IRB.CreateGEP(
- Function8bitCounterArray->getValueType(), Function8bitCounterArray,
- {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
- auto Load = IRB.CreateLoad(Int8Ty, CounterPtr);
- auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
- auto Store = IRB.CreateStore(Inc, CounterPtr);
- SetNoSanitizeMetadata(Load);
- SetNoSanitizeMetadata(Store);
- }
- if (Options.InlineBoolFlag) {
- auto FlagPtr = IRB.CreateGEP(
- FunctionBoolArray->getValueType(), FunctionBoolArray,
- {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
- auto Load = IRB.CreateLoad(Int1Ty, FlagPtr);
- auto ThenTerm =
- SplitBlockAndInsertIfThen(IRB.CreateIsNull(Load), &*IP, false);
- IRBuilder<> ThenIRB(ThenTerm);
- auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
- SetNoSanitizeMetadata(Load);
- SetNoSanitizeMetadata(Store);
- }
- if (Options.StackDepth && IsEntryBB && !IsLeafFunc) {
- // Check stack depth. If it's the deepest so far, record it.
- Module *M = F.getParent();
- Function *GetFrameAddr = Intrinsic::getDeclaration(
- M, Intrinsic::frameaddress,
- IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
- auto FrameAddrPtr =
- IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)});
- auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy);
- auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack);
- auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack);
- auto ThenTerm = SplitBlockAndInsertIfThen(IsStackLower, &*IP, false);
- IRBuilder<> ThenIRB(ThenTerm);
- auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
- SetNoSanitizeMetadata(LowestStack);
- SetNoSanitizeMetadata(Store);
- }
-}
-
-std::string
-ModuleSanitizerCoverage::getSectionName(const std::string &Section) const {
- if (TargetTriple.isOSBinFormatCOFF()) {
- if (Section == SanCovCountersSectionName)
- return ".SCOV$CM";
- if (Section == SanCovBoolFlagSectionName)
- return ".SCOV$BM";
- if (Section == SanCovPCsSectionName)
- return ".SCOVP$M";
- return ".SCOV$GM"; // For SanCovGuardsSectionName.
- }
- if (TargetTriple.isOSBinFormatMachO())
- return "__DATA,__" + Section;
- return "__" + Section;
-}
-
-std::string
-ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const {
- if (TargetTriple.isOSBinFormatMachO())
- return "\1section$start$__DATA$__" + Section;
- return "__start___" + Section;
-}
-
-std::string
-ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const {
- if (TargetTriple.isOSBinFormatMachO())
- return "\1section$end$__DATA$__" + Section;
- return "__stop___" + Section;
-}
-
-char ModuleSanitizerCoverageLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov",
- "Pass for instrumenting coverage on functions", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov",
- "Pass for instrumenting coverage on functions", false,
- false)
-ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass(
- const SanitizerCoverageOptions &Options,
- const std::vector<std::string> &AllowlistFiles,
- const std::vector<std::string> &BlocklistFiles) {
- return new ModuleSanitizerCoverageLegacyPass(Options, AllowlistFiles,
- BlocklistFiles);
-}
+ // Keep static allocas and llvm.localescape calls in the entry block. Even
+ // if we aren't splitting the block, it's nice for allocas to be before
+ // calls.
+ IP = PrepareToSplitEntryBlock(BB, IP);
+ } else {
+ EntryLoc = IP->getDebugLoc();
+ }
+
+ IRBuilder<> IRB(&*IP);
+ IRB.SetCurrentDebugLocation(EntryLoc);
+ if (Options.TracePC) {
+ IRB.CreateCall(SanCovTracePC)
+ ->setCannotMerge(); // gets the PC using GET_CALLER_PC.
+ }
+ if (Options.TracePCGuard) {
+ auto GuardPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
+ ConstantInt::get(IntptrTy, Idx * 4)),
+ Int32PtrTy);
+ IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
+ }
+ if (Options.Inline8bitCounters) {
+ auto CounterPtr = IRB.CreateGEP(
+ Function8bitCounterArray->getValueType(), Function8bitCounterArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(Int8Ty, CounterPtr);
+ auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
+ auto Store = IRB.CreateStore(Inc, CounterPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
+ if (Options.InlineBoolFlag) {
+ auto FlagPtr = IRB.CreateGEP(
+ FunctionBoolArray->getValueType(), FunctionBoolArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(Int1Ty, FlagPtr);
+ auto ThenTerm =
+ SplitBlockAndInsertIfThen(IRB.CreateIsNull(Load), &*IP, false);
+ IRBuilder<> ThenIRB(ThenTerm);
+ auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
+ if (Options.StackDepth && IsEntryBB && !IsLeafFunc) {
+ // Check stack depth. If it's the deepest so far, record it.
+ Module *M = F.getParent();
+ Function *GetFrameAddr = Intrinsic::getDeclaration(
+ M, Intrinsic::frameaddress,
+ IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
+ auto FrameAddrPtr =
+ IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)});
+ auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy);
+ auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack);
+ auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack);
+ auto ThenTerm = SplitBlockAndInsertIfThen(IsStackLower, &*IP, false);
+ IRBuilder<> ThenIRB(ThenTerm);
+ auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
+ SetNoSanitizeMetadata(LowestStack);
+ SetNoSanitizeMetadata(Store);
+ }
+}
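
Read together, the branches above boil down to a few cheap per-block actions at run time. Below is a minimal standalone C++ sketch of that effect for the inline modes only (Inline8bitCounters, InlineBoolFlag, StackDepth); the TracePC/TracePCGuard modes call into the runtime instead, and the names `Counters`, `Flags`, `LowestStack` are illustrative stand-ins, not the symbols the pass actually emits. The real instrumentation also tags these loads and stores with nosanitize metadata so other sanitizers ignore them.

#include <cstdint>

// Illustrative stand-ins for the per-function globals the pass places in
// dedicated sections (the sanitizer runtime finds the real ones via
// section start/stop symbols).
static uint8_t   Counters[128];                              // Options.Inline8bitCounters
static bool      Flags[128];                                 // Options.InlineBoolFlag
static uintptr_t LowestStack = ~static_cast<uintptr_t>(0);   // Options.StackDepth

inline void coverageHit(unsigned Idx, const void *FrameAddr) {
  Counters[Idx] = Counters[Idx] + 1;   // racy 8-bit hit counter: load, add 1, store
  if (!Flags[Idx])                     // load, test, then a set-once store,
    Flags[Idx] = true;                 // mirroring the guarded bool-flag branch above
  auto FA = reinterpret_cast<uintptr_t>(FrameAddr);
  if (FA < LowestStack)                // a deeper frame has a lower address
    LowestStack = FA;                  // record the deepest stack seen so far
}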
+
+std::string
+ModuleSanitizerCoverage::getSectionName(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatCOFF()) {
+ if (Section == SanCovCountersSectionName)
+ return ".SCOV$CM";
+ if (Section == SanCovBoolFlagSectionName)
+ return ".SCOV$BM";
+ if (Section == SanCovPCsSectionName)
+ return ".SCOVP$M";
+ return ".SCOV$GM"; // For SanCovGuardsSectionName.
+ }
+ if (TargetTriple.isOSBinFormatMachO())
+ return "__DATA,__" + Section;
+ return "__" + Section;
+}
+
+std::string
+ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$start$__DATA$__" + Section;
+ return "__start___" + Section;
+}
+
+std::string
+ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$end$__DATA$__" + Section;
+ return "__stop___" + Section;
+}
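
To make the three helpers concrete, here is a standalone restatement for one assumed section name, "sancov_cntrs" (the customary value of SanCovCountersSectionName; treat the exact string as an assumption, since its definition is not part of this hunk):

#include <string>

std::string elfSection(const std::string &S)   { return "__" + S; }          // "__sancov_cntrs"
std::string elfStart(const std::string &S)     { return "__start___" + S; }  // "__start___sancov_cntrs"
std::string elfStop(const std::string &S)      { return "__stop___" + S; }   // "__stop___sancov_cntrs"
std::string machoSection(const std::string &S) { return "__DATA,__" + S; }   // "__DATA,__sancov_cntrs"
// On COFF the pass instead emits grouped sections such as ".SCOV$CM"; the
// linker orders the $-suffixed groups, so no start/stop symbols are produced.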
+
+char ModuleSanitizerCoverageLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov",
+ "Pass for instrumenting coverage on functions", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov",
+ "Pass for instrumenting coverage on functions", false,
+ false)
+ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass(
+ const SanitizerCoverageOptions &Options,
+ const std::vector<std::string> &AllowlistFiles,
+ const std::vector<std::string> &BlocklistFiles) {
+ return new ModuleSanitizerCoverageLegacyPass(Options, AllowlistFiles,
+ BlocklistFiles);
+}
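
A hedged usage sketch for this legacy-PM entry point follows; the headers and option fields are the usual LLVM 12 ones and are not introduced by this patch, so treat the field choices as illustrative.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"

void addSanCov(llvm::legacy::PassManagerBase &PM) {
  llvm::SanitizerCoverageOptions Opts;
  Opts.CoverageType = llvm::SanitizerCoverageOptions::SCK_Edge;
  Opts.Inline8bitCounters = true;
  // Empty allow/block lists mean "no special-cased source files".
  PM.add(llvm::createModuleSanitizerCoverageLegacyPassPass(
      Opts, /*AllowlistFiles=*/{}, /*BlocklistFiles=*/{}));
}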
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 0b53ff8a83..783878cf1e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -1,113 +1,113 @@
-//===-- ThreadSanitizer.cpp - race detector -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of ThreadSanitizer, a race detector.
-//
-// The tool is under development; for details about previous versions, see
-// http://code.google.com/p/data-race-test
-//
-// The instrumentation phase is quite simple:
-// - Insert calls to run-time library before every memory access.
-// - Optimizations may apply to avoid instrumenting some of the accesses.
-// - Insert calls at function entry/exit.
-// The rest is handled by the run-time library.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
+//===-- ThreadSanitizer.cpp - race detector -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer, a race detector.
+//
+// The tool is under development; for details about previous versions, see
+// http://code.google.com/p/data-race-test
+//
+// The instrumentation phase is quite simple:
+// - Insert calls to run-time library before every memory access.
+// - Optimizations may apply to avoid instrumenting some of the accesses.
+// - Insert calls at function entry/exit.
+// The rest is handled by the run-time library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/EscapeEnumerator.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "tsan"
-
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tsan"
+
static cl::opt<bool> ClInstrumentMemoryAccesses(
- "tsan-instrument-memory-accesses", cl::init(true),
- cl::desc("Instrument memory accesses"), cl::Hidden);
+ "tsan-instrument-memory-accesses", cl::init(true),
+ cl::desc("Instrument memory accesses"), cl::Hidden);
static cl::opt<bool>
ClInstrumentFuncEntryExit("tsan-instrument-func-entry-exit", cl::init(true),
cl::desc("Instrument function entry and exit"),
cl::Hidden);
static cl::opt<bool> ClHandleCxxExceptions(
- "tsan-handle-cxx-exceptions", cl::init(true),
- cl::desc("Handle C++ exceptions (insert cleanup blocks for unwinding)"),
- cl::Hidden);
+ "tsan-handle-cxx-exceptions", cl::init(true),
+ cl::desc("Handle C++ exceptions (insert cleanup blocks for unwinding)"),
+ cl::Hidden);
static cl::opt<bool> ClInstrumentAtomics("tsan-instrument-atomics",
cl::init(true),
cl::desc("Instrument atomics"),
cl::Hidden);
static cl::opt<bool> ClInstrumentMemIntrinsics(
- "tsan-instrument-memintrinsics", cl::init(true),
- cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+ "tsan-instrument-memintrinsics", cl::init(true),
+ cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
static cl::opt<bool> ClDistinguishVolatile(
- "tsan-distinguish-volatile", cl::init(false),
- cl::desc("Emit special instrumentation for accesses to volatiles"),
- cl::Hidden);
+ "tsan-distinguish-volatile", cl::init(false),
+ cl::desc("Emit special instrumentation for accesses to volatiles"),
+ cl::Hidden);
static cl::opt<bool> ClInstrumentReadBeforeWrite(
- "tsan-instrument-read-before-write", cl::init(false),
- cl::desc("Do not eliminate read instrumentation for read-before-writes"),
- cl::Hidden);
+ "tsan-instrument-read-before-write", cl::init(false),
+ cl::desc("Do not eliminate read instrumentation for read-before-writes"),
+ cl::Hidden);
static cl::opt<bool> ClCompoundReadBeforeWrite(
"tsan-compound-read-before-write", cl::init(false),
cl::desc("Emit special compound instrumentation for reads-before-writes"),
cl::Hidden);
-
-STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
-STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
-STATISTIC(NumOmittedReadsBeforeWrite,
- "Number of reads ignored due to following writes");
-STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
-STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
-STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
-STATISTIC(NumOmittedReadsFromConstantGlobals,
- "Number of reads from constant globals");
-STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
-STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing");
-
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOmittedReadsBeforeWrite,
+ "Number of reads ignored due to following writes");
+STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
+STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
+STATISTIC(NumOmittedReadsFromConstantGlobals,
+ "Number of reads from constant globals");
+STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
+STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing");
+
const char kTsanModuleCtorName[] = "tsan.module_ctor";
const char kTsanInitName[] = "__tsan_init";
-
-namespace {
-
-/// ThreadSanitizer: instrument the code in module to find races.
-///
-/// Instantiating ThreadSanitizer inserts the tsan runtime library API function
-/// declarations into the module if they don't exist already. Instantiating
-/// ensures the __tsan_init function is in the list of global constructors for
-/// the module.
-struct ThreadSanitizer {
+
+namespace {
+
+/// ThreadSanitizer: instrument the code in module to find races.
+///
+/// Instantiating ThreadSanitizer inserts the tsan runtime library API function
+/// declarations into the module if they don't exist already. Instantiating
+/// ensures the __tsan_init function is in the list of global constructors for
+/// the module.
+struct ThreadSanitizer {
ThreadSanitizer() {
// Sanity check options and warn user.
if (ClInstrumentReadBeforeWrite && ClCompoundReadBeforeWrite) {
@@ -117,9 +117,9 @@ struct ThreadSanitizer {
}
}
- bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI);
-
-private:
+ bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI);
+
+private:
// Internal Instruction wrapper that contains more information about the
// Instruction from prior analysis.
struct InstructionInfo {
@@ -133,172 +133,172 @@ private:
unsigned Flags = 0;
};
- void initialize(Module &M);
+ void initialize(Module &M);
bool instrumentLoadOrStore(const InstructionInfo &II, const DataLayout &DL);
- bool instrumentAtomic(Instruction *I, const DataLayout &DL);
- bool instrumentMemIntrinsic(Instruction *I);
- void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local,
+ bool instrumentAtomic(Instruction *I, const DataLayout &DL);
+ bool instrumentMemIntrinsic(Instruction *I);
+ void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local,
SmallVectorImpl<InstructionInfo> &All,
- const DataLayout &DL);
- bool addrPointsToConstantData(Value *Addr);
- int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
- void InsertRuntimeIgnores(Function &F);
-
- Type *IntptrTy;
- FunctionCallee TsanFuncEntry;
- FunctionCallee TsanFuncExit;
- FunctionCallee TsanIgnoreBegin;
- FunctionCallee TsanIgnoreEnd;
-  // Access sizes are powers of two: 1, 2, 4, 8, 16.
- static const size_t kNumberOfAccessSizes = 5;
- FunctionCallee TsanRead[kNumberOfAccessSizes];
- FunctionCallee TsanWrite[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedRead[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedWrite[kNumberOfAccessSizes];
- FunctionCallee TsanVolatileRead[kNumberOfAccessSizes];
- FunctionCallee TsanVolatileWrite[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedVolatileRead[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedVolatileWrite[kNumberOfAccessSizes];
+ const DataLayout &DL);
+ bool addrPointsToConstantData(Value *Addr);
+ int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
+ void InsertRuntimeIgnores(Function &F);
+
+ Type *IntptrTy;
+ FunctionCallee TsanFuncEntry;
+ FunctionCallee TsanFuncExit;
+ FunctionCallee TsanIgnoreBegin;
+ FunctionCallee TsanIgnoreEnd;
+  // Access sizes are powers of two: 1, 2, 4, 8, 16.
+ static const size_t kNumberOfAccessSizes = 5;
+ FunctionCallee TsanRead[kNumberOfAccessSizes];
+ FunctionCallee TsanWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedRead[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanVolatileRead[kNumberOfAccessSizes];
+ FunctionCallee TsanVolatileWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedVolatileRead[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedVolatileWrite[kNumberOfAccessSizes];
FunctionCallee TsanCompoundRW[kNumberOfAccessSizes];
FunctionCallee TsanUnalignedCompoundRW[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicLoad[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicStore[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1]
- [kNumberOfAccessSizes];
- FunctionCallee TsanAtomicCAS[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicThreadFence;
- FunctionCallee TsanAtomicSignalFence;
- FunctionCallee TsanVptrUpdate;
- FunctionCallee TsanVptrLoad;
- FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
-};
-
-struct ThreadSanitizerLegacyPass : FunctionPass {
- ThreadSanitizerLegacyPass() : FunctionPass(ID) {
- initializeThreadSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- StringRef getPassName() const override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
- bool doInitialization(Module &M) override;
- static char ID; // Pass identification, replacement for typeid.
-private:
- Optional<ThreadSanitizer> TSan;
-};
-
-void insertModuleCtor(Module &M) {
- getOrCreateSanitizerCtorAndInitFunctions(
- M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{},
- /*InitArgs=*/{},
- // This callback is invoked when the functions are created the first
- // time. Hook them into the global ctors list in that case:
- [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
-}
-
-} // namespace
-
-PreservedAnalyses ThreadSanitizerPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- ThreadSanitizer TSan;
- if (TSan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-PreservedAnalyses ThreadSanitizerPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- insertModuleCtor(M);
- return PreservedAnalyses::none();
-}
-
-char ThreadSanitizerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan",
- "ThreadSanitizer: detects data races.", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(ThreadSanitizerLegacyPass, "tsan",
- "ThreadSanitizer: detects data races.", false, false)
-
-StringRef ThreadSanitizerLegacyPass::getPassName() const {
- return "ThreadSanitizerLegacyPass";
-}
-
-void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
-}
-
-bool ThreadSanitizerLegacyPass::doInitialization(Module &M) {
- insertModuleCtor(M);
- TSan.emplace();
- return true;
-}
-
-bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- TSan->sanitizeFunction(F, TLI);
- return true;
-}
-
-FunctionPass *llvm::createThreadSanitizerLegacyPassPass() {
- return new ThreadSanitizerLegacyPass();
-}
-
-void ThreadSanitizer::initialize(Module &M) {
- const DataLayout &DL = M.getDataLayout();
- IntptrTy = DL.getIntPtrType(M.getContext());
-
- IRBuilder<> IRB(M.getContext());
- AttributeList Attr;
- Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- // Initialize the callbacks.
- TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", Attr,
- IRB.getVoidTy(), IRB.getInt8PtrTy());
- TsanFuncExit =
- M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy());
- TsanIgnoreBegin = M.getOrInsertFunction("__tsan_ignore_thread_begin", Attr,
- IRB.getVoidTy());
- TsanIgnoreEnd =
- M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy());
- IntegerType *OrdTy = IRB.getInt32Ty();
- for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
- const unsigned ByteSize = 1U << i;
- const unsigned BitSize = ByteSize * 8;
- std::string ByteSizeStr = utostr(ByteSize);
- std::string BitSizeStr = utostr(BitSize);
- SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
- TsanRead[i] = M.getOrInsertFunction(ReadName, Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy());
-
- SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
- TsanWrite[i] = M.getOrInsertFunction(WriteName, Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
- TsanUnalignedRead[i] = M.getOrInsertFunction(
- UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
- TsanUnalignedWrite[i] = M.getOrInsertFunction(
- UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> VolatileReadName("__tsan_volatile_read" + ByteSizeStr);
- TsanVolatileRead[i] = M.getOrInsertFunction(
- VolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> VolatileWriteName("__tsan_volatile_write" + ByteSizeStr);
- TsanVolatileWrite[i] = M.getOrInsertFunction(
- VolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedVolatileReadName("__tsan_unaligned_volatile_read" +
- ByteSizeStr);
- TsanUnalignedVolatileRead[i] = M.getOrInsertFunction(
- UnalignedVolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedVolatileWriteName(
- "__tsan_unaligned_volatile_write" + ByteSizeStr);
- TsanUnalignedVolatileWrite[i] = M.getOrInsertFunction(
- UnalignedVolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
+ FunctionCallee TsanAtomicLoad[kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicStore[kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1]
+ [kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicCAS[kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicThreadFence;
+ FunctionCallee TsanAtomicSignalFence;
+ FunctionCallee TsanVptrUpdate;
+ FunctionCallee TsanVptrLoad;
+ FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
+};
+
+struct ThreadSanitizerLegacyPass : FunctionPass {
+ ThreadSanitizerLegacyPass() : FunctionPass(ID) {
+ initializeThreadSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+ bool doInitialization(Module &M) override;
+ static char ID; // Pass identification, replacement for typeid.
+private:
+ Optional<ThreadSanitizer> TSan;
+};
+
+void insertModuleCtor(Module &M) {
+ getOrCreateSanitizerCtorAndInitFunctions(
+ M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{},
+ // This callback is invoked when the functions are created the first
+ // time. Hook them into the global ctors list in that case:
+ [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
+}
+
+} // namespace
+
+PreservedAnalyses ThreadSanitizerPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ ThreadSanitizer TSan;
+ if (TSan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses ThreadSanitizerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ insertModuleCtor(M);
+ return PreservedAnalyses::none();
+}
+
+char ThreadSanitizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan",
+ "ThreadSanitizer: detects data races.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ThreadSanitizerLegacyPass, "tsan",
+ "ThreadSanitizer: detects data races.", false, false)
+
+StringRef ThreadSanitizerLegacyPass::getPassName() const {
+ return "ThreadSanitizerLegacyPass";
+}
+
+void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+bool ThreadSanitizerLegacyPass::doInitialization(Module &M) {
+ insertModuleCtor(M);
+ TSan.emplace();
+ return true;
+}
+
+bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ TSan->sanitizeFunction(F, TLI);
+ return true;
+}
+
+FunctionPass *llvm::createThreadSanitizerLegacyPassPass() {
+ return new ThreadSanitizerLegacyPass();
+}
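
For comparison with the legacy wiring above, here is a sketch of how the new-PM entry points shown earlier in this file are typically scheduled; the adaptor and header names are standard LLVM 12 API, not something this change adds.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"

void addTsan(llvm::ModulePassManager &MPM) {
  // Module-level run(): inserts tsan.module_ctor calling __tsan_init.
  MPM.addPass(llvm::ThreadSanitizerPass());
  // Function-level run(): instruments each function's memory accesses.
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::ThreadSanitizerPass()));
}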
+
+void ThreadSanitizer::initialize(Module &M) {
+ const DataLayout &DL = M.getDataLayout();
+ IntptrTy = DL.getIntPtrType(M.getContext());
+
+ IRBuilder<> IRB(M.getContext());
+ AttributeList Attr;
+ Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ // Initialize the callbacks.
+ TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", Attr,
+ IRB.getVoidTy(), IRB.getInt8PtrTy());
+ TsanFuncExit =
+ M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy());
+ TsanIgnoreBegin = M.getOrInsertFunction("__tsan_ignore_thread_begin", Attr,
+ IRB.getVoidTy());
+ TsanIgnoreEnd =
+ M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy());
+ IntegerType *OrdTy = IRB.getInt32Ty();
+ for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
+ const unsigned ByteSize = 1U << i;
+ const unsigned BitSize = ByteSize * 8;
+ std::string ByteSizeStr = utostr(ByteSize);
+ std::string BitSizeStr = utostr(BitSize);
+ SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
+ TsanRead[i] = M.getOrInsertFunction(ReadName, Attr, IRB.getVoidTy(),
+ IRB.getInt8PtrTy());
+
+ SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
+ TsanWrite[i] = M.getOrInsertFunction(WriteName, Attr, IRB.getVoidTy(),
+ IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
+ TsanUnalignedRead[i] = M.getOrInsertFunction(
+ UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
+ TsanUnalignedWrite[i] = M.getOrInsertFunction(
+ UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> VolatileReadName("__tsan_volatile_read" + ByteSizeStr);
+ TsanVolatileRead[i] = M.getOrInsertFunction(
+ VolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> VolatileWriteName("__tsan_volatile_write" + ByteSizeStr);
+ TsanVolatileWrite[i] = M.getOrInsertFunction(
+ VolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedVolatileReadName("__tsan_unaligned_volatile_read" +
+ ByteSizeStr);
+ TsanUnalignedVolatileRead[i] = M.getOrInsertFunction(
+ UnalignedVolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedVolatileWriteName(
+ "__tsan_unaligned_volatile_write" + ByteSizeStr);
+ TsanUnalignedVolatileWrite[i] = M.getOrInsertFunction(
+ UnalignedVolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
SmallString<64> CompoundRWName("__tsan_read_write" + ByteSizeStr);
TsanCompoundRW[i] = M.getOrInsertFunction(
CompoundRWName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
@@ -308,145 +308,145 @@ void ThreadSanitizer::initialize(Module &M) {
TsanUnalignedCompoundRW[i] = M.getOrInsertFunction(
UnalignedCompoundRWName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
- Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
- TsanAtomicLoad[i] =
- M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy);
-
- SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
- TsanAtomicStore[i] = M.getOrInsertFunction(
- AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy);
-
- for (unsigned Op = AtomicRMWInst::FIRST_BINOP;
- Op <= AtomicRMWInst::LAST_BINOP; ++Op) {
- TsanAtomicRMW[Op][i] = nullptr;
- const char *NamePart = nullptr;
- if (Op == AtomicRMWInst::Xchg)
- NamePart = "_exchange";
- else if (Op == AtomicRMWInst::Add)
- NamePart = "_fetch_add";
- else if (Op == AtomicRMWInst::Sub)
- NamePart = "_fetch_sub";
- else if (Op == AtomicRMWInst::And)
- NamePart = "_fetch_and";
- else if (Op == AtomicRMWInst::Or)
- NamePart = "_fetch_or";
- else if (Op == AtomicRMWInst::Xor)
- NamePart = "_fetch_xor";
- else if (Op == AtomicRMWInst::Nand)
- NamePart = "_fetch_nand";
- else
- continue;
- SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
- TsanAtomicRMW[Op][i] =
- M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy);
- }
-
- SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
- "_compare_exchange_val");
- TsanAtomicCAS[i] = M.getOrInsertFunction(AtomicCASName, Attr, Ty, PtrTy, Ty,
- Ty, OrdTy, OrdTy);
- }
- TsanVptrUpdate =
- M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy());
- TsanVptrLoad = M.getOrInsertFunction("__tsan_vptr_read", Attr,
- IRB.getVoidTy(), IRB.getInt8PtrTy());
- TsanAtomicThreadFence = M.getOrInsertFunction("__tsan_atomic_thread_fence",
- Attr, IRB.getVoidTy(), OrdTy);
- TsanAtomicSignalFence = M.getOrInsertFunction("__tsan_atomic_signal_fence",
- Attr, IRB.getVoidTy(), OrdTy);
-
- MemmoveFn =
- M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
- MemcpyFn =
- M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
- MemsetFn =
- M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
-}
-
-static bool isVtableAccess(Instruction *I) {
- if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
- return Tag->isTBAAVtableAccess();
- return false;
-}
-
-// Do not instrument known races/"benign races" that come from compiler
-// instrumentation. The user has no way of suppressing them.
-static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
- // Peel off GEPs and BitCasts.
- Addr = Addr->stripInBoundsOffsets();
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- if (GV->hasSection()) {
- StringRef SectionName = GV->getSection();
- // Check if the global is in the PGO counters section.
- auto OF = Triple(M->getTargetTriple()).getObjectFormat();
- if (SectionName.endswith(
- getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
- return false;
- }
-
- // Check if the global is private gcov data.
- if (GV->getName().startswith("__llvm_gcov") ||
- GV->getName().startswith("__llvm_gcda"))
- return false;
- }
-
-  // Do not instrument accesses from different address spaces; we cannot deal
- // with them.
- if (Addr) {
- Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return false;
- }
-
- return true;
-}
-
-bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
- // If this is a GEP, just analyze its pointer operand.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
- Addr = GEP->getPointerOperand();
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- if (GV->isConstant()) {
- // Reads from constant globals can not race with any writes.
- NumOmittedReadsFromConstantGlobals++;
- return true;
- }
- } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
- if (isVtableAccess(L)) {
- // Reads from a vtable pointer can not race with any writes.
- NumOmittedReadsFromVtable++;
- return true;
- }
- }
- return false;
-}
-
-// Instrumenting some of the accesses may be proven redundant.
-// Currently handled:
-// - read-before-write (within same BB, no calls between)
-// - not captured variables
-//
-// We do not handle some of the patterns that should not survive
-// after the classic compiler optimizations.
-// E.g. two reads from the same temp should be eliminated by CSE,
-// two writes should be eliminated by DSE, etc.
-//
-// 'Local' is a vector of insns within the same BB (no calls between).
-// 'All' is a vector of insns that will be instrumented.
-void ThreadSanitizer::chooseInstructionsToInstrument(
+ Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
+ TsanAtomicLoad[i] =
+ M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy);
+
+ SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
+ TsanAtomicStore[i] = M.getOrInsertFunction(
+ AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy);
+
+ for (unsigned Op = AtomicRMWInst::FIRST_BINOP;
+ Op <= AtomicRMWInst::LAST_BINOP; ++Op) {
+ TsanAtomicRMW[Op][i] = nullptr;
+ const char *NamePart = nullptr;
+ if (Op == AtomicRMWInst::Xchg)
+ NamePart = "_exchange";
+ else if (Op == AtomicRMWInst::Add)
+ NamePart = "_fetch_add";
+ else if (Op == AtomicRMWInst::Sub)
+ NamePart = "_fetch_sub";
+ else if (Op == AtomicRMWInst::And)
+ NamePart = "_fetch_and";
+ else if (Op == AtomicRMWInst::Or)
+ NamePart = "_fetch_or";
+ else if (Op == AtomicRMWInst::Xor)
+ NamePart = "_fetch_xor";
+ else if (Op == AtomicRMWInst::Nand)
+ NamePart = "_fetch_nand";
+ else
+ continue;
+ SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
+ TsanAtomicRMW[Op][i] =
+ M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy);
+ }
+
+ SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
+ "_compare_exchange_val");
+ TsanAtomicCAS[i] = M.getOrInsertFunction(AtomicCASName, Attr, Ty, PtrTy, Ty,
+ Ty, OrdTy, OrdTy);
+ }
+ TsanVptrUpdate =
+ M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy());
+ TsanVptrLoad = M.getOrInsertFunction("__tsan_vptr_read", Attr,
+ IRB.getVoidTy(), IRB.getInt8PtrTy());
+ TsanAtomicThreadFence = M.getOrInsertFunction("__tsan_atomic_thread_fence",
+ Attr, IRB.getVoidTy(), OrdTy);
+ TsanAtomicSignalFence = M.getOrInsertFunction("__tsan_atomic_signal_fence",
+ Attr, IRB.getVoidTy(), OrdTy);
+
+ MemmoveFn =
+ M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+ MemcpyFn =
+ M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+ MemsetFn =
+ M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
+}
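
The loop above builds one family of callbacks per access size; the tiny runnable illustration below only restates the naming scheme it produces (index i covers an access of 1 << i bytes).

#include <cstdio>

int main() {
  for (int i = 0; i < 5; ++i) {                  // kNumberOfAccessSizes == 5
    unsigned Bytes = 1u << i, Bits = Bytes * 8;
    std::printf("__tsan_read%u __tsan_write%u __tsan_atomic%u_load\n",
                Bytes, Bytes, Bits);             // e.g. __tsan_read4 ... __tsan_atomic32_load
  }
  return 0;
}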
+
+static bool isVtableAccess(Instruction *I) {
+ if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+ return Tag->isTBAAVtableAccess();
+ return false;
+}
+
+// Do not instrument known races/"benign races" that come from compiler
+// instrumentation. The user has no way of suppressing them.
+static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
+ // Peel off GEPs and BitCasts.
+ Addr = Addr->stripInBoundsOffsets();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ if (GV->hasSection()) {
+ StringRef SectionName = GV->getSection();
+ // Check if the global is in the PGO counters section.
+ auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+ if (SectionName.endswith(
+ getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
+ return false;
+ }
+
+ // Check if the global is private gcov data.
+ if (GV->getName().startswith("__llvm_gcov") ||
+ GV->getName().startswith("__llvm_gcda"))
+ return false;
+ }
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ if (Addr) {
+ Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return false;
+ }
+
+ return true;
+}
+
+bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
+ // If this is a GEP, just analyze its pointer operand.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+ Addr = GEP->getPointerOperand();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ if (GV->isConstant()) {
+ // Reads from constant globals can not race with any writes.
+ NumOmittedReadsFromConstantGlobals++;
+ return true;
+ }
+ } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+ if (isVtableAccess(L)) {
+ // Reads from a vtable pointer can not race with any writes.
+ NumOmittedReadsFromVtable++;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Instrumenting some of the accesses may be proven redundant.
+// Currently handled:
+// - read-before-write (within same BB, no calls between)
+// - not captured variables
+//
+// We do not handle some of the patterns that should not survive
+// after the classic compiler optimizations.
+// E.g. two reads from the same temp should be eliminated by CSE,
+// two writes should be eliminated by DSE, etc.
+//
+// 'Local' is a vector of insns within the same BB (no calls between).
+// 'All' is a vector of insns that will be instrumented.
+void ThreadSanitizer::chooseInstructionsToInstrument(
SmallVectorImpl<Instruction *> &Local,
SmallVectorImpl<InstructionInfo> &All, const DataLayout &DL) {
DenseMap<Value *, size_t> WriteTargets; // Map of addresses to index in All
- // Iterate from the end.
- for (Instruction *I : reverse(Local)) {
+ // Iterate from the end.
+ for (Instruction *I : reverse(Local)) {
const bool IsWrite = isa<StoreInst>(*I);
Value *Addr = IsWrite ? cast<StoreInst>(I)->getPointerOperand()
: cast<LoadInst>(I)->getPointerOperand();
@@ -470,22 +470,22 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
NumOmittedReadsBeforeWrite++;
continue;
}
- }
+ }
- if (addrPointsToConstantData(Addr)) {
- // Addr points to some constant data -- it can not race with any writes.
- continue;
- }
- }
+ if (addrPointsToConstantData(Addr)) {
+ // Addr points to some constant data -- it can not race with any writes.
+ continue;
+ }
+ }
if (isa<AllocaInst>(getUnderlyingObject(Addr)) &&
- !PointerMayBeCaptured(Addr, true, true)) {
- // The variable is addressable but not captured, so it cannot be
- // referenced from a different thread and participate in a data race
- // (see llvm/Analysis/CaptureTracking.h for details).
- NumOmittedNonCaptured++;
- continue;
- }
+ !PointerMayBeCaptured(Addr, true, true)) {
+ // The variable is addressable but not captured, so it cannot be
+ // referenced from a different thread and participate in a data race
+ // (see llvm/Analysis/CaptureTracking.h for details).
+ NumOmittedNonCaptured++;
+ continue;
+ }
// Instrument this instruction.
All.emplace_back(I);
@@ -494,160 +494,160 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
// write target, and we can override any previous entry if it exists.
WriteTargets[Addr] = All.size() - 1;
}
- }
- Local.clear();
-}
-
-static bool isAtomic(Instruction *I) {
- // TODO: Ask TTI whether synchronization scope is between threads.
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
- if (isa<AtomicRMWInst>(I))
- return true;
- if (isa<AtomicCmpXchgInst>(I))
- return true;
- if (isa<FenceInst>(I))
- return true;
- return false;
-}
-
-void ThreadSanitizer::InsertRuntimeIgnores(Function &F) {
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
- IRB.CreateCall(TsanIgnoreBegin);
- EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions);
- while (IRBuilder<> *AtExit = EE.Next()) {
- AtExit->CreateCall(TsanIgnoreEnd);
- }
-}
-
-bool ThreadSanitizer::sanitizeFunction(Function &F,
- const TargetLibraryInfo &TLI) {
- // This is required to prevent instrumenting call to __tsan_init from within
- // the module constructor.
- if (F.getName() == kTsanModuleCtorName)
- return false;
- // Naked functions can not have prologue/epilogue
- // (__tsan_func_entry/__tsan_func_exit) generated, so don't instrument them at
- // all.
- if (F.hasFnAttribute(Attribute::Naked))
- return false;
- initialize(*F.getParent());
+ }
+ Local.clear();
+}
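
A small source-level example of the read-before-write elision described in the comment above; whether the read is actually dropped also depends on -tsan-instrument-read-before-write and on the constant/capture checks, so take it as a typical case rather than a guarantee.

int Shared;  // a global is captured, so only the read-before-write rule applies

void bump() {
  int Tmp = Shared;   // read followed by a write to the same address in the
  Shared = Tmp + 1;   // same basic block: only __tsan_write4(&Shared) is kept
}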
+
+static bool isAtomic(Instruction *I) {
+ // TODO: Ask TTI whether synchronization scope is between threads.
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
+ if (isa<AtomicRMWInst>(I))
+ return true;
+ if (isa<AtomicCmpXchgInst>(I))
+ return true;
+ if (isa<FenceInst>(I))
+ return true;
+ return false;
+}
+
+void ThreadSanitizer::InsertRuntimeIgnores(Function &F) {
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRB.CreateCall(TsanIgnoreBegin);
+ EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanIgnoreEnd);
+ }
+}
+
+bool ThreadSanitizer::sanitizeFunction(Function &F,
+ const TargetLibraryInfo &TLI) {
+ // This is required to prevent instrumenting call to __tsan_init from within
+ // the module constructor.
+ if (F.getName() == kTsanModuleCtorName)
+ return false;
+ // Naked functions can not have prologue/epilogue
+ // (__tsan_func_entry/__tsan_func_exit) generated, so don't instrument them at
+ // all.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return false;
+ initialize(*F.getParent());
SmallVector<InstructionInfo, 8> AllLoadsAndStores;
- SmallVector<Instruction*, 8> LocalLoadsAndStores;
- SmallVector<Instruction*, 8> AtomicAccesses;
- SmallVector<Instruction*, 8> MemIntrinCalls;
- bool Res = false;
- bool HasCalls = false;
- bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- // Traverse all instructions, collect loads/stores/returns, check for calls.
- for (auto &BB : F) {
- for (auto &Inst : BB) {
- if (isAtomic(&Inst))
- AtomicAccesses.push_back(&Inst);
- else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
- LocalLoadsAndStores.push_back(&Inst);
- else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
- if (CallInst *CI = dyn_cast<CallInst>(&Inst))
- maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
- if (isa<MemIntrinsic>(Inst))
- MemIntrinCalls.push_back(&Inst);
- HasCalls = true;
- chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores,
- DL);
- }
- }
- chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL);
- }
-
- // We have collected all loads and stores.
- // FIXME: many of these accesses do not need to be checked for races
- // (e.g. variables that do not escape, etc).
-
- // Instrument memory accesses only if we want to report bugs in the function.
- if (ClInstrumentMemoryAccesses && SanitizeFunction)
+ SmallVector<Instruction*, 8> LocalLoadsAndStores;
+ SmallVector<Instruction*, 8> AtomicAccesses;
+ SmallVector<Instruction*, 8> MemIntrinCalls;
+ bool Res = false;
+ bool HasCalls = false;
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ // Traverse all instructions, collect loads/stores/returns, check for calls.
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if (isAtomic(&Inst))
+ AtomicAccesses.push_back(&Inst);
+ else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ LocalLoadsAndStores.push_back(&Inst);
+ else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
+ if (isa<MemIntrinsic>(Inst))
+ MemIntrinCalls.push_back(&Inst);
+ HasCalls = true;
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores,
+ DL);
+ }
+ }
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL);
+ }
+
+ // We have collected all loads and stores.
+ // FIXME: many of these accesses do not need to be checked for races
+ // (e.g. variables that do not escape, etc).
+
+ // Instrument memory accesses only if we want to report bugs in the function.
+ if (ClInstrumentMemoryAccesses && SanitizeFunction)
for (const auto &II : AllLoadsAndStores) {
Res |= instrumentLoadOrStore(II, DL);
- }
-
- // Instrument atomic memory accesses in any case (they can be used to
- // implement synchronization).
- if (ClInstrumentAtomics)
- for (auto Inst : AtomicAccesses) {
- Res |= instrumentAtomic(Inst, DL);
- }
-
- if (ClInstrumentMemIntrinsics && SanitizeFunction)
- for (auto Inst : MemIntrinCalls) {
- Res |= instrumentMemIntrinsic(Inst);
- }
-
- if (F.hasFnAttribute("sanitize_thread_no_checking_at_run_time")) {
- assert(!F.hasFnAttribute(Attribute::SanitizeThread));
- if (HasCalls)
- InsertRuntimeIgnores(F);
- }
-
- // Instrument function entry/exit points if there were instrumented accesses.
- if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
- Value *ReturnAddress = IRB.CreateCall(
- Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
- IRB.getInt32(0));
- IRB.CreateCall(TsanFuncEntry, ReturnAddress);
-
- EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);
- while (IRBuilder<> *AtExit = EE.Next()) {
- AtExit->CreateCall(TsanFuncExit, {});
- }
- Res = true;
- }
- return Res;
-}
-
+ }
+
+ // Instrument atomic memory accesses in any case (they can be used to
+ // implement synchronization).
+ if (ClInstrumentAtomics)
+ for (auto Inst : AtomicAccesses) {
+ Res |= instrumentAtomic(Inst, DL);
+ }
+
+ if (ClInstrumentMemIntrinsics && SanitizeFunction)
+ for (auto Inst : MemIntrinCalls) {
+ Res |= instrumentMemIntrinsic(Inst);
+ }
+
+ if (F.hasFnAttribute("sanitize_thread_no_checking_at_run_time")) {
+ assert(!F.hasFnAttribute(Attribute::SanitizeThread));
+ if (HasCalls)
+ InsertRuntimeIgnores(F);
+ }
+
+ // Instrument function entry/exit points if there were instrumented accesses.
+ if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ Value *ReturnAddress = IRB.CreateCall(
+ Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
+ IRB.getInt32(0));
+ IRB.CreateCall(TsanFuncEntry, ReturnAddress);
+
+ EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanFuncExit, {});
+ }
+ Res = true;
+ }
+ return Res;
+}
+
bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II,
- const DataLayout &DL) {
+ const DataLayout &DL) {
IRBuilder<> IRB(II.Inst);
const bool IsWrite = isa<StoreInst>(*II.Inst);
Value *Addr = IsWrite ? cast<StoreInst>(II.Inst)->getPointerOperand()
: cast<LoadInst>(II.Inst)->getPointerOperand();
-
- // swifterror memory addresses are mem2reg promoted by instruction selection.
- // As such they cannot have regular uses like an instrumentation function and
- // it makes no sense to track them as memory.
- if (Addr->isSwiftError())
- return false;
-
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
+
+ // swifterror memory addresses are mem2reg promoted by instruction selection.
+ // As such they cannot have regular uses like an instrumentation function and
+ // it makes no sense to track them as memory.
+ if (Addr->isSwiftError())
+ return false;
+
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
if (IsWrite && isVtableAccess(II.Inst)) {
LLVM_DEBUG(dbgs() << " VPTR : " << *II.Inst << "\n");
Value *StoredValue = cast<StoreInst>(II.Inst)->getValueOperand();
- // StoredValue may be a vector type if we are storing several vptrs at once.
- // In this case, just take the first element of the vector since this is
- // enough to find vptr races.
- if (isa<VectorType>(StoredValue->getType()))
- StoredValue = IRB.CreateExtractElement(
- StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0));
- if (StoredValue->getType()->isIntegerTy())
- StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
- // Call TsanVptrUpdate.
- IRB.CreateCall(TsanVptrUpdate,
- {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())});
- NumInstrumentedVtableWrites++;
- return true;
- }
+ // StoredValue may be a vector type if we are storing several vptrs at once.
+ // In this case, just take the first element of the vector since this is
+ // enough to find vptr races.
+ if (isa<VectorType>(StoredValue->getType()))
+ StoredValue = IRB.CreateExtractElement(
+ StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0));
+ if (StoredValue->getType()->isIntegerTy())
+ StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
+ // Call TsanVptrUpdate.
+ IRB.CreateCall(TsanVptrUpdate,
+ {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())});
+ NumInstrumentedVtableWrites++;
+ return true;
+ }
if (!IsWrite && isVtableAccess(II.Inst)) {
- IRB.CreateCall(TsanVptrLoad,
- IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
- NumInstrumentedVtableReads++;
- return true;
- }
+ IRB.CreateCall(TsanVptrLoad,
+ IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ NumInstrumentedVtableReads++;
+ return true;
+ }
const unsigned Alignment = IsWrite ? cast<StoreInst>(II.Inst)->getAlignment()
: cast<LoadInst>(II.Inst)->getAlignment();
@@ -658,191 +658,191 @@ bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II,
: cast<LoadInst>(II.Inst)->isVolatile());
assert((!IsVolatile || !IsCompoundRW) && "Compound volatile invalid!");
- Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
- const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
- FunctionCallee OnAccessFunc = nullptr;
- if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) {
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ FunctionCallee OnAccessFunc = nullptr;
+ if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) {
if (IsCompoundRW)
OnAccessFunc = TsanCompoundRW[Idx];
else if (IsVolatile)
- OnAccessFunc = IsWrite ? TsanVolatileWrite[Idx] : TsanVolatileRead[Idx];
- else
- OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
- } else {
+ OnAccessFunc = IsWrite ? TsanVolatileWrite[Idx] : TsanVolatileRead[Idx];
+ else
+ OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+ } else {
if (IsCompoundRW)
OnAccessFunc = TsanUnalignedCompoundRW[Idx];
else if (IsVolatile)
- OnAccessFunc = IsWrite ? TsanUnalignedVolatileWrite[Idx]
- : TsanUnalignedVolatileRead[Idx];
- else
- OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
- }
- IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ OnAccessFunc = IsWrite ? TsanUnalignedVolatileWrite[Idx]
+ : TsanUnalignedVolatileRead[Idx];
+ else
+ OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
+ }
+ IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
if (IsCompoundRW || IsWrite)
NumInstrumentedWrites++;
if (IsCompoundRW || !IsWrite)
NumInstrumentedReads++;
- return true;
-}
-
-static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
- uint32_t v = 0;
- switch (ord) {
- case AtomicOrdering::NotAtomic:
- llvm_unreachable("unexpected atomic ordering!");
- case AtomicOrdering::Unordered: LLVM_FALLTHROUGH;
- case AtomicOrdering::Monotonic: v = 0; break;
- // Not specified yet:
- // case AtomicOrdering::Consume: v = 1; break;
- case AtomicOrdering::Acquire: v = 2; break;
- case AtomicOrdering::Release: v = 3; break;
- case AtomicOrdering::AcquireRelease: v = 4; break;
- case AtomicOrdering::SequentiallyConsistent: v = 5; break;
- }
- return IRB->getInt32(v);
-}
-
-// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
-// So, we either need to ensure the intrinsic is not inlined, or instrument it.
-// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
-// instead we simply replace them with regular function calls, which are then
-// intercepted by the run-time.
-// Since tsan is running after everyone else, the calls should not be
-// replaced back with intrinsics. If that becomes wrong at some point,
-// we will need to call e.g. __tsan_memset to avoid the intrinsics.
-bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
- IRBuilder<> IRB(I);
- if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
- IRB.CreateCall(
- MemsetFn,
- {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
- I->eraseFromParent();
- } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
- IRB.CreateCall(
- isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
- {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
- I->eraseFromParent();
- }
- return false;
-}
-
-// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
-// standards. For background see C++11 standard. A slightly older, publicly
-// available draft of the standard (not entirely up-to-date, but close enough
-// for casual browsing) is available here:
-// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
-// The following page contains more background information:
-// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
-
-bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
- IRBuilder<> IRB(I);
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- Value *Addr = LI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- createOrdering(&IRB, LI->getOrdering())};
- Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
- Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
- Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
- I->replaceAllUsesWith(Cast);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- Value *Addr = SI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
- createOrdering(&IRB, SI->getOrdering())};
- CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
- ReplaceInstWithInst(I, C);
- } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
- Value *Addr = RMWI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- FunctionCallee F = TsanAtomicRMW[RMWI->getOperation()][Idx];
- if (!F)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
- createOrdering(&IRB, RMWI->getOrdering())};
- CallInst *C = CallInst::Create(F, Args);
- ReplaceInstWithInst(I, C);
- } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
- Value *Addr = CASI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *CmpOperand =
- IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
- Value *NewOperand =
- IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- CmpOperand,
- NewOperand,
- createOrdering(&IRB, CASI->getSuccessOrdering()),
- createOrdering(&IRB, CASI->getFailureOrdering())};
- CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
- Value *Success = IRB.CreateICmpEQ(C, CmpOperand);
- Value *OldVal = C;
- Type *OrigOldValTy = CASI->getNewValOperand()->getType();
- if (Ty != OrigOldValTy) {
- // The value is a pointer, so we need to cast the return value.
- OldVal = IRB.CreateIntToPtr(C, OrigOldValTy);
- }
-
- Value *Res =
- IRB.CreateInsertValue(UndefValue::get(CASI->getType()), OldVal, 0);
- Res = IRB.CreateInsertValue(Res, Success, 1);
-
- I->replaceAllUsesWith(Res);
- I->eraseFromParent();
- } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
- Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
- FunctionCallee F = FI->getSyncScopeID() == SyncScope::SingleThread
- ? TsanAtomicSignalFence
- : TsanAtomicThreadFence;
- CallInst *C = CallInst::Create(F, Args);
- ReplaceInstWithInst(I, C);
- }
- return true;
-}
-
-int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr,
- const DataLayout &DL) {
- Type *OrigPtrTy = Addr->getType();
- Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
- assert(OrigTy->isSized());
- uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
- if (TypeSize != 8 && TypeSize != 16 &&
- TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
- NumAccessesWithBadSize++;
- // Ignore all unusual sizes.
- return -1;
- }
- size_t Idx = countTrailingZeros(TypeSize / 8);
- assert(Idx < kNumberOfAccessSizes);
- return Idx;
-}
+ return true;
+}
+
+static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+ uint32_t v = 0;
+ switch (ord) {
+ case AtomicOrdering::NotAtomic:
+ llvm_unreachable("unexpected atomic ordering!");
+ case AtomicOrdering::Unordered: LLVM_FALLTHROUGH;
+ case AtomicOrdering::Monotonic: v = 0; break;
+ // Not specified yet:
+ // case AtomicOrdering::Consume: v = 1; break;
+ case AtomicOrdering::Acquire: v = 2; break;
+ case AtomicOrdering::Release: v = 3; break;
+ case AtomicOrdering::AcquireRelease: v = 4; break;
+ case AtomicOrdering::SequentiallyConsistent: v = 5; break;
+ }
+ return IRB->getInt32(v);
+}
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated);
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan is running after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+ IRBuilder<> IRB(I);
+ if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+ IRB.CreateCall(
+ MemsetFn,
+ {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ I->eraseFromParent();
+ } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+ IRB.CreateCall(
+ isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+ {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ I->eraseFromParent();
+ }
+ return false;
+}
+
+// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
+// standards. For background see C++11 standard. A slightly older, publicly
+// available draft of the standard (not entirely up-to-date, but close enough
+// for casual browsing) is available here:
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
+// The following page contains more background information:
+// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
+
+bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
+ IRBuilder<> IRB(I);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Value *Addr = LI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ createOrdering(&IRB, LI->getOrdering())};
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
+ Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
+ I->replaceAllUsesWith(Cast);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Addr = SI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
+ createOrdering(&IRB, SI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ Value *Addr = RMWI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ FunctionCallee F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+ if (!F)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+ createOrdering(&IRB, RMWI->getOrdering())};
+ CallInst *C = CallInst::Create(F, Args);
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ Value *Addr = CASI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *CmpOperand =
+ IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
+ Value *NewOperand =
+ IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ CmpOperand,
+ NewOperand,
+ createOrdering(&IRB, CASI->getSuccessOrdering()),
+ createOrdering(&IRB, CASI->getFailureOrdering())};
+ CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
+ Value *Success = IRB.CreateICmpEQ(C, CmpOperand);
+ Value *OldVal = C;
+ Type *OrigOldValTy = CASI->getNewValOperand()->getType();
+ if (Ty != OrigOldValTy) {
+ // The value is a pointer, so we need to cast the return value.
+ OldVal = IRB.CreateIntToPtr(C, OrigOldValTy);
+ }
+
+ Value *Res =
+ IRB.CreateInsertValue(UndefValue::get(CASI->getType()), OldVal, 0);
+ Res = IRB.CreateInsertValue(Res, Success, 1);
+
+ I->replaceAllUsesWith(Res);
+ I->eraseFromParent();
+ } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+ Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+ FunctionCallee F = FI->getSyncScopeID() == SyncScope::SingleThread
+ ? TsanAtomicSignalFence
+ : TsanAtomicThreadFence;
+ CallInst *C = CallInst::Create(F, Args);
+ ReplaceInstWithInst(I, C);
+ }
+ return true;
+}
+
+int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr,
+ const DataLayout &DL) {
+ Type *OrigPtrTy = Addr->getType();
+ Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+ assert(OrigTy->isSized());
+ uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ if (TypeSize != 8 && TypeSize != 16 &&
+ TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
+ NumAccessesWithBadSize++;
+ // Ignore all unusual sizes.
+ return -1;
+ }
+ size_t Idx = countTrailingZeros(TypeSize / 8);
+ assert(Idx < kNumberOfAccessSizes);
+ return Idx;
+}
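
The ThreadSanitizer hunk above routes every atomic access to a size-specific runtime callback: getMemoryAccessFuncIndex turns the access width into an index (log2 of the byte size) and rejects unusual widths, while createOrdering maps LLVM's AtomicOrdering onto the integer memory-order constants the __tsan_atomic* entry points expect. Below is a minimal, self-contained sketch of that size bucketing only; accessSizeIndex is an invented stand-in for the real member function, not part of the pass.

#include <cstdio>

// Invented stand-in for getMemoryAccessFuncIndex: map an access width in bits
// to the index of the size-specific __tsan_* callback (0 -> 1 byte, ...,
// 4 -> 16 bytes), or -1 when the runtime has no entry point for that width.
static int accessSizeIndex(unsigned TypeSizeInBits) {
  switch (TypeSizeInBits) {
  case 8:   return 0;
  case 16:  return 1;
  case 32:  return 2;
  case 64:  return 3;
  case 128: return 4;
  default:  return -1; // unusual size: the pass skips instrumentation
  }
}

int main() {
  // An i32 atomic load would be routed to the index-2 (__tsan_atomic32_*) slot.
  std::printf("i32 -> %d\n", accessSizeIndex(32));
  // A 24-bit access has no runtime counterpart and is left uninstrumented.
  std::printf("i24 -> %d\n", accessSizeIndex(24));
  return 0;
}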
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
index d7d10fb5d5..fb6216bb21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
@@ -1,80 +1,80 @@
-//===- ValueProfileCollector.cpp - determine what to value profile --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl
-//
-//===----------------------------------------------------------------------===//
-
-#include "ValueProfilePlugins.inc"
+//===- ValueProfileCollector.cpp - determine what to value profile --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueProfilePlugins.inc"
#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/InitializePasses.h"
-#include <cassert>
-
-using namespace llvm;
-
-namespace {
-
-/// A plugin-based class that takes an arbitrary number of Plugin types.
-/// Each plugin type must satisfy the following API:
-/// 1) the constructor must take a `Function &f`. Typically, the plugin would
-/// scan the function looking for candidates.
-/// 2) contain a member function with the following signature and name:
-/// void run(std::vector<CandidateInfo> &Candidates);
-/// such that the plugin would append its result into the vector parameter.
-///
-/// Plugins are defined in ValueProfilePlugins.inc
-template <class... Ts> class PluginChain;
-
-/// The type PluginChainFinal is the final chain of plugins that will be used by
-/// ValueProfileCollectorImpl.
-using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>;
-
-template <> class PluginChain<> {
-public:
- PluginChain(Function &F, TargetLibraryInfo &TLI) {}
- void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {}
-};
-
-template <class PluginT, class... Ts>
-class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> {
- PluginT Plugin;
- using Base = PluginChain<Ts...>;
-
-public:
- PluginChain(Function &F, TargetLibraryInfo &TLI)
- : PluginChain<Ts...>(F, TLI), Plugin(F, TLI) {}
-
- void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {
- if (K == PluginT::Kind)
- Plugin.run(Candidates);
- Base::get(K, Candidates);
- }
-};
-
-} // end anonymous namespace
-
-/// ValueProfileCollectorImpl inherits the API of PluginChainFinal.
-class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal {
-public:
- using PluginChainFinal::PluginChainFinal;
-};
-
-ValueProfileCollector::ValueProfileCollector(Function &F,
- TargetLibraryInfo &TLI)
- : PImpl(new ValueProfileCollectorImpl(F, TLI)) {}
-
-ValueProfileCollector::~ValueProfileCollector() = default;
-
-std::vector<CandidateInfo>
-ValueProfileCollector::get(InstrProfValueKind Kind) const {
- std::vector<CandidateInfo> Result;
- PImpl->get(Kind, Result);
- return Result;
-}
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InitializePasses.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+/// A plugin-based class that takes an arbitrary number of Plugin types.
+/// Each plugin type must satisfy the following API:
+/// 1) the constructor must take a `Function &f`. Typically, the plugin would
+/// scan the function looking for candidates.
+/// 2) contain a member function with the following signature and name:
+/// void run(std::vector<CandidateInfo> &Candidates);
+/// such that the plugin would append its result into the vector parameter.
+///
+/// Plugins are defined in ValueProfilePlugins.inc
+template <class... Ts> class PluginChain;
+
+/// The type PluginChainFinal is the final chain of plugins that will be used by
+/// ValueProfileCollectorImpl.
+using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>;
+
+template <> class PluginChain<> {
+public:
+ PluginChain(Function &F, TargetLibraryInfo &TLI) {}
+ void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {}
+};
+
+template <class PluginT, class... Ts>
+class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> {
+ PluginT Plugin;
+ using Base = PluginChain<Ts...>;
+
+public:
+ PluginChain(Function &F, TargetLibraryInfo &TLI)
+ : PluginChain<Ts...>(F, TLI), Plugin(F, TLI) {}
+
+ void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {
+ if (K == PluginT::Kind)
+ Plugin.run(Candidates);
+ Base::get(K, Candidates);
+ }
+};
+
+} // end anonymous namespace
+
+/// ValueProfileCollectorImpl inherits the API of PluginChainFinal.
+class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal {
+public:
+ using PluginChainFinal::PluginChainFinal;
+};
+
+ValueProfileCollector::ValueProfileCollector(Function &F,
+ TargetLibraryInfo &TLI)
+ : PImpl(new ValueProfileCollectorImpl(F, TLI)) {}
+
+ValueProfileCollector::~ValueProfileCollector() = default;
+
+std::vector<CandidateInfo>
+ValueProfileCollector::get(InstrProfValueKind Kind) const {
+ std::vector<CandidateInfo> Result;
+ PImpl->get(Kind, Result);
+ return Result;
+}
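
The PluginChain above is ordinary recursive variadic inheritance: each instantiation owns one plugin, runs it when the requested kind matches, and forwards the query to its base, so adding a plugin costs one extra template argument and no dispatch table. A stripped-down sketch of the same pattern follows, with invented toy plugins and integer kinds; the real plugins are constructed from a Function and TargetLibraryInfo instead.

#include <cstdio>
#include <vector>

enum Kind { KindA = 0, KindB = 1 };

struct PluginA {
  static constexpr Kind kind = KindA;
  void run(std::vector<const char *> &Out) { Out.push_back("from A"); }
};
struct PluginB {
  static constexpr Kind kind = KindB;
  void run(std::vector<const char *> &Out) { Out.push_back("from B"); }
};

template <class... Ts> struct Chain;

// Base case: an empty chain answers every query with nothing.
template <> struct Chain<> {
  void get(Kind, std::vector<const char *> &) {}
};

// Recursive case: run the owned plugin if the kind matches, then forward.
template <class P, class... Ts> struct Chain<P, Ts...> : Chain<Ts...> {
  P Plugin;
  void get(Kind K, std::vector<const char *> &Out) {
    if (K == P::kind)
      Plugin.run(Out);
    Chain<Ts...>::get(K, Out); // hand the query to the rest of the chain
  }
};

int main() {
  Chain<PluginA, PluginB> C;
  std::vector<const char *> Out;
  C.get(KindB, Out); // only PluginB contributes
  for (const char *S : Out)
    std::printf("%s\n", S);
}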
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h
index 40f5006007..584a60ab45 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h
@@ -1,83 +1,83 @@
-//===- ValueProfileCollector.h - determine what to value profile ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a utility class, ValueProfileCollector, that is used to
-// determine what kind of llvm::Value's are worth value-profiling, at which
-// point in the program, and which instruction holds the Value Profile metadata.
-// Currently, the only users of this utility is the PGOInstrumentation[Gen|Use]
-// passes.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
-#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
-
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/ProfileData/InstrProf.h"
+//===- ValueProfileCollector.h - determine what to value profile ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a utility class, ValueProfileCollector, that is used to
+// determine what kind of llvm::Value's are worth value-profiling, at which
+// point in the program, and which instruction holds the Value Profile metadata.
+// Currently, the only users of this utility are the PGOInstrumentation[Gen|Use]
+// passes.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/ProfileData/InstrProf.h"
#include <memory>
#include <vector>
-
-namespace llvm {
-
+
+namespace llvm {
+
class Function;
class Instruction;
class Value;
-/// Utility analysis that determines what values are worth profiling.
-/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to
-/// populate the Candidates vector.
-///
-/// Value profiling an expression means to track the values that this expression
-/// takes at runtime and the frequency of each value.
-/// It is important to distinguish between two sets of value profiles for a
-/// particular expression:
-/// 1) The set of values at the point of evaluation.
-/// 2) The set of values at the point of use.
-/// In some cases, the two sets are identical, but it's not unusual for the two
-/// to differ.
-///
-/// To elaborate more, consider this C code, and focus on the expression `nn`:
-/// void foo(int nn, bool b) {
-/// if (b) memcpy(x, y, nn);
-/// }
-/// The point of evaluation can be as early as the start of the function, and
-/// let's say the value profile for `nn` is:
-/// total=100; (value,freq) set = {(8,10), (32,50)}
-/// The point of use is right before we call memcpy, and since we execute the
-/// memcpy conditionally, the value profile of `nn` can be:
-/// total=15; (value,freq) set = {(8,10), (4,5)}
-///
-/// For this reason, a plugin is responsible for computing the insertion point
-/// for each value to be profiled. The `CandidateInfo` structure encapsulates
-/// all the information needed for each value profile site.
-class ValueProfileCollector {
-public:
- struct CandidateInfo {
- Value *V; // The value to profile.
- Instruction *InsertPt; // Insert the VP lib call before this instr.
- Instruction *AnnotatedInst; // Where metadata is attached.
- };
-
- ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI);
- ValueProfileCollector(ValueProfileCollector &&) = delete;
- ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
-
- ValueProfileCollector(const ValueProfileCollector &) = delete;
- ValueProfileCollector &operator=(const ValueProfileCollector &) = delete;
- ~ValueProfileCollector();
-
- /// returns a list of value profiling candidates of the given kind
- std::vector<CandidateInfo> get(InstrProfValueKind Kind) const;
-
-private:
- class ValueProfileCollectorImpl;
- std::unique_ptr<ValueProfileCollectorImpl> PImpl;
-};
-
-} // namespace llvm
-
-#endif
+/// Utility analysis that determines what values are worth profiling.
+/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to
+/// populate the Candidates vector.
+///
+/// Value profiling an expression means to track the values that this expression
+/// takes at runtime and the frequency of each value.
+/// It is important to distinguish between two sets of value profiles for a
+/// particular expression:
+/// 1) The set of values at the point of evaluation.
+/// 2) The set of values at the point of use.
+/// In some cases, the two sets are identical, but it's not unusual for the two
+/// to differ.
+///
+/// To elaborate more, consider this C code, and focus on the expression `nn`:
+/// void foo(int nn, bool b) {
+/// if (b) memcpy(x, y, nn);
+/// }
+/// The point of evaluation can be as early as the start of the function, and
+/// let's say the value profile for `nn` is:
+/// total=100; (value,freq) set = {(8,10), (32,50)}
+/// The point of use is right before we call memcpy, and since we execute the
+/// memcpy conditionally, the value profile of `nn` can be:
+/// total=15; (value,freq) set = {(8,10), (4,5)}
+///
+/// For this reason, a plugin is responsible for computing the insertion point
+/// for each value to be profiled. The `CandidateInfo` structure encapsulates
+/// all the information needed for each value profile site.
+class ValueProfileCollector {
+public:
+ struct CandidateInfo {
+ Value *V; // The value to profile.
+ Instruction *InsertPt; // Insert the VP lib call before this instr.
+ Instruction *AnnotatedInst; // Where metadata is attached.
+ };
+
+ ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI);
+ ValueProfileCollector(ValueProfileCollector &&) = delete;
+ ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
+
+ ValueProfileCollector(const ValueProfileCollector &) = delete;
+ ValueProfileCollector &operator=(const ValueProfileCollector &) = delete;
+ ~ValueProfileCollector();
+
+ /// returns a list of value profiling candidates of the given kind
+ std::vector<CandidateInfo> get(InstrProfValueKind Kind) const;
+
+private:
+ class ValueProfileCollectorImpl;
+ std::unique_ptr<ValueProfileCollectorImpl> PImpl;
+};
+
+} // namespace llvm
+
+#endif
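
The header's `nn` example is the crux: a candidate's insertion point is the point of use, not the point of evaluation, so the collected distribution only reflects executions that actually reach the guarded call. The following self-contained illustration shows where that instrumentation conceptually lands; profileValue is a made-up stand-in, whereas the real pass inserts a value-profiling intrinsic at CandidateInfo::InsertPt rather than a direct call like this.

#include <cstdio>
#include <cstring>

// Made-up stand-in for the value-profiling runtime hook.
static void profileValue(long V) { std::printf("profiled nn=%ld\n", V); }

void foo(char *x, const char *y, int nn, bool b) {
  if (b) {
    profileValue(nn); // CandidateInfo::InsertPt: right before the memcpy
    std::memcpy(x, y, static_cast<size_t>(nn)); // the annotated instruction
  }
}

int main() {
  char dst[8], src[8] = "abcdefg";
  foo(dst, src, 4, true);  // recorded: nn = 4
  foo(dst, src, 7, false); // not recorded: the point of use never executes
}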
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index 0277494895..8d0cf5843e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -1,97 +1,97 @@
-//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a set of plugin classes used in ValueProfileCollectorImpl.
-// Each plugin is responsible for collecting Value Profiling candidates for a
-// particular optimization.
-// Each plugin must satisfy the interface described in ValueProfileCollector.cpp
-//
-//===----------------------------------------------------------------------===//
-
-#include "ValueProfileCollector.h"
-#include "llvm/Analysis/IndirectCallVisitor.h"
-#include "llvm/IR/InstVisitor.h"
-
-using namespace llvm;
-using CandidateInfo = ValueProfileCollector::CandidateInfo;
-
-extern cl::opt<bool> MemOPOptMemcmpBcmp;
-
-///--------------------------- MemIntrinsicPlugin ------------------------------
-class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
- Function &F;
- TargetLibraryInfo &TLI;
- std::vector<CandidateInfo> *Candidates;
-
-public:
- static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
-
- MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI)
- : F(Fn), TLI(TLI), Candidates(nullptr) {}
-
- void run(std::vector<CandidateInfo> &Cs) {
- Candidates = &Cs;
- visit(F);
- Candidates = nullptr;
- }
- void visitMemIntrinsic(MemIntrinsic &MI) {
- Value *Length = MI.getLength();
- // Not instrument constant length calls.
- if (dyn_cast<ConstantInt>(Length))
- return;
-
- Instruction *InsertPt = &MI;
- Instruction *AnnotatedInst = &MI;
- Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
- }
- void visitCallInst(CallInst &CI) {
- if (!MemOPOptMemcmpBcmp)
- return;
- auto *F = CI.getCalledFunction();
- if (!F)
- return;
- LibFunc Func;
- if (TLI.getLibFunc(CI, Func) &&
- (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) {
- Value *Length = CI.getArgOperand(2);
- // Not instrument constant length calls.
- if (dyn_cast<ConstantInt>(Length))
- return;
- Instruction *InsertPt = &CI;
- Instruction *AnnotatedInst = &CI;
- Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
- }
- }
-};
-
-///------------------------ IndirectCallPromotionPlugin ------------------------
-class IndirectCallPromotionPlugin {
- Function &F;
-
-public:
- static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
-
- IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {}
-
- void run(std::vector<CandidateInfo> &Candidates) {
- std::vector<CallBase *> Result = findIndirectCalls(F);
- for (Instruction *I : Result) {
- Value *Callee = cast<CallBase>(I)->getCalledOperand();
- Instruction *InsertPt = I;
- Instruction *AnnotatedInst = I;
- Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
- }
- }
-};
-
-///----------------------- Registration of the plugins -------------------------
-/// For now, registering a plugin with the ValueProfileCollector is done by
-/// adding the plugin type to the VP_PLUGIN_LIST macro.
-#define VP_PLUGIN_LIST \
- MemIntrinsicPlugin, \
- IndirectCallPromotionPlugin
+//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a set of plugin classes used in ValueProfileCollectorImpl.
+// Each plugin is responsible for collecting Value Profiling candidates for a
+// particular optimization.
+// Each plugin must satisfy the interface described in ValueProfileCollector.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueProfileCollector.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+using CandidateInfo = ValueProfileCollector::CandidateInfo;
+
+extern cl::opt<bool> MemOPOptMemcmpBcmp;
+
+///--------------------------- MemIntrinsicPlugin ------------------------------
+class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
+ Function &F;
+ TargetLibraryInfo &TLI;
+ std::vector<CandidateInfo> *Candidates;
+
+public:
+ static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
+
+ MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI)
+ : F(Fn), TLI(TLI), Candidates(nullptr) {}
+
+ void run(std::vector<CandidateInfo> &Cs) {
+ Candidates = &Cs;
+ visit(F);
+ Candidates = nullptr;
+ }
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+ // Do not instrument constant-length calls.
+ if (dyn_cast<ConstantInt>(Length))
+ return;
+
+ Instruction *InsertPt = &MI;
+ Instruction *AnnotatedInst = &MI;
+ Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+ }
+ void visitCallInst(CallInst &CI) {
+ if (!MemOPOptMemcmpBcmp)
+ return;
+ auto *F = CI.getCalledFunction();
+ if (!F)
+ return;
+ LibFunc Func;
+ if (TLI.getLibFunc(CI, Func) &&
+ (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) {
+ Value *Length = CI.getArgOperand(2);
+ // Do not instrument constant-length calls.
+ if (dyn_cast<ConstantInt>(Length))
+ return;
+ Instruction *InsertPt = &CI;
+ Instruction *AnnotatedInst = &CI;
+ Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+ }
+ }
+};
+
+///------------------------ IndirectCallPromotionPlugin ------------------------
+class IndirectCallPromotionPlugin {
+ Function &F;
+
+public:
+ static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
+
+ IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {}
+
+ void run(std::vector<CandidateInfo> &Candidates) {
+ std::vector<CallBase *> Result = findIndirectCalls(F);
+ for (Instruction *I : Result) {
+ Value *Callee = cast<CallBase>(I)->getCalledOperand();
+ Instruction *InsertPt = I;
+ Instruction *AnnotatedInst = I;
+ Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
+ }
+ }
+};
+
+///----------------------- Registration of the plugins -------------------------
+/// For now, registering a plugin with the ValueProfileCollector is done by
+/// adding the plugin type to the VP_PLUGIN_LIST macro.
+#define VP_PLUGIN_LIST \
+ MemIntrinsicPlugin, \
+ IndirectCallPromotionPlugin
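
Registration in the .inc above is just a macro: VP_PLUGIN_LIST enumerates the plugin types once, and ValueProfileCollector.cpp expands it into the PluginChain instantiation. The same trick in miniature, with invented plugin names and a C++17 fold expression standing in for the recursive chain shown earlier:

#include <cstdio>
#include <vector>

struct CountCalls { void run(std::vector<int> &Out) { Out.push_back(1); } };
struct CountLoads { void run(std::vector<int> &Out) { Out.push_back(2); } };

template <class... Ts> struct Chain {
  // Run every plugin in list order (comma fold over the parameter pack).
  void runAll(std::vector<int> &Out) { (Ts().run(Out), ...); }
};

// One macro lists the plugins; one alias instantiates the whole chain from it.
#define TOY_PLUGIN_LIST CountCalls, CountLoads
using ToyChainFinal = Chain<TOY_PLUGIN_LIST>;

int main() {
  std::vector<int> Out;
  ToyChainFinal().runAll(Out);
  std::printf("%zu plugins ran\n", Out.size());
}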
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make
index 10b7425404..39dab1eb7d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make
@@ -1,12 +1,12 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(
Apache-2.0 WITH LLVM-exception AND
NCSA
@@ -14,7 +14,7 @@ LICENSE(
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
@@ -23,36 +23,36 @@ PEERDIR(
contrib/libs/llvm12/lib/ProfileData
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Instrumentation
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AddressSanitizer.cpp
- BoundsChecking.cpp
- CGProfile.cpp
- ControlHeightReduction.cpp
- DataFlowSanitizer.cpp
- GCOVProfiling.cpp
- HWAddressSanitizer.cpp
- IndirectCallPromotion.cpp
- InstrOrderFile.cpp
- InstrProfiling.cpp
- Instrumentation.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AddressSanitizer.cpp
+ BoundsChecking.cpp
+ CGProfile.cpp
+ ControlHeightReduction.cpp
+ DataFlowSanitizer.cpp
+ GCOVProfiling.cpp
+ HWAddressSanitizer.cpp
+ IndirectCallPromotion.cpp
+ InstrOrderFile.cpp
+ InstrProfiling.cpp
+ Instrumentation.cpp
MemProfiler.cpp
- MemorySanitizer.cpp
- PGOInstrumentation.cpp
- PGOMemOPSizeOpt.cpp
- PoisonChecking.cpp
- SanitizerCoverage.cpp
- ThreadSanitizer.cpp
- ValueProfileCollector.cpp
-)
-
-END()
+ MemorySanitizer.cpp
+ PGOInstrumentation.cpp
+ PGOMemOPSizeOpt.cpp
+ PoisonChecking.cpp
+ SanitizerCoverage.cpp
+ ThreadSanitizer.cpp
+ ValueProfileCollector.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index c02799f3b2..258dc92408 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -1,142 +1,142 @@
-//===- ARCRuntimeEntryPoints.h - ObjC ARC Optimization ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file contains a class ARCRuntimeEntryPoints for use in
-/// creating/managing references to entry points to the arc objective c runtime.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
-
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-
-namespace llvm {
-
-class Function;
-class Module;
-
-namespace objcarc {
-
-enum class ARCRuntimeEntryPointKind {
- AutoreleaseRV,
- Release,
- Retain,
- RetainBlock,
- Autorelease,
- StoreStrong,
- RetainRV,
- RetainAutorelease,
- RetainAutoreleaseRV,
-};
-
-/// Declarations for ObjC runtime functions and constants. These are initialized
-/// lazily to avoid cluttering up the Module with unused declarations.
-class ARCRuntimeEntryPoints {
-public:
- ARCRuntimeEntryPoints() = default;
-
- void init(Module *M) {
- TheModule = M;
- AutoreleaseRV = nullptr;
- Release = nullptr;
- Retain = nullptr;
- RetainBlock = nullptr;
- Autorelease = nullptr;
- StoreStrong = nullptr;
- RetainRV = nullptr;
- RetainAutorelease = nullptr;
- RetainAutoreleaseRV = nullptr;
- }
-
- Function *get(ARCRuntimeEntryPointKind kind) {
- assert(TheModule != nullptr && "Not initialized.");
-
- switch (kind) {
- case ARCRuntimeEntryPointKind::AutoreleaseRV:
- return getIntrinsicEntryPoint(AutoreleaseRV,
- Intrinsic::objc_autoreleaseReturnValue);
- case ARCRuntimeEntryPointKind::Release:
- return getIntrinsicEntryPoint(Release, Intrinsic::objc_release);
- case ARCRuntimeEntryPointKind::Retain:
- return getIntrinsicEntryPoint(Retain, Intrinsic::objc_retain);
- case ARCRuntimeEntryPointKind::RetainBlock:
- return getIntrinsicEntryPoint(RetainBlock, Intrinsic::objc_retainBlock);
- case ARCRuntimeEntryPointKind::Autorelease:
- return getIntrinsicEntryPoint(Autorelease, Intrinsic::objc_autorelease);
- case ARCRuntimeEntryPointKind::StoreStrong:
- return getIntrinsicEntryPoint(StoreStrong, Intrinsic::objc_storeStrong);
- case ARCRuntimeEntryPointKind::RetainRV:
- return getIntrinsicEntryPoint(RetainRV,
- Intrinsic::objc_retainAutoreleasedReturnValue);
- case ARCRuntimeEntryPointKind::RetainAutorelease:
- return getIntrinsicEntryPoint(RetainAutorelease,
- Intrinsic::objc_retainAutorelease);
- case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
- return getIntrinsicEntryPoint(RetainAutoreleaseRV,
- Intrinsic::objc_retainAutoreleaseReturnValue);
- }
-
- llvm_unreachable("Switch should be a covered switch.");
- }
-
-private:
- /// Cached reference to the module which we will insert declarations into.
- Module *TheModule = nullptr;
-
- /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
- Function *AutoreleaseRV = nullptr;
-
- /// Declaration for ObjC runtime function objc_release.
- Function *Release = nullptr;
-
- /// Declaration for ObjC runtime function objc_retain.
- Function *Retain = nullptr;
-
- /// Declaration for ObjC runtime function objc_retainBlock.
- Function *RetainBlock = nullptr;
-
- /// Declaration for ObjC runtime function objc_autorelease.
- Function *Autorelease = nullptr;
-
- /// Declaration for objc_storeStrong().
- Function *StoreStrong = nullptr;
-
- /// Declaration for objc_retainAutoreleasedReturnValue().
- Function *RetainRV = nullptr;
-
- /// Declaration for objc_retainAutorelease().
- Function *RetainAutorelease = nullptr;
-
- /// Declaration for objc_retainAutoreleaseReturnValue().
- Function *RetainAutoreleaseRV = nullptr;
-
- Function *getIntrinsicEntryPoint(Function *&Decl, Intrinsic::ID IntID) {
- if (Decl)
- return Decl;
-
- return Decl = Intrinsic::getDeclaration(TheModule, IntID);
- }
-};
-
-} // end namespace objcarc
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+//===- ARCRuntimeEntryPoints.h - ObjC ARC Optimization ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains a class ARCRuntimeEntryPoints for use in
+/// creating/managing references to entry points to the ARC Objective-C runtime.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+
+namespace llvm {
+
+class Function;
+class Module;
+
+namespace objcarc {
+
+enum class ARCRuntimeEntryPointKind {
+ AutoreleaseRV,
+ Release,
+ Retain,
+ RetainBlock,
+ Autorelease,
+ StoreStrong,
+ RetainRV,
+ RetainAutorelease,
+ RetainAutoreleaseRV,
+};
+
+/// Declarations for ObjC runtime functions and constants. These are initialized
+/// lazily to avoid cluttering up the Module with unused declarations.
+class ARCRuntimeEntryPoints {
+public:
+ ARCRuntimeEntryPoints() = default;
+
+ void init(Module *M) {
+ TheModule = M;
+ AutoreleaseRV = nullptr;
+ Release = nullptr;
+ Retain = nullptr;
+ RetainBlock = nullptr;
+ Autorelease = nullptr;
+ StoreStrong = nullptr;
+ RetainRV = nullptr;
+ RetainAutorelease = nullptr;
+ RetainAutoreleaseRV = nullptr;
+ }
+
+ Function *get(ARCRuntimeEntryPointKind kind) {
+ assert(TheModule != nullptr && "Not initialized.");
+
+ switch (kind) {
+ case ARCRuntimeEntryPointKind::AutoreleaseRV:
+ return getIntrinsicEntryPoint(AutoreleaseRV,
+ Intrinsic::objc_autoreleaseReturnValue);
+ case ARCRuntimeEntryPointKind::Release:
+ return getIntrinsicEntryPoint(Release, Intrinsic::objc_release);
+ case ARCRuntimeEntryPointKind::Retain:
+ return getIntrinsicEntryPoint(Retain, Intrinsic::objc_retain);
+ case ARCRuntimeEntryPointKind::RetainBlock:
+ return getIntrinsicEntryPoint(RetainBlock, Intrinsic::objc_retainBlock);
+ case ARCRuntimeEntryPointKind::Autorelease:
+ return getIntrinsicEntryPoint(Autorelease, Intrinsic::objc_autorelease);
+ case ARCRuntimeEntryPointKind::StoreStrong:
+ return getIntrinsicEntryPoint(StoreStrong, Intrinsic::objc_storeStrong);
+ case ARCRuntimeEntryPointKind::RetainRV:
+ return getIntrinsicEntryPoint(RetainRV,
+ Intrinsic::objc_retainAutoreleasedReturnValue);
+ case ARCRuntimeEntryPointKind::RetainAutorelease:
+ return getIntrinsicEntryPoint(RetainAutorelease,
+ Intrinsic::objc_retainAutorelease);
+ case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
+ return getIntrinsicEntryPoint(RetainAutoreleaseRV,
+ Intrinsic::objc_retainAutoreleaseReturnValue);
+ }
+
+ llvm_unreachable("Switch should be a covered switch.");
+ }
+
+private:
+ /// Cached reference to the module which we will insert declarations into.
+ Module *TheModule = nullptr;
+
+ /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+ Function *AutoreleaseRV = nullptr;
+
+ /// Declaration for ObjC runtime function objc_release.
+ Function *Release = nullptr;
+
+ /// Declaration for ObjC runtime function objc_retain.
+ Function *Retain = nullptr;
+
+ /// Declaration for ObjC runtime function objc_retainBlock.
+ Function *RetainBlock = nullptr;
+
+ /// Declaration for ObjC runtime function objc_autorelease.
+ Function *Autorelease = nullptr;
+
+ /// Declaration for objc_storeStrong().
+ Function *StoreStrong = nullptr;
+
+ /// Declaration for objc_retainAutoreleasedReturnValue().
+ Function *RetainRV = nullptr;
+
+ /// Declaration for objc_retainAutorelease().
+ Function *RetainAutorelease = nullptr;
+
+ /// Declaration for objc_retainAutoreleaseReturnValue().
+ Function *RetainAutoreleaseRV = nullptr;
+
+ Function *getIntrinsicEntryPoint(Function *&Decl, Intrinsic::ID IntID) {
+ if (Decl)
+ return Decl;
+
+ return Decl = Intrinsic::getDeclaration(TheModule, IntID);
+ }
+};
+
+} // end namespace objcarc
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
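
ARCRuntimeEntryPoints is a small memoization layer: each runtime function is declared at most once per Module, on first request, and the cached declaration is handed back on every later lookup. The same lazy-caching shape in a self-contained sketch, where strings stand in for the intrinsic declarations and the class and names are illustrative only:

#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

enum class EntryPointKind { Retain, Release };

class EntryPointCache {
  std::map<EntryPointKind, std::string> Cache;

  const std::string &getOrCreate(EntryPointKind K, const char *Name) {
    auto It = Cache.find(K);
    if (It != Cache.end())
      return It->second;                 // already declared: reuse it
    std::printf("declaring %s\n", Name); // happens once per kind
    return Cache.emplace(K, Name).first->second;
  }

public:
  const std::string &get(EntryPointKind K) {
    switch (K) {
    case EntryPointKind::Retain:  return getOrCreate(K, "objc_retain");
    case EntryPointKind::Release: return getOrCreate(K, "objc_release");
    }
    std::abort(); // the switch above is covered
  }
};

int main() {
  EntryPointCache EP;
  EP.get(EntryPointKind::Retain);
  EP.get(EntryPointKind::Retain); // cache hit: nothing is re-declared
  EP.get(EntryPointKind::Release);
}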
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h
index 9b144aaac6..2fa07cfb32 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -1,117 +1,117 @@
-//===- BlotMapVector.h - A MapVector with the blot operation ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
-
-#include "llvm/ADT/DenseMap.h"
-#include <cassert>
-#include <cstddef>
-#include <utility>
-#include <vector>
-
-namespace llvm {
-
-/// An associative container with fast insertion-order (deterministic)
-/// iteration over its elements. Plus the special blot operation.
-template <class KeyT, class ValueT> class BlotMapVector {
- /// Map keys to indices in Vector.
- using MapTy = DenseMap<KeyT, size_t>;
- MapTy Map;
-
- /// Keys and values.
- using VectorTy = std::vector<std::pair<KeyT, ValueT>>;
- VectorTy Vector;
-
-public:
-#ifdef EXPENSIVE_CHECKS
- ~BlotMapVector() {
- assert(Vector.size() >= Map.size()); // May differ due to blotting.
- for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E;
- ++I) {
- assert(I->second < Vector.size());
- assert(Vector[I->second].first == I->first);
- }
- for (typename VectorTy::const_iterator I = Vector.begin(), E = Vector.end();
- I != E; ++I)
- assert(!I->first || (Map.count(I->first) &&
- Map[I->first] == size_t(I - Vector.begin())));
- }
-#endif
-
- using iterator = typename VectorTy::iterator;
- using const_iterator = typename VectorTy::const_iterator;
-
- iterator begin() { return Vector.begin(); }
- iterator end() { return Vector.end(); }
- const_iterator begin() const { return Vector.begin(); }
- const_iterator end() const { return Vector.end(); }
-
- ValueT &operator[](const KeyT &Arg) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(Arg, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(std::make_pair(Arg, ValueT()));
- return Vector[Num].second;
- }
- return Vector[Pair.first->second].second;
- }
-
- std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &InsertPair) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(InsertPair.first, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(InsertPair);
- return std::make_pair(Vector.begin() + Num, true);
- }
- return std::make_pair(Vector.begin() + Pair.first->second, false);
- }
-
- iterator find(const KeyT &Key) {
- typename MapTy::iterator It = Map.find(Key);
- if (It == Map.end())
- return Vector.end();
- return Vector.begin() + It->second;
- }
-
- const_iterator find(const KeyT &Key) const {
- typename MapTy::const_iterator It = Map.find(Key);
- if (It == Map.end())
- return Vector.end();
- return Vector.begin() + It->second;
- }
-
- /// This is similar to erase, but instead of removing the element from the
- /// vector, it just zeros out the key in the vector. This leaves iterators
- /// intact, but clients must be prepared for zeroed-out keys when iterating.
- void blot(const KeyT &Key) {
- typename MapTy::iterator It = Map.find(Key);
- if (It == Map.end())
- return;
- Vector[It->second].first = KeyT();
- Map.erase(It);
- }
-
- void clear() {
- Map.clear();
- Vector.clear();
- }
-
- bool empty() const {
- assert(Map.empty() == Vector.empty());
- return Map.empty();
- }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
+//===- BlotMapVector.h - A MapVector with the blot operation ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
+
+#include "llvm/ADT/DenseMap.h"
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+/// An associative container with fast insertion-order (deterministic)
+/// iteration over its elements. Plus the special blot operation.
+template <class KeyT, class ValueT> class BlotMapVector {
+ /// Map keys to indices in Vector.
+ using MapTy = DenseMap<KeyT, size_t>;
+ MapTy Map;
+
+ /// Keys and values.
+ using VectorTy = std::vector<std::pair<KeyT, ValueT>>;
+ VectorTy Vector;
+
+public:
+#ifdef EXPENSIVE_CHECKS
+ ~BlotMapVector() {
+ assert(Vector.size() >= Map.size()); // May differ due to blotting.
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E;
+ ++I) {
+ assert(I->second < Vector.size());
+ assert(Vector[I->second].first == I->first);
+ }
+ for (typename VectorTy::const_iterator I = Vector.begin(), E = Vector.end();
+ I != E; ++I)
+ assert(!I->first || (Map.count(I->first) &&
+ Map[I->first] == size_t(I - Vector.begin())));
+ }
+#endif
+
+ using iterator = typename VectorTy::iterator;
+ using const_iterator = typename VectorTy::const_iterator;
+
+ iterator begin() { return Vector.begin(); }
+ iterator end() { return Vector.end(); }
+ const_iterator begin() const { return Vector.begin(); }
+ const_iterator end() const { return Vector.end(); }
+
+ ValueT &operator[](const KeyT &Arg) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(Arg, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(std::make_pair(Arg, ValueT()));
+ return Vector[Num].second;
+ }
+ return Vector[Pair.first->second].second;
+ }
+
+ std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &InsertPair) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(InsertPair.first, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(InsertPair);
+ return std::make_pair(Vector.begin() + Num, true);
+ }
+ return std::make_pair(Vector.begin() + Pair.first->second, false);
+ }
+
+ iterator find(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ const_iterator find(const KeyT &Key) const {
+ typename MapTy::const_iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ /// This is similar to erase, but instead of removing the element from the
+ /// vector, it just zeros out the key in the vector. This leaves iterators
+ /// intact, but clients must be prepared for zeroed-out keys when iterating.
+ void blot(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return;
+ Vector[It->second].first = KeyT();
+ Map.erase(It);
+ }
+
+ void clear() {
+ Map.clear();
+ Vector.clear();
+ }
+
+ bool empty() const {
+ assert(Map.empty() == Vector.empty());
+ return Map.empty();
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
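
What sets BlotMapVector apart from a plain map-plus-vector is blot(): the entry's slot in the vector is kept, so iterators and the positions of later entries stay valid, while the key is reset to a default value and the map entry is dropped. That is why clients iterating the container must tolerate zeroed-out keys. A cut-down illustration of the same behaviour, using std::unordered_map in place of DenseMap and ints as values:

#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct MiniBlotMap {
  std::unordered_map<std::string, size_t> Map; // key -> index in Vector
  std::vector<std::pair<std::string, int>> Vector;

  int &operator[](const std::string &K) {
    auto It = Map.find(K);
    if (It == Map.end()) {
      Map[K] = Vector.size();   // record the insertion-order position
      Vector.emplace_back(K, 0);
      return Vector.back().second;
    }
    return Vector[It->second].second;
  }

  void blot(const std::string &K) {
    auto It = Map.find(K);
    if (It == Map.end())
      return;
    Vector[It->second].first.clear(); // zero the key, keep the slot
    Map.erase(It);
  }
};

int main() {
  MiniBlotMap M;
  M["retain"] = 1;
  M["release"] = 2;
  M.blot("retain");
  for (auto &KV : M.Vector) // the blotted slot remains, with an emptied key
    std::printf("[%s] = %d\n", KV.first.c_str(), KV.second);
}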
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index c621c56c05..7f7f2dc89b 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -1,233 +1,233 @@
-//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file defines special dependency analysis routines used in Objective C
-/// ARC Optimizations.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "ProvenanceAnalysis.h"
+//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines special dependency analysis routines used in Objective-C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/IR/CFG.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-dependency"
-
-/// Test whether the given instruction can result in a reference count
-/// modification (positive or negative) for the pointer's object.
-bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- switch (Class) {
- case ARCInstKind::Autorelease:
- case ARCInstKind::AutoreleaseRV:
- case ARCInstKind::IntrinsicUser:
- case ARCInstKind::User:
- // These operations never directly modify a reference count.
- return false;
- default: break;
- }
-
- const auto *Call = cast<CallBase>(Inst);
-
- // See if AliasAnalysis can help us with the call.
- FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(Call);
- if (AliasAnalysis::onlyReadsMemory(MRB))
- return false;
- if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
- for (const Value *Op : Call->args()) {
+#include "llvm/IR/CFG.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-dependency"
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ switch (Class) {
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ // These operations never directly modify a reference count.
+ return false;
+ default: break;
+ }
+
+ const auto *Call = cast<CallBase>(Inst);
+
+ // See if AliasAnalysis can help us with the call.
+ FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(Call);
+ if (AliasAnalysis::onlyReadsMemory(MRB))
+ return false;
+ if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ for (const Value *Op : Call->args()) {
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
- }
-
- // Assume the worst.
- return true;
-}
-
-bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- // First perform a quick check if Class can not touch ref counts.
- if (!CanDecrementRefCount(Class))
- return false;
-
- // Otherwise, just use CanAlterRefCount for now.
- return CanAlterRefCount(Inst, Ptr, PA, Class);
-}
-
-/// Test whether the given instruction can "use" the given pointer's object in a
-/// way that requires the reference count to be positive.
-bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class) {
- // ARCInstKind::Call operations (as opposed to
- // ARCInstKind::CallOrUser) never "use" objc pointers.
- if (Class == ARCInstKind::Call)
- return false;
-
- // Consider various instructions which may have pointer arguments which are
- // not "uses".
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
- // Comparing a pointer with null, or any other constant, isn't really a use,
- // because we don't care what the pointer points to, or about the values
- // of any other dynamic reference-counted pointers.
- if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
- return false;
- } else if (const auto *CS = dyn_cast<CallBase>(Inst)) {
- // For calls, just check the arguments (and not the callee operand).
- for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) {
- const Value *Op = *OI;
+ return true;
+ }
+ return false;
+ }
+
+ // Assume the worst.
+ return true;
+}
+
+bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // First perform a quick check if Class can not touch ref counts.
+ if (!CanDecrementRefCount(Class))
+ return false;
+
+ // Otherwise, just use CanAlterRefCount for now.
+ return CanAlterRefCount(Inst, Ptr, PA, Class);
+}
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class) {
+ // ARCInstKind::Call operations (as opposed to
+ // ARCInstKind::CallOrUser) never "use" objc pointers.
+ if (Class == ARCInstKind::Call)
+ return false;
+
+ // Consider various instructions which may have pointer arguments which are
+ // not "uses".
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
+ // Comparing a pointer with null, or any other constant, isn't really a use,
+ // because we don't care what the pointer points to, or about the values
+ // of any other dynamic reference-counted pointers.
+ if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
+ return false;
+ } else if (const auto *CS = dyn_cast<CallBase>(Inst)) {
+ // For calls, just check the arguments (and not the callee operand).
+ for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) {
+ const Value *Op = *OI;
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- // Special-case stores, because we don't care about the stored value, just
- // the store address.
+ return true;
+ }
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // Special-case stores, because we don't care about the stored value, just
+ // the store address.
const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
- // If we can't tell what the underlying object was, assume there is a
- // dependence.
+ // If we can't tell what the underlying object was, assume there is a
+ // dependence.
return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr);
- }
-
- // Check each operand for a match.
- for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
- OI != OE; ++OI) {
- const Value *Op = *OI;
+ }
+
+ // Check each operand for a match.
+ for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
+ OI != OE; ++OI) {
+ const Value *Op = *OI;
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
-}
-
-/// Test if there can be dependencies on Inst through Arg. This function only
-/// tests dependencies relevant for removing pairs of calls.
-bool
-llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
- const Value *Arg, ProvenanceAnalysis &PA) {
- // If we've reached the definition of Arg, stop.
- if (Inst == Arg)
- return true;
-
- switch (Flavor) {
- case NeedsPositiveRetainCount: {
- ARCInstKind Class = GetARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::AutoreleasepoolPop:
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- return false;
- default:
- return CanUse(Inst, Arg, PA, Class);
- }
- }
-
- case AutoreleasePoolBoundary: {
- ARCInstKind Class = GetARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::AutoreleasepoolPop:
- case ARCInstKind::AutoreleasepoolPush:
- // These mark the end and begin of an autorelease pool scope.
- return true;
- default:
- // Nothing else does this.
- return false;
- }
- }
-
- case CanChangeRetainCount: {
- ARCInstKind Class = GetARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::AutoreleasepoolPop:
- // Conservatively assume this can decrement any count.
- return true;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- return false;
- default:
- return CanAlterRefCount(Inst, Arg, PA, Class);
- }
- }
-
- case RetainAutoreleaseDep:
- switch (GetBasicARCInstKind(Inst)) {
- case ARCInstKind::AutoreleasepoolPop:
- case ARCInstKind::AutoreleasepoolPush:
- // Don't merge an objc_autorelease with an objc_retain inside a different
- // autoreleasepool scope.
- return true;
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV:
- // Check for a retain of the same pointer for merging.
- return GetArgRCIdentityRoot(Inst) == Arg;
- default:
- // Nothing else matters for objc_retainAutorelease formation.
- return false;
- }
-
- case RetainAutoreleaseRVDep: {
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV:
- // Check for a retain of the same pointer for merging.
- return GetArgRCIdentityRoot(Inst) == Arg;
- default:
- // Anything that can autorelease interrupts
- // retainAutoreleaseReturnValue formation.
- return CanInterruptRV(Class);
- }
- }
-
- case RetainRVDep:
- return CanInterruptRV(GetBasicARCInstKind(Inst));
- }
-
- llvm_unreachable("Invalid dependence flavor");
-}
-
-/// Walk up the CFG from StartPos (which is in StartBB) and find local and
-/// non-local dependencies on Arg.
-///
-/// TODO: Cache results?
+ return true;
+ }
+ return false;
+}
+
+/// Test if there can be dependencies on Inst through Arg. This function only
+/// tests dependencies relevant for removing pairs of calls.
+bool
+llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
+ const Value *Arg, ProvenanceAnalysis &PA) {
+ // If we've reached the definition of Arg, stop.
+ if (Inst == Arg)
+ return true;
+
+ switch (Flavor) {
+ case NeedsPositiveRetainCount: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ return false;
+ default:
+ return CanUse(Inst, Arg, PA, Class);
+ }
+ }
+
+ case AutoreleasePoolBoundary: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ // These mark the end and begin of an autorelease pool scope.
+ return true;
+ default:
+ // Nothing else does this.
+ return false;
+ }
+ }
+
+ case CanChangeRetainCount: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively assume this can decrement any count.
+ return true;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ return false;
+ default:
+ return CanAlterRefCount(Inst, Arg, PA, Class);
+ }
+ }
+
+ case RetainAutoreleaseDep:
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ // Don't merge an objc_autorelease with an objc_retain inside a different
+ // autoreleasepool scope.
+ return true;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetArgRCIdentityRoot(Inst) == Arg;
+ default:
+ // Nothing else matters for objc_retainAutorelease formation.
+ return false;
+ }
+
+ case RetainAutoreleaseRVDep: {
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetArgRCIdentityRoot(Inst) == Arg;
+ default:
+ // Anything that can autorelease interrupts
+ // retainAutoreleaseReturnValue formation.
+ return CanInterruptRV(Class);
+ }
+ }
+
+ case RetainRVDep:
+ return CanInterruptRV(GetBasicARCInstKind(Inst));
+ }
+
+ llvm_unreachable("Invalid dependence flavor");
+}
+
+/// Walk up the CFG from StartPos (which is in StartBB) and find local and
+/// non-local dependencies on Arg.
+///
+/// TODO: Cache results?
static bool findDependencies(DependenceKind Flavor, const Value *Arg,
BasicBlock *StartBB, Instruction *StartInst,
SmallPtrSetImpl<Instruction *> &DependingInsts,
ProvenanceAnalysis &PA) {
- BasicBlock::iterator StartPos = StartInst->getIterator();
-
+ BasicBlock::iterator StartPos = StartInst->getIterator();
+
SmallPtrSet<const BasicBlock *, 4> Visited;
- SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
- Worklist.push_back(std::make_pair(StartBB, StartPos));
- do {
- std::pair<BasicBlock *, BasicBlock::iterator> Pair =
- Worklist.pop_back_val();
- BasicBlock *LocalStartBB = Pair.first;
- BasicBlock::iterator LocalStartPos = Pair.second;
- BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
- for (;;) {
- if (LocalStartPos == StartBBBegin) {
- pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
- if (PI == PE)
+ SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
+ Worklist.push_back(std::make_pair(StartBB, StartPos));
+ do {
+ std::pair<BasicBlock *, BasicBlock::iterator> Pair =
+ Worklist.pop_back_val();
+ BasicBlock *LocalStartBB = Pair.first;
+ BasicBlock::iterator LocalStartPos = Pair.second;
+ BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
+ for (;;) {
+ if (LocalStartPos == StartBBBegin) {
+ pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
+ if (PI == PE)
// Return if we've reached the function entry.
return false;
// Add the predecessors to the worklist.
@@ -236,30 +236,30 @@ static bool findDependencies(DependenceKind Flavor, const Value *Arg,
if (Visited.insert(PredBB).second)
Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
} while (++PI != PE);
- break;
- }
-
- Instruction *Inst = &*--LocalStartPos;
- if (Depends(Flavor, Inst, Arg, PA)) {
- DependingInsts.insert(Inst);
- break;
- }
- }
- } while (!Worklist.empty());
-
- // Determine whether the original StartBB post-dominates all of the blocks we
- // visited. If not, insert a sentinel indicating that most optimizations are
- // not safe.
- for (const BasicBlock *BB : Visited) {
- if (BB == StartBB)
- continue;
- for (const BasicBlock *Succ : successors(BB))
+ break;
+ }
+
+ Instruction *Inst = &*--LocalStartPos;
+ if (Depends(Flavor, Inst, Arg, PA)) {
+ DependingInsts.insert(Inst);
+ break;
+ }
+ }
+ } while (!Worklist.empty());
+
+ // Determine whether the original StartBB post-dominates all of the blocks we
+ // visited. If not, insert a sentinel indicating that most optimizations are
+ // not safe.
+ for (const BasicBlock *BB : Visited) {
+ if (BB == StartBB)
+ continue;
+ for (const BasicBlock *Succ : successors(BB))
if (Succ != StartBB && !Visited.count(Succ))
return false;
- }
+ }
return true;
-}
+}
llvm::Instruction *llvm::objcarc::findSingleDependency(DependenceKind Flavor,
const Value *Arg,
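The restored findDependencies() above is easier to follow in isolation: it walks backwards from StartInst, and whenever it drains a block it resumes from the terminator of every unvisited predecessor, recording the first instruction on each path that Depends() flags. The standalone sketch below reproduces just that worklist walk with the ARC-specific check replaced by a caller-supplied predicate; the names walkBackwards, Pred, and Hits are hypothetical, and the post-domination check that findDependencies() performs afterwards is omitted.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instruction.h"
#include <functional>
#include <utility>

using namespace llvm;

// Visit instructions strictly before StartInst, walking into predecessors
// once a block is exhausted; record the first instruction on each path for
// which Pred returns true.
static void walkBackwards(Instruction *StartInst,
                          const std::function<bool(Instruction *)> &Pred,
                          SmallPtrSetImpl<Instruction *> &Hits) {
  SmallPtrSet<const BasicBlock *, 4> Visited;
  SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
  Worklist.push_back(std::make_pair(StartInst->getParent(),
                                    StartInst->getIterator()));
  do {
    std::pair<BasicBlock *, BasicBlock::iterator> Pair =
        Worklist.pop_back_val();
    BasicBlock *BB = Pair.first;
    BasicBlock::iterator Pos = Pair.second;
    for (;;) {
      if (Pos == BB->begin()) {
        // Block drained: resume from the end of each not-yet-visited
        // predecessor.
        for (BasicBlock *PredBB : predecessors(BB))
          if (Visited.insert(PredBB).second)
            Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
        break;
      }
      Instruction *I = &*--Pos;
      if (Pred(I)) {
        Hits.insert(I); // First hit ends the search along this path.
        break;
      }
    }
  } while (!Worklist.empty());
}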
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 2a51683c38..cf4c05ebe9 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -1,88 +1,88 @@
-//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file declares special dependency analysis routines used in Objective C
-/// ARC Optimizations.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-
-namespace llvm {
- class BasicBlock;
- class Instruction;
- class Value;
-}
-
-namespace llvm {
-namespace objcarc {
-
-class ProvenanceAnalysis;
-
-/// \enum DependenceKind
-/// Defines different dependence kinds among various ARC constructs.
-///
-/// There are several kinds of dependence-like concepts in use here.
-///
-enum DependenceKind {
- NeedsPositiveRetainCount,
- AutoreleasePoolBoundary,
- CanChangeRetainCount,
- RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
- RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
- RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
-};
-
+//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+
+namespace llvm {
+ class BasicBlock;
+ class Instruction;
+ class Value;
+}
+
+namespace llvm {
+namespace objcarc {
+
+class ProvenanceAnalysis;
+
+/// \enum DependenceKind
+/// Defines different dependence kinds among various ARC constructs.
+///
+/// There are several kinds of dependence-like concepts in use here.
+///
+enum DependenceKind {
+ NeedsPositiveRetainCount,
+ AutoreleasePoolBoundary,
+ CanChangeRetainCount,
+ RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
+ RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
+ RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
+};
+
/// Find dependent instructions. If there is exactly one dependent instruction,
/// return it. Otherwise, return null.
llvm::Instruction *findSingleDependency(DependenceKind Flavor, const Value *Arg,
BasicBlock *StartBB,
Instruction *StartInst,
ProvenanceAnalysis &PA);
-
-bool
-Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
- ProvenanceAnalysis &PA);
-
-/// Test whether the given instruction can "use" the given pointer's object in a
-/// way that requires the reference count to be positive.
-bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
- ARCInstKind Class);
-
-/// Test whether the given instruction can result in a reference count
-/// modification (positive or negative) for the pointer's object.
-bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-
-/// Returns true if we can not conservatively prove that Inst can not decrement
-/// the reference count of Ptr. Returns false if we can.
-bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-
-static inline bool CanDecrementRefCount(const Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA) {
- return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst));
-}
-
-} // namespace objcarc
-} // namespace llvm
-
-#endif
+
+bool
+Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
+ ProvenanceAnalysis &PA);
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
+ ARCInstKind Class);
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+/// Returns true if we can not conservatively prove that Inst can not decrement
+/// the reference count of Ptr. Returns false if we can.
+bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+static inline bool CanDecrementRefCount(const Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA) {
+ return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst));
+}
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif
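As a usage sketch of the header restored above, the snippet below mirrors how ObjCARCContract::contractAutorelease (further down in this diff) drives findSingleDependency(): it looks upward from an autorelease for the single instruction that blocks objc_retainAutorelease formation and accepts it only if it is a retain of the same RC-identity root. The helper name findMatchingRetain is hypothetical, and the snippet assumes it lives inside lib/Transforms/ObjCARC so the local headers resolve.

#include "DependencyAnalysis.h"
#include "ProvenanceAnalysis.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;
using namespace llvm::objcarc;

// Hypothetical helper: find the objc_retain that an autorelease of the same
// RC-identity root could be fused with, or nullptr if anything blocks it.
static CallInst *findMatchingRetain(Instruction *Autorelease,
                                    ProvenanceAnalysis &PA) {
  const Value *Arg = GetArgRCIdentityRoot(Autorelease);
  // Ask for the single instruction above the autorelease that blocks
  // objc_retainAutorelease formation.
  Instruction *Dep =
      findSingleDependency(RetainAutoreleaseDep, Arg,
                           Autorelease->getParent(), Autorelease, PA);
  auto *Retain = dyn_cast_or_null<CallInst>(Dep);
  if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
      GetArgRCIdentityRoot(Retain) != Arg)
    return nullptr;
  return Retain;
}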
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp
index cbc1eca728..970136392f 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -1,39 +1,39 @@
-//===-- ObjCARC.cpp -------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements common infrastructure for libLLVMObjCARCOpts.a, which
-// implements several scalar transformations over the LLVM intermediate
-// representation, including the C bindings for that library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ObjCARC.h"
-#include "llvm-c/Initialization.h"
-#include "llvm/InitializePasses.h"
-
-namespace llvm {
- class PassRegistry;
-}
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-/// initializeObjCARCOptsPasses - Initialize all passes linked into the
-/// ObjCARCOpts library.
-void llvm::initializeObjCARCOpts(PassRegistry &Registry) {
- initializeObjCARCAAWrapperPassPass(Registry);
- initializeObjCARCAPElimPass(Registry);
- initializeObjCARCExpandPass(Registry);
+//===-- ObjCARC.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMObjCARCOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+
+namespace llvm {
+ class PassRegistry;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// initializeObjCARCOptsPasses - Initialize all passes linked into the
+/// ObjCARCOpts library.
+void llvm::initializeObjCARCOpts(PassRegistry &Registry) {
+ initializeObjCARCAAWrapperPassPass(Registry);
+ initializeObjCARCAPElimPass(Registry);
+ initializeObjCARCExpandPass(Registry);
initializeObjCARCContractLegacyPassPass(Registry);
initializeObjCARCOptLegacyPassPass(Registry);
- initializePAEvalPass(Registry);
-}
-
-void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {
- initializeObjCARCOpts(*unwrap(R));
-}
+ initializePAEvalPass(Registry);
+}
+
+void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {
+ initializeObjCARCOpts(*unwrap(R));
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h
index c80f5f597b..8227a8c6f7 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h
@@ -1,93 +1,93 @@
-//===- ObjCARC.h - ObjC ARC Optimization --------------*- C++ -*-----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file defines common definitions/declarations used by the ObjC ARC
-/// Optimizer. ARC stands for Automatic Reference Counting and is a system for
-/// managing reference counts for objects in Objective C.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
-
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-namespace llvm {
-namespace objcarc {
-
-/// Erase the given instruction.
-///
-/// Many ObjC calls return their argument verbatim,
-/// so if it's such a call and the return value has users, replace them with the
-/// argument value.
-///
-static inline void EraseInstruction(Instruction *CI) {
- Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
-
- bool Unused = CI->use_empty();
-
- if (!Unused) {
- // Replace the return value with the argument.
- assert((IsForwarding(GetBasicARCInstKind(CI)) ||
- (IsNoopOnNull(GetBasicARCInstKind(CI)) &&
- IsNullOrUndef(OldArg->stripPointerCasts()))) &&
- "Can't delete non-forwarding instruction with users!");
- CI->replaceAllUsesWith(OldArg);
- }
-
- CI->eraseFromParent();
-
- if (Unused)
- RecursivelyDeleteTriviallyDeadInstructions(OldArg);
-}
-
-/// If Inst is a ReturnRV and its operand is a call or invoke, return the
-/// operand. Otherwise return null.
-static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
- ARCInstKind Class) {
- if (Class != ARCInstKind::RetainRV)
- return nullptr;
-
- const auto *Opnd = Inst.getOperand(0)->stripPointerCasts();
- if (const auto *C = dyn_cast<CallInst>(Opnd))
- return C;
- return dyn_cast<InvokeInst>(Opnd);
-}
-
-/// Return the list of PHI nodes that are equivalent to PN.
-template<class PHINodeTy, class VectorTy>
-void getEquivalentPHIs(PHINodeTy &PN, VectorTy &PHIList) {
- auto *BB = PN.getParent();
- for (auto &P : BB->phis()) {
- if (&P == &PN) // Do not add PN to the list.
- continue;
- unsigned I = 0, E = PN.getNumIncomingValues();
- for (; I < E; ++I) {
- auto *BB = PN.getIncomingBlock(I);
- auto *PNOpnd = PN.getIncomingValue(I)->stripPointerCasts();
- auto *POpnd = P.getIncomingValueForBlock(BB)->stripPointerCasts();
- if (PNOpnd != POpnd)
- break;
- }
- if (I == E)
- PHIList.push_back(&P);
- }
-}
-
-} // end namespace objcarc
-} // end namespace llvm
-
-#endif
+//===- ObjCARC.h - ObjC ARC Optimization --------------*- C++ -*-----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines common definitions/declarations used by the ObjC ARC
+/// Optimizer. ARC stands for Automatic Reference Counting and is a system for
+/// managing reference counts for objects in Objective C.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
+
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+namespace llvm {
+namespace objcarc {
+
+/// Erase the given instruction.
+///
+/// Many ObjC calls return their argument verbatim,
+/// so if it's such a call and the return value has users, replace them with the
+/// argument value.
+///
+static inline void EraseInstruction(Instruction *CI) {
+ Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+ bool Unused = CI->use_empty();
+
+ if (!Unused) {
+ // Replace the return value with the argument.
+ assert((IsForwarding(GetBasicARCInstKind(CI)) ||
+ (IsNoopOnNull(GetBasicARCInstKind(CI)) &&
+ IsNullOrUndef(OldArg->stripPointerCasts()))) &&
+ "Can't delete non-forwarding instruction with users!");
+ CI->replaceAllUsesWith(OldArg);
+ }
+
+ CI->eraseFromParent();
+
+ if (Unused)
+ RecursivelyDeleteTriviallyDeadInstructions(OldArg);
+}
+
+/// If Inst is a ReturnRV and its operand is a call or invoke, return the
+/// operand. Otherwise return null.
+static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
+ ARCInstKind Class) {
+ if (Class != ARCInstKind::RetainRV)
+ return nullptr;
+
+ const auto *Opnd = Inst.getOperand(0)->stripPointerCasts();
+ if (const auto *C = dyn_cast<CallInst>(Opnd))
+ return C;
+ return dyn_cast<InvokeInst>(Opnd);
+}
+
+/// Return the list of PHI nodes that are equivalent to PN.
+template<class PHINodeTy, class VectorTy>
+void getEquivalentPHIs(PHINodeTy &PN, VectorTy &PHIList) {
+ auto *BB = PN.getParent();
+ for (auto &P : BB->phis()) {
+ if (&P == &PN) // Do not add PN to the list.
+ continue;
+ unsigned I = 0, E = PN.getNumIncomingValues();
+ for (; I < E; ++I) {
+ auto *BB = PN.getIncomingBlock(I);
+ auto *PNOpnd = PN.getIncomingValue(I)->stripPointerCasts();
+ auto *POpnd = P.getIncomingValueForBlock(BB)->stripPointerCasts();
+ if (PNOpnd != POpnd)
+ break;
+ }
+ if (I == E)
+ PHIList.push_back(&P);
+ }
+}
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif
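The getEquivalentPHIs() template restored above collects every PHI in the same block whose incoming values match PN's for each incoming block once pointer casts are stripped, so callers can treat the whole group as one RC-identical value. A minimal, hypothetical wrapper (collectPHIGroup is not part of the tree) showing the intended call shape:

#include "ObjCARC.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;
using namespace llvm::objcarc;

// Gather PN plus every PHI that getEquivalentPHIs() treats as interchangeable
// with it, so a transform can process the whole group uniformly.
static void collectPHIGroup(PHINode &PN,
                            SmallVectorImpl<const Value *> &Group) {
  Group.push_back(&PN);
  getEquivalentPHIs(PN, Group);
}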
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index 3a4aea7574..6a928f2c7f 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -1,149 +1,149 @@
-//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file implements optimizations which remove extraneous
-/// autorelease pools.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "ObjCARC.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/Constants.h"
+//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file implements optimizations which remove extraneous
+/// autorelease pools.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-ap-elim"
-
-namespace {
-
-/// Interprocedurally determine if calls made by the given call site can
-/// possibly produce autoreleases.
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-ap-elim"
+
+namespace {
+
+/// Interprocedurally determine if calls made by the given call site can
+/// possibly produce autoreleases.
bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) {
- if (const Function *Callee = CB.getCalledFunction()) {
- if (!Callee->hasExactDefinition())
- return true;
- for (const BasicBlock &BB : *Callee) {
- for (const Instruction &I : BB)
- if (const CallBase *JCB = dyn_cast<CallBase>(&I))
- // This recursion depth limit is arbitrary. It's just great
- // enough to cover known interesting testcases.
- if (Depth < 3 && !JCB->onlyReadsMemory() &&
- MayAutorelease(*JCB, Depth + 1))
- return true;
- }
- return false;
- }
-
- return true;
-}
-
+ if (const Function *Callee = CB.getCalledFunction()) {
+ if (!Callee->hasExactDefinition())
+ return true;
+ for (const BasicBlock &BB : *Callee) {
+ for (const Instruction &I : BB)
+ if (const CallBase *JCB = dyn_cast<CallBase>(&I))
+ // This recursion depth limit is arbitrary. It's just great
+ // enough to cover known interesting testcases.
+ if (Depth < 3 && !JCB->onlyReadsMemory() &&
+ MayAutorelease(*JCB, Depth + 1))
+ return true;
+ }
+ return false;
+ }
+
+ return true;
+}
+
bool OptimizeBB(BasicBlock *BB) {
- bool Changed = false;
-
- Instruction *Push = nullptr;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
- Instruction *Inst = &*I++;
- switch (GetBasicARCInstKind(Inst)) {
- case ARCInstKind::AutoreleasepoolPush:
- Push = Inst;
- break;
- case ARCInstKind::AutoreleasepoolPop:
- // If this pop matches a push and nothing in between can autorelease,
- // zap the pair.
- if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
- "autorelease pair:\n"
- " Pop: "
- << *Inst << "\n"
- << " Push: " << *Push
- << "\n");
- Inst->eraseFromParent();
- Push->eraseFromParent();
- }
- Push = nullptr;
- break;
- case ARCInstKind::CallOrUser:
- if (MayAutorelease(cast<CallBase>(*Inst)))
- Push = nullptr;
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
+ bool Changed = false;
+
+ Instruction *Push = nullptr;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ Instruction *Inst = &*I++;
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::AutoreleasepoolPush:
+ Push = Inst;
+ break;
+ case ARCInstKind::AutoreleasepoolPop:
+ // If this pop matches a push and nothing in between can autorelease,
+ // zap the pair.
+ if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
+ "autorelease pair:\n"
+ " Pop: "
+ << *Inst << "\n"
+ << " Push: " << *Push
+ << "\n");
+ Inst->eraseFromParent();
+ Push->eraseFromParent();
+ }
+ Push = nullptr;
+ break;
+ case ARCInstKind::CallOrUser:
+ if (MayAutorelease(cast<CallBase>(*Inst)))
+ Push = nullptr;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
+
bool runImpl(Module &M) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!ModuleHasARC(M))
- return false;
- // Find the llvm.global_ctors variable, as the first step in
- // identifying the global constructors. In theory, unnecessary autorelease
- // pools could occur anywhere, but in practice it's pretty rare. Global
- // ctors are a place where autorelease pools get inserted automatically,
- // so it's pretty common for them to be unnecessary, and it's pretty
- // profitable to eliminate them.
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return false;
-
- assert(GV->hasDefinitiveInitializer() &&
- "llvm.global_ctors is uncooperative!");
-
- bool Changed = false;
-
- // Dig the constructor functions out of GV's initializer.
- ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
- for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
- OI != OE; ++OI) {
- Value *Op = *OI;
- // llvm.global_ctors is an array of three-field structs where the second
- // members are constructor functions.
- Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
- // If the user used a constructor function with the wrong signature and
- // it got bitcasted or whatever, look the other way.
- if (!F)
- continue;
- // Only look at function definitions.
- if (F->isDeclaration())
- continue;
- // Only look at functions with one basic block.
- if (std::next(F->begin()) != F->end())
- continue;
- // Ok, a single-block constructor function definition. Try to optimize it.
- Changed |= OptimizeBB(&F->front());
- }
-
- return Changed;
-}
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!ModuleHasARC(M))
+ return false;
+ // Find the llvm.global_ctors variable, as the first step in
+ // identifying the global constructors. In theory, unnecessary autorelease
+ // pools could occur anywhere, but in practice it's pretty rare. Global
+ // ctors are a place where autorelease pools get inserted automatically,
+ // so it's pretty common for them to be unnecessary, and it's pretty
+ // profitable to eliminate them.
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return false;
+
+ assert(GV->hasDefinitiveInitializer() &&
+ "llvm.global_ctors is uncooperative!");
+
+ bool Changed = false;
+
+ // Dig the constructor functions out of GV's initializer.
+ ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+ for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
+ OI != OE; ++OI) {
+ Value *Op = *OI;
+ // llvm.global_ctors is an array of three-field structs where the second
+ // members are constructor functions.
+ Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+ // If the user used a constructor function with the wrong signature and
+ // it got bitcasted or whatever, look the other way.
+ if (!F)
+ continue;
+ // Only look at function definitions.
+ if (F->isDeclaration())
+ continue;
+ // Only look at functions with one basic block.
+ if (std::next(F->begin()) != F->end())
+ continue;
+ // Ok, a single-block constructor function definition. Try to optimize it.
+ Changed |= OptimizeBB(&F->front());
+ }
+
+ return Changed;
+}
/// Autorelease pool elimination.
class ObjCARCAPElim : public ModulePass {
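To make the restored OptimizeBB() logic concrete: the pass pairs an objc_autoreleasePoolPop with the push whose token it consumes and, if MayAutorelease() rejects every call in between, erases both. The sketch below merely constructs that redundant pattern with IRBuilder so the shape is visible; emitRedundantPoolPair is a hypothetical helper, and in the actual pass only single-block functions referenced from llvm.global_ctors are scanned.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper: emit a push/pop pair with nothing in between that
// could autorelease, which is exactly the pattern OptimizeBB() deletes.
static void emitRedundantPoolPair(Module &M, IRBuilder<> &B) {
  Type *I8Ptr = B.getInt8PtrTy();
  FunctionCallee Push = M.getOrInsertFunction(
      "objc_autoreleasePoolPush", FunctionType::get(I8Ptr, false));
  FunctionCallee Pop = M.getOrInsertFunction(
      "objc_autoreleasePoolPop",
      FunctionType::get(B.getVoidTy(), {I8Ptr}, false));

  // %pool = call i8* @objc_autoreleasePoolPush()
  Value *Pool = B.CreateCall(Push);
  // ... no intervening call for which MayAutorelease() returns true ...
  // call void @objc_autoreleasePoolPop(i8* %pool)
  B.CreateCall(Pop, {Pool});
  // The pass pairs the pop with this push and erases both calls, since the
  // pool can never contain anything.
}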
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 1419e4dacb..86d161116e 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -1,100 +1,100 @@
-//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file defines late ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file mainly deals with ``contracting'' multiple lower level
-/// operations into singular higher level operations through pattern matching.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-// TODO: ObjCARCContract could insert PHI nodes when uses aren't
-// dominated by single calls.
-
-#include "ARCRuntimeEntryPoints.h"
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "ProvenanceAnalysis.h"
-#include "llvm/ADT/Statistic.h"
+//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines late ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file mainly deals with ``contracting'' multiple lower level
+/// operations into singular higher level operations through pattern matching.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: ObjCARCContract could insert PHI nodes when uses aren't
+// dominated by single calls.
+
+#include "ARCRuntimeEntryPoints.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Operator.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-contract"
-
-STATISTIC(NumPeeps, "Number of calls peephole-optimized");
-STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
-
-//===----------------------------------------------------------------------===//
-// Declarations
-//===----------------------------------------------------------------------===//
-
-namespace {
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-contract"
+
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
+
+//===----------------------------------------------------------------------===//
+// Declarations
+//===----------------------------------------------------------------------===//
+
+namespace {
/// Late ARC optimizations
///
/// These change the IR in a way that makes it difficult to be analyzed by
/// ObjCARCOpt, so it's run late.
-
+
class ObjCARCContract {
bool Changed;
AAResults *AA;
DominatorTree *DT;
ProvenanceAnalysis PA;
ARCRuntimeEntryPoints EP;
-
+
/// A flag indicating whether this optimization pass should run.
bool Run;
-
+
/// The inline asm string to insert between calls and RetainRV calls to make
/// the optimization work on targets which need it.
const MDString *RVInstMarker;
-
+
/// The set of inserted objc_storeStrong calls. If at the end of walking the
/// function we have found no alloca instructions, these calls can be marked
/// "tail".
SmallPtrSet<CallInst *, 8> StoreStrongCalls;
-
+
/// Returns true if we eliminated Inst.
bool tryToPeepholeInstruction(
Function &F, Instruction *Inst, inst_iterator &Iter,
bool &TailOkForStoreStrong,
const DenseMap<BasicBlock *, ColorVector> &BlockColors);
-
+
bool optimizeRetainCall(Function &F, Instruction *Retain);
-
+
bool contractAutorelease(Function &F, Instruction *Autorelease,
ARCInstKind Class);
-
+
void tryToContractReleaseIntoStoreStrong(
Instruction *Release, inst_iterator &Iter,
const DenseMap<BasicBlock *, ColorVector> &BlockColors);
-
+
public:
bool init(Module &M);
bool run(Function &F, AAResults *AA, DominatorTree *DT);
@@ -113,426 +113,426 @@ public:
initializeObjCARCContractLegacyPassPass(*PassRegistry::getPassRegistry());
}
};
-}
-
-//===----------------------------------------------------------------------===//
-// Implementation
-//===----------------------------------------------------------------------===//
-
-/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
-/// return value. We do this late so we do not disrupt the dataflow analysis in
-/// ObjCARCOpt.
-bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
- const auto *Call = dyn_cast<CallBase>(GetArgRCIdentityRoot(Retain));
- if (!Call)
- return false;
- if (Call->getParent() != Retain->getParent())
- return false;
-
- // Check that the call is next to the retain.
- BasicBlock::const_iterator I = ++Call->getIterator();
- while (IsNoopInstruction(&*I))
- ++I;
- if (&*I != Retain)
- return false;
-
- // Turn it to an objc_retainAutoreleasedReturnValue.
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(
- dbgs() << "Transforming objc_retain => "
- "objc_retainAutoreleasedReturnValue since the operand is a "
- "return value.\nOld: "
- << *Retain << "\n");
-
- // We do not have to worry about tail calls/does not throw since
- // retain/retainRV have the same properties.
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
- cast<CallInst>(Retain)->setCalledFunction(Decl);
-
- LLVM_DEBUG(dbgs() << "New: " << *Retain << "\n");
- return true;
-}
-
-/// Merge an autorelease with a retain into a fused call.
+}
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
+/// return value. We do this late so we do not disrupt the dataflow analysis in
+/// ObjCARCOpt.
+bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
+ const auto *Call = dyn_cast<CallBase>(GetArgRCIdentityRoot(Retain));
+ if (!Call)
+ return false;
+ if (Call->getParent() != Retain->getParent())
+ return false;
+
+ // Check that the call is next to the retain.
+ BasicBlock::const_iterator I = ++Call->getIterator();
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I != Retain)
+ return false;
+
+ // Turn it to an objc_retainAutoreleasedReturnValue.
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_retain => "
+ "objc_retainAutoreleasedReturnValue since the operand is a "
+ "return value.\nOld: "
+ << *Retain << "\n");
+
+ // We do not have to worry about tail calls/does not throw since
+ // retain/retainRV have the same properties.
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
+ cast<CallInst>(Retain)->setCalledFunction(Decl);
+
+ LLVM_DEBUG(dbgs() << "New: " << *Retain << "\n");
+ return true;
+}
+
+/// Merge an autorelease with a retain into a fused call.
bool ObjCARCContract::contractAutorelease(Function &F, Instruction *Autorelease,
ARCInstKind Class) {
- const Value *Arg = GetArgRCIdentityRoot(Autorelease);
-
- // Check that there are no instructions between the retain and the autorelease
- // (such as an autorelease_pop) which may change the count.
+ const Value *Arg = GetArgRCIdentityRoot(Autorelease);
+
+ // Check that there are no instructions between the retain and the autorelease
+ // (such as an autorelease_pop) which may change the count.
DependenceKind DK = Class == ARCInstKind::AutoreleaseRV
? RetainAutoreleaseRVDep
: RetainAutoreleaseDep;
auto *Retain = dyn_cast_or_null<CallInst>(
findSingleDependency(DK, Arg, Autorelease->getParent(), Autorelease, PA));
-
- if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
- GetArgRCIdentityRoot(Retain) != Arg)
- return false;
-
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(dbgs() << " Fusing retain/autorelease!\n"
- " Autorelease:"
- << *Autorelease
- << "\n"
- " Retain: "
- << *Retain << "\n");
-
- Function *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
- ? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
- : ARCRuntimeEntryPointKind::RetainAutorelease);
- Retain->setCalledFunction(Decl);
-
- LLVM_DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
-
- EraseInstruction(Autorelease);
- return true;
-}
-
-static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
- Instruction *Release,
- ProvenanceAnalysis &PA,
+
+ if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
+ GetArgRCIdentityRoot(Retain) != Arg)
+ return false;
+
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(dbgs() << " Fusing retain/autorelease!\n"
+ " Autorelease:"
+ << *Autorelease
+ << "\n"
+ " Retain: "
+ << *Retain << "\n");
+
+ Function *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
+ ? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
+ : ARCRuntimeEntryPointKind::RetainAutorelease);
+ Retain->setCalledFunction(Decl);
+
+ LLVM_DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
+
+ EraseInstruction(Autorelease);
+ return true;
+}
+
+static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
+ Instruction *Release,
+ ProvenanceAnalysis &PA,
AAResults *AA) {
- StoreInst *Store = nullptr;
- bool SawRelease = false;
-
- // Get the location associated with Load.
- MemoryLocation Loc = MemoryLocation::get(Load);
- auto *LocPtr = Loc.Ptr->stripPointerCasts();
-
- // Walk down to find the store and the release, which may be in either order.
- for (auto I = std::next(BasicBlock::iterator(Load)),
- E = Load->getParent()->end();
- I != E; ++I) {
- // If we found the store we were looking for and saw the release,
- // break. There is no more work to be done.
- if (Store && SawRelease)
- break;
-
- // Now we know that we have not seen either the store or the release. If I
- // is the release, mark that we saw the release and continue.
- Instruction *Inst = &*I;
- if (Inst == Release) {
- SawRelease = true;
- continue;
- }
-
- // Otherwise, we check if Inst is a "good" store. Grab the instruction class
- // of Inst.
- ARCInstKind Class = GetBasicARCInstKind(Inst);
-
- // If Inst is an unrelated retain, we don't care about it.
- //
- // TODO: This is one area where the optimization could be made more
- // aggressive.
- if (IsRetain(Class))
- continue;
-
- // If we have seen the store, but not the release...
- if (Store) {
- // We need to make sure that it is safe to move the release from its
- // current position to the store. This implies proving that any
- // instruction in between Store and the Release conservatively can not use
- // the RCIdentityRoot of Release. If we can prove we can ignore Inst, so
- // continue...
- if (!CanUse(Inst, Load, PA, Class)) {
- continue;
- }
-
- // Otherwise, be conservative and return nullptr.
- return nullptr;
- }
-
- // Ok, now we know we have not seen a store yet. See if Inst can write to
- // our load location, if it can not, just ignore the instruction.
- if (!isModSet(AA->getModRefInfo(Inst, Loc)))
- continue;
-
- Store = dyn_cast<StoreInst>(Inst);
-
- // If Inst can, then check if Inst is a simple store. If Inst is not a
- // store or a store that is not simple, then we have something we do not
- // understand writing to this memory implying we can not move the load
- // over the write to any subsequent store that we may find.
- if (!Store || !Store->isSimple())
- return nullptr;
-
- // Then make sure that the pointer we are storing to is Ptr. If so, we
- // found our Store!
- if (Store->getPointerOperand()->stripPointerCasts() == LocPtr)
- continue;
-
- // Otherwise, we have an unknown store to some other ptr that clobbers
- // Loc.Ptr. Bail!
- return nullptr;
- }
-
- // If we did not find the store or did not see the release, fail.
- if (!Store || !SawRelease)
- return nullptr;
-
- // We succeeded!
- return Store;
-}
-
-static Instruction *
-findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
- Instruction *Release,
- ProvenanceAnalysis &PA) {
- // Walk up from the Store to find the retain.
- BasicBlock::iterator I = Store->getIterator();
- BasicBlock::iterator Begin = Store->getParent()->begin();
- while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) {
- Instruction *Inst = &*I;
-
- // It is only safe to move the retain to the store if we can prove
- // conservatively that nothing besides the release can decrement reference
- // counts in between the retain and the store.
- if (CanDecrementRefCount(Inst, New, PA) && Inst != Release)
- return nullptr;
- --I;
- }
- Instruction *Retain = &*I;
- if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain)
- return nullptr;
- if (GetArgRCIdentityRoot(Retain) != New)
- return nullptr;
- return Retain;
-}
-
-/// Create a call instruction with the correct funclet token. Should be used
-/// instead of calling CallInst::Create directly.
-static CallInst *
-createCallInst(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
- const Twine &NameStr, Instruction *InsertBefore,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- SmallVector<OperandBundleDef, 1> OpBundles;
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second;
- assert(CV.size() == 1 && "non-unique color for block!");
- Instruction *EHPad = CV.front()->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
-
- return CallInst::Create(FTy, Func, Args, OpBundles, NameStr, InsertBefore);
-}
-
-static CallInst *
-createCallInst(FunctionCallee Func, ArrayRef<Value *> Args, const Twine &NameStr,
- Instruction *InsertBefore,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- return createCallInst(Func.getFunctionType(), Func.getCallee(), Args, NameStr,
- InsertBefore, BlockColors);
-}
-
-/// Attempt to merge an objc_release with a store, load, and objc_retain to form
-/// an objc_storeStrong. An objc_storeStrong:
-///
-/// objc_storeStrong(i8** %old_ptr, i8* new_value)
-///
-/// is equivalent to the following IR sequence:
-///
-/// ; Load old value.
-/// %old_value = load i8** %old_ptr (1)
-///
-/// ; Increment the new value and then release the old value. This must occur
-/// ; in order in case old_value releases new_value in its destructor causing
-/// ; us to potentially have a dangling ptr.
-/// tail call i8* @objc_retain(i8* %new_value) (2)
-/// tail call void @objc_release(i8* %old_value) (3)
-///
-/// ; Store the new_value into old_ptr
-/// store i8* %new_value, i8** %old_ptr (4)
-///
-/// The safety of this optimization is based around the following
-/// considerations:
-///
-/// 1. We are forming the store strong at the store. Thus to perform this
-/// optimization it must be safe to move the retain, load, and release to
-/// (4).
-/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
-/// safe.
-void ObjCARCContract::tryToContractReleaseIntoStoreStrong(
- Instruction *Release, inst_iterator &Iter,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- // See if we are releasing something that we just loaded.
- auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
- if (!Load || !Load->isSimple())
- return;
-
- // For now, require everything to be in one basic block.
- BasicBlock *BB = Release->getParent();
- if (Load->getParent() != BB)
- return;
-
- // First scan down the BB from Load, looking for a store of the RCIdentityRoot
- // of Load's
- StoreInst *Store =
- findSafeStoreForStoreStrongContraction(Load, Release, PA, AA);
- // If we fail, bail.
- if (!Store)
- return;
-
- // Then find what new_value's RCIdentity Root is.
- Value *New = GetRCIdentityRoot(Store->getValueOperand());
-
- // Then walk up the BB and look for a retain on New without any intervening
- // instructions which conservatively might decrement ref counts.
- Instruction *Retain =
- findRetainForStoreStrongContraction(New, Store, Release, PA);
-
- // If we fail, bail.
- if (!Retain)
- return;
-
- Changed = true;
- ++NumStoreStrongs;
-
- LLVM_DEBUG(
- llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n"
- << " Old:\n"
- << " Store: " << *Store << "\n"
- << " Release: " << *Release << "\n"
- << " Retain: " << *Retain << "\n"
- << " Load: " << *Load << "\n");
-
- LLVMContext &C = Release->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *I8XX = PointerType::getUnqual(I8X);
-
- Value *Args[] = { Load->getPointerOperand(), New };
- if (Args[0]->getType() != I8XX)
- Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
- if (Args[1]->getType() != I8X)
- Args[1] = new BitCastInst(Args[1], I8X, "", Store);
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
- CallInst *StoreStrong = createCallInst(Decl, Args, "", Store, BlockColors);
- StoreStrong->setDoesNotThrow();
- StoreStrong->setDebugLoc(Store->getDebugLoc());
-
- // We can't set the tail flag yet, because we haven't yet determined
- // whether there are any escaping allocas. Remember this call, so that
- // we can set the tail flag once we know it's safe.
- StoreStrongCalls.insert(StoreStrong);
-
- LLVM_DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong
- << "\n");
-
- if (&*Iter == Retain) ++Iter;
- if (&*Iter == Store) ++Iter;
- Store->eraseFromParent();
- Release->eraseFromParent();
- EraseInstruction(Retain);
- if (Load->use_empty())
- Load->eraseFromParent();
-}
-
-bool ObjCARCContract::tryToPeepholeInstruction(
- Function &F, Instruction *Inst, inst_iterator &Iter,
+ StoreInst *Store = nullptr;
+ bool SawRelease = false;
+
+ // Get the location associated with Load.
+ MemoryLocation Loc = MemoryLocation::get(Load);
+ auto *LocPtr = Loc.Ptr->stripPointerCasts();
+
+ // Walk down to find the store and the release, which may be in either order.
+ for (auto I = std::next(BasicBlock::iterator(Load)),
+ E = Load->getParent()->end();
+ I != E; ++I) {
+ // If we found the store we were looking for and saw the release,
+ // break. There is no more work to be done.
+ if (Store && SawRelease)
+ break;
+
+ // Now we know that we have not seen either the store or the release. If I
+ // is the release, mark that we saw the release and continue.
+ Instruction *Inst = &*I;
+ if (Inst == Release) {
+ SawRelease = true;
+ continue;
+ }
+
+ // Otherwise, we check if Inst is a "good" store. Grab the instruction class
+ // of Inst.
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+
+ // If Inst is an unrelated retain, we don't care about it.
+ //
+ // TODO: This is one area where the optimization could be made more
+ // aggressive.
+ if (IsRetain(Class))
+ continue;
+
+ // If we have seen the store, but not the release...
+ if (Store) {
+ // We need to make sure that it is safe to move the release from its
+ // current position to the store. This implies proving that any
+ // instruction in between Store and the Release conservatively can not use
+ // the RCIdentityRoot of Release. If we can prove we can ignore Inst, so
+ // continue...
+ if (!CanUse(Inst, Load, PA, Class)) {
+ continue;
+ }
+
+ // Otherwise, be conservative and return nullptr.
+ return nullptr;
+ }
+
+ // Ok, now we know we have not seen a store yet. See if Inst can write to
+ // our load location, if it can not, just ignore the instruction.
+ if (!isModSet(AA->getModRefInfo(Inst, Loc)))
+ continue;
+
+ Store = dyn_cast<StoreInst>(Inst);
+
+ // If Inst can, then check if Inst is a simple store. If Inst is not a
+ // store or a store that is not simple, then we have something we do not
+ // understand writing to this memory implying we can not move the load
+ // over the write to any subsequent store that we may find.
+ if (!Store || !Store->isSimple())
+ return nullptr;
+
+ // Then make sure that the pointer we are storing to is Ptr. If so, we
+ // found our Store!
+ if (Store->getPointerOperand()->stripPointerCasts() == LocPtr)
+ continue;
+
+ // Otherwise, we have an unknown store to some other ptr that clobbers
+ // Loc.Ptr. Bail!
+ return nullptr;
+ }
+
+ // If we did not find the store or did not see the release, fail.
+ if (!Store || !SawRelease)
+ return nullptr;
+
+ // We succeeded!
+ return Store;
+}
+
+static Instruction *
+findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
+ Instruction *Release,
+ ProvenanceAnalysis &PA) {
+ // Walk up from the Store to find the retain.
+ BasicBlock::iterator I = Store->getIterator();
+ BasicBlock::iterator Begin = Store->getParent()->begin();
+ while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) {
+ Instruction *Inst = &*I;
+
+ // It is only safe to move the retain to the store if we can prove
+ // conservatively that nothing besides the release can decrement reference
+ // counts in between the retain and the store.
+ if (CanDecrementRefCount(Inst, New, PA) && Inst != Release)
+ return nullptr;
+ --I;
+ }
+ Instruction *Retain = &*I;
+ if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain)
+ return nullptr;
+ if (GetArgRCIdentityRoot(Retain) != New)
+ return nullptr;
+ return Retain;
+}
+
+/// Create a call instruction with the correct funclet token. Should be used
+/// instead of calling CallInst::Create directly.
+static CallInst *
+createCallInst(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
+ const Twine &NameStr, Instruction *InsertBefore,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(FTy, Func, Args, OpBundles, NameStr, InsertBefore);
+}
+
+static CallInst *
+createCallInst(FunctionCallee Func, ArrayRef<Value *> Args, const Twine &NameStr,
+ Instruction *InsertBefore,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ return createCallInst(Func.getFunctionType(), Func.getCallee(), Args, NameStr,
+ InsertBefore, BlockColors);
+}
+
+/// Attempt to merge an objc_release with a store, load, and objc_retain to form
+/// an objc_storeStrong. An objc_storeStrong:
+///
+/// objc_storeStrong(i8** %old_ptr, i8* new_value)
+///
+/// is equivalent to the following IR sequence:
+///
+/// ; Load old value.
+/// %old_value = load i8** %old_ptr (1)
+///
+/// ; Increment the new value and then release the old value. This must occur
+/// ; in order in case old_value releases new_value in its destructor causing
+/// ; us to potentially have a dangling ptr.
+/// tail call i8* @objc_retain(i8* %new_value) (2)
+/// tail call void @objc_release(i8* %old_value) (3)
+///
+/// ; Store the new_value into old_ptr
+/// store i8* %new_value, i8** %old_ptr (4)
+///
+/// The safety of this optimization is based around the following
+/// considerations:
+///
+/// 1. We are forming the store strong at the store. Thus to perform this
+/// optimization it must be safe to move the retain, load, and release to
+/// (4).
+/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
+/// safe.
+void ObjCARCContract::tryToContractReleaseIntoStoreStrong(
+ Instruction *Release, inst_iterator &Iter,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ // See if we are releasing something that we just loaded.
+ auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
+ if (!Load || !Load->isSimple())
+ return;
+
+ // For now, require everything to be in one basic block.
+ BasicBlock *BB = Release->getParent();
+ if (Load->getParent() != BB)
+ return;
+
+ // First scan down the BB from Load, looking for a store of the RCIdentityRoot
+ // of Load's
+ StoreInst *Store =
+ findSafeStoreForStoreStrongContraction(Load, Release, PA, AA);
+ // If we fail, bail.
+ if (!Store)
+ return;
+
+ // Then find what new_value's RCIdentity Root is.
+ Value *New = GetRCIdentityRoot(Store->getValueOperand());
+
+ // Then walk up the BB and look for a retain on New without any intervening
+ // instructions which conservatively might decrement ref counts.
+ Instruction *Retain =
+ findRetainForStoreStrongContraction(New, Store, Release, PA);
+
+ // If we fail, bail.
+ if (!Retain)
+ return;
+
+ Changed = true;
+ ++NumStoreStrongs;
+
+ LLVM_DEBUG(
+ llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n"
+ << " Old:\n"
+ << " Store: " << *Store << "\n"
+ << " Release: " << *Release << "\n"
+ << " Retain: " << *Retain << "\n"
+ << " Load: " << *Load << "\n");
+
+ LLVMContext &C = Release->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+
+ Value *Args[] = { Load->getPointerOperand(), New };
+ if (Args[0]->getType() != I8XX)
+ Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
+ if (Args[1]->getType() != I8X)
+ Args[1] = new BitCastInst(Args[1], I8X, "", Store);
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
+ CallInst *StoreStrong = createCallInst(Decl, Args, "", Store, BlockColors);
+ StoreStrong->setDoesNotThrow();
+ StoreStrong->setDebugLoc(Store->getDebugLoc());
+
+ // We can't set the tail flag yet, because we haven't yet determined
+ // whether there are any escaping allocas. Remember this call, so that
+ // we can set the tail flag once we know it's safe.
+ StoreStrongCalls.insert(StoreStrong);
+
+ LLVM_DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong
+ << "\n");
+
+ if (&*Iter == Retain) ++Iter;
+ if (&*Iter == Store) ++Iter;
+ Store->eraseFromParent();
+ Release->eraseFromParent();
+ EraseInstruction(Retain);
+ if (Load->use_empty())
+ Load->eraseFromParent();
+}
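+
+// On success, the contraction above collapses the load/retain/release/store
+// sequence into a single runtime call, roughly (a sketch using the names from
+// the doc comment above):
+//
+//   call void @objc_storeStrong(i8** %old_ptr, i8* %new_value)
+//
+// plus any bitcasts needed to bring the operands to i8** / i8*.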
+
+bool ObjCARCContract::tryToPeepholeInstruction(
+ Function &F, Instruction *Inst, inst_iterator &Iter,
bool &TailOkForStoreStrongs,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- // Only these library routines return their argument. In particular,
- // objc_retainBlock does not necessarily return its argument.
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::FusedRetainAutorelease:
- case ARCInstKind::FusedRetainAutoreleaseRV:
- return false;
- case ARCInstKind::Autorelease:
- case ARCInstKind::AutoreleaseRV:
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ // Only these library routines return their argument. In particular,
+ // objc_retainBlock does not necessarily return its argument.
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV:
+ return false;
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
return contractAutorelease(F, Inst, Class);
- case ARCInstKind::Retain:
- // Attempt to convert retains to retainrvs if they are next to function
- // calls.
- if (!optimizeRetainCall(F, Inst))
- return false;
- // If we succeed in our optimization, fall through.
- LLVM_FALLTHROUGH;
- case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV: {
- // If we're compiling for a target which needs a special inline-asm
- // marker to do the return value optimization, insert it now.
- if (!RVInstMarker)
- return false;
- BasicBlock::iterator BBI = Inst->getIterator();
- BasicBlock *InstParent = Inst->getParent();
-
- // Step up to see if the call immediately precedes the RV call.
- // If it's an invoke, we have to cross a block boundary. And we have
- // to carefully dodge no-op instructions.
- do {
- if (BBI == InstParent->begin()) {
- BasicBlock *Pred = InstParent->getSinglePredecessor();
- if (!Pred)
- goto decline_rv_optimization;
- BBI = Pred->getTerminator()->getIterator();
- break;
- }
- --BBI;
- } while (IsNoopInstruction(&*BBI));
-
+ case ARCInstKind::Retain:
+ // Attempt to convert retains to retainrvs if they are next to function
+ // calls.
+ if (!optimizeRetainCall(F, Inst))
+ return false;
+ // If we succeed in our optimization, fall through.
+ LLVM_FALLTHROUGH;
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV: {
+ // If we're compiling for a target which needs a special inline-asm
+ // marker to do the return value optimization, insert it now.
+ if (!RVInstMarker)
+ return false;
+ BasicBlock::iterator BBI = Inst->getIterator();
+ BasicBlock *InstParent = Inst->getParent();
+
+ // Step up to see if the call immediately precedes the RV call.
+ // If it's an invoke, we have to cross a block boundary. And we have
+ // to carefully dodge no-op instructions.
+ do {
+ if (BBI == InstParent->begin()) {
+ BasicBlock *Pred = InstParent->getSinglePredecessor();
+ if (!Pred)
+ goto decline_rv_optimization;
+ BBI = Pred->getTerminator()->getIterator();
+ break;
+ }
+ --BBI;
+ } while (IsNoopInstruction(&*BBI));
+
if (GetRCIdentityRoot(&*BBI) == GetArgRCIdentityRoot(Inst)) {
- LLVM_DEBUG(dbgs() << "Adding inline asm marker for the return value "
- "optimization.\n");
- Changed = true;
- InlineAsm *IA =
- InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
- /*isVarArg=*/false),
- RVInstMarker->getString(),
- /*Constraints=*/"", /*hasSideEffects=*/true);
-
- createCallInst(IA, None, "", Inst, BlockColors);
- }
- decline_rv_optimization:
- return false;
- }
- case ARCInstKind::InitWeak: {
- // objc_initWeak(p, null) => *p = null
- CallInst *CI = cast<CallInst>(Inst);
- if (IsNullOrUndef(CI->getArgOperand(1))) {
- Value *Null = ConstantPointerNull::get(cast<PointerType>(CI->getType()));
- Changed = true;
- new StoreInst(Null, CI->getArgOperand(0), CI);
-
- LLVM_DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
- << " New = " << *Null << "\n");
-
- CI->replaceAllUsesWith(Null);
- CI->eraseFromParent();
- }
- return true;
- }
- case ARCInstKind::Release:
- // Try to form an objc store strong from our release. If we fail, there is
- // nothing further to do below, so continue.
- tryToContractReleaseIntoStoreStrong(Inst, Iter, BlockColors);
- return true;
- case ARCInstKind::User:
- // Be conservative if the function has any alloca instructions.
- // Technically we only care about escaping alloca instructions,
- // but this is sufficient to handle some interesting cases.
- if (isa<AllocaInst>(Inst))
- TailOkForStoreStrongs = false;
- return true;
- case ARCInstKind::IntrinsicUser:
- // Remove calls to @llvm.objc.clang.arc.use(...).
- Changed = true;
- Inst->eraseFromParent();
- return true;
- default:
- return true;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Top Level Driver
-//===----------------------------------------------------------------------===//
-
+ LLVM_DEBUG(dbgs() << "Adding inline asm marker for the return value "
+ "optimization.\n");
+ Changed = true;
+ InlineAsm *IA =
+ InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
+ /*isVarArg=*/false),
+ RVInstMarker->getString(),
+ /*Constraints=*/"", /*hasSideEffects=*/true);
+
+ createCallInst(IA, None, "", Inst, BlockColors);
+ }
+ decline_rv_optimization:
+ return false;
+ }
+ case ARCInstKind::InitWeak: {
+ // objc_initWeak(p, null) => *p = null
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(1))) {
+ Value *Null = ConstantPointerNull::get(cast<PointerType>(CI->getType()));
+ Changed = true;
+ new StoreInst(Null, CI->getArgOperand(0), CI);
+
+ LLVM_DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
+ << " New = " << *Null << "\n");
+
+ CI->replaceAllUsesWith(Null);
+ CI->eraseFromParent();
+ }
+ return true;
+ }
+ case ARCInstKind::Release:
+ // Try to form an objc store strong from our release. If we fail, there is
+ // nothing further to do below, so continue.
+ tryToContractReleaseIntoStoreStrong(Inst, Iter, BlockColors);
+ return true;
+ case ARCInstKind::User:
+ // Be conservative if the function has any alloca instructions.
+ // Technically we only care about escaping alloca instructions,
+ // but this is sufficient to handle some interesting cases.
+ if (isa<AllocaInst>(Inst))
+ TailOkForStoreStrongs = false;
+ return true;
+ case ARCInstKind::IntrinsicUser:
+ // Remove calls to @llvm.objc.clang.arc.use(...).
+ Changed = true;
+ Inst->eraseFromParent();
+ return true;
+ default:
+ return true;
+ }
+}
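+
+// A sketch of the RetainRV marker case above (hypothetical IR, using the
+// llvm.objc.* naming seen elsewhere in this pass): given
+//
+//   %r = call i8* @foo()
+//   %v = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %r)
+//
+// the peephole inserts the target's RV marker as a side-effecting inline-asm
+// call immediately before the retainRV, to enable the return-value
+// optimization described in the comment above.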
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
bool ObjCARCContract::init(Module &M) {
// If nothing in the Module uses ARC, don't do anything.
Run = ModuleHasARC(M);
@@ -549,212 +549,212 @@ bool ObjCARCContract::init(Module &M) {
}
bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- Changed = false;
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
AA = A;
DT = D;
PA.setAA(A);
-
- DenseMap<BasicBlock *, ColorVector> BlockColors;
- if (F.hasPersonalityFn() &&
- isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- BlockColors = colorEHFunclets(F);
-
- LLVM_DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
-
- // Track whether it's ok to mark objc_storeStrong calls with the "tail"
- // keyword. Be conservative if the function has variadic arguments.
- // It seems that functions which "return twice" are also unsafe for the
- // "tail" argument, because they are setjmp, which could need to
- // return to an earlier stack state.
- bool TailOkForStoreStrongs =
- !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
-
- // For ObjC library calls which return their argument, replace uses of the
- // argument with uses of the call return value, if it dominates the use. This
- // reduces register pressure.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
- Instruction *Inst = &*I++;
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
-
- // First try to peephole Inst. If there is nothing further we can do in
- // terms of undoing objc-arc-expand, process the next inst.
+
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ LLVM_DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
+
+  // Track whether it's ok to mark objc_storeStrong calls with the "tail"
+  // keyword. Be conservative if the function has variadic arguments or calls
+  // a function that "returns twice" (such as setjmp), since such a call may
+  // need to return to an earlier stack state.
+ bool TailOkForStoreStrongs =
+ !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
+
+ // For ObjC library calls which return their argument, replace uses of the
+ // argument with uses of the call return value, if it dominates the use. This
+ // reduces register pressure.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
+ Instruction *Inst = &*I++;
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+
+ // First try to peephole Inst. If there is nothing further we can do in
+ // terms of undoing objc-arc-expand, process the next inst.
if (tryToPeepholeInstruction(F, Inst, I, TailOkForStoreStrongs,
BlockColors))
- continue;
-
- // Otherwise, try to undo objc-arc-expand.
-
- // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts
- // and such; to do the replacement, the argument must have type i8*.
-
- // Function for replacing uses of Arg dominated by Inst.
- auto ReplaceArgUses = [Inst, this](Value *Arg) {
- // If we're compiling bugpointed code, don't get in trouble.
- if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
- return;
-
- // Look through the uses of the pointer.
- for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
- UI != UE; ) {
- // Increment UI now, because we may unlink its element.
- Use &U = *UI++;
- unsigned OperandNo = U.getOperandNo();
-
- // If the call's return value dominates a use of the call's argument
- // value, rewrite the use to use the return value. We check for
- // reachability here because an unreachable call is considered to
- // trivially dominate itself, which would lead us to rewriting its
- // argument in terms of its return value, which would lead to
- // infinite loops in GetArgRCIdentityRoot.
- if (!DT->isReachableFromEntry(U) || !DT->dominates(Inst, U))
- continue;
-
- Changed = true;
- Instruction *Replacement = Inst;
- Type *UseTy = U.get()->getType();
- if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
- // For PHI nodes, insert the bitcast in the predecessor block.
- unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
- BasicBlock *IncomingBB = PHI->getIncomingBlock(ValNo);
- if (Replacement->getType() != UseTy) {
- // A catchswitch is both a pad and a terminator, meaning a basic
- // block with a catchswitch has no insertion point. Keep going up
- // the dominator tree until we find a non-catchswitch.
- BasicBlock *InsertBB = IncomingBB;
- while (isa<CatchSwitchInst>(InsertBB->getFirstNonPHI())) {
- InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock();
- }
-
- assert(DT->dominates(Inst, &InsertBB->back()) &&
- "Invalid insertion point for bitcast");
- Replacement =
- new BitCastInst(Replacement, UseTy, "", &InsertBB->back());
- }
-
- // While we're here, rewrite all edges for this PHI, rather
- // than just one use at a time, to minimize the number of
- // bitcasts we emit.
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
- if (PHI->getIncomingBlock(i) == IncomingBB) {
- // Keep the UI iterator valid.
- if (UI != UE &&
- &PHI->getOperandUse(
- PHINode::getOperandNumForIncomingValue(i)) == &*UI)
- ++UI;
- PHI->setIncomingValue(i, Replacement);
- }
- } else {
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- cast<Instruction>(U.getUser()));
- U.set(Replacement);
- }
- }
- };
-
- Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
- Value *OrigArg = Arg;
-
- // TODO: Change this to a do-while.
- for (;;) {
- ReplaceArgUses(Arg);
-
- // If Arg is a no-op casted pointer, strip one level of casts and iterate.
- if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
- Arg = BI->getOperand(0);
- else if (isa<GEPOperator>(Arg) &&
- cast<GEPOperator>(Arg)->hasAllZeroIndices())
- Arg = cast<GEPOperator>(Arg)->getPointerOperand();
- else if (isa<GlobalAlias>(Arg) &&
- !cast<GlobalAlias>(Arg)->isInterposable())
- Arg = cast<GlobalAlias>(Arg)->getAliasee();
- else {
- // If Arg is a PHI node, get PHIs that are equivalent to it and replace
- // their uses.
- if (PHINode *PN = dyn_cast<PHINode>(Arg)) {
- SmallVector<Value *, 1> PHIList;
- getEquivalentPHIs(*PN, PHIList);
- for (Value *PHI : PHIList)
- ReplaceArgUses(PHI);
- }
- break;
- }
- }
-
- // Replace bitcast users of Arg that are dominated by Inst.
- SmallVector<BitCastInst *, 2> BitCastUsers;
-
- // Add all bitcast users of the function argument first.
- for (User *U : OrigArg->users())
- if (auto *BC = dyn_cast<BitCastInst>(U))
- BitCastUsers.push_back(BC);
-
- // Replace the bitcasts with the call return. Iterate until list is empty.
- while (!BitCastUsers.empty()) {
- auto *BC = BitCastUsers.pop_back_val();
- for (User *U : BC->users())
- if (auto *B = dyn_cast<BitCastInst>(U))
- BitCastUsers.push_back(B);
-
- ReplaceArgUses(BC);
- }
- }
-
- // If this function has no escaping allocas or suspicious vararg usage,
- // objc_storeStrong calls can be marked with the "tail" keyword.
- if (TailOkForStoreStrongs)
- for (CallInst *CI : StoreStrongCalls)
- CI->setTailCall();
- StoreStrongCalls.clear();
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// Misc Pass Manager
-//===----------------------------------------------------------------------===//
-
+ continue;
+
+ // Otherwise, try to undo objc-arc-expand.
+
+ // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts
+ // and such; to do the replacement, the argument must have type i8*.
+
+ // Function for replacing uses of Arg dominated by Inst.
+ auto ReplaceArgUses = [Inst, this](Value *Arg) {
+ // If we're compiling bugpointed code, don't get in trouble.
+ if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
+ return;
+
+ // Look through the uses of the pointer.
+ for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+ UI != UE; ) {
+ // Increment UI now, because we may unlink its element.
+ Use &U = *UI++;
+ unsigned OperandNo = U.getOperandNo();
+
+ // If the call's return value dominates a use of the call's argument
+ // value, rewrite the use to use the return value. We check for
+ // reachability here because an unreachable call is considered to
+ // trivially dominate itself, which would lead us to rewriting its
+ // argument in terms of its return value, which would lead to
+ // infinite loops in GetArgRCIdentityRoot.
+ if (!DT->isReachableFromEntry(U) || !DT->dominates(Inst, U))
+ continue;
+
+ Changed = true;
+ Instruction *Replacement = Inst;
+ Type *UseTy = U.get()->getType();
+ if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
+ // For PHI nodes, insert the bitcast in the predecessor block.
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(ValNo);
+ if (Replacement->getType() != UseTy) {
+ // A catchswitch is both a pad and a terminator, meaning a basic
+ // block with a catchswitch has no insertion point. Keep going up
+ // the dominator tree until we find a non-catchswitch.
+ BasicBlock *InsertBB = IncomingBB;
+ while (isa<CatchSwitchInst>(InsertBB->getFirstNonPHI())) {
+ InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock();
+ }
+
+ assert(DT->dominates(Inst, &InsertBB->back()) &&
+ "Invalid insertion point for bitcast");
+ Replacement =
+ new BitCastInst(Replacement, UseTy, "", &InsertBB->back());
+ }
+
+ // While we're here, rewrite all edges for this PHI, rather
+ // than just one use at a time, to minimize the number of
+ // bitcasts we emit.
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ // Keep the UI iterator valid.
+ if (UI != UE &&
+ &PHI->getOperandUse(
+ PHINode::getOperandNumForIncomingValue(i)) == &*UI)
+ ++UI;
+ PHI->setIncomingValue(i, Replacement);
+ }
+ } else {
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ cast<Instruction>(U.getUser()));
+ U.set(Replacement);
+ }
+ }
+ };
+
+ Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+ Value *OrigArg = Arg;
+
+ // TODO: Change this to a do-while.
+ for (;;) {
+ ReplaceArgUses(Arg);
+
+ // If Arg is a no-op casted pointer, strip one level of casts and iterate.
+ if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
+ Arg = BI->getOperand(0);
+ else if (isa<GEPOperator>(Arg) &&
+ cast<GEPOperator>(Arg)->hasAllZeroIndices())
+ Arg = cast<GEPOperator>(Arg)->getPointerOperand();
+ else if (isa<GlobalAlias>(Arg) &&
+ !cast<GlobalAlias>(Arg)->isInterposable())
+ Arg = cast<GlobalAlias>(Arg)->getAliasee();
+ else {
+ // If Arg is a PHI node, get PHIs that are equivalent to it and replace
+ // their uses.
+ if (PHINode *PN = dyn_cast<PHINode>(Arg)) {
+ SmallVector<Value *, 1> PHIList;
+ getEquivalentPHIs(*PN, PHIList);
+ for (Value *PHI : PHIList)
+ ReplaceArgUses(PHI);
+ }
+ break;
+ }
+ }
+
+ // Replace bitcast users of Arg that are dominated by Inst.
+ SmallVector<BitCastInst *, 2> BitCastUsers;
+
+ // Add all bitcast users of the function argument first.
+ for (User *U : OrigArg->users())
+ if (auto *BC = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(BC);
+
+ // Replace the bitcasts with the call return. Iterate until list is empty.
+ while (!BitCastUsers.empty()) {
+ auto *BC = BitCastUsers.pop_back_val();
+ for (User *U : BC->users())
+ if (auto *B = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(B);
+
+ ReplaceArgUses(BC);
+ }
+ }
+
+ // If this function has no escaping allocas or suspicious vararg usage,
+ // objc_storeStrong calls can be marked with the "tail" keyword.
+ if (TailOkForStoreStrongs)
+ for (CallInst *CI : StoreStrongCalls)
+ CI->setTailCall();
+ StoreStrongCalls.clear();
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc Pass Manager
+//===----------------------------------------------------------------------===//
+
char ObjCARCContractLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ObjCARCContractLegacyPass, "objc-arc-contract",
- "ObjC ARC contraction", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ "ObjC ARC contraction", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(ObjCARCContractLegacyPass, "objc-arc-contract",
- "ObjC ARC contraction", false, false)
-
+ "ObjC ARC contraction", false, false)
+
void ObjCARCContractLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.setPreservesCFG();
-}
-
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+}
+
Pass *llvm::createObjCARCContractPass() {
return new ObjCARCContractLegacyPass();
}
-
+
bool ObjCARCContractLegacyPass::doInitialization(Module &M) {
return OCARCC.init(M);
}
-
+
bool ObjCARCContractLegacyPass::runOnFunction(Function &F) {
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
return OCARCC.run(F, AA, DT);
}
-
+
PreservedAnalyses ObjCARCContractPass::run(Function &F,
FunctionAnalysisManager &AM) {
ObjCARCContract OCAC;
OCAC.init(*F.getParent());
-
+
bool Changed = OCAC.run(F, &AM.getResult<AAManager>(F),
&AM.getResult<DominatorTreeAnalysis>(F));
if (Changed) {
@@ -763,4 +763,4 @@ PreservedAnalyses ObjCARCContractPass::run(Function &F,
return PA;
}
return PreservedAnalyses::all();
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index 1f757198fe..d2121dcebe 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -1,95 +1,95 @@
-//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file deals with early optimizations which perform certain
-/// cleanup operations.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "ObjCARC.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
+//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file deals with early optimizations which perform certain
+/// cleanup operations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-
-#define DEBUG_TYPE "objc-arc-expand"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-namespace {
+
+#define DEBUG_TYPE "objc-arc-expand"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
static bool runImpl(Function &F) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
if (!ModuleHasARC(*F.getParent()))
- return false;
-
- bool Changed = false;
-
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName()
- << "\n");
-
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
- Instruction *Inst = &*I;
-
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
-
- switch (GetBasicARCInstKind(Inst)) {
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV:
- case ARCInstKind::Autorelease:
- case ARCInstKind::AutoreleaseRV:
- case ARCInstKind::FusedRetainAutorelease:
- case ARCInstKind::FusedRetainAutoreleaseRV: {
- // These calls return their argument verbatim, as a low-level
- // optimization. However, this makes high-level optimizations
- // harder. Undo any uses of this optimization that the front-end
- // emitted here. We'll redo them in the contract pass.
- Changed = true;
- Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst
- << "\n"
- " New = "
- << *Value << "\n");
- Inst->replaceAllUsesWith(Value);
- break;
- }
- default:
- break;
- }
- }
-
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
-
- return Changed;
-}
+ return false;
+
+ bool Changed = false;
+
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName()
+ << "\n");
+
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+ Instruction *Inst = &*I;
+
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
+
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV: {
+ // These calls return their argument verbatim, as a low-level
+ // optimization. However, this makes high-level optimizations
+ // harder. Undo any uses of this optimization that the front-end
+ // emitted here. We'll redo them in the contract pass.
+ Changed = true;
+ Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst
+ << "\n"
+ " New = "
+ << *Value << "\n");
+ Inst->replaceAllUsesWith(Value);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
+
+ return Changed;
+}
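+
+// For illustration (a sketch): given
+//   %y = tail call i8* @llvm.objc.retain(i8* %x)
+// the loop above rewrites every use of %y to use %x directly; the retain call
+// itself is left in place for the contract pass to deal with later.
+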
/// Early ARC transformations.
class ObjCARCExpand : public FunctionPass {
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 823d3fad2b..1c44749951 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -1,519 +1,519 @@
-//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// The optimizations performed include elimination of redundant, partially
-/// redundant, and inconsequential reference count operations, elimination of
-/// redundant weak pointer operations, and numerous minor simplifications.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARCRuntimeEntryPoints.h"
-#include "BlotMapVector.h"
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "ProvenanceAnalysis.h"
-#include "PtrState.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// The optimizations performed include elimination of redundant, partially
+/// redundant, and inconsequential reference count operations, elimination of
+/// redundant weak pointer operations, and numerous minor simplifications.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARCRuntimeEntryPoints.h"
+#include "BlotMapVector.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "PtrState.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-#include <cassert>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-opts"
-
-static cl::opt<unsigned> MaxPtrStates("arc-opt-max-ptr-states",
- cl::Hidden,
- cl::desc("Maximum number of ptr states the optimizer keeps track of"),
- cl::init(4095));
-
-/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
-/// @{
-
-/// This is similar to GetRCIdentityRoot but it stops as soon
-/// as it finds a value with multiple uses.
-static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
- // ConstantData (like ConstantPointerNull and UndefValue) is used across
- // modules. It's never a single-use value.
- if (isa<ConstantData>(Arg))
- return nullptr;
-
- if (Arg->hasOneUse()) {
- if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
- return FindSingleUseIdentifiedObject(BC->getOperand(0));
- if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
- if (GEP->hasAllZeroIndices())
- return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
- if (IsForwarding(GetBasicARCInstKind(Arg)))
- return FindSingleUseIdentifiedObject(
- cast<CallInst>(Arg)->getArgOperand(0));
- if (!IsObjCIdentifiedObject(Arg))
- return nullptr;
- return Arg;
- }
-
- // If we found an identifiable object but it has multiple uses, but they are
- // trivial uses, we can still consider this to be a single-use value.
- if (IsObjCIdentifiedObject(Arg)) {
- for (const User *U : Arg->users())
- if (!U->use_empty() || GetRCIdentityRoot(U) != Arg)
- return nullptr;
-
- return Arg;
- }
-
- return nullptr;
-}
-
-/// @}
-///
-/// \defgroup ARCOpt ARC Optimization.
-/// @{
-
-// TODO: On code like this:
-//
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-// stuff_that_cannot_release()
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-//
-// The second retain and autorelease can be deleted.
-
-// TODO: It should be possible to delete
-// objc_autoreleasePoolPush and objc_autoreleasePoolPop
-// pairs if nothing is actually autoreleased between them. Also, autorelease
-// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
-// after inlining) can be turned into plain release calls.
-
-// TODO: Critical-edge splitting. If the optimial insertion point is
-// a critical edge, the current algorithm has to fail, because it doesn't
-// know how to split edges. It should be possible to make the optimizer
-// think in terms of edges, rather than blocks, and then split critical
-// edges on demand.
-
-// TODO: OptimizeSequences could generalized to be Interprocedural.
-
-// TODO: Recognize that a bunch of other objc runtime calls have
-// non-escaping arguments and non-releasing arguments, and may be
-// non-autoreleasing.
-
-// TODO: Sink autorelease calls as far as possible. Unfortunately we
-// usually can't sink them past other calls, which would be the main
-// case where it would be useful.
-
-// TODO: The pointer returned from objc_loadWeakRetained is retained.
-
-// TODO: Delete release+retain pairs (rare).
-
-STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
-STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
-STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
-STATISTIC(NumRets, "Number of return value forwarding "
- "retain+autoreleases eliminated");
-STATISTIC(NumRRs, "Number of retain+release paths eliminated");
-STATISTIC(NumPeeps, "Number of calls peephole-optimized");
-#ifndef NDEBUG
-STATISTIC(NumRetainsBeforeOpt,
- "Number of retains before optimization");
-STATISTIC(NumReleasesBeforeOpt,
- "Number of releases before optimization");
-STATISTIC(NumRetainsAfterOpt,
- "Number of retains after optimization");
-STATISTIC(NumReleasesAfterOpt,
- "Number of releases after optimization");
-#endif
-
-namespace {
-
- /// Per-BasicBlock state.
- class BBState {
- /// The number of unique control paths from the entry which can reach this
- /// block.
- unsigned TopDownPathCount = 0;
-
- /// The number of unique control paths to exits from this block.
- unsigned BottomUpPathCount = 0;
-
- /// The top-down traversal uses this to record information known about a
- /// pointer at the bottom of each block.
- BlotMapVector<const Value *, TopDownPtrState> PerPtrTopDown;
-
- /// The bottom-up traversal uses this to record information known about a
- /// pointer at the top of each block.
- BlotMapVector<const Value *, BottomUpPtrState> PerPtrBottomUp;
-
- /// Effective predecessors of the current block ignoring ignorable edges and
- /// ignored backedges.
- SmallVector<BasicBlock *, 2> Preds;
-
- /// Effective successors of the current block ignoring ignorable edges and
- /// ignored backedges.
- SmallVector<BasicBlock *, 2> Succs;
-
- public:
- static const unsigned OverflowOccurredValue;
-
- BBState() = default;
-
- using top_down_ptr_iterator = decltype(PerPtrTopDown)::iterator;
- using const_top_down_ptr_iterator = decltype(PerPtrTopDown)::const_iterator;
-
- top_down_ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
- top_down_ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
- const_top_down_ptr_iterator top_down_ptr_begin() const {
- return PerPtrTopDown.begin();
- }
- const_top_down_ptr_iterator top_down_ptr_end() const {
- return PerPtrTopDown.end();
- }
- bool hasTopDownPtrs() const {
- return !PerPtrTopDown.empty();
- }
-
- unsigned top_down_ptr_list_size() const {
- return std::distance(top_down_ptr_begin(), top_down_ptr_end());
- }
-
- using bottom_up_ptr_iterator = decltype(PerPtrBottomUp)::iterator;
- using const_bottom_up_ptr_iterator =
- decltype(PerPtrBottomUp)::const_iterator;
-
- bottom_up_ptr_iterator bottom_up_ptr_begin() {
- return PerPtrBottomUp.begin();
- }
- bottom_up_ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
- const_bottom_up_ptr_iterator bottom_up_ptr_begin() const {
- return PerPtrBottomUp.begin();
- }
- const_bottom_up_ptr_iterator bottom_up_ptr_end() const {
- return PerPtrBottomUp.end();
- }
- bool hasBottomUpPtrs() const {
- return !PerPtrBottomUp.empty();
- }
-
- unsigned bottom_up_ptr_list_size() const {
- return std::distance(bottom_up_ptr_begin(), bottom_up_ptr_end());
- }
-
- /// Mark this block as being an entry block, which has one path from the
- /// entry by definition.
- void SetAsEntry() { TopDownPathCount = 1; }
-
- /// Mark this block as being an exit block, which has one path to an exit by
- /// definition.
- void SetAsExit() { BottomUpPathCount = 1; }
-
- /// Attempt to find the PtrState object describing the top down state for
- /// pointer Arg. Return a new initialized PtrState describing the top down
- /// state for Arg if we do not find one.
- TopDownPtrState &getPtrTopDownState(const Value *Arg) {
- return PerPtrTopDown[Arg];
- }
-
- /// Attempt to find the PtrState object describing the bottom up state for
- /// pointer Arg. Return a new initialized PtrState describing the bottom up
- /// state for Arg if we do not find one.
- BottomUpPtrState &getPtrBottomUpState(const Value *Arg) {
- return PerPtrBottomUp[Arg];
- }
-
- /// Attempt to find the PtrState object describing the bottom up state for
- /// pointer Arg.
- bottom_up_ptr_iterator findPtrBottomUpState(const Value *Arg) {
- return PerPtrBottomUp.find(Arg);
- }
-
- void clearBottomUpPointers() {
- PerPtrBottomUp.clear();
- }
-
- void clearTopDownPointers() {
- PerPtrTopDown.clear();
- }
-
- void InitFromPred(const BBState &Other);
- void InitFromSucc(const BBState &Other);
- void MergePred(const BBState &Other);
- void MergeSucc(const BBState &Other);
-
- /// Compute the number of possible unique paths from an entry to an exit
- /// which pass through this block. This is only valid after both the
- /// top-down and bottom-up traversals are complete.
- ///
- /// Returns true if overflow occurred. Returns false if overflow did not
- /// occur.
- bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
- if (TopDownPathCount == OverflowOccurredValue ||
- BottomUpPathCount == OverflowOccurredValue)
- return true;
- unsigned long long Product =
- (unsigned long long)TopDownPathCount*BottomUpPathCount;
- // Overflow occurred if any of the upper bits of Product are set or if all
- // the lower bits of Product are all set.
- return (Product >> 32) ||
- ((PathCount = Product) == OverflowOccurredValue);
- }
-
- // Specialized CFG utilities.
- using edge_iterator = SmallVectorImpl<BasicBlock *>::const_iterator;
-
- edge_iterator pred_begin() const { return Preds.begin(); }
- edge_iterator pred_end() const { return Preds.end(); }
- edge_iterator succ_begin() const { return Succs.begin(); }
- edge_iterator succ_end() const { return Succs.end(); }
-
- void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
- void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
-
- bool isExit() const { return Succs.empty(); }
- };
-
-} // end anonymous namespace
-
-const unsigned BBState::OverflowOccurredValue = 0xffffffff;
-
-namespace llvm {
-
-raw_ostream &operator<<(raw_ostream &OS,
- BBState &BBState) LLVM_ATTRIBUTE_UNUSED;
-
-} // end namespace llvm
-
-void BBState::InitFromPred(const BBState &Other) {
- PerPtrTopDown = Other.PerPtrTopDown;
- TopDownPathCount = Other.TopDownPathCount;
-}
-
-void BBState::InitFromSucc(const BBState &Other) {
- PerPtrBottomUp = Other.PerPtrBottomUp;
- BottomUpPathCount = Other.BottomUpPathCount;
-}
-
-/// The top-down traversal uses this to merge information about predecessors to
-/// form the initial state for a new block.
-void BBState::MergePred(const BBState &Other) {
- if (TopDownPathCount == OverflowOccurredValue)
- return;
-
- // Other.TopDownPathCount can be 0, in which case it is either dead or a
- // loop backedge. Loop backedges are special.
- TopDownPathCount += Other.TopDownPathCount;
-
- // In order to be consistent, we clear the top down pointers when by adding
- // TopDownPathCount becomes OverflowOccurredValue even though "true" overflow
- // has not occurred.
- if (TopDownPathCount == OverflowOccurredValue) {
- clearTopDownPointers();
- return;
- }
-
- // Check for overflow. If we have overflow, fall back to conservative
- // behavior.
- if (TopDownPathCount < Other.TopDownPathCount) {
- TopDownPathCount = OverflowOccurredValue;
- clearTopDownPointers();
- return;
- }
-
- // For each entry in the other set, if our set has an entry with the same key,
- // merge the entries. Otherwise, copy the entry and merge it with an empty
- // entry.
- for (auto MI = Other.top_down_ptr_begin(), ME = Other.top_down_ptr_end();
- MI != ME; ++MI) {
- auto Pair = PerPtrTopDown.insert(*MI);
- Pair.first->second.Merge(Pair.second ? TopDownPtrState() : MI->second,
- /*TopDown=*/true);
- }
-
- // For each entry in our set, if the other set doesn't have an entry with the
- // same key, force it to merge with an empty entry.
- for (auto MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI)
- if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
- MI->second.Merge(TopDownPtrState(), /*TopDown=*/true);
-}
-
-/// The bottom-up traversal uses this to merge information about successors to
-/// form the initial state for a new block.
-void BBState::MergeSucc(const BBState &Other) {
- if (BottomUpPathCount == OverflowOccurredValue)
- return;
-
- // Other.BottomUpPathCount can be 0, in which case it is either dead or a
- // loop backedge. Loop backedges are special.
- BottomUpPathCount += Other.BottomUpPathCount;
-
- // In order to be consistent, we clear the top down pointers when by adding
- // BottomUpPathCount becomes OverflowOccurredValue even though "true" overflow
- // has not occurred.
- if (BottomUpPathCount == OverflowOccurredValue) {
- clearBottomUpPointers();
- return;
- }
-
- // Check for overflow. If we have overflow, fall back to conservative
- // behavior.
- if (BottomUpPathCount < Other.BottomUpPathCount) {
- BottomUpPathCount = OverflowOccurredValue;
- clearBottomUpPointers();
- return;
- }
-
- // For each entry in the other set, if our set has an entry with the
- // same key, merge the entries. Otherwise, copy the entry and merge
- // it with an empty entry.
- for (auto MI = Other.bottom_up_ptr_begin(), ME = Other.bottom_up_ptr_end();
- MI != ME; ++MI) {
- auto Pair = PerPtrBottomUp.insert(*MI);
- Pair.first->second.Merge(Pair.second ? BottomUpPtrState() : MI->second,
- /*TopDown=*/false);
- }
-
- // For each entry in our set, if the other set doesn't have an entry
- // with the same key, force it to merge with an empty entry.
- for (auto MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME;
- ++MI)
- if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
- MI->second.Merge(BottomUpPtrState(), /*TopDown=*/false);
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
- // Dump the pointers we are tracking.
- OS << " TopDown State:\n";
- if (!BBInfo.hasTopDownPtrs()) {
- LLVM_DEBUG(dbgs() << " NONE!\n");
- } else {
- for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
- I != E; ++I) {
- const PtrState &P = I->second;
- OS << " Ptr: " << *I->first
- << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
- << "\n ImpreciseRelease: "
- << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
- << " HasCFGHazards: "
- << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
- << " KnownPositive: "
- << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
- << " Seq: "
- << P.GetSeq() << "\n";
- }
- }
-
- OS << " BottomUp State:\n";
- if (!BBInfo.hasBottomUpPtrs()) {
- LLVM_DEBUG(dbgs() << " NONE!\n");
- } else {
- for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
- I != E; ++I) {
- const PtrState &P = I->second;
- OS << " Ptr: " << *I->first
- << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
- << "\n ImpreciseRelease: "
- << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
- << " HasCFGHazards: "
- << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
- << " KnownPositive: "
- << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
- << " Seq: "
- << P.GetSeq() << "\n";
- }
- }
-
- return OS;
-}
-
-namespace {
-
- /// The main ARC optimization pass.
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-opts"
+
+static cl::opt<unsigned> MaxPtrStates("arc-opt-max-ptr-states",
+ cl::Hidden,
+ cl::desc("Maximum number of ptr states the optimizer keeps track of"),
+ cl::init(4095));
+
+/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
+/// @{
+
+/// This is similar to GetRCIdentityRoot but it stops as soon
+/// as it finds a value with multiple uses.
+static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
+ // ConstantData (like ConstantPointerNull and UndefValue) is used across
+ // modules. It's never a single-use value.
+ if (isa<ConstantData>(Arg))
+ return nullptr;
+
+ if (Arg->hasOneUse()) {
+ if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
+ return FindSingleUseIdentifiedObject(BC->getOperand(0));
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
+ if (GEP->hasAllZeroIndices())
+ return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
+ if (IsForwarding(GetBasicARCInstKind(Arg)))
+ return FindSingleUseIdentifiedObject(
+ cast<CallInst>(Arg)->getArgOperand(0));
+ if (!IsObjCIdentifiedObject(Arg))
+ return nullptr;
+ return Arg;
+ }
+
+  // If we found an identifiable object that has multiple uses, all of them
+  // trivial, we can still consider this to be a single-use value.
+ if (IsObjCIdentifiedObject(Arg)) {
+ for (const User *U : Arg->users())
+ if (!U->use_empty() || GetRCIdentityRoot(U) != Arg)
+ return nullptr;
+
+ return Arg;
+ }
+
+ return nullptr;
+}
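+
+// A rough example: a single-use bitcast of an ObjC-identified object is looked
+// through and the underlying object is returned; once a value whose uses are
+// not all trivial is reached, the walk gives up and returns nullptr.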
+
+/// @}
+///
+/// \defgroup ARCOpt ARC Optimization.
+/// @{
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
+
+STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
+STATISTIC(NumRets, "Number of return value forwarding "
+ "retain+autoreleases eliminated");
+STATISTIC(NumRRs, "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+#ifndef NDEBUG
+STATISTIC(NumRetainsBeforeOpt,
+ "Number of retains before optimization");
+STATISTIC(NumReleasesBeforeOpt,
+ "Number of releases before optimization");
+STATISTIC(NumRetainsAfterOpt,
+ "Number of retains after optimization");
+STATISTIC(NumReleasesAfterOpt,
+ "Number of releases after optimization");
+#endif
+
+namespace {
+
+ /// Per-BasicBlock state.
+ class BBState {
+ /// The number of unique control paths from the entry which can reach this
+ /// block.
+ unsigned TopDownPathCount = 0;
+
+ /// The number of unique control paths to exits from this block.
+ unsigned BottomUpPathCount = 0;
+
+ /// The top-down traversal uses this to record information known about a
+ /// pointer at the bottom of each block.
+ BlotMapVector<const Value *, TopDownPtrState> PerPtrTopDown;
+
+ /// The bottom-up traversal uses this to record information known about a
+ /// pointer at the top of each block.
+ BlotMapVector<const Value *, BottomUpPtrState> PerPtrBottomUp;
+
+ /// Effective predecessors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Preds;
+
+ /// Effective successors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Succs;
+
+ public:
+ static const unsigned OverflowOccurredValue;
+
+ BBState() = default;
+
+ using top_down_ptr_iterator = decltype(PerPtrTopDown)::iterator;
+ using const_top_down_ptr_iterator = decltype(PerPtrTopDown)::const_iterator;
+
+ top_down_ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
+ top_down_ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
+ const_top_down_ptr_iterator top_down_ptr_begin() const {
+ return PerPtrTopDown.begin();
+ }
+ const_top_down_ptr_iterator top_down_ptr_end() const {
+ return PerPtrTopDown.end();
+ }
+ bool hasTopDownPtrs() const {
+ return !PerPtrTopDown.empty();
+ }
+
+ unsigned top_down_ptr_list_size() const {
+ return std::distance(top_down_ptr_begin(), top_down_ptr_end());
+ }
+
+ using bottom_up_ptr_iterator = decltype(PerPtrBottomUp)::iterator;
+ using const_bottom_up_ptr_iterator =
+ decltype(PerPtrBottomUp)::const_iterator;
+
+ bottom_up_ptr_iterator bottom_up_ptr_begin() {
+ return PerPtrBottomUp.begin();
+ }
+ bottom_up_ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
+ const_bottom_up_ptr_iterator bottom_up_ptr_begin() const {
+ return PerPtrBottomUp.begin();
+ }
+ const_bottom_up_ptr_iterator bottom_up_ptr_end() const {
+ return PerPtrBottomUp.end();
+ }
+ bool hasBottomUpPtrs() const {
+ return !PerPtrBottomUp.empty();
+ }
+
+ unsigned bottom_up_ptr_list_size() const {
+ return std::distance(bottom_up_ptr_begin(), bottom_up_ptr_end());
+ }
+
+ /// Mark this block as being an entry block, which has one path from the
+ /// entry by definition.
+ void SetAsEntry() { TopDownPathCount = 1; }
+
+ /// Mark this block as being an exit block, which has one path to an exit by
+ /// definition.
+ void SetAsExit() { BottomUpPathCount = 1; }
+
+ /// Attempt to find the PtrState object describing the top down state for
+ /// pointer Arg. Return a new initialized PtrState describing the top down
+ /// state for Arg if we do not find one.
+ TopDownPtrState &getPtrTopDownState(const Value *Arg) {
+ return PerPtrTopDown[Arg];
+ }
+
+ /// Attempt to find the PtrState object describing the bottom up state for
+ /// pointer Arg. Return a new initialized PtrState describing the bottom up
+ /// state for Arg if we do not find one.
+ BottomUpPtrState &getPtrBottomUpState(const Value *Arg) {
+ return PerPtrBottomUp[Arg];
+ }
+
+ /// Attempt to find the PtrState object describing the bottom up state for
+ /// pointer Arg.
+ bottom_up_ptr_iterator findPtrBottomUpState(const Value *Arg) {
+ return PerPtrBottomUp.find(Arg);
+ }
+
+ void clearBottomUpPointers() {
+ PerPtrBottomUp.clear();
+ }
+
+ void clearTopDownPointers() {
+ PerPtrTopDown.clear();
+ }
+
+ void InitFromPred(const BBState &Other);
+ void InitFromSucc(const BBState &Other);
+ void MergePred(const BBState &Other);
+ void MergeSucc(const BBState &Other);
+
+ /// Compute the number of possible unique paths from an entry to an exit
+ /// which pass through this block. This is only valid after both the
+ /// top-down and bottom-up traversals are complete.
+ ///
+ /// Returns true if overflow occurred. Returns false if overflow did not
+ /// occur.
+ bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
+ if (TopDownPathCount == OverflowOccurredValue ||
+ BottomUpPathCount == OverflowOccurredValue)
+ return true;
+ unsigned long long Product =
+ (unsigned long long)TopDownPathCount*BottomUpPathCount;
+      // Overflow occurred if any of the upper bits of Product are set, or if
+      // the lower bits of Product are all set.
+ return (Product >> 32) ||
+ ((PathCount = Product) == OverflowOccurredValue);
+ }
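+
+    // Worked example (illustrative): TopDownPathCount = 3 and
+    // BottomUpPathCount = 4 give PathCount = 12. If the 64-bit product needs
+    // more than 32 bits, or happens to equal OverflowOccurredValue
+    // (0xffffffff), the function reports overflow instead of a usable count.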
+
+ // Specialized CFG utilities.
+ using edge_iterator = SmallVectorImpl<BasicBlock *>::const_iterator;
+
+ edge_iterator pred_begin() const { return Preds.begin(); }
+ edge_iterator pred_end() const { return Preds.end(); }
+ edge_iterator succ_begin() const { return Succs.begin(); }
+ edge_iterator succ_end() const { return Succs.end(); }
+
+ void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
+ void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
+
+ bool isExit() const { return Succs.empty(); }
+ };
+
+} // end anonymous namespace
+
+const unsigned BBState::OverflowOccurredValue = 0xffffffff;
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS,
+ BBState &BBState) LLVM_ATTRIBUTE_UNUSED;
+
+} // end namespace llvm
+
+void BBState::InitFromPred(const BBState &Other) {
+ PerPtrTopDown = Other.PerPtrTopDown;
+ TopDownPathCount = Other.TopDownPathCount;
+}
+
+void BBState::InitFromSucc(const BBState &Other) {
+ PerPtrBottomUp = Other.PerPtrBottomUp;
+ BottomUpPathCount = Other.BottomUpPathCount;
+}
+
+/// The top-down traversal uses this to merge information about predecessors to
+/// form the initial state for a new block.
+void BBState::MergePred(const BBState &Other) {
+ if (TopDownPathCount == OverflowOccurredValue)
+ return;
+
+ // Other.TopDownPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ TopDownPathCount += Other.TopDownPathCount;
+
+ // In order to be consistent, we clear the top down pointers when, as a result
+ // of this addition, TopDownPathCount becomes OverflowOccurredValue even
+ // though "true" overflow has not occurred.
+ if (TopDownPathCount == OverflowOccurredValue) {
+ clearTopDownPointers();
+ return;
+ }
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
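+ // Because the path counts are unsigned, wraparound shows up as the updated
+ // TopDownPathCount being smaller than the value that was just added to it.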
+ if (TopDownPathCount < Other.TopDownPathCount) {
+ TopDownPathCount = OverflowOccurredValue;
+ clearTopDownPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the same key,
+ // merge the entries. Otherwise, copy the entry and merge it with an empty
+ // entry.
+ for (auto MI = Other.top_down_ptr_begin(), ME = Other.top_down_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrTopDown.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? TopDownPtrState() : MI->second,
+ /*TopDown=*/true);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry with the
+ // same key, force it to merge with an empty entry.
+ for (auto MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI)
+ if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+ MI->second.Merge(TopDownPtrState(), /*TopDown=*/true);
+}
+
+/// The bottom-up traversal uses this to merge information about successors to
+/// form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+ if (BottomUpPathCount == OverflowOccurredValue)
+ return;
+
+ // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ BottomUpPathCount += Other.BottomUpPathCount;
+
+ // In order to be consistent, we clear the bottom up pointers when, as a
+ // result of this addition, BottomUpPathCount becomes OverflowOccurredValue
+ // even though "true" overflow has not occurred.
+ if (BottomUpPathCount == OverflowOccurredValue) {
+ clearBottomUpPointers();
+ return;
+ }
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
+ if (BottomUpPathCount < Other.BottomUpPathCount) {
+ BottomUpPathCount = OverflowOccurredValue;
+ clearBottomUpPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the
+ // same key, merge the entries. Otherwise, copy the entry and merge
+ // it with an empty entry.
+ for (auto MI = Other.bottom_up_ptr_begin(), ME = Other.bottom_up_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrBottomUp.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? BottomUpPtrState() : MI->second,
+ /*TopDown=*/false);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry
+ // with the same key, force it to merge with an empty entry.
+ for (auto MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME;
+ ++MI)
+ if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+ MI->second.Merge(BottomUpPtrState(), /*TopDown=*/false);
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
+ // Dump the pointers we are tracking.
+ OS << " TopDown State:\n";
+ if (!BBInfo.hasTopDownPtrs()) {
+ LLVM_DEBUG(dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
+ }
+ }
+
+ OS << " BottomUp State:\n";
+ if (!BBInfo.hasBottomUpPtrs()) {
+ LLVM_DEBUG(dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
+ }
+ }
+
+ return OS;
+}
+
+namespace {
+
+ /// The implementation of the main ARC optimization.
class ObjCARCOpt {
bool Changed;
ProvenanceAnalysis PA;
-
+
/// A cache of references to runtime entry point constants.
ARCRuntimeEntryPoints EP;
-
+
/// A cache of MDKinds that can be passed into other functions to propagate
/// MDKind identifiers.
ARCMDKindCache MDKindCache;
-
+
/// A flag indicating whether this optimization pass should run.
bool Run;
-
+
/// A flag indicating whether the optimization that removes or moves
/// retain/release pairs should be performed.
bool DisableRetainReleasePairing = false;
-
+
/// Flags which determine whether each of the interesting runtime functions
/// is in fact used in the current function.
unsigned UsedInThisFunction;
-
+
bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
ARCInstKind &Class);
void OptimizeIndividualCalls(Function &F);
-
+
/// Optimize an individual call, optionally passing the result of
/// GetArgRCIdentityRoot if it has already been computed.
void OptimizeIndividualCallImpl(
Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
Instruction *Inst, ARCInstKind Class, const Value *Arg);
-
+
/// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the
/// optimization occurs, returns true to indicate that the caller should
/// assume the instructions are dead.
@@ -521,7 +521,7 @@ class ObjCARCOpt {
Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
Instruction *Inst, const Value *&Arg, ARCInstKind Class,
Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg);
-
+
void CheckForCFGHazards(const BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
BBState &MyStates) const;
@@ -540,12 +540,12 @@ class ObjCARCOpt {
bool Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases);
-
+
void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases,
SmallVectorImpl<Instruction *> &DeadInsts, Module *M);
-
+
bool PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M,
@@ -554,27 +554,27 @@ class ObjCARCOpt {
RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
Value *Arg, bool KnownSafe,
bool &AnyPairsCompletelyEliminated);
-
+
bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M);
-
+
void OptimizeWeakCalls(Function &F);
-
+
bool OptimizeSequences(Function &F);
-
+
void OptimizeReturns(Function &F);
-
-#ifndef NDEBUG
+
+#ifndef NDEBUG
void GatherStatistics(Function &F, bool AfterOptimization = false);
-#endif
-
- public:
+#endif
+
+ public:
void init(Module &M);
bool run(Function &F, AAResults &AA);
void releaseMemory();
};
-
+
/// The main ARC optimization pass.
class ObjCARCOptLegacyPass : public FunctionPass {
public:
@@ -591,1876 +591,1876 @@ public:
}
void releaseMemory() override { OCAO.releaseMemory(); }
static char ID;
-
+
private:
ObjCARCOpt OCAO;
};
-} // end anonymous namespace
-
+} // end anonymous namespace
+
char ObjCARCOptLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(ObjCARCOptLegacyPass, "objc-arc", "ObjC ARC optimization",
false, false)
-INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass)
INITIALIZE_PASS_END(ObjCARCOptLegacyPass, "objc-arc", "ObjC ARC optimization",
false, false)
-
+
Pass *llvm::createObjCARCOptPass() { return new ObjCARCOptLegacyPass(); }
-
+
void ObjCARCOptLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<ObjCARCAAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- // ARC optimization doesn't currently split critical edges.
- AU.setPreservesCFG();
-}
-
-/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
-/// not a return value.
-bool
-ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
- // Check for the argument being from an immediately preceding call or invoke.
- const Value *Arg = GetArgRCIdentityRoot(RetainRV);
- if (const Instruction *Call = dyn_cast<CallBase>(Arg)) {
- if (Call->getParent() == RetainRV->getParent()) {
- BasicBlock::const_iterator I(Call);
- ++I;
- while (IsNoopInstruction(&*I))
- ++I;
- if (&*I == RetainRV)
- return false;
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- BasicBlock *RetainRVParent = RetainRV->getParent();
- if (II->getNormalDest() == RetainRVParent) {
- BasicBlock::const_iterator I = RetainRVParent->begin();
- while (IsNoopInstruction(&*I))
- ++I;
- if (&*I == RetainRV)
- return false;
- }
- }
- }
-
- // Turn it into a plain objc_retain.
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
- "objc_retain since the operand is not a return value.\n"
- "Old = "
- << *RetainRV << "\n");
-
- Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
- cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
-
- LLVM_DEBUG(dbgs() << "New = " << *RetainRV << "\n");
-
- return false;
-}
-
-bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall(
- Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
- Instruction *Inst, const Value *&Arg, ARCInstKind Class,
- Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg) {
- // Must be in the same basic block.
- assert(Inst->getParent() == AutoreleaseRV->getParent());
-
- // Must operate on the same root.
- Arg = GetArgRCIdentityRoot(Inst);
- AutoreleaseRVArg = GetArgRCIdentityRoot(AutoreleaseRV);
- if (Arg != AutoreleaseRVArg) {
- // If there isn't an exact match, check if we have equivalent PHIs.
- const PHINode *PN = dyn_cast<PHINode>(Arg);
- if (!PN)
- return false;
-
- SmallVector<const Value *, 4> ArgUsers;
- getEquivalentPHIs(*PN, ArgUsers);
+ AU.addRequired<ObjCARCAAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ // ARC optimization doesn't currently split critical edges.
+ AU.setPreservesCFG();
+}
+
+/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
+/// not a return value.
+bool
+ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
+ // Check for the argument being from an immediately preceding call or invoke.
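+ // We walk forward from that call, skipping no-op instructions, to see whether
+ // this retainRV immediately follows it; if it does, the operand really is a
+ // just-returned value and the retainRV is left alone.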
+ const Value *Arg = GetArgRCIdentityRoot(RetainRV);
+ if (const Instruction *Call = dyn_cast<CallBase>(Arg)) {
+ if (Call->getParent() == RetainRV->getParent()) {
+ BasicBlock::const_iterator I(Call);
+ ++I;
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I == RetainRV)
+ return false;
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ BasicBlock *RetainRVParent = RetainRV->getParent();
+ if (II->getNormalDest() == RetainRVParent) {
+ BasicBlock::const_iterator I = RetainRVParent->begin();
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I == RetainRV)
+ return false;
+ }
+ }
+ }
+
+ // Turn it into a plain objc_retain.
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
+ "objc_retain since the operand is not a return value.\n"
+ "Old = "
+ << *RetainRV << "\n");
+
+ Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
+
+ LLVM_DEBUG(dbgs() << "New = " << *RetainRV << "\n");
+
+ return false;
+}
+
+bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall(
+ Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ Instruction *Inst, const Value *&Arg, ARCInstKind Class,
+ Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg) {
+ // Must be in the same basic block.
+ assert(Inst->getParent() == AutoreleaseRV->getParent());
+
+ // Must operate on the same root.
+ Arg = GetArgRCIdentityRoot(Inst);
+ AutoreleaseRVArg = GetArgRCIdentityRoot(AutoreleaseRV);
+ if (Arg != AutoreleaseRVArg) {
+ // If there isn't an exact match, check if we have equivalent PHIs.
+ const PHINode *PN = dyn_cast<PHINode>(Arg);
+ if (!PN)
+ return false;
+
+ SmallVector<const Value *, 4> ArgUsers;
+ getEquivalentPHIs(*PN, ArgUsers);
if (!llvm::is_contained(ArgUsers, AutoreleaseRVArg))
- return false;
- }
-
- // Okay, this is a match. Merge them.
- ++NumPeeps;
- LLVM_DEBUG(dbgs() << "Found inlined objc_autoreleaseReturnValue '"
- << *AutoreleaseRV << "' paired with '" << *Inst << "'\n");
-
- // Delete the RV pair, starting with the AutoreleaseRV.
- AutoreleaseRV->replaceAllUsesWith(
- cast<CallInst>(AutoreleaseRV)->getArgOperand(0));
- Changed = true;
- EraseInstruction(AutoreleaseRV);
- if (Class == ARCInstKind::RetainRV) {
- // AutoreleaseRV and RetainRV cancel out. Delete the RetainRV.
- Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0));
- EraseInstruction(Inst);
- return true;
- }
-
- // ClaimRV is a frontend peephole for RetainRV + Release. Since the
- // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release.
- assert(Class == ARCInstKind::ClaimRV);
- Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0);
- CallInst *Release = CallInst::Create(
- EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst);
- assert(IsAlwaysTail(ARCInstKind::ClaimRV) &&
- "Expected ClaimRV to be safe to tail call");
- Release->setTailCall();
- Inst->replaceAllUsesWith(CallArg);
- EraseInstruction(Inst);
-
- // Run the normal optimizations on Release.
- OptimizeIndividualCallImpl(F, BlockColors, Release, ARCInstKind::Release,
- Arg);
- return true;
-}
-
-/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
-/// used as a return value.
-void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
- Instruction *AutoreleaseRV,
- ARCInstKind &Class) {
- // Check for a return of the pointer value.
- const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV);
-
- // If the argument is ConstantPointerNull or UndefValue, its other users
- // aren't actually interesting to look at.
- if (isa<ConstantData>(Ptr))
- return;
-
- SmallVector<const Value *, 2> Users;
- Users.push_back(Ptr);
-
- // Add PHIs that are equivalent to Ptr to Users.
- if (const PHINode *PN = dyn_cast<PHINode>(Ptr))
- getEquivalentPHIs(*PN, Users);
-
- do {
- Ptr = Users.pop_back_val();
- for (const User *U : Ptr->users()) {
- if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV)
- return;
- if (isa<BitCastInst>(U))
- Users.push_back(U);
- }
- } while (!Users.empty());
-
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(
- dbgs() << "Transforming objc_autoreleaseReturnValue => "
- "objc_autorelease since its operand is not used as a return "
- "value.\n"
- "Old = "
- << *AutoreleaseRV << "\n");
-
- CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
- Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
- AutoreleaseRVCI->setCalledFunction(NewDecl);
- AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
- Class = ARCInstKind::Autorelease;
-
- LLVM_DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
-}
-
-namespace {
-Instruction *
-CloneCallInstForBB(CallInst &CI, BasicBlock &BB,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- SmallVector<OperandBundleDef, 1> OpBundles;
- for (unsigned I = 0, E = CI.getNumOperandBundles(); I != E; ++I) {
- auto Bundle = CI.getOperandBundleAt(I);
- // Funclets will be reassociated in the future.
- if (Bundle.getTagID() == LLVMContext::OB_funclet)
- continue;
- OpBundles.emplace_back(Bundle);
- }
-
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(&BB)->second;
- assert(CV.size() == 1 && "non-unique color for block!");
- Instruction *EHPad = CV.front()->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
-
- return CallInst::Create(&CI, OpBundles);
-}
-}
-
-/// Visit each call, one at a time, and make simplifications without doing any
-/// additional analysis.
-void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
- // Reset all the flags in preparation for recomputing them.
- UsedInThisFunction = 0;
-
- DenseMap<BasicBlock *, ColorVector> BlockColors;
- if (F.hasPersonalityFn() &&
- isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- BlockColors = colorEHFunclets(F);
-
- // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired
- // with RetainRV and ClaimRV.
- Instruction *DelayedAutoreleaseRV = nullptr;
- const Value *DelayedAutoreleaseRVArg = nullptr;
- auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) {
- assert(!DelayedAutoreleaseRV || !AutoreleaseRV);
- DelayedAutoreleaseRV = AutoreleaseRV;
- DelayedAutoreleaseRVArg = nullptr;
- };
- auto optimizeDelayedAutoreleaseRV = [&]() {
- if (!DelayedAutoreleaseRV)
- return;
- OptimizeIndividualCallImpl(F, BlockColors, DelayedAutoreleaseRV,
- ARCInstKind::AutoreleaseRV,
- DelayedAutoreleaseRVArg);
- setDelayedAutoreleaseRV(nullptr);
- };
- auto shouldDelayAutoreleaseRV = [&](Instruction *NonARCInst) {
- // Nothing to delay, but we may as well skip the logic below.
- if (!DelayedAutoreleaseRV)
- return true;
-
- // If we hit the end of the basic block we're not going to find an RV-pair.
- // Stop delaying.
- if (NonARCInst->isTerminator())
- return false;
-
- // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and
- // ClaimRV, it's probably safe to skip over even opaque function calls
- // here since OptimizeInlinedAutoreleaseRVCall will confirm that they
- // have the same RCIdentityRoot. However, what really matters is
- // skipping instructions or intrinsics that the inliner could leave behind;
- // be conservative for now and don't skip over opaque calls, which could
- // potentially include other ARC calls.
- auto *CB = dyn_cast<CallBase>(NonARCInst);
- if (!CB)
- return true;
- return CB->getIntrinsicID() != Intrinsic::not_intrinsic;
- };
-
- // Visit all objc_* calls in F.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
-
- ARCInstKind Class = GetBasicARCInstKind(Inst);
-
- // Skip this loop if this instruction isn't itself an ARC intrinsic.
- const Value *Arg = nullptr;
- switch (Class) {
- default:
- optimizeDelayedAutoreleaseRV();
- break;
- case ARCInstKind::CallOrUser:
- case ARCInstKind::User:
- case ARCInstKind::None:
- // This is a non-ARC instruction. If we're delaying an AutoreleaseRV,
- // check if it's safe to skip over it; if not, optimize the AutoreleaseRV
- // now.
- if (!shouldDelayAutoreleaseRV(Inst))
- optimizeDelayedAutoreleaseRV();
- continue;
- case ARCInstKind::AutoreleaseRV:
- optimizeDelayedAutoreleaseRV();
- setDelayedAutoreleaseRV(Inst);
- continue;
- case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
- if (DelayedAutoreleaseRV) {
- // We have a potential RV pair. Check if they cancel out.
- if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class,
- DelayedAutoreleaseRV,
- DelayedAutoreleaseRVArg)) {
- setDelayedAutoreleaseRV(nullptr);
- continue;
- }
- optimizeDelayedAutoreleaseRV();
- }
- break;
- }
-
- OptimizeIndividualCallImpl(F, BlockColors, Inst, Class, Arg);
- }
-
- // Catch the final delayed AutoreleaseRV.
- optimizeDelayedAutoreleaseRV();
-}
-
-/// This function returns true if the value is inert. An ObjC ARC runtime call
-/// taking an inert operand can be safely deleted.
-static bool isInertARCValue(Value *V, SmallPtrSet<Value *, 1> &VisitedPhis) {
- V = V->stripPointerCasts();
-
- if (IsNullOrUndef(V))
- return true;
-
- // See if this is a global variable annotated with the 'objc_arc_inert'
- // attribute.
- if (auto *GV = dyn_cast<GlobalVariable>(V))
- if (GV->hasAttribute("objc_arc_inert"))
- return true;
-
- if (auto PN = dyn_cast<PHINode>(V)) {
- // Ignore this phi if it has already been discovered.
- if (!VisitedPhis.insert(PN).second)
- return true;
- // Look through the phi's incoming values.
- for (Value *Opnd : PN->incoming_values())
- if (!isInertARCValue(Opnd, VisitedPhis))
- return false;
- return true;
- }
-
- return false;
-}
-
-void ObjCARCOpt::OptimizeIndividualCallImpl(
- Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
- Instruction *Inst, ARCInstKind Class, const Value *Arg) {
- LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
-
- // We can delete this call if it takes an inert value.
- SmallPtrSet<Value *, 1> VisitedPhis;
-
- if (IsNoopOnGlobal(Class))
- if (isInertARCValue(Inst->getOperand(0), VisitedPhis)) {
- if (!Inst->getType()->isVoidTy())
- Inst->replaceAllUsesWith(Inst->getOperand(0));
- Inst->eraseFromParent();
- Changed = true;
- return;
- }
-
- switch (Class) {
- default:
- break;
-
- // Delete no-op casts. These function calls have special semantics, but
- // the semantics are entirely implemented via lowering in the front-end,
- // so by the time they reach the optimizer, they are just no-op calls
- // which return their argument.
- //
- // There are gray areas here, as the ability to cast reference-counted
- // pointers to raw void* and back allows code to break ARC assumptions,
- // however these are currently considered to be unimportant.
- case ARCInstKind::NoopCast:
- Changed = true;
- ++NumNoops;
- LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
- EraseInstruction(Inst);
- return;
-
- // If the pointer-to-weak-pointer is null, it's undefined behavior.
- case ARCInstKind::StoreWeak:
- case ARCInstKind::LoadWeak:
- case ARCInstKind::LoadWeakRetained:
- case ARCInstKind::InitWeak:
- case ARCInstKind::DestroyWeak: {
- CallInst *CI = cast<CallInst>(Inst);
- if (IsNullOrUndef(CI->getArgOperand(0))) {
- Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
- Value *NewValue = UndefValue::get(CI->getType());
- LLVM_DEBUG(
- dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = "
- << *CI << "\nNew = " << *NewValue << "\n");
- CI->replaceAllUsesWith(NewValue);
- CI->eraseFromParent();
- return;
- }
- break;
- }
- case ARCInstKind::CopyWeak:
- case ARCInstKind::MoveWeak: {
- CallInst *CI = cast<CallInst>(Inst);
- if (IsNullOrUndef(CI->getArgOperand(0)) ||
- IsNullOrUndef(CI->getArgOperand(1))) {
- Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
-
- Value *NewValue = UndefValue::get(CI->getType());
- LLVM_DEBUG(
- dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = "
- << *CI << "\nNew = " << *NewValue << "\n");
-
- CI->replaceAllUsesWith(NewValue);
- CI->eraseFromParent();
- return;
- }
- break;
- }
- case ARCInstKind::RetainRV:
- if (OptimizeRetainRVCall(F, Inst))
- return;
- break;
- case ARCInstKind::AutoreleaseRV:
- OptimizeAutoreleaseRVCall(F, Inst, Class);
- break;
- }
-
- // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
- if (IsAutorelease(Class) && Inst->use_empty()) {
- CallInst *Call = cast<CallInst>(Inst);
- const Value *Arg = Call->getArgOperand(0);
- Arg = FindSingleUseIdentifiedObject(Arg);
- if (Arg) {
- Changed = true;
- ++NumAutoreleases;
-
- // Create the declaration lazily.
- LLVMContext &C = Inst->getContext();
-
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
- CallInst *NewCall =
- CallInst::Create(Decl, Call->getArgOperand(0), "", Call);
- NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
- MDNode::get(C, None));
-
- LLVM_DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
- "since x is otherwise unused.\nOld: "
- << *Call << "\nNew: " << *NewCall << "\n");
-
- EraseInstruction(Call);
- Inst = NewCall;
- Class = ARCInstKind::Release;
- }
- }
-
- // For functions which can never be passed stack arguments, add
- // a tail keyword.
- if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) {
- Changed = true;
- LLVM_DEBUG(
- dbgs() << "Adding tail keyword to function since it can never be "
- "passed stack args: "
- << *Inst << "\n");
- cast<CallInst>(Inst)->setTailCall();
- }
-
- // Ensure that functions that can never have a "tail" keyword due to the
- // semantics of ARC truly do not do so.
- if (IsNeverTail(Class)) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst
- << "\n");
- cast<CallInst>(Inst)->setTailCall(false);
- }
-
- // Set nounwind as needed.
- if (IsNoThrow(Class)) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
- << "\n");
- cast<CallInst>(Inst)->setDoesNotThrow();
- }
-
- // Note: This catches instructions unrelated to ARC.
- if (!IsNoopOnNull(Class)) {
- UsedInThisFunction |= 1 << unsigned(Class);
- return;
- }
-
- // If we haven't already looked up the root, look it up now.
- if (!Arg)
- Arg = GetArgRCIdentityRoot(Inst);
-
- // ARC calls with null are no-ops. Delete them.
- if (IsNullOrUndef(Arg)) {
- Changed = true;
- ++NumNoops;
- LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
- << "\n");
- EraseInstruction(Inst);
- return;
- }
-
- // Keep track of which of retain, release, autorelease, and retain_block
- // are actually present in this function.
- UsedInThisFunction |= 1 << unsigned(Class);
-
- // If Arg is a PHI, one or more of the PHI's incoming values are null, the
- // call is control-equivalent to the PHI, there are no relevant side effects
- // between the PHI and the call, and the call is either not a release or is a
- // release carrying the clang.imprecise_release tag, then the call could be
- // pushed up to just those paths with non-null incoming values. For now,
- // don't bother splitting critical edges for this.
- if (Class == ARCInstKind::Release &&
- !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease)))
- return;
-
- SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
- Worklist.push_back(std::make_pair(Inst, Arg));
- do {
- std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
- Inst = Pair.first;
- Arg = Pair.second;
-
- const PHINode *PN = dyn_cast<PHINode>(Arg);
- if (!PN)
- continue;
-
- // Determine if the PHI has any null operands, or any incoming
- // critical edges.
- bool HasNull = false;
- bool HasCriticalEdges = false;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
- if (IsNullOrUndef(Incoming))
- HasNull = true;
- else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() !=
- 1) {
- HasCriticalEdges = true;
- break;
- }
- }
- // If we have null operands and no critical edges, optimize.
- if (HasCriticalEdges)
- continue;
- if (!HasNull)
- continue;
-
+ return false;
+ }
+
+ // Okay, this is a match. Merge them.
+ ++NumPeeps;
+ LLVM_DEBUG(dbgs() << "Found inlined objc_autoreleaseReturnValue '"
+ << *AutoreleaseRV << "' paired with '" << *Inst << "'\n");
+
+ // Delete the RV pair, starting with the AutoreleaseRV.
+ AutoreleaseRV->replaceAllUsesWith(
+ cast<CallInst>(AutoreleaseRV)->getArgOperand(0));
+ Changed = true;
+ EraseInstruction(AutoreleaseRV);
+ if (Class == ARCInstKind::RetainRV) {
+ // AutoreleaseRV and RetainRV cancel out. Delete the RetainRV.
+ Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0));
+ EraseInstruction(Inst);
+ return true;
+ }
+
+ // ClaimRV is a frontend peephole for RetainRV + Release. Since the
+ // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release.
+ assert(Class == ARCInstKind::ClaimRV);
+ Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0);
+ CallInst *Release = CallInst::Create(
+ EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst);
+ assert(IsAlwaysTail(ARCInstKind::ClaimRV) &&
+ "Expected ClaimRV to be safe to tail call");
+ Release->setTailCall();
+ Inst->replaceAllUsesWith(CallArg);
+ EraseInstruction(Inst);
+
+ // Run the normal optimizations on Release.
+ OptimizeIndividualCallImpl(F, BlockColors, Release, ARCInstKind::Release,
+ Arg);
+ return true;
+}
+
+/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
+/// used as a return value.
+void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
+ Instruction *AutoreleaseRV,
+ ARCInstKind &Class) {
+ // Check for a return of the pointer value.
+ const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV);
+
+ // If the argument is ConstantPointerNull or UndefValue, its other users
+ // aren't actually interesting to look at.
+ if (isa<ConstantData>(Ptr))
+ return;
+
+ SmallVector<const Value *, 2> Users;
+ Users.push_back(Ptr);
+
+ // Add PHIs that are equivalent to Ptr to Users.
+ if (const PHINode *PN = dyn_cast<PHINode>(Ptr))
+ getEquivalentPHIs(*PN, Users);
+
+ do {
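+ // Walk the users, looking through bitcasts and the equivalent PHIs collected
+ // above. If any user is a return or a retainRV, the result is being used as
+ // a return value, so leave the autoreleaseRV alone.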
+ Ptr = Users.pop_back_val();
+ for (const User *U : Ptr->users()) {
+ if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV)
+ return;
+ if (isa<BitCastInst>(U))
+ Users.push_back(U);
+ }
+ } while (!Users.empty());
+
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_autoreleaseReturnValue => "
+ "objc_autorelease since its operand is not used as a return "
+ "value.\n"
+ "Old = "
+ << *AutoreleaseRV << "\n");
+
+ CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
+ Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
+ AutoreleaseRVCI->setCalledFunction(NewDecl);
+ AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
+ Class = ARCInstKind::Autorelease;
+
+ LLVM_DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+}
+
+namespace {
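+// Clone CI for insertion into BB: copy its operand bundles except any funclet
+// bundle, then, if the function uses a scoped (Windows) EH personality, attach
+// a funclet bundle naming BB's own EH pad color.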
+Instruction *
+CloneCallInstForBB(CallInst &CI, BasicBlock &BB,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned I = 0, E = CI.getNumOperandBundles(); I != E; ++I) {
+ auto Bundle = CI.getOperandBundleAt(I);
+ // Funclets will be reassociated in the future.
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&BB)->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(&CI, OpBundles);
+}
+}
+
+/// Visit each call, one at a time, and make simplifications without doing any
+/// additional analysis.
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
+ // Reset all the flags in preparation for recomputing them.
+ UsedInThisFunction = 0;
+
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired
+ // with RetainRV and ClaimRV.
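+ // The shape being matched is, roughly:
+ // %v = call i8* @objc_autoreleaseReturnValue(i8* %x) ; from an inlined callee
+ // %r = call i8* @objc_retainAutoreleasedReturnValue(i8* %v)
+ // where the two calls can cancel each other out.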
+ Instruction *DelayedAutoreleaseRV = nullptr;
+ const Value *DelayedAutoreleaseRVArg = nullptr;
+ auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) {
+ assert(!DelayedAutoreleaseRV || !AutoreleaseRV);
+ DelayedAutoreleaseRV = AutoreleaseRV;
+ DelayedAutoreleaseRVArg = nullptr;
+ };
+ auto optimizeDelayedAutoreleaseRV = [&]() {
+ if (!DelayedAutoreleaseRV)
+ return;
+ OptimizeIndividualCallImpl(F, BlockColors, DelayedAutoreleaseRV,
+ ARCInstKind::AutoreleaseRV,
+ DelayedAutoreleaseRVArg);
+ setDelayedAutoreleaseRV(nullptr);
+ };
+ auto shouldDelayAutoreleaseRV = [&](Instruction *NonARCInst) {
+ // Nothing to delay, but we may as well skip the logic below.
+ if (!DelayedAutoreleaseRV)
+ return true;
+
+ // If we hit the end of the basic block we're not going to find an RV-pair.
+ // Stop delaying.
+ if (NonARCInst->isTerminator())
+ return false;
+
+ // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and
+ // ClaimRV, it's probably safe to skip over even opaque function calls
+ // here since OptimizeInlinedAutoreleaseRVCall will confirm that they
+ // have the same RCIdentityRoot. However, what really matters is
+ // skipping instructions or intrinsics that the inliner could leave behind;
+ // be conservative for now and don't skip over opaque calls, which could
+ // potentially include other ARC calls.
+ auto *CB = dyn_cast<CallBase>(NonARCInst);
+ if (!CB)
+ return true;
+ return CB->getIntrinsicID() != Intrinsic::not_intrinsic;
+ };
+
+ // Visit all objc_* calls in F.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+
+ // Skip this loop if this instruction isn't itself an ARC intrinsic.
+ const Value *Arg = nullptr;
+ switch (Class) {
+ default:
+ optimizeDelayedAutoreleaseRV();
+ break;
+ case ARCInstKind::CallOrUser:
+ case ARCInstKind::User:
+ case ARCInstKind::None:
+ // This is a non-ARC instruction. If we're delaying an AutoreleaseRV,
+ // check if it's safe to skip over it; if not, optimize the AutoreleaseRV
+ // now.
+ if (!shouldDelayAutoreleaseRV(Inst))
+ optimizeDelayedAutoreleaseRV();
+ continue;
+ case ARCInstKind::AutoreleaseRV:
+ optimizeDelayedAutoreleaseRV();
+ setDelayedAutoreleaseRV(Inst);
+ continue;
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
+ if (DelayedAutoreleaseRV) {
+ // We have a potential RV pair. Check if they cancel out.
+ if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class,
+ DelayedAutoreleaseRV,
+ DelayedAutoreleaseRVArg)) {
+ setDelayedAutoreleaseRV(nullptr);
+ continue;
+ }
+ optimizeDelayedAutoreleaseRV();
+ }
+ break;
+ }
+
+ OptimizeIndividualCallImpl(F, BlockColors, Inst, Class, Arg);
+ }
+
+ // Catch the final delayed AutoreleaseRV.
+ optimizeDelayedAutoreleaseRV();
+}
+
+/// This function returns true if the value is inert. An ObjC ARC runtime call
+/// taking an inert operand can be safely deleted.
+static bool isInertARCValue(Value *V, SmallPtrSet<Value *, 1> &VisitedPhis) {
+ V = V->stripPointerCasts();
+
+ if (IsNullOrUndef(V))
+ return true;
+
+ // See if this is a global variable annotated with the 'objc_arc_inert'
+ // attribute.
+ if (auto *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->hasAttribute("objc_arc_inert"))
+ return true;
+
+ if (auto PN = dyn_cast<PHINode>(V)) {
+ // Ignore this phi if it has already been discovered.
+ if (!VisitedPhis.insert(PN).second)
+ return true;
+ // Look through the phi's incoming values.
+ for (Value *Opnd : PN->incoming_values())
+ if (!isInertARCValue(Opnd, VisitedPhis))
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+void ObjCARCOpt::OptimizeIndividualCallImpl(
+ Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ Instruction *Inst, ARCInstKind Class, const Value *Arg) {
+ LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
+
+ // We can delete this call if it takes an inert value.
+ SmallPtrSet<Value *, 1> VisitedPhis;
+
+ if (IsNoopOnGlobal(Class))
+ if (isInertARCValue(Inst->getOperand(0), VisitedPhis)) {
+ if (!Inst->getType()->isVoidTy())
+ Inst->replaceAllUsesWith(Inst->getOperand(0));
+ Inst->eraseFromParent();
+ Changed = true;
+ return;
+ }
+
+ switch (Class) {
+ default:
+ break;
+
+ // Delete no-op casts. These function calls have special semantics, but
+ // the semantics are entirely implemented via lowering in the front-end,
+ // so by the time they reach the optimizer, they are just no-op calls
+ // which return their argument.
+ //
+ // There are gray areas here, as the ability to cast reference-counted
+ // pointers to raw void* and back allows code to break ARC assumptions,
+ // however these are currently considered to be unimportant.
+ case ARCInstKind::NoopCast:
+ Changed = true;
+ ++NumNoops;
+ LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
+ EraseInstruction(Inst);
+ return;
+
+ // If the pointer-to-weak-pointer is null, it's undefined behavior.
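+ // We make that explicit by storing undef through the null pointer just before
+ // the call, then replacing the call's result with undef and deleting it.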
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::LoadWeak:
+ case ARCInstKind::LoadWeakRetained:
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::DestroyWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty), CI);
+ Value *NewValue = UndefValue::get(CI->getType());
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ case ARCInstKind::CopyWeak:
+ case ARCInstKind::MoveWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0)) ||
+ IsNullOrUndef(CI->getArgOperand(1))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty), CI);
+
+ Value *NewValue = UndefValue::get(CI->getType());
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
+
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ case ARCInstKind::RetainRV:
+ if (OptimizeRetainRVCall(F, Inst))
+ return;
+ break;
+ case ARCInstKind::AutoreleaseRV:
+ OptimizeAutoreleaseRVCall(F, Inst, Class);
+ break;
+ }
+
+ // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
+ if (IsAutorelease(Class) && Inst->use_empty()) {
+ CallInst *Call = cast<CallInst>(Inst);
+ const Value *Arg = Call->getArgOperand(0);
+ Arg = FindSingleUseIdentifiedObject(Arg);
+ if (Arg) {
+ Changed = true;
+ ++NumAutoreleases;
+
+ // Create the declaration lazily.
+ LLVMContext &C = Inst->getContext();
+
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
+ CallInst *NewCall =
+ CallInst::Create(Decl, Call->getArgOperand(0), "", Call);
+ NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
+ MDNode::get(C, None));
+
+ LLVM_DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
+ "since x is otherwise unused.\nOld: "
+ << *Call << "\nNew: " << *NewCall << "\n");
+
+ EraseInstruction(Call);
+ Inst = NewCall;
+ Class = ARCInstKind::Release;
+ }
+ }
+
+ // For functions which can never be passed stack arguments, add
+ // a tail keyword.
+ if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) {
+ Changed = true;
+ LLVM_DEBUG(
+ dbgs() << "Adding tail keyword to function since it can never be "
+ "passed stack args: "
+ << *Inst << "\n");
+ cast<CallInst>(Inst)->setTailCall();
+ }
+
+ // Ensure that functions that can never have a "tail" keyword due to the
+ // semantics of ARC truly do not do so.
+ if (IsNeverTail(Class)) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst
+ << "\n");
+ cast<CallInst>(Inst)->setTailCall(false);
+ }
+
+ // Set nounwind as needed.
+ if (IsNoThrow(Class)) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
+ << "\n");
+ cast<CallInst>(Inst)->setDoesNotThrow();
+ }
+
+ // Note: This catches instructions unrelated to ARC.
+ if (!IsNoopOnNull(Class)) {
+ UsedInThisFunction |= 1 << unsigned(Class);
+ return;
+ }
+
+ // If we haven't already looked up the root, look it up now.
+ if (!Arg)
+ Arg = GetArgRCIdentityRoot(Inst);
+
+ // ARC calls with null are no-ops. Delete them.
+ if (IsNullOrUndef(Arg)) {
+ Changed = true;
+ ++NumNoops;
+ LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
+ << "\n");
+ EraseInstruction(Inst);
+ return;
+ }
+
+ // Keep track of which of retain, release, autorelease, and retain_block
+ // are actually present in this function.
+ UsedInThisFunction |= 1 << unsigned(Class);
+
+ // If Arg is a PHI, one or more of the PHI's incoming values are null, the
+ // call is control-equivalent to the PHI, there are no relevant side effects
+ // between the PHI and the call, and the call is either not a release or is a
+ // release carrying the clang.imprecise_release tag, then the call could be
+ // pushed up to just those paths with non-null incoming values. For now,
+ // don't bother splitting critical edges for this.
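+ // For example, given
+ // %p = phi i8* [ null, %bb1 ], [ %obj, %bb2 ]
+ // call void @objc_release(i8* %p), !clang.imprecise_release !0
+ // the release can be cloned into %bb2 alone, since releasing null is a no-op.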
+ if (Class == ARCInstKind::Release &&
+ !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease)))
+ return;
+
+ SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
+ Worklist.push_back(std::make_pair(Inst, Arg));
+ do {
+ std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
+ Inst = Pair.first;
+ Arg = Pair.second;
+
+ const PHINode *PN = dyn_cast<PHINode>(Arg);
+ if (!PN)
+ continue;
+
+ // Determine if the PHI has any null operands, or any incoming
+ // critical edges.
+ bool HasNull = false;
+ bool HasCriticalEdges = false;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
+ if (IsNullOrUndef(Incoming))
+ HasNull = true;
+ else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() !=
+ 1) {
+ HasCriticalEdges = true;
+ break;
+ }
+ }
+ // If we have null operands and no critical edges, optimize.
+ if (HasCriticalEdges)
+ continue;
+ if (!HasNull)
+ continue;
+
Instruction *DepInst = nullptr;
-
- // Check that there is nothing that cares about the reference
- // count between the call and the phi.
- switch (Class) {
- case ARCInstKind::Retain:
- case ARCInstKind::RetainBlock:
- // These can always be moved up.
- break;
- case ARCInstKind::Release:
- // These can't be moved across things that care about the retain
- // count.
+
+ // Check that there is nothing that cares about the reference
+ // count between the call and the phi.
+ switch (Class) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainBlock:
+ // These can always be moved up.
+ break;
+ case ARCInstKind::Release:
+ // These can't be moved across things that care about the retain
+ // count.
DepInst = findSingleDependency(NeedsPositiveRetainCount, Arg,
Inst->getParent(), Inst, PA);
- break;
- case ARCInstKind::Autorelease:
- // These can't be moved across autorelease pool scope boundaries.
+ break;
+ case ARCInstKind::Autorelease:
+ // These can't be moved across autorelease pool scope boundaries.
DepInst = findSingleDependency(AutoreleasePoolBoundary, Arg,
Inst->getParent(), Inst, PA);
- break;
- case ARCInstKind::ClaimRV:
- case ARCInstKind::RetainRV:
- case ARCInstKind::AutoreleaseRV:
- // Don't move these; the RV optimization depends on the autoreleaseRV
- // being tail called, and the retainRV being immediately after a call
- // (which might still happen if we get lucky with codegen layout, but
- // it's not worth taking the chance).
- continue;
- default:
- llvm_unreachable("Invalid dependence flavor");
- }
-
+ break;
+ case ARCInstKind::ClaimRV:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::AutoreleaseRV:
+ // Don't move these; the RV optimization depends on the autoreleaseRV
+ // being tail called, and the retainRV being immediately after a call
+ // (which might still happen if we get lucky with codegen layout, but
+ // it's not worth taking the chance).
+ continue;
+ default:
+ llvm_unreachable("Invalid dependence flavor");
+ }
+
if (DepInst != PN)
- continue;
-
- Changed = true;
- ++NumPartialNoops;
- // Clone the call into each predecessor that has a non-null value.
- CallInst *CInst = cast<CallInst>(Inst);
- Type *ParamTy = CInst->getArgOperand(0)->getType();
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
- if (IsNullOrUndef(Incoming))
- continue;
- Value *Op = PN->getIncomingValue(i);
- Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
- CallInst *Clone = cast<CallInst>(
- CloneCallInstForBB(*CInst, *InsertPos->getParent(), BlockColors));
- if (Op->getType() != ParamTy)
- Op = new BitCastInst(Op, ParamTy, "", InsertPos);
- Clone->setArgOperand(0, Op);
- Clone->insertBefore(InsertPos);
-
- LLVM_DEBUG(dbgs() << "Cloning " << *CInst << "\n"
- "And inserting clone at "
- << *InsertPos << "\n");
- Worklist.push_back(std::make_pair(Clone, Incoming));
- }
- // Erase the original call.
- LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
- EraseInstruction(CInst);
- } while (!Worklist.empty());
-}
-
-/// If we have a top down pointer in the S_Use state, make sure that there are
-/// no CFG hazards by checking the states of various bottom up pointers.
-static void CheckForUseCFGHazard(const Sequence SuccSSeq,
- const bool SuccSRRIKnownSafe,
- TopDownPtrState &S,
- bool &SomeSuccHasSame,
- bool &AllSuccsHaveSame,
- bool &NotAllSeqEqualButKnownSafe,
- bool &ShouldContinue) {
- switch (SuccSSeq) {
- case S_CanRelease: {
- if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) {
- S.ClearSequenceProgress();
- break;
- }
- S.SetCFGHazardAfflicted(true);
- ShouldContinue = true;
- break;
- }
- case S_Use:
- SomeSuccHasSame = true;
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
- AllSuccsHaveSame = false;
- else
- NotAllSeqEqualButKnownSafe = true;
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- case S_None:
- llvm_unreachable("This should have been handled earlier.");
- }
-}
-
-/// If we have a Top Down pointer in the S_CanRelease state, make sure that
-/// there are no CFG hazards by checking the states of various bottom up
-/// pointers.
-static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
- const bool SuccSRRIKnownSafe,
- TopDownPtrState &S,
- bool &SomeSuccHasSame,
- bool &AllSuccsHaveSame,
- bool &NotAllSeqEqualButKnownSafe) {
- switch (SuccSSeq) {
- case S_CanRelease:
- SomeSuccHasSame = true;
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- case S_Use:
- if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
- AllSuccsHaveSame = false;
- else
- NotAllSeqEqualButKnownSafe = true;
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- case S_None:
- llvm_unreachable("This should have been handled earlier.");
- }
-}
-
-/// Check for critical edges, loop boundaries, irreducible control flow, or
-/// other CFG structures where moving code across the edge would result in it
-/// being executed more.
-void
-ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BBState &MyStates) const {
- // If any top-down local-use or possible-dec has a succ which is earlier in
- // the sequence, forget it.
- for (auto I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end();
- I != E; ++I) {
- TopDownPtrState &S = I->second;
- const Sequence Seq = I->second.GetSeq();
-
- // We only care about S_Retain, S_CanRelease, and S_Use.
- if (Seq == S_None)
- continue;
-
- // Make sure that, if extra top down states are added in the future, this
- // code is updated to handle them.
- assert((Seq == S_Retain || Seq == S_CanRelease || Seq == S_Use) &&
- "Unknown top down sequence state.");
-
- const Value *Arg = I->first;
- bool SomeSuccHasSame = false;
- bool AllSuccsHaveSame = true;
- bool NotAllSeqEqualButKnownSafe = false;
-
- for (const BasicBlock *Succ : successors(BB)) {
- // If VisitBottomUp has pointer information for this successor, take
- // what we know about it.
- const DenseMap<const BasicBlock *, BBState>::iterator BBI =
- BBStates.find(Succ);
- assert(BBI != BBStates.end());
- const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
- const Sequence SuccSSeq = SuccS.GetSeq();
-
- // If, bottom up, the pointer is in an S_None state, clear the sequence
- // progress, since the sequence in the bottom up state finished, which
- // suggests a mismatch between retains and releases. This is true for
- // all three cases that we are handling here: S_Retain, S_Use, and
- // S_CanRelease.
- if (SuccSSeq == S_None) {
- S.ClearSequenceProgress();
- continue;
- }
-
- // If we have S_Use or S_CanRelease, perform our CFG hazard checks.
- const bool SuccSRRIKnownSafe = SuccS.IsKnownSafe();
-
- // *NOTE* We do not use Seq from above here since we are allowing for
- // S.GetSeq() to change while we are visiting basic blocks.
- switch(S.GetSeq()) {
- case S_Use: {
- bool ShouldContinue = false;
- CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame,
- AllSuccsHaveSame, NotAllSeqEqualButKnownSafe,
- ShouldContinue);
- if (ShouldContinue)
- continue;
- break;
- }
- case S_CanRelease:
- CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
- SomeSuccHasSame, AllSuccsHaveSame,
- NotAllSeqEqualButKnownSafe);
- break;
- case S_Retain:
- case S_None:
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- break;
- }
- }
-
- // If the state at the other end of any of the successor edges
- // matches the current state, require all edges to match. This
- // guards against loops in the middle of a sequence.
- if (SomeSuccHasSame && !AllSuccsHaveSame) {
- S.ClearSequenceProgress();
- } else if (NotAllSeqEqualButKnownSafe) {
- // If we would have cleared the state were it not for the fact that we are
- // known safe, stop code motion. This is because whether or not it is safe to
- // remove RR pairs via KnownSafe is an orthogonal concept to whether we
- // are allowed to perform code motion.
- S.SetCFGHazardAfflicted(true);
- }
- }
-}
-
-bool ObjCARCOpt::VisitInstructionBottomUp(
- Instruction *Inst, BasicBlock *BB, BlotMapVector<Value *, RRInfo> &Retains,
- BBState &MyStates) {
- bool NestingDetected = false;
- ARCInstKind Class = GetARCInstKind(Inst);
- const Value *Arg = nullptr;
-
- LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
-
- switch (Class) {
- case ARCInstKind::Release: {
- Arg = GetArgRCIdentityRoot(Inst);
-
- BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
- NestingDetected |= S.InitBottomUp(MDKindCache, Inst);
- break;
- }
- case ARCInstKind::RetainBlock:
- // In OptimizeIndividualCalls, we have strength reduced all optimizable
- // objc_retainBlocks to objc_retains. Thus at this point any
- // objc_retainBlocks that we see are not optimizable.
- break;
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV: {
- Arg = GetArgRCIdentityRoot(Inst);
- BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
- if (S.MatchWithRetain()) {
- // Don't do retain+release tracking for ARCInstKind::RetainRV, because
- // it's better to let it remain as the first instruction after a call.
- if (Class != ARCInstKind::RetainRV) {
- LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
- Retains[Inst] = S.GetRRInfo();
- }
- S.ClearSequenceProgress();
- }
- // A retain moving bottom up can be a use.
- break;
- }
- case ARCInstKind::AutoreleasepoolPop:
- // Conservatively, clear MyStates for all known pointers.
- MyStates.clearBottomUpPointers();
- return NestingDetected;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- // These are irrelevant.
- return NestingDetected;
- default:
- break;
- }
-
- // Consider any other possible effects of this instruction on each
- // pointer being tracked.
- for (auto MI = MyStates.bottom_up_ptr_begin(),
- ME = MyStates.bottom_up_ptr_end();
- MI != ME; ++MI) {
- const Value *Ptr = MI->first;
- if (Ptr == Arg)
- continue; // Handled above.
- BottomUpPtrState &S = MI->second;
-
- if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
- continue;
-
- S.HandlePotentialUse(BB, Inst, Ptr, PA, Class);
- }
-
- return NestingDetected;
-}
-
-bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
-
- bool NestingDetected = false;
- BBState &MyStates = BBStates[BB];
-
- // Merge the states from each successor to compute the initial state
- // for the current block.
- BBState::edge_iterator SI(MyStates.succ_begin()),
- SE(MyStates.succ_end());
- if (SI != SE) {
- const BasicBlock *Succ = *SI;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
- assert(I != BBStates.end());
- MyStates.InitFromSucc(I->second);
- ++SI;
- for (; SI != SE; ++SI) {
- Succ = *SI;
- I = BBStates.find(Succ);
- assert(I != BBStates.end());
- MyStates.MergeSucc(I->second);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Before:\n"
- << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
-
- // Visit all the instructions, bottom-up.
- for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
- Instruction *Inst = &*std::prev(I);
-
- // Invoke instructions are visited as part of their successors (below).
- if (isa<InvokeInst>(Inst))
- continue;
-
- LLVM_DEBUG(dbgs() << " Visiting " << *Inst << "\n");
-
- NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
-
- // Bail out if the number of pointers being tracked becomes too large so
- // that this pass can complete in a reasonable amount of time.
- if (MyStates.bottom_up_ptr_list_size() > MaxPtrStates) {
- DisableRetainReleasePairing = true;
- return false;
- }
- }
-
- // If there's a predecessor with an invoke, visit the invoke as if it were
- // part of this block, since we can't insert code after an invoke in its own
- // block, and we don't want to split critical edges.
- for (BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end()); PI != PE; ++PI) {
- BasicBlock *Pred = *PI;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
- NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
- }
-
- LLVM_DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
-
- return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
- DenseMap<Value *, RRInfo> &Releases,
- BBState &MyStates) {
- bool NestingDetected = false;
- ARCInstKind Class = GetARCInstKind(Inst);
- const Value *Arg = nullptr;
-
- LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
-
- switch (Class) {
- case ARCInstKind::RetainBlock:
- // In OptimizeIndividualCalls, we have strength reduced all optimizable
- // objc_retainBlocks to objc_retains. Thus at this point any
- // objc_retainBlocks that we see are not optimizable. We need to break since
- // a retain can be a potential use.
- break;
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV: {
- Arg = GetArgRCIdentityRoot(Inst);
- TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
- NestingDetected |= S.InitTopDown(Class, Inst);
- // A retain can be a potential use; proceed to the generic checking
- // code below.
- break;
- }
- case ARCInstKind::Release: {
- Arg = GetArgRCIdentityRoot(Inst);
- TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
- // Try to form a tentative pair in between this release instruction and the
- // top down pointers that we are tracking.
- if (S.MatchWithRelease(MDKindCache, Inst)) {
- // If we succeed, copy S's RRInfo into the Release -> {Retain Set
- // Map}. Then we clear S.
- LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
- Releases[Inst] = S.GetRRInfo();
- S.ClearSequenceProgress();
- }
- break;
- }
- case ARCInstKind::AutoreleasepoolPop:
- // Conservatively, clear MyStates for all known pointers.
- MyStates.clearTopDownPointers();
- return false;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- // These cannot be uses of the pointers we are tracking.
- return false;
- default:
- break;
- }
-
- // Consider any other possible effects of this instruction on each
- // pointer being tracked.
- for (auto MI = MyStates.top_down_ptr_begin(),
- ME = MyStates.top_down_ptr_end();
- MI != ME; ++MI) {
- const Value *Ptr = MI->first;
- if (Ptr == Arg)
- continue; // Handled above.
- TopDownPtrState &S = MI->second;
- if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
- continue;
-
- S.HandlePotentialUse(Inst, Ptr, PA, Class);
- }
-
- return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitTopDown(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- DenseMap<Value *, RRInfo> &Releases) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
- bool NestingDetected = false;
- BBState &MyStates = BBStates[BB];
-
- // Merge the states from each predecessor to compute the initial state
- // for the current block.
- BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end());
- if (PI != PE) {
- const BasicBlock *Pred = *PI;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
- assert(I != BBStates.end());
- MyStates.InitFromPred(I->second);
- ++PI;
- for (; PI != PE; ++PI) {
- Pred = *PI;
- I = BBStates.find(Pred);
- assert(I != BBStates.end());
- MyStates.MergePred(I->second);
- }
- }
-
- // Check that BB and MyStates have the same number of predecessors. This
- // prevents retain calls that live outside a loop from being moved into the
- // loop.
- if (!BB->hasNPredecessors(MyStates.pred_end() - MyStates.pred_begin()))
- for (auto I = MyStates.top_down_ptr_begin(),
- E = MyStates.top_down_ptr_end();
- I != E; ++I)
- I->second.SetCFGHazardAfflicted(true);
-
- LLVM_DEBUG(dbgs() << "Before:\n"
- << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
-
- // Visit all the instructions, top-down.
- for (Instruction &Inst : *BB) {
- LLVM_DEBUG(dbgs() << " Visiting " << Inst << "\n");
-
- NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);
-
- // Bail out if the number of pointers being tracked becomes too large so
- // that this pass can complete in a reasonable amount of time.
- if (MyStates.top_down_ptr_list_size() > MaxPtrStates) {
- DisableRetainReleasePairing = true;
- return false;
- }
- }
-
- LLVM_DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
- << BBStates[BB] << "\n\n");
- CheckForCFGHazards(BB, BBStates, MyStates);
- LLVM_DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
- return NestingDetected;
-}
-
-static void
-ComputePostOrders(Function &F,
- SmallVectorImpl<BasicBlock *> &PostOrder,
- SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
- unsigned NoObjCARCExceptionsMDKind,
- DenseMap<const BasicBlock *, BBState> &BBStates) {
- /// The visited set, for doing DFS walks.
- SmallPtrSet<BasicBlock *, 16> Visited;
-
- // Do DFS, computing the PostOrder.
- SmallPtrSet<BasicBlock *, 16> OnStack;
- SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
-
- // Functions always have exactly one entry block, and we don't have
- // any other block that we treat like an entry block.
- BasicBlock *EntryBB = &F.getEntryBlock();
- BBState &MyStates = BBStates[EntryBB];
- MyStates.SetAsEntry();
- Instruction *EntryTI = EntryBB->getTerminator();
- SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
- Visited.insert(EntryBB);
- OnStack.insert(EntryBB);
- do {
- dfs_next_succ:
- BasicBlock *CurrBB = SuccStack.back().first;
- succ_iterator SE(CurrBB->getTerminator(), false);
-
- while (SuccStack.back().second != SE) {
- BasicBlock *SuccBB = *SuccStack.back().second++;
- if (Visited.insert(SuccBB).second) {
- SuccStack.push_back(
- std::make_pair(SuccBB, succ_iterator(SuccBB->getTerminator())));
- BBStates[CurrBB].addSucc(SuccBB);
- BBState &SuccStates = BBStates[SuccBB];
- SuccStates.addPred(CurrBB);
- OnStack.insert(SuccBB);
- goto dfs_next_succ;
- }
-
- if (!OnStack.count(SuccBB)) {
- BBStates[CurrBB].addSucc(SuccBB);
- BBStates[SuccBB].addPred(CurrBB);
- }
- }
- OnStack.erase(CurrBB);
- PostOrder.push_back(CurrBB);
- SuccStack.pop_back();
- } while (!SuccStack.empty());
-
- Visited.clear();
-
- // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
- // Functions may have many exits, and there are also blocks which we treat
- // as exits due to ignored edges.
- SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
- for (BasicBlock &ExitBB : F) {
- BBState &MyStates = BBStates[&ExitBB];
- if (!MyStates.isExit())
- continue;
-
- MyStates.SetAsExit();
-
- PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin()));
- Visited.insert(&ExitBB);
- while (!PredStack.empty()) {
- reverse_dfs_next_succ:
- BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
- while (PredStack.back().second != PE) {
- BasicBlock *BB = *PredStack.back().second++;
- if (Visited.insert(BB).second) {
- PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
- goto reverse_dfs_next_succ;
- }
- }
- ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
- }
- }
-}
-
-// Visit the function both top-down and bottom-up.
-bool ObjCARCOpt::Visit(Function &F,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases) {
- // Use reverse-postorder traversals, because we magically know that loops
- // will be well behaved, i.e. they won't repeatedly call retain on a single
- // pointer without doing a release. We can't use the ReversePostOrderTraversal
- // class here because we want the reverse-CFG postorder to consider each
- // function exit point, and we want to ignore selected cycle edges.
- SmallVector<BasicBlock *, 16> PostOrder;
- SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
- ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
- MDKindCache.get(ARCMDKindID::NoObjCARCExceptions),
- BBStates);
-
- // Use reverse-postorder on the reverse CFG for bottom-up.
- bool BottomUpNestingDetected = false;
- for (BasicBlock *BB : llvm::reverse(ReverseCFGPostOrder)) {
- BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
- if (DisableRetainReleasePairing)
- return false;
- }
-
- // Use reverse-postorder for top-down.
- bool TopDownNestingDetected = false;
- for (BasicBlock *BB : llvm::reverse(PostOrder)) {
- TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases);
- if (DisableRetainReleasePairing)
- return false;
- }
-
- return TopDownNestingDetected && BottomUpNestingDetected;
-}
-
-/// Move the calls in RetainsToMove and ReleasesToMove.
-void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
- RRInfo &ReleasesToMove,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- SmallVectorImpl<Instruction *> &DeadInsts,
- Module *M) {
- Type *ArgTy = Arg->getType();
- Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
-
- LLVM_DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
-
- // Insert the new retain and release calls.
- for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
- Value *MyArg = ArgTy == ParamTy ? Arg :
- new BitCastInst(Arg, ParamTy, "", InsertPt);
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
- CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
- Call->setDoesNotThrow();
- Call->setTailCall();
-
- LLVM_DEBUG(dbgs() << "Inserting new Retain: " << *Call
- << "\n"
- "At insertion point: "
- << *InsertPt << "\n");
- }
- for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
- Value *MyArg = ArgTy == ParamTy ? Arg :
- new BitCastInst(Arg, ParamTy, "", InsertPt);
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
- CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
- // Attach a clang.imprecise_release metadata tag, if appropriate.
- if (MDNode *M = ReleasesToMove.ReleaseMetadata)
- Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M);
- Call->setDoesNotThrow();
- if (ReleasesToMove.IsTailCallRelease)
- Call->setTailCall();
-
- LLVM_DEBUG(dbgs() << "Inserting new Release: " << *Call
- << "\n"
- "At insertion point: "
- << *InsertPt << "\n");
- }
-
- // Delete the original retain and release calls.
- for (Instruction *OrigRetain : RetainsToMove.Calls) {
- Retains.blot(OrigRetain);
- DeadInsts.push_back(OrigRetain);
- LLVM_DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
- }
- for (Instruction *OrigRelease : ReleasesToMove.Calls) {
- Releases.erase(OrigRelease);
- DeadInsts.push_back(OrigRelease);
- LLVM_DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
- }
-}
-
-bool ObjCARCOpt::PairUpRetainsAndReleases(
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases, Module *M,
- Instruction *Retain,
- SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
- RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
- bool &AnyPairsCompletelyEliminated) {
- // If a pair happens in a region where it is known that the reference count
- // is already incremented, we can similarly ignore possible decrements unless
- // we are dealing with a retainable object with multiple provenance sources.
- bool KnownSafeTD = true, KnownSafeBU = true;
- bool CFGHazardAfflicted = false;
-
- // Connect the dots between the top-down-collected RetainsToMove and
- // bottom-up-collected ReleasesToMove to form sets of related calls.
- // This is an iterative process so that we connect multiple releases
- // to multiple retains if needed.
- unsigned OldDelta = 0;
- unsigned NewDelta = 0;
- unsigned OldCount = 0;
- unsigned NewCount = 0;
- bool FirstRelease = true;
- for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
- SmallVector<Instruction *, 4> NewReleases;
- for (Instruction *NewRetain : NewRetains) {
- auto It = Retains.find(NewRetain);
- assert(It != Retains.end());
- const RRInfo &NewRetainRRI = It->second;
- KnownSafeTD &= NewRetainRRI.KnownSafe;
- CFGHazardAfflicted |= NewRetainRRI.CFGHazardAfflicted;
- for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
- auto Jt = Releases.find(NewRetainRelease);
- if (Jt == Releases.end())
- return false;
- const RRInfo &NewRetainReleaseRRI = Jt->second;
-
- // If the release does not have a reference to the retain as well,
- // something happened which is unaccounted for. Do not do anything.
- //
- // This can happen if we catch an additive overflow during path count
- // merging.
- if (!NewRetainReleaseRRI.Calls.count(NewRetain))
- return false;
-
- if (ReleasesToMove.Calls.insert(NewRetainRelease).second) {
- // If we overflow when we compute the path count, don't remove/move
- // anything.
- const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
- unsigned PathCount = BBState::OverflowOccurredValue;
- if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- OldDelta -= PathCount;
-
- // Merge the ReleaseMetadata and IsTailCallRelease values.
- if (FirstRelease) {
- ReleasesToMove.ReleaseMetadata =
- NewRetainReleaseRRI.ReleaseMetadata;
- ReleasesToMove.IsTailCallRelease =
- NewRetainReleaseRRI.IsTailCallRelease;
- FirstRelease = false;
- } else {
- if (ReleasesToMove.ReleaseMetadata !=
- NewRetainReleaseRRI.ReleaseMetadata)
- ReleasesToMove.ReleaseMetadata = nullptr;
- if (ReleasesToMove.IsTailCallRelease !=
- NewRetainReleaseRRI.IsTailCallRelease)
- ReleasesToMove.IsTailCallRelease = false;
- }
-
- // Collect the optimal insertion points.
- if (!KnownSafe)
- for (Instruction *RIP : NewRetainReleaseRRI.ReverseInsertPts) {
- if (ReleasesToMove.ReverseInsertPts.insert(RIP).second) {
- // If we overflow when we compute the path count, don't
- // remove/move anything.
- const BBState &RIPBBState = BBStates[RIP->getParent()];
- PathCount = BBState::OverflowOccurredValue;
- if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- NewDelta -= PathCount;
- }
- }
- NewReleases.push_back(NewRetainRelease);
- }
- }
- }
- NewRetains.clear();
- if (NewReleases.empty()) break;
-
- // Back the other way.
- for (Instruction *NewRelease : NewReleases) {
- auto It = Releases.find(NewRelease);
- assert(It != Releases.end());
- const RRInfo &NewReleaseRRI = It->second;
- KnownSafeBU &= NewReleaseRRI.KnownSafe;
- CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
- for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) {
- auto Jt = Retains.find(NewReleaseRetain);
- if (Jt == Retains.end())
- return false;
- const RRInfo &NewReleaseRetainRRI = Jt->second;
-
- // If the retain does not have a reference to the release as well,
- // something happened which is unaccounted for. Do not do anything.
- //
- // This can happen if we catch an additive overflow during path count
- // merging.
- if (!NewReleaseRetainRRI.Calls.count(NewRelease))
- return false;
-
- if (RetainsToMove.Calls.insert(NewReleaseRetain).second) {
- // If we overflow when we compute the path count, don't remove/move
- // anything.
- const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
- unsigned PathCount = BBState::OverflowOccurredValue;
- if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- OldDelta += PathCount;
- OldCount += PathCount;
-
- // Collect the optimal insertion points.
- if (!KnownSafe)
- for (Instruction *RIP : NewReleaseRetainRRI.ReverseInsertPts) {
- if (RetainsToMove.ReverseInsertPts.insert(RIP).second) {
- // If we overflow when we compute the path count, don't
- // remove/move anything.
- const BBState &RIPBBState = BBStates[RIP->getParent()];
-
- PathCount = BBState::OverflowOccurredValue;
- if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- NewDelta += PathCount;
- NewCount += PathCount;
- }
- }
- NewRetains.push_back(NewReleaseRetain);
- }
- }
- }
- if (NewRetains.empty()) break;
- }
-
- // We can only remove pointers if we are known safe in both directions.
- bool UnconditionallySafe = KnownSafeTD && KnownSafeBU;
- if (UnconditionallySafe) {
- RetainsToMove.ReverseInsertPts.clear();
- ReleasesToMove.ReverseInsertPts.clear();
- NewCount = 0;
- } else {
- // Determine whether the new insertion points we computed preserve the
- // balance of retain and release calls through the program.
- // TODO: If the fully aggressive solution isn't valid, try to find a
- // less aggressive solution which is.
- if (NewDelta != 0)
- return false;
-
- // At this point, we are not going to remove any RR pairs, but we still are
- // able to move RR pairs. If one of our pointers is afflicted with
- // CFGHazards, we cannot perform such code motion so exit early.
- const bool WillPerformCodeMotion =
- !RetainsToMove.ReverseInsertPts.empty() ||
- !ReleasesToMove.ReverseInsertPts.empty();
- if (CFGHazardAfflicted && WillPerformCodeMotion)
- return false;
- }
-
- // Determine whether the original call points are balanced in the retain and
- // release calls through the program. If not, conservatively don't touch
- // them.
- // TODO: It's theoretically possible to do code motion in this case, as
- // long as the existing imbalances are maintained.
- if (OldDelta != 0)
- return false;
-
- Changed = true;
- assert(OldCount != 0 && "Unreachable code?");
- NumRRs += OldCount - NewCount;
- // Set to true if we completely removed any RR pairs.
- AnyPairsCompletelyEliminated = NewCount == 0;
-
- // We can move calls!
- return true;
-}
-
-/// Identify pairings between the retains and releases, and delete and/or move
-/// them.
-bool ObjCARCOpt::PerformCodePlacement(
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases, Module *M) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
-
- bool AnyPairsCompletelyEliminated = false;
- SmallVector<Instruction *, 8> DeadInsts;
-
- // Visit each retain.
- for (BlotMapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
- E = Retains.end();
- I != E; ++I) {
- Value *V = I->first;
- if (!V) continue; // blotted
-
- Instruction *Retain = cast<Instruction>(V);
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
-
- Value *Arg = GetArgRCIdentityRoot(Retain);
-
- // If the object being released is in static or stack storage, we know it's
- // not being managed by ObjC reference counting, so we can delete pairs
- // regardless of what possible decrements or uses lie between them.
- bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
-
- // A constant pointer can't be pointing to an object on the heap. It may
- // be reference-counted, but it won't be deleted.
- if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
- if (const GlobalVariable *GV =
- dyn_cast<GlobalVariable>(
- GetRCIdentityRoot(LI->getPointerOperand())))
- if (GV->isConstant())
- KnownSafe = true;
-
- // Connect the dots between the top-down-collected RetainsToMove and
- // bottom-up-collected ReleasesToMove to form sets of related calls.
- RRInfo RetainsToMove, ReleasesToMove;
-
- bool PerformMoveCalls = PairUpRetainsAndReleases(
- BBStates, Retains, Releases, M, Retain, DeadInsts,
- RetainsToMove, ReleasesToMove, Arg, KnownSafe,
- AnyPairsCompletelyEliminated);
-
- if (PerformMoveCalls) {
- // Ok, everything checks out and we're all set. Let's move/delete some
- // code!
- MoveCalls(Arg, RetainsToMove, ReleasesToMove,
- Retains, Releases, DeadInsts, M);
- }
- }
-
- // Now that we're done moving everything, we can delete the newly dead
- // instructions, as we no longer need them as insert points.
- while (!DeadInsts.empty())
- EraseInstruction(DeadInsts.pop_back_val());
-
- return AnyPairsCompletelyEliminated;
-}
-
-/// Weak pointer optimizations.
-void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
-
- // First, do memdep-style RLE and S2L optimizations. We can't use memdep
- // itself because it uses AliasAnalysis and we need to do provenance
- // queries instead.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
-
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- if (Class != ARCInstKind::LoadWeak &&
- Class != ARCInstKind::LoadWeakRetained)
- continue;
-
- // Delete objc_loadWeak calls with no users.
- if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) {
- Inst->eraseFromParent();
- Changed = true;
- continue;
- }
-
- // TODO: For now, just look for an earlier available version of this value
- // within the same block. Theoretically, we could do memdep-style non-local
- // analysis too, but that would want caching. A better approach would be to
- // use the technique that EarlyCSE uses.
- inst_iterator Current = std::prev(I);
- BasicBlock *CurrentBB = &*Current.getBasicBlockIterator();
- for (BasicBlock::iterator B = CurrentBB->begin(),
- J = Current.getInstructionIterator();
- J != B; --J) {
- Instruction *EarlierInst = &*std::prev(J);
- ARCInstKind EarlierClass = GetARCInstKind(EarlierInst);
- switch (EarlierClass) {
- case ARCInstKind::LoadWeak:
- case ARCInstKind::LoadWeakRetained: {
- // If this is loading from the same pointer, replace this load's value
- // with that one.
- CallInst *Call = cast<CallInst>(Inst);
- CallInst *EarlierCall = cast<CallInst>(EarlierInst);
- Value *Arg = Call->getArgOperand(0);
- Value *EarlierArg = EarlierCall->getArgOperand(0);
- switch (PA.getAA()->alias(Arg, EarlierArg)) {
- case MustAlias:
- Changed = true;
- // If the load has a builtin retain, insert a plain retain for it.
- if (Class == ARCInstKind::LoadWeakRetained) {
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
- CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
- CI->setTailCall();
- }
- // Zap the fully redundant load.
- Call->replaceAllUsesWith(EarlierCall);
- Call->eraseFromParent();
- goto clobbered;
- case MayAlias:
- case PartialAlias:
- goto clobbered;
- case NoAlias:
- break;
- }
- break;
- }
- case ARCInstKind::StoreWeak:
- case ARCInstKind::InitWeak: {
- // If this is storing to the same pointer and has the same size etc.,
- // replace this load's value with the stored value.
- CallInst *Call = cast<CallInst>(Inst);
- CallInst *EarlierCall = cast<CallInst>(EarlierInst);
- Value *Arg = Call->getArgOperand(0);
- Value *EarlierArg = EarlierCall->getArgOperand(0);
- switch (PA.getAA()->alias(Arg, EarlierArg)) {
- case MustAlias:
- Changed = true;
- // If the load has a builtin retain, insert a plain retain for it.
- if (Class == ARCInstKind::LoadWeakRetained) {
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
- CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
- CI->setTailCall();
- }
- // Zap the fully redundant load.
- Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
- Call->eraseFromParent();
- goto clobbered;
- case MayAlias:
- case PartialAlias:
- goto clobbered;
- case NoAlias:
- break;
- }
- break;
- }
- case ARCInstKind::MoveWeak:
- case ARCInstKind::CopyWeak:
- // TODO: Grab the copied value.
- goto clobbered;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- case ARCInstKind::IntrinsicUser:
- case ARCInstKind::User:
- // Weak pointers are only modified through the weak entry points
- // (and arbitrary calls, which could call the weak entry points).
- break;
- default:
- // Anything else could modify the weak pointer.
- goto clobbered;
- }
- }
- clobbered:;
- }
-
- // Then, for each destroyWeak with an alloca operand, check to see if
- // the alloca and all its users can be zapped.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- if (Class != ARCInstKind::DestroyWeak)
- continue;
-
- CallInst *Call = cast<CallInst>(Inst);
- Value *Arg = Call->getArgOperand(0);
- if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
- for (User *U : Alloca->users()) {
- const Instruction *UserInst = cast<Instruction>(U);
- switch (GetBasicARCInstKind(UserInst)) {
- case ARCInstKind::InitWeak:
- case ARCInstKind::StoreWeak:
- case ARCInstKind::DestroyWeak:
- continue;
- default:
- goto done;
- }
- }
- Changed = true;
- for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
- CallInst *UserInst = cast<CallInst>(*UI++);
- switch (GetBasicARCInstKind(UserInst)) {
- case ARCInstKind::InitWeak:
- case ARCInstKind::StoreWeak:
- // These functions return their second argument.
- UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
- break;
- case ARCInstKind::DestroyWeak:
- // No return value.
- break;
- default:
- llvm_unreachable("alloca really is used!");
- }
- UserInst->eraseFromParent();
- }
- Alloca->eraseFromParent();
- done:;
- }
- }
-}
-
-/// Identify program paths which execute sequences of retains and releases which
-/// can be eliminated.
-bool ObjCARCOpt::OptimizeSequences(Function &F) {
- // Releases, Retains - These are used to store the results of the main flow
- // analysis. These use Value* as the key instead of Instruction* so that the
- // map stays valid when we get around to rewriting code and calls get
- // replaced by arguments.
- DenseMap<Value *, RRInfo> Releases;
- BlotMapVector<Value *, RRInfo> Retains;
-
- // This is used during the traversal of the function to track the
- // states for each identified object at each block.
- DenseMap<const BasicBlock *, BBState> BBStates;
-
- // Analyze the CFG of the function, and all instructions.
- bool NestingDetected = Visit(F, BBStates, Retains, Releases);
-
- if (DisableRetainReleasePairing)
- return false;
-
- // Transform.
- bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains,
- Releases,
- F.getParent());
-
- return AnyPairsCompletelyEliminated && NestingDetected;
-}
-
-/// Check if there is a dependent call earlier that does not have anything in
-/// between the Retain and the call that can affect the reference count of their
-/// shared pointer argument. Note that Retain need not be in BB.
+ continue;
+
+ Changed = true;
+ ++NumPartialNoops;
+ // Clone the call into each predecessor that has a non-null value.
+ CallInst *CInst = cast<CallInst>(Inst);
+ Type *ParamTy = CInst->getArgOperand(0)->getType();
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
+ if (IsNullOrUndef(Incoming))
+ continue;
+ Value *Op = PN->getIncomingValue(i);
+ Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+ CallInst *Clone = cast<CallInst>(
+ CloneCallInstForBB(*CInst, *InsertPos->getParent(), BlockColors));
+ if (Op->getType() != ParamTy)
+ Op = new BitCastInst(Op, ParamTy, "", InsertPos);
+ Clone->setArgOperand(0, Op);
+ Clone->insertBefore(InsertPos);
+
+ LLVM_DEBUG(dbgs() << "Cloning " << *CInst << "\n"
+ "And inserting clone at "
+ << *InsertPos << "\n");
+ Worklist.push_back(std::make_pair(Clone, Incoming));
+ }
+ // Erase the original call.
+ LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
+ EraseInstruction(CInst);
+ } while (!Worklist.empty());
+}
+
+/// If we have a top down pointer in the S_Use state, make sure that there are
+/// no CFG hazards by checking the states of various bottom up pointers.
+static void CheckForUseCFGHazard(const Sequence SuccSSeq,
+ const bool SuccSRRIKnownSafe,
+ TopDownPtrState &S,
+ bool &SomeSuccHasSame,
+ bool &AllSuccsHaveSame,
+ bool &NotAllSeqEqualButKnownSafe,
+ bool &ShouldContinue) {
+ switch (SuccSSeq) {
+ case S_CanRelease: {
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) {
+ S.ClearSequenceProgress();
+ break;
+ }
+ S.SetCFGHazardAfflicted(true);
+ ShouldContinue = true;
+ break;
+ }
+ case S_Use:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ else
+ NotAllSeqEqualButKnownSafe = true;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ case S_None:
+ llvm_unreachable("This should have been handled earlier.");
+ }
+}
+
+/// If we have a Top Down pointer in the S_CanRelease state, make sure that
+/// there are no CFG hazards by checking the states of various bottom up
+/// pointers.
+static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
+ const bool SuccSRRIKnownSafe,
+ TopDownPtrState &S,
+ bool &SomeSuccHasSame,
+ bool &AllSuccsHaveSame,
+ bool &NotAllSeqEqualButKnownSafe) {
+ switch (SuccSSeq) {
+ case S_CanRelease:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ else
+ NotAllSeqEqualButKnownSafe = true;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ case S_None:
+ llvm_unreachable("This should have been handled earlier.");
+ }
+}
+
+/// Check for critical edges, loop boundaries, irreducible control flow, or
+/// other CFG structures where moving code across the edge would result in it
+/// being executed more often than intended.
+void
+ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BBState &MyStates) const {
+ // If any top-down local-use or possible-dec has a succ which is earlier in
+ // the sequence, forget it.
+ for (auto I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end();
+ I != E; ++I) {
+ TopDownPtrState &S = I->second;
+ const Sequence Seq = I->second.GetSeq();
+
+ // We only care about S_Retain, S_CanRelease, and S_Use.
+ if (Seq == S_None)
+ continue;
+
+ // Make sure that if extra top-down states are added in the future, this
+ // code is updated to handle them.
+ assert((Seq == S_Retain || Seq == S_CanRelease || Seq == S_Use) &&
+ "Unknown top down sequence state.");
+
+ const Value *Arg = I->first;
+ bool SomeSuccHasSame = false;
+ bool AllSuccsHaveSame = true;
+ bool NotAllSeqEqualButKnownSafe = false;
+
+ for (const BasicBlock *Succ : successors(BB)) {
+ // If VisitBottomUp has pointer information for this successor, take
+ // what we know about it.
+ const DenseMap<const BasicBlock *, BBState>::iterator BBI =
+ BBStates.find(Succ);
+ assert(BBI != BBStates.end());
+ const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ const Sequence SuccSSeq = SuccS.GetSeq();
+
+ // If, bottom-up, the pointer is in the S_None state, clear the sequence
+ // progress, since the bottom-up sequence has finished, suggesting a
+ // mismatch between retains and releases. This is true for all three
+ // cases that we are handling here: S_Retain, S_Use, and S_CanRelease.
+ if (SuccSSeq == S_None) {
+ S.ClearSequenceProgress();
+ continue;
+ }
+
+ // If we have S_Use or S_CanRelease, perform our CFG hazard checks.
+ const bool SuccSRRIKnownSafe = SuccS.IsKnownSafe();
+
+ // *NOTE* We do not use Seq from above here since we are allowing for
+ // S.GetSeq() to change while we are visiting basic blocks.
+ switch(S.GetSeq()) {
+ case S_Use: {
+ bool ShouldContinue = false;
+ CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame,
+ AllSuccsHaveSame, NotAllSeqEqualButKnownSafe,
+ ShouldContinue);
+ if (ShouldContinue)
+ continue;
+ break;
+ }
+ case S_CanRelease:
+ CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
+ SomeSuccHasSame, AllSuccsHaveSame,
+ NotAllSeqEqualButKnownSafe);
+ break;
+ case S_Retain:
+ case S_None:
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ break;
+ }
+ }
+
+ // If the state at the other end of any of the successor edges
+ // matches the current state, require all edges to match. This
+ // guards against loops in the middle of a sequence.
+ if (SomeSuccHasSame && !AllSuccsHaveSame) {
+ S.ClearSequenceProgress();
+ } else if (NotAllSeqEqualButKnownSafe) {
+ // If we would have cleared the state were it not for the fact that we are
+ // known safe, stop code motion. This is because whether or not it is safe
+ // to remove RR pairs via KnownSafe is orthogonal to whether we are allowed
+ // to perform code motion.
+ S.SetCFGHazardAfflicted(true);
+ }
+ }
+}
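As a rough standalone illustration of the final agreement check above (not part of the pass, and deliberately ignoring the per-state handling in CheckForUseCFGHazard and CheckForCanReleaseCFGHazard), the decision reduces to: clear the sequence when only some successor edges agree, and mark the pointer CFG-hazard-afflicted when disagreement is tolerated solely because the pointer is known safe. The enum names and values below are toy stand-ins, not the pass's types.

#include <cstdio>
#include <vector>

enum class Seq { None, Use, CanRelease };
enum class Decision { Keep, ClearSequence, MarkCFGHazard };

// Toy model: Current is the top-down state, SuccSeqs the state seen along
// each successor edge, KnownSafe whether mismatches may be tolerated.
Decision resolveSuccessors(Seq Current, const std::vector<Seq> &SuccSeqs,
                           bool KnownSafe) {
  bool SomeSuccHasSame = false, AllSuccsHaveSame = true;
  bool NotAllSeqEqualButKnownSafe = false;
  for (Seq S : SuccSeqs) {
    if (S == Seq::None)
      return Decision::ClearSequence; // Bottom-up sequence already finished.
    if (S == Current)
      SomeSuccHasSame = true;
    else if (KnownSafe)
      NotAllSeqEqualButKnownSafe = true;
    else
      AllSuccsHaveSame = false;
  }
  if (SomeSuccHasSame && !AllSuccsHaveSame)
    return Decision::ClearSequence;  // Partial agreement: forget the sequence.
  if (NotAllSeqEqualButKnownSafe)
    return Decision::MarkCFGHazard;  // Keep the pair, but block code motion.
  return Decision::Keep;
}

int main() {
  Decision D = resolveSuccessors(Seq::Use, {Seq::Use, Seq::CanRelease}, false);
  std::printf("%d\n", static_cast<int>(D)); // 1 == ClearSequence
}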
+
+bool ObjCARCOpt::VisitInstructionBottomUp(
+ Instruction *Inst, BasicBlock *BB, BlotMapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ ARCInstKind Class = GetARCInstKind(Inst);
+ const Value *Arg = nullptr;
+
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
+
+ switch (Class) {
+ case ARCInstKind::Release: {
+ Arg = GetArgRCIdentityRoot(Inst);
+
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ NestingDetected |= S.InitBottomUp(MDKindCache, Inst);
+ break;
+ }
+ case ARCInstKind::RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable.
+ break;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ if (S.MatchWithRetain()) {
+ // Don't do retain+release tracking for ARCInstKind::RetainRV, because
+ // it's better to let it remain as the first instruction after a call.
+ if (Class != ARCInstKind::RetainRV) {
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ Retains[Inst] = S.GetRRInfo();
+ }
+ S.ClearSequenceProgress();
+ }
+ // A retain moving bottom up can be a use.
+ break;
+ }
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearBottomUpPointers();
+ return NestingDetected;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ // These are irrelevant.
+ return NestingDetected;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (auto MI = MyStates.bottom_up_ptr_begin(),
+ ME = MyStates.bottom_up_ptr_end();
+ MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ BottomUpPtrState &S = MI->second;
+
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
+
+ S.HandlePotentialUse(BB, Inst, Ptr, PA, Class);
+ }
+
+ return NestingDetected;
+}
+
+bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
+
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each successor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end());
+ if (SI != SE) {
+ const BasicBlock *Succ = *SI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.InitFromSucc(I->second);
+ ++SI;
+ for (; SI != SE; ++SI) {
+ Succ = *SI;
+ I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.MergeSucc(I->second);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
+
+ // Visit all the instructions, bottom-up.
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+ Instruction *Inst = &*std::prev(I);
+
+ // Invoke instructions are visited as part of their successors (below).
+ if (isa<InvokeInst>(Inst))
+ continue;
+
+ LLVM_DEBUG(dbgs() << " Visiting " << *Inst << "\n");
+
+ NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+
+ // Bail out if the number of pointers being tracked becomes too large so
+ // that this pass can complete in a reasonable amount of time.
+ if (MyStates.bottom_up_ptr_list_size() > MaxPtrStates) {
+ DisableRetainReleasePairing = true;
+ return false;
+ }
+ }
+
+ // If there's a predecessor with an invoke, visit the invoke as if it were
+ // part of this block, since we can't insert code after an invoke in its own
+ // block, and we don't want to split critical edges.
+ for (BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end()); PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+ NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+ }
+
+ LLVM_DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ ARCInstKind Class = GetARCInstKind(Inst);
+ const Value *Arg = nullptr;
+
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
+
+ switch (Class) {
+ case ARCInstKind::RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable. We need to break since
+ // a retain can be a potential use.
+ break;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ NestingDetected |= S.InitTopDown(Class, Inst);
+ // A retain can be a potential use; proceed to the generic checking
+ // code below.
+ break;
+ }
+ case ARCInstKind::Release: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ // Try to form a tentative pair between this release instruction and the
+ // top-down pointers that we are tracking.
+ if (S.MatchWithRelease(MDKindCache, Inst)) {
+ // If we succeed, copy S's RRInfo into the Release -> {Retain Set
+ // Map}. Then we clear S.
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ Releases[Inst] = S.GetRRInfo();
+ S.ClearSequenceProgress();
+ }
+ break;
+ }
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearTopDownPointers();
+ return false;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ // These cannot be uses of the pointers we are tracking.
+ return false;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (auto MI = MyStates.top_down_ptr_begin(),
+ ME = MyStates.top_down_ptr_end();
+ MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ TopDownPtrState &S = MI->second;
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
+
+ S.HandlePotentialUse(Inst, Ptr, PA, Class);
+ }
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitTopDown(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ DenseMap<Value *, RRInfo> &Releases) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each predecessor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end());
+ if (PI != PE) {
+ const BasicBlock *Pred = *PI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.InitFromPred(I->second);
+ ++PI;
+ for (; PI != PE; ++PI) {
+ Pred = *PI;
+ I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.MergePred(I->second);
+ }
+ }
+
+ // Check that BB and MyStates have the same number of predecessors. This
+ // prevents retain calls that live outside a loop from being moved into the
+ // loop.
+ if (!BB->hasNPredecessors(MyStates.pred_end() - MyStates.pred_begin()))
+ for (auto I = MyStates.top_down_ptr_begin(),
+ E = MyStates.top_down_ptr_end();
+ I != E; ++I)
+ I->second.SetCFGHazardAfflicted(true);
+
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
+
+ // Visit all the instructions, top-down.
+ for (Instruction &Inst : *BB) {
+ LLVM_DEBUG(dbgs() << " Visiting " << Inst << "\n");
+
+ NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);
+
+ // Bail out if the number of pointers being tracked becomes too large so
+ // that this pass can complete in a reasonable amount of time.
+ if (MyStates.top_down_ptr_list_size() > MaxPtrStates) {
+ DisableRetainReleasePairing = true;
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
+ << BBStates[BB] << "\n\n");
+ CheckForCFGHazards(BB, BBStates, MyStates);
+ LLVM_DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
+ return NestingDetected;
+}
+
+static void
+ComputePostOrders(Function &F,
+ SmallVectorImpl<BasicBlock *> &PostOrder,
+ SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
+ unsigned NoObjCARCExceptionsMDKind,
+ DenseMap<const BasicBlock *, BBState> &BBStates) {
+ /// The visited set, for doing DFS walks.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+
+ // Do DFS, computing the PostOrder.
+ SmallPtrSet<BasicBlock *, 16> OnStack;
+ SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
+
+ // Functions always have exactly one entry block, and we don't have
+ // any other block that we treat like an entry block.
+ BasicBlock *EntryBB = &F.getEntryBlock();
+ BBState &MyStates = BBStates[EntryBB];
+ MyStates.SetAsEntry();
+ Instruction *EntryTI = EntryBB->getTerminator();
+ SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
+ Visited.insert(EntryBB);
+ OnStack.insert(EntryBB);
+ do {
+ dfs_next_succ:
+ BasicBlock *CurrBB = SuccStack.back().first;
+ succ_iterator SE(CurrBB->getTerminator(), false);
+
+ while (SuccStack.back().second != SE) {
+ BasicBlock *SuccBB = *SuccStack.back().second++;
+ if (Visited.insert(SuccBB).second) {
+ SuccStack.push_back(
+ std::make_pair(SuccBB, succ_iterator(SuccBB->getTerminator())));
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBState &SuccStates = BBStates[SuccBB];
+ SuccStates.addPred(CurrBB);
+ OnStack.insert(SuccBB);
+ goto dfs_next_succ;
+ }
+
+ if (!OnStack.count(SuccBB)) {
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBStates[SuccBB].addPred(CurrBB);
+ }
+ }
+ OnStack.erase(CurrBB);
+ PostOrder.push_back(CurrBB);
+ SuccStack.pop_back();
+ } while (!SuccStack.empty());
+
+ Visited.clear();
+
+ // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+ // Functions may have many exits, and there are also blocks which we treat
+ // as exits due to ignored edges.
+ SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
+ for (BasicBlock &ExitBB : F) {
+ BBState &MyStates = BBStates[&ExitBB];
+ if (!MyStates.isExit())
+ continue;
+
+ MyStates.SetAsExit();
+
+ PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin()));
+ Visited.insert(&ExitBB);
+ while (!PredStack.empty()) {
+ reverse_dfs_next_succ:
+ BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+ while (PredStack.back().second != PE) {
+ BasicBlock *BB = *PredStack.back().second++;
+ if (Visited.insert(BB).second) {
+ PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
+ goto reverse_dfs_next_succ;
+ }
+ }
+ ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
+ }
+ }
+}
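For reference, a minimal standalone sketch (not part of the pass, using a toy adjacency-list graph rather than LLVM basic blocks) of the same explicit-stack, non-recursive DFS shape used above: each stack entry remembers which successor to try next, and a block is emitted once all its successors are done. Reversing the resulting post-order gives a reverse post-order suitable for the top-down walk.

#include <cstdio>
#include <cstddef>
#include <utility>
#include <vector>

using Graph = std::vector<std::vector<int>>; // Succ[v] = successors of v.

std::vector<int> postOrder(const Graph &Succ, int Entry) {
  std::vector<bool> Visited(Succ.size(), false);
  std::vector<std::pair<int, std::size_t>> Stack; // (block, next succ index)
  std::vector<int> Order;
  Stack.push_back({Entry, 0});
  Visited[Entry] = true;
  while (!Stack.empty()) {
    auto &[BB, NextSucc] = Stack.back();
    if (NextSucc < Succ[BB].size()) {
      int S = Succ[BB][NextSucc++];
      if (!Visited[S]) {
        Visited[S] = true;
        Stack.push_back({S, 0}); // Descend without recursion.
      }
    } else {
      Order.push_back(BB); // All successors done: emit in post-order.
      Stack.pop_back();
    }
  }
  return Order;
}

int main() {
  // A small diamond CFG: 0 -> {1,2}, 1 -> 3, 2 -> 3.
  Graph Succ = {{1, 2}, {3}, {3}, {}};
  for (int BB : postOrder(Succ, 0))
    std::printf("%d ", BB); // Prints "3 1 2 0"; reversing gives an RPO.
  std::printf("\n");
}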
+
+// Visit the function both top-down and bottom-up.
+bool ObjCARCOpt::Visit(Function &F,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases) {
+ // Use reverse-postorder traversals, because we magically know that loops
+ // will be well behaved, i.e. they won't repeatedly call retain on a single
+ // pointer without doing a release. We can't use the ReversePostOrderTraversal
+ // class here because we want the reverse-CFG postorder to consider each
+ // function exit point, and we want to ignore selected cycle edges.
+ SmallVector<BasicBlock *, 16> PostOrder;
+ SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
+ ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+ MDKindCache.get(ARCMDKindID::NoObjCARCExceptions),
+ BBStates);
+
+ // Use reverse-postorder on the reverse CFG for bottom-up.
+ bool BottomUpNestingDetected = false;
+ for (BasicBlock *BB : llvm::reverse(ReverseCFGPostOrder)) {
+ BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
+ if (DisableRetainReleasePairing)
+ return false;
+ }
+
+ // Use reverse-postorder for top-down.
+ bool TopDownNestingDetected = false;
+ for (BasicBlock *BB : llvm::reverse(PostOrder)) {
+ TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases);
+ if (DisableRetainReleasePairing)
+ return false;
+ }
+
+ return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ Module *M) {
+ Type *ArgTy = Arg->getType();
+ Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
+
+ LLVM_DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
+
+ // Insert the new retain and release calls.
+ for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
+ Call->setDoesNotThrow();
+ Call->setTailCall();
+
+ LLVM_DEBUG(dbgs() << "Inserting new Retain: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
+ }
+ for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
+ CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
+ // Attach a clang.imprecise_release metadata tag, if appropriate.
+ if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+ Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M);
+ Call->setDoesNotThrow();
+ if (ReleasesToMove.IsTailCallRelease)
+ Call->setTailCall();
+
+ LLVM_DEBUG(dbgs() << "Inserting new Release: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
+ }
+
+ // Delete the original retain and release calls.
+ for (Instruction *OrigRetain : RetainsToMove.Calls) {
+ Retains.blot(OrigRetain);
+ DeadInsts.push_back(OrigRetain);
+ LLVM_DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
+ }
+ for (Instruction *OrigRelease : ReleasesToMove.Calls) {
+ Releases.erase(OrigRelease);
+ DeadInsts.push_back(OrigRelease);
+ LLVM_DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
+ }
+}
+
+bool ObjCARCOpt::PairUpRetainsAndReleases(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M,
+ Instruction *Retain,
+ SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated) {
+ // If a pair happens in a region where it is known that the reference count
+ // is already incremented, we can similarly ignore possible decrements unless
+ // we are dealing with a retainable object with multiple provenance sources.
+ bool KnownSafeTD = true, KnownSafeBU = true;
+ bool CFGHazardAfflicted = false;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ // This is an iterative process so that we connect multiple releases
+ // to multiple retains if needed.
+ unsigned OldDelta = 0;
+ unsigned NewDelta = 0;
+ unsigned OldCount = 0;
+ unsigned NewCount = 0;
+ bool FirstRelease = true;
+ for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
+ SmallVector<Instruction *, 4> NewReleases;
+ for (Instruction *NewRetain : NewRetains) {
+ auto It = Retains.find(NewRetain);
+ assert(It != Retains.end());
+ const RRInfo &NewRetainRRI = It->second;
+ KnownSafeTD &= NewRetainRRI.KnownSafe;
+ CFGHazardAfflicted |= NewRetainRRI.CFGHazardAfflicted;
+ for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
+ auto Jt = Releases.find(NewRetainRelease);
+ if (Jt == Releases.end())
+ return false;
+ const RRInfo &NewRetainReleaseRRI = Jt->second;
+
+ // If the release does not have a reference to the retain as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewRetainReleaseRRI.Calls.count(NewRetain))
+ return false;
+
+ if (ReleasesToMove.Calls.insert(NewRetainRelease).second) {
+ // If we overflow when we compute the path count, don't remove/move
+ // anything.
+ const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
+ unsigned PathCount = BBState::OverflowOccurredValue;
+ if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ OldDelta -= PathCount;
+
+ // Merge the ReleaseMetadata and IsTailCallRelease values.
+ if (FirstRelease) {
+ ReleasesToMove.ReleaseMetadata =
+ NewRetainReleaseRRI.ReleaseMetadata;
+ ReleasesToMove.IsTailCallRelease =
+ NewRetainReleaseRRI.IsTailCallRelease;
+ FirstRelease = false;
+ } else {
+ if (ReleasesToMove.ReleaseMetadata !=
+ NewRetainReleaseRRI.ReleaseMetadata)
+ ReleasesToMove.ReleaseMetadata = nullptr;
+ if (ReleasesToMove.IsTailCallRelease !=
+ NewRetainReleaseRRI.IsTailCallRelease)
+ ReleasesToMove.IsTailCallRelease = false;
+ }
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (Instruction *RIP : NewRetainReleaseRRI.ReverseInsertPts) {
+ if (ReleasesToMove.ReverseInsertPts.insert(RIP).second) {
+ // If we overflow when we compute the path count, don't
+ // remove/move anything.
+ const BBState &RIPBBState = BBStates[RIP->getParent()];
+ PathCount = BBState::OverflowOccurredValue;
+ if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ NewDelta -= PathCount;
+ }
+ }
+ NewReleases.push_back(NewRetainRelease);
+ }
+ }
+ }
+ NewRetains.clear();
+ if (NewReleases.empty()) break;
+
+ // Back the other way.
+ for (Instruction *NewRelease : NewReleases) {
+ auto It = Releases.find(NewRelease);
+ assert(It != Releases.end());
+ const RRInfo &NewReleaseRRI = It->second;
+ KnownSafeBU &= NewReleaseRRI.KnownSafe;
+ CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
+ for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) {
+ auto Jt = Retains.find(NewReleaseRetain);
+ if (Jt == Retains.end())
+ return false;
+ const RRInfo &NewReleaseRetainRRI = Jt->second;
+
+ // If the retain does not have a reference to the release as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewReleaseRetainRRI.Calls.count(NewRelease))
+ return false;
+
+ if (RetainsToMove.Calls.insert(NewReleaseRetain).second) {
+ // If we overflow when we compute the path count, don't remove/move
+ // anything.
+ const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
+ unsigned PathCount = BBState::OverflowOccurredValue;
+ if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ OldDelta += PathCount;
+ OldCount += PathCount;
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (Instruction *RIP : NewReleaseRetainRRI.ReverseInsertPts) {
+ if (RetainsToMove.ReverseInsertPts.insert(RIP).second) {
+ // If we overflow when we compute the path count, don't
+ // remove/move anything.
+ const BBState &RIPBBState = BBStates[RIP->getParent()];
+
+ PathCount = BBState::OverflowOccurredValue;
+ if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ NewDelta += PathCount;
+ NewCount += PathCount;
+ }
+ }
+ NewRetains.push_back(NewReleaseRetain);
+ }
+ }
+ }
+ if (NewRetains.empty()) break;
+ }
+
+ // We can only remove pointers if we are known safe in both directions.
+ bool UnconditionallySafe = KnownSafeTD && KnownSafeBU;
+ if (UnconditionallySafe) {
+ RetainsToMove.ReverseInsertPts.clear();
+ ReleasesToMove.ReverseInsertPts.clear();
+ NewCount = 0;
+ } else {
+ // Determine whether the new insertion points we computed preserve the
+ // balance of retain and release calls through the program.
+ // TODO: If the fully aggressive solution isn't valid, try to find a
+ // less aggressive solution which is.
+ if (NewDelta != 0)
+ return false;
+
+ // At this point, we are not going to remove any RR pairs, but we still are
+ // able to move RR pairs. If one of our pointers is afflicted with
+ // CFGHazards, we cannot perform such code motion so exit early.
+ const bool WillPerformCodeMotion =
+ !RetainsToMove.ReverseInsertPts.empty() ||
+ !ReleasesToMove.ReverseInsertPts.empty();
+ if (CFGHazardAfflicted && WillPerformCodeMotion)
+ return false;
+ }
+
+ // Determine whether the original call points are balanced in the retain and
+ // release calls through the program. If not, conservatively don't touch
+ // them.
+ // TODO: It's theoretically possible to do code motion in this case, as
+ // long as the existing imbalances are maintained.
+ if (OldDelta != 0)
+ return false;
+
+ Changed = true;
+ assert(OldCount != 0 && "Unreachable code?");
+ NumRRs += OldCount - NewCount;
+ // Set to true if we completely removed any RR pairs.
+ AnyPairsCompletelyEliminated = NewCount == 0;
+
+ // We can move calls!
+ return true;
+}
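As a worked illustration of the OldDelta bookkeeping above (not part of the pass; the path counts are made-up inputs rather than values computed by BBState), retains add their block's path count, releases subtract theirs, and a nonzero total means the two sides do not cover the same set of paths, so nothing may be removed or moved.

#include <cstdio>
#include <vector>

struct CallSite {
  bool IsRetain;  // true: objc_retain, false: objc_release.
  int PathCount;  // Number of CFG paths through the call's block (made up).
};

// The pairing is only removable when retains and releases cover exactly the
// same set of paths, i.e. the signed sum of path counts is zero.
bool isBalanced(const std::vector<CallSite> &Sites) {
  int Delta = 0;
  for (const CallSite &CS : Sites)
    Delta += CS.IsRetain ? CS.PathCount : -CS.PathCount;
  return Delta == 0;
}

int main() {
  // A retain whose block lies on two paths, matched with one release in each
  // arm of the diamond: 2 - 1 - 1 == 0, so the pair may be removed.
  std::printf("%d\n", isBalanced({{true, 2}, {false, 1}, {false, 1}}));
  // The same retain matched with a release on only one path is unbalanced.
  std::printf("%d\n", isBalanced({{true, 2}, {false, 1}}));
}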
+
+/// Identify pairings between the retains and releases, and delete and/or move
+/// them.
+bool ObjCARCOpt::PerformCodePlacement(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
+
+ bool AnyPairsCompletelyEliminated = false;
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ // Visit each retain.
+ for (BlotMapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
+ E = Retains.end();
+ I != E; ++I) {
+ Value *V = I->first;
+ if (!V) continue; // blotted
+
+ Instruction *Retain = cast<Instruction>(V);
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
+
+ Value *Arg = GetArgRCIdentityRoot(Retain);
+
+ // If the object being released is in static or stack storage, we know it's
+ // not being managed by ObjC reference counting, so we can delete pairs
+ // regardless of what possible decrements or uses lie between them.
+ bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
+
+ // A constant pointer can't be pointing to an object on the heap. It may
+ // be reference-counted, but it won't be deleted.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(
+ GetRCIdentityRoot(LI->getPointerOperand())))
+ if (GV->isConstant())
+ KnownSafe = true;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ RRInfo RetainsToMove, ReleasesToMove;
+
+ bool PerformMoveCalls = PairUpRetainsAndReleases(
+ BBStates, Retains, Releases, M, Retain, DeadInsts,
+ RetainsToMove, ReleasesToMove, Arg, KnownSafe,
+ AnyPairsCompletelyEliminated);
+
+ if (PerformMoveCalls) {
+ // Ok, everything checks out and we're all set. Let's move/delete some
+ // code!
+ MoveCalls(Arg, RetainsToMove, ReleasesToMove,
+ Retains, Releases, DeadInsts, M);
+ }
+ }
+
+ // Now that we're done moving everything, we can delete the newly dead
+ // instructions, as we no longer need them as insert points.
+ while (!DeadInsts.empty())
+ EraseInstruction(DeadInsts.pop_back_val());
+
+ return AnyPairsCompletelyEliminated;
+}
+
+/// Weak pointer optimizations.
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
+
+ // First, do memdep-style RLE and S2L optimizations. We can't use memdep
+ // itself because it uses AliasAnalysis and we need to do provenance
+ // queries instead.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ if (Class != ARCInstKind::LoadWeak &&
+ Class != ARCInstKind::LoadWeakRetained)
+ continue;
+
+ // Delete objc_loadWeak calls with no users.
+ if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) {
+ Inst->eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
+ // TODO: For now, just look for an earlier available version of this value
+ // within the same block. Theoretically, we could do memdep-style non-local
+ // analysis too, but that would want caching. A better approach would be to
+ // use the technique that EarlyCSE uses.
+ inst_iterator Current = std::prev(I);
+ BasicBlock *CurrentBB = &*Current.getBasicBlockIterator();
+ for (BasicBlock::iterator B = CurrentBB->begin(),
+ J = Current.getInstructionIterator();
+ J != B; --J) {
+ Instruction *EarlierInst = &*std::prev(J);
+ ARCInstKind EarlierClass = GetARCInstKind(EarlierInst);
+ switch (EarlierClass) {
+ case ARCInstKind::LoadWeak:
+ case ARCInstKind::LoadWeakRetained: {
+ // If this is loading from the same pointer, replace this load's value
+ // with that one.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == ARCInstKind::LoadWeakRetained) {
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall);
+ Call->eraseFromParent();
+ goto clobbered;
+ case MayAlias:
+ case PartialAlias:
+ goto clobbered;
+ case NoAlias:
+ break;
+ }
+ break;
+ }
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::InitWeak: {
+ // If this is storing to the same pointer and has the same size etc.,
+ // replace this load's value with the stored value.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == ARCInstKind::LoadWeakRetained) {
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+ Call->eraseFromParent();
+ goto clobbered;
+ case MayAlias:
+ case PartialAlias:
+ goto clobbered;
+ case NoAlias:
+ break;
+ }
+ break;
+ }
+ case ARCInstKind::MoveWeak:
+ case ARCInstKind::CopyWeak:
+        // TODO: Grab the copied value.
+ goto clobbered;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ // Weak pointers are only modified through the weak entry points
+ // (and arbitrary calls, which could call the weak entry points).
+ break;
+ default:
+ // Anything else could modify the weak pointer.
+ goto clobbered;
+ }
+ }
+ clobbered:;
+ }
+
+ // Then, for each destroyWeak with an alloca operand, check to see if
+ // the alloca and all its users can be zapped.
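+  // For example (illustrative), a weak slot whose only users are the weak
+  // entry points themselves can be removed outright:
+  //   %w = alloca i8*
+  //   call i8* @objc_initWeak(i8** %w, i8* %v)   ; uses become uses of %v
+  //   call void @objc_destroyWeak(i8** %w)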
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ if (Class != ARCInstKind::DestroyWeak)
+ continue;
+
+ CallInst *Call = cast<CallInst>(Inst);
+ Value *Arg = Call->getArgOperand(0);
+ if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+ for (User *U : Alloca->users()) {
+ const Instruction *UserInst = cast<Instruction>(U);
+ switch (GetBasicARCInstKind(UserInst)) {
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::DestroyWeak:
+ continue;
+ default:
+ goto done;
+ }
+ }
+ Changed = true;
+ for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
+ CallInst *UserInst = cast<CallInst>(*UI++);
+ switch (GetBasicARCInstKind(UserInst)) {
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::StoreWeak:
+ // These functions return their second argument.
+ UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+ break;
+ case ARCInstKind::DestroyWeak:
+ // No return value.
+ break;
+ default:
+ llvm_unreachable("alloca really is used!");
+ }
+ UserInst->eraseFromParent();
+ }
+ Alloca->eraseFromParent();
+ done:;
+ }
+ }
+}
+
+/// Identify program paths which execute sequences of retains and releases which
+/// can be eliminated.
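+///
+/// For example (illustrative), when the bottom-up and top-down analyses prove
+/// that a pair is balanced on every path and that nothing in between needs the
+/// extra reference, a pair such as
+/// \code
+///   %1 = call i8* @objc_retain(i8* %x)
+///   ...
+///   call void @objc_release(i8* %x)
+/// \endcode
+/// is removed entirely.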
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+ // Releases, Retains - These are used to store the results of the main flow
+ // analysis. These use Value* as the key instead of Instruction* so that the
+ // map stays valid when we get around to rewriting code and calls get
+ // replaced by arguments.
+ DenseMap<Value *, RRInfo> Releases;
+ BlotMapVector<Value *, RRInfo> Retains;
+
+ // This is used during the traversal of the function to track the
+ // states for each identified object at each block.
+ DenseMap<const BasicBlock *, BBState> BBStates;
+
+ // Analyze the CFG of the function, and all instructions.
+ bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+ if (DisableRetainReleasePairing)
+ return false;
+
+ // Transform.
+ bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains,
+ Releases,
+ F.getParent());
+
+ return AnyPairsCompletelyEliminated && NestingDetected;
+}
+
+/// Check for an earlier call on which the Retain depends, such that nothing in
+/// between the Retain and that call can affect the reference count of their
+/// shared pointer argument. Note that Retain need not be in BB.
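+///
+/// For example (illustrative), this matches the case where the retained value
+/// is the return value of the preceding call:
+/// \code
+///   %call = call i8* @something(...)
+///   %0 = call i8* @objc_retain(i8* %call)
+/// \endcode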
static CallInst *HasSafePathToPredecessorCall(const Value *Arg,
Instruction *Retain,
ProvenanceAnalysis &PA) {
auto *Call = dyn_cast_or_null<CallInst>(findSingleDependency(
CanChangeRetainCount, Arg, Retain->getParent(), Retain, PA));
-
- // Check that the pointer is the return value of the call.
- if (!Call || Arg != Call)
+
+ // Check that the pointer is the return value of the call.
+ if (!Call || Arg != Call)
return nullptr;
-
- // Check that the call is a regular call.
- ARCInstKind Class = GetBasicARCInstKind(Call);
+
+ // Check that the call is a regular call.
+ ARCInstKind Class = GetBasicARCInstKind(Call);
return Class == ARCInstKind::CallOrUser || Class == ARCInstKind::Call
? Call
: nullptr;
-}
-
-/// Find a dependent retain that precedes the given autorelease for which there
-/// is nothing in between the two instructions that can affect the ref count of
-/// Arg.
-static CallInst *
-FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
- Instruction *Autorelease,
- ProvenanceAnalysis &PA) {
+}
+
+/// Find a dependent retain that precedes the given autorelease for which there
+/// is nothing in between the two instructions that can affect the ref count of
+/// Arg.
+static CallInst *
+FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
+ Instruction *Autorelease,
+ ProvenanceAnalysis &PA) {
auto *Retain = dyn_cast_or_null<CallInst>(
findSingleDependency(CanChangeRetainCount, Arg, BB, Autorelease, PA));
-
- // Check that we found a retain with the same argument.
- if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) ||
- GetArgRCIdentityRoot(Retain) != Arg) {
- return nullptr;
- }
-
- return Retain;
-}
-
-/// Look for an ``autorelease'' instruction dependent on Arg such that there are
-/// no instructions dependent on Arg that need a positive ref count in between
-/// the autorelease and the ret.
-static CallInst *
-FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
- ReturnInst *Ret,
- ProvenanceAnalysis &PA) {
+
+ // Check that we found a retain with the same argument.
+ if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) ||
+ GetArgRCIdentityRoot(Retain) != Arg) {
+ return nullptr;
+ }
+
+ return Retain;
+}
+
+/// Look for an ``autorelease'' instruction dependent on Arg such that there are
+/// no instructions dependent on Arg that need a positive ref count in between
+/// the autorelease and the ret.
+static CallInst *
+FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
+ ReturnInst *Ret,
+ ProvenanceAnalysis &PA) {
SmallPtrSet<Instruction *, 4> DepInsts;
auto *Autorelease = dyn_cast_or_null<CallInst>(
findSingleDependency(NeedsPositiveRetainCount, Arg, BB, Ret, PA));
-
- if (!Autorelease)
- return nullptr;
- ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease);
- if (!IsAutorelease(AutoreleaseClass))
- return nullptr;
- if (GetArgRCIdentityRoot(Autorelease) != Arg)
- return nullptr;
-
- return Autorelease;
-}
-
-/// Look for this pattern:
-/// \code
-/// %call = call i8* @something(...)
-/// %2 = call i8* @objc_retain(i8* %call)
-/// %3 = call i8* @objc_autorelease(i8* %2)
-/// ret i8* %3
-/// \endcode
-/// And delete the retain and autorelease.
-void ObjCARCOpt::OptimizeReturns(Function &F) {
- if (!F.getReturnType()->isPointerTy())
- return;
-
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
-
- for (BasicBlock &BB: F) {
- ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back());
- if (!Ret)
- continue;
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
-
- const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
-
- // Look for an ``autorelease'' instruction that is a predecessor of Ret and
- // dependent on Arg such that there are no instructions dependent on Arg
- // that need a positive ref count in between the autorelease and Ret.
+
+ if (!Autorelease)
+ return nullptr;
+ ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease);
+ if (!IsAutorelease(AutoreleaseClass))
+ return nullptr;
+ if (GetArgRCIdentityRoot(Autorelease) != Arg)
+ return nullptr;
+
+ return Autorelease;
+}
+
+/// Look for this pattern:
+/// \code
+/// %call = call i8* @something(...)
+/// %2 = call i8* @objc_retain(i8* %call)
+/// %3 = call i8* @objc_autorelease(i8* %2)
+/// ret i8* %3
+/// \endcode
+/// And delete the retain and autorelease.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+ if (!F.getReturnType()->isPointerTy())
+ return;
+
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
+
+ for (BasicBlock &BB: F) {
+ ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back());
+ if (!Ret)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
+
+ const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
+
+ // Look for an ``autorelease'' instruction that is a predecessor of Ret and
+ // dependent on Arg such that there are no instructions dependent on Arg
+ // that need a positive ref count in between the autorelease and Ret.
CallInst *Autorelease =
FindPredecessorAutoreleaseWithSafePath(Arg, &BB, Ret, PA);
-
- if (!Autorelease)
- continue;
-
- CallInst *Retain = FindPredecessorRetainWithSafePath(
+
+ if (!Autorelease)
+ continue;
+
+ CallInst *Retain = FindPredecessorRetainWithSafePath(
Arg, Autorelease->getParent(), Autorelease, PA);
-
- if (!Retain)
- continue;
-
- // Check that there is nothing that can affect the reference count
- // between the retain and the call. Note that Retain need not be in BB.
+
+ if (!Retain)
+ continue;
+
+ // Check that there is nothing that can affect the reference count
+ // between the retain and the call. Note that Retain need not be in BB.
CallInst *Call = HasSafePathToPredecessorCall(Arg, Retain, PA);
-
- // Don't remove retainRV/autoreleaseRV pairs if the call isn't a tail call.
+
+ // Don't remove retainRV/autoreleaseRV pairs if the call isn't a tail call.
if (!Call ||
(!Call->isTailCall() &&
GetBasicARCInstKind(Retain) == ARCInstKind::RetainRV &&
GetBasicARCInstKind(Autorelease) == ARCInstKind::AutoreleaseRV))
- continue;
-
- // If so, we can zap the retain and autorelease.
- Changed = true;
- ++NumRets;
- LLVM_DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: " << *Autorelease
- << "\n");
- EraseInstruction(Retain);
- EraseInstruction(Autorelease);
- }
-}
-
-#ifndef NDEBUG
-void
-ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) {
- Statistic &NumRetains =
- AfterOptimization ? NumRetainsAfterOpt : NumRetainsBeforeOpt;
- Statistic &NumReleases =
- AfterOptimization ? NumReleasesAfterOpt : NumReleasesBeforeOpt;
-
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- switch (GetBasicARCInstKind(Inst)) {
- default:
- break;
- case ARCInstKind::Retain:
- ++NumRetains;
- break;
- case ARCInstKind::Release:
- ++NumReleases;
- break;
- }
- }
-}
-#endif
-
+ continue;
+
+ // If so, we can zap the retain and autorelease.
+ Changed = true;
+ ++NumRets;
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: " << *Autorelease
+ << "\n");
+ EraseInstruction(Retain);
+ EraseInstruction(Autorelease);
+ }
+}
+
+#ifndef NDEBUG
+void
+ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) {
+ Statistic &NumRetains =
+ AfterOptimization ? NumRetainsAfterOpt : NumRetainsBeforeOpt;
+ Statistic &NumReleases =
+ AfterOptimization ? NumReleasesAfterOpt : NumReleasesBeforeOpt;
+
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+ switch (GetBasicARCInstKind(Inst)) {
+ default:
+ break;
+ case ARCInstKind::Retain:
+ ++NumRetains;
+ break;
+ case ARCInstKind::Release:
+ ++NumReleases;
+ break;
+ }
+ }
+}
+#endif
+
void ObjCARCOpt::init(Module &M) {
- if (!EnableARCOpts)
+ if (!EnableARCOpts)
return;
-
- // If nothing in the Module uses ARC, don't do anything.
- Run = ModuleHasARC(M);
- if (!Run)
+
+ // If nothing in the Module uses ARC, don't do anything.
+ Run = ModuleHasARC(M);
+ if (!Run)
return;
-
-  // Intuitively, objc_retain and others are nocapture; however, in practice
-  // they are not, because they return their argument value. And objc_release
- // calls finalizers which can have arbitrary side effects.
- MDKindCache.init(&M);
-
- // Initialize our runtime entry point cache.
- EP.init(&M);
-}
-
+
+  // Intuitively, objc_retain and others are nocapture; however, in practice
+  // they are not, because they return their argument value. And objc_release
+ // calls finalizers which can have arbitrary side effects.
+ MDKindCache.init(&M);
+
+ // Initialize our runtime entry point cache.
+ EP.init(&M);
+}
+
bool ObjCARCOpt::run(Function &F, AAResults &AA) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- Changed = false;
-
- LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName()
- << " >>>"
- "\n");
-
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
+
+ LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName()
+ << " >>>"
+ "\n");
+
PA.setAA(&AA);
-
-#ifndef NDEBUG
- if (AreStatisticsEnabled()) {
- GatherStatistics(F, false);
- }
-#endif
-
- // This pass performs several distinct transformations. As a compile-time aid
- // when compiling code that isn't ObjC, skip these if the relevant ObjC
- // library functions aren't declared.
-
- // Preliminary optimizations. This also computes UsedInThisFunction.
- OptimizeIndividualCalls(F);
-
- // Optimizations for weak pointers.
- if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) |
- (1 << unsigned(ARCInstKind::LoadWeakRetained)) |
- (1 << unsigned(ARCInstKind::StoreWeak)) |
- (1 << unsigned(ARCInstKind::InitWeak)) |
- (1 << unsigned(ARCInstKind::CopyWeak)) |
- (1 << unsigned(ARCInstKind::MoveWeak)) |
- (1 << unsigned(ARCInstKind::DestroyWeak))))
- OptimizeWeakCalls(F);
-
- // Optimizations for retain+release pairs.
- if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) |
- (1 << unsigned(ARCInstKind::RetainRV)) |
- (1 << unsigned(ARCInstKind::RetainBlock))))
- if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release)))
- // Run OptimizeSequences until it either stops making changes or
- // no retain+release pair nesting is detected.
- while (OptimizeSequences(F)) {}
-
- // Optimizations if objc_autorelease is used.
- if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) |
- (1 << unsigned(ARCInstKind::AutoreleaseRV))))
- OptimizeReturns(F);
-
- // Gather statistics after optimization.
-#ifndef NDEBUG
- if (AreStatisticsEnabled()) {
- GatherStatistics(F, true);
- }
-#endif
-
- LLVM_DEBUG(dbgs() << "\n");
-
- return Changed;
-}
-
-void ObjCARCOpt::releaseMemory() {
- PA.clear();
-}
-
-/// @}
-///
+
+#ifndef NDEBUG
+ if (AreStatisticsEnabled()) {
+ GatherStatistics(F, false);
+ }
+#endif
+
+ // This pass performs several distinct transformations. As a compile-time aid
+ // when compiling code that isn't ObjC, skip these if the relevant ObjC
+ // library functions aren't declared.
+
+ // Preliminary optimizations. This also computes UsedInThisFunction.
+ OptimizeIndividualCalls(F);
+
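+  // UsedInThisFunction is a bit mask with one bit per ARCInstKind, so each of
+  // the phases below can be skipped cheaply when the ARC calls it cares about
+  // never occur in this function.
+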
+ // Optimizations for weak pointers.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) |
+ (1 << unsigned(ARCInstKind::LoadWeakRetained)) |
+ (1 << unsigned(ARCInstKind::StoreWeak)) |
+ (1 << unsigned(ARCInstKind::InitWeak)) |
+ (1 << unsigned(ARCInstKind::CopyWeak)) |
+ (1 << unsigned(ARCInstKind::MoveWeak)) |
+ (1 << unsigned(ARCInstKind::DestroyWeak))))
+ OptimizeWeakCalls(F);
+
+ // Optimizations for retain+release pairs.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) |
+ (1 << unsigned(ARCInstKind::RetainRV)) |
+ (1 << unsigned(ARCInstKind::RetainBlock))))
+ if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release)))
+ // Run OptimizeSequences until it either stops making changes or
+ // no retain+release pair nesting is detected.
+ while (OptimizeSequences(F)) {}
+
+ // Optimizations if objc_autorelease is used.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) |
+ (1 << unsigned(ARCInstKind::AutoreleaseRV))))
+ OptimizeReturns(F);
+
+ // Gather statistics after optimization.
+#ifndef NDEBUG
+ if (AreStatisticsEnabled()) {
+ GatherStatistics(F, true);
+ }
+#endif
+
+ LLVM_DEBUG(dbgs() << "\n");
+
+ return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+ PA.clear();
+}
+
+/// @}
+///
PreservedAnalyses ObjCARCOptPass::run(Function &F,
FunctionAnalysisManager &AM) {
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
index 1e4fb458a3..3d59b2edc5 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -1,179 +1,179 @@
-//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This file defines a special form of Alias Analysis called ``Provenance
-/// Analysis''. The word ``provenance'' refers to the history of the ownership
-/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
-/// use various techniques to determine if locally visible pointer values could
-/// share a provenance source and thus be related.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ProvenanceAnalysis.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
- const Value *B) {
- // If the values are Selects with the same condition, we can do a more precise
- // check: just check for relations between the values on corresponding arms.
- if (const SelectInst *SB = dyn_cast<SelectInst>(B))
- if (A->getCondition() == SB->getCondition())
+//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file defines a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally visible pointer values could
+/// share a provenance source and thus be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
+ const Value *B) {
+ // If the values are Selects with the same condition, we can do a more precise
+ // check: just check for relations between the values on corresponding arms.
+ if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+ if (A->getCondition() == SB->getCondition())
return related(A->getTrueValue(), SB->getTrueValue()) ||
related(A->getFalseValue(), SB->getFalseValue());
-
- // Check both arms of the Select node individually.
+
+ // Check both arms of the Select node individually.
return related(A->getTrueValue(), B) || related(A->getFalseValue(), B);
-}
-
-bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
- const Value *B) {
- // If the values are PHIs in the same block, we can do a more precise as well
- // as efficient check: just check for relations between the values on
- // corresponding edges.
- if (const PHINode *PNB = dyn_cast<PHINode>(B))
- if (PNB->getParent() == A->getParent()) {
- for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
- if (related(A->getIncomingValue(i),
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
+ const Value *B) {
+ // If the values are PHIs in the same block, we can do a more precise as well
+ // as efficient check: just check for relations between the values on
+ // corresponding edges.
+ if (const PHINode *PNB = dyn_cast<PHINode>(B))
+ if (PNB->getParent() == A->getParent()) {
+ for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+ if (related(A->getIncomingValue(i),
PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
- return true;
- return false;
- }
-
- // Check each unique source of the PHI node against B.
- SmallPtrSet<const Value *, 4> UniqueSrc;
- for (Value *PV1 : A->incoming_values()) {
+ return true;
+ return false;
+ }
+
+ // Check each unique source of the PHI node against B.
+ SmallPtrSet<const Value *, 4> UniqueSrc;
+ for (Value *PV1 : A->incoming_values()) {
if (UniqueSrc.insert(PV1).second && related(PV1, B))
- return true;
- }
-
- // All of the arms checked out.
- return false;
-}
-
-/// Test if the value of P, or any value covered by its provenance, is ever
-/// stored within the function (not counting callees).
-static bool IsStoredObjCPointer(const Value *P) {
- SmallPtrSet<const Value *, 8> Visited;
- SmallVector<const Value *, 8> Worklist;
- Worklist.push_back(P);
- Visited.insert(P);
- do {
- P = Worklist.pop_back_val();
- for (const Use &U : P->uses()) {
- const User *Ur = U.getUser();
- if (isa<StoreInst>(Ur)) {
- if (U.getOperandNo() == 0)
- // The pointer is stored.
- return true;
-        // The pointer is being stored through, not stored itself.
- continue;
- }
- if (isa<CallInst>(Ur))
- // The pointer is passed as an argument, ignore this.
- continue;
- if (isa<PtrToIntInst>(P))
- // Assume the worst.
- return true;
- if (Visited.insert(Ur).second)
- Worklist.push_back(Ur);
- }
- } while (!Worklist.empty());
-
- // Everything checked out.
- return false;
-}
-
+ return true;
+ }
+
+ // All of the arms checked out.
+ return false;
+}
+
+/// Test if the value of P, or any value covered by its provenance, is ever
+/// stored within the function (not counting callees).
+static bool IsStoredObjCPointer(const Value *P) {
+ SmallPtrSet<const Value *, 8> Visited;
+ SmallVector<const Value *, 8> Worklist;
+ Worklist.push_back(P);
+ Visited.insert(P);
+ do {
+ P = Worklist.pop_back_val();
+ for (const Use &U : P->uses()) {
+ const User *Ur = U.getUser();
+ if (isa<StoreInst>(Ur)) {
+ if (U.getOperandNo() == 0)
+ // The pointer is stored.
+ return true;
+        // The pointer is being stored through, not stored itself.
+ continue;
+ }
+ if (isa<CallInst>(Ur))
+ // The pointer is passed as an argument, ignore this.
+ continue;
+ if (isa<PtrToIntInst>(P))
+ // Assume the worst.
+ return true;
+ if (Visited.insert(Ur).second)
+ Worklist.push_back(Ur);
+ }
+ } while (!Worklist.empty());
+
+ // Everything checked out.
+ return false;
+}
+
bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) {
- // Ask regular AliasAnalysis, for a first approximation.
- switch (AA->alias(A, B)) {
- case NoAlias:
- return false;
- case MustAlias:
- case PartialAlias:
- return true;
- case MayAlias:
- break;
- }
-
- bool AIsIdentified = IsObjCIdentifiedObject(A);
- bool BIsIdentified = IsObjCIdentifiedObject(B);
-
- // An ObjC-Identified object can't alias a load if it is never locally stored.
- if (AIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(B))
- return IsStoredObjCPointer(A);
- if (BIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(A))
- return IsStoredObjCPointer(B);
- // Both pointers are identified and escapes aren't an evident problem.
- return false;
- }
- } else if (BIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(A))
- return IsStoredObjCPointer(B);
- }
-
- // Special handling for PHI and Select.
- if (const PHINode *PN = dyn_cast<PHINode>(A))
- return relatedPHI(PN, B);
- if (const PHINode *PN = dyn_cast<PHINode>(B))
- return relatedPHI(PN, A);
- if (const SelectInst *S = dyn_cast<SelectInst>(A))
- return relatedSelect(S, B);
- if (const SelectInst *S = dyn_cast<SelectInst>(B))
- return relatedSelect(S, A);
-
- // Conservative.
- return true;
-}
-
+ // Ask regular AliasAnalysis, for a first approximation.
+ switch (AA->alias(A, B)) {
+ case NoAlias:
+ return false;
+ case MustAlias:
+ case PartialAlias:
+ return true;
+ case MayAlias:
+ break;
+ }
+
+ bool AIsIdentified = IsObjCIdentifiedObject(A);
+ bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+ // An ObjC-Identified object can't alias a load if it is never locally stored.
+ if (AIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(B))
+ return IsStoredObjCPointer(A);
+ if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ // Both pointers are identified and escapes aren't an evident problem.
+ return false;
+ }
+ } else if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ }
+
+ // Special handling for PHI and Select.
+ if (const PHINode *PN = dyn_cast<PHINode>(A))
+ return relatedPHI(PN, B);
+ if (const PHINode *PN = dyn_cast<PHINode>(B))
+ return relatedPHI(PN, A);
+ if (const SelectInst *S = dyn_cast<SelectInst>(A))
+ return relatedSelect(S, B);
+ if (const SelectInst *S = dyn_cast<SelectInst>(B))
+ return relatedSelect(S, A);
+
+ // Conservative.
+ return true;
+}
+
bool ProvenanceAnalysis::related(const Value *A, const Value *B) {
A = GetUnderlyingObjCPtrCached(A, UnderlyingObjCPtrCache);
B = GetUnderlyingObjCPtrCached(B, UnderlyingObjCPtrCache);
-
- // Quick check.
- if (A == B)
- return true;
-
- // Begin by inserting a conservative value into the map. If the insertion
- // fails, we have the answer already. If it succeeds, leave it there until we
- // compute the real answer to guard against recursive queries.
- if (A > B) std::swap(A, B);
- std::pair<CachedResultsTy::iterator, bool> Pair =
- CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
- if (!Pair.second)
- return Pair.first->second;
-
+
+ // Quick check.
+ if (A == B)
+ return true;
+
+ // Begin by inserting a conservative value into the map. If the insertion
+ // fails, we have the answer already. If it succeeds, leave it there until we
+ // compute the real answer to guard against recursive queries.
+ if (A > B) std::swap(A, B);
+ std::pair<CachedResultsTy::iterator, bool> Pair =
+ CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+ if (!Pair.second)
+ return Pair.first->second;
+
bool Result = relatedCheck(A, B);
- CachedResults[ValuePairTy(A, B)] = Result;
- return Result;
-}
+ CachedResults[ValuePairTy(A, B)] = Result;
+ return Result;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 0957031aa7..a63e356ce1 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -1,86 +1,86 @@
-//===- ProvenanceAnalysis.h - ObjC ARC Optimization -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This file declares a special form of Alias Analysis called ``Provenance
-/// Analysis''. The word ``provenance'' refers to the history of the ownership
-/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
-/// use various techniques to determine if locally visible pointer values could
-/// share a provenance source and thus be related.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/ValueHandle.h"
-#include <utility>
-
-namespace llvm {
-
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file declares a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally visible pointer values could
+/// share a provenance source and thus be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/ValueHandle.h"
+#include <utility>
+
+namespace llvm {
+
class AAResults;
-class DataLayout;
-class PHINode;
-class SelectInst;
-class Value;
-
-namespace objcarc {
-
-/// This is similar to BasicAliasAnalysis, and it uses many of the same
-/// techniques, except it uses special ObjC-specific reasoning about pointer
-/// relationships.
-///
-/// In this context ``Provenance'' is defined as the history of an object's
-/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of
-/// an ``independent provenance source'' of a pointer to determine whether or
-/// not two pointers have the same provenance source and thus could
-/// potentially be related.
-class ProvenanceAnalysis {
+class DataLayout;
+class PHINode;
+class SelectInst;
+class Value;
+
+namespace objcarc {
+
+/// This is similar to BasicAliasAnalysis, and it uses many of the same
+/// techniques, except it uses special ObjC-specific reasoning about pointer
+/// relationships.
+///
+/// In this context ``Provenance'' is defined as the history of an object's
+/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of
+/// an ``independent provenance source'' of a pointer to determine whether or
+/// not two pointers have the same provenance source and thus could
+/// potentially be related.
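+///
+/// A minimal usage sketch (illustrative; PtrA, PtrB and AA are placeholders):
+/// \code
+///   ProvenanceAnalysis PA;
+///   PA.setAA(&AA);                      // AA is an AAResults for the function
+///   bool MayBeRelated = PA.related(PtrA, PtrB);
+/// \endcode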
+class ProvenanceAnalysis {
AAResults *AA;
-
- using ValuePairTy = std::pair<const Value *, const Value *>;
- using CachedResultsTy = DenseMap<ValuePairTy, bool>;
-
- CachedResultsTy CachedResults;
-
- DenseMap<const Value *, WeakTrackingVH> UnderlyingObjCPtrCache;
-
+
+ using ValuePairTy = std::pair<const Value *, const Value *>;
+ using CachedResultsTy = DenseMap<ValuePairTy, bool>;
+
+ CachedResultsTy CachedResults;
+
+ DenseMap<const Value *, WeakTrackingVH> UnderlyingObjCPtrCache;
+
bool relatedCheck(const Value *A, const Value *B);
- bool relatedSelect(const SelectInst *A, const Value *B);
- bool relatedPHI(const PHINode *A, const Value *B);
-
-public:
- ProvenanceAnalysis() = default;
- ProvenanceAnalysis(const ProvenanceAnalysis &) = delete;
- ProvenanceAnalysis &operator=(const ProvenanceAnalysis &) = delete;
-
+ bool relatedSelect(const SelectInst *A, const Value *B);
+ bool relatedPHI(const PHINode *A, const Value *B);
+
+public:
+ ProvenanceAnalysis() = default;
+ ProvenanceAnalysis(const ProvenanceAnalysis &) = delete;
+ ProvenanceAnalysis &operator=(const ProvenanceAnalysis &) = delete;
+
void setAA(AAResults *aa) { AA = aa; }
-
+
AAResults *getAA() const { return AA; }
-
+
bool related(const Value *A, const Value *B);
-
- void clear() {
- CachedResults.clear();
- UnderlyingObjCPtrCache.clear();
- }
-};
-
-} // end namespace objcarc
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+
+ void clear() {
+ CachedResults.clear();
+ UnderlyingObjCPtrCache.clear();
+ }
+};
+
+} // end namespace objcarc
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
index fff773908d..6fdfe787d4 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
@@ -1,94 +1,94 @@
-//===- ProvenanceAnalysisEvaluator.cpp - ObjC ARC Optimization ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ProvenanceAnalysis.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
+//===- ProvenanceAnalysisEvaluator.cpp - ObjC ARC Optimization ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-namespace {
-class PAEval : public FunctionPass {
-
-public:
- static char ID;
- PAEval();
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-};
-}
-
-char PAEval::ID = 0;
-PAEval::PAEval() : FunctionPass(ID) {}
-
-void PAEval::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
-}
-
-static StringRef getName(Value *V) {
- StringRef Name = V->getName();
- if (Name.startswith("\1"))
- return Name.substr(1);
- return Name;
-}
-
-static void insertIfNamed(SetVector<Value *> &Values, Value *V) {
- if (!V->hasName())
- return;
- Values.insert(V);
-}
-
-bool PAEval::runOnFunction(Function &F) {
- SetVector<Value *> Values;
-
- for (auto &Arg : F.args())
- insertIfNamed(Values, &Arg);
-
- for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
- insertIfNamed(Values, &*I);
-
- for (auto &Op : I->operands())
- insertIfNamed(Values, Op);
- }
-
- ProvenanceAnalysis PA;
- PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
-
- for (Value *V1 : Values) {
- StringRef NameV1 = getName(V1);
- for (Value *V2 : Values) {
- StringRef NameV2 = getName(V2);
- if (NameV1 >= NameV2)
- continue;
- errs() << NameV1 << " and " << NameV2;
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+class PAEval : public FunctionPass {
+
+public:
+ static char ID;
+ PAEval();
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char PAEval::ID = 0;
+PAEval::PAEval() : FunctionPass(ID) {}
+
+void PAEval::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+}
+
+static StringRef getName(Value *V) {
+ StringRef Name = V->getName();
+ if (Name.startswith("\1"))
+ return Name.substr(1);
+ return Name;
+}
+
+static void insertIfNamed(SetVector<Value *> &Values, Value *V) {
+ if (!V->hasName())
+ return;
+ Values.insert(V);
+}
+
+bool PAEval::runOnFunction(Function &F) {
+ SetVector<Value *> Values;
+
+ for (auto &Arg : F.args())
+ insertIfNamed(Values, &Arg);
+
+ for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+ insertIfNamed(Values, &*I);
+
+ for (auto &Op : I->operands())
+ insertIfNamed(Values, Op);
+ }
+
+ ProvenanceAnalysis PA;
+ PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
+
+ for (Value *V1 : Values) {
+ StringRef NameV1 = getName(V1);
+ for (Value *V2 : Values) {
+ StringRef NameV2 = getName(V2);
+ if (NameV1 >= NameV2)
+ continue;
+ errs() << NameV1 << " and " << NameV2;
if (PA.related(V1, V2))
- errs() << " are related.\n";
- else
- errs() << " are not related.\n";
- }
- }
-
- return false;
-}
-
-FunctionPass *llvm::createPAEvalPass() { return new PAEval(); }
-
-INITIALIZE_PASS_BEGIN(PAEval, "pa-eval",
- "Evaluate ProvenanceAnalysis on all pairs", false, true)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(PAEval, "pa-eval",
- "Evaluate ProvenanceAnalysis on all pairs", false, true)
+ errs() << " are related.\n";
+ else
+ errs() << " are not related.\n";
+ }
+ }
+
+ return false;
+}
+
+FunctionPass *llvm::createPAEvalPass() { return new PAEval(); }
+
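+// The pass is registered under the name "pa-eval", so it can be run directly,
+// e.g. (illustrative): opt -pa-eval -disable-output input.ll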
+INITIALIZE_PASS_BEGIN(PAEval, "pa-eval",
+ "Evaluate ProvenanceAnalysis on all pairs", false, true)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(PAEval, "pa-eval",
+ "Evaluate ProvenanceAnalysis on all pairs", false, true)
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp
index 513be76e6c..6071ec3e4d 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp
@@ -1,436 +1,436 @@
-//===- PtrState.cpp -------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "PtrState.h"
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-ptr-state"
-
-//===----------------------------------------------------------------------===//
-// Utility
-//===----------------------------------------------------------------------===//
-
-raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, const Sequence S) {
- switch (S) {
- case S_None:
- return OS << "S_None";
- case S_Retain:
- return OS << "S_Retain";
- case S_CanRelease:
- return OS << "S_CanRelease";
- case S_Use:
- return OS << "S_Use";
- case S_Release:
- return OS << "S_Release";
- case S_MovableRelease:
- return OS << "S_MovableRelease";
- case S_Stop:
- return OS << "S_Stop";
- }
- llvm_unreachable("Unknown sequence type.");
-}
-
-//===----------------------------------------------------------------------===//
-// Sequence
-//===----------------------------------------------------------------------===//
-
-static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
- // The easy cases.
- if (A == B)
- return A;
- if (A == S_None || B == S_None)
- return S_None;
-
- if (A > B)
- std::swap(A, B);
- if (TopDown) {
- // Choose the side which is further along in the sequence.
- if ((A == S_Retain || A == S_CanRelease) &&
- (B == S_CanRelease || B == S_Use))
- return B;
- } else {
- // Choose the side which is further along in the sequence.
- if ((A == S_Use || A == S_CanRelease) &&
- (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
- return A;
- // If both sides are releases, choose the more conservative one.
- if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
- return A;
- if (A == S_Release && B == S_MovableRelease)
- return A;
- }
-
- return S_None;
-}
-
-//===----------------------------------------------------------------------===//
-// RRInfo
-//===----------------------------------------------------------------------===//
-
-void RRInfo::clear() {
- KnownSafe = false;
- IsTailCallRelease = false;
- ReleaseMetadata = nullptr;
- Calls.clear();
- ReverseInsertPts.clear();
- CFGHazardAfflicted = false;
-}
-
-bool RRInfo::Merge(const RRInfo &Other) {
- // Conservatively merge the ReleaseMetadata information.
- if (ReleaseMetadata != Other.ReleaseMetadata)
- ReleaseMetadata = nullptr;
-
- // Conservatively merge the boolean state.
- KnownSafe &= Other.KnownSafe;
- IsTailCallRelease &= Other.IsTailCallRelease;
- CFGHazardAfflicted |= Other.CFGHazardAfflicted;
-
- // Merge the call sets.
- Calls.insert(Other.Calls.begin(), Other.Calls.end());
-
- // Merge the insert point sets. If there are any differences,
- // that makes this a partial merge.
- bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
- for (Instruction *Inst : Other.ReverseInsertPts)
- Partial |= ReverseInsertPts.insert(Inst).second;
- return Partial;
-}
-
-//===----------------------------------------------------------------------===//
-// PtrState
-//===----------------------------------------------------------------------===//
-
-void PtrState::SetKnownPositiveRefCount() {
- LLVM_DEBUG(dbgs() << " Setting Known Positive.\n");
- KnownPositiveRefCount = true;
-}
-
-void PtrState::ClearKnownPositiveRefCount() {
- LLVM_DEBUG(dbgs() << " Clearing Known Positive.\n");
- KnownPositiveRefCount = false;
-}
-
-void PtrState::SetSeq(Sequence NewSeq) {
- LLVM_DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq
- << "\n");
- Seq = NewSeq;
-}
-
-void PtrState::ResetSequenceProgress(Sequence NewSeq) {
- LLVM_DEBUG(dbgs() << " Resetting sequence progress.\n");
- SetSeq(NewSeq);
- Partial = false;
- RRI.clear();
-}
-
-void PtrState::Merge(const PtrState &Other, bool TopDown) {
- Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown);
- KnownPositiveRefCount &= Other.KnownPositiveRefCount;
-
- // If we're not in a sequence (anymore), drop all associated state.
- if (Seq == S_None) {
- Partial = false;
- RRI.clear();
- } else if (Partial || Other.Partial) {
- // If we're doing a merge on a path that's previously seen a partial
- // merge, conservatively drop the sequence, to avoid doing partial
-    // RR elimination. If the branch predicates for the two merges differ,
- // mixing them is unsafe.
- ClearSequenceProgress();
- } else {
- // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
- // point, we know that currently we are not partial. Stash whether or not
- // the merge operation caused us to undergo a partial merging of reverse
- // insertion points.
- Partial = RRI.Merge(Other.RRI);
- }
-}
-
-//===----------------------------------------------------------------------===//
-// BottomUpPtrState
-//===----------------------------------------------------------------------===//
-
-bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
-  // If we see two releases in a row on the same pointer, make
-  // a note, and we'll circle back to revisit it after we've
- // hopefully eliminated the second release, which may allow us to
- // eliminate the first release too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- bool NestingDetected = false;
- if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
- LLVM_DEBUG(
- dbgs() << " Found nested releases (i.e. a release pair)\n");
- NestingDetected = true;
- }
-
- MDNode *ReleaseMetadata =
- I->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
- Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
- ResetSequenceProgress(NewSeq);
- SetReleaseMetadata(ReleaseMetadata);
- SetKnownSafe(HasKnownPositiveRefCount());
- SetTailCallRelease(cast<CallInst>(I)->isTailCall());
- InsertCall(I);
- SetKnownPositiveRefCount();
- return NestingDetected;
-}
-
-bool BottomUpPtrState::MatchWithRetain() {
- SetKnownPositiveRefCount();
-
- Sequence OldSeq = GetSeq();
- switch (OldSeq) {
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- case S_Use:
- // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
- // imprecise release, clear our reverse insertion points.
- if (OldSeq != S_Use || IsTrackingImpreciseReleases())
- ClearReverseInsertPts();
- LLVM_FALLTHROUGH;
- case S_CanRelease:
- return true;
- case S_None:
- return false;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- llvm_unreachable("Sequence unknown enum value");
-}
-
-bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- Sequence S = GetSeq();
-
- // Check for possible releases.
+//===- PtrState.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PtrState.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-ptr-state"
+
+//===----------------------------------------------------------------------===//
+// Utility
+//===----------------------------------------------------------------------===//
+
+raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, const Sequence S) {
+ switch (S) {
+ case S_None:
+ return OS << "S_None";
+ case S_Retain:
+ return OS << "S_Retain";
+ case S_CanRelease:
+ return OS << "S_CanRelease";
+ case S_Use:
+ return OS << "S_Use";
+ case S_Release:
+ return OS << "S_Release";
+ case S_MovableRelease:
+ return OS << "S_MovableRelease";
+ case S_Stop:
+ return OS << "S_Stop";
+ }
+ llvm_unreachable("Unknown sequence type.");
+}
+
+//===----------------------------------------------------------------------===//
+// Sequence
+//===----------------------------------------------------------------------===//
+
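+/// Merge two sequence states conservatively. For example (illustrative),
+/// merging S_Retain with S_CanRelease while walking top-down yields
+/// S_CanRelease, the state that is further along; states that cannot be
+/// reconciled merge to S_None.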
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+ // The easy cases.
+ if (A == B)
+ return A;
+ if (A == S_None || B == S_None)
+ return S_None;
+
+ if (A > B)
+ std::swap(A, B);
+ if (TopDown) {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Retain || A == S_CanRelease) &&
+ (B == S_CanRelease || B == S_Use))
+ return B;
+ } else {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Use || A == S_CanRelease) &&
+ (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+ return A;
+ // If both sides are releases, choose the more conservative one.
+ if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+ return A;
+ if (A == S_Release && B == S_MovableRelease)
+ return A;
+ }
+
+ return S_None;
+}
+
+//===----------------------------------------------------------------------===//
+// RRInfo
+//===----------------------------------------------------------------------===//
+
+void RRInfo::clear() {
+ KnownSafe = false;
+ IsTailCallRelease = false;
+ ReleaseMetadata = nullptr;
+ Calls.clear();
+ ReverseInsertPts.clear();
+ CFGHazardAfflicted = false;
+}
+
+bool RRInfo::Merge(const RRInfo &Other) {
+ // Conservatively merge the ReleaseMetadata information.
+ if (ReleaseMetadata != Other.ReleaseMetadata)
+ ReleaseMetadata = nullptr;
+
+ // Conservatively merge the boolean state.
+ KnownSafe &= Other.KnownSafe;
+ IsTailCallRelease &= Other.IsTailCallRelease;
+ CFGHazardAfflicted |= Other.CFGHazardAfflicted;
+
+ // Merge the call sets.
+ Calls.insert(Other.Calls.begin(), Other.Calls.end());
+
+ // Merge the insert point sets. If there are any differences,
+ // that makes this a partial merge.
+ bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
+ for (Instruction *Inst : Other.ReverseInsertPts)
+ Partial |= ReverseInsertPts.insert(Inst).second;
+ return Partial;
+}
+
+//===----------------------------------------------------------------------===//
+// PtrState
+//===----------------------------------------------------------------------===//
+
+void PtrState::SetKnownPositiveRefCount() {
+ LLVM_DEBUG(dbgs() << " Setting Known Positive.\n");
+ KnownPositiveRefCount = true;
+}
+
+void PtrState::ClearKnownPositiveRefCount() {
+ LLVM_DEBUG(dbgs() << " Clearing Known Positive.\n");
+ KnownPositiveRefCount = false;
+}
+
+void PtrState::SetSeq(Sequence NewSeq) {
+ LLVM_DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq
+ << "\n");
+ Seq = NewSeq;
+}
+
+void PtrState::ResetSequenceProgress(Sequence NewSeq) {
+ LLVM_DEBUG(dbgs() << " Resetting sequence progress.\n");
+ SetSeq(NewSeq);
+ Partial = false;
+ RRI.clear();
+}
+
+void PtrState::Merge(const PtrState &Other, bool TopDown) {
+ Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown);
+ KnownPositiveRefCount &= Other.KnownPositiveRefCount;
+
+ // If we're not in a sequence (anymore), drop all associated state.
+ if (Seq == S_None) {
+ Partial = false;
+ RRI.clear();
+ } else if (Partial || Other.Partial) {
+ // If we're doing a merge on a path that's previously seen a partial
+ // merge, conservatively drop the sequence, to avoid doing partial
+    // RR elimination. If the branch predicates for the two merges differ,
+ // mixing them is unsafe.
+ ClearSequenceProgress();
+ } else {
+ // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
+ // point, we know that currently we are not partial. Stash whether or not
+ // the merge operation caused us to undergo a partial merging of reverse
+ // insertion points.
+ Partial = RRI.Merge(Other.RRI);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// BottomUpPtrState
+//===----------------------------------------------------------------------===//
+
+bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
+  // If we see two releases in a row on the same pointer, make
+  // a note, and we'll circle back to revisit it after we've
+ // hopefully eliminated the second release, which may allow us to
+ // eliminate the first release too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ bool NestingDetected = false;
+ if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
+ LLVM_DEBUG(
+ dbgs() << " Found nested releases (i.e. a release pair)\n");
+ NestingDetected = true;
+ }
+
+ MDNode *ReleaseMetadata =
+ I->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+ Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
+ ResetSequenceProgress(NewSeq);
+ SetReleaseMetadata(ReleaseMetadata);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ SetTailCallRelease(cast<CallInst>(I)->isTailCall());
+ InsertCall(I);
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool BottomUpPtrState::MatchWithRetain() {
+ SetKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+ switch (OldSeq) {
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
+ // imprecise release, clear our reverse insertion points.
+ if (OldSeq != S_Use || IsTrackingImpreciseReleases())
+ ClearReverseInsertPts();
+ LLVM_FALLTHROUGH;
+ case S_CanRelease:
+ return true;
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ Sequence S = GetSeq();
+
+ // Check for possible releases.
if (!CanDecrementRefCount(Inst, Ptr, PA, Class))
- return false;
-
- LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; "
- << *Ptr << "\n");
- switch (S) {
- case S_Use:
- SetSeq(S_CanRelease);
- return true;
- case S_CanRelease:
- case S_Release:
- case S_MovableRelease:
- case S_Stop:
- case S_None:
- return false;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- llvm_unreachable("Sequence unknown enum value");
-}
-
-void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- auto SetSeqAndInsertReverseInsertPt = [&](Sequence NewSeq){
- assert(!HasReverseInsertPts());
- SetSeq(NewSeq);
- // If this is an invoke instruction, we're scanning it as part of
- // one of its successor blocks, since we can't insert code after it
- // in its own block, and we don't want to split critical edges.
- BasicBlock::iterator InsertAfter;
- if (isa<InvokeInst>(Inst)) {
- const auto IP = BB->getFirstInsertionPt();
- InsertAfter = IP == BB->end() ? std::prev(BB->end()) : IP;
- if (isa<CatchSwitchInst>(InsertAfter))
- // A catchswitch must be the only non-phi instruction in its basic
- // block, so attempting to insert an instruction into such a block would
- // produce invalid IR.
- SetCFGHazardAfflicted(true);
- } else {
- InsertAfter = std::next(Inst->getIterator());
- }
-
- if (InsertAfter != BB->end())
- InsertAfter = skipDebugIntrinsics(InsertAfter);
-
- InsertReverseInsertPt(&*InsertAfter);
- };
-
- // Check for possible direct uses.
- switch (GetSeq()) {
- case S_Release:
- case S_MovableRelease:
- if (CanUse(Inst, Ptr, PA, Class)) {
- LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- SetSeqAndInsertReverseInsertPt(S_Use);
- } else if (Seq == S_Release && IsUser(Class)) {
- LLVM_DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq()
- << "; " << *Ptr << "\n");
- // Non-movable releases depend on any possible objc pointer use.
- SetSeqAndInsertReverseInsertPt(S_Stop);
- } else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
- if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
- LLVM_DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- SetSeqAndInsertReverseInsertPt(S_Stop);
- }
- }
- break;
- case S_Stop:
- if (CanUse(Inst, Ptr, PA, Class)) {
- LLVM_DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq()
- << "; " << *Ptr << "\n");
- SetSeq(S_Use);
- }
- break;
- case S_CanRelease:
- case S_Use:
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
-}
-
-//===----------------------------------------------------------------------===//
-// TopDownPtrState
-//===----------------------------------------------------------------------===//
-
-bool TopDownPtrState::InitTopDown(ARCInstKind Kind, Instruction *I) {
- bool NestingDetected = false;
-  // Don't do retain+release tracking for ARCInstKind::RetainRV, because it's
-  // better to let it remain as the first instruction after a call.
- if (Kind != ARCInstKind::RetainRV) {
-    // Check whether we see two retains in a row on the same pointer. If so,
-    // make a note, and we'll circle back to revisit it after we've
- // hopefully eliminated the second retain, which may allow us to
- // eliminate the first retain too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- if (GetSeq() == S_Retain)
- NestingDetected = true;
-
- ResetSequenceProgress(S_Retain);
- SetKnownSafe(HasKnownPositiveRefCount());
- InsertCall(I);
- }
-
- SetKnownPositiveRefCount();
- return NestingDetected;
-}
-
-bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache,
- Instruction *Release) {
- ClearKnownPositiveRefCount();
-
- Sequence OldSeq = GetSeq();
-
- MDNode *ReleaseMetadata =
- Release->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
-
- switch (OldSeq) {
- case S_Retain:
- case S_CanRelease:
- if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
- ClearReverseInsertPts();
- LLVM_FALLTHROUGH;
- case S_Use:
- SetReleaseMetadata(ReleaseMetadata);
- SetTailCallRelease(cast<CallInst>(Release)->isTailCall());
- return true;
- case S_None:
- return false;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in bottom up state!");
- }
- llvm_unreachable("Sequence unknown enum value");
-}
-
-bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- // Check for possible releases. Treat clang.arc.use as a releasing instruction
- // to prevent sinking a retain past it.
+ return false;
+
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; "
+ << *Ptr << "\n");
+ switch (S) {
+ case S_Use:
+ SetSeq(S_CanRelease);
+ return true;
+ case S_CanRelease:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Stop:
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ auto SetSeqAndInsertReverseInsertPt = [&](Sequence NewSeq){
+ assert(!HasReverseInsertPts());
+ SetSeq(NewSeq);
+ // If this is an invoke instruction, we're scanning it as part of
+ // one of its successor blocks, since we can't insert code after it
+ // in its own block, and we don't want to split critical edges.
+ BasicBlock::iterator InsertAfter;
+ if (isa<InvokeInst>(Inst)) {
+ const auto IP = BB->getFirstInsertionPt();
+ InsertAfter = IP == BB->end() ? std::prev(BB->end()) : IP;
+ if (isa<CatchSwitchInst>(InsertAfter))
+ // A catchswitch must be the only non-phi instruction in its basic
+ // block, so attempting to insert an instruction into such a block would
+ // produce invalid IR.
+ SetCFGHazardAfflicted(true);
+ } else {
+ InsertAfter = std::next(Inst->getIterator());
+ }
+
+ if (InsertAfter != BB->end())
+ InsertAfter = skipDebugIntrinsics(InsertAfter);
+
+ InsertReverseInsertPt(&*InsertAfter);
+ };
+
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_Release:
+ case S_MovableRelease:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeqAndInsertReverseInsertPt(S_Use);
+ } else if (Seq == S_Release && IsUser(Class)) {
+ LLVM_DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
+ // Non-movable releases depend on any possible objc pointer use.
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ } else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
+ if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
+ LLVM_DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ }
+ }
+ break;
+ case S_Stop:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ LLVM_DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
+ SetSeq(S_Use);
+ }
+ break;
+ case S_CanRelease:
+ case S_Use:
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// TopDownPtrState
+//===----------------------------------------------------------------------===//
+
+bool TopDownPtrState::InitTopDown(ARCInstKind Kind, Instruction *I) {
+ bool NestingDetected = false;
+  // Don't do retain+release tracking for ARCInstKind::RetainRV, because it's
+  // better to let it remain as the first instruction after a call.
+ if (Kind != ARCInstKind::RetainRV) {
+    // Check whether we see two retains in a row on the same pointer. If so,
+    // make a note, and we'll circle back to revisit it after we've
+ // hopefully eliminated the second retain, which may allow us to
+ // eliminate the first retain too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (GetSeq() == S_Retain)
+ NestingDetected = true;
+
+ ResetSequenceProgress(S_Retain);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ InsertCall(I);
+ }
+
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache,
+ Instruction *Release) {
+ ClearKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+
+ MDNode *ReleaseMetadata =
+ Release->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+
+ switch (OldSeq) {
+ case S_Retain:
+ case S_CanRelease:
+ if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
+ ClearReverseInsertPts();
+ LLVM_FALLTHROUGH;
+ case S_Use:
+ SetReleaseMetadata(ReleaseMetadata);
+ SetTailCallRelease(cast<CallInst>(Release)->isTailCall());
+ return true;
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in bottom up state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible releases. Treat clang.arc.use as a releasing instruction
+ // to prevent sinking a retain past it.
if (!CanDecrementRefCount(Inst, Ptr, PA, Class) &&
- Class != ARCInstKind::IntrinsicUser)
- return false;
-
- LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- ClearKnownPositiveRefCount();
- switch (GetSeq()) {
- case S_Retain:
- SetSeq(S_CanRelease);
- assert(!HasReverseInsertPts());
- InsertReverseInsertPt(Inst);
-
- // One call can't cause a transition from S_Retain to S_CanRelease
- // and S_CanRelease to S_Use. If we've made the first transition,
- // we're done.
- return true;
- case S_Use:
- case S_CanRelease:
- case S_None:
- return false;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
- llvm_unreachable("covered switch is not covered!?");
-}
-
-void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- // Check for possible direct uses.
- switch (GetSeq()) {
- case S_CanRelease:
- if (!CanUse(Inst, Ptr, PA, Class))
- return;
- LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- SetSeq(S_Use);
- return;
- case S_Retain:
- case S_Use:
- case S_None:
- return;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
-}
+ Class != ARCInstKind::IntrinsicUser)
+ return false;
+
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ ClearKnownPositiveRefCount();
+ switch (GetSeq()) {
+ case S_Retain:
+ SetSeq(S_CanRelease);
+ assert(!HasReverseInsertPts());
+ InsertReverseInsertPt(Inst);
+
+ // One call can't cause a transition from S_Retain to S_CanRelease
+ // and S_CanRelease to S_Use. If we've made the first transition,
+ // we're done.
+ return true;
+ case S_Use:
+ case S_CanRelease:
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+ llvm_unreachable("covered switch is not covered!?");
+}
+
+void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_CanRelease:
+ if (!CanUse(Inst, Ptr, PA, Class))
+ return;
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeq(S_Use);
+ return;
+ case S_Retain:
+ case S_Use:
+ case S_None:
+ return;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+}
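
The bottom-up and top-down walks restored above are, at heart, small per-pointer state machines over the Sequence values declared in PtrState.h: in the bottom-up direction a release starts tracking (S_Release or S_MovableRelease), a use of the pointer advances to S_Use (or S_Stop for a precise release), a possible reference-count decrement advances to S_CanRelease, and a matching retain closes the candidate pair. A minimal standalone sketch of just the bottom-up flow follows; the Sequence names mirror the header, but the Event type and the driver are invented purely for illustration and are not part of this patch.

// Minimal, self-contained illustration of the bottom-up Sequence walk.
// Only the Sequence names correspond to PtrState.h; Event and the driver
// below are hypothetical.
#include <cstdio>

enum Sequence { S_None, S_Retain, S_CanRelease, S_Use, S_Stop, S_Release, S_MovableRelease };

enum class Event { Release, ImpreciseRelease, Use, MayDecrementRefCount, Retain };

struct ToyBottomUpState {
  Sequence Seq = S_None;
  bool PairFound = false;

  void visit(Event E) {
    switch (E) {
    case Event::Release:          Seq = S_Release;        break; // start tracking a precise release
    case Event::ImpreciseRelease: Seq = S_MovableRelease; break; // !clang.imprecise_release
    case Event::Use:              // the pointer is read or passed somewhere
      if (Seq == S_Release || Seq == S_MovableRelease || Seq == S_Stop)
        Seq = S_Use;
      break;
    case Event::MayDecrementRefCount: // a call that might drop the ref count
      if (Seq == S_Use)
        Seq = S_CanRelease;
      break;
    case Event::Retain:           // mirrors MatchWithRetain(): close the pair
      PairFound = (Seq != S_None);
      Seq = S_None;
      break;
    }
  }
};

int main() {
  // Bottom-up over "retain p; use(p); release p": release first, retain last.
  const Event Events[] = {Event::Release, Event::Use, Event::Retain};
  ToyBottomUpState St;
  for (Event E : Events)
    St.visit(E);
  std::printf("retain/release pair candidate: %s\n", St.PairFound ? "yes" : "no");
}

The real pass layers provenance analysis, known-safety tracking, and CFG merging on top of this skeleton; the sketch only shows the sequence transitions themselves.
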
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h
index e6856ba615..66614c06cb 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h
@@ -1,212 +1,212 @@
-//===- PtrState.h - ARC State for a Ptr -------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains declarations for the ARC state associated with a ptr. It
-// is only used by the ARC Sequence Dataflow computation. By separating this
-// from the actual dataflow, it is easier to consider the mechanics of the ARC
-// optimization separate from the actual predicates being used.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-
-class BasicBlock;
-class Instruction;
-class MDNode;
-class raw_ostream;
-class Value;
-
-namespace objcarc {
-
-class ARCMDKindCache;
-class ProvenanceAnalysis;
-
-/// \enum Sequence
-///
-/// A sequence of states that a pointer may go through in which an
-/// objc_retain and objc_release are actually needed.
-enum Sequence {
- S_None,
- S_Retain, ///< objc_retain(x).
- S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
- S_Use, ///< any use of x.
- S_Stop, ///< like S_Release, but code motion is stopped.
- S_Release, ///< objc_release(x).
- S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
-};
-
-raw_ostream &operator<<(raw_ostream &OS,
- const Sequence S) LLVM_ATTRIBUTE_UNUSED;
-
-/// Unidirectional information about either a
-/// retain-decrement-use-release sequence or release-use-decrement-retain
-/// reverse sequence.
-struct RRInfo {
- /// After an objc_retain, the reference count of the referenced
- /// object is known to be positive. Similarly, before an objc_release, the
- /// reference count of the referenced object is known to be positive. If
- /// there are retain-release pairs in code regions where the retain count
- /// is known to be positive, they can be eliminated, regardless of any side
- /// effects between them.
- ///
- /// Also, a retain+release pair nested within another retain+release
-  /// pair, all on the same known pointer value, can be eliminated, regardless
- /// of any intervening side effects.
- ///
- /// KnownSafe is true when either of these conditions is satisfied.
- bool KnownSafe = false;
-
-  /// True if the objc_release calls are all marked with the "tail" keyword.
- bool IsTailCallRelease = false;
-
- /// If the Calls are objc_release calls and they all have a
- /// clang.imprecise_release tag, this is the metadata tag.
- MDNode *ReleaseMetadata = nullptr;
-
- /// For a top-down sequence, the set of objc_retains or
- /// objc_retainBlocks. For bottom-up, the set of objc_releases.
- SmallPtrSet<Instruction *, 2> Calls;
-
- /// The set of optimal insert positions for moving calls in the opposite
- /// sequence.
- SmallPtrSet<Instruction *, 2> ReverseInsertPts;
-
- /// If this is true, we cannot perform code motion but can still remove
- /// retain/release pairs.
- bool CFGHazardAfflicted = false;
-
- RRInfo() = default;
-
- void clear();
-
- /// Conservatively merge the two RRInfo. Returns true if a partial merge has
- /// occurred, false otherwise.
- bool Merge(const RRInfo &Other);
-};
-
-/// This class summarizes several per-pointer runtime properties which
-/// are propagated through the flow graph.
-class PtrState {
-protected:
- /// True if the reference count is known to be incremented.
- bool KnownPositiveRefCount = false;
-
- /// True if we've seen an opportunity for partial RR elimination, such as
- /// pushing calls into a CFG triangle or into one side of a CFG diamond.
- bool Partial = false;
-
- /// The current position in the sequence.
- unsigned char Seq : 8;
-
- /// Unidirectional information about the current sequence.
- RRInfo RRI;
-
- PtrState() : Seq(S_None) {}
-
-public:
- bool IsKnownSafe() const { return RRI.KnownSafe; }
-
- void SetKnownSafe(const bool NewValue) { RRI.KnownSafe = NewValue; }
-
- bool IsTailCallRelease() const { return RRI.IsTailCallRelease; }
-
- void SetTailCallRelease(const bool NewValue) {
- RRI.IsTailCallRelease = NewValue;
- }
-
- bool IsTrackingImpreciseReleases() const {
- return RRI.ReleaseMetadata != nullptr;
- }
-
- const MDNode *GetReleaseMetadata() const { return RRI.ReleaseMetadata; }
-
- void SetReleaseMetadata(MDNode *NewValue) { RRI.ReleaseMetadata = NewValue; }
-
- bool IsCFGHazardAfflicted() const { return RRI.CFGHazardAfflicted; }
-
- void SetCFGHazardAfflicted(const bool NewValue) {
- RRI.CFGHazardAfflicted = NewValue;
- }
-
- void SetKnownPositiveRefCount();
- void ClearKnownPositiveRefCount();
-
- bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; }
-
- void SetSeq(Sequence NewSeq);
-
- Sequence GetSeq() const { return static_cast<Sequence>(Seq); }
-
- void ClearSequenceProgress() { ResetSequenceProgress(S_None); }
-
- void ResetSequenceProgress(Sequence NewSeq);
- void Merge(const PtrState &Other, bool TopDown);
-
- void InsertCall(Instruction *I) { RRI.Calls.insert(I); }
-
- void InsertReverseInsertPt(Instruction *I) { RRI.ReverseInsertPts.insert(I); }
-
- void ClearReverseInsertPts() { RRI.ReverseInsertPts.clear(); }
-
- bool HasReverseInsertPts() const { return !RRI.ReverseInsertPts.empty(); }
-
- const RRInfo &GetRRInfo() const { return RRI; }
-};
-
-struct BottomUpPtrState : PtrState {
- BottomUpPtrState() = default;
-
- /// (Re-)Initialize this bottom up pointer returning true if we detected a
- /// pointer with nested releases.
- bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I);
-
-  /// Return true if this set of releases can be paired with a retain. Modifies
- /// state appropriately to reflect that the matching occurred if it is
- /// successful.
- ///
- /// It is assumed that one has already checked that the RCIdentity of the
- /// retain and the RCIdentity of this ptr state are the same.
- bool MatchWithRetain();
-
- void HandlePotentialUse(BasicBlock *BB, Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
- bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-};
-
-struct TopDownPtrState : PtrState {
- TopDownPtrState() = default;
-
-  /// (Re-)Initialize this top down pointer, returning true if we detected a
-  /// pointer with nested retains.
- bool InitTopDown(ARCInstKind Kind, Instruction *I);
-
- /// Return true if this set of retains can be paired with the given
- /// release. Modifies state appropriately to reflect that the matching
- /// occurred.
- bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release);
-
- void HandlePotentialUse(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-
- bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-};
-
-} // end namespace objcarc
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+//===- PtrState.h - ARC State for a Ptr -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declarations for the ARC state associated with a ptr. It
+// is only used by the ARC Sequence Dataflow computation. By separating this
+// from the actual dataflow, it is easier to consider the mechanics of the ARC
+// optimization separate from the actual predicates being used.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class BasicBlock;
+class Instruction;
+class MDNode;
+class raw_ostream;
+class Value;
+
+namespace objcarc {
+
+class ARCMDKindCache;
+class ProvenanceAnalysis;
+
+/// \enum Sequence
+///
+/// A sequence of states that a pointer may go through in which an
+/// objc_retain and objc_release are actually needed.
+enum Sequence {
+ S_None,
+ S_Retain, ///< objc_retain(x).
+ S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
+ S_Use, ///< any use of x.
+ S_Stop, ///< like S_Release, but code motion is stopped.
+ S_Release, ///< objc_release(x).
+ S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
+};
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const Sequence S) LLVM_ATTRIBUTE_UNUSED;
+
+/// Unidirectional information about either a
+/// retain-decrement-use-release sequence or release-use-decrement-retain
+/// reverse sequence.
+struct RRInfo {
+ /// After an objc_retain, the reference count of the referenced
+ /// object is known to be positive. Similarly, before an objc_release, the
+ /// reference count of the referenced object is known to be positive. If
+ /// there are retain-release pairs in code regions where the retain count
+ /// is known to be positive, they can be eliminated, regardless of any side
+ /// effects between them.
+ ///
+ /// Also, a retain+release pair nested within another retain+release
+  /// pair, all on the same known pointer value, can be eliminated, regardless
+ /// of any intervening side effects.
+ ///
+ /// KnownSafe is true when either of these conditions is satisfied.
+ bool KnownSafe = false;
+
+  /// True if the objc_release calls are all marked with the "tail" keyword.
+ bool IsTailCallRelease = false;
+
+ /// If the Calls are objc_release calls and they all have a
+ /// clang.imprecise_release tag, this is the metadata tag.
+ MDNode *ReleaseMetadata = nullptr;
+
+ /// For a top-down sequence, the set of objc_retains or
+ /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+ SmallPtrSet<Instruction *, 2> Calls;
+
+ /// The set of optimal insert positions for moving calls in the opposite
+ /// sequence.
+ SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+ /// If this is true, we cannot perform code motion but can still remove
+ /// retain/release pairs.
+ bool CFGHazardAfflicted = false;
+
+ RRInfo() = default;
+
+ void clear();
+
+ /// Conservatively merge the two RRInfo. Returns true if a partial merge has
+ /// occurred, false otherwise.
+ bool Merge(const RRInfo &Other);
+};
+
+/// This class summarizes several per-pointer runtime properties which
+/// are propagated through the flow graph.
+class PtrState {
+protected:
+ /// True if the reference count is known to be incremented.
+ bool KnownPositiveRefCount = false;
+
+ /// True if we've seen an opportunity for partial RR elimination, such as
+ /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+ bool Partial = false;
+
+ /// The current position in the sequence.
+ unsigned char Seq : 8;
+
+ /// Unidirectional information about the current sequence.
+ RRInfo RRI;
+
+ PtrState() : Seq(S_None) {}
+
+public:
+ bool IsKnownSafe() const { return RRI.KnownSafe; }
+
+ void SetKnownSafe(const bool NewValue) { RRI.KnownSafe = NewValue; }
+
+ bool IsTailCallRelease() const { return RRI.IsTailCallRelease; }
+
+ void SetTailCallRelease(const bool NewValue) {
+ RRI.IsTailCallRelease = NewValue;
+ }
+
+ bool IsTrackingImpreciseReleases() const {
+ return RRI.ReleaseMetadata != nullptr;
+ }
+
+ const MDNode *GetReleaseMetadata() const { return RRI.ReleaseMetadata; }
+
+ void SetReleaseMetadata(MDNode *NewValue) { RRI.ReleaseMetadata = NewValue; }
+
+ bool IsCFGHazardAfflicted() const { return RRI.CFGHazardAfflicted; }
+
+ void SetCFGHazardAfflicted(const bool NewValue) {
+ RRI.CFGHazardAfflicted = NewValue;
+ }
+
+ void SetKnownPositiveRefCount();
+ void ClearKnownPositiveRefCount();
+
+ bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; }
+
+ void SetSeq(Sequence NewSeq);
+
+ Sequence GetSeq() const { return static_cast<Sequence>(Seq); }
+
+ void ClearSequenceProgress() { ResetSequenceProgress(S_None); }
+
+ void ResetSequenceProgress(Sequence NewSeq);
+ void Merge(const PtrState &Other, bool TopDown);
+
+ void InsertCall(Instruction *I) { RRI.Calls.insert(I); }
+
+ void InsertReverseInsertPt(Instruction *I) { RRI.ReverseInsertPts.insert(I); }
+
+ void ClearReverseInsertPts() { RRI.ReverseInsertPts.clear(); }
+
+ bool HasReverseInsertPts() const { return !RRI.ReverseInsertPts.empty(); }
+
+ const RRInfo &GetRRInfo() const { return RRI; }
+};
+
+struct BottomUpPtrState : PtrState {
+ BottomUpPtrState() = default;
+
+ /// (Re-)Initialize this bottom up pointer returning true if we detected a
+ /// pointer with nested releases.
+ bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I);
+
+  /// Return true if this set of releases can be paired with a retain. Modifies
+ /// state appropriately to reflect that the matching occurred if it is
+ /// successful.
+ ///
+ /// It is assumed that one has already checked that the RCIdentity of the
+ /// retain and the RCIdentity of this ptr state are the same.
+ bool MatchWithRetain();
+
+ void HandlePotentialUse(BasicBlock *BB, Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+struct TopDownPtrState : PtrState {
+ TopDownPtrState() = default;
+
+  /// (Re-)Initialize this top down pointer, returning true if we detected a
+  /// pointer with nested retains.
+ bool InitTopDown(ARCInstKind Kind, Instruction *I);
+
+ /// Return true if this set of retains can be paired with the given
+ /// release. Modifies state appropriately to reflect that the matching
+ /// occurred.
+ bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release);
+
+ void HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+} // end namespace objcarc
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
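
The RRInfo block restored above is the per-pointer bookkeeping that must be combined whenever two CFG paths reach the same pointer with different histories. As a rough mental model (the authoritative rules are in RRInfo::Merge in PtrState.cpp, outside this hunk, and may differ in detail), boolean safety facts merge pessimistically while the call and insertion-point sets are unioned. The simplified types below are invented solely to make that shape concrete.

// Hypothetical, simplified model of merging per-pointer info at a CFG join.
// Field names follow RRInfo, but the rules shown here are illustrative and
// may not match the exact semantics of RRInfo::Merge.
#include <cstdio>
#include <set>
#include <string>

struct ToyRRInfo {
  bool KnownSafe = false;
  bool IsTailCallRelease = false;
  bool CFGHazardAfflicted = false;
  std::string ReleaseMetadata;        // empty == no !clang.imprecise_release tag
  std::set<int> Calls;                // IDs of the retains/releases seen
  std::set<int> ReverseInsertPts;     // IDs of candidate insertion points

  // Conservatively merge Other into *this; return true if information was lost.
  bool merge(const ToyRRInfo &Other) {
    bool Partial = false;
    KnownSafe &= Other.KnownSafe;                   // safe only if both paths are safe
    IsTailCallRelease &= Other.IsTailCallRelease;
    CFGHazardAfflicted |= Other.CFGHazardAfflicted; // hazardous if either path is
    if (ReleaseMetadata != Other.ReleaseMetadata) { // tags disagree: drop the tag
      ReleaseMetadata.clear();
      Partial = true;
    }
    Calls.insert(Other.Calls.begin(), Other.Calls.end());
    ReverseInsertPts.insert(Other.ReverseInsertPts.begin(),
                            Other.ReverseInsertPts.end());
    return Partial;
  }
};

int main() {
  ToyRRInfo A, B;
  A.KnownSafe = true; A.ReleaseMetadata = "clang.imprecise_release"; A.Calls = {1};
  B.KnownSafe = true; /* B saw a precise release */                  B.Calls = {2};
  bool Partial = A.merge(B);
  std::printf("KnownSafe=%d Partial=%d Calls=%zu\n", A.KnownSafe, Partial, A.Calls.size());
}
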
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make
index 7c6fa10925..727ec42c3f 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make
@@ -1,43 +1,43 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/ObjCARC
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- DependencyAnalysis.cpp
- ObjCARC.cpp
- ObjCARCAPElim.cpp
- ObjCARCContract.cpp
- ObjCARCExpand.cpp
- ObjCARCOpts.cpp
- ProvenanceAnalysis.cpp
- ProvenanceAnalysisEvaluator.cpp
- PtrState.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ DependencyAnalysis.cpp
+ ObjCARC.cpp
+ ObjCARCAPElim.cpp
+ ObjCARCContract.cpp
+ ObjCARCExpand.cpp
+ ObjCARCOpts.cpp
+ ProvenanceAnalysis.cpp
+ ProvenanceAnalysisEvaluator.cpp
+ PtrState.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp
index 768fab6e7d..ce4e5e575f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp
@@ -1,747 +1,747 @@
-//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Aggressive Dead Code Elimination pass. This pass
-// optimistically assumes that all instructions are dead until proven otherwise,
-// allowing it to eliminate dead computations that other DCE passes do not
-// catch, particularly involving loop computations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/ADCE.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include <cassert>
-#include <cstddef>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "adce"
-
-STATISTIC(NumRemoved, "Number of instructions removed");
-STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
-
-// This is a temporary option until we change the interface to this pass based
-// on optimization level.
-static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
- cl::init(true), cl::Hidden);
-
-// This option enables removing of may-be-infinite loops which have no other
-// effect.
-static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
- cl::Hidden);
-
-namespace {
-
-/// Information about Instructions
-struct InstInfoType {
- /// True if the associated instruction is live.
- bool Live = false;
-
- /// Quick access to information for block containing associated Instruction.
- struct BlockInfoType *Block = nullptr;
-};
-
-/// Information about basic blocks relevant to dead code elimination.
-struct BlockInfoType {
-  /// True when this block contains a live instruction.
- bool Live = false;
-
- /// True when this block ends in an unconditional branch.
- bool UnconditionalBranch = false;
-
- /// True when this block is known to have live PHI nodes.
- bool HasLivePhiNodes = false;
-
- /// Control dependence sources need to be live for this block.
- bool CFLive = false;
-
- /// Quick access to the LiveInfo for the terminator,
- /// holds the value &InstInfo[Terminator]
- InstInfoType *TerminatorLiveInfo = nullptr;
-
- /// Corresponding BasicBlock.
- BasicBlock *BB = nullptr;
-
- /// Cache of BB->getTerminator().
- Instruction *Terminator = nullptr;
-
- /// Post-order numbering of reverse control flow graph.
- unsigned PostOrder;
-
- bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
-};
-
-class AggressiveDeadCodeElimination {
- Function &F;
-
- // ADCE does not use DominatorTree per se, but it updates it to preserve the
- // analysis.
- DominatorTree *DT;
- PostDominatorTree &PDT;
-
- /// Mapping of blocks to associated information, an element in BlockInfoVec.
- /// Use MapVector to get deterministic iteration order.
- MapVector<BasicBlock *, BlockInfoType> BlockInfo;
- bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
-
- /// Mapping of instructions to associated information.
- DenseMap<Instruction *, InstInfoType> InstInfo;
- bool isLive(Instruction *I) { return InstInfo[I].Live; }
-
- /// Instructions known to be live where we need to mark
- /// reaching definitions as live.
- SmallVector<Instruction *, 128> Worklist;
-
- /// Debug info scopes around a live instruction.
- SmallPtrSet<const Metadata *, 32> AliveScopes;
-
-  /// Set of blocks not known to have live terminators.
- SmallSetVector<BasicBlock *, 16> BlocksWithDeadTerminators;
-
-  /// The set of blocks whose control dependence sources we have
-  /// determined must be live and which have not yet had those
-  /// dependences analyzed.
- SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
-
- /// Set up auxiliary data structures for Instructions and BasicBlocks and
-  /// initialize the Worklist to the set of must-be-live Instructions.
- void initialize();
-
- /// Return true for operations which are always treated as live.
- bool isAlwaysLive(Instruction &I);
-
- /// Return true for instrumentation instructions for value profiling.
- bool isInstrumentsConstant(Instruction &I);
-
- /// Propagate liveness to reaching definitions.
- void markLiveInstructions();
-
- /// Mark an instruction as live.
- void markLive(Instruction *I);
-
- /// Mark a block as live.
- void markLive(BlockInfoType &BB);
- void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
-
- /// Mark terminators of control predecessors of a PHI node live.
- void markPhiLive(PHINode *PN);
-
- /// Record the Debug Scopes which surround live debug information.
- void collectLiveScopes(const DILocalScope &LS);
- void collectLiveScopes(const DILocation &DL);
-
- /// Analyze dead branches to find those whose branches are the sources
- /// of control dependences impacting a live block. Those branches are
- /// marked live.
- void markLiveBranchesFromControlDependences();
-
-  /// Remove instructions not marked live; return true if any instruction was
- /// removed.
- bool removeDeadInstructions();
-
- /// Identify connected sections of the control flow graph which have
- /// dead terminators and rewrite the control flow graph to remove them.
- bool updateDeadRegions();
-
- /// Set the BlockInfo::PostOrder field based on a post-order
- /// numbering of the reverse control flow graph.
- void computeReversePostOrder();
-
- /// Make the terminator of this block an unconditional branch to \p Target.
- void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
-
-public:
- AggressiveDeadCodeElimination(Function &F, DominatorTree *DT,
- PostDominatorTree &PDT)
- : F(F), DT(DT), PDT(PDT) {}
-
- bool performDeadCodeElimination();
-};
-
-} // end anonymous namespace
-
-bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
- initialize();
- markLiveInstructions();
- return removeDeadInstructions();
-}
-
-static bool isUnconditionalBranch(Instruction *Term) {
- auto *BR = dyn_cast<BranchInst>(Term);
- return BR && BR->isUnconditional();
-}
-
-void AggressiveDeadCodeElimination::initialize() {
- auto NumBlocks = F.size();
-
- // We will have an entry in the map for each block so we grow the
- // structure to twice that size to keep the load factor low in the hash table.
- BlockInfo.reserve(NumBlocks);
- size_t NumInsts = 0;
-
- // Iterate over blocks and initialize BlockInfoVec entries, count
- // instructions to size the InstInfo hash table.
- for (auto &BB : F) {
- NumInsts += BB.size();
- auto &Info = BlockInfo[&BB];
- Info.BB = &BB;
- Info.Terminator = BB.getTerminator();
- Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
- }
-
- // Initialize instruction map and set pointers to block info.
- InstInfo.reserve(NumInsts);
- for (auto &BBInfo : BlockInfo)
- for (Instruction &I : *BBInfo.second.BB)
- InstInfo[&I].Block = &BBInfo.second;
-
- // Since BlockInfoVec holds pointers into InstInfo and vice-versa, we may not
- // add any more elements to either after this point.
- for (auto &BBInfo : BlockInfo)
- BBInfo.second.TerminatorLiveInfo = &InstInfo[BBInfo.second.Terminator];
-
- // Collect the set of "root" instructions that are known live.
- for (Instruction &I : instructions(F))
- if (isAlwaysLive(I))
- markLive(&I);
-
- if (!RemoveControlFlowFlag)
- return;
-
- if (!RemoveLoops) {
- // This stores state for the depth-first iterator. In addition
- // to recording which nodes have been visited we also record whether
- // a node is currently on the "stack" of active ancestors of the current
- // node.
- using StatusMap = DenseMap<BasicBlock *, bool>;
-
- class DFState : public StatusMap {
- public:
- std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
- return StatusMap::insert(std::make_pair(BB, true));
- }
-
- // Invoked after we have visited all children of a node.
- void completed(BasicBlock *BB) { (*this)[BB] = false; }
-
- // Return true if \p BB is currently on the active stack
- // of ancestors.
- bool onStack(BasicBlock *BB) {
- auto Iter = find(BB);
- return Iter != end() && Iter->second;
- }
- } State;
-
- State.reserve(F.size());
- // Iterate over blocks in depth-first pre-order and
- // treat all edges to a block already seen as loop back edges
-    // and mark the branch live if there is a back edge.
- for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
- Instruction *Term = BB->getTerminator();
- if (isLive(Term))
- continue;
-
- for (auto *Succ : successors(BB))
- if (State.onStack(Succ)) {
- // back edge....
- markLive(Term);
- break;
- }
- }
- }
-
- // Mark blocks live if there is no path from the block to a
- // return of the function.
- // We do this by seeing which of the postdomtree root children exit the
- // program, and for all others, mark the subtree live.
- for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) {
- auto *BB = PDTChild->getBlock();
- auto &Info = BlockInfo[BB];
- // Real function return
- if (isa<ReturnInst>(Info.Terminator)) {
- LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
- << '\n';);
- continue;
- }
-
- // This child is something else, like an infinite loop.
- for (auto DFNode : depth_first(PDTChild))
- markLive(BlockInfo[DFNode->getBlock()].Terminator);
- }
-
- // Treat the entry block as always live
- auto *BB = &F.getEntryBlock();
- auto &EntryInfo = BlockInfo[BB];
- EntryInfo.Live = true;
- if (EntryInfo.UnconditionalBranch)
- markLive(EntryInfo.Terminator);
-
- // Build initial collection of blocks with dead terminators
- for (auto &BBInfo : BlockInfo)
- if (!BBInfo.second.terminatorIsLive())
- BlocksWithDeadTerminators.insert(BBInfo.second.BB);
-}
-
-bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
- // TODO -- use llvm::isInstructionTriviallyDead
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass. This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <cstddef>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "adce"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
+
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
+static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
+ cl::init(true), cl::Hidden);
+
+// This option enables removing of may-be-infinite loops which have no other
+// effect.
+static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
+ cl::Hidden);
+
+namespace {
+
+/// Information about Instructions
+struct InstInfoType {
+ /// True if the associated instruction is live.
+ bool Live = false;
+
+ /// Quick access to information for block containing associated Instruction.
+ struct BlockInfoType *Block = nullptr;
+};
+
+/// Information about basic blocks relevant to dead code elimination.
+struct BlockInfoType {
+  /// True when this block contains a live instruction.
+ bool Live = false;
+
+ /// True when this block ends in an unconditional branch.
+ bool UnconditionalBranch = false;
+
+ /// True when this block is known to have live PHI nodes.
+ bool HasLivePhiNodes = false;
+
+ /// Control dependence sources need to be live for this block.
+ bool CFLive = false;
+
+ /// Quick access to the LiveInfo for the terminator,
+ /// holds the value &InstInfo[Terminator]
+ InstInfoType *TerminatorLiveInfo = nullptr;
+
+ /// Corresponding BasicBlock.
+ BasicBlock *BB = nullptr;
+
+ /// Cache of BB->getTerminator().
+ Instruction *Terminator = nullptr;
+
+ /// Post-order numbering of reverse control flow graph.
+ unsigned PostOrder;
+
+ bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
+};
+
+class AggressiveDeadCodeElimination {
+ Function &F;
+
+ // ADCE does not use DominatorTree per se, but it updates it to preserve the
+ // analysis.
+ DominatorTree *DT;
+ PostDominatorTree &PDT;
+
+ /// Mapping of blocks to associated information, an element in BlockInfoVec.
+ /// Use MapVector to get deterministic iteration order.
+ MapVector<BasicBlock *, BlockInfoType> BlockInfo;
+ bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
+
+ /// Mapping of instructions to associated information.
+ DenseMap<Instruction *, InstInfoType> InstInfo;
+ bool isLive(Instruction *I) { return InstInfo[I].Live; }
+
+ /// Instructions known to be live where we need to mark
+ /// reaching definitions as live.
+ SmallVector<Instruction *, 128> Worklist;
+
+ /// Debug info scopes around a live instruction.
+ SmallPtrSet<const Metadata *, 32> AliveScopes;
+
+  /// Set of blocks not known to have live terminators.
+ SmallSetVector<BasicBlock *, 16> BlocksWithDeadTerminators;
+
+  /// The set of blocks whose control dependence sources we have
+  /// determined must be live and which have not yet had those
+  /// dependences analyzed.
+ SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
+
+ /// Set up auxiliary data structures for Instructions and BasicBlocks and
+  /// initialize the Worklist to the set of must-be-live Instructions.
+ void initialize();
+
+ /// Return true for operations which are always treated as live.
+ bool isAlwaysLive(Instruction &I);
+
+ /// Return true for instrumentation instructions for value profiling.
+ bool isInstrumentsConstant(Instruction &I);
+
+ /// Propagate liveness to reaching definitions.
+ void markLiveInstructions();
+
+ /// Mark an instruction as live.
+ void markLive(Instruction *I);
+
+ /// Mark a block as live.
+ void markLive(BlockInfoType &BB);
+ void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
+
+ /// Mark terminators of control predecessors of a PHI node live.
+ void markPhiLive(PHINode *PN);
+
+ /// Record the Debug Scopes which surround live debug information.
+ void collectLiveScopes(const DILocalScope &LS);
+ void collectLiveScopes(const DILocation &DL);
+
+ /// Analyze dead branches to find those whose branches are the sources
+ /// of control dependences impacting a live block. Those branches are
+ /// marked live.
+ void markLiveBranchesFromControlDependences();
+
+  /// Remove instructions not marked live; return true if any instruction was
+ /// removed.
+ bool removeDeadInstructions();
+
+ /// Identify connected sections of the control flow graph which have
+ /// dead terminators and rewrite the control flow graph to remove them.
+ bool updateDeadRegions();
+
+ /// Set the BlockInfo::PostOrder field based on a post-order
+ /// numbering of the reverse control flow graph.
+ void computeReversePostOrder();
+
+ /// Make the terminator of this block an unconditional branch to \p Target.
+ void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
+
+public:
+ AggressiveDeadCodeElimination(Function &F, DominatorTree *DT,
+ PostDominatorTree &PDT)
+ : F(F), DT(DT), PDT(PDT) {}
+
+ bool performDeadCodeElimination();
+};
+
+} // end anonymous namespace
+
+bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
+ initialize();
+ markLiveInstructions();
+ return removeDeadInstructions();
+}
+
+static bool isUnconditionalBranch(Instruction *Term) {
+ auto *BR = dyn_cast<BranchInst>(Term);
+ return BR && BR->isUnconditional();
+}
+
+void AggressiveDeadCodeElimination::initialize() {
+ auto NumBlocks = F.size();
+
+ // We will have an entry in the map for each block so we grow the
+ // structure to twice that size to keep the load factor low in the hash table.
+ BlockInfo.reserve(NumBlocks);
+ size_t NumInsts = 0;
+
+ // Iterate over blocks and initialize BlockInfoVec entries, count
+ // instructions to size the InstInfo hash table.
+ for (auto &BB : F) {
+ NumInsts += BB.size();
+ auto &Info = BlockInfo[&BB];
+ Info.BB = &BB;
+ Info.Terminator = BB.getTerminator();
+ Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
+ }
+
+ // Initialize instruction map and set pointers to block info.
+ InstInfo.reserve(NumInsts);
+ for (auto &BBInfo : BlockInfo)
+ for (Instruction &I : *BBInfo.second.BB)
+ InstInfo[&I].Block = &BBInfo.second;
+
+ // Since BlockInfoVec holds pointers into InstInfo and vice-versa, we may not
+ // add any more elements to either after this point.
+ for (auto &BBInfo : BlockInfo)
+ BBInfo.second.TerminatorLiveInfo = &InstInfo[BBInfo.second.Terminator];
+
+ // Collect the set of "root" instructions that are known live.
+ for (Instruction &I : instructions(F))
+ if (isAlwaysLive(I))
+ markLive(&I);
+
+ if (!RemoveControlFlowFlag)
+ return;
+
+ if (!RemoveLoops) {
+ // This stores state for the depth-first iterator. In addition
+ // to recording which nodes have been visited we also record whether
+ // a node is currently on the "stack" of active ancestors of the current
+ // node.
+ using StatusMap = DenseMap<BasicBlock *, bool>;
+
+ class DFState : public StatusMap {
+ public:
+ std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
+ return StatusMap::insert(std::make_pair(BB, true));
+ }
+
+ // Invoked after we have visited all children of a node.
+ void completed(BasicBlock *BB) { (*this)[BB] = false; }
+
+ // Return true if \p BB is currently on the active stack
+ // of ancestors.
+ bool onStack(BasicBlock *BB) {
+ auto Iter = find(BB);
+ return Iter != end() && Iter->second;
+ }
+ } State;
+
+ State.reserve(F.size());
+ // Iterate over blocks in depth-first pre-order and
+ // treat all edges to a block already seen as loop back edges
+    // and mark the branch live if there is a back edge.
+ for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
+ Instruction *Term = BB->getTerminator();
+ if (isLive(Term))
+ continue;
+
+ for (auto *Succ : successors(BB))
+ if (State.onStack(Succ)) {
+ // back edge....
+ markLive(Term);
+ break;
+ }
+ }
+ }
+
+ // Mark blocks live if there is no path from the block to a
+ // return of the function.
+ // We do this by seeing which of the postdomtree root children exit the
+ // program, and for all others, mark the subtree live.
+ for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) {
+ auto *BB = PDTChild->getBlock();
+ auto &Info = BlockInfo[BB];
+ // Real function return
+ if (isa<ReturnInst>(Info.Terminator)) {
+ LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
+ << '\n';);
+ continue;
+ }
+
+ // This child is something else, like an infinite loop.
+ for (auto DFNode : depth_first(PDTChild))
+ markLive(BlockInfo[DFNode->getBlock()].Terminator);
+ }
+
+ // Treat the entry block as always live
+ auto *BB = &F.getEntryBlock();
+ auto &EntryInfo = BlockInfo[BB];
+ EntryInfo.Live = true;
+ if (EntryInfo.UnconditionalBranch)
+ markLive(EntryInfo.Terminator);
+
+ // Build initial collection of blocks with dead terminators
+ for (auto &BBInfo : BlockInfo)
+ if (!BBInfo.second.terminatorIsLive())
+ BlocksWithDeadTerminators.insert(BBInfo.second.BB);
+}
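
When loop removal is disabled (the default for -adce-remove-loops), initialize() keeps loop-closing branches alive by walking the CFG depth-first and treating any edge into a block that is still on the DFS stack as a back edge. Reduced to a self-contained sketch over a hypothetical adjacency-list graph, rather than LLVM's depth_first_ext and DFState machinery, the check looks roughly like this:

// Sketch of the back-edge test used to keep loop branches alive.
// The adjacency-list Graph is invented for illustration only.
#include <cstdio>
#include <vector>

using Graph = std::vector<std::vector<int>>; // successor lists per block

static void dfs(const Graph &G, int BB, std::vector<int> &Color,
                std::vector<bool> &BranchForcedLive) {
  Color[BB] = 1; // on the active DFS stack
  for (int Succ : G[BB]) {
    if (Color[Succ] == 1)
      BranchForcedLive[BB] = true; // edge to an ancestor: a back edge
    else if (Color[Succ] == 0)
      dfs(G, Succ, Color, BranchForcedLive);
  }
  Color[BB] = 2; // finished
}

int main() {
  // Blocks: 0 -> 1 -> 2 -> {1, 3}; the edge 2 -> 1 closes a loop.
  Graph G = {{1}, {2}, {1, 3}, {}};
  std::vector<int> Color(G.size(), 0);
  std::vector<bool> Live(G.size(), false);
  dfs(G, 0, Color, Live);
  for (unsigned BB = 0; BB < G.size(); ++BB)
    std::printf("block %u terminator forced live: %s\n", BB, Live[BB] ? "yes" : "no");
}
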
+
+bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
+ // TODO -- use llvm::isInstructionTriviallyDead
if (I.isEHPad() || I.mayHaveSideEffects() || !I.willReturn()) {
- // Skip any value profile instrumentation calls if they are
- // instrumenting constants.
- if (isInstrumentsConstant(I))
- return false;
- return true;
- }
- if (!I.isTerminator())
- return false;
- if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
- return false;
- return true;
-}
-
-// Check if this instruction is a runtime call for value profiling and
-// if it's instrumenting a constant.
-bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
- // TODO -- move this test into llvm::isInstructionTriviallyDead
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- if (Function *Callee = CI->getCalledFunction())
- if (Callee->getName().equals(getInstrProfValueProfFuncName()))
- if (isa<Constant>(CI->getArgOperand(0)))
- return true;
- return false;
-}
-
-void AggressiveDeadCodeElimination::markLiveInstructions() {
- // Propagate liveness backwards to operands.
- do {
- // Worklist holds newly discovered live instructions
- // where we need to mark the inputs as live.
- while (!Worklist.empty()) {
- Instruction *LiveInst = Worklist.pop_back_val();
- LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););
-
- for (Use &OI : LiveInst->operands())
- if (Instruction *Inst = dyn_cast<Instruction>(OI))
- markLive(Inst);
-
- if (auto *PN = dyn_cast<PHINode>(LiveInst))
- markPhiLive(PN);
- }
-
- // After data flow liveness has been identified, examine which branch
-    // decisions are required to ensure that live instructions are executed.
- markLiveBranchesFromControlDependences();
-
- } while (!Worklist.empty());
-}
-
-void AggressiveDeadCodeElimination::markLive(Instruction *I) {
- auto &Info = InstInfo[I];
- if (Info.Live)
- return;
-
- LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
- Info.Live = true;
- Worklist.push_back(I);
-
- // Collect the live debug info scopes attached to this instruction.
- if (const DILocation *DL = I->getDebugLoc())
- collectLiveScopes(*DL);
-
- // Mark the containing block live
- auto &BBInfo = *Info.Block;
- if (BBInfo.Terminator == I) {
- BlocksWithDeadTerminators.remove(BBInfo.BB);
- // For live terminators, mark destination blocks
-    // live to preserve these control flow edges.
- if (!BBInfo.UnconditionalBranch)
- for (auto *BB : successors(I->getParent()))
- markLive(BB);
- }
- markLive(BBInfo);
-}
-
-void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
- if (BBInfo.Live)
- return;
- LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
- BBInfo.Live = true;
- if (!BBInfo.CFLive) {
- BBInfo.CFLive = true;
- NewLiveBlocks.insert(BBInfo.BB);
- }
-
- // Mark unconditional branches at the end of live
- // blocks as live since there is no work to do for them later
- if (BBInfo.UnconditionalBranch)
- markLive(BBInfo.Terminator);
-}
-
-void AggressiveDeadCodeElimination::collectLiveScopes(const DILocalScope &LS) {
- if (!AliveScopes.insert(&LS).second)
- return;
-
- if (isa<DISubprogram>(LS))
- return;
-
- // Tail-recurse through the scope chain.
- collectLiveScopes(cast<DILocalScope>(*LS.getScope()));
-}
-
-void AggressiveDeadCodeElimination::collectLiveScopes(const DILocation &DL) {
- // Even though DILocations are not scopes, shove them into AliveScopes so we
- // don't revisit them.
- if (!AliveScopes.insert(&DL).second)
- return;
-
- // Collect live scopes from the scope chain.
- collectLiveScopes(*DL.getScope());
-
- // Tail-recurse through the inlined-at chain.
- if (const DILocation *IA = DL.getInlinedAt())
- collectLiveScopes(*IA);
-}
-
-void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
- auto &Info = BlockInfo[PN->getParent()];
- // Only need to check this once per block.
- if (Info.HasLivePhiNodes)
- return;
- Info.HasLivePhiNodes = true;
-
- // If a predecessor block is not live, mark it as control-flow live
- // which will trigger marking live branches upon which
- // that block is control dependent.
- for (auto *PredBB : predecessors(Info.BB)) {
- auto &Info = BlockInfo[PredBB];
- if (!Info.CFLive) {
- Info.CFLive = true;
- NewLiveBlocks.insert(PredBB);
- }
- }
-}
-
-void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
- if (BlocksWithDeadTerminators.empty())
- return;
-
- LLVM_DEBUG({
- dbgs() << "new live blocks:\n";
- for (auto *BB : NewLiveBlocks)
- dbgs() << "\t" << BB->getName() << '\n';
- dbgs() << "dead terminator blocks:\n";
- for (auto *BB : BlocksWithDeadTerminators)
- dbgs() << "\t" << BB->getName() << '\n';
- });
-
- // The dominance frontier of a live block X in the reverse
- // control graph is the set of blocks upon which X is control
- // dependent. The following sequence computes the set of blocks
- // which currently have dead terminators that are control
- // dependence sources of a block which is in NewLiveBlocks.
-
- const SmallPtrSet<BasicBlock *, 16> BWDT{
- BlocksWithDeadTerminators.begin(),
- BlocksWithDeadTerminators.end()
- };
- SmallVector<BasicBlock *, 32> IDFBlocks;
- ReverseIDFCalculator IDFs(PDT);
- IDFs.setDefiningBlocks(NewLiveBlocks);
- IDFs.setLiveInBlocks(BWDT);
- IDFs.calculate(IDFBlocks);
- NewLiveBlocks.clear();
-
- // Dead terminators which control live blocks are now marked live.
- for (auto *BB : IDFBlocks) {
- LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
- markLive(BB->getTerminator());
- }
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Routines to update the CFG and SSA information before removing dead code.
-//
-//===----------------------------------------------------------------------===//
-bool AggressiveDeadCodeElimination::removeDeadInstructions() {
- // Updates control and dataflow around dead blocks
- bool RegionsUpdated = updateDeadRegions();
-
- LLVM_DEBUG({
- for (Instruction &I : instructions(F)) {
- // Check if the instruction is alive.
- if (isLive(&I))
- continue;
-
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
- // Check if the scope of this variable location is alive.
- if (AliveScopes.count(DII->getDebugLoc()->getScope()))
- continue;
-
- // If intrinsic is pointing at a live SSA value, there may be an
- // earlier optimization bug: if we know the location of the variable,
- // why isn't the scope of the location alive?
- if (Value *V = DII->getVariableLocation())
- if (Instruction *II = dyn_cast<Instruction>(V))
- if (isLive(II))
- dbgs() << "Dropping debug info for " << *DII << "\n";
- }
- }
- });
-
- // The inverse of the live set is the dead set. These are those instructions
- // that have no side effects and do not influence the control flow or return
- // value of the function, and may therefore be deleted safely.
- // NOTE: We reuse the Worklist vector here for memory efficiency.
- for (Instruction &I : instructions(F)) {
- // Check if the instruction is alive.
- if (isLive(&I))
- continue;
-
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
- // Check if the scope of this variable location is alive.
- if (AliveScopes.count(DII->getDebugLoc()->getScope()))
- continue;
-
- // Fallthrough and drop the intrinsic.
- }
-
- // Prepare to delete.
- Worklist.push_back(&I);
- I.dropAllReferences();
- }
-
- for (Instruction *&I : Worklist) {
- ++NumRemoved;
- I->eraseFromParent();
- }
-
- return !Worklist.empty() || RegionsUpdated;
-}
-
-// A dead region is the set of dead blocks with a common live post-dominator.
-bool AggressiveDeadCodeElimination::updateDeadRegions() {
- LLVM_DEBUG({
- dbgs() << "final dead terminator blocks: " << '\n';
- for (auto *BB : BlocksWithDeadTerminators)
- dbgs() << '\t' << BB->getName()
- << (BlockInfo[BB].Live ? " LIVE\n" : "\n");
- });
-
- // Don't compute the post ordering unless we needed it.
- bool HavePostOrder = false;
- bool Changed = false;
-
- for (auto *BB : BlocksWithDeadTerminators) {
- auto &Info = BlockInfo[BB];
- if (Info.UnconditionalBranch) {
- InstInfo[Info.Terminator].Live = true;
- continue;
- }
-
- if (!HavePostOrder) {
- computeReversePostOrder();
- HavePostOrder = true;
- }
-
- // Add an unconditional branch to the successor closest to the
-    // end of the function, which ensures a path to the exit for each
- // live edge.
- BlockInfoType *PreferredSucc = nullptr;
- for (auto *Succ : successors(BB)) {
- auto *Info = &BlockInfo[Succ];
- if (!PreferredSucc || PreferredSucc->PostOrder < Info->PostOrder)
- PreferredSucc = Info;
- }
- assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
- "Failed to find safe successor for dead branch");
-
- // Collect removed successors to update the (Post)DominatorTrees.
- SmallPtrSet<BasicBlock *, 4> RemovedSuccessors;
- bool First = true;
- for (auto *Succ : successors(BB)) {
- if (!First || Succ != PreferredSucc->BB) {
- Succ->removePredecessor(BB);
- RemovedSuccessors.insert(Succ);
- } else
- First = false;
- }
- makeUnconditional(BB, PreferredSucc->BB);
-
- // Inform the dominators about the deleted CFG edges.
- SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
- for (auto *Succ : RemovedSuccessors) {
- // It might have happened that the same successor appeared multiple times
- // and the CFG edge wasn't really removed.
- if (Succ != PreferredSucc->BB) {
-        LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion "
- << BB->getName() << " -> " << Succ->getName()
- << "\n");
- DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
- }
- }
-
- DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
- .applyUpdates(DeletedEdges);
-
- NumBranchesRemoved += 1;
- Changed = true;
- }
-
- return Changed;
-}
-
-// reverse top-sort order
-void AggressiveDeadCodeElimination::computeReversePostOrder() {
-  // This provides a post-order numbering of the reverse control flow graph.
-  // Note that it is incomplete in the presence of infinite loops, but we don't
-  // need to number blocks which don't reach the end of the function since
-  // all branches in those blocks are forced live.
-
- // For each block without successors, extend the DFS from the block
- // backward through the graph
- SmallPtrSet<BasicBlock*, 16> Visited;
- unsigned PostOrder = 0;
- for (auto &BB : F) {
+ // Skip any value profile instrumentation calls if they are
+ // instrumenting constants.
+ if (isInstrumentsConstant(I))
+ return false;
+ return true;
+ }
+ if (!I.isTerminator())
+ return false;
+ if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
+ return false;
+ return true;
+}
+
+// Check if this instruction is a runtime call for value profiling and
+// if it's instrumenting a constant.
+bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
+ // TODO -- move this test into llvm::isInstructionTriviallyDead
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *Callee = CI->getCalledFunction())
+ if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+ if (isa<Constant>(CI->getArgOperand(0)))
+ return true;
+ return false;
+}
+
+void AggressiveDeadCodeElimination::markLiveInstructions() {
+ // Propagate liveness backwards to operands.
+ do {
+    // The Worklist holds newly discovered live instructions
+    // whose operands still need to be marked live.
+ while (!Worklist.empty()) {
+ Instruction *LiveInst = Worklist.pop_back_val();
+ LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+
+ for (Use &OI : LiveInst->operands())
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ markLive(Inst);
+
+ if (auto *PN = dyn_cast<PHINode>(LiveInst))
+ markPhiLive(PN);
+ }
+
+ // After data flow liveness has been identified, examine which branch
+    // decisions are required to ensure that live instructions are executed.
+ markLiveBranchesFromControlDependences();
+
+ } while (!Worklist.empty());
+}
+
+void AggressiveDeadCodeElimination::markLive(Instruction *I) {
+ auto &Info = InstInfo[I];
+ if (Info.Live)
+ return;
+
+ LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
+ Info.Live = true;
+ Worklist.push_back(I);
+
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = I->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Mark the containing block live
+ auto &BBInfo = *Info.Block;
+ if (BBInfo.Terminator == I) {
+ BlocksWithDeadTerminators.remove(BBInfo.BB);
+ // For live terminators, mark destination blocks
+    // live to preserve these control flow edges.
+ if (!BBInfo.UnconditionalBranch)
+ for (auto *BB : successors(I->getParent()))
+ markLive(BB);
+ }
+ markLive(BBInfo);
+}
+
+void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
+ if (BBInfo.Live)
+ return;
+ LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ BBInfo.Live = true;
+ if (!BBInfo.CFLive) {
+ BBInfo.CFLive = true;
+ NewLiveBlocks.insert(BBInfo.BB);
+ }
+
+ // Mark unconditional branches at the end of live
+ // blocks as live since there is no work to do for them later
+ if (BBInfo.UnconditionalBranch)
+ markLive(BBInfo.Terminator);
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocalScope &LS) {
+ if (!AliveScopes.insert(&LS).second)
+ return;
+
+ if (isa<DISubprogram>(LS))
+ return;
+
+ // Tail-recurse through the scope chain.
+ collectLiveScopes(cast<DILocalScope>(*LS.getScope()));
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocation &DL) {
+ // Even though DILocations are not scopes, shove them into AliveScopes so we
+ // don't revisit them.
+ if (!AliveScopes.insert(&DL).second)
+ return;
+
+ // Collect live scopes from the scope chain.
+ collectLiveScopes(*DL.getScope());
+
+ // Tail-recurse through the inlined-at chain.
+ if (const DILocation *IA = DL.getInlinedAt())
+ collectLiveScopes(*IA);
+}
+
+void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
+ auto &Info = BlockInfo[PN->getParent()];
+ // Only need to check this once per block.
+ if (Info.HasLivePhiNodes)
+ return;
+ Info.HasLivePhiNodes = true;
+
+ // If a predecessor block is not live, mark it as control-flow live
+ // which will trigger marking live branches upon which
+ // that block is control dependent.
+ for (auto *PredBB : predecessors(Info.BB)) {
+ auto &Info = BlockInfo[PredBB];
+ if (!Info.CFLive) {
+ Info.CFLive = true;
+ NewLiveBlocks.insert(PredBB);
+ }
+ }
+}
+
+void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
+ if (BlocksWithDeadTerminators.empty())
+ return;
+
+ LLVM_DEBUG({
+ dbgs() << "new live blocks:\n";
+ for (auto *BB : NewLiveBlocks)
+ dbgs() << "\t" << BB->getName() << '\n';
+ dbgs() << "dead terminator blocks:\n";
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << "\t" << BB->getName() << '\n';
+ });
+
+ // The dominance frontier of a live block X in the reverse
+ // control graph is the set of blocks upon which X is control
+ // dependent. The following sequence computes the set of blocks
+ // which currently have dead terminators that are control
+ // dependence sources of a block which is in NewLiveBlocks.
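+  // For illustration: if the 'then' block of an if-statement becomes live, the
+  // block holding the conditional branch lies in the reverse dominance frontier
+  // of that block, so it is found here and its branch is marked live.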
+
+ const SmallPtrSet<BasicBlock *, 16> BWDT{
+ BlocksWithDeadTerminators.begin(),
+ BlocksWithDeadTerminators.end()
+ };
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ ReverseIDFCalculator IDFs(PDT);
+ IDFs.setDefiningBlocks(NewLiveBlocks);
+ IDFs.setLiveInBlocks(BWDT);
+ IDFs.calculate(IDFBlocks);
+ NewLiveBlocks.clear();
+
+ // Dead terminators which control live blocks are now marked live.
+ for (auto *BB : IDFBlocks) {
+ LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ markLive(BB->getTerminator());
+ }
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Routines to update the CFG and SSA information before removing dead code.
+//
+//===----------------------------------------------------------------------===//
+bool AggressiveDeadCodeElimination::removeDeadInstructions() {
+ // Updates control and dataflow around dead blocks
+ bool RegionsUpdated = updateDeadRegions();
+
+ LLVM_DEBUG({
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // If intrinsic is pointing at a live SSA value, there may be an
+ // earlier optimization bug: if we know the location of the variable,
+ // why isn't the scope of the location alive?
+ if (Value *V = DII->getVariableLocation())
+ if (Instruction *II = dyn_cast<Instruction>(V))
+ if (isLive(II))
+ dbgs() << "Dropping debug info for " << *DII << "\n";
+ }
+ }
+ });
+
+ // The inverse of the live set is the dead set. These are those instructions
+ // that have no side effects and do not influence the control flow or return
+ // value of the function, and may therefore be deleted safely.
+ // NOTE: We reuse the Worklist vector here for memory efficiency.
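+  // Dropping all references first lets dead instructions that refer to one
+  // another be erased below in any order.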
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // Fallthrough and drop the intrinsic.
+ }
+
+ // Prepare to delete.
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return !Worklist.empty() || RegionsUpdated;
+}
+
+// A dead region is the set of dead blocks with a common live post-dominator.
+bool AggressiveDeadCodeElimination::updateDeadRegions() {
+ LLVM_DEBUG({
+ dbgs() << "final dead terminator blocks: " << '\n';
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << '\t' << BB->getName()
+ << (BlockInfo[BB].Live ? " LIVE\n" : "\n");
+ });
+
+ // Don't compute the post ordering unless we needed it.
+ bool HavePostOrder = false;
+ bool Changed = false;
+
+ for (auto *BB : BlocksWithDeadTerminators) {
+ auto &Info = BlockInfo[BB];
+ if (Info.UnconditionalBranch) {
+ InstInfo[Info.Terminator].Live = true;
+ continue;
+ }
+
+ if (!HavePostOrder) {
+ computeReversePostOrder();
+ HavePostOrder = true;
+ }
+
+ // Add an unconditional branch to the successor closest to the
+    // end of the function which ensures a path to the exit for each
+ // live edge.
+ BlockInfoType *PreferredSucc = nullptr;
+ for (auto *Succ : successors(BB)) {
+ auto *Info = &BlockInfo[Succ];
+ if (!PreferredSucc || PreferredSucc->PostOrder < Info->PostOrder)
+ PreferredSucc = Info;
+ }
+ assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
+ "Failed to find safe successor for dead branch");
+
+ // Collect removed successors to update the (Post)DominatorTrees.
+ SmallPtrSet<BasicBlock *, 4> RemovedSuccessors;
+ bool First = true;
+ for (auto *Succ : successors(BB)) {
+ if (!First || Succ != PreferredSucc->BB) {
+ Succ->removePredecessor(BB);
+ RemovedSuccessors.insert(Succ);
+ } else
+ First = false;
+ }
+ makeUnconditional(BB, PreferredSucc->BB);
+
+ // Inform the dominators about the deleted CFG edges.
+ SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
+ for (auto *Succ : RemovedSuccessors) {
+ // It might have happened that the same successor appeared multiple times
+ // and the CFG edge wasn't really removed.
+ if (Succ != PreferredSucc->BB) {
+        LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion "
+ << BB->getName() << " -> " << Succ->getName()
+ << "\n");
+ DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
+ }
+ }
+
+ DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
+ .applyUpdates(DeletedEdges);
+
+ NumBranchesRemoved += 1;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// reverse top-sort order
+void AggressiveDeadCodeElimination::computeReversePostOrder() {
+  // This provides a post-order numbering of the reverse control flow graph.
+  // Note that it is incomplete in the presence of infinite loops, but we don't
+  // need to number blocks which don't reach the end of the function since
+  // all branches in those blocks are forced live.
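+  // In this numbering, blocks nearer an exit generally receive higher numbers,
+  // which is what updateDeadRegions relies on when it picks the successor
+  // "closest to the end of the function".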
+
+ // For each block without successors, extend the DFS from the block
+ // backward through the graph
+ SmallPtrSet<BasicBlock*, 16> Visited;
+ unsigned PostOrder = 0;
+ for (auto &BB : F) {
if (!succ_empty(&BB))
- continue;
- for (BasicBlock *Block : inverse_post_order_ext(&BB,Visited))
- BlockInfo[Block].PostOrder = PostOrder++;
- }
-}
-
-void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
- BasicBlock *Target) {
- Instruction *PredTerm = BB->getTerminator();
- // Collect the live debug info scopes attached to this instruction.
- if (const DILocation *DL = PredTerm->getDebugLoc())
- collectLiveScopes(*DL);
-
- // Just mark live an existing unconditional branch
- if (isUnconditionalBranch(PredTerm)) {
- PredTerm->setSuccessor(0, Target);
- InstInfo[PredTerm].Live = true;
- return;
- }
- LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
- NumBranchesRemoved += 1;
- IRBuilder<> Builder(PredTerm);
- auto *NewTerm = Builder.CreateBr(Target);
- InstInfo[NewTerm].Live = true;
- if (const DILocation *DL = PredTerm->getDebugLoc())
- NewTerm->setDebugLoc(DL);
-
- InstInfo.erase(PredTerm);
- PredTerm->eraseFromParent();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Pass Manager integration code
-//
-//===----------------------------------------------------------------------===//
-PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
-  // ADCE does not need DominatorTree, but requires DominatorTree here
-  // to update the analysis if it is already available.
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
- if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- // TODO: We could track if we have actually done CFG changes.
- if (!RemoveControlFlowFlag)
- PA.preserveSet<CFGAnalyses>();
- else {
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- }
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-namespace {
-
-struct ADCELegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-
- ADCELegacyPass() : FunctionPass(ID) {
- initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
-    // ADCE does not need DominatorTree, but requires DominatorTree here
-    // to update the analysis if it is already available.
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- return AggressiveDeadCodeElimination(F, DT, PDT)
- .performDeadCodeElimination();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<PostDominatorTreeWrapperPass>();
- if (!RemoveControlFlowFlag)
- AU.setPreservesCFG();
- else {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char ADCELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
- "Aggressive Dead Code Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
- false, false)
-
-FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
+ continue;
+ for (BasicBlock *Block : inverse_post_order_ext(&BB,Visited))
+ BlockInfo[Block].PostOrder = PostOrder++;
+ }
+}
+
+void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
+ BasicBlock *Target) {
+ Instruction *PredTerm = BB->getTerminator();
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Just mark live an existing unconditional branch
+ if (isUnconditionalBranch(PredTerm)) {
+ PredTerm->setSuccessor(0, Target);
+ InstInfo[PredTerm].Live = true;
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ NumBranchesRemoved += 1;
+ IRBuilder<> Builder(PredTerm);
+ auto *NewTerm = Builder.CreateBr(Target);
+ InstInfo[NewTerm].Live = true;
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ NewTerm->setDebugLoc(DL);
+
+ InstInfo.erase(PredTerm);
+ PredTerm->eraseFromParent();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Pass Manager integration code
+//
+//===----------------------------------------------------------------------===//
+PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
+  // ADCE does not need DominatorTree, but requires DominatorTree here
+  // to update the analysis if it is already available.
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ // TODO: We could track if we have actually done CFG changes.
+ if (!RemoveControlFlowFlag)
+ PA.preserveSet<CFGAnalyses>();
+ else {
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ }
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+
+struct ADCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ ADCELegacyPass() : FunctionPass(ID) {
+ initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+    // ADCE does not need DominatorTree, but requires DominatorTree here
+    // to update the analysis if it is already available.
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return AggressiveDeadCodeElimination(F, DT, PDT)
+ .performDeadCodeElimination();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ if (!RemoveControlFlowFlag)
+ AU.setPreservesCFG();
+ else {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ADCELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
+ "Aggressive Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
+ false, false)
+
+FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index b05b073c02..bccf94fc21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -1,217 +1,217 @@
-//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
-// Set Load/Store Alignments From Assumptions
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a ScalarEvolution-based transformation to set
-// the alignments of loads, stores, and memory intrinsics based on the truth
-// expressions of assume intrinsics. The primary motivation is to handle
-// complex alignment assumptions that apply to vector loads and stores that
-// appear after vectorization and unrolling.
-//
-//===----------------------------------------------------------------------===//
-
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+// Set Load/Store Alignments From Assumptions
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of loads, stores, and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/IR/Instructions.h"
-#include "llvm/InitializePasses.h"
-#define AA_NAME "alignment-from-assumptions"
-#define DEBUG_TYPE AA_NAME
-#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-STATISTIC(NumLoadAlignChanged,
- "Number of loads changed by alignment assumptions");
-STATISTIC(NumStoreAlignChanged,
- "Number of stores changed by alignment assumptions");
-STATISTIC(NumMemIntAlignChanged,
- "Number of memory intrinsics changed by alignment assumptions");
-
-namespace {
-struct AlignmentFromAssumptions : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- AlignmentFromAssumptions() : FunctionPass(ID) {
- initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
-
- AU.setPreservesCFG();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- }
-
- AlignmentFromAssumptionsPass Impl;
-};
-}
-
-char AlignmentFromAssumptions::ID = 0;
-static const char aip_name[] = "Alignment from assumptions";
-INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
- aip_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
- aip_name, false, false)
-
-FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
- return new AlignmentFromAssumptions();
-}
-
-// Given an expression for the (constant) alignment, AlignSCEV, and an
-// expression for the displacement between a pointer and the aligned address,
-// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
-// to a constant. Using SCEV to compute alignment handles the case where
-// DiffSCEV is a recurrence with constant start such that the aligned offset
-// is constant. e.g. {16,+,32} % 32 -> 16.
-static MaybeAlign getNewAlignmentDiff(const SCEV *DiffSCEV,
- const SCEV *AlignSCEV,
- ScalarEvolution *SE) {
- // DiffUnits = Diff % int64_t(Alignment)
- const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
-
- LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
- << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
-
- if (const SCEVConstant *ConstDUSCEV =
- dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
- int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
-
- // If the displacement is an exact multiple of the alignment, then the
- // displaced pointer has the same alignment as the aligned pointer, so
- // return the alignment value.
- if (!DiffUnits)
- return cast<SCEVConstant>(AlignSCEV)->getValue()->getAlignValue();
-
- // If the displacement is not an exact multiple, but the remainder is a
- // constant, then return this remainder (but only if it is a power of 2).
- uint64_t DiffUnitsAbs = std::abs(DiffUnits);
- if (isPowerOf2_64(DiffUnitsAbs))
- return Align(DiffUnitsAbs);
- }
-
- return None;
-}
-
-// There is an address given by an offset OffSCEV from AASCEV which has an
-// alignment AlignSCEV. Use that information, if possible, to compute a new
-// alignment for Ptr.
-static Align getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
- const SCEV *OffSCEV, Value *Ptr,
- ScalarEvolution *SE) {
- const SCEV *PtrSCEV = SE->getSCEV(Ptr);
- // On a platform with 32-bit allocas, but 64-bit flat/global pointer sizes
- // (*cough* AMDGPU), the effective SCEV type of AASCEV and PtrSCEV
- // may disagree. Trunc/extend so they agree.
- PtrSCEV = SE->getTruncateOrZeroExtend(
- PtrSCEV, SE->getEffectiveSCEVType(AASCEV->getType()));
- const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
-
- // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
- // sign-extended OffSCEV to i64, so make sure they agree again.
- DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
-
- // What we really want to know is the overall offset to the aligned
- // address. This address is displaced by the provided offset.
- DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
-
- LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
- << *AlignSCEV << " and offset " << *OffSCEV
- << " using diff " << *DiffSCEV << "\n");
-
- if (MaybeAlign NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE)) {
- LLVM_DEBUG(dbgs() << "\tnew alignment: " << DebugStr(NewAlignment) << "\n");
- return *NewAlignment;
- }
-
- if (const SCEVAddRecExpr *DiffARSCEV = dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
- // The relative offset to the alignment assumption did not yield a constant,
- // but we should try harder: if we assume that a is 32-byte aligned, then in
- // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
- // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
- // As a result, the new alignment will not be a constant, but can still
- // be improved over the default (of 4) to 16.
-
- const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
- const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
-
- LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
- << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
-
- // Now compute the new alignment using the displacement to the value in the
- // first iteration, and also the alignment using the per-iteration delta.
- // If these are the same, then use that answer. Otherwise, use the smaller
- // one, but only if it divides the larger one.
- MaybeAlign NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
- MaybeAlign NewIncAlignment =
- getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
-
- LLVM_DEBUG(dbgs() << "\tnew start alignment: " << DebugStr(NewAlignment)
- << "\n");
- LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << DebugStr(NewIncAlignment)
- << "\n");
-
- if (!NewAlignment || !NewIncAlignment)
- return Align(1);
-
- const Align NewAlign = *NewAlignment;
- const Align NewIncAlign = *NewIncAlignment;
- if (NewAlign > NewIncAlign) {
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: "
- << DebugStr(NewIncAlign) << "\n");
- return NewIncAlign;
- }
- if (NewIncAlign > NewAlign) {
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
- << "\n");
- return NewAlign;
- }
- assert(NewIncAlign == NewAlign);
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
- << "\n");
- return NewAlign;
- }
-
- return Align(1);
-}
-
-bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
+#include "llvm/InitializePasses.h"
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+ "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+ "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+ "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ AlignmentFromAssumptions() : FunctionPass(ID) {
+ initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ AlignmentFromAssumptionsPass Impl;
+};
+}
+
+char AlignmentFromAssumptions::ID = 0;
+static const char aip_name[] = "Alignment from assumptions";
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+ return new AlignmentFromAssumptions();
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant. Using SCEV to compute alignment handles the case where
+// DiffSCEV is a recurrence with constant start such that the aligned offset
+// is constant. e.g. {16,+,32} % 32 -> 16.
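+// For illustration: {16,+,32} takes the values 16, 48, 80, ..., each of which
+// is 16 modulo 32, so a pointer displaced by that amount from a 32-byte
+// aligned address is known to be 16-byte aligned.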
+static MaybeAlign getNewAlignmentDiff(const SCEV *DiffSCEV,
+ const SCEV *AlignSCEV,
+ ScalarEvolution *SE) {
+ // DiffUnits = Diff % int64_t(Alignment)
+ const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
+
+ LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
+ << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+ if (const SCEVConstant *ConstDUSCEV =
+ dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+ int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+ // If the displacement is an exact multiple of the alignment, then the
+ // displaced pointer has the same alignment as the aligned pointer, so
+ // return the alignment value.
+ if (!DiffUnits)
+ return cast<SCEVConstant>(AlignSCEV)->getValue()->getAlignValue();
+
+ // If the displacement is not an exact multiple, but the remainder is a
+ // constant, then return this remainder (but only if it is a power of 2).
+ uint64_t DiffUnitsAbs = std::abs(DiffUnits);
+ if (isPowerOf2_64(DiffUnitsAbs))
+ return Align(DiffUnitsAbs);
+ }
+
+ return None;
+}
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr.
+static Align getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+ const SCEV *OffSCEV, Value *Ptr,
+ ScalarEvolution *SE) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ // On a platform with 32-bit allocas, but 64-bit flat/global pointer sizes
+ // (*cough* AMDGPU), the effective SCEV type of AASCEV and PtrSCEV
+ // may disagree. Trunc/extend so they agree.
+ PtrSCEV = SE->getTruncateOrZeroExtend(
+ PtrSCEV, SE->getEffectiveSCEVType(AASCEV->getType()));
+ const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+ // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+ // sign-extended OffSCEV to i64, so make sure they agree again.
+ DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+ // What we really want to know is the overall offset to the aligned
+ // address. This address is displaced by the provided offset.
+ DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+ LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
+ << *AlignSCEV << " and offset " << *OffSCEV
+ << " using diff " << *DiffSCEV << "\n");
+
+ if (MaybeAlign NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE)) {
+ LLVM_DEBUG(dbgs() << "\tnew alignment: " << DebugStr(NewAlignment) << "\n");
+ return *NewAlignment;
+ }
+
+ if (const SCEVAddRecExpr *DiffARSCEV = dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+ // The relative offset to the alignment assumption did not yield a constant,
+ // but we should try harder: if we assume that a is 32-byte aligned, then in
+ // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
+ // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
+ // As a result, the new alignment will not be a constant, but can still
+ // be improved over the default (of 4) to 16.
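+    // For illustration, assuming 4-byte elements: the vectorized loads land at
+    // byte offsets 0, 16, 32, 48, ... from the 32-byte aligned base, so the
+    // start offset yields alignment 32 while the 16-byte step yields 16, and
+    // the smaller value, 16, is the one that holds on every iteration.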
+
+ const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+ const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+ LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
+ << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+ // Now compute the new alignment using the displacement to the value in the
+ // first iteration, and also the alignment using the per-iteration delta.
+ // If these are the same, then use that answer. Otherwise, use the smaller
+ // one, but only if it divides the larger one.
+ MaybeAlign NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+ MaybeAlign NewIncAlignment =
+ getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+ LLVM_DEBUG(dbgs() << "\tnew start alignment: " << DebugStr(NewAlignment)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << DebugStr(NewIncAlignment)
+ << "\n");
+
+ if (!NewAlignment || !NewIncAlignment)
+ return Align(1);
+
+ const Align NewAlign = *NewAlignment;
+ const Align NewIncAlign = *NewIncAlignment;
+ if (NewAlign > NewIncAlign) {
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: "
+ << DebugStr(NewIncAlign) << "\n");
+ return NewIncAlign;
+ }
+ if (NewIncAlign > NewAlign) {
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
+ << "\n");
+ return NewAlign;
+ }
+ assert(NewIncAlign == NewAlign);
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
+ << "\n");
+ return NewAlign;
+ }
+
+ return Align(1);
+}
+
+bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
unsigned Idx,
- Value *&AAPtr,
- const SCEV *&AlignSCEV,
- const SCEV *&OffSCEV) {
+ Value *&AAPtr,
+ const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV) {
Type *Int64Ty = Type::getInt64Ty(I->getContext());
OperandBundleUse AlignOB = I->getOperandBundleAt(Idx);
if (AlignOB.getTagName() != "align")
- return false;
+ return false;
assert(AlignOB.Inputs.size() >= 2);
AAPtr = AlignOB.Inputs[0].get();
// TODO: Consider accumulating the offset to the base.
@@ -221,139 +221,139 @@ bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
if (AlignOB.Inputs.size() == 3)
OffSCEV = SE->getSCEV(AlignOB.Inputs[2].get());
else
- OffSCEV = SE->getZero(Int64Ty);
+ OffSCEV = SE->getZero(Int64Ty);
OffSCEV = SE->getTruncateOrZeroExtend(OffSCEV, Int64Ty);
- return true;
-}
-
+ return true;
+}
+
bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall,
unsigned Idx) {
- Value *AAPtr;
- const SCEV *AlignSCEV, *OffSCEV;
+ Value *AAPtr;
+ const SCEV *AlignSCEV, *OffSCEV;
if (!extractAlignmentInfo(ACall, Idx, AAPtr, AlignSCEV, OffSCEV))
- return false;
-
- // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
- // affect other users.
- if (isa<ConstantData>(AAPtr))
- return false;
-
- const SCEV *AASCEV = SE->getSCEV(AAPtr);
-
- // Apply the assumption to all other users of the specified pointer.
- SmallPtrSet<Instruction *, 32> Visited;
- SmallVector<Instruction*, 16> WorkList;
- for (User *J : AAPtr->users()) {
- if (J == ACall)
- continue;
-
- if (Instruction *K = dyn_cast<Instruction>(J))
- WorkList.push_back(K);
- }
-
- while (!WorkList.empty()) {
- Instruction *J = WorkList.pop_back_val();
- if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+ return false;
+
+ // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
+ // affect other users.
+ if (isa<ConstantData>(AAPtr))
+ return false;
+
+ const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+ // Apply the assumption to all other users of the specified pointer.
+ SmallPtrSet<Instruction *, 32> Visited;
+ SmallVector<Instruction*, 16> WorkList;
+ for (User *J : AAPtr->users()) {
+ if (J == ACall)
+ continue;
+
+ if (Instruction *K = dyn_cast<Instruction>(J))
+ WorkList.push_back(K);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+ if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
if (!isValidAssumeForContext(ACall, J, DT))
continue;
- Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- LI->getPointerOperand(), SE);
- if (NewAlignment > LI->getAlign()) {
- LI->setAlignment(NewAlignment);
- ++NumLoadAlignChanged;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+ Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ LI->getPointerOperand(), SE);
+ if (NewAlignment > LI->getAlign()) {
+ LI->setAlignment(NewAlignment);
+ ++NumLoadAlignChanged;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
if (!isValidAssumeForContext(ACall, J, DT))
continue;
- Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- SI->getPointerOperand(), SE);
- if (NewAlignment > SI->getAlign()) {
- SI->setAlignment(NewAlignment);
- ++NumStoreAlignChanged;
- }
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+ Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ SI->getPointerOperand(), SE);
+ if (NewAlignment > SI->getAlign()) {
+ SI->setAlignment(NewAlignment);
+ ++NumStoreAlignChanged;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
if (!isValidAssumeForContext(ACall, J, DT))
continue;
- Align NewDestAlignment =
- getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MI->getDest(), SE);
-
- LLVM_DEBUG(dbgs() << "\tmem inst: " << DebugStr(NewDestAlignment)
- << "\n";);
- if (NewDestAlignment > *MI->getDestAlign()) {
- MI->setDestAlignment(NewDestAlignment);
- ++NumMemIntAlignChanged;
- }
-
- // For memory transfers, there is also a source alignment that
- // can be set.
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
- Align NewSrcAlignment =
- getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MTI->getSource(), SE);
-
- LLVM_DEBUG(dbgs() << "\tmem trans: " << DebugStr(NewSrcAlignment)
- << "\n";);
-
- if (NewSrcAlignment > *MTI->getSourceAlign()) {
- MTI->setSourceAlignment(NewSrcAlignment);
- ++NumMemIntAlignChanged;
- }
- }
- }
-
- // Now that we've updated that use of the pointer, look for other uses of
- // the pointer to update.
- Visited.insert(J);
- for (User *UJ : J->users()) {
- Instruction *K = cast<Instruction>(UJ);
+ Align NewDestAlignment =
+ getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MI->getDest(), SE);
+
+ LLVM_DEBUG(dbgs() << "\tmem inst: " << DebugStr(NewDestAlignment)
+ << "\n";);
+ if (NewDestAlignment > *MI->getDestAlign()) {
+ MI->setDestAlignment(NewDestAlignment);
+ ++NumMemIntAlignChanged;
+ }
+
+ // For memory transfers, there is also a source alignment that
+ // can be set.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Align NewSrcAlignment =
+ getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MTI->getSource(), SE);
+
+ LLVM_DEBUG(dbgs() << "\tmem trans: " << DebugStr(NewSrcAlignment)
+ << "\n";);
+
+ if (NewSrcAlignment > *MTI->getSourceAlign()) {
+ MTI->setSourceAlignment(NewSrcAlignment);
+ ++NumMemIntAlignChanged;
+ }
+ }
+ }
+
+ // Now that we've updated that use of the pointer, look for other uses of
+ // the pointer to update.
+ Visited.insert(J);
+ for (User *UJ : J->users()) {
+ Instruction *K = cast<Instruction>(UJ);
if (!Visited.count(K))
- WorkList.push_back(K);
- }
- }
-
- return true;
-}
-
-bool AlignmentFromAssumptions::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- return Impl.runImpl(F, AC, SE, DT);
-}
-
-bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
- ScalarEvolution *SE_,
- DominatorTree *DT_) {
- SE = SE_;
- DT = DT_;
-
- bool Changed = false;
- for (auto &AssumeVH : AC.assumptions())
+ WorkList.push_back(K);
+ }
+ }
+
+ return true;
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return Impl.runImpl(F, AC, SE, DT);
+}
+
+bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
+ ScalarEvolution *SE_,
+ DominatorTree *DT_) {
+ SE = SE_;
+ DT = DT_;
+
+ bool Changed = false;
+ for (auto &AssumeVH : AC.assumptions())
if (AssumeVH) {
CallInst *Call = cast<CallInst>(AssumeVH);
for (unsigned Idx = 0; Idx < Call->getNumOperandBundles(); Idx++)
Changed |= processAssumption(Call, Idx);
}
-
- return Changed;
-}
-
-PreservedAnalyses
-AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
-
- AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
- ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, AC, &SE, &DT))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+
+ return Changed;
+}
+
+PreservedAnalyses
+AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, AC, &SE, &DT))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp
index 9dde869bb6..767c7656dc 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp
@@ -1,206 +1,206 @@
-//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Bit-Tracking Dead Code Elimination pass. Some
-// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
-// We track these dead bits and remove instructions that compute only these
-// dead bits. We also simplify sext that generates unused extension bits,
-// converting it to a zext.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/BDCE.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "bdce"
-
-STATISTIC(NumRemoved, "Number of instructions removed (unused)");
-STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
-STATISTIC(NumSExt2ZExt,
- "Number of sign extension instructions converted to zero extension");
-
-/// If an instruction is trivialized (dead), then the chain of users of that
-/// instruction may need to be cleared of assumptions that can no longer be
-/// guaranteed correct.
-static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
- assert(I->getType()->isIntOrIntVectorTy() &&
- "Trivializing a non-integer value?");
-
- // Initialize the worklist with eligible direct users.
- SmallPtrSet<Instruction *, 16> Visited;
- SmallVector<Instruction *, 16> WorkList;
- for (User *JU : I->users()) {
- // If all bits of a user are demanded, then we know that nothing below that
- // in the def-use chain needs to be changed.
- auto *J = dyn_cast<Instruction>(JU);
- if (J && J->getType()->isIntOrIntVectorTy() &&
- !DB.getDemandedBits(J).isAllOnesValue()) {
- Visited.insert(J);
- WorkList.push_back(J);
- }
-
- // Note that we need to check for non-int types above before asking for
- // demanded bits. Normally, the only way to reach an instruction with an
- // non-int type is via an instruction that has side effects (or otherwise
- // will demand its input bits). However, if we have a readnone function
- // that returns an unsized type (e.g., void), we must avoid asking for the
- // demanded bits of the function call's return value. A void-returning
- // readnone function is always dead (and so we can stop walking the use/def
- // chain here), but the check is necessary to avoid asserting.
- }
-
- // DFS through subsequent users while tracking visits to avoid cycles.
- while (!WorkList.empty()) {
- Instruction *J = WorkList.pop_back_val();
-
- // NSW, NUW, and exact are based on operands that might have changed.
- J->dropPoisonGeneratingFlags();
-
- // We do not have to worry about llvm.assume or range metadata:
- // 1. llvm.assume demands its operand, so trivializing can't change it.
- // 2. range metadata only applies to memory accesses which demand all bits.
-
- for (User *KU : J->users()) {
- // If all bits of a user are demanded, then we know that nothing below
- // that in the def-use chain needs to be changed.
- auto *K = dyn_cast<Instruction>(KU);
- if (K && Visited.insert(K).second && K->getType()->isIntOrIntVectorTy() &&
- !DB.getDemandedBits(K).isAllOnesValue())
- WorkList.push_back(K);
- }
- }
-}
-
-static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
- SmallVector<Instruction*, 128> Worklist;
- bool Changed = false;
- for (Instruction &I : instructions(F)) {
- // If the instruction has side effects and no non-dbg uses,
- // skip it. This way we avoid computing known bits on an instruction
- // that will not help us.
- if (I.mayHaveSideEffects() && I.use_empty())
- continue;
-
- // Remove instructions that are dead, either because they were not reached
- // during analysis or have no demanded bits.
- if (DB.isInstructionDead(&I) ||
- (I.getType()->isIntOrIntVectorTy() &&
- DB.getDemandedBits(&I).isNullValue() &&
- wouldInstructionBeTriviallyDead(&I))) {
- salvageDebugInfo(I);
- Worklist.push_back(&I);
- I.dropAllReferences();
- Changed = true;
- continue;
- }
-
- // Convert SExt into ZExt if none of the extension bits is required
- if (SExtInst *SE = dyn_cast<SExtInst>(&I)) {
- APInt Demanded = DB.getDemandedBits(SE);
- const uint32_t SrcBitSize = SE->getSrcTy()->getScalarSizeInBits();
- auto *const DstTy = SE->getDestTy();
- const uint32_t DestBitSize = DstTy->getScalarSizeInBits();
- if (Demanded.countLeadingZeros() >= (DestBitSize - SrcBitSize)) {
- clearAssumptionsOfUsers(SE, DB);
- IRBuilder<> Builder(SE);
- I.replaceAllUsesWith(
- Builder.CreateZExt(SE->getOperand(0), DstTy, SE->getName()));
- Worklist.push_back(SE);
- Changed = true;
- NumSExt2ZExt++;
- continue;
- }
- }
-
- for (Use &U : I.operands()) {
- // DemandedBits only detects dead integer uses.
- if (!U->getType()->isIntOrIntVectorTy())
- continue;
-
- if (!isa<Instruction>(U) && !isa<Argument>(U))
- continue;
-
- if (!DB.isUseDead(&U))
- continue;
-
- LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << U << " (all bits dead)\n");
-
- clearAssumptionsOfUsers(&I, DB);
-
- // FIXME: In theory we could substitute undef here instead of zero.
- // This should be reconsidered once we settle on the semantics of
- // undef, poison, etc.
- U.set(ConstantInt::get(U->getType(), 0));
- ++NumSimplified;
- Changed = true;
- }
- }
-
- for (Instruction *&I : Worklist) {
- ++NumRemoved;
- I->eraseFromParent();
- }
-
- return Changed;
-}
-
-PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
- if (!bitTrackingDCE(F, DB))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-namespace {
-struct BDCELegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BDCELegacyPass() : FunctionPass(ID) {
- initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- return bitTrackingDCE(F, DB);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-}
-
-char BDCELegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
- "Bit-Tracking Dead Code Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
- "Bit-Tracking Dead Code Elimination", false, false)
-
-FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits. We also simplify sext that generates unused extension bits,
+// converting it to a zext.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+STATISTIC(NumSExt2ZExt,
+ "Number of sign extension instructions converted to zero extension");
+
+/// If an instruction is trivialized (dead), then the chain of users of that
+/// instruction may need to be cleared of assumptions that can no longer be
+/// guaranteed correct.
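+/// For example, nsw/nuw/exact flags on a user that were justified by the
+/// original operand value may no longer hold once a dead operand is replaced
+/// with zero, which is why poison-generating flags are dropped on the users
+/// collected here.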
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
+ assert(I->getType()->isIntOrIntVectorTy() &&
+ "Trivializing a non-integer value?");
+
+ // Initialize the worklist with eligible direct users.
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 16> WorkList;
+ for (User *JU : I->users()) {
+ // If all bits of a user are demanded, then we know that nothing below that
+ // in the def-use chain needs to be changed.
+ auto *J = dyn_cast<Instruction>(JU);
+ if (J && J->getType()->isIntOrIntVectorTy() &&
+ !DB.getDemandedBits(J).isAllOnesValue()) {
+ Visited.insert(J);
+ WorkList.push_back(J);
+ }
+
+ // Note that we need to check for non-int types above before asking for
+ // demanded bits. Normally, the only way to reach an instruction with an
+ // non-int type is via an instruction that has side effects (or otherwise
+ // will demand its input bits). However, if we have a readnone function
+ // that returns an unsized type (e.g., void), we must avoid asking for the
+ // demanded bits of the function call's return value. A void-returning
+ // readnone function is always dead (and so we can stop walking the use/def
+ // chain here), but the check is necessary to avoid asserting.
+ }
+
+ // DFS through subsequent users while tracking visits to avoid cycles.
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ // NSW, NUW, and exact are based on operands that might have changed.
+ J->dropPoisonGeneratingFlags();
+
+ // We do not have to worry about llvm.assume or range metadata:
+ // 1. llvm.assume demands its operand, so trivializing can't change it.
+ // 2. range metadata only applies to memory accesses which demand all bits.
+
+ for (User *KU : J->users()) {
+ // If all bits of a user are demanded, then we know that nothing below
+ // that in the def-use chain needs to be changed.
+ auto *K = dyn_cast<Instruction>(KU);
+ if (K && Visited.insert(K).second && K->getType()->isIntOrIntVectorTy() &&
+ !DB.getDemandedBits(K).isAllOnesValue())
+ WorkList.push_back(K);
+ }
+ }
+}
+
+static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
+ SmallVector<Instruction*, 128> Worklist;
+ bool Changed = false;
+ for (Instruction &I : instructions(F)) {
+ // If the instruction has side effects and no non-dbg uses,
+ // skip it. This way we avoid computing known bits on an instruction
+ // that will not help us.
+ if (I.mayHaveSideEffects() && I.use_empty())
+ continue;
+
+ // Remove instructions that are dead, either because they were not reached
+ // during analysis or have no demanded bits.
+ if (DB.isInstructionDead(&I) ||
+ (I.getType()->isIntOrIntVectorTy() &&
+ DB.getDemandedBits(&I).isNullValue() &&
+ wouldInstructionBeTriviallyDead(&I))) {
+ salvageDebugInfo(I);
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ Changed = true;
+ continue;
+ }
+
+ // Convert SExt into ZExt if none of the extension bits is required
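+    // For illustration (IR not taken from this file): given
+    //   %e = sext i8 %x to i32
+    //   %r = and i32 %e, 255
+    // only the low 8 bits of %e are demanded, so rewriting the sext as a zext
+    // cannot change %r.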
+ if (SExtInst *SE = dyn_cast<SExtInst>(&I)) {
+ APInt Demanded = DB.getDemandedBits(SE);
+ const uint32_t SrcBitSize = SE->getSrcTy()->getScalarSizeInBits();
+ auto *const DstTy = SE->getDestTy();
+ const uint32_t DestBitSize = DstTy->getScalarSizeInBits();
+ if (Demanded.countLeadingZeros() >= (DestBitSize - SrcBitSize)) {
+ clearAssumptionsOfUsers(SE, DB);
+ IRBuilder<> Builder(SE);
+ I.replaceAllUsesWith(
+ Builder.CreateZExt(SE->getOperand(0), DstTy, SE->getName()));
+ Worklist.push_back(SE);
+ Changed = true;
+ NumSExt2ZExt++;
+ continue;
+ }
+ }
+
+ for (Use &U : I.operands()) {
+ // DemandedBits only detects dead integer uses.
+ if (!U->getType()->isIntOrIntVectorTy())
+ continue;
+
+ if (!isa<Instruction>(U) && !isa<Argument>(U))
+ continue;
+
+ if (!DB.isUseDead(&U))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << U << " (all bits dead)\n");
+
+ clearAssumptionsOfUsers(&I, DB);
+
+ // FIXME: In theory we could substitute undef here instead of zero.
+ // This should be reconsidered once we settle on the semantics of
+ // undef, poison, etc.
+ U.set(ConstantInt::get(U->getType(), 0));
+ ++NumSimplified;
+ Changed = true;
+ }
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ if (!bitTrackingDCE(F, DB))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct BDCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BDCELegacyPass() : FunctionPass(ID) {
+ initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ return bitTrackingDCE(F, DB);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char BDCELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
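+
+// Usage sketch (assuming the standard LLVM opt driver and the usual 'bdce'
+// registration in the new pass manager's registry): the transformation can be
+// exercised on its own with
+//
+//   opt -passes=bdce input.ll -S -o -
+//
+// or programmatically through createBitTrackingDCEPass() when assembling a
+// legacy pass pipeline.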
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 743c43d3f3..2eb94b721d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -1,590 +1,590 @@
-//===- CallSiteSplitting.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a transformation that tries to split a call-site to pass
-// more constrained arguments if one of its arguments is predicated in the
-// control flow, so that we can expose better context to later passes (e.g.,
-// inliner, jump threading, or IPA-CP based function cloning).
-// As of now we support two cases:
-//
-// 1) Try to split a call-site with constrained arguments, if any constraints
-// on any argument can be found by following the single predecessors of the
-// call site's predecessors. Currently this pass only handles call-sites with 2
-// predecessors. For example, in the code below, we try to split the call-site
-// since we can predicate the argument (ptr) based on the OR condition.
-//
-// Split from :
-// if (!ptr || c)
-// callee(ptr);
-// to :
-// if (!ptr)
-// callee(null) // set the known constant value
-// else if (c)
-// callee(nonnull ptr) // set non-null attribute in the argument
-//
-// 2) We can also split a call-site based on constant incoming values of a PHI
-// For example,
-// from :
-// Header:
-// %c = icmp eq i32 %i1, %i2
-// br i1 %c, label %Tail, label %TBB
-// TBB:
-// br label %Tail
-// Tail:
-// %p = phi i32 [ 0, %Header], [ 1, %TBB]
-// call void @bar(i32 %p)
-// to
-// Header:
-// %c = icmp eq i32 %i1, %i2
-// br i1 %c, label %Tail-split0, label %TBB
-// TBB:
-// br label %Tail-split1
-// Tail-split0:
-// call void @bar(i32 0)
-// br label %Tail
-// Tail-split1:
-// call void @bar(i32 1)
-// br label %Tail
-// Tail:
-// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "callsite-splitting"
-
-STATISTIC(NumCallSiteSplit, "Number of call-site split");
-
-/// Only allow instructions before a call, if their CodeSize cost is below
-/// DuplicationThreshold. Those instructions need to be duplicated in all
-/// split blocks.
-static cl::opt<unsigned>
- DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
- cl::desc("Only allow instructions before a call, if "
- "their cost is below DuplicationThreshold"),
- cl::init(5));
-
-static void addNonNullAttribute(CallBase &CB, Value *Op) {
- unsigned ArgNo = 0;
- for (auto &I : CB.args()) {
- if (&*I == Op)
- CB.addParamAttr(ArgNo, Attribute::NonNull);
- ++ArgNo;
- }
-}
-
-static void setConstantInArgument(CallBase &CB, Value *Op,
- Constant *ConstValue) {
- unsigned ArgNo = 0;
- for (auto &I : CB.args()) {
- if (&*I == Op) {
- // It is possible we have already added the non-null attribute to the
- // parameter by using an earlier constraining condition.
- CB.removeParamAttr(ArgNo, Attribute::NonNull);
- CB.setArgOperand(ArgNo, ConstValue);
- }
- ++ArgNo;
- }
-}
-
-static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallBase &CB) {
- assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
- Value *Op0 = Cmp->getOperand(0);
- unsigned ArgNo = 0;
- for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I, ++ArgNo) {
- // Don't consider arguments that are constants or already known non-null.
- if (isa<Constant>(*I) || CB.paramHasAttr(ArgNo, Attribute::NonNull))
- continue;
-
- if (*I == Op0)
- return true;
- }
- return false;
-}
-
-typedef std::pair<ICmpInst *, unsigned> ConditionTy;
-typedef SmallVector<ConditionTy, 2> ConditionsTy;
-
-/// If From has a conditional jump to To, add the condition to Conditions,
-/// if it is relevant to any argument at CB.
-static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
- ConditionsTy &Conditions) {
- auto *BI = dyn_cast<BranchInst>(From->getTerminator());
- if (!BI || !BI->isConditional())
- return;
-
- CmpInst::Predicate Pred;
- Value *Cond = BI->getCondition();
- if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
- return;
-
- ICmpInst *Cmp = cast<ICmpInst>(Cond);
- if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
- if (isCondRelevantToAnyCallArgument(Cmp, CB))
- Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
- ? Pred
- : Cmp->getInversePredicate()});
-}
-
-/// Record ICmp conditions relevant to any argument in CB following Pred's
-/// single predecessors. If there are conflicting conditions along a path, like
-/// x == 1 and x == 0, the first condition will be used. We stop once we reach
-/// an edge to StopAt.
-static void recordConditions(CallBase &CB, BasicBlock *Pred,
- ConditionsTy &Conditions, BasicBlock *StopAt) {
- BasicBlock *From = Pred;
- BasicBlock *To = Pred;
- SmallPtrSet<BasicBlock *, 4> Visited;
- while (To != StopAt && !Visited.count(From->getSinglePredecessor()) &&
- (From = From->getSinglePredecessor())) {
- recordCondition(CB, From, To, Conditions);
- Visited.insert(From);
- To = From;
- }
-}
-
-static void addConditions(CallBase &CB, const ConditionsTy &Conditions) {
- for (auto &Cond : Conditions) {
- Value *Arg = Cond.first->getOperand(0);
- Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
- if (Cond.second == ICmpInst::ICMP_EQ)
- setConstantInArgument(CB, Arg, ConstVal);
- else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
- assert(Cond.second == ICmpInst::ICMP_NE);
- addNonNullAttribute(CB, Arg);
- }
- }
-}
-
-static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
- SmallVector<BasicBlock *, 2> Preds(predecessors((BB)));
- assert(Preds.size() == 2 && "Expected exactly 2 predecessors!");
- return Preds;
-}
-
-static bool canSplitCallSite(CallBase &CB, TargetTransformInfo &TTI) {
- if (CB.isConvergent() || CB.cannotDuplicate())
- return false;
-
- // FIXME: As of now we handle only CallInst. InvokeInst could be handled
- // without too much effort.
- if (!isa<CallInst>(CB))
- return false;
-
- BasicBlock *CallSiteBB = CB.getParent();
- // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
- SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
- if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
- isa<IndirectBrInst>(Preds[1]->getTerminator()))
- return false;
-
- // BasicBlock::canSplitPredecessors is more aggressive, so checking for
- // BasicBlock::isEHPad as well.
- if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
- return false;
-
- // Allow splitting a call-site only when the CodeSize cost of the
- // instructions before the call is less than DuplicationThreshold. The
- // instructions before the call will be duplicated in the split blocks and
- // corresponding uses will be updated.
+//===- CallSiteSplitting.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that tries to split a call-site to pass
+// more constrained arguments if one of its arguments is predicated in the
+// control flow, so that we can expose better context to later passes (e.g.,
+// inliner, jump threading, or IPA-CP based function cloning).
+// As of now we support two cases:
+//
+// 1) Try to split a call-site with constrained arguments, if any constraints
+// on any argument can be found by following the single predecessors of the
+// call site's predecessors. Currently this pass only handles call-sites with 2
+// predecessors. For example, in the code below, we try to split the call-site
+// since we can predicate the argument (ptr) based on the OR condition.
+//
+// Split from :
+// if (!ptr || c)
+// callee(ptr);
+// to :
+// if (!ptr)
+// callee(null) // set the known constant value
+// else if (c)
+// callee(nonnull ptr) // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail, label %TBB
+// TBB:
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Header], [ 1, %TBB]
+// call void @bar(i32 %p)
+// to
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail-split0, label %TBB
+// TBB:
+// br label %Tail-split1
+// Tail-split0:
+// call void @bar(i32 0)
+// br label %Tail
+// Tail-split1:
+// call void @bar(i32 1)
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-site split");
+
+/// Only allow instructions before a call, if their CodeSize cost is below
+/// DuplicationThreshold. Those instructions need to be duplicated in all
+/// split blocks.
+static cl::opt<unsigned>
+ DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
+ cl::desc("Only allow instructions before a call, if "
+ "their cost is below DuplicationThreshold"),
+ cl::init(5));
+
+static void addNonNullAttribute(CallBase &CB, Value *Op) {
+ unsigned ArgNo = 0;
+ for (auto &I : CB.args()) {
+ if (&*I == Op)
+ CB.addParamAttr(ArgNo, Attribute::NonNull);
+ ++ArgNo;
+ }
+}
+
+static void setConstantInArgument(CallBase &CB, Value *Op,
+ Constant *ConstValue) {
+ unsigned ArgNo = 0;
+ for (auto &I : CB.args()) {
+ if (&*I == Op) {
+ // It is possible we have already added the non-null attribute to the
+ // parameter by using an earlier constraining condition.
+ CB.removeParamAttr(ArgNo, Attribute::NonNull);
+ CB.setArgOperand(ArgNo, ConstValue);
+ }
+ ++ArgNo;
+ }
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallBase &CB) {
+ assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+ Value *Op0 = Cmp->getOperand(0);
+ unsigned ArgNo = 0;
+ for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I, ++ArgNo) {
+ // Don't consider arguments that are constants or already known non-null.
+ if (isa<Constant>(*I) || CB.paramHasAttr(ArgNo, Attribute::NonNull))
+ continue;
+
+ if (*I == Op0)
+ return true;
+ }
+ return false;
+}
+
+typedef std::pair<ICmpInst *, unsigned> ConditionTy;
+typedef SmallVector<ConditionTy, 2> ConditionsTy;
+
+/// If From has a conditional jump to To, add the condition to Conditions,
+/// if it is relevant to any argument at CB.
+static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
+ ConditionsTy &Conditions) {
+ auto *BI = dyn_cast<BranchInst>(From->getTerminator());
+ if (!BI || !BI->isConditional())
+ return;
+
+ CmpInst::Predicate Pred;
+ Value *Cond = BI->getCondition();
+ if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+ return;
+
+ ICmpInst *Cmp = cast<ICmpInst>(Cond);
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+ if (isCondRelevantToAnyCallArgument(Cmp, CB))
+ Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
+ ? Pred
+ : Cmp->getInversePredicate()});
+}
+
+/// Record ICmp conditions relevant to any argument in CB following Pred's
+/// single predecessors. If there are conflicting conditions along a path, like
+/// x == 1 and x == 0, the first condition will be used. We stop once we reach
+/// an edge to StopAt.
+static void recordConditions(CallBase &CB, BasicBlock *Pred,
+ ConditionsTy &Conditions, BasicBlock *StopAt) {
+ BasicBlock *From = Pred;
+ BasicBlock *To = Pred;
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ while (To != StopAt && !Visited.count(From->getSinglePredecessor()) &&
+ (From = From->getSinglePredecessor())) {
+ recordCondition(CB, From, To, Conditions);
+ Visited.insert(From);
+ To = From;
+ }
+}
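+
+// Illustrative walk (hypothetical blocks) of the loop above, assuming %x is
+// also passed to the call CB:
+//
+//   A:  %c1 = icmp eq i32 %x, 0
+//       br i1 %c1, label %B, label %Other
+//   B:  br label %CallBB
+//
+// With Pred == B, the loop follows B's single predecessor A and records
+// (%c1, ICMP_EQ) because B is A's true successor, then keeps walking single
+// predecessors until it reaches StopAt, a block without a single predecessor,
+// or a block it has already visited.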
+
+static void addConditions(CallBase &CB, const ConditionsTy &Conditions) {
+ for (auto &Cond : Conditions) {
+ Value *Arg = Cond.first->getOperand(0);
+ Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
+ if (Cond.second == ICmpInst::ICMP_EQ)
+ setConstantInArgument(CB, Arg, ConstVal);
+ else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+ assert(Cond.second == ICmpInst::ICMP_NE);
+ addNonNullAttribute(CB, Arg);
+ }
+ }
+}
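+
+// Illustrative result (hypothetical IR) of the loop above for the two
+// predicate kinds it handles, assuming the recorded comparison is
+// 'icmp eq i32* %p, null':
+//
+//   ; condition held as ICMP_EQ on this edge -> the constant is substituted
+//   call void @callee(i32* null)
+//
+//   ; condition held as ICMP_NE on this edge -> the pointer is known non-null
+//   call void @callee(i32* nonnull %p)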
+
+static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 2> Preds(predecessors((BB)));
+ assert(Preds.size() == 2 && "Expected exactly 2 predecessors!");
+ return Preds;
+}
+
+static bool canSplitCallSite(CallBase &CB, TargetTransformInfo &TTI) {
+ if (CB.isConvergent() || CB.cannotDuplicate())
+ return false;
+
+ // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+ // without too much effort.
+ if (!isa<CallInst>(CB))
+ return false;
+
+ BasicBlock *CallSiteBB = CB.getParent();
+ // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
+ SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
+ if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+ isa<IndirectBrInst>(Preds[1]->getTerminator()))
+ return false;
+
+ // BasicBlock::canSplitPredecessors is more aggressive, so checking for
+ // BasicBlock::isEHPad as well.
+ if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
+ return false;
+
+ // Allow splitting a call-site only when the CodeSize cost of the
+ // instructions before the call is less than DuplicationThreshold. The
+ // instructions before the call will be duplicated in the split blocks and
+ // corresponding uses will be updated.
InstructionCost Cost = 0;
- for (auto &InstBeforeCall :
- llvm::make_range(CallSiteBB->begin(), CB.getIterator())) {
- Cost += TTI.getInstructionCost(&InstBeforeCall,
- TargetTransformInfo::TCK_CodeSize);
- if (Cost >= DuplicationThreshold)
- return false;
- }
-
- return true;
-}
-
-static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
- Value *V) {
- Instruction *Copy = I->clone();
- Copy->setName(I->getName());
- Copy->insertBefore(Before);
- if (V)
- Copy->setOperand(0, V);
- return Copy;
-}
-
-/// Copy mandatory `musttail` return sequence that follows original `CI`, and
-/// link it up to `NewCI` value instead:
-///
-/// * (optional) `bitcast NewCI to ...`
-/// * `ret bitcast or NewCI`
-///
-/// Insert this sequence right before `SplitBB`'s terminator, which will be
-/// cleaned up later in `splitCallSite` below.
-static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
- Instruction *NewCI) {
- bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
- auto II = std::next(CI->getIterator());
-
- BitCastInst* BCI = dyn_cast<BitCastInst>(&*II);
- if (BCI)
- ++II;
-
- ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
- assert(RI && "`musttail` call must be followed by `ret` instruction");
-
- Instruction *TI = SplitBB->getTerminator();
- Value *V = NewCI;
- if (BCI)
- V = cloneInstForMustTail(BCI, TI, V);
- cloneInstForMustTail(RI, TI, IsVoid ? nullptr : V);
-
- // FIXME: remove TI here, `DuplicateInstructionsInSplitBetween` has a bug
- // that prevents doing this now.
-}
-
-/// For each (predecessor, conditions from predecessors) pair, it will split the
-/// basic block containing the call site, hook it up to the predecessor and
-/// replace the call instruction with new call instructions, which contain
-/// constraints based on the conditions from their predecessors.
-/// For example, in the IR below with an OR condition, the call-site can
-/// be split. In this case, Preds for Tail is [(Header, a == null),
-/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
-/// CallInst1, which has constraints based on the conditions from Head and
-/// CallInst2, which has constraints based on the conditions coming from TBB.
-///
-/// From :
-///
-/// Header:
-/// %c = icmp eq i32* %a, null
-/// br i1 %c %Tail, %TBB
-/// TBB:
-/// %c2 = icmp eq i32* %b, null
-/// br i1 %c %Tail, %End
-/// Tail:
-/// %ca = call i1 @callee (i32* %a, i32* %b)
-///
-/// to :
-///
-/// Header: // PredBB1 is Header
-/// %c = icmp eq i32* %a, null
-/// br i1 %c %Tail-split1, %TBB
-/// TBB: // PredBB2 is TBB
-/// %c2 = icmp eq i32* %b, null
-/// br i1 %c %Tail-split2, %End
-/// Tail-split1:
-/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1
-/// br %Tail
-/// Tail-split2:
-/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
-/// br %Tail
-/// Tail:
-/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
-///
-/// Note that in case any arguments at the call-site are constrained by its
-/// predecessors, new call-sites with more constrained arguments will be
-/// created in createCallSitesOnPredicatedArgument().
-static void splitCallSite(
- CallBase &CB,
- const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
- DomTreeUpdater &DTU) {
- BasicBlock *TailBB = CB.getParent();
- bool IsMustTailCall = CB.isMustTailCall();
-
- PHINode *CallPN = nullptr;
-
- // `musttail` calls must be followed by an optional `bitcast` and a `ret`. The
- // split blocks will be terminated right after that, so there are no users for
- // this phi in `TailBB`.
- if (!IsMustTailCall && !CB.use_empty()) {
- CallPN = PHINode::Create(CB.getType(), Preds.size(), "phi.call");
- CallPN->setDebugLoc(CB.getDebugLoc());
- }
-
- LLVM_DEBUG(dbgs() << "split call-site : " << CB << " into \n");
-
- assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
- // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
- // here.
- ValueToValueMapTy ValueToValueMaps[2];
- for (unsigned i = 0; i < Preds.size(); i++) {
- BasicBlock *PredBB = Preds[i].first;
- BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
- TailBB, PredBB, &*std::next(CB.getIterator()), ValueToValueMaps[i],
- DTU);
- assert(SplitBlock && "Unexpected new basic block split.");
-
- auto *NewCI =
- cast<CallBase>(&*std::prev(SplitBlock->getTerminator()->getIterator()));
- addConditions(*NewCI, Preds[i].second);
-
- // Handle PHIs used as arguments in the call-site.
- for (PHINode &PN : TailBB->phis()) {
- unsigned ArgNo = 0;
- for (auto &CI : CB.args()) {
- if (&*CI == &PN) {
- NewCI->setArgOperand(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
- }
- ++ArgNo;
- }
- }
- LLVM_DEBUG(dbgs() << " " << *NewCI << " in " << SplitBlock->getName()
- << "\n");
- if (CallPN)
- CallPN->addIncoming(NewCI, SplitBlock);
-
- // Clone and place bitcast and return instructions before `TI`
- if (IsMustTailCall)
- copyMustTailReturn(SplitBlock, &CB, NewCI);
- }
-
- NumCallSiteSplit++;
-
- // FIXME: remove TI in `copyMustTailReturn`
- if (IsMustTailCall) {
- // Remove superfluous `br` terminators from the end of the Split blocks
- // NOTE: Removing terminator removes the SplitBlock from the TailBB's
- // predecessors. Therefore we must get complete list of Splits before
- // attempting removal.
- SmallVector<BasicBlock *, 2> Splits(predecessors((TailBB)));
- assert(Splits.size() == 2 && "Expected exactly 2 splits!");
- for (unsigned i = 0; i < Splits.size(); i++) {
- Splits[i]->getTerminator()->eraseFromParent();
- DTU.applyUpdatesPermissive({{DominatorTree::Delete, Splits[i], TailBB}});
- }
-
- // Erase the tail block once done with musttail patching
- DTU.deleteBB(TailBB);
- return;
- }
-
- auto *OriginalBegin = &*TailBB->begin();
- // Replace users of the original call with a PHI merging the split call-sites.
- if (CallPN) {
- CallPN->insertBefore(OriginalBegin);
- CB.replaceAllUsesWith(CallPN);
- }
-
- // Remove instructions moved to split blocks from TailBB, from the duplicated
- // call instruction to the beginning of the basic block. If an instruction
- // has any uses, add a new PHI node to combine the values coming from the
- // split blocks. The new PHI nodes are placed before the first original
- // instruction, so we do not end up deleting them. By using reverse-order, we
- // do not introduce unnecessary PHI nodes for def-use chains from the call
- // instruction to the beginning of the block.
- auto I = CB.getReverseIterator();
- while (I != TailBB->rend()) {
- Instruction *CurrentI = &*I++;
- if (!CurrentI->use_empty()) {
- // If an existing PHI has users after the call, there is no need to create
- // a new one.
- if (isa<PHINode>(CurrentI))
- continue;
- PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
- NewPN->setDebugLoc(CurrentI->getDebugLoc());
- for (auto &Mapping : ValueToValueMaps)
- NewPN->addIncoming(Mapping[CurrentI],
- cast<Instruction>(Mapping[CurrentI])->getParent());
- NewPN->insertBefore(&*TailBB->begin());
- CurrentI->replaceAllUsesWith(NewPN);
- }
- CurrentI->eraseFromParent();
- // We are done once we handled the first original instruction in TailBB.
- if (CurrentI == OriginalBegin)
- break;
- }
-}
-
-// Return true if the call-site has an argument which is a PHI with only
-// constant incoming values.
-static bool isPredicatedOnPHI(CallBase &CB) {
- BasicBlock *Parent = CB.getParent();
- if (&CB != Parent->getFirstNonPHIOrDbg())
- return false;
-
- for (auto &PN : Parent->phis()) {
- for (auto &Arg : CB.args()) {
- if (&*Arg != &PN)
- continue;
- assert(PN.getNumIncomingValues() == 2 &&
- "Unexpected number of incoming values");
- if (PN.getIncomingBlock(0) == PN.getIncomingBlock(1))
- return false;
- if (PN.getIncomingValue(0) == PN.getIncomingValue(1))
- continue;
- if (isa<Constant>(PN.getIncomingValue(0)) &&
- isa<Constant>(PN.getIncomingValue(1)))
- return true;
- }
- }
- return false;
-}
-
-using PredsWithCondsTy = SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2>;
-
-// Check if any of the arguments in CS are predicated on a PHI node and return
-// the set of predecessors we should use for splitting.
-static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallBase &CB) {
- if (!isPredicatedOnPHI(CB))
- return {};
-
- auto Preds = getTwoPredecessors(CB.getParent());
- return {{Preds[0], {}}, {Preds[1], {}}};
-}
-
-// Checks if any of the arguments in CS are predicated in a predecessor and
-// returns a list of predecessors with the conditions that hold on their edges
-// to CS.
-static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallBase &CB,
- DomTreeUpdater &DTU) {
- auto Preds = getTwoPredecessors(CB.getParent());
- if (Preds[0] == Preds[1])
- return {};
-
- // We can stop recording conditions once we reach the immediate dominator of
- // the block containing the call site. Conditions in predecessors of that
- // node will be the same for all paths to the call site, so splitting is not
- // beneficial.
- assert(DTU.hasDomTree() && "We need a DTU with a valid DT!");
- auto *CSDTNode = DTU.getDomTree().getNode(CB.getParent());
- BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr;
-
- SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
- for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
- ConditionsTy Conditions;
- // Record condition on edge BB(CS) <- Pred
- recordCondition(CB, Pred, CB.getParent(), Conditions);
- // Record conditions following Pred's single predecessors.
- recordConditions(CB, Pred, Conditions, StopAt);
- PredsCS.push_back({Pred, Conditions});
- }
-
- if (all_of(PredsCS, [](const std::pair<BasicBlock *, ConditionsTy> &P) {
- return P.second.empty();
- }))
- return {};
-
- return PredsCS;
-}
-
-static bool tryToSplitCallSite(CallBase &CB, TargetTransformInfo &TTI,
- DomTreeUpdater &DTU) {
- // Check if we can split the call site.
- if (!CB.arg_size() || !canSplitCallSite(CB, TTI))
- return false;
-
- auto PredsWithConds = shouldSplitOnPredicatedArgument(CB, DTU);
- if (PredsWithConds.empty())
- PredsWithConds = shouldSplitOnPHIPredicatedArgument(CB);
- if (PredsWithConds.empty())
- return false;
-
- splitCallSite(CB, PredsWithConds, DTU);
- return true;
-}
-
-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
- TargetTransformInfo &TTI, DominatorTree &DT) {
-
- DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
- bool Changed = false;
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
- BasicBlock &BB = *BI++;
- auto II = BB.getFirstNonPHIOrDbg()->getIterator();
- auto IE = BB.getTerminator()->getIterator();
- // Iterate until we reach the terminator instruction. tryToSplitCallSite
- // can replace BB's terminator in case BB is a successor of itself. In that
- // case, IE will be invalidated and we also have to check the current
- // terminator.
- while (II != IE && &*II != BB.getTerminator()) {
- CallBase *CB = dyn_cast<CallBase>(&*II++);
- if (!CB || isa<IntrinsicInst>(CB) || isInstructionTriviallyDead(CB, &TLI))
- continue;
-
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- continue;
-
- // Successful musttail call-site splits result in erased CI and erased BB.
- // Check if such path is possible before attempting the splitting.
- bool IsMustTail = CB->isMustTailCall();
-
- Changed |= tryToSplitCallSite(*CB, TTI, DTU);
-
- // There are no interesting instructions after this. The call site
- // itself might have been erased on splitting.
- if (IsMustTail)
- break;
- }
- }
- return Changed;
-}
-
-namespace {
-struct CallSiteSplittingLegacyPass : public FunctionPass {
- static char ID;
- CallSiteSplittingLegacyPass() : FunctionPass(ID) {
- initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return doCallSiteSplitting(F, TLI, TTI, DT);
- }
-};
-} // namespace
-
-char CallSiteSplittingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
- "Call-site splitting", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
- "Call-site splitting", false, false)
-FunctionPass *llvm::createCallSiteSplittingPass() {
- return new CallSiteSplittingLegacyPass();
-}
-
-PreservedAnalyses CallSiteSplittingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
-
- if (!doCallSiteSplitting(F, TLI, TTI, DT))
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
+ for (auto &InstBeforeCall :
+ llvm::make_range(CallSiteBB->begin(), CB.getIterator())) {
+ Cost += TTI.getInstructionCost(&InstBeforeCall,
+ TargetTransformInfo::TCK_CodeSize);
+ if (Cost >= DuplicationThreshold)
+ return false;
+ }
+
+ return true;
+}
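+
+// Sizing sketch for the check above (hypothetical costs): with the default
+// callsite-splitting-duplication-threshold of 5, a call-site block that starts
+// with four instructions of TCK_CodeSize cost 1 each before the call can still
+// be split, since all of them must be duplicated into both split blocks; a
+// fifth such instruction pushes the accumulated cost to the threshold and the
+// split is rejected.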
+
+static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
+ Value *V) {
+ Instruction *Copy = I->clone();
+ Copy->setName(I->getName());
+ Copy->insertBefore(Before);
+ if (V)
+ Copy->setOperand(0, V);
+ return Copy;
+}
+
+/// Copy mandatory `musttail` return sequence that follows original `CI`, and
+/// link it up to `NewCI` value instead:
+///
+/// * (optional) `bitcast NewCI to ...`
+/// * `ret bitcast or NewCI`
+///
+/// Insert this sequence right before `SplitBB`'s terminator, which will be
+/// cleaned up later in `splitCallSite` below.
+static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
+ Instruction *NewCI) {
+ bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
+ auto II = std::next(CI->getIterator());
+
+ BitCastInst* BCI = dyn_cast<BitCastInst>(&*II);
+ if (BCI)
+ ++II;
+
+ ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
+ assert(RI && "`musttail` call must be followed by `ret` instruction");
+
+ Instruction *TI = SplitBB->getTerminator();
+ Value *V = NewCI;
+ if (BCI)
+ V = cloneInstForMustTail(BCI, TI, V);
+ cloneInstForMustTail(RI, TI, IsVoid ? nullptr : V);
+
+ // FIXME: remove TI here, `DuplicateInstructionsInSplitBetween` has a bug
+ // that prevents doing this now.
+}
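+
+// Illustrative shape (hypothetical IR) of the sequence handled above; a
+// musttail call may only be followed by an optional bitcast of its result and
+// the return:
+//
+//   %r = musttail call i8* @callee(i8* %p)
+//   ret i8* %r
+//
+// copyMustTailReturn clones that tail (bitcast included when present) in front
+// of SplitBB's terminator and points it at the new call instead.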
+
+/// For each (predecessor, conditions from predecessors) pair, it will split the
+/// basic block containing the call site, hook it up to the predecessor and
+/// replace the call instruction with new call instructions, which contain
+/// constraints based on the conditions from their predecessors.
+/// For example, in the IR below with an OR condition, the call-site can
+/// be split. In this case, Preds for Tail is [(Header, a == null),
+/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
+/// CallInst1, which has constraints based on the conditions from Head and
+/// CallInst2, which has constraints based on the conditions coming from TBB.
+///
+/// From :
+///
+/// Header:
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail, %TBB
+/// TBB:
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c %Tail, %End
+/// Tail:
+/// %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+/// Header: // PredBB1 is Header
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail-split1, %TBB
+/// TBB: // PredBB2 is TBB
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c %Tail-split2, %End
+/// Tail-split1:
+/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1
+/// br %Tail
+/// Tail-split2:
+/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+/// br %Tail
+/// Tail:
+/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that in case any arguments at the call-site are constrained by its
+/// predecessors, new call-sites with more constrained arguments will be
+/// created in createCallSitesOnPredicatedArgument().
+static void splitCallSite(
+ CallBase &CB,
+ const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
+ DomTreeUpdater &DTU) {
+ BasicBlock *TailBB = CB.getParent();
+ bool IsMustTailCall = CB.isMustTailCall();
+
+ PHINode *CallPN = nullptr;
+
+ // `musttail` calls must be followed by an optional `bitcast` and a `ret`. The
+ // split blocks will be terminated right after that, so there are no users for
+ // this phi in `TailBB`.
+ if (!IsMustTailCall && !CB.use_empty()) {
+ CallPN = PHINode::Create(CB.getType(), Preds.size(), "phi.call");
+ CallPN->setDebugLoc(CB.getDebugLoc());
+ }
+
+ LLVM_DEBUG(dbgs() << "split call-site : " << CB << " into \n");
+
+ assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
+ // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
+ // here.
+ ValueToValueMapTy ValueToValueMaps[2];
+ for (unsigned i = 0; i < Preds.size(); i++) {
+ BasicBlock *PredBB = Preds[i].first;
+ BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+ TailBB, PredBB, &*std::next(CB.getIterator()), ValueToValueMaps[i],
+ DTU);
+ assert(SplitBlock && "Unexpected new basic block split.");
+
+ auto *NewCI =
+ cast<CallBase>(&*std::prev(SplitBlock->getTerminator()->getIterator()));
+ addConditions(*NewCI, Preds[i].second);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (PHINode &PN : TailBB->phis()) {
+ unsigned ArgNo = 0;
+ for (auto &CI : CB.args()) {
+ if (&*CI == &PN) {
+ NewCI->setArgOperand(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
+ }
+ ++ArgNo;
+ }
+ }
+ LLVM_DEBUG(dbgs() << " " << *NewCI << " in " << SplitBlock->getName()
+ << "\n");
+ if (CallPN)
+ CallPN->addIncoming(NewCI, SplitBlock);
+
+ // Clone and place bitcast and return instructions before `TI`
+ if (IsMustTailCall)
+ copyMustTailReturn(SplitBlock, &CB, NewCI);
+ }
+
+ NumCallSiteSplit++;
+
+ // FIXME: remove TI in `copyMustTailReturn`
+ if (IsMustTailCall) {
+ // Remove superfluous `br` terminators from the end of the Split blocks
+ // NOTE: Removing terminator removes the SplitBlock from the TailBB's
+ // predecessors. Therefore we must get complete list of Splits before
+ // attempting removal.
+ SmallVector<BasicBlock *, 2> Splits(predecessors((TailBB)));
+ assert(Splits.size() == 2 && "Expected exactly 2 splits!");
+ for (unsigned i = 0; i < Splits.size(); i++) {
+ Splits[i]->getTerminator()->eraseFromParent();
+ DTU.applyUpdatesPermissive({{DominatorTree::Delete, Splits[i], TailBB}});
+ }
+
+ // Erase the tail block once done with musttail patching
+ DTU.deleteBB(TailBB);
+ return;
+ }
+
+ auto *OriginalBegin = &*TailBB->begin();
+ // Replace users of the original call with a PHI merging the split call-sites.
+ if (CallPN) {
+ CallPN->insertBefore(OriginalBegin);
+ CB.replaceAllUsesWith(CallPN);
+ }
+
+ // Remove instructions moved to split blocks from TailBB, from the duplicated
+ // call instruction to the beginning of the basic block. If an instruction
+ // has any uses, add a new PHI node to combine the values coming from the
+ // split blocks. The new PHI nodes are placed before the first original
+ // instruction, so we do not end up deleting them. By using reverse-order, we
+ // do not introduce unnecessary PHI nodes for def-use chains from the call
+ // instruction to the beginning of the block.
+ auto I = CB.getReverseIterator();
+ while (I != TailBB->rend()) {
+ Instruction *CurrentI = &*I++;
+ if (!CurrentI->use_empty()) {
+ // If an existing PHI has users after the call, there is no need to create
+ // a new one.
+ if (isa<PHINode>(CurrentI))
+ continue;
+ PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
+ NewPN->setDebugLoc(CurrentI->getDebugLoc());
+ for (auto &Mapping : ValueToValueMaps)
+ NewPN->addIncoming(Mapping[CurrentI],
+ cast<Instruction>(Mapping[CurrentI])->getParent());
+ NewPN->insertBefore(&*TailBB->begin());
+ CurrentI->replaceAllUsesWith(NewPN);
+ }
+ CurrentI->eraseFromParent();
+ // We are done once we handled the first original instruction in TailBB.
+ if (CurrentI == OriginalBegin)
+ break;
+ }
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallBase &CB) {
+ BasicBlock *Parent = CB.getParent();
+ if (&CB != Parent->getFirstNonPHIOrDbg())
+ return false;
+
+ for (auto &PN : Parent->phis()) {
+ for (auto &Arg : CB.args()) {
+ if (&*Arg != &PN)
+ continue;
+ assert(PN.getNumIncomingValues() == 2 &&
+ "Unexpected number of incoming values");
+ if (PN.getIncomingBlock(0) == PN.getIncomingBlock(1))
+ return false;
+ if (PN.getIncomingValue(0) == PN.getIncomingValue(1))
+ continue;
+ if (isa<Constant>(PN.getIncomingValue(0)) &&
+ isa<Constant>(PN.getIncomingValue(1)))
+ return true;
+ }
+ }
+ return false;
+}
+
+using PredsWithCondsTy = SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2>;
+
+// Check if any of the arguments in CS are predicated on a PHI node and return
+// the set of predecessors we should use for splitting.
+static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallBase &CB) {
+ if (!isPredicatedOnPHI(CB))
+ return {};
+
+ auto Preds = getTwoPredecessors(CB.getParent());
+ return {{Preds[0], {}}, {Preds[1], {}}};
+}
+
+// Checks if any of the arguments in CS are predicated in a predecessor and
+// returns a list of predecessors with the conditions that hold on their edges
+// to CS.
+static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallBase &CB,
+ DomTreeUpdater &DTU) {
+ auto Preds = getTwoPredecessors(CB.getParent());
+ if (Preds[0] == Preds[1])
+ return {};
+
+ // We can stop recording conditions once we reach the immediate dominator of
+ // the block containing the call site. Conditions in predecessors of that
+ // node will be the same for all paths to the call site, so splitting is not
+ // beneficial.
+ assert(DTU.hasDomTree() && "We need a DTU with a valid DT!");
+ auto *CSDTNode = DTU.getDomTree().getNode(CB.getParent());
+ BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr;
+
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
+ for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
+ ConditionsTy Conditions;
+ // Record condition on edge BB(CS) <- Pred
+ recordCondition(CB, Pred, CB.getParent(), Conditions);
+ // Record conditions following Pred's single predecessors.
+ recordConditions(CB, Pred, Conditions, StopAt);
+ PredsCS.push_back({Pred, Conditions});
+ }
+
+ if (all_of(PredsCS, [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+ return P.second.empty();
+ }))
+ return {};
+
+ return PredsCS;
+}
+
+static bool tryToSplitCallSite(CallBase &CB, TargetTransformInfo &TTI,
+ DomTreeUpdater &DTU) {
+ // Check if we can split the call site.
+ if (!CB.arg_size() || !canSplitCallSite(CB, TTI))
+ return false;
+
+ auto PredsWithConds = shouldSplitOnPredicatedArgument(CB, DTU);
+ if (PredsWithConds.empty())
+ PredsWithConds = shouldSplitOnPHIPredicatedArgument(CB);
+ if (PredsWithConds.empty())
+ return false;
+
+ splitCallSite(CB, PredsWithConds, DTU);
+ return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
+ TargetTransformInfo &TTI, DominatorTree &DT) {
+
+ DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ bool Changed = false;
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+ BasicBlock &BB = *BI++;
+ auto II = BB.getFirstNonPHIOrDbg()->getIterator();
+ auto IE = BB.getTerminator()->getIterator();
+ // Iterate until we reach the terminator instruction. tryToSplitCallSite
+ // can replace BB's terminator in case BB is a successor of itself. In that
+ // case, IE will be invalidated and we also have to check the current
+ // terminator.
+ while (II != IE && &*II != BB.getTerminator()) {
+ CallBase *CB = dyn_cast<CallBase>(&*II++);
+ if (!CB || isa<IntrinsicInst>(CB) || isInstructionTriviallyDead(CB, &TLI))
+ continue;
+
+ Function *Callee = CB->getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ // Successful musttail call-site splits result in erased CI and erased BB.
+ // Check if such path is possible before attempting the splitting.
+ bool IsMustTail = CB->isMustTailCall();
+
+ Changed |= tryToSplitCallSite(*CB, TTI, DTU);
+
+ // There are no interesting instructions after this. The call site
+ // itself might have been erased on splitting.
+ if (IsMustTail)
+ break;
+ }
+ }
+ return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+ static char ID;
+ CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+ initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return doCallSiteSplitting(F, TLI, TTI, DT);
+ }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+ return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+ if (!doCallSiteSplitting(F, TLI, TTI, DT))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
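+
+// Usage sketch (assuming the standard opt driver and the usual registration of
+// this pass in the new pass manager's registry): the new-pass-manager spelling
+// is
+//
+//   opt -passes=callsite-splitting input.ll -S -o -
+//
+// while legacy pipelines construct the pass through
+// createCallSiteSplittingPass().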
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp
index 57fb8492d7..fdab74fc94 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -1,991 +1,991 @@
-//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass identifies expensive constants to hoist and coalesces them to
-// better prepare them for SelectionDAG-based code generation. This works around
-// the limitations of the basic-block-at-a-time approach.
-//
-// First it scans all instructions for integer constants and calculates their
-// cost. If the constant can be folded into the instruction (the cost is
-// TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't
-// consider it expensive and leave it alone. This is the default behavior and
-// the default implementation of getIntImmCostInst will always return TCC_Free.
-//
-// If the cost is more than TCC_BASIC, then the integer constant can't be folded
-// into the instruction and it might be beneficial to hoist the constant.
-// Similar constants are coalesced to reduce register pressure and
-// materialization code.
-//
-// When a constant is hoisted, it is also hidden behind a bitcast to force it to
-// be live-out of the basic block. Otherwise the constant would be just
-// duplicated and each basic block would have its own copy in the SelectionDAG.
-// The SelectionDAG recognizes such constants as opaque and doesn't perform
-// certain transformations on them, which would create a new expensive constant.
-//
-// This optimization is only applied to integer constants in instructions and
-// simple (this means not nested) constant cast expressions. For example:
-// %0 = load i64* inttoptr (i64 big_constant to i64*)
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/ConstantHoisting.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-using namespace consthoist;
-
-#define DEBUG_TYPE "consthoist"
-
-STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
-STATISTIC(NumConstantsRebased, "Number of constants rebased");
-
-static cl::opt<bool> ConstHoistWithBlockFrequency(
- "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to reduce the "
- "chance to execute const materialization more frequently than "
- "without hoisting."));
-
-static cl::opt<bool> ConstHoistGEP(
- "consthoist-gep", cl::init(false), cl::Hidden,
- cl::desc("Try hoisting constant gep expressions"));
-
-static cl::opt<unsigned>
-MinNumOfDependentToRebase("consthoist-min-num-to-rebase",
- cl::desc("Do not rebase if number of dependent constants of a Base is less "
- "than this number."),
- cl::init(0), cl::Hidden);
-
-namespace {
-
-/// The constant hoisting pass.
-class ConstantHoistingLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- ConstantHoistingLegacyPass() : FunctionPass(ID) {
- initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &Fn) override;
-
- StringRef getPassName() const override { return "Constant Hoisting"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- if (ConstHoistWithBlockFrequency)
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
-private:
- ConstantHoistingPass Impl;
-};
-
-} // end anonymous namespace
-
-char ConstantHoistingLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
- "Constant Hoisting", false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
- "Constant Hoisting", false, false)
-
-FunctionPass *llvm::createConstantHoistingPass() {
- return new ConstantHoistingLegacyPass();
-}
-
-/// Perform the constant hoisting optimization for the given function.
-bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
- if (skipFunction(Fn))
- return false;
-
- LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
- LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
-
- bool MadeChange =
- Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- ConstHoistWithBlockFrequency
- ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
- : nullptr,
- Fn.getEntryBlock(),
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
-
- if (MadeChange) {
- LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
- << Fn.getName() << '\n');
- LLVM_DEBUG(dbgs() << Fn);
- }
- LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
-
- return MadeChange;
-}
-
-/// Find the constant materialization insertion point.
-Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
- unsigned Idx) const {
- // If the operand is a cast instruction, then we have to materialize the
- // constant before the cast instruction.
- if (Idx != ~0U) {
- Value *Opnd = Inst->getOperand(Idx);
- if (auto CastInst = dyn_cast<Instruction>(Opnd))
- if (CastInst->isCast())
- return CastInst;
- }
-
- // The simple and common case. This also includes constant expressions.
- if (!isa<PHINode>(Inst) && !Inst->isEHPad())
- return Inst;
-
- // We can't insert directly before a phi node or an eh pad. Insert before
- // the terminator of the incoming or dominating block.
- assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
- if (Idx != ~0U && isa<PHINode>(Inst))
- return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
-
- // This must be an EH pad. Iterate over immediate dominators until we find a
- // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
- // and terminators.
- auto IDom = DT->getNode(Inst->getParent())->getIDom();
- while (IDom->getBlock()->isEHPad()) {
- assert(Entry != IDom->getBlock() && "eh pad in entry block");
- IDom = IDom->getIDom();
- }
-
- return IDom->getBlock()->getTerminator();
-}
-
-/// Given \p BBs as input, find another set of BBs which collectively
-/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
-/// set found in \p BBs.
-static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
- BasicBlock *Entry,
- SetVector<BasicBlock *> &BBs) {
- assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
- // Nodes on the current path to the root.
- SmallPtrSet<BasicBlock *, 8> Path;
- // Candidates includes any block 'BB' in set 'BBs' that is not strictly
- // dominated by any other blocks in set 'BBs', and all nodes in the path
- // in the dominator tree from Entry to 'BB'.
- SmallPtrSet<BasicBlock *, 16> Candidates;
- for (auto BB : BBs) {
- // Ignore unreachable basic blocks.
- if (!DT.isReachableFromEntry(BB))
- continue;
- Path.clear();
- // Walk up the dominator tree until Entry or another BB in BBs
- // is reached. Insert the nodes on the way to the Path.
- BasicBlock *Node = BB;
- // The "Path" is a candidate path to be added into Candidates set.
- bool isCandidate = false;
- do {
- Path.insert(Node);
- if (Node == Entry || Candidates.count(Node)) {
- isCandidate = true;
- break;
- }
- assert(DT.getNode(Node)->getIDom() &&
- "Entry doens't dominate current Node");
- Node = DT.getNode(Node)->getIDom()->getBlock();
- } while (!BBs.count(Node));
-
- // If isCandidate is false, Node is another Block in BBs dominating
- // current 'BB'. Drop the nodes on the Path.
- if (!isCandidate)
- continue;
-
- // Add nodes on the Path into Candidates.
- Candidates.insert(Path.begin(), Path.end());
- }
-
- // Sort the nodes in Candidates in top-down order and save the nodes
- // in Orders.
- unsigned Idx = 0;
- SmallVector<BasicBlock *, 16> Orders;
- Orders.push_back(Entry);
- while (Idx != Orders.size()) {
- BasicBlock *Node = Orders[Idx++];
- for (auto ChildDomNode : DT.getNode(Node)->children()) {
- if (Candidates.count(ChildDomNode->getBlock()))
- Orders.push_back(ChildDomNode->getBlock());
- }
- }
-
- // Visit Orders in bottom-up order.
- using InsertPtsCostPair =
- std::pair<SetVector<BasicBlock *>, BlockFrequency>;
-
- // InsertPtsMap is a map from a BB to the best insertion points for the
- // subtree of BB (subtree not including the BB itself).
- DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
- InsertPtsMap.reserve(Orders.size() + 1);
- for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
- BasicBlock *Node = *RIt;
- bool NodeInBBs = BBs.count(Node);
- auto &InsertPts = InsertPtsMap[Node].first;
- BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
-
- // Return the optimal insert points in BBs.
- if (Node == Entry) {
- BBs.clear();
- if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
- (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
- BBs.insert(Entry);
- else
- BBs.insert(InsertPts.begin(), InsertPts.end());
- break;
- }
-
- BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
- // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
- // will update its parent's ParentInsertPts and ParentPtsFreq.
- auto &ParentInsertPts = InsertPtsMap[Parent].first;
- BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
- // Choose to insert in Node or in subtree of Node.
- // Don't hoist to EHPad because we may not find a proper place to insert
- // in EHPad.
- // If the total frequency of InsertPts is the same as the frequency of the
- // target Node, and InsertPts contains more than one nodes, choose hoisting
- // to reduce code size.
- if (NodeInBBs ||
- (!Node->isEHPad() &&
- (InsertPtsFreq > BFI.getBlockFreq(Node) ||
- (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
- ParentInsertPts.insert(Node);
- ParentPtsFreq += BFI.getBlockFreq(Node);
- } else {
- ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
- ParentPtsFreq += InsertPtsFreq;
- }
- }
-}
-
-/// Find an insertion point that dominates all uses.
-SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
- const ConstantInfo &ConstInfo) const {
- assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
- // Collect all basic blocks.
- SetVector<BasicBlock *> BBs;
- SetVector<Instruction *> InsertPts;
- for (auto const &RCI : ConstInfo.RebasedConstants)
- for (auto const &U : RCI.Uses)
- BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
-
- if (BBs.count(Entry)) {
- InsertPts.insert(&Entry->front());
- return InsertPts;
- }
-
- if (BFI) {
- findBestInsertionSet(*DT, *BFI, Entry, BBs);
- for (auto BB : BBs) {
- BasicBlock::iterator InsertPt = BB->begin();
- for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
- ;
- InsertPts.insert(&*InsertPt);
- }
- return InsertPts;
- }
-
- while (BBs.size() >= 2) {
- BasicBlock *BB, *BB1, *BB2;
- BB1 = BBs.pop_back_val();
- BB2 = BBs.pop_back_val();
- BB = DT->findNearestCommonDominator(BB1, BB2);
- if (BB == Entry) {
- InsertPts.insert(&Entry->front());
- return InsertPts;
- }
- BBs.insert(BB);
- }
- assert((BBs.size() == 1) && "Expected only one element.");
- Instruction &FirstInst = (*BBs.begin())->front();
- InsertPts.insert(findMatInsertPt(&FirstInst));
- return InsertPts;
-}
-
-/// Record constant integer ConstInt for instruction Inst at operand
-/// index Idx.
-///
-/// The operand at index Idx is not necessarily the constant integer itself. It
-/// could also be a cast instruction or a constant expression that uses the
-/// constant integer.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
- ConstantInt *ConstInt) {
- unsigned Cost;
- // Ask the target about the cost of materializing the constant for the given
- // instruction and operand index.
- if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
- Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx,
- ConstInt->getValue(), ConstInt->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
- else
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare them for SelectionDAG-based code generation. This works around
+// the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates their
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCostInst will always return TCC_Free.
+//
+// If the cost is more than TCC_BASIC, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (this means not nested) constant cast expressions. For example:
+// %0 = load i64* inttoptr (i64 big_constant to i64*)
+//===----------------------------------------------------------------------===//
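As a rough, standalone sketch of the cost test described above (the helper name is made up, and the TargetTransformInfo reference is assumed to come from the caller; this is an illustration, not part of the pass), an operand is only worth hoisting when the target reports its materialization cost as more than a basic operation:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"

  // Hypothetical helper mirroring the policy above: anything costlier than
  // TCC_Basic is a hoisting candidate; TCC_Free/TCC_Basic constants are left
  // alone because the instruction can absorb them.
  static bool isHoistingCandidate(const llvm::TargetTransformInfo &TTI,
                                  llvm::Instruction *Inst, unsigned Idx,
                                  llvm::ConstantInt *ConstInt) {
    int Cost = TTI.getIntImmCostInst(
        Inst->getOpcode(), Idx, ConstInt->getValue(), ConstInt->getType(),
        llvm::TargetTransformInfo::TCK_SizeAndLatency, Inst);
    return Cost > llvm::TargetTransformInfo::TCC_Basic;
  }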
+
+#include "llvm/Transforms/Scalar/ConstantHoisting.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+using namespace consthoist;
+
+#define DEBUG_TYPE "consthoist"
+
+STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
+STATISTIC(NumConstantsRebased, "Number of constants rebased");
+
+static cl::opt<bool> ConstHoistWithBlockFrequency(
+ "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to reduce the "
+ "chance to execute const materialization more frequently than "
+ "without hoisting."));
+
+static cl::opt<bool> ConstHoistGEP(
+ "consthoist-gep", cl::init(false), cl::Hidden,
+ cl::desc("Try hoisting constant gep expressions"));
+
+static cl::opt<unsigned>
+MinNumOfDependentToRebase("consthoist-min-num-to-rebase",
+ cl::desc("Do not rebase if number of dependent constants of a Base is less "
+ "than this number."),
+ cl::init(0), cl::Hidden);
+
+namespace {
+
+/// The constant hoisting pass.
+class ConstantHoistingLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ ConstantHoistingLegacyPass() : FunctionPass(ID) {
+ initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &Fn) override;
+
+ StringRef getPassName() const override { return "Constant Hoisting"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ if (ConstHoistWithBlockFrequency)
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+private:
+ ConstantHoistingPass Impl;
+};
+
+} // end anonymous namespace
+
+char ConstantHoistingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+
+FunctionPass *llvm::createConstantHoistingPass() {
+ return new ConstantHoistingLegacyPass();
+}
+
+/// Perform the constant hoisting optimization for the given function.
+bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
+ if (skipFunction(Fn))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+ bool MadeChange =
+ Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ ConstHoistWithBlockFrequency
+ ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
+ : nullptr,
+ Fn.getEntryBlock(),
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
+
+ if (MadeChange) {
+ LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << Fn);
+ }
+ LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+
+ return MadeChange;
+}
+
+/// Find the constant materialization insertion point.
+Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
+ unsigned Idx) const {
+ // If the operand is a cast instruction, then we have to materialize the
+ // constant before the cast instruction.
+ if (Idx != ~0U) {
+ Value *Opnd = Inst->getOperand(Idx);
+ if (auto CastInst = dyn_cast<Instruction>(Opnd))
+ if (CastInst->isCast())
+ return CastInst;
+ }
+
+ // The simple and common case. This also includes constant expressions.
+ if (!isa<PHINode>(Inst) && !Inst->isEHPad())
+ return Inst;
+
+ // We can't insert directly before a phi node or an eh pad. Insert before
+ // the terminator of the incoming or dominating block.
+ assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
+ if (Idx != ~0U && isa<PHINode>(Inst))
+ return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
+
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
+}
+
+/// Given \p BBs as input, find another set of BBs which collectively
+/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
+/// set found in \p BBs.
+static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
+ BasicBlock *Entry,
+ SetVector<BasicBlock *> &BBs) {
+ assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
+ // Nodes on the current path to the root.
+ SmallPtrSet<BasicBlock *, 8> Path;
+ // Candidates includes any block 'BB' in set 'BBs' that is not strictly
+ // dominated by any other blocks in set 'BBs', and all nodes in the path
+ // in the dominator tree from Entry to 'BB'.
+ SmallPtrSet<BasicBlock *, 16> Candidates;
+ for (auto BB : BBs) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(BB))
+ continue;
+ Path.clear();
+ // Walk up the dominator tree until Entry or another BB in BBs
+ // is reached. Insert the nodes along the way into the Path.
+ BasicBlock *Node = BB;
+ // The "Path" is a candidate path to be added into Candidates set.
+ bool isCandidate = false;
+ do {
+ Path.insert(Node);
+ if (Node == Entry || Candidates.count(Node)) {
+ isCandidate = true;
+ break;
+ }
+ assert(DT.getNode(Node)->getIDom() &&
+ "Entry doesn't dominate current Node");
+ Node = DT.getNode(Node)->getIDom()->getBlock();
+ } while (!BBs.count(Node));
+
+ // If isCandidate is false, Node is another Block in BBs dominating
+ // current 'BB'. Drop the nodes on the Path.
+ if (!isCandidate)
+ continue;
+
+ // Add nodes on the Path into Candidates.
+ Candidates.insert(Path.begin(), Path.end());
+ }
+
+ // Sort the nodes in Candidates in top-down order and save the nodes
+ // in Orders.
+ unsigned Idx = 0;
+ SmallVector<BasicBlock *, 16> Orders;
+ Orders.push_back(Entry);
+ while (Idx != Orders.size()) {
+ BasicBlock *Node = Orders[Idx++];
+ for (auto ChildDomNode : DT.getNode(Node)->children()) {
+ if (Candidates.count(ChildDomNode->getBlock()))
+ Orders.push_back(ChildDomNode->getBlock());
+ }
+ }
+
+ // Visit Orders in bottom-up order.
+ using InsertPtsCostPair =
+ std::pair<SetVector<BasicBlock *>, BlockFrequency>;
+
+ // InsertPtsMap is a map from a BB to the best insertion points for the
+ // subtree of BB (subtree not including the BB itself).
+ DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
+ InsertPtsMap.reserve(Orders.size() + 1);
+ for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
+ BasicBlock *Node = *RIt;
+ bool NodeInBBs = BBs.count(Node);
+ auto &InsertPts = InsertPtsMap[Node].first;
+ BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
+
+ // Return the optimal insert points in BBs.
+ if (Node == Entry) {
+ BBs.clear();
+ if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
+ BBs.insert(Entry);
+ else
+ BBs.insert(InsertPts.begin(), InsertPts.end());
+ break;
+ }
+
+ BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
+ // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
+ // will update its parent's ParentInsertPts and ParentPtsFreq.
+ auto &ParentInsertPts = InsertPtsMap[Parent].first;
+ BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
+ // Choose to insert in Node or in subtree of Node.
+ // Don't hoist to EHPad because we may not find a proper place to insert
+ // in EHPad.
+ // If the total frequency of InsertPts is the same as the frequency of the
+ // target Node, and InsertPts contains more than one node, choose hoisting
+ // to reduce code size.
+ if (NodeInBBs ||
+ (!Node->isEHPad() &&
+ (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
+ ParentInsertPts.insert(Node);
+ ParentPtsFreq += BFI.getBlockFreq(Node);
+ } else {
+ ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
+ ParentPtsFreq += InsertPtsFreq;
+ }
+ }
+}
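The comparison driving the loop above can be exercised in isolation. The following standalone sketch (plain C++; the helper name and the frequencies are invented for illustration) applies the same rule: materialize once in the dominating parent when the children are collectively hotter, or equally hot but more numerous:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Decide whether to hoist a materialization from a set of child blocks into
  // their common dominator, using the same frequency comparison as above.
  static bool hoistToParent(uint64_t ParentFreq,
                            const std::vector<uint64_t> &ChildFreqs) {
    uint64_t Sum = 0;
    for (uint64_t F : ChildFreqs)
      Sum += F;
    // Hoist if the children run more often in total, or equally often but
    // there is more than one of them (pure code-size win).
    return Sum > ParentFreq || (Sum == ParentFreq && ChildFreqs.size() > 1);
  }

  int main() {
    std::printf("%d\n", hoistToParent(100, {60, 70})); // 1: children are hotter
    std::printf("%d\n", hoistToParent(100, {40}));     // 0: keep it in the child
    return 0;
  }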
+
+/// Find an insertion point that dominates all uses.
+SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
+ const ConstantInfo &ConstInfo) const {
+ assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
+ // Collect all basic blocks.
+ SetVector<BasicBlock *> BBs;
+ SetVector<Instruction *> InsertPts;
+ for (auto const &RCI : ConstInfo.RebasedConstants)
+ for (auto const &U : RCI.Uses)
+ BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
+
+ if (BBs.count(Entry)) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+
+ if (BFI) {
+ findBestInsertionSet(*DT, *BFI, Entry, BBs);
+ for (auto BB : BBs) {
+ BasicBlock::iterator InsertPt = BB->begin();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ ;
+ InsertPts.insert(&*InsertPt);
+ }
+ return InsertPts;
+ }
+
+ while (BBs.size() >= 2) {
+ BasicBlock *BB, *BB1, *BB2;
+ BB1 = BBs.pop_back_val();
+ BB2 = BBs.pop_back_val();
+ BB = DT->findNearestCommonDominator(BB1, BB2);
+ if (BB == Entry) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+ BBs.insert(BB);
+ }
+ assert((BBs.size() == 1) && "Expected only one element.");
+ Instruction &FirstInst = (*BBs.begin())->front();
+ InsertPts.insert(findMatInsertPt(&FirstInst));
+ return InsertPts;
+}
+
+/// Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+/// constant integer.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
+ ConstantInt *ConstInt) {
+ unsigned Cost;
+ // Ask the target about the cost of materializing the constant for the given
+ // instruction and operand index.
+ if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+ Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx,
+ ConstInt->getValue(), ConstInt->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+ else
Cost = TTI->getIntImmCostInst(
Inst->getOpcode(), Idx, ConstInt->getValue(), ConstInt->getType(),
TargetTransformInfo::TCK_SizeAndLatency, Inst);
-
- // Ignore cheap integer constants.
- if (Cost > TargetTransformInfo::TCC_Basic) {
- ConstCandMapType::iterator Itr;
- bool Inserted;
- ConstPtrUnionType Cand = ConstInt;
- std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
- if (Inserted) {
- ConstIntCandVec.push_back(ConstantCandidate(ConstInt));
- Itr->second = ConstIntCandVec.size() - 1;
- }
- ConstIntCandVec[Itr->second].addUser(Inst, Idx, Cost);
- LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
- << "Collect constant " << *ConstInt << " from " << *Inst
- << " with cost " << Cost << '\n';
- else dbgs() << "Collect constant " << *ConstInt
- << " indirectly from " << *Inst << " via "
- << *Inst->getOperand(Idx) << " with cost " << Cost
- << '\n';);
- }
-}
-
-/// Record constant GEP expression for instruction Inst at operand index Idx.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
- ConstantExpr *ConstExpr) {
- // TODO: Handle vector GEPs
- if (ConstExpr->getType()->isVectorTy())
- return;
-
- GlobalVariable *BaseGV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
- if (!BaseGV)
- return;
-
- // Get offset from the base GV.
- PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType());
- IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace());
- APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true);
- auto *GEPO = cast<GEPOperator>(ConstExpr);
- if (!GEPO->accumulateConstantOffset(*DL, Offset))
- return;
-
- if (!Offset.isIntN(32))
- return;
-
- // A constant GEP expression that has a GlobalVariable as base pointer is
- // usually lowered to a load from the constant pool. Such an operation is
- // unlikely to be cheaper than computing it as <Base + Offset>, which can be
- // lowered to an ADD instruction or folded into a Load/Store instruction.
+
+ // Ignore cheap integer constants.
+ if (Cost > TargetTransformInfo::TCC_Basic) {
+ ConstCandMapType::iterator Itr;
+ bool Inserted;
+ ConstPtrUnionType Cand = ConstInt;
+ std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
+ if (Inserted) {
+ ConstIntCandVec.push_back(ConstantCandidate(ConstInt));
+ Itr->second = ConstIntCandVec.size() - 1;
+ }
+ ConstIntCandVec[Itr->second].addUser(Inst, Idx, Cost);
+ LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
+ << "Collect constant " << *ConstInt << " from " << *Inst
+ << " with cost " << Cost << '\n';
+ else dbgs() << "Collect constant " << *ConstInt
+ << " indirectly from " << *Inst << " via "
+ << *Inst->getOperand(Idx) << " with cost " << Cost
+ << '\n';);
+ }
+}
+
+/// Record constant GEP expression for instruction Inst at operand index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
+ ConstantExpr *ConstExpr) {
+ // TODO: Handle vector GEPs
+ if (ConstExpr->getType()->isVectorTy())
+ return;
+
+ GlobalVariable *BaseGV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (!BaseGV)
+ return;
+
+ // Get offset from the base GV.
+ PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType());
+ IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace());
+ APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true);
+ auto *GEPO = cast<GEPOperator>(ConstExpr);
+ if (!GEPO->accumulateConstantOffset(*DL, Offset))
+ return;
+
+ if (!Offset.isIntN(32))
+ return;
+
+ // A constant GEP expression that has a GlobalVariable as base pointer is
+ // usually lowered to a load from the constant pool. Such an operation is
+ // unlikely to be cheaper than computing it as <Base + Offset>, which can be
+ // lowered to an ADD instruction or folded into a Load/Store instruction.
int Cost =
TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy,
TargetTransformInfo::TCK_SizeAndLatency, Inst);
- ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV];
- ConstCandMapType::iterator Itr;
- bool Inserted;
- ConstPtrUnionType Cand = ConstExpr;
- std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
- if (Inserted) {
- ExprCandVec.push_back(ConstantCandidate(
- ConstantInt::get(Type::getInt32Ty(*Ctx), Offset.getLimitedValue()),
- ConstExpr));
- Itr->second = ExprCandVec.size() - 1;
- }
- ExprCandVec[Itr->second].addUser(Inst, Idx, Cost);
-}
-
-/// Check the operand for instruction Inst at index Idx.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
- Value *Opnd = Inst->getOperand(Idx);
-
- // Visit constant integers.
- if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- return;
- }
-
- // Visit cast instructions that have constant integers.
- if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
- // Only visit cast instructions, which have been skipped. All other
- // instructions should have already been visited.
- if (!CastInst->isCast())
- return;
-
- if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the cast instruction.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- return;
- }
- }
-
- // Visit constant expressions that have constant integers.
- if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- // Handle constant gep expressions.
- if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing())
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr);
-
- // Only visit constant cast expressions.
- if (!ConstExpr->isCast())
- return;
-
- if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the constant expression.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- return;
- }
- }
-}
-
-/// Scan the instruction for expensive integer constants and record them
-/// in the constant candidate vector.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst) {
- // Skip all cast instructions. They are visited indirectly later on.
- if (Inst->isCast())
- return;
-
- // Scan all operands.
- for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
- // The cost of materializing the constants (defined in
- // `TargetTransformInfo::getIntImmCostInst`) for instructions which only
- // take constant variables is lower than `TargetTransformInfo::TCC_Basic`.
- // So it's safe for us to collect constant candidates from all
- // IntrinsicInsts.
- if (canReplaceOperandWithVariable(Inst, Idx)) {
- collectConstantCandidates(ConstCandMap, Inst, Idx);
- }
- } // end of for all operands
-}
-
-/// Collect all integer constants in the function that cannot be folded
-/// into an instruction itself.
-void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
- ConstCandMapType ConstCandMap;
- for (BasicBlock &BB : Fn) {
- // Ignore unreachable basic blocks.
- if (!DT->isReachableFromEntry(&BB))
- continue;
- for (Instruction &Inst : BB)
- collectConstantCandidates(ConstCandMap, &Inst);
- }
-}
-
-// This helper function is necessary to deal with values that have different
-// bit widths (APInt Operator- does not like that). If the value cannot be
-// represented in uint64 we return an "empty" APInt. This is then interpreted
-// as the value is not in range.
-static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) {
- Optional<APInt> Res = None;
- unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
- V1.getBitWidth() : V2.getBitWidth();
- uint64_t LimVal1 = V1.getLimitedValue();
- uint64_t LimVal2 = V2.getLimitedValue();
-
- if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
- return Res;
-
- uint64_t Diff = LimVal1 - LimVal2;
- return APInt(BW, Diff, true);
-}
-
-// From a list of constants, one needs to be picked as the base, and the other
-// constants will be transformed into offsets from that base constant. The
-// question is which one is the best pick. For example, consider these constants
-// and their number of uses:
-//
-// Constants| 2 | 4 | 12 | 42 |
-// NumUses | 3 | 2 | 8 | 7 |
-//
-// Selecting constant 12 because it has the most uses will generate negative
-// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
-// offsets lead to less optimal code generation, then there might be better
-// solutions. Suppose immediates in the range of 0..35 are most optimally
-// supported by the architecture, then selecting constant 2 is most optimal
-// because this will generate offsets: 0, 2, 10, 40. Offsets 0, 2 and 10 are in
-// range 0..35, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12 would
-// have only 8 uses in range, so choosing 2 as a base is more optimal. Thus, in
-// selecting the base constant the range of the offsets is a very important
-// factor, too, that we take into account here. This algorithm calculates a
-// total cost for selecting a constant as the base and subtracts the costs of
-// immediates that are out of range. It has quadratic complexity, so we only
-// call this function when we're optimising for size and there are fewer than
-// 100 constants; otherwise we fall back to the straightforward algorithm,
-// which does not do all the offset calculations.
-unsigned
-ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
- ConstCandVecType::iterator E,
- ConstCandVecType::iterator &MaxCostItr) {
- unsigned NumUses = 0;
-
- bool OptForSize = Entry->getParent()->hasOptSize() ||
- llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI,
- PGSOQueryType::IRPass);
- if (!OptForSize || std::distance(S,E) > 100) {
- for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
- NumUses += ConstCand->Uses.size();
- if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
- MaxCostItr = ConstCand;
- }
- return NumUses;
- }
-
- LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
- int MaxCost = -1;
- for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
- auto Value = ConstCand->ConstInt->getValue();
- Type *Ty = ConstCand->ConstInt->getType();
- int Cost = 0;
- NumUses += ConstCand->Uses.size();
- LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
- << "\n");
-
- for (auto User : ConstCand->Uses) {
- unsigned Opcode = User.Inst->getOpcode();
- unsigned OpndIdx = User.OpndIdx;
- Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty,
- TargetTransformInfo::TCK_SizeAndLatency);
- LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
-
- for (auto C2 = S; C2 != E; ++C2) {
- Optional<APInt> Diff = calculateOffsetDiff(
- C2->ConstInt->getValue(),
- ConstCand->ConstInt->getValue());
- if (Diff) {
- const int ImmCosts =
- TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
- Cost -= ImmCosts;
- LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
- << "has penalty: " << ImmCosts << "\n"
- << "Adjusted cost: " << Cost << "\n");
- }
- }
- }
- LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
- if (Cost > MaxCost) {
- MaxCost = Cost;
- MaxCostItr = ConstCand;
- LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
- << "\n");
- }
- }
- return NumUses;
-}
-
-/// Find the base constant within the given range and rebase all other
-/// constants with respect to the base constant.
-void ConstantHoistingPass::findAndMakeBaseConstant(
- ConstCandVecType::iterator S, ConstCandVecType::iterator E,
- SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec) {
- auto MaxCostItr = S;
- unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr);
-
- // Don't hoist constants that have only one use.
- if (NumUses <= 1)
- return;
-
- ConstantInt *ConstInt = MaxCostItr->ConstInt;
- ConstantExpr *ConstExpr = MaxCostItr->ConstExpr;
- ConstantInfo ConstInfo;
- ConstInfo.BaseInt = ConstInt;
- ConstInfo.BaseExpr = ConstExpr;
- Type *Ty = ConstInt->getType();
-
- // Rebase the constants with respect to the base constant.
- for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
- APInt Diff = ConstCand->ConstInt->getValue() - ConstInt->getValue();
- Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
- Type *ConstTy =
- ConstCand->ConstExpr ? ConstCand->ConstExpr->getType() : nullptr;
- ConstInfo.RebasedConstants.push_back(
- RebasedConstantInfo(std::move(ConstCand->Uses), Offset, ConstTy));
- }
- ConstInfoVec.push_back(std::move(ConstInfo));
-}
-
-/// Finds and combines constant candidates that can be easily
-/// rematerialized with an add from a common base constant.
-void ConstantHoistingPass::findBaseConstants(GlobalVariable *BaseGV) {
- // If BaseGV is nullptr, find base among candidate constant integers;
- // Otherwise find base among constant GEPs that share the same BaseGV.
- ConstCandVecType &ConstCandVec = BaseGV ?
- ConstGEPCandMap[BaseGV] : ConstIntCandVec;
- ConstInfoVecType &ConstInfoVec = BaseGV ?
- ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
-
- // Sort the constants by value and type. This invalidates the mapping!
- llvm::stable_sort(ConstCandVec, [](const ConstantCandidate &LHS,
- const ConstantCandidate &RHS) {
- if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
- return LHS.ConstInt->getType()->getBitWidth() <
- RHS.ConstInt->getType()->getBitWidth();
- return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
- });
-
- // Simple linear scan through the sorted constant candidate vector for viable
- // merge candidates.
- auto MinValItr = ConstCandVec.begin();
- for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
- CC != E; ++CC) {
- if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
- Type *MemUseValTy = nullptr;
- for (auto &U : CC->Uses) {
- auto *UI = U.Inst;
- if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
- MemUseValTy = LI->getType();
- break;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
- // Make sure the constant is used as pointer operand of the StoreInst.
- if (SI->getPointerOperand() == SI->getOperand(U.OpndIdx)) {
- MemUseValTy = SI->getValueOperand()->getType();
- break;
- }
- }
- }
-
- // Check if the constant is in range of an add with immediate.
- APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
- if ((Diff.getBitWidth() <= 64) &&
- TTI->isLegalAddImmediate(Diff.getSExtValue()) &&
- // Check if Diff can be used as offset in addressing mode of the user
- // memory instruction.
- (!MemUseValTy || TTI->isLegalAddressingMode(MemUseValTy,
- /*BaseGV*/nullptr, /*BaseOffset*/Diff.getSExtValue(),
- /*HasBaseReg*/true, /*Scale*/0)))
- continue;
- }
- // We now either have a different constant type or the constant is no longer
- // in range of an add with immediate.
- findAndMakeBaseConstant(MinValItr, CC, ConstInfoVec);
- // Start a new base constant search.
- MinValItr = CC;
- }
- // Finalize the last base constant search.
- findAndMakeBaseConstant(MinValItr, ConstCandVec.end(), ConstInfoVec);
-}
-
-/// Updates the operand at Idx in instruction Inst with the result of
-/// instruction Mat. If the instruction is a PHI node then special
-/// handling for duplicate values from the same incoming basic block is
-/// required.
-/// \return The update will always succeed, but the return value indicates
-/// whether Mat was used for the update or not.
-static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
- if (auto PHI = dyn_cast<PHINode>(Inst)) {
- // Check if any previous operand of the PHI node has the same incoming basic
- // block. This is a very odd case that happens when the incoming basic block
- // has a switch statement. In this case use the same value as the previous
- // operand(s), otherwise we will fail verification due to different values.
- // The values are actually the same, but the variable names are different
- // and the verifier doesn't like that.
- BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
- for (unsigned i = 0; i < Idx; ++i) {
- if (PHI->getIncomingBlock(i) == IncomingBB) {
- Value *IncomingVal = PHI->getIncomingValue(i);
- Inst->setOperand(Idx, IncomingVal);
- return false;
- }
- }
- }
-
- Inst->setOperand(Idx, Mat);
- return true;
-}
-
-/// Emit materialization code for all rebased constants and update their
-/// users.
-void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
- Constant *Offset,
- Type *Ty,
- const ConstantUser &ConstUser) {
- Instruction *Mat = Base;
-
- // The same offset can be dereferenced to different types in nested struct.
- if (!Offset && Ty && Ty != Base->getType())
- Offset = ConstantInt::get(Type::getInt32Ty(*Ctx), 0);
-
- if (Offset) {
- Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
- ConstUser.OpndIdx);
- if (Ty) {
- // Constant being rebased is a ConstantExpr.
- PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx,
- cast<PointerType>(Ty)->getAddressSpace());
- Base = new BitCastInst(Base, Int8PtrTy, "base_bitcast", InsertionPt);
- Mat = GetElementPtrInst::Create(Int8PtrTy->getElementType(), Base,
- Offset, "mat_gep", InsertionPt);
- Mat = new BitCastInst(Mat, Ty, "mat_bitcast", InsertionPt);
- } else
- // Constant being rebased is a ConstantInt.
- Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
- "const_mat", InsertionPt);
-
- LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
- << " + " << *Offset << ") in BB "
- << Mat->getParent()->getName() << '\n'
- << *Mat << '\n');
- Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
- }
- Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
-
- // Visit constant integer.
- if (isa<ConstantInt>(Opnd)) {
- LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
- if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
- Mat->eraseFromParent();
- LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
- return;
- }
-
- // Visit cast instruction.
- if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
- assert(CastInst->isCast() && "Expected a cast instruction!");
- // Check if we already have visited this cast instruction before to avoid
- // unnecessary cloning.
- Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
- if (!ClonedCastInst) {
- ClonedCastInst = CastInst->clone();
- ClonedCastInst->setOperand(0, Mat);
- ClonedCastInst->insertAfter(CastInst);
- // Use the same debug location as the original cast instruction.
- ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
- LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
- << "To : " << *ClonedCastInst << '\n');
- }
-
- LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
- updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
- LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
- return;
- }
-
- // Visit constant expression.
- if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- if (ConstExpr->isGEPWithNoNotionalOverIndexing()) {
- // Operand is a ConstantGEP, replace it.
- updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat);
- return;
- }
-
- // Aside from constant GEPs, only constant cast expressions are collected.
- assert(ConstExpr->isCast() && "ConstExpr should be a cast");
- Instruction *ConstExprInst = ConstExpr->getAsInstruction();
- ConstExprInst->setOperand(0, Mat);
- ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
- ConstUser.OpndIdx));
-
- // Use the same debug location as the instruction we are about to update.
- ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
- << "From : " << *ConstExpr << '\n');
- LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
- if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
- ConstExprInst->eraseFromParent();
- if (Offset)
- Mat->eraseFromParent();
- }
- LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
- return;
- }
-}
-
-/// Hoist and hide the base constant behind a bitcast and emit
-/// materialization code for derived constants.
-bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
- bool MadeChange = false;
- SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec =
- BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
- for (auto const &ConstInfo : ConstInfoVec) {
- SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo);
- // We can have an empty set if the function contains unreachable blocks.
- if (IPSet.empty())
- continue;
-
- unsigned UsesNum = 0;
- unsigned ReBasesNum = 0;
- unsigned NotRebasedNum = 0;
- for (Instruction *IP : IPSet) {
- // First, collect constants depending on this IP of the base.
- unsigned Uses = 0;
- using RebasedUse = std::tuple<Constant *, Type *, ConstantUser>;
- SmallVector<RebasedUse, 4> ToBeRebased;
- for (auto const &RCI : ConstInfo.RebasedConstants) {
- for (auto const &U : RCI.Uses) {
- Uses++;
- BasicBlock *OrigMatInsertBB =
- findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
- // If Base constant is to be inserted in multiple places,
- // generate rebase for U using the Base dominating U.
- if (IPSet.size() == 1 ||
- DT->dominates(IP->getParent(), OrigMatInsertBB))
- ToBeRebased.push_back(RebasedUse(RCI.Offset, RCI.Ty, U));
- }
- }
- UsesNum = Uses;
-
- // If only a few constants depend on this IP of the base, skip rebasing,
- // assuming the base and the rebased have the same materialization cost.
- if (ToBeRebased.size() < MinNumOfDependentToRebase) {
- NotRebasedNum += ToBeRebased.size();
- continue;
- }
-
- // Emit an instance of the base at this IP.
- Instruction *Base = nullptr;
- // Hoist and hide the base constant behind a bitcast.
- if (ConstInfo.BaseExpr) {
- assert(BaseGV && "A base constant expression must have a base GV");
- Type *Ty = ConstInfo.BaseExpr->getType();
- Base = new BitCastInst(ConstInfo.BaseExpr, Ty, "const", IP);
- } else {
- IntegerType *Ty = ConstInfo.BaseInt->getType();
- Base = new BitCastInst(ConstInfo.BaseInt, Ty, "const", IP);
- }
-
- Base->setDebugLoc(IP->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseInt
- << ") to BB " << IP->getParent()->getName() << '\n'
- << *Base << '\n');
-
- // Emit materialization code for rebased constants depending on this IP.
- for (auto const &R : ToBeRebased) {
- Constant *Off = std::get<0>(R);
- Type *Ty = std::get<1>(R);
- ConstantUser U = std::get<2>(R);
- emitBaseConstants(Base, Off, Ty, U);
- ReBasesNum++;
- // Use the same debug location as the last user of the constant.
- Base->setDebugLoc(DILocation::getMergedLocation(
- Base->getDebugLoc(), U.Inst->getDebugLoc()));
- }
- assert(!Base->use_empty() && "The use list is empty!?");
- assert(isa<Instruction>(Base->user_back()) &&
- "All uses should be instructions.");
- }
- (void)UsesNum;
- (void)ReBasesNum;
- (void)NotRebasedNum;
- // Expect all uses are rebased after rebase is done.
- assert(UsesNum == (ReBasesNum + NotRebasedNum) &&
- "Not all uses are rebased");
-
- NumConstantsHoisted++;
-
- // Base constant is also included in ConstInfo.RebasedConstants, so
- // deduct 1 from ConstInfo.RebasedConstants.size().
- NumConstantsRebased += ConstInfo.RebasedConstants.size() - 1;
-
- MadeChange = true;
- }
- return MadeChange;
-}
-
-/// Check all cast instructions we made a copy of and remove them if they
-/// have no more users.
-void ConstantHoistingPass::deleteDeadCastInst() const {
- for (auto const &I : ClonedCastMap)
- if (I.first->use_empty())
- I.first->eraseFromParent();
-}
-
-/// Optimize expensive integer constants in the given function.
-bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
- DominatorTree &DT, BlockFrequencyInfo *BFI,
- BasicBlock &Entry, ProfileSummaryInfo *PSI) {
- this->TTI = &TTI;
- this->DT = &DT;
- this->BFI = BFI;
- this->DL = &Fn.getParent()->getDataLayout();
- this->Ctx = &Fn.getContext();
- this->Entry = &Entry;
- this->PSI = PSI;
- // Collect all constant candidates.
- collectConstantCandidates(Fn);
-
- // Combine constants that can be easily materialized with an add from a common
- // base constant.
- if (!ConstIntCandVec.empty())
- findBaseConstants(nullptr);
+ ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV];
+ ConstCandMapType::iterator Itr;
+ bool Inserted;
+ ConstPtrUnionType Cand = ConstExpr;
+ std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
+ if (Inserted) {
+ ExprCandVec.push_back(ConstantCandidate(
+ ConstantInt::get(Type::getInt32Ty(*Ctx), Offset.getLimitedValue()),
+ ConstExpr));
+ Itr->second = ExprCandVec.size() - 1;
+ }
+ ExprCandVec[Itr->second].addUser(Inst, Idx, Cost);
+}
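The Offset computed above is simply the byte distance of the constant GEP from its base global. A standalone arithmetic sketch (plain C++; the aggregate type and indices are hypothetical, and no padding is assumed) for a GEP with indices 0, 1, 2 into a global of type { i32, [4 x i32] }:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t Int32Size = 4;  // size of an i32 in bytes
    uint64_t Offset = 0;
    // First index (0): no displacement from the global itself.
    Offset += Int32Size;           // second index (1): field 1 starts after the leading i32
    Offset += 2 * Int32Size;       // third index (2): element 2 of the i32 array
    // Total: 12 bytes. The pass records this offset against the base global
    // and can later rematerialize the address as <Base + 12>.
    std::printf("constant byte offset: %llu\n", (unsigned long long)Offset);
    return 0;
  }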
+
+/// Check the operand for instruction Inst at index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
+
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ // Only visit cast instructions, which have been skipped. All other
+ // instructions should have already been visited.
+ if (!CastInst->isCast())
+ return;
+
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Handle constant gep expressions.
+ if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing())
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr);
+
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ return;
+
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+}
+
+/// Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst) {
+ // Skip all cast instructions. They are visited indirectly later on.
+ if (Inst->isCast())
+ return;
+
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ // The cost of materializing the constants (defined in
+ // `TargetTransformInfo::getIntImmCostInst`) for instructions which only
+ // take constant variables is lower than `TargetTransformInfo::TCC_Basic`.
+ // So it's safe for us to collect constant candidates from all
+ // IntrinsicInsts.
+ if (canReplaceOperandWithVariable(Inst, Idx)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
+ }
+ } // end of for all operands
+}
+
+/// Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
+ ConstCandMapType ConstCandMap;
+ for (BasicBlock &BB : Fn) {
+ // Ignore unreachable basic blocks.
+ if (!DT->isReachableFromEntry(&BB))
+ continue;
+ for (Instruction &Inst : BB)
+ collectConstantCandidates(ConstCandMap, &Inst);
+ }
+}
+
+// This helper function is necessary to deal with values that have different
+// bit widths (APInt Operator- does not like that). If the value cannot be
+// represented in uint64 we return an "empty" APInt. This is then interpreted
+// as the value is not in range.
+static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) {
+ Optional<APInt> Res = None;
+ unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
+ V1.getBitWidth() : V2.getBitWidth();
+ uint64_t LimVal1 = V1.getLimitedValue();
+ uint64_t LimVal2 = V2.getLimitedValue();
+
+ if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
+ return Res;
+
+ uint64_t Diff = LimVal1 - LimVal2;
+ return APInt(BW, Diff, true);
+}
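As a hypothetical usage of the helper above (it is file-static, so the caller below is imagined to live in this same translation unit; APInt.h, Optional.h and raw_ostream.h are already included):

  // Illustrative only: shows the widening to the larger bit width and the
  // None sentinel for values that do not fit in 64 bits.
  static void offsetDiffExample() {
    llvm::Optional<llvm::APInt> D =
        calculateOffsetDiff(llvm::APInt(32, 42), llvm::APInt(64, 12));
    if (D)
      llvm::errs() << "diff = " << *D << "\n"; // prints: diff = 30
    // If either input saturates getLimitedValue() (i.e. needs more than 64
    // bits), None comes back and the caller treats the pair as out of range.
  }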
+
+// From a list of constants, one needs to be picked as the base, and the other
+// constants will be transformed into offsets from that base constant. The
+// question is which one is the best pick. For example, consider these constants
+// and their number of uses:
+//
+// Constants| 2 | 4 | 12 | 42 |
+// NumUses | 3 | 2 | 8 | 7 |
+//
+// Selecting constant 12 because it has the most uses will generate negative
+// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
+// offsets lead to less optimal code generation, then there might be better
+// solutions. Suppose immediates in the range of 0..35 are most optimally
+// supported by the architecture, then selecting constant 2 is most optimal
+// because this will generate offsets: 0, 2, 10, 40. Offsets 0, 2 and 10 are in
+// range 0..35, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12 would
+// have only 8 uses in range, so choosing 2 as a base is more optimal. Thus, in
+// selecting the base constant the range of the offsets is a very important
+// factor, too, that we take into account here. This algorithm calculates a
+// total cost for selecting a constant as the base and subtracts the costs of
+// immediates that are out of range. It has quadratic complexity, so we only
+// call this function when we're optimising for size and there are fewer than
+// 100 constants; otherwise we fall back to the straightforward algorithm,
+// which does not do all the offset calculations.
+unsigned
+ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
+ ConstCandVecType::iterator E,
+ ConstCandVecType::iterator &MaxCostItr) {
+ unsigned NumUses = 0;
+
+ bool OptForSize = Entry->getParent()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ if (!OptForSize || std::distance(S,E) > 100) {
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ NumUses += ConstCand->Uses.size();
+ if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+ MaxCostItr = ConstCand;
+ }
+ return NumUses;
+ }
+
+ LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ int MaxCost = -1;
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ auto Value = ConstCand->ConstInt->getValue();
+ Type *Ty = ConstCand->ConstInt->getType();
+ int Cost = 0;
+ NumUses += ConstCand->Uses.size();
+ LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
+ << "\n");
+
+ for (auto User : ConstCand->Uses) {
+ unsigned Opcode = User.Inst->getOpcode();
+ unsigned OpndIdx = User.OpndIdx;
+ Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty,
+ TargetTransformInfo::TCK_SizeAndLatency);
+ LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
+
+ for (auto C2 = S; C2 != E; ++C2) {
+ Optional<APInt> Diff = calculateOffsetDiff(
+ C2->ConstInt->getValue(),
+ ConstCand->ConstInt->getValue());
+ if (Diff) {
+ const int ImmCosts =
+ TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
+ Cost -= ImmCosts;
+ LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ if (Cost > MaxCost) {
+ MaxCost = Cost;
+ MaxCostItr = ConstCand;
+ LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
+ }
+ }
+ return NumUses;
+}
+
+/// Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoistingPass::findAndMakeBaseConstant(
+ ConstCandVecType::iterator S, ConstCandVecType::iterator E,
+ SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec) {
+ auto MaxCostItr = S;
+ unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr);
+
+ // Don't hoist constants that have only one use.
+ if (NumUses <= 1)
+ return;
+
+ ConstantInt *ConstInt = MaxCostItr->ConstInt;
+ ConstantExpr *ConstExpr = MaxCostItr->ConstExpr;
+ ConstantInfo ConstInfo;
+ ConstInfo.BaseInt = ConstInt;
+ ConstInfo.BaseExpr = ConstExpr;
+ Type *Ty = ConstInt->getType();
+
+ // Rebase the constants with respect to the base constant.
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ APInt Diff = ConstCand->ConstInt->getValue() - ConstInt->getValue();
+ Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
+ Type *ConstTy =
+ ConstCand->ConstExpr ? ConstCand->ConstExpr->getType() : nullptr;
+ ConstInfo.RebasedConstants.push_back(
+ RebasedConstantInfo(std::move(ConstCand->Uses), Offset, ConstTy));
+ }
+ ConstInfoVec.push_back(std::move(ConstInfo));
+}
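Numerically, the rebasing loop above only records a difference against the chosen base. A standalone sketch of that bookkeeping (plain C++; the constants are made up):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main() {
    // One expensive base plus two dependents that are a small add away.
    const uint64_t Base = 0xDEADBEEF00000000ULL;
    const std::vector<uint64_t> Constants = {0xDEADBEEF00000000ULL,
                                             0xDEADBEEF00000008ULL,
                                             0xDEADBEEF00000010ULL};
    for (uint64_t C : Constants) {
      uint64_t Offset = C - Base; // 0 means "this use gets the base itself"
      std::printf("0x%016llx = base + %llu\n", (unsigned long long)C,
                  (unsigned long long)Offset);
    }
    return 0;
  }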
+
+/// Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoistingPass::findBaseConstants(GlobalVariable *BaseGV) {
+ // If BaseGV is nullptr, find base among candidate constant integers;
+ // Otherwise find base among constant GEPs that share the same BaseGV.
+ ConstCandVecType &ConstCandVec = BaseGV ?
+ ConstGEPCandMap[BaseGV] : ConstIntCandVec;
+ ConstInfoVecType &ConstInfoVec = BaseGV ?
+ ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
+
+ // Sort the constants by value and type. This invalidates the mapping!
+ llvm::stable_sort(ConstCandVec, [](const ConstantCandidate &LHS,
+ const ConstantCandidate &RHS) {
+ if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+ return LHS.ConstInt->getType()->getBitWidth() <
+ RHS.ConstInt->getType()->getBitWidth();
+ return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
+ });
+
+ // Simple linear scan through the sorted constant candidate vector for viable
+ // merge candidates.
+ auto MinValItr = ConstCandVec.begin();
+ for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+ CC != E; ++CC) {
+ if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+ Type *MemUseValTy = nullptr;
+ for (auto &U : CC->Uses) {
+ auto *UI = U.Inst;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
+ MemUseValTy = LI->getType();
+ break;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ // Make sure the constant is used as pointer operand of the StoreInst.
+ if (SI->getPointerOperand() == SI->getOperand(U.OpndIdx)) {
+ MemUseValTy = SI->getValueOperand()->getType();
+ break;
+ }
+ }
+ }
+
+ // Check if the constant is in range of an add with immediate.
+ APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+ if ((Diff.getBitWidth() <= 64) &&
+ TTI->isLegalAddImmediate(Diff.getSExtValue()) &&
+ // Check if Diff can be used as offset in addressing mode of the user
+ // memory instruction.
+ (!MemUseValTy || TTI->isLegalAddressingMode(MemUseValTy,
+ /*BaseGV*/nullptr, /*BaseOffset*/Diff.getSExtValue(),
+ /*HasBaseReg*/true, /*Scale*/0)))
+ continue;
+ }
+ // We now either have a different constant type or the constant is no longer
+ // in range of an add with immediate.
+ findAndMakeBaseConstant(MinValItr, CC, ConstInfoVec);
+ // Start a new base constant search.
+ MinValItr = CC;
+ }
+ // Finalize the last base constant search.
+ findAndMakeBaseConstant(MinValItr, ConstCandVec.end(), ConstInfoVec);
+}
+
+/// Updates the operand at Idx in instruction Inst with the result of
+/// instruction Mat. If the instruction is a PHI node then special
+/// handling for duplicate values from the same incoming basic block is
+/// required.
+/// \return The update will always succeed, but the return value indicates
+/// whether Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+ if (auto PHI = dyn_cast<PHINode>(Inst)) {
+ // Check if any previous operand of the PHI node has the same incoming basic
+ // block. This is a very odd case that happens when the incoming basic block
+ // has a switch statement. In this case use the same value as the previous
+ // operand(s), otherwise we will fail verification due to different values.
+ // The values are actually the same, but the variable names are different
+ // and the verifier doesn't like that.
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+ for (unsigned i = 0; i < Idx; ++i) {
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ Value *IncomingVal = PHI->getIncomingValue(i);
+ Inst->setOperand(Idx, IncomingVal);
+ return false;
+ }
+ }
+ }
+
+ Inst->setOperand(Idx, Mat);
+ return true;
+}
+
+/// Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
+ Constant *Offset,
+ Type *Ty,
+ const ConstantUser &ConstUser) {
+ Instruction *Mat = Base;
+
+ // The same offset can be dereferenced to different types in nested struct.
+ if (!Offset && Ty && Ty != Base->getType())
+ Offset = ConstantInt::get(Type::getInt32Ty(*Ctx), 0);
+
+ if (Offset) {
+ Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx);
+ if (Ty) {
+ // Constant being rebased is a ConstantExpr.
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx,
+ cast<PointerType>(Ty)->getAddressSpace());
+ Base = new BitCastInst(Base, Int8PtrTy, "base_bitcast", InsertionPt);
+ Mat = GetElementPtrInst::Create(Int8PtrTy->getElementType(), Base,
+ Offset, "mat_gep", InsertionPt);
+ Mat = new BitCastInst(Mat, Ty, "mat_bitcast", InsertionPt);
+ } else
+ // Constant being rebased is a ConstantInt.
+ Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+ "const_mat", InsertionPt);
+
+ LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n'
+ << *Mat << '\n');
+ Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+ }
+ Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+ // Visit constant integer.
+ if (isa<ConstantInt>(Opnd)) {
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+ Mat->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit cast instruction.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ assert(CastInst->isCast() && "Expected a cast instruction!");
+ // Check if we already have visited this cast instruction before to avoid
+ // unnecessary cloning.
+ Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
+ if (!ClonedCastInst) {
+ ClonedCastInst = CastInst->clone();
+ ClonedCastInst->setOperand(0, Mat);
+ ClonedCastInst->insertAfter(CastInst);
+ // Use the same debug location as the original cast instruction.
+ ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+ LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
+ }
+
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit constant expression.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ if (ConstExpr->isGEPWithNoNotionalOverIndexing()) {
+ // Operand is a ConstantGEP, replace it.
+ updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat);
+ return;
+ }
+
+ // Aside from constant GEPs, only constant cast expressions are collected.
+ assert(ConstExpr->isCast() && "ConstExpr should be a cast");
+ Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+ ConstExprInst->setOperand(0, Mat);
+ ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx));
+
+ // Use the same debug location as the instruction we are about to update.
+ ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
+ ConstExprInst->eraseFromParent();
+ if (Offset)
+ Mat->eraseFromParent();
+ }
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+}
+
+/// Hoist and hide the base constant behind a bitcast and emit
+/// materialization code for derived constants.
+bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
+ bool MadeChange = false;
+ SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec =
+ BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
+ for (auto const &ConstInfo : ConstInfoVec) {
+ SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo);
+ // We can have an empty set if the function contains unreachable blocks.
+ if (IPSet.empty())
+ continue;
+
+ unsigned UsesNum = 0;
+ unsigned ReBasesNum = 0;
+ unsigned NotRebasedNum = 0;
+ for (Instruction *IP : IPSet) {
+ // First, collect constants depending on this IP of the base.
+ unsigned Uses = 0;
+ using RebasedUse = std::tuple<Constant *, Type *, ConstantUser>;
+ SmallVector<RebasedUse, 4> ToBeRebased;
+ for (auto const &RCI : ConstInfo.RebasedConstants) {
+ for (auto const &U : RCI.Uses) {
+ Uses++;
+ BasicBlock *OrigMatInsertBB =
+ findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
+ // If Base constant is to be inserted in multiple places,
+ // generate rebase for U using the Base dominating U.
+ if (IPSet.size() == 1 ||
+ DT->dominates(IP->getParent(), OrigMatInsertBB))
+ ToBeRebased.push_back(RebasedUse(RCI.Offset, RCI.Ty, U));
+ }
+ }
+ UsesNum = Uses;
+
+ // If only a few constants depend on this IP of the base, skip rebasing,
+ // assuming the base and the rebased have the same materialization cost.
+ if (ToBeRebased.size() < MinNumOfDependentToRebase) {
+ NotRebasedNum += ToBeRebased.size();
+ continue;
+ }
+
+ // Emit an instance of the base at this IP.
+ Instruction *Base = nullptr;
+ // Hoist and hide the base constant behind a bitcast.
+ if (ConstInfo.BaseExpr) {
+ assert(BaseGV && "A base constant expression must have a base GV");
+ Type *Ty = ConstInfo.BaseExpr->getType();
+ Base = new BitCastInst(ConstInfo.BaseExpr, Ty, "const", IP);
+ } else {
+ IntegerType *Ty = ConstInfo.BaseInt->getType();
+ Base = new BitCastInst(ConstInfo.BaseInt, Ty, "const", IP);
+ }
+
+ Base->setDebugLoc(IP->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseInt
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
+
+ // Emit materialization code for rebased constants depending on this IP.
+ for (auto const &R : ToBeRebased) {
+ Constant *Off = std::get<0>(R);
+ Type *Ty = std::get<1>(R);
+ ConstantUser U = std::get<2>(R);
+ emitBaseConstants(Base, Off, Ty, U);
+ ReBasesNum++;
+ // Use the same debug location as the last user of the constant.
+ Base->setDebugLoc(DILocation::getMergedLocation(
+ Base->getDebugLoc(), U.Inst->getDebugLoc()));
+ }
+ assert(!Base->use_empty() && "The use list is empty!?");
+ assert(isa<Instruction>(Base->user_back()) &&
+ "All uses should be instructions.");
+ }
+ (void)UsesNum;
+ (void)ReBasesNum;
+ (void)NotRebasedNum;
+ // Expect all uses are rebased after rebase is done.
+ assert(UsesNum == (ReBasesNum + NotRebasedNum) &&
+ "Not all uses are rebased");
+
+ NumConstantsHoisted++;
+
+ // Base constant is also included in ConstInfo.RebasedConstants, so
+ // deduct 1 from ConstInfo.RebasedConstants.size().
+ NumConstantsRebased += ConstInfo.RebasedConstants.size() - 1;
+
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+/// Check all cast instructions we made a copy of and remove them if they
+/// have no more users.
+void ConstantHoistingPass::deleteDeadCastInst() const {
+ for (auto const &I : ClonedCastMap)
+ if (I.first->use_empty())
+ I.first->eraseFromParent();
+}
+
+/// Optimize expensive integer constants in the given function.
+bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
+ DominatorTree &DT, BlockFrequencyInfo *BFI,
+ BasicBlock &Entry, ProfileSummaryInfo *PSI) {
+ this->TTI = &TTI;
+ this->DT = &DT;
+ this->BFI = BFI;
+ this->DL = &Fn.getParent()->getDataLayout();
+ this->Ctx = &Fn.getContext();
+ this->Entry = &Entry;
+ this->PSI = PSI;
+ // Collect all constant candidates.
+ collectConstantCandidates(Fn);
+
+ // Combine constants that can be easily materialized with an add from a common
+ // base constant.
+ if (!ConstIntCandVec.empty())
+ findBaseConstants(nullptr);
for (const auto &MapEntry : ConstGEPCandMap)
- if (!MapEntry.second.empty())
- findBaseConstants(MapEntry.first);
-
- // Finally hoist the base constant and emit materialization code for dependent
- // constants.
- bool MadeChange = false;
- if (!ConstIntInfoVec.empty())
- MadeChange = emitBaseConstants(nullptr);
+ if (!MapEntry.second.empty())
+ findBaseConstants(MapEntry.first);
+
+ // Finally hoist the base constant and emit materialization code for dependent
+ // constants.
+ bool MadeChange = false;
+ if (!ConstIntInfoVec.empty())
+ MadeChange = emitBaseConstants(nullptr);
for (const auto &MapEntry : ConstGEPInfoMap)
- if (!MapEntry.second.empty())
- MadeChange |= emitBaseConstants(MapEntry.first);
-
-
- // Cleanup dead instructions.
- deleteDeadCastInst();
-
- cleanup();
-
- return MadeChange;
-}
-
-PreservedAnalyses ConstantHoistingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto BFI = ConstHoistWithBlockFrequency
- ? &AM.getResult<BlockFrequencyAnalysis>(F)
- : nullptr;
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+ if (!MapEntry.second.empty())
+ MadeChange |= emitBaseConstants(MapEntry.first);
+
+
+ // Cleanup dead instructions.
+ deleteDeadCastInst();
+
+ cleanup();
+
+ return MadeChange;
+}
+
+PreservedAnalyses ConstantHoistingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto BFI = ConstHoistWithBlockFrequency
+ ? &AM.getResult<BlockFrequencyAnalysis>(F)
+ : nullptr;
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 432c7efe57..b671d68031 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -1,563 +1,563 @@
-//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Correlated Value Propagation pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "correlated-value-propagation"
-
-STATISTIC(NumPhis, "Number of phis propagated");
-STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
-STATISTIC(NumSelects, "Number of selects propagated");
-STATISTIC(NumMemAccess, "Number of memory access targets propagated");
-STATISTIC(NumCmps, "Number of comparisons propagated");
-STATISTIC(NumReturns, "Number of return values propagated");
-STATISTIC(NumDeadCases, "Number of switch cases removed");
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "correlated-value-propagation"
+
+STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
+STATISTIC(NumSelects, "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps, "Number of comparisons propagated");
+STATISTIC(NumReturns, "Number of return values propagated");
+STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivSRemsNarrowed,
"Number of sdivs/srems whose width was decreased");
-STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
STATISTIC(NumUDivURemsNarrowed,
"Number of udivs/urems whose width was decreased");
-STATISTIC(NumAShrs, "Number of ashr converted to lshr");
-STATISTIC(NumSRems, "Number of srem converted to urem");
-STATISTIC(NumSExt, "Number of sext converted to zext");
-STATISTIC(NumAnd, "Number of ands removed");
-STATISTIC(NumNW, "Number of no-wrap deductions");
-STATISTIC(NumNSW, "Number of no-signed-wrap deductions");
-STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions");
-STATISTIC(NumAddNW, "Number of no-wrap deductions for add");
-STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add");
-STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add");
-STATISTIC(NumSubNW, "Number of no-wrap deductions for sub");
-STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub");
-STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub");
-STATISTIC(NumMulNW, "Number of no-wrap deductions for mul");
-STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul");
-STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul");
-STATISTIC(NumShlNW, "Number of no-wrap deductions for shl");
-STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl");
-STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl");
-STATISTIC(NumOverflows, "Number of overflow checks removed");
-STATISTIC(NumSaturating,
- "Number of saturating arithmetics converted to normal arithmetics");
-
-static cl::opt<bool> DontAddNoWrapFlags("cvp-dont-add-nowrap-flags", cl::init(false));
-
-namespace {
-
- class CorrelatedValuePropagation : public FunctionPass {
- public:
- static char ID;
-
- CorrelatedValuePropagation(): FunctionPass(ID) {
- initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LazyValueInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LazyValueInfoWrapperPass>();
- }
- };
-
-} // end anonymous namespace
-
-char CorrelatedValuePropagation::ID = 0;
-
-INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
- "Value Propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
-INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
- "Value Propagation", false, false)
-
-// Public interface to the Value Propagation pass
-Pass *llvm::createCorrelatedValuePropagationPass() {
- return new CorrelatedValuePropagation();
-}
-
-static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
- if (S->getType()->isVectorTy()) return false;
- if (isa<Constant>(S->getCondition())) return false;
-
+STATISTIC(NumAShrs, "Number of ashr converted to lshr");
+STATISTIC(NumSRems, "Number of srem converted to urem");
+STATISTIC(NumSExt, "Number of sext converted to zext");
+STATISTIC(NumAnd, "Number of ands removed");
+STATISTIC(NumNW, "Number of no-wrap deductions");
+STATISTIC(NumNSW, "Number of no-signed-wrap deductions");
+STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions");
+STATISTIC(NumAddNW, "Number of no-wrap deductions for add");
+STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add");
+STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add");
+STATISTIC(NumSubNW, "Number of no-wrap deductions for sub");
+STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub");
+STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub");
+STATISTIC(NumMulNW, "Number of no-wrap deductions for mul");
+STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul");
+STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul");
+STATISTIC(NumShlNW, "Number of no-wrap deductions for shl");
+STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl");
+STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl");
+STATISTIC(NumOverflows, "Number of overflow checks removed");
+STATISTIC(NumSaturating,
+ "Number of saturating arithmetics converted to normal arithmetics");
+
+static cl::opt<bool> DontAddNoWrapFlags("cvp-dont-add-nowrap-flags", cl::init(false));
+
+namespace {
+
+ class CorrelatedValuePropagation : public FunctionPass {
+ public:
+ static char ID;
+
+ CorrelatedValuePropagation(): FunctionPass(ID) {
+ initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
+ }
+ };
+
+} // end anonymous namespace
+
+char CorrelatedValuePropagation::ID = 0;
+
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+
+// Public interface to the Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+ return new CorrelatedValuePropagation();
+}
+
+static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
+ if (S->getType()->isVectorTy()) return false;
+ if (isa<Constant>(S->getCondition())) return false;
+
Constant *C = LVI->getConstant(S->getCondition(), S);
- if (!C) return false;
-
- ConstantInt *CI = dyn_cast<ConstantInt>(C);
- if (!CI) return false;
-
- Value *ReplaceWith = CI->isOne() ? S->getTrueValue() : S->getFalseValue();
- S->replaceAllUsesWith(ReplaceWith);
- S->eraseFromParent();
-
- ++NumSelects;
-
- return true;
-}
-
-/// Try to simplify a phi with constant incoming values that match the edge
-/// values of a non-constant value on all other edges:
-/// bb0:
-/// %isnull = icmp eq i8* %x, null
-/// br i1 %isnull, label %bb2, label %bb1
-/// bb1:
-/// br label %bb2
-/// bb2:
-/// %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
-/// -->
-/// %r = %x
-static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
- DominatorTree *DT) {
- // Collect incoming constants and initialize possible common value.
- SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
- Value *CommonValue = nullptr;
- for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = P->getIncomingValue(i);
- if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
- IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
- } else if (!CommonValue) {
- // The potential common value is initialized to the first non-constant.
- CommonValue = Incoming;
- } else if (Incoming != CommonValue) {
- // There can be only one non-constant common value.
- return false;
- }
- }
-
- if (!CommonValue || IncomingConstants.empty())
- return false;
-
- // The common value must be valid in all incoming blocks.
- BasicBlock *ToBB = P->getParent();
- if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
- if (!DT->dominates(CommonInst, ToBB))
- return false;
-
- // We have a phi with exactly 1 variable incoming value and 1 or more constant
- // incoming values. See if all constant incoming values can be mapped back to
- // the same incoming variable value.
- for (auto &IncomingConstant : IncomingConstants) {
- Constant *C = IncomingConstant.first;
- BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
- if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
- return false;
- }
-
- // All constant incoming values map to the same variable along the incoming
- // edges of the phi. The phi is unnecessary. However, we must drop all
- // poison-generating flags to ensure that no poison is propagated to the phi
- // location by performing this substitution.
- // Warning: If the underlying analysis changes, this may not be enough to
- // guarantee that poison is not propagated.
- // TODO: We may be able to re-infer flags by re-analyzing the instruction.
- if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
- CommonInst->dropPoisonGeneratingFlags();
- P->replaceAllUsesWith(CommonValue);
- P->eraseFromParent();
- ++NumPhiCommon;
- return true;
-}
-
-static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
- const SimplifyQuery &SQ) {
- bool Changed = false;
-
- BasicBlock *BB = P->getParent();
- for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
- Value *Incoming = P->getIncomingValue(i);
- if (isa<Constant>(Incoming)) continue;
-
- Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
-
- // Look if the incoming value is a select with a scalar condition for which
-    // LVI can tell us the value. In that case replace the incoming value with
- // the appropriate value of the select. This often allows us to remove the
- // select later.
- if (!V) {
- SelectInst *SI = dyn_cast<SelectInst>(Incoming);
- if (!SI) continue;
-
- Value *Condition = SI->getCondition();
- if (!Condition->getType()->isVectorTy()) {
- if (Constant *C = LVI->getConstantOnEdge(
- Condition, P->getIncomingBlock(i), BB, P)) {
- if (C->isOneValue()) {
- V = SI->getTrueValue();
- } else if (C->isZeroValue()) {
- V = SI->getFalseValue();
- }
- // Once LVI learns to handle vector types, we could also add support
- // for vector type constants that are not all zeroes or all ones.
- }
- }
-
- // Look if the select has a constant but LVI tells us that the incoming
- // value can never be that constant. In that case replace the incoming
- // value with the other value of the select. This often allows us to
- // remove the select later.
- if (!V) {
- Constant *C = dyn_cast<Constant>(SI->getFalseValue());
- if (!C) continue;
-
- if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
- P->getIncomingBlock(i), BB, P) !=
- LazyValueInfo::False)
- continue;
- V = SI->getTrueValue();
- }
-
- LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
- }
-
- P->setIncomingValue(i, V);
- Changed = true;
- }
-
- if (Value *V = SimplifyInstruction(P, SQ)) {
- P->replaceAllUsesWith(V);
- P->eraseFromParent();
- Changed = true;
- }
-
- if (!Changed)
- Changed = simplifyCommonValuePhi(P, LVI, DT);
-
- if (Changed)
- ++NumPhis;
-
- return Changed;
-}
-
-static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
- Value *Pointer = nullptr;
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- Pointer = L->getPointerOperand();
- else
- Pointer = cast<StoreInst>(I)->getPointerOperand();
-
- if (isa<Constant>(Pointer)) return false;
-
+ if (!C) return false;
+
+ ConstantInt *CI = dyn_cast<ConstantInt>(C);
+ if (!CI) return false;
+
+ Value *ReplaceWith = CI->isOne() ? S->getTrueValue() : S->getFalseValue();
+ S->replaceAllUsesWith(ReplaceWith);
+ S->eraseFromParent();
+
+ ++NumSelects;
+
+ return true;
+}
+
+/// Try to simplify a phi with constant incoming values that match the edge
+/// values of a non-constant value on all other edges:
+/// bb0:
+/// %isnull = icmp eq i8* %x, null
+/// br i1 %isnull, label %bb2, label %bb1
+/// bb1:
+/// br label %bb2
+/// bb2:
+/// %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
+/// -->
+/// %r = %x
+static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
+ DominatorTree *DT) {
+ // Collect incoming constants and initialize possible common value.
+ SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
+ Value *CommonValue = nullptr;
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
+ IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
+ } else if (!CommonValue) {
+ // The potential common value is initialized to the first non-constant.
+ CommonValue = Incoming;
+ } else if (Incoming != CommonValue) {
+ // There can be only one non-constant common value.
+ return false;
+ }
+ }
+
+ if (!CommonValue || IncomingConstants.empty())
+ return false;
+
+ // The common value must be valid in all incoming blocks.
+ BasicBlock *ToBB = P->getParent();
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+ if (!DT->dominates(CommonInst, ToBB))
+ return false;
+
+ // We have a phi with exactly 1 variable incoming value and 1 or more constant
+ // incoming values. See if all constant incoming values can be mapped back to
+ // the same incoming variable value.
+ for (auto &IncomingConstant : IncomingConstants) {
+ Constant *C = IncomingConstant.first;
+ BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
+ if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
+ return false;
+ }
+
+ // All constant incoming values map to the same variable along the incoming
+ // edges of the phi. The phi is unnecessary. However, we must drop all
+ // poison-generating flags to ensure that no poison is propagated to the phi
+ // location by performing this substitution.
+ // Warning: If the underlying analysis changes, this may not be enough to
+ // guarantee that poison is not propagated.
+ // TODO: We may be able to re-infer flags by re-analyzing the instruction.
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+ CommonInst->dropPoisonGeneratingFlags();
+ P->replaceAllUsesWith(CommonValue);
+ P->eraseFromParent();
+ ++NumPhiCommon;
+ return true;
+}
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
+ const SimplifyQuery &SQ) {
+ bool Changed = false;
+
+ BasicBlock *BB = P->getParent();
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (isa<Constant>(Incoming)) continue;
+
+ Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
+
+ // Look if the incoming value is a select with a scalar condition for which
+    // LVI can tell us the value. In that case replace the incoming value with
+ // the appropriate value of the select. This often allows us to remove the
+ // select later.
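+    // For illustration (hypothetical IR): with
+    //   %inc = select i1 %c, i32 %t, i32 %f
+    // feeding this phi, if LVI proves %c is true on the incoming edge, the
+    // phi's incoming value is replaced with %t.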
+ if (!V) {
+ SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+ if (!SI) continue;
+
+ Value *Condition = SI->getCondition();
+ if (!Condition->getType()->isVectorTy()) {
+ if (Constant *C = LVI->getConstantOnEdge(
+ Condition, P->getIncomingBlock(i), BB, P)) {
+ if (C->isOneValue()) {
+ V = SI->getTrueValue();
+ } else if (C->isZeroValue()) {
+ V = SI->getFalseValue();
+ }
+ // Once LVI learns to handle vector types, we could also add support
+ // for vector type constants that are not all zeroes or all ones.
+ }
+ }
+
+ // Look if the select has a constant but LVI tells us that the incoming
+ // value can never be that constant. In that case replace the incoming
+ // value with the other value of the select. This often allows us to
+ // remove the select later.
+ if (!V) {
+ Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+ if (!C) continue;
+
+ if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+ P->getIncomingBlock(i), BB, P) !=
+ LazyValueInfo::False)
+ continue;
+ V = SI->getTrueValue();
+ }
+
+ LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ }
+
+ P->setIncomingValue(i, V);
+ Changed = true;
+ }
+
+ if (Value *V = SimplifyInstruction(P, SQ)) {
+ P->replaceAllUsesWith(V);
+ P->eraseFromParent();
+ Changed = true;
+ }
+
+ if (!Changed)
+ Changed = simplifyCommonValuePhi(P, LVI, DT);
+
+ if (Changed)
+ ++NumPhis;
+
+ return Changed;
+}
+
+static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
+ Value *Pointer = nullptr;
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ Pointer = L->getPointerOperand();
+ else
+ Pointer = cast<StoreInst>(I)->getPointerOperand();
+
+ if (isa<Constant>(Pointer)) return false;
+
Constant *C = LVI->getConstant(Pointer, I);
- if (!C) return false;
-
- ++NumMemAccess;
- I->replaceUsesOfWith(Pointer, C);
- return true;
-}
-
-/// See if LazyValueInfo's ability to exploit edge conditions or range
-/// information is sufficient to prove this comparison. Even for local
-/// conditions, this can sometimes prove conditions instcombine can't by
-/// exploiting range information.
-static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
- Value *Op0 = Cmp->getOperand(0);
- auto *C = dyn_cast<Constant>(Cmp->getOperand(1));
- if (!C)
- return false;
-
- LazyValueInfo::Tristate Result =
+ if (!C) return false;
+
+ ++NumMemAccess;
+ I->replaceUsesOfWith(Pointer, C);
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove this comparison. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
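+/// For illustration (hypothetical IR): if a dominating branch established
+/// that %x lies in [0, 8) in this block, then
+///   %c = icmp ult i32 %x, 10
+/// is proven true by the range and is replaced with the constant i1 true.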
+static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
+ Value *Op0 = Cmp->getOperand(0);
+ auto *C = dyn_cast<Constant>(Cmp->getOperand(1));
+ if (!C)
+ return false;
+
+ LazyValueInfo::Tristate Result =
LVI->getPredicateAt(Cmp->getPredicate(), Op0, C, Cmp,
/*UseBlockValue=*/true);
- if (Result == LazyValueInfo::Unknown)
- return false;
-
- ++NumCmps;
- Constant *TorF = ConstantInt::get(Type::getInt1Ty(Cmp->getContext()), Result);
- Cmp->replaceAllUsesWith(TorF);
- Cmp->eraseFromParent();
- return true;
-}
-
-/// Simplify a switch instruction by removing cases which can never fire. If the
-/// uselessness of a case could be determined locally then constant propagation
-/// would already have figured it out. Instead, walk the predecessors and
-/// statically evaluate cases based on information available on that edge. Cases
-/// that cannot fire no matter what the incoming edge can safely be removed. If
-/// a case fires on every incoming edge then the entire switch can be removed
-/// and replaced with a branch to the case destination.
-static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
- DominatorTree *DT) {
- DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
- Value *Cond = I->getCondition();
- BasicBlock *BB = I->getParent();
-
- // Analyse each switch case in turn.
- bool Changed = false;
- DenseMap<BasicBlock*, int> SuccessorsCount;
- for (auto *Succ : successors(BB))
- SuccessorsCount[Succ]++;
-
- { // Scope for SwitchInstProfUpdateWrapper. It must not live during
- // ConstantFoldTerminator() as the underlying SwitchInst can be changed.
- SwitchInstProfUpdateWrapper SI(*I);
-
- for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
- ConstantInt *Case = CI->getCaseValue();
+ if (Result == LazyValueInfo::Unknown)
+ return false;
+
+ ++NumCmps;
+ Constant *TorF = ConstantInt::get(Type::getInt1Ty(Cmp->getContext()), Result);
+ Cmp->replaceAllUsesWith(TorF);
+ Cmp->eraseFromParent();
+ return true;
+}
+
+/// Simplify a switch instruction by removing cases which can never fire. If the
+/// uselessness of a case could be determined locally then constant propagation
+/// would already have figured it out. Instead, walk the predecessors and
+/// statically evaluate cases based on information available on that edge. Cases
+/// that cannot fire no matter what the incoming edge can safely be removed. If
+/// a case fires on every incoming edge then the entire switch can be removed
+/// and replaced with a branch to the case destination.
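+/// For illustration (hypothetical IR): if every edge into the switch block
+/// proves %x != 2, then
+///   switch i32 %x, label %default [ i32 1, label %a
+///                                   i32 2, label %b ]
+/// has its `i32 2` case removed; if the edges instead prove %x == 1, the
+/// whole switch is folded into an unconditional branch to %a.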
+static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
+ DominatorTree *DT) {
+ DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ Value *Cond = I->getCondition();
+ BasicBlock *BB = I->getParent();
+
+ // Analyse each switch case in turn.
+ bool Changed = false;
+ DenseMap<BasicBlock*, int> SuccessorsCount;
+ for (auto *Succ : successors(BB))
+ SuccessorsCount[Succ]++;
+
+ { // Scope for SwitchInstProfUpdateWrapper. It must not live during
+ // ConstantFoldTerminator() as the underlying SwitchInst can be changed.
+ SwitchInstProfUpdateWrapper SI(*I);
+
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
LazyValueInfo::Tristate State =
LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I,
/* UseBlockValue */ true);
-
- if (State == LazyValueInfo::False) {
- // This case never fires - remove it.
- BasicBlock *Succ = CI->getCaseSuccessor();
- Succ->removePredecessor(BB);
- CI = SI.removeCase(CI);
- CE = SI->case_end();
-
- // The condition can be modified by removePredecessor's PHI simplification
- // logic.
- Cond = SI->getCondition();
-
- ++NumDeadCases;
- Changed = true;
- if (--SuccessorsCount[Succ] == 0)
- DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}});
- continue;
- }
- if (State == LazyValueInfo::True) {
- // This case always fires. Arrange for the switch to be turned into an
- // unconditional branch by replacing the switch condition with the case
- // value.
- SI->setCondition(Case);
- NumDeadCases += SI->getNumCases();
- Changed = true;
- break;
- }
-
- // Increment the case iterator since we didn't delete it.
- ++CI;
- }
- }
-
- if (Changed)
- // If the switch has been simplified to the point where it can be replaced
- // by a branch then do so now.
- ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
- /*TLI = */ nullptr, &DTU);
- return Changed;
-}
-
-// See if we can prove that the given binary op intrinsic will not overflow.
-static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) {
+
+ if (State == LazyValueInfo::False) {
+ // This case never fires - remove it.
+ BasicBlock *Succ = CI->getCaseSuccessor();
+ Succ->removePredecessor(BB);
+ CI = SI.removeCase(CI);
+ CE = SI->case_end();
+
+ // The condition can be modified by removePredecessor's PHI simplification
+ // logic.
+ Cond = SI->getCondition();
+
+ ++NumDeadCases;
+ Changed = true;
+ if (--SuccessorsCount[Succ] == 0)
+ DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}});
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
+ // This case always fires. Arrange for the switch to be turned into an
+ // unconditional branch by replacing the switch condition with the case
+ // value.
+ SI->setCondition(Case);
+ NumDeadCases += SI->getNumCases();
+ Changed = true;
+ break;
+ }
+
+ // Increment the case iterator since we didn't delete it.
+ ++CI;
+ }
+ }
+
+ if (Changed)
+ // If the switch has been simplified to the point where it can be replaced
+ // by a branch then do so now.
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
+ /*TLI = */ nullptr, &DTU);
+ return Changed;
+}
+
+// See if we can prove that the given binary op intrinsic will not overflow.
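+// For illustration (hypothetical ranges): for llvm.uadd.with.overflow.i8
+// with LHS range [0, 100) and RHS range [0, 100), the guaranteed no-wrap
+// region for an unsigned add with that RHS is [0, 157), which contains the
+// LHS range, so the intrinsic cannot overflow.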
+static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) {
ConstantRange LRange = LVI->getConstantRange(BO->getLHS(), BO);
ConstantRange RRange = LVI->getConstantRange(BO->getRHS(), BO);
- ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
- BO->getBinaryOp(), RRange, BO->getNoWrapKind());
- return NWRegion.contains(LRange);
-}
-
-static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode,
- bool NewNSW, bool NewNUW) {
- Statistic *OpcNW, *OpcNSW, *OpcNUW;
- switch (Opcode) {
- case Instruction::Add:
- OpcNW = &NumAddNW;
- OpcNSW = &NumAddNSW;
- OpcNUW = &NumAddNUW;
- break;
- case Instruction::Sub:
- OpcNW = &NumSubNW;
- OpcNSW = &NumSubNSW;
- OpcNUW = &NumSubNUW;
- break;
- case Instruction::Mul:
- OpcNW = &NumMulNW;
- OpcNSW = &NumMulNSW;
- OpcNUW = &NumMulNUW;
- break;
- case Instruction::Shl:
- OpcNW = &NumShlNW;
- OpcNSW = &NumShlNSW;
- OpcNUW = &NumShlNUW;
- break;
- default:
- llvm_unreachable("Will not be called with other binops");
- }
-
- auto *Inst = dyn_cast<Instruction>(V);
- if (NewNSW) {
- ++NumNW;
- ++*OpcNW;
- ++NumNSW;
- ++*OpcNSW;
- if (Inst)
- Inst->setHasNoSignedWrap();
- }
- if (NewNUW) {
- ++NumNW;
- ++*OpcNW;
- ++NumNUW;
- ++*OpcNUW;
- if (Inst)
- Inst->setHasNoUnsignedWrap();
- }
-}
-
-static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI);
-
-// Rewrite this with.overflow intrinsic as non-overflowing.
-static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) {
- IRBuilder<> B(WO);
- Instruction::BinaryOps Opcode = WO->getBinaryOp();
- bool NSW = WO->isSigned();
- bool NUW = !WO->isSigned();
-
- Value *NewOp =
- B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName());
- setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW);
-
- StructType *ST = cast<StructType>(WO->getType());
- Constant *Struct = ConstantStruct::get(ST,
- { UndefValue::get(ST->getElementType(0)),
- ConstantInt::getFalse(ST->getElementType(1)) });
- Value *NewI = B.CreateInsertValue(Struct, NewOp, 0);
- WO->replaceAllUsesWith(NewI);
- WO->eraseFromParent();
- ++NumOverflows;
-
- // See if we can infer the other no-wrap too.
- if (auto *BO = dyn_cast<BinaryOperator>(NewOp))
- processBinOp(BO, LVI);
-}
-
-static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) {
- Instruction::BinaryOps Opcode = SI->getBinaryOp();
- bool NSW = SI->isSigned();
- bool NUW = !SI->isSigned();
- BinaryOperator *BinOp = BinaryOperator::Create(
- Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI);
- BinOp->setDebugLoc(SI->getDebugLoc());
- setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW);
-
- SI->replaceAllUsesWith(BinOp);
- SI->eraseFromParent();
- ++NumSaturating;
-
- // See if we can infer the other no-wrap too.
- if (auto *BO = dyn_cast<BinaryOperator>(BinOp))
- processBinOp(BO, LVI);
-}
-
-/// Infer nonnull attributes for the arguments at the specified callsite.
-static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
-
- if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) {
- if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
- processOverflowIntrinsic(WO, LVI);
- return true;
- }
- }
-
- if (auto *SI = dyn_cast<SaturatingInst>(&CB)) {
- if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) {
- processSaturatingInst(SI, LVI);
- return true;
- }
- }
-
+ ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ BO->getBinaryOp(), RRange, BO->getNoWrapKind());
+ return NWRegion.contains(LRange);
+}
+
+static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode,
+ bool NewNSW, bool NewNUW) {
+ Statistic *OpcNW, *OpcNSW, *OpcNUW;
+ switch (Opcode) {
+ case Instruction::Add:
+ OpcNW = &NumAddNW;
+ OpcNSW = &NumAddNSW;
+ OpcNUW = &NumAddNUW;
+ break;
+ case Instruction::Sub:
+ OpcNW = &NumSubNW;
+ OpcNSW = &NumSubNSW;
+ OpcNUW = &NumSubNUW;
+ break;
+ case Instruction::Mul:
+ OpcNW = &NumMulNW;
+ OpcNSW = &NumMulNSW;
+ OpcNUW = &NumMulNUW;
+ break;
+ case Instruction::Shl:
+ OpcNW = &NumShlNW;
+ OpcNSW = &NumShlNSW;
+ OpcNUW = &NumShlNUW;
+ break;
+ default:
+ llvm_unreachable("Will not be called with other binops");
+ }
+
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (NewNSW) {
+ ++NumNW;
+ ++*OpcNW;
+ ++NumNSW;
+ ++*OpcNSW;
+ if (Inst)
+ Inst->setHasNoSignedWrap();
+ }
+ if (NewNUW) {
+ ++NumNW;
+ ++*OpcNW;
+ ++NumNUW;
+ ++*OpcNUW;
+ if (Inst)
+ Inst->setHasNoUnsignedWrap();
+ }
+}
+
+static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI);
+
+// Rewrite this with.overflow intrinsic as non-overflowing.
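+// For illustration (hypothetical IR): a call
+//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+// that is proven not to overflow becomes, roughly,
+//   %sum = add nsw i32 %a, %b
+//   %s   = insertvalue { i32, i1 } { i32 undef, i1 false }, i32 %sum, 0
+// so the overflow bit is a constant false.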
+static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) {
+ IRBuilder<> B(WO);
+ Instruction::BinaryOps Opcode = WO->getBinaryOp();
+ bool NSW = WO->isSigned();
+ bool NUW = !WO->isSigned();
+
+ Value *NewOp =
+ B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName());
+ setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW);
+
+ StructType *ST = cast<StructType>(WO->getType());
+ Constant *Struct = ConstantStruct::get(ST,
+ { UndefValue::get(ST->getElementType(0)),
+ ConstantInt::getFalse(ST->getElementType(1)) });
+ Value *NewI = B.CreateInsertValue(Struct, NewOp, 0);
+ WO->replaceAllUsesWith(NewI);
+ WO->eraseFromParent();
+ ++NumOverflows;
+
+ // See if we can infer the other no-wrap too.
+ if (auto *BO = dyn_cast<BinaryOperator>(NewOp))
+ processBinOp(BO, LVI);
+}
+
+static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) {
+ Instruction::BinaryOps Opcode = SI->getBinaryOp();
+ bool NSW = SI->isSigned();
+ bool NUW = !SI->isSigned();
+ BinaryOperator *BinOp = BinaryOperator::Create(
+ Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI);
+ BinOp->setDebugLoc(SI->getDebugLoc());
+ setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW);
+
+ SI->replaceAllUsesWith(BinOp);
+ SI->eraseFromParent();
+ ++NumSaturating;
+
+ // See if we can infer the other no-wrap too.
+ if (auto *BO = dyn_cast<BinaryOperator>(BinOp))
+ processBinOp(BO, LVI);
+}
+
+/// Infer nonnull attributes for the arguments at the specified callsite.
+static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
+
+ if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) {
+ if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
+ processOverflowIntrinsic(WO, LVI);
+ return true;
+ }
+ }
+
+ if (auto *SI = dyn_cast<SaturatingInst>(&CB)) {
+ if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) {
+ processSaturatingInst(SI, LVI);
+ return true;
+ }
+ }
+
bool Changed = false;
- // Deopt bundle operands are intended to capture state with minimal
-  // perturbation of the code otherwise. If we can find a constant value for
-  // any such operand and remove a use of the original value, that's
-  // desirable since it may allow further optimization of that value (e.g. via
-  // single-use rules in instcombine). Since deopt uses tend to, idiomatically,
-  // appear along rare conditional paths, it is reasonably likely that we have
-  // a conditional fact with which LVI can fold.
- if (auto DeoptBundle = CB.getOperandBundle(LLVMContext::OB_deopt)) {
- for (const Use &ConstU : DeoptBundle->Inputs) {
- Use &U = const_cast<Use&>(ConstU);
- Value *V = U.get();
- if (V->getType()->isVectorTy()) continue;
- if (isa<Constant>(V)) continue;
-
+ // Deopt bundle operands are intended to capture state with minimal
+  // perturbation of the code otherwise. If we can find a constant value for
+  // any such operand and remove a use of the original value, that's
+  // desirable since it may allow further optimization of that value (e.g. via
+  // single-use rules in instcombine). Since deopt uses tend to, idiomatically,
+  // appear along rare conditional paths, it is reasonably likely that we have
+  // a conditional fact with which LVI can fold.
+ if (auto DeoptBundle = CB.getOperandBundle(LLVMContext::OB_deopt)) {
+ for (const Use &ConstU : DeoptBundle->Inputs) {
+ Use &U = const_cast<Use&>(ConstU);
+ Value *V = U.get();
+ if (V->getType()->isVectorTy()) continue;
+ if (isa<Constant>(V)) continue;
+
Constant *C = LVI->getConstant(V, &CB);
- if (!C) continue;
- U.set(C);
+ if (!C) continue;
+ U.set(C);
Changed = true;
- }
- }
-
+ }
+ }
+
SmallVector<unsigned, 4> ArgNos;
unsigned ArgNo = 0;
- for (Value *V : CB.args()) {
- PointerType *Type = dyn_cast<PointerType>(V->getType());
- // Try to mark pointer typed parameters as non-null. We skip the
- // relatively expensive analysis for constants which are obviously either
- // null or non-null to start with.
- if (Type && !CB.paramHasAttr(ArgNo, Attribute::NonNull) &&
- !isa<Constant>(V) &&
- LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
- ConstantPointerNull::get(Type),
- &CB) == LazyValueInfo::False)
- ArgNos.push_back(ArgNo);
- ArgNo++;
- }
-
- assert(ArgNo == CB.arg_size() && "sanity check");
-
- if (ArgNos.empty())
+ for (Value *V : CB.args()) {
+ PointerType *Type = dyn_cast<PointerType>(V->getType());
+ // Try to mark pointer typed parameters as non-null. We skip the
+ // relatively expensive analysis for constants which are obviously either
+ // null or non-null to start with.
+ if (Type && !CB.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ !isa<Constant>(V) &&
+ LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
+ ConstantPointerNull::get(Type),
+ &CB) == LazyValueInfo::False)
+ ArgNos.push_back(ArgNo);
+ ArgNo++;
+ }
+
+ assert(ArgNo == CB.arg_size() && "sanity check");
+
+ if (ArgNos.empty())
return Changed;
-
- AttributeList AS = CB.getAttributes();
- LLVMContext &Ctx = CB.getContext();
- AS = AS.addParamAttribute(Ctx, ArgNos,
- Attribute::get(Ctx, Attribute::NonNull));
- CB.setAttributes(AS);
-
- return true;
-}
-
+
+ AttributeList AS = CB.getAttributes();
+ LLVMContext &Ctx = CB.getContext();
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
+ CB.setAttributes(AS);
+
+ return true;
+}
+
static bool isNonNegative(Value *V, LazyValueInfo *LVI, Instruction *CxtI) {
Constant *Zero = ConstantInt::get(V->getType(), 0);
auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, V, Zero, CxtI);
@@ -599,7 +599,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
for (auto I : zip(Instr->operands(), CRs)) {
std::get<1>(I) = LVI->getConstantRange(std::get<0>(I), Instr);
MinSignedBits = std::max(std::get<1>(I)->getMinSignedBits(), MinSignedBits);
- }
+ }
 // sdiv/srem is UB if divisor is -1 and dividend is INT_MIN, so unless we can
// prove that such a combination is impossible, we need to bump the bitwidth.
@@ -631,58 +631,58 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
Instr->replaceAllUsesWith(Sext);
Instr->eraseFromParent();
- return true;
-}
-
-/// Try to shrink a udiv/urem's width down to the smallest power of two that's
-/// sufficient to contain its operands.
-static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
- assert(Instr->getOpcode() == Instruction::UDiv ||
- Instr->getOpcode() == Instruction::URem);
- if (Instr->getType()->isVectorTy())
- return false;
-
- // Find the smallest power of two bitwidth that's sufficient to hold Instr's
- // operands.
+ return true;
+}
+
+/// Try to shrink a udiv/urem's width down to the smallest power of two that's
+/// sufficient to contain its operands.
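+/// For illustration (hypothetical IR): if both operands of
+///   %q = udiv i64 %a, %b
+/// are known to be below 200, only 8 active bits are needed, so the udiv is
+/// rewritten as a trunc of each operand to i8, an i8 udiv, and a zext of the
+/// result back to i64.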
+static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+ assert(Instr->getOpcode() == Instruction::UDiv ||
+ Instr->getOpcode() == Instruction::URem);
+ if (Instr->getType()->isVectorTy())
+ return false;
+
+ // Find the smallest power of two bitwidth that's sufficient to hold Instr's
+ // operands.
 // What is the smallest bit width that can accommodate the entire value ranges
// of both of the operands?
unsigned MaxActiveBits = 0;
- for (Value *Operand : Instr->operands()) {
+ for (Value *Operand : Instr->operands()) {
ConstantRange CR = LVI->getConstantRange(Operand, Instr);
MaxActiveBits = std::max(CR.getActiveBits(), MaxActiveBits);
- }
- // Don't shrink below 8 bits wide.
+ }
+ // Don't shrink below 8 bits wide.
unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
- // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
- // two.
+ // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+ // two.
if (NewWidth >= Instr->getType()->getIntegerBitWidth())
- return false;
-
+ return false;
+
++NumUDivURemsNarrowed;
- IRBuilder<> B{Instr};
- auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
- auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy,
- Instr->getName() + ".lhs.trunc");
- auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy,
- Instr->getName() + ".rhs.trunc");
- auto *BO = B.CreateBinOp(Instr->getOpcode(), LHS, RHS, Instr->getName());
- auto *Zext = B.CreateZExt(BO, Instr->getType(), Instr->getName() + ".zext");
- if (auto *BinOp = dyn_cast<BinaryOperator>(BO))
- if (BinOp->getOpcode() == Instruction::UDiv)
- BinOp->setIsExact(Instr->isExact());
-
- Instr->replaceAllUsesWith(Zext);
- Instr->eraseFromParent();
- return true;
-}
-
-static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ IRBuilder<> B{Instr};
+ auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
+ auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy,
+ Instr->getName() + ".lhs.trunc");
+ auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy,
+ Instr->getName() + ".rhs.trunc");
+ auto *BO = B.CreateBinOp(Instr->getOpcode(), LHS, RHS, Instr->getName());
+ auto *Zext = B.CreateZExt(BO, Instr->getType(), Instr->getName() + ".zext");
+ if (auto *BinOp = dyn_cast<BinaryOperator>(BO))
+ if (BinOp->getOpcode() == Instruction::UDiv)
+ BinOp->setIsExact(Instr->isExact());
+
+ Instr->replaceAllUsesWith(Zext);
+ Instr->eraseFromParent();
+ return true;
+}
+
+static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
assert(SDI->getOpcode() == Instruction::SRem);
if (SDI->getType()->isVectorTy())
- return false;
-
+ return false;
+
struct Operand {
Value *V;
Domain D;
@@ -698,7 +698,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
}
// We know domains of both of the operands!
- ++NumSRems;
+ ++NumSRems;
// We need operands to be non-negative, so negate each one that isn't.
for (Operand &Op : Ops) {
@@ -721,24 +721,24 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
SDI->replaceAllUsesWith(Res);
- SDI->eraseFromParent();
-
+ SDI->eraseFromParent();
+
// Try to simplify our new urem.
processUDivOrURem(URem, LVI);
-
- return true;
-}
-
-/// See if LazyValueInfo's ability to exploit edge conditions or range
+
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
/// information is sufficient to prove the signs of both operands of this SDiv.
/// If this is the case, replace the SDiv with a UDiv. Even for local
-/// conditions, this can sometimes prove conditions instcombine can't by
-/// exploiting range information.
-static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
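+/// For illustration (hypothetical IR): if %a is known non-negative and %b is
+/// known negative on this path, then
+///   %d = sdiv i32 %a, %b
+/// is rewritten as a udiv of %a and the negation of %b, followed by a
+/// negation of the result, because the signs of both operands are known.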
+static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
assert(SDI->getOpcode() == Instruction::SDiv);
if (SDI->getType()->isVectorTy())
- return false;
-
+ return false;
+
struct Operand {
Value *V;
Domain D;
@@ -754,7 +754,7 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
}
// We know domains of both of the operands!
- ++NumSDivs;
+ ++NumSDivs;
// We need operands to be non-negative, so negate each one that isn't.
for (Operand &Op : Ops) {
@@ -778,14 +778,14 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
SDI->replaceAllUsesWith(Res);
- SDI->eraseFromParent();
-
- // Try to simplify our new udiv.
+ SDI->eraseFromParent();
+
+ // Try to simplify our new udiv.
processUDivOrURem(UDiv, LVI);
-
- return true;
-}
-
+
+ return true;
+}
+
static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
assert(Instr->getOpcode() == Instruction::SDiv ||
Instr->getOpcode() == Instruction::SRem);
@@ -803,234 +803,234 @@ static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
return narrowSDivOrSRem(Instr, LVI);
}
-static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy())
- return false;
-
+static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy())
+ return false;
+
if (!isNonNegative(SDI->getOperand(0), LVI, SDI))
- return false;
-
- ++NumAShrs;
- auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
- SDI->getName(), SDI);
- BO->setDebugLoc(SDI->getDebugLoc());
- BO->setIsExact(SDI->isExact());
- SDI->replaceAllUsesWith(BO);
- SDI->eraseFromParent();
-
- return true;
-}
-
-static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy())
- return false;
-
- Value *Base = SDI->getOperand(0);
-
+ return false;
+
+ ++NumAShrs;
+ auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setDebugLoc(SDI->getDebugLoc());
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy())
+ return false;
+
+ Value *Base = SDI->getOperand(0);
+
if (!isNonNegative(Base, LVI, SDI))
- return false;
-
- ++NumSExt;
- auto *ZExt =
- CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI);
- ZExt->setDebugLoc(SDI->getDebugLoc());
- SDI->replaceAllUsesWith(ZExt);
- SDI->eraseFromParent();
-
- return true;
-}
-
-static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
- using OBO = OverflowingBinaryOperator;
-
- if (DontAddNoWrapFlags)
- return false;
-
- if (BinOp->getType()->isVectorTy())
- return false;
-
- bool NSW = BinOp->hasNoSignedWrap();
- bool NUW = BinOp->hasNoUnsignedWrap();
- if (NSW && NUW)
- return false;
-
- Instruction::BinaryOps Opcode = BinOp->getOpcode();
- Value *LHS = BinOp->getOperand(0);
- Value *RHS = BinOp->getOperand(1);
-
+ return false;
+
+ ++NumSExt;
+ auto *ZExt =
+ CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI);
+ ZExt->setDebugLoc(SDI->getDebugLoc());
+ SDI->replaceAllUsesWith(ZExt);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
+ using OBO = OverflowingBinaryOperator;
+
+ if (DontAddNoWrapFlags)
+ return false;
+
+ if (BinOp->getType()->isVectorTy())
+ return false;
+
+ bool NSW = BinOp->hasNoSignedWrap();
+ bool NUW = BinOp->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ return false;
+
+ Instruction::BinaryOps Opcode = BinOp->getOpcode();
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+
ConstantRange LRange = LVI->getConstantRange(LHS, BinOp);
ConstantRange RRange = LVI->getConstantRange(RHS, BinOp);
-
- bool Changed = false;
- bool NewNUW = false, NewNSW = false;
- if (!NUW) {
- ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
- Opcode, RRange, OBO::NoUnsignedWrap);
- NewNUW = NUWRange.contains(LRange);
- Changed |= NewNUW;
- }
- if (!NSW) {
- ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
- Opcode, RRange, OBO::NoSignedWrap);
- NewNSW = NSWRange.contains(LRange);
- Changed |= NewNSW;
- }
-
- setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW);
-
- return Changed;
-}
-
-static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
- if (BinOp->getType()->isVectorTy())
- return false;
-
- // Pattern match (and lhs, C) where C includes a superset of bits which might
- // be set in lhs. This is a common truncation idiom created by instcombine.
- Value *LHS = BinOp->getOperand(0);
- ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1));
- if (!RHS || !RHS->getValue().isMask())
- return false;
-
- // We can only replace the AND with LHS based on range info if the range does
- // not include undef.
- ConstantRange LRange =
+
+ bool Changed = false;
+ bool NewNUW = false, NewNSW = false;
+ if (!NUW) {
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ Opcode, RRange, OBO::NoUnsignedWrap);
+ NewNUW = NUWRange.contains(LRange);
+ Changed |= NewNUW;
+ }
+ if (!NSW) {
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ Opcode, RRange, OBO::NoSignedWrap);
+ NewNSW = NSWRange.contains(LRange);
+ Changed |= NewNSW;
+ }
+
+ setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW);
+
+ return Changed;
+}
+
+static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
+ if (BinOp->getType()->isVectorTy())
+ return false;
+
+ // Pattern match (and lhs, C) where C includes a superset of bits which might
+ // be set in lhs. This is a common truncation idiom created by instcombine.
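+  // For illustration (hypothetical IR): in
+  //   %r = and i32 %x, 255
+  // where LVI proves %x lies in [0, 256) and the range does not come from
+  // undef, the mask changes nothing and %r is replaced by %x directly.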
+ Value *LHS = BinOp->getOperand(0);
+ ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!RHS || !RHS->getValue().isMask())
+ return false;
+
+ // We can only replace the AND with LHS based on range info if the range does
+ // not include undef.
+ ConstantRange LRange =
LVI->getConstantRange(LHS, BinOp, /*UndefAllowed=*/false);
- if (!LRange.getUnsignedMax().ule(RHS->getValue()))
- return false;
-
- BinOp->replaceAllUsesWith(LHS);
- BinOp->eraseFromParent();
- NumAnd++;
- return true;
-}
-
-
-static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
+ if (!LRange.getUnsignedMax().ule(RHS->getValue()))
+ return false;
+
+ BinOp->replaceAllUsesWith(LHS);
+ BinOp->eraseFromParent();
+ NumAnd++;
+ return true;
+}
+
+
+static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At))
- return C;
-
- // TODO: The following really should be sunk inside LVI's core algorithm, or
- // at least the outer shims around such.
- auto *C = dyn_cast<CmpInst>(V);
- if (!C) return nullptr;
-
- Value *Op0 = C->getOperand(0);
- Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
- if (!Op1) return nullptr;
-
- LazyValueInfo::Tristate Result =
- LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
- if (Result == LazyValueInfo::Unknown)
- return nullptr;
-
- return (Result == LazyValueInfo::True) ?
- ConstantInt::getTrue(C->getContext()) :
- ConstantInt::getFalse(C->getContext());
-}
-
-static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
- const SimplifyQuery &SQ) {
- bool FnChanged = false;
- // Visiting in a pre-order depth-first traversal causes us to simplify early
- // blocks before querying later blocks (which require us to analyze early
- // blocks). Eagerly simplifying shallow blocks means there is strictly less
- // work to do for deep blocks. This also means we don't visit unreachable
- // blocks.
- for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
- bool BBChanged = false;
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *II = &*BI++;
- switch (II->getOpcode()) {
- case Instruction::Select:
- BBChanged |= processSelect(cast<SelectInst>(II), LVI);
- break;
- case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
- break;
- case Instruction::ICmp:
- case Instruction::FCmp:
- BBChanged |= processCmp(cast<CmpInst>(II), LVI);
- break;
- case Instruction::Load:
- case Instruction::Store:
- BBChanged |= processMemAccess(II, LVI);
- break;
- case Instruction::Call:
- case Instruction::Invoke:
- BBChanged |= processCallSite(cast<CallBase>(*II), LVI);
- break;
- case Instruction::SRem:
- case Instruction::SDiv:
+ return C;
+
+ // TODO: The following really should be sunk inside LVI's core algorithm, or
+ // at least the outer shims around such.
+ auto *C = dyn_cast<CmpInst>(V);
+ if (!C) return nullptr;
+
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1) return nullptr;
+
+ LazyValueInfo::Tristate Result =
+ LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
+ if (Result == LazyValueInfo::Unknown)
+ return nullptr;
+
+ return (Result == LazyValueInfo::True) ?
+ ConstantInt::getTrue(C->getContext()) :
+ ConstantInt::getFalse(C->getContext());
+}
+
+static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
+ const SimplifyQuery &SQ) {
+ bool FnChanged = false;
+ // Visiting in a pre-order depth-first traversal causes us to simplify early
+ // blocks before querying later blocks (which require us to analyze early
+ // blocks). Eagerly simplifying shallow blocks means there is strictly less
+ // work to do for deep blocks. This also means we don't visit unreachable
+ // blocks.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ bool BBChanged = false;
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *II = &*BI++;
+ switch (II->getOpcode()) {
+ case Instruction::Select:
+ BBChanged |= processSelect(cast<SelectInst>(II), LVI);
+ break;
+ case Instruction::PHI:
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ BBChanged |= processCmp(cast<CmpInst>(II), LVI);
+ break;
+ case Instruction::Load:
+ case Instruction::Store:
+ BBChanged |= processMemAccess(II, LVI);
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ BBChanged |= processCallSite(cast<CallBase>(*II), LVI);
+ break;
+ case Instruction::SRem:
+ case Instruction::SDiv:
BBChanged |= processSDivOrSRem(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::UDiv:
- case Instruction::URem:
- BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::AShr:
- BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::SExt:
- BBChanged |= processSExt(cast<SExtInst>(II), LVI);
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::Shl:
- BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::And:
- BBChanged |= processAnd(cast<BinaryOperator>(II), LVI);
- break;
- }
- }
-
- Instruction *Term = BB->getTerminator();
- switch (Term->getOpcode()) {
- case Instruction::Switch:
- BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
- break;
- case Instruction::Ret: {
- auto *RI = cast<ReturnInst>(Term);
- // Try to determine the return value if we can. This is mainly here to
- // simplify the writing of unit tests, but also helps to enable IPO by
- // constant folding the return values of callees.
- auto *RetVal = RI->getReturnValue();
- if (!RetVal) break; // handle "ret void"
- if (isa<Constant>(RetVal)) break; // nothing to do
- if (auto *C = getConstantAt(RetVal, RI, LVI)) {
- ++NumReturns;
- RI->replaceUsesOfWith(RetVal, C);
- BBChanged = true;
- }
- }
- }
-
- FnChanged |= BBChanged;
- }
-
- return FnChanged;
-}
-
-bool CorrelatedValuePropagation::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
-}
-
-PreservedAnalyses
-CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
- LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
-
- bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
-
- PreservedAnalyses PA;
+ break;
+ case Instruction::UDiv:
+ case Instruction::URem:
+ BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::AShr:
+ BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::SExt:
+ BBChanged |= processSExt(cast<SExtInst>(II), LVI);
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::And:
+ BBChanged |= processAnd(cast<BinaryOperator>(II), LVI);
+ break;
+ }
+ }
+
+ Instruction *Term = BB->getTerminator();
+ switch (Term->getOpcode()) {
+ case Instruction::Switch:
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
+ break;
+ case Instruction::Ret: {
+ auto *RI = cast<ReturnInst>(Term);
+ // Try to determine the return value if we can. This is mainly here to
+ // simplify the writing of unit tests, but also helps to enable IPO by
+ // constant folding the return values of callees.
+ auto *RetVal = RI->getReturnValue();
+ if (!RetVal) break; // handle "ret void"
+ if (isa<Constant>(RetVal)) break; // nothing to do
+ if (auto *C = getConstantAt(RetVal, RI, LVI)) {
+ ++NumReturns;
+ RI->replaceUsesOfWith(RetVal, C);
+ BBChanged = true;
+ }
+ }
+ }
+
+ FnChanged |= BBChanged;
+ }
+
+ return FnChanged;
+}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
+}
+
+PreservedAnalyses
+CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
+ LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
+
+ PreservedAnalyses PA;
if (!Changed) {
PA = PreservedAnalyses::all();
} else {
@@ -1044,5 +1044,5 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
// LVI, we know that passes after JumpThreading+CVP will not need the result
// of this analysis, so we forcefully discard it early.
PA.abandon<LazyValueAnalysis>();
- return PA;
-}
+ return PA;
+}
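The fold performed by getConstantAt above rests on one idea: if the value information available at the query point already decides a comparison against a constant, the compare can be replaced by true or false. Below is a minimal standalone sketch of that decision, using a hypothetical half-open [Lo, Hi) range in place of LazyValueInfo's real lattice; it illustrates the idea only and is not LLVM's implementation.

#include <cstdint>
#include <optional>

// Hypothetical stand-in for the range a value is known to lie in at the
// query point; not LLVM's API.
struct KnownRange {
  int64_t Lo; // inclusive
  int64_t Hi; // exclusive
};

// Decide the signed compare "X < C" when X is known to lie in [R.Lo, R.Hi).
// Returns true/false when the range settles the predicate, nullopt otherwise.
std::optional<bool> foldSignedLess(const KnownRange &R, int64_t C) {
  if (R.Hi <= C)       // even the largest possible X (R.Hi - 1) is < C
    return true;
  if (R.Lo >= C)       // even the smallest possible X is >= C
    return false;
  return std::nullopt; // the range straddles C: leave the compare alone
}

// Example: with X known to be in [0, 8), "X < 10" folds to true, "X < 0"
// folds to false, and "X < 4" is left in place.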
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp
index 20f04a2e14..d55adf7c2d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp
@@ -1,74 +1,74 @@
-//===- DCE.cpp - Code to perform dead code elimination --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements dead inst elimination and dead code elimination.
-//
-// Dead Inst Elimination performs a single pass over the function removing
-// instructions that are obviously dead. Dead Code Elimination is similar, but
-// it rechecks instructions that were used by removed instructions to see if
-// they are newly dead.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/DCE.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "dce"
-
-STATISTIC(DCEEliminated, "Number of insts removed");
-DEBUG_COUNTER(DCECounter, "dce-transform",
- "Controls which instructions are eliminated");
-
-//===--------------------------------------------------------------------===//
-// RedundantDbgInstElimination pass implementation
-//
-
-namespace {
-struct RedundantDbgInstElimination : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- RedundantDbgInstElimination() : FunctionPass(ID) {
- initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- bool Changed = false;
- for (auto &BB : F)
- Changed |= RemoveRedundantDbgInstrs(&BB);
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- }
-};
-}
-
-char RedundantDbgInstElimination::ID = 0;
-INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim",
- "Redundant Dbg Instruction Elimination", false, false)
-
-Pass *llvm::createRedundantDbgInstEliminationPass() {
- return new RedundantDbgInstElimination();
-}
-
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead. Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DCE.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dce"
+
+STATISTIC(DCEEliminated, "Number of insts removed");
+DEBUG_COUNTER(DCECounter, "dce-transform",
+ "Controls which instructions are eliminated");
+
+//===--------------------------------------------------------------------===//
+// RedundantDbgInstElimination pass implementation
+//
+
+namespace {
+struct RedundantDbgInstElimination : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ RedundantDbgInstElimination() : FunctionPass(ID) {
+ initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ bool Changed = false;
+ for (auto &BB : F)
+ Changed |= RemoveRedundantDbgInstrs(&BB);
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char RedundantDbgInstElimination::ID = 0;
+INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim",
+ "Redundant Dbg Instruction Elimination", false, false)
+
+Pass *llvm::createRedundantDbgInstEliminationPass() {
+ return new RedundantDbgInstElimination();
+}
+
PreservedAnalyses
RedundantDbgInstEliminationPass::run(Function &F, FunctionAnalysisManager &AM) {
bool Changed = false;
@@ -81,103 +81,103 @@ RedundantDbgInstEliminationPass::run(Function &F, FunctionAnalysisManager &AM) {
return PA;
}
-//===--------------------------------------------------------------------===//
-// DeadCodeElimination pass implementation
-//
-
-static bool DCEInstruction(Instruction *I,
- SmallSetVector<Instruction *, 16> &WorkList,
- const TargetLibraryInfo *TLI) {
- if (isInstructionTriviallyDead(I, TLI)) {
- if (!DebugCounter::shouldExecute(DCECounter))
- return false;
-
- salvageDebugInfo(*I);
- salvageKnowledge(I);
-
- // Null out all of the instruction's operands to see if any operand becomes
- // dead as we go.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *OpV = I->getOperand(i);
- I->setOperand(i, nullptr);
-
- if (!OpV->use_empty() || I == OpV)
- continue;
-
- // If the operand is an instruction that became dead as we nulled out the
- // operand, and if it is 'trivially' dead, delete it in a future loop
- // iteration.
- if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI, TLI))
- WorkList.insert(OpI);
- }
-
- I->eraseFromParent();
- ++DCEEliminated;
- return true;
- }
- return false;
-}
-
-static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
- bool MadeChange = false;
- SmallSetVector<Instruction *, 16> WorkList;
- // Iterate over the original function, only adding insts to the worklist
- // if they actually need to be revisited. This avoids having to pre-init
- // the worklist with the entire function's worth of instructions.
- for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) {
- Instruction *I = &*FI;
- ++FI;
-
- // We're visiting this instruction now, so make sure it's not in the
- // worklist from an earlier visit.
- if (!WorkList.count(I))
- MadeChange |= DCEInstruction(I, WorkList, TLI);
- }
-
- while (!WorkList.empty()) {
- Instruction *I = WorkList.pop_back_val();
- MadeChange |= DCEInstruction(I, WorkList, TLI);
- }
- return MadeChange;
-}
-
-PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+//===--------------------------------------------------------------------===//
+// DeadCodeElimination pass implementation
+//
+
+static bool DCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ if (!DebugCounter::shouldExecute(DCECounter))
+ return false;
+
+ salvageDebugInfo(*I);
+ salvageKnowledge(I);
+
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ WorkList.insert(OpI);
+ }
+
+ I->eraseFromParent();
+ ++DCEEliminated;
+ return true;
+ }
+ return false;
+}
+
+static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ SmallSetVector<Instruction *, 16> WorkList;
+ // Iterate over the original function, only adding insts to the worklist
+ // if they actually need to be revisited. This avoids having to pre-init
+ // the worklist with the entire function's worth of instructions.
+ for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) {
+ Instruction *I = &*FI;
+ ++FI;
+
+ // We're visiting this instruction now, so make sure it's not in the
+ // worklist from an earlier visit.
+ if (!WorkList.count(I))
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+ return MadeChange;
+}
+
+PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!eliminateDeadCode(F, &AM.getResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
-struct DCELegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- DCELegacyPass() : FunctionPass(ID) {
- initializeDCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct DCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DCELegacyPass() : FunctionPass(ID) {
+ initializeDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- return eliminateDeadCode(F, TLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
+
+ return eliminateDeadCode(F, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-}
-
-char DCELegacyPass::ID = 0;
-INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false)
-
-FunctionPass *llvm::createDeadCodeEliminationPass() {
- return new DCELegacyPass();
-}
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char DCELegacyPass::ID = 0;
+INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+ return new DCELegacyPass();
+}
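The worklist scheme used by DCEInstruction and eliminateDeadCode is easiest to see in isolation: delete a trivially dead instruction, then revisit only the operands that may have just become dead instead of rescanning the whole function. The sketch below reproduces that pattern over a toy instruction type; the Inst struct and all names are hypothetical stand-ins for LLVM's Instruction and its use lists.

#include <unordered_set>
#include <vector>

// Toy "instruction": it uses other instructions and may have side effects.
struct Inst {
  std::vector<Inst *> Operands;
  std::unordered_set<Inst *> Users;
  bool HasSideEffects = false;
  bool Erased = false;
};

static bool triviallyDead(const Inst *I) {
  return !I->Erased && I->Users.empty() && !I->HasSideEffects;
}

// Erase I and push any operand that just became dead onto the worklist,
// mirroring the "null out operands, then revisit" loop in DCEInstruction.
static void eraseAndEnqueue(Inst *I, std::vector<Inst *> &WorkList) {
  I->Erased = true;
  for (Inst *Op : I->Operands) {
    Op->Users.erase(I);       // drop the use, like setOperand(i, nullptr)
    if (triviallyDead(Op))
      WorkList.push_back(Op); // handle it in a later iteration
  }
  I->Operands.clear();
}

// One initial sweep plus worklist draining, as in eliminateDeadCode.
static bool eliminateDead(std::vector<Inst *> &Function) {
  bool Changed = false;
  std::vector<Inst *> WorkList;
  for (Inst *I : Function)
    if (triviallyDead(I)) {
      eraseAndEnqueue(I, WorkList);
      Changed = true;
    }
  while (!WorkList.empty()) {
    Inst *I = WorkList.back();
    WorkList.pop_back();
    if (triviallyDead(I)) {   // it may already have been erased via another path
      eraseAndEnqueue(I, WorkList);
      Changed = true;
    }
  }
  return Changed;
}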
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 3d34beb8d9..2979225c60 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1,134 +1,134 @@
-//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a trivial dead store elimination that only considers
-// basic-block local redundant stores.
-//
-// FIXME: This should eventually be extended to be a post-dominator tree
-// traversal. Doing so would be pretty trivial.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "dse"
-
-STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
-STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
-STATISTIC(NumFastStores, "Number of stores deleted");
-STATISTIC(NumFastOther, "Number of other instrs removed");
-STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
-STATISTIC(NumModifiedStores, "Number of stores modified");
-STATISTIC(NumCFGChecks, "Number of CFG checks performed");
-STATISTIC(NumCFGTries, "Number of CFG-based elimination attempts");
-STATISTIC(NumCFGSuccess, "Number of successful CFG-based eliminations");
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal. Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "dse"
+
+STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
+STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther, "Number of other instrs removed");
+STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
+STATISTIC(NumModifiedStores, "Number of stores modified");
+STATISTIC(NumCFGChecks, "Number of CFG checks performed");
+STATISTIC(NumCFGTries, "Number of CFG-based elimination attempts");
+STATISTIC(NumCFGSuccess, "Number of successful CFG-based eliminations");
STATISTIC(NumGetDomMemoryDefPassed,
"Number of times a valid candidate is returned from getDomMemoryDef");
STATISTIC(NumDomMemDefChecks,
"Number iterations check for reads in getDomMemoryDef");
-
-DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
- "Controls which MemoryDefs are eliminated.");
-
-static cl::opt<bool>
-EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
- cl::init(true), cl::Hidden,
- cl::desc("Enable partial-overwrite tracking in DSE"));
-
-static cl::opt<bool>
-EnablePartialStoreMerging("enable-dse-partial-store-merging",
- cl::init(true), cl::Hidden,
- cl::desc("Enable partial store merging in DSE"));
-
-static cl::opt<bool>
+
+DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
+ "Controls which MemoryDefs are eliminated.");
+
+static cl::opt<bool>
+EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial-overwrite tracking in DSE"));
+
+static cl::opt<bool>
+EnablePartialStoreMerging("enable-dse-partial-store-merging",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial store merging in DSE"));
+
+static cl::opt<bool>
EnableMemorySSA("enable-dse-memoryssa", cl::init(true), cl::Hidden,
- cl::desc("Use the new MemorySSA-backed DSE."));
-
-static cl::opt<unsigned>
+ cl::desc("Use the new MemorySSA-backed DSE."));
+
+static cl::opt<unsigned>
MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden,
- cl::desc("The number of memory instructions to scan for "
- "dead store elimination (default = 100)"));
+ cl::desc("The number of memory instructions to scan for "
+ "dead store elimination (default = 100)"));
static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
"dse-memoryssa-walklimit", cl::init(90), cl::Hidden,
cl::desc("The maximum number of steps while walking upwards to find "
"MemoryDefs that may be killed (default = 90)"));
-
+
static cl::opt<unsigned> MemorySSAPartialStoreLimit(
"dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
cl::desc("The maximum number candidates that only partially overwrite the "
"killing MemoryDef to consider"
" (default = 5)"));
-static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
- "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
- cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
- "other stores per basic block (default = 5000)"));
-
+static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
+ "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
+ cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
+ "other stores per basic block (default = 5000)"));
+
static cl::opt<unsigned> MemorySSASameBBStepCost(
"dse-memoryssa-samebb-cost", cl::init(1), cl::Hidden,
cl::desc(
@@ -142,273 +142,273 @@ static cl::opt<unsigned>
"block than the killing MemoryDef"
"(default = 5)"));
-static cl::opt<unsigned> MemorySSAPathCheckLimit(
- "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
- cl::desc("The maximum number of blocks to check when trying to prove that "
- "all paths to an exit go through a killing block (default = 50)"));
-
-//===----------------------------------------------------------------------===//
-// Helper functions
-//===----------------------------------------------------------------------===//
-using OverlapIntervalsTy = std::map<int64_t, int64_t>;
-using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
-
-/// Delete this instruction. Before we do, go through and zero out all the
-/// operands of this instruction. If any of them become dead, delete them and
-/// the computation tree that feeds them.
-/// If ValueSet is non-null, remove any deleted instructions from it as well.
-static void
-deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
- MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst,
- SmallSetVector<const Value *, 16> *ValueSet = nullptr) {
- SmallVector<Instruction*, 32> NowDeadInsts;
-
- NowDeadInsts.push_back(I);
- --NumFastOther;
-
- // Keeping the iterator straight is a pain, so we let this routine tell the
- // caller what the next instruction is after we're done mucking about.
- BasicBlock::iterator NewIter = *BBI;
-
- // Before we touch this instruction, remove it from memdep!
- do {
- Instruction *DeadInst = NowDeadInsts.pop_back_val();
- // Mark the DeadInst as dead in the list of throwable instructions.
- auto It = ThrowableInst.find(DeadInst);
- if (It != ThrowableInst.end())
- ThrowableInst[It->first] = false;
- ++NumFastOther;
-
- // Try to preserve debug information attached to the dead instruction.
- salvageDebugInfo(*DeadInst);
- salvageKnowledge(DeadInst);
-
- // This instruction is dead, zap it, in stages. Start by removing it from
- // MemDep, which needs to know the operands and needs it to be in the
- // function.
- MD.removeInstruction(DeadInst);
-
- for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
- Value *Op = DeadInst->getOperand(op);
- DeadInst->setOperand(op, nullptr);
-
- // If this operand just became dead, add it to the NowDeadInsts list.
- if (!Op->use_empty()) continue;
-
- if (Instruction *OpI = dyn_cast<Instruction>(Op))
- if (isInstructionTriviallyDead(OpI, &TLI))
- NowDeadInsts.push_back(OpI);
- }
-
- if (ValueSet) ValueSet->remove(DeadInst);
- IOL.erase(DeadInst);
-
- if (NewIter == DeadInst->getIterator())
- NewIter = DeadInst->eraseFromParent();
- else
- DeadInst->eraseFromParent();
- } while (!NowDeadInsts.empty());
- *BBI = NewIter;
- // Pop dead entries from back of ThrowableInst till we find an alive entry.
- while (!ThrowableInst.empty() && !ThrowableInst.back().second)
- ThrowableInst.pop_back();
-}
-
-/// Does this instruction write some memory? This only returns true for things
-/// that we can analyze with other helpers below.
-static bool hasAnalyzableMemoryWrite(Instruction *I,
- const TargetLibraryInfo &TLI) {
- if (isa<StoreInst>(I))
- return true;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
+static cl::opt<unsigned> MemorySSAPathCheckLimit(
+ "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
+ cl::desc("The maximum number of blocks to check when trying to prove that "
+ "all paths to an exit go through a killing block (default = 50)"));
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+using OverlapIntervalsTy = std::map<int64_t, int64_t>;
+using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
+
+/// Delete this instruction. Before we do, go through and zero out all the
+/// operands of this instruction. If any of them become dead, delete them and
+/// the computation tree that feeds them.
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+static void
+deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
+ MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst,
+ SmallSetVector<const Value *, 16> *ValueSet = nullptr) {
+ SmallVector<Instruction*, 32> NowDeadInsts;
+
+ NowDeadInsts.push_back(I);
+ --NumFastOther;
+
+ // Keeping the iterator straight is a pain, so we let this routine tell the
+ // caller what the next instruction is after we're done mucking about.
+ BasicBlock::iterator NewIter = *BBI;
+
+ // Before we touch this instruction, remove it from memdep!
+ do {
+ Instruction *DeadInst = NowDeadInsts.pop_back_val();
+ // Mark the DeadInst as dead in the list of throwable instructions.
+ auto It = ThrowableInst.find(DeadInst);
+ if (It != ThrowableInst.end())
+ ThrowableInst[It->first] = false;
+ ++NumFastOther;
+
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+ salvageKnowledge(DeadInst);
+
+ // This instruction is dead, zap it, in stages. Start by removing it from
+ // MemDep, which needs to know the operands and needs it to be in the
+ // function.
+ MD.removeInstruction(DeadInst);
+
+ for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+ Value *Op = DeadInst->getOperand(op);
+ DeadInst->setOperand(op, nullptr);
+
+ // If this operand just became dead, add it to the NowDeadInsts list.
+ if (!Op->use_empty()) continue;
+
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ if (isInstructionTriviallyDead(OpI, &TLI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ if (ValueSet) ValueSet->remove(DeadInst);
+ IOL.erase(DeadInst);
+
+ if (NewIter == DeadInst->getIterator())
+ NewIter = DeadInst->eraseFromParent();
+ else
+ DeadInst->eraseFromParent();
+ } while (!NowDeadInsts.empty());
+ *BBI = NewIter;
+ // Pop dead entries from back of ThrowableInst till we find an alive entry.
+ while (!ThrowableInst.empty() && !ThrowableInst.back().second)
+ ThrowableInst.pop_back();
+}
+
+/// Does this instruction write some memory? This only returns true for things
+/// that we can analyze with other helpers below.
+static bool hasAnalyzableMemoryWrite(Instruction *I,
+ const TargetLibraryInfo &TLI) {
+ if (isa<StoreInst>(I))
+ return true;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
case Intrinsic::memcpy_inline:
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
- case Intrinsic::init_trampoline:
- case Intrinsic::lifetime_end:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
+ case Intrinsic::init_trampoline:
+ case Intrinsic::lifetime_end:
case Intrinsic::masked_store:
- return true;
- }
- }
- if (auto *CB = dyn_cast<CallBase>(I)) {
- LibFunc LF;
- if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
- switch (LF) {
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
- return true;
- default:
- return false;
- }
- }
- }
- return false;
-}
-
-/// Return a Location stored to by the specified instruction. If isRemovable
-/// returns true, this function and getLocForRead completely describe the memory
-/// operations for this instruction.
+ return true;
+ }
+ }
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ LibFunc LF;
+ if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
+ switch (LF) {
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ return true;
+ default:
+ return false;
+ }
+ }
+ }
+ return false;
+}
+
+/// Return a Location stored to by the specified instruction. If isRemovable
+/// returns true, this function and getLocForRead completely describe the memory
+/// operations for this instruction.
static MemoryLocation getLocForWrite(Instruction *Inst,
const TargetLibraryInfo &TLI) {
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- return MemoryLocation::get(SI);
-
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return MemoryLocation::get(SI);
+
// memcpy/memmove/memset.
if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
return MemoryLocation::getForDest(MI);
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- switch (II->getIntrinsicID()) {
- default:
- return MemoryLocation(); // Unhandled intrinsic.
- case Intrinsic::init_trampoline:
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return MemoryLocation(); // Unhandled intrinsic.
+ case Intrinsic::init_trampoline:
return MemoryLocation::getAfter(II->getArgOperand(0));
case Intrinsic::masked_store:
return MemoryLocation::getForArgument(II, 1, TLI);
- case Intrinsic::lifetime_end: {
- uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
- return MemoryLocation(II->getArgOperand(1), Len);
- }
- }
- }
- if (auto *CB = dyn_cast<CallBase>(Inst))
- // All the supported TLI functions so far happen to have dest as their
- // first argument.
+ case Intrinsic::lifetime_end: {
+ uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ return MemoryLocation(II->getArgOperand(1), Len);
+ }
+ }
+ }
+ if (auto *CB = dyn_cast<CallBase>(Inst))
+ // All the supported TLI functions so far happen to have dest as their
+ // first argument.
return MemoryLocation::getAfter(CB->getArgOperand(0));
- return MemoryLocation();
-}
-
-/// Return the location read by the specified "hasAnalyzableMemoryWrite"
-/// instruction if any.
-static MemoryLocation getLocForRead(Instruction *Inst,
- const TargetLibraryInfo &TLI) {
- assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
-
- // The only instructions that both read and write are the mem transfer
- // instructions (memcpy/memmove).
- if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
- return MemoryLocation::getForSource(MTI);
- return MemoryLocation();
-}
-
-/// If the value of this instruction and the memory it writes to is unused, may
-/// we delete this instruction?
-static bool isRemovable(Instruction *I) {
- // Don't remove volatile/atomic stores.
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isUnordered();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
- case Intrinsic::lifetime_end:
- // Never remove dead lifetime_end's, e.g. because it is followed by a
- // free.
- return false;
- case Intrinsic::init_trampoline:
- // Always safe to remove init_trampoline.
- return true;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
+ return MemoryLocation();
+}
+
+/// Return the location read by the specified "hasAnalyzableMemoryWrite"
+/// instruction if any.
+static MemoryLocation getLocForRead(Instruction *Inst,
+ const TargetLibraryInfo &TLI) {
+ assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
+
+ // The only instructions that both read and write are the mem transfer
+ // instructions (memcpy/memmove).
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
+ return MemoryLocation::getForSource(MTI);
+ return MemoryLocation();
+}
+
+/// If the value of this instruction and the memory it writes to is unused, may
+/// we delete this instruction?
+static bool isRemovable(Instruction *I) {
+ // Don't remove volatile/atomic stores.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
+ case Intrinsic::lifetime_end:
+ // Never remove dead lifetime_end's, e.g. because it is followed by a
+ // free.
+ return false;
+ case Intrinsic::init_trampoline:
+ // Always safe to remove init_trampoline.
+ return true;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
case Intrinsic::memcpy_inline:
- // Don't remove volatile memory intrinsics.
- return !cast<MemIntrinsic>(II)->isVolatile();
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
+ // Don't remove volatile memory intrinsics.
+ return !cast<MemIntrinsic>(II)->isVolatile();
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
case Intrinsic::masked_store:
- return true;
- }
- }
-
- // note: only get here for calls with analyzable writes - i.e. libcalls
- if (auto *CB = dyn_cast<CallBase>(I))
- return CB->use_empty();
-
- return false;
-}
-
-/// Returns true if the end of this instruction can be safely shortened in
-/// length.
-static bool isShortenableAtTheEnd(Instruction *I) {
- // Don't shorten stores for now
- if (isa<StoreInst>(I))
- return false;
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: return false;
- case Intrinsic::memset:
- case Intrinsic::memcpy:
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
- // Do shorten memory intrinsics.
- // FIXME: Add memmove if it's also safe to transform.
- return true;
- }
- }
-
- // Don't shorten libcalls for now.
-
- return false;
-}
-
-/// Returns true if the beginning of this instruction can be safely shortened
-/// in length.
-static bool isShortenableAtTheBeginning(Instruction *I) {
- // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
- // easily done by offsetting the source address.
- return isa<AnyMemSetInst>(I);
-}
-
-/// Return the pointer that is being written to.
+ return true;
+ }
+ }
+
+ // note: only get here for calls with analyzable writes - i.e. libcalls
+ if (auto *CB = dyn_cast<CallBase>(I))
+ return CB->use_empty();
+
+ return false;
+}
+
+/// Returns true if the end of this instruction can be safely shortened in
+/// length.
+static bool isShortenableAtTheEnd(Instruction *I) {
+ // Don't shorten stores for now
+ if (isa<StoreInst>(I))
+ return false;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
+ // Do shorten memory intrinsics.
+ // FIXME: Add memmove if it's also safe to transform.
+ return true;
+ }
+ }
+
+ // Don't shorten libcalls for now.
+
+ return false;
+}
+
+/// Returns true if the beginning of this instruction can be safely shortened
+/// in length.
+static bool isShortenableAtTheBeginning(Instruction *I) {
+ // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
+ // easily done by offsetting the source address.
+ return isa<AnyMemSetInst>(I);
+}
+
+/// Return the pointer that is being written to.
static Value *getStoredPointerOperand(Instruction *I,
const TargetLibraryInfo &TLI) {
- //TODO: factor this to reuse getLocForWrite
+ //TODO: factor this to reuse getLocForWrite
MemoryLocation Loc = getLocForWrite(I, TLI);
- assert(Loc.Ptr &&
- "unable to find pointer written for analyzable instruction?");
- // TODO: most APIs don't expect const Value *
- return const_cast<Value*>(Loc.Ptr);
-}
-
-static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
- const TargetLibraryInfo &TLI,
- const Function *F) {
- uint64_t Size;
- ObjectSizeOpts Opts;
- Opts.NullIsUnknownSize = NullPointerIsDefined(F);
-
- if (getObjectSize(V, Size, DL, &TLI, Opts))
- return Size;
- return MemoryLocation::UnknownSize;
-}
-
-namespace {
-
-enum OverwriteResult {
- OW_Begin,
- OW_Complete,
- OW_End,
- OW_PartialEarlierWithFullLater,
+ assert(Loc.Ptr &&
+ "unable to find pointer written for analyzable instruction?");
+ // TODO: most APIs don't expect const Value *
+ return const_cast<Value*>(Loc.Ptr);
+}
+
+static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
+ const Function *F) {
+ uint64_t Size;
+ ObjectSizeOpts Opts;
+ Opts.NullIsUnknownSize = NullPointerIsDefined(F);
+
+ if (getObjectSize(V, Size, DL, &TLI, Opts))
+ return Size;
+ return MemoryLocation::UnknownSize;
+}
+
+namespace {
+
+enum OverwriteResult {
+ OW_Begin,
+ OW_Complete,
+ OW_End,
+ OW_PartialEarlierWithFullLater,
OW_MaybePartial,
- OW_Unknown
-};
-
-} // end anonymous namespace
-
+ OW_Unknown
+};
+
+} // end anonymous namespace
+
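The OverwriteResult values above are easiest to read as statements about byte intervals. The sketch below classifies a later store against an earlier one purely by interval arithmetic, assuming both sizes are precise and the two pointers are already known to share a base; the real isOverwrite additionally handles aliasing, whole-object overwrites, and the OW_PartialEarlierWithFullLater / OW_MaybePartial cases. All names here are made up for illustration.

#include <cstdint>

enum class Overwrite { Complete, End, Begin, Unknown };

// Classify how a later store [LaterOff, LaterOff+LaterSize) relates to an
// earlier store [EarlierOff, EarlierOff+EarlierSize) on the same base pointer.
// This mirrors only the interval arithmetic of the overwrite check.
Overwrite classify(int64_t EarlierOff, uint64_t EarlierSize,
                   int64_t LaterOff, uint64_t LaterSize) {
  int64_t EarlierEnd = EarlierOff + int64_t(EarlierSize);
  int64_t LaterEnd = LaterOff + int64_t(LaterSize);
  if (LaterOff <= EarlierOff && LaterEnd >= EarlierEnd)
    return Overwrite::Complete;   // earlier store lies fully inside the later one
  if (LaterOff > EarlierOff && LaterOff < EarlierEnd && LaterEnd >= EarlierEnd)
    return Overwrite::End;        // later store clobbers the earlier store's tail
  if (LaterOff <= EarlierOff && LaterEnd > EarlierOff)
    return Overwrite::Begin;      // later store clobbers the earlier store's head
  return Overwrite::Unknown;
}

// Examples: earlier = [0, 8), later = [0, 8)  -> Complete
//           earlier = [0, 8), later = [4, 12) -> End
//           earlier = [4, 12), later = [0, 8) -> Begin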
/// Check if two instructions are masked stores that completely
/// overwrite one another. More specifically, \p Later has to
/// overwrite \p Earlier.
@@ -449,56 +449,56 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
const DataLayout &DL, const TargetLibraryInfo &TLI,
int64_t &EarlierOff, int64_t &LaterOff, AATy &AA,
const Function *F) {
- // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
- // get imprecise values here, though (except for unknown sizes).
+ // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
+ // get imprecise values here, though (except for unknown sizes).
if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
// Masked stores have imprecise locations, but we can reason about them
// to some extent.
return isMaskedStoreOverwrite(LaterI, EarlierI, AA);
}
-
- const uint64_t LaterSize = Later.Size.getValue();
- const uint64_t EarlierSize = Earlier.Size.getValue();
-
- const Value *P1 = Earlier.Ptr->stripPointerCasts();
- const Value *P2 = Later.Ptr->stripPointerCasts();
-
- // If the start pointers are the same, we just have to compare sizes to see if
- // the later store was larger than the earlier store.
- if (P1 == P2 || AA.isMustAlias(P1, P2)) {
- // Make sure that the Later size is >= the Earlier size.
- if (LaterSize >= EarlierSize)
- return OW_Complete;
- }
-
- // Check to see if the later store is to the entire object (either a global,
- // an alloca, or a byval/inalloca argument). If so, then it clearly
- // overwrites any other store to the same object.
+
+ const uint64_t LaterSize = Later.Size.getValue();
+ const uint64_t EarlierSize = Earlier.Size.getValue();
+
+ const Value *P1 = Earlier.Ptr->stripPointerCasts();
+ const Value *P2 = Later.Ptr->stripPointerCasts();
+
+ // If the start pointers are the same, we just have to compare sizes to see if
+ // the later store was larger than the earlier store.
+ if (P1 == P2 || AA.isMustAlias(P1, P2)) {
+ // Make sure that the Later size is >= the Earlier size.
+ if (LaterSize >= EarlierSize)
+ return OW_Complete;
+ }
+
+ // Check to see if the later store is to the entire object (either a global,
+ // an alloca, or a byval/inalloca argument). If so, then it clearly
+ // overwrites any other store to the same object.
const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2);
-
- // If we can't resolve the same pointers to the same object, then we can't
- // analyze them at all.
- if (UO1 != UO2)
- return OW_Unknown;
-
- // If the "Later" store is to a recognizable object, get its size.
- uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
- if (ObjectSize != MemoryLocation::UnknownSize)
- if (ObjectSize == LaterSize && ObjectSize >= EarlierSize)
- return OW_Complete;
-
- // Okay, we have stores to two completely different pointers. Try to
- // decompose the pointer into a "base + constant_offset" form. If the base
- // pointers are equal, then we can reason about the two stores.
- EarlierOff = 0;
- LaterOff = 0;
- const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
- const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
-
- // If the base pointers still differ, we have two completely different stores.
- if (BP1 != BP2)
- return OW_Unknown;
-
+
+ // If we can't resolve the same pointers to the same object, then we can't
+ // analyze them at all.
+ if (UO1 != UO2)
+ return OW_Unknown;
+
+ // If the "Later" store is to a recognizable object, get its size.
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
+ if (ObjectSize != MemoryLocation::UnknownSize)
+ if (ObjectSize == LaterSize && ObjectSize >= EarlierSize)
+ return OW_Complete;
+
+ // Okay, we have stores to two completely different pointers. Try to
+ // decompose the pointer into a "base + constant_offset" form. If the base
+ // pointers are equal, then we can reason about the two stores.
+ EarlierOff = 0;
+ LaterOff = 0;
+ const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
+ const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
+
+ // If the base pointers still differ, we have two completely different stores.
+ if (BP1 != BP2)
+ return OW_Unknown;
+
// The later access completely overlaps the earlier store if and only if
// both start and end of the earlier one is "inside" the later one:
// |<->|--earlier--|<->|
@@ -510,9 +510,9 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
// OR
// |----- earlier -----|
// |<->|---later---|<----->|
- //
- // We have to be careful here as *Off is signed while *.Size is unsigned.
-
+ //
+ // We have to be careful here as *Off is signed while *.Size is unsigned.
+
// Check if the earlier access starts "not before" the later one.
if (EarlierOff >= LaterOff) {
// If the earlier access ends "not after" the later access then the earlier
@@ -552,587 +552,587 @@ static OverwriteResult isPartialOverwrite(const MemoryLocation &Later,
InstOverlapIntervalsTy &IOL) {
const uint64_t LaterSize = Later.Size.getValue();
const uint64_t EarlierSize = Earlier.Size.getValue();
- // We may now overlap, although the overlap is not complete. There might also
- // be other incomplete overlaps, and together, they might cover the complete
- // earlier write.
- // Note: The correctness of this logic depends on the fact that this function
- // is never called with a DepWrite when there are any intervening reads.
- if (EnablePartialOverwriteTracking &&
- LaterOff < int64_t(EarlierOff + EarlierSize) &&
- int64_t(LaterOff + LaterSize) >= EarlierOff) {
-
- // Insert our part of the overlap into the map.
- auto &IM = IOL[DepWrite];
- LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
- << ", " << int64_t(EarlierOff + EarlierSize)
- << ") Later [" << LaterOff << ", "
- << int64_t(LaterOff + LaterSize) << ")\n");
-
- // Make sure that we only insert non-overlapping intervals and combine
- // adjacent intervals. The intervals are stored in the map with the ending
- // offset as the key (in the half-open sense) and the starting offset as
- // the value.
- int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize;
-
- // Find any intervals ending at, or after, LaterIntStart which start
- // before LaterIntEnd.
- auto ILI = IM.lower_bound(LaterIntStart);
- if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
- // This existing interval is overlapped with the current store somewhere
- // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
- // intervals and adjusting our start and end.
- LaterIntStart = std::min(LaterIntStart, ILI->second);
- LaterIntEnd = std::max(LaterIntEnd, ILI->first);
- ILI = IM.erase(ILI);
-
- // Continue erasing and adjusting our end in case other previous
- // intervals are also overlapped with the current store.
- //
- // |--- earlier 1 ---| |--- earlier 2 ---|
- // |------- later---------|
- //
- while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
- assert(ILI->second > LaterIntStart && "Unexpected interval");
- LaterIntEnd = std::max(LaterIntEnd, ILI->first);
- ILI = IM.erase(ILI);
- }
- }
-
- IM[LaterIntEnd] = LaterIntStart;
-
- ILI = IM.begin();
- if (ILI->second <= EarlierOff &&
- ILI->first >= int64_t(EarlierOff + EarlierSize)) {
- LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
- << EarlierOff << ", "
- << int64_t(EarlierOff + EarlierSize)
- << ") Composite Later [" << ILI->second << ", "
- << ILI->first << ")\n");
- ++NumCompletePartials;
- return OW_Complete;
- }
- }
-
- // Check for an earlier store which writes to all the memory locations that
- // the later store writes to.
- if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
- int64_t(EarlierOff + EarlierSize) > LaterOff &&
- uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) {
- LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
- << EarlierOff << ", "
- << int64_t(EarlierOff + EarlierSize)
- << ") by a later store [" << LaterOff << ", "
- << int64_t(LaterOff + LaterSize) << ")\n");
- // TODO: Maybe come up with a better name?
- return OW_PartialEarlierWithFullLater;
- }
-
- // Another interesting case is if the later store overwrites the end of the
- // earlier store.
- //
- // |--earlier--|
- // |-- later --|
- //
- // In this case we may want to trim the size of earlier to avoid generating
- // writes to addresses which will definitely be overwritten later
- if (!EnablePartialOverwriteTracking &&
- (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) &&
- int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize)))
- return OW_End;
-
- // Finally, we also need to check if the later store overwrites the beginning
- // of the earlier store.
- //
- // |--earlier--|
- // |-- later --|
- //
- // In this case we may want to move the destination address and trim the size
- // of earlier to avoid generating writes to addresses which will definitely
- // be overwritten later.
- if (!EnablePartialOverwriteTracking &&
- (LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) {
- assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) &&
- "Expect to be handled as OW_Complete");
- return OW_Begin;
- }
- // Otherwise, they don't completely overlap.
- return OW_Unknown;
-}
-
-/// If 'Inst' might be a self read (i.e. a noop copy of a
-/// memory region into an identical pointer) then it doesn't actually make its
-/// input dead in the traditional sense. Consider this case:
-///
-/// memmove(A <- B)
-/// memmove(A <- A)
-///
-/// In this case, the second store to A does not make the first store to A dead.
-/// The usual situation isn't an explicit A<-A store like this (which can be
-/// trivially removed) but a case where two pointers may alias.
-///
-/// This function detects when it is unsafe to remove a dependent instruction
-/// because the DSE inducing instruction may be a self-read.
-static bool isPossibleSelfRead(Instruction *Inst,
- const MemoryLocation &InstStoreLoc,
- Instruction *DepWrite,
- const TargetLibraryInfo &TLI,
- AliasAnalysis &AA) {
- // Self reads can only happen for instructions that read memory. Get the
- // location read.
- MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
- if (!InstReadLoc.Ptr)
- return false; // Not a reading instruction.
-
- // If the read and written loc obviously don't alias, it isn't a read.
- if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
- return false;
-
- if (isa<AnyMemCpyInst>(Inst)) {
- // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
- // but in practice memcpy(A <- B) either means that A and B are disjoint or
- // are equal (i.e. there are no partial overlaps). Given that, if we have:
- //
- // memcpy/memmove(A <- B) // DepWrite
- // memcpy(A <- B) // Inst
- //
- // with Inst reading/writing a size >= that of DepWrite, we can reason as
- // follows:
- //
- // - If A == B then both the copies are no-ops, so the DepWrite can be
- // removed.
- // - If A != B then A and B are disjoint locations in Inst. Since
- // Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
- // Therefore DepWrite can be removed.
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
-
- if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
- return false;
- }
-
- // If DepWrite doesn't read memory or if we can't prove it is a must alias,
- // then it can't be considered dead.
- return true;
-}
-
-/// Returns true if the memory which is accessed by the second instruction is not
-/// modified between the first and the second instruction.
-/// Precondition: Second instruction must be dominated by the first
-/// instruction.
+ // We may now overlap, although the overlap is not complete. There might also
+ // be other incomplete overlaps, and together, they might cover the complete
+ // earlier write.
+ // Note: The correctness of this logic depends on the fact that this function
+ // is never called with a DepWrite when there are any intervening reads.
+ if (EnablePartialOverwriteTracking &&
+ LaterOff < int64_t(EarlierOff + EarlierSize) &&
+ int64_t(LaterOff + LaterSize) >= EarlierOff) {
+
+ // Insert our part of the overlap into the map.
+ auto &IM = IOL[DepWrite];
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
+ << ", " << int64_t(EarlierOff + EarlierSize)
+ << ") Later [" << LaterOff << ", "
+ << int64_t(LaterOff + LaterSize) << ")\n");
+
+ // Make sure that we only insert non-overlapping intervals and combine
+ // adjacent intervals. The intervals are stored in the map with the ending
+ // offset as the key (in the half-open sense) and the starting offset as
+ // the value.
+ int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize;
+
+ // Find any intervals ending at, or after, LaterIntStart which start
+ // before LaterIntEnd.
+ auto ILI = IM.lower_bound(LaterIntStart);
+ if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ // This existing interval is overlapped with the current store somewhere
+ // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
+ // intervals and adjusting our start and end.
+ LaterIntStart = std::min(LaterIntStart, ILI->second);
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+
+ // Continue erasing and adjusting our end in case other previous
+ // intervals are also overlapped with the current store.
+ //
+ // |--- earlier 1 ---| |--- earlier 2 ---|
+ // |------- later---------|
+ //
+ while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ assert(ILI->second > LaterIntStart && "Unexpected interval");
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+ }
+ }
+
+ IM[LaterIntEnd] = LaterIntStart;
+
+ ILI = IM.begin();
+ if (ILI->second <= EarlierOff &&
+ ILI->first >= int64_t(EarlierOff + EarlierSize)) {
+ LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + EarlierSize)
+ << ") Composite Later [" << ILI->second << ", "
+ << ILI->first << ")\n");
+ ++NumCompletePartials;
+ return OW_Complete;
+ }
+ }
+
+ // Check for an earlier store which writes to all the memory locations that
+ // the later store writes to.
+ if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
+ int64_t(EarlierOff + EarlierSize) > LaterOff &&
+ uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) {
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + EarlierSize)
+ << ") by a later store [" << LaterOff << ", "
+ << int64_t(LaterOff + LaterSize) << ")\n");
+ // TODO: Maybe come up with a better name?
+ return OW_PartialEarlierWithFullLater;
+ }
+
+ // Another interesting case is if the later store overwrites the end of the
+ // earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to trim the size of earlier to avoid generating
+ // writes to addresses which will definitely be overwritten later
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) &&
+ int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize)))
+ return OW_End;
+
+ // Finally, we also need to check if the later store overwrites the beginning
+ // of the earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to move the destination address and trim the size
+ // of earlier to avoid generating writes to addresses which will definitely
+ // be overwritten later.
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) {
+ assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) &&
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
+ }
+ // Otherwise, they don't completely overlap.
+ return OW_Unknown;
+}
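
The interval bookkeeping above is compact but easy to misread, so here is a minimal standalone sketch of the same idea (a plain std::map rather than the OverlapIntervalsTy used in this file, with an invented driver): intervals are keyed by their half-open end offset with the start offset as the value, overlapping or touching intervals are merged on insert, and the earlier write is known to be fully covered once a single merged interval spans it. This is an illustrative reimplementation, not the code from this file.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>

// end offset -> start offset, mirroring the layout described in the comments.
using OverlapIntervals = std::map<int64_t, int64_t>;

static void insertInterval(OverlapIntervals &IM, int64_t Start, int64_t End) {
  // lower_bound(Start) finds every existing interval that ends at or after
  // Start; merge with each of those that also starts at or before End.
  auto It = IM.lower_bound(Start);
  while (It != IM.end() && It->second <= End) {
    Start = std::min(Start, It->second);
    End = std::max(End, It->first);
    It = IM.erase(It);
  }
  IM[End] = Start;
}

static bool coversEarlierWrite(const OverlapIntervals &IM, int64_t Off,
                               uint64_t Size) {
  // After merging, full coverage shows up as one interval spanning the write.
  return !IM.empty() && IM.begin()->second <= Off &&
         IM.begin()->first >= Off + (int64_t)Size;
}

int main() {
  OverlapIntervals IM;
  insertInterval(IM, 0, 4);                          // later store over [0, 4)
  insertInterval(IM, 4, 8);                          // later store over [4, 8)
  std::cout << coversEarlierWrite(IM, 0, 8) << '\n'; // 1: earlier [0, 8) is dead
  return 0;
}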
+
+/// If 'Inst' might be a self read (i.e. a noop copy of a
+/// memory region into an identical pointer) then it doesn't actually make its
+/// input dead in the traditional sense. Consider this case:
+///
+/// memmove(A <- B)
+/// memmove(A <- A)
+///
+/// In this case, the second store to A does not make the first store to A dead.
+/// The usual situation isn't an explicit A<-A store like this (which can be
+/// trivially removed) but a case where two pointers may alias.
+///
+/// This function detects when it is unsafe to remove a dependent instruction
+/// because the DSE inducing instruction may be a self-read.
+static bool isPossibleSelfRead(Instruction *Inst,
+ const MemoryLocation &InstStoreLoc,
+ Instruction *DepWrite,
+ const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA) {
+ // Self reads can only happen for instructions that read memory. Get the
+ // location read.
+ MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
+ if (!InstReadLoc.Ptr)
+ return false; // Not a reading instruction.
+
+ // If the read and written loc obviously don't alias, it isn't a read.
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
+ return false;
+
+ if (isa<AnyMemCpyInst>(Inst)) {
+ // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
+ // but in practice memcpy(A <- B) either means that A and B are disjoint or
+    // are equal (i.e. there are no partial overlaps). Given that, if we have:
+ //
+ // memcpy/memmove(A <- B) // DepWrite
+ // memcpy(A <- B) // Inst
+ //
+    // with Inst reading/writing a size >= DepWrite's, we can reason as
+ // follows:
+ //
+ // - If A == B then both the copies are no-ops, so the DepWrite can be
+ // removed.
+ // - If A != B then A and B are disjoint locations in Inst. Since
+ // Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
+ // Therefore DepWrite can be removed.
+ MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ return false;
+ }
+
+ // If DepWrite doesn't read memory or if we can't prove it is a must alias,
+ // then it can't be considered dead.
+ return true;
+}
+
+/// Returns true if the memory which is accessed by the second instruction is not
+/// modified between the first and the second instruction.
+/// Precondition: Second instruction must be dominated by the first
+/// instruction.
template <typename AATy>
static bool
memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI, AATy &AA,
const DataLayout &DL, DominatorTree *DT) {
- // Do a backwards scan through the CFG from SecondI to FirstI. Look for
- // instructions which can modify the memory location accessed by SecondI.
- //
- // While doing the walk keep track of the address to check. It might be
- // different in different basic blocks due to PHI translation.
- using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
- SmallVector<BlockAddressPair, 16> WorkList;
- // Keep track of the address we visited each block with. Bail out if we
- // visit a block with different addresses.
- DenseMap<BasicBlock *, Value *> Visited;
-
- BasicBlock::iterator FirstBBI(FirstI);
- ++FirstBBI;
- BasicBlock::iterator SecondBBI(SecondI);
- BasicBlock *FirstBB = FirstI->getParent();
- BasicBlock *SecondBB = SecondI->getParent();
- MemoryLocation MemLoc = MemoryLocation::get(SecondI);
- auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
-
- // Start checking the SecondBB.
- WorkList.push_back(
- std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
- bool isFirstBlock = true;
-
- // Check all blocks going backward until we reach the FirstBB.
- while (!WorkList.empty()) {
- BlockAddressPair Current = WorkList.pop_back_val();
- BasicBlock *B = Current.first;
- PHITransAddr &Addr = Current.second;
- Value *Ptr = Addr.getAddr();
-
- // Ignore instructions before FirstI if this is the FirstBB.
- BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
-
- BasicBlock::iterator EI;
- if (isFirstBlock) {
- // Ignore instructions after SecondI if this is the first visit of SecondBB.
- assert(B == SecondBB && "first block is not the store block");
- EI = SecondBBI;
- isFirstBlock = false;
- } else {
- // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
- // In this case we also have to look at instructions after SecondI.
- EI = B->end();
- }
- for (; BI != EI; ++BI) {
- Instruction *I = &*BI;
- if (I->mayWriteToMemory() && I != SecondI)
+ // Do a backwards scan through the CFG from SecondI to FirstI. Look for
+ // instructions which can modify the memory location accessed by SecondI.
+ //
+ // While doing the walk keep track of the address to check. It might be
+ // different in different basic blocks due to PHI translation.
+ using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
+ SmallVector<BlockAddressPair, 16> WorkList;
+ // Keep track of the address we visited each block with. Bail out if we
+ // visit a block with different addresses.
+ DenseMap<BasicBlock *, Value *> Visited;
+
+ BasicBlock::iterator FirstBBI(FirstI);
+ ++FirstBBI;
+ BasicBlock::iterator SecondBBI(SecondI);
+ BasicBlock *FirstBB = FirstI->getParent();
+ BasicBlock *SecondBB = SecondI->getParent();
+ MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+ auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
+
+ // Start checking the SecondBB.
+ WorkList.push_back(
+ std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
+ bool isFirstBlock = true;
+
+ // Check all blocks going backward until we reach the FirstBB.
+ while (!WorkList.empty()) {
+ BlockAddressPair Current = WorkList.pop_back_val();
+ BasicBlock *B = Current.first;
+ PHITransAddr &Addr = Current.second;
+ Value *Ptr = Addr.getAddr();
+
+ // Ignore instructions before FirstI if this is the FirstBB.
+ BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
+
+ BasicBlock::iterator EI;
+ if (isFirstBlock) {
+ // Ignore instructions after SecondI if this is the first visit of SecondBB.
+ assert(B == SecondBB && "first block is not the store block");
+ EI = SecondBBI;
+ isFirstBlock = false;
+ } else {
+ // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
+ // In this case we also have to look at instructions after SecondI.
+ EI = B->end();
+ }
+ for (; BI != EI; ++BI) {
+ Instruction *I = &*BI;
+ if (I->mayWriteToMemory() && I != SecondI)
if (isModSet(AA.getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
- return false;
- }
- if (B != FirstBB) {
- assert(B != &FirstBB->getParent()->getEntryBlock() &&
- "Should not hit the entry block because SI must be dominated by LI");
- for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
- PHITransAddr PredAddr = Addr;
- if (PredAddr.NeedsPHITranslationFromBlock(B)) {
- if (!PredAddr.IsPotentiallyPHITranslatable())
- return false;
- if (PredAddr.PHITranslateValue(B, *PredI, DT, false))
- return false;
- }
- Value *TranslatedPtr = PredAddr.getAddr();
- auto Inserted = Visited.insert(std::make_pair(*PredI, TranslatedPtr));
- if (!Inserted.second) {
- // We already visited this block before. If it was with a different
- // address - bail out!
- if (TranslatedPtr != Inserted.first->second)
- return false;
- // ... otherwise just skip it.
- continue;
- }
- WorkList.push_back(std::make_pair(*PredI, PredAddr));
- }
- }
- }
- return true;
-}
-
-/// Find all blocks that will unconditionally lead to the block BB and append
-/// them to F.
-static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
- BasicBlock *BB, DominatorTree *DT) {
- for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
- BasicBlock *Pred = *I;
- if (Pred == BB) continue;
- Instruction *PredTI = Pred->getTerminator();
- if (PredTI->getNumSuccessors() != 1)
- continue;
-
- if (DT->isReachableFromEntry(Pred))
- Blocks.push_back(Pred);
- }
-}
-
-/// Handle frees of entire structures whose dependency is a store
-/// to a field of that structure.
-static bool handleFree(CallInst *F, AliasAnalysis *AA,
- MemoryDependenceResults *MD, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst) {
- bool MadeChange = false;
-
+ return false;
+ }
+ if (B != FirstBB) {
+ assert(B != &FirstBB->getParent()->getEntryBlock() &&
+ "Should not hit the entry block because SI must be dominated by LI");
+ for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
+ PHITransAddr PredAddr = Addr;
+ if (PredAddr.NeedsPHITranslationFromBlock(B)) {
+ if (!PredAddr.IsPotentiallyPHITranslatable())
+ return false;
+ if (PredAddr.PHITranslateValue(B, *PredI, DT, false))
+ return false;
+ }
+ Value *TranslatedPtr = PredAddr.getAddr();
+ auto Inserted = Visited.insert(std::make_pair(*PredI, TranslatedPtr));
+ if (!Inserted.second) {
+ // We already visited this block before. If it was with a different
+ // address - bail out!
+ if (TranslatedPtr != Inserted.first->second)
+ return false;
+ // ... otherwise just skip it.
+ continue;
+ }
+ WorkList.push_back(std::make_pair(*PredI, PredAddr));
+ }
+ }
+ }
+ return true;
+}
+
+/// Find all blocks that will unconditionally lead to the block BB and append
+/// them to F.
+static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+ BasicBlock *BB, DominatorTree *DT) {
+ for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ BasicBlock *Pred = *I;
+ if (Pred == BB) continue;
+ Instruction *PredTI = Pred->getTerminator();
+ if (PredTI->getNumSuccessors() != 1)
+ continue;
+
+ if (DT->isReachableFromEntry(Pred))
+ Blocks.push_back(Pred);
+ }
+}
+
+/// Handle frees of entire structures whose dependency is a store
+/// to a field of that structure.
+static bool handleFree(CallInst *F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst) {
+ bool MadeChange = false;
+
MemoryLocation Loc = MemoryLocation::getAfter(F->getOperand(0));
- SmallVector<BasicBlock *, 16> Blocks;
- Blocks.push_back(F->getParent());
-
- while (!Blocks.empty()) {
- BasicBlock *BB = Blocks.pop_back_val();
- Instruction *InstPt = BB->getTerminator();
- if (BB == F->getParent()) InstPt = F;
-
- MemDepResult Dep =
- MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
- while (Dep.isDef() || Dep.isClobber()) {
- Instruction *Dependency = Dep.getInst();
- if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
- !isRemovable(Dependency))
- break;
-
- Value *DepPointer =
+ SmallVector<BasicBlock *, 16> Blocks;
+ Blocks.push_back(F->getParent());
+
+ while (!Blocks.empty()) {
+ BasicBlock *BB = Blocks.pop_back_val();
+ Instruction *InstPt = BB->getTerminator();
+ if (BB == F->getParent()) InstPt = F;
+
+ MemDepResult Dep =
+ MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
+ while (Dep.isDef() || Dep.isClobber()) {
+ Instruction *Dependency = Dep.getInst();
+ if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
+ !isRemovable(Dependency))
+ break;
+
+ Value *DepPointer =
getUnderlyingObject(getStoredPointerOperand(Dependency, *TLI));
-
- // Check for aliasing.
- if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
- break;
-
- LLVM_DEBUG(
- dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
- << *Dependency << '\n');
-
- // DCE instructions only used to calculate that store.
- BasicBlock::iterator BBI(Dependency);
- deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- ++NumFastStores;
- MadeChange = true;
-
- // Inst's old Dependency is now deleted. Compute the next dependency,
- // which may also be dead, as in
- // s[0] = 0;
- // s[1] = 0; // This has just been deleted.
- // free(s);
- Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
- }
-
- if (Dep.isNonLocal())
- findUnconditionalPreds(Blocks, BB, DT);
- }
-
- return MadeChange;
-}
-
-/// Check to see if the specified location may alias any of the stack objects in
-/// the DeadStackObjects set. If so, they become live because the location is
-/// being loaded.
-static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
- SmallSetVector<const Value *, 16> &DeadStackObjects,
- const DataLayout &DL, AliasAnalysis *AA,
- const TargetLibraryInfo *TLI,
- const Function *F) {
+
+ // Check for aliasing.
+ if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+ break;
+
+ LLVM_DEBUG(
+ dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
+
+ // DCE instructions only used to calculate that store.
+ BasicBlock::iterator BBI(Dependency);
+ deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // Inst's old Dependency is now deleted. Compute the next dependency,
+ // which may also be dead, as in
+ // s[0] = 0;
+ // s[1] = 0; // This has just been deleted.
+ // free(s);
+ Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
+ }
+
+ if (Dep.isNonLocal())
+ findUnconditionalPreds(Blocks, BB, DT);
+ }
+
+ return MadeChange;
+}
+
+/// Check to see if the specified location may alias any of the stack objects in
+/// the DeadStackObjects set. If so, they become live because the location is
+/// being loaded.
+static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
+ SmallSetVector<const Value *, 16> &DeadStackObjects,
+ const DataLayout &DL, AliasAnalysis *AA,
+ const TargetLibraryInfo *TLI,
+ const Function *F) {
const Value *UnderlyingPointer = getUnderlyingObject(LoadedLoc.Ptr);
-
- // A constant can't be in the dead pointer set.
- if (isa<Constant>(UnderlyingPointer))
- return;
-
- // If the kill pointer can be easily reduced to an alloca, don't bother doing
- // extraneous AA queries.
- if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
- DeadStackObjects.remove(UnderlyingPointer);
- return;
- }
-
- // Remove objects that could alias LoadedLoc.
- DeadStackObjects.remove_if([&](const Value *I) {
- // See if the loaded location could alias the stack location.
- MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
- return !AA->isNoAlias(StackLoc, LoadedLoc);
- });
-}
-
-/// Remove dead stores to stack-allocated locations in the function end block.
-/// Ex:
-/// %A = alloca i32
-/// ...
-/// store i32 1, i32* %A
-/// ret void
-static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
- MemoryDependenceResults *MD,
- const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst) {
- bool MadeChange = false;
-
- // Keep track of all of the stack objects that are dead at the end of the
- // function.
- SmallSetVector<const Value*, 16> DeadStackObjects;
-
- // Find all of the alloca'd pointers in the entry block.
- BasicBlock &Entry = BB.getParent()->front();
- for (Instruction &I : Entry) {
- if (isa<AllocaInst>(&I))
- DeadStackObjects.insert(&I);
-
- // Okay, so these are dead heap objects, but if the pointer never escapes
- // then it's leaked by this function anyways.
- else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
- DeadStackObjects.insert(&I);
- }
-
- // Treat byval or inalloca arguments the same, stores to them are dead at the
- // end of the function.
- for (Argument &AI : BB.getParent()->args())
+
+ // A constant can't be in the dead pointer set.
+ if (isa<Constant>(UnderlyingPointer))
+ return;
+
+ // If the kill pointer can be easily reduced to an alloca, don't bother doing
+ // extraneous AA queries.
+ if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
+ DeadStackObjects.remove(UnderlyingPointer);
+ return;
+ }
+
+ // Remove objects that could alias LoadedLoc.
+ DeadStackObjects.remove_if([&](const Value *I) {
+ // See if the loaded location could alias the stack location.
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
+ return !AA->isNoAlias(StackLoc, LoadedLoc);
+ });
+}
+
+/// Remove dead stores to stack-allocated locations in the function end block.
+/// Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst) {
+ bool MadeChange = false;
+
+ // Keep track of all of the stack objects that are dead at the end of the
+ // function.
+ SmallSetVector<const Value*, 16> DeadStackObjects;
+
+ // Find all of the alloca'd pointers in the entry block.
+ BasicBlock &Entry = BB.getParent()->front();
+ for (Instruction &I : Entry) {
+ if (isa<AllocaInst>(&I))
+ DeadStackObjects.insert(&I);
+
+ // Okay, so these are dead heap objects, but if the pointer never escapes
+ // then it's leaked by this function anyways.
+ else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
+ DeadStackObjects.insert(&I);
+ }
+
+ // Treat byval or inalloca arguments the same, stores to them are dead at the
+ // end of the function.
+ for (Argument &AI : BB.getParent()->args())
if (AI.hasPassPointeeByValueCopyAttr())
- DeadStackObjects.insert(&AI);
-
- const DataLayout &DL = BB.getModule()->getDataLayout();
-
- // Scan the basic block backwards
- for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
- --BBI;
-
- // If we find a store, check to see if it points into a dead stack value.
- if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
- // See through pointer-to-pointer bitcasts
- SmallVector<const Value *, 4> Pointers;
+ DeadStackObjects.insert(&AI);
+
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+
+ // Scan the basic block backwards
+ for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+ --BBI;
+
+ // If we find a store, check to see if it points into a dead stack value.
+ if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ // See through pointer-to-pointer bitcasts
+ SmallVector<const Value *, 4> Pointers;
getUnderlyingObjects(getStoredPointerOperand(&*BBI, *TLI), Pointers);
-
- // Stores to stack values are valid candidates for removal.
- bool AllDead = true;
- for (const Value *Pointer : Pointers)
- if (!DeadStackObjects.count(Pointer)) {
- AllDead = false;
- break;
- }
-
- if (AllDead) {
- Instruction *Dead = &*BBI;
-
- LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
- << *Dead << "\n Objects: ";
- for (SmallVectorImpl<const Value *>::iterator I =
- Pointers.begin(),
- E = Pointers.end();
- I != E; ++I) {
- dbgs() << **I;
- if (std::next(I) != E)
- dbgs() << ", ";
- } dbgs()
- << '\n');
-
- // DCE instructions only used to calculate that store.
- deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, ThrowableInst,
- &DeadStackObjects);
- ++NumFastStores;
- MadeChange = true;
- continue;
- }
- }
-
- // Remove any dead non-memory-mutating instructions.
- if (isInstructionTriviallyDead(&*BBI, TLI)) {
- LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
- << *&*BBI << '\n');
- deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, ThrowableInst,
- &DeadStackObjects);
- ++NumFastOther;
- MadeChange = true;
- continue;
- }
-
- if (isa<AllocaInst>(BBI)) {
- // Remove allocas from the list of dead stack objects; there can't be
- // any references before the definition.
- DeadStackObjects.remove(&*BBI);
- continue;
- }
-
- if (auto *Call = dyn_cast<CallBase>(&*BBI)) {
- // Remove allocation function calls from the list of dead stack objects;
- // there can't be any references before the definition.
- if (isAllocLikeFn(&*BBI, TLI))
- DeadStackObjects.remove(&*BBI);
-
- // If this call does not access memory, it can't be loading any of our
- // pointers.
- if (AA->doesNotAccessMemory(Call))
- continue;
-
- // If the call might load from any of our allocas, then any store above
- // the call is live.
- DeadStackObjects.remove_if([&](const Value *I) {
- // See if the call site touches the value.
- return isRefSet(AA->getModRefInfo(
- Call, I, getPointerSize(I, DL, *TLI, BB.getParent())));
- });
-
- // If all of the allocas were clobbered by the call then we're not going
- // to find anything else to process.
- if (DeadStackObjects.empty())
- break;
-
- continue;
- }
-
- // We can remove the dead stores, irrespective of the fence and its ordering
- // (release/acquire/seq_cst). Fences only constraints the ordering of
- // already visible stores, it does not make a store visible to other
- // threads. So, skipping over a fence does not change a store from being
- // dead.
- if (isa<FenceInst>(*BBI))
- continue;
-
- MemoryLocation LoadedLoc;
-
- // If we encounter a use of the pointer, it is no longer considered dead
- if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
- if (!L->isUnordered()) // Be conservative with atomic/volatile load
- break;
- LoadedLoc = MemoryLocation::get(L);
- } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
- LoadedLoc = MemoryLocation::get(V);
- } else if (!BBI->mayReadFromMemory()) {
- // Instruction doesn't read memory. Note that stores that weren't removed
- // above will hit this case.
- continue;
- } else {
- // Unknown inst; assume it clobbers everything.
- break;
- }
-
- // Remove any allocas from the DeadPointer set that are loaded, as this
- // makes any stores above the access live.
- removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
-
- // If all of the allocas were clobbered by the access then we're not going
- // to find anything else to process.
- if (DeadStackObjects.empty())
- break;
- }
-
- return MadeChange;
-}
-
-static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
+
+ // Stores to stack values are valid candidates for removal.
+ bool AllDead = true;
+ for (const Value *Pointer : Pointers)
+ if (!DeadStackObjects.count(Pointer)) {
+ AllDead = false;
+ break;
+ }
+
+ if (AllDead) {
+ Instruction *Dead = &*BBI;
+
+ LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<const Value *>::iterator I =
+ Pointers.begin(),
+ E = Pointers.end();
+ I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ } dbgs()
+ << '\n');
+
+ // DCE instructions only used to calculate that store.
+ deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, ThrowableInst,
+ &DeadStackObjects);
+ ++NumFastStores;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // Remove any dead non-memory-mutating instructions.
+ if (isInstructionTriviallyDead(&*BBI, TLI)) {
+ LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
+ deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, ThrowableInst,
+ &DeadStackObjects);
+ ++NumFastOther;
+ MadeChange = true;
+ continue;
+ }
+
+ if (isa<AllocaInst>(BBI)) {
+ // Remove allocas from the list of dead stack objects; there can't be
+ // any references before the definition.
+ DeadStackObjects.remove(&*BBI);
+ continue;
+ }
+
+ if (auto *Call = dyn_cast<CallBase>(&*BBI)) {
+ // Remove allocation function calls from the list of dead stack objects;
+ // there can't be any references before the definition.
+ if (isAllocLikeFn(&*BBI, TLI))
+ DeadStackObjects.remove(&*BBI);
+
+ // If this call does not access memory, it can't be loading any of our
+ // pointers.
+ if (AA->doesNotAccessMemory(Call))
+ continue;
+
+ // If the call might load from any of our allocas, then any store above
+ // the call is live.
+ DeadStackObjects.remove_if([&](const Value *I) {
+ // See if the call site touches the value.
+ return isRefSet(AA->getModRefInfo(
+ Call, I, getPointerSize(I, DL, *TLI, BB.getParent())));
+ });
+
+ // If all of the allocas were clobbered by the call then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+
+ continue;
+ }
+
+ // We can remove the dead stores, irrespective of the fence and its ordering
+    // (release/acquire/seq_cst). Fences only constrain the ordering of
+    // already visible stores; they do not make a store visible to other
+ // threads. So, skipping over a fence does not change a store from being
+ // dead.
+ if (isa<FenceInst>(*BBI))
+ continue;
+
+ MemoryLocation LoadedLoc;
+
+ // If we encounter a use of the pointer, it is no longer considered dead
+ if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+ if (!L->isUnordered()) // Be conservative with atomic/volatile load
+ break;
+ LoadedLoc = MemoryLocation::get(L);
+ } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
+ LoadedLoc = MemoryLocation::get(V);
+ } else if (!BBI->mayReadFromMemory()) {
+ // Instruction doesn't read memory. Note that stores that weren't removed
+ // above will hit this case.
+ continue;
+ } else {
+ // Unknown inst; assume it clobbers everything.
+ break;
+ }
+
+ // Remove any allocas from the DeadPointer set that are loaded, as this
+ // makes any stores above the access live.
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
+
+ // If all of the allocas were clobbered by the access then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+ }
+
+ return MadeChange;
+}
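
A rough, self-contained illustration of the backward scan above (toy types and a single block; exact locations only, no calls, fences, or aliasing, all of which the real code must handle): a store into a still-dead stack object is dropped, and a read of the object makes earlier stores to it live again.

#include <cstdio>
#include <set>
#include <vector>

struct Access { bool IsStore; int Object; bool Dead = false; };

// DeadObjects is taken by value: the set starts as "dead at end of function"
// and shrinks as reads are encountered while walking backwards.
static void endBlockCleanup(std::vector<Access> &Block,
                            std::set<int> DeadObjects) {
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (It->IsStore && DeadObjects.count(It->Object))
      It->Dead = true;                // nothing after this point reads it
    else if (!It->IsStore)
      DeadObjects.erase(It->Object);  // a read keeps earlier stores alive
  }
}

int main() {
  // store %A; load %A; store %A; ret   -> only the final store to %A is dead.
  std::vector<Access> B{{true, 0}, {false, 0}, {true, 0}};
  endBlockCleanup(B, {0});
  std::printf("%d %d %d\n", B[0].Dead, B[1].Dead, B[2].Dead); // prints: 0 0 1
  return 0;
}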
+
+static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
uint64_t &EarlierSize, int64_t LaterOffset,
uint64_t LaterSize, bool IsOverwriteEnd) {
- // TODO: base this on the target vector size so that if the earlier
- // store was too small to get vector writes anyway then its likely
- // a good idea to shorten it
- // Power of 2 vector writes are probably always a bad idea to optimize
- // as any store/memset/memcpy is likely using vector instructions so
- // shortening it to not vector size is likely to be slower
- auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
- unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
- if (!IsOverwriteEnd)
- LaterOffset = int64_t(LaterOffset + LaterSize);
-
- if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
- !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
- return false;
-
- int64_t NewLength = IsOverwriteEnd
- ? LaterOffset - EarlierOffset
- : EarlierSize - (LaterOffset - EarlierOffset);
-
- if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
- // When shortening an atomic memory intrinsic, the newly shortened
- // length must remain an integer multiple of the element size.
- const uint32_t ElementSize = AMI->getElementSizeInBytes();
- if (0 != NewLength % ElementSize)
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
- << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
- << *EarlierWrite << "\n KILLER (offset " << LaterOffset
- << ", " << EarlierSize << ")\n");
-
- Value *EarlierWriteLength = EarlierIntrinsic->getLength();
- Value *TrimmedLength =
- ConstantInt::get(EarlierWriteLength->getType(), NewLength);
- EarlierIntrinsic->setLength(TrimmedLength);
-
- EarlierSize = NewLength;
- if (!IsOverwriteEnd) {
- int64_t OffsetMoved = (LaterOffset - EarlierOffset);
- Value *Indices[1] = {
- ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
- GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
- EarlierIntrinsic->getRawDest()->getType()->getPointerElementType(),
- EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
- NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
- EarlierIntrinsic->setDest(NewDestGEP);
- EarlierOffset = EarlierOffset + OffsetMoved;
- }
- return true;
-}
-
-static bool tryToShortenEnd(Instruction *EarlierWrite,
- OverlapIntervalsTy &IntervalMap,
+  // TODO: base this on the target vector size so that if the earlier
+  // store was too small to get vector writes anyway then it's likely
+  // a good idea to shorten it.
+  // Power-of-2 vector writes are probably always a bad idea to optimize,
+  // as any store/memset/memcpy is likely using vector instructions, so
+  // shortening it to a non-vector size is likely to be slower.
+ auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
+ if (!IsOverwriteEnd)
+ LaterOffset = int64_t(LaterOffset + LaterSize);
+
+ if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
+ !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
+ return false;
+
+ int64_t NewLength = IsOverwriteEnd
+ ? LaterOffset - EarlierOffset
+ : EarlierSize - (LaterOffset - EarlierOffset);
+
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+ // When shortening an atomic memory intrinsic, the newly shortened
+ // length must remain an integer multiple of the element size.
+ const uint32_t ElementSize = AMI->getElementSizeInBytes();
+ if (0 != NewLength % ElementSize)
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
+ << *EarlierWrite << "\n KILLER (offset " << LaterOffset
+ << ", " << EarlierSize << ")\n");
+
+ Value *EarlierWriteLength = EarlierIntrinsic->getLength();
+ Value *TrimmedLength =
+ ConstantInt::get(EarlierWriteLength->getType(), NewLength);
+ EarlierIntrinsic->setLength(TrimmedLength);
+
+ EarlierSize = NewLength;
+ if (!IsOverwriteEnd) {
+ int64_t OffsetMoved = (LaterOffset - EarlierOffset);
+ Value *Indices[1] = {
+ ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
+ GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
+ EarlierIntrinsic->getRawDest()->getType()->getPointerElementType(),
+ EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
+ NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
+ EarlierIntrinsic->setDest(NewDestGEP);
+ EarlierOffset = EarlierOffset + OffsetMoved;
+ }
+ return true;
+}
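
A worked example of the trimming arithmetic above, with made-up offsets and sizes: an earlier 32-byte write whose last 16 bytes are overwritten keeps only its 16-byte prefix (the OW_End case), while an 8-byte overwrite of its start would instead move the start forward and leave 24 bytes (the OW_Begin case).

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t EarlierOff = 0;
  const uint64_t EarlierSize = 32;              // earlier write covers [0, 32)

  // OW_End: later write covers [16, 32); keep the prefix [0, 16).
  const int64_t LaterOffEnd = 16;
  const int64_t NewLengthEnd = LaterOffEnd - EarlierOff;                   // 16

  // OW_Begin: later write covers [0, 8); drop the prefix, so the earlier
  // write now starts at offset 8 and is 24 bytes long.
  const int64_t LaterEndBegin = 0 + 8;          // LaterOff + LaterSize
  const int64_t NewLengthBegin =
      (int64_t)EarlierSize - (LaterEndBegin - EarlierOff);                 // 24
  const int64_t NewEarlierOff = EarlierOff + (LaterEndBegin - EarlierOff); // 8

  assert(NewLengthEnd == 16 && NewLengthBegin == 24 && NewEarlierOff == 8);
  std::printf("%lld %lld %lld\n", (long long)NewLengthEnd,
              (long long)NewLengthBegin, (long long)NewEarlierOff);
  return 0;
}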
+
+static bool tryToShortenEnd(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
int64_t &EarlierStart, uint64_t &EarlierSize) {
- if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
- return false;
-
- OverlapIntervalsTy::iterator OII = --IntervalMap.end();
- int64_t LaterStart = OII->second;
+ if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = --IntervalMap.end();
+ int64_t LaterStart = OII->second;
uint64_t LaterSize = OII->first - LaterStart;
-
+
assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
if (LaterStart > EarlierStart &&
@@ -1142,25 +1142,25 @@ static bool tryToShortenEnd(Instruction *EarlierWrite,
// Note: "EarlierSize - (uint64_t)(LaterStart - EarlierStart)" is known to
// be non negative due to preceding checks.
LaterSize >= EarlierSize - (uint64_t)(LaterStart - EarlierStart)) {
- if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
- LaterSize, true)) {
- IntervalMap.erase(OII);
- return true;
- }
- }
- return false;
-}
-
-static bool tryToShortenBegin(Instruction *EarlierWrite,
- OverlapIntervalsTy &IntervalMap,
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, true)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool tryToShortenBegin(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
int64_t &EarlierStart, uint64_t &EarlierSize) {
- if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
- return false;
-
- OverlapIntervalsTy::iterator OII = IntervalMap.begin();
- int64_t LaterStart = OII->second;
+ if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+ int64_t LaterStart = OII->second;
uint64_t LaterSize = OII->first - LaterStart;
-
+
assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
if (LaterStart <= EarlierStart &&
@@ -1170,433 +1170,433 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
// Note: "LaterSize - (uint64_t)(EarlierStart - LaterStart)" is known to be
// positive due to preceding checks.
assert(LaterSize - (uint64_t)(EarlierStart - LaterStart) < EarlierSize &&
- "Should have been handled as OW_Complete");
- if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
- LaterSize, false)) {
- IntervalMap.erase(OII);
- return true;
- }
- }
- return false;
-}
-
+ "Should have been handled as OW_Complete");
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, false)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
static bool removePartiallyOverlappedStores(const DataLayout &DL,
InstOverlapIntervalsTy &IOL,
const TargetLibraryInfo &TLI) {
- bool Changed = false;
- for (auto OI : IOL) {
- Instruction *EarlierWrite = OI.first;
+ bool Changed = false;
+ for (auto OI : IOL) {
+ Instruction *EarlierWrite = OI.first;
MemoryLocation Loc = getLocForWrite(EarlierWrite, TLI);
- assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
-
- const Value *Ptr = Loc.Ptr->stripPointerCasts();
- int64_t EarlierStart = 0;
+ assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
+
+ const Value *Ptr = Loc.Ptr->stripPointerCasts();
+ int64_t EarlierStart = 0;
uint64_t EarlierSize = Loc.Size.getValue();
- GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
- OverlapIntervalsTy &IntervalMap = OI.second;
- Changed |=
- tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
- if (IntervalMap.empty())
- continue;
- Changed |=
- tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
- }
- return Changed;
-}
-
-static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
- AliasAnalysis *AA, MemoryDependenceResults *MD,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst,
- DominatorTree *DT) {
- // Must be a store instruction.
- StoreInst *SI = dyn_cast<StoreInst>(Inst);
- if (!SI)
- return false;
-
- // If we're storing the same value back to a pointer that we just loaded from,
- // then the store can be removed.
- if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
- if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
- isRemovable(SI) &&
+ GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
+ OverlapIntervalsTy &IntervalMap = OI.second;
+ Changed |=
+ tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ if (IntervalMap.empty())
+ continue;
+ Changed |=
+ tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ }
+ return Changed;
+}
+
+static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
+ AliasAnalysis *AA, MemoryDependenceResults *MD,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst,
+ DominatorTree *DT) {
+ // Must be a store instruction.
+ StoreInst *SI = dyn_cast<StoreInst>(Inst);
+ if (!SI)
+ return false;
+
+ // If we're storing the same value back to a pointer that we just loaded from,
+ // then the store can be removed.
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ isRemovable(SI) &&
memoryIsNotModifiedBetween(DepLoad, SI, *AA, DL, DT)) {
-
- LLVM_DEBUG(
- dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
- << *DepLoad << "\n STORE: " << *SI << '\n');
-
- deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
- ++NumRedundantStores;
- return true;
- }
- }
-
- // Remove null stores into the calloc'ed objects
- Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
- if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
- Instruction *UnderlyingPointer =
+
+ LLVM_DEBUG(
+ dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+
+ // Remove null stores into the calloc'ed objects
+ Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
+ if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
+ Instruction *UnderlyingPointer =
dyn_cast<Instruction>(getUnderlyingObject(SI->getPointerOperand()));
-
- if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
+
+ if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
memoryIsNotModifiedBetween(UnderlyingPointer, SI, *AA, DL, DT)) {
- LLVM_DEBUG(
- dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
- << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
-
- deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
- ++NumRedundantStores;
- return true;
- }
- }
- return false;
-}
-
+ LLVM_DEBUG(
+ dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
+ << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+ return false;
+}
+
template <typename AATy>
static Constant *tryToMergePartialOverlappingStores(
StoreInst *Earlier, StoreInst *Later, int64_t InstWriteOffset,
int64_t DepWriteOffset, const DataLayout &DL, AATy &AA, DominatorTree *DT) {
-
- if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
- Later && isa<ConstantInt>(Later->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) &&
- memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) {
- // If the store we find is:
- // a) partially overwritten by the store to 'Loc'
- // b) the later store is fully contained in the earlier one and
- // c) they both have a constant value
- // d) none of the two stores need padding
- // Merge the two stores, replacing the earlier store's value with a
- // merge of both values.
- // TODO: Deal with other constant types (vectors, etc), and probably
- // some mem intrinsics (if needed)
-
- APInt EarlierValue =
- cast<ConstantInt>(Earlier->getValueOperand())->getValue();
- APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue();
- unsigned LaterBits = LaterValue.getBitWidth();
- assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
- LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
-
- // Offset of the smaller store inside the larger store
- unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
- unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() -
- BitOffsetDiff - LaterBits
- : BitOffsetDiff;
- APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
- LShiftAmount + LaterBits);
- // Clear the bits we'll be replacing, then OR with the smaller
- // store, shifted appropriately.
- APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
- LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
- << "\n Later: " << *Later
- << "\n Merged Value: " << Merged << '\n');
- return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
- }
- return nullptr;
-}
-
-static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
- MemoryDependenceResults *MD, DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- const DataLayout &DL = BB.getModule()->getDataLayout();
- bool MadeChange = false;
-
- MapVector<Instruction *, bool> ThrowableInst;
-
- // A map of interval maps representing partially-overwritten value parts.
- InstOverlapIntervalsTy IOL;
-
- // Do a top-down walk on the BB.
- for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
- // Handle 'free' calls specially.
- if (CallInst *F = isFreeCall(&*BBI, TLI)) {
- MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, ThrowableInst);
- // Increment BBI after handleFree has potentially deleted instructions.
- // This ensures we maintain a valid iterator.
- ++BBI;
- continue;
- }
-
- Instruction *Inst = &*BBI++;
-
- if (Inst->mayThrow()) {
- ThrowableInst[Inst] = true;
- continue;
- }
-
- // Check to see if Inst writes to memory. If not, continue.
- if (!hasAnalyzableMemoryWrite(Inst, *TLI))
- continue;
-
- // eliminateNoopStore will update in iterator, if necessary.
- if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL,
- ThrowableInst, DT)) {
- MadeChange = true;
- continue;
- }
-
- // If we find something that writes memory, get its memory dependence.
- MemDepResult InstDep = MD->getDependency(Inst);
-
- // Ignore any store where we can't find a local dependence.
- // FIXME: cross-block DSE would be fun. :)
- if (!InstDep.isDef() && !InstDep.isClobber())
- continue;
-
- // Figure out what location is being stored to.
+
+ if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
+ Later && isa<ConstantInt>(Later->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) &&
+ memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) {
+ // If the store we find is:
+ // a) partially overwritten by the store to 'Loc'
+ // b) the later store is fully contained in the earlier one and
+ // c) they both have a constant value
+    //   d) neither of the two stores needs padding
+ // Merge the two stores, replacing the earlier store's value with a
+ // merge of both values.
+ // TODO: Deal with other constant types (vectors, etc), and probably
+ // some mem intrinsics (if needed)
+
+ APInt EarlierValue =
+ cast<ConstantInt>(Earlier->getValueOperand())->getValue();
+ APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue();
+ unsigned LaterBits = LaterValue.getBitWidth();
+ assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
+ LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
+
+ // Offset of the smaller store inside the larger store
+ unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
+ unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() -
+ BitOffsetDiff - LaterBits
+ : BitOffsetDiff;
+ APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
+ LShiftAmount + LaterBits);
+ // Clear the bits we'll be replacing, then OR with the smaller
+ // store, shifted appropriately.
+ APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
+ LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
+ << "\n Later: " << *Later
+ << "\n Merged Value: " << Merged << '\n');
+ return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
+ }
+ return nullptr;
+}
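
The masking arithmetic used above can be checked in isolation with plain integers (little-endian layout and invented values assumed; the real code uses APInt and also handles the big-endian shift): a one-byte later store at byte offset 1 is folded into an earlier four-byte constant store.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t EarlierValue = 0xAABBCCDD; // earlier 4-byte store at offset 0
  const uint8_t LaterValue = 0x11;          // later 1-byte store at offset 1
  const unsigned LaterBits = 8;
  const unsigned BitOffsetDiff = (1 - 0) * 8;  // byte offset difference * 8
  const unsigned LShiftAmount = BitOffsetDiff; // little-endian case

  // Clear the bits being replaced, then OR in the smaller store, shifted.
  const uint32_t Mask = ((1u << LaterBits) - 1) << LShiftAmount;
  const uint32_t Merged =
      (EarlierValue & ~Mask) | ((uint32_t)LaterValue << LShiftAmount);
  std::printf("%#x\n", Merged);                // prints: 0xaabb11dd
  return 0;
}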
+
+static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+ bool MadeChange = false;
+
+ MapVector<Instruction *, bool> ThrowableInst;
+
+ // A map of interval maps representing partially-overwritten value parts.
+ InstOverlapIntervalsTy IOL;
+
+ // Do a top-down walk on the BB.
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+ // Handle 'free' calls specially.
+ if (CallInst *F = isFreeCall(&*BBI, TLI)) {
+ MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, ThrowableInst);
+ // Increment BBI after handleFree has potentially deleted instructions.
+ // This ensures we maintain a valid iterator.
+ ++BBI;
+ continue;
+ }
+
+ Instruction *Inst = &*BBI++;
+
+ if (Inst->mayThrow()) {
+ ThrowableInst[Inst] = true;
+ continue;
+ }
+
+ // Check to see if Inst writes to memory. If not, continue.
+ if (!hasAnalyzableMemoryWrite(Inst, *TLI))
+ continue;
+
+    // eliminateNoopStore will update the iterator, if necessary.
+ if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL,
+ ThrowableInst, DT)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // If we find something that writes memory, get its memory dependence.
+ MemDepResult InstDep = MD->getDependency(Inst);
+
+ // Ignore any store where we can't find a local dependence.
+ // FIXME: cross-block DSE would be fun. :)
+ if (!InstDep.isDef() && !InstDep.isClobber())
+ continue;
+
+ // Figure out what location is being stored to.
MemoryLocation Loc = getLocForWrite(Inst, *TLI);
-
- // If we didn't get a useful location, fail.
- if (!Loc.Ptr)
- continue;
-
- // Loop until we find a store we can eliminate or a load that
- // invalidates the analysis. Without an upper bound on the number of
- // instructions examined, this analysis can become very time-consuming.
- // However, the potential gain diminishes as we process more instructions
- // without eliminating any of them. Therefore, we limit the number of
- // instructions we look at.
- auto Limit = MD->getDefaultBlockScanLimit();
- while (InstDep.isDef() || InstDep.isClobber()) {
- // Get the memory clobbered by the instruction we depend on. MemDep will
- // skip any instructions that 'Loc' clearly doesn't interact with. If we
- // end up depending on a may- or must-aliased load, then we can't optimize
- // away the store and we bail out. However, if we depend on something
- // that overwrites the memory location we *can* potentially optimize it.
- //
- // Find out what memory location the dependent instruction stores.
- Instruction *DepWrite = InstDep.getInst();
- if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
- break;
+
+ // If we didn't get a useful location, fail.
+ if (!Loc.Ptr)
+ continue;
+
+ // Loop until we find a store we can eliminate or a load that
+ // invalidates the analysis. Without an upper bound on the number of
+ // instructions examined, this analysis can become very time-consuming.
+ // However, the potential gain diminishes as we process more instructions
+ // without eliminating any of them. Therefore, we limit the number of
+ // instructions we look at.
+ auto Limit = MD->getDefaultBlockScanLimit();
+ while (InstDep.isDef() || InstDep.isClobber()) {
+ // Get the memory clobbered by the instruction we depend on. MemDep will
+ // skip any instructions that 'Loc' clearly doesn't interact with. If we
+ // end up depending on a may- or must-aliased load, then we can't optimize
+ // away the store and we bail out. However, if we depend on something
+ // that overwrites the memory location we *can* potentially optimize it.
+ //
+ // Find out what memory location the dependent instruction stores.
+ Instruction *DepWrite = InstDep.getInst();
+ if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
+ break;
MemoryLocation DepLoc = getLocForWrite(DepWrite, *TLI);
- // If we didn't get a useful location, or if it isn't a size, bail out.
- if (!DepLoc.Ptr)
- break;
-
- // Find the last throwable instruction not removed by call to
- // deleteDeadInstruction.
- Instruction *LastThrowing = nullptr;
- if (!ThrowableInst.empty())
- LastThrowing = ThrowableInst.back().first;
-
- // Make sure we don't look past a call which might throw. This is an
- // issue because MemoryDependenceAnalysis works in the wrong direction:
- // it finds instructions which dominate the current instruction, rather than
- // instructions which are post-dominated by the current instruction.
- //
- // If the underlying object is a non-escaping memory allocation, any store
- // to it is dead along the unwind edge. Otherwise, we need to preserve
- // the store.
- if (LastThrowing && DepWrite->comesBefore(LastThrowing)) {
+ // If we didn't get a useful location, or if it isn't a size, bail out.
+ if (!DepLoc.Ptr)
+ break;
+
+ // Find the last throwable instruction not removed by call to
+ // deleteDeadInstruction.
+ Instruction *LastThrowing = nullptr;
+ if (!ThrowableInst.empty())
+ LastThrowing = ThrowableInst.back().first;
+
+ // Make sure we don't look past a call which might throw. This is an
+ // issue because MemoryDependenceAnalysis works in the wrong direction:
+ // it finds instructions which dominate the current instruction, rather than
+ // instructions which are post-dominated by the current instruction.
+ //
+ // If the underlying object is a non-escaping memory allocation, any store
+ // to it is dead along the unwind edge. Otherwise, we need to preserve
+ // the store.
+ if (LastThrowing && DepWrite->comesBefore(LastThrowing)) {
const Value *Underlying = getUnderlyingObject(DepLoc.Ptr);
- bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
- if (!IsStoreDeadOnUnwind) {
- // We're looking for a call to an allocation function
- // where the allocation doesn't escape before the last
- // throwing instruction; PointerMayBeCaptured
- // reasonably fast approximation.
- IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
- !PointerMayBeCaptured(Underlying, false, true);
- }
- if (!IsStoreDeadOnUnwind)
- break;
- }
-
- // If we find a write that is a) removable (i.e., non-volatile), b) is
- // completely obliterated by the store to 'Loc', and c) which we know that
- // 'Inst' doesn't load from, then we can remove it.
- // Also try to merge two stores if a later one only touches memory written
- // to by the earlier one.
- if (isRemovable(DepWrite) &&
- !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
- int64_t InstWriteOffset, DepWriteOffset;
+ bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
+ if (!IsStoreDeadOnUnwind) {
+ // We're looking for a call to an allocation function
+ // where the allocation doesn't escape before the last
+          // throwing instruction; PointerMayBeCaptured is a
+          // reasonably fast approximation.
+ IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
+ !PointerMayBeCaptured(Underlying, false, true);
+ }
+ if (!IsStoreDeadOnUnwind)
+ break;
+ }
+
+ // If we find a write that is a) removable (i.e., non-volatile), b) is
+ // completely obliterated by the store to 'Loc', and c) which we know that
+ // 'Inst' doesn't load from, then we can remove it.
+ // Also try to merge two stores if a later one only touches memory written
+ // to by the earlier one.
+ if (isRemovable(DepWrite) &&
+ !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+ int64_t InstWriteOffset, DepWriteOffset;
OverwriteResult OR = isOverwrite(Inst, DepWrite, Loc, DepLoc, DL, *TLI,
DepWriteOffset, InstWriteOffset, *AA,
- BB.getParent());
+ BB.getParent());
if (OR == OW_MaybePartial)
OR = isPartialOverwrite(Loc, DepLoc, DepWriteOffset, InstWriteOffset,
DepWrite, IOL);
- if (OR == OW_Complete) {
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
- << "\n KILLER: " << *Inst << '\n');
-
- // Delete the store and now-dead instructions that feed it.
- deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- ++NumFastStores;
- MadeChange = true;
-
- // We erased DepWrite; start over.
- InstDep = MD->getDependency(Inst);
- continue;
- } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
- ((OR == OW_Begin &&
- isShortenableAtTheBeginning(DepWrite)))) {
- assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
- "when partial-overwrite "
- "tracking is enabled");
- // The overwrite result is known, so these must be known, too.
+ if (OR == OW_Complete) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
+ << "\n KILLER: " << *Inst << '\n');
+
+ // Delete the store and now-dead instructions that feed it.
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // We erased DepWrite; start over.
+ InstDep = MD->getDependency(Inst);
+ continue;
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
+ isShortenableAtTheBeginning(DepWrite)))) {
+ assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
+ "when partial-overwrite "
+ "tracking is enabled");
+ // The overwrite result is known, so these must be known, too.
uint64_t EarlierSize = DepLoc.Size.getValue();
uint64_t LaterSize = Loc.Size.getValue();
- bool IsOverwriteEnd = (OR == OW_End);
- MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
- InstWriteOffset, LaterSize, IsOverwriteEnd);
- } else if (EnablePartialStoreMerging &&
- OR == OW_PartialEarlierWithFullLater) {
- auto *Earlier = dyn_cast<StoreInst>(DepWrite);
- auto *Later = dyn_cast<StoreInst>(Inst);
- if (Constant *C = tryToMergePartialOverlappingStores(
+ bool IsOverwriteEnd = (OR == OW_End);
+ MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
+ InstWriteOffset, LaterSize, IsOverwriteEnd);
+ } else if (EnablePartialStoreMerging &&
+ OR == OW_PartialEarlierWithFullLater) {
+ auto *Earlier = dyn_cast<StoreInst>(DepWrite);
+ auto *Later = dyn_cast<StoreInst>(Inst);
+ if (Constant *C = tryToMergePartialOverlappingStores(
Earlier, Later, InstWriteOffset, DepWriteOffset, DL, *AA,
- DT)) {
- auto *SI = new StoreInst(
- C, Earlier->getPointerOperand(), false, Earlier->getAlign(),
- Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
-
- unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
- LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_nontemporal};
- SI->copyMetadata(*DepWrite, MDToKeep);
- ++NumModifiedStores;
-
- // Delete the old stores and now-dead instructions that feed them.
- deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- MadeChange = true;
-
- // We erased DepWrite and Inst (Loc); start over.
- break;
- }
- }
- }
-
- // If this is a may-aliased store that is clobbering the store value, we
- // can keep searching past it for another must-aliased pointer that stores
- // to the same location. For example, in:
- // store -> P
- // store -> Q
- // store -> P
- // we can remove the first store to P even though we don't know if P and Q
- // alias.
- if (DepWrite == &BB.front()) break;
-
- // Can't look past this instruction if it might read 'Loc'.
- if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
- break;
-
- InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
- DepWrite->getIterator(), &BB,
- /*QueryInst=*/ nullptr, &Limit);
- }
- }
-
- if (EnablePartialOverwriteTracking)
+ DT)) {
+ auto *SI = new StoreInst(
+ C, Earlier->getPointerOperand(), false, Earlier->getAlign(),
+ Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
+
+ unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_nontemporal};
+ SI->copyMetadata(*DepWrite, MDToKeep);
+ ++NumModifiedStores;
+
+ // Delete the old stores and now-dead instructions that feed them.
+ deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ MadeChange = true;
+
+ // We erased DepWrite and Inst (Loc); start over.
+ break;
+ }
+ }
+ }
+
+ // If this is a may-aliased store that is clobbering the store value, we
+ // can keep searching past it for another must-aliased pointer that stores
+ // to the same location. For example, in:
+ // store -> P
+ // store -> Q
+ // store -> P
+ // we can remove the first store to P even though we don't know if P and Q
+ // alias.
+ if (DepWrite == &BB.front()) break;
+
+ // Can't look past this instruction if it might read 'Loc'.
+ if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
+ break;
+
+ InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
+ DepWrite->getIterator(), &BB,
+ /*QueryInst=*/ nullptr, &Limit);
+ }
+ }
+
+ if (EnablePartialOverwriteTracking)
MadeChange |= removePartiallyOverlappedStores(DL, IOL, *TLI);
-
- // If this block ends in a return, unwind, or unreachable, all allocas are
- // dead at its end, which means stores to them are also dead.
- if (BB.getTerminator()->getNumSuccessors() == 0)
- MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, ThrowableInst);
-
- return MadeChange;
-}
-
-static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
- MemoryDependenceResults *MD, DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- bool MadeChange = false;
- for (BasicBlock &BB : F)
- // Only check non-dead blocks. Dead blocks may have strange pointer
- // cycles that will confuse alias analysis.
- if (DT->isReachableFromEntry(&BB))
- MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
-
- return MadeChange;
-}
-
-namespace {
-//=============================================================================
-// MemorySSA backed dead store elimination.
-//
-// The code below implements dead store elimination using MemorySSA. It uses
-// the following general approach: given a MemoryDef, walk upwards to find
-// clobbering MemoryDefs that may be killed by the starting def. Then check
-// that there are no uses that may read the location of the original MemoryDef
-// in between both MemoryDefs. A bit more concretely:
-//
-// For all MemoryDefs StartDef:
+
+ // If this block ends in a return, unwind, or unreachable, all allocas are
+ // dead at its end, which means stores to them are also dead.
+ if (BB.getTerminator()->getNumSuccessors() == 0)
+ MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, ThrowableInst);
+
+ return MadeChange;
+}
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F)
+ // Only check non-dead blocks. Dead blocks may have strange pointer
+ // cycles that will confuse alias analysis.
+ if (DT->isReachableFromEntry(&BB))
+ MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
+
+ return MadeChange;
+}
+
+namespace {
+//=============================================================================
+// MemorySSA backed dead store elimination.
+//
+// The code below implements dead store elimination using MemorySSA. It uses
+// the following general approach: given a MemoryDef, walk upwards to find
+// clobbering MemoryDefs that may be killed by the starting def. Then check
+// that there are no uses that may read the location of the original MemoryDef
+// in between both MemoryDefs. A bit more concretely:
+//
+// For all MemoryDefs StartDef:
// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking
-// upwards.
+// upwards.
// 2. Check that there are no reads between EarlierAccess and the StartDef by
// checking all uses starting at EarlierAccess and walking until we see
// StartDef.
// 3. For each found CurrentDef, check that:
// 1. There are no barrier instructions between CurrentDef and StartDef (like
-// throws or stores with ordering constraints).
+// throws or stores with ordering constraints).
// 2. StartDef is executed whenever CurrentDef is executed.
// 3. StartDef completely overwrites CurrentDef.
// 4. Erase CurrentDef from the function and MemorySSA.
-
+
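// A minimal, hypothetical source-level sketch of the walk described above:
// the second store is the starting MemoryDef, the first store is a clobbering
// MemoryDef reached while walking upwards, and since nothing reads *p in
// between and the second store completely overwrites the first, the first
// store is erased.
//
//   void sketch(int *p) {
//     *p = 1;   // dead: completely overwritten below, no intervening read
//     *p = 2;   // killing store
//   }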
// Returns true if \p I is an intrinsic that does not read or write memory.
bool isNoopIntrinsic(Instruction *I) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- case Intrinsic::invariant_end:
- case Intrinsic::launder_invariant_group:
- case Intrinsic::assume:
- return true;
- case Intrinsic::dbg_addr:
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_label:
- case Intrinsic::dbg_value:
- llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
- default:
- return false;
- }
- }
- return false;
-}
-
-// Check if we can ignore \p D for DSE.
-bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
- Instruction *DI = D->getMemoryInst();
- // Calls that only access inaccessible memory cannot read or write any memory
- // locations we consider for elimination.
- if (auto *CB = dyn_cast<CallBase>(DI))
- if (CB->onlyAccessesInaccessibleMemory())
- return true;
-
- // We can eliminate stores to locations not visible to the caller across
- // throwing instructions.
- if (DI->mayThrow() && !DefVisibleToCaller)
- return true;
-
- // We can remove the dead stores, irrespective of the fence and its ordering
-  // (release/acquire/seq_cst). Fences only constrain the ordering of
-  // already visible stores; they do not make a store visible to other
- // threads. So, skipping over a fence does not change a store from being
- // dead.
- if (isa<FenceInst>(DI))
- return true;
-
- // Skip intrinsics that do not really read or modify memory.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_end:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::assume:
+ return true;
+ case Intrinsic::dbg_addr:
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_label:
+ case Intrinsic::dbg_value:
+ llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+// Check if we can ignore \p D for DSE.
+bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
+ Instruction *DI = D->getMemoryInst();
+ // Calls that only access inaccessible memory cannot read or write any memory
+ // locations we consider for elimination.
+ if (auto *CB = dyn_cast<CallBase>(DI))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return true;
+
+ // We can eliminate stores to locations not visible to the caller across
+ // throwing instructions.
+ if (DI->mayThrow() && !DefVisibleToCaller)
+ return true;
+
+ // We can remove the dead stores, irrespective of the fence and its ordering
+  // (release/acquire/seq_cst). Fences only constrain the ordering of
+  // already visible stores; they do not make a store visible to other
+ // threads. So, skipping over a fence does not change a store from being
+ // dead.
+ if (isa<FenceInst>(DI))
+ return true;
+
+ // Skip intrinsics that do not really read or modify memory.
if (isNoopIntrinsic(D->getMemoryInst()))
- return true;
-
- return false;
-}
-
-struct DSEState {
- Function &F;
- AliasAnalysis &AA;
+ return true;
+
+ return false;
+}
+
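// Hypothetical sketch of the fence case above: the fence only orders stores
// that are already visible, it does not publish x, so the first store stays
// dead (in practice such a local is often promoted to a register even
// earlier).
//
//   #include <atomic>
//   int fence_sketch() {
//     int x = 1;                                            // dead store
//     std::atomic_thread_fence(std::memory_order_seq_cst);  // safe to skip
//     x = 2;
//     return x;
//   }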
+struct DSEState {
+ Function &F;
+ AliasAnalysis &AA;
/// The single BatchAA instance that is used to cache AA queries. It will
/// not be invalidated over the whole run. This is safe, because:
@@ -1607,72 +1607,72 @@ struct DSEState {
/// value pointer.
BatchAAResults BatchAA;
- MemorySSA &MSSA;
- DominatorTree &DT;
- PostDominatorTree &PDT;
- const TargetLibraryInfo &TLI;
+ MemorySSA &MSSA;
+ DominatorTree &DT;
+ PostDominatorTree &PDT;
+ const TargetLibraryInfo &TLI;
const DataLayout &DL;
-
- // All MemoryDefs that potentially could kill other MemDefs.
- SmallVector<MemoryDef *, 64> MemDefs;
- // Any that should be skipped as they are already deleted
- SmallPtrSet<MemoryAccess *, 4> SkipStores;
- // Keep track of all of the objects that are invisible to the caller before
- // the function returns.
+
+ // All MemoryDefs that potentially could kill other MemDefs.
+ SmallVector<MemoryDef *, 64> MemDefs;
+ // Any that should be skipped as they are already deleted
+ SmallPtrSet<MemoryAccess *, 4> SkipStores;
+ // Keep track of all of the objects that are invisible to the caller before
+ // the function returns.
// SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
DenseMap<const Value *, bool> InvisibleToCallerBeforeRet;
- // Keep track of all of the objects that are invisible to the caller after
- // the function returns.
+ // Keep track of all of the objects that are invisible to the caller after
+ // the function returns.
DenseMap<const Value *, bool> InvisibleToCallerAfterRet;
- // Keep track of blocks with throwing instructions not modeled in MemorySSA.
- SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
- // Post-order numbers for each basic block. Used to figure out if memory
- // accesses are executed before another access.
- DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
-
- /// Keep track of instructions (partly) overlapping with killing MemoryDefs per
- /// basic block.
- DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
-
- DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
- PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
+ // Keep track of blocks with throwing instructions not modeled in MemorySSA.
+ SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
+ // Post-order numbers for each basic block. Used to figure out if memory
+ // accesses are executed before another access.
+ DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
+
+ /// Keep track of instructions (partly) overlapping with killing MemoryDefs per
+ /// basic block.
+ DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
+
+ DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
+ PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
: F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
DL(F.getParent()->getDataLayout()) {}
-
- static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
- DominatorTree &DT, PostDominatorTree &PDT,
- const TargetLibraryInfo &TLI) {
- DSEState State(F, AA, MSSA, DT, PDT, TLI);
- // Collect blocks with throwing instructions not modeled in MemorySSA and
- // alloc-like objects.
- unsigned PO = 0;
- for (BasicBlock *BB : post_order(&F)) {
- State.PostOrderNumbers[BB] = PO++;
- for (Instruction &I : *BB) {
- MemoryAccess *MA = MSSA.getMemoryAccess(&I);
- if (I.mayThrow() && !MA)
- State.ThrowingBlocks.insert(I.getParent());
-
- auto *MD = dyn_cast_or_null<MemoryDef>(MA);
- if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
- (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
- State.MemDefs.push_back(MD);
- }
- }
-
-    // Treat byval or inalloca arguments the same as Allocas: stores to them are
- // dead at the end of the function.
- for (Argument &AI : F.args())
+
+ static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+ DominatorTree &DT, PostDominatorTree &PDT,
+ const TargetLibraryInfo &TLI) {
+ DSEState State(F, AA, MSSA, DT, PDT, TLI);
+ // Collect blocks with throwing instructions not modeled in MemorySSA and
+ // alloc-like objects.
+ unsigned PO = 0;
+ for (BasicBlock *BB : post_order(&F)) {
+ State.PostOrderNumbers[BB] = PO++;
+ for (Instruction &I : *BB) {
+ MemoryAccess *MA = MSSA.getMemoryAccess(&I);
+ if (I.mayThrow() && !MA)
+ State.ThrowingBlocks.insert(I.getParent());
+
+ auto *MD = dyn_cast_or_null<MemoryDef>(MA);
+ if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
+ (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
+ State.MemDefs.push_back(MD);
+ }
+ }
+
+    // Treat byval or inalloca arguments the same as Allocas: stores to them are
+ // dead at the end of the function.
+ for (Argument &AI : F.args())
if (AI.hasPassPointeeByValueCopyAttr()) {
- // For byval, the caller doesn't know the address of the allocation.
- if (AI.hasByValAttr())
+ // For byval, the caller doesn't know the address of the allocation.
+ if (AI.hasByValAttr())
State.InvisibleToCallerBeforeRet.insert({&AI, true});
State.InvisibleToCallerAfterRet.insert({&AI, true});
- }
-
- return State;
- }
-
+ }
+
+ return State;
+ }
+
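// Hypothetical sketch of the byval case above: a by-value aggregate parameter
// is the callee's private copy, so a write to it that is never read again
// before returning is dead, just like a write to a local alloca.
//
//   struct Big { int a[8]; };
//   int byval_sketch(Big b) {   // 'b' may be passed byval
//     b.a[0] = 42;              // dead: the caller cannot observe this write
//     return 0;
//   }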
bool isInvisibleToCallerAfterRet(const Value *V) {
if (isa<AllocaInst>(V))
return true;
@@ -1705,31 +1705,31 @@ struct DSEState {
return I.first->second;
}
- Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
- if (!I->mayWriteToMemory())
- return None;
-
- if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
- return {MemoryLocation::getForDest(MTI)};
-
- if (auto *CB = dyn_cast<CallBase>(I)) {
+ Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
+ if (!I->mayWriteToMemory())
+ return None;
+
+ if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
+ return {MemoryLocation::getForDest(MTI)};
+
+ if (auto *CB = dyn_cast<CallBase>(I)) {
// If the functions may write to memory we do not know about, bail out.
if (!CB->onlyAccessesArgMemory() &&
!CB->onlyAccessesInaccessibleMemOrArgMem())
return None;
- LibFunc LF;
- if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
- switch (LF) {
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
+ LibFunc LF;
+ if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
+ switch (LF) {
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
return {MemoryLocation::getAfter(CB->getArgOperand(0))};
- default:
- break;
- }
- }
+ default:
+ break;
+ }
+ }
switch (CB->getIntrinsicID()) {
case Intrinsic::init_trampoline:
return {MemoryLocation::getAfter(CB->getArgOperand(0))};
@@ -1738,138 +1738,138 @@ struct DSEState {
default:
break;
}
- return None;
- }
-
- return MemoryLocation::getOrNone(I);
- }
-
+ return None;
+ }
+
+ return MemoryLocation::getOrNone(I);
+ }
+
/// Returns true if \p UseInst completely overwrites \p DefLoc
/// (stored by \p DefInst).
bool isCompleteOverwrite(const MemoryLocation &DefLoc, Instruction *DefInst,
Instruction *UseInst) {
- // UseInst has a MemoryDef associated in MemorySSA. It's possible for a
- // MemoryDef to not write to memory, e.g. a volatile load is modeled as a
- // MemoryDef.
- if (!UseInst->mayWriteToMemory())
- return false;
-
- if (auto *CB = dyn_cast<CallBase>(UseInst))
- if (CB->onlyAccessesInaccessibleMemory())
- return false;
-
- int64_t InstWriteOffset, DepWriteOffset;
+ // UseInst has a MemoryDef associated in MemorySSA. It's possible for a
+ // MemoryDef to not write to memory, e.g. a volatile load is modeled as a
+ // MemoryDef.
+ if (!UseInst->mayWriteToMemory())
+ return false;
+
+ if (auto *CB = dyn_cast<CallBase>(UseInst))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return false;
+
+ int64_t InstWriteOffset, DepWriteOffset;
if (auto CC = getLocForWriteEx(UseInst))
return isOverwrite(UseInst, DefInst, *CC, DefLoc, DL, TLI, DepWriteOffset,
InstWriteOffset, BatchAA, &F) == OW_Complete;
return false;
- }
-
- /// Returns true if \p Def is not read before returning from the function.
- bool isWriteAtEndOfFunction(MemoryDef *Def) {
- LLVM_DEBUG(dbgs() << " Check if def " << *Def << " ("
- << *Def->getMemoryInst()
-                      << ") is at the end of the function \n");
-
- auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst());
- if (!MaybeLoc) {
- LLVM_DEBUG(dbgs() << " ... could not get location for write.\n");
- return false;
- }
-
- SmallVector<MemoryAccess *, 4> WorkList;
- SmallPtrSet<MemoryAccess *, 8> Visited;
- auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) {
- if (!Visited.insert(Acc).second)
- return;
- for (Use &U : Acc->uses())
- WorkList.push_back(cast<MemoryAccess>(U.getUser()));
- };
- PushMemUses(Def);
- for (unsigned I = 0; I < WorkList.size(); I++) {
- if (WorkList.size() >= MemorySSAScanLimit) {
- LLVM_DEBUG(dbgs() << " ... hit exploration limit.\n");
- return false;
- }
-
- MemoryAccess *UseAccess = WorkList[I];
+ }
+
+ /// Returns true if \p Def is not read before returning from the function.
+ bool isWriteAtEndOfFunction(MemoryDef *Def) {
+ LLVM_DEBUG(dbgs() << " Check if def " << *Def << " ("
+ << *Def->getMemoryInst()
+                      << ") is at the end of the function \n");
+
+ auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst());
+ if (!MaybeLoc) {
+ LLVM_DEBUG(dbgs() << " ... could not get location for write.\n");
+ return false;
+ }
+
+ SmallVector<MemoryAccess *, 4> WorkList;
+ SmallPtrSet<MemoryAccess *, 8> Visited;
+ auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) {
+ if (!Visited.insert(Acc).second)
+ return;
+ for (Use &U : Acc->uses())
+ WorkList.push_back(cast<MemoryAccess>(U.getUser()));
+ };
+ PushMemUses(Def);
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ if (WorkList.size() >= MemorySSAScanLimit) {
+ LLVM_DEBUG(dbgs() << " ... hit exploration limit.\n");
+ return false;
+ }
+
+ MemoryAccess *UseAccess = WorkList[I];
// Simply adding the users of MemoryPhi to the worklist is not enough,
// because we might miss read clobbers in different iterations of a loop,
// for example.
// TODO: Add support for phi translation to handle the loop case.
if (isa<MemoryPhi>(UseAccess))
return false;
-
- // TODO: Checking for aliasing is expensive. Consider reducing the amount
- // of times this is called and/or caching it.
- Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
- if (isReadClobber(*MaybeLoc, UseInst)) {
- LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n");
- return false;
- }
-
- if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess))
- PushMemUses(UseDef);
- }
- return true;
- }
-
- /// If \p I is a memory terminator like llvm.lifetime.end or free, return a
- /// pair with the MemoryLocation terminated by \p I and a boolean flag
- /// indicating whether \p I is a free-like call.
- Optional<std::pair<MemoryLocation, bool>>
- getLocForTerminator(Instruction *I) const {
- uint64_t Len;
- Value *Ptr;
- if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
- m_Value(Ptr))))
- return {std::make_pair(MemoryLocation(Ptr, Len), false)};
-
- if (auto *CB = dyn_cast<CallBase>(I)) {
- if (isFreeCall(I, &TLI))
+
+ // TODO: Checking for aliasing is expensive. Consider reducing the amount
+ // of times this is called and/or caching it.
+ Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+ if (isReadClobber(*MaybeLoc, UseInst)) {
+ LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n");
+ return false;
+ }
+
+ if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess))
+ PushMemUses(UseDef);
+ }
+ return true;
+ }
+
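// Hypothetical sketch for the check above: the store into the local buffer is
// never read before the function returns, so it is a write at the end of the
// function and can be removed.
//
//   int end_of_function_sketch() {
//     int buf[4];
//     buf[0] = 7;   // dead: buf is local and never read again
//     return 0;
//   }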
+ /// If \p I is a memory terminator like llvm.lifetime.end or free, return a
+ /// pair with the MemoryLocation terminated by \p I and a boolean flag
+ /// indicating whether \p I is a free-like call.
+ Optional<std::pair<MemoryLocation, bool>>
+ getLocForTerminator(Instruction *I) const {
+ uint64_t Len;
+ Value *Ptr;
+ if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
+ m_Value(Ptr))))
+ return {std::make_pair(MemoryLocation(Ptr, Len), false)};
+
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ if (isFreeCall(I, &TLI))
return {std::make_pair(MemoryLocation::getAfter(CB->getArgOperand(0)),
true)};
- }
-
- return None;
- }
-
- /// Returns true if \p I is a memory terminator instruction like
- /// llvm.lifetime.end or free.
- bool isMemTerminatorInst(Instruction *I) const {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- return (II && II->getIntrinsicID() == Intrinsic::lifetime_end) ||
- isFreeCall(I, &TLI);
- }
-
+ }
+
+ return None;
+ }
+
+ /// Returns true if \p I is a memory terminator instruction like
+ /// llvm.lifetime.end or free.
+ bool isMemTerminatorInst(Instruction *I) const {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ return (II && II->getIntrinsicID() == Intrinsic::lifetime_end) ||
+ isFreeCall(I, &TLI);
+ }
+
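// Hypothetical sketch of a memory terminator: the call to free ends the
// lifetime of *p, so the preceding store can never be observed and is dead.
//
//   #include <cstdlib>
//   void terminator_sketch(int *p) {
//     *p = 5;      // dead: the object is freed immediately afterwards
//     std::free(p);
//   }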
/// Returns true if \p MaybeTerm is a memory terminator for \p Loc from
/// instruction \p AccessI.
bool isMemTerminator(const MemoryLocation &Loc, Instruction *AccessI,
Instruction *MaybeTerm) {
- Optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
- getLocForTerminator(MaybeTerm);
-
- if (!MaybeTermLoc)
- return false;
-
- // If the terminator is a free-like call, all accesses to the underlying
- // object can be considered terminated.
+ Optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
+ getLocForTerminator(MaybeTerm);
+
+ if (!MaybeTermLoc)
+ return false;
+
+ // If the terminator is a free-like call, all accesses to the underlying
+ // object can be considered terminated.
if (getUnderlyingObject(Loc.Ptr) !=
getUnderlyingObject(MaybeTermLoc->first.Ptr))
return false;
auto TermLoc = MaybeTermLoc->first;
- if (MaybeTermLoc->second) {
+ if (MaybeTermLoc->second) {
const Value *LocUO = getUnderlyingObject(Loc.Ptr);
return BatchAA.isMustAlias(TermLoc.Ptr, LocUO);
- }
+ }
int64_t InstWriteOffset, DepWriteOffset;
return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, DL, TLI,
DepWriteOffset, InstWriteOffset, BatchAA,
&F) == OW_Complete;
- }
-
- // Returns true if \p Use may read from \p DefLoc.
+ }
+
+ // Returns true if \p Use may read from \p DefLoc.
bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
if (isNoopIntrinsic(UseInst))
return false;
@@ -1879,20 +1879,20 @@ struct DSEState {
if (auto SI = dyn_cast<StoreInst>(UseInst))
return isStrongerThan(SI->getOrdering(), AtomicOrdering::Monotonic);
- if (!UseInst->mayReadFromMemory())
- return false;
-
- if (auto *CB = dyn_cast<CallBase>(UseInst))
- if (CB->onlyAccessesInaccessibleMemory())
- return false;
-
+ if (!UseInst->mayReadFromMemory())
+ return false;
+
+ if (auto *CB = dyn_cast<CallBase>(UseInst))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return false;
+
// NOTE: For calls, the number of stores removed could be slightly improved
// by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to
// be expensive compared to the benefits in practice. For now, avoid more
// expensive analysis to limit compile-time.
return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
- }
-
+ }
+
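// Hypothetical sketch of a read clobber: the load in the middle may observe
// the first store, so that store has to be kept even though it is overwritten
// later.
//
//   int read_clobber_sketch(int *p) {
//     *p = 1;        // kept: the load below reads it
//     int v = *p;
//     *p = v + 1;
//     return v;
//   }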
/// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
/// loop. In particular, this guarantees that it only references a single
/// MemoryLocation during execution of the containing function.
@@ -1925,7 +1925,7 @@ struct DSEState {
// such MemoryDef, return None. The returned value may not (completely)
// overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
// MemoryUse (read).
- Optional<MemoryAccess *>
+ Optional<MemoryAccess *>
getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
const MemoryLocation &DefLoc, const Value *DefUO,
unsigned &ScanLimit, unsigned &WalkerStepLimit,
@@ -1937,13 +1937,13 @@ struct DSEState {
MemoryAccess *Current = StartAccess;
Instruction *KillingI = KillingDef->getMemoryInst();
- bool StepAgain;
+ bool StepAgain;
LLVM_DEBUG(dbgs() << " trying to get dominating access\n");
// Find the next clobbering Mod access for DefLoc, starting at StartAccess.
Optional<MemoryLocation> CurrentLoc;
- do {
- StepAgain = false;
+ do {
+ StepAgain = false;
LLVM_DEBUG({
dbgs() << " visiting " << *Current;
if (!MSSA.isLiveOnEntryDef(Current) && isa<MemoryUseOrDef>(Current))
@@ -1952,12 +1952,12 @@ struct DSEState {
dbgs() << "\n";
});
- // Reached TOP.
+ // Reached TOP.
if (MSSA.isLiveOnEntryDef(Current)) {
LLVM_DEBUG(dbgs() << " ... found LiveOnEntryDef\n");
- return None;
+ return None;
}
-
+
// Cost of a step. Accesses in the same block are more likely to be valid
// candidates for elimination, hence consider them cheaper.
unsigned StepCost = KillingDef->getBlock() == Current->getBlock()
@@ -1971,10 +1971,10 @@ struct DSEState {
// Return for MemoryPhis. They cannot be eliminated directly and the
// caller is responsible for traversing them.
- if (isa<MemoryPhi>(Current)) {
+ if (isa<MemoryPhi>(Current)) {
LLVM_DEBUG(dbgs() << " ... found MemoryPhi\n");
return Current;
- }
+ }
// Below, check if CurrentDef is a valid candidate to be eliminated by
// KillingDef. If it is not, check the next candidate.
@@ -1991,16 +1991,16 @@ struct DSEState {
// instructions that block us from DSEing
if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
- return None;
+ return None;
}
-
+
// Check for anything that looks like it will be a barrier to further
// removal
if (isDSEBarrier(DefUO, CurrentI)) {
LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
return None;
}
-
+
// If Current is known to be on path that reads DefLoc or is a read
// clobber, bail out, as the path is not profitable. We skip this check
// for intrinsic calls, because the code knows how to handle memcpy
@@ -2022,11 +2022,11 @@ struct DSEState {
// If Current cannot be analyzed or is not removable, check the next
// candidate.
if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) {
- StepAgain = true;
+ StepAgain = true;
Current = CurrentDef->getDefiningAccess();
continue;
- }
-
+ }
+
// If Current does not have an analyzable write location, skip it
CurrentLoc = getLocForWriteEx(CurrentI);
if (!CurrentLoc) {
@@ -2078,11 +2078,11 @@ struct DSEState {
PartialLimit -= 1;
}
}
- } while (StepAgain);
-
- // Accesses to objects accessible after the function returns can only be
- // eliminated if the access is killed along all paths to the exit. Collect
- // the blocks with killing (=completely overwriting MemoryDefs) and check if
+ } while (StepAgain);
+
+ // Accesses to objects accessible after the function returns can only be
+ // eliminated if the access is killed along all paths to the exit. Collect
+ // the blocks with killing (=completely overwriting MemoryDefs) and check if
// they cover all paths from EarlierAccess to any function exit.
SmallPtrSet<Instruction *, 16> KillingDefs;
KillingDefs.insert(KillingDef->getMemoryInst());
@@ -2091,34 +2091,34 @@ struct DSEState {
cast<MemoryDef>(EarlierAccess)->getMemoryInst();
LLVM_DEBUG(dbgs() << " Checking for reads of " << *EarlierAccess << " ("
<< *EarlierMemInst << ")\n");
-
- SmallSetVector<MemoryAccess *, 32> WorkList;
- auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
- for (Use &U : Acc->uses())
- WorkList.insert(cast<MemoryAccess>(U.getUser()));
- };
+
+ SmallSetVector<MemoryAccess *, 32> WorkList;
+ auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
+ for (Use &U : Acc->uses())
+ WorkList.insert(cast<MemoryAccess>(U.getUser()));
+ };
PushMemUses(EarlierAccess);
-
+
// Optimistically collect all accesses for reads. If we do not find any
// read clobbers, add them to the cache.
SmallPtrSet<MemoryAccess *, 16> KnownNoReads;
if (!EarlierMemInst->mayReadFromMemory())
KnownNoReads.insert(EarlierAccess);
// Check if EarlierDef may be read.
- for (unsigned I = 0; I < WorkList.size(); I++) {
- MemoryAccess *UseAccess = WorkList[I];
-
- LLVM_DEBUG(dbgs() << " " << *UseAccess);
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ MemoryAccess *UseAccess = WorkList[I];
+
+ LLVM_DEBUG(dbgs() << " " << *UseAccess);
// Bail out if the number of accesses to check exceeds the scan limit.
if (ScanLimit < (WorkList.size() - I)) {
- LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
- return None;
- }
+ LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
+ return None;
+ }
--ScanLimit;
NumDomMemDefChecks++;
KnownNoReads.insert(UseAccess);
-
- if (isa<MemoryPhi>(UseAccess)) {
+
+ if (isa<MemoryPhi>(UseAccess)) {
if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) {
return DT.properlyDominates(KI->getParent(),
UseAccess->getBlock());
@@ -2126,30 +2126,30 @@ struct DSEState {
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing block\n");
continue;
}
- LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
- PushMemUses(UseAccess);
- continue;
- }
-
- Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
- LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
-
+ LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
+ PushMemUses(UseAccess);
+ continue;
+ }
+
+ Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
+
if (any_of(KillingDefs, [this, UseInst](Instruction *KI) {
return DT.dominates(KI, UseInst);
})) {
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing def\n");
- continue;
- }
-
-      // A memory terminator kills all preceding MemoryDefs and all succeeding
-      // MemoryAccesses. We do not have to check its users.
+ continue;
+ }
+
+      // A memory terminator kills all preceding MemoryDefs and all succeeding
+      // MemoryAccesses. We do not have to check its users.
if (isMemTerminator(*CurrentLoc, EarlierMemInst, UseInst)) {
LLVM_DEBUG(
dbgs()
<< " ... skipping, memterminator invalidates following accesses\n");
- continue;
+ continue;
}
-
+
if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess)->getMemoryInst())) {
LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
PushMemUses(UseAccess);
@@ -2161,218 +2161,218 @@ struct DSEState {
return None;
}
- // Uses which may read the original MemoryDef mean we cannot eliminate the
- // original MD. Stop walk.
+ // Uses which may read the original MemoryDef mean we cannot eliminate the
+ // original MD. Stop walk.
if (isReadClobber(*CurrentLoc, UseInst)) {
- LLVM_DEBUG(dbgs() << " ... found read clobber\n");
- return None;
- }
-
+ LLVM_DEBUG(dbgs() << " ... found read clobber\n");
+ return None;
+ }
+
// For the KillingDef and EarlierAccess we only have to check if it reads
// the memory location.
- // TODO: It would probably be better to check for self-reads before
- // calling the function.
+ // TODO: It would probably be better to check for self-reads before
+ // calling the function.
if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
- LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
- continue;
- }
-
- // Check all uses for MemoryDefs, except for defs completely overwriting
- // the original location. Otherwise we have to check uses of *all*
- // MemoryDefs we discover, including non-aliasing ones. Otherwise we might
- // miss cases like the following
+ LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
+ continue;
+ }
+
+ // Check all uses for MemoryDefs, except for defs completely overwriting
+ // the original location. Otherwise we have to check uses of *all*
+ // MemoryDefs we discover, including non-aliasing ones. Otherwise we might
+ // miss cases like the following
// 1 = Def(LoE) ; <----- EarlierDef stores [0,1]
- // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
- // Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
- // (The Use points to the *first* Def it may alias)
- // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
- // stores [0,1]
- if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
+ // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
+ // Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
+ // (The Use points to the *first* Def it may alias)
+ // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
+ // stores [0,1]
+ if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) {
if (!isInvisibleToCallerAfterRet(DefUO) &&
UseAccess != EarlierAccess) {
- BasicBlock *MaybeKillingBlock = UseInst->getParent();
- if (PostOrderNumbers.find(MaybeKillingBlock)->second <
+ BasicBlock *MaybeKillingBlock = UseInst->getParent();
+ if (PostOrderNumbers.find(MaybeKillingBlock)->second <
PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
-
+
LLVM_DEBUG(dbgs()
<< " ... found killing def " << *UseInst << "\n");
KillingDefs.insert(UseInst);
- }
- }
- } else
- PushMemUses(UseDef);
- }
- }
-
- // For accesses to locations visible after the function returns, make sure
+ }
+ }
+ } else
+ PushMemUses(UseDef);
+ }
+ }
+
+ // For accesses to locations visible after the function returns, make sure
// that the location is killed (=overwritten) along all paths from
// EarlierAccess to the exit.
if (!isInvisibleToCallerAfterRet(DefUO)) {
SmallPtrSet<BasicBlock *, 16> KillingBlocks;
for (Instruction *KD : KillingDefs)
KillingBlocks.insert(KD->getParent());
- assert(!KillingBlocks.empty() &&
- "Expected at least a single killing block");
-
- // Find the common post-dominator of all killing blocks.
- BasicBlock *CommonPred = *KillingBlocks.begin();
- for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
- I != E; I++) {
- if (!CommonPred)
- break;
- CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
- }
-
- // If CommonPred is in the set of killing blocks, just check if it
+ assert(!KillingBlocks.empty() &&
+ "Expected at least a single killing block");
+
+ // Find the common post-dominator of all killing blocks.
+ BasicBlock *CommonPred = *KillingBlocks.begin();
+ for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
+ I != E; I++) {
+ if (!CommonPred)
+ break;
+ CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
+ }
+
+ // If CommonPred is in the set of killing blocks, just check if it
// post-dominates EarlierAccess.
- if (KillingBlocks.count(CommonPred)) {
+ if (KillingBlocks.count(CommonPred)) {
if (PDT.dominates(CommonPred, EarlierAccess->getBlock()))
return {EarlierAccess};
- return None;
- }
-
+ return None;
+ }
+
// If the common post-dominator does not post-dominate EarlierAccess,
// there is a path from EarlierAccess to an exit not going through a
// killing block.
if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) {
- SetVector<BasicBlock *> WorkList;
-
- // If CommonPred is null, there are multiple exits from the function.
- // They all have to be added to the worklist.
- if (CommonPred)
- WorkList.insert(CommonPred);
- else
- for (BasicBlock *R : PDT.roots())
- WorkList.insert(R);
-
- NumCFGTries++;
- // Check if all paths starting from an exit node go through one of the
+ SetVector<BasicBlock *> WorkList;
+
+ // If CommonPred is null, there are multiple exits from the function.
+ // They all have to be added to the worklist.
+ if (CommonPred)
+ WorkList.insert(CommonPred);
+ else
+ for (BasicBlock *R : PDT.roots())
+ WorkList.insert(R);
+
+ NumCFGTries++;
+ // Check if all paths starting from an exit node go through one of the
// killing blocks before reaching EarlierAccess.
- for (unsigned I = 0; I < WorkList.size(); I++) {
- NumCFGChecks++;
- BasicBlock *Current = WorkList[I];
- if (KillingBlocks.count(Current))
- continue;
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ NumCFGChecks++;
+ BasicBlock *Current = WorkList[I];
+ if (KillingBlocks.count(Current))
+ continue;
if (Current == EarlierAccess->getBlock())
- return None;
-
+ return None;
+
// EarlierAccess is reachable from the entry, so we don't have to
// explore unreachable blocks further.
- if (!DT.isReachableFromEntry(Current))
- continue;
-
- for (BasicBlock *Pred : predecessors(Current))
- WorkList.insert(Pred);
-
- if (WorkList.size() >= MemorySSAPathCheckLimit)
- return None;
- }
- NumCFGSuccess++;
+ if (!DT.isReachableFromEntry(Current))
+ continue;
+
+ for (BasicBlock *Pred : predecessors(Current))
+ WorkList.insert(Pred);
+
+ if (WorkList.size() >= MemorySSAPathCheckLimit)
+ return None;
+ }
+ NumCFGSuccess++;
return {EarlierAccess};
- }
- return None;
- }
-
+ }
+ return None;
+ }
+
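// Hypothetical sketch of the path check above: the global stays visible after
// the function returns and is overwritten on only one branch, so the first
// store is not killed along every path to the exit and must be kept.
//
//   int g;
//   void path_sketch(bool c) {
//     g = 1;      // kept: not overwritten on the 'else' path
//     if (c)
//       g = 2;
//   }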
// No aliasing MemoryUses of EarlierAccess found, EarlierAccess is
// potentially dead.
return {EarlierAccess};
- }
-
- // Delete dead memory defs
- void deleteDeadInstruction(Instruction *SI) {
- MemorySSAUpdater Updater(&MSSA);
- SmallVector<Instruction *, 32> NowDeadInsts;
- NowDeadInsts.push_back(SI);
- --NumFastOther;
-
- while (!NowDeadInsts.empty()) {
- Instruction *DeadInst = NowDeadInsts.pop_back_val();
- ++NumFastOther;
-
- // Try to preserve debug information attached to the dead instruction.
- salvageDebugInfo(*DeadInst);
- salvageKnowledge(DeadInst);
-
- // Remove the Instruction from MSSA.
- if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
- if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
- SkipStores.insert(MD);
- }
- Updater.removeMemoryAccess(MA);
- }
-
- auto I = IOLs.find(DeadInst->getParent());
- if (I != IOLs.end())
- I->second.erase(DeadInst);
- // Remove its operands
- for (Use &O : DeadInst->operands())
- if (Instruction *OpI = dyn_cast<Instruction>(O)) {
- O = nullptr;
- if (isInstructionTriviallyDead(OpI, &TLI))
- NowDeadInsts.push_back(OpI);
- }
-
- DeadInst->eraseFromParent();
- }
- }
-
- // Check for any extra throws between SI and NI that block DSE. This only
-  // checks extra maythrows (those that aren't MemoryDefs). MemoryDefs that may
- // throw are handled during the walk from one def to the next.
- bool mayThrowBetween(Instruction *SI, Instruction *NI,
+ }
+
+ // Delete dead memory defs
+ void deleteDeadInstruction(Instruction *SI) {
+ MemorySSAUpdater Updater(&MSSA);
+ SmallVector<Instruction *, 32> NowDeadInsts;
+ NowDeadInsts.push_back(SI);
+ --NumFastOther;
+
+ while (!NowDeadInsts.empty()) {
+ Instruction *DeadInst = NowDeadInsts.pop_back_val();
+ ++NumFastOther;
+
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+ salvageKnowledge(DeadInst);
+
+ // Remove the Instruction from MSSA.
+ if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
+ if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
+ SkipStores.insert(MD);
+ }
+ Updater.removeMemoryAccess(MA);
+ }
+
+ auto I = IOLs.find(DeadInst->getParent());
+ if (I != IOLs.end())
+ I->second.erase(DeadInst);
+ // Remove its operands
+ for (Use &O : DeadInst->operands())
+ if (Instruction *OpI = dyn_cast<Instruction>(O)) {
+ O = nullptr;
+ if (isInstructionTriviallyDead(OpI, &TLI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ DeadInst->eraseFromParent();
+ }
+ }
+
+ // Check for any extra throws between SI and NI that block DSE. This only
+  // checks extra maythrows (those that aren't MemoryDefs). MemoryDefs that may
+ // throw are handled during the walk from one def to the next.
+ bool mayThrowBetween(Instruction *SI, Instruction *NI,
const Value *SILocUnd) {
- // First see if we can ignore it by using the fact that SI is an
- // alloca/alloca like object that is not visible to the caller during
- // execution of the function.
+ // First see if we can ignore it by using the fact that SI is an
+ // alloca/alloca like object that is not visible to the caller during
+ // execution of the function.
if (SILocUnd && isInvisibleToCallerBeforeRet(SILocUnd))
- return false;
-
- if (SI->getParent() == NI->getParent())
- return ThrowingBlocks.count(SI->getParent());
- return !ThrowingBlocks.empty();
- }
-
- // Check if \p NI acts as a DSE barrier for \p SI. The following instructions
- // act as barriers:
- // * A memory instruction that may throw and \p SI accesses a non-stack
- // object.
-  // * Atomic stores stronger than monotonic.
+ return false;
+
+ if (SI->getParent() == NI->getParent())
+ return ThrowingBlocks.count(SI->getParent());
+ return !ThrowingBlocks.empty();
+ }
+
+ // Check if \p NI acts as a DSE barrier for \p SI. The following instructions
+ // act as barriers:
+ // * A memory instruction that may throw and \p SI accesses a non-stack
+ // object.
+  // * Atomic stores stronger than monotonic.
bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) {
-    // If NI may throw it acts as a barrier, unless we are storing to an
-    // alloca/alloca-like object that does not escape.
+    // If NI may throw it acts as a barrier, unless we are storing to an
+    // alloca/alloca-like object that does not escape.
if (NI->mayThrow() && !isInvisibleToCallerBeforeRet(SILocUnd))
- return true;
-
- // If NI is an atomic load/store stronger than monotonic, do not try to
- // eliminate/reorder it.
- if (NI->isAtomic()) {
- if (auto *LI = dyn_cast<LoadInst>(NI))
- return isStrongerThanMonotonic(LI->getOrdering());
- if (auto *SI = dyn_cast<StoreInst>(NI))
- return isStrongerThanMonotonic(SI->getOrdering());
+ return true;
+
+ // If NI is an atomic load/store stronger than monotonic, do not try to
+ // eliminate/reorder it.
+ if (NI->isAtomic()) {
+ if (auto *LI = dyn_cast<LoadInst>(NI))
+ return isStrongerThanMonotonic(LI->getOrdering());
+ if (auto *SI = dyn_cast<StoreInst>(NI))
+ return isStrongerThanMonotonic(SI->getOrdering());
if (auto *ARMW = dyn_cast<AtomicRMWInst>(NI))
return isStrongerThanMonotonic(ARMW->getOrdering());
if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(NI))
return isStrongerThanMonotonic(CmpXchg->getSuccessOrdering()) ||
isStrongerThanMonotonic(CmpXchg->getFailureOrdering());
- llvm_unreachable("other instructions should be skipped in MemorySSA");
- }
- return false;
- }
-
- /// Eliminate writes to objects that are not visible in the caller and are not
- /// accessed before returning from the function.
- bool eliminateDeadWritesAtEndOfFunction() {
- bool MadeChange = false;
- LLVM_DEBUG(
- dbgs()
- << "Trying to eliminate MemoryDefs at the end of the function\n");
- for (int I = MemDefs.size() - 1; I >= 0; I--) {
- MemoryDef *Def = MemDefs[I];
+ llvm_unreachable("other instructions should be skipped in MemorySSA");
+ }
+ return false;
+ }
+
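// Hypothetical sketch of a barrier: the release store is stronger than
// monotonic, so the walk stops there and the earlier store through the
// escaped pointer is conservatively kept.
//
//   #include <atomic>
//   void barrier_sketch(int *p, std::atomic<int> &flag) {
//     *p = 1;                                    // kept: barrier below
//     flag.store(1, std::memory_order_release);
//     *p = 2;
//   }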
+ /// Eliminate writes to objects that are not visible in the caller and are not
+ /// accessed before returning from the function.
+ bool eliminateDeadWritesAtEndOfFunction() {
+ bool MadeChange = false;
+ LLVM_DEBUG(
+ dbgs()
+ << "Trying to eliminate MemoryDefs at the end of the function\n");
+ for (int I = MemDefs.size() - 1; I >= 0; I--) {
+ MemoryDef *Def = MemDefs[I];
if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst()))
- continue;
-
+ continue;
+
Instruction *DefI = Def->getMemoryInst();
SmallVector<const Value *, 4> Pointers;
auto DefLoc = getLocForWriteEx(DefI);
@@ -2388,30 +2388,30 @@ struct DSEState {
if (!UO || !isInvisibleToCallerAfterRet(UO))
continue;
- if (isWriteAtEndOfFunction(Def)) {
- // See through pointer-to-pointer bitcasts
- LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
- "of the function\n");
+ if (isWriteAtEndOfFunction(Def)) {
+ // See through pointer-to-pointer bitcasts
+ LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
+ "of the function\n");
deleteDeadInstruction(DefI);
++NumFastStores;
MadeChange = true;
- }
- }
- return MadeChange;
- }
-
- /// \returns true if \p Def is a no-op store, either because it
- /// directly stores back a loaded value or stores zero to a calloced object.
+ }
+ }
+ return MadeChange;
+ }
+
+ /// \returns true if \p Def is a no-op store, either because it
+ /// directly stores back a loaded value or stores zero to a calloced object.
bool storeIsNoop(MemoryDef *Def, const MemoryLocation &DefLoc,
const Value *DefUO) {
- StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
- if (!Store)
- return false;
-
- if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
- if (LoadI->getPointerOperand() == Store->getOperand(1)) {
+ StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
+ if (!Store)
+ return false;
+
+ if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
+ if (LoadI->getPointerOperand() == Store->getOperand(1)) {
// Get the defining access for the load.
- auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
+ auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
// Fast path: the defining accesses are the same.
if (LoadAccess == Def->getDefiningAccess())
return true;
@@ -2449,126 +2449,126 @@ struct DSEState {
return false;
}
return true;
- }
- }
-
- Constant *StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
- if (StoredConstant && StoredConstant->isNullValue()) {
- auto *DefUOInst = dyn_cast<Instruction>(DefUO);
- if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) {
- auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
- // If UnderlyingDef is the clobbering access of Def, no instructions
- // between them can modify the memory location.
- auto *ClobberDef =
- MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
- return UnderlyingDef == ClobberDef;
- }
- }
- return false;
- }
-};
-
-bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
- MemorySSA &MSSA, DominatorTree &DT,
- PostDominatorTree &PDT,
- const TargetLibraryInfo &TLI) {
- bool MadeChange = false;
-
- DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
- // For each store:
- for (unsigned I = 0; I < State.MemDefs.size(); I++) {
- MemoryDef *KillingDef = State.MemDefs[I];
- if (State.SkipStores.count(KillingDef))
- continue;
- Instruction *SI = KillingDef->getMemoryInst();
-
+ }
+ }
+
+ Constant *StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
+ if (StoredConstant && StoredConstant->isNullValue()) {
+ auto *DefUOInst = dyn_cast<Instruction>(DefUO);
+ if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) {
+ auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
+ // If UnderlyingDef is the clobbering access of Def, no instructions
+ // between them can modify the memory location.
+ auto *ClobberDef =
+ MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
+ return UnderlyingDef == ClobberDef;
+ }
+ }
+ return false;
+ }
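// Hypothetical sketches of the two no-op cases handled above:
//
//   void noop_store_back(int *p) {
//     int v = *p;
//     *p = v;       // no-op: stores back the value that was just loaded
//   }
//
//   #include <cstdlib>
//   int *noop_zero_calloc() {
//     int *q = (int *)std::calloc(4, sizeof(int));
//     if (q)
//       q[0] = 0;   // no-op: calloc'ed memory is already zero
//     return q;
//   }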
+};
+
+bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
+ MemorySSA &MSSA, DominatorTree &DT,
+ PostDominatorTree &PDT,
+ const TargetLibraryInfo &TLI) {
+ bool MadeChange = false;
+
+ DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
+ // For each store:
+ for (unsigned I = 0; I < State.MemDefs.size(); I++) {
+ MemoryDef *KillingDef = State.MemDefs[I];
+ if (State.SkipStores.count(KillingDef))
+ continue;
+ Instruction *SI = KillingDef->getMemoryInst();
+
Optional<MemoryLocation> MaybeSILoc;
- if (State.isMemTerminatorInst(SI))
- MaybeSILoc = State.getLocForTerminator(SI).map(
- [](const std::pair<MemoryLocation, bool> &P) { return P.first; });
- else
- MaybeSILoc = State.getLocForWriteEx(SI);
-
- if (!MaybeSILoc) {
- LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
- << *SI << "\n");
- continue;
- }
- MemoryLocation SILoc = *MaybeSILoc;
- assert(SILoc.Ptr && "SILoc should not be null");
+ if (State.isMemTerminatorInst(SI))
+ MaybeSILoc = State.getLocForTerminator(SI).map(
+ [](const std::pair<MemoryLocation, bool> &P) { return P.first; });
+ else
+ MaybeSILoc = State.getLocForWriteEx(SI);
+
+ if (!MaybeSILoc) {
+ LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
+ << *SI << "\n");
+ continue;
+ }
+ MemoryLocation SILoc = *MaybeSILoc;
+ assert(SILoc.Ptr && "SILoc should not be null");
const Value *SILocUnd = getUnderlyingObject(SILoc.Ptr);
-
- MemoryAccess *Current = KillingDef;
- LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
- << *KillingDef << " (" << *SI << ")\n");
-
+
+ MemoryAccess *Current = KillingDef;
+ LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
+ << *KillingDef << " (" << *SI << ")\n");
+
unsigned ScanLimit = MemorySSAScanLimit;
unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
unsigned PartialLimit = MemorySSAPartialStoreLimit;
- // Worklist of MemoryAccesses that may be killed by KillingDef.
- SetVector<MemoryAccess *> ToCheck;
-
+ // Worklist of MemoryAccesses that may be killed by KillingDef.
+ SetVector<MemoryAccess *> ToCheck;
+
if (SILocUnd)
ToCheck.insert(KillingDef->getDefiningAccess());
bool Shortend = false;
bool IsMemTerm = State.isMemTerminatorInst(SI);
- // Check if MemoryAccesses in the worklist are killed by KillingDef.
- for (unsigned I = 0; I < ToCheck.size(); I++) {
- Current = ToCheck[I];
- if (State.SkipStores.count(Current))
- continue;
-
- Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+ // Check if MemoryAccesses in the worklist are killed by KillingDef.
+ for (unsigned I = 0; I < ToCheck.size(); I++) {
+ Current = ToCheck[I];
+ if (State.SkipStores.count(Current))
+ continue;
+
+ Optional<MemoryAccess *> Next = State.getDomMemoryDef(
KillingDef, Current, SILoc, SILocUnd, ScanLimit, WalkerStepLimit,
IsMemTerm, PartialLimit);
-
- if (!Next) {
- LLVM_DEBUG(dbgs() << " finished walk\n");
- continue;
- }
-
+
+ if (!Next) {
+ LLVM_DEBUG(dbgs() << " finished walk\n");
+ continue;
+ }
+
MemoryAccess *EarlierAccess = *Next;
LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess);
if (isa<MemoryPhi>(EarlierAccess)) {
- LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
+ LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
for (Value *V : cast<MemoryPhi>(EarlierAccess)->incoming_values()) {
- MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
- BasicBlock *IncomingBlock = IncomingAccess->getBlock();
+ MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
+ BasicBlock *IncomingBlock = IncomingAccess->getBlock();
BasicBlock *PhiBlock = EarlierAccess->getBlock();
-
- // We only consider incoming MemoryAccesses that come before the
- // MemoryPhi. Otherwise we could discover candidates that do not
- // strictly dominate our starting def.
- if (State.PostOrderNumbers[IncomingBlock] >
- State.PostOrderNumbers[PhiBlock])
- ToCheck.insert(IncomingAccess);
- }
- continue;
- }
+
+ // We only consider incoming MemoryAccesses that come before the
+ // MemoryPhi. Otherwise we could discover candidates that do not
+ // strictly dominate our starting def.
+ if (State.PostOrderNumbers[IncomingBlock] >
+ State.PostOrderNumbers[PhiBlock])
+ ToCheck.insert(IncomingAccess);
+ }
+ continue;
+ }
auto *NextDef = cast<MemoryDef>(EarlierAccess);
- Instruction *NI = NextDef->getMemoryInst();
- LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
- ToCheck.insert(NextDef->getDefiningAccess());
+ Instruction *NI = NextDef->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
+ ToCheck.insert(NextDef->getDefiningAccess());
NumGetDomMemoryDefPassed++;
-
- if (!DebugCounter::shouldExecute(MemorySSACounter))
- continue;
-
- MemoryLocation NILoc = *State.getLocForWriteEx(NI);
-
+
+ if (!DebugCounter::shouldExecute(MemorySSACounter))
+ continue;
+
+ MemoryLocation NILoc = *State.getLocForWriteEx(NI);
+
if (IsMemTerm) {
const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
if (SILocUnd != NIUnd)
- continue;
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
- << "\n KILLER: " << *SI << '\n');
- State.deleteDeadInstruction(NI);
- ++NumFastStores;
- MadeChange = true;
- } else {
- // Check if NI overwrites SI.
- int64_t InstWriteOffset, DepWriteOffset;
+ continue;
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
+ << "\n KILLER: " << *SI << '\n');
+ State.deleteDeadInstruction(NI);
+ ++NumFastStores;
+ MadeChange = true;
+ } else {
+ // Check if NI overwrites SI.
+ int64_t InstWriteOffset, DepWriteOffset;
OverwriteResult OR =
isOverwrite(SI, NI, SILoc, NILoc, State.DL, TLI, DepWriteOffset,
InstWriteOffset, State.BatchAA, &F);
@@ -2580,10 +2580,10 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
OR = isPartialOverwrite(SILoc, NILoc, DepWriteOffset, InstWriteOffset,
NI, IOL);
}
-
- if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
- auto *Earlier = dyn_cast<StoreInst>(NI);
- auto *Later = dyn_cast<StoreInst>(SI);
+
+ if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
+ auto *Earlier = dyn_cast<StoreInst>(NI);
+ auto *Later = dyn_cast<StoreInst>(SI);
// We are re-using tryToMergePartialOverlappingStores, which requires
// Earlier to dominate Later.
// TODO: implement tryToMergePartialOverlappingStores using MemorySSA.
@@ -2591,12 +2591,12 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
if (Constant *Merged = tryToMergePartialOverlappingStores(
Earlier, Later, InstWriteOffset, DepWriteOffset, State.DL,
State.BatchAA, &DT)) {
-
+
// Update stored value of earlier store to merged constant.
Earlier->setOperand(0, Merged);
++NumModifiedStores;
MadeChange = true;
-
+
Shortend = true;
// Remove later store and remove any outstanding overlap intervals
// for the updated store.
@@ -2606,18 +2606,18 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
I->second.erase(Earlier);
break;
}
- }
- }
-
- if (OR == OW_Complete) {
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
- << "\n KILLER: " << *SI << '\n');
- State.deleteDeadInstruction(NI);
- ++NumFastStores;
- MadeChange = true;
- }
- }
- }
+ }
+ }
+
+ if (OR == OW_Complete) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
+ << "\n KILLER: " << *SI << '\n');
+ State.deleteDeadInstruction(NI);
+ ++NumFastStores;
+ MadeChange = true;
+ }
+ }
+ }
// Check if the store is a no-op.
if (!Shortend && isRemovable(SI) &&
@@ -2628,135 +2628,135 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
MadeChange = true;
continue;
}
- }
-
- if (EnablePartialOverwriteTracking)
- for (auto &KV : State.IOLs)
+ }
+
+ if (EnablePartialOverwriteTracking)
+ for (auto &KV : State.IOLs)
MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI);
-
- MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
- return MadeChange;
-}
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// DSE Pass
-//===----------------------------------------------------------------------===//
-PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
-
- bool Changed = false;
- if (EnableMemorySSA) {
- MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
-
- Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
- } else {
- MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
-
- Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
- }
-
-#ifdef LLVM_ENABLE_STATS
- if (AreStatisticsEnabled())
- for (auto &I : instructions(F))
- NumRemainingStores += isa<StoreInst>(&I);
-#endif
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- if (EnableMemorySSA)
- PA.preserve<MemorySSAAnalysis>();
- else
- PA.preserve<MemoryDependenceAnalysis>();
- return PA;
-}
-
-namespace {
-
-/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
-class DSELegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- DSELegacyPass() : FunctionPass(ID) {
- initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- bool Changed = false;
- if (EnableMemorySSA) {
- MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
- PostDominatorTree &PDT =
- getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-
- Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
- } else {
- MemoryDependenceResults &MD =
- getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
-
- Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
- }
-
-#ifdef LLVM_ENABLE_STATS
- if (AreStatisticsEnabled())
- for (auto &I : instructions(F))
- NumRemainingStores += isa<StoreInst>(&I);
-#endif
-
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
-
- if (EnableMemorySSA) {
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- } else {
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
- }
- }
-};
-
-} // end anonymous namespace
-
-char DSELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
- false)
-
-FunctionPass *llvm::createDeadStoreEliminationPass() {
- return new DSELegacyPass();
-}
+
+ MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
+ return MadeChange;
+}
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = false;
+ if (EnableMemorySSA) {
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+ } else {
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+
+ Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+ }
+
+#ifdef LLVM_ENABLE_STATS
+ if (AreStatisticsEnabled())
+ for (auto &I : instructions(F))
+ NumRemainingStores += isa<StoreInst>(&I);
+#endif
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ if (EnableMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ else
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+
+namespace {
+
+/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
+class DSELegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ DSELegacyPass() : FunctionPass(ID) {
+ initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+ bool Changed = false;
+ if (EnableMemorySSA) {
+ MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ PostDominatorTree &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+
+ Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+ } else {
+ MemoryDependenceResults &MD =
+ getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+
+ Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+ }
+
+#ifdef LLVM_ENABLE_STATS
+ if (AreStatisticsEnabled())
+ for (auto &I : instructions(F))
+ NumRemainingStores += isa<StoreInst>(&I);
+#endif
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+
+ if (EnableMemorySSA) {
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ } else {
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+ }
+};
+
+} // end anonymous namespace
+
+char DSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+
+FunctionPass *llvm::createDeadStoreEliminationPass() {
+ return new DSELegacyPass();
+}
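
For readers who want to exercise the restored new-pass-manager path by itself, the snippet below is a minimal sketch (not part of this commit) of scheduling DSEPass with the LLVM 12 pass-manager APIs; the function name runDSE and the assumption that a populated llvm::Module M already exists are illustrative.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
#include <utility>

using namespace llvm;

// Run DSE over every function in M using the new pass manager.
void runDSE(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  // Register the analyses (AAManager, DominatorTreeAnalysis, MemorySSAAnalysis,
  // TargetLibraryAnalysis, ...) that DSEPass::run requests from the FAM.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(DSEPass());

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}
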
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp
index 0132ac83bb..3c6c444d66 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -1,399 +1,399 @@
-//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass hoists and/or decomposes/recomposes integer division and remainder
-// instructions to enable CFG improvements and better codegen.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/DivRemPairs.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BypassSlowDivision.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "div-rem-pairs"
-STATISTIC(NumPairs, "Number of div/rem pairs");
-STATISTIC(NumRecomposed, "Number of instructions recomposed");
-STATISTIC(NumHoisted, "Number of instructions hoisted");
-STATISTIC(NumDecomposed, "Number of instructions decomposed");
-DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
- "Controls transformations in div-rem-pairs pass");
-
-namespace {
-struct ExpandedMatch {
- DivRemMapKey Key;
- Instruction *Value;
-};
-} // namespace
-
-/// See if we can match: (which is the form we expand into)
-/// X - ((X ?/ Y) * Y)
-/// which is equivalent to:
-/// X ?% Y
-static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) {
- Value *Dividend, *XroundedDownToMultipleOfY;
- if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY))))
- return llvm::None;
-
- Value *Divisor;
- Instruction *Div;
- // Look for ((X / Y) * Y)
- if (!match(
- XroundedDownToMultipleOfY,
- m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)),
- m_Instruction(Div)),
- m_Deferred(Divisor))))
- return llvm::None;
-
- ExpandedMatch M;
- M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv;
- M.Key.Dividend = Dividend;
- M.Key.Divisor = Divisor;
- M.Value = &I;
- return M;
-}
-
-namespace {
-/// A thin wrapper to store two values that we matched as div-rem pair.
-/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
-struct DivRemPairWorklistEntry {
- /// The actual udiv/sdiv instruction. Source of truth.
- AssertingVH<Instruction> DivInst;
-
- /// The instruction that we have matched as a remainder instruction.
- /// Should only be used as Value, don't introspect it.
- AssertingVH<Instruction> RemInst;
-
- DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
- : DivInst(DivInst_), RemInst(RemInst_) {
- assert((DivInst->getOpcode() == Instruction::UDiv ||
- DivInst->getOpcode() == Instruction::SDiv) &&
- "Not a division.");
- assert(DivInst->getType() == RemInst->getType() && "Types should match.");
- // We can't check anything else about remainder instruction,
- // it's not strictly required to be a urem/srem.
- }
-
- /// The type for this pair, identical for both the div and rem.
- Type *getType() const { return DivInst->getType(); }
-
- /// Is this pair signed or unsigned?
- bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
-
-  /// In this pair, what are the dividend and divisor?
- Value *getDividend() const { return DivInst->getOperand(0); }
- Value *getDivisor() const { return DivInst->getOperand(1); }
-
- bool isRemExpanded() const {
- switch (RemInst->getOpcode()) {
- case Instruction::SRem:
- case Instruction::URem:
- return false; // single 'rem' instruction - unexpanded form.
- default:
- return true; // anything else means we have remainder in expanded form.
- }
- }
-};
-} // namespace
-using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
-
-/// Find matching pairs of integer div/rem ops (they have the same numerator,
-/// denominator, and signedness). Place those pairs into a worklist for further
-/// processing. This indirection is needed because we have to use TrackingVH<>
-/// because we will be doing RAUW, and if one of the rem instructions we change
-/// happens to be an input to another div/rem in the maps, we'd have problems.
-static DivRemWorklistTy getWorklist(Function &F) {
- // Insert all divide and remainder instructions into maps keyed by their
- // operands and opcode (signed or unsigned).
- DenseMap<DivRemMapKey, Instruction *> DivMap;
- // Use a MapVector for RemMap so that instructions are moved/inserted in a
- // deterministic order.
- MapVector<DivRemMapKey, Instruction *> RemMap;
- for (auto &BB : F) {
- for (auto &I : BB) {
- if (I.getOpcode() == Instruction::SDiv)
- DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
- else if (I.getOpcode() == Instruction::UDiv)
- DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
- else if (I.getOpcode() == Instruction::SRem)
- RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
- else if (I.getOpcode() == Instruction::URem)
- RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
- else if (auto Match = matchExpandedRem(I))
- RemMap[Match->Key] = Match->Value;
- }
- }
-
- // We'll accumulate the matching pairs of div-rem instructions here.
- DivRemWorklistTy Worklist;
-
- // We can iterate over either map because we are only looking for matched
- // pairs. Choose remainders for efficiency because they are usually even more
- // rare than division.
- for (auto &RemPair : RemMap) {
- // Find the matching division instruction from the division map.
+//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists and/or decomposes/recomposes integer division and remainder
+// instructions to enable CFG improvements and better codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "div-rem-pairs"
+STATISTIC(NumPairs, "Number of div/rem pairs");
+STATISTIC(NumRecomposed, "Number of instructions recomposed");
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumDecomposed, "Number of instructions decomposed");
+DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
+ "Controls transformations in div-rem-pairs pass");
+
+namespace {
+struct ExpandedMatch {
+ DivRemMapKey Key;
+ Instruction *Value;
+};
+} // namespace
+
+/// See if we can match: (which is the form we expand into)
+/// X - ((X ?/ Y) * Y)
+/// which is equivalent to:
+/// X ?% Y
+static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) {
+ Value *Dividend, *XroundedDownToMultipleOfY;
+ if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY))))
+ return llvm::None;
+
+ Value *Divisor;
+ Instruction *Div;
+ // Look for ((X / Y) * Y)
+ if (!match(
+ XroundedDownToMultipleOfY,
+ m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)),
+ m_Instruction(Div)),
+ m_Deferred(Divisor))))
+ return llvm::None;
+
+ ExpandedMatch M;
+ M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv;
+ M.Key.Dividend = Dividend;
+ M.Key.Divisor = Divisor;
+ M.Value = &I;
+ return M;
+}
+
+namespace {
+/// A thin wrapper to store two values that we matched as div-rem pair.
+/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
+struct DivRemPairWorklistEntry {
+ /// The actual udiv/sdiv instruction. Source of truth.
+ AssertingVH<Instruction> DivInst;
+
+ /// The instruction that we have matched as a remainder instruction.
+ /// Should only be used as Value, don't introspect it.
+ AssertingVH<Instruction> RemInst;
+
+ DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
+ : DivInst(DivInst_), RemInst(RemInst_) {
+ assert((DivInst->getOpcode() == Instruction::UDiv ||
+ DivInst->getOpcode() == Instruction::SDiv) &&
+ "Not a division.");
+ assert(DivInst->getType() == RemInst->getType() && "Types should match.");
+ // We can't check anything else about remainder instruction,
+ // it's not strictly required to be a urem/srem.
+ }
+
+ /// The type for this pair, identical for both the div and rem.
+ Type *getType() const { return DivInst->getType(); }
+
+ /// Is this pair signed or unsigned?
+ bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
+
+  /// In this pair, what are the dividend and divisor?
+ Value *getDividend() const { return DivInst->getOperand(0); }
+ Value *getDivisor() const { return DivInst->getOperand(1); }
+
+ bool isRemExpanded() const {
+ switch (RemInst->getOpcode()) {
+ case Instruction::SRem:
+ case Instruction::URem:
+ return false; // single 'rem' instruction - unexpanded form.
+ default:
+ return true; // anything else means we have remainder in expanded form.
+ }
+ }
+};
+} // namespace
+using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). Place those pairs into a worklist for further
+/// processing. This indirection is needed because we have to use TrackingVH<>
+/// because we will be doing RAUW, and if one of the rem instructions we change
+/// happens to be an input to another div/rem in the maps, we'd have problems.
+static DivRemWorklistTy getWorklist(Function &F) {
+ // Insert all divide and remainder instructions into maps keyed by their
+ // operands and opcode (signed or unsigned).
+ DenseMap<DivRemMapKey, Instruction *> DivMap;
+ // Use a MapVector for RemMap so that instructions are moved/inserted in a
+ // deterministic order.
+ MapVector<DivRemMapKey, Instruction *> RemMap;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (I.getOpcode() == Instruction::SDiv)
+ DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::UDiv)
+ DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::SRem)
+ RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::URem)
+ RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (auto Match = matchExpandedRem(I))
+ RemMap[Match->Key] = Match->Value;
+ }
+ }
+
+ // We'll accumulate the matching pairs of div-rem instructions here.
+ DivRemWorklistTy Worklist;
+
+ // We can iterate over either map because we are only looking for matched
+ // pairs. Choose remainders for efficiency because they are usually even more
+ // rare than division.
+ for (auto &RemPair : RemMap) {
+ // Find the matching division instruction from the division map.
auto It = DivMap.find(RemPair.first);
if (It == DivMap.end())
- continue;
-
- // We have a matching pair of div/rem instructions.
- NumPairs++;
- Instruction *RemInst = RemPair.second;
-
- // Place it in the worklist.
+ continue;
+
+ // We have a matching pair of div/rem instructions.
+ NumPairs++;
+ Instruction *RemInst = RemPair.second;
+
+ // Place it in the worklist.
Worklist.emplace_back(It->second, RemInst);
- }
-
- return Worklist;
-}
-
-/// Find matching pairs of integer div/rem ops (they have the same numerator,
-/// denominator, and signedness). If they exist in different basic blocks, bring
-/// them together by hoisting or replace the common division operation that is
-/// implicit in the remainder:
-/// X % Y <--> X - ((X / Y) * Y).
-///
-/// We can largely ignore the normal safety and cost constraints on speculation
-/// of these ops when we find a matching pair. This is because we are already
-/// guaranteed that any exceptions and most cost are already incurred by the
-/// first member of the pair.
-///
-/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
-/// SimplifyCFG, but it's split off on its own because it's different enough
-/// that it doesn't quite match the stated objectives of those passes.
-static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
- const DominatorTree &DT) {
- bool Changed = false;
-
- // Get the matching pairs of div-rem instructions. We want this extra
- // indirection to avoid dealing with having to RAUW the keys of the maps.
- DivRemWorklistTy Worklist = getWorklist(F);
-
- // Process each entry in the worklist.
- for (DivRemPairWorklistEntry &E : Worklist) {
- if (!DebugCounter::shouldExecute(DRPCounter))
- continue;
-
- bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
-
- auto &DivInst = E.DivInst;
- auto &RemInst = E.RemInst;
-
- const bool RemOriginallyWasInExpandedForm = E.isRemExpanded();
- (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning
-
- if (HasDivRemOp && E.isRemExpanded()) {
- // The target supports div+rem but the rem is expanded.
- // We should recompose it first.
- Value *X = E.getDividend();
- Value *Y = E.getDivisor();
- Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y)
- : BinaryOperator::CreateURem(X, Y);
- // Note that we place it right next to the original expanded instruction,
-      // and let later handling move it if needed.
- RealRem->setName(RemInst->getName() + ".recomposed");
- RealRem->insertAfter(RemInst);
- Instruction *OrigRemInst = RemInst;
- // Update AssertingVH<> with new instruction so it doesn't assert.
- RemInst = RealRem;
- // And replace the original instruction with the new one.
- OrigRemInst->replaceAllUsesWith(RealRem);
- OrigRemInst->eraseFromParent();
- NumRecomposed++;
- // Note that we have left ((X / Y) * Y) around.
- // If it had other uses we could rewrite it as X - X % Y
- Changed = true;
- }
-
- assert((!E.isRemExpanded() || !HasDivRemOp) &&
- "*If* the target supports div-rem, then by now the RemInst *is* "
- "Instruction::[US]Rem.");
-
- // If the target supports div+rem and the instructions are in the same block
- // already, there's nothing to do. The backend should handle this. If the
- // target does not support div+rem, then we will decompose the rem.
- if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
- continue;
-
- bool DivDominates = DT.dominates(DivInst, RemInst);
- if (!DivDominates && !DT.dominates(RemInst, DivInst)) {
-      // We have a matching div-rem pair, but the instructions are in two
-      // different blocks, neither of which dominates the other.
- // FIXME: We could hoist both ops to the common predecessor block?
- continue;
- }
-
- // The target does not have a single div/rem operation,
- // and the rem is already in expanded form. Nothing to do.
- if (!HasDivRemOp && E.isRemExpanded())
- continue;
-
- if (HasDivRemOp) {
- // The target has a single div/rem operation. Hoist the lower instruction
- // to make the matched pair visible to the backend.
- if (DivDominates)
- RemInst->moveAfter(DivInst);
- else
- DivInst->moveAfter(RemInst);
- NumHoisted++;
- } else {
- // The target does not have a single div/rem operation,
-      // and the rem is *not* in an already-expanded form.
- // Decompose the remainder calculation as:
- // X % Y --> X - ((X / Y) * Y).
-
- assert(!RemOriginallyWasInExpandedForm &&
- "We should not be expanding if the rem was in expanded form to "
- "begin with.");
-
- Value *X = E.getDividend();
- Value *Y = E.getDivisor();
- Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
- Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
-
- // If the remainder dominates, then hoist the division up to that block:
- //
- // bb1:
- // %rem = srem %x, %y
- // bb2:
- // %div = sdiv %x, %y
- // -->
- // bb1:
- // %div = sdiv %x, %y
- // %mul = mul %div, %y
- // %rem = sub %x, %mul
- //
- // If the division dominates, it's already in the right place. The mul+sub
- // will be in a different block because we don't assume that they are
- // cheap to speculatively execute:
- //
- // bb1:
- // %div = sdiv %x, %y
- // bb2:
- // %rem = srem %x, %y
- // -->
- // bb1:
- // %div = sdiv %x, %y
- // bb2:
- // %mul = mul %div, %y
- // %rem = sub %x, %mul
- //
- // If the div and rem are in the same block, we do the same transform,
- // but any code movement would be within the same block.
-
- if (!DivDominates)
- DivInst->moveBefore(RemInst);
- Mul->insertAfter(RemInst);
- Sub->insertAfter(Mul);
-
- // If X can be undef, X should be frozen first.
- // For example, let's assume that Y = 1 & X = undef:
- // %div = sdiv undef, 1 // %div = undef
- // %rem = srem undef, 1 // %rem = 0
- // =>
- // %div = sdiv undef, 1 // %div = undef
- // %mul = mul %div, 1 // %mul = undef
- // %rem = sub %x, %mul // %rem = undef - undef = undef
- // If X is not frozen, %rem becomes undef after transformation.
-      // TODO: We need an undef-specific checking function in ValueTracking
+ }
+
+ return Worklist;
+}
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). If they exist in different basic blocks, bring
+/// them together by hoisting or replace the common division operation that is
+/// implicit in the remainder:
+/// X % Y <--> X - ((X / Y) * Y).
+///
+/// We can largely ignore the normal safety and cost constraints on speculation
+/// of these ops when we find a matching pair. This is because we are already
+/// guaranteed that any exceptions and most cost are already incurred by the
+/// first member of the pair.
+///
+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
+/// SimplifyCFG, but it's split off on its own because it's different enough
+/// that it doesn't quite match the stated objectives of those passes.
+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT) {
+ bool Changed = false;
+
+ // Get the matching pairs of div-rem instructions. We want this extra
+ // indirection to avoid dealing with having to RAUW the keys of the maps.
+ DivRemWorklistTy Worklist = getWorklist(F);
+
+ // Process each entry in the worklist.
+ for (DivRemPairWorklistEntry &E : Worklist) {
+ if (!DebugCounter::shouldExecute(DRPCounter))
+ continue;
+
+ bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
+
+ auto &DivInst = E.DivInst;
+ auto &RemInst = E.RemInst;
+
+ const bool RemOriginallyWasInExpandedForm = E.isRemExpanded();
+ (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning
+
+ if (HasDivRemOp && E.isRemExpanded()) {
+ // The target supports div+rem but the rem is expanded.
+ // We should recompose it first.
+ Value *X = E.getDividend();
+ Value *Y = E.getDivisor();
+ Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y)
+ : BinaryOperator::CreateURem(X, Y);
+ // Note that we place it right next to the original expanded instruction,
+      // and let later handling move it if needed.
+ RealRem->setName(RemInst->getName() + ".recomposed");
+ RealRem->insertAfter(RemInst);
+ Instruction *OrigRemInst = RemInst;
+ // Update AssertingVH<> with new instruction so it doesn't assert.
+ RemInst = RealRem;
+ // And replace the original instruction with the new one.
+ OrigRemInst->replaceAllUsesWith(RealRem);
+ OrigRemInst->eraseFromParent();
+ NumRecomposed++;
+ // Note that we have left ((X / Y) * Y) around.
+ // If it had other uses we could rewrite it as X - X % Y
+ Changed = true;
+ }
+
+ assert((!E.isRemExpanded() || !HasDivRemOp) &&
+ "*If* the target supports div-rem, then by now the RemInst *is* "
+ "Instruction::[US]Rem.");
+
+ // If the target supports div+rem and the instructions are in the same block
+ // already, there's nothing to do. The backend should handle this. If the
+ // target does not support div+rem, then we will decompose the rem.
+ if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
+ continue;
+
+ bool DivDominates = DT.dominates(DivInst, RemInst);
+ if (!DivDominates && !DT.dominates(RemInst, DivInst)) {
+      // We have a matching div-rem pair, but the instructions are in two
+      // different blocks, neither of which dominates the other.
+ // FIXME: We could hoist both ops to the common predecessor block?
+ continue;
+ }
+
+ // The target does not have a single div/rem operation,
+ // and the rem is already in expanded form. Nothing to do.
+ if (!HasDivRemOp && E.isRemExpanded())
+ continue;
+
+ if (HasDivRemOp) {
+ // The target has a single div/rem operation. Hoist the lower instruction
+ // to make the matched pair visible to the backend.
+ if (DivDominates)
+ RemInst->moveAfter(DivInst);
+ else
+ DivInst->moveAfter(RemInst);
+ NumHoisted++;
+ } else {
+ // The target does not have a single div/rem operation,
+      // and the rem is *not* in an already-expanded form.
+ // Decompose the remainder calculation as:
+ // X % Y --> X - ((X / Y) * Y).
+
+ assert(!RemOriginallyWasInExpandedForm &&
+ "We should not be expanding if the rem was in expanded form to "
+ "begin with.");
+
+ Value *X = E.getDividend();
+ Value *Y = E.getDivisor();
+ Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
+ Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
+
+ // If the remainder dominates, then hoist the division up to that block:
+ //
+ // bb1:
+ // %rem = srem %x, %y
+ // bb2:
+ // %div = sdiv %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the division dominates, it's already in the right place. The mul+sub
+ // will be in a different block because we don't assume that they are
+ // cheap to speculatively execute:
+ //
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %rem = srem %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the div and rem are in the same block, we do the same transform,
+ // but any code movement would be within the same block.
+
+ if (!DivDominates)
+ DivInst->moveBefore(RemInst);
+ Mul->insertAfter(RemInst);
+ Sub->insertAfter(Mul);
+
+ // If X can be undef, X should be frozen first.
+ // For example, let's assume that Y = 1 & X = undef:
+ // %div = sdiv undef, 1 // %div = undef
+ // %rem = srem undef, 1 // %rem = 0
+ // =>
+ // %div = sdiv undef, 1 // %div = undef
+ // %mul = mul %div, 1 // %mul = undef
+ // %rem = sub %x, %mul // %rem = undef - undef = undef
+ // If X is not frozen, %rem becomes undef after transformation.
+      // TODO: We need an undef-specific checking function in ValueTracking
if (!isGuaranteedNotToBeUndefOrPoison(X, nullptr, DivInst, &DT)) {
- auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst);
- DivInst->setOperand(0, FrX);
- Sub->setOperand(0, FrX);
- }
- // Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0,
- // but %rem in tgt can be one of many integer values.
+ auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst);
+ DivInst->setOperand(0, FrX);
+ Sub->setOperand(0, FrX);
+ }
+ // Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0,
+ // but %rem in tgt can be one of many integer values.
if (!isGuaranteedNotToBeUndefOrPoison(Y, nullptr, DivInst, &DT)) {
- auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst);
- DivInst->setOperand(1, FrY);
- Mul->setOperand(1, FrY);
- }
-
- // Now kill the explicit remainder. We have replaced it with:
-      //   (sub X, (mul (div X, Y), Y))
- Sub->setName(RemInst->getName() + ".decomposed");
- Instruction *OrigRemInst = RemInst;
- // Update AssertingVH<> with new instruction so it doesn't assert.
- RemInst = Sub;
- // And replace the original instruction with the new one.
- OrigRemInst->replaceAllUsesWith(Sub);
- OrigRemInst->eraseFromParent();
- NumDecomposed++;
- }
- Changed = true;
- }
-
- return Changed;
-}
-
-// Pass manager boilerplate below here.
-
-namespace {
-struct DivRemPairsLegacyPass : public FunctionPass {
- static char ID;
- DivRemPairsLegacyPass() : FunctionPass(ID) {
- initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return optimizeDivRem(F, TTI, DT);
- }
-};
-} // namespace
-
-char DivRemPairsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
- "Hoist/decompose integer division and remainder", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
- "Hoist/decompose integer division and remainder", false,
- false)
-FunctionPass *llvm::createDivRemPairsPass() {
- return new DivRemPairsLegacyPass();
-}
-
-PreservedAnalyses DivRemPairsPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- if (!optimizeDivRem(F, TTI, DT))
- return PreservedAnalyses::all();
- // TODO: This pass just hoists/replaces math ops - all analyses are preserved?
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+ auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst);
+ DivInst->setOperand(1, FrY);
+ Mul->setOperand(1, FrY);
+ }
+
+ // Now kill the explicit remainder. We have replaced it with:
+      //   (sub X, (mul (div X, Y), Y))
+ Sub->setName(RemInst->getName() + ".decomposed");
+ Instruction *OrigRemInst = RemInst;
+ // Update AssertingVH<> with new instruction so it doesn't assert.
+ RemInst = Sub;
+ // And replace the original instruction with the new one.
+ OrigRemInst->replaceAllUsesWith(Sub);
+ OrigRemInst->eraseFromParent();
+ NumDecomposed++;
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// Pass manager boilerplate below here.
+
+namespace {
+struct DivRemPairsLegacyPass : public FunctionPass {
+ static char ID;
+ DivRemPairsLegacyPass() : FunctionPass(ID) {
+ initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return optimizeDivRem(F, TTI, DT);
+ }
+};
+} // namespace
+
+char DivRemPairsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+FunctionPass *llvm::createDivRemPairsPass() {
+ return new DivRemPairsLegacyPass();
+}
+
+PreservedAnalyses DivRemPairsPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ if (!optimizeDivRem(F, TTI, DT))
+ return PreservedAnalyses::all();
+ // TODO: This pass just hoists/replaces math ops - all analyses are preserved?
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
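
To make the decomposition path above concrete, here is a short IRBuilder sketch (illustrative, not taken from the patch) that emits the same mul/sub pattern the pass creates when the target has no combined div/rem instruction; decomposeURem, B, X and Y are assumed names, and the freeze of possibly-undef operands is omitted for brevity.

#include "llvm/IR/IRBuilder.h"

// Builds X - ((X udiv Y) * Y); in the pass itself the udiv already exists and
// is reused rather than re-created.
llvm::Value *decomposeURem(llvm::IRBuilder<> &B, llvm::Value *X, llvm::Value *Y) {
  llvm::Value *Div = B.CreateUDiv(X, Y, "div");   // %div = udiv %x, %y
  llvm::Value *Mul = B.CreateMul(Div, Y, "mul");  // %mul = mul %div, %y
  return B.CreateSub(X, Mul, "rem.decomposed");   // %rem = sub %x, %mul
}
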
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp
index 07a84445eb..180a82917f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -1,274 +1,274 @@
-//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs a simple dominator tree walk that eliminates trivially
-// redundant instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/EarlyCSE.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/ScopedHashTable.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <deque>
-#include <memory>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "early-cse"
-
-STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
-STATISTIC(NumCSE, "Number of instructions CSE'd");
-STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
-STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
-STATISTIC(NumCSECall, "Number of call instructions CSE'd");
-STATISTIC(NumDSE, "Number of trivial dead stores removed");
-
-DEBUG_COUNTER(CSECounter, "early-cse",
- "Controls which instructions are removed");
-
-static cl::opt<unsigned> EarlyCSEMssaOptCap(
- "earlycse-mssa-optimization-cap", cl::init(500), cl::Hidden,
- cl::desc("Enable imprecision in EarlyCSE in pathological cases, in exchange "
- "for faster compile. Caps the MemorySSA clobbering calls."));
-
-static cl::opt<bool> EarlyCSEDebugHash(
- "earlycse-debug-hash", cl::init(false), cl::Hidden,
- cl::desc("Perform extra assertion checking to verify that SimpleValue's hash "
- "function is well-behaved w.r.t. its isEqual predicate"));
-
-//===----------------------------------------------------------------------===//
-// SimpleValue
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// Struct representing the available values in the scoped hash table.
-struct SimpleValue {
- Instruction *Inst;
-
- SimpleValue(Instruction *I) : Inst(I) {
- assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
- }
-
- bool isSentinel() const {
- return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
- Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static bool canHandle(Instruction *Inst) {
- // This can only handle non-void readnone functions.
- if (CallInst *CI = dyn_cast<CallInst>(Inst))
- return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
- return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
- isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
- isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
- isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
- isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
- }
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct DenseMapInfo<SimpleValue> {
- static inline SimpleValue getEmptyKey() {
- return DenseMapInfo<Instruction *>::getEmptyKey();
- }
-
- static inline SimpleValue getTombstoneKey() {
- return DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(SimpleValue Val);
- static bool isEqual(SimpleValue LHS, SimpleValue RHS);
-};
-
-} // end namespace llvm
-
-/// Match a 'select' including an optional 'not' of the condition.
-static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
- Value *&B,
- SelectPatternFlavor &Flavor) {
- // Return false if V is not even a select.
- if (!match(V, m_Select(m_Value(Cond), m_Value(A), m_Value(B))))
- return false;
-
- // Look through a 'not' of the condition operand by swapping A/B.
- Value *CondNot;
- if (match(Cond, m_Not(m_Value(CondNot)))) {
- Cond = CondNot;
- std::swap(A, B);
- }
-
+//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a simple dominator tree walk that eliminates trivially
+// redundant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <deque>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "early-cse"
+
+STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
+STATISTIC(NumCSE, "Number of instructions CSE'd");
+STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
+STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
+STATISTIC(NumCSECall, "Number of call instructions CSE'd");
+STATISTIC(NumDSE, "Number of trivial dead stores removed");
+
+DEBUG_COUNTER(CSECounter, "early-cse",
+ "Controls which instructions are removed");
+
+static cl::opt<unsigned> EarlyCSEMssaOptCap(
+ "earlycse-mssa-optimization-cap", cl::init(500), cl::Hidden,
+ cl::desc("Enable imprecision in EarlyCSE in pathological cases, in exchange "
+ "for faster compile. Caps the MemorySSA clobbering calls."));
+
+static cl::opt<bool> EarlyCSEDebugHash(
+ "earlycse-debug-hash", cl::init(false), cl::Hidden,
+ cl::desc("Perform extra assertion checking to verify that SimpleValue's hash "
+ "function is well-behaved w.r.t. its isEqual predicate"));
+
+//===----------------------------------------------------------------------===//
+// SimpleValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Struct representing the available values in the scoped hash table.
+struct SimpleValue {
+ Instruction *Inst;
+
+ SimpleValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // This can only handle non-void readnone functions.
+ if (CallInst *CI = dyn_cast<CallInst>(Inst))
+ return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+ return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
+ isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
+ isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
+ isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<SimpleValue> {
+ static inline SimpleValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline SimpleValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(SimpleValue Val);
+ static bool isEqual(SimpleValue LHS, SimpleValue RHS);
+};
+
+} // end namespace llvm
+
+/// Match a 'select' including an optional 'not' of the condition.
+static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
+ Value *&B,
+ SelectPatternFlavor &Flavor) {
+ // Return false if V is not even a select.
+ if (!match(V, m_Select(m_Value(Cond), m_Value(A), m_Value(B))))
+ return false;
+
+ // Look through a 'not' of the condition operand by swapping A/B.
+ Value *CondNot;
+ if (match(Cond, m_Not(m_Value(CondNot)))) {
+ Cond = CondNot;
+ std::swap(A, B);
+ }
+
// Match canonical forms of min/max. We are not using ValueTracking's
- // more powerful matchSelectPattern() because it may rely on instruction flags
- // such as "nsw". That would be incompatible with the current hashing
- // mechanism that may remove flags to increase the likelihood of CSE.
-
- Flavor = SPF_UNKNOWN;
- CmpInst::Predicate Pred;
-
- if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
- // Check for commuted variants of min/max by swapping predicate.
- // If we do not match the standard or commuted patterns, this is not a
- // recognized form of min/max, but it is still a select, so return true.
- if (!match(Cond, m_ICmp(Pred, m_Specific(B), m_Specific(A))))
- return true;
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- switch (Pred) {
- case CmpInst::ICMP_UGT: Flavor = SPF_UMAX; break;
- case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break;
- case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break;
- case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break;
+ // more powerful matchSelectPattern() because it may rely on instruction flags
+ // such as "nsw". That would be incompatible with the current hashing
+ // mechanism that may remove flags to increase the likelihood of CSE.
+
+ Flavor = SPF_UNKNOWN;
+ CmpInst::Predicate Pred;
+
+ if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
+ // Check for commuted variants of min/max by swapping predicate.
+ // If we do not match the standard or commuted patterns, this is not a
+ // recognized form of min/max, but it is still a select, so return true.
+ if (!match(Cond, m_ICmp(Pred, m_Specific(B), m_Specific(A))))
+ return true;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ switch (Pred) {
+ case CmpInst::ICMP_UGT: Flavor = SPF_UMAX; break;
+ case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break;
+ case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break;
+ case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break;
// Non-strict inequalities.
case CmpInst::ICMP_ULE: Flavor = SPF_UMIN; break;
case CmpInst::ICMP_UGE: Flavor = SPF_UMAX; break;
case CmpInst::ICMP_SLE: Flavor = SPF_SMIN; break;
case CmpInst::ICMP_SGE: Flavor = SPF_SMAX; break;
- default: break;
- }
-
- return true;
-}
-
-static unsigned getHashValueImpl(SimpleValue Val) {
- Instruction *Inst = Val.Inst;
- // Hash in all of the operands as pointers.
- if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
- Value *LHS = BinOp->getOperand(0);
- Value *RHS = BinOp->getOperand(1);
- if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
- std::swap(LHS, RHS);
-
- return hash_combine(BinOp->getOpcode(), LHS, RHS);
- }
-
- if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
- // Compares can be commuted by swapping the comparands and
- // updating the predicate. Choose the form that has the
- // comparands in sorted order, or in the case of a tie, the
- // one with the lower predicate.
- Value *LHS = CI->getOperand(0);
- Value *RHS = CI->getOperand(1);
- CmpInst::Predicate Pred = CI->getPredicate();
- CmpInst::Predicate SwappedPred = CI->getSwappedPredicate();
- if (std::tie(LHS, Pred) > std::tie(RHS, SwappedPred)) {
- std::swap(LHS, RHS);
- Pred = SwappedPred;
- }
- return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
- }
-
- // Hash general selects to allow matching commuted true/false operands.
- SelectPatternFlavor SPF;
- Value *Cond, *A, *B;
- if (matchSelectWithOptionalNotCond(Inst, Cond, A, B, SPF)) {
+ default: break;
+ }
+
+ return true;
+}
+
+static unsigned getHashValueImpl(SimpleValue Val) {
+ Instruction *Inst = Val.Inst;
+ // Hash in all of the operands as pointers.
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
+ std::swap(LHS, RHS);
+
+ return hash_combine(BinOp->getOpcode(), LHS, RHS);
+ }
+
+ if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+ // Compares can be commuted by swapping the comparands and
+ // updating the predicate. Choose the form that has the
+ // comparands in sorted order, or in the case of a tie, the
+ // one with the lower predicate.
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ CmpInst::Predicate Pred = CI->getPredicate();
+ CmpInst::Predicate SwappedPred = CI->getSwappedPredicate();
+ if (std::tie(LHS, Pred) > std::tie(RHS, SwappedPred)) {
+ std::swap(LHS, RHS);
+ Pred = SwappedPred;
+ }
+ return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+ }
+
+ // Hash general selects to allow matching commuted true/false operands.
+ SelectPatternFlavor SPF;
+ Value *Cond, *A, *B;
+ if (matchSelectWithOptionalNotCond(Inst, Cond, A, B, SPF)) {
// Hash min/max (cmp + select) to allow for commuted operands.
- // Min/max may also have non-canonical compare predicate (eg, the compare for
- // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the
- // compare.
- // TODO: We should also detect FP min/max.
- if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
- SPF == SPF_UMIN || SPF == SPF_UMAX) {
- if (A > B)
- std::swap(A, B);
- return hash_combine(Inst->getOpcode(), SPF, A, B);
- }
-
- // Hash general selects to allow matching commuted true/false operands.
-
- // If we do not have a compare as the condition, just hash in the condition.
- CmpInst::Predicate Pred;
- Value *X, *Y;
- if (!match(Cond, m_Cmp(Pred, m_Value(X), m_Value(Y))))
- return hash_combine(Inst->getOpcode(), Cond, A, B);
-
- // Similar to cmp normalization (above) - canonicalize the predicate value:
- // select (icmp Pred, X, Y), A, B --> select (icmp InvPred, X, Y), B, A
- if (CmpInst::getInversePredicate(Pred) < Pred) {
- Pred = CmpInst::getInversePredicate(Pred);
- std::swap(A, B);
- }
- return hash_combine(Inst->getOpcode(), Pred, X, Y, A, B);
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(Inst))
- return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
-
- if (FreezeInst *FI = dyn_cast<FreezeInst>(Inst))
- return hash_combine(FI->getOpcode(), FI->getOperand(0));
-
- if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
- return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
- hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
-
- if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
- return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
- IVI->getOperand(1),
- hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
-
- assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
- isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
- isa<FreezeInst>(Inst)) &&
- "Invalid/unknown instruction");
-
+ // Min/max may also have non-canonical compare predicate (eg, the compare for
+ // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the
+ // compare.
+ // TODO: We should also detect FP min/max.
+ if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
+ SPF == SPF_UMIN || SPF == SPF_UMAX) {
+ if (A > B)
+ std::swap(A, B);
+ return hash_combine(Inst->getOpcode(), SPF, A, B);
+ }
+
+ // Hash general selects to allow matching commuted true/false operands.
+
+ // If we do not have a compare as the condition, just hash in the condition.
+ CmpInst::Predicate Pred;
+ Value *X, *Y;
+ if (!match(Cond, m_Cmp(Pred, m_Value(X), m_Value(Y))))
+ return hash_combine(Inst->getOpcode(), Cond, A, B);
+
+ // Similar to cmp normalization (above) - canonicalize the predicate value:
+ // select (icmp Pred, X, Y), A, B --> select (icmp InvPred, X, Y), B, A
+ if (CmpInst::getInversePredicate(Pred) < Pred) {
+ Pred = CmpInst::getInversePredicate(Pred);
+ std::swap(A, B);
+ }
+ return hash_combine(Inst->getOpcode(), Pred, X, Y, A, B);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(Inst))
+ return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+ if (FreezeInst *FI = dyn_cast<FreezeInst>(Inst))
+ return hash_combine(FI->getOpcode(), FI->getOperand(0));
+
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
+ return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+ hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
+ return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+ IVI->getOperand(1),
+ hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+ assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
+ isa<FreezeInst>(Inst)) &&
+ "Invalid/unknown instruction");
+
// Handle intrinsics with commutative operands.
// TODO: Extend this to handle intrinsics with >2 operands where the 1st
// 2 operands are commutative.
@@ -280,58 +280,58 @@ static unsigned getHashValueImpl(SimpleValue Val) {
return hash_combine(II->getOpcode(), LHS, RHS);
}
- // Mix in the opcode.
- return hash_combine(
- Inst->getOpcode(),
- hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
-}
-
-unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
-#ifndef NDEBUG
- // If -earlycse-debug-hash was specified, return a constant -- this
- // will force all hashing to collide, so we'll exhaustively search
- // the table for a match, and the assertion in isEqual will fire if
- // there's a bug causing equal keys to hash differently.
- if (EarlyCSEDebugHash)
- return 0;
-#endif
- return getHashValueImpl(Val);
-}
-
-static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
- Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
-
- if (LHS.isSentinel() || RHS.isSentinel())
- return LHSI == RHSI;
-
- if (LHSI->getOpcode() != RHSI->getOpcode())
- return false;
- if (LHSI->isIdenticalToWhenDefined(RHSI))
- return true;
-
- // If we're not strictly identical, we still might be a commutable instruction
- if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
- if (!LHSBinOp->isCommutative())
- return false;
-
- assert(isa<BinaryOperator>(RHSI) &&
- "same opcode, but different instruction type?");
- BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
-
- // Commuted equality
- return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
- LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
- }
- if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
- assert(isa<CmpInst>(RHSI) &&
- "same opcode, but different instruction type?");
- CmpInst *RHSCmp = cast<CmpInst>(RHSI);
- // Commuted equality
- return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
- LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
- LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
- }
-
+ // Mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
+#ifndef NDEBUG
+ // If -earlycse-debug-hash was specified, return a constant -- this
+ // will force all hashing to collide, so we'll exhaustively search
+ // the table for a match, and the assertion in isEqual will fire if
+ // there's a bug causing equal keys to hash differently.
+ if (EarlyCSEDebugHash)
+ return 0;
+#endif
+ return getHashValueImpl(Val);
+}
+
+static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+
+ if (LHSI->getOpcode() != RHSI->getOpcode())
+ return false;
+ if (LHSI->isIdenticalToWhenDefined(RHSI))
+ return true;
+
+ // If we're not strictly identical, we still might be a commutable instruction
+ if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+ if (!LHSBinOp->isCommutative())
+ return false;
+
+ assert(isa<BinaryOperator>(RHSI) &&
+ "same opcode, but different instruction type?");
+ BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+ // Commuted equality
+ return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+ LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+ }
+ if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+ assert(isa<CmpInst>(RHSI) &&
+ "same opcode, but different instruction type?");
+ CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+ // Commuted equality
+ return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+ LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+ LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+ }
+
// TODO: Extend this for >2 args by matching the trailing N-2 args.
auto *LII = dyn_cast<IntrinsicInst>(LHSI);
auto *RII = dyn_cast<IntrinsicInst>(RHSI);
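
The commuted-compare handling above is easiest to see on an example. The helper below is an illustrative sketch (not from the patch) that mirrors the cmp branch of getHashValueImpl: it picks one canonical orientation so that, e.g., %c1 = icmp slt i32 %a, %b and %c2 = icmp sgt i32 %b, %a produce the same hash key, while isEqualImpl then confirms the match via getSwappedPredicate().

#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Instructions.h"
#include <tuple>
#include <utility>

// Canonicalize (operands, predicate) so commuted compares hash alike.
static llvm::hash_code hashCanonicalCmp(llvm::CmpInst *CI) {
  llvm::Value *LHS = CI->getOperand(0);
  llvm::Value *RHS = CI->getOperand(1);
  llvm::CmpInst::Predicate Pred = CI->getPredicate();
  llvm::CmpInst::Predicate Swapped = CI->getSwappedPredicate();
  // Prefer the orientation with the smaller (operand, predicate) tuple so
  // both commuted forms of the same compare pick one representative.
  if (std::tie(LHS, Pred) > std::tie(RHS, Swapped)) {
    std::swap(LHS, RHS);
    Pred = Swapped;
  }
  return llvm::hash_combine(CI->getOpcode(), Pred, LHS, RHS);
}
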
@@ -342,326 +342,326 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
}
// Min/max can occur with commuted operands, non-canonical predicates,
- // and/or non-canonical operands.
- // Selects can be non-trivially equivalent via inverted conditions and swaps.
- SelectPatternFlavor LSPF, RSPF;
- Value *CondL, *CondR, *LHSA, *RHSA, *LHSB, *RHSB;
- if (matchSelectWithOptionalNotCond(LHSI, CondL, LHSA, LHSB, LSPF) &&
- matchSelectWithOptionalNotCond(RHSI, CondR, RHSA, RHSB, RSPF)) {
- if (LSPF == RSPF) {
- // TODO: We should also detect FP min/max.
- if (LSPF == SPF_SMIN || LSPF == SPF_SMAX ||
- LSPF == SPF_UMIN || LSPF == SPF_UMAX)
- return ((LHSA == RHSA && LHSB == RHSB) ||
- (LHSA == RHSB && LHSB == RHSA));
-
- // select Cond, A, B <--> select not(Cond), B, A
- if (CondL == CondR && LHSA == RHSA && LHSB == RHSB)
- return true;
- }
-
- // If the true/false operands are swapped and the conditions are compares
- // with inverted predicates, the selects are equal:
- // select (icmp Pred, X, Y), A, B <--> select (icmp InvPred, X, Y), B, A
- //
- // This also handles patterns with a double-negation in the sense of not +
- // inverse, because we looked through a 'not' in the matching function and
- // swapped A/B:
- // select (cmp Pred, X, Y), A, B <--> select (not (cmp InvPred, X, Y)), B, A
- //
- // This intentionally does NOT handle patterns with a double-negation in
- // the sense of not + not, because doing so could result in values
- // comparing
+ // and/or non-canonical operands.
+ // Selects can be non-trivially equivalent via inverted conditions and swaps.
+ SelectPatternFlavor LSPF, RSPF;
+ Value *CondL, *CondR, *LHSA, *RHSA, *LHSB, *RHSB;
+ if (matchSelectWithOptionalNotCond(LHSI, CondL, LHSA, LHSB, LSPF) &&
+ matchSelectWithOptionalNotCond(RHSI, CondR, RHSA, RHSB, RSPF)) {
+ if (LSPF == RSPF) {
+ // TODO: We should also detect FP min/max.
+ if (LSPF == SPF_SMIN || LSPF == SPF_SMAX ||
+ LSPF == SPF_UMIN || LSPF == SPF_UMAX)
+ return ((LHSA == RHSA && LHSB == RHSB) ||
+ (LHSA == RHSB && LHSB == RHSA));
+
+ // select Cond, A, B <--> select not(Cond), B, A
+ if (CondL == CondR && LHSA == RHSA && LHSB == RHSB)
+ return true;
+ }
+
+ // If the true/false operands are swapped and the conditions are compares
+ // with inverted predicates, the selects are equal:
+ // select (icmp Pred, X, Y), A, B <--> select (icmp InvPred, X, Y), B, A
+ //
+ // This also handles patterns with a double-negation in the sense of not +
+ // inverse, because we looked through a 'not' in the matching function and
+ // swapped A/B:
+ // select (cmp Pred, X, Y), A, B <--> select (not (cmp InvPred, X, Y)), B, A
+ //
+ // This intentionally does NOT handle patterns with a double-negation in
+ // the sense of not + not, because doing so could result in values
+ // comparing
// as equal that hash differently in the min/max cases like:
- // select (cmp slt, X, Y), X, Y <--> select (not (not (cmp slt, X, Y))), X, Y
- // ^ hashes as min ^ would not hash as min
- // In the context of the EarlyCSE pass, however, such cases never reach
- // this code, as we simplify the double-negation before hashing the second
- // select (and so still succeed at CSEing them).
- if (LHSA == RHSB && LHSB == RHSA) {
- CmpInst::Predicate PredL, PredR;
- Value *X, *Y;
- if (match(CondL, m_Cmp(PredL, m_Value(X), m_Value(Y))) &&
- match(CondR, m_Cmp(PredR, m_Specific(X), m_Specific(Y))) &&
- CmpInst::getInversePredicate(PredL) == PredR)
- return true;
- }
- }
-
- return false;
-}
-
-bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
- // These comparisons are nontrivial, so assert that equality implies
- // hash equality (DenseMap demands this as an invariant).
- bool Result = isEqualImpl(LHS, RHS);
- assert(!Result || (LHS.isSentinel() && LHS.Inst == RHS.Inst) ||
- getHashValueImpl(LHS) == getHashValueImpl(RHS));
- return Result;
-}
-
-//===----------------------------------------------------------------------===//
-// CallValue
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// Struct representing the available call values in the scoped hash
-/// table.
-struct CallValue {
- Instruction *Inst;
-
- CallValue(Instruction *I) : Inst(I) {
- assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
- }
-
- bool isSentinel() const {
- return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
- Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static bool canHandle(Instruction *Inst) {
- // Don't value number anything that returns void.
- if (Inst->getType()->isVoidTy())
- return false;
-
- CallInst *CI = dyn_cast<CallInst>(Inst);
- if (!CI || !CI->onlyReadsMemory())
- return false;
- return true;
- }
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct DenseMapInfo<CallValue> {
- static inline CallValue getEmptyKey() {
- return DenseMapInfo<Instruction *>::getEmptyKey();
- }
-
- static inline CallValue getTombstoneKey() {
- return DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(CallValue Val);
- static bool isEqual(CallValue LHS, CallValue RHS);
-};
-
-} // end namespace llvm
-
-unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
- Instruction *Inst = Val.Inst;
-
- // gc.relocate is 'special' call: its second and third operands are
- // not real values, but indices into statepoint's argument list.
- // Get values they point to.
- if (const GCRelocateInst *GCR = dyn_cast<GCRelocateInst>(Inst))
- return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
- GCR->getBasePtr(), GCR->getDerivedPtr());
-
- // Hash all of the operands as pointers and mix in the opcode.
- return hash_combine(
- Inst->getOpcode(),
- hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
-}
-
-bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
- Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
- if (LHS.isSentinel() || RHS.isSentinel())
- return LHSI == RHSI;
-
- // See comment above in `getHashValue()`.
- if (const GCRelocateInst *GCR1 = dyn_cast<GCRelocateInst>(LHSI))
- if (const GCRelocateInst *GCR2 = dyn_cast<GCRelocateInst>(RHSI))
- return GCR1->getOperand(0) == GCR2->getOperand(0) &&
- GCR1->getBasePtr() == GCR2->getBasePtr() &&
- GCR1->getDerivedPtr() == GCR2->getDerivedPtr();
-
- return LHSI->isIdenticalTo(RHSI);
-}
-
-//===----------------------------------------------------------------------===//
-// EarlyCSE implementation
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// A simple and fast domtree-based CSE pass.
-///
-/// This pass does a simple depth-first walk over the dominator tree,
-/// eliminating trivially redundant instructions and using instsimplify to
-/// canonicalize things as it goes. It is intended to be fast and catch obvious
-/// cases so that instcombine and other passes are more effective. It is
-/// expected that a later pass of GVN will catch the interesting/hard cases.
-class EarlyCSE {
-public:
- const TargetLibraryInfo &TLI;
- const TargetTransformInfo &TTI;
- DominatorTree &DT;
- AssumptionCache &AC;
- const SimplifyQuery SQ;
- MemorySSA *MSSA;
- std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
-
- using AllocatorTy =
- RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<SimpleValue, Value *>>;
- using ScopedHTType =
- ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
- AllocatorTy>;
-
- /// A scoped hash table of the current values of all of our simple
- /// scalar expressions.
- ///
- /// As we walk down the domtree, we look to see if instructions are in this:
- /// if so, we replace them with what we find, otherwise we insert them so
- /// that dominated values can succeed in their lookup.
- ScopedHTType AvailableValues;
-
- /// A scoped hash table of the current values of previously encountered
- /// memory locations.
- ///
- /// This allows us to get efficient access to dominating loads or stores when
- /// we have a fully redundant load. In addition to the most recent load, we
- /// keep track of a generation count of the read, which is compared against
- /// the current generation count. The current generation count is incremented
- /// after every possibly writing memory operation, which ensures that we only
- /// CSE loads with other loads that have no intervening store. Ordering
- /// events (such as fences or atomic instructions) increment the generation
- /// count as well; essentially, we model these as writes to all possible
- /// locations. Note that atomic and/or volatile loads and stores can be
- /// present the table; it is the responsibility of the consumer to inspect
- /// the atomicity/volatility if needed.
- struct LoadValue {
- Instruction *DefInst = nullptr;
- unsigned Generation = 0;
- int MatchingId = -1;
- bool IsAtomic = false;
-
- LoadValue() = default;
- LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
- bool IsAtomic)
- : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
- IsAtomic(IsAtomic) {}
- };
-
- using LoadMapAllocator =
- RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<Value *, LoadValue>>;
- using LoadHTType =
- ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
- LoadMapAllocator>;
-
- LoadHTType AvailableLoads;
-
- // A scoped hash table mapping memory locations (represented as typed
- // addresses) to generation numbers at which that memory location became
- // (henceforth indefinitely) invariant.
- using InvariantMapAllocator =
- RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<MemoryLocation, unsigned>>;
- using InvariantHTType =
- ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
- InvariantMapAllocator>;
- InvariantHTType AvailableInvariants;
-
- /// A scoped hash table of the current values of read-only call
- /// values.
- ///
- /// It uses the same generation count as loads.
- using CallHTType =
- ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
- CallHTType AvailableCalls;
-
- /// This is the current generation of the memory value.
- unsigned CurrentGeneration = 0;
-
- /// Set up the EarlyCSE runner for a particular function.
- EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
- const TargetTransformInfo &TTI, DominatorTree &DT,
- AssumptionCache &AC, MemorySSA *MSSA)
- : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
- MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
-
- bool run();
-
-private:
- unsigned ClobberCounter = 0;
- // Almost a POD, but needs to call the constructors for the scoped hash
- // tables so that a new scope gets pushed on. These are RAII so that the
- // scope gets popped when the NodeScope is destroyed.
- class NodeScope {
- public:
- NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
- NodeScope(const NodeScope &) = delete;
- NodeScope &operator=(const NodeScope &) = delete;
-
- private:
- ScopedHTType::ScopeTy Scope;
- LoadHTType::ScopeTy LoadScope;
- InvariantHTType::ScopeTy InvariantScope;
- CallHTType::ScopeTy CallScope;
- };
-
- // Contains all the needed information to create a stack for doing a depth
- // first traversal of the tree. This includes scopes for values, loads, and
- // calls as well as the generation. There is a child iterator so that the
- // children do not need to be store separately.
- class StackNode {
- public:
- StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
- unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
- DomTreeNode::const_iterator end)
- : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end),
- Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
- AvailableCalls)
- {}
- StackNode(const StackNode &) = delete;
- StackNode &operator=(const StackNode &) = delete;
-
- // Accessors.
+ // select (cmp slt, X, Y), X, Y <--> select (not (not (cmp slt, X, Y))), X, Y
+ // ^ hashes as min ^ would not hash as min
+ // In the context of the EarlyCSE pass, however, such cases never reach
+ // this code, as we simplify the double-negation before hashing the second
+ // select (and so still succeed at CSEing them).
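+  //
+  // Concrete instance of the swapped-operand / inverted-predicate case
+  // (illustrative; not part of the upstream comment):
+  //   select (icmp slt i32 %x, %y), i32 %a, i32 %b
+  //     <--> select (icmp sge i32 %x, %y), i32 %b, i32 %a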
+ if (LHSA == RHSB && LHSB == RHSA) {
+ CmpInst::Predicate PredL, PredR;
+ Value *X, *Y;
+ if (match(CondL, m_Cmp(PredL, m_Value(X), m_Value(Y))) &&
+ match(CondR, m_Cmp(PredR, m_Specific(X), m_Specific(Y))) &&
+ CmpInst::getInversePredicate(PredL) == PredR)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
+ // These comparisons are nontrivial, so assert that equality implies
+ // hash equality (DenseMap demands this as an invariant).
+ bool Result = isEqualImpl(LHS, RHS);
+ assert(!Result || (LHS.isSentinel() && LHS.Inst == RHS.Inst) ||
+ getHashValueImpl(LHS) == getHashValueImpl(RHS));
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// CallValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Struct representing the available call values in the scoped hash
+/// table.
+struct CallValue {
+ Instruction *Inst;
+
+ CallValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // Don't value number anything that returns void.
+ if (Inst->getType()->isVoidTy())
+ return false;
+
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI || !CI->onlyReadsMemory())
+ return false;
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<CallValue> {
+ static inline CallValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline CallValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(CallValue Val);
+ static bool isEqual(CallValue LHS, CallValue RHS);
+};
+
+} // end namespace llvm
+
+unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
+ Instruction *Inst = Val.Inst;
+
+  // gc.relocate is a 'special' call: its second and third operands are
+  // not real values, but indices into the statepoint's argument list.
+  // Get the values they point to.
+ if (const GCRelocateInst *GCR = dyn_cast<GCRelocateInst>(Inst))
+ return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
+ GCR->getBasePtr(), GCR->getDerivedPtr());
+
+ // Hash all of the operands as pointers and mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+
+ // See comment above in `getHashValue()`.
+ if (const GCRelocateInst *GCR1 = dyn_cast<GCRelocateInst>(LHSI))
+ if (const GCRelocateInst *GCR2 = dyn_cast<GCRelocateInst>(RHSI))
+ return GCR1->getOperand(0) == GCR2->getOperand(0) &&
+ GCR1->getBasePtr() == GCR2->getBasePtr() &&
+ GCR1->getDerivedPtr() == GCR2->getDerivedPtr();
+
+ return LHSI->isIdenticalTo(RHSI);
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyCSE implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
+public:
+ const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+ const SimplifyQuery SQ;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+
+ using AllocatorTy =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<SimpleValue, Value *>>;
+ using ScopedHTType =
+ ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
+ AllocatorTy>;
+
+ /// A scoped hash table of the current values of all of our simple
+ /// scalar expressions.
+ ///
+ /// As we walk down the domtree, we look to see if instructions are in this:
+ /// if so, we replace them with what we find, otherwise we insert them so
+ /// that dominated values can succeed in their lookup.
+ ScopedHTType AvailableValues;
+
+ /// A scoped hash table of the current values of previously encountered
+ /// memory locations.
+ ///
+ /// This allows us to get efficient access to dominating loads or stores when
+ /// we have a fully redundant load. In addition to the most recent load, we
+ /// keep track of a generation count of the read, which is compared against
+ /// the current generation count. The current generation count is incremented
+ /// after every possibly writing memory operation, which ensures that we only
+ /// CSE loads with other loads that have no intervening store. Ordering
+ /// events (such as fences or atomic instructions) increment the generation
+ /// count as well; essentially, we model these as writes to all possible
+ /// locations. Note that atomic and/or volatile loads and stores can be
+  /// present in the table; it is the responsibility of the consumer to inspect
+ /// the atomicity/volatility if needed.
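+  ///
+  /// A minimal sketch of the generation scheme (illustrative; the values and
+  /// names are made up):
+  ///   %a = load i32, i32* %p   ; recorded at generation G
+  ///   %b = load i32, i32* %p   ; still generation G -> CSE'd to %a
+  ///   store i32 0, i32* %q     ; possibly-writing op, generation becomes G+1
+  ///   %c = load i32, i32* %p   ; generation mismatch -> the simple check
+  ///                            ; alone no longer allows CSE (MemorySSA may)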
+ struct LoadValue {
+ Instruction *DefInst = nullptr;
+ unsigned Generation = 0;
+ int MatchingId = -1;
+ bool IsAtomic = false;
+
+ LoadValue() = default;
+ LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
+ bool IsAtomic)
+ : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic) {}
+ };
+
+ using LoadMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<Value *, LoadValue>>;
+ using LoadHTType =
+ ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+ LoadMapAllocator>;
+
+ LoadHTType AvailableLoads;
+
+ // A scoped hash table mapping memory locations (represented as typed
+ // addresses) to generation numbers at which that memory location became
+ // (henceforth indefinitely) invariant.
+ using InvariantMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MemoryLocation, unsigned>>;
+ using InvariantHTType =
+ ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+ InvariantMapAllocator>;
+ InvariantHTType AvailableInvariants;
+
+ /// A scoped hash table of the current values of read-only call
+ /// values.
+ ///
+ /// It uses the same generation count as loads.
+ using CallHTType =
+ ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
+ CallHTType AvailableCalls;
+
+ /// This is the current generation of the memory value.
+ unsigned CurrentGeneration = 0;
+
+ /// Set up the EarlyCSE runner for a particular function.
+ EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, DominatorTree &DT,
+ AssumptionCache &AC, MemorySSA *MSSA)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
+ MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
+
+ bool run();
+
+private:
+ unsigned ClobberCounter = 0;
+ // Almost a POD, but needs to call the constructors for the scoped hash
+ // tables so that a new scope gets pushed on. These are RAII so that the
+ // scope gets popped when the NodeScope is destroyed.
+ class NodeScope {
+ public:
+ NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
+ NodeScope(const NodeScope &) = delete;
+ NodeScope &operator=(const NodeScope &) = delete;
+
+ private:
+ ScopedHTType::ScopeTy Scope;
+ LoadHTType::ScopeTy LoadScope;
+ InvariantHTType::ScopeTy InvariantScope;
+ CallHTType::ScopeTy CallScope;
+ };
+
+ // Contains all the needed information to create a stack for doing a depth
+ // first traversal of the tree. This includes scopes for values, loads, and
+ // calls as well as the generation. There is a child iterator so that the
+  // children do not need to be stored separately.
+ class StackNode {
+ public:
+ StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
+ DomTreeNode::const_iterator end)
+ : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+ EndIter(end),
+ Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls)
+ {}
+ StackNode(const StackNode &) = delete;
+ StackNode &operator=(const StackNode &) = delete;
+
+ // Accessors.
unsigned currentGeneration() const { return CurrentGeneration; }
unsigned childGeneration() const { return ChildGeneration; }
- void childGeneration(unsigned generation) { ChildGeneration = generation; }
- DomTreeNode *node() { return Node; }
+ void childGeneration(unsigned generation) { ChildGeneration = generation; }
+ DomTreeNode *node() { return Node; }
DomTreeNode::const_iterator childIter() const { return ChildIter; }
-
- DomTreeNode *nextChild() {
- DomTreeNode *child = *ChildIter;
- ++ChildIter;
- return child;
- }
-
+
+ DomTreeNode *nextChild() {
+ DomTreeNode *child = *ChildIter;
+ ++ChildIter;
+ return child;
+ }
+
DomTreeNode::const_iterator end() const { return EndIter; }
bool isProcessed() const { return Processed; }
- void process() { Processed = true; }
-
- private:
- unsigned CurrentGeneration;
- unsigned ChildGeneration;
- DomTreeNode *Node;
- DomTreeNode::const_iterator ChildIter;
- DomTreeNode::const_iterator EndIter;
- NodeScope Scopes;
- bool Processed = false;
- };
-
- /// Wrapper class to handle memory instructions, including loads,
- /// stores and intrinsic loads and stores defined by the target.
- class ParseMemoryInst {
- public:
- ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
- : Inst(Inst) {
+ void process() { Processed = true; }
+
+ private:
+ unsigned CurrentGeneration;
+ unsigned ChildGeneration;
+ DomTreeNode *Node;
+ DomTreeNode::const_iterator ChildIter;
+ DomTreeNode::const_iterator EndIter;
+ NodeScope Scopes;
+ bool Processed = false;
+ };
+
+ /// Wrapper class to handle memory instructions, including loads,
+ /// stores and intrinsic loads and stores defined by the target.
+ class ParseMemoryInst {
+ public:
+ ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+ : Inst(Inst) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
IntrID = II->getIntrinsicID();
- if (TTI.getTgtMemIntrinsic(II, Info))
+ if (TTI.getTgtMemIntrinsic(II, Info))
return;
if (isHandledNonTargetIntrinsic(IntrID)) {
switch (IntrID) {
@@ -688,97 +688,97 @@ private:
}
}
}
- }
-
+ }
+
Instruction *get() { return Inst; }
const Instruction *get() const { return Inst; }
- bool isLoad() const {
+ bool isLoad() const {
if (IntrID != 0)
return Info.ReadMem;
- return isa<LoadInst>(Inst);
- }
-
- bool isStore() const {
+ return isa<LoadInst>(Inst);
+ }
+
+ bool isStore() const {
if (IntrID != 0)
return Info.WriteMem;
- return isa<StoreInst>(Inst);
- }
-
- bool isAtomic() const {
+ return isa<StoreInst>(Inst);
+ }
+
+ bool isAtomic() const {
if (IntrID != 0)
- return Info.Ordering != AtomicOrdering::NotAtomic;
- return Inst->isAtomic();
- }
-
- bool isUnordered() const {
+ return Info.Ordering != AtomicOrdering::NotAtomic;
+ return Inst->isAtomic();
+ }
+
+ bool isUnordered() const {
if (IntrID != 0)
- return Info.isUnordered();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- return LI->isUnordered();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- return SI->isUnordered();
- }
- // Conservative answer
- return !Inst->isAtomic();
- }
-
- bool isVolatile() const {
+ return Info.isUnordered();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isUnordered();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isUnordered();
+ }
+ // Conservative answer
+ return !Inst->isAtomic();
+ }
+
+ bool isVolatile() const {
if (IntrID != 0)
- return Info.IsVolatile;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- return LI->isVolatile();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- return SI->isVolatile();
- }
- // Conservative answer
- return true;
- }
-
- bool isInvariantLoad() const {
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI->hasMetadata(LLVMContext::MD_invariant_load);
- return false;
- }
-
- bool isValid() const { return getPointerOperand() != nullptr; }
-
- // For regular (non-intrinsic) loads/stores, this is set to -1. For
- // intrinsic loads/stores, the id is retrieved from the corresponding
- // field in the MemIntrinsicInfo structure. That field contains
- // non-negative values only.
- int getMatchingId() const {
+ return Info.IsVolatile;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isVolatile();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isVolatile();
+ }
+ // Conservative answer
+ return true;
+ }
+
+ bool isInvariantLoad() const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI->hasMetadata(LLVMContext::MD_invariant_load);
+ return false;
+ }
+
+ bool isValid() const { return getPointerOperand() != nullptr; }
+
+ // For regular (non-intrinsic) loads/stores, this is set to -1. For
+ // intrinsic loads/stores, the id is retrieved from the corresponding
+ // field in the MemIntrinsicInfo structure. That field contains
+ // non-negative values only.
+ int getMatchingId() const {
if (IntrID != 0)
return Info.MatchingId;
- return -1;
- }
-
- Value *getPointerOperand() const {
+ return -1;
+ }
+
+ Value *getPointerOperand() const {
if (IntrID != 0)
return Info.PtrVal;
- return getLoadStorePointerOperand(Inst);
- }
-
- bool mayReadFromMemory() const {
+ return getLoadStorePointerOperand(Inst);
+ }
+
+ bool mayReadFromMemory() const {
if (IntrID != 0)
return Info.ReadMem;
- return Inst->mayReadFromMemory();
- }
-
- bool mayWriteToMemory() const {
+ return Inst->mayReadFromMemory();
+ }
+
+ bool mayWriteToMemory() const {
if (IntrID != 0)
return Info.WriteMem;
- return Inst->mayWriteToMemory();
- }
-
- private:
+ return Inst->mayWriteToMemory();
+ }
+
+ private:
Intrinsic::ID IntrID = 0;
- MemIntrinsicInfo Info;
- Instruction *Inst;
- };
-
+ MemIntrinsicInfo Info;
+ Instruction *Inst;
+ };
+
// This function is to prevent accidentally passing a non-target
// intrinsic ID to TargetTransformInfo.
static bool isHandledNonTargetIntrinsic(Intrinsic::ID ID) {
@@ -795,29 +795,29 @@ private:
return false;
}
- bool processNode(DomTreeNode *Node);
-
- bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
- const BasicBlock *BB, const BasicBlock *Pred);
-
+ bool processNode(DomTreeNode *Node);
+
+ bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
+ const BasicBlock *BB, const BasicBlock *Pred);
+
Value *getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
unsigned CurrentGeneration);
bool overridingStores(const ParseMemoryInst &Earlier,
const ParseMemoryInst &Later);
- Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI;
- if (auto *SI = dyn_cast<StoreInst>(Inst))
- return SI->getValueOperand();
- assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+ Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI;
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand();
+ assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
auto *II = cast<IntrinsicInst>(Inst);
if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
return getOrCreateResultNonTargetMemIntrinsic(II, ExpectedType);
return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
- }
-
+ }
+
Value *getOrCreateResultNonTargetMemIntrinsic(IntrinsicInst *II,
Type *ExpectedType) const {
switch (II->getIntrinsicID()) {
@@ -829,13 +829,13 @@ private:
return nullptr;
}
- /// Return true if the instruction is known to only operate on memory
- /// provably invariant in the given "generation".
- bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
-
- bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
- Instruction *EarlierInst, Instruction *LaterInst);
-
+ /// Return true if the instruction is known to only operate on memory
+ /// provably invariant in the given "generation".
+ bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
+ bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
+ Instruction *EarlierInst, Instruction *LaterInst);
+
bool isNonTargetIntrinsicMatch(const IntrinsicInst *Earlier,
const IntrinsicInst *Later) {
auto IsSubmask = [](const Value *Mask0, const Value *Mask1) {
@@ -931,108 +931,108 @@ private:
return false;
}
- void removeMSSA(Instruction &Inst) {
- if (!MSSA)
- return;
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- // Removing a store here can leave MemorySSA in an unoptimized state by
- // creating MemoryPhis that have identical arguments and by creating
- // MemoryUses whose defining access is not an actual clobber. The phi case
- // is handled by MemorySSA when passing OptimizePhis = true to
- // removeMemoryAccess. The non-optimized MemoryUse case is lazily updated
- // by MemorySSA's getClobberingMemoryAccess.
- MSSAUpdater->removeMemoryAccess(&Inst, true);
- }
-};
-
-} // end anonymous namespace
-
-/// Determine if the memory referenced by LaterInst is from the same heap
-/// version as EarlierInst.
-/// This is currently called in two scenarios:
-///
-/// load p
-/// ...
-/// load p
-///
-/// and
-///
-/// x = load p
-/// ...
-/// store x, p
-///
-/// in both cases we want to verify that there are no possible writes to the
-/// memory referenced by p between the earlier and later instruction.
-bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
- unsigned LaterGeneration,
- Instruction *EarlierInst,
- Instruction *LaterInst) {
- // Check the simple memory generation tracking first.
- if (EarlierGeneration == LaterGeneration)
- return true;
-
- if (!MSSA)
- return false;
-
- // If MemorySSA has determined that one of EarlierInst or LaterInst does not
- // read/write memory, then we can safely return true here.
- // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
- // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
- // by also checking the MemorySSA MemoryAccess on the instruction. Initial
- // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
- // with the default optimization pipeline.
- auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
- if (!EarlierMA)
- return true;
- auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
- if (!LaterMA)
- return true;
-
- // Since we know LaterDef dominates LaterInst and EarlierInst dominates
- // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
- // EarlierInst and LaterInst and neither can any other write that potentially
- // clobbers LaterInst.
- MemoryAccess *LaterDef;
- if (ClobberCounter < EarlyCSEMssaOptCap) {
- LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
- ClobberCounter++;
- } else
- LaterDef = LaterMA->getDefiningAccess();
-
- return MSSA->dominates(LaterDef, EarlierMA);
-}
-
-bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
- // A location loaded from with an invariant_load is assumed to *never* change
- // within the visible scope of the compilation.
- if (auto *LI = dyn_cast<LoadInst>(I))
- if (LI->hasMetadata(LLVMContext::MD_invariant_load))
- return true;
-
- auto MemLocOpt = MemoryLocation::getOrNone(I);
- if (!MemLocOpt)
- // "target" intrinsic forms of loads aren't currently known to
- // MemoryLocation::get. TODO
- return false;
- MemoryLocation MemLoc = *MemLocOpt;
- if (!AvailableInvariants.count(MemLoc))
- return false;
-
- // Is the generation at which this became invariant older than the
- // current one?
- return AvailableInvariants.lookup(MemLoc) <= GenAt;
-}
-
-bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
- const BranchInst *BI, const BasicBlock *BB,
- const BasicBlock *Pred) {
- assert(BI->isConditional() && "Should be a conditional branch!");
- assert(BI->getCondition() == CondInst && "Wrong condition?");
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *TorF = (BI->getSuccessor(0) == BB)
- ? ConstantInt::getTrue(BB->getContext())
- : ConstantInt::getFalse(BB->getContext());
+ void removeMSSA(Instruction &Inst) {
+ if (!MSSA)
+ return;
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ // Removing a store here can leave MemorySSA in an unoptimized state by
+ // creating MemoryPhis that have identical arguments and by creating
+ // MemoryUses whose defining access is not an actual clobber. The phi case
+ // is handled by MemorySSA when passing OptimizePhis = true to
+ // removeMemoryAccess. The non-optimized MemoryUse case is lazily updated
+ // by MemorySSA's getClobberingMemoryAccess.
+ MSSAUpdater->removeMemoryAccess(&Inst, true);
+ }
+};
+
+} // end anonymous namespace
+
+/// Determine if the memory referenced by LaterInst is from the same heap
+/// version as EarlierInst.
+/// This is currently called in two scenarios:
+///
+/// load p
+/// ...
+/// load p
+///
+/// and
+///
+/// x = load p
+/// ...
+/// store x, p
+///
+/// in both cases we want to verify that there are no possible writes to the
+/// memory referenced by p between the earlier and later instruction.
+bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
+ unsigned LaterGeneration,
+ Instruction *EarlierInst,
+ Instruction *LaterInst) {
+ // Check the simple memory generation tracking first.
+ if (EarlierGeneration == LaterGeneration)
+ return true;
+
+ if (!MSSA)
+ return false;
+
+ // If MemorySSA has determined that one of EarlierInst or LaterInst does not
+ // read/write memory, then we can safely return true here.
+ // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
+ // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
+ // by also checking the MemorySSA MemoryAccess on the instruction. Initial
+ // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
+ // with the default optimization pipeline.
+ auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+ if (!EarlierMA)
+ return true;
+ auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
+ if (!LaterMA)
+ return true;
+
+ // Since we know LaterDef dominates LaterInst and EarlierInst dominates
+ // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
+ // EarlierInst and LaterInst and neither can any other write that potentially
+ // clobbers LaterInst.
+ MemoryAccess *LaterDef;
+ if (ClobberCounter < EarlyCSEMssaOptCap) {
+ LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
+ ClobberCounter++;
+ } else
+ LaterDef = LaterMA->getDefiningAccess();
+
+ return MSSA->dominates(LaterDef, EarlierMA);
+}
+
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+ // A location loaded from with an invariant_load is assumed to *never* change
+ // within the visible scope of the compilation.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->hasMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ auto MemLocOpt = MemoryLocation::getOrNone(I);
+ if (!MemLocOpt)
+ // "target" intrinsic forms of loads aren't currently known to
+ // MemoryLocation::get. TODO
+ return false;
+ MemoryLocation MemLoc = *MemLocOpt;
+ if (!AvailableInvariants.count(MemLoc))
+ return false;
+
+ // Is the generation at which this became invariant older than the
+ // current one?
+ return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
+bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
+ const BranchInst *BI, const BasicBlock *BB,
+ const BasicBlock *Pred) {
+ assert(BI->isConditional() && "Should be a conditional branch!");
+ assert(BI->getCondition() == CondInst && "Wrong condition?");
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
auto MatchBinOp = [](Instruction *I, unsigned Opcode, Value *&LHS,
Value *&RHS) {
if (Opcode == Instruction::And &&
@@ -1041,47 +1041,47 @@ bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
else if (Opcode == Instruction::Or &&
match(I, m_LogicalOr(m_Value(LHS), m_Value(RHS))))
return true;
- return false;
- };
- // If the condition is AND operation, we can propagate its operands into the
- // true branch. If it is OR operation, we can propagate them into the false
- // branch.
- unsigned PropagateOpcode =
- (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or;
-
- bool MadeChanges = false;
- SmallVector<Instruction *, 4> WorkList;
- SmallPtrSet<Instruction *, 4> Visited;
- WorkList.push_back(CondInst);
- while (!WorkList.empty()) {
- Instruction *Curr = WorkList.pop_back_val();
-
- AvailableValues.insert(Curr, TorF);
- LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << Curr->getName() << "' as " << *TorF << " in "
- << BB->getName() << "\n");
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- } else {
- // Replace all dominated uses with the known value.
- if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT,
- BasicBlockEdge(Pred, BB))) {
- NumCSECVP += Count;
- MadeChanges = true;
- }
- }
-
+ return false;
+ };
+  // If the condition is an AND operation, we can propagate its operands into
+  // the true branch. If it is an OR operation, we can propagate them into the
+  // false branch.
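+  //
+  // Illustrative sketch (assumed example, not part of the upstream comment):
+  //   %c = and i1 %a, %b
+  //   br i1 %c, label %then, label %else
+  // On the edge into %then, %c, %a and %b are all known to be true, so their
+  // dominated uses can be replaced with 'true'.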
+ unsigned PropagateOpcode =
+ (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or;
+
+ bool MadeChanges = false;
+ SmallVector<Instruction *, 4> WorkList;
+ SmallPtrSet<Instruction *, 4> Visited;
+ WorkList.push_back(CondInst);
+ while (!WorkList.empty()) {
+ Instruction *Curr = WorkList.pop_back_val();
+
+ AvailableValues.insert(Curr, TorF);
+ LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << Curr->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT,
+ BasicBlockEdge(Pred, BB))) {
+ NumCSECVP += Count;
+ MadeChanges = true;
+ }
+ }
+
Value *LHS, *RHS;
if (MatchBinOp(Curr, PropagateOpcode, LHS, RHS))
for (auto &Op : { LHS, RHS })
- if (Instruction *OPI = dyn_cast<Instruction>(Op))
- if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
- WorkList.push_back(OPI);
- }
-
- return MadeChanges;
-}
-
+ if (Instruction *OPI = dyn_cast<Instruction>(Op))
+ if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
+ WorkList.push_back(OPI);
+ }
+
+ return MadeChanges;
+}
+
Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
unsigned CurrentGeneration) {
if (InVal.DefInst == nullptr)
@@ -1162,76 +1162,76 @@ bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
return ENTI == LNTI;
}
-bool EarlyCSE::processNode(DomTreeNode *Node) {
- bool Changed = false;
- BasicBlock *BB = Node->getBlock();
-
- // If this block has a single predecessor, then the predecessor is the parent
- // of the domtree node and all of the live out memory values are still current
- // in this block. If this block has multiple predecessors, then they could
- // have invalidated the live-out memory values of our parent value. For now,
- // just be conservative and invalidate memory if this block has multiple
- // predecessors.
- if (!BB->getSinglePredecessor())
- ++CurrentGeneration;
-
- // If this node has a single predecessor which ends in a conditional branch,
- // we can infer the value of the branch condition given that we took this
- // path. We need the single predecessor to ensure there's not another path
- // which reaches this block where the condition might hold a different
- // value. Since we're adding this to the scoped hash table (like any other
- // def), it will have been popped if we encounter a future merge block.
- if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
- if (BI && BI->isConditional()) {
- auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
- if (CondInst && SimpleValue::canHandle(CondInst))
- Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
- }
- }
-
- /// LastStore - Keep track of the last non-volatile store that we saw... for
- /// as long as there in no instruction that reads memory. If we see a store
- /// to the same location, we delete the dead store. This zaps trivial dead
- /// stores which can occur in bitfield code among other things.
- Instruction *LastStore = nullptr;
-
- // See if any instructions in the block can be eliminated. If so, do it. If
- // not, add them to AvailableValues.
- for (Instruction &Inst : make_early_inc_range(BB->getInstList())) {
- // Dead instructions should just be removed.
- if (isInstructionTriviallyDead(&Inst, &TLI)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << Inst << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
-
- salvageKnowledge(&Inst, &AC);
- salvageDebugInfo(Inst);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumSimplify;
- continue;
- }
-
- // Skip assume intrinsics, they don't really have side effects (although
- // they're marked as such to ensure preservation of control dependencies),
- // and this pass will not bother with its removal. However, we should mark
- // its condition as true for all dominated blocks.
- if (match(&Inst, m_Intrinsic<Intrinsic::assume>())) {
- auto *CondI =
- dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0));
- if (CondI && SimpleValue::canHandle(CondI)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << Inst
- << '\n');
- AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
- } else
- LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << Inst << '\n');
- continue;
- }
-
+bool EarlyCSE::processNode(DomTreeNode *Node) {
+ bool Changed = false;
+ BasicBlock *BB = Node->getBlock();
+
+ // If this block has a single predecessor, then the predecessor is the parent
+ // of the domtree node and all of the live out memory values are still current
+ // in this block. If this block has multiple predecessors, then they could
+ // have invalidated the live-out memory values of our parent value. For now,
+ // just be conservative and invalidate memory if this block has multiple
+ // predecessors.
+ if (!BB->getSinglePredecessor())
+ ++CurrentGeneration;
+
+ // If this node has a single predecessor which ends in a conditional branch,
+ // we can infer the value of the branch condition given that we took this
+ // path. We need the single predecessor to ensure there's not another path
+ // which reaches this block where the condition might hold a different
+ // value. Since we're adding this to the scoped hash table (like any other
+ // def), it will have been popped if we encounter a future merge block.
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst))
+ Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
+ }
+ }
+
+ /// LastStore - Keep track of the last non-volatile store that we saw... for
+    /// as long as there is no instruction that reads memory. If we see a store
+ /// to the same location, we delete the dead store. This zaps trivial dead
+ /// stores which can occur in bitfield code among other things.
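+    ///
+    /// Illustrative example (assumed, for clarity):
+    ///   store i32 %x, i32* %p   ; becomes LastStore
+    ///   store i32 %y, i32* %p   ; same location, no intervening read, so
+    ///                           ; the first store is deleted as dead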
+ Instruction *LastStore = nullptr;
+
+ // See if any instructions in the block can be eliminated. If so, do it. If
+ // not, add them to AvailableValues.
+ for (Instruction &Inst : make_early_inc_range(BB->getInstList())) {
+ // Dead instructions should just be removed.
+ if (isInstructionTriviallyDead(&Inst, &TLI)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+
+ salvageKnowledge(&Inst, &AC);
+ salvageDebugInfo(Inst);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumSimplify;
+ continue;
+ }
+
+ // Skip assume intrinsics, they don't really have side effects (although
+ // they're marked as such to ensure preservation of control dependencies),
+ // and this pass will not bother with its removal. However, we should mark
+ // its condition as true for all dominated blocks.
+ if (match(&Inst, m_Intrinsic<Intrinsic::assume>())) {
+ auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0));
+ if (CondI && SimpleValue::canHandle(CondI)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << Inst
+ << '\n');
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ } else
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << Inst << '\n');
+ continue;
+ }
+
// Likewise, noalias intrinsics don't actually write.
if (match(&Inst,
m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
@@ -1240,159 +1240,159 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
- // Skip sideeffect intrinsics, for the same reason as assume intrinsics.
- if (match(&Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
- LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << Inst << '\n');
- continue;
- }
-
- // We can skip all invariant.start intrinsics since they only read memory,
- // and we can forward values across it. For invariant starts without
- // invariant ends, we can use the fact that the invariantness never ends to
- // start a scope in the current generaton which is true for all future
- // generations. Also, we dont need to consume the last store since the
- // semantics of invariant.start allow us to perform DSE of the last
- // store, if there was a store following invariant.start. Consider:
- //
- // store 30, i8* p
- // invariant.start(p)
- // store 40, i8* p
- // We can DSE the store to 30, since the store 40 to invariant location p
- // causes undefined behaviour.
- if (match(&Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
- // If there are any uses, the scope might end.
- if (!Inst.use_empty())
- continue;
- MemoryLocation MemLoc =
- MemoryLocation::getForArgument(&cast<CallInst>(Inst), 1, TLI);
- // Don't start a scope if we already have a better one pushed
- if (!AvailableInvariants.count(MemLoc))
- AvailableInvariants.insert(MemLoc, CurrentGeneration);
- continue;
- }
-
- if (isGuard(&Inst)) {
- if (auto *CondI =
- dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0))) {
- if (SimpleValue::canHandle(CondI)) {
- // Do we already know the actual value of this condition?
- if (auto *KnownCond = AvailableValues.lookup(CondI)) {
- // Is the condition known to be true?
- if (isa<ConstantInt>(KnownCond) &&
- cast<ConstantInt>(KnownCond)->isOne()) {
- LLVM_DEBUG(dbgs()
- << "EarlyCSE removing guard: " << Inst << '\n');
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- continue;
- } else
- // Use the known value if it wasn't true.
- cast<CallInst>(Inst).setArgOperand(0, KnownCond);
- }
- // The condition we're on guarding here is true for all dominated
- // locations.
- AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
- }
- }
-
- // Guard intrinsics read all memory, but don't write any memory.
- // Accordingly, don't update the generation but consume the last store (to
- // avoid an incorrect DSE).
- LastStore = nullptr;
- continue;
- }
-
- // If the instruction can be simplified (e.g. X+0 = X) then replace it with
- // its simpler value.
- if (Value *V = SimplifyInstruction(&Inst, SQ)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V
- << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- } else {
- bool Killed = false;
- if (!Inst.use_empty()) {
- Inst.replaceAllUsesWith(V);
- Changed = true;
- }
- if (isInstructionTriviallyDead(&Inst, &TLI)) {
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- Killed = true;
- }
- if (Changed)
- ++NumSimplify;
- if (Killed)
- continue;
- }
- }
-
- // If this is a simple instruction that we can value number, process it.
- if (SimpleValue::canHandle(&Inst)) {
- // See if the instruction has an available value. If so, use it.
- if (Value *V = AvailableValues.lookup(&Inst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << Inst << " to: " << *V
- << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
- if (auto *I = dyn_cast<Instruction>(V))
- I->andIRFlags(&Inst);
- Inst.replaceAllUsesWith(V);
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumCSE;
- continue;
- }
-
- // Otherwise, just remember that this value is available.
- AvailableValues.insert(&Inst, &Inst);
- continue;
- }
-
- ParseMemoryInst MemInst(&Inst, TTI);
- // If this is a non-volatile load, process it.
- if (MemInst.isValid() && MemInst.isLoad()) {
- // (conservatively) we can't peak past the ordering implied by this
- // operation, but we can add this load to our set of available values
- if (MemInst.isVolatile() || !MemInst.isUnordered()) {
- LastStore = nullptr;
- ++CurrentGeneration;
- }
-
- if (MemInst.isInvariantLoad()) {
- // If we pass an invariant load, we know that memory location is
- // indefinitely constant from the moment of first dereferenceability.
- // We conservatively treat the invariant_load as that moment. If we
- // pass a invariant load after already establishing a scope, don't
- // restart it since we want to preserve the earliest point seen.
- auto MemLoc = MemoryLocation::get(&Inst);
- if (!AvailableInvariants.count(MemLoc))
- AvailableInvariants.insert(MemLoc, CurrentGeneration);
- }
-
- // If we have an available version of this load, and if it is the right
- // generation or the load is known to be from an invariant location,
- // replace this instruction.
- //
- // If either the dominating load or the current load are invariant, then
- // we can assume the current load loads the same value as the dominating
- // load.
- LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ // Skip sideeffect intrinsics, for the same reason as assume intrinsics.
+ if (match(&Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << Inst << '\n');
+ continue;
+ }
+
+ // We can skip all invariant.start intrinsics since they only read memory,
+ // and we can forward values across it. For invariant starts without
+ // invariant ends, we can use the fact that the invariantness never ends to
+    // start a scope in the current generation which is true for all future
+    // generations. Also, we don't need to consume the last store since the
+ // semantics of invariant.start allow us to perform DSE of the last
+ // store, if there was a store following invariant.start. Consider:
+ //
+ // store 30, i8* p
+ // invariant.start(p)
+ // store 40, i8* p
+ // We can DSE the store to 30, since the store 40 to invariant location p
+ // causes undefined behaviour.
+ if (match(&Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+ // If there are any uses, the scope might end.
+ if (!Inst.use_empty())
+ continue;
+ MemoryLocation MemLoc =
+ MemoryLocation::getForArgument(&cast<CallInst>(Inst), 1, TLI);
+ // Don't start a scope if we already have a better one pushed
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
+ continue;
+ }
+
+ if (isGuard(&Inst)) {
+ if (auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0))) {
+ if (SimpleValue::canHandle(CondI)) {
+ // Do we already know the actual value of this condition?
+ if (auto *KnownCond = AvailableValues.lookup(CondI)) {
+ // Is the condition known to be true?
+ if (isa<ConstantInt>(KnownCond) &&
+ cast<ConstantInt>(KnownCond)->isOne()) {
+ LLVM_DEBUG(dbgs()
+ << "EarlyCSE removing guard: " << Inst << '\n');
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ continue;
+ } else
+ // Use the known value if it wasn't true.
+ cast<CallInst>(Inst).setArgOperand(0, KnownCond);
+ }
+          // The condition we're guarding on here is true for all dominated
+ // locations.
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ }
+ }
+
+ // Guard intrinsics read all memory, but don't write any memory.
+ // Accordingly, don't update the generation but consume the last store (to
+ // avoid an incorrect DSE).
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If the instruction can be simplified (e.g. X+0 = X) then replace it with
+ // its simpler value.
+ if (Value *V = SimplifyInstruction(&Inst, SQ)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ bool Killed = false;
+ if (!Inst.use_empty()) {
+ Inst.replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(&Inst, &TLI)) {
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ Killed = true;
+ }
+ if (Changed)
+ ++NumSimplify;
+ if (Killed)
+ continue;
+ }
+ }
+
+ // If this is a simple instruction that we can value number, process it.
+ if (SimpleValue::canHandle(&Inst)) {
+ // See if the instruction has an available value. If so, use it.
+ if (Value *V = AvailableValues.lookup(&Inst)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ if (auto *I = dyn_cast<Instruction>(V))
+ I->andIRFlags(&Inst);
+ Inst.replaceAllUsesWith(V);
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumCSE;
+ continue;
+ }
+
+ // Otherwise, just remember that this value is available.
+ AvailableValues.insert(&Inst, &Inst);
+ continue;
+ }
+
+ ParseMemoryInst MemInst(&Inst, TTI);
+ // If this is a non-volatile load, process it.
+ if (MemInst.isValid() && MemInst.isLoad()) {
+      // (Conservatively) we can't peek past the ordering implied by this
+      // operation, but we can add this load to our set of available values.
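+      // e.g. (illustrative sketch; names are made up):
+      //   %a = load i32, i32* %p
+      //   %v = load volatile i32, i32* %q  ; bumps the generation
+      //   %b = load i32, i32* %p           ; no longer matched by the simple
+      //                                    ; generation check alone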
+ if (MemInst.isVolatile() || !MemInst.isUnordered()) {
+ LastStore = nullptr;
+ ++CurrentGeneration;
+ }
+
+ if (MemInst.isInvariantLoad()) {
+ // If we pass an invariant load, we know that memory location is
+ // indefinitely constant from the moment of first dereferenceability.
+ // We conservatively treat the invariant_load as that moment. If we
+        // pass an invariant load after already establishing a scope, don't
+ // restart it since we want to preserve the earliest point seen.
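+        // e.g. (illustrative):
+        //   %v = load i32, i32* %p, !invariant.load !0
+        // marks the location of %p as invariant from the current generation
+        // onward, so later loads from %p can be CSE'd even across generation
+        // bumps.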
+ auto MemLoc = MemoryLocation::get(&Inst);
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
+ }
+
+ // If we have an available version of this load, and if it is the right
+ // generation or the load is known to be from an invariant location,
+ // replace this instruction.
+ //
+ // If either the dominating load or the current load are invariant, then
+ // we can assume the current load loads the same value as the dominating
+ // load.
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (Value *Op = getMatchingValue(InVal, MemInst, CurrentGeneration)) {
LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << Inst
<< " to: " << *InVal.DefInst << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
+ continue;
+ }
if (!Inst.use_empty())
Inst.replaceAllUsesWith(Op);
salvageKnowledge(&Inst, &AC);
@@ -1401,317 +1401,317 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
Changed = true;
++NumCSELoad;
continue;
- }
-
- // Otherwise, remember that we have this instruction.
- AvailableLoads.insert(MemInst.getPointerOperand(),
- LoadValue(&Inst, CurrentGeneration,
- MemInst.getMatchingId(),
- MemInst.isAtomic()));
- LastStore = nullptr;
- continue;
- }
-
- // If this instruction may read from memory or throw (and potentially read
- // from memory in the exception handler), forget LastStore. Load/store
- // intrinsics will indicate both a read and a write to memory. The target
- // may override this (e.g. so that a store intrinsic does not read from
- // memory, and thus will be treated the same as a regular store for
- // commoning purposes).
- if ((Inst.mayReadFromMemory() || Inst.mayThrow()) &&
- !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
- LastStore = nullptr;
-
- // If this is a read-only call, process it.
- if (CallValue::canHandle(&Inst)) {
- // If we have an available version of this call, and if it is the right
- // generation, replace this instruction.
- std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(&Inst);
- if (InVal.first != nullptr &&
- isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
- &Inst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << Inst
- << " to: " << *InVal.first << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
- if (!Inst.use_empty())
- Inst.replaceAllUsesWith(InVal.first);
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumCSECall;
- continue;
- }
-
- // Otherwise, remember that we have this instruction.
- AvailableCalls.insert(&Inst, std::make_pair(&Inst, CurrentGeneration));
- continue;
- }
-
- // A release fence requires that all stores complete before it, but does
- // not prevent the reordering of following loads 'before' the fence. As a
- // result, we don't need to consider it as writing to memory and don't need
- // to advance the generation. We do need to prevent DSE across the fence,
- // but that's handled above.
- if (auto *FI = dyn_cast<FenceInst>(&Inst))
- if (FI->getOrdering() == AtomicOrdering::Release) {
- assert(Inst.mayReadFromMemory() && "relied on to prevent DSE above");
- continue;
- }
-
- // write back DSE - If we write back the same value we just loaded from
- // the same location and haven't passed any intervening writes or ordering
- // operations, we can remove the write. The primary benefit is in allowing
- // the available load table to remain valid and value forward past where
- // the store originally was.
- if (MemInst.isValid() && MemInst.isStore()) {
- LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
- if (InVal.DefInst &&
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableLoads.insert(MemInst.getPointerOperand(),
+ LoadValue(&Inst, CurrentGeneration,
+ MemInst.getMatchingId(),
+ MemInst.isAtomic()));
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst.mayReadFromMemory() || Inst.mayThrow()) &&
+ !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
+ LastStore = nullptr;
+
+ // If this is a read-only call, process it.
+ if (CallValue::canHandle(&Inst)) {
+ // If we have an available version of this call, and if it is the right
+ // generation, replace this instruction.
+ std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(&Inst);
+ if (InVal.first != nullptr &&
+ isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
+ &Inst)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << Inst
+ << " to: " << *InVal.first << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(InVal.first);
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumCSECall;
+ continue;
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableCalls.insert(&Inst, std::make_pair(&Inst, CurrentGeneration));
+ continue;
+ }
+
+ // A release fence requires that all stores complete before it, but does
+ // not prevent the reordering of following loads 'before' the fence. As a
+ // result, we don't need to consider it as writing to memory and don't need
+ // to advance the generation. We do need to prevent DSE across the fence,
+ // but that's handled above.
+ if (auto *FI = dyn_cast<FenceInst>(&Inst))
+ if (FI->getOrdering() == AtomicOrdering::Release) {
+ assert(Inst.mayReadFromMemory() && "relied on to prevent DSE above");
+ continue;
+ }
+
+ // write back DSE - If we write back the same value we just loaded from
+ // the same location and haven't passed any intervening writes or ordering
+ // operations, we can remove the write. The primary benefit is in allowing
+ // the available load table to remain valid and value forward past where
+ // the store originally was.
+ if (MemInst.isValid() && MemInst.isStore()) {
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.DefInst &&
InVal.DefInst == getMatchingValue(InVal, MemInst, CurrentGeneration)) {
- // It is okay to have a LastStore to a different pointer here if MemorySSA
- // tells us that the load and store are from the same memory generation.
- // In that case, LastStore should keep its present value since we're
- // removing the current store.
- assert((!LastStore ||
- ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
- MemInst.getPointerOperand() ||
- MSSA) &&
- "can't have an intervening store if not using MemorySSA!");
- LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << Inst << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumDSE;
- // We can avoid incrementing the generation count since we were able
- // to eliminate this store.
- continue;
- }
- }
-
- // Okay, this isn't something we can CSE at all. Check to see if it is
- // something that could modify memory. If so, our available memory values
- // cannot be used so bump the generation count.
- if (Inst.mayWriteToMemory()) {
- ++CurrentGeneration;
-
- if (MemInst.isValid() && MemInst.isStore()) {
- // We do a trivial form of DSE if there are two stores to the same
- // location with no intervening loads. Delete the earlier store.
- if (LastStore) {
+ // It is okay to have a LastStore to a different pointer here if MemorySSA
+ // tells us that the load and store are from the same memory generation.
+ // In that case, LastStore should keep its present value since we're
+ // removing the current store.
+ assert((!LastStore ||
+ ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
+ MemInst.getPointerOperand() ||
+ MSSA) &&
+ "can't have an intervening store if not using MemorySSA!");
+ LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ // We can avoid incrementing the generation count since we were able
+ // to eliminate this store.
+ continue;
+ }
+ }
+
+ // Okay, this isn't something we can CSE at all. Check to see if it is
+ // something that could modify memory. If so, our available memory values
+ // cannot be used so bump the generation count.
+ if (Inst.mayWriteToMemory()) {
+ ++CurrentGeneration;
+
+ if (MemInst.isValid() && MemInst.isStore()) {
+ // We do a trivial form of DSE if there are two stores to the same
+ // location with no intervening loads. Delete the earlier store.
+ if (LastStore) {
if (overridingStores(ParseMemoryInst(LastStore, TTI), MemInst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
- << " due to: " << Inst << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- } else {
- salvageKnowledge(&Inst, &AC);
- removeMSSA(*LastStore);
- LastStore->eraseFromParent();
- Changed = true;
- ++NumDSE;
- LastStore = nullptr;
- }
- }
- // fallthrough - we can exploit information about this store
- }
-
- // Okay, we just invalidated anything we knew about loaded values. Try
- // to salvage *something* by remembering that the stored value is a live
- // version of the pointer. It is safe to forward from volatile stores
- // to non-volatile loads, so we don't have to check for volatility of
- // the store.
- AvailableLoads.insert(MemInst.getPointerOperand(),
- LoadValue(&Inst, CurrentGeneration,
- MemInst.getMatchingId(),
- MemInst.isAtomic()));
-
- // Remember that this was the last unordered store we saw for DSE. We
- // don't yet handle DSE on ordered or volatile stores since we don't
- // have a good way to model the ordering requirement for following
- // passes once the store is removed. We could insert a fence, but
- // since fences are slightly stronger than stores in their ordering,
- // it's not clear this is a profitable transform. Another option would
- // be to merge the ordering with that of the post dominating store.
- if (MemInst.isUnordered() && !MemInst.isVolatile())
- LastStore = &Inst;
- else
- LastStore = nullptr;
- }
- }
- }
-
- return Changed;
-}
-
-bool EarlyCSE::run() {
-  // Note, deque is being used here because there are significant performance
- // gains over vector when the container becomes very large due to the
- // specific access patterns. For more information see the mailing list
- // discussion on this:
- // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
- std::deque<StackNode *> nodesToProcess;
-
- bool Changed = false;
-
- // Process the root node.
- nodesToProcess.push_back(new StackNode(
- AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
- CurrentGeneration, DT.getRootNode(),
- DT.getRootNode()->begin(), DT.getRootNode()->end()));
-
- assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
-
- // Process the stack.
- while (!nodesToProcess.empty()) {
- // Grab the first item off the stack. Set the current generation, remove
- // the node from the stack, and process it.
- StackNode *NodeToProcess = nodesToProcess.back();
-
- // Initialize class members.
- CurrentGeneration = NodeToProcess->currentGeneration();
-
- // Check if the node needs to be processed.
- if (!NodeToProcess->isProcessed()) {
- // Process the node.
- Changed |= processNode(NodeToProcess->node());
- NodeToProcess->childGeneration(CurrentGeneration);
- NodeToProcess->process();
- } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
- // Push the next child onto the stack.
- DomTreeNode *child = NodeToProcess->nextChild();
- nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
- AvailableCalls, NodeToProcess->childGeneration(),
- child, child->begin(), child->end()));
- } else {
- // It has been processed, and there are no more children to process,
- // so delete it and pop it off the stack.
- delete NodeToProcess;
- nodesToProcess.pop_back();
- }
- } // while (!nodes...)
-
- return Changed;
-}
-
-PreservedAnalyses EarlyCSEPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto *MSSA =
- UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
-
- EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
-
- if (!CSE.run())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- if (UseMemorySSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-/// A simple and fast domtree-based CSE pass.
-///
-/// This pass does a simple depth-first walk over the dominator tree,
-/// eliminating trivially redundant instructions and using instsimplify to
-/// canonicalize things as it goes. It is intended to be fast and catch obvious
-/// cases so that instcombine and other passes are more effective. It is
-/// expected that a later pass of GVN will catch the interesting/hard cases.
-template<bool UseMemorySSA>
-class EarlyCSELegacyCommonPass : public FunctionPass {
-public:
- static char ID;
-
- EarlyCSELegacyCommonPass() : FunctionPass(ID) {
- if (UseMemorySSA)
- initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
- else
- initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *MSSA =
- UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
-
- EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
-
- return CSE.run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (UseMemorySSA) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(*LastStore);
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
+ }
+ // fallthrough - we can exploit information about this store
+ }
+
+ // Okay, we just invalidated anything we knew about loaded values. Try
+ // to salvage *something* by remembering that the stored value is a live
+ // version of the pointer. It is safe to forward from volatile stores
+ // to non-volatile loads, so we don't have to check for volatility of
+ // the store.
+ AvailableLoads.insert(MemInst.getPointerOperand(),
+ LoadValue(&Inst, CurrentGeneration,
+ MemInst.getMatchingId(),
+ MemInst.isAtomic()));
+
+ // Remember that this was the last unordered store we saw for DSE. We
+ // don't yet handle DSE on ordered or volatile stores since we don't
+ // have a good way to model the ordering requirement for following
+ // passes once the store is removed. We could insert a fence, but
+ // since fences are slightly stronger than stores in their ordering,
+ // it's not clear this is a profitable transform. Another option would
+ // be to merge the ordering with that of the post dominating store.
+ if (MemInst.isUnordered() && !MemInst.isVolatile())
+ LastStore = &Inst;
+ else
+ LastStore = nullptr;
+ }
+ }
+ }
+
+ return Changed;
+}
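
A source-level analogue may make the two trivial DSE forms handled above easier to see. This is only an illustration with made-up function names; the real transform runs on LLVM IR, not C++:

  // Both stores below are the kind EarlyCSE deletes.
  inline void writeBackDSE(int *P) {
    int V = *P;
    *P = V;   // write-back DSE: stores back the value just loaded, nothing in between
  }
  inline void overridingStoreDSE(int *P) {
    *P = 1;   // overriding-store DSE: overwritten by the next store, no intervening load
    *P = 2;
  }
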
+
+bool EarlyCSE::run() {
+  // Note, deque is being used here because there are significant performance
+ // gains over vector when the container becomes very large due to the
+ // specific access patterns. For more information see the mailing list
+ // discussion on this:
+ // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+ std::deque<StackNode *> nodesToProcess;
+
+ bool Changed = false;
+
+ // Process the root node.
+ nodesToProcess.push_back(new StackNode(
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ CurrentGeneration, DT.getRootNode(),
+ DT.getRootNode()->begin(), DT.getRootNode()->end()));
+
+ assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
+
+ // Process the stack.
+ while (!nodesToProcess.empty()) {
+ // Grab the first item off the stack. Set the current generation, remove
+ // the node from the stack, and process it.
+ StackNode *NodeToProcess = nodesToProcess.back();
+
+ // Initialize class members.
+ CurrentGeneration = NodeToProcess->currentGeneration();
+
+ // Check if the node needs to be processed.
+ if (!NodeToProcess->isProcessed()) {
+ // Process the node.
+ Changed |= processNode(NodeToProcess->node());
+ NodeToProcess->childGeneration(CurrentGeneration);
+ NodeToProcess->process();
+ } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+ // Push the next child onto the stack.
+ DomTreeNode *child = NodeToProcess->nextChild();
+ nodesToProcess.push_back(
+ new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls, NodeToProcess->childGeneration(),
+ child, child->begin(), child->end()));
+ } else {
+ // It has been processed, and there are no more children to process,
+ // so delete it and pop it off the stack.
+ delete NodeToProcess;
+ nodesToProcess.pop_back();
+ }
+ } // while (!nodes...)
+
+ return Changed;
+}
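
The deque comment at the top of run() is about avoiding recursion: the dominator tree is walked with an explicit stack of per-node frames. A minimal, self-contained sketch of that pattern follows; TreeNode, Frame and walkIterative are illustrative names, not LLVM's API.

  #include <cstddef>
  #include <deque>
  #include <vector>

  struct TreeNode {
    std::vector<TreeNode *> Children;
  };

  // Depth-first walk without recursion: each frame records whether its node has
  // been processed and which child to push next, mirroring how StackNode tracks
  // isProcessed() and childIter() above.
  inline void walkIterative(TreeNode *Root, void (*Visit)(TreeNode *)) {
    struct Frame {
      TreeNode *Node;
      std::size_t NextChild = 0;
      bool Processed = false;
    };
    std::deque<Frame> Stack;
    Stack.push_back({Root});
    while (!Stack.empty()) {
      Frame &Top = Stack.back();
      if (!Top.Processed) {
        Visit(Top.Node);                 // corresponds to processNode(...)
        Top.Processed = true;
      } else if (Top.NextChild < Top.Node->Children.size()) {
        TreeNode *Child = Top.Node->Children[Top.NextChild++];
        Stack.push_back({Child});        // descend into the next child
      } else {
        Stack.pop_back();                // all children handled
      }
    }
  }

A vector would work just as well for correctness; the deque is purely the performance choice the comment describes.
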
+
+PreservedAnalyses EarlyCSEPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto *MSSA =
+ UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ if (!CSE.run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ if (UseMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+/// A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+template<bool UseMemorySSA>
+class EarlyCSELegacyCommonPass : public FunctionPass {
+public:
+ static char ID;
+
+ EarlyCSELegacyCommonPass() : FunctionPass(ID) {
+ if (UseMemorySSA)
+ initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
+ else
+ initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *MSSA =
+ UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ return CSE.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (UseMemorySSA) {
AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
-
-template<>
-char EarlyCSELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
-
-using EarlyCSEMemSSALegacyPass =
- EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
-
-template<>
-char EarlyCSEMemSSALegacyPass::ID = 0;
-
-FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
- if (UseMemorySSA)
- return new EarlyCSEMemSSALegacyPass();
- else
- return new EarlyCSELegacyPass();
-}
-
-INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
- "Early CSE w/ MemorySSA", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
+
+template<>
+char EarlyCSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
+
+using EarlyCSEMemSSALegacyPass =
+ EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
+
+template<>
+char EarlyCSEMemSSALegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
+ if (UseMemorySSA)
+ return new EarlyCSEMemSSALegacyPass();
+ else
+ return new EarlyCSELegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
- "Early CSE w/ MemorySSA", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp
index e2c126223d..e54a270fb2 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -1,91 +1,91 @@
-//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements flattening of CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/IR/CFG.h"
+//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements flattening of CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "flattencfg"
-
-namespace {
-struct FlattenCFGPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-public:
- FlattenCFGPass() : FunctionPass(ID) {
- initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- }
-
-private:
- AliasAnalysis *AA;
-};
-}
-
-char FlattenCFGPass::ID = 0;
-INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
- false)
-
-// Public interface to the FlattenCFG pass
-FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
-
-/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
-/// iterating until no more changes are made.
-static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
- bool Changed = false;
- bool LocalChange = true;
-
- // Use block handles instead of iterating over function blocks directly
- // to avoid using iterators invalidated by erasing blocks.
- std::vector<WeakVH> Blocks;
- Blocks.reserve(F.size());
- for (auto &BB : F)
- Blocks.push_back(&BB);
-
- while (LocalChange) {
- LocalChange = false;
-
- // Loop over all of the basic blocks and try to flatten them.
- for (WeakVH &BlockHandle : Blocks) {
- // Skip blocks erased by FlattenCFG.
- if (auto *BB = cast_or_null<BasicBlock>(BlockHandle))
- if (FlattenCFG(BB, AA))
- LocalChange = true;
- }
- Changed |= LocalChange;
- }
- return Changed;
-}
-
-bool FlattenCFGPass::runOnFunction(Function &F) {
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- bool EverChanged = false;
- // iterativelyFlattenCFG can make some blocks dead.
- while (iterativelyFlattenCFG(F, AA)) {
- removeUnreachableBlocks(F);
- EverChanged = true;
- }
- return EverChanged;
-}
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+struct FlattenCFGPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+public:
+ FlattenCFGPass() : FunctionPass(ID) {
+ initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ AliasAnalysis *AA;
+};
+}
+
+char FlattenCFGPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+
+// Public interface to the FlattenCFG pass
+FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
+
+/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+ bool Changed = false;
+ bool LocalChange = true;
+
+ // Use block handles instead of iterating over function blocks directly
+ // to avoid using iterators invalidated by erasing blocks.
+ std::vector<WeakVH> Blocks;
+ Blocks.reserve(F.size());
+ for (auto &BB : F)
+ Blocks.push_back(&BB);
+
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and try to flatten them.
+ for (WeakVH &BlockHandle : Blocks) {
+ // Skip blocks erased by FlattenCFG.
+ if (auto *BB = cast_or_null<BasicBlock>(BlockHandle))
+ if (FlattenCFG(BB, AA))
+ LocalChange = true;
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
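
The WeakVH handles above can be easier to picture with standard-library types. Below, std::weak_ptr stands in for WeakVH, and Block/eraseMarkedBlocks are illustrative names only; the point is simply that the snapshot of handles survives erasure from the owning container.

  #include <algorithm>
  #include <memory>
  #include <vector>

  struct Block {
    bool ShouldErase = false;
  };

  // Snapshot handles first, then mutate the owning container freely: erasing a
  // block cannot invalidate this loop, and an already-erased block shows up as
  // an expired handle and is skipped (like cast_or_null returning null above).
  inline int eraseMarkedBlocks(std::vector<std::shared_ptr<Block>> &Blocks) {
    std::vector<std::weak_ptr<Block>> Handles(Blocks.begin(), Blocks.end());
    int NumErased = 0;
    for (const auto &H : Handles) {
      std::shared_ptr<Block> BB = H.lock();
      if (!BB)
        continue;                        // block no longer exists, skip it
      if (BB->ShouldErase) {
        Blocks.erase(std::remove(Blocks.begin(), Blocks.end(), BB), Blocks.end());
        ++NumErased;
      }
    }
    return NumErased;
  }
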
+
+bool FlattenCFGPass::runOnFunction(Function &F) {
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ bool EverChanged = false;
+ // iterativelyFlattenCFG can make some blocks dead.
+ while (iterativelyFlattenCFG(F, AA)) {
+ removeUnreachableBlocks(F);
+ EverChanged = true;
+ }
+ return EverChanged;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp
index 341a4c8220..b6d82685e8 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp
@@ -1,550 +1,550 @@
-//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Float2Int pass, which aims to demote floating
-// point operations to work on integers, where that is losslessly possible.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#define DEBUG_TYPE "float2int"
-
-#include "llvm/Transforms/Scalar/Float2Int.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include <deque>
-#include <functional> // For std::function
-using namespace llvm;
-
-// The algorithm is simple. Start at instructions that convert from the
-// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
-// graph, using an equivalence datastructure to unify graphs that interfere.
-//
-// Mappable instructions are those with an integer corollary that, given
-// integer domain inputs, produce an integer output; fadd, for example.
-//
-// If a non-mappable instruction is seen, this entire def-use graph is marked
-// as non-transformable. If we see an instruction that converts from the
-// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
-
-/// The largest integer type worth dealing with.
-static cl::opt<unsigned>
-MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
-             cl::desc("Max integer bitwidth to consider in float2int "
- "(default=64)"));
-
-namespace {
- struct Float2IntLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- Float2IntLegacyPass() : FunctionPass(ID) {
- initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return Impl.runImpl(F, DT);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
- private:
- Float2IntPass Impl;
- };
-}
-
-char Float2IntLegacyPass::ID = 0;
-INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false)
-
-// Given a FCmp predicate, return a matching ICmp predicate if one
-// exists, otherwise return BAD_ICMP_PREDICATE.
-static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
- switch (P) {
- case CmpInst::FCMP_OEQ:
- case CmpInst::FCMP_UEQ:
- return CmpInst::ICMP_EQ;
- case CmpInst::FCMP_OGT:
- case CmpInst::FCMP_UGT:
- return CmpInst::ICMP_SGT;
- case CmpInst::FCMP_OGE:
- case CmpInst::FCMP_UGE:
- return CmpInst::ICMP_SGE;
- case CmpInst::FCMP_OLT:
- case CmpInst::FCMP_ULT:
- return CmpInst::ICMP_SLT;
- case CmpInst::FCMP_OLE:
- case CmpInst::FCMP_ULE:
- return CmpInst::ICMP_SLE;
- case CmpInst::FCMP_ONE:
- case CmpInst::FCMP_UNE:
- return CmpInst::ICMP_NE;
- default:
- return CmpInst::BAD_ICMP_PREDICATE;
- }
-}
-
-// Given a floating point binary operator, return the matching
-// integer version.
-static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
- switch (Opcode) {
- default: llvm_unreachable("Unhandled opcode!");
- case Instruction::FAdd: return Instruction::Add;
- case Instruction::FSub: return Instruction::Sub;
- case Instruction::FMul: return Instruction::Mul;
- }
-}
-
-// Find the roots - instructions that convert from the FP domain to
-// integer domain.
-void Float2IntPass::findRoots(Function &F, const DominatorTree &DT) {
- for (BasicBlock &BB : F) {
- // Unreachable code can take on strange forms that we are not prepared to
- // handle. For example, an instruction may have itself as an operand.
- if (!DT.isReachableFromEntry(&BB))
- continue;
-
- for (Instruction &I : BB) {
- if (isa<VectorType>(I.getType()))
- continue;
- switch (I.getOpcode()) {
- default: break;
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- Roots.insert(&I);
- break;
- case Instruction::FCmp:
- if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
- CmpInst::BAD_ICMP_PREDICATE)
- Roots.insert(&I);
- break;
- }
- }
- }
-}
-
-// Helper - mark I as having been traversed, having range R.
-void Float2IntPass::seen(Instruction *I, ConstantRange R) {
- LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
- auto IT = SeenInsts.find(I);
- if (IT != SeenInsts.end())
- IT->second = std::move(R);
- else
- SeenInsts.insert(std::make_pair(I, std::move(R)));
-}
-
-// Helper - get a range representing a poison value.
-ConstantRange Float2IntPass::badRange() {
- return ConstantRange::getFull(MaxIntegerBW + 1);
-}
-ConstantRange Float2IntPass::unknownRange() {
- return ConstantRange::getEmpty(MaxIntegerBW + 1);
-}
-ConstantRange Float2IntPass::validateRange(ConstantRange R) {
- if (R.getBitWidth() > MaxIntegerBW + 1)
- return badRange();
- return R;
-}
-
-// The most obvious way to structure the search is a depth-first, eager
-// search from each root. However, that require direct recursion and so
-// can only handle small instruction sequences. Instead, we split the search
-// up into two phases:
-// - walkBackwards: A breadth-first walk of the use-def graph starting from
-// the roots. Populate "SeenInsts" with interesting
-// instructions and poison values if they're obvious and
-//                  cheap to compute. Calculate the equivalence set structure
-// while we're here too.
-// - walkForwards: Iterate over SeenInsts in reverse order, so we visit
-// defs before their uses. Calculate the real range info.
-
-// Breadth-first walk of the use-def graph; determine the set of nodes
-// we care about and eagerly determine if some of them are poisonous.
-void Float2IntPass::walkBackwards() {
- std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
- while (!Worklist.empty()) {
- Instruction *I = Worklist.back();
- Worklist.pop_back();
-
- if (SeenInsts.find(I) != SeenInsts.end())
- // Seen already.
- continue;
-
- switch (I->getOpcode()) {
- // FIXME: Handle select and phi nodes.
- default:
- // Path terminated uncleanly.
- seen(I, badRange());
- break;
-
- case Instruction::UIToFP:
- case Instruction::SIToFP: {
- // Path terminated cleanly - use the type of the integer input to seed
- // the analysis.
- unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
- auto Input = ConstantRange::getFull(BW);
- auto CastOp = (Instruction::CastOps)I->getOpcode();
- seen(I, validateRange(Input.castOp(CastOp, MaxIntegerBW+1)));
- continue;
- }
-
- case Instruction::FNeg:
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FCmp:
- seen(I, unknownRange());
- break;
- }
-
- for (Value *O : I->operands()) {
- if (Instruction *OI = dyn_cast<Instruction>(O)) {
- // Unify def-use chains if they interfere.
- ECs.unionSets(I, OI);
- if (SeenInsts.find(I)->second != badRange())
- Worklist.push_back(OI);
- } else if (!isa<ConstantFP>(O)) {
- // Not an instruction or ConstantFP? we can't do anything.
- seen(I, badRange());
- }
- }
- }
-}
-
-// Walk forwards down the list of seen instructions, so we visit defs before
-// uses.
-void Float2IntPass::walkForwards() {
- for (auto &It : reverse(SeenInsts)) {
- if (It.second != unknownRange())
- continue;
-
- Instruction *I = It.first;
- std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
- switch (I->getOpcode()) {
- // FIXME: Handle select and phi nodes.
- default:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
-      llvm_unreachable("Should have been handled in walkBackwards!");
-
- case Instruction::FNeg:
- Op = [](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 1 && "FNeg is a unary operator!");
- unsigned Size = Ops[0].getBitWidth();
- auto Zero = ConstantRange(APInt::getNullValue(Size));
- return Zero.sub(Ops[0]);
- };
- break;
-
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- Op = [I](ArrayRef<ConstantRange> Ops) {
-        assert(Ops.size() == 2 && "it's a binary operator!");
- auto BinOp = (Instruction::BinaryOps) I->getOpcode();
- return Ops[0].binaryOp(BinOp, Ops[1]);
- };
- break;
-
- //
- // Root-only instructions - we'll only see these if they're the
- // first node in a walk.
- //
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- Op = [I](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
-        // Note: We're ignoring the cast's output size here as that's what the
- // caller expects.
- auto CastOp = (Instruction::CastOps)I->getOpcode();
- return Ops[0].castOp(CastOp, MaxIntegerBW+1);
- };
- break;
-
- case Instruction::FCmp:
- Op = [](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 2 && "FCmp is a binary operator!");
- return Ops[0].unionWith(Ops[1]);
- };
- break;
- }
-
- bool Abort = false;
- SmallVector<ConstantRange,4> OpRanges;
- for (Value *O : I->operands()) {
- if (Instruction *OI = dyn_cast<Instruction>(O)) {
- assert(SeenInsts.find(OI) != SeenInsts.end() &&
- "def not seen before use!");
- OpRanges.push_back(SeenInsts.find(OI)->second);
- } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
- // Work out if the floating point number can be losslessly represented
- // as an integer.
- // APFloat::convertToInteger(&Exact) purports to do what we want, but
- // the exactness can be too precise. For example, negative zero can
- // never be exactly converted to an integer.
- //
- // Instead, we ask APFloat to round itself to an integral value - this
- // preserves sign-of-zero - then compare the result with the original.
- //
- const APFloat &F = CF->getValueAPF();
-
- // First, weed out obviously incorrect values. Non-finite numbers
- // can't be represented and neither can negative zero, unless
- // we're in fast math mode.
- if (!F.isFinite() ||
- (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
- !I->hasNoSignedZeros())) {
- seen(I, badRange());
- Abort = true;
- break;
- }
-
- APFloat NewF = F;
- auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
- if (Res != APFloat::opOK || NewF != F) {
- seen(I, badRange());
- Abort = true;
- break;
- }
- // OK, it's representable. Now get it.
- APSInt Int(MaxIntegerBW+1, false);
- bool Exact;
- CF->getValueAPF().convertToInteger(Int,
- APFloat::rmNearestTiesToEven,
- &Exact);
- OpRanges.push_back(ConstantRange(Int));
- } else {
- llvm_unreachable("Should have already marked this as badRange!");
- }
- }
-
- // Reduce the operands' ranges to a single range and return.
- if (!Abort)
- seen(I, Op(OpRanges));
- }
-}
-
-// If there is a valid transform to be done, do it.
-bool Float2IntPass::validateAndTransform() {
- bool MadeChange = false;
-
- // Iterate over every disjoint partition of the def-use graph.
- for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
- ConstantRange R(MaxIntegerBW + 1, false);
- bool Fail = false;
- Type *ConvertedToTy = nullptr;
-
- // For every member of the partition, union all the ranges together.
- for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
- MI != ME; ++MI) {
- Instruction *I = *MI;
- auto SeenI = SeenInsts.find(I);
- if (SeenI == SeenInsts.end())
- continue;
-
- R = R.unionWith(SeenI->second);
- // We need to ensure I has no users that have not been seen.
- // If it does, transformation would be illegal.
- //
- // Don't count the roots, as they terminate the graphs.
- if (Roots.count(I) == 0) {
- // Set the type of the conversion while we're here.
- if (!ConvertedToTy)
- ConvertedToTy = I->getType();
- for (User *U : I->users()) {
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
- LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
- Fail = true;
- break;
- }
- }
- }
- if (Fail)
- break;
- }
-
- // If the set was empty, or we failed, or the range is poisonous,
- // bail out.
- if (ECs.member_begin(It) == ECs.member_end() || Fail ||
- R.isFullSet() || R.isSignWrappedSet())
- continue;
- assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
-
- // The number of bits required is the maximum of the upper and
- // lower limits, plus one so it can be signed.
- unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
- R.getUpper().getMinSignedBits()) + 1;
- LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
-
- // If we've run off the realms of the exactly representable integers,
- // the floating point result will differ from an integer approximation.
-
- // Do we need more bits than are in the mantissa of the type we converted
- // to? semanticsPrecision returns the number of mantissa bits plus one
- // for the sign bit.
- unsigned MaxRepresentableBits
- = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
- if (MinBW > MaxRepresentableBits) {
- LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
- continue;
- }
- if (MinBW > 64) {
- LLVM_DEBUG(
- dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
- continue;
- }
-
- // OK, R is known to be representable. Now pick a type for it.
- // FIXME: Pick the smallest legal type that will fit.
- Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
-
- for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
- MI != ME; ++MI)
- convert(*MI, Ty);
- MadeChange = true;
- }
-
- return MadeChange;
-}
-
-Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
- if (ConvertedInsts.find(I) != ConvertedInsts.end())
- // Already converted this instruction.
- return ConvertedInsts[I];
-
- SmallVector<Value*,4> NewOperands;
- for (Value *V : I->operands()) {
- // Don't recurse if we're an instruction that terminates the path.
- if (I->getOpcode() == Instruction::UIToFP ||
- I->getOpcode() == Instruction::SIToFP) {
- NewOperands.push_back(V);
- } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
- NewOperands.push_back(convert(VI, ToTy));
- } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
- APSInt Val(ToTy->getPrimitiveSizeInBits(), /*isUnsigned=*/false);
- bool Exact;
- CF->getValueAPF().convertToInteger(Val,
- APFloat::rmNearestTiesToEven,
- &Exact);
- NewOperands.push_back(ConstantInt::get(ToTy, Val));
- } else {
- llvm_unreachable("Unhandled operand type?");
- }
- }
-
- // Now create a new instruction.
- IRBuilder<> IRB(I);
- Value *NewV = nullptr;
- switch (I->getOpcode()) {
- default: llvm_unreachable("Unhandled instruction!");
-
- case Instruction::FPToUI:
- NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
- break;
-
- case Instruction::FPToSI:
- NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
- break;
-
- case Instruction::FCmp: {
- CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
- assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
- NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
- break;
- }
-
- case Instruction::UIToFP:
- NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
- break;
-
- case Instruction::SIToFP:
- NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
- break;
-
- case Instruction::FNeg:
- NewV = IRB.CreateNeg(NewOperands[0], I->getName());
- break;
-
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
- NewOperands[0], NewOperands[1],
- I->getName());
- break;
- }
-
- // If we're a root instruction, RAUW.
- if (Roots.count(I))
- I->replaceAllUsesWith(NewV);
-
- ConvertedInsts[I] = NewV;
- return NewV;
-}
-
-// Perform dead code elimination on the instructions we just modified.
-void Float2IntPass::cleanup() {
- for (auto &I : reverse(ConvertedInsts))
- I.first->eraseFromParent();
-}
-
-bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
- LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
- // Clear out all state.
- ECs = EquivalenceClasses<Instruction*>();
- SeenInsts.clear();
- ConvertedInsts.clear();
- Roots.clear();
-
- Ctx = &F.getParent()->getContext();
-
- findRoots(F, DT);
-
- walkBackwards();
- walkForwards();
-
- bool Modified = validateAndTransform();
- if (Modified)
- cleanup();
- return Modified;
-}
-
-namespace llvm {
-FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
-
-PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) {
- const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, DT))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-} // End namespace llvm
+//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Float2Int pass, which aims to demote floating
+// point operations to work on integers, where that is losslessly possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#define DEBUG_TYPE "float2int"
+
+#include "llvm/Transforms/Scalar/Float2Int.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <deque>
+#include <functional> // For std::function
+using namespace llvm;
+
+// The algorithm is simple. Start at instructions that convert from the
+// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
+// graph, using an equivalence datastructure to unify graphs that interfere.
+//
+// Mappable instructions are those with an integer corollary that, given
+// integer domain inputs, produce an integer output; fadd, for example.
+//
+// If a non-mappable instruction is seen, this entire def-use graph is marked
+// as non-transformable. If we see an instruction that converts from the
+// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
+
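
A concrete instance of the demotion described above may help before the implementation. This is a C++-level sketch of the pattern the pass looks for (sitofp -> fadd -> fptosi in IR terms); the function names are made up, and the equivalence holds because every intermediate value fits in float's 24-bit significand.

  #include <cassert>
  #include <cstdint>

  // Shape of the input computation: integers pushed through float arithmetic
  // and converted straight back.
  inline int32_t sumViaFloat(int16_t A, int16_t B) {
    return static_cast<int32_t>(static_cast<float>(A) + static_cast<float>(B));
  }

  // What the pass effectively rewrites it to: the same arithmetic kept in
  // integers, lossless here because the 16-bit inputs (and their sum) are
  // exactly representable in float.
  inline int32_t sumViaInt(int16_t A, int16_t B) {
    return static_cast<int32_t>(A) + static_cast<int32_t>(B);
  }

  inline void demonstrate() {
    assert(sumViaFloat(30000, 12345) == sumViaInt(30000, 12345));
    assert(sumViaFloat(-20000, -5) == sumViaInt(-20000, -5));
  }
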
+/// The largest integer type worth dealing with.
+static cl::opt<unsigned>
+MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
+             cl::desc("Max integer bitwidth to consider in float2int "
+ "(default=64)"));
+
+namespace {
+ struct Float2IntLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ Float2IntLegacyPass() : FunctionPass(ID) {
+ initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return Impl.runImpl(F, DT);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ private:
+ Float2IntPass Impl;
+ };
+}
+
+char Float2IntLegacyPass::ID = 0;
+INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false)
+
+// Given a FCmp predicate, return a matching ICmp predicate if one
+// exists, otherwise return BAD_ICMP_PREDICATE.
+static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ:
+ return CmpInst::ICMP_EQ;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT:
+ return CmpInst::ICMP_SGT;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE:
+ return CmpInst::ICMP_SGE;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT:
+ return CmpInst::ICMP_SLT;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE:
+ return CmpInst::ICMP_SLE;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE:
+ return CmpInst::ICMP_NE;
+ default:
+ return CmpInst::BAD_ICMP_PREDICATE;
+ }
+}
+
+// Given a floating point binary operator, return the matching
+// integer version.
+static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case Instruction::FAdd: return Instruction::Add;
+ case Instruction::FSub: return Instruction::Sub;
+ case Instruction::FMul: return Instruction::Mul;
+ }
+}
+
+// Find the roots - instructions that convert from the FP domain to
+// integer domain.
+void Float2IntPass::findRoots(Function &F, const DominatorTree &DT) {
+ for (BasicBlock &BB : F) {
+ // Unreachable code can take on strange forms that we are not prepared to
+ // handle. For example, an instruction may have itself as an operand.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+
+ for (Instruction &I : BB) {
+ if (isa<VectorType>(I.getType()))
+ continue;
+ switch (I.getOpcode()) {
+ default: break;
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Roots.insert(&I);
+ break;
+ case Instruction::FCmp:
+ if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
+ CmpInst::BAD_ICMP_PREDICATE)
+ Roots.insert(&I);
+ break;
+ }
+ }
+ }
+}
+
+// Helper - mark I as having been traversed, having range R.
+void Float2IntPass::seen(Instruction *I, ConstantRange R) {
+ LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ auto IT = SeenInsts.find(I);
+ if (IT != SeenInsts.end())
+ IT->second = std::move(R);
+ else
+ SeenInsts.insert(std::make_pair(I, std::move(R)));
+}
+
+// Helper - get a range representing a poison value.
+ConstantRange Float2IntPass::badRange() {
+ return ConstantRange::getFull(MaxIntegerBW + 1);
+}
+ConstantRange Float2IntPass::unknownRange() {
+ return ConstantRange::getEmpty(MaxIntegerBW + 1);
+}
+ConstantRange Float2IntPass::validateRange(ConstantRange R) {
+ if (R.getBitWidth() > MaxIntegerBW + 1)
+ return badRange();
+ return R;
+}
+
+// The most obvious way to structure the search is a depth-first, eager
+// search from each root. However, that require direct recursion and so
+// can only handle small instruction sequences. Instead, we split the search
+// up into two phases:
+// - walkBackwards: A breadth-first walk of the use-def graph starting from
+// the roots. Populate "SeenInsts" with interesting
+// instructions and poison values if they're obvious and
+//                  cheap to compute. Calculate the equivalence set structure
+// while we're here too.
+// - walkForwards: Iterate over SeenInsts in reverse order, so we visit
+// defs before their uses. Calculate the real range info.
+
+// Breadth-first walk of the use-def graph; determine the set of nodes
+// we care about and eagerly determine if some of them are poisonous.
+void Float2IntPass::walkBackwards() {
+ std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ if (SeenInsts.find(I) != SeenInsts.end())
+ // Seen already.
+ continue;
+
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ // Path terminated uncleanly.
+ seen(I, badRange());
+ break;
+
+ case Instruction::UIToFP:
+ case Instruction::SIToFP: {
+ // Path terminated cleanly - use the type of the integer input to seed
+ // the analysis.
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ auto Input = ConstantRange::getFull(BW);
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ seen(I, validateRange(Input.castOp(CastOp, MaxIntegerBW+1)));
+ continue;
+ }
+
+ case Instruction::FNeg:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FCmp:
+ seen(I, unknownRange());
+ break;
+ }
+
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ // Unify def-use chains if they interfere.
+ ECs.unionSets(I, OI);
+ if (SeenInsts.find(I)->second != badRange())
+ Worklist.push_back(OI);
+ } else if (!isa<ConstantFP>(O)) {
+ // Not an instruction or ConstantFP? we can't do anything.
+ seen(I, badRange());
+ }
+ }
+ }
+}
+
+// Walk forwards down the list of seen instructions, so we visit defs before
+// uses.
+void Float2IntPass::walkForwards() {
+ for (auto &It : reverse(SeenInsts)) {
+ if (It.second != unknownRange())
+ continue;
+
+ Instruction *I = It.first;
+ std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+      llvm_unreachable("Should have been handled in walkBackwards!");
+
+ case Instruction::FNeg:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 1 && "FNeg is a unary operator!");
+ unsigned Size = Ops[0].getBitWidth();
+ auto Zero = ConstantRange(APInt::getNullValue(Size));
+ return Zero.sub(Ops[0]);
+ };
+ break;
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 2 && "it's a binary operator!");
+ auto BinOp = (Instruction::BinaryOps) I->getOpcode();
+ return Ops[0].binaryOp(BinOp, Ops[1]);
+ };
+ break;
+
+ //
+ // Root-only instructions - we'll only see these if they're the
+ // first node in a walk.
+ //
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
+        // Note: We're ignoring the cast's output size here as that's what the
+ // caller expects.
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ return Ops[0].castOp(CastOp, MaxIntegerBW+1);
+ };
+ break;
+
+ case Instruction::FCmp:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FCmp is a binary operator!");
+ return Ops[0].unionWith(Ops[1]);
+ };
+ break;
+ }
+
+ bool Abort = false;
+ SmallVector<ConstantRange,4> OpRanges;
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ assert(SeenInsts.find(OI) != SeenInsts.end() &&
+ "def not seen before use!");
+ OpRanges.push_back(SeenInsts.find(OI)->second);
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
+ // Work out if the floating point number can be losslessly represented
+ // as an integer.
+ // APFloat::convertToInteger(&Exact) purports to do what we want, but
+ // the exactness can be too precise. For example, negative zero can
+ // never be exactly converted to an integer.
+ //
+ // Instead, we ask APFloat to round itself to an integral value - this
+ // preserves sign-of-zero - then compare the result with the original.
+ //
+ const APFloat &F = CF->getValueAPF();
+
+ // First, weed out obviously incorrect values. Non-finite numbers
+ // can't be represented and neither can negative zero, unless
+ // we're in fast math mode.
+ if (!F.isFinite() ||
+ (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
+ !I->hasNoSignedZeros())) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+
+ APFloat NewF = F;
+ auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (Res != APFloat::opOK || NewF != F) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+ // OK, it's representable. Now get it.
+ APSInt Int(MaxIntegerBW+1, false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Int,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ OpRanges.push_back(ConstantRange(Int));
+ } else {
+ llvm_unreachable("Should have already marked this as badRange!");
+ }
+ }
+
+ // Reduce the operands' ranges to a single range and return.
+ if (!Abort)
+ seen(I, Op(OpRanges));
+ }
+}
+
+// If there is a valid transform to be done, do it.
+bool Float2IntPass::validateAndTransform() {
+ bool MadeChange = false;
+
+ // Iterate over every disjoint partition of the def-use graph.
+ for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
+ ConstantRange R(MaxIntegerBW + 1, false);
+ bool Fail = false;
+ Type *ConvertedToTy = nullptr;
+
+ // For every member of the partition, union all the ranges together.
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI) {
+ Instruction *I = *MI;
+ auto SeenI = SeenInsts.find(I);
+ if (SeenI == SeenInsts.end())
+ continue;
+
+ R = R.unionWith(SeenI->second);
+ // We need to ensure I has no users that have not been seen.
+ // If it does, transformation would be illegal.
+ //
+ // Don't count the roots, as they terminate the graphs.
+ if (Roots.count(I) == 0) {
+ // Set the type of the conversion while we're here.
+ if (!ConvertedToTy)
+ ConvertedToTy = I->getType();
+ for (User *U : I->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
+ LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ Fail = true;
+ break;
+ }
+ }
+ }
+ if (Fail)
+ break;
+ }
+
+ // If the set was empty, or we failed, or the range is poisonous,
+ // bail out.
+ if (ECs.member_begin(It) == ECs.member_end() || Fail ||
+ R.isFullSet() || R.isSignWrappedSet())
+ continue;
+ assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
+
+ // The number of bits required is the maximum of the upper and
+ // lower limits, plus one so it can be signed.
+ unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
+ R.getUpper().getMinSignedBits()) + 1;
+ LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+
+ // If we've run off the realms of the exactly representable integers,
+ // the floating point result will differ from an integer approximation.
+
+ // Do we need more bits than are in the mantissa of the type we converted
+ // to? semanticsPrecision returns the number of mantissa bits plus one
+ // for the sign bit.
+ unsigned MaxRepresentableBits
+ = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
+ if (MinBW > MaxRepresentableBits) {
+ LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ continue;
+ }
+ if (MinBW > 64) {
+ LLVM_DEBUG(
+ dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ continue;
+ }
+
+ // OK, R is known to be representable. Now pick a type for it.
+ // FIXME: Pick the smallest legal type that will fit.
+ Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI)
+ convert(*MI, Ty);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
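The bit-width computation above is the crux of the legality check, so here it is restated on its own. This is a hedged sketch using APInt directly; the helper name and example bounds are invented, not part of the pass.

#include "llvm/ADT/APInt.h"
#include <algorithm>
using namespace llvm;

// MinBW as validateAndTransform computes it: enough bits for the wider of
// the two signed bounds, plus one spare bit so the value stays signed.
unsigned minSignedBitsForRange(const APInt &Lower, const APInt &Upper) {
  return std::max(Lower.getMinSignedBits(), Upper.getMinSignedBits()) + 1;
}

For example, bounds that fit in 9 signed bits give MinBW = 10, which is no more than 32, so the loop above would convert that whole partition to i32.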
+Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
+ if (ConvertedInsts.find(I) != ConvertedInsts.end())
+ // Already converted this instruction.
+ return ConvertedInsts[I];
+
+ SmallVector<Value*,4> NewOperands;
+ for (Value *V : I->operands()) {
+ // Don't recurse if we're an instruction that terminates the path.
+ if (I->getOpcode() == Instruction::UIToFP ||
+ I->getOpcode() == Instruction::SIToFP) {
+ NewOperands.push_back(V);
+ } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
+ NewOperands.push_back(convert(VI, ToTy));
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+ APSInt Val(ToTy->getPrimitiveSizeInBits(), /*isUnsigned=*/false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Val,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ NewOperands.push_back(ConstantInt::get(ToTy, Val));
+ } else {
+ llvm_unreachable("Unhandled operand type?");
+ }
+ }
+
+ // Now create a new instruction.
+ IRBuilder<> IRB(I);
+ Value *NewV = nullptr;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unhandled instruction!");
+
+ case Instruction::FPToUI:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FPToSI:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FCmp: {
+ CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
+ assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
+ NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
+ break;
+ }
+
+ case Instruction::UIToFP:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::SIToFP:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::FNeg:
+ NewV = IRB.CreateNeg(NewOperands[0], I->getName());
+ break;
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
+ NewOperands[0], NewOperands[1],
+ I->getName());
+ break;
+ }
+
+ // If we're a root instruction, RAUW.
+ if (Roots.count(I))
+ I->replaceAllUsesWith(NewV);
+
+ ConvertedInsts[I] = NewV;
+ return NewV;
+}
+
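One non-obvious detail in convert(): CreateSExtOrTrunc and CreateZExtOrTrunc return the operand unchanged when the types already match, so no equal-width special case is needed. A small hypothetical usage of the same IRBuilder helpers, not taken from the patch:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Widen (or narrow) an integer value to i64 at the given insertion point,
// mirroring how convert() rewrites the operand of an SIToFP root.
Value *toI64Signed(Instruction *InsertBefore, Value *V) {
  IRBuilder<> IRB(InsertBefore);
  return IRB.CreateSExtOrTrunc(V, IRB.getInt64Ty());
}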
+// Perform dead code elimination on the instructions we just modified.
+void Float2IntPass::cleanup() {
+ for (auto &I : reverse(ConvertedInsts))
+ I.first->eraseFromParent();
+}
+
+bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
+ LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ // Clear out all state.
+ ECs = EquivalenceClasses<Instruction*>();
+ SeenInsts.clear();
+ ConvertedInsts.clear();
+ Roots.clear();
+
+ Ctx = &F.getParent()->getContext();
+
+ findRoots(F, DT);
+
+ walkBackwards();
+ walkForwards();
+
+ bool Modified = validateAndTransform();
+ if (Modified)
+ cleanup();
+ return Modified;
+}
+
+namespace llvm {
+FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
+
+PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, DT))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // End namespace llvm
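createFloat2IntPass() above only serves the legacy pass manager; under the new pass manager the pass is scheduled through a FunctionPassManager instead. A rough sketch of that wiring, assuming the standard LLVM 12 PassBuilder setup (the function name is illustrative):

#include "llvm/IR/Function.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/Float2Int.h"
using namespace llvm;

void runFloat2IntOn(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register all analyses so DominatorTreeAnalysis is available to the pass.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(Float2IntPass());
  FPM.run(F, FAM);
}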
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp
index 90795c40d6..c6b6d75aef 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp
@@ -1,104 +1,104 @@
-//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs global value numbering to eliminate fully redundant
-// instructions. It also performs simple dead load elimination.
-//
-// Note that this pass does the value numbering itself; it does not use the
-// ValueNumbering analysis passes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions. It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PHITransAddr.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/VNCoercion.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::gvn;
-using namespace llvm::VNCoercion;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "gvn"
-
-STATISTIC(NumGVNInstr, "Number of instructions deleted");
-STATISTIC(NumGVNLoad, "Number of loads deleted");
-STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
-STATISTIC(NumGVNBlocks, "Number of blocks merged");
-STATISTIC(NumGVNSimpl, "Number of instructions simplified");
-STATISTIC(NumGVNEqProp, "Number of equalities propagated");
-STATISTIC(NumPRELoad, "Number of loads PRE'd");
-
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "gvn"
+
+STATISTIC(NumGVNInstr, "Number of instructions deleted");
+STATISTIC(NumGVNLoad, "Number of loads deleted");
+STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumGVNSimpl, "Number of instructions simplified");
+STATISTIC(NumGVNEqProp, "Number of equalities propagated");
+STATISTIC(NumPRELoad, "Number of loads PRE'd");
+
STATISTIC(IsValueFullyAvailableInBlockNumSpeculationsMax,
"Number of blocks speculated as available in "
"IsValueFullyAvailableInBlock(), max");
@@ -106,19 +106,19 @@ STATISTIC(MaxBBSpeculationCutoffReachedTimes,
"Number of times we we reached gvn-max-block-speculations cut-off "
"preventing further exploration");
-static cl::opt<bool> GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden);
-static cl::opt<bool> GVNEnableLoadPRE("enable-load-pre", cl::init(true));
-static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
- cl::init(true));
+static cl::opt<bool> GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden);
+static cl::opt<bool> GVNEnableLoadPRE("enable-load-pre", cl::init(true));
+static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
+ cl::init(true));
static cl::opt<bool>
GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre",
cl::init(true));
-static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
-
-static cl::opt<uint32_t> MaxNumDeps(
- "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore,
- cl::desc("Max number of dependences to attempt Load PRE (default = 100)"));
-
+static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
+
+static cl::opt<uint32_t> MaxNumDeps(
+ "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore,
+ cl::desc("Max number of dependences to attempt Load PRE (default = 100)"));
+
// This is based on IsValueFullyAvailableInBlockNumSpeculationsMax stat.
static cl::opt<uint32_t> MaxBBSpeculations(
"gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::ZeroOrMore,
@@ -126,570 +126,570 @@ static cl::opt<uint32_t> MaxBBSpeculations(
"into) when deducing if a value is fully available or not in GVN "
"(default = 600)"));
-struct llvm::GVN::Expression {
- uint32_t opcode;
- bool commutative = false;
- Type *type = nullptr;
- SmallVector<uint32_t, 4> varargs;
-
- Expression(uint32_t o = ~2U) : opcode(o) {}
-
- bool operator==(const Expression &other) const {
- if (opcode != other.opcode)
- return false;
- if (opcode == ~0U || opcode == ~1U)
- return true;
- if (type != other.type)
- return false;
- if (varargs != other.varargs)
- return false;
- return true;
- }
-
- friend hash_code hash_value(const Expression &Value) {
- return hash_combine(
- Value.opcode, Value.type,
- hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
- }
-};
-
-namespace llvm {
-
-template <> struct DenseMapInfo<GVN::Expression> {
- static inline GVN::Expression getEmptyKey() { return ~0U; }
- static inline GVN::Expression getTombstoneKey() { return ~1U; }
-
- static unsigned getHashValue(const GVN::Expression &e) {
- using llvm::hash_value;
-
- return static_cast<unsigned>(hash_value(e));
- }
-
- static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
- return LHS == RHS;
- }
-};
-
-} // end namespace llvm
-
-/// Represents a particular available value that we know how to materialize.
-/// Materialization of an AvailableValue never fails. An AvailableValue is
-/// implicitly associated with a rematerialization point which is the
-/// location of the instruction from which it was formed.
-struct llvm::gvn::AvailableValue {
- enum ValType {
- SimpleVal, // A simple offsetted value that is accessed.
- LoadVal, // A value produced by a load.
- MemIntrin, // A memory intrinsic which is loaded from.
- UndefVal // A UndefValue representing a value from dead block (which
- // is not yet physically removed from the CFG).
- };
-
- /// V - The value that is live out of the block.
- PointerIntPair<Value *, 2, ValType> Val;
-
- /// Offset - The byte offset in Val that is interesting for the load query.
- unsigned Offset = 0;
-
- static AvailableValue get(Value *V, unsigned Offset = 0) {
- AvailableValue Res;
- Res.Val.setPointer(V);
- Res.Val.setInt(SimpleVal);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) {
- AvailableValue Res;
- Res.Val.setPointer(MI);
- Res.Val.setInt(MemIntrin);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) {
- AvailableValue Res;
- Res.Val.setPointer(LI);
- Res.Val.setInt(LoadVal);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValue getUndef() {
- AvailableValue Res;
- Res.Val.setPointer(nullptr);
- Res.Val.setInt(UndefVal);
- Res.Offset = 0;
- return Res;
- }
-
- bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
- bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
- bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
- bool isUndefValue() const { return Val.getInt() == UndefVal; }
-
- Value *getSimpleValue() const {
- assert(isSimpleValue() && "Wrong accessor");
- return Val.getPointer();
- }
-
- LoadInst *getCoercedLoadValue() const {
- assert(isCoercedLoadValue() && "Wrong accessor");
- return cast<LoadInst>(Val.getPointer());
- }
-
- MemIntrinsic *getMemIntrinValue() const {
- assert(isMemIntrinValue() && "Wrong accessor");
- return cast<MemIntrinsic>(Val.getPointer());
- }
-
- /// Emit code at the specified insertion point to adjust the value defined
- /// here to the specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt,
- GVN &gvn) const;
-};
-
-/// Represents an AvailableValue which can be rematerialized at the end of
-/// the associated BasicBlock.
-struct llvm::gvn::AvailableValueInBlock {
- /// BB - The basic block in question.
- BasicBlock *BB = nullptr;
-
- /// AV - The actual available value
- AvailableValue AV;
-
- static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) {
- AvailableValueInBlock Res;
- Res.BB = BB;
- Res.AV = std::move(AV);
- return Res;
- }
-
- static AvailableValueInBlock get(BasicBlock *BB, Value *V,
- unsigned Offset = 0) {
- return get(BB, AvailableValue::get(V, Offset));
- }
-
- static AvailableValueInBlock getUndef(BasicBlock *BB) {
- return get(BB, AvailableValue::getUndef());
- }
-
- /// Emit code at the end of this block to adjust the value defined here to
- /// the specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const {
- return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn);
- }
-};
-
-//===----------------------------------------------------------------------===//
-// ValueTable Internal Functions
-//===----------------------------------------------------------------------===//
-
-GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
- Expression e;
- e.type = I->getType();
- e.opcode = I->getOpcode();
- for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
- OI != OE; ++OI)
- e.varargs.push_back(lookupOrAdd(*OI));
- if (I->isCommutative()) {
- // Ensure that commutative instructions that only differ by a permutation
- // of their operands get the same value number by sorting the operand value
+struct llvm::GVN::Expression {
+ uint32_t opcode;
+ bool commutative = false;
+ Type *type = nullptr;
+ SmallVector<uint32_t, 4> varargs;
+
+ Expression(uint32_t o = ~2U) : opcode(o) {}
+
+ bool operator==(const Expression &other) const {
+ if (opcode != other.opcode)
+ return false;
+ if (opcode == ~0U || opcode == ~1U)
+ return true;
+ if (type != other.type)
+ return false;
+ if (varargs != other.varargs)
+ return false;
+ return true;
+ }
+
+ friend hash_code hash_value(const Expression &Value) {
+ return hash_combine(
+ Value.opcode, Value.type,
+ hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
+ }
+};
+
+namespace llvm {
+
+template <> struct DenseMapInfo<GVN::Expression> {
+ static inline GVN::Expression getEmptyKey() { return ~0U; }
+ static inline GVN::Expression getTombstoneKey() { return ~1U; }
+
+ static unsigned getHashValue(const GVN::Expression &e) {
+ using llvm::hash_value;
+
+ return static_cast<unsigned>(hash_value(e));
+ }
+
+ static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // end namespace llvm
+
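The DenseMapInfo specialization above follows the usual contract: a DenseMap key type must reserve two values that can never occur as real keys (here the ~0U and ~1U opcodes) and provide hashing plus equality. A generic, hypothetical illustration of the same pattern for an unrelated key type:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"

struct TinyKey { unsigned Tag = 0; }; // stand-in for GVN::Expression

namespace llvm {
template <> struct DenseMapInfo<TinyKey> {
  static TinyKey getEmptyKey() { return {~0U}; }     // reserved, never a real Tag
  static TinyKey getTombstoneKey() { return {~1U}; } // reserved, never a real Tag
  static unsigned getHashValue(const TinyKey &K) {
    return static_cast<unsigned>(hash_value(K.Tag));
  }
  static bool isEqual(const TinyKey &L, const TinyKey &R) {
    return L.Tag == R.Tag;
  }
};
} // end namespace llvm

// llvm::DenseMap<TinyKey, int> is now a usable map type.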
+/// Represents a particular available value that we know how to materialize.
+/// Materialization of an AvailableValue never fails. An AvailableValue is
+/// implicitly associated with a rematerialization point which is the
+/// location of the instruction from which it was formed.
+struct llvm::gvn::AvailableValue {
+ enum ValType {
+ SimpleVal, // A simple offsetted value that is accessed.
+ LoadVal, // A value produced by a load.
+ MemIntrin, // A memory intrinsic which is loaded from.
+ UndefVal // A UndefValue representing a value from dead block (which
+ // is not yet physically removed from the CFG).
+ };
+
+ /// V - The value that is live out of the block.
+ PointerIntPair<Value *, 2, ValType> Val;
+
+ /// Offset - The byte offset in Val that is interesting for the load query.
+ unsigned Offset = 0;
+
+ static AvailableValue get(Value *V, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(V);
+ Res.Val.setInt(SimpleVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(MI);
+ Res.Val.setInt(MemIntrin);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(LI);
+ Res.Val.setInt(LoadVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getUndef() {
+ AvailableValue Res;
+ Res.Val.setPointer(nullptr);
+ Res.Val.setInt(UndefVal);
+ Res.Offset = 0;
+ return Res;
+ }
+
+ bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+ bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+ bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+ bool isUndefValue() const { return Val.getInt() == UndefVal; }
+
+ Value *getSimpleValue() const {
+ assert(isSimpleValue() && "Wrong accessor");
+ return Val.getPointer();
+ }
+
+ LoadInst *getCoercedLoadValue() const {
+ assert(isCoercedLoadValue() && "Wrong accessor");
+ return cast<LoadInst>(Val.getPointer());
+ }
+
+ MemIntrinsic *getMemIntrinValue() const {
+ assert(isMemIntrinValue() && "Wrong accessor");
+ return cast<MemIntrinsic>(Val.getPointer());
+ }
+
+ /// Emit code at the specified insertion point to adjust the value defined
+ /// here to the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt,
+ GVN &gvn) const;
+};
+
+/// Represents an AvailableValue which can be rematerialized at the end of
+/// the associated BasicBlock.
+struct llvm::gvn::AvailableValueInBlock {
+ /// BB - The basic block in question.
+ BasicBlock *BB = nullptr;
+
+ /// AV - The actual available value
+ AvailableValue AV;
+
+ static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) {
+ AvailableValueInBlock Res;
+ Res.BB = BB;
+ Res.AV = std::move(AV);
+ return Res;
+ }
+
+ static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+ unsigned Offset = 0) {
+ return get(BB, AvailableValue::get(V, Offset));
+ }
+
+ static AvailableValueInBlock getUndef(BasicBlock *BB) {
+ return get(BB, AvailableValue::getUndef());
+ }
+
+ /// Emit code at the end of this block to adjust the value defined here to
+ /// the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const {
+ return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
+ Expression e;
+ e.type = I->getType();
+ e.opcode = I->getOpcode();
+ for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
// numbers. Since commutative operands are the 1st two operands it is more
- // efficient to sort by hand rather than using, say, std::sort.
+ // efficient to sort by hand rather than using, say, std::sort.
assert(I->getNumOperands() >= 2 && "Unsupported commutative instruction!");
- if (e.varargs[0] > e.varargs[1])
- std::swap(e.varargs[0], e.varargs[1]);
- e.commutative = true;
- }
-
- if (auto *C = dyn_cast<CmpInst>(I)) {
- // Sort the operand value numbers so x<y and y>x get the same value number.
- CmpInst::Predicate Predicate = C->getPredicate();
- if (e.varargs[0] > e.varargs[1]) {
- std::swap(e.varargs[0], e.varargs[1]);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
- }
- e.opcode = (C->getOpcode() << 8) | Predicate;
- e.commutative = true;
- } else if (auto *E = dyn_cast<InsertValueInst>(I)) {
- e.varargs.append(E->idx_begin(), E->idx_end());
- } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
- ArrayRef<int> ShuffleMask = SVI->getShuffleMask();
- e.varargs.append(ShuffleMask.begin(), ShuffleMask.end());
- }
-
- return e;
-}
-
-GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
- assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
- "Not a comparison!");
- Expression e;
- e.type = CmpInst::makeCmpResultType(LHS->getType());
- e.varargs.push_back(lookupOrAdd(LHS));
- e.varargs.push_back(lookupOrAdd(RHS));
-
- // Sort the operand value numbers so x<y and y>x get the same value number.
- if (e.varargs[0] > e.varargs[1]) {
- std::swap(e.varargs[0], e.varargs[1]);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
- }
- e.opcode = (Opcode << 8) | Predicate;
- e.commutative = true;
- return e;
-}
-
-GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
- assert(EI && "Not an ExtractValueInst?");
- Expression e;
- e.type = EI->getType();
- e.opcode = 0;
-
- WithOverflowInst *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
- if (WO != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
- // EI is an extract from one of our with.overflow intrinsics. Synthesize
- // a semantically equivalent expression instead of an extract value
- // expression.
- e.opcode = WO->getBinaryOp();
- e.varargs.push_back(lookupOrAdd(WO->getLHS()));
- e.varargs.push_back(lookupOrAdd(WO->getRHS()));
- return e;
- }
-
- // Not a recognised intrinsic. Fall back to producing an extract value
- // expression.
- e.opcode = EI->getOpcode();
- for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
- OI != OE; ++OI)
- e.varargs.push_back(lookupOrAdd(*OI));
-
+ if (e.varargs[0] > e.varargs[1])
+ std::swap(e.varargs[0], e.varargs[1]);
+ e.commutative = true;
+ }
+
+ if (auto *C = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ CmpInst::Predicate Predicate = C->getPredicate();
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (C->getOpcode() << 8) | Predicate;
+ e.commutative = true;
+ } else if (auto *E = dyn_cast<InsertValueInst>(I)) {
+ e.varargs.append(E->idx_begin(), E->idx_end());
+ } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+ ArrayRef<int> ShuffleMask = SVI->getShuffleMask();
+ e.varargs.append(ShuffleMask.begin(), ShuffleMask.end());
+ }
+
+ return e;
+}
+
+GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Not a comparison!");
+ Expression e;
+ e.type = CmpInst::makeCmpResultType(LHS->getType());
+ e.varargs.push_back(lookupOrAdd(LHS));
+ e.varargs.push_back(lookupOrAdd(RHS));
+
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (Opcode << 8) | Predicate;
+ e.commutative = true;
+ return e;
+}
+
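The swap-and-flip step in createExpr and createCmpExpr is what lets x < y and y > x share a value number. Pulled out of context as a hedged sketch; the Key struct and function name below are invented for illustration:

#include "llvm/IR/InstrTypes.h"
#include <cstdint>
#include <utility>
using namespace llvm;

struct Key { uint32_t Opcode; uint32_t LHS, RHS; };

// Canonicalize a compare so commuted forms produce the same key.
Key makeCmpKey(unsigned Opcode, CmpInst::Predicate Pred,
               uint32_t LHSNum, uint32_t RHSNum) {
  if (LHSNum > RHSNum) {
    std::swap(LHSNum, RHSNum);
    Pred = CmpInst::getSwappedPredicate(Pred); // e.g. SLT becomes SGT
  }
  return {(Opcode << 8) | Pred, LHSNum, RHSNum};
}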
+GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
+ assert(EI && "Not an ExtractValueInst?");
+ Expression e;
+ e.type = EI->getType();
+ e.opcode = 0;
+
+ WithOverflowInst *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
+ if (WO != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
+ // EI is an extract from one of our with.overflow intrinsics. Synthesize
+ // a semantically equivalent expression instead of an extract value
+ // expression.
+ e.opcode = WO->getBinaryOp();
+ e.varargs.push_back(lookupOrAdd(WO->getLHS()));
+ e.varargs.push_back(lookupOrAdd(WO->getRHS()));
+ return e;
+ }
+
+ // Not a recognised intrinsic. Fall back to producing an extract value
+ // expression.
+ e.opcode = EI->getOpcode();
+ for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+
append_range(e.varargs, EI->indices());
-
- return e;
-}
-
-//===----------------------------------------------------------------------===//
-// ValueTable External Functions
-//===----------------------------------------------------------------------===//
-
-GVN::ValueTable::ValueTable() = default;
-GVN::ValueTable::ValueTable(const ValueTable &) = default;
-GVN::ValueTable::ValueTable(ValueTable &&) = default;
-GVN::ValueTable::~ValueTable() = default;
-GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default;
-
-/// add - Insert a value into the table with a specified value number.
-void GVN::ValueTable::add(Value *V, uint32_t num) {
- valueNumbering.insert(std::make_pair(V, num));
- if (PHINode *PN = dyn_cast<PHINode>(V))
- NumberingPhi[num] = PN;
-}
-
-uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
- if (AA->doesNotAccessMemory(C)) {
- Expression exp = createExpr(C);
- uint32_t e = assignExpNewValueNum(exp).first;
- valueNumbering[C] = e;
- return e;
- } else if (MD && AA->onlyReadsMemory(C)) {
- Expression exp = createExpr(C);
- auto ValNum = assignExpNewValueNum(exp);
- if (ValNum.second) {
- valueNumbering[C] = ValNum.first;
- return ValNum.first;
- }
-
- MemDepResult local_dep = MD->getDependency(C);
-
- if (!local_dep.isDef() && !local_dep.isNonLocal()) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-
- if (local_dep.isDef()) {
+
+ return e;
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+GVN::ValueTable::ValueTable() = default;
+GVN::ValueTable::ValueTable(const ValueTable &) = default;
+GVN::ValueTable::ValueTable(ValueTable &&) = default;
+GVN::ValueTable::~ValueTable() = default;
+GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default;
+
+/// add - Insert a value into the table with a specified value number.
+void GVN::ValueTable::add(Value *V, uint32_t num) {
+ valueNumbering.insert(std::make_pair(V, num));
+ if (PHINode *PN = dyn_cast<PHINode>(V))
+ NumberingPhi[num] = PN;
+}
+
+uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
+ if (AA->doesNotAccessMemory(C)) {
+ Expression exp = createExpr(C);
+ uint32_t e = assignExpNewValueNum(exp).first;
+ valueNumbering[C] = e;
+ return e;
+ } else if (MD && AA->onlyReadsMemory(C)) {
+ Expression exp = createExpr(C);
+ auto ValNum = assignExpNewValueNum(exp);
+ if (ValNum.second) {
+ valueNumbering[C] = ValNum.first;
+ return ValNum.first;
+ }
+
+ MemDepResult local_dep = MD->getDependency(C);
+
+ if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (local_dep.isDef()) {
       // For masked load/store intrinsics, the local_dep may actually be
// a normal load or store instruction.
CallInst *local_cdep = dyn_cast<CallInst>(local_dep.getInst());
-
+
if (!local_cdep ||
local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-
- for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
- uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
- uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
- if (c_vn != cd_vn) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
- }
-
- uint32_t v = lookupOrAdd(local_cdep);
- valueNumbering[C] = v;
- return v;
- }
-
- // Non-local case.
- const MemoryDependenceResults::NonLocalDepInfo &deps =
- MD->getNonLocalCallDependency(C);
- // FIXME: Move the checking logic to MemDep!
- CallInst* cdep = nullptr;
-
- // Check to see if we have a single dominating call instruction that is
- // identical to C.
- for (unsigned i = 0, e = deps.size(); i != e; ++i) {
- const NonLocalDepEntry *I = &deps[i];
- if (I->getResult().isNonLocal())
- continue;
-
- // We don't handle non-definitions. If we already have a call, reject
- // instruction dependencies.
- if (!I->getResult().isDef() || cdep != nullptr) {
- cdep = nullptr;
- break;
- }
-
- CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
- // FIXME: All duplicated with non-local case.
- if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
- cdep = NonLocalDepCall;
- continue;
- }
-
- cdep = nullptr;
- break;
- }
-
- if (!cdep) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-
- if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
- for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
- uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
- uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
- if (c_vn != cd_vn) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
- }
-
- uint32_t v = lookupOrAdd(cdep);
- valueNumbering[C] = v;
- return v;
- } else {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-}
-
-/// Returns true if a value number exists for the specified value.
-bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
-
-/// lookup_or_add - Returns the value number for the specified value, assigning
-/// it a new number if it did not have one before.
-uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
- DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
- if (VI != valueNumbering.end())
- return VI->second;
-
- if (!isa<Instruction>(V)) {
- valueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- Instruction* I = cast<Instruction>(V);
- Expression exp;
- switch (I->getOpcode()) {
- case Instruction::Call:
- return lookupOrAddCall(cast<CallInst>(I));
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::AddrSpaceCast:
- case Instruction::BitCast:
- case Instruction::Select:
- case Instruction::Freeze:
- case Instruction::ExtractElement:
- case Instruction::InsertElement:
- case Instruction::ShuffleVector:
- case Instruction::InsertValue:
- case Instruction::GetElementPtr:
- exp = createExpr(I);
- break;
- case Instruction::ExtractValue:
- exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
- break;
- case Instruction::PHI:
- valueNumbering[V] = nextValueNumber;
- NumberingPhi[nextValueNumber] = cast<PHINode>(V);
- return nextValueNumber++;
- default:
- valueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- uint32_t e = assignExpNewValueNum(exp).first;
- valueNumbering[V] = e;
- return e;
-}
-
-/// Returns the value number of the specified value. Fails if
-/// the value has not yet been numbered.
-uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
- DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
- if (Verify) {
- assert(VI != valueNumbering.end() && "Value not numbered?");
- return VI->second;
- }
- return (VI != valueNumbering.end()) ? VI->second : 0;
-}
-
-/// Returns the value number of the given comparison,
-/// assigning it a new number if it did not have one before. Useful when
-/// we deduced the result of a comparison, but don't immediately have an
-/// instruction realizing that comparison to hand.
-uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
- Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
- return assignExpNewValueNum(exp).first;
-}
-
-/// Remove all entries from the ValueTable.
-void GVN::ValueTable::clear() {
- valueNumbering.clear();
- expressionNumbering.clear();
- NumberingPhi.clear();
- PhiTranslateTable.clear();
- nextValueNumber = 1;
- Expressions.clear();
- ExprIdx.clear();
- nextExprNumber = 0;
-}
-
-/// Remove a value from the value numbering.
-void GVN::ValueTable::erase(Value *V) {
- uint32_t Num = valueNumbering.lookup(V);
- valueNumbering.erase(V);
- // If V is PHINode, V <--> value number is an one-to-one mapping.
- if (isa<PHINode>(V))
- NumberingPhi.erase(Num);
-}
-
-/// verifyRemoved - Verify that the value is removed from all internal data
-/// structures.
-void GVN::ValueTable::verifyRemoved(const Value *V) const {
- for (DenseMap<Value*, uint32_t>::const_iterator
- I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
- assert(I->first != V && "Inst still occurs in value numbering map!");
- }
-}
-
-//===----------------------------------------------------------------------===//
-// GVN Pass
-//===----------------------------------------------------------------------===//
-
-bool GVN::isPREEnabled() const {
- return Options.AllowPRE.getValueOr(GVNEnablePRE);
-}
-
-bool GVN::isLoadPREEnabled() const {
- return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE);
-}
-
-bool GVN::isLoadInLoopPREEnabled() const {
- return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE);
-}
-
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(local_cdep);
+ valueNumbering[C] = v;
+ return v;
+ }
+
+ // Non-local case.
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(C);
+ // FIXME: Move the checking logic to MemDep!
+ CallInst* cdep = nullptr;
+
+ // Check to see if we have a single dominating call instruction that is
+ // identical to C.
+ for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+ const NonLocalDepEntry *I = &deps[i];
+ if (I->getResult().isNonLocal())
+ continue;
+
+ // We don't handle non-definitions. If we already have a call, reject
+ // instruction dependencies.
+ if (!I->getResult().isDef() || cdep != nullptr) {
+ cdep = nullptr;
+ break;
+ }
+
+ CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
+ // FIXME: All duplicated with non-local case.
+ if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
+ cdep = NonLocalDepCall;
+ continue;
+ }
+
+ cdep = nullptr;
+ break;
+ }
+
+ if (!cdep) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(cdep);
+ valueNumbering[C] = v;
+ return v;
+ } else {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+}
+
+/// Returns true if a value number exists for the specified value.
+bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction* I = cast<Instruction>(V);
+ Expression exp;
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ return lookupOrAddCall(cast<CallInst>(I));
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::Freeze:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ case Instruction::ExtractValue:
+ exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
+ break;
+ case Instruction::PHI:
+ valueNumbering[V] = nextValueNumber;
+ NumberingPhi[nextValueNumber] = cast<PHINode>(V);
+ return nextValueNumber++;
+ default:
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = assignExpNewValueNum(exp).first;
+ valueNumbering[V] = e;
+ return e;
+}
+
+/// Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
+ DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
+ if (Verify) {
+ assert(VI != valueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+ return (VI != valueNumbering.end()) ? VI->second : 0;
+}
+
+/// Returns the value number of the given comparison,
+/// assigning it a new number if it did not have one before. Useful when
+/// we deduced the result of a comparison, but don't immediately have an
+/// instruction realizing that comparison to hand.
+uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
+ return assignExpNewValueNum(exp).first;
+}
+
+/// Remove all entries from the ValueTable.
+void GVN::ValueTable::clear() {
+ valueNumbering.clear();
+ expressionNumbering.clear();
+ NumberingPhi.clear();
+ PhiTranslateTable.clear();
+ nextValueNumber = 1;
+ Expressions.clear();
+ ExprIdx.clear();
+ nextExprNumber = 0;
+}
+
+/// Remove a value from the value numbering.
+void GVN::ValueTable::erase(Value *V) {
+ uint32_t Num = valueNumbering.lookup(V);
+ valueNumbering.erase(V);
+  // If V is a PHINode, V <--> value number is a one-to-one mapping.
+ if (isa<PHINode>(V))
+ NumberingPhi.erase(Num);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void GVN::ValueTable::verifyRemoved(const Value *V) const {
+ for (DenseMap<Value*, uint32_t>::const_iterator
+ I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+ assert(I->first != V && "Inst still occurs in value numbering map!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+bool GVN::isPREEnabled() const {
+ return Options.AllowPRE.getValueOr(GVNEnablePRE);
+}
+
+bool GVN::isLoadPREEnabled() const {
+ return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE);
+}
+
+bool GVN::isLoadInLoopPREEnabled() const {
+ return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE);
+}
+
bool GVN::isLoadPRESplitBackedgeEnabled() const {
return Options.AllowLoadPRESplitBackedge.getValueOr(
GVNEnableSplitBackedgeInLoadPRE);
}
-bool GVN::isMemDepEnabled() const {
- return Options.AllowMemDep.getValueOr(GVNEnableMemDep);
-}
-
-PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
- // FIXME: The order of evaluation of these 'getResult' calls is very
- // significant! Re-ordering these variables will cause GVN when run alone to
- // be less effective! We should fix memdep and basic-aa to not exhibit this
- // behavior, but until then don't change the order here.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto *MemDep =
- isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr;
- auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+bool GVN::isMemDepEnabled() const {
+ return Options.AllowMemDep.getValueOr(GVNEnableMemDep);
+}
+
+PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
+ // FIXME: The order of evaluation of these 'getResult' calls is very
+ // significant! Re-ordering these variables will cause GVN when run alone to
+ // be less effective! We should fix memdep and basic-aa to not exhibit this
+ // behavior, but until then don't change the order here.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto *MemDep =
+ isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr;
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto *MSSA = AM.getCachedResult<MemorySSAAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE,
MSSA ? &MSSA->getMSSA() : nullptr);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- PA.preserve<TargetLibraryAnalysis>();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
if (MSSA)
PA.preserve<MemorySSAAnalysis>();
- if (LI)
- PA.preserve<LoopAnalysis>();
- return PA;
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
- errs() << "{\n";
- for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
- E = d.end(); I != E; ++I) {
- errs() << I->first << "\n";
- I->second->dump();
- }
- errs() << "}\n";
-}
-#endif
-
+ if (LI)
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
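The isPREEnabled/isLoadPREEnabled/isMemDepEnabled accessors consult GVNOptions before falling back to the cl::opt defaults above, so a new-PM user can pin the behaviour per pass instance. A tentative sketch, assuming the GVNOptions setters declared in the LLVM 12 GVN.h header:

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/GVN.h"
using namespace llvm;

// Add a GVN pass with PRE, load PRE and MemDep forced on, overriding the
// -enable-pre / -enable-load-pre / -enable-gvn-memdep command-line defaults.
void addTunedGVN(FunctionPassManager &FPM) {
  GVNOptions Opts;
  Opts.setPRE(true).setLoadPRE(true).setMemDep(true);
  FPM.addPass(GVN(Opts));
}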
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
+ errs() << "{\n";
+ for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+ E = d.end(); I != E; ++I) {
+ errs() << I->first << "\n";
+ I->second->dump();
+ }
+ errs() << "}\n";
+}
+#endif
+
enum class AvailabilityState : char {
/// We know the block *is not* fully available. This is a fixpoint.
Unavailable = 0,
@@ -702,29 +702,29 @@ enum class AvailabilityState : char {
SpeculativelyAvailable = 2,
};
-/// Return true if we can prove that the value
-/// we're analyzing is fully available in the specified block. As we go, keep
-/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
-/// map is actually a tri-state map with the following values:
-/// 0) we know the block *is not* fully available.
-/// 1) we know the block *is* fully available.
-/// 2) we do not know whether the block is fully available or not, but we are
-/// currently speculating that it will be.
+/// Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block. As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
+/// map is actually a tri-state map with the following values:
+/// 0) we know the block *is not* fully available.
+/// 1) we know the block *is* fully available.
+/// 2) we do not know whether the block is fully available or not, but we are
+/// currently speculating that it will be.
static bool IsValueFullyAvailableInBlock(
BasicBlock *BB,
DenseMap<BasicBlock *, AvailabilityState> &FullyAvailableBlocks) {
SmallVector<BasicBlock *, 32> Worklist;
Optional<BasicBlock *> UnavailableBB;
-
+
// The number of times we didn't find an entry for a block in a map and
// optimistically inserted an entry marking block as speculatively available.
unsigned NumNewNewSpeculativelyAvailableBBs = 0;
-
+
#ifndef NDEBUG
SmallSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs;
SmallVector<BasicBlock *, 32> AvailableBBs;
#endif
-
+
Worklist.emplace_back(BB);
while (!Worklist.empty()) {
BasicBlock *CurrBB = Worklist.pop_back_val(); // LIFO - depth-first!
@@ -734,24 +734,24 @@ static bool IsValueFullyAvailableInBlock(
FullyAvailableBlocks.try_emplace(
CurrBB, AvailabilityState::SpeculativelyAvailable);
AvailabilityState &State = IV.first->second;
-
+
// Did the entry already exist for this block?
if (!IV.second) {
if (State == AvailabilityState::Unavailable) {
UnavailableBB = CurrBB;
break; // Backpropagate unavailability info.
}
-
+
#ifndef NDEBUG
AvailableBBs.emplace_back(CurrBB);
#endif
continue; // Don't recurse further, but continue processing worklist.
}
-
+
// No entry found for block.
++NumNewNewSpeculativelyAvailableBBs;
bool OutOfBudget = NumNewNewSpeculativelyAvailableBBs > MaxBBSpeculations;
-
+
// If we have exhausted our budget, mark this block as unavailable.
// Also, if this block has no predecessors, the value isn't live-in here.
if (OutOfBudget || pred_empty(CurrBB)) {
@@ -760,20 +760,20 @@ static bool IsValueFullyAvailableInBlock(
UnavailableBB = CurrBB;
break; // Backpropagate unavailability info.
}
-
+
// Tentatively consider this block as speculatively available.
#ifndef NDEBUG
NewSpeculativelyAvailableBBs.insert(CurrBB);
#endif
// And further recurse into block's predecessors, in depth-first order!
Worklist.append(pred_begin(CurrBB), pred_end(CurrBB));
- }
-
+ }
+
#if LLVM_ENABLE_STATS
IsValueFullyAvailableInBlockNumSpeculationsMax.updateMax(
NumNewNewSpeculativelyAvailableBBs);
#endif
-
+
// If the block isn't marked as fixpoint yet
// (the Unavailable and Available states are fixpoints)
auto MarkAsFixpointAndEnqueueSuccessors =
@@ -796,7 +796,7 @@ static bool IsValueFullyAvailableInBlock(
return;
}
};
-
+
if (UnavailableBB) {
// Okay, we have encountered an unavailable block.
// Mark speculatively available blocks reachable from UnavailableBB as
@@ -808,7 +808,7 @@ static bool IsValueFullyAvailableInBlock(
MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
AvailabilityState::Unavailable);
}
-
+
#ifndef NDEBUG
Worklist.clear();
for (BasicBlock *AvailableBB : AvailableBBs)
@@ -816,418 +816,418 @@ static bool IsValueFullyAvailableInBlock(
while (!Worklist.empty())
MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
AvailabilityState::Available);
-
+
assert(NewSpeculativelyAvailableBBs.empty() &&
"Must have fixed all the new speculatively available blocks.");
#endif
return !UnavailableBB;
-}
-
-/// Given a set of loads specified by ValuesPerBlock,
-/// construct SSA form, allowing us to eliminate LI. This returns the value
-/// that should be used at LI's definition site.
-static Value *ConstructSSAForLoadSet(LoadInst *LI,
- SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
- GVN &gvn) {
- // Check for the fully redundant, dominating load case. In this case, we can
- // just use the dominating value directly.
- if (ValuesPerBlock.size() == 1 &&
- gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
- LI->getParent())) {
- assert(!ValuesPerBlock[0].AV.isUndefValue() &&
- "Dead BB dominate this block");
- return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
- }
-
- // Otherwise, we have to construct SSA form.
- SmallVector<PHINode*, 8> NewPHIs;
- SSAUpdater SSAUpdate(&NewPHIs);
- SSAUpdate.Initialize(LI->getType(), LI->getName());
-
- for (const AvailableValueInBlock &AV : ValuesPerBlock) {
- BasicBlock *BB = AV.BB;
-
- if (SSAUpdate.HasValueForBlock(BB))
- continue;
-
- // If the value is the load that we will be eliminating, and the block it's
- // available in is the block that the load is in, then don't add it as
- // SSAUpdater will resolve the value to the relevant phi which may let it
- // avoid phi construction entirely if there's actually only one value.
- if (BB == LI->getParent() &&
- ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) ||
- (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI)))
- continue;
-
- SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
- }
-
- // Perform PHI construction.
- return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
-}
-
-Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
- Instruction *InsertPt,
- GVN &gvn) const {
- Value *Res;
- Type *LoadTy = LI->getType();
- const DataLayout &DL = LI->getModule()->getDataLayout();
- if (isSimpleValue()) {
- Res = getSimpleValue();
- if (Res->getType() != LoadTy) {
- Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
-
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
- << " " << *getSimpleValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
- }
- } else if (isCoercedLoadValue()) {
- LoadInst *Load = getCoercedLoadValue();
- if (Load->getType() == LoadTy && Offset == 0) {
- Res = Load;
- } else {
- Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
- // We would like to use gvn.markInstructionForDeletion here, but we can't
- // because the load is already memoized into the leader map table that GVN
- // tracks. It is potentially possible to remove the load from the table,
- // but then there all of the operations based on it would need to be
- // rehashed. Just leave the dead load around.
- gvn.getMemDep().removeInstruction(Load);
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset
- << " " << *getCoercedLoadValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
- }
- } else if (isMemIntrinValue()) {
- Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
- InsertPt, DL);
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
- << " " << *getMemIntrinValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
- } else {
- assert(isUndefValue() && "Should be UndefVal");
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
- return UndefValue::get(LoadTy);
- }
- assert(Res && "failed to materialize?");
- return Res;
-}
-
-static bool isLifetimeStart(const Instruction *Inst) {
- if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
- return II->getIntrinsicID() == Intrinsic::lifetime_start;
- return false;
-}
-
-/// Try to locate the three instruction involved in a missed
-/// load-elimination case that is due to an intervening store.
-static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
- DominatorTree *DT,
- OptimizationRemarkEmitter *ORE) {
- using namespace ore;
-
- User *OtherAccess = nullptr;
-
- OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
- R << "load of type " << NV("Type", LI->getType()) << " not eliminated"
- << setExtraArgs();
-
- for (auto *U : LI->getPointerOperand()->users())
- if (U != LI && (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
- DT->dominates(cast<Instruction>(U), LI)) {
- // FIXME: for now give up if there are multiple memory accesses that
- // dominate the load. We need further analysis to decide which one is
- // that we're forwarding from.
- if (OtherAccess)
- OtherAccess = nullptr;
- else
- OtherAccess = U;
- }
-
- if (OtherAccess)
- R << " in favor of " << NV("OtherAccess", OtherAccess);
-
- R << " because it is clobbered by " << NV("ClobberedBy", DepInfo.getInst());
-
- ORE->emit(R);
-}
-
-bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
- Value *Address, AvailableValue &Res) {
- assert((DepInfo.isDef() || DepInfo.isClobber()) &&
- "expected a local dependence");
- assert(LI->isUnordered() && "rules below are incorrect for ordered access");
-
- const DataLayout &DL = LI->getModule()->getDataLayout();
-
- Instruction *DepInst = DepInfo.getInst();
- if (DepInfo.isClobber()) {
- // If the dependence is to a store that writes to a superset of the bits
- // read by the load, we can extract the bits we need for the load from the
- // stored value.
- if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
- // Can't forward from non-atomic to atomic without violating memory model.
- if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
- int Offset =
- analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
- if (Offset != -1) {
- Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
- return true;
- }
- }
- }
-
- // Check to see if we have something like this:
- // load i32* P
- // load i8* (P+1)
- // if we have this, replace the later with an extraction from the former.
- if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
- // If this is a clobber and L is the first instruction in its block, then
- // we have the first instruction in the entry block.
- // Can't forward from non-atomic to atomic without violating memory model.
- if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
- int Offset =
- analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
-
- if (Offset != -1) {
- Res = AvailableValue::getLoad(DepLI, Offset);
- return true;
- }
- }
- }
-
- // If the clobbering value is a memset/memcpy/memmove, see if we can
- // forward a value on from it.
- if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
- if (Address && !LI->isAtomic()) {
- int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
- DepMI, DL);
- if (Offset != -1) {
- Res = AvailableValue::getMI(DepMI, Offset);
- return true;
- }
- }
- }
- // Nothing known about this clobber, have to be conservative
- LLVM_DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
- dbgs() << " is clobbered by " << *DepInst << '\n';);
- if (ORE->allowExtraAnalysis(DEBUG_TYPE))
- reportMayClobberedLoad(LI, DepInfo, DT, ORE);
-
- return false;
- }
- assert(DepInfo.isDef() && "follows from above");
-
- // Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI) ||
- // Loading immediately after lifetime begin -> undef.
- isLifetimeStart(DepInst)) {
- Res = AvailableValue::get(UndefValue::get(LI->getType()));
- return true;
- }
-
- // Loading from calloc (which zero initializes memory) -> zero
- if (isCallocLikeFn(DepInst, TLI)) {
- Res = AvailableValue::get(Constant::getNullValue(LI->getType()));
- return true;
- }
-
- if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
- // Reject loads and stores that are to the same address but are of
+}
+
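// A minimal, self-contained sketch (plain C++, not LLVM API code) of the
// availability fixed-point idea used by IsValueFullyAvailableInBlock above:
// walk backwards through predecessors, optimistically marking visited blocks
// as speculatively available, and fall back to Unavailable once a path with
// no definition is found. Block numbers and the helper name are illustrative
// only; the real pass additionally caps speculation and fixes speculative
// entries back to a final Available/Unavailable state afterwards.
#include <cstdio>
#include <map>
#include <vector>

enum class Avail { Unavailable, Available, Speculative };

static bool fullyAvailable(int BB, const std::map<int, std::vector<int>> &Preds,
                           std::map<int, Avail> &State) {
  auto It = State.find(BB);
  if (It != State.end())
    return It->second != Avail::Unavailable;
  State[BB] = Avail::Speculative; // optimistic assumption for cycles
  auto PI = Preds.find(BB);
  if (PI == Preds.end() || PI->second.empty()) {
    State[BB] = Avail::Unavailable; // reached entry without a definition
    return false;
  }
  for (int Pred : PI->second)
    if (!fullyAvailable(Pred, Preds, State)) {
      State[BB] = Avail::Unavailable;
      return false;
    }
  State[BB] = Avail::Available;
  return true;
}

int main() {
  // Diamond CFG: 0 -> {1, 2} -> 3; the value is available in blocks 1 and 2,
  // so it is fully available on every path into block 3.
  std::map<int, std::vector<int>> Preds = {{1, {0}}, {2, {0}}, {3, {1, 2}}};
  std::map<int, Avail> State = {{1, Avail::Available}, {2, Avail::Available}};
  std::printf("block 3 fully available: %d\n",
              fullyAvailable(3, Preds, State) ? 1 : 0);
}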
+/// Given a set of loads specified by ValuesPerBlock,
+/// construct SSA form, allowing us to eliminate LI. This returns the value
+/// that should be used at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
+ SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
+ GVN &gvn) {
+ // Check for the fully redundant, dominating load case. In this case, we can
+ // just use the dominating value directly.
+ if (ValuesPerBlock.size() == 1 &&
+ gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
+ LI->getParent())) {
+ assert(!ValuesPerBlock[0].AV.isUndefValue() &&
+ "Dead BB dominate this block");
+ return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
+ }
+
+ // Otherwise, we have to construct SSA form.
+ SmallVector<PHINode*, 8> NewPHIs;
+ SSAUpdater SSAUpdate(&NewPHIs);
+ SSAUpdate.Initialize(LI->getType(), LI->getName());
+
+ for (const AvailableValueInBlock &AV : ValuesPerBlock) {
+ BasicBlock *BB = AV.BB;
+
+ if (SSAUpdate.HasValueForBlock(BB))
+ continue;
+
+ // If the value is the load that we will be eliminating, and the block it's
+ // available in is the block that the load is in, then don't add it as
+ // SSAUpdater will resolve the value to the relevant phi which may let it
+ // avoid phi construction entirely if there's actually only one value.
+ if (BB == LI->getParent() &&
+ ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) ||
+ (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI)))
+ continue;
+
+ SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
+ }
+
+ // Perform PHI construction.
+ return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
+}
+
+Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
+ Instruction *InsertPt,
+ GVN &gvn) const {
+ Value *Res;
+ Type *LoadTy = LI->getType();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ if (isSimpleValue()) {
+ Res = getSimpleValue();
+ if (Res->getType() != LoadTy) {
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
+ << " " << *getSimpleValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ }
+ } else if (isCoercedLoadValue()) {
+ LoadInst *Load = getCoercedLoadValue();
+ if (Load->getType() == LoadTy && Offset == 0) {
+ Res = Load;
+ } else {
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset
+ << " " << *getCoercedLoadValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ }
+ } else if (isMemIntrinValue()) {
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ InsertPt, DL);
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+ << " " << *getMemIntrinValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ } else {
+ assert(isUndefValue() && "Should be UndefVal");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ return UndefValue::get(LoadTy);
+ }
+ assert(Res && "failed to materialize?");
+ return Res;
+}
+
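// A standalone sketch of the coercion that MaterializeAdjustedValue relies on
// (getStoreValueForLoad / getLoadValueForLoad): when a wider store covers a
// narrower load at a known byte offset, the loaded bits can be carved out of
// the stored value instead of re-reading memory. Little-endian layout is
// assumed here and the helper name is illustrative; the real helpers consult
// the DataLayout and also handle pointers and vectors.
#include <cstdint>
#include <cstdio>

// Extract LoadBytes bytes of Stored starting at byte Offset (little-endian).
static uint64_t extractAtOffset(uint64_t Stored, unsigned Offset,
                                unsigned LoadBytes) {
  uint64_t Shifted = Stored >> (8 * Offset);
  uint64_t Mask = LoadBytes >= 8 ? ~0ULL : ((1ULL << (8 * LoadBytes)) - 1);
  return Shifted & Mask;
}

int main() {
  // store i32 0x11223344 to P; a later "load i8 from (P+1)" forwards 0x33.
  std::printf("forwarded byte: 0x%llx\n",
              (unsigned long long)extractAtOffset(0x11223344u, 1, 1));
}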
+static bool isLifetimeStart(const Instruction *Inst) {
+ if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
+ return II->getIntrinsicID() == Intrinsic::lifetime_start;
+ return false;
+}
+
+/// Try to locate the three instructions involved in a missed
+/// load-elimination case that is due to an intervening store.
+static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
+ DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+
+ User *OtherAccess = nullptr;
+
+ OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
+ R << "load of type " << NV("Type", LI->getType()) << " not eliminated"
+ << setExtraArgs();
+
+ for (auto *U : LI->getPointerOperand()->users())
+ if (U != LI && (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+ DT->dominates(cast<Instruction>(U), LI)) {
+ // FIXME: for now give up if there are multiple memory accesses that
+ // dominate the load. We need further analysis to decide which one
+ // we're forwarding from.
+ if (OtherAccess)
+ OtherAccess = nullptr;
+ else
+ OtherAccess = U;
+ }
+
+ if (OtherAccess)
+ R << " in favor of " << NV("OtherAccess", OtherAccess);
+
+ R << " because it is clobbered by " << NV("ClobberedBy", DepInfo.getInst());
+
+ ORE->emit(R);
+}
+
+bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
+ Value *Address, AvailableValue &Res) {
+ assert((DepInfo.isDef() || DepInfo.isClobber()) &&
+ "expected a local dependence");
+ assert(LI->isUnordered() && "rules below are incorrect for ordered access");
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ Instruction *DepInst = DepInfo.getInst();
+ if (DepInfo.isClobber()) {
+ // If the dependence is to a store that writes to a superset of the bits
+ // read by the load, we can extract the bits we need for the load from the
+ // stored value.
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
+ return true;
+ }
+ }
+ }
+
+ // Check to see if we have something like this:
+ // load i32* P
+ // load i8* (P+1)
+ // if we have this, replace the latter with an extraction from the former.
+ if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // If this is a clobber and L is the first instruction in its block, then
+ // we have the first instruction in the entry block.
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+
+ if (Offset != -1) {
+ Res = AvailableValue::getLoad(DepLI, Offset);
+ return true;
+ }
+ }
+ }
+
+ // If the clobbering value is a memset/memcpy/memmove, see if we can
+ // forward a value on from it.
+ if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ if (Address && !LI->isAtomic()) {
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ DepMI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::getMI(DepMI, Offset);
+ return true;
+ }
+ }
+ }
+ // Nothing known about this clobber, have to be conservative
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ dbgs() << " is clobbered by " << *DepInst << '\n';);
+ if (ORE->allowExtraAnalysis(DEBUG_TYPE))
+ reportMayClobberedLoad(LI, DepInfo, DT, ORE);
+
+ return false;
+ }
+ assert(DepInfo.isDef() && "follows from above");
+
+ // Loading the allocation -> undef.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ isAlignedAllocLikeFn(DepInst, TLI) ||
+ // Loading immediately after lifetime begin -> undef.
+ isLifetimeStart(DepInst)) {
+ Res = AvailableValue::get(UndefValue::get(LI->getType()));
+ return true;
+ }
+
+ // Loading from calloc (which zero initializes memory) -> zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ Res = AvailableValue::get(Constant::getNullValue(LI->getType()));
+ return true;
+ }
+
+ if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+ // Reject loads and stores that are to the same address but are of
// different types if we have to. If the stored value is convertible to
- // the loaded value, we can reuse it.
- if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), LI->getType(),
- DL))
- return false;
-
- // Can't forward from non-atomic to atomic without violating memory model.
- if (S->isAtomic() < LI->isAtomic())
- return false;
-
- Res = AvailableValue::get(S->getValueOperand());
- return true;
- }
-
- if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
- // If the types mismatch and we can't handle it, reject reuse of the load.
- // If the stored value is larger or equal to the loaded value, we can reuse
- // it.
- if (!canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
- return false;
-
- // Can't forward from non-atomic to atomic without violating memory model.
- if (LD->isAtomic() < LI->isAtomic())
- return false;
-
- Res = AvailableValue::getLoad(LD);
- return true;
- }
-
- // Unknown def - must be conservative
- LLVM_DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
- dbgs() << " has unknown def " << *DepInst << '\n';);
- return false;
-}
-
-void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
- AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks) {
- // Filter out useless results (non-locals, etc). Keep track of the blocks
- // where we have a value available in repl, also keep track of whether we see
- // dependencies that produce an unknown value for the load (such as a call
- // that could potentially clobber the load).
- unsigned NumDeps = Deps.size();
- for (unsigned i = 0, e = NumDeps; i != e; ++i) {
- BasicBlock *DepBB = Deps[i].getBB();
- MemDepResult DepInfo = Deps[i].getResult();
-
- if (DeadBlocks.count(DepBB)) {
- // Dead dependent mem-op disguise as a load evaluating the same value
- // as the load in question.
- ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
- continue;
- }
-
- if (!DepInfo.isDef() && !DepInfo.isClobber()) {
- UnavailableBlocks.push_back(DepBB);
- continue;
- }
-
- // The address being loaded in this non-local block may not be the same as
- // the pointer operand of the load if PHI translation occurs. Make sure
- // to consider the right address.
- Value *Address = Deps[i].getAddress();
-
- AvailableValue AV;
- if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) {
- // subtlety: because we know this was a non-local dependency, we know
- // it's safe to materialize anywhere between the instruction within
- // DepInfo and the end of it's block.
- ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
- std::move(AV)));
- } else {
- UnavailableBlocks.push_back(DepBB);
- }
- }
-
- assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() &&
- "post condition violation");
-}
-
-bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks) {
- // Okay, we have *some* definitions of the value. This means that the value
- // is available in some of our (transitive) predecessors. Lets think about
- // doing PRE of this load. This will involve inserting a new load into the
- // predecessor when it's not available. We could do this in general, but
- // prefer to not increase code size. As such, we only do this when we know
- // that we only have to insert *one* load (which means we're basically moving
- // the load, not inserting a new one).
-
- SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
- UnavailableBlocks.end());
-
- // Let's find the first basic block with more than one predecessor. Walk
- // backwards through predecessors if needed.
- BasicBlock *LoadBB = LI->getParent();
- BasicBlock *TmpBB = LoadBB;
-
- // Check that there is no implicit control flow instructions above our load in
- // its block. If there is an instruction that doesn't always pass the
- // execution to the following instruction, then moving through it may become
- // invalid. For example:
- //
- // int arr[LEN];
- // int index = ???;
- // ...
- // guard(0 <= index && index < LEN);
- // use(arr[index]);
- //
- // It is illegal to move the array access to any point above the guard,
- // because if the index is out of bounds we should deoptimize rather than
- // access the array.
- // Check that there is no guard in this block above our instruction.
+ // the loaded value, we can reuse it.
+ if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), LI->getType(),
+ DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (S->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::get(S->getValueOperand());
+ return true;
+ }
+
+ if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+ // If the types mismatch and we can't handle it, reject reuse of the load.
+ // If the stored value is larger or equal to the loaded value, we can reuse
+ // it.
+ if (!canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LD->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::getLoad(LD);
+ return true;
+ }
+
+ // Unknown def - must be conservative
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';);
+ return false;
+}
+
+void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Filter out useless results (non-locals, etc). Keep track of the blocks
+ // where we have a value available in repl, also keep track of whether we see
+ // dependencies that produce an unknown value for the load (such as a call
+ // that could potentially clobber the load).
+ unsigned NumDeps = Deps.size();
+ for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+ BasicBlock *DepBB = Deps[i].getBB();
+ MemDepResult DepInfo = Deps[i].getResult();
+
+ if (DeadBlocks.count(DepBB)) {
+ // A dead dependent mem-op masquerades as a load evaluating the same value
+ // as the load in question.
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+ continue;
+ }
+
+ if (!DepInfo.isDef() && !DepInfo.isClobber()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ // The address being loaded in this non-local block may not be the same as
+ // the pointer operand of the load if PHI translation occurs. Make sure
+ // to consider the right address.
+ Value *Address = Deps[i].getAddress();
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) {
+ // subtlety: because we know this was a non-local dependency, we know
+ // it's safe to materialize anywhere between the instruction within
+ // DepInfo and the end of its block.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+ std::move(AV)));
+ } else {
+ UnavailableBlocks.push_back(DepBB);
+ }
+ }
+
+ assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() &&
+ "post condition violation");
+}
+
+bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Okay, we have *some* definitions of the value. This means that the value
+ // is available in some of our (transitive) predecessors. Let's think about
+ // doing PRE of this load. This will involve inserting a new load into the
+ // predecessor when it's not available. We could do this in general, but
+ // prefer to not increase code size. As such, we only do this when we know
+ // that we only have to insert *one* load (which means we're basically moving
+ // the load, not inserting a new one).
+
+ SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
+ UnavailableBlocks.end());
+
+ // Let's find the first basic block with more than one predecessor. Walk
+ // backwards through predecessors if needed.
+ BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *TmpBB = LoadBB;
+
+ // Check that there are no implicit control flow instructions above our load in
+ // its block. If there is an instruction that doesn't always pass the
+ // execution to the following instruction, then moving through it may become
+ // invalid. For example:
+ //
+ // int arr[LEN];
+ // int index = ???;
+ // ...
+ // guard(0 <= index && index < LEN);
+ // use(arr[index]);
+ //
+ // It is illegal to move the array access to any point above the guard,
+ // because if the index is out of bounds we should deoptimize rather than
+ // access the array.
+ // Check that there is no guard in this block above our instruction.
bool MustEnsureSafetyOfSpeculativeExecution =
ICF->isDominatedByICFIFromSameBlock(LI);
- while (TmpBB->getSinglePredecessor()) {
- TmpBB = TmpBB->getSinglePredecessor();
- if (TmpBB == LoadBB) // Infinite (unreachable) loop.
- return false;
- if (Blockers.count(TmpBB))
- return false;
-
- // If any of these blocks has more than one successor (i.e. if the edge we
- // just traversed was critical), then there are other paths through this
- // block along which the load may not be anticipated. Hoisting the load
- // above this block would be adding the load to execution paths along
- // which it was not previously executed.
- if (TmpBB->getTerminator()->getNumSuccessors() != 1)
- return false;
-
- // Check that there is no implicit control flow in a block above.
+ while (TmpBB->getSinglePredecessor()) {
+ TmpBB = TmpBB->getSinglePredecessor();
+ if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+ return false;
+ if (Blockers.count(TmpBB))
+ return false;
+
+ // If any of these blocks has more than one successor (i.e. if the edge we
+ // just traversed was critical), then there are other paths through this
+ // block along which the load may not be anticipated. Hoisting the load
+ // above this block would be adding the load to execution paths along
+ // which it was not previously executed.
+ if (TmpBB->getTerminator()->getNumSuccessors() != 1)
+ return false;
+
+ // Check that there is no implicit control flow in a block above.
MustEnsureSafetyOfSpeculativeExecution =
MustEnsureSafetyOfSpeculativeExecution || ICF->hasICF(TmpBB);
- }
-
- assert(TmpBB);
- LoadBB = TmpBB;
-
- // Check to see how many predecessors have the loaded value fully
- // available.
- MapVector<BasicBlock *, Value *> PredLoads;
+ }
+
+ assert(TmpBB);
+ LoadBB = TmpBB;
+
+ // Check to see how many predecessors have the loaded value fully
+ // available.
+ MapVector<BasicBlock *, Value *> PredLoads;
DenseMap<BasicBlock *, AvailabilityState> FullyAvailableBlocks;
- for (const AvailableValueInBlock &AV : ValuesPerBlock)
+ for (const AvailableValueInBlock &AV : ValuesPerBlock)
FullyAvailableBlocks[AV.BB] = AvailabilityState::Available;
- for (BasicBlock *UnavailableBB : UnavailableBlocks)
+ for (BasicBlock *UnavailableBB : UnavailableBlocks)
FullyAvailableBlocks[UnavailableBB] = AvailabilityState::Unavailable;
-
- SmallVector<BasicBlock *, 4> CriticalEdgePred;
- for (BasicBlock *Pred : predecessors(LoadBB)) {
- // If any predecessor block is an EH pad that does not allow non-PHI
- // instructions before the terminator, we can't PRE the load.
- if (Pred->getTerminator()->isEHPad()) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
+
+ SmallVector<BasicBlock *, 4> CriticalEdgePred;
+ for (BasicBlock *Pred : predecessors(LoadBB)) {
+ // If any predecessor block is an EH pad that does not allow non-PHI
+ // instructions before the terminator, we can't PRE the load.
+ if (Pred->getTerminator()->isEHPad()) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) {
- continue;
- }
-
- if (Pred->getTerminator()->getNumSuccessors() != 1) {
- if (isa<IndirectBrInst>(Pred->getTerminator())) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
- // FIXME: Can we support the fallthrough edge?
- if (isa<CallBrInst>(Pred->getTerminator())) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF CALLBR CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
- if (LoadBB->isEHPad()) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
+ continue;
+ }
+
+ if (Pred->getTerminator()->getNumSuccessors() != 1) {
+ if (isa<IndirectBrInst>(Pred->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ // FIXME: Can we support the fallthrough edge?
+ if (isa<CallBrInst>(Pred->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF CALLBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ if (LoadBB->isEHPad()) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
// Do not split backedge as it will break the canonical loop form.
if (!isLoadPRESplitBackedgeEnabled())
if (DT->dominates(LoadBB, Pred)) {
@@ -1238,25 +1238,25 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
}
- CriticalEdgePred.push_back(Pred);
- } else {
- // Only add the predecessors that will not be split for now.
- PredLoads[Pred] = nullptr;
- }
- }
-
- // Decide whether PRE is profitable for this load.
- unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
- assert(NumUnavailablePreds != 0 &&
- "Fully available value should already be eliminated!");
-
- // If this load is unavailable in multiple predecessors, reject it.
- // FIXME: If we could restructure the CFG, we could make a common pred with
- // all the preds that don't have an available LI and insert a new load into
- // that one block.
- if (NumUnavailablePreds != 1)
- return false;
-
+ CriticalEdgePred.push_back(Pred);
+ } else {
+ // Only add the predecessors that will not be split for now.
+ PredLoads[Pred] = nullptr;
+ }
+ }
+
+ // Decide whether PRE is profitable for this load.
+ unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
+ assert(NumUnavailablePreds != 0 &&
+ "Fully available value should already be eliminated!");
+
+ // If this load is unavailable in multiple predecessors, reject it.
+ // FIXME: If we could restructure the CFG, we could make a common pred with
+ // all the preds that don't have an available LI and insert a new load into
+ // that one block.
+ if (NumUnavailablePreds != 1)
+ return false;
+
// Now we know where we will insert load. We must ensure that it is safe
// to speculatively execute the load at that points.
if (MustEnsureSafetyOfSpeculativeExecution) {
@@ -1268,105 +1268,105 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
}
- // Split critical edges, and update the unavailable predecessors accordingly.
- for (BasicBlock *OrigPred : CriticalEdgePred) {
- BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
- assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
- PredLoads[NewPred] = nullptr;
- LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
- << LoadBB->getName() << '\n');
- }
-
- // Check if the load can safely be moved to all the unavailable predecessors.
- bool CanDoPRE = true;
- const DataLayout &DL = LI->getModule()->getDataLayout();
- SmallVector<Instruction*, 8> NewInsts;
- for (auto &PredLoad : PredLoads) {
- BasicBlock *UnavailablePred = PredLoad.first;
-
- // Do PHI translation to get its value in the predecessor if necessary. The
- // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
- // We do the translation for each edge we skipped by going from LI's block
- // to LoadBB, otherwise we might miss pieces needing translation.
-
- // If all preds have a single successor, then we know it is safe to insert
- // the load on the pred (?!?), so we can insert code to materialize the
- // pointer if it is not available.
- Value *LoadPtr = LI->getPointerOperand();
- BasicBlock *Cur = LI->getParent();
- while (Cur != LoadBB) {
- PHITransAddr Address(LoadPtr, DL, AC);
- LoadPtr = Address.PHITranslateWithInsertion(
- Cur, Cur->getSinglePredecessor(), *DT, NewInsts);
- if (!LoadPtr) {
- CanDoPRE = false;
- break;
- }
- Cur = Cur->getSinglePredecessor();
- }
-
- if (LoadPtr) {
- PHITransAddr Address(LoadPtr, DL, AC);
- LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT,
- NewInsts);
- }
- // If we couldn't find or insert a computation of this phi translated value,
- // we fail PRE.
- if (!LoadPtr) {
- LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
- << *LI->getPointerOperand() << "\n");
- CanDoPRE = false;
- break;
- }
-
- PredLoad.second = LoadPtr;
- }
-
- if (!CanDoPRE) {
- while (!NewInsts.empty()) {
- // Erase instructions generated by the failed PHI translation before
- // trying to number them. PHI translation might insert instructions
- // in basic blocks other than the current one, and we delete them
- // directly, as markInstructionForDeletion only allows removing from the
- // current basic block.
- NewInsts.pop_back_val()->eraseFromParent();
- }
- // HINT: Don't revert the edge-splitting as following transformation may
- // also need to split these critical edges.
- return !CriticalEdgePred.empty();
- }
-
- // Okay, we can eliminate this load by inserting a reload in the predecessor
- // and using PHI construction to get the value in the other predecessors, do
- // it.
- LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
- LLVM_DEBUG(if (!NewInsts.empty()) dbgs()
- << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back()
- << '\n');
-
- // Assign value numbers to the new instructions.
- for (Instruction *I : NewInsts) {
- // Instructions that have been inserted in predecessor(s) to materialize
- // the load address do not retain their original debug locations. Doing
- // so could lead to confusing (but correct) source attributions.
+ // Split critical edges, and update the unavailable predecessors accordingly.
+ for (BasicBlock *OrigPred : CriticalEdgePred) {
+ BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+ assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
+ PredLoads[NewPred] = nullptr;
+ LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
+ }
+
+ // Check if the load can safely be moved to all the unavailable predecessors.
+ bool CanDoPRE = true;
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ SmallVector<Instruction*, 8> NewInsts;
+ for (auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+
+ // Do PHI translation to get its value in the predecessor if necessary. The
+ // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+ // We do the translation for each edge we skipped by going from LI's block
+ // to LoadBB, otherwise we might miss pieces needing translation.
+
+ // If all preds have a single successor, then we know it is safe to insert
+ // the load on the pred (?!?), so we can insert code to materialize the
+ // pointer if it is not available.
+ Value *LoadPtr = LI->getPointerOperand();
+ BasicBlock *Cur = LI->getParent();
+ while (Cur != LoadBB) {
+ PHITransAddr Address(LoadPtr, DL, AC);
+ LoadPtr = Address.PHITranslateWithInsertion(
+ Cur, Cur->getSinglePredecessor(), *DT, NewInsts);
+ if (!LoadPtr) {
+ CanDoPRE = false;
+ break;
+ }
+ Cur = Cur->getSinglePredecessor();
+ }
+
+ if (LoadPtr) {
+ PHITransAddr Address(LoadPtr, DL, AC);
+ LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT,
+ NewInsts);
+ }
+ // If we couldn't find or insert a computation of this phi translated value,
+ // we fail PRE.
+ if (!LoadPtr) {
+ LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
+ CanDoPRE = false;
+ break;
+ }
+
+ PredLoad.second = LoadPtr;
+ }
+
+ if (!CanDoPRE) {
+ while (!NewInsts.empty()) {
+ // Erase instructions generated by the failed PHI translation before
+ // trying to number them. PHI translation might insert instructions
+ // in basic blocks other than the current one, and we delete them
+ // directly, as markInstructionForDeletion only allows removing from the
+ // current basic block.
+ NewInsts.pop_back_val()->eraseFromParent();
+ }
+ // HINT: Don't revert the edge-splitting, as the following transformation may
+ // also need to split these critical edges.
+ return !CriticalEdgePred.empty();
+ }
+
+ // Okay, we can eliminate this load by inserting a reload in the predecessor
+ // and using PHI construction to get the value in the other predecessors, do
+ // it.
+ LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ LLVM_DEBUG(if (!NewInsts.empty()) dbgs()
+ << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back()
+ << '\n');
+
+ // Assign value numbers to the new instructions.
+ for (Instruction *I : NewInsts) {
+ // Instructions that have been inserted in predecessor(s) to materialize
+ // the load address do not retain their original debug locations. Doing
+ // so could lead to confusing (but correct) source attributions.
I->updateLocationAfterHoist();
-
- // FIXME: We really _ought_ to insert these value numbers into their
- // parent's availability map. However, in doing so, we risk getting into
- // ordering issues. If a block hasn't been processed yet, we would be
- // marking a value as AVAIL-IN, which isn't what we intend.
- VN.lookupOrAdd(I);
- }
-
- for (const auto &PredLoad : PredLoads) {
- BasicBlock *UnavailablePred = PredLoad.first;
- Value *LoadPtr = PredLoad.second;
-
- auto *NewLoad = new LoadInst(
- LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(),
- LI->getAlign(), LI->getOrdering(), LI->getSyncScopeID(),
- UnavailablePred->getTerminator());
- NewLoad->setDebugLoc(LI->getDebugLoc());
+
+ // FIXME: We really _ought_ to insert these value numbers into their
+ // parent's availability map. However, in doing so, we risk getting into
+ // ordering issues. If a block hasn't been processed yet, we would be
+ // marking a value as AVAIL-IN, which isn't what we intend.
+ VN.lookupOrAdd(I);
+ }
+
+ for (const auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+ Value *LoadPtr = PredLoad.second;
+
+ auto *NewLoad = new LoadInst(
+ LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(),
+ LI->getAlign(), LI->getOrdering(), LI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewLoad->setDebugLoc(LI->getDebugLoc());
if (MSSAU) {
auto *MSSA = MSSAU->getMemorySSA();
// Get the defining access of the original load or use the load if it is a
@@ -1383,223 +1383,223 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
else
MSSAU->insertUse(cast<MemoryUse>(NewAccess), /*RenameUses=*/true);
}
-
- // Transfer the old load's AA tags to the new load.
- AAMDNodes Tags;
- LI->getAAMetadata(Tags);
- if (Tags)
- NewLoad->setAAMetadata(Tags);
-
- if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
- NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
- if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
- NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
- if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
- NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
-
- // We do not propagate the old load's debug location, because the new
- // load now lives in a different BB, and we want to avoid a jumpy line
- // table.
- // FIXME: How do we retain source locations without causing poor debugging
- // behavior?
-
- // Add the newly created load.
- ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
- NewLoad));
- MD->invalidateCachedPointerInfo(LoadPtr);
- LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
- }
-
- // Perform PHI construction.
- Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
- LI->replaceAllUsesWith(V);
- if (isa<PHINode>(V))
- V->takeName(LI);
- if (Instruction *I = dyn_cast<Instruction>(V))
- I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(V);
- markInstructionForDeletion(LI);
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
- << "load eliminated by PRE";
- });
- ++NumPRELoad;
- return true;
-}
-
-static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
- OptimizationRemarkEmitter *ORE) {
- using namespace ore;
-
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
- << "load of type " << NV("Type", LI->getType()) << " eliminated"
- << setExtraArgs() << " in favor of "
- << NV("InfavorOfValue", AvailableValue);
- });
-}
-
-/// Attempt to eliminate a load whose dependencies are
-/// non-local by performing PHI construction.
-bool GVN::processNonLocalLoad(LoadInst *LI) {
- // non-local speculations are not allowed under asan.
- if (LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeAddress) ||
- LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeHWAddress))
- return false;
-
- // Step 1: Find the non-local dependencies of the load.
- LoadDepVect Deps;
- MD->getNonLocalPointerDependency(LI, Deps);
-
- // If we had to process more than one hundred blocks to find the
- // dependencies, this load isn't worth worrying about. Optimizing
- // it will be too expensive.
- unsigned NumDeps = Deps.size();
- if (NumDeps > MaxNumDeps)
- return false;
-
- // If we had a phi translation failure, we'll have a single entry which is a
- // clobber in the current block. Reject this early.
- if (NumDeps == 1 &&
- !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
- LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs());
- dbgs() << " has unknown dependencies\n";);
- return false;
- }
-
+
+ // Transfer the old load's AA tags to the new load.
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags)
+ NewLoad->setAAMetadata(Tags);
+
+ if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
+ if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
+ NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
+
+ // We do not propagate the old load's debug location, because the new
+ // load now lives in a different BB, and we want to avoid a jumpy line
+ // table.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+
+ // Add the newly created load.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
+ NewLoad));
+ MD->invalidateCachedPointerInfo(LoadPtr);
+ LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ }
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
+ << "load eliminated by PRE";
+ });
+ ++NumPRELoad;
+ return true;
+}
+
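// A toy model (plain C++, hypothetical names) of the profitability gate in
// PerformLoadPRE above: the transformation is only attempted when exactly one
// predecessor lacks the value, so the load is effectively moved rather than
// duplicated. Critical-edge splitting, speculation safety and PHI translation
// are deliberately ignored in this sketch.
#include <cstdio>
#include <vector>

struct PredInfo { const char *Name; bool HasValue; };

// Returns the index of the single predecessor to insert the reload into,
// or -1 when PRE should be rejected (zero or more than one unavailable
// predecessor).
static int choosePREPredecessor(const std::vector<PredInfo> &Preds) {
  int Candidate = -1;
  for (int I = 0, E = (int)Preds.size(); I != E; ++I) {
    if (Preds[I].HasValue)
      continue;
    if (Candidate != -1)
      return -1; // more than one unavailable predecessor: would grow code
    Candidate = I;
  }
  return Candidate;
}

int main() {
  std::vector<PredInfo> Preds = {{"if.then", true}, {"if.else", false}};
  int Idx = choosePREPredecessor(Preds);
  if (Idx >= 0)
    std::printf("insert reload into %s and merge with a phi\n",
                Preds[Idx].Name);
}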
+static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
+ << "load of type " << NV("Type", LI->getType()) << " eliminated"
+ << setExtraArgs() << " in favor of "
+ << NV("InfavorOfValue", AvailableValue);
+ });
+}
+
+/// Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
+bool GVN::processNonLocalLoad(LoadInst *LI) {
+ // non-local speculations are not allowed under asan.
+ if (LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeAddress) ||
+ LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeHWAddress))
+ return false;
+
+ // Step 1: Find the non-local dependencies of the load.
+ LoadDepVect Deps;
+ MD->getNonLocalPointerDependency(LI, Deps);
+
+ // If we had to process more than one hundred blocks to find the
+ // dependencies, this load isn't worth worrying about. Optimizing
+ // it will be too expensive.
+ unsigned NumDeps = Deps.size();
+ if (NumDeps > MaxNumDeps)
+ return false;
+
+ // If we had a phi translation failure, we'll have a single entry which is a
+ // clobber in the current block. Reject this early.
+ if (NumDeps == 1 &&
+ !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
+ LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown dependencies\n";);
+ return false;
+ }
+
bool Changed = false;
- // If this load follows a GEP, see if we can PRE the indices before analyzing.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
- for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
- OE = GEP->idx_end();
- OI != OE; ++OI)
- if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+ // If this load follows a GEP, see if we can PRE the indices before analyzing.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+ for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+ OE = GEP->idx_end();
+ OI != OE; ++OI)
+ if (Instruction *I = dyn_cast<Instruction>(OI->get()))
Changed |= performScalarPRE(I);
- }
-
- // Step 2: Analyze the availability of the load
- AvailValInBlkVect ValuesPerBlock;
- UnavailBlkVect UnavailableBlocks;
- AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
-
- // If we have no predecessors that produce a known value for this load, exit
- // early.
- if (ValuesPerBlock.empty())
+ }
+
+ // Step 2: Analyze the availability of the load
+ AvailValInBlkVect ValuesPerBlock;
+ UnavailBlkVect UnavailableBlocks;
+ AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
+
+ // If we have no predecessors that produce a known value for this load, exit
+ // early.
+ if (ValuesPerBlock.empty())
return Changed;
-
- // Step 3: Eliminate fully redundancy.
- //
- // If all of the instructions we depend on produce a known value for this
- // load, then it is fully redundant and we can use PHI insertion to compute
- // its value. Insert PHIs and remove the fully redundant value now.
- if (UnavailableBlocks.empty()) {
- LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
-
- // Perform PHI construction.
- Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
- LI->replaceAllUsesWith(V);
-
- if (isa<PHINode>(V))
- V->takeName(LI);
- if (Instruction *I = dyn_cast<Instruction>(V))
- // If instruction I has debug info, then we should not update it.
- // Also, if I has a null DebugLoc, then it is still potentially incorrect
- // to propagate LI's DebugLoc because LI may not post-dominate I.
- if (LI->getDebugLoc() && LI->getParent() == I->getParent())
- I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(V);
- markInstructionForDeletion(LI);
- ++NumGVNLoad;
- reportLoadElim(LI, V, ORE);
- return true;
- }
-
- // Step 4: Eliminate partial redundancy.
- if (!isPREEnabled() || !isLoadPREEnabled())
+
+ // Step 3: Eliminate full redundancy.
+ //
+ // If all of the instructions we depend on produce a known value for this
+ // load, then it is fully redundant and we can use PHI insertion to compute
+ // its value. Insert PHIs and remove the fully redundant value now.
+ if (UnavailableBlocks.empty()) {
+ LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ // If instruction I has debug info, then we should not update it.
+ // Also, if I has a null DebugLoc, then it is still potentially incorrect
+ // to propagate LI's DebugLoc because LI may not post-dominate I.
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ++NumGVNLoad;
+ reportLoadElim(LI, V, ORE);
+ return true;
+ }
+
+ // Step 4: Eliminate partial redundancy.
+ if (!isPREEnabled() || !isLoadPREEnabled())
return Changed;
- if (!isLoadInLoopPREEnabled() && this->LI &&
- this->LI->getLoopFor(LI->getParent()))
+ if (!isLoadInLoopPREEnabled() && this->LI &&
+ this->LI->getLoopFor(LI->getParent()))
return Changed;
-
+
return Changed || PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
-}
-
-static bool impliesEquivalanceIfTrue(CmpInst* Cmp) {
- if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ)
- return true;
-
- // Floating point comparisons can be equal, but not equivalent. Cases:
- // NaNs for unordered operators
- // +0.0 vs 0.0 for all operators
- if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
- (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ &&
- Cmp->getFastMathFlags().noNaNs())) {
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
- // If we can prove either side non-zero, then equality must imply
- // equivalence.
- // FIXME: We should do this optimization if 'no signed zeros' is
- // applicable via an instruction-level fast-math-flag or some other
- // indicator that relaxed FP semantics are being used.
- if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
- return true;
- if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
- return true;;
- // TODO: Handle vector floating point constants
- }
- return false;
-}
-
-static bool impliesEquivalanceIfFalse(CmpInst* Cmp) {
- if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE)
- return true;
-
- // Floating point comparisons can be equal, but not equivelent. Cases:
- // NaNs for unordered operators
- // +0.0 vs 0.0 for all operators
- if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE &&
- Cmp->getFastMathFlags().noNaNs()) ||
- Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) {
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
- // If we can prove either side non-zero, then equality must imply
- // equivalence.
- // FIXME: We should do this optimization if 'no signed zeros' is
- // applicable via an instruction-level fast-math-flag or some other
- // indicator that relaxed FP semantics are being used.
- if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
- return true;
- if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
- return true;;
- // TODO: Handle vector floating point constants
- }
- return false;
-}
-
-
-static bool hasUsersIn(Value *V, BasicBlock *BB) {
- for (User *U : V->users())
- if (isa<Instruction>(U) &&
- cast<Instruction>(U)->getParent() == BB)
- return true;
- return false;
-}
-
-bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
- assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
- "This function can only be called with llvm.assume intrinsic");
- Value *V = IntrinsicI->getArgOperand(0);
-
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
- if (Cond->isZero()) {
- Type *Int8Ty = Type::getInt8Ty(V->getContext());
- // Insert a new store to null instruction before the load to indicate that
- // this code is not reachable. FIXME: We could insert unreachable
- // instruction directly because we can modify the CFG.
+}
+
+static bool impliesEquivalanceIfTrue(CmpInst* Cmp) {
+ if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ)
+ return true;
+
+ // Floating point comparisons can be equal, but not equivalent. Cases:
+ // NaNs for unordered operators
+ // +0.0 vs 0.0 for all operators
+ if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
+ (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ &&
+ Cmp->getFastMathFlags().noNaNs())) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ // If we can prove either side non-zero, then equality must imply
+ // equivalence.
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+ if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
+ return true;
+ if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
+ return true;
+ // TODO: Handle vector floating point constants
+ }
+ return false;
+}
+
+static bool impliesEquivalanceIfFalse(CmpInst* Cmp) {
+ if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE)
+ return true;
+
+ // Floating point comparisons can be equal, but not equivalent. Cases:
+ // NaNs for unordered operators
+ // +0.0 vs 0.0 for all operators
+ if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE &&
+ Cmp->getFastMathFlags().noNaNs()) ||
+ Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ // If we can prove either side non-zero, then equality must imply
+ // equivalence.
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+ if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
+ return true;
+ if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
+ return true;
+ // TODO: Handle vector floating point constants
+ }
+ return false;
+}
+
+
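// A standalone demonstration of why impliesEquivalanceIfTrue and
// impliesEquivalanceIfFalse above refuse to treat floating-point equality as
// value equivalence unless one side is a known non-zero constant: +0.0 and
// -0.0 compare equal yet are not interchangeable, and unordered predicates
// are satisfied by NaN. Assumes IEEE-754 semantics on the host.
#include <cmath>
#include <cstdio>

int main() {
  double PosZero = 0.0, NegZero = -0.0;
  // "oeq" is true for the two zeros...
  std::printf("0.0 == -0.0 : %d\n", PosZero == NegZero ? 1 : 0);
  // ...but substituting one for the other flips the sign of a division.
  std::printf("1.0 /  0.0  : %f\n", 1.0 / PosZero); // +inf
  std::printf("1.0 / -0.0  : %f\n", 1.0 / NegZero); // -inf
  // "ueq" without nnan is also true for NaN operands, yet NaN is never a
  // usable replacement value.
  double QNaN = std::nan("");
  std::printf("NaN ueq NaN : %d\n",
              (std::isnan(QNaN) || QNaN == QNaN) ? 1 : 0);
  return 0;
}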
+static bool hasUsersIn(Value *V, BasicBlock *BB) {
+ for (User *U : V->users())
+ if (isa<Instruction>(U) &&
+ cast<Instruction>(U)->getParent() == BB)
+ return true;
+ return false;
+}
+
+bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
+ assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
+ "This function can only be called with llvm.assume intrinsic");
+ Value *V = IntrinsicI->getArgOperand(0);
+
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
+ if (Cond->isZero()) {
+ Type *Int8Ty = Type::getInt8Ty(V->getContext());
+ // Insert a store-to-null instruction before this assume to indicate that
+ // this code is not reachable. FIXME: We could insert an unreachable
+ // instruction directly because we can modify the CFG.
auto *NewS = new StoreInst(UndefValue::get(Int8Ty),
Constant::getNullValue(Int8Ty->getPointerTo()),
IntrinsicI);
@@ -1634,1312 +1634,1312 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
MSSAU->insertDef(cast<MemoryDef>(NewDef), /*RenameUses=*/false);
}
- }
- if (isAssumeWithEmptyBundle(*IntrinsicI))
- markInstructionForDeletion(IntrinsicI);
- return false;
- } else if (isa<Constant>(V)) {
- // If it's not false, and constant, it must evaluate to true. This means our
- // assume is assume(true), and thus, pointless, and we don't want to do
- // anything more here.
- return false;
- }
-
- Constant *True = ConstantInt::getTrue(V->getContext());
- bool Changed = false;
-
- for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
- BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
-
- // This property is only true in dominated successors, propagateEquality
- // will check dominance for us.
- Changed |= propagateEquality(V, True, Edge, false);
- }
-
- // We can replace assume value with true, which covers cases like this:
- // call void @llvm.assume(i1 %cmp)
- // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
- ReplaceOperandsWithMap[V] = True;
-
+ }
+ if (isAssumeWithEmptyBundle(*IntrinsicI))
+ markInstructionForDeletion(IntrinsicI);
+ return false;
+ } else if (isa<Constant>(V)) {
+ // If it's not false, and constant, it must evaluate to true. This means our
+ // assume is assume(true), and thus, pointless, and we don't want to do
+ // anything more here.
+ return false;
+ }
+
+ Constant *True = ConstantInt::getTrue(V->getContext());
+ bool Changed = false;
+
+ for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
+ BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
+
+ // This property is only true in dominated successors, propagateEquality
+ // will check dominance for us.
+ Changed |= propagateEquality(V, True, Edge, false);
+ }
+
+ // We can replace assume value with true, which covers cases like this:
+ // call void @llvm.assume(i1 %cmp)
+ // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
+ ReplaceOperandsWithMap[V] = True;
+
// Similarly, after assume(!NotV) we know that NotV == false.
Value *NotV;
if (match(V, m_Not(m_Value(NotV))))
ReplaceOperandsWithMap[NotV] = ConstantInt::getFalse(V->getContext());
- // If we find an equality fact, canonicalize all dominated uses in this block
- // to one of the two values. We heuristically choice the "oldest" of the
- // two where age is determined by value number. (Note that propagateEquality
- // above handles the cross block case.)
- //
- // Key case to cover are:
- // 1)
- // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
- // call void @llvm.assume(i1 %cmp)
- // ret float %0 ; will change it to ret float 3.000000e+00
- // 2)
- // %load = load float, float* %addr
- // %cmp = fcmp oeq float %load, %0
- // call void @llvm.assume(i1 %cmp)
- // ret float %load ; will change it to ret float %0
- if (auto *CmpI = dyn_cast<CmpInst>(V)) {
- if (impliesEquivalanceIfTrue(CmpI)) {
- Value *CmpLHS = CmpI->getOperand(0);
- Value *CmpRHS = CmpI->getOperand(1);
- // Heuristically pick the better replacement -- the choice of heuristic
- // isn't terribly important here, but the fact we canonicalize on some
- // replacement is for exposing other simplifications.
- // TODO: pull this out as a helper function and reuse w/existing
- // (slightly different) logic.
- if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS))
- std::swap(CmpLHS, CmpRHS);
- if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))
- std::swap(CmpLHS, CmpRHS);
- if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) ||
- (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) {
- // Move the 'oldest' value to the right-hand side, using the value
- // number as a proxy for age.
- uint32_t LVN = VN.lookupOrAdd(CmpLHS);
- uint32_t RVN = VN.lookupOrAdd(CmpRHS);
- if (LVN < RVN)
- std::swap(CmpLHS, CmpRHS);
- }
-
- // Handle degenerate case where we either haven't pruned a dead path or a
- // removed a trivial assume yet.
- if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS))
- return Changed;
-
- LLVM_DEBUG(dbgs() << "Replacing dominated uses of "
- << *CmpLHS << " with "
- << *CmpRHS << " in block "
- << IntrinsicI->getParent()->getName() << "\n");
-
-
- // Setup the replacement map - this handles uses within the same block
- if (hasUsersIn(CmpLHS, IntrinsicI->getParent()))
- ReplaceOperandsWithMap[CmpLHS] = CmpRHS;
-
- // NOTE: The non-block local cases are handled by the call to
- // propagateEquality above; this block is just about handling the block
- // local cases. TODO: There's a bunch of logic in propagateEqualiy which
- // isn't duplicated for the block local case, can we share it somehow?
- }
- }
- return Changed;
-}
-
-static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
- patchReplacementInstruction(I, Repl);
- I->replaceAllUsesWith(Repl);
-}
-
-/// Attempt to eliminate a load, first by eliminating it
-/// locally, and then attempting non-local elimination if that fails.
-bool GVN::processLoad(LoadInst *L) {
- if (!MD)
- return false;
-
- // This code hasn't been audited for ordered or volatile memory access
- if (!L->isUnordered())
- return false;
-
- if (L->use_empty()) {
- markInstructionForDeletion(L);
- return true;
- }
-
- // ... to a pointer that has been loaded from before...
- MemDepResult Dep = MD->getDependency(L);
-
- // If it is defined in another block, try harder.
- if (Dep.isNonLocal())
- return processNonLocalLoad(L);
-
- // Only handle the local case below
- if (!Dep.isDef() && !Dep.isClobber()) {
- // This might be a NonFuncLocal or an Unknown
- LLVM_DEBUG(
- // fast print dep; using operator<< on instruction is too slow.
- dbgs() << "GVN: load "; L->printAsOperand(dbgs());
- dbgs() << " has unknown dependence\n";);
- return false;
- }
-
- AvailableValue AV;
- if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) {
- Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this);
-
- // Replace the load!
- patchAndReplaceAllUsesWith(L, AvailableValue);
- markInstructionForDeletion(L);
+ // If we find an equality fact, canonicalize all dominated uses in this block
+ // to one of the two values. We heuristically choose the "oldest" of the
+ // two where age is determined by value number. (Note that propagateEquality
+ // above handles the cross block case.)
+ //
+ // Key cases to cover are:
+ // 1)
+ // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
+ // call void @llvm.assume(i1 %cmp)
+ // ret float %0 ; will change it to ret float 3.000000e+00
+ // 2)
+ // %load = load float, float* %addr
+ // %cmp = fcmp oeq float %load, %0
+ // call void @llvm.assume(i1 %cmp)
+ // ret float %load ; will change it to ret float %0
+ if (auto *CmpI = dyn_cast<CmpInst>(V)) {
+ if (impliesEquivalanceIfTrue(CmpI)) {
+ Value *CmpLHS = CmpI->getOperand(0);
+ Value *CmpRHS = CmpI->getOperand(1);
+ // Heuristically pick the better replacement -- the choice of heuristic
+ // isn't terribly important here, but the fact we canonicalize on some
+ // replacement is for exposing other simplifications.
+ // TODO: pull this out as a helper function and reuse w/existing
+ // (slightly different) logic.
+ if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS))
+ std::swap(CmpLHS, CmpRHS);
+ if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))
+ std::swap(CmpLHS, CmpRHS);
+ if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) ||
+ (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value
+ // number as a proxy for age.
+ uint32_t LVN = VN.lookupOrAdd(CmpLHS);
+ uint32_t RVN = VN.lookupOrAdd(CmpRHS);
+ if (LVN < RVN)
+ std::swap(CmpLHS, CmpRHS);
+ }
+
+ // Handle degenerate case where we either haven't pruned a dead path or
+ // removed a trivial assume yet.
+ if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS))
+ return Changed;
+
+ LLVM_DEBUG(dbgs() << "Replacing dominated uses of "
+ << *CmpLHS << " with "
+ << *CmpRHS << " in block "
+ << IntrinsicI->getParent()->getName() << "\n");
+
+
+ // Set up the replacement map - this handles uses within the same block
+ if (hasUsersIn(CmpLHS, IntrinsicI->getParent()))
+ ReplaceOperandsWithMap[CmpLHS] = CmpRHS;
+
+ // NOTE: The non-block local cases are handled by the call to
+ // propagateEquality above; this block is just about handling the block
+ // local cases. TODO: There's a bunch of logic in propagateEquality which
+ // isn't duplicated for the block local case, can we share it somehow?
+ }
+ }
+ return Changed;
+}
+
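The swap heuristic in processAssumeIntrinsic above only fixes a replacement direction; any consistent rule would do. As a rough standalone illustration in plain C++ (the Val struct and the pickCanonicalPair helper are invented for this sketch, not LLVM API), the rule is: keep constants on the right-hand side, and otherwise keep the value with the smaller (older) value number on the right, so newer values get rewritten in terms of older ones.

#include <cstdint>
#include <utility>

// Hypothetical stand-ins for illustration only.
struct Val { bool IsConstant; uint32_t ValueNumber; };

// Returns {ToReplace, ReplaceWith}: uses of the first element are rewritten
// in terms of the second, mirroring the heuristic sketched above.
std::pair<Val, Val> pickCanonicalPair(Val LHS, Val RHS) {
  if (LHS.IsConstant && !RHS.IsConstant)
    std::swap(LHS, RHS); // keep the constant on the right-hand side
  if (!LHS.IsConstant && !RHS.IsConstant &&
      LHS.ValueNumber < RHS.ValueNumber)
    std::swap(LHS, RHS); // keep the 'oldest' value on the right-hand side
  return {LHS, RHS};
}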
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+/// Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L) {
+ if (!MD)
+ return false;
+
+ // This code hasn't been audited for ordered or volatile memory access
+ if (!L->isUnordered())
+ return false;
+
+ if (L->use_empty()) {
+ markInstructionForDeletion(L);
+ return true;
+ }
+
+ // ... to a pointer that has been loaded from before...
+ MemDepResult Dep = MD->getDependency(L);
+
+ // If it is defined in another block, try harder.
+ if (Dep.isNonLocal())
+ return processNonLocalLoad(L);
+
+ // Only handle the local case below
+ if (!Dep.isDef() && !Dep.isClobber()) {
+ // This might be a NonFuncLocal or an Unknown
+ LLVM_DEBUG(
+ // fast print dep; using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; L->printAsOperand(dbgs());
+ dbgs() << " has unknown dependence\n";);
+ return false;
+ }
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) {
+ Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this);
+
+ // Replace the load!
+ patchAndReplaceAllUsesWith(L, AvailableValue);
+ markInstructionForDeletion(L);
if (MSSAU)
MSSAU->removeMemoryAccess(L);
- ++NumGVNLoad;
- reportLoadElim(L, AvailableValue, ORE);
- // Tell MDA to re-examine the reused pointer since we might have more
- // information after forwarding it.
- if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(AvailableValue);
- return true;
- }
-
- return false;
-}
-
- /// Return a pair whose first field is the value number of \p Exp and whose
- /// second field indicates whether the value number was newly created.
-std::pair<uint32_t, bool>
-GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
- uint32_t &e = expressionNumbering[Exp];
- bool CreateNewValNum = !e;
- if (CreateNewValNum) {
- Expressions.push_back(Exp);
- if (ExprIdx.size() < nextValueNumber + 1)
- ExprIdx.resize(nextValueNumber * 2);
- e = nextValueNumber;
- ExprIdx[nextValueNumber++] = nextExprNumber++;
- }
- return {e, CreateNewValNum};
-}
-
- /// Return whether all the values related to the same \p Num are
-/// defined in \p BB.
-bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
- GVN &Gvn) {
- LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
- while (Vals && Vals->BB == BB)
- Vals = Vals->Next;
- return !Vals;
-}
-
-/// Wrap phiTranslateImpl to provide caching functionality.
-uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
- const BasicBlock *PhiBlock, uint32_t Num,
- GVN &Gvn) {
- auto FindRes = PhiTranslateTable.find({Num, Pred});
- if (FindRes != PhiTranslateTable.end())
- return FindRes->second;
- uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn);
- PhiTranslateTable.insert({{Num, Pred}, NewNum});
- return NewNum;
-}
-
- // Return true if the value numbers \p Num and \p NewNum represent equal values.
-// Return false if the result is unknown.
-bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
- const BasicBlock *Pred,
- const BasicBlock *PhiBlock, GVN &Gvn) {
- CallInst *Call = nullptr;
- LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
- while (Vals) {
- Call = dyn_cast<CallInst>(Vals->Val);
- if (Call && Call->getParent() == PhiBlock)
- break;
- Vals = Vals->Next;
- }
-
- if (AA->doesNotAccessMemory(Call))
- return true;
-
- if (!MD || !AA->onlyReadsMemory(Call))
- return false;
-
- MemDepResult local_dep = MD->getDependency(Call);
- if (!local_dep.isNonLocal())
- return false;
-
- const MemoryDependenceResults::NonLocalDepInfo &deps =
- MD->getNonLocalCallDependency(Call);
-
- // Check to see if the Call has no function local clobber.
- for (unsigned i = 0; i < deps.size(); i++) {
- if (deps[i].getResult().isNonFuncLocal())
- return true;
- }
- return false;
-}
-
-/// Translate value number \p Num using phis, so that it has the values of
-/// the phis in BB.
-uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
- const BasicBlock *PhiBlock,
- uint32_t Num, GVN &Gvn) {
- if (PHINode *PN = NumberingPhi[Num]) {
- for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
- if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred)
- if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false))
- return TransVal;
- }
- return Num;
- }
-
- // If any value related to Num is defined in a BB other than
- // PhiBlock, it cannot depend on a phi in PhiBlock without going through
- // a backedge. We can do an early exit in that case to save compile time.
- if (!areAllValsInBB(Num, PhiBlock, Gvn))
- return Num;
-
- if (Num >= ExprIdx.size() || ExprIdx[Num] == 0)
- return Num;
- Expression Exp = Expressions[ExprIdx[Num]];
-
- for (unsigned i = 0; i < Exp.varargs.size(); i++) {
- // For InsertValue and ExtractValue, some varargs are index numbers
- // instead of value numbers. Those index numbers should not be
- // translated.
- if ((i > 1 && Exp.opcode == Instruction::InsertValue) ||
- (i > 0 && Exp.opcode == Instruction::ExtractValue) ||
- (i > 1 && Exp.opcode == Instruction::ShuffleVector))
- continue;
- Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn);
- }
-
- if (Exp.commutative) {
+ ++NumGVNLoad;
+ reportLoadElim(L, AvailableValue, ORE);
+ // Tell MDA to re-examine the reused pointer since we might have more
+ // information after forwarding it.
+ if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(AvailableValue);
+ return true;
+ }
+
+ return false;
+}
+
+ /// Return a pair whose first field is the value number of \p Exp and whose
+ /// second field indicates whether the value number was newly created.
+std::pair<uint32_t, bool>
+GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
+ uint32_t &e = expressionNumbering[Exp];
+ bool CreateNewValNum = !e;
+ if (CreateNewValNum) {
+ Expressions.push_back(Exp);
+ if (ExprIdx.size() < nextValueNumber + 1)
+ ExprIdx.resize(nextValueNumber * 2);
+ e = nextValueNumber;
+ ExprIdx[nextValueNumber++] = nextExprNumber++;
+ }
+ return {e, CreateNewValNum};
+}
+
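For a rough sense of what assignExpNewValueNum above is doing, here is a minimal self-contained C++ sketch (the std::string key and the numberExpression name are simplifications invented for this sketch, not the real Expression hashing): looking an expression up in the map default-constructs its slot to 0, and 0 doubles as "not yet numbered", so a fresh number is handed out exactly once per distinct expression.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>

// Keys stand in for canonicalized expressions; the real code hashes an
// Expression struct and also maintains an ExprIdx side table.
static std::unordered_map<std::string, uint32_t> ExprNumbering;
static uint32_t NextValueNumber = 1;

// Returns {value number, whether it was newly created}.
std::pair<uint32_t, bool> numberExpression(const std::string &Key) {
  uint32_t &N = ExprNumbering[Key]; // inserts 0 if the key was absent
  bool IsNew = (N == 0);
  if (IsNew)
    N = NextValueNumber++;
  return {N, IsNew};
}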
+ /// Return whether all the values related to the same \p Num are
+/// defined in \p BB.
+bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
+ GVN &Gvn) {
+ LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
+ while (Vals && Vals->BB == BB)
+ Vals = Vals->Next;
+ return !Vals;
+}
+
+/// Wrap phiTranslateImpl to provide caching functionality.
+uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock, uint32_t Num,
+ GVN &Gvn) {
+ auto FindRes = PhiTranslateTable.find({Num, Pred});
+ if (FindRes != PhiTranslateTable.end())
+ return FindRes->second;
+ uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn);
+ PhiTranslateTable.insert({{Num, Pred}, NewNum});
+ return NewNum;
+}
+
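The wrapper above is plain memoization keyed on the (value number, predecessor) pair. A minimal standalone C++ sketch of the same shape (Block, translateImpl and translate are hypothetical names, and the placeholder translateImpl simply returns its input):

#include <cstdint>
#include <map>
#include <utility>

using Block = const void *;

// Placeholder for the real per-edge translation (phiTranslateImpl).
static uint32_t translateImpl(Block, Block, uint32_t Num) { return Num; }

static std::map<std::pair<uint32_t, Block>, uint32_t> TranslateCache;

// Memoized wrapper: one cached result per (Num, Pred) pair.
uint32_t translate(Block Pred, Block PhiBlock, uint32_t Num) {
  auto It = TranslateCache.find({Num, Pred});
  if (It != TranslateCache.end())
    return It->second;
  uint32_t NewNum = translateImpl(Pred, PhiBlock, Num);
  TranslateCache.insert({{Num, Pred}, NewNum});
  return NewNum;
}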
+ // Return true if the value numbers \p Num and \p NewNum represent equal values.
+// Return false if the result is unknown.
+bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
+ const BasicBlock *Pred,
+ const BasicBlock *PhiBlock, GVN &Gvn) {
+ CallInst *Call = nullptr;
+ LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
+ while (Vals) {
+ Call = dyn_cast<CallInst>(Vals->Val);
+ if (Call && Call->getParent() == PhiBlock)
+ break;
+ Vals = Vals->Next;
+ }
+
+ if (AA->doesNotAccessMemory(Call))
+ return true;
+
+ if (!MD || !AA->onlyReadsMemory(Call))
+ return false;
+
+ MemDepResult local_dep = MD->getDependency(Call);
+ if (!local_dep.isNonLocal())
+ return false;
+
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(Call);
+
+ // Check to see if the Call has no function local clobber.
+ for (unsigned i = 0; i < deps.size(); i++) {
+ if (deps[i].getResult().isNonFuncLocal())
+ return true;
+ }
+ return false;
+}
+
+/// Translate value number \p Num using phis, so that it has the values of
+/// the phis in BB.
+uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock,
+ uint32_t Num, GVN &Gvn) {
+ if (PHINode *PN = NumberingPhi[Num]) {
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred)
+ if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false))
+ return TransVal;
+ }
+ return Num;
+ }
+
+ // If any value related to Num is defined in a BB other than
+ // PhiBlock, it cannot depend on a phi in PhiBlock without going through
+ // a backedge. We can do an early exit in that case to save compile time.
+ if (!areAllValsInBB(Num, PhiBlock, Gvn))
+ return Num;
+
+ if (Num >= ExprIdx.size() || ExprIdx[Num] == 0)
+ return Num;
+ Expression Exp = Expressions[ExprIdx[Num]];
+
+ for (unsigned i = 0; i < Exp.varargs.size(); i++) {
+ // For InsertValue and ExtractValue, some varargs are index numbers
+ // instead of value numbers. Those index numbers should not be
+ // translated.
+ if ((i > 1 && Exp.opcode == Instruction::InsertValue) ||
+ (i > 0 && Exp.opcode == Instruction::ExtractValue) ||
+ (i > 1 && Exp.opcode == Instruction::ShuffleVector))
+ continue;
+ Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn);
+ }
+
+ if (Exp.commutative) {
assert(Exp.varargs.size() >= 2 && "Unsupported commutative instruction!");
- if (Exp.varargs[0] > Exp.varargs[1]) {
- std::swap(Exp.varargs[0], Exp.varargs[1]);
- uint32_t Opcode = Exp.opcode >> 8;
- if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)
- Exp.opcode = (Opcode << 8) |
- CmpInst::getSwappedPredicate(
- static_cast<CmpInst::Predicate>(Exp.opcode & 255));
- }
- }
-
- if (uint32_t NewNum = expressionNumbering[Exp]) {
- if (Exp.opcode == Instruction::Call && NewNum != Num)
- return areCallValsEqual(Num, NewNum, Pred, PhiBlock, Gvn) ? NewNum : Num;
- return NewNum;
- }
- return Num;
-}
-
-/// Erase stale entry from phiTranslate cache so phiTranslate can be computed
-/// again.
-void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
- const BasicBlock &CurrBlock) {
+ if (Exp.varargs[0] > Exp.varargs[1]) {
+ std::swap(Exp.varargs[0], Exp.varargs[1]);
+ uint32_t Opcode = Exp.opcode >> 8;
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)
+ Exp.opcode = (Opcode << 8) |
+ CmpInst::getSwappedPredicate(
+ static_cast<CmpInst::Predicate>(Exp.opcode & 255));
+ }
+ }
+
+ if (uint32_t NewNum = expressionNumbering[Exp]) {
+ if (Exp.opcode == Instruction::Call && NewNum != Num)
+ return areCallValsEqual(Num, NewNum, Pred, PhiBlock, Gvn) ? NewNum : Num;
+ return NewNum;
+ }
+ return Num;
+}
+
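One detail worth isolating from phiTranslateImpl above is the commutative canonicalization: after translating the operands, the smaller operand number is put first, and for comparisons the predicate is swapped so that both operand orders produce the same expression. A toy standalone version follows (the Pred enum and the swapped/canonicalizeCmp helpers are stand-ins invented for this sketch, not CmpInst::Predicate):

#include <algorithm>
#include <cstdint>

enum class Pred { LT, GT, LE, GE, EQ, NE };

// Reverse the comparison direction; EQ and NE are symmetric.
static Pred swapped(Pred P) {
  switch (P) {
  case Pred::LT: return Pred::GT;
  case Pred::GT: return Pred::LT;
  case Pred::LE: return Pred::GE;
  case Pred::GE: return Pred::LE;
  default:       return P;
  }
}

// Keep the smaller operand number first so equal comparisons written in
// either order receive the same value number.
void canonicalizeCmp(uint32_t &Op0, uint32_t &Op1, Pred &P) {
  if (Op0 > Op1) {
    std::swap(Op0, Op1);
    P = swapped(P);
  }
}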
+/// Erase stale entry from phiTranslate cache so phiTranslate can be computed
+/// again.
+void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
+ const BasicBlock &CurrBlock) {
for (const BasicBlock *Pred : predecessors(&CurrBlock))
PhiTranslateTable.erase({Num, Pred});
-}
-
-// In order to find a leader for a given value number at a
-// specific basic block, we first obtain the list of all Values for that number,
-// and then scan the list to find one whose block dominates the block in
-// question. This is fast because dominator tree queries consist of only
-// a few comparisons of DFS numbers.
-Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
- LeaderTableEntry Vals = LeaderTable[num];
- if (!Vals.Val) return nullptr;
-
- Value *Val = nullptr;
- if (DT->dominates(Vals.BB, BB)) {
- Val = Vals.Val;
- if (isa<Constant>(Val)) return Val;
- }
-
- LeaderTableEntry* Next = Vals.Next;
- while (Next) {
- if (DT->dominates(Next->BB, BB)) {
- if (isa<Constant>(Next->Val)) return Next->Val;
- if (!Val) Val = Next->Val;
- }
-
- Next = Next->Next;
- }
-
- return Val;
-}
-
-/// There is an edge from 'Src' to 'Dst'. Return
-/// true if every path from the entry block to 'Dst' passes via this edge. In
-/// particular 'Dst' must not be reachable via another edge from 'Src'.
-static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
- DominatorTree *DT) {
- // While in theory it is interesting to consider the case in which Dst has
- // more than one predecessor, because Dst might be part of a loop which is
- // only reachable from Src, in practice it is pointless since at the time
- // GVN runs all such loops have preheaders, which means that Dst will have
- // been changed to have only one predecessor, namely Src.
- const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
- assert((!Pred || Pred == E.getStart()) &&
- "No edge between these basic blocks!");
- return Pred != nullptr;
-}
-
-void GVN::assignBlockRPONumber(Function &F) {
- BlockRPONumber.clear();
- uint32_t NextBlockNumber = 1;
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (BasicBlock *BB : RPOT)
- BlockRPONumber[BB] = NextBlockNumber++;
- InvalidBlockRPONumbers = false;
-}
-
-bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
- bool Changed = false;
- for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
- Value *Operand = Instr->getOperand(OpNum);
- auto it = ReplaceOperandsWithMap.find(Operand);
- if (it != ReplaceOperandsWithMap.end()) {
- LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
- << *it->second << " in instruction " << *Instr << '\n');
- Instr->setOperand(OpNum, it->second);
- Changed = true;
- }
- }
- return Changed;
-}
-
-/// The given values are known to be equal in every block
-/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
-/// 'RHS' everywhere in the scope. Returns whether a change was made.
- /// If DominatesByEdge is false, the RHS value is propagated starting from the
- /// end of Root.Start.
-bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
- bool DominatesByEdge) {
- SmallVector<std::pair<Value*, Value*>, 4> Worklist;
- Worklist.push_back(std::make_pair(LHS, RHS));
- bool Changed = false;
- // For speed, compute a conservative fast approximation to
- // DT->dominates(Root, Root.getEnd());
- const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
-
- while (!Worklist.empty()) {
- std::pair<Value*, Value*> Item = Worklist.pop_back_val();
- LHS = Item.first; RHS = Item.second;
-
- if (LHS == RHS)
- continue;
- assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
-
- // Don't try to propagate equalities between constants.
- if (isa<Constant>(LHS) && isa<Constant>(RHS))
- continue;
-
- // Prefer a constant on the right-hand side, or an Argument if no constants.
- if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
- std::swap(LHS, RHS);
- assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
-
- // If there is no obvious reason to prefer the left-hand side over the
- // right-hand side, ensure the longest lived term is on the right-hand side,
- // so the shortest lived term will be replaced by the longest lived.
- // This tends to expose more simplifications.
- uint32_t LVN = VN.lookupOrAdd(LHS);
- if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
- (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
- // Move the 'oldest' value to the right-hand side, using the value number
- // as a proxy for age.
- uint32_t RVN = VN.lookupOrAdd(RHS);
- if (LVN < RVN) {
- std::swap(LHS, RHS);
- LVN = RVN;
- }
- }
-
- // If value numbering later sees that an instruction in the scope is equal
- // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
- // the invariant that instructions only occur in the leader table for their
- // own value number (this is used by removeFromLeaderTable), do not do this
- // if RHS is an instruction (if an instruction in the scope is morphed into
- // LHS then it will be turned into RHS by the next GVN iteration anyway, so
- // using the leader table is about compiling faster, not optimizing better).
- // The leader table only tracks basic blocks, not edges. Only add to it if we
- // have the simple case where the edge dominates the end.
- if (RootDominatesEnd && !isa<Instruction>(RHS))
- addToLeaderTable(LVN, RHS, Root.getEnd());
-
- // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
- // LHS always has at least one use that is not dominated by Root, this will
- // never do anything if LHS has only one use.
- if (!LHS->hasOneUse()) {
- unsigned NumReplacements =
- DominatesByEdge
- ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
- : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart());
-
- Changed |= NumReplacements > 0;
- NumGVNEqProp += NumReplacements;
- // Cached information for anything that uses LHS will be invalid.
- if (MD)
- MD->invalidateCachedPointerInfo(LHS);
- }
-
- // Now try to deduce additional equalities from this one. For example, if
- // the known equality was "(A != B)" == "false" then it follows that A and B
- // are equal in the scope. Only boolean equalities with an explicit true or
- // false RHS are currently supported.
- if (!RHS->getType()->isIntegerTy(1))
- // Not a boolean equality - bail out.
- continue;
- ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
- if (!CI)
- // RHS neither 'true' nor 'false' - bail out.
- continue;
- // Whether RHS equals 'true'. Otherwise it equals 'false'.
- bool isKnownTrue = CI->isMinusOne();
- bool isKnownFalse = !isKnownTrue;
-
- // If "A && B" is known true then both A and B are known true. If "A || B"
- // is known false then both A and B are known false.
- Value *A, *B;
+}
+
+// In order to find a leader for a given value number at a
+// specific basic block, we first obtain the list of all Values for that number,
+// and then scan the list to find one whose block dominates the block in
+// question. This is fast because dominator tree queries consist of only
+// a few comparisons of DFS numbers.
+Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+ LeaderTableEntry Vals = LeaderTable[num];
+ if (!Vals.Val) return nullptr;
+
+ Value *Val = nullptr;
+ if (DT->dominates(Vals.BB, BB)) {
+ Val = Vals.Val;
+ if (isa<Constant>(Val)) return Val;
+ }
+
+ LeaderTableEntry* Next = Vals.Next;
+ while (Next) {
+ if (DT->dominates(Next->BB, BB)) {
+ if (isa<Constant>(Next->Val)) return Next->Val;
+ if (!Val) Val = Next->Val;
+ }
+
+ Next = Next->Next;
+ }
+
+ return Val;
+}
+
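findLeader above walks a small intrusive list of candidates for one value number, keeping the first entry whose block dominates the query block and returning a constant immediately if one shows up. A simplified standalone walk of the same shape (the LeaderEntry struct and the DominatesQueryBlock flag stand in for the real entries and the DominatorTree query):

#include <cstdint>

struct LeaderEntry {
  const void *Val = nullptr;
  bool IsConstant = false;
  bool DominatesQueryBlock = false; // stands in for DT->dominates(...)
  const LeaderEntry *Next = nullptr;
};

// Return a constant leader if a dominating one exists, otherwise the first
// dominating leader, otherwise nullptr.
const void *findLeaderSketch(const LeaderEntry *Head) {
  const void *Result = nullptr;
  for (const LeaderEntry *E = Head; E; E = E->Next) {
    if (!E->DominatesQueryBlock)
      continue;
    if (E->IsConstant)
      return E->Val;
    if (!Result)
      Result = E->Val;
  }
  return Result;
}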
+/// There is an edge from 'Src' to 'Dst'. Return
+/// true if every path from the entry block to 'Dst' passes via this edge. In
+/// particular 'Dst' must not be reachable via another edge from 'Src'.
+static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
+ DominatorTree *DT) {
+ // While in theory it is interesting to consider the case in which Dst has
+ // more than one predecessor, because Dst might be part of a loop which is
+ // only reachable from Src, in practice it is pointless since at the time
+ // GVN runs all such loops have preheaders, which means that Dst will have
+ // been changed to have only one predecessor, namely Src.
+ const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
+ assert((!Pred || Pred == E.getStart()) &&
+ "No edge between these basic blocks!");
+ return Pred != nullptr;
+}
+
+void GVN::assignBlockRPONumber(Function &F) {
+ BlockRPONumber.clear();
+ uint32_t NextBlockNumber = 1;
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT)
+ BlockRPONumber[BB] = NextBlockNumber++;
+ InvalidBlockRPONumbers = false;
+}
+
+bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
+ bool Changed = false;
+ for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
+ Value *Operand = Instr->getOperand(OpNum);
+ auto it = ReplaceOperandsWithMap.find(Operand);
+ if (it != ReplaceOperandsWithMap.end()) {
+ LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
+ << *it->second << " in instruction " << *Instr << '\n');
+ Instr->setOperand(OpNum, it->second);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
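replaceOperandsForInBlockEquality above is a straight map-driven operand rewrite. A minimal standalone analogue (operands are plain ints here, and replaceOperands is a made-up name for this sketch):

#include <unordered_map>
#include <vector>

// Rewrite every operand that has an entry in the block-local equality map;
// return whether anything changed.
bool replaceOperands(std::vector<int> &Operands,
                     const std::unordered_map<int, int> &Rewrites) {
  bool Changed = false;
  for (int &Op : Operands) {
    auto It = Rewrites.find(Op);
    if (It != Rewrites.end()) {
      Op = It->second;
      Changed = true;
    }
  }
  return Changed;
}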
+/// The given values are known to be equal in every block
+/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
+/// 'RHS' everywhere in the scope. Returns whether a change was made.
+ /// If DominatesByEdge is false, the RHS value is propagated starting from the
+ /// end of Root.Start.
+bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+ bool DominatesByEdge) {
+ SmallVector<std::pair<Value*, Value*>, 4> Worklist;
+ Worklist.push_back(std::make_pair(LHS, RHS));
+ bool Changed = false;
+ // For speed, compute a conservative fast approximation to
+ // DT->dominates(Root, Root.getEnd());
+ const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+
+ while (!Worklist.empty()) {
+ std::pair<Value*, Value*> Item = Worklist.pop_back_val();
+ LHS = Item.first; RHS = Item.second;
+
+ if (LHS == RHS)
+ continue;
+ assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
+
+ // Don't try to propagate equalities between constants.
+ if (isa<Constant>(LHS) && isa<Constant>(RHS))
+ continue;
+
+ // Prefer a constant on the right-hand side, or an Argument if no constants.
+ if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
+ std::swap(LHS, RHS);
+ assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
+
+ // If there is no obvious reason to prefer the left-hand side over the
+ // right-hand side, ensure the longest lived term is on the right-hand side,
+ // so the shortest lived term will be replaced by the longest lived.
+ // This tends to expose more simplifications.
+ uint32_t LVN = VN.lookupOrAdd(LHS);
+ if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
+ (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value number
+ // as a proxy for age.
+ uint32_t RVN = VN.lookupOrAdd(RHS);
+ if (LVN < RVN) {
+ std::swap(LHS, RHS);
+ LVN = RVN;
+ }
+ }
+
+ // If value numbering later sees that an instruction in the scope is equal
+ // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
+ // the invariant that instructions only occur in the leader table for their
+ // own value number (this is used by removeFromLeaderTable), do not do this
+ // if RHS is an instruction (if an instruction in the scope is morphed into
+ // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+ // using the leader table is about compiling faster, not optimizing better).
+ // The leader table only tracks basic blocks, not edges. Only add to it if we
+ // have the simple case where the edge dominates the end.
+ if (RootDominatesEnd && !isa<Instruction>(RHS))
+ addToLeaderTable(LVN, RHS, Root.getEnd());
+
+ // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
+ // LHS always has at least one use that is not dominated by Root, this will
+ // never do anything if LHS has only one use.
+ if (!LHS->hasOneUse()) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
+ : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart());
+
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ // Cached information for anything that uses LHS will be invalid.
+ if (MD)
+ MD->invalidateCachedPointerInfo(LHS);
+ }
+
+ // Now try to deduce additional equalities from this one. For example, if
+ // the known equality was "(A != B)" == "false" then it follows that A and B
+ // are equal in the scope. Only boolean equalities with an explicit true or
+ // false RHS are currently supported.
+ if (!RHS->getType()->isIntegerTy(1))
+ // Not a boolean equality - bail out.
+ continue;
+ ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ // RHS neither 'true' nor 'false' - bail out.
+ continue;
+ // Whether RHS equals 'true'. Otherwise it equals 'false'.
+ bool isKnownTrue = CI->isMinusOne();
+ bool isKnownFalse = !isKnownTrue;
+
+ // If "A && B" is known true then both A and B are known true. If "A || B"
+ // is known false then both A and B are known false.
+ Value *A, *B;
if ((isKnownTrue && match(LHS, m_LogicalAnd(m_Value(A), m_Value(B)))) ||
(isKnownFalse && match(LHS, m_LogicalOr(m_Value(A), m_Value(B))))) {
- Worklist.push_back(std::make_pair(A, RHS));
- Worklist.push_back(std::make_pair(B, RHS));
- continue;
- }
-
- // If we are propagating an equality like "(A == B)" == "true" then also
- // propagate the equality A == B. When propagating a comparison such as
- // "(A >= B)" == "true", replace all instances of "A < B" with "false".
- if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
- Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
-
- // If "A == B" is known true, or "A != B" is known false, then replace
- // A with B everywhere in the scope. For floating point operations, we
- // have to be careful since equality does not always imply equivalence.
- if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) ||
- (isKnownFalse && impliesEquivalanceIfFalse(Cmp)))
- Worklist.push_back(std::make_pair(Op0, Op1));
-
- // If "A >= B" is known true, replace "A < B" with false everywhere.
- CmpInst::Predicate NotPred = Cmp->getInversePredicate();
- Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
- // Since we don't have the instruction "A < B" immediately to hand, work
- // out the value number that it would have and use that to find an
- // appropriate instruction (if any).
- uint32_t NextNum = VN.getNextUnusedValueNumber();
- uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1);
- // If the number we were assigned was brand new then there is no point in
- // looking for an instruction realizing it: there cannot be one!
- if (Num < NextNum) {
- Value *NotCmp = findLeader(Root.getEnd(), Num);
- if (NotCmp && isa<Instruction>(NotCmp)) {
- unsigned NumReplacements =
- DominatesByEdge
- ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
- : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
- Root.getStart());
- Changed |= NumReplacements > 0;
- NumGVNEqProp += NumReplacements;
- // Cached information for anything that uses NotCmp will be invalid.
- if (MD)
- MD->invalidateCachedPointerInfo(NotCmp);
- }
- }
- // Ensure that any instruction in scope that gets the "A < B" value number
- // is replaced with false.
- // The leader table only tracks basic blocks, not edges. Only add to it if we
- // have the simple case where the edge dominates the end.
- if (RootDominatesEnd)
- addToLeaderTable(Num, NotVal, Root.getEnd());
-
- continue;
- }
- }
-
- return Changed;
-}
-
-/// When calculating availability, handle an instruction
-/// by inserting it into the appropriate sets
-bool GVN::processInstruction(Instruction *I) {
- // Ignore dbg info intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- return false;
-
- // If the instruction can be easily simplified then do so now in preference
- // to value numbering it. Value numbering often exposes redundancies, for
- // example if it determines that %y is equal to %x then the instruction
- // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
- const DataLayout &DL = I->getModule()->getDataLayout();
- if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
- bool Changed = false;
- if (!I->use_empty()) {
- I->replaceAllUsesWith(V);
- Changed = true;
- }
- if (isInstructionTriviallyDead(I, TLI)) {
- markInstructionForDeletion(I);
- Changed = true;
- }
- if (Changed) {
- if (MD && V->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(V);
- ++NumGVNSimpl;
- return true;
- }
- }
-
- if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
- if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
- return processAssumeIntrinsic(IntrinsicI);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (processLoad(LI))
- return true;
-
- unsigned Num = VN.lookupOrAdd(LI);
- addToLeaderTable(Num, LI, LI->getParent());
- return false;
- }
-
- // For conditional branches, we can perform simple conditional propagation on
- // the condition value itself.
- if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (!BI->isConditional())
- return false;
-
- if (isa<Constant>(BI->getCondition()))
- return processFoldableCondBr(BI);
-
- Value *BranchCond = BI->getCondition();
- BasicBlock *TrueSucc = BI->getSuccessor(0);
- BasicBlock *FalseSucc = BI->getSuccessor(1);
- // Avoid multiple edges early.
- if (TrueSucc == FalseSucc)
- return false;
-
- BasicBlock *Parent = BI->getParent();
- bool Changed = false;
-
- Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
- BasicBlockEdge TrueE(Parent, TrueSucc);
- Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
-
- Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
- BasicBlockEdge FalseE(Parent, FalseSucc);
- Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
-
- return Changed;
- }
-
- // For switches, propagate the case values into the case destinations.
- if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
- Value *SwitchCond = SI->getCondition();
- BasicBlock *Parent = SI->getParent();
- bool Changed = false;
-
- // Remember how many outgoing edges there are to every successor.
- SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
- for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
- ++SwitchEdges[SI->getSuccessor(i)];
-
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- BasicBlock *Dst = i->getCaseSuccessor();
- // If there is only a single edge, propagate the case value into it.
- if (SwitchEdges.lookup(Dst) == 1) {
- BasicBlockEdge E(Parent, Dst);
- Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
- }
- }
- return Changed;
- }
-
- // Instructions with void type don't return a value, so there's
- // no point in trying to find redundancies in them.
- if (I->getType()->isVoidTy())
- return false;
-
- uint32_t NextNum = VN.getNextUnusedValueNumber();
- unsigned Num = VN.lookupOrAdd(I);
-
- // Allocations are always uniquely numbered, so we can save time and memory
- // by fast failing them.
- if (isa<AllocaInst>(I) || I->isTerminator() || isa<PHINode>(I)) {
- addToLeaderTable(Num, I, I->getParent());
- return false;
- }
-
- // If the number we were assigned was a brand new VN, then we don't
- // need to do a lookup to see if the number already exists
- // somewhere in the domtree: it can't!
- if (Num >= NextNum) {
- addToLeaderTable(Num, I, I->getParent());
- return false;
- }
-
- // Perform fast-path value-number based elimination of values inherited from
- // dominators.
- Value *Repl = findLeader(I->getParent(), Num);
- if (!Repl) {
- // Failure, just remember this instance for future use.
- addToLeaderTable(Num, I, I->getParent());
- return false;
- } else if (Repl == I) {
- // If I was the result of a shortcut PRE, it might already be in the table
- // and the best replacement for itself. Nothing to do.
- return false;
- }
-
- // Remove it!
- patchAndReplaceAllUsesWith(I, Repl);
- if (MD && Repl->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(Repl);
- markInstructionForDeletion(I);
- return true;
-}
-
-/// runOnFunction - This is the main transformation entry point for a function.
-bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
- const TargetLibraryInfo &RunTLI, AAResults &RunAA,
- MemoryDependenceResults *RunMD, LoopInfo *LI,
+ Worklist.push_back(std::make_pair(A, RHS));
+ Worklist.push_back(std::make_pair(B, RHS));
+ continue;
+ }
+
+ // If we are propagating an equality like "(A == B)" == "true" then also
+ // propagate the equality A == B. When propagating a comparison such as
+ // "(A >= B)" == "true", replace all instances of "A < B" with "false".
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
+ Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+
+ // If "A == B" is known true, or "A != B" is known false, then replace
+ // A with B everywhere in the scope. For floating point operations, we
+ // have to be careful since equality does not always imply equivalence.
+ if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) ||
+ (isKnownFalse && impliesEquivalanceIfFalse(Cmp)))
+ Worklist.push_back(std::make_pair(Op0, Op1));
+
+ // If "A >= B" is known true, replace "A < B" with false everywhere.
+ CmpInst::Predicate NotPred = Cmp->getInversePredicate();
+ Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
+ // Since we don't have the instruction "A < B" immediately to hand, work
+ // out the value number that it would have and use that to find an
+ // appropriate instruction (if any).
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+ // If the number we were assigned was brand new then there is no point in
+ // looking for an instruction realizing it: there cannot be one!
+ if (Num < NextNum) {
+ Value *NotCmp = findLeader(Root.getEnd(), Num);
+ if (NotCmp && isa<Instruction>(NotCmp)) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
+ : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
+ Root.getStart());
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ // Cached information for anything that uses NotCmp will be invalid.
+ if (MD)
+ MD->invalidateCachedPointerInfo(NotCmp);
+ }
+ }
+ // Ensure that any instruction in scope that gets the "A < B" value number
+ // is replaced with false.
+ // The leader table only tracks basic blocks, not edges. Only add to it if we
+ // have the simple case where the edge dominates the end.
+ if (RootDominatesEnd)
+ addToLeaderTable(Num, NotVal, Root.getEnd());
+
+ continue;
+ }
+ }
+
+ return Changed;
+}
+
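The worklist structure of propagateEquality above is easiest to see in isolation: each deduced fact may spawn further facts (a true AND makes both operands true, a false OR makes both operands false). Below is a toy self-contained version over an index-based expression pool; the Kind/Expr types and propagateBooleanFacts are invented for this sketch and ignore the dominance checks and use replacement entirely.

#include <utility>
#include <vector>

enum class Kind { Leaf, And, Or };
struct Expr { Kind K = Kind::Leaf; int A = -1, B = -1; };

// Deduce boolean facts transitively: returns (expression index, known value)
// pairs, starting from one root fact.
std::vector<std::pair<int, bool>>
propagateBooleanFacts(const std::vector<Expr> &Pool, int Root, bool RootVal) {
  std::vector<std::pair<int, bool>> Facts;
  std::vector<std::pair<int, bool>> Worklist{{Root, RootVal}};
  while (!Worklist.empty()) {
    std::pair<int, bool> Item = Worklist.back();
    Worklist.pop_back();
    Facts.push_back(Item);
    const Expr &E = Pool[Item.first];
    if ((E.K == Kind::And && Item.second) ||
        (E.K == Kind::Or && !Item.second)) {
      // A true AND implies both operands are true; a false OR implies both
      // operands are false.
      Worklist.push_back({E.A, Item.second});
      Worklist.push_back({E.B, Item.second});
    }
  }
  return Facts;
}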
+/// When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I) {
+ // Ignore dbg info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // If the instruction can be easily simplified then do so now in preference
+ // to value numbering it. Value numbering often exposes redundancies, for
+ // example if it determines that %y is equal to %x then the instruction
+ // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ markInstructionForDeletion(I);
+ Changed = true;
+ }
+ if (Changed) {
+ if (MD && V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ ++NumGVNSimpl;
+ return true;
+ }
+ }
+
+ if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
+ if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
+ return processAssumeIntrinsic(IntrinsicI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (processLoad(LI))
+ return true;
+
+ unsigned Num = VN.lookupOrAdd(LI);
+ addToLeaderTable(Num, LI, LI->getParent());
+ return false;
+ }
+
+ // For conditional branches, we can perform simple conditional propagation on
+ // the condition value itself.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (!BI->isConditional())
+ return false;
+
+ if (isa<Constant>(BI->getCondition()))
+ return processFoldableCondBr(BI);
+
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ // Avoid multiple edges early.
+ if (TrueSucc == FalseSucc)
+ return false;
+
+ BasicBlock *Parent = BI->getParent();
+ bool Changed = false;
+
+ Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
+ BasicBlockEdge TrueE(Parent, TrueSucc);
+ Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
+
+ Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
+ BasicBlockEdge FalseE(Parent, FalseSucc);
+ Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
+
+ return Changed;
+ }
+
+ // For switches, propagate the case values into the case destinations.
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Value *SwitchCond = SI->getCondition();
+ BasicBlock *Parent = SI->getParent();
+ bool Changed = false;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+ ++SwitchEdges[SI->getSuccessor(i)];
+
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ BasicBlock *Dst = i->getCaseSuccessor();
+ // If there is only a single edge, propagate the case value into it.
+ if (SwitchEdges.lookup(Dst) == 1) {
+ BasicBlockEdge E(Parent, Dst);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
+ }
+ }
+ return Changed;
+ }
+
+ // Instructions with void type don't return a value, so there's
+ // no point in trying to find redundancies in them.
+ if (I->getType()->isVoidTy())
+ return false;
+
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ unsigned Num = VN.lookupOrAdd(I);
+
+ // Allocations are always uniquely numbered, so we can save time and memory
+ // by fast failing them.
+ if (isa<AllocaInst>(I) || I->isTerminator() || isa<PHINode>(I)) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // If the number we were assigned was a brand new VN, then we don't
+ // need to do a lookup to see if the number already exists
+ // somewhere in the domtree: it can't!
+ if (Num >= NextNum) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // Perform fast-path value-number based elimination of values inherited from
+ // dominators.
+ Value *Repl = findLeader(I->getParent(), Num);
+ if (!Repl) {
+ // Failure, just remember this instance for future use.
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ } else if (Repl == I) {
+ // If I was the result of a shortcut PRE, it might already be in the table
+ // and the best replacement for itself. Nothing to do.
+ return false;
+ }
+
+ // Remove it!
+ patchAndReplaceAllUsesWith(I, Repl);
+ if (MD && Repl->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Repl);
+ markInstructionForDeletion(I);
+ return true;
+}
+
+/// runOnFunction - This is the main transformation entry point for a function.
+bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
+ const TargetLibraryInfo &RunTLI, AAResults &RunAA,
+ MemoryDependenceResults *RunMD, LoopInfo *LI,
OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) {
- AC = &RunAC;
- DT = &RunDT;
- VN.setDomTree(DT);
- TLI = &RunTLI;
- VN.setAliasAnalysis(&RunAA);
- MD = RunMD;
- ImplicitControlFlowTracking ImplicitCFT;
- ICF = &ImplicitCFT;
- this->LI = LI;
- VN.setMemDep(MD);
- ORE = RunORE;
- InvalidBlockRPONumbers = true;
+ AC = &RunAC;
+ DT = &RunDT;
+ VN.setDomTree(DT);
+ TLI = &RunTLI;
+ VN.setAliasAnalysis(&RunAA);
+ MD = RunMD;
+ ImplicitControlFlowTracking ImplicitCFT;
+ ICF = &ImplicitCFT;
+ this->LI = LI;
+ VN.setMemDep(MD);
+ ORE = RunORE;
+ InvalidBlockRPONumbers = true;
MemorySSAUpdater Updater(MSSA);
MSSAU = MSSA ? &Updater : nullptr;
-
- bool Changed = false;
- bool ShouldContinue = true;
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- // Merge unconditional branches, allowing PRE to catch more
- // optimization opportunities.
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
- BasicBlock *BB = &*FI++;
-
+
+ bool Changed = false;
+ bool ShouldContinue = true;
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ // Merge unconditional branches, allowing PRE to catch more
+ // optimization opportunities.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+ BasicBlock *BB = &*FI++;
+
bool removedBlock = MergeBlockIntoPredecessor(BB, &DTU, LI, MSSAU, MD);
- if (removedBlock)
- ++NumGVNBlocks;
-
- Changed |= removedBlock;
- }
-
- unsigned Iteration = 0;
- while (ShouldContinue) {
- LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
- ShouldContinue = iterateOnFunction(F);
- Changed |= ShouldContinue;
- ++Iteration;
- }
-
- if (isPREEnabled()) {
- // Fabricate val-num for dead-code in order to suppress assertion in
- // performPRE().
- assignValNumForDeadCode();
- bool PREChanged = true;
- while (PREChanged) {
- PREChanged = performPRE(F);
- Changed |= PREChanged;
- }
- }
-
- // FIXME: Should perform GVN again after PRE does something. PRE can move
- // computations into blocks where they become fully redundant. Note that
- // we can't do this until PRE's critical edge splitting updates memdep.
- // Actually, when this happens, we should just fully integrate PRE into GVN.
-
- cleanupGlobalSets();
- // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
- // iteration.
- DeadBlocks.clear();
-
+ if (removedBlock)
+ ++NumGVNBlocks;
+
+ Changed |= removedBlock;
+ }
+
+ unsigned Iteration = 0;
+ while (ShouldContinue) {
+ LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+ ShouldContinue = iterateOnFunction(F);
+ Changed |= ShouldContinue;
+ ++Iteration;
+ }
+
+ if (isPREEnabled()) {
+ // Fabricate val-num for dead-code in order to suppress assertion in
+ // performPRE().
+ assignValNumForDeadCode();
+ bool PREChanged = true;
+ while (PREChanged) {
+ PREChanged = performPRE(F);
+ Changed |= PREChanged;
+ }
+ }
+
+ // FIXME: Should perform GVN again after PRE does something. PRE can move
+ // computations into blocks where they become fully redundant. Note that
+ // we can't do this until PRE's critical edge splitting updates memdep.
+ // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+ cleanupGlobalSets();
+ // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
+ // iteration.
+ DeadBlocks.clear();
+
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
- return Changed;
-}
-
-bool GVN::processBlock(BasicBlock *BB) {
- // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
- // (and incrementing BI before processing an instruction).
- assert(InstrsToErase.empty() &&
- "We expect InstrsToErase to be empty across iterations");
- if (DeadBlocks.count(BB))
- return false;
-
- // Clearing map before every BB because it can be used only for single BB.
- ReplaceOperandsWithMap.clear();
- bool ChangedFunction = false;
-
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
- BI != BE;) {
- if (!ReplaceOperandsWithMap.empty())
- ChangedFunction |= replaceOperandsForInBlockEquality(&*BI);
- ChangedFunction |= processInstruction(&*BI);
-
- if (InstrsToErase.empty()) {
- ++BI;
- continue;
- }
-
- // If we need some instructions deleted, do it now.
- NumGVNInstr += InstrsToErase.size();
-
- // Avoid iterator invalidation.
- bool AtStart = BI == BB->begin();
- if (!AtStart)
- --BI;
-
- for (auto *I : InstrsToErase) {
- assert(I->getParent() == BB && "Removing instruction from wrong block?");
- LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
- salvageKnowledge(I, AC);
- salvageDebugInfo(*I);
- if (MD) MD->removeInstruction(I);
+ return Changed;
+}
+
+bool GVN::processBlock(BasicBlock *BB) {
+ // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
+ // (and incrementing BI before processing an instruction).
+ assert(InstrsToErase.empty() &&
+ "We expect InstrsToErase to be empty across iterations");
+ if (DeadBlocks.count(BB))
+ return false;
+
+ // Clearing map before every BB because it can be used only for single BB.
+ ReplaceOperandsWithMap.clear();
+ bool ChangedFunction = false;
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE;) {
+ if (!ReplaceOperandsWithMap.empty())
+ ChangedFunction |= replaceOperandsForInBlockEquality(&*BI);
+ ChangedFunction |= processInstruction(&*BI);
+
+ if (InstrsToErase.empty()) {
+ ++BI;
+ continue;
+ }
+
+ // If we need some instructions deleted, do it now.
+ NumGVNInstr += InstrsToErase.size();
+
+ // Avoid iterator invalidation.
+ bool AtStart = BI == BB->begin();
+ if (!AtStart)
+ --BI;
+
+ for (auto *I : InstrsToErase) {
+ assert(I->getParent() == BB && "Removing instruction from wrong block?");
+ LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ salvageKnowledge(I, AC);
+ salvageDebugInfo(*I);
+ if (MD) MD->removeInstruction(I);
if (MSSAU)
MSSAU->removeMemoryAccess(I);
- LLVM_DEBUG(verifyRemoved(I));
- ICF->removeInstruction(I);
- I->eraseFromParent();
- }
- InstrsToErase.clear();
-
- if (AtStart)
- BI = BB->begin();
- else
- ++BI;
- }
-
- return ChangedFunction;
-}
-
-// Instantiate an expression in a predecessor that lacked it.
-bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
- BasicBlock *Curr, unsigned int ValNo) {
- // Because we are going top-down through the block, all value numbers
- // will be available in the predecessor by the time we need them. Any
- // that weren't originally present will have been instantiated earlier
- // in this loop.
- bool success = true;
- for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
- Value *Op = Instr->getOperand(i);
- if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
- continue;
- // This could be a newly inserted instruction, in which case, we won't
- // find a value number, and should give up before we hurt ourselves.
- // FIXME: Rewrite the infrastructure to make it easier to value number
- // and process newly inserted instructions.
- if (!VN.exists(Op)) {
- success = false;
- break;
- }
- uint32_t TValNo =
- VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this);
- if (Value *V = findLeader(Pred, TValNo)) {
- Instr->setOperand(i, V);
- } else {
- success = false;
- break;
- }
- }
-
- // Fail out if we encounter an operand that is not available in
- // the PRE predecessor. This is typically because of loads which
- // are not value numbered precisely.
- if (!success)
- return false;
-
- Instr->insertBefore(Pred->getTerminator());
- Instr->setName(Instr->getName() + ".pre");
- Instr->setDebugLoc(Instr->getDebugLoc());
-
- unsigned Num = VN.lookupOrAdd(Instr);
- VN.add(Instr, Num);
-
- // Update the availability map to include the new instruction.
- addToLeaderTable(Num, Instr, Pred);
- return true;
-}
-
-bool GVN::performScalarPRE(Instruction *CurInst) {
- if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
- isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
- CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
- isa<DbgInfoIntrinsic>(CurInst))
- return false;
-
- // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
- // sinking the compare again, and it would force the code generator to
- // move the i1 from processor flags or predicate registers into a general
- // purpose register.
- if (isa<CmpInst>(CurInst))
- return false;
-
- // Don't do PRE on GEPs. The inserted PHI would prevent CodeGenPrepare from
- // sinking the addressing mode computation back to its uses. Extending the
- // GEP's live range increases the register pressure, and therefore it can
- // introduce unnecessary spills.
- //
- // This doesn't prevent Load PRE. PHI translation will make the GEP available
- // to the load by moving it to the predecessor block if necessary.
- if (isa<GetElementPtrInst>(CurInst))
- return false;
-
+ LLVM_DEBUG(verifyRemoved(I));
+ ICF->removeInstruction(I);
+ I->eraseFromParent();
+ }
+ InstrsToErase.clear();
+
+ if (AtStart)
+ BI = BB->begin();
+ else
+ ++BI;
+ }
+
+ return ChangedFunction;
+}
+
+// Instantiate an expression in a predecessor that lacked it.
+bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ BasicBlock *Curr, unsigned int ValNo) {
+ // Because we are going top-down through the block, all value numbers
+ // will be available in the predecessor by the time we need them. Any
+ // that weren't originally present will have been instantiated earlier
+ // in this loop.
+ bool success = true;
+ for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
+ Value *Op = Instr->getOperand(i);
+ if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+ continue;
+ // This could be a newly inserted instruction, in which case, we won't
+ // find a value number, and should give up before we hurt ourselves.
+ // FIXME: Rewrite the infrastructure to make it easier to value number
+ // and process newly inserted instructions.
+ if (!VN.exists(Op)) {
+ success = false;
+ break;
+ }
+ uint32_t TValNo =
+ VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this);
+ if (Value *V = findLeader(Pred, TValNo)) {
+ Instr->setOperand(i, V);
+ } else {
+ success = false;
+ break;
+ }
+ }
+
+ // Fail out if we encounter an operand that is not available in
+ // the PRE predecessor. This is typically because of loads which
+ // are not value numbered precisely.
+ if (!success)
+ return false;
+
+ Instr->insertBefore(Pred->getTerminator());
+ Instr->setName(Instr->getName() + ".pre");
+ Instr->setDebugLoc(Instr->getDebugLoc());
+
+ unsigned Num = VN.lookupOrAdd(Instr);
+ VN.add(Instr, Num);
+
+ // Update the availability map to include the new instruction.
+ addToLeaderTable(Num, Instr, Pred);
+ return true;
+}
+
+bool GVN::performScalarPRE(Instruction *CurInst) {
+ if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
+ isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ return false;
+
+ // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+ // sinking the compare again, and it would force the code generator to
+ // move the i1 from processor flags or predicate registers into a general
+ // purpose register.
+ if (isa<CmpInst>(CurInst))
+ return false;
+
+ // Don't do PRE on GEPs. The inserted PHI would prevent CodeGenPrepare from
+ // sinking the addressing mode computation back to its uses. Extending the
+ // GEP's live range increases the register pressure, and therefore it can
+ // introduce unnecessary spills.
+ //
+ // This doesn't prevent Load PRE. PHI translation will make the GEP available
+ // to the load by moving it to the predecessor block if necessary.
+ if (isa<GetElementPtrInst>(CurInst))
+ return false;
+
if (auto *CallB = dyn_cast<CallBase>(CurInst)) {
// We don't currently value number ANY inline asm calls.
- if (CallB->isInlineAsm())
- return false;
+ if (CallB->isInlineAsm())
+ return false;
// Don't do PRE on convergent calls.
if (CallB->isConvergent())
return false;
}
-
- uint32_t ValNo = VN.lookup(CurInst);
-
- // Look for the predecessors for PRE opportunities. We're
- // only trying to solve the basic diamond case, where
- // a value is computed in the successor and one predecessor,
- // but not the other. We also explicitly disallow cases
- // where the successor is its own predecessor, because they're
- // more complicated to get right.
- unsigned NumWith = 0;
- unsigned NumWithout = 0;
- BasicBlock *PREPred = nullptr;
- BasicBlock *CurrentBlock = CurInst->getParent();
-
- // Update the RPO numbers for this function.
- if (InvalidBlockRPONumbers)
- assignBlockRPONumber(*CurrentBlock->getParent());
-
- SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
- for (BasicBlock *P : predecessors(CurrentBlock)) {
- // We're not interested in PRE of blocks whose predecessors are
- // not reachable.
- if (!DT->isReachableFromEntry(P)) {
- NumWithout = 2;
- break;
- }
- // It is not safe to do PRE when P->CurrentBlock is a loop backedge, and
- // when CurInst has an operand defined in CurrentBlock (so it may be defined
- // by phi in the loop header).
- assert(BlockRPONumber.count(P) && BlockRPONumber.count(CurrentBlock) &&
- "Invalid BlockRPONumber map.");
- if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] &&
- llvm::any_of(CurInst->operands(), [&](const Use &U) {
- if (auto *Inst = dyn_cast<Instruction>(U.get()))
- return Inst->getParent() == CurrentBlock;
- return false;
- })) {
- NumWithout = 2;
- break;
- }
-
- uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
- Value *predV = findLeader(P, TValNo);
- if (!predV) {
- predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
- PREPred = P;
- ++NumWithout;
- } else if (predV == CurInst) {
- /* CurInst dominates this predecessor. */
- NumWithout = 2;
- break;
- } else {
- predMap.push_back(std::make_pair(predV, P));
- ++NumWith;
- }
- }
-
- // Don't do PRE when it might increase code size, i.e. when
- // we would need to insert instructions in more than one pred.
- if (NumWithout > 1 || NumWith == 0)
- return false;
-
- // We may have a case where all predecessors have the instruction,
- // and we just need to insert a phi node. Otherwise, perform
- // insertion.
- Instruction *PREInstr = nullptr;
-
- if (NumWithout != 0) {
- if (!isSafeToSpeculativelyExecute(CurInst)) {
- // It is only valid to insert a new instruction if the current instruction
- // is always executed. An instruction with implicit control flow could
- // prevent us from doing it. If we cannot speculate the execution, then
- // PRE should be prohibited.
- if (ICF->isDominatedByICFIFromSameBlock(CurInst))
- return false;
- }
-
- // Don't do PRE across indirect branch.
- if (isa<IndirectBrInst>(PREPred->getTerminator()))
- return false;
-
- // Don't do PRE across callbr.
- // FIXME: Can we do this across the fallthrough edge?
- if (isa<CallBrInst>(PREPred->getTerminator()))
- return false;
-
- // We can't do PRE safely on a critical edge, so instead we schedule
- // the edge to be split and perform the PRE the next time we iterate
- // on the function.
- unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
- if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
- toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
- return false;
- }
- // We need to insert somewhere, so let's give it a shot
- PREInstr = CurInst->clone();
- if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
- // If we failed insertion, make sure we remove the instruction.
- LLVM_DEBUG(verifyRemoved(PREInstr));
- PREInstr->deleteValue();
- return false;
- }
- }
-
- // Either we should have filled in the PRE instruction, or we should
- // not have needed insertions.
- assert(PREInstr != nullptr || NumWithout == 0);
-
- ++NumGVNPRE;
-
- // Create a PHI to make the value available in this block.
- PHINode *Phi =
- PHINode::Create(CurInst->getType(), predMap.size(),
- CurInst->getName() + ".pre-phi", &CurrentBlock->front());
- for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
- if (Value *V = predMap[i].first) {
- // If we use an existing value in this phi, we have to patch the original
- // value because the phi will be used to replace a later value.
- patchReplacementInstruction(CurInst, V);
- Phi->addIncoming(V, predMap[i].second);
- } else
- Phi->addIncoming(PREInstr, PREPred);
- }
-
- VN.add(Phi, ValNo);
- // After creating a new PHI for ValNo, the phi translate result for ValNo will
- // be changed, so erase the related stale entries in phi translate cache.
- VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
- addToLeaderTable(ValNo, Phi, CurrentBlock);
- Phi->setDebugLoc(CurInst->getDebugLoc());
- CurInst->replaceAllUsesWith(Phi);
- if (MD && Phi->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(Phi);
- VN.erase(CurInst);
- removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
-
- LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
- if (MD)
- MD->removeInstruction(CurInst);
+
+ uint32_t ValNo = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
+ unsigned NumWith = 0;
+ unsigned NumWithout = 0;
+ BasicBlock *PREPred = nullptr;
+ BasicBlock *CurrentBlock = CurInst->getParent();
+
+ // Update the RPO numbers for this function.
+ if (InvalidBlockRPONumbers)
+ assignBlockRPONumber(*CurrentBlock->getParent());
+
+ SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
+ for (BasicBlock *P : predecessors(CurrentBlock)) {
+    // We're not interested in PRE when one of the predecessors is not
+    // reachable from the entry block.
+ if (!DT->isReachableFromEntry(P)) {
+ NumWithout = 2;
+ break;
+ }
+    // It is not safe to do PRE when P->CurrentBlock is a loop backedge and
+    // CurInst has an operand defined in CurrentBlock (so it may be defined
+    // by a phi in the loop header).
+ assert(BlockRPONumber.count(P) && BlockRPONumber.count(CurrentBlock) &&
+ "Invalid BlockRPONumber map.");
+ if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] &&
+ llvm::any_of(CurInst->operands(), [&](const Use &U) {
+ if (auto *Inst = dyn_cast<Instruction>(U.get()))
+ return Inst->getParent() == CurrentBlock;
+ return false;
+ })) {
+ NumWithout = 2;
+ break;
+ }
+
+ uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
+ Value *predV = findLeader(P, TValNo);
+ if (!predV) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+ PREPred = P;
+ ++NumWithout;
+ } else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
+ NumWithout = 2;
+ break;
+ } else {
+ predMap.push_back(std::make_pair(predV, P));
+ ++NumWith;
+ }
+ }
+
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (NumWithout > 1 || NumWith == 0)
+ return false;
+
+ // We may have a case where all predecessors have the instruction,
+ // and we just need to insert a phi node. Otherwise, perform
+ // insertion.
+ Instruction *PREInstr = nullptr;
+
+ if (NumWithout != 0) {
+ if (!isSafeToSpeculativelyExecute(CurInst)) {
+ // It is only valid to insert a new instruction if the current instruction
+ // is always executed. An instruction with implicit control flow could
+ // prevent us from doing it. If we cannot speculate the execution, then
+ // PRE should be prohibited.
+ if (ICF->isDominatedByICFIFromSameBlock(CurInst))
+ return false;
+ }
+
+ // Don't do PRE across indirect branch.
+ if (isa<IndirectBrInst>(PREPred->getTerminator()))
+ return false;
+
+ // Don't do PRE across callbr.
+ // FIXME: Can we do this across the fallthrough edge?
+ if (isa<CallBrInst>(PREPred->getTerminator()))
+ return false;
+
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+ if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+ return false;
+ }
+ // We need to insert somewhere, so let's give it a shot
+ PREInstr = CurInst->clone();
+ if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
+ // If we failed insertion, make sure we remove the instruction.
+ LLVM_DEBUG(verifyRemoved(PREInstr));
+ PREInstr->deleteValue();
+ return false;
+ }
+ }
+
+ // Either we should have filled in the PRE instruction, or we should
+ // not have needed insertions.
+ assert(PREInstr != nullptr || NumWithout == 0);
+
+ ++NumGVNPRE;
+
+ // Create a PHI to make the value available in this block.
+ PHINode *Phi =
+ PHINode::Create(CurInst->getType(), predMap.size(),
+ CurInst->getName() + ".pre-phi", &CurrentBlock->front());
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first) {
+ // If we use an existing value in this phi, we have to patch the original
+ // value because the phi will be used to replace a later value.
+ patchReplacementInstruction(CurInst, V);
+ Phi->addIncoming(V, predMap[i].second);
+ } else
+ Phi->addIncoming(PREInstr, PREPred);
+ }
+
+ VN.add(Phi, ValNo);
+ // After creating a new PHI for ValNo, the phi translate result for ValNo will
+ // be changed, so erase the related stale entries in phi translate cache.
+ VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
+ addToLeaderTable(ValNo, Phi, CurrentBlock);
+ Phi->setDebugLoc(CurInst->getDebugLoc());
+ CurInst->replaceAllUsesWith(Phi);
+ if (MD && Phi->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Phi);
+ VN.erase(CurInst);
+ removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+ LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ if (MD)
+ MD->removeInstruction(CurInst);
if (MSSAU)
MSSAU->removeMemoryAccess(CurInst);
- LLVM_DEBUG(verifyRemoved(CurInst));
- // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
- // some assertion failures.
- ICF->removeInstruction(CurInst);
- CurInst->eraseFromParent();
- ++NumGVNInstr;
-
- return true;
-}
-
-/// Perform a purely local form of PRE that looks for diamond
-/// control flow patterns and attempts to perform simple PRE at the join point.
-bool GVN::performPRE(Function &F) {
- bool Changed = false;
- for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
- // Nothing to PRE in the entry block.
- if (CurrentBlock == &F.getEntryBlock())
- continue;
-
- // Don't perform PRE on an EH pad.
- if (CurrentBlock->isEHPad())
- continue;
-
- for (BasicBlock::iterator BI = CurrentBlock->begin(),
- BE = CurrentBlock->end();
- BI != BE;) {
- Instruction *CurInst = &*BI++;
- Changed |= performScalarPRE(CurInst);
- }
- }
-
- if (splitCriticalEdges())
- Changed = true;
-
- return Changed;
-}
-
-/// Split the critical edge connecting the given two blocks, and return
-/// the block inserted to the critical edge.
-BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
- // GVN does not require loop-simplify, do not try to preserve it if it is not
- // possible.
- BasicBlock *BB = SplitCriticalEdge(
- Pred, Succ,
+ LLVM_DEBUG(verifyRemoved(CurInst));
+ // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
+ // some assertion failures.
+ ICF->removeInstruction(CurInst);
+ CurInst->eraseFromParent();
+ ++NumGVNInstr;
+
+ return true;
+}
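+
+// Illustrative sketch (hypothetical block and value names): the "diamond"
+// that performScalarPRE targets looks roughly like this in IR:
+//
+//   then:                          ; computes the expression
+//     %v1 = add i32 %a, %b
+//     br label %merge
+//   else:                          ; does not compute it
+//     br label %merge
+//   merge:
+//     %v2 = add i32 %a, %b         ; partially redundant
+//
+// PRE clones the add into %else and then replaces %v2 with
+//   %v2.pre-phi = phi i32 [ %v1, %then ], [ %v1.clone, %else ]
+// so the expression is evaluated at most once on every path through %merge.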
+
+/// Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+ bool Changed = false;
+ for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+ // Nothing to PRE in the entry block.
+ if (CurrentBlock == &F.getEntryBlock())
+ continue;
+
+ // Don't perform PRE on an EH pad.
+ if (CurrentBlock->isEHPad())
+ continue;
+
+ for (BasicBlock::iterator BI = CurrentBlock->begin(),
+ BE = CurrentBlock->end();
+ BI != BE;) {
+ Instruction *CurInst = &*BI++;
+ Changed |= performScalarPRE(CurInst);
+ }
+ }
+
+ if (splitCriticalEdges())
+ Changed = true;
+
+ return Changed;
+}
+
+/// Split the critical edge connecting the given two blocks, and return
+/// the block inserted to the critical edge.
+BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+ // GVN does not require loop-simplify, do not try to preserve it if it is not
+ // possible.
+ BasicBlock *BB = SplitCriticalEdge(
+ Pred, Succ,
CriticalEdgeSplittingOptions(DT, LI, MSSAU).unsetPreserveLoopSimplify());
if (BB) {
if (MD)
MD->invalidateCachedPredecessors();
InvalidBlockRPONumbers = true;
}
- return BB;
-}
-
-/// Split critical edges found during the previous
-/// iteration that may enable further optimization.
-bool GVN::splitCriticalEdges() {
- if (toSplit.empty())
- return false;
+ return BB;
+}
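+
+// Illustrative note (hypothetical block names): an edge Pred->Succ is critical
+// when Pred has several successors and Succ has several predecessors, e.g.
+//
+//   pred:
+//     br i1 %c, label %succ, label %other   ; pred has two successors
+//   succ:                                   ; succ is also reached from elsewhere
+//
+// Splitting inserts a fresh block on that edge so that code (such as a PRE
+// copy) can be placed on the pred->succ path without affecting other paths.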
+
+/// Split critical edges found during the previous
+/// iteration that may enable further optimization.
+bool GVN::splitCriticalEdges() {
+ if (toSplit.empty())
+ return false;
bool Changed = false;
- do {
- std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
+ do {
+ std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
Changed |= SplitCriticalEdge(Edge.first, Edge.second,
CriticalEdgeSplittingOptions(DT, LI, MSSAU)) !=
nullptr;
- } while (!toSplit.empty());
+ } while (!toSplit.empty());
if (Changed) {
if (MD)
MD->invalidateCachedPredecessors();
InvalidBlockRPONumbers = true;
}
return Changed;
-}
-
-/// Executes one iteration of GVN
-bool GVN::iterateOnFunction(Function &F) {
- cleanupGlobalSets();
-
- // Top-down walk of the dominator tree
- bool Changed = false;
- // Needed for value numbering with phi construction to work.
- // RPOT walks the graph in its constructor and will not be invalidated during
- // processBlock.
- ReversePostOrderTraversal<Function *> RPOT(&F);
-
- for (BasicBlock *BB : RPOT)
- Changed |= processBlock(BB);
-
- return Changed;
-}
-
-void GVN::cleanupGlobalSets() {
- VN.clear();
- LeaderTable.clear();
- BlockRPONumber.clear();
- TableAllocator.Reset();
- ICF->clear();
- InvalidBlockRPONumbers = true;
-}
-
-/// Verify that the specified instruction does not occur in our
-/// internal data structures.
-void GVN::verifyRemoved(const Instruction *Inst) const {
- VN.verifyRemoved(Inst);
-
- // Walk through the value number scope to make sure the instruction isn't
- // ferreted away in it.
- for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
- I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
- const LeaderTableEntry *Node = &I->second;
- assert(Node->Val != Inst && "Inst still in value numbering scope!");
-
- while (Node->Next) {
- Node = Node->Next;
- assert(Node->Val != Inst && "Inst still in value numbering scope!");
- }
- }
-}
-
-/// BB is declared dead, which implied other blocks become dead as well. This
-/// function is to add all these blocks to "DeadBlocks". For the dead blocks'
-/// live successors, update their phi nodes by replacing the operands
-/// corresponding to dead blocks with UndefVal.
-void GVN::addDeadBlock(BasicBlock *BB) {
- SmallVector<BasicBlock *, 4> NewDead;
- SmallSetVector<BasicBlock *, 4> DF;
-
- NewDead.push_back(BB);
- while (!NewDead.empty()) {
- BasicBlock *D = NewDead.pop_back_val();
- if (DeadBlocks.count(D))
- continue;
-
- // All blocks dominated by D are dead.
- SmallVector<BasicBlock *, 8> Dom;
- DT->getDescendants(D, Dom);
- DeadBlocks.insert(Dom.begin(), Dom.end());
-
- // Figure out the dominance-frontier(D).
- for (BasicBlock *B : Dom) {
- for (BasicBlock *S : successors(B)) {
- if (DeadBlocks.count(S))
- continue;
-
- bool AllPredDead = true;
- for (BasicBlock *P : predecessors(S))
- if (!DeadBlocks.count(P)) {
- AllPredDead = false;
- break;
- }
-
- if (!AllPredDead) {
- // S could be proved dead later on. That is why we don't update phi
- // operands at this moment.
- DF.insert(S);
- } else {
- // While S is not dominated by D, it is dead by now. This could take
- // place if S already have a dead predecessor before D is declared
- // dead.
- NewDead.push_back(S);
- }
- }
- }
- }
-
- // For the dead blocks' live successors, update their phi nodes by replacing
- // the operands corresponding to dead blocks with UndefVal.
- for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
- I != E; I++) {
- BasicBlock *B = *I;
- if (DeadBlocks.count(B))
- continue;
-
- // First, split the critical edges. This might also create additional blocks
- // to preserve LoopSimplify form and adjust edges accordingly.
+}
+
+/// Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+ cleanupGlobalSets();
+
+ // Top-down walk of the dominator tree
+ bool Changed = false;
+ // Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
+
+ return Changed;
+}
+
+void GVN::cleanupGlobalSets() {
+ VN.clear();
+ LeaderTable.clear();
+ BlockRPONumber.clear();
+ TableAllocator.Reset();
+ ICF->clear();
+ InvalidBlockRPONumbers = true;
+}
+
+/// Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+ VN.verifyRemoved(Inst);
+
+ // Walk through the value number scope to make sure the instruction isn't
+ // ferreted away in it.
+ for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
+ I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
+ const LeaderTableEntry *Node = &I->second;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+
+ while (Node->Next) {
+ Node = Node->Next;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+ }
+ }
+}
+
+/// BB is declared dead, which implies other blocks become dead as well. This
+/// function adds all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
+void GVN::addDeadBlock(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 4> NewDead;
+ SmallSetVector<BasicBlock *, 4> DF;
+
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ BasicBlock *D = NewDead.pop_back_val();
+ if (DeadBlocks.count(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(D, Dom);
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
+ for (BasicBlock *B : Dom) {
+ for (BasicBlock *S : successors(B)) {
+ if (DeadBlocks.count(S))
+ continue;
+
+ bool AllPredDead = true;
+ for (BasicBlock *P : predecessors(S))
+ if (!DeadBlocks.count(P)) {
+ AllPredDead = false;
+ break;
+ }
+
+ if (!AllPredDead) {
+ // S could be proved dead later on. That is why we don't update phi
+ // operands at this moment.
+ DF.insert(S);
+ } else {
+          // Although S is not dominated by D, it is dead by now. This could
+          // happen if S already had a dead predecessor before D was declared
+          // dead.
+ NewDead.push_back(S);
+ }
+ }
+ }
+ }
+
+ // For the dead blocks' live successors, update their phi nodes by replacing
+ // the operands corresponding to dead blocks with UndefVal.
+ for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+ I != E; I++) {
+ BasicBlock *B = *I;
+ if (DeadBlocks.count(B))
+ continue;
+
+ // First, split the critical edges. This might also create additional blocks
+ // to preserve LoopSimplify form and adjust edges accordingly.
SmallVector<BasicBlock *, 4> Preds(predecessors(B));
- for (BasicBlock *P : Preds) {
- if (!DeadBlocks.count(P))
- continue;
-
+ for (BasicBlock *P : Preds) {
+ if (!DeadBlocks.count(P))
+ continue;
+
if (llvm::is_contained(successors(P), B) &&
- isCriticalEdge(P->getTerminator(), B)) {
- if (BasicBlock *S = splitCriticalEdges(P, B))
- DeadBlocks.insert(P = S);
- }
- }
-
- // Now undef the incoming values from the dead predecessors.
- for (BasicBlock *P : predecessors(B)) {
- if (!DeadBlocks.count(P))
- continue;
- for (PHINode &Phi : B->phis()) {
- Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType()));
- if (MD)
- MD->invalidateCachedPointerInfo(&Phi);
- }
- }
- }
-}
-
-// If the given branch is recognized as a foldable branch (i.e. conditional
-// branch with constant condition), it will perform following analyses and
-// transformation.
-// 1) If the dead out-coming edge is a critical-edge, split it. Let
-// R be the target of the dead out-coming edge.
-// 1) Identify the set of dead blocks implied by the branch's dead outcoming
-// edge. The result of this step will be {X| X is dominated by R}
-// 2) Identify those blocks which haves at least one dead predecessor. The
-// result of this step will be dominance-frontier(R).
-// 3) Update the PHIs in DF(R) by replacing the operands corresponding to
-// dead blocks with "UndefVal" in an hope these PHIs will optimized away.
-//
-// Return true iff *NEW* dead code are found.
-bool GVN::processFoldableCondBr(BranchInst *BI) {
- if (!BI || BI->isUnconditional())
- return false;
-
- // If a branch has two identical successors, we cannot declare either dead.
- if (BI->getSuccessor(0) == BI->getSuccessor(1))
- return false;
-
- ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
- if (!Cond)
- return false;
-
- BasicBlock *DeadRoot =
- Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0);
- if (DeadBlocks.count(DeadRoot))
- return false;
-
- if (!DeadRoot->getSinglePredecessor())
- DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
-
- addDeadBlock(DeadRoot);
- return true;
-}
-
-// performPRE() will trigger assert if it comes across an instruction without
-// associated val-num. As it normally has far more live instructions than dead
-// instructions, it makes more sense just to "fabricate" a val-number for the
-// dead code than checking if instruction involved is dead or not.
-void GVN::assignValNumForDeadCode() {
- for (BasicBlock *BB : DeadBlocks) {
- for (Instruction &Inst : *BB) {
- unsigned ValNum = VN.lookupOrAdd(&Inst);
- addToLeaderTable(ValNum, &Inst, BB);
- }
- }
-}
-
-class llvm::gvn::GVNLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep)
- : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) {
- initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
-
+ isCriticalEdge(P->getTerminator(), B)) {
+ if (BasicBlock *S = splitCriticalEdges(P, B))
+ DeadBlocks.insert(P = S);
+ }
+ }
+
+ // Now undef the incoming values from the dead predecessors.
+ for (BasicBlock *P : predecessors(B)) {
+ if (!DeadBlocks.count(P))
+ continue;
+ for (PHINode &Phi : B->phis()) {
+ Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType()));
+ if (MD)
+ MD->invalidateCachedPointerInfo(&Phi);
+ }
+ }
+ }
+}
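+
+// Illustrative sketch (hypothetical names): once a predecessor %dead of a
+// still-live block has been marked dead above, a phi such as
+//
+//   %p = phi i32 [ %x, %dead ], [ %y, %ok ]
+//
+// is rewritten to
+//
+//   %p = phi i32 [ undef, %dead ], [ %y, %ok ]
+//
+// which later simplification can usually fold to %y once the dead edge goes
+// away.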
+
+// If the given branch is recognized as a foldable branch (i.e. a conditional
+// branch with a constant condition), this function performs the following
+// analyses and transformations:
+//  1) If the dead outgoing edge is a critical edge, split it. Let
+//     R be the target of the dead outgoing edge.
+//  2) Identify the set of dead blocks implied by the branch's dead outgoing
+//     edge. The result of this step will be {X | X is dominated by R}.
+//  3) Identify those blocks which have at least one dead predecessor. The
+//     result of this step will be dominance-frontier(R).
+//  4) Update the PHIs in DF(R) by replacing the operands corresponding to
+//     dead blocks with "UndefVal", in the hope that these PHIs will be
+//     optimized away.
+//
+// Return true iff *NEW* dead code is found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If a branch has two identical successors, we cannot declare either dead.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ return false;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *DeadRoot =
+ Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0);
+ if (DeadBlocks.count(DeadRoot))
+ return false;
+
+ if (!DeadRoot->getSinglePredecessor())
+ DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+ addDeadBlock(DeadRoot);
+ return true;
+}
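+
+// Illustrative example (hypothetical labels): a branch such as
+//
+//   br i1 true, label %taken, label %nottaken
+//
+// is foldable; %nottaken becomes the dead root R, and addDeadBlock above then
+// marks everything dominated by %nottaken as dead.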
+
+// performPRE() will trigger an assert if it comes across an instruction
+// without an associated value number. As a function normally has far more live
+// instructions than dead instructions, it makes more sense just to "fabricate"
+// a value number for the dead code than to check whether each instruction
+// involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+ for (BasicBlock *BB : DeadBlocks) {
+ for (Instruction &Inst : *BB) {
+ unsigned ValNum = VN.lookupOrAdd(&Inst);
+ addToLeaderTable(ValNum, &Inst, BB);
+ }
+ }
+}
+
+class llvm::gvn::GVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep)
+ : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) {
+ initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+
auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- return Impl.runImpl(
- F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
- getAnalysis<AAResultsWrapperPass>().getAAResults(),
- Impl.isMemDepEnabled()
- ? &getAnalysis<MemoryDependenceWrapperPass>().getMemDep()
- : nullptr,
- LIWP ? &LIWP->getLoopInfo() : nullptr,
+ return Impl.runImpl(
+ F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
+ getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ Impl.isMemDepEnabled()
+ ? &getAnalysis<MemoryDependenceWrapperPass>().getMemDep()
+ : nullptr,
+ LIWP ? &LIWP->getLoopInfo() : nullptr,
&getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(),
MSSAWP ? &MSSAWP->getMSSA() : nullptr);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- if (Impl.isMemDepEnabled())
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ if (Impl.isMemDepEnabled())
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
- }
-
-private:
- GVN Impl;
-};
-
-char GVNLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
-
-// The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoMemDepAnalysis) {
- return new GVNLegacyPass(NoMemDepAnalysis);
-}
+ }
+
+private:
+ GVN Impl;
+};
+
+char GVNLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+
+// The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoMemDepAnalysis) {
+ return new GVNLegacyPass(NoMemDepAnalysis);
+}
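+
+// Usage sketch (illustrative; assumes a Module M and the usual includes): with
+// the legacy pass manager the pass created here is typically scheduled as
+//
+//   legacy::PassManager PM;
+//   PM.add(createGVNPass(/*NoMemDepAnalysis=*/false));
+//   PM.run(M);
+//
+// With the new pass manager, the same transform is the GVN class defined in
+// this file and is named "gvn" in a pass pipeline string.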
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp
index 136058877c..8d0bd56749 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp
@@ -1,247 +1,247 @@
-//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass hoists expressions from branches to a common dominator. It uses
-// GVN (global value numbering) to discover expressions computing the same
-// values. The primary goals of code-hoisting are:
-// 1. To reduce the code size.
-// 2. In some cases reduce critical path (by exposing more ILP).
-//
-// The algorithm factors out the reachability of values such that multiple
-// queries to find reachability of values are fast. This is based on finding the
-// ANTIC points in the CFG which do not change during hoisting. The ANTIC points
-// are basically the dominance-frontiers in the inverse graph. So we introduce a
-// data structure (CHI nodes) to keep track of values flowing out of a basic
-// block. We only do this for values with multiple occurrences in the function
-// as they are the potential hoistable candidates. This approach allows us to
-// hoist instructions to a basic block with more than two successors, as well as
-// deal with infinite loops in a trivial way.
-//
-// Limitations: This pass does not hoist fully redundant expressions because
-// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before
-// and after gvn-pre because gvn-pre creates opportunities for more instructions
-// to be hoisted.
-//
-// Hoisting may affect the performance in some cases. To mitigate that, hoisting
-// is disabled in the following cases.
-// 1. Scalars across calls.
-// 2. geps when corresponding load/store cannot be hoisted.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <memory>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "gvn-hoist"
-
-STATISTIC(NumHoisted, "Number of instructions hoisted");
-STATISTIC(NumRemoved, "Number of instructions removed");
-STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
-STATISTIC(NumLoadsRemoved, "Number of loads removed");
-STATISTIC(NumStoresHoisted, "Number of stores hoisted");
-STATISTIC(NumStoresRemoved, "Number of stores removed");
-STATISTIC(NumCallsHoisted, "Number of calls hoisted");
-STATISTIC(NumCallsRemoved, "Number of calls removed");
-
-static cl::opt<int>
- MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
- cl::desc("Max number of instructions to hoist "
- "(default unlimited = -1)"));
-
-static cl::opt<int> MaxNumberOfBBSInPath(
- "gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
- cl::desc("Max number of basic blocks on the path between "
- "hoisting locations (default = 4, unlimited = -1)"));
-
-static cl::opt<int> MaxDepthInBB(
- "gvn-hoist-max-depth", cl::Hidden, cl::init(100),
- cl::desc("Hoist instructions from the beginning of the BB up to the "
- "maximum specified depth (default = 100, unlimited = -1)"));
-
-static cl::opt<int>
- MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
- cl::desc("Maximum length of dependent chains to hoist "
- "(default = 10, unlimited = -1)"));
-
-namespace llvm {
-
-using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>;
-using SmallVecInsn = SmallVector<Instruction *, 4>;
-using SmallVecImplInsn = SmallVectorImpl<Instruction *>;
-
-// Each element of a hoisting list contains the basic block where to hoist and
-// a list of instructions to be hoisted.
-using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>;
-
-using HoistingPointList = SmallVector<HoistingPointInfo, 4>;
-
-// A map from a pair of VNs to all the instructions with those VNs.
-using VNType = std::pair<unsigned, unsigned>;
-
-using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>;
-
-// CHI keeps information about values flowing out of a basic block. It is
-// similar to PHI but in the inverse graph, and used for outgoing values on each
-// edge. For conciseness, it is computed only for instructions with multiple
-// occurrences in the CFG because they are the only hoistable candidates.
-// A (CHI[{V, B, I1}, {V, C, I2}]
-// / \
-// / \
-// B(I1) C (I2)
-// The Value number for both I1 and I2 is V, the CHI node will save the
-// instruction as well as the edge where the value is flowing to.
-struct CHIArg {
- VNType VN;
-
- // Edge destination (shows the direction of flow), may not be where the I is.
- BasicBlock *Dest;
-
- // The instruction (VN) which uses the values flowing out of CHI.
- Instruction *I;
-
- bool operator==(const CHIArg &A) const { return VN == A.VN; }
- bool operator!=(const CHIArg &A) const { return !(*this == A); }
-};
-
-using CHIIt = SmallVectorImpl<CHIArg>::iterator;
-using CHIArgs = iterator_range<CHIIt>;
-using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>;
-using InValuesType =
- DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>;
-
-// An invalid value number Used when inserting a single value number into
-// VNtoInsns.
-enum : unsigned { InvalidVN = ~2U };
-
-// Records all scalar instructions candidate for code hoisting.
-class InsnInfo {
- VNtoInsns VNtoScalars;
-
-public:
- // Inserts I and its value number in VNtoScalars.
- void insert(Instruction *I, GVN::ValueTable &VN) {
- // Scalar instruction.
- unsigned V = VN.lookupOrAdd(I);
- VNtoScalars[{V, InvalidVN}].push_back(I);
- }
-
- const VNtoInsns &getVNTable() const { return VNtoScalars; }
-};
-
-// Records all load instructions candidate for code hoisting.
-class LoadInfo {
- VNtoInsns VNtoLoads;
-
-public:
- // Insert Load and the value number of its memory address in VNtoLoads.
- void insert(LoadInst *Load, GVN::ValueTable &VN) {
- if (Load->isSimple()) {
- unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
- VNtoLoads[{V, InvalidVN}].push_back(Load);
- }
- }
-
- const VNtoInsns &getVNTable() const { return VNtoLoads; }
-};
-
-// Records all store instructions candidate for code hoisting.
-class StoreInfo {
- VNtoInsns VNtoStores;
-
-public:
- // Insert the Store and a hash number of the store address and the stored
- // value in VNtoStores.
- void insert(StoreInst *Store, GVN::ValueTable &VN) {
- if (!Store->isSimple())
- return;
- // Hash the store address and the stored value.
- Value *Ptr = Store->getPointerOperand();
- Value *Val = Store->getValueOperand();
- VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store);
- }
-
- const VNtoInsns &getVNTable() const { return VNtoStores; }
-};
-
-// Records all call instructions candidate for code hoisting.
-class CallInfo {
- VNtoInsns VNtoCallsScalars;
- VNtoInsns VNtoCallsLoads;
- VNtoInsns VNtoCallsStores;
-
-public:
- // Insert Call and its value numbering in one of the VNtoCalls* containers.
- void insert(CallInst *Call, GVN::ValueTable &VN) {
- // A call that doesNotAccessMemory is handled as a Scalar,
- // onlyReadsMemory will be handled as a Load instruction,
- // all other calls will be handled as stores.
- unsigned V = VN.lookupOrAdd(Call);
- auto Entry = std::make_pair(V, InvalidVN);
-
- if (Call->doesNotAccessMemory())
- VNtoCallsScalars[Entry].push_back(Call);
- else if (Call->onlyReadsMemory())
- VNtoCallsLoads[Entry].push_back(Call);
- else
- VNtoCallsStores[Entry].push_back(Call);
- }
-
- const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
- const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
- const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
-};
-
-static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
+//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists expressions from branches to a common dominator. It uses
+// GVN (global value numbering) to discover expressions computing the same
+// values. The primary goals of code-hoisting are:
+// 1. To reduce the code size.
+// 2. In some cases, to reduce the critical path (by exposing more ILP).
+//
+// The algorithm factors out the reachability of values such that multiple
+// queries to find reachability of values are fast. This is based on finding the
+// ANTIC points in the CFG which do not change during hoisting. The ANTIC points
+// are basically the dominance-frontiers in the inverse graph. So we introduce a
+// data structure (CHI nodes) to keep track of values flowing out of a basic
+// block. We only do this for values with multiple occurrences in the function
+// as they are the potential hoistable candidates. This approach allows us to
+// hoist instructions to a basic block with more than two successors, as well as
+// deal with infinite loops in a trivial way.
+//
+// Limitations: This pass does not hoist fully redundant expressions because
+// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before
+// and after gvn-pre because gvn-pre creates opportunities for more instructions
+// to be hoisted.
+//
+// Hoisting may hurt performance in some cases. To mitigate that, hoisting
+// is disabled in the following cases:
+// 1. Scalars across calls.
+// 2. GEPs, when the corresponding load/store cannot be hoisted.
+//===----------------------------------------------------------------------===//
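+
+// Illustrative example (hypothetical values): if both arms of a diamond
+// compute
+//
+//   %x = add i32 %a, %b
+//
+// gvn-hoist keeps a single copy of the add in the common dominator and erases
+// the duplicates, shrinking code size without adding work to any path.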
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-hoist"
+
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
+STATISTIC(NumLoadsRemoved, "Number of loads removed");
+STATISTIC(NumStoresHoisted, "Number of stores hoisted");
+STATISTIC(NumStoresRemoved, "Number of stores removed");
+STATISTIC(NumCallsHoisted, "Number of calls hoisted");
+STATISTIC(NumCallsRemoved, "Number of calls removed");
+
+static cl::opt<int>
+ MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
+ cl::desc("Max number of instructions to hoist "
+ "(default unlimited = -1)"));
+
+static cl::opt<int> MaxNumberOfBBSInPath(
+ "gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
+ cl::desc("Max number of basic blocks on the path between "
+ "hoisting locations (default = 4, unlimited = -1)"));
+
+static cl::opt<int> MaxDepthInBB(
+ "gvn-hoist-max-depth", cl::Hidden, cl::init(100),
+ cl::desc("Hoist instructions from the beginning of the BB up to the "
+ "maximum specified depth (default = 100, unlimited = -1)"));
+
+static cl::opt<int>
+ MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
+ cl::desc("Maximum length of dependent chains to hoist "
+ "(default = 10, unlimited = -1)"));
+
+namespace llvm {
+
+using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>;
+using SmallVecInsn = SmallVector<Instruction *, 4>;
+using SmallVecImplInsn = SmallVectorImpl<Instruction *>;
+
+// Each element of a hoisting list contains the basic block to hoist into and
+// a list of instructions to be hoisted.
+using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>;
+
+using HoistingPointList = SmallVector<HoistingPointInfo, 4>;
+
+// A map from a pair of VNs to all the instructions with those VNs.
+using VNType = std::pair<unsigned, unsigned>;
+
+using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>;
+
+// CHI keeps information about values flowing out of a basic block. It is
+// similar to PHI but in the inverse graph, and used for outgoing values on each
+// edge. For conciseness, it is computed only for instructions with multiple
+// occurrences in the CFG because they are the only hoistable candidates.
+// A (CHI[{V, B, I1}, {V, C, I2}])
+// / \
+// / \
+// B(I1) C (I2)
+// The value number for both I1 and I2 is V; the CHI node saves the
+// instruction as well as the edge the value is flowing to.
+struct CHIArg {
+ VNType VN;
+
+ // Edge destination (shows the direction of flow), may not be where the I is.
+ BasicBlock *Dest;
+
+ // The instruction (VN) which uses the values flowing out of CHI.
+ Instruction *I;
+
+ bool operator==(const CHIArg &A) const { return VN == A.VN; }
+ bool operator!=(const CHIArg &A) const { return !(*this == A); }
+};
+
+using CHIIt = SmallVectorImpl<CHIArg>::iterator;
+using CHIArgs = iterator_range<CHIIt>;
+using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>;
+using InValuesType =
+ DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>;
+
+// An invalid value number, used when inserting a single value number into
+// VNtoInsns.
+enum : unsigned { InvalidVN = ~2U };
+
+// Records all scalar instructions that are candidates for code hoisting.
+class InsnInfo {
+ VNtoInsns VNtoScalars;
+
+public:
+ // Inserts I and its value number in VNtoScalars.
+ void insert(Instruction *I, GVN::ValueTable &VN) {
+ // Scalar instruction.
+ unsigned V = VN.lookupOrAdd(I);
+ VNtoScalars[{V, InvalidVN}].push_back(I);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoScalars; }
+};
+
+// Records all load instructions that are candidates for code hoisting.
+class LoadInfo {
+ VNtoInsns VNtoLoads;
+
+public:
+ // Insert Load and the value number of its memory address in VNtoLoads.
+ void insert(LoadInst *Load, GVN::ValueTable &VN) {
+ if (Load->isSimple()) {
+ unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
+ VNtoLoads[{V, InvalidVN}].push_back(Load);
+ }
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoLoads; }
+};
+
+// Records all store instructions that are candidates for code hoisting.
+class StoreInfo {
+ VNtoInsns VNtoStores;
+
+public:
+ // Insert the Store and a hash number of the store address and the stored
+ // value in VNtoStores.
+ void insert(StoreInst *Store, GVN::ValueTable &VN) {
+ if (!Store->isSimple())
+ return;
+ // Hash the store address and the stored value.
+ Value *Ptr = Store->getPointerOperand();
+ Value *Val = Store->getValueOperand();
+ VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoStores; }
+};
+
+// Records all call instructions that are candidates for code hoisting.
+class CallInfo {
+ VNtoInsns VNtoCallsScalars;
+ VNtoInsns VNtoCallsLoads;
+ VNtoInsns VNtoCallsStores;
+
+public:
+ // Insert Call and its value numbering in one of the VNtoCalls* containers.
+ void insert(CallInst *Call, GVN::ValueTable &VN) {
+    // A call that doesNotAccessMemory is handled as a Scalar, a call that
+    // onlyReadsMemory is handled as a Load instruction, and all other calls
+    // are handled as Stores.
+ unsigned V = VN.lookupOrAdd(Call);
+ auto Entry = std::make_pair(V, InvalidVN);
+
+ if (Call->doesNotAccessMemory())
+ VNtoCallsScalars[Entry].push_back(Call);
+ else if (Call->onlyReadsMemory())
+ VNtoCallsLoads[Entry].push_back(Call);
+ else
+ VNtoCallsStores[Entry].push_back(Call);
+ }
+
+ const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
+ const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
+ const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
+};
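+
+// Illustrative sketch (hypothetical IR values %p and %v) of how the VN-pair
+// keys above are formed:
+//
+//   %l = load i32, i32* %p      -> VNtoLoads key   {VN(%p), InvalidVN}
+//   store i32 %v, i32* %p       -> VNtoStores key  {VN(%p), VN(%v)}
+//   %s = add i32 %v, 1          -> VNtoScalars key {VN(%s), InvalidVN}
+//
+// Two instructions land in the same bucket exactly when their keys match,
+// which is what makes them candidates for hoisting to a common point.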
+
+static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
static const unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
LLVMContext::MD_alias_scope,
LLVMContext::MD_noalias,
@@ -250,248 +250,248 @@ static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
LLVMContext::MD_invariant_load,
LLVMContext::MD_invariant_group,
LLVMContext::MD_access_group};
- combineMetadata(ReplInst, I, KnownIDs, true);
-}
-
-// This pass hoists common computations across branches sharing common
-// dominator. The primary goal is to reduce the code size, and in some
-// cases reduce critical path (by exposing more ILP).
-class GVNHoist {
-public:
- GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
- MemoryDependenceResults *MD, MemorySSA *MSSA)
- : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
- MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
-
+ combineMetadata(ReplInst, I, KnownIDs, true);
+}
+
+// This pass hoists common computations across branches sharing common
+// dominator. The primary goal is to reduce the code size, and in some
+// cases reduce critical path (by exposing more ILP).
+class GVNHoist {
+public:
+ GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, MemorySSA *MSSA)
+ : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
+
bool run(Function &F);
-
- // Copied from NewGVN.cpp
- // This function provides global ranking of operations so that we can place
- // them in a canonical order. Note that rank alone is not necessarily enough
- // for a complete ordering, as constants all have the same rank. However,
- // generally, we will simplify an operation with all constants so that it
- // doesn't matter what order they appear in.
+
+ // Copied from NewGVN.cpp
+ // This function provides global ranking of operations so that we can place
+ // them in a canonical order. Note that rank alone is not necessarily enough
+ // for a complete ordering, as constants all have the same rank. However,
+ // generally, we will simplify an operation with all constants so that it
+ // doesn't matter what order they appear in.
unsigned int rank(const Value *V) const;
-
-private:
- GVN::ValueTable VN;
- DominatorTree *DT;
- PostDominatorTree *PDT;
- AliasAnalysis *AA;
- MemoryDependenceResults *MD;
- MemorySSA *MSSA;
- std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
- DenseMap<const Value *, unsigned> DFSNumber;
- BBSideEffectsSet BBSideEffects;
- DenseSet<const BasicBlock *> HoistBarrier;
- SmallVector<BasicBlock *, 32> IDFBlocks;
- unsigned NumFuncArgs;
- const bool HoistingGeps = false;
-
- enum InsKind { Unknown, Scalar, Load, Store };
-
- // Return true when there are exception handling in BB.
+
+private:
+ GVN::ValueTable VN;
+ DominatorTree *DT;
+ PostDominatorTree *PDT;
+ AliasAnalysis *AA;
+ MemoryDependenceResults *MD;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+ DenseMap<const Value *, unsigned> DFSNumber;
+ BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock *> HoistBarrier;
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ unsigned NumFuncArgs;
+ const bool HoistingGeps = false;
+
+ enum InsKind { Unknown, Scalar, Load, Store };
+
+  // Return true when there is exception handling in BB.
bool hasEH(const BasicBlock *BB);
-
- // Return true when I1 appears before I2 in the instructions of BB.
- bool firstInBB(const Instruction *I1, const Instruction *I2) {
- assert(I1->getParent() == I2->getParent());
- unsigned I1DFS = DFSNumber.lookup(I1);
- unsigned I2DFS = DFSNumber.lookup(I2);
- assert(I1DFS && I2DFS);
- return I1DFS < I2DFS;
- }
-
- // Return true when there are memory uses of Def in BB.
- bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
+
+ // Return true when I1 appears before I2 in the instructions of BB.
+ bool firstInBB(const Instruction *I1, const Instruction *I2) {
+ assert(I1->getParent() == I2->getParent());
+ unsigned I1DFS = DFSNumber.lookup(I1);
+ unsigned I2DFS = DFSNumber.lookup(I2);
+ assert(I1DFS && I2DFS);
+ return I1DFS < I2DFS;
+ }
+
+ // Return true when there are memory uses of Def in BB.
+ bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
const BasicBlock *BB);
-
- bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
+
+ bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
int &NBBsOnAllPaths);
-
- // Return true when there are exception handling or loads of memory Def
- // between Def and NewPt. This function is only called for stores: Def is
- // the MemoryDef of the store to be hoisted.
-
- // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
- // return true when the counter NBBsOnAllPaths reaces 0, except when it is
- // initialized to -1 which is unlimited.
- bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
+
+  // Return true when there is exception handling or a load of memory Def
+  // between Def and NewPt. This function is only called for stores: Def is
+  // the MemoryDef of the store to be hoisted.
+
+  // Decrement NBBsOnAllPaths by 1 for each block between HoistPt and BB, and
+  // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+  // initialized to -1, which means unlimited.
+ bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
int &NBBsOnAllPaths);
-
- // Return true when there are exception handling between HoistPt and BB.
- // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
- // return true when the counter NBBsOnAllPaths reaches 0, except when it is
- // initialized to -1 which is unlimited.
- bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
+
+  // Return true when there is exception handling between HoistPt and BB.
+  // Decrement NBBsOnAllPaths by 1 for each block between HoistPt and BB, and
+  // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+  // initialized to -1, which means unlimited.
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
int &NBBsOnAllPaths);
-
- // Return true when it is safe to hoist a memory load or store U from OldPt
- // to NewPt.
- bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
+
+ // Return true when it is safe to hoist a memory load or store U from OldPt
+ // to NewPt.
+ bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths);
-
- // Return true when it is safe to hoist scalar instructions from all blocks in
- // WL to HoistBB.
- bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
- int &NBBsOnAllPaths) {
- return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
- }
-
- // In the inverse CFG, the dominance frontier of basic block (BB) is the
- // point where ANTIC needs to be computed for instructions which are going
- // to be hoisted. Since this point does not change during gvn-hoist,
- // we compute it only once (on demand).
- // The ides is inspired from:
- // "Partial Redundancy Elimination in SSA Form"
- // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
- // They use similar idea in the forward graph to find fully redundant and
- // partially redundant expressions, here it is used in the inverse graph to
- // find fully anticipable instructions at merge point (post-dominator in
- // the inverse CFG).
- // Returns the edge via which an instruction in BB will get the values from.
-
- // Returns true when the values are flowing out to each edge.
+
+ // Return true when it is safe to hoist scalar instructions from all blocks in
+ // WL to HoistBB.
+ bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
+ int &NBBsOnAllPaths) {
+ return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
+ }
+
+  // In the inverse CFG, the dominance frontier of a basic block (BB) is the
+  // point where ANTIC needs to be computed for instructions which are going
+  // to be hoisted. Since this point does not change during gvn-hoist,
+  // we compute it only once (on demand).
+  // The idea is inspired by:
+  // "Partial Redundancy Elimination in SSA Form"
+  // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
+  // They use a similar idea in the forward graph to find fully redundant and
+  // partially redundant expressions; here it is used in the inverse graph to
+  // find fully anticipable instructions at the merge point (post-dominator in
+  // the inverse CFG).
+  // Returns the edge via which an instruction in BB gets its values.
+
+ // Returns true when the values are flowing out to each edge.
bool valueAnticipable(CHIArgs C, Instruction *TI) const;
-
- // Check if it is safe to hoist values tracked by CHI in the range
- // [Begin, End) and accumulate them in Safe.
- void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
+
+ // Check if it is safe to hoist values tracked by CHI in the range
+ // [Begin, End) and accumulate them in Safe.
+ void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
SmallVectorImpl<CHIArg> &Safe);
-
- using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;
-
- // Push all the VNs corresponding to BB into RenameStack.
- void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
+
+ using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;
+
+ // Push all the VNs corresponding to BB into RenameStack.
+ void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
RenameStackType &RenameStack);
-
- void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
+
+ void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
RenameStackType &RenameStack);
-
- // Walk the post-dominator tree top-down and use a stack for each value to
- // store the last value you see. When you hit a CHI from a given edge, the
- // value to use as the argument is at the top of the stack, add the value to
- // CHI and pop.
- void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
- auto Root = PDT->getNode(nullptr);
- if (!Root)
- return;
- // Depth first walk on PDom tree to fill the CHIargs at each PDF.
- RenameStackType RenameStack;
- for (auto Node : depth_first(Root)) {
- BasicBlock *BB = Node->getBlock();
- if (!BB)
- continue;
-
- // Collect all values in BB and push to stack.
- fillRenameStack(BB, ValueBBs, RenameStack);
-
- // Fill outgoing values in each CHI corresponding to BB.
- fillChiArgs(BB, CHIBBs, RenameStack);
- }
- }
-
- // Walk all the CHI-nodes to find ones which have a empty-entry and remove
- // them Then collect all the instructions which are safe to hoist and see if
- // they form a list of anticipable values. OutValues contains CHIs
- // corresponding to each basic block.
- void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
+
+ // Walk the post-dominator tree top-down and use a stack for each value to
+ // store the last value you see. When you hit a CHI from a given edge, the
+  // value to use as the argument is at the top of the stack; add the value to
+  // the CHI and pop.
+ void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
+ auto Root = PDT->getNode(nullptr);
+ if (!Root)
+ return;
+ // Depth first walk on PDom tree to fill the CHIargs at each PDF.
+ RenameStackType RenameStack;
+ for (auto Node : depth_first(Root)) {
+ BasicBlock *BB = Node->getBlock();
+ if (!BB)
+ continue;
+
+ // Collect all values in BB and push to stack.
+ fillRenameStack(BB, ValueBBs, RenameStack);
+
+ // Fill outgoing values in each CHI corresponding to BB.
+ fillChiArgs(BB, CHIBBs, RenameStack);
+ }
+ }
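A minimal sketch of the rename-stack mechanism this walk drives, written against the types declared above (VN and I are placeholders for a value number seen in the current block and its defining instruction, not variables from this file):

    RenameStackType RenameStack;
    // fillRenameStack: while visiting BB top-down in the post-dominator tree,
    // push every definition BB provides for its value number.
    RenameStack[VN].push_back(I);
    // fillChiArgs: a CHI on an edge leaving BB takes the innermost dominating
    // definition, i.e. whatever is currently on top of that value's stack.
    Instruction *Arg = RenameStack[VN].back();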
+
+  // Walk all the CHI-nodes to find ones which have an empty entry and remove
+  // them. Then collect all the instructions which are safe to hoist and see if
+  // they form a list of anticipable values. OutValues contains CHIs
+  // corresponding to each basic block.
+ void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
HoistingPointList &HPL);
-
-  // Compute insertion points for each value which can be fully anticipated at
- // a dominator. HPL contains all such values.
- void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
- InsKind K) {
- // Sort VNs based on their rankings
- std::vector<VNType> Ranks;
- for (const auto &Entry : Map) {
- Ranks.push_back(Entry.first);
- }
-
- // TODO: Remove fully-redundant expressions.
-  // Get the instruction from the Map; assume that all the instructions
-  // with the same VN have the same rank (this is an approximation).
- llvm::sort(Ranks, [this, &Map](const VNType &r1, const VNType &r2) {
- return (rank(*Map.lookup(r1).begin()) < rank(*Map.lookup(r2).begin()));
- });
-
- // - Sort VNs according to their rank, and start with lowest ranked VN
- // - Take a VN and for each instruction with same VN
- // - Find the dominance frontier in the inverse graph (PDF)
- // - Insert the chi-node at PDF
- // - Remove the chi-nodes with missing entries
- // - Remove values from CHI-nodes which do not truly flow out, e.g.,
- // modified along the path.
- // - Collect the remaining values that are still anticipable
- SmallVector<BasicBlock *, 2> IDFBlocks;
- ReverseIDFCalculator IDFs(*PDT);
- OutValuesType OutValue;
- InValuesType InValue;
- for (const auto &R : Ranks) {
- const SmallVecInsn &V = Map.lookup(R);
- if (V.size() < 2)
- continue;
- const VNType &VN = R;
- SmallPtrSet<BasicBlock *, 2> VNBlocks;
- for (auto &I : V) {
- BasicBlock *BBI = I->getParent();
- if (!hasEH(BBI))
- VNBlocks.insert(BBI);
- }
-    // Compute the post-dominance frontier of the basic blocks that
-    // contain this VN. The dominance frontier of a block X in the
-    // reverse control-flow graph is the set of blocks upon which X is
-    // control dependent; these are the blocks where a CHI may need to
-    // be inserted.
- IDFs.setDefiningBlocks(VNBlocks);
- IDFBlocks.clear();
- IDFs.calculate(IDFBlocks);
-
- // Make a map of BB vs instructions to be hoisted.
- for (unsigned i = 0; i < V.size(); ++i) {
- InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
- }
- // Insert empty CHI node for this VN. This is used to factor out
- // basic blocks where the ANTIC can potentially change.
+
+  // Compute insertion points for each value which can be fully anticipated at
+ // a dominator. HPL contains all such values.
+ void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
+ InsKind K) {
+ // Sort VNs based on their rankings
+ std::vector<VNType> Ranks;
+ for (const auto &Entry : Map) {
+ Ranks.push_back(Entry.first);
+ }
+
+ // TODO: Remove fully-redundant expressions.
+  // Get the instruction from the Map; assume that all the instructions
+  // with the same VN have the same rank (this is an approximation).
+ llvm::sort(Ranks, [this, &Map](const VNType &r1, const VNType &r2) {
+ return (rank(*Map.lookup(r1).begin()) < rank(*Map.lookup(r2).begin()));
+ });
+
+ // - Sort VNs according to their rank, and start with lowest ranked VN
+ // - Take a VN and for each instruction with same VN
+ // - Find the dominance frontier in the inverse graph (PDF)
+ // - Insert the chi-node at PDF
+ // - Remove the chi-nodes with missing entries
+ // - Remove values from CHI-nodes which do not truly flow out, e.g.,
+ // modified along the path.
+ // - Collect the remaining values that are still anticipable
+ SmallVector<BasicBlock *, 2> IDFBlocks;
+ ReverseIDFCalculator IDFs(*PDT);
+ OutValuesType OutValue;
+ InValuesType InValue;
+ for (const auto &R : Ranks) {
+ const SmallVecInsn &V = Map.lookup(R);
+ if (V.size() < 2)
+ continue;
+ const VNType &VN = R;
+ SmallPtrSet<BasicBlock *, 2> VNBlocks;
+ for (auto &I : V) {
+ BasicBlock *BBI = I->getParent();
+ if (!hasEH(BBI))
+ VNBlocks.insert(BBI);
+ }
+    // Compute the post-dominance frontier of the basic blocks that
+    // contain this VN. The dominance frontier of a block X in the
+    // reverse control-flow graph is the set of blocks upon which X is
+    // control dependent; these are the blocks where a CHI may need to
+    // be inserted.
+ IDFs.setDefiningBlocks(VNBlocks);
+ IDFBlocks.clear();
+ IDFs.calculate(IDFBlocks);
+
+ // Make a map of BB vs instructions to be hoisted.
+ for (unsigned i = 0; i < V.size(); ++i) {
+ InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
+ }
+ // Insert empty CHI node for this VN. This is used to factor out
+ // basic blocks where the ANTIC can potentially change.
CHIArg EmptyChi = {VN, nullptr, nullptr};
for (auto *IDFBB : IDFBlocks) {
- for (unsigned i = 0; i < V.size(); ++i) {
+ for (unsigned i = 0; i < V.size(); ++i) {
// Ignore spurious PDFs.
if (DT->properlyDominates(IDFBB, V[i]->getParent())) {
OutValue[IDFBB].push_back(EmptyChi);
LLVM_DEBUG(dbgs() << "\nInserting a CHI for BB: "
<< IDFBB->getName() << ", for Insn: " << *V[i]);
- }
- }
- }
- }
-
- // Insert CHI args at each PDF to iterate on factored graph of
- // control dependence.
- insertCHI(InValue, OutValue);
- // Using the CHI args inserted at each PDF, find fully anticipable values.
- findHoistableCandidates(OutValue, K, HPL);
- }
-
- // Return true when all operands of Instr are available at insertion point
- // HoistPt. When limiting the number of hoisted expressions, one could hoist
- // a load without hoisting its access function. So before hoisting any
- // expression, make sure that all its operands are available at insert point.
- bool allOperandsAvailable(const Instruction *I,
+ }
+ }
+ }
+ }
+
+ // Insert CHI args at each PDF to iterate on factored graph of
+ // control dependence.
+ insertCHI(InValue, OutValue);
+ // Using the CHI args inserted at each PDF, find fully anticipable values.
+ findHoistableCandidates(OutValue, K, HPL);
+ }
+
+ // Return true when all operands of Instr are available at insertion point
+ // HoistPt. When limiting the number of hoisted expressions, one could hoist
+ // a load without hoisting its access function. So before hoisting any
+ // expression, make sure that all its operands are available at insert point.
+ bool allOperandsAvailable(const Instruction *I,
const BasicBlock *HoistPt) const;
-
- // Same as allOperandsAvailable with recursive check for GEP operands.
- bool allGepOperandsAvailable(const Instruction *I,
+
+ // Same as allOperandsAvailable with recursive check for GEP operands.
+ bool allGepOperandsAvailable(const Instruction *I,
const BasicBlock *HoistPt) const;
-
- // Make all operands of the GEP available.
- void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
- const SmallVecInsn &InstructionsToHoist,
+
+ // Make all operands of the GEP available.
+ void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist,
Instruction *Gep) const;
-
+
void updateAlignment(Instruction *I, Instruction *Repl);
-
+
// Remove all the instructions in Candidates and replace their usage with
// Repl. Returns the number of instructions removed.
unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl,
@@ -655,20 +655,20 @@ bool GVNHoist::hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
continue;
ReachedNewPt = true;
}
- }
+ }
if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
return true;
}
-
+
return false;
}
-
+
bool GVNHoist::hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
int &NBBsOnAllPaths) {
// Stop walk once the limit is reached.
if (NBBsOnAllPaths == 0)
return true;
-
+
// Impossible to hoist with exceptions on the path.
if (hasEH(BB))
return true;
@@ -700,8 +700,8 @@ bool GVNHoist::hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
// Stop traversal when reaching HoistPt.
I.skipChildren();
continue;
- }
-
+ }
+
if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
return true;
@@ -714,8 +714,8 @@ bool GVNHoist::hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
--NBBsOnAllPaths;
++I;
- }
-
+ }
+
return false;
}
@@ -734,7 +734,7 @@ bool GVNHoist::hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
// Stop traversal when reaching NewHoistPt.
I.skipChildren();
continue;
- }
+ }
if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
return true;
@@ -744,11 +744,11 @@ bool GVNHoist::hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
--NBBsOnAllPaths;
++I;
- }
-
+ }
+
return false;
}
-
+
bool GVNHoist::safeToHoistLdSt(const Instruction *NewPt,
const Instruction *OldPt, MemoryUseOrDef *U,
GVNHoist::InsKind K, int &NBBsOnAllPaths) {
@@ -785,12 +785,12 @@ bool GVNHoist::safeToHoistLdSt(const Instruction *NewPt,
return true;
assert(UBB == DBB);
assert(MSSA->locallyDominates(D, U));
- }
-
+ }
+
// No side effects: it is safe to hoist.
return true;
}
-
+
bool GVNHoist::valueAnticipable(CHIArgs C, Instruction *TI) const {
if (TI->getNumSuccessors() > (unsigned)size(C))
return false; // Not enough args in this CHI.
@@ -818,10 +818,10 @@ void GVNHoist::checkSafety(CHIArgs C, BasicBlock *BB, GVNHoist::InsKind K,
if (MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn))
if (safeToHoistLdSt(T, Insn, UD, K, NumBBsOnAllPaths))
Safe.push_back(CHI);
- }
- }
+ }
+ }
}
-
+
void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
GVNHoist::RenameStackType &RenameStack) {
auto it1 = ValueBBs.find(BB);
@@ -831,10 +831,10 @@ void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
// Get the value of instruction I
LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
RenameStack[VI.first].push_back(VI.second);
- }
+ }
}
}
-
+
void GVNHoist::fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
GVNHoist::RenameStackType &RenameStack) {
// For each *predecessor* (because Post-DOM) of BB check if it has a CHI
@@ -869,7 +869,7 @@ void GVNHoist::fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
}
}
}
-
+
void GVNHoist::findHoistableCandidates(OutValuesType &CHIBBs,
GVNHoist::InsKind K,
HoistingPointList &HPL) {
@@ -911,9 +911,9 @@ void GVNHoist::findHoistableCandidates(OutValuesType &CHIBBs,
PHIIt = std::find_if(PrevIt, CHIs.end(),
[PrevIt](CHIArg &A) { return A != *PrevIt; });
}
- }
+ }
}
-
+
bool GVNHoist::allOperandsAvailable(const Instruction *I,
const BasicBlock *HoistPt) const {
for (const Use &Op : I->operands())
@@ -932,14 +932,14 @@ bool GVNHoist::allGepOperandsAvailable(const Instruction *I,
if (const GetElementPtrInst *GepOp =
dyn_cast<GetElementPtrInst>(Inst)) {
if (!allGepOperandsAvailable(GepOp, HoistPt))
- return false;
+ return false;
// Gep is available if all operands of GepOp are available.
} else {
// Gep is not available if it has operands other than GEPs that are
// defined in blocks not dominating HoistPt.
- return false;
+ return false;
}
- }
+ }
return true;
}
@@ -959,15 +959,15 @@ void GVNHoist::makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
// of this GEP available at HoistPt.
if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
- }
-
+ }
+
// Copy Gep and replace its uses in Repl with ClonedGep.
ClonedGep->insertBefore(HoistPt->getTerminator());
-
+
// Conservatively discard any optimization hints, they may differ on the
// other paths.
ClonedGep->dropUnknownNonDebugMetadata();
-
+
// If we have optimization hints which agree with each other along different
// paths, preserve them.
for (const Instruction *OtherInst : InstructionsToHoist) {
@@ -979,7 +979,7 @@ void GVNHoist::makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
cast<StoreInst>(OtherInst)->getPointerOperand());
ClonedGep->andIRFlags(OtherGep);
}
-
+
// Replace uses of Gep with ClonedGep in Repl.
Repl->replaceUsesOfWith(Gep, ClonedGep);
}
@@ -998,9 +998,9 @@ void GVNHoist::updateAlignment(Instruction *I, Instruction *Repl) {
cast<AllocaInst>(I)->getAlign()));
} else if (isa<CallInst>(Repl)) {
++NumCallsRemoved;
- }
+ }
}
-
+
unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl,
MemoryUseOrDef *NewMemAcc) {
unsigned NR = 0;
@@ -1014,7 +1014,7 @@ unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl,
OldMA->replaceAllUsesWith(NewMemAcc);
MSSAUpdater->removeMemoryAccess(OldMA);
}
-
+
Repl->andIRFlags(I);
combineKnownMetadata(Repl, I);
I->replaceAllUsesWith(Repl);
@@ -1025,13 +1025,13 @@ unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl,
}
return NR;
}
-
+
void GVNHoist::raMPHIuw(MemoryUseOrDef *NewMemAcc) {
SmallPtrSet<MemoryPhi *, 4> UsePhis;
for (User *U : NewMemAcc->users())
if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
UsePhis.insert(Phi);
-
+
for (MemoryPhi *Phi : UsePhis) {
auto In = Phi->incoming_values();
if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
@@ -1040,7 +1040,7 @@ void GVNHoist::raMPHIuw(MemoryUseOrDef *NewMemAcc) {
}
}
}
-
+
unsigned GVNHoist::removeAndReplace(const SmallVecInsn &Candidates,
Instruction *Repl, BasicBlock *DestBB,
bool MoveAccess) {
@@ -1050,16 +1050,16 @@ unsigned GVNHoist::removeAndReplace(const SmallVecInsn &Candidates,
// legal when the ld/st is not moved past its current definition.
MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::BeforeTerminator);
}
-
+
// Replace all other instructions with Repl with memory access NewMemAcc.
unsigned NR = rauw(Candidates, Repl, NewMemAcc);
-
+
// Remove MemorySSA phi nodes with the same arguments.
if (NewMemAcc)
raMPHIuw(NewMemAcc);
return NR;
}
-
+
bool GVNHoist::makeGepOperandsAvailable(
Instruction *Repl, BasicBlock *HoistPt,
const SmallVecInsn &InstructionsToHoist) const {
@@ -1079,21 +1079,21 @@ bool GVNHoist::makeGepOperandsAvailable(
return false;
} else if (!DT->dominates(Val->getParent(), HoistPt))
return false;
- }
+ }
}
-
+
// Check whether we can compute the Gep at HoistPt.
if (!Gep || !allGepOperandsAvailable(Gep, HoistPt))
return false;
-
+
makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);
-
+
if (Val && isa<GetElementPtrInst>(Val))
makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);
-
+
return true;
}
-
+
std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
for (const HoistingPointInfo &HP : HPL) {
@@ -1109,7 +1109,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
// of the second based on the first.
if (!Repl || firstInBB(I, Repl))
Repl = I;
-
+
// Keep track of whether we moved the instruction so we know whether we
// should move the MemoryAccess.
bool MoveAccess = true;
@@ -1122,7 +1122,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
// When we do not find Repl in HoistPt, select the first in the list
// and move it to HoistPt.
Repl = InstructionsToHoist.front();
-
+
// We can move Repl in HoistPt only when all operands are available.
// The order in which hoistings are done may influence the availability
// of operands.
@@ -1135,7 +1135,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
// When not HoistingGeps we need to copy the GEPs.
if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist))
continue;
- }
+ }
// Move the instruction at the end of HoistPt.
Instruction *Last = DestBB->getTerminator();
@@ -1143,8 +1143,8 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
Repl->moveBefore(Last);
DFSNumber[Repl] = DFSNumber[Last]++;
- }
-
+ }
+
NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess);
if (isa<LoadInst>(Repl))
@@ -1155,11 +1155,11 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
++NC;
else // Scalar
++NI;
- }
-
+ }
+
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
-
+
NumHoisted += NL + NS + NC + NI;
NumRemoved += NR;
NumLoadsHoisted += NL;
@@ -1167,7 +1167,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
NumCallsHoisted += NC;
return {NI, NL + NC + NS};
}
-
+
std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
InsnInfo II;
LoadInfo LI;
@@ -1186,11 +1186,11 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
// deeper may increase the register pressure and compilation time.
if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
break;
-
+
// Do not value number terminator instructions.
if (I1.isTerminator())
break;
-
+
if (auto *Load = dyn_cast<LoadInst>(&I1))
LI.insert(Load, VN);
else if (auto *Store = dyn_cast<StoreInst>(&I1))
@@ -1216,8 +1216,8 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
// registers than X86.
II.insert(&I1, VN);
}
- }
-
+ }
+
HoistingPointList HPL;
computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
@@ -1228,35 +1228,35 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
return hoist(HPL);
}
-} // end namespace llvm
-
-PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
- MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
- if (!G.run(F))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<MemorySSAAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char GVNHoistLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
- "Early GVN Hoisting of Expressions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
- "Early GVN Hoisting of Expressions", false, false)
-
-FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
+} // end namespace llvm
+
+PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNHoistLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+
+FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
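For orientation, a minimal sketch of scheduling this pass under the new pass manager; the setup below is the usual PassBuilder boilerplate and is not part of this file (GVNHoistPass is declared in llvm/Transforms/Scalar/GVN.h):

    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/Scalar/GVN.h" // GVNHoistPass
    using namespace llvm;

    // Run GVNHoist over every function in M.
    static void runGVNHoist(Module &M) {
      LoopAnalysisManager LAM;
      FunctionAnalysisManager FAM;
      CGSCCAnalysisManager CGAM;
      ModuleAnalysisManager MAM;
      PassBuilder PB;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      ModulePassManager MPM;
      MPM.addPass(createModuleToFunctionPassAdaptor(GVNHoistPass()));
      MPM.run(M, MAM);
    }

The createGVNHoistPass() entry point defined above plays the same role for the legacy pass manager.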
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp
index c1d1c06eab..aef927ab65 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp
@@ -1,930 +1,930 @@
-//===- GVNSink.cpp - sink expressions into successors ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file GVNSink.cpp
-/// This pass attempts to sink instructions into successors, reducing static
-/// instruction count and enabling if-conversion.
-///
-/// We use a variant of global value numbering to decide what can be sunk.
-/// Consider:
-///
-/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
-/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
-/// \ /
-/// [ %e = phi i32 %a2, %c2 ]
-/// [ add i32 %e, 4 ]
-///
-///
-/// GVN would number %a1 and %c1 differently because they compute different
-/// results - the VN of an instruction is a function of its opcode and the
-/// transitive closure of its operands. This is the key property for hoisting
-/// and CSE.
-///
-/// What we want when sinking however is for a numbering that is a function of
-/// the *uses* of an instruction, which allows us to answer the question "if I
-/// replace %a1 with %c1, will it contribute in an equivalent way to all
-/// successive instructions?". The PostValueTable class in GVN provides this
-/// mapping.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/ArrayRecycler.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "gvn-sink"
-
-STATISTIC(NumRemoved, "Number of instructions removed");
-
-namespace llvm {
-namespace GVNExpression {
-
-LLVM_DUMP_METHOD void Expression::dump() const {
- print(dbgs());
- dbgs() << "\n";
-}
-
-} // end namespace GVNExpression
-} // end namespace llvm
-
-namespace {
-
-static bool isMemoryInst(const Instruction *I) {
- return isa<LoadInst>(I) || isa<StoreInst>(I) ||
- (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
- (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
-}
-
-/// Iterates through instructions in a set of blocks in reverse order from the
-/// first non-terminator. For example (assume all blocks have size n):
-/// LockstepReverseIterator I([B1, B2, B3]);
-/// *I-- = [B1[n], B2[n], B3[n]];
-/// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
-/// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
-/// ...
-///
-/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
-/// to determine which blocks are still going and the order they appear in the
-/// list returned by operator*.
-class LockstepReverseIterator {
- ArrayRef<BasicBlock *> Blocks;
- SmallSetVector<BasicBlock *, 4> ActiveBlocks;
- SmallVector<Instruction *, 4> Insts;
- bool Fail;
-
-public:
- LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
- reset();
- }
-
- void reset() {
- Fail = false;
- ActiveBlocks.clear();
- for (BasicBlock *BB : Blocks)
- ActiveBlocks.insert(BB);
- Insts.clear();
- for (BasicBlock *BB : Blocks) {
- if (BB->size() <= 1) {
- // Block wasn't big enough - only contained a terminator.
- ActiveBlocks.remove(BB);
- continue;
- }
- Insts.push_back(BB->getTerminator()->getPrevNode());
- }
- if (Insts.empty())
- Fail = true;
- }
-
- bool isValid() const { return !Fail; }
- ArrayRef<Instruction *> operator*() const { return Insts; }
-
- // Note: This needs to return a SmallSetVector as the elements of
- // ActiveBlocks will be later copied to Blocks using std::copy. The
- // resultant order of elements in Blocks needs to be deterministic.
- // Using SmallPtrSet instead causes non-deterministic order while
- // copying. And we cannot simply sort Blocks as they need to match the
- // corresponding Values.
- SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
-
- void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) {
- for (auto II = Insts.begin(); II != Insts.end();) {
+//===- GVNSink.cpp - sink expressions into successors ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file GVNSink.cpp
+/// This pass attempts to sink instructions into successors, reducing static
+/// instruction count and enabling if-conversion.
+///
+/// We use a variant of global value numbering to decide what can be sunk.
+/// Consider:
+///
+/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
+/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
+/// \ /
+/// [ %e = phi i32 %a2, %c2 ]
+/// [ add i32 %e, 4 ]
+///
+///
+/// GVN would number %a1 and %c1 differently because they compute different
+/// results - the VN of an instruction is a function of its opcode and the
+/// transitive closure of its operands. This is the key property for hoisting
+/// and CSE.
+///
+/// What we want when sinking however is for a numbering that is a function of
+/// the *uses* of an instruction, which allows us to answer the question "if I
+/// replace %a1 with %c1, will it contribute in an equivalent way to all
+/// successive instructions?". The PostValueTable class in GVN provides this
+/// mapping.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-sink"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace llvm {
+namespace GVNExpression {
+
+LLVM_DUMP_METHOD void Expression::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
+namespace {
+
+static bool isMemoryInst(const Instruction *I) {
+ return isa<LoadInst>(I) || isa<StoreInst>(I) ||
+ (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
+ (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
+}
+
+/// Iterates through instructions in a set of blocks in reverse order from the
+/// first non-terminator. For example (assume all blocks have size n):
+/// LockstepReverseIterator I([B1, B2, B3]);
+/// *I-- = [B1[n], B2[n], B3[n]];
+/// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+/// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+/// ...
+///
+/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
+/// to determine which blocks are still going and the order they appear in the
+/// list returned by operator*.
+class LockstepReverseIterator {
+ ArrayRef<BasicBlock *> Blocks;
+ SmallSetVector<BasicBlock *, 4> ActiveBlocks;
+ SmallVector<Instruction *, 4> Insts;
+ bool Fail;
+
+public:
+ LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ ActiveBlocks.clear();
+ for (BasicBlock *BB : Blocks)
+ ActiveBlocks.insert(BB);
+ Insts.clear();
+ for (BasicBlock *BB : Blocks) {
+ if (BB->size() <= 1) {
+ // Block wasn't big enough - only contained a terminator.
+ ActiveBlocks.remove(BB);
+ continue;
+ }
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ }
+ if (Insts.empty())
+ Fail = true;
+ }
+
+ bool isValid() const { return !Fail; }
+ ArrayRef<Instruction *> operator*() const { return Insts; }
+
+ // Note: This needs to return a SmallSetVector as the elements of
+ // ActiveBlocks will be later copied to Blocks using std::copy. The
+ // resultant order of elements in Blocks needs to be deterministic.
+ // Using SmallPtrSet instead causes non-deterministic order while
+ // copying. And we cannot simply sort Blocks as they need to match the
+ // corresponding Values.
+ SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
+
+ void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) {
+ for (auto II = Insts.begin(); II != Insts.end();) {
if (!llvm::is_contained(Blocks, (*II)->getParent())) {
- ActiveBlocks.remove((*II)->getParent());
- II = Insts.erase(II);
- } else {
- ++II;
- }
- }
- }
-
- void operator--() {
- if (Fail)
- return;
- SmallVector<Instruction *, 4> NewInsts;
- for (auto *Inst : Insts) {
- if (Inst == &Inst->getParent()->front())
- ActiveBlocks.remove(Inst->getParent());
- else
- NewInsts.push_back(Inst->getPrevNode());
- }
- if (NewInsts.empty()) {
- Fail = true;
- return;
- }
- Insts = NewInsts;
- }
-};
-
-//===----------------------------------------------------------------------===//
-
-/// Candidate solution for sinking. There may be different ways to
-/// sink instructions, differing in the number of instructions sunk,
-/// the number of predecessors sunk from and the number of PHIs
-/// required.
-struct SinkingInstructionCandidate {
- unsigned NumBlocks;
- unsigned NumInstructions;
- unsigned NumPHIs;
- unsigned NumMemoryInsts;
- int Cost = -1;
- SmallVector<BasicBlock *, 4> Blocks;
-
- void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
- unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
- unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
- Cost = (NumInstructions * (NumBlocks - 1)) -
- (NumExtraPHIs *
- NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
- - SplitEdgeCost;
- }
-
- bool operator>(const SinkingInstructionCandidate &Other) const {
- return Cost > Other.Cost;
- }
-};
-
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) {
- OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
- << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
- return OS;
-}
-#endif
-
-//===----------------------------------------------------------------------===//
-
-/// Describes a PHI node that may or may not exist. These track the PHIs
-/// that must be created if we were to sink a sequence of instructions. It provides
-/// a hash function for efficient equality comparisons.
-class ModelledPHI {
- SmallVector<Value *, 4> Values;
- SmallVector<BasicBlock *, 4> Blocks;
-
-public:
- ModelledPHI() = default;
-
- ModelledPHI(const PHINode *PN) {
- // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order.
- SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
- for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
- Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
- llvm::sort(Ops);
- for (auto &P : Ops) {
- Blocks.push_back(P.first);
- Values.push_back(P.second);
- }
- }
-
- /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI
- /// without the same ID.
- /// \note This is specifically for DenseMapInfo - do not use this!
- static ModelledPHI createDummy(size_t ID) {
- ModelledPHI M;
- M.Values.push_back(reinterpret_cast<Value*>(ID));
- return M;
- }
-
- /// Create a PHI from an array of incoming values and incoming blocks.
- template <typename VArray, typename BArray>
- ModelledPHI(const VArray &V, const BArray &B) {
- llvm::copy(V, std::back_inserter(Values));
- llvm::copy(B, std::back_inserter(Blocks));
- }
-
- /// Create a PHI from [I[OpNum] for I in Insts].
- template <typename BArray>
- ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
- llvm::copy(B, std::back_inserter(Blocks));
- for (auto *I : Insts)
- Values.push_back(I->getOperand(OpNum));
- }
-
- /// Restrict the PHI's contents down to only \c NewBlocks.
- /// \c NewBlocks must be a subset of \c this->Blocks.
- void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) {
- auto BI = Blocks.begin();
- auto VI = Values.begin();
- while (BI != Blocks.end()) {
- assert(VI != Values.end());
+ ActiveBlocks.remove((*II)->getParent());
+ II = Insts.erase(II);
+ } else {
+ ++II;
+ }
+ }
+ }
+
+ void operator--() {
+ if (Fail)
+ return;
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *Inst : Insts) {
+ if (Inst == &Inst->getParent()->front())
+ ActiveBlocks.remove(Inst->getParent());
+ else
+ NewInsts.push_back(Inst->getPrevNode());
+ }
+ if (NewInsts.empty()) {
+ Fail = true;
+ return;
+ }
+ Insts = NewInsts;
+ }
+};
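A minimal usage sketch of the iterator above; Preds is a placeholder ArrayRef of the predecessor blocks being considered for sinking:

    LockstepReverseIterator LRI(Preds);
    while (LRI.isValid()) {
      // One instruction per still-active block, all the same distance from
      // their block's terminator.
      ArrayRef<Instruction *> Row = *LRI;
      // ... compare the instructions in Row, e.g. by value number ...
      --LRI; // step every active block back by one instruction
    }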
+
+//===----------------------------------------------------------------------===//
+
+/// Candidate solution for sinking. There may be different ways to
+/// sink instructions, differing in the number of instructions sunk,
+/// the number of predecessors sunk from and the number of PHIs
+/// required.
+struct SinkingInstructionCandidate {
+ unsigned NumBlocks;
+ unsigned NumInstructions;
+ unsigned NumPHIs;
+ unsigned NumMemoryInsts;
+ int Cost = -1;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+ void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
+ unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
+ unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
+ Cost = (NumInstructions * (NumBlocks - 1)) -
+ (NumExtraPHIs *
+ NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
+ - SplitEdgeCost;
+ }
+
+ bool operator>(const SinkingInstructionCandidate &Other) const {
+ return Cost > Other.Cost;
+ }
+};
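A worked example with hypothetical numbers, to make the cost formula concrete: sinking three instructions from two predecessor blocks, creating one PHI where the successor originally had none, with no edge splitting required:

    SinkingInstructionCandidate C;
    C.NumBlocks = 2;
    C.NumInstructions = 3;
    C.NumPHIs = 1;
    C.calculateCost(/*NumOrigPHIs=*/0, /*NumOrigBlocks=*/2);
    // NumExtraPHIs = 1 and SplitEdgeCost = 0, so
    // Cost = 3 * (2 - 1) - 1 * 1 - 0 = 2; candidates are compared by this
    // score, higher being better (see operator> above).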
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) {
+ OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
+ << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
+ return OS;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+
+/// Describes a PHI node that may or may not exist. These track the PHIs
+/// that must be created if we were to sink a sequence of instructions. It provides
+/// a hash function for efficient equality comparisons.
+class ModelledPHI {
+ SmallVector<Value *, 4> Values;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+public:
+ ModelledPHI() = default;
+
+ ModelledPHI(const PHINode *PN) {
+ // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order.
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
+ for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
+ Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
+ llvm::sort(Ops);
+ for (auto &P : Ops) {
+ Blocks.push_back(P.first);
+ Values.push_back(P.second);
+ }
+ }
+
+ /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI
+ /// without the same ID.
+ /// \note This is specifically for DenseMapInfo - do not use this!
+ static ModelledPHI createDummy(size_t ID) {
+ ModelledPHI M;
+ M.Values.push_back(reinterpret_cast<Value*>(ID));
+ return M;
+ }
+
+ /// Create a PHI from an array of incoming values and incoming blocks.
+ template <typename VArray, typename BArray>
+ ModelledPHI(const VArray &V, const BArray &B) {
+ llvm::copy(V, std::back_inserter(Values));
+ llvm::copy(B, std::back_inserter(Blocks));
+ }
+
+ /// Create a PHI from [I[OpNum] for I in Insts].
+ template <typename BArray>
+ ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+ llvm::copy(B, std::back_inserter(Blocks));
+ for (auto *I : Insts)
+ Values.push_back(I->getOperand(OpNum));
+ }
+
+ /// Restrict the PHI's contents down to only \c NewBlocks.
+ /// \c NewBlocks must be a subset of \c this->Blocks.
+ void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) {
+ auto BI = Blocks.begin();
+ auto VI = Values.begin();
+ while (BI != Blocks.end()) {
+ assert(VI != Values.end());
if (!llvm::is_contained(NewBlocks, *BI)) {
- BI = Blocks.erase(BI);
- VI = Values.erase(VI);
- } else {
- ++BI;
- ++VI;
- }
- }
- assert(Blocks.size() == NewBlocks.size());
- }
-
- ArrayRef<Value *> getValues() const { return Values; }
-
- bool areAllIncomingValuesSame() const {
- return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; });
- }
-
- bool areAllIncomingValuesSameType() const {
- return llvm::all_of(
- Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
- }
-
- bool areAnyIncomingValuesConstant() const {
- return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); });
- }
-
- // Hash functor
- unsigned hash() const {
- return (unsigned)hash_combine_range(Values.begin(), Values.end());
- }
-
- bool operator==(const ModelledPHI &Other) const {
- return Values == Other.Values && Blocks == Other.Blocks;
- }
-};
-
-template <typename ModelledPHI> struct DenseMapInfo {
- static inline ModelledPHI &getEmptyKey() {
- static ModelledPHI Dummy = ModelledPHI::createDummy(0);
- return Dummy;
- }
-
- static inline ModelledPHI &getTombstoneKey() {
- static ModelledPHI Dummy = ModelledPHI::createDummy(1);
- return Dummy;
- }
-
- static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
-
- static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
- return LHS == RHS;
- }
-};
-
-using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
-
-//===----------------------------------------------------------------------===//
-// ValueTable
-//===----------------------------------------------------------------------===//
-// This is a value number table where the value number is a function of the
-// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
-// that the program would be equivalent if we replaced A with PHI(A, B).
-//===----------------------------------------------------------------------===//
-
-/// A GVN expression describing how an instruction is used. The operands
-/// field of BasicExpression is used to store uses, not operands.
-///
-/// This class also contains fields for discriminators used when determining
-/// equivalence of instructions with side effects.
-class InstructionUseExpr : public GVNExpression::BasicExpression {
- unsigned MemoryUseOrder = -1;
- bool Volatile = false;
- ArrayRef<int> ShuffleMask;
-
-public:
- InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
- BumpPtrAllocator &A)
- : GVNExpression::BasicExpression(I->getNumUses()) {
- allocateOperands(R, A);
- setOpcode(I->getOpcode());
- setType(I->getType());
-
- if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
- ShuffleMask = SVI->getShuffleMask().copy(A);
-
- for (auto &U : I->uses())
- op_push_back(U.getUser());
- llvm::sort(op_begin(), op_end());
- }
-
- void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
- void setVolatile(bool V) { Volatile = V; }
-
- hash_code getHashValue() const override {
- return hash_combine(GVNExpression::BasicExpression::getHashValue(),
- MemoryUseOrder, Volatile, ShuffleMask);
- }
-
- template <typename Function> hash_code getHashValue(Function MapFn) {
- hash_code H = hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile,
- ShuffleMask);
- for (auto *V : operands())
- H = hash_combine(H, MapFn(V));
- return H;
- }
-};
-
-class ValueTable {
- DenseMap<Value *, uint32_t> ValueNumbering;
- DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
- DenseMap<size_t, uint32_t> HashNumbering;
- BumpPtrAllocator Allocator;
- ArrayRecycler<Value *> Recycler;
- uint32_t nextValueNumber = 1;
-
- /// Create an expression for I based on its opcode and its uses. If I
- /// touches or reads memory, the expression is also based upon its memory
- /// order - see \c getMemoryUseOrder().
- InstructionUseExpr *createExpr(Instruction *I) {
- InstructionUseExpr *E =
- new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
- if (isMemoryInst(I))
- E->setMemoryUseOrder(getMemoryUseOrder(I));
-
- if (CmpInst *C = dyn_cast<CmpInst>(I)) {
- CmpInst::Predicate Predicate = C->getPredicate();
- E->setOpcode((C->getOpcode() << 8) | Predicate);
- }
- return E;
- }
-
- /// Helper to compute the value number for a memory instruction
- /// (LoadInst/StoreInst), including checking the memory ordering and
- /// volatility.
- template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
- if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
- return nullptr;
- InstructionUseExpr *E = createExpr(I);
- E->setVolatile(I->isVolatile());
- return E;
- }
-
-public:
- ValueTable() = default;
-
- /// Returns the value number for the specified value, assigning
- /// it a new number if it did not have one before.
- uint32_t lookupOrAdd(Value *V) {
- auto VI = ValueNumbering.find(V);
- if (VI != ValueNumbering.end())
- return VI->second;
-
- if (!isa<Instruction>(V)) {
- ValueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- Instruction *I = cast<Instruction>(V);
- InstructionUseExpr *exp = nullptr;
- switch (I->getOpcode()) {
- case Instruction::Load:
- exp = createMemoryExpr(cast<LoadInst>(I));
- break;
- case Instruction::Store:
- exp = createMemoryExpr(cast<StoreInst>(I));
- break;
- case Instruction::Call:
- case Instruction::Invoke:
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::Select:
- case Instruction::ExtractElement:
- case Instruction::InsertElement:
- case Instruction::ShuffleVector:
- case Instruction::InsertValue:
- case Instruction::GetElementPtr:
- exp = createExpr(I);
- break;
- default:
- break;
- }
-
- if (!exp) {
- ValueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- uint32_t e = ExpressionNumbering[exp];
- if (!e) {
- hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
- auto I = HashNumbering.find(H);
- if (I != HashNumbering.end()) {
- e = I->second;
- } else {
- e = nextValueNumber++;
- HashNumbering[H] = e;
- ExpressionNumbering[exp] = e;
- }
- }
- ValueNumbering[V] = e;
- return e;
- }
-
- /// Returns the value number of the specified value. Fails if the value has
- /// not yet been numbered.
- uint32_t lookup(Value *V) const {
- auto VI = ValueNumbering.find(V);
- assert(VI != ValueNumbering.end() && "Value not numbered?");
- return VI->second;
- }
-
- /// Removes all value numberings and resets the value table.
- void clear() {
- ValueNumbering.clear();
- ExpressionNumbering.clear();
- HashNumbering.clear();
- Recycler.clear(Allocator);
- nextValueNumber = 1;
- }
-
- /// \c Inst uses or touches memory. Return an ID describing the memory state
- /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
- /// the exact same memory operations happen after I1 and I2.
- ///
- /// This is a very hard problem in general, so we use domain-specific
- /// knowledge that we only ever check for equivalence between blocks sharing a
- /// single immediate successor that is common, and when determining if I1 ==
- /// I2 we will have already determined that next(I1) == next(I2). This
- /// inductive property allows us to simply return the value number of the next
- /// instruction that defines memory.
- uint32_t getMemoryUseOrder(Instruction *Inst) {
- auto *BB = Inst->getParent();
- for (auto I = std::next(Inst->getIterator()), E = BB->end();
- I != E && !I->isTerminator(); ++I) {
- if (!isMemoryInst(&*I))
- continue;
- if (isa<LoadInst>(&*I))
- continue;
- CallInst *CI = dyn_cast<CallInst>(&*I);
- if (CI && CI->onlyReadsMemory())
- continue;
- InvokeInst *II = dyn_cast<InvokeInst>(&*I);
- if (II && II->onlyReadsMemory())
- continue;
- return lookupOrAdd(&*I);
- }
- return 0;
- }
-};
-
-//===----------------------------------------------------------------------===//
-
-class GVNSink {
-public:
- GVNSink() = default;
-
- bool run(Function &F) {
- LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
- << "\n");
-
- unsigned NumSunk = 0;
- ReversePostOrderTraversal<Function*> RPOT(&F);
- for (auto *N : RPOT)
- NumSunk += sinkBB(N);
-
- return NumSunk > 0;
- }
-
-private:
- ValueTable VN;
-
- bool shouldAvoidSinkingInstruction(Instruction *I) {
- // These instructions may change or break semantics if moved.
- if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
- I->getType()->isTokenTy())
- return true;
- return false;
- }
-
- /// The main heuristic function. Analyze the set of instructions pointed to by
- /// LRI and return a candidate solution if these instructions can be sunk, or
- /// None otherwise.
- Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
- LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
- ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
-
- /// Create a ModelledPHI for each PHI in BB, adding to PHIs.
- void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
- SmallPtrSetImpl<Value *> &PHIContents) {
- for (PHINode &PN : BB->phis()) {
- auto MPHI = ModelledPHI(&PN);
- PHIs.insert(MPHI);
- for (auto *V : MPHI.getValues())
- PHIContents.insert(V);
- }
- }
-
- /// The main instruction sinking driver. Set up state and try and sink
- /// instructions into BBEnd from its predecessors.
- unsigned sinkBB(BasicBlock *BBEnd);
-
- /// Perform the actual mechanics of sinking an instruction from Blocks into
- /// BBEnd, which is their only successor.
- void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
-
- /// Remove PHIs that all have the same incoming value.
- void foldPointlessPHINodes(BasicBlock *BB) {
- auto I = BB->begin();
- while (PHINode *PN = dyn_cast<PHINode>(I++)) {
- if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) {
- return V == PN->getIncomingValue(0);
- }))
- continue;
- if (PN->getIncomingValue(0) != PN)
- PN->replaceAllUsesWith(PN->getIncomingValue(0));
- else
- PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
- PN->eraseFromParent();
- }
- }
-};
-
-Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
- LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
- ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
- auto Insts = *LRI;
- LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
- : Insts) {
- I->dump();
- } dbgs() << " ]\n";);
-
- DenseMap<uint32_t, unsigned> VNums;
- for (auto *I : Insts) {
- uint32_t N = VN.lookupOrAdd(I);
- LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
- if (N == ~0U)
- return None;
- VNums[N]++;
- }
- unsigned VNumToSink =
- std::max_element(VNums.begin(), VNums.end(),
- [](const std::pair<uint32_t, unsigned> &I,
- const std::pair<uint32_t, unsigned> &J) {
- return I.second < J.second;
- })
- ->first;
-
- if (VNums[VNumToSink] == 1)
- // Can't sink anything!
- return None;
-
- // Now restrict the number of incoming blocks down to only those with
- // VNumToSink.
- auto &ActivePreds = LRI.getActiveBlocks();
- unsigned InitialActivePredSize = ActivePreds.size();
- SmallVector<Instruction *, 4> NewInsts;
- for (auto *I : Insts) {
- if (VN.lookup(I) != VNumToSink)
- ActivePreds.remove(I->getParent());
- else
- NewInsts.push_back(I);
- }
- for (auto *I : NewInsts)
- if (shouldAvoidSinkingInstruction(I))
- return None;
-
- // If we've restricted the incoming blocks, restrict all needed PHIs also
- // to that set.
- bool RecomputePHIContents = false;
- if (ActivePreds.size() != InitialActivePredSize) {
- ModelledPHISet NewNeededPHIs;
- for (auto P : NeededPHIs) {
- P.restrictToBlocks(ActivePreds);
- NewNeededPHIs.insert(P);
- }
- NeededPHIs = NewNeededPHIs;
- LRI.restrictToBlocks(ActivePreds);
- RecomputePHIContents = true;
- }
-
- // The sunk instruction's results.
- ModelledPHI NewPHI(NewInsts, ActivePreds);
-
- // Does sinking this instruction render previous PHIs redundant?
+ BI = Blocks.erase(BI);
+ VI = Values.erase(VI);
+ } else {
+ ++BI;
+ ++VI;
+ }
+ }
+ assert(Blocks.size() == NewBlocks.size());
+ }
+
+ ArrayRef<Value *> getValues() const { return Values; }
+
+ bool areAllIncomingValuesSame() const {
+ return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; });
+ }
+
+ bool areAllIncomingValuesSameType() const {
+ return llvm::all_of(
+ Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
+ }
+
+ bool areAnyIncomingValuesConstant() const {
+ return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); });
+ }
+
+ // Hash functor
+ unsigned hash() const {
+ return (unsigned)hash_combine_range(Values.begin(), Values.end());
+ }
+
+ bool operator==(const ModelledPHI &Other) const {
+ return Values == Other.Values && Blocks == Other.Blocks;
+ }
+};
+
+template <typename ModelledPHI> struct DenseMapInfo {
+ static inline ModelledPHI &getEmptyKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(0);
+ return Dummy;
+ }
+
+ static inline ModelledPHI &getTombstoneKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(1);
+ return Dummy;
+ }
+
+ static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
+
+ static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
+ return LHS == RHS;
+ }
+};
+
+using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
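A minimal sketch of how the set type above can be used; PN and PN2 are placeholder PHINode pointers from the common successor block:

    ModelledPHISet Seen;
    Seen.insert(ModelledPHI(PN)); // model a PHI that already exists
    // A PHI that sinking would have to create costs nothing if an identical
    // one (same incoming blocks and values) is already modelled:
    bool AlreadyThere = Seen.count(ModelledPHI(PN2)) != 0;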
+
+//===----------------------------------------------------------------------===//
+// ValueTable
+//===----------------------------------------------------------------------===//
+// This is a value number table where the value number is a function of the
+// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
+// that the program would be equivalent if we replaced A with PHI(A, B).
+//===----------------------------------------------------------------------===//
+
+/// A GVN expression describing how an instruction is used. The operands
+/// field of BasicExpression is used to store uses, not operands.
+///
+/// This class also contains fields for discriminators used when determining
+/// equivalence of instructions with side effects.
+class InstructionUseExpr : public GVNExpression::BasicExpression {
+ unsigned MemoryUseOrder = -1;
+ bool Volatile = false;
+ ArrayRef<int> ShuffleMask;
+
+public:
+ InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
+ BumpPtrAllocator &A)
+ : GVNExpression::BasicExpression(I->getNumUses()) {
+ allocateOperands(R, A);
+ setOpcode(I->getOpcode());
+ setType(I->getType());
+
+ if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
+ ShuffleMask = SVI->getShuffleMask().copy(A);
+
+ for (auto &U : I->uses())
+ op_push_back(U.getUser());
+ llvm::sort(op_begin(), op_end());
+ }
+
+ void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
+ void setVolatile(bool V) { Volatile = V; }
+
+ hash_code getHashValue() const override {
+ return hash_combine(GVNExpression::BasicExpression::getHashValue(),
+ MemoryUseOrder, Volatile, ShuffleMask);
+ }
+
+ template <typename Function> hash_code getHashValue(Function MapFn) {
+ hash_code H = hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile,
+ ShuffleMask);
+ for (auto *V : operands())
+ H = hash_combine(H, MapFn(V));
+ return H;
+ }
+};
+
+class ValueTable {
+ DenseMap<Value *, uint32_t> ValueNumbering;
+ DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+ DenseMap<size_t, uint32_t> HashNumbering;
+ BumpPtrAllocator Allocator;
+ ArrayRecycler<Value *> Recycler;
+ uint32_t nextValueNumber = 1;
+
+ /// Create an expression for I based on its opcode and its uses. If I
+ /// touches or reads memory, the expression is also based upon its memory
+ /// order - see \c getMemoryUseOrder().
+ InstructionUseExpr *createExpr(Instruction *I) {
+ InstructionUseExpr *E =
+ new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
+ if (isMemoryInst(I))
+ E->setMemoryUseOrder(getMemoryUseOrder(I));
+
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Predicate = C->getPredicate();
+ E->setOpcode((C->getOpcode() << 8) | Predicate);
+ }
+ return E;
+ }
+
+ /// Helper to compute the value number for a memory instruction
+ /// (LoadInst/StoreInst), including checking the memory ordering and
+ /// volatility.
+ template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
+ if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
+ return nullptr;
+ InstructionUseExpr *E = createExpr(I);
+ E->setVolatile(I->isVolatile());
+ return E;
+ }
+
+public:
+ ValueTable() = default;
+
+ /// Returns the value number for the specified value, assigning
+ /// it a new number if it did not have one before.
+ uint32_t lookupOrAdd(Value *V) {
+ auto VI = ValueNumbering.find(V);
+ if (VI != ValueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ InstructionUseExpr *exp = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ exp = createMemoryExpr(cast<LoadInst>(I));
+ break;
+ case Instruction::Store:
+ exp = createMemoryExpr(cast<StoreInst>(I));
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ default:
+ break;
+ }
+
+ if (!exp) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = ExpressionNumbering[exp];
+ if (!e) {
+ hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
+ auto I = HashNumbering.find(H);
+ if (I != HashNumbering.end()) {
+ e = I->second;
+ } else {
+ e = nextValueNumber++;
+ HashNumbering[H] = e;
+ ExpressionNumbering[exp] = e;
+ }
+ }
+ ValueNumbering[V] = e;
+ return e;
+ }
+
+ /// Returns the value number of the specified value. Fails if the value has
+ /// not yet been numbered.
+ uint32_t lookup(Value *V) const {
+ auto VI = ValueNumbering.find(V);
+ assert(VI != ValueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+
+ /// Removes all value numberings and resets the value table.
+ void clear() {
+ ValueNumbering.clear();
+ ExpressionNumbering.clear();
+ HashNumbering.clear();
+ Recycler.clear(Allocator);
+ nextValueNumber = 1;
+ }
+
+ /// \c Inst uses or touches memory. Return an ID describing the memory state
+ /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
+ /// the exact same memory operations happen after I1 and I2.
+ ///
+ /// This is a very hard problem in general, so we use domain-specific
+ /// knowledge that we only ever check for equivalence between blocks sharing a
+ /// single immediate successor that is common, and when determining if I1 ==
+ /// I2 we will have already determined that next(I1) == next(I2). This
+ /// inductive property allows us to simply return the value number of the next
+ /// instruction that defines memory.
+ uint32_t getMemoryUseOrder(Instruction *Inst) {
+ auto *BB = Inst->getParent();
+ for (auto I = std::next(Inst->getIterator()), E = BB->end();
+ I != E && !I->isTerminator(); ++I) {
+ if (!isMemoryInst(&*I))
+ continue;
+ if (isa<LoadInst>(&*I))
+ continue;
+ CallInst *CI = dyn_cast<CallInst>(&*I);
+ if (CI && CI->onlyReadsMemory())
+ continue;
+ InvokeInst *II = dyn_cast<InvokeInst>(&*I);
+ if (II && II->onlyReadsMemory())
+ continue;
+ return lookupOrAdd(&*I);
+ }
+ return 0;
+ }
+};
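// Illustration (a hand-written sketch; block and value names are invented):
// unlike operand-based GVN numbering, the ValueTable above numbers an
// instruction by its opcode, type, uses and memory order, so two instructions
// in sibling predecessors share a value number when the common successor uses
// them identically, even though their operands differ:
//
//   left:                               right:
//     %a = add i32 %x, 1                  %b = add i32 %y, 1
//     br label %merge                     br label %merge
//   merge:
//     %p = phi i32 [ %a, %left ], [ %b, %right ]
//
// Here lookupOrAdd(%a) == lookupOrAdd(%b), which is what later makes %a and %b
// candidates for sinking into %merge with a PHI for the differing operand.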
+
+//===----------------------------------------------------------------------===//
+
+class GVNSink {
+public:
+ GVNSink() = default;
+
+ bool run(Function &F) {
+ LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
+ << "\n");
+
+ unsigned NumSunk = 0;
+ ReversePostOrderTraversal<Function*> RPOT(&F);
+ for (auto *N : RPOT)
+ NumSunk += sinkBB(N);
+
+ return NumSunk > 0;
+ }
+
+private:
+ ValueTable VN;
+
+ bool shouldAvoidSinkingInstruction(Instruction *I) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return true;
+ return false;
+ }
+
+ /// The main heuristic function. Analyze the set of instructions pointed to by
+ /// LRI and return a candidate solution if these instructions can be sunk, or
+ /// None otherwise.
+ Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
+
+ /// Create a ModelledPHI for each PHI in BB, adding to PHIs.
+ void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
+ SmallPtrSetImpl<Value *> &PHIContents) {
+ for (PHINode &PN : BB->phis()) {
+ auto MPHI = ModelledPHI(&PN);
+ PHIs.insert(MPHI);
+ for (auto *V : MPHI.getValues())
+ PHIContents.insert(V);
+ }
+ }
+
+ /// The main instruction sinking driver. Set up state and try and sink
+ /// instructions into BBEnd from its predecessors.
+ unsigned sinkBB(BasicBlock *BBEnd);
+
+ /// Perform the actual mechanics of sinking an instruction from Blocks into
+ /// BBEnd, which is their only successor.
+ void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
+
+ /// Remove PHIs that all have the same incoming value.
+ void foldPointlessPHINodes(BasicBlock *BB) {
+ auto I = BB->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(I++)) {
+ if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) {
+ return V == PN->getIncomingValue(0);
+ }))
+ continue;
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+ }
+};
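// Sketch of the transform the class above drives (illustrative IR with
// made-up names; whether a candidate is actually sunk also depends on the
// cost computed per candidate in sinkBB below): trailing instructions with
// equal value numbers are sunk into the common successor and a PHI is
// introduced for each operand position that differs:
//
//   if.then:                            if.else:
//     store i32 1, i32* %p                store i32 2, i32* %p
//     br label %end                       br label %end
//
//   =>
//
//   end:
//     %s = phi i32 [ 1, %if.then ], [ 2, %if.else ]
//     store i32 %s, i32* %p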
+
+Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
+ auto Insts = *LRI;
+ LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
+ I->dump();
+ } dbgs() << " ]\n";);
+
+ DenseMap<uint32_t, unsigned> VNums;
+ for (auto *I : Insts) {
+ uint32_t N = VN.lookupOrAdd(I);
+ LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
+ if (N == ~0U)
+ return None;
+ VNums[N]++;
+ }
+ unsigned VNumToSink =
+ std::max_element(VNums.begin(), VNums.end(),
+ [](const std::pair<uint32_t, unsigned> &I,
+ const std::pair<uint32_t, unsigned> &J) {
+ return I.second < J.second;
+ })
+ ->first;
+
+ if (VNums[VNumToSink] == 1)
+ // Can't sink anything!
+ return None;
+
+ // Now restrict the number of incoming blocks down to only those with
+ // VNumToSink.
+ auto &ActivePreds = LRI.getActiveBlocks();
+ unsigned InitialActivePredSize = ActivePreds.size();
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *I : Insts) {
+ if (VN.lookup(I) != VNumToSink)
+ ActivePreds.remove(I->getParent());
+ else
+ NewInsts.push_back(I);
+ }
+ for (auto *I : NewInsts)
+ if (shouldAvoidSinkingInstruction(I))
+ return None;
+
+ // If we've restricted the incoming blocks, restrict all needed PHIs also
+ // to that set.
+ bool RecomputePHIContents = false;
+ if (ActivePreds.size() != InitialActivePredSize) {
+ ModelledPHISet NewNeededPHIs;
+ for (auto P : NeededPHIs) {
+ P.restrictToBlocks(ActivePreds);
+ NewNeededPHIs.insert(P);
+ }
+ NeededPHIs = NewNeededPHIs;
+ LRI.restrictToBlocks(ActivePreds);
+ RecomputePHIContents = true;
+ }
+
+ // The sunk instruction's results.
+ ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+ // Does sinking this instruction render previous PHIs redundant?
if (NeededPHIs.erase(NewPHI))
- RecomputePHIContents = true;
-
- if (RecomputePHIContents) {
- // The needed PHIs have changed, so recompute the set of all needed
- // values.
- PHIContents.clear();
- for (auto &PHI : NeededPHIs)
- PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
- }
-
- // Is this instruction required by a later PHI that doesn't match this PHI?
- // If so, we can't sink this instruction.
- for (auto *V : NewPHI.getValues())
- if (PHIContents.count(V))
- // V exists in this PHI, but the whole PHI is different to NewPHI
- // (else it would have been removed earlier). We cannot continue
- // because this isn't representable.
- return None;
-
- // Which operands need PHIs?
- // FIXME: If any of these fail, we should partition up the candidates to
- // try and continue making progress.
- Instruction *I0 = NewInsts[0];
-
- // If all instructions that are going to participate don't have the same
- // number of operands, we can't do any useful PHI analysis for all operands.
- auto hasDifferentNumOperands = [&I0](Instruction *I) {
- return I->getNumOperands() != I0->getNumOperands();
- };
- if (any_of(NewInsts, hasDifferentNumOperands))
- return None;
-
- for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
- ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
- if (PHI.areAllIncomingValuesSame())
- continue;
- if (!canReplaceOperandWithVariable(I0, OpNum))
- // We can't create a PHI from this instruction!
- return None;
- if (NeededPHIs.count(PHI))
- continue;
- if (!PHI.areAllIncomingValuesSameType())
- return None;
- // Don't create indirect calls! The called value is the final operand.
- if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
- PHI.areAnyIncomingValuesConstant())
- return None;
-
- NeededPHIs.reserve(NeededPHIs.size());
- NeededPHIs.insert(PHI);
- PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
- }
-
- if (isMemoryInst(NewInsts[0]))
- ++MemoryInstNum;
-
- SinkingInstructionCandidate Cand;
- Cand.NumInstructions = ++InstNum;
- Cand.NumMemoryInsts = MemoryInstNum;
- Cand.NumBlocks = ActivePreds.size();
- Cand.NumPHIs = NeededPHIs.size();
+ RecomputePHIContents = true;
+
+ if (RecomputePHIContents) {
+ // The needed PHIs have changed, so recompute the set of all needed
+ // values.
+ PHIContents.clear();
+ for (auto &PHI : NeededPHIs)
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ // Is this instruction required by a later PHI that doesn't match this PHI?
+ // If so, we can't sink this instruction.
+ for (auto *V : NewPHI.getValues())
+ if (PHIContents.count(V))
+ // V exists in this PHI, but the whole PHI is different to NewPHI
+ // (else it would have been removed earlier). We cannot continue
+ // because this isn't representable.
+ return None;
+
+ // Which operands need PHIs?
+ // FIXME: If any of these fail, we should partition up the candidates to
+ // try and continue making progress.
+ Instruction *I0 = NewInsts[0];
+
+ // If all instructions that are going to participate don't have the same
+ // number of operands, we can't do any useful PHI analysis for all operands.
+ auto hasDifferentNumOperands = [&I0](Instruction *I) {
+ return I->getNumOperands() != I0->getNumOperands();
+ };
+ if (any_of(NewInsts, hasDifferentNumOperands))
+ return None;
+
+ for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+ ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+ if (PHI.areAllIncomingValuesSame())
+ continue;
+ if (!canReplaceOperandWithVariable(I0, OpNum))
+ // We can't create a PHI from this instruction!
+ return None;
+ if (NeededPHIs.count(PHI))
+ continue;
+ if (!PHI.areAllIncomingValuesSameType())
+ return None;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
+ PHI.areAnyIncomingValuesConstant())
+ return None;
+
+ NeededPHIs.reserve(NeededPHIs.size());
+ NeededPHIs.insert(PHI);
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ if (isMemoryInst(NewInsts[0]))
+ ++MemoryInstNum;
+
+ SinkingInstructionCandidate Cand;
+ Cand.NumInstructions = ++InstNum;
+ Cand.NumMemoryInsts = MemoryInstNum;
+ Cand.NumBlocks = ActivePreds.size();
+ Cand.NumPHIs = NeededPHIs.size();
append_range(Cand.Blocks, ActivePreds);
-
- return Cand;
-}
-
-unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
- LLVM_DEBUG(dbgs() << "GVNSink: running on basic block ";
- BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
- SmallVector<BasicBlock *, 4> Preds;
- for (auto *B : predecessors(BBEnd)) {
- auto *T = B->getTerminator();
- if (isa<BranchInst>(T) || isa<SwitchInst>(T))
- Preds.push_back(B);
- else
- return 0;
- }
- if (Preds.size() < 2)
- return 0;
- llvm::sort(Preds);
-
- unsigned NumOrigPreds = Preds.size();
- // We can only sink instructions through unconditional branches.
- for (auto I = Preds.begin(); I != Preds.end();) {
- if ((*I)->getTerminator()->getNumSuccessors() != 1)
- I = Preds.erase(I);
- else
- ++I;
- }
-
- LockstepReverseIterator LRI(Preds);
- SmallVector<SinkingInstructionCandidate, 4> Candidates;
- unsigned InstNum = 0, MemoryInstNum = 0;
- ModelledPHISet NeededPHIs;
- SmallPtrSet<Value *, 4> PHIContents;
- analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
- unsigned NumOrigPHIs = NeededPHIs.size();
-
- while (LRI.isValid()) {
- auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
- NeededPHIs, PHIContents);
- if (!Cand)
- break;
- Cand->calculateCost(NumOrigPHIs, Preds.size());
- Candidates.emplace_back(*Cand);
- --LRI;
- }
-
- llvm::stable_sort(Candidates, std::greater<SinkingInstructionCandidate>());
- LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
- : Candidates) dbgs()
- << " " << C << "\n";);
-
- // Pick the top candidate, as long as it is positive!
- if (Candidates.empty() || Candidates.front().Cost <= 0)
- return 0;
- auto C = Candidates.front();
-
- LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n");
- BasicBlock *InsertBB = BBEnd;
- if (C.Blocks.size() < NumOrigPreds) {
- LLVM_DEBUG(dbgs() << " -- Splitting edge to ";
- BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
- InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
- if (!InsertBB) {
- LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n");
- // Edge couldn't be split.
- return 0;
- }
- }
-
- for (unsigned I = 0; I < C.NumInstructions; ++I)
- sinkLastInstruction(C.Blocks, InsertBB);
-
- return C.NumInstructions;
-}
-
-void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
- BasicBlock *BBEnd) {
- SmallVector<Instruction *, 4> Insts;
- for (BasicBlock *BB : Blocks)
- Insts.push_back(BB->getTerminator()->getPrevNode());
- Instruction *I0 = Insts.front();
-
- SmallVector<Value *, 4> NewOperands;
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
- bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) {
- return I->getOperand(O) != I0->getOperand(O);
- });
- if (!NeedPHI) {
- NewOperands.push_back(I0->getOperand(O));
- continue;
- }
-
- // Create a new PHI in the successor block and populate it.
- auto *Op = I0->getOperand(O);
- assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
- auto *PN = PHINode::Create(Op->getType(), Insts.size(),
- Op->getName() + ".sink", &BBEnd->front());
- for (auto *I : Insts)
- PN->addIncoming(I->getOperand(O), I->getParent());
- NewOperands.push_back(PN);
- }
-
- // Arbitrarily use I0 as the new "common" instruction; remap its operands
- // and move it to the start of the successor block.
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
- I0->getOperandUse(O).set(NewOperands[O]);
- I0->moveBefore(&*BBEnd->getFirstInsertionPt());
-
- // Update metadata and IR flags.
- for (auto *I : Insts)
- if (I != I0) {
- combineMetadataForCSE(I0, I, true);
- I0->andIRFlags(I);
- }
-
- for (auto *I : Insts)
- if (I != I0)
- I->replaceAllUsesWith(I0);
- foldPointlessPHINodes(BBEnd);
-
- // Finally nuke all instructions apart from the common instruction.
- for (auto *I : Insts)
- if (I != I0)
- I->eraseFromParent();
-
- NumRemoved += Insts.size() - 1;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Pass machinery / boilerplate
-
-class GVNSinkLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- GVNSinkLegacyPass() : FunctionPass(ID) {
- initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- GVNSink G;
- return G.run(F);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
- GVNSink G;
- if (!G.run(F))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char GVNSinkLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
- "Early GVN sinking of Expressions", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
- "Early GVN sinking of Expressions", false, false)
-
-FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
+
+ return Cand;
+}
+
+unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
+ LLVM_DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ SmallVector<BasicBlock *, 4> Preds;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+ Preds.push_back(B);
+ else
+ return 0;
+ }
+ if (Preds.size() < 2)
+ return 0;
+ llvm::sort(Preds);
+
+ unsigned NumOrigPreds = Preds.size();
+ // We can only sink instructions through unconditional branches.
+ for (auto I = Preds.begin(); I != Preds.end();) {
+ if ((*I)->getTerminator()->getNumSuccessors() != 1)
+ I = Preds.erase(I);
+ else
+ ++I;
+ }
+
+ LockstepReverseIterator LRI(Preds);
+ SmallVector<SinkingInstructionCandidate, 4> Candidates;
+ unsigned InstNum = 0, MemoryInstNum = 0;
+ ModelledPHISet NeededPHIs;
+ SmallPtrSet<Value *, 4> PHIContents;
+ analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+ unsigned NumOrigPHIs = NeededPHIs.size();
+
+ while (LRI.isValid()) {
+ auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+ NeededPHIs, PHIContents);
+ if (!Cand)
+ break;
+ Cand->calculateCost(NumOrigPHIs, Preds.size());
+ Candidates.emplace_back(*Cand);
+ --LRI;
+ }
+
+ llvm::stable_sort(Candidates, std::greater<SinkingInstructionCandidate>());
+ LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
+
+ // Pick the top candidate, as long as it is positive!
+ if (Candidates.empty() || Candidates.front().Cost <= 0)
+ return 0;
+ auto C = Candidates.front();
+
+ LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ BasicBlock *InsertBB = BBEnd;
+ if (C.Blocks.size() < NumOrigPreds) {
+ LLVM_DEBUG(dbgs() << " -- Splitting edge to ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+ if (!InsertBB) {
+ LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ // Edge couldn't be split.
+ return 0;
+ }
+ }
+
+ for (unsigned I = 0; I < C.NumInstructions; ++I)
+ sinkLastInstruction(C.Blocks, InsertBB);
+
+ return C.NumInstructions;
+}
+
+void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
+ BasicBlock *BBEnd) {
+ SmallVector<Instruction *, 4> Insts;
+ for (BasicBlock *BB : Blocks)
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ Instruction *I0 = Insts.front();
+
+ SmallVector<Value *, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags.
+ for (auto *I : Insts)
+ if (I != I0) {
+ combineMetadataForCSE(I0, I, true);
+ I0->andIRFlags(I);
+ }
+
+ for (auto *I : Insts)
+ if (I != I0)
+ I->replaceAllUsesWith(I0);
+ foldPointlessPHINodes(BBEnd);
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ NumRemoved += Insts.size() - 1;
+}
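// Note (a sketch with invented operands): because the surviving instruction
// takes the intersection of IR flags via andIRFlags, sinking
//
//   %a = add nsw i32 %x, 1      and      %b = add i32 %y, 1
//
// produces a sunk add without nsw; combineMetadataForCSE likewise keeps only
// metadata that remains valid for every original copy.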
+
+////////////////////////////////////////////////////////////////////////////////
+// Pass machinery / boilerplate
+
+class GVNSinkLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNSinkLegacyPass() : FunctionPass(ID) {
+ initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ GVNSink G;
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
+ GVNSink G;
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNSinkLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+
+FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
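// Usage note (a minimal sketch, not part of the source above): besides the
// legacy "gvn-sink" FunctionPass created by llvm::createGVNSinkPass(), the
// pass is exposed to the new pass manager as GVNSinkPass and can be scheduled
// programmatically, e.g.
//
//   #include "llvm/IR/PassManager.h"
//   #include "llvm/Transforms/Scalar/GVNSink.h"
//
//   void addGVNSink(llvm::FunctionPassManager &FPM) { // hypothetical helper
//     FPM.addPass(llvm::GVNSinkPass());
//   }
//
// which corresponds to `opt -passes=gvn-sink` on the command line.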
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp
index 12363b373a..61eb4ce0ed 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp
@@ -1,881 +1,881 @@
-//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the guard widening pass. The semantics of the
-// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails
-// more often than it did before the transform. This optimization is called
-// "widening" and can be used to hoist and common runtime checks in situations like
-// these:
-//
-// %cmp0 = 7 u< Length
-// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
-// call @unknown_side_effects()
-// %cmp1 = 9 u< Length
-// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
-// ...
-//
-// =>
-//
-// %cmp0 = 9 u< Length
-// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
-// call @unknown_side_effects()
-// ...
-//
-// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
-// generic implementation of the same function, which will have the correct
-// semantics from that point onward. It is always _legal_ to deoptimize (so
-// replacing %cmp0 with false is "correct"), though it may not always be
-// profitable to do so.
-//
-// NB! This pass is a work in progress. It hasn't been tuned to be "production
-// ready" yet. It is known to have quadratic running time and will not scale
-// to large numbers of guards.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/GuardWidening.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <functional>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "guard-widening"
-
-STATISTIC(GuardsEliminated, "Number of eliminated guards");
-STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches");
-
-static cl::opt<bool>
- WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden,
- cl::desc("Whether or not we should widen guards "
- "expressed as branches by widenable conditions"),
- cl::init(true));
-
-namespace {
-
-// Get the condition of \p I. It can either be a guard or a conditional branch.
-static Value *getCondition(Instruction *I) {
- if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
- assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
- "Bad guard intrinsic?");
- return GI->getArgOperand(0);
- }
- Value *Cond, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB))
- return Cond;
-
- return cast<BranchInst>(I)->getCondition();
-}
-
-// Set the condition for \p I to \p NewCond. \p I can either be a guard or a
-// conditional branch.
-static void setCondition(Instruction *I, Value *NewCond) {
- if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
- assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
- "Bad guard intrinsic?");
- GI->setArgOperand(0, NewCond);
- return;
- }
- cast<BranchInst>(I)->setCondition(NewCond);
-}
-
-// Eliminates the guard instruction properly.
-static void eliminateGuard(Instruction *GuardInst) {
- GuardInst->eraseFromParent();
- ++GuardsEliminated;
-}
-
-class GuardWideningImpl {
- DominatorTree &DT;
- PostDominatorTree *PDT;
- LoopInfo &LI;
-
- /// Together, these describe the region of interest. This might be all of
- /// the blocks within a function, or only a given loop's blocks and preheader.
- DomTreeNode *Root;
- std::function<bool(BasicBlock*)> BlockFilter;
-
- /// The set of guards and conditional branches whose conditions have been
- /// widened into dominating guards.
- SmallVector<Instruction *, 16> EliminatedGuardsAndBranches;
-
- /// The set of guards which have been widened to include conditions to other
- /// guards.
- DenseSet<Instruction *> WidenedGuards;
-
- /// Try to eliminate instruction \p Instr by widening it into an earlier
- /// dominating guard. \p DFSI is the DFS iterator on the dominator tree that
- /// is currently visiting the block containing \p Instr, and \p GuardsPerBlock
- /// maps BasicBlocks to the set of guards seen in that block.
- bool eliminateInstrViaWidening(
- Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
- const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
- GuardsPerBlock, bool InvertCondition = false);
-
- /// Used to keep track of which widening potential is more effective.
- enum WideningScore {
- /// Don't widen.
- WS_IllegalOrNegative,
-
- /// Widening is performance neutral as far as the cycles spent in check
- /// conditions goes (but can still help, e.g., code layout, having less
- /// deopt state).
- WS_Neutral,
-
- /// Widening is profitable.
- WS_Positive,
-
- /// Widening is very profitable. Not significantly different from \c
- /// WS_Positive, except by the order.
- WS_VeryPositive
- };
-
- static StringRef scoreTypeToString(WideningScore WS);
-
- /// Compute the score for widening the condition in \p DominatedInstr
- /// into \p DominatingGuard. If \p InvertCond is set, then we widen the
- /// inverted condition of the dominating guard.
- WideningScore computeWideningScore(Instruction *DominatedInstr,
- Instruction *DominatingGuard,
- bool InvertCond);
-
- /// Helper to check if \p V can be hoisted to \p InsertPos.
- bool isAvailableAt(const Value *V, const Instruction *InsertPos) const {
- SmallPtrSet<const Instruction *, 8> Visited;
- return isAvailableAt(V, InsertPos, Visited);
- }
-
- bool isAvailableAt(const Value *V, const Instruction *InsertPos,
- SmallPtrSetImpl<const Instruction *> &Visited) const;
-
- /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c
- /// isAvailableAt returned true.
- void makeAvailableAt(Value *V, Instruction *InsertPos) const;
-
- /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try
- /// to generate an expression computing the logical AND of \p Cond0 and (\p
- /// Cond1 XOR \p InvertCondition).
- /// Return true if the expression computing the AND is only as
- /// expensive as computing one of the two. If \p InsertPt is non-null then
- /// actually generate the resulting expression, make it available at \p
- /// InsertPt and return it in \p Result (else no change to the IR is made).
- bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
- Value *&Result, bool InvertCondition);
-
- /// Represents a range check of the form \c Base + \c Offset u< \c Length,
- /// with the constraint that \c Length is not negative. \c CheckInst is the
- /// pre-existing instruction in the IR that computes the result of this range
- /// check.
- class RangeCheck {
- const Value *Base;
- const ConstantInt *Offset;
- const Value *Length;
- ICmpInst *CheckInst;
-
- public:
- explicit RangeCheck(const Value *Base, const ConstantInt *Offset,
- const Value *Length, ICmpInst *CheckInst)
- : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {}
-
- void setBase(const Value *NewBase) { Base = NewBase; }
- void setOffset(const ConstantInt *NewOffset) { Offset = NewOffset; }
-
- const Value *getBase() const { return Base; }
- const ConstantInt *getOffset() const { return Offset; }
- const APInt &getOffsetValue() const { return getOffset()->getValue(); }
- const Value *getLength() const { return Length; };
- ICmpInst *getCheckInst() const { return CheckInst; }
-
- void print(raw_ostream &OS, bool PrintTypes = false) {
- OS << "Base: ";
- Base->printAsOperand(OS, PrintTypes);
- OS << " Offset: ";
- Offset->printAsOperand(OS, PrintTypes);
- OS << " Length: ";
- Length->printAsOperand(OS, PrintTypes);
- }
-
- LLVM_DUMP_METHOD void dump() {
- print(dbgs());
- dbgs() << "\n";
- }
- };
-
- /// Parse \p CheckCond into a conjunction (logical-and) of range checks; and
- /// append them to \p Checks. Returns true on success, may clobber \c Checks
- /// on failure.
- bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) {
- SmallPtrSet<const Value *, 8> Visited;
- return parseRangeChecks(CheckCond, Checks, Visited);
- }
-
- bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks,
- SmallPtrSetImpl<const Value *> &Visited);
-
- /// Combine the checks in \p Checks into a smaller set of checks and append
- /// them into \p CombinedChecks. Return true on success (i.e. all of the checks
- /// in \p Checks were combined into \p CombinedChecks). Clobbers \p Checks
- /// and \p CombinedChecks on success and on failure.
- bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks,
- SmallVectorImpl<RangeCheck> &CombinedChecks) const;
-
- /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
- /// computing only one of the two expressions?
- bool isWideningCondProfitable(Value *Cond0, Value *Cond1, bool InvertCond) {
- Value *ResultUnused;
- return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused,
- InvertCond);
- }
-
- /// If \p InvertCondition is false, widen \p ToWiden to fail if
- /// \p NewCondition is false, otherwise make it fail if \p NewCondition is
- /// true (in addition to whatever it is already checking).
- void widenGuard(Instruction *ToWiden, Value *NewCondition,
- bool InvertCondition) {
- Value *Result;
-
- widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result,
- InvertCondition);
- if (isGuardAsWidenableBranch(ToWiden)) {
- setWidenableBranchCond(cast<BranchInst>(ToWiden), Result);
- return;
- }
- setCondition(ToWiden, Result);
- }
-
-public:
-
- explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
- LoopInfo &LI, DomTreeNode *Root,
- std::function<bool(BasicBlock*)> BlockFilter)
- : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter)
- {}
-
- /// The entry point for this pass.
- bool run();
-};
-}
-
-static bool isSupportedGuardInstruction(const Instruction *Insn) {
- if (isGuard(Insn))
- return true;
- if (WidenBranchGuards && isGuardAsWidenableBranch(Insn))
- return true;
- return false;
-}
-
-bool GuardWideningImpl::run() {
- DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock;
- bool Changed = false;
- for (auto DFI = df_begin(Root), DFE = df_end(Root);
- DFI != DFE; ++DFI) {
- auto *BB = (*DFI)->getBlock();
- if (!BlockFilter(BB))
- continue;
-
- auto &CurrentList = GuardsInBlock[BB];
-
- for (auto &I : *BB)
- if (isSupportedGuardInstruction(&I))
- CurrentList.push_back(cast<Instruction>(&I));
-
- for (auto *II : CurrentList)
- Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock);
- }
-
- assert(EliminatedGuardsAndBranches.empty() || Changed);
- for (auto *I : EliminatedGuardsAndBranches)
- if (!WidenedGuards.count(I)) {
- assert(isa<ConstantInt>(getCondition(I)) && "Should be!");
- if (isSupportedGuardInstruction(I))
- eliminateGuard(I);
- else {
- assert(isa<BranchInst>(I) &&
- "Eliminated something other than guard or branch?");
- ++CondBranchEliminated;
- }
- }
-
- return Changed;
-}
-
-bool GuardWideningImpl::eliminateInstrViaWidening(
- Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
- const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
- GuardsInBlock, bool InvertCondition) {
- // Ignore trivial true or false conditions. These instructions will be
- // trivially eliminated by any cleanup pass. Do not erase them because other
- // guards can possibly be widened into them.
- if (isa<ConstantInt>(getCondition(Instr)))
- return false;
-
- Instruction *BestSoFar = nullptr;
- auto BestScoreSoFar = WS_IllegalOrNegative;
-
- // In the set of dominating guards, find the one we can merge GuardInst with
- // for the most profit.
- for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
- auto *CurBB = DFSI.getPath(i)->getBlock();
- if (!BlockFilter(CurBB))
- break;
- assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
- const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
-
- auto I = GuardsInCurBB.begin();
+//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the guard widening pass. The semantics of the
+// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails
+// more often than it did before the transform. This optimization is called
+// "widening" and can be used to hoist and common runtime checks in situations like
+// these:
+//
+// %cmp0 = 7 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// %cmp1 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
+// ...
+//
+// =>
+//
+// %cmp0 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// ...
+//
+// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
+// generic implementation of the same function, which will have the correct
+// semantics from that point onward. It is always _legal_ to deoptimize (so
+// replacing %cmp0 with false is "correct"), though it may not always be
+// profitable to do so.
+//
+// NB! This pass is a work in progress. It hasn't been tuned to be "production
+// ready" yet. It is known to have quadratic running time and will not scale
+// to large numbers of guards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <functional>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "guard-widening"
+
+STATISTIC(GuardsEliminated, "Number of eliminated guards");
+STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches");
+
+static cl::opt<bool>
+ WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden,
+ cl::desc("Whether or not we should widen guards "
+ "expressed as branches by widenable conditions"),
+ cl::init(true));
+
+namespace {
+
+// Get the condition of \p I. It can either be a guard or a conditional branch.
+static Value *getCondition(Instruction *I) {
+ if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
+ assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
+ "Bad guard intrinsic?");
+ return GI->getArgOperand(0);
+ }
+ Value *Cond, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB))
+ return Cond;
+
+ return cast<BranchInst>(I)->getCondition();
+}
+
+// Set the condition for \p I to \p NewCond. \p I can either be a guard or a
+// conditional branch.
+static void setCondition(Instruction *I, Value *NewCond) {
+ if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
+ assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
+ "Bad guard intrinsic?");
+ GI->setArgOperand(0, NewCond);
+ return;
+ }
+ cast<BranchInst>(I)->setCondition(NewCond);
+}
+
+// Eliminates the guard instruction properly.
+static void eliminateGuard(Instruction *GuardInst) {
+ GuardInst->eraseFromParent();
+ ++GuardsEliminated;
+}
+
+class GuardWideningImpl {
+ DominatorTree &DT;
+ PostDominatorTree *PDT;
+ LoopInfo &LI;
+
+ /// Together, these describe the region of interest. This might be all of
+ /// the blocks within a function, or only a given loop's blocks and preheader.
+ DomTreeNode *Root;
+ std::function<bool(BasicBlock*)> BlockFilter;
+
+ /// The set of guards and conditional branches whose conditions have been
+ /// widened into dominating guards.
+ SmallVector<Instruction *, 16> EliminatedGuardsAndBranches;
+
+ /// The set of guards which have been widened to include conditions to other
+ /// guards.
+ DenseSet<Instruction *> WidenedGuards;
+
+ /// Try to eliminate instruction \p Instr by widening it into an earlier
+ /// dominating guard. \p DFSI is the DFS iterator on the dominator tree that
+ /// is currently visiting the block containing \p Instr, and \p GuardsPerBlock
+ /// maps BasicBlocks to the set of guards seen in that block.
+ bool eliminateInstrViaWidening(
+ Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
+ GuardsPerBlock, bool InvertCondition = false);
+
+ /// Used to keep track of which widening potential is more effective.
+ enum WideningScore {
+ /// Don't widen.
+ WS_IllegalOrNegative,
+
+ /// Widening is performance neutral as far as the cycles spent in check
+ /// conditions goes (but can still help, e.g., code layout, having less
+ /// deopt state).
+ WS_Neutral,
+
+ /// Widening is profitable.
+ WS_Positive,
+
+ /// Widening is very profitable. Not significantly different from \c
+ /// WS_Positive, except by the order.
+ WS_VeryPositive
+ };
+
+ static StringRef scoreTypeToString(WideningScore WS);
+
+ /// Compute the score for widening the condition in \p DominatedInstr
+ /// into \p DominatingGuard. If \p InvertCond is set, then we widen the
+ /// inverted condition of the dominating guard.
+ WideningScore computeWideningScore(Instruction *DominatedInstr,
+ Instruction *DominatingGuard,
+ bool InvertCond);
+
+ /// Helper to check if \p V can be hoisted to \p InsertPos.
+ bool isAvailableAt(const Value *V, const Instruction *InsertPos) const {
+ SmallPtrSet<const Instruction *, 8> Visited;
+ return isAvailableAt(V, InsertPos, Visited);
+ }
+
+ bool isAvailableAt(const Value *V, const Instruction *InsertPos,
+ SmallPtrSetImpl<const Instruction *> &Visited) const;
+
+ /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c
+ /// isAvailableAt returned true.
+ void makeAvailableAt(Value *V, Instruction *InsertPos) const;
+
+ /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try
+ /// to generate an expression computing the logical AND of \p Cond0 and (\p
+ /// Cond1 XOR \p InvertCondition).
+ /// Return true if the expression computing the AND is only as
+ /// expensive as computing one of the two. If \p InsertPt is non-null then
+ /// actually generate the resulting expression, make it available at \p
+ /// InsertPt and return it in \p Result (else no change to the IR is made).
+ bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
+ Value *&Result, bool InvertCondition);
+
+ /// Represents a range check of the form \c Base + \c Offset u< \c Length,
+ /// with the constraint that \c Length is not negative. \c CheckInst is the
+ /// pre-existing instruction in the IR that computes the result of this range
+ /// check.
+ class RangeCheck {
+ const Value *Base;
+ const ConstantInt *Offset;
+ const Value *Length;
+ ICmpInst *CheckInst;
+
+ public:
+ explicit RangeCheck(const Value *Base, const ConstantInt *Offset,
+ const Value *Length, ICmpInst *CheckInst)
+ : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {}
+
+ void setBase(const Value *NewBase) { Base = NewBase; }
+ void setOffset(const ConstantInt *NewOffset) { Offset = NewOffset; }
+
+ const Value *getBase() const { return Base; }
+ const ConstantInt *getOffset() const { return Offset; }
+ const APInt &getOffsetValue() const { return getOffset()->getValue(); }
+ const Value *getLength() const { return Length; };
+ ICmpInst *getCheckInst() const { return CheckInst; }
+
+ void print(raw_ostream &OS, bool PrintTypes = false) {
+ OS << "Base: ";
+ Base->printAsOperand(OS, PrintTypes);
+ OS << " Offset: ";
+ Offset->printAsOperand(OS, PrintTypes);
+ OS << " Length: ";
+ Length->printAsOperand(OS, PrintTypes);
+ }
+
+ LLVM_DUMP_METHOD void dump() {
+ print(dbgs());
+ dbgs() << "\n";
+ }
+ };
+
+ /// Parse \p CheckCond into a conjunction (logical-and) of range checks; and
+ /// append them to \p Checks. Returns true on success, may clobber \c Checks
+ /// on failure.
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) {
+ SmallPtrSet<const Value *, 8> Visited;
+ return parseRangeChecks(CheckCond, Checks, Visited);
+ }
+
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks,
+ SmallPtrSetImpl<const Value *> &Visited);
+
+ /// Combine the checks in \p Checks into a smaller set of checks and append
+ /// them into \p CombinedChecks. Return true on success (i.e. all of the checks
+ /// in \p Checks were combined into \p CombinedChecks). Clobbers \p Checks
+ /// and \p CombinedChecks on success and on failure.
+ bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks,
+ SmallVectorImpl<RangeCheck> &CombinedChecks) const;
+
+ /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
+ /// computing only one of the two expressions?
+ bool isWideningCondProfitable(Value *Cond0, Value *Cond1, bool InvertCond) {
+ Value *ResultUnused;
+ return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused,
+ InvertCond);
+ }
+
+ /// If \p InvertCondition is false, widen \p ToWiden to fail if
+ /// \p NewCondition is false, otherwise make it fail if \p NewCondition is
+ /// true (in addition to whatever it is already checking).
+ void widenGuard(Instruction *ToWiden, Value *NewCondition,
+ bool InvertCondition) {
+ Value *Result;
+
+ widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result,
+ InvertCondition);
+ if (isGuardAsWidenableBranch(ToWiden)) {
+ setWidenableBranchCond(cast<BranchInst>(ToWiden), Result);
+ return;
+ }
+ setCondition(ToWiden, Result);
+ }
+
+public:
+
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
+ LoopInfo &LI, DomTreeNode *Root,
+ std::function<bool(BasicBlock*)> BlockFilter)
+ : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter)
+ {}
+
+ /// The entry point for this pass.
+ bool run();
+};
+}
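// Illustration (a sketch with made-up values): the ConstantRange fast path of
// widenCondCommon merges two compares of the same value against constants
// into a single check, so widening
//
//   %c0 = icmp ult i64 %len, 9      ; dominating guard's condition
//   %c1 = icmp ult i64 %len, 7      ; dominated guard's condition
//
// yields one combined condition
//
//   %wide.chk = icmp ult i64 %len, 7
//
// because [0, 7) is exactly the intersection of [0, 9) and [0, 7); the
// dominated guard's own condition is then replaced by a constant in
// eliminateInstrViaWidening below.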
+
+static bool isSupportedGuardInstruction(const Instruction *Insn) {
+ if (isGuard(Insn))
+ return true;
+ if (WidenBranchGuards && isGuardAsWidenableBranch(Insn))
+ return true;
+ return false;
+}
+
+bool GuardWideningImpl::run() {
+ DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock;
+ bool Changed = false;
+ for (auto DFI = df_begin(Root), DFE = df_end(Root);
+ DFI != DFE; ++DFI) {
+ auto *BB = (*DFI)->getBlock();
+ if (!BlockFilter(BB))
+ continue;
+
+ auto &CurrentList = GuardsInBlock[BB];
+
+ for (auto &I : *BB)
+ if (isSupportedGuardInstruction(&I))
+ CurrentList.push_back(cast<Instruction>(&I));
+
+ for (auto *II : CurrentList)
+ Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock);
+ }
+
+ assert(EliminatedGuardsAndBranches.empty() || Changed);
+ for (auto *I : EliminatedGuardsAndBranches)
+ if (!WidenedGuards.count(I)) {
+ assert(isa<ConstantInt>(getCondition(I)) && "Should be!");
+ if (isSupportedGuardInstruction(I))
+ eliminateGuard(I);
+ else {
+ assert(isa<BranchInst>(I) &&
+ "Eliminated something other than guard or branch?");
+ ++CondBranchEliminated;
+ }
+ }
+
+ return Changed;
+}
+
+bool GuardWideningImpl::eliminateInstrViaWidening(
+ Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
+ GuardsInBlock, bool InvertCondition) {
+ // Ignore trivial true or false conditions. These instructions will be
+ // trivially eliminated by any cleanup pass. Do not erase them because other
+ // guards can possibly be widened into them.
+ if (isa<ConstantInt>(getCondition(Instr)))
+ return false;
+
+ Instruction *BestSoFar = nullptr;
+ auto BestScoreSoFar = WS_IllegalOrNegative;
+
+ // In the set of dominating guards, find the one we can merge GuardInst with
+ // for the most profit.
+ for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
+ auto *CurBB = DFSI.getPath(i)->getBlock();
+ if (!BlockFilter(CurBB))
+ break;
+ assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
+ const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
+
+ auto I = GuardsInCurBB.begin();
auto E = Instr->getParent() == CurBB ? find(GuardsInCurBB, Instr)
: GuardsInCurBB.end();
-
-#ifndef NDEBUG
- {
- unsigned Index = 0;
- for (auto &I : *CurBB) {
- if (Index == GuardsInCurBB.size())
- break;
- if (GuardsInCurBB[Index] == &I)
- Index++;
- }
- assert(Index == GuardsInCurBB.size() &&
- "Guards expected to be in order!");
- }
-#endif
-
- assert((i == (e - 1)) == (Instr->getParent() == CurBB) && "Bad DFS?");
-
- for (auto *Candidate : make_range(I, E)) {
- auto Score = computeWideningScore(Instr, Candidate, InvertCondition);
- LLVM_DEBUG(dbgs() << "Score between " << *getCondition(Instr)
- << " and " << *getCondition(Candidate) << " is "
- << scoreTypeToString(Score) << "\n");
- if (Score > BestScoreSoFar) {
- BestScoreSoFar = Score;
- BestSoFar = Candidate;
- }
- }
- }
-
- if (BestScoreSoFar == WS_IllegalOrNegative) {
- LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *Instr << "\n");
- return false;
- }
-
- assert(BestSoFar != Instr && "Should have never visited same guard!");
- assert(DT.dominates(BestSoFar, Instr) && "Should be!");
-
- LLVM_DEBUG(dbgs() << "Widening " << *Instr << " into " << *BestSoFar
- << " with score " << scoreTypeToString(BestScoreSoFar)
- << "\n");
- widenGuard(BestSoFar, getCondition(Instr), InvertCondition);
- auto NewGuardCondition = InvertCondition
- ? ConstantInt::getFalse(Instr->getContext())
- : ConstantInt::getTrue(Instr->getContext());
- setCondition(Instr, NewGuardCondition);
- EliminatedGuardsAndBranches.push_back(Instr);
- WidenedGuards.insert(BestSoFar);
- return true;
-}
-
-GuardWideningImpl::WideningScore
-GuardWideningImpl::computeWideningScore(Instruction *DominatedInstr,
- Instruction *DominatingGuard,
- bool InvertCond) {
- Loop *DominatedInstrLoop = LI.getLoopFor(DominatedInstr->getParent());
- Loop *DominatingGuardLoop = LI.getLoopFor(DominatingGuard->getParent());
- bool HoistingOutOfLoop = false;
-
- if (DominatingGuardLoop != DominatedInstrLoop) {
- // Be conservative and don't widen into a sibling loop. TODO: If the
- // sibling is colder, we should consider allowing this.
- if (DominatingGuardLoop &&
- !DominatingGuardLoop->contains(DominatedInstrLoop))
- return WS_IllegalOrNegative;
-
- HoistingOutOfLoop = true;
- }
-
- if (!isAvailableAt(getCondition(DominatedInstr), DominatingGuard))
- return WS_IllegalOrNegative;
-
- // If the guard was conditionally executed, it may never be reached
- // dynamically. There are two potential downsides to hoisting it out of the
- // conditionally executed region: 1) we may spuriously deopt without need and
- // 2) we have the extra cost of computing the guard condition in the common
- // case. At the moment, we really only consider the second in our heuristic
- // here. TODO: evaluate cost model for spurious deopt
- // NOTE: As written, this also lets us hoist right over another guard which
- // is essentially just another spelling for control flow.
- if (isWideningCondProfitable(getCondition(DominatedInstr),
- getCondition(DominatingGuard), InvertCond))
- return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
-
- if (HoistingOutOfLoop)
- return WS_Positive;
-
- // Returns true if we might be hoisting above explicit control flow. Note
- // that this completely ignores implicit control flow (guards, calls which
- // throw, etc...). That choice appears arbitrary.
- auto MaybeHoistingOutOfIf = [&]() {
- auto *DominatingBlock = DominatingGuard->getParent();
- auto *DominatedBlock = DominatedInstr->getParent();
- if (isGuardAsWidenableBranch(DominatingGuard))
- DominatingBlock = cast<BranchInst>(DominatingGuard)->getSuccessor(0);
-
- // Same Block?
- if (DominatedBlock == DominatingBlock)
- return false;
- // Obvious successor (common loop header/preheader case)
- if (DominatedBlock == DominatingBlock->getUniqueSuccessor())
- return false;
- // TODO: diamond, triangle cases
- if (!PDT) return true;
- return !PDT->dominates(DominatedBlock, DominatingBlock);
- };
-
- return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral;
-}
-
-bool GuardWideningImpl::isAvailableAt(
- const Value *V, const Instruction *Loc,
- SmallPtrSetImpl<const Instruction *> &Visited) const {
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
- return true;
-
- if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
- Inst->mayReadFromMemory())
- return false;
-
- Visited.insert(Inst);
-
- // We only want to go _up_ the dominance chain when recursing.
- assert(!isa<PHINode>(Loc) &&
- "PHIs should return false for isSafeToSpeculativelyExecute");
- assert(DT.isReachableFromEntry(Inst->getParent()) &&
- "We did a DFS from the block entry!");
- return all_of(Inst->operands(),
- [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
-}
-
-void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const {
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || DT.dominates(Inst, Loc))
- return;
-
- assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
- !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
-
- for (Value *Op : Inst->operands())
- makeAvailableAt(Op, Loc);
-
- Inst->moveBefore(Loc);
-}
-
-bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
- Instruction *InsertPt, Value *&Result,
- bool InvertCondition) {
- using namespace llvm::PatternMatch;
-
- {
- // L >u C0 && L >u C1 -> L >u max(C0, C1)
- ConstantInt *RHS0, *RHS1;
- Value *LHS;
- ICmpInst::Predicate Pred0, Pred1;
- if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
- match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
- if (InvertCondition)
- Pred1 = ICmpInst::getInversePredicate(Pred1);
-
- ConstantRange CR0 =
- ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue());
- ConstantRange CR1 =
- ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
-
- // SubsetIntersect is a subset of the actual mathematical intersection of
- // CR0 and CR1, while SupersetIntersect is a superset of the actual
- // mathematical intersection. If these two ConstantRanges are equal, then
- // we know we were able to represent the actual mathematical intersection
- // of CR0 and CR1, and can use the same to generate an icmp instruction.
- //
- // Given what we're doing here and the semantics of guards, it would
- // actually be correct to just use SubsetIntersect, but that may be too
- // aggressive in cases we care about.
- auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
- auto SupersetIntersect = CR0.intersectWith(CR1);
-
- APInt NewRHSAP;
- CmpInst::Predicate Pred;
- if (SubsetIntersect == SupersetIntersect &&
- SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
- if (InsertPt) {
- ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
- Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
- }
- return true;
- }
- }
- }
-
- {
- SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks;
- // TODO: Support InvertCondition case?
- if (!InvertCondition &&
- parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) &&
- combineRangeChecks(Checks, CombinedChecks)) {
- if (InsertPt) {
- Result = nullptr;
- for (auto &RC : CombinedChecks) {
- makeAvailableAt(RC.getCheckInst(), InsertPt);
- if (Result)
- Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "",
- InsertPt);
- else
- Result = RC.getCheckInst();
- }
- assert(Result && "Failed to find result value");
- Result->setName("wide.chk");
- }
- return true;
- }
- }
-
- // Base case -- just logical-and the two conditions together.
-
- if (InsertPt) {
- makeAvailableAt(Cond0, InsertPt);
- makeAvailableAt(Cond1, InsertPt);
- if (InvertCondition)
- Cond1 = BinaryOperator::CreateNot(Cond1, "inverted", InsertPt);
- Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
- }
-
- // We were not able to compute Cond0 AND Cond1 for the price of one.
- return false;
-}
-
-bool GuardWideningImpl::parseRangeChecks(
- Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
- SmallPtrSetImpl<const Value *> &Visited) {
- if (!Visited.insert(CheckCond).second)
- return true;
-
- using namespace llvm::PatternMatch;
-
- {
- Value *AndLHS, *AndRHS;
- if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS))))
- return parseRangeChecks(AndLHS, Checks) &&
- parseRangeChecks(AndRHS, Checks);
- }
-
- auto *IC = dyn_cast<ICmpInst>(CheckCond);
- if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() ||
- (IC->getPredicate() != ICmpInst::ICMP_ULT &&
- IC->getPredicate() != ICmpInst::ICMP_UGT))
- return false;
-
- const Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1);
- if (IC->getPredicate() == ICmpInst::ICMP_UGT)
- std::swap(CmpLHS, CmpRHS);
-
- auto &DL = IC->getModule()->getDataLayout();
-
- GuardWideningImpl::RangeCheck Check(
- CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())),
- CmpRHS, IC);
-
- if (!isKnownNonNegative(Check.getLength(), DL))
- return false;
-
- // What we have in \c Check now is a correct interpretation of \p CheckCond.
- // Try to see if we can move some constant offsets into the \c Offset field.
-
- bool Changed;
- auto &Ctx = CheckCond->getContext();
-
- do {
- Value *OpLHS;
- ConstantInt *OpRHS;
- Changed = false;
-
-#ifndef NDEBUG
- auto *BaseInst = dyn_cast<Instruction>(Check.getBase());
- assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) &&
- "Unreachable instruction?");
-#endif
-
- if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
- Check.setBase(OpLHS);
- APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
- Check.setOffset(ConstantInt::get(Ctx, NewOffset));
- Changed = true;
- } else if (match(Check.getBase(),
- m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
- KnownBits Known = computeKnownBits(OpLHS, DL);
- if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
- Check.setBase(OpLHS);
- APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
- Check.setOffset(ConstantInt::get(Ctx, NewOffset));
- Changed = true;
- }
- }
- } while (Changed);
-
- Checks.push_back(Check);
- return true;
-}
-
-bool GuardWideningImpl::combineRangeChecks(
- SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
- SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) const {
- unsigned OldCount = Checks.size();
- while (!Checks.empty()) {
- // Pick all of the range checks with a specific base and length, and try to
- // merge them.
- const Value *CurrentBase = Checks.front().getBase();
- const Value *CurrentLength = Checks.front().getLength();
-
- SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks;
-
- auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) {
- return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
- };
-
- copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
+
+#ifndef NDEBUG
+ {
+ unsigned Index = 0;
+ for (auto &I : *CurBB) {
+ if (Index == GuardsInCurBB.size())
+ break;
+ if (GuardsInCurBB[Index] == &I)
+ Index++;
+ }
+ assert(Index == GuardsInCurBB.size() &&
+ "Guards expected to be in order!");
+ }
+#endif
+
+ assert((i == (e - 1)) == (Instr->getParent() == CurBB) && "Bad DFS?");
+
+ for (auto *Candidate : make_range(I, E)) {
+ auto Score = computeWideningScore(Instr, Candidate, InvertCondition);
+ LLVM_DEBUG(dbgs() << "Score between " << *getCondition(Instr)
+ << " and " << *getCondition(Candidate) << " is "
+ << scoreTypeToString(Score) << "\n");
+ if (Score > BestScoreSoFar) {
+ BestScoreSoFar = Score;
+ BestSoFar = Candidate;
+ }
+ }
+ }
+
+ if (BestScoreSoFar == WS_IllegalOrNegative) {
+ LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *Instr << "\n");
+ return false;
+ }
+
+ assert(BestSoFar != Instr && "Should have never visited same guard!");
+ assert(DT.dominates(BestSoFar, Instr) && "Should be!");
+
+ LLVM_DEBUG(dbgs() << "Widening " << *Instr << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar)
+ << "\n");
+ widenGuard(BestSoFar, getCondition(Instr), InvertCondition);
+ auto NewGuardCondition = InvertCondition
+ ? ConstantInt::getFalse(Instr->getContext())
+ : ConstantInt::getTrue(Instr->getContext());
+ setCondition(Instr, NewGuardCondition);
+ EliminatedGuardsAndBranches.push_back(Instr);
+ WidenedGuards.insert(BestSoFar);
+ return true;
+}
+
+GuardWideningImpl::WideningScore
+GuardWideningImpl::computeWideningScore(Instruction *DominatedInstr,
+ Instruction *DominatingGuard,
+ bool InvertCond) {
+ Loop *DominatedInstrLoop = LI.getLoopFor(DominatedInstr->getParent());
+ Loop *DominatingGuardLoop = LI.getLoopFor(DominatingGuard->getParent());
+ bool HoistingOutOfLoop = false;
+
+ if (DominatingGuardLoop != DominatedInstrLoop) {
+ // Be conservative and don't widen into a sibling loop. TODO: If the
+ // sibling is colder, we should consider allowing this.
+ if (DominatingGuardLoop &&
+ !DominatingGuardLoop->contains(DominatedInstrLoop))
+ return WS_IllegalOrNegative;
+
+ HoistingOutOfLoop = true;
+ }
+
+ if (!isAvailableAt(getCondition(DominatedInstr), DominatingGuard))
+ return WS_IllegalOrNegative;
+
+  // If the guard was conditionally executed, it may never be reached
+ // dynamically. There are two potential downsides to hoisting it out of the
+ // conditionally executed region: 1) we may spuriously deopt without need and
+ // 2) we have the extra cost of computing the guard condition in the common
+ // case. At the moment, we really only consider the second in our heuristic
+ // here. TODO: evaluate cost model for spurious deopt
+ // NOTE: As written, this also lets us hoist right over another guard which
+ // is essentially just another spelling for control flow.
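+  // For example, if this guard checks (len u> 8) and the dominating guard
+  // checks (len u> 16), the two conditions fold into the single check
+  // (len u> 16), so hoisting adds no extra work on the common path.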
+ if (isWideningCondProfitable(getCondition(DominatedInstr),
+ getCondition(DominatingGuard), InvertCond))
+ return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
+
+ if (HoistingOutOfLoop)
+ return WS_Positive;
+
+ // Returns true if we might be hoisting above explicit control flow. Note
+ // that this completely ignores implicit control flow (guards, calls which
+ // throw, etc...). That choice appears arbitrary.
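+  // For example, a guard sitting in one arm of an if below the dominating
+  // guard does not post-dominate the dominating block, so the check below
+  // reports true and the guard is scored WS_IllegalOrNegative; a dominated
+  // block that post-dominates the dominating block scores WS_Neutral.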
+ auto MaybeHoistingOutOfIf = [&]() {
+ auto *DominatingBlock = DominatingGuard->getParent();
+ auto *DominatedBlock = DominatedInstr->getParent();
+ if (isGuardAsWidenableBranch(DominatingGuard))
+ DominatingBlock = cast<BranchInst>(DominatingGuard)->getSuccessor(0);
+
+ // Same Block?
+ if (DominatedBlock == DominatingBlock)
+ return false;
+ // Obvious successor (common loop header/preheader case)
+ if (DominatedBlock == DominatingBlock->getUniqueSuccessor())
+ return false;
+ // TODO: diamond, triangle cases
+ if (!PDT) return true;
+ return !PDT->dominates(DominatedBlock, DominatingBlock);
+ };
+
+ return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral;
+}
+
+bool GuardWideningImpl::isAvailableAt(
+ const Value *V, const Instruction *Loc,
+ SmallPtrSetImpl<const Instruction *> &Visited) const {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
+ return true;
+
+ if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
+ Inst->mayReadFromMemory())
+ return false;
+
+ Visited.insert(Inst);
+
+ // We only want to go _up_ the dominance chain when recursing.
+ assert(!isa<PHINode>(Loc) &&
+ "PHIs should return false for isSafeToSpeculativelyExecute");
+ assert(DT.isReachableFromEntry(Inst->getParent()) &&
+ "We did a DFS from the block entry!");
+ return all_of(Inst->operands(),
+ [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
+}
+
+void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc))
+ return;
+
+ assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
+ !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
+
+ for (Value *Op : Inst->operands())
+ makeAvailableAt(Op, Loc);
+
+ Inst->moveBefore(Loc);
+}
+
+bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
+ Instruction *InsertPt, Value *&Result,
+ bool InvertCondition) {
+ using namespace llvm::PatternMatch;
+
+ {
+ // L >u C0 && L >u C1 -> L >u max(C0, C1)
+ ConstantInt *RHS0, *RHS1;
+ Value *LHS;
+ ICmpInst::Predicate Pred0, Pred1;
+ if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
+ match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
+ if (InvertCondition)
+ Pred1 = ICmpInst::getInversePredicate(Pred1);
+
+ ConstantRange CR0 =
+ ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue());
+ ConstantRange CR1 =
+ ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
+
+ // SubsetIntersect is a subset of the actual mathematical intersection of
+ // CR0 and CR1, while SupersetIntersect is a superset of the actual
+ // mathematical intersection. If these two ConstantRanges are equal, then
+ // we know we were able to represent the actual mathematical intersection
+ // of CR0 and CR1, and can use the same to generate an icmp instruction.
+ //
+ // Given what we're doing here and the semantics of guards, it would
+ // actually be correct to just use SubsetIntersect, but that may be too
+ // aggressive in cases we care about.
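+      // For example, for CR0 = [0, 14) (i.e. x u< 14) and CR1 = [0, 10), both
+      // intersections are [0, 10), so the two checks collapse into the single
+      // icmp x u< 10.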
+ auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
+ auto SupersetIntersect = CR0.intersectWith(CR1);
+
+ APInt NewRHSAP;
+ CmpInst::Predicate Pred;
+ if (SubsetIntersect == SupersetIntersect &&
+ SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
+ if (InsertPt) {
+ ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
+ Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
+ }
+ return true;
+ }
+ }
+ }
+
+ {
+ SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks;
+ // TODO: Support InvertCondition case?
+ if (!InvertCondition &&
+ parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) &&
+ combineRangeChecks(Checks, CombinedChecks)) {
+ if (InsertPt) {
+ Result = nullptr;
+ for (auto &RC : CombinedChecks) {
+ makeAvailableAt(RC.getCheckInst(), InsertPt);
+ if (Result)
+ Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "",
+ InsertPt);
+ else
+ Result = RC.getCheckInst();
+ }
+ assert(Result && "Failed to find result value");
+ Result->setName("wide.chk");
+ }
+ return true;
+ }
+ }
+
+ // Base case -- just logical-and the two conditions together.
+
+ if (InsertPt) {
+ makeAvailableAt(Cond0, InsertPt);
+ makeAvailableAt(Cond1, InsertPt);
+ if (InvertCondition)
+ Cond1 = BinaryOperator::CreateNot(Cond1, "inverted", InsertPt);
+ Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
+ }
+
+ // We were not able to compute Cond0 AND Cond1 for the price of one.
+ return false;
+}
+
+bool GuardWideningImpl::parseRangeChecks(
+ Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ if (!Visited.insert(CheckCond).second)
+ return true;
+
+ using namespace llvm::PatternMatch;
+
+ {
+ Value *AndLHS, *AndRHS;
+ if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS))))
+ return parseRangeChecks(AndLHS, Checks) &&
+ parseRangeChecks(AndRHS, Checks);
+ }
+
+ auto *IC = dyn_cast<ICmpInst>(CheckCond);
+ if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() ||
+ (IC->getPredicate() != ICmpInst::ICMP_ULT &&
+ IC->getPredicate() != ICmpInst::ICMP_UGT))
+ return false;
+
+ const Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1);
+ if (IC->getPredicate() == ICmpInst::ICMP_UGT)
+ std::swap(CmpLHS, CmpRHS);
+
+ auto &DL = IC->getModule()->getDataLayout();
+
+ GuardWideningImpl::RangeCheck Check(
+ CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())),
+ CmpRHS, IC);
+
+ if (!isKnownNonNegative(Check.getLength(), DL))
+ return false;
+
+ // What we have in \c Check now is a correct interpretation of \p CheckCond.
+ // Try to see if we can move some constant offsets into the \c Offset field.
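+  // For example, a check of the form (i + 4) u< len starts out as base = i+4,
+  // offset = 0, length = len, and the loop below rewrites it to base = i,
+  // offset = 4, length = len.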
+
+ bool Changed;
+ auto &Ctx = CheckCond->getContext();
+
+ do {
+ Value *OpLHS;
+ ConstantInt *OpRHS;
+ Changed = false;
+
+#ifndef NDEBUG
+ auto *BaseInst = dyn_cast<Instruction>(Check.getBase());
+ assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) &&
+ "Unreachable instruction?");
+#endif
+
+ if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ } else if (match(Check.getBase(),
+ m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ KnownBits Known = computeKnownBits(OpLHS, DL);
+ if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ }
+ }
+ } while (Changed);
+
+ Checks.push_back(Check);
+ return true;
+}
+
+bool GuardWideningImpl::combineRangeChecks(
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) const {
+ unsigned OldCount = Checks.size();
+ while (!Checks.empty()) {
+ // Pick all of the range checks with a specific base and length, and try to
+ // merge them.
+ const Value *CurrentBase = Checks.front().getBase();
+ const Value *CurrentLength = Checks.front().getLength();
+
+ SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks;
+
+ auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) {
+ return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
+ };
+
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
erase_if(Checks, IsCurrentCheck);
-
- assert(CurrentChecks.size() != 0 && "We know we have at least one!");
-
- if (CurrentChecks.size() < 3) {
+
+ assert(CurrentChecks.size() != 0 && "We know we have at least one!");
+
+ if (CurrentChecks.size() < 3) {
llvm::append_range(RangeChecksOut, CurrentChecks);
- continue;
- }
-
- // CurrentChecks.size() will typically be 3 here, but so far there has been
- // no need to hard-code that fact.
-
- llvm::sort(CurrentChecks, [&](const GuardWideningImpl::RangeCheck &LHS,
- const GuardWideningImpl::RangeCheck &RHS) {
- return LHS.getOffsetValue().slt(RHS.getOffsetValue());
- });
-
- // Note: std::sort should not invalidate the ChecksStart iterator.
-
- const ConstantInt *MinOffset = CurrentChecks.front().getOffset();
- const ConstantInt *MaxOffset = CurrentChecks.back().getOffset();
-
- unsigned BitWidth = MaxOffset->getValue().getBitWidth();
- if ((MaxOffset->getValue() - MinOffset->getValue())
- .ugt(APInt::getSignedMinValue(BitWidth)))
- return false;
-
- APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue();
- const APInt &HighOffset = MaxOffset->getValue();
- auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) {
- return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
- };
-
+ continue;
+ }
+
+ // CurrentChecks.size() will typically be 3 here, but so far there has been
+ // no need to hard-code that fact.
+
+ llvm::sort(CurrentChecks, [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
+ return LHS.getOffsetValue().slt(RHS.getOffsetValue());
+ });
+
+ // Note: std::sort should not invalidate the ChecksStart iterator.
+
+ const ConstantInt *MinOffset = CurrentChecks.front().getOffset();
+ const ConstantInt *MaxOffset = CurrentChecks.back().getOffset();
+
+ unsigned BitWidth = MaxOffset->getValue().getBitWidth();
+ if ((MaxOffset->getValue() - MinOffset->getValue())
+ .ugt(APInt::getSignedMinValue(BitWidth)))
+ return false;
+
+ APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue();
+ const APInt &HighOffset = MaxOffset->getValue();
+ auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) {
+ return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
+ };
+
if (MaxDiff.isMinValue() || !all_of(drop_begin(CurrentChecks), OffsetOK))
- return false;
-
- // We have a series of f+1 checks as:
- //
- // I+k_0 u< L ... Chk_0
- // I+k_1 u< L ... Chk_1
- // ...
- // I+k_f u< L ... Chk_f
- //
- // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
- // k_f-k_0 u< INT_MIN+k_f ... Precond_1
- // k_f != k_0 ... Precond_2
- //
- // Claim:
- // Chk_0 AND Chk_f implies all the other checks
- //
- // Informal proof sketch:
- //
- // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap
- // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and
- // thus I+k_f is the greatest unsigned value in that range.
- //
-    // This combined with Chk_f shows that everything in that range is u< L.
-    // Via Precond_0 we know that all of the indices in Chk_0 through Chk_f
-    // lie in [I+k_0,I+k_f], thus proving our claim.
- //
- // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are
- // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal
- // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping
- // range by definition, and the latter case is impossible:
- //
- // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1)
- // xxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
- //
-    // For Chk_0 to succeed, k_f-k_0 (the range highlighted with 'x' above)
-    // would have to be >u INT_MIN.
-
- RangeChecksOut.emplace_back(CurrentChecks.front());
- RangeChecksOut.emplace_back(CurrentChecks.back());
- }
-
- assert(RangeChecksOut.size() <= OldCount && "We pessimized!");
- return RangeChecksOut.size() != OldCount;
-}
-
-#ifndef NDEBUG
-StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
- switch (WS) {
- case WS_IllegalOrNegative:
- return "IllegalOrNegative";
- case WS_Neutral:
- return "Neutral";
- case WS_Positive:
- return "Positive";
- case WS_VeryPositive:
- return "VeryPositive";
- }
-
- llvm_unreachable("Fully covered switch above!");
-}
-#endif
-
-PreservedAnalyses GuardWideningPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
- [](BasicBlock*) { return true; } ).run())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- BasicBlock *RootBB = L.getLoopPredecessor();
- if (!RootBB)
- RootBB = L.getHeader();
- auto BlockFilter = [&](BasicBlock *BB) {
- return BB == RootBB || L.contains(BB);
- };
- if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB),
- BlockFilter).run())
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
-
-namespace {
-struct GuardWideningLegacyPass : public FunctionPass {
- static char ID;
-
- GuardWideningLegacyPass() : FunctionPass(ID) {
- initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
- [](BasicBlock*) { return true; } ).run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- }
-};
-
-/// Same as above, but restricted to a single loop at a time. Can be
-/// scheduled with other loop passes w/o breaking out of LPM
-struct LoopGuardWideningLegacyPass : public LoopPass {
- static char ID;
-
- LoopGuardWideningLegacyPass() : LoopPass(ID) {
- initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
- BasicBlock *RootBB = L->getLoopPredecessor();
- if (!RootBB)
- RootBB = L->getHeader();
- auto BlockFilter = [&](BasicBlock *BB) {
- return BB == RootBB || L->contains(BB);
- };
- return GuardWideningImpl(DT, PDT, LI,
- DT.getNode(RootBB), BlockFilter).run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- getLoopAnalysisUsage(AU);
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
-};
-}
-
-char GuardWideningLegacyPass::ID = 0;
-char LoopGuardWideningLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
- false, false)
-
-INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening",
- "Widen guards (within a single loop, as a loop pass)",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening",
- "Widen guards (within a single loop, as a loop pass)",
- false, false)
-
-FunctionPass *llvm::createGuardWideningPass() {
- return new GuardWideningLegacyPass();
-}
-
-Pass *llvm::createLoopGuardWideningPass() {
- return new LoopGuardWideningLegacyPass();
-}
+ return false;
+
+ // We have a series of f+1 checks as:
+ //
+ // I+k_0 u< L ... Chk_0
+ // I+k_1 u< L ... Chk_1
+ // ...
+ // I+k_f u< L ... Chk_f
+ //
+ // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
+ // k_f-k_0 u< INT_MIN+k_f ... Precond_1
+ // k_f != k_0 ... Precond_2
+ //
+ // Claim:
+ // Chk_0 AND Chk_f implies all the other checks
+ //
+ // Informal proof sketch:
+ //
+ // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap
+ // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and
+ // thus I+k_f is the greatest unsigned value in that range.
+ //
+    // This combined with Chk_f shows that everything in that range is u< L.
+    // Via Precond_0 we know that all of the indices in Chk_0 through Chk_f
+    // lie in [I+k_0,I+k_f], thus proving our claim.
+ //
+ // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are
+ // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal
+ // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping
+ // range by definition, and the latter case is impossible:
+ //
+ // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1)
+ // xxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ //
+    // For Chk_0 to succeed, k_f-k_0 (the range highlighted with 'x' above)
+    // would have to be >u INT_MIN.
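+    // For example, the three checks {I u< L, I+2 u< L, I+5 u< L} (k_0 = 0,
+    // k_f = 5) pass the conditions checked above, so only the first and the
+    // last check are emitted below.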
+
+ RangeChecksOut.emplace_back(CurrentChecks.front());
+ RangeChecksOut.emplace_back(CurrentChecks.back());
+ }
+
+ assert(RangeChecksOut.size() <= OldCount && "We pessimized!");
+ return RangeChecksOut.size() != OldCount;
+}
+
+#ifndef NDEBUG
+StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
+ switch (WS) {
+ case WS_IllegalOrNegative:
+ return "IllegalOrNegative";
+ case WS_Neutral:
+ return "Neutral";
+ case WS_Positive:
+ return "Positive";
+ case WS_VeryPositive:
+ return "VeryPositive";
+ }
+
+ llvm_unreachable("Fully covered switch above!");
+}
+#endif
+
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ BasicBlock *RootBB = L.getLoopPredecessor();
+ if (!RootBB)
+ RootBB = L.getHeader();
+ auto BlockFilter = [&](BasicBlock *BB) {
+ return BB == RootBB || L.contains(BB);
+ };
+ if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB),
+ BlockFilter).run())
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+/// Same as above, but restricted to a single loop at a time. Can be
+/// scheduled with other loop passes w/o breaking out of LPM
+struct LoopGuardWideningLegacyPass : public LoopPass {
+ static char ID;
+
+ LoopGuardWideningLegacyPass() : LoopPass(ID) {
+ initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ BasicBlock *RootBB = L->getLoopPredecessor();
+ if (!RootBB)
+ RootBB = L->getHeader();
+ auto BlockFilter = [&](BasicBlock *BB) {
+ return BB == RootBB || L->contains(BB);
+ };
+ return GuardWideningImpl(DT, PDT, LI,
+ DT.getNode(RootBB), BlockFilter).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+};
+}
+
+char GuardWideningLegacyPass::ID = 0;
+char LoopGuardWideningLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+
+INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+
+FunctionPass *llvm::createGuardWideningPass() {
+ return new GuardWideningLegacyPass();
+}
+
+Pass *llvm::createLoopGuardWideningPass() {
+ return new LoopGuardWideningLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp
index 36deb00b5a..e2022aba97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp
@@ -1,21 +1,21 @@
-//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
-#include "llvm/Analysis/IVUsers.h"
-#include "llvm/Support/Debug.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "iv-users"
-
-PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
- return PreservedAnalyses::all();
-}
+//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "iv-users"
+
+PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp
index bba2f76e77..ae1fff0fa8 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1,641 +1,641 @@
-//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transformation analyzes and transforms the induction variables (and
-// computations derived from them) into simpler forms suitable for subsequent
-// analysis and transformation.
-//
-// If the trip count of a loop is computable, this pass also makes the following
-// changes:
-// 1. The exit condition for the loop is canonicalized to compare the
-// induction value against the exit value. This turns loops like:
-// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
-// 2. Any use outside of the loop of an expression derived from the indvar
-// is changed to compute the derived value outside of the loop, eliminating
-// the dependence on the exit value of the induction variable. If the only
-// purpose of the loop is to compute the exit value of some derived
-// expression, this transformation will make the loop dead.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/IndVarSimplify.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "indvars"
-
-STATISTIC(NumWidened , "Number of indvars widened");
-STATISTIC(NumReplaced , "Number of exit values replaced");
-STATISTIC(NumLFTR , "Number of loop exit tests replaced");
-STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
-STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
-
-// Trip count verification can be enabled by default under NDEBUG if we
-// implement a strong expression equivalence checker in SCEV. Until then, we
-// use the verify-indvars flag, which may assert in some cases.
-static cl::opt<bool> VerifyIndvars(
- "verify-indvars", cl::Hidden,
- cl::desc("Verify the ScalarEvolution result after running indvars. Has no "
- "effect in release builds. (Note: this adds additional SCEV "
- "queries potentially changing the analysis result)"));
-
-static cl::opt<ReplaceExitVal> ReplaceExitValue(
- "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
- cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
- cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
- clEnumValN(OnlyCheapRepl, "cheap",
- "only replace exit value when the cost is cheap"),
- clEnumValN(NoHardUse, "noharduse",
- "only replace exit values when loop def likely dead"),
- clEnumValN(AlwaysRepl, "always",
- "always replace exit value whenever possible")));
-
-static cl::opt<bool> UsePostIncrementRanges(
- "indvars-post-increment-ranges", cl::Hidden,
- cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
- cl::init(true));
-
-static cl::opt<bool>
-DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
- cl::desc("Disable Linear Function Test Replace optimization"));
-
-static cl::opt<bool>
-LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
- cl::desc("Predicate conditions in read only loops"));
-
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+// 1. The exit condition for the loop is canonicalized to compare the
+// induction value against the exit value. This turns loops like:
+// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+// 2. Any use outside of the loop of an expression derived from the indvar
+// is changed to compute the derived value outside of the loop, eliminating
+// the dependence on the exit value of the induction variable. If the only
+// purpose of the loop is to compute the exit value of some derived
+// expression, this transformation will make the loop dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumWidened , "Number of indvars widened");
+STATISTIC(NumReplaced , "Number of exit values replaced");
+STATISTIC(NumLFTR , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
+
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars(
+ "verify-indvars", cl::Hidden,
+ cl::desc("Verify the ScalarEvolution result after running indvars. Has no "
+ "effect in release builds. (Note: this adds additional SCEV "
+ "queries potentially changing the analysis result)"));
+
+static cl::opt<ReplaceExitVal> ReplaceExitValue(
+ "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
+ cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
+ cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
+ clEnumValN(OnlyCheapRepl, "cheap",
+ "only replace exit value when the cost is cheap"),
+ clEnumValN(NoHardUse, "noharduse",
+ "only replace exit values when loop def likely dead"),
+ clEnumValN(AlwaysRepl, "always",
+ "always replace exit value whenever possible")));
+
+static cl::opt<bool> UsePostIncrementRanges(
+ "indvars-post-increment-ranges", cl::Hidden,
+ cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
+ cl::init(true));
+
+static cl::opt<bool>
+DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
+ cl::desc("Disable Linear Function Test Replace optimization"));
+
+static cl::opt<bool>
+LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
+ cl::desc("Predicate conditions in read only loops"));
+
static cl::opt<bool>
AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
cl::desc("Allow widening of indvars to eliminate s/zext"));
-namespace {
-
-struct RewritePhi;
-
-class IndVarSimplify {
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- const DataLayout &DL;
- TargetLibraryInfo *TLI;
- const TargetTransformInfo *TTI;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
-
- SmallVector<WeakTrackingVH, 16> DeadInsts;
+namespace {
+
+struct RewritePhi;
+
+class IndVarSimplify {
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const DataLayout &DL;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
bool WidenIndVars;
-
- bool handleFloatingPointIV(Loop *L, PHINode *PH);
- bool rewriteNonIntegerIVs(Loop *L);
-
- bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
- /// Try to eliminate loop exits based on analyzeable exit counts
- bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
- /// Try to form loop invariant tests for loop exits by changing how many
- /// iterations of the loop run when that is unobservable.
- bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
-
- bool rewriteFirstIterationLoopExitValues(Loop *L);
-
- bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
- const SCEV *ExitCount,
- PHINode *IndVar, SCEVExpander &Rewriter);
-
- bool sinkUnusedInvariants(Loop *L);
-
-public:
- IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- const DataLayout &DL, TargetLibraryInfo *TLI,
+
+ bool handleFloatingPointIV(Loop *L, PHINode *PH);
+ bool rewriteNonIntegerIVs(Loop *L);
+
+ bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
+ /// Try to eliminate loop exits based on analyzeable exit counts
+ bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
+ /// Try to form loop invariant tests for loop exits by changing how many
+ /// iterations of the loop run when that is unobservable.
+ bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
+
+ bool rewriteFirstIterationLoopExitValues(Loop *L);
+
+ bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
+ const SCEV *ExitCount,
+ PHINode *IndVar, SCEVExpander &Rewriter);
+
+ bool sinkUnusedInvariants(Loop *L);
+
+public:
+ IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ const DataLayout &DL, TargetLibraryInfo *TLI,
TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars)
: LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI),
WidenIndVars(WidenIndVars) {
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
-
- bool run(Loop *L);
-};
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
-//===----------------------------------------------------------------------===//
-
-/// Convert APF to an integer, if possible.
-static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
- bool isExact = false;
- // See if we can convert this to an int64_t
- uint64_t UIntVal;
- if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
- APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
- !isExact)
- return false;
- IntVal = UIntVal;
- return true;
-}
-
-/// If the loop has a floating-point induction variable then insert a
-/// corresponding integer induction variable if possible.
-/// For example,
-/// for(double i = 0; i < 10000; ++i)
-/// bar(i)
-/// is converted into
-/// for(int i = 0; i < 10000; ++i)
-/// bar((double)i);
-bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
- unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
- unsigned BackEdge = IncomingEdge^1;
-
- // Check incoming value.
- auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
-
- int64_t InitValue;
- if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
- return false;
-
-  // Check the IV increment. Reject this PN if the increment operation is not
-  // an add or the increment value cannot be represented by an integer.
- auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
- if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false;
-
-  // If this is not an add of the PHI with a ConstantFP, or if the constant FP
-  // value is not an integer, bail out.
- ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
- int64_t IncValue;
- if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
- !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
- return false;
-
- // Check Incr uses. One user is PN and the other user is an exit condition
- // used by the conditional terminator.
- Value::user_iterator IncrUse = Incr->user_begin();
- Instruction *U1 = cast<Instruction>(*IncrUse++);
- if (IncrUse == Incr->user_end()) return false;
- Instruction *U2 = cast<Instruction>(*IncrUse++);
- if (IncrUse != Incr->user_end()) return false;
-
- // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't
- // only used by a branch, we can't transform it.
- FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
- if (!Compare)
- Compare = dyn_cast<FCmpInst>(U2);
- if (!Compare || !Compare->hasOneUse() ||
- !isa<BranchInst>(Compare->user_back()))
- return false;
-
- BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
-
- // We need to verify that the branch actually controls the iteration count
- // of the loop. If not, the new IV can overflow and no one will notice.
- // The branch block must be in the loop and one of the successors must be out
- // of the loop.
- assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
- if (!L->contains(TheBr->getParent()) ||
- (L->contains(TheBr->getSuccessor(0)) &&
- L->contains(TheBr->getSuccessor(1))))
- return false;
-
- // If it isn't a comparison with an integer-as-fp (the exit value), we can't
- // transform it.
- ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
- int64_t ExitValue;
- if (ExitValueVal == nullptr ||
- !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
- return false;
-
- // Find new predicate for integer comparison.
- CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
- switch (Compare->getPredicate()) {
- default: return false; // Unknown comparison.
- case CmpInst::FCMP_OEQ:
- case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
- case CmpInst::FCMP_ONE:
- case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
- case CmpInst::FCMP_OGT:
- case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
- case CmpInst::FCMP_OGE:
- case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
- case CmpInst::FCMP_OLT:
- case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
- case CmpInst::FCMP_OLE:
- case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
- }
-
- // We convert the floating point induction variable to a signed i32 value if
- // we can. This is only safe if the comparison will not overflow in a way
- // that won't be trapped by the integer equivalent operations. Check for this
- // now.
- // TODO: We could use i64 if it is native and the range requires it.
-
- // The start/stride/exit values must all fit in signed i32.
- if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
- return false;
-
- // If not actually striding (add x, 0.0), avoid touching the code.
- if (IncValue == 0)
- return false;
-
- // Positive and negative strides have different safety conditions.
- if (IncValue > 0) {
- // If we have a positive stride, we require the init to be less than the
- // exit value.
- if (InitValue >= ExitValue)
- return false;
-
- uint32_t Range = uint32_t(ExitValue-InitValue);
- // Check for infinite loop, either:
- // while (i <= Exit) or until (i > Exit)
- if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
- if (++Range == 0) return false; // Range overflows.
- }
-
- unsigned Leftover = Range % uint32_t(IncValue);
-
- // If this is an equality comparison, we require that the strided value
- // exactly land on the exit value, otherwise the IV condition will wrap
- // around and do things the fp IV wouldn't.
- if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
- Leftover != 0)
- return false;
-
- // If the stride would wrap around the i32 before exiting, we can't
- // transform the IV.
- if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
- return false;
- } else {
- // If we have a negative stride, we require the init to be greater than the
- // exit value.
- if (InitValue <= ExitValue)
- return false;
-
- uint32_t Range = uint32_t(InitValue-ExitValue);
- // Check for infinite loop, either:
- // while (i >= Exit) or until (i < Exit)
- if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
- if (++Range == 0) return false; // Range overflows.
- }
-
- unsigned Leftover = Range % uint32_t(-IncValue);
-
- // If this is an equality comparison, we require that the strided value
- // exactly land on the exit value, otherwise the IV condition will wrap
- // around and do things the fp IV wouldn't.
- if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
- Leftover != 0)
- return false;
-
- // If the stride would wrap around the i32 before exiting, we can't
- // transform the IV.
- if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
- return false;
- }
-
- IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
-
- // Insert new integer induction variable.
- PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
- NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
- PN->getIncomingBlock(IncomingEdge));
-
- Value *NewAdd =
- BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
- Incr->getName()+".int", Incr);
- NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
-
- ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
- ConstantInt::get(Int32Ty, ExitValue),
- Compare->getName());
-
- // In the following deletions, PN may become dead and may be deleted.
- // Use a WeakTrackingVH to observe whether this happens.
- WeakTrackingVH WeakPH = PN;
-
- // Delete the old floating point exit comparison. The branch starts using the
- // new comparison.
- NewCompare->takeName(Compare);
- Compare->replaceAllUsesWith(NewCompare);
- RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get());
-
- // Delete the old floating point increment.
- Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
- RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get());
-
- // If the FP induction variable still has uses, this is because something else
- // in the loop uses its value. In order to canonicalize the induction
- // variable, we chose to eliminate the IV and rewrite it in terms of an
- // int->fp cast.
- //
- // We give preference to sitofp over uitofp because it is faster on most
- // platforms.
- if (WeakPH) {
- Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
- &*PN->getParent()->getFirstInsertionPt());
- PN->replaceAllUsesWith(Conv);
- RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
- }
- return true;
-}
-
-bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
- // First step. Check to see if there are any floating-point recurrences.
- // If there are, change them into integer recurrences, permitting analysis by
- // the SCEV routines.
- BasicBlock *Header = L->getHeader();
-
- SmallVector<WeakTrackingVH, 8> PHIs;
- for (PHINode &PN : Header->phis())
- PHIs.push_back(&PN);
-
- bool Changed = false;
- for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
- if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
- Changed |= handleFloatingPointIV(L, PN);
-
- // If the loop previously had floating-point IV, ScalarEvolution
- // may not have been able to compute a trip count. Now that we've done some
- // re-writing, the trip count may be computable.
- if (Changed)
- SE->forgetLoop(L);
- return Changed;
-}
-
-//===---------------------------------------------------------------------===//
-// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
-// they will exit at the first iteration.
-//===---------------------------------------------------------------------===//
-
-/// Check to see if this loop has loop invariant conditions which lead to loop
-/// exits. If so, we know that if the exit path is taken, it is at the first
-/// loop iteration. This lets us predict exit values of PHI nodes that live in
-/// loop header.
-bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
- // Verify the input to the pass is already in LCSSA form.
- assert(L->isLCSSAForm(*DT));
-
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- bool MadeAnyChanges = false;
- for (auto *ExitBB : ExitBlocks) {
- // If there are no more PHI nodes in this exit block, then no more
- // values defined inside the loop are used on this path.
- for (PHINode &PN : ExitBB->phis()) {
- for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
- IncomingValIdx != E; ++IncomingValIdx) {
- auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);
-
- // Can we prove that the exit must run on the first iteration if it
- // runs at all? (i.e. early exits are fine for our purposes, but
- // traces which lead to this exit being taken on the 2nd iteration
- // aren't.) Note that this is about whether the exit branch is
- // executed, not about whether it is taken.
- if (!L->getLoopLatch() ||
- !DT->dominates(IncomingBB, L->getLoopLatch()))
- continue;
-
- // Get condition that leads to the exit path.
- auto *TermInst = IncomingBB->getTerminator();
-
- Value *Cond = nullptr;
- if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
- // Must be a conditional branch, otherwise the block
- // should not be in the loop.
- Cond = BI->getCondition();
- } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
- Cond = SI->getCondition();
- else
- continue;
-
- if (!L->isLoopInvariant(Cond))
- continue;
-
- auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));
-
- // Only deal with PHIs in the loop header.
- if (!ExitVal || ExitVal->getParent() != L->getHeader())
- continue;
-
- // If ExitVal is a PHI on the loop header, then we know its
- // value along this exit because the exit can only be taken
- // on the first iteration.
- auto *LoopPreheader = L->getLoopPreheader();
- assert(LoopPreheader && "Invalid loop");
- int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
- if (PreheaderIdx != -1) {
- assert(ExitVal->getParent() == L->getHeader() &&
- "ExitVal must be in loop header");
- MadeAnyChanges = true;
- PN.setIncomingValue(IncomingValIdx,
- ExitVal->getIncomingValue(PreheaderIdx));
- }
- }
- }
- }
- return MadeAnyChanges;
-}
-
-//===----------------------------------------------------------------------===//
-// IV Widening - Extend the width of an IV to cover its widest uses.
-//===----------------------------------------------------------------------===//
-
-/// Update information about the induction variable that is extended by this
-/// sign or zero extend operation. This is used to determine the final width of
-/// the IV before actually widening it.
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+
+ bool run(Loop *L);
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
+//===----------------------------------------------------------------------===//
+
+/// Convert APF to an integer, if possible.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+ bool isExact = false;
+ // See if we can convert this to an int64_t
+ uint64_t UIntVal;
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
+ return false;
+ IntVal = UIntVal;
+ return true;
+}
+
+/// If the loop has a floating-point induction variable then insert a
+/// corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+/// bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+/// bar((double)i);
+bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
+ unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+ unsigned BackEdge = IncomingEdge^1;
+
+ // Check incoming value.
+ auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+ int64_t InitValue;
+ if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+ return false;
+
+  // Check the IV increment. Reject this PN if the increment operation is not
+  // an add or the increment value cannot be represented by an integer.
+ auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+ if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false;
+
+  // If this is not an add of the PHI with a ConstantFP, or if the constant FP
+  // value is not an integer, bail out.
+ ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
+ int64_t IncValue;
+ if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
+ !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
+ return false;
+
+ // Check Incr uses. One user is PN and the other user is an exit condition
+ // used by the conditional terminator.
+ Value::user_iterator IncrUse = Incr->user_begin();
+ Instruction *U1 = cast<Instruction>(*IncrUse++);
+ if (IncrUse == Incr->user_end()) return false;
+ Instruction *U2 = cast<Instruction>(*IncrUse++);
+ if (IncrUse != Incr->user_end()) return false;
+
+ // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't
+ // only used by a branch, we can't transform it.
+ FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
+ if (!Compare)
+ Compare = dyn_cast<FCmpInst>(U2);
+ if (!Compare || !Compare->hasOneUse() ||
+ !isa<BranchInst>(Compare->user_back()))
+ return false;
+
+ BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
+
+ // We need to verify that the branch actually controls the iteration count
+ // of the loop. If not, the new IV can overflow and no one will notice.
+ // The branch block must be in the loop and one of the successors must be out
+ // of the loop.
+ assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
+ if (!L->contains(TheBr->getParent()) ||
+ (L->contains(TheBr->getSuccessor(0)) &&
+ L->contains(TheBr->getSuccessor(1))))
+ return false;
+
+ // If it isn't a comparison with an integer-as-fp (the exit value), we can't
+ // transform it.
+ ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
+ int64_t ExitValue;
+ if (ExitValueVal == nullptr ||
+ !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+ return false;
+
+ // Find new predicate for integer comparison.
+ CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+ switch (Compare->getPredicate()) {
+ default: return false; // Unknown comparison.
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
+ }
+
+ // We convert the floating point induction variable to a signed i32 value if
+ // we can. This is only safe if the comparison will not overflow in a way
+ // that won't be trapped by the integer equivalent operations. Check for this
+ // now.
+ // TODO: We could use i64 if it is native and the range requires it.
+
+ // The start/stride/exit values must all fit in signed i32.
+ if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
+ return false;
+
+ // If not actually striding (add x, 0.0), avoid touching the code.
+ if (IncValue == 0)
+ return false;
+
+ // Positive and negative strides have different safety conditions.
+ if (IncValue > 0) {
+ // If we have a positive stride, we require the init to be less than the
+ // exit value.
+ if (InitValue >= ExitValue)
+ return false;
+
+ uint32_t Range = uint32_t(ExitValue-InitValue);
+ // Check for infinite loop, either:
+ // while (i <= Exit) or until (i > Exit)
+ if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
+ if (++Range == 0) return false; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return false;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
+ return false;
+ } else {
+ // If we have a negative stride, we require the init to be greater than the
+ // exit value.
+ if (InitValue <= ExitValue)
+ return false;
+
+ uint32_t Range = uint32_t(InitValue-ExitValue);
+ // Check for infinite loop, either:
+ // while (i >= Exit) or until (i < Exit)
+ if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
+ if (++Range == 0) return false; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(-IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return false;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
+ return false;
+ }
+
+ IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+
+ // Insert new integer induction variable.
+ PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
+ NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+ PN->getIncomingBlock(IncomingEdge));
+
+ Value *NewAdd =
+ BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
+ Incr->getName()+".int", Incr);
+ NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
+
+ ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
+ ConstantInt::get(Int32Ty, ExitValue),
+ Compare->getName());
+
+ // In the following deletions, PN may become dead and may be deleted.
+ // Use a WeakTrackingVH to observe whether this happens.
+ WeakTrackingVH WeakPH = PN;
+
+ // Delete the old floating point exit comparison. The branch starts using the
+ // new comparison.
+ NewCompare->takeName(Compare);
+ Compare->replaceAllUsesWith(NewCompare);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get());
+
+ // Delete the old floating point increment.
+ Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get());
+
+ // If the FP induction variable still has uses, this is because something else
+ // in the loop uses its value. In order to canonicalize the induction
+ // variable, we chose to eliminate the IV and rewrite it in terms of an
+ // int->fp cast.
+ //
+ // We give preference to sitofp over uitofp because it is faster on most
+ // platforms.
+ if (WeakPH) {
+ Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+ &*PN->getParent()->getFirstInsertionPt());
+ PN->replaceAllUsesWith(Conv);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
+ }
+ return true;
+}
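+
+// Illustrative sketch only (hypothetical helper, plain C++, kept out of the
+// build): the positive-stride legality check above for an EQ/NE exit test.
+// The strided integer IV must land exactly on the exit value, otherwise it
+// would step past it and wrap where the FP IV would have exited.
+#if 0
+#include <cstdint>
+static bool positiveStrideExitsExactly(int64_t Init, int64_t Inc,
+                                       int64_t Exit) {
+  if (Inc <= 0 || Init >= Exit)
+    return false;                        // not the positive-stride case
+  uint32_t Range = uint32_t(Exit - Init);
+  return Range % uint32_t(Inc) == 0;     // must hit Exit exactly
+}
+// e.g. (0, 3, 9)  -> true  (0, 3, 6, 9 reaches the exit value)
+//      (0, 3, 10) -> false (..., 9, 12 steps past 10 and would wrap)
+#endif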
+
+bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
+ // First step. Check to see if there are any floating-point recurrences.
+ // If there are, change them into integer recurrences, permitting analysis by
+ // the SCEV routines.
+ BasicBlock *Header = L->getHeader();
+
+ SmallVector<WeakTrackingVH, 8> PHIs;
+ for (PHINode &PN : Header->phis())
+ PHIs.push_back(&PN);
+
+ bool Changed = false;
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
+ Changed |= handleFloatingPointIV(L, PN);
+
+  // If the loop previously had a floating-point IV, ScalarEvolution
+  // may not have been able to compute a trip count. Now that we've done some
+  // rewriting, the trip count may be computable.
+ if (Changed)
+ SE->forgetLoop(L);
+ return Changed;
+}
+
+//===---------------------------------------------------------------------===//
+// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
+// they will exit at the first iteration.
+//===---------------------------------------------------------------------===//
+
+/// Check to see if this loop has loop invariant conditions which lead to loop
+/// exits. If so, we know that if the exit path is taken, it is at the first
+/// loop iteration. This lets us predict exit values of PHI nodes that live in
+/// loop header.
+bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
+ // Verify the input to the pass is already in LCSSA form.
+ assert(L->isLCSSAForm(*DT));
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ bool MadeAnyChanges = false;
+ for (auto *ExitBB : ExitBlocks) {
+ // If there are no more PHI nodes in this exit block, then no more
+ // values defined inside the loop are used on this path.
+ for (PHINode &PN : ExitBB->phis()) {
+ for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
+ IncomingValIdx != E; ++IncomingValIdx) {
+ auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);
+
+ // Can we prove that the exit must run on the first iteration if it
+ // runs at all? (i.e. early exits are fine for our purposes, but
+ // traces which lead to this exit being taken on the 2nd iteration
+ // aren't.) Note that this is about whether the exit branch is
+ // executed, not about whether it is taken.
+ if (!L->getLoopLatch() ||
+ !DT->dominates(IncomingBB, L->getLoopLatch()))
+ continue;
+
+ // Get condition that leads to the exit path.
+ auto *TermInst = IncomingBB->getTerminator();
+
+ Value *Cond = nullptr;
+ if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
+ // Must be a conditional branch, otherwise the block
+ // should not be in the loop.
+ Cond = BI->getCondition();
+ } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
+ Cond = SI->getCondition();
+ else
+ continue;
+
+ if (!L->isLoopInvariant(Cond))
+ continue;
+
+ auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));
+
+ // Only deal with PHIs in the loop header.
+ if (!ExitVal || ExitVal->getParent() != L->getHeader())
+ continue;
+
+ // If ExitVal is a PHI on the loop header, then we know its
+ // value along this exit because the exit can only be taken
+ // on the first iteration.
+ auto *LoopPreheader = L->getLoopPreheader();
+ assert(LoopPreheader && "Invalid loop");
+ int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
+ if (PreheaderIdx != -1) {
+ assert(ExitVal->getParent() == L->getHeader() &&
+ "ExitVal must be in loop header");
+ MadeAnyChanges = true;
+ PN.setIncomingValue(IncomingValIdx,
+ ExitVal->getIncomingValue(PreheaderIdx));
+ }
+ }
+ }
+ }
+ return MadeAnyChanges;
+}
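+
+// Illustrative sketch only (hypothetical function, kept out of the build): a
+// source-level view of the rewrite above.  Because the exit condition is loop
+// invariant, the early exit can only be taken on the first iteration, so the
+// header PHI ('x') still holds its preheader value there and the exit use can
+// be rewritten to that value.
+#if 0
+static int firstIterationExitValueSketch(bool InvariantCond) {
+  int x = 0;                      // header PHI: 0 from the preheader
+  for (int i = 0; i < 100; ++i) {
+    if (InvariantCond)            // loop-invariant exit condition
+      return x;                   // may be rewritten to 'return 0;'
+    x = x + i;                    // backedge value of the PHI
+  }
+  return -1;
+}
+#endif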
+
+//===----------------------------------------------------------------------===//
+// IV Widening - Extend the width of an IV to cover its widest uses.
+//===----------------------------------------------------------------------===//
+
+/// Update information about the induction variable that is extended by this
+/// sign or zero extend operation. This is used to determine the final width of
+/// the IV before actually widening it.
static void visitIVCast(CastInst *Cast, WideIVInfo &WI,
ScalarEvolution *SE,
- const TargetTransformInfo *TTI) {
- bool IsSigned = Cast->getOpcode() == Instruction::SExt;
- if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
- return;
-
- Type *Ty = Cast->getType();
- uint64_t Width = SE->getTypeSizeInBits(Ty);
- if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
- return;
-
- // Check that `Cast` actually extends the induction variable (we rely on this
- // later). This takes care of cases where `Cast` is extending a truncation of
- // the narrow induction variable, and thus can end up being narrower than the
- // "narrow" induction variable.
- uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
- if (NarrowIVWidth >= Width)
- return;
-
- // Cast is either an sext or zext up to this point.
-  // We should not widen an indvar if arithmetic on the wider indvar is more
-  // expensive than on the narrower indvar. We check only the cost of ADD
- // because at least an ADD is required to increment the induction variable. We
- // could compute more comprehensively the cost of all instructions on the
- // induction variable when necessary.
- if (TTI &&
- TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
- TTI->getArithmeticInstrCost(Instruction::Add,
- Cast->getOperand(0)->getType())) {
- return;
- }
-
- if (!WI.WidestNativeType) {
- WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
- WI.IsSigned = IsSigned;
- return;
- }
-
- // We extend the IV to satisfy the sign of its first user, arbitrarily.
- if (WI.IsSigned != IsSigned)
- return;
-
- if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
- WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
-}
-
-//===----------------------------------------------------------------------===//
-// Live IV Reduction - Minimize IVs live across the loop.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Simplification of IV users based on SCEV evaluation.
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class IndVarSimplifyVisitor : public IVVisitor {
- ScalarEvolution *SE;
- const TargetTransformInfo *TTI;
- PHINode *IVPhi;
-
-public:
- WideIVInfo WI;
-
- IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
- const TargetTransformInfo *TTI,
- const DominatorTree *DTree)
- : SE(SCEV), TTI(TTI), IVPhi(IV) {
- DT = DTree;
- WI.NarrowIV = IVPhi;
- }
-
- // Implement the interface used by simplifyUsersOfIV.
- void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
-};
-
-} // end anonymous namespace
-
-/// Iteratively perform simplification on a worklist of IV users. Each
-/// successive simplification may push more users which may themselves be
-/// candidates for simplification.
-///
-/// Sign/Zero extend elimination is interleaved with IV simplification.
-bool IndVarSimplify::simplifyAndExtend(Loop *L,
- SCEVExpander &Rewriter,
- LoopInfo *LI) {
- SmallVector<WideIVInfo, 8> WideIVs;
-
- auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- bool HasGuards = GuardDecl && !GuardDecl->use_empty();
-
- SmallVector<PHINode*, 8> LoopPhis;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- LoopPhis.push_back(cast<PHINode>(I));
- }
- // Each round of simplification iterates through the SimplifyIVUsers worklist
- // for all current phis, then determines whether any IVs can be
- // widened. Widening adds new phis to LoopPhis, inducing another round of
- // simplification on the wide IVs.
- bool Changed = false;
- while (!LoopPhis.empty()) {
- // Evaluate as many IV expressions as possible before widening any IVs. This
- // forces SCEV to set no-wrap flags before evaluating sign/zero
- // extension. The first time SCEV attempts to normalize sign/zero extension,
- // the result becomes final. So for the most predictable results, we delay
-    // evaluation of sign/zero extensions until needed, and avoid running
- // other SCEV based analysis prior to simplifyAndExtend.
- do {
- PHINode *CurrIV = LoopPhis.pop_back_val();
-
- // Information about sign/zero extensions of CurrIV.
- IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
-
- Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
- &Visitor);
-
- if (Visitor.WI.WidestNativeType) {
- WideIVs.push_back(Visitor.WI);
- }
- } while(!LoopPhis.empty());
-
+ const TargetTransformInfo *TTI) {
+ bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+ if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
+ return;
+
+ Type *Ty = Cast->getType();
+ uint64_t Width = SE->getTypeSizeInBits(Ty);
+ if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
+ return;
+
+ // Check that `Cast` actually extends the induction variable (we rely on this
+ // later). This takes care of cases where `Cast` is extending a truncation of
+ // the narrow induction variable, and thus can end up being narrower than the
+ // "narrow" induction variable.
+ uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
+ if (NarrowIVWidth >= Width)
+ return;
+
+ // Cast is either an sext or zext up to this point.
+  // We should not widen an indvar if arithmetic on the wider indvar is more
+  // expensive than on the narrower indvar. We check only the cost of ADD
+ // because at least an ADD is required to increment the induction variable. We
+ // could compute more comprehensively the cost of all instructions on the
+ // induction variable when necessary.
+ if (TTI &&
+ TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+ TTI->getArithmeticInstrCost(Instruction::Add,
+ Cast->getOperand(0)->getType())) {
+ return;
+ }
+
+ if (!WI.WidestNativeType) {
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+ WI.IsSigned = IsSigned;
+ return;
+ }
+
+ // We extend the IV to satisfy the sign of its first user, arbitrarily.
+ if (WI.IsSigned != IsSigned)
+ return;
+
+ if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+}
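+
+// Illustrative sketch only (hypothetical types, plain C++, kept out of the
+// build; the TTI cost comparison is omitted): the widening policy above.  The
+// first extension fixes the signedness, and only wider legal extensions of
+// the same signedness grow the recorded widest type.
+#if 0
+#include <cstdint>
+struct WideIVInfoSketch { uint64_t WidestBits = 0; bool IsSigned = false; };
+static void noteExtension(WideIVInfoSketch &WI, uint64_t NarrowBits,
+                          uint64_t ExtBits, bool ExtIsSigned,
+                          bool ExtTypeIsLegal) {
+  if (!ExtTypeIsLegal || ExtBits <= NarrowBits)
+    return;                       // not a (legal) widening of the narrow IV
+  if (WI.WidestBits == 0) {       // first extension chooses the signedness
+    WI.WidestBits = ExtBits;
+    WI.IsSigned = ExtIsSigned;
+    return;
+  }
+  if (WI.IsSigned != ExtIsSigned) // mismatched sign: keep the first choice
+    return;
+  if (ExtBits > WI.WidestBits)
+    WI.WidestBits = ExtBits;
+}
+#endif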
+
+//===----------------------------------------------------------------------===//
+// Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Simplification of IV users based on SCEV evaluation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class IndVarSimplifyVisitor : public IVVisitor {
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ PHINode *IVPhi;
+
+public:
+ WideIVInfo WI;
+
+ IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
+ const TargetTransformInfo *TTI,
+ const DominatorTree *DTree)
+ : SE(SCEV), TTI(TTI), IVPhi(IV) {
+ DT = DTree;
+ WI.NarrowIV = IVPhi;
+ }
+
+ // Implement the interface used by simplifyUsersOfIV.
+ void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
+};
+
+} // end anonymous namespace
+
+/// Iteratively perform simplification on a worklist of IV users. Each
+/// successive simplification may push more users which may themselves be
+/// candidates for simplification.
+///
+/// Sign/Zero extend elimination is interleaved with IV simplification.
+bool IndVarSimplify::simplifyAndExtend(Loop *L,
+ SCEVExpander &Rewriter,
+ LoopInfo *LI) {
+ SmallVector<WideIVInfo, 8> WideIVs;
+
+ auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ bool HasGuards = GuardDecl && !GuardDecl->use_empty();
+
+ SmallVector<PHINode*, 8> LoopPhis;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ LoopPhis.push_back(cast<PHINode>(I));
+ }
+ // Each round of simplification iterates through the SimplifyIVUsers worklist
+ // for all current phis, then determines whether any IVs can be
+ // widened. Widening adds new phis to LoopPhis, inducing another round of
+ // simplification on the wide IVs.
+ bool Changed = false;
+ while (!LoopPhis.empty()) {
+ // Evaluate as many IV expressions as possible before widening any IVs. This
+ // forces SCEV to set no-wrap flags before evaluating sign/zero
+ // extension. The first time SCEV attempts to normalize sign/zero extension,
+ // the result becomes final. So for the most predictable results, we delay
+    // evaluation of sign/zero extensions until needed, and avoid running
+ // other SCEV based analysis prior to simplifyAndExtend.
+ do {
+ PHINode *CurrIV = LoopPhis.pop_back_val();
+
+ // Information about sign/zero extensions of CurrIV.
+ IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
+
+ Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
+ &Visitor);
+
+ if (Visitor.WI.WidestNativeType) {
+ WideIVs.push_back(Visitor.WI);
+ }
+ } while(!LoopPhis.empty());
+
// Continue if we disallowed widening.
if (!WidenIndVars)
continue;
- for (; !WideIVs.empty(); WideIVs.pop_back()) {
+ for (; !WideIVs.empty(); WideIVs.pop_back()) {
unsigned ElimExt;
unsigned Widened;
if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter,
@@ -643,653 +643,653 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L,
HasGuards, UsePostIncrementRanges)) {
NumElimExt += ElimExt;
NumWidened += Widened;
- Changed = true;
- LoopPhis.push_back(WidePhi);
- }
- }
- }
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
-//===----------------------------------------------------------------------===//
-
-/// Given a Value which is hoped to be part of an add recurrence in the given
-/// loop, return the associated Phi node if so. Otherwise, return null. Note
-/// that this is less general than SCEV's AddRec checking.
-static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
- Instruction *IncI = dyn_cast<Instruction>(IncV);
- if (!IncI)
- return nullptr;
-
- switch (IncI->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- break;
- case Instruction::GetElementPtr:
- // An IV counter must preserve its type.
- if (IncI->getNumOperands() == 2)
- break;
- LLVM_FALLTHROUGH;
- default:
- return nullptr;
- }
-
- PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
- if (Phi && Phi->getParent() == L->getHeader()) {
- if (L->isLoopInvariant(IncI->getOperand(1)))
- return Phi;
- return nullptr;
- }
- if (IncI->getOpcode() == Instruction::GetElementPtr)
- return nullptr;
-
- // Allow add/sub to be commuted.
- Phi = dyn_cast<PHINode>(IncI->getOperand(1));
- if (Phi && Phi->getParent() == L->getHeader()) {
- if (L->isLoopInvariant(IncI->getOperand(0)))
- return Phi;
- }
- return nullptr;
-}
-
-/// Whether the current loop exit test is based on this value. Currently this
-/// is limited to a direct use in the loop condition.
-static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) {
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
- // TODO: Allow non-icmp loop test.
- if (!ICmp)
- return false;
-
- // TODO: Allow indirect use.
- return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V;
-}
-
-/// linearFunctionTestReplace policy. Return true unless we can show that the
-/// current exit test is already sufficiently canonical.
-static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
- assert(L->getLoopLatch() && "Must be in simplified form");
-
- // Avoid converting a constant or loop invariant test back to a runtime
- // test. This is critical for when SCEV's cached ExitCount is less precise
- // than the current IR (such as after we've proven a particular exit is
- // actually dead and thus the BE count never reaches our ExitCount.)
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- if (L->isLoopInvariant(BI->getCondition()))
- return false;
-
- // Do LFTR to simplify the exit condition to an ICMP.
- ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
- if (!Cond)
- return true;
-
- // Do LFTR to simplify the exit ICMP to EQ/NE
- ICmpInst::Predicate Pred = Cond->getPredicate();
- if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
- return true;
-
- // Look for a loop invariant RHS
- Value *LHS = Cond->getOperand(0);
- Value *RHS = Cond->getOperand(1);
- if (!L->isLoopInvariant(RHS)) {
- if (!L->isLoopInvariant(LHS))
- return true;
- std::swap(LHS, RHS);
- }
- // Look for a simple IV counter LHS
- PHINode *Phi = dyn_cast<PHINode>(LHS);
- if (!Phi)
- Phi = getLoopPhiForCounter(LHS, L);
-
- if (!Phi)
- return true;
-
- // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
- int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
- if (Idx < 0)
- return true;
-
- // Do LFTR if the exit condition's IV is *not* a simple counter.
- Value *IncV = Phi->getIncomingValue(Idx);
- return Phi != getLoopPhiForCounter(IncV, L);
-}
-
-/// Return true if undefined behavior would provably be executed on the path to
-/// OnPathTo if Root produced a poison result. Note that this doesn't say
-/// anything about whether OnPathTo is actually executed or whether Root is
-/// actually poison. This can be used to assess whether a new use of Root can
-/// be added at a location which is control equivalent with OnPathTo (such as
-/// immediately before it) without introducing UB which didn't previously
-/// exist. Note that a false result conveys no information.
-static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
- Instruction *OnPathTo,
- DominatorTree *DT) {
- // Basic approach is to assume Root is poison, propagate poison forward
- // through all users we can easily track, and then check whether any of those
-  // users are provably UB and must execute before our exiting block might
- // exit.
-
- // The set of all recursive users we've visited (which are assumed to all be
- // poison because of said visit)
- SmallSet<const Value *, 16> KnownPoison;
- SmallVector<const Instruction*, 16> Worklist;
- Worklist.push_back(Root);
- while (!Worklist.empty()) {
- const Instruction *I = Worklist.pop_back_val();
-
-    // If we know this must trigger UB on a path leading to our target.
- if (mustTriggerUB(I, KnownPoison) && DT->dominates(I, OnPathTo))
- return true;
-
- // If we can't analyze propagation through this instruction, just skip it
- // and transitive users. Safe as false is a conservative result.
+ Changed = true;
+ LoopPhis.push_back(WidePhi);
+ }
+ }
+ }
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
+
+/// Given a Value which is hoped to be part of an add recurrence in the given
+/// loop, return the associated Phi node if so. Otherwise, return null. Note
+/// that this is less general than SCEV's AddRec checking.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
+ Instruction *IncI = dyn_cast<Instruction>(IncV);
+ if (!IncI)
+ return nullptr;
+
+ switch (IncI->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ break;
+ case Instruction::GetElementPtr:
+ // An IV counter must preserve its type.
+ if (IncI->getNumOperands() == 2)
+ break;
+ LLVM_FALLTHROUGH;
+ default:
+ return nullptr;
+ }
+
+ PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (L->isLoopInvariant(IncI->getOperand(1)))
+ return Phi;
+ return nullptr;
+ }
+ if (IncI->getOpcode() == Instruction::GetElementPtr)
+ return nullptr;
+
+ // Allow add/sub to be commuted.
+ Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (L->isLoopInvariant(IncI->getOperand(0)))
+ return Phi;
+ }
+ return nullptr;
+}
+
+/// Whether the current loop exit test is based on this value. Currently this
+/// is limited to a direct use in the loop condition.
+static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) {
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
+ // TODO: Allow non-icmp loop test.
+ if (!ICmp)
+ return false;
+
+ // TODO: Allow indirect use.
+ return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V;
+}
+
+/// linearFunctionTestReplace policy. Return true unless we can show that the
+/// current exit test is already sufficiently canonical.
+static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
+ assert(L->getLoopLatch() && "Must be in simplified form");
+
+ // Avoid converting a constant or loop invariant test back to a runtime
+ // test. This is critical for when SCEV's cached ExitCount is less precise
+ // than the current IR (such as after we've proven a particular exit is
+ // actually dead and thus the BE count never reaches our ExitCount.)
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ if (L->isLoopInvariant(BI->getCondition()))
+ return false;
+
+ // Do LFTR to simplify the exit condition to an ICMP.
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return true;
+
+ // Do LFTR to simplify the exit ICMP to EQ/NE
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
+ return true;
+
+ // Look for a loop invariant RHS
+ Value *LHS = Cond->getOperand(0);
+ Value *RHS = Cond->getOperand(1);
+ if (!L->isLoopInvariant(RHS)) {
+ if (!L->isLoopInvariant(LHS))
+ return true;
+ std::swap(LHS, RHS);
+ }
+ // Look for a simple IV counter LHS
+ PHINode *Phi = dyn_cast<PHINode>(LHS);
+ if (!Phi)
+ Phi = getLoopPhiForCounter(LHS, L);
+
+ if (!Phi)
+ return true;
+
+ // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+ int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
+ if (Idx < 0)
+ return true;
+
+ // Do LFTR if the exit condition's IV is *not* a simple counter.
+ Value *IncV = Phi->getIncomingValue(Idx);
+ return Phi != getLoopPhiForCounter(IncV, L);
+}
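+
+// Illustrative sketch only (hypothetical pre-computed facts, kept out of the
+// build): the LFTR policy above reduced to a pure predicate.
+#if 0
+static bool needsLFTRSketch(bool CondIsLoopInvariant, bool CondIsICmp,
+                            bool PredIsEqNe, bool HasInvariantOperand,
+                            bool OtherOpIsSimpleCounter) {
+  if (CondIsLoopInvariant)
+    return false;                  // don't reintroduce a runtime test
+  if (!CondIsICmp)
+    return true;                   // canonicalize the exit test to an icmp
+  if (!PredIsEqNe)
+    return true;                   // canonicalize the predicate to eq/ne
+  if (!HasInvariantOperand)
+    return true;                   // need a loop-invariant limit
+  return !OtherOpIsSimpleCounter;  // need a simple counter on the other side
+}
+#endif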
+
+/// Return true if undefined behavior would provably be executed on the path to
+/// OnPathTo if Root produced a poison result. Note that this doesn't say
+/// anything about whether OnPathTo is actually executed or whether Root is
+/// actually poison. This can be used to assess whether a new use of Root can
+/// be added at a location which is control equivalent with OnPathTo (such as
+/// immediately before it) without introducing UB which didn't previously
+/// exist. Note that a false result conveys no information.
+static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
+ Instruction *OnPathTo,
+ DominatorTree *DT) {
+ // Basic approach is to assume Root is poison, propagate poison forward
+ // through all users we can easily track, and then check whether any of those
+  // users are provably UB and must execute before our exiting block might
+ // exit.
+
+ // The set of all recursive users we've visited (which are assumed to all be
+ // poison because of said visit)
+ SmallSet<const Value *, 16> KnownPoison;
+ SmallVector<const Instruction*, 16> Worklist;
+ Worklist.push_back(Root);
+ while (!Worklist.empty()) {
+ const Instruction *I = Worklist.pop_back_val();
+
+    // If we know this must trigger UB on a path leading to our target.
+ if (mustTriggerUB(I, KnownPoison) && DT->dominates(I, OnPathTo))
+ return true;
+
+ // If we can't analyze propagation through this instruction, just skip it
+ // and transitive users. Safe as false is a conservative result.
if (!propagatesPoison(cast<Operator>(I)) && I != Root)
- continue;
-
- if (KnownPoison.insert(I).second)
- for (const User *User : I->users())
- Worklist.push_back(cast<Instruction>(User));
- }
-
- // Might be non-UB, or might have a path we couldn't prove must execute on
- // way to exiting bb.
- return false;
-}
-
-/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
-/// down to checking that all operands are constant and listing instructions
-/// that may hide undef.
-static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
- unsigned Depth) {
- if (isa<Constant>(V))
- return !isa<UndefValue>(V);
-
- if (Depth >= 6)
- return false;
-
- // Conservatively handle non-constant non-instructions. For example, Arguments
- // may be undef.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- // Load and return values may be undef.
- if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
- return false;
-
- // Optimistically handle other instructions.
- for (Value *Op : I->operands()) {
- if (!Visited.insert(Op).second)
- continue;
- if (!hasConcreteDefImpl(Op, Visited, Depth+1))
- return false;
- }
- return true;
-}
-
-/// Return true if the given value is concrete. We must prove that undef can
-/// never reach it.
-///
-/// TODO: If we decide that this is a good approach to checking for undef, we
-/// may factor it into a common location.
-static bool hasConcreteDef(Value *V) {
- SmallPtrSet<Value*, 8> Visited;
- Visited.insert(V);
- return hasConcreteDefImpl(V, Visited, 0);
-}
-
-/// Return true if this IV has any uses other than the (soon to be rewritten)
-/// loop exit test.
-static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
- int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
- Value *IncV = Phi->getIncomingValue(LatchIdx);
-
- for (User *U : Phi->users())
- if (U != Cond && U != IncV) return false;
-
- for (User *U : IncV->users())
- if (U != Cond && U != Phi) return false;
- return true;
-}
-
-/// Return true if the given phi is a "counter" in L. A counter is an
-/// add recurrence (of integer or pointer type) with an arbitrary start, and a
-/// step of 1. Note that L must have exactly one latch.
-static bool isLoopCounter(PHINode* Phi, Loop *L,
- ScalarEvolution *SE) {
- assert(Phi->getParent() == L->getHeader());
- assert(L->getLoopLatch());
-
- if (!SE->isSCEVable(Phi->getType()))
- return false;
-
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
- if (!AR || AR->getLoop() != L || !AR->isAffine())
- return false;
-
- const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
- if (!Step || !Step->isOne())
- return false;
-
- int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch());
- Value *IncV = Phi->getIncomingValue(LatchIdx);
- return (getLoopPhiForCounter(IncV, L) == Phi);
-}
-
-/// Search the loop header for a loop counter (an add rec w/ step of one)
-/// suitable for use by LFTR. If multiple counters are available, select the
-/// "best" one based on profitability heuristics.
-///
-/// BECount may be an i8* pointer type. The pointer difference is already
-/// valid count without scaling the address stride, so it remains a pointer
-/// expression as far as SCEV is concerned.
-static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
- const SCEV *BECount,
- ScalarEvolution *SE, DominatorTree *DT) {
- uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
-
- Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition();
-
- // Loop over all of the PHI nodes, looking for a simple counter.
- PHINode *BestPhi = nullptr;
- const SCEV *BestInit = nullptr;
- BasicBlock *LatchBlock = L->getLoopLatch();
- assert(LatchBlock && "Must be in simplified form");
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- PHINode *Phi = cast<PHINode>(I);
- if (!isLoopCounter(Phi, L, SE))
- continue;
-
- // Avoid comparing an integer IV against a pointer Limit.
- if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
- continue;
-
- const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
-
- // AR may be a pointer type, while BECount is an integer type.
- // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
- // AR may not be a narrower type, or we may never exit.
- uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
- if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
- continue;
-
- // Avoid reusing a potentially undef value to compute other values that may
- // have originally had a concrete definition.
- if (!hasConcreteDef(Phi)) {
- // We explicitly allow unknown phis as long as they are already used by
- // the loop exit test. This is legal since performing LFTR could not
- // increase the number of undef users.
- Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock);
- if (!isLoopExitTestBasedOn(Phi, ExitingBB) &&
- !isLoopExitTestBasedOn(IncPhi, ExitingBB))
- continue;
- }
-
- // Avoid introducing undefined behavior due to poison which didn't exist in
- // the original program. (Annoyingly, the rules for poison and undef
- // propagation are distinct, so this does NOT cover the undef case above.)
- // We have to ensure that we don't introduce UB by introducing a use on an
- // iteration where said IV produces poison. Our strategy here differs for
- // pointers and integer IVs. For integers, we strip and reinfer as needed,
- // see code in linearFunctionTestReplace. For pointers, we restrict
- // transforms as there is no good way to reinfer inbounds once lost.
- if (!Phi->getType()->isIntegerTy() &&
- !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT))
- continue;
-
- const SCEV *Init = AR->getStart();
-
- if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
- // Don't force a live loop counter if another IV can be used.
- if (AlmostDeadIV(Phi, LatchBlock, Cond))
- continue;
-
- // Prefer to count-from-zero. This is a more "canonical" counter form. It
- // also prefers integer to pointer IVs.
- if (BestInit->isZero() != Init->isZero()) {
- if (BestInit->isZero())
- continue;
- }
- // If two IVs both count from zero or both count from nonzero then the
- // narrower is likely a dead phi that has been widened. Use the wider phi
- // to allow the other to be eliminated.
- else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
- continue;
- }
- BestPhi = Phi;
- BestInit = Init;
- }
- return BestPhi;
-}
-
-/// Insert an IR expression which computes the value held by the IV IndVar
-/// (which must be a loop counter w/ unit stride) after the backedge of loop L
-/// is taken ExitCount times.
-static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
- const SCEV *ExitCount, bool UsePostInc, Loop *L,
- SCEVExpander &Rewriter, ScalarEvolution *SE) {
- assert(isLoopCounter(IndVar, L, SE));
- const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
- const SCEV *IVInit = AR->getStart();
-
- // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter
- // finds a valid pointer IV. Sign extend ExitCount in order to materialize a
- // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
- // the existing GEPs whenever possible.
- if (IndVar->getType()->isPointerTy() &&
- !ExitCount->getType()->isPointerTy()) {
- // IVOffset will be the new GEP offset that is interpreted by GEP as a
- // signed value. ExitCount on the other hand represents the loop trip count,
- // which is an unsigned value. FindLoopCounter only allows induction
- // variables that have a positive unit stride of one. This means we don't
- // have to handle the case of negative offsets (yet) and just need to zero
- // extend ExitCount.
- Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
- const SCEV *IVOffset = SE->getTruncateOrZeroExtend(ExitCount, OfsTy);
- if (UsePostInc)
- IVOffset = SE->getAddExpr(IVOffset, SE->getOne(OfsTy));
-
- // Expand the code for the iteration count.
- assert(SE->isLoopInvariant(IVOffset, L) &&
- "Computed iteration count is not loop invariant!");
-
- // We could handle pointer IVs other than i8*, but we need to compensate for
- // gep index scaling.
- assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
- cast<PointerType>(IndVar->getType())
- ->getElementType())->isOne() &&
- "unit stride pointer IV must be i8*");
-
- const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset);
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI);
- } else {
- // In any other case, convert both IVInit and ExitCount to integers before
- // comparing. This may result in SCEV expansion of pointers, but in practice
- // SCEV will fold the pointer arithmetic away as such:
- // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
- //
- // Valid Cases: (1) both integers is most common; (2) both may be pointers
- // for simple memset-style loops.
- //
- // IVInit integer and ExitCount pointer would only occur if a canonical IV
- // were generated on top of case #2, which is not expected.
-
- assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
- // For unit stride, IVCount = Start + ExitCount with 2's complement
- // overflow.
-
- // For integer IVs, truncate the IV before computing IVInit + BECount,
-    // unless we know a priori that the limit must be a constant when evaluated
- // in the bitwidth of the IV. We prefer (potentially) keeping a truncate
- // of the IV in the loop over a (potentially) expensive expansion of the
- // widened exit count add(zext(add)) expression.
- if (SE->getTypeSizeInBits(IVInit->getType())
- > SE->getTypeSizeInBits(ExitCount->getType())) {
- if (isa<SCEVConstant>(IVInit) && isa<SCEVConstant>(ExitCount))
- ExitCount = SE->getZeroExtendExpr(ExitCount, IVInit->getType());
- else
- IVInit = SE->getTruncateExpr(IVInit, ExitCount->getType());
- }
-
- const SCEV *IVLimit = SE->getAddExpr(IVInit, ExitCount);
-
- if (UsePostInc)
- IVLimit = SE->getAddExpr(IVLimit, SE->getOne(IVLimit->getType()));
-
- // Expand the code for the iteration count.
- assert(SE->isLoopInvariant(IVLimit, L) &&
- "Computed iteration count is not loop invariant!");
- // Ensure that we generate the same type as IndVar, or a smaller integer
- // type. In the presence of null pointer values, we have an integer type
- // SCEV expression (IVInit) for a pointer type IV value (IndVar).
- Type *LimitTy = ExitCount->getType()->isPointerTy() ?
- IndVar->getType() : ExitCount->getType();
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
- }
-}
-
-/// This method rewrites the exit condition of the loop to be a canonical !=
-/// comparison against the incremented loop induction variable. This pass is
-/// able to rewrite the exit tests of any loop where the SCEV analysis can
-/// determine a loop-invariant trip count of the loop, which is actually a much
-/// broader range than just linear tests.
-bool IndVarSimplify::
-linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
- const SCEV *ExitCount,
- PHINode *IndVar, SCEVExpander &Rewriter) {
- assert(L->getLoopLatch() && "Loop no longer in simplified form?");
- assert(isLoopCounter(IndVar, L, SE));
- Instruction * const IncVar =
- cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch()));
-
- // Initialize CmpIndVar to the preincremented IV.
- Value *CmpIndVar = IndVar;
- bool UsePostInc = false;
-
- // If the exiting block is the same as the backedge block, we prefer to
- // compare against the post-incremented value, otherwise we must compare
- // against the preincremented value.
- if (ExitingBB == L->getLoopLatch()) {
- // For pointer IVs, we chose to not strip inbounds which requires us not
- // to add a potentially UB introducing use. We need to either a) show
- // the loop test we're modifying is already in post-inc form, or b) show
- // that adding a use must not introduce UB.
- bool SafeToPostInc =
- IndVar->getType()->isIntegerTy() ||
- isLoopExitTestBasedOn(IncVar, ExitingBB) ||
- mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT);
- if (SafeToPostInc) {
- UsePostInc = true;
- CmpIndVar = IncVar;
- }
- }
-
- // It may be necessary to drop nowrap flags on the incrementing instruction
- // if either LFTR moves from a pre-inc check to a post-inc check (in which
- // case the increment might have previously been poison on the last iteration
- // only) or if LFTR switches to a different IV that was previously dynamically
- // dead (and as such may be arbitrarily poison). We remove any nowrap flags
- // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc
- // check), because the pre-inc addrec flags may be adopted from the original
- // instruction, while SCEV has to explicitly prove the post-inc nowrap flags.
- // TODO: This handling is inaccurate for one case: If we switch to a
- // dynamically dead IV that wraps on the first loop iteration only, which is
- // not covered by the post-inc addrec. (If the new IV was not dynamically
- // dead, it could not be poison on the first iteration in the first place.)
- if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) {
- const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar));
- if (BO->hasNoUnsignedWrap())
- BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap());
- if (BO->hasNoSignedWrap())
- BO->setHasNoSignedWrap(AR->hasNoSignedWrap());
- }
-
- Value *ExitCnt = genLoopLimit(
- IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE);
- assert(ExitCnt->getType()->isPointerTy() ==
- IndVar->getType()->isPointerTy() &&
- "genLoopLimit missed a cast");
-
- // Insert a new icmp_ne or icmp_eq instruction before the branch.
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- ICmpInst::Predicate P;
- if (L->contains(BI->getSuccessor(0)))
- P = ICmpInst::ICMP_NE;
- else
- P = ICmpInst::ICMP_EQ;
-
- IRBuilder<> Builder(BI);
-
- // The new loop exit condition should reuse the debug location of the
- // original loop exit condition.
- if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
- Builder.SetCurrentDebugLocation(Cond->getDebugLoc());
-
- // For integer IVs, if we evaluated the limit in the narrower bitwidth to
- // avoid the expensive expansion of the limit expression in the wider type,
- // emit a truncate to narrow the IV to the ExitCount type. This is safe
- // since we know (from the exit count bitwidth), that we can't self-wrap in
- // the narrower type.
- unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
- unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
- if (CmpIndVarSize > ExitCntSize) {
- assert(!CmpIndVar->getType()->isPointerTy() &&
- !ExitCnt->getType()->isPointerTy());
-
- // Before resorting to actually inserting the truncate, use the same
- // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend
- // the other side of the comparison instead. We still evaluate the limit
- // in the narrower bitwidth, we just prefer a zext/sext outside the loop to
-    // a truncate within the loop.
- bool Extended = false;
- const SCEV *IV = SE->getSCEV(CmpIndVar);
- const SCEV *TruncatedIV = SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
- ExitCnt->getType());
- const SCEV *ZExtTrunc =
- SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType());
-
- if (ZExtTrunc == IV) {
- Extended = true;
- ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
- "wide.trip.count");
- } else {
- const SCEV *SExtTrunc =
- SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType());
- if (SExtTrunc == IV) {
- Extended = true;
- ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
- "wide.trip.count");
- }
- }
-
- if (Extended) {
- bool Discard;
- L->makeLoopInvariant(ExitCnt, Discard);
- } else
- CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
- "lftr.wideiv");
- }
- LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
- << " LHS:" << *CmpIndVar << '\n'
- << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
- << "\n"
- << " RHS:\t" << *ExitCnt << "\n"
- << "ExitCount:\t" << *ExitCount << "\n"
- << " was: " << *BI->getCondition() << "\n");
-
- Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
- Value *OrigCond = BI->getCondition();
- // It's tempting to use replaceAllUsesWith here to fully replace the old
- // comparison, but that's not immediately safe, since users of the old
- // comparison may not be dominated by the new comparison. Instead, just
- // update the branch to use the new comparison; in the common case this
- // will make old comparison dead.
- BI->setCondition(Cond);
- DeadInsts.emplace_back(OrigCond);
-
- ++NumLFTR;
- return true;
-}
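+
+// Illustrative sketch only (hypothetical functions, kept out of the build;
+// assumes N >= 1 so both forms agree): what the rewrite above does to a
+// bottom-tested loop at source level.  The backedge is taken N - 1 times
+// (ExitCount), so the post-incremented counter is compared with != against
+// Start + ExitCount + 1 == N.
+#if 0
+#include <cstdint>
+static int32_t sumBeforeLFTR(int32_t N) {
+  int32_t S = 0, i = 0;
+  do { S += i; ++i; } while (i < N);       // original exit test
+  return S;
+}
+static int32_t sumAfterLFTR(int32_t N) {
+  int32_t S = 0, i = 0;
+  const int32_t Limit = 0 + (N - 1) + 1;   // genLoopLimit(): Start+ExitCount+1
+  do { S += i; ++i; } while (i != Limit);  // canonical != exit test
+  return S;
+}
+#endif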
-
-//===----------------------------------------------------------------------===//
-// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
-//===----------------------------------------------------------------------===//
-
-/// If there's a single exit block, sink any loop-invariant values that
-/// were defined in the preheader but not used inside the loop into the
-/// exit block to reduce register pressure in the loop.
-bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
- BasicBlock *ExitBlock = L->getExitBlock();
- if (!ExitBlock) return false;
-
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) return false;
-
- bool MadeAnyChanges = false;
- BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
- BasicBlock::iterator I(Preheader->getTerminator());
- while (I != Preheader->begin()) {
- --I;
- // New instructions were inserted at the end of the preheader.
- if (isa<PHINode>(I))
- break;
-
- // Don't move instructions which might have side effects, since the side
- // effects need to complete before instructions inside the loop. Also don't
- // move instructions which might read memory, since the loop may modify
- // memory. Note that it's okay if the instruction might have undefined
- // behavior: LoopSimplify guarantees that the preheader dominates the exit
- // block.
- if (I->mayHaveSideEffects() || I->mayReadFromMemory())
- continue;
-
- // Skip debug info intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- // Skip eh pad instructions.
- if (I->isEHPad())
- continue;
-
- // Don't sink alloca: we never want to sink static alloca's out of the
- // entry block, and correctly sinking dynamic alloca's requires
- // checks for stacksave/stackrestore intrinsics.
- // FIXME: Refactor this check somehow?
- if (isa<AllocaInst>(I))
- continue;
-
- // Determine if there is a use in or before the loop (direct or
- // otherwise).
- bool UsedInLoop = false;
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- BasicBlock *UseBB = User->getParent();
- if (PHINode *P = dyn_cast<PHINode>(User)) {
- unsigned i =
- PHINode::getIncomingValueNumForOperand(U.getOperandNo());
- UseBB = P->getIncomingBlock(i);
- }
- if (UseBB == Preheader || L->contains(UseBB)) {
- UsedInLoop = true;
- break;
- }
- }
-
- // If there is, the def must remain in the preheader.
- if (UsedInLoop)
- continue;
-
- // Otherwise, sink it to the exit block.
- Instruction *ToMove = &*I;
- bool Done = false;
-
- if (I != Preheader->begin()) {
- // Skip debug info intrinsics.
- do {
- --I;
- } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
-
- if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
- Done = true;
- } else {
- Done = true;
- }
-
- MadeAnyChanges = true;
- ToMove->moveBefore(*ExitBlock, InsertPt);
- if (Done) break;
- InsertPt = ToMove->getIterator();
- }
-
- return MadeAnyChanges;
-}
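+
+// Illustrative sketch only (hypothetical pre-computed facts, kept out of the
+// build): the per-instruction sinking criteria above as a pure predicate.
+#if 0
+struct PreheaderInstInfo {
+  bool MayHaveSideEffects, MayReadMemory, IsPHI, IsDbgOrEHPad, IsAlloca,
+      UsedInOrBeforeLoop; // any use in the preheader or inside the loop
+};
+static bool canSinkToExitBlock(const PreheaderInstInfo &I) {
+  return !I.MayHaveSideEffects && !I.MayReadMemory && !I.IsPHI &&
+         !I.IsDbgOrEHPad && !I.IsAlloca && !I.UsedInOrBeforeLoop;
+}
+#endif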
-
+ continue;
+
+ if (KnownPoison.insert(I).second)
+ for (const User *User : I->users())
+ Worklist.push_back(cast<Instruction>(User));
+ }
+
+ // Might be non-UB, or might have a path we couldn't prove must execute on
+ // way to exiting bb.
+ return false;
+}
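+
+// Illustrative sketch only (toy types standing in for Instruction,
+// mustTriggerUB, propagatesPoison and the dominance query; kept out of the
+// build): the same propagate-and-check worklist over a use graph.
+#if 0
+#include <set>
+#include <vector>
+struct UseNode {
+  std::vector<UseNode *> Users;
+  bool PropagatesPoison = true;   // stand-in for propagatesPoison(I)
+  bool MustTriggerUB = false;     // stand-in for mustTriggerUB(I, KnownPoison)
+  bool DominatesTarget = false;   // stand-in for DT->dominates(I, OnPathTo)
+};
+static bool mustExecuteUBIfPoisonSketch(UseNode *Root) {
+  std::set<UseNode *> KnownPoison;
+  std::vector<UseNode *> Worklist{Root};
+  while (!Worklist.empty()) {
+    UseNode *N = Worklist.back();
+    Worklist.pop_back();
+    if (N->MustTriggerUB && N->DominatesTarget)
+      return true;                // poison here is guaranteed to reach UB
+    if (!N->PropagatesPoison && N != Root)
+      continue;                   // can't track propagation; stay conservative
+    if (KnownPoison.insert(N).second)
+      for (UseNode *U : N->Users)
+        Worklist.push_back(U);
+  }
+  return false;                   // conveys no information
+}
+#endif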
+
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
+ unsigned Depth) {
+ if (isa<Constant>(V))
+ return !isa<UndefValue>(V);
+
+ if (Depth >= 6)
+ return false;
+
+ // Conservatively handle non-constant non-instructions. For example, Arguments
+ // may be undef.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ // Load and return values may be undef.
+ if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+ return false;
+
+ // Optimistically handle other instructions.
+ for (Value *Op : I->operands()) {
+ if (!Visited.insert(Op).second)
+ continue;
+ if (!hasConcreteDefImpl(Op, Visited, Depth+1))
+ return false;
+ }
+ return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+ SmallPtrSet<Value*, 8> Visited;
+ Visited.insert(V);
+ return hasConcreteDefImpl(V, Visited, 0);
+}
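+
+// Illustrative sketch only (toy operand DAG, kept out of the build): the same
+// "undef cannot reach this value" walk, with the depth limit and visited set
+// used above.
+#if 0
+#include <set>
+#include <vector>
+struct ValNode {
+  bool IsConstant = false, IsUndef = false;
+  bool MayBeUndefSource = false;  // loads, calls, arguments, ...
+  std::vector<ValNode *> Operands;
+};
+static bool hasConcreteDefSketchImpl(ValNode *V, std::set<ValNode *> &Visited,
+                                     unsigned Depth) {
+  if (V->IsConstant)
+    return !V->IsUndef;
+  if (Depth >= 6 || V->MayBeUndefSource)
+    return false;
+  for (ValNode *Op : V->Operands)
+    if (Visited.insert(Op).second &&
+        !hasConcreteDefSketchImpl(Op, Visited, Depth + 1))
+      return false;
+  return true;
+}
+static bool hasConcreteDefSketch(ValNode *V) {
+  std::set<ValNode *> Visited{V};
+  return hasConcreteDefSketchImpl(V, Visited, 0);
+}
+#endif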
+
+/// Return true if this IV has any uses other than the (soon to be rewritten)
+/// loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+ for (User *U : Phi->users())
+ if (U != Cond && U != IncV) return false;
+
+ for (User *U : IncV->users())
+ if (U != Cond && U != Phi) return false;
+ return true;
+}
+
+/// Return true if the given phi is a "counter" in L. A counter is an
+/// add recurrence (of integer or pointer type) with an arbitrary start, and a
+/// step of 1. Note that L must have exactly one latch.
+static bool isLoopCounter(PHINode* Phi, Loop *L,
+ ScalarEvolution *SE) {
+ assert(Phi->getParent() == L->getHeader());
+ assert(L->getLoopLatch());
+
+ if (!SE->isSCEVable(Phi->getType()))
+ return false;
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+ if (!AR || AR->getLoop() != L || !AR->isAffine())
+ return false;
+
+ const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ if (!Step || !Step->isOne())
+ return false;
+
+ int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch());
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+ return (getLoopPhiForCounter(IncV, L) == Phi);
+}
+
+/// Search the loop header for a loop counter (an add rec w/ step of one)
+/// suitable for use by LFTR. If multiple counters are available, select the
+/// "best" one based on profitability heuristics.
+///
+/// BECount may be an i8* pointer type. The pointer difference is already
+/// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
+ const SCEV *BECount,
+ ScalarEvolution *SE, DominatorTree *DT) {
+ uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+ Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition();
+
+ // Loop over all of the PHI nodes, looking for a simple counter.
+ PHINode *BestPhi = nullptr;
+ const SCEV *BestInit = nullptr;
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "Must be in simplified form");
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *Phi = cast<PHINode>(I);
+ if (!isLoopCounter(Phi, L, SE))
+ continue;
+
+ // Avoid comparing an integer IV against a pointer Limit.
+ if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+ continue;
+
+ const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+
+ // AR may be a pointer type, while BECount is an integer type.
+ // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+ // AR may not be a narrower type, or we may never exit.
+ uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+ if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
+ continue;
+
+ // Avoid reusing a potentially undef value to compute other values that may
+ // have originally had a concrete definition.
+ if (!hasConcreteDef(Phi)) {
+ // We explicitly allow unknown phis as long as they are already used by
+ // the loop exit test. This is legal since performing LFTR could not
+ // increase the number of undef users.
+ Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock);
+ if (!isLoopExitTestBasedOn(Phi, ExitingBB) &&
+ !isLoopExitTestBasedOn(IncPhi, ExitingBB))
+ continue;
+ }
+
+ // Avoid introducing undefined behavior due to poison which didn't exist in
+ // the original program. (Annoyingly, the rules for poison and undef
+ // propagation are distinct, so this does NOT cover the undef case above.)
+ // We have to ensure that we don't introduce UB by introducing a use on an
+ // iteration where said IV produces poison. Our strategy here differs for
+ // pointers and integer IVs. For integers, we strip and reinfer as needed,
+ // see code in linearFunctionTestReplace. For pointers, we restrict
+ // transforms as there is no good way to reinfer inbounds once lost.
+ if (!Phi->getType()->isIntegerTy() &&
+ !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT))
+ continue;
+
+ const SCEV *Init = AR->getStart();
+
+ if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+ // Don't force a live loop counter if another IV can be used.
+ if (AlmostDeadIV(Phi, LatchBlock, Cond))
+ continue;
+
+ // Prefer to count-from-zero. This is a more "canonical" counter form. It
+ // also prefers integer to pointer IVs.
+ if (BestInit->isZero() != Init->isZero()) {
+ if (BestInit->isZero())
+ continue;
+ }
+ // If two IVs both count from zero or both count from nonzero then the
+ // narrower is likely a dead phi that has been widened. Use the wider phi
+ // to allow the other to be eliminated.
+ else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+ continue;
+ }
+ BestPhi = Phi;
+ BestInit = Init;
+ }
+ return BestPhi;
+}
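+
+// Illustrative sketch only (hypothetical summary struct, kept out of the
+// build): the tie-breaking order used above when two suitable counters are
+// found.
+#if 0
+#include <cstdint>
+struct CounterCandidate { bool AlmostDead; bool CountsFromZero; uint64_t Bits; };
+// Returns true if 'New' should replace 'Best' as the LFTR counter.
+static bool preferNewCounter(const CounterCandidate &Best,
+                             const CounterCandidate &New) {
+  if (Best.AlmostDead)
+    return true;                   // anything beats an almost-dead best
+  if (New.AlmostDead)
+    return false;                  // don't force an extra live counter
+  if (Best.CountsFromZero != New.CountsFromZero)
+    return New.CountsFromZero;     // prefer the count-from-zero form
+  return New.Bits > Best.Bits;     // otherwise prefer the wider phi
+}
+#endif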
+
+/// Insert an IR expression which computes the value held by the IV IndVar
+/// (which must be a loop counter w/ unit stride) after the backedge of loop L
+/// is taken ExitCount times.
+static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
+ const SCEV *ExitCount, bool UsePostInc, Loop *L,
+ SCEVExpander &Rewriter, ScalarEvolution *SE) {
+ assert(isLoopCounter(IndVar, L, SE));
+ const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ const SCEV *IVInit = AR->getStart();
+
+ // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter
+ // finds a valid pointer IV. Sign extend ExitCount in order to materialize a
+ // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
+ // the existing GEPs whenever possible.
+ if (IndVar->getType()->isPointerTy() &&
+ !ExitCount->getType()->isPointerTy()) {
+ // IVOffset will be the new GEP offset that is interpreted by GEP as a
+ // signed value. ExitCount on the other hand represents the loop trip count,
+ // which is an unsigned value. FindLoopCounter only allows induction
+ // variables that have a positive unit stride of one. This means we don't
+ // have to handle the case of negative offsets (yet) and just need to zero
+ // extend ExitCount.
+ Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+ const SCEV *IVOffset = SE->getTruncateOrZeroExtend(ExitCount, OfsTy);
+ if (UsePostInc)
+ IVOffset = SE->getAddExpr(IVOffset, SE->getOne(OfsTy));
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVOffset, L) &&
+ "Computed iteration count is not loop invariant!");
+
+ // We could handle pointer IVs other than i8*, but we need to compensate for
+ // gep index scaling.
+ assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
+ cast<PointerType>(IndVar->getType())
+ ->getElementType())->isOne() &&
+ "unit stride pointer IV must be i8*");
+
+ const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset);
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI);
+ } else {
+ // In any other case, convert both IVInit and ExitCount to integers before
+ // comparing. This may result in SCEV expansion of pointers, but in practice
+ // SCEV will fold the pointer arithmetic away as such:
+ // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+ //
+ // Valid Cases: (1) both integers is most common; (2) both may be pointers
+ // for simple memset-style loops.
+ //
+ // IVInit integer and ExitCount pointer would only occur if a canonical IV
+ // were generated on top of case #2, which is not expected.
+
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+ // For unit stride, IVCount = Start + ExitCount with 2's complement
+ // overflow.
+
+ // For integer IVs, truncate the IV before computing IVInit + BECount,
+ // unless we know a priori that the limit must be a constant when evaluated
+ // in the bitwidth of the IV. We prefer (potentially) keeping a truncate
+ // of the IV in the loop over a (potentially) expensive expansion of the
+ // widened exit count add(zext(add)) expression.
+ if (SE->getTypeSizeInBits(IVInit->getType())
+ > SE->getTypeSizeInBits(ExitCount->getType())) {
+ if (isa<SCEVConstant>(IVInit) && isa<SCEVConstant>(ExitCount))
+ ExitCount = SE->getZeroExtendExpr(ExitCount, IVInit->getType());
+ else
+ IVInit = SE->getTruncateExpr(IVInit, ExitCount->getType());
+ }
+
+ const SCEV *IVLimit = SE->getAddExpr(IVInit, ExitCount);
+
+ if (UsePostInc)
+ IVLimit = SE->getAddExpr(IVLimit, SE->getOne(IVLimit->getType()));
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVLimit, L) &&
+ "Computed iteration count is not loop invariant!");
+ // Ensure that we generate the same type as IndVar, or a smaller integer
+ // type. In the presence of null pointer values, we have an integer type
+ // SCEV expression (IVInit) for a pointer type IV value (IndVar).
+ Type *LimitTy = ExitCount->getType()->isPointerTy() ?
+ IndVar->getType() : ExitCount->getType();
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
+ }
+}
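For the integer branch above, the limit arithmetic reduces to a simple two's-complement add; a minimal sketch under that assumption (hypothetical helper, not part of the patched file):

#include <cstdint>

// Sketch of the value genLoopLimit materializes for an integer IV with unit
// stride: Start + ExitCount for a pre-increment compare, plus one more for a
// post-increment compare. Wraparound is intentional (two's complement).
uint64_t loopLimit(uint64_t Start, uint64_t ExitCount, bool UsePostInc) {
  uint64_t Limit = Start + ExitCount;  // IVInit + ExitCount
  if (UsePostInc)
    Limit += 1;                        // compare against the incremented IV
  return Limit;
}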
+
+/// This method rewrites the exit condition of the loop to be a canonical !=
+/// comparison against the incremented loop induction variable. This pass is
+/// able to rewrite the exit tests of any loop where the SCEV analysis can
+/// determine a loop-invariant trip count of the loop, which is actually a much
+/// broader range than just linear tests.
+bool IndVarSimplify::
+linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
+ const SCEV *ExitCount,
+ PHINode *IndVar, SCEVExpander &Rewriter) {
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+ assert(isLoopCounter(IndVar, L, SE));
+ Instruction * const IncVar =
+ cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch()));
+
+ // Initialize CmpIndVar to the preincremented IV.
+ Value *CmpIndVar = IndVar;
+ bool UsePostInc = false;
+
+ // If the exiting block is the same as the backedge block, we prefer to
+ // compare against the post-incremented value, otherwise we must compare
+ // against the preincremented value.
+ if (ExitingBB == L->getLoopLatch()) {
+ // For pointer IVs, we chose to not strip inbounds which requires us not
+ // to add a potentially UB introducing use. We need to either a) show
+ // the loop test we're modifying is already in post-inc form, or b) show
+ // that adding a use must not introduce UB.
+ bool SafeToPostInc =
+ IndVar->getType()->isIntegerTy() ||
+ isLoopExitTestBasedOn(IncVar, ExitingBB) ||
+ mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT);
+ if (SafeToPostInc) {
+ UsePostInc = true;
+ CmpIndVar = IncVar;
+ }
+ }
+
+ // It may be necessary to drop nowrap flags on the incrementing instruction
+ // if either LFTR moves from a pre-inc check to a post-inc check (in which
+ // case the increment might have previously been poison on the last iteration
+ // only) or if LFTR switches to a different IV that was previously dynamically
+ // dead (and as such may be arbitrarily poison). We remove any nowrap flags
+ // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc
+ // check), because the pre-inc addrec flags may be adopted from the original
+ // instruction, while SCEV has to explicitly prove the post-inc nowrap flags.
+ // TODO: This handling is inaccurate for one case: If we switch to a
+ // dynamically dead IV that wraps on the first loop iteration only, which is
+ // not covered by the post-inc addrec. (If the new IV was not dynamically
+ // dead, it could not be poison on the first iteration in the first place.)
+ if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) {
+ const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar));
+ if (BO->hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap());
+ if (BO->hasNoSignedWrap())
+ BO->setHasNoSignedWrap(AR->hasNoSignedWrap());
+ }
+
+ Value *ExitCnt = genLoopLimit(
+ IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE);
+ assert(ExitCnt->getType()->isPointerTy() ==
+ IndVar->getType()->isPointerTy() &&
+ "genLoopLimit missed a cast");
+
+ // Insert a new icmp_ne or icmp_eq instruction before the branch.
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ ICmpInst::Predicate P;
+ if (L->contains(BI->getSuccessor(0)))
+ P = ICmpInst::ICMP_NE;
+ else
+ P = ICmpInst::ICMP_EQ;
+
+ IRBuilder<> Builder(BI);
+
+ // The new loop exit condition should reuse the debug location of the
+ // original loop exit condition.
+ if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
+ Builder.SetCurrentDebugLocation(Cond->getDebugLoc());
+
+ // For integer IVs, if we evaluated the limit in the narrower bitwidth to
+ // avoid the expensive expansion of the limit expression in the wider type,
+ // emit a truncate to narrow the IV to the ExitCount type. This is safe
+ // since we know (from the exit count bitwidth), that we can't self-wrap in
+ // the narrower type.
+ unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
+ unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
+ if (CmpIndVarSize > ExitCntSize) {
+ assert(!CmpIndVar->getType()->isPointerTy() &&
+ !ExitCnt->getType()->isPointerTy());
+
+ // Before resorting to actually inserting the truncate, use the same
+ // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend
+ // the other side of the comparison instead. We still evaluate the limit
+ // in the narrower bitwidth, we just prefer a zext/sext outside the loop to
+ // a truncate within it.
+ bool Extended = false;
+ const SCEV *IV = SE->getSCEV(CmpIndVar);
+ const SCEV *TruncatedIV = SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType());
+ const SCEV *ZExtTrunc =
+ SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType());
+
+ if (ZExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ } else {
+ const SCEV *SExtTrunc =
+ SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType());
+ if (SExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ }
+ }
+
+ if (Extended) {
+ bool Discard;
+ L->makeLoopInvariant(ExitCnt, Discard);
+ } else
+ CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+ "lftr.wideiv");
+ }
+ LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
+ << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << "ExitCount:\t" << *ExitCount << "\n"
+ << " was: " << *BI->getCondition() << "\n");
+
+ Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
+ Value *OrigCond = BI->getCondition();
+ // It's tempting to use replaceAllUsesWith here to fully replace the old
+ // comparison, but that's not immediately safe, since users of the old
+ // comparison may not be dominated by the new comparison. Instead, just
+ // update the branch to use the new comparison; in the common case this
+ // will make old comparison dead.
+ BI->setCondition(Cond);
+ DeadInsts.emplace_back(OrigCond);
+
+ ++NumLFTR;
+ return true;
+}
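As a source-level illustration of the rewrite (a sketch only; beforeLFTR/afterLFTR are hypothetical names, and the loop is assumed to be known to run n >= 1 times), the relational exit test becomes an equality test of the post-incremented IV against the limit produced by genLoopLimit:

void body(int i);

// Before: the exit test is a signed order comparison against n.
void beforeLFTR(int n) {
  for (int i = 0; i < n; ++i)
    body(i);
}

// After (conceptual): the exit test is a canonical != check of the
// post-incremented IV against the trip count.
void afterLFTR(int n) {
  int i = 0;
  do {
    body(i);
  } while (++i != n);
}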
+
+//===----------------------------------------------------------------------===//
+// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
+//===----------------------------------------------------------------------===//
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
+bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock) return false;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) return false;
+
+ bool MadeAnyChanges = false;
+ BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
+ BasicBlock::iterator I(Preheader->getTerminator());
+ while (I != Preheader->begin()) {
+ --I;
+ // New instructions were inserted at the end of the preheader.
+ if (isa<PHINode>(I))
+ break;
+
+ // Don't move instructions which might have side effects, since the side
+ // effects need to complete before instructions inside the loop. Also don't
+ // move instructions which might read memory, since the loop may modify
+ // memory. Note that it's okay if the instruction might have undefined
+ // behavior: LoopSimplify guarantees that the preheader dominates the exit
+ // block.
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ continue;
+
+ // Skip debug info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Skip eh pad instructions.
+ if (I->isEHPad())
+ continue;
+
+ // Don't sink alloca: we never want to sink static alloca's out of the
+ // entry block, and correctly sinking dynamic alloca's requires
+ // checks for stacksave/stackrestore intrinsics.
+ // FIXME: Refactor this check somehow?
+ if (isa<AllocaInst>(I))
+ continue;
+
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoop = false;
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UseBB = User->getParent();
+ if (PHINode *P = dyn_cast<PHINode>(User)) {
+ unsigned i =
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBB = P->getIncomingBlock(i);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoop = true;
+ break;
+ }
+ }
+
+ // If there is, the def must remain in the preheader.
+ if (UsedInLoop)
+ continue;
+
+ // Otherwise, sink it to the exit block.
+ Instruction *ToMove = &*I;
+ bool Done = false;
+
+ if (I != Preheader->begin()) {
+ // Skip debug info intrinsics.
+ do {
+ --I;
+ } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+ if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+ Done = true;
+ } else {
+ Done = true;
+ }
+
+ MadeAnyChanges = true;
+ ToMove->moveBefore(*ExitBlock, InsertPt);
+ if (Done) break;
+ InsertPt = ToMove->getIterator();
+ }
+
+ return MadeAnyChanges;
+}
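A short sketch of the effect (illustrative code only, assuming a single exit block): a loop-invariant computation that lives in the preheader but is only consumed after the loop is moved past the loop, shortening its live range.

int use(int);

// Before sinking: t is computed ahead of the loop but never used inside it,
// so a register stays live across the whole loop body.
int beforeSink(int a, int b, int n, int *p) {
  int t = a * b;
  for (int i = 0; i < n; ++i)
    p[i] = i;
  return use(t);
}

// After sinkUnusedInvariants (conceptually): the multiply sits in the exit
// block instead, after the loop.
int afterSink(int a, int b, int n, int *p) {
  for (int i = 0; i < n; ++i)
    p[i] = i;
  int t = a * b;
  return use(t);
}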
+
static void replaceExitCond(BranchInst *BI, Value *NewCond,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
auto *OldCond = BI->getCondition();
@@ -1297,7 +1297,7 @@ static void replaceExitCond(BranchInst *BI, Value *NewCond,
if (OldCond->use_empty())
DeadInsts.emplace_back(OldCond);
}
-
+
static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
@@ -1355,7 +1355,7 @@ static bool optimizeLoopExitWithUnknownExitCount(
if (SE->isKnownPredicateAt(Pred, LHSS, RHSS, BI)) {
foldExit(L, ExitingBB, Inverted, DeadInsts);
return true;
- }
+ }
// Further logic works for non-inverted condition only.
if (Inverted)
return false;
@@ -1391,52 +1391,52 @@ static bool optimizeLoopExitWithUnknownExitCount(
Rewriter, DeadInsts);
return true;
-}
-
-bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
- SmallVector<BasicBlock*, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
+}
+
+bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
// Remove all exits which aren't both rewriteable and execute on every
// iteration.
llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) {
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- return true;
-
- // Can't rewrite non-branch yet.
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- return true;
-
- // If already constant, nothing to do.
- if (isa<Constant>(BI->getCondition()))
- return true;
-
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ return true;
+
+ // Can't rewrite non-branch yet.
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ return true;
+
+ // If already constant, nothing to do.
+ if (isa<Constant>(BI->getCondition()))
+ return true;
+
// Likewise, the loop latch must be dominated by the exiting BB.
if (!DT->dominates(ExitingBB, L->getLoopLatch()))
- return true;
-
- return false;
- });
-
- if (ExitingBlocks.empty())
- return false;
-
- // Get a symbolic upper bound on the loop backedge taken count.
+ return true;
+
+ return false;
+ });
+
+ if (ExitingBlocks.empty())
+ return false;
+
+ // Get a symbolic upper bound on the loop backedge taken count.
const SCEV *MaxExitCount = SE->getSymbolicMaxBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(MaxExitCount))
- return false;
-
+ if (isa<SCEVCouldNotCompute>(MaxExitCount))
+ return false;
+
// Visit our exit blocks in order of dominance. We know from the fact that
// all exits must dominate the latch, so there is a total dominance order
// between them.
llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) {
- // std::sort sorts in ascending order, so we want the inverse of
- // the normal dominance relation.
- if (A == B) return false;
+ // std::sort sorts in ascending order, so we want the inverse of
+ // the normal dominance relation.
+ if (A == B) return false;
if (DT->properlyDominates(A, B))
return true;
else {
@@ -1445,17 +1445,17 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
return false;
}
});
-#ifndef NDEBUG
- for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
- assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
- }
-#endif
-
- bool Changed = false;
+#ifndef NDEBUG
+ for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
+ assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
+ }
+#endif
+
+ bool Changed = false;
bool SkipLastIter = false;
- SmallSet<const SCEV*, 8> DominatingExitCounts;
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ SmallSet<const SCEV*, 8> DominatingExitCounts;
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
if (isa<SCEVCouldNotCompute>(ExitCount)) {
// Okay, we do not know the exit count here. Can we at least prove that it
// will remain the same within iteration space?
@@ -1465,7 +1465,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
L, BI, ExitingBB, MaxExitCount, Inverted, SkipLastIter, SE,
Rewriter, DeadInsts);
};
-
+
// TODO: We might have proved that we can skip the last iteration for
// this check. In this case, we only want to check the condition on the
// pre-last iteration (MaxExitCount - 1). However, there is a nasty
@@ -1495,496 +1495,496 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// executed 1 iteration less.
SkipLastIter = true;
- // If we know we'd exit on the first iteration, rewrite the exit to
- // reflect this. This does not imply the loop must exit through this
- // exit; there may be an earlier one taken on the first iteration.
- // TODO: Given we know the backedge can't be taken, we should go ahead
- // and break it. Or at least, kill all the header phis and simplify.
- if (ExitCount->isZero()) {
+ // If we know we'd exit on the first iteration, rewrite the exit to
+ // reflect this. This does not imply the loop must exit through this
+ // exit; there may be an earlier one taken on the first iteration.
+ // TODO: Given we know the backedge can't be taken, we should go ahead
+ // and break it. Or at least, kill all the header phis and simplify.
+ if (ExitCount->isZero()) {
foldExit(L, ExitingBB, true, DeadInsts);
- Changed = true;
- continue;
- }
-
- // If we end up with a pointer exit count, bail. Note that we can end up
- // with a pointer exit count for one exiting block, and not for another in
- // the same loop.
- if (!ExitCount->getType()->isIntegerTy() ||
- !MaxExitCount->getType()->isIntegerTy())
- continue;
-
- Type *WiderType =
- SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
- ExitCount = SE->getNoopOrZeroExtend(ExitCount, WiderType);
- MaxExitCount = SE->getNoopOrZeroExtend(MaxExitCount, WiderType);
- assert(MaxExitCount->getType() == ExitCount->getType());
-
- // Can we prove that some other exit must be taken strictly before this
- // one?
- if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
- MaxExitCount, ExitCount)) {
+ Changed = true;
+ continue;
+ }
+
+ // If we end up with a pointer exit count, bail. Note that we can end up
+ // with a pointer exit count for one exiting block, and not for another in
+ // the same loop.
+ if (!ExitCount->getType()->isIntegerTy() ||
+ !MaxExitCount->getType()->isIntegerTy())
+ continue;
+
+ Type *WiderType =
+ SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
+ ExitCount = SE->getNoopOrZeroExtend(ExitCount, WiderType);
+ MaxExitCount = SE->getNoopOrZeroExtend(MaxExitCount, WiderType);
+ assert(MaxExitCount->getType() == ExitCount->getType());
+
+ // Can we prove that some other exit must be taken strictly before this
+ // one?
+ if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
+ MaxExitCount, ExitCount)) {
foldExit(L, ExitingBB, false, DeadInsts);
- Changed = true;
- continue;
- }
-
- // As we run, keep track of which exit counts we've encountered. If we
- // find a duplicate, we've found an exit which would have exited on the
- // exiting iteration, but (from the visit order) strictly follows another
- // which does the same and is thus dead.
- if (!DominatingExitCounts.insert(ExitCount).second) {
+ Changed = true;
+ continue;
+ }
+
+ // As we run, keep track of which exit counts we've encountered. If we
+ // find a duplicate, we've found an exit which would have exited on the
+ // exiting iteration, but (from the visit order) strictly follows another
+ // which does the same and is thus dead.
+ if (!DominatingExitCounts.insert(ExitCount).second) {
foldExit(L, ExitingBB, false, DeadInsts);
- Changed = true;
- continue;
- }
-
- // TODO: There might be another opportunity to leverage SCEV's reasoning
- // here. If we kept track of the min of dominating exits so far, we could
- // discharge exits with EC >= MDEC. This is less powerful than the existing
- // transform (since later exits aren't considered), but potentially more
- // powerful for any case where SCEV can prove a >=u b, but neither a == b
- // nor a >u b. Such a case is not currently known.
- }
- return Changed;
-}
-
-bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
- SmallVector<BasicBlock*, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- // Finally, see if we can rewrite our exit conditions into a loop invariant
- // form. If we have a read-only loop, and we can tell that we must exit down
- // a path which does not need any of the values computed within the loop, we
- // can rewrite the loop to exit on the first iteration. Note that this
- // doesn't either a) tell us the loop exits on the first iteration (unless
- // *all* exits are predicateable) or b) tell us *which* exit might be taken.
- // This transformation looks a lot like a restricted form of dead loop
- // elimination, but limited to read-only loops and without necessarily
- // needing to kill the loop entirely.
- if (!LoopPredication)
- return false;
-
- if (!SE->hasLoopInvariantBackedgeTakenCount(L))
- return false;
-
- // Note: ExactBTC is the exact backedge taken count *iff* the loop exits
- // through *explicit* control flow. We have to eliminate the possibility of
- // implicit exits (see below) before we know it's truly exact.
- const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(ExactBTC) ||
- !SE->isLoopInvariant(ExactBTC, L) ||
- !isSafeToExpand(ExactBTC, *SE))
- return false;
-
- // If we end up with a pointer exit count, bail. It may be unsized.
- if (!ExactBTC->getType()->isIntegerTy())
- return false;
-
- auto BadExit = [&](BasicBlock *ExitingBB) {
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- return true;
-
- // Can't rewrite non-branch yet.
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- return true;
-
- // If already constant, nothing to do.
- if (isa<Constant>(BI->getCondition()))
- return true;
-
- // If the exit block has phis, we need to be able to compute the values
- // within the loop which contains them. This assumes trivially lcssa phis
- // have already been removed; TODO: generalize
- BasicBlock *ExitBlock =
- BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
- if (!ExitBlock->phis().empty())
- return true;
-
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- assert(!isa<SCEVCouldNotCompute>(ExactBTC) && "implied by having exact trip count");
- if (!SE->isLoopInvariant(ExitCount, L) ||
- !isSafeToExpand(ExitCount, *SE))
- return true;
-
- // If we end up with a pointer exit count, bail. It may be unsized.
- if (!ExitCount->getType()->isIntegerTy())
- return true;
-
- return false;
- };
-
- // If we have any exits which can't be predicated themselves, then we can't
- // predicate any exit which isn't guaranteed to execute before it. Consider
- // two exits (a) and (b) which would both exit on the same iteration. If we
- // can predicate (b), but not (a), and (a) precedes (b) along some path, then
- // we could convert a loop from exiting through (a) to one exiting through
- // (b). Note that this problem exists only for exits with the same exit
- // count, and we could be more aggressive when exit counts are known to be unequal.
- llvm::sort(ExitingBlocks,
- [&](BasicBlock *A, BasicBlock *B) {
- // std::sort sorts in ascending order, so we want the inverse of
- // the normal dominance relation, plus a tie breaker for blocks
- // unordered by dominance.
- if (DT->properlyDominates(A, B)) return true;
- if (DT->properlyDominates(B, A)) return false;
- return A->getName() < B->getName();
- });
- // Check to see if our exit blocks are a total order (i.e. a linear chain of
- // exits before the backedge). If they aren't, reasoning about reachability
- // is complicated and we choose not to for now.
- for (unsigned i = 1; i < ExitingBlocks.size(); i++)
- if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
- return false;
-
- // Given our sorted total order, we know that exit[j] must be evaluated
- // after all exit[i] such that j > i.
- for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
- if (BadExit(ExitingBlocks[i])) {
- ExitingBlocks.resize(i);
- break;
- }
-
- if (ExitingBlocks.empty())
- return false;
-
- // We rely on not being able to reach an exiting block on a later iteration
- // than its statically computed exit count. The implementation of
- // getExitCount currently has this invariant, but assert it here so that
- // breakage is obvious if this ever changes.
- assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) {
- return DT->dominates(ExitingBB, L->getLoopLatch());
- }));
-
- // At this point, ExitingBlocks consists of only those blocks which are
- // predicatable. Given that, we know we have at least one exit we can
- // predicate if the loop doesn't have side effects and doesn't have any
- // implicit exits (because then our exact BTC isn't actually exact).
- // @Reviewers - As structured, this is O(I^2) for loop nests. Any
- // suggestions on how to improve this? I can obviously bail out for outer
- // loops, but that seems less than ideal. MemorySSA can find memory writes,
- // is that enough for *all* side effects?
- for (BasicBlock *BB : L->blocks())
- for (auto &I : *BB)
- // TODO:isGuaranteedToTransfer
- if (I.mayHaveSideEffects() || I.mayThrow())
- return false;
-
- bool Changed = false;
- // Finally, do the actual predication for all predicatable blocks. A couple
- // of notes here:
- // 1) We don't bother to constant fold dominated exits with identical exit
- // counts; that's simply a form of CSE/equality propagation and we leave
- // it for dedicated passes.
- // 2) We insert the comparison at the branch. Hoisting introduces additional
- // legality constraints and we leave that to dedicated logic. We want to
- // predicate even if we can't insert a loop invariant expression as
- // peeling or unrolling will likely reduce the cost of the otherwise loop
- // varying check.
- Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator());
- IRBuilder<> B(L->getLoopPreheader()->getTerminator());
- Value *ExactBTCV = nullptr; // Lazily generated if needed.
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
-
- auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
- Value *NewCond;
- if (ExitCount == ExactBTC) {
- NewCond = L->contains(BI->getSuccessor(0)) ?
- B.getFalse() : B.getTrue();
- } else {
- Value *ECV = Rewriter.expandCodeFor(ExitCount);
- if (!ExactBTCV)
- ExactBTCV = Rewriter.expandCodeFor(ExactBTC);
- Value *RHS = ExactBTCV;
- if (ECV->getType() != RHS->getType()) {
- Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
- ECV = B.CreateZExt(ECV, WiderTy);
- RHS = B.CreateZExt(RHS, WiderTy);
- }
- auto Pred = L->contains(BI->getSuccessor(0)) ?
- ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
- NewCond = B.CreateICmp(Pred, ECV, RHS);
- }
- Value *OldCond = BI->getCondition();
- BI->setCondition(NewCond);
- if (OldCond->use_empty())
- DeadInsts.emplace_back(OldCond);
- Changed = true;
- }
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// IndVarSimplify driver. Manage several subpasses of IV simplification.
-//===----------------------------------------------------------------------===//
-
-bool IndVarSimplify::run(Loop *L) {
- // We need (and expect!) the incoming loop to be in LCSSA.
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "LCSSA required to run indvars!");
-
- // If LoopSimplify form is not available, stay out of trouble. Some notes:
- // - LSR currently only supports LoopSimplify-form loops. Indvars'
- // canonicalization can be a pessimization without LSR to "clean up"
- // afterwards.
- // - We depend on having a preheader; in particular,
- // Loop::getCanonicalInductionVariable only supports loops with preheaders,
- // and we're in trouble if we can't find the induction variable even when
- // we've manually inserted one.
- // - LFTR relies on having a single backedge.
- if (!L->isLoopSimplifyForm())
- return false;
-
-#ifndef NDEBUG
- // Used below for a consistency check only
- // Note: Since the result returned by ScalarEvolution may depend on the order
- // in which previous results are added to its cache, the call to
- // getBackedgeTakenCount() may change following SCEV queries.
- const SCEV *BackedgeTakenCount;
- if (VerifyIndvars)
- BackedgeTakenCount = SE->getBackedgeTakenCount(L);
-#endif
-
- bool Changed = false;
- // If there are any floating-point recurrences, attempt to
- // transform them to use integer recurrences.
- Changed |= rewriteNonIntegerIVs(L);
-
- // Create a rewriter object which we'll use to transform the code with.
- SCEVExpander Rewriter(*SE, DL, "indvars");
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
-
- // Eliminate redundant IV users.
- //
- // Simplification works best when run before other consumers of SCEV. We
- // attempt to avoid evaluating SCEVs for sign/zero extend operations until
- // other expressions involving loop IVs have been evaluated. This helps SCEV
- // set no-wrap flags before normalizing sign/zero extension.
- Rewriter.disableCanonicalMode();
- Changed |= simplifyAndExtend(L, Rewriter, LI);
-
- // Check to see if we can compute the final value of any expressions
- // that are recurrent in the loop, and substitute the exit values from the
- // loop into any instructions outside of the loop that use the final values
- // of the current expressions.
- if (ReplaceExitValue != NeverRepl) {
- if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT,
- ReplaceExitValue, DeadInsts)) {
- NumReplaced += Rewrites;
- Changed = true;
- }
- }
-
- // Eliminate redundant IV cycles.
- NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
-
- // Try to eliminate loop exits based on analyzable exit counts
- if (optimizeLoopExits(L, Rewriter)) {
- Changed = true;
- // Given we've changed exit counts, notify SCEV
+ Changed = true;
+ continue;
+ }
+
+ // TODO: There might be another opportunity to leverage SCEV's reasoning
+ // here. If we kept track of the min of dominating exits so far, we could
+ // discharge exits with EC >= MDEC. This is less powerful than the existing
+ // transform (since later exits aren't considered), but potentially more
+ // powerful for any case where SCEV can prove a >=u b, but neither a == b
+ // nor a >u b. Such a case is not currently known.
+ }
+ return Changed;
+}
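A hedged example of the duplicate-exit-count case handled above (hypothetical code, assuming n >= 0 so both exits have the loop-invariant exit count n): the second test can never be the exit actually taken, so foldExit rewrites its branch so the exit is never taken.

void work(int i);

void example(int n) {
  // Both exits have SCEV exit count n; the first dominates the second, so
  // optimizeLoopExits folds the second exit so it can never be taken.
  for (int i = 0; ; ++i) {
    if (i == n)
      return;
    if (i >= n)   // provably follows the i == n exit; folded away
      return;
    work(i);
  }
}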
+
+bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // Finally, see if we can rewrite our exit conditions into a loop invariant
+ // form. If we have a read-only loop, and we can tell that we must exit down
+ // a path which does not need any of the values computed within the loop, we
+ // can rewrite the loop to exit on the first iteration. Note that this
+ // doesn't either a) tell us the loop exits on the first iteration (unless
+ // *all* exits are predicateable) or b) tell us *which* exit might be taken.
+ // This transformation looks a lot like a restricted form of dead loop
+ // elimination, but limited to read-only loops and without necessarily
+ // needing to kill the loop entirely.
+ if (!LoopPredication)
+ return false;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return false;
+
+ // Note: ExactBTC is the exact backedge taken count *iff* the loop exits
+ // through *explicit* control flow. We have to eliminate the possibility of
+ // implicit exits (see below) before we know it's truly exact.
+ const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(ExactBTC) ||
+ !SE->isLoopInvariant(ExactBTC, L) ||
+ !isSafeToExpand(ExactBTC, *SE))
+ return false;
+
+ // If we end up with a pointer exit count, bail. It may be unsized.
+ if (!ExactBTC->getType()->isIntegerTy())
+ return false;
+
+ auto BadExit = [&](BasicBlock *ExitingBB) {
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ return true;
+
+ // Can't rewrite non-branch yet.
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ return true;
+
+ // If already constant, nothing to do.
+ if (isa<Constant>(BI->getCondition()))
+ return true;
+
+ // If the exit block has phis, we need to be able to compute the values
+ // within the loop which contains them. This assumes trivially lcssa phis
+ // have already been removed; TODO: generalize
+ BasicBlock *ExitBlock =
+ BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+ if (!ExitBlock->phis().empty())
+ return true;
+
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ assert(!isa<SCEVCouldNotCompute>(ExactBTC) && "implied by having exact trip count");
+ if (!SE->isLoopInvariant(ExitCount, L) ||
+ !isSafeToExpand(ExitCount, *SE))
+ return true;
+
+ // If we end up with a pointer exit count, bail. It may be unsized.
+ if (!ExitCount->getType()->isIntegerTy())
+ return true;
+
+ return false;
+ };
+
+ // If we have any exits which can't be predicated themselves, then we can't
+ // predicate any exit which isn't guaranteed to execute before it. Consider
+ // two exits (a) and (b) which would both exit on the same iteration. If we
+ // can predicate (b), but not (a), and (a) precedes (b) along some path, then
+ // we could convert a loop from exiting through (a) to one exiting through
+ // (b). Note that this problem exists only for exits with the same exit
+ // count, and we could be more aggressive when exit counts are known to be unequal.
+ llvm::sort(ExitingBlocks,
+ [&](BasicBlock *A, BasicBlock *B) {
+ // std::sort sorts in ascending order, so we want the inverse of
+ // the normal dominance relation, plus a tie breaker for blocks
+ // unordered by dominance.
+ if (DT->properlyDominates(A, B)) return true;
+ if (DT->properlyDominates(B, A)) return false;
+ return A->getName() < B->getName();
+ });
+ // Check to see if our exit blocks are a total order (i.e. a linear chain of
+ // exits before the backedge). If they aren't, reasoning about reachability
+ // is complicated and we choose not to for now.
+ for (unsigned i = 1; i < ExitingBlocks.size(); i++)
+ if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
+ return false;
+
+ // Given our sorted total order, we know that exit[j] must be evaluated
+ // after all exit[i] such that j > i.
+ for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
+ if (BadExit(ExitingBlocks[i])) {
+ ExitingBlocks.resize(i);
+ break;
+ }
+
+ if (ExitingBlocks.empty())
+ return false;
+
+ // We rely on not being able to reach an exiting block on a later iteration
+ // than its statically computed exit count. The implementation of
+ // getExitCount currently has this invariant, but assert it here so that
+ // breakage is obvious if this ever changes.
+ assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) {
+ return DT->dominates(ExitingBB, L->getLoopLatch());
+ }));
+
+ // At this point, ExitingBlocks consists of only those blocks which are
+ // predicatable. Given that, we know we have at least one exit we can
+ // predicate if the loop doesn't have side effects and doesn't have any
+ // implicit exits (because then our exact BTC isn't actually exact).
+ // @Reviewers - As structured, this is O(I^2) for loop nests. Any
+ // suggestions on how to improve this? I can obviously bail out for outer
+ // loops, but that seems less than ideal. MemorySSA can find memory writes,
+ // is that enough for *all* side effects?
+ for (BasicBlock *BB : L->blocks())
+ for (auto &I : *BB)
+ // TODO:isGuaranteedToTransfer
+ if (I.mayHaveSideEffects() || I.mayThrow())
+ return false;
+
+ bool Changed = false;
+ // Finally, do the actual predication for all predicatable blocks. A couple
+ // of notes here:
+ // 1) We don't bother to constant fold dominated exits with identical exit
+ // counts; that's simply a form of CSE/equality propagation and we leave
+ // it for dedicated passes.
+ // 2) We insert the comparison at the branch. Hoisting introduces additional
+ // legality constraints and we leave that to dedicated logic. We want to
+ // predicate even if we can't insert a loop invariant expression as
+ // peeling or unrolling will likely reduce the cost of the otherwise loop
+ // varying check.
+ Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator());
+ IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+ Value *ExactBTCV = nullptr; // Lazily generated if needed.
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+
+ auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ Value *NewCond;
+ if (ExitCount == ExactBTC) {
+ NewCond = L->contains(BI->getSuccessor(0)) ?
+ B.getFalse() : B.getTrue();
+ } else {
+ Value *ECV = Rewriter.expandCodeFor(ExitCount);
+ if (!ExactBTCV)
+ ExactBTCV = Rewriter.expandCodeFor(ExactBTC);
+ Value *RHS = ExactBTCV;
+ if (ECV->getType() != RHS->getType()) {
+ Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
+ ECV = B.CreateZExt(ECV, WiderTy);
+ RHS = B.CreateZExt(RHS, WiderTy);
+ }
+ auto Pred = L->contains(BI->getSuccessor(0)) ?
+ ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ NewCond = B.CreateICmp(Pred, ECV, RHS);
+ }
+ Value *OldCond = BI->getCondition();
+ BI->setCondition(NewCond);
+ if (OldCond->use_empty())
+ DeadInsts.emplace_back(OldCond);
+ Changed = true;
+ }
+
+ return Changed;
+}
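The per-exit rewrite at the bottom of the function boils down to installing a loop-invariant condition; a minimal sketch of that condition (illustration only, with plain integers standing in for the expanded SCEV values):

#include <cstdint>

// 'exitOnFalse' models L->contains(BI->getSuccessor(0)): the branch keeps
// looping while the condition is true and exits when it is false.
bool predicatedExitCondition(uint64_t ExitCount, uint64_t ExactBTC,
                             bool exitOnFalse) {
  // Keep looping through this exit only if some earlier exit must fire
  // first (ExitCount != ExactBTC); otherwise take it immediately, which is
  // unobservable in a side-effect-free, read-only loop.
  bool keepLooping = (ExitCount != ExactBTC);
  return exitOnFalse ? keepLooping : !keepLooping;
}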
+
+//===----------------------------------------------------------------------===//
+// IndVarSimplify driver. Manage several subpasses of IV simplification.
+//===----------------------------------------------------------------------===//
+
+bool IndVarSimplify::run(Loop *L) {
+ // We need (and expect!) the incoming loop to be in LCSSA.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA required to run indvars!");
+
+ // If LoopSimplify form is not available, stay out of trouble. Some notes:
+ // - LSR currently only supports LoopSimplify-form loops. Indvars'
+ // canonicalization can be a pessimization without LSR to "clean up"
+ // afterwards.
+ // - We depend on having a preheader; in particular,
+ // Loop::getCanonicalInductionVariable only supports loops with preheaders,
+ // and we're in trouble if we can't find the induction variable even when
+ // we've manually inserted one.
+ // - LFTR relies on having a single backedge.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+#ifndef NDEBUG
+ // Used below for a consistency check only
+ // Note: Since the result returned by ScalarEvolution may depend on the order
+ // in which previous results are added to its cache, the call to
+ // getBackedgeTakenCount() may change following SCEV queries.
+ const SCEV *BackedgeTakenCount;
+ if (VerifyIndvars)
+ BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+#endif
+
+ bool Changed = false;
+ // If there are any floating-point recurrences, attempt to
+ // transform them to use integer recurrences.
+ Changed |= rewriteNonIntegerIVs(L);
+
+ // Create a rewriter object which we'll use to transform the code with.
+ SCEVExpander Rewriter(*SE, DL, "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+
+ // Eliminate redundant IV users.
+ //
+ // Simplification works best when run before other consumers of SCEV. We
+ // attempt to avoid evaluating SCEVs for sign/zero extend operations until
+ // other expressions involving loop IVs have been evaluated. This helps SCEV
+ // set no-wrap flags before normalizing sign/zero extension.
+ Rewriter.disableCanonicalMode();
+ Changed |= simplifyAndExtend(L, Rewriter, LI);
+
+ // Check to see if we can compute the final value of any expressions
+ // that are recurrent in the loop, and substitute the exit values from the
+ // loop into any instructions outside of the loop that use the final values
+ // of the current expressions.
+ if (ReplaceExitValue != NeverRepl) {
+ if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT,
+ ReplaceExitValue, DeadInsts)) {
+ NumReplaced += Rewrites;
+ Changed = true;
+ }
+ }
+
+ // Eliminate redundant IV cycles.
+ NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+
+ // Try to eliminate loop exits based on analyzable exit counts
+ if (optimizeLoopExits(L, Rewriter)) {
+ Changed = true;
+ // Given we've changed exit counts, notify SCEV
// Some nested loops may share same folded exit basic block,
// thus we need to notify top most loop.
SE->forgetTopmostLoop(L);
- }
-
- // Try to form loop invariant tests for loop exits by changing how many
- // iterations of the loop run when that is unobservable.
- if (predicateLoopExits(L, Rewriter)) {
- Changed = true;
- // Given we've changed exit counts, notify SCEV
- SE->forgetLoop(L);
- }
-
- // If we have a trip count expression, rewrite the loop's exit condition
- // using it.
- if (!DisableLFTR) {
- BasicBlock *PreHeader = L->getLoopPreheader();
- BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
-
- SmallVector<BasicBlock*, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- // Can't rewrite non-branch yet.
- if (!isa<BranchInst>(ExitingBB->getTerminator()))
- continue;
-
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- continue;
-
- if (!needsLFTR(L, ExitingBB))
- continue;
-
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
-
- // This was handled above, but as we form SCEVs, we can sometimes refine
- // existing ones; this allows exit counts to be folded to zero which
- // weren't when optimizeLoopExits saw them. Arguably, we should iterate
- // until stable to handle cases like this better.
- if (ExitCount->isZero())
- continue;
-
- PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT);
- if (!IndVar)
- continue;
-
- // Avoid high cost expansions. Note: This heuristic is questionable in
- // that our definition of "high cost" is not exactly principled.
- if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget,
- TTI, PreHeaderBR))
- continue;
-
- // Check preconditions for proper SCEVExpander operation. SCEV does not
- // express SCEVExpander's dependencies, such as LoopSimplify. Instead
- // any pass that uses the SCEVExpander must do it. This does not work
- // well for loop passes because SCEVExpander makes assumptions about
- // all loops, while LoopPassManager only forces the current loop to be
- // simplified.
- //
- // FIXME: SCEV expansion has no way to bail out, so the caller must
- // explicitly check any assumptions made by SCEV. Brittle.
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ExitCount);
- if (!AR || AR->getLoop()->getLoopPreheader())
- Changed |= linearFunctionTestReplace(L, ExitingBB,
- ExitCount, IndVar,
- Rewriter);
- }
- }
- // Clear the rewriter cache, because values that are in the rewriter's cache
- // can be deleted in the loop below, causing the AssertingVH in the cache to
- // trigger.
- Rewriter.clear();
-
- // Now that we're done iterating through lists, clean up any instructions
- // which are now dead.
+ }
+
+ // Try to form loop invariant tests for loop exits by changing how many
+ // iterations of the loop run when that is unobservable.
+ if (predicateLoopExits(L, Rewriter)) {
+ Changed = true;
+ // Given we've changed exit counts, notify SCEV
+ SE->forgetLoop(L);
+ }
+
+ // If we have a trip count expression, rewrite the loop's exit condition
+ // using it.
+ if (!DisableLFTR) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ // Can't rewrite non-branch yet.
+ if (!isa<BranchInst>(ExitingBB->getTerminator()))
+ continue;
+
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ continue;
+
+ if (!needsLFTR(L, ExitingBB))
+ continue;
+
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+
+ // This was handled above, but as we form SCEVs, we can sometimes refine
+ // existing ones; this allows exit counts to be folded to zero which
+ // weren't when optimizeLoopExits saw them. Arguably, we should iterate
+ // until stable to handle cases like this better.
+ if (ExitCount->isZero())
+ continue;
+
+ PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT);
+ if (!IndVar)
+ continue;
+
+ // Avoid high cost expansions. Note: This heuristic is questionable in
+ // that our definition of "high cost" is not exactly principled.
+ if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget,
+ TTI, PreHeaderBR))
+ continue;
+
+ // Check preconditions for proper SCEVExpander operation. SCEV does not
+ // express SCEVExpander's dependencies, such as LoopSimplify. Instead
+ // any pass that uses the SCEVExpander must do it. This does not work
+ // well for loop passes because SCEVExpander makes assumptions about
+ // all loops, while LoopPassManager only forces the current loop to be
+ // simplified.
+ //
+ // FIXME: SCEV expansion has no way to bail out, so the caller must
+ // explicitly check any assumptions made by SCEV. Brittle.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ExitCount);
+ if (!AR || AR->getLoop()->getLoopPreheader())
+ Changed |= linearFunctionTestReplace(L, ExitingBB,
+ ExitCount, IndVar,
+ Rewriter);
+ }
+ }
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted in the loop below, causing the AssertingVH in the cache to
+ // trigger.
+ Rewriter.clear();
+
+ // Now that we're done iterating through lists, clean up any instructions
+ // which are now dead.
while (!DeadInsts.empty()) {
Value *V = DeadInsts.pop_back_val();
if (PHINode *PHI = dyn_cast_or_null<PHINode>(V))
Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get());
else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
- Changed |=
- RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get());
+ Changed |=
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get());
}
-
- // The Rewriter may not be used from this point on.
-
- // Loop-invariant instructions in the preheader that aren't used in the
- // loop may be sunk below the loop to reduce register pressure.
- Changed |= sinkUnusedInvariants(L);
-
- // rewriteFirstIterationLoopExitValues does not rely on the computation of
- // trip count and therefore can further simplify exit values in addition to
- // rewriteLoopExitValues.
- Changed |= rewriteFirstIterationLoopExitValues(L);
-
- // Clean up dead instructions.
- Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get());
-
- // Check a post-condition.
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "Indvars did not preserve LCSSA!");
-
- // Verify that LFTR and any other changes have not interfered with SCEV's
- // ability to compute trip count. We may have *changed* the exit count, but
- // only by reducing it.
-#ifndef NDEBUG
- if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
- SE->forgetLoop(L);
- const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
- if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
- SE->getTypeSizeInBits(NewBECount->getType()))
- NewBECount = SE->getTruncateOrNoop(NewBECount,
- BackedgeTakenCount->getType());
- else
- BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
- NewBECount->getType());
- assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount,
- NewBECount) && "indvars must preserve SCEV");
- }
- if (VerifyMemorySSA && MSSAU)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-#endif
-
- return Changed;
-}
-
-PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- Function *F = L.getHeader()->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
-
+
+ // The Rewriter may not be used from this point on.
+
+ // Loop-invariant instructions in the preheader that aren't used in the
+ // loop may be sunk below the loop to reduce register pressure.
+ Changed |= sinkUnusedInvariants(L);
+
+ // rewriteFirstIterationLoopExitValues does not rely on the computation of
+ // trip count and therefore can further simplify exit values in addition to
+ // rewriteLoopExitValues.
+ Changed |= rewriteFirstIterationLoopExitValues(L);
+
+ // Clean up dead instructions.
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get());
+
+ // Check a post-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ // Verify that LFTR and any other changes have not interfered with SCEV's
+ // ability to compute trip count. We may have *changed* the exit count, but
+ // only by reducing it.
+#ifndef NDEBUG
+ if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ SE->forgetLoop(L);
+ const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+ SE->getTypeSizeInBits(NewBECount->getType()))
+ NewBECount = SE->getTruncateOrNoop(NewBECount,
+ BackedgeTakenCount->getType());
+ else
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+ NewBECount->getType());
+ assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount,
+ NewBECount) && "indvars must preserve SCEV");
+ }
+ if (VerifyMemorySSA && MSSAU)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+#endif
+
+ return Changed;
+}
+
+PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ Function *F = L.getHeader()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA,
WidenIndVars && AllowIVWidening);
- if (!IVS.run(&L))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-struct IndVarSimplifyLegacyPass : public LoopPass {
- static char ID; // Pass identification, replacement for typeid
-
- IndVarSimplifyLegacyPass() : LoopPass(ID) {
- initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
- auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
- auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
-
+ if (!IVS.run(&L))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+struct IndVarSimplifyLegacyPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ IndVarSimplifyLegacyPass() : LoopPass(ID) {
+ initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
+ auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+
IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI, MSSA, AllowIVWidening);
- return IVS.run(L);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char IndVarSimplifyLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
- "Induction Variable Simplification", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars",
- "Induction Variable Simplification", false, false)
-
-Pass *llvm::createIndVarSimplifyPass() {
- return new IndVarSimplifyLegacyPass();
-}
+ return IVS.run(L);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char IndVarSimplifyLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() {
+ return new IndVarSimplifyLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 006523ecf4..6e09dec198 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -1,241 +1,241 @@
-//===- InductiveRangeCheckElimination.cpp - -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The InductiveRangeCheckElimination pass splits a loop's iteration space into
-// three disjoint ranges. It does that in a way such that the loop running in
-// the middle loop provably does not need range checks. As an example, it will
-// convert
-//
-// len = < known positive >
-// for (i = 0; i < n; i++) {
-// if (0 <= i && i < len) {
-// do_something();
-// } else {
-// throw_out_of_bounds();
-// }
-// }
-//
-// to
-//
-// len = < known positive >
-// limit = smin(n, len)
-// // no first segment
-// for (i = 0; i < limit; i++) {
-// if (0 <= i && i < len) { // this check is fully redundant
-// do_something();
-// } else {
-// throw_out_of_bounds();
-// }
-// }
-// for (i = limit; i < n; i++) {
-// if (0 <= i && i < len) {
-// do_something();
-// } else {
-// throw_out_of_bounds();
-// }
-// }
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
+//===- InductiveRangeCheckElimination.cpp - -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does that in a way such that the loop running in
+// the middle loop provably does not need range checks. As an example, it will
+// convert
+//
+// len = < known positive >
+// for (i = 0; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+// to
+//
+// len = < known positive >
+// limit = smin(n, len)
+// // no first segment
+// for (i = 0; i < limit; i++) {
+// if (0 <= i && i < len) { // this check is fully redundant
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+// for (i = limit; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+//===----------------------------------------------------------------------===//
+
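Editor's note: a plain-C++ sketch of the shape the pass produces for the example in the header comment above. do_something and throw_out_of_bounds are hypothetical placeholders, and the first segment is empty because the induction variable starts at 0.

    #include <algorithm>

    inline void do_something() {}          // hypothetical hot path
    inline void throw_out_of_bounds() {}   // hypothetical cold path

    void after_irce(int n, int len /* known positive */) {
      int limit = std::min(n, len);
      for (int i = 0; i < limit; ++i)
        do_something();                    // range check folded away here
      for (int i = limit; i < n; ++i) {    // leftover iterations keep the check
        if (0 <= i && i < len)
          do_something();
        else
          throw_out_of_bounds();
      }
    }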
+#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <limits>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
- cl::init(64));
-
-static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
- cl::Hidden, cl::init(false));
-
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+ cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
static cl::opt<unsigned> MinRuntimeIterations("irce-min-runtime-iterations",
cl::Hidden, cl::init(10));
-static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> AllowNarrowLatchCondition(
- "irce-allow-narrow-latch", cl::Hidden, cl::init(true),
- cl::desc("If set to true, IRCE may eliminate wide range checks in loops "
- "with narrow latch condition."));
-
-static const char *ClonedLoopTag = "irce.loop.clone";
-
-#define DEBUG_TYPE "irce"
-
-namespace {
-
-/// An inductive range check is a conditional branch in a loop with
-///
-/// 1. a very cold successor (i.e. the branch jumps to that successor very
-/// rarely)
-///
-/// and
-///
-/// 2. a condition that is provably true for some contiguous range of values
-/// taken by the containing loop's induction variable.
-///
-class InductiveRangeCheck {
-
- const SCEV *Begin = nullptr;
- const SCEV *Step = nullptr;
- const SCEV *End = nullptr;
- Use *CheckUse = nullptr;
-
- static bool parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE,
- Value *&Index, Value *&Length,
- bool &IsSigned);
-
- static void
- extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
- SmallVectorImpl<InductiveRangeCheck> &Checks,
- SmallPtrSetImpl<Value *> &Visited);
-
-public:
- const SCEV *getBegin() const { return Begin; }
- const SCEV *getStep() const { return Step; }
- const SCEV *getEnd() const { return End; }
-
- void print(raw_ostream &OS) const {
- OS << "InductiveRangeCheck:\n";
- OS << " Begin: ";
- Begin->print(OS);
- OS << " Step: ";
- Step->print(OS);
- OS << " End: ";
- End->print(OS);
- OS << "\n CheckUse: ";
- getCheckUse()->getUser()->print(OS);
- OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
- }
-
- LLVM_DUMP_METHOD
- void dump() {
- print(dbgs());
- }
-
- Use *getCheckUse() const { return CheckUse; }
-
- /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
- /// R.getEnd() <= R.getBegin(), then R denotes the empty range.
-
- class Range {
- const SCEV *Begin;
- const SCEV *End;
-
- public:
- Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
- assert(Begin->getType() == End->getType() && "ill-typed range!");
- }
-
- Type *getType() const { return Begin->getType(); }
- const SCEV *getBegin() const { return Begin; }
- const SCEV *getEnd() const { return End; }
- bool isEmpty(ScalarEvolution &SE, bool IsSigned) const {
- if (Begin == End)
- return true;
- if (IsSigned)
- return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End);
- else
- return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End);
- }
- };
-
- /// This is the value the condition of the branch needs to evaluate to for the
- /// branch to take the hot successor (see (1) above).
- bool getPassingDirection() { return true; }
-
- /// Computes a range for the induction variable (IndVar) in which the range
- /// check is redundant and can be constant-folded away. The induction
- /// variable is not required to be the canonical {0,+,1} induction variable.
- Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
- const SCEVAddRecExpr *IndVar,
- bool IsLatchSigned) const;
-
- /// Parse out a set of inductive range checks from \p BI and append them to \p
- /// Checks.
- ///
- /// NB! There may be conditions feeding into \p BI that aren't inductive range
- /// checks, and hence don't end up in \p Checks.
- static void
- extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
- BranchProbabilityInfo *BPI,
- SmallVectorImpl<InductiveRangeCheck> &Checks);
-};
-
+static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> AllowNarrowLatchCondition(
+ "irce-allow-narrow-latch", cl::Hidden, cl::init(true),
+ cl::desc("If set to true, IRCE may eliminate wide range checks in loops "
+ "with narrow latch condition."));
+
+static const char *ClonedLoopTag = "irce.loop.clone";
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is a conditional branch in a loop with
+///
+/// 1. a very cold successor (i.e. the branch jumps to that successor very
+/// rarely)
+///
+/// and
+///
+/// 2. a condition that is provably true for some contiguous range of values
+/// taken by the containing loop's induction variable.
+///
+class InductiveRangeCheck {
+
+ const SCEV *Begin = nullptr;
+ const SCEV *Step = nullptr;
+ const SCEV *End = nullptr;
+ Use *CheckUse = nullptr;
+
+ static bool parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE,
+ Value *&Index, Value *&Length,
+ bool &IsSigned);
+
+ static void
+ extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
+
+public:
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getStep() const { return Step; }
+ const SCEV *getEnd() const { return End; }
+
+ void print(raw_ostream &OS) const {
+ OS << "InductiveRangeCheck:\n";
+ OS << " Begin: ";
+ Begin->print(OS);
+ OS << " Step: ";
+ Step->print(OS);
+ OS << " End: ";
+ End->print(OS);
+ OS << "\n CheckUse: ";
+ getCheckUse()->getUser()->print(OS);
+ OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
+ }
+
+ LLVM_DUMP_METHOD
+ void dump() {
+ print(dbgs());
+ }
+
+ Use *getCheckUse() const { return CheckUse; }
+
+ /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
+ /// R.getEnd() <= R.getBegin(), then R denotes the empty range.
+
+ class Range {
+ const SCEV *Begin;
+ const SCEV *End;
+
+ public:
+ Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+ assert(Begin->getType() == End->getType() && "ill-typed range!");
+ }
+
+ Type *getType() const { return Begin->getType(); }
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getEnd() const { return End; }
+ bool isEmpty(ScalarEvolution &SE, bool IsSigned) const {
+ if (Begin == End)
+ return true;
+ if (IsSigned)
+ return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End);
+ else
+ return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End);
+ }
+ };
+
+ /// This is the value the condition of the branch needs to evaluate to for the
+ /// branch to take the hot successor (see (1) above).
+ bool getPassingDirection() { return true; }
+
+ /// Computes a range for the induction variable (IndVar) in which the range
+ /// check is redundant and can be constant-folded away. The induction
+ /// variable is not required to be the canonical {0,+,1} induction variable.
+ Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+ const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const;
+
+ /// Parse out a set of inductive range checks from \p BI and append them to \p
+ /// Checks.
+ ///
+ /// NB! There may be conditions feeding into \p BI that aren't inductive range
+ /// checks, and hence don't end up in \p Checks.
+ static void
+ extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
+ BranchProbabilityInfo *BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks);
+};
+
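Editor's note: a standalone sketch of the emptiness test Range::isEmpty encodes, with fixed-width integers standing in for SCEVs; with concrete values the "known predicate" query becomes an exact comparison. The function name is ours.

    #include <cstdint>

    inline bool rangeIsEmpty(int64_t Begin, int64_t End, bool IsSigned) {
      if (Begin == End)
        return true;
      // Empty when Begin is already at or past End under the chosen signedness.
      if (IsSigned)
        return Begin >= End;
      return static_cast<uint64_t>(Begin) >= static_cast<uint64_t>(End);
    }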
struct LoopStructure;
-class InductiveRangeCheckElimination {
- ScalarEvolution &SE;
- BranchProbabilityInfo *BPI;
- DominatorTree &DT;
- LoopInfo &LI;
-
+class InductiveRangeCheckElimination {
+ ScalarEvolution &SE;
+ BranchProbabilityInfo *BPI;
+ DominatorTree &DT;
+ LoopInfo &LI;
+
using GetBFIFunc =
llvm::Optional<llvm::function_ref<llvm::BlockFrequencyInfo &()> >;
GetBFIFunc GetBFI;
@@ -244,1538 +244,1538 @@ class InductiveRangeCheckElimination {
// number of iterations.
bool isProfitableToTransform(const Loop &L, LoopStructure &LS);
-public:
- InductiveRangeCheckElimination(ScalarEvolution &SE,
- BranchProbabilityInfo *BPI, DominatorTree &DT,
+public:
+ InductiveRangeCheckElimination(ScalarEvolution &SE,
+ BranchProbabilityInfo *BPI, DominatorTree &DT,
LoopInfo &LI, GetBFIFunc GetBFI = None)
: SE(SE), BPI(BPI), DT(DT), LI(LI), GetBFI(GetBFI) {}
-
- bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
-};
-
-class IRCELegacyPass : public FunctionPass {
-public:
- static char ID;
-
- IRCELegacyPass() : FunctionPass(ID) {
- initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BranchProbabilityInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-};
-
-} // end anonymous namespace
-
-char IRCELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
- "Inductive range check elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
- false, false)
-
-/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot
-/// be interpreted as a range check, return false and set `Index` and `Length`
-/// to `nullptr`. Otherwise set `Index` to the value being range checked, and
-/// set `Length` to the upper limit `Index` is being range checked against.
-bool
-InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
- ScalarEvolution &SE, Value *&Index,
- Value *&Length, bool &IsSigned) {
- auto IsLoopInvariant = [&SE, L](Value *V) {
- return SE.isLoopInvariant(SE.getSCEV(V), L);
- };
-
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *LHS = ICI->getOperand(0);
- Value *RHS = ICI->getOperand(1);
-
- switch (Pred) {
- default:
- return false;
-
- case ICmpInst::ICMP_SLE:
- std::swap(LHS, RHS);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SGE:
- IsSigned = true;
- if (match(RHS, m_ConstantInt<0>())) {
- Index = LHS;
- return true; // Lower.
- }
- return false;
-
- case ICmpInst::ICMP_SLT:
- std::swap(LHS, RHS);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SGT:
- IsSigned = true;
- if (match(RHS, m_ConstantInt<-1>())) {
- Index = LHS;
- return true; // Lower.
- }
-
- if (IsLoopInvariant(LHS)) {
- Index = RHS;
- Length = LHS;
- return true; // Upper.
- }
- return false;
-
- case ICmpInst::ICMP_ULT:
- std::swap(LHS, RHS);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_UGT:
- IsSigned = false;
- if (IsLoopInvariant(LHS)) {
- Index = RHS;
- Length = LHS;
- return true; // Both lower and upper.
- }
- return false;
- }
-
- llvm_unreachable("default clause returns!");
-}
-
-void InductiveRangeCheck::extractRangeChecksFromCond(
- Loop *L, ScalarEvolution &SE, Use &ConditionUse,
- SmallVectorImpl<InductiveRangeCheck> &Checks,
- SmallPtrSetImpl<Value *> &Visited) {
- Value *Condition = ConditionUse.get();
- if (!Visited.insert(Condition).second)
- return;
-
- // TODO: Do the same for OR, XOR, NOT etc?
- if (match(Condition, m_And(m_Value(), m_Value()))) {
- extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
- Checks, Visited);
- extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
- Checks, Visited);
- return;
- }
-
- ICmpInst *ICI = dyn_cast<ICmpInst>(Condition);
- if (!ICI)
- return;
-
- Value *Length = nullptr, *Index;
- bool IsSigned;
- if (!parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned))
- return;
-
- const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index));
- bool IsAffineIndex =
- IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
-
- if (!IsAffineIndex)
- return;
-
- const SCEV *End = nullptr;
- // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
- // We can potentially do much better here.
- if (Length)
- End = SE.getSCEV(Length);
- else {
- // So far we can only reach this point for a signed range check. This may
- // change in the future; in that case we will need to pick the unsigned max
- // for the unsigned range check.
- unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth();
- const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
- End = SIntMax;
- }
-
- InductiveRangeCheck IRC;
- IRC.End = End;
- IRC.Begin = IndexAddRec->getStart();
- IRC.Step = IndexAddRec->getStepRecurrence(SE);
- IRC.CheckUse = &ConditionUse;
- Checks.push_back(IRC);
-}
-
-void InductiveRangeCheck::extractRangeChecksFromBranch(
- BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
- SmallVectorImpl<InductiveRangeCheck> &Checks) {
- if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
- return;
-
- BranchProbability LikelyTaken(15, 16);
-
- if (!SkipProfitabilityChecks && BPI &&
- BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
- return;
-
- SmallPtrSet<Value *, 8> Visited;
- InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0),
- Checks, Visited);
-}
-
-// Add metadata to the loop L to disable loop optimizations. Callers need to
-// confirm that optimizing loop L is not beneficial.
-static void DisableAllLoopOptsOnLoop(Loop &L) {
- // We do not care about any existing loopID related metadata for L, since we
- // are setting all loop metadata to false.
- LLVMContext &Context = L.getHeader()->getContext();
- // Reserve first location for self reference to the LoopID metadata node.
- MDNode *Dummy = MDNode::get(Context, {});
- MDNode *DisableUnroll = MDNode::get(
- Context, {MDString::get(Context, "llvm.loop.unroll.disable")});
- Metadata *FalseVal =
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 0));
- MDNode *DisableVectorize = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.vectorize.enable"), FalseVal});
- MDNode *DisableLICMVersioning = MDNode::get(
- Context, {MDString::get(Context, "llvm.loop.licm_versioning.disable")});
- MDNode *DisableDistribution = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.distribute.enable"), FalseVal});
- MDNode *NewLoopID =
- MDNode::get(Context, {Dummy, DisableUnroll, DisableVectorize,
- DisableLICMVersioning, DisableDistribution});
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L.setLoopID(NewLoopID);
-}
-
-namespace {
-
-// Keeps track of the structure of a loop. This is similar to llvm::Loop,
-// except that it is more lightweight and can track the state of a loop through
-// changing and potentially invalid IR. This structure also formalizes the
-// kinds of loops we can deal with -- ones that have a single latch that is also
-// an exiting block *and* have a canonical induction variable.
-struct LoopStructure {
- const char *Tag = "";
-
- BasicBlock *Header = nullptr;
- BasicBlock *Latch = nullptr;
-
- // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
- // successor is `LatchExit', the exit block of the loop.
- BranchInst *LatchBr = nullptr;
- BasicBlock *LatchExit = nullptr;
- unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max();
-
- // The loop represented by this instance of LoopStructure is semantically
- // equivalent to:
- //
- // intN_ty inc = IndVarIncreasing ? 1 : -1;
- // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
- //
- // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase)
- // ... body ...
-
- Value *IndVarBase = nullptr;
- Value *IndVarStart = nullptr;
- Value *IndVarStep = nullptr;
- Value *LoopExitAt = nullptr;
- bool IndVarIncreasing = false;
- bool IsSignedPredicate = true;
-
- LoopStructure() = default;
-
- template <typename M> LoopStructure map(M Map) const {
- LoopStructure Result;
- Result.Tag = Tag;
- Result.Header = cast<BasicBlock>(Map(Header));
- Result.Latch = cast<BasicBlock>(Map(Latch));
- Result.LatchBr = cast<BranchInst>(Map(LatchBr));
- Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
- Result.LatchBrExitIdx = LatchBrExitIdx;
- Result.IndVarBase = Map(IndVarBase);
- Result.IndVarStart = Map(IndVarStart);
- Result.IndVarStep = Map(IndVarStep);
- Result.LoopExitAt = Map(LoopExitAt);
- Result.IndVarIncreasing = IndVarIncreasing;
- Result.IsSignedPredicate = IsSignedPredicate;
- return Result;
- }
-
+
+ bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
+};
+
+class IRCELegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ IRCELegacyPass() : FunctionPass(ID) {
+ initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char IRCELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
+ "Inductive range check elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
+ false, false)
+
+/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot
+/// be interpreted as a range check, return false and set `Index` and `Length`
+/// to `nullptr`. Otherwise set `Index` to the value being range checked, and
+/// set `Length` to the upper limit `Index` is being range checked against.
+bool
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length, bool &IsSigned) {
+ auto IsLoopInvariant = [&SE, L](Value *V) {
+ return SE.isLoopInvariant(SE.getSCEV(V), L);
+ };
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+
+ switch (Pred) {
+ default:
+ return false;
+
+ case ICmpInst::ICMP_SLE:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGE:
+ IsSigned = true;
+ if (match(RHS, m_ConstantInt<0>())) {
+ Index = LHS;
+ return true; // Lower.
+ }
+ return false;
+
+ case ICmpInst::ICMP_SLT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGT:
+ IsSigned = true;
+ if (match(RHS, m_ConstantInt<-1>())) {
+ Index = LHS;
+ return true; // Lower.
+ }
+
+ if (IsLoopInvariant(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return true; // Upper.
+ }
+ return false;
+
+ case ICmpInst::ICMP_ULT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_UGT:
+ IsSigned = false;
+ if (IsLoopInvariant(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return true; // Both lower and upper.
+ }
+ return false;
+ }
+
+ llvm_unreachable("default clause returns!");
+}
+
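Editor's note: the unsigned case above relies on the usual trick that one unsigned compare encodes both bounds. A concrete-integer sketch, assuming Length is non-negative (as array lengths are); the function name is ours.

    #include <cstdint>

    // "Index u< Length" is equivalent to "0 <= Index && Index < Length" in the
    // signed sense whenever Length >= 0, which is why the parser records it as
    // both a lower and an upper bound check.
    inline bool inBoundsUnsignedTrick(int64_t Index, int64_t Length) {
      return static_cast<uint64_t>(Index) < static_cast<uint64_t>(Length);
    }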
+void InductiveRangeCheck::extractRangeChecksFromCond(
+ Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
+ Value *Condition = ConditionUse.get();
+ if (!Visited.insert(Condition).second)
+ return;
+
+ // TODO: Do the same for OR, XOR, NOT etc?
+ if (match(Condition, m_And(m_Value(), m_Value()))) {
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
+ Checks, Visited);
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
+ Checks, Visited);
+ return;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(Condition);
+ if (!ICI)
+ return;
+
+ Value *Length = nullptr, *Index;
+ bool IsSigned;
+ if (!parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned))
+ return;
+
+ const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index));
+ bool IsAffineIndex =
+ IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+
+ if (!IsAffineIndex)
+ return;
+
+ const SCEV *End = nullptr;
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (Length)
+ End = SE.getSCEV(Length);
+ else {
+ // So far we can only reach this point for a signed range check. This may
+ // change in the future; in that case we will need to pick the unsigned max
+ // for the unsigned range check.
+ unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ End = SIntMax;
+ }
+
+ InductiveRangeCheck IRC;
+ IRC.End = End;
+ IRC.Begin = IndexAddRec->getStart();
+ IRC.Step = IndexAddRec->getStepRecurrence(SE);
+ IRC.CheckUse = &ConditionUse;
+ Checks.push_back(IRC);
+}
+
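Editor's note: a toy sketch of the recursion above on a hand-rolled condition tree (all names are ours). And nodes are split and both operands visited; everything else is treated as a leaf that the real pass would then try to parse as a range check. The Visited set that prevents re-walking shared sub-conditions is omitted here.

    #include <vector>

    struct Cond {
      bool IsAnd = false;
      const Cond *LHS = nullptr, *RHS = nullptr;  // used when IsAnd
      int LeafId = -1;                            // used for leaf comparisons
    };

    void collectLeafChecks(const Cond &C, std::vector<int> &Out) {
      if (C.IsAnd) {
        collectLeafChecks(*C.LHS, Out);
        collectLeafChecks(*C.RHS, Out);
        return;
      }
      Out.push_back(C.LeafId);
    }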
+void InductiveRangeCheck::extractRangeChecksFromBranch(
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks) {
+ if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+ return;
+
+ BranchProbability LikelyTaken(15, 16);
+
+ if (!SkipProfitabilityChecks && BPI &&
+ BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ return;
+
+ SmallPtrSet<Value *, 8> Visited;
+ InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0),
+ Checks, Visited);
+}
+
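Editor's note: the LikelyTaken threshold above is 15/16, i.e. the hot successor must be predicted taken at least 93.75% of the time. An integer-only sketch of the same comparison (our own helper, not the pass's API):

    // True when Taken/Total >= 15/16, written without floating point; the pass
    // bails out on the branch when this does not hold (unless profitability
    // checks are skipped).
    inline bool hotEdgeLikelyEnough(unsigned Taken, unsigned Total) {
      return static_cast<unsigned long long>(Taken) * 16 >=
             static_cast<unsigned long long>(Total) * 15;
    }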
+// Add metadata to the loop L to disable loop optimizations. Callers need to
+// confirm that optimizing loop L is not beneficial.
+static void DisableAllLoopOptsOnLoop(Loop &L) {
+ // We do not care about any existing loopID related metadata for L, since we
+ // are setting all loop metadata to false.
+ LLVMContext &Context = L.getHeader()->getContext();
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDNode *Dummy = MDNode::get(Context, {});
+ MDNode *DisableUnroll = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.unroll.disable")});
+ Metadata *FalseVal =
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 0));
+ MDNode *DisableVectorize = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.vectorize.enable"), FalseVal});
+ MDNode *DisableLICMVersioning = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.licm_versioning.disable")});
+ MDNode *DisableDistribution = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.distribute.enable"), FalseVal});
+ MDNode *NewLoopID =
+ MDNode::get(Context, {Dummy, DisableUnroll, DisableVectorize,
+ DisableLICMVersioning, DisableDistribution});
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L.setLoopID(NewLoopID);
+}
+
+namespace {
+
+// Keeps track of the structure of a loop. This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR. This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+ const char *Tag = "";
+
+ BasicBlock *Header = nullptr;
+ BasicBlock *Latch = nullptr;
+
+ // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
+ // successor is `LatchExit', the exit block of the loop.
+ BranchInst *LatchBr = nullptr;
+ BasicBlock *LatchExit = nullptr;
+ unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max();
+
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase)
+ // ... body ...
+
+ Value *IndVarBase = nullptr;
+ Value *IndVarStart = nullptr;
+ Value *IndVarStep = nullptr;
+ Value *LoopExitAt = nullptr;
+ bool IndVarIncreasing = false;
+ bool IsSignedPredicate = true;
+
+ LoopStructure() = default;
+
+ template <typename M> LoopStructure map(M Map) const {
+ LoopStructure Result;
+ Result.Tag = Tag;
+ Result.Header = cast<BasicBlock>(Map(Header));
+ Result.Latch = cast<BasicBlock>(Map(Latch));
+ Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+ Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarBase = Map(IndVarBase);
+ Result.IndVarStart = Map(IndVarStart);
+ Result.IndVarStep = Map(IndVarStep);
+ Result.LoopExitAt = Map(LoopExitAt);
+ Result.IndVarIncreasing = IndVarIncreasing;
+ Result.IsSignedPredicate = IsSignedPredicate;
+ return Result;
+ }
+
static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &, Loop &,
const char *&);
-};
-
-/// This class is used to constrain loops to run within a given iteration space.
-/// The algorithm this class implements is given a Loop and a range [Begin,
-/// End). The algorithm then tries to break a "main loop" out of the loop
-/// it is given in a way that the "main loop" runs with the induction variable
-/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
-/// loops to run any remaining iterations. The pre loop runs any iterations in
-/// which the induction variable is < Begin, and the post loop runs any
-/// iterations in which the induction variable is >= End.
-class LoopConstrainer {
- // The representation of a clone of the original loop we started out with.
- struct ClonedLoop {
- // The cloned blocks
- std::vector<BasicBlock *> Blocks;
-
- // `Map` maps values in the clonee into values in the cloned version
- ValueToValueMapTy Map;
-
- // An instance of `LoopStructure` for the cloned loop
- LoopStructure Structure;
- };
-
- // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
- // more details on what these fields mean.
- struct RewrittenRangeInfo {
- BasicBlock *PseudoExit = nullptr;
- BasicBlock *ExitSelector = nullptr;
- std::vector<PHINode *> PHIValuesAtPseudoExit;
- PHINode *IndVarEnd = nullptr;
-
- RewrittenRangeInfo() = default;
- };
-
- // Calculated subranges we restrict the iteration space of the main loop to.
- // See the implementation of `calculateSubRanges' for more details on how
- // these fields are computed. `LowLimit` is None if there is no restriction
- // on low end of the restricted iteration space of the main loop. `HighLimit`
- // is None if there is no restriction on high end of the restricted iteration
- // space of the main loop.
-
- struct SubRanges {
- Optional<const SCEV *> LowLimit;
- Optional<const SCEV *> HighLimit;
- };
-
- // Compute a safe set of limits for the main loop to run in -- effectively the
- // intersection of `Range' and the iteration space of the original loop.
- // Return None if unable to compute the set of subranges.
- Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const;
-
- // Clone `OriginalLoop' and return the result in CLResult. The IR after
- // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
- // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
- // but there is no such edge.
- void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
-
- // Create the appropriate loop structure needed to describe a cloned copy of
- // `Original`. The clone is described by `VM`.
- Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM, bool IsSubloop);
-
- // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
- // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
- // iteration space is not changed. `ExitLoopAt' is assumed to be slt
- // `OriginalHeaderCount'.
- //
- // If there are iterations left to execute, control is made to jump to
- // `ContinuationBlock', otherwise they take the normal loop exit. The
- // returned `RewrittenRangeInfo' object is populated as follows:
- //
- // .PseudoExit is a basic block that unconditionally branches to
- // `ContinuationBlock'.
- //
- // .ExitSelector is a basic block that decides, on exit from the loop,
- // whether to branch to the "true" exit or to `PseudoExit'.
- //
- // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
- // for each PHINode in the loop header on taking the pseudo exit.
- //
- // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
- // preheader because it is made to branch to the loop header only
- // conditionally.
- RewrittenRangeInfo
- changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
- Value *ExitLoopAt,
- BasicBlock *ContinuationBlock) const;
-
- // The loop denoted by `LS' has `OldPreheader' as its preheader. This
- // function creates a new preheader for `LS' and returns it.
- BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
- const char *Tag) const;
-
- // `ContinuationBlockAndPreheader' was the continuation block for some call to
- // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
- // This function rewrites the PHI nodes in `LS.Header' to start with the
- // correct value.
- void rewriteIncomingValuesForPHIs(
- LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
- const LoopConstrainer::RewrittenRangeInfo &RRI) const;
-
- // Even though we do not preserve any passes at this time, we at least need to
- // keep the parent loop structure consistent. The `LPPassManager' seems to
- // verify this after running a loop pass. This function adds the list of
- // blocks denoted by BBs to this loop's parent loop if required.
- void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
-
- // Some global state.
- Function &F;
- LLVMContext &Ctx;
- ScalarEvolution &SE;
- DominatorTree &DT;
- LoopInfo &LI;
- function_ref<void(Loop *, bool)> LPMAddNewLoop;
-
- // Information about the original loop we started out with.
- Loop &OriginalLoop;
-
- const SCEV *LatchTakenCount = nullptr;
- BasicBlock *OriginalPreheader = nullptr;
-
- // The preheader of the main loop. This may or may not be different from
- // `OriginalPreheader'.
- BasicBlock *MainLoopPreheader = nullptr;
-
- // The range we need to run the main loop in.
- InductiveRangeCheck::Range Range;
-
- // The structure of the main loop (see comment at the beginning of this class
- // for a definition)
- LoopStructure MainLoopStructure;
-
-public:
- LoopConstrainer(Loop &L, LoopInfo &LI,
- function_ref<void(Loop *, bool)> LPMAddNewLoop,
- const LoopStructure &LS, ScalarEvolution &SE,
- DominatorTree &DT, InductiveRangeCheck::Range R)
- : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
- SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L),
- Range(R), MainLoopStructure(LS) {}
-
- // Entry point for the algorithm. Returns true on success.
- bool run();
-};
-
-} // end anonymous namespace
-
-/// Given a loop with a decreasing induction variable, is it possible to
-/// safely calculate the bounds of a new loop using the given Predicate.
-static bool isSafeDecreasingBound(const SCEV *Start,
- const SCEV *BoundSCEV, const SCEV *Step,
- ICmpInst::Predicate Pred,
- unsigned LatchBrExitIdx,
- Loop *L, ScalarEvolution &SE) {
- if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
- Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
- return false;
-
- if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
- return false;
-
- assert(SE.isKnownNegative(Step) && "expecting negative step");
-
- LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n");
- LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
- LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
- LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
- LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
- << "\n");
- LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
-
- bool IsSigned = ICmpInst::isSigned(Pred);
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
-
- if (LatchBrExitIdx == 1)
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
-
- assert(LatchBrExitIdx == 0 &&
- "LatchBrExitIdx should be either 0 or 1");
-
- const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
- unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
- APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) :
- APInt::getMinValue(BitWidth);
- const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
-
- const SCEV *MinusOne =
- SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
-
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
- SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
-
-}
-
-/// Given a loop with an increasing induction variable, is it possible to
-/// safely calculate the bounds of a new loop using the given Predicate.
-static bool isSafeIncreasingBound(const SCEV *Start,
- const SCEV *BoundSCEV, const SCEV *Step,
- ICmpInst::Predicate Pred,
- unsigned LatchBrExitIdx,
- Loop *L, ScalarEvolution &SE) {
- if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
- Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
- return false;
-
- if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
- return false;
-
- LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n");
- LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
- LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
- LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
- LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
- << "\n");
- LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
-
- bool IsSigned = ICmpInst::isSigned(Pred);
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
-
- if (LatchBrExitIdx == 1)
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
-
- assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
-
- const SCEV *StepMinusOne =
- SE.getMinusSCEV(Step, SE.getOne(Step->getType()));
- unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
- APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) :
- APInt::getMaxValue(BitWidth);
- const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
-
- return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
- SE.getAddExpr(BoundSCEV, Step)) &&
- SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
-}
-
-Optional<LoopStructure>
+};
+
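Editor's note: a concrete instance of the loop shape the comment inside LoopStructure describes, for an increasing 32-bit induction variable with inc = 1 and a signed less-than latch predicate (names and body are ours):

    long sumIterationSpace(int IndVarStart, int LoopExitAt) {
      long Sum = 0;
      for (int IV = IndVarStart; IV < LoopExitAt; IV += 1)
        Sum += IV;                 // "... body ..."; IndVarBase here is IV + 1
      return Sum;
    }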
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End). The algorithm then tries to break a "main loop" out of the loop
+/// it is given in a way that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations. The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
+class LoopConstrainer {
+ // The representation of a clone of the original loop we started out with.
+ struct ClonedLoop {
+ // The cloned blocks
+ std::vector<BasicBlock *> Blocks;
+
+ // `Map` maps values in the clonee into values in the cloned version
+ ValueToValueMapTy Map;
+
+ // An instance of `LoopStructure` for the cloned loop
+ LoopStructure Structure;
+ };
+
+ // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
+ // more details on what these fields mean.
+ struct RewrittenRangeInfo {
+ BasicBlock *PseudoExit = nullptr;
+ BasicBlock *ExitSelector = nullptr;
+ std::vector<PHINode *> PHIValuesAtPseudoExit;
+ PHINode *IndVarEnd = nullptr;
+
+ RewrittenRangeInfo() = default;
+ };
+
+ // Calculated subranges we restrict the iteration space of the main loop to.
+ // See the implementation of `calculateSubRanges' for more details on how
+ // these fields are computed. `LowLimit` is None if there is no restriction
+ // on low end of the restricted iteration space of the main loop. `HighLimit`
+ // is None if there is no restriction on high end of the restricted iteration
+ // space of the main loop.
+
+ struct SubRanges {
+ Optional<const SCEV *> LowLimit;
+ Optional<const SCEV *> HighLimit;
+ };
+
+ // Compute a safe set of limits for the main loop to run in -- effectively the
+ // intersection of `Range' and the iteration space of the original loop.
+ // Return None if unable to compute the set of subranges.
+ Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const;
+
+ // Clone `OriginalLoop' and return the result in CLResult. The IR after
+ // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+ // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+ // but there is no such edge.
+ void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+ // Create the appropriate loop structure needed to describe a cloned copy of
+ // `Original`. The clone is described by `VM`.
+ Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM, bool IsSubloop);
+
+ // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+ // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
+ // iteration space is not changed. `ExitLoopAt' is assumed to be slt
+ // `OriginalHeaderCount'.
+ //
+ // If there are iterations left to execute, control is made to jump to
+ // `ContinuationBlock', otherwise they take the normal loop exit. The
+ // returned `RewrittenRangeInfo' object is populated as follows:
+ //
+ // .PseudoExit is a basic block that unconditionally branches to
+ // `ContinuationBlock'.
+ //
+ // .ExitSelector is a basic block that decides, on exit from the loop,
+ // whether to branch to the "true" exit or to `PseudoExit'.
+ //
+ // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+ // for each PHINode in the loop header on taking the pseudo exit.
+ //
+ // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+ // preheader because it is made to branch to the loop header only
+ // conditionally.
+ RewrittenRangeInfo
+ changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+ Value *ExitLoopAt,
+ BasicBlock *ContinuationBlock) const;
+
+ // The loop denoted by `LS' has `OldPreheader' as its preheader. This
+ // function creates a new preheader for `LS' and returns it.
+ BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+ const char *Tag) const;
+
+ // `ContinuationBlockAndPreheader' was the continuation block for some call to
+ // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+ // This function rewrites the PHI nodes in `LS.Header' to start with the
+ // correct value.
+ void rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+ // Even though we do not preserve any passes at this time, we at least need to
+ // keep the parent loop structure consistent. The `LPPassManager' seems to
+ // verify this after running a loop pass. This function adds the list of
+ // blocks denoted by BBs to this loop's parent loop if required.
+ void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+ // Some global state.
+ Function &F;
+ LLVMContext &Ctx;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ function_ref<void(Loop *, bool)> LPMAddNewLoop;
+
+ // Information about the original loop we started out with.
+ Loop &OriginalLoop;
+
+ const SCEV *LatchTakenCount = nullptr;
+ BasicBlock *OriginalPreheader = nullptr;
+
+ // The preheader of the main loop. This may or may not be different from
+ // `OriginalPreheader'.
+ BasicBlock *MainLoopPreheader = nullptr;
+
+ // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range;
+
+ // The structure of the main loop (see comment at the beginning of this class
+ // for a definition)
+ LoopStructure MainLoopStructure;
+
+public:
+ LoopConstrainer(Loop &L, LoopInfo &LI,
+ function_ref<void(Loop *, bool)> LPMAddNewLoop,
+ const LoopStructure &LS, ScalarEvolution &SE,
+ DominatorTree &DT, InductiveRangeCheck::Range R)
+ : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+ SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L),
+ Range(R), MainLoopStructure(LS) {}
+
+ // Entry point for the algorithm. Returns true on success.
+ bool run();
+};
+
+} // end anonymous namespace
+
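Editor's note: a simplified, signed, increasing-IV sketch of the control-flow shape LoopConstrainer aims for when constraining a loop to [Begin, End); body is a hypothetical stand-in for the original loop body with its checks.

    inline void body(int) {}   // hypothetical original loop body

    void constrainedShape(int Start, int Exit, int Begin, int End) {
      int i = Start;
      for (; i < Exit && i < Begin; ++i)   // pre loop:  iterations with i < Begin
        body(i);
      for (; i < Exit && i < End; ++i)     // main loop: range check is redundant
        body(i);
      for (; i < Exit; ++i)                // post loop: iterations with i >= End
        body(i);
    }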
+/// Given a loop with a decreasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeDecreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ assert(SE.isKnownNegative(Step) && "expecting negative step");
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 &&
+ "LatchBrExitIdx should be either 0 or 1");
+
+ const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
+
+ const SCEV *MinusOne =
+ SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
+
+}
+
+/// Given a loop with an increasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeIncreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
+
+ const SCEV *StepMinusOne =
+ SE.getMinusSCEV(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
+
+ return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
+ SE.getAddExpr(BoundSCEV, Step)) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
+}
+
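Editor's note: a worked 8-bit illustration of the Limit guard above for the signed, LatchBrExitIdx == 0 case. With Step = 1 the bound must stay strictly below INT8_MAX so that bound + step can still be formed without wrapping; this reading of the guard, and the helper name, are ours.

    #include <cstdint>

    // Mirrors "BoundSCEV <s Limit" with Limit = SINT_MAX - (Step - 1).
    inline bool boundLeavesRoomForStep(int8_t Bound, int8_t Step) {
      const int Limit = INT8_MAX - (Step - 1);   // 127 when Step == 1
      return Bound < Limit;                      // e.g. 126 passes, 127 fails
    }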
+Optional<LoopStructure>
LoopStructure::parseLoopStructure(ScalarEvolution &SE, Loop &L,
- const char *&FailureReason) {
- if (!L.isLoopSimplifyForm()) {
- FailureReason = "loop not in LoopSimplify form";
- return None;
- }
-
- BasicBlock *Latch = L.getLoopLatch();
- assert(Latch && "Simplified loops only have one latch!");
-
- if (Latch->getTerminator()->getMetadata(ClonedLoopTag)) {
- FailureReason = "loop has already been cloned";
- return None;
- }
-
- if (!L.isLoopExiting(Latch)) {
- FailureReason = "no loop latch";
- return None;
- }
-
- BasicBlock *Header = L.getHeader();
- BasicBlock *Preheader = L.getLoopPreheader();
- if (!Preheader) {
- FailureReason = "no preheader";
- return None;
- }
-
- BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBr || LatchBr->isUnconditional()) {
- FailureReason = "latch terminator not conditional branch";
- return None;
- }
-
- unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
-
- ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
- if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
- FailureReason = "latch terminator branch not conditional on integral icmp";
- return None;
- }
-
- const SCEV *LatchCount = SE.getExitCount(&L, Latch);
- if (isa<SCEVCouldNotCompute>(LatchCount)) {
- FailureReason = "could not compute latch count";
- return None;
- }
-
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *LeftValue = ICI->getOperand(0);
- const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
- IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
-
- Value *RightValue = ICI->getOperand(1);
- const SCEV *RightSCEV = SE.getSCEV(RightValue);
-
- // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
- if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
- if (isa<SCEVAddRecExpr>(RightSCEV)) {
- std::swap(LeftSCEV, RightSCEV);
- std::swap(LeftValue, RightValue);
- Pred = ICmpInst::getSwappedPredicate(Pred);
- } else {
- FailureReason = "no add recurrences in the icmp";
- return None;
- }
- }
-
- auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
- if (AR->getNoWrapFlags(SCEV::FlagNSW))
- return true;
-
- IntegerType *Ty = cast<IntegerType>(AR->getType());
- IntegerType *WideTy =
- IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
-
- const SCEVAddRecExpr *ExtendAfterOp =
- dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
- if (ExtendAfterOp) {
- const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
- const SCEV *ExtendedStep =
- SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
-
- bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
- ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
-
- if (NoSignedWrap)
- return true;
- }
-
- // We may have proved this when computing the sign extension above.
- return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
- };
-
- // `ICI` is interpreted as taking the backedge if the *next* value of the
- // induction variable satisfies some constraint.
-
- const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
- if (!IndVarBase->isAffine()) {
- FailureReason = "LHS in icmp not induction variable";
- return None;
- }
- const SCEV* StepRec = IndVarBase->getStepRecurrence(SE);
- if (!isa<SCEVConstant>(StepRec)) {
- FailureReason = "LHS in icmp not induction variable";
- return None;
- }
- ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue();
-
- if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) {
- FailureReason = "LHS in icmp needs nsw for equality predicates";
- return None;
- }
-
- assert(!StepCI->isZero() && "Zero step?");
- bool IsIncreasing = !StepCI->isNegative();
- bool IsSignedPredicate;
- const SCEV *StartNext = IndVarBase->getStart();
- const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
- const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
- const SCEV *Step = SE.getSCEV(StepCI);
-
- const SCEV *FixedRightSCEV = nullptr;
-
- // If RightValue resides within loop (but still being loop invariant),
- // regenerate it as preheader.
- if (auto *I = dyn_cast<Instruction>(RightValue))
- if (L.contains(I->getParent()))
- FixedRightSCEV = RightSCEV;
-
- if (IsIncreasing) {
- bool DecreasedRightValueByOne = false;
- if (StepCI->isOne()) {
- // Try to turn eq/ne predicates to those we can work with.
- if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
- // while (++i != len) { while (++i < len) {
- // ... ---> ...
- // } }
- // If both parts are known non-negative, it is profitable to use
- // unsigned comparison in increasing loop. This allows us to make the
- // comparison check against "RightSCEV + 1" more optimistic.
- if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
- isKnownNonNegativeInLoop(RightSCEV, &L, SE))
- Pred = ICmpInst::ICMP_ULT;
- else
- Pred = ICmpInst::ICMP_SLT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
- // while (true) { while (true) {
- // if (++i == len) ---> if (++i > len - 1)
- // break; break;
- // ... ...
- // } }
- if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
- cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/false)) {
- Pred = ICmpInst::ICMP_UGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV,
- SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
- } else if (cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/true)) {
- Pred = ICmpInst::ICMP_SGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV,
- SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
- }
- }
- }
-
- bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
- bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
- bool FoundExpectedPred =
- (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0);
-
- if (!FoundExpectedPred) {
- FailureReason = "expected icmp slt semantically, found something else";
- return None;
- }
-
- IsSignedPredicate = ICmpInst::isSigned(Pred);
- if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
- FailureReason = "unsigned latch conditions are explicitly prohibited";
- return None;
- }
-
- if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
- LatchBrExitIdx, &L, SE)) {
- FailureReason = "Unsafe loop bounds";
- return None;
- }
- if (LatchBrExitIdx == 0) {
- // We need to increase the right value unless we have already decreased
- // it virtually when we replaced EQ with SGT.
- if (!DecreasedRightValueByOne)
- FixedRightSCEV =
- SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- } else {
- assert(!DecreasedRightValueByOne &&
- "Right value can be decreased only for LatchBrExitIdx == 0!");
- }
- } else {
- bool IncreasedRightValueByOne = false;
- if (StepCI->isMinusOne()) {
- // Try to turn eq/ne predicates to those we can work with.
- if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
- // while (--i != len) { while (--i > len) {
- // ... ---> ...
- // } }
- // We intentionally don't turn the predicate into UGT even if we know
- // that both operands are non-negative, because it will only pessimize
- // our check against "RightSCEV - 1".
- Pred = ICmpInst::ICMP_SGT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
- // while (true) { while (true) {
- // if (--i == len) ---> if (--i < len + 1)
- // break; break;
- // ... ...
- // } }
- if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
- cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
- Pred = ICmpInst::ICMP_ULT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
- } else if (cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
- Pred = ICmpInst::ICMP_SLT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
- }
- }
- }
-
- bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
- bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
-
- bool FoundExpectedPred =
- (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0);
-
- if (!FoundExpectedPred) {
- FailureReason = "expected icmp sgt semantically, found something else";
- return None;
- }
-
- IsSignedPredicate =
- Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
-
- if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
- FailureReason = "unsigned latch conditions are explicitly prohibited";
- return None;
- }
-
- if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
- LatchBrExitIdx, &L, SE)) {
- FailureReason = "Unsafe bounds";
- return None;
- }
-
- if (LatchBrExitIdx == 0) {
- // We need to decrease the right value unless we have already increased
- // it virtually when we replaced EQ with SLT.
- if (!IncreasedRightValueByOne)
- FixedRightSCEV =
- SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
- } else {
- assert(!IncreasedRightValueByOne &&
- "Right value can be increased only for LatchBrExitIdx == 0!");
- }
- }
- BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
-
- assert(SE.getLoopDisposition(LatchCount, &L) ==
- ScalarEvolution::LoopInvariant &&
- "loop variant exit count doesn't make sense!");
-
- assert(!L.contains(LatchExit) && "expected an exit block!");
- const DataLayout &DL = Preheader->getModule()->getDataLayout();
- SCEVExpander Expander(SE, DL, "irce");
- Instruction *Ins = Preheader->getTerminator();
-
- if (FixedRightSCEV)
- RightValue =
- Expander.expandCodeFor(FixedRightSCEV, FixedRightSCEV->getType(), Ins);
-
- Value *IndVarStartV = Expander.expandCodeFor(IndVarStart, IndVarTy, Ins);
- IndVarStartV->setName("indvar.start");
-
- LoopStructure Result;
-
- Result.Tag = "main";
- Result.Header = Header;
- Result.Latch = Latch;
- Result.LatchBr = LatchBr;
- Result.LatchExit = LatchExit;
- Result.LatchBrExitIdx = LatchBrExitIdx;
- Result.IndVarStart = IndVarStartV;
- Result.IndVarStep = StepCI;
- Result.IndVarBase = LeftValue;
- Result.IndVarIncreasing = IsIncreasing;
- Result.LoopExitAt = RightValue;
- Result.IsSignedPredicate = IsSignedPredicate;
-
- FailureReason = nullptr;
-
- return Result;
-}
-
-/// If the type of \p S matches with \p Ty, return \p S. Otherwise, return
-/// signed or unsigned extension of \p S to type \p Ty.
-static const SCEV *NoopOrExtend(const SCEV *S, Type *Ty, ScalarEvolution &SE,
- bool Signed) {
- return Signed ? SE.getNoopOrSignExtend(S, Ty) : SE.getNoopOrZeroExtend(S, Ty);
-}
-
-Optional<LoopConstrainer::SubRanges>
-LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const {
- IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
-
- auto *RTy = cast<IntegerType>(Range.getType());
-
- // We only support wide range checks and narrow latches.
- if (!AllowNarrowLatchCondition && RTy != Ty)
- return None;
- if (RTy->getBitWidth() < Ty->getBitWidth())
- return None;
-
- LoopConstrainer::SubRanges Result;
-
- // I think we can be more aggressive here and make this nuw / nsw if the
- // addition that feeds into the icmp for the latch's terminating branch is nuw
- // / nsw. In any case, a wrapping 2's complement addition is safe.
- const SCEV *Start = NoopOrExtend(SE.getSCEV(MainLoopStructure.IndVarStart),
- RTy, SE, IsSignedPredicate);
- const SCEV *End = NoopOrExtend(SE.getSCEV(MainLoopStructure.LoopExitAt), RTy,
- SE, IsSignedPredicate);
-
- bool Increasing = MainLoopStructure.IndVarIncreasing;
-
- // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
- // [Smallest, GreatestSeen] is the range of values the induction variable
- // takes.
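- //
- // For an increasing induction variable running from Start to End, this is
- // simply [Start, End) with GreatestSeen = End - 1; the decreasing case
- // below needs extra care because of possible sign overflow.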
-
- const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
-
- const SCEV *One = SE.getOne(RTy);
- if (Increasing) {
- Smallest = Start;
- Greatest = End;
- // No overflow, because the range [Smallest, GreatestSeen] is not empty.
- GreatestSeen = SE.getMinusSCEV(End, One);
- } else {
- // These two computations may sign-overflow. Here is why that is okay:
- //
- // We know that the induction variable does not sign-overflow on any
- // iteration except the last one, and it starts at `Start` and ends at
- // `End`, decrementing by one every time.
- //
- // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
- // induction variable is decreasing we know that the smallest value
- // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
- //
- // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
- // that case, `Clamp` will always return `Smallest` and
- // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
- // will be an empty range. Returning an empty range is always safe.
-
- Smallest = SE.getAddExpr(End, One);
- Greatest = SE.getAddExpr(Start, One);
- GreatestSeen = Start;
- }
-
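- // Clamp `S` into the range [Smallest, Greatest], using signed or unsigned
- // min/max expressions to match the signedness of the latch predicate.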
- auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) {
- return IsSignedPredicate
- ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S))
- : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S));
- };
-
- // In some cases we can prove that we don't need a pre or post loop.
- ICmpInst::Predicate PredLE =
- IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
- ICmpInst::Predicate PredLT =
- IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
-
- bool ProvablyNoPreloop =
- SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest);
- if (!ProvablyNoPreloop)
- Result.LowLimit = Clamp(Range.getBegin());
-
- bool ProvablyNoPostLoop =
- SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd());
- if (!ProvablyNoPostLoop)
- Result.HighLimit = Clamp(Range.getEnd());
-
- return Result;
-}
-
-void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
- const char *Tag) const {
- for (BasicBlock *BB : OriginalLoop.getBlocks()) {
- BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
- Result.Blocks.push_back(Clone);
- Result.Map[BB] = Clone;
- }
-
- auto GetClonedValue = [&Result](Value *V) {
- assert(V && "null values not in domain!");
- auto It = Result.Map.find(V);
- if (It == Result.Map.end())
- return V;
- return static_cast<Value *>(It->second);
- };
-
- auto *ClonedLatch =
- cast<BasicBlock>(GetClonedValue(OriginalLoop.getLoopLatch()));
- ClonedLatch->getTerminator()->setMetadata(ClonedLoopTag,
- MDNode::get(Ctx, {}));
-
- Result.Structure = MainLoopStructure.map(GetClonedValue);
- Result.Structure.Tag = Tag;
-
- for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
- BasicBlock *ClonedBB = Result.Blocks[i];
- BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
-
- assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
-
- for (Instruction &I : *ClonedBB)
- RemapInstruction(&I, Result.Map,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Exit blocks will now have one more predecessor and their PHI nodes need
- // to be edited to reflect that. No phi nodes need to be introduced because
- // the loop is in LCSSA.
-
- for (auto *SBB : successors(OriginalBB)) {
- if (OriginalLoop.contains(SBB))
- continue; // not an exit block
-
- for (PHINode &PN : SBB->phis()) {
- Value *OldIncoming = PN.getIncomingValueForBlock(OriginalBB);
- PN.addIncoming(GetClonedValue(OldIncoming), ClonedBB);
- }
- }
- }
-}
-
-LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
- const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
- BasicBlock *ContinuationBlock) const {
- // We start with a loop with a single latch:
- //
- // +--------------------+
- // | |
- // | preheader |
- // | |
- // +--------+-----------+
- // | ----------------\
- // | / |
- // +--------v----v------+ |
- // | | |
- // | header | |
- // | | |
- // +--------------------+ |
- // |
- // ..... |
- // |
- // +--------------------+ |
- // | | |
- // | latch >----------/
- // | |
- // +-------v------------+
- // |
- // |
- // | +--------------------+
- // | | |
- // +---> original exit |
- // | |
- // +--------------------+
- //
- // We change the control flow to look like
- //
- //
- // +--------------------+
- // | |
- // | preheader >-------------------------+
- // | | |
- // +--------v-----------+ |
- // | /-------------+ |
- // | / | |
- // +--------v--v--------+ | |
- // | | | |
- // | header | | +--------+ |
- // | | | | | |
- // +--------------------+ | | +-----v-----v-----------+
- // | | | |
- // | | | .pseudo.exit |
- // | | | |
- // | | +-----------v-----------+
- // | | |
- // ..... | | |
- // | | +--------v-------------+
- // +--------------------+ | | | |
- // | | | | | ContinuationBlock |
- // | latch >------+ | | |
- // | | | +----------------------+
- // +---------v----------+ |
- // | |
- // | |
- // | +---------------^-----+
- // | | |
- // +-----> .exit.selector |
- // | |
- // +----------v----------+
- // |
- // +--------------------+ |
- // | | |
- // | original exit <----+
- // | |
- // +--------------------+
-
- RewrittenRangeInfo RRI;
-
- BasicBlock *BBInsertLocation = LS.Latch->getNextNode();
- RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
- &F, BBInsertLocation);
- RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
- BBInsertLocation);
-
- BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
- bool Increasing = LS.IndVarIncreasing;
- bool IsSignedPredicate = LS.IsSignedPredicate;
-
- IRBuilder<> B(PreheaderJump);
- auto *RangeTy = Range.getBegin()->getType();
- auto NoopOrExt = [&](Value *V) {
- if (V->getType() == RangeTy)
- return V;
- return IsSignedPredicate ? B.CreateSExt(V, RangeTy, "wide." + V->getName())
- : B.CreateZExt(V, RangeTy, "wide." + V->getName());
- };
-
- // EnterLoopCond - is it okay to start executing this `LS'?
- Value *EnterLoopCond = nullptr;
- auto Pred =
- Increasing
- ? (IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT)
- : (IsSignedPredicate ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
- Value *IndVarStart = NoopOrExt(LS.IndVarStart);
- EnterLoopCond = B.CreateICmp(Pred, IndVarStart, ExitSubloopAt);
-
- B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
- PreheaderJump->eraseFromParent();
-
- LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
- B.SetInsertPoint(LS.LatchBr);
- Value *IndVarBase = NoopOrExt(LS.IndVarBase);
- Value *TakeBackedgeLoopCond = B.CreateICmp(Pred, IndVarBase, ExitSubloopAt);
-
- Value *CondForBranch = LS.LatchBrExitIdx == 1
- ? TakeBackedgeLoopCond
- : B.CreateNot(TakeBackedgeLoopCond);
-
- LS.LatchBr->setCondition(CondForBranch);
-
- B.SetInsertPoint(RRI.ExitSelector);
-
- // IterationsLeft - are there any more iterations left, given the original
- // upper bound on the induction variable? If not, we branch to the "real"
- // exit.
- Value *LoopExitAt = NoopOrExt(LS.LoopExitAt);
- Value *IterationsLeft = B.CreateICmp(Pred, IndVarBase, LoopExitAt);
- B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
-
- BranchInst *BranchToContinuation =
- BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
-
- // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
- // each of the PHI nodes in the loop header. This feeds into the initial
- // value of the same PHI nodes if/when we continue execution.
- for (PHINode &PN : LS.Header->phis()) {
- PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
- BranchToContinuation);
-
- NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
- NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
- RRI.ExitSelector);
- RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
- }
-
- RRI.IndVarEnd = PHINode::Create(IndVarBase->getType(), 2, "indvar.end",
- BranchToContinuation);
- RRI.IndVarEnd->addIncoming(IndVarStart, Preheader);
- RRI.IndVarEnd->addIncoming(IndVarBase, RRI.ExitSelector);
-
- // The latch exit now has a branch from `RRI.ExitSelector' instead of
- // `LS.Latch'. The PHI nodes need to be updated to reflect that.
- LS.LatchExit->replacePhiUsesWith(LS.Latch, RRI.ExitSelector);
-
- return RRI;
-}
-
-void LoopConstrainer::rewriteIncomingValuesForPHIs(
- LoopStructure &LS, BasicBlock *ContinuationBlock,
- const LoopConstrainer::RewrittenRangeInfo &RRI) const {
- unsigned PHIIndex = 0;
- for (PHINode &PN : LS.Header->phis())
- PN.setIncomingValueForBlock(ContinuationBlock,
- RRI.PHIValuesAtPseudoExit[PHIIndex++]);
-
- LS.IndVarStart = RRI.IndVarEnd;
-}
-
-BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
- BasicBlock *OldPreheader,
- const char *Tag) const {
- BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
- BranchInst::Create(LS.Header, Preheader);
-
- LS.Header->replacePhiUsesWith(OldPreheader, Preheader);
-
- return Preheader;
-}
-
-void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
- Loop *ParentLoop = OriginalLoop.getParentLoop();
- if (!ParentLoop)
- return;
-
- for (BasicBlock *BB : BBs)
- ParentLoop->addBasicBlockToLoop(BB, LI);
-}
-
-Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM,
- bool IsSubloop) {
- Loop &New = *LI.AllocateLoop();
- if (Parent)
- Parent->addChildLoop(&New);
- else
- LI.addTopLevelLoop(&New);
- LPMAddNewLoop(&New, IsSubloop);
-
- // Add all of the blocks in Original to the new loop.
- for (auto *BB : Original->blocks())
- if (LI.getLoopFor(BB) == Original)
- New.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), LI);
-
- // Add all of the subloops to the new loop.
- for (Loop *SubLoop : *Original)
- createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);
-
- return &New;
-}
-
-bool LoopConstrainer::run() {
- BasicBlock *Preheader = nullptr;
- LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
- Preheader = OriginalLoop.getLoopPreheader();
- assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
- "preconditions!");
-
- OriginalPreheader = Preheader;
- MainLoopPreheader = Preheader;
-
- bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
- Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
- if (!MaybeSR.hasValue()) {
- LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
- return false;
- }
-
- SubRanges SR = MaybeSR.getValue();
- bool Increasing = MainLoopStructure.IndVarIncreasing;
- IntegerType *IVTy =
- cast<IntegerType>(Range.getBegin()->getType());
-
- SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
- Instruction *InsertPt = OriginalPreheader->getTerminator();
-
- // It would have been better to make `PreLoop' and `PostLoop'
- // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
- // constructor.
- ClonedLoop PreLoop, PostLoop;
- bool NeedsPreLoop =
- Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
- bool NeedsPostLoop =
- Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
-
- Value *ExitPreLoopAt = nullptr;
- Value *ExitMainLoopAt = nullptr;
- const SCEVConstant *MinusOneS =
- cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
-
- if (NeedsPreLoop) {
- const SCEV *ExitPreLoopAtSCEV = nullptr;
-
- if (Increasing)
- ExitPreLoopAtSCEV = *SR.LowLimit;
- else if (cannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
- IsSignedPredicate))
- ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
- else {
- LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "preloop exit limit. HighLimit = "
- << *(*SR.HighLimit) << "\n");
- return false;
- }
-
- if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
- LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " preloop exit limit " << *ExitPreLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName()
- << "\n");
- return false;
- }
-
- ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
- ExitPreLoopAt->setName("exit.preloop.at");
- }
-
- if (NeedsPostLoop) {
- const SCEV *ExitMainLoopAtSCEV = nullptr;
-
- if (Increasing)
- ExitMainLoopAtSCEV = *SR.HighLimit;
- else if (cannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
- IsSignedPredicate))
- ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
- else {
- LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "mainloop exit limit. LowLimit = "
- << *(*SR.LowLimit) << "\n");
- return false;
- }
-
- if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
- LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " main loop exit limit " << *ExitMainLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName()
- << "\n");
- return false;
- }
-
- ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
- ExitMainLoopAt->setName("exit.mainloop.at");
- }
-
- // We clone these ahead of time so that we don't have to deal with changing
- // and temporarily invalid IR as we transform the loops.
- if (NeedsPreLoop)
- cloneLoop(PreLoop, "preloop");
- if (NeedsPostLoop)
- cloneLoop(PostLoop, "postloop");
-
- RewrittenRangeInfo PreLoopRRI;
-
- if (NeedsPreLoop) {
- Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
- PreLoop.Structure.Header);
-
- MainLoopPreheader =
- createPreheader(MainLoopStructure, Preheader, "mainloop");
- PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
- ExitPreLoopAt, MainLoopPreheader);
- rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
- PreLoopRRI);
- }
-
- BasicBlock *PostLoopPreheader = nullptr;
- RewrittenRangeInfo PostLoopRRI;
-
- if (NeedsPostLoop) {
- PostLoopPreheader =
- createPreheader(PostLoop.Structure, Preheader, "postloop");
- PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
- ExitMainLoopAt, PostLoopPreheader);
- rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
- PostLoopRRI);
- }
-
- BasicBlock *NewMainLoopPreheader =
- MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
- BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
- PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
- PostLoopRRI.ExitSelector, NewMainLoopPreheader};
-
- // Some of the above may be nullptr, filter them out before passing to
- // addToParentLoopIfNeeded.
- auto NewBlocksEnd =
- std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
-
- addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
-
- DT.recalculate(F);
-
- // We need to first add all the pre and post loop blocks into the loop
- // structures (as part of createClonedLoopStructure), and then update the
- // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
- // LI when LoopSimplifyForm is generated.
- Loop *PreL = nullptr, *PostL = nullptr;
- if (!PreLoop.Blocks.empty()) {
- PreL = createClonedLoopStructure(&OriginalLoop,
- OriginalLoop.getParentLoop(), PreLoop.Map,
- /* IsSubLoop */ false);
- }
-
- if (!PostLoop.Blocks.empty()) {
- PostL =
- createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
- PostLoop.Map, /* IsSubLoop */ false);
- }
-
- // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
- auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
- formLCSSARecursively(*L, DT, &LI, &SE);
- simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr, true);
- // Pre/post loops are slow paths; we do not need to perform any loop
- // optimizations on them.
- if (!IsOriginalLoop)
- DisableAllLoopOptsOnLoop(*L);
- };
- if (PreL)
- CanonicalizeLoop(PreL, false);
- if (PostL)
- CanonicalizeLoop(PostL, false);
- CanonicalizeLoop(&OriginalLoop, true);
-
- return true;
-}
-
-/// Computes and returns a range of values for the induction variable (IndVar)
-/// in which the range check can be safely elided. If it cannot compute such a
-/// range, returns None.
-Optional<InductiveRangeCheck::Range>
-InductiveRangeCheck::computeSafeIterationSpace(
- ScalarEvolution &SE, const SCEVAddRecExpr *IndVar,
- bool IsLatchSigned) const {
- // We can deal with the case when the types of the latch check and the range
- // checks don't match, as long as the latch check is narrower.
- auto *IVType = cast<IntegerType>(IndVar->getType());
- auto *RCType = cast<IntegerType>(getBegin()->getType());
- if (IVType->getBitWidth() > RCType->getBitWidth())
- return None;
- // IndVar is of the form "A + B * I" (where "I" is the canonical induction
- // variable, that may or may not exist as a real llvm::Value in the loop) and
- // this inductive range check is a range check on the "C + D * I" ("C" is
- // getBegin() and "D" is getStep()). We rewrite the value being range
- // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
- //
- // The actual inequalities we solve are of the form
- //
- // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
- //
- // Here L stands for upper limit of the safe iteration space.
- // The inequality is satisfied by (0 - M) <= IndVar < (L - M). To avoid
- // overflows when calculating (0 - M) and (L - M) we, depending on type of
- // IV's iteration space, limit the calculations by borders of the iteration
- // space. For example, if IndVar is unsigned, (0 - M) overflows for any M > 0.
- // If we figured out that "anything greater than (-M) is safe", we strengthen
- // this to "everything greater than 0 is safe", assuming that values between
- // -M and 0 just do not exist in unsigned iteration space, and we don't want
- // to deal with overflown values.
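- //
- // For example, if IndVar is {5,+,1} and the range check tests {2,+,1}
- // against an upper bound L, then M = 2 - 5 = -3, and the check passes
- // exactly when 3 <= IndVar < L + 3 (before any clamping is applied).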
-
- if (!IndVar->isAffine())
- return None;
-
- const SCEV *A = NoopOrExtend(IndVar->getStart(), RCType, SE, IsLatchSigned);
- const SCEVConstant *B = dyn_cast<SCEVConstant>(
- NoopOrExtend(IndVar->getStepRecurrence(SE), RCType, SE, IsLatchSigned));
- if (!B)
- return None;
- assert(!B->isZero() && "Recurrence with zero step?");
-
- const SCEV *C = getBegin();
- const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep());
- if (D != B)
- return None;
-
- assert(!D->getValue()->isZero() && "Recurrence with zero step?");
- unsigned BitWidth = RCType->getBitWidth();
- const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
-
- // Subtract Y from X so that it does not go through the border of the IV
- // iteration space. Mathematically, it is equivalent to:
- //
- // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
- //
- // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
- // any width of bit grid). But after we take min/max, the result is
- // guaranteed to be within [INT_MIN, INT_MAX].
- //
- // In [1], INT_MAX and INT_MIN are the signed or unsigned max/min values,
- // depending on the type of latch condition that defines the IV iteration
- // space.
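- //
- // For example, in a signed i8 iteration space ClampedSubtract(100, -100)
- // is min(max(100 - (-100), -128), 127) = min(max(200, -128), 127) = 127.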
- auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
- // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
- // This is required to ensure that SINT_MAX - X does not overflow signed and
- // that X - Y does not overflow unsigned if Y is negative. Can we lift this
- // restriction and make it work for negative X as well?
- if (IsLatchSigned) {
- // X is a number from signed range, Y is interpreted as signed.
- // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
- // thing we should care about is that we didn't cross SINT_MAX.
- // So, if Y is positive, we subtract Y safely.
- // Rule 1: Y > 0 ---> Y.
- // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
- // Rule 2: Y >=s (X - SINT_MAX) ---> Y.
- // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
- // Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
- // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
- const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
- return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
- SCEV::FlagNSW);
- } else
- // X is a number from unsigned range, Y is interpreted as signed.
- // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
- // thing we should care about is that we didn't cross zero.
- // So, if Y is negative, we subtract Y safely.
- // Rule 1: Y <s 0 ---> Y.
- // If 0 <= Y <= X, we subtract Y safely.
- // Rule 2: Y <=s X ---> Y.
- // If 0 <= X < Y, we should stop at 0 and can only subtract X.
- // Rule 3: Y >s X ---> X.
- // It gives us smin(X, Y) to subtract in all cases.
- return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
- };
- const SCEV *M = SE.getMinusSCEV(C, A);
- const SCEV *Zero = SE.getZero(M->getType());
-
- // This function returns a SCEV equal to 1 if X is non-negative, 0 otherwise.
- auto SCEVCheckNonNegative = [&](const SCEV *X) {
- const Loop *L = IndVar->getLoop();
- const SCEV *One = SE.getOne(X->getType());
- // Can we trivially prove that X is a non-negative or negative value?
- if (isKnownNonNegativeInLoop(X, L, SE))
- return One;
- else if (isKnownNegativeInLoop(X, L, SE))
- return Zero;
- // If not, we will have to figure it out during the execution.
- // The expression smax(smin(X, 0), -1) + 1 equals 1 if X >= 0 and 0 if X < 0.
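- // For instance, X = 7 gives smax(smin(7, 0), -1) + 1 = smax(0, -1) + 1 = 1,
- // while X = -4 gives smax(smin(-4, 0), -1) + 1 = smax(-4, -1) + 1 = 0.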
- const SCEV *NegOne = SE.getNegativeSCEV(One);
- return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
- };
- // FIXME: Current implementation of ClampedSubtract implicitly assumes that
- // X is non-negative (in sense of a signed value). We need to re-implement
- // this function in a way that it will correctly handle negative X as well.
- // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
- // end up with a negative X and produce wrong results. So currently we ensure
- // that if getEnd() is negative then both ends of the safe range are zero.
- // Note that this may pessimize elimination of unsigned range checks against
- // negative values.
- const SCEV *REnd = getEnd();
- const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
-
- const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
- const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);
- return InductiveRangeCheck::Range(Begin, End);
-}
-
-static Optional<InductiveRangeCheck::Range>
-IntersectSignedRange(ScalarEvolution &SE,
- const Optional<InductiveRangeCheck::Range> &R1,
- const InductiveRangeCheck::Range &R2) {
- if (R2.isEmpty(SE, /* IsSigned */ true))
- return None;
- if (!R1.hasValue())
- return R2;
- auto &R1Value = R1.getValue();
- // We never return empty ranges from this function, and R1 is supposed to be
- // a result of intersection. Thus, R1 is never empty.
- assert(!R1Value.isEmpty(SE, /* IsSigned */ true) &&
- "We should never have empty R1!");
-
- // TODO: we could widen the smaller range and have this work; but for now we
- // bail out to keep things simple.
- if (R1Value.getType() != R2.getType())
- return None;
-
- const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
- const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
-
- // If the resulting range is empty, just return None.
- auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
- if (Ret.isEmpty(SE, /* IsSigned */ true))
- return None;
- return Ret;
-}
-
-static Optional<InductiveRangeCheck::Range>
-IntersectUnsignedRange(ScalarEvolution &SE,
- const Optional<InductiveRangeCheck::Range> &R1,
- const InductiveRangeCheck::Range &R2) {
- if (R2.isEmpty(SE, /* IsSigned */ false))
- return None;
- if (!R1.hasValue())
- return R2;
- auto &R1Value = R1.getValue();
- // We never return empty ranges from this function, and R1 is supposed to be
- // a result of intersection. Thus, R1 is never empty.
- assert(!R1Value.isEmpty(SE, /* IsSigned */ false) &&
- "We should never have empty R1!");
-
- // TODO: we could widen the smaller range and have this work; but for now we
- // bail out to keep things simple.
- if (R1Value.getType() != R2.getType())
- return None;
-
- const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin());
- const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd());
-
- // If the resulting range is empty, just return None.
- auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
- if (Ret.isEmpty(SE, /* IsSigned */ false))
- return None;
- return Ret;
-}
-
-PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F);
- LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
-
+ const char *&FailureReason) {
+ if (!L.isLoopSimplifyForm()) {
+ FailureReason = "loop not in LoopSimplify form";
+ return None;
+ }
+
+ BasicBlock *Latch = L.getLoopLatch();
+ assert(Latch && "Simplified loops only have one latch!");
+
+ if (Latch->getTerminator()->getMetadata(ClonedLoopTag)) {
+ FailureReason = "loop has already been cloned";
+ return None;
+ }
+
+ if (!L.isLoopExiting(Latch)) {
+ FailureReason = "no loop latch";
+ return None;
+ }
+
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ FailureReason = "no preheader";
+ return None;
+ }
+
+ BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ FailureReason = "latch terminator not conditional branch";
+ return None;
+ }
+
+ unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+ if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+ FailureReason = "latch terminator branch not conditional on integral icmp";
+ return None;
+ }
+
+ const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchCount)) {
+ FailureReason = "could not compute latch count";
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LeftValue = ICI->getOperand(0);
+ const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+ IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+ Value *RightValue = ICI->getOperand(1);
+ const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+ // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ std::swap(LeftValue, RightValue);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else {
+ FailureReason = "no add recurrences in the icmp";
+ return None;
+ }
+ }
+
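+ // Returns true if `AR` provably does not wrap in the signed sense: either
+ // its nsw flag is already set, or sign-extending the whole recurrence to a
+ // doubly-wide type yields the recurrence of the sign-extended start and
+ // step.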
+ auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
+ if (AR->getNoWrapFlags(SCEV::FlagNSW))
+ return true;
+
+ IntegerType *Ty = cast<IntegerType>(AR->getType());
+ IntegerType *WideTy =
+ IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+ const SCEVAddRecExpr *ExtendAfterOp =
+ dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+ if (ExtendAfterOp) {
+ const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+ const SCEV *ExtendedStep =
+ SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+ bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+ ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+ if (NoSignedWrap)
+ return true;
+ }
+
+ // We may have proved this when computing the sign extension above.
+ return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
+ };
+
+ // `ICI` is interpreted as taking the backedge if the *next* value of the
+ // induction variable satisfies some constraint.
+
+ const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
+ if (!IndVarBase->isAffine()) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+ const SCEV *StepRec = IndVarBase->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(StepRec)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+ ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue();
+
+ if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) {
+ FailureReason = "LHS in icmp needs nsw for equality predicates";
+ return None;
+ }
+
+ assert(!StepCI->isZero() && "Zero step?");
+ bool IsIncreasing = !StepCI->isNegative();
+ bool IsSignedPredicate;
+ const SCEV *StartNext = IndVarBase->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+ const SCEV *Step = SE.getSCEV(StepCI);
+
+ const SCEV *FixedRightSCEV = nullptr;
+
+ // If RightValue resides within the loop (while still being loop invariant),
+ // regenerate it in the preheader.
+ if (auto *I = dyn_cast<Instruction>(RightValue))
+ if (L.contains(I->getParent()))
+ FixedRightSCEV = RightSCEV;
+
+ if (IsIncreasing) {
+ bool DecreasedRightValueByOne = false;
+ if (StepCI->isOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (++i != len) { while (++i < len) {
+ // ... ---> ...
+ // } }
+ // If both parts are known non-negative, it is profitable to use
+ // unsigned comparison in an increasing loop. This allows us to make the
+ // comparison check against "RightSCEV + 1" more optimistic.
+ if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
+ isKnownNonNegativeInLoop(RightSCEV, &L, SE))
+ Pred = ICmpInst::ICMP_ULT;
+ else
+ Pred = ICmpInst::ICMP_SLT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
+ // while (true) { while (true) {
+ // if (++i == len) ---> if (++i > len - 1)
+ // break; break;
+ // ... ...
+ // } }
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/false)) {
+ Pred = ICmpInst::ICMP_UGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ } else if (cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/true)) {
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
+ }
+ }
+
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
+ bool FoundExpectedPred =
+ (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp slt semantically, found something else";
+ return None;
+ }
+
+ IsSignedPredicate = ICmpInst::isSigned(Pred);
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe loop bounds";
+ return None;
+ }
+ if (LatchBrExitIdx == 0) {
+ // We need to increase the right value unless we have already decreased
+ // it virtually when we replaced EQ with SGT.
+ if (!DecreasedRightValueByOne)
+ FixedRightSCEV =
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ } else {
+ assert(!DecreasedRightValueByOne &&
+ "Right value can be decreased only for LatchBrExitIdx == 0!");
+ }
+ } else {
+ bool IncreasedRightValueByOne = false;
+ if (StepCI->isMinusOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (--i != len) { while (--i > len) {
+ // ... ---> ...
+ // } }
+ // We intentionally don't turn the predicate into UGT even if we know
+ // that both operands are non-negative, because it will only pessimize
+ // our check against "RightSCEV - 1".
+ Pred = ICmpInst::ICMP_SGT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
+ // while (true) { while (true) {
+ // if (--i == len) ---> if (--i < len + 1)
+ // break; break;
+ // ... ...
+ // } }
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
+ Pred = ICmpInst::ICMP_ULT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ } else if (cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
+ }
+ }
+
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
+
+ bool FoundExpectedPred =
+ (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp sgt semantically, found something else";
+ return None;
+ }
+
+ IsSignedPredicate =
+ Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
+
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe bounds";
+ return None;
+ }
+
+ if (LatchBrExitIdx == 0) {
+ // We need to decrease the right value unless we have already increased
+ // it virtually when we replaced EQ with SLT.
+ if (!IncreasedRightValueByOne)
+ FixedRightSCEV =
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+ } else {
+ assert(!IncreasedRightValueByOne &&
+ "Right value can be increased only for LatchBrExitIdx == 0!");
+ }
+ }
+ BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+ assert(SE.getLoopDisposition(LatchCount, &L) ==
+ ScalarEvolution::LoopInvariant &&
+ "loop variant exit count doesn't make sense!");
+
+ assert(!L.contains(LatchExit) && "expected an exit block!");
+ const DataLayout &DL = Preheader->getModule()->getDataLayout();
+ SCEVExpander Expander(SE, DL, "irce");
+ Instruction *Ins = Preheader->getTerminator();
+
+ if (FixedRightSCEV)
+ RightValue =
+ Expander.expandCodeFor(FixedRightSCEV, FixedRightSCEV->getType(), Ins);
+
+ Value *IndVarStartV = Expander.expandCodeFor(IndVarStart, IndVarTy, Ins);
+ IndVarStartV->setName("indvar.start");
+
+ LoopStructure Result;
+
+ Result.Tag = "main";
+ Result.Header = Header;
+ Result.Latch = Latch;
+ Result.LatchBr = LatchBr;
+ Result.LatchExit = LatchExit;
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarStart = IndVarStartV;
+ Result.IndVarStep = StepCI;
+ Result.IndVarBase = LeftValue;
+ Result.IndVarIncreasing = IsIncreasing;
+ Result.LoopExitAt = RightValue;
+ Result.IsSignedPredicate = IsSignedPredicate;
+
+ FailureReason = nullptr;
+
+ return Result;
+}
+
+/// If the type of \p S matches with \p Ty, return \p S. Otherwise, return
+/// signed or unsigned extension of \p S to type \p Ty.
+static const SCEV *NoopOrExtend(const SCEV *S, Type *Ty, ScalarEvolution &SE,
+ bool Signed) {
+ return Signed ? SE.getNoopOrSignExtend(S, Ty) : SE.getNoopOrZeroExtend(S, Ty);
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const {
+ IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+ auto *RTy = cast<IntegerType>(Range.getType());
+
+ // We only support wide range checks and narrow latches.
+ if (!AllowNarrowLatchCondition && RTy != Ty)
+ return None;
+ if (RTy->getBitWidth() < Ty->getBitWidth())
+ return None;
+
+ LoopConstrainer::SubRanges Result;
+
+ // I think we can be more aggressive here and make this nuw / nsw if the
+ // addition that feeds into the icmp for the latch's terminating branch is nuw
+ // / nsw. In any case, a wrapping 2's complement addition is safe.
+ const SCEV *Start = NoopOrExtend(SE.getSCEV(MainLoopStructure.IndVarStart),
+ RTy, SE, IsSignedPredicate);
+ const SCEV *End = NoopOrExtend(SE.getSCEV(MainLoopStructure.LoopExitAt), RTy,
+ SE, IsSignedPredicate);
+
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
+ // [Smallest, GreatestSeen] is the range of values the induction variable
+ // takes.
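+ //
+ // For an increasing induction variable running from Start to End, this is
+ // simply [Start, End) with GreatestSeen = End - 1; the decreasing case
+ // below needs extra care because of possible sign overflow.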
+
+ const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
+
+ const SCEV *One = SE.getOne(RTy);
+ if (Increasing) {
+ Smallest = Start;
+ Greatest = End;
+ // No overflow, because the range [Smallest, GreatestSeen] is not empty.
+ GreatestSeen = SE.getMinusSCEV(End, One);
+ } else {
+ // These two computations may sign-overflow. Here is why that is okay:
+ //
+ // We know that the induction variable does not sign-overflow on any
+ // iteration except the last one, and it starts at `Start` and ends at
+ // `End`, decrementing by one every time.
+ //
+ // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
+ // induction variable is decreasing we know that the smallest value
+ // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+ //
+ // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
+ // that case, `Clamp` will always return `Smallest` and
+ // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+ // will be an empty range. Returning an empty range is always safe.
+
+ Smallest = SE.getAddExpr(End, One);
+ Greatest = SE.getAddExpr(Start, One);
+ GreatestSeen = Start;
+ }
+
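+ // Clamp `S` into the range [Smallest, Greatest], using signed or unsigned
+ // min/max expressions to match the signedness of the latch predicate.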
+ auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) {
+ return IsSignedPredicate
+ ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S))
+ : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S));
+ };
+
+ // In some cases we can prove that we don't need a pre or post loop.
+ ICmpInst::Predicate PredLE =
+ IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
+ ICmpInst::Predicate PredLT =
+ IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+
+ bool ProvablyNoPreloop =
+ SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest);
+ if (!ProvablyNoPreloop)
+ Result.LowLimit = Clamp(Range.getBegin());
+
+ bool ProvablyNoPostLoop =
+ SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd());
+ if (!ProvablyNoPostLoop)
+ Result.HighLimit = Clamp(Range.getEnd());
+
+ return Result;
+}
+
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+ const char *Tag) const {
+ for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+ BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+ Result.Blocks.push_back(Clone);
+ Result.Map[BB] = Clone;
+ }
+
+ auto GetClonedValue = [&Result](Value *V) {
+ assert(V && "null values not in domain!");
+ auto It = Result.Map.find(V);
+ if (It == Result.Map.end())
+ return V;
+ return static_cast<Value *>(It->second);
+ };
+
+ auto *ClonedLatch =
+ cast<BasicBlock>(GetClonedValue(OriginalLoop.getLoopLatch()));
+ ClonedLatch->getTerminator()->setMetadata(ClonedLoopTag,
+ MDNode::get(Ctx, {}));
+
+ Result.Structure = MainLoopStructure.map(GetClonedValue);
+ Result.Structure.Tag = Tag;
+
+ for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+ BasicBlock *ClonedBB = Result.Blocks[i];
+ BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+ assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+
+ for (Instruction &I : *ClonedBB)
+ RemapInstruction(&I, Result.Map,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Exit blocks will now have one more predecessor and their PHI nodes need
+ // to be edited to reflect that. No phi nodes need to be introduced because
+ // the loop is in LCSSA.
+
+ for (auto *SBB : successors(OriginalBB)) {
+ if (OriginalLoop.contains(SBB))
+ continue; // not an exit block
+
+ for (PHINode &PN : SBB->phis()) {
+ Value *OldIncoming = PN.getIncomingValueForBlock(OriginalBB);
+ PN.addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+ }
+ }
+ }
+}
+
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+ const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+ BasicBlock *ContinuationBlock) const {
+ // We start with a loop with a single latch:
+ //
+ // +--------------------+
+ // | |
+ // | preheader |
+ // | |
+ // +--------+-----------+
+ // | ----------------\
+ // | / |
+ // +--------v----v------+ |
+ // | | |
+ // | header | |
+ // | | |
+ // +--------------------+ |
+ // |
+ // ..... |
+ // |
+ // +--------------------+ |
+ // | | |
+ // | latch >----------/
+ // | |
+ // +-------v------------+
+ // |
+ // |
+ // | +--------------------+
+ // | | |
+ // +---> original exit |
+ // | |
+ // +--------------------+
+ //
+ // We change the control flow to look like
+ //
+ //
+ // +--------------------+
+ // | |
+ // | preheader >-------------------------+
+ // | | |
+ // +--------v-----------+ |
+ // | /-------------+ |
+ // | / | |
+ // +--------v--v--------+ | |
+ // | | | |
+ // | header | | +--------+ |
+ // | | | | | |
+ // +--------------------+ | | +-----v-----v-----------+
+ // | | | |
+ // | | | .pseudo.exit |
+ // | | | |
+ // | | +-----------v-----------+
+ // | | |
+ // ..... | | |
+ // | | +--------v-------------+
+ // +--------------------+ | | | |
+ // | | | | | ContinuationBlock |
+ // | latch >------+ | | |
+ // | | | +----------------------+
+ // +---------v----------+ |
+ // | |
+ // | |
+ // | +---------------^-----+
+ // | | |
+ // +-----> .exit.selector |
+ // | |
+ // +----------v----------+
+ // |
+ // +--------------------+ |
+ // | | |
+ // | original exit <----+
+ // | |
+ // +--------------------+
+
+ RewrittenRangeInfo RRI;
+
+ BasicBlock *BBInsertLocation = LS.Latch->getNextNode();
+ RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+ &F, BBInsertLocation);
+ RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+ BBInsertLocation);
+
+ BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
+ bool Increasing = LS.IndVarIncreasing;
+ bool IsSignedPredicate = LS.IsSignedPredicate;
+
+ IRBuilder<> B(PreheaderJump);
+ auto *RangeTy = Range.getBegin()->getType();
+ auto NoopOrExt = [&](Value *V) {
+ if (V->getType() == RangeTy)
+ return V;
+ return IsSignedPredicate ? B.CreateSExt(V, RangeTy, "wide." + V->getName())
+ : B.CreateZExt(V, RangeTy, "wide." + V->getName());
+ };
+
+ // EnterLoopCond - is it okay to start executing this `LS'?
+ Value *EnterLoopCond = nullptr;
+ auto Pred =
+ Increasing
+ ? (IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT)
+ : (IsSignedPredicate ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
+ Value *IndVarStart = NoopOrExt(LS.IndVarStart);
+ EnterLoopCond = B.CreateICmp(Pred, IndVarStart, ExitSubloopAt);
+
+ B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+ PreheaderJump->eraseFromParent();
+
+ LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+ B.SetInsertPoint(LS.LatchBr);
+ Value *IndVarBase = NoopOrExt(LS.IndVarBase);
+ Value *TakeBackedgeLoopCond = B.CreateICmp(Pred, IndVarBase, ExitSubloopAt);
+
+ Value *CondForBranch = LS.LatchBrExitIdx == 1
+ ? TakeBackedgeLoopCond
+ : B.CreateNot(TakeBackedgeLoopCond);
+
+ LS.LatchBr->setCondition(CondForBranch);
+
+ B.SetInsertPoint(RRI.ExitSelector);
+
+ // IterationsLeft - are there any more iterations left, given the original
+ // upper bound on the induction variable? If not, we branch to the "real"
+ // exit.
+ Value *LoopExitAt = NoopOrExt(LS.LoopExitAt);
+ Value *IterationsLeft = B.CreateICmp(Pred, IndVarBase, LoopExitAt);
+ B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+
+ BranchInst *BranchToContinuation =
+ BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+ // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+ // each of the PHI nodes in the loop header. This feeds into the initial
+ // value of the same PHI nodes if/when we continue execution.
+ for (PHINode &PN : LS.Header->phis()) {
+ PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
+ BranchToContinuation);
+
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
+ RRI.ExitSelector);
+ RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+ }
+
+ RRI.IndVarEnd = PHINode::Create(IndVarBase->getType(), 2, "indvar.end",
+ BranchToContinuation);
+ RRI.IndVarEnd->addIncoming(IndVarStart, Preheader);
+ RRI.IndVarEnd->addIncoming(IndVarBase, RRI.ExitSelector);
+
+ // The latch exit now has a branch from `RRI.ExitSelector' instead of
+ // `LS.Latch'. The PHI nodes need to be updated to reflect that.
+ LS.LatchExit->replacePhiUsesWith(LS.Latch, RRI.ExitSelector);
+
+ return RRI;
+}
+
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlock,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+ unsigned PHIIndex = 0;
+ for (PHINode &PN : LS.Header->phis())
+ PN.setIncomingValueForBlock(ContinuationBlock,
+ RRI.PHIValuesAtPseudoExit[PHIIndex++]);
+
+ LS.IndVarStart = RRI.IndVarEnd;
+}
+
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+ BasicBlock *OldPreheader,
+ const char *Tag) const {
+ BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+ BranchInst::Create(LS.Header, Preheader);
+
+ LS.Header->replacePhiUsesWith(OldPreheader, Preheader);
+
+ return Preheader;
+}
+
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+ Loop *ParentLoop = OriginalLoop.getParentLoop();
+ if (!ParentLoop)
+ return;
+
+ for (BasicBlock *BB : BBs)
+ ParentLoop->addBasicBlockToLoop(BB, LI);
+}
+
+Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM,
+ bool IsSubloop) {
+ Loop &New = *LI.AllocateLoop();
+ if (Parent)
+ Parent->addChildLoop(&New);
+ else
+ LI.addTopLevelLoop(&New);
+ LPMAddNewLoop(&New, IsSubloop);
+
+ // Add all of the blocks in Original to the new loop.
+ for (auto *BB : Original->blocks())
+ if (LI.getLoopFor(BB) == Original)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *SubLoop : *Original)
+ createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);
+
+ return &New;
+}
+
+bool LoopConstrainer::run() {
+ BasicBlock *Preheader = nullptr;
+ LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+ Preheader = OriginalLoop.getLoopPreheader();
+ assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+ "preconditions!");
+
+ OriginalPreheader = Preheader;
+ MainLoopPreheader = Preheader;
+
+ bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
+ Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
+ if (!MaybeSR.hasValue()) {
+ LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
+ return false;
+ }
+
+ SubRanges SR = MaybeSR.getValue();
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+ IntegerType *IVTy =
+ cast<IntegerType>(Range.getBegin()->getType());
+
+ SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
+ Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+ // It would have been better to make `PreLoop' and `PostLoop'
+ // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+ // constructor.
+ ClonedLoop PreLoop, PostLoop;
+ bool NeedsPreLoop =
+ Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+ bool NeedsPostLoop =
+ Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+
+ Value *ExitPreLoopAt = nullptr;
+ Value *ExitMainLoopAt = nullptr;
+ const SCEVConstant *MinusOneS =
+ cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+ if (NeedsPreLoop) {
+ const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitPreLoopAtSCEV = *SR.LowLimit;
+ else if (cannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = "
+ << *(*SR.HighLimit) << "\n");
+ return false;
+ }
+
+ if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " preloop exit limit " << *ExitPreLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
+ return false;
+ }
+
+ ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+ ExitPreLoopAt->setName("exit.preloop.at");
+ }
+
+ if (NeedsPostLoop) {
+ const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitMainLoopAtSCEV = *SR.HighLimit;
+ else if (cannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = "
+ << *(*SR.LowLimit) << "\n");
+ return false;
+ }
+
+ if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " main loop exit limit " << *ExitMainLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
+ return false;
+ }
+
+ ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+ ExitMainLoopAt->setName("exit.mainloop.at");
+ }
+
+ // We clone these ahead of time so that we don't have to deal with changing
+ // and temporarily invalid IR as we transform the loops.
+ if (NeedsPreLoop)
+ cloneLoop(PreLoop, "preloop");
+ if (NeedsPostLoop)
+ cloneLoop(PostLoop, "postloop");
+
+ RewrittenRangeInfo PreLoopRRI;
+
+ if (NeedsPreLoop) {
+ Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+ PreLoop.Structure.Header);
+
+ MainLoopPreheader =
+ createPreheader(MainLoopStructure, Preheader, "mainloop");
+ PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+ ExitPreLoopAt, MainLoopPreheader);
+ rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+ PreLoopRRI);
+ }
+
+ BasicBlock *PostLoopPreheader = nullptr;
+ RewrittenRangeInfo PostLoopRRI;
+
+ if (NeedsPostLoop) {
+ PostLoopPreheader =
+ createPreheader(PostLoop.Structure, Preheader, "postloop");
+ PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+ ExitMainLoopAt, PostLoopPreheader);
+ rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+ PostLoopRRI);
+ }
+
+ BasicBlock *NewMainLoopPreheader =
+ MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+ BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
+ PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
+ PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+ // Some of the above may be nullptr, filter them out before passing to
+ // addToParentLoopIfNeeded.
+ auto NewBlocksEnd =
+ std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+ addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+
+ DT.recalculate(F);
+
+ // We need to first add all the pre and post loop blocks into the loop
+ // structures (as part of createClonedLoopStructure), and then update the
+ // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
+ // LI when LoopSimplifyForm is generated.
+ Loop *PreL = nullptr, *PostL = nullptr;
+ if (!PreLoop.Blocks.empty()) {
+ PreL = createClonedLoopStructure(&OriginalLoop,
+ OriginalLoop.getParentLoop(), PreLoop.Map,
+ /* IsSubLoop */ false);
+ }
+
+ if (!PostLoop.Blocks.empty()) {
+ PostL =
+ createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
+ PostLoop.Map, /* IsSubLoop */ false);
+ }
+
+ // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
+ auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
+ formLCSSARecursively(*L, DT, &LI, &SE);
+ simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr, true);
+ // Pre/post loops are slow paths, we do not need to perform any loop
+ // optimizations on them.
+ if (!IsOriginalLoop)
+ DisableAllLoopOptsOnLoop(*L);
+ };
+ if (PreL)
+ CanonicalizeLoop(PreL, false);
+ if (PostL)
+ CanonicalizeLoop(PostL, false);
+ CanonicalizeLoop(&OriginalLoop, true);
+
+ return true;
+}
+
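To make the effect of the split concrete, here is a small illustrative sketch in plain C++ (not code from this diff; the array, bounds, and names are invented). The main loop covers only the iterations where the range check provably passes; a cloned post-loop keeps the check for the rest. Only a post-loop is shown because the safe range here starts at 0.

#include <algorithm>
#include <cstdio>

// Hypothetical original loop: every iteration pays for the range check.
void original(const int *a, int n, int len) {
  for (int i = 0; i < n; ++i) {
    if (i < 0 || i >= len)        // inductive range check
      return;                     // slow path (throw/deoptimize in real code)
    std::printf("%d\n", a[i]);
  }
}

// IRCE-style split: the main loop is check-free up to exit.mainloop.at,
// and the cloned post-loop keeps the check for the remaining iterations.
void constrained(const int *a, int n, int len) {
  int exit_mainloop_at = std::min(n, std::max(len, 0));
  int i = 0;
  for (; i < exit_mainloop_at; ++i)  // main loop: check elided
    std::printf("%d\n", a[i]);
  for (; i < n; ++i) {               // post-loop: slow path preserved
    if (i < 0 || i >= len)
      return;
    std::printf("%d\n", a[i]);
  }
}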
+/// Computes and returns a range of values for the induction variable (IndVar)
+/// in which the range check can be safely elided. If it cannot compute such a
+/// range, returns None.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(
+ ScalarEvolution &SE, const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const {
+  // We can handle the case when the types of the latch check and the range
+  // checks don't match, as long as the latch check type is narrower.
+ auto *IVType = cast<IntegerType>(IndVar->getType());
+ auto *RCType = cast<IntegerType>(getBegin()->getType());
+ if (IVType->getBitWidth() > RCType->getBitWidth())
+ return None;
+ // IndVar is of the form "A + B * I" (where "I" is the canonical induction
+ // variable, that may or may not exist as a real llvm::Value in the loop) and
+ // this inductive range check is a range check on the "C + D * I" ("C" is
+ // getBegin() and "D" is getStep()). We rewrite the value being range
+ // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
+ //
+ // The actual inequalities we solve are of the form
+ //
+ // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
+ //
+ // Here L stands for upper limit of the safe iteration space.
+ // The inequality is satisfied by (0 - M) <= IndVar < (L - M). To avoid
+ // overflows when calculating (0 - M) and (L - M) we, depending on type of
+ // IV's iteration space, limit the calculations by borders of the iteration
+ // space. For example, if IndVar is unsigned, (0 - M) overflows for any M > 0.
+ // If we figured out that "anything greater than (-M) is safe", we strengthen
+ // this to "everything greater than 0 is safe", assuming that values between
+ // -M and 0 just do not exist in unsigned iteration space, and we don't want
+ // to deal with overflown values.
+
+ if (!IndVar->isAffine())
+ return None;
+
+ const SCEV *A = NoopOrExtend(IndVar->getStart(), RCType, SE, IsLatchSigned);
+ const SCEVConstant *B = dyn_cast<SCEVConstant>(
+ NoopOrExtend(IndVar->getStepRecurrence(SE), RCType, SE, IsLatchSigned));
+ if (!B)
+ return None;
+ assert(!B->isZero() && "Recurrence with zero step?");
+
+ const SCEV *C = getBegin();
+ const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep());
+ if (D != B)
+ return None;
+
+ assert(!D->getValue()->isZero() && "Recurrence with zero step?");
+ unsigned BitWidth = RCType->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+
+ // Subtract Y from X so that it does not go through border of the IV
+ // iteration space. Mathematically, it is equivalent to:
+ //
+ // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
+ //
+ // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
+ // any width of bit grid). But after we take min/max, the result is
+ // guaranteed to be within [INT_MIN, INT_MAX].
+ //
+ // In [1], INT_MAX and INT_MIN are respectively signed and unsigned max/min
+ // values, depending on type of latch condition that defines IV iteration
+ // space.
+ auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
+ // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
+ // This is required to ensure that SINT_MAX - X does not overflow signed and
+ // that X - Y does not overflow unsigned if Y is negative. Can we lift this
+    // restriction and make it work for negative X as well?
+ if (IsLatchSigned) {
+ // X is a number from signed range, Y is interpreted as signed.
+ // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
+ // thing we should care about is that we didn't cross SINT_MAX.
+ // So, if Y is positive, we subtract Y safely.
+ // Rule 1: Y > 0 ---> Y.
+ // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
+ // Rule 2: Y >=s (X - SINT_MAX) ---> Y.
+ // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
+ // Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
+ // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
+ const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
+ return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
+ SCEV::FlagNSW);
+ } else
+ // X is a number from unsigned range, Y is interpreted as signed.
+ // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
+ // thing we should care about is that we didn't cross zero.
+ // So, if Y is negative, we subtract Y safely.
+ // Rule 1: Y <s 0 ---> Y.
+ // If 0 <= Y <= X, we subtract Y safely.
+ // Rule 2: Y <=s X ---> Y.
+ // If 0 <= X < Y, we should stop at 0 and can only subtract X.
+ // Rule 3: Y >s X ---> X.
+ // It gives us smin(X, Y) to subtract in all cases.
+ return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
+ };
+ const SCEV *M = SE.getMinusSCEV(C, A);
+ const SCEV *Zero = SE.getZero(M->getType());
+
+  // This function returns a SCEV equal to 1 if X is non-negative, 0 otherwise.
+ auto SCEVCheckNonNegative = [&](const SCEV *X) {
+ const Loop *L = IndVar->getLoop();
+ const SCEV *One = SE.getOne(X->getType());
+ // Can we trivially prove that X is a non-negative or negative value?
+ if (isKnownNonNegativeInLoop(X, L, SE))
+ return One;
+ else if (isKnownNegativeInLoop(X, L, SE))
+ return Zero;
+ // If not, we will have to figure it out during the execution.
+ // Function smax(smin(X, 0), -1) + 1 equals to 1 if X >= 0 and 0 if X < 0.
+ const SCEV *NegOne = SE.getNegativeSCEV(One);
+ return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
+ };
+ // FIXME: Current implementation of ClampedSubtract implicitly assumes that
+ // X is non-negative (in sense of a signed value). We need to re-implement
+ // this function in a way that it will correctly handle negative X as well.
+ // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
+ // end up with a negative X and produce wrong results. So currently we ensure
+ // that if getEnd() is negative then both ends of the safe range are zero.
+ // Note that this may pessimize elimination of unsigned range checks against
+ // negative values.
+ const SCEV *REnd = getEnd();
+ const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
+
+ const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
+ const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);
+ return InductiveRangeCheck::Range(Begin, End);
+}
+
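The ClampedSubtract formula [1] above is easy to check on concrete numbers. Below is a self-contained sketch of the signed case (illustrative only): it performs the mathematical subtraction in a wider type and then clamps to the 32-bit signed range.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Mathematical X - Y for 32-bit signed inputs, clamped to the 32-bit range,
// i.e. min(max(X - Y, INT_MIN), INT_MAX) from formula [1].
int32_t clampedSubtractSigned(int32_t X, int32_t Y) {
  int64_t Wide = static_cast<int64_t>(X) - static_cast<int64_t>(Y);
  Wide = std::clamp<int64_t>(Wide, INT32_MIN, INT32_MAX);
  return static_cast<int32_t>(Wide);
}

int main() {
  assert(clampedSubtractSigned(10, 3) == 7);
  // 10 - INT32_MIN overflows mathematically; the result saturates at INT32_MAX.
  assert(clampedSubtractSigned(10, INT32_MIN) == INT32_MAX);
}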
+static Optional<InductiveRangeCheck::Range>
+IntersectSignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ true))
+ return None;
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ true) &&
+ "We should never have empty R1!");
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ true))
+ return None;
+ return Ret;
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectUnsignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ false) &&
+ "We should never have empty R1!");
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ return Ret;
+}
+
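Both intersection helpers reduce to the same max-of-begins, min-of-ends pattern and differ only in using signed or unsigned comparisons. On plain unsigned half-open ranges the idea looks like this (a hedged sketch with invented types, not the SCEV-based code above):

#include <algorithm>
#include <cstdint>
#include <optional>

struct Range { uint64_t Begin, End; };   // half-open [Begin, End)

// Returns the intersection, or std::nullopt if it is empty, mirroring the
// "return None on an empty result" convention used above.
std::optional<Range> intersect(const Range &A, const Range &B) {
  Range R{std::max(A.Begin, B.Begin), std::min(A.End, B.End)};
  if (R.Begin >= R.End)
    return std::nullopt;
  return R;
}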
+PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F);
+ LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+
// Get BFI analysis result on demand. Please note that modification of
// CFG invalidates this analysis and we should handle it.
auto getBFI = [&F, &AM ]()->BlockFrequencyInfo & {
return AM.getResult<BlockFrequencyAnalysis>(F);
};
InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI, { getBFI });
-
- bool Changed = false;
+
+ bool Changed = false;
{
bool CFGChanged = false;
for (const auto &L : LI) {
@@ -1784,65 +1784,65 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
}
Changed |= CFGChanged;
-
+
if (CFGChanged && !SkipProfitabilityChecks)
AM.invalidate<BlockFrequencyAnalysis>(F);
- }
-
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
- auto LPMAddNewLoop = [&Worklist](Loop *NL, bool IsSubloop) {
- if (!IsSubloop)
- appendLoopsToWorklist(*NL, Worklist);
- };
-
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
+ }
+
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ auto LPMAddNewLoop = [&Worklist](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ appendLoopsToWorklist(*NL, Worklist);
+ };
+
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
if (IRCE.run(L, LPMAddNewLoop)) {
Changed = true;
if (!SkipProfitabilityChecks)
AM.invalidate<BlockFrequencyAnalysis>(F);
}
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
-}
-
-bool IRCELegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
-
- bool Changed = false;
-
- for (const auto &L : LI) {
- Changed |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
- /*PreserveLCSSA=*/false);
- Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
- }
-
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
- auto LPMAddNewLoop = [&](Loop *NL, bool IsSubloop) {
- if (!IsSubloop)
- appendLoopsToWorklist(*NL, Worklist);
- };
-
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
- Changed |= IRCE.run(L, LPMAddNewLoop);
- }
- return Changed;
-}
-
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return getLoopPassPreservedAnalyses();
+}
+
+bool IRCELegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+
+ bool Changed = false;
+
+ for (const auto &L : LI) {
+ Changed |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
+ /*PreserveLCSSA=*/false);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ auto LPMAddNewLoop = [&](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ appendLoopsToWorklist(*NL, Worklist);
+ };
+
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+ Changed |= IRCE.run(L, LPMAddNewLoop);
+ }
+ return Changed;
+}
+
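Both entry points above drive the transform with the same worklist idiom: seed the worklist with every existing loop and let IRCE push newly created top-level loops back for another visit. Stripped of the LLVM types, the pattern is roughly the following (illustrative sketch with a stand-in Loop type):

#include <functional>
#include <vector>

struct Loop { /* stand-in for llvm::Loop */ };

bool processAllLoops(
    std::vector<Loop *> TopLevelLoops,
    const std::function<bool(Loop *, std::function<void(Loop *, bool)>)> &RunOnLoop) {
  std::vector<Loop *> Worklist(TopLevelLoops);   // seed with existing loops
  auto AddNewLoop = [&Worklist](Loop *NL, bool IsSubloop) {
    if (!IsSubloop)                              // only top-level clones are revisited
      Worklist.push_back(NL);
  };
  bool Changed = false;
  while (!Worklist.empty()) {
    Loop *L = Worklist.back();
    Worklist.pop_back();
    Changed |= RunOnLoop(L, AddNewLoop);         // may append loops via AddNewLoop
  }
  return Changed;
}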
bool
InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L,
LoopStructure &LS) {
@@ -1874,118 +1874,118 @@ InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L,
return true;
}
-bool InductiveRangeCheckElimination::run(
- Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
- if (L->getBlocks().size() >= LoopSizeCutoff) {
- LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
- return false;
- }
-
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) {
- LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
- return false;
- }
-
- LLVMContext &Context = Preheader->getContext();
- SmallVector<InductiveRangeCheck, 16> RangeChecks;
-
- for (auto BBI : L->getBlocks())
- if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
- InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
- RangeChecks);
-
- if (RangeChecks.empty())
- return false;
-
- auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
- OS << "irce: looking at loop "; L->print(OS);
- OS << "irce: loop has " << RangeChecks.size()
- << " inductive range checks: \n";
- for (InductiveRangeCheck &IRC : RangeChecks)
- IRC.print(OS);
- };
-
- LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));
-
- if (PrintRangeChecks)
- PrintRecognizedRangeChecks(errs());
-
- const char *FailureReason = nullptr;
- Optional<LoopStructure> MaybeLoopStructure =
+bool InductiveRangeCheckElimination::run(
+ Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
+ if (L->getBlocks().size() >= LoopSizeCutoff) {
+ LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
+ return false;
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ return false;
+ }
+
+ LLVMContext &Context = Preheader->getContext();
+ SmallVector<InductiveRangeCheck, 16> RangeChecks;
+
+ for (auto BBI : L->getBlocks())
+ if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+ InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
+ RangeChecks);
+
+ if (RangeChecks.empty())
+ return false;
+
+ auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
+ OS << "irce: looking at loop "; L->print(OS);
+ OS << "irce: loop has " << RangeChecks.size()
+ << " inductive range checks: \n";
+ for (InductiveRangeCheck &IRC : RangeChecks)
+ IRC.print(OS);
+ };
+
+ LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));
+
+ if (PrintRangeChecks)
+ PrintRecognizedRangeChecks(errs());
+
+ const char *FailureReason = nullptr;
+ Optional<LoopStructure> MaybeLoopStructure =
LoopStructure::parseLoopStructure(SE, *L, FailureReason);
- if (!MaybeLoopStructure.hasValue()) {
- LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
- << FailureReason << "\n";);
- return false;
- }
- LoopStructure LS = MaybeLoopStructure.getValue();
+ if (!MaybeLoopStructure.hasValue()) {
+ LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
+ << FailureReason << "\n";);
+ return false;
+ }
+ LoopStructure LS = MaybeLoopStructure.getValue();
if (!isProfitableToTransform(*L, LS))
return false;
- const SCEVAddRecExpr *IndVar =
- cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep)));
-
- Optional<InductiveRangeCheck::Range> SafeIterRange;
- Instruction *ExprInsertPt = Preheader->getTerminator();
-
- SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
-  // Based on the type of the latch predicate, we interpret the IV iteration range
- // as signed or unsigned range. We use different min/max functions (signed or
- // unsigned) when intersecting this range with safe iteration ranges implied
- // by range checks.
- auto IntersectRange =
- LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange;
-
- IRBuilder<> B(ExprInsertPt);
- for (InductiveRangeCheck &IRC : RangeChecks) {
- auto Result = IRC.computeSafeIterationSpace(SE, IndVar,
- LS.IsSignedPredicate);
- if (Result.hasValue()) {
- auto MaybeSafeIterRange =
- IntersectRange(SE, SafeIterRange, Result.getValue());
- if (MaybeSafeIterRange.hasValue()) {
- assert(
- !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) &&
- "We should never return empty ranges!");
- RangeChecksToEliminate.push_back(IRC);
- SafeIterRange = MaybeSafeIterRange.getValue();
- }
- }
- }
-
- if (!SafeIterRange.hasValue())
- return false;
-
- LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
- SafeIterRange.getValue());
- bool Changed = LC.run();
-
- if (Changed) {
- auto PrintConstrainedLoopInfo = [L]() {
- dbgs() << "irce: in function ";
- dbgs() << L->getHeader()->getParent()->getName() << ": ";
- dbgs() << "constrained ";
- L->print(dbgs());
- };
-
- LLVM_DEBUG(PrintConstrainedLoopInfo());
-
- if (PrintChangedLoops)
- PrintConstrainedLoopInfo();
-
- // Optimize away the now-redundant range checks.
-
- for (InductiveRangeCheck &IRC : RangeChecksToEliminate) {
- ConstantInt *FoldedRangeCheck = IRC.getPassingDirection()
- ? ConstantInt::getTrue(Context)
- : ConstantInt::getFalse(Context);
- IRC.getCheckUse()->set(FoldedRangeCheck);
- }
- }
-
- return Changed;
-}
-
-Pass *llvm::createInductiveRangeCheckEliminationPass() {
- return new IRCELegacyPass();
-}
+ const SCEVAddRecExpr *IndVar =
+ cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep)));
+
+ Optional<InductiveRangeCheck::Range> SafeIterRange;
+ Instruction *ExprInsertPt = Preheader->getTerminator();
+
+ SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
+  // Based on the type of the latch predicate, we interpret the IV iteration range
+ // as signed or unsigned range. We use different min/max functions (signed or
+ // unsigned) when intersecting this range with safe iteration ranges implied
+ // by range checks.
+ auto IntersectRange =
+ LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange;
+
+ IRBuilder<> B(ExprInsertPt);
+ for (InductiveRangeCheck &IRC : RangeChecks) {
+ auto Result = IRC.computeSafeIterationSpace(SE, IndVar,
+ LS.IsSignedPredicate);
+ if (Result.hasValue()) {
+ auto MaybeSafeIterRange =
+ IntersectRange(SE, SafeIterRange, Result.getValue());
+ if (MaybeSafeIterRange.hasValue()) {
+ assert(
+ !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) &&
+ "We should never return empty ranges!");
+ RangeChecksToEliminate.push_back(IRC);
+ SafeIterRange = MaybeSafeIterRange.getValue();
+ }
+ }
+ }
+
+ if (!SafeIterRange.hasValue())
+ return false;
+
+ LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
+ SafeIterRange.getValue());
+ bool Changed = LC.run();
+
+ if (Changed) {
+ auto PrintConstrainedLoopInfo = [L]() {
+ dbgs() << "irce: in function ";
+ dbgs() << L->getHeader()->getParent()->getName() << ": ";
+ dbgs() << "constrained ";
+ L->print(dbgs());
+ };
+
+ LLVM_DEBUG(PrintConstrainedLoopInfo());
+
+ if (PrintChangedLoops)
+ PrintConstrainedLoopInfo();
+
+ // Optimize away the now-redundant range checks.
+
+ for (InductiveRangeCheck &IRC : RangeChecksToEliminate) {
+ ConstantInt *FoldedRangeCheck = IRC.getPassingDirection()
+ ? ConstantInt::getTrue(Context)
+ : ConstantInt::getFalse(Context);
+ IRC.getCheckUse()->set(FoldedRangeCheck);
+ }
+ }
+
+ return Changed;
+}
+
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+ return new IRCELegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp
index d8df431486..332eb10ac1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -1,171 +1,171 @@
-//===- InferAddressSpace.cpp - --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// CUDA C/C++ includes memory space designation as variable type qualifiers (such
-// as __global__ and __shared__). Knowing the space of a memory access allows
-// CUDA compilers to emit faster PTX loads and stores. For example, a load from
-// shared memory can be translated to `ld.shared` which is roughly 10% faster
-// than a generic `ld` on an NVIDIA Tesla K40c.
-//
-// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
-// compilers must infer the memory space of an address expression from
-// type-qualified variables.
-//
-// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
-// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
-// places only type-qualified variables in specific address spaces, and then
-// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
-// (so-called the generic address space) for other instructions to use.
-//
-// For example, the Clang translates the following CUDA code
-// __shared__ float a[10];
-// float v = a[i];
-// to
-// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
-// %v = load float, float* %1 ; emits ld.f32
-// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
-// redirected to %0 (the generic version of @a).
-//
-// The optimization implemented in this file propagates specific address spaces
-// from type-qualified variable declarations to its users. For example, it
-// optimizes the above IR to
-// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
-// propagating the addrspace(3) from @a to %1. As the result, the NVPTX
-// codegen is able to emit ld.shared.f32 for %v.
-//
-// Address space inference works in two steps. First, it uses a data-flow
-// analysis to infer as many generic pointers as possible to point to only one
-// specific address space. In the above example, it can prove that %1 only
-// points to addrspace(3). This algorithm was published in
-// CUDA: Compiling and optimizing for a GPU platform
-// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
-// ICCS 2012
-//
-// Then, address space inference replaces all refinable generic pointers with
-// equivalent specific pointers.
-//
-// The major challenge of implementing this optimization is handling PHINodes,
-// which may create loops in the data flow graph. This brings two complications.
-//
-// First, the data flow analysis in Step 1 needs to be circular. For example,
-// %generic.input = addrspacecast float addrspace(3)* %input to float*
-// loop:
-// %y = phi [ %generic.input, %y2 ]
-// %y2 = getelementptr %y, 1
-// %v = load %y2
-// br ..., label %loop, ...
-// proving %y specific requires proving both %generic.input and %y2 specific,
-// but proving %y2 specific circles back to %y. To address this complication,
-// the data flow analysis operates on a lattice:
-// uninitialized > specific address spaces > generic.
-// All address expressions (our implementation only considers phi, bitcast,
-// addrspacecast, and getelementptr) start with the uninitialized address space.
-// The monotone transfer function moves the address space of a pointer down a
-// lattice path from uninitialized to specific and then to generic. A join
-// operation of two different specific address spaces pushes the expression down
-// to the generic address space. The analysis completes once it reaches a fixed
-// point.
-//
-// Second, IR rewriting in Step 2 also needs to be circular. For example,
-// converting %y to addrspace(3) requires the compiler to know the converted
-// %y2, but converting %y2 needs the converted %y. To address this complication,
-// we break these cycles using "undef" placeholders. When converting an
-// instruction `I` to a new address space, if its operand `Op` is not converted
-// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
-// For instance, our algorithm first converts %y to
-// %y' = phi float addrspace(3)* [ %input, undef ]
-// Then, it converts %y2 to
-// %y2' = getelementptr %y', 1
-// Finally, it fixes the undef in %y' so that
-// %y' = phi float addrspace(3)* [ %input, %y2' ]
-//
-//===----------------------------------------------------------------------===//
-
+//===- InferAddressSpace.cpp - --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (so-called the generic address space) for other instructions to use.
+//
+// For example, the Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to its users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As the result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
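The lattice sketched in the header comment (uninitialized > specific address spaces > generic) comes down to a small join operation. Here is a stand-alone illustration of that join; it reuses the pass's encoding of "uninitialized" as the largest unsigned value and, purely for the sketch, takes the flat space to be 0.

#include <cassert>
#include <limits>

constexpr unsigned FlatAS = 0;   // flat/generic space, assumed 0 for illustration
constexpr unsigned UninitAS = std::numeric_limits<unsigned>::max();

// Join of the inference lattice: uninitialized is the identity, equal
// specific spaces stay put, and any disagreement falls down to flat.
unsigned joinAS(unsigned A, unsigned B) {
  if (A == UninitAS) return B;
  if (B == UninitAS) return A;
  return A == B ? A : FlatAS;
}

int main() {
  assert(joinAS(UninitAS, 3) == 3);   // first evidence wins
  assert(joinAS(3, 3) == 3);          // agreement keeps the specific space
  assert(joinAS(3, 1) == FlatAS);     // disagreement pushes down to generic
}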
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Operator.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <cassert>
-#include <iterator>
-#include <limits>
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "infer-address-spaces"
-
-using namespace llvm;
-
-static cl::opt<bool> AssumeDefaultIsFlatAddressSpace(
- "assume-default-is-flat-addrspace", cl::init(false), cl::ReallyHidden,
- cl::desc("The default address space is assumed as the flat address space. "
- "This is mainly for test purpose."));
-
-static const unsigned UninitializedAddressSpace =
- std::numeric_limits<unsigned>::max();
-
-namespace {
-
-using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
-using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>;
-
-class InferAddressSpaces : public FunctionPass {
- unsigned FlatAddrSpace = 0;
-
-public:
- static char ID;
-
- InferAddressSpaces() :
- FunctionPass(ID), FlatAddrSpace(UninitializedAddressSpace) {}
- InferAddressSpaces(unsigned AS) : FunctionPass(ID), FlatAddrSpace(AS) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "infer-address-spaces"
+
+using namespace llvm;
+
+static cl::opt<bool> AssumeDefaultIsFlatAddressSpace(
+ "assume-default-is-flat-addrspace", cl::init(false), cl::ReallyHidden,
+ cl::desc("The default address space is assumed as the flat address space. "
+ "This is mainly for test purpose."));
+
+static const unsigned UninitializedAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+namespace {
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>;
+
+class InferAddressSpaces : public FunctionPass {
+ unsigned FlatAddrSpace = 0;
+
+public:
+ static char ID;
+
+ InferAddressSpaces() :
+ FunctionPass(ID), FlatAddrSpace(UninitializedAddressSpace) {}
+ InferAddressSpaces(unsigned AS) : FunctionPass(ID), FlatAddrSpace(AS) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
};
-
+
class InferAddressSpacesImpl {
const TargetTransformInfo *TTI = nullptr;
const DataLayout *DL = nullptr;
@@ -174,400 +174,400 @@ class InferAddressSpacesImpl {
/// possible.
unsigned FlatAddrSpace = 0;
- // Returns the new address space of V if updated; otherwise, returns None.
- Optional<unsigned>
- updateAddressSpace(const Value &V,
- const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
-
- // Tries to infer the specific address space of each address expression in
- // Postorder.
- void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) const;
-
- bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
-
- Value *cloneInstructionWithNewAddressSpace(
- Instruction *I, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const;
-
- // Changes the flat address expressions in function F to point to specific
- // address spaces if InferredAddrSpace says so. Postorder is the postorder of
- // all flat expressions in the use-def graph of function F.
- bool rewriteWithNewAddressSpaces(
- const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
-
- void appendsFlatAddressExpressionToPostorderStack(
- Value *V, PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const;
-
- bool rewriteIntrinsicOperands(IntrinsicInst *II,
- Value *OldV, Value *NewV) const;
- void collectRewritableIntrinsicOperands(IntrinsicInst *II,
- PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const;
-
- std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
-
- Value *cloneValueWithNewAddressSpace(
- Value *V, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const;
- unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
+ Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+
+ // Changes the flat address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all flat expressions in the use-def graph of function F.
+ bool rewriteWithNewAddressSpaces(
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(IntrinsicInst *II,
+ PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
public:
InferAddressSpacesImpl(const TargetTransformInfo *TTI, unsigned FlatAddrSpace)
: TTI(TTI), FlatAddrSpace(FlatAddrSpace) {}
bool run(Function &F);
-};
-
-} // end anonymous namespace
-
-char InferAddressSpaces::ID = 0;
-
-namespace llvm {
-
-void initializeInferAddressSpacesPass(PassRegistry &);
-
-} // end namespace llvm
-
-INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
- false, false)
-
-// Check whether that's a no-op pointer bitcast using a pair of
-// `ptrtoint`/`inttoptr` due to the missing no-op pointer bitcast over
-// different address spaces.
-static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL,
- const TargetTransformInfo *TTI) {
- assert(I2P->getOpcode() == Instruction::IntToPtr);
- auto *P2I = dyn_cast<Operator>(I2P->getOperand(0));
- if (!P2I || P2I->getOpcode() != Instruction::PtrToInt)
- return false;
- // Check it's really safe to treat that pair of `ptrtoint`/`inttoptr` as a
- // no-op cast. Besides checking both of them are no-op casts, as the
- // reinterpreted pointer may be used in other pointer arithmetic, we also
- // need to double-check that through the target-specific hook. That ensures
- // the underlying target also agrees that's a no-op address space cast and
- // pointer bits are preserved.
- // The current IR spec doesn't have clear rules on address space casts,
- // especially a clear definition for pointer bits in non-default address
- // spaces. It would be undefined if that pointer is dereferenced after an
- // invalid reinterpret cast. Also, due to the unclearness for the meaning of
- // bits in non-default address spaces in the current spec, the pointer
- // arithmetic may also be undefined after invalid pointer reinterpret cast.
- // However, as we confirm through the target hooks that it's a no-op
- // addrspacecast, it doesn't matter since the bits should be the same.
- return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()),
- I2P->getOperand(0)->getType(), I2P->getType(),
- DL) &&
- CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()),
- P2I->getOperand(0)->getType(), P2I->getType(),
- DL) &&
- TTI->isNoopAddrSpaceCast(
- P2I->getOperand(0)->getType()->getPointerAddressSpace(),
- I2P->getType()->getPointerAddressSpace());
-}
-
-// Returns true if V is an address expression.
-// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
-// getelementptr operators.
-static bool isAddressExpression(const Value &V, const DataLayout &DL,
- const TargetTransformInfo *TTI) {
- const Operator *Op = dyn_cast<Operator>(&V);
- if (!Op)
- return false;
-
- switch (Op->getOpcode()) {
- case Instruction::PHI:
- assert(Op->getType()->isPointerTy());
- return true;
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::GetElementPtr:
- return true;
- case Instruction::Select:
- return Op->getType()->isPointerTy();
- case Instruction::Call: {
- const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
- return II && II->getIntrinsicID() == Intrinsic::ptrmask;
- }
- case Instruction::IntToPtr:
- return isNoopPtrIntCastPair(Op, DL, TTI);
- default:
+};
+
+} // end anonymous namespace
+
+char InferAddressSpaces::ID = 0;
+
+namespace llvm {
+
+void initializeInferAddressSpacesPass(PassRegistry &);
+
+} // end namespace llvm
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+
+// Check whether that's a no-op pointer bitcast using a pair of
+// `ptrtoint`/`inttoptr` due to the missing no-op pointer bitcast over
+// different address spaces.
+static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ assert(I2P->getOpcode() == Instruction::IntToPtr);
+ auto *P2I = dyn_cast<Operator>(I2P->getOperand(0));
+ if (!P2I || P2I->getOpcode() != Instruction::PtrToInt)
+ return false;
+ // Check it's really safe to treat that pair of `ptrtoint`/`inttoptr` as a
+ // no-op cast. Besides checking both of them are no-op casts, as the
+ // reinterpreted pointer may be used in other pointer arithmetic, we also
+ // need to double-check that through the target-specific hook. That ensures
+ // the underlying target also agrees that's a no-op address space cast and
+ // pointer bits are preserved.
+ // The current IR spec doesn't have clear rules on address space casts,
+ // especially a clear definition for pointer bits in non-default address
+ // spaces. It would be undefined if that pointer is dereferenced after an
+ // invalid reinterpret cast. Also, due to the unclearness for the meaning of
+ // bits in non-default address spaces in the current spec, the pointer
+ // arithmetic may also be undefined after invalid pointer reinterpret cast.
+ // However, as we confirm through the target hooks that it's a no-op
+ // addrspacecast, it doesn't matter since the bits should be the same.
+ return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()),
+ I2P->getOperand(0)->getType(), I2P->getType(),
+ DL) &&
+ CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()),
+ P2I->getOperand(0)->getType(), P2I->getType(),
+ DL) &&
+ TTI->isNoopAddrSpaceCast(
+ P2I->getOperand(0)->getType()->getPointerAddressSpace(),
+ I2P->getType()->getPointerAddressSpace());
+}
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ const Operator *Op = dyn_cast<Operator>(&V);
+ if (!Op)
+ return false;
+
+ switch (Op->getOpcode()) {
+ case Instruction::PHI:
+ assert(Op->getType()->isPointerTy());
+ return true;
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return true;
+ case Instruction::Select:
+ return Op->getType()->isPointerTy();
+ case Instruction::Call: {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
+ return II && II->getIntrinsicID() == Intrinsic::ptrmask;
+ }
+ case Instruction::IntToPtr:
+ return isNoopPtrIntCastPair(Op, DL, TTI);
+ default:
// That value is an address expression if it has an assumed address space.
return TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace;
- }
-}
-
-// Returns the pointer operands of V.
-//
-// Precondition: V is an address expression.
-static SmallVector<Value *, 2>
-getPointerOperands(const Value &V, const DataLayout &DL,
- const TargetTransformInfo *TTI) {
- const Operator &Op = cast<Operator>(V);
- switch (Op.getOpcode()) {
- case Instruction::PHI: {
- auto IncomingValues = cast<PHINode>(Op).incoming_values();
- return SmallVector<Value *, 2>(IncomingValues.begin(),
- IncomingValues.end());
- }
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::GetElementPtr:
- return {Op.getOperand(0)};
- case Instruction::Select:
- return {Op.getOperand(1), Op.getOperand(2)};
- case Instruction::Call: {
- const IntrinsicInst &II = cast<IntrinsicInst>(Op);
- assert(II.getIntrinsicID() == Intrinsic::ptrmask &&
- "unexpected intrinsic call");
- return {II.getArgOperand(0)};
- }
- case Instruction::IntToPtr: {
- assert(isNoopPtrIntCastPair(&Op, DL, TTI));
- auto *P2I = cast<Operator>(Op.getOperand(0));
- return {P2I->getOperand(0)};
- }
- default:
- llvm_unreachable("Unexpected instruction type.");
- }
-}
-
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2>
+getPointerOperands(const Value &V, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ const Operator &Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
+ case Instruction::Call: {
+ const IntrinsicInst &II = cast<IntrinsicInst>(Op);
+ assert(II.getIntrinsicID() == Intrinsic::ptrmask &&
+ "unexpected intrinsic call");
+ return {II.getArgOperand(0)};
+ }
+ case Instruction::IntToPtr: {
+ assert(isNoopPtrIntCastPair(&Op, DL, TTI));
+ auto *P2I = cast<Operator>(Op.getOperand(0));
+ return {P2I->getOperand(0)};
+ }
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II,
Value *OldV,
Value *NewV) const {
- Module *M = II->getParent()->getParent()->getParent();
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::objectsize: {
- Type *DestTy = II->getType();
- Type *SrcTy = NewV->getType();
- Function *NewDecl =
- Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
- II->setArgOperand(0, NewV);
- II->setCalledFunction(NewDecl);
- return true;
- }
- case Intrinsic::ptrmask:
- // This is handled as an address expression, not as a use memory operation.
- return false;
- default: {
- Value *Rewrite = TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
- if (!Rewrite)
- return false;
- if (Rewrite != II)
- II->replaceAllUsesWith(Rewrite);
- return true;
- }
- }
-}
-
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ case Intrinsic::ptrmask:
+ // This is handled as an address expression, not as a use memory operation.
+ return false;
+ default: {
+ Value *Rewrite = TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
+ if (!Rewrite)
+ return false;
+ if (Rewrite != II)
+ II->replaceAllUsesWith(Rewrite);
+ return true;
+ }
+ }
+}
+
void InferAddressSpacesImpl::collectRewritableIntrinsicOperands(
- IntrinsicInst *II, PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const {
- auto IID = II->getIntrinsicID();
- switch (IID) {
- case Intrinsic::ptrmask:
- case Intrinsic::objectsize:
- appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
- PostorderStack, Visited);
- break;
- default:
- SmallVector<int, 2> OpIndexes;
- if (TTI->collectFlatAddressOperands(OpIndexes, IID)) {
- for (int Idx : OpIndexes) {
- appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx),
- PostorderStack, Visited);
- }
- }
- break;
- }
-}
-
-// If V is an unvisited flat address expression, appends V to PostorderStack
-// and marks it as visited.
+ IntrinsicInst *II, PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ auto IID = II->getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::ptrmask:
+ case Intrinsic::objectsize:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ SmallVector<int, 2> OpIndexes;
+ if (TTI->collectFlatAddressOperands(OpIndexes, IID)) {
+ for (int Idx : OpIndexes) {
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx),
+ PostorderStack, Visited);
+ }
+ }
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited.
void InferAddressSpacesImpl::appendsFlatAddressExpressionToPostorderStack(
- Value *V, PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const {
- assert(V->getType()->isPointerTy());
-
- // Generic addressing expressions may be hidden in nested constant
- // expressions.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
- // TODO: Look in non-address parts, like icmp operands.
- if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
- PostorderStack.emplace_back(CE, false);
-
- return;
- }
-
+ Value *V, PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ assert(V->getType()->isPointerTy());
+
+ // Generic addressing expressions may be hidden in nested constant
+ // expressions.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ // TODO: Look in non-address parts, like icmp operands.
+ if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+
+ return;
+ }
+
if (V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
isAddressExpression(*V, *DL, TTI)) {
- if (Visited.insert(V).second) {
- PostorderStack.emplace_back(V, false);
-
- Operator *Op = cast<Operator>(V);
- for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
- if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
- PostorderStack.emplace_back(CE, false);
- }
- }
- }
- }
-}
-
-// Returns all flat address expressions in function F. The elements are
-// ordered in postorder.
-std::vector<WeakTrackingVH>
+ if (Visited.insert(V).second) {
+ PostorderStack.emplace_back(V, false);
+
+ Operator *Op = cast<Operator>(V);
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
+ if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+ }
+ }
+ }
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<WeakTrackingVH>
InferAddressSpacesImpl::collectFlatAddressExpressions(Function &F) const {
- // This function implements a non-recursive postorder traversal of a partial
- // use-def graph of function F.
- PostorderStackTy PostorderStack;
- // The set of visited expressions.
- DenseSet<Value *> Visited;
-
- auto PushPtrOperand = [&](Value *Ptr) {
- appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
- Visited);
- };
-
-  // Look at operations that may be interesting to accelerate by moving them to
-  // a known address space. We mainly aim at loads and stores, but pure
-  // addressing calculations may also be faster.
- for (Instruction &I : instructions(F)) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- if (!GEP->getType()->isVectorTy())
- PushPtrOperand(GEP->getPointerOperand());
- } else if (auto *LI = dyn_cast<LoadInst>(&I))
- PushPtrOperand(LI->getPointerOperand());
- else if (auto *SI = dyn_cast<StoreInst>(&I))
- PushPtrOperand(SI->getPointerOperand());
- else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
- PushPtrOperand(RMW->getPointerOperand());
- else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
- PushPtrOperand(CmpX->getPointerOperand());
- else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
- // For memset/memcpy/memmove, any pointer operand can be replaced.
- PushPtrOperand(MI->getRawDest());
-
- // Handle 2nd operand for memcpy/memmove.
- if (auto *MTI = dyn_cast<MemTransferInst>(MI))
- PushPtrOperand(MTI->getRawSource());
- } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
- collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
- else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
- // FIXME: Handle vectors of pointers
- if (Cmp->getOperand(0)->getType()->isPointerTy()) {
- PushPtrOperand(Cmp->getOperand(0));
- PushPtrOperand(Cmp->getOperand(1));
- }
- } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (!ASC->getType()->isVectorTy())
- PushPtrOperand(ASC->getPointerOperand());
- } else if (auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
- if (isNoopPtrIntCastPair(cast<Operator>(I2P), *DL, TTI))
- PushPtrOperand(
- cast<PtrToIntInst>(I2P->getOperand(0))->getPointerOperand());
- }
- }
-
- std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
- while (!PostorderStack.empty()) {
- Value *TopVal = PostorderStack.back().getPointer();
- // If the operands of the expression on the top are already explored,
- // adds that expression to the resultant postorder.
- if (PostorderStack.back().getInt()) {
- if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
- Postorder.push_back(TopVal);
- PostorderStack.pop_back();
- continue;
- }
- // Otherwise, adds its operands to the stack and explores them.
- PostorderStack.back().setInt(true);
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ PostorderStackTy PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
+ Visited);
+ };
+
+  // Look at operations that may be interesting to accelerate by moving them to
+  // a known address space. We mainly aim at loads and stores, but pure
+  // addressing calculations may also be faster.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (!GEP->getType()->isVectorTy())
+ PushPtrOperand(GEP->getPointerOperand());
+ } else if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
+ } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (!ASC->getType()->isVectorTy())
+ PushPtrOperand(ASC->getPointerOperand());
+ } else if (auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
+ if (isNoopPtrIntCastPair(cast<Operator>(I2P), *DL, TTI))
+ PushPtrOperand(
+ cast<PtrToIntInst>(I2P->getOperand(0))->getPointerOperand());
+ }
+ }
+
+ std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ Value *TopVal = PostorderStack.back().getPointer();
+ // If the operands of the expression on the top are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().getInt()) {
+ if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ Postorder.push_back(TopVal);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().setInt(true);
// Skip values with an assumed address space.
if (TTI->getAssumedAddrSpace(TopVal) == UninitializedAddressSpace) {
for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
Visited);
}
- }
- }
- return Postorder;
-}
-
-// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
-// of OperandUse.get() in the new address space. If the clone is not ready yet,
-// returns an undef in the new address space as a placeholder.
-static Value *operandWithNewAddressSpaceOrCreateUndef(
- const Use &OperandUse, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) {
- Value *Operand = OperandUse.get();
-
- Type *NewPtrTy =
- Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (Constant *C = dyn_cast<Constant>(Operand))
- return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
-
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
- return NewOperand;
-
- UndefUsesToFix->push_back(&OperandUse);
- return UndefValue::get(NewPtrTy);
-}
-
-// Returns a clone of `I` with its operands converted to those specified in
-// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
-// operand whose address space needs to be modified might not exist in
-// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
-// adds that operand use to UndefUsesToFix so that caller can fix them later.
-//
-// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
-// from a pointer whose type already matches. Therefore, this function returns a
-// Value* instead of an Instruction*.
-//
-// This may also return nullptr in the case the instruction could not be
-// rewritten.
+ }
+ }
+ return Postorder;
+}
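The two-phase stack above (push a value unexplored, flip its flag, emit it on the second visit) is a standard way to obtain a postorder without recursion. Below is a minimal standalone sketch of the same idea over a toy successor list, using plain integers instead of LLVM values; all names and numbers are illustrative and not part of the patched sources.

#include <cstdio>
#include <unordered_set>
#include <utility>
#include <vector>

// Iterative postorder over a DAG, mirroring the PostorderStack used above:
// each stack entry carries an "operands already pushed" flag instead of
// relying on recursion.
static std::vector<int> postorder(const std::vector<std::vector<int>> &Succ,
                                  const std::vector<int> &Roots) {
  std::vector<std::pair<int, bool>> Stack; // (node, operands explored?)
  std::unordered_set<int> Visited;
  std::vector<int> Order;

  for (int R : Roots)
    if (Visited.insert(R).second)
      Stack.push_back({R, false});

  while (!Stack.empty()) {
    if (Stack.back().second) {      // operands already explored: emit node
      Order.push_back(Stack.back().first);
      Stack.pop_back();
      continue;
    }
    Stack.back().second = true;     // the next visit of this entry emits it
    int N = Stack.back().first;     // copy before push_back may reallocate
    for (int S : Succ[N])
      if (Visited.insert(S).second)
        Stack.push_back({S, false});
  }
  return Order;
}

int main() {
  // 0 -> {1, 2}, 1 -> {2}, 2 -> {}
  std::vector<std::vector<int>> Succ = {{1, 2}, {2}, {}};
  for (int N : postorder(Succ, {0}))
    std::printf("%d ", N);          // prints: 2 1 0
  std::printf("\n");
}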
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+
+ Type *NewPtrTy =
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (Constant *C = dyn_cast<Constant>(Operand))
+ return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(NewPtrTy);
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+//
+// This may also return nullptr in the case the instruction could not be
+// rewritten.
Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
- Instruction *I, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const {
- Type *NewPtrType =
- I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (I->getOpcode() == Instruction::AddrSpaceCast) {
- Value *Src = I->getOperand(0);
- // Because `I` is flat, the source address space must be specific.
- // Therefore, the inferred address space must be the source space, according
- // to our algorithm.
- assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
- if (Src->getType() != NewPtrType)
- return new BitCastInst(Src, NewPtrType);
- return Src;
- }
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- // Technically the intrinsic ID is a pointer typed argument, so specially
- // handle calls early.
- assert(II->getIntrinsicID() == Intrinsic::ptrmask);
- Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef(
- II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
- UndefUsesToFix);
- Value *Rewrite =
- TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
- if (Rewrite) {
- assert(Rewrite != II && "cannot modify this pointer operation in place");
- return Rewrite;
- }
-
- return nullptr;
- }
-
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // Technically the intrinsic ID is a pointer typed argument, so specially
+ // handle calls early.
+ assert(II->getIntrinsicID() == Intrinsic::ptrmask);
+ Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef(
+ II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
+ UndefUsesToFix);
+ Value *Rewrite =
+ TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
+ if (Rewrite) {
+ assert(Rewrite != II && "cannot modify this pointer operation in place");
+ return Rewrite;
+ }
+
+ return nullptr;
+ }
+
unsigned AS = TTI->getAssumedAddrSpace(I);
if (AS != UninitializedAddressSpace) {
// For the assumed address space, insert an `addrspacecast` to make that
@@ -578,295 +578,295 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
return NewI;
}
- // Computes the converted pointer operands.
- SmallVector<Value *, 4> NewPointerOperands;
- for (const Use &OperandUse : I->operands()) {
- if (!OperandUse.get()->getType()->isPointerTy())
- NewPointerOperands.push_back(nullptr);
- else
- NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
- OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
- }
-
- switch (I->getOpcode()) {
- case Instruction::BitCast:
- return new BitCastInst(NewPointerOperands[0], NewPtrType);
- case Instruction::PHI: {
- assert(I->getType()->isPointerTy());
- PHINode *PHI = cast<PHINode>(I);
- PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
- for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
- unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
- NewPHI->addIncoming(NewPointerOperands[OperandNo],
- PHI->getIncomingBlock(Index));
- }
- return NewPHI;
- }
- case Instruction::GetElementPtr: {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
- GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
- GEP->getSourceElementType(), NewPointerOperands[0],
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
SmallVector<Value *, 4>(GEP->indices()));
- NewGEP->setIsInBounds(GEP->isInBounds());
- return NewGEP;
- }
- case Instruction::Select:
- assert(I->getType()->isPointerTy());
- return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
- NewPointerOperands[2], "", nullptr, I);
- case Instruction::IntToPtr: {
- assert(isNoopPtrIntCastPair(cast<Operator>(I), *DL, TTI));
- Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0);
- assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
- if (Src->getType() != NewPtrType)
- return new BitCastInst(Src, NewPtrType);
- return Src;
- }
- default:
- llvm_unreachable("Unexpected opcode");
- }
-}
-
-// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
-// constant expression `CE` with its operands replaced as specified in
-// ValueWithNewAddrSpace.
-static Value *cloneConstantExprWithNewAddressSpace(
- ConstantExpr *CE, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace, const DataLayout *DL,
- const TargetTransformInfo *TTI) {
- Type *TargetType =
- CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (CE->getOpcode() == Instruction::AddrSpaceCast) {
- // Because CE is flat, the source address space must be specific.
- // Therefore, the inferred address space must be the source space according
- // to our algorithm.
- assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
- NewAddrSpace);
- return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
- }
-
- if (CE->getOpcode() == Instruction::BitCast) {
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
- return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
- return ConstantExpr::getAddrSpaceCast(CE, TargetType);
- }
-
- if (CE->getOpcode() == Instruction::Select) {
- Constant *Src0 = CE->getOperand(1);
- Constant *Src1 = CE->getOperand(2);
- if (Src0->getType()->getPointerAddressSpace() ==
- Src1->getType()->getPointerAddressSpace()) {
-
- return ConstantExpr::getSelect(
- CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
- ConstantExpr::getAddrSpaceCast(Src1, TargetType));
- }
- }
-
- if (CE->getOpcode() == Instruction::IntToPtr) {
- assert(isNoopPtrIntCastPair(cast<Operator>(CE), *DL, TTI));
- Constant *Src = cast<ConstantExpr>(CE->getOperand(0))->getOperand(0);
- assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
- return ConstantExpr::getBitCast(Src, TargetType);
- }
-
- // Computes the operands of the new constant expression.
- bool IsNew = false;
- SmallVector<Constant *, 4> NewOperands;
- for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
- Constant *Operand = CE->getOperand(Index);
- // If the address space of `Operand` needs to be modified, the new operand
- // with the new address space should already be in ValueWithNewAddrSpace
- // because (1) the constant expressions we consider (i.e. addrspacecast,
- // bitcast, and getelementptr) do not incur cycles in the data flow graph
- // and (2) this function is called on constant expressions in postorder.
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
- IsNew = true;
- NewOperands.push_back(cast<Constant>(NewOperand));
- continue;
- }
- if (auto CExpr = dyn_cast<ConstantExpr>(Operand))
- if (Value *NewOperand = cloneConstantExprWithNewAddressSpace(
- CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) {
- IsNew = true;
- NewOperands.push_back(cast<Constant>(NewOperand));
- continue;
- }
- // Otherwise, reuses the old operand.
- NewOperands.push_back(Operand);
- }
-
- // If !IsNew, we will replace the Value with itself. However, replaced values
-  // are assumed to be wrapped in an addrspacecast later, so drop it now.
- if (!IsNew)
- return nullptr;
-
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- // Needs to specify the source type while constructing a getelementptr
- // constant expression.
- return CE->getWithOperands(
- NewOperands, TargetType, /*OnlyIfReduced=*/false,
- NewOperands[0]->getType()->getPointerElementType());
- }
-
- return CE->getWithOperands(NewOperands, TargetType);
-}
-
-// Returns a clone of the value `V`, with its operands replaced as specified in
-// ValueWithNewAddrSpace. This function is called on every flat address
-// expression whose address space needs to be modified, in postorder.
-//
-// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ case Instruction::Select:
+ assert(I->getType()->isPointerTy());
+ return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+ NewPointerOperands[2], "", nullptr, I);
+ case Instruction::IntToPtr: {
+ assert(isNoopPtrIntCastPair(cast<Operator>(I), *DL, TTI));
+ Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0);
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
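Because PHIs can make the use-def graph cyclic, an operand's clone may not exist yet when its user is cloned; the code above parks an undef there and records the use in UndefUsesToFix for a later patch-up. The same clone-with-placeholder-then-fix pattern is sketched standalone below on a toy node graph; the types and names are illustrative only, not LLVM API.

#include <cstddef>
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Node {
  int Id;
  std::vector<Node *> Ops; // may form cycles, like PHIs in a loop
};

// Clone every node in Nodes. Operand edges whose clone is not ready yet get a
// null placeholder and are recorded for a second fix-up pass, in the same
// spirit as UndefUsesToFix above. Assumes every operand is itself in Nodes.
static std::vector<Node *> cloneAll(const std::vector<Node *> &Nodes) {
  std::unordered_map<Node *, Node *> CloneOf;
  struct Fixup { Node *Clone; std::size_t OpIdx; Node *OldOp; };
  std::vector<Fixup> Fixups;

  for (Node *N : Nodes) {
    Node *C = new Node{N->Id + 100, {}};
    for (std::size_t I = 0; I < N->Ops.size(); ++I) {
      auto It = CloneOf.find(N->Ops[I]);
      if (It != CloneOf.end()) {
        C->Ops.push_back(It->second);      // the clone already exists
      } else {
        C->Ops.push_back(nullptr);         // placeholder, fixed up below
        Fixups.push_back({C, I, N->Ops[I]});
      }
    }
    CloneOf[N] = C;
  }

  // Second pass: every placeholder can now be resolved.
  for (const Fixup &F : Fixups)
    F.Clone->Ops[F.OpIdx] = CloneOf.at(F.OldOp);

  std::vector<Node *> Clones;
  for (Node *N : Nodes)
    Clones.push_back(CloneOf.at(N));
  return Clones;
}

int main() {
  // Two nodes referencing each other, like two PHIs in a loop header.
  Node A{0, {}}, B{1, {}};
  A.Ops.push_back(&B);
  B.Ops.push_back(&A);
  std::vector<Node *> Clones = cloneAll({&A, &B});
  std::printf("clone of 0 points at %d\n", Clones[0]->Ops[0]->Id); // 101
  std::printf("clone of 1 points at %d\n", Clones[1]->Ops[0]->Id); // 100
  // The clones are intentionally leaked; this is a throwaway sketch.
}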
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace, const DataLayout *DL,
+ const TargetTransformInfo *TTI) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::BitCast) {
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+ return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+ return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::Select) {
+ Constant *Src0 = CE->getOperand(1);
+ Constant *Src1 = CE->getOperand(2);
+ if (Src0->getType()->getPointerAddressSpace() ==
+ Src1->getType()->getPointerAddressSpace()) {
+
+ return ConstantExpr::getSelect(
+ CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+ ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+ }
+ }
+
+ if (CE->getOpcode() == Instruction::IntToPtr) {
+ assert(isNoopPtrIntCastPair(cast<Operator>(CE), *DL, TTI));
+ Constant *Src = cast<ConstantExpr>(CE->getOperand(0))->getOperand(0);
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ return ConstantExpr::getBitCast(Src, TargetType);
+ }
+
+ // Computes the operands of the new constant expression.
+ bool IsNew = false;
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ IsNew = true;
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ continue;
+ }
+ if (auto CExpr = dyn_cast<ConstantExpr>(Operand))
+ if (Value *NewOperand = cloneConstantExprWithNewAddressSpace(
+ CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) {
+ IsNew = true;
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ continue;
+ }
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+
+ // If !IsNew, we will replace the Value with itself. However, replaced values
+  // are assumed to be wrapped in an addrspacecast later, so drop it now.
+ if (!IsNew)
+ return nullptr;
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every flat address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace(
Value *V, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
- // All values in Postorder are flat address expressions.
+ // All values in Postorder are flat address expressions.
assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
isAddressExpression(*V, *DL, TTI));
-
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- Value *NewV = cloneInstructionWithNewAddressSpace(
- I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
- if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) {
- if (NewI->getParent() == nullptr) {
- NewI->insertBefore(I);
- NewI->takeName(I);
- }
- }
- return NewV;
- }
-
- return cloneConstantExprWithNewAddressSpace(
- cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace, DL, TTI);
-}
-
-// Defines the join operation on the address space lattice (see the file header
-// comments).
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace, DL, TTI);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
unsigned InferAddressSpacesImpl::joinAddressSpaces(unsigned AS1,
unsigned AS2) const {
- if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
- return FlatAddrSpace;
-
- if (AS1 == UninitializedAddressSpace)
- return AS2;
- if (AS2 == UninitializedAddressSpace)
- return AS1;
-
- // The join of two different specific address spaces is flat.
- return (AS1 == AS2) ? AS1 : FlatAddrSpace;
-}
-
+ if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
+ return FlatAddrSpace;
+
+ if (AS1 == UninitializedAddressSpace)
+ return AS2;
+ if (AS2 == UninitializedAddressSpace)
+ return AS1;
+
+ // The join of two different specific address spaces is flat.
+ return (AS1 == AS2) ? AS1 : FlatAddrSpace;
+}
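The join above is a meet on a flat lattice: the uninitialized state acts as the identity, the flat space absorbs everything, and two distinct specific spaces collapse to flat. A self-contained restatement with a few example joins follows; the concrete values 0, 3 and 5 are made up for illustration, with 0 merely standing in for the flat space.

#include <cassert>
#include <limits>

// Lattice: Uninitialized (top) > each specific address space > Flat (bottom).
constexpr unsigned Uninitialized = std::numeric_limits<unsigned>::max();
constexpr unsigned Flat = 0; // 0 merely stands in for the flat space here

static unsigned join(unsigned A, unsigned B) {
  if (A == Flat || B == Flat)
    return Flat;              // join(flat, *) = flat
  if (A == Uninitialized)
    return B;                 // top is the identity element
  if (B == Uninitialized)
    return A;
  return A == B ? A : Flat;   // two different specific spaces collapse to flat
}

int main() {
  assert(join(Uninitialized, 3) == 3); // the first specific space seen wins
  assert(join(3, 3) == 3);             // agreement stays specific
  assert(join(3, 5) == Flat);          // disagreement falls to flat
  assert(join(Flat, 3) == Flat);       // flat absorbs everything
  return 0;
}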
+
bool InferAddressSpacesImpl::run(Function &F) {
- DL = &F.getParent()->getDataLayout();
-
- if (AssumeDefaultIsFlatAddressSpace)
- FlatAddrSpace = 0;
-
- if (FlatAddrSpace == UninitializedAddressSpace) {
- FlatAddrSpace = TTI->getFlatAddressSpace();
- if (FlatAddrSpace == UninitializedAddressSpace)
- return false;
- }
-
- // Collects all flat address expressions in postorder.
- std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F);
-
- // Runs a data-flow analysis to refine the address spaces of every expression
- // in Postorder.
- ValueToAddrSpaceMapTy InferredAddrSpace;
- inferAddressSpaces(Postorder, &InferredAddrSpace);
-
-  // Changes the address spaces of the flat address expressions that are inferred
- // to point to a specific address space.
- return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F);
-}
-
-// Constants need to be tracked through RAUW to handle cases with nested
-// constant expressions, so wrap values in WeakTrackingVH.
+ DL = &F.getParent()->getDataLayout();
+
+ if (AssumeDefaultIsFlatAddressSpace)
+ FlatAddrSpace = 0;
+
+ if (FlatAddrSpace == UninitializedAddressSpace) {
+ FlatAddrSpace = TTI->getFlatAddressSpace();
+ if (FlatAddrSpace == UninitializedAddressSpace)
+ return false;
+ }
+
+ // Collects all flat address expressions in postorder.
+ std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+  // Changes the address spaces of the flat address expressions that are inferred
+ // to point to a specific address space.
+ return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F);
+}
+
+// Constants need to be tracked through RAUW to handle cases with nested
+// constant expressions, so wrap values in WeakTrackingVH.
void InferAddressSpacesImpl::inferAddressSpaces(
- ArrayRef<WeakTrackingVH> Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) const {
- SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
- // Initially, all expressions are in the uninitialized address space.
- for (Value *V : Postorder)
- (*InferredAddrSpace)[V] = UninitializedAddressSpace;
-
- while (!Worklist.empty()) {
- Value *V = Worklist.pop_back_val();
-
- // Tries to update the address space of the stack top according to the
- // address spaces of its operands.
- LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
- Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
- if (!NewAS.hasValue())
- continue;
-    // If any updates are made, adds its users to the worklist because
-    // their address spaces may also need to be updated.
- LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
- (*InferredAddrSpace)[V] = NewAS.getValue();
-
- for (Value *User : V->users()) {
- // Skip if User is already in the worklist.
- if (Worklist.count(User))
- continue;
-
- auto Pos = InferredAddrSpace->find(User);
- // Our algorithm only updates the address spaces of flat address
- // expressions, which are those in InferredAddrSpace.
- if (Pos == InferredAddrSpace->end())
- continue;
-
- // Function updateAddressSpace moves the address space down a lattice
- // path. Therefore, nothing to do if User is already inferred as flat (the
- // bottom element in the lattice).
- if (Pos->second == FlatAddrSpace)
- continue;
-
- Worklist.insert(User);
- }
- }
-}
-
+ ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = UninitializedAddressSpace;
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+
+ // Tries to update the address space of the stack top according to the
+ // address spaces of its operands.
+ LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+    // If any updates are made, adds its users to the worklist because
+    // their address spaces may also need to be updated.
+ LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of flat address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as flat (the
+ // bottom element in the lattice).
+ if (Pos->second == FlatAddrSpace)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
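The loop above is a monotone fixed-point computation: updateAddressSpace only ever moves a value down the lattice, so every value is re-queued a bounded number of times and the worklist drains. Below is a compact standalone sketch of the same propagation over a toy def-use graph, with the fallback to an operand's own pointer-type space omitted for brevity; all names and numbers are illustrative.

#include <cstdio>
#include <deque>
#include <limits>
#include <unordered_map>
#include <vector>

constexpr unsigned Uninit = std::numeric_limits<unsigned>::max();
constexpr unsigned Flat = 0;

static unsigned join(unsigned A, unsigned B) {
  if (A == Flat || B == Flat) return Flat;
  if (A == Uninit) return B;
  if (B == Uninit) return A;
  return A == B ? A : Flat;
}

int main() {
  // Toy def-use graph: nodes 0 and 1 are sources known to be in space 3,
  // node 2 is a "PHI" of them, node 3 is a "GEP" on node 2.
  std::vector<std::vector<int>> Operands = {{}, {}, {0, 1}, {2}};
  std::vector<std::vector<int>> Users    = {{2}, {2}, {3}, {}};
  std::unordered_map<int, unsigned> AS = {{0, 3u}, {1, 3u}, {2, Uninit}, {3, Uninit}};

  std::deque<int> Worklist = {2, 3};
  while (!Worklist.empty()) {
    int N = Worklist.front();
    Worklist.pop_front();
    unsigned New = Uninit;
    for (int Op : Operands[N])
      New = join(New, AS.at(Op));  // join over all pointer operands
    if (New == AS.at(N))
      continue;                    // no change, nothing to propagate
    AS[N] = New;                   // refine, then revisit the users
    for (int U : Users[N])
      Worklist.push_back(U);
  }

  std::printf("space of the PHI (node 2): %u\n", AS[2]); // prints 3
  std::printf("space of the GEP (node 3): %u\n", AS[3]); // prints 3
}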
+
Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
- const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
- assert(InferredAddrSpace.count(&V));
-
- // The new inferred address space equals the join of the address spaces
- // of all its pointer operands.
- unsigned NewAS = UninitializedAddressSpace;
-
- const Operator &Op = cast<Operator>(V);
- if (Op.getOpcode() == Instruction::Select) {
- Value *Src0 = Op.getOperand(1);
- Value *Src1 = Op.getOperand(2);
-
- auto I = InferredAddrSpace.find(Src0);
- unsigned Src0AS = (I != InferredAddrSpace.end()) ?
- I->second : Src0->getType()->getPointerAddressSpace();
-
- auto J = InferredAddrSpace.find(Src1);
- unsigned Src1AS = (J != InferredAddrSpace.end()) ?
- J->second : Src1->getType()->getPointerAddressSpace();
-
- auto *C0 = dyn_cast<Constant>(Src0);
- auto *C1 = dyn_cast<Constant>(Src1);
-
- // If one of the inputs is a constant, we may be able to do a constant
- // addrspacecast of it. Defer inferring the address space until the input
- // address space is known.
- if ((C1 && Src0AS == UninitializedAddressSpace) ||
- (C0 && Src1AS == UninitializedAddressSpace))
- return None;
-
- if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
- NewAS = Src1AS;
- else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
- NewAS = Src0AS;
- else
- NewAS = joinAddressSpaces(Src0AS, Src1AS);
- } else {
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = UninitializedAddressSpace;
+
+ const Operator &Op = cast<Operator>(V);
+ if (Op.getOpcode() == Instruction::Select) {
+ Value *Src0 = Op.getOperand(1);
+ Value *Src1 = Op.getOperand(2);
+
+ auto I = InferredAddrSpace.find(Src0);
+ unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+ I->second : Src0->getType()->getPointerAddressSpace();
+
+ auto J = InferredAddrSpace.find(Src1);
+ unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+ J->second : Src1->getType()->getPointerAddressSpace();
+
+ auto *C0 = dyn_cast<Constant>(Src0);
+ auto *C1 = dyn_cast<Constant>(Src1);
+
+ // If one of the inputs is a constant, we may be able to do a constant
+ // addrspacecast of it. Defer inferring the address space until the input
+ // address space is known.
+ if ((C1 && Src0AS == UninitializedAddressSpace) ||
+ (C0 && Src1AS == UninitializedAddressSpace))
+ return None;
+
+ if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+ NewAS = Src1AS;
+ else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+ NewAS = Src0AS;
+ else
+ NewAS = joinAddressSpaces(Src0AS, Src1AS);
+ } else {
unsigned AS = TTI->getAssumedAddrSpace(&V);
if (AS != UninitializedAddressSpace) {
// Use the assumed address space directly.
@@ -879,313 +879,313 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
I != InferredAddrSpace.end()
? I->second
: PtrOperand->getType()->getPointerAddressSpace();
-
+
// join(flat, *) = flat. So we can break if NewAS is already flat.
NewAS = joinAddressSpaces(NewAS, OperandAS);
if (NewAS == FlatAddrSpace)
break;
}
- }
- }
-
- unsigned OldAS = InferredAddrSpace.lookup(&V);
- assert(OldAS != FlatAddrSpace);
- if (OldAS == NewAS)
- return None;
- return NewAS;
-}
-
-/// \returns true if \p U is the pointer operand of a memory instruction with
-/// a single pointer operand that can have its address space changed by simply
-/// mutating the use to a new value. If the memory instruction is volatile,
-/// return true only if the target allows the memory instruction to be volatile
-/// in the new address space.
-static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI,
- Use &U, unsigned AddrSpace) {
- User *Inst = U.getUser();
- unsigned OpNo = U.getOperandNo();
- bool VolatileIsAllowed = false;
- if (auto *I = dyn_cast<Instruction>(Inst))
- VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace);
-
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return OpNo == LoadInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !LI->isVolatile());
-
- if (auto *SI = dyn_cast<StoreInst>(Inst))
- return OpNo == StoreInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !SI->isVolatile());
-
- if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
- return OpNo == AtomicRMWInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !RMW->isVolatile());
-
- if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst))
- return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !CmpX->isVolatile());
-
- return false;
-}
-
-/// Update memory intrinsic uses that require more complex processing than
-/// simple memory instructions. These require re-mangling and may have multiple
-/// pointer operands.
-static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
- Value *NewV) {
- IRBuilder<> B(MI);
- MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
- MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
- MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
-
- if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
- B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(),
- MaybeAlign(MSI->getDestAlignment()),
- false, // isVolatile
- TBAA, ScopeMD, NoAliasMD);
- } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
- Value *Src = MTI->getRawSource();
- Value *Dest = MTI->getRawDest();
-
- // Be careful in case this is a self-to-self copy.
- if (Src == OldV)
- Src = NewV;
-
- if (Dest == OldV)
- Dest = NewV;
-
- if (isa<MemCpyInst>(MTI)) {
- MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
- B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
- MTI->getLength(),
- false, // isVolatile
- TBAA, TBAAStruct, ScopeMD, NoAliasMD);
- } else {
- assert(isa<MemMoveInst>(MTI));
- B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
- MTI->getLength(),
- false, // isVolatile
- TBAA, ScopeMD, NoAliasMD);
- }
- } else
- llvm_unreachable("unhandled MemIntrinsic");
-
- MI->eraseFromParent();
- return true;
-}
-
-// \returns true if it is OK to change the address space of constant \p C with
-// a ConstantExpr addrspacecast.
+ }
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != FlatAddrSpace);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
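One subtlety above: for a select with a constant arm, the routine returns None while the other arm is still uninitialized, so the constant can later be addrspacecast into whatever specific space that arm settles on instead of forcing the select to flat. The tiny standalone illustration below models that decision logic with a deliberately permissive stand-in for isSafeToCastConstAddrSpace; everything here is illustrative only.

#include <cstdio>
#include <limits>
#include <optional>

constexpr unsigned Uninit = std::numeric_limits<unsigned>::max();
constexpr unsigned Flat = 0; // stand-in for the flat/generic space

static unsigned join(unsigned A, unsigned B) {
  if (A == Flat || B == Flat) return Flat;
  if (A == Uninit) return B;
  if (B == Uninit) return A;
  return A == B ? A : Flat;
}

// Deliberately permissive stand-in: pretend the constant (e.g. null or undef)
// may be cast into any destination space.
static bool safeToCastConst(unsigned /*DestAS*/) { return true; }

// Decide the address space of `select %c, %src0, %src1`.
static std::optional<unsigned> selectAS(unsigned Src0AS, bool Src0IsConst,
                                        unsigned Src1AS, bool Src1IsConst) {
  // Defer while the non-constant arm is still unknown.
  if ((Src1IsConst && Src0AS == Uninit) || (Src0IsConst && Src1AS == Uninit))
    return std::nullopt;
  if (Src0IsConst && safeToCastConst(Src1AS))
    return Src1AS; // cast the constant instead of widening to flat
  if (Src1IsConst && safeToCastConst(Src0AS))
    return Src0AS;
  return join(Src0AS, Src1AS);
}

int main() {
  // select %c, %p, null  while %p is still uninferred: defer (no answer yet).
  std::printf("deferred: %d\n", !selectAS(Uninit, false, Flat, true).has_value());
  // Once %p is known to live in space 3, the select can be in space 3 too.
  std::printf("resolved to: %u\n", *selectAS(3, false, Flat, true));
}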
+
+/// \returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value. If the memory instruction is volatile,
+/// return true only if the target allows the memory instruction to be volatile
+/// in the new address space.
+static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI,
+ Use &U, unsigned AddrSpace) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+ bool VolatileIsAllowed = false;
+ if (auto *I = dyn_cast<Instruction>(Inst))
+ VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace);
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !LI->isVolatile());
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !SI->isVolatile());
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !RMW->isVolatile());
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst))
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !CmpX->isVolatile());
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(),
+ MaybeAlign(MSI->getDestAlignment()),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
+ MTI->getLength(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
+ MTI->getLength(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
+
+// \returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
bool InferAddressSpacesImpl::isSafeToCastConstAddrSpace(Constant *C,
unsigned NewAS) const {
- assert(NewAS != UninitializedAddressSpace);
-
- unsigned SrcAS = C->getType()->getPointerAddressSpace();
- if (SrcAS == NewAS || isa<UndefValue>(C))
- return true;
-
- // Prevent illegal casts between different non-flat address spaces.
- if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
- return false;
-
- if (isa<ConstantPointerNull>(C))
- return true;
-
- if (auto *Op = dyn_cast<Operator>(C)) {
- // If we already have a constant addrspacecast, it should be safe to cast it
- // off.
- if (Op->getOpcode() == Instruction::AddrSpaceCast)
- return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
-
- if (Op->getOpcode() == Instruction::IntToPtr &&
- Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
- return true;
- }
-
- return false;
-}
-
-static Value::use_iterator skipToNextUser(Value::use_iterator I,
- Value::use_iterator End) {
- User *CurUser = I->getUser();
- ++I;
-
- while (I != End && I->getUser() == CurUser)
- ++I;
-
- return I;
-}
-
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to cast it
+ // off.
+ if (Op->getOpcode() == Instruction::AddrSpaceCast)
+ return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+ if (Op->getOpcode() == Instruction::IntToPtr &&
+ Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ return true;
+ }
+
+ return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+ Value::use_iterator End) {
+ User *CurUser = I->getUser();
+ ++I;
+
+ while (I != End && I->getUser() == CurUser)
+ ++I;
+
+ return I;
+}
+
bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
- const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
- // For each address expression to be modified, creates a clone of it with its
- // pointer operands converted to the new address space. Since the pointer
- // operands are converted, the clone is naturally in the new address space by
- // construction.
- ValueToValueMapTy ValueWithNewAddrSpace;
- SmallVector<const Use *, 32> UndefUsesToFix;
- for (Value* V : Postorder) {
- unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
// In some degenerate cases (e.g. invalid IR in unreachable code), we may
// not even infer the value to have its original address space.
if (NewAddrSpace == UninitializedAddressSpace)
continue;
- if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
- Value *New = cloneValueWithNewAddressSpace(
- V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
- if (New)
- ValueWithNewAddrSpace[V] = New;
- }
- }
-
- if (ValueWithNewAddrSpace.empty())
- return false;
-
- // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
- for (const Use *UndefUse : UndefUsesToFix) {
- User *V = UndefUse->getUser();
- User *NewV = cast_or_null<User>(ValueWithNewAddrSpace.lookup(V));
- if (!NewV)
- continue;
-
- unsigned OperandNo = UndefUse->getOperandNo();
- assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
- NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
- }
-
- SmallVector<Instruction *, 16> DeadInstructions;
-
- // Replaces the uses of the old address expressions with the new ones.
- for (const WeakTrackingVH &WVH : Postorder) {
- assert(WVH && "value was unexpectedly deleted");
- Value *V = WVH;
- Value *NewV = ValueWithNewAddrSpace.lookup(V);
- if (NewV == nullptr)
- continue;
-
- LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n with\n "
- << *NewV << '\n');
-
- if (Constant *C = dyn_cast<Constant>(V)) {
- Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
- C->getType());
- if (C != Replace) {
- LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
- << ": " << *Replace << '\n');
- C->replaceAllUsesWith(Replace);
- V = Replace;
- }
- }
-
- Value::use_iterator I, E, Next;
- for (I = V->use_begin(), E = V->use_end(); I != E; ) {
- Use &U = *I;
-
- // Some users may see the same pointer operand in multiple operands. Skip
- // to the next instruction.
- I = skipToNextUser(I, E);
-
- if (isSimplePointerUseValidToReplace(
- TTI, U, V->getType()->getPointerAddressSpace())) {
- // If V is used as the pointer operand of a compatible memory operation,
- // sets the pointer operand to NewV. This replacement does not change
- // the element type, so the resultant load/store is still valid.
- U.set(NewV);
- continue;
- }
-
- User *CurUser = U.getUser();
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ Value *New = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ if (New)
+ ValueWithNewAddrSpace[V] = New;
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use *UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast_or_null<User>(ValueWithNewAddrSpace.lookup(V));
+ if (!NewV)
+ continue;
+
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ SmallVector<Instruction *, 16> DeadInstructions;
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (const WeakTrackingVH &WVH : Postorder) {
+ assert(WVH && "value was unexpectedly deleted");
+ Value *V = WVH;
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n with\n "
+ << *NewV << '\n');
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ C->getType());
+ if (C != Replace) {
+ LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
+ << ": " << *Replace << '\n');
+ C->replaceAllUsesWith(Replace);
+ V = Replace;
+ }
+ }
+
+ Value::use_iterator I, E, Next;
+ for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+ Use &U = *I;
+
+ // Some users may see the same pointer operand in multiple operands. Skip
+ // to the next instruction.
+ I = skipToNextUser(I, E);
+
+ if (isSimplePointerUseValidToReplace(
+ TTI, U, V->getType()->getPointerAddressSpace())) {
+ // If V is used as the pointer operand of a compatible memory operation,
+ // sets the pointer operand to NewV. This replacement does not change
+ // the element type, so the resultant load/store is still valid.
+ U.set(NewV);
+ continue;
+ }
+
+ User *CurUser = U.getUser();
// Skip if the current user is the new value itself.
if (CurUser == NewV)
continue;
-      // Handle more complex cases like intrinsics that need to be remangled.
- if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
- if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
- continue;
- }
-
- if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
- if (rewriteIntrinsicOperands(II, V, NewV))
- continue;
- }
-
- if (isa<Instruction>(CurUser)) {
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
- // If we can infer that both pointers are in the same addrspace,
- // transform e.g.
- // %cmp = icmp eq float* %p, %q
- // into
- // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
-
- unsigned NewAS = NewV->getType()->getPointerAddressSpace();
- int SrcIdx = U.getOperandNo();
- int OtherIdx = (SrcIdx == 0) ? 1 : 0;
- Value *OtherSrc = Cmp->getOperand(OtherIdx);
-
- if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
- if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
- Cmp->setOperand(OtherIdx, OtherNewV);
- Cmp->setOperand(SrcIdx, NewV);
- continue;
- }
- }
-
- // Even if the type mismatches, we can cast the constant.
- if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
- if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
- Cmp->setOperand(SrcIdx, NewV);
- Cmp->setOperand(OtherIdx,
- ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
- continue;
- }
- }
- }
-
- if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
- unsigned NewAS = NewV->getType()->getPointerAddressSpace();
- if (ASC->getDestAddressSpace() == NewAS) {
- if (ASC->getType()->getPointerElementType() !=
- NewV->getType()->getPointerElementType()) {
- NewV = CastInst::Create(Instruction::BitCast, NewV,
- ASC->getType(), "", ASC);
- }
- ASC->replaceAllUsesWith(NewV);
- DeadInstructions.push_back(ASC);
- continue;
- }
- }
-
- // Otherwise, replaces the use with flat(NewV).
- if (Instruction *Inst = dyn_cast<Instruction>(V)) {
- // Don't create a copy of the original addrspacecast.
- if (U == V && isa<AddrSpaceCastInst>(V))
- continue;
-
- BasicBlock::iterator InsertPos = std::next(Inst->getIterator());
- while (isa<PHINode>(InsertPos))
- ++InsertPos;
- U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
- } else {
- U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
- V->getType()));
- }
- }
- }
-
- if (V->use_empty()) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- DeadInstructions.push_back(I);
- }
- }
-
- for (Instruction *I : DeadInstructions)
- RecursivelyDeleteTriviallyDeadInstructions(I);
-
- return true;
-}
-
+      // Handle more complex cases like intrinsics that need to be remangled.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+ if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+ continue;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+ if (rewriteIntrinsicOperands(II, V, NewV))
+ continue;
+ }
+
+ if (isa<Instruction>(CurUser)) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+ // If we can infer that both pointers are in the same addrspace,
+ // transform e.g.
+ // %cmp = icmp eq float* %p, %q
+ // into
+ // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ int SrcIdx = U.getOperandNo();
+ int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+ Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+ if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+ if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+ Cmp->setOperand(OtherIdx, OtherNewV);
+ Cmp->setOperand(SrcIdx, NewV);
+ continue;
+ }
+ }
+
+ // Even if the type mismatches, we can cast the constant.
+ if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+ if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+ Cmp->setOperand(SrcIdx, NewV);
+ Cmp->setOperand(OtherIdx,
+ ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+ continue;
+ }
+ }
+ }
+
+ if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ if (ASC->getDestAddressSpace() == NewAS) {
+ if (ASC->getType()->getPointerElementType() !=
+ NewV->getType()->getPointerElementType()) {
+ NewV = CastInst::Create(Instruction::BitCast, NewV,
+ ASC->getType(), "", ASC);
+ }
+ ASC->replaceAllUsesWith(NewV);
+ DeadInstructions.push_back(ASC);
+ continue;
+ }
+ }
+
+ // Otherwise, replaces the use with flat(NewV).
+ if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+ // Don't create a copy of the original addrspacecast.
+ if (U == V && isa<AddrSpaceCastInst>(V))
+ continue;
+
+ BasicBlock::iterator InsertPos = std::next(Inst->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+
+ if (V->use_empty()) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ DeadInstructions.push_back(I);
+ }
+ }
+
+ for (Instruction *I : DeadInstructions)
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+
+ return true;
+}
+
bool InferAddressSpaces::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -1196,9 +1196,9 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
.run(F);
}
-FunctionPass *llvm::createInferAddressSpacesPass(unsigned AddressSpace) {
- return new InferAddressSpaces(AddressSpace);
-}
+FunctionPass *llvm::createInferAddressSpacesPass(unsigned AddressSpace) {
+ return new InferAddressSpaces(AddressSpace);
+}
InferAddressSpacesPass::InferAddressSpacesPass()
: FlatAddrSpace(UninitializedAddressSpace) {}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp
index 2c47a99985..c11d2e4c1d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -1,148 +1,148 @@
-//===- InstSimplifyPass.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
+//===- InstSimplifyPass.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "instsimplify"
-
-STATISTIC(NumSimplified, "Number of redundant instructions removed");
-
-static bool runImpl(Function &F, const SimplifyQuery &SQ,
- OptimizationRemarkEmitter *ORE) {
- SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
- bool Changed = false;
-
- do {
- for (BasicBlock &BB : F) {
- // Unreachable code can take on strange forms that we are not prepared to
- // handle. For example, an instruction may have itself as an operand.
- if (!SQ.DT->isReachableFromEntry(&BB))
- continue;
-
- SmallVector<WeakTrackingVH, 8> DeadInstsInBB;
- for (Instruction &I : BB) {
- // The first time through the loop, ToSimplify is empty and we try to
- // simplify all instructions. On later iterations, ToSimplify is not
- // empty and we only bother simplifying instructions that are in it.
- if (!ToSimplify->empty() && !ToSimplify->count(&I))
- continue;
-
- // Don't waste time simplifying dead/unused instructions.
- if (isInstructionTriviallyDead(&I)) {
- DeadInstsInBB.push_back(&I);
- Changed = true;
- } else if (!I.use_empty()) {
- if (Value *V = SimplifyInstruction(&I, SQ, ORE)) {
- // Mark all uses for resimplification next time round the loop.
- for (User *U : I.users())
- Next->insert(cast<Instruction>(U));
- I.replaceAllUsesWith(V);
- ++NumSimplified;
- Changed = true;
- // A call can get simplified, but it may not be trivially dead.
- if (isInstructionTriviallyDead(&I))
- DeadInstsInBB.push_back(&I);
- }
- }
- }
- RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI);
- }
-
- // Place the list of instructions to simplify on the next loop iteration
- // into ToSimplify.
- std::swap(ToSimplify, Next);
- Next->clear();
- } while (!ToSimplify->empty());
-
- return Changed;
-}
-
-namespace {
-struct InstSimplifyLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- InstSimplifyLegacyPass() : FunctionPass(ID) {
- initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
-
- /// Remove instructions that simplify.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- const DominatorTree *DT =
- &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- OptimizationRemarkEmitter *ORE =
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- const DataLayout &DL = F.getParent()->getDataLayout();
- const SimplifyQuery SQ(DL, TLI, DT, AC);
- return runImpl(F, SQ, ORE);
- }
-};
-} // namespace
-
-char InstSimplifyLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
- "Remove redundant instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
- "Remove redundant instructions", false, false)
-
-// Public interface to the simplify instructions pass.
-FunctionPass *llvm::createInstSimplifyLegacyPass() {
- return new InstSimplifyLegacyPass();
-}
-
-PreservedAnalyses InstSimplifyPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- const DataLayout &DL = F.getParent()->getDataLayout();
- const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
- bool Changed = runImpl(F, SQ, &ORE);
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions removed");
+
+static bool runImpl(Function &F, const SimplifyQuery &SQ,
+ OptimizationRemarkEmitter *ORE) {
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+ bool Changed = false;
+
+ do {
+ for (BasicBlock &BB : F) {
+ // Unreachable code can take on strange forms that we are not prepared to
+ // handle. For example, an instruction may have itself as an operand.
+ if (!SQ.DT->isReachableFromEntry(&BB))
+ continue;
+
+ SmallVector<WeakTrackingVH, 8> DeadInstsInBB;
+ for (Instruction &I : BB) {
+ // The first time through the loop, ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations, ToSimplify is not
+ // empty and we only bother simplifying instructions that are in it.
+ if (!ToSimplify->empty() && !ToSimplify->count(&I))
+ continue;
+
+ // Don't waste time simplifying dead/unused instructions.
+ if (isInstructionTriviallyDead(&I)) {
+ DeadInstsInBB.push_back(&I);
+ Changed = true;
+ } else if (!I.use_empty()) {
+ if (Value *V = SimplifyInstruction(&I, SQ, ORE)) {
+ // Mark all uses for resimplification next time round the loop.
+ for (User *U : I.users())
+ Next->insert(cast<Instruction>(U));
+ I.replaceAllUsesWith(V);
+ ++NumSimplified;
+ Changed = true;
+ // A call can get simplified, but it may not be trivially dead.
+ if (isInstructionTriviallyDead(&I))
+ DeadInstsInBB.push_back(&I);
+ }
+ }
+ }
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI);
+ }
+
+ // Place the list of instructions to simplify on the next loop iteration
+ // into ToSimplify.
+ std::swap(ToSimplify, Next);
+ Next->clear();
+ } while (!ToSimplify->empty());
+
+ return Changed;
+}
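The two-set ping-pong in runImpl (simplify everything on the first pass, then only revisit users of whatever changed, swapping ToSimplify and Next each round) is a reusable pattern in its own right. Below is a standalone sketch of that control flow with integers standing in for instructions; the "simplification" rule is made up purely to drive the loop and is not part of the patched sources.

#include <cstdio>
#include <set>
#include <vector>

int main() {
  // Five pretend "instructions"; Users[i] lists who consumes i's result.
  std::vector<std::vector<int>> Users = {{1}, {2}, {}, {4}, {}};
  // Pretend simplification rule: an odd-numbered instruction simplifies once.
  std::vector<bool> Simplified(5, false);

  std::set<int> S1, S2, *ToSimplify = &S1, *Next = &S2;
  int Rounds = 0;
  do {
    ++Rounds;
    for (int I = 0; I < 5; ++I) {
      // First round: look at everything. Later rounds: only queued items.
      if (!ToSimplify->empty() && !ToSimplify->count(I))
        continue;
      if (!Simplified[I] && I % 2 == 1) {
        Simplified[I] = true;
        for (int U : Users[I])      // its users may simplify next round
          Next->insert(U);
      }
    }
    std::swap(ToSimplify, Next);    // ping-pong the two sets
    Next->clear();
  } while (!ToSimplify->empty());

  std::printf("rounds: %d\n", Rounds); // prints "rounds: 2"
}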
+
+namespace {
+struct InstSimplifyLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstSimplifyLegacyPass() : FunctionPass(ID) {
+ initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ /// Remove instructions that simplify.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, TLI, DT, AC);
+ return runImpl(F, SQ, ORE);
+ }
+};
+} // namespace
+
+char InstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
+ "Remove redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
+ "Remove redundant instructions", false, false)
+
+// Public interface to the simplify instructions pass.
+FunctionPass *llvm::createInstSimplifyLegacyPass() {
+ return new InstSimplifyLegacyPass();
+}
+
+PreservedAnalyses InstSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+ bool Changed = runImpl(F, SQ, &ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
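
For readers unfamiliar with the new pass manager entry point above, a rough, hypothetical driver that runs instsimplify on a single function could be set up as follows (assuming the usual LLVM 12 new-PM boilerplate; the analysis managers must be registered and cross-wired before FPM.run, since InstSimplifyPass::run pulls DominatorTree, TargetLibraryInfo, AssumptionCache and ORE out of the FunctionAnalysisManager):

  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/Scalar/InstSimplifyPass.h"

  // Sketch only: assumes F lives inside a valid llvm::Module.
  static bool simplifyFunction(llvm::Function &F) {
    llvm::LoopAnalysisManager LAM;
    llvm::FunctionAnalysisManager FAM;
    llvm::CGSCCAnalysisManager CGAM;
    llvm::ModuleAnalysisManager MAM;

    llvm::PassBuilder PB;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    llvm::FunctionPassManager FPM;
    FPM.addPass(llvm::InstSimplifyPass());

    llvm::PreservedAnalyses PA = FPM.run(F, FAM);
    return !PA.areAllPreserved(); // all-preserved only when nothing was simplified
  }

The same pass is reachable from the command line as opt -passes=instsimplify.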
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp
index 4ba1bea9f0..10b08b4e22 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1,724 +1,724 @@
-//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Jump Threading pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/JumpThreading.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <utility>
-
-using namespace llvm;
-using namespace jumpthreading;
-
-#define DEBUG_TYPE "jump-threading"
-
-STATISTIC(NumThreads, "Number of jumps threaded");
-STATISTIC(NumFolds, "Number of terminators folded");
-STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
-
-static cl::opt<unsigned>
-BBDuplicateThreshold("jump-threading-threshold",
- cl::desc("Max block size to duplicate for jump threading"),
- cl::init(6), cl::Hidden);
-
-static cl::opt<unsigned>
-ImplicationSearchThreshold(
- "jump-threading-implication-search-threshold",
- cl::desc("The number of predecessors to search for a stronger "
- "condition to use to thread over a weaker condition"),
- cl::init(3), cl::Hidden);
-
-static cl::opt<bool> PrintLVIAfterJumpThreading(
- "print-lvi-after-jump-threading",
- cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
- cl::Hidden);
-
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using namespace jumpthreading;
+
+#define DEBUG_TYPE "jump-threading"
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds, "Number of terminators folded");
+STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
+
+static cl::opt<unsigned>
+BBDuplicateThreshold("jump-threading-threshold",
+ cl::desc("Max block size to duplicate for jump threading"),
+ cl::init(6), cl::Hidden);
+
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+ "jump-threading-implication-search-threshold",
+ cl::desc("The number of predecessors to search for a stronger "
+ "condition to use to thread over a weaker condition"),
+ cl::init(3), cl::Hidden);
+
+static cl::opt<bool> PrintLVIAfterJumpThreading(
+ "print-lvi-after-jump-threading",
+ cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
+ cl::Hidden);
+
static cl::opt<bool> JumpThreadingFreezeSelectCond(
"jump-threading-freeze-select-cond",
cl::desc("Freeze the condition when unfolding select"), cl::init(false),
cl::Hidden);
-static cl::opt<bool> ThreadAcrossLoopHeaders(
- "jump-threading-across-loop-headers",
- cl::desc("Allow JumpThreading to thread across loop headers, for testing"),
- cl::init(false), cl::Hidden);
-
-
-namespace {
-
- /// This pass performs 'jump threading', which looks at blocks that have
- /// multiple predecessors and multiple successors. If one or more of the
- /// predecessors of the block can be proven to always jump to one of the
- /// successors, we forward the edge from the predecessor to the successor by
- /// duplicating the contents of this block.
- ///
- /// An example of when this can occur is code like this:
- ///
- /// if () { ...
- /// X = 4;
- /// }
- /// if (X < 3) {
- ///
- /// In this case, the unconditional branch at the end of the first if can be
- /// revectored to the false side of the second if.
- class JumpThreading : public FunctionPass {
- JumpThreadingPass Impl;
-
- public:
- static char ID; // Pass identification
-
+static cl::opt<bool> ThreadAcrossLoopHeaders(
+ "jump-threading-across-loop-headers",
+ cl::desc("Allow JumpThreading to thread across loop headers, for testing"),
+ cl::init(false), cl::Hidden);
+
+
+namespace {
+
+ /// This pass performs 'jump threading', which looks at blocks that have
+ /// multiple predecessors and multiple successors. If one or more of the
+ /// predecessors of the block can be proven to always jump to one of the
+ /// successors, we forward the edge from the predecessor to the successor by
+ /// duplicating the contents of this block.
+ ///
+ /// An example of when this can occur is code like this:
+ ///
+ /// if () { ...
+ /// X = 4;
+ /// }
+ /// if (X < 3) {
+ ///
+ /// In this case, the unconditional branch at the end of the first if can be
+ /// revectored to the false side of the second if.
+ class JumpThreading : public FunctionPass {
+ JumpThreadingPass Impl;
+
+ public:
+ static char ID; // Pass identification
+
JumpThreading(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1)
: FunctionPass(ID), Impl(InsertFreezeWhenUnfoldingSelect, T) {
- initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<LazyValueInfoWrapperPass>();
- AU.addPreserved<LazyValueInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
+ initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- void releaseMemory() override { Impl.releaseMemory(); }
- };
-
-} // end anonymous namespace
-
-char JumpThreading::ID = 0;
-
-INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
- "Jump Threading", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(JumpThreading, "jump-threading",
- "Jump Threading", false, false)
-
-// Public interface to the Jump Threading pass
+ }
+
+ void releaseMemory() override { Impl.releaseMemory(); }
+ };
+
+} // end anonymous namespace
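
A hypothetical source-level picture of the transformation the class comment above describes (made-up function names; the real pass works on the CFG, not on C++ source):

  int before(bool p) {
    int X = 0;
    if (p)
      X = 4;      // on this path the later test "X < 3" is provably false
    if (X < 3)
      return 1;
    return 2;
  }

  // After threading, the predecessor that stores 4 branches directly to the
  // block the second test would have selected, so that test is only evaluated
  // on the path where its outcome is genuinely unknown.
  int after(bool p) {
    if (p)
      return 2;   // forwarded past the dead "X < 3" check
    return 1;     // X == 0 here, so "X < 3" is true
  }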
+
+char JumpThreading::ID = 0;
+
+INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+
+// Public interface to the Jump Threading pass
FunctionPass *llvm::createJumpThreadingPass(bool InsertFr, int Threshold) {
return new JumpThreading(InsertFr, Threshold);
-}
-
+}
+
JumpThreadingPass::JumpThreadingPass(bool InsertFr, int T) {
InsertFreezeWhenUnfoldingSelect = JumpThreadingFreezeSelectCond | InsertFr;
- DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
-}
-
-// Update branch probability information according to conditional
-// branch probability. This is usually made possible for cloned branches
-// in inline instances by the context specific profile in the caller.
-// For instance,
-//
-// [Block PredBB]
-// [Branch PredBr]
-// if (t) {
-// Block A;
-// } else {
-// Block B;
-// }
-//
-// [Block BB]
-// cond = PN([true, %A], [..., %B]); // PHI node
-// [Branch CondBr]
-// if (cond) {
-// ... // P(cond == true) = 1%
-// }
-//
-// Here we know that when block A is taken, cond must be true, which means
-// P(cond == true | A) = 1
-//
-// Given that P(cond == true) = P(cond == true | A) * P(A) +
-// P(cond == true | B) * P(B)
-// we get:
-// P(cond == true ) = P(A) + P(cond == true | B) * P(B)
-//
-// which gives us:
-// P(A) is less than P(cond == true), i.e.
-// P(t == true) <= P(cond == true)
-//
-// In other words, if we know P(cond == true) is unlikely, we know
-// that P(t == true) is also unlikely.
-//
-static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- if (!CondBr)
- return;
-
- uint64_t TrueWeight, FalseWeight;
- if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
- return;
-
- if (TrueWeight + FalseWeight == 0)
- // Zero branch_weights do not give a hint for getting branch probabilities.
- // Technically it would result in division by zero denominator, which is
- // TrueWeight + FalseWeight.
- return;
-
- // Returns the outgoing edge of the dominating predecessor block
- // that leads to the PhiNode's incoming block:
- auto GetPredOutEdge =
- [](BasicBlock *IncomingBB,
- BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
- auto *PredBB = IncomingBB;
- auto *SuccBB = PhiBB;
- SmallPtrSet<BasicBlock *, 16> Visited;
- while (true) {
- BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (PredBr && PredBr->isConditional())
- return {PredBB, SuccBB};
- Visited.insert(PredBB);
- auto *SinglePredBB = PredBB->getSinglePredecessor();
- if (!SinglePredBB)
- return {nullptr, nullptr};
-
- // Stop searching when SinglePredBB has been visited. It means we see
- // an unreachable loop.
- if (Visited.count(SinglePredBB))
- return {nullptr, nullptr};
-
- SuccBB = PredBB;
- PredBB = SinglePredBB;
- }
- };
-
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *PhiOpnd = PN->getIncomingValue(i);
- ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
-
- if (!CI || !CI->getType()->isIntegerTy(1))
- continue;
-
- BranchProbability BP =
- (CI->isOne() ? BranchProbability::getBranchProbability(
- TrueWeight, TrueWeight + FalseWeight)
- : BranchProbability::getBranchProbability(
- FalseWeight, TrueWeight + FalseWeight));
-
- auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
- if (!PredOutEdge.first)
- return;
-
- BasicBlock *PredBB = PredOutEdge.first;
- BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (!PredBr)
- return;
-
- uint64_t PredTrueWeight, PredFalseWeight;
- // FIXME: We currently only set the profile data when it is missing.
- // With PGO, this can be used to refine even existing profile data with
- // context information. This needs to be done after more performance
- // testing.
- if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
- continue;
-
- // We can not infer anything useful when BP >= 50%, because BP is the
- // upper bound probability value.
- if (BP >= BranchProbability(50, 100))
- continue;
-
- SmallVector<uint32_t, 2> Weights;
- if (PredBr->getSuccessor(0) == PredOutEdge.second) {
- Weights.push_back(BP.getNumerator());
- Weights.push_back(BP.getCompl().getNumerator());
- } else {
- Weights.push_back(BP.getCompl().getNumerator());
- Weights.push_back(BP.getNumerator());
- }
- PredBr->setMetadata(LLVMContext::MD_prof,
- MDBuilder(PredBr->getParent()->getContext())
- .createBranchWeights(Weights));
- }
-}
-
-/// runOnFunction - Toplevel algorithm.
-bool JumpThreading::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
+ DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+}
+
+// Update branch probability information according to conditional
+// branch probability. This is usually made possible for cloned branches
+// in inline instances by the context specific profile in the caller.
+// For instance,
+//
+// [Block PredBB]
+// [Branch PredBr]
+// if (t) {
+// Block A;
+// } else {
+// Block B;
+// }
+//
+// [Block BB]
+// cond = PN([true, %A], [..., %B]); // PHI node
+// [Branch CondBr]
+// if (cond) {
+// ... // P(cond == true) = 1%
+// }
+//
+// Here we know that when block A is taken, cond must be true, which means
+// P(cond == true | A) = 1
+//
+// Given that P(cond == true) = P(cond == true | A) * P(A) +
+// P(cond == true | B) * P(B)
+// we get:
+// P(cond == true ) = P(A) + P(cond == true | B) * P(B)
+//
+// which gives us:
+// P(A) is less than P(cond == true), i.e.
+// P(t == true) <= P(cond == true)
+//
+// In other words, if we know P(cond == true) is unlikely, we know
+// that P(t == true) is also unlikely.
+//
+static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!CondBr)
+ return;
+
+ uint64_t TrueWeight, FalseWeight;
+ if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
+ return;
+
+ if (TrueWeight + FalseWeight == 0)
+ // Zero branch_weights do not give a hint for getting branch probabilities.
+ // Technically it would result in division by zero denominator, which is
+ // TrueWeight + FalseWeight.
+ return;
+
+ // Returns the outgoing edge of the dominating predecessor block
+ // that leads to the PhiNode's incoming block:
+ auto GetPredOutEdge =
+ [](BasicBlock *IncomingBB,
+ BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
+ auto *PredBB = IncomingBB;
+ auto *SuccBB = PhiBB;
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ while (true) {
+ BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (PredBr && PredBr->isConditional())
+ return {PredBB, SuccBB};
+ Visited.insert(PredBB);
+ auto *SinglePredBB = PredBB->getSinglePredecessor();
+ if (!SinglePredBB)
+ return {nullptr, nullptr};
+
+ // Stop searching when SinglePredBB has been visited. It means we see
+ // an unreachable loop.
+ if (Visited.count(SinglePredBB))
+ return {nullptr, nullptr};
+
+ SuccBB = PredBB;
+ PredBB = SinglePredBB;
+ }
+ };
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *PhiOpnd = PN->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+
+ if (!CI || !CI->getType()->isIntegerTy(1))
+ continue;
+
+ BranchProbability BP =
+ (CI->isOne() ? BranchProbability::getBranchProbability(
+ TrueWeight, TrueWeight + FalseWeight)
+ : BranchProbability::getBranchProbability(
+ FalseWeight, TrueWeight + FalseWeight));
+
+ auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
+ if (!PredOutEdge.first)
+ return;
+
+ BasicBlock *PredBB = PredOutEdge.first;
+ BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (!PredBr)
+ return;
+
+ uint64_t PredTrueWeight, PredFalseWeight;
+ // FIXME: We currently only set the profile data when it is missing.
+ // With PGO, this can be used to refine even existing profile data with
+ // context information. This needs to be done after more performance
+ // testing.
+ if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
+ continue;
+
+ // We can not infer anything useful when BP >= 50%, because BP is the
+ // upper bound probability value.
+ if (BP >= BranchProbability(50, 100))
+ continue;
+
+ SmallVector<uint32_t, 2> Weights;
+ if (PredBr->getSuccessor(0) == PredOutEdge.second) {
+ Weights.push_back(BP.getNumerator());
+ Weights.push_back(BP.getCompl().getNumerator());
+ } else {
+ Weights.push_back(BP.getCompl().getNumerator());
+ Weights.push_back(BP.getNumerator());
+ }
+ PredBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(PredBr->getParent()->getContext())
+ .createBranchWeights(Weights));
+ }
+}
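
A worked instance of the bound derived in the comment block above, with illustrative numbers: suppose the profile says P(cond == true) = 0.01 and the PHI feeds cond the constant true exactly on the edge from A. Then

  \[
  P(\mathrm{cond}=\mathrm{true})
    = \underbrace{P(\mathrm{cond}=\mathrm{true}\mid A)}_{=\,1} P(A)
      + P(\mathrm{cond}=\mathrm{true}\mid B)\,P(B)
    \;\ge\; P(A),
  \]

so P(A) <= 0.01. The loop above therefore writes branch weights of 1:99 on PredBr, putting BP on the successor that leads toward A and its complement on the other, and it deliberately bails out when BP >= 50%, where this upper bound says nothing useful.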
+
+/// runOnFunction - Toplevel algorithm.
+bool JumpThreading::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
auto TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 // Jump threading makes no sense for targets with divergent CF
if (TTI->hasBranchDivergence())
return false;
- auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
- std::unique_ptr<BlockFrequencyInfo> BFI;
- std::unique_ptr<BranchProbabilityInfo> BPI;
- if (F.hasProfileData()) {
- LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
- BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
- }
-
- bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(),
- std::move(BFI), std::move(BPI));
- if (PrintLVIAfterJumpThreading) {
- dbgs() << "LVI for function '" << F.getName() << "':\n";
- LVI->printLVI(F, DTU.getDomTree(), dbgs());
- }
- return Changed;
-}
-
-PreservedAnalyses JumpThreadingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
+ auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ if (F.hasProfileData()) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(),
+ std::move(BFI), std::move(BPI));
+ if (PrintLVIAfterJumpThreading) {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ LVI->printLVI(F, DTU.getDomTree(), dbgs());
+ }
+ return Changed;
+}
+
+PreservedAnalyses JumpThreadingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
 // Jump threading makes no sense for targets with divergent CF
if (TTI.hasBranchDivergence())
return PreservedAnalyses::all();
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LVI = AM.getResult<LazyValueAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
- std::unique_ptr<BlockFrequencyInfo> BFI;
- std::unique_ptr<BranchProbabilityInfo> BPI;
- if (F.hasProfileData()) {
- LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
- BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
- }
-
- bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
- std::move(BFI), std::move(BPI));
-
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ if (F.hasProfileData()) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
+ std::move(BFI), std::move(BPI));
+
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
LVI.printLVI(F, DTU.getDomTree(), dbgs());
}
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LazyValueAnalysis>();
- return PA;
-}
-
-bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- LazyValueInfo *LVI_, AliasAnalysis *AA_,
- DomTreeUpdater *DTU_, bool HasProfileData_,
- std::unique_ptr<BlockFrequencyInfo> BFI_,
- std::unique_ptr<BranchProbabilityInfo> BPI_) {
- LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
- TLI = TLI_;
- LVI = LVI_;
- AA = AA_;
- DTU = DTU_;
- BFI.reset();
- BPI.reset();
- // When profile data is available, we need to update edge weights after
- // successful jump threading, which requires both BPI and BFI being available.
- HasProfileData = HasProfileData_;
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- HasGuards = GuardDecl && !GuardDecl->use_empty();
- if (HasProfileData) {
- BPI = std::move(BPI_);
- BFI = std::move(BFI_);
- }
-
- // Reduce the number of instructions duplicated when optimizing strictly for
- // size.
- if (BBDuplicateThreshold.getNumOccurrences())
- BBDupThreshold = BBDuplicateThreshold;
- else if (F.hasFnAttribute(Attribute::MinSize))
- BBDupThreshold = 3;
- else
- BBDupThreshold = DefaultBBDupThreshold;
-
- // JumpThreading must not process blocks unreachable from entry. It's a
- // waste of compute time and can potentially lead to hangs.
- SmallPtrSet<BasicBlock *, 16> Unreachable;
- assert(DTU && "DTU isn't passed into JumpThreading before using it.");
- assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed.");
- DominatorTree &DT = DTU->getDomTree();
- for (auto &BB : F)
- if (!DT.isReachableFromEntry(&BB))
- Unreachable.insert(&BB);
-
- if (!ThreadAcrossLoopHeaders)
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LazyValueAnalysis>();
+ return PA;
+}
+
+bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ DomTreeUpdater *DTU_, bool HasProfileData_,
+ std::unique_ptr<BlockFrequencyInfo> BFI_,
+ std::unique_ptr<BranchProbabilityInfo> BPI_) {
+ LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ TLI = TLI_;
+ LVI = LVI_;
+ AA = AA_;
+ DTU = DTU_;
+ BFI.reset();
+ BPI.reset();
+ // When profile data is available, we need to update edge weights after
+ // successful jump threading, which requires both BPI and BFI being available.
+ HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
+ if (HasProfileData) {
+ BPI = std::move(BPI_);
+ BFI = std::move(BFI_);
+ }
+
+ // Reduce the number of instructions duplicated when optimizing strictly for
+ // size.
+ if (BBDuplicateThreshold.getNumOccurrences())
+ BBDupThreshold = BBDuplicateThreshold;
+ else if (F.hasFnAttribute(Attribute::MinSize))
+ BBDupThreshold = 3;
+ else
+ BBDupThreshold = DefaultBBDupThreshold;
+
+ // JumpThreading must not process blocks unreachable from entry. It's a
+ // waste of compute time and can potentially lead to hangs.
+ SmallPtrSet<BasicBlock *, 16> Unreachable;
+ assert(DTU && "DTU isn't passed into JumpThreading before using it.");
+ assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed.");
+ DominatorTree &DT = DTU->getDomTree();
+ for (auto &BB : F)
+ if (!DT.isReachableFromEntry(&BB))
+ Unreachable.insert(&BB);
+
+ if (!ThreadAcrossLoopHeaders)
findLoopHeaders(F);
-
- bool EverChanged = false;
- bool Changed;
- do {
- Changed = false;
- for (auto &BB : F) {
- if (Unreachable.count(&BB))
- continue;
+
+ bool EverChanged = false;
+ bool Changed;
+ do {
+ Changed = false;
+ for (auto &BB : F) {
+ if (Unreachable.count(&BB))
+ continue;
while (processBlock(&BB)) // Thread all of the branches we can over BB.
- Changed = true;
-
- // Jump threading may have introduced redundant debug values into BB
- // which should be removed.
- if (Changed)
- RemoveRedundantDbgInstrs(&BB);
-
- // Stop processing BB if it's the entry or is now deleted. The following
- // routines attempt to eliminate BB, and locating a suitable replacement
- // for the entry is non-trivial.
- if (&BB == &F.getEntryBlock() || DTU->isBBPendingDeletion(&BB))
- continue;
-
- if (pred_empty(&BB)) {
+ Changed = true;
+
+ // Jump threading may have introduced redundant debug values into BB
+ // which should be removed.
+ if (Changed)
+ RemoveRedundantDbgInstrs(&BB);
+
+ // Stop processing BB if it's the entry or is now deleted. The following
+ // routines attempt to eliminate BB, and locating a suitable replacement
+ // for the entry is non-trivial.
+ if (&BB == &F.getEntryBlock() || DTU->isBBPendingDeletion(&BB))
+ continue;
+
+ if (pred_empty(&BB)) {
// When processBlock makes BB unreachable it doesn't bother to fix up
- // the instructions in it. We must remove BB to prevent invalid IR.
- LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
- << "' with terminator: " << *BB.getTerminator()
- << '\n');
- LoopHeaders.erase(&BB);
- LVI->eraseBlock(&BB);
- DeleteDeadBlock(&BB, DTU);
- Changed = true;
- continue;
- }
-
+ // the instructions in it. We must remove BB to prevent invalid IR.
+ LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
+ << "' with terminator: " << *BB.getTerminator()
+ << '\n');
+ LoopHeaders.erase(&BB);
+ LVI->eraseBlock(&BB);
+ DeleteDeadBlock(&BB, DTU);
+ Changed = true;
+ continue;
+ }
+
// processBlock doesn't thread BBs with unconditional TIs. However, if BB
- // is "almost empty", we attempt to merge BB with its sole successor.
- auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
- if (BI && BI->isUnconditional()) {
- BasicBlock *Succ = BI->getSuccessor(0);
- if (
- // The terminator must be the only non-phi instruction in BB.
- BB.getFirstNonPHIOrDbg()->isTerminator() &&
- // Don't alter Loop headers and latches to ensure another pass can
- // detect and transform nested loops later.
- !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) &&
- TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
- RemoveRedundantDbgInstrs(Succ);
- // BB is valid for cleanup here because we passed in DTU. F remains
- // BB's parent until a DTU->getDomTree() event.
- LVI->eraseBlock(&BB);
- Changed = true;
- }
- }
- }
- EverChanged |= Changed;
- } while (Changed);
-
- LoopHeaders.clear();
- return EverChanged;
-}
-
-// Replace uses of Cond with ToVal when safe to do so. If all uses are
-// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
-// because we may incorrectly replace uses when guards/assumes are uses
-// of `Cond` and we used the guards/assume to reason about the `Cond` value
-// at the end of block. RAUW unconditionally replaces all uses
-// including the guards/assumes themselves and the uses before the
-// guard/assume.
+ // is "almost empty", we attempt to merge BB with its sole successor.
+ auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
+ if (BI && BI->isUnconditional()) {
+ BasicBlock *Succ = BI->getSuccessor(0);
+ if (
+ // The terminator must be the only non-phi instruction in BB.
+ BB.getFirstNonPHIOrDbg()->isTerminator() &&
+ // Don't alter Loop headers and latches to ensure another pass can
+ // detect and transform nested loops later.
+ !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) &&
+ TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
+ RemoveRedundantDbgInstrs(Succ);
+ // BB is valid for cleanup here because we passed in DTU. F remains
+ // BB's parent until a DTU->getDomTree() event.
+ LVI->eraseBlock(&BB);
+ Changed = true;
+ }
+ }
+ }
+ EverChanged |= Changed;
+ } while (Changed);
+
+ LoopHeaders.clear();
+ return EverChanged;
+}
+
+// Replace uses of Cond with ToVal when safe to do so. If all uses are
+// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
+// because we may incorrectly replace uses when guards/assumes are uses
+// of `Cond` and we used the guards/assume to reason about the `Cond` value
+// at the end of block. RAUW unconditionally replaces all uses
+// including the guards/assumes themselves and the uses before the
+// guard/assume.
static void replaceFoldableUses(Instruction *Cond, Value *ToVal) {
- assert(Cond->getType() == ToVal->getType());
- auto *BB = Cond->getParent();
- // We can unconditionally replace all uses in non-local blocks (i.e. uses
- // strictly dominated by BB), since LVI information is true from the
- // terminator of BB.
- replaceNonLocalUsesWith(Cond, ToVal);
- for (Instruction &I : reverse(*BB)) {
- // Reached the Cond whose uses we are trying to replace, so there are no
- // more uses.
- if (&I == Cond)
- break;
- // We only replace uses in instructions that are guaranteed to reach the end
- // of BB, where we know Cond is ToVal.
- if (!isGuaranteedToTransferExecutionToSuccessor(&I))
- break;
- I.replaceUsesOfWith(Cond, ToVal);
- }
- if (Cond->use_empty() && !Cond->mayHaveSideEffects())
- Cond->eraseFromParent();
-}
-
-/// Return the cost of duplicating a piece of this block from first non-phi
-/// and before StopAt instruction to thread across it. Stop scanning the block
-/// when exceeding the threshold. If duplication is impossible, returns ~0U.
-static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
- Instruction *StopAt,
- unsigned Threshold) {
- assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
- /// Ignore PHI nodes, these will be flattened when duplication happens.
- BasicBlock::const_iterator I(BB->getFirstNonPHI());
-
- // FIXME: THREADING will delete values that are just used to compute the
- // branch, so they shouldn't count against the duplication cost.
-
- unsigned Bonus = 0;
- if (BB->getTerminator() == StopAt) {
- // Threading through a switch statement is particularly profitable. If this
- // block ends in a switch, decrease its cost to make it more likely to
- // happen.
- if (isa<SwitchInst>(StopAt))
- Bonus = 6;
-
- // The same holds for indirect branches, but slightly more so.
- if (isa<IndirectBrInst>(StopAt))
- Bonus = 8;
- }
-
- // Bump the threshold up so the early exit from the loop doesn't skip the
- // terminator-based Size adjustment at the end.
- Threshold += Bonus;
-
- // Sum up the cost of each instruction until we get to the terminator. Don't
- // include the terminator because the copy won't include it.
- unsigned Size = 0;
- for (; &*I != StopAt; ++I) {
-
- // Stop scanning the block if we've reached the threshold.
- if (Size > Threshold)
- return Size;
-
- // Debugger intrinsics don't incur code size.
- if (isa<DbgInfoIntrinsic>(I)) continue;
-
+ assert(Cond->getType() == ToVal->getType());
+ auto *BB = Cond->getParent();
+ // We can unconditionally replace all uses in non-local blocks (i.e. uses
+ // strictly dominated by BB), since LVI information is true from the
+ // terminator of BB.
+ replaceNonLocalUsesWith(Cond, ToVal);
+ for (Instruction &I : reverse(*BB)) {
+ // Reached the Cond whose uses we are trying to replace, so there are no
+ // more uses.
+ if (&I == Cond)
+ break;
+ // We only replace uses in instructions that are guaranteed to reach the end
+ // of BB, where we know Cond is ToVal.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ I.replaceUsesOfWith(Cond, ToVal);
+ }
+ if (Cond->use_empty() && !Cond->mayHaveSideEffects())
+ Cond->eraseFromParent();
+}
+
+/// Return the cost of duplicating a piece of this block from first non-phi
+/// and before StopAt instruction to thread across it. Stop scanning the block
+/// when exceeding the threshold. If duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
+ unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
+ /// Ignore PHI nodes, these will be flattened when duplication happens.
+ BasicBlock::const_iterator I(BB->getFirstNonPHI());
+
+ // FIXME: THREADING will delete values that are just used to compute the
+ // branch, so they shouldn't count against the duplication cost.
+
+ unsigned Bonus = 0;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
+
+ // Bump the threshold up so the early exit from the loop doesn't skip the
+ // terminator-based Size adjustment at the end.
+ Threshold += Bonus;
+
+ // Sum up the cost of each instruction until we get to the terminator. Don't
+ // include the terminator because the copy won't include it.
+ unsigned Size = 0;
+ for (; &*I != StopAt; ++I) {
+
+ // Stop scanning the block if we've reached the threshold.
+ if (Size > Threshold)
+ return Size;
+
+ // Debugger intrinsics don't incur code size.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+
// Pseudo-probes don't incur code size.
if (isa<PseudoProbeInst>(I))
continue;
- // If this is a pointer->pointer bitcast, it is free.
- if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
- continue;
-
+ // If this is a pointer->pointer bitcast, it is free.
+ if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
+ continue;
+
// Freeze instruction is free, too.
if (isa<FreezeInst>(I))
continue;
- // Bail out if this instruction gives back a token type, it is not possible
- // to duplicate it if it is used outside this BB.
- if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
- return ~0U;
-
- // All other instructions count for at least one unit.
- ++Size;
-
- // Calls are more expensive. If they are non-intrinsic calls, we model them
- // as having cost of 4. If they are a non-vector intrinsic, we model them
- // as having cost of 2 total, and if they are a vector intrinsic, we model
- // them as having cost 1.
- if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- if (CI->cannotDuplicate() || CI->isConvergent())
- // Blocks with NoDuplicate are modelled as having infinite cost, so they
- // are never duplicated.
- return ~0U;
- else if (!isa<IntrinsicInst>(CI))
- Size += 3;
- else if (!CI->getType()->isVectorTy())
- Size += 1;
- }
- }
-
- return Size > Bonus ? Size - Bonus : 0;
-}
-
+ // Bail out if this instruction gives back a token type, it is not possible
+ // to duplicate it if it is used outside this BB.
+ if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
+ return ~0U;
+
+ // All other instructions count for at least one unit.
+ ++Size;
+
+ // Calls are more expensive. If they are non-intrinsic calls, we model them
+ // as having cost of 4. If they are a non-vector intrinsic, we model them
+ // as having cost of 2 total, and if they are a vector intrinsic, we model
+ // them as having cost 1.
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (CI->cannotDuplicate() || CI->isConvergent())
+ // Blocks with NoDuplicate are modelled as having infinite cost, so they
+ // are never duplicated.
+ return ~0U;
+ else if (!isa<IntrinsicInst>(CI))
+ Size += 3;
+ else if (!CI->getType()->isVectorTy())
+ Size += 1;
+ }
+ }
+
+ return Size > Bonus ? Size - Bonus : 0;
+}
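
A worked example of the cost model above, for a hypothetical block that ends in a switch and contains one non-intrinsic call, one scalar intrinsic call, three ordinary instructions, and one pointer-to-pointer bitcast:

  \[
  \mathrm{Size} = \underbrace{4}_{\text{call}} + \underbrace{2}_{\text{scalar intrinsic}}
    + \underbrace{3}_{\text{plain instructions}} + \underbrace{0}_{\text{bitcast}} = 9,
  \qquad \mathrm{cost} = \mathrm{Size} - \mathrm{Bonus} = 9 - 6 = 3.
  \]

With an indirect branch instead of the switch the bonus is 8 (cost 1), and a call marked cannotDuplicate or convergent makes the block non-duplicable (~0U) no matter how small it is. The threshold was bumped by the same bonus before the scan, so the early exit and this final adjustment remain consistent.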
+
/// findLoopHeaders - We do not want jump threading to turn proper loop
-/// structures into irreducible loops. Doing this breaks up the loop nesting
-/// hierarchy and pessimizes later transformations. To prevent this from
-/// happening, we first have to find the loop headers. Here we approximate this
-/// by finding targets of backedges in the CFG.
-///
-/// Note that there definitely are cases when we want to allow threading of
-/// edges across a loop header. For example, threading a jump from outside the
-/// loop (the preheader) to an exit block of the loop is definitely profitable.
-/// It is also almost always profitable to thread backedges from within the loop
-/// to exit blocks, and is often profitable to thread backedges to other blocks
-/// within the loop (forming a nested loop). This simple analysis is not rich
-/// enough to track all of these properties and keep it up-to-date as the CFG
-/// mutates, so we don't allow any of these transformations.
+/// structures into irreducible loops. Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations. To prevent this from
+/// happening, we first have to find the loop headers. Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header. For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop). This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
void JumpThreadingPass::findLoopHeaders(Function &F) {
- SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
- FindFunctionBackedges(F, Edges);
-
- for (const auto &Edge : Edges)
- LoopHeaders.insert(Edge.second);
-}
-
-/// getKnownConstant - Helper method to determine if we can thread over a
-/// terminator with the given value as its condition, and if so what value to
-/// use for that. What kind of value this is depends on whether we want an
-/// integer or a block address, but an undef is always accepted.
-/// Returns null if Val is null or not an appropriate constant.
-static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
- if (!Val)
- return nullptr;
-
- // Undef is "known" enough.
- if (UndefValue *U = dyn_cast<UndefValue>(Val))
- return U;
-
- if (Preference == WantBlockAddress)
- return dyn_cast<BlockAddress>(Val->stripPointerCasts());
-
- return dyn_cast<ConstantInt>(Val);
-}
-
+ SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+
+ for (const auto &Edge : Edges)
+ LoopHeaders.insert(Edge.second);
+}
+
+/// getKnownConstant - Helper method to determine if we can thread over a
+/// terminator with the given value as its condition, and if so what value to
+/// use for that. What kind of value this is depends on whether we want an
+/// integer or a block address, but an undef is always accepted.
+/// Returns null if Val is null or not an appropriate constant.
+static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
+ if (!Val)
+ return nullptr;
+
+ // Undef is "known" enough.
+ if (UndefValue *U = dyn_cast<UndefValue>(Val))
+ return U;
+
+ if (Preference == WantBlockAddress)
+ return dyn_cast<BlockAddress>(Val->stripPointerCasts());
+
+ return dyn_cast<ConstantInt>(Val);
+}
+
/// computeValueKnownInPredecessors - Given a basic block BB and a value V, see
-/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
-/// in any of our predecessors. If so, return the known list of value and pred
-/// BB in the result vector.
-///
-/// This returns true if there were any known values.
+/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
+/// in any of our predecessors. If so, return the known list of value and pred
+/// BB in the result vector.
+///
+/// This returns true if there were any known values.
bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
- Value *V, BasicBlock *BB, PredValueInfo &Result,
- ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
- Instruction *CxtI) {
- // This method walks up use-def chains recursively. Because of this, we could
- // get into an infinite loop going around loops in the use-def chain. To
- // prevent this, keep track of what (value, block) pairs we've already visited
- // and terminate the search if we loop back to them
- if (!RecursionSet.insert(V).second)
- return false;
-
- // If V is a constant, then it is known in all predecessors.
- if (Constant *KC = getKnownConstant(V, Preference)) {
- for (BasicBlock *Pred : predecessors(BB))
- Result.emplace_back(KC, Pred);
-
- return !Result.empty();
- }
-
- // If V is a non-instruction value, or an instruction in a different block,
- // then it can't be derived from a PHI.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || I->getParent() != BB) {
-
- // Okay, if this is a live-in value, see if it has a known value at the end
- // of any of our predecessors.
- //
- // FIXME: This should be an edge property, not a block end property.
- /// TODO: Per PR2563, we could infer value range information about a
- /// predecessor based on its terminator.
- //
- // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
- // "I" is a non-local compare-with-a-constant instruction. This would be
- // able to handle value inequalities better, for example if the compare is
- // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
- // Perhaps getConstantOnEdge should be smart enough to do this?
- for (BasicBlock *P : predecessors(BB)) {
- // If the value is known by LazyValueInfo to be a constant in a
- // predecessor, use that information to try to thread this block.
- Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
- if (Constant *KC = getKnownConstant(PredCst, Preference))
- Result.emplace_back(KC, P);
- }
-
- return !Result.empty();
- }
-
- /// If I is a PHI node, then we know the incoming values for any constants.
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *InVal = PN->getIncomingValue(i);
- if (Constant *KC = getKnownConstant(InVal, Preference)) {
- Result.emplace_back(KC, PN->getIncomingBlock(i));
- } else {
- Constant *CI = LVI->getConstantOnEdge(InVal,
- PN->getIncomingBlock(i),
- BB, CxtI);
- if (Constant *KC = getKnownConstant(CI, Preference))
- Result.emplace_back(KC, PN->getIncomingBlock(i));
- }
- }
-
- return !Result.empty();
- }
-
+ Value *V, BasicBlock *BB, PredValueInfo &Result,
+ ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
+ Instruction *CxtI) {
+ // This method walks up use-def chains recursively. Because of this, we could
+ // get into an infinite loop going around loops in the use-def chain. To
+ // prevent this, keep track of what (value, block) pairs we've already visited
+ // and terminate the search if we loop back to them
+ if (!RecursionSet.insert(V).second)
+ return false;
+
+ // If V is a constant, then it is known in all predecessors.
+ if (Constant *KC = getKnownConstant(V, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.emplace_back(KC, Pred);
+
+ return !Result.empty();
+ }
+
+ // If V is a non-instruction value, or an instruction in a different block,
+ // then it can't be derived from a PHI.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || I->getParent() != BB) {
+
+ // Okay, if this is a live-in value, see if it has a known value at the end
+ // of any of our predecessors.
+ //
+ // FIXME: This should be an edge property, not a block end property.
+ /// TODO: Per PR2563, we could infer value range information about a
+ /// predecessor based on its terminator.
+ //
+ // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+ // "I" is a non-local compare-with-a-constant instruction. This would be
+ // able to handle value inequalities better, for example if the compare is
+ // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+ // Perhaps getConstantOnEdge should be smart enough to do this?
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
+ if (Constant *KC = getKnownConstant(PredCst, Preference))
+ Result.emplace_back(KC, P);
+ }
+
+ return !Result.empty();
+ }
+
+ /// If I is a PHI node, then we know the incoming values for any constants.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ if (Constant *KC = getKnownConstant(InVal, Preference)) {
+ Result.emplace_back(KC, PN->getIncomingBlock(i));
+ } else {
+ Constant *CI = LVI->getConstantOnEdge(InVal,
+ PN->getIncomingBlock(i),
+ BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference))
+ Result.emplace_back(KC, PN->getIncomingBlock(i));
+ }
+ }
+
+ return !Result.empty();
+ }
+
// Handle Cast instructions.
- if (CastInst *CI = dyn_cast<CastInst>(I)) {
- Value *Source = CI->getOperand(0);
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Source = CI->getOperand(0);
computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
- RecursionSet, CxtI);
- if (Result.empty())
- return false;
-
- // Convert the known values.
- for (auto &R : Result)
- R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
-
- return true;
- }
-
+ RecursionSet, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Convert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
+
+ return true;
+ }
+
if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
Value *Source = FI->getOperand(0);
computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
@@ -731,1351 +731,1351 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
return !Result.empty();
}
- // Handle some boolean conditions.
- if (I->getType()->getPrimitiveSizeInBits() == 1) {
- assert(Preference == WantInteger && "One-bit non-integer type?");
- // X | true -> true
- // X & false -> false
- if (I->getOpcode() == Instruction::Or ||
- I->getOpcode() == Instruction::And) {
- PredValueInfoTy LHSVals, RHSVals;
-
+ // Handle some boolean conditions.
+ if (I->getType()->getPrimitiveSizeInBits() == 1) {
+ assert(Preference == WantInteger && "One-bit non-integer type?");
+ // X | true -> true
+ // X & false -> false
+ if (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And) {
+ PredValueInfoTy LHSVals, RHSVals;
+
computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
- WantInteger, RecursionSet, CxtI);
+ WantInteger, RecursionSet, CxtI);
computeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals,
- WantInteger, RecursionSet, CxtI);
-
- if (LHSVals.empty() && RHSVals.empty())
- return false;
-
- ConstantInt *InterestingVal;
- if (I->getOpcode() == Instruction::Or)
- InterestingVal = ConstantInt::getTrue(I->getContext());
- else
- InterestingVal = ConstantInt::getFalse(I->getContext());
-
- SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
-
- // Scan for the sentinel. If we find an undef, force it to the
- // interesting value: x|undef -> true and x&undef -> false.
- for (const auto &LHSVal : LHSVals)
- if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
- Result.emplace_back(InterestingVal, LHSVal.second);
- LHSKnownBBs.insert(LHSVal.second);
- }
- for (const auto &RHSVal : RHSVals)
- if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
- // If we already inferred a value for this block on the LHS, don't
- // re-add it.
- if (!LHSKnownBBs.count(RHSVal.second))
- Result.emplace_back(InterestingVal, RHSVal.second);
- }
-
- return !Result.empty();
- }
-
- // Handle the NOT form of XOR.
- if (I->getOpcode() == Instruction::Xor &&
- isa<ConstantInt>(I->getOperand(1)) &&
- cast<ConstantInt>(I->getOperand(1))->isOne()) {
+ WantInteger, RecursionSet, CxtI);
+
+ if (LHSVals.empty() && RHSVals.empty())
+ return false;
+
+ ConstantInt *InterestingVal;
+ if (I->getOpcode() == Instruction::Or)
+ InterestingVal = ConstantInt::getTrue(I->getContext());
+ else
+ InterestingVal = ConstantInt::getFalse(I->getContext());
+
+ SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
+ // Scan for the sentinel. If we find an undef, force it to the
+ // interesting value: x|undef -> true and x&undef -> false.
+ for (const auto &LHSVal : LHSVals)
+ if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
+ Result.emplace_back(InterestingVal, LHSVal.second);
+ LHSKnownBBs.insert(LHSVal.second);
+ }
+ for (const auto &RHSVal : RHSVals)
+ if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
+ // If we already inferred a value for this block on the LHS, don't
+ // re-add it.
+ if (!LHSKnownBBs.count(RHSVal.second))
+ Result.emplace_back(InterestingVal, RHSVal.second);
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle the NOT form of XOR.
+ if (I->getOpcode() == Instruction::Xor &&
+ isa<ConstantInt>(I->getOperand(1)) &&
+ cast<ConstantInt>(I->getOperand(1))->isOne()) {
computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result,
- WantInteger, RecursionSet, CxtI);
- if (Result.empty())
- return false;
-
- // Invert the known values.
- for (auto &R : Result)
- R.first = ConstantExpr::getNot(R.first);
-
- return true;
- }
-
- // Try to simplify some other binary operator values.
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- assert(Preference != WantBlockAddress
- && "A binary operator creating a block address?");
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- PredValueInfoTy LHSVals;
+ WantInteger, RecursionSet, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Invert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getNot(R.first);
+
+ return true;
+ }
+
+ // Try to simplify some other binary operator values.
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ assert(Preference != WantBlockAddress
+ && "A binary operator creating a block address?");
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ PredValueInfoTy LHSVals;
computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
- WantInteger, RecursionSet, CxtI);
-
- // Try to use constant folding to simplify the binary operator.
- for (const auto &LHSVal : LHSVals) {
- Constant *V = LHSVal.first;
- Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
-
- if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.emplace_back(KC, LHSVal.second);
- }
- }
-
- return !Result.empty();
- }
-
- // Handle compare with phi operand, where the PHI is defined in this block.
- if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
- assert(Preference == WantInteger && "Compares only produce integers");
- Type *CmpType = Cmp->getType();
- Value *CmpLHS = Cmp->getOperand(0);
- Value *CmpRHS = Cmp->getOperand(1);
- CmpInst::Predicate Pred = Cmp->getPredicate();
-
- PHINode *PN = dyn_cast<PHINode>(CmpLHS);
- if (!PN)
- PN = dyn_cast<PHINode>(CmpRHS);
- if (PN && PN->getParent() == BB) {
- const DataLayout &DL = PN->getModule()->getDataLayout();
- // We can do this simplification if any comparisons fold to true or false.
- // See if any do.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *PredBB = PN->getIncomingBlock(i);
- Value *LHS, *RHS;
- if (PN == CmpLHS) {
- LHS = PN->getIncomingValue(i);
- RHS = CmpRHS->DoPHITranslation(BB, PredBB);
- } else {
- LHS = CmpLHS->DoPHITranslation(BB, PredBB);
- RHS = PN->getIncomingValue(i);
- }
- Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
- if (!Res) {
- if (!isa<Constant>(RHS))
- continue;
-
- // The getPredicateOnEdge call makes no sense if LHS is defined in BB.
- auto LHSInst = dyn_cast<Instruction>(LHS);
- if (LHSInst && LHSInst->getParent() == BB)
- continue;
-
- LazyValueInfo::Tristate
- ResT = LVI->getPredicateOnEdge(Pred, LHS,
- cast<Constant>(RHS), PredBB, BB,
- CxtI ? CxtI : Cmp);
- if (ResT == LazyValueInfo::Unknown)
- continue;
- Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
- }
-
- if (Constant *KC = getKnownConstant(Res, WantInteger))
- Result.emplace_back(KC, PredBB);
- }
-
- return !Result.empty();
- }
-
- // If comparing a live-in value against a constant, see if we know the
- // live-in value on any predecessors.
- if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
- Constant *CmpConst = cast<Constant>(CmpRHS);
-
- if (!isa<Instruction>(CmpLHS) ||
- cast<Instruction>(CmpLHS)->getParent() != BB) {
- for (BasicBlock *P : predecessors(BB)) {
- // If the value is known by LazyValueInfo to be a constant in a
- // predecessor, use that information to try to thread this block.
- LazyValueInfo::Tristate Res =
- LVI->getPredicateOnEdge(Pred, CmpLHS,
- CmpConst, P, BB, CxtI ? CxtI : Cmp);
- if (Res == LazyValueInfo::Unknown)
- continue;
-
- Constant *ResC = ConstantInt::get(CmpType, Res);
- Result.emplace_back(ResC, P);
- }
-
- return !Result.empty();
- }
-
- // InstCombine can fold some forms of constant range checks into
- // (icmp (add x, C1), C2). See if we have such a thing with
- // x as a live-in.
- {
- using namespace PatternMatch;
-
- Value *AddLHS;
- ConstantInt *AddConst;
- if (isa<ConstantInt>(CmpConst) &&
- match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
- if (!isa<Instruction>(AddLHS) ||
- cast<Instruction>(AddLHS)->getParent() != BB) {
- for (BasicBlock *P : predecessors(BB)) {
- // If the value is known by LazyValueInfo to be a ConstantRange in
- // a predecessor, use that information to try to thread this
- // block.
- ConstantRange CR = LVI->getConstantRangeOnEdge(
- AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
- // Propagate the range through the addition.
- CR = CR.add(AddConst->getValue());
-
- // Get the range where the compare returns true.
- ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
- Pred, cast<ConstantInt>(CmpConst)->getValue());
-
- Constant *ResC;
- if (CmpRange.contains(CR))
- ResC = ConstantInt::getTrue(CmpType);
- else if (CmpRange.inverse().contains(CR))
- ResC = ConstantInt::getFalse(CmpType);
- else
- continue;
-
- Result.emplace_back(ResC, P);
- }
-
- return !Result.empty();
- }
- }
- }
-
- // Try to find a constant value for the LHS of a comparison,
- // and evaluate it statically if we can.
- PredValueInfoTy LHSVals;
+ WantInteger, RecursionSet, CxtI);
+
+ // Try to use constant folding to simplify the binary operator.
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
+
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.emplace_back(KC, LHSVal.second);
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle compare with phi operand, where the PHI is defined in this block.
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ assert(Preference == WantInteger && "Compares only produce integers");
+ Type *CmpType = Cmp->getType();
+ Value *CmpLHS = Cmp->getOperand(0);
+ Value *CmpRHS = Cmp->getOperand(1);
+ CmpInst::Predicate Pred = Cmp->getPredicate();
+
+ PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+ if (!PN)
+ PN = dyn_cast<PHINode>(CmpRHS);
+ if (PN && PN->getParent() == BB) {
+ const DataLayout &DL = PN->getModule()->getDataLayout();
+ // We can do this simplification if any comparisons fold to true or false.
+ // See if any do.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ Value *LHS, *RHS;
+ if (PN == CmpLHS) {
+ LHS = PN->getIncomingValue(i);
+ RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+ } else {
+ LHS = CmpLHS->DoPHITranslation(BB, PredBB);
+ RHS = PN->getIncomingValue(i);
+ }
+ Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
+ if (!Res) {
+ if (!isa<Constant>(RHS))
+ continue;
+
+ // The getPredicateOnEdge call makes no sense if LHS is defined in BB.
+ auto LHSInst = dyn_cast<Instruction>(LHS);
+ if (LHSInst && LHSInst->getParent() == BB)
+ continue;
+
+ LazyValueInfo::Tristate
+ ResT = LVI->getPredicateOnEdge(Pred, LHS,
+ cast<Constant>(RHS), PredBB, BB,
+ CxtI ? CxtI : Cmp);
+ if (ResT == LazyValueInfo::Unknown)
+ continue;
+ Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+ }
+
+ if (Constant *KC = getKnownConstant(Res, WantInteger))
+ Result.emplace_back(KC, PredBB);
+ }
+
+ return !Result.empty();
+ }
+
+ // If comparing a live-in value against a constant, see if we know the
+ // live-in value on any predecessors.
+ if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
+ Constant *CmpConst = cast<Constant>(CmpRHS);
+
+ if (!isa<Instruction>(CmpLHS) ||
+ cast<Instruction>(CmpLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ LazyValueInfo::Tristate Res =
+ LVI->getPredicateOnEdge(Pred, CmpLHS,
+ CmpConst, P, BB, CxtI ? CxtI : Cmp);
+ if (Res == LazyValueInfo::Unknown)
+ continue;
+
+ Constant *ResC = ConstantInt::get(CmpType, Res);
+ Result.emplace_back(ResC, P);
+ }
+
+ return !Result.empty();
+ }
+
+ // InstCombine can fold some forms of constant range checks into
+ // (icmp (add x, C1), C2). See if we have such a thing with
+ // x as a live-in.
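+ // Illustrative example (hypothetical numbers, not from the original code):
+ // if LVI reports that the live-in x has the range [0, 6) on the edge
+ // P -> BB, and the compare is (icmp ult (add x, 2), 8), then propagating
+ // the range through the add gives [2, 8). The "true" region of the compare
+ // is [0, 8), which contains [2, 8), so the compare folds to true for the
+ // predecessor P.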
+ {
+ using namespace PatternMatch;
+
+ Value *AddLHS;
+ ConstantInt *AddConst;
+ if (isa<ConstantInt>(CmpConst) &&
+ match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
+ if (!isa<Instruction>(AddLHS) ||
+ cast<Instruction>(AddLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a ConstantRange in
+ // a predecessor, use that information to try to thread this
+ // block.
+ ConstantRange CR = LVI->getConstantRangeOnEdge(
+ AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
+ // Propagate the range through the addition.
+ CR = CR.add(AddConst->getValue());
+
+ // Get the range where the compare returns true.
+ ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
+ Pred, cast<ConstantInt>(CmpConst)->getValue());
+
+ Constant *ResC;
+ if (CmpRange.contains(CR))
+ ResC = ConstantInt::getTrue(CmpType);
+ else if (CmpRange.inverse().contains(CR))
+ ResC = ConstantInt::getFalse(CmpType);
+ else
+ continue;
+
+ Result.emplace_back(ResC, P);
+ }
+
+ return !Result.empty();
+ }
+ }
+ }
+
+ // Try to find a constant value for the LHS of a comparison,
+ // and evaluate it statically if we can.
+ PredValueInfoTy LHSVals;
computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
- WantInteger, RecursionSet, CxtI);
-
- for (const auto &LHSVal : LHSVals) {
- Constant *V = LHSVal.first;
- Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
- if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.emplace_back(KC, LHSVal.second);
- }
-
- return !Result.empty();
- }
- }
-
- if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- // Handle select instructions where at least one operand is a known constant
- // and we can figure out the condition value for any predecessor block.
- Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
- Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
- PredValueInfoTy Conds;
- if ((TrueVal || FalseVal) &&
+ WantInteger, RecursionSet, CxtI);
+
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.emplace_back(KC, LHSVal.second);
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // Handle select instructions where at least one operand is a known constant
+ // and we can figure out the condition value for any predecessor block.
+ Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
+ Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
+ PredValueInfoTy Conds;
+ if ((TrueVal || FalseVal) &&
computeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds,
- WantInteger, RecursionSet, CxtI)) {
- for (auto &C : Conds) {
- Constant *Cond = C.first;
-
- // Figure out what value to use for the condition.
- bool KnownCond;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
- // A known boolean.
- KnownCond = CI->isOne();
- } else {
- assert(isa<UndefValue>(Cond) && "Unexpected condition value");
- // Either operand will do, so be sure to pick the one that's a known
- // constant.
- // FIXME: Do this more cleverly if both values are known constants?
- KnownCond = (TrueVal != nullptr);
- }
-
- // See if the select has a known constant value for this predecessor.
- if (Constant *Val = KnownCond ? TrueVal : FalseVal)
- Result.emplace_back(Val, C.second);
- }
-
- return !Result.empty();
- }
- }
-
- // If all else fails, see if LVI can figure out a constant value for us.
+ WantInteger, RecursionSet, CxtI)) {
+ for (auto &C : Conds) {
+ Constant *Cond = C.first;
+
+ // Figure out what value to use for the condition.
+ bool KnownCond;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
+ // A known boolean.
+ KnownCond = CI->isOne();
+ } else {
+ assert(isa<UndefValue>(Cond) && "Unexpected condition value");
+ // Either operand will do, so be sure to pick the one that's a known
+ // constant.
+ // FIXME: Do this more cleverly if both values are known constants?
+ KnownCond = (TrueVal != nullptr);
+ }
+
+ // See if the select has a known constant value for this predecessor.
+ if (Constant *Val = KnownCond ? TrueVal : FalseVal)
+ Result.emplace_back(Val, C.second);
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ // If all else fails, see if LVI can figure out a constant value for us.
assert(CxtI->getParent() == BB && "CxtI should be in BB");
Constant *CI = LVI->getConstant(V, CxtI);
- if (Constant *KC = getKnownConstant(CI, Preference)) {
- for (BasicBlock *Pred : predecessors(BB))
- Result.emplace_back(KC, Pred);
- }
-
- return !Result.empty();
-}
-
-/// getBestDestForJumpOnUndef - If we determine that the specified block ends
-/// in an undefined jump, decide which block is best to revector to.
-///
-/// Since we can pick an arbitrary destination, we pick the successor with the
-/// fewest predecessors. This should reduce the in-degree of the others.
+ if (Constant *KC = getKnownConstant(CI, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.emplace_back(KC, Pred);
+ }
+
+ return !Result.empty();
+}
+
+/// getBestDestForJumpOnUndef - If we determine that the specified block ends
+/// in an undefined jump, decide which block is best to revector to.
+///
+/// Since we can pick an arbitrary destination, we pick the successor with the
+/// fewest predecessors. This should reduce the in-degree of the others.
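+///
+/// For example (illustrative only): if the block ends in a switch on undef
+/// whose three successors have 3, 1 and 2 predecessors respectively, we
+/// return index 1, the successor with a single predecessor.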
static unsigned getBestDestForJumpOnUndef(BasicBlock *BB) {
- Instruction *BBTerm = BB->getTerminator();
- unsigned MinSucc = 0;
- BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
- // Compute the successor with the minimum number of predecessors.
- unsigned MinNumPreds = pred_size(TestBB);
- for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
- TestBB = BBTerm->getSuccessor(i);
- unsigned NumPreds = pred_size(TestBB);
- if (NumPreds < MinNumPreds) {
- MinSucc = i;
- MinNumPreds = NumPreds;
- }
- }
-
- return MinSucc;
-}
-
-static bool hasAddressTakenAndUsed(BasicBlock *BB) {
- if (!BB->hasAddressTaken()) return false;
-
- // If the block has its address taken, it may be a tree of dead constants
- // hanging off of it. These shouldn't keep the block alive.
- BlockAddress *BA = BlockAddress::get(BB);
- BA->removeDeadConstantUsers();
- return !BA->use_empty();
-}
-
+ Instruction *BBTerm = BB->getTerminator();
+ unsigned MinSucc = 0;
+ BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+ // Compute the successor with the minimum number of predecessors.
+ unsigned MinNumPreds = pred_size(TestBB);
+ for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ TestBB = BBTerm->getSuccessor(i);
+ unsigned NumPreds = pred_size(TestBB);
+ if (NumPreds < MinNumPreds) {
+ MinSucc = i;
+ MinNumPreds = NumPreds;
+ }
+ }
+
+ return MinSucc;
+}
+
+static bool hasAddressTakenAndUsed(BasicBlock *BB) {
+ if (!BB->hasAddressTaken()) return false;
+
+ // If the block has its address taken, it may be a tree of dead constants
+ // hanging off of it. These shouldn't keep the block alive.
+ BlockAddress *BA = BlockAddress::get(BB);
+ BA->removeDeadConstantUsers();
+ return !BA->use_empty();
+}
+
/// processBlock - If there are any predecessors whose control can be threaded
-/// through to a successor, transform them now.
+/// through to a successor, transform them now.
bool JumpThreadingPass::processBlock(BasicBlock *BB) {
- // If the block is trivially dead, just return and let the caller nuke it.
- // This simplifies other transformations.
- if (DTU->isBBPendingDeletion(BB) ||
- (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
- return false;
-
- // If this block has a single predecessor, and if that pred has a single
- // successor, merge the blocks. This encourages recursive jump threading
- // because now the condition in this block can be threaded through
- // predecessors of our predecessor block.
+ // If the block is trivially dead, just return and let the caller nuke it.
+ // This simplifies other transformations.
+ if (DTU->isBBPendingDeletion(BB) ||
+ (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
+ return false;
+
+ // If this block has a single predecessor, and if that pred has a single
+ // successor, merge the blocks. This encourages recursive jump threading
+ // because now the condition in this block can be threaded through
+ // predecessors of our predecessor block.
if (maybeMergeBasicBlockIntoOnlyPred(BB))
- return true;
-
+ return true;
+
if (tryToUnfoldSelectInCurrBB(BB))
- return true;
-
- // Look if we can propagate guards to predecessors.
+ return true;
+
+ // Look if we can propagate guards to predecessors.
if (HasGuards && processGuards(BB))
- return true;
-
- // What kind of constant we're looking for.
- ConstantPreference Preference = WantInteger;
-
- // Look to see if the terminator is a conditional branch, switch or indirect
- // branch; if not, we can't thread it.
- Value *Condition;
- Instruction *Terminator = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
- // Can't thread an unconditional jump.
- if (BI->isUnconditional()) return false;
- Condition = BI->getCondition();
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
- Condition = SI->getCondition();
- } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
- // Can't thread indirect branch with no successors.
- if (IB->getNumSuccessors() == 0) return false;
- Condition = IB->getAddress()->stripPointerCasts();
- Preference = WantBlockAddress;
- } else {
- return false; // Must be an invoke or callbr.
- }
-
+ return true;
+
+ // What kind of constant we're looking for.
+ ConstantPreference Preference = WantInteger;
+
+ // Look to see if the terminator is a conditional branch, switch or indirect
+ // branch; if not, we can't thread it.
+ Value *Condition;
+ Instruction *Terminator = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
+ // Can't thread an unconditional jump.
+ if (BI->isUnconditional()) return false;
+ Condition = BI->getCondition();
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
+ Condition = SI->getCondition();
+ } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+ // Can't thread indirect branch with no successors.
+ if (IB->getNumSuccessors() == 0) return false;
+ Condition = IB->getAddress()->stripPointerCasts();
+ Preference = WantBlockAddress;
+ } else {
+ return false; // Must be an invoke or callbr.
+ }
+
// Keep track if we constant folded the condition in this invocation.
bool ConstantFolded = false;
- // Run constant folding to see if we can reduce the condition to a simple
- // constant.
- if (Instruction *I = dyn_cast<Instruction>(Condition)) {
- Value *SimpleVal =
- ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
- if (SimpleVal) {
- I->replaceAllUsesWith(SimpleVal);
- if (isInstructionTriviallyDead(I, TLI))
- I->eraseFromParent();
- Condition = SimpleVal;
+ // Run constant folding to see if we can reduce the condition to a simple
+ // constant.
+ if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+ Value *SimpleVal =
+ ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
+ if (SimpleVal) {
+ I->replaceAllUsesWith(SimpleVal);
+ if (isInstructionTriviallyDead(I, TLI))
+ I->eraseFromParent();
+ Condition = SimpleVal;
ConstantFolded = true;
- }
- }
-
+ }
+ }
+
// If the terminator is branching on an undef or freeze undef, we can pick any
// of the successors to branch to. Let getBestDestForJumpOnUndef decide.
auto *FI = dyn_cast<FreezeInst>(Condition);
if (isa<UndefValue>(Condition) ||
(FI && isa<UndefValue>(FI->getOperand(0)) && FI->hasOneUse())) {
unsigned BestSucc = getBestDestForJumpOnUndef(BB);
- std::vector<DominatorTree::UpdateType> Updates;
-
- // Fold the branch/switch.
- Instruction *BBTerm = BB->getTerminator();
- Updates.reserve(BBTerm->getNumSuccessors());
- for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
- if (i == BestSucc) continue;
- BasicBlock *Succ = BBTerm->getSuccessor(i);
- Succ->removePredecessor(BB, true);
- Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
-
- LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding undef terminator: " << *BBTerm << '\n');
- BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
- BBTerm->eraseFromParent();
- DTU->applyUpdatesPermissive(Updates);
+ std::vector<DominatorTree::UpdateType> Updates;
+
+ // Fold the branch/switch.
+ Instruction *BBTerm = BB->getTerminator();
+ Updates.reserve(BBTerm->getNumSuccessors());
+ for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ if (i == BestSucc) continue;
+ BasicBlock *Succ = BBTerm->getSuccessor(i);
+ Succ->removePredecessor(BB, true);
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ }
+
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding undef terminator: " << *BBTerm << '\n');
+ BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
+ BBTerm->eraseFromParent();
+ DTU->applyUpdatesPermissive(Updates);
if (FI)
FI->eraseFromParent();
- return true;
- }
-
- // If the terminator of this block is branching on a constant, simplify the
- // terminator to an unconditional branch. This can occur due to threading in
- // other blocks.
- if (getKnownConstant(Condition, Preference)) {
- LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding terminator: " << *BB->getTerminator()
- << '\n');
- ++NumFolds;
- ConstantFoldTerminator(BB, true, nullptr, DTU);
+ return true;
+ }
+
+ // If the terminator of this block is branching on a constant, simplify the
+ // terminator to an unconditional branch. This can occur due to threading in
+ // other blocks.
+ if (getKnownConstant(Condition, Preference)) {
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding terminator: " << *BB->getTerminator()
+ << '\n');
+ ++NumFolds;
+ ConstantFoldTerminator(BB, true, nullptr, DTU);
if (HasProfileData)
BPI->eraseBlock(BB);
- return true;
- }
-
- Instruction *CondInst = dyn_cast<Instruction>(Condition);
-
- // All the rest of our checks depend on the condition being an instruction.
- if (!CondInst) {
- // FIXME: Unify this with code below.
+ return true;
+ }
+
+ Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+ // All the rest of our checks depend on the condition being an instruction.
+ if (!CondInst) {
+ // FIXME: Unify this with code below.
if (processThreadableEdges(Condition, BB, Preference, Terminator))
- return true;
+ return true;
return ConstantFolded;
- }
-
- if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
- // If we're branching on a conditional, LVI might be able to determine
- // its value at the branch instruction. We only handle comparisons
- // against a constant at this time.
- // TODO: This should be extended to handle switches as well.
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
- if (CondBr && CondConst) {
- // We should have returned as soon as we turned a conditional branch into
- // an unconditional one, because it's no longer interesting as far as jump
- // threading is concerned.
- assert(CondBr->isConditional() && "Threading on unconditional terminator");
-
- LazyValueInfo::Tristate Ret =
- LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
- CondConst, CondBr);
- if (Ret != LazyValueInfo::Unknown) {
- unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
- unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
- BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
- ToRemoveSucc->removePredecessor(BB, true);
- BranchInst *UncondBr =
- BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
- UncondBr->setDebugLoc(CondBr->getDebugLoc());
- CondBr->eraseFromParent();
- if (CondCmp->use_empty())
- CondCmp->eraseFromParent();
- // We can safely replace *some* uses of the CondInst if it has
- // exactly one value as returned by LVI. RAUW is incorrect in the
- // presence of guards and assumes that have `Cond` as a use. This
- // is because we use the guards/assumes to reason about the `Cond` value
- // at the end of the block, but RAUW unconditionally replaces all uses,
- // including the guards/assumes themselves and the uses before the
- // guard/assume.
- else if (CondCmp->getParent() == BB) {
- auto *CI = Ret == LazyValueInfo::True ?
- ConstantInt::getTrue(CondCmp->getType()) :
- ConstantInt::getFalse(CondCmp->getType());
+ }
+
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
+ // If we're branching on a conditional, LVI might be able to determine
+ // its value at the branch instruction. We only handle comparisons
+ // against a constant at this time.
+ // TODO: This should be extended to handle switches as well.
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch into
+ // an unconditional one, because it's no longer interesting as far as jump
+ // threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
+ LazyValueInfo::Tristate Ret =
+ LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+ CondConst, CondBr);
+ if (Ret != LazyValueInfo::Unknown) {
+ unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+ unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+ BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
+ ToRemoveSucc->removePredecessor(BB, true);
+ BranchInst *UncondBr =
+ BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+ UncondBr->setDebugLoc(CondBr->getDebugLoc());
+ CondBr->eraseFromParent();
+ if (CondCmp->use_empty())
+ CondCmp->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes that have `Cond` as a use. This
+ // is because we use the guards/assumes to reason about the `Cond` value
+ // at the end of the block, but RAUW unconditionally replaces all uses,
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (CondCmp->getParent() == BB) {
+ auto *CI = Ret == LazyValueInfo::True ?
+ ConstantInt::getTrue(CondCmp->getType()) :
+ ConstantInt::getFalse(CondCmp->getType());
replaceFoldableUses(CondCmp, CI);
- }
- DTU->applyUpdatesPermissive(
- {{DominatorTree::Delete, BB, ToRemoveSucc}});
+ }
+ DTU->applyUpdatesPermissive(
+ {{DominatorTree::Delete, BB, ToRemoveSucc}});
if (HasProfileData)
BPI->eraseBlock(BB);
- return true;
- }
-
- // We did not manage to simplify this branch; try to see whether
- // CondCmp depends on a known phi-select pattern.
+ return true;
+ }
+
+ // We did not manage to simplify this branch; try to see whether
+ // CondCmp depends on a known phi-select pattern.
if (tryToUnfoldSelect(CondCmp, BB))
- return true;
- }
- }
-
- if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
+ return true;
+ }
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
if (tryToUnfoldSelect(SI, BB))
- return true;
-
- // Check for some cases that are worth simplifying. Right now we want to look
- // for loads that are used by a switch or by the condition for the branch. If
- // we see one, check to see if it's partially redundant. If so, insert a PHI
- // which can then be used to thread the values.
- Value *SimplifyValue = CondInst;
+ return true;
+
+ // Check for some cases that are worth simplifying. Right now we want to look
+ // for loads that are used by a switch or by the condition for the branch. If
+ // we see one, check to see if it's partially redundant. If so, insert a PHI
+ // which can then be used to thread the values.
+ Value *SimplifyValue = CondInst;
if (auto *FI = dyn_cast<FreezeInst>(SimplifyValue))
// Look into freeze's operand
SimplifyValue = FI->getOperand(0);
- if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
- if (isa<Constant>(CondCmp->getOperand(1)))
- SimplifyValue = CondCmp->getOperand(0);
-
- // TODO: There are other places where load PRE would be profitable, such as
- // more complex comparisons.
- if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+ if (isa<Constant>(CondCmp->getOperand(1)))
+ SimplifyValue = CondCmp->getOperand(0);
+
+ // TODO: There are other places where load PRE would be profitable, such as
+ // more complex comparisons.
+ if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
if (simplifyPartiallyRedundantLoad(LoadI))
- return true;
-
- // Before threading, try to propagate profile data backwards:
- if (PHINode *PN = dyn_cast<PHINode>(CondInst))
- if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
- updatePredecessorProfileMetadata(PN, BB);
-
- // Handle a variety of cases where we are branching on something derived from
- // a PHI node in the current block. If we can prove that any predecessors
- // compute a predictable value based on a PHI node, thread those predecessors.
+ return true;
+
+ // Before threading, try to propagate profile data backwards:
+ if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+ if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ updatePredecessorProfileMetadata(PN, BB);
+
+ // Handle a variety of cases where we are branching on something derived from
+ // a PHI node in the current block. If we can prove that any predecessors
+ // compute a predictable value based on a PHI node, thread those predecessors.
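+ // Illustrative sketch (hypothetical IR, not part of this change):
+ //   bb:   %cond = phi i1 [ true, %pred1 ], [ %c, %pred2 ]
+ //         br i1 %cond, label %then, label %else
+ // On the edge pred1 -> bb the condition is known to be true, so pred1 can
+ // be rewired to branch straight to %then (duplicating bb's non-terminator
+ // instructions into pred1 if necessary).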
if (processThreadableEdges(CondInst, BB, Preference, Terminator))
- return true;
-
+ return true;
+
// If this is an otherwise-unfoldable branch on a phi node or freeze(phi) in
// the current block, see if we can simplify.
PHINode *PN = dyn_cast<PHINode>(
isa<FreezeInst>(CondInst) ? cast<FreezeInst>(CondInst)->getOperand(0)
: CondInst);
-
+
if (PN && PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return processBranchOnPHI(PN);
- // If this is an otherwise-unfoldable branch on an XOR, see if we can simplify.
- if (CondInst->getOpcode() == Instruction::Xor &&
- CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ // If this is an otherwise-unfoldable branch on an XOR, see if we can simplify.
+ if (CondInst->getOpcode() == Instruction::Xor &&
+ CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return processBranchOnXOR(cast<BinaryOperator>(CondInst));
-
- // Search for a stronger dominating condition that can be used to simplify a
- // conditional branch leaving BB.
+
+ // Search for a stronger dominating condition that can be used to simplify a
+ // conditional branch leaving BB.
if (processImpliedCondition(BB))
- return true;
-
- return false;
-}
-
+ return true;
+
+ return false;
+}
+
bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) {
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional())
- return false;
-
- Value *Cond = BI->getCondition();
- BasicBlock *CurrentBB = BB;
- BasicBlock *CurrentPred = BB->getSinglePredecessor();
- unsigned Iter = 0;
-
- auto &DL = BB->getModule()->getDataLayout();
-
- while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
- auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
- if (!PBI || !PBI->isConditional())
- return false;
- if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
- return false;
-
- bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
- Optional<bool> Implication =
- isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
- if (Implication) {
- BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
- BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
- RemoveSucc->removePredecessor(BB);
- BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI);
- UncondBI->setDebugLoc(BI->getDebugLoc());
- BI->eraseFromParent();
- DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}});
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ Value *Cond = BI->getCondition();
+ BasicBlock *CurrentBB = BB;
+ BasicBlock *CurrentPred = BB->getSinglePredecessor();
+ unsigned Iter = 0;
+
+ auto &DL = BB->getModule()->getDataLayout();
+
+ while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
+ auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
+ if (!PBI || !PBI->isConditional())
+ return false;
+ if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
+ return false;
+
+ bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
+ Optional<bool> Implication =
+ isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
+ if (Implication) {
+ BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
+ BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
+ RemoveSucc->removePredecessor(BB);
+ BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI);
+ UncondBI->setDebugLoc(BI->getDebugLoc());
+ BI->eraseFromParent();
+ DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}});
if (HasProfileData)
BPI->eraseBlock(BB);
- return true;
- }
- CurrentBB = CurrentPred;
- CurrentPred = CurrentBB->getSinglePredecessor();
- }
-
- return false;
-}
-
-/// Return true if Op is an instruction defined in the given block.
-static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
- if (Instruction *OpInst = dyn_cast<Instruction>(Op))
- if (OpInst->getParent() == BB)
- return true;
- return false;
-}
-
+ return true;
+ }
+ CurrentBB = CurrentPred;
+ CurrentPred = CurrentBB->getSinglePredecessor();
+ }
+
+ return false;
+}
+
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
/// simplifyPartiallyRedundantLoad - If LoadI is an obviously partially
-/// redundant load instruction, eliminate it by replacing it with a PHI node.
-/// This is an important optimization that encourages jump threading, and needs
-/// to be run interlaced with other jump threading tasks.
+/// redundant load instruction, eliminate it by replacing it with a PHI node.
+/// This is an important optimization that encourages jump threading, and needs
+/// to be run interlaced with other jump threading tasks.
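+///
+/// Illustrative sketch (hypothetical IR): if predecessor %p1 of %bb already
+/// contains "%v1 = load i32, i32* %ptr" but predecessor %p2 does not, the
+/// load in %bb is partially redundant; we insert a load "%v2" at the end of
+/// %p2 and replace the load in %bb with "phi i32 [ %v1, %p1 ], [ %v2, %p2 ]".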
bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
- // Don't hack volatile and ordered loads.
- if (!LoadI->isUnordered()) return false;
-
- // If the load is defined in a block with exactly one predecessor, it can't be
- // partially redundant.
- BasicBlock *LoadBB = LoadI->getParent();
- if (LoadBB->getSinglePredecessor())
- return false;
-
- // If the load is defined in an EH pad, it can't be partially redundant,
- // because the edges between the invoke and the EH pad cannot have other
- // instructions between them.
- if (LoadBB->isEHPad())
- return false;
-
- Value *LoadedPtr = LoadI->getOperand(0);
-
- // If the loaded operand is defined in the LoadBB and it's not a phi,
- // it can't be available in predecessors.
- if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
- return false;
-
- // Scan a few instructions up from the load, to see if it is obviously live at
- // the entry to its block.
- BasicBlock::iterator BBIt(LoadI);
- bool IsLoadCSE;
- if (Value *AvailableVal = FindAvailableLoadedValue(
- LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
- // If the value of the load is locally available within the block, just use
- // it. This frequently occurs for reg2mem'd allocas.
-
- if (IsLoadCSE) {
- LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
- combineMetadataForCSE(NLoadI, LoadI, false);
- }
-
- // If the returned value is the load itself, replace with an undef. This can
- // only happen in dead loops.
- if (AvailableVal == LoadI)
- AvailableVal = UndefValue::get(LoadI->getType());
- if (AvailableVal->getType() != LoadI->getType())
- AvailableVal = CastInst::CreateBitOrPointerCast(
- AvailableVal, LoadI->getType(), "", LoadI);
- LoadI->replaceAllUsesWith(AvailableVal);
- LoadI->eraseFromParent();
- return true;
- }
-
- // Otherwise, if we scanned the whole block and got to the top of the block,
- // we know the block is locally transparent to the load. If not, something
- // might clobber its value.
- if (BBIt != LoadBB->begin())
- return false;
-
- // If all of the loads and stores that feed the value have the same AA tags,
- // then we can propagate them onto any newly inserted loads.
- AAMDNodes AATags;
- LoadI->getAAMetadata(AATags);
-
- SmallPtrSet<BasicBlock*, 8> PredsScanned;
-
- using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
-
- AvailablePredsTy AvailablePreds;
- BasicBlock *OneUnavailablePred = nullptr;
- SmallVector<LoadInst*, 8> CSELoads;
-
- // If we got here, the loaded value is transparent through to the start of the
- // block. Check to see if it is available in any of the predecessor blocks.
- for (BasicBlock *PredBB : predecessors(LoadBB)) {
- // If we already scanned this predecessor, skip it.
- if (!PredsScanned.insert(PredBB).second)
- continue;
-
- BBIt = PredBB->end();
- unsigned NumScanedInst = 0;
- Value *PredAvailable = nullptr;
- // NOTE: We don't CSE a load that is volatile or anything stronger than
- // unordered; that should have been checked when we entered the function.
- assert(LoadI->isUnordered() &&
- "Attempting to CSE volatile or atomic loads");
- // If this is a load on a phi pointer, phi-translate it and search
- // for available load/store to the pointer in predecessors.
- Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
- PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
- DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
-
- // If PredBB has a single predecessor, continue scanning through the
- // single predecessor.
- BasicBlock *SinglePredBB = PredBB;
- while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
- NumScanedInst < DefMaxInstsToScan) {
- SinglePredBB = SinglePredBB->getSinglePredecessor();
- if (SinglePredBB) {
- BBIt = SinglePredBB->end();
- PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
- (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
- &NumScanedInst);
- }
- }
-
- if (!PredAvailable) {
- OneUnavailablePred = PredBB;
- continue;
- }
-
- if (IsLoadCSE)
- CSELoads.push_back(cast<LoadInst>(PredAvailable));
-
- // If so, this load is partially redundant. Remember this info so that we
- // can create a PHI node.
- AvailablePreds.emplace_back(PredBB, PredAvailable);
- }
-
- // If the loaded value isn't available in any predecessor, it isn't partially
- // redundant.
- if (AvailablePreds.empty()) return false;
-
- // Okay, the loaded value is available in at least one (and maybe all!)
- // predecessors. If the value is unavailable in more than one unique
- // predecessor, we want to insert a merge block for those common predecessors.
- // This ensures that we only have to insert one reload, thus not increasing
- // code size.
- BasicBlock *UnavailablePred = nullptr;
-
- // If the value is unavailable in any of the predecessors, we will end up
- // inserting a new instruction into them. It is only valid if all the
- // instructions before LoadI are guaranteed to pass execution to its
- // successor, or if LoadI is safe to speculate.
- // TODO: If this logic becomes more complex, and we will perform PRE insertion
- // farther than to a predecessor, we need to reuse the code from GVN's PRE.
- // It requires dominator tree analysis, so for this simple case it would be
- // overkill.
- if (PredsScanned.size() != AvailablePreds.size() &&
- !isSafeToSpeculativelyExecute(LoadI))
- for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
- return false;
-
- // If there is exactly one predecessor where the value is unavailable, the
- // already computed 'OneUnavailablePred' block is it. If it ends in an
- // unconditional branch, we know that it isn't a critical edge.
- if (PredsScanned.size() == AvailablePreds.size()+1 &&
- OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
- UnavailablePred = OneUnavailablePred;
- } else if (PredsScanned.size() != AvailablePreds.size()) {
- // Otherwise, we had multiple unavailable predecessors or we had a critical
- // edge from the one.
- SmallVector<BasicBlock*, 8> PredsToSplit;
- SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
-
- for (const auto &AvailablePred : AvailablePreds)
- AvailablePredSet.insert(AvailablePred.first);
-
- // Add all the unavailable predecessors to the PredsToSplit list.
- for (BasicBlock *P : predecessors(LoadBB)) {
- // If the predecessor is an indirect goto, we can't split the edge.
- // Same for CallBr.
- if (isa<IndirectBrInst>(P->getTerminator()) ||
- isa<CallBrInst>(P->getTerminator()))
- return false;
-
- if (!AvailablePredSet.count(P))
- PredsToSplit.push_back(P);
- }
-
- // Split them out to their own block.
+ // Don't hack volatile and ordered loads.
+ if (!LoadI->isUnordered()) return false;
+
+ // If the load is defined in a block with exactly one predecessor, it can't be
+ // partially redundant.
+ BasicBlock *LoadBB = LoadI->getParent();
+ if (LoadBB->getSinglePredecessor())
+ return false;
+
+ // If the load is defined in an EH pad, it can't be partially redundant,
+ // because the edges between the invoke and the EH pad cannot have other
+ // instructions between them.
+ if (LoadBB->isEHPad())
+ return false;
+
+ Value *LoadedPtr = LoadI->getOperand(0);
+
+ // If the loaded operand is defined in the LoadBB and it's not a phi,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
+
+ // Scan a few instructions up from the load, to see if it is obviously live at
+ // the entry to its block.
+ BasicBlock::iterator BBIt(LoadI);
+ bool IsLoadCSE;
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ // If the value of the load is locally available within the block, just use
+ // it. This frequently occurs for reg2mem'd allocas.
+
+ if (IsLoadCSE) {
+ LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLoadI, LoadI, false);
+ }
+
+ // If the returned value is the load itself, replace with an undef. This can
+ // only happen in dead loops.
+ if (AvailableVal == LoadI)
+ AvailableVal = UndefValue::get(LoadI->getType());
+ if (AvailableVal->getType() != LoadI->getType())
+ AvailableVal = CastInst::CreateBitOrPointerCast(
+ AvailableVal, LoadI->getType(), "", LoadI);
+ LoadI->replaceAllUsesWith(AvailableVal);
+ LoadI->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, if we scanned the whole block and got to the top of the block,
+ // we know the block is locally transparent to the load. If not, something
+ // might clobber its value.
+ if (BBIt != LoadBB->begin())
+ return false;
+
+ // If all of the loads and stores that feed the value have the same AA tags,
+ // then we can propagate them onto any newly inserted loads.
+ AAMDNodes AATags;
+ LoadI->getAAMetadata(AATags);
+
+ SmallPtrSet<BasicBlock*, 8> PredsScanned;
+
+ using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
+
+ AvailablePredsTy AvailablePreds;
+ BasicBlock *OneUnavailablePred = nullptr;
+ SmallVector<LoadInst*, 8> CSELoads;
+
+ // If we got here, the loaded value is transparent through to the start of the
+ // block. Check to see if it is available in any of the predecessor blocks.
+ for (BasicBlock *PredBB : predecessors(LoadBB)) {
+ // If we already scanned this predecessor, skip it.
+ if (!PredsScanned.insert(PredBB).second)
+ continue;
+
+ BBIt = PredBB->end();
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE a load that is volatile or anything stronger than
+ // unordered; that should have been checked when we entered the function.
+ assert(LoadI->isUnordered() &&
+ "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for available load/store to the pointer in predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
+ DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
+ if (!PredAvailable) {
+ OneUnavailablePred = PredBB;
+ continue;
+ }
+
+ if (IsLoadCSE)
+ CSELoads.push_back(cast<LoadInst>(PredAvailable));
+
+ // If so, this load is partially redundant. Remember this info so that we
+ // can create a PHI node.
+ AvailablePreds.emplace_back(PredBB, PredAvailable);
+ }
+
+ // If the loaded value isn't available in any predecessor, it isn't partially
+ // redundant.
+ if (AvailablePreds.empty()) return false;
+
+ // Okay, the loaded value is available in at least one (and maybe all!)
+ // predecessors. If the value is unavailable in more than one unique
+ // predecessor, we want to insert a merge block for those common predecessors.
+ // This ensures that we only have to insert one reload, thus not increasing
+ // code size.
+ BasicBlock *UnavailablePred = nullptr;
+
+ // If the value is unavailable in any of the predecessors, we will end up
+ // inserting a new instruction into them. It is only valid if all the
+ // instructions before LoadI are guaranteed to pass execution to its
+ // successor, or if LoadI is safe to speculate.
+ // TODO: If this logic becomes more complex, and we will perform PRE insertion
+ // farther than to a predecessor, we need to reuse the code from GVN's PRE.
+ // It requires dominator tree analysis, so for this simple case it would be
+ // overkill.
+ if (PredsScanned.size() != AvailablePreds.size() &&
+ !isSafeToSpeculativelyExecute(LoadI))
+ for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
+ return false;
+
+ // If there is exactly one predecessor where the value is unavailable, the
+ // already computed 'OneUnavailablePred' block is it. If it ends in an
+ // unconditional branch, we know that it isn't a critical edge.
+ if (PredsScanned.size() == AvailablePreds.size()+1 &&
+ OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+ UnavailablePred = OneUnavailablePred;
+ } else if (PredsScanned.size() != AvailablePreds.size()) {
+ // Otherwise, we had multiple unavailable predecessors or we had a critical
+ // edge from the one.
+ SmallVector<BasicBlock*, 8> PredsToSplit;
+ SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+ for (const auto &AvailablePred : AvailablePreds)
+ AvailablePredSet.insert(AvailablePred.first);
+
+ // Add all the unavailable predecessors to the PredsToSplit list.
+ for (BasicBlock *P : predecessors(LoadBB)) {
+ // If the predecessor is an indirect goto, we can't split the edge.
+ // Same for CallBr.
+ if (isa<IndirectBrInst>(P->getTerminator()) ||
+ isa<CallBrInst>(P->getTerminator()))
+ return false;
+
+ if (!AvailablePredSet.count(P))
+ PredsToSplit.push_back(P);
+ }
+
+ // Split them out to their own block.
UnavailablePred = splitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
- }
-
- // If the value isn't available in all predecessors, then there will be
- // exactly one where it isn't available. Insert a load on that edge and add
- // it to the AvailablePreds list.
- if (UnavailablePred) {
- assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
- "Can't handle critical edge here!");
- LoadInst *NewVal = new LoadInst(
- LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
- LoadI->getName() + ".pr", false, LoadI->getAlign(),
- LoadI->getOrdering(), LoadI->getSyncScopeID(),
- UnavailablePred->getTerminator());
- NewVal->setDebugLoc(LoadI->getDebugLoc());
- if (AATags)
- NewVal->setAAMetadata(AATags);
-
- AvailablePreds.emplace_back(UnavailablePred, NewVal);
- }
-
- // Now we know that each predecessor of this block has a value in
- // AvailablePreds, sort them for efficient access as we're walking the preds.
- array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
-
- // Create a PHI node at the start of the block for the PRE'd load value.
- pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
- PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
- &LoadBB->front());
- PN->takeName(LoadI);
- PN->setDebugLoc(LoadI->getDebugLoc());
-
- // Insert new entries into the PHI for each predecessor. A single block may
- // have multiple entries here.
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- AvailablePredsTy::iterator I =
- llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr));
-
- assert(I != AvailablePreds.end() && I->first == P &&
- "Didn't find entry for predecessor!");
-
- // If we have an available predecessor but it requires casting, insert the
- // cast in the predecessor and use the cast. Note that we have to update the
- // AvailablePreds vector as we go so that all of the PHI entries for this
- // predecessor use the same bitcast.
- Value *&PredV = I->second;
- if (PredV->getType() != LoadI->getType())
- PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
- P->getTerminator());
-
- PN->addIncoming(PredV, I->first);
- }
-
- for (LoadInst *PredLoadI : CSELoads) {
- combineMetadataForCSE(PredLoadI, LoadI, true);
- }
-
- LoadI->replaceAllUsesWith(PN);
- LoadI->eraseFromParent();
-
- return true;
-}
-
+ }
+
+ // If the value isn't available in all predecessors, then there will be
+ // exactly one where it isn't available. Insert a load on that edge and add
+ // it to the AvailablePreds list.
+ if (UnavailablePred) {
+ assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+ "Can't handle critical edge here!");
+ LoadInst *NewVal = new LoadInst(
+ LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LoadI->getName() + ".pr", false, LoadI->getAlign(),
+ LoadI->getOrdering(), LoadI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LoadI->getDebugLoc());
+ if (AATags)
+ NewVal->setAAMetadata(AATags);
+
+ AvailablePreds.emplace_back(UnavailablePred, NewVal);
+ }
+
+ // Now we know that each predecessor of this block has a value in
+ // AvailablePreds, sort them for efficient access as we're walking the preds.
+ array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+ // Create a PHI node at the start of the block for the PRE'd load value.
+ pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
+ PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
+ &LoadBB->front());
+ PN->takeName(LoadI);
+ PN->setDebugLoc(LoadI->getDebugLoc());
+
+ // Insert new entries into the PHI for each predecessor. A single block may
+ // have multiple entries here.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ AvailablePredsTy::iterator I =
+ llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr));
+
+ assert(I != AvailablePreds.end() && I->first == P &&
+ "Didn't find entry for predecessor!");
+
+ // If we have an available predecessor but it requires casting, insert the
+ // cast in the predecessor and use the cast. Note that we have to update the
+ // AvailablePreds vector as we go so that all of the PHI entries for this
+ // predecessor use the same bitcast.
+ Value *&PredV = I->second;
+ if (PredV->getType() != LoadI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
+ P->getTerminator());
+
+ PN->addIncoming(PredV, I->first);
+ }
+
+ for (LoadInst *PredLoadI : CSELoads) {
+ combineMetadataForCSE(PredLoadI, LoadI, true);
+ }
+
+ LoadI->replaceAllUsesWith(PN);
+ LoadI->eraseFromParent();
+
+ return true;
+}
+
/// findMostPopularDest - The specified list contains multiple possible
-/// threadable destinations. Pick the one that occurs the most frequently in
-/// the list.
-static BasicBlock *
+/// threadable destinations. Pick the one that occurs the most frequently in
+/// the list.
+static BasicBlock *
findMostPopularDest(BasicBlock *BB,
- const SmallVectorImpl<std::pair<BasicBlock *,
- BasicBlock *>> &PredToDestList) {
- assert(!PredToDestList.empty());
-
- // Determine popularity. If there are multiple possible destinations, we
- // explicitly choose to ignore 'undef' destinations. We prefer to thread
- // blocks with known and real destinations to threading undef. We'll handle
- // them later if interesting.
- MapVector<BasicBlock *, unsigned> DestPopularity;
-
- // Populate DestPopularity with the successors in the order they appear in the
- // successor list. This way, we ensure determinism by iterating it in the
- // same order in std::max_element below. We map nullptr to 0 so that we can
- // return nullptr when PredToDestList contains nullptr only.
- DestPopularity[nullptr] = 0;
- for (auto *SuccBB : successors(BB))
- DestPopularity[SuccBB] = 0;
-
- for (const auto &PredToDest : PredToDestList)
- if (PredToDest.second)
- DestPopularity[PredToDest.second]++;
-
- // Find the most popular dest.
- using VT = decltype(DestPopularity)::value_type;
- auto MostPopular = std::max_element(
- DestPopularity.begin(), DestPopularity.end(),
- [](const VT &L, const VT &R) { return L.second < R.second; });
-
- // Okay, we have finally picked the most popular destination.
- return MostPopular->first;
-}
-
-// Try to evaluate the value of V when the control flows from PredPredBB to
-// BB->getSinglePredecessor() and then on to BB.
+ const SmallVectorImpl<std::pair<BasicBlock *,
+ BasicBlock *>> &PredToDestList) {
+ assert(!PredToDestList.empty());
+
+ // Determine popularity. If there are multiple possible destinations, we
+ // explicitly choose to ignore 'undef' destinations. We prefer to thread
+ // blocks with known and real destinations to threading undef. We'll handle
+ // them later if interesting.
+ MapVector<BasicBlock *, unsigned> DestPopularity;
+
+ // Populate DestPopularity with the successors in the order they appear in the
+ // successor list. This way, we ensure determinism by iterating it in the
+ // same order in std::max_element below. We map nullptr to 0 so that we can
+ // return nullptr when PredToDestList contains nullptr only.
+ DestPopularity[nullptr] = 0;
+ for (auto *SuccBB : successors(BB))
+ DestPopularity[SuccBB] = 0;
+
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second)
+ DestPopularity[PredToDest.second]++;
+
+ // Find the most popular dest.
+ using VT = decltype(DestPopularity)::value_type;
+ auto MostPopular = std::max_element(
+ DestPopularity.begin(), DestPopularity.end(),
+ [](const VT &L, const VT &R) { return L.second < R.second; });
+
+ // Okay, we have finally picked the most popular destination.
+ return MostPopular->first;
+}
+
+// Try to evaluate the value of V when the control flows from PredPredBB to
+// BB->getSinglePredecessor() and then on to BB.
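+// For example (illustrative only): given the chain PredPredBB -> PredBB -> BB,
+// a phi in PredBB tells us which value V takes when control enters from
+// PredPredBB, and a compare defined in BB can then be folded separately for
+// that incoming edge.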
Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB,
- BasicBlock *PredPredBB,
- Value *V) {
- BasicBlock *PredBB = BB->getSinglePredecessor();
- assert(PredBB && "Expected a single predecessor");
-
- if (Constant *Cst = dyn_cast<Constant>(V)) {
- return Cst;
- }
-
- // Consult LVI if V is not an instruction in BB or PredBB.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || (I->getParent() != BB && I->getParent() != PredBB)) {
- return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr);
- }
-
- // Look into a PHI argument.
- if (PHINode *PHI = dyn_cast<PHINode>(V)) {
- if (PHI->getParent() == PredBB)
- return dyn_cast<Constant>(PHI->getIncomingValueForBlock(PredPredBB));
- return nullptr;
- }
-
- // If we have a CmpInst, try to fold it for each incoming edge into PredBB.
- if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
- if (CondCmp->getParent() == BB) {
- Constant *Op0 =
+ BasicBlock *PredPredBB,
+ Value *V) {
+ BasicBlock *PredBB = BB->getSinglePredecessor();
+ assert(PredBB && "Expected a single predecessor");
+
+ if (Constant *Cst = dyn_cast<Constant>(V)) {
+ return Cst;
+ }
+
+ // Consult LVI if V is not an instruction in BB or PredBB.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || (I->getParent() != BB && I->getParent() != PredBB)) {
+ return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr);
+ }
+
+ // Look into a PHI argument.
+ if (PHINode *PHI = dyn_cast<PHINode>(V)) {
+ if (PHI->getParent() == PredBB)
+ return dyn_cast<Constant>(PHI->getIncomingValueForBlock(PredPredBB));
+ return nullptr;
+ }
+
+ // If we have a CmpInst, try to fold it for each incoming edge into PredBB.
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
+ if (CondCmp->getParent() == BB) {
+ Constant *Op0 =
evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
- Constant *Op1 =
+ Constant *Op1 =
evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
- if (Op0 && Op1) {
- return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
- }
- }
- return nullptr;
- }
-
- return nullptr;
-}
-
+ if (Op0 && Op1) {
+ return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
+ }
+ }
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB,
- ConstantPreference Preference,
- Instruction *CxtI) {
- // If threading this would thread across a loop header, don't even try to
- // thread the edge.
- if (LoopHeaders.count(BB))
- return false;
-
- PredValueInfoTy PredValues;
+ ConstantPreference Preference,
+ Instruction *CxtI) {
+ // If threading this would thread across a loop header, don't even try to
+ // thread the edge.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ PredValueInfoTy PredValues;
if (!computeValueKnownInPredecessors(Cond, BB, PredValues, Preference,
- CxtI)) {
- // We don't have known values in predecessors. See if we can thread through
- // BB and its sole predecessor.
+ CxtI)) {
+ // We don't have known values in predecessors. See if we can thread through
+ // BB and its sole predecessor.
return maybethreadThroughTwoBasicBlocks(BB, Cond);
- }
-
- assert(!PredValues.empty() &&
+ }
+
+ assert(!PredValues.empty() &&
"computeValueKnownInPredecessors returned true with no values");
-
- LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
- for (const auto &PredValue : PredValues) {
- dbgs() << " BB '" << BB->getName()
- << "': FOUND condition = " << *PredValue.first
- << " for pred '" << PredValue.second->getName() << "'.\n";
- });
-
- // Decide what we want to thread through. Convert our list of known values to
- // a list of known destinations for each pred. This also discards duplicate
- // predecessors and keeps track of the undefined inputs (which are represented
- // as a null dest in the PredToDestList).
- SmallPtrSet<BasicBlock*, 16> SeenPreds;
- SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
-
- BasicBlock *OnlyDest = nullptr;
- BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
- Constant *OnlyVal = nullptr;
- Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
-
- for (const auto &PredValue : PredValues) {
- BasicBlock *Pred = PredValue.second;
- if (!SeenPreds.insert(Pred).second)
- continue; // Duplicate predecessor entry.
-
- Constant *Val = PredValue.first;
-
- BasicBlock *DestBB;
- if (isa<UndefValue>(Val))
- DestBB = nullptr;
- else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
- assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
- DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
- DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
- } else {
- assert(isa<IndirectBrInst>(BB->getTerminator())
- && "Unexpected terminator");
- assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
- DestBB = cast<BlockAddress>(Val)->getBasicBlock();
- }
-
- // If we have exactly one destination, remember it for efficiency below.
- if (PredToDestList.empty()) {
- OnlyDest = DestBB;
- OnlyVal = Val;
- } else {
- if (OnlyDest != DestBB)
- OnlyDest = MultipleDestSentinel;
-      // It is possible we have the same destination but a different value, e.g.
-      // the default case in a switchinst.
- if (Val != OnlyVal)
- OnlyVal = MultipleVal;
- }
-
- // If the predecessor ends with an indirect goto, we can't change its
- // destination. Same for CallBr.
- if (isa<IndirectBrInst>(Pred->getTerminator()) ||
- isa<CallBrInst>(Pred->getTerminator()))
- continue;
-
- PredToDestList.emplace_back(Pred, DestBB);
- }
-
- // If all edges were unthreadable, we fail.
- if (PredToDestList.empty())
- return false;
-
- // If all the predecessors go to a single known successor, we want to fold,
-  // not thread. By doing so, we do not need to duplicate the current block, and
-  // we also do not miss potential opportunities in case we don't/can't duplicate.
- if (OnlyDest && OnlyDest != MultipleDestSentinel) {
- if (BB->hasNPredecessors(PredToDestList.size())) {
- bool SeenFirstBranchToOnlyDest = false;
- std::vector <DominatorTree::UpdateType> Updates;
- Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
- for (BasicBlock *SuccBB : successors(BB)) {
- if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
- SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
- } else {
-          SuccBB->removePredecessor(BB, true); // This is an unreachable successor.
- Updates.push_back({DominatorTree::Delete, BB, SuccBB});
- }
- }
-
- // Finally update the terminator.
- Instruction *Term = BB->getTerminator();
- BranchInst::Create(OnlyDest, Term);
- Term->eraseFromParent();
- DTU->applyUpdatesPermissive(Updates);
+
+ LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
+ for (const auto &PredValue : PredValues) {
+ dbgs() << " BB '" << BB->getName()
+ << "': FOUND condition = " << *PredValue.first
+ << " for pred '" << PredValue.second->getName() << "'.\n";
+ });
+
+ // Decide what we want to thread through. Convert our list of known values to
+ // a list of known destinations for each pred. This also discards duplicate
+ // predecessors and keeps track of the undefined inputs (which are represented
+ // as a null dest in the PredToDestList).
+ SmallPtrSet<BasicBlock*, 16> SeenPreds;
+ SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+ BasicBlock *OnlyDest = nullptr;
+ BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+ Constant *OnlyVal = nullptr;
+ Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
+
+ for (const auto &PredValue : PredValues) {
+ BasicBlock *Pred = PredValue.second;
+ if (!SeenPreds.insert(Pred).second)
+ continue; // Duplicate predecessor entry.
+
+ Constant *Val = PredValue.first;
+
+ BasicBlock *DestBB;
+ if (isa<UndefValue>(Val))
+ DestBB = nullptr;
+ else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
+ } else {
+ assert(isa<IndirectBrInst>(BB->getTerminator())
+ && "Unexpected terminator");
+ assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
+ DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+ }
+
+ // If we have exactly one destination, remember it for efficiency below.
+ if (PredToDestList.empty()) {
+ OnlyDest = DestBB;
+ OnlyVal = Val;
+ } else {
+ if (OnlyDest != DestBB)
+ OnlyDest = MultipleDestSentinel;
+      // It is possible we have the same destination but a different value, e.g.
+      // the default case in a switchinst.
+ if (Val != OnlyVal)
+ OnlyVal = MultipleVal;
+ }
+
+ // If the predecessor ends with an indirect goto, we can't change its
+ // destination. Same for CallBr.
+ if (isa<IndirectBrInst>(Pred->getTerminator()) ||
+ isa<CallBrInst>(Pred->getTerminator()))
+ continue;
+
+ PredToDestList.emplace_back(Pred, DestBB);
+ }
+
+ // If all edges were unthreadable, we fail.
+ if (PredToDestList.empty())
+ return false;
+
+ // If all the predecessors go to a single known successor, we want to fold,
+  // not thread. By doing so, we do not need to duplicate the current block, and
+  // we also do not miss potential opportunities in case we don't/can't duplicate.
+ if (OnlyDest && OnlyDest != MultipleDestSentinel) {
+ if (BB->hasNPredecessors(PredToDestList.size())) {
+ bool SeenFirstBranchToOnlyDest = false;
+ std::vector <DominatorTree::UpdateType> Updates;
+ Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
+ for (BasicBlock *SuccBB : successors(BB)) {
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
+ SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
+ } else {
+          SuccBB->removePredecessor(BB, true); // This is an unreachable successor.
+ Updates.push_back({DominatorTree::Delete, BB, SuccBB});
+ }
+ }
+
+ // Finally update the terminator.
+ Instruction *Term = BB->getTerminator();
+ BranchInst::Create(OnlyDest, Term);
+ Term->eraseFromParent();
+ DTU->applyUpdatesPermissive(Updates);
if (HasProfileData)
BPI->eraseBlock(BB);
-
- // If the condition is now dead due to the removal of the old terminator,
- // erase it.
- if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
- if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
- CondInst->eraseFromParent();
- // We can safely replace *some* uses of the CondInst if it has
- // exactly one value as returned by LVI. RAUW is incorrect in the
-      // presence of guards and assumes that have `Cond` as a use. This
-      // is because we use the guards/assumes to reason about the `Cond` value
-      // at the end of the block, but RAUW unconditionally replaces all uses
- // including the guards/assumes themselves and the uses before the
- // guard/assume.
- else if (OnlyVal && OnlyVal != MultipleVal &&
- CondInst->getParent() == BB)
+
+ // If the condition is now dead due to the removal of the old terminator,
+ // erase it.
+ if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
+ if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
+ CondInst->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+      // presence of guards and assumes that have `Cond` as a use. This
+      // is because we use the guards/assumes to reason about the `Cond` value
+      // at the end of the block, but RAUW unconditionally replaces all uses
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (OnlyVal && OnlyVal != MultipleVal &&
+ CondInst->getParent() == BB)
replaceFoldableUses(CondInst, OnlyVal);
- }
- return true;
- }
- }
-
- // Determine which is the most common successor. If we have many inputs and
- // this block is a switch, we want to start by threading the batch that goes
- // to the most popular destination first. If we only know about one
- // threadable destination (the common case) we can avoid this.
- BasicBlock *MostPopularDest = OnlyDest;
-
- if (MostPopularDest == MultipleDestSentinel) {
+ }
+ return true;
+ }
+ }
+
+ // Determine which is the most common successor. If we have many inputs and
+ // this block is a switch, we want to start by threading the batch that goes
+ // to the most popular destination first. If we only know about one
+ // threadable destination (the common case) we can avoid this.
+ BasicBlock *MostPopularDest = OnlyDest;
+
+ if (MostPopularDest == MultipleDestSentinel) {
// Remove any loop headers from the Dest list, threadEdge conservatively
-  // won't process them, but we might have other destinations that are eligible
-  // and that we still want to process.
- erase_if(PredToDestList,
- [&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
+  // won't process them, but we might have other destinations that are eligible
+  // and that we still want to process.
+ erase_if(PredToDestList,
+ [&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
return LoopHeaders.contains(PredToDest.second);
- });
-
- if (PredToDestList.empty())
- return false;
-
+ });
+
+ if (PredToDestList.empty())
+ return false;
+
MostPopularDest = findMostPopularDest(BB, PredToDestList);
- }
-
- // Now that we know what the most popular destination is, factor all
- // predecessors that will jump to it into a single predecessor.
- SmallVector<BasicBlock*, 16> PredsToFactor;
- for (const auto &PredToDest : PredToDestList)
- if (PredToDest.second == MostPopularDest) {
- BasicBlock *Pred = PredToDest.first;
-
- // This predecessor may be a switch or something else that has multiple
- // edges to the block. Factor each of these edges by listing them
- // according to # occurrences in PredsToFactor.
- for (BasicBlock *Succ : successors(Pred))
- if (Succ == BB)
- PredsToFactor.push_back(Pred);
- }
-
- // If the threadable edges are branching on an undefined value, we get to pick
- // the destination that these predecessors should get to.
- if (!MostPopularDest)
- MostPopularDest = BB->getTerminator()->
+ }
+
+ // Now that we know what the most popular destination is, factor all
+ // predecessors that will jump to it into a single predecessor.
+ SmallVector<BasicBlock*, 16> PredsToFactor;
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second == MostPopularDest) {
+ BasicBlock *Pred = PredToDest.first;
+
+ // This predecessor may be a switch or something else that has multiple
+ // edges to the block. Factor each of these edges by listing them
+ // according to # occurrences in PredsToFactor.
+ for (BasicBlock *Succ : successors(Pred))
+ if (Succ == BB)
+ PredsToFactor.push_back(Pred);
+ }
+
+ // If the threadable edges are branching on an undefined value, we get to pick
+ // the destination that these predecessors should get to.
+ if (!MostPopularDest)
+ MostPopularDest = BB->getTerminator()->
getSuccessor(getBestDestForJumpOnUndef(BB));
-
- // Ok, try to thread it!
+
+ // Ok, try to thread it!
return tryThreadEdge(BB, PredsToFactor, MostPopularDest);
-}
-
+}
+
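A standalone sketch (not from the patch; the data is made up) of the OnlyDest/MultipleDestSentinel bookkeeping used above: one pointer encodes three states, null for "nothing recorded yet", a real block for "exactly one destination so far", and an all-ones sentinel for "multiple destinations", using the same implementation-defined cast trick as the pass.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  int A = 0, B = 1;
  // Same trick as MultipleDestSentinel: an address no real object can have.
  int *MultipleSentinel = reinterpret_cast<int *>(~uintptr_t(0));

  std::vector<int *> DestsSeen = {&A, &A, &B}; // per-predecessor destinations
  int *OnlyDest = nullptr;
  for (int *D : DestsSeen) {
    if (!OnlyDest)
      OnlyDest = D;                // first destination seen
    else if (OnlyDest != D)
      OnlyDest = MultipleSentinel; // disagreement: remember "multiple"
  }
  std::printf("single known destination: %s\n",
              (OnlyDest && OnlyDest != MultipleSentinel) ? "yes" : "no");
}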
/// processBranchOnPHI - We have an otherwise unthreadable conditional branch on
/// a PHI node (or freeze PHI) in the current block. See if there are any
/// simplifications we can do based on inputs to the phi node.
bool JumpThreadingPass::processBranchOnPHI(PHINode *PN) {
- BasicBlock *BB = PN->getParent();
-
- // TODO: We could make use of this to do it once for blocks with common PHI
- // values.
- SmallVector<BasicBlock*, 1> PredBBs;
- PredBBs.resize(1);
-
- // If any of the predecessor blocks end in an unconditional branch, we can
- // *duplicate* the conditional branch into that block in order to further
- // encourage jump threading and to eliminate cases where we have branch on a
- // phi of an icmp (branch on icmp is much better).
+ BasicBlock *BB = PN->getParent();
+
+ // TODO: We could make use of this to do it once for blocks with common PHI
+ // values.
+ SmallVector<BasicBlock*, 1> PredBBs;
+ PredBBs.resize(1);
+
+ // If any of the predecessor blocks end in an unconditional branch, we can
+ // *duplicate* the conditional branch into that block in order to further
+ // encourage jump threading and to eliminate cases where we have branch on a
+ // phi of an icmp (branch on icmp is much better).
// This is still beneficial when a frozen phi is used as the branch condition
// because it allows CodeGenPrepare to further canonicalize br(freeze(icmp))
// to br(icmp(freeze ...)).
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *PredBB = PN->getIncomingBlock(i);
- if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
- if (PredBr->isUnconditional()) {
- PredBBs[0] = PredBB;
- // Try to duplicate BB into PredBB.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
+ if (PredBr->isUnconditional()) {
+ PredBBs[0] = PredBB;
+ // Try to duplicate BB into PredBB.
if (duplicateCondBranchOnPHIIntoPred(BB, PredBBs))
- return true;
- }
- }
-
- return false;
-}
-
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// processBranchOnXOR - We have an otherwise unthreadable conditional branch on
-/// a xor instruction in the current block. See if there are any
-/// simplifications we can do based on inputs to the xor.
+/// a xor instruction in the current block. See if there are any
+/// simplifications we can do based on inputs to the xor.
bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) {
- BasicBlock *BB = BO->getParent();
-
- // If either the LHS or RHS of the xor is a constant, don't do this
- // optimization.
- if (isa<ConstantInt>(BO->getOperand(0)) ||
- isa<ConstantInt>(BO->getOperand(1)))
- return false;
-
- // If the first instruction in BB isn't a phi, we won't be able to infer
- // anything special about any particular predecessor.
- if (!isa<PHINode>(BB->front()))
- return false;
-
- // If this BB is a landing pad, we won't be able to split the edge into it.
- if (BB->isEHPad())
- return false;
-
- // If we have a xor as the branch input to this block, and we know that the
- // LHS or RHS of the xor in any predecessor is true/false, then we can clone
- // the condition into the predecessor and fix that value to true, saving some
- // logical ops on that path and encouraging other paths to simplify.
- //
- // This copies something like this:
- //
- // BB:
- // %X = phi i1 [1], [%X']
- // %Y = icmp eq i32 %A, %B
- // %Z = xor i1 %X, %Y
- // br i1 %Z, ...
- //
- // Into:
- // BB':
- // %Y = icmp ne i32 %A, %B
- // br i1 %Y, ...
-
- PredValueInfoTy XorOpValues;
- bool isLHS = true;
+ BasicBlock *BB = BO->getParent();
+
+ // If either the LHS or RHS of the xor is a constant, don't do this
+ // optimization.
+ if (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1)))
+ return false;
+
+ // If the first instruction in BB isn't a phi, we won't be able to infer
+ // anything special about any particular predecessor.
+ if (!isa<PHINode>(BB->front()))
+ return false;
+
+ // If this BB is a landing pad, we won't be able to split the edge into it.
+ if (BB->isEHPad())
+ return false;
+
+ // If we have a xor as the branch input to this block, and we know that the
+ // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+ // the condition into the predecessor and fix that value to true, saving some
+ // logical ops on that path and encouraging other paths to simplify.
+ //
+ // This copies something like this:
+ //
+ // BB:
+ // %X = phi i1 [1], [%X']
+ // %Y = icmp eq i32 %A, %B
+ // %Z = xor i1 %X, %Y
+ // br i1 %Z, ...
+ //
+ // Into:
+ // BB':
+ // %Y = icmp ne i32 %A, %B
+ // br i1 %Y, ...
+
+ PredValueInfoTy XorOpValues;
+ bool isLHS = true;
if (!computeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
- WantInteger, BO)) {
- assert(XorOpValues.empty());
+ WantInteger, BO)) {
+ assert(XorOpValues.empty());
if (!computeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
- WantInteger, BO))
- return false;
- isLHS = false;
- }
-
- assert(!XorOpValues.empty() &&
+ WantInteger, BO))
+ return false;
+ isLHS = false;
+ }
+
+ assert(!XorOpValues.empty() &&
"computeValueKnownInPredecessors returned true with no values");
-
- // Scan the information to see which is most popular: true or false. The
- // predecessors can be of the set true, false, or undef.
- unsigned NumTrue = 0, NumFalse = 0;
- for (const auto &XorOpValue : XorOpValues) {
- if (isa<UndefValue>(XorOpValue.first))
- // Ignore undefs for the count.
- continue;
- if (cast<ConstantInt>(XorOpValue.first)->isZero())
- ++NumFalse;
- else
- ++NumTrue;
- }
-
- // Determine which value to split on, true, false, or undef if neither.
- ConstantInt *SplitVal = nullptr;
- if (NumTrue > NumFalse)
- SplitVal = ConstantInt::getTrue(BB->getContext());
- else if (NumTrue != 0 || NumFalse != 0)
- SplitVal = ConstantInt::getFalse(BB->getContext());
-
- // Collect all of the blocks that this can be folded into so that we can
- // factor this once and clone it once.
- SmallVector<BasicBlock*, 8> BlocksToFoldInto;
- for (const auto &XorOpValue : XorOpValues) {
- if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
- continue;
-
- BlocksToFoldInto.push_back(XorOpValue.second);
- }
-
- // If we inferred a value for all of the predecessors, then duplication won't
- // help us. However, we can just replace the LHS or RHS with the constant.
- if (BlocksToFoldInto.size() ==
- cast<PHINode>(BB->front()).getNumIncomingValues()) {
- if (!SplitVal) {
- // If all preds provide undef, just nuke the xor, because it is undef too.
- BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
- BO->eraseFromParent();
- } else if (SplitVal->isZero()) {
- // If all preds provide 0, replace the xor with the other input.
- BO->replaceAllUsesWith(BO->getOperand(isLHS));
- BO->eraseFromParent();
- } else {
- // If all preds provide 1, set the computed value to 1.
- BO->setOperand(!isLHS, SplitVal);
- }
-
- return true;
- }
-
-  // If any of the predecessors ends with an indirect goto, we can't change its
- // destination. Same for CallBr.
- if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
- return isa<IndirectBrInst>(Pred->getTerminator()) ||
- isa<CallBrInst>(Pred->getTerminator());
- }))
- return false;
-
- // Try to duplicate BB into PredBB.
+
+ // Scan the information to see which is most popular: true or false. The
+ // predecessors can be of the set true, false, or undef.
+ unsigned NumTrue = 0, NumFalse = 0;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (isa<UndefValue>(XorOpValue.first))
+ // Ignore undefs for the count.
+ continue;
+ if (cast<ConstantInt>(XorOpValue.first)->isZero())
+ ++NumFalse;
+ else
+ ++NumTrue;
+ }
+
+ // Determine which value to split on, true, false, or undef if neither.
+ ConstantInt *SplitVal = nullptr;
+ if (NumTrue > NumFalse)
+ SplitVal = ConstantInt::getTrue(BB->getContext());
+ else if (NumTrue != 0 || NumFalse != 0)
+ SplitVal = ConstantInt::getFalse(BB->getContext());
+
+ // Collect all of the blocks that this can be folded into so that we can
+ // factor this once and clone it once.
+ SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
+ continue;
+
+ BlocksToFoldInto.push_back(XorOpValue.second);
+ }
+
+ // If we inferred a value for all of the predecessors, then duplication won't
+ // help us. However, we can just replace the LHS or RHS with the constant.
+ if (BlocksToFoldInto.size() ==
+ cast<PHINode>(BB->front()).getNumIncomingValues()) {
+ if (!SplitVal) {
+ // If all preds provide undef, just nuke the xor, because it is undef too.
+ BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+ BO->eraseFromParent();
+ } else if (SplitVal->isZero()) {
+ // If all preds provide 0, replace the xor with the other input.
+ BO->replaceAllUsesWith(BO->getOperand(isLHS));
+ BO->eraseFromParent();
+ } else {
+ // If all preds provide 1, set the computed value to 1.
+ BO->setOperand(!isLHS, SplitVal);
+ }
+
+ return true;
+ }
+
+  // If any of the predecessors ends with an indirect goto, we can't change its
+ // destination. Same for CallBr.
+ if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator()) ||
+ isa<CallBrInst>(Pred->getTerminator());
+ }))
+ return false;
+
+ // Try to duplicate BB into PredBB.
return duplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
-}
-
+}
+
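A standalone sketch (not from the patch; the boolean values are hypothetical) of the payoff processBranchOnXOR is after: once the phi side of the xor is pinned to a constant on a duplicated path, the xor collapses to the other operand (for 0) or to its negation (for 1), so the cloned branch ends up testing a plain compare, as in the BB/BB' example in the comment above.

#include <cstdio>

int main() {
  bool Y = true;                       // stands in for %Y = icmp eq i32 %A, %B
  for (bool KnownX : {false, true}) {  // phi input proven constant on the path
    bool Z = KnownX ^ Y;               // %Z = xor i1 %X, %Y
    // KnownX == 0: Z is just Y; KnownX == 1: Z is !Y. Either way, no xor left.
    std::printf("KnownX=%d -> Z=%d (Y=%d, !Y=%d)\n", KnownX, Z, Y, !Y);
  }
}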
/// addPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
-/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
-/// NewPred using the entries from OldPred (suitably mapped).
+/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
+/// NewPred using the entries from OldPred (suitably mapped).
static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
- BasicBlock *OldPred,
- BasicBlock *NewPred,
- DenseMap<Instruction*, Value*> &ValueMap) {
- for (PHINode &PN : PHIBB->phis()) {
- // Ok, we have a PHI node. Figure out what the incoming value was for the
- // DestBlock.
- Value *IV = PN.getIncomingValueForBlock(OldPred);
-
- // Remap the value if necessary.
- if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
- DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
- if (I != ValueMap.end())
- IV = I->second;
- }
-
- PN.addIncoming(IV, NewPred);
- }
-}
-
-/// Merge basic block BB into its sole predecessor if possible.
+ BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ DenseMap<Instruction*, Value*> &ValueMap) {
+ for (PHINode &PN : PHIBB->phis()) {
+ // Ok, we have a PHI node. Figure out what the incoming value was for the
+ // DestBlock.
+ Value *IV = PN.getIncomingValueForBlock(OldPred);
+
+ // Remap the value if necessary.
+ if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+ if (I != ValueMap.end())
+ IV = I->second;
+ }
+
+ PN.addIncoming(IV, NewPred);
+ }
+}
+
+/// Merge basic block BB into its sole predecessor if possible.
bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
- BasicBlock *SinglePred = BB->getSinglePredecessor();
- if (!SinglePred)
- return false;
-
- const Instruction *TI = SinglePred->getTerminator();
- if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 ||
- SinglePred == BB || hasAddressTakenAndUsed(BB))
- return false;
-
- // If SinglePred was a loop header, BB becomes one.
- if (LoopHeaders.erase(SinglePred))
- LoopHeaders.insert(BB);
-
- LVI->eraseBlock(SinglePred);
- MergeBasicBlockIntoOnlyPred(BB, DTU);
-
- // Now that BB is merged into SinglePred (i.e. SinglePred code followed by
- // BB code within one basic block `BB`), we need to invalidate the LVI
- // information associated with BB, because the LVI information need not be
- // true for all of BB after the merge. For example,
- // Before the merge, LVI info and code is as follows:
- // SinglePred: <LVI info1 for %p val>
- // %y = use of %p
- // call @exit() // need not transfer execution to successor.
- // assume(%p) // from this point on %p is true
- // br label %BB
- // BB: <LVI info2 for %p val, i.e. %p is true>
- // %x = use of %p
- // br label exit
- //
-  // Note that this LVI info for blocks BB and SinglePred is correct for %p
-  // (info2 and info1 respectively). After the merge and the deletion of the
-  // LVI info1 for SinglePred, we have the following code:
- // BB: <LVI info2 for %p val>
- // %y = use of %p
- // call @exit()
- // assume(%p)
- // %x = use of %p <-- LVI info2 is correct from here onwards.
- // br label exit
- // LVI info2 for BB is incorrect at the beginning of BB.
-
- // Invalidate LVI information for BB if the LVI is not provably true for
- // all of BB.
- if (!isGuaranteedToTransferExecutionToSuccessor(BB))
- LVI->eraseBlock(BB);
- return true;
-}
-
-/// Update the SSA form. NewBB contains instructions that are copied from BB.
-/// ValueMapping maps old values in BB to new ones in NewBB.
+ BasicBlock *SinglePred = BB->getSinglePredecessor();
+ if (!SinglePred)
+ return false;
+
+ const Instruction *TI = SinglePred->getTerminator();
+ if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 ||
+ SinglePred == BB || hasAddressTakenAndUsed(BB))
+ return false;
+
+ // If SinglePred was a loop header, BB becomes one.
+ if (LoopHeaders.erase(SinglePred))
+ LoopHeaders.insert(BB);
+
+ LVI->eraseBlock(SinglePred);
+ MergeBasicBlockIntoOnlyPred(BB, DTU);
+
+ // Now that BB is merged into SinglePred (i.e. SinglePred code followed by
+ // BB code within one basic block `BB`), we need to invalidate the LVI
+ // information associated with BB, because the LVI information need not be
+ // true for all of BB after the merge. For example,
+ // Before the merge, LVI info and code is as follows:
+ // SinglePred: <LVI info1 for %p val>
+ // %y = use of %p
+ // call @exit() // need not transfer execution to successor.
+ // assume(%p) // from this point on %p is true
+ // br label %BB
+ // BB: <LVI info2 for %p val, i.e. %p is true>
+ // %x = use of %p
+ // br label exit
+ //
+  // Note that this LVI info for blocks BB and SinglePred is correct for %p
+  // (info2 and info1 respectively). After the merge and the deletion of the
+  // LVI info1 for SinglePred, we have the following code:
+ // BB: <LVI info2 for %p val>
+ // %y = use of %p
+ // call @exit()
+ // assume(%p)
+ // %x = use of %p <-- LVI info2 is correct from here onwards.
+ // br label exit
+ // LVI info2 for BB is incorrect at the beginning of BB.
+
+ // Invalidate LVI information for BB if the LVI is not provably true for
+ // all of BB.
+ if (!isGuaranteedToTransferExecutionToSuccessor(BB))
+ LVI->eraseBlock(BB);
+ return true;
+}
+
+/// Update the SSA form. NewBB contains instructions that are copied from BB.
+/// ValueMapping maps old values in BB to new ones in NewBB.
void JumpThreadingPass::updateSSA(
- BasicBlock *BB, BasicBlock *NewBB,
- DenseMap<Instruction *, Value *> &ValueMapping) {
- // If there were values defined in BB that are used outside the block, then we
- // now have to update all uses of the value to use either the original value,
- // the cloned value, or some PHI derived value. This can require arbitrary
-  // PHI insertion, which we are prepared to do; clean these up now.
- SSAUpdater SSAUpdate;
- SmallVector<Use *, 16> UsesToRename;
-
- for (Instruction &I : *BB) {
- // Scan all uses of this instruction to see if it is used outside of its
- // block, and if so, record them in UsesToRename.
- for (Use &U : I.uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
- if (UserPN->getIncomingBlock(U) == BB)
- continue;
- } else if (User->getParent() == BB)
- continue;
-
- UsesToRename.push_back(&U);
- }
-
- // If there are no uses outside the block, we're done with this instruction.
- if (UsesToRename.empty())
- continue;
- LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
-
- // We found a use of I outside of BB. Rename all uses of I that are outside
- // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
- // with the two values we know.
- SSAUpdate.Initialize(I.getType(), I.getName());
- SSAUpdate.AddAvailableValue(BB, &I);
- SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
-
- while (!UsesToRename.empty())
- SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- LLVM_DEBUG(dbgs() << "\n");
- }
-}
-
-/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone
-/// arguments that come from PredBB. Return the map from the variables in the
-/// source basic block to the variables in the newly created basic block.
-DenseMap<Instruction *, Value *>
+ BasicBlock *BB, BasicBlock *NewBB,
+ DenseMap<Instruction *, Value *> &ValueMapping) {
+ // If there were values defined in BB that are used outside the block, then we
+ // now have to update all uses of the value to use either the original value,
+ // the cloned value, or some PHI derived value. This can require arbitrary
+  // PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use *, 16> UsesToRename;
+
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+ // We found a use of I outside of BB. Rename all uses of I that are outside
+ // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
+ // with the two values we know.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+}
+
+/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone
+/// arguments that come from PredBB. Return the map from the variables in the
+/// source basic block to the variables in the newly created basic block.
+DenseMap<Instruction *, Value *>
JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
- BasicBlock::iterator BE, BasicBlock *NewBB,
- BasicBlock *PredBB) {
- // We are going to have to map operands from the source basic block to the new
- // copy of the block 'NewBB'. If there are PHI nodes in the source basic
- // block, evaluate them to account for entry from PredBB.
- DenseMap<Instruction *, Value *> ValueMapping;
-
- // Clone the phi nodes of the source basic block into NewBB. The resulting
- // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater
- // might need to rewrite the operand of the cloned phi.
- for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
- PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB);
- NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB);
- ValueMapping[PN] = NewPN;
- }
-
+ BasicBlock::iterator BE, BasicBlock *NewBB,
+ BasicBlock *PredBB) {
+ // We are going to have to map operands from the source basic block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in the source basic
+ // block, evaluate them to account for entry from PredBB.
+ DenseMap<Instruction *, Value *> ValueMapping;
+
+ // Clone the phi nodes of the source basic block into NewBB. The resulting
+ // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater
+ // might need to rewrite the operand of the cloned phi.
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
+ PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB);
+ NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB);
+ ValueMapping[PN] = NewPN;
+ }
+
// Clone noalias scope declarations in the threaded block. When threading a
  // loop exit, we would otherwise end up with two identical scope declarations
// visible at the same time.
@@ -2085,974 +2085,974 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
identifyNoAliasScopesToClone(BI, BE, NoAliasScopes);
cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context);
- // Clone the non-phi instructions of the source basic block into NewBB,
- // keeping track of the mapping and using it to remap operands in the cloned
- // instructions.
- for (; BI != BE; ++BI) {
- Instruction *New = BI->clone();
- New->setName(BI->getName());
- NewBB->getInstList().push_back(New);
- ValueMapping[&*BI] = New;
+ // Clone the non-phi instructions of the source basic block into NewBB,
+ // keeping track of the mapping and using it to remap operands in the cloned
+ // instructions.
+ for (; BI != BE; ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ NewBB->getInstList().push_back(New);
+ ValueMapping[&*BI] = New;
adaptNoAliasScopes(New, ClonedScopes, Context);
-
- // Remap operands to patch up intra-block references.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
- if (I != ValueMapping.end())
- New->setOperand(i, I->second);
- }
- }
-
- return ValueMapping;
-}
-
-/// Attempt to thread through two successive basic blocks.
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ return ValueMapping;
+}
+
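A standalone sketch (not from the patch; the Node type and names are invented) of the clone-and-remap pattern cloneInstructions uses above: copy each item, record the old-to-new mapping, and rewrite any operand that refers to something already cloned so the copies reference each other rather than the originals.

#include <cstdio>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>

struct Node {
  const char *Name;
  std::vector<Node *> Operands;
};

int main() {
  Node A{"a", {}}, B{"b", {&A}}, C{"c", {&B, &A}};
  std::vector<Node *> Block = {&A, &B, &C};

  std::unordered_map<Node *, Node *> ValueMapping;
  std::vector<std::unique_ptr<Node>> Clones;
  for (Node *Old : Block) {
    auto New = std::make_unique<Node>(*Old); // like Inst->clone(): operands still
    for (Node *&Op : New->Operands) {        // point into the original block...
      auto It = ValueMapping.find(Op);
      if (It != ValueMapping.end())
        Op = It->second;                     // ...so patch intra-block references
    }
    ValueMapping[Old] = New.get();
    Clones.push_back(std::move(New));
  }
  // The clone of "c" now uses the clones of "b" and "a", not the originals.
  std::printf("%s -> %s, %s\n", Clones[2]->Name, Clones[2]->Operands[0]->Name,
              Clones[2]->Operands[1]->Name);
}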
+/// Attempt to thread through two successive basic blocks.
bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB,
- Value *Cond) {
- // Consider:
- //
- // PredBB:
- // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ]
- // %tobool = icmp eq i32 %cond, 0
- // br i1 %tobool, label %BB, label ...
- //
- // BB:
- // %cmp = icmp eq i32* %var, null
- // br i1 %cmp, label ..., label ...
- //
- // We don't know the value of %var at BB even if we know which incoming edge
- // we take to BB. However, once we duplicate PredBB for each of its incoming
- // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of
- // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
-
- // Require that BB end with a Branch for simplicity.
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- if (!CondBr)
- return false;
-
- // BB must have exactly one predecessor.
- BasicBlock *PredBB = BB->getSinglePredecessor();
- if (!PredBB)
- return false;
-
- // Require that PredBB end with a conditional Branch. If PredBB ends with an
- // unconditional branch, we should be merging PredBB and BB instead. For
- // simplicity, we don't deal with a switch.
- BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (!PredBBBranch || PredBBBranch->isUnconditional())
- return false;
-
- // If PredBB has exactly one incoming edge, we don't gain anything by copying
- // PredBB.
- if (PredBB->getSinglePredecessor())
- return false;
-
- // Don't thread through PredBB if it contains a successor edge to itself, in
- // which case we would infinite loop. Suppose we are threading an edge from
- // PredPredBB through PredBB and BB to SuccBB with PredBB containing a
- // successor edge to itself. If we allowed jump threading in this case, we
- // could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since
- // PredBB.thread has a successor edge to PredBB, we would immediately come up
- // with another jump threading opportunity from PredBB.thread through PredBB
- // and BB to SuccBB. This jump threading would repeatedly occur. That is, we
- // would keep peeling one iteration from PredBB.
- if (llvm::is_contained(successors(PredBB), PredBB))
- return false;
-
- // Don't thread across a loop header.
- if (LoopHeaders.count(PredBB))
- return false;
-
- // Avoid complication with duplicating EH pads.
- if (PredBB->isEHPad())
- return false;
-
- // Find a predecessor that we can thread. For simplicity, we only consider a
- // successor edge out of BB to which we thread exactly one incoming edge into
- // PredBB.
- unsigned ZeroCount = 0;
- unsigned OneCount = 0;
- BasicBlock *ZeroPred = nullptr;
- BasicBlock *OnePred = nullptr;
- for (BasicBlock *P : predecessors(PredBB)) {
- if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
+ Value *Cond) {
+ // Consider:
+ //
+ // PredBB:
+ // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ]
+ // %tobool = icmp eq i32 %cond, 0
+ // br i1 %tobool, label %BB, label ...
+ //
+ // BB:
+ // %cmp = icmp eq i32* %var, null
+ // br i1 %cmp, label ..., label ...
+ //
+ // We don't know the value of %var at BB even if we know which incoming edge
+ // we take to BB. However, once we duplicate PredBB for each of its incoming
+ // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of
+ // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
+
+ // Require that BB end with a Branch for simplicity.
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!CondBr)
+ return false;
+
+ // BB must have exactly one predecessor.
+ BasicBlock *PredBB = BB->getSinglePredecessor();
+ if (!PredBB)
+ return false;
+
+ // Require that PredBB end with a conditional Branch. If PredBB ends with an
+ // unconditional branch, we should be merging PredBB and BB instead. For
+ // simplicity, we don't deal with a switch.
+ BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (!PredBBBranch || PredBBBranch->isUnconditional())
+ return false;
+
+ // If PredBB has exactly one incoming edge, we don't gain anything by copying
+ // PredBB.
+ if (PredBB->getSinglePredecessor())
+ return false;
+
+ // Don't thread through PredBB if it contains a successor edge to itself, in
+ // which case we would infinite loop. Suppose we are threading an edge from
+ // PredPredBB through PredBB and BB to SuccBB with PredBB containing a
+ // successor edge to itself. If we allowed jump threading in this case, we
+ // could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since
+ // PredBB.thread has a successor edge to PredBB, we would immediately come up
+ // with another jump threading opportunity from PredBB.thread through PredBB
+ // and BB to SuccBB. This jump threading would repeatedly occur. That is, we
+ // would keep peeling one iteration from PredBB.
+ if (llvm::is_contained(successors(PredBB), PredBB))
+ return false;
+
+ // Don't thread across a loop header.
+ if (LoopHeaders.count(PredBB))
+ return false;
+
+ // Avoid complication with duplicating EH pads.
+ if (PredBB->isEHPad())
+ return false;
+
+ // Find a predecessor that we can thread. For simplicity, we only consider a
+ // successor edge out of BB to which we thread exactly one incoming edge into
+ // PredBB.
+ unsigned ZeroCount = 0;
+ unsigned OneCount = 0;
+ BasicBlock *ZeroPred = nullptr;
+ BasicBlock *OnePred = nullptr;
+ for (BasicBlock *P : predecessors(PredBB)) {
+ if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
evaluateOnPredecessorEdge(BB, P, Cond))) {
- if (CI->isZero()) {
- ZeroCount++;
- ZeroPred = P;
- } else if (CI->isOne()) {
- OneCount++;
- OnePred = P;
- }
- }
- }
-
- // Disregard complicated cases where we have to thread multiple edges.
- BasicBlock *PredPredBB;
- if (ZeroCount == 1) {
- PredPredBB = ZeroPred;
- } else if (OneCount == 1) {
- PredPredBB = OnePred;
- } else {
- return false;
- }
-
- BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred);
-
- // If threading to the same block as we come from, we would infinite loop.
- if (SuccBB == BB) {
- LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
- << "' - would thread to self!\n");
- return false;
- }
-
- // If threading this would thread across a loop header, don't thread the edge.
+ if (CI->isZero()) {
+ ZeroCount++;
+ ZeroPred = P;
+ } else if (CI->isOne()) {
+ OneCount++;
+ OnePred = P;
+ }
+ }
+ }
+
+ // Disregard complicated cases where we have to thread multiple edges.
+ BasicBlock *PredPredBB;
+ if (ZeroCount == 1) {
+ PredPredBB = ZeroPred;
+ } else if (OneCount == 1) {
+ PredPredBB = OnePred;
+ } else {
+ return false;
+ }
+
+ BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred);
+
+ // If threading to the same block as we come from, we would infinite loop.
+ if (SuccBB == BB) {
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
- LLVM_DEBUG({
- bool BBIsHeader = LoopHeaders.count(BB);
- bool SuccIsHeader = LoopHeaders.count(SuccBB);
- dbgs() << " Not threading across "
- << (BBIsHeader ? "loop header BB '" : "block BB '")
- << BB->getName() << "' to dest "
- << (SuccIsHeader ? "loop header BB '" : "block BB '")
- << SuccBB->getName()
- << "' - it might create an irreducible loop!\n";
- });
- return false;
- }
-
- // Compute the cost of duplicating BB and PredBB.
- unsigned BBCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
- unsigned PredBBCost = getJumpThreadDuplicationCost(
- PredBB, PredBB->getTerminator(), BBDupThreshold);
-
- // Give up if costs are too high. We need to check BBCost and PredBBCost
- // individually before checking their sum because getJumpThreadDuplicationCost
-  // returns (unsigned)~0 for those basic blocks that cannot be duplicated.
- if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold ||
- BBCost + PredBBCost > BBDupThreshold) {
- LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
- << "' - Cost is too high: " << PredBBCost
-                      << " for PredBB, " << BBCost << " for BB\n");
- return false;
- }
-
- // Now we are ready to duplicate PredBB.
+ if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
+ LLVM_DEBUG({
+ bool BBIsHeader = LoopHeaders.count(BB);
+ bool SuccIsHeader = LoopHeaders.count(SuccBB);
+ dbgs() << " Not threading across "
+ << (BBIsHeader ? "loop header BB '" : "block BB '")
+ << BB->getName() << "' to dest "
+ << (SuccIsHeader ? "loop header BB '" : "block BB '")
+ << SuccBB->getName()
+ << "' - it might create an irreducible loop!\n";
+ });
+ return false;
+ }
+
+ // Compute the cost of duplicating BB and PredBB.
+ unsigned BBCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ unsigned PredBBCost = getJumpThreadDuplicationCost(
+ PredBB, PredBB->getTerminator(), BBDupThreshold);
+
+ // Give up if costs are too high. We need to check BBCost and PredBBCost
+ // individually before checking their sum because getJumpThreadDuplicationCost
+  // returns (unsigned)~0 for those basic blocks that cannot be duplicated.
+ if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold ||
+ BBCost + PredBBCost > BBDupThreshold) {
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << PredBBCost
+                      << " for PredBB, " << BBCost << " for BB\n");
+ return false;
+ }
+
+ // Now we are ready to duplicate PredBB.
threadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB);
- return true;
-}
-
+ return true;
+}
+
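A standalone sketch (not from the patch; threshold and costs are made-up numbers) of why maybethreadThroughTwoBasicBlocks checks BBCost and PredBBCost individually before checking their sum: the cost helper reports "cannot duplicate" as (unsigned)~0, so a sum-only test can wrap around in unsigned arithmetic and pass the threshold.

#include <cstdio>

int main() {
  unsigned Threshold = 6;
  unsigned BBCost = ~0u;    // "cannot be duplicated" marker from the cost helper
  unsigned PredBBCost = 3;

  unsigned Sum = BBCost + PredBBCost;        // wraps around to 2
  bool SumOnlyPasses = Sum <= Threshold;     // would wrongly allow threading
  bool IndividualPass = BBCost <= Threshold && PredBBCost <= Threshold;

  std::printf("sum-only: %d, individual-then-sum: %d\n", SumOnlyPasses,
              IndividualPass && Sum <= Threshold);
}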
void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
- BasicBlock *PredBB,
- BasicBlock *BB,
- BasicBlock *SuccBB) {
- LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '"
- << BB->getName() << "'\n");
-
- BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
- BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
-
- BasicBlock *NewBB =
- BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
- PredBB->getParent(), PredBB);
- NewBB->moveAfter(PredBB);
-
- // Set the block frequency of NewBB.
- if (HasProfileData) {
- auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
- BPI->getEdgeProbability(PredPredBB, PredBB);
- BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
- }
-
- // We are going to have to map operands from the original BB block to the new
- // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them
- // to account for entry from PredPredBB.
- DenseMap<Instruction *, Value *> ValueMapping =
+ BasicBlock *PredBB,
+ BasicBlock *BB,
+ BasicBlock *SuccBB) {
+ LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '"
+ << BB->getName() << "'\n");
+
+ BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
+ BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
+
+ BasicBlock *NewBB =
+ BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
+ PredBB->getParent(), PredBB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
+ BPI->getEdgeProbability(PredPredBB, PredBB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them
+ // to account for entry from PredPredBB.
+ DenseMap<Instruction *, Value *> ValueMapping =
cloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
-
+
// Copy the edge probabilities from PredBB to NewBB.
if (HasProfileData)
BPI->copyEdgeProbabilities(PredBB, NewBB);
- // Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
- // This eliminates predecessors from PredPredBB, which requires us to simplify
- // any PHI nodes in PredBB.
- Instruction *PredPredTerm = PredPredBB->getTerminator();
- for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i)
- if (PredPredTerm->getSuccessor(i) == PredBB) {
- PredBB->removePredecessor(PredPredBB, true);
- PredPredTerm->setSuccessor(i, NewBB);
- }
-
+ // Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
+ // This eliminates predecessors from PredPredBB, which requires us to simplify
+ // any PHI nodes in PredBB.
+ Instruction *PredPredTerm = PredPredBB->getTerminator();
+ for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredPredTerm->getSuccessor(i) == PredBB) {
+ PredBB->removePredecessor(PredPredBB, true);
+ PredPredTerm->setSuccessor(i, NewBB);
+ }
+
addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB,
- ValueMapping);
+ ValueMapping);
addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB,
- ValueMapping);
-
- DTU->applyUpdatesPermissive(
- {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)},
- {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)},
- {DominatorTree::Insert, PredPredBB, NewBB},
- {DominatorTree::Delete, PredPredBB, PredBB}});
-
+ ValueMapping);
+
+ DTU->applyUpdatesPermissive(
+ {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)},
+ {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)},
+ {DominatorTree::Insert, PredPredBB, NewBB},
+ {DominatorTree::Delete, PredPredBB, PredBB}});
+
updateSSA(PredBB, NewBB, ValueMapping);
-
- // Clean up things like PHI nodes with single operands, dead instructions,
- // etc.
- SimplifyInstructionsInBlock(NewBB, TLI);
- SimplifyInstructionsInBlock(PredBB, TLI);
-
- SmallVector<BasicBlock *, 1> PredsToFactor;
- PredsToFactor.push_back(NewBB);
+
+ // Clean up things like PHI nodes with single operands, dead instructions,
+ // etc.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+ SimplifyInstructionsInBlock(PredBB, TLI);
+
+ SmallVector<BasicBlock *, 1> PredsToFactor;
+ PredsToFactor.push_back(NewBB);
threadEdge(BB, PredsToFactor, SuccBB);
-}
-
+}
+
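A standalone sketch (not from the patch; the profile numbers are hypothetical, and the real pass uses BlockFrequencyInfo/BranchProbabilityInfo fixed-point types) of how threadThroughTwoBasicBlocks seeds the frequency of NewBB: the copy only receives the flow that used to travel along the PredPredBB to PredBB edge, i.e. Freq(PredPredBB) scaled by that edge's branch probability.

#include <cstdio>

int main() {
  double FreqPredPredBB = 1000.0; // profiled executions of PredPredBB
  double ProbToPredBB = 0.25;     // P(PredPredBB branches into PredBB)

  // NewBB replaces PredBB only on this one incoming edge, so it inherits just
  // that edge's share of the flow.
  double NewBBFreq = FreqPredPredBB * ProbToPredBB;
  std::printf("NewBB frequency: %.0f\n", NewBBFreq); // 250
}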
/// tryThreadEdge - Thread an edge if it's safe and profitable to do so.
bool JumpThreadingPass::tryThreadEdge(
- BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
- BasicBlock *SuccBB) {
- // If threading to the same block as we come from, we would infinite loop.
- if (SuccBB == BB) {
- LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
- << "' - would thread to self!\n");
- return false;
- }
-
- // If threading this would thread across a loop header, don't thread the edge.
+ BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
+ BasicBlock *SuccBB) {
+ // If threading to the same block as we come from, we would infinite loop.
+ if (SuccBB == BB) {
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
- LLVM_DEBUG({
- bool BBIsHeader = LoopHeaders.count(BB);
- bool SuccIsHeader = LoopHeaders.count(SuccBB);
- dbgs() << " Not threading across "
- << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
- << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
- << SuccBB->getName() << "' - it might create an irreducible loop!\n";
- });
- return false;
- }
-
- unsigned JumpThreadCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
- if (JumpThreadCost > BBDupThreshold) {
- LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
- << "' - Cost is too high: " << JumpThreadCost << "\n");
- return false;
- }
-
+ if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
+ LLVM_DEBUG({
+ bool BBIsHeader = LoopHeaders.count(BB);
+ bool SuccIsHeader = LoopHeaders.count(SuccBB);
+ dbgs() << " Not threading across "
+ << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
+ << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
+ << SuccBB->getName() << "' - it might create an irreducible loop!\n";
+ });
+ return false;
+ }
+
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (JumpThreadCost > BBDupThreshold) {
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
+ return false;
+ }
+
threadEdge(BB, PredBBs, SuccBB);
- return true;
-}
-
+ return true;
+}
+
/// threadEdge - We have decided that it is safe and profitable to factor the
-/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
-/// across BB. Transform the IR to reflect this change.
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB. Transform the IR to reflect this change.
void JumpThreadingPass::threadEdge(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock *> &PredBBs,
- BasicBlock *SuccBB) {
- assert(SuccBB != BB && "Don't create an infinite loop");
-
- assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) &&
- "Don't thread across loop headers");
-
- // And finally, do it! Start by factoring the predecessors if needed.
- BasicBlock *PredBB;
- if (PredBBs.size() == 1)
- PredBB = PredBBs[0];
- else {
- LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ const SmallVectorImpl<BasicBlock *> &PredBBs,
+ BasicBlock *SuccBB) {
+ assert(SuccBB != BB && "Don't create an infinite loop");
+
+ assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) &&
+ "Don't thread across loop headers");
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm");
- }
-
- // And finally, do it!
- LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
- << "' to '" << SuccBB->getName()
- << ", across block:\n " << *BB << "\n");
-
- LVI->threadEdge(PredBB, BB, SuccBB);
-
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
- BB->getName()+".thread",
- BB->getParent(), BB);
- NewBB->moveAfter(PredBB);
-
- // Set the block frequency of NewBB.
- if (HasProfileData) {
- auto NewBBFreq =
- BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
- BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
- }
-
- // Copy all the instructions from BB to NewBB except the terminator.
- DenseMap<Instruction *, Value *> ValueMapping =
+ }
+
+ // And finally, do it!
+ LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
+ << "' to '" << SuccBB->getName()
+ << ", across block:\n " << *BB << "\n");
+
+ LVI->threadEdge(PredBB, BB, SuccBB);
+
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+ BB->getName()+".thread",
+ BB->getParent(), BB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq =
+ BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ // Copy all the instructions from BB to NewBB except the terminator.
+ DenseMap<Instruction *, Value *> ValueMapping =
cloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
-
- // We didn't copy the terminator from BB over to NewBB, because there is now
- // an unconditional jump to SuccBB. Insert the unconditional jump.
- BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
- NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
-
- // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
- // PHI nodes for NewBB now.
+
+ // We didn't copy the terminator from BB over to NewBB, because there is now
+ // an unconditional jump to SuccBB. Insert the unconditional jump.
+ BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
+ NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+ // PHI nodes for NewBB now.
addPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
-
- // Update the terminator of PredBB to jump to NewBB instead of BB. This
- // eliminates predecessors from BB, which requires us to simplify any PHI
- // nodes in BB.
- Instruction *PredTerm = PredBB->getTerminator();
- for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
- if (PredTerm->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB, true);
- PredTerm->setSuccessor(i, NewBB);
- }
-
- // Enqueue required DT updates.
- DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB},
- {DominatorTree::Insert, PredBB, NewBB},
- {DominatorTree::Delete, PredBB, BB}});
-
+
+ // Update the terminator of PredBB to jump to NewBB instead of BB. This
+ // eliminates predecessors from BB, which requires us to simplify any PHI
+ // nodes in BB.
+ Instruction *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // Enqueue required DT updates.
+ DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB},
+ {DominatorTree::Insert, PredBB, NewBB},
+ {DominatorTree::Delete, PredBB, BB}});
+
updateSSA(BB, NewBB, ValueMapping);
-
- // At this point, the IR is fully up to date and consistent. Do a quick scan
- // over the new instructions and zap any that are constants or dead. This
- // frequently happens because of phi translation.
- SimplifyInstructionsInBlock(NewBB, TLI);
-
- // Update the edge weight from BB to SuccBB, which should be less than before.
+
+ // At this point, the IR is fully up to date and consistent. Do a quick scan
+ // over the new instructions and zap any that are constants or dead. This
+ // frequently happens because of phi translation.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+
+ // Update the edge weight from BB to SuccBB, which should be less than before.
updateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
-
- // Threaded an edge!
- ++NumThreads;
-}
-
-/// Create a new basic block that will be the predecessor of BB and successor of
-/// all blocks in Preds. When profile data is available, update the frequency of
-/// this new block.
+
+ // Threaded an edge!
+ ++NumThreads;
+}
+
+/// Create a new basic block that will be the predecessor of BB and successor of
+/// all blocks in Preds. When profile data is available, update the frequency of
+/// this new block.
BasicBlock *JumpThreadingPass::splitBlockPreds(BasicBlock *BB,
- ArrayRef<BasicBlock *> Preds,
- const char *Suffix) {
- SmallVector<BasicBlock *, 2> NewBBs;
-
- // Collect the frequencies of all predecessors of BB, which will be used to
- // update the edge weight of the result of splitting predecessors.
- DenseMap<BasicBlock *, BlockFrequency> FreqMap;
- if (HasProfileData)
- for (auto Pred : Preds)
- FreqMap.insert(std::make_pair(
- Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
-
- // In the case when BB is a LandingPad block we create 2 new predecessors
- // instead of just one.
- if (BB->isLandingPad()) {
- std::string NewName = std::string(Suffix) + ".split-lp";
- SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
- } else {
- NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
- }
-
- std::vector<DominatorTree::UpdateType> Updates;
- Updates.reserve((2 * Preds.size()) + NewBBs.size());
- for (auto NewBB : NewBBs) {
- BlockFrequency NewBBFreq(0);
- Updates.push_back({DominatorTree::Insert, NewBB, BB});
- for (auto Pred : predecessors(NewBB)) {
- Updates.push_back({DominatorTree::Delete, Pred, BB});
- Updates.push_back({DominatorTree::Insert, Pred, NewBB});
- if (HasProfileData) // Update frequencies between Pred -> NewBB.
- NewBBFreq += FreqMap.lookup(Pred);
- }
- if (HasProfileData) // Apply the summed frequency to NewBB.
- BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
- }
-
- DTU->applyUpdatesPermissive(Updates);
- return NewBBs[0];
-}
-
-bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
- const Instruction *TI = BB->getTerminator();
- assert(TI->getNumSuccessors() > 1 && "not a split");
-
- MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
- if (!WeightsNode)
- return false;
-
- MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
- if (MDName->getString() != "branch_weights")
- return false;
-
- // Ensure there are weights for all of the successors. Note that the first
- // operand to the metadata node is a name, not a weight.
- return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
-}
-
-/// Update the block frequency of BB and branch weight and the metadata on the
-/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
-/// Freq(PredBB->BB) / Freq(BB->SuccBB).
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix) {
+ SmallVector<BasicBlock *, 2> NewBBs;
+
+ // Collect the frequencies of all predecessors of BB, which will be used to
+ // update the edge weight of the result of splitting predecessors.
+ DenseMap<BasicBlock *, BlockFrequency> FreqMap;
+ if (HasProfileData)
+ for (auto Pred : Preds)
+ FreqMap.insert(std::make_pair(
+ Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
+
+ // In the case when BB is a LandingPad block we create 2 new predecessors
+ // instead of just one.
+ if (BB->isLandingPad()) {
+ std::string NewName = std::string(Suffix) + ".split-lp";
+ SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
+ } else {
+ NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
+ }
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * Preds.size()) + NewBBs.size());
+ for (auto NewBB : NewBBs) {
+ BlockFrequency NewBBFreq(0);
+ Updates.push_back({DominatorTree::Insert, NewBB, BB});
+ for (auto Pred : predecessors(NewBB)) {
+ Updates.push_back({DominatorTree::Delete, Pred, BB});
+ Updates.push_back({DominatorTree::Insert, Pred, NewBB});
+ if (HasProfileData) // Update frequencies between Pred -> NewBB.
+ NewBBFreq += FreqMap.lookup(Pred);
+ }
+ if (HasProfileData) // Apply the summed frequency to NewBB.
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ DTU->applyUpdatesPermissive(Updates);
+ return NewBBs[0];
+}
+
+bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
+ const Instruction *TI = BB->getTerminator();
+ assert(TI->getNumSuccessors() > 1 && "not a split");
+
+ MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
+ if (!WeightsNode)
+ return false;
+
+ MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
+ if (MDName->getString() != "branch_weights")
+ return false;
+
+ // Ensure there are weights for all of the successors. Note that the first
+ // operand to the metadata node is a name, not a weight.
+ return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
+}
+
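+// [Editor's note: illustrative sketch, not part of the original source.]
+// doesBlockHaveProfileData() above checks the shape of the !prof metadata on
+// the terminator. For a two-successor conditional branch, the metadata it
+// accepts looks like this in textual IR (the weights are arbitrary example
+// values):
+//
+//   br i1 %cond, label %then, label %else, !prof !0
+//   ...
+//   !0 = !{!"branch_weights", i32 20, i32 80}
+//
+// The node carries one string operand ("branch_weights") followed by one
+// weight per successor, which is why the function requires
+// getNumOperands() == getNumSuccessors() + 1.
+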
+/// Update the block frequency of BB and branch weight and the metadata on the
+/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
+/// Freq(PredBB->BB) / Freq(BB->SuccBB).
void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
- BasicBlock *BB,
- BasicBlock *NewBB,
- BasicBlock *SuccBB) {
- if (!HasProfileData)
- return;
-
- assert(BFI && BPI && "BFI & BPI should have been created here");
-
- // As the edge from PredBB to BB is deleted, we have to update the block
- // frequency of BB.
- auto BBOrigFreq = BFI->getBlockFreq(BB);
- auto NewBBFreq = BFI->getBlockFreq(NewBB);
- auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
- auto BBNewFreq = BBOrigFreq - NewBBFreq;
- BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
-
- // Collect updated outgoing edges' frequencies from BB and use them to update
- // edge probabilities.
- SmallVector<uint64_t, 4> BBSuccFreq;
- for (BasicBlock *Succ : successors(BB)) {
- auto SuccFreq = (Succ == SuccBB)
- ? BB2SuccBBFreq - NewBBFreq
- : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
- BBSuccFreq.push_back(SuccFreq.getFrequency());
- }
-
- uint64_t MaxBBSuccFreq =
- *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
-
- SmallVector<BranchProbability, 4> BBSuccProbs;
- if (MaxBBSuccFreq == 0)
- BBSuccProbs.assign(BBSuccFreq.size(),
- {1, static_cast<unsigned>(BBSuccFreq.size())});
- else {
- for (uint64_t Freq : BBSuccFreq)
- BBSuccProbs.push_back(
- BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
- // Normalize edge probabilities so that they sum up to one.
- BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
- BBSuccProbs.end());
- }
-
- // Update edge probabilities in BPI.
- BPI->setEdgeProbability(BB, BBSuccProbs);
-
- // Update the profile metadata as well.
- //
- // Don't do this if the profile of the transformed blocks was statically
- // estimated. (This could occur despite the function having an entry
- // frequency in completely cold parts of the CFG.)
- //
- // In this case we don't want to suggest to subsequent passes that the
- // calculated weights are fully consistent. Consider this graph:
- //
- // check_1
- // 50% / |
- // eq_1 | 50%
- // \ |
- // check_2
- // 50% / |
- // eq_2 | 50%
- // \ |
- // check_3
- // 50% / |
- // eq_3 | 50%
- // \ |
- //
- // Assuming the blocks check_* all compare the same value against 1, 2 and 3,
- // the overall probabilities are inconsistent; the total probability that the
- // value is either 1, 2 or 3 is 150%.
- //
- // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
- // becomes 0%. This is even worse if the edge whose probability becomes 0% is
- // the loop exit edge. Then based solely on static estimation we would assume
- // the loop was extremely hot.
- //
- // FIXME this locally as well so that BPI and BFI are consistent as well. We
- // shouldn't make edges extremely likely or unlikely based solely on static
- // estimation.
- if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
- SmallVector<uint32_t, 4> Weights;
- for (auto Prob : BBSuccProbs)
- Weights.push_back(Prob.getNumerator());
-
- auto TI = BB->getTerminator();
- TI->setMetadata(
- LLVMContext::MD_prof,
- MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
- }
-}
-
+ BasicBlock *BB,
+ BasicBlock *NewBB,
+ BasicBlock *SuccBB) {
+ if (!HasProfileData)
+ return;
+
+ assert(BFI && BPI && "BFI & BPI should have been created here");
+
+ // As the edge from PredBB to BB is deleted, we have to update the block
+ // frequency of BB.
+ auto BBOrigFreq = BFI->getBlockFreq(BB);
+ auto NewBBFreq = BFI->getBlockFreq(NewBB);
+ auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
+ auto BBNewFreq = BBOrigFreq - NewBBFreq;
+ BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+
+ // Collect updated outgoing edges' frequencies from BB and use them to update
+ // edge probabilities.
+ SmallVector<uint64_t, 4> BBSuccFreq;
+ for (BasicBlock *Succ : successors(BB)) {
+ auto SuccFreq = (Succ == SuccBB)
+ ? BB2SuccBBFreq - NewBBFreq
+ : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
+ BBSuccFreq.push_back(SuccFreq.getFrequency());
+ }
+
+ uint64_t MaxBBSuccFreq =
+ *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
+
+ SmallVector<BranchProbability, 4> BBSuccProbs;
+ if (MaxBBSuccFreq == 0)
+ BBSuccProbs.assign(BBSuccFreq.size(),
+ {1, static_cast<unsigned>(BBSuccFreq.size())});
+ else {
+ for (uint64_t Freq : BBSuccFreq)
+ BBSuccProbs.push_back(
+ BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
+ // Normalize edge probabilities so that they sum up to one.
+ BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
+ BBSuccProbs.end());
+ }
+
+ // Update edge probabilities in BPI.
+ BPI->setEdgeProbability(BB, BBSuccProbs);
+
+ // Update the profile metadata as well.
+ //
+ // Don't do this if the profile of the transformed blocks was statically
+ // estimated. (This could occur despite the function having an entry
+ // frequency in completely cold parts of the CFG.)
+ //
+ // In this case we don't want to suggest to subsequent passes that the
+ // calculated weights are fully consistent. Consider this graph:
+ //
+ // check_1
+ // 50% / |
+ // eq_1 | 50%
+ // \ |
+ // check_2
+ // 50% / |
+ // eq_2 | 50%
+ // \ |
+ // check_3
+ // 50% / |
+ // eq_3 | 50%
+ // \ |
+ //
+ // Assuming the blocks check_* all compare the same value against 1, 2 and 3,
+ // the overall probabilities are inconsistent; the total probability that the
+ // value is either 1, 2 or 3 is 150%.
+ //
+ // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
+ // becomes 0%. This is even worse if the edge whose probability becomes 0% is
+ // the loop exit edge. Then based solely on static estimation we would assume
+ // the loop was extremely hot.
+ //
+  // FIXME: Handle this case locally as well so that BPI and BFI stay
+  // consistent. We shouldn't make edges extremely likely or unlikely based
+  // solely on static estimation.
+ if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
+ SmallVector<uint32_t, 4> Weights;
+ for (auto Prob : BBSuccProbs)
+ Weights.push_back(Prob.getNumerator());
+
+ auto TI = BB->getTerminator();
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
+ }
+}
+
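+// [Editor's note: a worked example with made-up numbers, not part of the
+// original source.] Suppose BB originally has frequency 100, both of its
+// outgoing edges have probability 50%, and threading gave NewBB a frequency
+// of 40. The code above then computes:
+//   BBNewFreq         = 100 - 40       = 60  (remaining frequency of BB)
+//   freq(BB->SuccBB)  = 100 * 0.5 - 40 = 10  (the part not bypassed via NewBB)
+//   freq(BB->OtherBB) = 100 * 0.5      = 50
+// and the two outgoing frequencies are renormalized into probabilities of
+// roughly 1/6 and 5/6 for the edges out of BB.
+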
/// duplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
-/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
-/// If we can duplicate the contents of BB up into PredBB do so now, this
-/// improves the odds that the branch will be on an analyzable instruction like
-/// a compare.
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB do so now, this
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare.
bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
- BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
- assert(!PredBBs.empty() && "Can't handle an empty set");
-
- // If BB is a loop header, then duplicating this block outside the loop would
- // cause us to transform this into an irreducible loop, don't do this.
+ BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
+ assert(!PredBBs.empty() && "Can't handle an empty set");
+
+ // If BB is a loop header, then duplicating this block outside the loop would
+ // cause us to transform this into an irreducible loop, don't do this.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB)) {
- LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
- << "' into predecessor block '" << PredBBs[0]->getName()
- << "' - it might create an irreducible loop!\n");
- return false;
- }
-
- unsigned DuplicationCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
- if (DuplicationCost > BBDupThreshold) {
- LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
- << "' - Cost is too high: " << DuplicationCost << "\n");
- return false;
- }
-
- // And finally, do it! Start by factoring the predecessors if needed.
- std::vector<DominatorTree::UpdateType> Updates;
- BasicBlock *PredBB;
- if (PredBBs.size() == 1)
- PredBB = PredBBs[0];
- else {
- LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ if (LoopHeaders.count(BB)) {
+ LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
+ return false;
+ }
+
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (DuplicationCost > BBDupThreshold) {
+ LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ std::vector<DominatorTree::UpdateType> Updates;
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm");
- }
- Updates.push_back({DominatorTree::Delete, PredBB, BB});
-
- // Okay, we decided to do this! Clone all the instructions in BB onto the end
- // of PredBB.
- LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
- << "' into end of '" << PredBB->getName()
- << "' to eliminate branch on phi. Cost: "
- << DuplicationCost << " block is:" << *BB << "\n");
-
- // Unless PredBB ends with an unconditional branch, split the edge so that we
- // can just clone the bits from BB into the end of the new PredBB.
- BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
-
- if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
- BasicBlock *OldPredBB = PredBB;
- PredBB = SplitEdge(OldPredBB, BB);
- Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
- Updates.push_back({DominatorTree::Insert, PredBB, BB});
- Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
- OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
- }
-
- // We are going to have to map operands from the original BB block into the
- // PredBB block. Evaluate PHI nodes in BB.
- DenseMap<Instruction*, Value*> ValueMapping;
-
- BasicBlock::iterator BI = BB->begin();
- for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
- // Clone the non-phi instructions of BB into PredBB, keeping track of the
- // mapping and using it to remap operands in the cloned instructions.
- for (; BI != BB->end(); ++BI) {
- Instruction *New = BI->clone();
-
- // Remap operands to patch up intra-block references.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
- if (I != ValueMapping.end())
- New->setOperand(i, I->second);
- }
-
- // If this instruction can be simplified after the operands are updated,
- // just use the simplified value instead. This frequently happens due to
- // phi translation.
- if (Value *IV = SimplifyInstruction(
- New,
- {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
- ValueMapping[&*BI] = IV;
- if (!New->mayHaveSideEffects()) {
- New->deleteValue();
- New = nullptr;
- }
- } else {
- ValueMapping[&*BI] = New;
- }
- if (New) {
- // Otherwise, insert the new instruction into the block.
- New->setName(BI->getName());
- PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
- // Update Dominance from simplified New instruction operands.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
- Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
- }
- }
-
- // Check to see if the targets of the branch had PHI nodes. If so, we need to
- // add entries to the PHI nodes for branch from PredBB now.
- BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+ }
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
+
+ // Okay, we decided to do this! Clone all the instructions in BB onto the end
+ // of PredBB.
+ LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
+ << "' into end of '" << PredBB->getName()
+ << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
+
+ // Unless PredBB ends with an unconditional branch, split the edge so that we
+ // can just clone the bits from BB into the end of the new PredBB.
+ BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+ if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
+ BasicBlock *OldPredBB = PredBB;
+ PredBB = SplitEdge(OldPredBB, BB);
+ Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
+ Updates.push_back({DominatorTree::Insert, PredBB, BB});
+ Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
+ OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+ }
+
+ // We are going to have to map operands from the original BB block into the
+ // PredBB block. Evaluate PHI nodes in BB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+ // Clone the non-phi instructions of BB into PredBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; BI != BB->end(); ++BI) {
+ Instruction *New = BI->clone();
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+
+ // If this instruction can be simplified after the operands are updated,
+ // just use the simplified value instead. This frequently happens due to
+ // phi translation.
+ if (Value *IV = SimplifyInstruction(
+ New,
+ {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
+ ValueMapping[&*BI] = IV;
+ if (!New->mayHaveSideEffects()) {
+ New->deleteValue();
+ New = nullptr;
+ }
+ } else {
+ ValueMapping[&*BI] = New;
+ }
+ if (New) {
+ // Otherwise, insert the new instruction into the block.
+ New->setName(BI->getName());
+ PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ // Update Dominance from simplified New instruction operands.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
+ Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
+ }
+ }
+
+ // Check to see if the targets of the branch had PHI nodes. If so, we need to
+ // add entries to the PHI nodes for branch from PredBB now.
+ BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
- ValueMapping);
+ ValueMapping);
addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
- ValueMapping);
-
+ ValueMapping);
+
updateSSA(BB, PredBB, ValueMapping);
-
- // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
- // that we nuked.
- BB->removePredecessor(PredBB, true);
-
- // Remove the unconditional branch at the end of the PredBB block.
- OldPredBranch->eraseFromParent();
+
+ // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
+ // that we nuked.
+ BB->removePredecessor(PredBB, true);
+
+ // Remove the unconditional branch at the end of the PredBB block.
+ OldPredBranch->eraseFromParent();
if (HasProfileData)
BPI->copyEdgeProbabilities(BB, PredBB);
- DTU->applyUpdatesPermissive(Updates);
-
- ++NumDupes;
- return true;
-}
-
-// Pred is a predecessor of BB with an unconditional branch to BB. SI is
-// a Select instruction in Pred. BB has other predecessors and SI is used in
-// a PHI node in BB. SI has no other use.
-// A new basic block, NewBB, is created and SI is converted to compare and
-// conditional branch. SI is erased from parent.
+ DTU->applyUpdatesPermissive(Updates);
+
+ ++NumDupes;
+ return true;
+}
+
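+// [Editor's note: illustrative IR sketch, not part of the original source;
+// block and value names are hypothetical.] duplicateCondBranchOnPHIIntoPred
+// turns a pattern like
+//
+//   pred:
+//     br label %bb
+//   bb:
+//     %p = phi i1 [ true, %pred ], [ %x, %other ]
+//     br i1 %p, label %t, label %f
+//
+// into a copy of bb's branch at the end of pred, with %p replaced by the
+// incoming value for that edge:
+//
+//   pred:
+//     br i1 true, label %t, label %f   ; can later be folded to "br label %t"
+//
+// which exposes the branch to constant folding and further threading, while
+// bb itself stays in place for its remaining predecessors.
+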
+// Pred is a predecessor of BB with an unconditional branch to BB. SI is
+// a Select instruction in Pred. BB has other predecessors and SI is used in
+// a PHI node in BB. SI has no other use.
+// A new basic block, NewBB, is created and SI is converted to compare and
+// conditional branch. SI is erased from parent.
void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
- SelectInst *SI, PHINode *SIUse,
- unsigned Idx) {
- // Expand the select.
- //
- // Pred --
- // | v
- // | NewBB
- // | |
- // |-----
- // v
- // BB
- BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
- BB->getParent(), BB);
- // Move the unconditional branch to NewBB.
- PredTerm->removeFromParent();
- NewBB->getInstList().insert(NewBB->end(), PredTerm);
- // Create a conditional branch and update PHI nodes.
- BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
- SIUse->setIncomingValue(Idx, SI->getFalseValue());
- SIUse->addIncoming(SI->getTrueValue(), NewBB);
-
- // The select is now dead.
- SI->eraseFromParent();
- DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, BB},
- {DominatorTree::Insert, Pred, NewBB}});
-
- // Update any other PHI nodes in BB.
- for (BasicBlock::iterator BI = BB->begin();
- PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
- if (Phi != SIUse)
- Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
-}
-
+ SelectInst *SI, PHINode *SIUse,
+ unsigned Idx) {
+ // Expand the select.
+ //
+ // Pred --
+ // | v
+ // | NewBB
+ // | |
+ // |-----
+ // v
+ // BB
+ BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
+ BB->getParent(), BB);
+ // Move the unconditional branch to NewBB.
+ PredTerm->removeFromParent();
+ NewBB->getInstList().insert(NewBB->end(), PredTerm);
+ // Create a conditional branch and update PHI nodes.
+ BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
+ SIUse->setIncomingValue(Idx, SI->getFalseValue());
+ SIUse->addIncoming(SI->getTrueValue(), NewBB);
+
+ // The select is now dead.
+ SI->eraseFromParent();
+ DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, BB},
+ {DominatorTree::Insert, Pred, NewBB}});
+
+ // Update any other PHI nodes in BB.
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
+ if (Phi != SIUse)
+ Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
+}
+
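+// [Editor's note: illustrative IR sketch, not part of the original source;
+// value names are hypothetical.] unfoldSelectInstr() above rewrites
+//
+//   pred:
+//     %s = select i1 %c, i32 %a, i32 %b
+//     br label %bb
+//   bb:
+//     %p = phi i32 [ %s, %pred ], ...
+//
+// into
+//
+//   pred:
+//     br i1 %c, label %select.unfold, label %bb
+//   select.unfold:                       ; the NewBB created above
+//     br label %bb
+//   bb:
+//     %p = phi i32 [ %b, %pred ], [ %a, %select.unfold ], ...
+//
+// so the value that used to flow through the select now arrives along two
+// distinct, threadable edges.
+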
bool JumpThreadingPass::tryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
- PHINode *CondPHI = dyn_cast<PHINode>(SI->getCondition());
-
- if (!CondPHI || CondPHI->getParent() != BB)
- return false;
-
- for (unsigned I = 0, E = CondPHI->getNumIncomingValues(); I != E; ++I) {
- BasicBlock *Pred = CondPHI->getIncomingBlock(I);
- SelectInst *PredSI = dyn_cast<SelectInst>(CondPHI->getIncomingValue(I));
-
- // The second and third condition can be potentially relaxed. Currently
- // the conditions help to simplify the code and allow us to reuse existing
+ PHINode *CondPHI = dyn_cast<PHINode>(SI->getCondition());
+
+ if (!CondPHI || CondPHI->getParent() != BB)
+ return false;
+
+ for (unsigned I = 0, E = CondPHI->getNumIncomingValues(); I != E; ++I) {
+ BasicBlock *Pred = CondPHI->getIncomingBlock(I);
+ SelectInst *PredSI = dyn_cast<SelectInst>(CondPHI->getIncomingValue(I));
+
+ // The second and third condition can be potentially relaxed. Currently
+ // the conditions help to simplify the code and allow us to reuse existing
// code, developed for tryToUnfoldSelect(CmpInst *, BasicBlock *)
- if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
- continue;
-
- BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
- if (!PredTerm || !PredTerm->isUnconditional())
- continue;
-
+ if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
+ continue;
+
+ BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (!PredTerm || !PredTerm->isUnconditional())
+ continue;
+
unfoldSelectInstr(Pred, BB, PredSI, CondPHI, I);
- return true;
- }
- return false;
-}
-
+ return true;
+ }
+ return false;
+}
+
/// tryToUnfoldSelect - Look for blocks of the form
-/// bb1:
-/// %a = select
-/// br bb2
-///
-/// bb2:
-/// %p = phi [%a, %bb1] ...
-/// %c = icmp %p
-/// br i1 %c
-///
-/// And expand the select into a branch structure if one of its arms allows %c
-/// to be folded. This later enables threading from bb1 over bb2.
+/// bb1:
+/// %a = select
+/// br bb2
+///
+/// bb2:
+/// %p = phi [%a, %bb1] ...
+/// %c = icmp %p
+/// br i1 %c
+///
+/// And expand the select into a branch structure if one of its arms allows %c
+/// to be folded. This later enables threading from bb1 over bb2.
bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
- Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
-
- if (!CondBr || !CondBr->isConditional() || !CondLHS ||
- CondLHS->getParent() != BB)
- return false;
-
- for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
- BasicBlock *Pred = CondLHS->getIncomingBlock(I);
- SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
-
- // Look if one of the incoming values is a select in the corresponding
- // predecessor.
- if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
- continue;
-
- BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
- if (!PredTerm || !PredTerm->isUnconditional())
- continue;
-
- // Now check if one of the select values would allow us to constant fold the
- // terminator in BB. We don't do the transform if both sides fold, those
- // cases will be threaded in any case.
- LazyValueInfo::Tristate LHSFolds =
- LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
- CondRHS, Pred, BB, CondCmp);
- LazyValueInfo::Tristate RHSFolds =
- LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
- CondRHS, Pred, BB, CondCmp);
- if ((LHSFolds != LazyValueInfo::Unknown ||
- RHSFolds != LazyValueInfo::Unknown) &&
- LHSFolds != RHSFolds) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
+ Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
+
+ if (!CondBr || !CondBr->isConditional() || !CondLHS ||
+ CondLHS->getParent() != BB)
+ return false;
+
+ for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
+ BasicBlock *Pred = CondLHS->getIncomingBlock(I);
+ SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
+
+ // Look if one of the incoming values is a select in the corresponding
+ // predecessor.
+ if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
+ continue;
+
+ BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (!PredTerm || !PredTerm->isUnconditional())
+ continue;
+
+ // Now check if one of the select values would allow us to constant fold the
+ // terminator in BB. We don't do the transform if both sides fold, those
+ // cases will be threaded in any case.
+ LazyValueInfo::Tristate LHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
+ CondRHS, Pred, BB, CondCmp);
+ LazyValueInfo::Tristate RHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
+ CondRHS, Pred, BB, CondCmp);
+ if ((LHSFolds != LazyValueInfo::Unknown ||
+ RHSFolds != LazyValueInfo::Unknown) &&
+ LHSFolds != RHSFolds) {
unfoldSelectInstr(Pred, BB, SI, CondLHS, I);
- return true;
- }
- }
- return false;
-}
-
+ return true;
+ }
+ }
+ return false;
+}
+
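+// [Editor's note: illustrative example, not part of the original source;
+// names are hypothetical.] For the pattern documented above, suppose
+//   bb1:  %a = select i1 %flag, i32 0, i32 %x
+// and bb2 compares %p (which equals %a on the edge from bb1) against 0:
+//   %c = icmp eq i32 %p, 0
+// LVI can prove the compare is true when the select takes its first arm
+// (operand 1, the constant 0) but knows nothing about %x, so LHSFolds is a
+// known value while RHSFolds is Unknown. The select is therefore unfolded,
+// and the branch on %c can then be threaded from the new block.
+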
/// tryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
-/// same BB in the form
-/// bb:
-/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
-/// %s = select %p, trueval, falseval
-///
-/// or
-///
-/// bb:
-/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
-/// %c = cmp %p, 0
-/// %s = select %c, trueval, falseval
-///
-/// And expand the select into a branch structure. This later enables
-/// jump-threading over bb in this pass.
-///
-/// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold
-/// select if the associated PHI has at least one constant. If the unfolded
-/// select is not jump-threaded, it will be folded again in the later
-/// optimizations.
+/// same BB in the form
+/// bb:
+/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
+/// %s = select %p, trueval, falseval
+///
+/// or
+///
+/// bb:
+/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
+/// %c = cmp %p, 0
+/// %s = select %c, trueval, falseval
+///
+/// And expand the select into a branch structure. This later enables
+/// jump-threading over bb in this pass.
+///
+/// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold
+/// select if the associated PHI has at least one constant. If the unfolded
+/// select is not jump-threaded, it will be folded again in the later
+/// optimizations.
bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) {
// This transform would reduce the quality of msan diagnostics.
- // Disable this transform under MemorySanitizer.
- if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- // If threading this would thread across a loop header, don't thread the edge.
+ // Disable this transform under MemorySanitizer.
+ if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ // If threading this would thread across a loop header, don't thread the edge.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB))
- return false;
-
- for (BasicBlock::iterator BI = BB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
- // Look for a Phi having at least one constant incoming value.
- if (llvm::all_of(PN->incoming_values(),
- [](Value *V) { return !isa<ConstantInt>(V); }))
- continue;
-
- auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
- // Check if SI is in BB and use V as condition.
- if (SI->getParent() != BB)
- return false;
- Value *Cond = SI->getCondition();
- return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
- };
-
- SelectInst *SI = nullptr;
- for (Use &U : PN->uses()) {
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
- // Look for a ICmp in BB that compares PN with a constant and is the
- // condition of a Select.
- if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
- isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
- if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
- if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
- SI = SelectI;
- break;
- }
- } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
- // Look for a Select in BB that uses PN as condition.
- if (isUnfoldCandidate(SelectI, U.get())) {
- SI = SelectI;
- break;
- }
- }
- }
-
- if (!SI)
- continue;
- // Expand the select.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
+ // Look for a Phi having at least one constant incoming value.
+ if (llvm::all_of(PN->incoming_values(),
+ [](Value *V) { return !isa<ConstantInt>(V); }))
+ continue;
+
+ auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
+      // Check that SI is in BB and uses V as its condition.
+ if (SI->getParent() != BB)
+ return false;
+ Value *Cond = SI->getCondition();
+ return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
+ };
+
+ SelectInst *SI = nullptr;
+ for (Use &U : PN->uses()) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+        // Look for an ICmp in BB that compares PN with a constant and is the
+ // condition of a Select.
+ if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
+ isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
+ if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
+ if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
+ SI = SelectI;
+ break;
+ }
+ } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
+ // Look for a Select in BB that uses PN as condition.
+ if (isUnfoldCandidate(SelectI, U.get())) {
+ SI = SelectI;
+ break;
+ }
+ }
+ }
+
+ if (!SI)
+ continue;
+ // Expand the select.
Value *Cond = SI->getCondition();
if (InsertFreezeWhenUnfoldingSelect &&
!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI,
&DTU->getDomTree()))
Cond = new FreezeInst(Cond, "cond.fr", SI);
Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false);
- BasicBlock *SplitBB = SI->getParent();
- BasicBlock *NewBB = Term->getParent();
- PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
- NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
- NewPN->addIncoming(SI->getFalseValue(), BB);
- SI->replaceAllUsesWith(NewPN);
- SI->eraseFromParent();
- // NewBB and SplitBB are newly created blocks which require insertion.
- std::vector<DominatorTree::UpdateType> Updates;
- Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
- Updates.push_back({DominatorTree::Insert, BB, SplitBB});
- Updates.push_back({DominatorTree::Insert, BB, NewBB});
- Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
- // BB's successors were moved to SplitBB, update DTU accordingly.
- for (auto *Succ : successors(SplitBB)) {
- Updates.push_back({DominatorTree::Delete, BB, Succ});
- Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
- }
- DTU->applyUpdatesPermissive(Updates);
- return true;
- }
- return false;
-}
-
-/// Try to propagate a guard from the current BB into one of its predecessors
-/// in case if another branch of execution implies that the condition of this
-/// guard is always true. Currently we only process the simplest case that
-/// looks like:
-///
-/// Start:
-/// %cond = ...
-/// br i1 %cond, label %T1, label %F1
-/// T1:
-/// br label %Merge
-/// F1:
-/// br label %Merge
-/// Merge:
-/// %condGuard = ...
-/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
-///
-/// And cond either implies condGuard or !condGuard. In this case all the
-/// instructions before the guard can be duplicated in both branches, and the
-/// guard is then threaded to one of them.
+ BasicBlock *SplitBB = SI->getParent();
+ BasicBlock *NewBB = Term->getParent();
+ PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+ NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+ NewPN->addIncoming(SI->getFalseValue(), BB);
+ SI->replaceAllUsesWith(NewPN);
+ SI->eraseFromParent();
+ // NewBB and SplitBB are newly created blocks which require insertion.
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
+ Updates.push_back({DominatorTree::Insert, BB, SplitBB});
+ Updates.push_back({DominatorTree::Insert, BB, NewBB});
+ Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
+ // BB's successors were moved to SplitBB, update DTU accordingly.
+ for (auto *Succ : successors(SplitBB)) {
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
+ }
+ DTU->applyUpdatesPermissive(Updates);
+ return true;
+ }
+ return false;
+}
+
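+// [Editor's note: illustrative IR sketch of the transform above, not part of
+// the original source; block and value names are hypothetical.] For
+//
+//   bb:
+//     %p = phi i1 [ false, %bb1 ], [ true, %bb2 ]
+//     %s = select i1 %p, i32 %tv, i32 %fv
+//
+// SplitBlockAndInsertIfThen() yields roughly
+//
+//   bb:
+//     %p = phi i1 [ false, %bb1 ], [ true, %bb2 ]
+//     br i1 %p, label %then, label %tail     ; %p may be frozen first
+//   then:
+//     br label %tail
+//   tail:
+//     %s = phi i32 [ %tv, %then ], [ %fv, %bb ]
+//
+// after which the branch on %p in bb can be jump-threaded from %bb1/%bb2.
+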
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
bool JumpThreadingPass::processGuards(BasicBlock *BB) {
- using namespace PatternMatch;
-
- // We only want to deal with two predecessors.
- BasicBlock *Pred1, *Pred2;
- auto PI = pred_begin(BB), PE = pred_end(BB);
- if (PI == PE)
- return false;
- Pred1 = *PI++;
- if (PI == PE)
- return false;
- Pred2 = *PI++;
- if (PI != PE)
- return false;
- if (Pred1 == Pred2)
- return false;
-
- // Try to thread one of the guards of the block.
- // TODO: Look up deeper than to immediate predecessor?
- auto *Parent = Pred1->getSinglePredecessor();
- if (!Parent || Parent != Pred2->getSinglePredecessor())
- return false;
-
- if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
- for (auto &I : *BB)
+ using namespace PatternMatch;
+
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
+
+ // Try to thread one of the guards of the block.
+ // TODO: Look up deeper than to immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
if (isGuard(&I) && threadGuard(BB, cast<IntrinsicInst>(&I), BI))
- return true;
-
- return false;
-}
-
-/// Try to propagate the guard from BB which is the lower block of a diamond
-/// to one of its branches, in case if diamond's condition implies guard's
-/// condition.
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB which is the lower block of a diamond
+/// to one of its branches, in case the diamond's condition implies the
+/// guard's condition.
bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard,
- BranchInst *BI) {
- assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
- assert(BI->isConditional() && "Unconditional branch has 2 successors?");
- Value *GuardCond = Guard->getArgOperand(0);
- Value *BranchCond = BI->getCondition();
- BasicBlock *TrueDest = BI->getSuccessor(0);
- BasicBlock *FalseDest = BI->getSuccessor(1);
-
- auto &DL = BB->getModule()->getDataLayout();
- bool TrueDestIsSafe = false;
- bool FalseDestIsSafe = false;
-
- // True dest is safe if BranchCond => GuardCond.
- auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
- if (Impl && *Impl)
- TrueDestIsSafe = true;
- else {
- // False dest is safe if !BranchCond => GuardCond.
- Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
- if (Impl && *Impl)
- FalseDestIsSafe = true;
- }
-
- if (!TrueDestIsSafe && !FalseDestIsSafe)
- return false;
-
- BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
- BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
-
- ValueToValueMapTy UnguardedMapping, GuardedMapping;
- Instruction *AfterGuard = Guard->getNextNode();
- unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
- if (Cost > BBDupThreshold)
- return false;
- // Duplicate all instructions before the guard and the guard itself to the
- // branch where implication is not proved.
- BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
- BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU);
- assert(GuardedBlock && "Could not create the guarded block?");
- // Duplicate all instructions before the guard in the unguarded branch.
- // Since we have successfully duplicated the guarded block and this block
- // has fewer instructions, we expect it to succeed.
- BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
- BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU);
- assert(UnguardedBlock && "Could not create the unguarded block?");
- LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
- << GuardedBlock->getName() << "\n");
- // Some instructions before the guard may still have uses. For them, we need
- // to create Phi nodes merging their copies in both guarded and unguarded
- // branches. Those instructions that have no uses can be just removed.
- SmallVector<Instruction *, 4> ToRemove;
- for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
- if (!isa<PHINode>(&*BI))
- ToRemove.push_back(&*BI);
-
- Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
- assert(InsertionPoint && "Empty block?");
- // Substitute with Phis & remove.
- for (auto *Inst : reverse(ToRemove)) {
- if (!Inst->use_empty()) {
- PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
- NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
- NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
- NewPN->insertBefore(InsertionPoint);
- Inst->replaceAllUsesWith(NewPN);
- }
- Inst->eraseFromParent();
- }
- return true;
-}
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where implication is not proved.
+ BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
+ }
+ Inst->eraseFromParent();
+ }
+ return true;
+}
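+// [Editor's note: illustrative sketch of the guard-threading result, not part
+// of the original source.] For the Start/T1/F1/Merge example documented above
+// processGuards(): if %cond implies %condGuard, threadGuard() splits the two
+// incoming edges of Merge, copies the instructions that precede the guard into
+// both new blocks, keeps the guard only on the path coming from F1 (where the
+// implication is not proved), drops it on the path from T1, and rewrites the
+// remaining uses in Merge through phi nodes that merge the two copies.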
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp
index 5276b77f8c..d2b4ba296f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp
@@ -1,17 +1,17 @@
-//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs loop invariant code motion, attempting to remove as much
-// code from the body of a loop as possible. It does this by either hoisting
-// code into the preheader block, or by sinking code to the exit blocks if it is
-// safe. This pass also promotes must-aliased memory locations in the loop to
-// live in registers, thus hoisting and sinking "invariant" loads and stores.
-//
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible. It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe. This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
// Hoisting operations out of loops is a canonicalization transform. It
// enables and simplifies subsequent optimizations in the middle-end.
// Rematerialization of hoisted instructions to reduce register pressure is the
@@ -19,223 +19,223 @@
// register pressure and also handles other optimizations than LICM that
// increase live-ranges.
//
-// This pass uses alias analysis for two purposes:
-//
-// 1. Moving loop invariant loads and calls out of loops. If we can determine
-// that a load or call inside of a loop never aliases anything stored to,
-// we can hoist it or sink it like any other instruction.
-// 2. Scalar Promotion of Memory - If there is a store instruction inside of
-// the loop, we try to move the store to happen AFTER the loop instead of
-// inside of the loop. This can only happen if a few conditions are true:
-// A. The pointer stored through is loop invariant
-// B. There are no stores or loads in the loop which _may_ alias the
-// pointer. There are no calls in the loop which mod/ref the pointer.
-// If these conditions are true, we can promote the loads and stores in the
-// loop of the pointer to use a temporary alloca'd variable. We then use
-// the SSAUpdater to construct the appropriate SSA form for the value.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LICM.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
+// This pass uses alias analysis for two purposes:
+//
+// 1. Moving loop invariant loads and calls out of loops. If we can determine
+// that a load or call inside of a loop never aliases anything stored to,
+// we can hoist it or sink it like any other instruction.
+// 2. Scalar Promotion of Memory - If there is a store instruction inside of
+// the loop, we try to move the store to happen AFTER the loop instead of
+// inside of the loop. This can only happen if a few conditions are true:
+// A. The pointer stored through is loop invariant
+// B. There are no stores or loads in the loop which _may_ alias the
+// pointer. There are no calls in the loop which mod/ref the pointer.
+// If these conditions are true, we can promote the loads and stores in the
+// loop of the pointer to use a temporary alloca'd variable. We then use
+// the SSAUpdater to construct the appropriate SSA form for the value.
+//
+//===----------------------------------------------------------------------===//
+
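+// [Editor's note: illustrative example of the scalar promotion described
+// above, not part of the original source; names are hypothetical.] Given a
+// loop-invariant pointer %p that nothing else in the loop may alias:
+//
+//   loop:
+//     %v   = load i32, i32* %p
+//     %inc = add i32 %v, 1
+//     store i32 %inc, i32* %p
+//     br i1 %cond, label %loop, label %exit
+//
+// LICM promotes the location to a register: the load becomes a value carried
+// around the loop by a phi, and a single store of the final value is placed in
+// %exit, with SSAUpdater building the required phi nodes.
+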
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/GuardUtils.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include <algorithm>
-#include <utility>
-using namespace llvm;
-
-#define DEBUG_TYPE "licm"
-
-STATISTIC(NumCreatedBlocks, "Number of blocks created");
-STATISTIC(NumClonedBranches, "Number of branches cloned");
-STATISTIC(NumSunk, "Number of instructions sunk out of loop");
-STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
-STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
-STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
-STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
-
-/// Memory promotion is enabled by default.
-static cl::opt<bool>
- DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
- cl::desc("Disable memory promotion in LICM pass"));
-
-static cl::opt<bool> ControlFlowHoisting(
- "licm-control-flow-hoisting", cl::Hidden, cl::init(false),
- cl::desc("Enable control flow (and PHI) hoisting in LICM"));
-
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "licm"
+
+STATISTIC(NumCreatedBlocks, "Number of blocks created");
+STATISTIC(NumClonedBranches, "Number of branches cloned");
+STATISTIC(NumSunk, "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+
+/// Memory promotion is enabled by default.
+static cl::opt<bool>
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Disable memory promotion in LICM pass"));
+
+static cl::opt<bool> ControlFlowHoisting(
+ "licm-control-flow-hoisting", cl::Hidden, cl::init(false),
+ cl::desc("Enable control flow (and PHI) hoisting in LICM"));
+
static cl::opt<unsigned> HoistSinkColdnessThreshold(
"licm-coldness-threshold", cl::Hidden, cl::init(4),
cl::desc("Relative coldness Threshold of hoisting/sinking destination "
"block for LICM to be considered beneficial"));
-static cl::opt<uint32_t> MaxNumUsesTraversed(
- "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
- cl::desc("Max num uses visited for identifying load "
- "invariance in loop using invariant start (default = 8)"));
-
-// Default value of zero implies we use the regular alias set tracker mechanism
-// instead of the cross product using AA to identify aliasing of the memory
-// location we are interested in.
-static cl::opt<int>
-LICMN2Theshold("licm-n2-threshold", cl::Hidden, cl::init(0),
- cl::desc("How many instruction to cross product using AA"));
-
-// Experimental option to allow imprecision in LICM in pathological cases, in
-// exchange for faster compile. This is to be removed if MemorySSA starts to
-// address the same issue. This flag applies only when LICM uses MemorySSA
-// instead on AliasSetTracker. LICM calls MemorySSAWalker's
-// getClobberingMemoryAccess, up to the value of the Cap, getting perfect
-// accuracy. Afterwards, LICM will call into MemorySSA's getDefiningAccess,
-// which may not be precise, since optimizeUses is capped. The result is
-// correct, but we may not get as "far up" as possible to get which access is
-// clobbering the one queried.
-cl::opt<unsigned> llvm::SetLicmMssaOptCap(
- "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
- cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
- "for faster compile. Caps the MemorySSA clobbering calls."));
-
-// Experimentally, memory promotion carries less importance than sinking and
-// hoisting. Limit when we do promotion when using MemorySSA, in order to save
-// compile time.
-cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
- "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
- cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
- "effect. When MSSA in LICM is enabled, then this is the maximum "
- "number of accesses allowed to be present in a loop in order to "
- "enable memory promotion."));
-
-static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- TargetTransformInfo *TTI, bool &FreeInLoop);
-static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
- BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE);
-static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
+// Default value of zero implies we use the regular alias set tracker mechanism
+// instead of the cross product using AA to identify aliasing of the memory
+// location we are interested in.
+static cl::opt<int>
+LICMN2Theshold("licm-n2-threshold", cl::Hidden, cl::init(0),
+ cl::desc("How many instruction to cross product using AA"));
+
+// Experimental option to allow imprecision in LICM in pathological cases, in
+// exchange for a faster compile. This is to be removed if MemorySSA starts to
+// address the same issue. This flag applies only when LICM uses MemorySSA
+// instead of AliasSetTracker. LICM calls MemorySSAWalker's
+// getClobberingMemoryAccess, up to the value of the Cap, getting perfect
+// accuracy. Afterwards, LICM will call into MemorySSA's getDefiningAccess,
+// which may not be precise, since optimizeUses is capped. The result is
+// correct, but we may not get as "far up" as possible to get which access is
+// clobbering the one queried.
+cl::opt<unsigned> llvm::SetLicmMssaOptCap(
+ "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
+ cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
+ "for faster compile. Caps the MemorySSA clobbering calls."));
+
+// Experimentally, memory promotion carries less importance than sinking and
+// hoisting. When using MemorySSA, limit the cases in which we do promotion, in
+// order to save compile time.
+cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
+ "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
+ cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
+ "effect. When MSSA in LICM is enabled, then this is the maximum "
+ "number of accesses allowed to be present in a loop in order to "
+ "enable memory promotion."));
+
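+// For illustration only (an assumed invocation, not mandated by this file),
+// the caps above can be tightened from the opt command line when compile time
+// matters more than precision:
+//
+//   opt -licm -licm-mssa-optimization-cap=50 \
+//       -licm-mssa-max-acc-promotion=100 input.ll -S -o out.ll
+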
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop);
+static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE);
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, const Loop *CurLoop,
ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
OptimizationRemarkEmitter *ORE);
-static bool isSafeToExecuteUnconditionally(Instruction &Inst,
- const DominatorTree *DT,
- const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE,
- const Instruction *CtxI = nullptr);
-static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
- AliasSetTracker *CurAST, Loop *CurLoop,
- AAResults *AA);
-static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI = nullptr);
+static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
+ AliasSetTracker *CurAST, Loop *CurLoop,
+ AAResults *AA);
+static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
Loop *CurLoop, Instruction &I,
- SinkAndHoistLICMFlags &Flags);
+ SinkAndHoistLICMFlags &Flags);
static bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
MemoryUse &MU);
-static Instruction *cloneInstructionInExitBlock(
- Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
- const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
-
-static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
- AliasSetTracker *AST, MemorySSAUpdater *MSSAU);
-
-static void moveInstructionBefore(Instruction &I, Instruction &Dest,
- ICFLoopSafetyInfo &SafetyInfo,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE);
-
-namespace {
-struct LoopInvariantCodeMotion {
- bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
+static Instruction *cloneInstructionInExitBlock(
+ Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
+
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+ AliasSetTracker *AST, MemorySSAUpdater *MSSAU);
+
+static void moveInstructionBefore(Instruction &I, Instruction &Dest,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE);
+
+namespace {
+struct LoopInvariantCodeMotion {
+ bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI,
TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA,
- OptimizationRemarkEmitter *ORE);
-
- LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
- unsigned LicmMssaNoAccForPromotionCap)
- : LicmMssaOptCap(LicmMssaOptCap),
- LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {}
-
-private:
- unsigned LicmMssaOptCap;
- unsigned LicmMssaNoAccForPromotionCap;
-
- std::unique_ptr<AliasSetTracker>
- collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AAResults *AA);
- std::unique_ptr<AliasSetTracker>
- collectAliasInfoForLoopWithMSSA(Loop *L, AAResults *AA,
- MemorySSAUpdater *MSSAU);
-};
-
-struct LegacyLICMPass : public LoopPass {
- static char ID; // Pass identification, replacement for typeid
- LegacyLICMPass(
- unsigned LicmMssaOptCap = SetLicmMssaOptCap,
- unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap)
- : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) {
- initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
+ OptimizationRemarkEmitter *ORE);
+
+ LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
+ unsigned LicmMssaNoAccForPromotionCap)
+ : LicmMssaOptCap(LicmMssaOptCap),
+ LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {}
+
+private:
+ unsigned LicmMssaOptCap;
+ unsigned LicmMssaNoAccForPromotionCap;
+
+ std::unique_ptr<AliasSetTracker>
+ collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AAResults *AA);
+ std::unique_ptr<AliasSetTracker>
+ collectAliasInfoForLoopWithMSSA(Loop *L, AAResults *AA,
+ MemorySSAUpdater *MSSAU);
+};
+
+struct LegacyLICMPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ LegacyLICMPass(
+ unsigned LicmMssaOptCap = SetLicmMssaOptCap,
+ unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap)
+ : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) {
+ initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
LLVM_DEBUG(dbgs() << "Perform LICM on Loop with header at block "
<< L->getHeader()->getNameOrAsOperand() << "\n");
- auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
- : nullptr;
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
+ : nullptr;
bool hasProfileData = L->getHeader()->getParent()->hasProfileData();
BlockFrequencyInfo *BFI =
hasProfileData ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI()
: nullptr;
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
return LICM.runOnLoop(
L, &getAnalysis<AAResultsWrapperPass>().getAAResults(),
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
@@ -245,70 +245,70 @@ struct LegacyLICMPass : public LoopPass {
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent()),
SE ? &SE->getSE() : nullptr, MSSA, &ORE);
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- ///
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- AU.addRequired<TargetTransformInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
AU.addPreserved<LazyBlockFrequencyInfoPass>();
AU.addPreserved<LazyBranchProbabilityInfoPass>();
- }
-
-private:
- LoopInvariantCodeMotion LICM;
-};
-} // namespace
-
-PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR, LPMUpdater &) {
- // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
-
- LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
+ }
+
+private:
+ LoopInvariantCodeMotion LICM;
+};
+} // namespace
+
+PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+
+ LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI,
&AR.SE, AR.MSSA, &ORE))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
-
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
-
- return PA;
-}
-
-char LegacyLICMPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+
+ return PA;
+}
+
+char LegacyLICMPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyBFIPass)
-INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
- false)
-
-Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
-Pass *llvm::createLICMPass(unsigned LicmMssaOptCap,
- unsigned LicmMssaNoAccForPromotionCap) {
- return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
-}
-
+INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
+ false)
+
+Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
+Pass *llvm::createLICMPass(unsigned LicmMssaOptCap,
+ unsigned LicmMssaNoAccForPromotionCap) {
+ return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
+}
+
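+// A minimal sketch of scheduling this pass through the legacy pass manager,
+// assuming a Module M has already been parsed (illustrative only; the cap
+// values are examples, not recommendations):
+//
+//   legacy::PassManager PM;
+//   PM.add(createLICMPass(/*LicmMssaOptCap=*/50,
+//                         /*LicmMssaNoAccForPromotionCap=*/100));
+//   PM.run(M);
+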
llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(bool IsSink, Loop *L,
MemorySSA *MSSA)
: SinkAndHoistLICMFlags(SetLicmMssaOptCap, SetLicmMssaNoAccForPromotionCap,
@@ -338,456 +338,456 @@ llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(
}
}
-/// Hoist expressions out of the specified loop. Note, alias info for inner
-/// loop is not preserved so it is not a good idea to run LICM multiple
-/// times on one loop.
-bool LoopInvariantCodeMotion::runOnLoop(
- Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
+/// Hoist expressions out of the specified loop. Note that alias info for the
+/// inner loop is not preserved, so it is not a good idea to run LICM multiple
+/// times on one loop.
+bool LoopInvariantCodeMotion::runOnLoop(
+ Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) {
- bool Changed = false;
-
- assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
-
- // If this loop has metadata indicating that LICM is not to be performed then
- // just exit.
- if (hasDisableLICMTransformsHint(L)) {
- return false;
- }
-
- std::unique_ptr<AliasSetTracker> CurAST;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
+ bool Changed = false;
+
+ assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+
+ // If this loop has metadata indicating that LICM is not to be performed then
+ // just exit.
+ if (hasDisableLICMTransformsHint(L)) {
+ return false;
+ }
+
+ std::unique_ptr<AliasSetTracker> CurAST;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
std::unique_ptr<SinkAndHoistLICMFlags> Flags;
-
- if (!MSSA) {
- LLVM_DEBUG(dbgs() << "LICM: Using Alias Set Tracker.\n");
- CurAST = collectAliasInfoForLoop(L, LI, AA);
+
+ if (!MSSA) {
+ LLVM_DEBUG(dbgs() << "LICM: Using Alias Set Tracker.\n");
+ CurAST = collectAliasInfoForLoop(L, LI, AA);
Flags = std::make_unique<SinkAndHoistLICMFlags>(
LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true);
- } else {
- LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n");
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ } else {
+ LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n");
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
Flags = std::make_unique<SinkAndHoistLICMFlags>(
LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true, L, MSSA);
- }
-
- // Get the preheader block to move instructions into...
- BasicBlock *Preheader = L->getLoopPreheader();
-
- // Compute loop safety information.
- ICFLoopSafetyInfo SafetyInfo;
- SafetyInfo.computeLoopSafetyInfo(L);
-
- // We want to visit all of the instructions in this loop... that are not parts
- // of our subloops (they have already had their invariants hoisted out of
- // their loop, into this loop, so there is no need to process the BODIES of
- // the subloops).
- //
- // Traverse the body of the loop in depth first order on the dominator tree so
- // that we are guaranteed to see definitions before we see uses. This allows
- // us to sink instructions in one pass, without iteration. After sinking
- // instructions, we perform another pass to hoist them out of the loop.
- if (L->hasDedicatedExits())
+ }
+
+ // Get the preheader block to move instructions into...
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // Compute loop safety information.
+ ICFLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(L);
+
+  // We want to visit all of the instructions in this loop... that are not part
+ // of our subloops (they have already had their invariants hoisted out of
+ // their loop, into this loop, so there is no need to process the BODIES of
+ // the subloops).
+ //
+ // Traverse the body of the loop in depth first order on the dominator tree so
+ // that we are guaranteed to see definitions before we see uses. This allows
+ // us to sink instructions in one pass, without iteration. After sinking
+ // instructions, we perform another pass to hoist them out of the loop.
+ if (L->hasDedicatedExits())
Changed |=
sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L,
CurAST.get(), MSSAU.get(), &SafetyInfo, *Flags.get(), ORE);
Flags->setIsSink(false);
- if (Preheader)
+ if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L,
CurAST.get(), MSSAU.get(), SE, &SafetyInfo,
*Flags.get(), ORE);
-
- // Now that all loop invariants have been removed from the loop, promote any
- // memory references to scalars that we can.
- // Don't sink stores from loops without dedicated block exits. Exits
- // containing indirect branches are not transformed by loop simplify,
- // make sure we catch that. An additional load may be generated in the
- // preheader for SSA updater, so also avoid sinking when no preheader
- // is available.
- if (!DisablePromotion && Preheader && L->hasDedicatedExits() &&
+
+ // Now that all loop invariants have been removed from the loop, promote any
+ // memory references to scalars that we can.
+ // Don't sink stores from loops without dedicated block exits. Exits
+  // containing indirect branches are not transformed by loop simplify, so
+  // make sure we catch that. An additional load may be generated in the
+  // preheader for the SSA updater, so also avoid sinking when no preheader
+ // is available.
+ if (!DisablePromotion && Preheader && L->hasDedicatedExits() &&
!Flags->tooManyMemoryAccesses()) {
- // Figure out the loop exits and their insertion points
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- // We can't insert into a catchswitch.
- bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
- return isa<CatchSwitchInst>(Exit->getTerminator());
- });
-
- if (!HasCatchSwitch) {
- SmallVector<Instruction *, 8> InsertPts;
- SmallVector<MemoryAccess *, 8> MSSAInsertPts;
- InsertPts.reserve(ExitBlocks.size());
- if (MSSAU)
- MSSAInsertPts.reserve(ExitBlocks.size());
- for (BasicBlock *ExitBlock : ExitBlocks) {
- InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
- if (MSSAU)
- MSSAInsertPts.push_back(nullptr);
- }
-
- PredIteratorCache PIC;
-
- bool Promoted = false;
-
- // Build an AST using MSSA.
- if (!CurAST.get())
- CurAST = collectAliasInfoForLoopWithMSSA(L, AA, MSSAU.get());
-
- // Loop over all of the alias sets in the tracker object.
- for (AliasSet &AS : *CurAST) {
- // We can promote this alias set if it has a store, if it is a "Must"
- // alias set, if the pointer is loop invariant, and if we are not
- // eliminating any volatile loads or stores.
- if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
- !L->isLoopInvariant(AS.begin()->getValue()))
- continue;
-
- assert(
- !AS.empty() &&
- "Must alias set should have at least one pointer element in it!");
-
- SmallSetVector<Value *, 8> PointerMustAliases;
- for (const auto &ASI : AS)
- PointerMustAliases.insert(ASI.getValue());
-
- Promoted |= promoteLoopAccessesToScalars(
- PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI,
- DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE);
- }
-
- // Once we have promoted values across the loop body we have to
- // recursively reform LCSSA as any nested loop may now have values defined
- // within the loop used in the outer loop.
- // FIXME: This is really heavy handed. It would be a bit better to use an
- // SSAUpdater strategy during promotion that was LCSSA aware and reformed
- // it as it went.
- if (Promoted)
- formLCSSARecursively(*L, *DT, LI, SE);
-
- Changed |= Promoted;
- }
- }
-
- // Check that neither this loop nor its parent have had LCSSA broken. LICM is
- // specifically moving instructions across the loop boundary and so it is
- // especially in need of sanity checking here.
- assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
+ // Figure out the loop exits and their insertion points
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // We can't insert into a catchswitch.
+ bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ });
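+    // (A catchswitch must be the only non-PHI instruction in its block, so
+    // such an exit block offers no insertion point for the stores that
+    // promotion would need to place there.)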
+
+ if (!HasCatchSwitch) {
+ SmallVector<Instruction *, 8> InsertPts;
+ SmallVector<MemoryAccess *, 8> MSSAInsertPts;
+ InsertPts.reserve(ExitBlocks.size());
+ if (MSSAU)
+ MSSAInsertPts.reserve(ExitBlocks.size());
+ for (BasicBlock *ExitBlock : ExitBlocks) {
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ if (MSSAU)
+ MSSAInsertPts.push_back(nullptr);
+ }
+
+ PredIteratorCache PIC;
+
+ bool Promoted = false;
+
+ // Build an AST using MSSA.
+ if (!CurAST.get())
+ CurAST = collectAliasInfoForLoopWithMSSA(L, AA, MSSAU.get());
+
+ // Loop over all of the alias sets in the tracker object.
+ for (AliasSet &AS : *CurAST) {
+ // We can promote this alias set if it has a store, if it is a "Must"
+ // alias set, if the pointer is loop invariant, and if we are not
+ // eliminating any volatile loads or stores.
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+ !L->isLoopInvariant(AS.begin()->getValue()))
+ continue;
+
+ assert(
+ !AS.empty() &&
+ "Must alias set should have at least one pointer element in it!");
+
+ SmallSetVector<Value *, 8> PointerMustAliases;
+ for (const auto &ASI : AS)
+ PointerMustAliases.insert(ASI.getValue());
+
+ Promoted |= promoteLoopAccessesToScalars(
+ PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI,
+ DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE);
+ }
+
+      // Once we have promoted values across the loop body we have to
+      // recursively reform LCSSA, as any nested loop may now have values defined
+      // within the loop that are used in the outer loop.
+      // FIXME: This is really heavy-handed. It would be a bit better to use an
+      // SSAUpdater strategy during promotion that was LCSSA-aware and reformed
+ // it as it went.
+ if (Promoted)
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= Promoted;
+ }
+ }
+
+  // Check that neither this loop nor its parent has had LCSSA broken. LICM is
+ // specifically moving instructions across the loop boundary and so it is
+ // especially in need of sanity checking here.
+ assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) &&
- "Parent loop not left in LCSSA form after LICM!");
-
- if (MSSAU.get() && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- if (Changed && SE)
- SE->forgetLoopDispositions(L);
- return Changed;
-}
-
-/// Walk the specified region of the CFG (defined by all blocks dominated by
-/// the specified block, and that are in the current loop) in reverse depth
-/// first order w.r.t the DominatorTree. This allows us to visit uses before
-/// definitions, allowing us to sink a loop body in one pass without iteration.
-///
-bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
+ "Parent loop not left in LCSSA form after LICM!");
+
+ if (MSSAU.get() && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(L);
+ return Changed;
+}
+
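+// As a C-level sketch of the overall effect of runOnLoop (illustrative only;
+// it assumes *p is not otherwise aliased or clobbered inside the loop):
+//
+//   // before
+//   for (int i = 0; i < n; ++i)
+//     *p += a * b;
+//
+//   // after: "a * b" hoisted to the preheader, *p promoted to a register
+//   int inv = a * b;
+//   int tmp = *p;
+//   for (int i = 0; i < n; ++i)
+//     tmp += inv;
+//   *p = tmp;
+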
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in reverse depth
+/// first order w.r.t. the DominatorTree. This allows us to visit uses before
+/// definitions, allowing us to sink a loop body in one pass without iteration.
+///
+bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, BlockFrequencyInfo *BFI,
TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
Loop *CurLoop, AliasSetTracker *CurAST,
MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo,
- SinkAndHoistLICMFlags &Flags,
- OptimizationRemarkEmitter *ORE) {
-
- // Verify inputs.
- assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
- CurLoop != nullptr && SafetyInfo != nullptr &&
- "Unexpected input to sinkRegion.");
- assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
- "Either AliasSetTracker or MemorySSA should be initialized.");
-
- // We want to visit children before parents. We will enque all the parents
- // before their children in the worklist and process the worklist in reverse
- // order.
- SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
-
- bool Changed = false;
- for (DomTreeNode *DTN : reverse(Worklist)) {
- BasicBlock *BB = DTN->getBlock();
- // Only need to process the contents of this block if it is not part of a
- // subloop (which would already have been processed).
- if (inSubLoop(BB, CurLoop, LI))
- continue;
-
- for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
- Instruction &I = *--II;
-
- // If the instruction is dead, we would try to sink it because it isn't
- // used in the loop, instead, just delete it.
- if (isInstructionTriviallyDead(&I, TLI)) {
- LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
- salvageKnowledge(&I);
- salvageDebugInfo(I);
- ++II;
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
- Changed = true;
- continue;
- }
-
- // Check to see if we can sink this instruction to the exit blocks
- // of the loop. We can do this if the all users of the instruction are
- // outside of the loop. In this case, it doesn't even matter if the
- // operands of the instruction are loop invariant.
- //
- bool FreeInLoop = false;
- if (!I.mayHaveSideEffects() &&
- isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
- ORE)) {
+ SinkAndHoistLICMFlags &Flags,
+ OptimizationRemarkEmitter *ORE) {
+
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to sinkRegion.");
+ assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
+ "Either AliasSetTracker or MemorySSA should be initialized.");
+
+  // We want to visit children before parents. We will enqueue all the parents
+ // before their children in the worklist and process the worklist in reverse
+ // order.
+ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
+
+ bool Changed = false;
+ for (DomTreeNode *DTN : reverse(Worklist)) {
+ BasicBlock *BB = DTN->getBlock();
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
+
+ for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
+ Instruction &I = *--II;
+
+      // If the instruction is dead, we would try to sink it because it isn't
+      // used in the loop; instead, just delete it.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ salvageKnowledge(&I);
+ salvageDebugInfo(I);
+ ++II;
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ Changed = true;
+ continue;
+ }
+
+ // Check to see if we can sink this instruction to the exit blocks
+      // of the loop. We can do this if all the users of the instruction are
+ // outside of the loop. In this case, it doesn't even matter if the
+ // operands of the instruction are loop invariant.
+ //
+ bool FreeInLoop = false;
+ if (!I.mayHaveSideEffects() &&
+ isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
+ ORE)) {
if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) {
- if (!FreeInLoop) {
- ++II;
- salvageDebugInfo(I);
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
- }
- Changed = true;
- }
- }
- }
- }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- return Changed;
-}
-
-namespace {
-// This is a helper class for hoistRegion to make it able to hoist control flow
-// in order to be able to hoist phis. The way this works is that we initially
-// start hoisting to the loop preheader, and when we see a loop invariant branch
-// we make note of this. When we then come to hoist an instruction that's
-// conditional on such a branch we duplicate the branch and the relevant control
-// flow, then hoist the instruction into the block corresponding to its original
-// block in the duplicated control flow.
-class ControlFlowHoister {
-private:
- // Information about the loop we are hoisting from
- LoopInfo *LI;
- DominatorTree *DT;
- Loop *CurLoop;
- MemorySSAUpdater *MSSAU;
-
- // A map of blocks in the loop to the block their instructions will be hoisted
- // to.
- DenseMap<BasicBlock *, BasicBlock *> HoistDestinationMap;
-
- // The branches that we can hoist, mapped to the block that marks a
- // convergence point of their control flow.
- DenseMap<BranchInst *, BasicBlock *> HoistableBranches;
-
-public:
- ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop,
- MemorySSAUpdater *MSSAU)
- : LI(LI), DT(DT), CurLoop(CurLoop), MSSAU(MSSAU) {}
-
- void registerPossiblyHoistableBranch(BranchInst *BI) {
- // We can only hoist conditional branches with loop invariant operands.
- if (!ControlFlowHoisting || !BI->isConditional() ||
- !CurLoop->hasLoopInvariantOperands(BI))
- return;
-
- // The branch destinations need to be in the loop, and we don't gain
- // anything by duplicating conditional branches with duplicate successors,
- // as it's essentially the same as an unconditional branch.
- BasicBlock *TrueDest = BI->getSuccessor(0);
- BasicBlock *FalseDest = BI->getSuccessor(1);
- if (!CurLoop->contains(TrueDest) || !CurLoop->contains(FalseDest) ||
- TrueDest == FalseDest)
- return;
-
- // We can hoist BI if one branch destination is the successor of the other,
- // or both have common successor which we check by seeing if the
- // intersection of their successors is non-empty.
- // TODO: This could be expanded to allowing branches where both ends
- // eventually converge to a single block.
- SmallPtrSet<BasicBlock *, 4> TrueDestSucc, FalseDestSucc;
- TrueDestSucc.insert(succ_begin(TrueDest), succ_end(TrueDest));
- FalseDestSucc.insert(succ_begin(FalseDest), succ_end(FalseDest));
- BasicBlock *CommonSucc = nullptr;
- if (TrueDestSucc.count(FalseDest)) {
- CommonSucc = FalseDest;
- } else if (FalseDestSucc.count(TrueDest)) {
- CommonSucc = TrueDest;
- } else {
- set_intersect(TrueDestSucc, FalseDestSucc);
- // If there's one common successor use that.
- if (TrueDestSucc.size() == 1)
- CommonSucc = *TrueDestSucc.begin();
- // If there's more than one pick whichever appears first in the block list
- // (we can't use the value returned by TrueDestSucc.begin() as it's
- // unpredicatable which element gets returned).
- else if (!TrueDestSucc.empty()) {
- Function *F = TrueDest->getParent();
- auto IsSucc = [&](BasicBlock &BB) { return TrueDestSucc.count(&BB); };
+ if (!FreeInLoop) {
+ ++II;
+ salvageDebugInfo(I);
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ }
+ Changed = true;
+ }
+ }
+ }
+ }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ return Changed;
+}
+
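+// A typical candidate that sinkRegion moves is an instruction whose only use
+// is outside the loop. Sketch (names are illustrative):
+//
+//   for (...) { q = x / y; ... }   // before: q unused inside the loop
+//   use(q);
+//
+// becomes
+//
+//   for (...) { ... }
+//   q = x / y;                     // after: computed once, in the exit block
+//   use(q);
+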
+namespace {
+// This is a helper class that enables hoistRegion to hoist control flow, in
+// order to be able to hoist phis. The way this works is that we initially
+// start hoisting to the loop preheader, and when we see a loop-invariant
+// branch we make note of it. When we then come to hoist an instruction that is
+// conditional on such a branch, we duplicate the branch and the relevant
+// control flow, then hoist the instruction into the block corresponding to its
+// original block in the duplicated control flow.
+class ControlFlowHoister {
+private:
+ // Information about the loop we are hoisting from
+ LoopInfo *LI;
+ DominatorTree *DT;
+ Loop *CurLoop;
+ MemorySSAUpdater *MSSAU;
+
+ // A map of blocks in the loop to the block their instructions will be hoisted
+ // to.
+ DenseMap<BasicBlock *, BasicBlock *> HoistDestinationMap;
+
+ // The branches that we can hoist, mapped to the block that marks a
+ // convergence point of their control flow.
+ DenseMap<BranchInst *, BasicBlock *> HoistableBranches;
+
+public:
+ ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop,
+ MemorySSAUpdater *MSSAU)
+ : LI(LI), DT(DT), CurLoop(CurLoop), MSSAU(MSSAU) {}
+
+ void registerPossiblyHoistableBranch(BranchInst *BI) {
+ // We can only hoist conditional branches with loop invariant operands.
+ if (!ControlFlowHoisting || !BI->isConditional() ||
+ !CurLoop->hasLoopInvariantOperands(BI))
+ return;
+
+ // The branch destinations need to be in the loop, and we don't gain
+ // anything by duplicating conditional branches with duplicate successors,
+ // as it's essentially the same as an unconditional branch.
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+ if (!CurLoop->contains(TrueDest) || !CurLoop->contains(FalseDest) ||
+ TrueDest == FalseDest)
+ return;
+
+ // We can hoist BI if one branch destination is the successor of the other,
+    // or both have a common successor, which we check by seeing if the
+ // intersection of their successors is non-empty.
+ // TODO: This could be expanded to allowing branches where both ends
+ // eventually converge to a single block.
+ SmallPtrSet<BasicBlock *, 4> TrueDestSucc, FalseDestSucc;
+ TrueDestSucc.insert(succ_begin(TrueDest), succ_end(TrueDest));
+ FalseDestSucc.insert(succ_begin(FalseDest), succ_end(FalseDest));
+ BasicBlock *CommonSucc = nullptr;
+ if (TrueDestSucc.count(FalseDest)) {
+ CommonSucc = FalseDest;
+ } else if (FalseDestSucc.count(TrueDest)) {
+ CommonSucc = TrueDest;
+ } else {
+ set_intersect(TrueDestSucc, FalseDestSucc);
+      // If there's one common successor, use that.
+ if (TrueDestSucc.size() == 1)
+ CommonSucc = *TrueDestSucc.begin();
+      // If there's more than one, pick whichever appears first in the block list
+      // (we can't use the value returned by TrueDestSucc.begin() as it's
+      // unpredictable which element gets returned).
+ else if (!TrueDestSucc.empty()) {
+ Function *F = TrueDest->getParent();
+ auto IsSucc = [&](BasicBlock &BB) { return TrueDestSucc.count(&BB); };
auto It = llvm::find_if(*F, IsSucc);
- assert(It != F->end() && "Could not find successor in function");
- CommonSucc = &*It;
- }
- }
- // The common successor has to be dominated by the branch, as otherwise
- // there will be some other path to the successor that will not be
- // controlled by this branch so any phi we hoist would be controlled by the
- // wrong condition. This also takes care of avoiding hoisting of loop back
- // edges.
- // TODO: In some cases this could be relaxed if the successor is dominated
- // by another block that's been hoisted and we can guarantee that the
- // control flow has been replicated exactly.
- if (CommonSucc && DT->dominates(BI, CommonSucc))
- HoistableBranches[BI] = CommonSucc;
- }
-
- bool canHoistPHI(PHINode *PN) {
- // The phi must have loop invariant operands.
- if (!ControlFlowHoisting || !CurLoop->hasLoopInvariantOperands(PN))
- return false;
- // We can hoist phis if the block they are in is the target of hoistable
- // branches which cover all of the predecessors of the block.
- SmallPtrSet<BasicBlock *, 8> PredecessorBlocks;
- BasicBlock *BB = PN->getParent();
- for (BasicBlock *PredBB : predecessors(BB))
- PredecessorBlocks.insert(PredBB);
- // If we have less predecessor blocks than predecessors then the phi will
- // have more than one incoming value for the same block which we can't
- // handle.
- // TODO: This could be handled be erasing some of the duplicate incoming
- // values.
- if (PredecessorBlocks.size() != pred_size(BB))
- return false;
- for (auto &Pair : HoistableBranches) {
- if (Pair.second == BB) {
- // Which blocks are predecessors via this branch depends on if the
- // branch is triangle-like or diamond-like.
- if (Pair.first->getSuccessor(0) == BB) {
- PredecessorBlocks.erase(Pair.first->getParent());
- PredecessorBlocks.erase(Pair.first->getSuccessor(1));
- } else if (Pair.first->getSuccessor(1) == BB) {
- PredecessorBlocks.erase(Pair.first->getParent());
- PredecessorBlocks.erase(Pair.first->getSuccessor(0));
- } else {
- PredecessorBlocks.erase(Pair.first->getSuccessor(0));
- PredecessorBlocks.erase(Pair.first->getSuccessor(1));
- }
- }
- }
- // PredecessorBlocks will now be empty if for every predecessor of BB we
- // found a hoistable branch source.
- return PredecessorBlocks.empty();
- }
-
- BasicBlock *getOrCreateHoistedBlock(BasicBlock *BB) {
- if (!ControlFlowHoisting)
- return CurLoop->getLoopPreheader();
- // If BB has already been hoisted, return that
- if (HoistDestinationMap.count(BB))
- return HoistDestinationMap[BB];
-
- // Check if this block is conditional based on a pending branch
- auto HasBBAsSuccessor =
- [&](DenseMap<BranchInst *, BasicBlock *>::value_type &Pair) {
- return BB != Pair.second && (Pair.first->getSuccessor(0) == BB ||
- Pair.first->getSuccessor(1) == BB);
- };
+ assert(It != F->end() && "Could not find successor in function");
+ CommonSucc = &*It;
+ }
+ }
+ // The common successor has to be dominated by the branch, as otherwise
+ // there will be some other path to the successor that will not be
+    // controlled by this branch, so any phi we hoist would be controlled by the
+ // wrong condition. This also takes care of avoiding hoisting of loop back
+ // edges.
+ // TODO: In some cases this could be relaxed if the successor is dominated
+ // by another block that's been hoisted and we can guarantee that the
+ // control flow has been replicated exactly.
+ if (CommonSucc && DT->dominates(BI, CommonSucc))
+ HoistableBranches[BI] = CommonSucc;
+ }
+
+ bool canHoistPHI(PHINode *PN) {
+ // The phi must have loop invariant operands.
+ if (!ControlFlowHoisting || !CurLoop->hasLoopInvariantOperands(PN))
+ return false;
+ // We can hoist phis if the block they are in is the target of hoistable
+ // branches which cover all of the predecessors of the block.
+ SmallPtrSet<BasicBlock *, 8> PredecessorBlocks;
+ BasicBlock *BB = PN->getParent();
+ for (BasicBlock *PredBB : predecessors(BB))
+ PredecessorBlocks.insert(PredBB);
+    // If we have fewer predecessor blocks than predecessors then the phi will
+    // have more than one incoming value for the same block, which we can't
+    // handle.
+    // TODO: This could be handled by erasing some of the duplicate incoming
+    // values.
+ if (PredecessorBlocks.size() != pred_size(BB))
+ return false;
+ for (auto &Pair : HoistableBranches) {
+ if (Pair.second == BB) {
+        // Which blocks are predecessors via this branch depends on whether the
+ // branch is triangle-like or diamond-like.
+ if (Pair.first->getSuccessor(0) == BB) {
+ PredecessorBlocks.erase(Pair.first->getParent());
+ PredecessorBlocks.erase(Pair.first->getSuccessor(1));
+ } else if (Pair.first->getSuccessor(1) == BB) {
+ PredecessorBlocks.erase(Pair.first->getParent());
+ PredecessorBlocks.erase(Pair.first->getSuccessor(0));
+ } else {
+ PredecessorBlocks.erase(Pair.first->getSuccessor(0));
+ PredecessorBlocks.erase(Pair.first->getSuccessor(1));
+ }
+ }
+ }
+ // PredecessorBlocks will now be empty if for every predecessor of BB we
+ // found a hoistable branch source.
+ return PredecessorBlocks.empty();
+ }
+
+ BasicBlock *getOrCreateHoistedBlock(BasicBlock *BB) {
+ if (!ControlFlowHoisting)
+ return CurLoop->getLoopPreheader();
+ // If BB has already been hoisted, return that
+ if (HoistDestinationMap.count(BB))
+ return HoistDestinationMap[BB];
+
+ // Check if this block is conditional based on a pending branch
+ auto HasBBAsSuccessor =
+ [&](DenseMap<BranchInst *, BasicBlock *>::value_type &Pair) {
+ return BB != Pair.second && (Pair.first->getSuccessor(0) == BB ||
+ Pair.first->getSuccessor(1) == BB);
+ };
auto It = llvm::find_if(HoistableBranches, HasBBAsSuccessor);
-
- // If not involved in a pending branch, hoist to preheader
- BasicBlock *InitialPreheader = CurLoop->getLoopPreheader();
- if (It == HoistableBranches.end()) {
+
+ // If not involved in a pending branch, hoist to preheader
+ BasicBlock *InitialPreheader = CurLoop->getLoopPreheader();
+ if (It == HoistableBranches.end()) {
LLVM_DEBUG(dbgs() << "LICM using "
<< InitialPreheader->getNameOrAsOperand()
<< " as hoist destination for "
<< BB->getNameOrAsOperand() << "\n");
- HoistDestinationMap[BB] = InitialPreheader;
- return InitialPreheader;
- }
- BranchInst *BI = It->first;
- assert(std::find_if(++It, HoistableBranches.end(), HasBBAsSuccessor) ==
- HoistableBranches.end() &&
- "BB is expected to be the target of at most one branch");
-
- LLVMContext &C = BB->getContext();
- BasicBlock *TrueDest = BI->getSuccessor(0);
- BasicBlock *FalseDest = BI->getSuccessor(1);
- BasicBlock *CommonSucc = HoistableBranches[BI];
- BasicBlock *HoistTarget = getOrCreateHoistedBlock(BI->getParent());
-
- // Create hoisted versions of blocks that currently don't have them
- auto CreateHoistedBlock = [&](BasicBlock *Orig) {
- if (HoistDestinationMap.count(Orig))
- return HoistDestinationMap[Orig];
- BasicBlock *New =
- BasicBlock::Create(C, Orig->getName() + ".licm", Orig->getParent());
- HoistDestinationMap[Orig] = New;
- DT->addNewBlock(New, HoistTarget);
- if (CurLoop->getParentLoop())
- CurLoop->getParentLoop()->addBasicBlockToLoop(New, *LI);
- ++NumCreatedBlocks;
- LLVM_DEBUG(dbgs() << "LICM created " << New->getName()
- << " as hoist destination for " << Orig->getName()
- << "\n");
- return New;
- };
- BasicBlock *HoistTrueDest = CreateHoistedBlock(TrueDest);
- BasicBlock *HoistFalseDest = CreateHoistedBlock(FalseDest);
- BasicBlock *HoistCommonSucc = CreateHoistedBlock(CommonSucc);
-
- // Link up these blocks with branches.
- if (!HoistCommonSucc->getTerminator()) {
- // The new common successor we've generated will branch to whatever that
- // hoist target branched to.
- BasicBlock *TargetSucc = HoistTarget->getSingleSuccessor();
- assert(TargetSucc && "Expected hoist target to have a single successor");
- HoistCommonSucc->moveBefore(TargetSucc);
- BranchInst::Create(TargetSucc, HoistCommonSucc);
- }
- if (!HoistTrueDest->getTerminator()) {
- HoistTrueDest->moveBefore(HoistCommonSucc);
- BranchInst::Create(HoistCommonSucc, HoistTrueDest);
- }
- if (!HoistFalseDest->getTerminator()) {
- HoistFalseDest->moveBefore(HoistCommonSucc);
- BranchInst::Create(HoistCommonSucc, HoistFalseDest);
- }
-
- // If BI is being cloned to what was originally the preheader then
- // HoistCommonSucc will now be the new preheader.
- if (HoistTarget == InitialPreheader) {
- // Phis in the loop header now need to use the new preheader.
- InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc);
- if (MSSAU)
- MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
- HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget});
- // The new preheader dominates the loop header.
- DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc);
- DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader());
- DT->changeImmediateDominator(HeaderNode, PreheaderNode);
- // The preheader hoist destination is now the new preheader, with the
- // exception of the hoist destination of this branch.
- for (auto &Pair : HoistDestinationMap)
- if (Pair.second == InitialPreheader && Pair.first != BI->getParent())
- Pair.second = HoistCommonSucc;
- }
-
- // Now finally clone BI.
- ReplaceInstWithInst(
- HoistTarget->getTerminator(),
- BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition()));
- ++NumClonedBranches;
-
- assert(CurLoop->getLoopPreheader() &&
- "Hoisting blocks should not have destroyed preheader");
- return HoistDestinationMap[BB];
- }
-};
-} // namespace
-
+ HoistDestinationMap[BB] = InitialPreheader;
+ return InitialPreheader;
+ }
+ BranchInst *BI = It->first;
+ assert(std::find_if(++It, HoistableBranches.end(), HasBBAsSuccessor) ==
+ HoistableBranches.end() &&
+ "BB is expected to be the target of at most one branch");
+
+ LLVMContext &C = BB->getContext();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+ BasicBlock *CommonSucc = HoistableBranches[BI];
+ BasicBlock *HoistTarget = getOrCreateHoistedBlock(BI->getParent());
+
+ // Create hoisted versions of blocks that currently don't have them
+ auto CreateHoistedBlock = [&](BasicBlock *Orig) {
+ if (HoistDestinationMap.count(Orig))
+ return HoistDestinationMap[Orig];
+ BasicBlock *New =
+ BasicBlock::Create(C, Orig->getName() + ".licm", Orig->getParent());
+ HoistDestinationMap[Orig] = New;
+ DT->addNewBlock(New, HoistTarget);
+ if (CurLoop->getParentLoop())
+ CurLoop->getParentLoop()->addBasicBlockToLoop(New, *LI);
+ ++NumCreatedBlocks;
+ LLVM_DEBUG(dbgs() << "LICM created " << New->getName()
+ << " as hoist destination for " << Orig->getName()
+ << "\n");
+ return New;
+ };
+ BasicBlock *HoistTrueDest = CreateHoistedBlock(TrueDest);
+ BasicBlock *HoistFalseDest = CreateHoistedBlock(FalseDest);
+ BasicBlock *HoistCommonSucc = CreateHoistedBlock(CommonSucc);
+
+ // Link up these blocks with branches.
+ if (!HoistCommonSucc->getTerminator()) {
+      // The new common successor we've generated will branch to whatever the
+ // hoist target branched to.
+ BasicBlock *TargetSucc = HoistTarget->getSingleSuccessor();
+ assert(TargetSucc && "Expected hoist target to have a single successor");
+ HoistCommonSucc->moveBefore(TargetSucc);
+ BranchInst::Create(TargetSucc, HoistCommonSucc);
+ }
+ if (!HoistTrueDest->getTerminator()) {
+ HoistTrueDest->moveBefore(HoistCommonSucc);
+ BranchInst::Create(HoistCommonSucc, HoistTrueDest);
+ }
+ if (!HoistFalseDest->getTerminator()) {
+ HoistFalseDest->moveBefore(HoistCommonSucc);
+ BranchInst::Create(HoistCommonSucc, HoistFalseDest);
+ }
+
+ // If BI is being cloned to what was originally the preheader then
+ // HoistCommonSucc will now be the new preheader.
+ if (HoistTarget == InitialPreheader) {
+ // Phis in the loop header now need to use the new preheader.
+ InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc);
+ if (MSSAU)
+ MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
+ HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget});
+ // The new preheader dominates the loop header.
+ DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc);
+ DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader());
+ DT->changeImmediateDominator(HeaderNode, PreheaderNode);
+ // The preheader hoist destination is now the new preheader, with the
+ // exception of the hoist destination of this branch.
+ for (auto &Pair : HoistDestinationMap)
+ if (Pair.second == InitialPreheader && Pair.first != BI->getParent())
+ Pair.second = HoistCommonSucc;
+ }
+
+ // Now finally clone BI.
+ ReplaceInstWithInst(
+ HoistTarget->getTerminator(),
+ BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition()));
+ ++NumClonedBranches;
+
+ assert(CurLoop->getLoopPreheader() &&
+ "Hoisting blocks should not have destroyed preheader");
+ return HoistDestinationMap[BB];
+ }
+};
+} // namespace
+
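+// A sketch of what ControlFlowHoister produces for a loop-invariant diamond
+// (block and value names are illustrative):
+//
+//   if.then:  %a = ...
+//   if.else:  %b = ...
+//   if.end:   %p = phi i32 [ %a, %if.then ], [ %b, %if.else ]
+//
+// The branch, both arms, and the convergence block are cloned in front of the
+// loop (if.then.licm, if.else.licm, if.end.licm), the clone of if.end becomes
+// the new preheader, and %p is hoisted into it.
+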
// Hoisting/sinking an instruction out of a loop isn't always beneficial. It's
// only worthwhile if the destination block is actually colder than the current
// block.
@@ -817,205 +817,205 @@ static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock,
return true;
}
-/// Walk the specified region of the CFG (defined by all blocks dominated by
-/// the specified block, and that are in the current loop) in depth first
-/// order w.r.t the DominatorTree. This allows us to visit definitions before
-/// uses, allowing us to hoist a loop body in one pass without iteration.
-///
-bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t. the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+///
+bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, BlockFrequencyInfo *BFI,
TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
- SinkAndHoistLICMFlags &Flags,
- OptimizationRemarkEmitter *ORE) {
- // Verify inputs.
- assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
- CurLoop != nullptr && SafetyInfo != nullptr &&
- "Unexpected input to hoistRegion.");
- assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
- "Either AliasSetTracker or MemorySSA should be initialized.");
-
- ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU);
-
- // Keep track of instructions that have been hoisted, as they may need to be
- // re-hoisted if they end up not dominating all of their uses.
- SmallVector<Instruction *, 16> HoistedInstructions;
-
- // For PHI hoisting to work we need to hoist blocks before their successors.
- // We can do this by iterating through the blocks in the loop in reverse
- // post-order.
- LoopBlocksRPO Worklist(CurLoop);
- Worklist.perform(LI);
- bool Changed = false;
- for (BasicBlock *BB : Worklist) {
- // Only need to process the contents of this block if it is not part of a
- // subloop (which would already have been processed).
- if (inSubLoop(BB, CurLoop, LI))
- continue;
-
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
- Instruction &I = *II++;
- // Try constant folding this instruction. If all the operands are
- // constants, it is technically hoistable, but it would be better to
- // just fold it.
- if (Constant *C = ConstantFoldInstruction(
- &I, I.getModule()->getDataLayout(), TLI)) {
- LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
- << '\n');
- if (CurAST)
- CurAST->copyValue(&I, C);
- // FIXME MSSA: Such replacements may make accesses unoptimized (D51960).
- I.replaceAllUsesWith(C);
- if (isInstructionTriviallyDead(&I, TLI))
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
- Changed = true;
- continue;
- }
-
- // Try hoisting the instruction out to the preheader. We can only do
- // this if all of the operands of the instruction are loop invariant and
+ AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
+ ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
+ SinkAndHoistLICMFlags &Flags,
+ OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to hoistRegion.");
+ assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
+ "Either AliasSetTracker or MemorySSA should be initialized.");
+
+ ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU);
+
+ // Keep track of instructions that have been hoisted, as they may need to be
+ // re-hoisted if they end up not dominating all of their uses.
+ SmallVector<Instruction *, 16> HoistedInstructions;
+
+ // For PHI hoisting to work we need to hoist blocks before their successors.
+ // We can do this by iterating through the blocks in the loop in reverse
+ // post-order.
+ LoopBlocksRPO Worklist(CurLoop);
+ Worklist.perform(LI);
+ bool Changed = false;
+ for (BasicBlock *BB : Worklist) {
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
+
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to
+ // just fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
+ << '\n');
+ if (CurAST)
+ CurAST->copyValue(&I, C);
+ // FIXME MSSA: Such replacements may make accesses unoptimized (D51960).
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI))
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ Changed = true;
+ continue;
+ }
+
+ // Try hoisting the instruction out to the preheader. We can only do
+ // this if all of the operands of the instruction are loop invariant and
// if it is safe to hoist the instruction. We also check block frequency
      // to make sure the instruction only gets hoisted into colder blocks.
- // TODO: It may be safe to hoist if we are hoisting to a conditional block
- // and we have accurately duplicated the control flow from the loop header
- // to that block.
- if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
- ORE) &&
+ // TODO: It may be safe to hoist if we are hoisting to a conditional block
+ // and we have accurately duplicated the control flow from the loop header
+ // to that block.
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
+ ORE) &&
worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) &&
- isSafeToExecuteUnconditionally(
- I, DT, CurLoop, SafetyInfo, ORE,
- CurLoop->getLoopPreheader()->getTerminator())) {
- hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
- MSSAU, SE, ORE);
- HoistedInstructions.push_back(&I);
- Changed = true;
- continue;
- }
-
- // Attempt to remove floating point division out of the loop by
- // converting it to a reciprocal multiplication.
- if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
- CurLoop->isLoopInvariant(I.getOperand(1))) {
- auto Divisor = I.getOperand(1);
- auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
- auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
- ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
- SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent());
- ReciprocalDivisor->insertBefore(&I);
-
- auto Product =
- BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
- Product->setFastMathFlags(I.getFastMathFlags());
- SafetyInfo->insertInstructionTo(Product, I.getParent());
- Product->insertAfter(&I);
- I.replaceAllUsesWith(Product);
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
-
- hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
- SafetyInfo, MSSAU, SE, ORE);
- HoistedInstructions.push_back(ReciprocalDivisor);
- Changed = true;
- continue;
- }
-
- auto IsInvariantStart = [&](Instruction &I) {
- using namespace PatternMatch;
- return I.use_empty() &&
- match(&I, m_Intrinsic<Intrinsic::invariant_start>());
- };
- auto MustExecuteWithoutWritesBefore = [&](Instruction &I) {
- return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
- SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop);
- };
- if ((IsInvariantStart(I) || isGuard(&I)) &&
- CurLoop->hasLoopInvariantOperands(&I) &&
- MustExecuteWithoutWritesBefore(I)) {
- hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
- MSSAU, SE, ORE);
- HoistedInstructions.push_back(&I);
- Changed = true;
- continue;
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(&I)) {
- if (CFH.canHoistPHI(PN)) {
- // Redirect incoming blocks first to ensure that we create hoisted
- // versions of those blocks before we hoist the phi.
- for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i)
- PN->setIncomingBlock(
- i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i)));
- hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
- MSSAU, SE, ORE);
- assert(DT->dominates(PN, BB) && "Conditional PHIs not expected");
- Changed = true;
- continue;
- }
- }
-
- // Remember possibly hoistable branches so we can actually hoist them
- // later if needed.
- if (BranchInst *BI = dyn_cast<BranchInst>(&I))
- CFH.registerPossiblyHoistableBranch(BI);
- }
- }
-
- // If we hoisted instructions to a conditional block they may not dominate
- // their uses that weren't hoisted (such as phis where some operands are not
- // loop invariant). If so make them unconditional by moving them to their
- // immediate dominator. We iterate through the instructions in reverse order
- // which ensures that when we rehoist an instruction we rehoist its operands,
- // and also keep track of where in the block we are rehoisting to, to make sure
- // that we rehoist instructions before the instructions that use them.
- Instruction *HoistPoint = nullptr;
- if (ControlFlowHoisting) {
- for (Instruction *I : reverse(HoistedInstructions)) {
- if (!llvm::all_of(I->uses(),
- [&](Use &U) { return DT->dominates(I, U); })) {
- BasicBlock *Dominator =
- DT->getNode(I->getParent())->getIDom()->getBlock();
- if (!HoistPoint || !DT->dominates(HoistPoint->getParent(), Dominator)) {
- if (HoistPoint)
- assert(DT->dominates(Dominator, HoistPoint->getParent()) &&
- "New hoist point expected to dominate old hoist point");
- HoistPoint = Dominator->getTerminator();
- }
- LLVM_DEBUG(dbgs() << "LICM rehoisting to "
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator())) {
+ hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+ MSSAU, SE, ORE);
+ HoistedInstructions.push_back(&I);
+ Changed = true;
+ continue;
+ }
+
+ // Attempt to remove floating point division out of the loop by
+ // converting it to a reciprocal multiplication.
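+ // For example (illustrative IR, hypothetical names): with a loop-invariant
+ // divisor %d,
+ //   %q = fdiv fast double %x, %d
+ // becomes
+ //   %r = fdiv fast double 1.000000e+00, %d   ; hoistable to the preheader
+ //   %q = fmul fast double %x, %r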
+ if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
+ CurLoop->isLoopInvariant(I.getOperand(1))) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product =
+ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ SafetyInfo->insertInstructionTo(Product, I.getParent());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
+ SafetyInfo, MSSAU, SE, ORE);
+ HoistedInstructions.push_back(ReciprocalDivisor);
+ Changed = true;
+ continue;
+ }
+
+ auto IsInvariantStart = [&](Instruction &I) {
+ using namespace PatternMatch;
+ return I.use_empty() &&
+ match(&I, m_Intrinsic<Intrinsic::invariant_start>());
+ };
+ auto MustExecuteWithoutWritesBefore = [&](Instruction &I) {
+ return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
+ SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop);
+ };
+ if ((IsInvariantStart(I) || isGuard(&I)) &&
+ CurLoop->hasLoopInvariantOperands(&I) &&
+ MustExecuteWithoutWritesBefore(I)) {
+ hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+ MSSAU, SE, ORE);
+ HoistedInstructions.push_back(&I);
+ Changed = true;
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ if (CFH.canHoistPHI(PN)) {
+ // Redirect incoming blocks first to ensure that we create hoisted
+ // versions of those blocks before we hoist the phi.
+ for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i)
+ PN->setIncomingBlock(
+ i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i)));
+ hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+ MSSAU, SE, ORE);
+ assert(DT->dominates(PN, BB) && "Conditional PHIs not expected");
+ Changed = true;
+ continue;
+ }
+ }
+
+ // Remember possibly hoistable branches so we can actually hoist them
+ // later if needed.
+ if (BranchInst *BI = dyn_cast<BranchInst>(&I))
+ CFH.registerPossiblyHoistableBranch(BI);
+ }
+ }
+
+ // If we hoisted instructions to a conditional block they may not dominate
+ // their uses that weren't hoisted (such as phis where some operands are not
+ // loop invariant). If so make them unconditional by moving them to their
+ // immediate dominator. We iterate through the instructions in reverse order
+ // which ensures that when we rehoist an instruction we rehoist its operands,
+ // and also keep track of where in the block we are rehoisting to, to make sure
+ // that we rehoist instructions before the instructions that use them.
+ Instruction *HoistPoint = nullptr;
+ if (ControlFlowHoisting) {
+ for (Instruction *I : reverse(HoistedInstructions)) {
+ if (!llvm::all_of(I->uses(),
+ [&](Use &U) { return DT->dominates(I, U); })) {
+ BasicBlock *Dominator =
+ DT->getNode(I->getParent())->getIDom()->getBlock();
+ if (!HoistPoint || !DT->dominates(HoistPoint->getParent(), Dominator)) {
+ if (HoistPoint)
+ assert(DT->dominates(Dominator, HoistPoint->getParent()) &&
+ "New hoist point expected to dominate old hoist point");
+ HoistPoint = Dominator->getTerminator();
+ }
+ LLVM_DEBUG(dbgs() << "LICM rehoisting to "
<< HoistPoint->getParent()->getNameOrAsOperand()
- << ": " << *I << "\n");
- moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE);
- HoistPoint = I;
- Changed = true;
- }
- }
- }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Now that we've finished hoisting make sure that LI and DT are still
- // valid.
-#ifdef EXPENSIVE_CHECKS
- if (Changed) {
- assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
- "Dominator tree verification failed");
- LI->verify(*DT);
- }
-#endif
-
- return Changed;
-}
-
-// Return true if LI is invariant within scope of the loop. LI is invariant if
-// CurLoop is dominated by an invariant.start representing the same memory
-// location and size as the memory location LI loads from, and also the
-// invariant.start has no uses.
-static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
- Loop *CurLoop) {
- Value *Addr = LI->getOperand(0);
- const DataLayout &DL = LI->getModule()->getDataLayout();
+ << ": " << *I << "\n");
+ moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE);
+ HoistPoint = I;
+ Changed = true;
+ }
+ }
+ }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Now that we've finished hoisting make sure that LI and DT are still
+ // valid.
+#ifdef EXPENSIVE_CHECKS
+ if (Changed) {
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+ "Dominator tree verification failed");
+ LI->verify(*DT);
+ }
+#endif
+
+ return Changed;
+}
+
+// Return true if LI is invariant within scope of the loop. LI is invariant if
+// CurLoop is dominated by an invariant.start representing the same memory
+// location and size as the memory location LI loads from, and also the
+// invariant.start has no uses.
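+// For example (illustrative IR, hypothetical names and sizes):
+//   %c = bitcast i32* %p to i8*
+//   call {}* @llvm.invariant.start.p0i8(i64 4, i8* %c)
+//   br label %loop
+// loop:
+//   %v = load i32, i32* %p    ; covered by the dominating, unused invariant.start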
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
-
+
// It is not currently possible for clang to generate an invariant.start
// intrinsic with scalable vector types because we don't support thread local
// sizeless types and we don't permit sizeless types in structs or classes.
@@ -1028,166 +1028,166 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
if (LocSizeInBits.isScalable())
return false;
- // If the type is i8 addrspace(x)*, we know this is the type of the
- // llvm.invariant.start operand.
- auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
- LI->getPointerAddressSpace());
- unsigned BitcastsVisited = 0;
- // Look through bitcasts until we reach the i8* type (this is invariant.start
- // operand type).
- while (Addr->getType() != PtrInt8Ty) {
- auto *BC = dyn_cast<BitCastInst>(Addr);
- // Avoid traversing high number of bitcast uses.
- if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
- return false;
- Addr = BC->getOperand(0);
- }
-
- unsigned UsesVisited = 0;
- // Traverse all uses of the load operand value, to see if invariant.start is
- // one of the uses, and whether it dominates the load instruction.
- for (auto *U : Addr->users()) {
- // Avoid traversing for Load operand with high number of users.
- if (++UsesVisited > MaxNumUsesTraversed)
- return false;
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
- // If there are escaping uses of the invariant.start instruction, the load may
- // be non-invariant.
- if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
- !II->use_empty())
- continue;
+ // If the type is i8 addrspace(x)*, we know this is the type of the
+ // llvm.invariant.start operand.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (this is invariant.start
+ // operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+ // Avoid traversing high number of bitcast uses.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all uses of the load operand value, to see if invariant.start is
+ // one of the uses, and whether it dominates the load instruction.
+ for (auto *U : Addr->users()) {
+ // Avoid traversing for Load operand with high number of users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ // If there are escaping uses of the invariant.start instruction, the load may
+ // be non-invariant.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ !II->use_empty())
+ continue;
ConstantInt *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
// The intrinsic supports having a -1 argument for variable sized objects
// so we should check for that here.
if (InvariantSize->isNegative())
continue;
uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8;
- // Confirm the invariant.start location size contains the load operand size
- // in bits. Also, the invariant.start should dominate the load, and we
- // should not hoist the load out of a loop that contains this dominating
- // invariant.start.
+ // Confirm the invariant.start location size contains the load operand size
+ // in bits. Also, the invariant.start should dominate the load, and we
+ // should not hoist the load out of a loop that contains this dominating
+ // invariant.start.
if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits &&
- DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
- return true;
- }
-
- return false;
-}
-
-namespace {
-/// Return true if-and-only-if we know how to (mechanically) both hoist and
-/// sink a given instruction out of a loop. Does not address legality
-/// concerns such as aliasing or speculation safety.
-bool isHoistableAndSinkableInst(Instruction &I) {
- // Only these instructions are hoistable/sinkable.
- return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
- isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
- isa<BinaryOperator>(I) || isa<SelectInst>(I) ||
- isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
- isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
- isa<InsertValueInst>(I) || isa<FreezeInst>(I));
-}
-/// Return true if all of the alias sets within this AST are known not to
-/// contain a Mod, or if MSSA knows there are no MemoryDefs in the loop.
-bool isReadOnly(AliasSetTracker *CurAST, const MemorySSAUpdater *MSSAU,
- const Loop *L) {
- if (CurAST) {
- for (AliasSet &AS : *CurAST) {
- if (!AS.isForwardingAliasSet() && AS.isMod()) {
- return false;
- }
- }
- return true;
- } else { /*MSSAU*/
- for (auto *BB : L->getBlocks())
- if (MSSAU->getMemorySSA()->getBlockDefs(BB))
- return false;
- return true;
- }
-}
-
-/// Return true if I is the only Instruction with a MemoryAccess in L.
-bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
- const MemorySSAUpdater *MSSAU) {
- for (auto *BB : L->getBlocks())
- if (auto *Accs = MSSAU->getMemorySSA()->getBlockAccesses(BB)) {
- int NotAPhi = 0;
- for (const auto &Acc : *Accs) {
- if (isa<MemoryPhi>(&Acc))
- continue;
- const auto *MUD = cast<MemoryUseOrDef>(&Acc);
- if (MUD->getMemoryInst() != I || NotAPhi++ == 1)
- return false;
- }
- }
- return true;
-}
-}
-
-bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
- Loop *CurLoop, AliasSetTracker *CurAST,
- MemorySSAUpdater *MSSAU,
- bool TargetExecutesOncePerLoop,
- SinkAndHoistLICMFlags *Flags,
- OptimizationRemarkEmitter *ORE) {
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
+namespace {
+/// Return true if-and-only-if we know how to (mechanically) both hoist and
+/// sink a given instruction out of a loop. Does not address legality
+/// concerns such as aliasing or speculation safety.
+bool isHoistableAndSinkableInst(Instruction &I) {
+ // Only these instructions are hoistable/sinkable.
+ return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
+ isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
+ isa<BinaryOperator>(I) || isa<SelectInst>(I) ||
+ isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+ isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
+ isa<InsertValueInst>(I) || isa<FreezeInst>(I));
+}
+/// Return true if all of the alias sets within this AST are known not to
+/// contain a Mod, or if MSSA knows there are no MemoryDefs in the loop.
+bool isReadOnly(AliasSetTracker *CurAST, const MemorySSAUpdater *MSSAU,
+ const Loop *L) {
+ if (CurAST) {
+ for (AliasSet &AS : *CurAST) {
+ if (!AS.isForwardingAliasSet() && AS.isMod()) {
+ return false;
+ }
+ }
+ return true;
+ } else { /*MSSAU*/
+ for (auto *BB : L->getBlocks())
+ if (MSSAU->getMemorySSA()->getBlockDefs(BB))
+ return false;
+ return true;
+ }
+}
+
+/// Return true if I is the only Instruction with a MemoryAccess in L.
+bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
+ const MemorySSAUpdater *MSSAU) {
+ for (auto *BB : L->getBlocks())
+ if (auto *Accs = MSSAU->getMemorySSA()->getBlockAccesses(BB)) {
+ int NotAPhi = 0;
+ for (const auto &Acc : *Accs) {
+ if (isa<MemoryPhi>(&Acc))
+ continue;
+ const auto *MUD = cast<MemoryUseOrDef>(&Acc);
+ if (MUD->getMemoryInst() != I || NotAPhi++ == 1)
+ return false;
+ }
+ }
+ return true;
+}
+}
+
+bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ MemorySSAUpdater *MSSAU,
+ bool TargetExecutesOncePerLoop,
+ SinkAndHoistLICMFlags *Flags,
+ OptimizationRemarkEmitter *ORE) {
assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
"Either AliasSetTracker or MemorySSA should be initialized.");
- // If we don't understand the instruction, bail early.
- if (!isHoistableAndSinkableInst(I))
- return false;
-
- MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
- if (MSSA)
- assert(Flags != nullptr && "Flags cannot be null.");
-
- // Loads have extra constraints we have to verify before we can hoist them.
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isUnordered())
- return false; // Don't sink/hoist volatile or ordered atomic loads!
-
- // Loads from constant memory are always safe to move, even if they end up
- // in the same alias set as something that ends up being modified.
- if (AA->pointsToConstantMemory(LI->getOperand(0)))
- return true;
- if (LI->hasMetadata(LLVMContext::MD_invariant_load))
- return true;
-
- if (LI->isAtomic() && !TargetExecutesOncePerLoop)
- return false; // Don't risk duplicating unordered loads
-
- // This checks for an invariant.start dominating the load.
- if (isLoadInvariantInLoop(LI, DT, CurLoop))
- return true;
-
- bool Invalidated;
- if (CurAST)
- Invalidated = pointerInvalidatedByLoop(MemoryLocation::get(LI), CurAST,
- CurLoop, AA);
- else
- Invalidated = pointerInvalidatedByLoopWithMSSA(
+ // If we don't understand the instruction, bail early.
+ if (!isHoistableAndSinkableInst(I))
+ return false;
+
+ MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
+ if (MSSA)
+ assert(Flags != nullptr && "Flags cannot be null.");
+
+ // Loads have extra constraints we have to verify before we can hoist them.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isUnordered())
+ return false; // Don't sink/hoist volatile or ordered atomic loads!
+
+ // Loads from constant memory are always safe to move, even if they end up
+ // in the same alias set as something that ends up being modified.
+ if (AA->pointsToConstantMemory(LI->getOperand(0)))
+ return true;
+ if (LI->hasMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ if (LI->isAtomic() && !TargetExecutesOncePerLoop)
+ return false; // Don't risk duplicating unordered loads
+
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
+ bool Invalidated;
+ if (CurAST)
+ Invalidated = pointerInvalidatedByLoop(MemoryLocation::get(LI), CurAST,
+ CurLoop, AA);
+ else
+ Invalidated = pointerInvalidatedByLoopWithMSSA(
MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)), CurLoop, I, *Flags);
- // Check loop-invariant address because this may also be a sinkable load
- // whose address is not necessarily loop-invariant.
- if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
- << "failed to move load with loop-invariant address "
- "because the loop may invalidate its value";
- });
-
- return !Invalidated;
- } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- // Don't sink or hoist dbg info; it's legal, but not useful.
- if (isa<DbgInfoIntrinsic>(I))
- return false;
-
- // Don't sink calls which can throw.
- if (CI->mayThrow())
- return false;
-
+ // Check loop-invariant address because this may also be a sinkable load
+ // whose address is not necessarily loop-invariant.
+ if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
+ << "failed to move load with loop-invariant address "
+ "because the loop may invalidate its value";
+ });
+
+ return !Invalidated;
+ } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // Don't sink or hoist dbg info; it's legal, but not useful.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // Don't sink calls which can throw.
+ if (CI->mayThrow())
+ return false;
+
// Convergent attribute has been used on operations that involve
// inter-thread communication which results are implicitly affected by the
// enclosing control flows. It is not safe to hoist or sink such operations
@@ -1195,526 +1195,526 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (CI->isConvergent())
return false;
- using namespace PatternMatch;
- if (match(CI, m_Intrinsic<Intrinsic::assume>()))
- // Assumes don't actually alias anything or throw
- return true;
-
- if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
- // Widenable conditions don't actually alias anything or throw
- return true;
-
- // Handle simple cases by querying alias analysis.
- FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
- if (Behavior == FMRB_DoesNotAccessMemory)
- return true;
- if (AAResults::onlyReadsMemory(Behavior)) {
- // A readonly argmemonly function only reads from memory pointed to by
- // its arguments with arbitrary offsets. If we can prove there are no
- // writes to this memory in the loop, we can hoist or sink.
- if (AAResults::onlyAccessesArgPointees(Behavior)) {
- // TODO: expand to writeable arguments
- for (Value *Op : CI->arg_operands())
- if (Op->getType()->isPointerTy()) {
- bool Invalidated;
- if (CurAST)
- Invalidated = pointerInvalidatedByLoop(
+ using namespace PatternMatch;
+ if (match(CI, m_Intrinsic<Intrinsic::assume>()))
+ // Assumes don't actually alias anything or throw
+ return true;
+
+ if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
+ // Widenable conditions don't actually alias anything or throw
+ return true;
+
+ // Handle simple cases by querying alias analysis.
+ FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
+ if (Behavior == FMRB_DoesNotAccessMemory)
+ return true;
+ if (AAResults::onlyReadsMemory(Behavior)) {
+ // A readonly argmemonly function only reads from memory pointed to by
+      // its arguments with arbitrary offsets. If we can prove there are no
+ // writes to this memory in the loop, we can hoist or sink.
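+      // For example (illustrative; assumes @cmp is declared readonly argmemonly):
+      //   %r = call i32 @cmp(i8* %p, i8* %q)
+      // can be hoisted or sunk if neither %p nor %q is written inside the loop.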
+ if (AAResults::onlyAccessesArgPointees(Behavior)) {
+ // TODO: expand to writeable arguments
+ for (Value *Op : CI->arg_operands())
+ if (Op->getType()->isPointerTy()) {
+ bool Invalidated;
+ if (CurAST)
+ Invalidated = pointerInvalidatedByLoop(
MemoryLocation::getBeforeOrAfter(Op), CurAST, CurLoop, AA);
- else
- Invalidated = pointerInvalidatedByLoopWithMSSA(
+ else
+ Invalidated = pointerInvalidatedByLoopWithMSSA(
MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I,
- *Flags);
- if (Invalidated)
- return false;
- }
- return true;
- }
-
- // If this call only reads from memory and there are no writes to memory
- // in the loop, we can hoist or sink the call as appropriate.
- if (isReadOnly(CurAST, MSSAU, CurLoop))
- return true;
- }
-
- // FIXME: This should use mod/ref information to see if we can hoist or
- // sink the call.
-
- return false;
- } else if (auto *FI = dyn_cast<FenceInst>(&I)) {
- // Fences alias (most) everything to provide ordering. For the moment,
- // just give up if there are any other memory operations in the loop.
- if (CurAST) {
- auto Begin = CurAST->begin();
- assert(Begin != CurAST->end() && "must contain FI");
- if (std::next(Begin) != CurAST->end())
- // constant memory for instance, TODO: handle better
- return false;
- auto *UniqueI = Begin->getUniqueInstruction();
- if (!UniqueI)
- // other memory op, give up
- return false;
- (void)FI; // suppress unused variable warning
- assert(UniqueI == FI && "AS must contain FI");
- return true;
- } else // MSSAU
- return isOnlyMemoryAccess(FI, CurLoop, MSSAU);
- } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isUnordered())
- return false; // Don't sink/hoist volatile or ordered atomic store!
-
- // We can only hoist a store that we can prove writes a value which is not
- // read or overwritten within the loop. For those cases, we fall back to
- // load/store promotion instead. TODO: We can extend this to cases where
- // there is exactly one write to the location and that write dominates an
- // arbitrary number of reads in the loop.
- if (CurAST) {
- auto &AS = CurAST->getAliasSetFor(MemoryLocation::get(SI));
-
- if (AS.isRef() || !AS.isMustAlias())
- // Quick exit test, handled by the full path below as well.
- return false;
- auto *UniqueI = AS.getUniqueInstruction();
- if (!UniqueI)
- // other memory op, give up
- return false;
- assert(UniqueI == SI && "AS must contain SI");
- return true;
- } else { // MSSAU
- if (isOnlyMemoryAccess(SI, CurLoop, MSSAU))
- return true;
+ *Flags);
+ if (Invalidated)
+ return false;
+ }
+ return true;
+ }
+
+ // If this call only reads from memory and there are no writes to memory
+ // in the loop, we can hoist or sink the call as appropriate.
+ if (isReadOnly(CurAST, MSSAU, CurLoop))
+ return true;
+ }
+
+ // FIXME: This should use mod/ref information to see if we can hoist or
+ // sink the call.
+
+ return false;
+ } else if (auto *FI = dyn_cast<FenceInst>(&I)) {
+ // Fences alias (most) everything to provide ordering. For the moment,
+ // just give up if there are any other memory operations in the loop.
+ if (CurAST) {
+ auto Begin = CurAST->begin();
+ assert(Begin != CurAST->end() && "must contain FI");
+ if (std::next(Begin) != CurAST->end())
+ // constant memory for instance, TODO: handle better
+ return false;
+ auto *UniqueI = Begin->getUniqueInstruction();
+ if (!UniqueI)
+ // other memory op, give up
+ return false;
+ (void)FI; // suppress unused variable warning
+ assert(UniqueI == FI && "AS must contain FI");
+ return true;
+ } else // MSSAU
+ return isOnlyMemoryAccess(FI, CurLoop, MSSAU);
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isUnordered())
+ return false; // Don't sink/hoist volatile or ordered atomic store!
+
+ // We can only hoist a store that we can prove writes a value which is not
+    // read or overwritten within the loop. For those cases, we fall back to
+    // load/store promotion instead. TODO: We can extend this to cases where
+ // there is exactly one write to the location and that write dominates an
+ // arbitrary number of reads in the loop.
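+    // For example (illustrative IR): if the only memory access in the loop is
+    //   store i32 %inv, i32* %g
+    // the store is a hoisting/sinking candidate; if the loop also reads %g,
+    // load/store promotion handles it instead.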
+ if (CurAST) {
+ auto &AS = CurAST->getAliasSetFor(MemoryLocation::get(SI));
+
+ if (AS.isRef() || !AS.isMustAlias())
+ // Quick exit test, handled by the full path below as well.
+ return false;
+ auto *UniqueI = AS.getUniqueInstruction();
+ if (!UniqueI)
+ // other memory op, give up
+ return false;
+ assert(UniqueI == SI && "AS must contain SI");
+ return true;
+ } else { // MSSAU
+ if (isOnlyMemoryAccess(SI, CurLoop, MSSAU))
+ return true;
// If there are more accesses than the Promotion cap or no "quota" to
// check clobber, then give up as we're not walking a list that long.
if (Flags->tooManyMemoryAccesses() || Flags->tooManyClobberingCalls())
- return false;
- // If there are interfering Uses (i.e. their defining access is in the
- // loop), or ordered loads (stored as Defs!), don't move this store.
- // Could do better here, but this is conservatively correct.
- // TODO: Cache set of Uses on the first walk in runOnLoop, update when
- // moving accesses. Can also extend to dominating uses.
- auto *SIMD = MSSA->getMemoryAccess(SI);
- for (auto *BB : CurLoop->getBlocks())
- if (auto *Accesses = MSSA->getBlockAccesses(BB)) {
- for (const auto &MA : *Accesses)
- if (const auto *MU = dyn_cast<MemoryUse>(&MA)) {
- auto *MD = MU->getDefiningAccess();
- if (!MSSA->isLiveOnEntryDef(MD) &&
- CurLoop->contains(MD->getBlock()))
- return false;
- // Disable hoisting past potentially interfering loads. Optimized
- // Uses may point to an access outside the loop, as getClobbering
- // checks the previous iteration when walking the backedge.
- // FIXME: More precise: no Uses that alias SI.
+ return false;
+ // If there are interfering Uses (i.e. their defining access is in the
+ // loop), or ordered loads (stored as Defs!), don't move this store.
+ // Could do better here, but this is conservatively correct.
+ // TODO: Cache set of Uses on the first walk in runOnLoop, update when
+ // moving accesses. Can also extend to dominating uses.
+ auto *SIMD = MSSA->getMemoryAccess(SI);
+ for (auto *BB : CurLoop->getBlocks())
+ if (auto *Accesses = MSSA->getBlockAccesses(BB)) {
+ for (const auto &MA : *Accesses)
+ if (const auto *MU = dyn_cast<MemoryUse>(&MA)) {
+ auto *MD = MU->getDefiningAccess();
+ if (!MSSA->isLiveOnEntryDef(MD) &&
+ CurLoop->contains(MD->getBlock()))
+ return false;
+ // Disable hoisting past potentially interfering loads. Optimized
+ // Uses may point to an access outside the loop, as getClobbering
+ // checks the previous iteration when walking the backedge.
+ // FIXME: More precise: no Uses that alias SI.
if (!Flags->getIsSink() && !MSSA->dominates(SIMD, MU))
- return false;
- } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
- if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
- (void)LI; // Silence warning.
- assert(!LI->isUnordered() && "Expected unordered load");
- return false;
- }
- // Any call, while it may not be clobbering SI, may still be a use.
- if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
- // Check if the call may read from the memory location written
- // to by SI. Check CI's attributes and arguments; the number of
- // such checks performed is limited above by NoOfMemAccTooLarge.
- ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI));
- if (isModOrRefSet(MRI))
- return false;
- }
- }
- }
- auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI);
+ return false;
+ } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
+ if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
+ (void)LI; // Silence warning.
+ assert(!LI->isUnordered() && "Expected unordered load");
+ return false;
+ }
+            // Any call, while it may not be clobbering SI, may still be a use.
+ if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
+              // Check if the call may read from the memory location written
+ // to by SI. Check CI's attributes and arguments; the number of
+ // such checks performed is limited above by NoOfMemAccTooLarge.
+ ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI));
+ if (isModOrRefSet(MRI))
+ return false;
+ }
+ }
+ }
+ auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI);
Flags->incrementClobberingCalls();
- // If there are no clobbering Defs in the loop, store is safe to hoist.
- return MSSA->isLiveOnEntryDef(Source) ||
- !CurLoop->contains(Source->getBlock());
- }
- }
-
- assert(!I.mayReadOrWriteMemory() && "unhandled aliasing");
-
- // We've established mechanical ability and aliasing, it's up to the caller
- // to check fault safety
- return true;
-}
-
-/// Returns true if a PHINode is trivially replaceable with an
-/// Instruction.
-/// This is true when all incoming values are that instruction.
-/// This pattern occurs most often with LCSSA PHI nodes.
-///
-static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
- for (const Value *IncValue : PN.incoming_values())
- if (IncValue != &I)
- return false;
-
- return true;
-}
-
-/// Return true if the instruction is free in the loop.
-static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
- const TargetTransformInfo *TTI) {
-
- if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- if (TTI->getUserCost(GEP, TargetTransformInfo::TCK_SizeAndLatency) !=
- TargetTransformInfo::TCC_Free)
- return false;
- // For a GEP, we cannot simply use getUserCost because currently it
- // optimistically assumes that a GEP will fold into an addressing mode
- // regardless of its users.
- const BasicBlock *BB = GEP->getParent();
- for (const User *U : GEP->users()) {
- const Instruction *UI = cast<Instruction>(U);
- if (CurLoop->contains(UI) &&
- (BB != UI->getParent() ||
- (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
- return false;
- }
- return true;
- } else
- return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
- TargetTransformInfo::TCC_Free;
-}
-
-/// Return true if the only users of this instruction are outside of
-/// the loop. If this is true, we can sink the instruction to the exit
-/// blocks of the loop.
-///
-/// We also return true if the instruction could be folded away in lowering.
-/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
-static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- TargetTransformInfo *TTI, bool &FreeInLoop) {
- const auto &BlockColors = SafetyInfo->getBlockColors();
- bool IsFree = isFreeInLoop(I, CurLoop, TTI);
- for (const User *U : I.users()) {
- const Instruction *UI = cast<Instruction>(U);
- if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
- const BasicBlock *BB = PN->getParent();
- // We cannot sink uses in catchswitches.
- if (isa<CatchSwitchInst>(BB->getTerminator()))
- return false;
-
- // We need to sink a callsite to a unique funclet. Avoid sinking if the
- // phi use is too muddled.
- if (isa<CallInst>(I))
- if (!BlockColors.empty() &&
- BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
- return false;
- }
-
- if (CurLoop->contains(UI)) {
- if (IsFree) {
- FreeInLoop = true;
- continue;
- }
- return false;
- }
- }
- return true;
-}
-
-static Instruction *cloneInstructionInExitBlock(
- Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
- const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU) {
- Instruction *New;
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- const auto &BlockColors = SafetyInfo->getBlockColors();
-
- // Sinking call-sites need to be handled differently from other
- // instructions. The cloned call-site needs a funclet bundle operand
- // appropriate for its location in the CFG.
- SmallVector<OperandBundleDef, 1> OpBundles;
- for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
- BundleIdx != BundleEnd; ++BundleIdx) {
- OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
- if (Bundle.getTagID() == LLVMContext::OB_funclet)
- continue;
-
- OpBundles.emplace_back(Bundle);
- }
-
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
- assert(CV.size() == 1 && "non-unique color for exit block!");
- BasicBlock *BBColor = CV.front();
- Instruction *EHPad = BBColor->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
-
- New = CallInst::Create(CI, OpBundles);
- } else {
- New = I.clone();
- }
-
- ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
- if (!I.getName().empty())
- New->setName(I.getName() + ".le");
-
- if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
- // Create a new MemoryAccess and let MemorySSA set its defining access.
- MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
- New, nullptr, New->getParent(), MemorySSA::Beginning);
- if (NewMemAcc) {
- if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
- MSSAU->insertDef(MemDef, /*RenameUses=*/true);
- else {
- auto *MemUse = cast<MemoryUse>(NewMemAcc);
- MSSAU->insertUse(MemUse, /*RenameUses=*/true);
- }
- }
- }
-
- // Build LCSSA PHI nodes for any in-loop operands. Note that this is
- // particularly cheap because we can rip off the PHI node that we're
- // replacing for the number and blocks of the predecessors.
- // OPT: If this shows up in a profile, we can instead finish sinking all
- // invariant instructions, and then walk their operands to re-establish
- // LCSSA. That will eliminate creating PHI nodes just to nuke them when
- // sinking bottom-up.
- for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
- ++OI)
- if (Instruction *OInst = dyn_cast<Instruction>(*OI))
- if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
- if (!OLoop->contains(&PN)) {
- PHINode *OpPN =
- PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
- OInst->getName() + ".lcssa", &ExitBlock.front());
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
- OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
- *OI = OpPN;
- }
- return New;
-}
-
-static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
- AliasSetTracker *AST, MemorySSAUpdater *MSSAU) {
- if (AST)
- AST->deleteValue(&I);
- if (MSSAU)
- MSSAU->removeMemoryAccess(&I);
- SafetyInfo.removeInstruction(&I);
- I.eraseFromParent();
-}
-
-static void moveInstructionBefore(Instruction &I, Instruction &Dest,
- ICFLoopSafetyInfo &SafetyInfo,
- MemorySSAUpdater *MSSAU,
- ScalarEvolution *SE) {
- SafetyInfo.removeInstruction(&I);
- SafetyInfo.insertInstructionTo(&I, Dest.getParent());
- I.moveBefore(&Dest);
- if (MSSAU)
- if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
- MSSAU->getMemorySSA()->getMemoryAccess(&I)))
- MSSAU->moveToPlace(OldMemAcc, Dest.getParent(),
- MemorySSA::BeforeTerminator);
- if (SE)
- SE->forgetValue(&I);
-}
-
-static Instruction *sinkThroughTriviallyReplaceablePHI(
- PHINode *TPN, Instruction *I, LoopInfo *LI,
- SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
- const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop,
- MemorySSAUpdater *MSSAU) {
- assert(isTriviallyReplaceablePHI(*TPN, *I) &&
- "Expect only trivially replaceable PHI");
- BasicBlock *ExitBlock = TPN->getParent();
- Instruction *New;
- auto It = SunkCopies.find(ExitBlock);
- if (It != SunkCopies.end())
- New = It->second;
- else
- New = SunkCopies[ExitBlock] = cloneInstructionInExitBlock(
- *I, *ExitBlock, *TPN, LI, SafetyInfo, MSSAU);
- return New;
-}
-
-static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
- BasicBlock *BB = PN->getParent();
- if (!BB->canSplitPredecessors())
- return false;
- // It's not impossible to split EHPad blocks, but if BlockColors already exist
- // it requires updating BlockColors for all offspring blocks accordingly. By
- // skipping such a corner case, we can make updating BlockColors after splitting
- // predecessor fairly simple.
- if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad())
- return false;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *BBPred = *PI;
- if (isa<IndirectBrInst>(BBPred->getTerminator()) ||
- isa<CallBrInst>(BBPred->getTerminator()))
- return false;
- }
- return true;
-}
-
-static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
- LoopInfo *LI, const Loop *CurLoop,
- LoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU) {
-#ifndef NDEBUG
- SmallVector<BasicBlock *, 32> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-#endif
- BasicBlock *ExitBB = PN->getParent();
- assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
-
- // Split predecessors of the loop exit so that instructions in the loop are
- // exposed to exit blocks through trivially replaceable PHIs while keeping the
- // loop in the canonical form where each predecessor of each exit block should
- // be contained within the loop. For example, this will convert the loop below
- // from
- //
- // LB1:
- // %v1 =
- // br %LE, %LB2
- // LB2:
- // %v2 =
- // br %LE, %LB1
- // LE:
- // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
- //
- // to
- //
- // LB1:
- // %v1 =
- // br %LE.split, %LB2
- // LB2:
- // %v2 =
- // br %LE.split2, %LB1
- // LE.split:
- // %p1 = phi [%v1, %LB1] <-- trivially replaceable
- // br %LE
- // LE.split2:
- // %p2 = phi [%v2, %LB2] <-- trivially replaceable
- // br %LE
- // LE:
- // %p = phi [%p1, %LE.split], [%p2, %LE.split2]
- //
- const auto &BlockColors = SafetyInfo->getBlockColors();
- SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
- while (!PredBBs.empty()) {
- BasicBlock *PredBB = *PredBBs.begin();
- assert(CurLoop->contains(PredBB) &&
- "Expect all predecessors are in the loop");
- if (PN->getBasicBlockIndex(PredBB) >= 0) {
- BasicBlock *NewPred = SplitBlockPredecessors(
- ExitBB, PredBB, ".split.loop.exit", DT, LI, MSSAU, true);
- // Since we do not allow splitting EH-block with BlockColors in
- // canSplitPredecessors(), we can simply assign predecessor's color to
- // the new block.
- if (!BlockColors.empty())
- // Grab a reference to the ColorVector to be inserted before getting the
- // reference to the vector we are copying because inserting the new
- // element in BlockColors might cause the map to be reallocated.
- SafetyInfo->copyColors(NewPred, PredBB);
- }
- PredBBs.remove(PredBB);
- }
-}
-
-/// When an instruction is found to only be used outside of the loop, this
-/// function moves it to the exit blocks and patches up SSA form as needed.
-/// This method is guaranteed to remove the original instruction from its
-/// position, and may either delete it or move it to outside of the loop.
-///
-static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ // If there are no clobbering Defs in the loop, store is safe to hoist.
+ return MSSA->isLiveOnEntryDef(Source) ||
+ !CurLoop->contains(Source->getBlock());
+ }
+ }
+
+ assert(!I.mayReadOrWriteMemory() && "unhandled aliasing");
+
+ // We've established mechanical ability and aliasing, it's up to the caller
+ // to check fault safety
+ return true;
+}
+
+/// Returns true if a PHINode is trivially replaceable with an
+/// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
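+/// For example (illustrative LCSSA IR): for an in-loop value %inv,
+///   %inv.lcssa = phi i32 [ %inv, %exiting1 ], [ %inv, %exiting2 ]
+/// is trivially replaceable because every incoming value is %inv itself.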
+///
+static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
+ for (const Value *IncValue : PN.incoming_values())
+ if (IncValue != &I)
+ return false;
+
+ return true;
+}
+
+/// Return true if the instruction is free in the loop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const TargetTransformInfo *TTI) {
+
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (TTI->getUserCost(GEP, TargetTransformInfo::TCK_SizeAndLatency) !=
+ TargetTransformInfo::TCC_Free)
+ return false;
+ // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assumes that a GEP will fold into an addressing mode
+ // regardless of its users.
+ const BasicBlock *BB = GEP->getParent();
+ for (const User *U : GEP->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (CurLoop->contains(UI) &&
+ (BB != UI->getParent() ||
+ (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+ return false;
+ }
+ return true;
+ } else
+ return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+ TargetTransformInfo::TCC_Free;
+}
+
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
+///
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop) {
+ const auto &BlockColors = SafetyInfo->getBlockColors();
+ bool IsFree = isFreeInLoop(I, CurLoop, TTI);
+ for (const User *U : I.users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ const BasicBlock *BB = PN->getParent();
+ // We cannot sink uses in catchswitches.
+ if (isa<CatchSwitchInst>(BB->getTerminator()))
+ return false;
+
+ // We need to sink a callsite to a unique funclet. Avoid sinking if the
+ // phi use is too muddled.
+ if (isa<CallInst>(I))
+ if (!BlockColors.empty() &&
+ BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
+ return false;
+ }
+
+ if (CurLoop->contains(UI)) {
+ if (IsFree) {
+ FreeInLoop = true;
+ continue;
+ }
+ return false;
+ }
+ }
+ return true;
+}
+
+static Instruction *cloneInstructionInExitBlock(
+ Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU) {
+ Instruction *New;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const auto &BlockColors = SafetyInfo->getBlockColors();
+
+ // Sinking call-sites need to be handled differently from other
+ // instructions. The cloned call-site needs a funclet bundle operand
+ // appropriate for its location in the CFG.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
+ BundleIdx != BundleEnd; ++BundleIdx) {
+ OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+ assert(CV.size() == 1 && "non-unique color for exit block!");
+ BasicBlock *BBColor = CV.front();
+ Instruction *EHPad = BBColor->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ New = CallInst::Create(CI, OpBundles);
+ } else {
+ New = I.clone();
+ }
+
+ ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
+ if (!I.getName().empty())
+ New->setName(I.getName() + ".le");
+
+ if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
+ // Create a new MemoryAccess and let MemorySSA set its defining access.
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ New, nullptr, New->getParent(), MemorySSA::Beginning);
+ if (NewMemAcc) {
+ if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
+ MSSAU->insertDef(MemDef, /*RenameUses=*/true);
+ else {
+ auto *MemUse = cast<MemoryUse>(NewMemAcc);
+ MSSAU->insertUse(MemUse, /*RenameUses=*/true);
+ }
+ }
+ }
+
+ // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+ // particularly cheap because we can rip off the PHI node that we're
+ // replacing for the number and blocks of the predecessors.
+ // OPT: If this shows up in a profile, we can instead finish sinking all
+ // invariant instructions, and then walk their operands to re-establish
+ // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+ // sinking bottom-up.
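+  // For example (illustrative): if the clone uses an in-loop value %op, an
+  // LCSSA PHI such as
+  //   %op.lcssa = phi i32 [ %op, %pred1 ], [ %op, %pred2 ]
+  // is created at the top of the exit block and the clone's operand is rewired
+  // to it, keeping the function in LCSSA form.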
+ for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
+ ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(*OI))
+ if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
+ if (!OLoop->contains(&PN)) {
+ PHINode *OpPN =
+ PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
+ OInst->getName() + ".lcssa", &ExitBlock.front());
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
+ *OI = OpPN;
+ }
+ return New;
+}
+
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+ AliasSetTracker *AST, MemorySSAUpdater *MSSAU) {
+ if (AST)
+ AST->deleteValue(&I);
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(&I);
+ SafetyInfo.removeInstruction(&I);
+ I.eraseFromParent();
+}
+
+static void moveInstructionBefore(Instruction &I, Instruction &Dest,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater *MSSAU,
+ ScalarEvolution *SE) {
+ SafetyInfo.removeInstruction(&I);
+ SafetyInfo.insertInstructionTo(&I, Dest.getParent());
+ I.moveBefore(&Dest);
+ if (MSSAU)
+ if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
+ MSSAU->getMemorySSA()->getMemoryAccess(&I)))
+ MSSAU->moveToPlace(OldMemAcc, Dest.getParent(),
+ MemorySSA::BeforeTerminator);
+ if (SE)
+ SE->forgetValue(&I);
+}
+
+static Instruction *sinkThroughTriviallyReplaceablePHI(
+ PHINode *TPN, Instruction *I, LoopInfo *LI,
+ SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+ const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop,
+ MemorySSAUpdater *MSSAU) {
+ assert(isTriviallyReplaceablePHI(*TPN, *I) &&
+ "Expect only trivially replaceable PHI");
+ BasicBlock *ExitBlock = TPN->getParent();
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock);
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] = cloneInstructionInExitBlock(
+ *I, *ExitBlock, *TPN, LI, SafetyInfo, MSSAU);
+ return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
+ BasicBlock *BB = PN->getParent();
+ if (!BB->canSplitPredecessors())
+ return false;
+ // It's not impossible to split EHPad blocks, but if BlockColors already exist
+  // it requires updating BlockColors for all offspring blocks accordingly. By
+  // skipping such a corner case, we can make updating BlockColors after splitting
+ // predecessor fairly simple.
+ if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad())
+ return false;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *BBPred = *PI;
+ if (isa<IndirectBrInst>(BBPred->getTerminator()) ||
+ isa<CallBrInst>(BBPred->getTerminator()))
+ return false;
+ }
+ return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+ LoopInfo *LI, const Loop *CurLoop,
+ LoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater *MSSAU) {
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+ BasicBlock *ExitBB = PN->getParent();
+ assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+ // exposed to exit blocks through trivially replaceable PHIs while keeping the
+ // loop in the canonical form where each predecessor of each exit block should
+ // be contained within the loop. For example, this will convert the loop below
+ // from
+ //
+ // LB1:
+ // %v1 =
+ // br %LE, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE, %LB1
+ // LE:
+ // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
+ //
+ // to
+ //
+ // LB1:
+ // %v1 =
+ // br %LE.split, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE.split2, %LB1
+ // LE.split:
+ // %p1 = phi [%v1, %LB1] <-- trivially replaceable
+ // br %LE
+ // LE.split2:
+ // %p2 = phi [%v2, %LB2] <-- trivially replaceable
+ // br %LE
+ // LE:
+ // %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+ //
+ const auto &BlockColors = SafetyInfo->getBlockColors();
+ SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+ while (!PredBBs.empty()) {
+ BasicBlock *PredBB = *PredBBs.begin();
+ assert(CurLoop->contains(PredBB) &&
+ "Expect all predecessors are in the loop");
+ if (PN->getBasicBlockIndex(PredBB) >= 0) {
+ BasicBlock *NewPred = SplitBlockPredecessors(
+ ExitBB, PredBB, ".split.loop.exit", DT, LI, MSSAU, true);
+ // Since we do not allow splitting EH-block with BlockColors in
+ // canSplitPredecessors(), we can simply assign predecessor's color to
+ // the new block.
+ if (!BlockColors.empty())
+ // Grab a reference to the ColorVector to be inserted before getting the
+ // reference to the vector we are copying because inserting the new
+ // element in BlockColors might cause the map to be reallocated.
+ SafetyInfo->copyColors(NewPred, PredBB);
+ }
+ PredBBs.remove(PredBB);
+ }
+}
+
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, const Loop *CurLoop,
ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
OptimizationRemarkEmitter *ORE) {
- LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
- << "sinking " << ore::NV("Inst", &I);
- });
- bool Changed = false;
- if (isa<LoadInst>(I))
- ++NumMovedLoads;
- else if (isa<CallInst>(I))
- ++NumMovedCalls;
- ++NumSunk;
-
- // Iterate over users to be ready for actual sinking. Replace uses in (or via)
- // unreachable blocks with undef and make all user PHIs trivially replaceable.
- SmallPtrSet<Instruction *, 8> VisitedUsers;
- for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
- auto *User = cast<Instruction>(*UI);
- Use &U = UI.getUse();
- ++UI;
-
- if (VisitedUsers.count(User) || CurLoop->contains(User))
- continue;
-
- if (!DT->isReachableFromEntry(User->getParent())) {
- U = UndefValue::get(I.getType());
- Changed = true;
- continue;
- }
-
- // The user must be a PHI node.
- PHINode *PN = cast<PHINode>(User);
-
- // Surprisingly, instructions can be used outside of loops without any
- // exits. This can only happen in PHI nodes if the incoming block is
- // unreachable.
- BasicBlock *BB = PN->getIncomingBlock(U);
- if (!DT->isReachableFromEntry(BB)) {
- U = UndefValue::get(I.getType());
- Changed = true;
- continue;
- }
-
- VisitedUsers.insert(PN);
- if (isTriviallyReplaceablePHI(*PN, I))
- continue;
-
- if (!canSplitPredecessors(PN, SafetyInfo))
- return Changed;
-
- // Split predecessors of the PHI so that we can make users trivially
- // replaceable.
- splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, MSSAU);
-
- // Should rebuild the iterators, as they may be invalidated by
- // splitPredecessorsOfLoopExit().
- UI = I.user_begin();
- UE = I.user_end();
- }
-
- if (VisitedUsers.empty())
- return Changed;
-
-#ifndef NDEBUG
- SmallVector<BasicBlock *, 32> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-#endif
-
- // Clones of this instruction. Don't create more than one per exit block!
- SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
-
- // If this instruction is only used outside of the loop, then all users are
- // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
- // the instruction.
+ LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+ << "sinking " << ore::NV("Inst", &I);
+ });
+ bool Changed = false;
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumSunk;
+
+  // Iterate over users to be ready for actual sinking. Replace uses in (or via)
+ // unreachable blocks with undef and make all user PHIs trivially replaceable.
+ SmallPtrSet<Instruction *, 8> VisitedUsers;
+ for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+ auto *User = cast<Instruction>(*UI);
+ Use &U = UI.getUse();
+ ++UI;
+
+ if (VisitedUsers.count(User) || CurLoop->contains(User))
+ continue;
+
+ if (!DT->isReachableFromEntry(User->getParent())) {
+ U = UndefValue::get(I.getType());
+ Changed = true;
+ continue;
+ }
+
+ // The user must be a PHI node.
+ PHINode *PN = cast<PHINode>(User);
+
+ // Surprisingly, instructions can be used outside of loops without any
+ // exits. This can only happen in PHI nodes if the incoming block is
+ // unreachable.
+ BasicBlock *BB = PN->getIncomingBlock(U);
+ if (!DT->isReachableFromEntry(BB)) {
+ U = UndefValue::get(I.getType());
+ Changed = true;
+ continue;
+ }
+
+ VisitedUsers.insert(PN);
+ if (isTriviallyReplaceablePHI(*PN, I))
+ continue;
+
+ if (!canSplitPredecessors(PN, SafetyInfo))
+ return Changed;
+
+ // Split predecessors of the PHI so that we can make users trivially
+ // replaceable.
+ splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, MSSAU);
+
+ // Should rebuild the iterators, as they may be invalidated by
+ // splitPredecessorsOfLoopExit().
+ UI = I.user_begin();
+ UE = I.user_end();
+ }
+
+ if (VisitedUsers.empty())
+ return Changed;
+
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+
+ // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+ // If this instruction is only used outside of the loop, then all users are
+ // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+ // the instruction.
// First check if I is worth sinking for all uses. Sink only when it is worth
// across all uses.
- SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+ SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
SmallVector<PHINode *, 8> ExitPNs;
- for (auto *UI : Users) {
- auto *User = cast<Instruction>(UI);
-
- if (CurLoop->contains(User))
- continue;
-
- PHINode *PN = cast<PHINode>(User);
- assert(ExitBlockSet.count(PN->getParent()) &&
- "The LCSSA PHI is not in an exit block!");
+ for (auto *UI : Users) {
+ auto *User = cast<Instruction>(UI);
+
+ if (CurLoop->contains(User))
+ continue;
+
+ PHINode *PN = cast<PHINode>(User);
+ assert(ExitBlockSet.count(PN->getParent()) &&
+ "The LCSSA PHI is not in an exit block!");
if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) {
return Changed;
}
@@ -1724,622 +1724,622 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
for (auto *PN : ExitPNs) {
- // The PHI must be trivially replaceable.
- Instruction *New = sinkThroughTriviallyReplaceablePHI(
- PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU);
- PN->replaceAllUsesWith(New);
- eraseInstruction(*PN, *SafetyInfo, nullptr, nullptr);
- Changed = true;
- }
- return Changed;
-}
-
-/// When an instruction is found to only use loop invariant operands and it is
-/// safe to hoist, this function is called to do the dirty work.
-///
-static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
- BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE) {
+ // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplaceablePHI(
+ PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU);
+ PN->replaceAllUsesWith(New);
+ eraseInstruction(*PN, *SafetyInfo, nullptr, nullptr);
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// When an instruction that only uses loop-invariant operands is found to be
+/// safe to hoist, this function is called to do the dirty work.
+///
+static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE) {
LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getNameOrAsOperand() << ": "
<< I << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
- << ore::NV("Inst", &I);
- });
-
- // Metadata can be dependent on conditions we are hoisting above.
- // Conservatively strip all metadata on the instruction unless we were
- // guaranteed to execute I if we entered the loop, in which case the metadata
- // is valid in the loop preheader.
- if (I.hasMetadataOtherThanDebugLoc() &&
- // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
- // time in isGuaranteedToExecute if we don't actually have anything to
- // drop. It is a compile time optimization, not required for correctness.
- !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
- I.dropUnknownNonDebugMetadata();
-
- if (isa<PHINode>(I))
- // Move the new node to the end of the phi list in the destination block.
- moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE);
- else
- // Move the new node to the destination block, before its terminator.
- moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE);
-
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
+ << ore::NV("Inst", &I);
+ });
+
+ // Metadata can be dependent on conditions we are hoisting above.
+ // Conservatively strip all metadata on the instruction unless we were
+ // guaranteed to execute I if we entered the loop, in which case the metadata
+ // is valid in the loop preheader.
+ if (I.hasMetadataOtherThanDebugLoc() &&
+ // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
+ // time in isGuaranteedToExecute if we don't actually have anything to
+ // drop. It is a compile time optimization, not required for correctness.
+ !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
+ I.dropUnknownNonDebugMetadata();
+
+ if (isa<PHINode>(I))
+ // Move the new node to the end of the phi list in the destination block.
+ moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE);
+ else
+ // Move the new node to the destination block, before its terminator.
+ moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE);
+
I.updateLocationAfterHoist();
-
- if (isa<LoadInst>(I))
- ++NumMovedLoads;
- else if (isa<CallInst>(I))
- ++NumMovedCalls;
- ++NumHoisted;
-}
-
-/// Only sink or hoist an instruction if it is not a trapping instruction,
-/// or if the instruction is known not to trap when moved to the preheader,
-/// or if it is a trapping instruction and is guaranteed to execute.
-static bool isSafeToExecuteUnconditionally(Instruction &Inst,
- const DominatorTree *DT,
- const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE,
- const Instruction *CtxI) {
- if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
- return true;
-
- bool GuaranteedToExecute =
- SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop);
-
- if (!GuaranteedToExecute) {
- auto *LI = dyn_cast<LoadInst>(&Inst);
- if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
- << "failed to hoist load with loop-invariant address "
- "because load is conditionally executed";
- });
- }
-
- return GuaranteedToExecute;
-}
-
-namespace {
-class LoopPromoter : public LoadAndStorePromoter {
- Value *SomePtr; // Designated pointer to store to.
- const SmallSetVector<Value *, 8> &PointerMustAliases;
- SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
- SmallVectorImpl<Instruction *> &LoopInsertPts;
- SmallVectorImpl<MemoryAccess *> &MSSAInsertPts;
- PredIteratorCache &PredCache;
+
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumHoisted;
+}
+
+/// Only sink or hoist an instruction if it is not a trapping instruction,
+/// or if the instruction is known not to trap when moved to the preheader,
+/// or if it is a trapping instruction and is guaranteed to execute.
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI) {
+ if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
+ return true;
+
+ bool GuaranteedToExecute =
+ SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop);
+
+ if (!GuaranteedToExecute) {
+ auto *LI = dyn_cast<LoadInst>(&Inst);
+ if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
+ << "failed to hoist load with loop-invariant address "
+ "because load is conditionally executed";
+ });
+ }
+
+ return GuaranteedToExecute;
+}
+
+namespace {
+class LoopPromoter : public LoadAndStorePromoter {
+ Value *SomePtr; // Designated pointer to store to.
+ const SmallSetVector<Value *, 8> &PointerMustAliases;
+ SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
+ SmallVectorImpl<Instruction *> &LoopInsertPts;
+ SmallVectorImpl<MemoryAccess *> &MSSAInsertPts;
+ PredIteratorCache &PredCache;
AliasSetTracker *AST;
- MemorySSAUpdater *MSSAU;
- LoopInfo &LI;
- DebugLoc DL;
- int Alignment;
- bool UnorderedAtomic;
- AAMDNodes AATags;
- ICFLoopSafetyInfo &SafetyInfo;
-
- Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (Loop *L = LI.getLoopFor(I->getParent()))
- if (!L->contains(BB)) {
- // We need to create an LCSSA PHI node for the incoming value and
- // store that.
- PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB),
- I->getName() + ".lcssa", &BB->front());
- for (BasicBlock *Pred : PredCache.get(BB))
- PN->addIncoming(I, Pred);
- return PN;
- }
- return V;
- }
-
-public:
- LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
- const SmallSetVector<Value *, 8> &PMA,
- SmallVectorImpl<BasicBlock *> &LEB,
- SmallVectorImpl<Instruction *> &LIP,
- SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
+ MemorySSAUpdater *MSSAU;
+ LoopInfo &LI;
+ DebugLoc DL;
+ int Alignment;
+ bool UnorderedAtomic;
+ AAMDNodes AATags;
+ ICFLoopSafetyInfo &SafetyInfo;
+
+ Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Loop *L = LI.getLoopFor(I->getParent()))
+ if (!L->contains(BB)) {
+ // We need to create an LCSSA PHI node for the incoming value and
+ // store that.
+ PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB),
+ I->getName() + ".lcssa", &BB->front());
+ for (BasicBlock *Pred : PredCache.get(BB))
+ PN->addIncoming(I, Pred);
+ return PN;
+ }
+ return V;
+ }
+
+public:
+ LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
+ const SmallSetVector<Value *, 8> &PMA,
+ SmallVectorImpl<BasicBlock *> &LEB,
+ SmallVectorImpl<Instruction *> &LIP,
+ SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
AliasSetTracker *ast, MemorySSAUpdater *MSSAU, LoopInfo &li,
- DebugLoc dl, int alignment, bool UnorderedAtomic,
- const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo)
- : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
- LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP),
- PredCache(PIC), AST(ast), MSSAU(MSSAU), LI(li), DL(std::move(dl)),
- Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags),
- SafetyInfo(SafetyInfo) {}
-
- bool isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction *> &) const override {
- Value *Ptr;
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- Ptr = LI->getOperand(0);
- else
- Ptr = cast<StoreInst>(I)->getPointerOperand();
- return PointerMustAliases.count(Ptr);
- }
-
- void doExtraRewritesBeforeFinalDeletion() override {
- // Insert stores in the loop exit blocks. Each exit block gets a
- // store of the live-out values that feed them. Since we've already told
- // the SSA updater about the defs in the loop and the preheader
- // definition, it is all set and we can start using it.
- for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
- BasicBlock *ExitBlock = LoopExitBlocks[i];
- Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
- LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
- Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
- Instruction *InsertPos = LoopInsertPts[i];
- StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
- if (UnorderedAtomic)
- NewSI->setOrdering(AtomicOrdering::Unordered);
- NewSI->setAlignment(Align(Alignment));
- NewSI->setDebugLoc(DL);
- if (AATags)
- NewSI->setAAMetadata(AATags);
-
- if (MSSAU) {
- MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i];
- MemoryAccess *NewMemAcc;
- if (!MSSAInsertPoint) {
- NewMemAcc = MSSAU->createMemoryAccessInBB(
- NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning);
- } else {
- NewMemAcc =
- MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint);
- }
- MSSAInsertPts[i] = NewMemAcc;
- MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
- // FIXME: true for safety, false may still be correct.
- }
- }
- }
-
- void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
- // Update alias analysis.
+ DebugLoc dl, int alignment, bool UnorderedAtomic,
+ const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo)
+ : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+ LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP),
+ PredCache(PIC), AST(ast), MSSAU(MSSAU), LI(li), DL(std::move(dl)),
+ Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags),
+ SafetyInfo(SafetyInfo) {}
+
+ bool isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &) const override {
+ Value *Ptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ Ptr = LI->getOperand(0);
+ else
+ Ptr = cast<StoreInst>(I)->getPointerOperand();
+ return PointerMustAliases.count(Ptr);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() override {
+ // Insert stores in the loop exit blocks. Each exit block gets a
+ // store of the live-out values that feed them. Since we've already told
+ // the SSA updater about the defs in the loop and the preheader
+ // definition, it is all set and we can start using it.
+ for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = LoopExitBlocks[i];
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+ Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
+ Instruction *InsertPos = LoopInsertPts[i];
+ StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
+ NewSI->setAlignment(Align(Alignment));
+ NewSI->setDebugLoc(DL);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
+
+ if (MSSAU) {
+ MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i];
+ MemoryAccess *NewMemAcc;
+ if (!MSSAInsertPoint) {
+ NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning);
+ } else {
+ NewMemAcc =
+ MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint);
+ }
+ MSSAInsertPts[i] = NewMemAcc;
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ // FIXME: true for safety, false may still be correct.
+ }
+ }
+ }
+
+ void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
+ // Update alias analysis.
if (AST)
AST->copyValue(LI, V);
- }
- void instructionDeleted(Instruction *I) const override {
- SafetyInfo.removeInstruction(I);
+ }
+ void instructionDeleted(Instruction *I) const override {
+ SafetyInfo.removeInstruction(I);
if (AST)
AST->deleteValue(I);
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
- }
-};
-
-
-/// Return true iff we can prove that a caller of this function can not inspect
-/// the contents of the provided object in a well defined program.
-bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) {
- if (isa<AllocaInst>(Object))
- // Since the alloca goes out of scope, we know the caller can't retain a
- // reference to it and be well defined. Thus, we don't need to check for
- // capture.
- return true;
-
- // For all other objects we need to know that the caller can't possibly
- // have gotten a reference to the object. There are two components of
- // that:
- // 1) Object can't be escaped by this function. This is what
- // PointerMayBeCaptured checks.
- // 2) Object can't have been captured at definition site. For this, we
- // need to know the return value is noalias. At the moment, we use a
- // weaker condition and handle only AllocLikeFunctions (which are
- // known to be noalias). TODO
- return isAllocLikeFn(Object, TLI) &&
- !PointerMayBeCaptured(Object, true, true);
-}
-
-} // namespace
-
-/// Try to promote memory values to scalars by sinking stores out of the
-/// loop and moving loads to before the loop. We do this by looping over
-/// the stores in the loop, looking for stores to Must pointers which are
-/// loop invariant.
-///
-bool llvm::promoteLoopAccessesToScalars(
- const SmallSetVector<Value *, 8> &PointerMustAliases,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- SmallVectorImpl<Instruction *> &InsertPts,
- SmallVectorImpl<MemoryAccess *> &MSSAInsertPts, PredIteratorCache &PIC,
- LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
- Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
- // Verify inputs.
- assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+ }
+};
+
+
+/// Return true iff we can prove that a caller of this function can not inspect
+/// the contents of the provided object in a well defined program.
+bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) {
+ if (isa<AllocaInst>(Object))
+ // Since the alloca goes out of scope, we know the caller can't retain a
+ // reference to it and be well defined. Thus, we don't need to check for
+ // capture.
+ return true;
+
+ // For all other objects we need to know that the caller can't possibly
+ // have gotten a reference to the object. There are two components of
+ // that:
+ // 1) Object can't be escaped by this function. This is what
+ // PointerMayBeCaptured checks.
+ // 2) Object can't have been captured at definition site. For this, we
+ // need to know the return value is noalias. At the moment, we use a
+ // weaker condition and handle only AllocLikeFunctions (which are
+ // known to be noalias). TODO
+ return isAllocLikeFn(Object, TLI) &&
+ !PointerMayBeCaptured(Object, true, true);
+}
+
+} // namespace
+
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop. We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant.
+///
+bool llvm::promoteLoopAccessesToScalars(
+ const SmallSetVector<Value *, 8> &PointerMustAliases,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ SmallVectorImpl<Instruction *> &InsertPts,
+ SmallVectorImpl<MemoryAccess *> &MSSAInsertPts, PredIteratorCache &PIC,
+ LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
+ Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
+ ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
SafetyInfo != nullptr &&
- "Unexpected Input to promoteLoopAccessesToScalars");
-
- Value *SomePtr = *PointerMustAliases.begin();
- BasicBlock *Preheader = CurLoop->getLoopPreheader();
-
- // It is not safe to promote a load/store from the loop if the load/store is
- // conditional. For example, turning:
- //
- // for () { if (c) *P += 1; }
- //
- // into:
- //
- // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
- //
- // is not safe, because *P may only be valid to access if 'c' is true.
- //
- // The safety property divides into two parts:
- // p1) The memory may not be dereferenceable on entry to the loop. In this
- // case, we can't insert the required load in the preheader.
- // p2) The memory model does not allow us to insert a store along any dynamic
- // path which did not originally have one.
- //
- // If at least one store is guaranteed to execute, both properties are
- // satisfied, and promotion is legal.
- //
- // This, however, is not a necessary condition. Even if no store/load is
- // guaranteed to execute, we can still establish these properties.
- // We can establish (p1) by proving that hoisting the load into the preheader
- // is safe (i.e. proving dereferenceability on all paths through the loop). We
- // can use any access within the alias set to prove dereferenceability,
- // since they're all must alias.
- //
- // There are two ways to establish (p2):
- // a) Prove the location is thread-local. In this case the memory model
- // requirement does not apply, and stores are safe to insert.
- // b) Prove a store dominates every exit block. In this case, if an exit
- // block is reached, the original dynamic path would have taken us through
- // the store, so inserting a store into the exit block is safe. Note that this
- // is different from the store being guaranteed to execute. For instance,
- // if an exception is thrown on the first iteration of the loop, the original
- // store is never executed, but the exit blocks are not executed either.
-
- bool DereferenceableInPH = false;
- bool SafeToInsertStore = false;
-
- SmallVector<Instruction *, 64> LoopUses;
-
- // We start with an alignment of one and try to find instructions that allow
- // us to prove better alignment.
- Align Alignment;
- // Keep track of which types of access we see
- bool SawUnorderedAtomic = false;
- bool SawNotAtomic = false;
- AAMDNodes AATags;
-
- const DataLayout &MDL = Preheader->getModule()->getDataLayout();
-
- bool IsKnownThreadLocalObject = false;
- if (SafetyInfo->anyBlockMayThrow()) {
- // If a loop can throw, we have to insert a store along each unwind edge.
- // That said, we can't actually make the unwind edge explicit. Therefore,
- // we have to prove that the store is dead along the unwind edge. We do
- // this by proving that the caller can't have a reference to the object
- // after return and thus can't possibly load from the object.
+ "Unexpected Input to promoteLoopAccessesToScalars");
+
+ Value *SomePtr = *PointerMustAliases.begin();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+
+ // It is not safe to promote a load/store from the loop if the load/store is
+ // conditional. For example, turning:
+ //
+ // for () { if (c) *P += 1; }
+ //
+ // into:
+ //
+ // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
+ //
+ // is not safe, because *P may only be valid to access if 'c' is true.
+ //
+ // The safety property divides into two parts:
+ // p1) The memory may not be dereferenceable on entry to the loop. In this
+ // case, we can't insert the required load in the preheader.
+ // p2) The memory model does not allow us to insert a store along any dynamic
+ // path which did not originally have one.
+ //
+ // If at least one store is guaranteed to execute, both properties are
+ // satisfied, and promotion is legal.
+ //
+ // This, however, is not a necessary condition. Even if no store/load is
+ // guaranteed to execute, we can still establish these properties.
+ // We can establish (p1) by proving that hoisting the load into the preheader
+ // is safe (i.e. proving dereferenceability on all paths through the loop). We
+ // can use any access within the alias set to prove dereferenceability,
+ // since they're all must alias.
+ //
+ // There are two ways to establish (p2):
+ // a) Prove the location is thread-local. In this case the memory model
+ // requirement does not apply, and stores are safe to insert.
+ // b) Prove a store dominates every exit block. In this case, if an exit
+ // block is reached, the original dynamic path would have taken us through
+ // the store, so inserting a store into the exit block is safe. Note that this
+ // is different from the store being guaranteed to execute. For instance,
+ // if an exception is thrown on the first iteration of the loop, the original
+ // store is never executed, but the exit blocks are not executed either.
+
+ bool DereferenceableInPH = false;
+ bool SafeToInsertStore = false;
+
+ SmallVector<Instruction *, 64> LoopUses;
+
+ // We start with an alignment of one and try to find instructions that allow
+ // us to prove better alignment.
+ Align Alignment;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
+ AAMDNodes AATags;
+
+ const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+
+ bool IsKnownThreadLocalObject = false;
+ if (SafetyInfo->anyBlockMayThrow()) {
+ // If a loop can throw, we have to insert a store along each unwind edge.
+ // That said, we can't actually make the unwind edge explicit. Therefore,
+ // we have to prove that the store is dead along the unwind edge. We do
+ // this by proving that the caller can't have a reference to the object
+ // after return and thus can't possibly load from the object.
Value *Object = getUnderlyingObject(SomePtr);
- if (!isKnownNonEscaping(Object, TLI))
- return false;
- // Subtlety: Alloca's aren't visible to callers, but *are* potentially
- // visible to other threads if captured and used during their lifetimes.
- IsKnownThreadLocalObject = !isa<AllocaInst>(Object);
- }
-
- // Check that all of the pointers in the alias set have the same type. We
- // cannot (yet) promote a memory location that is loaded and stored in
- // different sizes. While we are at it, collect alignment and AA info.
- for (Value *ASIV : PointerMustAliases) {
- // Check that all of the pointers in the alias set have the same type. We
- // cannot (yet) promote a memory location that is loaded and stored in
- // different sizes.
- if (SomePtr->getType() != ASIV->getType())
- return false;
-
- for (User *U : ASIV->users()) {
- // Ignore instructions that are outside the loop.
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || !CurLoop->contains(UI))
- continue;
-
- // If there is a non-load/store instruction in the loop, we can't promote
- // it.
- if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
- if (!Load->isUnordered())
- return false;
-
- SawUnorderedAtomic |= Load->isAtomic();
- SawNotAtomic |= !Load->isAtomic();
-
- Align InstAlignment = Load->getAlign();
-
- // Note that proving a load safe to speculate requires proving
- // sufficient alignment at the target location. Proving it guaranteed
- // to execute does as well. Thus we can increase our guaranteed
- // alignment as well.
- if (!DereferenceableInPH || (InstAlignment > Alignment))
- if (isSafeToExecuteUnconditionally(*Load, DT, CurLoop, SafetyInfo,
- ORE, Preheader->getTerminator())) {
- DereferenceableInPH = true;
- Alignment = std::max(Alignment, InstAlignment);
- }
- } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
- // Stores *of* the pointer are not interesting, only stores *to* the
- // pointer.
- if (UI->getOperand(1) != ASIV)
- continue;
- if (!Store->isUnordered())
- return false;
-
- SawUnorderedAtomic |= Store->isAtomic();
- SawNotAtomic |= !Store->isAtomic();
-
- // If the store is guaranteed to execute, both properties are satisfied.
- // We may want to check if a store is guaranteed to execute even if we
- // already know that promotion is safe, since it may have higher
- // alignment than any other guaranteed stores, in which case we can
- // raise the alignment on the promoted store.
- Align InstAlignment = Store->getAlign();
-
- if (!DereferenceableInPH || !SafeToInsertStore ||
- (InstAlignment > Alignment)) {
- if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) {
- DereferenceableInPH = true;
- SafeToInsertStore = true;
- Alignment = std::max(Alignment, InstAlignment);
- }
- }
-
- // If a store dominates all exit blocks, it is safe to sink.
- // As explained above, if an exit block was executed, a dominating
- // store must have been executed at least once, so we are not
- // introducing stores on paths that did not have them.
- // Note that this only looks at explicit exit blocks. If we ever
- // start sinking stores into unwind edges (see above), this will break.
- if (!SafeToInsertStore)
- SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
- return DT->dominates(Store->getParent(), Exit);
- });
-
- // If the store is not guaranteed to execute, we may still get
- // deref info through it.
- if (!DereferenceableInPH) {
- DereferenceableInPH = isDereferenceableAndAlignedPointer(
- Store->getPointerOperand(), Store->getValueOperand()->getType(),
- Store->getAlign(), MDL, Preheader->getTerminator(), DT);
- }
- } else
- return false; // Not a load or store.
-
- // Merge the AA tags.
- if (LoopUses.empty()) {
- // On the first load/store, just take its AA tags.
- UI->getAAMetadata(AATags);
- } else if (AATags) {
- UI->getAAMetadata(AATags, /* Merge = */ true);
- }
-
- LoopUses.push_back(UI);
- }
- }
-
- // If we found both an unordered atomic instruction and a non-atomic memory
- // access, bail. We can't blindly promote non-atomic to atomic since we
- // might not be able to lower the result. We can't downgrade since that
- // would violate memory model. Also, align 0 is an error for atomics.
- if (SawUnorderedAtomic && SawNotAtomic)
- return false;
-
- // If we're inserting an atomic load in the preheader, we must be able to
- // lower it. We're only guaranteed to be able to lower naturally aligned
- // atomics.
- auto *SomePtrElemType = SomePtr->getType()->getPointerElementType();
- if (SawUnorderedAtomic &&
- Alignment < MDL.getTypeStoreSize(SomePtrElemType))
- return false;
-
- // If we couldn't prove we can hoist the load, bail.
- if (!DereferenceableInPH)
- return false;
-
- // We know we can hoist the load, but don't have a guaranteed store.
- // Check whether the location is thread-local. If it is, then we can insert
- // stores along paths which originally didn't have them without violating the
- // memory model.
- if (!SafeToInsertStore) {
- if (IsKnownThreadLocalObject)
- SafeToInsertStore = true;
- else {
+ if (!isKnownNonEscaping(Object, TLI))
+ return false;
+ // Subtlety: Alloca's aren't visible to callers, but *are* potentially
+ // visible to other threads if captured and used during their lifetimes.
+ IsKnownThreadLocalObject = !isa<AllocaInst>(Object);
+ }
+
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes. While we are at it, collect alignment and AA info.
+ for (Value *ASIV : PointerMustAliases) {
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes.
+ if (SomePtr->getType() != ASIV->getType())
+ return false;
+
+ for (User *U : ASIV->users()) {
+ // Ignore instructions that are outside the loop.
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !CurLoop->contains(UI))
+ continue;
+
+ // If there is a non-load/store instruction in the loop, we can't promote
+ // it.
+ if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+ if (!Load->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
+
+ Align InstAlignment = Load->getAlign();
+
+ // Note that proving a load safe to speculate requires proving
+ // sufficient alignment at the target location. Proving it guaranteed
+ // to execute does as well. Thus we can increase our guaranteed
+ // alignment as well.
+ if (!DereferenceableInPH || (InstAlignment > Alignment))
+ if (isSafeToExecuteUnconditionally(*Load, DT, CurLoop, SafetyInfo,
+ ORE, Preheader->getTerminator())) {
+ DereferenceableInPH = true;
+ Alignment = std::max(Alignment, InstAlignment);
+ }
+ } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
+ // Stores *of* the pointer are not interesting, only stores *to* the
+ // pointer.
+ if (UI->getOperand(1) != ASIV)
+ continue;
+ if (!Store->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
+ // If the store is guaranteed to execute, both properties are satisfied.
+ // We may want to check if a store is guaranteed to execute even if we
+ // already know that promotion is safe, since it may have higher
+ // alignment than any other guaranteed stores, in which case we can
+ // raise the alignment on the promoted store.
+ Align InstAlignment = Store->getAlign();
+
+ if (!DereferenceableInPH || !SafeToInsertStore ||
+ (InstAlignment > Alignment)) {
+ if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) {
+ DereferenceableInPH = true;
+ SafeToInsertStore = true;
+ Alignment = std::max(Alignment, InstAlignment);
+ }
+ }
+
+ // If a store dominates all exit blocks, it is safe to sink.
+ // As explained above, if an exit block was executed, a dominating
+ // store must have been executed at least once, so we are not
+ // introducing stores on paths that did not have them.
+ // Note that this only looks at explicit exit blocks. If we ever
+ // start sinking stores into unwind edges (see above), this will break.
+ if (!SafeToInsertStore)
+ SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
+ return DT->dominates(Store->getParent(), Exit);
+ });
+
+ // If the store is not guaranteed to execute, we may still get
+ // deref info through it.
+ if (!DereferenceableInPH) {
+ DereferenceableInPH = isDereferenceableAndAlignedPointer(
+ Store->getPointerOperand(), Store->getValueOperand()->getType(),
+ Store->getAlign(), MDL, Preheader->getTerminator(), DT);
+ }
+ } else
+ return false; // Not a load or store.
+
+ // Merge the AA tags.
+ if (LoopUses.empty()) {
+ // On the first load/store, just take its AA tags.
+ UI->getAAMetadata(AATags);
+ } else if (AATags) {
+ UI->getAAMetadata(AATags, /* Merge = */ true);
+ }
+
+ LoopUses.push_back(UI);
+ }
+ }
+
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+ // might not be able to lower the result. We can't downgrade since that
+ // would violate memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
+
+ // If we're inserting an atomic load in the preheader, we must be able to
+ // lower it. We're only guaranteed to be able to lower naturally aligned
+ // atomics.
+ auto *SomePtrElemType = SomePtr->getType()->getPointerElementType();
+ if (SawUnorderedAtomic &&
+ Alignment < MDL.getTypeStoreSize(SomePtrElemType))
+ return false;
+
+ // If we couldn't prove we can hoist the load, bail.
+ if (!DereferenceableInPH)
+ return false;
+
+ // We know we can hoist the load, but don't have a guaranteed store.
+ // Check whether the location is thread-local. If it is, then we can insert
+ // stores along paths which originally didn't have them without violating the
+ // memory model.
+ if (!SafeToInsertStore) {
+ if (IsKnownThreadLocalObject)
+ SafeToInsertStore = true;
+ else {
Value *Object = getUnderlyingObject(SomePtr);
- SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
- !PointerMayBeCaptured(Object, true, true);
- }
- }
-
- // If we've still failed to prove we can sink the store, give up.
- if (!SafeToInsertStore)
- return false;
-
- // Otherwise, this is safe to promote, let's do it!
- LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
- << '\n');
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
- LoopUses[0])
- << "Moving accesses to memory location out of the loop";
- });
- ++NumPromoted;
-
- // Look at all the loop uses, and try to merge their locations.
- std::vector<const DILocation *> LoopUsesLocs;
- for (auto U : LoopUses)
- LoopUsesLocs.push_back(U->getDebugLoc().get());
- auto DL = DebugLoc(DILocation::getMergedLocations(LoopUsesLocs));
-
- // We use the SSAUpdater interface to insert phi nodes as required.
- SmallVector<PHINode *, 16> NewPHIs;
- SSAUpdater SSA(&NewPHIs);
- LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ !PointerMayBeCaptured(Object, true, true);
+ }
+ }
+
+ // If we've still failed to prove we can sink the store, give up.
+ if (!SafeToInsertStore)
+ return false;
+
+ // Otherwise, this is safe to promote, let's do it!
+ LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
+ LoopUses[0])
+ << "Moving accesses to memory location out of the loop";
+ });
+ ++NumPromoted;
+
+ // Look at all the loop uses, and try to merge their locations.
+ std::vector<const DILocation *> LoopUsesLocs;
+ for (auto U : LoopUses)
+ LoopUsesLocs.push_back(U->getDebugLoc().get());
+ auto DL = DebugLoc(DILocation::getMergedLocations(LoopUsesLocs));
+
+ // We use the SSAUpdater interface to insert phi nodes as required.
+ SmallVector<PHINode *, 16> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
InsertPts, MSSAInsertPts, PIC, CurAST, MSSAU, *LI, DL,
- Alignment.value(), SawUnorderedAtomic, AATags,
- *SafetyInfo);
-
- // Set up the preheader to have a definition of the value. It is the live-out
- // value from the preheader that uses in the loop will use.
- LoadInst *PreheaderLoad = new LoadInst(
- SomePtr->getType()->getPointerElementType(), SomePtr,
- SomePtr->getName() + ".promoted", Preheader->getTerminator());
- if (SawUnorderedAtomic)
- PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
- PreheaderLoad->setAlignment(Alignment);
- PreheaderLoad->setDebugLoc(DebugLoc());
- if (AATags)
- PreheaderLoad->setAAMetadata(AATags);
- SSA.AddAvailableValue(Preheader, PreheaderLoad);
-
- if (MSSAU) {
- MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
- PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End);
- MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess);
- MSSAU->insertUse(NewMemUse, /*RenameUses=*/true);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- // Rewrite all the loads in the loop and remember all the definitions from
- // stores in the loop.
- Promoter.run(LoopUses);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- // If the SSAUpdater didn't use the load in the preheader, just zap it now.
- if (PreheaderLoad->use_empty())
- eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST, MSSAU);
-
- return true;
-}
-
-/// Returns an owning pointer to an alias set which incorporates aliasing info
-/// from L and all subloops of L.
-std::unique_ptr<AliasSetTracker>
-LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
- AAResults *AA) {
- auto CurAST = std::make_unique<AliasSetTracker>(*AA);
-
- // Add everything from all the sub loops.
- for (Loop *InnerL : L->getSubLoops())
- for (BasicBlock *BB : InnerL->blocks())
- CurAST->add(*BB);
-
- // And merge in this loop (without anything from inner loops).
- for (BasicBlock *BB : L->blocks())
- if (LI->getLoopFor(BB) == L)
- CurAST->add(*BB);
-
- return CurAST;
-}
-
-std::unique_ptr<AliasSetTracker>
-LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA(
- Loop *L, AAResults *AA, MemorySSAUpdater *MSSAU) {
- auto *MSSA = MSSAU->getMemorySSA();
- auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L);
- CurAST->addAllInstructionsInLoopUsingMSSA();
- return CurAST;
-}
-
-static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
- AliasSetTracker *CurAST, Loop *CurLoop,
- AAResults *AA) {
- // First check to see if any of the basic blocks in CurLoop invalidate *V.
- bool isInvalidatedAccordingToAST = CurAST->getAliasSetFor(MemLoc).isMod();
-
- if (!isInvalidatedAccordingToAST || !LICMN2Theshold)
- return isInvalidatedAccordingToAST;
-
- // Check with a diagnostic analysis if we can refine the information above.
- // This is to identify the limitations of using the AST.
- // The alias set mechanism used by LICM has a major weakness in that it
- // combines all things which may alias into a single set *before* asking
- // modref questions. As a result, a single readonly call within a loop will
- // collapse all loads and stores into a single alias set and report
- // invalidation if the loop contains any store. For example, readonly calls
- // with deopt states have this form and create a general alias set with all
- // loads and stores. In order to get any LICM in loops containing possible
- // deopt states we need a more precise invalidation of checking the mod ref
- // info of each instruction within the loop and LI. This has a complexity of
- // O(N^2), so currently, it is used only as a diagnostic tool since the
- // default value of LICMN2Threshold is zero.
-
- // Don't look at nested loops.
- if (CurLoop->begin() != CurLoop->end())
- return true;
-
- int N = 0;
- for (BasicBlock *BB : CurLoop->getBlocks())
- for (Instruction &I : *BB) {
- if (N >= LICMN2Theshold) {
- LLVM_DEBUG(dbgs() << "Alasing N2 threshold exhausted for "
- << *(MemLoc.Ptr) << "\n");
- return true;
- }
- N++;
- auto Res = AA->getModRefInfo(&I, MemLoc);
- if (isModSet(Res)) {
- LLVM_DEBUG(dbgs() << "Aliasing failed on " << I << " for "
- << *(MemLoc.Ptr) << "\n");
- return true;
- }
- }
- LLVM_DEBUG(dbgs() << "Aliasing okay for " << *(MemLoc.Ptr) << "\n");
- return false;
-}
-
+ Alignment.value(), SawUnorderedAtomic, AATags,
+ *SafetyInfo);
+
+ // Set up the preheader to have a definition of the value. It is the live-out
+ // value from the preheader that uses in the loop will use.
+ LoadInst *PreheaderLoad = new LoadInst(
+ SomePtr->getType()->getPointerElementType(), SomePtr,
+ SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
+ PreheaderLoad->setAlignment(Alignment);
+ PreheaderLoad->setDebugLoc(DebugLoc());
+ if (AATags)
+ PreheaderLoad->setAAMetadata(AATags);
+ SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+ if (MSSAU) {
+ MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
+ PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End);
+ MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess);
+ MSSAU->insertUse(NewMemUse, /*RenameUses=*/true);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ // Rewrite all the loads in the loop and remember all the definitions from
+ // stores in the loop.
+ Promoter.run(LoopUses);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ // If the SSAUpdater didn't use the load in the preheader, just zap it now.
+ if (PreheaderLoad->use_empty())
+ eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST, MSSAU);
+
+ return true;
+}
+
+/// Returns an owning pointer to an alias set which incorporates aliasing info
+/// from L and all subloops of L.
+std::unique_ptr<AliasSetTracker>
+LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AAResults *AA) {
+ auto CurAST = std::make_unique<AliasSetTracker>(*AA);
+
+ // Add everything from all the sub loops.
+ for (Loop *InnerL : L->getSubLoops())
+ for (BasicBlock *BB : InnerL->blocks())
+ CurAST->add(*BB);
+
+ // And merge in this loop (without anything from inner loops).
+ for (BasicBlock *BB : L->blocks())
+ if (LI->getLoopFor(BB) == L)
+ CurAST->add(*BB);
+
+ return CurAST;
+}
+
+std::unique_ptr<AliasSetTracker>
+LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA(
+ Loop *L, AAResults *AA, MemorySSAUpdater *MSSAU) {
+ auto *MSSA = MSSAU->getMemorySSA();
+ auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L);
+ CurAST->addAllInstructionsInLoopUsingMSSA();
+ return CurAST;
+}
+
+static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
+ AliasSetTracker *CurAST, Loop *CurLoop,
+ AAResults *AA) {
+ // First check to see if any of the basic blocks in CurLoop invalidate *V.
+ bool isInvalidatedAccordingToAST = CurAST->getAliasSetFor(MemLoc).isMod();
+
+ if (!isInvalidatedAccordingToAST || !LICMN2Theshold)
+ return isInvalidatedAccordingToAST;
+
+ // Check with a diagnostic analysis if we can refine the information above.
+ // This is to identify the limitations of using the AST.
+ // The alias set mechanism used by LICM has a major weakness in that it
+ // combines all things which may alias into a single set *before* asking
+ // modref questions. As a result, a single readonly call within a loop will
+ // collapse all loads and stores into a single alias set and report
+ // invalidation if the loop contains any store. For example, readonly calls
+ // with deopt states have this form and create a general alias set with all
+ // loads and stores. In order to get any LICM in loops containing possible
+ // deopt states we need a more precise invalidation of checking the mod ref
+ // info of each instruction within the loop and LI. This has a complexity of
+ // O(N^2), so currently, it is used only as a diagnostic tool since the
+ // default value of LICMN2Threshold is zero.
+
+ // Don't look at nested loops.
+ if (CurLoop->begin() != CurLoop->end())
+ return true;
+
+ int N = 0;
+ for (BasicBlock *BB : CurLoop->getBlocks())
+ for (Instruction &I : *BB) {
+ if (N >= LICMN2Theshold) {
+ LLVM_DEBUG(dbgs() << "Alasing N2 threshold exhausted for "
+ << *(MemLoc.Ptr) << "\n");
+ return true;
+ }
+ N++;
+ auto Res = AA->getModRefInfo(&I, MemLoc);
+ if (isModSet(Res)) {
+ LLVM_DEBUG(dbgs() << "Aliasing failed on " << I << " for "
+ << *(MemLoc.Ptr) << "\n");
+ return true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Aliasing okay for " << *(MemLoc.Ptr) << "\n");
+ return false;
+}
+
bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
Loop *CurLoop, Instruction &I,
SinkAndHoistLICMFlags &Flags) {
- // For hoisting, use the walker to determine safety
+ // For hoisting, use the walker to determine safety
if (!Flags.getIsSink()) {
- MemoryAccess *Source;
- // See declaration of SetLicmMssaOptCap for usage details.
+ MemoryAccess *Source;
+ // See declaration of SetLicmMssaOptCap for usage details.
if (Flags.tooManyClobberingCalls())
- Source = MU->getDefiningAccess();
- else {
- Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(MU);
+ Source = MU->getDefiningAccess();
+ else {
+ Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(MU);
Flags.incrementClobberingCalls();
- }
- return !MSSA->isLiveOnEntryDef(Source) &&
- CurLoop->contains(Source->getBlock());
- }
-
- // For sinking, we'd need to check all Defs below this use. The getClobbering
- // call will look on the backedge of the loop, but will check aliasing with
- // the instructions on the previous iteration.
- // For example:
- // for (i ... )
- // load a[i] ( Use (LoE)
- // store a[i] ( 1 = Def (2), with 2 = Phi for the loop.
- // i++;
- // The load sees no clobbering inside the loop, as the backedge alias check
- // does phi translation, and will check aliasing against store a[i-1].
- // However sinking the load outside the loop, below the store is incorrect.
-
- // For now, only sink if there are no Defs in the loop, and the existing ones
- // precede the use and are in the same block.
- // FIXME: Increase precision: Safe to sink if Use post dominates the Def;
- // needs PostDominatorTreeAnalysis.
- // FIXME: More precise: no Defs that alias this Use.
+ }
+ return !MSSA->isLiveOnEntryDef(Source) &&
+ CurLoop->contains(Source->getBlock());
+ }
+
+ // For sinking, we'd need to check all Defs below this use. The getClobbering
+ // call will look on the backedge of the loop, but will check aliasing with
+ // the instructions on the previous iteration.
+ // For example:
+ // for (i ... )
+ // load a[i] ( Use (LoE)
+ // store a[i] ( 1 = Def (2), with 2 = Phi for the loop.
+ // i++;
+ // The load sees no clobbering inside the loop, as the backedge alias check
+ // does phi translation, and will check aliasing against store a[i-1].
+ // However sinking the load outside the loop, below the store is incorrect.
+
+ // For now, only sink if there are no Defs in the loop, and the existing ones
+ // precede the use and are in the same block.
+ // FIXME: Increase precision: Safe to sink if Use post dominates the Def;
+ // needs PostDominatorTreeAnalysis.
+ // FIXME: More precise: no Defs that alias this Use.
if (Flags.tooManyMemoryAccesses())
- return true;
- for (auto *BB : CurLoop->getBlocks())
+ return true;
+ for (auto *BB : CurLoop->getBlocks())
if (pointerInvalidatedByBlockWithMSSA(*BB, *MSSA, *MU))
return true;
// When sinking, the source block may not be part of the loop so check it.
if (!CurLoop->contains(&I))
return pointerInvalidatedByBlockWithMSSA(*I.getParent(), *MSSA, *MU);
- return false;
-}
-
+ return false;
+}
+
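The sinking restriction that the comments in pointerInvalidatedByLoopWithMSSA describe can be seen in a small C-style sketch (hypothetical names, not code from this patch): the loaded value is only used after the loop, so the load looks like a sinking candidate, yet moving it below the loop would observe the values written by the store.

    int lastOriginalElement(int *A, int N) {
      int Last = 0;
      for (int I = 0; I < N; ++I) {
        Last = A[I]; // Use: reads the original A[I]; the backedge clobber check
                     // phi-translates to A[I-1] and sees no conflict.
        A[I] = I;    // Def: overwrites the slot the load just read.
      }
      return Last;   // Sinking the load to this point would read the stored
                     // A[N-1] (that is, N-1) instead of the original value.
    }
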
bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
MemoryUse &MU) {
if (const auto *Accesses = MSSA.getBlockDefs(&BB))
@@ -2350,10 +2350,10 @@ bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
return false;
}
-/// Little predicate that returns true if the specified basic block is in
-/// a subloop of the current one, not the current one itself.
-///
-static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
- assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
- return LI->getLoopFor(BB) != CurLoop;
-}
+/// Little predicate that returns true if the specified basic block is in
+/// a subloop of the current one, not the current one itself.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+ assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+ return LI->getLoopFor(BB) != CurLoop;
+}
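As a source-level illustration of the transformation described in the comments of promoteLoopAccessesToScalars above, here is a minimal sketch in C++ (function and variable names are illustrative, and the snippet is not part of this patch). It shows the unconditional variant of the "*P += 1" example from those comments, the case where promotion is legal because the store is guaranteed to execute whenever the loop is entered:

    // Before promotion: *P is loaded and stored on every iteration.
    void before(int *P, int N) {
      for (int I = 0; I < N; ++I)
        *P += 1;
    }

    // After promotion: the load is hoisted into the preheader, the loop body
    // operates on a scalar, and a single store is sunk into the exit block.
    // The guard mirrors the fact that the preheader and exit block are only
    // reached when the loop actually runs; without it, a store would be
    // introduced on a path that originally had none, violating property (p2).
    void after(int *P, int N) {
      if (N > 0) {
        int Tmp = *P;
        for (int I = 0; I < N; ++I)
          Tmp += 1;
        *P = Tmp;
      }
    }

The conditional form "if (c) *P += 1;" must not be rewritten this way, for the reasons spelled out in those comments.
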
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
index 32e6cd4e93..1c3ff1a61b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
@@ -1,24 +1,24 @@
-//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-accesses"
-
-PreservedAnalyses
-LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR, LPMUpdater &) {
- Function &F = *L.getHeader()->getParent();
- auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
- OS << "Loop access info in function '" << F.getName() << "':\n";
- OS.indent(2) << L.getHeader()->getName() << ":\n";
- LAI.print(OS, 4);
- return PreservedAnalyses::all();
-}
+//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-accesses"
+
+PreservedAnalyses
+LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ Function &F = *L.getHeader()->getParent();
+ auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
+ OS << "Loop access info in function '" << F.getName() << "':\n";
+ OS.indent(2) << L.getHeader()->getName() << ":\n";
+ LAI.print(OS, 4);
+ return PreservedAnalyses::all();
+}
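For reference, the run() method above assembles its report from the two literal format strings plus LoopAccessInfo::print. Assuming a function named foo whose loop header block is for.body (both names hypothetical), the emitted text would begin roughly as follows, with the per-loop access report indented underneath:

    Loop access info in function 'foo':
      for.body:
        <output of LAI.print(OS, 4)>
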
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 63b79c9caa..45cdcb2f37 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -1,419 +1,419 @@
-//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a Loop Data Prefetching Pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
-#include "llvm/InitializePasses.h"
-
-#define DEBUG_TYPE "loop-data-prefetch"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-// By default, we limit this to creating 16 PHIs (which is a little over half
-// of the allocatable register set).
-static cl::opt<bool>
-PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
- cl::desc("Prefetch write addresses"));
-
-static cl::opt<unsigned>
- PrefetchDistance("prefetch-distance",
- cl::desc("Number of instructions to prefetch ahead"),
- cl::Hidden);
-
-static cl::opt<unsigned>
- MinPrefetchStride("min-prefetch-stride",
- cl::desc("Min stride to add prefetches"), cl::Hidden);
-
-static cl::opt<unsigned> MaxPrefetchIterationsAhead(
- "max-prefetch-iters-ahead",
- cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
-
-STATISTIC(NumPrefetches, "Number of prefetches inserted");
-
-namespace {
-
-/// Loop prefetch implementation class.
-class LoopDataPrefetch {
-public:
- LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, const TargetTransformInfo *TTI,
- OptimizationRemarkEmitter *ORE)
- : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
-
- bool run();
-
-private:
- bool runOnLoop(Loop *L);
-
- /// Check if the stride of the accesses is large enough to
- /// warrant a prefetch.
- bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
-
- unsigned getMinPrefetchStride(unsigned NumMemAccesses,
- unsigned NumStridedMemAccesses,
- unsigned NumPrefetches,
- bool HasCall) {
- if (MinPrefetchStride.getNumOccurrences() > 0)
- return MinPrefetchStride;
- return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
- NumPrefetches, HasCall);
- }
-
- unsigned getPrefetchDistance() {
- if (PrefetchDistance.getNumOccurrences() > 0)
- return PrefetchDistance;
- return TTI->getPrefetchDistance();
- }
-
- unsigned getMaxPrefetchIterationsAhead() {
- if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
- return MaxPrefetchIterationsAhead;
- return TTI->getMaxPrefetchIterationsAhead();
- }
-
- bool doPrefetchWrites() {
- if (PrefetchWrites.getNumOccurrences() > 0)
- return PrefetchWrites;
- return TTI->enableWritePrefetching();
- }
-
- AssumptionCache *AC;
- DominatorTree *DT;
- LoopInfo *LI;
- ScalarEvolution *SE;
- const TargetTransformInfo *TTI;
- OptimizationRemarkEmitter *ORE;
-};
-
-/// Legacy class for inserting loop data prefetches.
-class LoopDataPrefetchLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
- initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
- };
-}
-
-char LoopDataPrefetchLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
- "Loop Data Prefetch", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
- "Loop Data Prefetch", false, false)
-
-FunctionPass *llvm::createLoopDataPrefetchPass() {
- return new LoopDataPrefetchLegacyPass();
-}
-
-bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
- unsigned TargetMinStride) {
- // No need to check if any stride goes.
- if (TargetMinStride <= 1)
- return true;
-
- const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
- // If MinStride is set, don't prefetch unless we can ensure that stride is
- // larger.
- if (!ConstStride)
- return false;
-
- unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
- return TargetMinStride <= AbsStride;
-}
-
-PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
- ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
- AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- OptimizationRemarkEmitter *ORE =
- &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
-
- LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
- bool Changed = LDP.run();
-
- if (Changed) {
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- return PA;
- }
-
- return PreservedAnalyses::all();
-}
-
-bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- OptimizationRemarkEmitter *ORE =
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
- return LDP.run();
-}
-
-bool LoopDataPrefetch::run() {
- // If PrefetchDistance is not set, don't run the pass. This gives an
- // opportunity for targets to run this pass for selected subtargets only
- // (whose TTI sets PrefetchDistance).
- if (getPrefetchDistance() == 0)
- return false;
- assert(TTI->getCacheLineSize() && "Cache line size is not set for target");
-
- bool MadeChange = false;
-
- for (Loop *I : *LI)
- for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
- MadeChange |= runOnLoop(*L);
-
- return MadeChange;
-}
-
-/// A record for a potential prefetch made during the initial scan of the
-/// loop. This is used to let a single prefetch target multiple memory accesses.
-struct Prefetch {
- /// The address formula for this prefetch as returned by ScalarEvolution.
- const SCEVAddRecExpr *LSCEVAddRec;
- /// The point of insertion for the prefetch instruction.
- Instruction *InsertPt;
- /// True if targeting a write memory access.
- bool Writes;
- /// The (first seen) prefetched instruction.
- Instruction *MemI;
-
- /// Constructor to create a new Prefetch for \p I.
- Prefetch(const SCEVAddRecExpr *L, Instruction *I)
- : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
- addInstruction(I);
- };
-
- /// Add the instruction \param I to this prefetch. If it's not the first
- /// one, 'InsertPt' and 'Writes' will be updated as required.
- /// \param PtrDiff the known constant address difference to the first added
- /// instruction.
- void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
- int64_t PtrDiff = 0) {
- if (!InsertPt) {
- MemI = I;
- InsertPt = I;
- Writes = isa<StoreInst>(I);
- } else {
- BasicBlock *PrefBB = InsertPt->getParent();
- BasicBlock *InsBB = I->getParent();
- if (PrefBB != InsBB) {
- BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
- if (DomBB != PrefBB)
- InsertPt = DomBB->getTerminator();
- }
-
- if (isa<StoreInst>(I) && PtrDiff == 0)
- Writes = true;
- }
- }
-};
-
-bool LoopDataPrefetch::runOnLoop(Loop *L) {
- bool MadeChange = false;
-
- // Only prefetch in the inner-most loop
+//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Loop Data Prefetching Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "loop-data-prefetch"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+// By default, we limit this to creating 16 PHIs (which is a little over half
+// of the allocatable register set).
+static cl::opt<bool>
+PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
+ cl::desc("Prefetch write addresses"));
+
+static cl::opt<unsigned>
+ PrefetchDistance("prefetch-distance",
+ cl::desc("Number of instructions to prefetch ahead"),
+ cl::Hidden);
+
+static cl::opt<unsigned>
+ MinPrefetchStride("min-prefetch-stride",
+ cl::desc("Min stride to add prefetches"), cl::Hidden);
+
+static cl::opt<unsigned> MaxPrefetchIterationsAhead(
+ "max-prefetch-iters-ahead",
+ cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
+
+STATISTIC(NumPrefetches, "Number of prefetches inserted");
+
+namespace {
+
+/// Loop prefetch implementation class.
+class LoopDataPrefetch {
+public:
+ LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE)
+ : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+
+ bool run();
+
+private:
+ bool runOnLoop(Loop *L);
+
+ /// Check if the stride of the accesses is large enough to
+ /// warrant a prefetch.
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
+
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) {
+ if (MinPrefetchStride.getNumOccurrences() > 0)
+ return MinPrefetchStride;
+ return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
+ }
+
+ unsigned getPrefetchDistance() {
+ if (PrefetchDistance.getNumOccurrences() > 0)
+ return PrefetchDistance;
+ return TTI->getPrefetchDistance();
+ }
+
+ unsigned getMaxPrefetchIterationsAhead() {
+ if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
+ return MaxPrefetchIterationsAhead;
+ return TTI->getMaxPrefetchIterationsAhead();
+ }
+
+ bool doPrefetchWrites() {
+ if (PrefetchWrites.getNumOccurrences() > 0)
+ return PrefetchWrites;
+ return TTI->enableWritePrefetching();
+ }
+
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// Legacy class for inserting loop data prefetches.
+class LoopDataPrefetchLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
+ initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char LoopDataPrefetchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+
+FunctionPass *llvm::createLoopDataPrefetchPass() {
+ return new LoopDataPrefetchLegacyPass();
+}
+
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
+ unsigned TargetMinStride) {
+ // No need to check; any stride satisfies the minimum.
+ if (TargetMinStride <= 1)
+ return true;
+
+ const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ // If MinStride is set, don't prefetch unless we can ensure that stride is
+ // larger.
+ if (!ConstStride)
+ return false;
+
+ unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+ return TargetMinStride <= AbsStride;
+}
+
+PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ OptimizationRemarkEmitter *ORE =
+ &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
+ bool Changed = LDP.run();
+
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
+ return LDP.run();
+}
+
+bool LoopDataPrefetch::run() {
+ // If PrefetchDistance is not set, don't run the pass. This gives an
+ // opportunity for targets to run this pass for selected subtargets only
+ // (whose TTI sets PrefetchDistance).
+ if (getPrefetchDistance() == 0)
+ return false;
+ assert(TTI->getCacheLineSize() && "Cache line size is not set for target");
+
+ bool MadeChange = false;
+
+ for (Loop *I : *LI)
+ for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+ MadeChange |= runOnLoop(*L);
+
+ return MadeChange;
+}
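// Worked example (illustrative numbers, not from this source): if the target,
// or the -prefetch-distance flag, reports a prefetch distance of 300
// instructions and runOnLoop() below measures a 25-instruction loop body, the
// pass prefetches ItersAhead = 300 / 25 = 12 iterations ahead, provided 12
// does not exceed getMaxPrefetchIterationsAhead() and any known constant max
// trip count is at least ItersAhead + 1 = 13.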
+
+/// A record for a potential prefetch made during the initial scan of the
+/// loop. This is used to let a single prefetch target multiple memory accesses.
+struct Prefetch {
+ /// The address formula for this prefetch as returned by ScalarEvolution.
+ const SCEVAddRecExpr *LSCEVAddRec;
+ /// The point of insertion for the prefetch instruction.
+ Instruction *InsertPt;
+ /// True if targeting a write memory access.
+ bool Writes;
+ /// The (first seen) prefetched instruction.
+ Instruction *MemI;
+
+ /// Constructor to create a new Prefetch for \p I.
+ Prefetch(const SCEVAddRecExpr *L, Instruction *I)
+ : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
+ addInstruction(I);
+ };
+
+ /// Add the instruction \param I to this prefetch. If it's not the first
+ /// one, 'InsertPt' and 'Writes' will be updated as required.
+ /// \param PtrDiff the known constant address difference to the first added
+ /// instruction.
+ void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
+ int64_t PtrDiff = 0) {
+ if (!InsertPt) {
+ MemI = I;
+ InsertPt = I;
+ Writes = isa<StoreInst>(I);
+ } else {
+ BasicBlock *PrefBB = InsertPt->getParent();
+ BasicBlock *InsBB = I->getParent();
+ if (PrefBB != InsBB) {
+ BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
+ if (DomBB != PrefBB)
+ InsertPt = DomBB->getTerminator();
+ }
+
+ if (isa<StoreInst>(I) && PtrDiff == 0)
+ Writes = true;
+ }
+ }
+};
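// Illustrative example (assumed 64-byte cache line and 8-byte elements, not
// from this source): the two loads below are a constant 32 bytes apart, which
// is less than the cache-line size, so runOnLoop() folds the second access
// into the first access's Prefetch record via addInstruction() instead of
// emitting a second prefetch; had the accesses been in different blocks,
// InsertPt would move to their nearest common dominator.
//
//   for (long I = 0; I < N; ++I)
//     Sum += A[I] + A[I + 4];   // one llvm.prefetch covers both loads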
+
+bool LoopDataPrefetch::runOnLoop(Loop *L) {
+ bool MadeChange = false;
+
+ // Only prefetch in the inner-most loop
if (!L->isInnermost())
- return MadeChange;
-
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- // Calculate the number of iterations ahead to prefetch
- CodeMetrics Metrics;
- bool HasCall = false;
- for (const auto BB : L->blocks()) {
- // If the loop already has prefetches, then assume that the user knows
- // what they are doing and don't add any more.
- for (auto &I : *BB) {
- if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
- if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
- if (F->getIntrinsicID() == Intrinsic::prefetch)
- return MadeChange;
- if (TTI->isLoweredToCall(F))
- HasCall = true;
- } else { // indirect call.
- HasCall = true;
- }
- }
- }
- Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
- }
- unsigned LoopSize = Metrics.NumInsts;
- if (!LoopSize)
- LoopSize = 1;
-
- unsigned ItersAhead = getPrefetchDistance() / LoopSize;
- if (!ItersAhead)
- ItersAhead = 1;
-
- if (ItersAhead > getMaxPrefetchIterationsAhead())
- return MadeChange;
-
- unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
- if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
- return MadeChange;
-
- unsigned NumMemAccesses = 0;
- unsigned NumStridedMemAccesses = 0;
- SmallVector<Prefetch, 16> Prefetches;
- for (const auto BB : L->blocks())
- for (auto &I : *BB) {
- Value *PtrValue;
- Instruction *MemI;
-
- if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
- MemI = LMemI;
- PtrValue = LMemI->getPointerOperand();
- } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
- if (!doPrefetchWrites()) continue;
- MemI = SMemI;
- PtrValue = SMemI->getPointerOperand();
- } else continue;
-
- unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
- if (PtrAddrSpace)
- continue;
- NumMemAccesses++;
- if (L->isLoopInvariant(PtrValue))
- continue;
-
- const SCEV *LSCEV = SE->getSCEV(PtrValue);
- const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
- if (!LSCEVAddRec)
- continue;
- NumStridedMemAccesses++;
-
- // We don't want to double prefetch individual cache lines. If this
- // access is known to be within one cache line of some other one that
- // has already been prefetched, then don't prefetch this one as well.
- bool DupPref = false;
- for (auto &Pref : Prefetches) {
- const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
- if (const SCEVConstant *ConstPtrDiff =
- dyn_cast<SCEVConstant>(PtrDiff)) {
- int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
- if (PD < (int64_t) TTI->getCacheLineSize()) {
- Pref.addInstruction(MemI, DT, PD);
- DupPref = true;
- break;
- }
- }
- }
- if (!DupPref)
- Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
- }
-
- unsigned TargetMinStride =
- getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
- Prefetches.size(), HasCall);
-
- LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
- LLVM_DEBUG(dbgs() << "Loop has: "
- << NumMemAccesses << " memory accesses, "
- << NumStridedMemAccesses << " strided memory accesses, "
- << Prefetches.size() << " potential prefetch(es), "
- << "a minimum stride of " << TargetMinStride << ", "
- << (HasCall ? "calls" : "no calls") << ".\n");
-
- for (auto &P : Prefetches) {
- // Check if the stride of the accesses is large enough to warrant a
- // prefetch.
- if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
- continue;
-
- const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
- SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
- P.LSCEVAddRec->getStepRecurrence(*SE)));
- if (!isSafeToExpand(NextLSCEV, *SE))
- continue;
-
- BasicBlock *BB = P.InsertPt->getParent();
- Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
- SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
- Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
-
- IRBuilder<> Builder(P.InsertPt);
- Module *M = BB->getParent()->getParent();
- Type *I32 = Type::getInt32Ty(BB->getContext());
- Function *PrefetchFunc = Intrinsic::getDeclaration(
- M, Intrinsic::prefetch, PrefPtrValue->getType());
- Builder.CreateCall(
- PrefetchFunc,
- {PrefPtrValue,
- ConstantInt::get(I32, P.Writes),
- ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
- ++NumPrefetches;
- LLVM_DEBUG(dbgs() << " Access: "
- << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
- << ", SCEV: " << *P.LSCEVAddRec << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
- << "prefetched memory access";
- });
-
- MadeChange = true;
- }
-
- return MadeChange;
-}
+ return MadeChange;
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // Calculate the number of iterations ahead to prefetch
+ CodeMetrics Metrics;
+ bool HasCall = false;
+ for (const auto BB : L->blocks()) {
+ // If the loop already has prefetches, then assume that the user knows
+ // what they are doing and don't add any more.
+ for (auto &I : *BB) {
+ if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
+ if (F->getIntrinsicID() == Intrinsic::prefetch)
+ return MadeChange;
+ if (TTI->isLoweredToCall(F))
+ HasCall = true;
+ } else { // indirect call.
+ HasCall = true;
+ }
+ }
+ }
+ Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ }
+ unsigned LoopSize = Metrics.NumInsts;
+ if (!LoopSize)
+ LoopSize = 1;
+
+ unsigned ItersAhead = getPrefetchDistance() / LoopSize;
+ if (!ItersAhead)
+ ItersAhead = 1;
+
+ if (ItersAhead > getMaxPrefetchIterationsAhead())
+ return MadeChange;
+
+ unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
+ if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
+ return MadeChange;
+
+ unsigned NumMemAccesses = 0;
+ unsigned NumStridedMemAccesses = 0;
+ SmallVector<Prefetch, 16> Prefetches;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB) {
+ Value *PtrValue;
+ Instruction *MemI;
+
+ if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
+ MemI = LMemI;
+ PtrValue = LMemI->getPointerOperand();
+ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
+ if (!doPrefetchWrites()) continue;
+ MemI = SMemI;
+ PtrValue = SMemI->getPointerOperand();
+ } else continue;
+
+ unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+ if (PtrAddrSpace)
+ continue;
+ NumMemAccesses++;
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE->getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec)
+ continue;
+ NumStridedMemAccesses++;
+
+ // We don't want to double prefetch individual cache lines. If this
+ // access is known to be within one cache line of some other one that
+ // has already been prefetched, then don't prefetch this one as well.
+ bool DupPref = false;
+ for (auto &Pref : Prefetches) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
+ if (const SCEVConstant *ConstPtrDiff =
+ dyn_cast<SCEVConstant>(PtrDiff)) {
+ int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
+ if (PD < (int64_t) TTI->getCacheLineSize()) {
+ Pref.addInstruction(MemI, DT, PD);
+ DupPref = true;
+ break;
+ }
+ }
+ }
+ if (!DupPref)
+ Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+ }
+
+ unsigned TargetMinStride =
+ getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ Prefetches.size(), HasCall);
+
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Loop has: "
+ << NumMemAccesses << " memory accesses, "
+ << NumStridedMemAccesses << " strided memory accesses, "
+ << Prefetches.size() << " potential prefetch(es), "
+ << "a minimum stride of " << TargetMinStride << ", "
+ << (HasCall ? "calls" : "no calls") << ".\n");
+
+ for (auto &P : Prefetches) {
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
+ continue;
+
+ const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
+ P.LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ BasicBlock *BB = P.InsertPt->getParent();
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
+ SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
+
+ IRBuilder<> Builder(P.InsertPt);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Function *PrefetchFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::prefetch, PrefPtrValue->getType());
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, P.Writes),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ LLVM_DEBUG(dbgs() << " Access: "
+ << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
+ << ", SCEV: " << *P.LSCEVAddRec << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
+ << "prefetched memory access";
+ });
+
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
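The hunk above re-adds the whole pass: runOnLoop() sizes the loop body with CodeMetrics, computes ItersAhead as the target prefetch distance divided by that size, folds candidates whose constant address difference to an existing candidate is smaller than the cache line, and emits one llvm.prefetch call per surviving candidate with the rw flag taken from P.Writes, locality 3, and cache type 1. Below is a minimal sketch, assuming the standard LLVM 12 PassBuilder boilerplate, of how the new-PM LoopDataPrefetchPass defined above could be driven over a module; without a TargetMachine-backed TTI the pass is a no-op unless -prefetch-distance is set.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
using namespace llvm;

void runLoopDataPrefetch(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the analyses the pass queries (DominatorTree, LoopInfo, SCEV,
  // AssumptionCache, ORE, TTI) and wire the managers together.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(LoopDataPrefetchPass());   // the pass defined above
  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);                       // may insert llvm.prefetch calls
}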
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp
index 59873b0352..1266c93316 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -1,44 +1,44 @@
-//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Dead Loop Deletion Pass. This pass is responsible
-// for eliminating loops with non-infinite computable trip counts that have no
-// side effects or volatile instructions, and do not contribute to the
-// computation of the function's return value.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopDeletion.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-delete"
-
-STATISTIC(NumDeleted, "Number of loops deleted");
-
-enum class LoopDeletionResult {
- Unmodified,
- Modified,
- Deleted,
-};
-
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-delete"
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+enum class LoopDeletionResult {
+ Unmodified,
+ Modified,
+ Deleted,
+};
+
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B) {
if (A == LoopDeletionResult::Deleted || B == LoopDeletionResult::Deleted)
return LoopDeletionResult::Deleted;
@@ -47,25 +47,25 @@ static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B) {
return LoopDeletionResult::Unmodified;
}
-/// Determines if a loop is dead.
-///
-/// This assumes that we've already checked for unique exit and exiting blocks,
-/// and that the code is in LCSSA form.
-static bool isLoopDead(Loop *L, ScalarEvolution &SE,
- SmallVectorImpl<BasicBlock *> &ExitingBlocks,
- BasicBlock *ExitBlock, bool &Changed,
- BasicBlock *Preheader) {
- // Make sure that all PHI entries coming from the loop are loop invariant.
- // Because the code is in LCSSA form, any values used outside of the loop
- // must pass through a PHI in the exit block, meaning that this check is
- // sufficient to guarantee that no loop-variant values are used outside
- // of the loop.
- bool AllEntriesInvariant = true;
- bool AllOutgoingValuesSame = true;
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
+ // Make sure that all PHI entries coming from the loop are loop invariant.
+ // Because the code is in LCSSA form, any values used outside of the loop
+ // must pass through a PHI in the exit block, meaning that this check is
+ // sufficient to guarantee that no loop-variant values are used outside
+ // of the loop.
+ bool AllEntriesInvariant = true;
+ bool AllOutgoingValuesSame = true;
if (!L->hasNoExitBlocks()) {
for (PHINode &P : ExitBlock->phis()) {
Value *incoming = P.getIncomingValueForBlock(ExitingBlocks[0]);
-
+
// Make sure all exiting blocks produce the same incoming value for the
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value
@@ -74,67 +74,67 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
return incoming == P.getIncomingValueForBlock(BB);
});
-
+
if (!AllOutgoingValuesSame)
break;
-
+
if (Instruction *I = dyn_cast<Instruction>(incoming))
if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) {
AllEntriesInvariant = false;
break;
}
}
- }
-
- if (Changed)
- SE.forgetLoopDispositions(L);
-
- if (!AllEntriesInvariant || !AllOutgoingValuesSame)
- return false;
-
- // Make sure that no instructions in the block have potential side-effects.
- // This includes instructions that could write to memory, and loads that are
- // marked volatile.
- for (auto &I : L->blocks())
+ }
+
+ if (Changed)
+ SE.forgetLoopDispositions(L);
+
+ if (!AllEntriesInvariant || !AllOutgoingValuesSame)
+ return false;
+
+ // Make sure that no instructions in the block have potential side-effects.
+ // This includes instructions that could write to memory, and loads that are
+ // marked volatile.
+ for (auto &I : L->blocks())
if (any_of(*I, [](Instruction &I) {
return I.mayHaveSideEffects() && !I.isDroppable();
}))
- return false;
- return true;
-}
-
-/// This function returns true if there is no viable path from the
-/// entry block to the header of \p L. Right now, it only does
-/// a local search to save compile time.
-static bool isLoopNeverExecuted(Loop *L) {
- using namespace PatternMatch;
-
- auto *Preheader = L->getLoopPreheader();
- // TODO: We can relax this constraint, since we just need a loop
- // predecessor.
- assert(Preheader && "Needs preheader!");
-
- if (Preheader == &Preheader->getParent()->getEntryBlock())
- return false;
- // All predecessors of the preheader should have a constant conditional
- // branch, with the loop's preheader as not-taken.
- for (auto *Pred: predecessors(Preheader)) {
- BasicBlock *Taken, *NotTaken;
- ConstantInt *Cond;
- if (!match(Pred->getTerminator(),
- m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
- return false;
- if (!Cond->getZExtValue())
- std::swap(Taken, NotTaken);
- if (Taken == Preheader)
- return false;
- }
- assert(!pred_empty(Preheader) &&
- "Preheader should have predecessors at this point!");
- // All the predecessors have the loop preheader as not-taken target.
- return true;
-}
-
+ return false;
+ return true;
+}
+
+/// This function returns true if there is no viable path from the
+/// entry block to the header of \p L. Right now, it only does
+/// a local search to save compile time.
+static bool isLoopNeverExecuted(Loop *L) {
+ using namespace PatternMatch;
+
+ auto *Preheader = L->getLoopPreheader();
+ // TODO: We can relax this constraint, since we just need a loop
+ // predecessor.
+ assert(Preheader && "Needs preheader!");
+
+ if (Preheader == &Preheader->getParent()->getEntryBlock())
+ return false;
+ // All predecessors of the preheader should have a constant conditional
+ // branch, with the loop's preheader as not-taken.
+ for (auto *Pred: predecessors(Preheader)) {
+ BasicBlock *Taken, *NotTaken;
+ ConstantInt *Cond;
+ if (!match(Pred->getTerminator(),
+ m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
+ return false;
+ if (!Cond->getZExtValue())
+ std::swap(Taken, NotTaken);
+ if (Taken == Preheader)
+ return false;
+ }
+ assert(!pred_empty(Preheader) &&
+ "Preheader should have predecessors at this point!");
+ // All the predecessors have the loop preheader as not-taken target.
+ return true;
+}
+
/// If we can prove the backedge is untaken, remove it. This destroys the
/// loop, but leaves the (now trivially loop invariant) control flow and
/// side effects (if any) in place.
@@ -155,116 +155,116 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
return LoopDeletionResult::Deleted;
}
-/// Remove a loop if it is dead.
-///
+/// Remove a loop if it is dead.
+///
/// A loop is considered dead either if it does not impact the observable
/// behavior of the program other than finite running time, or if it is
/// required to make progress by an attribute such as 'mustprogress' or
/// 'llvm.loop.mustprogress' and does not make any. This may remove
/// infinite loops that have been required to make progress.
-///
-/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
-/// order to make various safety checks work.
-///
-/// \returns true if any changes were made. This may mutate the loop even if it
-/// is unable to delete it due to hoisting trivially loop invariant
-/// instructions out of the loop.
-static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
- ScalarEvolution &SE, LoopInfo &LI,
- MemorySSA *MSSA,
- OptimizationRemarkEmitter &ORE) {
- assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
-
- // We can only remove the loop if there is a preheader that we can branch from
- // after removing it. Also, if LoopSimplify form is not available, stay out
- // of trouble.
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader || !L->hasDedicatedExits()) {
- LLVM_DEBUG(
- dbgs()
- << "Deletion requires Loop with preheader and dedicated exits.\n");
- return LoopDeletionResult::Unmodified;
- }
-
- BasicBlock *ExitBlock = L->getUniqueExitBlock();
-
- if (ExitBlock && isLoopNeverExecuted(L)) {
- LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
+ ScalarEvolution &SE, LoopInfo &LI,
+ MemorySSA *MSSA,
+ OptimizationRemarkEmitter &ORE) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+
+ // We can only remove the loop if there is a preheader that we can branch from
+ // after removing it. Also, if LoopSimplify form is not available, stay out
+ // of trouble.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader || !L->hasDedicatedExits()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
+ return LoopDeletionResult::Unmodified;
+ }
+
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+
+ if (ExitBlock && isLoopNeverExecuted(L)) {
+ LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
// We need to forget the loop before setting the incoming values of the exit
// phis to undef, so we properly invalidate the SCEV expressions for those
// phis.
SE.forgetLoop(L);
- // Set incoming value to undef for phi nodes in the exit block.
- for (PHINode &P : ExitBlock->phis()) {
- std::fill(P.incoming_values().begin(), P.incoming_values().end(),
- UndefValue::get(P.getType()));
- }
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "NeverExecutes", L->getStartLoc(),
- L->getHeader())
- << "Loop deleted because it never executes";
- });
- deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
- ++NumDeleted;
- return LoopDeletionResult::Deleted;
- }
-
- // The remaining checks below are for a loop being dead because all statements
- // in the loop are invariant.
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
+ // Set incoming value to undef for phi nodes in the exit block.
+ for (PHINode &P : ExitBlock->phis()) {
+ std::fill(P.incoming_values().begin(), P.incoming_values().end(),
+ UndefValue::get(P.getType()));
+ }
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "NeverExecutes", L->getStartLoc(),
+ L->getHeader())
+ << "Loop deleted because it never executes";
+ });
+ deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
+ ++NumDeleted;
+ return LoopDeletionResult::Deleted;
+ }
+
+ // The remaining checks below are for a loop being dead because all statements
+ // in the loop are invariant.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
// We require that the loop has at most one exit block. Otherwise, we'd be in
// the situation of needing to be able to solve statically which exit block
// will be branched to, or trying to preserve the branching logic in a loop
// invariant manner.
if (!ExitBlock && !L->hasNoExitBlocks()) {
LLVM_DEBUG(dbgs() << "Deletion requires at most one exit block.\n");
- return LoopDeletionResult::Unmodified;
- }
- // Finally, we have to check that the loop really is dead.
- bool Changed = false;
- if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
- LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
- return Changed ? LoopDeletionResult::Modified
- : LoopDeletionResult::Unmodified;
- }
-
+ return LoopDeletionResult::Unmodified;
+ }
+ // Finally, we have to check that the loop really is dead.
+ bool Changed = false;
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
+ LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
+ }
+
// Don't remove loops for which we can't solve the trip count unless the loop
// was required to make progress but has been determined to be dead.
- const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L);
+ const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S) &&
!L->getHeader()->getParent()->mustProgress() && !hasMustProgress(L)) {
LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount and was "
"not required to make progress.\n");
- return Changed ? LoopDeletionResult::Modified
- : LoopDeletionResult::Unmodified;
- }
-
- LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Invariant", L->getStartLoc(),
- L->getHeader())
- << "Loop deleted because it is invariant";
- });
- deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
- ++NumDeleted;
-
- return LoopDeletionResult::Deleted;
-}
-
-PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &Updater) {
-
- LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- LLVM_DEBUG(L.dump());
- std::string LoopName = std::string(L.getName());
- // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
- auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE);
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
+ }
+
+ LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Invariant", L->getStartLoc(),
+ L->getHeader())
+ << "Loop deleted because it is invariant";
+ });
+ deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
+ ++NumDeleted;
+
+ return LoopDeletionResult::Deleted;
+}
+
+PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L.dump());
+ std::string LoopName = std::string(L.getName());
+ // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+ auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE);
// If we can prove the backedge isn't taken, just break it and be done. This
// leaves the loop structure in place which means it can handle dispatching
@@ -273,73 +273,73 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
Result = merge(Result, breakBackedgeIfNotTaken(&L, AR.DT, AR.SE, AR.LI,
AR.MSSA, ORE));
- if (Result == LoopDeletionResult::Unmodified)
- return PreservedAnalyses::all();
-
- if (Result == LoopDeletionResult::Deleted)
- Updater.markLoopAsDeleted(L, LoopName);
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-class LoopDeletionLegacyPass : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- LoopDeletionLegacyPass() : LoopPass(ID) {
- initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // Possibly eliminate loop L if it is dead.
- bool runOnLoop(Loop *L, LPPassManager &) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-}
-
-char LoopDeletionLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion",
- "Delete dead loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
- "Delete dead loops", false, false)
-
-Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
-
-bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
- return false;
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
-
- LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- LLVM_DEBUG(L->dump());
-
- LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI, MSSA, ORE);
-
+ if (Result == LoopDeletionResult::Unmodified)
+ return PreservedAnalyses::all();
+
+ if (Result == LoopDeletionResult::Deleted)
+ Updater.markLoopAsDeleted(L, LoopName);
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+class LoopDeletionLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDeletionLegacyPass() : LoopPass(ID) {
+ initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Possibly eliminate loop L if it is dead.
+ bool runOnLoop(Loop *L, LPPassManager &) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopDeletionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
+
+bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L->dump());
+
+ LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI, MSSA, ORE);
+
// If we can prove the backedge isn't taken, just break it and be done. This
// leaves the loop structure in place which means it can handle dispatching
// to the right exit based on whatever loop invariant structure remains.
if (Result != LoopDeletionResult::Deleted)
Result = merge(Result, breakBackedgeIfNotTaken(L, DT, SE, LI, MSSA, ORE));
- if (Result == LoopDeletionResult::Deleted)
- LPM.markLoopAsDeleted(*L);
-
- return Result != LoopDeletionResult::Unmodified;
-}
+ if (Result == LoopDeletionResult::Deleted)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopDeletionResult::Unmodified;
+}
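Taken together, deleteLoopIfDead() above only fires when the loop has a preheader and dedicated exits, at most one exit block, exit PHIs whose incoming values are loop-invariant and identical across exiting blocks, no side-effecting instructions, and either a computable max backedge-taken count or a must-progress function/loop. A hedged source-level sketch of a loop that fits that shape (other passes in a full -O2 pipeline may well remove it first):

// Illustrative sketch, not from this source: the loop has a computable trip
// count, no side effects, and nothing it computes reaches the return value,
// so the loop-deletion pass can drop it entirely.
static int ignoresItsLoop(int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += I;   // Sum never escapes and is not returned.
  (void)Sum;    // Silence unused-variable warnings; emits no IR.
  return 0;     // The loop does not affect the observable result.
}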
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp
index d4b83c0fc3..1bd2529891 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -1,1088 +1,1088 @@
-//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Loop Distribution Pass. Its main focus is to
-// distribute loops that cannot be vectorized due to dependence cycles. It
-// tries to isolate the offending dependences into a new loop allowing
-// vectorization of the remaining parts.
-//
-// For dependence analysis, the pass uses the LoopVectorizer's
-// LoopAccessAnalysis. Because this analysis presumes no change in the order of
-// memory operations, special care is taken to preserve the lexical order of
-// these operations.
-//
-// Similarly to the Vectorizer, the pass also supports loop versioning to
-// run-time disambiguate potentially overlapping arrays.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopDistribute.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <cassert>
-#include <functional>
-#include <list>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define LDIST_NAME "loop-distribute"
-#define DEBUG_TYPE LDIST_NAME
-
-/// @{
-/// Metadata attribute names
-static const char *const LLVMLoopDistributeFollowupAll =
- "llvm.loop.distribute.followup_all";
-static const char *const LLVMLoopDistributeFollowupCoincident =
- "llvm.loop.distribute.followup_coincident";
-static const char *const LLVMLoopDistributeFollowupSequential =
- "llvm.loop.distribute.followup_sequential";
-static const char *const LLVMLoopDistributeFollowupFallback =
- "llvm.loop.distribute.followup_fallback";
-/// @}
-
-static cl::opt<bool>
- LDistVerify("loop-distribute-verify", cl::Hidden,
- cl::desc("Turn on DominatorTree and LoopInfo verification "
- "after Loop Distribution"),
- cl::init(false));
-
-static cl::opt<bool> DistributeNonIfConvertible(
- "loop-distribute-non-if-convertible", cl::Hidden,
- cl::desc("Whether to distribute into a loop that may not be "
- "if-convertible by the loop vectorizer"),
- cl::init(false));
-
-static cl::opt<unsigned> DistributeSCEVCheckThreshold(
- "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed for Loop "
- "Distribution"));
-
-static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
- "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
- cl::Hidden,
- cl::desc(
- "The maximum number of SCEV checks allowed for Loop "
- "Distribution for loop marked with #pragma loop distribute(enable)"));
-
-static cl::opt<bool> EnableLoopDistribute(
- "enable-loop-distribute", cl::Hidden,
- cl::desc("Enable the new, experimental LoopDistribution Pass"),
- cl::init(false));
-
-STATISTIC(NumLoopsDistributed, "Number of loops distributed");
-
-namespace {
-
-/// Maintains the set of instructions of the loop for a partition before
-/// cloning. After cloning, it hosts the new loop.
-class InstPartition {
- using InstructionSet = SmallPtrSet<Instruction *, 8>;
-
-public:
- InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
- : DepCycle(DepCycle), OrigLoop(L) {
- Set.insert(I);
- }
-
- /// Returns whether this partition contains a dependence cycle.
- bool hasDepCycle() const { return DepCycle; }
-
- /// Adds an instruction to this partition.
- void add(Instruction *I) { Set.insert(I); }
-
- /// Collection accessors.
- InstructionSet::iterator begin() { return Set.begin(); }
- InstructionSet::iterator end() { return Set.end(); }
- InstructionSet::const_iterator begin() const { return Set.begin(); }
- InstructionSet::const_iterator end() const { return Set.end(); }
- bool empty() const { return Set.empty(); }
-
- /// Moves this partition into \p Other. This partition becomes empty
- /// after this.
- void moveTo(InstPartition &Other) {
- Other.Set.insert(Set.begin(), Set.end());
- Set.clear();
- Other.DepCycle |= DepCycle;
- }
-
- /// Populates the partition with a transitive closure of all the
- /// instructions that the seeded instructions depend on.
- void populateUsedSet() {
- // FIXME: We currently don't use control-dependence but simply include all
- // blocks (possibly empty at the end) and let simplifycfg mostly clean this
- // up.
- for (auto *B : OrigLoop->getBlocks())
- Set.insert(B->getTerminator());
-
- // Follow the use-def chains to form a transitive closure of all the
- // instructions that the originally seeded instructions depend on.
- SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
- while (!Worklist.empty()) {
- Instruction *I = Worklist.pop_back_val();
- // Insert instructions from the loop that we depend on.
- for (Value *V : I->operand_values()) {
- auto *I = dyn_cast<Instruction>(V);
- if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
- Worklist.push_back(I);
- }
- }
- }
-
- /// Clones the original loop.
- ///
- /// Updates LoopInfo and DominatorTree using the information that block \p
- /// LoopDomBB dominates the loop.
- Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
- unsigned Index, LoopInfo *LI,
- DominatorTree *DT) {
- ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
- VMap, Twine(".ldist") + Twine(Index),
- LI, DT, ClonedLoopBlocks);
- return ClonedLoop;
- }
-
- /// The cloned loop. If this partition is mapped to the original loop,
- /// this is null.
- const Loop *getClonedLoop() const { return ClonedLoop; }
-
- /// Returns the loop where this partition ends up after distribution.
- /// If this partition is mapped to the original loop then use the block from
- /// the loop.
- Loop *getDistributedLoop() const {
- return ClonedLoop ? ClonedLoop : OrigLoop;
- }
-
- /// The VMap that is populated by cloning and then used in
- /// remapinstruction to remap the cloned instructions.
- ValueToValueMapTy &getVMap() { return VMap; }
-
- /// Remaps the cloned instructions using VMap.
- void remapInstructions() {
- remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
- }
-
- /// Based on the set of instructions selected for this partition,
- /// removes the unnecessary ones.
- void removeUnusedInsts() {
- SmallVector<Instruction *, 8> Unused;
-
- for (auto *Block : OrigLoop->getBlocks())
- for (auto &Inst : *Block)
- if (!Set.count(&Inst)) {
- Instruction *NewInst = &Inst;
- if (!VMap.empty())
- NewInst = cast<Instruction>(VMap[NewInst]);
-
- assert(!isa<BranchInst>(NewInst) &&
- "Branches are marked used early on");
- Unused.push_back(NewInst);
- }
-
- // Delete the instructions backwards, as it has a reduced likelihood of
- // having to update as many def-use and use-def chains.
- for (auto *Inst : reverse(Unused)) {
- if (!Inst->use_empty())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- Inst->eraseFromParent();
- }
- }
-
- void print() const {
- if (DepCycle)
- dbgs() << " (cycle)\n";
- for (auto *I : Set)
- // Prefix with the block name.
- dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
- }
-
- void printBlocks() const {
- for (auto *BB : getDistributedLoop()->getBlocks())
- dbgs() << *BB;
- }
-
-private:
- /// Instructions from OrigLoop selected for this partition.
- InstructionSet Set;
-
- /// Whether this partition contains a dependence cycle.
- bool DepCycle;
-
- /// The original loop.
- Loop *OrigLoop;
-
- /// The cloned loop. If this partition is mapped to the original loop,
- /// this is null.
- Loop *ClonedLoop = nullptr;
-
- /// The blocks of ClonedLoop including the preheader. If this
- /// partition is mapped to the original loop, this is empty.
- SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
-
- /// This gets populated once the set of instructions has been
- /// finalized. If this partition is mapped to the original loop, it is not
- /// set.
- ValueToValueMapTy VMap;
-};
-
-/// Holds the set of Partitions. It populates them, merges them and then
-/// clones the loops.
-class InstPartitionContainer {
- using InstToPartitionIdT = DenseMap<Instruction *, int>;
-
-public:
- InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
- : L(L), LI(LI), DT(DT) {}
-
- /// Returns the number of partitions.
- unsigned getSize() const { return PartitionContainer.size(); }
-
- /// Adds \p Inst into the current partition if that is marked to
- /// contain cycles. Otherwise start a new partition for it.
- void addToCyclicPartition(Instruction *Inst) {
- // If the current partition is non-cyclic. Start a new one.
- if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
- PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
- else
- PartitionContainer.back().add(Inst);
- }
-
- /// Adds \p Inst into a partition that is not marked to contain
- /// dependence cycles.
- ///
- // Initially we isolate memory instructions into as many partitions as
- // possible, then later we may merge them back together.
- void addToNewNonCyclicPartition(Instruction *Inst) {
- PartitionContainer.emplace_back(Inst, L);
- }
-
- /// Merges adjacent non-cyclic partitions.
- ///
- /// The idea is that we currently only want to isolate the non-vectorizable
- /// partition. We could later allow more distribution among these partition
- /// too.
- void mergeAdjacentNonCyclic() {
- mergeAdjacentPartitionsIf(
- [](const InstPartition *P) { return !P->hasDepCycle(); });
- }
-
- /// If a partition contains only conditional stores, we won't vectorize
- /// it. Try to merge it with a previous cyclic partition.
- void mergeNonIfConvertible() {
- mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
- if (Partition->hasDepCycle())
- return true;
-
- // Now, check if all stores are conditional in this partition.
- bool seenStore = false;
-
- for (auto *Inst : *Partition)
- if (isa<StoreInst>(Inst)) {
- seenStore = true;
- if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
- return false;
- }
- return seenStore;
- });
- }
-
- /// Merges the partitions according to various heuristics.
- void mergeBeforePopulating() {
- mergeAdjacentNonCyclic();
- if (!DistributeNonIfConvertible)
- mergeNonIfConvertible();
- }
-
- /// Merges partitions in order to ensure that no loads are duplicated.
- ///
- /// We can't duplicate loads because that could potentially reorder them.
- /// LoopAccessAnalysis provides dependency information with the context that
- /// the order of memory operation is preserved.
- ///
- /// Return if any partitions were merged.
- bool mergeToAvoidDuplicatedLoads() {
- using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
- using ToBeMergedT = EquivalenceClasses<InstPartition *>;
-
- LoadToPartitionT LoadToPartition;
- ToBeMergedT ToBeMerged;
-
- // Step through the partitions and create equivalence between partitions
- // that contain the same load. Also put partitions in between them in the
- // same equivalence class to avoid reordering of memory operations.
- for (PartitionContainerT::iterator I = PartitionContainer.begin(),
- E = PartitionContainer.end();
- I != E; ++I) {
- auto *PartI = &*I;
-
- // If a load occurs in two partitions PartI and PartJ, merge all
- // partitions (PartI, PartJ] into PartI.
- for (Instruction *Inst : *PartI)
- if (isa<LoadInst>(Inst)) {
- bool NewElt;
- LoadToPartitionT::iterator LoadToPart;
-
- std::tie(LoadToPart, NewElt) =
- LoadToPartition.insert(std::make_pair(Inst, PartI));
- if (!NewElt) {
- LLVM_DEBUG(dbgs()
- << "Merging partitions due to this load in multiple "
- << "partitions: " << PartI << ", " << LoadToPart->second
- << "\n"
- << *Inst << "\n");
-
- auto PartJ = I;
- do {
- --PartJ;
- ToBeMerged.unionSets(PartI, &*PartJ);
- } while (&*PartJ != LoadToPart->second);
- }
- }
- }
- if (ToBeMerged.empty())
- return false;
-
- // Merge the member of an equivalence class into its class leader. This
- // makes the members empty.
- for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
- I != E; ++I) {
- if (!I->isLeader())
- continue;
-
- auto PartI = I->getData();
- for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
- ToBeMerged.member_end())) {
- PartJ->moveTo(*PartI);
- }
- }
-
- // Remove the empty partitions.
- PartitionContainer.remove_if(
- [](const InstPartition &P) { return P.empty(); });
-
- return true;
- }
-
- /// Sets up the mapping from instructions to partitions. If the
- /// instruction is duplicated across multiple partitions, set the entry to -1.
- void setupPartitionIdOnInstructions() {
- int PartitionID = 0;
- for (const auto &Partition : PartitionContainer) {
- for (Instruction *Inst : Partition) {
- bool NewElt;
- InstToPartitionIdT::iterator Iter;
-
- std::tie(Iter, NewElt) =
- InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
- if (!NewElt)
- Iter->second = -1;
- }
- ++PartitionID;
- }
- }
-
- /// Populates the partition with everything that the seeding
- /// instructions require.
- void populateUsedSet() {
- for (auto &P : PartitionContainer)
- P.populateUsedSet();
- }
-
- /// This performs the main chunk of the work of cloning the loops for
- /// the partitions.
- void cloneLoops() {
- BasicBlock *OrigPH = L->getLoopPreheader();
- // At this point the predecessor of the preheader is either the memcheck
- // block or the top part of the original preheader.
- BasicBlock *Pred = OrigPH->getSinglePredecessor();
- assert(Pred && "Preheader does not have a single predecessor");
- BasicBlock *ExitBlock = L->getExitBlock();
- assert(ExitBlock && "No single exit block");
- Loop *NewLoop;
-
- assert(!PartitionContainer.empty() && "at least two partitions expected");
- // We're cloning the preheader along with the loop so we already made sure
- // it was empty.
- assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
- "preheader not empty");
-
- // Preserve the original loop ID for use after the transformation.
- MDNode *OrigLoopID = L->getLoopID();
-
- // Create a loop for each partition except the last. Clone the original
- // loop before PH and add a preheader for the cloned loop. Then
- // update PH to point to the newly added preheader.
- BasicBlock *TopPH = OrigPH;
- unsigned Index = getSize() - 1;
- for (auto I = std::next(PartitionContainer.rbegin()),
- E = PartitionContainer.rend();
- I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
- auto *Part = &*I;
-
- NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
-
- Part->getVMap()[ExitBlock] = TopPH;
- Part->remapInstructions();
- setNewLoopID(OrigLoopID, Part);
- }
- Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
-
- // Also set a new loop ID for the last loop.
- setNewLoopID(OrigLoopID, &PartitionContainer.back());
-
- // Now go in forward order and update the immediate dominator for the
- // preheaders with the exiting block of the previous loop. Dominance
- // within the loop is updated in cloneLoopWithPreheader.
- for (auto Curr = PartitionContainer.cbegin(),
- Next = std::next(PartitionContainer.cbegin()),
- E = PartitionContainer.cend();
- Next != E; ++Curr, ++Next)
- DT->changeImmediateDominator(
- Next->getDistributedLoop()->getLoopPreheader(),
- Curr->getDistributedLoop()->getExitingBlock());
- }
-
- /// Removes the dead instructions from the cloned loops.
- void removeUnusedInsts() {
- for (auto &Partition : PartitionContainer)
- Partition.removeUnusedInsts();
- }
-
- /// For each memory pointer, it computes the partitionId the pointer is
- /// used in.
- ///
- /// This returns an array of int where the I-th entry corresponds to the I-th
- /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in multiple
- /// partitions its entry is set to -1.
- SmallVector<int, 8>
- computePartitionSetForPointers(const LoopAccessInfo &LAI) {
- const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
-
- unsigned N = RtPtrCheck->Pointers.size();
- SmallVector<int, 8> PtrToPartitions(N);
- for (unsigned I = 0; I < N; ++I) {
- Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
- auto Instructions =
- LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
-
- int &Partition = PtrToPartitions[I];
- // First set it to uninitialized.
- Partition = -2;
- for (Instruction *Inst : Instructions) {
- // Note that this could be -1 if Inst is duplicated across multiple
- // partitions.
- int ThisPartition = this->InstToPartitionId[Inst];
- if (Partition == -2)
- Partition = ThisPartition;
- // -1 means belonging to multiple partitions.
- else if (Partition == -1)
- break;
- else if (Partition != (int)ThisPartition)
- Partition = -1;
- }
- assert(Partition != -2 && "Pointer not belonging to any partition");
- }
-
- return PtrToPartitions;
- }
-
- void print(raw_ostream &OS) const {
- unsigned Index = 0;
- for (const auto &P : PartitionContainer) {
- OS << "Partition " << Index++ << " (" << &P << "):\n";
- P.print();
- }
- }
-
- void dump() const { print(dbgs()); }
-
-#ifndef NDEBUG
- friend raw_ostream &operator<<(raw_ostream &OS,
- const InstPartitionContainer &Partitions) {
- Partitions.print(OS);
- return OS;
- }
-#endif
-
- void printBlocks() const {
- unsigned Index = 0;
- for (const auto &P : PartitionContainer) {
- dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
- P.printBlocks();
- }
- }
-
-private:
- using PartitionContainerT = std::list<InstPartition>;
-
- /// List of partitions.
- PartitionContainerT PartitionContainer;
-
- /// Mapping from Instruction to partition Id. If the instruction
- /// belongs to multiple partitions the entry contains -1.
- InstToPartitionIdT InstToPartitionId;
-
- Loop *L;
- LoopInfo *LI;
- DominatorTree *DT;
-
- /// The control structure to merge adjacent partitions if both satisfy
- /// the \p Predicate.
- template <class UnaryPredicate>
- void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
- InstPartition *PrevMatch = nullptr;
- for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
- auto DoesMatch = Predicate(&*I);
- if (PrevMatch == nullptr && DoesMatch) {
- PrevMatch = &*I;
- ++I;
- } else if (PrevMatch != nullptr && DoesMatch) {
- I->moveTo(*PrevMatch);
- I = PartitionContainer.erase(I);
- } else {
- PrevMatch = nullptr;
- ++I;
- }
- }
- }
-
- /// Assign new LoopIDs for the partition's cloned loop.
- void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
- Optional<MDNode *> PartitionID = makeFollowupLoopID(
- OrigLoopID,
- {LLVMLoopDistributeFollowupAll,
- Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
- : LLVMLoopDistributeFollowupCoincident});
- if (PartitionID.hasValue()) {
- Loop *NewLoop = Part->getDistributedLoop();
- NewLoop->setLoopID(PartitionID.getValue());
- }
- }
-};
-
-/// For each memory instruction, this class maintains the difference between the
-/// number of unsafe dependences that start at this instruction and the number
-/// that end here.
-///
-/// By traversing the memory instructions in program order and accumulating this
-/// number, we know whether any unsafe dependence crosses over a program point.
-class MemoryInstructionDependences {
- using Dependence = MemoryDepChecker::Dependence;
-
-public:
- struct Entry {
- Instruction *Inst;
- unsigned NumUnsafeDependencesStartOrEnd = 0;
-
- Entry(Instruction *Inst) : Inst(Inst) {}
- };
-
- using AccessesType = SmallVector<Entry, 8>;
-
- AccessesType::const_iterator begin() const { return Accesses.begin(); }
- AccessesType::const_iterator end() const { return Accesses.end(); }
-
- MemoryInstructionDependences(
- const SmallVectorImpl<Instruction *> &Instructions,
- const SmallVectorImpl<Dependence> &Dependences) {
- Accesses.append(Instructions.begin(), Instructions.end());
-
- LLVM_DEBUG(dbgs() << "Backward dependences:\n");
- for (auto &Dep : Dependences)
- if (Dep.isPossiblyBackward()) {
- // Note that the designations source and destination follow the program
- // order, i.e. source is always first. (The direction is given by the
- // DepType.)
- ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
- --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
-
- LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
- }
- }
-
-private:
- AccessesType Accesses;
-};
-
-/// The actual class performing the per-loop work.
-class LoopDistributeForLoop {
-public:
- LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
- : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
- setForced();
- }
-
- /// Try to distribute an inner-most loop.
- bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Distribution Pass. Its main focus is to
+// distribute loops that cannot be vectorized due to dependence cycles. It
+// tries to isolate the offending dependences into a new loop allowing
+// vectorization of the remaining parts.
+//
+// For dependence analysis, the pass uses the LoopVectorizer's
+// LoopAccessAnalysis. Because this analysis presumes no change in the order of
+// memory operations, special care is taken to preserve the lexical order of
+// these operations.
+//
+// Similarly to the Vectorizer, the pass also supports loop versioning to
+// disambiguate potentially overlapping arrays at run time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <functional>
+#include <list>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LDIST_NAME "loop-distribute"
+#define DEBUG_TYPE LDIST_NAME
+
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopDistributeFollowupAll =
+ "llvm.loop.distribute.followup_all";
+static const char *const LLVMLoopDistributeFollowupCoincident =
+ "llvm.loop.distribute.followup_coincident";
+static const char *const LLVMLoopDistributeFollowupSequential =
+ "llvm.loop.distribute.followup_sequential";
+static const char *const LLVMLoopDistributeFollowupFallback =
+ "llvm.loop.distribute.followup_fallback";
+/// @}
+
+static cl::opt<bool>
+ LDistVerify("loop-distribute-verify", cl::Hidden,
+ cl::desc("Turn on DominatorTree and LoopInfo verification "
+ "after Loop Distribution"),
+ cl::init(false));
+
+static cl::opt<bool> DistributeNonIfConvertible(
+ "loop-distribute-non-if-convertible", cl::Hidden,
+ cl::desc("Whether to distribute into a loop that may not be "
+ "if-convertible by the loop vectorizer"),
+ cl::init(false));
+
+static cl::opt<unsigned> DistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Distribution"));
+
+static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
+ cl::Hidden,
+ cl::desc(
+ "The maximum number of SCEV checks allowed for Loop "
+ "Distribution for loop marked with #pragma loop distribute(enable)"));
+
+static cl::opt<bool> EnableLoopDistribute(
+ "enable-loop-distribute", cl::Hidden,
+ cl::desc("Enable the new, experimental LoopDistribution Pass"),
+ cl::init(false));
+
+STATISTIC(NumLoopsDistributed, "Number of loops distributed");
+
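As a minimal source-level sketch of the kind of loop this pass targets (the function and array names below are invented for illustration, and the arrays are assumed to be disjoint): the first statement carries a loop-carried dependence and cannot be vectorized, while the second is independent; distribution splits them into two loops so that the second can be vectorized. The pragma is the Clang spelling behind the "#pragma loop distribute(enable)" text in the option description above; it is what ends up as the "llvm.loop.distribute.enable" loop metadata consulted by setForced() further down.

  void example(int N, float *A, float *B, float *D, float *E) {
  #pragma clang loop distribute(enable)
    for (int i = 0; i < N - 1; ++i) {
      A[i + 1] = A[i] + B[i]; // dependence cycle: stays in a sequential loop
      D[i] = E[i] + B[i];     // independent: can be vectorized after distribution
    }
  }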
+namespace {
+
+/// Maintains the set of instructions of the loop for a partition before
+/// cloning. After cloning, it hosts the new loop.
+class InstPartition {
+ using InstructionSet = SmallPtrSet<Instruction *, 8>;
+
+public:
+ InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
+ : DepCycle(DepCycle), OrigLoop(L) {
+ Set.insert(I);
+ }
+
+ /// Returns whether this partition contains a dependence cycle.
+ bool hasDepCycle() const { return DepCycle; }
+
+ /// Adds an instruction to this partition.
+ void add(Instruction *I) { Set.insert(I); }
+
+ /// Collection accessors.
+ InstructionSet::iterator begin() { return Set.begin(); }
+ InstructionSet::iterator end() { return Set.end(); }
+ InstructionSet::const_iterator begin() const { return Set.begin(); }
+ InstructionSet::const_iterator end() const { return Set.end(); }
+ bool empty() const { return Set.empty(); }
+
+ /// Moves this partition into \p Other. This partition becomes empty
+ /// after this.
+ void moveTo(InstPartition &Other) {
+ Other.Set.insert(Set.begin(), Set.end());
+ Set.clear();
+ Other.DepCycle |= DepCycle;
+ }
+
+ /// Populates the partition with a transitive closure of all the
+ /// instructions that the seeded instructions depend on.
+ void populateUsedSet() {
+ // FIXME: We currently don't use control-dependence but simply include all
+ // blocks (possibly empty at the end) and let simplifycfg mostly clean this
+ // up.
+ for (auto *B : OrigLoop->getBlocks())
+ Set.insert(B->getTerminator());
+
+ // Follow the use-def chains to form a transitive closure of all the
+ // instructions that the originally seeded instructions depend on.
+ SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ // Insert instructions from the loop that we depend on.
+ for (Value *V : I->operand_values()) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
+ Worklist.push_back(I);
+ }
+ }
+ }
+
+ /// Clones the original loop.
+ ///
+ /// Updates LoopInfo and DominatorTree using the information that block \p
+ /// LoopDomBB dominates the loop.
+ Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
+ unsigned Index, LoopInfo *LI,
+ DominatorTree *DT) {
+ ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
+ VMap, Twine(".ldist") + Twine(Index),
+ LI, DT, ClonedLoopBlocks);
+ return ClonedLoop;
+ }
+
+ /// The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ const Loop *getClonedLoop() const { return ClonedLoop; }
+
+ /// Returns the loop where this partition ends up after distribution.
+ /// If this partition is mapped to the original loop, the original loop is
+ /// returned.
+ Loop *getDistributedLoop() const {
+ return ClonedLoop ? ClonedLoop : OrigLoop;
+ }
+
+ /// The VMap that is populated by cloning and then used by
+ /// remapInstructions() to remap the cloned instructions.
+ ValueToValueMapTy &getVMap() { return VMap; }
+
+ /// Remaps the cloned instructions using VMap.
+ void remapInstructions() {
+ remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+ }
+
+ /// Based on the set of instructions selected for this partition,
+ /// removes the unnecessary ones.
+ void removeUnusedInsts() {
+ SmallVector<Instruction *, 8> Unused;
+
+ for (auto *Block : OrigLoop->getBlocks())
+ for (auto &Inst : *Block)
+ if (!Set.count(&Inst)) {
+ Instruction *NewInst = &Inst;
+ if (!VMap.empty())
+ NewInst = cast<Instruction>(VMap[NewInst]);
+
+ assert(!isa<BranchInst>(NewInst) &&
+ "Branches are marked used early on");
+ Unused.push_back(NewInst);
+ }
+
+ // Delete the instructions backwards, as this reduces the number of def-use
+ // and use-def chain updates that are needed.
+ for (auto *Inst : reverse(Unused)) {
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ Inst->eraseFromParent();
+ }
+ }
+
+ void print() const {
+ if (DepCycle)
+ dbgs() << " (cycle)\n";
+ for (auto *I : Set)
+ // Prefix with the block name.
+ dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
+ }
+
+ void printBlocks() const {
+ for (auto *BB : getDistributedLoop()->getBlocks())
+ dbgs() << *BB;
+ }
+
+private:
+ /// Instructions from OrigLoop selected for this partition.
+ InstructionSet Set;
+
+ /// Whether this partition contains a dependence cycle.
+ bool DepCycle;
+
+ /// The original loop.
+ Loop *OrigLoop;
+
+ /// The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ Loop *ClonedLoop = nullptr;
+
+ /// The blocks of ClonedLoop including the preheader. If this
+ /// partition is mapped to the original loop, this is empty.
+ SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
+
+ /// This gets populated once the set of instructions has been
+ /// finalized. If this partition is mapped to the original loop, it is not
+ /// set.
+ ValueToValueMapTy VMap;
+};
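A condensed sketch of how the container below drives a single partition; it mirrors cloneLoops() and removeUnusedInsts() further down, with SeedInst, Pred, NextPH, ExitBlock and Idx standing in for values the container supplies:

  InstPartition Part(SeedInst, L, /*DepCycle=*/true); // seed with one instruction
  Part.populateUsedSet();                             // pull in everything the seed needs
  Loop *NewL = Part.cloneLoopWithPreheader(NextPH, Pred, Idx, LI, DT);
  Part.getVMap()[ExitBlock] = NextPH;                 // cloned loop falls through to the next loop
  Part.remapInstructions();
  Part.removeUnusedInsts();                           // drop instructions this partition does not own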
+
+/// Holds the set of Partitions. It populates them, merges them and then
+/// clones the loops.
+class InstPartitionContainer {
+ using InstToPartitionIdT = DenseMap<Instruction *, int>;
+
+public:
+ InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
+ : L(L), LI(LI), DT(DT) {}
+
+ /// Returns the number of partitions.
+ unsigned getSize() const { return PartitionContainer.size(); }
+
+ /// Adds \p Inst into the current partition if that partition is marked to
+ /// contain cycles; otherwise starts a new partition for it.
+ void addToCyclicPartition(Instruction *Inst) {
+ // If the current partition is non-cyclic, start a new one.
+ if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
+ PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
+ else
+ PartitionContainer.back().add(Inst);
+ }
+
+ /// Adds \p Inst into a partition that is not marked to contain
+ /// dependence cycles.
+ ///
+ // Initially we isolate memory instructions into as many partitions as
+ // possible, then later we may merge them back together.
+ void addToNewNonCyclicPartition(Instruction *Inst) {
+ PartitionContainer.emplace_back(Inst, L);
+ }
+
+ /// Merges adjacent non-cyclic partitions.
+ ///
+ /// The idea is that we currently only want to isolate the non-vectorizable
+ /// partition. We could later allow more distribution among these partitions
+ /// too.
+ void mergeAdjacentNonCyclic() {
+ mergeAdjacentPartitionsIf(
+ [](const InstPartition *P) { return !P->hasDepCycle(); });
+ }
+
+ /// If a partition contains only conditional stores, we won't vectorize
+ /// it. Try to merge it with a previous cyclic partition.
+ void mergeNonIfConvertible() {
+ mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
+ if (Partition->hasDepCycle())
+ return true;
+
+ // Now, check if all stores are conditional in this partition.
+ bool seenStore = false;
+
+ for (auto *Inst : *Partition)
+ if (isa<StoreInst>(Inst)) {
+ seenStore = true;
+ if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
+ return false;
+ }
+ return seenStore;
+ });
+ }
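A hedged example of the case this heuristic guards against (Mask, Out and In are invented names): a partition whose only store is conditional would force the vectorizer to if-convert it, so unless -loop-distribute-non-if-convertible is set, such a partition is merged with an adjacent matching partition rather than isolated.

  for (int i = 0; i < N; ++i)
    if (Mask[i])      // every store in this partition is predicated
      Out[i] = In[i];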
+
+ /// Merges the partitions according to various heuristics.
+ void mergeBeforePopulating() {
+ mergeAdjacentNonCyclic();
+ if (!DistributeNonIfConvertible)
+ mergeNonIfConvertible();
+ }
+
+ /// Merges partitions in order to ensure that no loads are duplicated.
+ ///
+ /// We can't duplicate loads because that could potentially reorder them.
+ /// LoopAccessAnalysis provides dependency information with the context that
+ /// the order of memory operations is preserved.
+ ///
+ /// Returns true if any partitions were merged.
+ bool mergeToAvoidDuplicatedLoads() {
+ using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
+ using ToBeMergedT = EquivalenceClasses<InstPartition *>;
+
+ LoadToPartitionT LoadToPartition;
+ ToBeMergedT ToBeMerged;
+
+ // Step through the partitions and create equivalence between partitions
+ // that contain the same load. Also put partitions in between them in the
+ // same equivalence class to avoid reordering of memory operations.
+ for (PartitionContainerT::iterator I = PartitionContainer.begin(),
+ E = PartitionContainer.end();
+ I != E; ++I) {
+ auto *PartI = &*I;
+
+ // If a load occurs in two partitions PartI and PartJ, merge all
+ // partitions (PartI, PartJ] into PartI.
+ for (Instruction *Inst : *PartI)
+ if (isa<LoadInst>(Inst)) {
+ bool NewElt;
+ LoadToPartitionT::iterator LoadToPart;
+
+ std::tie(LoadToPart, NewElt) =
+ LoadToPartition.insert(std::make_pair(Inst, PartI));
+ if (!NewElt) {
+ LLVM_DEBUG(dbgs()
+ << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", " << LoadToPart->second
+ << "\n"
+ << *Inst << "\n");
+
+ auto PartJ = I;
+ do {
+ --PartJ;
+ ToBeMerged.unionSets(PartI, &*PartJ);
+ } while (&*PartJ != LoadToPart->second);
+ }
+ }
+ }
+ if (ToBeMerged.empty())
+ return false;
+
+ // Merge the members of an equivalence class into its class leader. This
+ // makes the members empty.
+ for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+
+ auto PartI = I->getData();
+ for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
+ ToBeMerged.member_end())) {
+ PartJ->moveTo(*PartI);
+ }
+ }
+
+ // Remove the empty partitions.
+ PartitionContainer.remove_if(
+ [](const InstPartition &P) { return P.empty(); });
+
+ return true;
+ }
+
+ /// Sets up the mapping from instructions to partitions. If the
+ /// instruction is duplicated across multiple partitions, set the entry to -1.
+ void setupPartitionIdOnInstructions() {
+ int PartitionID = 0;
+ for (const auto &Partition : PartitionContainer) {
+ for (Instruction *Inst : Partition) {
+ bool NewElt;
+ InstToPartitionIdT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
+ if (!NewElt)
+ Iter->second = -1;
+ }
+ ++PartitionID;
+ }
+ }
+
+ /// Populates the partition with everything that the seeding
+ /// instructions require.
+ void populateUsedSet() {
+ for (auto &P : PartitionContainer)
+ P.populateUsedSet();
+ }
+
+ /// This performs the main chunk of the work of cloning the loops for
+ /// the partitions.
+ void cloneLoops() {
+ BasicBlock *OrigPH = L->getLoopPreheader();
+ // At this point the predecessor of the preheader is either the memcheck
+ // block or the top part of the original preheader.
+ BasicBlock *Pred = OrigPH->getSinglePredecessor();
+ assert(Pred && "Preheader does not have a single predecessor");
+ BasicBlock *ExitBlock = L->getExitBlock();
+ assert(ExitBlock && "No single exit block");
+ Loop *NewLoop;
+
+ assert(!PartitionContainer.empty() && "at least two partitions expected");
+ // We're cloning the preheader along with the loop so we already made sure
+ // it was empty.
+ assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
+ "preheader not empty");
+
+ // Preserve the original loop ID for use after the transformation.
+ MDNode *OrigLoopID = L->getLoopID();
+
+ // Create a loop for each partition except the last. Clone the original
+ // loop before PH and add a preheader for the cloned loop. Then
+ // update PH to point to the newly added preheader.
+ BasicBlock *TopPH = OrigPH;
+ unsigned Index = getSize() - 1;
+ for (auto I = std::next(PartitionContainer.rbegin()),
+ E = PartitionContainer.rend();
+ I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
+ auto *Part = &*I;
+
+ NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
+
+ Part->getVMap()[ExitBlock] = TopPH;
+ Part->remapInstructions();
+ setNewLoopID(OrigLoopID, Part);
+ }
+ Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
+
+ // Also set a new loop ID for the last loop.
+ setNewLoopID(OrigLoopID, &PartitionContainer.back());
+
+ // Now go in forward order and update the immediate dominator for the
+ // preheaders with the exiting block of the previous loop. Dominance
+ // within the loop is updated in cloneLoopWithPreheader.
+ for (auto Curr = PartitionContainer.cbegin(),
+ Next = std::next(PartitionContainer.cbegin()),
+ E = PartitionContainer.cend();
+ Next != E; ++Curr, ++Next)
+ DT->changeImmediateDominator(
+ Next->getDistributedLoop()->getLoopPreheader(),
+ Curr->getDistributedLoop()->getExitingBlock());
+ }
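Sketching the result for two partitions, under the assumptions asserted above (a single preheader predecessor and a single exit block), the blocks end up chained roughly as follows; the ".ldist<Index>" suffix comes from the Twine passed to cloneLoopWithPreheader():

  Pred -> PH.ldist1 -> cloned loop (first partition) -> OrigPH -> original loop (last partition) -> ExitBlock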
+
+ /// Removes the dead instructions from the cloned loops.
+ void removeUnusedInsts() {
+ for (auto &Partition : PartitionContainer)
+ Partition.removeUnusedInsts();
+ }
+
+ /// For each memory pointer, it computes the partitionId the pointer is
+ /// used in.
+ ///
+ /// This returns an array of int where the I-th entry corresponds to the I-th
+ /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in multiple
+ /// partitions its entry is set to -1.
+ SmallVector<int, 8>
+ computePartitionSetForPointers(const LoopAccessInfo &LAI) {
+ const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
+
+ unsigned N = RtPtrCheck->Pointers.size();
+ SmallVector<int, 8> PtrToPartitions(N);
+ for (unsigned I = 0; I < N; ++I) {
+ Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
+ auto Instructions =
+ LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+
+ int &Partition = PtrToPartitions[I];
+ // First set it to uninitialized.
+ Partition = -2;
+ for (Instruction *Inst : Instructions) {
+ // Note that this could be -1 if Inst is duplicated across multiple
+ // partitions.
+ int ThisPartition = this->InstToPartitionId[Inst];
+ if (Partition == -2)
+ Partition = ThisPartition;
+ // -1 means belonging to multiple partitions.
+ else if (Partition == -1)
+ break;
+ else if (Partition != (int)ThisPartition)
+ Partition = -1;
+ }
+ assert(Partition != -2 && "Pointer not belonging to any partition");
+ }
+
+ return PtrToPartitions;
+ }
+
+ void print(raw_ostream &OS) const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ OS << "Partition " << Index++ << " (" << &P << "):\n";
+ P.print();
+ }
+ }
+
+ void dump() const { print(dbgs()); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const InstPartitionContainer &Partitions) {
+ Partitions.print(OS);
+ return OS;
+ }
+#endif
+
+ void printBlocks() const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
+ P.printBlocks();
+ }
+ }
+
+private:
+ using PartitionContainerT = std::list<InstPartition>;
+
+ /// List of partitions.
+ PartitionContainerT PartitionContainer;
+
+ /// Mapping from Instruction to partition Id. If the instruction
+ /// belongs to multiple partitions the entry contains -1.
+ InstToPartitionIdT InstToPartitionId;
+
+ Loop *L;
+ LoopInfo *LI;
+ DominatorTree *DT;
+
+ /// The control structure to merge adjacent partitions if both satisfy
+ /// the \p Predicate.
+ template <class UnaryPredicate>
+ void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
+ InstPartition *PrevMatch = nullptr;
+ for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
+ auto DoesMatch = Predicate(&*I);
+ if (PrevMatch == nullptr && DoesMatch) {
+ PrevMatch = &*I;
+ ++I;
+ } else if (PrevMatch != nullptr && DoesMatch) {
+ I->moveTo(*PrevMatch);
+ I = PartitionContainer.erase(I);
+ } else {
+ PrevMatch = nullptr;
+ ++I;
+ }
+ }
+ }
+
+ /// Assign new LoopIDs for the partition's cloned loop.
+ void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
+ Optional<MDNode *> PartitionID = makeFollowupLoopID(
+ OrigLoopID,
+ {LLVMLoopDistributeFollowupAll,
+ Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
+ : LLVMLoopDistributeFollowupCoincident});
+ if (PartitionID.hasValue()) {
+ Loop *NewLoop = Part->getDistributedLoop();
+ NewLoop->setLoopID(PartitionID.getValue());
+ }
+ }
+};
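Condensed from processLoop() further down, the container is typically driven in this order (seeding of the individual memory instructions, debug output and the various bail-outs omitted):

  InstPartitionContainer Partitions(L, LI, DT);
  // ... addToCyclicPartition(I) / addToNewNonCyclicPartition(I) per memory instruction ...
  Partitions.mergeBeforePopulating();          // adjacency / if-conversion heuristics
  Partitions.populateUsedSet();                // pull in the non-memory instructions
  Partitions.mergeToAvoidDuplicatedLoads();    // keep each load in a single partition
  Partitions.setupPartitionIdOnInstructions();
  Partitions.cloneLoops();                     // one loop per partition, chained via preheaders
  Partitions.removeUnusedInsts();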
+
+/// For each memory instruction, this class maintains the difference between the
+/// number of unsafe dependences that start at this instruction and the number
+/// that end here.
+///
+/// By traversing the memory instructions in program order and accumulating this
+/// number, we know whether any unsafe dependence crosses over a program point.
+class MemoryInstructionDependences {
+ using Dependence = MemoryDepChecker::Dependence;
+
+public:
+ struct Entry {
+ Instruction *Inst;
+ unsigned NumUnsafeDependencesStartOrEnd = 0;
+
+ Entry(Instruction *Inst) : Inst(Inst) {}
+ };
+
+ using AccessesType = SmallVector<Entry, 8>;
+
+ AccessesType::const_iterator begin() const { return Accesses.begin(); }
+ AccessesType::const_iterator end() const { return Accesses.end(); }
+
+ MemoryInstructionDependences(
+ const SmallVectorImpl<Instruction *> &Instructions,
+ const SmallVectorImpl<Dependence> &Dependences) {
+ Accesses.append(Instructions.begin(), Instructions.end());
+
+ LLVM_DEBUG(dbgs() << "Backward dependences:\n");
+ for (auto &Dep : Dependences)
+ if (Dep.isPossiblyBackward()) {
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
+ --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
+
+ LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
+ }
+ }
+
+private:
+ AccessesType Accesses;
+};
+
+/// The actual class performing the per-loop work.
+class LoopDistributeForLoop {
+public:
+ LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
+ : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
+ setForced();
+ }
+
+ /// Try to distribute an inner-most loop.
+ bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
assert(L->isInnermost() && "Only process inner loops.");
-
- LLVM_DEBUG(dbgs() << "\nLDist: In \""
- << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
-
+
+ LLVM_DEBUG(dbgs() << "\nLDist: In \""
+ << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+
// Having a single exit block implies there's also one exiting block.
- if (!L->getExitBlock())
- return fail("MultipleExitBlocks", "multiple exit blocks");
- if (!L->isLoopSimplifyForm())
- return fail("NotLoopSimplifyForm",
- "loop is not in loop-simplify form");
+ if (!L->getExitBlock())
+ return fail("MultipleExitBlocks", "multiple exit blocks");
+ if (!L->isLoopSimplifyForm())
+ return fail("NotLoopSimplifyForm",
+ "loop is not in loop-simplify form");
if (!L->isRotatedForm())
return fail("NotBottomTested", "loop is not bottom tested");
-
- BasicBlock *PH = L->getLoopPreheader();
-
- LAI = &GetLAA(*L);
-
- // Currently, we only distribute to isolate the part of the loop with
- // dependence cycles to enable partial vectorization.
- if (LAI->canVectorizeMemory())
- return fail("MemOpsCanBeVectorized",
- "memory operations are safe for vectorization");
-
- auto *Dependences = LAI->getDepChecker().getDependences();
- if (!Dependences || Dependences->empty())
- return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
-
- InstPartitionContainer Partitions(L, LI, DT);
-
- // First, go through the memory operations and assign them to consecutive
- // partitions (the order of partitions follows program order). Put those
- // with unsafe dependences into a "cyclic" partition; otherwise put each store
- // in its own "non-cyclic" partition (we'll merge these later).
- //
- // Note that a memory operation (e.g. Load2 below) at a program point that
- // has an unsafe dependence (Store3->Load1) spanning over it must be
- // included in the same cyclic partition as the dependent operations. This
- // is to preserve the original program order after distribution. E.g.:
- //
- // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
- // Load1 -. 1 0->1
- // Load2 | /Unsafe/ 0 1
- // Store3 -' -1 1->0
- // Load4 0 0
- //
- // NumUnsafeDependencesActive > 0 indicates this situation and in this case
- // we just keep assigning to the same cyclic partition until
- // NumUnsafeDependencesActive reaches 0.
- const MemoryDepChecker &DepChecker = LAI->getDepChecker();
- MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
- *Dependences);
-
- int NumUnsafeDependencesActive = 0;
- for (auto &InstDep : MID) {
- Instruction *I = InstDep.Inst;
- // We update NumUnsafeDependencesActive post-instruction, catch the
- // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
- if (NumUnsafeDependencesActive ||
- InstDep.NumUnsafeDependencesStartOrEnd > 0)
- Partitions.addToCyclicPartition(I);
- else
- Partitions.addToNewNonCyclicPartition(I);
- NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
- assert(NumUnsafeDependencesActive >= 0 &&
- "Negative number of dependences active");
- }
-
- // Add partitions for values used outside. These partitions can be out of
- // order from the original program order. This is OK because if the
- // partition uses a load we will merge this partition with the original
- // partition of the load that we set up in the previous loop (see
- // mergeToAvoidDuplicatedLoads).
- auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
- for (auto *Inst : DefsUsedOutside)
- Partitions.addToNewNonCyclicPartition(Inst);
-
- LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
- if (Partitions.getSize() < 2)
- return fail("CantIsolateUnsafeDeps",
- "cannot isolate unsafe dependencies");
-
- // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
- // should be able to vectorize these together.
- Partitions.mergeBeforePopulating();
- LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
- if (Partitions.getSize() < 2)
- return fail("CantIsolateUnsafeDeps",
- "cannot isolate unsafe dependencies");
-
- // Now, populate the partitions with non-memory operations.
- Partitions.populateUsedSet();
- LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
-
- // In order to preserve original lexical order for loads, keep them in the
- // partition that we set up in the MemoryInstructionDependences loop.
- if (Partitions.mergeToAvoidDuplicatedLoads()) {
- LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
- << Partitions);
- if (Partitions.getSize() < 2)
- return fail("CantIsolateUnsafeDeps",
- "cannot isolate unsafe dependencies");
- }
-
- // Don't distribute the loop if we need too many SCEV run-time checks, or
- // any checks at all when inserting them would be illegal.
- const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
- if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
- return fail("RuntimeCheckWithConvergent",
- "may not insert runtime check with convergent operation");
- }
-
- if (Pred.getComplexity() > (IsForced.getValueOr(false)
- ? PragmaDistributeSCEVCheckThreshold
- : DistributeSCEVCheckThreshold))
- return fail("TooManySCEVRuntimeChecks",
- "too many SCEV run-time checks needed.\n");
-
- if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L))
- return fail("HeuristicDisabled", "distribution heuristic disabled");
-
- LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
- // We're done forming the partitions; set up the reverse mapping from
- // instructions to partitions.
- Partitions.setupPartitionIdOnInstructions();
-
- // If we need run-time checks, version the loop now.
- auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
- const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
- const auto &AllChecks = RtPtrChecking->getChecks();
- auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
- RtPtrChecking);
-
- if (LAI->hasConvergentOp() && !Checks.empty()) {
- return fail("RuntimeCheckWithConvergent",
- "may not insert runtime check with convergent operation");
- }
-
- // To keep things simple have an empty preheader before we version or clone
- // the loop. (Also split if this has no predecessor, i.e. entry, because we
- // rely on PH having a predecessor.)
- if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
- SplitBlock(PH, PH->getTerminator(), DT, LI);
-
- if (!Pred.isAlwaysTrue() || !Checks.empty()) {
- assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
-
- MDNode *OrigLoopID = L->getLoopID();
-
- LLVM_DEBUG(dbgs() << "\nPointers:\n");
- LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+ BasicBlock *PH = L->getLoopPreheader();
+
+ LAI = &GetLAA(*L);
+
+ // Currently, we only distribute to isolate the part of the loop with
+ // dependence cycles to enable partial vectorization.
+ if (LAI->canVectorizeMemory())
+ return fail("MemOpsCanBeVectorized",
+ "memory operations are safe for vectorization");
+
+ auto *Dependences = LAI->getDepChecker().getDependences();
+ if (!Dependences || Dependences->empty())
+ return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
+
+ InstPartitionContainer Partitions(L, LI, DT);
+
+ // First, go through the memory operations and assign them to consecutive
+ // partitions (the order of partitions follows program order). Put those
+ // with unsafe dependences into a "cyclic" partition; otherwise put each store
+ // in its own "non-cyclic" partition (we'll merge these later).
+ //
+ // Note that a memory operation (e.g. Load2 below) at a program point that
+ // has an unsafe dependence (Store3->Load1) spanning over it must be
+ // included in the same cyclic partition as the dependent operations. This
+ // is to preserve the original program order after distribution. E.g.:
+ //
+ // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
+ // Load1 -. 1 0->1
+ // Load2 | /Unsafe/ 0 1
+ // Store3 -' -1 1->0
+ // Load4 0 0
+ //
+ // NumUnsafeDependencesActive > 0 indicates this situation and in this case
+ // we just keep assigning to the same cyclic partition until
+ // NumUnsafeDependencesActive reaches 0.
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+ MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
+ *Dependences);
+
+ int NumUnsafeDependencesActive = 0;
+ for (auto &InstDep : MID) {
+ Instruction *I = InstDep.Inst;
+ // We update NumUnsafeDependencesActive post-instruction, catch the
+ // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
+ if (NumUnsafeDependencesActive ||
+ InstDep.NumUnsafeDependencesStartOrEnd > 0)
+ Partitions.addToCyclicPartition(I);
+ else
+ Partitions.addToNewNonCyclicPartition(I);
+ NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
+ assert(NumUnsafeDependencesActive >= 0 &&
+ "Negative number of dependences active");
+ }
+
+ // Add partitions for values used outside. These partitions can be out of
+ // order from the original program order. This is OK because if the
+ // partition uses a load we will merge this partition with the original
+ // partition of the load that we set up in the previous loop (see
+ // mergeToAvoidDuplicatedLoads).
+ auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
+ for (auto *Inst : DefsUsedOutside)
+ Partitions.addToNewNonCyclicPartition(Inst);
+
+ LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
+ // should be able to vectorize these together.
+ Partitions.mergeBeforePopulating();
+ LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Now, populate the partitions with non-memory operations.
+ Partitions.populateUsedSet();
+ LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+
+ // In order to preserve original lexical order for loads, keep them in the
+ // partition that we set up in the MemoryInstructionDependences loop.
+ if (Partitions.mergeToAvoidDuplicatedLoads()) {
+ LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+ << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+ }
+
+ // Don't distribute the loop if we need too many SCEV run-time checks, or
+ // any checks at all when inserting them would be illegal.
+ const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+ if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
+ return fail("RuntimeCheckWithConvergent",
+ "may not insert runtime check with convergent operation");
+ }
+
+ if (Pred.getComplexity() > (IsForced.getValueOr(false)
+ ? PragmaDistributeSCEVCheckThreshold
+ : DistributeSCEVCheckThreshold))
+ return fail("TooManySCEVRuntimeChecks",
+ "too many SCEV run-time checks needed.\n");
+
+ if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L))
+ return fail("HeuristicDisabled", "distribution heuristic disabled");
+
+ LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+ // We're done forming the partitions; set up the reverse mapping from
+ // instructions to partitions.
+ Partitions.setupPartitionIdOnInstructions();
+
+ // If we need run-time checks, version the loop now.
+ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
+ const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
+ const auto &AllChecks = RtPtrChecking->getChecks();
+ auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
+ RtPtrChecking);
+
+ if (LAI->hasConvergentOp() && !Checks.empty()) {
+ return fail("RuntimeCheckWithConvergent",
+ "may not insert runtime check with convergent operation");
+ }
+
+ // To keep things simple have an empty preheader before we version or clone
+ // the loop. (Also split if this has no predecessor, i.e. entry, because we
+ // rely on PH having a predecessor.)
+ if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+ SplitBlock(PH, PH->getTerminator(), DT, LI);
+
+ if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+ assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
+
+ MDNode *OrigLoopID = L->getLoopID();
+
+ LLVM_DEBUG(dbgs() << "\nPointers:\n");
+ LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
- LVer.versionLoop(DefsUsedOutside);
- LVer.annotateLoopWithNoAlias();
-
- // The unversioned loop will not be changed, so we inherit all attributes
- // from the original loop, but remove the loop distribution metadata to
- // avoid distributing it again.
- MDNode *UnversionedLoopID =
- makeFollowupLoopID(OrigLoopID,
- {LLVMLoopDistributeFollowupAll,
- LLVMLoopDistributeFollowupFallback},
- "llvm.loop.distribute.", true)
- .getValue();
- LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
- }
-
- // Create identical copies of the original loop for each partition and hook
- // them up sequentially.
- Partitions.cloneLoops();
-
- // Now, we remove the instructions from each loop that don't belong to that
- // partition.
- Partitions.removeUnusedInsts();
- LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
- LLVM_DEBUG(Partitions.printBlocks());
-
- if (LDistVerify) {
- LI->verify(*DT);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
- }
-
- ++NumLoopsDistributed;
- // Report the success.
- ORE->emit([&]() {
- return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
- L->getHeader())
- << "distributed loop";
- });
- return true;
- }
-
- /// Provide diagnostics, then \return false.
- bool fail(StringRef RemarkName, StringRef Message) {
- LLVMContext &Ctx = F->getContext();
- bool Forced = isForced().getValueOr(false);
-
- LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
-
- // With -Rpass-missed, report that distribution failed.
- ORE->emit([&]() {
- return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
- L->getStartLoc(), L->getHeader())
- << "loop not distributed: use -Rpass-analysis=loop-distribute for "
- "more "
- "info";
- });
-
- // With -Rpass-analysis, report why. This is on by default if distribution
- // was requested explicitly.
- ORE->emit(OptimizationRemarkAnalysis(
- Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
- RemarkName, L->getStartLoc(), L->getHeader())
- << "loop not distributed: " << Message);
-
- // Also issue a warning if distribution was requested explicitly but it
- // failed.
- if (Forced)
- Ctx.diagnose(DiagnosticInfoOptimizationFailure(
- *F, L->getStartLoc(), "loop not distributed: failed "
- "explicitly specified loop distribution"));
-
- return false;
- }
-
- /// Return whether distribution is forced to be enabled/disabled for the loop.
- ///
- /// If the optional has a value, it indicates whether distribution was forced
- /// to be enabled (true) or disabled (false). If the optional has no value
- /// distribution was not forced either way.
- const Optional<bool> &isForced() const { return IsForced; }
-
-private:
- /// Filter out checks between pointers from the same partition.
- ///
- /// \p PtrToPartition contains the partition number for pointers. Partition
- /// number -1 means that the pointer is used in multiple partitions. In this
- /// case we can't safely omit the check.
- SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
- const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
- const SmallVectorImpl<int> &PtrToPartition,
- const RuntimePointerChecking *RtPtrChecking) {
- SmallVector<RuntimePointerCheck, 4> Checks;
-
- copy_if(AllChecks, std::back_inserter(Checks),
- [&](const RuntimePointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
-
- return Checks;
- }
-
- /// Check whether the loop metadata is forcing distribution to be
- /// enabled/disabled.
- void setForced() {
- Optional<const MDOperand *> Value =
- findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
- if (!Value)
- return;
-
- const MDOperand *Op = *Value;
- assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
- IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
- }
-
- Loop *L;
- Function *F;
-
- // Analyses used.
- LoopInfo *LI;
- const LoopAccessInfo *LAI = nullptr;
- DominatorTree *DT;
- ScalarEvolution *SE;
- OptimizationRemarkEmitter *ORE;
-
- /// Indicates whether distribution is forced to be enabled/disabled for
- /// the loop.
- ///
- /// If the optional has a value, it indicates whether distribution was forced
- /// to be enabled (true) or disabled (false). If the optional has no value
- /// distribution was not forced either way.
- Optional<bool> IsForced;
-};
-
-} // end anonymous namespace
-
-/// Shared implementation between new and old PMs.
-static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
- // Build up a worklist of inner loops to distribute. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
+ LVer.versionLoop(DefsUsedOutside);
+ LVer.annotateLoopWithNoAlias();
+
+ // The unversioned loop will not be changed, so we inherit all attributes
+ // from the original loop, but remove the loop distribution metadata to
+ // avoid distributing it again.
+ MDNode *UnversionedLoopID =
+ makeFollowupLoopID(OrigLoopID,
+ {LLVMLoopDistributeFollowupAll,
+ LLVMLoopDistributeFollowupFallback},
+ "llvm.loop.distribute.", true)
+ .getValue();
+ LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
+ }
+
+ // Create identical copies of the original loop for each partition and hook
+ // them up sequentially.
+ Partitions.cloneLoops();
+
+ // Now, we remove the instructions from each loop that don't belong to that
+ // partition.
+ Partitions.removeUnusedInsts();
+ LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ LLVM_DEBUG(Partitions.printBlocks());
+
+ if (LDistVerify) {
+ LI->verify(*DT);
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+ }
+
+ ++NumLoopsDistributed;
+ // Report the success.
+ ORE->emit([&]() {
+ return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
+ L->getHeader())
+ << "distributed loop";
+ });
+ return true;
+ }
+
+ /// Provide diagnostics, then \return false.
+ bool fail(StringRef RemarkName, StringRef Message) {
+ LLVMContext &Ctx = F->getContext();
+ bool Forced = isForced().getValueOr(false);
+
+ LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
+
+ // With -Rpass-missed, report that distribution failed.
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
+ L->getStartLoc(), L->getHeader())
+ << "loop not distributed: use -Rpass-analysis=loop-distribute for "
+ "more "
+ "info";
+ });
+
+ // With -Rpass-analysis, report why. This is on by default if distribution
+ // was requested explicitly.
+ ORE->emit(OptimizationRemarkAnalysis(
+ Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
+ RemarkName, L->getStartLoc(), L->getHeader())
+ << "loop not distributed: " << Message);
+
+ // Also issue a warning if distribution was requested explicitly but it
+ // failed.
+ if (Forced)
+ Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+ *F, L->getStartLoc(), "loop not distributed: failed "
+ "explicitly specified loop distribution"));
+
+ return false;
+ }
+
+ /// Return whether distribution is forced to be enabled/disabled for the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ const Optional<bool> &isForced() const { return IsForced; }
+
+private:
+ /// Filter out checks between pointers from the same partition.
+ ///
+ /// \p PtrToPartition contains the partition number for pointers. Partition
+ /// number -1 means that the pointer is used in multiple partitions. In this
+ /// case we can't safely omit the check.
+ SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
+ const SmallVectorImpl<int> &PtrToPartition,
+ const RuntimePointerChecking *RtPtrChecking) {
+ SmallVector<RuntimePointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
+
+ return Checks;
+ }
+
+ /// Check whether the loop metadata is forcing distribution to be
+ /// enabled/disabled.
+ void setForced() {
+ Optional<const MDOperand *> Value =
+ findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+ if (!Value)
+ return;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ }
+
+ Loop *L;
+ Function *F;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI = nullptr;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ OptimizationRemarkEmitter *ORE;
+
+ /// Indicates whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ Optional<bool> IsForced;
+};
+
+} // end anonymous namespace
+
+/// Shared implementation between new and old PMs.
+static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+ // Build up a worklist of inner loops to distribute. This is necessary as the
+ // act of distributing a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
if (L->isInnermost())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist) {
- LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
-
- // If distribution was forced for the specific loop to be
- // enabled/disabled, follow that. Otherwise use the global flag.
- if (LDL.isForced().getValueOr(EnableLoopDistribute))
- Changed |= LDL.processLoop(GetLAA);
- }
-
- // Process each loop nest in the function.
- return Changed;
-}
-
-namespace {
-
-/// The pass class.
-class LoopDistributeLegacy : public FunctionPass {
-public:
- static char ID;
-
- LoopDistributeLegacy() : FunctionPass(ID) {
- // The default is set by the caller.
- initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
-
- return runImpl(F, LI, DT, SE, ORE, GetLAA);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses LoopDistributePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- // We don't directly need these analyses but they're required for loop
- // analyses so provide them below.
- auto &AA = AM.getResult<AAManager>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
-
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & {
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
+
+ // If distribution was forced for the specific loop to be
+ // enabled/disabled, follow that. Otherwise use the global flag.
+ if (LDL.isForced().getValueOr(EnableLoopDistribute))
+ Changed |= LDL.processLoop(GetLAA);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
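
At the source level, the distribution that runImpl drives splits a loop so that a dependence blocking vectorization is isolated from independent work; a hand-written illustration (assuming the arrays do not alias, or that the runtime checks the pass can emit guard the aliasing case):

void before(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 1; i < n; ++i) {
    a[i] = a[i - 1] + b[i]; // loop-carried recurrence: blocks vectorization
    c[i] = d[i] * 2;        // independent of the statement above
  }
}

// After distribution the independent statement gets its own loop, which can be
// vectorized even though the recurrence loop cannot.
void after(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 1; i < n; ++i)
    a[i] = a[i - 1] + b[i];
  for (int i = 1; i < n; ++i)
    c[i] = d[i] * 2;
}
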
+
+namespace {
+
+/// The pass class.
+class LoopDistributeLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ LoopDistributeLegacy() : FunctionPass(ID) {
+ // The default is set by the caller.
+ initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return runImpl(F, LI, DT, SE, ORE, GetLAA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses LoopDistributePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ // We don't directly need these analyses but they're required for loop
+ // analyses so provide them below.
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
TLI, TTI, nullptr, nullptr};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- };
-
- bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char LoopDistributeLegacy::ID;
-
-static const char ldist_name[] = "Loop Distribution";
-
-INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
- false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
-
-FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char LoopDistributeLegacy::ID;
+
+static const char ldist_name[] = "Loop Distribution";
+
+INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
+
+FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
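
A sketch of how a client might schedule the two entry points defined above. Header paths follow the upstream LLVM layout, and "loop-distribute" is the name LDIST_NAME expands to upstream, so from the command line the pass is typically reached with opt -passes=loop-distribute.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar.h"                // createLoopDistributePass
#include "llvm/Transforms/Scalar/LoopDistribute.h" // LoopDistributePass

void scheduleLoopDistribute(llvm::FunctionPassManager &FPM,
                            llvm::legacy::PassManager &LPM) {
  FPM.addPass(llvm::LoopDistributePass());   // new pass manager
  LPM.add(llvm::createLoopDistributePass()); // legacy pass manager
}
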
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp
index aa754a7077..b5f8dfa9aa 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1,264 +1,264 @@
-//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the loop fusion pass.
-/// The implementation is largely based on the following document:
-///
-/// Code Transformations to Augment the Scope of Loop Fusion in a
-/// Production Compiler
-/// Christopher Mark Barton
-/// MSc Thesis
-/// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
-///
-/// The general approach taken is to collect sets of control flow equivalent
-/// loops and test whether they can be fused. The necessary conditions for
-/// fusion are:
-/// 1. The loops must be adjacent (there cannot be any statements between
-/// the two loops).
-/// 2. The loops must be conforming (they must execute the same number of
-/// iterations).
-/// 3. The loops must be control flow equivalent (if one loop executes, the
-/// other is guaranteed to execute).
-/// 4. There cannot be any negative distance dependencies between the loops.
-/// If all of these conditions are satisfied, it is safe to fuse the loops.
-///
-/// This implementation creates FusionCandidates that represent the loop and the
-/// necessary information needed by fusion. It then operates on the fusion
-/// candidates, first confirming that the candidate is eligible for fusion. The
-/// candidates are then collected into control flow equivalent sets, sorted in
-/// dominance order. Each set of control flow equivalent candidates is then
-/// traversed, attempting to fuse pairs of candidates in the set. If all
-/// requirements for fusion are met, the two candidates are fused, creating a
-/// new (fused) candidate which is then added back into the set to consider for
-/// additional fusion.
-///
-/// This implementation currently does not make any modifications to remove
-/// conditions for fusion. Code transformations to make loops conform to each of
-/// the conditions for fusion are discussed in more detail in the document
-/// above. These can be added to the current implementation in the future.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopFuse.h"
-#include "llvm/ADT/Statistic.h"
+//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the loop fusion pass.
+/// The implementation is largely based on the following document:
+///
+/// Code Transformations to Augment the Scope of Loop Fusion in a
+/// Production Compiler
+/// Christopher Mark Barton
+/// MSc Thesis
+/// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
+///
+/// The general approach taken is to collect sets of control flow equivalent
+/// loops and test whether they can be fused. The necessary conditions for
+/// fusion are:
+/// 1. The loops must be adjacent (there cannot be any statements between
+/// the two loops).
+/// 2. The loops must be conforming (they must execute the same number of
+/// iterations).
+/// 3. The loops must be control flow equivalent (if one loop executes, the
+/// other is guaranteed to execute).
+/// 4. There cannot be any negative distance dependencies between the loops.
+/// If all of these conditions are satisfied, it is safe to fuse the loops.
+///
+/// This implementation creates FusionCandidates that represent the loop and the
+/// necessary information needed by fusion. It then operates on the fusion
+/// candidates, first confirming that the candidate is eligible for fusion. The
+/// candidates are then collected into control flow equivalent sets, sorted in
+/// dominance order. Each set of control flow equivalent candidates is then
+/// traversed, attempting to fuse pairs of candidates in the set. If all
+/// requirements for fusion are met, the two candidates are fused, creating a
+/// new (fused) candidate which is then added back into the set to consider for
+/// additional fusion.
+///
+/// This implementation currently does not make any modifications to remove
+/// conditions for fusion. Code transformations to make loops conform to each of
+/// the conditions for fusion are discussed in more detail in the document
+/// above. These can be added to the current implementation in the future.
+//===----------------------------------------------------------------------===//
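
A hand-written pair of loops meeting the four conditions above, together with the legal fused form (assuming the arrays are distinct, so the only cross-loop dependence is the forward one on a[i]):

void unfused(int n, int *a, const int *b, int *c) {
  for (int i = 0; i < n; ++i) // adjacent, conforming (same trip count), and
    a[i] = b[i] + 1;          // control flow equivalent with the loop below
  for (int i = 0; i < n; ++i)
    c[i] = a[i] * 2;          // forward dependence on a[i]; not negative
}

void fused(int n, int *a, const int *b, int *c) {
  for (int i = 0; i < n; ++i) {
    a[i] = b[i] + 1;
    c[i] = a[i] * 2;
  }
}
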
+
+#include "llvm/Transforms/Scalar/LoopFuse.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeMoverUtils.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-fusion"
-
-STATISTIC(FuseCounter, "Loops fused");
-STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
-STATISTIC(InvalidPreheader, "Loop has invalid preheader");
-STATISTIC(InvalidHeader, "Loop has invalid header");
-STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
-STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
-STATISTIC(InvalidLatch, "Loop has invalid latch");
-STATISTIC(InvalidLoop, "Loop is invalid");
-STATISTIC(AddressTakenBB, "Basic block has address taken");
-STATISTIC(MayThrowException, "Loop may throw an exception");
-STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
-STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
-STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
-STATISTIC(UnknownTripCount, "Loop has unknown trip count");
-STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
-STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
-STATISTIC(NonAdjacent, "Loops are not adjacent");
-STATISTIC(
- NonEmptyPreheader,
- "Loop has a non-empty preheader with instructions that cannot be moved");
-STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
-STATISTIC(NonIdenticalGuards, "Candidates have different guards");
-STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
- "instructions that cannot be moved");
-STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
- "instructions that cannot be moved");
-STATISTIC(NotRotated, "Candidate is not rotated");
-
-enum FusionDependenceAnalysisChoice {
- FUSION_DEPENDENCE_ANALYSIS_SCEV,
- FUSION_DEPENDENCE_ANALYSIS_DA,
- FUSION_DEPENDENCE_ANALYSIS_ALL,
-};
-
-static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
- "loop-fusion-dependence-analysis",
- cl::desc("Which dependence analysis should loop fusion use?"),
- cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
- "Use the scalar evolution interface"),
- clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
- "Use the dependence analysis interface"),
- clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
- "Use all available analyses")),
- cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore);
-
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-fusion"
+
+STATISTIC(FuseCounter, "Loops fused");
+STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
+STATISTIC(InvalidPreheader, "Loop has invalid preheader");
+STATISTIC(InvalidHeader, "Loop has invalid header");
+STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
+STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
+STATISTIC(InvalidLatch, "Loop has invalid latch");
+STATISTIC(InvalidLoop, "Loop is invalid");
+STATISTIC(AddressTakenBB, "Basic block has address taken");
+STATISTIC(MayThrowException, "Loop may throw an exception");
+STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
+STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
+STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
+STATISTIC(UnknownTripCount, "Loop has unknown trip count");
+STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
+STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
+STATISTIC(NonAdjacent, "Loops are not adjacent");
+STATISTIC(
+ NonEmptyPreheader,
+ "Loop has a non-empty preheader with instructions that cannot be moved");
+STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
+STATISTIC(NonIdenticalGuards, "Candidates have different guards");
+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
+ "instructions that cannot be moved");
+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
+ "instructions that cannot be moved");
+STATISTIC(NotRotated, "Candidate is not rotated");
+
+enum FusionDependenceAnalysisChoice {
+ FUSION_DEPENDENCE_ANALYSIS_SCEV,
+ FUSION_DEPENDENCE_ANALYSIS_DA,
+ FUSION_DEPENDENCE_ANALYSIS_ALL,
+};
+
+static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
+ "loop-fusion-dependence-analysis",
+ cl::desc("Which dependence analysis should loop fusion use?"),
+ cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
+ "Use the scalar evolution interface"),
+ clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
+ "Use the dependence analysis interface"),
+ clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
+ "Use all available analyses")),
+ cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore);
+
static cl::opt<unsigned> FusionPeelMaxCount(
"loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
cl::desc("Max number of iterations to be peeled from a loop, such that "
"fusion can take place"));
-#ifndef NDEBUG
-static cl::opt<bool>
- VerboseFusionDebugging("loop-fusion-verbose-debug",
- cl::desc("Enable verbose debugging for Loop Fusion"),
- cl::Hidden, cl::init(false), cl::ZeroOrMore);
-#endif
-
-namespace {
-/// This class is used to represent a candidate for loop fusion. When it is
-/// constructed, it checks the conditions for loop fusion to ensure that it
-/// represents a valid candidate. It caches several parts of a loop that are
-/// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
-/// of continually querying the underlying Loop to retrieve these values. It is
-/// assumed these will not change throughout loop fusion.
-///
-/// The invalidate method should be used to indicate that the FusionCandidate is
-/// no longer a valid candidate for fusion. Similarly, the isValid() method can
-/// be used to ensure that the FusionCandidate is still valid for fusion.
-struct FusionCandidate {
- /// Cache of parts of the loop used throughout loop fusion. These should not
- /// need to change throughout the analysis and transformation.
- /// These parts are cached to avoid repeatedly looking up in the Loop class.
-
- /// Preheader of the loop this candidate represents
- BasicBlock *Preheader;
- /// Header of the loop this candidate represents
- BasicBlock *Header;
- /// Blocks in the loop that exit the loop
- BasicBlock *ExitingBlock;
- /// The successor block of this loop (where the exiting blocks go to)
- BasicBlock *ExitBlock;
- /// Latch of the loop
- BasicBlock *Latch;
- /// The loop that this fusion candidate represents
- Loop *L;
- /// Vector of instructions in this loop that read from memory
- SmallVector<Instruction *, 16> MemReads;
- /// Vector of instructions in this loop that write to memory
- SmallVector<Instruction *, 16> MemWrites;
- /// Are all of the members of this fusion candidate still valid
- bool Valid;
- /// Guard branch of the loop, if it exists
- BranchInst *GuardBranch;
+#ifndef NDEBUG
+static cl::opt<bool>
+ VerboseFusionDebugging("loop-fusion-verbose-debug",
+ cl::desc("Enable verbose debugging for Loop Fusion"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore);
+#endif
+
+namespace {
+/// This class is used to represent a candidate for loop fusion. When it is
+/// constructed, it checks the conditions for loop fusion to ensure that it
+/// represents a valid candidate. It caches several parts of a loop that are
+/// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
+/// of continually querying the underlying Loop to retrieve these values. It is
+/// assumed these will not change throughout loop fusion.
+///
+/// The invalidate method should be used to indicate that the FusionCandidate is
+/// no longer a valid candidate for fusion. Similarly, the isValid() method can
+/// be used to ensure that the FusionCandidate is still valid for fusion.
+struct FusionCandidate {
+ /// Cache of parts of the loop used throughout loop fusion. These should not
+ /// need to change throughout the analysis and transformation.
+ /// These parts are cached to avoid repeatedly looking up in the Loop class.
+
+ /// Preheader of the loop this candidate represents
+ BasicBlock *Preheader;
+ /// Header of the loop this candidate represents
+ BasicBlock *Header;
+ /// Blocks in the loop that exit the loop
+ BasicBlock *ExitingBlock;
+ /// The successor block of this loop (where the exiting blocks go to)
+ BasicBlock *ExitBlock;
+ /// Latch of the loop
+ BasicBlock *Latch;
+ /// The loop that this fusion candidate represents
+ Loop *L;
+ /// Vector of instructions in this loop that read from memory
+ SmallVector<Instruction *, 16> MemReads;
+ /// Vector of instructions in this loop that write to memory
+ SmallVector<Instruction *, 16> MemWrites;
+ /// Are all of the members of this fusion candidate still valid
+ bool Valid;
+ /// Guard branch of the loop, if it exists
+ BranchInst *GuardBranch;
  /// Peeling Parameters of the Loop.
TTI::PeelingPreferences PP;
/// Can you Peel this Loop?
bool AbleToPeel;
  /// Has this loop been peeled?
bool Peeled;
-
- /// Dominator and PostDominator trees are needed for the
- /// FusionCandidateCompare function, required by FusionCandidateSet to
- /// determine where the FusionCandidate should be inserted into the set. These
- /// are used to establish ordering of the FusionCandidates based on dominance.
- const DominatorTree *DT;
- const PostDominatorTree *PDT;
-
- OptimizationRemarkEmitter &ORE;
-
- FusionCandidate(Loop *L, const DominatorTree *DT,
+
+ /// Dominator and PostDominator trees are needed for the
+ /// FusionCandidateCompare function, required by FusionCandidateSet to
+ /// determine where the FusionCandidate should be inserted into the set. These
+ /// are used to establish ordering of the FusionCandidates based on dominance.
+ const DominatorTree *DT;
+ const PostDominatorTree *PDT;
+
+ OptimizationRemarkEmitter &ORE;
+
+ FusionCandidate(Loop *L, const DominatorTree *DT,
const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
TTI::PeelingPreferences PP)
- : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
- ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
- Latch(L->getLoopLatch()), L(L), Valid(true),
+ : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
+ ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
+ Latch(L->getLoopLatch()), L(L), Valid(true),
GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
-
- // Walk over all blocks in the loop and check for conditions that may
- // prevent fusion. For each block, walk over all instructions and collect
-  // the memory reads and writes. If any instructions that prevent fusion are
- // found, invalidate this object and return.
- for (BasicBlock *BB : L->blocks()) {
- if (BB->hasAddressTaken()) {
- invalidate();
- reportInvalidCandidate(AddressTakenBB);
- return;
- }
-
- for (Instruction &I : *BB) {
- if (I.mayThrow()) {
- invalidate();
- reportInvalidCandidate(MayThrowException);
- return;
- }
- if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (SI->isVolatile()) {
- invalidate();
- reportInvalidCandidate(ContainsVolatileAccess);
- return;
- }
- }
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (LI->isVolatile()) {
- invalidate();
- reportInvalidCandidate(ContainsVolatileAccess);
- return;
- }
- }
- if (I.mayWriteToMemory())
- MemWrites.push_back(&I);
- if (I.mayReadFromMemory())
- MemReads.push_back(&I);
- }
- }
- }
-
- /// Check if all members of the class are valid.
- bool isValid() const {
- return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
- !L->isInvalid() && Valid;
- }
-
- /// Verify that all members are in sync with the Loop object.
- void verify() const {
- assert(isValid() && "Candidate is not valid!!");
- assert(!L->isInvalid() && "Loop is invalid!");
- assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
- assert(Header == L->getHeader() && "Header is out of sync");
- assert(ExitingBlock == L->getExitingBlock() &&
- "Exiting Blocks is out of sync");
- assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
- assert(Latch == L->getLoopLatch() && "Latch is out of sync");
- }
-
- /// Get the entry block for this fusion candidate.
- ///
- /// If this fusion candidate represents a guarded loop, the entry block is the
- /// loop guard block. If it represents an unguarded loop, the entry block is
- /// the preheader of the loop.
- BasicBlock *getEntryBlock() const {
- if (GuardBranch)
- return GuardBranch->getParent();
- else
- return Preheader;
- }
-
+
+ // Walk over all blocks in the loop and check for conditions that may
+ // prevent fusion. For each block, walk over all instructions and collect
+  // the memory reads and writes. If any instructions that prevent fusion are
+ // found, invalidate this object and return.
+ for (BasicBlock *BB : L->blocks()) {
+ if (BB->hasAddressTaken()) {
+ invalidate();
+ reportInvalidCandidate(AddressTakenBB);
+ return;
+ }
+
+ for (Instruction &I : *BB) {
+ if (I.mayThrow()) {
+ invalidate();
+ reportInvalidCandidate(MayThrowException);
+ return;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (SI->isVolatile()) {
+ invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
+ return;
+ }
+ }
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (LI->isVolatile()) {
+ invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
+ return;
+ }
+ }
+ if (I.mayWriteToMemory())
+ MemWrites.push_back(&I);
+ if (I.mayReadFromMemory())
+ MemReads.push_back(&I);
+ }
+ }
+ }
+
+ /// Check if all members of the class are valid.
+ bool isValid() const {
+ return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
+ !L->isInvalid() && Valid;
+ }
+
+ /// Verify that all members are in sync with the Loop object.
+ void verify() const {
+ assert(isValid() && "Candidate is not valid!!");
+ assert(!L->isInvalid() && "Loop is invalid!");
+ assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
+ assert(Header == L->getHeader() && "Header is out of sync");
+ assert(ExitingBlock == L->getExitingBlock() &&
+ "Exiting Blocks is out of sync");
+ assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
+ assert(Latch == L->getLoopLatch() && "Latch is out of sync");
+ }
+
+ /// Get the entry block for this fusion candidate.
+ ///
+ /// If this fusion candidate represents a guarded loop, the entry block is the
+ /// loop guard block. If it represents an unguarded loop, the entry block is
+ /// the preheader of the loop.
+ BasicBlock *getEntryBlock() const {
+ if (GuardBranch)
+ return GuardBranch->getParent();
+ else
+ return Preheader;
+ }
+
/// After Peeling the loop is modified quite a bit, hence all of the Blocks
/// need to be updated accordingly.
void updateAfterPeeling() {
@@ -270,427 +270,427 @@ struct FusionCandidate {
verify();
}
- /// Given a guarded loop, get the successor of the guard that is not in the
- /// loop.
- ///
- /// This method returns the successor of the loop guard that is not located
- /// within the loop (i.e., the successor of the guard that is not the
- /// preheader).
- /// This method is only valid for guarded loops.
- BasicBlock *getNonLoopBlock() const {
- assert(GuardBranch && "Only valid on guarded loops.");
- assert(GuardBranch->isConditional() &&
- "Expecting guard to be a conditional branch.");
+ /// Given a guarded loop, get the successor of the guard that is not in the
+ /// loop.
+ ///
+ /// This method returns the successor of the loop guard that is not located
+ /// within the loop (i.e., the successor of the guard that is not the
+ /// preheader).
+ /// This method is only valid for guarded loops.
+ BasicBlock *getNonLoopBlock() const {
+ assert(GuardBranch && "Only valid on guarded loops.");
+ assert(GuardBranch->isConditional() &&
+ "Expecting guard to be a conditional branch.");
if (Peeled)
return GuardBranch->getSuccessor(1);
- return (GuardBranch->getSuccessor(0) == Preheader)
- ? GuardBranch->getSuccessor(1)
- : GuardBranch->getSuccessor(0);
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- LLVM_DUMP_METHOD void dump() const {
- dbgs() << "\tGuardBranch: ";
- if (GuardBranch)
- dbgs() << *GuardBranch;
- else
- dbgs() << "nullptr";
- dbgs() << "\n"
- << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
- << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
- << "\n"
- << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
- << "\tExitingBB: "
- << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
- << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
- << "\n"
- << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
- << "\tEntryBlock: "
- << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
- << "\n";
- }
-#endif
-
- /// Determine if a fusion candidate (representing a loop) is eligible for
- /// fusion. Note that this only checks whether a single loop can be fused - it
- /// does not check whether it is *legal* to fuse two loops together.
- bool isEligibleForFusion(ScalarEvolution &SE) const {
- if (!isValid()) {
- LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
- if (!Preheader)
- ++InvalidPreheader;
- if (!Header)
- ++InvalidHeader;
- if (!ExitingBlock)
- ++InvalidExitingBlock;
- if (!ExitBlock)
- ++InvalidExitBlock;
- if (!Latch)
- ++InvalidLatch;
- if (L->isInvalid())
- ++InvalidLoop;
-
- return false;
- }
-
- // Require ScalarEvolution to be able to determine a trip count.
- if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
- LLVM_DEBUG(dbgs() << "Loop " << L->getName()
- << " trip count not computable!\n");
- return reportInvalidCandidate(UnknownTripCount);
- }
-
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Loop " << L->getName()
- << " is not in simplified form!\n");
- return reportInvalidCandidate(NotSimplifiedForm);
- }
-
- if (!L->isRotatedForm()) {
- LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
- return reportInvalidCandidate(NotRotated);
- }
-
- return true;
- }
-
-private:
-  // This is only used internally for now, to clear the MemWrites and MemReads
-  // lists and set Valid to false. I can't envision other uses of this right
- // now, since once FusionCandidates are put into the FusionCandidateSet they
- // are immutable. Thus, any time we need to change/update a FusionCandidate,
- // we must create a new one and insert it into the FusionCandidateSet to
- // ensure the FusionCandidateSet remains ordered correctly.
- void invalidate() {
- MemWrites.clear();
- MemReads.clear();
- Valid = false;
- }
-
- bool reportInvalidCandidate(llvm::Statistic &Stat) const {
- using namespace ore;
- assert(L && Preheader && "Fusion candidate not initialized properly!");
- ++Stat;
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
- L->getStartLoc(), Preheader)
- << "[" << Preheader->getParent()->getName() << "]: "
- << "Loop is not a candidate for fusion: " << Stat.getDesc());
- return false;
- }
-};
-
-struct FusionCandidateCompare {
- /// Comparison functor to sort two Control Flow Equivalent fusion candidates
- /// into dominance order.
- /// If LHS dominates RHS and RHS post-dominates LHS, return true;
-  /// If RHS dominates LHS and LHS post-dominates RHS, return false;
- bool operator()(const FusionCandidate &LHS,
- const FusionCandidate &RHS) const {
- const DominatorTree *DT = LHS.DT;
-
- BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
- BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
-
- // Do not save PDT to local variable as it is only used in asserts and thus
- // will trigger an unused variable warning if building without asserts.
- assert(DT && LHS.PDT && "Expecting valid dominator tree");
-
- // Do this compare first so if LHS == RHS, function returns false.
- if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
- // RHS dominates LHS
- // Verify LHS post-dominates RHS
- assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
- return false;
- }
-
- if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
- // Verify RHS Postdominates LHS
- assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
- return true;
- }
-
- // If LHS does not dominate RHS and RHS does not dominate LHS then there is
- // no dominance relationship between the two FusionCandidates. Thus, they
- // should not be in the same set together.
- llvm_unreachable(
- "No dominance relationship between these fusion candidates!");
- }
-};
-
-using LoopVector = SmallVector<Loop *, 4>;
-
-// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
-// order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
-// dominates FC1 and FC1 post-dominates FC0.
-// std::set was chosen because we want a sorted data structure with stable
-// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
-// loops by moving intervening code around. When this intervening code contains
-// loops, those loops will be moved also. The corresponding FusionCandidates
-// will also need to be moved accordingly. As this is done, having stable
-// iterators will simplify the logic. Similarly, having an efficient insert that
-// keeps the FusionCandidateSet sorted will also simplify the implementation.
-using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
-using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
-
-#if !defined(NDEBUG)
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidate &FC) {
- if (FC.isValid())
- OS << FC.Preheader->getName();
- else
- OS << "<Invalid>";
-
- return OS;
-}
-
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidateSet &CandSet) {
- for (const FusionCandidate &FC : CandSet)
- OS << FC << '\n';
-
- return OS;
-}
-
-static void
-printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
- dbgs() << "Fusion Candidates: \n";
- for (const auto &CandidateSet : FusionCandidates) {
- dbgs() << "*** Fusion Candidate Set ***\n";
- dbgs() << CandidateSet;
- dbgs() << "****************************\n";
- }
-}
-#endif
-
-/// Collect all loops in function at the same nest level, starting at the
-/// outermost level.
-///
-/// This data structure collects all loops at the same nest level for a
-/// given function (specified by the LoopInfo object). It starts at the
-/// outermost level.
-struct LoopDepthTree {
- using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
- using iterator = LoopsOnLevelTy::iterator;
- using const_iterator = LoopsOnLevelTy::const_iterator;
-
- LoopDepthTree(LoopInfo &LI) : Depth(1) {
- if (!LI.empty())
- LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
- }
-
- /// Test whether a given loop has been removed from the function, and thus is
- /// no longer valid.
- bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
-
- /// Record that a given loop has been removed from the function and is no
- /// longer valid.
- void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
-
- /// Descend the tree to the next (inner) nesting level
- void descend() {
- LoopsOnLevelTy LoopsOnNextLevel;
-
- for (const LoopVector &LV : *this)
- for (Loop *L : LV)
- if (!isRemovedLoop(L) && L->begin() != L->end())
- LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
-
- LoopsOnLevel = LoopsOnNextLevel;
- RemovedLoops.clear();
- Depth++;
- }
-
- bool empty() const { return size() == 0; }
- size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
- unsigned getDepth() const { return Depth; }
-
- iterator begin() { return LoopsOnLevel.begin(); }
- iterator end() { return LoopsOnLevel.end(); }
- const_iterator begin() const { return LoopsOnLevel.begin(); }
- const_iterator end() const { return LoopsOnLevel.end(); }
-
-private:
- /// Set of loops that have been removed from the function and are no longer
- /// valid.
- SmallPtrSet<const Loop *, 8> RemovedLoops;
-
- /// Depth of the current level, starting at 1 (outermost loops).
- unsigned Depth;
-
- /// Vector of loops at the current depth level that have the same parent loop
- LoopsOnLevelTy LoopsOnLevel;
-};
-
-#ifndef NDEBUG
-static void printLoopVector(const LoopVector &LV) {
- dbgs() << "****************************\n";
- for (auto L : LV)
- printLoop(*L, dbgs());
- dbgs() << "****************************\n";
-}
-#endif
-
-struct LoopFuser {
-private:
- // Sets of control flow equivalent fusion candidates for a given nest level.
- FusionCandidateCollection FusionCandidates;
-
- LoopDepthTree LDT;
- DomTreeUpdater DTU;
-
- LoopInfo &LI;
- DominatorTree &DT;
- DependenceInfo &DI;
- ScalarEvolution &SE;
- PostDominatorTree &PDT;
- OptimizationRemarkEmitter &ORE;
+ return (GuardBranch->getSuccessor(0) == Preheader)
+ ? GuardBranch->getSuccessor(1)
+ : GuardBranch->getSuccessor(0);
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const {
+ dbgs() << "\tGuardBranch: ";
+ if (GuardBranch)
+ dbgs() << *GuardBranch;
+ else
+ dbgs() << "nullptr";
+ dbgs() << "\n"
+ << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
+ << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
+ << "\n"
+ << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
+ << "\tExitingBB: "
+ << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
+ << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
+ << "\n"
+ << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
+ << "\tEntryBlock: "
+ << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
+ << "\n";
+ }
+#endif
+
+ /// Determine if a fusion candidate (representing a loop) is eligible for
+ /// fusion. Note that this only checks whether a single loop can be fused - it
+ /// does not check whether it is *legal* to fuse two loops together.
+ bool isEligibleForFusion(ScalarEvolution &SE) const {
+ if (!isValid()) {
+ LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
+ if (!Preheader)
+ ++InvalidPreheader;
+ if (!Header)
+ ++InvalidHeader;
+ if (!ExitingBlock)
+ ++InvalidExitingBlock;
+ if (!ExitBlock)
+ ++InvalidExitBlock;
+ if (!Latch)
+ ++InvalidLatch;
+ if (L->isInvalid())
+ ++InvalidLoop;
+
+ return false;
+ }
+
+ // Require ScalarEvolution to be able to determine a trip count.
+ if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+ << " trip count not computable!\n");
+ return reportInvalidCandidate(UnknownTripCount);
+ }
+
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+ << " is not in simplified form!\n");
+ return reportInvalidCandidate(NotSimplifiedForm);
+ }
+
+ if (!L->isRotatedForm()) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
+ return reportInvalidCandidate(NotRotated);
+ }
+
+ return true;
+ }
+
+private:
+  // This is only used internally for now, to clear the MemWrites and MemReads
+  // lists and set Valid to false. I can't envision other uses of this right
+ // now, since once FusionCandidates are put into the FusionCandidateSet they
+ // are immutable. Thus, any time we need to change/update a FusionCandidate,
+ // we must create a new one and insert it into the FusionCandidateSet to
+ // ensure the FusionCandidateSet remains ordered correctly.
+ void invalidate() {
+ MemWrites.clear();
+ MemReads.clear();
+ Valid = false;
+ }
+
+ bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+ using namespace ore;
+ assert(L && Preheader && "Fusion candidate not initialized properly!");
+ ++Stat;
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
+ L->getStartLoc(), Preheader)
+ << "[" << Preheader->getParent()->getName() << "]: "
+ << "Loop is not a candidate for fusion: " << Stat.getDesc());
+ return false;
+ }
+};
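
A hand-written contrast of a loop these eligibility checks would reject against one they would accept (assuming the usual loop-simplify/loop-rotate canonicalization has already run on the IR):

const char *rejected(const char *p) {
  while (*p)   // exit depends on memory contents: no loop-invariant
    ++p;       // backedge-taken count -> rejected as UnknownTripCount
  return p;
}

void accepted(int n, int *a, const int *b) {
  for (int i = 0; i < n; ++i) // simplified, rotated form with a SCEV-computable
    a[i] = b[i];              // trip count -> passes the checks above
}
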
+
+struct FusionCandidateCompare {
+ /// Comparison functor to sort two Control Flow Equivalent fusion candidates
+ /// into dominance order.
+ /// If LHS dominates RHS and RHS post-dominates LHS, return true;
+  /// If RHS dominates LHS and LHS post-dominates RHS, return false;
+ bool operator()(const FusionCandidate &LHS,
+ const FusionCandidate &RHS) const {
+ const DominatorTree *DT = LHS.DT;
+
+ BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
+ BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
+
+ // Do not save PDT to local variable as it is only used in asserts and thus
+ // will trigger an unused variable warning if building without asserts.
+ assert(DT && LHS.PDT && "Expecting valid dominator tree");
+
+ // Do this compare first so if LHS == RHS, function returns false.
+ if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
+ // RHS dominates LHS
+ // Verify LHS post-dominates RHS
+ assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
+ return false;
+ }
+
+ if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
+ // Verify RHS Postdominates LHS
+ assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
+ return true;
+ }
+
+ // If LHS does not dominate RHS and RHS does not dominate LHS then there is
+ // no dominance relationship between the two FusionCandidates. Thus, they
+ // should not be in the same set together.
+ llvm_unreachable(
+ "No dominance relationship between these fusion candidates!");
+ }
+};
+
+using LoopVector = SmallVector<Loop *, 4>;
+
+// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
+// order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
+// dominates FC1 and FC1 post-dominates FC0.
+// std::set was chosen because we want a sorted data structure with stable
+// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
+// loops by moving intervening code around. When this intervening code contains
+// loops, those loops will be moved also. The corresponding FusionCandidates
+// will also need to be moved accordingly. As this is done, having stable
+// iterators will simplify the logic. Similarly, having an efficient insert that
+// keeps the FusionCandidateSet sorted will also simplify the implementation.
+using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
+using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
+
+#if !defined(NDEBUG)
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const FusionCandidate &FC) {
+ if (FC.isValid())
+ OS << FC.Preheader->getName();
+ else
+ OS << "<Invalid>";
+
+ return OS;
+}
+
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const FusionCandidateSet &CandSet) {
+ for (const FusionCandidate &FC : CandSet)
+ OS << FC << '\n';
+
+ return OS;
+}
+
+static void
+printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
+ dbgs() << "Fusion Candidates: \n";
+ for (const auto &CandidateSet : FusionCandidates) {
+ dbgs() << "*** Fusion Candidate Set ***\n";
+ dbgs() << CandidateSet;
+ dbgs() << "****************************\n";
+ }
+}
+#endif
+
+/// Collect all loops in function at the same nest level, starting at the
+/// outermost level.
+///
+/// This data structure collects all loops at the same nest level for a
+/// given function (specified by the LoopInfo object). It starts at the
+/// outermost level.
+struct LoopDepthTree {
+ using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
+ using iterator = LoopsOnLevelTy::iterator;
+ using const_iterator = LoopsOnLevelTy::const_iterator;
+
+ LoopDepthTree(LoopInfo &LI) : Depth(1) {
+ if (!LI.empty())
+ LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
+ }
+
+ /// Test whether a given loop has been removed from the function, and thus is
+ /// no longer valid.
+ bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
+
+ /// Record that a given loop has been removed from the function and is no
+ /// longer valid.
+ void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
+
+ /// Descend the tree to the next (inner) nesting level
+ void descend() {
+ LoopsOnLevelTy LoopsOnNextLevel;
+
+ for (const LoopVector &LV : *this)
+ for (Loop *L : LV)
+ if (!isRemovedLoop(L) && L->begin() != L->end())
+ LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
+
+ LoopsOnLevel = LoopsOnNextLevel;
+ RemovedLoops.clear();
+ Depth++;
+ }
+
+ bool empty() const { return size() == 0; }
+ size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
+ unsigned getDepth() const { return Depth; }
+
+ iterator begin() { return LoopsOnLevel.begin(); }
+ iterator end() { return LoopsOnLevel.end(); }
+ const_iterator begin() const { return LoopsOnLevel.begin(); }
+ const_iterator end() const { return LoopsOnLevel.end(); }
+
+private:
+ /// Set of loops that have been removed from the function and are no longer
+ /// valid.
+ SmallPtrSet<const Loop *, 8> RemovedLoops;
+
+ /// Depth of the current level, starting at 1 (outermost loops).
+ unsigned Depth;
+
+ /// Vector of loops at the current depth level that have the same parent loop
+ LoopsOnLevelTy LoopsOnLevel;
+};
+
+#ifndef NDEBUG
+static void printLoopVector(const LoopVector &LV) {
+ dbgs() << "****************************\n";
+ for (auto L : LV)
+ printLoop(*L, dbgs());
+ dbgs() << "****************************\n";
+}
+#endif
+
+struct LoopFuser {
+private:
+ // Sets of control flow equivalent fusion candidates for a given nest level.
+ FusionCandidateCollection FusionCandidates;
+
+ LoopDepthTree LDT;
+ DomTreeUpdater DTU;
+
+ LoopInfo &LI;
+ DominatorTree &DT;
+ DependenceInfo &DI;
+ ScalarEvolution &SE;
+ PostDominatorTree &PDT;
+ OptimizationRemarkEmitter &ORE;
AssumptionCache &AC;
-
+
const TargetTransformInfo &TTI;
-public:
- LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
- ScalarEvolution &SE, PostDominatorTree &PDT,
+public:
+ LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
+ ScalarEvolution &SE, PostDominatorTree &PDT,
OptimizationRemarkEmitter &ORE, const DataLayout &DL,
AssumptionCache &AC, const TargetTransformInfo &TTI)
- : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
+ : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
-
- /// This is the main entry point for loop fusion. It will traverse the
- /// specified function and collect candidate loops to fuse, starting at the
- /// outermost nesting level and working inwards.
- bool fuseLoops(Function &F) {
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LI.print(dbgs());
- }
-#endif
-
- LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
- << "\n");
- bool Changed = false;
-
- while (!LDT.empty()) {
- LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
- << LDT.getDepth() << "\n";);
-
- for (const LoopVector &LV : LDT) {
-        assert(LV.size() > 0 && "Empty loop set was built!");
-
- // Skip singleton loop sets as they do not offer fusion opportunities on
- // this level.
- if (LV.size() == 1)
- continue;
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LLVM_DEBUG({
- dbgs() << " Visit loop set (#" << LV.size() << "):\n";
- printLoopVector(LV);
- });
- }
-#endif
-
- collectFusionCandidates(LV);
- Changed |= fuseCandidates();
- }
-
- // Finished analyzing candidates at this level.
- // Descend to the next level and clear all of the candidates currently
- // collected. Note that it will not be possible to fuse any of the
- // existing candidates with new candidates because the new candidates will
- // be at a different nest level and thus not be control flow equivalent
- // with all of the candidates collected so far.
- LLVM_DEBUG(dbgs() << "Descend one level!\n");
- LDT.descend();
- FusionCandidates.clear();
- }
-
- if (Changed)
- LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
-
-#ifndef NDEBUG
- assert(DT.verify());
- assert(PDT.verify());
- LI.verify(DT);
- SE.verify();
-#endif
-
- LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
- return Changed;
- }
-
-private:
- /// Determine if two fusion candidates are control flow equivalent.
- ///
- /// Two fusion candidates are control flow equivalent if when one executes,
- /// the other is guaranteed to execute. This is determined using dominators
- /// and post-dominators: if A dominates B and B post-dominates A then A and B
- /// are control-flow equivalent.
- bool isControlFlowEquivalent(const FusionCandidate &FC0,
- const FusionCandidate &FC1) const {
- assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
-
- return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
- DT, PDT);
- }
-
- /// Iterate over all loops in the given loop set and identify the loops that
- /// are eligible for fusion. Place all eligible fusion candidates into Control
- /// Flow Equivalent sets, sorted by dominance.
- void collectFusionCandidates(const LoopVector &LV) {
- for (Loop *L : LV) {
+
+ /// This is the main entry point for loop fusion. It will traverse the
+ /// specified function and collect candidate loops to fuse, starting at the
+ /// outermost nesting level and working inwards.
+ bool fuseLoops(Function &F) {
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LI.print(dbgs());
+ }
+#endif
+
+ LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
+ << "\n");
+ bool Changed = false;
+
+ while (!LDT.empty()) {
+ LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
+ << LDT.getDepth() << "\n";);
+
+ for (const LoopVector &LV : LDT) {
+        assert(LV.size() > 0 && "Empty loop set was built!");
+
+ // Skip singleton loop sets as they do not offer fusion opportunities on
+ // this level.
+ if (LV.size() == 1)
+ continue;
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LLVM_DEBUG({
+ dbgs() << " Visit loop set (#" << LV.size() << "):\n";
+ printLoopVector(LV);
+ });
+ }
+#endif
+
+ collectFusionCandidates(LV);
+ Changed |= fuseCandidates();
+ }
+
+ // Finished analyzing candidates at this level.
+ // Descend to the next level and clear all of the candidates currently
+ // collected. Note that it will not be possible to fuse any of the
+ // existing candidates with new candidates because the new candidates will
+ // be at a different nest level and thus not be control flow equivalent
+      // with any of the candidates collected so far.
+ LLVM_DEBUG(dbgs() << "Descend one level!\n");
+ LDT.descend();
+ FusionCandidates.clear();
+ }
+
+ if (Changed)
+ LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
+
+#ifndef NDEBUG
+ assert(DT.verify());
+ assert(PDT.verify());
+ LI.verify(DT);
+ SE.verify();
+#endif
+
+ LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
+ return Changed;
+ }
+
+private:
+ /// Determine if two fusion candidates are control flow equivalent.
+ ///
+ /// Two fusion candidates are control flow equivalent if when one executes,
+ /// the other is guaranteed to execute. This is determined using dominators
+ /// and post-dominators: if A dominates B and B post-dominates A then A and B
+ /// are control-flow equivalent.
+ bool isControlFlowEquivalent(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
+
+ return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
+ DT, PDT);
+ }
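
A hand-written example of how control flow equivalence partitions candidates into sets:

void example(bool cond, int n, int *a, int *b) {
  for (int i = 0; i < n; ++i) // L0
    a[i] = 0;
  for (int i = 0; i < n; ++i) // L1: whenever L0 runs, L1 runs and vice versa,
    b[i] = 0;                 // so L0 and L1 land in the same CFE set
  if (cond)
    for (int i = 0; i < n; ++i) // L2: runs only when cond holds, so it is not
      a[i] += b[i];             // control flow equivalent with L0/L1
}
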
+
+ /// Iterate over all loops in the given loop set and identify the loops that
+ /// are eligible for fusion. Place all eligible fusion candidates into Control
+ /// Flow Equivalent sets, sorted by dominance.
+ void collectFusionCandidates(const LoopVector &LV) {
+ for (Loop *L : LV) {
TTI::PeelingPreferences PP =
gatherPeelingPreferences(L, SE, TTI, None, None);
FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP);
- if (!CurrCand.isEligibleForFusion(SE))
- continue;
-
- // Go through each list in FusionCandidates and determine if L is control
-      // flow equivalent with the first loop in that list. If it is, add the
-      // candidate to that list. If not, go to the next list.
- // If no suitable list is found, start another list and add it to
- // FusionCandidates.
- bool FoundSet = false;
-
- for (auto &CurrCandSet : FusionCandidates) {
- if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
- CurrCandSet.insert(CurrCand);
- FoundSet = true;
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << "Adding " << CurrCand
- << " to existing candidate set\n");
-#endif
- break;
- }
- }
- if (!FoundSet) {
- // No set was found. Create a new set and add to FusionCandidates
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
-#endif
- FusionCandidateSet NewCandSet;
- NewCandSet.insert(CurrCand);
- FusionCandidates.push_back(NewCandSet);
- }
- NumFusionCandidates++;
- }
- }
-
- /// Determine if it is beneficial to fuse two loops.
- ///
- /// For now, this method simply returns true because we want to fuse as much
- /// as possible (primarily to test the pass). This method will evolve, over
- /// time, to add heuristics for profitability of fusion.
- bool isBeneficialFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1) {
- return true;
- }
-
- /// Determine if two fusion candidates have the same trip count (i.e., they
- /// execute the same number of iterations).
- ///
+ if (!CurrCand.isEligibleForFusion(SE))
+ continue;
+
+ // Go through each list in FusionCandidates and determine if L is control
+      // flow equivalent with the first loop in that list. If it is, add the
+      // candidate to that list. If not, go to the next list.
+ // If no suitable list is found, start another list and add it to
+ // FusionCandidates.
+ bool FoundSet = false;
+
+ for (auto &CurrCandSet : FusionCandidates) {
+ if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
+ CurrCandSet.insert(CurrCand);
+ FoundSet = true;
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << "Adding " << CurrCand
+ << " to existing candidate set\n");
+#endif
+ break;
+ }
+ }
+ if (!FoundSet) {
+ // No set was found. Create a new set and add to FusionCandidates
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
+#endif
+ FusionCandidateSet NewCandSet;
+ NewCandSet.insert(CurrCand);
+ FusionCandidates.push_back(NewCandSet);
+ }
+ NumFusionCandidates++;
+ }
+ }
+
+ /// Determine if it is beneficial to fuse two loops.
+ ///
+ /// For now, this method simply returns true because we want to fuse as much
+ /// as possible (primarily to test the pass). This method will evolve, over
+ /// time, to add heuristics for profitability of fusion.
+ bool isBeneficialFusion(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) {
+ return true;
+ }
+
+ /// Determine if two fusion candidates have the same trip count (i.e., they
+ /// execute the same number of iterations).
+ ///
/// This function will return a pair of values. The first is a boolean,
/// stating whether or not the two candidates are known at compile time to
/// have the same TripCount. The second is the difference in the two
@@ -700,25 +700,25 @@ private:
haveIdenticalTripCounts(const FusionCandidate &FC0,
const FusionCandidate &FC1) const {
- const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
- if (isa<SCEVCouldNotCompute>(TripCount0)) {
- UncomputableTripCount++;
- LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
+ const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
+ if (isa<SCEVCouldNotCompute>(TripCount0)) {
+ UncomputableTripCount++;
+ LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
return {false, None};
- }
-
- const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
- if (isa<SCEVCouldNotCompute>(TripCount1)) {
- UncomputableTripCount++;
- LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
+ }
+
+ const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
+ if (isa<SCEVCouldNotCompute>(TripCount1)) {
+ UncomputableTripCount++;
+ LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
return {false, None};
- }
+ }
+
+ LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
+ << *TripCount1 << " are "
+ << (TripCount0 == TripCount1 ? "identical" : "different")
+ << "\n");
- LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
- << *TripCount1 << " are "
- << (TripCount0 == TripCount1 ? "identical" : "different")
- << "\n");
-
if (TripCount0 == TripCount1)
return {true, 0};
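
Illustrative shapes for the two outcomes (hand-written; the sign of the difference and which candidate is peeled are handled in the surrounding hunks):

void identical(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 0; i < n; ++i) a[i] = b[i]; // both backedge-taken counts are the
  for (int j = 0; j < n; ++j) c[j] = d[j]; // same SCEV -> {true, 0}
}

void offByTwo(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 0; i < n + 2; ++i) a[i] = b[i]; // counts differ by a constant 2;
  for (int j = 0; j < n; ++j)     c[j] = d[j]; // peeling the first (dominating)
                                               // loop by two iterations makes
                                               // the remaining counts match
}
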
@@ -754,8 +754,8 @@ private:
<< "\n");
return {false, Difference};
- }
-
+ }
+
void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
unsigned PeelCount) {
assert(FC0.AbleToPeel && "Should be able to peel loop");
@@ -820,37 +820,37 @@ private:
}
}
- /// Walk each set of control flow equivalent fusion candidates and attempt to
- /// fuse them. This does a single linear traversal of all candidates in the
- /// set. The conditions for legal fusion are checked at this point. If a pair
- /// of fusion candidates passes all legality checks, they are fused together
- /// and a new fusion candidate is created and added to the FusionCandidateSet.
- /// The original fusion candidates are then removed, as they are no longer
- /// valid.
- bool fuseCandidates() {
- bool Fused = false;
- LLVM_DEBUG(printFusionCandidates(FusionCandidates));
- for (auto &CandidateSet : FusionCandidates) {
- if (CandidateSet.size() < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
- << CandidateSet << "\n");
-
- for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
- assert(!LDT.isRemovedLoop(FC0->L) &&
- "Should not have removed loops in CandidateSet!");
- auto FC1 = FC0;
- for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
- assert(!LDT.isRemovedLoop(FC1->L) &&
- "Should not have removed loops in CandidateSet!");
-
- LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
- dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
-
- FC0->verify();
- FC1->verify();
-
+ /// Walk each set of control flow equivalent fusion candidates and attempt to
+ /// fuse them. This does a single linear traversal of all candidates in the
+ /// set. The conditions for legal fusion are checked at this point. If a pair
+ /// of fusion candidates passes all legality checks, they are fused together
+ /// and a new fusion candidate is created and added to the FusionCandidateSet.
+ /// The original fusion candidates are then removed, as they are no longer
+ /// valid.
+ bool fuseCandidates() {
+ bool Fused = false;
+ LLVM_DEBUG(printFusionCandidates(FusionCandidates));
+ for (auto &CandidateSet : FusionCandidates) {
+ if (CandidateSet.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
+ << CandidateSet << "\n");
+
+ for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
+ assert(!LDT.isRemovedLoop(FC0->L) &&
+ "Should not have removed loops in CandidateSet!");
+ auto FC1 = FC0;
+ for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
+ assert(!LDT.isRemovedLoop(FC1->L) &&
+ "Should not have removed loops in CandidateSet!");
+
+ LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
+ dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
+
+ FC0->verify();
+ FC1->verify();
+
// Check if the candidates have identical tripcounts (first value of
// pair), and if not check the difference in the tripcounts between
// the loops (second value of pair). The difference is not equal to
@@ -877,92 +877,92 @@ private:
}
if (!SameTripCount) {
- LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
- "counts. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEqualTripCount);
- continue;
- }
-
- if (!isAdjacent(*FC0, *FC1)) {
- LLVM_DEBUG(dbgs()
- << "Fusion candidates are not adjacent. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
- continue;
- }
-
- // Ensure that FC0 and FC1 have identical guards.
- // If one (or both) are not guarded, this check is not necessary.
- if (FC0->GuardBranch && FC1->GuardBranch &&
+ LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
+ "counts. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEqualTripCount);
+ continue;
+ }
+
+ if (!isAdjacent(*FC0, *FC1)) {
+ LLVM_DEBUG(dbgs()
+ << "Fusion candidates are not adjacent. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
+ continue;
+ }
+
+ // Ensure that FC0 and FC1 have identical guards.
+ // If one (or both) are not guarded, this check is not necessary.
+ if (FC0->GuardBranch && FC1->GuardBranch &&
!haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
- LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
- "guards. Not Fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonIdenticalGuards);
- continue;
- }
-
- if (!isSafeToMoveBefore(*FC1->Preheader,
- *FC0->Preheader->getTerminator(), DT, &PDT,
- &DI)) {
- LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
- "instructions in preheader. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyPreheader);
- continue;
- }
-
- if (FC0->GuardBranch) {
- assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
-
- if (!isSafeToMoveBefore(*FC0->ExitBlock,
- *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
- &PDT, &DI)) {
- LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
- "instructions in exit block. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyExitBlock);
- continue;
- }
-
- if (!isSafeToMoveBefore(
- *FC1->GuardBranch->getParent(),
- *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
- &DI)) {
- LLVM_DEBUG(dbgs()
- << "Fusion candidate contains unsafe "
- "instructions in guard block. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyGuardBlock);
- continue;
- }
- }
-
- // Check the dependencies across the loops and do not fuse if it would
- // violate them.
- if (!dependencesAllowFusion(*FC0, *FC1)) {
- LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- InvalidDependencies);
- continue;
- }
-
- bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
- LLVM_DEBUG(dbgs()
- << "\tFusion appears to be "
- << (BeneficialToFuse ? "" : "un") << "profitable!\n");
- if (!BeneficialToFuse) {
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- FusionNotBeneficial);
- continue;
- }
- // All analysis has completed and has determined that fusion is legal
- // and profitable. At this point, start transforming the code and
- // perform fusion.
-
- LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
- << *FC1 << "\n");
-
+ LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
+ "guards. Not Fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonIdenticalGuards);
+ continue;
+ }
+
+ if (!isSafeToMoveBefore(*FC1->Preheader,
+ *FC0->Preheader->getTerminator(), DT, &PDT,
+ &DI)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
+ "instructions in preheader. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyPreheader);
+ continue;
+ }
+
+ if (FC0->GuardBranch) {
+ assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
+
+ if (!isSafeToMoveBefore(*FC0->ExitBlock,
+ *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
+ &PDT, &DI)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
+ "instructions in exit block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyExitBlock);
+ continue;
+ }
+
+ if (!isSafeToMoveBefore(
+ *FC1->GuardBranch->getParent(),
+ *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
+ &DI)) {
+ LLVM_DEBUG(dbgs()
+ << "Fusion candidate contains unsafe "
+ "instructions in guard block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyGuardBlock);
+ continue;
+ }
+ }
+
+ // Check the dependencies across the loops and do not fuse if it would
+ // violate them.
+ if (!dependencesAllowFusion(*FC0, *FC1)) {
+ LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ InvalidDependencies);
+ continue;
+ }
+
+ bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
+ LLVM_DEBUG(dbgs()
+ << "\tFusion appears to be "
+ << (BeneficialToFuse ? "" : "un") << "profitable!\n");
+ if (!BeneficialToFuse) {
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ FusionNotBeneficial);
+ continue;
+ }
+ // All analysis has completed and has determined that fusion is legal
+ // and profitable. At this point, start transforming the code and
+ // perform fusion.
+
+ LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
+ << *FC1 << "\n");
+
FusionCandidate FC0Copy = *FC0;
// Peel the loop after determining that fusion is legal. The Loops
// will still be safe to fuse after the peeling is performed.
@@ -970,405 +970,405 @@ private:
if (Peel)
peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
- // Report fusion to the Optimization Remarks.
- // Note this needs to be done *before* performFusion because
- // performFusion will change the original loops, making it not
- // possible to identify them after fusion is complete.
+ // Report fusion to the Optimization Remarks.
+ // Note this needs to be done *before* performFusion because
+ // performFusion will change the original loops, making it not
+ // possible to identify them after fusion is complete.
reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
FuseCounter);
-
+
FusionCandidate FusedCand(
performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE,
FC0Copy.PP);
- FusedCand.verify();
- assert(FusedCand.isEligibleForFusion(SE) &&
- "Fused candidate should be eligible for fusion!");
-
- // Notify the loop-depth-tree that these loops are not valid objects
- LDT.removeLoop(FC1->L);
-
- CandidateSet.erase(FC0);
- CandidateSet.erase(FC1);
-
- auto InsertPos = CandidateSet.insert(FusedCand);
-
- assert(InsertPos.second &&
- "Unable to insert TargetCandidate in CandidateSet!");
-
-          // Reset FC0 and FC1 to the new (fused) candidate. Subsequent iterations
- // of the FC1 loop will attempt to fuse the new (fused) loop with the
- // remaining candidates in the current candidate set.
- FC0 = FC1 = InsertPos.first;
-
- LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
- << "\n");
-
- Fused = true;
- }
- }
- }
- return Fused;
- }
-
- /// Rewrite all additive recurrences in a SCEV to use a new loop.
- class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
- public:
- AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
- bool UseMax = true)
- : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
- NewL(NewL) {}
-
- const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
- const Loop *ExprL = Expr->getLoop();
- SmallVector<const SCEV *, 2> Operands;
- if (ExprL == &OldL) {
- Operands.append(Expr->op_begin(), Expr->op_end());
- return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
- }
-
- if (OldL.contains(ExprL)) {
- bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
- if (!UseMax || !Pos || !Expr->isAffine()) {
- Valid = false;
- return Expr;
- }
- return visit(Expr->getStart());
- }
-
- for (const SCEV *Op : Expr->operands())
- Operands.push_back(visit(Op));
- return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
- }
-
- bool wasValidSCEV() const { return Valid; }
-
- private:
- bool Valid, UseMax;
- const Loop &OldL, &NewL;
- };
-
- /// Return false if the access functions of \p I0 and \p I1 could cause
- /// a negative dependence.
- bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
- Instruction &I1, bool EqualIsInvalid) {
- Value *Ptr0 = getLoadStorePointerOperand(&I0);
- Value *Ptr1 = getLoadStorePointerOperand(&I1);
- if (!Ptr0 || !Ptr1)
- return false;
-
- const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
- const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
- << *SCEVPtr1 << "\n");
-#endif
- AddRecLoopReplacer Rewriter(SE, L0, L1);
- SCEVPtr0 = Rewriter.visit(SCEVPtr0);
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
- << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
-#endif
- if (!Rewriter.wasValidSCEV())
- return false;
-
-    // TODO: isKnownPredicate doesn't work well when one SCEV is loop carried (by
- // L0) and the other is not. We could check if it is monotone and test
- // the beginning and end value instead.
-
- BasicBlock *L0Header = L0.getHeader();
- auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
- if (!AddRec)
- return false;
- return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
- !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
- };
- if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
- return false;
-
- ICmpInst::Predicate Pred =
- EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
- bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
- << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
- << "\n");
-#endif
- return IsAlwaysGE;
- }
-
- /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
- /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
- /// specified by @p DepChoice are used to determine this.
- bool dependencesAllowFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1, Instruction &I0,
- Instruction &I1, bool AnyDep,
- FusionDependenceAnalysisChoice DepChoice) {
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
- << DepChoice << "\n");
- }
-#endif
- switch (DepChoice) {
- case FUSION_DEPENDENCE_ANALYSIS_SCEV:
- return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
- case FUSION_DEPENDENCE_ANALYSIS_DA: {
- auto DepResult = DI.depends(&I0, &I1, true);
- if (!DepResult)
- return true;
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
- dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
- << (DepResult->isOrdered() ? "true" : "false")
- << "]\n");
- LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
- << "\n");
- }
-#endif
-
- if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
- LLVM_DEBUG(
- dbgs() << "TODO: Implement pred/succ dependence handling!\n");
-
- // TODO: Can we actually use the dependence info analysis here?
- return false;
- }
-
- case FUSION_DEPENDENCE_ANALYSIS_ALL:
- return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
- FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
- dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
- FUSION_DEPENDENCE_ANALYSIS_DA);
- }
-
- llvm_unreachable("Unknown fusion dependence analysis choice!");
- }
-
- /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
- bool dependencesAllowFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1) {
- LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
- << "\n");
- assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
- assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
-
- for (Instruction *WriteL0 : FC0.MemWrites) {
- for (Instruction *WriteL1 : FC1.MemWrites)
- if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- for (Instruction *ReadL1 : FC1.MemReads)
- if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- }
-
- for (Instruction *WriteL1 : FC1.MemWrites) {
- for (Instruction *WriteL0 : FC0.MemWrites)
- if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- for (Instruction *ReadL0 : FC0.MemReads)
- if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- }
-
- // Walk through all uses in FC1. For each use, find the reaching def. If the
-    // def is located in FC0 then it is not safe to fuse.
- for (BasicBlock *BB : FC1.L->blocks())
- for (Instruction &I : *BB)
- for (auto &Op : I.operands())
- if (Instruction *Def = dyn_cast<Instruction>(Op))
- if (FC0.L->contains(Def->getParent())) {
- InvalidDependencies++;
- return false;
- }
-
- return true;
- }
-
- /// Determine if two fusion candidates are adjacent in the CFG.
- ///
- /// This method will determine if there are additional basic blocks in the CFG
- /// between the exit of \p FC0 and the entry of \p FC1.
- /// If the two candidates are guarded loops, then it checks whether the
- /// non-loop successor of the \p FC0 guard branch is the entry block of \p
- /// FC1. If not, then the loops are not adjacent. If the two candidates are
- /// not guarded loops, then it checks whether the exit block of \p FC0 is the
- /// preheader of \p FC1.
- bool isAdjacent(const FusionCandidate &FC0,
- const FusionCandidate &FC1) const {
- // If the successor of the guard branch is FC1, then the loops are adjacent
- if (FC0.GuardBranch)
- return FC0.getNonLoopBlock() == FC1.getEntryBlock();
- else
- return FC0.ExitBlock == FC1.getEntryBlock();
- }
-
- /// Determine if two fusion candidates have identical guards
- ///
- /// This method will determine if two fusion candidates have the same guards.
- /// The guards are considered the same if:
- /// 1. The instructions to compute the condition used in the compare are
- /// identical.
- /// 2. The successors of the guard have the same flow into/around the loop.
- /// If the compare instructions are identical, then the first successor of the
- /// guard must go to the same place (either the preheader of the loop or the
-  /// NonLoopBlock). In other words, the first successor of both loops must
- /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
- /// the NonLoopBlock). The same must be true for the second successor.
- bool haveIdenticalGuards(const FusionCandidate &FC0,
- const FusionCandidate &FC1) const {
- assert(FC0.GuardBranch && FC1.GuardBranch &&
- "Expecting FC0 and FC1 to be guarded loops.");
-
- if (auto FC0CmpInst =
- dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
- if (auto FC1CmpInst =
- dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
- if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
- return false;
-
- // The compare instructions are identical.
- // Now make sure the successor of the guards have the same flow into/around
- // the loop
- if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
- return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
- else
- return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
- }
-
+ FusedCand.verify();
+ assert(FusedCand.isEligibleForFusion(SE) &&
+ "Fused candidate should be eligible for fusion!");
+
+ // Notify the loop-depth-tree that these loops are not valid objects
+ LDT.removeLoop(FC1->L);
+
+ CandidateSet.erase(FC0);
+ CandidateSet.erase(FC1);
+
+ auto InsertPos = CandidateSet.insert(FusedCand);
+
+ assert(InsertPos.second &&
+ "Unable to insert TargetCandidate in CandidateSet!");
+
+          // Reset FC0 and FC1 to the new (fused) candidate. Subsequent iterations
+ // of the FC1 loop will attempt to fuse the new (fused) loop with the
+ // remaining candidates in the current candidate set.
+ FC0 = FC1 = InsertPos.first;
+
+ LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
+ << "\n");
+
+ Fused = true;
+ }
+ }
+ }
+ return Fused;
+ }
+
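At the source level, the end state this driver works toward looks roughly like the example below. The arrays, bound, and bodies are arbitrary; the point is two control-flow-equivalent, adjacent loops with identical trip counts and only forward dependences collapsing into a single loop.

#include <cstdio>

int main() {
  const int N = 8;
  int A[N], B[N];

  // Before fusion: two adjacent loops with identical trip counts.
  for (int i = 0; i < N; ++i)
    A[i] = i * 2;
  for (int i = 0; i < N; ++i)
    B[i] = A[i] + 1;

  // After fusion (the result the pass aims for): one loop, one set of
  // header/latch overhead, and better locality on A. Legal here because the
  // second body only reads the A element written by the same iteration.
  int A2[N], B2[N];
  for (int i = 0; i < N; ++i) {
    A2[i] = i * 2;
    B2[i] = A2[i] + 1;
  }

  std::printf("%d %d\n", B[N - 1], B2[N - 1]);
  return 0;
}
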
+ /// Rewrite all additive recurrences in a SCEV to use a new loop.
+ class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
+ public:
+ AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
+ bool UseMax = true)
+ : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
+ NewL(NewL) {}
+
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ const Loop *ExprL = Expr->getLoop();
+ SmallVector<const SCEV *, 2> Operands;
+ if (ExprL == &OldL) {
+ Operands.append(Expr->op_begin(), Expr->op_end());
+ return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
+ }
+
+ if (OldL.contains(ExprL)) {
+ bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
+ if (!UseMax || !Pos || !Expr->isAffine()) {
+ Valid = false;
+ return Expr;
+ }
+ return visit(Expr->getStart());
+ }
+
+ for (const SCEV *Op : Expr->operands())
+ Operands.push_back(visit(Op));
+ return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
+ }
+
+ bool wasValidSCEV() const { return Valid; }
+
+ private:
+ bool Valid, UseMax;
+ const Loop &OldL, &NewL;
+ };
+
+ /// Return false if the access functions of \p I0 and \p I1 could cause
+ /// a negative dependence.
+ bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
+ Instruction &I1, bool EqualIsInvalid) {
+ Value *Ptr0 = getLoadStorePointerOperand(&I0);
+ Value *Ptr1 = getLoadStorePointerOperand(&I1);
+ if (!Ptr0 || !Ptr1)
+ return false;
+
+ const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
+ const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
+ << *SCEVPtr1 << "\n");
+#endif
+ AddRecLoopReplacer Rewriter(SE, L0, L1);
+ SCEVPtr0 = Rewriter.visit(SCEVPtr0);
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
+ << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
+#endif
+ if (!Rewriter.wasValidSCEV())
+ return false;
+
+    // TODO: isKnownPredicate doesn't work well when one SCEV is loop carried (by
+ // L0) and the other is not. We could check if it is monotone and test
+ // the beginning and end value instead.
+
+ BasicBlock *L0Header = L0.getHeader();
+ auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AddRec)
+ return false;
+ return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
+ !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
+ };
+ if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
+ return false;
+
+ ICmpInst::Predicate Pred =
+ EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
+ bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
+ << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
+ << "\n");
+#endif
+ return IsAlwaysGE;
+ }
+
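A source-level picture of the check above, with arbitrary names: the write in the first loop and the read in the second form a candidate pair, and fusion is only acceptable when the write's access function stays at or ahead of the read's.

#include <cstdio>

int main() {
  const int N = 8;
  int A[N + 1] = {0};

  // The first loop writes A[i].
  for (int i = 0; i < N; ++i)
    A[i] = i;

  // Pair 1: this loop reads A[i]. At iteration i the write address equals
  // the read address, so the access difference is non-negative and fusing
  // the two loops would not change the values observed.
  int sumSame = 0;
  for (int i = 0; i < N; ++i)
    sumSame += A[i];

  // Pair 2: this loop reads A[i + 1], i.e. ahead of what the first loop has
  // written by iteration i. Fused, it would read not-yet-written elements,
  // so a check like accessDiffIsPositive() has to reject this pair.
  int sumAhead = 0;
  for (int i = 0; i < N; ++i)
    sumAhead += A[i + 1];

  std::printf("%d %d\n", sumSame, sumAhead);
  return 0;
}
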
+ /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
+ /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
+ /// specified by @p DepChoice are used to determine this.
+ bool dependencesAllowFusion(const FusionCandidate &FC0,
+ const FusionCandidate &FC1, Instruction &I0,
+ Instruction &I1, bool AnyDep,
+ FusionDependenceAnalysisChoice DepChoice) {
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
+ << DepChoice << "\n");
+ }
+#endif
+ switch (DepChoice) {
+ case FUSION_DEPENDENCE_ANALYSIS_SCEV:
+ return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
+ case FUSION_DEPENDENCE_ANALYSIS_DA: {
+ auto DepResult = DI.depends(&I0, &I1, true);
+ if (!DepResult)
+ return true;
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
+ dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
+ << (DepResult->isOrdered() ? "true" : "false")
+ << "]\n");
+ LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
+ << "\n");
+ }
+#endif
+
+ if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
+ LLVM_DEBUG(
+ dbgs() << "TODO: Implement pred/succ dependence handling!\n");
+
+ // TODO: Can we actually use the dependence info analysis here?
+ return false;
+ }
+
+ case FUSION_DEPENDENCE_ANALYSIS_ALL:
+ return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
+ FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
+ dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
+ FUSION_DEPENDENCE_ANALYSIS_DA);
+ }
+
+ llvm_unreachable("Unknown fusion dependence analysis choice!");
+ }
+
+ /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
+ bool dependencesAllowFusion(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) {
+ LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
+ << "\n");
+ assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
+ assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
+
+ for (Instruction *WriteL0 : FC0.MemWrites) {
+ for (Instruction *WriteL1 : FC1.MemWrites)
+ if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ for (Instruction *ReadL1 : FC1.MemReads)
+ if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ }
+
+ for (Instruction *WriteL1 : FC1.MemWrites) {
+ for (Instruction *WriteL0 : FC0.MemWrites)
+ if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ for (Instruction *ReadL0 : FC0.MemReads)
+ if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ }
+
+ // Walk through all uses in FC1. For each use, find the reaching def. If the
+    // def is located in FC0 then it is not safe to fuse.
+ for (BasicBlock *BB : FC1.L->blocks())
+ for (Instruction &I : *BB)
+ for (auto &Op : I.operands())
+ if (Instruction *Def = dyn_cast<Instruction>(Op))
+ if (FC0.L->contains(Def->getParent())) {
+ InvalidDependencies++;
+ return false;
+ }
+
+ return true;
+ }
+
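Besides the memory-access pairs, the routine above also rejects candidates when a value defined inside FC0 is used inside FC1. At the source level that corresponds roughly to the following; the names are arbitrary and the example only illustrates why such a pair is unsafe to fuse.

#include <cstdio>

int main() {
  const int N = 8;
  int A[N], B[N];
  int last = 0;

  // First loop: redefines `last` every iteration; only its final value is
  // meaningful once the loop has finished.
  for (int i = 0; i < N; ++i) {
    A[i] = i;
    last = A[i];
  }

  // Second loop: uses `last`, whose reaching definition lives inside the
  // first loop. If the loops were fused, each B[i] would observe a
  // partially updated `last`, changing the program's result, so the
  // reaching-def walk described above is meant to reject this pair.
  for (int i = 0; i < N; ++i)
    B[i] = last + i;

  std::printf("%d\n", B[N - 1]);
  return 0;
}
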
+ /// Determine if two fusion candidates are adjacent in the CFG.
+ ///
+ /// This method will determine if there are additional basic blocks in the CFG
+ /// between the exit of \p FC0 and the entry of \p FC1.
+ /// If the two candidates are guarded loops, then it checks whether the
+ /// non-loop successor of the \p FC0 guard branch is the entry block of \p
+ /// FC1. If not, then the loops are not adjacent. If the two candidates are
+ /// not guarded loops, then it checks whether the exit block of \p FC0 is the
+ /// preheader of \p FC1.
+ bool isAdjacent(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ // If the successor of the guard branch is FC1, then the loops are adjacent
+ if (FC0.GuardBranch)
+ return FC0.getNonLoopBlock() == FC1.getEntryBlock();
+ else
+ return FC0.ExitBlock == FC1.getEntryBlock();
+ }
+
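Roughly, at the source level, the adjacency requirement distinguishes the two shapes below; the arrays and the intervening branch are arbitrary and only illustrate the extra basic blocks that break adjacency.

#include <cstdio>

int main(int argc, char **) {
  const int N = 8;
  int A[N], B[N], C[N];

  // Adjacent: the exit of the first loop leads straight into the entry of
  // the second, so a pair like this can satisfy isAdjacent().
  for (int i = 0; i < N; ++i)
    A[i] = i;
  for (int i = 0; i < N; ++i)
    B[i] = A[i] * 2;

  // Not adjacent: the branch between the loops introduces extra basic
  // blocks between the exit of the second loop and the entry of the third,
  // so that pair would be reported as NonAdjacent and skipped.
  if (argc > 1)
    std::printf("separator\n");
  for (int i = 0; i < N; ++i)
    C[i] = B[i] + 1;

  std::printf("%d\n", C[N - 1]);
  return 0;
}
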
+ /// Determine if two fusion candidates have identical guards
+ ///
+ /// This method will determine if two fusion candidates have the same guards.
+ /// The guards are considered the same if:
+ /// 1. The instructions to compute the condition used in the compare are
+ /// identical.
+ /// 2. The successors of the guard have the same flow into/around the loop.
+ /// If the compare instructions are identical, then the first successor of the
+ /// guard must go to the same place (either the preheader of the loop or the
+  /// NonLoopBlock). In other words, the first successor of both loops must
+ /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
+ /// the NonLoopBlock). The same must be true for the second successor.
+ bool haveIdenticalGuards(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ assert(FC0.GuardBranch && FC1.GuardBranch &&
+ "Expecting FC0 and FC1 to be guarded loops.");
+
+ if (auto FC0CmpInst =
+ dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
+ if (auto FC1CmpInst =
+ dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
+ if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
+ return false;
+
+ // The compare instructions are identical.
+ // Now make sure the successor of the guards have the same flow into/around
+ // the loop
+ if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
+ return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
+ else
+ return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
+ }
+
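A rough source-level picture of identical guards, with invented names: both loops are protected by the same test, so after loop rotation each carries a guard branch with the same compare and the same into-the-loop/around-the-loop successor layout, which is what the check above looks for.

#include <cstdio>

void compute(int *A, int *B, int n) {
  // Both loops are guarded by the same `n > 0` condition. Their rotated
  // forms carry guard branches that compare the same values and route the
  // "skip the loop" path the same way, so the guards can be treated as
  // identical and merged when the loops are fused.
  if (n > 0)
    for (int i = 0; i < n; ++i)
      A[i] = i;
  if (n > 0)
    for (int i = 0; i < n; ++i)
      B[i] = A[i] + 1;
}

int main() {
  int A[8], B[8];
  compute(A, B, 8);
  std::printf("%d\n", B[7]);
  return 0;
}
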
/// Modify the latch branch of FC to be unconditional since successors of the
/// branch are the same.
- void simplifyLatchBranch(const FusionCandidate &FC) const {
- BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
- if (FCLatchBranch) {
- assert(FCLatchBranch->isConditional() &&
- FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
- "Expecting the two successors of FCLatchBranch to be the same");
+ void simplifyLatchBranch(const FusionCandidate &FC) const {
+ BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
+ if (FCLatchBranch) {
+ assert(FCLatchBranch->isConditional() &&
+ FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
+ "Expecting the two successors of FCLatchBranch to be the same");
BranchInst *NewBranch =
BranchInst::Create(FCLatchBranch->getSuccessor(0));
ReplaceInstWithInst(FCLatchBranch, NewBranch);
- }
- }
-
-  /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has a unique
- /// successor, then merge FC0.Latch with its unique successor.
- void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
- moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
- if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
- MergeBlockIntoPredecessor(Succ, &DTU, &LI);
- DTU.flush();
- }
- }
-
- /// Fuse two fusion candidates, creating a new fused loop.
- ///
- /// This method contains the mechanics of fusing two loops, represented by \p
- /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
- /// postdominates \p FC0 (making them control flow equivalent). It also
- /// assumes that the other conditions for fusion have been met: adjacent,
- /// identical trip counts, and no negative distance dependencies exist that
- /// would prevent fusion. Thus, there is no checking for these conditions in
- /// this method.
- ///
- /// Fusion is performed by rewiring the CFG to update successor blocks of the
-  /// components of the loops. Specifically, the following changes are done:
- ///
- /// 1. The preheader of \p FC1 is removed as it is no longer necessary
- /// (because it is currently only a single statement block).
- /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
-  /// 3. The latch of \p FC1 is modified to jump to the header of \p FC0.
- /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
- ///
- /// All of these modifications are done with dominator tree updates, thus
- /// keeping the dominator (and post dominator) information up-to-date.
- ///
- /// This can be improved in the future by actually merging blocks during
- /// fusion. For example, the preheader of \p FC1 can be merged with the
- /// preheader of \p FC0. This would allow loops with more than a single
- /// statement in the preheader to be fused. Similarly, the latch blocks of the
- /// two loops could also be fused into a single block. This will require
- /// analysis to prove it is safe to move the contents of the block past
- /// existing code, which currently has not been implemented.
- Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
- assert(FC0.isValid() && FC1.isValid() &&
- "Expecting valid fusion candidates");
-
- LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
- dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
-
- // Move instructions from the preheader of FC1 to the end of the preheader
- // of FC0.
- moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
-
- // Fusing guarded loops is handled slightly differently than non-guarded
- // loops and has been broken out into a separate method instead of trying to
- // intersperse the logic within a single method.
- if (FC0.GuardBranch)
- return fuseGuardedLoops(FC0, FC1);
-
+ }
+ }
+
+  /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has a unique
+ /// successor, then merge FC0.Latch with its unique successor.
+ void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
+ moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
+ if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
+ MergeBlockIntoPredecessor(Succ, &DTU, &LI);
+ DTU.flush();
+ }
+ }
+
+ /// Fuse two fusion candidates, creating a new fused loop.
+ ///
+ /// This method contains the mechanics of fusing two loops, represented by \p
+ /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
+ /// postdominates \p FC0 (making them control flow equivalent). It also
+ /// assumes that the other conditions for fusion have been met: adjacent,
+ /// identical trip counts, and no negative distance dependencies exist that
+ /// would prevent fusion. Thus, there is no checking for these conditions in
+ /// this method.
+ ///
+ /// Fusion is performed by rewiring the CFG to update successor blocks of the
+  /// components of the loops. Specifically, the following changes are done:
+ ///
+ /// 1. The preheader of \p FC1 is removed as it is no longer necessary
+ /// (because it is currently only a single statement block).
+ /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
+  /// 3. The latch of \p FC1 is modified to jump to the header of \p FC0.
+ /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
+ ///
+ /// All of these modifications are done with dominator tree updates, thus
+ /// keeping the dominator (and post dominator) information up-to-date.
+ ///
+ /// This can be improved in the future by actually merging blocks during
+ /// fusion. For example, the preheader of \p FC1 can be merged with the
+ /// preheader of \p FC0. This would allow loops with more than a single
+ /// statement in the preheader to be fused. Similarly, the latch blocks of the
+ /// two loops could also be fused into a single block. This will require
+ /// analysis to prove it is safe to move the contents of the block past
+ /// existing code, which currently has not been implemented.
+ Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
+ assert(FC0.isValid() && FC1.isValid() &&
+ "Expecting valid fusion candidates");
+
+ LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
+ dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
+
+ // Move instructions from the preheader of FC1 to the end of the preheader
+ // of FC0.
+ moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
+
+ // Fusing guarded loops is handled slightly differently than non-guarded
+ // loops and has been broken out into a separate method instead of trying to
+ // intersperse the logic within a single method.
+ if (FC0.GuardBranch)
+ return fuseGuardedLoops(FC0, FC1);
+
assert(FC1.Preheader ==
(FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
- assert(FC1.Preheader->size() == 1 &&
- FC1.Preheader->getSingleSuccessor() == FC1.Header);
-
- // Remember the phi nodes originally in the header of FC0 in order to rewire
- // them later. However, this is only necessary if the new loop carried
- // values might not dominate the exiting branch. While we do not generally
- // test if this is the case but simply insert intermediate phi nodes, we
- // need to make sure these intermediate phi nodes have different
- // predecessors. To this end, we filter the special case where the exiting
- // block is the latch block of the first loop. Nothing needs to be done
- // anyway as all loop carried values dominate the latch and thereby also the
- // exiting branch.
- SmallVector<PHINode *, 8> OriginalFC0PHIs;
- if (FC0.ExitingBlock != FC0.Latch)
- for (PHINode &PHI : FC0.Header->phis())
- OriginalFC0PHIs.push_back(&PHI);
-
- // Replace incoming blocks for header PHIs first.
- FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
- FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
-
- // Then modify the control flow and update DT and PDT.
- SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
-
- // The old exiting block of the first loop (FC0) has to jump to the header
- // of the second as we need to execute the code in the second header block
- // regardless of the trip count. That is, if the trip count is 0, so the
- // back edge is never taken, we still have to execute both loop headers,
- // especially (but not only!) if the second is a do-while style loop.
- // However, doing so might invalidate the phi nodes of the first loop as
- // the new values do only need to dominate their latch and not the exiting
- // predicate. To remedy this potential problem we always introduce phi
- // nodes in the header of the second loop later that select the loop carried
- // value, if the second header was reached through an old latch of the
- // first, or undef otherwise. This is sound as exiting the first implies the
-    // second will exit too, __without__ taking the back-edge (their
-    // trip-counts are equal after all).
-    // KB: Would this sequence be simpler to just make FC0.ExitingBlock go
- // to FC1.Header? I think this is basically what the three sequences are
- // trying to accomplish; however, doing this directly in the CFG may mean
- // the DT/PDT becomes invalid
+ assert(FC1.Preheader->size() == 1 &&
+ FC1.Preheader->getSingleSuccessor() == FC1.Header);
+
+ // Remember the phi nodes originally in the header of FC0 in order to rewire
+ // them later. However, this is only necessary if the new loop carried
+ // values might not dominate the exiting branch. While we do not generally
+ // test if this is the case but simply insert intermediate phi nodes, we
+ // need to make sure these intermediate phi nodes have different
+ // predecessors. To this end, we filter the special case where the exiting
+ // block is the latch block of the first loop. Nothing needs to be done
+ // anyway as all loop carried values dominate the latch and thereby also the
+ // exiting branch.
+ SmallVector<PHINode *, 8> OriginalFC0PHIs;
+ if (FC0.ExitingBlock != FC0.Latch)
+ for (PHINode &PHI : FC0.Header->phis())
+ OriginalFC0PHIs.push_back(&PHI);
+
+ // Replace incoming blocks for header PHIs first.
+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+ // Then modify the control flow and update DT and PDT.
+ SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+
+ // The old exiting block of the first loop (FC0) has to jump to the header
+ // of the second as we need to execute the code in the second header block
+ // regardless of the trip count. That is, if the trip count is 0, so the
+ // back edge is never taken, we still have to execute both loop headers,
+ // especially (but not only!) if the second is a do-while style loop.
+ // However, doing so might invalidate the phi nodes of the first loop as
+ // the new values do only need to dominate their latch and not the exiting
+ // predicate. To remedy this potential problem we always introduce phi
+ // nodes in the header of the second loop later that select the loop carried
+ // value, if the second header was reached through an old latch of the
+ // first, or undef otherwise. This is sound as exiting the first implies the
+    // second will exit too, __without__ taking the back-edge (their
+    // trip-counts are equal after all).
+    // KB: Would this sequence be simpler to just make FC0.ExitingBlock go
+ // to FC1.Header? I think this is basically what the three sequences are
+ // trying to accomplish; however, doing this directly in the CFG may mean
+ // the DT/PDT becomes invalid
if (!FC0.Peeled) {
FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
FC1.Header);
@@ -1379,7 +1379,7 @@ private:
} else {
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
-
+
// Remove the ExitBlock of the first Loop (also not needed)
FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
FC1.Header);
@@ -1391,215 +1391,215 @@ private:
new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
}
- // The pre-header of L1 is not necessary anymore.
+ // The pre-header of L1 is not necessary anymore.
assert(pred_empty(FC1.Preheader));
- FC1.Preheader->getTerminator()->eraseFromParent();
- new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1.Preheader, FC1.Header));
-
-    // Moves the phi nodes from the second to the first loop's header block.
- while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
- if (SE.isSCEVable(PHI->getType()))
- SE.forgetValue(PHI);
- if (PHI->hasNUsesOrMore(1))
- PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
- else
- PHI->eraseFromParent();
- }
-
- // Introduce new phi nodes in the second loop header to ensure
- // exiting the first and jumping to the header of the second does not break
- // the SSA property of the phis originally in the first loop. See also the
- // comment above.
- Instruction *L1HeaderIP = &FC1.Header->front();
- for (PHINode *LCPHI : OriginalFC0PHIs) {
- int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
- assert(L1LatchBBIdx >= 0 &&
- "Expected loop carried value to be rewired at this point!");
-
- Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
-
- PHINode *L1HeaderPHI = PHINode::Create(
- LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
- L1HeaderPHI->addIncoming(LCV, FC0.Latch);
- L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
- FC0.ExitingBlock);
-
- LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
- }
-
- // Replace latch terminator destinations.
- FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
- FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
-
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+    // Moves the phi nodes from the second to the first loop's header block.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front();
+ for (PHINode *LCPHI : OriginalFC0PHIs) {
+ int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+ assert(L1LatchBBIdx >= 0 &&
+ "Expected loop carried value to be rewired at this point!");
+
+ Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+ PHINode *L1HeaderPHI = PHINode::Create(
+ LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+ L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+ L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+ FC0.ExitingBlock);
+
+ LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+ }
+
+ // Replace latch terminator destinations.
+ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+ FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
// Modify the latch branch of FC0 to be unconditional as both successors of
- // the branch are the same.
- simplifyLatchBranch(FC0);
-
- // If FC0.Latch and FC0.ExitingBlock are the same then we have already
- // performed the updates above.
- if (FC0.Latch != FC0.ExitingBlock)
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0.Latch, FC1.Header));
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC0.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
- FC1.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC1.Latch, FC1.Header));
-
- // Update DT/PDT
- DTU.applyUpdates(TreeUpdates);
-
- LI.removeBlock(FC1.Preheader);
- DTU.deleteBB(FC1.Preheader);
+ // the branch are the same.
+ simplifyLatchBranch(FC0);
+
+ // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+ // performed the updates above.
+ if (FC0.Latch != FC0.ExitingBlock)
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC0.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+ FC1.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC1.Latch, FC1.Header));
+
+ // Update DT/PDT
+ DTU.applyUpdates(TreeUpdates);
+
+ LI.removeBlock(FC1.Preheader);
+ DTU.deleteBB(FC1.Preheader);
if (FC0.Peeled) {
LI.removeBlock(FC0.ExitBlock);
DTU.deleteBB(FC0.ExitBlock);
}
- DTU.flush();
-
- // Is there a way to keep SE up-to-date so we don't need to forget the loops
- // and rebuild the information in subsequent passes of fusion?
- // Note: Need to forget the loops before merging the loop latches, as
- // mergeLatch may remove the only block in FC1.
- SE.forgetLoop(FC1.L);
- SE.forgetLoop(FC0.L);
-
- // Move instructions from FC0.Latch to FC1.Latch.
- // Note: mergeLatch requires an updated DT.
- mergeLatch(FC0, FC1);
-
- // Merge the loops.
+ DTU.flush();
+
+ // Is there a way to keep SE up-to-date so we don't need to forget the loops
+ // and rebuild the information in subsequent passes of fusion?
+ // Note: Need to forget the loops before merging the loop latches, as
+ // mergeLatch may remove the only block in FC1.
+ SE.forgetLoop(FC1.L);
+ SE.forgetLoop(FC0.L);
+
+ // Move instructions from FC0.Latch to FC1.Latch.
+ // Note: mergeLatch requires an updated DT.
+ mergeLatch(FC0, FC1);
+
+ // Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
- for (BasicBlock *BB : Blocks) {
- FC0.L->addBlockEntry(BB);
- FC1.L->removeBlockFromLoop(BB);
- if (LI.getLoopFor(BB) != FC1.L)
- continue;
- LI.changeLoopFor(BB, FC0.L);
- }
+ for (BasicBlock *BB : Blocks) {
+ FC0.L->addBlockEntry(BB);
+ FC1.L->removeBlockFromLoop(BB);
+ if (LI.getLoopFor(BB) != FC1.L)
+ continue;
+ LI.changeLoopFor(BB, FC0.L);
+ }
while (!FC1.L->isInnermost()) {
- const auto &ChildLoopIt = FC1.L->begin();
- Loop *ChildLoop = *ChildLoopIt;
- FC1.L->removeChildLoop(ChildLoopIt);
- FC0.L->addChildLoop(ChildLoop);
- }
-
- // Delete the now empty loop L1.
- LI.erase(FC1.L);
-
-#ifndef NDEBUG
- assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
- assert(PDT.verify());
- LI.verify(DT);
- SE.verify();
-#endif
-
- LLVM_DEBUG(dbgs() << "Fusion done:\n");
-
- return FC0.L;
- }
-
- /// Report details on loop fusion opportunities.
- ///
- /// This template function can be used to report both successful and missed
- /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
- /// be one of:
- /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
- /// given two valid fusion candidates.
- /// - OptimizationRemark to report successful fusion of two fusion
- /// candidates.
- /// The remarks will be printed using the form:
- /// <path/filename>:<line number>:<column number>: [<function name>]:
- /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
- template <typename RemarkKind>
- void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
- llvm::Statistic &Stat) {
- assert(FC0.Preheader && FC1.Preheader &&
- "Expecting valid fusion candidates");
- using namespace ore;
- ++Stat;
- ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
- FC0.Preheader)
- << "[" << FC0.Preheader->getParent()->getName()
- << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
- << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
- << ": " << Stat.getDesc());
- }
-
- /// Fuse two guarded fusion candidates, creating a new fused loop.
- ///
- /// Fusing guarded loops is handled much the same way as fusing non-guarded
- /// loops. The rewiring of the CFG is slightly different though, because of
- /// the presence of the guards around the loops and the exit blocks after the
- /// loop body. As such, the new loop is rewired as follows:
- /// 1. Keep the guard branch from FC0 and use the non-loop block target
- /// from the FC1 guard branch.
- /// 2. Remove the exit block from FC0 (this exit block should be empty
- /// right now).
- /// 3. Remove the guard branch for FC1
- /// 4. Remove the preheader for FC1.
- /// The exit block successor for the latch of FC0 is updated to be the header
- /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
- /// be the header of FC0, thus creating the fused loop.
- Loop *fuseGuardedLoops(const FusionCandidate &FC0,
- const FusionCandidate &FC1) {
- assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
-
- BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
- BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
- BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
- BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+ const auto &ChildLoopIt = FC1.L->begin();
+ Loop *ChildLoop = *ChildLoopIt;
+ FC1.L->removeChildLoop(ChildLoopIt);
+ FC0.L->addChildLoop(ChildLoop);
+ }
+
+ // Delete the now empty loop L1.
+ LI.erase(FC1.L);
+
+#ifndef NDEBUG
+ assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+ assert(PDT.verify());
+ LI.verify(DT);
+ SE.verify();
+#endif
+
+ LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+ return FC0.L;
+ }
+
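The edge rewiring that the comments above describe can be emulated at the source level with labels and gotos, purely as an illustration of where the edges end up: the old FC0 latch now reaches the FC1 header, and the old FC1 latch carries the single back edge. The names and trip count are arbitrary.

#include <cstdio>

int main() {
  const int N = 3;
  int i = 0; // set up in the merged preheader
  int j = 0;

fc0_header:
  std::printf("FC0 body %d\n", i);
  ++i; // old FC0 latch: now unconditional, falls through to the FC1 header
  std::printf("FC1 body %d\n", j); // FC1 header/body
  if (++j < N)
    goto fc0_header; // old FC1 latch: the single back edge to the FC0 header
  return 0;          // common exit
}
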
+ /// Report details on loop fusion opportunities.
+ ///
+ /// This template function can be used to report both successful and missed
+ /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
+ /// be one of:
+ /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
+ /// given two valid fusion candidates.
+ /// - OptimizationRemark to report successful fusion of two fusion
+ /// candidates.
+ /// The remarks will be printed using the form:
+ /// <path/filename>:<line number>:<column number>: [<function name>]:
+ /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
+ template <typename RemarkKind>
+ void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
+ llvm::Statistic &Stat) {
+ assert(FC0.Preheader && FC1.Preheader &&
+ "Expecting valid fusion candidates");
+ using namespace ore;
+ ++Stat;
+ ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
+ FC0.Preheader)
+ << "[" << FC0.Preheader->getParent()->getName()
+ << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
+ << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
+ << ": " << Stat.getDesc());
+ }
+
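For readability, here is what a remark following the documented format could look like once rendered; every concrete value (path, line, column, function and block names, and the description text) is made up for this illustration.

#include <cstdio>

int main() {
  // Rendering of the documented pattern:
  //   <path/filename>:<line number>:<column number>: [<function name>]:
  //   <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
  std::printf("%s:%d:%d: [%s]: %s and %s: %s\n", "example.c", 12, 3,
              "compute", "for.body.preheader", "for.body7.preheader",
              "Loop fusion candidates are not adjacent");
  return 0;
}
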
+ /// Fuse two guarded fusion candidates, creating a new fused loop.
+ ///
+ /// Fusing guarded loops is handled much the same way as fusing non-guarded
+ /// loops. The rewiring of the CFG is slightly different though, because of
+ /// the presence of the guards around the loops and the exit blocks after the
+ /// loop body. As such, the new loop is rewired as follows:
+ /// 1. Keep the guard branch from FC0 and use the non-loop block target
+ /// from the FC1 guard branch.
+ /// 2. Remove the exit block from FC0 (this exit block should be empty
+ /// right now).
+ /// 3. Remove the guard branch for FC1
+ /// 4. Remove the preheader for FC1.
+ /// The exit block successor for the latch of FC0 is updated to be the header
+ /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
+ /// be the header of FC0, thus creating the fused loop.
+ Loop *fuseGuardedLoops(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) {
+ assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
+
+ BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
+ BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
+ BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
+ BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
-
- // Move instructions from the exit block of FC0 to the beginning of the exit
+
+ // Move instructions from the exit block of FC0 to the beginning of the exit
// block of FC1, in the case that the FC0 loop has not been peeled. In the
// case that FC0 loop is peeled, then move the instructions of the successor
// of the FC0 Exit block to the beginning of the exit block of FC1.
moveInstructionsToTheBeginning(
(FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
DT, PDT, DI);
-
- // Move instructions from the guard block of FC1 to the end of the guard
- // block of FC0.
- moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
-
- assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
-
- SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
-
- ////////////////////////////////////////////////////////////////////////////
- // Update the Loop Guard
- ////////////////////////////////////////////////////////////////////////////
- // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
- // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
- // Thus, one path from the guard goes to the preheader for FC0 (and thus
- // executes the new fused loop) and the other path goes to the NonLoopBlock
- // for FC1 (where FC1 guard would have gone if FC1 was not executed).
- FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
- FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
-
+
+ // Move instructions from the guard block of FC1 to the end of the guard
+ // block of FC0.
+ moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
+
+ assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
+
+ SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Update the Loop Guard
+ ////////////////////////////////////////////////////////////////////////////
+ // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
+ // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
+ // Thus, one path from the guard goes to the preheader for FC0 (and thus
+ // executes the new fused loop) and the other path goes to the NonLoopBlock
+ // for FC1 (where FC1 guard would have gone if FC1 was not executed).
+ FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
+ FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
+
BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
- // The guard of FC1 is not necessary anymore.
- FC1.GuardBranch->eraseFromParent();
- new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
-
+ // The guard of FC1 is not necessary anymore.
+ FC1.GuardBranch->eraseFromParent();
+ new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
+
if (FC0.Peeled) {
// Remove the Block after the ExitBlock of FC0
TreeUpdates.emplace_back(DominatorTree::UpdateType(
@@ -1610,273 +1610,273 @@ private:
}
assert(pred_empty(FC1GuardBlock) &&
- "Expecting guard block to have no predecessors");
+ "Expecting guard block to have no predecessors");
assert(succ_empty(FC1GuardBlock) &&
- "Expecting guard block to have no successors");
-
- // Remember the phi nodes originally in the header of FC0 in order to rewire
- // them later. However, this is only necessary if the new loop carried
- // values might not dominate the exiting branch. While we do not generally
- // test if this is the case but simply insert intermediate phi nodes, we
- // need to make sure these intermediate phi nodes have different
- // predecessors. To this end, we filter the special case where the exiting
- // block is the latch block of the first loop. Nothing needs to be done
- // anyway as all loop carried values dominate the latch and thereby also the
- // exiting branch.
- // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
-    // (because the loops are rotated). Thus, nothing will ever be added to
- // OriginalFC0PHIs.
- SmallVector<PHINode *, 8> OriginalFC0PHIs;
- if (FC0.ExitingBlock != FC0.Latch)
- for (PHINode &PHI : FC0.Header->phis())
- OriginalFC0PHIs.push_back(&PHI);
-
- assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
-
- // Replace incoming blocks for header PHIs first.
- FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
- FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
-
- // The old exiting block of the first loop (FC0) has to jump to the header
- // of the second as we need to execute the code in the second header block
- // regardless of the trip count. That is, if the trip count is 0, so the
- // back edge is never taken, we still have to execute both loop headers,
- // especially (but not only!) if the second is a do-while style loop.
- // However, doing so might invalidate the phi nodes of the first loop as
- // the new values do only need to dominate their latch and not the exiting
- // predicate. To remedy this potential problem we always introduce phi
- // nodes in the header of the second loop later that select the loop carried
- // value, if the second header was reached through an old latch of the
- // first, or undef otherwise. This is sound as exiting the first implies the
- // second will exit too, __without__ taking the back-edge (their
- // trip-counts are equal after all).
- FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
- FC1.Header);
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
-
- // Remove FC0 Exit Block
- // The exit block for FC0 is no longer needed since control will flow
- // directly to the header of FC1. Since it is an empty block, it can be
- // removed at this point.
- // TODO: In the future, we can handle non-empty exit blocks my merging any
- // instructions from FC0 exit block into FC1 exit block prior to removing
- // the block.
+ "Expecting guard block to have no successors");
+
+ // Remember the phi nodes originally in the header of FC0 in order to rewire
+ // them later. However, this is only necessary if the new loop carried
+ // values might not dominate the exiting branch. While we do not generally
+ // test if this is the case but simply insert intermediate phi nodes, we
+ // need to make sure these intermediate phi nodes have different
+ // predecessors. To this end, we filter the special case where the exiting
+ // block is the latch block of the first loop. Nothing needs to be done
+ // anyway as all loop carried values dominate the latch and thereby also the
+ // exiting branch.
+ // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
+  // (because the loops are rotated). Thus, nothing will ever be added to
+ // OriginalFC0PHIs.
+ SmallVector<PHINode *, 8> OriginalFC0PHIs;
+ if (FC0.ExitingBlock != FC0.Latch)
+ for (PHINode &PHI : FC0.Header->phis())
+ OriginalFC0PHIs.push_back(&PHI);
+
+ assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
+
+ // Replace incoming blocks for header PHIs first.
+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+ // The old exiting block of the first loop (FC0) has to jump to the header
+ // of the second as we need to execute the code in the second header block
+ // regardless of the trip count. That is, if the trip count is 0, so the
+ // back edge is never taken, we still have to execute both loop headers,
+ // especially (but not only!) if the second is a do-while style loop.
+ // However, doing so might invalidate the phi nodes of the first loop as
+  // the new values only need to dominate their latch and not the exiting
+ // predicate. To remedy this potential problem we always introduce phi
+ // nodes in the header of the second loop later that select the loop carried
+ // value, if the second header was reached through an old latch of the
+ // first, or undef otherwise. This is sound as exiting the first implies the
+ // second will exit too, __without__ taking the back-edge (their
+ // trip-counts are equal after all).
+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+ FC1.Header);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+
+ // Remove FC0 Exit Block
+ // The exit block for FC0 is no longer needed since control will flow
+ // directly to the header of FC1. Since it is an empty block, it can be
+ // removed at this point.
+  // TODO: In the future, we can handle non-empty exit blocks by merging any
+ // instructions from FC0 exit block into FC1 exit block prior to removing
+ // the block.
assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
- FC0.ExitBlock->getTerminator()->eraseFromParent();
- new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
-
- // Remove FC1 Preheader
- // The pre-header of L1 is not necessary anymore.
+ FC0.ExitBlock->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+ // Remove FC1 Preheader
+ // The pre-header of L1 is not necessary anymore.
assert(pred_empty(FC1.Preheader));
- FC1.Preheader->getTerminator()->eraseFromParent();
- new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1.Preheader, FC1.Header));
-
- // Moves the phi nodes from the second to the first loops header block.
- while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
- if (SE.isSCEVable(PHI->getType()))
- SE.forgetValue(PHI);
- if (PHI->hasNUsesOrMore(1))
- PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
- else
- PHI->eraseFromParent();
- }
-
- // Introduce new phi nodes in the second loop header to ensure
- // exiting the first and jumping to the header of the second does not break
- // the SSA property of the phis originally in the first loop. See also the
- // comment above.
- Instruction *L1HeaderIP = &FC1.Header->front();
- for (PHINode *LCPHI : OriginalFC0PHIs) {
- int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
- assert(L1LatchBBIdx >= 0 &&
- "Expected loop carried value to be rewired at this point!");
-
- Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
-
- PHINode *L1HeaderPHI = PHINode::Create(
- LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
- L1HeaderPHI->addIncoming(LCV, FC0.Latch);
- L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
- FC0.ExitingBlock);
-
- LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
- }
-
- // Update the latches
-
- // Replace latch terminator destinations.
- FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
- FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
-
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+  // Move the phi nodes from the second loop's header block to the first loop's.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front();
+ for (PHINode *LCPHI : OriginalFC0PHIs) {
+ int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+ assert(L1LatchBBIdx >= 0 &&
+ "Expected loop carried value to be rewired at this point!");
+
+ Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+ PHINode *L1HeaderPHI = PHINode::Create(
+ LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+ L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+ L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+ FC0.ExitingBlock);
+
+ LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+ }
+
+ // Update the latches
+
+ // Replace latch terminator destinations.
+ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+ FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
// Modify the latch branch of FC0 to be unconditional as both successors of
- // the branch are the same.
- simplifyLatchBranch(FC0);
-
- // If FC0.Latch and FC0.ExitingBlock are the same then we have already
- // performed the updates above.
- if (FC0.Latch != FC0.ExitingBlock)
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0.Latch, FC1.Header));
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC0.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
- FC1.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC1.Latch, FC1.Header));
-
- // All done
- // Apply the updates to the Dominator Tree and cleanup.
-
+ // the branch are the same.
+ simplifyLatchBranch(FC0);
+
+ // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+ // performed the updates above.
+ if (FC0.Latch != FC0.ExitingBlock)
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC0.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+ FC1.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC1.Latch, FC1.Header));
+
+ // All done
+ // Apply the updates to the Dominator Tree and cleanup.
+
assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
-
- // Update DT/PDT
- DTU.applyUpdates(TreeUpdates);
-
- LI.removeBlock(FC1GuardBlock);
- LI.removeBlock(FC1.Preheader);
- LI.removeBlock(FC0.ExitBlock);
+
+ // Update DT/PDT
+ DTU.applyUpdates(TreeUpdates);
+
+ LI.removeBlock(FC1GuardBlock);
+ LI.removeBlock(FC1.Preheader);
+ LI.removeBlock(FC0.ExitBlock);
if (FC0.Peeled) {
LI.removeBlock(FC0ExitBlockSuccessor);
DTU.deleteBB(FC0ExitBlockSuccessor);
}
- DTU.deleteBB(FC1GuardBlock);
- DTU.deleteBB(FC1.Preheader);
- DTU.deleteBB(FC0.ExitBlock);
- DTU.flush();
-
- // Is there a way to keep SE up-to-date so we don't need to forget the loops
- // and rebuild the information in subsequent passes of fusion?
- // Note: Need to forget the loops before merging the loop latches, as
- // mergeLatch may remove the only block in FC1.
- SE.forgetLoop(FC1.L);
- SE.forgetLoop(FC0.L);
-
- // Move instructions from FC0.Latch to FC1.Latch.
- // Note: mergeLatch requires an updated DT.
- mergeLatch(FC0, FC1);
-
- // Merge the loops.
+ DTU.deleteBB(FC1GuardBlock);
+ DTU.deleteBB(FC1.Preheader);
+ DTU.deleteBB(FC0.ExitBlock);
+ DTU.flush();
+
+ // Is there a way to keep SE up-to-date so we don't need to forget the loops
+ // and rebuild the information in subsequent passes of fusion?
+ // Note: Need to forget the loops before merging the loop latches, as
+ // mergeLatch may remove the only block in FC1.
+ SE.forgetLoop(FC1.L);
+ SE.forgetLoop(FC0.L);
+
+ // Move instructions from FC0.Latch to FC1.Latch.
+ // Note: mergeLatch requires an updated DT.
+ mergeLatch(FC0, FC1);
+
+ // Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
- for (BasicBlock *BB : Blocks) {
- FC0.L->addBlockEntry(BB);
- FC1.L->removeBlockFromLoop(BB);
- if (LI.getLoopFor(BB) != FC1.L)
- continue;
- LI.changeLoopFor(BB, FC0.L);
- }
+ for (BasicBlock *BB : Blocks) {
+ FC0.L->addBlockEntry(BB);
+ FC1.L->removeBlockFromLoop(BB);
+ if (LI.getLoopFor(BB) != FC1.L)
+ continue;
+ LI.changeLoopFor(BB, FC0.L);
+ }
while (!FC1.L->isInnermost()) {
- const auto &ChildLoopIt = FC1.L->begin();
- Loop *ChildLoop = *ChildLoopIt;
- FC1.L->removeChildLoop(ChildLoopIt);
- FC0.L->addChildLoop(ChildLoop);
- }
-
- // Delete the now empty loop L1.
- LI.erase(FC1.L);
-
-#ifndef NDEBUG
- assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
- assert(PDT.verify());
- LI.verify(DT);
- SE.verify();
-#endif
-
- LLVM_DEBUG(dbgs() << "Fusion done:\n");
-
- return FC0.L;
- }
-};
-
-struct LoopFuseLegacy : public FunctionPass {
-
- static char ID;
-
- LoopFuseLegacy() : FunctionPass(ID) {
- initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<DependenceAnalysisWrapperPass>();
+ const auto &ChildLoopIt = FC1.L->begin();
+ Loop *ChildLoop = *ChildLoopIt;
+ FC1.L->removeChildLoop(ChildLoopIt);
+ FC0.L->addChildLoop(ChildLoop);
+ }
+
+ // Delete the now empty loop L1.
+ LI.erase(FC1.L);
+
+#ifndef NDEBUG
+ assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+ assert(PDT.verify());
+ LI.verify(DT);
+ SE.verify();
+#endif
+
+ LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+ return FC0.L;
+ }
+};
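Note on the hunk above: the fusion routine never mutates the dominator trees directly; every CFG edge change is first recorded as a DominatorTree::UpdateType in TreeUpdates and the whole batch is handed to DTU.applyUpdates() once the rewiring is done. The following is a minimal editorial sketch of that batching pattern, not part of the patch; retargetEdge and the parameter names are hypothetical, while the LLVM APIs shown are the ones used above.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/DomTreeUpdater.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Dominators.h"

    // Redirect the From -> OldTo edge to From -> NewTo and keep DT/PDT consistent.
    static void retargetEdge(llvm::DomTreeUpdater &DTU, llvm::BasicBlock *From,
                             llvm::BasicBlock *OldTo, llvm::BasicBlock *NewTo) {
      // 1. Rewrite the CFG first.
      From->getTerminator()->replaceUsesOfWith(OldTo, NewTo);
      // 2. Describe the change as edge deletions/insertions.
      llvm::SmallVector<llvm::DominatorTree::UpdateType, 2> Updates;
      Updates.push_back({llvm::DominatorTree::Delete, From, OldTo});
      Updates.push_back({llvm::DominatorTree::Insert, From, NewTo});
      // 3. Apply the whole batch; the updater propagates it to DT (and PDT).
      DTU.applyUpdates(Updates);
    }

Batching matters here because several edges change before the dominator information is consistent again; applying the updates one by one midway through the rewiring would let the verifiers above fire.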
+
+struct LoopFuseLegacy : public FunctionPass {
+
+ static char ID;
+
+ LoopFuseLegacy() : FunctionPass(ID) {
+ initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
-
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
const DataLayout &DL = F.getParent()->getDataLayout();
-
+
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
- return LF.fuseLoops(F);
- }
-};
-} // namespace
-
-PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &DI = AM.getResult<DependenceAnalysis>(F);
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ return LF.fuseLoops(F);
+ }
+};
+} // namespace
+
+PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &DI = AM.getResult<DependenceAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
const DataLayout &DL = F.getParent()->getDataLayout();
-
+
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
- bool Changed = LF.fuseLoops(F);
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<LoopAnalysis>();
- return PA;
-}
-
-char LoopFuseLegacy::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+ bool Changed = LF.fuseLoops(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+char LoopFuseLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
-
-FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
+INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
+
+FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
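Before moving on to the next file, the transformation implemented in LoopFuse.cpp can be pictured at the source level roughly as below. This is an editorial sketch in plain C++, not part of the patch; before() and after() are hypothetical names, and the legality conditions (equal trip counts, adjacency as asserted above, and no backward dependence between the bodies) are the ones the pass establishes before fusing.

    // Two adjacent guarded loops with the same trip count...
    void before(int *a, int *b, int n) {
      if (n > 0)                      // FC0 guard
        for (int i = 0; i < n; ++i)
          a[i] = i;
      if (n > 0)                      // FC1 guard; made redundant by fusion
        for (int i = 0; i < n; ++i)
          b[i] = a[i] + 1;
    }

    // ...become a single guarded loop with one header and one latch.
    void after(int *a, int *b, int n) {
      if (n > 0)
        for (int i = 0; i < n; ++i) {
          a[i] = i;
          b[i] = a[i] + 1;            // safe only because no backward dependence exists
        }
    }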
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 7c55efb78d..8064c02e2b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1,117 +1,117 @@
-//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements an idiom recognizer that transforms simple loops into a
-// non-loop form. In cases that this kicks in, it can be a significant
-// performance win.
-//
-// If compiling for code size we avoid idiom recognition if the resulting
-// code could be larger than the code for the original loop. One way this could
-// happen is if the loop is not removable after idiom recognition due to the
-// presence of non-idiom instructions. The initial implementation of the
-// heuristics applies to idioms in multi-block loops.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO List:
-//
-// Future loop memory idioms to recognize:
-// memcmp, memmove, strlen, etc.
-// Future floating point idioms to recognize in -ffast-math mode:
-// fpowi
-// Future integer operation idioms to recognize:
-// ctpop
-//
-// Beware that isel's default lowering for ctpop is highly inefficient for
-// i64 and larger types when i64 is legal and the value has few bits set. It
-// would be good to enhance isel to emit a loop for ctpop in this case.
-//
-// This could recognize common matrix multiplies and dot product idioms and
-// replace them with calls to BLAS (if linked in??).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an idiom recognizer that transforms simple loops into a
+// non-loop form. In cases that this kicks in, it can be a significant
+// performance win.
+//
+// If compiling for code size we avoid idiom recognition if the resulting
+// code could be larger than the code for the original loop. One way this could
+// happen is if the loop is not removable after idiom recognition due to the
+// presence of non-idiom instructions. The initial implementation of the
+// heuristics applies to idioms in multi-block loops.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// Future loop memory idioms to recognize:
+// memcmp, memmove, strlen, etc.
+// Future floating point idioms to recognize in -ffast-math mode:
+// fpowi
+// Future integer operation idioms to recognize:
+// ctpop
+//
+// Beware that isel's default lowering for ctpop is highly inefficient for
+// i64 and larger types when i64 is legal and the value has few bits set. It
+// would be good to enhance isel to emit a loop for ctpop in this case.
+//
+// This could recognize common matrix multiplies and dot product idioms and
+// replace them with calls to BLAS (if linked in??).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-idiom"
-
-STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
-STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-idiom"
+
+STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
+STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(
NumShiftUntilBitTest,
"Number of uncountable loops recognized as 'shift until bitttest' idiom");
-
+
bool DisableLIRP::All;
static cl::opt<bool, true>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -135,841 +135,841 @@ static cl::opt<bool, true>
cl::location(DisableLIRP::Memcpy), cl::init(false),
cl::ReallyHidden);
-static cl::opt<bool> UseLIRCodeSizeHeurs(
- "use-lir-code-size-heurs",
- cl::desc("Use loop idiom recognition code size heuristics when compiling"
- "with -Os/-Oz"),
- cl::init(true), cl::Hidden);
-
-namespace {
-
-class LoopIdiomRecognize {
- Loop *CurLoop = nullptr;
- AliasAnalysis *AA;
- DominatorTree *DT;
- LoopInfo *LI;
- ScalarEvolution *SE;
- TargetLibraryInfo *TLI;
- const TargetTransformInfo *TTI;
- const DataLayout *DL;
- OptimizationRemarkEmitter &ORE;
- bool ApplyCodeSizeHeuristics;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
-
-public:
- explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
- LoopInfo *LI, ScalarEvolution *SE,
- TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, MemorySSA *MSSA,
- const DataLayout *DL,
- OptimizationRemarkEmitter &ORE)
- : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
-
- bool runOnLoop(Loop *L);
-
-private:
- using StoreList = SmallVector<StoreInst *, 8>;
- using StoreListMap = MapVector<Value *, StoreList>;
-
- StoreListMap StoreRefsForMemset;
- StoreListMap StoreRefsForMemsetPattern;
- StoreList StoreRefsForMemcpy;
- bool HasMemset;
- bool HasMemsetPattern;
- bool HasMemcpy;
-
- /// Return code for isLegalStore()
- enum LegalStoreKind {
- None = 0,
- Memset,
- MemsetPattern,
- Memcpy,
- UnorderedAtomicMemcpy,
- DontUse // Dummy retval never to be used. Allows catching errors in retval
- // handling.
- };
-
- /// \name Countable Loop Idiom Handling
- /// @{
-
- bool runOnCountableLoop();
- bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
- SmallVectorImpl<BasicBlock *> &ExitBlocks);
-
- void collectStores(BasicBlock *BB);
- LegalStoreKind isLegalStore(StoreInst *SI);
- enum class ForMemset { No, Yes };
- bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
- ForMemset For);
- bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
-
- bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
- MaybeAlign StoreAlignment, Value *StoredVal,
- Instruction *TheStore,
- SmallPtrSetImpl<Instruction *> &Stores,
- const SCEVAddRecExpr *Ev, const SCEV *BECount,
- bool NegStride, bool IsLoopMemset = false);
- bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
- bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
- bool IsLoopMemset = false);
-
- /// @}
- /// \name Noncountable Loop Idiom Handling
- /// @{
-
- bool runOnNoncountableLoop();
-
- bool recognizePopcount();
- void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
- PHINode *CntPhi, Value *Var);
- bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
- void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
- Instruction *CntInst, PHINode *CntPhi,
- Value *Var, Instruction *DefX,
- const DebugLoc &DL, bool ZeroCheck,
- bool IsCntPhiUsedOutsideLoop);
-
+static cl::opt<bool> UseLIRCodeSizeHeurs(
+ "use-lir-code-size-heurs",
+    cl::desc("Use loop idiom recognition code size heuristics when compiling "
+ "with -Os/-Oz"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class LoopIdiomRecognize {
+ Loop *CurLoop = nullptr;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+ const DataLayout *DL;
+ OptimizationRemarkEmitter &ORE;
+ bool ApplyCodeSizeHeuristics;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+
+public:
+ explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, MemorySSA *MSSA,
+ const DataLayout *DL,
+ OptimizationRemarkEmitter &ORE)
+ : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+
+ bool runOnLoop(Loop *L);
+
+private:
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using StoreListMap = MapVector<Value *, StoreList>;
+
+ StoreListMap StoreRefsForMemset;
+ StoreListMap StoreRefsForMemsetPattern;
+ StoreList StoreRefsForMemcpy;
+ bool HasMemset;
+ bool HasMemsetPattern;
+ bool HasMemcpy;
+
+ /// Return code for isLegalStore()
+ enum LegalStoreKind {
+ None = 0,
+ Memset,
+ MemsetPattern,
+ Memcpy,
+ UnorderedAtomicMemcpy,
+ DontUse // Dummy retval never to be used. Allows catching errors in retval
+ // handling.
+ };
+
+ /// \name Countable Loop Idiom Handling
+ /// @{
+
+ bool runOnCountableLoop();
+ bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ void collectStores(BasicBlock *BB);
+ LegalStoreKind isLegalStore(StoreInst *SI);
+ enum class ForMemset { No, Yes };
+ bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
+ ForMemset For);
+ bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+
+ bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+ MaybeAlign StoreAlignment, Value *StoredVal,
+ Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ const SCEVAddRecExpr *Ev, const SCEV *BECount,
+ bool NegStride, bool IsLoopMemset = false);
+ bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+ bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
+ bool IsLoopMemset = false);
+
+ /// @}
+ /// \name Noncountable Loop Idiom Handling
+ /// @{
+
+ bool runOnNoncountableLoop();
+
+ bool recognizePopcount();
+ void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
+ void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
+ Instruction *CntInst, PHINode *CntPhi,
+ Value *Var, Instruction *DefX,
+ const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop);
+
bool recognizeShiftUntilBitTest();
- /// @}
-};
-
-class LoopIdiomRecognizeLegacyPass : public LoopPass {
-public:
- static char ID;
-
- explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
- initializeLoopIdiomRecognizeLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ /// @}
+};
+
+class LoopIdiomRecognizeLegacyPass : public LoopPass {
+public:
+ static char ID;
+
+ explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
+ initializeLoopIdiomRecognizeLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (DisableLIRP::All)
return false;
- if (skipLoop(L))
- return false;
-
- AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent());
- const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
-
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
-
- LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
- return LIR.runOnLoop(L);
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char LoopIdiomRecognizeLegacyPass::ID = 0;
-
-PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+ if (skipLoop(L))
+ return false;
+
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
+ return LIR.runOnLoop(L);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopIdiomRecognizeLegacyPass::ID = 0;
+
+PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
if (DisableLIRP::All)
return PreservedAnalyses::all();
- const auto *DL = &L.getHeader()->getModule()->getDataLayout();
-
- // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
-
- LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
- AR.MSSA, DL, ORE);
- if (!LIR.runOnLoop(&L))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
- "Recognize loop idioms", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
- "Recognize loop idioms", false, false)
-
-Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
-
-static void deleteDeadInstruction(Instruction *I) {
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- I->eraseFromParent();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Implementation of LoopIdiomRecognize
-//
-//===----------------------------------------------------------------------===//
-
-bool LoopIdiomRecognize::runOnLoop(Loop *L) {
- CurLoop = L;
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!L->getLoopPreheader())
- return false;
-
- // Disable loop idiom recognition if the function's name is a common idiom.
- StringRef Name = L->getHeader()->getParent()->getName();
- if (Name == "memset" || Name == "memcpy")
- return false;
-
- // Determine if code size heuristics need to be applied.
- ApplyCodeSizeHeuristics =
- L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
-
- HasMemset = TLI->has(LibFunc_memset);
- HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
- HasMemcpy = TLI->has(LibFunc_memcpy);
-
- if (HasMemset || HasMemsetPattern || HasMemcpy)
- if (SE->hasLoopInvariantBackedgeTakenCount(L))
- return runOnCountableLoop();
-
- return runOnNoncountableLoop();
-}
-
-bool LoopIdiomRecognize::runOnCountableLoop() {
- const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
- assert(!isa<SCEVCouldNotCompute>(BECount) &&
- "runOnCountableLoop() called on a loop without a predictable"
- "backedge-taken count");
-
- // If this loop executes exactly one time, then it should be peeled, not
- // optimized by this pass.
- if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
- if (BECst->getAPInt() == 0)
- return false;
-
- SmallVector<BasicBlock *, 8> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
-
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
- << CurLoop->getHeader()->getParent()->getName()
- << "] Countable Loop %" << CurLoop->getHeader()->getName()
- << "\n");
-
- // The following transforms hoist stores/memsets into the loop pre-header.
- // Give up if the loop has instructions that may throw.
- SimpleLoopSafetyInfo SafetyInfo;
- SafetyInfo.computeLoopSafetyInfo(CurLoop);
- if (SafetyInfo.anyBlockMayThrow())
- return false;
-
- bool MadeChange = false;
-
- // Scan all the blocks in the loop that are not in subloops.
- for (auto *BB : CurLoop->getBlocks()) {
- // Ignore blocks in subloops.
- if (LI->getLoopFor(BB) != CurLoop)
- continue;
-
- MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
- }
- return MadeChange;
-}
-
-static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
- const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
- return ConstStride->getAPInt();
-}
-
-/// getMemSetPatternValue - If a strided store of the specified value is safe to
-/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
-/// be passed in. Otherwise, return null.
-///
-/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
-/// just replicate their input array and then pass on to memset_pattern16.
-static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
- // FIXME: This could check for UndefValue because it can be merged into any
- // other valid pattern.
-
- // If the value isn't a constant, we can't promote it to being in a constant
- // array. We could theoretically do a store to an alloca or something, but
- // that doesn't seem worthwhile.
- Constant *C = dyn_cast<Constant>(V);
- if (!C)
- return nullptr;
-
- // Only handle simple values that are a power of two bytes in size.
- uint64_t Size = DL->getTypeSizeInBits(V->getType());
- if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
- return nullptr;
-
- // Don't care enough about darwin/ppc to implement this.
- if (DL->isBigEndian())
- return nullptr;
-
- // Convert to size in bytes.
- Size /= 8;
-
- // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
- // if the top and bottom are the same (e.g. for vectors and large integers).
- if (Size > 16)
- return nullptr;
-
- // If the constant is exactly 16 bytes, just use it.
- if (Size == 16)
- return C;
-
- // Otherwise, we'll use an array of the constants.
- unsigned ArraySize = 16 / Size;
- ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
- return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
-}
-
-LoopIdiomRecognize::LegalStoreKind
-LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
- // Don't touch volatile stores.
- if (SI->isVolatile())
- return LegalStoreKind::None;
- // We only want simple or unordered-atomic stores.
- if (!SI->isUnordered())
- return LegalStoreKind::None;
-
- // Avoid merging nontemporal stores.
- if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return LegalStoreKind::None;
-
- Value *StoredVal = SI->getValueOperand();
- Value *StorePtr = SI->getPointerOperand();
-
+ const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+ // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+
+ LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
+ AR.MSSA, DL, ORE);
+ if (!LIR.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
+
+static void deleteDeadInstruction(Instruction *I) {
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L) {
+ CurLoop = L;
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+ // Disable loop idiom recognition if the function's name is a common idiom.
+ StringRef Name = L->getHeader()->getParent()->getName();
+ if (Name == "memset" || Name == "memcpy")
+ return false;
+
+ // Determine if code size heuristics need to be applied.
+ ApplyCodeSizeHeuristics =
+ L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
+
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
+
+ if (HasMemset || HasMemsetPattern || HasMemcpy)
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop();
+
+ return runOnNoncountableLoop();
+}
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+ const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
+ assert(!isa<SCEVCouldNotCompute>(BECount) &&
+         "runOnCountableLoop() called on a loop without a predictable "
+ "backedge-taken count");
+
+ // If this loop executes exactly one time, then it should be peeled, not
+ // optimized by this pass.
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ if (BECst->getAPInt() == 0)
+ return false;
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Countable Loop %" << CurLoop->getHeader()->getName()
+ << "\n");
+
+ // The following transforms hoist stores/memsets into the loop pre-header.
+ // Give up if the loop has instructions that may throw.
+ SimpleLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(CurLoop);
+ if (SafetyInfo.anyBlockMayThrow())
+ return false;
+
+ bool MadeChange = false;
+
+ // Scan all the blocks in the loop that are not in subloops.
+ for (auto *BB : CurLoop->getBlocks()) {
+ // Ignore blocks in subloops.
+ if (LI->getLoopFor(BB) != CurLoop)
+ continue;
+
+ MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
+ }
+ return MadeChange;
+}
+
+static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
+ const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
+ return ConstStride->getAPInt();
+}
+
+/// getMemSetPatternValue - If a strided store of the specified value is safe to
+/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
+/// be passed in. Otherwise, return null.
+///
+/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
+/// just replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
+ // FIXME: This could check for UndefValue because it can be merged into any
+ // other valid pattern.
+
+ // If the value isn't a constant, we can't promote it to being in a constant
+ // array. We could theoretically do a store to an alloca or something, but
+ // that doesn't seem worthwhile.
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return nullptr;
+
+ // Only handle simple values that are a power of two bytes in size.
+ uint64_t Size = DL->getTypeSizeInBits(V->getType());
+ if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
+ return nullptr;
+
+ // Don't care enough about darwin/ppc to implement this.
+ if (DL->isBigEndian())
+ return nullptr;
+
+ // Convert to size in bytes.
+ Size /= 8;
+
+ // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
+ // if the top and bottom are the same (e.g. for vectors and large integers).
+ if (Size > 16)
+ return nullptr;
+
+ // If the constant is exactly 16 bytes, just use it.
+ if (Size == 16)
+ return C;
+
+ // Otherwise, we'll use an array of the constants.
+ unsigned ArraySize = 16 / Size;
+ ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+ return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
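As a concrete reading of the doc comment above (editorial sketch, not part of the patch): a stored constant smaller than 16 bytes is replicated out to the 16 bytes memset_pattern16 expects, so the i32 0x01020304 store mentioned earlier yields a ConstantArray of four copies of that value. In plain C++ terms, with patternFor as a hypothetical stand-in:

    #include <array>
    #include <cstdint>

    // 4-byte value -> 16-byte pattern, mirroring the ArraySize = 16 / Size
    // replication getMemSetPatternValue performs for small constants.
    std::array<uint32_t, 4> patternFor(uint32_t V) {
      return {V, V, V, V};
    }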
+
+LoopIdiomRecognize::LegalStoreKind
+LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+ // Don't touch volatile stores.
+ if (SI->isVolatile())
+ return LegalStoreKind::None;
+ // We only want simple or unordered-atomic stores.
+ if (!SI->isUnordered())
+ return LegalStoreKind::None;
+
+ // Avoid merging nontemporal stores.
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return LegalStoreKind::None;
+
+ Value *StoredVal = SI->getValueOperand();
+ Value *StorePtr = SI->getPointerOperand();
+
// Don't convert stores of non-integral pointer types to memsets (which stores
// integers).
if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
return LegalStoreKind::None;
- // Reject stores that are so large that they overflow an unsigned.
+ // Reject stores that are so large that they overflow an unsigned.
// When storing out scalable vectors we bail out for now, since the code
// below currently only works for constant strides.
TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
(SizeInBits.getFixedSize() >> 32) != 0)
- return LegalStoreKind::None;
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided store. If we have something else, it's a
- // random store we can't handle.
- const SCEVAddRecExpr *StoreEv =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
- return LegalStoreKind::None;
-
- // Check to see if we have a constant stride.
- if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
- return LegalStoreKind::None;
-
- // See if the store can be turned into a memset.
-
- // If the stored value is a byte-wise value (like i32 -1), then it may be
- // turned into a memset of i8 -1, assuming that all the consecutive bytes
- // are stored. A store of i32 0x01020304 can never be turned into a memset,
- // but it can be turned into memset_pattern if the target supports it.
- Value *SplatValue = isBytewiseValue(StoredVal, *DL);
- Constant *PatternValue = nullptr;
-
- // Note: memset and memset_pattern on unordered-atomic is yet not supported
- bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
-
- // If we're allowed to form a memset, and the stored value would be
- // acceptable for memset, use it.
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *StoreEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return LegalStoreKind::None;
+
+ // Check to see if we have a constant stride.
+ if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
+ return LegalStoreKind::None;
+
+ // See if the store can be turned into a memset.
+
+ // If the stored value is a byte-wise value (like i32 -1), then it may be
+ // turned into a memset of i8 -1, assuming that all the consecutive bytes
+ // are stored. A store of i32 0x01020304 can never be turned into a memset,
+ // but it can be turned into memset_pattern if the target supports it.
+ Value *SplatValue = isBytewiseValue(StoredVal, *DL);
+ Constant *PatternValue = nullptr;
+
+  // Note: memset and memset_pattern on unordered-atomic stores are not yet supported
+ bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
+
+ // If we're allowed to form a memset, and the stored value would be
+ // acceptable for memset, use it.
if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
- // Verify that the stored value is loop invariant. If not, we can't
- // promote the memset.
- CurLoop->isLoopInvariant(SplatValue)) {
- // It looks like we can use SplatValue.
- return LegalStoreKind::Memset;
+ // Verify that the stored value is loop invariant. If not, we can't
+ // promote the memset.
+ CurLoop->isLoopInvariant(SplatValue)) {
+ // It looks like we can use SplatValue.
+ return LegalStoreKind::Memset;
} else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
- // Don't create memset_pattern16s with address spaces.
- StorePtr->getType()->getPointerAddressSpace() == 0 &&
- (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
- // It looks like we can use PatternValue!
- return LegalStoreKind::MemsetPattern;
- }
-
- // Otherwise, see if the store can be turned into a memcpy.
+ // Don't create memset_pattern16s with address spaces.
+ StorePtr->getType()->getPointerAddressSpace() == 0 &&
+ (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+ // It looks like we can use PatternValue!
+ return LegalStoreKind::MemsetPattern;
+ }
+
+ // Otherwise, see if the store can be turned into a memcpy.
if (HasMemcpy && !DisableLIRP::Memcpy) {
- // Check to see if the stride matches the size of the store. If so, then we
- // know that every byte is touched in the loop.
- APInt Stride = getStoreStride(StoreEv);
- unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
- if (StoreSize != Stride && StoreSize != -Stride)
- return LegalStoreKind::None;
-
- // The store must be feeding a non-volatile load.
- LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
-
- // Only allow non-volatile loads
- if (!LI || LI->isVolatile())
- return LegalStoreKind::None;
- // Only allow simple or unordered-atomic loads
- if (!LI->isUnordered())
- return LegalStoreKind::None;
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided load. If we have something else, it's a
- // random load we can't handle.
- const SCEVAddRecExpr *LoadEv =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
- if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
- return LegalStoreKind::None;
-
- // The store and load must share the same stride.
- if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
- return LegalStoreKind::None;
-
- // Success. This store can be converted into a memcpy.
- UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
- return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
- : LegalStoreKind::Memcpy;
- }
- // This store can't be transformed into a memset/memcpy.
- return LegalStoreKind::None;
-}
-
-void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
- StoreRefsForMemset.clear();
- StoreRefsForMemsetPattern.clear();
- StoreRefsForMemcpy.clear();
- for (Instruction &I : *BB) {
- StoreInst *SI = dyn_cast<StoreInst>(&I);
- if (!SI)
- continue;
-
- // Make sure this is a strided store with a constant stride.
- switch (isLegalStore(SI)) {
- case LegalStoreKind::None:
- // Nothing to do
- break;
- case LegalStoreKind::Memset: {
- // Find the base pointer.
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
+ if (StoreSize != Stride && StoreSize != -Stride)
+ return LegalStoreKind::None;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+
+ // Only allow non-volatile loads
+ if (!LI || LI->isVolatile())
+ return LegalStoreKind::None;
+ // Only allow simple or unordered-atomic loads
+ if (!LI->isUnordered())
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return LegalStoreKind::None;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return LegalStoreKind::None;
+
+ // Success. This store can be converted into a memcpy.
+ UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
+ return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
+ : LegalStoreKind::Memcpy;
+ }
+ // This store can't be transformed into a memset/memcpy.
+ return LegalStoreKind::None;
+}
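// A hedged, illustrative sketch (hypothetical function, not taken from this
// diff) of the kind of source loop whose store the LegalStoreKind::Memcpy
// path above is looking for: a simple store fed by a non-volatile load with
// the same stride.
void copyLoop(int *A, const int *B, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    A[i] = B[i]; // strided store of a same-stride load -> memcpy candidate
}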
+
+void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
+ StoreRefsForMemset.clear();
+ StoreRefsForMemsetPattern.clear();
+ StoreRefsForMemcpy.clear();
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ continue;
+
+ // Make sure this is a strided store with a constant stride.
+ switch (isLegalStore(SI)) {
+ case LegalStoreKind::None:
+ // Nothing to do
+ break;
+ case LegalStoreKind::Memset: {
+ // Find the base pointer.
Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
- StoreRefsForMemset[Ptr].push_back(SI);
- } break;
- case LegalStoreKind::MemsetPattern: {
- // Find the base pointer.
+ StoreRefsForMemset[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::MemsetPattern: {
+ // Find the base pointer.
Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
- StoreRefsForMemsetPattern[Ptr].push_back(SI);
- } break;
- case LegalStoreKind::Memcpy:
- case LegalStoreKind::UnorderedAtomicMemcpy:
- StoreRefsForMemcpy.push_back(SI);
- break;
- default:
- assert(false && "unhandled return value");
- break;
- }
- }
-}
-
-/// runOnLoopBlock - Process the specified block, which lives in a counted loop
-/// with the specified backedge count. This block is known to be in the current
-/// loop and not in any subloops.
-bool LoopIdiomRecognize::runOnLoopBlock(
- BasicBlock *BB, const SCEV *BECount,
- SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- // We can only promote stores in this block if they are unconditionally
- // executed in the loop. For a block to be unconditionally executed, it has
- // to dominate all the exit blocks of the loop. Verify this now.
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- if (!DT->dominates(BB, ExitBlocks[i]))
- return false;
-
- bool MadeChange = false;
- // Look for store instructions, which may be optimized to memset/memcpy.
- collectStores(BB);
-
- // Look for a single store or sets of stores with a common base, which can be
- // optimized into a memset (memset_pattern). The latter most commonly happens
- // with structs and hand-unrolled loops.
- for (auto &SL : StoreRefsForMemset)
- MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
-
- for (auto &SL : StoreRefsForMemsetPattern)
- MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
-
- // Optimize the store into a memcpy, if it feeds a similarly strided load.
- for (auto &SI : StoreRefsForMemcpy)
- MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
-
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
- // Look for memset instructions, which may be optimized to a larger memset.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
- WeakTrackingVH InstPtr(&*I);
- if (!processLoopMemSet(MSI, BECount))
- continue;
- MadeChange = true;
-
- // If processing the memset invalidated our iterator, start over from the
- // top of the block.
- if (!InstPtr)
- I = BB->begin();
- continue;
- }
- }
-
- return MadeChange;
-}
-
-/// See if this store(s) can be promoted to a memset.
-bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
- const SCEV *BECount, ForMemset For) {
- // Try to find consecutive stores that can be transformed into memsets.
- SetVector<StoreInst *> Heads, Tails;
- SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
-
- // Do a quadratic search on all of the given stores and find
- // all of the pairs of stores that follow each other.
- SmallVector<unsigned, 16> IndexQueue;
- for (unsigned i = 0, e = SL.size(); i < e; ++i) {
- assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
-
- Value *FirstStoredVal = SL[i]->getValueOperand();
- Value *FirstStorePtr = SL[i]->getPointerOperand();
- const SCEVAddRecExpr *FirstStoreEv =
- cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
- APInt FirstStride = getStoreStride(FirstStoreEv);
- unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
-
- // See if we can optimize just this store in isolation.
- if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
- Heads.insert(SL[i]);
- continue;
- }
-
- Value *FirstSplatValue = nullptr;
- Constant *FirstPatternValue = nullptr;
-
- if (For == ForMemset::Yes)
- FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
- else
- FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
-
- assert((FirstSplatValue || FirstPatternValue) &&
- "Expected either splat value or pattern value.");
-
- IndexQueue.clear();
- // If a store has multiple consecutive store candidates, search Stores
- // array according to the sequence: from i+1 to e, then from i-1 to 0.
- // This is because pairing with the immediately succeeding or preceding
- // candidate usually creates the best chance of finding a memset opportunity.
- unsigned j = 0;
- for (j = i + 1; j < e; ++j)
- IndexQueue.push_back(j);
- for (j = i; j > 0; --j)
- IndexQueue.push_back(j - 1);
-
- for (auto &k : IndexQueue) {
- assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
- Value *SecondStorePtr = SL[k]->getPointerOperand();
- const SCEVAddRecExpr *SecondStoreEv =
- cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
- APInt SecondStride = getStoreStride(SecondStoreEv);
-
- if (FirstStride != SecondStride)
- continue;
-
- Value *SecondStoredVal = SL[k]->getValueOperand();
- Value *SecondSplatValue = nullptr;
- Constant *SecondPatternValue = nullptr;
-
- if (For == ForMemset::Yes)
- SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
- else
- SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
-
- assert((SecondSplatValue || SecondPatternValue) &&
- "Expected either splat value or pattern value.");
-
- if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
- if (For == ForMemset::Yes) {
- if (isa<UndefValue>(FirstSplatValue))
- FirstSplatValue = SecondSplatValue;
- if (FirstSplatValue != SecondSplatValue)
- continue;
- } else {
- if (isa<UndefValue>(FirstPatternValue))
- FirstPatternValue = SecondPatternValue;
- if (FirstPatternValue != SecondPatternValue)
- continue;
- }
- Tails.insert(SL[k]);
- Heads.insert(SL[i]);
- ConsecutiveChain[SL[i]] = SL[k];
- break;
- }
- }
- }
-
- // We may run into multiple chains that merge into a single chain. We mark the
- // stores that we transformed so that we don't visit the same store twice.
- SmallPtrSet<Value *, 16> TransformedStores;
- bool Changed = false;
-
- // For stores that start but don't end a link in the chain:
- for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
- it != e; ++it) {
- if (Tails.count(*it))
- continue;
-
- // We found a store instr that starts a chain. Now follow the chain and try
- // to transform it.
- SmallPtrSet<Instruction *, 8> AdjacentStores;
- StoreInst *I = *it;
-
- StoreInst *HeadStore = I;
- unsigned StoreSize = 0;
-
- // Collect the chain into a list.
- while (Tails.count(I) || Heads.count(I)) {
- if (TransformedStores.count(I))
- break;
- AdjacentStores.insert(I);
-
- StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
- // Move to the next value in the chain.
- I = ConsecutiveChain[I];
- }
-
- Value *StoredVal = HeadStore->getValueOperand();
- Value *StorePtr = HeadStore->getPointerOperand();
- const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- APInt Stride = getStoreStride(StoreEv);
-
- // Check to see if the stride matches the size of the stores. If so, then
- // we know that every byte is touched in the loop.
- if (StoreSize != Stride && StoreSize != -Stride)
- continue;
-
- bool NegStride = StoreSize == -Stride;
-
- if (processLoopStridedStore(StorePtr, StoreSize,
- MaybeAlign(HeadStore->getAlignment()),
- StoredVal, HeadStore, AdjacentStores, StoreEv,
- BECount, NegStride)) {
- TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
- Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// processLoopMemSet - See if this memset can be promoted to a large memset.
-bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
- const SCEV *BECount) {
- // We can only handle non-volatile memsets with a constant size.
- if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
- return false;
-
- // If we're not allowed to hack on memset, we fail.
- if (!HasMemset)
- return false;
-
- Value *Pointer = MSI->getDest();
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided store. If we have something else, it's a
- // random store we can't handle.
- const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
- if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
- return false;
-
- // Reject memsets that are so large that they overflow an unsigned.
- uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- if ((SizeInBytes >> 32) != 0)
- return false;
-
- // Check to see if the stride matches the size of the memset. If so, then we
- // know that every byte is touched in the loop.
- const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
- if (!ConstStride)
- return false;
-
- APInt Stride = ConstStride->getAPInt();
- if (SizeInBytes != Stride && SizeInBytes != -Stride)
- return false;
-
- // Verify that the memset value is loop invariant. If not, we can't promote
- // the memset.
- Value *SplatValue = MSI->getValue();
- if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
- return false;
-
- SmallPtrSet<Instruction *, 1> MSIs;
- MSIs.insert(MSI);
- bool NegStride = SizeInBytes == -Stride;
- return processLoopStridedStore(
- Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
- SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
-}
-
-/// mayLoopAccessLocation - Return true if the specified loop might access the
-/// specified pointer location, which is a loop-strided access. The 'Access'
-/// argument specifies what the verboten forms of access are (read or write).
-static bool
-mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
- const SCEV *BECount, unsigned StoreSize,
- AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &IgnoredStores) {
- // Get the location that may be stored across the loop. Since the access is
- // strided positively through memory, we say that the modified location starts
- // at the pointer and has infinite size.
+ StoreRefsForMemsetPattern[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::Memcpy:
+ case LegalStoreKind::UnorderedAtomicMemcpy:
+ StoreRefsForMemcpy.push_back(SI);
+ break;
+ default:
+ assert(false && "unhandled return value");
+ break;
+ }
+ }
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count. This block is known to be in the current
+/// loop and not in any subloops.
+bool LoopIdiomRecognize::runOnLoopBlock(
+ BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ // We can only promote stores in this block if they are unconditionally
+ // executed in the loop. For a block to be unconditionally executed, it has
+ // to dominate all the exit blocks of the loop. Verify this now.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!DT->dominates(BB, ExitBlocks[i]))
+ return false;
+
+ bool MadeChange = false;
+ // Look for store instructions, which may be optimized to memset/memcpy.
+ collectStores(BB);
+
+ // Look for a single store or sets of stores with a common base, which can be
+ // optimized into a memset (memset_pattern). The latter most commonly happens
+ // with structs and hand-unrolled loops.
+ for (auto &SL : StoreRefsForMemset)
+ MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
+
+ for (auto &SL : StoreRefsForMemsetPattern)
+ MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
+
+ // Optimize the store into a memcpy, if it feeds a similarly strided load.
+ for (auto &SI : StoreRefsForMemcpy)
+ MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+ // Look for memset instructions, which may be optimized to a larger memset.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ WeakTrackingVH InstPtr(&*I);
+ if (!processLoopMemSet(MSI, BECount))
+ continue;
+ MadeChange = true;
+
+ // If processing the memset invalidated our iterator, start over from the
+ // top of the block.
+ if (!InstPtr)
+ I = BB->begin();
+ continue;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// See if this store(s) can be promoted to a memset.
+bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
+ const SCEV *BECount, ForMemset For) {
+ // Try to find consecutive stores that can be transformed into memsets.
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
+ for (unsigned i = 0, e = SL.size(); i < e; ++i) {
+ assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
+
+ Value *FirstStoredVal = SL[i]->getValueOperand();
+ Value *FirstStorePtr = SL[i]->getPointerOperand();
+ const SCEVAddRecExpr *FirstStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
+ APInt FirstStride = getStoreStride(FirstStoreEv);
+ unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
+
+ // See if we can optimize just this store in isolation.
+ if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
+ Heads.insert(SL[i]);
+ continue;
+ }
+
+ Value *FirstSplatValue = nullptr;
+ Constant *FirstPatternValue = nullptr;
+
+ if (For == ForMemset::Yes)
+ FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
+ else
+ FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
+
+ assert((FirstSplatValue || FirstPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance of finding a memset opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
+ Value *SecondStorePtr = SL[k]->getPointerOperand();
+ const SCEVAddRecExpr *SecondStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
+ APInt SecondStride = getStoreStride(SecondStoreEv);
+
+ if (FirstStride != SecondStride)
+ continue;
+
+ Value *SecondStoredVal = SL[k]->getValueOperand();
+ Value *SecondSplatValue = nullptr;
+ Constant *SecondPatternValue = nullptr;
+
+ if (For == ForMemset::Yes)
+ SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
+ else
+ SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
+
+ assert((SecondSplatValue || SecondPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
+ if (For == ForMemset::Yes) {
+ if (isa<UndefValue>(FirstSplatValue))
+ FirstSplatValue = SecondSplatValue;
+ if (FirstSplatValue != SecondSplatValue)
+ continue;
+ } else {
+ if (isa<UndefValue>(FirstPatternValue))
+ FirstPatternValue = SecondPatternValue;
+ if (FirstPatternValue != SecondPatternValue)
+ continue;
+ }
+ Tails.insert(SL[k]);
+ Heads.insert(SL[i]);
+ ConsecutiveChain[SL[i]] = SL[k];
+ break;
+ }
+ }
+ }
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we transformed so that we don't visit the same store twice.
+ SmallPtrSet<Value *, 16> TransformedStores;
+ bool Changed = false;
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to transform it.
+ SmallPtrSet<Instruction *, 8> AdjacentStores;
+ StoreInst *I = *it;
+
+ StoreInst *HeadStore = I;
+ unsigned StoreSize = 0;
+
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (TransformedStores.count(I))
+ break;
+ AdjacentStores.insert(I);
+
+ StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ Value *StoredVal = HeadStore->getValueOperand();
+ Value *StorePtr = HeadStore->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+
+ // Check to see if the stride matches the size of the stores. If so, then
+ // we know that every byte is touched in the loop.
+ if (StoreSize != Stride && StoreSize != -Stride)
+ continue;
+
+ bool NegStride = StoreSize == -Stride;
+
+ if (processLoopStridedStore(StorePtr, StoreSize,
+ MaybeAlign(HeadStore->getAlignment()),
+ StoredVal, HeadStore, AdjacentStores, StoreEv,
+ BECount, NegStride)) {
+ TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
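// A hedged sketch (hypothetical function) of the "stores with a common base"
// case processLoopStores handles: a hand-unrolled loop whose adjacent stores
// of the same splat value are chained via Heads/Tails/ConsecutiveChain and
// emitted as one memset, even though neither store covers every byte alone.
void clearPairs(unsigned char *P, unsigned N) {
  for (unsigned i = 0; i != N; ++i) {
    P[2 * i] = 0;     // stride 2, size 1: not a memset in isolation
    P[2 * i + 1] = 0; // consecutive store of the same value completes the chain
  }
}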
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
+ const SCEV *BECount) {
+ // We can only handle non-volatile memsets with a constant size.
+ if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ return false;
+
+ // If we're not allowed to hack on memset, we fail.
+ if (!HasMemset)
+ return false;
+
+ Value *Pointer = MSI->getDest();
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+ if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
+ return false;
+
+ // Reject memsets that are so large that they overflow an unsigned.
+ uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if ((SizeInBytes >> 32) != 0)
+ return false;
+
+ // Check to see if the stride matches the size of the memset. If so, then we
+ // know that every byte is touched in the loop.
+ const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+ if (!ConstStride)
+ return false;
+
+ APInt Stride = ConstStride->getAPInt();
+ if (SizeInBytes != Stride && SizeInBytes != -Stride)
+ return false;
+
+ // Verify that the memset value is loop invariant. If not, we can't promote
+ // the memset.
+ Value *SplatValue = MSI->getValue();
+ if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+ return false;
+
+ SmallPtrSet<Instruction *, 1> MSIs;
+ MSIs.insert(MSI);
+ bool NegStride = SizeInBytes == -Stride;
+ return processLoopStridedStore(
+ Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
+ SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
+}
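// A hedged sketch (hypothetical function) of the loop-memset case handled by
// processLoopMemSet: a constant-size memset whose destination advances by
// exactly its own size each iteration, so the loop can collapse into a single
// larger memset of N * 16 bytes.
#include <cstring>
void clearRows(unsigned char *P, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    std::memset(P + (unsigned long long)i * 16, 0, 16);
}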
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access. The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+ const SCEV *BECount, unsigned StoreSize,
+ AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &IgnoredStores) {
+ // Get the location that may be stored across the loop. Since the access is
+ // strided positively through memory, we say that the modified location starts
+ // at the pointer and has infinite size.
LocationSize AccessSize = LocationSize::afterPointer();
-
- // If the loop iterates a fixed number of times, we can refine the access size
- // to be exactly the size of the memset, which is (BECount+1)*StoreSize
- if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
- AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
- StoreSize);
-
- // TODO: For this to be really effective, we have to dive into the pointer
- // operand in the store. A store to &A[i] of 100 will always be reported as
- // may-alias with a store to &A[100]; we need StoreLoc to be "A" with a size
- // of 100, which will then no-alias a store to &A[100].
- MemoryLocation StoreLoc(Ptr, AccessSize);
-
- for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
- ++BI)
- for (Instruction &I : **BI)
- if (IgnoredStores.count(&I) == 0 &&
- isModOrRefSet(
- intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
- return true;
-
- return false;
-}
-
-// If we have a negative stride, Start refers to the end of the memory location
-// we're trying to memset. Therefore, we need to recompute the base pointer,
-// which is just Start - BECount*Size.
-static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
- Type *IntPtr, unsigned StoreSize,
- ScalarEvolution *SE) {
- const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
- if (StoreSize != 1)
- Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
- SCEV::FlagNUW);
- return SE->getMinusSCEV(Start, Index);
-}
-
-/// Compute the number of bytes as a SCEV from the backedge taken count.
-///
-/// This also maps the SCEV into the provided type and tries to handle the
-/// computation in a way that will fold cleanly.
-static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
- unsigned StoreSize, Loop *CurLoop,
- const DataLayout *DL, ScalarEvolution *SE) {
- const SCEV *NumBytesS;
- // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
- // pointer size if it isn't already.
- //
- // If we're going to need to zero extend the BE count, check if we can add
- // one to it prior to zero extending without overflow. Provided this is safe,
- // it allows better simplification of the +1.
+
+ // If the loop iterates a fixed number of times, we can refine the access size
+ // to be exactly the size of the memset, which is (BECount+1)*StoreSize
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
+ StoreSize);
+
+ // TODO: For this to be really effective, we have to dive into the pointer
+ // operand in the store. A store to &A[i] of 100 will always be reported as
+ // may-alias with a store to &A[100]; we need StoreLoc to be "A" with a size
+ // of 100, which will then no-alias a store to &A[100].
+ MemoryLocation StoreLoc(Ptr, AccessSize);
+
+ for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
+ ++BI)
+ for (Instruction &I : **BI)
+ if (IgnoredStores.count(&I) == 0 &&
+ isModOrRefSet(
+ intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
+ return true;
+
+ return false;
+}
+
+// If we have a negative stride, Start refers to the end of the memory location
+// we're trying to memset. Therefore, we need to recompute the base pointer,
+// which is just Start - BECount*Size.
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
+ Type *IntPtr, unsigned StoreSize,
+ ScalarEvolution *SE) {
+ const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+ if (StoreSize != 1)
+ Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ return SE->getMinusSCEV(Start, Index);
+}
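// A hedged numeric sketch of getStartForNegStride: for a loop storing 4-byte
// elements downward, e.g. "for (int i = 999; i >= 0; --i) A[i] = 0;", the
// addrec start is &A[999] and BECount is 999, so the memset base becomes
// Start - BECount * StoreSize == &A[999] - 999 * 4 bytes == &A[0].
// Plain-integer model of the same formula (hypothetical helper):
unsigned long long negStrideBase(unsigned long long Start,
                                 unsigned long long BECount,
                                 unsigned StoreSize) {
  return Start - BECount * StoreSize; // mirrors SE->getMinusSCEV(Start, Index)
}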
+
+/// Compute the number of bytes as a SCEV from the backedge taken count.
+///
+/// This also maps the SCEV into the provided type and tries to handle the
+/// computation in a way that will fold cleanly.
+static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
+ unsigned StoreSize, Loop *CurLoop,
+ const DataLayout *DL, ScalarEvolution *SE) {
+ const SCEV *NumBytesS;
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ //
+ // If we're going to need to zero extend the BE count, check if we can add
+ // one to it prior to zero extending without overflow. Provided this is safe,
+ // it allows better simplification of the +1.
if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
- SE->isLoopEntryGuardedByCond(
- CurLoop, ICmpInst::ICMP_NE, BECount,
- SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
- NumBytesS = SE->getZeroExtendExpr(
- SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
- IntPtr);
- } else {
- NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
- SE->getOne(IntPtr), SCEV::FlagNUW);
- }
-
- // And scale it based on the store size.
- if (StoreSize != 1) {
- NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
- SCEV::FlagNUW);
- }
- return NumBytesS;
-}
-
-/// processLoopStridedStore - We see a strided store of some value. If we can
-/// transform this into a memset or memset_pattern in the loop preheader, do so.
-bool LoopIdiomRecognize::processLoopStridedStore(
- Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
- Value *StoredVal, Instruction *TheStore,
- SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
- const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
- Value *SplatValue = isBytewiseValue(StoredVal, *DL);
- Constant *PatternValue = nullptr;
-
- if (!SplatValue)
- PatternValue = getMemSetPatternValue(StoredVal, DL);
-
- assert((SplatValue || PatternValue) &&
- "Expected either splat value or pattern value.");
-
- // The trip count of the loop and the base pointer of the addrec SCEV is
- // guaranteed to be loop invariant, which means that it should dominate the
- // header. This allows us to insert code for it in the preheader.
- unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
- BasicBlock *Preheader = CurLoop->getLoopPreheader();
- IRBuilder<> Builder(Preheader->getTerminator());
- SCEVExpander Expander(*SE, *DL, "loop-idiom");
+ SE->isLoopEntryGuardedByCond(
+ CurLoop, ICmpInst::ICMP_NE, BECount,
+ SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
+ NumBytesS = SE->getZeroExtendExpr(
+ SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
+ IntPtr);
+ } else {
+ NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
+ SE->getOne(IntPtr), SCEV::FlagNUW);
+ }
+
+ // And scale it based on the store size.
+ if (StoreSize != 1) {
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ }
+ return NumBytesS;
+}
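// A hedged numeric sketch of getNumBytes: a loop that runs 1000 iterations
// over 4-byte elements has BECount == 999, so the emitted length is
// (BECount + 1) * StoreSize == 1000 * 4 == 4000 bytes. Plain-integer model of
// the same formula (hypothetical helper, ignoring the zero-extension logic):
unsigned long long numStoredBytes(unsigned long long BECount,
                                  unsigned StoreSize) {
  return (BECount + 1) * StoreSize;
}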
+
+/// processLoopStridedStore - We see a strided store of some value. If we can
+/// transform this into a memset or memset_pattern in the loop preheader, do so.
+bool LoopIdiomRecognize::processLoopStridedStore(
+ Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
+ Value *StoredVal, Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
+ const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
+ Value *SplatValue = isBytewiseValue(StoredVal, *DL);
+ Constant *PatternValue = nullptr;
+
+ if (!SplatValue)
+ PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+ assert((SplatValue || PatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ // The trip count of the loop and the base pointer of the addrec SCEV is
+ // guaranteed to be loop invariant, which means that it should dominate the
+ // header. This allows us to insert code for it in the preheader.
+ unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
SCEVExpanderCleaner ExpCleaner(Expander, *DT);
-
- Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
- Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
-
+
+ Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+ Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
+
bool Changed = false;
- const SCEV *Start = Ev->getStart();
- // Handle negative strided loops.
- if (NegStride)
- Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
-
- // TODO: ideally we should still be able to generate memset if SCEV expander
- // is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(Start, *SE))
+ const SCEV *Start = Ev->getStart();
+ // Handle negative strided loops.
+ if (NegStride)
+ Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(Start, *SE))
return Changed;
-
- // Okay, we have a strided store "p[i]" of a splattable value. We can turn
- // this into a memset in the loop preheader now if we want. However, this
- // would be unsafe to do if there is anything else in the loop that may read
- // or write to the aliased location. Check for any overlap by generating the
- // base pointer and checking the region.
- Value *BasePtr =
- Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
+
+ // Okay, we have a strided store "p[i]" of a splattable value. We can turn
+ // this into a memset in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write to the aliased location. Check for any overlap by generating the
+ // base pointer and checking the region.
+ Value *BasePtr =
+ Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
// From here on out, conservatively report to the pass manager that we've
// changed the IR, even if we later clean up these added instructions. There
@@ -980,134 +980,134 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// the return value will read this comment, and leave them alone.
Changed = true;
- if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+ if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSize, *AA, Stores))
return Changed;
-
- if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
+
+ if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
return Changed;
-
- // Okay, everything looks good, insert the memset.
-
- const SCEV *NumBytesS =
- getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
-
- // TODO: ideally we should still be able to generate memset if SCEV expander
- // is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(NumBytesS, *SE))
+
+ // Okay, everything looks good, insert the memset.
+
+ const SCEV *NumBytesS =
+ getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(NumBytesS, *SE))
return Changed;
-
- Value *NumBytes =
- Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
-
- CallInst *NewCall;
- if (SplatValue) {
- NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
- MaybeAlign(StoreAlignment));
- } else {
- // Everything is emitted in default address space
- Type *Int8PtrTy = DestInt8PtrTy;
-
- Module *M = TheStore->getModule();
- StringRef FuncName = "memset_pattern16";
- FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntIdxTy);
- inferLibFuncAttributes(M, FuncName, *TLI);
-
- // Otherwise we should form a memset_pattern16. PatternValue is known to be
- // a constant array of 16 bytes. Plop the value into a mergeable global.
- GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
- GlobalValue::PrivateLinkage,
- PatternValue, ".memset_pattern");
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
- GV->setAlignment(Align(16));
- Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
- NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
- }
- NewCall->setDebugLoc(TheStore->getDebugLoc());
-
- if (MSSAU) {
- MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
- NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
- MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
- }
-
- LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
- << " from store to: " << *Ev << " at: " << *TheStore
- << "\n");
-
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
- NewCall->getDebugLoc(), Preheader)
- << "Transformed loop-strided store into a call to "
- << ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() function";
- });
-
- // Okay, the memset has been formed. Zap the original store and anything that
- // feeds into it.
- for (auto *I : Stores) {
- if (MSSAU)
- MSSAU->removeMemoryAccess(I, true);
- deleteDeadInstruction(I);
- }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- ++NumMemSet;
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+
+ CallInst *NewCall;
+ if (SplatValue) {
+ NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
+ MaybeAlign(StoreAlignment));
+ } else {
+ // Everything is emitted in default address space
+ Type *Int8PtrTy = DestInt8PtrTy;
+
+ Module *M = TheStore->getModule();
+ StringRef FuncName = "memset_pattern16";
+ FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
+ Int8PtrTy, Int8PtrTy, IntIdxTy);
+ inferLibFuncAttributes(M, FuncName, *TLI);
+
+ // Otherwise we should form a memset_pattern16. PatternValue is known to be
+ // a constant array of 16 bytes. Plop the value into a mergeable global.
+ GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+ GlobalValue::PrivateLinkage,
+ PatternValue, ".memset_pattern");
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+ GV->setAlignment(Align(16));
+ Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
+ NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+ }
+ NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+ if (MSSAU) {
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ }
+
+ LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore
+ << "\n");
+
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
+ NewCall->getDebugLoc(), Preheader)
+ << "Transformed loop-strided store into a call to "
+ << ore::NV("NewFunction", NewCall->getCalledFunction())
+ << "() function";
+ });
+
+ // Okay, the memset has been formed. Zap the original store and anything that
+ // feeds into it.
+ for (auto *I : Stores) {
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I, true);
+ deleteDeadInstruction(I);
+ }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ ++NumMemSet;
ExpCleaner.markResultUsed();
- return true;
-}
-
-/// If the stored value is a strided load in the same loop with the same stride
-/// this may be transformable into a memcpy. This kicks in for stuff like
-/// for (i) A[i] = B[i];
-bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
- const SCEV *BECount) {
- assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
-
- Value *StorePtr = SI->getPointerOperand();
- const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- APInt Stride = getStoreStride(StoreEv);
- unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
- bool NegStride = StoreSize == -Stride;
-
- // The store must be feeding a non-volatile load.
- LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
- assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided load. If we have something else, it's a
- // random load we can't handle.
- const SCEVAddRecExpr *LoadEv =
- cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
-
- // The trip count of the loop and the base pointer of the addrec SCEV is
- // guaranteed to be loop invariant, which means that it should dominate the
- // header. This allows us to insert code for it in the preheader.
- BasicBlock *Preheader = CurLoop->getLoopPreheader();
- IRBuilder<> Builder(Preheader->getTerminator());
- SCEVExpander Expander(*SE, *DL, "loop-idiom");
-
+ return true;
+}
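// A hedged before/after sketch (hypothetical functions) of the net effect of
// processLoopStridedStore at the source level: a splat-value strided store
// loop is replaced by one memset emitted from the preheader, with the
// original stores deleted and MemorySSA updated.
#include <cstring>
void zeroLoopBefore(unsigned char *P, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    P[i] = 0; // strided store of a splattable value
}
void zeroLoopAfter(unsigned char *P, unsigned N) {
  if (N != 0)
    std::memset(P, 0, N); // conceptually what the pass emits
}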
+
+/// If the stored value is a strided load in the same loop with the same stride
+/// this may be transformable into a memcpy. This kicks in for stuff like
+/// for (i) A[i] = B[i];
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+ const SCEV *BECount) {
+ assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
+
+ Value *StorePtr = SI->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
+ bool NegStride = StoreSize == -Stride;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+ assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+ // The trip count of the loop and the base pointer of the addrec SCEV is
+ // guaranteed to be loop invariant, which means that it should dominate the
+ // header. This allows us to insert code for it in the preheader.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
SCEVExpanderCleaner ExpCleaner(Expander, *DT);
-
+
bool Changed = false;
- const SCEV *StrStart = StoreEv->getStart();
- unsigned StrAS = SI->getPointerAddressSpace();
- Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
-
- // Handle negative strided loops.
- if (NegStride)
- StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
-
- // Okay, we have a strided store "p[i]" of a loaded value. We can turn
- // this into a memcpy in the loop preheader now if we want. However, this
- // would be unsafe to do if there is anything else in the loop that may read
- // or write the memory region we're storing to. This includes the load that
- // feeds the stores. Check for an alias by generating the base address and
- // checking everything.
- Value *StoreBasePtr = Expander.expandCodeFor(
- StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
-
+ const SCEV *StrStart = StoreEv->getStart();
+ unsigned StrAS = SI->getPointerAddressSpace();
+ Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
+
+ // Handle negative strided loops.
+ if (NegStride)
+ StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
+
+ // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+ // this into a memcpy in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write the memory region we're storing to. This includes the load that
+ // feeds the stores. Check for an alias by generating the base address and
+ // checking everything.
+ Value *StoreBasePtr = Expander.expandCodeFor(
+ StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+
// From here on out, conservatively report to the pass manager that we've
// changed the IR, even if we later clean up these added instructions. There
// may be structural differences e.g. in the order of use lists not accounted
@@ -1117,650 +1117,650 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// the return value will read this comment, and leave them alone.
Changed = true;
- SmallPtrSet<Instruction *, 1> Stores;
- Stores.insert(SI);
- if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores))
+ SmallPtrSet<Instruction *, 1> Stores;
+ Stores.insert(SI);
+ if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+ StoreSize, *AA, Stores))
return Changed;
-
- const SCEV *LdStart = LoadEv->getStart();
- unsigned LdAS = LI->getPointerAddressSpace();
-
- // Handle negative strided loops.
- if (NegStride)
- LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
-
- // For a memcpy, we have to make sure that the input array is not being
- // mutated by the loop.
- Value *LoadBasePtr = Expander.expandCodeFor(
- LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
-
- if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
- StoreSize, *AA, Stores))
+
+ const SCEV *LdStart = LoadEv->getStart();
+ unsigned LdAS = LI->getPointerAddressSpace();
+
+ // Handle negative strided loops.
+ if (NegStride)
+ LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
+
+ // For a memcpy, we have to make sure that the input array is not being
+ // mutated by the loop.
+ Value *LoadBasePtr = Expander.expandCodeFor(
+ LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+
+ if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
+ StoreSize, *AA, Stores))
return Changed;
-
- if (avoidLIRForMultiBlockLoop())
+
+ if (avoidLIRForMultiBlockLoop())
return Changed;
-
- // Okay, everything is safe, we can transform this!
-
- const SCEV *NumBytesS =
- getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
-
- Value *NumBytes =
- Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
-
- CallInst *NewCall = nullptr;
- // Check whether to generate an unordered atomic memcpy:
- // If the load or store are atomic, then they must necessarily be unordered
- // by previous checks.
- if (!SI->isAtomic() && !LI->isAtomic())
- NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
- LI->getAlign(), NumBytes);
- else {
- // We cannot allow unaligned ops for unordered load/store, so reject
- // anything where the alignment isn't at least the element size.
- const Align StoreAlign = SI->getAlign();
- const Align LoadAlign = LI->getAlign();
- if (StoreAlign < StoreSize || LoadAlign < StoreSize)
+
+ // Okay, everything is safe, we can transform this!
+
+ const SCEV *NumBytesS =
+ getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+
+ CallInst *NewCall = nullptr;
+ // Check whether to generate an unordered atomic memcpy:
+ // If the load or store are atomic, then they must necessarily be unordered
+ // by previous checks.
+ if (!SI->isAtomic() && !LI->isAtomic())
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
+ LI->getAlign(), NumBytes);
+ else {
+ // We cannot allow unaligned ops for unordered load/store, so reject
+ // anything where the alignment isn't at least the element size.
+ const Align StoreAlign = SI->getAlign();
+ const Align LoadAlign = LI->getAlign();
+ if (StoreAlign < StoreSize || LoadAlign < StoreSize)
return Changed;
-
- // If the element.atomic memcpy is not lowered into explicit
- // loads/stores later, then it will be lowered into an element-size
- // specific lib call. If the lib call doesn't exist for our store size, then
- // we shouldn't generate the memcpy.
- if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
+
+ // If the element.atomic memcpy is not lowered into explicit
+ // loads/stores later, then it will be lowered into an element-size
+ // specific lib call. If the lib call doesn't exist for our store size, then
+ // we shouldn't generate the memcpy.
+ if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
return Changed;
-
- // Create the call.
- // Note that unordered atomic loads/stores are *required* by the spec to
- // have an alignment but non-atomic loads/stores may not.
- NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
- StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
- StoreSize);
- }
- NewCall->setDebugLoc(SI->getDebugLoc());
-
- if (MSSAU) {
- MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
- NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
- MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
- }
-
- LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *SI
- << "\n");
-
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
- NewCall->getDebugLoc(), Preheader)
- << "Formed a call to "
- << ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() function";
- });
-
- // Okay, the memcpy has been formed. Zap the original store and anything that
- // feeds into it.
- if (MSSAU)
- MSSAU->removeMemoryAccess(SI, true);
- deleteDeadInstruction(SI);
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- ++NumMemCpy;
+
+ // Create the call.
+ // Note that unordered atomic loads/stores are *required* by the spec to
+ // have an alignment but non-atomic loads/stores may not.
+ NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
+ StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
+ StoreSize);
+ }
+ NewCall->setDebugLoc(SI->getDebugLoc());
+
+ if (MSSAU) {
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ }
+
+ LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
+ << "\n");
+
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
+ NewCall->getDebugLoc(), Preheader)
+ << "Formed a call to "
+ << ore::NV("NewFunction", NewCall->getCalledFunction())
+ << "() function";
+ });
+
+ // Okay, the memcpy has been formed. Zap the original store and anything that
+ // feeds into it.
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(SI, true);
+ deleteDeadInstruction(SI);
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ ++NumMemCpy;
ExpCleaner.markResultUsed();
- return true;
-}
-
- // When compiling for code size we avoid idiom recognition for a multi-block loop
-// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
-//
-bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
- bool IsLoopMemset) {
- if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
+ return true;
+}
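// A hedged sketch (hypothetical function) of the net effect of
// processLoopStoreOfLoopLoad: once mayLoopAccessLocation has shown that
// nothing else in the loop reads or writes the destination and nothing
// mutates the source, "for (i) A[i] = B[i];" becomes a single memcpy in the
// preheader.
#include <cstring>
void copyAfter(int *A, const int *B, unsigned N) {
  if (N != 0)
    std::memcpy(A, B, (unsigned long long)N * sizeof(int));
}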
+
+ // When compiling for code size we avoid idiom recognition for a multi-block loop
+// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
+//
+bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
+ bool IsLoopMemset) {
+ if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
- LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
- << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
- << " avoided: multi-block top-level loop\n");
- return true;
- }
- }
-
- return false;
-}
-
-bool LoopIdiomRecognize::runOnNoncountableLoop() {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
- << CurLoop->getHeader()->getParent()->getName()
- << "] Noncountable Loop %"
- << CurLoop->getHeader()->getName() << "\n");
-
+ LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Noncountable Loop %"
+ << CurLoop->getHeader()->getName() << "\n");
+
return recognizePopcount() || recognizeAndInsertFFS() ||
recognizeShiftUntilBitTest();
-}
-
-/// Check if the given conditional branch is based on the comparison between
-/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
-/// true), the control yields to the loop entry. If the branch matches the
-/// behavior, the variable involved in the comparison is returned. This function
-/// will be called to see if the precondition and postcondition of the loop are
-/// in desirable form.
-static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
- bool JmpOnZero = false) {
- if (!BI || !BI->isConditional())
- return nullptr;
-
- ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
- if (!Cond)
- return nullptr;
-
- ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
- if (!CmpZero || !CmpZero->isZero())
- return nullptr;
-
- BasicBlock *TrueSucc = BI->getSuccessor(0);
- BasicBlock *FalseSucc = BI->getSuccessor(1);
- if (JmpOnZero)
- std::swap(TrueSucc, FalseSucc);
-
- ICmpInst::Predicate Pred = Cond->getPredicate();
- if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
- (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
- return Cond->getOperand(0);
-
- return nullptr;
-}
-
-// Check if the recurrence variable `VarX` is in the right form to create
-// the idiom. Returns the value coerced to a PHINode if so.
-static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
- BasicBlock *LoopEntry) {
- auto *PhiX = dyn_cast<PHINode>(VarX);
- if (PhiX && PhiX->getParent() == LoopEntry &&
- (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
- return PhiX;
- return nullptr;
-}
-
-/// Return true iff the idiom is detected in the loop.
-///
-/// Additionally:
-/// 1) \p CntInst is set to the instruction counting the population bit.
-/// 2) \p CntPhi is set to the corresponding phi node.
-/// 3) \p Var is set to the value whose population bits are being counted.
-///
-/// The core idiom we are trying to detect is:
-/// \code
-/// if (x0 != 0)
-/// goto loop-exit // the precondition of the loop
-/// cnt0 = init-val;
-/// do {
-/// x1 = phi (x0, x2);
-/// cnt1 = phi(cnt0, cnt2);
-///
-/// cnt2 = cnt1 + 1;
-/// ...
-/// x2 = x1 & (x1 - 1);
-/// ...
-/// } while(x != 0);
-///
-/// loop-exit:
-/// \endcode
-static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
- Instruction *&CntInst, PHINode *&CntPhi,
- Value *&Var) {
- // step 1: Check to see if the look-back branch matches this pattern:
- // "if (a!=0) goto loop-entry".
- BasicBlock *LoopEntry;
- Instruction *DefX2, *CountInst;
- Value *VarX1, *VarX0;
- PHINode *PhiX, *CountPhi;
-
- DefX2 = CountInst = nullptr;
- VarX1 = VarX0 = nullptr;
- PhiX = CountPhi = nullptr;
- LoopEntry = *(CurLoop->block_begin());
-
- // step 1: Check if the loop-back branch is in desirable form.
- {
- if (Value *T = matchCondition(
- dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
- DefX2 = dyn_cast<Instruction>(T);
- else
- return false;
- }
-
- // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
- {
- if (!DefX2 || DefX2->getOpcode() != Instruction::And)
- return false;
-
- BinaryOperator *SubOneOp;
-
- if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
- VarX1 = DefX2->getOperand(1);
- else {
- VarX1 = DefX2->getOperand(0);
- SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
- }
- if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
- return false;
-
- ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
- if (!Dec ||
- !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
- (SubOneOp->getOpcode() == Instruction::Add &&
- Dec->isMinusOne()))) {
- return false;
- }
- }
-
- // step 3: Check the recurrence of variable X
- PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
- if (!PhiX)
- return false;
-
- // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
- {
- CountInst = nullptr;
- for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
- IterE = LoopEntry->end();
- Iter != IterE; Iter++) {
- Instruction *Inst = &*Iter;
- if (Inst->getOpcode() != Instruction::Add)
- continue;
-
- ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
- if (!Inc || !Inc->isOne())
- continue;
-
- PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
- if (!Phi)
- continue;
-
- // Check if the result of the instruction is live of the loop.
- bool LiveOutLoop = false;
- for (User *U : Inst->users()) {
- if ((cast<Instruction>(U))->getParent() != LoopEntry) {
- LiveOutLoop = true;
- break;
- }
- }
-
- if (LiveOutLoop) {
- CountInst = Inst;
- CountPhi = Phi;
- break;
- }
- }
-
- if (!CountInst)
- return false;
- }
-
- // step 5: check if the precondition is in this form:
- // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
- {
- auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
- if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
- return false;
-
- CntInst = CountInst;
- CntPhi = CountPhi;
- Var = T;
- }
-
- return true;
-}
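// A hedged sketch (hypothetical function) of the source-level idiom that
// detectPopcountIdiom matches: counting set bits by repeatedly clearing the
// lowest one, which the pass later rewrites in terms of llvm.ctpop.
unsigned popcountLoop(unsigned X) {
  unsigned Cnt = 0;
  while (X != 0) {
    X = X & (X - 1); // clear the lowest set bit
    ++Cnt;
  }
  return Cnt;
}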
-
-/// Return true if the idiom is detected in the loop.
-///
-/// Additionally:
-/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
-/// or nullptr if there is no such.
-/// 2) \p CntPhi is set to the corresponding phi node
-/// or nullptr if there is no such.
-/// 3) \p Var is set to the value whose CTLZ could be used.
-/// 4) \p DefX is set to the instruction calculating Loop exit condition.
-///
-/// The core idiom we are trying to detect is:
-/// \code
-/// if (x0 == 0)
-/// goto loop-exit // the precondition of the loop
-/// cnt0 = init-val;
-/// do {
-/// x = phi (x0, x.next); //PhiX
-/// cnt = phi(cnt0, cnt.next);
-///
-/// cnt.next = cnt + 1;
-/// ...
-/// x.next = x >> 1; // DefX
-/// ...
-/// } while(x.next != 0);
-///
-/// loop-exit:
-/// \endcode
-static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
- Intrinsic::ID &IntrinID, Value *&InitX,
- Instruction *&CntInst, PHINode *&CntPhi,
- Instruction *&DefX) {
- BasicBlock *LoopEntry;
- Value *VarX = nullptr;
-
- DefX = nullptr;
- CntInst = nullptr;
- CntPhi = nullptr;
- LoopEntry = *(CurLoop->block_begin());
-
- // step 1: Check if the loop-back branch is in desirable form.
- if (Value *T = matchCondition(
- dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
- DefX = dyn_cast<Instruction>(T);
- else
- return false;
-
- // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
- if (!DefX || !DefX->isShift())
- return false;
- IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
- Intrinsic::ctlz;
- ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
- if (!Shft || !Shft->isOne())
- return false;
- VarX = DefX->getOperand(0);
-
- // step 3: Check the recurrence of variable X
- PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
- if (!PhiX)
- return false;
-
- InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
-
- // Make sure the initial value can't be negative otherwise the ashr in the
- // loop might never reach zero which would make the loop infinite.
- if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
- return false;
-
- // step 4: Find the instruction which counts the iterations: cnt.next = cnt + 1
+}
+
+/// Check if the given conditional branch is based on a comparison between
+/// a variable and zero, and if the variable is non-zero (or zero, when
+/// JmpOnZero is true), control transfers to the loop entry. If the branch
+/// matches this behavior, the variable involved in the comparison is returned.
+/// This function is called to check whether the precondition and postcondition
+/// of the loop are in the desired form.
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
+ bool JmpOnZero = false) {
+ if (!BI || !BI->isConditional())
+ return nullptr;
+
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return nullptr;
+
+ ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+ if (!CmpZero || !CmpZero->isZero())
+ return nullptr;
+
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ if (JmpOnZero)
+ std::swap(TrueSucc, FalseSucc);
+
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
+ (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
+ return Cond->getOperand(0);
+
+ return nullptr;
+}
+
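For illustration, a minimal sketch of the branch shape matchCondition accepts, built with IRBuilder. It is not taken from the LLVM sources; the helper name emitZeroGuard and its parameters are assumptions made for the example.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emits: %guard = icmp ne %x, 0 ; br i1 %guard, label %LoopEntry, label %Exit
// matchCondition(Br, LoopEntry) returns %x for this shape, while
// matchCondition(Br, LoopEntry, /*JmpOnZero=*/true) returns nullptr because the
// successors are treated as swapped.
static BranchInst *emitZeroGuard(IRBuilder<> &B, Value *X,
                                 BasicBlock *LoopEntry, BasicBlock *Exit) {
  Value *Guard = B.CreateICmpNE(X, ConstantInt::get(X->getType(), 0), "guard");
  return B.CreateCondBr(Guard, LoopEntry, Exit);
}

The detectors below apply matchCondition to both the loop-back branch (the postcondition) and the precondition block's terminator.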
+// Check if the recurrence variable `VarX` is in the right form to create
+// the idiom. Returns the value coerced to a PHINode if so.
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
+ BasicBlock *LoopEntry) {
+ auto *PhiX = dyn_cast<PHINode>(VarX);
+ if (PhiX && PhiX->getParent() == LoopEntry &&
+ (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
+ return PhiX;
+ return nullptr;
+}
+
+/// Return true iff the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the set bits (population).
+/// 2) \p CntPhi is set to the corresponding phi node.
+/// 3) \p Var is set to the value whose set bits are being counted.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x1 = phi (x0, x2);
+/// cnt1 = phi(cnt0, cnt2);
+///
+/// cnt2 = cnt1 + 1;
+/// ...
+/// x2 = x1 & (x1 - 1);
+/// ...
+/// } while(x2 != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Value *&Var) {
+ // step 1: Check to see if the loop-back branch matches this pattern:
+ // "if (a!=0) goto loop-entry".
+ BasicBlock *LoopEntry;
+ Instruction *DefX2, *CountInst;
+ Value *VarX1, *VarX0;
+ PHINode *PhiX, *CountPhi;
+
+ DefX2 = CountInst = nullptr;
+ VarX1 = VarX0 = nullptr;
+ PhiX = CountPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ {
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX2 = dyn_cast<Instruction>(T);
+ else
+ return false;
+ }
+
+ // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+ {
+ if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+ return false;
+
+ BinaryOperator *SubOneOp;
+
+ if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+ VarX1 = DefX2->getOperand(1);
+ else {
+ VarX1 = DefX2->getOperand(0);
+ SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+ }
+ if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
+ return false;
+
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
+ if (!Dec ||
+ !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubOneOp->getOpcode() == Instruction::Add &&
+ Dec->isMinusOne()))) {
+ return false;
+ }
+ }
+
+ // step 3: Check the recurrence of variable X
+ PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
+ if (!PhiX)
+ return false;
+
+ // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
+ {
+ CountInst = nullptr;
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ // Check if the result of the instruction is live out of the loop.
+ bool LiveOutLoop = false;
+ for (User *U : Inst->users()) {
+ if ((cast<Instruction>(U))->getParent() != LoopEntry) {
+ LiveOutLoop = true;
+ break;
+ }
+ }
+
+ if (LiveOutLoop) {
+ CountInst = Inst;
+ CountPhi = Phi;
+ break;
+ }
+ }
+
+ if (!CountInst)
+ return false;
+ }
+
+ // step 5: check if the precondition is in this form:
+ // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+ {
+ auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
+ if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+ return false;
+
+ CntInst = CountInst;
+ CntPhi = CountPhi;
+ Var = T;
+ }
+
+ return true;
+}
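As a concrete illustration of the idiom described above, the following C-style function is a sketch (not taken from the vendored sources; with clang at -O1 or higher it typically lowers to a single-block loop of this shape) that maps the source constructs onto the values the detector reports:

unsigned count_set_bits(unsigned x0) {
  unsigned cnt = 0;            // cnt0 = init-val
  if (x0 != 0) {               // precondition checked in step 5 (Var = x0)
    unsigned x = x0;
    do {
      cnt = cnt + 1;           // CntInst, with CntPhi as its recurrence
      x = x & (x - 1);         // DefX2: clears the lowest set bit
    } while (x != 0);          // loop-back branch matched in step 1
  }
  return cnt;                  // cnt is live out of the loop (step 4)
}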
+
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
+/// or nullptr if there is none.
+/// 2) \p CntPhi is set to the corresponding phi node
+/// or nullptr if there is none.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
+ Intrinsic::ID &IntrinID, Value *&InitX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
+ if (!DefX || !DefX->isShift())
+ return false;
+ IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
+ Intrinsic::ctlz;
+ ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+ if (!Shft || !Shft->isOne())
+ return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+ if (!PhiX)
+ return false;
+
+ InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+
+ // Make sure the initial value can't be negative otherwise the ashr in the
+ // loop might never reach zero which would make the loop infinite.
+ if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
+ return false;
+
+ // step 4: Find the instruction which counts the iterations: cnt.next = cnt + 1
// or cnt.next = cnt + -1.
- // TODO: We can skip the step. If loop trip count is known (CTLZ),
- // then all uses of "cnt.next" could be optimized to the trip count
- // plus "cnt0". Currently it is not optimized.
- // This step could be used to detect POPCNT instruction:
- // cnt.next = cnt + (x.next & 1)
- for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
- IterE = LoopEntry->end();
- Iter != IterE; Iter++) {
- Instruction *Inst = &*Iter;
- if (Inst->getOpcode() != Instruction::Add)
- continue;
-
- ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ // TODO: We can skip the step. If loop trip count is known (CTLZ),
+ // then all uses of "cnt.next" could be optimized to the trip count
+ // plus "cnt0". Currently it is not optimized.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
- continue;
-
- PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
- if (!Phi)
- continue;
-
- CntInst = Inst;
- CntPhi = Phi;
- break;
- }
- if (!CntInst)
- return false;
-
- return true;
-}
-
-/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
-/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
-/// trip count returns true; otherwise, returns false.
-bool LoopIdiomRecognize::recognizeAndInsertFFS() {
- // Give up if the loop has multiple blocks or multiple backedges.
- if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
- return false;
-
- Intrinsic::ID IntrinID;
- Value *InitX;
- Instruction *DefX = nullptr;
- PHINode *CntPhi = nullptr;
- Instruction *CntInst = nullptr;
- // Help decide if transformation is profitable. For ShiftUntilZero idiom,
- // this is always 6.
- size_t IdiomCanonicalSize = 6;
-
- if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
- CntInst, CntPhi, DefX))
- return false;
-
- bool IsCntPhiUsedOutsideLoop = false;
- for (User *U : CntPhi->users())
- if (!CurLoop->contains(cast<Instruction>(U))) {
- IsCntPhiUsedOutsideLoop = true;
- break;
- }
- bool IsCntInstUsedOutsideLoop = false;
- for (User *U : CntInst->users())
- if (!CurLoop->contains(cast<Instruction>(U))) {
- IsCntInstUsedOutsideLoop = true;
- break;
- }
- // If both CntInst and CntPhi are used outside the loop the profitability
- // is questionable.
- if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
- return false;
-
- // For some CPUs the result of the CTLZ(X) intrinsic is undefined
- // when X is 0. If we cannot guarantee X != 0, we need to check for this
- // when expanding the intrinsic.
- bool ZeroCheck = false;
- // It is safe to assume the preheader exists, as it was checked in the
- // parent function runOnLoop.
- BasicBlock *PH = CurLoop->getLoopPreheader();
-
- // If we are using the count instruction outside the loop, make sure we
- // have a zero check as a precondition. Without the check the loop would run
- // one iteration before any check of the input value. Since 0 and 1 would then
- // behave identically, a zero check is needed for the transformed count to be correct.
- if (!IsCntPhiUsedOutsideLoop) {
- auto *PreCondBB = PH->getSinglePredecessor();
- if (!PreCondBB)
- return false;
- auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- if (!PreCondBI)
- return false;
- if (matchCondition(PreCondBI, PH) != InitX)
- return false;
- ZeroCheck = true;
- }
-
- // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
- // profitable if we delete the loop.
-
- // the loop has only 6 instructions:
- // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
- // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
- // %shr = ashr %n.addr.0, 1
- // %tobool = icmp eq %shr, 0
- // %inc = add nsw %i.0, 1
- // br i1 %tobool
-
- const Value *Args[] = {
- InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
- : ConstantInt::getFalse(InitX->getContext())};
-
- // @llvm.dbg intrinsics don't count as they have no semantic effect.
- auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
- uint32_t HeaderSize =
- std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
-
- IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
- int Cost =
- TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
- if (HeaderSize != IdiomCanonicalSize &&
- Cost > TargetTransformInfo::TCC_Basic)
- return false;
-
- transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
- DefX->getDebugLoc(), ZeroCheck,
- IsCntPhiUsedOutsideLoop);
- return true;
-}
-
-/// Recognizes a population count idiom in a non-countable loop.
-///
-/// If detected, transforms the relevant code to issue the popcount intrinsic
-/// function call, and returns true; otherwise, returns false.
-bool LoopIdiomRecognize::recognizePopcount() {
- if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
- return false;
-
- // Counting the population is usually done with a few arithmetic instructions.
- // Such instructions can be easily "absorbed" by vacant slots in a
- // non-compact loop. Therefore, recognizing the popcount idiom only makes sense
- // in a compact loop.
-
- // Give up if the loop has multiple blocks or multiple backedges.
- if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
- return false;
-
- BasicBlock *LoopBody = *(CurLoop->block_begin());
- if (LoopBody->size() >= 20) {
- // The loop is too big, bail out.
- return false;
- }
-
- // It should have a preheader containing nothing but an unconditional branch.
- BasicBlock *PH = CurLoop->getLoopPreheader();
- if (!PH || &PH->front() != PH->getTerminator())
- return false;
- auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
- if (!EntryBI || EntryBI->isConditional())
- return false;
-
- // It should have a precondition block where the generated popcount intrinsic
- // function can be inserted.
- auto *PreCondBB = PH->getSinglePredecessor();
- if (!PreCondBB)
- return false;
- auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- if (!PreCondBI || PreCondBI->isUnconditional())
- return false;
-
- Instruction *CntInst;
- PHINode *CntPhi;
- Value *Val;
- if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
- return false;
-
- transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
- return true;
-}
-
-static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
- const DebugLoc &DL) {
- Value *Ops[] = {Val};
- Type *Tys[] = {Val->getType()};
-
- Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
- Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
- CallInst *CI = IRBuilder.CreateCall(Func, Ops);
- CI->setDebugLoc(DL);
-
- return CI;
-}
-
-static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
- const DebugLoc &DL, bool ZeroCheck,
- Intrinsic::ID IID) {
- Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
- Type *Tys[] = {Val->getType()};
-
- Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
- Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
- CallInst *CI = IRBuilder.CreateCall(Func, Ops);
- CI->setDebugLoc(DL);
-
- return CI;
-}
-
-/// Transform the following loop (Using CTLZ, CTTZ is similar):
-/// loop:
-/// CntPhi = PHI [Cnt0, CntInst]
-/// PhiX = PHI [InitX, DefX]
-/// CntInst = CntPhi + 1
-/// DefX = PhiX >> 1
-/// LOOP_BODY
-/// Br: loop if (DefX != 0)
-/// Use(CntPhi) or Use(CntInst)
-///
-/// Into:
-/// If CntPhi used outside the loop:
-/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
-/// Count = CountPrev + 1
-/// else
-/// Count = BitWidth(InitX) - CTLZ(InitX)
-/// loop:
-/// CntPhi = PHI [Cnt0, CntInst]
-/// PhiX = PHI [InitX, DefX]
-/// PhiCount = PHI [Count, Dec]
-/// CntInst = CntPhi + 1
-/// DefX = PhiX >> 1
-/// Dec = PhiCount - 1
-/// LOOP_BODY
-/// Br: loop if (Dec != 0)
-/// Use(CountPrev + Cnt0) // Use(CntPhi)
-/// or
-/// Use(Count + Cnt0) // Use(CntInst)
-///
-/// If LOOP_BODY is empty the loop will be deleted.
-/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
-void LoopIdiomRecognize::transformLoopToCountable(
- Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
- PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
- bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
- BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
-
- // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
- IRBuilder<> Builder(PreheaderBr);
- Builder.SetCurrentDebugLocation(DL);
-
- // Count = BitWidth - CTLZ(InitX);
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
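For comparison with the popcount case, here is a sketch (not from the vendored sources) of a loop this detector matches. Per step 2, a left shift would select Intrinsic::cttz, while the right shift below selects Intrinsic::ctlz, and the eventual trip count is BitWidth(x0) - ctlz(x0):

unsigned shifts_until_zero(unsigned x0) {
  unsigned cnt = 0;            // cnt0
  unsigned x = x0;             // InitX: PhiX's incoming value from the preheader
  if (x0 != 0) {               // zero check; recognizeAndInsertFFS below requires
                               // it when cnt (CntInst) is used after the loop
    do {
      cnt = cnt + 1;           // CntInst / CntPhi
      x = x >> 1;              // DefX: x.next = x >> 1 (lshr, so ctlz is chosen)
    } while (x != 0);          // loop-back branch
  }
  return cnt;
}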
+
+/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
+/// trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertFFS() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Intrinsic::ID IntrinID;
+ Value *InitX;
+ Instruction *DefX = nullptr;
+ PHINode *CntPhi = nullptr;
+ Instruction *CntInst = nullptr;
+ // Help decide if transformation is profitable. For ShiftUntilZero idiom,
+ // this is always 6.
+ size_t IdiomCanonicalSize = 6;
+
+ if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
+ CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+ // For some CPUs the result of the CTLZ(X) intrinsic is undefined
+ // when X is 0. If we cannot guarantee X != 0, we need to check for this
+ // when expanding the intrinsic.
+ bool ZeroCheck = false;
+ // It is safe to assume the preheader exists, as it was checked in the
+ // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+
+ // If we are using the count instruction outside the loop, make sure we
+ // have a zero check as a precondition. Without the check the loop would run
+ // one iteration before any check of the input value. Since 0 and 1 would then
+ // behave identically, a zero check is needed for the transformed count to be correct.
+ if (!IsCntPhiUsedOutsideLoop) {
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI)
+ return false;
+ if (matchCondition(PreCondBI, PH) != InitX)
+ return false;
+ ZeroCheck = true;
+ }
+
+ // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
+ // profitable if we delete the loop.
+
+ // the loop has only 6 instructions:
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ const Value *Args[] = {
+ InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
+ : ConstantInt::getFalse(InitX->getContext())};
+
+ // @llvm.dbg intrinsics don't count as they have no semantic effect.
+ auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
+ uint32_t HeaderSize =
+ std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
+
+ IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
+ int Cost =
+ TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+ if (HeaderSize != IdiomCanonicalSize &&
+ Cost > TargetTransformInfo::TCC_Basic)
+ return false;
+
+ transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
+ DefX->getDebugLoc(), ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
+/// Recognizes a population count idiom in a non-countable loop.
+///
+/// If detected, transforms the relevant code to issue the popcount intrinsic
+/// function call, and returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizePopcount() {
+ if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+ return false;
+
+ // Counting the population is usually done with a few arithmetic instructions.
+ // Such instructions can be easily "absorbed" by vacant slots in a
+ // non-compact loop. Therefore, recognizing the popcount idiom only makes sense
+ // in a compact loop.
+
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ BasicBlock *LoopBody = *(CurLoop->block_begin());
+ if (LoopBody->size() >= 20) {
+ // The loop is too big, bail out.
+ return false;
+ }
+
+ // It should have a preheader containing nothing but an unconditional branch.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ if (!PH || &PH->front() != PH->getTerminator())
+ return false;
+ auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
+ if (!EntryBI || EntryBI->isConditional())
+ return false;
+
+ // It should have a precondition block where the generated popcount intrinsic
+ // function can be inserted.
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI || PreCondBI->isUnconditional())
+ return false;
+
+ Instruction *CntInst;
+ PHINode *CntPhi;
+ Value *Val;
+ if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
+ return false;
+
+ transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
+ return true;
+}
+
+static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL) {
+ Value *Ops[] = {Val};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck,
+ Intrinsic::ID IID) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
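A hypothetical usage sketch of the helper above; the wrapper name emitTripCountFFS and the assumption that Preheader, InitX, DefX and IntrinID are already in scope are illustrative only (transformLoopToCountable below is the real caller):

static CallInst *emitTripCountFFS(BasicBlock *Preheader, Value *InitX,
                                  Instruction *DefX, Intrinsic::ID IntrinID) {
  // Insert just before the preheader's terminator, as the transform does.
  IRBuilder<> Builder(Preheader->getTerminator());
  // With ZeroCheck=true the call asserts a non-zero input, producing e.g.
  //   %ffs = call i32 @llvm.cttz.i32(i32 %init, i1 true)
  return createFFSIntrinsic(Builder, InitX, DefX->getDebugLoc(),
                            /*ZeroCheck=*/true, IntrinID);
}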
+
+/// Transform the following loop (Using CTLZ, CTTZ is similar):
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
+ PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+
+ // Count = BitWidth - CTLZ(InitX);
// NewCount = Count;
- // If there are uses of CntPhi create:
+ // If there are uses of CntPhi create:
// NewCount = BitWidth - CTLZ(InitX >> 1);
// Count = NewCount + 1;
Value *InitXNext;
- if (IsCntPhiUsedOutsideLoop) {
- if (DefX->getOpcode() == Instruction::AShr)
- InitXNext =
- Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
- else if (DefX->getOpcode() == Instruction::LShr)
- InitXNext =
- Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
- else if (DefX->getOpcode() == Instruction::Shl) // cttz
- InitXNext =
- Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
- else
- llvm_unreachable("Unexpected opcode!");
- } else
- InitXNext = InitX;
+ if (IsCntPhiUsedOutsideLoop) {
+ if (DefX->getOpcode() == Instruction::AShr)
+ InitXNext =
+ Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else if (DefX->getOpcode() == Instruction::LShr)
+ InitXNext =
+ Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else if (DefX->getOpcode() == Instruction::Shl) // cttz
+ InitXNext =
+ Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
+ else
+ llvm_unreachable("Unexpected opcode!");
+ } else
+ InitXNext = InitX;
Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
Value *Count = Builder.CreateSub(
ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()),
- FFS);
+ FFS);
Value *NewCount = Count;
- if (IsCntPhiUsedOutsideLoop) {
+ if (IsCntPhiUsedOutsideLoop) {
NewCount = Count;
Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1));
- }
-
+ }
+
NewCount = Builder.CreateZExtOrTrunc(NewCount,
cast<IntegerType>(CntInst->getType()));
-
- Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
// If the counter was being incremented in the loop, add NewCount to the
// counter's initial value, but only if the initial value is not zero.
@@ -1772,153 +1772,153 @@ void LoopIdiomRecognize::transformLoopToCountable(
// the counter's initial value.
NewCount = Builder.CreateSub(CntInitVal, NewCount);
}
-
- // Step 2: Insert new IV and loop condition:
- // loop:
- // ...
- // PhiCount = PHI [Count, Dec]
- // ...
- // Dec = PhiCount - 1
- // ...
- // Br: loop if (Dec != 0)
- BasicBlock *Body = *(CurLoop->block_begin());
- auto *LbBr = cast<BranchInst>(Body->getTerminator());
- ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
- Type *Ty = Count->getType();
-
- PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
-
- Builder.SetInsertPoint(LbCond);
- Instruction *TcDec = cast<Instruction>(
- Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
- "tcdec", false, true));
-
- TcPhi->addIncoming(Count, Preheader);
- TcPhi->addIncoming(TcDec, Body);
-
- CmpInst::Predicate Pred =
- (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
- LbCond->setPredicate(Pred);
- LbCond->setOperand(0, TcDec);
- LbCond->setOperand(1, ConstantInt::get(Ty, 0));
-
- // Step 3: All the references to the original counter outside
- // the loop are replaced with the NewCount
- if (IsCntPhiUsedOutsideLoop)
- CntPhi->replaceUsesOutsideBlock(NewCount, Body);
- else
- CntInst->replaceUsesOutsideBlock(NewCount, Body);
-
- // step 4: Forget the "non-computable" trip-count SCEV associated with the
- // loop. The loop would otherwise not be deleted even if it becomes empty.
- SE->forgetLoop(CurLoop);
-}
-
-void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
- Instruction *CntInst,
- PHINode *CntPhi, Value *Var) {
- BasicBlock *PreHead = CurLoop->getLoopPreheader();
- auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
- const DebugLoc &DL = CntInst->getDebugLoc();
-
- // Assume that before the transformation the loop looks like the following:
- // if (x) // the precondition
- // do { cnt++; x &= x - 1; } while(x);
-
- // Step 1: Insert the ctpop instruction at the end of the precondition block
- IRBuilder<> Builder(PreCondBr);
- Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
- {
- PopCnt = createPopcntIntrinsic(Builder, Var, DL);
- NewCount = PopCntZext =
- Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
-
- if (NewCount != PopCnt)
- (cast<Instruction>(NewCount))->setDebugLoc(DL);
-
- // TripCnt is exactly the number of iterations the loop has
- TripCnt = NewCount;
-
- // If the population counter's initial value is not zero, insert Add Inst.
- Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
- ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
- if (!InitConst || !InitConst->isZero()) {
- NewCount = Builder.CreateAdd(NewCount, CntInitVal);
- (cast<Instruction>(NewCount))->setDebugLoc(DL);
- }
- }
-
- // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
- // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
- // function would be partial dead code, and downstream passes will drag
- // it back from the precondition block to the preheader.
- {
- ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
-
- Value *Opnd0 = PopCntZext;
- Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
- if (PreCond->getOperand(0) != Var)
- std::swap(Opnd0, Opnd1);
-
- ICmpInst *NewPreCond = cast<ICmpInst>(
- Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
- PreCondBr->setCondition(NewPreCond);
-
- RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
- }
-
- // Step 3: Note that the population count is exactly the trip count of the
- // loop in question, which enables us to convert the loop from a noncountable
- // loop into a countable one. The benefit is twofold:
- //
- // - If the loop only counts population, the entire loop becomes dead after
- // the transformation. It is a lot easier to prove a countable loop dead
- // than to prove a noncountable one. (In some C dialects, an infinite loop
- // isn't dead even if it computes nothing useful. In general, DCE needs
- // to prove a noncountable loop finite before it can safely delete it.)
- //
- // - If the loop also performs something else, it remains alive.
- // Since it is transformed to countable form, it can be aggressively
- // optimized by some optimizations which are in general not applicable
- // to a noncountable loop.
- //
- // After this step, this loop (conceptually) would look like the following:
- // newcnt = __builtin_ctpop(x);
- // t = newcnt;
- // if (x)
- // do { cnt++; x &= x-1; t--; } while (t > 0);
- BasicBlock *Body = *(CurLoop->block_begin());
- {
- auto *LbBr = cast<BranchInst>(Body->getTerminator());
- ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
- Type *Ty = TripCnt->getType();
-
- PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
-
- Builder.SetInsertPoint(LbCond);
- Instruction *TcDec = cast<Instruction>(
- Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
- "tcdec", false, true));
-
- TcPhi->addIncoming(TripCnt, PreHead);
- TcPhi->addIncoming(TcDec, Body);
-
- CmpInst::Predicate Pred =
- (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
- LbCond->setPredicate(Pred);
- LbCond->setOperand(0, TcDec);
- LbCond->setOperand(1, ConstantInt::get(Ty, 0));
- }
-
- // Step 4: All the references to the original population counter outside
- // the loop are replaced with the NewCount -- the value returned from
- // __builtin_ctpop().
- CntInst->replaceUsesOutsideBlock(NewCount, Body);
-
- // step 5: Forget the "non-computable" trip-count SCEV associated with the
- // loop. The loop would otherwise not be deleted even if it becomes empty.
- SE->forgetLoop(CurLoop);
-}
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
+
+void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
+ Instruction *CntInst,
+ PHINode *CntPhi, Value *Var) {
+ BasicBlock *PreHead = CurLoop->getLoopPreheader();
+ auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc &DL = CntInst->getDebugLoc();
+
+ // Assume that before the transformation the loop looks like the following:
+ // if (x) // the precondition
+ // do { cnt++; x &= x - 1; } while(x);
+
+ // Step 1: Insert the ctpop instruction at the end of the precondition block
+ IRBuilder<> Builder(PreCondBr);
+ Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+ {
+ PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+ NewCount = PopCntZext =
+ Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+ if (NewCount != PopCnt)
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+ // TripCnt is exactly the number of iterations the loop has
+ TripCnt = NewCount;
+
+ // If the population counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero()) {
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+ }
+ }
+
+ // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
+ // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
+ // function would be partial dead code, and downstream passes will drag
+ // it back from the precondition block to the preheader.
+ {
+ ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+ Value *Opnd0 = PopCntZext;
+ Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+ if (PreCond->getOperand(0) != Var)
+ std::swap(Opnd0, Opnd1);
+
+ ICmpInst *NewPreCond = cast<ICmpInst>(
+ Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+ PreCondBr->setCondition(NewPreCond);
+
+ RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
+ }
+
+ // Step 3: Note that the population count is exactly the trip count of the
+ // loop in question, which enables us to convert the loop from a noncountable
+ // loop into a countable one. The benefit is twofold:
+ //
+ // - If the loop only counts population, the entire loop becomes dead after
+ // the transformation. It is a lot easier to prove a countable loop dead
+ // than to prove a noncountable one. (In some C dialects, an infinite loop
+ // isn't dead even if it computes nothing useful. In general, DCE needs
+ // to prove a noncountable loop finite before it can safely delete it.)
+ //
+ // - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively
+ // optimized by some optimizations which are in general not applicable
+ // to a noncountable loop.
+ //
+ // After this step, this loop (conceptually) would look like the following:
+ // newcnt = __builtin_ctpop(x);
+ // t = newcnt;
+ // if (x)
+ // do { cnt++; x &= x-1; t--; } while (t > 0);
+ BasicBlock *Body = *(CurLoop->block_begin());
+ {
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = TripCnt->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(TripCnt, PreHead);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+ }
+
+ // Step 4: All the references to the original population counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctpop().
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
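To summarize the transformation at the source level, a before/after sketch (conceptual only: the pass operates on IR, and the "after" form is the shape described in Step 3 above, not literal compiler output):

// Before: non-countable counting loop behind the precondition.
unsigned popcount_before(unsigned x) {
  unsigned cnt = 0;
  if (x)
    do { cnt++; x &= x - 1; } while (x);
  return cnt;
}

// After (conceptually): the precondition tests the ctpop result and the loop
// is driven by the countable trip count t; uses outside the loop see NewCount.
unsigned popcount_after(unsigned x) {
  unsigned cnt = 0;
  unsigned t = __builtin_popcount(x);   // createPopcntIntrinsic in the IR
  unsigned newcount = t;                // NewCount (plus cnt's initial value)
  if (newcount != 0)
    do { cnt++; x &= x - 1; t--; } while (t > 0);
  return newcount;
}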
/// Match loop-invariant value.
template <typename SubPattern_t> struct match_LoopInvariant {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 219f7f38b6..3153a87211 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -1,257 +1,257 @@
-//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs lightweight instruction simplification on loop bodies.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/User.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <algorithm>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-instsimplify"
-
-STATISTIC(NumSimplified, "Number of redundant instructions simplified");
-
-static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, const TargetLibraryInfo &TLI,
- MemorySSAUpdater *MSSAU) {
- const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
- SimplifyQuery SQ(DL, &TLI, &DT, &AC);
-
- // On the first pass over the loop body we try to simplify every instruction.
- // On subsequent passes, we can restrict this to only simplifying instructions
- // where the inputs have been updated. We end up needing two sets: one
- // containing the instructions we are simplifying in *this* pass, and one for
- // the instructions we will want to simplify in the *next* pass. We use
- // pointers so we can swap between two stably allocated sets.
- SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
-
- // Track the PHI nodes that have already been visited during each iteration so
- // that we can identify when it is necessary to iterate.
- SmallPtrSet<PHINode *, 4> VisitedPHIs;
-
- // While simplifying we may discover dead code or cause code to become dead.
- // Keep track of all such instructions and we will delete them at the end.
- SmallVector<WeakTrackingVH, 8> DeadInsts;
-
- // First we want to create an RPO traversal of the loop body. By processing in
- // RPO we can ensure that definitions are processed prior to uses (for non PHI
- // uses) in all cases. This ensures we maximize the simplifications in each
- // iteration over the loop and minimizes the possible causes for continuing to
- // iterate.
- LoopBlocksRPO RPOT(&L);
- RPOT.perform(&LI);
- MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
-
- bool Changed = false;
- for (;;) {
- if (MSSAU && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- for (BasicBlock *BB : RPOT) {
- for (Instruction &I : *BB) {
- if (auto *PI = dyn_cast<PHINode>(&I))
- VisitedPHIs.insert(PI);
-
- if (I.use_empty()) {
- if (isInstructionTriviallyDead(&I, &TLI))
- DeadInsts.push_back(&I);
- continue;
- }
-
- // We special case the first iteration which we can detect due to the
- // empty `ToSimplify` set.
- bool IsFirstIteration = ToSimplify->empty();
-
- if (!IsFirstIteration && !ToSimplify->count(&I))
- continue;
-
- Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
- if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
- continue;
-
- for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *UserI = cast<Instruction>(U.getUser());
- U.set(V);
-
- // If the instruction is used by a PHI node we have already processed
- // we'll need to iterate on the loop body to converge, so add it to
- // the next set.
- if (auto *UserPI = dyn_cast<PHINode>(UserI))
- if (VisitedPHIs.count(UserPI)) {
- Next->insert(UserPI);
- continue;
- }
-
- // If we are only simplifying targeted instructions and the user is an
- // instruction in the loop body, add it to our set of targeted
- // instructions. Because we process defs before uses (outside of PHIs)
- // we won't have visited it yet.
- //
- // We also skip any uses outside of the loop being simplified. Those
- // should always be PHI nodes due to LCSSA form, and we don't want to
- // try to simplify those away.
- assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
- "Uses outside the loop should be PHI nodes due to LCSSA!");
- if (!IsFirstIteration && L.contains(UserI))
- ToSimplify->insert(UserI);
- }
-
- if (MSSAU)
- if (Instruction *SimpleI = dyn_cast_or_null<Instruction>(V))
- if (MemoryAccess *MA = MSSA->getMemoryAccess(&I))
- if (MemoryAccess *ReplacementMA = MSSA->getMemoryAccess(SimpleI))
- MA->replaceAllUsesWith(ReplacementMA);
-
- assert(I.use_empty() && "Should always have replaced all uses!");
- if (isInstructionTriviallyDead(&I, &TLI))
- DeadInsts.push_back(&I);
- ++NumSimplified;
- Changed = true;
- }
- }
-
- // Delete any dead instructions found thus far now that we've finished an
- // iteration over all instructions in all the loop blocks.
- if (!DeadInsts.empty()) {
- Changed = true;
- RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI, MSSAU);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- // If we never found a PHI that needs to be simplified in the next
- // iteration, we're done.
- if (Next->empty())
- break;
-
- // Otherwise, put the next set in place for the next iteration and reset it
- // and the visited PHIs for that iteration.
- std::swap(Next, ToSimplify);
- Next->clear();
- VisitedPHIs.clear();
- DeadInsts.clear();
- }
-
- return Changed;
-}
-
-namespace {
-
-class LoopInstSimplifyLegacyPass : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
-
- LoopInstSimplifyLegacyPass() : LoopPass(ID) {
- initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- MemorySSA *MSSA = nullptr;
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- }
-
- return simplifyLoopInst(*L, DT, LI, AC, TLI,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.setPreservesCFG();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA) {
- MSSAU = MemorySSAUpdater(AR.MSSA);
- if (VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
- }
- if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-char LoopInstSimplifyLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
- "Simplify instructions in loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify",
- "Simplify instructions in loops", false, false)
-
-Pass *llvm::createLoopInstSimplifyPass() {
- return new LoopInstSimplifyLegacyPass();
-}
+//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs lightweight instruction simplification on loop bodies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions simplified");
+
+static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, const TargetLibraryInfo &TLI,
+ MemorySSAUpdater *MSSAU) {
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+
+ // On the first pass over the loop body we try to simplify every instruction.
+ // On subsequent passes, we can restrict this to only simplifying instructions
+ // where the inputs have been updated. We end up needing two sets: one
+ // containing the instructions we are simplifying in *this* pass, and one for
+ // the instructions we will want to simplify in the *next* pass. We use
+ // pointers so we can swap between two stably allocated sets.
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+
+ // Track the PHI nodes that have already been visited during each iteration so
+ // that we can identify when it is necessary to iterate.
+ SmallPtrSet<PHINode *, 4> VisitedPHIs;
+
+ // While simplifying we may discover dead code or cause code to become dead.
+ // Keep track of all such instructions and we will delete them at the end.
+ SmallVector<WeakTrackingVH, 8> DeadInsts;
+
+ // First we want to create an RPO traversal of the loop body. By processing in
+ // RPO we can ensure that definitions are processed prior to uses (for non PHI
+ // uses) in all cases. This ensures we maximize the simplifications in each
+ // iteration over the loop and minimizes the possible causes for continuing to
+ // iterate.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
+ MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
+
+ bool Changed = false;
+ for (;;) {
+ if (MSSAU && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ if (auto *PI = dyn_cast<PHINode>(&I))
+ VisitedPHIs.insert(PI);
+
+ if (I.use_empty()) {
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
+ continue;
+ }
+
+ // We special case the first iteration which we can detect due to the
+ // empty `ToSimplify` set.
+ bool IsFirstIteration = ToSimplify->empty();
+
+ if (!IsFirstIteration && !ToSimplify->count(&I))
+ continue;
+
+ Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
+ if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
+ continue;
+
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *UserI = cast<Instruction>(U.getUser());
+ U.set(V);
+
+ // If the instruction is used by a PHI node we have already processed
+ // we'll need to iterate on the loop body to converge, so add it to
+ // the next set.
+ if (auto *UserPI = dyn_cast<PHINode>(UserI))
+ if (VisitedPHIs.count(UserPI)) {
+ Next->insert(UserPI);
+ continue;
+ }
+
+ // If we are only simplifying targeted instructions and the user is an
+ // instruction in the loop body, add it to our set of targeted
+ // instructions. Because we process defs before uses (outside of PHIs)
+ // we won't have visited it yet.
+ //
+ // We also skip any uses outside of the loop being simplified. Those
+ // should always be PHI nodes due to LCSSA form, and we don't want to
+ // try to simplify those away.
+ assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
+ "Uses outside the loop should be PHI nodes due to LCSSA!");
+ if (!IsFirstIteration && L.contains(UserI))
+ ToSimplify->insert(UserI);
+ }
+
+ if (MSSAU)
+ if (Instruction *SimpleI = dyn_cast_or_null<Instruction>(V))
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(&I))
+ if (MemoryAccess *ReplacementMA = MSSA->getMemoryAccess(SimpleI))
+ MA->replaceAllUsesWith(ReplacementMA);
+
+ assert(I.use_empty() && "Should always have replaced all uses!");
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
+ ++NumSimplified;
+ Changed = true;
+ }
+ }
+
+ // Delete any dead instructions found thus far now that we've finished an
+ // iteration over all instructions in all the loop blocks.
+ if (!DeadInsts.empty()) {
+ Changed = true;
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI, MSSAU);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ // If we never found a PHI that needs to be simplified in the next
+ // iteration, we're done.
+ if (Next->empty())
+ break;
+
+ // Otherwise, put the next set in place for the next iteration and reset it
+ // and the visited PHIs for that iteration.
+ std::swap(Next, ToSimplify);
+ Next->clear();
+ VisitedPHIs.clear();
+ DeadInsts.clear();
+ }
+
+ return Changed;
+}
+
+namespace {
+
+class LoopInstSimplifyLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopInstSimplifyLegacyPass() : LoopPass(ID) {
+ initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ MemorySSA *MSSA = nullptr;
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = MemorySSAUpdater(MSSA);
+ }
+
+ return simplifyLoopInst(*L, DT, LI, AC, TLI,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.setPreservesCFG();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA) {
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ if (VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+ }
+ if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+char LoopInstSimplifyLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+
+Pass *llvm::createLoopInstSimplifyPass() {
+ return new LoopInstSimplifyLegacyPass();
+}
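The core of simplifyLoopInst above is the two-set worklist scheme its comments describe: simplify everything on the first sweep, then only revisit instructions whose inputs changed, swapping two stably allocated sets between sweeps. The following standalone sketch (not from the LLVM sources; the Visit callback and the sweepToFixpoint name are assumptions) shows the same control structure with standard containers standing in for SmallPtrSet:

#include <unordered_set>
#include <utility>
#include <vector>

// Visit(Item) processes one item and returns the items whose inputs changed
// and therefore need to be revisited on the next sweep.
template <typename T, typename VisitFn>
void sweepToFixpoint(const std::vector<T> &Items, VisitFn Visit) {
  std::unordered_set<T> S1, S2;
  std::unordered_set<T> *ToSimplify = &S1, *Next = &S2;
  for (;;) {
    // The first sweep is detected by an empty ToSimplify set and visits all.
    bool IsFirstSweep = ToSimplify->empty();
    for (const T &I : Items) {
      if (!IsFirstSweep && !ToSimplify->count(I))
        continue;
      for (const T &Changed : Visit(I))
        Next->insert(Changed);
    }
    if (Next->empty())
      break;                      // fixpoint: nothing left to revisit
    std::swap(Next, ToSimplify);  // reuse the two allocations across sweeps
    Next->clear();
  }
}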
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp
index 9b23343a0f..d9dbc0deb4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1,616 +1,616 @@
-//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass handles the loop interchange transform.
-// It interchanges loops to provide more cache-friendly memory access
-// patterns.
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass handles the loop interchange transform.
+// It interchanges loops to provide more cache-friendly memory access
+// patterns.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/LoopInterchange.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <cassert>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-interchange"
-
-STATISTIC(LoopsInterchanged, "Number of loops interchanged");
-
-static cl::opt<int> LoopInterchangeCostThreshold(
- "loop-interchange-threshold", cl::init(0), cl::Hidden,
- cl::desc("Interchange if you gain more than this number"));
-
-namespace {
-
-using LoopVector = SmallVector<Loop *, 8>;
-
-// TODO: Check if we can use a sparse matrix here.
-using CharMatrix = std::vector<std::vector<char>>;
-
-} // end anonymous namespace
-
-// Maximum number of dependencies that can be handled in the dependency matrix.
-static const unsigned MaxMemInstrCount = 100;
-
-// Maximum loop depth supported.
-static const unsigned MaxLoopNestDepth = 10;
-
-#ifdef DUMP_DEP_MATRICIES
-static void printDepMatrix(CharMatrix &DepMatrix) {
- for (auto &Row : DepMatrix) {
- for (auto D : Row)
- LLVM_DEBUG(dbgs() << D << " ");
- LLVM_DEBUG(dbgs() << "\n");
- }
-}
-#endif
-
-static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
- Loop *L, DependenceInfo *DI) {
- using ValueVector = SmallVector<Value *, 16>;
-
- ValueVector MemInstr;
-
- // For each block.
- for (BasicBlock *BB : L->blocks()) {
- // Scan the BB and collect legal loads and stores.
- for (Instruction &I : *BB) {
- if (!isa<Instruction>(I))
- return false;
- if (auto *Ld = dyn_cast<LoadInst>(&I)) {
- if (!Ld->isSimple())
- return false;
- MemInstr.push_back(&I);
- } else if (auto *St = dyn_cast<StoreInst>(&I)) {
- if (!St->isSimple())
- return false;
- MemInstr.push_back(&I);
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
- << " Loads and Stores to analyze\n");
-
- ValueVector::iterator I, IE, J, JE;
-
- for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
- for (J = I, JE = MemInstr.end(); J != JE; ++J) {
- std::vector<char> Dep;
- Instruction *Src = cast<Instruction>(*I);
- Instruction *Dst = cast<Instruction>(*J);
- if (Src == Dst)
- continue;
- // Ignore Input dependencies.
- if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
- continue;
- // Track Output, Flow, and Anti dependencies.
- if (auto D = DI->depends(Src, Dst, true)) {
- assert(D->isOrdered() && "Expected an output, flow or anti dep.");
- LLVM_DEBUG(StringRef DepType =
- D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
- dbgs() << "Found " << DepType
- << " dependency between Src and Dst\n"
- << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
- unsigned Levels = D->getLevels();
- char Direction;
- for (unsigned II = 1; II <= Levels; ++II) {
- const SCEV *Distance = D->getDistance(II);
- const SCEVConstant *SCEVConst =
- dyn_cast_or_null<SCEVConstant>(Distance);
- if (SCEVConst) {
- const ConstantInt *CI = SCEVConst->getValue();
- if (CI->isNegative())
- Direction = '<';
- else if (CI->isZero())
- Direction = '=';
- else
- Direction = '>';
- Dep.push_back(Direction);
- } else if (D->isScalar(II)) {
- Direction = 'S';
- Dep.push_back(Direction);
- } else {
- unsigned Dir = D->getDirection(II);
- if (Dir == Dependence::DVEntry::LT ||
- Dir == Dependence::DVEntry::LE)
- Direction = '<';
- else if (Dir == Dependence::DVEntry::GT ||
- Dir == Dependence::DVEntry::GE)
- Direction = '>';
- else if (Dir == Dependence::DVEntry::EQ)
- Direction = '=';
- else
- Direction = '*';
- Dep.push_back(Direction);
- }
- }
- while (Dep.size() != Level) {
- Dep.push_back('I');
- }
-
- DepMatrix.push_back(Dep);
- if (DepMatrix.size() > MaxMemInstrCount) {
- LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
- << " dependencies inside loop\n");
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
-// A loop is moved from index 'from' to an index 'to'. Update the Dependence
-// matrix by exchanging the two columns.
-static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
- unsigned ToIndx) {
- unsigned numRows = DepMatrix.size();
- for (unsigned i = 0; i < numRows; ++i) {
- char TmpVal = DepMatrix[i][ToIndx];
- DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
- DepMatrix[i][FromIndx] = TmpVal;
- }
-}
-
-// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
-// '>'
-static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
- unsigned Column) {
- for (unsigned i = 0; i <= Column; ++i) {
- if (DepMatrix[Row][i] == '<')
- return false;
- if (DepMatrix[Row][i] == '>')
- return true;
- }
- // All dependencies were '=','S' or 'I'
- return false;
-}
-
-// Checks if no dependence exist in the dependency matrix in Row before Column.
-static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
- unsigned Column) {
- for (unsigned i = 0; i < Column; ++i) {
- if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
- DepMatrix[Row][i] != 'I')
- return false;
- }
- return true;
-}
-
-static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
- unsigned OuterLoopId, char InnerDep,
- char OuterDep) {
- if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
- return false;
-
- if (InnerDep == OuterDep)
- return true;
-
- // It is legal to interchange if and only if after interchange no row has a
- // '>' direction as the leftmost non-'='.
-
- if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
- return true;
-
- if (InnerDep == '<')
- return true;
-
- if (InnerDep == '>') {
- // If OuterLoopId represents outermost loop then interchanging will make the
- // 1st dependency as '>'
- if (OuterLoopId == 0)
- return false;
-
- // If all dependencies before OuterloopId are '=','S'or 'I'. Then
- // interchanging will result in this row having an outermost non '='
- // dependency of '>'
- if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
- return true;
- }
-
- return false;
-}
-
-// Checks if it is legal to interchange 2 loops.
-// [Theorem] A permutation of the loops in a perfect nest is legal if and only
-// if the direction matrix, after the same permutation is applied to its
-// columns, has no ">" direction as the leftmost non-"=" direction in any row.
-static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
- unsigned InnerLoopId,
- unsigned OuterLoopId) {
- unsigned NumRows = DepMatrix.size();
- // For each row check if it is valid to interchange.
- for (unsigned Row = 0; Row < NumRows; ++Row) {
- char InnerDep = DepMatrix[Row][InnerLoopId];
- char OuterDep = DepMatrix[Row][OuterLoopId];
- if (InnerDep == '*' || OuterDep == '*')
- return false;
- if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
- return false;
- }
- return true;
-}
-
-static LoopVector populateWorklist(Loop &L) {
- LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
- << L.getHeader()->getParent()->getName() << " Loop: %"
- << L.getHeader()->getName() << '\n');
- LoopVector LoopList;
- Loop *CurrentLoop = &L;
- const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
- while (!Vec->empty()) {
- // The current loop has multiple subloops in it hence it is not tightly
- // nested.
- // Discard all loops above it added into Worklist.
- if (Vec->size() != 1)
- return {};
-
- LoopList.push_back(CurrentLoop);
- CurrentLoop = Vec->front();
- Vec = &CurrentLoop->getSubLoops();
- }
- LoopList.push_back(CurrentLoop);
- return LoopList;
-}
-
-static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
- PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
- if (InnerIndexVar)
- return InnerIndexVar;
- if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
- return nullptr;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- PHINode *PhiVar = cast<PHINode>(I);
- Type *PhiTy = PhiVar->getType();
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy())
- return nullptr;
- const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
- if (!AddRec || !AddRec->isAffine())
- continue;
- const SCEV *Step = AddRec->getStepRecurrence(*SE);
- if (!isa<SCEVConstant>(Step))
- continue;
- // Found the induction variable.
- // FIXME: Handle loops with more than one induction variable. Note that,
- // currently, legality makes sure we have only one induction variable.
- return PhiVar;
- }
- return nullptr;
-}
-
-namespace {
-
-/// LoopInterchangeLegality checks if it is legal to interchange the loop.
-class LoopInterchangeLegality {
-public:
- LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
-
- /// Check if the loops can be interchanged.
- bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
- CharMatrix &DepMatrix);
-
- /// Check if the loop structure is understood. We do not handle triangular
- /// loops for now.
- bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
-
- bool currentLimitations();
-
- const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
- return OuterInnerReductions;
- }
-
-private:
- bool tightlyNested(Loop *Outer, Loop *Inner);
- bool containsUnsafeInstructions(BasicBlock *BB);
-
- /// Discover induction and reduction PHIs in the header of \p L. Induction
- /// PHIs are added to \p Inductions, reductions are added to
- /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
- /// to be passed as \p InnerLoop.
- bool findInductionAndReductions(Loop *L,
- SmallVector<PHINode *, 8> &Inductions,
- Loop *InnerLoop);
-
- Loop *OuterLoop;
- Loop *InnerLoop;
-
- ScalarEvolution *SE;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- /// Set of reduction PHIs taking part of a reduction across the inner and
- /// outer loop.
- SmallPtrSet<PHINode *, 4> OuterInnerReductions;
-};
-
-/// LoopInterchangeProfitability checks if it is profitable to interchange the
-/// loop.
-class LoopInterchangeProfitability {
-public:
- LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
-
- /// Check if the loop interchange is profitable.
- bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
- CharMatrix &DepMatrix);
-
-private:
- int getInstrOrderCost();
-
- Loop *OuterLoop;
- Loop *InnerLoop;
-
- /// Scev analysis.
- ScalarEvolution *SE;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-};
-
-/// LoopInterchangeTransform interchanges the loop.
-class LoopInterchangeTransform {
-public:
- LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- LoopInfo *LI, DominatorTree *DT,
- BasicBlock *LoopNestExit,
- const LoopInterchangeLegality &LIL)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
- LoopExit(LoopNestExit), LIL(LIL) {}
-
- /// Interchange OuterLoop and InnerLoop.
- bool transform();
- void restructureLoops(Loop *NewInner, Loop *NewOuter,
- BasicBlock *OrigInnerPreHeader,
- BasicBlock *OrigOuterPreHeader);
- void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
-
-private:
- bool adjustLoopLinks();
- bool adjustLoopBranches();
-
- Loop *OuterLoop;
- Loop *InnerLoop;
-
- /// Scev analysis.
- ScalarEvolution *SE;
-
- LoopInfo *LI;
- DominatorTree *DT;
- BasicBlock *LoopExit;
-
- const LoopInterchangeLegality &LIL;
-};
-
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+STATISTIC(LoopsInterchanged, "Number of loops interchanged");
+
+static cl::opt<int> LoopInterchangeCostThreshold(
+ "loop-interchange-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Interchange if you gain more than this number"));
+
+namespace {
+
+using LoopVector = SmallVector<Loop *, 8>;
+
+// TODO: Check if we can use a sparse matrix here.
+using CharMatrix = std::vector<std::vector<char>>;
+
+} // end anonymous namespace
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+#ifdef DUMP_DEP_MATRICIES
+static void printDepMatrix(CharMatrix &DepMatrix) {
+ for (auto &Row : DepMatrix) {
+ for (auto D : Row)
+ LLVM_DEBUG(dbgs() << D << " ");
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+}
+#endif
+
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
+ Loop *L, DependenceInfo *DI) {
+ using ValueVector = SmallVector<Value *, 16>;
+
+ ValueVector MemInstr;
+
+ // For each block.
+ for (BasicBlock *BB : L->blocks()) {
+ // Scan the BB and collect legal loads and stores.
+ for (Instruction &I : *BB) {
+ if (!isa<Instruction>(I))
+ return false;
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
+
+ ValueVector::iterator I, IE, J, JE;
+
+ for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+ for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+ std::vector<char> Dep;
+ Instruction *Src = cast<Instruction>(*I);
+ Instruction *Dst = cast<Instruction>(*J);
+ if (Src == Dst)
+ continue;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ continue;
+ // Track Output, Flow, and Anti dependencies.
+ if (auto D = DI->depends(Src, Dst, true)) {
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+ LLVM_DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ unsigned Levels = D->getLevels();
+ char Direction;
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
+ }
+ }
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
+
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
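populateDependencyMatrix produces one row per ordered pair of memory accesses that DependenceAnalysis can relate, with one column per loop level and entries drawn from '<', '>', '=', 'S', 'I' and '*'. As an illustrative sketch (the exact '<' vs '>' character depends on the sign convention of getDistance), the single flow dependence in the following nest yields one row whose outer entry is '=' (distance 0 in i) and whose inner entry is a directional character for the constant distance of 1 in j:

enum { N = 64, M = 64 };
int A[N][M];

void innerCarriedDependence() {
  for (int i = 0; i < N; ++i)      // outer level: distance 0 -> '='
    for (int j = 1; j < M; ++j)    // inner level: constant distance 1
      A[i][j] = A[i][j - 1] + 1;   // value stored at j is read at j + 1
}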
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
+ unsigned ToIndx) {
+ unsigned numRows = DepMatrix.size();
+ for (unsigned i = 0; i < numRows; ++i) {
+ char TmpVal = DepMatrix[i][ToIndx];
+ DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+ DepMatrix[i][FromIndx] = TmpVal;
+ }
+}
+
+// Checks if the outermost non-'=', non-'S', non-'I' dependence in the
+// dependence matrix is '>'.
+static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i <= Column; ++i) {
+ if (DepMatrix[Row][i] == '<')
+ return false;
+ if (DepMatrix[Row][i] == '>')
+ return true;
+ }
+ // All dependencies were '=','S' or 'I'
+ return false;
+}
+
+// Checks that no dependence exists in the dependency matrix in Row before Column.
+static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i < Column; ++i) {
+ if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
+ DepMatrix[Row][i] != 'I')
+ return false;
+ }
+ return true;
+}
+
+static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+ unsigned OuterLoopId, char InnerDep,
+ char OuterDep) {
+ if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+ return false;
+
+ if (InnerDep == OuterDep)
+ return true;
+
+ // It is legal to interchange if and only if after interchange no row has a
+ // '>' direction as the leftmost non-'='.
+
+ if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+ return true;
+
+ if (InnerDep == '<')
+ return true;
+
+ if (InnerDep == '>') {
+ // If OuterLoopId represents the outermost loop, then interchanging will
+ // make the first dependency '>'.
+ if (OuterLoopId == 0)
+ return false;
+
+ // If all dependencies before OuterLoopId are '=', 'S' or 'I', then
+ // interchanging will result in this row having an outermost non-'='
+ // dependency of '>'.
+ if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+ return true;
+ }
+
+ return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only
+// if the direction matrix, after the same permutation is applied to its
+// columns, has no ">" direction as the leftmost non-"=" direction in any row.
+static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
+ unsigned InnerLoopId,
+ unsigned OuterLoopId) {
+ unsigned NumRows = DepMatrix.size();
+ // For each row check if it is valid to interchange.
+ for (unsigned Row = 0; Row < NumRows; ++Row) {
+ char InnerDep = DepMatrix[Row][InnerLoopId];
+ char OuterDep = DepMatrix[Row][OuterLoopId];
+ if (InnerDep == '*' || OuterDep == '*')
+ return false;
+ if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
+ return false;
+ }
+ return true;
+}
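A worked example of the theorem: take a two-level nest whose direction matrix has the single row ['<', '>'] with columns ordered (outer, inner). Interchanging the loops permutes the row to ['>', '<'], whose leftmost non-'=' entry is '>', so the interchange is rejected. A row of ['=', '<'] permutes to ['<', '='] and stays legal, and rows consisting only of '=', 'S' and 'I' are ignored because the helpers above treat 'S' and 'I' like '='.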
+
+static LoopVector populateWorklist(Loop &L) {
+ LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
+ LoopVector LoopList;
+ Loop *CurrentLoop = &L;
+ const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
+ while (!Vec->empty()) {
+ // The current loop has multiple subloops, hence it is not tightly
+ // nested.
+ // Discard all loops above it that were added to the worklist.
+ if (Vec->size() != 1)
+ return {};
+
+ LoopList.push_back(CurrentLoop);
+ CurrentLoop = Vec->front();
+ Vec = &CurrentLoop->getSubLoops();
+ }
+ LoopList.push_back(CurrentLoop);
+ return LoopList;
+}
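An illustrative sketch of the two shapes populateWorklist distinguishes; the function and array names are made up for the example:

void accepted(int A[8][8][8]) {
  for (int i = 0; i < 8; ++i)      // every level has exactly one subloop, so
    for (int j = 0; j < 8; ++j)    // the chain i -> j -> k is returned as the
      for (int k = 0; k < 8; ++k)  // LoopVector
        A[i][j][k] = 0;
}

void rejected(int A[8][8], int B[8][8]) {
  for (int i = 0; i < 8; ++i) {    // two sibling subloops under i, so
    for (int j = 0; j < 8; ++j)    // Vec->size() != 1 and an empty worklist
      A[i][j] = 0;                 // is returned
    for (int k = 0; k < 8; ++k)
      B[i][k] = 0;
  }
}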
+
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+ PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+ if (InnerIndexVar)
+ return InnerIndexVar;
+ if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+ return nullptr;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PhiVar = cast<PHINode>(I);
+ Type *PhiTy = PhiVar->getType();
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy())
+ return nullptr;
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+ if (!AddRec || !AddRec->isAffine())
+ continue;
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ if (!isa<SCEVConstant>(Step))
+ continue;
+ // Found the induction variable.
+ // FIXME: Handle loops with more than one induction variable. Note that,
+ // currently, legality makes sure we have only one induction variable.
+ return PhiVar;
+ }
+ return nullptr;
+}
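The search above accepts any header PHI whose SCEV is an affine add-recurrence with a constant step, not only the canonical 0,+1 form. An illustrative sketch (names are made up):

void affineStep(int *A, int n) {
  for (int i = 0; i < n; i += 2)   // i is the add-recurrence {0,+,2}: affine
    A[i] = 0;                      // with a constant step, so its PHI is used
}

void nonAffineStep(int *A, int n) {
  for (int i = 1; i < n; i *= 2)   // i doubles each iteration: not an affine
    A[i] = 0;                      // add-recurrence, so this PHI is skipped
}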
+
+namespace {
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loop.
+class LoopInterchangeLegality {
+public:
+ LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+
+ /// Check if the loops can be interchanged.
+ bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+ /// Check if the loop structure is understood. We do not handle triangular
+ /// loops for now.
+ bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+ bool currentLimitations();
+
+ const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
+ return OuterInnerReductions;
+ }
+
+private:
+ bool tightlyNested(Loop *Outer, Loop *Inner);
+ bool containsUnsafeInstructions(BasicBlock *BB);
+
+ /// Discover induction and reduction PHIs in the header of \p L. Induction
+ /// PHIs are added to \p Inductions, reductions are added to
+ /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
+ /// to be passed as \p InnerLoop.
+ bool findInductionAndReductions(Loop *L,
+ SmallVector<PHINode *, 8> &Inductions,
+ Loop *InnerLoop);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ ScalarEvolution *SE;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ /// Set of reduction PHIs taking part of a reduction across the inner and
+ /// outer loop.
+ SmallPtrSet<PHINode *, 4> OuterInnerReductions;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loop.
+class LoopInterchangeProfitability {
+public:
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+
+ /// Check if the loop interchange is profitable.
+ bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+private:
+ int getInstrOrderCost();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// LoopInterchangeTransform interchanges the loop.
+class LoopInterchangeTransform {
+public:
+ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ BasicBlock *LoopNestExit,
+ const LoopInterchangeLegality &LIL)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ LoopExit(LoopNestExit), LIL(LIL) {}
+
+ /// Interchange OuterLoop and InnerLoop.
+ bool transform();
+ void restructureLoops(Loop *NewInner, Loop *NewOuter,
+ BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader);
+ void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+
+private:
+ bool adjustLoopLinks();
+ bool adjustLoopBranches();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ LoopInfo *LI;
+ DominatorTree *DT;
+ BasicBlock *LoopExit;
+
+ const LoopInterchangeLegality &LIL;
+};
+
struct LoopInterchange {
- ScalarEvolution *SE = nullptr;
- LoopInfo *LI = nullptr;
- DependenceInfo *DI = nullptr;
- DominatorTree *DT = nullptr;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
+ ScalarEvolution *SE = nullptr;
+ LoopInfo *LI = nullptr;
+ DependenceInfo *DI = nullptr;
+ DominatorTree *DT = nullptr;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
DominatorTree *DT, OptimizationRemarkEmitter *ORE)
: SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
-
+
bool run(Loop *L) {
if (L->getParentLoop())
- return false;
-
- return processLoopList(populateWorklist(*L));
- }
-
- bool isComputableLoopNest(LoopVector LoopList) {
- for (Loop *L : LoopList) {
- const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+ return false;
+
+ return processLoopList(populateWorklist(*L));
+ }
+
+ bool isComputableLoopNest(LoopVector LoopList) {
+ for (Loop *L : LoopList) {
+ const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
- LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
- return false;
- }
- if (L->getNumBackEdges() != 1) {
- LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
- return false;
- }
- if (!L->getExitingBlock()) {
- LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
- return false;
- }
- }
- return true;
- }
-
- unsigned selectLoopForInterchange(const LoopVector &LoopList) {
- // TODO: Add a better heuristic to select the loop to be interchanged based
- // on the dependence matrix. Currently we select the innermost loop.
- return LoopList.size() - 1;
- }
-
- bool processLoopList(LoopVector LoopList) {
- bool Changed = false;
- unsigned LoopNestDepth = LoopList.size();
- if (LoopNestDepth < 2) {
- LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
- return false;
- }
- if (LoopNestDepth > MaxLoopNestDepth) {
- LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
- << MaxLoopNestDepth << "\n");
- return false;
- }
- if (!isComputableLoopNest(LoopList)) {
- LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
- << "\n");
-
- CharMatrix DependencyMatrix;
- Loop *OuterMostLoop = *(LoopList.begin());
- if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
- OuterMostLoop, DI)) {
- LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
- return false;
- }
-#ifdef DUMP_DEP_MATRICIES
- LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
- printDepMatrix(DependencyMatrix);
-#endif
-
- // Get the Outermost loop exit.
- BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
- if (!LoopNestExit) {
- LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
- return false;
- }
-
- unsigned SelecLoopId = selectLoopForInterchange(LoopList);
- // Move the selected loop outwards to the best possible position.
- for (unsigned i = SelecLoopId; i > 0; i--) {
- bool Interchanged =
- processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
- if (!Interchanged)
- return Changed;
- // Loops interchanged reflect the same in LoopList
- std::swap(LoopList[i - 1], LoopList[i]);
-
- // Update the DependencyMatrix
- interChangeDependencies(DependencyMatrix, i, i - 1);
-#ifdef DUMP_DEP_MATRICIES
- LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
- printDepMatrix(DependencyMatrix);
-#endif
- Changed |= Interchanged;
- }
- return Changed;
- }
-
- bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
- unsigned OuterLoopId, BasicBlock *LoopNestExit,
- std::vector<std::vector<char>> &DependencyMatrix) {
- LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId << "\n");
- Loop *InnerLoop = LoopList[InnerLoopId];
- Loop *OuterLoop = LoopList[OuterLoopId];
-
- LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
- if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
- return false;
- }
- LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
- LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
- if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
- return false;
- }
-
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Interchanged",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Loop interchanged with enclosing loop.";
- });
-
- LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit,
- LIL);
- LIT.transform();
- LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
- LoopsInterchanged++;
-
- assert(InnerLoop->isLCSSAForm(*DT) &&
- "Inner loop not left in LCSSA form after loop interchange!");
- assert(OuterLoop->isLCSSAForm(*DT) &&
- "Outer loop not left in LCSSA form after loop interchange!");
-
- return true;
- }
-};
-
-} // end anonymous namespace
-
-bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
- return any_of(*BB, [](const Instruction &I) {
- return I.mayHaveSideEffects() || I.mayReadFromMemory();
- });
-}
-
-bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
-
- LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
-
- // A perfectly nested loop will not have any branch in between the outer and
- // inner block i.e. outer header will branch to either inner preheader and
- // outerloop latch.
- BranchInst *OuterLoopHeaderBI =
- dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
- if (!OuterLoopHeaderBI)
- return false;
-
- for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
- if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
- Succ != OuterLoopLatch)
- return false;
-
- LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
- // We do not have any basic block in between now make sure the outer header
- // and outer loop latch doesn't contain any unsafe instructions.
- if (containsUnsafeInstructions(OuterLoopHeader) ||
- containsUnsafeInstructions(OuterLoopLatch))
- return false;
-
+ LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
+ return false;
+ }
+ if (L->getNumBackEdges() != 1) {
+ LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ return false;
+ }
+ if (!L->getExitingBlock()) {
+ LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
+ return false;
+ }
+ }
+ return true;
+ }
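A nest is rejected by this check when any level has an uncomputable trip count, more than one backedge, or no unique exiting block. A sketch of a loop that typically fails the first test, because its trip count depends on memory contents:

int countUntilZero(const int *A) {
  int i = 0;
  while (A[i] != 0)   // the backedge-taken count depends on the data, so
    ++i;              // ScalarEvolution usually returns SCEVCouldNotCompute
  return i;
}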
+
+ unsigned selectLoopForInterchange(const LoopVector &LoopList) {
+ // TODO: Add a better heuristic to select the loop to be interchanged based
+ // on the dependence matrix. Currently we select the innermost loop.
+ return LoopList.size() - 1;
+ }
+
+ bool processLoopList(LoopVector LoopList) {
+ bool Changed = false;
+ unsigned LoopNestDepth = LoopList.size();
+ if (LoopNestDepth < 2) {
+ LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ return false;
+ }
+ if (LoopNestDepth > MaxLoopNestDepth) {
+ LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
+ if (!isComputableLoopNest(LoopList)) {
+ LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
+ << "\n");
+
+ CharMatrix DependencyMatrix;
+ Loop *OuterMostLoop = *(LoopList.begin());
+ if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
+ OuterMostLoop, DI)) {
+ LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
+ return false;
+ }
+#ifdef DUMP_DEP_MATRICIES
+ LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+
+ // Get the Outermost loop exit.
+ BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
+ if (!LoopNestExit) {
+ LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
+ return false;
+ }
+
+ unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+ // Move the selected loop outwards to the best possible position.
+ for (unsigned i = SelecLoopId; i > 0; i--) {
+ bool Interchanged =
+ processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+ if (!Interchanged)
+ return Changed;
+ // Loops were interchanged; reflect the same in LoopList.
+ std::swap(LoopList[i - 1], LoopList[i]);
+
+ // Update the DependencyMatrix
+ interChangeDependencies(DependencyMatrix, i, i - 1);
+#ifdef DUMP_DEP_MATRICIES
+ LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+ Changed |= Interchanged;
+ }
+ return Changed;
+ }
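As a concrete walk-through: for a triply nested loop the selected loop is the innermost one (index 2), so the driver first attempts the interchange of levels (2, 1) and, only if that succeeds, (1, 0). After each successful step the two entries of LoopList and the two corresponding columns of the dependency matrix are swapped, so the next legality query sees the already-interchanged nest.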
+
+ bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+ unsigned OuterLoopId, BasicBlock *LoopNestExit,
+ std::vector<std::vector<char>> &DependencyMatrix) {
+ LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
+ Loop *InnerLoop = LoopList[InnerLoopId];
+ Loop *OuterLoop = LoopList[OuterLoopId];
+
+ LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
+ if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
+ if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
+ return false;
+ }
+
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Interchanged",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Loop interchanged with enclosing loop.";
+ });
+
+ LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit,
+ LIL);
+ LIT.transform();
+ LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
+ LoopsInterchanged++;
+
+ assert(InnerLoop->isLCSSAForm(*DT) &&
+ "Inner loop not left in LCSSA form after loop interchange!");
+ assert(OuterLoop->isLCSSAForm(*DT) &&
+ "Outer loop not left in LCSSA form after loop interchange!");
+
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
+ return any_of(*BB, [](const Instruction &I) {
+ return I.mayHaveSideEffects() || I.mayReadFromMemory();
+ });
+}
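For the tightly-nested check, any instruction that may read memory or may have side effects marks a block as unsafe. An illustrative sketch:

#include <cstdio>

void unsafeBlock(int *p) {
  int x = *p;            // a plain load: mayReadFromMemory() is true, so even
  std::printf("%d", x);  // a read-only block counts as unsafe; the call also
}                        // satisfies mayHaveSideEffects()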
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
+
+ // A perfectly nested loop will not have any branch in between the outer and
+ // inner block, i.e. the outer header will branch only to the inner preheader
+ // or the outer loop latch.
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ if (!OuterLoopHeaderBI)
+ return false;
+
+ for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
+ if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
+ Succ != OuterLoopLatch)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+ // We do not have any basic block in between; now make sure the outer header
+ // and the outer loop latch do not contain any unsafe instructions.
+ if (containsUnsafeInstructions(OuterLoopHeader) ||
+ containsUnsafeInstructions(OuterLoopLatch))
+ return false;
+
// Also make sure the inner loop preheader does not contain any unsafe
// instructions. Note that all instructions in the preheader will be moved to
// the outer loop header when interchanging.
@@ -618,694 +618,694 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
containsUnsafeInstructions(InnerLoopPreHeader))
return false;
- LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
- // We have a perfect loop nest.
- return true;
-}
-
-bool LoopInterchangeLegality::isLoopStructureUnderstood(
- PHINode *InnerInduction) {
- unsigned Num = InnerInduction->getNumOperands();
- BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
- for (unsigned i = 0; i < Num; ++i) {
- Value *Val = InnerInduction->getOperand(i);
- if (isa<Constant>(Val))
- continue;
- Instruction *I = dyn_cast<Instruction>(Val);
- if (!I)
- return false;
- // TODO: Handle triangular loops.
- // e.g. for(int i=0;i<N;i++)
- // for(int j=i;j<N;j++)
- unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
- if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
- InnerLoopPreheader &&
- !OuterLoop->isLoopInvariant(I)) {
- return false;
- }
- }
- return true;
-}
-
-// If SV is a LCSSA PHI node with a single incoming value, return the incoming
-// value.
-static Value *followLCSSA(Value *SV) {
- PHINode *PHI = dyn_cast<PHINode>(SV);
- if (!PHI)
- return SV;
-
- if (PHI->getNumIncomingValues() != 1)
- return SV;
- return followLCSSA(PHI->getIncomingValue(0));
-}
-
-// Check V's users to see if it is involved in a reduction in L.
-static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+ LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
+ // We have a perfect loop nest.
+ return true;
+}
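An illustrative source-level sketch of the distinction this check draws; in the second nest the extra store typically ends up in the outer header region or in a block between the two loops, which keeps the nest from being perfectly nested:

void tight(int A[8][8]) {
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j)
      A[i][j] = i + j;            // only the inner body touches memory
}

void notTight(int A[8][8], int *S) {
  for (int i = 0; i < 8; ++i) {
    S[i] = 0;                     // extra work per outer iteration defeats
    for (int j = 0; j < 8; ++j)   // the perfect-nest requirement
      A[i][j] = i + j;
  }
}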
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+ PHINode *InnerInduction) {
+ unsigned Num = InnerInduction->getNumOperands();
+ BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// If SV is a LCSSA PHI node with a single incoming value, return the incoming
+// value.
+static Value *followLCSSA(Value *SV) {
+ PHINode *PHI = dyn_cast<PHINode>(SV);
+ if (!PHI)
+ return SV;
+
+ if (PHI->getNumIncomingValues() != 1)
+ return SV;
+ return followLCSSA(PHI->getIncomingValue(0));
+}
+
+// Check V's users to see if it is involved in a reduction in L.
+static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
// Reduction variables cannot be constants.
if (isa<Constant>(V))
return nullptr;
- for (Value *User : V->users()) {
- if (PHINode *PHI = dyn_cast<PHINode>(User)) {
- if (PHI->getNumIncomingValues() == 1)
- continue;
- RecurrenceDescriptor RD;
- if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
- return PHI;
- return nullptr;
- }
- }
-
- return nullptr;
-}
-
-bool LoopInterchangeLegality::findInductionAndReductions(
- Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
- if (!L->getLoopLatch() || !L->getLoopPredecessor())
- return false;
- for (PHINode &PHI : L->getHeader()->phis()) {
- RecurrenceDescriptor RD;
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
- Inductions.push_back(&PHI);
- else {
- // PHIs in inner loops need to be part of a reduction in the outer loop,
- // discovered when checking the PHIs of the outer loop earlier.
- if (!InnerLoop) {
- if (!OuterInnerReductions.count(&PHI)) {
- LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
- "across the outer loop.\n");
- return false;
- }
- } else {
- assert(PHI.getNumIncomingValues() == 2 &&
- "Phis in loop header should have exactly 2 incoming values");
- // Check if we have a PHI node in the outer loop that has a reduction
- // result from the inner loop as an incoming value.
- Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
- PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
- if (!InnerRedPhi ||
+ for (Value *User : V->users()) {
+ if (PHINode *PHI = dyn_cast<PHINode>(User)) {
+ if (PHI->getNumIncomingValues() == 1)
+ continue;
+ RecurrenceDescriptor RD;
+ if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+ return PHI;
+ return nullptr;
+ }
+ }
+
+ return nullptr;
+}
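findInnerReductionPhi is what lets a sum carried across both loops pass the legality checks: the outer header PHI reaches the inner header PHI, the inner latch value flows back out through an LCSSA PHI, and both header PHIs are meant to land in OuterInnerReductions. An illustrative source-level sketch:

int nestSum(int A[8][8]) {
  int sum = 0;                    // sum has one PHI in the outer header and
  for (int i = 0; i < 8; ++i)     // one in the inner header; the inner PHI is
    for (int j = 0; j < 8; ++j)   // recognized via RecurrenceDescriptor, so
      sum += A[i][j];             // the pair is recorded as an outer/inner
  return sum;                     // reduction
}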
+
+bool LoopInterchangeLegality::findInductionAndReductions(
+ Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
+ if (!L->getLoopLatch() || !L->getLoopPredecessor())
+ return false;
+ for (PHINode &PHI : L->getHeader()->phis()) {
+ RecurrenceDescriptor RD;
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+ Inductions.push_back(&PHI);
+ else {
+ // PHIs in inner loops need to be part of a reduction in the outer loop,
+ // discovered when checking the PHIs of the outer loop earlier.
+ if (!InnerLoop) {
+ if (!OuterInnerReductions.count(&PHI)) {
+ LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
+ "across the outer loop.\n");
+ return false;
+ }
+ } else {
+ assert(PHI.getNumIncomingValues() == 2 &&
+ "Phis in loop header should have exactly 2 incoming values");
+ // Check if we have a PHI node in the outer loop that has a reduction
+ // result from the inner loop as an incoming value.
+ Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
+ PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
+ if (!InnerRedPhi ||
!llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
- LLVM_DEBUG(
- dbgs()
- << "Failed to recognize PHI as an induction or reduction.\n");
- return false;
- }
- OuterInnerReductions.insert(&PHI);
- OuterInnerReductions.insert(InnerRedPhi);
- }
- }
- }
- return true;
-}
-
-// This function indicates the current limitations in the transform as a result
-// of which we do not proceed.
-bool LoopInterchangeLegality::currentLimitations() {
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
-
- // transform currently expects the loop latches to also be the exiting
- // blocks.
- if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
- OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
- !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
- !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
- LLVM_DEBUG(
- dbgs() << "Loops where the latch is not the exiting block are not"
- << " supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Loops where the latch is not the exiting block cannot be"
- " interchange currently.";
- });
- return true;
- }
-
- PHINode *InnerInductionVar;
- SmallVector<PHINode *, 8> Inductions;
- if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
- LLVM_DEBUG(
- dbgs() << "Only outer loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with induction or reduction PHI nodes can be"
- " interchanged currently.";
- });
- return true;
- }
-
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
- << "supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
-
- Inductions.clear();
- if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
- LLVM_DEBUG(
- dbgs() << "Only inner loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with induction or reduction PHI nodes can be"
- " interchange currently.";
- });
- return true;
- }
-
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(
- dbgs() << "We currently only support loops with 1 induction variable."
- << "Failed to interchange due to current limitation\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
- InnerInductionVar = Inductions.pop_back_val();
-
- // TODO: Triangular loops are not handled for now.
- if (!isLoopStructureUnderstood(InnerInductionVar)) {
- LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Inner loop structure not understood currently.";
- });
- return true;
- }
-
- // TODO: Current limitation: Since we split the inner loop latch at the point
- // were induction variable is incremented (induction.next); We cannot have
- // more than 1 user of induction.next since it would result in broken code
- // after split.
- // e.g.
- // for(i=0;i<N;i++) {
- // for(j = 0;j<M;j++) {
- // A[j+1][i+2] = A[j][i]+k;
- // }
- // }
- Instruction *InnerIndexVarInc = nullptr;
- if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
- else
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
-
- if (!InnerIndexVarInc) {
- LLVM_DEBUG(
- dbgs() << "Did not find an instruction to increment the induction "
- << "variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "The inner loop does not increment the induction variable.";
- });
- return true;
- }
-
- // Since we split the inner loop latch on this induction variable. Make sure
- // we do not have any instruction between the induction variable and branch
- // instruction.
-
- bool FoundInduction = false;
- for (const Instruction &I :
- llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
- if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
- isa<ZExtInst>(I))
- continue;
-
- // We found an instruction. If this is not induction variable then it is not
- // safe to split this loop latch.
- if (!I.isIdenticalTo(InnerIndexVarInc)) {
- LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
- << "variable increment and branch.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "UnsupportedInsBetweenInduction",
- InnerLoop->getStartLoc(), InnerLoop->getHeader())
- << "Found unsupported instruction between induction variable "
- "increment and branch.";
- });
- return true;
- }
-
- FoundInduction = true;
- break;
- }
- // The loop latch ended and we didn't find the induction variable return as
- // current limitation.
- if (!FoundInduction) {
- LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Did not find the induction variable.";
- });
- return true;
- }
- return false;
-}
-
-// We currently only support LCSSA PHI nodes in the inner loop exit, if their
-// users are either reduction PHIs or PHIs outside the outer loop (which means
-// the we are only interested in the final value after the loop).
-static bool
-areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
- SmallPtrSetImpl<PHINode *> &Reductions) {
- BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
- for (PHINode &PHI : InnerExit->phis()) {
- // Reduction lcssa phi will have only 1 incoming block that from loop latch.
- if (PHI.getNumIncomingValues() > 1)
- return false;
- if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
- PHINode *PN = dyn_cast<PHINode>(U);
- return !PN ||
- (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
- })) {
- return false;
- }
- }
- return true;
-}
-
-// We currently support LCSSA PHI nodes in the outer loop exit, if their
-// incoming values do not come from the outer loop latch or if the
-// outer loop latch has a single predecessor. In that case, the value will
-// be available if both the inner and outer loop conditions are true, which
-// will still be true after interchanging. If we have multiple predecessor,
-// that may not be the case, e.g. because the outer loop latch may be executed
-// if the inner loop is not executed.
-static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
- BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
- for (PHINode &PHI : LoopNestExit->phis()) {
- // FIXME: We currently are not able to detect floating point reductions
- // and have to use floating point PHIs as a proxy to prevent
- // interchanging in the presence of floating point reductions.
- if (PHI.getType()->isFloatingPointTy())
- return false;
- for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
- Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
- if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
- continue;
-
- // The incoming value is defined in the outer loop latch. Currently we
- // only support that in case the outer loop latch has a single predecessor.
- // This guarantees that the outer loop latch is executed if and only if
- // the inner loop is executed (because tightlyNested() guarantees that the
- // outer loop header only branches to the inner loop or the outer loop
- // latch).
- // FIXME: We could weaken this logic and allow multiple predecessors,
- // if the values are produced outside the loop latch. We would need
- // additional logic to update the PHI nodes in the exit block as
- // well.
- if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
- return false;
- }
- }
- return true;
-}
-
-bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
- if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
- LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId
- << " due to dependence\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Cannot interchange loops due to dependences.";
- });
- return false;
- }
- // Check if outer and inner loop contain legal instructions only.
- for (auto *BB : OuterLoop->blocks())
- for (Instruction &I : BB->instructionsWithoutDebug())
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- // readnone functions do not prevent interchanging.
- if (CI->doesNotReadMemory())
- continue;
- LLVM_DEBUG(
- dbgs() << "Loops with call instructions cannot be interchanged "
- << "safely.");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
- CI->getDebugLoc(),
- CI->getParent())
- << "Cannot interchange loops due to call instruction.";
- });
-
- return false;
- }
-
- // TODO: The loops could not be interchanged due to current limitations in the
- // transform module.
- if (currentLimitations()) {
- LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
- return false;
- }
-
- // Check if the loops are tightly nested.
- if (!tightlyNested(OuterLoop, InnerLoop)) {
- LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Cannot interchange loops because they are not tightly "
- "nested.";
- });
- return false;
- }
-
- if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
- OuterInnerReductions)) {
- LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Found unsupported PHI node in loop exit.";
- });
- return false;
- }
-
- if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
- LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Found unsupported PHI node in loop exit.";
- });
- return false;
- }
-
- return true;
-}
-
-int LoopInterchangeProfitability::getInstrOrderCost() {
- unsigned GoodOrder, BadOrder;
- BadOrder = GoodOrder = 0;
- for (BasicBlock *BB : InnerLoop->blocks()) {
- for (Instruction &Ins : *BB) {
- if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
- unsigned NumOp = GEP->getNumOperands();
- bool FoundInnerInduction = false;
- bool FoundOuterInduction = false;
- for (unsigned i = 0; i < NumOp; ++i) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Failed to recognize PHI as an induction or reduction.\n");
+ return false;
+ }
+ OuterInnerReductions.insert(&PHI);
+ OuterInnerReductions.insert(InnerRedPhi);
+ }
+ }
+ }
+ return true;
+}
+
+// This function indicates the current limitations in the transform as a result
+// of which we do not proceed.
+bool LoopInterchangeLegality::currentLimitations() {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+
+ // The transform currently expects the loop latches to also be the exiting
+ // blocks.
+ if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
+ OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
+ !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
+ !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "Loops where the latch is not the exiting block are not"
+ << " supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Loops where the latch is not the exiting block cannot be"
+ " interchange currently.";
+ });
+ return true;
+ }
+
+ PHINode *InnerInductionVar;
+ SmallVector<PHINode *, 8> Inductions;
+ if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
+ LLVM_DEBUG(
+ dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with induction or reduction PHI nodes can be"
+ " interchanged currently.";
+ });
+ return true;
+ }
+
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+ << "supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
+ return true;
+ }
+
+ Inductions.clear();
+ if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
+ LLVM_DEBUG(
+ dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
+ });
+ return true;
+ }
+
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ LLVM_DEBUG(
+        dbgs() << "We currently only support loops with 1 induction variable. "
+               << "Failed to interchange due to current limitation.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
+ return true;
+ }
+ InnerInductionVar = Inductions.pop_back_val();
+
+ // TODO: Triangular loops are not handled for now.
+ if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Inner loop structure not understood currently.";
+ });
+ return true;
+ }
+
+  // TODO: Current limitation: Since we split the inner loop latch at the point
+  // where the induction variable is incremented (induction.next), we cannot
+  // have more than 1 user of induction.next, since that would result in broken
+  // code after the split.
+ // e.g.
+ // for(i=0;i<N;i++) {
+ // for(j = 0;j<M;j++) {
+ // A[j+1][i+2] = A[j][i]+k;
+ // }
+ // }
+ Instruction *InnerIndexVarInc = nullptr;
+ if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+ else
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+ if (!InnerIndexVarInc) {
+ LLVM_DEBUG(
+ dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "The inner loop does not increment the induction variable.";
+ });
+ return true;
+ }
+
+  // Since we split the inner loop latch on this induction variable, make sure
+  // we do not have any instruction between the induction variable and the
+  // branch instruction.
+
+ bool FoundInduction = false;
+ for (const Instruction &I :
+ llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
+ if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
+ isa<ZExtInst>(I))
+ continue;
+
+    // We found an instruction. If this is not the induction variable then it
+    // is not safe to split this loop latch.
+ if (!I.isIdenticalTo(InnerIndexVarInc)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "UnsupportedInsBetweenInduction",
+ InnerLoop->getStartLoc(), InnerLoop->getHeader())
+ << "Found unsupported instruction between induction variable "
+ "increment and branch.";
+ });
+ return true;
+ }
+
+ FoundInduction = true;
+ break;
+ }
+  // The loop latch ended and we didn't find the induction variable; return as
+  // a current limitation.
+ if (!FoundInduction) {
+ LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Did not find the induction variable.";
+ });
+ return true;
+ }
+ return false;
+}
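+
+// Illustrative example (assuming rotated loops whose latches are also the
+// exiting blocks): a nest such as
+//   for (int i = 0; i < N; ++i)     // single induction variable
+//     for (int j = 0; j < M; ++j)   // single induction variable
+//       A[j][i] = A[j][i] + k;
+// keeps only the compare between the j increment and the latch branch, so
+// none of the limitations above apply.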
+
+// We currently only support LCSSA PHI nodes in the inner loop exit, if their
+// users are either reduction PHIs or PHIs outside the outer loop (which means
+// we are only interested in the final value after the loop).
+static bool
+areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
+ SmallPtrSetImpl<PHINode *> &Reductions) {
+ BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
+ for (PHINode &PHI : InnerExit->phis()) {
+    // A reduction LCSSA PHI will have only 1 incoming block, the loop latch.
+ if (PHI.getNumIncomingValues() > 1)
+ return false;
+ if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
+ PHINode *PN = dyn_cast<PHINode>(U);
+ return !PN ||
+ (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
+ })) {
+ return false;
+ }
+ }
+ return true;
+}
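+
+// For illustration (an assumed source pattern, not an extra check): a
+// reduction spanning both loops, e.g.
+//   int Sum = 0;
+//   for (int i = 0; i < N; ++i)
+//     for (int j = 0; j < M; ++j)
+//       Sum += A[j][i];
+// yields an LCSSA PHI for Sum in the inner loop's exit block whose users are
+// the reduction PHI in the outer loop header and, possibly, a PHI outside the
+// loop nest; both kinds of user are accepted above.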
+
+// We currently support LCSSA PHI nodes in the outer loop exit, if their
+// incoming values do not come from the outer loop latch or if the
+// outer loop latch has a single predecessor. In that case, the value will
+// be available if both the inner and outer loop conditions are true, which
+// will still be true after interchanging. If we have multiple predecessors,
+// that may not be the case, e.g. because the outer loop latch may be executed
+// even if the inner loop is not executed.
+static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
+ for (PHINode &PHI : LoopNestExit->phis()) {
+ // FIXME: We currently are not able to detect floating point reductions
+ // and have to use floating point PHIs as a proxy to prevent
+ // interchanging in the presence of floating point reductions.
+ if (PHI.getType()->isFloatingPointTy())
+ return false;
+ for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+ Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+ if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+ continue;
+
+ // The incoming value is defined in the outer loop latch. Currently we
+ // only support that in case the outer loop latch has a single predecessor.
+ // This guarantees that the outer loop latch is executed if and only if
+ // the inner loop is executed (because tightlyNested() guarantees that the
+ // outer loop header only branches to the inner loop or the outer loop
+ // latch).
+ // FIXME: We could weaken this logic and allow multiple predecessors,
+ // if the values are produced outside the loop latch. We would need
+ // additional logic to update the PHI nodes in the exit block as
+ // well.
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+ return false;
+ }
+ }
+ return true;
+}
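+
+// For illustration (an assumed source pattern): a floating point accumulation
+// such as
+//   double S = 0.0;
+//   for (int i = 0; i < N; ++i)
+//     for (int j = 0; j < M; ++j)
+//       S += B[j][i];
+// leaves a float-typed LCSSA PHI in the loop nest exit, which the FIXME above
+// conservatively rejects.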
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+ LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops due to dependences.";
+ });
+ return false;
+ }
+ // Check if outer and inner loop contain legal instructions only.
+ for (auto *BB : OuterLoop->blocks())
+ for (Instruction &I : BB->instructionsWithoutDebug())
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // readnone functions do not prevent interchanging.
+ if (CI->doesNotReadMemory())
+ continue;
+ LLVM_DEBUG(
+ dbgs() << "Loops with call instructions cannot be interchanged "
+                   << "safely.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
+ CI->getDebugLoc(),
+ CI->getParent())
+ << "Cannot interchange loops due to call instruction.";
+ });
+
+ return false;
+ }
+
+  // TODO: Lift these restrictions. For now, bail out if the loops cannot be
+  // interchanged due to current limitations in the transform.
+ if (currentLimitations()) {
+ LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ return false;
+ }
+
+ // Check if the loops are tightly nested.
+ if (!tightlyNested(OuterLoop, InnerLoop)) {
+ LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops because they are not tightly "
+ "nested.";
+ });
+ return false;
+ }
+
+ if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
+ OuterInnerReductions)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
+ if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+ unsigned GoodOrder, BadOrder;
+ BadOrder = GoodOrder = 0;
+ for (BasicBlock *BB : InnerLoop->blocks()) {
+ for (Instruction &Ins : *BB) {
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+ unsigned NumOp = GEP->getNumOperands();
+ bool FoundInnerInduction = false;
+ bool FoundOuterInduction = false;
+ for (unsigned i = 0; i < NumOp; ++i) {
// Skip operands that are not SCEV-able.
if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
continue;
- const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
- if (!AR)
- continue;
-
- // If we find the inner induction after an outer induction e.g.
- // for(int i=0;i<N;i++)
- // for(int j=0;j<N;j++)
- // A[i][j] = A[i-1][j-1]+k;
- // then it is a good order.
- if (AR->getLoop() == InnerLoop) {
- // We found an InnerLoop induction after OuterLoop induction. It is
- // a good order.
- FoundInnerInduction = true;
- if (FoundOuterInduction) {
- GoodOrder++;
- break;
- }
- }
- // If we find the outer induction after an inner induction e.g.
- // for(int i=0;i<N;i++)
- // for(int j=0;j<N;j++)
- // A[j][i] = A[j-1][i-1]+k;
- // then it is a bad order.
- if (AR->getLoop() == OuterLoop) {
- // We found an OuterLoop induction after InnerLoop induction. It is
- // a bad order.
- FoundOuterInduction = true;
- if (FoundInnerInduction) {
- BadOrder++;
- break;
- }
- }
- }
- }
- }
- }
- return GoodOrder - BadOrder;
-}
-
-static bool isProfitableForVectorization(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
- // TODO: Improve this heuristic to catch more cases.
- // If the inner loop is loop independent or doesn't carry any dependency it is
- // profitable to move this to outer position.
- for (auto &Row : DepMatrix) {
- if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
- return false;
- // TODO: We need to improve this heuristic.
- if (Row[OuterLoopId] != '=')
- return false;
- }
- // If outer loop has dependence and inner loop is loop independent then it is
- // profitable to interchange to enable parallelism.
- // If there are no dependences, interchanging will not improve anything.
- return !DepMatrix.empty();
-}
-
-bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
- // TODO: Add better profitability checks.
- // e.g
- // 1) Construct dependency matrix and move the one with no loop carried dep
- // inside to enable vectorization.
-
- // This is rough cost estimation algorithm. It counts the good and bad order
- // of induction variables in the instruction and allows reordering if number
- // of bad orders is more than good.
- int Cost = getInstrOrderCost();
- LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
- if (Cost < -LoopInterchangeCostThreshold)
- return true;
-
- // It is not profitable as per current cache profitability model. But check if
- // we can move this loop outside to improve parallelism.
- if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
- return true;
-
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Interchanging loops is too costly (cost="
- << ore::NV("Cost", Cost) << ", threshold="
- << ore::NV("Threshold", LoopInterchangeCostThreshold)
- << ") and it does not improve parallelism.";
- });
- return false;
-}
-
-void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
- Loop *InnerLoop) {
- for (Loop *L : *OuterLoop)
- if (L == InnerLoop) {
- OuterLoop->removeChildLoop(L);
- return;
- }
- llvm_unreachable("Couldn't find loop");
-}
-
-/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
-/// new inner and outer loop after interchanging: NewInner is the original
-/// outer loop and NewOuter is the original inner loop.
-///
-/// Before interchanging, we have the following structure
-/// Outer preheader
-// Outer header
-// Inner preheader
-// Inner header
-// Inner body
-// Inner latch
-// outer bbs
-// Outer latch
-//
-// After interchanging:
-// Inner preheader
-// Inner header
-// Outer preheader
-// Outer header
-// Inner body
-// outer bbs
-// Outer latch
-// Inner latch
-void LoopInterchangeTransform::restructureLoops(
- Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
- BasicBlock *OrigOuterPreHeader) {
- Loop *OuterLoopParent = OuterLoop->getParentLoop();
- // The original inner loop preheader moves from the new inner loop to
- // the parent loop, if there is one.
- NewInner->removeBlockFromLoop(OrigInnerPreHeader);
- LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
-
- // Switch the loop levels.
- if (OuterLoopParent) {
- // Remove the loop from its parent loop.
- removeChildLoop(OuterLoopParent, NewInner);
- removeChildLoop(NewInner, NewOuter);
- OuterLoopParent->addChildLoop(NewOuter);
- } else {
- removeChildLoop(NewInner, NewOuter);
- LI->changeTopLevelLoop(NewInner, NewOuter);
- }
+ const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+ if (!AR)
+ continue;
+
+ // If we find the inner induction after an outer induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[i][j] = A[i-1][j-1]+k;
+ // then it is a good order.
+ if (AR->getLoop() == InnerLoop) {
+ // We found an InnerLoop induction after OuterLoop induction. It is
+ // a good order.
+ FoundInnerInduction = true;
+ if (FoundOuterInduction) {
+ GoodOrder++;
+ break;
+ }
+ }
+ // If we find the outer induction after an inner induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[j][i] = A[j-1][i-1]+k;
+ // then it is a bad order.
+ if (AR->getLoop() == OuterLoop) {
+ // We found an OuterLoop induction after InnerLoop induction. It is
+ // a bad order.
+ FoundOuterInduction = true;
+ if (FoundInnerInduction) {
+ BadOrder++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return GoodOrder - BadOrder;
+}
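+
+// Worked example (illustrative): with i as the outer and j as the inner
+// induction variable, a body containing only
+//   A[j][i] = A[j-1][i-1] + k;
+// has two GEPs (the load's and the store's), and each visits the inner-loop
+// AddRec before the outer-loop one, so BadOrder becomes 2, GoodOrder stays 0,
+// and the function returns -2.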
+
+static bool isProfitableForVectorization(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+  // TODO: Improve this heuristic to catch more cases.
+  // If the inner loop is loop independent or doesn't carry any dependence, it
+  // is profitable to move it to the outer position.
+ for (auto &Row : DepMatrix) {
+ if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
+ return false;
+ // TODO: We need to improve this heuristic.
+ if (Row[OuterLoopId] != '=')
+ return false;
+ }
+  // If the outer loop carries a dependence and the inner loop is loop
+  // independent, then it is profitable to interchange to enable parallelism.
+  // If there are no dependences, interchanging will not improve anything.
+ return !DepMatrix.empty();
+}
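+
+// For illustration: each dependence row has one direction entry per loop. A
+// row whose entry at OuterLoopId is '=' and whose entry at InnerLoopId is 'I'
+// or 'S' passes the checks above, so a non-empty matrix made up entirely of
+// such rows makes moving the inner loop outwards look profitable.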
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ // TODO: Add better profitability checks.
+  // e.g.
+  // 1) Construct the dependency matrix and move the loop with no loop-carried
+  // dependences inside to enable vectorization.
+
+  // This is a rough cost estimation algorithm. It counts the good and bad
+  // orders of induction variables in the instructions and allows reordering if
+  // the number of bad orders exceeds the number of good ones.
+ int Cost = getInstrOrderCost();
+ LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ if (Cost < -LoopInterchangeCostThreshold)
+ return true;
+
+  // It is not profitable as per the current cache profitability model, but
+  // check if we can move this loop outside to improve parallelism.
+ if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
+ return true;
+
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Interchanging loops is too costly (cost="
+ << ore::NV("Cost", Cost) << ", threshold="
+ << ore::NV("Threshold", LoopInterchangeCostThreshold)
+ << ") and it does not improve parallelism.";
+ });
+ return false;
+}
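+
+// Worked example (assuming the default cost threshold of 0): with GoodOrder = 0
+// and BadOrder = 2, getInstrOrderCost() returns -2, which is below
+// -LoopInterchangeCostThreshold, so the interchange is reported profitable
+// without consulting the vectorization heuristic.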
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+ Loop *InnerLoop) {
+ for (Loop *L : *OuterLoop)
+ if (L == InnerLoop) {
+ OuterLoop->removeChildLoop(L);
+ return;
+ }
+ llvm_unreachable("Couldn't find loop");
+}
+
+/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
+/// new inner and outer loop after interchanging: NewInner is the original
+/// outer loop and NewOuter is the original inner loop.
+///
+/// Before interchanging, we have the following structure
+/// Outer preheader
+// Outer header
+// Inner preheader
+// Inner header
+// Inner body
+// Inner latch
+// outer bbs
+// Outer latch
+//
+// After interchanging:
+// Inner preheader
+// Inner header
+// Outer preheader
+// Outer header
+// Inner body
+// outer bbs
+// Outer latch
+// Inner latch
+void LoopInterchangeTransform::restructureLoops(
+ Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader) {
+ Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ // The original inner loop preheader moves from the new inner loop to
+ // the parent loop, if there is one.
+ NewInner->removeBlockFromLoop(OrigInnerPreHeader);
+ LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
+
+ // Switch the loop levels.
+ if (OuterLoopParent) {
+ // Remove the loop from its parent loop.
+ removeChildLoop(OuterLoopParent, NewInner);
+ removeChildLoop(NewInner, NewOuter);
+ OuterLoopParent->addChildLoop(NewOuter);
+ } else {
+ removeChildLoop(NewInner, NewOuter);
+ LI->changeTopLevelLoop(NewInner, NewOuter);
+ }
while (!NewOuter->isInnermost())
- NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
- NewOuter->addChildLoop(NewInner);
-
- // BBs from the original inner loop.
- SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
-
- // Add BBs from the original outer loop to the original inner loop (excluding
- // BBs already in inner loop)
- for (BasicBlock *BB : NewInner->blocks())
- if (LI->getLoopFor(BB) == NewInner)
- NewOuter->addBlockEntry(BB);
-
- // Now remove inner loop header and latch from the new inner loop and move
- // other BBs (the loop body) to the new inner loop.
- BasicBlock *OuterHeader = NewOuter->getHeader();
- BasicBlock *OuterLatch = NewOuter->getLoopLatch();
- for (BasicBlock *BB : OrigInnerBBs) {
- // Nothing will change for BBs in child loops.
- if (LI->getLoopFor(BB) != NewOuter)
- continue;
- // Remove the new outer loop header and latch from the new inner loop.
- if (BB == OuterHeader || BB == OuterLatch)
- NewInner->removeBlockFromLoop(BB);
- else
- LI->changeLoopFor(BB, NewInner);
- }
-
- // The preheader of the original outer loop becomes part of the new
- // outer loop.
- NewOuter->addBlockEntry(OrigOuterPreHeader);
- LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
-
- // Tell SE that we move the loops around.
- SE->forgetLoop(NewOuter);
- SE->forgetLoop(NewInner);
-}
-
-bool LoopInterchangeTransform::transform() {
- bool Transformed = false;
- Instruction *InnerIndexVar;
-
- if (InnerLoop->getSubLoops().empty()) {
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
- PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
- if (!InductionPHI) {
- LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
- return false;
- }
-
- if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
- else
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
-
- // Ensure that InductionPHI is the first Phi node.
- if (&InductionPHI->getParent()->front() != InductionPHI)
- InductionPHI->moveBefore(&InductionPHI->getParent()->front());
-
- // Create a new latch block for the inner loop. We split at the
- // current latch's terminator and then move the condition and all
- // operands that are not either loop-invariant or the induction PHI into the
- // new latch block.
- BasicBlock *NewLatch =
- SplitBlock(InnerLoop->getLoopLatch(),
- InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
-
- SmallSetVector<Instruction *, 4> WorkList;
- unsigned i = 0;
- auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
- for (; i < WorkList.size(); i++) {
- // Duplicate instruction and move it the new latch. Update uses that
- // have been moved.
- Instruction *NewI = WorkList[i]->clone();
- NewI->insertBefore(NewLatch->getFirstNonPHI());
- assert(!NewI->mayHaveSideEffects() &&
- "Moving instructions with side-effects may change behavior of "
- "the loop nest!");
- for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end();
- UI != UE;) {
- Use &U = *UI++;
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (!InnerLoop->contains(UserI->getParent()) ||
- UserI->getParent() == NewLatch || UserI == InductionPHI)
- U.set(NewI);
- }
- // Add operands of moved instruction to the worklist, except if they are
- // outside the inner loop or are the induction PHI.
- for (Value *Op : WorkList[i]->operands()) {
- Instruction *OpI = dyn_cast<Instruction>(Op);
- if (!OpI ||
- this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
- OpI == InductionPHI)
- continue;
- WorkList.insert(OpI);
- }
- }
- };
-
- // FIXME: Should we interchange when we have a constant condition?
- Instruction *CondI = dyn_cast<Instruction>(
- cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
- ->getCondition());
- if (CondI)
- WorkList.insert(CondI);
- MoveInstructions();
- WorkList.insert(cast<Instruction>(InnerIndexVar));
- MoveInstructions();
-
- // Splits the inner loops phi nodes out into a separate basic block.
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
- SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
- LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
- }
-
+ NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
+ NewOuter->addChildLoop(NewInner);
+
+ // BBs from the original inner loop.
+ SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
+
+ // Add BBs from the original outer loop to the original inner loop (excluding
+ // BBs already in inner loop)
+ for (BasicBlock *BB : NewInner->blocks())
+ if (LI->getLoopFor(BB) == NewInner)
+ NewOuter->addBlockEntry(BB);
+
+ // Now remove inner loop header and latch from the new inner loop and move
+ // other BBs (the loop body) to the new inner loop.
+ BasicBlock *OuterHeader = NewOuter->getHeader();
+ BasicBlock *OuterLatch = NewOuter->getLoopLatch();
+ for (BasicBlock *BB : OrigInnerBBs) {
+ // Nothing will change for BBs in child loops.
+ if (LI->getLoopFor(BB) != NewOuter)
+ continue;
+ // Remove the new outer loop header and latch from the new inner loop.
+ if (BB == OuterHeader || BB == OuterLatch)
+ NewInner->removeBlockFromLoop(BB);
+ else
+ LI->changeLoopFor(BB, NewInner);
+ }
+
+ // The preheader of the original outer loop becomes part of the new
+ // outer loop.
+ NewOuter->addBlockEntry(OrigOuterPreHeader);
+ LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
+
+  // Tell SE that we have moved the loops around.
+ SE->forgetLoop(NewOuter);
+ SE->forgetLoop(NewInner);
+}
+
+bool LoopInterchangeTransform::transform() {
+ bool Transformed = false;
+ Instruction *InnerIndexVar;
+
+ if (InnerLoop->getSubLoops().empty()) {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
+ PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+ if (!InductionPHI) {
+ LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ return false;
+ }
+
+ if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+ else
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+ // Ensure that InductionPHI is the first Phi node.
+ if (&InductionPHI->getParent()->front() != InductionPHI)
+ InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+
+    // Create a new latch block for the inner loop. We split at the
+    // current latch's terminator and then move the condition and all
+    // operands that are neither loop-invariant nor the induction PHI into
+    // the new latch block.
+ BasicBlock *NewLatch =
+ SplitBlock(InnerLoop->getLoopLatch(),
+ InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
+
+ SmallSetVector<Instruction *, 4> WorkList;
+ unsigned i = 0;
+ auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+ for (; i < WorkList.size(); i++) {
+        // Duplicate the instruction and move it to the new latch. Update uses
+        // that have been moved.
+ Instruction *NewI = WorkList[i]->clone();
+ NewI->insertBefore(NewLatch->getFirstNonPHI());
+ assert(!NewI->mayHaveSideEffects() &&
+ "Moving instructions with side-effects may change behavior of "
+ "the loop nest!");
+ for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (!InnerLoop->contains(UserI->getParent()) ||
+ UserI->getParent() == NewLatch || UserI == InductionPHI)
+ U.set(NewI);
+ }
+        // Add operands of the moved instruction to the worklist, unless they
+        // are outside the inner loop or are the induction PHI.
+ for (Value *Op : WorkList[i]->operands()) {
+ Instruction *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI ||
+ this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
+ OpI == InductionPHI)
+ continue;
+ WorkList.insert(OpI);
+ }
+ }
+ };
+
+ // FIXME: Should we interchange when we have a constant condition?
+ Instruction *CondI = dyn_cast<Instruction>(
+ cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
+ ->getCondition());
+ if (CondI)
+ WorkList.insert(CondI);
+ MoveInstructions();
+ WorkList.insert(cast<Instruction>(InnerIndexVar));
+ MoveInstructions();
+
+    // Split the inner loop's PHI nodes out into a separate basic block.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+ LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
+ }
+
// Instructions in the original inner loop preheader may depend on values
// defined in the outer loop header. Move them there, because the original
// inner loop preheader will become the entry into the interchanged loop nest.
@@ -1321,295 +1321,295 @@ bool LoopInterchangeTransform::transform() {
I.moveBefore(OuterLoopHeader->getTerminator());
}
- Transformed |= adjustLoopLinks();
- if (!Transformed) {
- LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
- return false;
- }
-
- return true;
-}
-
-/// \brief Move all instructions except the terminator from FromBB right before
-/// InsertBefore
-static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
- auto &ToList = InsertBefore->getParent()->getInstList();
- auto &FromList = FromBB->getInstList();
-
- ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
- FromBB->getTerminator()->getIterator());
-}
-
-/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
-static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
- // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
- // from BB1 afterwards.
- auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
- SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
- for (Instruction *I : TempInstrs)
- I->removeFromParent();
-
- // Move instructions from BB2 to BB1.
- moveBBContents(BB2, BB1->getTerminator());
-
- // Move instructions from TempInstrs to BB2.
- for (Instruction *I : TempInstrs)
- I->insertBefore(BB2->getTerminator());
-}
-
-// Update BI to jump to NewBB instead of OldBB. Records updates to the
-// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
-// \p OldBB is exactly once in BI's successor list.
-static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
- BasicBlock *NewBB,
- std::vector<DominatorTree::UpdateType> &DTUpdates,
- bool MustUpdateOnce = true) {
- assert((!MustUpdateOnce ||
- llvm::count_if(successors(BI),
- [OldBB](BasicBlock *BB) {
- return BB == OldBB;
- }) == 1) && "BI must jump to OldBB exactly once.");
- bool Changed = false;
- for (Use &Op : BI->operands())
- if (Op == OldBB) {
- Op.set(NewBB);
- Changed = true;
- }
-
- if (Changed) {
- DTUpdates.push_back(
- {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
- DTUpdates.push_back(
- {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
- }
- assert(Changed && "Expected a successor to be updated");
-}
-
-// Move Lcssa PHIs to the right place.
-static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
- BasicBlock *InnerLatch, BasicBlock *OuterHeader,
- BasicBlock *OuterLatch, BasicBlock *OuterExit,
- Loop *InnerLoop, LoopInfo *LI) {
-
- // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
- // defined either in the header or latch. Those blocks will become header and
- // latch of the new outer loop, and the only possible users can PHI nodes
- // in the exit block of the loop nest or the outer loop header (reduction
- // PHIs, in that case, the incoming value must be defined in the inner loop
- // header). We can just substitute the user with the incoming value and remove
- // the PHI.
- for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
- assert(P.getNumIncomingValues() == 1 &&
- "Only loops with a single exit are supported!");
-
- // Incoming values are guaranteed be instructions currently.
- auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
- // Skip phis with incoming values from the inner loop body, excluding the
- // header and latch.
- if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
- continue;
-
- assert(all_of(P.users(),
- [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
- return (cast<PHINode>(U)->getParent() == OuterHeader &&
- IncI->getParent() == InnerHeader) ||
- cast<PHINode>(U)->getParent() == OuterExit;
- }) &&
- "Can only replace phis iff the uses are in the loop nest exit or "
- "the incoming value is defined in the inner header (it will "
- "dominate all loop blocks after interchanging)");
- P.replaceAllUsesWith(IncI);
- P.eraseFromParent();
- }
-
- SmallVector<PHINode *, 8> LcssaInnerExit;
- for (PHINode &P : InnerExit->phis())
- LcssaInnerExit.push_back(&P);
-
- SmallVector<PHINode *, 8> LcssaInnerLatch;
- for (PHINode &P : InnerLatch->phis())
- LcssaInnerLatch.push_back(&P);
-
- // Lcssa PHIs for values used outside the inner loop are in InnerExit.
- // If a PHI node has users outside of InnerExit, it has a use outside the
- // interchanged loop and we have to preserve it. We move these to
- // InnerLatch, which will become the new exit block for the innermost
- // loop after interchanging.
- for (PHINode *P : LcssaInnerExit)
- P->moveBefore(InnerLatch->getFirstNonPHI());
-
- // If the inner loop latch contains LCSSA PHIs, those come from a child loop
- // and we have to move them to the new inner latch.
- for (PHINode *P : LcssaInnerLatch)
- P->moveBefore(InnerExit->getFirstNonPHI());
-
- // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
- // incoming values defined in the outer loop, we have to add a new PHI
- // in the inner loop latch, which became the exit block of the outer loop,
- // after interchanging.
- if (OuterExit) {
- for (PHINode &P : OuterExit->phis()) {
- if (P.getNumIncomingValues() != 1)
- continue;
- // Skip Phis with incoming values defined in the inner loop. Those should
- // already have been updated.
- auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
- if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
- continue;
-
- PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
- NewPhi->setIncomingValue(0, P.getIncomingValue(0));
- NewPhi->setIncomingBlock(0, OuterLatch);
- NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
- P.setIncomingValue(0, NewPhi);
- }
- }
-
- // Now adjust the incoming blocks for the LCSSA PHIs.
- // For PHIs moved from Inner's exit block, we need to replace Inner's latch
- // with the new latch.
- InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
-}
-
-bool LoopInterchangeTransform::adjustLoopBranches() {
- LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
- std::vector<DominatorTree::UpdateType> DTUpdates;
-
- BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
-
- assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
- InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
- InnerLoopPreHeader && "Guaranteed by loop-simplify form");
- // Ensure that both preheaders do not contain PHI nodes and have single
- // predecessors. This allows us to move them easily. We use
- // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
- // preheaders do not satisfy those conditions.
- if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
- !OuterLoopPreHeader->getUniquePredecessor())
- OuterLoopPreHeader =
- InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
- if (InnerLoopPreHeader == OuterLoop->getHeader())
- InnerLoopPreHeader =
- InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
-
- // Adjust the loop preheader
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
- BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
- BasicBlock *InnerLoopLatchPredecessor =
- InnerLoopLatch->getUniquePredecessor();
- BasicBlock *InnerLoopLatchSuccessor;
- BasicBlock *OuterLoopLatchSuccessor;
-
- BranchInst *OuterLoopLatchBI =
- dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
- BranchInst *InnerLoopLatchBI =
- dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
- BranchInst *OuterLoopHeaderBI =
- dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
- BranchInst *InnerLoopHeaderBI =
- dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
-
- if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
- !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
- !InnerLoopHeaderBI)
- return false;
-
- BranchInst *InnerLoopLatchPredecessorBI =
- dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
- BranchInst *OuterLoopPredecessorBI =
- dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
-
- if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
- return false;
- BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
- if (!InnerLoopHeaderSuccessor)
- return false;
-
- // Adjust Loop Preheader and headers.
- // The branches in the outer loop predecessor and the outer loop header can
- // be unconditional branches or conditional branches with duplicates. Consider
- // this when updating the successors.
- updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
- InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
- // The outer loop header might or might not branch to the outer latch.
- // We are guaranteed to branch to the inner loop preheader.
+ Transformed |= adjustLoopLinks();
+ if (!Transformed) {
+ LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Move all instructions except the terminator from FromBB right before
+/// InsertBefore
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+ auto &ToList = InsertBefore->getParent()->getInstList();
+ auto &FromList = FromBB->getInstList();
+
+ ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
+ FromBB->getTerminator()->getIterator());
+}
+
+/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
+static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
+ // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
+ // from BB1 afterwards.
+ auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
+ SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
+ for (Instruction *I : TempInstrs)
+ I->removeFromParent();
+
+ // Move instructions from BB2 to BB1.
+ moveBBContents(BB2, BB1->getTerminator());
+
+ // Move instructions from TempInstrs to BB2.
+ for (Instruction *I : TempInstrs)
+ I->insertBefore(BB2->getTerminator());
+}
+
+// Update BI to jump to NewBB instead of OldBB. Records updates to the
+// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
+// \p OldBB occurs exactly once in BI's successor list.
+static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ std::vector<DominatorTree::UpdateType> &DTUpdates,
+ bool MustUpdateOnce = true) {
+ assert((!MustUpdateOnce ||
+ llvm::count_if(successors(BI),
+ [OldBB](BasicBlock *BB) {
+ return BB == OldBB;
+ }) == 1) && "BI must jump to OldBB exactly once.");
+ bool Changed = false;
+ for (Use &Op : BI->operands())
+ if (Op == OldBB) {
+ Op.set(NewBB);
+ Changed = true;
+ }
+
+ if (Changed) {
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
+ }
+ assert(Changed && "Expected a successor to be updated");
+}
+
+// Move Lcssa PHIs to the right place.
+static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
+ BasicBlock *InnerLatch, BasicBlock *OuterHeader,
+ BasicBlock *OuterLatch, BasicBlock *OuterExit,
+ Loop *InnerLoop, LoopInfo *LI) {
+
+  // Deal with LCSSA PHI nodes in the exit block of the inner loop that are
+  // defined either in the header or the latch. Those blocks will become the
+  // header and latch of the new outer loop, and the only possible users can be
+  // PHI nodes in the exit block of the loop nest or the outer loop header
+  // (reduction PHIs; in that case, the incoming value must be defined in the
+  // inner loop header). We can just substitute the user with the incoming
+  // value and remove the PHI.
+ for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
+ assert(P.getNumIncomingValues() == 1 &&
+ "Only loops with a single exit are supported!");
+
+    // Incoming values are currently guaranteed to be instructions.
+ auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
+ // Skip phis with incoming values from the inner loop body, excluding the
+ // header and latch.
+ if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
+ continue;
+
+ assert(all_of(P.users(),
+ [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
+ return (cast<PHINode>(U)->getParent() == OuterHeader &&
+ IncI->getParent() == InnerHeader) ||
+ cast<PHINode>(U)->getParent() == OuterExit;
+ }) &&
+ "Can only replace phis iff the uses are in the loop nest exit or "
+ "the incoming value is defined in the inner header (it will "
+ "dominate all loop blocks after interchanging)");
+ P.replaceAllUsesWith(IncI);
+ P.eraseFromParent();
+ }
+
+ SmallVector<PHINode *, 8> LcssaInnerExit;
+ for (PHINode &P : InnerExit->phis())
+ LcssaInnerExit.push_back(&P);
+
+ SmallVector<PHINode *, 8> LcssaInnerLatch;
+ for (PHINode &P : InnerLatch->phis())
+ LcssaInnerLatch.push_back(&P);
+
+ // Lcssa PHIs for values used outside the inner loop are in InnerExit.
+ // If a PHI node has users outside of InnerExit, it has a use outside the
+ // interchanged loop and we have to preserve it. We move these to
+ // InnerLatch, which will become the new exit block for the innermost
+ // loop after interchanging.
+ for (PHINode *P : LcssaInnerExit)
+ P->moveBefore(InnerLatch->getFirstNonPHI());
+
+ // If the inner loop latch contains LCSSA PHIs, those come from a child loop
+ // and we have to move them to the new inner latch.
+ for (PHINode *P : LcssaInnerLatch)
+ P->moveBefore(InnerExit->getFirstNonPHI());
+
+ // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
+ // incoming values defined in the outer loop, we have to add a new PHI
+ // in the inner loop latch, which became the exit block of the outer loop,
+ // after interchanging.
+ if (OuterExit) {
+ for (PHINode &P : OuterExit->phis()) {
+ if (P.getNumIncomingValues() != 1)
+ continue;
+ // Skip Phis with incoming values defined in the inner loop. Those should
+ // already have been updated.
+ auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
+ if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
+ continue;
+
+ PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
+ NewPhi->setIncomingValue(0, P.getIncomingValue(0));
+ NewPhi->setIncomingBlock(0, OuterLatch);
+ NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
+ P.setIncomingValue(0, NewPhi);
+ }
+ }
+
+ // Now adjust the incoming blocks for the LCSSA PHIs.
+ // For PHIs moved from Inner's exit block, we need to replace Inner's latch
+ // with the new latch.
+ InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+ LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
+ std::vector<DominatorTree::UpdateType> DTUpdates;
+
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+ assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
+ InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
+ InnerLoopPreHeader && "Guaranteed by loop-simplify form");
+ // Ensure that both preheaders do not contain PHI nodes and have single
+ // predecessors. This allows us to move them easily. We use
+  // InsertPreheaderForLoop to create an 'extra' preheader, if the existing
+ // preheaders do not satisfy those conditions.
+ if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
+ !OuterLoopPreHeader->getUniquePredecessor())
+ OuterLoopPreHeader =
+ InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
+ if (InnerLoopPreHeader == OuterLoop->getHeader())
+ InnerLoopPreHeader =
+ InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
+
+ // Adjust the loop preheader
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSuccessor)
+ return false;
+
+  // Adjust the loop preheaders and headers.
+ // The branches in the outer loop predecessor and the outer loop header can
+ // be unconditional branches or conditional branches with duplicates. Consider
+ // this when updating the successors.
+ updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
+ InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
+ // The outer loop header might or might not branch to the outer latch.
+ // We are guaranteed to branch to the inner loop preheader.
if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch))
- updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates,
- /*MustUpdateOnce=*/false);
- updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
- InnerLoopHeaderSuccessor, DTUpdates,
- /*MustUpdateOnce=*/false);
-
- // Adjust reduction PHI's now that the incoming block has changed.
- InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
- OuterLoopHeader);
-
- updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
- OuterLoopPreHeader, DTUpdates);
-
- // -------------Adjust loop latches-----------
- if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
- InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
- else
- InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
-
- updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
- InnerLoopLatchSuccessor, DTUpdates);
-
-
- if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
- OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
- else
- OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
-
- updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
- OuterLoopLatchSuccessor, DTUpdates);
- updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
- DTUpdates);
-
- DT->applyUpdates(DTUpdates);
- restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
- OuterLoopPreHeader);
-
- moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
- OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
- InnerLoop, LI);
- // For PHIs in the exit block of the outer loop, outer's latch has been
- // replaced by Inners'.
- OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
-
- // Now update the reduction PHIs in the inner and outer loop headers.
- SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
+ updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates,
+ /*MustUpdateOnce=*/false);
+ updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
+ InnerLoopHeaderSuccessor, DTUpdates,
+ /*MustUpdateOnce=*/false);
+
+  // Adjust reduction PHIs now that the incoming block has changed.
+ InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
+ OuterLoopHeader);
+
+ updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
+ OuterLoopPreHeader, DTUpdates);
+
+ // -------------Adjust loop latches-----------
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
+ InnerLoopLatchSuccessor, DTUpdates);
+
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
+ OuterLoopLatchSuccessor, DTUpdates);
+ updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
+ DTUpdates);
+
+ DT->applyUpdates(DTUpdates);
+ restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
+ OuterLoopPreHeader);
+
+ moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
+ OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
+ InnerLoop, LI);
+  // For PHIs in the exit block of the outer loop, the outer loop's latch has
+  // been replaced by the inner loop's latch.
+ OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
+
+ // Now update the reduction PHIs in the inner and outer loop headers.
+ SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
for (PHINode &PHI : drop_begin(InnerLoopHeader->phis()))
- InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
for (PHINode &PHI : drop_begin(OuterLoopHeader->phis()))
- OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
-
- auto &OuterInnerReductions = LIL.getOuterInnerReductions();
- (void)OuterInnerReductions;
-
- // Now move the remaining reduction PHIs from outer to inner loop header and
- // vice versa. The PHI nodes must be part of a reduction across the inner and
- // outer loop and all the remains to do is and updating the incoming blocks.
- for (PHINode *PHI : OuterLoopPHIs) {
- PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
- assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
- }
- for (PHINode *PHI : InnerLoopPHIs) {
- PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
- assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
- }
-
- // Update the incoming blocks for moved PHI nodes.
- OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
- OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
- InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
- InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
-
+ OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+
+ auto &OuterInnerReductions = LIL.getOuterInnerReductions();
+ (void)OuterInnerReductions;
+
+ // Now move the remaining reduction PHIs from outer to inner loop header and
+ // vice versa. The PHI nodes must be part of a reduction across the inner and
+  // outer loop; all that remains to be done is updating the incoming blocks.
+ for (PHINode *PHI : OuterLoopPHIs) {
+ PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
+ assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
+ }
+ for (PHINode *PHI : InnerLoopPHIs) {
+ PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
+ assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
+ }
+
+ // Update the incoming blocks for moved PHI nodes.
+ OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
+ OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
+ InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
+ InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
+
// Values defined in the outer loop header could be used in the inner loop
// latch. In that case, we need to create LCSSA phis for them, because after
// interchanging they will be defined in the new inner loop and used in the
@@ -1621,27 +1621,27 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
MayNeedLCSSAPhis.push_back(&I);
formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder);
- return true;
-}
-
-bool LoopInterchangeTransform::adjustLoopLinks() {
- // Adjust all branches in the inner and outer loop.
- bool Changed = adjustLoopBranches();
- if (Changed) {
- // We have interchanged the preheaders so we need to interchange the data in
- // the preheaders as well. This is because the content of the inner
- // preheader was previously executed inside the outer loop.
- BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
- }
- return Changed;
-}
-
+ return true;
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+ // Adjust all branches in the inner and outer loop.
+ bool Changed = adjustLoopBranches();
+ if (Changed) {
+ // We have interchanged the preheaders so we need to interchange the data in
+ // the preheaders as well. This is because the content of the inner
+ // preheader was previously executed inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
+ }
+ return Changed;
+}
+
/// Main LoopInterchange Pass.
struct LoopInterchangeLegacyPass : public LoopPass {
static char ID;
-
+
LoopInterchangeLegacyPass() : LoopPass(ID) {
initializeLoopInterchangeLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -1670,14 +1670,14 @@ struct LoopInterchangeLegacyPass : public LoopPass {
char LoopInterchangeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopInterchangeLegacyPass, "loop-interchange",
- "Interchanges loops for cache reuse", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-
+ "Interchanges loops for cache reuse", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+
INITIALIZE_PASS_END(LoopInterchangeLegacyPass, "loop-interchange",
- "Interchanges loops for cache reuse", false, false)
-
+ "Interchanges loops for cache reuse", false, false)
+
Pass *llvm::createLoopInterchangePass() {
return new LoopInterchangeLegacyPass();
}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp
index e82d9f5407..058612149a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -1,510 +1,510 @@
-//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implement a loop-aware load elimination pass.
-//
-// It uses LoopAccessAnalysis to identify loop-carried dependences with a
-// distance of one between stores and loads. These form the candidates for the
-// transformation. The source value of each store then propagated to the user
-// of the corresponding load. This makes the load dead.
-//
-// The pass can also version the loop and add memchecks in order to prove that
-// may-aliasing stores can't change the value in memory before it's read by the
-// load.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads. These form the candidates for the
+// transformation. The source value of each store is then propagated to the user
+// of the corresponding load. This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
+//===----------------------------------------------------------------------===//
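// A minimal source-level sketch of the transformation, assuming a simple int
// array loop; the function names and the payload computation below are
// illustrative only and do not appear anywhere in LLVM.

#include <cstddef>

void beforeLLE(int *A, std::size_t N) {
  for (std::size_t I = 0; I + 1 < N; ++I) {
    int X = A[I];      // load fed by the store of the previous iteration
    A[I] = X + 1;      // some use of the loaded value
    A[I + 1] = (int)I; // store with a dependence distance of one
  }
}

void afterLLE(int *A, std::size_t N) {
  if (N < 2)
    return;
  int Fwd = A[0];      // "load_initial", hoisted into the preheader
  for (std::size_t I = 0; I + 1 < N; ++I) {
    A[I] = Fwd + 1;    // the now-dead load is replaced by the forwarded value
    Fwd = (int)I;      // the stored value, carried across the backedge
    A[I + 1] = Fwd;
  }
}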
+
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include <algorithm>
-#include <cassert>
-#include <forward_list>
-#include <set>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define LLE_OPTION "loop-load-elim"
-#define DEBUG_TYPE LLE_OPTION
-
-static cl::opt<unsigned> CheckPerElim(
- "runtime-check-per-loop-load-elim", cl::Hidden,
- cl::desc("Max number of memchecks allowed per eliminated load on average"),
- cl::init(1));
-
-static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
- "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed for Loop "
- "Load Elimination"));
-
-STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
-
-namespace {
-
-/// Represent a store-to-load forwarding candidate.
-struct StoreToLoadForwardingCandidate {
- LoadInst *Load;
- StoreInst *Store;
-
- StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
- : Load(Load), Store(Store) {}
-
- /// Return true if the dependence from the store to the load has a
- /// distance of one. E.g. A[i+1] = A[i]
- bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
- Loop *L) const {
- Value *LoadPtr = Load->getPointerOperand();
- Value *StorePtr = Store->getPointerOperand();
- Type *LoadPtrType = LoadPtr->getType();
- Type *LoadType = LoadPtrType->getPointerElementType();
-
- assert(LoadPtrType->getPointerAddressSpace() ==
- StorePtr->getType()->getPointerAddressSpace() &&
- LoadType == StorePtr->getType()->getPointerElementType() &&
- "Should be a known dependence");
-
- // Currently we only support accesses with unit stride. FIXME: we should be
- // able to handle non-unit stride as well as long as the stride is equal to
- // the dependence distance.
- if (getPtrStride(PSE, LoadPtr, L) != 1 ||
- getPtrStride(PSE, StorePtr, L) != 1)
- return false;
-
- auto &DL = Load->getParent()->getModule()->getDataLayout();
- unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
-
- auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
- auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
-
- // We don't need to check non-wrapping here because forward/backward
- // dependence wouldn't be valid if these weren't monotonic accesses.
- auto *Dist = cast<SCEVConstant>(
- PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
- const APInt &Val = Dist->getAPInt();
- return Val == TypeByteSize;
- }
-
- Value *getLoadPtr() const { return Load->getPointerOperand(); }
-
-#ifndef NDEBUG
- friend raw_ostream &operator<<(raw_ostream &OS,
- const StoreToLoadForwardingCandidate &Cand) {
- OS << *Cand.Store << " -->\n";
- OS.indent(2) << *Cand.Load << "\n";
- return OS;
- }
-#endif
-};
-
-} // end anonymous namespace
-
-/// Check if the store dominates all latches, so as long as there is no
-/// intervening store this value will be loaded in the next iteration.
-static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
- DominatorTree *DT) {
- SmallVector<BasicBlock *, 8> Latches;
- L->getLoopLatches(Latches);
- return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
- return DT->dominates(StoreBlock, Latch);
- });
-}
-
-/// Return true if the load is not executed on all paths in the loop.
-static bool isLoadConditional(LoadInst *Load, Loop *L) {
- return Load->getParent() != L->getHeader();
-}
-
-namespace {
-
-/// The per-loop class that does most of the work.
-class LoadEliminationForLoop {
-public:
- LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
- DominatorTree *DT, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo* PSI)
- : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}
-
- /// Look through the loop-carried and loop-independent dependences in
- /// this loop and find store->load dependences.
- ///
- /// Note that no candidate is returned if LAA has failed to analyze the loop
- /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
- std::forward_list<StoreToLoadForwardingCandidate>
- findStoreToLoadDependences(const LoopAccessInfo &LAI) {
- std::forward_list<StoreToLoadForwardingCandidate> Candidates;
-
- const auto *Deps = LAI.getDepChecker().getDependences();
- if (!Deps)
- return Candidates;
-
- // Find store->load dependences (consequently true dep). Both lexically
- // forward and backward dependences qualify. Disqualify loads that have
- // other unknown dependences.
-
- SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
-
- for (const auto &Dep : *Deps) {
- Instruction *Source = Dep.getSource(LAI);
- Instruction *Destination = Dep.getDestination(LAI);
-
- if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
- if (isa<LoadInst>(Source))
- LoadsWithUnknownDepedence.insert(Source);
- if (isa<LoadInst>(Destination))
- LoadsWithUnknownDepedence.insert(Destination);
- continue;
- }
-
- if (Dep.isBackward())
- // Note that the designations source and destination follow the program
- // order, i.e. source is always first. (The direction is given by the
- // DepType.)
- std::swap(Source, Destination);
- else
- assert(Dep.isForward() && "Needs to be a forward dependence");
-
- auto *Store = dyn_cast<StoreInst>(Source);
- if (!Store)
- continue;
- auto *Load = dyn_cast<LoadInst>(Destination);
- if (!Load)
- continue;
-
- // Only propagate the value if they are of the same type.
- if (Store->getPointerOperandType() != Load->getPointerOperandType())
- continue;
-
- Candidates.emplace_front(Load, Store);
- }
-
- if (!LoadsWithUnknownDepedence.empty())
- Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
- return LoadsWithUnknownDepedence.count(C.Load);
- });
-
- return Candidates;
- }
-
- /// Return the index of the instruction according to program order.
- unsigned getInstrIndex(Instruction *Inst) {
- auto I = InstOrder.find(Inst);
- assert(I != InstOrder.end() && "No index for instruction");
- return I->second;
- }
-
- /// If a load has multiple candidates associated (i.e. different
- /// stores), it means that it could be forwarding from multiple stores
- /// depending on control flow. Remove these candidates.
- ///
- /// Here, we rely on LAA to include the relevant loop-independent dependences.
- /// LAA is known to omit these in the very simple case when the read and the
- /// write within an alias set always take place using the *same* pointer.
- ///
- /// However, we know that this is not the case here, i.e. we can rely on LAA
- /// to provide us with loop-independent dependences for the cases we're
- /// interested in. Consider, for example, the case where a loop-independent
- /// dependence S1->S2 invalidates the forwarding S3->S2.
- ///
- /// A[i] = ... (S1)
- /// ... = A[i] (S2)
- /// A[i+1] = ... (S3)
- ///
- /// LAA will perform dependence analysis here because there are two
- /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
- void removeDependencesFromMultipleStores(
- std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
- // If Store is nullptr it means that we have multiple stores forwarding to
- // this store.
- using LoadToSingleCandT =
- DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>;
- LoadToSingleCandT LoadToSingleCand;
-
- for (const auto &Cand : Candidates) {
- bool NewElt;
- LoadToSingleCandT::iterator Iter;
-
- std::tie(Iter, NewElt) =
- LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
- if (!NewElt) {
- const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
- // Already multiple stores forward to this load.
- if (OtherCand == nullptr)
- continue;
-
- // Handle the very basic case when the two stores are in the same block
- // so deciding which one forwards is easy. The later one forwards as
- // long as they both have a dependence distance of one to the load.
- if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
- Cand.isDependenceDistanceOfOne(PSE, L) &&
- OtherCand->isDependenceDistanceOfOne(PSE, L)) {
- // They are in the same block, the later one will forward to the load.
- if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
- OtherCand = &Cand;
- } else
- OtherCand = nullptr;
- }
- }
-
- Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
- if (LoadToSingleCand[Cand.Load] != &Cand) {
- LLVM_DEBUG(
- dbgs() << "Removing from candidates: \n"
- << Cand
- << " The load may have multiple stores forwarding to "
- << "it\n");
- return true;
- }
- return false;
- });
- }
-
- /// Given two pointer operations by their RuntimePointerChecking
- /// indices, return true if they require an alias check.
- ///
- /// We need a check if one is a pointer for a candidate load and the other is
- /// a pointer for a possibly intervening store.
- bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <cassert>
+#include <forward_list>
+#include <set>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+static cl::opt<unsigned> CheckPerElim(
+ "runtime-check-per-loop-load-elim", cl::Hidden,
+ cl::desc("Max number of memchecks allowed per eliminated load on average"),
+ cl::init(1));
+
+static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
+ "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Load Elimination"));
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// Represent a store-to-load forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+ LoadInst *Load;
+ StoreInst *Store;
+
+ StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+ : Load(Load), Store(Store) {}
+
+ /// Return true if the dependence from the store to the load has a
+ /// distance of one. E.g. A[i+1] = A[i]
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
+ Loop *L) const {
+ Value *LoadPtr = Load->getPointerOperand();
+ Value *StorePtr = Store->getPointerOperand();
+ Type *LoadPtrType = LoadPtr->getType();
+ Type *LoadType = LoadPtrType->getPointerElementType();
+
+ assert(LoadPtrType->getPointerAddressSpace() ==
+ StorePtr->getType()->getPointerAddressSpace() &&
+ LoadType == StorePtr->getType()->getPointerElementType() &&
+ "Should be a known dependence");
+
+ // Currently we only support accesses with unit stride. FIXME: we should be
+ // able to handle non-unit stride as well as long as the stride is equal to
+ // the dependence distance.
+ if (getPtrStride(PSE, LoadPtr, L) != 1 ||
+ getPtrStride(PSE, StorePtr, L) != 1)
+ return false;
+
+ auto &DL = Load->getParent()->getModule()->getDataLayout();
+ unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+ auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
+ auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
+
+ // We don't need to check non-wrapping here because forward/backward
+ // dependence wouldn't be valid if these weren't monotonic accesses.
+ auto *Dist = cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+ const APInt &Val = Dist->getAPInt();
+ return Val == TypeByteSize;
+ }
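  // Worked example (assuming an i32 element type, not taken from the patch):
  // for "A[i + 1] = A[i]" the load pointer is &A[i] and the store pointer is
  // &A[i + 1], both unit-stride add recurrences, so Dist is 4 bytes, which
  // matches DL.getTypeAllocSize(i32), and the check above succeeds. A pattern
  // like "A[i + 2] = A[i]" yields Dist == 8 and is rejected here, even though
  // forwarding across two iterations is conceivable.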
+
+ Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const StoreToLoadForwardingCandidate &Cand) {
+ OS << *Cand.Store << " -->\n";
+ OS.indent(2) << *Cand.Load << "\n";
+ return OS;
+ }
+#endif
+};
+
+} // end anonymous namespace
+
+/// Check if the store dominates all latches, so as long as there is no
+/// intervening store this value will be loaded in the next iteration.
+static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+ DominatorTree *DT) {
+ SmallVector<BasicBlock *, 8> Latches;
+ L->getLoopLatches(Latches);
+ return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
+ return DT->dominates(StoreBlock, Latch);
+ });
+}
+
+/// Return true if the load is not executed on all paths in the loop.
+static bool isLoadConditional(LoadInst *Load, Loop *L) {
+ return Load->getParent() != L->getHeader();
+}
+
+namespace {
+
+/// The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+ LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+ DominatorTree *DT, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo* PSI)
+ : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}
+
+ /// Look through the loop-carried and loop-independent dependences in
+ /// this loop and find store->load dependences.
+ ///
+ /// Note that no candidate is returned if LAA has failed to analyze the loop
+ /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+ std::forward_list<StoreToLoadForwardingCandidate>
+ findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+ std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+ const auto *Deps = LAI.getDepChecker().getDependences();
+ if (!Deps)
+ return Candidates;
+
+ // Find store->load dependences (consequently true dep). Both lexically
+ // forward and backward dependences qualify. Disqualify loads that have
+ // other unknown dependences.
+
+ SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+ for (const auto &Dep : *Deps) {
+ Instruction *Source = Dep.getSource(LAI);
+ Instruction *Destination = Dep.getDestination(LAI);
+
+ if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+ if (isa<LoadInst>(Source))
+ LoadsWithUnknownDepedence.insert(Source);
+ if (isa<LoadInst>(Destination))
+ LoadsWithUnknownDepedence.insert(Destination);
+ continue;
+ }
+
+ if (Dep.isBackward())
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ std::swap(Source, Destination);
+ else
+ assert(Dep.isForward() && "Needs to be a forward dependence");
+
+ auto *Store = dyn_cast<StoreInst>(Source);
+ if (!Store)
+ continue;
+ auto *Load = dyn_cast<LoadInst>(Destination);
+ if (!Load)
+ continue;
+
+ // Only propagate the value if they are of the same type.
+ if (Store->getPointerOperandType() != Load->getPointerOperandType())
+ continue;
+
+ Candidates.emplace_front(Load, Store);
+ }
+
+ if (!LoadsWithUnknownDepedence.empty())
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+ return LoadsWithUnknownDepedence.count(C.Load);
+ });
+
+ return Candidates;
+ }
+
+ /// Return the index of the instruction according to program order.
+ unsigned getInstrIndex(Instruction *Inst) {
+ auto I = InstOrder.find(Inst);
+ assert(I != InstOrder.end() && "No index for instruction");
+ return I->second;
+ }
+
+ /// If a load has multiple candidates associated (i.e. different
+ /// stores), it means that it could be forwarding from multiple stores
+ /// depending on control flow. Remove these candidates.
+ ///
+ /// Here, we rely on LAA to include the relevant loop-independent dependences.
+ /// LAA is known to omit these in the very simple case when the read and the
+ /// write within an alias set always take place using the *same* pointer.
+ ///
+ /// However, we know that this is not the case here, i.e. we can rely on LAA
+ /// to provide us with loop-independent dependences for the cases we're
+ /// interested in. Consider, for example, the case where a loop-independent
+ /// dependence S1->S2 invalidates the forwarding S3->S2.
+ ///
+ /// A[i] = ... (S1)
+ /// ... = A[i] (S2)
+ /// A[i+1] = ... (S3)
+ ///
+ /// LAA will perform dependence analysis here because there are two
+ /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+ void removeDependencesFromMultipleStores(
+ std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+ // If Store is nullptr it means that we have multiple stores forwarding to
+ // this store.
+ using LoadToSingleCandT =
+ DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>;
+ LoadToSingleCandT LoadToSingleCand;
+
+ for (const auto &Cand : Candidates) {
+ bool NewElt;
+ LoadToSingleCandT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+ if (!NewElt) {
+ const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+ // Already multiple stores forward to this load.
+ if (OtherCand == nullptr)
+ continue;
+
+ // Handle the very basic case when the two stores are in the same block
+ // so deciding which one forwards is easy. The later one forwards as
+ // long as they both have a dependence distance of one to the load.
+ if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+ Cand.isDependenceDistanceOfOne(PSE, L) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ // They are in the same block, the later one will forward to the load.
+ if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+ OtherCand = &Cand;
+ } else
+ OtherCand = nullptr;
+ }
+ }
+
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+ if (LoadToSingleCand[Cand.Load] != &Cand) {
+ LLVM_DEBUG(
+ dbgs() << "Removing from candidates: \n"
+ << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
+ return true;
+ }
+ return false;
+ });
+ }
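  // Illustrative example (assumed source, not from the patch) of the
  // same-block disambiguation above:
  //   A[i + 1] = X;  // S1
  //   A[i + 1] = Y;  // S2, later in the same block
  //   ...  = A[i];   // the forwarded-to load on the next iteration
  // Both S1 and S2 are distance-one candidates for the load; S2 is kept
  // because its value is the one that actually reaches the load.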
+
+ /// Given two pointer operations by their RuntimePointerChecking
+ /// indices, return true if they require an alias check.
+ ///
+ /// We need a check if one is a pointer for a candidate load and the other is
+ /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
const SmallPtrSetImpl<Value *> &PtrsWrittenOnFwdingPath,
const SmallPtrSetImpl<Value *> &CandLoadPtrs) {
- Value *Ptr1 =
- LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
- Value *Ptr2 =
- LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
- return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
- (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
- }
-
- /// Return pointers that are possibly written to on the path from a
- /// forwarding store to a load.
- ///
- /// These pointers need to be alias-checked against the forwarding candidates.
- SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
- const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
- // From FirstStore to LastLoad neither of the elimination candidate loads
- // should overlap with any of the stores.
- //
- // E.g.:
- //
- // st1 C[i]
- // ld1 B[i] <-------,
- // ld0 A[i] <----, | * LastLoad
- // ... | |
- // st2 E[i] | |
- // st3 B[i+1] -- | -' * FirstStore
- // st0 A[i+1] ---'
- // st4 D[i]
- //
- // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
- // ld0.
-
- LoadInst *LastLoad =
- std::max_element(Candidates.begin(), Candidates.end(),
- [&](const StoreToLoadForwardingCandidate &A,
- const StoreToLoadForwardingCandidate &B) {
- return getInstrIndex(A.Load) < getInstrIndex(B.Load);
- })
- ->Load;
- StoreInst *FirstStore =
- std::min_element(Candidates.begin(), Candidates.end(),
- [&](const StoreToLoadForwardingCandidate &A,
- const StoreToLoadForwardingCandidate &B) {
- return getInstrIndex(A.Store) <
- getInstrIndex(B.Store);
- })
- ->Store;
-
- // We're looking for stores after the first forwarding store until the end
- // of the loop, then from the beginning of the loop until the last
- // forwarded-to load. Collect the pointers of these stores.
- SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
-
- auto InsertStorePtr = [&](Instruction *I) {
- if (auto *S = dyn_cast<StoreInst>(I))
- PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
- };
- const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
- std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
- MemInstrs.end(), InsertStorePtr);
- std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
- InsertStorePtr);
-
- return PtrsWrittenOnFwdingPath;
- }
-
- /// Determine the pointer alias checks to prove that there are no
- /// intervening stores.
- SmallVector<RuntimePointerCheck, 4> collectMemchecks(
- const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
-
- SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
- findPointersWrittenOnForwardingPath(Candidates);
-
- // Collect the pointers of the candidate loads.
+ Value *Ptr1 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+ Value *Ptr2 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+ return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+ (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+ }
+
+ /// Return pointers that are possibly written to on the path from a
+ /// forwarding store to a load.
+ ///
+ /// These pointers need to be alias-checked against the forwarding candidates.
+ SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+ // From FirstStore to LastLoad neither of the elimination candidate loads
+ // should overlap with any of the stores.
+ //
+ // E.g.:
+ //
+ // st1 C[i]
+ // ld1 B[i] <-------,
+ // ld0 A[i] <----, | * LastLoad
+ // ... | |
+ // st2 E[i] | |
+ // st3 B[i+1] -- | -' * FirstStore
+ // st0 A[i+1] ---'
+ // st4 D[i]
+ //
+ // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+ // ld0.
+
+ LoadInst *LastLoad =
+ std::max_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+ })
+ ->Load;
+ StoreInst *FirstStore =
+ std::min_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Store) <
+ getInstrIndex(B.Store);
+ })
+ ->Store;
+
+ // We're looking for stores after the first forwarding store until the end
+ // of the loop, then from the beginning of the loop until the last
+ // forwarded-to load. Collect the pointers of these stores.
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+ auto InsertStorePtr = [&](Instruction *I) {
+ if (auto *S = dyn_cast<StoreInst>(I))
+ PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+ };
+ const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+ std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+ MemInstrs.end(), InsertStorePtr);
+ std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+ InsertStorePtr);
+
+ return PtrsWrittenOnFwdingPath;
+ }
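  // Note: the two std::for_each ranges above together walk the memory
  // instructions "around the backedge": from just after the first forwarding
  // store to the end of the loop body, and then from the start of the body up
  // to the last forwarded-to load. That is precisely the region an intervening
  // store would have to occupy to clobber the forwarded value.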
+
+ /// Determine the pointer alias checks to prove that there are no
+ /// intervening stores.
+ SmallVector<RuntimePointerCheck, 4> collectMemchecks(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ findPointersWrittenOnForwardingPath(Candidates);
+
+ // Collect the pointers of the candidate loads.
SmallPtrSet<Value *, 4> CandLoadPtrs;
for (const auto &Candidate : Candidates)
CandLoadPtrs.insert(Candidate.getLoadPtr());
-
- const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
- SmallVector<RuntimePointerCheck, 4> Checks;
-
- copy_if(AllChecks, std::back_inserter(Checks),
- [&](const RuntimePointerCheck &Check) {
- for (auto PtrIdx1 : Check.first->Members)
- for (auto PtrIdx2 : Check.second->Members)
- if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
- CandLoadPtrs))
- return true;
- return false;
- });
-
- LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
- << "):\n");
- LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
-
- return Checks;
- }
-
- /// Perform the transformation for a candidate.
- void
- propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
- SCEVExpander &SEE) {
- // loop:
- // %x = load %gep_i
- // = ... %x
- // store %y, %gep_i_plus_1
- //
- // =>
- //
- // ph:
- // %x.initial = load %gep_0
- // loop:
- // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
- // %x = load %gep_i <---- now dead
- // = ... %x.storeforward
- // store %y, %gep_i_plus_1
-
- Value *Ptr = Cand.Load->getPointerOperand();
- auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
- auto *PH = L->getLoopPreheader();
- assert(PH && "Preheader should exist!");
- Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
- PH->getTerminator());
- Value *Initial = new LoadInst(
- Cand.Load->getType(), InitialPtr, "load_initial",
- /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator());
-
- PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
- &L->getHeader()->front());
- PHI->addIncoming(Initial, PH);
- PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
-
- Cand.Load->replaceAllUsesWith(PHI);
- }
-
- /// Top-level driver for each loop: find store->load forwarding
- /// candidates, add run-time checks and perform transformation.
- bool processLoop() {
- LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
-
- // Look for store-to-load forwarding cases across the
- // backedge. E.g.:
- //
- // loop:
- // %x = load %gep_i
- // = ... %x
- // store %y, %gep_i_plus_1
- //
- // =>
- //
- // ph:
- // %x.initial = load %gep_0
- // loop:
- // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
- // %x = load %gep_i <---- now dead
- // = ... %x.storeforward
- // store %y, %gep_i_plus_1
-
- // First start with store->load dependences.
- auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
- if (StoreToLoadDependences.empty())
- return false;
-
- // Generate an index for each load and store according to the original
- // program order. This will be used later.
- InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
-
- // To keep things simple for now, remove those where the load is potentially
- // fed by multiple stores.
- removeDependencesFromMultipleStores(StoreToLoadDependences);
- if (StoreToLoadDependences.empty())
- return false;
-
- // Filter the candidates further.
- SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
- for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
- LLVM_DEBUG(dbgs() << "Candidate " << Cand);
-
- // Make sure that the stored value is available everywhere in the loop in
- // the next iteration.
- if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
- continue;
-
- // If the load is conditional we can't hoist its 0-iteration instance to
- // the preheader because that would make it unconditional. Thus we would
- // access a memory location that the original loop did not access.
- if (isLoadConditional(Cand.Load, L))
- continue;
-
- // Check whether the SCEV difference is the same as the induction step,
- // thus we load the value in the next iteration.
- if (!Cand.isDependenceDistanceOfOne(PSE, L))
- continue;
-
+
+ const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+ SmallVector<RuntimePointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
+
+ LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
+ << "):\n");
+ LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+ return Checks;
+ }
+
+ /// Perform the transformation for a candidate.
+ void
+ propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+ SCEVExpander &SEE) {
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ Value *Ptr = Cand.Load->getPointerOperand();
+ auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
+ auto *PH = L->getLoopPreheader();
+ assert(PH && "Preheader should exist!");
+ Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+ PH->getTerminator());
+ Value *Initial = new LoadInst(
+ Cand.Load->getType(), InitialPtr, "load_initial",
+ /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator());
+
+ PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+ &L->getHeader()->front());
+ PHI->addIncoming(Initial, PH);
+ PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+ Cand.Load->replaceAllUsesWith(PHI);
+ }
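  // Note: the code above relies on the loop being in loop-simplify form (a
  // preheader and a single latch); processLoop() below bails out on loops that
  // are not in that form, so getLoopPreheader() and getLoopLatch() return
  // non-null blocks by the time this runs.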
+
+ /// Top-level driver for each loop: find store->load forwarding
+ /// candidates, add run-time checks and perform transformation.
+ bool processLoop() {
+ LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+
+ // Look for store-to-load forwarding cases across the
+ // backedge. E.g.:
+ //
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ // First start with store->load dependences.
+ auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Generate an index for each load and store according to the original
+ // program order. This will be used later.
+ InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+ // To keep things simple for now, remove those where the load is potentially
+ // fed by multiple stores.
+ removeDependencesFromMultipleStores(StoreToLoadDependences);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Filter the candidates further.
+ SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+ for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
+ LLVM_DEBUG(dbgs() << "Candidate " << Cand);
+
+ // Make sure that the stored value is available everywhere in the loop in
+ // the next iteration.
+ if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+ continue;
+
+ // If the load is conditional we can't hoist its 0-iteration instance to
+ // the preheader because that would make it unconditional. Thus we would
+ // access a memory location that the original loop did not access.
+ if (isLoadConditional(Cand.Load, L))
+ continue;
+
+ // Check whether the SCEV difference is the same as the induction step,
+ // thus we load the value in the next iteration.
+ if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ continue;
+
assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
"Loading from something other than indvar?");
assert(
@@ -512,59 +512,59 @@ public:
"Storing to something other than indvar?");
Candidates.push_back(Cand);
- LLVM_DEBUG(
- dbgs()
+ LLVM_DEBUG(
+ dbgs()
<< Candidates.size()
- << ". Valid store-to-load forwarding across the loop backedge\n");
- }
- if (Candidates.empty())
- return false;
-
- // Check intervening may-alias stores. These need runtime checks for alias
- // disambiguation.
- SmallVector<RuntimePointerCheck, 4> Checks = collectMemchecks(Candidates);
-
- // Too many checks are likely to outweigh the benefits of forwarding.
- if (Checks.size() > Candidates.size() * CheckPerElim) {
- LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
- return false;
- }
-
- if (LAI.getPSE().getUnionPredicate().getComplexity() >
- LoadElimSCEVCheckThreshold) {
- LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
- return false;
- }
-
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form");
- return false;
- }
-
- if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
- if (LAI.hasConvergentOp()) {
- LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with "
- "convergent calls\n");
- return false;
- }
-
- auto *HeaderBB = L->getHeader();
- auto *F = HeaderBB->getParent();
- bool OptForSize = F->hasOptSize() ||
- llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI,
- PGSOQueryType::IRPass);
- if (OptForSize) {
- LLVM_DEBUG(
- dbgs() << "Versioning is needed but not allowed when optimizing "
- "for size.\n");
- return false;
- }
-
- // Point of no-return, start the transformation. First, version the loop
- // if necessary.
-
+ << ". Valid store-to-load forwarding across the loop backedge\n");
+ }
+ if (Candidates.empty())
+ return false;
+
+ // Check intervening may-alias stores. These need runtime checks for alias
+ // disambiguation.
+ SmallVector<RuntimePointerCheck, 4> Checks = collectMemchecks(Candidates);
+
+ // Too many checks are likely to outweigh the benefits of forwarding.
+ if (Checks.size() > Candidates.size() * CheckPerElim) {
+ LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
+ return false;
+ }
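      // Example: with the default -runtime-check-per-loop-load-elim=1,
      // forwarding two loads tolerates at most two runtime pointer checks
      // before the transformation is judged not worth the cost.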
+
+ if (LAI.getPSE().getUnionPredicate().getComplexity() >
+ LoadElimSCEVCheckThreshold) {
+ LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+ return false;
+ }
+
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form");
+ return false;
+ }
+
+ if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ if (LAI.hasConvergentOp()) {
+ LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with "
+ "convergent calls\n");
+ return false;
+ }
+
+ auto *HeaderBB = L->getHeader();
+ auto *F = HeaderBB->getParent();
+ bool OptForSize = F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI,
+ PGSOQueryType::IRPass);
+ if (OptForSize) {
+ LLVM_DEBUG(
+ dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
+ return false;
+ }
+
+ // Point of no-return, start the transformation. First, version the loop
+ // if necessary.
+
LoopVersioning LV(LAI, Checks, L, LI, DT, PSE.getSE());
- LV.versionLoop();
+ LV.versionLoop();
// After versioning, some of the candidates' pointers could stop being
// SCEVAddRecs. We need to filter them out.
@@ -576,163 +576,163 @@ public:
PSE.getSCEV(Cand.Store->getPointerOperand()));
};
llvm::erase_if(Candidates, NoLongerGoodCandidate);
- }
-
- // Next, propagate the value stored by the store to the users of the load.
- // Also for the first iteration, generate the initial value of the load.
- SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
- "storeforward");
- for (const auto &Cand : Candidates)
- propagateStoredValueToLoadUsers(Cand, SEE);
+ }
+
+ // Next, propagate the value stored by the store to the users of the load.
+ // Also for the first iteration, generate the initial value of the load.
+ SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
+ "storeforward");
+ for (const auto &Cand : Candidates)
+ propagateStoredValueToLoadUsers(Cand, SEE);
NumLoopLoadEliminted += Candidates.size();
-
- return true;
- }
-
-private:
- Loop *L;
-
- /// Maps the load/store instructions to their index according to
- /// program order.
- DenseMap<Instruction *, unsigned> InstOrder;
-
- // Analyses used.
- LoopInfo *LI;
- const LoopAccessInfo &LAI;
- DominatorTree *DT;
- BlockFrequencyInfo *BFI;
- ProfileSummaryInfo *PSI;
- PredicatedScalarEvolution PSE;
-};
-
-} // end anonymous namespace
-
-static bool
-eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+
+ return true;
+ }
+
+private:
+ Loop *L;
+
+ /// Maps the load/store instructions to their index according to
+ /// program order.
+ DenseMap<Instruction *, unsigned> InstOrder;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo &LAI;
+ DominatorTree *DT;
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
+ PredicatedScalarEvolution PSE;
+};
+
+} // end anonymous namespace
+
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
ScalarEvolution *SE, AssumptionCache *AC,
- function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
- // Build up a worklist of inner-loops to transform to avoid iterator
- // invalidation.
- // FIXME: This logic comes from other passes that actually change the loop
- // nest structure. It isn't clear this is necessary (or useful) for a pass
- // which merely optimizes the use of loads in a loop.
- SmallVector<Loop *, 8> Worklist;
-
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
bool Changed = false;
- for (Loop *TopLevelLoop : LI)
+ for (Loop *TopLevelLoop : LI)
for (Loop *L : depth_first(TopLevelLoop)) {
Changed |= simplifyLoop(L, &DT, &LI, SE, AC, /*MSSAU*/ nullptr, false);
- // We only handle inner-most loops.
+ // We only handle inner-most loops.
if (L->isInnermost())
- Worklist.push_back(L);
+ Worklist.push_back(L);
}
-
- // Now walk the identified inner loops.
- for (Loop *L : Worklist) {
+
+ // Now walk the identified inner loops.
+ for (Loop *L : Worklist) {
// Match historical behavior
if (!L->isRotatedForm() || !L->getExitingBlock())
continue;
- // The actual work is performed by LoadEliminationForLoop.
- LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
- Changed |= LEL.processLoop();
- }
- return Changed;
-}
-
-namespace {
-
-/// The pass. Most of the work is delegated to the per-loop
-/// LoadEliminationForLoop class.
-class LoopLoadElimination : public FunctionPass {
-public:
- static char ID;
-
- LoopLoadElimination() : FunctionPass(ID) {
- initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
- nullptr;
-
- // Process each loop nest in the function.
- return eliminateLoadsAcrossLoops(
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
+namespace {
+
+/// The pass. Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+ static char ID;
+
+ LoopLoadElimination() : FunctionPass(ID) {
+ initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+
+ // Process each loop nest in the function.
+ return eliminateLoadsAcrossLoops(
F, LI, DT, BFI, PSI, /*SE*/ nullptr, /*AC*/ nullptr,
- [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char LoopLoadElimination::ID;
-
-static const char LLE_name[] = "Loop Load Elimination";
-
-INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
-INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
-
-FunctionPass *llvm::createLoopLoadEliminationPass() {
- return new LoopLoadElimination();
-}
-
-PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
-
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- bool Changed = eliminateLoadsAcrossLoops(
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopLoadElimination::ID;
+
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+FunctionPass *llvm::createLoopLoadEliminationPass() {
+ return new LoopLoadElimination();
+}
+
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
+ : nullptr;
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
F, LI, DT, BFI, PSI, &SE, &AC, [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
TLI, TTI, nullptr, MSSA};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- });
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- return PA;
-}
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
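For reference, the new-pass-manager entry point above is an ordinary function
pass, so a minimal wiring sketch (an assumed usage example, not code from this
repository) would be:

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopLoadElimination.h"

static void addLoopLoadElim(llvm::FunctionPassManager &FPM) {
  // LoopLoadEliminationPass::run(Function &, FunctionAnalysisManager &) is
  // defined above, so it composes like any other function pass.
  FPM.addPass(llvm::LoopLoadEliminationPass());
}

The legacy pass is created through createLoopLoadEliminationPass() and is
registered under the "loop-load-elim" option string (LLE_OPTION).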
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp
index 18ab347d1b..3fe8e72591 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -1,11 +1,11 @@
-//===- LoopPassManager.cpp - Loop pass management -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopPassManager.cpp - Loop pass management -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -14,22 +14,22 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Support/TimeProfiler.h"
-
-using namespace llvm;
-
-namespace llvm {
-
-/// Explicitly specialize the pass manager's run method to handle loop nest
-/// structure updates.
-PreservedAnalyses
-PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
- LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR, LPMUpdater &U) {
-
- if (DebugLogging)
- dbgs() << "Starting Loop pass manager run.\n";
-
+#include "llvm/Support/TimeProfiler.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+/// Explicitly specialize the pass manager's run method to handle loop nest
+/// structure updates.
+PreservedAnalyses
+PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
+ LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+
+ if (DebugLogging)
+ dbgs() << "Starting Loop pass manager run.\n";
+
// Runs loop-nest passes only when the current loop is a top-level one.
PreservedAnalyses PA = (L.isOutermost() && !LoopNestPasses.empty())
? runWithLoopNestPasses(L, AM, AR, U)
@@ -59,12 +59,12 @@ LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
"Loop-nest passes should only run on top-level loops.");
PreservedAnalyses PA = PreservedAnalyses::all();
- // Request PassInstrumentation from the analysis manager; we will use it to
- // run the instrumenting callbacks for the passes later.
- PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(L, AR);
-
+ // Request PassInstrumentation from the analysis manager; we will use it to
+ // run the instrumenting callbacks for the passes later.
+ PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(L, AR);
+
unsigned LoopPassIndex = 0, LoopNestPassIndex = 0;
-
+
// `LoopNestPtr` points to the `LoopNest` object for the current top-level
// loop and `IsLoopNestPtrValid` indicates whether the pointer is still valid.
// The `LoopNest` object will have to be re-constructed if the pointer is
@@ -89,24 +89,24 @@ LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
IsLoopNestPtrValid = true;
}
PassPA = runSinglePass(*LoopNestPtr, Pass, AM, AR, U, PI);
- }
-
+ }
+
   // `PassPA` being `None` means that the before-pass callbacks in
   // `PassInstrumentation` returned false. The pass does not run in this case,
// so we can skip the following procedure.
if (!PassPA)
continue;
-
- // If the loop was deleted, abort the run and return to the outer walk.
- if (U.skipCurrentLoop()) {
+
+ // If the loop was deleted, abort the run and return to the outer walk.
+ if (U.skipCurrentLoop()) {
PA.intersect(std::move(*PassPA));
- break;
- }
-
+ break;
+ }
+
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
AM.invalidate(L, *PassPA);
-
+
// Finally, we intersect the final preserved analyses to compute the
// aggregate preserved set for this pass manager.
PA.intersect(std::move(*PassPA));
@@ -150,24 +150,24 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
break;
}
- // Update the analysis manager as each pass runs and potentially
- // invalidates analyses.
+ // Update the analysis manager as each pass runs and potentially
+ // invalidates analyses.
AM.invalidate(L, *PassPA);
-
- // Finally, we intersect the final preserved analyses to compute the
- // aggregate preserved set for this pass manager.
+
+ // Finally, we intersect the final preserved analyses to compute the
+ // aggregate preserved set for this pass manager.
PA.intersect(std::move(*PassPA));
-
- // FIXME: Historically, the pass managers all called the LLVM context's
- // yield function here. We don't have a generic way to acquire the
- // context and it isn't yet clear what the right pattern is for yielding
- // in the new pass manager so it is currently omitted.
- // ...getContext().yield();
- }
+
+ // FIXME: Historically, the pass managers all called the LLVM context's
+ // yield function here. We don't have a generic way to acquire the
+ // context and it isn't yet clear what the right pattern is for yielding
+ // in the new pass manager so it is currently omitted.
+ // ...getContext().yield();
+ }
return PA;
}
} // namespace llvm
-
+
PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
FunctionAnalysisManager &AM) {
// Before we even compute any loop analyses, first run a miniature function
@@ -175,7 +175,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
// directly build up function analyses after this as the function pass
// manager handles all the invalidation at that layer.
PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(F);
-
+
PreservedAnalyses PA = PreservedAnalyses::all();
// Check the PassInstrumentation's BeforePass callbacks before running the
// canonicalization pipeline.
@@ -183,7 +183,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
PA = LoopCanonicalizationFPM.run(F, AM);
PI.runAfterPass<Function>(LoopCanonicalizationFPM, F, PA);
}
-
+
// Get the loop structure for this function
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
@@ -320,16 +320,16 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
PA.preserve<SCEVAA>();
- return PA;
-}
-
-PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
-PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
- : OS(OS), Banner(Banner) {}
-
-PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
- LoopStandardAnalysisResults &,
- LPMUpdater &) {
- printLoop(L, OS, Banner);
- return PreservedAnalyses::all();
-}
+ return PA;
+}
+
+PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
+PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
+ : OS(OS), Banner(Banner) {}
+
+PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
+ LoopStandardAnalysisResults &,
+ LPMUpdater &) {
+ printLoop(L, OS, Banner);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp
index 34f5868699..4f97641e20 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp
@@ -1,1246 +1,1246 @@
-//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LoopPredication pass tries to convert loop variant range checks to loop
-// invariant by widening checks across loop iterations. For example, it will
-// convert
-//
-// for (i = 0; i < n; i++) {
-// guard(i < len);
-// ...
-// }
-//
-// to
-//
-// for (i = 0; i < n; i++) {
-// guard(n - 1 < len);
-// ...
-// }
-//
-// After this transformation the condition of the guard is loop invariant, so
-// loop-unswitch can later unswitch the loop by this condition which basically
-// predicates the loop by the widened condition:
-//
-// if (n - 1 < len)
-// for (i = 0; i < n; i++) {
-// ...
-// }
-// else
-// deoptimize
-//
-// It's tempting to rely on SCEV here, but it has proven to be problematic.
-// Generally the facts SCEV provides about the increment step of add
-// recurrences are true if the backedge of the loop is taken, which implicitly
-// assumes that the guard doesn't fail. Using these facts to optimize the
-// guard results in circular logic where the guard is optimized under the
-// assumption that it never fails.
-//
-// For example, in the loop below the induction variable will be marked as nuw
-// based on the guard. Based on nuw, the guard predicate will be considered
-// monotonic. Given a monotonic condition it's tempting to replace the induction
-// variable in the condition with its value on the last iteration. But this
-// transformation is not correct, e.g. e = 4, b = 5 breaks the loop.
-//
-// for (int i = b; i != e; i++)
-// guard(i u< len)
-//
-// One of the ways to reason about this problem is to use an inductive proof
-// approach. Given the loop:
-//
-// if (B(0)) {
-// do {
-// I = PHI(0, I.INC)
-// I.INC = I + Step
-// guard(G(I));
-// } while (B(I));
-// }
-//
-// where B(x) and G(x) are predicates that map integers to booleans, we want a
-// loop invariant expression M such that the following program has the same semantics
-// as the above:
-//
-// if (B(0)) {
-// do {
-// I = PHI(0, I.INC)
-// I.INC = I + Step
-// guard(G(0) && M);
-// } while (B(I));
-// }
-//
-// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step)
-//
-// Informal proof that the transformation above is correct:
-//
-// By the definition of guards we can rewrite the guard condition to:
-// G(I) && G(0) && M
-//
-// Let's prove that for each iteration of the loop:
-// G(0) && M => G(I)
-// And the condition above can be simplified to G(Start) && M.
-//
-// Induction base.
-// G(0) && M => G(0)
-//
-// Induction step. Assuming G(0) && M => G(I) on the subsequent
-// iteration:
-//
-// B(I) is true because it's the backedge condition.
-// G(I) is true because the backedge is guarded by this condition.
-//
-// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step).
-//
-// Note that we can use anything stronger than M, i.e. any condition which
-// implies M.
-//
-// When S = 1 (i.e. forward iterating loop), the transformation is supported
-// when:
-// * The loop has a single latch with the condition of the form:
-// B(X) = latchStart + X <pred> latchLimit,
-// where <pred> is u<, u<=, s<, or s<=.
-// * The guard condition is of the form
-// G(X) = guardStart + X u< guardLimit
-//
-// For the ult latch comparison case M is:
-// forall X . guardStart + X u< guardLimit && latchStart + X <u latchLimit =>
-// guardStart + X + 1 u< guardLimit
-//
-// The only way the antecedent can be true and the consequent can be false is
-// if
-// X == guardLimit - 1 - guardStart
-// (and guardLimit is non-zero, but we won't use this latter fact).
-// If X == guardLimit - 1 - guardStart then the second half of the antecedent is
-// latchStart + guardLimit - 1 - guardStart u< latchLimit
-// and its negation is
-// latchStart + guardLimit - 1 - guardStart u>= latchLimit
-//
-// In other words, if
-// latchLimit u<= latchStart + guardLimit - 1 - guardStart
-// then:
-// (the ranges below are written in ConstantRange notation, where [A, B) is the
-// set for (I = A; I != B; I++ /*maywrap*/) yield(I);)
-//
-// forall X . guardStart + X u< guardLimit &&
-// latchStart + X u< latchLimit =>
-// guardStart + X + 1 u< guardLimit
-// == forall X . guardStart + X u< guardLimit &&
-// latchStart + X u< latchStart + guardLimit - 1 - guardStart =>
-// guardStart + X + 1 u< guardLimit
-// == forall X . (guardStart + X) in [0, guardLimit) &&
-// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) =>
-// (guardStart + X + 1) in [0, guardLimit)
-// == forall X . X in [-guardStart, guardLimit - guardStart) &&
-// X in [-latchStart, guardLimit - 1 - guardStart) =>
-// X in [-guardStart - 1, guardLimit - guardStart - 1)
-// == true
-//
-// So the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart u>= latchLimit
-// Similarly for ule condition the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart u> latchLimit
-// For slt condition the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart s>= latchLimit
-// For sle condition the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart s> latchLimit
-//
-// When S = -1 (i.e. reverse iterating loop), the transformation is supported
-// when:
-// * The loop has a single latch with the condition of the form:
-// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=.
-// * The guard condition is of the form
-// G(X) = X - 1 u< guardLimit
-//
-// For the ugt latch comparison case M is:
-// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit
-//
-// The only way the antecedent can be true and the consequent can be false is if
-// X == 1.
-// If X == 1 then the second half of the antecedent is
-// 1 u> latchLimit, and its negation is latchLimit u>= 1.
-//
-// So the widened condition is:
-// guardStart u< guardLimit && latchLimit u>= 1.
-// Similarly for sgt condition the widened condition is:
-// guardStart u< guardLimit && latchLimit s>= 1.
-// For uge condition the widened condition is:
-// guardStart u< guardLimit && latchLimit u> 1.
-// For sge condition the widened condition is:
-// guardStart u< guardLimit && latchLimit s> 1.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopPredication.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-#define DEBUG_TYPE "loop-predication"
-
-STATISTIC(TotalConsidered, "Number of guards considered");
-STATISTIC(TotalWidened, "Number of checks widened");
-
-using namespace llvm;
-
-static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool>
- SkipProfitabilityChecks("loop-predication-skip-profitability-checks",
- cl::Hidden, cl::init(false));
-
-// This is the scale factor for the latch probability. We use this during
-// profitability analysis to find other exiting blocks that have a much higher
-// probability of exiting the loop instead of loop exiting via latch.
-// This value should be greater than 1 for a sane profitability check.
-static cl::opt<float> LatchExitProbabilityScale(
- "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0),
- cl::desc("scale factor for the latch probability. Value should be greater "
- "than 1. Lower values are ignored"));
-
-static cl::opt<bool> PredicateWidenableBranchGuards(
- "loop-predication-predicate-widenable-branches-to-deopt", cl::Hidden,
- cl::desc("Whether or not we should predicate guards "
- "expressed as widenable branches to deoptimize blocks"),
- cl::init(true));
-
-namespace {
-/// Represents an induction variable check:
-/// icmp Pred, <induction variable>, <loop invariant limit>
-struct LoopICmp {
- ICmpInst::Predicate Pred;
- const SCEVAddRecExpr *IV;
- const SCEV *Limit;
- LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
- const SCEV *Limit)
- : Pred(Pred), IV(IV), Limit(Limit) {}
- LoopICmp() {}
- void dump() {
- dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
- << ", Limit = " << *Limit << "\n";
- }
-};
-
-class LoopPredication {
- AliasAnalysis *AA;
- DominatorTree *DT;
- ScalarEvolution *SE;
- LoopInfo *LI;
- BranchProbabilityInfo *BPI;
-
- Loop *L;
- const DataLayout *DL;
- BasicBlock *Preheader;
- LoopICmp LatchCheck;
-
- bool isSupportedStep(const SCEV* Step);
- Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
- Optional<LoopICmp> parseLoopLatchICmp();
-
- /// Return an insertion point suitable for inserting a safe to speculate
- /// instruction whose only user will be 'User' which has operands 'Ops'. A
-  /// trivial result would be at the User itself, but we try to return a
- /// loop invariant location if possible.
- Instruction *findInsertPt(Instruction *User, ArrayRef<Value*> Ops);
- /// Same as above, *except* that this uses the SCEV definition of invariant
- /// which is that an expression *can be made* invariant via SCEVExpander.
-  /// Thus, this version is only suitable for finding an insert point to be
- /// passed to SCEVExpander!
- Instruction *findInsertPt(Instruction *User, ArrayRef<const SCEV*> Ops);
-
- /// Return true if the value is known to produce a single fixed value across
- /// all iterations on which it executes. Note that this does not imply
- /// speculation safety. That must be established separately.
- bool isLoopInvariantValue(const SCEV* S);
-
- Value *expandCheck(SCEVExpander &Expander, Instruction *Guard,
- ICmpInst::Predicate Pred, const SCEV *LHS,
- const SCEV *RHS);
-
- Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
- Instruction *Guard);
- Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
- LoopICmp RangeCheck,
- SCEVExpander &Expander,
- Instruction *Guard);
- Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck,
- LoopICmp RangeCheck,
- SCEVExpander &Expander,
- Instruction *Guard);
- unsigned collectChecks(SmallVectorImpl<Value *> &Checks, Value *Condition,
- SCEVExpander &Expander, Instruction *Guard);
- bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
- bool widenWidenableBranchGuardConditions(BranchInst *Guard, SCEVExpander &Expander);
- // If the loop always exits through another block in the loop, we should not
- // predicate based on the latch check. For example, the latch check can be a
- // very coarse grained check and there can be more fine grained exit checks
- // within the loop. We identify such unprofitable loops through BPI.
- bool isLoopProfitableToPredicate();
-
- bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
-
-public:
- LoopPredication(AliasAnalysis *AA, DominatorTree *DT,
- ScalarEvolution *SE, LoopInfo *LI,
- BranchProbabilityInfo *BPI)
- : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {};
- bool runOnLoop(Loop *L);
-};
-
-class LoopPredicationLegacyPass : public LoopPass {
-public:
- static char ID;
- LoopPredicationLegacyPass() : LoopPass(ID) {
- initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BranchProbabilityInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- LoopPredication LP(AA, DT, SE, LI, &BPI);
- return LP.runOnLoop(L);
- }
-};
-
-char LoopPredicationLegacyPass::ID = 0;
-} // end namespace
-
-INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
- "Loop predication", false, false)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
- "Loop predication", false, false)
-
-Pass *llvm::createLoopPredicationPass() {
- return new LoopPredicationLegacyPass();
-}
-
-PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- Function *F = L.getHeader()->getParent();
- // For the new PM, we also can't use BranchProbabilityInfo as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but BPI is not preserved, hence a newly built one is needed.
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop variant range checks to loop
+// invariant by widening checks across loop iterations. For example, it will
+// convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop by this condition which basically
+// predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+// It's tempting to rely on SCEV here, but it has proven to be problematic.
+// Generally the facts SCEV provides about the increment step of add
+// recurrences are true if the backedge of the loop is taken, which implicitly
+// assumes that the guard doesn't fail. Using these facts to optimize the
+// guard results in a circular logic where the guard is optimized under the
+// assumption that it never fails.
+//
+// For example, in the loop below the induction variable will be marked as nuw
+// based on the guard. Based on nuw, the guard predicate will be considered
+// monotonic. Given a monotonic condition it's tempting to replace the induction
+// variable in the condition with its value on the last iteration. But this
+// transformation is not correct, e.g. e = 4, b = 5 breaks the loop.
+//
+// for (int i = b; i != e; i++)
+// guard(i u< len)
+//
+// One of the ways to reason about this problem is to use an inductive proof
+// approach. Given the loop:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(I));
+// } while (B(I));
+// }
+//
+// where B(x) and G(x) are predicates that map integers to booleans, we want a
+// loop invariant expression M such that the following program has the same semantics
+// as the above:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(0) && M);
+// } while (B(I));
+// }
+//
+// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step)
+//
+// Informal proof that the transformation above is correct:
+//
+// By the definition of guards we can rewrite the guard condition to:
+// G(I) && G(0) && M
+//
+// Let's prove that for each iteration of the loop:
+// G(0) && M => G(I)
+// And the condition above can be simplified to G(Start) && M.
+//
+// Induction base.
+// G(0) && M => G(0)
+//
+// Induction step. Assuming G(0) && M => G(I) on the subsequent
+// iteration:
+//
+// B(I) is true because it's the backedge condition.
+// G(I) is true because the backedge is guarded by this condition.
+//
+// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step).
+//
+// Note that we can use anything stronger than M, i.e. any condition which
+// implies M.
+//
+// When S = 1 (i.e. forward iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = latchStart + X <pred> latchLimit,
+// where <pred> is u<, u<=, s<, or s<=.
+// * The guard condition is of the form
+// G(X) = guardStart + X u< guardLimit
+//
+// For the ult latch comparison case M is:
+// forall X . guardStart + X u< guardLimit && latchStart + X <u latchLimit =>
+// guardStart + X + 1 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is
+// if
+// X == guardLimit - 1 - guardStart
+// (and guardLimit is non-zero, but we won't use this latter fact).
+// If X == guardLimit - 1 - guardStart then the second half of the antecedent is
+// latchStart + guardLimit - 1 - guardStart u< latchLimit
+// and its negation is
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+//
+// In other words, if
+// latchLimit u<= latchStart + guardLimit - 1 - guardStart
+// then:
+// (the ranges below are written in ConstantRange notation, where [A, B) is the
+// set for (I = A; I != B; I++ /*maywrap*/) yield(I);)
+//
+// forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchLimit =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchStart + guardLimit - 1 - guardStart =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . (guardStart + X) in [0, guardLimit) &&
+// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) =>
+// (guardStart + X + 1) in [0, guardLimit)
+// == forall X . X in [-guardStart, guardLimit - guardStart) &&
+// X in [-latchStart, guardLimit - 1 - guardStart) =>
+// X in [-guardStart - 1, guardLimit - guardStart - 1)
+// == true
+//
+// So the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+// Similarly for ule condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u> latchLimit
+// For slt condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s>= latchLimit
+// For sle condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s> latchLimit
+//
+// When S = -1 (i.e. reverse iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=.
+// * The guard condition is of the form
+// G(X) = X - 1 u< guardLimit
+//
+// For the ugt latch comparison case M is:
+// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is if
+// X == 1.
+// If X == 1 then the second half of the antecedent is
+// 1 u> latchLimit, and its negation is latchLimit u>= 1.
+//
+// So the widened condition is:
+// guardStart u< guardLimit && latchLimit u>= 1.
+// Similarly for sgt condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s>= 1.
+// For uge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit u> 1.
+// For sge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s> 1.
+//===----------------------------------------------------------------------===//
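//
// A worked instance of the ult case above (a sketch, assuming the rotated
// loop's latch tests the incremented IV, i.e. B(X) = 1 + X u< n, so
// latchStart = 1 and latchLimit = n, while the guard gives guardStart = 0
// and guardLimit = len):
//
//   widened = guardStart u< guardLimit &&
//             latchStart + guardLimit - 1 - guardStart u>= latchLimit
//           = 0 u< len && 1 + len - 1 - 0 u>= n
//           = 0 u< len && len u>= n
//
// For n > 0 (the only case in which the guard executes) this is equivalent
// to the loop-invariant guard(n - 1 < len) from the introductory example.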
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+STATISTIC(TotalConsidered, "Number of guards considered");
+STATISTIC(TotalWidened, "Number of checks widened");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ SkipProfitabilityChecks("loop-predication-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+// This is the scale factor for the latch probability. We use this during
+// profitability analysis to find other exiting blocks that have a much higher
+// probability of exiting the loop instead of loop exiting via latch.
+// This value should be greater than 1 for a sane profitability check.
+static cl::opt<float> LatchExitProbabilityScale(
+ "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0),
+ cl::desc("scale factor for the latch probability. Value should be greater "
+ "than 1. Lower values are ignored"));
+
+static cl::opt<bool> PredicateWidenableBranchGuards(
+ "loop-predication-predicate-widenable-branches-to-deopt", cl::Hidden,
+ cl::desc("Whether or not we should predicate guards "
+ "expressed as widenable branches to deoptimize blocks"),
+ cl::init(true));
+
+namespace {
+/// Represents an induction variable check:
+/// icmp Pred, <induction variable>, <loop invariant limit>
+struct LoopICmp {
+ ICmpInst::Predicate Pred;
+ const SCEVAddRecExpr *IV;
+ const SCEV *Limit;
+ LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
+ const SCEV *Limit)
+ : Pred(Pred), IV(IV), Limit(Limit) {}
+ LoopICmp() {}
+ void dump() {
+ dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+ << ", Limit = " << *Limit << "\n";
+ }
+};
+
+class LoopPredication {
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ BranchProbabilityInfo *BPI;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+ LoopICmp LatchCheck;
+
+ bool isSupportedStep(const SCEV* Step);
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+ Optional<LoopICmp> parseLoopLatchICmp();
+
+ /// Return an insertion point suitable for inserting a safe to speculate
+ /// instruction whose only user will be 'User' which has operands 'Ops'. A
+  /// trivial result would be at the User itself, but we try to return a
+ /// loop invariant location if possible.
+ Instruction *findInsertPt(Instruction *User, ArrayRef<Value*> Ops);
+ /// Same as above, *except* that this uses the SCEV definition of invariant
+ /// which is that an expression *can be made* invariant via SCEVExpander.
+  /// Thus, this version is only suitable for finding an insert point to be
+ /// passed to SCEVExpander!
+ Instruction *findInsertPt(Instruction *User, ArrayRef<const SCEV*> Ops);
+
+ /// Return true if the value is known to produce a single fixed value across
+ /// all iterations on which it executes. Note that this does not imply
+ /// speculation safety. That must be established separately.
+ bool isLoopInvariantValue(const SCEV* S);
+
+ Value *expandCheck(SCEVExpander &Expander, Instruction *Guard,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS);
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ Instruction *Guard);
+ Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ Instruction *Guard);
+ Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ Instruction *Guard);
+ unsigned collectChecks(SmallVectorImpl<Value *> &Checks, Value *Condition,
+ SCEVExpander &Expander, Instruction *Guard);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+ bool widenWidenableBranchGuardConditions(BranchInst *Guard, SCEVExpander &Expander);
+ // If the loop always exits through another block in the loop, we should not
+ // predicate based on the latch check. For example, the latch check can be a
+ // very coarse grained check and there can be more fine grained exit checks
+ // within the loop. We identify such unprofitable loops through BPI.
+ bool isLoopProfitableToPredicate();
+
+ bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
+
+public:
+ LoopPredication(AliasAnalysis *AA, DominatorTree *DT,
+ ScalarEvolution *SE, LoopInfo *LI,
+ BranchProbabilityInfo *BPI)
+ : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {};
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LoopPredication LP(AA, DT, SE, LI, &BPI);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function *F = L.getHeader()->getParent();
+ // For the new PM, we also can't use BranchProbabilityInfo as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but BPI is not preserved, hence a newly built one is needed.
BranchProbabilityInfo BPI(*F, AR.LI, &AR.TLI, &AR.DT, nullptr);
- LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI);
- if (!LP.runOnLoop(&L))
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
-
-Optional<LoopICmp>
-LoopPredication::parseLoopICmp(ICmpInst *ICI) {
- auto Pred = ICI->getPredicate();
- auto *LHS = ICI->getOperand(0);
- auto *RHS = ICI->getOperand(1);
-
- const SCEV *LHSS = SE->getSCEV(LHS);
- if (isa<SCEVCouldNotCompute>(LHSS))
- return None;
- const SCEV *RHSS = SE->getSCEV(RHS);
- if (isa<SCEVCouldNotCompute>(RHSS))
- return None;
-
- // Canonicalize RHS to be loop invariant bound, LHS - a loop computable IV
- if (SE->isLoopInvariant(LHSS, L)) {
- std::swap(LHS, RHS);
- std::swap(LHSS, RHSS);
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
- if (!AR || AR->getLoop() != L)
- return None;
-
- return LoopICmp(Pred, AR, RHSS);
-}
-
-Value *LoopPredication::expandCheck(SCEVExpander &Expander,
- Instruction *Guard,
- ICmpInst::Predicate Pred, const SCEV *LHS,
- const SCEV *RHS) {
- Type *Ty = LHS->getType();
- assert(Ty == RHS->getType() && "expandCheck operands have different types?");
-
- if (SE->isLoopInvariant(LHS, L) && SE->isLoopInvariant(RHS, L)) {
- IRBuilder<> Builder(Guard);
- if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS))
- return Builder.getTrue();
- if (SE->isLoopEntryGuardedByCond(L, ICmpInst::getInversePredicate(Pred),
- LHS, RHS))
- return Builder.getFalse();
- }
-
- Value *LHSV = Expander.expandCodeFor(LHS, Ty, findInsertPt(Guard, {LHS}));
- Value *RHSV = Expander.expandCodeFor(RHS, Ty, findInsertPt(Guard, {RHS}));
- IRBuilder<> Builder(findInsertPt(Guard, {LHSV, RHSV}));
- return Builder.CreateICmp(Pred, LHSV, RHSV);
-}
-
-
-// Returns true if it's safe to truncate the IV to RangeCheckType.
-// When the IV type is wider than the range operand type, we can still do loop
-// predication, by generating SCEVs for the range and latch that are of the
-// same type. We achieve this by generating a SCEV truncate expression for the
-// latch IV. This is done iff truncation of the IV is a safe operation,
-// without loss of information.
-// Another way to achieve this is by generating a wider type SCEV for the
-// range check operand, however, this needs a more involved check that
-// operands do not overflow. This can lead to loss of information when the
-// range operand is of the form: add i32 %offset, %iv. We need to prove that
-// sext(x + y) is same as sext(x) + sext(y).
-// This function returns true if we can safely represent the IV type in
-// the RangeCheckType without loss of information.
-static bool isSafeToTruncateWideIVType(const DataLayout &DL,
- ScalarEvolution &SE,
- const LoopICmp LatchCheck,
- Type *RangeCheckType) {
- if (!EnableIVTruncation)
- return false;
+ LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+Optional<LoopICmp>
+LoopPredication::parseLoopICmp(ICmpInst *ICI) {
+ auto Pred = ICI->getPredicate();
+ auto *LHS = ICI->getOperand(0);
+ auto *RHS = ICI->getOperand(1);
+
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+ // Canonicalize RHS to be loop invariant bound, LHS - a loop computable IV
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!AR || AR->getLoop() != L)
+ return None;
+
+ return LoopICmp(Pred, AR, RHSS);
+}
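// A minimal sketch of the canonicalization above (value names and the start
// value are hypothetical): for a guard condition `icmp ugt %len, %i`, where
// %len is loop invariant and %i is a step-one IV starting at 0, the loop
// invariant bound is on the LHS, so the operands are swapped and the
// predicate flipped via getSwappedPredicate, yielding
//   LoopICmp{ Pred = ICMP_ULT, IV = {0,+,1}<%loop>, Limit = SCEV(%len) }
// i.e. the canonical "IV <pred> invariant limit" form the rest of the pass
// works with.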
+
+Value *LoopPredication::expandCheck(SCEVExpander &Expander,
+ Instruction *Guard,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS) {
+ Type *Ty = LHS->getType();
+ assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+
+ if (SE->isLoopInvariant(LHS, L) && SE->isLoopInvariant(RHS, L)) {
+ IRBuilder<> Builder(Guard);
+ if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS))
+ return Builder.getTrue();
+ if (SE->isLoopEntryGuardedByCond(L, ICmpInst::getInversePredicate(Pred),
+ LHS, RHS))
+ return Builder.getFalse();
+ }
+
+ Value *LHSV = Expander.expandCodeFor(LHS, Ty, findInsertPt(Guard, {LHS}));
+ Value *RHSV = Expander.expandCodeFor(RHS, Ty, findInsertPt(Guard, {RHS}));
+ IRBuilder<> Builder(findInsertPt(Guard, {LHSV, RHSV}));
+ return Builder.CreateICmp(Pred, LHSV, RHSV);
+}
+
+
+// Returns true if it's safe to truncate the IV to RangeCheckType.
+// When the IV type is wider than the range operand type, we can still do loop
+// predication, by generating SCEVs for the range and latch that are of the
+// same type. We achieve this by generating a SCEV truncate expression for the
+// latch IV. This is done iff truncation of the IV is a safe operation,
+// without loss of information.
+// Another way to achieve this is by generating a wider type SCEV for the
+// range check operand, however, this needs a more involved check that
+// operands do not overflow. This can lead to loss of information when the
+// range operand is of the form: add i32 %offset, %iv. We need to prove that
+// sext(x + y) is same as sext(x) + sext(y).
+// This function returns true if we can safely represent the IV type in
+// the RangeCheckType without loss of information.
+static bool isSafeToTruncateWideIVType(const DataLayout &DL,
+ ScalarEvolution &SE,
+ const LoopICmp LatchCheck,
+ Type *RangeCheckType) {
+ if (!EnableIVTruncation)
+ return false;
assert(DL.getTypeSizeInBits(LatchCheck.IV->getType()).getFixedSize() >
DL.getTypeSizeInBits(RangeCheckType).getFixedSize() &&
- "Expected latch check IV type to be larger than range check operand "
- "type!");
- // The start and end values of the IV should be known. This is to guarantee
- // that truncating the wide type will not lose information.
- auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
- auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
- if (!Limit || !Start)
- return false;
- // This check makes sure that the IV does not change sign during loop
- // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
- // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
- // IV wraps around, and the truncation of the IV would lose the range of
- // iterations between 2^32 and 2^64.
+ "Expected latch check IV type to be larger than range check operand "
+ "type!");
+ // The start and end values of the IV should be known. This is to guarantee
+ // that truncating the wide type will not lose information.
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+ auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+ if (!Limit || !Start)
+ return false;
+ // This check makes sure that the IV does not change sign during loop
+ // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+ // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+ // IV wraps around, and the truncation of the IV would lose the range of
+ // iterations between 2^32 and 2^64.
if (!SE.getMonotonicPredicateType(LatchCheck.IV, LatchCheck.Pred))
- return false;
- // The active bits should be less than the bits in the RangeCheckType. This
- // guarantees that truncating the latch check to RangeCheckType is a safe
- // operation.
+ return false;
+ // The active bits should be less than the bits in the RangeCheckType. This
+ // guarantees that truncating the latch check to RangeCheckType is a safe
+ // operation.
auto RangeCheckTypeBitSize =
DL.getTypeSizeInBits(RangeCheckType).getFixedSize();
- return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
- Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
-}
-
-
-// Return a LoopICmp describing a latch check equivalent to LatchCheck but with
-// the requested type if safe to do so. May involve the use of a new IV.
-static Optional<LoopICmp> generateLoopLatchCheck(const DataLayout &DL,
- ScalarEvolution &SE,
- const LoopICmp LatchCheck,
- Type *RangeCheckType) {
-
- auto *LatchType = LatchCheck.IV->getType();
- if (RangeCheckType == LatchType)
- return LatchCheck;
- // For now, bail out if latch type is narrower than range type.
+ return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+ Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
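// A numeric sketch of the active-bits check above (the constants are
// hypothetical): with an i64 latch IV starting at 0 and a constant latch
// limit of 1000, both constants need fewer than 32 active bits, so the latch
// check can be truncated to match an i32 range check. A limit such as 2^40
// needs more than 32 active bits, the truncation would be lossy, and the
// function returns false.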
+
+
+// Return a LoopICmp describing a latch check equivalent to LatchCheck but with
+// the requested type if safe to do so. May involve the use of a new IV.
+static Optional<LoopICmp> generateLoopLatchCheck(const DataLayout &DL,
+ ScalarEvolution &SE,
+ const LoopICmp LatchCheck,
+ Type *RangeCheckType) {
+
+ auto *LatchType = LatchCheck.IV->getType();
+ if (RangeCheckType == LatchType)
+ return LatchCheck;
+ // For now, bail out if latch type is narrower than range type.
if (DL.getTypeSizeInBits(LatchType).getFixedSize() <
DL.getTypeSizeInBits(RangeCheckType).getFixedSize())
- return None;
- if (!isSafeToTruncateWideIVType(DL, SE, LatchCheck, RangeCheckType))
- return None;
- // We can now safely identify the truncated version of the IV and limit for
- // RangeCheckType.
- LoopICmp NewLatchCheck;
- NewLatchCheck.Pred = LatchCheck.Pred;
- NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
- SE.getTruncateExpr(LatchCheck.IV, RangeCheckType));
- if (!NewLatchCheck.IV)
- return None;
- NewLatchCheck.Limit = SE.getTruncateExpr(LatchCheck.Limit, RangeCheckType);
- LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType
- << "can be represented as range check type:"
- << *RangeCheckType << "\n");
- LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
- LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
- return NewLatchCheck;
-}
-
-bool LoopPredication::isSupportedStep(const SCEV* Step) {
- return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop);
-}
-
-Instruction *LoopPredication::findInsertPt(Instruction *Use,
- ArrayRef<Value*> Ops) {
- for (Value *Op : Ops)
- if (!L->isLoopInvariant(Op))
- return Use;
- return Preheader->getTerminator();
-}
-
-Instruction *LoopPredication::findInsertPt(Instruction *Use,
- ArrayRef<const SCEV*> Ops) {
- // Subtlety: SCEV considers things to be invariant if the value produced is
- // the same across iterations. This is not the same as being able to
- // evaluate outside the loop, which is what we actually need here.
- for (const SCEV *Op : Ops)
- if (!SE->isLoopInvariant(Op, L) ||
- !isSafeToExpandAt(Op, Preheader->getTerminator(), *SE))
- return Use;
- return Preheader->getTerminator();
-}
-
-bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
- // Handling expressions which produce invariant results, but *haven't* yet
- // been removed from the loop serves two important purposes.
- // 1) Most importantly, it resolves a pass ordering cycle which would
-  // otherwise need us to iterate licm, loop-predication, and either
- // loop-unswitch or loop-peeling to make progress on examples with lots of
- // predicable range checks in a row. (Since, in the general case, we can't
- // hoist the length checks until the dominating checks have been discharged
- // as we can't prove doing so is safe.)
- // 2) As a nice side effect, this exposes the value of peeling or unswitching
- // much more obviously in the IR. Otherwise, the cost modeling for other
- // transforms would end up needing to duplicate all of this logic to model a
- // check which becomes predictable based on a modeled peel or unswitch.
- //
- // The cost of doing so in the worst case is an extra fill from the stack in
- // the loop to materialize the loop invariant test value instead of checking
-  // against the original IV which is presumably in a register inside the loop.
-  // Such cases are presumably rare, and hint at missing opportunities for
- // other passes.
-
- if (SE->isLoopInvariant(S, L))
-    // Note: This is the SCEV variant, so the original Value* may be within the
- // loop even though SCEV has proven it is loop invariant.
- return true;
-
- // Handle a particular important case which SCEV doesn't yet know about which
- // shows up in range checks on arrays with immutable lengths.
- // TODO: This should be sunk inside SCEV.
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
- if (const auto *LI = dyn_cast<LoadInst>(U->getValue()))
- if (LI->isUnordered() && L->hasLoopInvariantOperands(LI))
- if (AA->pointsToConstantMemory(LI->getOperand(0)) ||
- LI->hasMetadata(LLVMContext::MD_invariant_load))
- return true;
- return false;
-}
-
-Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
- LoopICmp LatchCheck, LoopICmp RangeCheck,
- SCEVExpander &Expander, Instruction *Guard) {
- auto *Ty = RangeCheck.IV->getType();
- // Generate the widened condition for the forward loop:
- // guardStart u< guardLimit &&
- // latchLimit <pred> guardLimit - 1 - guardStart + latchStart
- // where <pred> depends on the latch condition predicate. See the file
- // header comment for the reasoning.
- // guardLimit - guardStart + latchStart - 1
- const SCEV *GuardStart = RangeCheck.IV->getStart();
- const SCEV *GuardLimit = RangeCheck.Limit;
- const SCEV *LatchStart = LatchCheck.IV->getStart();
- const SCEV *LatchLimit = LatchCheck.Limit;
- // Subtlety: We need all the values to be *invariant* across all iterations,
- // but we only need to check expansion safety for those which *aren't*
- // already guaranteed to dominate the guard.
- if (!isLoopInvariantValue(GuardStart) ||
- !isLoopInvariantValue(GuardLimit) ||
- !isLoopInvariantValue(LatchStart) ||
- !isLoopInvariantValue(LatchLimit)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
- if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
- !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
-
- // guardLimit - guardStart + latchStart - 1
- const SCEV *RHS =
- SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
- SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
- auto LimitCheckPred =
- ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
-
- LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
- LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
- LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
-
- auto *LimitCheck =
- expandCheck(Expander, Guard, LimitCheckPred, LatchLimit, RHS);
- auto *FirstIterationCheck = expandCheck(Expander, Guard, RangeCheck.Pred,
- GuardStart, GuardLimit);
- IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
- return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
-}
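// Tying the code above back to the header derivation (using the hypothetical
// values guardStart = 0, guardLimit = len, latchStart = 1, latchLimit = n and
// a u< latch predicate): RHS = (len - 0) + (1 - 1) = len, and the flipped
// strictness predicate of u< is u<=, so LimitCheck is "n u<= len" while
// FirstIterationCheck is "0 u< len", matching the widened condition
// "latchStart + guardLimit - 1 - guardStart u>= latchLimit" from the file
// header.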
-
-Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
- LoopICmp LatchCheck, LoopICmp RangeCheck,
- SCEVExpander &Expander, Instruction *Guard) {
- auto *Ty = RangeCheck.IV->getType();
- const SCEV *GuardStart = RangeCheck.IV->getStart();
- const SCEV *GuardLimit = RangeCheck.Limit;
- const SCEV *LatchStart = LatchCheck.IV->getStart();
- const SCEV *LatchLimit = LatchCheck.Limit;
- // Subtlety: We need all the values to be *invariant* across all iterations,
- // but we only need to check expansion safety for those which *aren't*
- // already guaranteed to dominate the guard.
- if (!isLoopInvariantValue(GuardStart) ||
- !isLoopInvariantValue(GuardLimit) ||
- !isLoopInvariantValue(LatchStart) ||
- !isLoopInvariantValue(LatchLimit)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
- if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
- !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
- // The decrement of the latch check IV should be the same as the
- // rangeCheckIV.
- auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
- if (RangeCheck.IV != PostDecLatchCheckIV) {
- LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
- << *PostDecLatchCheckIV
- << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
- return None;
- }
-
- // Generate the widened condition for CountDownLoop:
- // guardStart u< guardLimit &&
- // latchLimit <pred> 1.
- // See the header comment for reasoning of the checks.
- auto LimitCheckPred =
- ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
- auto *FirstIterationCheck = expandCheck(Expander, Guard,
- ICmpInst::ICMP_ULT,
- GuardStart, GuardLimit);
- auto *LimitCheck = expandCheck(Expander, Guard, LimitCheckPred, LatchLimit,
- SE->getOne(Ty));
- IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
- return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
-}
-
-static void normalizePredicate(ScalarEvolution *SE, Loop *L,
- LoopICmp& RC) {
- // LFTR canonicalizes checks to the ICMP_NE/EQ form; normalize back to the
- // ULT/UGE form for ease of handling by our caller.
- if (ICmpInst::isEquality(RC.Pred) &&
- RC.IV->getStepRecurrence(*SE)->isOne() &&
- SE->isKnownPredicate(ICmpInst::ICMP_ULE, RC.IV->getStart(), RC.Limit))
- RC.Pred = RC.Pred == ICmpInst::ICMP_NE ?
- ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
-}
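// For example (hypothetical shapes): after LFTR the check may be
// "i != limit" for a step-one IV starting at 0. Since 0 u<= limit is
// trivially known, ICMP_NE is rewritten to ICMP_ULT (and ICMP_EQ would
// become ICMP_UGE), which is the form the widening logic expects.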
-
-
-/// If ICI can be widened to a loop invariant condition emits the loop
-/// invariant condition in the loop preheader and return it, otherwise
-/// returns None.
-Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
- SCEVExpander &Expander,
- Instruction *Guard) {
- LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
- LLVM_DEBUG(ICI->dump());
-
- // parseLoopStructure guarantees that the latch condition is:
- // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
- // We are looking for the range checks of the form:
- // i u< guardLimit
- auto RangeCheck = parseLoopICmp(ICI);
- if (!RangeCheck) {
- LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
- return None;
- }
- LLVM_DEBUG(dbgs() << "Guard check:\n");
- LLVM_DEBUG(RangeCheck->dump());
- if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
- LLVM_DEBUG(dbgs() << "Unsupported range check predicate("
- << RangeCheck->Pred << ")!\n");
- return None;
- }
- auto *RangeCheckIV = RangeCheck->IV;
- if (!RangeCheckIV->isAffine()) {
- LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n");
- return None;
- }
- auto *Step = RangeCheckIV->getStepRecurrence(*SE);
- // We cannot just compare with latch IV step because the latch and range IVs
- // may have different types.
- if (!isSupportedStep(Step)) {
-    LLVM_DEBUG(dbgs() << "Range check and latch IVs have different steps!\n");
- return None;
- }
- auto *Ty = RangeCheckIV->getType();
- auto CurrLatchCheckOpt = generateLoopLatchCheck(*DL, *SE, LatchCheck, Ty);
- if (!CurrLatchCheckOpt) {
- LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check "
- "corresponding to range type: "
- << *Ty << "\n");
- return None;
- }
-
- LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
- // At this point, the range and latch step should have the same type, but need
- // not have the same value (we support both 1 and -1 steps).
- assert(Step->getType() ==
- CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
- "Range and latch steps should be of same type!");
- if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
- LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n");
- return None;
- }
-
- if (Step->isOne())
- return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
- Expander, Guard);
- else {
- assert(Step->isAllOnesValue() && "Step should be -1!");
- return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck,
- Expander, Guard);
- }
-}
-
-unsigned LoopPredication::collectChecks(SmallVectorImpl<Value *> &Checks,
- Value *Condition,
- SCEVExpander &Expander,
- Instruction *Guard) {
- unsigned NumWidened = 0;
- // The guard condition is expected to be in form of:
- // cond1 && cond2 && cond3 ...
- // Iterate over subconditions looking for icmp conditions which can be
-  // widened across loop iterations. While widening these conditions, remember
-  // the resulting list of subconditions in the Checks vector.
- SmallVector<Value *, 4> Worklist(1, Condition);
- SmallPtrSet<Value *, 4> Visited;
- Value *WideableCond = nullptr;
- do {
- Value *Condition = Worklist.pop_back_val();
- if (!Visited.insert(Condition).second)
- continue;
-
- Value *LHS, *RHS;
- using namespace llvm::PatternMatch;
- if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
- Worklist.push_back(LHS);
- Worklist.push_back(RHS);
- continue;
- }
-
- if (match(Condition,
- m_Intrinsic<Intrinsic::experimental_widenable_condition>())) {
- // Pick any, we don't care which
- WideableCond = Condition;
- continue;
- }
-
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
- if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander,
- Guard)) {
- Checks.push_back(NewRangeCheck.getValue());
- NumWidened++;
- continue;
- }
- }
-
- // Save the condition as is if we can't widen it
- Checks.push_back(Condition);
- } while (!Worklist.empty());
- // At the moment, our matching logic for wideable conditions implicitly
- // assumes we preserve the form: (br (and Cond, WC())). FIXME
- // Note that if there were multiple calls to wideable condition in the
- // traversal, we only need to keep one, and which one is arbitrary.
- if (WideableCond)
- Checks.push_back(WideableCond);
- return NumWidened;
-}
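// Sketch of the traversal above on a hypothetical guard condition
//   and(and(%rc1, %rc2), %wc)
// where %rc1 and %rc2 are icmps and %wc is a call to
// llvm.experimental.widenable.condition(): the and-tree is split into
// {%rc1, %rc2, %wc}; each icmp is passed to widenICmpRangeCheck and replaced
// by its widened form when that succeeds (otherwise kept as is), and a single
// widenable-condition call is appended to Checks at the end.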
-
-bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
- SCEVExpander &Expander) {
- LLVM_DEBUG(dbgs() << "Processing guard:\n");
- LLVM_DEBUG(Guard->dump());
-
- TotalConsidered++;
- SmallVector<Value *, 4> Checks;
- unsigned NumWidened = collectChecks(Checks, Guard->getOperand(0), Expander,
- Guard);
- if (NumWidened == 0)
- return false;
-
- TotalWidened += NumWidened;
-
- // Emit the new guard condition
- IRBuilder<> Builder(findInsertPt(Guard, Checks));
- Value *AllChecks = Builder.CreateAnd(Checks);
- auto *OldCond = Guard->getOperand(0);
- Guard->setOperand(0, AllChecks);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
-
- LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
- return true;
-}
-
-bool LoopPredication::widenWidenableBranchGuardConditions(
- BranchInst *BI, SCEVExpander &Expander) {
- assert(isGuardAsWidenableBranch(BI) && "Must be!");
- LLVM_DEBUG(dbgs() << "Processing guard:\n");
- LLVM_DEBUG(BI->dump());
-
- TotalConsidered++;
- SmallVector<Value *, 4> Checks;
- unsigned NumWidened = collectChecks(Checks, BI->getCondition(),
- Expander, BI);
- if (NumWidened == 0)
- return false;
-
- TotalWidened += NumWidened;
-
- // Emit the new guard condition
- IRBuilder<> Builder(findInsertPt(BI, Checks));
- Value *AllChecks = Builder.CreateAnd(Checks);
- auto *OldCond = BI->getCondition();
- BI->setCondition(AllChecks);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- assert(isGuardAsWidenableBranch(BI) &&
- "Stopped being a guard after transform?");
-
- LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
- return true;
-}
-
-Optional<LoopICmp> LoopPredication::parseLoopLatchICmp() {
- using namespace PatternMatch;
-
- BasicBlock *LoopLatch = L->getLoopLatch();
- if (!LoopLatch) {
- LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
- return None;
- }
-
- auto *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
- if (!BI || !BI->isConditional()) {
- LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
- return None;
- }
- BasicBlock *TrueDest = BI->getSuccessor(0);
- assert(
- (TrueDest == L->getHeader() || BI->getSuccessor(1) == L->getHeader()) &&
- "One of the latch's destinations must be the header");
-
- auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
- if (!ICI) {
- LLVM_DEBUG(dbgs() << "Failed to match the latch condition!\n");
- return None;
- }
- auto Result = parseLoopICmp(ICI);
- if (!Result) {
- LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
- return None;
- }
-
- if (TrueDest != L->getHeader())
- Result->Pred = ICmpInst::getInversePredicate(Result->Pred);
-
- // Check affine first, so if it's not we don't try to compute the step
- // recurrence.
- if (!Result->IV->isAffine()) {
- LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n");
- return None;
- }
-
- auto *Step = Result->IV->getStepRecurrence(*SE);
- if (!isSupportedStep(Step)) {
- LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
- return None;
- }
-
- auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
- if (Step->isOne()) {
- return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
- Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
- } else {
- assert(Step->isAllOnesValue() && "Step should be -1!");
- return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT &&
- Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE;
- }
- };
-
- normalizePredicate(SE, L, *Result);
- if (IsUnsupportedPredicate(Step, Result->Pred)) {
- LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
- << ")!\n");
- return None;
- }
-
- return Result;
-}
-
-
-bool LoopPredication::isLoopProfitableToPredicate() {
- if (SkipProfitabilityChecks || !BPI)
- return true;
-
- SmallVector<std::pair<BasicBlock *, BasicBlock *>, 8> ExitEdges;
- L->getExitEdges(ExitEdges);
- // If there is only one exiting edge in the loop, it is always profitable to
- // predicate the loop.
- if (ExitEdges.size() == 1)
- return true;
-
- // Calculate the exiting probabilities of all exiting edges from the loop,
- // starting with the LatchExitProbability.
- // Heuristic for profitability: If any of the exiting blocks' probability of
- // exiting the loop is larger than exiting through the latch block, it's not
- // profitable to predicate the loop.
- auto *LatchBlock = L->getLoopLatch();
- assert(LatchBlock && "Should have a single latch at this point!");
- auto *LatchTerm = LatchBlock->getTerminator();
- assert(LatchTerm->getNumSuccessors() == 2 &&
- "expected to be an exiting block with 2 succs!");
- unsigned LatchBrExitIdx =
- LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
- BranchProbability LatchExitProbability =
- BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
-
- // Protect against degenerate inputs provided by the user. Providing a value
-  // less than one can invert the definition of profitable loop predication.
- float ScaleFactor = LatchExitProbabilityScale;
- if (ScaleFactor < 1) {
- LLVM_DEBUG(
- dbgs()
- << "Ignored user setting for loop-predication-latch-probability-scale: "
- << LatchExitProbabilityScale << "\n");
- LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
- ScaleFactor = 1.0;
- }
- const auto LatchProbabilityThreshold =
- LatchExitProbability * ScaleFactor;
-
- for (const auto &ExitEdge : ExitEdges) {
- BranchProbability ExitingBlockProbability =
- BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
- // Some exiting edge has higher probability than the latch exiting edge.
- // No longer profitable to predicate.
- if (ExitingBlockProbability > LatchProbabilityThreshold)
- return false;
- }
- // Using BPI, we have concluded that the most probable way to exit from the
- // loop is through the latch (or there's no profile information and all
- // exits are equally likely).
- return true;
-}
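// A numeric sketch of this heuristic (the probabilities are hypothetical):
// with the default scale of 2.0, a latch exit probability of 10% yields a
// threshold of 20%. An exiting block whose exit probability is 30% exceeds
// the threshold, so predication is deemed unprofitable; if every other exit
// stays at or below 20%, the latch remains the dominant exit and predication
// proceeds.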
-
-/// If we can (cheaply) find a widenable branch which controls entry into the
-/// loop, return it.
-static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) {
- // Walk back through any unconditional executed blocks and see if we can find
- // a widenable condition which seems to control execution of this loop. Note
- // that we predict that maythrow calls are likely untaken and thus that it's
- // profitable to widen a branch before a maythrow call with a condition
- // afterwards even though that may cause the slow path to run in a case where
- // it wouldn't have otherwise.
- BasicBlock *BB = L->getLoopPreheader();
- if (!BB)
- return nullptr;
- do {
- if (BasicBlock *Pred = BB->getSinglePredecessor())
- if (BB == Pred->getSingleSuccessor()) {
- BB = Pred;
- continue;
- }
- break;
- } while (true);
-
- if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- auto *Term = Pred->getTerminator();
-
- Value *Cond, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) &&
- IfTrueBB == BB)
- return cast<BranchInst>(Term);
- }
- return nullptr;
-}
-
-/// Return the minimum of all analyzeable exit counts. This is an upper bound
-/// on the actual exit count. If there are not at least two analyzeable exits,
-/// returns SCEVCouldNotCompute.
-static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE,
- DominatorTree &DT,
- Loop *L) {
- SmallVector<BasicBlock *, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- SmallVector<const SCEV *, 4> ExitCounts;
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE.getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
- assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
- "We should only have known counts for exiting blocks that "
- "dominate latch!");
- ExitCounts.push_back(ExitCount);
- }
- if (ExitCounts.size() < 2)
- return SE.getCouldNotCompute();
- return SE.getUMinFromMismatchedTypes(ExitCounts);
-}
-
-/// This implements an analogous, but entirely distinct transform from the main
-/// loop predication transform. This one is phrased in terms of using a
-/// widenable branch *outside* the loop to allow us to simplify loop exits in a
-/// following loop. This is close in spirit to the IndVarSimplify transform
-/// of the same name, but is materially different in that widening loosens
-/// legality sharply.
-bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
- // The transformation performed here aims to widen a widenable condition
-  // above the loop such that all analyzeable exits leading to deopt are dead.
- // It assumes that the latch is the dominant exit for profitability and that
- // exits branching to deoptimizing blocks are rarely taken. It relies on the
- // semantics of widenable expressions for legality. (i.e. being able to fall
- // down the widenable path spuriously allows us to ignore exit order,
- // unanalyzeable exits, side effects, exceptional exits, and other challenges
- // which restrict the applicability of the non-WC based version of this
- // transform in IndVarSimplify.)
- //
- // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may
- // imply flags on the expression being hoisted and inserting new uses (flags
- // are only correct for current uses). The result is that we may be
- // inserting a branch on the value which can be either poison or undef. In
- // this case, the branch can legally go either way; we just need to avoid
- // introducing UB. This is achieved through the use of the freeze
- // instruction.
-
- SmallVector<BasicBlock *, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- if (ExitingBlocks.empty())
- return false; // Nothing to do.
-
- auto *Latch = L->getLoopLatch();
- if (!Latch)
- return false;
-
- auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI);
- if (!WidenableBR)
- return false;
-
- const SCEV *LatchEC = SE->getExitCount(L, Latch);
- if (isa<SCEVCouldNotCompute>(LatchEC))
- return false; // profitability - want hot exit in analyzeable set
-
- // At this point, we have found an analyzeable latch, and a widenable
- // condition above the loop. If we have a widenable exit within the loop
- // (for which we can't compute exit counts), drop the ability to further
- // widen so that we gain the ability to analyze its exit count and perform this
- // transform. TODO: It'd be nice to know for sure the exit became
- // analyzeable after dropping widenability.
- {
- bool Invalidate = false;
-
- for (auto *ExitingBB : ExitingBlocks) {
- if (LI->getLoopFor(ExitingBB) != L)
- continue;
-
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- continue;
-
- Use *Cond, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) &&
- L->contains(IfTrueBB)) {
- WC->set(ConstantInt::getTrue(IfTrueBB->getContext()));
- Invalidate = true;
- }
- }
- if (Invalidate)
- SE->forgetLoop(L);
- }
-
- // The use of umin(all analyzeable exits) instead of latch is subtle, but
- // important for profitability. We may have a loop which hasn't been fully
- // canonicalized just yet. If the exit we chose to widen is provably never
- // taken, we want the widened form to *also* be provably never taken. We
- // can't guarantee this as a current unanalyzeable exit may later become
- // analyzeable, but we can at least avoid the obvious cases.
- const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L);
- if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() ||
- !SE->isLoopInvariant(MinEC, L) ||
- !isSafeToExpandAt(MinEC, WidenableBR, *SE))
- return false;
-
- // Subtlety: We need to avoid inserting additional uses of the WC. We know
- // that it can only have one transitive use at the moment, and thus moving
- // that use to just before the branch and inserting code before it and then
- // modifying the operand is legal.
- auto *IP = cast<Instruction>(WidenableBR->getCondition());
- IP->moveBefore(WidenableBR);
- Rewriter.setInsertPoint(IP);
- IRBuilder<> B(IP);
-
- bool Changed = false;
- Value *MinECV = nullptr; // lazily generated if needed
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- continue;
-
- // Can't rewrite non-branch yet.
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- continue;
-
- // If already constant, nothing to do.
- if (isa<Constant>(BI->getCondition()))
- continue;
-
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount) ||
- ExitCount->getType()->isPointerTy() ||
- !isSafeToExpandAt(ExitCount, WidenableBR, *SE))
- continue;
-
- const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
- BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 0 : 1);
- if (!ExitBB->getPostdominatingDeoptimizeCall())
- continue;
-
- /// Here we can be fairly sure that executing this exit will most likely
- /// lead to executing llvm.experimental.deoptimize.
- /// This is a profitability heuristic, not a legality constraint.
-
- // If we found a widenable exit condition, do two things:
- // 1) fold the widened exit test into the widenable condition
- // 2) fold the branch to untaken - avoids infinite looping
-
- Value *ECV = Rewriter.expandCodeFor(ExitCount);
- if (!MinECV)
- MinECV = Rewriter.expandCodeFor(MinEC);
- Value *RHS = MinECV;
- if (ECV->getType() != RHS->getType()) {
- Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
- ECV = B.CreateZExt(ECV, WiderTy);
- RHS = B.CreateZExt(RHS, WiderTy);
- }
- assert(!Latch || DT->dominates(ExitingBB, Latch));
- Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS);
- // Freeze poison or undef to an arbitrary bit pattern to ensure we can
- // branch without introducing UB. See NOTE ON POISON/UNDEF above for
- // context.
- NewCond = B.CreateFreeze(NewCond);
-
- widenWidenableBranch(WidenableBR, NewCond);
-
- Value *OldCond = BI->getCondition();
- BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue));
- Changed = true;
- }
-
- if (Changed)
- // We just mutated a bunch of loop exits changing their exit counts
- // widely. We need to force recomputation of the exit counts given these
- // changes. Note that all of the inserted exits are never taken, and
- // should be removed next time the CFG is modified.
- SE->forgetLoop(L);
- return Changed;
-}
-
-bool LoopPredication::runOnLoop(Loop *Loop) {
- L = Loop;
-
- LLVM_DEBUG(dbgs() << "Analyzing ");
- LLVM_DEBUG(L->dump());
-
- Module *M = L->getHeader()->getModule();
-
- // There is nothing to do if the module doesn't use guards
- auto *GuardDecl =
- M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
- bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
- auto *WCDecl = M->getFunction(
- Intrinsic::getName(Intrinsic::experimental_widenable_condition));
- bool HasWidenableConditions =
- PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty();
- if (!HasIntrinsicGuards && !HasWidenableConditions)
- return false;
-
- DL = &M->getDataLayout();
-
- Preheader = L->getLoopPreheader();
- if (!Preheader)
- return false;
-
- auto LatchCheckOpt = parseLoopLatchICmp();
- if (!LatchCheckOpt)
- return false;
- LatchCheck = *LatchCheckOpt;
-
- LLVM_DEBUG(dbgs() << "Latch check:\n");
- LLVM_DEBUG(LatchCheck.dump());
-
- if (!isLoopProfitableToPredicate()) {
- LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n");
- return false;
- }
- // Collect all the guards into a vector and process later, so as not
- // to invalidate the instruction iterator.
- SmallVector<IntrinsicInst *, 4> Guards;
- SmallVector<BranchInst *, 4> GuardsAsWidenableBranches;
- for (const auto BB : L->blocks()) {
- for (auto &I : *BB)
- if (isGuard(&I))
- Guards.push_back(cast<IntrinsicInst>(&I));
- if (PredicateWidenableBranchGuards &&
- isGuardAsWidenableBranch(BB->getTerminator()))
- GuardsAsWidenableBranches.push_back(
- cast<BranchInst>(BB->getTerminator()));
- }
-
- SCEVExpander Expander(*SE, *DL, "loop-predication");
- bool Changed = false;
- for (auto *Guard : Guards)
- Changed |= widenGuardConditions(Guard, Expander);
- for (auto *Guard : GuardsAsWidenableBranches)
- Changed |= widenWidenableBranchGuardConditions(Guard, Expander);
- Changed |= predicateLoopExits(L, Expander);
- return Changed;
-}
+ return None;
+ if (!isSafeToTruncateWideIVType(DL, SE, LatchCheck, RangeCheckType))
+ return None;
+ // We can now safely identify the truncated version of the IV and limit for
+ // RangeCheckType.
+ LoopICmp NewLatchCheck;
+ NewLatchCheck.Pred = LatchCheck.Pred;
+ NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
+ SE.getTruncateExpr(LatchCheck.IV, RangeCheckType));
+ if (!NewLatchCheck.IV)
+ return None;
+ NewLatchCheck.Limit = SE.getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+ LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:"
+ << *RangeCheckType << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ return NewLatchCheck;
+}
+
+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+ return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop);
+}
+
+Instruction *LoopPredication::findInsertPt(Instruction *Use,
+ ArrayRef<Value*> Ops) {
+ for (Value *Op : Ops)
+ if (!L->isLoopInvariant(Op))
+ return Use;
+ return Preheader->getTerminator();
+}
+
+Instruction *LoopPredication::findInsertPt(Instruction *Use,
+ ArrayRef<const SCEV*> Ops) {
+ // Subtlety: SCEV considers things to be invariant if the value produced is
+ // the same across iterations. This is not the same as being able to
+ // evaluate outside the loop, which is what we actually need here.
+ for (const SCEV *Op : Ops)
+ if (!SE->isLoopInvariant(Op, L) ||
+ !isSafeToExpandAt(Op, Preheader->getTerminator(), *SE))
+ return Use;
+ return Preheader->getTerminator();
+}
+
+bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
+ // Handling expressions which produce invariant results, but *haven't* yet
+ // been removed from the loop serves two important purposes.
+ // 1) Most importantly, it resolves a pass ordering cycle which would
+ // otherwise need us to iterate licm, loop-predication, and either
+ // loop-unswitch or loop-peeling to make progress on examples with lots of
+ // predicable range checks in a row. (Since, in the general case, we can't
+ // hoist the length checks until the dominating checks have been discharged
+ // as we can't prove doing so is safe.)
+ // 2) As a nice side effect, this exposes the value of peeling or unswitching
+ // much more obviously in the IR. Otherwise, the cost modeling for other
+ // transforms would end up needing to duplicate all of this logic to model a
+ // check which becomes predictable based on a modeled peel or unswitch.
+ //
+ // The cost of doing so in the worst case is an extra fill from the stack in
+ // the loop to materialize the loop invariant test value instead of checking
+ // against the original IV which is presumably in a register inside the loop.
+ // Such cases are presumably rare, and hint at missing opportunities for
+ // other passes.
+
+ if (SE->isLoopInvariant(S, L))
+ // Note: This is the SCEV variant, so the original Value* may be within the
+ // loop even though SCEV has proven it is loop invariant.
+ return true;
+
+ // Handle a particular important case which SCEV doesn't yet know about which
+ // shows up in range checks on arrays with immutable lengths.
+ // TODO: This should be sunk inside SCEV.
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
+ if (const auto *LI = dyn_cast<LoadInst>(U->getValue()))
+ if (LI->isUnordered() && L->hasLoopInvariantOperands(LI))
+ if (AA->pointsToConstantMemory(LI->getOperand(0)) ||
+ LI->hasMetadata(LLVMContext::MD_invariant_load))
+ return true;
+ return false;
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+ LoopICmp LatchCheck, LoopICmp RangeCheck,
+ SCEVExpander &Expander, Instruction *Guard) {
+ auto *Ty = RangeCheck.IV->getType();
+ // Generate the widened condition for the forward loop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> guardLimit - 1 - guardStart + latchStart
+ // where <pred> depends on the latch condition predicate. See the file
+ // header comment for the reasoning.
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchStart = LatchCheck.IV->getStart();
+ const SCEV *LatchLimit = LatchCheck.Limit;
+ // Subtlety: We need all the values to be *invariant* across all iterations,
+ // but we only need to check expansion safety for those which *aren't*
+ // already guaranteed to dominate the guard.
+ if (!isLoopInvariantValue(GuardStart) ||
+ !isLoopInvariantValue(GuardLimit) ||
+ !isLoopInvariantValue(LatchStart) ||
+ !isLoopInvariantValue(LatchLimit)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
+ !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *RHS =
+ SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
+ SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
+
+ LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+ LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+ LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+
+ auto *LimitCheck =
+ expandCheck(Expander, Guard, LimitCheckPred, LatchLimit, RHS);
+ auto *FirstIterationCheck = expandCheck(Expander, Guard, RangeCheck.Pred,
+ GuardStart, GuardLimit);
+ IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
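
The arithmetic above is easiest to trust with concrete numbers. Below is a small standalone sketch (not the pass itself) that exhaustively checks the hoisted condition for an incrementing, unit-step loop with a u< latch predicate, whose flipped-strictness form is u<=; the variable names and tiny search range are illustrative assumptions. When the hoisted check fails, the widenable branch would simply take the slow path, so only the passing case is simulated.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t N = 8; // small bound keeps the search exhaustive and fast
  for (uint64_t guardStart = 0; guardStart < N; ++guardStart)
    for (uint64_t guardLimit = 0; guardLimit < N; ++guardLimit)
      for (uint64_t latchStart = 0; latchStart < N; ++latchStart)
        for (uint64_t latchLimit = 0; latchLimit < N; ++latchLimit) {
          // Hoisted check: guardStart u< guardLimit &&
          //                latchLimit u<= guardLimit - 1 - guardStart + latchStart
          bool Hoisted = guardStart < guardLimit &&
                         latchLimit <= guardLimit - 1 - guardStart + latchStart;
          if (!Hoisted)
            continue;
          // Simulate the loop: iteration k performs the range check
          // (guardStart + k) u< guardLimit, then the latch tests
          // ++latchIV u< latchLimit to decide whether to run iteration k+1.
          for (uint64_t k = 0;; ++k) {
            assert(guardStart + k < guardLimit && "range check must hold");
            if (!(latchStart + k + 1 < latchLimit))
              break;
          }
        }
  return 0;
}
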
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
+ LoopICmp LatchCheck, LoopICmp RangeCheck,
+ SCEVExpander &Expander, Instruction *Guard) {
+ auto *Ty = RangeCheck.IV->getType();
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchStart = LatchCheck.IV->getStart();
+ const SCEV *LatchLimit = LatchCheck.Limit;
+ // Subtlety: We need all the values to be *invariant* across all iterations,
+ // but we only need to check expansion safety for those which *aren't*
+ // already guaranteed to dominate the guard.
+ if (!isLoopInvariantValue(GuardStart) ||
+ !isLoopInvariantValue(GuardLimit) ||
+ !isLoopInvariantValue(LatchStart) ||
+ !isLoopInvariantValue(LatchLimit)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
+ !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ // The decrement of the latch check IV should be the same as the
+ // rangeCheckIV.
+ auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
+ if (RangeCheck.IV != PostDecLatchCheckIV) {
+ LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
+ << *PostDecLatchCheckIV
+ << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
+ return None;
+ }
+
+ // Generate the widened condition for CountDownLoop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> 1.
+ // See the header comment for reasoning of the checks.
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
+ auto *FirstIterationCheck = expandCheck(Expander, Guard,
+ ICmpInst::ICMP_ULT,
+ GuardStart, GuardLimit);
+ auto *LimitCheck = expandCheck(Expander, Guard, LimitCheckPred, LatchLimit,
+ SE->getOne(Ty));
+ IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
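
The countdown case admits the same kind of exhaustive sanity check. The sketch below fixes a u> latch predicate and a -1 step, so the flipped-strictness limit check is latchLimit u>= 1, and the range-check IV is the post-decremented latch IV (guardStart == latchStart - 1), mirroring the requirement enforced above; names and bounds are illustrative.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t N = 8;
  for (uint64_t latchStart = 1; latchStart < N; ++latchStart)
    for (uint64_t guardLimit = 0; guardLimit < N; ++guardLimit)
      for (uint64_t latchLimit = 0; latchLimit < N; ++latchLimit) {
        const uint64_t guardStart = latchStart - 1; // post-decremented start
        // Hoisted check: guardStart u< guardLimit && latchLimit u>= 1.
        if (!(guardStart < guardLimit && latchLimit >= 1))
          continue;
        uint64_t latchIV = latchStart;
        for (;;) {
          uint64_t guardIV = latchIV - 1;          // post-decremented latch IV
          assert(guardIV < guardLimit && "range check must hold");
          --latchIV;
          if (!(latchIV > latchLimit))             // latch: --i u> latchLimit
            break;
        }
      }
  return 0;
}
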
+
+static void normalizePredicate(ScalarEvolution *SE, Loop *L,
+ LoopICmp& RC) {
+ // LFTR canonicalizes checks to the ICMP_NE/EQ form; normalize back to the
+ // ULT/UGE form for ease of handling by our caller.
+ if (ICmpInst::isEquality(RC.Pred) &&
+ RC.IV->getStepRecurrence(*SE)->isOne() &&
+ SE->isKnownPredicate(ICmpInst::ICMP_ULE, RC.IV->getStart(), RC.Limit))
+ RC.Pred = RC.Pred == ICmpInst::ICMP_NE ?
+ ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
+}
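
A tiny sketch of why this normalization is sound: for a unit-step IV known to start at or below the limit, equality and u< disagree on no value the IV can take before leaving the loop. The ranges below are illustrative.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t N = 16;
  for (uint64_t start = 0; start < N; ++start)
    for (uint64_t limit = start; limit < N; ++limit) // start u<= limit is known
      for (uint64_t iv = start; iv <= limit; ++iv)   // values the IV takes
        assert((iv != limit) == (iv < limit));
  return 0;
}
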
+
+
+/// If ICI can be widened to a loop-invariant condition, emit that condition
+/// in the loop preheader and return it; otherwise return None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ Instruction *Guard) {
+ LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ LLVM_DEBUG(ICI->dump());
+
+ // parseLoopStructure guarantees that the latch condition is:
+ // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
+ // We are looking for the range checks of the form:
+ // i u< guardLimit
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+ LLVM_DEBUG(dbgs() << "Guard check:\n");
+ LLVM_DEBUG(RangeCheck->dump());
+ if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
+ LLVM_DEBUG(dbgs() << "Unsupported range check predicate("
+ << RangeCheck->Pred << ")!\n");
+ return None;
+ }
+ auto *RangeCheckIV = RangeCheck->IV;
+ if (!RangeCheckIV->isAffine()) {
+ LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n");
+ return None;
+ }
+ auto *Step = RangeCheckIV->getStepRecurrence(*SE);
+ // We cannot just compare with latch IV step because the latch and range IVs
+ // may have different types.
+ if (!isSupportedStep(Step)) {
+ LLVM_DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+ return None;
+ }
+ auto *Ty = RangeCheckIV->getType();
+ auto CurrLatchCheckOpt = generateLoopLatchCheck(*DL, *SE, LatchCheck, Ty);
+ if (!CurrLatchCheckOpt) {
+ LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
+ return None;
+ }
+
+ LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+ // At this point, the range and latch step should have the same type, but need
+ // not have the same value (we support both 1 and -1 steps).
+ assert(Step->getType() ==
+ CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
+ "Range and latch steps should be of same type!");
+ if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
+ LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n");
+ return None;
+ }
+
+ if (Step->isOne())
+ return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Guard);
+ else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Guard);
+ }
+}
+
+unsigned LoopPredication::collectChecks(SmallVectorImpl<Value *> &Checks,
+ Value *Condition,
+ SCEVExpander &Expander,
+ Instruction *Guard) {
+ unsigned NumWidened = 0;
+ // The guard condition is expected to be in form of:
+ // cond1 && cond2 && cond3 ...
+ // Iterate over subconditions looking for icmp conditions which can be
+ // widened across loop iterations. While widening these conditions, remember
+ // the resulting list of subconditions in the Checks vector.
+ SmallVector<Value *, 4> Worklist(1, Condition);
+ SmallPtrSet<Value *, 4> Visited;
+ Value *WideableCond = nullptr;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (match(Condition,
+ m_Intrinsic<Intrinsic::experimental_widenable_condition>())) {
+ // Pick any, we don't care which
+ WideableCond = Condition;
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander,
+ Guard)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+ } while (!Worklist.empty());
+ // At the moment, our matching logic for widenable conditions implicitly
+ // assumes we preserve the form: (br (and Cond, WC())). FIXME
+ // Note that if there were multiple calls to the widenable condition in the
+ // traversal, we only need to keep one, and which one is arbitrary.
+ if (WideableCond)
+ Checks.push_back(WideableCond);
+ return NumWidened;
+}
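
Stripped of the LLVM machinery, the traversal above is a plain worklist walk over an and-tree: conjunctions are split and re-queued, everything else is treated as a leaf check. The toy types below are illustrative; the real code additionally de-duplicates via a Visited set and remembers at most one widenable-condition call.

#include <cassert>
#include <vector>

struct Cond {
  bool IsAnd;        // true for an "and" of two subconditions
  Cond *LHS, *RHS;   // children when IsAnd; ignored otherwise
  int Id;            // leaf identifier otherwise
};

static std::vector<int> collectLeaves(Cond *Root) {
  std::vector<int> Leaves;
  std::vector<Cond *> Worklist;
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    Cond *C = Worklist.back();
    Worklist.pop_back();
    if (C->IsAnd) {
      Worklist.push_back(C->LHS);
      Worklist.push_back(C->RHS);
      continue;
    }
    Leaves.push_back(C->Id); // a leaf check: keep it (widened or as-is)
  }
  return Leaves;
}

int main() {
  // (1 && 2) && 3 decomposes into the three leaf checks.
  Cond C1{false, nullptr, nullptr, 1};
  Cond C2{false, nullptr, nullptr, 2};
  Cond C3{false, nullptr, nullptr, 3};
  Cond Inner{true, &C1, &C2, 0};
  Cond Root{true, &Inner, &C3, 0};
  assert(collectLeaves(&Root).size() == 3);
  return 0;
}
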
+
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ LLVM_DEBUG(dbgs() << "Processing guard:\n");
+ LLVM_DEBUG(Guard->dump());
+
+ TotalConsidered++;
+ SmallVector<Value *, 4> Checks;
+ unsigned NumWidened = collectChecks(Checks, Guard->getOperand(0), Expander,
+ Guard);
+ if (NumWidened == 0)
+ return false;
+
+ TotalWidened += NumWidened;
+
+ // Emit the new guard condition
+ IRBuilder<> Builder(findInsertPt(Guard, Checks));
+ Value *AllChecks = Builder.CreateAnd(Checks);
+ auto *OldCond = Guard->getOperand(0);
+ Guard->setOperand(0, AllChecks);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+
+ LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+bool LoopPredication::widenWidenableBranchGuardConditions(
+ BranchInst *BI, SCEVExpander &Expander) {
+ assert(isGuardAsWidenableBranch(BI) && "Must be!");
+ LLVM_DEBUG(dbgs() << "Processing guard:\n");
+ LLVM_DEBUG(BI->dump());
+
+ TotalConsidered++;
+ SmallVector<Value *, 4> Checks;
+ unsigned NumWidened = collectChecks(Checks, BI->getCondition(),
+ Expander, BI);
+ if (NumWidened == 0)
+ return false;
+
+ TotalWidened += NumWidened;
+
+ // Emit the new guard condition
+ IRBuilder<> Builder(findInsertPt(BI, Checks));
+ Value *AllChecks = Builder.CreateAnd(Checks);
+ auto *OldCond = BI->getCondition();
+ BI->setCondition(AllChecks);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ assert(isGuardAsWidenableBranch(BI) &&
+ "Stopped being a guard after transform?");
+
+ LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+Optional<LoopICmp> LoopPredication::parseLoopLatchICmp() {
+ using namespace PatternMatch;
+
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
+ LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+ return None;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
+ if (!BI || !BI->isConditional()) {
+ LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+ return None;
+ }
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ assert(
+ (TrueDest == L->getHeader() || BI->getSuccessor(1) == L->getHeader()) &&
+ "One of the latch's destinations must be the header");
+
+ auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!ICI) {
+ LLVM_DEBUG(dbgs() << "Failed to match the latch condition!\n");
+ return None;
+ }
+ auto Result = parseLoopICmp(ICI);
+ if (!Result) {
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ if (TrueDest != L->getHeader())
+ Result->Pred = ICmpInst::getInversePredicate(Result->Pred);
+
+ // Check affine first, so if it's not we don't try to compute the step
+ // recurrence.
+ if (!Result->IV->isAffine()) {
+ LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n");
+ return None;
+ }
+
+ auto *Step = Result->IV->getStepRecurrence(*SE);
+ if (!isSupportedStep(Step)) {
+ LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+ return None;
+ }
+
+ auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+ if (Step->isOne()) {
+ return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+ Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+ } else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE;
+ }
+ };
+
+ normalizePredicate(SE, L, *Result);
+ if (IsUnsupportedPredicate(Step, Result->Pred)) {
+ LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
+ return None;
+ }
+
+ return Result;
+}
+
+
+bool LoopPredication::isLoopProfitableToPredicate() {
+ if (SkipProfitabilityChecks || !BPI)
+ return true;
+
+ SmallVector<std::pair<BasicBlock *, BasicBlock *>, 8> ExitEdges;
+ L->getExitEdges(ExitEdges);
+ // If there is only one exiting edge in the loop, it is always profitable to
+ // predicate the loop.
+ if (ExitEdges.size() == 1)
+ return true;
+
+ // Calculate the exiting probabilities of all exiting edges from the loop,
+ // starting with the LatchExitProbability.
+ // Heuristic for profitability: If any of the exiting blocks' probability of
+ // exiting the loop is larger than exiting through the latch block, it's not
+ // profitable to predicate the loop.
+ auto *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "Should have a single latch at this point!");
+ auto *LatchTerm = LatchBlock->getTerminator();
+ assert(LatchTerm->getNumSuccessors() == 2 &&
+ "expected to be an exiting block with 2 succs!");
+ unsigned LatchBrExitIdx =
+ LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
+ BranchProbability LatchExitProbability =
+ BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
+
+ // Protect against degenerate inputs provided by the user. Providing a value
+ // less than one can invert the definition of profitable loop predication.
+ float ScaleFactor = LatchExitProbabilityScale;
+ if (ScaleFactor < 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Ignored user setting for loop-predication-latch-probability-scale: "
+ << LatchExitProbabilityScale << "\n");
+ LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
+ ScaleFactor = 1.0;
+ }
+ const auto LatchProbabilityThreshold =
+ LatchExitProbability * ScaleFactor;
+
+ for (const auto &ExitEdge : ExitEdges) {
+ BranchProbability ExitingBlockProbability =
+ BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
+ // Some exiting edge has higher probability than the latch exiting edge.
+ // No longer profitable to predicate.
+ if (ExitingBlockProbability > LatchProbabilityThreshold)
+ return false;
+ }
+ // Using BPI, we have concluded that the most probable way to exit from the
+ // loop is through the latch (or there's no profile information and all
+ // exits are equally likely).
+ return true;
+}
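
Restated over plain doubles, the rule is: predication is considered profitable only when no exit edge is more likely than the latch exit scaled by the (clamped to >= 1.0) user factor. A minimal sketch with illustrative names and probabilities; the pass itself works on BranchProbability values from BPI.

#include <cassert>
#include <vector>

static bool profitableToPredicate(double LatchExitProb,
                                  const std::vector<double> &OtherExitProbs,
                                  double ScaleFactor) {
  if (ScaleFactor < 1.0) // degenerate user setting: clamp, as the pass does
    ScaleFactor = 1.0;
  const double Threshold = LatchExitProb * ScaleFactor;
  for (double P : OtherExitProbs)
    if (P > Threshold)
      return false; // some exit dominates the latch exit; don't predicate
  return true;
}

int main() {
  assert(profitableToPredicate(0.05, {0.01, 0.02}, 2.0));
  assert(!profitableToPredicate(0.01, {0.20}, 2.0));
  return 0;
}
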
+
+/// If we can (cheaply) find a widenable branch which controls entry into the
+/// loop, return it.
+static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) {
+ // Walk back through any unconditionally executed blocks and see if we can find
+ // a widenable condition which seems to control execution of this loop. Note
+ // that we predict that maythrow calls are likely untaken and thus that it's
+ // profitable to widen a branch before a maythrow call with a condition
+ // afterwards even though that may cause the slow path to run in a case where
+ // it wouldn't have otherwise.
+ BasicBlock *BB = L->getLoopPreheader();
+ if (!BB)
+ return nullptr;
+ do {
+ if (BasicBlock *Pred = BB->getSinglePredecessor())
+ if (BB == Pred->getSingleSuccessor()) {
+ BB = Pred;
+ continue;
+ }
+ break;
+ } while (true);
+
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *Term = Pred->getTerminator();
+
+ Value *Cond, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) &&
+ IfTrueBB == BB)
+ return cast<BranchInst>(Term);
+ }
+ return nullptr;
+}
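
A minimal sketch of the walk-back above over a toy CFG node that records only optional single-predecessor/single-successor links; the struct and names are assumptions made for illustration.

#include <cassert>

struct Block {
  Block *SinglePred = nullptr; // null if the block has 0 or >1 predecessors
  Block *SingleSucc = nullptr; // null if the block has 0 or >1 successors
};

// Walk up through blocks that are unconditionally executed before Start.
static Block *walkToTopOfStraightLineRegion(Block *Start) {
  Block *BB = Start;
  while (Block *Pred = BB->SinglePred) {
    if (Pred->SingleSucc != BB)
      break; // Pred branches elsewhere too; stop here
    BB = Pred;
  }
  return BB;
}

int main() {
  Block A, B, C; // A -> B -> C, each edge unconditional
  A.SingleSucc = &B;
  B.SinglePred = &A;
  B.SingleSucc = &C;
  C.SinglePred = &B;
  assert(walkToTopOfStraightLineRegion(&C) == &A);
  return 0;
}
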
+
+/// Return the minimum of all analyzeable exit counts. This is an upper bound
+/// on the actual exit count. If there are not at least two analyzeable exits,
+/// returns SCEVCouldNotCompute.
+static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE,
+ DominatorTree &DT,
+ Loop *L) {
+ SmallVector<BasicBlock *, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ SmallVector<const SCEV *, 4> ExitCounts;
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE.getExitCount(L, ExitingBB);
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+ assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
+ "We should only have known counts for exiting blocks that "
+ "dominate latch!");
+ ExitCounts.push_back(ExitCount);
+ }
+ if (ExitCounts.size() < 2)
+ return SE.getCouldNotCompute();
+ return SE.getUMinFromMismatchedTypes(ExitCounts);
+}
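
The same idea over plain integers, with an unanalyzeable count modeled as an empty optional and the two-known-counts requirement preserved; names are illustrative.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

static std::optional<uint64_t>
minKnownExitCount(const std::vector<std::optional<uint64_t>> &Counts) {
  std::vector<uint64_t> Known;
  for (const auto &C : Counts)
    if (C)
      Known.push_back(*C);
  if (Known.size() < 2)
    return std::nullopt; // analogous to returning SCEVCouldNotCompute
  return *std::min_element(Known.begin(), Known.end());
}

int main() {
  std::vector<std::optional<uint64_t>> Counts = {7, std::nullopt, 3};
  assert(minKnownExitCount(Counts) == 3u);
  Counts = {7, std::nullopt};
  assert(!minKnownExitCount(Counts));
  return 0;
}
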
+
+/// This implements an analogous, but entirely distinct transform from the main
+/// loop predication transform. This one is phrased in terms of using a
+/// widenable branch *outside* the loop to allow us to simplify loop exits in a
+/// following loop. This is close in spirit to the IndVarSimplify transform
+/// of the same name, but is materially different in that widening loosens
+/// legality sharply.
+bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ // The transformation performed here aims to widen a widenable condition
+ // above the loop such that all analyzeable exits leading to deopt are dead.
+ // It assumes that the latch is the dominant exit for profitability and that
+ // exits branching to deoptimizing blocks are rarely taken. It relies on the
+ // semantics of widenable expressions for legality. (i.e. being able to fall
+ // down the widenable path spuriously allows us to ignore exit order,
+ // unanalyzeable exits, side effects, exceptional exits, and other challenges
+ // which restrict the applicability of the non-WC based version of this
+ // transform in IndVarSimplify.)
+ //
+ // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may
+ // imply flags on the expression being hoisted and inserting new uses (flags
+ // are only correct for current uses). The result is that we may be
+ // inserting a branch on the value which can be either poison or undef. In
+ // this case, the branch can legally go either way; we just need to avoid
+ // introducing UB. This is achieved through the use of the freeze
+ // instruction.
+
+ SmallVector<BasicBlock *, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ if (ExitingBlocks.empty())
+ return false; // Nothing to do.
+
+ auto *Latch = L->getLoopLatch();
+ if (!Latch)
+ return false;
+
+ auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI);
+ if (!WidenableBR)
+ return false;
+
+ const SCEV *LatchEC = SE->getExitCount(L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchEC))
+ return false; // profitability - want hot exit in analyzeable set
+
+ // At this point, we have found an analyzeable latch, and a widenable
+ // condition above the loop. If we have a widenable exit within the loop
+ // (for which we can't compute exit counts), drop the ability to further
+ // widen so that we gain the ability to analyze its exit count and perform this
+ // transform. TODO: It'd be nice to know for sure the exit became
+ // analyzeable after dropping widenability.
+ {
+ bool Invalidate = false;
+
+ for (auto *ExitingBB : ExitingBlocks) {
+ if (LI->getLoopFor(ExitingBB) != L)
+ continue;
+
+ auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ continue;
+
+ Use *Cond, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) &&
+ L->contains(IfTrueBB)) {
+ WC->set(ConstantInt::getTrue(IfTrueBB->getContext()));
+ Invalidate = true;
+ }
+ }
+ if (Invalidate)
+ SE->forgetLoop(L);
+ }
+
+ // The use of umin(all analyzeable exits) instead of latch is subtle, but
+ // important for profitability. We may have a loop which hasn't been fully
+ // canonicalized just yet. If the exit we chose to widen is provably never
+ // taken, we want the widened form to *also* be provably never taken. We
+ // can't guarantee this as a current unanalyzeable exit may later become
+ // analyzeable, but we can at least avoid the obvious cases.
+ const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L);
+ if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() ||
+ !SE->isLoopInvariant(MinEC, L) ||
+ !isSafeToExpandAt(MinEC, WidenableBR, *SE))
+ return false;
+
+ // Subtlety: We need to avoid inserting additional uses of the WC. We know
+ // that it can only have one transitive use at the moment, and thus moving
+ // that use to just before the branch and inserting code before it and then
+ // modifying the operand is legal.
+ auto *IP = cast<Instruction>(WidenableBR->getCondition());
+ IP->moveBefore(WidenableBR);
+ Rewriter.setInsertPoint(IP);
+ IRBuilder<> B(IP);
+
+ bool Changed = false;
+ Value *MinECV = nullptr; // lazily generated if needed
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ continue;
+
+ // Can't rewrite non-branch yet.
+ auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ continue;
+
+ // If already constant, nothing to do.
+ if (isa<Constant>(BI->getCondition()))
+ continue;
+
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ if (isa<SCEVCouldNotCompute>(ExitCount) ||
+ ExitCount->getType()->isPointerTy() ||
+ !isSafeToExpandAt(ExitCount, WidenableBR, *SE))
+ continue;
+
+ const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
+ BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 0 : 1);
+ if (!ExitBB->getPostdominatingDeoptimizeCall())
+ continue;
+
+ /// Here we can be fairly sure that executing this exit will most likely
+ /// lead to executing llvm.experimental.deoptimize.
+ /// This is a profitability heuristic, not a legality constraint.
+
+ // If we found a widenable exit condition, do two things:
+ // 1) fold the widened exit test into the widenable condition
+ // 2) fold the branch to untaken - avoids infinite looping
+
+ Value *ECV = Rewriter.expandCodeFor(ExitCount);
+ if (!MinECV)
+ MinECV = Rewriter.expandCodeFor(MinEC);
+ Value *RHS = MinECV;
+ if (ECV->getType() != RHS->getType()) {
+ Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
+ ECV = B.CreateZExt(ECV, WiderTy);
+ RHS = B.CreateZExt(RHS, WiderTy);
+ }
+ assert(!Latch || DT->dominates(ExitingBB, Latch));
+ Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS);
+ // Freeze poison or undef to an arbitrary bit pattern to ensure we can
+ // branch without introducing UB. See NOTE ON POISON/UNDEF above for
+ // context.
+ NewCond = B.CreateFreeze(NewCond);
+
+ widenWidenableBranch(WidenableBR, NewCond);
+
+ Value *OldCond = BI->getCondition();
+ BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue));
+ Changed = true;
+ }
+
+ if (Changed)
+ // We just mutated a bunch of loop exits changing their exit counts
+ // widely. We need to force recomputation of the exit counts given these
+ // changes. Note that all of the inserted exits are never taken, and
+ // should be removed next time the CFG is modified.
+ SE->forgetLoop(L);
+ return Changed;
+}
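
The reason an exit whose count exceeds the minimum analyzeable count can be folded to untaken is worth spelling out. Under the simplifying assumption that exit i would first fire at iteration Counts[i] and the loop leaves through whichever exit fires first, only an exit whose count equals the minimum can ever be the one taken; the exhaustive sketch below (illustrative names and ranges) checks exactly that.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t N = 6;
  for (uint64_t A = 0; A < N; ++A)
    for (uint64_t B = 0; B < N; ++B)
      for (uint64_t C = 0; C < N; ++C) {
        std::vector<uint64_t> Counts = {A, B, C};
        uint64_t MinEC = *std::min_element(Counts.begin(), Counts.end());
        // Simulate the loop: iteration K leaves through the first exit whose
        // count equals K; earlier iterations take no exit.
        size_t TakenExit = Counts.size();
        for (uint64_t K = 0; TakenExit == Counts.size(); ++K)
          for (size_t I = 0; I < Counts.size(); ++I)
            if (Counts[I] == K) {
              TakenExit = I;
              break;
            }
        // Any exit whose count is strictly greater than MinEC is never taken,
        // which is why its branch can be folded to the untaken direction once
        // the hoisted "Count u> MinEC" test guards the fast path.
        assert(Counts[TakenExit] == MinEC);
      }
  return 0;
}
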
+
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ LLVM_DEBUG(dbgs() << "Analyzing ");
+ LLVM_DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+ // There is nothing to do if the module doesn't use guards
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
+ auto *WCDecl = M->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+ bool HasWidenableConditions =
+ PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty();
+ if (!HasIntrinsicGuards && !HasWidenableConditions)
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ auto LatchCheckOpt = parseLoopLatchICmp();
+ if (!LatchCheckOpt)
+ return false;
+ LatchCheck = *LatchCheckOpt;
+
+ LLVM_DEBUG(dbgs() << "Latch check:\n");
+ LLVM_DEBUG(LatchCheck.dump());
+
+ if (!isLoopProfitableToPredicate()) {
+ LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n");
+ return false;
+ }
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ SmallVector<BranchInst *, 4> GuardsAsWidenableBranches;
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB)
+ if (isGuard(&I))
+ Guards.push_back(cast<IntrinsicInst>(&I));
+ if (PredicateWidenableBranchGuards &&
+ isGuardAsWidenableBranch(BB->getTerminator()))
+ GuardsAsWidenableBranches.push_back(
+ cast<BranchInst>(BB->getTerminator()));
+ }
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+ for (auto *Guard : GuardsAsWidenableBranches)
+ Changed |= widenWidenableBranchGuardConditions(Guard, Expander);
+ Changed |= predicateLoopExits(L, Expander);
+ return Changed;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp
index cd8e046fb8..65a6205f03 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -1,183 +1,183 @@
-//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements a simple loop reroller.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopReroll.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-reroll"
-
-STATISTIC(NumRerolledLoops, "Number of rerolled loops");
-
-static cl::opt<unsigned>
-NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
- cl::Hidden,
- cl::desc("The maximum number of failures to tolerate"
- " during fuzzy matching. (default: 400)"));
-
-// This loop re-rolling transformation aims to transform loops like this:
-//
-// int foo(int a);
-// void bar(int *x) {
-// for (int i = 0; i < 500; i += 3) {
-// foo(i);
-// foo(i+1);
-// foo(i+2);
-// }
-// }
-//
-// into a loop like this:
-//
-// void bar(int *x) {
-// for (int i = 0; i < 500; ++i)
-// foo(i);
-// }
-//
-// It does this by looking for loops that, besides the latch code, are composed
-// of isomorphic DAGs of instructions, with each DAG rooted at some increment
-// to the induction variable, and where each DAG is isomorphic to the DAG
-// rooted at the induction variable (excepting the sub-DAGs which root the
-// other induction-variable increments). In other words, we're looking for loop
-// bodies of the form:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1 <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2 <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// where each f(i) is a set of instructions that, collectively, are a function
-// only of i (and other loop-invariant values).
-//
-// As a special case, we can also reroll loops like this:
-//
-// int foo(int);
-// void bar(int *x) {
-// for (int i = 0; i < 500; ++i) {
-// x[3*i] = foo(0);
-// x[3*i+1] = foo(0);
-// x[3*i+2] = foo(0);
-// }
-// }
-//
-// into this:
-//
-// void bar(int *x) {
-// for (int i = 0; i < 1500; ++i)
-// x[i] = foo(0);
-// }
-//
-// in which case, we're looking for inputs like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// %scaled.iv = mul %iv, scale
-// f(%scaled.iv)
-// %scaled.iv.1 = add %scaled.iv, 1
-// f(%scaled.iv.1)
-// %scaled.iv.2 = add %scaled.iv, 2
-// f(%scaled.iv.2)
-// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
-// f(%scaled.iv.scale_m_1)
-// ...
-// %iv.next = add %iv, 1
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-
-namespace {
-
- enum IterationLimits {
- /// The maximum number of iterations that we'll try and reroll.
- IL_MaxRerollIterations = 32,
- /// The bitvector index used by loop induction variables and other
- /// instructions that belong to all iterations.
- IL_All,
- IL_End
- };
-
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reroll"
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+ cl::Hidden,
+ cl::desc("The maximum number of failures to tolerate"
+ " during fuzzy matching. (default: 400)"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+// for (int i = 0; i < 500; i += 3) {
+// foo(i);
+// foo(i+1);
+// foo(i+2);
+// }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i)
+// foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i) {
+// x[3*i] = foo(0);
+// x[3*i+1] = foo(0);
+// x[3*i+2] = foo(0);
+// }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 1500; ++i)
+// x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
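The first example above, restated as a compilable check that the two shapes visit the same values; the bound is chosen here as a multiple of the scale so they match exactly, and pushing into a vector stands in for the calls to foo.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> Unrolled, Rerolled;
  // Original shape: three copies of the body, induction variable stepping by 3.
  for (int i = 0; i < 9; i += 3) {
    Unrolled.push_back(i);
    Unrolled.push_back(i + 1);
    Unrolled.push_back(i + 2);
  }
  // Rerolled shape: a single copy of the body with a unit step.
  for (int i = 0; i < 9; ++i)
    Rerolled.push_back(i);
  assert(Unrolled == Rerolled);
  return 0;
}
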
+namespace {
+
+ enum IterationLimits {
+ /// The maximum number of iterations that we'll try and reroll.
+ IL_MaxRerollIterations = 32,
+ /// The bitvector index used by loop induction variables and other
+ /// instructions that belong to all iterations.
+ IL_All,
+ IL_End
+ };
+
class LoopRerollLegacyPass : public LoopPass {
- public:
- static char ID; // Pass ID, replacement for typeid
-
+ public:
+ static char ID; // Pass ID, replacement for typeid
+
LoopRerollLegacyPass() : LoopPass(ID) {
initializeLoopRerollLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
};
-
+
class LoopReroll {
public:
LoopReroll(AliasAnalysis *AA, LoopInfo *LI, ScalarEvolution *SE,
@@ -186,1529 +186,1529 @@ namespace {
PreserveLCSSA(PreserveLCSSA) {}
bool runOnLoop(Loop *L);
- protected:
- AliasAnalysis *AA;
- LoopInfo *LI;
- ScalarEvolution *SE;
- TargetLibraryInfo *TLI;
- DominatorTree *DT;
- bool PreserveLCSSA;
-
- using SmallInstructionVector = SmallVector<Instruction *, 16>;
- using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
-
- // Map between induction variable and its increment
- DenseMap<Instruction *, int64_t> IVToIncMap;
-
- // For loops with multiple induction variables, remember the one used only to
- // control the loop.
- Instruction *LoopControlIV;
-
- // A chain of isomorphic instructions, identified by a single-use PHI
- // representing a reduction. Only the last value may be used outside the
- // loop.
- struct SimpleLoopReduction {
- SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
- assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
- add(L);
- }
-
- bool valid() const {
- return Valid;
- }
-
- Instruction *getPHI() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.front();
- }
-
- Instruction *getReducedValue() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.back();
- }
-
- Instruction *get(size_t i) const {
- assert(Valid && "Using invalid reduction");
- return Instructions[i+1];
- }
-
- Instruction *operator [] (size_t i) const { return get(i); }
-
- // The size, ignoring the initial PHI.
- size_t size() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.size()-1;
- }
-
- using iterator = SmallInstructionVector::iterator;
- using const_iterator = SmallInstructionVector::const_iterator;
-
- iterator begin() {
- assert(Valid && "Using invalid reduction");
- return std::next(Instructions.begin());
- }
-
- const_iterator begin() const {
- assert(Valid && "Using invalid reduction");
- return std::next(Instructions.begin());
- }
-
- iterator end() { return Instructions.end(); }
- const_iterator end() const { return Instructions.end(); }
-
- protected:
- bool Valid = false;
- SmallInstructionVector Instructions;
-
- void add(Loop *L);
- };
-
- // The set of all reductions, and state tracking of possible reductions
- // during loop instruction processing.
- struct ReductionTracker {
- using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
-
- // Add a new possible reduction.
- void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
-
- // Setup to track possible reductions corresponding to the provided
- // rerolling scale. Only reductions with a number of non-PHI instructions
- // that is divisible by the scale are considered. Three instruction sets
- // are filled in:
- // - A set of all possible instructions in eligible reductions.
- // - A set of all PHIs in eligible reductions
- // - A set of all reduced values (last instructions) in eligible
- // reductions.
- void restrictToScale(uint64_t Scale,
- SmallInstructionSet &PossibleRedSet,
- SmallInstructionSet &PossibleRedPHISet,
- SmallInstructionSet &PossibleRedLastSet) {
- PossibleRedIdx.clear();
- PossibleRedIter.clear();
- Reds.clear();
-
- for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
- if (PossibleReds[i].size() % Scale == 0) {
- PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
- PossibleRedPHISet.insert(PossibleReds[i].getPHI());
-
- PossibleRedSet.insert(PossibleReds[i].getPHI());
- PossibleRedIdx[PossibleReds[i].getPHI()] = i;
- for (Instruction *J : PossibleReds[i]) {
- PossibleRedSet.insert(J);
- PossibleRedIdx[J] = i;
- }
- }
- }
-
- // The functions below are used while processing the loop instructions.
-
- // Are the two instructions both from reductions, and furthermore, from
- // the same reduction?
- bool isPairInSame(Instruction *J1, Instruction *J2) {
- DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
- if (J1I != PossibleRedIdx.end()) {
- DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
- if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
- return true;
- }
-
- return false;
- }
-
- // The two provided instructions, the first from the base iteration, and
- // the second from iteration i, form a matched pair. If these are part of
- // a reduction, record that fact.
- void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
- if (PossibleRedIdx.count(J1)) {
- assert(PossibleRedIdx.count(J2) &&
- "Recording reduction vs. non-reduction instruction?");
-
- PossibleRedIter[J1] = 0;
- PossibleRedIter[J2] = i;
-
- int Idx = PossibleRedIdx[J1];
- assert(Idx == PossibleRedIdx[J2] &&
- "Recording pair from different reductions?");
- Reds.insert(Idx);
- }
- }
-
- // The functions below can be called after we've finished processing all
- // instructions in the loop, and we know which reductions were selected.
-
- bool validateSelected();
- void replaceSelected();
-
- protected:
- // The vector of all possible reductions (for any scale).
- SmallReductionVector PossibleReds;
-
- DenseMap<Instruction *, int> PossibleRedIdx;
- DenseMap<Instruction *, int> PossibleRedIter;
- DenseSet<int> Reds;
- };
-
- // A DAGRootSet models an induction variable being used in a rerollable
- // loop. For example,
- //
- // x[i*3+0] = y1
- // x[i*3+1] = y2
- // x[i*3+2] = y3
- //
- // Base instruction -> i*3
- // +---+----+
- // / | \
- // ST[y1] +1 +2 <-- Roots
- // | |
- // ST[y2] ST[y3]
- //
- // There may be multiple DAGRoots, for example:
- //
- // x[i*2+0] = ... (1)
- // x[i*2+1] = ... (1)
- // x[i*2+4] = ... (2)
- // x[i*2+5] = ... (2)
- // x[(i+1234)*2+5678] = ... (3)
- // x[(i+1234)*2+5679] = ... (3)
- //
- // The loop will be rerolled by adding a new loop induction variable,
- // one for the Base instruction in each DAGRootSet.
- //
- struct DAGRootSet {
- Instruction *BaseInst;
- SmallInstructionVector Roots;
-
- // The instructions between IV and BaseInst (but not including BaseInst).
- SmallInstructionSet SubsumedInsts;
- };
-
- // The set of all DAG roots, and state tracking of all roots
- // for a particular induction variable.
- struct DAGRootTracker {
- DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
- ScalarEvolution *SE, AliasAnalysis *AA,
- TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
- bool PreserveLCSSA,
- DenseMap<Instruction *, int64_t> &IncrMap,
- Instruction *LoopCtrlIV)
- : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
- PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
- LoopControlIV(LoopCtrlIV) {}
-
- /// Stage 1: Find all the DAG roots for the induction variable.
- bool findRoots();
-
- /// Stage 2: Validate if the found roots are valid.
- bool validate(ReductionTracker &Reductions);
-
- /// Stage 3: Assuming validate() returned true, perform the
- /// replacement.
- /// @param BackedgeTakenCount The backedge-taken count of L.
- void replace(const SCEV *BackedgeTakenCount);
-
- protected:
- using UsesTy = MapVector<Instruction *, BitVector>;
-
- void findRootsRecursive(Instruction *IVU,
- SmallInstructionSet SubsumedInsts);
- bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
- bool collectPossibleRoots(Instruction *Base,
- std::map<int64_t,Instruction*> &Roots);
- bool validateRootSet(DAGRootSet &DRS);
-
- bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
- void collectInLoopUserSet(const SmallInstructionVector &Roots,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
- void collectInLoopUserSet(Instruction *Root,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
-
- UsesTy::iterator nextInstr(int Val, UsesTy &In,
- const SmallInstructionSet &Exclude,
- UsesTy::iterator *StartI=nullptr);
- bool isBaseInst(Instruction *I);
- bool isRootInst(Instruction *I);
- bool instrDependsOn(Instruction *I,
- UsesTy::iterator Start,
- UsesTy::iterator End);
- void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
-
- LoopReroll *Parent;
-
- // Members of Parent, replicated here for brevity.
- Loop *L;
- ScalarEvolution *SE;
- AliasAnalysis *AA;
- TargetLibraryInfo *TLI;
- DominatorTree *DT;
- LoopInfo *LI;
- bool PreserveLCSSA;
-
- // The loop induction variable.
- Instruction *IV;
-
- // Loop step amount.
- int64_t Inc;
-
- // Loop reroll count; if Inc == 1, this records the scaling applied
- // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
- // If Inc is not 1, Scale = Inc.
- uint64_t Scale;
-
- // The roots themselves.
- SmallVector<DAGRootSet,16> RootSets;
-
- // All increment instructions for IV.
- SmallInstructionVector LoopIncs;
-
- // Map of all instructions in the loop (in order) to the iterations
- // they are used in (or specially, IL_All for instructions
- // used in the loop increment mechanism).
- UsesTy Uses;
-
- // Map between induction variable and its increment
- DenseMap<Instruction *, int64_t> &IVToIncMap;
-
- Instruction *LoopControlIV;
- };
-
- // Check if it is a compare-like instruction whose user is a branch
- bool isCompareUsedByBranch(Instruction *I) {
- auto *TI = I->getParent()->getTerminator();
- if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
- return false;
- return I->hasOneUse() && TI->getOperand(0) == I;
- };
-
- bool isLoopControlIV(Loop *L, Instruction *IV);
- void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
- void collectPossibleReductions(Loop *L,
- ReductionTracker &Reductions);
- bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
- };
-
-} // end anonymous namespace
-
+ protected:
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+
+ using SmallInstructionVector = SmallVector<Instruction *, 16>;
+ using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
+
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> IVToIncMap;
+
+ // For loops with multiple induction variables, remember the one used only to
+ // control the loop.
+ Instruction *LoopControlIV;
+
+ // A chain of isomorphic instructions, identified by a single-use PHI
+ // representing a reduction. Only the last value may be used outside the
+ // loop.
+ struct SimpleLoopReduction {
+ SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
+ assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+ add(L);
+ }
+
+ bool valid() const {
+ return Valid;
+ }
+
+ Instruction *getPHI() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.front();
+ }
+
+ Instruction *getReducedValue() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.back();
+ }
+
+ Instruction *get(size_t i) const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions[i+1];
+ }
+
+ Instruction *operator [] (size_t i) const { return get(i); }
+
+ // The size, ignoring the initial PHI.
+ size_t size() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.size()-1;
+ }
+
+ using iterator = SmallInstructionVector::iterator;
+ using const_iterator = SmallInstructionVector::const_iterator;
+
+ iterator begin() {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ const_iterator begin() const {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ iterator end() { return Instructions.end(); }
+ const_iterator end() const { return Instructions.end(); }
+
+ protected:
+ bool Valid = false;
+ SmallInstructionVector Instructions;
+
+ void add(Loop *L);
+ };
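+
+ // Illustrative sketch (not from the original source; value names are
+ // invented): for a source loop such as
+ //   for (int i = 0; i < n; i += 2) { s += a[i]; s += a[i+1]; }
+ // the chain accepted by SimpleLoopReduction is the single-use PHI followed
+ // by the two adds:
+ //   %s   = phi i32 [ 0, %ph ], [ %s.2, %body ]
+ //   %s.1 = add i32 %s,   %a.i    ; single use: %s.2
+ //   %s.2 = add i32 %s.1, %a.i1   ; used by the PHI and outside the loop
+ // Instructions == { %s, %s.1, %s.2 } and getReducedValue() returns %s.2.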
+
+ // The set of all reductions, and state tracking of possible reductions
+ // during loop instruction processing.
+ struct ReductionTracker {
+ using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
+
+ // Add a new possible reduction.
+ void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
+
+ // Set up to track possible reductions corresponding to the provided
+ // rerolling scale. Only reductions with a number of non-PHI instructions
+ // that is divisible by the scale are considered. Three instruction sets
+ // are filled in:
+ // - A set of all possible instructions in eligible reductions.
+ // - A set of all PHIs in eligible reductions.
+ // - A set of all reduced values (last instructions) in eligible
+ // reductions.
+ void restrictToScale(uint64_t Scale,
+ SmallInstructionSet &PossibleRedSet,
+ SmallInstructionSet &PossibleRedPHISet,
+ SmallInstructionSet &PossibleRedLastSet) {
+ PossibleRedIdx.clear();
+ PossibleRedIter.clear();
+ Reds.clear();
+
+ for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+ if (PossibleReds[i].size() % Scale == 0) {
+ PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+ PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+
+ PossibleRedSet.insert(PossibleReds[i].getPHI());
+ PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+ for (Instruction *J : PossibleReds[i]) {
+ PossibleRedSet.insert(J);
+ PossibleRedIdx[J] = i;
+ }
+ }
+ }
+
+ // The functions below are used while processing the loop instructions.
+
+ // Are the two instructions both from reductions, and furthermore, from
+ // the same reduction?
+ bool isPairInSame(Instruction *J1, Instruction *J2) {
+ DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+ if (J1I != PossibleRedIdx.end()) {
+ DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+ if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+ return true;
+ }
+
+ return false;
+ }
+
+ // The two provided instructions, the first from the base iteration, and
+ // the second from iteration i, form a matched pair. If these are part of
+ // a reduction, record that fact.
+ void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+ if (PossibleRedIdx.count(J1)) {
+ assert(PossibleRedIdx.count(J2) &&
+ "Recording reduction vs. non-reduction instruction?");
+
+ PossibleRedIter[J1] = 0;
+ PossibleRedIter[J2] = i;
+
+ int Idx = PossibleRedIdx[J1];
+ assert(Idx == PossibleRedIdx[J2] &&
+ "Recording pair from different reductions?");
+ Reds.insert(Idx);
+ }
+ }
+
+ // The functions below can be called after we've finished processing all
+ // instructions in the loop, and we know which reductions were selected.
+
+ bool validateSelected();
+ void replaceSelected();
+
+ protected:
+ // The vector of all possible reductions (for any scale).
+ SmallReductionVector PossibleReds;
+
+ DenseMap<Instruction *, int> PossibleRedIdx;
+ DenseMap<Instruction *, int> PossibleRedIter;
+ DenseSet<int> Reds;
+ };
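+
+ // Worked example (illustrative, continuing the hypothetical chain sketched
+ // above): with a reduction of two non-PHI adds, restrictToScale(2, ...)
+ // keeps it because 2 % 2 == 0, putting the PHI into PossibleRedPHISet, the
+ // final add into PossibleRedLastSet, and all three instructions into
+ // PossibleRedSet with PossibleRedIdx pointing at this reduction;
+ // restrictToScale(3, ...) would drop the same chain because 2 % 3 != 0.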
+
+ // A DAGRootSet models an induction variable being used in a rerollable
+ // loop. For example,
+ //
+ // x[i*3+0] = y1
+ // x[i*3+1] = y2
+ // x[i*3+2] = y3
+ //
+ //   Base instruction -> i*3
+ //                  +---+----+
+ //                 /    |     \
+ //             ST[y1]  +1     +2  <-- Roots
+ //                      |      |
+ //                    ST[y2] ST[y3]
+ //
+ // There may be multiple DAGRoots, for example:
+ //
+ // x[i*2+0] = ... (1)
+ // x[i*2+1] = ... (1)
+ // x[i*2+4] = ... (2)
+ // x[i*2+5] = ... (2)
+ // x[(i+1234)*2+5678] = ... (3)
+ // x[(i+1234)*2+5679] = ... (3)
+ //
+ // The loop will be rerolled by adding a new loop induction variable,
+ // one for the Base instruction in each DAGRootSet.
+ //
+ struct DAGRootSet {
+ Instruction *BaseInst;
+ SmallInstructionVector Roots;
+
+ // The instructions between IV and BaseInst (but not including BaseInst).
+ SmallInstructionSet SubsumedInsts;
+ };
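+
+ // Illustrative sketch of the transformation a DAGRootSet models
+ // (hypothetical C source; foo is assumed to have no side effects):
+ //
+ //   for (int i = 0; i < 300; i += 3) {   // before rerolling
+ //     x[i + 0] = foo(0);
+ //     x[i + 1] = foo(0);
+ //     x[i + 2] = foo(0);
+ //   }
+ //
+ //   for (int i = 0; i < 300; ++i)        // after rerolling
+ //     x[i] = foo(0);
+ //
+ // The x[i + 0] address computation acts as the BaseInst, and the +1 and +2
+ // computations are the Roots of a single DAGRootSet.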
+
+ // The set of all DAG roots, and state tracking of all roots
+ // for a particular induction variable.
+ struct DAGRootTracker {
+ DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+ ScalarEvolution *SE, AliasAnalysis *AA,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA,
+ DenseMap<Instruction *, int64_t> &IncrMap,
+ Instruction *LoopCtrlIV)
+ : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
+ PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
+ LoopControlIV(LoopCtrlIV) {}
+
+ /// Stage 1: Find all the DAG roots for the induction variable.
+ bool findRoots();
+
+ /// Stage 2: Validate if the found roots are valid.
+ bool validate(ReductionTracker &Reductions);
+
+ /// Stage 3: Assuming validate() returned true, perform the
+ /// replacement.
+ /// @param BackedgeTakenCount The backedge-taken count of L.
+ void replace(const SCEV *BackedgeTakenCount);
+
+ protected:
+ using UsesTy = MapVector<Instruction *, BitVector>;
+
+ void findRootsRecursive(Instruction *IVU,
+ SmallInstructionSet SubsumedInsts);
+ bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+ bool collectPossibleRoots(Instruction *Base,
+ std::map<int64_t,Instruction*> &Roots);
+ bool validateRootSet(DAGRootSet &DRS);
+
+ bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+ void collectInLoopUserSet(const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+ void collectInLoopUserSet(Instruction *Root,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+
+ UsesTy::iterator nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI=nullptr);
+ bool isBaseInst(Instruction *I);
+ bool isRootInst(Instruction *I);
+ bool instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End);
+ void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
+
+ LoopReroll *Parent;
+
+ // Members of Parent, replicated here for brevity.
+ Loop *L;
+ ScalarEvolution *SE;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ bool PreserveLCSSA;
+
+ // The loop induction variable.
+ Instruction *IV;
+
+ // Loop step amount.
+ int64_t Inc;
+
+ // Loop reroll count; if Inc == 1, this records the scaling applied
+ // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+ // If Inc is not 1, Scale = Inc.
+ uint64_t Scale;
+
+ // The roots themselves.
+ SmallVector<DAGRootSet,16> RootSets;
+
+ // All increment instructions for IV.
+ SmallInstructionVector LoopIncs;
+
+ // Map of all instructions in the loop (in order) to the iterations
+ // they are used in (or specially, IL_All for instructions
+ // used in the loop increment mechanism).
+ UsesTy Uses;
+
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> &IVToIncMap;
+
+ Instruction *LoopControlIV;
+ };
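+
+ // Minimal usage sketch (hypothetical; the arguments mirror the members
+ // declared above, and the real sequencing lives behind the reroll
+ // declaration below):
+ //   DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI,
+ //                           PreserveLCSSA, IVToIncMap, LoopControlIV);
+ //   if (!DAGRoots.findRoots()) return false;           // stage 1
+ //   if (!DAGRoots.validate(Reductions)) return false;  // stage 2
+ //   DAGRoots.replace(BackedgeTakenCount);              // stage 3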
+
+ // Check if it is a compare-like instruction whose user is a branch
+ bool isCompareUsedByBranch(Instruction *I) {
+ auto *TI = I->getParent()->getTerminator();
+ if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
+ return false;
+ return I->hasOneUse() && TI->getOperand(0) == I;
+ };
+
+ bool isLoopControlIV(Loop *L, Instruction *IV);
+ void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+ void collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
+ };
+
+} // end anonymous namespace
+
char LoopRerollLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(LoopRerollLegacyPass, "loop-reroll", "Reroll loops",
false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(LoopRerollLegacyPass, "loop-reroll", "Reroll loops", false,
false)
-
+
Pass *llvm::createLoopRerollPass() { return new LoopRerollLegacyPass; }
-
-// Returns true if the provided instruction is used outside the given loop.
-// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
-// non-loop blocks to be outside the loop.
-static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
- for (User *U : I->users()) {
- if (!L->contains(cast<Instruction>(U)))
- return true;
- }
- return false;
-}
-
-// Check if an IV is only used to control the loop. There are two cases:
-// 1. It has a single use, which is the loop increment; the increment is used
-// only by the comparison and the PHI (possibly with a sext with nsw in
-// between), and the comparison is used only by the branch.
-// 2. It is used by the loop increment and the comparison; the loop increment
-// is used only by the PHI, and the comparison is used only by the branch.
-bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
- unsigned IVUses = IV->getNumUses();
- if (IVUses != 2 && IVUses != 1)
- return false;
-
- for (auto *User : IV->users()) {
- int32_t IncOrCmpUses = User->getNumUses();
- bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
-
- // User can only have one or two uses.
- if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
- return false;
-
- // Case 1
- if (IVUses == 1) {
- // The only user must be the loop increment.
- // The loop increment must have two uses.
- if (IsCompInst || IncOrCmpUses != 2)
- return false;
- }
-
- // Case 2
- if (IVUses == 2 && IncOrCmpUses != 1)
- return false;
-
- // Each user of the IV must be either a binary operation or a comparison.
- if (auto *BO = dyn_cast<BinaryOperator>(User)) {
- if (BO->getOpcode() == Instruction::Add) {
- // Loop Increment
- // User of Loop Increment should be either PHI or CMP
- for (auto *UU : User->users()) {
- if (PHINode *PN = dyn_cast<PHINode>(UU)) {
- if (PN != IV)
- return false;
- }
- // Must be a CMP or an ext (of a value with nsw) then CMP
- else {
- Instruction *UUser = dyn_cast<Instruction>(UU);
- // Skip SExt if we are extending an nsw value
- // TODO: Allow ZExt too
- if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
- isa<SExtInst>(UUser))
- UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
- if (!isCompareUsedByBranch(UUser))
- return false;
- }
- }
- } else
- return false;
- // Compare : can only have one use, and must be branch
- } else if (!IsCompInst)
- return false;
- }
- return true;
-}
-
-// Collect the list of loop induction variables with respect to which it might
-// be possible to reroll the loop.
-void LoopReroll::collectPossibleIVs(Loop *L,
- SmallInstructionVector &PossibleIVs) {
- BasicBlock *Header = L->getHeader();
- for (BasicBlock::iterator I = Header->begin(),
- IE = Header->getFirstInsertionPt(); I != IE; ++I) {
- if (!isa<PHINode>(I))
- continue;
- if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
- continue;
-
- if (const SCEVAddRecExpr *PHISCEV =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
- if (PHISCEV->getLoop() != L)
- continue;
- if (!PHISCEV->isAffine())
- continue;
- auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
- if (IncSCEV) {
- IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
- LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
- << "\n");
-
- if (isLoopControlIV(L, &*I)) {
- assert(!LoopControlIV && "Found two loop control only IV");
- LoopControlIV = &(*I);
- LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
- << " = " << *PHISCEV << "\n");
- } else
- PossibleIVs.push_back(&*I);
- }
- }
- }
-}
-
-// Add the remainder of the reduction-variable chain to the instruction vector
-// (the initial PHINode has already been added). If successful, the object is
-// marked as valid.
-void LoopReroll::SimpleLoopReduction::add(Loop *L) {
- assert(!Valid && "Cannot add to an already-valid chain");
-
- // The reduction variable must be a chain of single-use instructions
- // (including the PHI), except for the last value (which is used by the PHI
- // and also outside the loop).
- Instruction *C = Instructions.front();
- if (C->user_empty())
- return;
-
- do {
- C = cast<Instruction>(*C->user_begin());
- if (C->hasOneUse()) {
- if (!C->isBinaryOp())
- return;
-
- if (!(isa<PHINode>(Instructions.back()) ||
- C->isSameOperationAs(Instructions.back())))
- return;
-
- Instructions.push_back(C);
- }
- } while (C->hasOneUse());
-
- if (Instructions.size() < 2 ||
- !C->isSameOperationAs(Instructions.back()) ||
- C->use_empty())
- return;
-
- // C is now the (potential) last instruction in the reduction chain.
- for (User *U : C->users()) {
- // The only in-loop user can be the initial PHI.
- if (L->contains(cast<Instruction>(U)))
- if (cast<Instruction>(U) != Instructions.front())
- return;
- }
-
- Instructions.push_back(C);
- Valid = true;
-}
-
-// Collect the vector of possible reduction variables.
-void LoopReroll::collectPossibleReductions(Loop *L,
- ReductionTracker &Reductions) {
- BasicBlock *Header = L->getHeader();
- for (BasicBlock::iterator I = Header->begin(),
- IE = Header->getFirstInsertionPt(); I != IE; ++I) {
- if (!isa<PHINode>(I))
- continue;
- if (!I->getType()->isSingleValueType())
- continue;
-
- SimpleLoopReduction SLR(&*I, L);
- if (!SLR.valid())
- continue;
-
- LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
- << SLR.size() << " chained instructions)\n");
- Reductions.addSLR(SLR);
- }
-}
-
-// Collect the set of all users of the provided root instruction. This set of
-// users contains not only the direct users of the root instruction, but also
-// all users of those users, and so on. There are two exceptions:
-//
-// 1. Instructions in the set of excluded instructions are never added to the
-// use set (even if they are users). This is used, for example, to keep the
-// root increments out of the use set of the primary IV.
-//
-// 2. Instructions in the set of final instructions are added to the use set
-// if they are users, but their users are not added. This is used, for
-// example, to prevent a reduction update from forcing all later reduction
-// updates into the use set.
-void LoopReroll::DAGRootTracker::collectInLoopUserSet(
- Instruction *Root, const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users) {
- SmallInstructionVector Queue(1, Root);
- while (!Queue.empty()) {
- Instruction *I = Queue.pop_back_val();
- if (!Users.insert(I).second)
- continue;
-
- if (!Final.count(I))
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- if (PHINode *PN = dyn_cast<PHINode>(User)) {
- // Ignore "wrap-around" uses to PHIs of this loop's header.
- if (PN->getIncomingBlock(U) == L->getHeader())
- continue;
- }
-
- if (L->contains(User) && !Exclude.count(User)) {
- Queue.push_back(User);
- }
- }
-
- // We also want to collect single-user "feeder" values.
- for (User::op_iterator OI = I->op_begin(),
- OIE = I->op_end(); OI != OIE; ++OI) {
- if (Instruction *Op = dyn_cast<Instruction>(*OI))
- if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
- !Final.count(Op))
- Queue.push_back(Op);
- }
- }
-}
-
-// Collect all of the users of all of the provided root instructions (combined
-// into a single set).
-void LoopReroll::DAGRootTracker::collectInLoopUserSet(
- const SmallInstructionVector &Roots,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users) {
- for (Instruction *Root : Roots)
- collectInLoopUserSet(Root, Exclude, Final, Users);
-}
-
-static bool isUnorderedLoadStore(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isUnordered();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isUnordered();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return !MI->isVolatile();
- return false;
-}
-
-/// Return true if IVU is a "simple" arithmetic operation.
-/// This is used for narrowing the search space for DAGRoots; only arithmetic
-/// and GEPs can be part of a DAGRoot.
-static bool isSimpleArithmeticOp(User *IVU) {
- if (Instruction *I = dyn_cast<Instruction>(IVU)) {
- switch (I->getOpcode()) {
- default: return false;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::Shl:
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::GetElementPtr:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- return true;
- }
- }
- return false;
-}
-
-static bool isLoopIncrement(User *U, Instruction *IV) {
- BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
-
- if ((BO && BO->getOpcode() != Instruction::Add) ||
- (!BO && !isa<GetElementPtrInst>(U)))
- return false;
-
- for (auto *UU : U->users()) {
- PHINode *PN = dyn_cast<PHINode>(UU);
- if (PN && PN == IV)
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::
-collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
- SmallInstructionVector BaseUsers;
-
- for (auto *I : Base->users()) {
- ConstantInt *CI = nullptr;
-
- if (isLoopIncrement(I, IV)) {
- LoopIncs.push_back(cast<Instruction>(I));
- continue;
- }
-
- // The root nodes must be either GEPs, ORs or ADDs.
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- if (BO->getOpcode() == Instruction::Add ||
- BO->getOpcode() == Instruction::Or)
- CI = dyn_cast<ConstantInt>(BO->getOperand(1));
- } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
- CI = dyn_cast<ConstantInt>(LastOperand);
- }
-
- if (!CI) {
- if (Instruction *II = dyn_cast<Instruction>(I)) {
- BaseUsers.push_back(II);
- continue;
- } else {
- LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
- << "\n");
- return false;
- }
- }
-
- int64_t V = std::abs(CI->getValue().getSExtValue());
- if (Roots.find(V) != Roots.end())
- // No duplicates, please.
- return false;
-
- Roots[V] = cast<Instruction>(I);
- }
-
- // Make sure we have at least two roots.
- if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
- return false;
-
- // If we found non-loop-inc, non-root users of Base, assume they are
- // for the zeroth root index. This is because "add %a, 0" gets optimized
- // away.
- if (BaseUsers.size()) {
- if (Roots.find(0) != Roots.end()) {
- LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
- return false;
- }
- Roots[0] = Base;
- }
-
- // Calculate the number of users of the base, or lowest indexed, iteration.
- unsigned NumBaseUses = BaseUsers.size();
- if (NumBaseUses == 0)
- NumBaseUses = Roots.begin()->second->getNumUses();
-
- // Check that every node has the same number of users.
- for (auto &KV : Roots) {
- if (KV.first == 0)
- continue;
- if (!KV.second->hasNUses(NumBaseUses)) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
- << "#Base=" << NumBaseUses
- << ", #Root=" << KV.second->getNumUses() << "\n");
- return false;
- }
- }
-
- return true;
-}
-
-void LoopReroll::DAGRootTracker::
-findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
- // Does the user look like it could be part of a root set?
- // All its users must be simple arithmetic ops.
- if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
- return;
-
- if (I != IV && findRootsBase(I, SubsumedInsts))
- return;
-
- SubsumedInsts.insert(I);
-
- for (User *V : I->users()) {
- Instruction *I = cast<Instruction>(V);
- if (is_contained(LoopIncs, I))
- continue;
-
- if (!isSimpleArithmeticOp(I))
- continue;
-
- // The recursive call makes a copy of SubsumedInsts.
- findRootsRecursive(I, SubsumedInsts);
- }
-}
-
-bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
- if (DRS.Roots.empty())
- return false;
-
- // If the value of the base instruction is used outside the loop, we cannot
- // reroll the loop. Checking the other root instructions is unnecessary
- // because they do not match any base instruction if their values are used
- // outside the loop.
- if (hasUsesOutsideLoop(DRS.BaseInst, L))
- return false;
-
- // Consider a DAGRootSet with N-1 roots (so N different values including
- // BaseInst).
- // Define d = Roots[0] - BaseInst, which should be the same as
- // Roots[I] - Roots[I-1] for all I in [1..N).
- // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
- // loop iteration J.
- //
- // Now, for the loop iterations to be consecutive:
- // D = d * N
- const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
- if (!ADR)
- return false;
-
- // Check that the first root is evenly spaced.
- unsigned N = DRS.Roots.size() + 1;
- const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
- const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
- if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
- return false;
-
- // Check that the remaining roots are evenly spaced.
- for (unsigned i = 1; i < N - 1; ++i) {
- const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
- SE->getSCEV(DRS.Roots[i-1]));
- if (NewStepSCEV != StepSCEV)
- return false;
- }
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::
-findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
- // The base of a RootSet must be an AddRec, so it can be erased.
- const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
- if (!IVU_ADR || IVU_ADR->getLoop() != L)
- return false;
-
- std::map<int64_t, Instruction*> V;
- if (!collectPossibleRoots(IVU, V))
- return false;
-
- // If we didn't get a root for index zero, then IVU must be
- // subsumed.
- if (V.find(0) == V.end())
- SubsumedInsts.insert(IVU);
-
- // Partition the vector into monotonically increasing indexes.
- DAGRootSet DRS;
- DRS.BaseInst = nullptr;
-
- SmallVector<DAGRootSet, 16> PotentialRootSets;
-
- for (auto &KV : V) {
- if (!DRS.BaseInst) {
- DRS.BaseInst = KV.second;
- DRS.SubsumedInsts = SubsumedInsts;
- } else if (DRS.Roots.empty()) {
- DRS.Roots.push_back(KV.second);
- } else if (V.find(KV.first - 1) != V.end()) {
- DRS.Roots.push_back(KV.second);
- } else {
- // Linear sequence terminated.
- if (!validateRootSet(DRS))
- return false;
-
- // Construct a new DAGRootSet with the next sequence.
- PotentialRootSets.push_back(DRS);
- DRS.BaseInst = KV.second;
- DRS.Roots.clear();
- }
- }
-
- if (!validateRootSet(DRS))
- return false;
-
- PotentialRootSets.push_back(DRS);
-
- RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::findRoots() {
- Inc = IVToIncMap[IV];
-
- assert(RootSets.empty() && "Unclean state!");
- if (std::abs(Inc) == 1) {
- for (auto *IVU : IV->users()) {
- if (isLoopIncrement(IVU, IV))
- LoopIncs.push_back(cast<Instruction>(IVU));
- }
- findRootsRecursive(IV, SmallInstructionSet());
- LoopIncs.push_back(IV);
- } else {
- if (!findRootsBase(IV, SmallInstructionSet()))
- return false;
- }
-
- // Ensure all sets have the same size.
- if (RootSets.empty()) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
- return false;
- }
- for (auto &V : RootSets) {
- if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
- LLVM_DEBUG(
- dbgs()
- << "LRR: Aborting because not all root sets have the same size\n");
- return false;
- }
- }
-
- Scale = RootSets[0].Roots.size() + 1;
-
- if (Scale > IL_MaxRerollIterations) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
- << "#Found=" << Scale
- << ", #Max=" << IL_MaxRerollIterations << "\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
- << "\n");
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
- // Populate the MapVector with all instructions in the block, in order first,
- // so we can iterate over the contents later in perfect order.
- for (auto &I : *L->getHeader()) {
- Uses[&I].resize(IL_End);
- }
-
- SmallInstructionSet Exclude;
- for (auto &DRS : RootSets) {
- Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
- Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
- Exclude.insert(DRS.BaseInst);
- }
- Exclude.insert(LoopIncs.begin(), LoopIncs.end());
-
- for (auto &DRS : RootSets) {
- DenseSet<Instruction*> VBase;
- collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
- for (auto *I : VBase) {
- Uses[I].set(0);
- }
-
- unsigned Idx = 1;
- for (auto *Root : DRS.Roots) {
- DenseSet<Instruction*> V;
- collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
-
- // While we're here, check the use sets are the same size.
- if (V.size() != VBase.size()) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
- return false;
- }
-
- for (auto *I : V) {
- Uses[I].set(Idx);
- }
- ++Idx;
- }
-
- // Make sure our subsumed instructions are remembered too.
- for (auto *I : DRS.SubsumedInsts) {
- Uses[I].set(IL_All);
- }
- }
-
- // Make sure the loop increments are also accounted for.
-
- Exclude.clear();
- for (auto &DRS : RootSets) {
- Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
- Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
- Exclude.insert(DRS.BaseInst);
- }
-
- DenseSet<Instruction*> V;
- collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
- for (auto *I : V) {
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+ for (User *U : I->users()) {
+ if (!L->contains(cast<Instruction>(U)))
+ return true;
+ }
+ return false;
+}
+
+// Check if an IV is only used to control the loop. There are two cases:
+// 1. It has a single use, which is the loop increment; the increment is used
+// only by the comparison and the PHI (possibly with a sext with nsw in
+// between), and the comparison is used only by the branch.
+// 2. It is used by the loop increment and the comparison; the loop increment
+// is used only by the PHI, and the comparison is used only by the branch.
+bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
+ unsigned IVUses = IV->getNumUses();
+ if (IVUses != 2 && IVUses != 1)
+ return false;
+
+ for (auto *User : IV->users()) {
+ int32_t IncOrCmpUses = User->getNumUses();
+ bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
+
+ // User can only have one or two uses.
+ if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Case 1
+ if (IVUses == 1) {
+ // The only user must be the loop increment.
+ // The loop increment must have two uses.
+ if (IsCompInst || IncOrCmpUses != 2)
+ return false;
+ }
+
+ // Case 2
+ if (IVUses == 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Each user of the IV must be either a binary operation or a comparison.
+ if (auto *BO = dyn_cast<BinaryOperator>(User)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Loop Increment
+ // User of Loop Increment should be either PHI or CMP
+ for (auto *UU : User->users()) {
+ if (PHINode *PN = dyn_cast<PHINode>(UU)) {
+ if (PN != IV)
+ return false;
+ }
+ // Must be a CMP or an ext (of a value with nsw) then CMP
+ else {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // Skip SExt if we are extending an nsw value
+ // TODO: Allow ZExt too
+ if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
+ isa<SExtInst>(UUser))
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ if (!isCompareUsedByBranch(UUser))
+ return false;
+ }
+ }
+ } else
+ return false;
+ // Compare : can only have one use, and must be branch
+ } else if (!IsCompInst)
+ return false;
+ }
+ return true;
+}
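+
+// Illustrative sketch of the two accepted shapes (hypothetical IR, value
+// names invented). Case 1: the IV's only user is the increment, which feeds
+// both the PHI and the exit compare:
+//   %iv      = phi i32 [ 0, %ph ], [ %iv.next, %body ]
+//   %iv.next = add nsw i32 %iv, 1
+//   %cmp     = icmp eq i32 %iv.next, %n
+//   br i1 %cmp, label %exit, label %body
+// Case 2: the IV itself feeds both the increment and the compare, i.e. the
+// compare is "%cmp = icmp eq i32 %iv, %n" instead. In both cases nothing
+// outside the loop-control chain uses the IV.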
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+ SmallInstructionVector &PossibleIVs) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
+ continue;
+
+ if (const SCEVAddRecExpr *PHISCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
+ if (PHISCEV->getLoop() != L)
+ continue;
+ if (!PHISCEV->isAffine())
+ continue;
+ auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ if (IncSCEV) {
+ IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
+ LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+ << "\n");
+
+ if (isLoopControlIV(L, &*I)) {
+ assert(!LoopControlIV && "Found two loop control only IV");
+ LoopControlIV = &(*I);
+ LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
+ << " = " << *PHISCEV << "\n");
+ } else
+ PossibleIVs.push_back(&*I);
+ }
+ }
+ }
+}
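+
+// Example of what qualifies (illustrative; names invented): a header PHI
+// whose SCEV is an affine AddRec for this loop with a constant step, e.g.
+//   %i = phi i64 [ 0, %preheader ], [ %i.next, %body ]   ; SCEV: {0,+,3}<%L>
+// is recorded with IVToIncMap[%i] == 3, whereas a PHI whose step is not a
+// constant (e.g. {0,+,%m}<%L>) is skipped.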
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+ assert(!Valid && "Cannot add to an already-valid chain");
+
+ // The reduction variable must be a chain of single-use instructions
+ // (including the PHI), except for the last value (which is used by the PHI
+ // and also outside the loop).
+ Instruction *C = Instructions.front();
+ if (C->user_empty())
+ return;
+
+ do {
+ C = cast<Instruction>(*C->user_begin());
+ if (C->hasOneUse()) {
+ if (!C->isBinaryOp())
+ return;
+
+ if (!(isa<PHINode>(Instructions.back()) ||
+ C->isSameOperationAs(Instructions.back())))
+ return;
+
+ Instructions.push_back(C);
+ }
+ } while (C->hasOneUse());
+
+ if (Instructions.size() < 2 ||
+ !C->isSameOperationAs(Instructions.back()) ||
+ C->use_empty())
+ return;
+
+ // C is now the (potential) last instruction in the reduction chain.
+ for (User *U : C->users()) {
+ // The only in-loop user can be the initial PHI.
+ if (L->contains(cast<Instruction>(U)))
+ if (cast<Instruction>(U) != Instructions.front())
+ return;
+ }
+
+ Instructions.push_back(C);
+ Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isSingleValueType())
+ continue;
+
+ SimpleLoopReduction SLR(&*I, L);
+ if (!SLR.valid())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
+ << SLR.size() << " chained instructions)\n");
+ Reductions.addSLR(SLR);
+ }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+// 1. Instructions in the set of excluded instructions are never added to the
+// use set (even if they are users). This is used, for example, to keep the
+// root increments out of the use set of the primary IV.
+//
+// 2. Instructions in the set of final instructions are added to the use set
+// if they are users, but their users are not added. This is used, for
+// example, to prevent a reduction update from forcing all later reduction
+// updates into the use set.
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ Instruction *Root, const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ SmallInstructionVector Queue(1, Root);
+ while (!Queue.empty()) {
+ Instruction *I = Queue.pop_back_val();
+ if (!Users.insert(I).second)
+ continue;
+
+ if (!Final.count(I))
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ // Ignore "wrap-around" uses to PHIs of this loop's header.
+ if (PN->getIncomingBlock(U) == L->getHeader())
+ continue;
+ }
+
+ if (L->contains(User) && !Exclude.count(User)) {
+ Queue.push_back(User);
+ }
+ }
+
+ // We also want to collect single-user "feeder" values.
+ for (User::op_iterator OI = I->op_begin(),
+ OIE = I->op_end(); OI != OIE; ++OI) {
+ if (Instruction *Op = dyn_cast<Instruction>(*OI))
+ if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+ !Final.count(Op))
+ Queue.push_back(Op);
+ }
+ }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ for (Instruction *Root : Roots)
+ collectInLoopUserSet(Root, Exclude, Final, Users);
+}
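+
+// Worked example (illustrative; value names invented):
+//   %g   = getelementptr i32, i32* %x, i64 %idx   ; the Root
+//   %ld  = load i32, i32* %g
+//   %red = add i32 %phi, %ld                      ; listed in Final
+// collectInLoopUserSet(%g, Exclude, Final, Users) yields { %g, %ld, %red }
+// (plus %idx when it is a single-use in-loop feeder); the users of %red are
+// not visited, and anything in Exclude is never added at all.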
+
+static bool isUnorderedLoadStore(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isUnordered();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return false;
+}
+
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This is used for narrowing the search space for DAGRoots; only arithmetic
+/// and GEPs can be part of a DAGRoot.
+static bool isSimpleArithmeticOp(User *IVU) {
+ if (Instruction *I = dyn_cast<Instruction>(IVU)) {
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::GetElementPtr:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isLoopIncrement(User *U, Instruction *IV) {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+
+ if ((BO && BO->getOpcode() != Instruction::Add) ||
+ (!BO && !isa<GetElementPtrInst>(U)))
+ return false;
+
+ for (auto *UU : U->users()) {
+ PHINode *PN = dyn_cast<PHINode>(UU);
+ if (PN && PN == IV)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+ SmallInstructionVector BaseUsers;
+
+ for (auto *I : Base->users()) {
+ ConstantInt *CI = nullptr;
+
+ if (isLoopIncrement(I, IV)) {
+ LoopIncs.push_back(cast<Instruction>(I));
+ continue;
+ }
+
+ // The root nodes must be either GEPs, ORs or ADDs.
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Or)
+ CI = dyn_cast<ConstantInt>(BO->getOperand(1));
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
+ CI = dyn_cast<ConstantInt>(LastOperand);
+ }
+
+ if (!CI) {
+ if (Instruction *II = dyn_cast<Instruction>(I)) {
+ BaseUsers.push_back(II);
+ continue;
+ } else {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
+ << "\n");
+ return false;
+ }
+ }
+
+ int64_t V = std::abs(CI->getValue().getSExtValue());
+ if (Roots.find(V) != Roots.end())
+ // No duplicates, please.
+ return false;
+
+ Roots[V] = cast<Instruction>(I);
+ }
+
+ // Make sure we have at least two roots.
+ if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
+ return false;
+
+ // If we found non-loop-inc, non-root users of Base, assume they are
+ // for the zeroth root index. This is because "add %a, 0" gets optimized
+ // away.
+ if (BaseUsers.size()) {
+ if (Roots.find(0) != Roots.end()) {
+ LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+ return false;
+ }
+ Roots[0] = Base;
+ }
+
+ // Calculate the number of users of the base, or lowest indexed, iteration.
+ unsigned NumBaseUses = BaseUsers.size();
+ if (NumBaseUses == 0)
+ NumBaseUses = Roots.begin()->second->getNumUses();
+
+ // Check that every node has the same number of users.
+ for (auto &KV : Roots) {
+ if (KV.first == 0)
+ continue;
+ if (!KV.second->hasNUses(NumBaseUses)) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses
+ << ", #Root=" << KV.second->getNumUses() << "\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+ // Does the user look like it could be part of a root set?
+ // All its users must be simple arithmetic ops.
+ if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
+ return;
+
+ if (I != IV && findRootsBase(I, SubsumedInsts))
+ return;
+
+ SubsumedInsts.insert(I);
+
+ for (User *V : I->users()) {
+ Instruction *I = cast<Instruction>(V);
+ if (is_contained(LoopIncs, I))
+ continue;
+
+ if (!isSimpleArithmeticOp(I))
+ continue;
+
+ // The recursive call makes a copy of SubsumedInsts.
+ findRootsRecursive(I, SubsumedInsts);
+ }
+}
+
+bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
+ if (DRS.Roots.empty())
+ return false;
+
+ // If the value of the base instruction is used outside the loop, we cannot
+ // reroll the loop. Checking the other root instructions is unnecessary
+ // because they do not match any base instruction if their values are used
+ // outside the loop.
+ if (hasUsesOutsideLoop(DRS.BaseInst, L))
+ return false;
+
+ // Consider a DAGRootSet with N-1 roots (so N different values including
+ // BaseInst).
+ // Define d = Roots[0] - BaseInst, which should be the same as
+ // Roots[I] - Roots[I-1] for all I in [1..N).
+ // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+ // loop iteration J.
+ //
+ // Now, for the loop iterations to be consecutive:
+ // D = d * N
+ const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ if (!ADR)
+ return false;
+
+ // Check that the first root is evenly spaced.
+ unsigned N = DRS.Roots.size() + 1;
+ const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
+ const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+ if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
+ return false;
+
+ // Check that the remaining roots are evenly spaced.
+ for (unsigned i = 1; i < N - 1; ++i) {
+ const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
+ SE->getSCEV(DRS.Roots[i-1]));
+ if (NewStepSCEV != StepSCEV)
+ return false;
+ }
+
+ return true;
+}
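+
+// Worked example (illustrative): for the x[i*3 + {0,1,2}] pattern described
+// above, BaseInst (i*3) has an AddRec step of 3, so D == 3; Roots[0] is one
+// past BaseInst, so d == 1; with one base and two roots, N == 3, and
+// D == d * N holds (3 == 1 * 3), so the set is accepted. A root one element
+// further out (i*3+4 instead of i*3+2) would break the even spacing and be
+// rejected by the per-root check above.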
+
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+ // The base of a RootSet must be an AddRec, so it can be erased.
+ const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
+ if (!IVU_ADR || IVU_ADR->getLoop() != L)
+ return false;
+
+ std::map<int64_t, Instruction*> V;
+ if (!collectPossibleRoots(IVU, V))
+ return false;
+
+ // If we didn't get a root for index zero, then IVU must be
+ // subsumed.
+ if (V.find(0) == V.end())
+ SubsumedInsts.insert(IVU);
+
+ // Partition the vector into monotonically increasing indexes.
+ DAGRootSet DRS;
+ DRS.BaseInst = nullptr;
+
+ SmallVector<DAGRootSet, 16> PotentialRootSets;
+
+ for (auto &KV : V) {
+ if (!DRS.BaseInst) {
+ DRS.BaseInst = KV.second;
+ DRS.SubsumedInsts = SubsumedInsts;
+ } else if (DRS.Roots.empty()) {
+ DRS.Roots.push_back(KV.second);
+ } else if (V.find(KV.first - 1) != V.end()) {
+ DRS.Roots.push_back(KV.second);
+ } else {
+ // Linear sequence terminated.
+ if (!validateRootSet(DRS))
+ return false;
+
+ // Construct a new DAGRootSet with the next sequence.
+ PotentialRootSets.push_back(DRS);
+ DRS.BaseInst = KV.second;
+ DRS.Roots.clear();
+ }
+ }
+
+ if (!validateRootSet(DRS))
+ return false;
+
+ PotentialRootSets.push_back(DRS);
+
+ RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::findRoots() {
+ Inc = IVToIncMap[IV];
+
+ assert(RootSets.empty() && "Unclean state!");
+ if (std::abs(Inc) == 1) {
+ for (auto *IVU : IV->users()) {
+ if (isLoopIncrement(IVU, IV))
+ LoopIncs.push_back(cast<Instruction>(IVU));
+ }
+ findRootsRecursive(IV, SmallInstructionSet());
+ LoopIncs.push_back(IV);
+ } else {
+ if (!findRootsBase(IV, SmallInstructionSet()))
+ return false;
+ }
+
+ // Ensure all sets have the same size.
+ if (RootSets.empty()) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ return false;
+ }
+ for (auto &V : RootSets) {
+ if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
+ return false;
+ }
+ }
+
+ Scale = RootSets[0].Roots.size() + 1;
+
+ if (Scale > IL_MaxRerollIterations) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale
+ << ", #Max=" << IL_MaxRerollIterations << "\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
+ << "\n");
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+ // Populate the MapVector with all instructions in the block, in order first,
+ // so we can iterate over the contents later in perfect order.
+ for (auto &I : *L->getHeader()) {
+ Uses[&I].resize(IL_End);
+ }
+
+ SmallInstructionSet Exclude;
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+ Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+ for (auto &DRS : RootSets) {
+ DenseSet<Instruction*> VBase;
+ collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
+ for (auto *I : VBase) {
+ Uses[I].set(0);
+ }
+
+ unsigned Idx = 1;
+ for (auto *Root : DRS.Roots) {
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+ // While we're here, check the use sets are the same size.
+ if (V.size() != VBase.size()) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ return false;
+ }
+
+ for (auto *I : V) {
+ Uses[I].set(Idx);
+ }
+ ++Idx;
+ }
+
+ // Make sure our subsumed instructions are remembered too.
+ for (auto *I : DRS.SubsumedInsts) {
+ Uses[I].set(IL_All);
+ }
+ }
+
+ // Make sure the loop increments are also accounted for.
+
+ Exclude.clear();
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+ for (auto *I : V) {
if (I->mayHaveSideEffects()) {
LLVM_DEBUG(dbgs() << "LRR: Aborting - "
<< "An instruction which does not belong to any root "
<< "sets must not have side effects: " << *I);
return false;
}
- Uses[I].set(IL_All);
- }
-
- return true;
-}
-
-/// Get the next instruction in "In" that is a member of set Val.
-/// Start searching from StartI, and do not return anything in Exclude.
-/// If StartI is not given, start from In.begin().
-LoopReroll::DAGRootTracker::UsesTy::iterator
-LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
- const SmallInstructionSet &Exclude,
- UsesTy::iterator *StartI) {
- UsesTy::iterator I = StartI ? *StartI : In.begin();
- while (I != In.end() && (I->second.test(Val) == 0 ||
+ Uses[I].set(IL_All);
+ }
+
+ return true;
+}
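+
+// Illustrative example of the resulting Uses map (hypothetical, for the
+// x[i*3 + {0,1,2}] pattern with Scale == 3): instructions reachable only
+// from BaseInst get bit 0 set, those reachable from the +1 and +2 roots get
+// bits 1 and 2 respectively, and the loop increments and subsumed
+// instructions are marked IL_All, i.e. shared by every unrolled iteration.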
+
+/// Get the next instruction in "In" that is a member of set Val.
+/// Start searching from StartI, and do not return anything in Exclude.
+/// If StartI is not given, start from In.begin().
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI) {
+ UsesTy::iterator I = StartI ? *StartI : In.begin();
+ while (I != In.end() && (I->second.test(Val) == 0 ||
Exclude.contains(I->first)))
- ++I;
- return I;
-}
-
-bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
- for (auto &DRS : RootSets) {
- if (DRS.BaseInst == I)
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
- for (auto &DRS : RootSets) {
- if (is_contained(DRS.Roots, I))
- return true;
- }
- return false;
-}
-
-/// Return true if instruction I depends on any instruction between
-/// Start and End.
-bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
- UsesTy::iterator Start,
- UsesTy::iterator End) {
- for (auto *U : I->users()) {
- for (auto It = Start; It != End; ++It)
- if (U == It->first)
- return true;
- }
- return false;
-}
-
-static bool isIgnorableInst(const Instruction *I) {
- if (isa<DbgInfoIntrinsic>(I))
- return true;
- const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
- if (!II)
- return false;
- switch (II->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::annotation:
- case Intrinsic::ptr_annotation:
- case Intrinsic::var_annotation:
- // TODO: the following intrinsics may also be allowed:
- // lifetime_start, lifetime_end, invariant_start, invariant_end
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
- // We now need to check for equivalence of the use graph of each root with
- // that of the primary induction variable (excluding the roots). Our goal
- // here is not to solve the full graph isomorphism problem, but rather to
- // catch common cases without a lot of work. As a result, we will assume
- // that the relative order of the instructions in each unrolled iteration
- // is the same (although we will not make an assumption about how the
- // different iterations are intermixed). Note that while the order must be
- // the same, the instructions may not be in the same basic block.
-
- // An array of just the possible reductions for this scale factor. When we
- // collect the set of all users of some root instructions, these reduction
- // instructions are treated as 'final' (their uses are not considered).
- // This is important because we don't want the root use set to search down
- // the reduction chain.
- SmallInstructionSet PossibleRedSet;
- SmallInstructionSet PossibleRedLastSet;
- SmallInstructionSet PossibleRedPHISet;
- Reductions.restrictToScale(Scale, PossibleRedSet,
- PossibleRedPHISet, PossibleRedLastSet);
-
- // Populate "Uses" with where each instruction is used.
- if (!collectUsedInstructions(PossibleRedSet))
- return false;
-
- // Make sure we mark the reduction PHIs as used in all iterations.
- for (auto *I : PossibleRedPHISet) {
- Uses[I].set(IL_All);
- }
-
- // Make sure we mark loop-control-only PHIs as used in all iterations. See
- // comment above LoopReroll::isLoopControlIV for more information.
- BasicBlock *Header = L->getHeader();
- if (LoopControlIV && LoopControlIV != IV) {
- for (auto *U : LoopControlIV->users()) {
- Instruction *IVUser = dyn_cast<Instruction>(U);
- // IVUser could be loop increment or compare
- Uses[IVUser].set(IL_All);
- for (auto *UU : IVUser->users()) {
- Instruction *UUser = dyn_cast<Instruction>(UU);
- // UUser could be compare, PHI or branch
- Uses[UUser].set(IL_All);
- // Skip SExt
- if (isa<SExtInst>(UUser)) {
- UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
- Uses[UUser].set(IL_All);
- }
- // Is UUser a compare instruction?
- if (UU->hasOneUse()) {
- Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
- if (BI == cast<BranchInst>(Header->getTerminator()))
- Uses[BI].set(IL_All);
- }
- }
- }
- }
-
- // Make sure all instructions in the loop are in one and only one
- // set.
- for (auto &KV : Uses) {
- if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
- LLVM_DEBUG(
- dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
- << *KV.first << " (#uses=" << KV.second.count() << ")\n");
- return false;
- }
- }
-
- LLVM_DEBUG(for (auto &KV
- : Uses) {
- dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
- });
-
- for (unsigned Iter = 1; Iter < Scale; ++Iter) {
- // In addition to regular aliasing information, we need to look for
- // instructions from later (future) iterations that have side effects
- // preventing us from reordering them past other instructions with side
- // effects.
- bool FutureSideEffects = false;
- AliasSetTracker AST(*AA);
- // The map between instructions in f(%iv.(i+1)) and f(%iv).
- DenseMap<Value *, Value *> BaseMap;
-
- // Compare iteration Iter to the base.
- SmallInstructionSet Visited;
- auto BaseIt = nextInstr(0, Uses, Visited);
- auto RootIt = nextInstr(Iter, Uses, Visited);
- auto LastRootIt = Uses.begin();
-
- while (BaseIt != Uses.end() && RootIt != Uses.end()) {
- Instruction *BaseInst = BaseIt->first;
- Instruction *RootInst = RootIt->first;
-
- // Skip over the IV or root instructions; only match their users.
- bool Continue = false;
- if (isBaseInst(BaseInst)) {
- Visited.insert(BaseInst);
- BaseIt = nextInstr(0, Uses, Visited);
- Continue = true;
- }
- if (isRootInst(RootInst)) {
- LastRootIt = RootIt;
- Visited.insert(RootInst);
- RootIt = nextInstr(Iter, Uses, Visited);
- Continue = true;
- }
- if (Continue) continue;
-
- if (!BaseInst->isSameOperationAs(RootInst)) {
- // Last chance saloon. We don't try and solve the full isomorphism
- // problem, but try and at least catch the case where two instructions
- // *of different types* are round the wrong way. We won't be able to
- // efficiently tell, given two ADD instructions, which way around we
- // should match them, but given an ADD and a SUB, we can at least infer
- // which one is which.
- //
- // This should allow us to deal with a greater subset of the isomorphism
- // problem. It does however change a linear algorithm into a quadratic
- // one, so limit the number of probes we do.
- auto TryIt = RootIt;
- unsigned N = NumToleratedFailedMatches;
- while (TryIt != Uses.end() &&
- !BaseInst->isSameOperationAs(TryIt->first) &&
- N--) {
- ++TryIt;
- TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
- }
-
- if (TryIt == Uses.end() || TryIt == RootIt ||
- instrDependsOn(TryIt->first, RootIt, TryIt)) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
- << *BaseInst << " vs. " << *RootInst << "\n");
- return false;
- }
-
- RootIt = TryIt;
- RootInst = TryIt->first;
- }
-
- // All instructions between the last root and this root
- // may belong to some other iteration. If they belong to a
- // future iteration, then they're dangerous to alias with.
- //
- // Note that because we allow a limited amount of flexibility in the order
- // that we visit nodes, LastRootIt might be *before* RootIt, in which
- // case we've already checked this set of instructions so we shouldn't
- // do anything.
- for (; LastRootIt < RootIt; ++LastRootIt) {
- Instruction *I = LastRootIt->first;
- if (LastRootIt->second.find_first() < (int)Iter)
- continue;
- if (I->mayWriteToMemory())
- AST.add(I);
- // Note: This is specifically guarded by a check on isa<PHINode>,
- // which, while a valid (somewhat arbitrary) micro-optimization, is
- // needed because otherwise isSafeToSpeculativelyExecute returns
- // false on PHI nodes.
- if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
- !isSafeToSpeculativelyExecute(I))
- // Intervening instructions cause side effects.
- FutureSideEffects = true;
- }
-
- // Make sure that this instruction, which is in the use set of this
- // root instruction, does not also belong to the base set or the set of
- // some other root instruction.
- if (RootIt->second.count() > 1) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (prev. case overlap)\n");
- return false;
- }
-
- // Make sure that we don't alias with any instruction in the alias set
- // tracker. If we do, then we depend on a future iteration, and we
- // can't reroll.
- if (RootInst->mayReadFromMemory())
- for (auto &K : AST) {
- if (K.aliasesUnknownInst(RootInst, *AA)) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
- << *BaseInst << " vs. " << *RootInst
- << " (depends on future store)\n");
- return false;
- }
- }
-
- // If we've passed an instruction from a future iteration that may have
- // side effects, and this instruction might also, then we can't reorder
- // them, and this matching fails. As an exception, we allow the alias
- // set tracker to handle regular (unordered) load/store dependencies.
- if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
- !isSafeToSpeculativelyExecute(BaseInst)) ||
- (!isUnorderedLoadStore(RootInst) &&
- !isSafeToSpeculativelyExecute(RootInst)))) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst
- << " (side effects prevent reordering)\n");
- return false;
- }
-
- // For instructions that are part of a reduction, if the operation is
- // associative, then don't bother matching the operands (because we
- // already know that the instructions are isomorphic, and the order
- // within the iteration does not matter). For non-associative reductions,
- // we do need to match the operands, because we need to reject
- // out-of-order instructions within an iteration!
- // For example (assume floating-point addition), we need to reject this:
- // x += a[i]; x += b[i];
- // x += a[i+1]; x += b[i+1];
- // x += b[i+2]; x += a[i+2];
- bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
-
- if (!(InReduction && BaseInst->isAssociative())) {
- bool Swapped = false, SomeOpMatched = false;
- for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
- Value *Op2 = RootInst->getOperand(j);
-
- // If this is part of a reduction (and the operation is not
- // associative), then we match all operands, but not those that are
- // part of the reduction.
- if (InReduction)
- if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
- if (Reductions.isPairInSame(RootInst, Op2I))
- continue;
-
- DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
- if (BMI != BaseMap.end()) {
- Op2 = BMI->second;
- } else {
- for (auto &DRS : RootSets) {
- if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
- Op2 = DRS.BaseInst;
- break;
- }
- }
- }
-
- if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
- // If we've not already decided to swap the matched operands, and
- // we've not already matched our first operand (note that we could
- // have skipped matching the first operand because it is part of a
- // reduction above), and the instruction is commutative, then try
- // the swapped match.
- if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
- BaseInst->getOperand(!j) == Op2) {
- Swapped = true;
- } else {
- LLVM_DEBUG(dbgs()
- << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (operand " << j << ")\n");
- return false;
- }
- }
-
- SomeOpMatched = true;
- }
- }
-
- if ((!PossibleRedLastSet.count(BaseInst) &&
- hasUsesOutsideLoop(BaseInst, L)) ||
- (!PossibleRedLastSet.count(RootInst) &&
- hasUsesOutsideLoop(RootInst, L))) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (uses outside loop)\n");
- return false;
- }
-
- Reductions.recordPair(BaseInst, RootInst, Iter);
- BaseMap.insert(std::make_pair(RootInst, BaseInst));
-
- LastRootIt = RootIt;
- Visited.insert(BaseInst);
- Visited.insert(RootInst);
- BaseIt = nextInstr(0, Uses, Visited);
- RootIt = nextInstr(Iter, Uses, Visited);
- }
- assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
- "Mismatched set sizes!");
- }
-
- LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
- << "\n");
-
- return true;
-}
-
-void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
- BasicBlock *Header = L->getHeader();
-
- // Compute the start and increment for each BaseInst before we start erasing
- // instructions.
- SmallVector<const SCEV *, 8> StartExprs;
- SmallVector<const SCEV *, 8> IncrExprs;
- for (auto &DRS : RootSets) {
- const SCEVAddRecExpr *IVSCEV =
- cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
- StartExprs.push_back(IVSCEV->getStart());
- IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
- }
-
- // Remove instructions associated with non-base iterations.
- for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
- J != JE;) {
- unsigned I = Uses[&*J].find_first();
- if (I > 0 && I < IL_All) {
- LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
- J++->eraseFromParent();
- continue;
- }
-
- ++J;
- }
-
- // Rewrite each BaseInst using SCEV.
- for (size_t i = 0, e = RootSets.size(); i != e; ++i)
- // Insert the new induction variable.
- replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
-
- { // Limit the lifetime of SCEVExpander.
- BranchInst *BI = cast<BranchInst>(Header->getTerminator());
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "reroll");
- auto Zero = SE->getZero(BackedgeTakenCount->getType());
- auto One = SE->getOne(BackedgeTakenCount->getType());
- auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
- Value *NewIV =
- Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
- Header->getFirstNonPHIOrDbg());
- // FIXME: This arithmetic can overflow.
- auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
- auto ScaledTripCount = SE->getMulExpr(
- TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
- auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
- Value *TakenCount =
- Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
- Header->getFirstNonPHIOrDbg());
- Value *Cond =
- new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
- BI->setCondition(Cond);
-
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
- }
-
- SimplifyInstructionsInBlock(Header, TLI);
- DeleteDeadPHIs(Header, TLI);
-}
-
-void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
- const SCEV *Start,
- const SCEV *IncrExpr) {
- BasicBlock *Header = L->getHeader();
- Instruction *Inst = DRS.BaseInst;
-
- const SCEV *NewIVSCEV =
- SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
-
- { // Limit the lifetime of SCEVExpander.
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "reroll");
- Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
- Header->getFirstNonPHIOrDbg());
-
- for (auto &KV : Uses)
- if (KV.second.find_first() == 0)
- KV.first->replaceUsesOfWith(Inst, NewIV);
- }
-}
-
-// Validate the selected reductions. All iterations must have an isomorphic
-// part of the reduction chain and, for non-associative reductions, the chain
-// entries must appear in order.
-bool LoopReroll::ReductionTracker::validateSelected() {
- // For a non-associative reduction, the chain entries must appear in order.
- for (int i : Reds) {
- int PrevIter = 0, BaseCount = 0, Count = 0;
- for (Instruction *J : PossibleReds[i]) {
- // Note that all instructions in the chain must have been found because
- // all instructions in the function must have been assigned to some
- // iteration.
- int Iter = PossibleRedIter[J];
- if (Iter != PrevIter && Iter != PrevIter + 1 &&
- !PossibleReds[i].getReducedValue()->isAssociative()) {
- LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
- << J << "\n");
- return false;
- }
-
- if (Iter != PrevIter) {
- if (Count != BaseCount) {
- LLVM_DEBUG(dbgs()
- << "LRR: Iteration " << PrevIter << " reduction use count "
- << Count << " is not equal to the base use count "
- << BaseCount << "\n");
- return false;
- }
-
- Count = 0;
- }
-
- ++Count;
- if (Iter == 0)
- ++BaseCount;
-
- PrevIter = Iter;
- }
- }
-
- return true;
-}
-
-// For all selected reductions, remove all parts except those in the first
-// iteration (and the PHI). Replace outside uses of the reduced value with uses
-// of the first-iteration reduced value (in other words, reroll the selected
-// reductions).
-void LoopReroll::ReductionTracker::replaceSelected() {
- // Fix up reductions to refer to the last instruction associated with the
- // first iteration (not the last).
- for (int i : Reds) {
- int j = 0;
- for (int e = PossibleReds[i].size(); j != e; ++j)
- if (PossibleRedIter[PossibleReds[i][j]] != 0) {
- --j;
- break;
- }
-
- // Replace users with the new end-of-chain value.
- SmallInstructionVector Users;
- for (User *U : PossibleReds[i].getReducedValue()->users()) {
- Users.push_back(cast<Instruction>(U));
- }
-
- for (Instruction *User : Users)
- User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
- PossibleReds[i][j]);
- }
-}
-
-// Reroll the provided loop with respect to the provided induction variable.
-// Generally, we're looking for a loop like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1 <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2 <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
-// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
- // be intermixed with each other. The restriction imposed by this algorithm is
-// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
-// etc. be the same.
-//
-// First, we collect the use set of %iv, excluding the other increment roots.
-// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
-// times, having collected the use set of f(%iv.(i+1)), during which we:
-// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
-// the next unmatched instruction in f(%iv.(i+1)).
-// - Ensure that both matched instructions don't have any external users
-// (with the exception of last-in-chain reduction instructions).
-// - Track the (aliasing) write set, and other side effects, of all
-// instructions that belong to future iterations that come before the matched
-// instructions. If the matched instructions read from that write set, then
-// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
-// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
-// if any of these future instructions had side effects (could not be
- // speculatively executed), and so do the matched instructions, then we
-// cannot reorder those side-effect-producing instructions, and rerolling
-// fails.
-//
-// Finally, we make sure that all loop instructions are either loop increment
-// roots, belong to simple latch code, parts of validated reductions, part of
-// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
-// have been validated), then we reroll the loop.
-bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *BackedgeTakenCount,
- ReductionTracker &Reductions) {
- DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
- IVToIncMap, LoopControlIV);
-
- if (!DAGRoots.findRoots())
- return false;
- LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
- << "\n");
-
- if (!DAGRoots.validate(Reductions))
- return false;
- if (!Reductions.validateSelected())
- return false;
- // At this point, we've validated the rerolling, and we're committed to
- // making changes!
-
- Reductions.replaceSelected();
- DAGRoots.replace(BackedgeTakenCount);
-
- ++NumRerolledLoops;
- return true;
-}
-
+ ++I;
+ return I;
+}
+
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (DRS.BaseInst == I)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (is_contained(DRS.Roots, I))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if instruction I depends on any instruction between
+/// Start and End.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End) {
+ for (auto *U : I->users()) {
+ for (auto It = Start; It != End; ++It)
+ if (U == It->first)
+ return true;
+ }
+ return false;
+}
+
+static bool isIgnorableInst(const Instruction *I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ return true;
+ const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ case Intrinsic::var_annotation:
+ // TODO: the following intrinsics may also be allowed:
+ // lifetime_start, lifetime_end, invariant_start, invariant_end
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
+ // We now need to check for equivalence of the use graph of each root with
+ // that of the primary induction variable (excluding the roots). Our goal
+ // here is not to solve the full graph isomorphism problem, but rather to
+ // catch common cases without a lot of work. As a result, we will assume
+ // that the relative order of the instructions in each unrolled iteration
+ // is the same (although we will not make an assumption about how the
+ // different iterations are intermixed). Note that while the order must be
+ // the same, the instructions may not be in the same basic block.
+
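  // An illustrative sketch of that ordering assumption (hypothetical
  // operations, not taken from this diff), for a scale factor of 2: if the
  // base iteration performs "store a[i]; store b[i]" and the root iteration
  // performs "store a[i+1]; store b[i+1]", then the interleaving
  //   a[i], a[i+1], b[i], b[i+1]
  // still matches, because each iteration keeps its internal order, whereas
  //   a[i], b[i+1], a[i+1], b[i]
  // reverses the root iteration's order and is rejected by the matching
  // loop below.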
+ // An array of just the possible reductions for this scale factor. When we
+ // collect the set of all users of some root instructions, these reduction
+ // instructions are treated as 'final' (their uses are not considered).
+ // This is important because we don't want the root use set to search down
+ // the reduction chain.
+ SmallInstructionSet PossibleRedSet;
+ SmallInstructionSet PossibleRedLastSet;
+ SmallInstructionSet PossibleRedPHISet;
+ Reductions.restrictToScale(Scale, PossibleRedSet,
+ PossibleRedPHISet, PossibleRedLastSet);
+
+ // Populate "Uses" with where each instruction is used.
+ if (!collectUsedInstructions(PossibleRedSet))
+ return false;
+
+ // Make sure we mark the reduction PHIs as used in all iterations.
+ for (auto *I : PossibleRedPHISet) {
+ Uses[I].set(IL_All);
+ }
+
+ // Make sure we mark loop-control-only PHIs as used in all iterations. See
+ // comment above LoopReroll::isLoopControlIV for more information.
+ BasicBlock *Header = L->getHeader();
+ if (LoopControlIV && LoopControlIV != IV) {
+ for (auto *U : LoopControlIV->users()) {
+ Instruction *IVUser = dyn_cast<Instruction>(U);
+ // IVUser could be loop increment or compare
+ Uses[IVUser].set(IL_All);
+ for (auto *UU : IVUser->users()) {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // UUser could be compare, PHI or branch
+ Uses[UUser].set(IL_All);
+ // Skip SExt
+ if (isa<SExtInst>(UUser)) {
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ Uses[UUser].set(IL_All);
+ }
+ // Is UUser a compare instruction?
+ if (UU->hasOneUse()) {
+ Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
+ if (BI == cast<BranchInst>(Header->getTerminator()))
+ Uses[BI].set(IL_All);
+ }
+ }
+ }
+ }
+
+ // Make sure all instructions in the loop are in one and only one
+ // set.
+ for (auto &KV : Uses) {
+ if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
+ LLVM_DEBUG(
+ dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(for (auto &KV
+ : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ });
+
+ for (unsigned Iter = 1; Iter < Scale; ++Iter) {
+ // In addition to regular aliasing information, we need to look for
+ // instructions from later (future) iterations that have side effects
+ // preventing us from reordering them past other instructions with side
+ // effects.
+ bool FutureSideEffects = false;
+ AliasSetTracker AST(*AA);
+ // The map between instructions in f(%iv.(i+1)) and f(%iv).
+ DenseMap<Value *, Value *> BaseMap;
+
+ // Compare iteration Iter to the base.
+ SmallInstructionSet Visited;
+ auto BaseIt = nextInstr(0, Uses, Visited);
+ auto RootIt = nextInstr(Iter, Uses, Visited);
+ auto LastRootIt = Uses.begin();
+
+ while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+ Instruction *BaseInst = BaseIt->first;
+ Instruction *RootInst = RootIt->first;
+
+ // Skip over the IV or root instructions; only match their users.
+ bool Continue = false;
+ if (isBaseInst(BaseInst)) {
+ Visited.insert(BaseInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ Continue = true;
+ }
+ if (isRootInst(RootInst)) {
+ LastRootIt = RootIt;
+ Visited.insert(RootInst);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ Continue = true;
+ }
+ if (Continue) continue;
+
+ if (!BaseInst->isSameOperationAs(RootInst)) {
+ // Last chance saloon. We don't try and solve the full isomorphism
+ // problem, but try and at least catch the case where two instructions
+ // *of different types* are round the wrong way. We won't be able to
+ // efficiently tell, given two ADD instructions, which way around we
+ // should match them, but given an ADD and a SUB, we can at least infer
+ // which one is which.
+ //
+ // This should allow us to deal with a greater subset of the isomorphism
+ // problem. It does however change a linear algorithm into a quadratic
+ // one, so limit the number of probes we do.
+ auto TryIt = RootIt;
+ unsigned N = NumToleratedFailedMatches;
+ while (TryIt != Uses.end() &&
+ !BaseInst->isSameOperationAs(TryIt->first) &&
+ N--) {
+ ++TryIt;
+ TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
+ }
+
+ if (TryIt == Uses.end() || TryIt == RootIt ||
+ instrDependsOn(TryIt->first, RootIt, TryIt)) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst << "\n");
+ return false;
+ }
+
+ RootIt = TryIt;
+ RootInst = TryIt->first;
+ }
+
+ // All instructions between the last root and this root
+ // may belong to some other iteration. If they belong to a
+ // future iteration, then they're dangerous to alias with.
+ //
+ // Note that because we allow a limited amount of flexibility in the order
+ // that we visit nodes, LastRootIt might be *before* RootIt, in which
+ // case we've already checked this set of instructions so we shouldn't
+ // do anything.
+ for (; LastRootIt < RootIt; ++LastRootIt) {
+ Instruction *I = LastRootIt->first;
+ if (LastRootIt->second.find_first() < (int)Iter)
+ continue;
+ if (I->mayWriteToMemory())
+ AST.add(I);
+ // Note: This is specifically guarded by a check on isa<PHINode>,
+ // which while a valid (somewhat arbitrary) micro-optimization, is
+ // needed because otherwise isSafeToSpeculativelyExecute returns
+ // false on PHI nodes.
+ if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
+ !isSafeToSpeculativelyExecute(I))
+ // Intervening instructions cause side effects.
+ FutureSideEffects = true;
+ }
+
+ // Make sure that this instruction, which is in the use set of this
+ // root instruction, does not also belong to the base set or the set of
+ // some other root instruction.
+ if (RootIt->second.count() > 1) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (prev. case overlap)\n");
+ return false;
+ }
+
+ // Make sure that we don't alias with any instruction in the alias set
+ // tracker. If we do, then we depend on a future iteration, and we
+ // can't reroll.
+ if (RootInst->mayReadFromMemory())
+ for (auto &K : AST) {
+ if (K.aliasesUnknownInst(RootInst, *AA)) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst
+ << " (depends on future store)\n");
+ return false;
+ }
+ }
+
+ // If we've passed an instruction from a future iteration that may have
+ // side effects, and this instruction might also, then we can't reorder
+ // them, and this matching fails. As an exception, we allow the alias
+ // set tracker to handle regular (unordered) load/store dependencies.
+ if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst)) ||
+ (!isUnorderedLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst)))) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst
+ << " (side effects prevent reordering)\n");
+ return false;
+ }
+
+ // For instructions that are part of a reduction, if the operation is
+ // associative, then don't bother matching the operands (because we
+ // already know that the instructions are isomorphic, and the order
+ // within the iteration does not matter). For non-associative reductions,
+ // we do need to match the operands, because we need to reject
+ // out-of-order instructions within an iteration!
+ // For example (assume floating-point addition), we need to reject this:
+ // x += a[i]; x += b[i];
+ // x += a[i+1]; x += b[i+1];
+ // x += b[i+2]; x += a[i+2];
+ bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
+
+ if (!(InReduction && BaseInst->isAssociative())) {
+ bool Swapped = false, SomeOpMatched = false;
+ for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+ Value *Op2 = RootInst->getOperand(j);
+
+ // If this is part of a reduction (and the operation is not
+ // associative), then we match all operands, but not those that are
+ // part of the reduction.
+ if (InReduction)
+ if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+ if (Reductions.isPairInSame(RootInst, Op2I))
+ continue;
+
+ DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+ if (BMI != BaseMap.end()) {
+ Op2 = BMI->second;
+ } else {
+ for (auto &DRS : RootSets) {
+ if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+ Op2 = DRS.BaseInst;
+ break;
+ }
+ }
+ }
+
+ if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
+ if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+ BaseInst->getOperand(!j) == Op2) {
+ Swapped = true;
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
+ return false;
+ }
+ }
+
+ SomeOpMatched = true;
+ }
+ }
+
+ if ((!PossibleRedLastSet.count(BaseInst) &&
+ hasUsesOutsideLoop(BaseInst, L)) ||
+ (!PossibleRedLastSet.count(RootInst) &&
+ hasUsesOutsideLoop(RootInst, L))) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (uses outside loop)\n");
+ return false;
+ }
+
+ Reductions.recordPair(BaseInst, RootInst, Iter);
+ BaseMap.insert(std::make_pair(RootInst, BaseInst));
+
+ LastRootIt = RootIt;
+ Visited.insert(BaseInst);
+ Visited.insert(RootInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ }
+ assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
+ "Mismatched set sizes!");
+ }
+
+ LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
+ << "\n");
+
+ return true;
+}
+
+void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
+ BasicBlock *Header = L->getHeader();
+
+ // Compute the start and increment for each BaseInst before we start erasing
+ // instructions.
+ SmallVector<const SCEV *, 8> StartExprs;
+ SmallVector<const SCEV *, 8> IncrExprs;
+ for (auto &DRS : RootSets) {
+ const SCEVAddRecExpr *IVSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ StartExprs.push_back(IVSCEV->getStart());
+ IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
+ }
+
+ // Remove instructions associated with non-base iterations.
+ for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
+ J != JE;) {
+ unsigned I = Uses[&*J].find_first();
+ if (I > 0 && I < IL_All) {
+ LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ J++->eraseFromParent();
+ continue;
+ }
+
+ ++J;
+ }
+
+ // Rewrite each BaseInst using SCEV.
+ for (size_t i = 0, e = RootSets.size(); i != e; ++i)
+ // Insert the new induction variable.
+ replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
+
+ { // Limit the lifetime of SCEVExpander.
+ BranchInst *BI = cast<BranchInst>(Header->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ auto Zero = SE->getZero(BackedgeTakenCount->getType());
+ auto One = SE->getOne(BackedgeTakenCount->getType());
+ auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
+ Value *NewIV =
+ Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ // FIXME: This arithmetic can overflow.
+ auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
+ auto ScaledTripCount = SE->getMulExpr(
+ TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
+ auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
+ Value *TakenCount =
+ Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
+
+ SimplifyInstructionsInBlock(Header, TLI);
+ DeleteDeadPHIs(Header, TLI);
+}
+
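A worked example of the exit-count arithmetic in replace() above, with illustrative numbers: for Scale = 3 and an original backedge-taken count of 9 (ten executions of the unrolled body), TripCount = 9 + 1 = 10, ScaledTripCount = 10 * 3 = 30, and ScaledBECount = 30 - 1 = 29. The rerolled loop therefore starts its new induction variable at 0 and exits once the variable compares equal to 29, i.e. after 30 executions of the single remaining body.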
+void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
+ const SCEV *Start,
+ const SCEV *IncrExpr) {
+ BasicBlock *Header = L->getHeader();
+ Instruction *Inst = DRS.BaseInst;
+
+ const SCEV *NewIVSCEV =
+ SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
+
+ { // Limit the lifetime of SCEVExpander.
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
+ Header->getFirstNonPHIOrDbg());
+
+ for (auto &KV : Uses)
+ if (KV.second.find_first() == 0)
+ KV.first->replaceUsesOfWith(Inst, NewIV);
+ }
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+ // For a non-associative reduction, the chain entries must appear in order.
+ for (int i : Reds) {
+ int PrevIter = 0, BaseCount = 0, Count = 0;
+ for (Instruction *J : PossibleReds[i]) {
+ // Note that all instructions in the chain must have been found because
+ // all instructions in the function must have been assigned to some
+ // iteration.
+ int Iter = PossibleRedIter[J];
+ if (Iter != PrevIter && Iter != PrevIter + 1 &&
+ !PossibleReds[i].getReducedValue()->isAssociative()) {
+ LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
+ << J << "\n");
+ return false;
+ }
+
+ if (Iter != PrevIter) {
+ if (Count != BaseCount) {
+ LLVM_DEBUG(dbgs()
+ << "LRR: Iteration " << PrevIter << " reduction use count "
+ << Count << " is not equal to the base use count "
+ << BaseCount << "\n");
+ return false;
+ }
+
+ Count = 0;
+ }
+
+ ++Count;
+ if (Iter == 0)
+ ++BaseCount;
+
+ PrevIter = Iter;
+ }
+ }
+
+ return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+ // Fix up reductions to refer to the last instruction associated with the
+ // first iteration (not the last).
+ for (int i : Reds) {
+ int j = 0;
+ for (int e = PossibleReds[i].size(); j != e; ++j)
+ if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+ --j;
+ break;
+ }
+
+ // Replace users with the new end-of-chain value.
+ SmallInstructionVector Users;
+ for (User *U : PossibleReds[i].getReducedValue()->users()) {
+ Users.push_back(cast<Instruction>(U));
+ }
+
+ for (Instruction *User : Users)
+ User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+ PossibleReds[i][j]);
+ }
+}
+
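To make the reduction rewrite above concrete, a hypothetical sketch (scale factor 3, C-level names not taken from this diff):

    x1 = x0 + a[i];
    x2 = x1 + a[i+1];
    x3 = x2 + a[i+2];

replaceSelected() rewires every user of the reduced value x3 to x1, the end of the first-iteration chain; the second- and third-iteration adds then become dead and are erased when DAGRootTracker::replace() removes the non-base iterations.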
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+ // be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions had side effects (could not be
+ // speculatively executed), and so do the matched instructions, then we
+// cannot reorder those side-effect-producing instructions, and rerolling
+// fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *BackedgeTakenCount,
+ ReductionTracker &Reductions) {
+ DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
+ IVToIncMap, LoopControlIV);
+
+ if (!DAGRoots.findRoots())
+ return false;
+ LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
+ << "\n");
+
+ if (!DAGRoots.validate(Reductions))
+ return false;
+ if (!Reductions.validateSelected())
+ return false;
+ // At this point, we've validated the rerolling, and we're committed to
+ // making changes!
+
+ Reductions.replaceSelected();
+ DAGRoots.replace(BackedgeTakenCount);
+
+ ++NumRerolledLoops;
+ return true;
+}
+
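A minimal before/after sketch of the transformation described in the comment above reroll(), written at the C level with a scale factor of 3 (illustrative code; the arrays and bound are assumptions):

    // Before: a loop that was manually unrolled by 3.
    for (int i = 0; i < 3 * n; i += 3) {
      a[i]     += b[i];
      a[i + 1] += b[i + 1];
      a[i + 2] += b[i + 2];
    }

    // After rerolling: one copy of the body, three times as many iterations.
    for (int i = 0; i < 3 * n; ++i)
      a[i] += b[i];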
bool LoopReroll::runOnLoop(Loop *L) {
- BasicBlock *Header = L->getHeader();
- LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
- << Header->getName() << " (" << L->getNumBlocks()
- << " block(s))\n");
-
- // For now, we'll handle only single BB loops.
- if (L->getNumBlocks() > 1)
- return false;
-
- if (!SE->hasLoopInvariantBackedgeTakenCount(L))
- return false;
-
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
- LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
- LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
- << "\n");
-
- // First, we need to find the induction variable with respect to which we can
- // reroll (there may be several possible options).
- SmallInstructionVector PossibleIVs;
- IVToIncMap.clear();
- LoopControlIV = nullptr;
- collectPossibleIVs(L, PossibleIVs);
-
- if (PossibleIVs.empty()) {
- LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
- return false;
- }
-
- ReductionTracker Reductions;
- collectPossibleReductions(L, Reductions);
- bool Changed = false;
-
- // For each possible IV, collect the associated possible set of 'root' nodes
- // (i+1, i+2, etc.).
- for (Instruction *PossibleIV : PossibleIVs)
- if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
- Changed = true;
- break;
- }
- LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
-
- // Trip count of L has changed so SE must be re-evaluated.
- if (Changed)
- SE->forgetLoop(L);
-
- return Changed;
-}
+ BasicBlock *Header = L->getHeader();
+ LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
+ << Header->getName() << " (" << L->getNumBlocks()
+ << " block(s))\n");
+
+ // For now, we'll handle only single BB loops.
+ if (L->getNumBlocks() > 1)
+ return false;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return false;
+
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
+ << "\n");
+
+ // First, we need to find the induction variable with respect to which we can
+ // reroll (there may be several possible options).
+ SmallInstructionVector PossibleIVs;
+ IVToIncMap.clear();
+ LoopControlIV = nullptr;
+ collectPossibleIVs(L, PossibleIVs);
+
+ if (PossibleIVs.empty()) {
+ LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ return false;
+ }
+
+ ReductionTracker Reductions;
+ collectPossibleReductions(L, Reductions);
+ bool Changed = false;
+
+ // For each possible IV, collect the associated possible set of 'root' nodes
+ // (i+1, i+2, etc.).
+ for (Instruction *PossibleIV : PossibleIVs)
+ if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
+ Changed = true;
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+
+ // Trip count of L has changed so SE must be re-evaluated.
+ if (Changed)
+ SE->forgetLoop(L);
+
+ return Changed;
+}
bool LoopRerollLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp
index ff63d625d8..ad1cfc68ec 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp
@@ -1,51 +1,51 @@
-//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements Loop Rotation Pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopRotation.h"
-#include "llvm/ADT/Statistic.h"
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/LoopRotationUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-rotate"
-
-static cl::opt<unsigned> DefaultRotationThreshold(
- "rotation-max-header-size", cl::init(16), cl::Hidden,
- cl::desc("The default maximum header size for automatic loop rotation"));
-
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+static cl::opt<unsigned> DefaultRotationThreshold(
+ "rotation-max-header-size", cl::init(16), cl::Hidden,
+ cl::desc("The default maximum header size for automatic loop rotation"));
+
static cl::opt<bool> PrepareForLTOOption(
"rotation-prepare-for-lto", cl::init(false), cl::Hidden,
cl::desc("Run loop-rotation in the prepare-for-lto stage. This option "
"should be used for testing only."));
-
+
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO)
: EnableHeaderDuplication(EnableHeaderDuplication),
PrepareForLTO(PrepareForLTO) {}
-PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
// Vectorization requires loop-rotation. Use default threshold for loops the
// user explicitly marked for vectorization, even when header duplication is
// disabled.
@@ -53,75 +53,75 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
hasVectorizeTransformation(&L) == TM_ForcedByUser
? DefaultRotationThreshold
: 0;
- const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
- const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
-
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA)
- MSSAU = MemorySSAUpdater(AR.MSSA);
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
+
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA)
+ MSSAU = MemorySSAUpdater(AR.MSSA);
bool Changed =
LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false,
Threshold, false, PrepareForLTO || PrepareForLTOOption);
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- if (AR.MSSA && VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-class LoopRotateLegacyPass : public LoopPass {
- unsigned MaxHeaderSize;
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ if (AR.MSSA && VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+class LoopRotateLegacyPass : public LoopPass {
+ unsigned MaxHeaderSize;
bool PrepareForLTO;
-
-public:
- static char ID; // Pass ID, replacement for typeid
+
+public:
+ static char ID; // Pass ID, replacement for typeid
LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1,
bool PrepareForLTO = false)
: LoopPass(ID), PrepareForLTO(PrepareForLTO) {
- initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
- if (SpecifiedMaxHeaderSize == -1)
- MaxHeaderSize = DefaultRotationThreshold;
- else
- MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
- }
-
- // LCSSA form makes instruction renaming easier.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency)
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- Function &F = *L->getHeader()->getParent();
-
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- // Not requiring MemorySSA and getting it only if available will split
- // the loop pass pipeline when LoopRotate is being run first.
- auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- if (MSSAA)
- MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
- }
+ initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
+ if (SpecifiedMaxHeaderSize == -1)
+ MaxHeaderSize = DefaultRotationThreshold;
+ else
+ MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
+ }
+
+ // LCSSA form makes instruction renaming easier.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (EnableMSSALoopDependency)
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ Function &F = *L->getHeader()->getParent();
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ // Not requiring MemorySSA and getting it only if available will split
+ // the loop pass pipeline when LoopRotate is being run first.
+ auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ if (MSSAA)
+ MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
+ }
// Vectorization requires loop-rotation. Use default threshold for loops the
// user explicitly marked for vectorization, even when header duplication is
// disabled.
@@ -129,24 +129,24 @@ public:
? DefaultRotationThreshold
: MaxHeaderSize;
- return LoopRotation(L, LI, TTI, AC, &DT, &SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
+ return LoopRotation(L, LI, TTI, AC, &DT, &SE,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
false, Threshold, false,
PrepareForLTO || PrepareForLTOOption);
- }
-};
-} // end namespace
-
-char LoopRotateLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
- false)
-
+ }
+};
+} // end namespace
+
+char LoopRotateLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
+ false)
+
Pass *llvm::createLoopRotatePass(int MaxHeaderSize, bool PrepareForLTO) {
return new LoopRotateLegacyPass(MaxHeaderSize, PrepareForLTO);
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index c5d3c4519b..cc6d112208 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -1,773 +1,773 @@
-//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Loop SimplifyCFG Pass. This pass is responsible for
-// basic loop CFG cleanup, primarily to assist other loop passes. If you
-// encounter a noncanonical CFG construct that causes another loop pass to
-// perform suboptimally, this is the place to fix it up.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-simplifycfg"
-
-static cl::opt<bool> EnableTermFolding("enable-loop-simplifycfg-term-folding",
- cl::init(true));
-
-STATISTIC(NumTerminatorsFolded,
- "Number of terminators folded to unconditional branches");
-STATISTIC(NumLoopBlocksDeleted,
- "Number of loop blocks deleted");
-STATISTIC(NumLoopExitsDeleted,
- "Number of loop exiting edges deleted");
-
-/// If \p BB is a switch or a conditional branch, but only one of its successors
-/// can be reached from this block in runtime, return this successor. Otherwise,
-/// return nullptr.
-static BasicBlock *getOnlyLiveSuccessor(BasicBlock *BB) {
- Instruction *TI = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isUnconditional())
- return nullptr;
- if (BI->getSuccessor(0) == BI->getSuccessor(1))
- return BI->getSuccessor(0);
- ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
- if (!Cond)
- return nullptr;
- return Cond->isZero() ? BI->getSuccessor(1) : BI->getSuccessor(0);
- }
-
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
- if (!CI)
- return nullptr;
- for (auto Case : SI->cases())
- if (Case.getCaseValue() == CI)
- return Case.getCaseSuccessor();
- return SI->getDefaultDest();
- }
-
- return nullptr;
-}
-
-/// Removes \p BB from all loops from [FirstLoop, LastLoop) in parent chain.
-static void removeBlockFromLoops(BasicBlock *BB, Loop *FirstLoop,
- Loop *LastLoop = nullptr) {
- assert((!LastLoop || LastLoop->contains(FirstLoop->getHeader())) &&
- "First loop is supposed to be inside of last loop!");
- assert(FirstLoop->contains(BB) && "Must be a loop block!");
- for (Loop *Current = FirstLoop; Current != LastLoop;
- Current = Current->getParentLoop())
- Current->removeBlockFromLoop(BB);
-}
-
-/// Find innermost loop that contains at least one block from \p BBs and
-/// contains the header of loop \p L.
-static Loop *getInnermostLoopFor(SmallPtrSetImpl<BasicBlock *> &BBs,
- Loop &L, LoopInfo &LI) {
- Loop *Innermost = nullptr;
- for (BasicBlock *BB : BBs) {
- Loop *BBL = LI.getLoopFor(BB);
- while (BBL && !BBL->contains(L.getHeader()))
- BBL = BBL->getParentLoop();
- if (BBL == &L)
- BBL = BBL->getParentLoop();
- if (!BBL)
- continue;
- if (!Innermost || BBL->getLoopDepth() > Innermost->getLoopDepth())
- Innermost = BBL;
- }
- return Innermost;
-}
-
-namespace {
-/// Helper class that can turn branches and switches with constant conditions
-/// into unconditional branches.
-class ConstantTerminatorFoldingImpl {
-private:
- Loop &L;
- LoopInfo &LI;
- DominatorTree &DT;
- ScalarEvolution &SE;
- MemorySSAUpdater *MSSAU;
- LoopBlocksDFS DFS;
- DomTreeUpdater DTU;
- SmallVector<DominatorTree::UpdateType, 16> DTUpdates;
-
- // Whether or not the current loop has irreducible CFG.
- bool HasIrreducibleCFG = false;
- // Whether or not the current loop will still exist after terminator constant
- // folding is done. In theory, there are two ways this can happen:
- // 1. Loop's latch(es) become unreachable from loop header;
- // 2. Loop's header becomes unreachable from method entry.
- // In practice, the second situation is impossible because we only modify the
- // current loop and its preheader and do not affect the preheader's
- // reachability from any other block. So this variable being set to true
- // means that the loop's latch has become unreachable from the loop header.
- bool DeleteCurrentLoop = false;
-
- // The blocks of the original loop that will still be reachable from entry
- // after the constant folding.
- SmallPtrSet<BasicBlock *, 8> LiveLoopBlocks;
- // The blocks of the original loop that will become unreachable from entry
- // after the constant folding.
- SmallVector<BasicBlock *, 8> DeadLoopBlocks;
- // The exits of the original loop that will still be reachable from entry
- // after the constant folding.
- SmallPtrSet<BasicBlock *, 8> LiveExitBlocks;
- // The exits of the original loop that will become unreachable from entry
- // after the constant folding.
- SmallVector<BasicBlock *, 8> DeadExitBlocks;
- // The blocks that will still be a part of the current loop after folding.
- SmallPtrSet<BasicBlock *, 8> BlocksInLoopAfterFolding;
- // The blocks that have terminators with constant condition that can be
- // folded. Note: fold candidates should be in L but not in any of its
- // subloops to avoid complex LI updates.
- SmallVector<BasicBlock *, 8> FoldCandidates;
-
- void dump() const {
- dbgs() << "Constant terminator folding for loop " << L << "\n";
- dbgs() << "After terminator constant-folding, the loop will";
- if (!DeleteCurrentLoop)
- dbgs() << " not";
- dbgs() << " be destroyed\n";
- auto PrintOutVector = [&](const char *Message,
- const SmallVectorImpl<BasicBlock *> &S) {
- dbgs() << Message << "\n";
- for (const BasicBlock *BB : S)
- dbgs() << "\t" << BB->getName() << "\n";
- };
- auto PrintOutSet = [&](const char *Message,
- const SmallPtrSetImpl<BasicBlock *> &S) {
- dbgs() << Message << "\n";
- for (const BasicBlock *BB : S)
- dbgs() << "\t" << BB->getName() << "\n";
- };
- PrintOutVector("Blocks in which we can constant-fold terminator:",
- FoldCandidates);
- PrintOutSet("Live blocks from the original loop:", LiveLoopBlocks);
- PrintOutVector("Dead blocks from the original loop:", DeadLoopBlocks);
- PrintOutSet("Live exit blocks:", LiveExitBlocks);
- PrintOutVector("Dead exit blocks:", DeadExitBlocks);
- if (!DeleteCurrentLoop)
- PrintOutSet("The following blocks will still be part of the loop:",
- BlocksInLoopAfterFolding);
- }
-
- /// Whether or not the current loop has irreducible CFG.
- bool hasIrreducibleCFG(LoopBlocksDFS &DFS) {
- assert(DFS.isComplete() && "DFS is expected to be finished");
- // Index of a basic block in RPO traversal.
- DenseMap<const BasicBlock *, unsigned> RPO;
- unsigned Current = 0;
- for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I)
- RPO[*I] = Current++;
-
- for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
- BasicBlock *BB = *I;
- for (auto *Succ : successors(BB))
- if (L.contains(Succ) && !LI.isLoopHeader(Succ) && RPO[BB] > RPO[Succ])
- // If an edge goes from a block with a greater order number into a block
- // with a lesser number, and it is not a loop backedge, then it can only
- // be part of an irreducible non-loop cycle.
- return true;
- }
- return false;
- }
-
- /// Fill all information about status of blocks and exits of the current loop
- /// if constant folding of all branches will be done.
- void analyze() {
- DFS.perform(&LI);
- assert(DFS.isComplete() && "DFS is expected to be finished");
-
- // TODO: The algorithm below relies on both RPO and Postorder traversals.
- // When the loop has only reducible CFG inside, then the invariant "all
- // predecessors of X are processed before X in RPO" is preserved. However
- // an irreducible loop can break this invariant (e.g. latch does not have to
- // be the last block in the traversal in this case, and the algorithm relies
- // on this). We can later decide to support such cases by altering the
- // algorithms, but so far we just give up analyzing them.
- if (hasIrreducibleCFG(DFS)) {
- HasIrreducibleCFG = true;
- return;
- }
-
- // Collect live and dead loop blocks and exits.
- LiveLoopBlocks.insert(L.getHeader());
- for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
- BasicBlock *BB = *I;
-
- // If a loop block wasn't marked as live so far, then it's dead.
- if (!LiveLoopBlocks.count(BB)) {
- DeadLoopBlocks.push_back(BB);
- continue;
- }
-
- BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
-
- // If a block has only one live successor, it's a candidate on constant
- // folding. Only handle blocks from current loop: branches in child loops
- // are skipped because if they can be folded, they should be folded during
- // the processing of child loops.
- bool TakeFoldCandidate = TheOnlySucc && LI.getLoopFor(BB) == &L;
- if (TakeFoldCandidate)
- FoldCandidates.push_back(BB);
-
- // Handle successors.
- for (BasicBlock *Succ : successors(BB))
- if (!TakeFoldCandidate || TheOnlySucc == Succ) {
- if (L.contains(Succ))
- LiveLoopBlocks.insert(Succ);
- else
- LiveExitBlocks.insert(Succ);
- }
- }
-
- // Sanity check: amount of dead and live loop blocks should match the total
- // number of blocks in loop.
- assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() &&
- "Malformed block sets?");
-
- // Now, all exit blocks that are not marked as live are dead.
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 8> UniqueDeadExits;
- for (auto *ExitBlock : ExitBlocks)
- if (!LiveExitBlocks.count(ExitBlock) &&
- UniqueDeadExits.insert(ExitBlock).second)
- DeadExitBlocks.push_back(ExitBlock);
-
- // Whether or not the edge From->To will still be present in graph after the
- // folding.
- auto IsEdgeLive = [&](BasicBlock *From, BasicBlock *To) {
- if (!LiveLoopBlocks.count(From))
- return false;
- BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(From);
- return !TheOnlySucc || TheOnlySucc == To || LI.getLoopFor(From) != &L;
- };
-
- // The loop will not be destroyed if its latch is live.
- DeleteCurrentLoop = !IsEdgeLive(L.getLoopLatch(), L.getHeader());
-
- // If we are going to delete the current loop completely, no extra analysis
- // is needed.
- if (DeleteCurrentLoop)
- return;
-
- // Otherwise, we should check which blocks will still be a part of the
- // current loop after the transform.
- BlocksInLoopAfterFolding.insert(L.getLoopLatch());
- // If the loop is live, then we should compute what blocks are still in
- // loop after all branch folding has been done. A block is in loop if
- // it has a live edge to another block that is in the loop; by definition,
- // latch is in the loop.
- auto BlockIsInLoop = [&](BasicBlock *BB) {
- return any_of(successors(BB), [&](BasicBlock *Succ) {
- return BlocksInLoopAfterFolding.count(Succ) && IsEdgeLive(BB, Succ);
- });
- };
- for (auto I = DFS.beginPostorder(), E = DFS.endPostorder(); I != E; ++I) {
- BasicBlock *BB = *I;
- if (BlockIsInLoop(BB))
- BlocksInLoopAfterFolding.insert(BB);
- }
-
- // Sanity check: header must be in loop.
- assert(BlocksInLoopAfterFolding.count(L.getHeader()) &&
- "Header not in loop?");
- assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() &&
- "All blocks that stay in loop should be live!");
- }
-
- /// We need to preserve static reachability of all loop exit blocks (this is
- /// required by the loop pass manager). In order to do it, we use the following
- /// trick:
- ///
- /// preheader:
- /// <preheader code>
- /// br label %loop_header
- ///
- /// loop_header:
- /// ...
- /// br i1 false, label %dead_exit, label %loop_block
- /// ...
- ///
- /// We cannot simply remove edge from the loop to dead exit because in this
- /// case dead_exit (and its successors) may become unreachable. To avoid that,
- /// we insert the following fictive preheader:
- ///
- /// preheader:
- /// <preheader code>
- /// switch i32 0, label %preheader-split,
- /// [i32 1, label %dead_exit_1],
- /// [i32 2, label %dead_exit_2],
- /// ...
- /// [i32 N, label %dead_exit_N],
- ///
- /// preheader-split:
- /// br label %loop_header
- ///
- /// loop_header:
- /// ...
- /// br i1 false, label %dead_exit_N, label %loop_block
- /// ...
- ///
- /// Doing so, we preserve static reachability of all dead exits and can later
- /// remove edges from the loop to these blocks.
- void handleDeadExits() {
- // If no dead exits, nothing to do.
- if (DeadExitBlocks.empty())
- return;
-
- // Construct split preheader and the dummy switch to thread edges from it to
- // dead exits.
- BasicBlock *Preheader = L.getLoopPreheader();
- BasicBlock *NewPreheader = llvm::SplitBlock(
- Preheader, Preheader->getTerminator(), &DT, &LI, MSSAU);
-
- IRBuilder<> Builder(Preheader->getTerminator());
- SwitchInst *DummySwitch =
- Builder.CreateSwitch(Builder.getInt32(0), NewPreheader);
- Preheader->getTerminator()->eraseFromParent();
-
- unsigned DummyIdx = 1;
- for (BasicBlock *BB : DeadExitBlocks) {
+//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop SimplifyCFG Pass. This pass is responsible for
+// basic loop CFG cleanup, primarily to assist other loop passes. If you
+// encounter a noncanonical CFG construct that causes another loop pass to
+// perform suboptimally, this is the place to fix it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplifycfg"
+
+static cl::opt<bool> EnableTermFolding("enable-loop-simplifycfg-term-folding",
+ cl::init(true));
+
+STATISTIC(NumTerminatorsFolded,
+ "Number of terminators folded to unconditional branches");
+STATISTIC(NumLoopBlocksDeleted,
+ "Number of loop blocks deleted");
+STATISTIC(NumLoopExitsDeleted,
+ "Number of loop exiting edges deleted");
+
+/// If \p BB is a switch or a conditional branch, but only one of its successors
+/// can be reached from this block in runtime, return this successor. Otherwise,
+/// return nullptr.
+static BasicBlock *getOnlyLiveSuccessor(BasicBlock *BB) {
+ Instruction *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isUnconditional())
+ return nullptr;
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ return BI->getSuccessor(0);
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return nullptr;
+ return Cond->isZero() ? BI->getSuccessor(1) : BI->getSuccessor(0);
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!CI)
+ return nullptr;
+ for (auto Case : SI->cases())
+ if (Case.getCaseValue() == CI)
+ return Case.getCaseSuccessor();
+ return SI->getDefaultDest();
+ }
+
+ return nullptr;
+}
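+
+// For illustration (a hand-written IR sketch, not taken from this file): given
+//   br i1 false, label %dead, label %live
+// the only live successor is %live, and given
+//   switch i32 2, label %default [ i32 1, label %a
+//                                  i32 2, label %b ]
+// it is %b. If the condition is not a ConstantInt, the function returns
+// nullptr.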
+
+/// Removes \p BB from all loops from [FirstLoop, LastLoop) in parent chain.
+static void removeBlockFromLoops(BasicBlock *BB, Loop *FirstLoop,
+ Loop *LastLoop = nullptr) {
+ assert((!LastLoop || LastLoop->contains(FirstLoop->getHeader())) &&
+ "First loop is supposed to be inside of last loop!");
+ assert(FirstLoop->contains(BB) && "Must be a loop block!");
+ for (Loop *Current = FirstLoop; Current != LastLoop;
+ Current = Current->getParentLoop())
+ Current->removeBlockFromLoop(BB);
+}
+
+/// Find innermost loop that contains at least one block from \p BBs and
+/// contains the header of loop \p L.
+static Loop *getInnermostLoopFor(SmallPtrSetImpl<BasicBlock *> &BBs,
+ Loop &L, LoopInfo &LI) {
+ Loop *Innermost = nullptr;
+ for (BasicBlock *BB : BBs) {
+ Loop *BBL = LI.getLoopFor(BB);
+ while (BBL && !BBL->contains(L.getHeader()))
+ BBL = BBL->getParentLoop();
+ if (BBL == &L)
+ BBL = BBL->getParentLoop();
+ if (!BBL)
+ continue;
+ if (!Innermost || BBL->getLoopDepth() > Innermost->getLoopDepth())
+ Innermost = BBL;
+ }
+ return Innermost;
+}
+
+namespace {
+/// Helper class that can turn branches and switches with constant conditions
+/// into unconditional branches.
+class ConstantTerminatorFoldingImpl {
+private:
+ Loop &L;
+ LoopInfo &LI;
+ DominatorTree &DT;
+ ScalarEvolution &SE;
+ MemorySSAUpdater *MSSAU;
+ LoopBlocksDFS DFS;
+ DomTreeUpdater DTU;
+ SmallVector<DominatorTree::UpdateType, 16> DTUpdates;
+
+ // Whether or not the current loop has irreducible CFG.
+ bool HasIrreducibleCFG = false;
+  // Whether or not the current loop will still exist after terminator constant
+  // folding is done. In theory, there are two ways this can happen:
+  // 1. The loop's latch(es) become unreachable from the loop header;
+  // 2. The loop's header becomes unreachable from the method entry.
+  // In practice, the second situation is impossible because we only modify
+  // the current loop and its preheader and do not affect the preheader's
+  // reachability from any other block. So this variable being set to true
+  // means that the loop's latch has become unreachable from the loop header.
+ bool DeleteCurrentLoop = false;
+
+ // The blocks of the original loop that will still be reachable from entry
+ // after the constant folding.
+ SmallPtrSet<BasicBlock *, 8> LiveLoopBlocks;
+ // The blocks of the original loop that will become unreachable from entry
+ // after the constant folding.
+ SmallVector<BasicBlock *, 8> DeadLoopBlocks;
+ // The exits of the original loop that will still be reachable from entry
+ // after the constant folding.
+ SmallPtrSet<BasicBlock *, 8> LiveExitBlocks;
+ // The exits of the original loop that will become unreachable from entry
+ // after the constant folding.
+ SmallVector<BasicBlock *, 8> DeadExitBlocks;
+ // The blocks that will still be a part of the current loop after folding.
+ SmallPtrSet<BasicBlock *, 8> BlocksInLoopAfterFolding;
+ // The blocks that have terminators with constant condition that can be
+ // folded. Note: fold candidates should be in L but not in any of its
+ // subloops to avoid complex LI updates.
+ SmallVector<BasicBlock *, 8> FoldCandidates;
+
+ void dump() const {
+ dbgs() << "Constant terminator folding for loop " << L << "\n";
+ dbgs() << "After terminator constant-folding, the loop will";
+ if (!DeleteCurrentLoop)
+ dbgs() << " not";
+ dbgs() << " be destroyed\n";
+ auto PrintOutVector = [&](const char *Message,
+ const SmallVectorImpl<BasicBlock *> &S) {
+ dbgs() << Message << "\n";
+ for (const BasicBlock *BB : S)
+ dbgs() << "\t" << BB->getName() << "\n";
+ };
+ auto PrintOutSet = [&](const char *Message,
+ const SmallPtrSetImpl<BasicBlock *> &S) {
+ dbgs() << Message << "\n";
+ for (const BasicBlock *BB : S)
+ dbgs() << "\t" << BB->getName() << "\n";
+ };
+ PrintOutVector("Blocks in which we can constant-fold terminator:",
+ FoldCandidates);
+ PrintOutSet("Live blocks from the original loop:", LiveLoopBlocks);
+ PrintOutVector("Dead blocks from the original loop:", DeadLoopBlocks);
+ PrintOutSet("Live exit blocks:", LiveExitBlocks);
+ PrintOutVector("Dead exit blocks:", DeadExitBlocks);
+ if (!DeleteCurrentLoop)
+ PrintOutSet("The following blocks will still be part of the loop:",
+ BlocksInLoopAfterFolding);
+ }
+
+ /// Whether or not the current loop has irreducible CFG.
+ bool hasIrreducibleCFG(LoopBlocksDFS &DFS) {
+ assert(DFS.isComplete() && "DFS is expected to be finished");
+ // Index of a basic block in RPO traversal.
+ DenseMap<const BasicBlock *, unsigned> RPO;
+ unsigned Current = 0;
+ for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I)
+ RPO[*I] = Current++;
+
+ for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ for (auto *Succ : successors(BB))
+ if (L.contains(Succ) && !LI.isLoopHeader(Succ) && RPO[BB] > RPO[Succ])
+          // If an edge goes from a block with a greater order number into a
+          // block with a lesser number, and it is not a loop backedge, then it
+          // can only be a part of an irreducible non-loop cycle.
+ return true;
+ }
+ return false;
+ }
+
+  /// Fill in all information about the status of blocks and exits of the
+  /// current loop, assuming constant folding of all branches is performed.
+ void analyze() {
+ DFS.perform(&LI);
+ assert(DFS.isComplete() && "DFS is expected to be finished");
+
+ // TODO: The algorithm below relies on both RPO and Postorder traversals.
+ // When the loop has only reducible CFG inside, then the invariant "all
+ // predecessors of X are processed before X in RPO" is preserved. However
+ // an irreducible loop can break this invariant (e.g. latch does not have to
+ // be the last block in the traversal in this case, and the algorithm relies
+ // on this). We can later decide to support such cases by altering the
+ // algorithms, but so far we just give up analyzing them.
+ if (hasIrreducibleCFG(DFS)) {
+ HasIrreducibleCFG = true;
+ return;
+ }
+
+ // Collect live and dead loop blocks and exits.
+ LiveLoopBlocks.insert(L.getHeader());
+ for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
+ BasicBlock *BB = *I;
+
+ // If a loop block wasn't marked as live so far, then it's dead.
+ if (!LiveLoopBlocks.count(BB)) {
+ DeadLoopBlocks.push_back(BB);
+ continue;
+ }
+
+ BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
+
+      // If a block has only one live successor, it's a candidate for constant
+ // folding. Only handle blocks from current loop: branches in child loops
+ // are skipped because if they can be folded, they should be folded during
+ // the processing of child loops.
+ bool TakeFoldCandidate = TheOnlySucc && LI.getLoopFor(BB) == &L;
+ if (TakeFoldCandidate)
+ FoldCandidates.push_back(BB);
+
+ // Handle successors.
+ for (BasicBlock *Succ : successors(BB))
+ if (!TakeFoldCandidate || TheOnlySucc == Succ) {
+ if (L.contains(Succ))
+ LiveLoopBlocks.insert(Succ);
+ else
+ LiveExitBlocks.insert(Succ);
+ }
+ }
+
+    // Sanity check: the number of dead and live loop blocks should match the
+    // total number of blocks in the loop.
+ assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() &&
+ "Malformed block sets?");
+
+ // Now, all exit blocks that are not marked as live are dead.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 8> UniqueDeadExits;
+ for (auto *ExitBlock : ExitBlocks)
+ if (!LiveExitBlocks.count(ExitBlock) &&
+ UniqueDeadExits.insert(ExitBlock).second)
+ DeadExitBlocks.push_back(ExitBlock);
+
+    // Whether or not the edge From->To will still be present in the graph
+    // after the folding.
+ auto IsEdgeLive = [&](BasicBlock *From, BasicBlock *To) {
+ if (!LiveLoopBlocks.count(From))
+ return false;
+ BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(From);
+ return !TheOnlySucc || TheOnlySucc == To || LI.getLoopFor(From) != &L;
+ };
+
+ // The loop will not be destroyed if its latch is live.
+ DeleteCurrentLoop = !IsEdgeLive(L.getLoopLatch(), L.getHeader());
+
+ // If we are going to delete the current loop completely, no extra analysis
+ // is needed.
+ if (DeleteCurrentLoop)
+ return;
+
+ // Otherwise, we should check which blocks will still be a part of the
+ // current loop after the transform.
+ BlocksInLoopAfterFolding.insert(L.getLoopLatch());
+    // If the loop is live, then we should compute which blocks are still in
+    // the loop after all branch folding has been done. A block is in the loop
+    // if it has a live edge to another block that is in the loop; by
+    // definition, the latch is in the loop.
+ auto BlockIsInLoop = [&](BasicBlock *BB) {
+ return any_of(successors(BB), [&](BasicBlock *Succ) {
+ return BlocksInLoopAfterFolding.count(Succ) && IsEdgeLive(BB, Succ);
+ });
+ };
+ for (auto I = DFS.beginPostorder(), E = DFS.endPostorder(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ if (BlockIsInLoop(BB))
+ BlocksInLoopAfterFolding.insert(BB);
+ }
+
+ // Sanity check: header must be in loop.
+ assert(BlocksInLoopAfterFolding.count(L.getHeader()) &&
+ "Header not in loop?");
+ assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() &&
+ "All blocks that stay in loop should be live!");
+ }
+
+  /// We need to preserve static reachability of all loop exit blocks (this is
+  /// required by the loop pass manager). In order to do it, we use the
+  /// following trick:
+ ///
+ /// preheader:
+ /// <preheader code>
+ /// br label %loop_header
+ ///
+ /// loop_header:
+ /// ...
+ /// br i1 false, label %dead_exit, label %loop_block
+ /// ...
+ ///
+  /// We cannot simply remove the edge from the loop to the dead exit because
+  /// in this case dead_exit (and its successors) may become unreachable. To
+  /// avoid that, we insert the following dummy preheader:
+ ///
+ /// preheader:
+ /// <preheader code>
+ /// switch i32 0, label %preheader-split,
+ /// [i32 1, label %dead_exit_1],
+ /// [i32 2, label %dead_exit_2],
+ /// ...
+ /// [i32 N, label %dead_exit_N],
+ ///
+ /// preheader-split:
+ /// br label %loop_header
+ ///
+ /// loop_header:
+ /// ...
+ /// br i1 false, label %dead_exit_N, label %loop_block
+ /// ...
+ ///
+  /// Doing so, we preserve static reachability of all dead exits and can later
+ /// remove edges from the loop to these blocks.
+ void handleDeadExits() {
+ // If no dead exits, nothing to do.
+ if (DeadExitBlocks.empty())
+ return;
+
+ // Construct split preheader and the dummy switch to thread edges from it to
+ // dead exits.
+ BasicBlock *Preheader = L.getLoopPreheader();
+ BasicBlock *NewPreheader = llvm::SplitBlock(
+ Preheader, Preheader->getTerminator(), &DT, &LI, MSSAU);
+
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SwitchInst *DummySwitch =
+ Builder.CreateSwitch(Builder.getInt32(0), NewPreheader);
+ Preheader->getTerminator()->eraseFromParent();
+
+ unsigned DummyIdx = 1;
+ for (BasicBlock *BB : DeadExitBlocks) {
// Eliminate all Phis and LandingPads from dead exits.
// TODO: Consider removing all instructions in this dead block.
SmallVector<Instruction *, 4> DeadInstructions;
- for (auto &PN : BB->phis())
+ for (auto &PN : BB->phis())
DeadInstructions.push_back(&PN);
-
+
if (auto *LandingPad = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()))
DeadInstructions.emplace_back(LandingPad);
for (Instruction *I : DeadInstructions) {
I->replaceAllUsesWith(UndefValue::get(I->getType()));
I->eraseFromParent();
- }
-
- assert(DummyIdx != 0 && "Too many dead exits!");
- DummySwitch->addCase(Builder.getInt32(DummyIdx++), BB);
- DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
- ++NumLoopExitsDeleted;
- }
-
- assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
- if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
- // When we break dead edges, the outer loop may become unreachable from
- // the current loop. We need to fix loop info accordingly. For this, we
- // find the most nested loop that still contains L and remove L from all
- // loops that are inside of it.
- Loop *StillReachable = getInnermostLoopFor(LiveExitBlocks, L, LI);
-
- // Okay, our loop is no longer in the outer loop (and maybe not in some of
- // its parents as well). Make the fixup.
- if (StillReachable != OuterLoop) {
- LI.changeLoopFor(NewPreheader, StillReachable);
- removeBlockFromLoops(NewPreheader, OuterLoop, StillReachable);
- for (auto *BB : L.blocks())
- removeBlockFromLoops(BB, OuterLoop, StillReachable);
- OuterLoop->removeChildLoop(&L);
- if (StillReachable)
- StillReachable->addChildLoop(&L);
- else
- LI.addTopLevelLoop(&L);
-
- // Some values from loops in [OuterLoop, StillReachable) could be used
- // in the current loop. Now it is not their child anymore, so such uses
- // require LCSSA Phis.
- Loop *FixLCSSALoop = OuterLoop;
- while (FixLCSSALoop->getParentLoop() != StillReachable)
- FixLCSSALoop = FixLCSSALoop->getParentLoop();
- assert(FixLCSSALoop && "Should be a loop!");
- // We need all DT updates to be done before forming LCSSA.
- if (MSSAU)
+ }
+
+ assert(DummyIdx != 0 && "Too many dead exits!");
+ DummySwitch->addCase(Builder.getInt32(DummyIdx++), BB);
+ DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
+ ++NumLoopExitsDeleted;
+ }
+
+ assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
+ if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
+ // When we break dead edges, the outer loop may become unreachable from
+ // the current loop. We need to fix loop info accordingly. For this, we
+ // find the most nested loop that still contains L and remove L from all
+ // loops that are inside of it.
+ Loop *StillReachable = getInnermostLoopFor(LiveExitBlocks, L, LI);
+
+ // Okay, our loop is no longer in the outer loop (and maybe not in some of
+ // its parents as well). Make the fixup.
+ if (StillReachable != OuterLoop) {
+ LI.changeLoopFor(NewPreheader, StillReachable);
+ removeBlockFromLoops(NewPreheader, OuterLoop, StillReachable);
+ for (auto *BB : L.blocks())
+ removeBlockFromLoops(BB, OuterLoop, StillReachable);
+ OuterLoop->removeChildLoop(&L);
+ if (StillReachable)
+ StillReachable->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+
+ // Some values from loops in [OuterLoop, StillReachable) could be used
+ // in the current loop. Now it is not their child anymore, so such uses
+ // require LCSSA Phis.
+ Loop *FixLCSSALoop = OuterLoop;
+ while (FixLCSSALoop->getParentLoop() != StillReachable)
+ FixLCSSALoop = FixLCSSALoop->getParentLoop();
+ assert(FixLCSSALoop && "Should be a loop!");
+ // We need all DT updates to be done before forming LCSSA.
+ if (MSSAU)
MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
else
DTU.applyUpdates(DTUpdates);
- DTUpdates.clear();
- formLCSSARecursively(*FixLCSSALoop, DT, &LI, &SE);
- }
- }
-
- if (MSSAU) {
- // Clear all updates now. Facilitates deletes that follow.
+ DTUpdates.clear();
+ formLCSSARecursively(*FixLCSSALoop, DT, &LI, &SE);
+ }
+ }
+
+ if (MSSAU) {
+ // Clear all updates now. Facilitates deletes that follow.
MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
- DTUpdates.clear();
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- }
- }
-
- /// Delete loop blocks that have become unreachable after folding. Make all
- /// relevant updates to DT and LI.
- void deleteDeadLoopBlocks() {
- if (MSSAU) {
- SmallSetVector<BasicBlock *, 8> DeadLoopBlocksSet(DeadLoopBlocks.begin(),
- DeadLoopBlocks.end());
- MSSAU->removeBlocks(DeadLoopBlocksSet);
- }
-
- // The function LI.erase has some invariants that need to be preserved when
- // it tries to remove a loop which is not the top-level loop. In particular,
- // it requires loop's preheader to be strictly in loop's parent. We cannot
- // just remove blocks one by one, because after removal of preheader we may
- // break this invariant for the dead loop. So we detatch and erase all dead
- // loops beforehand.
- for (auto *BB : DeadLoopBlocks)
- if (LI.isLoopHeader(BB)) {
- assert(LI.getLoopFor(BB) != &L && "Attempt to remove current loop!");
- Loop *DL = LI.getLoopFor(BB);
+ DTUpdates.clear();
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ }
+ }
+
+ /// Delete loop blocks that have become unreachable after folding. Make all
+ /// relevant updates to DT and LI.
+ void deleteDeadLoopBlocks() {
+ if (MSSAU) {
+ SmallSetVector<BasicBlock *, 8> DeadLoopBlocksSet(DeadLoopBlocks.begin(),
+ DeadLoopBlocks.end());
+ MSSAU->removeBlocks(DeadLoopBlocksSet);
+ }
+
+ // The function LI.erase has some invariants that need to be preserved when
+ // it tries to remove a loop which is not the top-level loop. In particular,
+    // it requires the loop's preheader to be strictly inside the loop's
+    // parent. We cannot just remove blocks one by one, because after removal
+    // of the preheader we may break this invariant for the dead loop. So we
+    // detach and erase all dead loops beforehand.
+ for (auto *BB : DeadLoopBlocks)
+ if (LI.isLoopHeader(BB)) {
+ assert(LI.getLoopFor(BB) != &L && "Attempt to remove current loop!");
+ Loop *DL = LI.getLoopFor(BB);
if (!DL->isOutermost()) {
- for (auto *PL = DL->getParentLoop(); PL; PL = PL->getParentLoop())
- for (auto *BB : DL->getBlocks())
- PL->removeBlockFromLoop(BB);
- DL->getParentLoop()->removeChildLoop(DL);
- LI.addTopLevelLoop(DL);
- }
- LI.erase(DL);
- }
-
- for (auto *BB : DeadLoopBlocks) {
- assert(BB != L.getHeader() &&
- "Header of the current loop cannot be dead!");
- LLVM_DEBUG(dbgs() << "Deleting dead loop block " << BB->getName()
- << "\n");
- LI.removeBlock(BB);
- }
-
- DetatchDeadBlocks(DeadLoopBlocks, &DTUpdates, /*KeepOneInputPHIs*/true);
- DTU.applyUpdates(DTUpdates);
- DTUpdates.clear();
- for (auto *BB : DeadLoopBlocks)
- DTU.deleteBB(BB);
-
- NumLoopBlocksDeleted += DeadLoopBlocks.size();
- }
-
- /// Constant-fold terminators of blocks acculumated in FoldCandidates into the
- /// unconditional branches.
- void foldTerminators() {
- for (BasicBlock *BB : FoldCandidates) {
- assert(LI.getLoopFor(BB) == &L && "Should be a loop block!");
- BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
- assert(TheOnlySucc && "Should have one live successor!");
-
- LLVM_DEBUG(dbgs() << "Replacing terminator of " << BB->getName()
- << " with an unconditional branch to the block "
- << TheOnlySucc->getName() << "\n");
-
- SmallPtrSet<BasicBlock *, 2> DeadSuccessors;
- // Remove all BB's successors except for the live one.
- unsigned TheOnlySuccDuplicates = 0;
- for (auto *Succ : successors(BB))
- if (Succ != TheOnlySucc) {
- DeadSuccessors.insert(Succ);
- // If our successor lies in a different loop, we don't want to remove
- // the one-input Phi because it is a LCSSA Phi.
- bool PreserveLCSSAPhi = !L.contains(Succ);
- Succ->removePredecessor(BB, PreserveLCSSAPhi);
- if (MSSAU)
- MSSAU->removeEdge(BB, Succ);
- } else
- ++TheOnlySuccDuplicates;
-
- assert(TheOnlySuccDuplicates > 0 && "Should be!");
- // If TheOnlySucc was BB's successor more than once, after transform it
- // will be its successor only once. Remove redundant inputs from
- // TheOnlySucc's Phis.
- bool PreserveLCSSAPhi = !L.contains(TheOnlySucc);
- for (unsigned Dup = 1; Dup < TheOnlySuccDuplicates; ++Dup)
- TheOnlySucc->removePredecessor(BB, PreserveLCSSAPhi);
- if (MSSAU && TheOnlySuccDuplicates > 1)
- MSSAU->removeDuplicatePhiEdgesBetween(BB, TheOnlySucc);
-
- IRBuilder<> Builder(BB->getContext());
- Instruction *Term = BB->getTerminator();
- Builder.SetInsertPoint(Term);
- Builder.CreateBr(TheOnlySucc);
- Term->eraseFromParent();
-
- for (auto *DeadSucc : DeadSuccessors)
- DTUpdates.push_back({DominatorTree::Delete, BB, DeadSucc});
-
- ++NumTerminatorsFolded;
- }
- }
-
-public:
- ConstantTerminatorFoldingImpl(Loop &L, LoopInfo &LI, DominatorTree &DT,
- ScalarEvolution &SE,
- MemorySSAUpdater *MSSAU)
- : L(L), LI(LI), DT(DT), SE(SE), MSSAU(MSSAU), DFS(&L),
- DTU(DT, DomTreeUpdater::UpdateStrategy::Eager) {}
- bool run() {
- assert(L.getLoopLatch() && "Should be single latch!");
-
- // Collect all available information about status of blocks after constant
- // folding.
- analyze();
- BasicBlock *Header = L.getHeader();
- (void)Header;
-
- LLVM_DEBUG(dbgs() << "In function " << Header->getParent()->getName()
- << ": ");
-
- if (HasIrreducibleCFG) {
- LLVM_DEBUG(dbgs() << "Loops with irreducible CFG are not supported!\n");
- return false;
- }
-
- // Nothing to constant-fold.
- if (FoldCandidates.empty()) {
- LLVM_DEBUG(
- dbgs() << "No constant terminator folding candidates found in loop "
- << Header->getName() << "\n");
- return false;
- }
-
- // TODO: Support deletion of the current loop.
- if (DeleteCurrentLoop) {
- LLVM_DEBUG(
- dbgs()
- << "Give up constant terminator folding in loop " << Header->getName()
- << ": we don't currently support deletion of the current loop.\n");
- return false;
- }
-
- // TODO: Support blocks that are not dead, but also not in loop after the
- // folding.
- if (BlocksInLoopAfterFolding.size() + DeadLoopBlocks.size() !=
- L.getNumBlocks()) {
- LLVM_DEBUG(
- dbgs() << "Give up constant terminator folding in loop "
- << Header->getName() << ": we don't currently"
- " support blocks that are not dead, but will stop "
- "being a part of the loop after constant-folding.\n");
- return false;
- }
-
- SE.forgetTopmostLoop(&L);
- // Dump analysis results.
- LLVM_DEBUG(dump());
-
- LLVM_DEBUG(dbgs() << "Constant-folding " << FoldCandidates.size()
- << " terminators in loop " << Header->getName() << "\n");
-
- // Make the actual transforms.
- handleDeadExits();
- foldTerminators();
-
- if (!DeadLoopBlocks.empty()) {
- LLVM_DEBUG(dbgs() << "Deleting " << DeadLoopBlocks.size()
- << " dead blocks in loop " << Header->getName() << "\n");
- deleteDeadLoopBlocks();
- } else {
- // If we didn't do updates inside deleteDeadLoopBlocks, do them here.
- DTU.applyUpdates(DTUpdates);
- DTUpdates.clear();
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
-#ifndef NDEBUG
- // Make sure that we have preserved all data structures after the transform.
-#if defined(EXPENSIVE_CHECKS)
- assert(DT.verify(DominatorTree::VerificationLevel::Full) &&
- "DT broken after transform!");
-#else
- assert(DT.verify(DominatorTree::VerificationLevel::Fast) &&
- "DT broken after transform!");
-#endif
- assert(DT.isReachableFromEntry(Header));
- LI.verify(DT);
-#endif
-
- return true;
- }
-
- bool foldingBreaksCurrentLoop() const {
- return DeleteCurrentLoop;
- }
-};
-} // namespace
-
-/// Turn branches and switches with known constant conditions into unconditional
-/// branches.
-static bool constantFoldTerminators(Loop &L, DominatorTree &DT, LoopInfo &LI,
- ScalarEvolution &SE,
- MemorySSAUpdater *MSSAU,
- bool &IsLoopDeleted) {
- if (!EnableTermFolding)
- return false;
-
- // To keep things simple, only process loops with single latch. We
- // canonicalize most loops to this form. We can support multi-latch if needed.
- if (!L.getLoopLatch())
- return false;
-
- ConstantTerminatorFoldingImpl BranchFolder(L, LI, DT, SE, MSSAU);
- bool Changed = BranchFolder.run();
- IsLoopDeleted = Changed && BranchFolder.foldingBreaksCurrentLoop();
- return Changed;
-}
-
-static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
- LoopInfo &LI, MemorySSAUpdater *MSSAU) {
- bool Changed = false;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- // Copy blocks into a temporary array to avoid iterator invalidation issues
- // as we remove them.
- SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
-
- for (auto &Block : Blocks) {
- // Attempt to merge blocks in the trivial case. Don't modify blocks which
- // belong to other loops.
- BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
- if (!Succ)
- continue;
-
- BasicBlock *Pred = Succ->getSinglePredecessor();
- if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
- continue;
-
- // Merge Succ into Pred and delete it.
- MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- Changed = true;
- }
-
- return Changed;
-}
-
-static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
- ScalarEvolution &SE, MemorySSAUpdater *MSSAU,
- bool &IsLoopDeleted) {
- bool Changed = false;
-
- // Constant-fold terminators with known constant conditions.
- Changed |= constantFoldTerminators(L, DT, LI, SE, MSSAU, IsLoopDeleted);
-
- if (IsLoopDeleted)
- return true;
-
- // Eliminate unconditional branches by merging blocks into their predecessors.
- Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU);
-
- if (Changed)
- SE.forgetTopmostLoop(&L);
-
- return Changed;
-}
-
-PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &LPMU) {
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA)
- MSSAU = MemorySSAUpdater(AR.MSSA);
- bool DeleteCurrentLoop = false;
- if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
- DeleteCurrentLoop))
- return PreservedAnalyses::all();
-
- if (DeleteCurrentLoop)
- LPMU.markLoopAsDeleted(L, "loop-simplifycfg");
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-class LoopSimplifyCFGLegacyPass : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- LoopSimplifyCFGLegacyPass() : LoopPass(ID) {
- initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- }
- bool DeleteCurrentLoop = false;
- bool Changed = simplifyLoopCFG(
- *L, DT, LI, SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
- DeleteCurrentLoop);
- if (DeleteCurrentLoop)
- LPM.markLoopAsDeleted(*L);
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- AU.addPreserved<DependenceAnalysisWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-} // end namespace
-
-char LoopSimplifyCFGLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
- "Simplify loop CFG", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
- "Simplify loop CFG", false, false)
-
-Pass *llvm::createLoopSimplifyCFGPass() {
- return new LoopSimplifyCFGLegacyPass();
-}
+ for (auto *PL = DL->getParentLoop(); PL; PL = PL->getParentLoop())
+ for (auto *BB : DL->getBlocks())
+ PL->removeBlockFromLoop(BB);
+ DL->getParentLoop()->removeChildLoop(DL);
+ LI.addTopLevelLoop(DL);
+ }
+ LI.erase(DL);
+ }
+
+ for (auto *BB : DeadLoopBlocks) {
+ assert(BB != L.getHeader() &&
+ "Header of the current loop cannot be dead!");
+ LLVM_DEBUG(dbgs() << "Deleting dead loop block " << BB->getName()
+ << "\n");
+ LI.removeBlock(BB);
+ }
+
+ DetatchDeadBlocks(DeadLoopBlocks, &DTUpdates, /*KeepOneInputPHIs*/true);
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+ for (auto *BB : DeadLoopBlocks)
+ DTU.deleteBB(BB);
+
+ NumLoopBlocksDeleted += DeadLoopBlocks.size();
+ }
+
+  /// Constant-fold terminators of blocks accumulated in FoldCandidates into
+  /// unconditional branches.
+ void foldTerminators() {
+ for (BasicBlock *BB : FoldCandidates) {
+ assert(LI.getLoopFor(BB) == &L && "Should be a loop block!");
+ BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
+ assert(TheOnlySucc && "Should have one live successor!");
+
+ LLVM_DEBUG(dbgs() << "Replacing terminator of " << BB->getName()
+ << " with an unconditional branch to the block "
+ << TheOnlySucc->getName() << "\n");
+
+ SmallPtrSet<BasicBlock *, 2> DeadSuccessors;
+ // Remove all BB's successors except for the live one.
+ unsigned TheOnlySuccDuplicates = 0;
+ for (auto *Succ : successors(BB))
+ if (Succ != TheOnlySucc) {
+ DeadSuccessors.insert(Succ);
+ // If our successor lies in a different loop, we don't want to remove
+ // the one-input Phi because it is a LCSSA Phi.
+ bool PreserveLCSSAPhi = !L.contains(Succ);
+ Succ->removePredecessor(BB, PreserveLCSSAPhi);
+ if (MSSAU)
+ MSSAU->removeEdge(BB, Succ);
+ } else
+ ++TheOnlySuccDuplicates;
+
+ assert(TheOnlySuccDuplicates > 0 && "Should be!");
+ // If TheOnlySucc was BB's successor more than once, after transform it
+ // will be its successor only once. Remove redundant inputs from
+ // TheOnlySucc's Phis.
+ bool PreserveLCSSAPhi = !L.contains(TheOnlySucc);
+ for (unsigned Dup = 1; Dup < TheOnlySuccDuplicates; ++Dup)
+ TheOnlySucc->removePredecessor(BB, PreserveLCSSAPhi);
+ if (MSSAU && TheOnlySuccDuplicates > 1)
+ MSSAU->removeDuplicatePhiEdgesBetween(BB, TheOnlySucc);
+
+ IRBuilder<> Builder(BB->getContext());
+ Instruction *Term = BB->getTerminator();
+ Builder.SetInsertPoint(Term);
+ Builder.CreateBr(TheOnlySucc);
+ Term->eraseFromParent();
+
+ for (auto *DeadSucc : DeadSuccessors)
+ DTUpdates.push_back({DominatorTree::Delete, BB, DeadSucc});
+
+ ++NumTerminatorsFolded;
+ }
+ }
+
+public:
+ ConstantTerminatorFoldingImpl(Loop &L, LoopInfo &LI, DominatorTree &DT,
+ ScalarEvolution &SE,
+ MemorySSAUpdater *MSSAU)
+ : L(L), LI(LI), DT(DT), SE(SE), MSSAU(MSSAU), DFS(&L),
+ DTU(DT, DomTreeUpdater::UpdateStrategy::Eager) {}
+ bool run() {
+ assert(L.getLoopLatch() && "Should be single latch!");
+
+ // Collect all available information about status of blocks after constant
+ // folding.
+ analyze();
+ BasicBlock *Header = L.getHeader();
+ (void)Header;
+
+ LLVM_DEBUG(dbgs() << "In function " << Header->getParent()->getName()
+ << ": ");
+
+ if (HasIrreducibleCFG) {
+ LLVM_DEBUG(dbgs() << "Loops with irreducible CFG are not supported!\n");
+ return false;
+ }
+
+ // Nothing to constant-fold.
+ if (FoldCandidates.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "No constant terminator folding candidates found in loop "
+ << Header->getName() << "\n");
+ return false;
+ }
+
+ // TODO: Support deletion of the current loop.
+ if (DeleteCurrentLoop) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Give up constant terminator folding in loop " << Header->getName()
+ << ": we don't currently support deletion of the current loop.\n");
+ return false;
+ }
+
+ // TODO: Support blocks that are not dead, but also not in loop after the
+ // folding.
+ if (BlocksInLoopAfterFolding.size() + DeadLoopBlocks.size() !=
+ L.getNumBlocks()) {
+ LLVM_DEBUG(
+ dbgs() << "Give up constant terminator folding in loop "
+ << Header->getName() << ": we don't currently"
+ " support blocks that are not dead, but will stop "
+ "being a part of the loop after constant-folding.\n");
+ return false;
+ }
+
+ SE.forgetTopmostLoop(&L);
+ // Dump analysis results.
+ LLVM_DEBUG(dump());
+
+ LLVM_DEBUG(dbgs() << "Constant-folding " << FoldCandidates.size()
+ << " terminators in loop " << Header->getName() << "\n");
+
+ // Make the actual transforms.
+ handleDeadExits();
+ foldTerminators();
+
+ if (!DeadLoopBlocks.empty()) {
+ LLVM_DEBUG(dbgs() << "Deleting " << DeadLoopBlocks.size()
+ << " dead blocks in loop " << Header->getName() << "\n");
+ deleteDeadLoopBlocks();
+ } else {
+ // If we didn't do updates inside deleteDeadLoopBlocks, do them here.
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+#ifndef NDEBUG
+ // Make sure that we have preserved all data structures after the transform.
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full) &&
+ "DT broken after transform!");
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast) &&
+ "DT broken after transform!");
+#endif
+ assert(DT.isReachableFromEntry(Header));
+ LI.verify(DT);
+#endif
+
+ return true;
+ }
+
+ bool foldingBreaksCurrentLoop() const {
+ return DeleteCurrentLoop;
+ }
+};
+} // namespace
+
+/// Turn branches and switches with known constant conditions into unconditional
+/// branches.
+static bool constantFoldTerminators(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE,
+ MemorySSAUpdater *MSSAU,
+ bool &IsLoopDeleted) {
+ if (!EnableTermFolding)
+ return false;
+
+  // To keep things simple, only process loops with a single latch. We
+ // canonicalize most loops to this form. We can support multi-latch if needed.
+ if (!L.getLoopLatch())
+ return false;
+
+ ConstantTerminatorFoldingImpl BranchFolder(L, LI, DT, SE, MSSAU);
+ bool Changed = BranchFolder.run();
+ IsLoopDeleted = Changed && BranchFolder.foldingBreaksCurrentLoop();
+ return Changed;
+}
+
+static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
+ LoopInfo &LI, MemorySSAUpdater *MSSAU) {
+ bool Changed = false;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ // Copy blocks into a temporary array to avoid iterator invalidation issues
+ // as we remove them.
+ SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
+
+ for (auto &Block : Blocks) {
+ // Attempt to merge blocks in the trivial case. Don't modify blocks which
+ // belong to other loops.
+ BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
+ if (!Succ)
+ continue;
+
+ BasicBlock *Pred = Succ->getSinglePredecessor();
+ if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
+ continue;
+
+ // Merge Succ into Pred and delete it.
+ MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE, MemorySSAUpdater *MSSAU,
+ bool &IsLoopDeleted) {
+ bool Changed = false;
+
+ // Constant-fold terminators with known constant conditions.
+ Changed |= constantFoldTerminators(L, DT, LI, SE, MSSAU, IsLoopDeleted);
+
+ if (IsLoopDeleted)
+ return true;
+
+ // Eliminate unconditional branches by merging blocks into their predecessors.
+ Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU);
+
+ if (Changed)
+ SE.forgetTopmostLoop(&L);
+
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &LPMU) {
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA)
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ bool DeleteCurrentLoop = false;
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+ DeleteCurrentLoop))
+ return PreservedAnalyses::all();
+
+ if (DeleteCurrentLoop)
+ LPMU.markLoopAsDeleted(L, "loop-simplifycfg");
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+class LoopSimplifyCFGLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopSimplifyCFGLegacyPass() : LoopPass(ID) {
+ initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = MemorySSAUpdater(MSSA);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+ bool DeleteCurrentLoop = false;
+ bool Changed = simplifyLoopCFG(
+ *L, DT, LI, SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+ DeleteCurrentLoop);
+ if (DeleteCurrentLoop)
+ LPM.markLoopAsDeleted(*L);
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+} // end namespace
+
+char LoopSimplifyCFGLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+
+Pass *llvm::createLoopSimplifyCFGPass() {
+ return new LoopSimplifyCFGLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp
index 5ea1f430c3..47698fdde6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp
@@ -1,74 +1,74 @@
-//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass does the inverse transformation of what LICM does.
-// It traverses all of the instructions in the loop's preheader and sinks
-// them to the loop body where frequency is lower than the loop's preheader.
-// This pass is a reverse-transformation of LICM. It differs from the Sink
-// pass in the following ways:
-//
-// * It only handles sinking of instructions from the loop's preheader to the
-// loop's body
-// * It uses alias set tracker to get more accurate alias info
-// * It uses block frequency info to find the optimal sinking locations
-//
-// Overall algorithm:
-//
-// For I in Preheader:
-// InsertBBs = BBs that uses I
-// For BB in sorted(LoopBBs):
-// DomBBs = BBs in InsertBBs that are dominated by BB
-// if freq(DomBBs) > freq(BB)
-// InsertBBs = UseBBs - DomBBs + BB
-// For BB in InsertBBs:
-// Insert I at BB's beginning
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopSink.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does the inverse transformation of what LICM does.
+// It traverses all of the instructions in the loop's preheader and sinks
+// them into loop-body blocks whose frequency is lower than the preheader's.
+// This pass is a reverse transformation of LICM. It differs from the Sink
+// pass in the following ways:
+//
+// * It only handles sinking of instructions from the loop's preheader to the
+// loop's body
+// * It uses alias set tracker to get more accurate alias info
+// * It uses block frequency info to find the optimal sinking locations
+//
+// Overall algorithm:
+//
+// For I in Preheader:
+//   InsertBBs = BBs that use I
+// For BB in sorted(LoopBBs):
+// DomBBs = BBs in InsertBBs that are dominated by BB
+// if freq(DomBBs) > freq(BB)
+// InsertBBs = UseBBs - DomBBs + BB
+// For BB in InsertBBs:
+// Insert I at BB's beginning
+//
+//===----------------------------------------------------------------------===//
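+
+// Illustrative example (a sketch under assumed block frequencies, not taken
+// from a test): if %t = add i64 %a, %b is computed in the preheader but its
+// only use sits in a loop block that is colder than the preheader, the add is
+// sunk to the beginning of that block; if the adjusted total frequency of its
+// use blocks exceeds the preheader's, the instruction stays in the preheader.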
+
+#include "llvm/Transforms/Scalar/LoopSink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loopsink"
-
-STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
-STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
-
-static cl::opt<unsigned> SinkFrequencyPercentThreshold(
- "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
- cl::desc("Do not sink instructions that require cloning unless they "
- "execute less than this percent of the time."));
-
-static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
- "max-uses-for-sinking", cl::Hidden, cl::init(30),
- cl::desc("Do not sink instructions that have too many uses."));
-
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loopsink"
+
+STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
+STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
+
+static cl::opt<unsigned> SinkFrequencyPercentThreshold(
+ "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
+ cl::desc("Do not sink instructions that require cloning unless they "
+ "execute less than this percent of the time."));
+
+static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
+ "max-uses-for-sinking", cl::Hidden, cl::init(30),
+ cl::desc("Do not sink instructions that have too many uses."));
+
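+// Illustrative tuning sketch (not part of the original file; the flag names
+// are the ones defined above, and "loop-sink" is assumed to be the pass name
+// in the new pass manager):
+//   opt -passes=loop-sink -sink-freq-percent-threshold=80 \
+//       -max-uses-for-sinking=10 input.ll -S
+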
static cl::opt<bool> EnableMSSAInLoopSink(
"enable-mssa-in-loop-sink", cl::Hidden, cl::init(true),
cl::desc("Enable MemorySSA for LoopSink in new pass manager"));
@@ -77,167 +77,167 @@ static cl::opt<bool> EnableMSSAInLegacyLoopSink(
"enable-mssa-in-legacy-loop-sink", cl::Hidden, cl::init(false),
cl::desc("Enable MemorySSA for LoopSink in legacy pass manager"));
-/// Return adjusted total frequency of \p BBs.
-///
-/// * If there is only one BB, sinking instruction will not introduce code
-/// size increase. Thus there is no need to adjust the frequency.
-/// * If there are more than one BB, sinking would lead to code size increase.
-/// In this case, we add some "tax" to the total frequency to make it harder
-/// to sink. E.g.
-/// Freq(Preheader) = 100
-/// Freq(BBs) = sum(50, 49) = 99
-/// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to
-/// BBs as the difference is too small to justify the code size increase.
-/// To model this, The adjusted Freq(BBs) will be:
-/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
-static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
- BlockFrequencyInfo &BFI) {
- BlockFrequency T = 0;
- for (BasicBlock *B : BBs)
- T += BFI.getBlockFreq(B);
- if (BBs.size() > 1)
- T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
- return T;
-}
-
-/// Return a set of basic blocks to insert sinked instructions.
-///
-/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
-///
-/// * Inside the loop \p L
-/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
-/// that domintates the UseBB
-/// * Has minimum total frequency that is no greater than preheader frequency
-///
-/// The purpose of the function is to find the optimal sinking points to
-/// minimize execution cost, which is defined as "sum of frequency of
-/// BBsToSinkInto".
-/// As a result, the returned BBsToSinkInto needs to have minimum total
-/// frequency.
-/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
-/// frequency, the optimal solution is not sinking (return empty set).
-///
-/// \p ColdLoopBBs is used to help find the optimal sinking locations.
-/// It stores a list of BBs that is:
-///
-/// * Inside the loop \p L
-/// * Has a frequency no larger than the loop's preheader
-/// * Sorted by BB frequency
-///
-/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
-/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
-/// caller.
-static SmallPtrSet<BasicBlock *, 2>
-findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
- const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
- DominatorTree &DT, BlockFrequencyInfo &BFI) {
- SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
- if (UseBBs.size() == 0)
- return BBsToSinkInto;
-
- BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
- SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
-
- // For every iteration:
- // * Pick the ColdestBB from ColdLoopBBs
- // * Find the set BBsDominatedByColdestBB that satisfy:
- // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
- // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
- // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
- // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
- // BBsToSinkInto
- for (BasicBlock *ColdestBB : ColdLoopBBs) {
- BBsDominatedByColdestBB.clear();
- for (BasicBlock *SinkedBB : BBsToSinkInto)
- if (DT.dominates(ColdestBB, SinkedBB))
- BBsDominatedByColdestBB.insert(SinkedBB);
- if (BBsDominatedByColdestBB.size() == 0)
- continue;
- if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
- BFI.getBlockFreq(ColdestBB)) {
- for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
- BBsToSinkInto.erase(DominatedBB);
- }
- BBsToSinkInto.insert(ColdestBB);
- }
- }
-
- // Can't sink into blocks that have no valid insertion point.
- for (BasicBlock *BB : BBsToSinkInto) {
- if (BB->getFirstInsertionPt() == BB->end()) {
- BBsToSinkInto.clear();
- break;
- }
- }
-
- // If the total frequency of BBsToSinkInto is larger than preheader frequency,
- // do not sink.
- if (adjustedSumFreq(BBsToSinkInto, BFI) >
- BFI.getBlockFreq(L.getLoopPreheader()))
- BBsToSinkInto.clear();
- return BBsToSinkInto;
-}
-
-// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
-// sinking is successful.
-// \p LoopBlockNumber is used to sort the insertion blocks to ensure
-// determinism.
+/// Return adjusted total frequency of \p BBs.
+///
+/// * If there is only one BB, sinking an instruction will not introduce a
+///   code size increase. Thus there is no need to adjust the frequency.
+/// * If there is more than one BB, sinking would lead to a code size increase.
+/// In this case, we add some "tax" to the total frequency to make it harder
+/// to sink. E.g.
+/// Freq(Preheader) = 100
+/// Freq(BBs) = sum(50, 49) = 99
+///   Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheader to
+/// BBs as the difference is too small to justify the code size increase.
+///   To model this, the adjusted Freq(BBs) will be:
+/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
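+///   For the numbers above this gives AdjustedFreq(BBs) = 99 / 0.9 = 110,
+///   which exceeds Freq(Preheader) = 100, so sinking is rejected (an
+///   illustrative continuation of the example above).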
+static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
+ BlockFrequencyInfo &BFI) {
+ BlockFrequency T = 0;
+ for (BasicBlock *B : BBs)
+ T += BFI.getBlockFreq(B);
+ if (BBs.size() > 1)
+ T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
+ return T;
+}
+
+/// Return a set of basic blocks into which to insert sunk instructions.
+///
+/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
+///
+/// * Inside the loop \p L
+/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
+///   that dominates the UseBB
+/// * Has minimum total frequency that is no greater than preheader frequency
+///
+/// The purpose of the function is to find the optimal sinking points to
+/// minimize execution cost, which is defined as "sum of frequency of
+/// BBsToSinkInto".
+/// As a result, the returned BBsToSinkInto needs to have minimum total
+/// frequency.
+/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
+/// frequency, the optimal solution is not sinking (return empty set).
+///
+/// \p ColdLoopBBs is used to help find the optimal sinking locations.
+/// It stores a list of BBs that is:
+///
+/// * Inside the loop \p L
+/// * Has a frequency no larger than the loop's preheader
+/// * Sorted by BB frequency
+///
+/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
+/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
+/// caller.
+static SmallPtrSet<BasicBlock *, 2>
+findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ DominatorTree &DT, BlockFrequencyInfo &BFI) {
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
+ if (UseBBs.size() == 0)
+ return BBsToSinkInto;
+
+ BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
+ SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
+
+ // For every iteration:
+ // * Pick the ColdestBB from ColdLoopBBs
+ // * Find the set BBsDominatedByColdestBB that satisfy:
+ // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
+ // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
+ // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
+ // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
+ // BBsToSinkInto
+ for (BasicBlock *ColdestBB : ColdLoopBBs) {
+ BBsDominatedByColdestBB.clear();
+ for (BasicBlock *SinkedBB : BBsToSinkInto)
+ if (DT.dominates(ColdestBB, SinkedBB))
+ BBsDominatedByColdestBB.insert(SinkedBB);
+ if (BBsDominatedByColdestBB.size() == 0)
+ continue;
+ if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
+ BFI.getBlockFreq(ColdestBB)) {
+ for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
+ BBsToSinkInto.erase(DominatedBB);
+ }
+ BBsToSinkInto.insert(ColdestBB);
+ }
+ }
+
+ // Can't sink into blocks that have no valid insertion point.
+ for (BasicBlock *BB : BBsToSinkInto) {
+ if (BB->getFirstInsertionPt() == BB->end()) {
+ BBsToSinkInto.clear();
+ break;
+ }
+ }
+
+ // If the total frequency of BBsToSinkInto is larger than preheader frequency,
+ // do not sink.
+ if (adjustedSumFreq(BBsToSinkInto, BFI) >
+ BFI.getBlockFreq(L.getLoopPreheader()))
+ BBsToSinkInto.clear();
+ return BBsToSinkInto;
+}
+
+// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
+// sinking is successful.
+// \p LoopBlockNumber is used to sort the insertion blocks to ensure
+// determinism.
static bool sinkInstruction(
Loop &L, Instruction &I, const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, LoopInfo &LI,
DominatorTree &DT, BlockFrequencyInfo &BFI, MemorySSAUpdater *MSSAU) {
- // Compute the set of blocks in loop L which contain a use of I.
- SmallPtrSet<BasicBlock *, 2> BBs;
- for (auto &U : I.uses()) {
- Instruction *UI = cast<Instruction>(U.getUser());
- // We cannot sink I to PHI-uses.
- if (dyn_cast<PHINode>(UI))
- return false;
- // We cannot sink I if it has uses outside of the loop.
- if (!L.contains(LI.getLoopFor(UI->getParent())))
- return false;
- BBs.insert(UI->getParent());
- }
-
- // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
- // BBs.size() to avoid expensive computation.
- // FIXME: Handle code size growth for min_size and opt_size.
- if (BBs.size() > MaxNumberOfUseBBsForSinking)
- return false;
-
- // Find the set of BBs that we should insert a copy of I.
- SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
- findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
- if (BBsToSinkInto.empty())
- return false;
-
- // Return if any of the candidate blocks to sink into is non-cold.
- if (BBsToSinkInto.size() > 1) {
- for (auto *BB : BBsToSinkInto)
- if (!LoopBlockNumber.count(BB))
- return false;
- }
-
- // Copy the final BBs into a vector and sort them using the total ordering
- // of the loop block numbers as iterating the set doesn't give a useful
- // order. No need to stable sort as the block numbers are a total ordering.
- SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
+ // Compute the set of blocks in loop L which contain a use of I.
+ SmallPtrSet<BasicBlock *, 2> BBs;
+ for (auto &U : I.uses()) {
+ Instruction *UI = cast<Instruction>(U.getUser());
+ // We cannot sink I to PHI-uses.
+ if (dyn_cast<PHINode>(UI))
+ return false;
+ // We cannot sink I if it has uses outside of the loop.
+ if (!L.contains(LI.getLoopFor(UI->getParent())))
+ return false;
+ BBs.insert(UI->getParent());
+ }
+
+ // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
+ // BBs.size() to avoid expensive computation.
+ // FIXME: Handle code size growth for min_size and opt_size.
+ if (BBs.size() > MaxNumberOfUseBBsForSinking)
+ return false;
+
+  // Find the set of BBs into which we should insert a copy of I.
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
+ findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
+ if (BBsToSinkInto.empty())
+ return false;
+
+ // Return if any of the candidate blocks to sink into is non-cold.
+ if (BBsToSinkInto.size() > 1) {
+ for (auto *BB : BBsToSinkInto)
+ if (!LoopBlockNumber.count(BB))
+ return false;
+ }
+
+ // Copy the final BBs into a vector and sort them using the total ordering
+ // of the loop block numbers as iterating the set doesn't give a useful
+ // order. No need to stable sort as the block numbers are a total ordering.
+ SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
llvm::append_range(SortedBBsToSinkInto, BBsToSinkInto);
- llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) {
- return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
- });
-
- BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
- // FIXME: Optimize the efficiency for cloned value replacement. The current
- // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
- for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
- assert(LoopBlockNumber.find(N)->second >
- LoopBlockNumber.find(MoveBB)->second &&
- "BBs not sorted!");
- // Clone I and replace its uses.
- Instruction *IC = I.clone();
- IC->setName(I.getName());
- IC->insertBefore(&*N->getFirstInsertionPt());
+ llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) {
+ return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
+ });
+
+ BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
+ // FIXME: Optimize the efficiency for cloned value replacement. The current
+ // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
+ for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
+ assert(LoopBlockNumber.find(N)->second >
+ LoopBlockNumber.find(MoveBB)->second &&
+ "BBs not sorted!");
+ // Clone I and replace its uses.
+ Instruction *IC = I.clone();
+ IC->setName(I.getName());
+ IC->insertBefore(&*N->getFirstInsertionPt());
if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
// Create a new MemoryAccess and let MemorySSA set its defining access.
@@ -253,51 +253,51 @@ static bool sinkInstruction(
}
}
- // Replaces uses of I with IC in N
- I.replaceUsesWithIf(IC, [N](Use &U) {
- return cast<Instruction>(U.getUser())->getParent() == N;
- });
- // Replaces uses of I with IC in blocks dominated by N
- replaceDominatedUsesWith(&I, IC, DT, N);
- LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
- << '\n');
- NumLoopSunkCloned++;
- }
- LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
- NumLoopSunk++;
- I.moveBefore(&*MoveBB->getFirstInsertionPt());
-
+ // Replaces uses of I with IC in N
+ I.replaceUsesWithIf(IC, [N](Use &U) {
+ return cast<Instruction>(U.getUser())->getParent() == N;
+ });
+ // Replaces uses of I with IC in blocks dominated by N
+ replaceDominatedUsesWith(&I, IC, DT, N);
+ LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
+ NumLoopSunkCloned++;
+ }
+ LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ NumLoopSunk++;
+ I.moveBefore(&*MoveBB->getFirstInsertionPt());
+
if (MSSAU)
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
MSSAU->getMemorySSA()->getMemoryAccess(&I)))
MSSAU->moveToPlace(OldMemAcc, MoveBB, MemorySSA::Beginning);
- return true;
-}
-
-/// Sinks instructions from the loop's preheader to the loop body if the total
-/// frequency of the inserted copies is smaller than the preheader's frequency.
-static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
- DominatorTree &DT,
- BlockFrequencyInfo &BFI,
+ return true;
+}
+
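To make the scheme above concrete (an illustration inferred from the code, not additional behaviour): if the sorted cold candidate blocks are B1 < B4 < B6 by loop block number, the original instruction I is moved to B1, a clone of I is inserted at the first insertion point of B4 and of B6, uses of I located in or dominated by B4 or B6 are rewired to the corresponding clone, and MemorySSA, when present, is updated for both the clones and the moved instruction.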
+/// Sinks instructions from the loop's preheader to the loop body if the total
+/// frequency of the inserted copies is smaller than the preheader's frequency.
+static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
+ DominatorTree &DT,
+ BlockFrequencyInfo &BFI,
ScalarEvolution *SE,
AliasSetTracker *CurAST,
MemorySSA *MSSA) {
- BasicBlock *Preheader = L.getLoopPreheader();
+ BasicBlock *Preheader = L.getLoopPreheader();
assert(Preheader && "Expected loop to have preheader");
-
+
assert(Preheader->getParent()->hasProfileData() &&
"Unexpected call when profile data unavailable.");
-
- const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
- // If there are no basic blocks with lower frequency than the preheader then
- // we can avoid the detailed analysis as we will never find profitable sinking
- // opportunities.
- if (all_of(L.blocks(), [&](const BasicBlock *BB) {
- return BFI.getBlockFreq(BB) > PreheaderFreq;
- }))
- return false;
-
+
+ const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
+ // If there are no basic blocks with lower frequency than the preheader then
+ // we can avoid the detailed analysis as we will never find profitable sinking
+ // opportunities.
+ if (all_of(L.blocks(), [&](const BasicBlock *BB) {
+ return BFI.getBlockFreq(BB) > PreheaderFreq;
+ }))
+ return false;
+
std::unique_ptr<MemorySSAUpdater> MSSAU;
std::unique_ptr<SinkAndHoistLICMFlags> LICMFlags;
if (MSSA) {
@@ -306,42 +306,42 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
std::make_unique<SinkAndHoistLICMFlags>(/*IsSink=*/true, &L, MSSA);
}
- bool Changed = false;
-
- // Sort loop's basic blocks by frequency
- SmallVector<BasicBlock *, 10> ColdLoopBBs;
- SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
- int i = 0;
- for (BasicBlock *B : L.blocks())
- if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
- ColdLoopBBs.push_back(B);
- LoopBlockNumber[B] = ++i;
- }
- llvm::stable_sort(ColdLoopBBs, [&](BasicBlock *A, BasicBlock *B) {
- return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
- });
-
-  // Traverse preheader's instructions in reverse order because if A depends
-  // on B (A appears after B), A needs to be sunk first before B can be
-  // sunk.
- for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
- Instruction *I = &*II++;
-    // No need to check whether the instruction's operands are loop invariant.
- assert(L.hasLoopInvariantOperands(I) &&
- "Insts in a loop's preheader should have loop invariant operands!");
+ bool Changed = false;
+
+ // Sort loop's basic blocks by frequency
+ SmallVector<BasicBlock *, 10> ColdLoopBBs;
+ SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
+ int i = 0;
+ for (BasicBlock *B : L.blocks())
+ if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
+ ColdLoopBBs.push_back(B);
+ LoopBlockNumber[B] = ++i;
+ }
+ llvm::stable_sort(ColdLoopBBs, [&](BasicBlock *A, BasicBlock *B) {
+ return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
+ });
+
+  // Traverse preheader's instructions in reverse order because if A depends
+  // on B (A appears after B), A needs to be sunk first before B can be
+  // sunk.
+ for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
+ Instruction *I = &*II++;
+    // No need to check whether the instruction's operands are loop invariant.
+ assert(L.hasLoopInvariantOperands(I) &&
+ "Insts in a loop's preheader should have loop invariant operands!");
if (!canSinkOrHoistInst(*I, &AA, &DT, &L, CurAST, MSSAU.get(), false,
LICMFlags.get()))
- continue;
+ continue;
if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI,
MSSAU.get()))
- Changed = true;
- }
-
- if (Changed && SE)
- SE->forgetLoopDispositions(&L);
- return Changed;
-}
-
+ Changed = true;
+ }
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(&L);
+ return Changed;
+}
+
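The decision that both the early exit above and findBBsToSinkInto implement is a frequency comparison: sinking only pays off when the summed frequency of the blocks that will hold the instruction (or its clones) stays below the preheader's frequency. A minimal standalone sketch of that rule, using plain integers rather than the BlockFrequency values BFI actually provides:

#include <cstdint>
#include <vector>

// Toy model of the LoopSink profitability rule: the copies together must be
// executed less often than the single instruction in the preheader would be.
static bool sinkIsProfitable(uint64_t PreheaderFreq,
                             const std::vector<uint64_t> &SinkBlockFreqs) {
  uint64_t Sum = 0;
  for (uint64_t Freq : SinkBlockFreqs)
    Sum += Freq;
  return Sum < PreheaderFreq;
}

// Example: preheader frequency 100 with copies in blocks of frequencies 30
// and 20 (sum 50) is profitable; frequencies 80 and 40 (sum 120) is not.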
static void computeAliasSet(Loop &L, BasicBlock &Preheader,
AliasSetTracker &CurAST) {
for (BasicBlock *BB : L.blocks())
@@ -349,31 +349,31 @@ static void computeAliasSet(Loop &L, BasicBlock &Preheader,
CurAST.add(Preheader);
}
-PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
- LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
- // Nothing to do if there are no loops.
- if (LI.empty())
- return PreservedAnalyses::all();
-
- AAResults &AA = FAM.getResult<AAManager>(F);
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
-
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
MemorySSA *MSSA = EnableMSSAInLoopSink
? &FAM.getResult<MemorySSAAnalysis>(F).getMSSA()
: nullptr;
- // We want to do a postorder walk over the loops. Since loops are a tree this
- // is equivalent to a reversed preorder walk and preorder is easy to compute
- // without recursion. Since we reverse the preorder, we will visit siblings
- // in reverse program order. This isn't expected to matter at all but is more
- // consistent with sinking algorithms which generally work bottom-up.
- SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
-
- bool Changed = false;
- do {
- Loop &L = *PreorderLoops.pop_back_val();
-
+ // We want to do a postorder walk over the loops. Since loops are a tree this
+ // is equivalent to a reversed preorder walk and preorder is easy to compute
+ // without recursion. Since we reverse the preorder, we will visit siblings
+ // in reverse program order. This isn't expected to matter at all but is more
+ // consistent with sinking algorithms which generally work bottom-up.
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
BasicBlock *Preheader = L.getLoopPreheader();
if (!Preheader)
continue;
@@ -389,19 +389,19 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
computeAliasSet(L, *Preheader, *CurAST.get());
}
- // Note that we don't pass SCEV here because it is only used to invalidate
- // loops in SCEV and we don't preserve (or request) SCEV at all making that
- // unnecessary.
- Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ // Note that we don't pass SCEV here because it is only used to invalidate
+ // loops in SCEV and we don't preserve (or request) SCEV at all making that
+ // unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
/*ScalarEvolution*/ nullptr,
CurAST.get(), MSSA);
- } while (!PreorderLoops.empty());
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
if (MSSA) {
PA.preserve<MemorySSAAnalysis>();
@@ -410,20 +410,20 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
MSSA->verifyMemorySSA();
}
- return PA;
-}
-
-namespace {
-struct LegacyLoopSinkPass : public LoopPass {
- static char ID;
- LegacyLoopSinkPass() : LoopPass(ID) {
- initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
+ return PA;
+}
+
+namespace {
+struct LegacyLoopSinkPass : public LoopPass {
+ static char ID;
+ LegacyLoopSinkPass() : LoopPass(ID) {
+ initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader)
return false;
@@ -434,7 +434,7 @@ struct LegacyLoopSinkPass : public LoopPass {
return false;
AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
std::unique_ptr<AliasSetTracker> CurAST;
MemorySSA *MSSA = nullptr;
if (EnableMSSAInLegacyLoopSink)
@@ -446,34 +446,34 @@ struct LegacyLoopSinkPass : public LoopPass {
bool Changed = sinkLoopInvariantInstructions(
*L, AA, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
SE ? &SE->getSE() : nullptr, CurAST.get(), MSSA);
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
if (EnableMSSAInLegacyLoopSink) {
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
}
- }
-};
-}
-
-char LegacyLoopSinkPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+ }
+};
+}
+
+char LegacyLoopSinkPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
-
-Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
+INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
+
+Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index a528f34c15..5dec9b5420 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1,5631 +1,5631 @@
-//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transformation analyzes and transforms the induction variables (and
-// computations derived from them) into forms suitable for efficient execution
-// on the target.
-//
-// This pass performs a strength reduction on array references inside loops that
-// have as one or more of their components the loop induction variable, it
-// rewrites expressions to take advantage of scaled-index addressing modes
-// available on the target, and it performs a variety of other optimizations
-// related to loop induction variables.
-//
-// Terminology note: this code has a lot of handling for "post-increment" or
-// "post-inc" users. This is not talking about post-increment addressing modes;
-// it is instead talking about code like this:
-//
-// %i = phi [ 0, %entry ], [ %i.next, %latch ]
-// ...
-// %i.next = add %i, 1
-// %c = icmp eq %i.next, %n
-//
-// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
-// it's useful to think about these as the same register, with some uses using
-// the value of the register before the add and some using it after. In this
-// example, the icmp is a post-increment user, since it uses %i.next, which is
-// the value of the induction variable after the increment. The other common
-// case of post-increment users is users outside the loop.
-//
-// TODO: More sophistication in the way Formulae are generated and filtered.
-//
-// TODO: Handle multiple loops at a time.
-//
-// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
-// of a GlobalValue?
-//
-// TODO: When truncation is free, truncate ICmp users' operands to make it a
-// smaller encoding (on x86 at least).
-//
-// TODO: When a negated register is used by an add (such as in a list of
-// multiple base registers, or as the increment expression in an addrec),
-// we may not actually need both reg and (-1 * reg) in registers; the
-// negation can be implemented by using a sub instead of an add. The
-// lack of support for taking this into consideration when making
-// register pressure decisions is partly worked around by the "Special"
-// use kind.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/IVUsers.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops that
+// have as one or more of their components the loop induction variable, it
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+// %i = phi [ 0, %entry ], [ %i.next, %latch ]
+// ...
+// %i.next = add %i, 1
+// %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated and filtered.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+// of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+// smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+// multiple base registers, or as the increment expression in an addrec),
+// we may not actually need both reg and (-1 * reg) in registers; the
+// negation can be implemented by using a sub instead of an add. The
+// lack of support for taking this into consideration when making
+// register pressure decisions is partly worked around by the "Special"
+// use kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/OperandTraits.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <iterator>
-#include <limits>
-#include <map>
-#include <numeric>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-reduce"
-
-/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
-/// bail out. This threshold is far beyond the number of users that LSR can
-/// conceivably solve, so it should not affect generated code, but catches the
-/// worst cases before LSR burns too much compile time and stack space.
-static const unsigned MaxIVUsers = 200;
-
-// Temporary flag to clean up congruent phis after LSR phi expansion.
-// It's currently disabled until we can determine whether it's truly useful or
-// not. The flag should be removed after the v3.0 release.
-// This is now needed for ivchains.
-static cl::opt<bool> EnablePhiElim(
- "enable-lsr-phielim", cl::Hidden, cl::init(true),
- cl::desc("Enable LSR phi elimination"));
-
-// The flag adds instruction count to the solution cost comparison.
-static cl::opt<bool> InsnsCost(
- "lsr-insns-cost", cl::Hidden, cl::init(true),
- cl::desc("Add instruction count to a LSR cost model"));
-
-// Flag to choose how to narrow the complex LSR solution.
-static cl::opt<bool> LSRExpNarrow(
- "lsr-exp-narrow", cl::Hidden, cl::init(false),
- cl::desc("Narrow LSR complex solution using"
- " expectation of registers number"));
-
-// Flag to narrow search space by filtering non-optimal formulae with
-// the same ScaledReg and Scale.
-static cl::opt<bool> FilterSameScaledReg(
- "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
- cl::desc("Narrow LSR search space by filtering non-optimal formulae"
- " with the same ScaledReg and Scale"));
-
-static cl::opt<bool> EnableBackedgeIndexing(
- "lsr-backedge-indexing", cl::Hidden, cl::init(true),
- cl::desc("Enable the generation of cross iteration indexed memops"));
-
-static cl::opt<unsigned> ComplexityLimit(
- "lsr-complexity-limit", cl::Hidden,
- cl::init(std::numeric_limits<uint16_t>::max()),
- cl::desc("LSR search space complexity limit"));
-
-static cl::opt<unsigned> SetupCostDepthLimit(
- "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
- cl::desc("The limit on recursion depth for LSRs setup cost"));
-
-#ifndef NDEBUG
-// Stress test IV chain generation.
-static cl::opt<bool> StressIVChain(
- "stress-ivchain", cl::Hidden, cl::init(false),
- cl::desc("Stress test LSR IV chains"));
-#else
-static bool StressIVChain = false;
-#endif
-
-namespace {
-
-struct MemAccessTy {
- /// Used in situations where the accessed memory type is unknown.
- static const unsigned UnknownAddressSpace =
- std::numeric_limits<unsigned>::max();
-
- Type *MemTy = nullptr;
- unsigned AddrSpace = UnknownAddressSpace;
-
- MemAccessTy() = default;
- MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
-
- bool operator==(MemAccessTy Other) const {
- return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
- }
-
- bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
-
- static MemAccessTy getUnknown(LLVMContext &Ctx,
- unsigned AS = UnknownAddressSpace) {
- return MemAccessTy(Type::getVoidTy(Ctx), AS);
- }
-
- Type *getType() { return MemTy; }
-};
-
-/// This class holds data which is used to order reuse candidates.
-class RegSortData {
-public:
- /// This represents the set of LSRUse indices which reference
- /// a particular register.
- SmallBitVector UsedByIndices;
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void RegSortData::print(raw_ostream &OS) const {
- OS << "[NumUses=" << UsedByIndices.count() << ']';
-}
-
-LLVM_DUMP_METHOD void RegSortData::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-namespace {
-
-/// Map register candidates to information about how they are used.
-class RegUseTracker {
- using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
-
- RegUsesTy RegUsesMap;
- SmallVector<const SCEV *, 16> RegSequence;
-
-public:
- void countRegister(const SCEV *Reg, size_t LUIdx);
- void dropRegister(const SCEV *Reg, size_t LUIdx);
- void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
-
- bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
-
- const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
-
- void clear();
-
- using iterator = SmallVectorImpl<const SCEV *>::iterator;
- using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
-
- iterator begin() { return RegSequence.begin(); }
- iterator end() { return RegSequence.end(); }
- const_iterator begin() const { return RegSequence.begin(); }
- const_iterator end() const { return RegSequence.end(); }
-};
-
-} // end anonymous namespace
-
-void
-RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
- std::pair<RegUsesTy::iterator, bool> Pair =
- RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
- RegSortData &RSD = Pair.first->second;
- if (Pair.second)
- RegSequence.push_back(Reg);
- RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
- RSD.UsedByIndices.set(LUIdx);
-}
-
-void
-RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
- RegUsesTy::iterator It = RegUsesMap.find(Reg);
- assert(It != RegUsesMap.end());
- RegSortData &RSD = It->second;
- assert(RSD.UsedByIndices.size() > LUIdx);
- RSD.UsedByIndices.reset(LUIdx);
-}
-
-void
-RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
- assert(LUIdx <= LastLUIdx);
-
- // Update RegUses. The data structure is not optimized for this purpose;
- // we must iterate through it and update each of the bit vectors.
- for (auto &Pair : RegUsesMap) {
- SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
- if (LUIdx < UsedByIndices.size())
- UsedByIndices[LUIdx] =
- LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
- UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
- }
-}
-
-bool
-RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
- RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
- if (I == RegUsesMap.end())
- return false;
- const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
- int i = UsedByIndices.find_first();
- if (i == -1) return false;
- if ((size_t)i != LUIdx) return true;
- return UsedByIndices.find_next(i) != -1;
-}
-
-const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
- RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
- assert(I != RegUsesMap.end() && "Unknown register!");
- return I->second.UsedByIndices;
-}
-
-void RegUseTracker::clear() {
- RegUsesMap.clear();
- RegSequence.clear();
-}
-
-namespace {
-
-/// This class holds information that describes a formula for computing a
-/// value that satisfies a use. It may include broken-out immediates and scaled registers.
-struct Formula {
- /// Global base address used for complex addressing.
- GlobalValue *BaseGV = nullptr;
-
- /// Base offset for complex addressing.
- int64_t BaseOffset = 0;
-
- /// Whether any complex addressing has a base register.
- bool HasBaseReg = false;
-
- /// The scale of any complex addressing.
- int64_t Scale = 0;
-
-  /// The list of "base" registers for this use. When this is non-empty, the
- /// canonical representation of a formula is
- /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
- /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
-  /// 3. The reg containing the recurrent expr related to the current loop in the
- /// formula should be put in the ScaledReg.
- /// #1 enforces that the scaled register is always used when at least two
- /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
- /// #2 enforces that 1 * reg is reg.
- /// #3 ensures invariant regs with respect to current loop can be combined
- /// together in LSR codegen.
- /// This invariant can be temporarily broken while building a formula.
- /// However, every formula inserted into the LSRInstance must be in canonical
- /// form.
- SmallVector<const SCEV *, 4> BaseRegs;
-
- /// The 'scaled' register for this use. This should be non-null when Scale is
- /// not zero.
- const SCEV *ScaledReg = nullptr;
-
-  /// An additional constant offset which is added near the use. This requires a
- /// temporary register, but the offset itself can live in an add immediate
- /// field rather than a register.
- int64_t UnfoldedOffset = 0;
-
- Formula() = default;
-
- void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
-
- bool isCanonical(const Loop &L) const;
-
- void canonicalize(const Loop &L);
-
- bool unscale();
-
- bool hasZeroEnd() const;
-
- size_t getNumRegs() const;
- Type *getType() const;
-
- void deleteBaseReg(const SCEV *&S);
-
- bool referencesReg(const SCEV *S) const;
- bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
- const RegUseTracker &RegUses) const;
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-/// Recursion helper for initialMatch.
-static void DoInitialMatch(const SCEV *S, Loop *L,
- SmallVectorImpl<const SCEV *> &Good,
- SmallVectorImpl<const SCEV *> &Bad,
- ScalarEvolution &SE) {
- // Collect expressions which properly dominate the loop header.
- if (SE.properlyDominates(S, L->getHeader())) {
- Good.push_back(S);
- return;
- }
-
- // Look at add operands.
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- for (const SCEV *S : Add->operands())
- DoInitialMatch(S, L, Good, Bad, SE);
- return;
- }
-
- // Look at addrec operands.
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
- if (!AR->getStart()->isZero() && AR->isAffine()) {
- DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
- DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
- AR->getStepRecurrence(SE),
- // FIXME: AR->getNoWrapFlags()
- AR->getLoop(), SCEV::FlagAnyWrap),
- L, Good, Bad, SE);
- return;
- }
-
- // Handle a multiplication by -1 (negation) if it didn't fold.
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
- if (Mul->getOperand(0)->isAllOnesValue()) {
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <numeric>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reduce"
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Temporary flag to clean up congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+// This is now needed for ivchains.
+static cl::opt<bool> EnablePhiElim(
+ "enable-lsr-phielim", cl::Hidden, cl::init(true),
+ cl::desc("Enable LSR phi elimination"));
+
+// The flag adds instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(true),
+ cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow the complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using"
+ " expectation of registers number"));
+
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+ "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+ " with the same ScaledReg and Scale"));
+
+static cl::opt<bool> EnableBackedgeIndexing(
+ "lsr-backedge-indexing", cl::Hidden, cl::init(true),
+ cl::desc("Enable the generation of cross iteration indexed memops"));
+
+static cl::opt<unsigned> ComplexityLimit(
+ "lsr-complexity-limit", cl::Hidden,
+ cl::init(std::numeric_limits<uint16_t>::max()),
+ cl::desc("LSR search space complexity limit"));
+
+static cl::opt<unsigned> SetupCostDepthLimit(
+ "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
+ cl::desc("The limit on recursion depth for LSRs setup cost"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation.
+static cl::opt<bool> StressIVChain(
+ "stress-ivchain", cl::Hidden, cl::init(false),
+ cl::desc("Stress test LSR IV chains"));
+#else
+static bool StressIVChain = false;
+#endif
+
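Since these are ordinary cl::opt switches, they can be flipped when experimenting with LSR, for example through clang's -mllvm pass-through (something like -mllvm -lsr-complexity-limit=100); the exact invocation is given as an illustration rather than a documented interface.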
+namespace {
+
+struct MemAccessTy {
+ /// Used in situations where the accessed memory type is unknown.
+ static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+ Type *MemTy = nullptr;
+ unsigned AddrSpace = UnknownAddressSpace;
+
+ MemAccessTy() = default;
+ MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
+
+ bool operator==(MemAccessTy Other) const {
+ return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
+ }
+
+ bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
+
+ static MemAccessTy getUnknown(LLVMContext &Ctx,
+ unsigned AS = UnknownAddressSpace) {
+ return MemAccessTy(Type::getVoidTy(Ctx), AS);
+ }
+
+ Type *getType() { return MemTy; }
+};
+
+/// This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+ /// This represents the set of LSRUse indices which reference
+ /// a particular register.
+ SmallBitVector UsedByIndices;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void RegSortData::print(raw_ostream &OS) const {
+ OS << "[NumUses=" << UsedByIndices.count() << ']';
+}
+
+LLVM_DUMP_METHOD void RegSortData::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+/// Map register candidates to information about how they are used.
+class RegUseTracker {
+ using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
+
+ RegUsesTy RegUsesMap;
+ SmallVector<const SCEV *, 16> RegSequence;
+
+public:
+ void countRegister(const SCEV *Reg, size_t LUIdx);
+ void dropRegister(const SCEV *Reg, size_t LUIdx);
+ void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
+
+ bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
+
+ const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
+
+ void clear();
+
+ using iterator = SmallVectorImpl<const SCEV *>::iterator;
+ using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
+
+ iterator begin() { return RegSequence.begin(); }
+ iterator end() { return RegSequence.end(); }
+ const_iterator begin() const { return RegSequence.begin(); }
+ const_iterator end() const { return RegSequence.end(); }
+};
+
+} // end anonymous namespace
+
+void
+RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
+ std::pair<RegUsesTy::iterator, bool> Pair =
+ RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
+ RegSortData &RSD = Pair.first->second;
+ if (Pair.second)
+ RegSequence.push_back(Reg);
+ RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
+ RSD.UsedByIndices.set(LUIdx);
+}
+
+void
+RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
+ RegUsesTy::iterator It = RegUsesMap.find(Reg);
+ assert(It != RegUsesMap.end());
+ RegSortData &RSD = It->second;
+ assert(RSD.UsedByIndices.size() > LUIdx);
+ RSD.UsedByIndices.reset(LUIdx);
+}
+
+void
+RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+ assert(LUIdx <= LastLUIdx);
+
+ // Update RegUses. The data structure is not optimized for this purpose;
+ // we must iterate through it and update each of the bit vectors.
+ for (auto &Pair : RegUsesMap) {
+ SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
+ if (LUIdx < UsedByIndices.size())
+ UsedByIndices[LUIdx] =
+ LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
+ UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
+ }
+}
+
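The update above is the usual swap-with-last deletion idiom applied, per register, to a bit vector of use indices. A standalone sketch of the per-register step on std::vector<bool> (illustrative only; the real code uses SmallBitVector):

#include <algorithm>
#include <cstddef>
#include <vector>

// Copy the bit of the last use index into the slot being removed, then shrink
// the vector so the last index disappears.
static void swapAndDropBit(std::vector<bool> &UsedByIndices, size_t LUIdx,
                           size_t LastLUIdx) {
  if (LUIdx < UsedByIndices.size())
    UsedByIndices[LUIdx] =
        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
  UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
}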
+bool
+RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ if (I == RegUsesMap.end())
+ return false;
+ const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+ int i = UsedByIndices.find_first();
+ if (i == -1) return false;
+ if ((size_t)i != LUIdx) return true;
+ return UsedByIndices.find_next(i) != -1;
+}
+
+const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ assert(I != RegUsesMap.end() && "Unknown register!");
+ return I->second.UsedByIndices;
+}
+
+void RegUseTracker::clear() {
+ RegUsesMap.clear();
+ RegSequence.clear();
+}
+
+namespace {
+
+/// This class holds information that describes a formula for computing a
+/// value that satisfies a use. It may include broken-out immediates and scaled registers.
+struct Formula {
+ /// Global base address used for complex addressing.
+ GlobalValue *BaseGV = nullptr;
+
+ /// Base offset for complex addressing.
+ int64_t BaseOffset = 0;
+
+ /// Whether any complex addressing has a base register.
+ bool HasBaseReg = false;
+
+ /// The scale of any complex addressing.
+ int64_t Scale = 0;
+
+  /// The list of "base" registers for this use. When this is non-empty, the
+ /// canonical representation of a formula is
+ /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
+ /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+  /// 3. The reg containing the recurrent expr related to the current loop in the
+ /// formula should be put in the ScaledReg.
+ /// #1 enforces that the scaled register is always used when at least two
+ /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
+ /// #2 enforces that 1 * reg is reg.
+ /// #3 ensures invariant regs with respect to current loop can be combined
+ /// together in LSR codegen.
+ /// This invariant can be temporarily broken while building a formula.
+ /// However, every formula inserted into the LSRInstance must be in canonical
+ /// form.
+ SmallVector<const SCEV *, 4> BaseRegs;
+
+ /// The 'scaled' register for this use. This should be non-null when Scale is
+ /// not zero.
+ const SCEV *ScaledReg = nullptr;
+
+  /// An additional constant offset which is added near the use. This requires a
+ /// temporary register, but the offset itself can live in an add immediate
+ /// field rather than a register.
+ int64_t UnfoldedOffset = 0;
+
+ Formula() = default;
+
+ void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
+
+ bool isCanonical(const Loop &L) const;
+
+ void canonicalize(const Loop &L);
+
+ bool unscale();
+
+ bool hasZeroEnd() const;
+
+ size_t getNumRegs() const;
+ Type *getType() const;
+
+ void deleteBaseReg(const SCEV *&S);
+
+ bool referencesReg(const SCEV *S) const;
+ bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// Recursion helper for initialMatch.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+ SmallVectorImpl<const SCEV *> &Good,
+ SmallVectorImpl<const SCEV *> &Bad,
+ ScalarEvolution &SE) {
+ // Collect expressions which properly dominate the loop header.
+ if (SE.properlyDominates(S, L->getHeader())) {
+ Good.push_back(S);
+ return;
+ }
+
+ // Look at add operands.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands())
+ DoInitialMatch(S, L, Good, Bad, SE);
+ return;
+ }
+
+ // Look at addrec operands.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+ if (!AR->getStart()->isZero() && AR->isAffine()) {
+ DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
+ DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
+ AR->getStepRecurrence(SE),
+ // FIXME: AR->getNoWrapFlags()
+ AR->getLoop(), SCEV::FlagAnyWrap),
+ L, Good, Bad, SE);
+ return;
+ }
+
+ // Handle a multiplication by -1 (negation) if it didn't fold.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
+ if (Mul->getOperand(0)->isAllOnesValue()) {
SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
- const SCEV *NewMul = SE.getMulExpr(Ops);
-
- SmallVector<const SCEV *, 4> MyGood;
- SmallVector<const SCEV *, 4> MyBad;
- DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
- const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
- SE.getEffectiveSCEVType(NewMul->getType())));
- for (const SCEV *S : MyGood)
- Good.push_back(SE.getMulExpr(NegOne, S));
- for (const SCEV *S : MyBad)
- Bad.push_back(SE.getMulExpr(NegOne, S));
- return;
- }
-
- // Ok, we can't do anything interesting. Just stuff the whole thing into a
- // register and hope for the best.
- Bad.push_back(S);
-}
-
-/// Incorporate loop-variant parts of S into this Formula, attempting to keep
-/// all loop-invariant and loop-computable values in a single base register.
-void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
- SmallVector<const SCEV *, 4> Good;
- SmallVector<const SCEV *, 4> Bad;
- DoInitialMatch(S, L, Good, Bad, SE);
- if (!Good.empty()) {
- const SCEV *Sum = SE.getAddExpr(Good);
- if (!Sum->isZero())
- BaseRegs.push_back(Sum);
- HasBaseReg = true;
- }
- if (!Bad.empty()) {
- const SCEV *Sum = SE.getAddExpr(Bad);
- if (!Sum->isZero())
- BaseRegs.push_back(Sum);
- HasBaseReg = true;
- }
- canonicalize(*L);
-}
-
-/// Check whether or not this formula satisfies the canonical
-/// representation.
-/// \see Formula::BaseRegs.
-bool Formula::isCanonical(const Loop &L) const {
- if (!ScaledReg)
- return BaseRegs.size() <= 1;
-
- if (Scale != 1)
- return true;
-
- if (Scale == 1 && BaseRegs.empty())
- return false;
-
- const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
- if (SAR && SAR->getLoop() == &L)
- return true;
-
- // If ScaledReg is not a recurrent expr, or it is but its loop is not current
- // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
- // loop, we want to swap the reg in BaseRegs with ScaledReg.
+ const SCEV *NewMul = SE.getMulExpr(Ops);
+
+ SmallVector<const SCEV *, 4> MyGood;
+ SmallVector<const SCEV *, 4> MyBad;
+ DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
+ const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+ SE.getEffectiveSCEVType(NewMul->getType())));
+ for (const SCEV *S : MyGood)
+ Good.push_back(SE.getMulExpr(NegOne, S));
+ for (const SCEV *S : MyBad)
+ Bad.push_back(SE.getMulExpr(NegOne, S));
+ return;
+ }
+
+ // Ok, we can't do anything interesting. Just stuff the whole thing into a
+ // register and hope for the best.
+ Bad.push_back(S);
+}
+
+/// Incorporate loop-variant parts of S into this Formula, attempting to keep
+/// all loop-invariant and loop-computable values in a single base register.
+void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
+ SmallVector<const SCEV *, 4> Good;
+ SmallVector<const SCEV *, 4> Bad;
+ DoInitialMatch(S, L, Good, Bad, SE);
+ if (!Good.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Good);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ if (!Bad.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Bad);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ canonicalize(*L);
+}
+
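A concrete reading of the split (illustrative, assuming %base is defined outside the loop): for S = %base + {0,+,4}<%loop>, DoInitialMatch files %base under Good because it properly dominates the loop header, while the zero-start addrec falls through to Bad; initialMatch therefore records two base registers, %base and {0,+,4}<%loop>, and the final canonicalize(*L) call is what moves one of them into the ScaledReg slot.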
+/// Check whether or not this formula satisfies the canonical
+/// representation.
+/// \see Formula::BaseRegs.
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+ // If ScaledReg is not a recurrent expr, or it is but its loop is not current
+ // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
+ // loop, we want to swap the reg in BaseRegs with ScaledReg.
auto I = find_if(BaseRegs, [&](const SCEV *S) {
return isa<const SCEVAddRecExpr>(S) &&
(cast<SCEVAddRecExpr>(S)->getLoop() == &L);
});
- return I == BaseRegs.end();
-}
-
-/// Helper method to morph a formula into its canonical representation.
-/// \see Formula::BaseRegs.
-/// Every formula having more than one base register must use the ScaledReg
-/// field. Otherwise, we would have to do special cases everywhere in LSR
-/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
-/// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize(const Loop &L) {
- if (isCanonical(L))
- return;
- // So far we did not need this case. This is easy to implement but it is
-  // useless to maintain dead code. Besides, it could hurt compile time.
- assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
-
- // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
- if (!ScaledReg) {
+ return I == BaseRegs.end();
+}
+
+/// Helper method to morph a formula into its canonical representation.
+/// \see Formula::BaseRegs.
+/// Every formula having more than one base register must use the ScaledReg
+/// field. Otherwise, we would have to do special cases everywhere in LSR
+/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
+/// On the other hand, 1*reg should be canonicalized into reg.
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
+ return;
+ // So far we did not need this case. This is easy to implement but it is
+  // useless to maintain dead code. Besides, it could hurt compile time.
+ assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
+ // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
+ if (!ScaledReg) {
ScaledReg = BaseRegs.pop_back_val();
- Scale = 1;
- }
-
- // If ScaledReg is an invariant with respect to L, find the reg from
- // BaseRegs containing the recurrent expr related with Loop L. Swap the
- // reg with ScaledReg.
- const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
- if (!SAR || SAR->getLoop() != &L) {
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+ // BaseRegs containing the recurrent expr related with Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
auto I = find_if(BaseRegs, [&](const SCEV *S) {
return isa<const SCEVAddRecExpr>(S) &&
(cast<SCEVAddRecExpr>(S)->getLoop() == &L);
});
- if (I != BaseRegs.end())
- std::swap(ScaledReg, *I);
- }
-}
-
-/// Get rid of the scale in the formula.
-/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
-/// \return true if it was possible to get rid of the scale, false otherwise.
-/// \note After this operation the formula may not be in the canonical form.
-bool Formula::unscale() {
- if (Scale != 1)
- return false;
- Scale = 0;
- BaseRegs.push_back(ScaledReg);
- ScaledReg = nullptr;
- return true;
-}
-
-bool Formula::hasZeroEnd() const {
- if (UnfoldedOffset || BaseOffset)
- return false;
- if (BaseRegs.size() != 1 || ScaledReg)
- return false;
- return true;
-}
-
-/// Return the total number of register operands used by this formula. This does
-/// not include register uses implied by non-constant addrec strides.
-size_t Formula::getNumRegs() const {
- return !!ScaledReg + BaseRegs.size();
-}
-
-/// Return the type of this formula, if it has one, or null otherwise. This type
-/// is meaningless except for the bit size.
-Type *Formula::getType() const {
- return !BaseRegs.empty() ? BaseRegs.front()->getType() :
- ScaledReg ? ScaledReg->getType() :
- BaseGV ? BaseGV->getType() :
- nullptr;
-}
-
-/// Delete the given base reg from the BaseRegs list.
-void Formula::deleteBaseReg(const SCEV *&S) {
- if (&S != &BaseRegs.back())
- std::swap(S, BaseRegs.back());
- BaseRegs.pop_back();
-}
-
-/// Test if this formula references the given register.
-bool Formula::referencesReg(const SCEV *S) const {
- return S == ScaledReg || is_contained(BaseRegs, S);
-}
-
-/// Test whether this formula uses registers which are used by uses other than
-/// the use with the given index.
-bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
- const RegUseTracker &RegUses) const {
- if (ScaledReg)
- if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
- return true;
- for (const SCEV *BaseReg : BaseRegs)
- if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
- return true;
- return false;
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Formula::print(raw_ostream &OS) const {
- bool First = true;
- if (BaseGV) {
- if (!First) OS << " + "; else First = false;
- BaseGV->printAsOperand(OS, /*PrintType=*/false);
- }
- if (BaseOffset != 0) {
- if (!First) OS << " + "; else First = false;
- OS << BaseOffset;
- }
- for (const SCEV *BaseReg : BaseRegs) {
- if (!First) OS << " + "; else First = false;
- OS << "reg(" << *BaseReg << ')';
- }
- if (HasBaseReg && BaseRegs.empty()) {
- if (!First) OS << " + "; else First = false;
- OS << "**error: HasBaseReg**";
- } else if (!HasBaseReg && !BaseRegs.empty()) {
- if (!First) OS << " + "; else First = false;
- OS << "**error: !HasBaseReg**";
- }
- if (Scale != 0) {
- if (!First) OS << " + "; else First = false;
- OS << Scale << "*reg(";
- if (ScaledReg)
- OS << *ScaledReg;
- else
- OS << "<unknown>";
- OS << ')';
- }
- if (UnfoldedOffset != 0) {
- if (!First) OS << " + ";
- OS << "imm(" << UnfoldedOffset << ')';
- }
-}
-
-LLVM_DUMP_METHOD void Formula::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Return true if the given addrec can be sign-extended without changing its
-/// value.
-static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
- Type *WideTy =
- IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
- return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
-}
-
-/// Return true if the given add can be sign-extended without changing its
-/// value.
-static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
- Type *WideTy =
- IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
- return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
-}
-
-/// Return true if the given mul can be sign-extended without changing its
-/// value.
-static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
- Type *WideTy =
- IntegerType::get(SE.getContext(),
- SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
- return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
-}
-
-/// Return an expression for LHS /s RHS, if it can be determined and if the
-/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
-/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
-/// the multiplication may overflow, which is useful when the result will be
-/// used in a context where the most significant bits are ignored.
-static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
- ScalarEvolution &SE,
- bool IgnoreSignificantBits = false) {
- // Handle the trivial case, which works for any SCEV type.
- if (LHS == RHS)
- return SE.getConstant(LHS->getType(), 1);
-
- // Handle a few RHS special cases.
- const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
- if (RC) {
- const APInt &RA = RC->getAPInt();
- // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
- // some folding.
- if (RA.isAllOnesValue())
- return SE.getMulExpr(LHS, RC);
- // Handle x /s 1 as x.
- if (RA == 1)
- return LHS;
- }
-
- // Check for a division of a constant by a constant.
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
- if (!RC)
- return nullptr;
- const APInt &LA = C->getAPInt();
- const APInt &RA = RC->getAPInt();
- if (LA.srem(RA) != 0)
- return nullptr;
- return SE.getConstant(LA.sdiv(RA));
- }
-
- // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
- if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
- const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
- IgnoreSignificantBits);
- if (!Step) return nullptr;
- const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
- IgnoreSignificantBits);
- if (!Start) return nullptr;
- // FlagNW is independent of the start value, step direction, and is
- // preserved with smaller magnitude steps.
- // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
- }
- return nullptr;
- }
-
- // Distribute the sdiv over add operands, if the add doesn't overflow.
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
- if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
- SmallVector<const SCEV *, 8> Ops;
- for (const SCEV *S : Add->operands()) {
- const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
- if (!Op) return nullptr;
- Ops.push_back(Op);
- }
- return SE.getAddExpr(Ops);
- }
- return nullptr;
- }
-
- // Check for a multiply operand that we can pull RHS out of.
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
- if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
- SmallVector<const SCEV *, 4> Ops;
- bool Found = false;
- for (const SCEV *S : Mul->operands()) {
- if (!Found)
- if (const SCEV *Q = getExactSDiv(S, RHS, SE,
- IgnoreSignificantBits)) {
- S = Q;
- Found = true;
- }
- Ops.push_back(S);
- }
- return Found ? SE.getMulExpr(Ops) : nullptr;
- }
- return nullptr;
- }
-
- // Otherwise we don't know.
- return nullptr;
-}
-
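The constant-by-constant branch is the simplest place to see the "exact quotient or nothing" contract of getExactSDiv. A standalone sketch with plain 64-bit integers (toy code; the real implementation works on APInt/SCEV and also handles the symbolic cases above):

#include <cstdint>
#include <optional>

// Return LHS / RHS only when the remainder is exactly zero; otherwise report
// "no result", mirroring the nullptr returns in getExactSDiv.
static std::optional<int64_t> exactSDiv(int64_t LHS, int64_t RHS) {
  if (RHS == 0 || LHS % RHS != 0)
    return std::nullopt;
  return LHS / RHS;
}

// exactSDiv(12, 4) yields 3; exactSDiv(13, 4) yields no value.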
-/// If S involves the addition of a constant integer value, return that integer
-/// value, and mutate S to point to a new SCEV with that value excluded.
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
- if (C->getAPInt().getMinSignedBits() <= 64) {
- S = SE.getConstant(C->getType(), 0);
- return C->getValue()->getSExtValue();
- }
- } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
+}
+
+/// Get rid of the scale in the formula.
+/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
+/// \return true if it was possible to get rid of the scale, false otherwise.
+/// \note After this operation the formula may not be in the canonical form.
+bool Formula::unscale() {
+ if (Scale != 1)
+ return false;
+ Scale = 0;
+ BaseRegs.push_back(ScaledReg);
+ ScaledReg = nullptr;
+ return true;
+}
+
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
+/// Return the total number of register operands used by this formula. This does
+/// not include register uses implied by non-constant addrec strides.
+size_t Formula::getNumRegs() const {
+ return !!ScaledReg + BaseRegs.size();
+}
+
+/// Return the type of this formula, if it has one, or null otherwise. This type
+/// is meaningless except for the bit size.
+Type *Formula::getType() const {
+ return !BaseRegs.empty() ? BaseRegs.front()->getType() :
+ ScaledReg ? ScaledReg->getType() :
+ BaseGV ? BaseGV->getType() :
+ nullptr;
+}
+
+/// Delete the given base reg from the BaseRegs list.
+void Formula::deleteBaseReg(const SCEV *&S) {
+ if (&S != &BaseRegs.back())
+ std::swap(S, BaseRegs.back());
+ BaseRegs.pop_back();
+}
+
+/// Test if this formula references the given register.
+bool Formula::referencesReg(const SCEV *S) const {
+ return S == ScaledReg || is_contained(BaseRegs, S);
+}
+
+/// Test whether this formula uses registers which are used by uses other than
+/// the use with the given index.
+bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const {
+ if (ScaledReg)
+ if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
+ return true;
+ for (const SCEV *BaseReg : BaseRegs)
+ if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
+ return true;
+ return false;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Formula::print(raw_ostream &OS) const {
+ bool First = true;
+ if (BaseGV) {
+ if (!First) OS << " + "; else First = false;
+ BaseGV->printAsOperand(OS, /*PrintType=*/false);
+ }
+ if (BaseOffset != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << BaseOffset;
+ }
+ for (const SCEV *BaseReg : BaseRegs) {
+ if (!First) OS << " + "; else First = false;
+ OS << "reg(" << *BaseReg << ')';
+ }
+ if (HasBaseReg && BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: HasBaseReg**";
+ } else if (!HasBaseReg && !BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: !HasBaseReg**";
+ }
+ if (Scale != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << Scale << "*reg(";
+ if (ScaledReg)
+ OS << *ScaledReg;
+ else
+ OS << "<unknown>";
+ OS << ')';
+ }
+ if (UnfoldedOffset != 0) {
+ if (!First) OS << " + ";
+ OS << "imm(" << UnfoldedOffset << ')';
+ }
+}
+
+LLVM_DUMP_METHOD void Formula::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Return true if the given addrec can be sign-extended without changing its
+/// value.
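+/// (ScalarEvolution folds the sign extension into the recurrence only when it
+/// can prove the recurrence does not wrap in the signed sense, so the isa<>
+/// check below doubles as a no-overflow test.)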
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
+ return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// Return true if the given add can be sign-extended without changing its
+/// value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
+ return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// Return true if the given mul can be sign-extended without changing its
+/// value.
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
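+  // A product of N W-bit signed values always fits in N*W bits, so if the
+  // sign extension to that width still folds into a mul, no significant bits
+  // can have been lost.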
+ Type *WideTy =
+ IntegerType::get(SE.getContext(),
+ SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+ return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
+}
+
+/// Return an expression for LHS /s RHS, if it can be determined and if the
+/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
+/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
+/// the multiplication may overflow, which is useful when the result will be
+/// used in a context where the most significant bits are ignored.
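+/// For example, assuming overflow can be ruled out (or ignored), {8,+,4} /s 4
+/// yields {2,+,1}, while {8,+,3} /s 4 yields null because the step leaves a
+/// remainder.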
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+ ScalarEvolution &SE,
+ bool IgnoreSignificantBits = false) {
+ // Handle the trivial case, which works for any SCEV type.
+ if (LHS == RHS)
+ return SE.getConstant(LHS->getType(), 1);
+
+ // Handle a few RHS special cases.
+ const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+ if (RC) {
+ const APInt &RA = RC->getAPInt();
+ // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+ // some folding.
+ if (RA.isAllOnesValue())
+ return SE.getMulExpr(LHS, RC);
+ // Handle x /s 1 as x.
+ if (RA == 1)
+ return LHS;
+ }
+
+ // Check for a division of a constant by a constant.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
+ if (!RC)
+ return nullptr;
+ const APInt &LA = C->getAPInt();
+ const APInt &RA = RC->getAPInt();
+ if (LA.srem(RA) != 0)
+ return nullptr;
+ return SE.getConstant(LA.sdiv(RA));
+ }
+
+ // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
+ if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
+ const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Step) return nullptr;
+ const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Start) return nullptr;
+ // FlagNW is independent of the start value, step direction, and is
+ // preserved with smaller magnitude steps.
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
+ }
+ return nullptr;
+ }
+
+ // Distribute the sdiv over add operands, if the add doesn't overflow.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
+ if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+ SmallVector<const SCEV *, 8> Ops;
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
+ if (!Op) return nullptr;
+ Ops.push_back(Op);
+ }
+ return SE.getAddExpr(Ops);
+ }
+ return nullptr;
+ }
+
+ // Check for a multiply operand that we can pull RHS out of.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
+ if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+ SmallVector<const SCEV *, 4> Ops;
+ bool Found = false;
+ for (const SCEV *S : Mul->operands()) {
+ if (!Found)
+ if (const SCEV *Q = getExactSDiv(S, RHS, SE,
+ IgnoreSignificantBits)) {
+ S = Q;
+ Found = true;
+ }
+ Ops.push_back(S);
+ }
+ return Found ? SE.getMulExpr(Ops) : nullptr;
+ }
+ return nullptr;
+ }
+
+ // Otherwise we don't know.
+ return nullptr;
+}
+
+/// If S involves the addition of a constant integer value, return that integer
+/// value, and mutate S to point to a new SCEV with that value excluded.
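+/// For example, given S = (4 + %x), this returns 4 and rewrites S to an
+/// expression equivalent to %x.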
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ if (C->getAPInt().getMinSignedBits() <= 64) {
+ S = SE.getConstant(C->getType(), 0);
+ return C->getValue()->getSExtValue();
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(Add->operands());
- int64_t Result = ExtractImmediate(NewOps.front(), SE);
- if (Result != 0)
- S = SE.getAddExpr(NewOps);
- return Result;
- } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(AR->operands());
- int64_t Result = ExtractImmediate(NewOps.front(), SE);
- if (Result != 0)
- S = SE.getAddRecExpr(NewOps, AR->getLoop(),
- // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- SCEV::FlagAnyWrap);
- return Result;
- }
- return 0;
-}
-
-/// If S involves the addition of a GlobalValue address, return that symbol, and
-/// mutate S to point to a new SCEV with that value excluded.
-static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
- if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
- S = SE.getConstant(GV->getType(), 0);
- return GV;
- }
- } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return 0;
+}
+
+/// If S involves the addition of a GlobalValue address, return that symbol, and
+/// mutate S to point to a new SCEV with that value excluded.
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
+ S = SE.getConstant(GV->getType(), 0);
+ return GV;
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(Add->operands());
- GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
- if (Result)
- S = SE.getAddExpr(NewOps);
- return Result;
- } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
+ if (Result)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(AR->operands());
- GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
- if (Result)
- S = SE.getAddRecExpr(NewOps, AR->getLoop(),
- // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- SCEV::FlagAnyWrap);
- return Result;
- }
- return nullptr;
-}
-
-/// Returns true if the specified instruction is using the specified value as an
-/// address.
-static bool isAddressUse(const TargetTransformInfo &TTI,
- Instruction *Inst, Value *OperandVal) {
- bool isAddress = isa<LoadInst>(Inst);
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getPointerOperand() == OperandVal)
- isAddress = true;
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- // Addressing modes can also be folded into prefetches and a variety
- // of intrinsics.
- switch (II->getIntrinsicID()) {
- case Intrinsic::memset:
- case Intrinsic::prefetch:
- case Intrinsic::masked_load:
- if (II->getArgOperand(0) == OperandVal)
- isAddress = true;
- break;
- case Intrinsic::masked_store:
- if (II->getArgOperand(1) == OperandVal)
- isAddress = true;
- break;
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- if (II->getArgOperand(0) == OperandVal ||
- II->getArgOperand(1) == OperandVal)
- isAddress = true;
- break;
- default: {
- MemIntrinsicInfo IntrInfo;
- if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
- if (IntrInfo.PtrVal == OperandVal)
- isAddress = true;
- }
- }
- }
- } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
- if (RMW->getPointerOperand() == OperandVal)
- isAddress = true;
- } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
- if (CmpX->getPointerOperand() == OperandVal)
- isAddress = true;
- }
- return isAddress;
-}
-
-/// Return the type of the memory being accessed.
-static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
- Instruction *Inst, Value *OperandVal) {
- MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
- if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- AccessTy.MemTy = SI->getOperand(0)->getType();
- AccessTy.AddrSpace = SI->getPointerAddressSpace();
- } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- AccessTy.AddrSpace = LI->getPointerAddressSpace();
- } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
- AccessTy.AddrSpace = RMW->getPointerAddressSpace();
- } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
- AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::prefetch:
- case Intrinsic::memset:
- AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
- AccessTy.MemTy = OperandVal->getType();
- break;
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
- AccessTy.MemTy = OperandVal->getType();
- break;
- case Intrinsic::masked_load:
- AccessTy.AddrSpace =
- II->getArgOperand(0)->getType()->getPointerAddressSpace();
- break;
- case Intrinsic::masked_store:
- AccessTy.MemTy = II->getOperand(0)->getType();
- AccessTy.AddrSpace =
- II->getArgOperand(1)->getType()->getPointerAddressSpace();
- break;
- default: {
- MemIntrinsicInfo IntrInfo;
- if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
- AccessTy.AddrSpace
- = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
- }
-
- break;
- }
- }
- }
-
- // All pointers have the same requirements, so canonicalize them to an
- // arbitrary pointer type to minimize variation.
- if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
- AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
- PTy->getAddressSpace());
-
- return AccessTy;
-}
-
-/// Return true if this AddRec is already a phi in its loop.
-static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
- for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
- if (SE.isSCEVable(PN.getType()) &&
- (SE.getEffectiveSCEVType(PN.getType()) ==
- SE.getEffectiveSCEVType(AR->getType())) &&
- SE.getSCEV(&PN) == AR)
- return true;
- }
- return false;
-}
-
-/// Check if expanding this expression is likely to incur significant cost. This
-/// is tricky because SCEV doesn't track which expressions are actually computed
-/// by the current IR.
-///
-/// We currently allow expansion of IV increments that involve adds,
-/// multiplication by constants, and AddRecs from existing phis.
-///
-/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
-/// obvious multiple of the UDivExpr.
-static bool isHighCostExpansion(const SCEV *S,
- SmallPtrSetImpl<const SCEV*> &Processed,
- ScalarEvolution &SE) {
- // Zero/One operand expressions
- switch (S->getSCEVType()) {
- case scUnknown:
- case scConstant:
- return false;
- case scTruncate:
- return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
- Processed, SE);
- case scZeroExtend:
- return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
- Processed, SE);
- case scSignExtend:
- return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
- Processed, SE);
+ GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
+ if (Result)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return nullptr;
+}
+
+/// Returns true if the specified instruction is using the specified value as an
+/// address.
+static bool isAddressUse(const TargetTransformInfo &TTI,
+ Instruction *Inst, Value *OperandVal) {
+ bool isAddress = isa<LoadInst>(Inst);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ // Addressing modes can also be folded into prefetches and a variety
+ // of intrinsics.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::memset:
+ case Intrinsic::prefetch:
+ case Intrinsic::masked_load:
+ if (II->getArgOperand(0) == OperandVal)
+ isAddress = true;
+ break;
+ case Intrinsic::masked_store:
+ if (II->getArgOperand(1) == OperandVal)
+ isAddress = true;
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ if (II->getArgOperand(0) == OperandVal ||
+ II->getArgOperand(1) == OperandVal)
+ isAddress = true;
+ break;
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
+ if (IntrInfo.PtrVal == OperandVal)
+ isAddress = true;
+ }
+ }
+ }
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
+ }
+ return isAddress;
+}
+
+/// Return the type of the memory being accessed.
+static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
+ Instruction *Inst, Value *OperandVal) {
+ MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ AccessTy.MemTy = SI->getOperand(0)->getType();
+ AccessTy.AddrSpace = SI->getPointerAddressSpace();
+ } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::prefetch:
+ case Intrinsic::memset:
+ AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
+ break;
+ case Intrinsic::masked_load:
+ AccessTy.AddrSpace =
+ II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ break;
+ case Intrinsic::masked_store:
+ AccessTy.MemTy = II->getOperand(0)->getType();
+ AccessTy.AddrSpace =
+ II->getArgOperand(1)->getType()->getPointerAddressSpace();
+ break;
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
+ AccessTy.AddrSpace
+ = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
+ }
+
+ break;
+ }
+ }
+ }
+
+ // All pointers have the same requirements, so canonicalize them to an
+ // arbitrary pointer type to minimize variation.
+ if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
+ AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
+ PTy->getAddressSpace());
+
+ return AccessTy;
+}
+
+/// Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
+ if (SE.isSCEVable(PN.getType()) &&
+ (SE.getEffectiveSCEVType(PN.getType()) ==
+ SE.getEffectiveSCEVType(AR->getType())) &&
+ SE.getSCEV(&PN) == AR)
+ return true;
+ }
+ return false;
+}
+
+/// Check if expanding this expression is likely to incur significant cost. This
+/// is tricky because SCEV doesn't track which expressions are actually computed
+/// by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
+static bool isHighCostExpansion(const SCEV *S,
+ SmallPtrSetImpl<const SCEV*> &Processed,
+ ScalarEvolution &SE) {
+ // Zero/One operand expressions
+ switch (S->getSCEVType()) {
+ case scUnknown:
+ case scConstant:
+ return false;
+ case scTruncate:
+ return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+ Processed, SE);
+ case scZeroExtend:
+ return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ case scSignExtend:
+ return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+ Processed, SE);
default:
break;
- }
-
- if (!Processed.insert(S).second)
- return false;
-
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- for (const SCEV *S : Add->operands()) {
- if (isHighCostExpansion(S, Processed, SE))
- return true;
- }
- return false;
- }
-
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
- if (Mul->getNumOperands() == 2) {
- // Multiplication by a constant is ok
- if (isa<SCEVConstant>(Mul->getOperand(0)))
- return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
-
- // If we have the value of one operand, check if an existing
- // multiplication already generates this expression.
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
- Value *UVal = U->getValue();
- for (User *UR : UVal->users()) {
- // If U is a constant, it may be used by a ConstantExpr.
- Instruction *UI = dyn_cast<Instruction>(UR);
- if (UI && UI->getOpcode() == Instruction::Mul &&
- SE.isSCEVable(UI->getType())) {
- return SE.getSCEV(UI) == Mul;
- }
- }
- }
- }
- }
-
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- if (isExistingPhi(AR, SE))
- return false;
- }
-
-  // For now, consider any other type of expression (div/mul/min/max) high cost.
- return true;
-}
-
-namespace {
-
-class LSRUse;
-
-} // end anonymous namespace
-
-/// Check if the addressing mode defined by \p F is completely
-/// folded in \p LU at isel time.
-/// This includes address-mode folding and special icmp tricks.
-/// This function returns true if \p LU can accommodate what \p F
-/// defines and up to 1 base + 1 scaled + offset.
-/// In other words, if \p F has several base registers, this function may
-/// still return true. Therefore, users still need to account for
-/// additional base registers and/or unfolded offsets to derive an
-/// accurate cost model.
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F);
-
-// Get the cost of the scaling factor used in F for LU.
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F,
- const Loop &L);
-
-namespace {
-
-/// This class is used to measure and compare candidate formulae.
-class Cost {
- const Loop *L = nullptr;
- ScalarEvolution *SE = nullptr;
- const TargetTransformInfo *TTI = nullptr;
- TargetTransformInfo::LSRCost C;
-
-public:
- Cost() = delete;
- Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
- L(L), SE(&SE), TTI(&TTI) {
- C.Insns = 0;
- C.NumRegs = 0;
- C.AddRecCost = 0;
- C.NumIVMuls = 0;
- C.NumBaseAdds = 0;
- C.ImmCost = 0;
- C.SetupCost = 0;
- C.ScaleCost = 0;
- }
-
- bool isLess(Cost &Other);
-
- void Lose();
-
-#ifndef NDEBUG
- // Once any of the metrics loses, they must all remain losers.
- bool isValid() {
- return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
- | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
- || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
- & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
- }
-#endif
-
- bool isLoser() {
- assert(isValid() && "invalid cost");
- return C.NumRegs == ~0u;
- }
-
- void RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
- SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
-
- void print(raw_ostream &OS) const;
- void dump() const;
-
-private:
- void RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs);
- void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs,
- SmallPtrSetImpl<const SCEV *> *LoserRegs);
-};
-
-/// An operand value in an instruction which is to be replaced with some
-/// equivalent, possibly strength-reduced, replacement.
-struct LSRFixup {
- /// The instruction which will be updated.
- Instruction *UserInst = nullptr;
-
- /// The operand of the instruction which will be replaced. The operand may be
- /// used more than once; every instance will be replaced.
- Value *OperandValToReplace = nullptr;
-
- /// If this user is to use the post-incremented value of an induction
- /// variable, this set is non-empty and holds the loops associated with the
- /// induction variable.
- PostIncLoopSet PostIncLoops;
-
- /// A constant offset to be added to the LSRUse expression. This allows
- /// multiple fixups to share the same LSRUse with different offsets, for
- /// example in an unrolled loop.
- int64_t Offset = 0;
-
- LSRFixup() = default;
-
- bool isUseFullyOutsideLoop(const Loop *L) const;
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
-/// SmallVectors of const SCEV*.
-struct UniquifierDenseMapInfo {
- static SmallVector<const SCEV *, 4> getEmptyKey() {
- SmallVector<const SCEV *, 4> V;
- V.push_back(reinterpret_cast<const SCEV *>(-1));
- return V;
- }
-
- static SmallVector<const SCEV *, 4> getTombstoneKey() {
- SmallVector<const SCEV *, 4> V;
- V.push_back(reinterpret_cast<const SCEV *>(-2));
- return V;
- }
-
- static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
- return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
- }
-
- static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
- const SmallVector<const SCEV *, 4> &RHS) {
- return LHS == RHS;
- }
-};
-
-/// This class holds the state that LSR keeps for each use in IVUsers, as well
-/// as uses invented by LSR itself. It includes information about what kinds of
-/// things can be folded into the user, information about the user itself, and
-/// information about how the use may be satisfied. TODO: Represent multiple
-/// users of the same expression in common?
-class LSRUse {
- DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
-
-public:
- /// An enum for a kind of use, indicating what types of scaled and immediate
- /// operands it might support.
- enum KindType {
- Basic, ///< A normal use, with no folding.
- Special, ///< A special case of basic, allowing -1 scales.
- Address, ///< An address use; folding according to TargetLowering
- ICmpZero ///< An equality icmp with both operands folded into one.
- // TODO: Add a generic icmp too?
- };
-
- using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
-
- KindType Kind;
- MemAccessTy AccessTy;
-
- /// The list of operands which are to be replaced.
- SmallVector<LSRFixup, 8> Fixups;
-
- /// Keep track of the min and max offsets of the fixups.
- int64_t MinOffset = std::numeric_limits<int64_t>::max();
- int64_t MaxOffset = std::numeric_limits<int64_t>::min();
-
- /// This records whether all of the fixups using this LSRUse are outside of
- /// the loop, in which case some special-case heuristics may be used.
- bool AllFixupsOutsideLoop = true;
-
- /// RigidFormula is set to true to guarantee that this use will be associated
- /// with a single formula--the one that initially matched. Some SCEV
- /// expressions cannot be expanded. This allows LSR to consider the registers
- /// used by those expressions without the need to expand them later after
- /// changing the formula.
- bool RigidFormula = false;
-
- /// This records the widest use type for any fixup using this
- /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
- /// fixup widths to be equivalent, because the narrower one may be relying on
- /// the implicit truncation to truncate away bogus bits.
- Type *WidestFixupType = nullptr;
-
- /// A list of ways to build a value that can satisfy this user. After the
- /// list is populated, one of these is selected heuristically and used to
- /// formulate a replacement for OperandValToReplace in UserInst.
- SmallVector<Formula, 12> Formulae;
-
- /// The set of register candidates used by all formulae in this LSRUse.
- SmallPtrSet<const SCEV *, 4> Regs;
-
- LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
-
- LSRFixup &getNewFixup() {
- Fixups.push_back(LSRFixup());
- return Fixups.back();
- }
-
- void pushFixup(LSRFixup &f) {
- Fixups.push_back(f);
- if (f.Offset > MaxOffset)
- MaxOffset = f.Offset;
- if (f.Offset < MinOffset)
- MinOffset = f.Offset;
- }
-
- bool HasFormulaWithSameRegs(const Formula &F) const;
- float getNotSelectedProbability(const SCEV *Reg) const;
- bool InsertFormula(const Formula &F, const Loop &L);
- void DeleteFormula(Formula &F);
- void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg, int64_t Scale,
- Instruction *Fixup = nullptr);
-
-static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
- if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
- return 1;
- if (Depth == 0)
- return 0;
- if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
- return getSetupCost(S->getStart(), Depth - 1);
+ }
+
+ if (!Processed.insert(S).second)
+ return false;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands()) {
+ if (isHighCostExpansion(S, Processed, SE))
+ return true;
+ }
+ return false;
+ }
+
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ if (Mul->getNumOperands() == 2) {
+ // Multiplication by a constant is ok
+ if (isa<SCEVConstant>(Mul->getOperand(0)))
+ return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+ // If we have the value of one operand, check if an existing
+ // multiplication already generates this expression.
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+ Value *UVal = U->getValue();
+ for (User *UR : UVal->users()) {
+ // If U is a constant, it may be used by a ConstantExpr.
+ Instruction *UI = dyn_cast<Instruction>(UR);
+ if (UI && UI->getOpcode() == Instruction::Mul &&
+ SE.isSCEVable(UI->getType())) {
+ return SE.getSCEV(UI) == Mul;
+ }
+ }
+ }
+ }
+ }
+
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (isExistingPhi(AR, SE))
+ return false;
+ }
+
+  // For now, consider any other type of expression (div/mul/min/max) high cost.
+ return true;
+}
+
+namespace {
+
+class LSRUse;
+
+} // end anonymous namespace
+
+/// Check if the addressing mode defined by \p F is completely
+/// folded in \p LU at isel time.
+/// This includes address-mode folding and special icmp tricks.
+/// This function returns true if \p LU can accommodate what \p F
+/// defines and up to 1 base + 1 scaled + offset.
+/// In other words, if \p F has several base registers, this function may
+/// still return true. Therefore, users still need to account for
+/// additional base registers and/or unfolded offsets to derive an
+/// accurate cost model.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F);
+
+// Get the cost of the scaling factor used in F for LU.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
+
+namespace {
+
+/// This class is used to measure and compare candidate formulae.
+class Cost {
+ const Loop *L = nullptr;
+ ScalarEvolution *SE = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
+ TargetTransformInfo::LSRCost C;
+
+public:
+ Cost() = delete;
+ Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
+ L(L), SE(&SE), TTI(&TTI) {
+ C.Insns = 0;
+ C.NumRegs = 0;
+ C.AddRecCost = 0;
+ C.NumIVMuls = 0;
+ C.NumBaseAdds = 0;
+ C.ImmCost = 0;
+ C.SetupCost = 0;
+ C.ScaleCost = 0;
+ }
+
+ bool isLess(Cost &Other);
+
+ void Lose();
+
+#ifndef NDEBUG
+ // Once any of the metrics loses, they must all remain losers.
+ bool isValid() {
+ return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
+ | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+ || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
+ & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+ }
+#endif
+
+ bool isLoser() {
+ assert(isValid() && "invalid cost");
+ return C.NumRegs == ~0u;
+ }
+
+ void RateFormula(const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+private:
+ void RateRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs);
+ void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs);
+};
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+ /// The instruction which will be updated.
+ Instruction *UserInst = nullptr;
+
+ /// The operand of the instruction which will be replaced. The operand may be
+ /// used more than once; every instance will be replaced.
+ Value *OperandValToReplace = nullptr;
+
+ /// If this user is to use the post-incremented value of an induction
+ /// variable, this set is non-empty and holds the loops associated with the
+ /// induction variable.
+ PostIncLoopSet PostIncLoops;
+
+ /// A constant offset to be added to the LSRUse expression. This allows
+ /// multiple fixups to share the same LSRUse with different offsets, for
+ /// example in an unrolled loop.
+ int64_t Offset = 0;
+
+ LSRFixup() = default;
+
+ bool isUseFullyOutsideLoop(const Loop *L) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
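+  // The sentinel keys below are bit patterns that can never be real SCEV
+  // pointers, which is all DenseMap requires of empty and tombstone keys.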
+ static SmallVector<const SCEV *, 4> getEmptyKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-1));
+ return V;
+ }
+
+ static SmallVector<const SCEV *, 4> getTombstoneKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-2));
+ return V;
+ }
+
+ static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+ const SmallVector<const SCEV *, 4> &RHS) {
+ return LHS == RHS;
+ }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied. TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+ DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+ /// An enum for a kind of use, indicating what types of scaled and immediate
+ /// operands it might support.
+ enum KindType {
+ Basic, ///< A normal use, with no folding.
+ Special, ///< A special case of basic, allowing -1 scales.
+ Address, ///< An address use; folding according to TargetLowering
+ ICmpZero ///< An equality icmp with both operands folded into one.
+ // TODO: Add a generic icmp too?
+ };
+
+ using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
+
+ KindType Kind;
+ MemAccessTy AccessTy;
+
+ /// The list of operands which are to be replaced.
+ SmallVector<LSRFixup, 8> Fixups;
+
+ /// Keep track of the min and max offsets of the fixups.
+ int64_t MinOffset = std::numeric_limits<int64_t>::max();
+ int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+
+ /// This records whether all of the fixups using this LSRUse are outside of
+ /// the loop, in which case some special-case heuristics may be used.
+ bool AllFixupsOutsideLoop = true;
+
+ /// RigidFormula is set to true to guarantee that this use will be associated
+ /// with a single formula--the one that initially matched. Some SCEV
+ /// expressions cannot be expanded. This allows LSR to consider the registers
+ /// used by those expressions without the need to expand them later after
+ /// changing the formula.
+ bool RigidFormula = false;
+
+ /// This records the widest use type for any fixup using this
+ /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+ /// fixup widths to be equivalent, because the narrower one may be relying on
+ /// the implicit truncation to truncate away bogus bits.
+ Type *WidestFixupType = nullptr;
+
+ /// A list of ways to build a value that can satisfy this user. After the
+ /// list is populated, one of these is selected heuristically and used to
+ /// formulate a replacement for OperandValToReplace in UserInst.
+ SmallVector<Formula, 12> Formulae;
+
+ /// The set of register candidates used by all formulae in this LSRUse.
+ SmallPtrSet<const SCEV *, 4> Regs;
+
+ LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
+
+ LSRFixup &getNewFixup() {
+ Fixups.push_back(LSRFixup());
+ return Fixups.back();
+ }
+
+ void pushFixup(LSRFixup &f) {
+ Fixups.push_back(f);
+ if (f.Offset > MaxOffset)
+ MaxOffset = f.Offset;
+ if (f.Offset < MinOffset)
+ MinOffset = f.Offset;
+ }
+
+ bool HasFormulaWithSameRegs(const Formula &F) const;
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
+ void DeleteFormula(Formula &F);
+ void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup = nullptr);
+
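+/// Roughly estimate how many instructions it would take to materialize Reg
+/// outside the loop: leaf SCEVs (constants and unknowns) count as one, and
+/// compound nodes recurse into their operands, bounded by the given Depth.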
+static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
+ if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
+ return 1;
+ if (Depth == 0)
+ return 0;
+ if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
+ return getSetupCost(S->getStart(), Depth - 1);
if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
- return getSetupCost(S->getOperand(), Depth - 1);
- if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
- return std::accumulate(S->op_begin(), S->op_end(), 0,
- [&](unsigned i, const SCEV *Reg) {
- return i + getSetupCost(Reg, Depth - 1);
- });
- if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
- return getSetupCost(S->getLHS(), Depth - 1) +
- getSetupCost(S->getRHS(), Depth - 1);
- return 0;
-}
-
-/// Tally up interesting quantities from the given register.
-void Cost::RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs) {
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
- // If this is an addrec for another loop, it should be an invariant
- // with respect to L since L is the innermost loop (at least
- // for now LSR only handles innermost loops).
- if (AR->getLoop() != L) {
-      // If the AddRec exists, consider its register free and leave it alone.
- if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
- return;
-
-      // It is bad to allow LSR for the current loop to add induction variables
-      // for its sibling loops.
- if (!AR->getLoop()->contains(L)) {
- Lose();
- return;
- }
-
- // Otherwise, it will be an invariant with respect to Loop L.
- ++C.NumRegs;
- return;
- }
-
- unsigned LoopCost = 1;
- if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
- TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
-
- // If the step size matches the base offset, we could use pre-indexed
- // addressing.
- if (TTI->shouldFavorBackedgeIndex(L)) {
- if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
- if (Step->getAPInt() == F.BaseOffset)
- LoopCost = 0;
- }
-
- if (TTI->shouldFavorPostInc()) {
- const SCEV *LoopStep = AR->getStepRecurrence(*SE);
- if (isa<SCEVConstant>(LoopStep)) {
- const SCEV *LoopStart = AR->getStart();
- if (!isa<SCEVConstant>(LoopStart) &&
- SE->isLoopInvariant(LoopStart, L))
- LoopCost = 0;
- }
- }
- }
- C.AddRecCost += LoopCost;
-
- // Add the step value register, if it needs one.
- // TODO: The non-affine case isn't precisely modeled here.
- if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
- if (!Regs.count(AR->getOperand(1))) {
- RateRegister(F, AR->getOperand(1), Regs);
- if (isLoser())
- return;
- }
- }
- }
- ++C.NumRegs;
-
- // Rough heuristic; favor registers which don't require extra setup
- // instructions in the preheader.
- C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
-  // Ensure we don't, even with the recursion limit, produce invalid costs.
- C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
-
- C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
- SE->hasComputableLoopEvolution(Reg, L);
-}
-
-/// Record this register in the set. If we haven't seen it before, rate
-/// it. Optional LoserRegs provides a way to declare any formula that refers to
-/// one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs,
- SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- if (LoserRegs && LoserRegs->count(Reg)) {
- Lose();
- return;
- }
- if (Regs.insert(Reg).second) {
- RateRegister(F, Reg, Regs);
- if (LoserRegs && isLoser())
- LoserRegs->insert(Reg);
- }
-}
-
-void Cost::RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
- SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
- // Tally up the registers.
- unsigned PrevAddRecCost = C.AddRecCost;
- unsigned PrevNumRegs = C.NumRegs;
- unsigned PrevNumBaseAdds = C.NumBaseAdds;
- if (const SCEV *ScaledReg = F.ScaledReg) {
- if (VisitedRegs.count(ScaledReg)) {
- Lose();
- return;
- }
- RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
- if (isLoser())
- return;
- }
- for (const SCEV *BaseReg : F.BaseRegs) {
- if (VisitedRegs.count(BaseReg)) {
- Lose();
- return;
- }
- RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
- if (isLoser())
- return;
- }
-
- // Determine how many (unfolded) adds we'll need inside the loop.
- size_t NumBaseParts = F.getNumRegs();
- if (NumBaseParts > 1)
-    // Do not count the base and a possible second register if the target
-    // allows folding two registers.
- C.NumBaseAdds +=
- NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
- C.NumBaseAdds += (F.UnfoldedOffset != 0);
-
- // Accumulate non-free scaling amounts.
- C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
-
- // Tally up the non-zero immediates.
- for (const LSRFixup &Fixup : LU.Fixups) {
- int64_t O = Fixup.Offset;
- int64_t Offset = (uint64_t)O + F.BaseOffset;
- if (F.BaseGV)
- C.ImmCost += 64; // Handle symbolic values conservatively.
- // TODO: This should probably be the pointer size.
- else if (Offset != 0)
- C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
-
- // Check with target if this offset with this instruction is
- // specifically not supported.
- if (LU.Kind == LSRUse::Address && Offset != 0 &&
- !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
- Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
- C.NumBaseAdds++;
- }
-
- // If we don't count instruction cost exit here.
- if (!InsnsCost) {
- assert(isValid() && "invalid cost");
- return;
- }
-
-  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
-  // an additional instruction (at least a fill).
-  // TODO: Need to distinguish register classes?
- unsigned TTIRegNum = TTI->getNumberOfRegisters(
- TTI->getRegisterClassForType(false, F.getType())) - 1;
- if (C.NumRegs > TTIRegNum) {
-    // The cost already exceeded TTIRegNum, so only the newly added registers
-    // can add new instructions.
- if (PrevNumRegs > TTIRegNum)
- C.Insns += (C.NumRegs - PrevNumRegs);
- else
- C.Insns += (C.NumRegs - TTIRegNum);
- }
-
-  // If an ICmpZero formula does not end in 0, it cannot be replaced by just an
-  // add or sub; we'll need to compare the final result of the AddRec.
- // That means we'll need an additional instruction. But if the target can
- // macro-fuse a compare with a branch, don't count this extra instruction.
- // For -10 + {0, +, 1}:
- // i = i + 1;
- // cmp i, 10
- //
- // For {-10, +, 1}:
- // i = i + 1;
- if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
- !TTI->canMacroFuseCmp())
- C.Insns++;
- // Each new AddRec adds 1 instruction to calculation.
- C.Insns += (C.AddRecCost - PrevAddRecCost);
-
- // BaseAdds adds instructions for unfolded registers.
- if (LU.Kind != LSRUse::ICmpZero)
- C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
- assert(isValid() && "invalid cost");
-}
-
-/// Set this cost to a losing value.
-void Cost::Lose() {
- C.Insns = std::numeric_limits<unsigned>::max();
- C.NumRegs = std::numeric_limits<unsigned>::max();
- C.AddRecCost = std::numeric_limits<unsigned>::max();
- C.NumIVMuls = std::numeric_limits<unsigned>::max();
- C.NumBaseAdds = std::numeric_limits<unsigned>::max();
- C.ImmCost = std::numeric_limits<unsigned>::max();
- C.SetupCost = std::numeric_limits<unsigned>::max();
- C.ScaleCost = std::numeric_limits<unsigned>::max();
-}
-
-/// Choose the lower cost.
-bool Cost::isLess(Cost &Other) {
- if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
- C.Insns != Other.C.Insns)
- return C.Insns < Other.C.Insns;
- return TTI->isLSRCostLess(C, Other.C);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Cost::print(raw_ostream &OS) const {
- if (InsnsCost)
- OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
- OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
- if (C.AddRecCost != 0)
- OS << ", with addrec cost " << C.AddRecCost;
- if (C.NumIVMuls != 0)
- OS << ", plus " << C.NumIVMuls << " IV mul"
- << (C.NumIVMuls == 1 ? "" : "s");
- if (C.NumBaseAdds != 0)
- OS << ", plus " << C.NumBaseAdds << " base add"
- << (C.NumBaseAdds == 1 ? "" : "s");
- if (C.ScaleCost != 0)
- OS << ", plus " << C.ScaleCost << " scale cost";
- if (C.ImmCost != 0)
- OS << ", plus " << C.ImmCost << " imm cost";
- if (C.SetupCost != 0)
- OS << ", plus " << C.SetupCost << " setup cost";
-}
-
-LLVM_DUMP_METHOD void Cost::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Test whether this fixup always uses its value outside of the given loop.
-bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
- // PHI nodes use their value in their incoming blocks.
- if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == OperandValToReplace &&
- L->contains(PN->getIncomingBlock(i)))
- return false;
- return true;
- }
-
- return !L->contains(UserInst);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LSRFixup::print(raw_ostream &OS) const {
- OS << "UserInst=";
- // Store is common and interesting enough to be worth special-casing.
- if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
- OS << "store ";
- Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
- } else if (UserInst->getType()->isVoidTy())
- OS << UserInst->getOpcodeName();
- else
- UserInst->printAsOperand(OS, /*PrintType=*/false);
-
- OS << ", OperandValToReplace=";
- OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
-
- for (const Loop *PIL : PostIncLoops) {
- OS << ", PostIncLoop=";
- PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
- }
-
- if (Offset != 0)
- OS << ", Offset=" << Offset;
-}
-
-LLVM_DUMP_METHOD void LSRFixup::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Test whether this use has a formula with the same registers as the given
-/// formula.
-bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
- SmallVector<const SCEV *, 4> Key = F.BaseRegs;
- if (F.ScaledReg) Key.push_back(F.ScaledReg);
- // Unstable sort by host order ok, because this is only used for uniquifying.
- llvm::sort(Key);
- return Uniquifier.count(Key);
-}
-
-/// Return the probability of selecting a formula that does not reference Reg
-/// (e.g., 0.75 when exactly one of four formulae references Reg).
-float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
- unsigned FNum = 0;
- for (const Formula &F : Formulae)
- if (F.referencesReg(Reg))
- FNum++;
- return ((float)(Formulae.size() - FNum)) / Formulae.size();
-}
-
-/// If the given formula has not yet been inserted, add it to the list, and
-/// return true. Return false otherwise. The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
- assert(F.isCanonical(L) && "Invalid canonical representation");
-
- if (!Formulae.empty() && RigidFormula)
- return false;
-
- SmallVector<const SCEV *, 4> Key = F.BaseRegs;
- if (F.ScaledReg) Key.push_back(F.ScaledReg);
- // Unstable sort by host order ok, because this is only used for uniquifying.
- llvm::sort(Key);
-
- if (!Uniquifier.insert(Key).second)
- return false;
-
- // Using a register to hold the value of 0 is not profitable.
- assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
- "Zero allocated in a scaled register!");
-#ifndef NDEBUG
- for (const SCEV *BaseReg : F.BaseRegs)
- assert(!BaseReg->isZero() && "Zero allocated in a base register!");
-#endif
-
- // Add the formula to the list.
- Formulae.push_back(F);
-
- // Record registers now being used by this use.
- Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
- if (F.ScaledReg)
- Regs.insert(F.ScaledReg);
-
- return true;
-}
-
-/// Remove the given formula from this use's list.
-void LSRUse::DeleteFormula(Formula &F) {
- if (&F != &Formulae.back())
- std::swap(F, Formulae.back());
- Formulae.pop_back();
-}
-
-/// Recompute the Regs field, and update RegUses.
-void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
- // Now that we've filtered out some formulae, recompute the Regs set.
- SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
- Regs.clear();
- for (const Formula &F : Formulae) {
- if (F.ScaledReg) Regs.insert(F.ScaledReg);
- Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
- }
-
- // Update the RegTracker.
- for (const SCEV *S : OldRegs)
- if (!Regs.count(S))
- RegUses.dropRegister(S, LUIdx);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LSRUse::print(raw_ostream &OS) const {
- OS << "LSR Use: Kind=";
- switch (Kind) {
- case Basic: OS << "Basic"; break;
- case Special: OS << "Special"; break;
- case ICmpZero: OS << "ICmpZero"; break;
- case Address:
- OS << "Address of ";
- if (AccessTy.MemTy->isPointerTy())
- OS << "pointer"; // the full pointer type could be really verbose
- else {
- OS << *AccessTy.MemTy;
- }
-
- OS << " in addrspace(" << AccessTy.AddrSpace << ')';
- }
-
- OS << ", Offsets={";
- bool NeedComma = false;
- for (const LSRFixup &Fixup : Fixups) {
- if (NeedComma) OS << ',';
- OS << Fixup.Offset;
- NeedComma = true;
- }
- OS << '}';
-
- if (AllFixupsOutsideLoop)
- OS << ", all-fixups-outside-loop";
-
- if (WidestFixupType)
- OS << ", widest fixup type: " << *WidestFixupType;
-}
-
-LLVM_DUMP_METHOD void LSRUse::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg, int64_t Scale,
- Instruction *Fixup/*= nullptr*/) {
- switch (Kind) {
- case LSRUse::Address:
- return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
- HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
-
- case LSRUse::ICmpZero:
- // There's not even a target hook for querying whether it would be legal to
- // fold a GV into an ICmp.
- if (BaseGV)
- return false;
-
- // ICmp only has two operands; don't allow more than two non-trivial parts.
- if (Scale != 0 && HasBaseReg && BaseOffset != 0)
- return false;
-
- // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
- // putting the scaled register in the other operand of the icmp.
- if (Scale != 0 && Scale != -1)
- return false;
-
- // If we have low-level target information, ask the target if it can fold an
- // integer immediate on an icmp.
- if (BaseOffset != 0) {
- // We have one of:
- // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
- // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
- // Offs is the ICmp immediate.
- if (Scale == 0)
- // The cast does the right thing with
- // std::numeric_limits<int64_t>::min().
- BaseOffset = -(uint64_t)BaseOffset;
- return TTI.isLegalICmpImmediate(BaseOffset);
- }
-
- // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
- return true;
-
- case LSRUse::Basic:
- // Only handle single-register values.
- return !BaseGV && Scale == 0 && BaseOffset == 0;
-
- case LSRUse::Special:
- // Special case Basic to handle -1 scales.
- return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
- }
-
- llvm_unreachable("Invalid LSRUse Kind!");
-}
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- int64_t MinOffset, int64_t MaxOffset,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg, int64_t Scale) {
- // Check for overflow.
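-  // (Adding MinOffset/MaxOffset must move the sum in the direction of the
-  // offset's sign; if it does not, the signed addition wrapped.)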
- if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
- (MinOffset > 0))
- return false;
- MinOffset = (uint64_t)BaseOffset + MinOffset;
- if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
- (MaxOffset > 0))
- return false;
- MaxOffset = (uint64_t)BaseOffset + MaxOffset;
-
- return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
- HasBaseReg, Scale) &&
- isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
- HasBaseReg, Scale);
-}
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- int64_t MinOffset, int64_t MaxOffset,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- const Formula &F, const Loop &L) {
- // For the purpose of isAMCompletelyFolded either having a canonical formula
- // or a scale not equal to zero is correct.
-  // Problems may arise from non-canonical formulae having a scale == 0.
-  // Strictly speaking, it would be best to just rely on canonical formulae.
-  // However, when we generate the scaled formulae, we first check that the
-  // scaling factor is profitable before computing the actual ScaledReg, for
-  // compile time's sake.
- assert((F.isCanonical(L) || F.Scale != 0));
- return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
- F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
-}
-
-/// Test whether we know how to expand the current formula.
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
- MemAccessTy AccessTy, GlobalValue *BaseGV,
- int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
- // We know how to expand completely foldable formulae.
- return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
- BaseOffset, HasBaseReg, Scale) ||
- // Or formulae that use a base register produced by a sum of base
- // registers.
- (Scale == 1 &&
- isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
- BaseGV, BaseOffset, true, 0));
-}
-
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
- MemAccessTy AccessTy, const Formula &F) {
- return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
- F.BaseOffset, F.HasBaseReg, F.Scale);
-}
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F) {
- // Target may want to look at the user instructions.
- if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
- for (const LSRFixup &Fixup : LU.Fixups)
- if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
- (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
- F.Scale, Fixup.UserInst))
- return false;
- return true;
- }
-
- return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
- F.Scale);
-}
-
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F,
- const Loop &L) {
- if (!F.Scale)
- return 0;
-
- // If the use is not completely folded in that instruction, we will have to
- // pay an extra cost only for scale != 1.
- if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F, L))
- return F.Scale != 1;
-
- switch (LU.Kind) {
- case LSRUse::Address: {
- // Check the scaling factor cost with both the min and max offsets.
- int ScaleCostMinOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
- F.Scale, LU.AccessTy.AddrSpace);
- int ScaleCostMaxOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
- F.Scale, LU.AccessTy.AddrSpace);
-
- assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
- "Legal addressing mode has an illegal cost!");
- return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
- }
- case LSRUse::ICmpZero:
- case LSRUse::Basic:
- case LSRUse::Special:
- // The use is completely folded, i.e., everything is folded into the
- // instruction.
- return 0;
- }
-
- llvm_unreachable("Invalid LSRUse Kind!");
-}
-
-static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg) {
- // Fast-path: zero is always foldable.
- if (BaseOffset == 0 && !BaseGV) return true;
-
- // Conservatively, create an address with an immediate and a
- // base and a scale.
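-  // (The -1 scale for ICmpZero reflects that a scaled register can be folded
-  // by moving it, negated, to the other operand of the icmp.)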
- int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
-
- // Canonicalize a scale of 1 to a base register if the formula doesn't
- // already have a base register.
- if (!HasBaseReg && Scale == 1) {
- Scale = 0;
- HasBaseReg = true;
- }
-
- return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
- HasBaseReg, Scale);
-}
-
-static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
- ScalarEvolution &SE, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
- MemAccessTy AccessTy, const SCEV *S,
- bool HasBaseReg) {
- // Fast-path: zero is always foldable.
- if (S->isZero()) return true;
-
- // Conservatively, create an address with an immediate and a
- // base and a scale.
- int64_t BaseOffset = ExtractImmediate(S, SE);
- GlobalValue *BaseGV = ExtractSymbol(S, SE);
-
- // If there's anything else involved, it's not foldable.
- if (!S->isZero()) return false;
-
- // Fast-path: zero is always foldable.
- if (BaseOffset == 0 && !BaseGV) return true;
-
- // Conservatively, create an address with an immediate and a
- // base and a scale.
- int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
-
- return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
- BaseOffset, HasBaseReg, Scale);
-}
-
-namespace {
-
-/// An individual increment in a Chain of IV increments. Relate an IV user to
-/// an expression that computes the IV it uses from the IV used by the previous
-/// link in the Chain.
-///
-/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
-/// original IVOperand. The head of the chain's IVOperand is only valid during
-/// chain collection, before LSR replaces IV users. During chain generation,
-/// IncExpr can be used to find the new IVOperand that computes the same
-/// expression.
-struct IVInc {
- Instruction *UserInst;
- Value* IVOperand;
- const SCEV *IncExpr;
-
- IVInc(Instruction *U, Value *O, const SCEV *E)
- : UserInst(U), IVOperand(O), IncExpr(E) {}
-};
-
-// The list of IV increments in program order. We typically add the head of a
-// chain without finding subsequent links.
-struct IVChain {
- SmallVector<IVInc, 1> Incs;
- const SCEV *ExprBase = nullptr;
-
- IVChain() = default;
- IVChain(const IVInc &Head, const SCEV *Base)
- : Incs(1, Head), ExprBase(Base) {}
-
- using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
-
- // Return the first increment in the chain.
- const_iterator begin() const {
- assert(!Incs.empty());
- return std::next(Incs.begin());
- }
- const_iterator end() const {
- return Incs.end();
- }
-
- // Returns true if this chain contains any increments.
- bool hasIncs() const { return Incs.size() >= 2; }
-
- // Add an IVInc to the end of this chain.
- void add(const IVInc &X) { Incs.push_back(X); }
-
- // Returns the last UserInst in the chain.
- Instruction *tailUserInst() const { return Incs.back().UserInst; }
-
- // Returns true if IncExpr can be profitably added to this chain.
- bool isProfitableIncrement(const SCEV *OperExpr,
- const SCEV *IncExpr,
- ScalarEvolution&);
-};
-
-/// Helper for CollectChains to track multiple IV increment uses. Distinguish
-/// between FarUsers that definitely cross IV increments and NearUsers that may
-/// be used between IV increments.
-struct ChainUsers {
- SmallPtrSet<Instruction*, 4> FarUsers;
- SmallPtrSet<Instruction*, 4> NearUsers;
-};
-
-/// This class holds state for the main loop strength reduction logic.
-class LSRInstance {
- IVUsers &IU;
- ScalarEvolution &SE;
- DominatorTree &DT;
- LoopInfo &LI;
- AssumptionCache &AC;
- TargetLibraryInfo &TLI;
- const TargetTransformInfo &TTI;
- Loop *const L;
- MemorySSAUpdater *MSSAU;
- bool FavorBackedgeIndex = false;
- bool Changed = false;
-
- /// This is the insert position at which the current loop's induction variable
- /// increment should be placed. In simple loops, this is the latch block's
- /// terminator. But in more complicated cases, this is a position which will
- /// dominate all the in-loop post-increment users.
- Instruction *IVIncInsertPos = nullptr;
-
- /// Interesting factors between use strides.
- ///
- /// We explicitly use a SetVector which contains a SmallSet, instead of the
- /// default, a SmallDenseSet, because we need to use the full range of
- /// int64_ts, and there's currently no good way of doing that with
- /// SmallDenseSet.
- SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
-
- /// Interesting use types, to facilitate truncation reuse.
- SmallSetVector<Type *, 4> Types;
-
- /// The list of interesting uses.
- mutable SmallVector<LSRUse, 16> Uses;
-
- /// Track which uses use which register candidates.
- RegUseTracker RegUses;
-
- // Limit the number of chains to avoid quadratic behavior. We don't expect to
- // have more than a few IV increment chains in a loop. Missing a Chain falls
- // back to normal LSR behavior for those uses.
- static const unsigned MaxChains = 8;
-
- /// IV users can form a chain of IV increments.
- SmallVector<IVChain, MaxChains> IVChainVec;
-
- /// IV users that belong to profitable IVChains.
- SmallPtrSet<Use*, MaxChains> IVIncSet;
-
- void OptimizeShadowIV();
- bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
- ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
- void OptimizeLoopTermCond();
-
- void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
- SmallVectorImpl<ChainUsers> &ChainUsersVec);
- void FinalizeChain(IVChain &Chain);
- void CollectChains();
- void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts);
-
- void CollectInterestingTypesAndFactors();
- void CollectFixupsAndInitialFormulae();
-
- // Support for sharing of LSRUses between LSRFixups.
- using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
- UseMapTy UseMap;
-
- bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
- LSRUse::KindType Kind, MemAccessTy AccessTy);
-
- std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
- MemAccessTy AccessTy);
-
- void DeleteUse(LSRUse &LU, size_t LUIdx);
-
- LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
-
- void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
- void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
- void CountRegisters(const Formula &F, size_t LUIdx);
- bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
-
- void CollectLoopInvariantFixupsAndFormulae();
-
- void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
- unsigned Depth = 0);
-
- void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base, unsigned Depth,
- size_t Idx, bool IsScaledReg = false);
- void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base, size_t Idx,
- bool IsScaledReg = false);
- void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base,
- const SmallVectorImpl<int64_t> &Worklist,
- size_t Idx, bool IsScaledReg = false);
- void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateCrossUseConstantOffsets();
- void GenerateAllReuseFormulae();
-
- void FilterOutUndesirableDedicatedRegisters();
-
- size_t EstimateSearchSpaceComplexity() const;
- void NarrowSearchSpaceByDetectingSupersets();
- void NarrowSearchSpaceByCollapsingUnrolledCode();
- void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
- void NarrowSearchSpaceByFilterPostInc();
- void NarrowSearchSpaceByDeletingCostlyFormulas();
- void NarrowSearchSpaceByPickingWinnerRegs();
- void NarrowSearchSpaceUsingHeuristics();
-
- void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
- Cost &SolutionCost,
- SmallVectorImpl<const Formula *> &Workspace,
- const Cost &CurCost,
- const SmallPtrSet<const SCEV *, 16> &CurRegs,
- DenseSet<const SCEV *> &VisitedRegs) const;
- void Solve(SmallVectorImpl<const Formula *> &Solution) const;
-
- BasicBlock::iterator
- HoistInsertPosition(BasicBlock::iterator IP,
- const SmallVectorImpl<Instruction *> &Inputs) const;
- BasicBlock::iterator
- AdjustInsertPositionForExpand(BasicBlock::iterator IP,
- const LSRFixup &LF,
- const LSRUse &LU,
- SCEVExpander &Rewriter) const;
-
- Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- BasicBlock::iterator IP, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
- void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
- void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
- void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
-
-public:
- LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
- LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
- TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
-
- bool getChanged() const { return Changed; }
-
- void print_factors_and_types(raw_ostream &OS) const;
- void print_fixups(raw_ostream &OS) const;
- void print_uses(raw_ostream &OS) const;
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-/// If IV is used in an int-to-float cast inside the loop then try to eliminate
-/// the cast operation.
-void LSRInstance::OptimizeShadowIV() {
- const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
- return;
-
- for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
- UI != E; /* empty */) {
- IVUsers::const_iterator CandidateUI = UI;
- ++UI;
- Instruction *ShadowUse = CandidateUI->getUser();
- Type *DestTy = nullptr;
- bool IsSigned = false;
-
- /* If shadow use is an int->float cast then insert a second IV
- to eliminate this cast.
-
- for (unsigned i = 0; i < n; ++i)
- foo((double)i);
-
- is transformed into
-
- double d = 0.0;
- for (unsigned i = 0; i < n; ++i, ++d)
- foo(d);
- */
- if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
- IsSigned = false;
- DestTy = UCast->getDestTy();
- }
- else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
- IsSigned = true;
- DestTy = SCast->getDestTy();
- }
- if (!DestTy) continue;
-
- // If target does not support DestTy natively then do not apply
- // this transformation.
- if (!TTI.isTypeLegal(DestTy)) continue;
-
- PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
- if (!PH) continue;
- if (PH->getNumIncomingValues() != 2) continue;
-
- // If the calculation in integers overflows, the result in FP type will
- // differ. So we can only do this transformation if we are guaranteed not to
- // deal with overflowing values.
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
- if (!AR) continue;
- if (IsSigned && !AR->hasNoSignedWrap()) continue;
- if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
-
- Type *SrcTy = PH->getType();
- int Mantissa = DestTy->getFPMantissaWidth();
- if (Mantissa == -1) continue;
- if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
- continue;
-
- unsigned Entry, Latch;
- if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
- Entry = 0;
- Latch = 1;
- } else {
- Entry = 1;
- Latch = 0;
- }
-
- ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
- if (!Init) continue;
- Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
- (double)Init->getSExtValue() :
- (double)Init->getZExtValue());
-
- BinaryOperator *Incr =
- dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
- if (!Incr) continue;
- if (Incr->getOpcode() != Instruction::Add
- && Incr->getOpcode() != Instruction::Sub)
- continue;
-
- /* Initialize new IV, double d = 0.0 in above example. */
- ConstantInt *C = nullptr;
- if (Incr->getOperand(0) == PH)
- C = dyn_cast<ConstantInt>(Incr->getOperand(1));
- else if (Incr->getOperand(1) == PH)
- C = dyn_cast<ConstantInt>(Incr->getOperand(0));
- else
- continue;
-
- if (!C) continue;
-
- // Ignore negative constants, as the code below doesn't handle them
- // correctly. TODO: Remove this restriction.
- if (!C->getValue().isStrictlyPositive()) continue;
-
- /* Add new PHINode. */
- PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
-
- /* create new increment. '++d' in above example. */
- Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
- BinaryOperator *NewIncr =
- BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
- Instruction::FAdd : Instruction::FSub,
- NewPH, CFP, "IV.S.next.", Incr);
-
- NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
- NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
-
- /* Remove cast operation */
- ShadowUse->replaceAllUsesWith(NewPH);
- ShadowUse->eraseFromParent();
- Changed = true;
- break;
- }
-}
-
-/// If Cond has an operand that is an expression of an IV, set the IV user and
-/// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
- for (IVStrideUse &U : IU)
- if (U.getUser() == Cond) {
- // NOTE: we could handle setcc instructions with multiple uses here, but
- // InstCombine does it as well for simple uses, it's not clear that it
- // occurs enough in real life to handle.
- CondUse = &U;
- return true;
- }
- return false;
-}
-
-/// Rewrite the loop's terminating condition if it uses a max computation.
-///
-/// This is a narrow solution to a specific, but acute, problem. For loops
-/// like this:
-///
-/// i = 0;
-/// do {
-/// p[i] = 0.0;
-/// } while (++i < n);
-///
-/// the trip count isn't just 'n', because 'n' might not be positive. And
-/// unfortunately this can come up even for loops where the user didn't use
-/// a C do-while loop. For example, seemingly well-behaved top-test loops
-/// will commonly be lowered like this:
-///
-/// if (n > 0) {
-/// i = 0;
-/// do {
-/// p[i] = 0.0;
-/// } while (++i < n);
-/// }
-///
-/// and then it's possible for subsequent optimization to obscure the if
-/// test in such a way that indvars can't find it.
-///
-/// When indvars can't find the if test in loops like this, it creates a
-/// max expression, which allows it to give the loop a canonical
-/// induction variable:
-///
-/// i = 0;
-/// max = n < 1 ? 1 : n;
-/// do {
-/// p[i] = 0.0;
-/// } while (++i != max);
-///
-/// Canonical induction variables are necessary because the loop passes
-/// are designed around them. The most obvious example of this is the
-/// LoopInfo analysis, which doesn't remember trip count values. It
-/// expects to be able to rediscover the trip count each time it is
-/// needed, and it does this using a simple analysis that only succeeds if
-/// the loop has a canonical induction variable.
-///
-/// However, when it comes time to generate code, the maximum operation
-/// can be quite costly, especially if it's inside of an outer loop.
-///
-/// This function solves this problem by detecting this type of loop and
-/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
-/// the instructions for the maximum computation.
-ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
- // Check that the loop matches the pattern we're looking for.
- if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
- Cond->getPredicate() != CmpInst::ICMP_NE)
- return Cond;
-
- SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
- if (!Sel || !Sel->hasOneUse()) return Cond;
-
- const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
- return Cond;
- const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
-
- // Add one to the backedge-taken count to get the trip count.
- const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
- if (IterationCount != SE.getSCEV(Sel)) return Cond;
-
- // Check for a max calculation that matches the pattern. There's no check
- // for ICMP_ULE here because the comparison would be with zero, which
- // isn't interesting.
- CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
- const SCEVNAryExpr *Max = nullptr;
- if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
- Pred = ICmpInst::ICMP_SLE;
- Max = S;
- } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
- Pred = ICmpInst::ICMP_SLT;
- Max = S;
- } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
- Pred = ICmpInst::ICMP_ULT;
- Max = U;
- } else {
- // No match; bail.
- return Cond;
- }
-
- // To handle a max with more than two operands, this optimization would
- // require additional checking and setup.
- if (Max->getNumOperands() != 2)
- return Cond;
-
- const SCEV *MaxLHS = Max->getOperand(0);
- const SCEV *MaxRHS = Max->getOperand(1);
-
- // ScalarEvolution canonicalizes constants to the left. For < and >, look
- // for a comparison with 1. For <= and >=, a comparison with zero.
- if (!MaxLHS ||
- (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
- return Cond;
-
- // Check the relevant induction variable for conformance to
- // the pattern.
- const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
- if (!AR || !AR->isAffine() ||
- AR->getStart() != One ||
- AR->getStepRecurrence(SE) != One)
- return Cond;
-
- assert(AR->getLoop() == L &&
- "Loop condition operand is an addrec in a different loop!");
-
- // Check the right operand of the select, and remember it, as it will
- // be used in the new comparison instruction.
- Value *NewRHS = nullptr;
- if (ICmpInst::isTrueWhenEqual(Pred)) {
- // Look for n+1, and grab n.
- if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
- if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
- if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
- NewRHS = BO->getOperand(0);
- if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
- if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
- if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
- NewRHS = BO->getOperand(0);
- if (!NewRHS)
- return Cond;
- } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
- NewRHS = Sel->getOperand(1);
- else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
- NewRHS = Sel->getOperand(2);
- else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
- NewRHS = SU->getValue();
- else
- // Max doesn't match expected pattern.
- return Cond;
-
- // Determine the new comparison opcode. It may be signed or unsigned,
- // and the original comparison may be either equality or inequality.
- if (Cond->getPredicate() == CmpInst::ICMP_EQ)
- Pred = CmpInst::getInversePredicate(Pred);
-
- // Ok, everything looks ok to change the condition into an SLT or SGE and
- // delete the max calculation.
- ICmpInst *NewCond =
- new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
-
- // Delete the max calculation instructions.
- Cond->replaceAllUsesWith(NewCond);
- CondUse->setUser(NewCond);
- Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
- Cond->eraseFromParent();
- Sel->eraseFromParent();
- if (Cmp->use_empty())
- Cmp->eraseFromParent();
- return NewCond;
-}
-
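Read against the do-while example in the comment above: the exit test ++i != max with max = n < 1 ? 1 : n matches the signed-max pattern, n is recovered from the select as the new right-hand operand, the predicate stays SLT because the original compare was NE (only EQ is inverted), and the condition is rebuilt as ++i < n; the select and its feeding compare, now dead, are erased.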
-/// Change loop terminating condition to use the postinc iv when possible.
-void
-LSRInstance::OptimizeLoopTermCond() {
- SmallPtrSet<Instruction *, 4> PostIncs;
-
- // We need a different set of heuristics for rotated and non-rotated loops.
- // If a loop is rotated then the latch is also the backedge, so inserting
- // post-inc expressions just before the latch is ideal. To reduce live ranges
- // it also makes sense to rewrite terminating conditions to use post-inc
- // expressions.
- //
- // If the loop is not rotated then the latch is not a backedge; the latch
- // check is done in the loop head. Adding post-inc expressions before the
- // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
- // in the loop body. In this case we do *not* want to use post-inc expressions
- // in the latch check, and we want to insert post-inc expressions before
- // the backedge.
- BasicBlock *LatchBlock = L->getLoopLatch();
- SmallVector<BasicBlock*, 8> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
- return LatchBlock != BB;
- })) {
- // The backedge doesn't exit the loop; treat this as a head-tested loop.
- IVIncInsertPos = LatchBlock->getTerminator();
- return;
- }
-
- // Otherwise treat this as a rotated loop.
- for (BasicBlock *ExitingBlock : ExitingBlocks) {
- // Get the terminating condition for the loop if possible. If we
- // can, we want to change it to use a post-incremented version of its
- // induction variable, to allow coalescing the live ranges for the IV into
- // one register value.
-
- BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (!TermBr)
- continue;
- // FIXME: Overly conservative; the termination condition could be an 'or', etc.
- if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
- continue;
-
- // Search IVUsesByStride to find Cond's IVUse if there is one.
- IVStrideUse *CondUse = nullptr;
- ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
- if (!FindIVUserForCond(Cond, CondUse))
- continue;
-
- // If the trip count is computed in terms of a max (due to ScalarEvolution
- // being unable to find a sufficient guard, for example), change the loop
- // comparison to use SLT or ULT instead of NE.
- // One consequence of doing this now is that it disrupts the count-down
- // optimization. That's not always a bad thing though, because in such
- // cases it may still be worthwhile to avoid a max.
- Cond = OptimizeMax(Cond, CondUse);
-
- // If this exiting block dominates the latch block, it may also use
- // the post-inc value if it won't be shared with other uses.
- // Check for dominance.
- if (!DT.dominates(ExitingBlock, LatchBlock))
- continue;
-
- // Conservatively avoid trying to use the post-inc value in non-latch
- // exits if there may be pre-inc users in intervening blocks.
- if (LatchBlock != ExitingBlock)
- for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
- // Test if the use is reachable from the exiting block. This dominator
- // query is a conservative approximation of reachability.
- if (&*UI != CondUse &&
- !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
- // Conservatively assume there may be reuse if the quotient of their
- // strides could be a legal scale.
- const SCEV *A = IU.getStride(*CondUse, L);
- const SCEV *B = IU.getStride(*UI, L);
- if (!A || !B) continue;
- if (SE.getTypeSizeInBits(A->getType()) !=
- SE.getTypeSizeInBits(B->getType())) {
- if (SE.getTypeSizeInBits(A->getType()) >
- SE.getTypeSizeInBits(B->getType()))
- B = SE.getSignExtendExpr(B, A->getType());
- else
- A = SE.getSignExtendExpr(A, B->getType());
- }
- if (const SCEVConstant *D =
- dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
- const ConstantInt *C = D->getValue();
- // Stride of one or negative one can have reuse with non-addresses.
- if (C->isOne() || C->isMinusOne())
- goto decline_post_inc;
- // Avoid weird situations.
- if (C->getValue().getMinSignedBits() >= 64 ||
- C->getValue().isMinSignedValue())
- goto decline_post_inc;
- // Check for possible scaled-address reuse.
- if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
- MemAccessTy AccessTy = getAccessType(
- TTI, UI->getUser(), UI->getOperandValToReplace());
- int64_t Scale = C->getSExtValue();
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
- Scale = -Scale;
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
- << *Cond << '\n');
-
- // It's possible for the setcc instruction to be anywhere in the loop, and
- // possible for it to have multiple users. If it is not immediately before
- // the exiting block branch, move it.
- if (&*++BasicBlock::iterator(Cond) != TermBr) {
- if (Cond->hasOneUse()) {
- Cond->moveBefore(TermBr);
- } else {
- // Clone the terminating condition and insert into the loopend.
- ICmpInst *OldCond = Cond;
- Cond = cast<ICmpInst>(Cond->clone());
- Cond->setName(L->getHeader()->getName() + ".termcond");
- ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
-
- // Clone the IVUse, as the old use still exists!
- CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
- TermBr->replaceUsesOfWith(OldCond, Cond);
- }
- }
-
- // If we get to here, we know that we can transform the setcc instruction to
- // use the post-incremented version of the IV, allowing us to coalesce the
- // live ranges for the IV correctly.
- CondUse->transformToPostInc(L);
- Changed = true;
-
- PostIncs.insert(Cond);
- decline_post_inc:;
- }
-
- // Determine an insertion point for the loop induction variable increment. It
- // must dominate all the post-inc comparisons we just set up, and it must
- // dominate the loop latch edge.
- IVIncInsertPos = L->getLoopLatch()->getTerminator();
- for (Instruction *Inst : PostIncs) {
- BasicBlock *BB =
- DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
- Inst->getParent());
- if (BB == Inst->getParent())
- IVIncInsertPos = Inst;
- else if (BB != IVIncInsertPos->getParent())
- IVIncInsertPos = BB->getTerminator();
- }
-}
-
-/// Determine if the given use can accommodate a fixup at the given offset and
-/// other details. If so, update the use and return true.
-bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
- bool HasBaseReg, LSRUse::KindType Kind,
- MemAccessTy AccessTy) {
- int64_t NewMinOffset = LU.MinOffset;
- int64_t NewMaxOffset = LU.MaxOffset;
- MemAccessTy NewAccessTy = AccessTy;
-
- // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
- // something conservative; however, this can pessimize in the case that one of
- // the uses will have all its uses outside the loop, for example.
- if (LU.Kind != Kind)
- return false;
-
- // Check for a mismatched access type, and fall back conservatively as needed.
- // TODO: Be less conservative when the type is similar and can use the same
- // addressing modes.
- if (Kind == LSRUse::Address) {
- if (AccessTy.MemTy != LU.AccessTy.MemTy) {
- NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
- AccessTy.AddrSpace);
- }
- }
-
- // Conservatively assume HasBaseReg is true for now.
- if (NewOffset < LU.MinOffset) {
- if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
- LU.MaxOffset - NewOffset, HasBaseReg))
- return false;
- NewMinOffset = NewOffset;
- } else if (NewOffset > LU.MaxOffset) {
- if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
- NewOffset - LU.MinOffset, HasBaseReg))
- return false;
- NewMaxOffset = NewOffset;
- }
-
- // Update the use.
- LU.MinOffset = NewMinOffset;
- LU.MaxOffset = NewMaxOffset;
- LU.AccessTy = NewAccessTy;
- return true;
-}
-
-/// Return an LSRUse index and an offset value for a fixup which needs the given
-/// expression, with the given kind and optional access type. Either reuse an
-/// existing use or create a new one, as needed.
-std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
- LSRUse::KindType Kind,
- MemAccessTy AccessTy) {
- const SCEV *Copy = Expr;
- int64_t Offset = ExtractImmediate(Expr, SE);
-
- // Basic uses can't accept any offset, for example.
- if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
- Offset, /*HasBaseReg=*/ true)) {
- Expr = Copy;
- Offset = 0;
- }
-
- std::pair<UseMapTy::iterator, bool> P =
- UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
- if (!P.second) {
- // A use already existed with this base.
- size_t LUIdx = P.first->second;
- LSRUse &LU = Uses[LUIdx];
- if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
- // Reuse this use.
- return std::make_pair(LUIdx, Offset);
- }
-
- // Create a new use.
- size_t LUIdx = Uses.size();
- P.first->second = LUIdx;
- Uses.push_back(LSRUse(Kind, AccessTy));
- LSRUse &LU = Uses[LUIdx];
-
- LU.MinOffset = Offset;
- LU.MaxOffset = Offset;
- return std::make_pair(LUIdx, Offset);
-}
-
-/// Delete the given use from the Uses list.
-void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
- if (&LU != &Uses.back())
- std::swap(LU, Uses.back());
- Uses.pop_back();
-
- // Update RegUses.
- RegUses.swapAndDropUse(LUIdx, Uses.size());
-}
-
-/// Look for a use distinct from OrigLU which has a formula with the same
-/// registers as the given formula.
-LSRUse *
-LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
- const LSRUse &OrigLU) {
- // Search all uses for the formula. This could be more clever.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- // Check whether this use is close enough to OrigLU, to see whether it's
- // worthwhile looking through its formulae.
- // Ignore ICmpZero uses because they may contain formulae generated by
- // GenerateICmpZeroScales, in which case adding fixup offsets may
- // be invalid.
- if (&LU != &OrigLU &&
- LU.Kind != LSRUse::ICmpZero &&
- LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
- LU.WidestFixupType == OrigLU.WidestFixupType &&
- LU.HasFormulaWithSameRegs(OrigF)) {
- // Scan through this use's formulae.
- for (const Formula &F : LU.Formulae) {
- // Check to see if this formula has the same registers and symbols
- // as OrigF.
- if (F.BaseRegs == OrigF.BaseRegs &&
- F.ScaledReg == OrigF.ScaledReg &&
- F.BaseGV == OrigF.BaseGV &&
- F.Scale == OrigF.Scale &&
- F.UnfoldedOffset == OrigF.UnfoldedOffset) {
- if (F.BaseOffset == 0)
- return &LU;
- // This is the formula where all the registers and symbols matched;
- // there aren't going to be any others. Since we declined it, we
- // can skip the rest of the formulae and proceed to the next LSRUse.
- break;
- }
- }
- }
- }
-
- // Nothing looked good.
- return nullptr;
-}
-
-void LSRInstance::CollectInterestingTypesAndFactors() {
- SmallSetVector<const SCEV *, 4> Strides;
-
- // Collect interesting types and strides.
- SmallVector<const SCEV *, 4> Worklist;
- for (const IVStrideUse &U : IU) {
- const SCEV *Expr = IU.getExpr(U);
-
- // Collect interesting types.
- Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
-
- // Add strides for mentioned loops.
- Worklist.push_back(Expr);
- do {
- const SCEV *S = Worklist.pop_back_val();
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- if (AR->getLoop() == L)
- Strides.insert(AR->getStepRecurrence(SE));
- Worklist.push_back(AR->getStart());
- } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- Worklist.append(Add->op_begin(), Add->op_end());
- }
- } while (!Worklist.empty());
- }
-
- // Compute interesting factors from the set of interesting strides.
- for (SmallSetVector<const SCEV *, 4>::const_iterator
- I = Strides.begin(), E = Strides.end(); I != E; ++I)
- for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
- std::next(I); NewStrideIter != E; ++NewStrideIter) {
- const SCEV *OldStride = *I;
- const SCEV *NewStride = *NewStrideIter;
-
- if (SE.getTypeSizeInBits(OldStride->getType()) !=
- SE.getTypeSizeInBits(NewStride->getType())) {
- if (SE.getTypeSizeInBits(OldStride->getType()) >
- SE.getTypeSizeInBits(NewStride->getType()))
- NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
- else
- OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
- }
- if (const SCEVConstant *Factor =
- dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
- SE, true))) {
- if (Factor->getAPInt().getMinSignedBits() <= 64)
- Factors.insert(Factor->getAPInt().getSExtValue());
- } else if (const SCEVConstant *Factor =
- dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
- NewStride,
- SE, true))) {
- if (Factor->getAPInt().getMinSignedBits() <= 64)
- Factors.insert(Factor->getAPInt().getSExtValue());
- }
- }
-
- // If all uses use the same type, don't bother looking for truncation-based
- // reuse.
- if (Types.size() == 1)
- Types.clear();
-
- LLVM_DEBUG(print_factors_and_types(dbgs()));
-}
-
-/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
-/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
-/// IVStrideUses, we could partially skip this.
-static User::op_iterator
-findIVOperand(User::op_iterator OI, User::op_iterator OE,
- Loop *L, ScalarEvolution &SE) {
- for(; OI != OE; ++OI) {
- if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
- if (!SE.isSCEVable(Oper->getType()))
- continue;
-
- if (const SCEVAddRecExpr *AR =
- dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
- if (AR->getLoop() == L)
- break;
- }
- }
- }
- return OI;
-}
-
-/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
-/// a convenient helper.
-static Value *getWideOperand(Value *Oper) {
- if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
- return Trunc->getOperand(0);
- return Oper;
-}
-
-/// Return true if we allow an IV chain to include both types.
-static bool isCompatibleIVType(Value *LVal, Value *RVal) {
- Type *LType = LVal->getType();
- Type *RType = RVal->getType();
- return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
- // Different address spaces mean (possibly)
- // different types of the pointer implementation,
- // e.g. i16 vs i32 so disallow that.
- (LType->getPointerAddressSpace() ==
- RType->getPointerAddressSpace()));
-}
-
-/// Return an approximation of this SCEV expression's "base", or NULL for any
-/// constant. Returning the expression itself is conservative. Returning a
-/// deeper subexpression is more precise and valid as long as it isn't less
-/// complex than another subexpression. For expressions involving multiple
-/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
-/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
-/// IVInc==b-a.
-///
-/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
-/// SCEVUnknown, we simply return the rightmost SCEV operand.
-static const SCEV *getExprBase(const SCEV *S) {
- switch (S->getSCEVType()) {
- default: // including scUnknown.
- return S;
- case scConstant:
- return nullptr;
- case scTruncate:
- return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
- case scZeroExtend:
- return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
- case scSignExtend:
- return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
- case scAddExpr: {
- // Skip over scaled operands (scMulExpr) to follow add operands as long as
- // there's nothing more complex.
- // FIXME: not sure if we want to recognize negation.
- const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
- for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
- E(Add->op_begin()); I != E; ++I) {
- const SCEV *SubExpr = *I;
- if (SubExpr->getSCEVType() == scAddExpr)
- return getExprBase(SubExpr);
-
- if (SubExpr->getSCEVType() != scMulExpr)
- return SubExpr;
- }
- return S; // all operands are scaled, be conservative.
- }
- case scAddRecExpr:
- return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
- }
+ return getSetupCost(S->getOperand(), Depth - 1);
+ if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
+ return std::accumulate(S->op_begin(), S->op_end(), 0,
+ [&](unsigned i, const SCEV *Reg) {
+ return i + getSetupCost(Reg, Depth - 1);
+ });
+ if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
+ return getSetupCost(S->getLHS(), Depth - 1) +
+ getSetupCost(S->getRHS(), Depth - 1);
+ return 0;
+}
+
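For intuition, the recursion above can be read as a depth-limited walk over an expression tree that counts how many leaf values would need a setup instruction in the preheader. Below is a minimal standalone sketch of that shape; Expr, IsValue and setupCost are hypothetical names invented for illustration, not LSR's SCEV classes, and the real getSetupCost walks SCEV nodes and handles more cases than shown here:

  #include <numeric>
  #include <vector>

  struct Expr {
    bool IsValue = false;            // leaf that would need a setup instruction
    std::vector<const Expr *> Kids;  // operands of an interior node
  };

  // Count value leaves, giving up (cost 0) once the depth budget is exhausted.
  unsigned setupCost(const Expr *E, unsigned Depth) {
    if (E->IsValue)
      return 1;
    if (Depth == 0)
      return 0;
    return std::accumulate(E->Kids.begin(), E->Kids.end(), 0u,
                           [&](unsigned Acc, const Expr *K) {
                             return Acc + setupCost(K, Depth - 1);
                           });
  }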
+/// Tally up interesting quantities from the given register.
+void Cost::RateRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs) {
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+ // If this is an addrec for another loop, it should be an invariant
+ // with respect to L since L is the innermost loop (at least
+ // for now LSR only handles innermost loops).
+ if (AR->getLoop() != L) {
+ // If the AddRec exists, consider its register free and leave it alone.
+ if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
+ return;
+
+ // It is bad to allow LSR for current loop to add induction variables
+ // for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
+ ++C.NumRegs;
+ return;
+ }
+
+ unsigned LoopCost = 1;
+ if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
+ TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
+
+ // If the step size matches the base offset, we could use pre-indexed
+ // addressing.
+ if (TTI->shouldFavorBackedgeIndex(L)) {
+ if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
+ if (Step->getAPInt() == F.BaseOffset)
+ LoopCost = 0;
+ }
+
+ if (TTI->shouldFavorPostInc()) {
+ const SCEV *LoopStep = AR->getStepRecurrence(*SE);
+ if (isa<SCEVConstant>(LoopStep)) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) &&
+ SE->isLoopInvariant(LoopStart, L))
+ LoopCost = 0;
+ }
+ }
+ }
+ C.AddRecCost += LoopCost;
+
+ // Add the step value register, if it needs one.
+ // TODO: The non-affine case isn't precisely modeled here.
+ if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+ if (!Regs.count(AR->getOperand(1))) {
+ RateRegister(F, AR->getOperand(1), Regs);
+ if (isLoser())
+ return;
+ }
+ }
+ }
+ ++C.NumRegs;
+
+ // Rough heuristic; favor registers which don't require extra setup
+ // instructions in the preheader.
+ C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
+ // Ensure we don't, even with the recursion limit, produce invalid costs.
+ C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
+
+ C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+ SE->hasComputableLoopEvolution(Reg, L);
+}
+
+/// Record this register in the set. If we haven't seen it before, rate
+/// it. Optional LoserRegs provides a way to declare any formula that refers to
+/// one of those regs an instant loser.
+void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ if (LoserRegs && LoserRegs->count(Reg)) {
+ Lose();
+ return;
+ }
+ if (Regs.insert(Reg).second) {
+ RateRegister(F, Reg, Regs);
+ if (LoserRegs && isLoser())
+ LoserRegs->insert(Reg);
+ }
+}
+
+void Cost::RateFormula(const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
+ // Tally up the registers.
+ unsigned PrevAddRecCost = C.AddRecCost;
+ unsigned PrevNumRegs = C.NumRegs;
+ unsigned PrevNumBaseAdds = C.NumBaseAdds;
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (VisitedRegs.count(ScaledReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
+ if (isLoser())
+ return;
+ }
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (VisitedRegs.count(BaseReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
+ if (isLoser())
+ return;
+ }
+
+ // Determine how many (unfolded) adds we'll need inside the loop.
+ size_t NumBaseParts = F.getNumRegs();
+ if (NumBaseParts > 1)
+ // Do not count the base and a possible second register if the target
+ // allows folding 2 registers.
+ C.NumBaseAdds +=
+ NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
+ C.NumBaseAdds += (F.UnfoldedOffset != 0);
+
+ // Accumulate non-free scaling amounts.
+ C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
+
+ // Tally up the non-zero immediates.
+ for (const LSRFixup &Fixup : LU.Fixups) {
+ int64_t O = Fixup.Offset;
+ int64_t Offset = (uint64_t)O + F.BaseOffset;
+ if (F.BaseGV)
+ C.ImmCost += 64; // Handle symbolic values conservatively.
+ // TODO: This should probably be the pointer size.
+ else if (Offset != 0)
+ C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
+
+ // Check with target if this offset with this instruction is
+ // specifically not supported.
+ if (LU.Kind == LSRUse::Address && Offset != 0 &&
+ !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
+ C.NumBaseAdds++;
+ }
+
+ // If we don't count instruction cost, exit here.
+ if (!InsnsCost) {
+ assert(isValid() && "invalid cost");
+ return;
+ }
+
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+ // an additional instruction (at least a fill).
+ // TODO: Need to distinguish register class?
+ unsigned TTIRegNum = TTI->getNumberOfRegisters(
+ TTI->getRegisterClassForType(false, F.getType())) - 1;
+ if (C.NumRegs > TTIRegNum) {
+ // If the cost already exceeded TTIRegNum, only newly added registers can add
+ // new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ C.Insns += (C.NumRegs - PrevNumRegs);
+ else
+ C.Insns += (C.NumRegs - TTIRegNum);
+ }
+
+ // If an ICmpZero formula does not end in 0, it cannot be replaced by
+ // just an add or sub. We'll need to compare the final result of the AddRec.
+ // That means we'll need an additional instruction. But if the target can
+ // macro-fuse a compare with a branch, don't count this extra instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
+ !TTI->canMacroFuseCmp())
+ C.Insns++;
+ // Each new AddRec adds 1 instruction to the calculation.
+ C.Insns += (C.AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
+ assert(isValid() && "invalid cost");
+}
+
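To make the register-pressure term in RateFormula above concrete: assuming a register class with 8 registers, TTIRegNum is 7. A formula that raises NumRegs from 6 to 9 charges 9 - 7 = 2 extra Insns, because only the registers beyond the budget count, while a formula that raises NumRegs from 8 to 9 charges just 9 - 8 = 1, since the budget was already exceeded before this formula was rated.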
+/// Set this cost to a losing value.
+void Cost::Lose() {
+ C.Insns = std::numeric_limits<unsigned>::max();
+ C.NumRegs = std::numeric_limits<unsigned>::max();
+ C.AddRecCost = std::numeric_limits<unsigned>::max();
+ C.NumIVMuls = std::numeric_limits<unsigned>::max();
+ C.NumBaseAdds = std::numeric_limits<unsigned>::max();
+ C.ImmCost = std::numeric_limits<unsigned>::max();
+ C.SetupCost = std::numeric_limits<unsigned>::max();
+ C.ScaleCost = std::numeric_limits<unsigned>::max();
+}
+
+/// Choose the lower cost.
+bool Cost::isLess(Cost &Other) {
+ if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
+ C.Insns != Other.C.Insns)
+ return C.Insns < Other.C.Insns;
+ return TTI->isLSRCostLess(C, Other.C);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Cost::print(raw_ostream &OS) const {
+ if (InsnsCost)
+ OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+ OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+ if (C.AddRecCost != 0)
+ OS << ", with addrec cost " << C.AddRecCost;
+ if (C.NumIVMuls != 0)
+ OS << ", plus " << C.NumIVMuls << " IV mul"
+ << (C.NumIVMuls == 1 ? "" : "s");
+ if (C.NumBaseAdds != 0)
+ OS << ", plus " << C.NumBaseAdds << " base add"
+ << (C.NumBaseAdds == 1 ? "" : "s");
+ if (C.ScaleCost != 0)
+ OS << ", plus " << C.ScaleCost << " scale cost";
+ if (C.ImmCost != 0)
+ OS << ", plus " << C.ImmCost << " imm cost";
+ if (C.SetupCost != 0)
+ OS << ", plus " << C.SetupCost << " setup cost";
+}
+
+LLVM_DUMP_METHOD void Cost::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Test whether this fixup always uses its value outside of the given loop.
+bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
+ // PHI nodes use their value in their incoming blocks.
+ if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == OperandValToReplace &&
+ L->contains(PN->getIncomingBlock(i)))
+ return false;
+ return true;
+ }
+
+ return !L->contains(UserInst);
+}
+
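The PHI special case above reflects where a PHI operand is actually consumed: the value is live out of the corresponding incoming block rather than at the PHI itself. So if any incoming block that supplies OperandValToReplace lies inside L, the value is still needed inside the loop even when the PHI sits in an exit block, and the fixup is not fully outside the loop.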
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRFixup::print(raw_ostream &OS) const {
+ OS << "UserInst=";
+ // Store is common and interesting enough to be worth special-casing.
+ if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+ OS << "store ";
+ Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+ } else if (UserInst->getType()->isVoidTy())
+ OS << UserInst->getOpcodeName();
+ else
+ UserInst->printAsOperand(OS, /*PrintType=*/false);
+
+ OS << ", OperandValToReplace=";
+ OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
+
+ for (const Loop *PIL : PostIncLoops) {
+ OS << ", PostIncLoop=";
+ PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ }
+
+ if (Offset != 0)
+ OS << ", Offset=" << Offset;
+}
+
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Test whether this use has a formula with the same registers as the given
+/// formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ llvm::sort(Key);
+ return Uniquifier.count(Key);
+}
+
+/// Returns the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
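For example, if 2 of a use's 5 formulae reference Reg, getNotSelectedProbability returns (5 - 2) / 5 = 0.6.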
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise. The formula must be in canonical form.
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
+
+ if (!Formulae.empty() && RigidFormula)
+ return false;
+
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ llvm::sort(Key);
+
+ if (!Uniquifier.insert(Key).second)
+ return false;
+
+ // Using a register to hold the value of 0 is not profitable.
+ assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
+ "Zero allocated in a scaled register!");
+#ifndef NDEBUG
+ for (const SCEV *BaseReg : F.BaseRegs)
+ assert(!BaseReg->isZero() && "Zero allocated in a base register!");
+#endif
+
+ // Add the formula to the list.
+ Formulae.push_back(F);
+
+ // Record registers now being used by this use.
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ Regs.insert(F.ScaledReg);
+
+ return true;
+}
+
+/// Remove the given formula from this use's list.
+void LSRUse::DeleteFormula(Formula &F) {
+ if (&F != &Formulae.back())
+ std::swap(F, Formulae.back());
+ Formulae.pop_back();
+}
+
+/// Recompute the Regs field, and update RegUses.
+void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
+ Regs.clear();
+ for (const Formula &F : Formulae) {
+ if (F.ScaledReg) Regs.insert(F.ScaledReg);
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ }
+
+ // Update the RegTracker.
+ for (const SCEV *S : OldRegs)
+ if (!Regs.count(S))
+ RegUses.dropRegister(S, LUIdx);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRUse::print(raw_ostream &OS) const {
+ OS << "LSR Use: Kind=";
+ switch (Kind) {
+ case Basic: OS << "Basic"; break;
+ case Special: OS << "Special"; break;
+ case ICmpZero: OS << "ICmpZero"; break;
+ case Address:
+ OS << "Address of ";
+ if (AccessTy.MemTy->isPointerTy())
+ OS << "pointer"; // the full pointer type could be really verbose
+ else {
+ OS << *AccessTy.MemTy;
+ }
+
+ OS << " in addrspace(" << AccessTy.AddrSpace << ')';
+ }
+
+ OS << ", Offsets={";
+ bool NeedComma = false;
+ for (const LSRFixup &Fixup : Fixups) {
+ if (NeedComma) OS << ',';
+ OS << Fixup.Offset;
+ NeedComma = true;
+ }
+ OS << '}';
+
+ if (AllFixupsOutsideLoop)
+ OS << ", all-fixups-outside-loop";
+
+ if (WidestFixupType)
+ OS << ", widest fixup type: " << *WidestFixupType;
+}
+
+LLVM_DUMP_METHOD void LSRUse::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup/*= nullptr*/) {
+ switch (Kind) {
+ case LSRUse::Address:
+ return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
+
+ case LSRUse::ICmpZero:
+ // There's not even a target hook for querying whether it would be legal to
+ // fold a GV into an ICmp.
+ if (BaseGV)
+ return false;
+
+ // ICmp only has two operands; don't allow more than two non-trivial parts.
+ if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+ return false;
+
+ // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
+ // putting the scaled register in the other operand of the icmp.
+ if (Scale != 0 && Scale != -1)
+ return false;
+
+ // If we have low-level target information, ask the target if it can fold an
+ // integer immediate on an icmp.
+ if (BaseOffset != 0) {
+ // We have one of:
+ // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+ // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
+ // Offs is the ICmp immediate.
+ if (Scale == 0)
+ // The cast does the right thing with
+ // std::numeric_limits<int64_t>::min().
+ BaseOffset = -(uint64_t)BaseOffset;
+ return TTI.isLegalICmpImmediate(BaseOffset);
+ }
+
+ // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
+ return true;
+
+ case LSRUse::Basic:
+ // Only handle single-register values.
+ return !BaseGV && Scale == 0 && BaseOffset == 0;
+
+ case LSRUse::Special:
+ // Special case Basic to handle -1 scales.
+ return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
+
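A concrete reading of the ICmpZero case above: a use modeled as ICmpZero with base register %iv and BaseOffset 7 (no scale) stands for comparing %iv + 7 against zero. It is treated as completely foldable exactly when the target accepts the negated immediate, i.e. when isLegalICmpImmediate(-7) holds, since the compare can then be emitted as icmp eq %iv, -7.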
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale) {
+ // Check for overflow.
+ if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
+ (MinOffset > 0))
+ return false;
+ MinOffset = (uint64_t)BaseOffset + MinOffset;
+ if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
+ (MaxOffset > 0))
+ return false;
+ MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
+ HasBaseReg, Scale) &&
+ isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
+ HasBaseReg, Scale);
+}
+
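The two early returns at the top of the overload above are signed-overflow guards on the widened offset range: with BaseOffset = INT64_MAX and MinOffset = 1, for example, the wrapped sum is INT64_MIN, which is not greater than BaseOffset even though MinOffset is positive, so the two sides of the comparison disagree and the range is rejected.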
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ const Formula &F, const Loop &L) {
+ // For the purpose of isAMCompletelyFolded, either having a canonical formula
+ // or a scale not equal to zero is correct.
+ // Problems may arise from non-canonical formulae having a scale == 0.
+ // Strictly speaking, it would be best to just rely on canonical formulae.
+ // However, when we generate the scaled formulae, we first check that the
+ // scaling factor is profitable before computing the actual ScaledReg, for
+ // compile time's sake.
+ assert((F.isCanonical(L) || F.Scale != 0));
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+/// Test whether we know how to expand the current formula.
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, GlobalValue *BaseGV,
+ int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+ // We know how to expand completely foldable formulae.
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale) ||
+ // Or formulae that use a base register produced by a sum of base
+ // registers.
+ (Scale == 1 &&
+ isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ BaseGV, BaseOffset, true, 0));
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const Formula &F) {
+ return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+ F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F) {
+ // Target may want to look at the user instructions.
+ if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
+ for (const LSRFixup &Fixup : LU.Fixups)
+ if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
+ F.Scale, Fixup.UserInst))
+ return false;
+ return true;
+ }
+
+ return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
+ F.Scale);
+}
+
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
+ if (!F.Scale)
+ return 0;
+
+ // If the use is not completely folded in that instruction, we will have to
+ // pay an extra cost only for scale != 1.
+ if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F, L))
+ return F.Scale != 1;
+
+ switch (LU.Kind) {
+ case LSRUse::Address: {
+ // Check the scaling factor cost with both the min and max offsets.
+ int ScaleCostMinOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+ int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+
+ assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+ "Legal addressing mode has an illegal cost!");
+ return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+ }
+ case LSRUse::ICmpZero:
+ case LSRUse::Basic:
+ case LSRUse::Special:
+ // The use is completely folded, i.e., everything is folded into the
+ // instruction.
+ return 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ // Canonicalize a scale of 1 to a base register if the formula doesn't
+ // already have a base register.
+ if (!HasBaseReg && Scale == 1) {
+ Scale = 0;
+ HasBaseReg = true;
+ }
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale);
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const SCEV *S,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (S->isZero()) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t BaseOffset = ExtractImmediate(S, SE);
+ GlobalValue *BaseGV = ExtractSymbol(S, SE);
+
+ // If there's anything else involved, it's not foldable.
+ if (!S->isZero()) return false;
+
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale);
+}
+
+namespace {
+
+/// An individual increment in a Chain of IV increments. Relate an IV user to
+/// an expression that computes the IV it uses from the IV used by the previous
+/// link in the Chain.
+///
+/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
+/// original IVOperand. The head of the chain's IVOperand is only valid during
+/// chain collection, before LSR replaces IV users. During chain generation,
+/// IncExpr can be used to find the new IVOperand that computes the same
+/// expression.
+struct IVInc {
+ Instruction *UserInst;
+ Value* IVOperand;
+ const SCEV *IncExpr;
+
+ IVInc(Instruction *U, Value *O, const SCEV *E)
+ : UserInst(U), IVOperand(O), IncExpr(E) {}
+};
+
+// The list of IV increments in program order. We typically add the head of a
+// chain without finding subsequent links.
+struct IVChain {
+ SmallVector<IVInc, 1> Incs;
+ const SCEV *ExprBase = nullptr;
+
+ IVChain() = default;
+ IVChain(const IVInc &Head, const SCEV *Base)
+ : Incs(1, Head), ExprBase(Base) {}
+
+ using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
+
+ // Return the first increment in the chain.
+ const_iterator begin() const {
+ assert(!Incs.empty());
+ return std::next(Incs.begin());
+ }
+ const_iterator end() const {
+ return Incs.end();
+ }
+
+  // Returns true if this chain contains any increments beyond the head.
+ bool hasIncs() const { return Incs.size() >= 2; }
+
+ // Add an IVInc to the end of this chain.
+ void add(const IVInc &X) { Incs.push_back(X); }
+
+ // Returns the last UserInst in the chain.
+ Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+ // Returns true if IncExpr can be profitably added to this chain.
+ bool isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution&);
+};
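+
+// Editor's note: the following is an illustrative sketch added for this
+// document, not part of LSR. It shows, at the source level, what an IV chain
+// captures: several users whose addresses differ by a loop-invariant offset
+// can each be computed from the previous one instead of from the primary IV.
+// The sketch assumes 'n' and 'stride' keep every access in bounds.
+#if 0
+static long sumUnchained(const long *p, long n, long stride) {
+  long s = 0;
+  for (long i = 0; i < n; ++i) {
+    s += p[i];              // IV user 1: address = p + i
+    s += p[i + stride];     // IV user 2: address = p + i + stride
+    s += p[i + 2 * stride]; // IV user 3: address = p + i + 2*stride
+  }
+  return s;
+}
+
+static long sumChained(const long *p, long n, long stride) {
+  long s = 0;
+  for (long i = 0; i < n; ++i) {
+    const long *q0 = p + i;       // head of the chain
+    const long *q1 = q0 + stride; // IVInc: q1 computed from q0
+    const long *q2 = q1 + stride; // IVInc: q2 computed from q1
+    s += *q0 + *q1 + *q2;
+  }
+  return s;
+}
+#endif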
+
+/// Helper for CollectChains to track multiple IV increment uses. Distinguish
+/// between FarUsers that definitely cross IV increments and NearUsers that may
+/// be used between IV increments.
+struct ChainUsers {
+ SmallPtrSet<Instruction*, 4> FarUsers;
+ SmallPtrSet<Instruction*, 4> NearUsers;
+};
+
+/// This class holds state for the main loop strength reduction logic.
+class LSRInstance {
+ IVUsers &IU;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ AssumptionCache &AC;
+ TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ Loop *const L;
+ MemorySSAUpdater *MSSAU;
+ bool FavorBackedgeIndex = false;
+ bool Changed = false;
+
+  /// This is the insert position at which the current loop's induction
+  /// variable increment should be placed. In simple loops, this is the latch
+  /// block's terminator. But in more complicated cases, this is a position
+  /// which will dominate all the in-loop post-increment users.
+ Instruction *IVIncInsertPos = nullptr;
+
+ /// Interesting factors between use strides.
+ ///
+ /// We explicitly use a SetVector which contains a SmallSet, instead of the
+ /// default, a SmallDenseSet, because we need to use the full range of
+ /// int64_ts, and there's currently no good way of doing that with
+ /// SmallDenseSet.
+ SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
+
+ /// Interesting use types, to facilitate truncation reuse.
+ SmallSetVector<Type *, 4> Types;
+
+ /// The list of interesting uses.
+ mutable SmallVector<LSRUse, 16> Uses;
+
+ /// Track which uses use which register candidates.
+ RegUseTracker RegUses;
+
+ // Limit the number of chains to avoid quadratic behavior. We don't expect to
+ // have more than a few IV increment chains in a loop. Missing a Chain falls
+ // back to normal LSR behavior for those uses.
+ static const unsigned MaxChains = 8;
+
+ /// IV users can form a chain of IV increments.
+ SmallVector<IVChain, MaxChains> IVChainVec;
+
+ /// IV users that belong to profitable IVChains.
+ SmallPtrSet<Use*, MaxChains> IVIncSet;
+
+ void OptimizeShadowIV();
+ bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+ ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+ void OptimizeLoopTermCond();
+
+ void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec);
+ void FinalizeChain(IVChain &Chain);
+ void CollectChains();
+ void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts);
+
+ void CollectInterestingTypesAndFactors();
+ void CollectFixupsAndInitialFormulae();
+
+ // Support for sharing of LSRUses between LSRFixups.
+ using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
+ UseMapTy UseMap;
+
+ bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+ LSRUse::KindType Kind, MemAccessTy AccessTy);
+
+ std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+ MemAccessTy AccessTy);
+
+ void DeleteUse(LSRUse &LU, size_t LUIdx);
+
+ LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
+ void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void CountRegisters(const Formula &F, size_t LUIdx);
+ bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+
+ void CollectLoopInvariantFixupsAndFormulae();
+
+ void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
+ unsigned Depth = 0);
+
+ void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, unsigned Depth,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg = false);
+ void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateCrossUseConstantOffsets();
+ void GenerateAllReuseFormulae();
+
+ void FilterOutUndesirableDedicatedRegisters();
+
+ size_t EstimateSearchSpaceComplexity() const;
+ void NarrowSearchSpaceByDetectingSupersets();
+ void NarrowSearchSpaceByCollapsingUnrolledCode();
+ void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByFilterPostInc();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
+ void NarrowSearchSpaceByPickingWinnerRegs();
+ void NarrowSearchSpaceUsingHeuristics();
+
+ void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const;
+ void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+ BasicBlock::iterator
+ HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs) const;
+ BasicBlock::iterator
+ AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const;
+
+ Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ BasicBlock::iterator IP, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
+
+public:
+ LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
+ LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
+ TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
+
+ bool getChanged() const { return Changed; }
+
+ void print_factors_and_types(raw_ostream &OS) const;
+ void print_fixups(raw_ostream &OS) const;
+ void print_uses(raw_ostream &OS) const;
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// If IV is used in an int-to-float cast inside the loop then try to eliminate
+/// the cast operation.
+void LSRInstance::OptimizeShadowIV() {
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return;
+
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
+ UI != E; /* empty */) {
+ IVUsers::const_iterator CandidateUI = UI;
+ ++UI;
+ Instruction *ShadowUse = CandidateUI->getUser();
+ Type *DestTy = nullptr;
+ bool IsSigned = false;
+
+    /* If shadow use is an int->float cast then insert a second IV
+ to eliminate this cast.
+
+ for (unsigned i = 0; i < n; ++i)
+ foo((double)i);
+
+ is transformed into
+
+ double d = 0.0;
+ for (unsigned i = 0; i < n; ++i, ++d)
+ foo(d);
+ */
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = false;
+ DestTy = UCast->getDestTy();
+ }
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = true;
+ DestTy = SCast->getDestTy();
+ }
+ if (!DestTy) continue;
+
+    // If the target does not support DestTy natively then do not apply
+    // this transformation.
+ if (!TTI.isTypeLegal(DestTy)) continue;
+
+ PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+ if (!PH) continue;
+ if (PH->getNumIncomingValues() != 2) continue;
+
+    // If the calculation in integers overflows, the result in FP type will
+    // differ. So we can only do this transformation if we are guaranteed not
+    // to deal with overflowing values.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
+ if (!AR) continue;
+ if (IsSigned && !AR->hasNoSignedWrap()) continue;
+ if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
+
+ Type *SrcTy = PH->getType();
+ int Mantissa = DestTy->getFPMantissaWidth();
+ if (Mantissa == -1) continue;
+ if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
+ continue;
+
+ unsigned Entry, Latch;
+ if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+ Entry = 0;
+ Latch = 1;
+ } else {
+ Entry = 1;
+ Latch = 0;
+ }
+
+ ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+ if (!Init) continue;
+ Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+ (double)Init->getSExtValue() :
+ (double)Init->getZExtValue());
+
+ BinaryOperator *Incr =
+ dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+ if (!Incr) continue;
+ if (Incr->getOpcode() != Instruction::Add
+ && Incr->getOpcode() != Instruction::Sub)
+ continue;
+
+ /* Initialize new IV, double d = 0.0 in above example. */
+ ConstantInt *C = nullptr;
+ if (Incr->getOperand(0) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+ else if (Incr->getOperand(1) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+ else
+ continue;
+
+ if (!C) continue;
+
+ // Ignore negative constants, as the code below doesn't handle them
+ // correctly. TODO: Remove this restriction.
+ if (!C->getValue().isStrictlyPositive()) continue;
+
+ /* Add new PHINode. */
+ PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
+
+ /* create new increment. '++d' in above example. */
+ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+ BinaryOperator *NewIncr =
+ BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
+ Instruction::FAdd : Instruction::FSub,
+ NewPH, CFP, "IV.S.next.", Incr);
+
+ NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+ NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+ /* Remove cast operation */
+ ShadowUse->replaceAllUsesWith(NewPH);
+ ShadowUse->eraseFromParent();
+ Changed = true;
+ break;
+ }
+}
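+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It renders the shadow-IV transformation described above as compilable
+// source. The rewrite is only valid because a 32-bit 'i' fits in double's
+// 53-bit mantissa and the increment cannot wrap before the exit test.
+#if 0
+static void shadowIVBefore(double *out, unsigned n) {
+  for (unsigned i = 0; i < n; ++i)
+    out[i] = (double)i; // int-to-float cast of the IV on every iteration
+}
+
+static void shadowIVAfter(double *out, unsigned n) {
+  double d = 0.0; // shadow IV, initialized from the integer IV's start value
+  for (unsigned i = 0; i < n; ++i, d += 1.0)
+    out[i] = d; // the cast is gone; 'd' is incremented in lock-step with 'i'
+}
+#endif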
+
+/// If Cond has an operand that is an expression of an IV, set the IV user and
+/// stride information and return true, otherwise return false.
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+ for (IVStrideUse &U : IU)
+ if (U.getUser() == Cond) {
+ // NOTE: we could handle setcc instructions with multiple uses here, but
+ // InstCombine does it as well for simple uses, it's not clear that it
+ // occurs enough in real life to handle.
+ CondUse = &U;
+ return true;
+ }
+ return false;
+}
+
+/// Rewrite the loop's terminating condition if it uses a max computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+///
+/// the trip count isn't just 'n', because 'n' might not be positive. And
+/// unfortunately this can come up even for loops where the user didn't use
+/// a C do-while loop. For example, seemingly well-behaved top-test loops
+/// will commonly be lowered like this:
+///
+/// if (n > 0) {
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+/// }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+/// i = 0;
+/// max = n < 1 ? 1 : n;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i != max);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting loops of this type and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and by deleting
+/// the instructions for the maximum computation.
+ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+ // Check that the loop matches the pattern we're looking for.
+ if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+ Cond->getPredicate() != CmpInst::ICMP_NE)
+ return Cond;
+
+ SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+ if (!Sel || !Sel->hasOneUse()) return Cond;
+
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return Cond;
+ const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
+
+ // Add one to the backedge-taken count to get the trip count.
+ const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
+ if (IterationCount != SE.getSCEV(Sel)) return Cond;
+
+ // Check for a max calculation that matches the pattern. There's no check
+ // for ICMP_ULE here because the comparison would be with zero, which
+ // isn't interesting.
+ CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ const SCEVNAryExpr *Max = nullptr;
+ if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
+ Pred = ICmpInst::ICMP_SLE;
+ Max = S;
+ } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_SLT;
+ Max = S;
+ } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_ULT;
+ Max = U;
+ } else {
+ // No match; bail.
+ return Cond;
+ }
+
+ // To handle a max with more than two operands, this optimization would
+ // require additional checking and setup.
+ if (Max->getNumOperands() != 2)
+ return Cond;
+
+ const SCEV *MaxLHS = Max->getOperand(0);
+ const SCEV *MaxRHS = Max->getOperand(1);
+
+ // ScalarEvolution canonicalizes constants to the left. For < and >, look
+ // for a comparison with 1. For <= and >=, a comparison with zero.
+ if (!MaxLHS ||
+ (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
+ return Cond;
+
+ // Check the relevant induction variable for conformance to
+ // the pattern.
+ const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine() ||
+ AR->getStart() != One ||
+ AR->getStepRecurrence(SE) != One)
+ return Cond;
+
+ assert(AR->getLoop() == L &&
+ "Loop condition operand is an addrec in a different loop!");
+
+ // Check the right operand of the select, and remember it, as it will
+ // be used in the new comparison instruction.
+ Value *NewRHS = nullptr;
+ if (ICmpInst::isTrueWhenEqual(Pred)) {
+ // Look for n+1, and grab n.
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (!NewRHS)
+ return Cond;
+ } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
+ NewRHS = Sel->getOperand(1);
+ else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
+ NewRHS = Sel->getOperand(2);
+ else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+ NewRHS = SU->getValue();
+ else
+ // Max doesn't match expected pattern.
+ return Cond;
+
+ // Determine the new comparison opcode. It may be signed or unsigned,
+ // and the original comparison may be either equality or inequality.
+ if (Cond->getPredicate() == CmpInst::ICMP_EQ)
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ // Ok, everything looks ok to change the condition into an SLT or SGE and
+ // delete the max calculation.
+ ICmpInst *NewCond =
+ new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
+
+ // Delete the max calculation instructions.
+ Cond->replaceAllUsesWith(NewCond);
+ CondUse->setUser(NewCond);
+ Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+ Cond->eraseFromParent();
+ Sel->eraseFromParent();
+ if (Cmp->use_empty())
+ Cmp->eraseFromParent();
+ return NewCond;
+}
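+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It shows the rewrite performed above at the source level: 'max' was
+// introduced to give the loop a canonical IV, and OptimizeMax turns the !=
+// test back into a signed comparison so the max computation can be deleted.
+// Both versions assume 'p' has at least one element.
+#if 0
+static void withMax(double *p, int n) {
+  int i = 0;
+  int max = n < 1 ? 1 : n; // smax(n, 1), materialized before the loop
+  do {
+    p[i] = 0.0;
+  } while (++i != max);    // ICMP_NE against the max
+}
+
+static void withoutMax(double *p, int n) {
+  int i = 0;
+  do {
+    p[i] = 0.0;
+  } while (++i < n);       // ICMP_SLT against the original bound; no max
+}
+#endif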
+
+/// Change loop terminating condition to use the postinc iv when possible.
+void
+LSRInstance::OptimizeLoopTermCond() {
+ SmallPtrSet<Instruction *, 4> PostIncs;
+
+ // We need a different set of heuristics for rotated and non-rotated loops.
+ // If a loop is rotated then the latch is also the backedge, so inserting
+ // post-inc expressions just before the latch is ideal. To reduce live ranges
+ // it also makes sense to rewrite terminating conditions to use post-inc
+ // expressions.
+ //
+ // If the loop is not rotated then the latch is not a backedge; the latch
+ // check is done in the loop head. Adding post-inc expressions before the
+ // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
+ // in the loop body. In this case we do *not* want to use post-inc expressions
+ // in the latch check, and we want to insert post-inc expressions before
+ // the backedge.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
+ return LatchBlock != BB;
+ })) {
+ // The backedge doesn't exit the loop; treat this as a head-tested loop.
+ IVIncInsertPos = LatchBlock->getTerminator();
+ return;
+ }
+
+ // Otherwise treat this as a rotated loop.
+ for (BasicBlock *ExitingBlock : ExitingBlocks) {
+ // Get the terminating condition for the loop if possible. If we
+ // can, we want to change it to use a post-incremented version of its
+ // induction variable, to allow coalescing the live ranges for the IV into
+ // one register value.
+
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!TermBr)
+ continue;
+ // FIXME: Overly conservative, termination condition could be an 'or' etc..
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+ continue;
+
+ // Search IVUsesByStride to find Cond's IVUse if there is one.
+ IVStrideUse *CondUse = nullptr;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+ if (!FindIVUserForCond(Cond, CondUse))
+ continue;
+
+ // If the trip count is computed in terms of a max (due to ScalarEvolution
+ // being unable to find a sufficient guard, for example), change the loop
+ // comparison to use SLT or ULT instead of NE.
+ // One consequence of doing this now is that it disrupts the count-down
+ // optimization. That's not always a bad thing though, because in such
+ // cases it may still be worthwhile to avoid a max.
+ Cond = OptimizeMax(Cond, CondUse);
+
+ // If this exiting block dominates the latch block, it may also use
+ // the post-inc value if it won't be shared with other uses.
+ // Check for dominance.
+ if (!DT.dominates(ExitingBlock, LatchBlock))
+ continue;
+
+ // Conservatively avoid trying to use the post-inc value in non-latch
+ // exits if there may be pre-inc users in intervening blocks.
+ if (LatchBlock != ExitingBlock)
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+ // Test if the use is reachable from the exiting block. This dominator
+ // query is a conservative approximation of reachability.
+ if (&*UI != CondUse &&
+ !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+ // Conservatively assume there may be reuse if the quotient of their
+ // strides could be a legal scale.
+ const SCEV *A = IU.getStride(*CondUse, L);
+ const SCEV *B = IU.getStride(*UI, L);
+ if (!A || !B) continue;
+ if (SE.getTypeSizeInBits(A->getType()) !=
+ SE.getTypeSizeInBits(B->getType())) {
+ if (SE.getTypeSizeInBits(A->getType()) >
+ SE.getTypeSizeInBits(B->getType()))
+ B = SE.getSignExtendExpr(B, A->getType());
+ else
+ A = SE.getSignExtendExpr(A, B->getType());
+ }
+ if (const SCEVConstant *D =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+ const ConstantInt *C = D->getValue();
+ // Stride of one or negative one can have reuse with non-addresses.
+ if (C->isOne() || C->isMinusOne())
+ goto decline_post_inc;
+ // Avoid weird situations.
+ if (C->getValue().getMinSignedBits() >= 64 ||
+ C->getValue().isMinSignedValue())
+ goto decline_post_inc;
+ // Check for possible scaled-address reuse.
+ if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
+ MemAccessTy AccessTy = getAccessType(
+ TTI, UI->getUser(), UI->getOperandValToReplace());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
+
+ // It's possible for the setcc instruction to be anywhere in the loop, and
+ // possible for it to have multiple users. If it is not immediately before
+ // the exiting block branch, move it.
+ if (&*++BasicBlock::iterator(Cond) != TermBr) {
+ if (Cond->hasOneUse()) {
+ Cond->moveBefore(TermBr);
+ } else {
+        // Clone the terminating condition and insert it at the loop end.
+ ICmpInst *OldCond = Cond;
+ Cond = cast<ICmpInst>(Cond->clone());
+ Cond->setName(L->getHeader()->getName() + ".termcond");
+ ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
+
+ // Clone the IVUse, as the old use still exists!
+ CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+ TermBr->replaceUsesOfWith(OldCond, Cond);
+ }
+ }
+
+ // If we get to here, we know that we can transform the setcc instruction to
+ // use the post-incremented version of the IV, allowing us to coalesce the
+ // live ranges for the IV correctly.
+ CondUse->transformToPostInc(L);
+ Changed = true;
+
+ PostIncs.insert(Cond);
+ decline_post_inc:;
+ }
+
+ // Determine an insertion point for the loop induction variable increment. It
+ // must dominate all the post-inc comparisons we just set up, and it must
+ // dominate the loop latch edge.
+ IVIncInsertPos = L->getLoopLatch()->getTerminator();
+ for (Instruction *Inst : PostIncs) {
+ BasicBlock *BB =
+ DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
+ Inst->getParent());
+ if (BB == Inst->getParent())
+ IVIncInsertPos = Inst;
+ else if (BB != IVIncInsertPos->getParent())
+ IVIncInsertPos = BB->getTerminator();
+ }
+}
+
+/// Determine if the given use can accommodate a fixup at the given offset and
+/// other details. If so, update the use and return true.
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+ bool HasBaseReg, LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ int64_t NewMinOffset = LU.MinOffset;
+ int64_t NewMaxOffset = LU.MaxOffset;
+ MemAccessTy NewAccessTy = AccessTy;
+
+ // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
+ // something conservative, however this can pessimize in the case that one of
+ // the uses will have all its uses outside the loop, for example.
+ if (LU.Kind != Kind)
+ return false;
+
+ // Check for a mismatched access type, and fall back conservatively as needed.
+ // TODO: Be less conservative when the type is similar and can use the same
+ // addressing modes.
+ if (Kind == LSRUse::Address) {
+ if (AccessTy.MemTy != LU.AccessTy.MemTy) {
+ NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
+ AccessTy.AddrSpace);
+ }
+ }
+
+ // Conservatively assume HasBaseReg is true for now.
+ if (NewOffset < LU.MinOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ LU.MaxOffset - NewOffset, HasBaseReg))
+ return false;
+ NewMinOffset = NewOffset;
+ } else if (NewOffset > LU.MaxOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ NewOffset - LU.MinOffset, HasBaseReg))
+ return false;
+ NewMaxOffset = NewOffset;
+ }
+
+ // Update the use.
+ LU.MinOffset = NewMinOffset;
+ LU.MaxOffset = NewMaxOffset;
+ LU.AccessTy = NewAccessTy;
+ return true;
+}
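+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It reduces the offset reconciliation above to its arithmetic;
+// 'spanIsFoldable' is a hypothetical stand-in for the target query made
+// through isAlwaysFoldable(), not an LLVM API.
+#if 0
+struct OffsetRange { long Min, Max; };
+
+static bool widenIfFoldable(OffsetRange &R, long NewOffset,
+                            bool (*spanIsFoldable)(long Span)) {
+  if (NewOffset < R.Min) {
+    // Every fixup must fit one addressing mode, so the span from the new
+    // minimum to the existing maximum has to be foldable.
+    if (!spanIsFoldable(R.Max - NewOffset))
+      return false;
+    R.Min = NewOffset;
+  } else if (NewOffset > R.Max) {
+    if (!spanIsFoldable(NewOffset - R.Min))
+      return false;
+    R.Max = NewOffset;
+  }
+  return true; // NewOffset now lies within [Min, Max].
+}
+#endif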
+
+/// Return an LSRUse index and an offset value for a fixup which needs the given
+/// expression, with the given kind and optional access type. Either reuse an
+/// existing use or create a new one, as needed.
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+ LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ const SCEV *Copy = Expr;
+ int64_t Offset = ExtractImmediate(Expr, SE);
+
+ // Basic uses can't accept any offset, for example.
+ if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
+ Offset, /*HasBaseReg=*/ true)) {
+ Expr = Copy;
+ Offset = 0;
+ }
+
+ std::pair<UseMapTy::iterator, bool> P =
+ UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+ if (!P.second) {
+ // A use already existed with this base.
+ size_t LUIdx = P.first->second;
+ LSRUse &LU = Uses[LUIdx];
+ if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
+ // Reuse this use.
+ return std::make_pair(LUIdx, Offset);
+ }
+
+ // Create a new use.
+ size_t LUIdx = Uses.size();
+ P.first->second = LUIdx;
+ Uses.push_back(LSRUse(Kind, AccessTy));
+ LSRUse &LU = Uses[LUIdx];
+
+ LU.MinOffset = Offset;
+ LU.MaxOffset = Offset;
+ return std::make_pair(LUIdx, Offset);
+}
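+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It shows the use-sharing scheme above with the constant offset peeled
+// off first, so expressions differing only by a foldable constant map to one
+// key and share a use. 'UseKey' and 'UseInfo' are hypothetical stand-ins; the
+// sketch omits the reconcileNewOffset() check and the fallback of creating a
+// fresh use when reconciliation fails. Assumes <map>, <vector>, and <utility>
+// are available.
+#if 0
+using UseKey = std::pair<const void *, int>; // (stripped expr, kind)
+struct UseInfo { long MinOffset, MaxOffset; };
+
+static std::pair<size_t, long>
+getOrCreateUse(std::map<UseKey, size_t> &UseMap, std::vector<UseInfo> &Uses,
+               UseKey K, long Offset) {
+  auto Ins = UseMap.insert({K, Uses.size()});
+  if (!Ins.second)
+    return {Ins.first->second, Offset}; // share the existing use
+  Uses.push_back({Offset, Offset});     // new use; its offset range starts as
+  return {Uses.size() - 1, Offset};     // the single point [Offset, Offset]
+}
+#endif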
+
+/// Delete the given use from the Uses list.
+void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
+ if (&LU != &Uses.back())
+ std::swap(LU, Uses.back());
+ Uses.pop_back();
+
+ // Update RegUses.
+ RegUses.swapAndDropUse(LUIdx, Uses.size());
+}
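+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It shows the swap-and-pop idiom used above: because the order of Uses
+// does not matter, removal is O(1). Assumes <vector> and <utility> are
+// available.
+#if 0
+static void swapAndPop(std::vector<int> &V, size_t Idx) {
+  if (Idx + 1 != V.size())
+    std::swap(V[Idx], V.back()); // the old last element now lives at Idx...
+  V.pop_back();
+  // ...so any saved index pointing at it must be remapped, which is what
+  // RegUses.swapAndDropUse() does for the register-use bookkeeping.
+}
+#endif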
+
+/// Look for a use distinct from OrigLU which has a formula with the same
+/// registers as the given formula.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+ const LSRUse &OrigLU) {
+ // Search all uses for the formula. This could be more clever.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // Check whether this use is close enough to OrigLU, to see whether it's
+ // worthwhile looking through its formulae.
+ // Ignore ICmpZero uses because they may contain formulae generated by
+ // GenerateICmpZeroScales, in which case adding fixup offsets may
+ // be invalid.
+ if (&LU != &OrigLU &&
+ LU.Kind != LSRUse::ICmpZero &&
+ LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+ LU.WidestFixupType == OrigLU.WidestFixupType &&
+ LU.HasFormulaWithSameRegs(OrigF)) {
+ // Scan through this use's formulae.
+ for (const Formula &F : LU.Formulae) {
+ // Check to see if this formula has the same registers and symbols
+ // as OrigF.
+ if (F.BaseRegs == OrigF.BaseRegs &&
+ F.ScaledReg == OrigF.ScaledReg &&
+ F.BaseGV == OrigF.BaseGV &&
+ F.Scale == OrigF.Scale &&
+ F.UnfoldedOffset == OrigF.UnfoldedOffset) {
+ if (F.BaseOffset == 0)
+ return &LU;
+ // This is the formula where all the registers and symbols matched;
+ // there aren't going to be any others. Since we declined it, we
+ // can skip the rest of the formulae and proceed to the next LSRUse.
+ break;
+ }
+ }
+ }
+ }
+
+ // Nothing looked good.
+ return nullptr;
+}
+
+void LSRInstance::CollectInterestingTypesAndFactors() {
+ SmallSetVector<const SCEV *, 4> Strides;
+
+ // Collect interesting types and strides.
+ SmallVector<const SCEV *, 4> Worklist;
+ for (const IVStrideUse &U : IU) {
+ const SCEV *Expr = IU.getExpr(U);
+
+ // Collect interesting types.
+ Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
+
+ // Add strides for mentioned loops.
+ Worklist.push_back(Expr);
+ do {
+ const SCEV *S = Worklist.pop_back_val();
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (AR->getLoop() == L)
+ Strides.insert(AR->getStepRecurrence(SE));
+ Worklist.push_back(AR->getStart());
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ Worklist.append(Add->op_begin(), Add->op_end());
+ }
+ } while (!Worklist.empty());
+ }
+
+ // Compute interesting factors from the set of interesting strides.
+ for (SmallSetVector<const SCEV *, 4>::const_iterator
+ I = Strides.begin(), E = Strides.end(); I != E; ++I)
+ for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
+ std::next(I); NewStrideIter != E; ++NewStrideIter) {
+ const SCEV *OldStride = *I;
+ const SCEV *NewStride = *NewStrideIter;
+
+ if (SE.getTypeSizeInBits(OldStride->getType()) !=
+ SE.getTypeSizeInBits(NewStride->getType())) {
+ if (SE.getTypeSizeInBits(OldStride->getType()) >
+ SE.getTypeSizeInBits(NewStride->getType()))
+ NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
+ else
+ OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
+ }
+ if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ } else if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+ NewStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ }
+ }
+
+ // If all uses use the same type, don't bother looking for truncation-based
+ // reuse.
+ if (Types.size() == 1)
+ Types.clear();
+
+ LLVM_DEBUG(print_factors_and_types(dbgs()));
+}
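+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It specializes the factor collection above to constant strides; the
+// real code works on SCEVs, sign-extends mismatched widths, and uses
+// getExactSDiv(). Assumes <set> and <cstdint> are available.
+#if 0
+static std::set<int64_t> collectFactors(const std::set<int64_t> &Strides) {
+  std::set<int64_t> Factors;
+  for (int64_t Old : Strides)
+    for (int64_t New : Strides) {
+      if (Old == New || Old == 0 || New == 0)
+        continue;
+      if (New % Old == 0)
+        Factors.insert(New / Old); // e.g. strides {4, 8, 12} yield {2, 3}
+      else if (Old % New == 0)
+        Factors.insert(Old / New);
+    }
+  return Factors;
+}
+#endif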
+
+/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
+/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
+/// IVStrideUses, we could partially skip this.
+static User::op_iterator
+findIVOperand(User::op_iterator OI, User::op_iterator OE,
+ Loop *L, ScalarEvolution &SE) {
+ for(; OI != OE; ++OI) {
+ if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
+ if (!SE.isSCEVable(Oper->getType()))
+ continue;
+
+ if (const SCEVAddRecExpr *AR =
+ dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
+ if (AR->getLoop() == L)
+ break;
+ }
+ }
+ }
+ return OI;
+}
+
+/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
+/// a convenient helper.
+static Value *getWideOperand(Value *Oper) {
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
+ return Trunc->getOperand(0);
+ return Oper;
+}
+
+/// Return true if we allow an IV chain to include both types.
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+ Type *LType = LVal->getType();
+ Type *RType = RVal->getType();
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces means (possibly)
+ // different types of the pointer implementation,
+ // e.g. i16 vs i32 so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
+}
+
+/// Return an approximation of this SCEV expression's "base", or NULL for any
+/// constant. Returning the expression itself is conservative. Returning a
+/// deeper subexpression is more precise and valid as long as it isn't less
+/// complex than another subexpression. For expressions involving multiple
+/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
+/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
+/// IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknown, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+ switch (S->getSCEVType()) {
+  default: // including scUnknown.
+ return S;
+ case scConstant:
+ return nullptr;
+ case scTruncate:
+ return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+ case scZeroExtend:
+ return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+ case scSignExtend:
+ return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+ case scAddExpr: {
+ // Skip over scaled operands (scMulExpr) to follow add operands as long as
+ // there's nothing more complex.
+ // FIXME: not sure if we want to recognize negation.
+ const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+ for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+ E(Add->op_begin()); I != E; ++I) {
+ const SCEV *SubExpr = *I;
+ if (SubExpr->getSCEVType() == scAddExpr)
+ return getExprBase(SubExpr);
+
+ if (SubExpr->getSCEVType() != scMulExpr)
+ return SubExpr;
+ }
+ return S; // all operands are scaled, be conservative.
+ }
+ case scAddRecExpr:
+ return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+ }
llvm_unreachable("Unknown SCEV kind!");
-}
-
-/// Return true if the chain increment is profitable to expand into a loop
-/// invariant value, which may require its own register. A profitable chain
-/// increment will be an offset relative to the same base. We allow such offsets
-/// to potentially be used as a chain increment as long as it's not obviously
-/// expensive to expand using real instructions.
-bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
- const SCEV *IncExpr,
- ScalarEvolution &SE) {
- // Aggressively form chains when -stress-ivchain.
- if (StressIVChain)
- return true;
-
- // Do not replace a constant offset from IV head with a nonconstant IV
- // increment.
- if (!isa<SCEVConstant>(IncExpr)) {
- const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
- if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
- return false;
- }
-
- SmallPtrSet<const SCEV*, 8> Processed;
- return !isHighCostExpansion(IncExpr, Processed, SE);
-}
-
-/// Return true if the number of registers needed for the chain is estimated to
-/// be less than the number required for the individual IV users. First prohibit
-/// any IV users that keep the IV live across increments (the Users set should
-/// be empty). Next count the number and type of increments in the chain.
-///
-/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
-/// effectively use postinc addressing modes. Only consider it profitable if the
-/// increments can be computed in fewer registers when chained.
-///
-/// TODO: Consider IVInc free if it's already used in other chains.
-static bool isProfitableChain(IVChain &Chain,
- SmallPtrSetImpl<Instruction *> &Users,
- ScalarEvolution &SE,
- const TargetTransformInfo &TTI) {
- if (StressIVChain)
- return true;
-
- if (!Chain.hasIncs())
- return false;
-
- if (!Users.empty()) {
- LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
- for (Instruction *Inst
- : Users) { dbgs() << " " << *Inst << "\n"; });
- return false;
- }
- assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
-
-  // The chain itself may require a register, so initialize cost to 1.
- int cost = 1;
-
- // A complete chain likely eliminates the need for keeping the original IV in
- // a register. LSR does not currently know how to form a complete chain unless
- // the header phi already exists.
- if (isa<PHINode>(Chain.tailUserInst())
- && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
- --cost;
- }
- const SCEV *LastIncExpr = nullptr;
- unsigned NumConstIncrements = 0;
- unsigned NumVarIncrements = 0;
- unsigned NumReusedIncrements = 0;
-
- if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
- return true;
-
- for (const IVInc &Inc : Chain) {
- if (TTI.isProfitableLSRChainElement(Inc.UserInst))
- return true;
- if (Inc.IncExpr->isZero())
- continue;
-
- // Incrementing by zero or some constant is neutral. We assume constants can
- // be folded into an addressing mode or an add's immediate operand.
- if (isa<SCEVConstant>(Inc.IncExpr)) {
- ++NumConstIncrements;
- continue;
- }
-
- if (Inc.IncExpr == LastIncExpr)
- ++NumReusedIncrements;
- else
- ++NumVarIncrements;
-
- LastIncExpr = Inc.IncExpr;
- }
- // An IV chain with a single increment is handled by LSR's postinc
- // uses. However, a chain with multiple increments requires keeping the IV's
- // value live longer than it needs to be if chained.
- if (NumConstIncrements > 1)
- --cost;
-
- // Materializing increment expressions in the preheader that didn't exist in
- // the original code may cost a register. For example, sign-extended array
- // indices can produce ridiculous increments like this:
- // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
- cost += NumVarIncrements;
-
- // Reusing variable increments likely saves a register to hold the multiple of
- // the stride.
- cost -= NumReusedIncrements;
-
- LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
- << "\n");
-
- return cost < 0;
-}
-
-/// Add this IV user to an existing chain or make it the head of a new chain.
-void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
- SmallVectorImpl<ChainUsers> &ChainUsersVec) {
- // When IVs are used as types of varying widths, they are generally converted
- // to a wider type with some uses remaining narrow under a (free) trunc.
- Value *const NextIV = getWideOperand(IVOper);
- const SCEV *const OperExpr = SE.getSCEV(NextIV);
- const SCEV *const OperExprBase = getExprBase(OperExpr);
-
- // Visit all existing chains. Check if its IVOper can be computed as a
- // profitable loop invariant increment from the last link in the Chain.
- unsigned ChainIdx = 0, NChains = IVChainVec.size();
- const SCEV *LastIncExpr = nullptr;
- for (; ChainIdx < NChains; ++ChainIdx) {
- IVChain &Chain = IVChainVec[ChainIdx];
-
- // Prune the solution space aggressively by checking that both IV operands
- // are expressions that operate on the same unscaled SCEVUnknown. This
- // "base" will be canceled by the subsequent getMinusSCEV call. Checking
- // first avoids creating extra SCEV expressions.
- if (!StressIVChain && Chain.ExprBase != OperExprBase)
- continue;
-
- Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
- if (!isCompatibleIVType(PrevIV, NextIV))
- continue;
-
- // A phi node terminates a chain.
- if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
- continue;
-
- // The increment must be loop-invariant so it can be kept in a register.
- const SCEV *PrevExpr = SE.getSCEV(PrevIV);
- const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
- if (!SE.isLoopInvariant(IncExpr, L))
- continue;
-
- if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
- LastIncExpr = IncExpr;
- break;
- }
- }
- // If we haven't found a chain, create a new one, unless we hit the max. Don't
- // bother for phi nodes, because they must be last in the chain.
- if (ChainIdx == NChains) {
- if (isa<PHINode>(UserInst))
- return;
- if (NChains >= MaxChains && !StressIVChain) {
- LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
- return;
- }
- LastIncExpr = OperExpr;
- // IVUsers may have skipped over sign/zero extensions. We don't currently
- // attempt to form chains involving extensions unless they can be hoisted
- // into this loop's AddRec.
- if (!isa<SCEVAddRecExpr>(LastIncExpr))
- return;
- ++NChains;
- IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
- OperExprBase));
- ChainUsersVec.resize(NChains);
- LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
- << ") IV=" << *LastIncExpr << "\n");
- } else {
- LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
- << ") IV+" << *LastIncExpr << "\n");
- // Add this IV user to the end of the chain.
- IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
- }
- IVChain &Chain = IVChainVec[ChainIdx];
-
- SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
- // This chain's NearUsers become FarUsers.
- if (!LastIncExpr->isZero()) {
- ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
- NearUsers.end());
- NearUsers.clear();
- }
-
- // All other uses of IVOperand become near uses of the chain.
- // We currently ignore intermediate values within SCEV expressions, assuming
-  // they will eventually be used by the current chain, or can be computed
-  // from one of the chain increments. To be more precise we could
-  // transitively follow their users and only add leaf IV users to the set.
- for (User *U : IVOper->users()) {
- Instruction *OtherUse = dyn_cast<Instruction>(U);
- if (!OtherUse)
- continue;
- // Uses in the chain will no longer be uses if the chain is formed.
- // Include the head of the chain in this iteration (not Chain.begin()).
- IVChain::const_iterator IncIter = Chain.Incs.begin();
- IVChain::const_iterator IncEnd = Chain.Incs.end();
- for( ; IncIter != IncEnd; ++IncIter) {
- if (IncIter->UserInst == OtherUse)
- break;
- }
- if (IncIter != IncEnd)
- continue;
-
- if (SE.isSCEVable(OtherUse->getType())
- && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
- && IU.isIVUserOrOperand(OtherUse)) {
- continue;
- }
- NearUsers.insert(OtherUse);
- }
-
- // Since this user is part of the chain, it's no longer considered a use
- // of the chain.
- ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
-}
-
-/// Populate the vector of Chains.
-///
-/// This decreases ILP at the architecture level. Targets with ample registers,
-/// multiple memory ports, and no register renaming probably don't want
-/// this. However, such targets should probably disable LSR altogether.
-///
-/// The job of LSR is to make a reasonable choice of induction variables across
-/// the loop. Subsequent passes can easily "unchain" computation exposing more
-/// ILP *within the loop* if the target wants it.
-///
-/// Finding the best IV chain is potentially a scheduling problem. Since LSR
-/// will not reorder memory operations, it will recognize this as a chain, but
-/// will generate redundant IV increments. Ideally this would be corrected later
-/// by a smart scheduler:
-/// = A[i]
-/// = A[i+x]
-/// A[i] =
-/// A[i+x] =
-///
-/// TODO: Walk the entire domtree within this loop, not just the path to the
-/// loop latch. This will discover chains on side paths, but requires
-/// maintaining multiple copies of the Chains state.
-void LSRInstance::CollectChains() {
- LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
- SmallVector<ChainUsers, 8> ChainUsersVec;
-
- SmallVector<BasicBlock *,8> LatchPath;
- BasicBlock *LoopHeader = L->getHeader();
- for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
- Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
- LatchPath.push_back(Rung->getBlock());
- }
- LatchPath.push_back(LoopHeader);
-
- // Walk the instruction stream from the loop header to the loop latch.
- for (BasicBlock *BB : reverse(LatchPath)) {
- for (Instruction &I : *BB) {
- // Skip instructions that weren't seen by IVUsers analysis.
- if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
- continue;
-
- // Ignore users that are part of a SCEV expression. This way we only
- // consider leaf IV Users. This effectively rediscovers a portion of
- // IVUsers analysis but in program order this time.
- if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
- continue;
-
- // Remove this instruction from any NearUsers set it may be in.
- for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
- ChainIdx < NChains; ++ChainIdx) {
- ChainUsersVec[ChainIdx].NearUsers.erase(&I);
- }
- // Search for operands that can be chained.
- SmallPtrSet<Instruction*, 4> UniqueOperands;
- User::op_iterator IVOpEnd = I.op_end();
- User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
- while (IVOpIter != IVOpEnd) {
- Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
- if (UniqueOperands.insert(IVOpInst).second)
- ChainInstruction(&I, IVOpInst, ChainUsersVec);
- IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
- }
- } // Continue walking down the instructions.
- } // Continue walking down the domtree.
- // Visit phi backedges to determine if the chain can generate the IV postinc.
- for (PHINode &PN : L->getHeader()->phis()) {
- if (!SE.isSCEVable(PN.getType()))
- continue;
-
- Instruction *IncV =
- dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
- if (IncV)
- ChainInstruction(&PN, IncV, ChainUsersVec);
- }
- // Remove any unprofitable chains.
- unsigned ChainIdx = 0;
- for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
- UsersIdx < NChains; ++UsersIdx) {
- if (!isProfitableChain(IVChainVec[UsersIdx],
- ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
- continue;
-    // Preserve the chain at UsersIdx.
- if (ChainIdx != UsersIdx)
- IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
- FinalizeChain(IVChainVec[ChainIdx]);
- ++ChainIdx;
- }
- IVChainVec.resize(ChainIdx);
-}
-
-void LSRInstance::FinalizeChain(IVChain &Chain) {
- assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
- LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
-
- for (const IVInc &Inc : Chain) {
- LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
- auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
- assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
- IVIncSet.insert(UseI);
- }
-}
-
-/// Return true if the IVInc can be folded into an addressing mode.
-static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
- Value *Operand, const TargetTransformInfo &TTI) {
- const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
- if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
- return false;
-
- if (IncConst->getAPInt().getMinSignedBits() > 64)
- return false;
-
- MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
- int64_t IncOffset = IncConst->getValue()->getSExtValue();
- if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
- IncOffset, /*HasBaseReg=*/false))
- return false;
-
- return true;
-}
-
-/// Generate an add or subtract for each IVInc in a chain to materialize the IV
-/// user's operand from the previous IV user's operand.
-void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
- // Find the new IVOperand for the head of the chain. It may have been replaced
- // by LSR.
- const IVInc &Head = Chain.Incs[0];
- User::op_iterator IVOpEnd = Head.UserInst->op_end();
- // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
- User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
- IVOpEnd, L, SE);
- Value *IVSrc = nullptr;
- while (IVOpIter != IVOpEnd) {
- IVSrc = getWideOperand(*IVOpIter);
-
- // If this operand computes the expression that the chain needs, we may use
- // it. (Check this after setting IVSrc which is used below.)
- //
- // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
- // narrow for the chain, so we can no longer use it. We do allow using a
- // wider phi, assuming the LSR checked for free truncation. In that case we
- // should already have a truncate on this operand such that
- // getSCEV(IVSrc) == IncExpr.
- if (SE.getSCEV(*IVOpIter) == Head.IncExpr
- || SE.getSCEV(IVSrc) == Head.IncExpr) {
- break;
- }
- IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
- }
- if (IVOpIter == IVOpEnd) {
- // Gracefully give up on this chain.
- LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
- return;
- }
- assert(IVSrc && "Failed to find IV chain source");
-
- LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
- Type *IVTy = IVSrc->getType();
- Type *IntTy = SE.getEffectiveSCEVType(IVTy);
- const SCEV *LeftOverExpr = nullptr;
- for (const IVInc &Inc : Chain) {
- Instruction *InsertPt = Inc.UserInst;
- if (isa<PHINode>(InsertPt))
- InsertPt = L->getLoopLatch()->getTerminator();
-
- // IVOper will replace the current IV User's operand. IVSrc is the IV
- // value currently held in a register.
- Value *IVOper = IVSrc;
- if (!Inc.IncExpr->isZero()) {
- // IncExpr was the result of subtraction of two narrow values, so must
- // be signed.
- const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
- LeftOverExpr = LeftOverExpr ?
- SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
- }
- if (LeftOverExpr && !LeftOverExpr->isZero()) {
- // Expand the IV increment.
- Rewriter.clearPostInc();
- Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
- const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
- SE.getUnknown(IncV));
- IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
-
- // If an IV increment can't be folded, use it as the next IV value.
- if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
- assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
- IVSrc = IVOper;
- LeftOverExpr = nullptr;
- }
- }
- Type *OperTy = Inc.IVOperand->getType();
- if (IVTy != OperTy) {
- assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
- "cannot extend a chained IV");
- IRBuilder<> Builder(InsertPt);
- IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
- }
- Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
- if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
- DeadInsts.emplace_back(OperandIsInstr);
- }
- // If LSR created a new, wider phi, we may also replace its postinc. We only
- // do this if we also found a wide value for the head of the chain.
- if (isa<PHINode>(Chain.tailUserInst())) {
- for (PHINode &Phi : L->getHeader()->phis()) {
- if (!isCompatibleIVType(&Phi, IVSrc))
- continue;
- Instruction *PostIncV = dyn_cast<Instruction>(
- Phi.getIncomingValueForBlock(L->getLoopLatch()));
- if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
- continue;
- Value *IVOper = IVSrc;
- Type *PostIncTy = PostIncV->getType();
- if (IVTy != PostIncTy) {
- assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
- IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
- Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
- IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
- }
- Phi.replaceUsesOfWith(PostIncV, IVOper);
- DeadInsts.emplace_back(PostIncV);
- }
- }
-}
-
-void LSRInstance::CollectFixupsAndInitialFormulae() {
- BranchInst *ExitBranch = nullptr;
- bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
-
- for (const IVStrideUse &U : IU) {
- Instruction *UserInst = U.getUser();
- // Skip IV users that are part of profitable IV Chains.
- User::op_iterator UseI =
- find(UserInst->operands(), U.getOperandValToReplace());
- assert(UseI != UserInst->op_end() && "cannot find IV operand");
- if (IVIncSet.count(UseI)) {
- LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
- continue;
- }
-
- LSRUse::KindType Kind = LSRUse::Basic;
- MemAccessTy AccessTy;
- if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
- Kind = LSRUse::Address;
- AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
- }
-
- const SCEV *S = IU.getExpr(U);
- PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
-
- // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
- // (N - i == 0), and this allows (N - i) to be the expression that we work
- // with rather than just N or i, so we can consider the register
- // requirements for both N and i at the same time. Limiting this code to
- // equality icmps is not a problem because all interesting loops use
- // equality icmps, thanks to IndVarSimplify.
- if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
- // If CI can be saved in some target, like replaced inside hardware loop
- // in PowerPC, no need to generate initial formulae for it.
- if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
- continue;
- if (CI->isEquality()) {
- // Swap the operands if needed to put the OperandValToReplace on the
- // left, for consistency.
- Value *NV = CI->getOperand(1);
- if (NV == U.getOperandValToReplace()) {
- CI->setOperand(1, CI->getOperand(0));
- CI->setOperand(0, NV);
- NV = CI->getOperand(1);
- Changed = true;
- }
-
- // x == y --> x - y == 0
- const SCEV *N = SE.getSCEV(NV);
- if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
- // S is normalized, so normalize N before folding it into S
- // to keep the result normalized.
- N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
- Kind = LSRUse::ICmpZero;
- S = SE.getMinusSCEV(N, S);
- }
-
- // -1 and the negations of all interesting strides (except the negation
- // of -1) are now also interesting.
- for (size_t i = 0, e = Factors.size(); i != e; ++i)
- if (Factors[i] != -1)
- Factors.insert(-(uint64_t)Factors[i]);
- Factors.insert(-1);
- }
- }
-
- // Get or create an LSRUse.
- std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
- size_t LUIdx = P.first;
- int64_t Offset = P.second;
- LSRUse &LU = Uses[LUIdx];
-
- // Record the fixup.
- LSRFixup &LF = LU.getNewFixup();
- LF.UserInst = UserInst;
- LF.OperandValToReplace = U.getOperandValToReplace();
- LF.PostIncLoops = TmpPostIncLoops;
- LF.Offset = Offset;
- LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
-
- if (!LU.WidestFixupType ||
- SE.getTypeSizeInBits(LU.WidestFixupType) <
- SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
- LU.WidestFixupType = LF.OperandValToReplace->getType();
-
- // If this is the first use of this LSRUse, give it a formula.
- if (LU.Formulae.empty()) {
- InsertInitialFormula(S, LU, LUIdx);
- CountRegisters(LU.Formulae.back(), LUIdx);
- }
- }
-
- LLVM_DEBUG(print_fixups(dbgs()));
-}
-
-/// Insert a formula for the given expression into the given use, separating out
-/// loop-variant portions from loop-invariant and loop-computable portions.
-void
-LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
- // Mark uses whose expressions cannot be expanded.
- if (!isSafeToExpand(S, SE))
- LU.RigidFormula = true;
-
- Formula F;
- F.initialMatch(S, L, SE);
- bool Inserted = InsertFormula(LU, LUIdx, F);
- assert(Inserted && "Initial formula already exists!"); (void)Inserted;
-}
-
-/// Insert a simple single-register formula for the given expression into the
-/// given use.
-void
-LSRInstance::InsertSupplementalFormula(const SCEV *S,
- LSRUse &LU, size_t LUIdx) {
- Formula F;
- F.BaseRegs.push_back(S);
- F.HasBaseReg = true;
- bool Inserted = InsertFormula(LU, LUIdx, F);
- assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
-}
-
-/// Note which registers are used by the given formula, updating RegUses.
-void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
- if (F.ScaledReg)
- RegUses.countRegister(F.ScaledReg, LUIdx);
- for (const SCEV *BaseReg : F.BaseRegs)
- RegUses.countRegister(BaseReg, LUIdx);
-}
-
-/// If the given formula has not yet been inserted, add it to the list, and
-/// return true. Return false otherwise.
-bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
- // Do not insert formula that we will not be able to expand.
- assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
- "Formula is illegal");
-
- if (!LU.InsertFormula(F, *L))
- return false;
-
- CountRegisters(F, LUIdx);
- return true;
-}
-
-/// Check for other uses of loop-invariant values which we're tracking. These
-/// other uses will pin these values in registers, making them less profitable
-/// for elimination.
-/// TODO: This currently misses non-constant addrec step registers.
-/// TODO: Should this give more weight to users inside the loop?
-void
-LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
- SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
- SmallPtrSet<const SCEV *, 32> Visited;
-
- while (!Worklist.empty()) {
- const SCEV *S = Worklist.pop_back_val();
-
- // Don't process the same SCEV twice
- if (!Visited.insert(S).second)
- continue;
-
- if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
- Worklist.append(N->op_begin(), N->op_end());
+}
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such offsets
+/// to potentially be used as a chain increment as long as it's not obviously
+/// expensive to expand using real instructions.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution &SE) {
+ // Aggressively form chains when -stress-ivchain.
+ if (StressIVChain)
+ return true;
+
+ // Do not replace a constant offset from IV head with a nonconstant IV
+ // increment.
+ if (!isa<SCEVConstant>(IncExpr)) {
+ const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+ if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+ return false;
+ }
+
+ SmallPtrSet<const SCEV*, 8> Processed;
+ return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated to
+/// be less than the number required for the individual IV users. First prohibit
+/// any IV users that keep the IV live across increments (the Users set should
+/// be empty). Next count the number and type of increments in the chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable if the
+/// increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in other chains.
+static bool isProfitableChain(IVChain &Chain,
+ SmallPtrSetImpl<Instruction *> &Users,
+ ScalarEvolution &SE,
+ const TargetTransformInfo &TTI) {
+ if (StressIVChain)
+ return true;
+
+ if (!Chain.hasIncs())
+ return false;
+
+ if (!Users.empty()) {
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst
+ : Users) { dbgs() << " " << *Inst << "\n"; });
+ return false;
+ }
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+
+  // The chain itself may require a register, so initialize cost to 1.
+ int cost = 1;
+
+ // A complete chain likely eliminates the need for keeping the original IV in
+ // a register. LSR does not currently know how to form a complete chain unless
+ // the header phi already exists.
+ if (isa<PHINode>(Chain.tailUserInst())
+ && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
+ --cost;
+ }
+ const SCEV *LastIncExpr = nullptr;
+ unsigned NumConstIncrements = 0;
+ unsigned NumVarIncrements = 0;
+ unsigned NumReusedIncrements = 0;
+
+ if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
+ return true;
+
+ for (const IVInc &Inc : Chain) {
+ if (TTI.isProfitableLSRChainElement(Inc.UserInst))
+ return true;
+ if (Inc.IncExpr->isZero())
+ continue;
+
+ // Incrementing by zero or some constant is neutral. We assume constants can
+ // be folded into an addressing mode or an add's immediate operand.
+ if (isa<SCEVConstant>(Inc.IncExpr)) {
+ ++NumConstIncrements;
+ continue;
+ }
+
+ if (Inc.IncExpr == LastIncExpr)
+ ++NumReusedIncrements;
+ else
+ ++NumVarIncrements;
+
+ LastIncExpr = Inc.IncExpr;
+ }
+ // An IV chain with a single increment is handled by LSR's postinc
+ // uses. However, a chain with multiple increments requires keeping the IV's
+ // value live longer than it needs to be if chained.
+ if (NumConstIncrements > 1)
+ --cost;
+
+ // Materializing increment expressions in the preheader that didn't exist in
+ // the original code may cost a register. For example, sign-extended array
+ // indices can produce ridiculous increments like this:
+ // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+ cost += NumVarIncrements;
+
+ // Reusing variable increments likely saves a register to hold the multiple of
+ // the stride.
+ cost -= NumReusedIncrements;
+
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
+
+ return cost < 0;
+}
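The register accounting above is compact enough to restate on its own. The following is a minimal, standalone sketch of that heuristic; ChainSummary, estimateChainCost and isProfitable are invented names for illustration and are not part of LSRInstance or TTI.

// Standalone sketch of the chain-profitability arithmetic used by
// isProfitableChain: start at one register for the chain itself, credit a
// chain completed by the header phi and folded constant increments, charge
// each distinct variable increment, credit reused increments, and keep the
// chain only when the estimate goes negative.
#include <cassert>

struct ChainSummary {
  bool CompletesHeaderPhi;      // tail user is the loop-header phi of the IV
  unsigned NumConstIncrements;  // increments assumed foldable into addressing
  unsigned NumVarIncrements;    // distinct loop-invariant increment expressions
  unsigned NumReusedIncrements; // increments identical to the previous one
};

static int estimateChainCost(const ChainSummary &C) {
  int Cost = 1;                  // the chain itself may require a register
  if (C.CompletesHeaderPhi)
    --Cost;                      // the original IV register is likely freed
  if (C.NumConstIncrements > 1)
    --Cost;                      // constants fold into postinc/immediate fields
  Cost += C.NumVarIncrements;    // each variable step may need its own register
  Cost -= C.NumReusedIncrements; // reuse saves a multiple of the stride
  return Cost;
}

static bool isProfitable(const ChainSummary &C) {
  return estimateChainCost(C) < 0;
}

int main() {
  // Ends in the header phi, two foldable constants, one reused increment.
  ChainSummary Kept{true, 2, 0, 1};
  assert(estimateChainCost(Kept) == -2 && isProfitable(Kept));
  // Two distinct variable increments likely cost extra registers.
  ChainSummary Rejected{false, 0, 2, 0};
  assert(!isProfitable(Rejected));
  return 0;
}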
+
+/// Add this IV user to an existing chain or make it the head of a new chain.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+ // When IVs are used as types of varying widths, they are generally converted
+ // to a wider type with some uses remaining narrow under a (free) trunc.
+ Value *const NextIV = getWideOperand(IVOper);
+ const SCEV *const OperExpr = SE.getSCEV(NextIV);
+ const SCEV *const OperExprBase = getExprBase(OperExpr);
+
+ // Visit all existing chains. Check if its IVOper can be computed as a
+ // profitable loop invariant increment from the last link in the Chain.
+ unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ const SCEV *LastIncExpr = nullptr;
+ for (; ChainIdx < NChains; ++ChainIdx) {
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ // Prune the solution space aggressively by checking that both IV operands
+ // are expressions that operate on the same unscaled SCEVUnknown. This
+ // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+ // first avoids creating extra SCEV expressions.
+ if (!StressIVChain && Chain.ExprBase != OperExprBase)
+ continue;
+
+ Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
+ if (!isCompatibleIVType(PrevIV, NextIV))
+ continue;
+
+ // A phi node terminates a chain.
+ if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+ continue;
+
+ // The increment must be loop-invariant so it can be kept in a register.
+ const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+ const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+ if (!SE.isLoopInvariant(IncExpr, L))
+ continue;
+
+ if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
+ LastIncExpr = IncExpr;
+ break;
+ }
+ }
+ // If we haven't found a chain, create a new one, unless we hit the max. Don't
+ // bother for phi nodes, because they must be last in the chain.
+ if (ChainIdx == NChains) {
+ if (isa<PHINode>(UserInst))
+ return;
+ if (NChains >= MaxChains && !StressIVChain) {
+ LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
+ return;
+ }
+ LastIncExpr = OperExpr;
+ // IVUsers may have skipped over sign/zero extensions. We don't currently
+ // attempt to form chains involving extensions unless they can be hoisted
+ // into this loop's AddRec.
+ if (!isa<SCEVAddRecExpr>(LastIncExpr))
+ return;
+ ++NChains;
+ IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+ OperExprBase));
+ ChainUsersVec.resize(NChains);
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
+ // Add this IV user to the end of the chain.
+ IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
+ }
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+ // This chain's NearUsers become FarUsers.
+ if (!LastIncExpr->isZero()) {
+ ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+ NearUsers.end());
+ NearUsers.clear();
+ }
+
+ // All other uses of IVOperand become near uses of the chain.
+ // We currently ignore intermediate values within SCEV expressions, assuming
+  // they will eventually be used by the current chain, or can be computed
+ // from one of the chain increments. To be more precise we could
+ // transitively follow its user and only add leaf IV users to the set.
+ for (User *U : IVOper->users()) {
+ Instruction *OtherUse = dyn_cast<Instruction>(U);
+ if (!OtherUse)
+ continue;
+ // Uses in the chain will no longer be uses if the chain is formed.
+ // Include the head of the chain in this iteration (not Chain.begin()).
+ IVChain::const_iterator IncIter = Chain.Incs.begin();
+ IVChain::const_iterator IncEnd = Chain.Incs.end();
+ for( ; IncIter != IncEnd; ++IncIter) {
+ if (IncIter->UserInst == OtherUse)
+ break;
+ }
+ if (IncIter != IncEnd)
+ continue;
+
+ if (SE.isSCEVable(OtherUse->getType())
+ && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+ && IU.isIVUserOrOperand(OtherUse)) {
+ continue;
+ }
+ NearUsers.insert(OtherUse);
+ }
+
+ // Since this user is part of the chain, it's no longer considered a use
+ // of the chain.
+ ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// Populate the vector of Chains.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+/// = A[i]
+/// = A[i+x]
+/// A[i] =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
+void LSRInstance::CollectChains() {
+ LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
+ SmallVector<ChainUsers, 8> ChainUsersVec;
+
+ SmallVector<BasicBlock *,8> LatchPath;
+ BasicBlock *LoopHeader = L->getHeader();
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
+ Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+ LatchPath.push_back(Rung->getBlock());
+ }
+ LatchPath.push_back(LoopHeader);
+
+ // Walk the instruction stream from the loop header to the loop latch.
+ for (BasicBlock *BB : reverse(LatchPath)) {
+ for (Instruction &I : *BB) {
+ // Skip instructions that weren't seen by IVUsers analysis.
+ if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
+ continue;
+
+ // Ignore users that are part of a SCEV expression. This way we only
+ // consider leaf IV Users. This effectively rediscovers a portion of
+ // IVUsers analysis but in program order this time.
+ if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
+ continue;
+
+ // Remove this instruction from any NearUsers set it may be in.
+ for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ ChainIdx < NChains; ++ChainIdx) {
+ ChainUsersVec[ChainIdx].NearUsers.erase(&I);
+ }
+ // Search for operands that can be chained.
+ SmallPtrSet<Instruction*, 4> UniqueOperands;
+ User::op_iterator IVOpEnd = I.op_end();
+ User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
+ while (IVOpIter != IVOpEnd) {
+ Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+ if (UniqueOperands.insert(IVOpInst).second)
+ ChainInstruction(&I, IVOpInst, ChainUsersVec);
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ } // Continue walking down the instructions.
+ } // Continue walking down the domtree.
+ // Visit phi backedges to determine if the chain can generate the IV postinc.
+ for (PHINode &PN : L->getHeader()->phis()) {
+ if (!SE.isSCEVable(PN.getType()))
+ continue;
+
+ Instruction *IncV =
+ dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
+ if (IncV)
+ ChainInstruction(&PN, IncV, ChainUsersVec);
+ }
+ // Remove any unprofitable chains.
+ unsigned ChainIdx = 0;
+ for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+ UsersIdx < NChains; ++UsersIdx) {
+ if (!isProfitableChain(IVChainVec[UsersIdx],
+ ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+ continue;
+    // Preserve the chain at UsersIdx.
+ if (ChainIdx != UsersIdx)
+ IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+ FinalizeChain(IVChainVec[ChainIdx]);
+ ++ChainIdx;
+ }
+ IVChainVec.resize(ChainIdx);
+}
+
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+ LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+
+ for (const IVInc &Inc : Chain) {
+ LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
+ assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
+ IVIncSet.insert(UseI);
+ }
+}
+
+/// Return true if the IVInc can be folded into an addressing mode.
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+ Value *Operand, const TargetTransformInfo &TTI) {
+ const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+ if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
+ return false;
+
+ if (IncConst->getAPInt().getMinSignedBits() > 64)
+ return false;
+
+ MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
+ int64_t IncOffset = IncConst->getValue()->getSExtValue();
+ if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
+ IncOffset, /*HasBaseReg=*/false))
+ return false;
+
+ return true;
+}
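Whether a constant increment folds is decided entirely by the target through isAlwaysFoldable and TTI; the snippet below only illustrates the shape of that question. The [-255, 255] immediate range and the helper name are invented for the example and do not describe any real target.

// Toy model of the question canFoldIVIncExpr delegates to TTI: does this
// constant increment fit the addressing mode's immediate field?
#include <cassert>
#include <cstdint>
#include <optional>

static bool fitsToyImmediateField(std::optional<int64_t> IncConst) {
  if (!IncConst)                 // non-constant increments never fold
    return false;
  return *IncConst >= -255 && *IncConst <= 255; // invented example range
}

int main() {
  assert(fitsToyImmediateField(8));             // small stride: folds
  assert(!fitsToyImmediateField(4096));         // large stride: needs an add
  assert(!fitsToyImmediateField(std::nullopt)); // variable stride: never folds
  return 0;
}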
+
+/// Generate an add or subtract for each IVInc in a chain to materialize the IV
+/// user's operand from the previous IV user's operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ // Find the new IVOperand for the head of the chain. It may have been replaced
+ // by LSR.
+ const IVInc &Head = Chain.Incs[0];
+ User::op_iterator IVOpEnd = Head.UserInst->op_end();
+ // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
+ User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+ IVOpEnd, L, SE);
+ Value *IVSrc = nullptr;
+ while (IVOpIter != IVOpEnd) {
+ IVSrc = getWideOperand(*IVOpIter);
+
+ // If this operand computes the expression that the chain needs, we may use
+ // it. (Check this after setting IVSrc which is used below.)
+ //
+ // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+ // narrow for the chain, so we can no longer use it. We do allow using a
+ // wider phi, assuming the LSR checked for free truncation. In that case we
+ // should already have a truncate on this operand such that
+ // getSCEV(IVSrc) == IncExpr.
+ if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+ || SE.getSCEV(IVSrc) == Head.IncExpr) {
+ break;
+ }
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ if (IVOpIter == IVOpEnd) {
+ // Gracefully give up on this chain.
+ LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ return;
+ }
+ assert(IVSrc && "Failed to find IV chain source");
+
+ LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ Type *IVTy = IVSrc->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+ const SCEV *LeftOverExpr = nullptr;
+ for (const IVInc &Inc : Chain) {
+ Instruction *InsertPt = Inc.UserInst;
+ if (isa<PHINode>(InsertPt))
+ InsertPt = L->getLoopLatch()->getTerminator();
+
+ // IVOper will replace the current IV User's operand. IVSrc is the IV
+ // value currently held in a register.
+ Value *IVOper = IVSrc;
+ if (!Inc.IncExpr->isZero()) {
+ // IncExpr was the result of subtraction of two narrow values, so must
+ // be signed.
+ const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
+ LeftOverExpr = LeftOverExpr ?
+ SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+ }
+ if (LeftOverExpr && !LeftOverExpr->isZero()) {
+ // Expand the IV increment.
+ Rewriter.clearPostInc();
+ Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+ const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+ SE.getUnknown(IncV));
+ IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+ // If an IV increment can't be folded, use it as the next IV value.
+ if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
+ assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
+ IVSrc = IVOper;
+ LeftOverExpr = nullptr;
+ }
+ }
+ Type *OperTy = Inc.IVOperand->getType();
+ if (IVTy != OperTy) {
+ assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
+ "cannot extend a chained IV");
+ IRBuilder<> Builder(InsertPt);
+ IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
+ }
+ Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
+ DeadInsts.emplace_back(OperandIsInstr);
+ }
+ // If LSR created a new, wider phi, we may also replace its postinc. We only
+ // do this if we also found a wide value for the head of the chain.
+ if (isa<PHINode>(Chain.tailUserInst())) {
+ for (PHINode &Phi : L->getHeader()->phis()) {
+ if (!isCompatibleIVType(&Phi, IVSrc))
+ continue;
+ Instruction *PostIncV = dyn_cast<Instruction>(
+ Phi.getIncomingValueForBlock(L->getLoopLatch()));
+ if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
+ continue;
+ Value *IVOper = IVSrc;
+ Type *PostIncTy = PostIncV->getType();
+ if (IVTy != PostIncTy) {
+ assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
+ IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
+ Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
+ IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
+ }
+ Phi.replaceUsesOfWith(PostIncV, IVOper);
+ DeadInsts.emplace_back(PostIncV);
+ }
+ }
+}
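At the source level, the rewrite performed by GenerateIVChain corresponds to deriving each address from the previous access rather than recomputing it from the induction variable. The sketch below is illustrative C++ only, written by hand to show the two shapes; the function names are made up and the code is not LSR output.

#include <cassert>
#include <cstddef>

// Unchained form: three independent IV users, each address recomputed
// from the induction variable i.
long sum_unchained(const int *A, size_t N) {
  long S = 0;
  for (size_t i = 0; i + 2 < N; i += 3)
    S += A[i] + A[i + 1] + A[i + 2];
  return S;
}

// Chained form: each address is the previous address plus a small constant,
// and a single pointer increment per iteration carries the chain forward,
// which postinc-capable targets can fold into the memory operations.
long sum_chained(const int *A, size_t N) {
  long S = 0;
  const int *P = A;
  for (size_t i = 0; i + 2 < N; i += 3, P += 3) {
    S += P[0]; // head of the chain
    S += P[1]; // +1 relative to the previous access
    S += P[2]; // +1 again
  }
  return S;
}

int main() {
  int Data[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  assert(sum_unchained(Data, 12) == sum_chained(Data, 12));
  return 0;
}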
+
+void LSRInstance::CollectFixupsAndInitialFormulae() {
+ BranchInst *ExitBranch = nullptr;
+ bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
+
+ for (const IVStrideUse &U : IU) {
+ Instruction *UserInst = U.getUser();
+ // Skip IV users that are part of profitable IV Chains.
+ User::op_iterator UseI =
+ find(UserInst->operands(), U.getOperandValToReplace());
+ assert(UseI != UserInst->op_end() && "cannot find IV operand");
+ if (IVIncSet.count(UseI)) {
+ LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
+ continue;
+ }
+
+ LSRUse::KindType Kind = LSRUse::Basic;
+ MemAccessTy AccessTy;
+ if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
+ Kind = LSRUse::Address;
+ AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
+ }
+
+ const SCEV *S = IU.getExpr(U);
+ PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
+
+ // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
+ // (N - i == 0), and this allows (N - i) to be the expression that we work
+ // with rather than just N or i, so we can consider the register
+ // requirements for both N and i at the same time. Limiting this code to
+ // equality icmps is not a problem because all interesting loops use
+ // equality icmps, thanks to IndVarSimplify.
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
+ // If CI can be saved in some target, like replaced inside hardware loop
+ // in PowerPC, no need to generate initial formulae for it.
+ if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
+ continue;
+ if (CI->isEquality()) {
+ // Swap the operands if needed to put the OperandValToReplace on the
+ // left, for consistency.
+ Value *NV = CI->getOperand(1);
+ if (NV == U.getOperandValToReplace()) {
+ CI->setOperand(1, CI->getOperand(0));
+ CI->setOperand(0, NV);
+ NV = CI->getOperand(1);
+ Changed = true;
+ }
+
+ // x == y --> x - y == 0
+ const SCEV *N = SE.getSCEV(NV);
+ if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
+ // S is normalized, so normalize N before folding it into S
+ // to keep the result normalized.
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
+ Kind = LSRUse::ICmpZero;
+ S = SE.getMinusSCEV(N, S);
+ }
+
+ // -1 and the negations of all interesting strides (except the negation
+ // of -1) are now also interesting.
+ for (size_t i = 0, e = Factors.size(); i != e; ++i)
+ if (Factors[i] != -1)
+ Factors.insert(-(uint64_t)Factors[i]);
+ Factors.insert(-1);
+ }
+ }
+
+ // Get or create an LSRUse.
+ std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+
+ // Record the fixup.
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = UserInst;
+ LF.OperandValToReplace = U.getOperandValToReplace();
+ LF.PostIncLoops = TmpPostIncLoops;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+ // If this is the first use of this LSRUse, give it a formula.
+ if (LU.Formulae.empty()) {
+ InsertInitialFormula(S, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), LUIdx);
+ }
+ }
+
+ LLVM_DEBUG(print_fixups(dbgs()));
+}
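The ICmpZero handling above relies on a simple identity: comparing i against a loop-invariant bound N is equivalent to comparing N - i against zero, which lets the exit test share the expression LSR is already strength-reducing instead of keeping both i and N live. A minimal standalone check of that identity, using plain integers rather than SCEVs:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t N = 100;
  for (int64_t i = 0; i <= N; ++i) {
    bool Direct   = (i == N);       // the original exit comparison
    bool IcmpZero = ((N - i) == 0); // the rewritten, zero-compared form
    assert(Direct == IcmpZero);
  }
  return 0;
}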
+
+/// Insert a formula for the given expression into the given use, separating out
+/// loop-variant portions from loop-invariant and loop-computable portions.
+void
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+ // Mark uses whose expressions cannot be expanded.
+ if (!isSafeToExpand(S, SE))
+ LU.RigidFormula = true;
+
+ Formula F;
+ F.initialMatch(S, L, SE);
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Initial formula already exists!"); (void)Inserted;
+}
+
+/// Insert a simple single-register formula for the given expression into the
+/// given use.
+void
+LSRInstance::InsertSupplementalFormula(const SCEV *S,
+ LSRUse &LU, size_t LUIdx) {
+ Formula F;
+ F.BaseRegs.push_back(S);
+ F.HasBaseReg = true;
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
+}
+
+/// Note which registers are used by the given formula, updating RegUses.
+void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
+ if (F.ScaledReg)
+ RegUses.countRegister(F.ScaledReg, LUIdx);
+ for (const SCEV *BaseReg : F.BaseRegs)
+ RegUses.countRegister(BaseReg, LUIdx);
+}
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise.
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+ // Do not insert formula that we will not be able to expand.
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
+ "Formula is illegal");
+
+ if (!LU.InsertFormula(F, *L))
+ return false;
+
+ CountRegisters(F, LUIdx);
+ return true;
+}
+
+/// Check for other uses of loop-invariant values which we're tracking. These
+/// other uses will pin these values in registers, making them less profitable
+/// for elimination.
+/// TODO: This currently misses non-constant addrec step registers.
+/// TODO: Should this give more weight to users inside the loop?
+void
+LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
+ SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
+ SmallPtrSet<const SCEV *, 32> Visited;
+
+ while (!Worklist.empty()) {
+ const SCEV *S = Worklist.pop_back_val();
+
+ // Don't process the same SCEV twice
+ if (!Visited.insert(S).second)
+ continue;
+
+ if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
+ Worklist.append(N->op_begin(), N->op_end());
else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
- Worklist.push_back(C->getOperand());
- else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
- Worklist.push_back(D->getLHS());
- Worklist.push_back(D->getRHS());
- } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
- const Value *V = US->getValue();
- if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
- // Look for instructions defined outside the loop.
- if (L->contains(Inst)) continue;
- } else if (isa<UndefValue>(V))
- // Undef doesn't have a live range, so it doesn't matter.
- continue;
- for (const Use &U : V->uses()) {
- const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
- // Ignore non-instructions.
- if (!UserInst)
- continue;
- // Ignore instructions in other functions (as can happen with
- // Constants).
- if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
- continue;
- // Ignore instructions not dominated by the loop.
- const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
- UserInst->getParent() :
- cast<PHINode>(UserInst)->getIncomingBlock(
- PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
- if (!DT.dominates(L->getHeader(), UseBB))
- continue;
- // Don't bother if the instruction is in a BB which ends in an EHPad.
- if (UseBB->getTerminator()->isEHPad())
- continue;
- // Don't bother rewriting PHIs in catchswitch blocks.
- if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
- continue;
- // Ignore uses which are part of other SCEV expressions, to avoid
- // analyzing them multiple times.
- if (SE.isSCEVable(UserInst->getType())) {
- const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
- // If the user is a no-op, look through to its uses.
- if (!isa<SCEVUnknown>(UserS))
- continue;
- if (UserS == US) {
- Worklist.push_back(
- SE.getUnknown(const_cast<Instruction *>(UserInst)));
- continue;
- }
- }
- // Ignore icmp instructions which are already being analyzed.
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
- unsigned OtherIdx = !U.getOperandNo();
- Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
- if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
- continue;
- }
-
- std::pair<size_t, int64_t> P = getUse(
- S, LSRUse::Basic, MemAccessTy());
- size_t LUIdx = P.first;
- int64_t Offset = P.second;
- LSRUse &LU = Uses[LUIdx];
- LSRFixup &LF = LU.getNewFixup();
- LF.UserInst = const_cast<Instruction *>(UserInst);
- LF.OperandValToReplace = U;
- LF.Offset = Offset;
- LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
- if (!LU.WidestFixupType ||
- SE.getTypeSizeInBits(LU.WidestFixupType) <
- SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
- LU.WidestFixupType = LF.OperandValToReplace->getType();
- InsertSupplementalFormula(US, LU, LUIdx);
- CountRegisters(LU.Formulae.back(), Uses.size() - 1);
- break;
- }
- }
- }
-}
-
-/// Split S into subexpressions which can be pulled out into separate
-/// registers. If C is non-null, multiply each subexpression by C.
-///
-/// Return remainder expression after factoring the subexpressions captured by
-/// Ops. If Ops is complete, return NULL.
-static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
- SmallVectorImpl<const SCEV *> &Ops,
- const Loop *L,
- ScalarEvolution &SE,
- unsigned Depth = 0) {
- // Arbitrarily cap recursion to protect compile time.
- if (Depth >= 3)
- return S;
-
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- // Break out add operands.
- for (const SCEV *S : Add->operands()) {
- const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
- if (Remainder)
- Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
- }
- return nullptr;
- } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- // Split a non-zero base out of an addrec.
- if (AR->getStart()->isZero() || !AR->isAffine())
- return S;
-
- const SCEV *Remainder = CollectSubexprs(AR->getStart(),
- C, Ops, L, SE, Depth+1);
- // Split the non-zero AddRec unless it is part of a nested recurrence that
- // does not pertain to this loop.
- if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
- Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
- Remainder = nullptr;
- }
- if (Remainder != AR->getStart()) {
- if (!Remainder)
- Remainder = SE.getConstant(AR->getType(), 0);
- return SE.getAddRecExpr(Remainder,
- AR->getStepRecurrence(SE),
- AR->getLoop(),
- //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- SCEV::FlagAnyWrap);
- }
- } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
- // Break (C * (a + b + c)) into C*a + C*b + C*c.
- if (Mul->getNumOperands() != 2)
- return S;
- if (const SCEVConstant *Op0 =
- dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
- C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
- const SCEV *Remainder =
- CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
- if (Remainder)
- Ops.push_back(SE.getMulExpr(C, Remainder));
- return nullptr;
- }
- }
- return S;
-}
-
-/// Return true if the SCEV represents a value that may end up as a
-/// post-increment operation.
-static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
- LSRUse &LU, const SCEV *S, const Loop *L,
- ScalarEvolution &SE) {
- if (LU.Kind != LSRUse::Address ||
- !LU.AccessTy.getType()->isIntOrIntVectorTy())
- return false;
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
- if (!AR)
- return false;
- const SCEV *LoopStep = AR->getStepRecurrence(SE);
- if (!isa<SCEVConstant>(LoopStep))
- return false;
- // Check if a post-indexed load/store can be used.
- if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
- TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
- const SCEV *LoopStart = AR->getStart();
- if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
- return true;
- }
- return false;
-}
-
-/// Helper function for LSRInstance::GenerateReassociations.
-void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base,
- unsigned Depth, size_t Idx,
- bool IsScaledReg) {
- const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
- // Don't generate reassociations for the base register of a value that
- // may generate a post-increment operator. The reason is that the
- // reassociations cause extra base+register formula to be created,
- // and possibly chosen, but the post-increment is more efficient.
- if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
- return;
- SmallVector<const SCEV *, 8> AddOps;
- const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
- if (Remainder)
- AddOps.push_back(Remainder);
-
- if (AddOps.size() == 1)
- return;
-
- for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
- JE = AddOps.end();
- J != JE; ++J) {
- // Loop-variant "unknown" values are uninteresting; we won't be able to
- // do anything meaningful with them.
- if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
- continue;
-
- // Don't pull a constant into a register if the constant could be folded
- // into an immediate field.
- if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, *J, Base.getNumRegs() > 1))
- continue;
-
- // Collect all operands except *J.
- SmallVector<const SCEV *, 8> InnerAddOps(
- ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
- InnerAddOps.append(std::next(J),
- ((const SmallVector<const SCEV *, 8> &)AddOps).end());
-
- // Don't leave just a constant behind in a register if the constant could
- // be folded into an immediate field.
- if (InnerAddOps.size() == 1 &&
- isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
- continue;
-
- const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
- if (InnerSum->isZero())
- continue;
- Formula F = Base;
-
- // Add the remaining pieces of the add back into the new formula.
- const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
- if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
- TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
- InnerSumSC->getValue()->getZExtValue())) {
- F.UnfoldedOffset =
- (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
- if (IsScaledReg)
- F.ScaledReg = nullptr;
- else
- F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
- } else if (IsScaledReg)
- F.ScaledReg = InnerSum;
- else
- F.BaseRegs[Idx] = InnerSum;
-
- // Add J as its own register, or an unfolded immediate.
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
- if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
- TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
- SC->getValue()->getZExtValue()))
- F.UnfoldedOffset =
- (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
- else
- F.BaseRegs.push_back(*J);
-    // We may have changed the number of registers in base regs, adjust the
- // formula accordingly.
- F.canonicalize(*L);
-
- if (InsertFormula(LU, LUIdx, F))
- // If that formula hadn't been seen before, recurse to find more like
- // it.
-      // Add a term based on Log16(AddOps.size()) (i.e. Log2_32(AddOps.size()) >> 2),
-      // because Depth alone is not enough to bound compile time.
-      // This means that every time AddOps.size() exceeds 16^x we add
-      // x to Depth.
- GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
- Depth + 1 + (Log2_32(AddOps.size()) >> 2));
- }
-}
-
-/// Split out subexpressions from adds and the bases of addrecs.
-void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
- Formula Base, unsigned Depth) {
- assert(Base.isCanonical(*L) && "Input must be in the canonical form");
- // Arbitrarily cap recursion to protect compile time.
- if (Depth >= 3)
- return;
-
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
-
- if (Base.Scale == 1)
- GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
- /* Idx */ -1, /* IsScaledReg */ true);
-}
-
-/// Generate a formula consisting of all of the loop-dominating registers added
-/// into a single register.
-void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- // This method is only interesting on a plurality of registers.
- if (Base.BaseRegs.size() + (Base.Scale == 1) +
- (Base.UnfoldedOffset != 0) <= 1)
- return;
-
- // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
- // processing the formula.
- Base.unscale();
- SmallVector<const SCEV *, 4> Ops;
- Formula NewBase = Base;
- NewBase.BaseRegs.clear();
- Type *CombinedIntegerType = nullptr;
- for (const SCEV *BaseReg : Base.BaseRegs) {
- if (SE.properlyDominates(BaseReg, L->getHeader()) &&
- !SE.hasComputableLoopEvolution(BaseReg, L)) {
- if (!CombinedIntegerType)
- CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
- Ops.push_back(BaseReg);
- }
- else
- NewBase.BaseRegs.push_back(BaseReg);
- }
-
- // If no register is relevant, we're done.
- if (Ops.size() == 0)
- return;
-
- // Utility function for generating the required variants of the combined
- // registers.
- auto GenerateFormula = [&](const SCEV *Sum) {
- Formula F = NewBase;
-
- // TODO: If Sum is zero, it probably means ScalarEvolution missed an
- // opportunity to fold something. For now, just ignore such cases
- // rather than proceed with zero in a register.
- if (Sum->isZero())
- return;
-
- F.BaseRegs.push_back(Sum);
- F.canonicalize(*L);
- (void)InsertFormula(LU, LUIdx, F);
- };
-
- // If we collected at least two registers, generate a formula combining them.
- if (Ops.size() > 1) {
- SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
- GenerateFormula(SE.getAddExpr(OpsCopy));
- }
-
- // If we have an unfolded offset, generate a formula combining it with the
- // registers collected.
- if (NewBase.UnfoldedOffset) {
- assert(CombinedIntegerType && "Missing a type for the unfolded offset");
- Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
- true));
- NewBase.UnfoldedOffset = 0;
- GenerateFormula(SE.getAddExpr(Ops));
- }
-}
-
-/// Helper function for LSRInstance::GenerateSymbolicOffsets.
-void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base, size_t Idx,
- bool IsScaledReg) {
- const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
- GlobalValue *GV = ExtractSymbol(G, SE);
- if (G->isZero() || !GV)
- return;
- Formula F = Base;
- F.BaseGV = GV;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
- return;
- if (IsScaledReg)
- F.ScaledReg = G;
- else
- F.BaseRegs[Idx] = G;
- (void)InsertFormula(LU, LUIdx, F);
-}
-
-/// Generate reuse formulae using symbolic offsets.
-void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- // We can't add a symbolic offset if the address already contains one.
- if (Base.BaseGV) return;
-
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
- if (Base.Scale == 1)
- GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
- /* IsScaledReg */ true);
-}
-
-/// Helper function for LSRInstance::GenerateConstantOffsets.
-void LSRInstance::GenerateConstantOffsetsImpl(
- LSRUse &LU, unsigned LUIdx, const Formula &Base,
- const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-
- auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
- Formula F = Base;
- F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
-
- if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
- LU.AccessTy, F)) {
- // Add the offset to the base register.
- const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
- // If it cancelled out, drop the base register, otherwise update it.
- if (NewG->isZero()) {
- if (IsScaledReg) {
- F.Scale = 0;
- F.ScaledReg = nullptr;
- } else
- F.deleteBaseReg(F.BaseRegs[Idx]);
- F.canonicalize(*L);
- } else if (IsScaledReg)
- F.ScaledReg = NewG;
- else
- F.BaseRegs[Idx] = NewG;
-
- (void)InsertFormula(LU, LUIdx, F);
- }
- };
-
- const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-
- // With constant offsets and constant steps, we can generate pre-inc
- // accesses by having the offset equal the step. So, for access #0 with a
- // step of 8, we generate a G - 8 base which would require the first access
- // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
-  // for itself and hopefully becomes the base for other accesses. This means
-  // that a single pre-indexed access can be generated to become the new
- // base pointer for each iteration of the loop, resulting in no extra add/sub
- // instructions for pointer updating.
- if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
- if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
- if (auto *StepRec =
- dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
- const APInt &StepInt = StepRec->getAPInt();
- int64_t Step = StepInt.isNegative() ?
- StepInt.getSExtValue() : StepInt.getZExtValue();
-
- for (int64_t Offset : Worklist) {
- Offset -= Step;
- GenerateOffset(G, Offset);
- }
- }
- }
- }
- for (int64_t Offset : Worklist)
- GenerateOffset(G, Offset);
-
- int64_t Imm = ExtractImmediate(G, SE);
- if (G->isZero() || Imm == 0)
- return;
- Formula F = Base;
- F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
- return;
+ Worklist.push_back(C->getOperand());
+ else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ Worklist.push_back(D->getLHS());
+ Worklist.push_back(D->getRHS());
+ } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
+ const Value *V = US->getValue();
+ if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+ // Look for instructions defined outside the loop.
+ if (L->contains(Inst)) continue;
+ } else if (isa<UndefValue>(V))
+ // Undef doesn't have a live range, so it doesn't matter.
+ continue;
+ for (const Use &U : V->uses()) {
+ const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+ // Ignore non-instructions.
+ if (!UserInst)
+ continue;
+ // Ignore instructions in other functions (as can happen with
+ // Constants).
+ if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
+ continue;
+ // Ignore instructions not dominated by the loop.
+ const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
+ UserInst->getParent() :
+ cast<PHINode>(UserInst)->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ if (!DT.dominates(L->getHeader(), UseBB))
+ continue;
+ // Don't bother if the instruction is in a BB which ends in an EHPad.
+ if (UseBB->getTerminator()->isEHPad())
+ continue;
+ // Don't bother rewriting PHIs in catchswitch blocks.
+ if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
+ continue;
+ // Ignore uses which are part of other SCEV expressions, to avoid
+ // analyzing them multiple times.
+ if (SE.isSCEVable(UserInst->getType())) {
+ const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
+ // If the user is a no-op, look through to its uses.
+ if (!isa<SCEVUnknown>(UserS))
+ continue;
+ if (UserS == US) {
+ Worklist.push_back(
+ SE.getUnknown(const_cast<Instruction *>(UserInst)));
+ continue;
+ }
+ }
+ // Ignore icmp instructions which are already being analyzed.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
+ unsigned OtherIdx = !U.getOperandNo();
+ Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+ if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
+ continue;
+ }
+
+ std::pair<size_t, int64_t> P = getUse(
+ S, LSRUse::Basic, MemAccessTy());
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = const_cast<Instruction *>(UserInst);
+ LF.OperandValToReplace = U;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+ InsertSupplementalFormula(US, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), Uses.size() - 1);
+ break;
+ }
+ }
+ }
+}
+
+/// Split S into subexpressions which can be pulled out into separate
+/// registers. If C is non-null, multiply each subexpression by C.
+///
+/// Return remainder expression after factoring the subexpressions captured by
+/// Ops. If Ops is complete, return NULL.
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+ SmallVectorImpl<const SCEV *> &Ops,
+ const Loop *L,
+ ScalarEvolution &SE,
+ unsigned Depth = 0) {
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return S;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ // Break out add operands.
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ }
+ return nullptr;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ // Split a non-zero base out of an addrec.
+ if (AR->getStart()->isZero() || !AR->isAffine())
+ return S;
+
+ const SCEV *Remainder = CollectSubexprs(AR->getStart(),
+ C, Ops, L, SE, Depth+1);
+ // Split the non-zero AddRec unless it is part of a nested recurrence that
+ // does not pertain to this loop.
+ if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ Remainder = nullptr;
+ }
+ if (Remainder != AR->getStart()) {
+ if (!Remainder)
+ Remainder = SE.getConstant(AR->getType(), 0);
+ return SE.getAddRecExpr(Remainder,
+ AR->getStepRecurrence(SE),
+ AR->getLoop(),
+ //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ }
+ } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ // Break (C * (a + b + c)) into C*a + C*b + C*c.
+ if (Mul->getNumOperands() != 2)
+ return S;
+ if (const SCEVConstant *Op0 =
+ dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
+ C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
+ const SCEV *Remainder =
+ CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(SE.getMulExpr(C, Remainder));
+ return nullptr;
+ }
+ }
+ return S;
+}
+
+/// Return true if the SCEV represents a value that may end up as a
+/// post-increment operation.
+static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
+ LSRUse &LU, const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ if (LU.Kind != LSRUse::Address ||
+ !LU.AccessTy.getType()->isIntOrIntVectorTy())
+ return false;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AR)
+ return false;
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(LoopStep))
+ return false;
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
+ return true;
+ }
+ return false;
+}
+
+/// Helper function for LSRInstance::GenerateReassociations.
+void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ unsigned Depth, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ // Don't generate reassociations for the base register of a value that
+ // may generate a post-increment operator. The reason is that the
+ // reassociations cause extra base+register formula to be created,
+ // and possibly chosen, but the post-increment is more efficient.
+ if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
+ return;
+ SmallVector<const SCEV *, 8> AddOps;
+ const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
+ if (Remainder)
+ AddOps.push_back(Remainder);
+
+ if (AddOps.size() == 1)
+ return;
+
+ for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
+ JE = AddOps.end();
+ J != JE; ++J) {
+ // Loop-variant "unknown" values are uninteresting; we won't be able to
+ // do anything meaningful with them.
+ if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
+ continue;
+
+ // Don't pull a constant into a register if the constant could be folded
+ // into an immediate field.
+ if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, *J, Base.getNumRegs() > 1))
+ continue;
+
+ // Collect all operands except *J.
+ SmallVector<const SCEV *, 8> InnerAddOps(
+ ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+ InnerAddOps.append(std::next(J),
+ ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+
+ // Don't leave just a constant behind in a register if the constant could
+ // be folded into an immediate field.
+ if (InnerAddOps.size() == 1 &&
+ isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
+ continue;
+
+ const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
+ if (InnerSum->isZero())
+ continue;
+ Formula F = Base;
+
+ // Add the remaining pieces of the add back into the new formula.
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+ if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue())) {
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+ if (IsScaledReg)
+ F.ScaledReg = nullptr;
+ else
+ F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
+ } else if (IsScaledReg)
+ F.ScaledReg = InnerSum;
+ else
+ F.BaseRegs[Idx] = InnerSum;
+
+ // Add J as its own register, or an unfolded immediate.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+ if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue()))
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+ else
+ F.BaseRegs.push_back(*J);
+    // We may have changed the number of registers in base regs, adjust the
+ // formula accordingly.
+ F.canonicalize(*L);
+
+ if (InsertFormula(LU, LUIdx, F))
+ // If that formula hadn't been seen before, recurse to find more like
+ // it.
+      // Add a term based on Log16(AddOps.size()) (i.e. Log2_32(AddOps.size()) >> 2),
+      // because Depth alone is not enough to bound compile time.
+      // This means that every time AddOps.size() exceeds 16^x we add
+      // x to Depth.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
+ Depth + 1 + (Log2_32(AddOps.size()) >> 2));
+ }
+}
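The recursion bound mentioned in the comment above charges one level plus log16 of the operand count, so wide adds exhaust the depth budget of 3 quickly. A standalone sketch of that bookkeeping follows; log2_32 is reimplemented here only so the example compiles on its own and mirrors llvm::Log2_32 for the positive inputs used.

#include <cassert>
#include <cstdint>

static unsigned log2_32(uint32_t X) { // stand-in for llvm::Log2_32
  unsigned R = 0;
  while (X >>= 1)
    ++R;
  return R;
}

static unsigned nextDepth(unsigned Depth, unsigned NumAddOps) {
  // One level per recursion plus log16(NumAddOps), as in the call above.
  return Depth + 1 + (log2_32(NumAddOps) >> 2);
}

int main() {
  assert(nextDepth(0, 2) == 1);   // narrow adds: plain +1 per level
  assert(nextDepth(0, 16) == 2);  // 16 operands: one extra level charged
  assert(nextDepth(0, 256) == 3); // 256 operands: the depth cap is reached
  return 0;
}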
+
+/// Split out subexpressions from adds and the bases of addrecs.
+void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
+ Formula Base, unsigned Depth) {
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
+
+ if (Base.Scale == 1)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
+ /* Idx */ -1, /* IsScaledReg */ true);
+}
+
+/// Generate a formula consisting of all of the loop-dominating registers added
+/// into a single register.
+void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // This method is only interesting on a plurality of registers.
+ if (Base.BaseRegs.size() + (Base.Scale == 1) +
+ (Base.UnfoldedOffset != 0) <= 1)
+ return;
+
+ // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
+ // processing the formula.
+ Base.unscale();
+ SmallVector<const SCEV *, 4> Ops;
+ Formula NewBase = Base;
+ NewBase.BaseRegs.clear();
+ Type *CombinedIntegerType = nullptr;
+ for (const SCEV *BaseReg : Base.BaseRegs) {
+ if (SE.properlyDominates(BaseReg, L->getHeader()) &&
+ !SE.hasComputableLoopEvolution(BaseReg, L)) {
+ if (!CombinedIntegerType)
+ CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
+ Ops.push_back(BaseReg);
+ }
+ else
+ NewBase.BaseRegs.push_back(BaseReg);
+ }
+
+ // If no register is relevant, we're done.
+ if (Ops.size() == 0)
+ return;
+
+ // Utility function for generating the required variants of the combined
+ // registers.
+ auto GenerateFormula = [&](const SCEV *Sum) {
+ Formula F = NewBase;
+
+ // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+ // opportunity to fold something. For now, just ignore such cases
+ // rather than proceed with zero in a register.
+ if (Sum->isZero())
+ return;
+
+ F.BaseRegs.push_back(Sum);
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ };
+
+ // If we collected at least two registers, generate a formula combining them.
+ if (Ops.size() > 1) {
+ SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
+ GenerateFormula(SE.getAddExpr(OpsCopy));
+ }
+
+ // If we have an unfolded offset, generate a formula combining it with the
+ // registers collected.
+ if (NewBase.UnfoldedOffset) {
+ assert(CombinedIntegerType && "Missing a type for the unfolded offset");
+ Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
+ true));
+ NewBase.UnfoldedOffset = 0;
+ GenerateFormula(SE.getAddExpr(Ops));
+ }
+}
+
+/// Helper function for LSRInstance::GenerateSymbolicOffsets.
+void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ GlobalValue *GV = ExtractSymbol(G, SE);
+ if (G->isZero() || !GV)
+ return;
+ Formula F = Base;
+ F.BaseGV = GV;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
+ if (IsScaledReg)
+ F.ScaledReg = G;
+ else
+ F.BaseRegs[Idx] = G;
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // We can't add a symbolic offset if the address already contains one.
+ if (Base.BaseGV) return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
+ if (Base.Scale == 1)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
+
+/// Helper function for LSRInstance::GenerateConstantOffsets.
+void LSRInstance::GenerateConstantOffsetsImpl(
+ LSRUse &LU, unsigned LUIdx, const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+
+ auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+
+ if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
+ LU.AccessTy, F)) {
+ // Add the offset to the base register.
+ const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+ // If it cancelled out, drop the base register, otherwise update it.
+ if (NewG->isZero()) {
+ if (IsScaledReg) {
+ F.Scale = 0;
+ F.ScaledReg = nullptr;
+ } else
+ F.deleteBaseReg(F.BaseRegs[Idx]);
+ F.canonicalize(*L);
+ } else if (IsScaledReg)
+ F.ScaledReg = NewG;
+ else
+ F.BaseRegs[Idx] = NewG;
+
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ };
+
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+ // With constant offsets and constant steps, we can generate pre-inc
+ // accesses by having the offset equal the step. So, for access #0 with a
+ // step of 8, we generate a G - 8 base which would require the first access
+ // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
+  // for itself and hopefully becomes the base for other accesses. This means
+  // that a single pre-indexed access can be generated to become the new
+ // base pointer for each iteration of the loop, resulting in no extra add/sub
+ // instructions for pointer updating.
+ if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
+ if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
+ if (auto *StepRec =
+ dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
+ const APInt &StepInt = StepRec->getAPInt();
+ int64_t Step = StepInt.isNegative() ?
+ StepInt.getSExtValue() : StepInt.getZExtValue();
+
+ for (int64_t Offset : Worklist) {
+ Offset -= Step;
+ GenerateOffset(G, Offset);
+ }
+ }
+ }
+ }
+ for (int64_t Offset : Worklist)
+ GenerateOffset(G, Offset);
+
+ int64_t Imm = ExtractImmediate(G, SE);
+ if (G->isZero() || Imm == 0)
+ return;
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
if (IsScaledReg) {
- F.ScaledReg = G;
+ F.ScaledReg = G;
} else {
- F.BaseRegs[Idx] = G;
+ F.BaseRegs[Idx] = G;
// We may generate a non-canonical Formula if G is a recurrent expr reg
// related to the current loop while F.ScaledReg is not.
F.canonicalize(*L);
}
- (void)InsertFormula(LU, LUIdx, F);
-}
-
-/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
-void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- // TODO: For now, just add the min and max offset, because it usually isn't
-  // worthwhile looking at everything in between.
- SmallVector<int64_t, 2> Worklist;
- Worklist.push_back(LU.MinOffset);
- if (LU.MaxOffset != LU.MinOffset)
- Worklist.push_back(LU.MaxOffset);
-
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
- if (Base.Scale == 1)
- GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
- /* IsScaledReg */ true);
-}
-
-/// For ICmpZero, check to see if we can scale up the comparison. For example, x
-/// == y -> x*c == y*c.
-void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- if (LU.Kind != LSRUse::ICmpZero) return;
-
- // Determine the integer type for the base formula.
- Type *IntTy = Base.getType();
- if (!IntTy) return;
- if (SE.getTypeSizeInBits(IntTy) > 64) return;
-
- // Don't do this if there is more than one offset.
- if (LU.MinOffset != LU.MaxOffset) return;
-
-  // Check if the transformation is valid. It is illegal to multiply a pointer.
- if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
- return;
- for (const SCEV *BaseReg : Base.BaseRegs)
- if (BaseReg->getType()->isPointerTy())
- return;
- assert(!Base.BaseGV && "ICmpZero use is not legal!");
-
- // Check each interesting stride.
- for (int64_t Factor : Factors) {
- // Check that the multiplication doesn't overflow.
- if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
- continue;
- int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
- if (NewBaseOffset / Factor != Base.BaseOffset)
- continue;
- // If the offset will be truncated at this use, check that it is in bounds.
- if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
- continue;
-
- // Check that multiplying with the use offset doesn't overflow.
- int64_t Offset = LU.MinOffset;
- if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
- continue;
- Offset = (uint64_t)Offset * Factor;
- if (Offset / Factor != LU.MinOffset)
- continue;
- // If the offset will be truncated at this use, check that it is in bounds.
- if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, Offset))
- continue;
-
- Formula F = Base;
- F.BaseOffset = NewBaseOffset;
-
- // Check that this scale is legal.
- if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
- continue;
-
- // Compensate for the use having MinOffset built into it.
- F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
-
- const SCEV *FactorS = SE.getConstant(IntTy, Factor);
-
- // Check that multiplying with each base register doesn't overflow.
- for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
- F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
- if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
- goto next;
- }
-
- // Check that multiplying with the scaled register doesn't overflow.
- if (F.ScaledReg) {
- F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
- if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
- continue;
- }
-
- // Check that multiplying with the unfolded offset doesn't overflow.
- if (F.UnfoldedOffset != 0) {
- if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
- Factor == -1)
- continue;
- F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
- if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
- continue;
- // If the offset will be truncated, check that it is in bounds.
- if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
- continue;
- }
-
- // If we make it here and it's legal, add it.
- (void)InsertFormula(LU, LUIdx, F);
- next:;
- }
-}
-
-/// Generate stride factor reuse formulae by making use of scaled-offset address
-/// modes, for example.
-void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
- // Determine the integer type for the base formula.
- Type *IntTy = Base.getType();
- if (!IntTy) return;
-
- // If this Formula already has a scaled register, we can't add another one.
- // Try to unscale the formula to generate a better scale.
- if (Base.Scale != 0 && !Base.unscale())
- return;
-
-  assert(Base.Scale == 0 && "unscale did not do its job!");
-
- // Check each interesting stride.
- for (int64_t Factor : Factors) {
- Base.Scale = Factor;
- Base.HasBaseReg = Base.BaseRegs.size() > 1;
- // Check whether this scale is going to be legal.
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
- Base)) {
-      // As a special case, handle out-of-loop Basic users specially.
- // TODO: Reconsider this special case.
- if (LU.Kind == LSRUse::Basic &&
- isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
- LU.AccessTy, Base) &&
- LU.AllFixupsOutsideLoop)
- LU.Kind = LSRUse::Special;
- else
- continue;
- }
- // For an ICmpZero, negating a solitary base register won't lead to
- // new solutions.
- if (LU.Kind == LSRUse::ICmpZero &&
- !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
- continue;
- // For each addrec base reg, if its loop is current loop, apply the scale.
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
- if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
- const SCEV *FactorS = SE.getConstant(IntTy, Factor);
- if (FactorS->isZero())
- continue;
- // Divide out the factor, ignoring high bits, since we'll be
- // scaling the value back up in the end.
- if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
- // TODO: This could be optimized to avoid all the copying.
- Formula F = Base;
- F.ScaledReg = Quotient;
- F.deleteBaseReg(F.BaseRegs[i]);
- // The canonical representation of 1*reg is reg, which is already in
- // Base. In that case, do not try to insert the formula, it will be
- // rejected anyway.
- if (F.Scale == 1 && (F.BaseRegs.empty() ||
- (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
- continue;
- // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
-          // non-canonical Formula with ScaledReg's loop not being L.
- if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
- F.canonicalize(*L);
- (void)InsertFormula(LU, LUIdx, F);
- }
- }
- }
- }
-}
-
-/// Generate reuse formulae from different IV types.
-void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
- // Don't bother truncating symbolic values.
- if (Base.BaseGV) return;
-
- // Determine the integer type for the base formula.
- Type *DstTy = Base.getType();
- if (!DstTy) return;
- DstTy = SE.getEffectiveSCEVType(DstTy);
-
- for (Type *SrcTy : Types) {
- if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
- Formula F = Base;
-
- // Sometimes SCEV is able to prove zero during ext transform. It may
- // happen if SCEV did not do all possible transforms while creating the
- // initial node (maybe due to depth limitations), but it can do them while
- // taking ext.
- if (F.ScaledReg) {
- const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
- if (NewScaledReg->isZero())
- continue;
- F.ScaledReg = NewScaledReg;
- }
- bool HasZeroBaseReg = false;
- for (const SCEV *&BaseReg : F.BaseRegs) {
- const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
- if (NewBaseReg->isZero()) {
- HasZeroBaseReg = true;
- break;
- }
- BaseReg = NewBaseReg;
- }
- if (HasZeroBaseReg)
- continue;
-
- // TODO: This assumes we've done basic processing on all uses and
- // have an idea what the register usage is.
- if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
- continue;
-
- F.canonicalize(*L);
- (void)InsertFormula(LU, LUIdx, F);
- }
- }
-}
-
-namespace {
-
-/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
-/// modifications so that the search phase doesn't have to worry about the data
-/// structures moving underneath it.
-struct WorkItem {
- size_t LUIdx;
- int64_t Imm;
- const SCEV *OrigReg;
-
- WorkItem(size_t LI, int64_t I, const SCEV *R)
- : LUIdx(LI), Imm(I), OrigReg(R) {}
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void WorkItem::print(raw_ostream &OS) const {
- OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
- << " , add offset " << Imm;
-}
-
-LLVM_DUMP_METHOD void WorkItem::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Look for registers which are a constant distance apart and try to form reuse
-/// opportunities between them.
-void LSRInstance::GenerateCrossUseConstantOffsets() {
- // Group the registers by their value without any added constant offset.
- using ImmMapTy = std::map<int64_t, const SCEV *>;
-
- DenseMap<const SCEV *, ImmMapTy> Map;
- DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
- SmallVector<const SCEV *, 8> Sequence;
- for (const SCEV *Use : RegUses) {
- const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
- int64_t Imm = ExtractImmediate(Reg, SE);
- auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
- if (Pair.second)
- Sequence.push_back(Reg);
- Pair.first->second.insert(std::make_pair(Imm, Use));
- UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
- }
-
- // Now examine each set of registers with the same base value. Build up
- // a list of work to do and do the work in a separate step so that we're
- // not adding formulae and register counts while we're searching.
- SmallVector<WorkItem, 32> WorkItems;
- SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
- for (const SCEV *Reg : Sequence) {
- const ImmMapTy &Imms = Map.find(Reg)->second;
-
- // It's not worthwhile looking for reuse if there's only one offset.
- if (Imms.size() == 1)
- continue;
-
- LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
- for (const auto &Entry
- : Imms) dbgs()
- << ' ' << Entry.first;
- dbgs() << '\n');
-
- // Examine each offset.
- for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
- J != JE; ++J) {
- const SCEV *OrigReg = J->second;
-
- int64_t JImm = J->first;
- const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
-
- if (!isa<SCEVConstant>(OrigReg) &&
- UsedByIndicesMap[Reg].count() == 1) {
- LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
- << '\n');
- continue;
- }
-
-      // Conservatively examine offsets between this orig reg and a few selected
- // other orig regs.
- int64_t First = Imms.begin()->first;
- int64_t Last = std::prev(Imms.end())->first;
- // Compute (First + Last) / 2 without overflow using the fact that
-      // First + Last = 2 * (First & Last) + (First ^ Last).
- int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
- // If the result is negative and First is odd and Last even (or vice versa),
- // we rounded towards -inf. Add 1 in that case, to round towards 0.
- Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
- ImmMapTy::const_iterator OtherImms[] = {
- Imms.begin(), std::prev(Imms.end()),
- Imms.lower_bound(Avg)};
- for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
- ImmMapTy::const_iterator M = OtherImms[i];
- if (M == J || M == JE) continue;
-
- // Compute the difference between the two.
- int64_t Imm = (uint64_t)JImm - M->first;
- for (unsigned LUIdx : UsedByIndices.set_bits())
- // Make a memo of this use, offset, and register tuple.
- if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
- WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
- }
- }
- }
-
- Map.clear();
- Sequence.clear();
- UsedByIndicesMap.clear();
- UniqueItems.clear();
-
- // Now iterate through the worklist and add new formulae.
- for (const WorkItem &WI : WorkItems) {
- size_t LUIdx = WI.LUIdx;
- LSRUse &LU = Uses[LUIdx];
- int64_t Imm = WI.Imm;
- const SCEV *OrigReg = WI.OrigReg;
-
- Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
- const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
- unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
-
- // TODO: Use a more targeted data structure.
- for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
- Formula F = LU.Formulae[L];
- // FIXME: The code for the scaled and unscaled registers looks
- // very similar but slightly different. Investigate if they
- // could be merged. That way, we would not have to unscale the
- // Formula.
- F.unscale();
- // Use the immediate in the scaled register.
- if (F.ScaledReg == OrigReg) {
- int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
- // Don't create 50 + reg(-50).
- if (F.referencesReg(SE.getSCEV(
- ConstantInt::get(IntTy, -(uint64_t)Offset))))
- continue;
- Formula NewF = F;
- NewF.BaseOffset = Offset;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
- NewF))
- continue;
- NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
-
- // If the new scale is a constant in a register, and adding the constant
- // value to the immediate would produce a value closer to zero than the
- // immediate itself, then the formula isn't worthwhile.
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
- if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
- (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
- .ule(std::abs(NewF.BaseOffset)))
- continue;
-
- // OK, looks good.
- NewF.canonicalize(*this->L);
- (void)InsertFormula(LU, LUIdx, NewF);
- } else {
- // Use the immediate in a base register.
- for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
- const SCEV *BaseReg = F.BaseRegs[N];
- if (BaseReg != OrigReg)
- continue;
- Formula NewF = F;
- NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, NewF)) {
- if (TTI.shouldFavorPostInc() &&
- mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
- continue;
- if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
- continue;
- NewF = F;
- NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
- }
- NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
-
- // If the new formula has a constant in a register, and adding the
- // constant value to the immediate would produce a value closer to
- // zero than the immediate itself, then the formula isn't worthwhile.
- for (const SCEV *NewReg : NewF.BaseRegs)
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
- if ((C->getAPInt() + NewF.BaseOffset)
- .abs()
- .slt(std::abs(NewF.BaseOffset)) &&
- (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
- countTrailingZeros<uint64_t>(NewF.BaseOffset))
- goto skip_formula;
-
- // Ok, looks good.
- NewF.canonicalize(*this->L);
- (void)InsertFormula(LU, LUIdx, NewF);
- break;
- skip_formula:;
- }
- }
- }
- }
-}
-
-/// Generate formulae for each use.
-void
-LSRInstance::GenerateAllReuseFormulae() {
- // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
- // queries are more precise.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
- }
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateScales(LU, LUIdx, LU.Formulae[i]);
- }
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
- }
-
- GenerateCrossUseConstantOffsets();
-
- LLVM_DEBUG(dbgs() << "\n"
- "After generating reuse formulae:\n";
- print_uses(dbgs()));
-}
-
-/// If there are multiple formulae with the same set of registers used
-/// by other uses, pick the best one and delete the others.
-void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
- DenseSet<const SCEV *> VisitedRegs;
- SmallPtrSet<const SCEV *, 16> Regs;
- SmallPtrSet<const SCEV *, 16> LoserRegs;
-#ifndef NDEBUG
- bool ChangedFormulae = false;
-#endif
-
- // Collect the best formula for each unique set of shared registers. This
- // is reset for each use.
- using BestFormulaeTy =
- DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
-
- BestFormulaeTy BestFormulae;
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
- dbgs() << '\n');
-
- bool Any = false;
- for (size_t FIdx = 0, NumForms = LU.Formulae.size();
- FIdx != NumForms; ++FIdx) {
- Formula &F = LU.Formulae[FIdx];
-
- // Some formulas are instant losers. For example, they may depend on
- // nonexistent AddRecs from other loops. These need to be filtered
- // immediately, otherwise heuristics could choose them over others leading
- // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
- // avoids the need to recompute this information across formulae using the
- // same bad AddRec. Passing LoserRegs is also essential unless we remove
- // the corresponding bad register from the Regs set.
- Cost CostF(L, SE, TTI);
- Regs.clear();
- CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
- if (CostF.isLoser()) {
- // During initial formula generation, undesirable formulae are generated
- // by uses within other loops that have some non-trivial address mode or
- // use the postinc form of the IV. LSR needs to provide these formulae
- // as the basis of rediscovering the desired formula that uses an AddRec
- // corresponding to the existing phi. Once all formulae have been
- // generated, these initial losers may be pruned.
- LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
- dbgs() << "\n");
- }
- else {
- SmallVector<const SCEV *, 4> Key;
- for (const SCEV *Reg : F.BaseRegs) {
- if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
- Key.push_back(Reg);
- }
- if (F.ScaledReg &&
- RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
- Key.push_back(F.ScaledReg);
- // Unstable sort by host order ok, because this is only used for
- // uniquifying.
- llvm::sort(Key);
-
- std::pair<BestFormulaeTy::const_iterator, bool> P =
- BestFormulae.insert(std::make_pair(Key, FIdx));
- if (P.second)
- continue;
-
- Formula &Best = LU.Formulae[P.first->second];
-
- Cost CostBest(L, SE, TTI);
- Regs.clear();
- CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
- if (CostF.isLess(CostBest))
- std::swap(F, Best);
- LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula ";
- Best.print(dbgs()); dbgs() << '\n');
- }
-#ifndef NDEBUG
- ChangedFormulae = true;
-#endif
- LU.DeleteFormula(F);
- --FIdx;
- --NumForms;
- Any = true;
- }
-
- // Now that we've filtered out some formulae, recompute the Regs set.
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
-
- // Reset this to prepare for the next use.
- BestFormulae.clear();
- }
-
- LLVM_DEBUG(if (ChangedFormulae) {
- dbgs() << "\n"
- "After filtering out undesirable candidates:\n";
- print_uses(dbgs());
- });
-}
-
-/// Estimate the worst-case number of solutions the solver might have to
-/// consider. It almost never considers this many solutions because it prunes the
-/// search space, but the pruning isn't always sufficient.
-size_t LSRInstance::EstimateSearchSpaceComplexity() const {
- size_t Power = 1;
- for (const LSRUse &LU : Uses) {
- size_t FSize = LU.Formulae.size();
- if (FSize >= ComplexityLimit) {
- Power = ComplexityLimit;
- break;
- }
- Power *= FSize;
- if (Power >= ComplexityLimit)
- break;
- }
- return Power;
-}
-
-/// When one formula uses a superset of the registers of another formula, it
-/// won't help reduce register pressure (though it may not necessarily hurt
-/// register pressure); remove it to simplify the system.
-void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
- if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
- LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
- "which use a superset of registers used by other "
- "formulae.\n");
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- bool Any = false;
- for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
- Formula &F = LU.Formulae[i];
- // Look for a formula with a constant or GV in a register. If the use
- // also has a formula with that same value in an immediate field,
- // delete the one that uses a register.
- for (SmallVectorImpl<const SCEV *>::const_iterator
- I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
- Formula NewF = F;
- //FIXME: Formulas should store bitwidth to do wrapping properly.
- // See PR41034.
- NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
- NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
- (I - F.BaseRegs.begin()));
- if (LU.HasFormulaWithSameRegs(NewF)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
- LU.DeleteFormula(F);
- --i;
- --e;
- Any = true;
- break;
- }
- } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
- if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
- if (!F.BaseGV) {
- Formula NewF = F;
- NewF.BaseGV = GV;
- NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
- (I - F.BaseRegs.begin()));
- if (LU.HasFormulaWithSameRegs(NewF)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
- LU.DeleteFormula(F);
- --i;
- --e;
- Any = true;
- break;
- }
- }
- }
- }
- }
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
- }
-}
-
-/// When there are many registers for expressions like A, A+1, A+2, etc.,
-/// allocate a single register for them.
-void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-
- LLVM_DEBUG(
- dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by assuming that uses separated "
- "by a constant offset will use the same registers.\n");
-
- // This is especially useful for unrolled loops.
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (const Formula &F : LU.Formulae) {
- if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
- continue;
-
- LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
- if (!LUThatHas)
- continue;
-
- if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
- LU.Kind, LU.AccessTy))
- continue;
-
- LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
-
- LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
-
- // Transfer the fixups of LU to LUThatHas.
- for (LSRFixup &Fixup : LU.Fixups) {
- Fixup.Offset += F.BaseOffset;
- LUThatHas->pushFixup(Fixup);
- LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
- }
-
- // Delete formulae from the new use which are no longer legal.
- bool Any = false;
- for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
- Formula &F = LUThatHas->Formulae[i];
- if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
- LUThatHas->Kind, LUThatHas->AccessTy, F)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
- LUThatHas->DeleteFormula(F);
- --i;
- --e;
- Any = true;
- }
- }
-
- if (Any)
- LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
-
- // Delete the old use.
- DeleteUse(LU, LUIdx);
- --LUIdx;
- --NumUses;
- break;
- }
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
-}
-
-/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
-/// we've done more filtering, as it may be able to find more formulae to
-/// eliminate.
-void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
- if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
- LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
- "undesirable dedicated registers.\n");
-
- FilterOutUndesirableDedicatedRegisters();
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
- }
-}
-
-/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
-/// pick the best one and delete the others.
-/// This narrowing heuristic keeps as many formulae with different
-/// Scale and ScaledReg pairs as possible while narrowing the search space.
-/// The benefit is that a better solution is more likely to be found in a
-/// formula set with more Scale and ScaledReg variations than in one where
-/// every formula has the same Scale and ScaledReg. The winner-reg-picking
-/// heuristic tends to keep the formulae with the same Scale and ScaledReg
-/// and filter out the others, and we want to avoid that if possible.
-void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-
- LLVM_DEBUG(
- dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by choosing the best Formula "
- "from the Formulae with the same Scale and ScaledReg.\n");
-
- // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
- using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
-
- BestFormulaeTy BestFormulae;
-#ifndef NDEBUG
- bool ChangedFormulae = false;
-#endif
- DenseSet<const SCEV *> VisitedRegs;
- SmallPtrSet<const SCEV *, 16> Regs;
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
- dbgs() << '\n');
-
- // Return true if Formula FA is better than Formula FB.
- auto IsBetterThan = [&](Formula &FA, Formula &FB) {
- // First we will try to choose the Formula with fewer new registers.
- // For a register used by current Formula, the more the register is
- // shared among LSRUses, the less we increase the register number
- // counter of the formula.
- size_t FARegNum = 0;
- for (const SCEV *Reg : FA.BaseRegs) {
- const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
- FARegNum += (NumUses - UsedByIndices.count() + 1);
- }
- size_t FBRegNum = 0;
- for (const SCEV *Reg : FB.BaseRegs) {
- const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
- FBRegNum += (NumUses - UsedByIndices.count() + 1);
- }
- if (FARegNum != FBRegNum)
- return FARegNum < FBRegNum;
-
- // If the new register numbers are the same, choose the Formula with
- // less Cost.
- Cost CostFA(L, SE, TTI);
- Cost CostFB(L, SE, TTI);
- Regs.clear();
- CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
- Regs.clear();
- CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
- return CostFA.isLess(CostFB);
- };
-
- bool Any = false;
- for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
- ++FIdx) {
- Formula &F = LU.Formulae[FIdx];
- if (!F.ScaledReg)
- continue;
- auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
- if (P.second)
- continue;
-
- Formula &Best = LU.Formulae[P.first->second];
- if (IsBetterThan(F, Best))
- std::swap(F, Best);
- LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula ";
- Best.print(dbgs()); dbgs() << '\n');
-#ifndef NDEBUG
- ChangedFormulae = true;
-#endif
- LU.DeleteFormula(F);
- --FIdx;
- --NumForms;
- Any = true;
- }
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
-
- // Reset this to prepare for the next use.
- BestFormulae.clear();
- }
-
- LLVM_DEBUG(if (ChangedFormulae) {
- dbgs() << "\n"
- "After filtering out undesirable candidates:\n";
- print_uses(dbgs());
- });
-}
-
-/// If we are over the complexity limit, filter out any post-inc preferring
-/// variables to only post-inc values.
-void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
- if (!TTI.shouldFavorPostInc())
- return;
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by choosing the lowest "
- "register Formula for PostInc Uses.\n");
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
-
- if (LU.Kind != LSRUse::Address)
- continue;
- if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
- !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
- continue;
-
- size_t MinRegs = std::numeric_limits<size_t>::max();
- for (const Formula &F : LU.Formulae)
- MinRegs = std::min(F.getNumRegs(), MinRegs);
-
- bool Any = false;
- for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
- ++FIdx) {
- Formula &F = LU.Formulae[FIdx];
- if (F.getNumRegs() > MinRegs) {
- LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n");
- LU.DeleteFormula(F);
- --FIdx;
- --NumForms;
- Any = true;
- }
- }
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
-
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- break;
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
-}
-
-/// This function deletes formulas with a high expected number of registers.
-/// Assuming we don't know the value of each formula (the inefficient ones
-/// have already been deleted), compute the probability of not selecting each
-/// register.
-/// For example,
-/// Use1:
-/// reg(a) + reg({0,+,1})
-/// reg(a) + reg({-1,+,1}) + 1
-/// reg({a,+,1})
-/// Use2:
-/// reg(b) + reg({0,+,1})
-/// reg(b) + reg({-1,+,1}) + 1
-/// reg({b,+,1})
-/// Use3:
-/// reg(c) + reg(b) + reg({0,+,1})
-/// reg(c) + reg({b,+,1})
-///
-/// Probability of not selecting
-/// Use1 Use2 Use3
-/// reg(a) (1/3) * 1 * 1
-/// reg(b) 1 * (1/3) * (1/2)
-/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
-/// reg({-1,+,1}) (2/3) * (2/3) * 1
-/// reg({a,+,1}) (2/3) * 1 * 1
-/// reg({b,+,1}) 1 * (2/3) * (2/3)
-/// reg(c) 1 * 1 * 0
-///
-/// Now compute the expected number of registers for each formula.
-/// Note that for each use we exclude the probability of not selecting for that
-/// use. For example, for Use1 the probability for reg(a) would be just 1 * 1
-/// (excluding the probability 1/3 of not selecting reg(a) for Use1).
-/// Use1:
-/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
-/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
-/// reg({a,+,1}) 1
-/// Use2:
-/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
-/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
-/// reg({b,+,1}) 2/3
-/// Use3:
-/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
-/// reg(c) + reg({b,+,1}) 1 + 2/3
-void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
+void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // TODO: For now, just add the min and max offset, because it usually isn't
+  // worthwhile looking at everything in between.
+ SmallVector<int64_t, 2> Worklist;
+ Worklist.push_back(LU.MinOffset);
+ if (LU.MaxOffset != LU.MinOffset)
+ Worklist.push_back(LU.MaxOffset);
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
+ if (Base.Scale == 1)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
+
+/// For ICmpZero, check to see if we can scale up the comparison. For example, x
+/// == y -> x*c == y*c.
+void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ if (LU.Kind != LSRUse::ICmpZero) return;
+
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+ if (SE.getTypeSizeInBits(IntTy) > 64) return;
+
+ // Don't do this if there is more than one offset.
+ if (LU.MinOffset != LU.MaxOffset) return;
+
+  // Check if transformation is valid. It is illegal to multiply a pointer.
+ if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
+ return;
+ for (const SCEV *BaseReg : Base.BaseRegs)
+ if (BaseReg->getType()->isPointerTy())
+ return;
+ assert(!Base.BaseGV && "ICmpZero use is not legal!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ // Check that the multiplication doesn't overflow.
+ if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ continue;
+ int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+ if (NewBaseOffset / Factor != Base.BaseOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+ continue;
+
+ // Check that multiplying with the use offset doesn't overflow.
+ int64_t Offset = LU.MinOffset;
+ if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ continue;
+ Offset = (uint64_t)Offset * Factor;
+ if (Offset / Factor != LU.MinOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, Offset))
+ continue;
+
+ Formula F = Base;
+ F.BaseOffset = NewBaseOffset;
+
+ // Check that this scale is legal.
+ if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
+ continue;
+
+ // Compensate for the use having MinOffset built into it.
+ F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+
+ // Check that multiplying with each base register doesn't overflow.
+ for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
+ F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
+ if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+ goto next;
+ }
+
+ // Check that multiplying with the scaled register doesn't overflow.
+ if (F.ScaledReg) {
+ F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
+ if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+ continue;
+ }
+
+ // Check that multiplying with the unfolded offset doesn't overflow.
+ if (F.UnfoldedOffset != 0) {
+ if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
+ Factor == -1)
+ continue;
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+ if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+ continue;
+ // If the offset will be truncated, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+ continue;
+ }
+
+ // If we make it here and it's legal, add it.
+ (void)InsertFormula(LU, LUIdx, F);
+ next:;
+ }
+}
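
The scaling loop above relies on one idiom throughout: multiply in unsigned 64-bit
arithmetic, then divide back and compare to detect signed overflow. A minimal
standalone sketch of that check (not LLVM code; mulWouldOverflow is a hypothetical
helper, the pass inlines the pattern instead):

#include <cstdint>
#include <limits>

// Hypothetical helper illustrating the overflow check used in
// GenerateICmpZeroScales: wrap-around multiply, then a divide-back round trip.
static bool mulWouldOverflow(int64_t Val, int64_t Factor) {
  if (Factor == 0)
    return false; // a zero factor cannot overflow (and must not be divided by)
  if (Val == std::numeric_limits<int64_t>::min() && Factor == -1)
    return true;  // INT64_MIN * -1 is not representable in int64_t
  int64_t Product = (uint64_t)Val * Factor; // wraps instead of invoking UB
  return Product / Factor != Val;           // round trip fails iff it wrapped
}
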
+
+/// Generate stride factor reuse formulae by making use of scaled-offset address
+/// modes, for example.
+void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+
+ // If this Formula already has a scaled register, we can't add another one.
+ // Try to unscale the formula to generate a better scale.
+ if (Base.Scale != 0 && !Base.unscale())
+ return;
+
+  assert(Base.Scale == 0 && "unscale did not do its job!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ Base.Scale = Factor;
+ Base.HasBaseReg = Base.BaseRegs.size() > 1;
+ // Check whether this scale is going to be legal.
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ Base)) {
+      // As a special case, handle out-of-loop Basic users specially.
+ // TODO: Reconsider this special case.
+ if (LU.Kind == LSRUse::Basic &&
+ isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+ LU.AccessTy, Base) &&
+ LU.AllFixupsOutsideLoop)
+ LU.Kind = LSRUse::Special;
+ else
+ continue;
+ }
+ // For an ICmpZero, negating a solitary base register won't lead to
+ // new solutions.
+ if (LU.Kind == LSRUse::ICmpZero &&
+ !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+ continue;
+ // For each addrec base reg, if its loop is current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+ if (FactorS->isZero())
+ continue;
+ // Divide out the factor, ignoring high bits, since we'll be
+ // scaling the value back up in the end.
+ if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
+ // TODO: This could be optimized to avoid all the copying.
+ Formula F = Base;
+ F.ScaledReg = Quotient;
+ F.deleteBaseReg(F.BaseRegs[i]);
+ // The canonical representation of 1*reg is reg, which is already in
+ // Base. In that case, do not try to insert the formula, it will be
+ // rejected anyway.
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
+ continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
+          // non-canonical Formula with ScaledReg's loop not being L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+ }
+ }
+}
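
GenerateScales divides each addrec base register by the candidate factor and only
keeps the result when the division succeeds (getExactSDiv returns null otherwise).
A standalone sketch of the exact-division idea on plain integers (not LLVM's
getExactSDiv, which operates on SCEV expressions and can optionally ignore high
bits; requires C++17 for std::optional):

#include <cstdint>
#include <optional>

// Only accept the division if it is exact; otherwise report failure, the way
// getExactSDiv returns null when the factor cannot be divided out cleanly.
static std::optional<int64_t> exactSDiv(int64_t Num, int64_t Den) {
  if (Den == 0 || (Num == INT64_MIN && Den == -1))
    return std::nullopt; // undefined or overflowing division
  if (Num % Den != 0)
    return std::nullopt; // not exact: e.g. 7 / 2 is rejected
  return Num / Den;      // exactSDiv(12, 4) == 3, and 3 * 4 == 12 again
}
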
+
+/// Generate reuse formulae from different IV types.
+void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Don't bother truncating symbolic values.
+ if (Base.BaseGV) return;
+
+ // Determine the integer type for the base formula.
+ Type *DstTy = Base.getType();
+ if (!DstTy) return;
+ DstTy = SE.getEffectiveSCEVType(DstTy);
+
+ for (Type *SrcTy : Types) {
+ if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
+ Formula F = Base;
+
+ // Sometimes SCEV is able to prove zero during ext transform. It may
+ // happen if SCEV did not do all possible transforms while creating the
+ // initial node (maybe due to depth limitations), but it can do them while
+ // taking ext.
+ if (F.ScaledReg) {
+ const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+ if (NewScaledReg->isZero())
+ continue;
+ F.ScaledReg = NewScaledReg;
+ }
+ bool HasZeroBaseReg = false;
+ for (const SCEV *&BaseReg : F.BaseRegs) {
+ const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+ if (NewBaseReg->isZero()) {
+ HasZeroBaseReg = true;
+ break;
+ }
+ BaseReg = NewBaseReg;
+ }
+ if (HasZeroBaseReg)
+ continue;
+
+ // TODO: This assumes we've done basic processing on all uses and
+ // have an idea what the register usage is.
+ if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
+ continue;
+
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+}
+
+namespace {
+
+/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
+/// modifications so that the search phase doesn't have to worry about the data
+/// structures moving underneath it.
+struct WorkItem {
+ size_t LUIdx;
+ int64_t Imm;
+ const SCEV *OrigReg;
+
+ WorkItem(size_t LI, int64_t I, const SCEV *R)
+ : LUIdx(LI), Imm(I), OrigReg(R) {}
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void WorkItem::print(raw_ostream &OS) const {
+ OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
+ << " , add offset " << Imm;
+}
+
+LLVM_DUMP_METHOD void WorkItem::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Look for registers which are a constant distance apart and try to form reuse
+/// opportunities between them.
+void LSRInstance::GenerateCrossUseConstantOffsets() {
+ // Group the registers by their value without any added constant offset.
+ using ImmMapTy = std::map<int64_t, const SCEV *>;
+
+ DenseMap<const SCEV *, ImmMapTy> Map;
+ DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
+ SmallVector<const SCEV *, 8> Sequence;
+ for (const SCEV *Use : RegUses) {
+ const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
+ int64_t Imm = ExtractImmediate(Reg, SE);
+ auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
+ if (Pair.second)
+ Sequence.push_back(Reg);
+ Pair.first->second.insert(std::make_pair(Imm, Use));
+ UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
+ }
+
+ // Now examine each set of registers with the same base value. Build up
+ // a list of work to do and do the work in a separate step so that we're
+ // not adding formulae and register counts while we're searching.
+ SmallVector<WorkItem, 32> WorkItems;
+ SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+ for (const SCEV *Reg : Sequence) {
+ const ImmMapTy &Imms = Map.find(Reg)->second;
+
+ // It's not worthwhile looking for reuse if there's only one offset.
+ if (Imms.size() == 1)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+ for (const auto &Entry
+ : Imms) dbgs()
+ << ' ' << Entry.first;
+ dbgs() << '\n');
+
+ // Examine each offset.
+ for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+ J != JE; ++J) {
+ const SCEV *OrigReg = J->second;
+
+ int64_t JImm = J->first;
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
+
+ if (!isa<SCEVConstant>(OrigReg) &&
+ UsedByIndicesMap[Reg].count() == 1) {
+ LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+ << '\n');
+ continue;
+ }
+
+      // Conservatively examine offsets between this orig reg and a few selected
+ // other orig regs.
+ int64_t First = Imms.begin()->first;
+ int64_t Last = std::prev(Imms.end())->first;
+ // Compute (First + Last) / 2 without overflow using the fact that
+      // First + Last = 2 * (First & Last) + (First ^ Last).
+ int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
+ // If the result is negative and First is odd and Last even (or vice versa),
+ // we rounded towards -inf. Add 1 in that case, to round towards 0.
+ Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
+ ImmMapTy::const_iterator OtherImms[] = {
+ Imms.begin(), std::prev(Imms.end()),
+ Imms.lower_bound(Avg)};
+ for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
+ ImmMapTy::const_iterator M = OtherImms[i];
+ if (M == J || M == JE) continue;
+
+ // Compute the difference between the two.
+ int64_t Imm = (uint64_t)JImm - M->first;
+ for (unsigned LUIdx : UsedByIndices.set_bits())
+ // Make a memo of this use, offset, and register tuple.
+ if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
+ WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
+ }
+ }
+ }
+
+ Map.clear();
+ Sequence.clear();
+ UsedByIndicesMap.clear();
+ UniqueItems.clear();
+
+ // Now iterate through the worklist and add new formulae.
+ for (const WorkItem &WI : WorkItems) {
+ size_t LUIdx = WI.LUIdx;
+ LSRUse &LU = Uses[LUIdx];
+ int64_t Imm = WI.Imm;
+ const SCEV *OrigReg = WI.OrigReg;
+
+ Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+ const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+ unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
+
+ // TODO: Use a more targeted data structure.
+ for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+ Formula F = LU.Formulae[L];
+ // FIXME: The code for the scaled and unscaled registers looks
+ // very similar but slightly different. Investigate if they
+ // could be merged. That way, we would not have to unscale the
+ // Formula.
+ F.unscale();
+ // Use the immediate in the scaled register.
+ if (F.ScaledReg == OrigReg) {
+ int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+ // Don't create 50 + reg(-50).
+ if (F.referencesReg(SE.getSCEV(
+ ConstantInt::get(IntTy, -(uint64_t)Offset))))
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = Offset;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ NewF))
+ continue;
+ NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
+
+ // If the new scale is a constant in a register, and adding the constant
+ // value to the immediate would produce a value closer to zero than the
+ // immediate itself, then the formula isn't worthwhile.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+ if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+ (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
+ .ule(std::abs(NewF.BaseOffset)))
+ continue;
+
+ // OK, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ } else {
+ // Use the immediate in a base register.
+ for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+ const SCEV *BaseReg = F.BaseRegs[N];
+ if (BaseReg != OrigReg)
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+ LU.Kind, LU.AccessTy, NewF)) {
+ if (TTI.shouldFavorPostInc() &&
+ mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
+ continue;
+ if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+ continue;
+ NewF = F;
+ NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+ }
+ NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
+
+ // If the new formula has a constant in a register, and adding the
+ // constant value to the immediate would produce a value closer to
+ // zero than the immediate itself, then the formula isn't worthwhile.
+ for (const SCEV *NewReg : NewF.BaseRegs)
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
+ if ((C->getAPInt() + NewF.BaseOffset)
+ .abs()
+ .slt(std::abs(NewF.BaseOffset)) &&
+ (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
+ countTrailingZeros<uint64_t>(NewF.BaseOffset))
+ goto skip_formula;
+
+ // Ok, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ break;
+ skip_formula:;
+ }
+ }
+ }
+ }
+}
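
The offset-averaging step above computes (First + Last) / 2 without risking
int64_t overflow and then adjusts the rounding direction toward zero. A
standalone demonstration of the identity (not LLVM code; relies on arithmetic
right shift of negative values, as the code above does):

#include <cassert>
#include <cstdint>

static int64_t midpointTowardZero(int64_t First, int64_t Last) {
  // First + Last == 2 * (First & Last) + (First ^ Last), so this never
  // overflows even when the plain sum would; the shift rounds toward -inf.
  int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
  // If the average is negative and First and Last differ in parity,
  // add 1 so that we round toward zero instead of toward -inf.
  Avg += (First ^ Last) & ((uint64_t)Avg >> 63);
  return Avg;
}

int main() {
  assert(midpointTowardZero(3, 8) == 5);    //  5.5 rounds to  5
  assert(midpointTowardZero(-8, -3) == -5); // -5.5 rounds to -5, not -6
  assert(midpointTowardZero(INT64_MAX - 2, INT64_MAX) == INT64_MAX - 1);
  return 0;
}
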
+
+/// Generate formulae for each use.
+void
+LSRInstance::GenerateAllReuseFormulae() {
+ // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
+ // queries are more precise.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateScales(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
+ }
+
+ GenerateCrossUseConstantOffsets();
+
+ LLVM_DEBUG(dbgs() << "\n"
+ "After generating reuse formulae:\n";
+ print_uses(dbgs()));
+}
+
+/// If there are multiple formulae with the same set of registers used
+/// by other uses, pick the best one and delete the others.
+void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+ SmallPtrSet<const SCEV *, 16> LoserRegs;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+
+ // Collect the best formula for each unique set of shared registers. This
+ // is reset for each use.
+ using BestFormulaeTy =
+ DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
+
+ BestFormulaeTy BestFormulae;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+ FIdx != NumForms; ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+
+ // Some formulas are instant losers. For example, they may depend on
+ // nonexistent AddRecs from other loops. These need to be filtered
+ // immediately, otherwise heuristics could choose them over others leading
+ // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+ // avoids the need to recompute this information across formulae using the
+ // same bad AddRec. Passing LoserRegs is also essential unless we remove
+ // the corresponding bad register from the Regs set.
+ Cost CostF(L, SE, TTI);
+ Regs.clear();
+ CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
+ if (CostF.isLoser()) {
+ // During initial formula generation, undesirable formulae are generated
+ // by uses within other loops that have some non-trivial address mode or
+ // use the postinc form of the IV. LSR needs to provide these formulae
+ // as the basis of rediscovering the desired formula that uses an AddRec
+ // corresponding to the existing phi. Once all formulae have been
+ // generated, these initial losers may be pruned.
+ LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
+ }
+ else {
+ SmallVector<const SCEV *, 4> Key;
+ for (const SCEV *Reg : F.BaseRegs) {
+ if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+ Key.push_back(Reg);
+ }
+ if (F.ScaledReg &&
+ RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+ Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for
+ // uniquifying.
+ llvm::sort(Key);
+
+ std::pair<BestFormulaeTy::const_iterator, bool> P =
+ BestFormulae.insert(std::make_pair(Key, FIdx));
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+
+ Cost CostBest(L, SE, TTI);
+ Regs.clear();
+ CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+ if (CostF.isLess(CostBest))
+ std::swap(F, Best);
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+ }
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ LLVM_DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
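
The filtering above keys each formula by the sorted list of registers that other
uses also need and keeps only the best-rated formula per key. A standalone sketch
of that uniquifying pattern with ordinary containers (not LLVM code; SimpleFormula
and its scalar Cost are stand-ins for Formula and the LSR cost model):

#include <algorithm>
#include <map>
#include <string>
#include <vector>

struct SimpleFormula {
  std::vector<std::string> SharedRegs; // registers that other uses need too
  unsigned Cost;                       // stand-in for the LSR cost model
};

// Keep one formula per unique set of shared registers: the cheapest one.
static void filterByKey(std::vector<SimpleFormula> &Formulae) {
  std::map<std::vector<std::string>, size_t> BestByKey;
  std::vector<SimpleFormula> Kept;
  for (const SimpleFormula &F : Formulae) {
    std::vector<std::string> Key = F.SharedRegs;
    std::sort(Key.begin(), Key.end()); // order is irrelevant, only membership
    auto It = BestByKey.find(Key);
    if (It == BestByKey.end()) {
      BestByKey.emplace(std::move(Key), Kept.size());
      Kept.push_back(F);
    } else if (F.Cost < Kept[It->second].Cost) {
      Kept[It->second] = F;            // a cheaper formula with the same key
    }
  }
  Formulae = Kept;
}
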
+
+/// Estimate the worst-case number of solutions the solver might have to
+/// consider. It almost never considers this many solutions because it prunes the
+/// search space, but the pruning isn't always sufficient.
+size_t LSRInstance::EstimateSearchSpaceComplexity() const {
+ size_t Power = 1;
+ for (const LSRUse &LU : Uses) {
+ size_t FSize = LU.Formulae.size();
+ if (FSize >= ComplexityLimit) {
+ Power = ComplexityLimit;
+ break;
+ }
+ Power *= FSize;
+ if (Power >= ComplexityLimit)
+ break;
+ }
+ return Power;
+}
+
+/// When one formula uses a superset of the registers of another formula, it
+/// won't help reduce register pressure (though it may not necessarily hurt
+/// register pressure); remove it to simplify the system.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+ "which use a superset of registers used by other "
+ "formulae.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ // Look for a formula with a constant or GV in a register. If the use
+ // also has a formula with that same value in an immediate field,
+ // delete the one that uses a register.
+ for (SmallVectorImpl<const SCEV *>::const_iterator
+ I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+ Formula NewF = F;
+ //FIXME: Formulas should store bitwidth to do wrapping properly.
+ // See PR41034.
+ NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+ if (!F.BaseGV) {
+ Formula NewF = F;
+ NewF.BaseGV = GV;
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ }
+}
+
+/// When there are many registers for expressions like A, A+1, A+2, etc.,
+/// allocate a single register for them.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-  // Ok, we have too many formulae on our hands to conveniently handle.
- // Use a rough heuristic to thin out the list.
-
-  // Set of Regs which will be 100% used in the final solution.
-  // Used in each formula of a solution (in the example above this is reg(c)).
- // We can skip them in calculations.
- SmallPtrSet<const SCEV *, 4> UniqRegs;
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
-  // Map each register to its probability of not being selected.
- DenseMap <const SCEV *, float> RegNumMap;
- for (const SCEV *Reg : RegUses) {
- if (UniqRegs.count(Reg))
- continue;
- float PNotSel = 1;
- for (const LSRUse &LU : Uses) {
- if (!LU.Regs.count(Reg))
- continue;
- float P = LU.getNotSelectedProbability(Reg);
- if (P != 0.0)
- PNotSel *= P;
- else
- UniqRegs.insert(Reg);
- }
- RegNumMap.insert(std::make_pair(Reg, PNotSel));
- }
-
- LLVM_DEBUG(
- dbgs() << "Narrowing the search space by deleting costly formulas\n");
-
-  // Delete formulas whose expected number of registers is high.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- // If nothing to delete - continue.
- if (LU.Formulae.size() < 2)
- continue;
-    // This is a temporary solution to test performance. Float should be
-    // replaced with a rounding-independent type (based on integers) to avoid
- // different results for different target builds.
- float FMinRegNum = LU.Formulae[0].getNumRegs();
- float FMinARegNum = LU.Formulae[0].getNumRegs();
- size_t MinIdx = 0;
- for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
- Formula &F = LU.Formulae[i];
- float FRegNum = 0;
- float FARegNum = 0;
- for (const SCEV *BaseReg : F.BaseRegs) {
- if (UniqRegs.count(BaseReg))
- continue;
- FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
- if (isa<SCEVAddRecExpr>(BaseReg))
- FARegNum +=
- RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
- }
- if (const SCEV *ScaledReg = F.ScaledReg) {
- if (!UniqRegs.count(ScaledReg)) {
- FRegNum +=
- RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
- if (isa<SCEVAddRecExpr>(ScaledReg))
- FARegNum +=
- RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
- }
- }
- if (FMinRegNum > FRegNum ||
- (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
- FMinRegNum = FRegNum;
- FMinARegNum = FARegNum;
- MinIdx = i;
- }
- }
- LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
- dbgs() << " with min reg num " << FMinRegNum << '\n');
- if (MinIdx != 0)
- std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
- while (LU.Formulae.size() != 1) {
- LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
- dbgs() << '\n');
- LU.Formulae.pop_back();
- }
- LU.RecomputeRegs(LUIdx, RegUses);
- assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
- Formula &F = LU.Formulae[0];
- LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
- // When we choose the formula, the regs become unique.
- UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
- if (F.ScaledReg)
- UniqRegs.insert(F.ScaledReg);
- }
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
-}
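
The Use1/Use2/Use3 numbers in the NarrowSearchSpaceByDeletingCostlyFormulas doc
comment can be reproduced with a few lines of arithmetic. A standalone sketch
(not LLVM code, which uses float inside the pass; the register names are the
strings from the comment) that checks the documented score of 1 + 1/3 for Use1's
first formula:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <string>
#include <vector>

using Formula = std::vector<std::string>; // registers used by one formula
using Use = std::vector<Formula>;         // candidate formulae of one use

// P(Reg is not selected for U) = fraction of U's formulae that do not use Reg.
static double notSelectedProb(const Use &U, const std::string &Reg) {
  double NotUsing = 0;
  for (const Formula &F : U)
    if (std::find(F.begin(), F.end(), Reg) == F.end())
      ++NotUsing;
  return NotUsing / U.size();
}

int main() {
  Use Use1 = {{"a", "{0,+,1}"}, {"a", "{-1,+,1}"}, {"{a,+,1}"}};
  Use Use2 = {{"b", "{0,+,1}"}, {"b", "{-1,+,1}"}, {"{b,+,1}"}};
  Use Use3 = {{"c", "b", "{0,+,1}"}, {"c", "{b,+,1}"}};
  std::vector<Use> Uses = {Use1, Use2, Use3};

  // Probability of never selecting Reg, over all uses (RegNumMap above).
  auto PNotSel = [&](const std::string &Reg) {
    double P = 1;
    for (const Use &U : Uses)
      P *= notSelectedProb(U, Reg);
    return P;
  };

  // Expected register count of Use1's first formula, reg(a) + reg({0,+,1}),
  // excluding Use1's own not-selected probability for each of its registers.
  double Score = 0;
  for (const std::string &Reg : Use1[0])
    Score += PNotSel(Reg) / notSelectedProb(Use1, Reg);
  assert(std::fabs(Score - (1.0 + 1.0 / 3.0)) < 1e-9); // 1 + 1/3, as documented
  return 0;
}
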
-
-/// Pick a register which seems likely to be profitable, and then in any use
-/// which has any reference to that register, delete all formulae which do not
-/// reference that register.
-void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
- // With all other options exhausted, loop until the system is simple
- // enough to handle.
- SmallPtrSet<const SCEV *, 4> Taken;
- while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
-    // Ok, we have too many formulae on our hands to conveniently handle.
- // Use a rough heuristic to thin out the list.
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
- // Pick the register which is used by the most LSRUses, which is likely
- // to be a good reuse register candidate.
- const SCEV *Best = nullptr;
- unsigned BestNum = 0;
- for (const SCEV *Reg : RegUses) {
- if (Taken.count(Reg))
- continue;
- if (!Best) {
- Best = Reg;
- BestNum = RegUses.getUsedByIndices(Reg).count();
- } else {
- unsigned Count = RegUses.getUsedByIndices(Reg).count();
- if (Count > BestNum) {
- Best = Reg;
- BestNum = Count;
- }
- }
- }
- assert(Best && "Failed to find best LSRUse candidate");
-
- LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
- << " will yield profitable reuse.\n");
- Taken.insert(Best);
-
-    // In any use with formulae which reference this register, delete formulae
- // which don't reference it.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- if (!LU.Regs.count(Best)) continue;
-
- bool Any = false;
- for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
- Formula &F = LU.Formulae[i];
- if (!F.referencesReg(Best)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
- LU.DeleteFormula(F);
- --e;
- --i;
- Any = true;
- assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
- continue;
- }
- }
-
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
- }
-}
-
-/// If there are an extraordinary number of formulae to choose from, use some
-/// rough heuristics to prune down the number of formulae. This keeps the main
-/// solver from taking an extraordinary amount of time in some worst-case
-/// scenarios.
-void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
- NarrowSearchSpaceByDetectingSupersets();
- NarrowSearchSpaceByCollapsingUnrolledCode();
- NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- if (FilterSameScaledReg)
- NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
- NarrowSearchSpaceByFilterPostInc();
- if (LSRExpNarrow)
- NarrowSearchSpaceByDeletingCostlyFormulas();
- else
- NarrowSearchSpaceByPickingWinnerRegs();
-}
-
-/// This is the recursive solver.
-void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
- Cost &SolutionCost,
- SmallVectorImpl<const Formula *> &Workspace,
- const Cost &CurCost,
- const SmallPtrSet<const SCEV *, 16> &CurRegs,
- DenseSet<const SCEV *> &VisitedRegs) const {
- // Some ideas:
- // - prune more:
- // - use more aggressive filtering
- // - sort the formula so that the most profitable solutions are found first
- // - sort the uses too
- // - search faster:
- // - don't compute a cost, and then compare. compare while computing a cost
- // and bail early.
- // - track register sets with SmallBitVector
-
- const LSRUse &LU = Uses[Workspace.size()];
-
- // If this use references any register that's already a part of the
- // in-progress solution, consider it a requirement that a formula must
- // reference that register in order to be considered. This prunes out
- // unprofitable searching.
- SmallSetVector<const SCEV *, 4> ReqRegs;
- for (const SCEV *S : CurRegs)
- if (LU.Regs.count(S))
- ReqRegs.insert(S);
-
- SmallPtrSet<const SCEV *, 16> NewRegs;
- Cost NewCost(L, SE, TTI);
- for (const Formula &F : LU.Formulae) {
- // Ignore formulae which may not be ideal in terms of register reuse of
- // ReqRegs. The formula should use all required registers before
- // introducing new ones.
- // This can sometimes (notably when trying to favour postinc) lead to
- // sub-optimal decisions. In those cases it is best left to the cost
- // modelling to get right.
- if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
- int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
- for (const SCEV *Reg : ReqRegs) {
- if ((F.ScaledReg && F.ScaledReg == Reg) ||
- is_contained(F.BaseRegs, Reg)) {
- --NumReqRegsToFind;
- if (NumReqRegsToFind == 0)
- break;
- }
- }
- if (NumReqRegsToFind != 0) {
- // If none of the formulae satisfied the required registers, then we could
- // clear ReqRegs and try again. Currently, we simply give up in this case.
- continue;
- }
- }
-
- // Evaluate the cost of the current formula. If it's already worse than
- // the current best, prune the search at that point.
- NewCost = CurCost;
- NewRegs = CurRegs;
- NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
- if (NewCost.isLess(SolutionCost)) {
- Workspace.push_back(&F);
- if (Workspace.size() != Uses.size()) {
- SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
- NewRegs, VisitedRegs);
- if (F.getNumRegs() == 1 && Workspace.size() == 1)
- VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
- } else {
- LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
- dbgs() << ".\nRegs:\n";
- for (const SCEV *S : NewRegs) dbgs()
- << "- " << *S << "\n";
- dbgs() << '\n');
-
- SolutionCost = NewCost;
- Solution = Workspace;
- }
- Workspace.pop_back();
- }
- }
-}
-
-/// Choose one formula from each use. Return the results in the given Solution
-/// vector.
-void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
- SmallVector<const Formula *, 8> Workspace;
- Cost SolutionCost(L, SE, TTI);
- SolutionCost.Lose();
- Cost CurCost(L, SE, TTI);
- SmallPtrSet<const SCEV *, 16> CurRegs;
- DenseSet<const SCEV *> VisitedRegs;
- Workspace.reserve(Uses.size());
-
- // SolveRecurse does all the work.
- SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
- CurRegs, VisitedRegs);
- if (Solution.empty()) {
- LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
- return;
- }
-
- // Ok, we've now made all our decisions.
- LLVM_DEBUG(dbgs() << "\n"
- "The chosen solution requires ";
- SolutionCost.print(dbgs()); dbgs() << ":\n";
- for (size_t i = 0, e = Uses.size(); i != e; ++i) {
- dbgs() << " ";
- Uses[i].print(dbgs());
- dbgs() << "\n"
- " ";
- Solution[i]->print(dbgs());
- dbgs() << '\n';
- });
-
- assert(Solution.size() == Uses.size() && "Malformed solution!");
-}
-
-/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
-/// far as we can go while still being dominated by the input positions. This
-/// helps canonicalize the insert position, which encourages sharing.
-BasicBlock::iterator
-LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
- const SmallVectorImpl<Instruction *> &Inputs)
- const {
- Instruction *Tentative = &*IP;
- while (true) {
- bool AllDominate = true;
- Instruction *BetterPos = nullptr;
- // Don't bother attempting to insert before a catchswitch; its basic block
- // cannot have other non-PHI instructions.
- if (isa<CatchSwitchInst>(Tentative))
- return IP;
-
- for (Instruction *Inst : Inputs) {
- if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
- AllDominate = false;
- break;
- }
- // Attempt to find an insert position in the middle of the block,
- // instead of at the end, so that it can be used for other expansions.
- if (Tentative->getParent() == Inst->getParent() &&
- (!BetterPos || !DT.dominates(Inst, BetterPos)))
- BetterPos = &*std::next(BasicBlock::iterator(Inst));
- }
- if (!AllDominate)
- break;
- if (BetterPos)
- IP = BetterPos->getIterator();
- else
- IP = Tentative->getIterator();
-
- const Loop *IPLoop = LI.getLoopFor(IP->getParent());
- unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
-
- BasicBlock *IDom;
- for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
- if (!Rung) return IP;
- Rung = Rung->getIDom();
- if (!Rung) return IP;
- IDom = Rung->getBlock();
-
- // Don't climb into a loop though.
- const Loop *IDomLoop = LI.getLoopFor(IDom);
- unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
- if (IDomDepth <= IPLoopDepth &&
- (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
- break;
- }
-
- Tentative = IDom->getTerminator();
- }
-
- return IP;
-}
-
-/// Determine an input position which will be dominated by the operands and
-/// which will dominate the result.
-BasicBlock::iterator
-LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
- const LSRFixup &LF,
- const LSRUse &LU,
- SCEVExpander &Rewriter) const {
- // Collect some instructions which must be dominated by the
- // expanding replacement. These must be dominated by any operands that
- // will be required in the expansion.
- SmallVector<Instruction *, 4> Inputs;
- if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
- Inputs.push_back(I);
- if (LU.Kind == LSRUse::ICmpZero)
- if (Instruction *I =
- dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
- Inputs.push_back(I);
- if (LF.PostIncLoops.count(L)) {
- if (LF.isUseFullyOutsideLoop(L))
- Inputs.push_back(L->getLoopLatch()->getTerminator());
- else
- Inputs.push_back(IVIncInsertPos);
- }
- // The expansion must also be dominated by the increment positions of any
- // loops for which it is using post-inc mode.
- for (const Loop *PIL : LF.PostIncLoops) {
- if (PIL == L) continue;
-
- // Be dominated by the loop exit.
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- PIL->getExitingBlocks(ExitingBlocks);
- if (!ExitingBlocks.empty()) {
- BasicBlock *BB = ExitingBlocks[0];
- for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
- BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
- Inputs.push_back(BB->getTerminator());
- }
- }
-
- assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
- && !isa<DbgInfoIntrinsic>(LowestIP) &&
- "Insertion point must be a normal instruction");
-
- // Then, climb up the immediate dominator tree as far as we can go while
- // still being dominated by the input positions.
- BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
-
- // Don't insert instructions before PHI nodes.
- while (isa<PHINode>(IP)) ++IP;
-
- // Ignore landingpad instructions.
- while (IP->isEHPad()) ++IP;
-
- // Ignore debug intrinsics.
- while (isa<DbgInfoIntrinsic>(IP)) ++IP;
-
- // Set IP below instructions recently inserted by SCEVExpander. This keeps the
- // IP consistent across expansions and allows the previously inserted
- // instructions to be reused by subsequent expansion.
- while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
- ++IP;
-
- return IP;
-}
-
-/// Emit instructions for the leading candidate expression for this LSRUse (this
-/// is called "expanding").
-Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, BasicBlock::iterator IP,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
- if (LU.RigidFormula)
- return LF.OperandValToReplace;
-
- // Determine an input position which will be dominated by the operands and
- // which will dominate the result.
- IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
- Rewriter.setInsertPoint(&*IP);
-
- // Inform the Rewriter if we have a post-increment use, so that it can
- // perform an advantageous expansion.
- Rewriter.setPostInc(LF.PostIncLoops);
-
- // This is the type that the user actually needs.
- Type *OpTy = LF.OperandValToReplace->getType();
- // This will be the type that we'll initially expand to.
- Type *Ty = F.getType();
- if (!Ty)
- // No type known; just expand directly to the ultimate type.
- Ty = OpTy;
- else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
- // Expand directly to the ultimate type if it's the right size.
- Ty = OpTy;
- // This is the type to do integer arithmetic in.
- Type *IntTy = SE.getEffectiveSCEVType(Ty);
-
- // Build up a list of operands to add together to form the full base.
- SmallVector<const SCEV *, 8> Ops;
-
- // Expand the BaseRegs portion.
- for (const SCEV *Reg : F.BaseRegs) {
- assert(!Reg->isZero() && "Zero allocated in a base register!");
-
- // If we're expanding for a post-inc user, make the post-inc adjustment.
- Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
- Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
- }
-
- // Expand the ScaledReg portion.
- Value *ICmpScaledV = nullptr;
- if (F.Scale != 0) {
- const SCEV *ScaledS = F.ScaledReg;
-
- // If we're expanding for a post-inc user, make the post-inc adjustment.
- PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
-
- if (LU.Kind == LSRUse::ICmpZero) {
- // Expand ScaledReg as if it were part of the base regs.
- if (F.Scale == 1)
- Ops.push_back(
- SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
- else {
- // An interesting way of "folding" with an icmp is to use a negated
- // scale, which we'll implement by inserting it into the other operand
- // of the icmp.
- assert(F.Scale == -1 &&
- "The only scale supported by ICmpZero uses is -1!");
- ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
- }
- } else {
- // Otherwise just expand the scaled register and an explicit scale,
- // which is expected to be matched as part of the address.
-
- // Flush the operand list to suppress SCEVExpander hoisting address modes,
- // unless the addressing mode will not be folded.
- if (!Ops.empty() && LU.Kind == LSRUse::Address &&
- isAMCompletelyFolded(TTI, LU, F)) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
- }
- ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
- if (F.Scale != 1)
- ScaledS =
- SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
- Ops.push_back(ScaledS);
- }
- }
-
- // Expand the GV portion.
- if (F.BaseGV) {
- // Flush the operand list to suppress SCEVExpander hoisting.
- if (!Ops.empty()) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
- }
- Ops.push_back(SE.getUnknown(F.BaseGV));
- }
-
- // Flush the operand list to suppress SCEVExpander hoisting of both folded and
- // unfolded offsets. LSR assumes they both live next to their uses.
- if (!Ops.empty()) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
- }
-
- // Expand the immediate portion.
- int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
- if (Offset != 0) {
- if (LU.Kind == LSRUse::ICmpZero) {
- // The other interesting way of "folding" with an ICmpZero is to use a
- // negated immediate.
- if (!ICmpScaledV)
- ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
- else {
- Ops.push_back(SE.getUnknown(ICmpScaledV));
- ICmpScaledV = ConstantInt::get(IntTy, Offset);
- }
- } else {
- // Just add the immediate values. These again are expected to be matched
- // as part of the address.
- Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
- }
- }
-
- // Expand the unfolded offset portion.
- int64_t UnfoldedOffset = F.UnfoldedOffset;
- if (UnfoldedOffset != 0) {
- // Just add the immediate values.
- Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
- UnfoldedOffset)));
- }
-
- // Emit instructions summing all the operands.
- const SCEV *FullS = Ops.empty() ?
- SE.getConstant(IntTy, 0) :
- SE.getAddExpr(Ops);
- Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
-
- // We're done expanding now, so reset the rewriter.
- Rewriter.clearPostInc();
-
- // An ICmpZero Formula represents an ICmp which we're handling as a
- // comparison against zero. Now that we've expanded an expression for that
- // form, update the ICmp's other operand.
- if (LU.Kind == LSRUse::ICmpZero) {
- ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
- if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
- DeadInsts.emplace_back(OperandIsInstr);
- assert(!F.BaseGV && "ICmp does not support folding a global value and "
- "a scale at the same time!");
- if (F.Scale == -1) {
- if (ICmpScaledV->getType() != OpTy) {
- Instruction *Cast =
- CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
- OpTy, false),
- ICmpScaledV, OpTy, "tmp", CI);
- ICmpScaledV = Cast;
- }
- CI->setOperand(1, ICmpScaledV);
- } else {
- // A scale of 1 means that the scale has been expanded as part of the
- // base regs.
- assert((F.Scale == 0 || F.Scale == 1) &&
- "ICmp does not support folding a global value and "
- "a scale at the same time!");
- Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
- -(uint64_t)Offset);
- if (C->getType() != OpTy)
- C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
- OpTy, false),
- C, OpTy);
-
- CI->setOperand(1, C);
- }
- }
-
- return FullV;
-}
-
-/// Helper for Rewrite. PHI nodes are special because the use of their operands
-/// effectively happens in their predecessor blocks, so the expression may need
-/// to be expanded in multiple places.
-void LSRInstance::RewriteForPHI(
- PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
- DenseMap<BasicBlock *, Value *> Inserted;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
- bool needUpdateFixups = false;
- BasicBlock *BB = PN->getIncomingBlock(i);
-
- // If this is a critical edge, split the edge so that we do not insert
- // the code on all predecessor/successor paths. We do this unless this
- // is the canonical backedge for this loop, which complicates post-inc
- // users.
- if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
- !isa<IndirectBrInst>(BB->getTerminator()) &&
- !isa<CatchSwitchInst>(BB->getTerminator())) {
- BasicBlock *Parent = PN->getParent();
- Loop *PNLoop = LI.getLoopFor(Parent);
- if (!PNLoop || Parent != PNLoop->getHeader()) {
- // Split the critical edge.
- BasicBlock *NewBB = nullptr;
- if (!Parent->isLandingPad()) {
+ return;
+
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
+
+ // This is especially useful for unrolled loops.
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (const Formula &F : LU.Formulae) {
+ if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
+ continue;
+
+ LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+ if (!LUThatHas)
+ continue;
+
+ if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+ LU.Kind, LU.AccessTy))
+ continue;
+
+ LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+ LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+ // Transfer the fixups of LU to LUThatHas.
+ for (LSRFixup &Fixup : LU.Fixups) {
+ Fixup.Offset += F.BaseOffset;
+ LUThatHas->pushFixup(Fixup);
+ LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ }
+
+ // Delete formulae from the new use which are no longer legal.
+ bool Any = false;
+ for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+ Formula &F = LUThatHas->Formulae[i];
+ if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+ LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LUThatHas->DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ }
+ }
+
+ if (Any)
+ LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+ // Delete the old use.
+ DeleteUse(LU, LUIdx);
+ --LUIdx;
+ --NumUses;
+ break;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+ "undesirable dedicated registers.\n");
+
+ FilterOutUndesirableDedicatedRegisters();
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ }
+}
+
+/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
+/// pick the best one and delete the others.
+/// This narrowing heuristic keeps as many formulae with different
+/// Scale and ScaledReg pairs as possible while narrowing the search space.
+/// The benefit is that a formulae set with more Scale and ScaledReg
+/// variations is more likely to yield a good solution than a set where
+/// they are all the same. The winner-register-picking heuristic often
+/// keeps the formulae with the same Scale and ScaledReg and filters out
+/// the others, and we want to avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
+
+ // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
+ using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
+
+ BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
+
+ // Return true if Formula FA is better than Formula FB.
+ auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+ // First we will try to choose the Formula with fewer new registers.
+ // For a register used by the current Formula, the more the register is
+ // shared among LSRUses, the less we increase the register number
+ // counter of the formula.
+ size_t FARegNum = 0;
+ for (const SCEV *Reg : FA.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FARegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ size_t FBRegNum = 0;
+ for (const SCEV *Reg : FB.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FBRegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ if (FARegNum != FBRegNum)
+ return FARegNum < FBRegNum;
+
+ // If the new register numbers are the same, choose the Formula with
+ // less Cost.
+ Cost CostFA(L, SE, TTI);
+ Cost CostFB(L, SE, TTI);
+ Regs.clear();
+ CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
+ Regs.clear();
+ CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+ return CostFA.isLess(CostFB);
+ };
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (!F.ScaledReg)
+ continue;
+ auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+ if (IsBetterThan(F, Best))
+ std::swap(F, Best);
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ LLVM_DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
+
+/// If we are over the complexity limit, filter out any post-inc preferring
+/// variables to only post-inc values.
+void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
+ if (!TTI.shouldFavorPostInc())
+ return;
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the lowest "
+ "register Formula for PostInc Uses.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+
+ if (LU.Kind != LSRUse::Address)
+ continue;
+ if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
+ !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
+ continue;
+
+ size_t MinRegs = std::numeric_limits<size_t>::max();
+ for (const Formula &F : LU.Formulae)
+ MinRegs = std::min(F.getNumRegs(), MinRegs);
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (F.getNumRegs() > MinRegs) {
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n");
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// This function deletes formulas with a high expected number of registers.
+/// Assuming we don't know the value of each formula (all inefficient ones
+/// have already been deleted), it generates, for each register, the
+/// probability of it not being selected.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the expected number of registers for each formula:
+/// Note that for each use we exclude the probability of not selecting for
+/// that use. For example, for Use1 the probability for reg(a) would be just
+/// 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+
+ // Set of Regs which will definitely be used in the final solution, i.e.
+ // used in each formula of a solution (in the example above this is reg(c)).
+ // We can skip them in the calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Map each register to the probability of it not being selected.
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+ // Delete formulas whose expected number of registers is high.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // If nothing to delete - continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+ // This is a temporary solution to test performance. Float should be
+ // replaced with a rounding-independent type (based on integers) to avoid
+ // different results for different target builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, the regs become unique.
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// Pick a register which seems likely to be profitable, and then in any use
+/// which has any reference to that register, delete all formulae which do not
+/// reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+ // With all other options exhausted, loop until the system is simple
+ // enough to handle.
+ SmallPtrSet<const SCEV *, 4> Taken;
+ while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Pick the register which is used by the most LSRUses, which is likely
+ // to be a good reuse register candidate.
+ const SCEV *Best = nullptr;
+ unsigned BestNum = 0;
+ for (const SCEV *Reg : RegUses) {
+ if (Taken.count(Reg))
+ continue;
+ if (!Best) {
+ Best = Reg;
+ BestNum = RegUses.getUsedByIndices(Reg).count();
+ } else {
+ unsigned Count = RegUses.getUsedByIndices(Reg).count();
+ if (Count > BestNum) {
+ Best = Reg;
+ BestNum = Count;
+ }
+ }
+ }
+ assert(Best && "Failed to find best LSRUse candidate");
+
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+ << " will yield profitable reuse.\n");
+ Taken.insert(Best);
+
+ // In any use that references this register, delete the formulae which
+ // don't reference it.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ if (!LU.Regs.count(Best)) continue;
+
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ if (!F.referencesReg(Best)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --e;
+ --i;
+ Any = true;
+ assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
+ continue;
+ }
+ }
+
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ }
+}
+
+/// If there are an extraordinary number of formulae to choose from, use some
+/// rough heuristics to prune down the number of formulae. This keeps the main
+/// solver from taking an extraordinary amount of time in some worst-case
+/// scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+ NarrowSearchSpaceByDetectingSupersets();
+ NarrowSearchSpaceByCollapsingUnrolledCode();
+ NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ if (FilterSameScaledReg)
+ NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ NarrowSearchSpaceByFilterPostInc();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
+}
+
+/// This is the recursive solver.
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const {
+ // Some ideas:
+ // - prune more:
+ // - use more aggressive filtering
+ // - sort the formula so that the most profitable solutions are found first
+ // - sort the uses too
+ // - search faster:
+ // - don't compute a cost, and then compare. compare while computing a cost
+ // and bail early.
+ // - track register sets with SmallBitVector
+
+ const LSRUse &LU = Uses[Workspace.size()];
+
+ // If this use references any register that's already a part of the
+ // in-progress solution, consider it a requirement that a formula must
+ // reference that register in order to be considered. This prunes out
+ // unprofitable searching.
+ SmallSetVector<const SCEV *, 4> ReqRegs;
+ for (const SCEV *S : CurRegs)
+ if (LU.Regs.count(S))
+ ReqRegs.insert(S);
+
+ SmallPtrSet<const SCEV *, 16> NewRegs;
+ Cost NewCost(L, SE, TTI);
+ for (const Formula &F : LU.Formulae) {
+ // Ignore formulae which may not be ideal in terms of register reuse of
+ // ReqRegs. The formula should use all required registers before
+ // introducing new ones.
+ // This can sometimes (notably when trying to favour postinc) lead to
+ // sub-optimal decisions. In those cases it is best left to the cost
+ // modelling to get right.
+ if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
+ int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+ for (const SCEV *Reg : ReqRegs) {
+ if ((F.ScaledReg && F.ScaledReg == Reg) ||
+ is_contained(F.BaseRegs, Reg)) {
+ --NumReqRegsToFind;
+ if (NumReqRegsToFind == 0)
+ break;
+ }
+ }
+ if (NumReqRegsToFind != 0) {
+ // If none of the formulae satisfied the required registers, then we could
+ // clear ReqRegs and try again. Currently, we simply give up in this case.
+ continue;
+ }
+ }
+
+ // Evaluate the cost of the current formula. If it's already worse than
+ // the current best, prune the search at that point.
+ NewCost = CurCost;
+ NewRegs = CurRegs;
+ NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+ if (NewCost.isLess(SolutionCost)) {
+ Workspace.push_back(&F);
+ if (Workspace.size() != Uses.size()) {
+ SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
+ NewRegs, VisitedRegs);
+ if (F.getNumRegs() == 1 && Workspace.size() == 1)
+ VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+ } else {
+ LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\nRegs:\n";
+ for (const SCEV *S : NewRegs) dbgs()
+ << "- " << *S << "\n";
+ dbgs() << '\n');
+
+ SolutionCost = NewCost;
+ Solution = Workspace;
+ }
+ Workspace.pop_back();
+ }
+ }
+}
+
+/// Choose one formula from each use. Return the results in the given Solution
+/// vector.
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+ SmallVector<const Formula *, 8> Workspace;
+ Cost SolutionCost(L, SE, TTI);
+ SolutionCost.Lose();
+ Cost CurCost(L, SE, TTI);
+ SmallPtrSet<const SCEV *, 16> CurRegs;
+ DenseSet<const SCEV *> VisitedRegs;
+ Workspace.reserve(Uses.size());
+
+ // SolveRecurse does all the work.
+ SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
+ CurRegs, VisitedRegs);
+ if (Solution.empty()) {
+ LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ return;
+ }
+
+ // Ok, we've now made all our decisions.
+ LLVM_DEBUG(dbgs() << "\n"
+ "The chosen solution requires ";
+ SolutionCost.print(dbgs()); dbgs() << ":\n";
+ for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+ dbgs() << " ";
+ Uses[i].print(dbgs());
+ dbgs() << "\n"
+ " ";
+ Solution[i]->print(dbgs());
+ dbgs() << '\n';
+ });
+
+ assert(Solution.size() == Uses.size() && "Malformed solution!");
+}
+
+/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
+/// far as we can go while still being dominated by the input positions. This
+/// helps canonicalize the insert position, which encourages sharing.
+BasicBlock::iterator
+LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs)
+ const {
+ Instruction *Tentative = &*IP;
+ while (true) {
+ bool AllDominate = true;
+ Instruction *BetterPos = nullptr;
+ // Don't bother attempting to insert before a catchswitch; its basic block
+ // cannot have other non-PHI instructions.
+ if (isa<CatchSwitchInst>(Tentative))
+ return IP;
+
+ for (Instruction *Inst : Inputs) {
+ if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
+ AllDominate = false;
+ break;
+ }
+ // Attempt to find an insert position in the middle of the block,
+ // instead of at the end, so that it can be used for other expansions.
+ if (Tentative->getParent() == Inst->getParent() &&
+ (!BetterPos || !DT.dominates(Inst, BetterPos)))
+ BetterPos = &*std::next(BasicBlock::iterator(Inst));
+ }
+ if (!AllDominate)
+ break;
+ if (BetterPos)
+ IP = BetterPos->getIterator();
+ else
+ IP = Tentative->getIterator();
+
+ const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+ unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
+
+ BasicBlock *IDom;
+ for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+ if (!Rung) return IP;
+ Rung = Rung->getIDom();
+ if (!Rung) return IP;
+ IDom = Rung->getBlock();
+
+ // Don't climb into a loop though.
+ const Loop *IDomLoop = LI.getLoopFor(IDom);
+ unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+ if (IDomDepth <= IPLoopDepth &&
+ (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+ break;
+ }
+
+ Tentative = IDom->getTerminator();
+ }
+
+ return IP;
+}
+
+/// Determine an input position which will be dominated by the operands and
+/// which will dominate the result.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const {
+ // Collect some instructions which must be dominated by the
+ // expanding replacement. These must be dominated by any operands that
+ // will be required in the expansion.
+ SmallVector<Instruction *, 4> Inputs;
+ if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
+ Inputs.push_back(I);
+ if (LU.Kind == LSRUse::ICmpZero)
+ if (Instruction *I =
+ dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
+ Inputs.push_back(I);
+ if (LF.PostIncLoops.count(L)) {
+ if (LF.isUseFullyOutsideLoop(L))
+ Inputs.push_back(L->getLoopLatch()->getTerminator());
+ else
+ Inputs.push_back(IVIncInsertPos);
+ }
+ // The expansion must also be dominated by the increment positions of any
+ // loops for which it is using post-inc mode.
+ for (const Loop *PIL : LF.PostIncLoops) {
+ if (PIL == L) continue;
+
+ // Be dominated by the loop exit.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ PIL->getExitingBlocks(ExitingBlocks);
+ if (!ExitingBlocks.empty()) {
+ BasicBlock *BB = ExitingBlocks[0];
+ for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
+ BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
+ Inputs.push_back(BB->getTerminator());
+ }
+ }
+
+ assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
+ && !isa<DbgInfoIntrinsic>(LowestIP) &&
+ "Insertion point must be a normal instruction");
+
+ // Then, climb up the immediate dominator tree as far as we can go while
+ // still being dominated by the input positions.
+ BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
+
+ // Don't insert instructions before PHI nodes.
+ while (isa<PHINode>(IP)) ++IP;
+
+ // Ignore landingpad instructions.
+ while (IP->isEHPad()) ++IP;
+
+ // Ignore debug intrinsics.
+ while (isa<DbgInfoIntrinsic>(IP)) ++IP;
+
+ // Set IP below instructions recently inserted by SCEVExpander. This keeps the
+ // IP consistent across expansions and allows the previously inserted
+ // instructions to be reused by subsequent expansion.
+ while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
+ ++IP;
+
+ return IP;
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding").
+Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, BasicBlock::iterator IP,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ if (LU.RigidFormula)
+ return LF.OperandValToReplace;
+
+ // Determine an input position which will be dominated by the operands and
+ // which will dominate the result.
+ IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+ Rewriter.setInsertPoint(&*IP);
+
+ // Inform the Rewriter if we have a post-increment use, so that it can
+ // perform an advantageous expansion.
+ Rewriter.setPostInc(LF.PostIncLoops);
+
+ // This is the type that the user actually needs.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ // This will be the type that we'll initially expand to.
+ Type *Ty = F.getType();
+ if (!Ty)
+ // No type known; just expand directly to the ultimate type.
+ Ty = OpTy;
+ else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
+ // Expand directly to the ultimate type if it's the right size.
+ Ty = OpTy;
+ // This is the type to do integer arithmetic in.
+ Type *IntTy = SE.getEffectiveSCEVType(Ty);
+
+ // Build up a list of operands to add together to form the full base.
+ SmallVector<const SCEV *, 8> Ops;
+
+ // Expand the BaseRegs portion.
+ for (const SCEV *Reg : F.BaseRegs) {
+ assert(!Reg->isZero() && "Zero allocated in a base register!");
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
+ Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
+ }
+
+ // Expand the ScaledReg portion.
+ Value *ICmpScaledV = nullptr;
+ if (F.Scale != 0) {
+ const SCEV *ScaledS = F.ScaledReg;
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
+
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // Expand ScaledReg as if it were part of the base regs.
+ if (F.Scale == 1)
+ Ops.push_back(
+ SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
+ else {
+ // An interesting way of "folding" with an icmp is to use a negated
+ // scale, which we'll implement by inserting it into the other operand
+ // of the icmp.
+ assert(F.Scale == -1 &&
+ "The only scale supported by ICmpZero uses is -1!");
+ ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
+ }
+ } else {
+ // Otherwise just expand the scaled register and an explicit scale,
+ // which is expected to be matched as part of the address.
+
+ // Flush the operand list to suppress SCEVExpander hoisting address modes,
+ // unless the addressing mode will not be folded.
+ if (!Ops.empty() && LU.Kind == LSRUse::Address &&
+ isAMCompletelyFolded(TTI, LU, F)) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
+ if (F.Scale != 1)
+ ScaledS =
+ SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
+ Ops.push_back(ScaledS);
+ }
+ }
+
+ // Expand the GV portion.
+ if (F.BaseGV) {
+ // Flush the operand list to suppress SCEVExpander hoisting.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ Ops.push_back(SE.getUnknown(F.BaseGV));
+ }
+
+ // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+ // unfolded offsets. LSR assumes they both live next to their uses.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+
+ // Expand the immediate portion.
+ int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
+ if (Offset != 0) {
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // The other interesting way of "folding" with an ICmpZero is to use a
+ // negated immediate.
+ if (!ICmpScaledV)
+ ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+ else {
+ Ops.push_back(SE.getUnknown(ICmpScaledV));
+ ICmpScaledV = ConstantInt::get(IntTy, Offset);
+ }
+ } else {
+ // Just add the immediate values. These again are expected to be matched
+ // as part of the address.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+ }
+ }
+
+ // Expand the unfolded offset portion.
+ int64_t UnfoldedOffset = F.UnfoldedOffset;
+ if (UnfoldedOffset != 0) {
+ // Just add the immediate values.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+ UnfoldedOffset)));
+ }
+
+ // Emit instructions summing all the operands.
+ const SCEV *FullS = Ops.empty() ?
+ SE.getConstant(IntTy, 0) :
+ SE.getAddExpr(Ops);
+ Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
+
+ // We're done expanding now, so reset the rewriter.
+ Rewriter.clearPostInc();
+
+ // An ICmpZero Formula represents an ICmp which we're handling as a
+ // comparison against zero. Now that we've expanded an expression for that
+ // form, update the ICmp's other operand.
+ if (LU.Kind == LSRUse::ICmpZero) {
+ ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
+ DeadInsts.emplace_back(OperandIsInstr);
+ assert(!F.BaseGV && "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ if (F.Scale == -1) {
+ if (ICmpScaledV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
+ OpTy, false),
+ ICmpScaledV, OpTy, "tmp", CI);
+ ICmpScaledV = Cast;
+ }
+ CI->setOperand(1, ICmpScaledV);
+ } else {
+ // A scale of 1 means that the scale has been expanded as part of the
+ // base regs.
+ assert((F.Scale == 0 || F.Scale == 1) &&
+ "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
+ -(uint64_t)Offset);
+ if (C->getType() != OpTy)
+ C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+ OpTy, false),
+ C, OpTy);
+
+ CI->setOperand(1, C);
+ }
+ }
+
+ return FullV;
+}
+
+/// Helper for Rewrite. PHI nodes are special because the use of their operands
+/// effectively happens in their predecessor blocks, so the expression may need
+/// to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(
+ PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ DenseMap<BasicBlock *, Value *> Inserted;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+ bool needUpdateFixups = false;
+ BasicBlock *BB = PN->getIncomingBlock(i);
+
+ // If this is a critical edge, split the edge so that we do not insert
+ // the code on all predecessor/successor paths. We do this unless this
+ // is the canonical backedge for this loop, which complicates post-inc
+ // users.
+ if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+ !isa<IndirectBrInst>(BB->getTerminator()) &&
+ !isa<CatchSwitchInst>(BB->getTerminator())) {
+ BasicBlock *Parent = PN->getParent();
+ Loop *PNLoop = LI.getLoopFor(Parent);
+ if (!PNLoop || Parent != PNLoop->getHeader()) {
+ // Split the critical edge.
+ BasicBlock *NewBB = nullptr;
+ if (!Parent->isLandingPad()) {
NewBB =
SplitCriticalEdge(BB, Parent,
CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
.setMergeIdenticalEdges()
.setKeepOneInputPHIs());
- } else {
- SmallVector<BasicBlock*, 2> NewBBs;
- SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
- NewBB = NewBBs[0];
- }
- // If NewBB==NULL, then SplitCriticalEdge refused to split because all
- // phi predecessors are identical. The simple thing to do is skip
- // splitting in this case rather than complicate the API.
- if (NewBB) {
- // If PN is outside of the loop and BB is in the loop, we want to
- // move the block to be immediately before the PHI block, not
- // immediately after BB.
- if (L->contains(BB) && !L->contains(PN))
- NewBB->moveBefore(PN->getParent());
-
- // Splitting the edge can reduce the number of PHI entries we have.
- e = PN->getNumIncomingValues();
- BB = NewBB;
- i = PN->getBasicBlockIndex(BB);
-
- needUpdateFixups = true;
- }
- }
- }
-
- std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
- Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
- if (!Pair.second)
- PN->setIncomingValue(i, Pair.first->second);
- else {
- Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
- Rewriter, DeadInsts);
-
- // If this is reuse-by-noop-cast, insert the noop cast.
- Type *OpTy = LF.OperandValToReplace->getType();
- if (FullV->getType() != OpTy)
- FullV =
- CastInst::Create(CastInst::getCastOpcode(FullV, false,
- OpTy, false),
- FullV, LF.OperandValToReplace->getType(),
- "tmp", BB->getTerminator());
-
- PN->setIncomingValue(i, FullV);
- Pair.first->second = FullV;
- }
-
- // If LSR splits a critical edge and the phi node has other pending
- // fixup operands, we need to update those pending fixups. Otherwise
- // formulae will not be implemented completely and some instructions
- // will not be eliminated.
- if (needUpdateFixups) {
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
- for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
- // If a fixup is supposed to rewrite some operand in the phi
- // that was just updated, it may already have been moved to
- // another phi node. Such a fixup requires an update.
- if (Fixup.UserInst == PN) {
- // Check if the operand we try to replace still exists in the
- // original phi.
- bool foundInOriginalPHI = false;
- for (const auto &val : PN->incoming_values())
- if (val == Fixup.OperandValToReplace) {
- foundInOriginalPHI = true;
- break;
- }
-
- // If the fixup operand was found in the original PHI, there is nothing to do.
- if (foundInOriginalPHI)
- continue;
-
- // Otherwise it might have been moved to another PHI and requires an update.
- // If the fixup operand is not found in any of the incoming blocks, that
- // means we have already rewritten it, and there is nothing to do.
- for (const auto &Block : PN->blocks())
- for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
- ++I) {
- PHINode *NewPN = cast<PHINode>(I);
- for (const auto &val : NewPN->incoming_values())
- if (val == Fixup.OperandValToReplace)
- Fixup.UserInst = NewPN;
- }
- }
- }
- }
-}
-
-/// Emit instructions for the leading candidate expression for this LSRUse (this
-/// is called "expanding"), and update the UserInst to reference the newly
-/// expanded value.
-void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
- // First, find an insertion point that dominates UserInst. For PHI nodes,
- // find the nearest block which dominates all the relevant uses.
- if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
- RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
- } else {
- Value *FullV =
- Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
-
- // If this is reuse-by-noop-cast, insert the noop cast.
- Type *OpTy = LF.OperandValToReplace->getType();
- if (FullV->getType() != OpTy) {
- Instruction *Cast =
- CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
- FullV, OpTy, "tmp", LF.UserInst);
- FullV = Cast;
- }
-
- // Update the user. ICmpZero is handled specially here (for now) because
- // Expand may have updated one of the operands of the icmp already, and
- // its new value may happen to be equal to LF.OperandValToReplace, in
- // which case doing replaceUsesOfWith leads to replacing both operands
- // with the same value. TODO: Reorganize this.
- if (LU.Kind == LSRUse::ICmpZero)
- LF.UserInst->setOperand(0, FullV);
- else
- LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
- }
-
- if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
- DeadInsts.emplace_back(OperandIsInstr);
-}
-
-/// Rewrite all the fixup locations with new values, following the chosen
-/// solution.
-void LSRInstance::ImplementSolution(
- const SmallVectorImpl<const Formula *> &Solution) {
- // Keep track of instructions we may have made dead, so that
- // we can remove them after we are done working.
- SmallVector<WeakTrackingVH, 16> DeadInsts;
-
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
+ NewBB = NewBBs[0];
+ }
+ // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+ // phi predecessors are identical. The simple thing to do is skip
+ // splitting in this case rather than complicate the API.
+ if (NewBB) {
+ // If PN is outside of the loop and BB is in the loop, we want to
+ // move the block to be immediately before the PHI block, not
+ // immediately after BB.
+ if (L->contains(BB) && !L->contains(PN))
+ NewBB->moveBefore(PN->getParent());
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ BB = NewBB;
+ i = PN->getBasicBlockIndex(BB);
+
+ needUpdateFixups = true;
+ }
+ }
+ }
+
+ std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+ Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
+ if (!Pair.second)
+ PN->setIncomingValue(i, Pair.first->second);
+ else {
+ Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
+ Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy)
+ FullV =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false,
+ OpTy, false),
+ FullV, LF.OperandValToReplace->getType(),
+ "tmp", BB->getTerminator());
+
+ PN->setIncomingValue(i, FullV);
+ Pair.first->second = FullV;
+ }
+
+ // If LSR splits a critical edge and the phi node has other pending
+ // fixup operands, we need to update those pending fixups. Otherwise
+ // formulae will not be implemented completely and some instructions
+ // will not be eliminated.
+ if (needUpdateFixups) {
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
+ // If a fixup is supposed to rewrite some operand in the phi
+ // that was just updated, it may already have been moved to
+ // another phi node. Such a fixup requires an update.
+ if (Fixup.UserInst == PN) {
+ // Check if the operand we try to replace still exists in the
+ // original phi.
+ bool foundInOriginalPHI = false;
+ for (const auto &val : PN->incoming_values())
+ if (val == Fixup.OperandValToReplace) {
+ foundInOriginalPHI = true;
+ break;
+ }
+
+ // If the fixup operand was found in the original PHI, there is nothing to do.
+ if (foundInOriginalPHI)
+ continue;
+
+ // Otherwise it might have been moved to another PHI and requires an update.
+ // If the fixup operand is not found in any of the incoming blocks, that
+ // means we have already rewritten it, and there is nothing to do.
+ for (const auto &Block : PN->blocks())
+ for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
+ ++I) {
+ PHINode *NewPN = cast<PHINode>(I);
+ for (const auto &val : NewPN->incoming_values())
+ if (val == Fixup.OperandValToReplace)
+ Fixup.UserInst = NewPN;
+ }
+ }
+ }
+ }
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding"), and update the UserInst to reference the newly
+/// expanded value.
+void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ // First, find an insertion point that dominates UserInst. For PHI nodes,
+ // find the nearest block which dominates all the relevant uses.
+ if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
+ RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
+ } else {
+ Value *FullV =
+ Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
+ FullV, OpTy, "tmp", LF.UserInst);
+ FullV = Cast;
+ }
+
+ // Update the user. ICmpZero is handled specially here (for now) because
+ // Expand may have updated one of the operands of the icmp already, and
+ // its new value may happen to be equal to LF.OperandValToReplace, in
+ // which case doing replaceUsesOfWith leads to replacing both operands
+ // with the same value. TODO: Reorganize this.
+ if (LU.Kind == LSRUse::ICmpZero)
+ LF.UserInst->setOperand(0, FullV);
+ else
+ LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
+ }
+
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
+ DeadInsts.emplace_back(OperandIsInstr);
+}
+
+/// Rewrite all the fixup locations with new values, following the chosen
+/// solution.
+void LSRInstance::ImplementSolution(
+ const SmallVectorImpl<const Formula *> &Solution) {
+ // Keep track of instructions we may have made dead, so that
+ // we can remove them after we are done working.
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+
SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr",
false);
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
- Rewriter.disableCanonicalMode();
- Rewriter.enableLSRMode();
- Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
-
- // Mark phi nodes that terminate chains so the expander tries to reuse them.
- for (const IVChain &Chain : IVChainVec) {
- if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
- Rewriter.setChainedPhi(PN);
- }
-
- // Expand the new value definitions and update the users.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
- for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
- Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
- Changed = true;
- }
-
- for (const IVChain &Chain : IVChainVec) {
- GenerateIVChain(Chain, Rewriter, DeadInsts);
- Changed = true;
- }
- // Clean up after ourselves. This must be done before deleting any
- // instructions.
- Rewriter.clear();
-
- Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
- &TLI, MSSAU);
-}
-
-LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
- DominatorTree &DT, LoopInfo &LI,
- const TargetTransformInfo &TTI, AssumptionCache &AC,
- TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
- : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
- MSSAU(MSSAU), FavorBackedgeIndex(EnableBackedgeIndexing &&
- TTI.shouldFavorBackedgeIndex(L)) {
- // If LoopSimplify form is not available, stay out of trouble.
- if (!L->isLoopSimplifyForm())
- return;
-
- // If there's no interesting work to be done, bail early.
- if (IU.empty()) return;
-
- // If there's too much analysis to be done, bail early. We won't be able to
- // model the problem anyway.
- unsigned NumUsers = 0;
- for (const IVStrideUse &U : IU) {
- if (++NumUsers > MaxIVUsers) {
- (void)U;
- LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
- << "\n");
- return;
- }
- // Bail out if we have a PHI on an EHPad that gets a value from a
- // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
- // no good place to stick any instructions.
- if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
- auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
- if (isa<FuncletPadInst>(FirstNonPHI) ||
- isa<CatchSwitchInst>(FirstNonPHI))
- for (BasicBlock *PredBB : PN->blocks())
- if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
- return;
- }
- }
-
-#ifndef NDEBUG
- // All dominating loops must have preheaders, or SCEVExpander may not be able
- // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
- //
- // IVUsers analysis should only create users that are dominated by simple loop
- // headers. Since this loop should dominate all of its users, its user list
- // should be empty if this loop itself is not within a simple loop nest.
- for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
- Rung; Rung = Rung->getIDom()) {
- BasicBlock *BB = Rung->getBlock();
- const Loop *DomLoop = LI.getLoopFor(BB);
- if (DomLoop && DomLoop->getHeader() == BB) {
- assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
- }
- }
-#endif // NDEBUG
-
- LLVM_DEBUG(dbgs() << "\nLSR on loop ";
- L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
- dbgs() << ":\n");
-
- // First, perform some low-level loop optimizations.
- OptimizeShadowIV();
- OptimizeLoopTermCond();
-
- // If loop preparation eliminates all interesting IV users, bail.
- if (IU.empty()) return;
-
- // Skip nested loops until we can model them better with formulae.
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ Rewriter.disableCanonicalMode();
+ Rewriter.enableLSRMode();
+ Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
+
+ // Mark phi nodes that terminate chains so the expander tries to reuse them.
+ for (const IVChain &Chain : IVChainVec) {
+ if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
+ Rewriter.setChainedPhi(PN);
+ }
+
+ // Expand the new value definitions and update the users.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
+ Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+ Changed = true;
+ }
+
+ for (const IVChain &Chain : IVChainVec) {
+ GenerateIVChain(Chain, Rewriter, DeadInsts);
+ Changed = true;
+ }
+ // Clean up after ourselves. This must be done before deleting any
+ // instructions.
+ Rewriter.clear();
+
+ Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
+ &TLI, MSSAU);
+}
+
+LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI, AssumptionCache &AC,
+ TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
+ : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
+ MSSAU(MSSAU), FavorBackedgeIndex(EnableBackedgeIndexing &&
+ TTI.shouldFavorBackedgeIndex(L)) {
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return;
+
+ // If there's no interesting work to be done, bail early.
+ if (IU.empty()) return;
+
+ // If there's too much analysis to be done, bail early. We won't be able to
+ // model the problem anyway.
+ unsigned NumUsers = 0;
+ for (const IVStrideUse &U : IU) {
+ if (++NumUsers > MaxIVUsers) {
+ (void)U;
+ LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
+ << "\n");
+ return;
+ }
+ // Bail out if we have a PHI on an EHPad that gets a value from a
+ // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
+ // no good place to stick any instructions.
+ if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
+ auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
+ if (isa<FuncletPadInst>(FirstNonPHI) ||
+ isa<CatchSwitchInst>(FirstNonPHI))
+ for (BasicBlock *PredBB : PN->blocks())
+ if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
+ return;
+ }
+ }
+
+#ifndef NDEBUG
+ // All dominating loops must have preheaders, or SCEVExpander may not be able
+ // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
+ //
+ // IVUsers analysis should only create users that are dominated by simple loop
+ // headers. Since this loop should dominate all of its users, its user list
+ // should be empty if this loop itself is not within a simple loop nest.
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
+ Rung; Rung = Rung->getIDom()) {
+ BasicBlock *BB = Rung->getBlock();
+ const Loop *DomLoop = LI.getLoopFor(BB);
+ if (DomLoop && DomLoop->getHeader() == BB) {
+ assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
+ }
+ }
+#endif // NDEBUG
+
+ LLVM_DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
+
+ // First, perform some low-level loop optimizations.
+ OptimizeShadowIV();
+ OptimizeLoopTermCond();
+
+ // If loop preparation eliminates all interesting IV users, bail.
+ if (IU.empty()) return;
+
+ // Skip nested loops until we can model them better with formulae.
if (!L->isInnermost()) {
- LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
- return;
- }
-
- // Start collecting data and preparing for the solver.
+ LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ return;
+ }
+
+ // Start collecting data and preparing for the solver.
// If number of registers is not the major cost, we cannot benefit from the
// current profitable chain optimization which is based on number of
// registers.
@@ -5633,145 +5633,145 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// example number of instructions.
if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
CollectChains();
- CollectInterestingTypesAndFactors();
- CollectFixupsAndInitialFormulae();
- CollectLoopInvariantFixupsAndFormulae();
-
- if (Uses.empty())
- return;
-
- LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
- print_uses(dbgs()));
-
- // Now use the reuse data to generate a bunch of interesting ways
- // to formulate the values needed for the uses.
- GenerateAllReuseFormulae();
-
- FilterOutUndesirableDedicatedRegisters();
- NarrowSearchSpaceUsingHeuristics();
-
- SmallVector<const Formula *, 8> Solution;
- Solve(Solution);
-
- // Release memory that is no longer needed.
- Factors.clear();
- Types.clear();
- RegUses.clear();
-
- if (Solution.empty())
- return;
-
-#ifndef NDEBUG
- // Formulae should be legal.
- for (const LSRUse &LU : Uses) {
- for (const Formula &F : LU.Formulae)
- assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
- F) && "Illegal formula generated!");
- }
-#endif
-
- // Now that we've decided what we want, make it so.
- ImplementSolution(Solution);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
- if (Factors.empty() && Types.empty()) return;
-
- OS << "LSR has identified the following interesting factors and types: ";
- bool First = true;
-
- for (int64_t Factor : Factors) {
- if (!First) OS << ", ";
- First = false;
- OS << '*' << Factor;
- }
-
- for (Type *Ty : Types) {
- if (!First) OS << ", ";
- First = false;
- OS << '(' << *Ty << ')';
- }
- OS << '\n';
-}
-
-void LSRInstance::print_fixups(raw_ostream &OS) const {
- OS << "LSR is examining the following fixup sites:\n";
- for (const LSRUse &LU : Uses)
- for (const LSRFixup &LF : LU.Fixups) {
- dbgs() << " ";
- LF.print(OS);
- OS << '\n';
- }
-}
-
-void LSRInstance::print_uses(raw_ostream &OS) const {
- OS << "LSR is examining the following uses:\n";
- for (const LSRUse &LU : Uses) {
- dbgs() << " ";
- LU.print(OS);
- OS << '\n';
- for (const Formula &F : LU.Formulae) {
- OS << " ";
- F.print(OS);
- OS << '\n';
- }
- }
-}
-
-void LSRInstance::print(raw_ostream &OS) const {
- print_factors_and_types(OS);
- print_fixups(OS);
- print_uses(OS);
-}
-
-LLVM_DUMP_METHOD void LSRInstance::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-namespace {
-
-class LoopStrengthReduce : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
-
- LoopStrengthReduce();
-
-private:
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-};
-
-} // end anonymous namespace
-
-LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
- initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
-}
-
-void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
- // We split critical edges, so we change the CFG. However, we do update
- // many analyses if they are around.
- AU.addPreservedID(LoopSimplifyID);
-
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- // Requiring LoopSimplify a second time here prevents IVUsers from running
- // twice, since LoopSimplify was invalidated by running ScalarEvolution.
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<IVUsersWrapperPass>();
- AU.addPreserved<IVUsersWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
-}
-
+ CollectInterestingTypesAndFactors();
+ CollectFixupsAndInitialFormulae();
+ CollectLoopInvariantFixupsAndFormulae();
+
+ if (Uses.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
+
+ // Now use the reuse data to generate a bunch of interesting ways
+ // to formulate the values needed for the uses.
+ GenerateAllReuseFormulae();
+
+ FilterOutUndesirableDedicatedRegisters();
+ NarrowSearchSpaceUsingHeuristics();
+
+ SmallVector<const Formula *, 8> Solution;
+ Solve(Solution);
+
+ // Release memory that is no longer needed.
+ Factors.clear();
+ Types.clear();
+ RegUses.clear();
+
+ if (Solution.empty())
+ return;
+
+#ifndef NDEBUG
+ // Formulae should be legal.
+ for (const LSRUse &LU : Uses) {
+ for (const Formula &F : LU.Formulae)
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ F) && "Illegal formula generated!");
+ }
+#endif
+
+ // Now that we've decided what we want, make it so.
+ ImplementSolution(Solution);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
+ if (Factors.empty() && Types.empty()) return;
+
+ OS << "LSR has identified the following interesting factors and types: ";
+ bool First = true;
+
+ for (int64_t Factor : Factors) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '*' << Factor;
+ }
+
+ for (Type *Ty : Types) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '(' << *Ty << ')';
+ }
+ OS << '\n';
+}
+
+void LSRInstance::print_fixups(raw_ostream &OS) const {
+ OS << "LSR is examining the following fixup sites:\n";
+ for (const LSRUse &LU : Uses)
+ for (const LSRFixup &LF : LU.Fixups) {
+ dbgs() << " ";
+ LF.print(OS);
+ OS << '\n';
+ }
+}
+
+void LSRInstance::print_uses(raw_ostream &OS) const {
+ OS << "LSR is examining the following uses:\n";
+ for (const LSRUse &LU : Uses) {
+ dbgs() << " ";
+ LU.print(OS);
+ OS << '\n';
+ for (const Formula &F : LU.Formulae) {
+ OS << " ";
+ F.print(OS);
+ OS << '\n';
+ }
+ }
+}
+
+void LSRInstance::print(raw_ostream &OS) const {
+ print_factors_and_types(OS);
+ print_fixups(OS);
+ print_uses(OS);
+}
+
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+class LoopStrengthReduce : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopStrengthReduce();
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // end anonymous namespace
+
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
+ initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+}
+
+void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We split critical edges, so we change the CFG. However, we do update
+ // many analyses if they are around.
+ AU.addPreservedID(LoopSimplifyID);
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ // Requiring LoopSimplify a second time here prevents IVUsers from running
+ // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<IVUsersWrapperPass>();
+ AU.addPreserved<IVUsersWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+}
+
using EqualValues = SmallVector<std::tuple<WeakVH, int64_t, DIExpression *>, 4>;
using EqualValuesMap = DenseMap<DbgValueInst *, EqualValues>;
@@ -5829,94 +5829,94 @@ static void DbgApplyEqualValues(EqualValuesMap &DbgValueToEqualSet) {
}
}
-static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
- DominatorTree &DT, LoopInfo &LI,
- const TargetTransformInfo &TTI,
- AssumptionCache &AC, TargetLibraryInfo &TLI,
- MemorySSA *MSSA) {
-
- bool Changed = false;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
-
- // Run the main LSR transformation.
- Changed |=
- LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
-
+static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI,
+ AssumptionCache &AC, TargetLibraryInfo &TLI,
+ MemorySSA *MSSA) {
+
+ bool Changed = false;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+
+ // Run the main LSR transformation.
+ Changed |=
+ LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
+
// Debug preservation - before we start removing anything create equivalence
// sets for the llvm.dbg.value intrinsics.
EqualValuesMap DbgValueToEqualSet;
DbgGatherEqualValues(L, SE, DbgValueToEqualSet);
- // Remove any extra phis created by processing inner loops.
- Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
- if (EnablePhiElim && L->isLoopSimplifyForm()) {
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ // Remove any extra phis created by processing inner loops.
+ Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
+ if (EnablePhiElim && L->isLoopSimplifyForm()) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
SCEVExpander Rewriter(SE, DL, "lsr", false);
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
- unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
- if (numFolded) {
- Changed = true;
- RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
- MSSAU.get());
- DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
- }
- }
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
+ if (numFolded) {
+ Changed = true;
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
+ MSSAU.get());
+ DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
+ }
+ }
DbgApplyEqualValues(DbgValueToEqualSet);
- return Changed;
-}
-
-bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
- if (skipLoop(L))
- return false;
-
- auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent());
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
- return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
-}
-
-PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
- AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-char LoopStrengthReduce::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
- "Loop Strength Reduction", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
- "Loop Strength Reduction", false, false)
-
-Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
+ return Changed;
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+ if (skipLoop(L))
+ return false;
+
+ auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+ return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
+}
+
+PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
+ AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+char LoopStrengthReduce::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+
+Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
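For orientation before the next file's diff: the sketch below is ordinary C++ written purely as an illustration of the kind of rewrite loop strength reduction aims for; it is not code from this patch or from LLVM, and the function names and the stride of 4 are invented for the example.

// Before: the address of a[i * 4] needs a fresh scaled-index computation
// (effectively a multiply) on every iteration.
#include <cstddef>

void zero_every_fourth(int *a, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    a[i * 4] = 0;
}

// After (conceptually what LSR produces): the induction variable becomes the
// address itself and advances by a constant step, so the multiply disappears.
void zero_every_fourth_reduced(int *a, std::size_t n) {
  int *p = a;
  for (std::size_t i = 0; i < n; ++i, p += 4)
    *p = 0;
}

The Rewrite/Expand machinery in the diff above is the part that materializes such replacement expressions and patches each fixup site, inserting a no-op cast when only the type differs.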
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 766b313f4f..495906e1a7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -1,293 +1,293 @@
-//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the loop unroll-and-jam pass. Most of the work is done by
-// Utils/UnrollLoopAndJam.cpp.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the loop unroll-and-jam pass. Most of the work is done by
+// Utils/UnrollLoopAndJam.cpp.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <cassert>
-#include <cstdint>
-#include <vector>
-
-namespace llvm {
-class Instruction;
-class Value;
-} // namespace llvm
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll-and-jam"
-
-/// @{
-/// Metadata attribute names
-static const char *const LLVMLoopUnrollAndJamFollowupAll =
- "llvm.loop.unroll_and_jam.followup_all";
-static const char *const LLVMLoopUnrollAndJamFollowupInner =
- "llvm.loop.unroll_and_jam.followup_inner";
-static const char *const LLVMLoopUnrollAndJamFollowupOuter =
- "llvm.loop.unroll_and_jam.followup_outer";
-static const char *const LLVMLoopUnrollAndJamFollowupRemainderInner =
- "llvm.loop.unroll_and_jam.followup_remainder_inner";
-static const char *const LLVMLoopUnrollAndJamFollowupRemainderOuter =
- "llvm.loop.unroll_and_jam.followup_remainder_outer";
-/// @}
-
-static cl::opt<bool>
- AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
- cl::desc("Allows loops to be unroll-and-jammed."));
-
-static cl::opt<unsigned> UnrollAndJamCount(
- "unroll-and-jam-count", cl::Hidden,
- cl::desc("Use this unroll count for all loops including those with "
- "unroll_and_jam_count pragma values, for testing purposes"));
-
-static cl::opt<unsigned> UnrollAndJamThreshold(
- "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
- cl::desc("Threshold to use for inner loop when doing unroll and jam."));
-
-static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
- "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
- cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
- "unroll_count pragma."));
-
-// Returns the loop hint metadata node with the given name (for example,
-// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
-// returned.
-static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
- if (MDNode *LoopID = L->getLoopID())
- return GetUnrollMetadata(LoopID, Name);
- return nullptr;
-}
-
-// Returns true if the loop has any metadata starting with Prefix. For example, a
-// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
-static bool hasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
- if (MDNode *LoopID = L->getLoopID()) {
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned I = 1, E = LoopID->getNumOperands(); I < E; ++I) {
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
- if (!MD)
- continue;
-
- MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- if (!S)
- continue;
-
- if (S->getString().startswith(Prefix))
- return true;
- }
- }
- return false;
-}
-
-// Returns true if the loop has an unroll_and_jam(enable) pragma.
-static bool hasUnrollAndJamEnablePragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
-}
-
-// If the loop has an unroll_and_jam_count pragma, return the (necessarily
-// positive) value from the pragma. Otherwise return 0.
-static unsigned unrollAndJamCountPragmaValue(const Loop *L) {
- MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
- if (MD) {
- assert(MD->getNumOperands() == 2 &&
- "Unroll count hint metadata should have two operands.");
- unsigned Count =
- mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
- assert(Count >= 1 && "Unroll count must be positive.");
- return Count;
- }
- return 0;
-}
-
-// Returns the estimated size of the unrolled loop.
-static uint64_t
-getUnrollAndJammedLoopSize(unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP) {
- assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
- return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
-
-// Calculates unroll and jam count and writes it to UP.Count. Returns true if
-// unroll count was set explicitly.
-static bool computeUnrollAndJamCount(
- Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
- LoopInfo *LI, ScalarEvolution &SE,
- const SmallPtrSetImpl<const Value *> &EphValues,
- OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
- unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
- unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP,
- TargetTransformInfo::PeelingPreferences &PP) {
- // First up use computeUnrollCount from the loop unroller to get a count
- // for unrolling the outer loop, plus any loops requiring explicit
- // unrolling we leave to the unroller. This uses UP.Threshold /
- // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
- // We have already checked that the loop has no unroll.* pragmas.
- unsigned MaxTripCount = 0;
- bool UseUpperBound = false;
- bool ExplicitUnroll = computeUnrollCount(
- L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
- /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, PP,
- UseUpperBound);
- if (ExplicitUnroll || UseUpperBound) {
- // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
- // for the unroller instead.
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; explicit count set by "
- "computeUnrollCount\n");
- UP.Count = 0;
- return false;
- }
-
- // Override with any explicit Count from the "unroll-and-jam-count" option.
- bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
- if (UserUnrollCount) {
- UP.Count = UnrollAndJamCount;
- UP.Force = true;
- if (UP.AllowRemainder &&
- getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
- getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
- UP.UnrollAndJamInnerLoopThreshold)
- return true;
- }
-
- // Check for unroll_and_jam pragmas
- unsigned PragmaCount = unrollAndJamCountPragmaValue(L);
- if (PragmaCount > 0) {
- UP.Count = PragmaCount;
- UP.Runtime = true;
- UP.Force = true;
- if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
- getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
- getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
- UP.UnrollAndJamInnerLoopThreshold)
- return true;
- }
-
- bool PragmaEnableUnroll = hasUnrollAndJamEnablePragma(L);
- bool ExplicitUnrollAndJamCount = PragmaCount > 0 || UserUnrollCount;
- bool ExplicitUnrollAndJam = PragmaEnableUnroll || ExplicitUnrollAndJamCount;
-
- // If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits.
- if (ExplicitUnrollAndJam)
- UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
-
- if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
- UP.UnrollAndJamInnerLoopThreshold) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't create remainder and "
- "inner loop too large\n");
- UP.Count = 0;
- return false;
- }
-
- // We have a sensible limit for the outer loop, now adjust it for the inner
- // loop and UP.UnrollAndJamInnerLoopThreshold. If the outer limit was set
- // explicitly, we want to stick to it.
- if (!ExplicitUnrollAndJamCount && UP.AllowRemainder) {
- while (UP.Count != 0 && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
- UP.UnrollAndJamInnerLoopThreshold)
- UP.Count--;
- }
-
- // If we are explicitly unroll and jamming, we are done. Otherwise there are a
- // number of extra performance heuristics to check.
- if (ExplicitUnrollAndJam)
- return true;
-
- // If the inner loop count is known and small, leave the entire loop nest to
- // the unroller.
- if (InnerTripCount && InnerLoopSize * InnerTripCount < UP.Threshold) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; small inner loop count is "
- "being left for the unroller\n");
- UP.Count = 0;
- return false;
- }
-
- // Check for situations where UnJ is likely to be unprofitable, including
- // subloops with more than one block.
- if (SubLoop->getBlocks().size() != 1) {
- LLVM_DEBUG(
- dbgs() << "Won't unroll-and-jam; More than one inner loop block\n");
- UP.Count = 0;
- return false;
- }
-
- // Limit to loops where there is something to gain from unrolling and
- // jamming the loop. In this case, look for loads that are invariant in the
- // outer loop and can become shared.
- unsigned NumInvariant = 0;
- for (BasicBlock *BB : SubLoop->getBlocks()) {
- for (Instruction &I : *BB) {
- if (auto *Ld = dyn_cast<LoadInst>(&I)) {
- Value *V = Ld->getPointerOperand();
- const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
- if (SE.isLoopInvariant(LSCEV, L))
- NumInvariant++;
- }
- }
- }
- if (NumInvariant == 0) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; No loop invariant loads\n");
- UP.Count = 0;
- return false;
- }
-
- return false;
-}
-
-static LoopUnrollResult
-tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, const TargetTransformInfo &TTI,
- AssumptionCache &AC, DependenceInfo &DI,
- OptimizationRemarkEmitter &ORE, int OptLevel) {
- TargetTransformInfo::UnrollingPreferences UP =
- gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
- None, None, None, None, None);
- TargetTransformInfo::PeelingPreferences PP =
- gatherPeelingPreferences(L, SE, TTI, None, None);
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+namespace llvm {
+class Instruction;
+class Value;
+} // namespace llvm
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopUnrollAndJamFollowupAll =
+ "llvm.loop.unroll_and_jam.followup_all";
+static const char *const LLVMLoopUnrollAndJamFollowupInner =
+ "llvm.loop.unroll_and_jam.followup_inner";
+static const char *const LLVMLoopUnrollAndJamFollowupOuter =
+ "llvm.loop.unroll_and_jam.followup_outer";
+static const char *const LLVMLoopUnrollAndJamFollowupRemainderInner =
+ "llvm.loop.unroll_and_jam.followup_remainder_inner";
+static const char *const LLVMLoopUnrollAndJamFollowupRemainderOuter =
+ "llvm.loop.unroll_and_jam.followup_remainder_outer";
+/// @}
+
+static cl::opt<bool>
+ AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
+ cl::desc("Allows loops to be unroll-and-jammed."));
+
+static cl::opt<unsigned> UnrollAndJamCount(
+ "unroll-and-jam-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_and_jam_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollAndJamThreshold(
+ "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
+ cl::desc("Threshold to use for inner loop when doing unroll and jam."));
+
+static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
+ "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
+ "unroll_count pragma."));
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has any metadata starting with Prefix. For example, a
+// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
+static bool hasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
+ if (MDNode *LoopID = L->getLoopID()) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned I = 1, E = LoopID->getNumOperands(); I < E; ++I) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (S->getString().startswith(Prefix))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if the loop has an unroll_and_jam(enable) pragma.
+static bool hasUnrollAndJamEnablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
+}
+
+// If the loop has an unroll_and_jam_count pragma, return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned unrollAndJamCountPragmaValue(const Loop *L) {
+ MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
+
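As a usage note (my recollection rather than something this patch states): the "llvm.loop.unroll_and_jam.enable" and "llvm.loop.unroll_and_jam.count" metadata strings checked above are what clang's loop-hint pragmas emit. The snippet below is hypothetical input written for illustration; the pragma spelling is assumed from clang's loop-hint syntax and the names are invented.

// Hypothetical C++ input; not code from this patch.
void scale_rows(float *a, const float *b, int n, int m) {
#pragma clang loop unroll_and_jam_count(4)
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      a[i * m + j] *= b[j];
}

Assuming the front end attaches the count metadata as described, unrollAndJamCountPragmaValue() should then report 4 for the outer loop.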
+// Returns the estimated size of the unrolled loop.
+static uint64_t
+getUnrollAndJammedLoopSize(unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
+
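A worked instance of the estimate above may help; only the formula comes from the patch, the numbers are made up for illustration. With UP.BEInsns = 2, a body of 10 instructions unrolled 4 times is estimated at (10 - 2) * 4 + 2 = 34 instructions rather than 40, because the backedge bookkeeping is paid once, not once per copy.

// Standalone sketch of the same arithmetic; names are local to this example.
#include <cstdint>

constexpr uint64_t unrolledSizeEstimate(unsigned LoopSize, unsigned BEInsns,
                                        unsigned Count) {
  return static_cast<uint64_t>(LoopSize - BEInsns) * Count + BEInsns;
}

static_assert(unrolledSizeEstimate(10, 2, 4) == 34,
              "backedge instructions are counted once, not per unrolled copy");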
+// Calculates unroll and jam count and writes it to UP.Count. Returns true if
+// unroll count was set explicitly.
+static bool computeUnrollAndJamCount(
+ Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
+ LoopInfo *LI, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
+ unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
+ unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP) {
+ // First up use computeUnrollCount from the loop unroller to get a count
+ // for unrolling the outer loop, plus any loops requiring explicit
+ // unrolling we leave to the unroller. This uses UP.Threshold /
+ // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+ // We have already checked that the loop has no unroll.* pragmas.
+ unsigned MaxTripCount = 0;
+ bool UseUpperBound = false;
+ bool ExplicitUnroll = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+ /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, PP,
+ UseUpperBound);
+ if (ExplicitUnroll || UseUpperBound) {
+ // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
+ // for the unroller instead.
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; explicit count set by "
+ "computeUnrollCount\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // Override with any explicit Count from the "unroll-and-jam-count" option.
+ bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollAndJamCount;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ // Check for unroll_and_jam pragmas
+ unsigned PragmaCount = unrollAndJamCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.Force = true;
+ if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ bool PragmaEnableUnroll = hasUnrollAndJamEnablePragma(L);
+ bool ExplicitUnrollAndJamCount = PragmaCount > 0 || UserUnrollCount;
+ bool ExplicitUnrollAndJam = PragmaEnableUnroll || ExplicitUnrollAndJamCount;
+
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits.
+ if (ExplicitUnrollAndJam)
+ UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
+
+ if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't create remainder and "
+ "inner loop too large\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // We have a sensible limit for the outer loop, now adjust it for the inner
+ // loop and UP.UnrollAndJamInnerLoopThreshold. If the outer limit was set
+ // explicitly, we want to stick to it.
+ if (!ExplicitUnrollAndJamCount && UP.AllowRemainder) {
+ while (UP.Count != 0 && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold)
+ UP.Count--;
+ }
+
+ // If we are explicitly unroll and jamming, we are done. Otherwise there are a
+ // number of extra performance heuristics to check.
+ if (ExplicitUnrollAndJam)
+ return true;
+
+ // If the inner loop count is known and small, leave the entire loop nest to
+ // the unroller.
+ if (InnerTripCount && InnerLoopSize * InnerTripCount < UP.Threshold) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; small inner loop count is "
+ "being left for the unroller\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // Check for situations where UnJ is likely to be unprofitable, including
+ // subloops with more than one block.
+ if (SubLoop->getBlocks().size() != 1) {
+ LLVM_DEBUG(
+ dbgs() << "Won't unroll-and-jam; More than one inner loop block\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // Limit to loops where there is something to gain from unrolling and
+ // jamming the loop. In this case, look for loads that are invariant in the
+ // outer loop and can become shared.
+ unsigned NumInvariant = 0;
+ for (BasicBlock *BB : SubLoop->getBlocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ Value *V = Ld->getPointerOperand();
+ const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+ if (SE.isLoopInvariant(LSCEV, L))
+ NumInvariant++;
+ }
+ }
+ }
+ if (NumInvariant == 0) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; No loop invariant loads\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ return false;
+}
+
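Since the heuristic above keys on outer-loop-invariant loads that can become shared, a minimal before/after sketch of the transformation itself may be useful. This is ordinary C++ written for illustration (names invented, n assumed even so no remainder loop is shown), not code produced by this pass.

// Before: every outer iteration re-loads b[j] across the whole inner loop.
void accumulate(float *a, const float *b, int n, int m) {
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      a[i] += b[j];
}

// After unroll-and-jam by 2: the outer loop is unrolled and the two copies of
// the inner body are fused ("jammed"), so each b[j] load is shared by both.
void accumulate_unroll_and_jam(float *a, const float *b, int n, int m) {
  for (int i = 0; i < n; i += 2)
    for (int j = 0; j < m; ++j) {
      float bj = b[j];
      a[i] += bj;
      a[i + 1] += bj;
    }
}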
+static LoopUnrollResult
+tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ AssumptionCache &AC, DependenceInfo &DI,
+ OptimizationRemarkEmitter &ORE, int OptLevel) {
+ TargetTransformInfo::UnrollingPreferences UP =
+ gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
+ None, None, None, None, None);
+ TargetTransformInfo::PeelingPreferences PP =
+ gatherPeelingPreferences(L, SE, TTI, None, None);
TransformationMode EnableMode = hasUnrollAndJamTransformation(L);
if (EnableMode & TM_Disable)
@@ -295,242 +295,242 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
if (EnableMode & TM_ForcedByUser)
UP.UnrollAndJam = true;
- if (AllowUnrollAndJam.getNumOccurrences() > 0)
- UP.UnrollAndJam = AllowUnrollAndJam;
- if (UnrollAndJamThreshold.getNumOccurrences() > 0)
- UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
- // Exit early if unrolling is disabled.
- if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
- return LoopUnrollResult::Unmodified;
-
- LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
- << L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << "\n");
-
- // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
- // the unroller, so long as it does not explicitly have unroll_and_jam
- // metadata. This means #pragma nounroll will disable unroll and jam as well
- // as unrolling
- if (hasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
- !hasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) {
- LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
- LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Approximate the loop size and collect useful info
- unsigned NumInlineCandidates;
- bool NotDuplicatable;
- bool Convergent;
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
- Loop *SubLoop = L->getSubLoops()[0];
- unsigned InnerLoopSize =
- ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
- Convergent, TTI, EphValues, UP.BEInsns);
- unsigned OuterLoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
- TTI, EphValues, UP.BEInsns);
- LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
- LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
- if (NotDuplicatable) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
- "instructions.\n");
- return LoopUnrollResult::Unmodified;
- }
- if (NumInlineCandidates != 0) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
- return LoopUnrollResult::Unmodified;
- }
- if (Convergent) {
- LLVM_DEBUG(
- dbgs() << " Not unrolling loop with convergent instructions.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Save original loop IDs for after the transformation.
- MDNode *OrigOuterLoopID = L->getLoopID();
- MDNode *OrigSubLoopID = SubLoop->getLoopID();
-
- // To assign the loop id of the epilogue, assign it before unrolling it so it
- // is applied to every inner loop of the epilogue. We later apply the loop ID
- // for the jammed inner loop.
- Optional<MDNode *> NewInnerEpilogueLoopID = makeFollowupLoopID(
- OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
- LLVMLoopUnrollAndJamFollowupRemainderInner});
- if (NewInnerEpilogueLoopID.hasValue())
- SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
-
- // Find trip count and trip multiple
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
- unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
- unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
- unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
-
- // Decide if, and by how much, to unroll
- bool IsCountSetExplicitly = computeUnrollAndJamCount(
- L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
- OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP, PP);
- if (UP.Count <= 1)
- return LoopUnrollResult::Unmodified;
- // Unroll factor (Count) must be less than or equal to TripCount.
- if (OuterTripCount && UP.Count > OuterTripCount)
- UP.Count = OuterTripCount;
-
- Loop *EpilogueOuterLoop = nullptr;
- LoopUnrollResult UnrollResult = UnrollAndJamLoop(
- L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI,
- &SE, &DT, &AC, &TTI, &ORE, &EpilogueOuterLoop);
-
- // Assign new loop attributes.
- if (EpilogueOuterLoop) {
- Optional<MDNode *> NewOuterEpilogueLoopID = makeFollowupLoopID(
- OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
- LLVMLoopUnrollAndJamFollowupRemainderOuter});
- if (NewOuterEpilogueLoopID.hasValue())
- EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue());
- }
-
- Optional<MDNode *> NewInnerLoopID =
- makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
- LLVMLoopUnrollAndJamFollowupInner});
- if (NewInnerLoopID.hasValue())
- SubLoop->setLoopID(NewInnerLoopID.getValue());
- else
- SubLoop->setLoopID(OrigSubLoopID);
-
- if (UnrollResult == LoopUnrollResult::PartiallyUnrolled) {
- Optional<MDNode *> NewOuterLoopID = makeFollowupLoopID(
- OrigOuterLoopID,
- {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter});
- if (NewOuterLoopID.hasValue()) {
- L->setLoopID(NewOuterLoopID.getValue());
-
- // Do not setLoopAlreadyUnrolled if a followup was given.
- return UnrollResult;
- }
- }
-
- // If the loop has an unroll count pragma or was unrolled by an explicitly set
- // count, mark it as unrolled to prevent unrolling beyond the requested count.
- if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
- L->setLoopAlreadyUnrolled();
-
- return UnrollResult;
-}
-
-static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI,
- ScalarEvolution &SE,
- const TargetTransformInfo &TTI,
- AssumptionCache &AC, DependenceInfo &DI,
- OptimizationRemarkEmitter &ORE,
- int OptLevel) {
- bool DidSomething = false;
-
- // The loop unroll and jam pass requires loops to be in simplified form, and
- // also needs LCSSA. Since simplification may add new inner loops, it has to
- // run before the legality and profitability checks. This means running the
- // loop unroll and jam pass will simplify all loops, regardless of whether
- // anything ends up being unrolled and jammed.
- for (auto &L : LI) {
- DidSomething |=
- simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
- DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE);
- }
-
- // Add the loop nests in the reverse order of LoopInfo. See method
- // declaration.
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
- LoopUnrollResult Result =
- tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel);
- if (Result != LoopUnrollResult::Unmodified)
- DidSomething = true;
- }
-
- return DidSomething;
-}
-
-namespace {
-
-class LoopUnrollAndJam : public FunctionPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- unsigned OptLevel;
-
- LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) {
- initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DependenceAnalysisWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char LoopUnrollAndJam::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
- "Unroll and Jam loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
- "Unroll and Jam loops", false, false)
-
-Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
- return new LoopUnrollAndJam(OptLevel);
-}
-
-PreservedAnalyses LoopUnrollAndJamPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
- TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
- AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F);
- OptimizationRemarkEmitter &ORE =
- AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel))
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
+ if (AllowUnrollAndJam.getNumOccurrences() > 0)
+ UP.UnrollAndJam = AllowUnrollAndJam;
+ if (UnrollAndJamThreshold.getNumOccurrences() > 0)
+ UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
+ // Exit early if unrolling is disabled.
+ if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
+ return LoopUnrollResult::Unmodified;
+
+ LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
+
+ // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
+ // the unroller, so long as it does not explicitly have unroll_and_jam
+ // metadata. This means #pragma nounroll will disable unroll and jam as well
+ // as unrolling
+ if (hasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+ !hasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) {
+ LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
+ LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Approximate the loop size and collect useful info
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ Loop *SubLoop = L->getSubLoops()[0];
+ unsigned InnerLoopSize =
+ ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
+ Convergent, TTI, EphValues, UP.BEInsns);
+ unsigned OuterLoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
+ LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
+ "instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (NumInlineCandidates != 0) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (Convergent) {
+ LLVM_DEBUG(
+ dbgs() << " Not unrolling loop with convergent instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Save original loop IDs for after the transformation.
+ MDNode *OrigOuterLoopID = L->getLoopID();
+ MDNode *OrigSubLoopID = SubLoop->getLoopID();
+
+ // To assign the loop id of the epilogue, assign it before unrolling it so it
+ // is applied to every inner loop of the epilogue. We later apply the loop ID
+ // for the jammed inner loop.
+ Optional<MDNode *> NewInnerEpilogueLoopID = makeFollowupLoopID(
+ OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+ LLVMLoopUnrollAndJamFollowupRemainderInner});
+ if (NewInnerEpilogueLoopID.hasValue())
+ SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
+
+ // Find trip count and trip multiple
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
+ unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
+ unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
+
+ // Decide if, and by how much, to unroll
+ bool IsCountSetExplicitly = computeUnrollAndJamCount(
+ L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
+ OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP, PP);
+ if (UP.Count <= 1)
+ return LoopUnrollResult::Unmodified;
+ // Unroll factor (Count) must be less than or equal to TripCount.
+ if (OuterTripCount && UP.Count > OuterTripCount)
+ UP.Count = OuterTripCount;
+
+ Loop *EpilogueOuterLoop = nullptr;
+ LoopUnrollResult UnrollResult = UnrollAndJamLoop(
+ L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI,
+ &SE, &DT, &AC, &TTI, &ORE, &EpilogueOuterLoop);
+
+ // Assign new loop attributes.
+ if (EpilogueOuterLoop) {
+ Optional<MDNode *> NewOuterEpilogueLoopID = makeFollowupLoopID(
+ OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+ LLVMLoopUnrollAndJamFollowupRemainderOuter});
+ if (NewOuterEpilogueLoopID.hasValue())
+ EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue());
+ }
+
+ Optional<MDNode *> NewInnerLoopID =
+ makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+ LLVMLoopUnrollAndJamFollowupInner});
+ if (NewInnerLoopID.hasValue())
+ SubLoop->setLoopID(NewInnerLoopID.getValue());
+ else
+ SubLoop->setLoopID(OrigSubLoopID);
+
+ if (UnrollResult == LoopUnrollResult::PartiallyUnrolled) {
+ Optional<MDNode *> NewOuterLoopID = makeFollowupLoopID(
+ OrigOuterLoopID,
+ {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter});
+ if (NewOuterLoopID.hasValue()) {
+ L->setLoopID(NewOuterLoopID.getValue());
+
+ // Do not setLoopAlreadyUnrolled if a followup was given.
+ return UnrollResult;
+ }
+ }
+
+  // If the loop has an unroll count pragma or was unrolled by an explicitly
+  // set count, mark it as unrolled to prevent unrolling beyond what was
+  // requested.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
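For orientation, a minimal source-level sketch of the shape of transformation that UnrollAndJamLoop produces; the loops and arrays below are hypothetical and assume an outer trip count divisible by the unroll factor, so no epilogue is shown:

// Before: a two-deep nest over hypothetical arrays A, B and C.
for (int i = 0; i < N; ++i)
  for (int j = 0; j < M; ++j)
    C[i] += A[i][j] * B[j];

// After unroll-and-jam with UP.Count == 2: two copies of the outer body are
// made and their inner loops are fused ("jammed"), so B[j] is reused across
// both copies of the outer iteration.
for (int i = 0; i < N; i += 2)
  for (int j = 0; j < M; ++j) {
    C[i]     += A[i][j]     * B[j];
    C[i + 1] += A[i + 1][j] * B[j];
  }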
+
+static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE,
+ const TargetTransformInfo &TTI,
+ AssumptionCache &AC, DependenceInfo &DI,
+ OptimizationRemarkEmitter &ORE,
+ int OptLevel) {
+ bool DidSomething = false;
+
+ // The loop unroll and jam pass requires loops to be in simplified form, and
+ // also needs LCSSA. Since simplification may add new inner loops, it has to
+ // run before the legality and profitability checks. This means running the
+ // loop unroll and jam pass will simplify all loops, regardless of whether
+  // anything ends up being unrolled and jammed.
+ for (auto &L : LI) {
+ DidSomething |=
+ simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
+ DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ // Add the loop nests in the reverse order of LoopInfo. See method
+ // declaration.
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+ LoopUnrollResult Result =
+ tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel);
+ if (Result != LoopUnrollResult::Unmodified)
+ DidSomething = true;
+ }
+
+ return DidSomething;
+}
+
+namespace {
+
+class LoopUnrollAndJam : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ unsigned OptLevel;
+
+ LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) {
+ initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnrollAndJam::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+
+Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
+ return new LoopUnrollAndJam(OptLevel);
+}
+
+PreservedAnalyses LoopUnrollAndJamPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F);
+ OptimizationRemarkEmitter &ORE =
+ AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
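A short sketch of driving the factory defined above from client code; the include paths and the surrounding Module/Function plumbing are assumptions for illustration, not something this patch provides:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Scalar.h" // assumed declaration site of the factory

// Runs unroll-and-jam on a single function via the legacy pass manager.
void runUnrollAndJamLegacy(llvm::Module &M, llvm::Function &F) {
  llvm::legacy::FunctionPassManager FPM(&M);
  FPM.add(llvm::createLoopUnrollAndJamPass(/*OptLevel=*/3));
  FPM.doInitialization();
  FPM.run(F);
  FPM.doFinalization();
}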
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 02e1f82b54..1b974576a3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1,1472 +1,1472 @@
-//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements a simple loop unroller. It works best when loops have
-// been canonicalized by the -indvars pass, allowing it to determine the trip
-// counts of loops easily.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopUnrollAnalyzer.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils.h"
+//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller. It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <string>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll"
-
-cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
- "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
- cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
+ "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
+ cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
" the current top-most loop. This is sometimes preferred to reduce"
- " compile time."));
-
-static cl::opt<unsigned>
- UnrollThreshold("unroll-threshold", cl::Hidden,
- cl::desc("The cost threshold for loop unrolling"));
-
+ " compile time."));
+
+static cl::opt<unsigned>
+ UnrollThreshold("unroll-threshold", cl::Hidden,
+ cl::desc("The cost threshold for loop unrolling"));
+
static cl::opt<unsigned>
UnrollOptSizeThreshold(
"unroll-optsize-threshold", cl::init(0), cl::Hidden,
cl::desc("The cost threshold for loop unrolling when optimizing for "
"size"));
-static cl::opt<unsigned> UnrollPartialThreshold(
- "unroll-partial-threshold", cl::Hidden,
- cl::desc("The cost threshold for partial loop unrolling"));
-
-static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
- "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
- cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
- "to the threshold when aggressively unrolling a loop due to the "
- "dynamic cost savings. If completely unrolling a loop will reduce "
- "the total runtime from X to Y, we boost the loop unroll "
- "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
- "X/Y). This limit avoids excessive code bloat."));
-
-static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
- "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
-    cl::desc("Don't allow loop unrolling to simulate more than this number of"
-             " iterations when checking full unroll profitability"));
-
-static cl::opt<unsigned> UnrollCount(
- "unroll-count", cl::Hidden,
- cl::desc("Use this unroll count for all loops including those with "
- "unroll_count pragma values, for testing purposes"));
-
-static cl::opt<unsigned> UnrollMaxCount(
- "unroll-max-count", cl::Hidden,
-    cl::desc("Set the max unroll count for partial and runtime unrolling, for"
-             " testing purposes"));
-
-static cl::opt<unsigned> UnrollFullMaxCount(
- "unroll-full-max-count", cl::Hidden,
- cl::desc(
- "Set the max unroll count for full unrolling, for testing purposes"));
-
-static cl::opt<bool>
- UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
- cl::desc("Allows loops to be partially unrolled until "
- "-unroll-threshold loop size is reached."));
-
-static cl::opt<bool> UnrollAllowRemainder(
- "unroll-allow-remainder", cl::Hidden,
- cl::desc("Allow generation of a loop remainder (extra iterations) "
- "when unrolling a loop."));
-
-static cl::opt<bool>
- UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
- cl::desc("Unroll loops with run-time trip counts"));
-
-static cl::opt<unsigned> UnrollMaxUpperBound(
- "unroll-max-upperbound", cl::init(8), cl::Hidden,
- cl::desc(
- "The max of trip count upper bound that is considered in unrolling"));
-
-static cl::opt<unsigned> PragmaUnrollThreshold(
- "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
- cl::desc("Unrolled size limit for loops with an unroll(full) or "
- "unroll_count pragma."));
-
-static cl::opt<unsigned> FlatLoopTripCountThreshold(
- "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
- cl::desc("If the runtime tripcount for the loop is lower than the "
- "threshold, the loop is considered as flat and will be less "
- "aggressively unrolled."));
-
-static cl::opt<bool> UnrollUnrollRemainder(
- "unroll-remainder", cl::Hidden,
- cl::desc("Allow the loop remainder to be unrolled."));
-
-// This option isn't ever intended to be enabled, it serves to allow
-// experiments to check the assumptions about when this kind of revisit is
-// necessary.
-static cl::opt<bool> UnrollRevisitChildLoops(
- "unroll-revisit-child-loops", cl::Hidden,
- cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
- "This shouldn't typically be needed as child loops (or their "
- "clones) were already visited."));
-
-static cl::opt<unsigned> UnrollThresholdAggressive(
- "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
- cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
- "optimizations"));
-static cl::opt<unsigned>
- UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
- cl::Hidden,
- cl::desc("Default threshold (max size of unrolled "
- "loop), used in all but O3 optimizations"));
-
-/// A magic value for use with the Threshold parameter to indicate
-/// that the loop unroll should be performed regardless of how much
-/// code expansion would result.
-static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
-
-/// Gather the various unrolling parameters based on the defaults, compiler
-/// flags, TTI overrides and user specified parameters.
-TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
- Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
- Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
- Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
- Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
- TargetTransformInfo::UnrollingPreferences UP;
-
- // Set up the defaults
- UP.Threshold =
- OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
- UP.MaxPercentThresholdBoost = 400;
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
+
+static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
+ "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
+ cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
+ "to the threshold when aggressively unrolling a loop due to the "
+ "dynamic cost savings. If completely unrolling a loop will reduce "
+ "the total runtime from X to Y, we boost the loop unroll "
+ "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
+ "X/Y). This limit avoids excessive code bloat."));
+
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+ "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
+    cl::desc("Don't allow loop unrolling to simulate more than this number of"
+             " iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollCount(
+ "unroll-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollMaxCount(
+ "unroll-max-count", cl::Hidden,
+    cl::desc("Set the max unroll count for partial and runtime unrolling, for"
+             " testing purposes"));
+
+static cl::opt<unsigned> UnrollFullMaxCount(
+ "unroll-full-max-count", cl::Hidden,
+ cl::desc(
+ "Set the max unroll count for full unrolling, for testing purposes"));
+
+static cl::opt<bool>
+ UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
+ cl::desc("Allows loops to be partially unrolled until "
+ "-unroll-threshold loop size is reached."));
+
+static cl::opt<bool> UnrollAllowRemainder(
+ "unroll-allow-remainder", cl::Hidden,
+ cl::desc("Allow generation of a loop remainder (extra iterations) "
+ "when unrolling a loop."));
+
+static cl::opt<bool>
+ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("Unroll loops with run-time trip counts"));
+
+static cl::opt<unsigned> UnrollMaxUpperBound(
+ "unroll-max-upperbound", cl::init(8), cl::Hidden,
+ cl::desc(
+ "The max of trip count upper bound that is considered in unrolling"));
+
+static cl::opt<unsigned> PragmaUnrollThreshold(
+ "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll(full) or "
+ "unroll_count pragma."));
+
+static cl::opt<unsigned> FlatLoopTripCountThreshold(
+ "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
+ cl::desc("If the runtime tripcount for the loop is lower than the "
+ "threshold, the loop is considered as flat and will be less "
+ "aggressively unrolled."));
+
+static cl::opt<bool> UnrollUnrollRemainder(
+ "unroll-remainder", cl::Hidden,
+ cl::desc("Allow the loop remainder to be unrolled."));
+
+// This option isn't ever intended to be enabled, it serves to allow
+// experiments to check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
+static cl::opt<unsigned> UnrollThresholdAggressive(
+ "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
+ cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
+ "optimizations"));
+static cl::opt<unsigned>
+ UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
+ cl::Hidden,
+ cl::desc("Default threshold (max size of unrolled "
+ "loop), used in all but O3 optimizations"));
+
+/// A magic value for use with the Threshold parameter to indicate
+/// that the loop unroll should be performed regardless of how much
+/// code expansion would result.
+static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
+
+/// Gather the various unrolling parameters based on the defaults, compiler
+/// flags, TTI overrides and user specified parameters.
+TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
+ TargetTransformInfo::UnrollingPreferences UP;
+
+ // Set up the defaults
+ UP.Threshold =
+ OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
+ UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = UnrollOptSizeThreshold;
- UP.PartialThreshold = 150;
+ UP.PartialThreshold = 150;
UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
- UP.Count = 0;
- UP.DefaultUnrollRuntimeCount = 8;
- UP.MaxCount = std::numeric_limits<unsigned>::max();
- UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
- UP.BEInsns = 2;
- UP.Partial = false;
- UP.Runtime = false;
- UP.AllowRemainder = true;
- UP.UnrollRemainder = false;
- UP.AllowExpensiveTripCount = false;
- UP.Force = false;
- UP.UpperBound = false;
- UP.UnrollAndJam = false;
- UP.UnrollAndJamInnerLoopThreshold = 60;
- UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
-
- // Override with any target specific settings
- TTI.getUnrollingPreferences(L, SE, UP);
-
- // Apply size attributes
- bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
+ UP.Count = 0;
+ UP.DefaultUnrollRuntimeCount = 8;
+ UP.MaxCount = std::numeric_limits<unsigned>::max();
+ UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
+ UP.BEInsns = 2;
+ UP.Partial = false;
+ UP.Runtime = false;
+ UP.AllowRemainder = true;
+ UP.UnrollRemainder = false;
+ UP.AllowExpensiveTripCount = false;
+ UP.Force = false;
+ UP.UpperBound = false;
+ UP.UnrollAndJam = false;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
+ UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
+
+ // Override with any target specific settings
+ TTI.getUnrollingPreferences(L, SE, UP);
+
+ // Apply size attributes
+ bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
// Let unroll hints / pragmas take precedence over PGSO.
(hasUnrollTransformation(L) != TM_ForcedByUser &&
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
PGSOQueryType::IRPass));
- if (OptForSize) {
- UP.Threshold = UP.OptSizeThreshold;
- UP.PartialThreshold = UP.PartialOptSizeThreshold;
- UP.MaxPercentThresholdBoost = 100;
- }
-
- // Apply any user values specified by cl::opt
- if (UnrollThreshold.getNumOccurrences() > 0)
- UP.Threshold = UnrollThreshold;
- if (UnrollPartialThreshold.getNumOccurrences() > 0)
- UP.PartialThreshold = UnrollPartialThreshold;
- if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
- UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
- if (UnrollMaxCount.getNumOccurrences() > 0)
- UP.MaxCount = UnrollMaxCount;
- if (UnrollFullMaxCount.getNumOccurrences() > 0)
- UP.FullUnrollMaxCount = UnrollFullMaxCount;
- if (UnrollAllowPartial.getNumOccurrences() > 0)
- UP.Partial = UnrollAllowPartial;
- if (UnrollAllowRemainder.getNumOccurrences() > 0)
- UP.AllowRemainder = UnrollAllowRemainder;
- if (UnrollRuntime.getNumOccurrences() > 0)
- UP.Runtime = UnrollRuntime;
- if (UnrollMaxUpperBound == 0)
- UP.UpperBound = false;
- if (UnrollUnrollRemainder.getNumOccurrences() > 0)
- UP.UnrollRemainder = UnrollUnrollRemainder;
- if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
- UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
-
- // Apply user values provided by argument
- if (UserThreshold.hasValue()) {
- UP.Threshold = *UserThreshold;
- UP.PartialThreshold = *UserThreshold;
- }
- if (UserCount.hasValue())
- UP.Count = *UserCount;
- if (UserAllowPartial.hasValue())
- UP.Partial = *UserAllowPartial;
- if (UserRuntime.hasValue())
- UP.Runtime = *UserRuntime;
- if (UserUpperBound.hasValue())
- UP.UpperBound = *UserUpperBound;
- if (UserFullUnrollMaxCount.hasValue())
- UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
-
- return UP;
-}
-
-namespace {
-
-/// A struct to densely store the state of an instruction after unrolling at
-/// each iteration.
-///
-/// This is designed to work like a tuple of <Instruction *, int> for the
-/// purposes of hashing and lookup, but to be able to associate two boolean
-/// states with each key.
-struct UnrolledInstState {
- Instruction *I;
- int Iteration : 30;
- unsigned IsFree : 1;
- unsigned IsCounted : 1;
-};
-
-/// Hashing and equality testing for a set of the instruction states.
-struct UnrolledInstStateKeyInfo {
- using PtrInfo = DenseMapInfo<Instruction *>;
- using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
-
- static inline UnrolledInstState getEmptyKey() {
- return {PtrInfo::getEmptyKey(), 0, 0, 0};
- }
-
- static inline UnrolledInstState getTombstoneKey() {
- return {PtrInfo::getTombstoneKey(), 0, 0, 0};
- }
-
- static inline unsigned getHashValue(const UnrolledInstState &S) {
- return PairInfo::getHashValue({S.I, S.Iteration});
- }
-
- static inline bool isEqual(const UnrolledInstState &LHS,
- const UnrolledInstState &RHS) {
- return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
- }
-};
-
-struct EstimatedUnrollCost {
- /// The estimated cost after unrolling.
- unsigned UnrolledCost;
-
- /// The estimated dynamic cost of executing the instructions in the
- /// rolled form.
- unsigned RolledDynamicCost;
-};
-
-} // end anonymous namespace
-
-/// Figure out if the loop is worth full unrolling.
-///
-/// Complete loop unrolling can make some loads constant, and we need to know
-/// if that would expose any further optimization opportunities. This routine
-/// estimates this optimization. It computes cost of unrolled loop
-/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
-/// dynamic cost we mean that we won't count costs of blocks that are known not
-/// to be executed (i.e. if we have a branch in the loop and we know that at the
-/// given iteration its condition would be resolved to true, we won't add up the
-/// cost of the 'false'-block).
-/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
-/// the analysis failed (no benefits expected from the unrolling, or the loop is
-/// too big to analyze), the returned value is None.
-static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
- const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
- const SmallPtrSetImpl<const Value *> &EphValues,
- const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
- unsigned MaxIterationsCountToAnalyze) {
- // We want to be able to scale offsets by the trip count and add more offsets
- // to them without checking for overflows, and we already don't want to
- // analyze *massive* trip counts, so we force the max to be reasonably small.
- assert(MaxIterationsCountToAnalyze <
- (unsigned)(std::numeric_limits<int>::max() / 2) &&
- "The unroll iterations max is too large!");
-
- // Only analyze inner loops. We can't properly estimate cost of nested loops
- // and we won't visit inner loops again anyway.
+ if (OptForSize) {
+ UP.Threshold = UP.OptSizeThreshold;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold;
+ UP.MaxPercentThresholdBoost = 100;
+ }
+
+ // Apply any user values specified by cl::opt
+ if (UnrollThreshold.getNumOccurrences() > 0)
+ UP.Threshold = UnrollThreshold;
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
+ if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
+ UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
+ if (UnrollMaxCount.getNumOccurrences() > 0)
+ UP.MaxCount = UnrollMaxCount;
+ if (UnrollFullMaxCount.getNumOccurrences() > 0)
+ UP.FullUnrollMaxCount = UnrollFullMaxCount;
+ if (UnrollAllowPartial.getNumOccurrences() > 0)
+ UP.Partial = UnrollAllowPartial;
+ if (UnrollAllowRemainder.getNumOccurrences() > 0)
+ UP.AllowRemainder = UnrollAllowRemainder;
+ if (UnrollRuntime.getNumOccurrences() > 0)
+ UP.Runtime = UnrollRuntime;
+ if (UnrollMaxUpperBound == 0)
+ UP.UpperBound = false;
+ if (UnrollUnrollRemainder.getNumOccurrences() > 0)
+ UP.UnrollRemainder = UnrollUnrollRemainder;
+ if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
+ UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
+
+ // Apply user values provided by argument
+ if (UserThreshold.hasValue()) {
+ UP.Threshold = *UserThreshold;
+ UP.PartialThreshold = *UserThreshold;
+ }
+ if (UserCount.hasValue())
+ UP.Count = *UserCount;
+ if (UserAllowPartial.hasValue())
+ UP.Partial = *UserAllowPartial;
+ if (UserRuntime.hasValue())
+ UP.Runtime = *UserRuntime;
+ if (UserUpperBound.hasValue())
+ UP.UpperBound = *UserUpperBound;
+ if (UserFullUnrollMaxCount.hasValue())
+ UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
+
+ return UP;
+}
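A worked example of how the layers above combine; the call itself is hypothetical and only the numeric defaults are taken from this function:

//   auto UP = gatherUnrollingPreferences(L, SE, TTI, /*BFI=*/nullptr,
//                                        /*PSI=*/nullptr, /*OptLevel=*/2,
//                                        None, None, None, None, None, None);
// With no cl::opt overrides this yields UP.Threshold == 150
// (UnrollThresholdDefault) and UP.PartialThreshold == 150; at OptLevel 3 the
// starting Threshold is 300 (UnrollThresholdAggressive). A -unroll-threshold=N
// flag then overrides the default, and a caller-supplied UserThreshold
// overrides both, because the user arguments are applied last.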
+
+namespace {
+
+/// A struct to densely store the state of an instruction after unrolling at
+/// each iteration.
+///
+/// This is designed to work like a tuple of <Instruction *, int> for the
+/// purposes of hashing and lookup, but to be able to associate two boolean
+/// states with each key.
+struct UnrolledInstState {
+ Instruction *I;
+ int Iteration : 30;
+ unsigned IsFree : 1;
+ unsigned IsCounted : 1;
+};
+
+/// Hashing and equality testing for a set of the instruction states.
+struct UnrolledInstStateKeyInfo {
+ using PtrInfo = DenseMapInfo<Instruction *>;
+ using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
+
+ static inline UnrolledInstState getEmptyKey() {
+ return {PtrInfo::getEmptyKey(), 0, 0, 0};
+ }
+
+ static inline UnrolledInstState getTombstoneKey() {
+ return {PtrInfo::getTombstoneKey(), 0, 0, 0};
+ }
+
+ static inline unsigned getHashValue(const UnrolledInstState &S) {
+ return PairInfo::getHashValue({S.I, S.Iteration});
+ }
+
+ static inline bool isEqual(const UnrolledInstState &LHS,
+ const UnrolledInstState &RHS) {
+ return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
+ }
+};
+
+struct EstimatedUnrollCost {
+ /// The estimated cost after unrolling.
+ unsigned UnrolledCost;
+
+ /// The estimated dynamic cost of executing the instructions in the
+ /// rolled form.
+ unsigned RolledDynamicCost;
+};
+
+} // end anonymous namespace
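A minimal usage sketch of the key traits above, mirroring the InstCostMap container used later in this file; the instruction pointer is a placeholder:

llvm::DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
llvm::Instruction *I = /*some in-loop instruction*/ nullptr;
InstCostMap.insert({I, /*Iteration=*/3, /*IsFree=*/1, /*IsCounted=*/0});
// Lookup hashes and compares only {I, Iteration}; the two flag bits in the
// probe key are ignored, so zeros are fine here.
auto It = InstCostMap.find({I, 3, 0, 0});
bool WasFree = It != InstCostMap.end() && It->IsFree;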
+
+/// Figure out if the loop is worth full unrolling.
+///
+/// Complete loop unrolling can make some loads constant, and we need to know
+/// if that would expose any further optimization opportunities. This routine
+/// estimates this optimization. It computes cost of unrolled loop
+/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
+/// dynamic cost we mean that we won't count costs of blocks that are known not
+/// to be executed (i.e. if we have a branch in the loop and we know that at the
+/// given iteration its condition would be resolved to true, we won't add up the
+/// cost of the 'false'-block).
+/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
+/// the analysis failed (no benefits expected from the unrolling, or the loop is
+/// too big to analyze), the returned value is None.
+static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
+ const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
+ unsigned MaxIterationsCountToAnalyze) {
+ // We want to be able to scale offsets by the trip count and add more offsets
+ // to them without checking for overflows, and we already don't want to
+ // analyze *massive* trip counts, so we force the max to be reasonably small.
+ assert(MaxIterationsCountToAnalyze <
+ (unsigned)(std::numeric_limits<int>::max() / 2) &&
+ "The unroll iterations max is too large!");
+
+ // Only analyze inner loops. We can't properly estimate cost of nested loops
+ // and we won't visit inner loops again anyway.
if (!L->isInnermost())
- return None;
-
- // Don't simulate loops with a big or unknown tripcount
- if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
- return None;
-
- SmallSetVector<BasicBlock *, 16> BBWorklist;
- SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
- DenseMap<Value *, Constant *> SimplifiedValues;
- SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
-
- // The estimated cost of the unrolled form of the loop. We try to estimate
- // this by simplifying as much as we can while computing the estimate.
- unsigned UnrolledCost = 0;
-
- // We also track the estimated dynamic (that is, actually executed) cost in
- // the rolled form. This helps identify cases when the savings from unrolling
- // aren't just exposing dead control flows, but actual reduced dynamic
- // instructions due to the simplifications which we expect to occur after
- // unrolling.
- unsigned RolledDynamicCost = 0;
-
- // We track the simplification of each instruction in each iteration. We use
- // this to recursively merge costs into the unrolled cost on-demand so that
- // we don't count the cost of any dead code. This is essentially a map from
- // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
- DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
-
- // A small worklist used to accumulate cost of instructions from each
- // observable and reached root in the loop.
- SmallVector<Instruction *, 16> CostWorklist;
-
- // PHI-used worklist used between iterations while accumulating cost.
- SmallVector<Instruction *, 4> PHIUsedList;
-
- // Helper function to accumulate cost for instructions in the loop.
- auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
- assert(Iteration >= 0 && "Cannot have a negative iteration!");
- assert(CostWorklist.empty() && "Must start with an empty cost list");
- assert(PHIUsedList.empty() && "Must start with an empty phi used list");
- CostWorklist.push_back(&RootI);
+ return None;
+
+ // Don't simulate loops with a big or unknown tripcount
+ if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
+ return None;
+
+ SmallSetVector<BasicBlock *, 16> BBWorklist;
+ SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
+ DenseMap<Value *, Constant *> SimplifiedValues;
+ SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
+
+ // The estimated cost of the unrolled form of the loop. We try to estimate
+ // this by simplifying as much as we can while computing the estimate.
+ unsigned UnrolledCost = 0;
+
+ // We also track the estimated dynamic (that is, actually executed) cost in
+ // the rolled form. This helps identify cases when the savings from unrolling
+ // aren't just exposing dead control flows, but actual reduced dynamic
+ // instructions due to the simplifications which we expect to occur after
+ // unrolling.
+ unsigned RolledDynamicCost = 0;
+
+ // We track the simplification of each instruction in each iteration. We use
+ // this to recursively merge costs into the unrolled cost on-demand so that
+ // we don't count the cost of any dead code. This is essentially a map from
+ // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
+ DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
+
+ // A small worklist used to accumulate cost of instructions from each
+ // observable and reached root in the loop.
+ SmallVector<Instruction *, 16> CostWorklist;
+
+ // PHI-used worklist used between iterations while accumulating cost.
+ SmallVector<Instruction *, 4> PHIUsedList;
+
+ // Helper function to accumulate cost for instructions in the loop.
+ auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
+ assert(Iteration >= 0 && "Cannot have a negative iteration!");
+ assert(CostWorklist.empty() && "Must start with an empty cost list");
+ assert(PHIUsedList.empty() && "Must start with an empty phi used list");
+ CostWorklist.push_back(&RootI);
TargetTransformInfo::TargetCostKind CostKind =
RootI.getFunction()->hasMinSize() ?
TargetTransformInfo::TCK_CodeSize :
TargetTransformInfo::TCK_SizeAndLatency;
- for (;; --Iteration) {
- do {
- Instruction *I = CostWorklist.pop_back_val();
-
- // InstCostMap only uses I and Iteration as a key, the other two values
- // don't matter here.
- auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
- if (CostIter == InstCostMap.end())
- // If an input to a PHI node comes from a dead path through the loop
- // we may have no cost data for it here. What that actually means is
- // that it is free.
- continue;
- auto &Cost = *CostIter;
- if (Cost.IsCounted)
- // Already counted this instruction.
- continue;
-
- // Mark that we are counting the cost of this instruction now.
- Cost.IsCounted = true;
-
- // If this is a PHI node in the loop header, just add it to the PHI set.
- if (auto *PhiI = dyn_cast<PHINode>(I))
- if (PhiI->getParent() == L->getHeader()) {
- assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
- "inherently simplify during unrolling.");
- if (Iteration == 0)
- continue;
-
- // Push the incoming value from the backedge into the PHI used list
- // if it is an in-loop instruction. We'll use this to populate the
- // cost worklist for the next iteration (as we count backwards).
- if (auto *OpI = dyn_cast<Instruction>(
- PhiI->getIncomingValueForBlock(L->getLoopLatch())))
- if (L->contains(OpI))
- PHIUsedList.push_back(OpI);
- continue;
- }
-
- // First accumulate the cost of this instruction.
- if (!Cost.IsFree) {
+ for (;; --Iteration) {
+ do {
+ Instruction *I = CostWorklist.pop_back_val();
+
+ // InstCostMap only uses I and Iteration as a key, the other two values
+ // don't matter here.
+ auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
+ if (CostIter == InstCostMap.end())
+ // If an input to a PHI node comes from a dead path through the loop
+ // we may have no cost data for it here. What that actually means is
+ // that it is free.
+ continue;
+ auto &Cost = *CostIter;
+ if (Cost.IsCounted)
+ // Already counted this instruction.
+ continue;
+
+ // Mark that we are counting the cost of this instruction now.
+ Cost.IsCounted = true;
+
+ // If this is a PHI node in the loop header, just add it to the PHI set.
+ if (auto *PhiI = dyn_cast<PHINode>(I))
+ if (PhiI->getParent() == L->getHeader()) {
+ assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
+ "inherently simplify during unrolling.");
+ if (Iteration == 0)
+ continue;
+
+ // Push the incoming value from the backedge into the PHI used list
+ // if it is an in-loop instruction. We'll use this to populate the
+ // cost worklist for the next iteration (as we count backwards).
+ if (auto *OpI = dyn_cast<Instruction>(
+ PhiI->getIncomingValueForBlock(L->getLoopLatch())))
+ if (L->contains(OpI))
+ PHIUsedList.push_back(OpI);
+ continue;
+ }
+
+ // First accumulate the cost of this instruction.
+ if (!Cost.IsFree) {
UnrolledCost += TTI.getUserCost(I, CostKind);
- LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
- << Iteration << "): ");
- LLVM_DEBUG(I->dump());
- }
-
- // We must count the cost of every operand which is not free,
- // recursively. If we reach a loop PHI node, simply add it to the set
- // to be considered on the next iteration (backwards!).
- for (Value *Op : I->operands()) {
- // Check whether this operand is free due to being a constant or
- // outside the loop.
- auto *OpI = dyn_cast<Instruction>(Op);
- if (!OpI || !L->contains(OpI))
- continue;
-
- // Otherwise accumulate its cost.
- CostWorklist.push_back(OpI);
- }
- } while (!CostWorklist.empty());
-
- if (PHIUsedList.empty())
- // We've exhausted the search.
- break;
-
- assert(Iteration > 0 &&
- "Cannot track PHI-used values past the first iteration!");
- CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
- PHIUsedList.clear();
- }
- };
-
- // Ensure that we don't violate the loop structure invariants relied on by
- // this analysis.
- assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
- assert(L->isLCSSAForm(DT) &&
- "Must have loops in LCSSA form to track live-out values.");
-
- LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
-
+ LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
+ << Iteration << "): ");
+ LLVM_DEBUG(I->dump());
+ }
+
+ // We must count the cost of every operand which is not free,
+ // recursively. If we reach a loop PHI node, simply add it to the set
+ // to be considered on the next iteration (backwards!).
+ for (Value *Op : I->operands()) {
+ // Check whether this operand is free due to being a constant or
+ // outside the loop.
+ auto *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || !L->contains(OpI))
+ continue;
+
+ // Otherwise accumulate its cost.
+ CostWorklist.push_back(OpI);
+ }
+ } while (!CostWorklist.empty());
+
+ if (PHIUsedList.empty())
+ // We've exhausted the search.
+ break;
+
+ assert(Iteration > 0 &&
+ "Cannot track PHI-used values past the first iteration!");
+ CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
+ PHIUsedList.clear();
+ }
+ };
+
+ // Ensure that we don't violate the loop structure invariants relied on by
+ // this analysis.
+ assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+ assert(L->isLCSSAForm(DT) &&
+ "Must have loops in LCSSA form to track live-out values.");
+
+ LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+
TargetTransformInfo::TargetCostKind CostKind =
L->getHeader()->getParent()->hasMinSize() ?
TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
- // Simulate execution of each iteration of the loop counting instructions,
- // which would be simplified.
- // Since the same load will take different values on different iterations,
-  // we literally have to go through all of the loop's iterations.
- for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
- LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
-
- // Prepare for the iteration by collecting any simplified entry or backedge
- // inputs.
- for (Instruction &I : *L->getHeader()) {
- auto *PHI = dyn_cast<PHINode>(&I);
- if (!PHI)
- break;
-
-      // The loop header PHI nodes must have exactly two inputs: one from the
- // loop preheader and one from the loop latch.
- assert(
- PHI->getNumIncomingValues() == 2 &&
- "Must have an incoming value only for the preheader and the latch.");
-
- Value *V = PHI->getIncomingValueForBlock(
- Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
- Constant *C = dyn_cast<Constant>(V);
- if (Iteration != 0 && !C)
- C = SimplifiedValues.lookup(V);
- if (C)
- SimplifiedInputValues.push_back({PHI, C});
- }
-
- // Now clear and re-populate the map for the next iteration.
- SimplifiedValues.clear();
- while (!SimplifiedInputValues.empty())
- SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
-
- UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
-
- BBWorklist.clear();
- BBWorklist.insert(L->getHeader());
- // Note that we *must not* cache the size, this loop grows the worklist.
- for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
- BasicBlock *BB = BBWorklist[Idx];
-
- // Visit all instructions in the given basic block and try to simplify
- // it. We don't change the actual IR, just count optimization
- // opportunities.
- for (Instruction &I : *BB) {
- // These won't get into the final code - don't even try calculating the
- // cost for them.
- if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
- continue;
-
- // Track this instruction's expected baseline cost when executing the
- // rolled loop form.
+ // Simulate execution of each iteration of the loop counting instructions,
+ // which would be simplified.
+ // Since the same load will take different values on different iterations,
+  // we literally have to go through all of the loop's iterations.
+ for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+ LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+ // Prepare for the iteration by collecting any simplified entry or backedge
+ // inputs.
+ for (Instruction &I : *L->getHeader()) {
+ auto *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ break;
+
+      // The loop header PHI nodes must have exactly two inputs: one from the
+ // loop preheader and one from the loop latch.
+ assert(
+ PHI->getNumIncomingValues() == 2 &&
+ "Must have an incoming value only for the preheader and the latch.");
+
+ Value *V = PHI->getIncomingValueForBlock(
+ Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+ Constant *C = dyn_cast<Constant>(V);
+ if (Iteration != 0 && !C)
+ C = SimplifiedValues.lookup(V);
+ if (C)
+ SimplifiedInputValues.push_back({PHI, C});
+ }
+
+ // Now clear and re-populate the map for the next iteration.
+ SimplifiedValues.clear();
+ while (!SimplifiedInputValues.empty())
+ SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
+ UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
+
+ BBWorklist.clear();
+ BBWorklist.insert(L->getHeader());
+ // Note that we *must not* cache the size, this loop grows the worklist.
+ for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+ BasicBlock *BB = BBWorklist[Idx];
+
+ // Visit all instructions in the given basic block and try to simplify
+ // it. We don't change the actual IR, just count optimization
+ // opportunities.
+ for (Instruction &I : *BB) {
+ // These won't get into the final code - don't even try calculating the
+ // cost for them.
+ if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
+ continue;
+
+ // Track this instruction's expected baseline cost when executing the
+ // rolled loop form.
RolledDynamicCost += TTI.getUserCost(&I, CostKind);
-
- // Visit the instruction to analyze its loop cost after unrolling,
- // and if the visitor returns true, mark the instruction as free after
- // unrolling and continue.
- bool IsFree = Analyzer.visit(I);
- bool Inserted = InstCostMap.insert({&I, (int)Iteration,
- (unsigned)IsFree,
- /*IsCounted*/ false}).second;
- (void)Inserted;
- assert(Inserted && "Cannot have a state for an unvisited instruction!");
-
- if (IsFree)
- continue;
-
- // Can't properly model a cost of a call.
- // FIXME: With a proper cost model we should be able to do it.
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- const Function *Callee = CI->getCalledFunction();
- if (!Callee || TTI.isLoweredToCall(Callee)) {
- LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
- return None;
- }
- }
-
- // If the instruction might have a side-effect recursively account for
- // the cost of it and all the instructions leading up to it.
- if (I.mayHaveSideEffects())
- AddCostRecursively(I, Iteration);
-
- // If unrolled body turns out to be too big, bail out.
- if (UnrolledCost > MaxUnrolledLoopSize) {
- LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost
- << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
- << "\n");
- return None;
- }
- }
-
- Instruction *TI = BB->getTerminator();
-
-      // Add in the live successors by first checking whether we have a terminator
- // that may be simplified based on the values simplified by this call.
- BasicBlock *KnownSucc = nullptr;
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional()) {
- if (Constant *SimpleCond =
- SimplifiedValues.lookup(BI->getCondition())) {
- // Just take the first successor if condition is undef
- if (isa<UndefValue>(SimpleCond))
- KnownSucc = BI->getSuccessor(0);
- else if (ConstantInt *SimpleCondVal =
- dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
- }
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- if (Constant *SimpleCond =
- SimplifiedValues.lookup(SI->getCondition())) {
- // Just take the first successor if condition is undef
- if (isa<UndefValue>(SimpleCond))
- KnownSucc = SI->getSuccessor(0);
- else if (ConstantInt *SimpleCondVal =
- dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
- }
- }
- if (KnownSucc) {
- if (L->contains(KnownSucc))
- BBWorklist.insert(KnownSucc);
- else
- ExitWorklist.insert({BB, KnownSucc});
- continue;
- }
-
- // Add BB's successors to the worklist.
- for (BasicBlock *Succ : successors(BB))
- if (L->contains(Succ))
- BBWorklist.insert(Succ);
- else
- ExitWorklist.insert({BB, Succ});
- AddCostRecursively(*TI, Iteration);
- }
-
- // If we found no optimization opportunities on the first iteration, we
- // won't find them on later ones too.
- if (UnrolledCost == RolledDynamicCost) {
- LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost << "\n");
- return None;
- }
- }
-
- while (!ExitWorklist.empty()) {
- BasicBlock *ExitingBB, *ExitBB;
- std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
-
- for (Instruction &I : *ExitBB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- Value *Op = PN->getIncomingValueForBlock(ExitingBB);
- if (auto *OpI = dyn_cast<Instruction>(Op))
- if (L->contains(OpI))
- AddCostRecursively(*OpI, TripCount - 1);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Analysis finished:\n"
- << "UnrolledCost: " << UnrolledCost << ", "
- << "RolledDynamicCost: " << RolledDynamicCost << "\n");
- return {{UnrolledCost, RolledDynamicCost}};
-}
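For intuition about the savings this simulation looks for, a hypothetical source loop (not from this patch) whose control flow folds away once the trip count is known:

static const int Mask[4] = {1, 0, 0, 1};
int SumMasked(const int *Values, int Scale) {
  int Total = 0;
  for (int i = 0; i < 4; ++i)
    if (Mask[i])                  // folds to a constant on each iteration
      Total += Values[i] * Scale;
  return Total;
}
// Simulating iterations 0..3, the load of Mask[i] and the compare simplify to
// constants and are treated as free, and the bodies of iterations 1 and 2 are
// never visited at all, so UnrolledCost stays small while RolledDynamicCost
// still charges the load/compare/branch on every iteration -- exactly the gap
// that the boosting-factor logic below turns into a higher unroll threshold.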
-
-/// ApproximateLoopSize - Approximate the size of the loop.
-unsigned llvm::ApproximateLoopSize(
- const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
- const TargetTransformInfo &TTI,
- const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
- CodeMetrics Metrics;
- for (BasicBlock *BB : L->blocks())
- Metrics.analyzeBasicBlock(BB, TTI, EphValues);
- NumCalls = Metrics.NumInlineCandidates;
- NotDuplicatable = Metrics.notDuplicatable;
- Convergent = Metrics.convergent;
-
- unsigned LoopSize = Metrics.NumInsts;
-
-  // Don't allow an estimate of size zero. This would allow unrolling of loops
- // with huge iteration counts, which is a compile time problem even if it's
- // not a problem for code quality. Also, the code using this size may assume
- // that each loop has at least three instructions (likely a conditional
- // branch, a comparison feeding that branch, and some kind of loop increment
- // feeding that comparison instruction).
- LoopSize = std::max(LoopSize, BEInsns + 1);
-
- return LoopSize;
-}
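A small worked instance of the size floor, with illustrative numbers:

// If Metrics.NumInsts is 2 and BEInsns is 2 (the default picked in
// gatherUnrollingPreferences above), the result is std::max(2u, 2u + 1) == 3,
// so even a near-empty loop is never reported as size zero and cannot be
// unrolled an effectively unbounded number of times.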
-
-// Returns the loop hint metadata node with the given name (for example,
-// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
-// returned.
-static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
- if (MDNode *LoopID = L->getLoopID())
- return GetUnrollMetadata(LoopID, Name);
- return nullptr;
-}
-
-// Returns true if the loop has an unroll(full) pragma.
-static bool hasUnrollFullPragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
-}
-
-// Returns true if the loop has an unroll(enable) pragma. This metadata is used
-// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
-static bool hasUnrollEnablePragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
-}
-
-// Returns true if the loop has a runtime unroll(disable) pragma.
-static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
-}
-
-// If the loop has an unroll_count pragma, return the (necessarily
-// positive) value from the pragma. Otherwise return 0.
-static unsigned unrollCountPragmaValue(const Loop *L) {
- MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
- if (MD) {
- assert(MD->getNumOperands() == 2 &&
- "Unroll count hint metadata should have two operands.");
- unsigned Count =
- mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
- assert(Count >= 1 && "Unroll count must be positive.");
- return Count;
- }
- return 0;
-}
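The metadata shape these helpers look for, sketched as LLVM IR inside a comment; the node numbers are illustrative:

//   br i1 %exitcond, label %exit, label %for.body, !llvm.loop !0
//   ...
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.count", i32 4}
// For a loop whose latch carries !0, unrollCountPragmaValue() returns 4, while
// hasUnrollFullPragma() and hasUnrollEnablePragma() key off the
// "llvm.loop.unroll.full" and "llvm.loop.unroll.enable" strings shown above.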
-
-// Computes the boosting factor for complete unrolling.
-// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
-// be beneficial to fully unroll the loop even if unrolledcost is large. We
-// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
-// the unroll threshold.
-static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
- unsigned MaxPercentThresholdBoost) {
- if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
- return 100;
- else if (Cost.UnrolledCost != 0)
- // The boosting factor is RolledDynamicCost / UnrolledCost
- return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
- MaxPercentThresholdBoost);
- else
- return MaxPercentThresholdBoost;
-}
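A worked numeric example of the boost computation, using the default cap:

// RolledDynamicCost = 1000 and UnrolledCost = 250 give 100 * 1000 / 250 = 400,
// which equals the default MaxPercentThresholdBoost, so the caller ends up
// comparing UnrolledCost against Threshold * 400 / 100, i.e. four times the
// normal limit. An UnrolledCost of 0 returns the cap directly, and a
// RolledDynamicCost large enough to risk overflow degrades to 100 (no boost).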
-
-// Returns loop size estimation for unrolled loop.
-static uint64_t getUnrolledLoopSize(
- unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP) {
- assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
- return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
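And a quick numeric instance of this size estimate:

// LoopSize = 10, UP.BEInsns = 2, UP.Count = 4 gives (10 - 2) * 4 + 2 = 34:
// the backedge bookkeeping is charged once, not once per unrolled iteration.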
-
-// Returns true if unroll count was set explicitly.
-// Calculates unroll count and writes it to UP.Count.
-// Unless IgnoreUser is true, will also use metadata and command-line options
-// that are specific to to the LoopUnroll pass (which, for instance, are
-// irrelevant for the LoopUnrollAndJam pass).
-// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
-// many LoopUnroll-specific options. The shared functionality should be
-// refactored into its own function.
-bool llvm::computeUnrollCount(
- Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
- OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
- bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP,
- TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
-
- // Check for explicit Count.
- // 1st priority is unroll count set by "unroll-count" option.
- bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
- if (UserUnrollCount) {
- UP.Count = UnrollCount;
- UP.AllowExpensiveTripCount = true;
- UP.Force = true;
- if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
- return true;
- }
-
- // 2nd priority is unroll count set by pragma.
- unsigned PragmaCount = unrollCountPragmaValue(L);
- if (PragmaCount > 0) {
- UP.Count = PragmaCount;
- UP.Runtime = true;
- UP.AllowExpensiveTripCount = true;
- UP.Force = true;
- if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
- getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
- return true;
- }
- bool PragmaFullUnroll = hasUnrollFullPragma(L);
- if (PragmaFullUnroll && TripCount != 0) {
- UP.Count = TripCount;
- if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
- return false;
- }
-
- bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
- bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
- PragmaEnableUnroll || UserUnrollCount;
-
- if (ExplicitUnroll && TripCount != 0) {
- // If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
- // value which is larger than the default limits.
- UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
- UP.PartialThreshold =
- std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
- }
-
- // 3rd priority is full unroll count.
- // Full unroll makes sense only when TripCount or its upper bound could be
- // statically calculated.
- // Also we need to check if we exceed FullUnrollMaxCount.
- // If using the upper bound to unroll, TripMultiple should be set to 1 because
-  // we do not know when the loop may exit.
-
- // We can unroll by the upper bound amount if it's generally allowed or if
- // we know that the loop is executed either the upper bound or zero times.
- // (MaxOrZero unrolling keeps only the first loop test, so the number of
- // loop tests remains the same compared to the non-unrolled version, whereas
- // the generic upper bound unrolling keeps all but the last loop test so the
- // number of loop tests goes up which may end up being worse on targets with
- // constrained branch predictor resources so is controlled by an option.)
- // In addition we only unroll small upper bounds.
- unsigned FullUnrollMaxTripCount = MaxTripCount;
- if (!(UP.UpperBound || MaxOrZero) ||
- FullUnrollMaxTripCount > UnrollMaxUpperBound)
- FullUnrollMaxTripCount = 0;
-
- // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
- // compute the former when the latter is zero.
- unsigned ExactTripCount = TripCount;
- assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
-         "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
-
- unsigned FullUnrollTripCount =
- ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
- UP.Count = FullUnrollTripCount;
- if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
- // When computing the unrolled size, note that BEInsns are not replicated
- // like the rest of the loop body.
- if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
- UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
- TripCount = FullUnrollTripCount;
- TripMultiple = UP.UpperBound ? 1 : TripMultiple;
- return ExplicitUnroll;
- } else {
- // The loop isn't that small, but we still can fully unroll it if that
- // helps to remove a significant number of instructions.
- // To check that, run additional analysis on the loop.
- if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, EphValues, TTI,
- UP.Threshold * UP.MaxPercentThresholdBoost / 100,
- UP.MaxIterationsCountToAnalyze)) {
- unsigned Boost =
- getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
- if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
- UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
- TripCount = FullUnrollTripCount;
- TripMultiple = UP.UpperBound ? 1 : TripMultiple;
- return ExplicitUnroll;
- }
- }
- }
- }
-
- // 4th priority is loop peeling.
+
+ // Visit the instruction to analyze its loop cost after unrolling,
+ // and if the visitor returns true, mark the instruction as free after
+ // unrolling and continue.
+ bool IsFree = Analyzer.visit(I);
+ bool Inserted = InstCostMap.insert({&I, (int)Iteration,
+ (unsigned)IsFree,
+ /*IsCounted*/ false}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot have a state for an unvisited instruction!");
+
+ if (IsFree)
+ continue;
+
+      // Can't properly model the cost of a call.
+ // FIXME: With a proper cost model we should be able to do it.
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const Function *Callee = CI->getCalledFunction();
+ if (!Callee || TTI.isLoweredToCall(Callee)) {
+ LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
+ return None;
+ }
+ }
+
+      // If the instruction might have a side-effect, recursively account for
+ // the cost of it and all the instructions leading up to it.
+ if (I.mayHaveSideEffects())
+ AddCostRecursively(I, Iteration);
+
+      // If the unrolled body turns out to be too big, bail out.
+ if (UnrolledCost > MaxUnrolledLoopSize) {
+ LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
+ return None;
+ }
+ }
+
+ Instruction *TI = BB->getTerminator();
+
+    // Add in the live successors by first checking whether we have a terminator
+ // that may be simplified based on the values simplified by this call.
+ BasicBlock *KnownSucc = nullptr;
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional()) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(BI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = BI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(SI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = SI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
+ }
+ }
+ if (KnownSucc) {
+ if (L->contains(KnownSucc))
+ BBWorklist.insert(KnownSucc);
+ else
+ ExitWorklist.insert({BB, KnownSucc});
+ continue;
+ }
+
+ // Add BB's successors to the worklist.
+ for (BasicBlock *Succ : successors(BB))
+ if (L->contains(Succ))
+ BBWorklist.insert(Succ);
+ else
+ ExitWorklist.insert({BB, Succ});
+ AddCostRecursively(*TI, Iteration);
+ }
+
+ // If we found no optimization opportunities on the first iteration, we
+    // won't find them on later ones either.
+ if (UnrolledCost == RolledDynamicCost) {
+ LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
+ return None;
+ }
+ }
+
+ while (!ExitWorklist.empty()) {
+ BasicBlock *ExitingBB, *ExitBB;
+ std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
+
+ for (Instruction &I : *ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *Op = PN->getIncomingValueForBlock(ExitingBB);
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (L->contains(OpI))
+ AddCostRecursively(*OpI, TripCount - 1);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ return {{UnrolledCost, RolledDynamicCost}};
+}
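+
+// Editorial illustration (hypothetical numbers, not part of the pass): if the
+// analysis above returns {UnrolledCost: 120, RolledDynamicCost: 400}, the
+// fully unrolled body is estimated at 120 cost units while the rolled loop,
+// executed dynamically over all iterations, is estimated at 400, so full
+// unrolling looks attractive; computeUnrollCount feeds this pair into
+// getFullUnrollBoostingFactor below.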
+
+/// ApproximateLoopSize - Approximate the size of the loop.
+unsigned llvm::ApproximateLoopSize(
+ const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+ NumCalls = Metrics.NumInlineCandidates;
+ NotDuplicatable = Metrics.notDuplicatable;
+ Convergent = Metrics.convergent;
+
+ unsigned LoopSize = Metrics.NumInsts;
+
+  // Don't allow an estimate of size zero. This would allow unrolling of loops
+ // with huge iteration counts, which is a compile time problem even if it's
+ // not a problem for code quality. Also, the code using this size may assume
+ // that each loop has at least three instructions (likely a conditional
+ // branch, a comparison feeding that branch, and some kind of loop increment
+ // feeding that comparison instruction).
+ LoopSize = std::max(LoopSize, BEInsns + 1);
+
+ return LoopSize;
+}
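+
+// Editorial illustration (hypothetical numbers): with BEInsns == 2, a loop
+// whose blocks contain only two countable instructions is still reported as
+// size 3 by the clamp above, so the unrolled-size estimates below can never
+// degenerate to zero.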
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has an unroll(full) pragma.
+static bool hasUnrollFullPragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
+}
+
+// Returns true if the loop has an unroll(enable) pragma. This metadata is used
+// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
+static bool hasUnrollEnablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
+}
+
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
+
+// If loop has an unroll_count pragma return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned unrollCountPragmaValue(const Loop *L) {
+ MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
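+
+// Editorial illustration (assumed IR shape, not taken from this file): a loop
+// annotated with "#pragma clang loop unroll_count(4)" typically carries loop
+// metadata of the form
+//   br i1 %cond, label %body, label %exit, !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll.count", i32 4}
+// and unrollCountPragmaValue above extracts the i32 operand (here 4) from the
+// second node.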
+
+// Computes the boosting factor for complete unrolling.
+// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
+// be beneficial to fully unroll the loop even if unrolledcost is large. We
+// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
+// the unroll threshold.
+static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
+ unsigned MaxPercentThresholdBoost) {
+ if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
+ return 100;
+ else if (Cost.UnrolledCost != 0)
+ // The boosting factor is RolledDynamicCost / UnrolledCost
+ return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
+ MaxPercentThresholdBoost);
+ else
+ return MaxPercentThresholdBoost;
+}
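+
+// Editorial illustration (hypothetical numbers): with RolledDynamicCost == 300
+// and UnrolledCost == 100, the raw ratio gives 100 * 300 / 100 == 300, which
+// is then capped at MaxPercentThresholdBoost; computeUnrollCount later accepts
+// a full unroll when Cost->UnrolledCost < UP.Threshold * Boost / 100.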
+
+// Returns loop size estimation for unrolled loop.
+static uint64_t getUnrolledLoopSize(
+ unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
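+
+// Editorial illustration (hypothetical numbers): for LoopSize == 12,
+// UP.BEInsns == 2 and UP.Count == 4, the estimate is (12 - 2) * 4 + 2 == 42,
+// reflecting that the backedge bookkeeping instructions are not replicated
+// with each unrolled iteration.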
+
+// Returns true if unroll count was set explicitly.
+// Calculates unroll count and writes it to UP.Count.
+// Also uses metadata and command-line options that are specific to the
+// LoopUnroll pass (which, for instance, are irrelevant for the
+// LoopUnrollAndJam pass).
+// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
+// many LoopUnroll-specific options. The shared functionality should be
+// refactored into its own function.
+bool llvm::computeUnrollCount(
+ Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
+ bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
+
+ // Check for explicit Count.
+ // 1st priority is unroll count set by "unroll-count" option.
+ bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollCount;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+ return true;
+ }
+
+ // 2nd priority is unroll count set by pragma.
+ unsigned PragmaCount = unrollCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
+ getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return true;
+ }
+ bool PragmaFullUnroll = hasUnrollFullPragma(L);
+ if (PragmaFullUnroll && TripCount != 0) {
+ UP.Count = TripCount;
+ if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return false;
+ }
+
+ bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
+ bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
+ PragmaEnableUnroll || UserUnrollCount;
+
+ if (ExplicitUnroll && TripCount != 0) {
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
+ // value which is larger than the default limits.
+ UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
+ UP.PartialThreshold =
+ std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
+ }
+
+ // 3rd priority is full unroll count.
+ // Full unroll makes sense only when TripCount or its upper bound could be
+ // statically calculated.
+  // We also need to check that we do not exceed FullUnrollMaxCount.
+  // If using the upper bound to unroll, TripMultiple should be set to 1 because
+  // we do not know when the loop may exit.
+
+ // We can unroll by the upper bound amount if it's generally allowed or if
+ // we know that the loop is executed either the upper bound or zero times.
+ // (MaxOrZero unrolling keeps only the first loop test, so the number of
+ // loop tests remains the same compared to the non-unrolled version, whereas
+  // the generic upper bound unrolling keeps all but the last loop test, so the
+  // number of loop tests goes up, which may end up being worse on targets with
+  // constrained branch predictor resources, and so is controlled by an option.)
+ // In addition we only unroll small upper bounds.
+ unsigned FullUnrollMaxTripCount = MaxTripCount;
+ if (!(UP.UpperBound || MaxOrZero) ||
+ FullUnrollMaxTripCount > UnrollMaxUpperBound)
+ FullUnrollMaxTripCount = 0;
+
+ // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
+ // compute the former when the latter is zero.
+ unsigned ExactTripCount = TripCount;
+ assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
+         "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
+
+ unsigned FullUnrollTripCount =
+ ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
+ UP.Count = FullUnrollTripCount;
+ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
+ // When computing the unrolled size, note that BEInsns are not replicated
+ // like the rest of the loop body.
+ if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ } else {
+ // The loop isn't that small, but we still can fully unroll it if that
+ // helps to remove a significant number of instructions.
+ // To check that, run additional analysis on the loop.
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+ L, FullUnrollTripCount, DT, SE, EphValues, TTI,
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100,
+ UP.MaxIterationsCountToAnalyze)) {
+ unsigned Boost =
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ }
+ }
+ }
+ }
+
+ // 4th priority is loop peeling.
computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold);
- if (PP.PeelCount) {
- UP.Runtime = false;
- UP.Count = 1;
- return ExplicitUnroll;
- }
-
- // 5th priority is partial unrolling.
- // Try partial unroll only when TripCount could be statically calculated.
- if (TripCount) {
- UP.Partial |= ExplicitUnroll;
- if (!UP.Partial) {
- LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
- UP.Count = 0;
- return false;
- }
- if (UP.Count == 0)
- UP.Count = TripCount;
- if (UP.PartialThreshold != NoThreshold) {
- // Reduce unroll count to be modulo of TripCount for partial unrolling.
- if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
- UP.Count =
- (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
- (LoopSize - UP.BEInsns);
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
- while (UP.Count != 0 && TripCount % UP.Count != 0)
- UP.Count--;
- if (UP.AllowRemainder && UP.Count <= 1) {
- // If there is no Count that is modulo of TripCount, set Count to
- // largest power-of-two factor that satisfies the threshold limit.
- // As we'll create fixup loop, do the type of unrolling only if
- // remainder loop is allowed.
- UP.Count = UP.DefaultUnrollRuntimeCount;
- while (UP.Count != 0 &&
- getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
- UP.Count >>= 1;
- }
- if (UP.Count < 2) {
- if (PragmaEnableUnroll)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "UnrollAsDirectedTooLarge",
- L->getStartLoc(), L->getHeader())
- << "Unable to unroll loop as directed by unroll(enable) "
- "pragma "
- "because unrolled size is too large.";
- });
- UP.Count = 0;
- }
- } else {
- UP.Count = TripCount;
- }
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
- if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
- UP.Count != TripCount)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "FullUnrollAsDirectedTooLarge",
- L->getStartLoc(), L->getHeader())
- << "Unable to fully unroll loop as directed by unroll pragma "
- "because "
- "unrolled size is too large.";
- });
- LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
- << "\n");
- return ExplicitUnroll;
- }
- assert(TripCount == 0 &&
- "All cases when TripCount is constant should be covered here.");
- if (PragmaFullUnroll)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
- L->getStartLoc(), L->getHeader())
- << "Unable to fully unroll loop as directed by unroll(full) "
- "pragma "
- "because loop has a runtime trip count.";
- });
-
- // 6th priority is runtime unrolling.
- // Don't unroll a runtime trip count loop when it is disabled.
- if (hasRuntimeUnrollDisablePragma(L)) {
- UP.Count = 0;
- return false;
- }
-
- // Don't unroll a small upper bound loop unless user or TTI asked to do so.
- if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
- UP.Count = 0;
- return false;
- }
-
- // Check if the runtime trip count is too small when profile is available.
- if (L->getHeader()->getParent()->hasProfileData()) {
- if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
- if (*ProfileTripCount < FlatLoopTripCountThreshold)
- return false;
- else
- UP.AllowExpensiveTripCount = true;
- }
- }
-
- // Reduce count based on the type of unrolling and the threshold values.
- UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
- if (!UP.Runtime) {
- LLVM_DEBUG(
- dbgs() << " will not try to unroll loop with runtime trip count "
- << "-unroll-runtime not given\n");
- UP.Count = 0;
- return false;
- }
- if (UP.Count == 0)
- UP.Count = UP.DefaultUnrollRuntimeCount;
-
- // Reduce unroll count to be the largest power-of-two factor of
- // the original count which satisfies the threshold limit.
- while (UP.Count != 0 &&
- getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
- UP.Count >>= 1;
-
-#ifndef NDEBUG
- unsigned OrigCount = UP.Count;
-#endif
-
- if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
- while (UP.Count != 0 && TripMultiple % UP.Count != 0)
- UP.Count >>= 1;
- LLVM_DEBUG(
-        dbgs() << "Remainder loop is restricted (that could be architecture "
- "specific or because the loop contains a convergent "
- "instruction), so unroll count must divide the trip "
- "multiple, "
- << TripMultiple << ". Reducing unroll count from " << OrigCount
- << " to " << UP.Count << ".\n");
-
- using namespace ore;
-
- if (PragmaCount > 0 && !UP.AllowRemainder)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "DifferentUnrollCountFromDirected",
- L->getStartLoc(), L->getHeader())
- << "Unable to unroll loop the number of times directed by "
- "unroll_count pragma because remainder loop is restricted "
-                 "(that could be architecture specific or because the loop "
- "contains a convergent instruction) and so must have an "
- "unroll "
- "count that divides the loop trip multiple of "
- << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
- << NV("UnrollCount", UP.Count) << " time(s).";
- });
- }
-
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
-
- if (MaxTripCount && UP.Count > MaxTripCount)
- UP.Count = MaxTripCount;
-
- LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
- << "\n");
- if (UP.Count < 2)
- UP.Count = 0;
- return ExplicitUnroll;
-}
-
-static LoopUnrollResult tryToUnrollLoop(
- Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
- const TargetTransformInfo &TTI, AssumptionCache &AC,
- OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
- bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
- Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
- Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
- Optional<bool> ProvidedAllowPeeling,
- Optional<bool> ProvidedAllowProfileBasedPeeling,
- Optional<unsigned> ProvidedFullUnrollMaxCount) {
- LLVM_DEBUG(dbgs() << "Loop Unroll: F["
- << L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << "\n");
- TransformationMode TM = hasUnrollTransformation(L);
- if (TM & TM_Disable)
- return LoopUnrollResult::Unmodified;
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(
- dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
- return LoopUnrollResult::Unmodified;
- }
-
+ if (PP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
+ // Try partial unroll only when TripCount could be statically calculated.
+ if (TripCount) {
+ UP.Partial |= ExplicitUnroll;
+ if (!UP.Partial) {
+ LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = TripCount;
+ if (UP.PartialThreshold != NoThreshold) {
+ // Reduce unroll count to be modulo of TripCount for partial unrolling.
+ if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count =
+ (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
+ (LoopSize - UP.BEInsns);
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ while (UP.Count != 0 && TripCount % UP.Count != 0)
+ UP.Count--;
+ if (UP.AllowRemainder && UP.Count <= 1) {
+ // If there is no Count that is modulo of TripCount, set Count to
+ // largest power-of-two factor that satisfies the threshold limit.
+ // As we'll create fixup loop, do the type of unrolling only if
+ // remainder loop is allowed.
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+ }
+ if (UP.Count < 2) {
+ if (PragmaEnableUnroll)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop as directed by unroll(enable) "
+ "pragma "
+ "because unrolled size is too large.";
+ });
+ UP.Count = 0;
+ }
+ } else {
+ UP.Count = TripCount;
+ }
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+ UP.Count != TripCount)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "FullUnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll pragma "
+ "because "
+ "unrolled size is too large.";
+ });
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
+ return ExplicitUnroll;
+ }
+ assert(TripCount == 0 &&
+ "All cases when TripCount is constant should be covered here.");
+ if (PragmaFullUnroll)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll(full) "
+ "pragma "
+ "because loop has a runtime trip count.";
+ });
+
+ // 6th priority is runtime unrolling.
+ // Don't unroll a runtime trip count loop when it is disabled.
+ if (hasRuntimeUnrollDisablePragma(L)) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Don't unroll a small upper bound loop unless user or TTI asked to do so.
+ if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Check if the runtime trip count is too small when profile is available.
+ if (L->getHeader()->getParent()->hasProfileData()) {
+ if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
+ if (*ProfileTripCount < FlatLoopTripCountThreshold)
+ return false;
+ else
+ UP.AllowExpensiveTripCount = true;
+ }
+ }
+
+ // Reduce count based on the type of unrolling and the threshold values.
+ UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
+ if (!UP.Runtime) {
+ LLVM_DEBUG(
+ dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+
+ // Reduce unroll count to be the largest power-of-two factor of
+ // the original count which satisfies the threshold limit.
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+
+#ifndef NDEBUG
+ unsigned OrigCount = UP.Count;
+#endif
+
+ if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
+ while (UP.Count != 0 && TripMultiple % UP.Count != 0)
+ UP.Count >>= 1;
+ LLVM_DEBUG(
+        dbgs() << "Remainder loop is restricted (that could be architecture "
+ "specific or because the loop contains a convergent "
+ "instruction), so unroll count must divide the trip "
+ "multiple, "
+ << TripMultiple << ". Reducing unroll count from " << OrigCount
+ << " to " << UP.Count << ".\n");
+
+ using namespace ore;
+
+ if (PragmaCount > 0 && !UP.AllowRemainder)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DifferentUnrollCountFromDirected",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because remainder loop is restricted "
+                 "(that could be architecture specific or because the loop "
+ "contains a convergent instruction) and so must have an "
+ "unroll "
+ "count that divides the loop trip multiple of "
+ << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
+ << NV("UnrollCount", UP.Count) << " time(s).";
+ });
+ }
+
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+
+ if (MaxTripCount && UP.Count > MaxTripCount)
+ UP.Count = MaxTripCount;
+
+ LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
+ << "\n");
+ if (UP.Count < 2)
+ UP.Count = 0;
+ return ExplicitUnroll;
+}
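+
+// Editorial illustration of the runtime-unroll sizing above (hypothetical
+// numbers): with LoopSize == 40, UP.BEInsns == 2, UP.PartialThreshold == 150
+// and UP.DefaultUnrollRuntimeCount == 8, the power-of-two reduction tries
+// (40 - 2) * 8 + 2 == 306, then (40 - 2) * 4 + 2 == 154, then
+// (40 - 2) * 2 + 2 == 78, so UP.Count settles at 2 before the TripMultiple
+// and MaxCount adjustments.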
+
+static LoopUnrollResult tryToUnrollLoop(
+ Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
+ const TargetTransformInfo &TTI, AssumptionCache &AC,
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
+ bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
+ Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
+ Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
+ Optional<bool> ProvidedAllowPeeling,
+ Optional<bool> ProvidedAllowProfileBasedPeeling,
+ Optional<unsigned> ProvidedFullUnrollMaxCount) {
+ LLVM_DEBUG(dbgs() << "Loop Unroll: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
+ TransformationMode TM = hasUnrollTransformation(L);
+ if (TM & TM_Disable)
+ return LoopUnrollResult::Unmodified;
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(
+ dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
// When automatic unrolling is disabled, do not unroll unless overridden for
- // this loop.
- if (OnlyWhenForced && !(TM & TM_Enable))
- return LoopUnrollResult::Unmodified;
-
- bool OptForSize = L->getHeader()->getParent()->hasOptSize();
- unsigned NumInlineCandidates;
- bool NotDuplicatable;
- bool Convergent;
- TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
- ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedFullUnrollMaxCount);
- TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
+ // this loop.
+ if (OnlyWhenForced && !(TM & TM_Enable))
+ return LoopUnrollResult::Unmodified;
+
+ bool OptForSize = L->getHeader()->getParent()->hasOptSize();
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
+ ProvidedFullUnrollMaxCount);
+ TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
-
- // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
- // as threshold later on.
- if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
- !OptForSize)
- return LoopUnrollResult::Unmodified;
-
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
-
- unsigned LoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
- TTI, EphValues, UP.BEInsns);
- LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
- if (NotDuplicatable) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
- << " instructions.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
- // later), to (fully) unroll loops, if it does not increase code size.
- if (OptForSize)
- UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
-
- if (NumInlineCandidates != 0) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Find trip count and trip multiple if count is not available
- unsigned TripCount = 0;
- unsigned TripMultiple = 1;
- // If there are multiple exiting blocks but one of them is the latch, use the
- // latch for the trip count estimation. Otherwise insist on a single exiting
- // block for the trip count estimation.
- BasicBlock *ExitingBlock = L->getLoopLatch();
- if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
- ExitingBlock = L->getExitingBlock();
- if (ExitingBlock) {
- TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
- TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
- }
-
- // If the loop contains a convergent operation, the prelude we'd add
- // to do the first few instructions before we hit the unrolled loop
- // is unsafe -- it adds a control-flow dependency to the convergent
+
+ // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
+ // as threshold later on.
+ if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
+ !OptForSize)
+ return LoopUnrollResult::Unmodified;
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+
+ unsigned LoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
+ // later), to (fully) unroll loops, if it does not increase code size.
+ if (OptForSize)
+ UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
+
+ if (NumInlineCandidates != 0) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Find trip count and trip multiple if count is not available
+ unsigned TripCount = 0;
+ unsigned TripMultiple = 1;
+ // If there are multiple exiting blocks but one of them is the latch, use the
+ // latch for the trip count estimation. Otherwise insist on a single exiting
+ // block for the trip count estimation.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+ ExitingBlock = L->getExitingBlock();
+ if (ExitingBlock) {
+ TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
+ }
+
+ // If the loop contains a convergent operation, the prelude we'd add
+ // to do the first few instructions before we hit the unrolled loop
+ // is unsafe -- it adds a control-flow dependency to the convergent
// operation. Therefore restrict remainder loop (try unrolling without).
- //
- // TODO: This is quite conservative. In practice, convergent_op()
- // is likely to be called unconditionally in the loop. In this
- // case, the program would be ill-formed (on most architectures)
- // unless n were the same on all threads in a thread group.
- // Assuming n is the same on all threads, any kind of unrolling is
- // safe. But currently llvm's notion of convergence isn't powerful
- // enough to express this.
- if (Convergent)
- UP.AllowRemainder = false;
-
- // Try to find the trip count upper bound if we cannot find the exact trip
- // count.
- unsigned MaxTripCount = 0;
- bool MaxOrZero = false;
- if (!TripCount) {
- MaxTripCount = SE.getSmallConstantMaxTripCount(L);
- MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
- }
-
- // computeUnrollCount() decides whether it is beneficial to use upper bound to
- // fully unroll the loop.
- bool UseUpperBound = false;
- bool IsCountSetExplicitly = computeUnrollCount(
- L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
- TripMultiple, LoopSize, UP, PP, UseUpperBound);
- if (!UP.Count)
- return LoopUnrollResult::Unmodified;
- // Unroll factor (Count) must be less or equal to TripCount.
- if (TripCount && UP.Count > TripCount)
- UP.Count = TripCount;
-
- // Save loop properties before it is transformed.
- MDNode *OrigLoopID = L->getLoopID();
-
- // Unroll the loop.
- Loop *RemainderLoop = nullptr;
- LoopUnrollResult UnrollResult = UnrollLoop(
- L,
- {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
- UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder,
- ForgetAllSCEV},
- LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
- if (UnrollResult == LoopUnrollResult::Unmodified)
- return LoopUnrollResult::Unmodified;
-
- if (RemainderLoop) {
- Optional<MDNode *> RemainderLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
- LLVMLoopUnrollFollowupRemainder});
- if (RemainderLoopID.hasValue())
- RemainderLoop->setLoopID(RemainderLoopID.getValue());
- }
-
- if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
- Optional<MDNode *> NewLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
- LLVMLoopUnrollFollowupUnrolled});
- if (NewLoopID.hasValue()) {
- L->setLoopID(NewLoopID.getValue());
-
- // Do not setLoopAlreadyUnrolled if loop attributes have been specified
- // explicitly.
- return UnrollResult;
- }
- }
-
- // If loop has an unroll count pragma or unrolled by explicitly set count
- // mark loop as unrolled to prevent unrolling beyond that requested.
- // If the loop was peeled, we already "used up" the profile information
- // we had, so we don't want to unroll or peel again.
- if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
- (IsCountSetExplicitly || (PP.PeelProfiledIterations && PP.PeelCount)))
- L->setLoopAlreadyUnrolled();
-
- return UnrollResult;
-}
-
-namespace {
-
-class LoopUnroll : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
-
- int OptLevel;
-
- /// If false, use a cost model to determine whether unrolling of a loop is
- /// profitable. If true, only loops that explicitly request unrolling via
- /// metadata are considered. All other loops are skipped.
- bool OnlyWhenForced;
-
- /// If false, when SCEV is invalidated, only forget everything in the
- /// top-most loop (call forgetTopMostLoop), of the loop being processed.
- /// Otherwise, forgetAllLoops and rebuild when needed next.
- bool ForgetAllSCEV;
-
- Optional<unsigned> ProvidedCount;
- Optional<unsigned> ProvidedThreshold;
- Optional<bool> ProvidedAllowPartial;
- Optional<bool> ProvidedRuntime;
- Optional<bool> ProvidedUpperBound;
- Optional<bool> ProvidedAllowPeeling;
- Optional<bool> ProvidedAllowProfileBasedPeeling;
- Optional<unsigned> ProvidedFullUnrollMaxCount;
-
- LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
- bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
- Optional<unsigned> Count = None,
- Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
- Optional<bool> UpperBound = None,
- Optional<bool> AllowPeeling = None,
- Optional<bool> AllowProfileBasedPeeling = None,
- Optional<unsigned> ProvidedFullUnrollMaxCount = None)
- : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
- ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
- ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
- ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
- ProvidedAllowPeeling(AllowPeeling),
- ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
- ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
- initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
- Function &F = *L->getHeader()->getParent();
-
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(&F);
- bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-
- LoopUnrollResult Result = tryToUnrollLoop(
- L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
- OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
- ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
- ProvidedFullUnrollMaxCount);
-
- if (Result == LoopUnrollResult::FullyUnrolled)
- LPM.markLoopAsDeleted(*L);
-
- return Result != LoopUnrollResult::Unmodified;
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- // FIXME: Loop passes are required to preserve domtree, and for now we just
- // recreate dom info if anything gets unrolled.
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char LoopUnroll::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-
-Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
- bool ForgetAllSCEV, int Threshold, int Count,
- int AllowPartial, int Runtime, int UpperBound,
- int AllowPeeling) {
- // TODO: It would make more sense for this function to take the optionals
- // directly, but that's dangerous since it would silently break out of tree
- // callers.
- return new LoopUnroll(
- OptLevel, OnlyWhenForced, ForgetAllSCEV,
- Threshold == -1 ? None : Optional<unsigned>(Threshold),
- Count == -1 ? None : Optional<unsigned>(Count),
- AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
- Runtime == -1 ? None : Optional<bool>(Runtime),
- UpperBound == -1 ? None : Optional<bool>(UpperBound),
- AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
-}
-
-Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
- bool ForgetAllSCEV) {
- return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
+ //
+ // TODO: This is quite conservative. In practice, convergent_op()
+ // is likely to be called unconditionally in the loop. In this
+ // case, the program would be ill-formed (on most architectures)
+ // unless n were the same on all threads in a thread group.
+ // Assuming n is the same on all threads, any kind of unrolling is
+ // safe. But currently llvm's notion of convergence isn't powerful
+ // enough to express this.
+ if (Convergent)
+ UP.AllowRemainder = false;
+
+ // Try to find the trip count upper bound if we cannot find the exact trip
+ // count.
+ unsigned MaxTripCount = 0;
+ bool MaxOrZero = false;
+ if (!TripCount) {
+ MaxTripCount = SE.getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
+ }
+
+ // computeUnrollCount() decides whether it is beneficial to use upper bound to
+ // fully unroll the loop.
+ bool UseUpperBound = false;
+ bool IsCountSetExplicitly = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
+ TripMultiple, LoopSize, UP, PP, UseUpperBound);
+ if (!UP.Count)
+ return LoopUnrollResult::Unmodified;
+ // Unroll factor (Count) must be less or equal to TripCount.
+ if (TripCount && UP.Count > TripCount)
+ UP.Count = TripCount;
+
+ // Save loop properties before it is transformed.
+ MDNode *OrigLoopID = L->getLoopID();
+
+ // Unroll the loop.
+ Loop *RemainderLoop = nullptr;
+ LoopUnrollResult UnrollResult = UnrollLoop(
+ L,
+ {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
+ UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder,
+ ForgetAllSCEV},
+ LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
+ if (UnrollResult == LoopUnrollResult::Unmodified)
+ return LoopUnrollResult::Unmodified;
+
+ if (RemainderLoop) {
+ Optional<MDNode *> RemainderLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
+ LLVMLoopUnrollFollowupRemainder});
+ if (RemainderLoopID.hasValue())
+ RemainderLoop->setLoopID(RemainderLoopID.getValue());
+ }
+
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
+ Optional<MDNode *> NewLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
+ LLVMLoopUnrollFollowupUnrolled});
+ if (NewLoopID.hasValue()) {
+ L->setLoopID(NewLoopID.getValue());
+
+ // Do not setLoopAlreadyUnrolled if loop attributes have been specified
+ // explicitly.
+ return UnrollResult;
+ }
+ }
+
+  // If the loop has an unroll count pragma, or was unrolled with an explicitly
+  // set count, mark it as unrolled to prevent unrolling beyond what was
+  // requested.
+ // If the loop was peeled, we already "used up" the profile information
+ // we had, so we don't want to unroll or peel again.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
+ (IsCountSetExplicitly || (PP.PeelProfiledIterations && PP.PeelCount)))
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
+
+namespace {
+
+class LoopUnroll : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ int OptLevel;
+
+ /// If false, use a cost model to determine whether unrolling of a loop is
+ /// profitable. If true, only loops that explicitly request unrolling via
+ /// metadata are considered. All other loops are skipped.
+ bool OnlyWhenForced;
+
+ /// If false, when SCEV is invalidated, only forget everything in the
+  /// top-most loop (call forgetTopMostLoop) of the loop being processed.
+ /// Otherwise, forgetAllLoops and rebuild when needed next.
+ bool ForgetAllSCEV;
+
+ Optional<unsigned> ProvidedCount;
+ Optional<unsigned> ProvidedThreshold;
+ Optional<bool> ProvidedAllowPartial;
+ Optional<bool> ProvidedRuntime;
+ Optional<bool> ProvidedUpperBound;
+ Optional<bool> ProvidedAllowPeeling;
+ Optional<bool> ProvidedAllowProfileBasedPeeling;
+ Optional<unsigned> ProvidedFullUnrollMaxCount;
+
+ LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
+ bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
+ Optional<unsigned> Count = None,
+ Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
+ Optional<bool> UpperBound = None,
+ Optional<bool> AllowPeeling = None,
+ Optional<bool> AllowProfileBasedPeeling = None,
+ Optional<unsigned> ProvidedFullUnrollMaxCount = None)
+ : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
+ ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
+ ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
+ ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
+ ProvidedAllowPeeling(AllowPeeling),
+ ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
+ ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
+ initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ LoopUnrollResult Result = tryToUnrollLoop(
+ L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
+ OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
+ ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
+ ProvidedFullUnrollMaxCount);
+
+ if (Result == LoopUnrollResult::FullyUnrolled)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopUnrollResult::Unmodified;
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // FIXME: Loop passes are required to preserve domtree, and for now we just
+ // recreate dom info if anything gets unrolled.
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnroll::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+
+Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
+ bool ForgetAllSCEV, int Threshold, int Count,
+ int AllowPartial, int Runtime, int UpperBound,
+ int AllowPeeling) {
+ // TODO: It would make more sense for this function to take the optionals
+ // directly, but that's dangerous since it would silently break out of tree
+ // callers.
+ return new LoopUnroll(
+ OptLevel, OnlyWhenForced, ForgetAllSCEV,
+ Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound),
+ AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
+}
+
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
+ bool ForgetAllSCEV) {
+ return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
0, 0, 0, 1);
-}
-
-PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &Updater) {
- // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
-
- // Keep track of the previous loop structure so we can identify new loops
- // created by unrolling.
- Loop *ParentL = L.getParentLoop();
- SmallPtrSet<Loop *, 4> OldLoops;
- if (ParentL)
- OldLoops.insert(ParentL->begin(), ParentL->end());
- else
- OldLoops.insert(AR.LI.begin(), AR.LI.end());
-
- std::string LoopName = std::string(L.getName());
-
- bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
- /*BFI*/ nullptr, /*PSI*/ nullptr,
- /*PreserveLCSSA*/ true, OptLevel,
- OnlyWhenForced, ForgetSCEV, /*Count*/ None,
- /*Threshold*/ None, /*AllowPartial*/ false,
- /*Runtime*/ false, /*UpperBound*/ false,
+}
+
+PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+ // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ std::string LoopName = std::string(L.getName());
+
+ bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
+ /*BFI*/ nullptr, /*PSI*/ nullptr,
+ /*PreserveLCSSA*/ true, OptLevel,
+ OnlyWhenForced, ForgetSCEV, /*Count*/ None,
+ /*Threshold*/ None, /*AllowPartial*/ false,
+ /*Runtime*/ false, /*UpperBound*/ false,
/*AllowPeeling*/ true,
- /*AllowProfileBasedPeeling*/ false,
- /*FullUnrollMaxCount*/ None) !=
- LoopUnrollResult::Unmodified;
- if (!Changed)
- return PreservedAnalyses::all();
-
- // The parent must not be damaged by unrolling!
-#ifndef NDEBUG
- if (ParentL)
- ParentL->verifyLoop();
-#endif
-
- // Unrolling can do several things to introduce new loops into a loop nest:
- // - Full unrolling clones child loops within the current loop but then
- // removes the current loop making all of the children appear to be new
- // sibling loops.
- //
- // When a new loop appears as a sibling loop after fully unrolling,
- // its nesting structure has fundamentally changed and we want to revisit
- // it to reflect that.
- //
- // When unrolling has removed the current loop, we need to tell the
- // infrastructure that it is gone.
- //
- // Finally, we support a debugging/testing mode where we revisit child loops
- // as well. These are not expected to require further optimizations as either
- // they or the loop they were cloned from have been directly visited already.
- // But the debugging mode allows us to check this assumption.
- bool IsCurrentLoopValid = false;
- SmallVector<Loop *, 4> SibLoops;
- if (ParentL)
- SibLoops.append(ParentL->begin(), ParentL->end());
- else
- SibLoops.append(AR.LI.begin(), AR.LI.end());
- erase_if(SibLoops, [&](Loop *SibLoop) {
- if (SibLoop == &L) {
- IsCurrentLoopValid = true;
- return true;
- }
-
- // Otherwise erase the loop from the list if it was in the old loops.
+ /*AllowProfileBasedPeeling*/ false,
+ /*FullUnrollMaxCount*/ None) !=
+ LoopUnrollResult::Unmodified;
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ //
+ // When a new loop appears as a sibling loop after fully unrolling,
+ // its nesting structure has fundamentally changed and we want to revisit
+ // it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
return OldLoops.contains(SibLoop);
- });
- Updater.addSiblingLoops(SibLoops);
-
- if (!IsCurrentLoopValid) {
- Updater.markLoopAsDeleted(L, LoopName);
- } else {
- // We can only walk child loops if the current loop remained valid.
- if (UnrollRevisitChildLoops) {
- // Walk *all* of the child loops.
- SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
- Updater.addChildLoops(ChildLoops);
- }
- }
-
- return getLoopPassPreservedAnalyses();
-}
-
-PreservedAnalyses LoopUnrollPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- LoopAnalysisManager *LAM = nullptr;
- if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
- LAM = &LAMProxy->getManager();
-
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
-
- bool Changed = false;
-
- // The unroller requires loops to be in simplified form, and also needs LCSSA.
- // Since simplification may add new inner loops, it has to run before the
- // legality and profitability checks. This means running the loop unroller
-  // will simplify all loops, regardless of whether anything ends up being
- // unrolled.
- for (auto &L : LI) {
- Changed |=
- simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
- Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
- }
-
- // Add the loop nests in the reverse order of LoopInfo. See method
- // declaration.
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
-
- while (!Worklist.empty()) {
- // Because the LoopInfo stores the loops in RPO, we walk the worklist
- // from back to front so that we work forward across the CFG, which
- // for unrolling is only needed to get optimization remarks emitted in
- // a forward order.
- Loop &L = *Worklist.pop_back_val();
-#ifndef NDEBUG
- Loop *ParentL = L.getParentLoop();
-#endif
-
- // Check if the profile summary indicates that the profiled application
- // has a huge working set size, in which case we disable peeling to avoid
- // bloating it further.
- Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
- if (PSI && PSI->hasHugeWorkingSetSize())
- LocalAllowPeeling = false;
- std::string LoopName = std::string(L.getName());
-    // The API here is quite complex to call, and we allow selecting some
-    // flavors of unrolling at construction time (by setting UnrollOpts).
- LoopUnrollResult Result = tryToUnrollLoop(
- &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
- /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
- UnrollOpts.ForgetSCEV, /*Count*/ None,
- /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
- UnrollOpts.AllowUpperBound, LocalAllowPeeling,
- UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
- Changed |= Result != LoopUnrollResult::Unmodified;
-
- // The parent must not be damaged by unrolling!
-#ifndef NDEBUG
- if (Result != LoopUnrollResult::Unmodified && ParentL)
- ParentL->verifyLoop();
-#endif
-
- // Clear any cached analysis results for L if we removed it completely.
- if (LAM && Result == LoopUnrollResult::FullyUnrolled)
- LAM->clear(L, LoopName);
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L, LoopName);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
+ return getLoopPassPreservedAnalyses();
+}
+
+PreservedAnalyses LoopUnrollPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ LoopAnalysisManager *LAM = nullptr;
+ if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
+ LAM = &LAMProxy->getManager();
+
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
+
+ bool Changed = false;
+
+ // The unroller requires loops to be in simplified form, and also needs LCSSA.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop unroller
+ // will simplify all loops, regardless of whether anything ends up being
+ // unrolled.
+ for (auto &L : LI) {
+ Changed |=
+ simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ // Add the loop nests in the reverse order of LoopInfo. See method
+ // declaration.
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+
+ while (!Worklist.empty()) {
+ // Because the LoopInfo stores the loops in RPO, we walk the worklist
+ // from back to front so that we work forward across the CFG, which
+ // for unrolling is only needed to get optimization remarks emitted in
+ // a forward order.
+ Loop &L = *Worklist.pop_back_val();
+#ifndef NDEBUG
+ Loop *ParentL = L.getParentLoop();
+#endif
+
+ // Check if the profile summary indicates that the profiled application
+ // has a huge working set size, in which case we disable peeling to avoid
+ // bloating it further.
+ Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
+ if (PSI && PSI->hasHugeWorkingSetSize())
+ LocalAllowPeeling = false;
+ std::string LoopName = std::string(L.getName());
+ // The API here is quite complex to call, and we allow selecting some
+ // flavors of unrolling at construction time (by setting UnrollOpts).
+ LoopUnrollResult Result = tryToUnrollLoop(
+ &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
+ /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
+ UnrollOpts.ForgetSCEV, /*Count*/ None,
+ /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
+ UnrollOpts.AllowUpperBound, LocalAllowPeeling,
+ UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
+ Changed |= Result != LoopUnrollResult::Unmodified;
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (Result != LoopUnrollResult::Unmodified && ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Clear any cached analysis results for L if we removed it completely.
+ if (LAM && Result == LoopUnrollResult::FullyUnrolled)
+ LAM->clear(L, LoopName);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
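For readers unfamiliar with how this new-pass-manager entry point is driven, here is a minimal, hypothetical sketch (not part of the diff above) that builds the analysis managers with PassBuilder and runs LoopUnrollPass over every function in a module. The wrapper function and its name are assumptions made purely for illustration.

// Hypothetical driver sketch: run LoopUnrollPass via the new pass manager.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"

static void runUnrollSketch(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  // Register the analyses and wire up the cross-manager proxies so that
  // LoopUnrollPass::run can query ScalarEvolution, LoopInfo, TTI, etc.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  // OptLevel 2 with partial and runtime unrolling enabled; these are the
  // same knobs that end up forwarded to tryToUnrollLoop above.
  FPM.addPass(llvm::LoopUnrollPass(
      llvm::LoopUnrollOptions(2).setPartial(true).setRuntime(true)));

  for (llvm::Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);
}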
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp
index a4f67ba667..822a786fc7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1,645 +1,645 @@
-//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass transforms loops that contain branches on loop-invariant conditions
-// to multiple loops. For example, it turns the left into the right code:
-//
-// for (...) if (lic)
-// A for (...)
-// if (lic) A; B; C
-// B else
-// C for (...)
-// A; C
-//
-// This can increase the size of the code exponentially (doubling it every time
-// a loop is unswitched) so we only unswitch if the resultant code will be
-// smaller than a threshold.
-//
-// This pass expects LICM to be run before it to hoist invariant conditions out
-// of the loop, to make the unswitching opportunity obvious.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/InstructionSimplify.h"
+//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to multiple loops. For example, it turns the left into the right code:
+//
+// for (...) if (lic)
+// A for (...)
+// if (lic) A; B; C
+// B else
+// C for (...)
+// A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
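As a concrete, source-level companion to the A/B/C sketch in the header comment above, the following hypothetical C++ example (not taken from this diff) shows what the transform does to a loop whose branch condition `lic` is loop-invariant.

// Before unswitching: the invariant condition is re-tested every iteration.
void before(bool lic, int n, int *a) {
  for (int i = 0; i < n; ++i) {
    a[i] += 1;       // A
    if (lic)
      a[i] *= 2;     // B
    a[i] -= 3;       // C
  }
}

// After unswitching: the test is hoisted and the loop body is duplicated, so
// each copy is straight-line code (at the cost of roughly doubling the size).
void after(bool lic, int n, int *a) {
  if (lic) {
    for (int i = 0; i < n; ++i) {
      a[i] += 1;     // A
      a[i] *= 2;     // B
      a[i] -= 3;     // C
    }
  } else {
    for (int i = 0; i < n; ++i) {
      a[i] += 1;     // A
      a[i] -= 3;     // C
    }
  }
}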
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <map>
-#include <set>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unswitch"
-
-STATISTIC(NumBranches, "Number of branches unswitched");
-STATISTIC(NumSwitches, "Number of switches unswitched");
-STATISTIC(NumGuards, "Number of guards unswitched");
-STATISTIC(NumSelects , "Number of selects unswitched");
-STATISTIC(NumTrivial , "Number of unswitches that are trivial");
-STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
-STATISTIC(TotalInsts, "Total number of instructions analyzed");
-
-// The specific value of 100 here was chosen based only on intuition and a
-// few specific examples.
-static cl::opt<unsigned>
-Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
- cl::init(100), cl::Hidden);
-
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unswitch"
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+STATISTIC(TotalInsts, "Total number of instructions analyzed");
+
+// The specific value of 100 here was chosen based only on intuition and a
+// few specific examples.
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+ cl::init(100), cl::Hidden);
+
static cl::opt<unsigned>
MSSAThreshold("loop-unswitch-memoryssa-threshold",
cl::desc("Max number of memory uses to explore during "
"partial unswitching analysis"),
cl::init(100), cl::Hidden);
-namespace {
-
- class LUAnalysisCache {
- using UnswitchedValsMap =
- DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>;
- using UnswitchedValsIt = UnswitchedValsMap::iterator;
-
- struct LoopProperties {
- unsigned CanBeUnswitchedCount;
- unsigned WasUnswitchedCount;
- unsigned SizeEstimation;
- UnswitchedValsMap UnswitchedVals;
- };
-
- // Here we use std::map instead of DenseMap, since we need to keep a valid
- // LoopProperties pointer for the current loop for better performance.
- using LoopPropsMap = std::map<const Loop *, LoopProperties>;
- using LoopPropsMapIt = LoopPropsMap::iterator;
-
- LoopPropsMap LoopsProperties;
- UnswitchedValsMap *CurLoopInstructions = nullptr;
- LoopProperties *CurrentLoopProperties = nullptr;
-
- // A loop unswitching with an estimated cost above this threshold
- // is not performed. MaxSize is turned into unswitching quota for
- // the current loop, and reduced correspondingly, though note that
- // the quota is returned by releaseMemory() when the loop has been
- // processed, so that MaxSize will return to its previous
- // value. So in most cases MaxSize will equal the Threshold flag
- // when a new loop is processed. An exception to that is that
- // MaxSize will have a smaller value while processing nested loops
- // that were introduced due to loop unswitching of an outer loop.
- //
- // FIXME: The way that MaxSize works is subtle and depends on the
- // pass manager processing loops and calling releaseMemory() in a
- // specific order. It would be good to find a more straightforward
- // way of doing what MaxSize does.
- unsigned MaxSize;
-
- public:
- LUAnalysisCache() : MaxSize(Threshold) {}
-
- // Analyze the loop: check its size and determine whether it is possible
- // to unswitch it. Returns true if we can unswitch this loop.
- bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionCache *AC);
-
- // Clean all data related to given loop.
- void forgetLoop(const Loop *L);
-
- // Mark a case value as unswitched.
- // Since a SwitchInst can be partly unswitched, keep track of all
- // unswitched values to avoid extra unswitching in cloned loops.
- void setUnswitched(const SwitchInst *SI, const Value *V);
-
- // Check whether this case value has already been unswitched.
- bool isUnswitched(const SwitchInst *SI, const Value *V);
-
- // Returns true if another unswitching could be done within the cost
- // threshold.
- bool costAllowsUnswitching();
-
- // Clone all loop-unswitch related loop properties.
- // Redistribute unswitching quotas.
- // Note that the new loop data is stored inside the VMap.
- void cloneData(const Loop *NewLoop, const Loop *OldLoop,
- const ValueToValueMapTy &VMap);
- };
-
- class LoopUnswitch : public LoopPass {
- LoopInfo *LI; // Loop information
- LPPassManager *LPM;
- AssumptionCache *AC;
-
- // Used to check if second loop needs processing after
- // rewriteLoopBodyWithConditionConstant rewrites first loop.
- std::vector<Loop*> LoopProcessWorklist;
-
- LUAnalysisCache BranchesInfo;
-
- bool OptimizeForSize;
- bool RedoLoop = false;
-
- Loop *CurrentLoop = nullptr;
- DominatorTree *DT = nullptr;
- MemorySSA *MSSA = nullptr;
+namespace {
+
+ class LUAnalysisCache {
+ using UnswitchedValsMap =
+ DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>;
+ using UnswitchedValsIt = UnswitchedValsMap::iterator;
+
+ struct LoopProperties {
+ unsigned CanBeUnswitchedCount;
+ unsigned WasUnswitchedCount;
+ unsigned SizeEstimation;
+ UnswitchedValsMap UnswitchedVals;
+ };
+
+ // Here we use std::map instead of DenseMap, since we need to keep a valid
+ // LoopProperties pointer for the current loop for better performance.
+ using LoopPropsMap = std::map<const Loop *, LoopProperties>;
+ using LoopPropsMapIt = LoopPropsMap::iterator;
+
+ LoopPropsMap LoopsProperties;
+ UnswitchedValsMap *CurLoopInstructions = nullptr;
+ LoopProperties *CurrentLoopProperties = nullptr;
+
+ // A loop unswitching with an estimated cost above this threshold
+ // is not performed. MaxSize is turned into unswitching quota for
+ // the current loop, and reduced correspondingly, though note that
+ // the quota is returned by releaseMemory() when the loop has been
+ // processed, so that MaxSize will return to its previous
+ // value. So in most cases MaxSize will equal the Threshold flag
+ // when a new loop is processed. An exception to that is that
+ // MaxSize will have a smaller value while processing nested loops
+ // that were introduced due to loop unswitching of an outer loop.
+ //
+ // FIXME: The way that MaxSize works is subtle and depends on the
+ // pass manager processing loops and calling releaseMemory() in a
+ // specific order. It would be good to find a more straightforward
+ // way of doing what MaxSize does.
+ unsigned MaxSize;
+
+ public:
+ LUAnalysisCache() : MaxSize(Threshold) {}
+
+ // Analyze the loop: check its size and determine whether it is possible
+ // to unswitch it. Returns true if we can unswitch this loop.
+ bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC);
+
+ // Clean all data related to given loop.
+ void forgetLoop(const Loop *L);
+
+ // Mark a case value as unswitched.
+ // Since a SwitchInst can be partly unswitched, keep track of all
+ // unswitched values to avoid extra unswitching in cloned loops.
+ void setUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Check whether this case value has already been unswitched.
+ bool isUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Returns true if another unswitching could be done within the cost
+ // threshold.
+ bool costAllowsUnswitching();
+
+ // Clone all loop-unswitch related loop properties.
+ // Redistribute unswitching quotas.
+ // Note that the new loop data is stored inside the VMap.
+ void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap);
+ };
+
+ class LoopUnswitch : public LoopPass {
+ LoopInfo *LI; // Loop information
+ LPPassManager *LPM;
+ AssumptionCache *AC;
+
+ // Used to check if second loop needs processing after
+ // rewriteLoopBodyWithConditionConstant rewrites first loop.
+ std::vector<Loop*> LoopProcessWorklist;
+
+ LUAnalysisCache BranchesInfo;
+
+ bool OptimizeForSize;
+ bool RedoLoop = false;
+
+ Loop *CurrentLoop = nullptr;
+ DominatorTree *DT = nullptr;
+ MemorySSA *MSSA = nullptr;
AAResults *AA = nullptr;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- BasicBlock *LoopHeader = nullptr;
- BasicBlock *LoopPreheader = nullptr;
-
- bool SanitizeMemory;
- SimpleLoopSafetyInfo SafetyInfo;
-
- // LoopBlocks contains all of the basic blocks of the loop, including the
- // preheader of the loop, the body of the loop, and the exit blocks of the
- // loop, in that order.
- std::vector<BasicBlock*> LoopBlocks;
- // NewBlocks contains cloned copies of the basic blocks from LoopBlocks.
- std::vector<BasicBlock*> NewBlocks;
-
- bool HasBranchDivergence;
-
- public:
- static char ID; // Pass ID, replacement for typeid
-
- explicit LoopUnswitch(bool Os = false, bool HasBranchDivergence = false)
- : LoopPass(ID), OptimizeForSize(Os),
- HasBranchDivergence(HasBranchDivergence) {
- initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
- bool processCurrentLoop();
- bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG.
- ///
- void getAnalysisUsage(AnalysisUsage &AU) const override {
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ BasicBlock *LoopHeader = nullptr;
+ BasicBlock *LoopPreheader = nullptr;
+
+ bool SanitizeMemory;
+ SimpleLoopSafetyInfo SafetyInfo;
+
+ // LoopBlocks contains all of the basic blocks of the loop, including the
+ // preheader of the loop, the body of the loop, and the exit blocks of the
+ // loop, in that order.
+ std::vector<BasicBlock*> LoopBlocks;
+ // NewBlocks contains cloned copies of the basic blocks from LoopBlocks.
+ std::vector<BasicBlock*> NewBlocks;
+
+ bool HasBranchDivergence;
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit LoopUnswitch(bool Os = false, bool HasBranchDivergence = false)
+ : LoopPass(ID), OptimizeForSize(Os),
+ HasBranchDivergence(HasBranchDivergence) {
+ initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ bool processCurrentLoop();
+ bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
// Lazy BFI and BPI are marked as preserved here so Loop Unswitching
// can remain part of the same loop pass as LICM
AU.addPreserved<LazyBlockFrequencyInfoPass>();
AU.addPreserved<LazyBranchProbabilityInfoPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- if (HasBranchDivergence)
- AU.addRequired<LegacyDivergenceAnalysis>();
- getLoopAnalysisUsage(AU);
- }
-
- private:
- void releaseMemory() override { BranchesInfo.forgetLoop(CurrentLoop); }
-
- void initLoopData() {
- LoopHeader = CurrentLoop->getHeader();
- LoopPreheader = CurrentLoop->getLoopPreheader();
- }
-
- /// Split all of the edges from inside the loop to their exit blocks.
- /// Update the appropriate Phi nodes as we do so.
- void splitExitEdges(Loop *L,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks);
-
- bool tryTrivialLoopUnswitch(bool &Changed);
-
- bool unswitchIfProfitable(Value *LoopCond, Constant *Val,
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ if (HasBranchDivergence)
+ AU.addRequired<LegacyDivergenceAnalysis>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ private:
+ void releaseMemory() override { BranchesInfo.forgetLoop(CurrentLoop); }
+
+ void initLoopData() {
+ LoopHeader = CurrentLoop->getHeader();
+ LoopPreheader = CurrentLoop->getLoopPreheader();
+ }
+
+ /// Split all of the edges from inside the loop to their exit blocks.
+ /// Update the appropriate Phi nodes as we do so.
+ void splitExitEdges(Loop *L,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ bool tryTrivialLoopUnswitch(bool &Changed);
+
+ bool unswitchIfProfitable(Value *LoopCond, Constant *Val,
Instruction *TI = nullptr,
ArrayRef<Instruction *> ToDuplicate = {});
- void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
- BasicBlock *ExitBlock, Instruction *TI);
- void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
+ void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock, Instruction *TI);
+ void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
Instruction *TI,
ArrayRef<Instruction *> ToDuplicate = {});
-
- void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
- Constant *Val, bool IsEqual);
-
+
+ void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val, bool IsEqual);
+
void
emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BasicBlock *TrueDest, BasicBlock *FalseDest,
BranchInst *OldBranch, Instruction *TI,
ArrayRef<Instruction *> ToDuplicate = {});
-
- void simplifyCode(std::vector<Instruction *> &Worklist, Loop *L);
-
- /// Given that the Invariant is not equal to Val, simplify instructions
- /// in the loop.
- Value *simplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
- Constant *Val);
- };
-
-} // end anonymous namespace
-
-// Analyze the loop: check its size and determine whether it is possible
-// to unswitch it. Returns true if we can unswitch this loop.
-bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionCache *AC) {
- LoopPropsMapIt PropsIt;
- bool Inserted;
- std::tie(PropsIt, Inserted) =
- LoopsProperties.insert(std::make_pair(L, LoopProperties()));
-
- LoopProperties &Props = PropsIt->second;
-
- if (Inserted) {
- // New loop.
-
- // Limit the number of instructions to avoid causing significant code
- // expansion, and the number of basic blocks, to avoid loops with
- // large numbers of branches which cause loop unswitching to go crazy.
- // This is a very ad-hoc heuristic.
-
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- // FIXME: This is overly conservative because it does not take into
- // consideration code simplification opportunities and code that can
- // be shared by the resultant unswitched loops.
- CodeMetrics Metrics;
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
- ++I)
- Metrics.analyzeBasicBlock(*I, TTI, EphValues);
-
- Props.SizeEstimation = Metrics.NumInsts;
- Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
- Props.WasUnswitchedCount = 0;
- MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
-
- if (Metrics.notDuplicatable) {
- LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName()
- << ", contents cannot be "
- << "duplicated!\n");
- return false;
- }
- }
-
- // Be careful: these links are valid only until a new loop is added.
- CurrentLoopProperties = &Props;
- CurLoopInstructions = &Props.UnswitchedVals;
-
- return true;
-}
-
-// Clean all data related to given loop.
-void LUAnalysisCache::forgetLoop(const Loop *L) {
- LoopPropsMapIt LIt = LoopsProperties.find(L);
-
- if (LIt != LoopsProperties.end()) {
- LoopProperties &Props = LIt->second;
- MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
- Props.SizeEstimation;
- LoopsProperties.erase(LIt);
- }
-
- CurrentLoopProperties = nullptr;
- CurLoopInstructions = nullptr;
-}
-
-// Mark a case value as unswitched.
-// Since a SwitchInst can be partly unswitched, keep track of all
-// unswitched values to avoid extra unswitching in cloned loops.
-void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
- (*CurLoopInstructions)[SI].insert(V);
-}
-
-// Check whether this case value has already been unswitched.
-bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
- return (*CurLoopInstructions)[SI].count(V);
-}
-
-bool LUAnalysisCache::costAllowsUnswitching() {
- return CurrentLoopProperties->CanBeUnswitchedCount > 0;
-}
-
-// Clone all loop-unswitch related loop properties.
-// Redistribute unswitching quotas.
-// Note that the new loop data is stored inside the VMap.
-void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
- const ValueToValueMapTy &VMap) {
- LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
- LoopProperties &OldLoopProps = *CurrentLoopProperties;
- UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
-
- // Reallocate "can-be-unswitched quota"
-
- --OldLoopProps.CanBeUnswitchedCount;
- ++OldLoopProps.WasUnswitchedCount;
- NewLoopProps.WasUnswitchedCount = 0;
- unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
- NewLoopProps.CanBeUnswitchedCount = Quota / 2;
- OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
-
- NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
-
- // Clone unswitched values info:
- // for the new loop's switches we clone info about values that were
- // already unswitched and have redundant successors.
- for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
- const SwitchInst *OldInst = I->first;
- Value *NewI = VMap.lookup(OldInst);
- const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
- assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
-
- NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
- }
-}
-
-char LoopUnswitch::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
- false, false)
-
-Pass *llvm::createLoopUnswitchPass(bool Os, bool HasBranchDivergence) {
- return new LoopUnswitch(Os, HasBranchDivergence);
-}
-
-/// Operator chain lattice.
-enum OperatorChain {
- OC_OpChainNone, ///< There is no operator.
- OC_OpChainOr, ///< There are only ORs.
- OC_OpChainAnd, ///< There are only ANDs.
- OC_OpChainMixed ///< There are ANDs and ORs.
-};
-
-/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
-/// an invariant piece, return the invariant. Otherwise, return null.
-//
-/// NOTE: findLIVLoopCondition will not return a partial LIV by walking up a
-/// mixed operator chain, as we can not reliably find a value which will
-/// simplify the operator chain. If the chain is AND-only or OR-only, we can use
-/// 0 or ~0 to simplify the chain.
-///
-/// NOTE: In case a partial LIV and a mixed operator chain, we may be able to
-/// simplify the condition itself to a loop variant condition, but at the
-/// cost of creating an entirely new loop.
-static Value *findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
- OperatorChain &ParentChain,
- DenseMap<Value *, Value *> &Cache,
- MemorySSAUpdater *MSSAU) {
- auto CacheIt = Cache.find(Cond);
- if (CacheIt != Cache.end())
- return CacheIt->second;
-
- // We started analyzing a new instruction; increment the scanned-instruction counter.
- ++TotalInsts;
-
- // We can never unswitch on vector conditions.
- if (Cond->getType()->isVectorTy())
- return nullptr;
-
- // Constants should be folded, not unswitched on!
- if (isa<Constant>(Cond)) return nullptr;
-
- // TODO: Handle: br (VARIANT|INVARIANT).
-
- // Hoist simple values out.
- if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) {
- Cache[Cond] = Cond;
- return Cond;
- }
-
- // Walk up the operator chain to find partial invariant conditions.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
- if (BO->getOpcode() == Instruction::And ||
- BO->getOpcode() == Instruction::Or) {
- // Given the previous operator, compute the current operator chain status.
- OperatorChain NewChain;
- switch (ParentChain) {
- case OC_OpChainNone:
- NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
- OC_OpChainOr;
- break;
- case OC_OpChainOr:
- NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
- OC_OpChainMixed;
- break;
- case OC_OpChainAnd:
- NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
- OC_OpChainMixed;
- break;
- case OC_OpChainMixed:
- NewChain = OC_OpChainMixed;
- break;
- }
-
- // If we reach a Mixed state, we do not want to keep walking up as we can not
- // reliably find a value that will simplify the chain. With this check, we
- // will return null on the first sight of mixed chain and the caller will
- // either backtrack to find partial LIV in other operand or return null.
- if (NewChain != OC_OpChainMixed) {
- // Update the current operator chain type before we search up the chain.
- ParentChain = NewChain;
- // If either the left or right side is invariant, we can unswitch on this,
- // which will cause the branch to go away in one loop and the condition to
- // simplify in the other one.
- if (Value *LHS = findLIVLoopCondition(BO->getOperand(0), L, Changed,
- ParentChain, Cache, MSSAU)) {
- Cache[Cond] = LHS;
- return LHS;
- }
- // We did not manage to find a partial LIV in operand(0). Backtrack and try
- // operand(1).
- ParentChain = NewChain;
- if (Value *RHS = findLIVLoopCondition(BO->getOperand(1), L, Changed,
- ParentChain, Cache, MSSAU)) {
- Cache[Cond] = RHS;
- return RHS;
- }
- }
- }
-
- Cache[Cond] = nullptr;
- return nullptr;
-}
-
-/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
-/// an invariant piece, return the invariant along with the operator chain type.
-/// Otherwise, return null.
-static std::pair<Value *, OperatorChain>
-findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
- MemorySSAUpdater *MSSAU) {
- DenseMap<Value *, Value *> Cache;
- OperatorChain OpChain = OC_OpChainNone;
- Value *FCond = findLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
-
- // In case we do find a LIV, it can not be obtained by walking up a mixed
- // operator chain.
- assert((!FCond || OpChain != OC_OpChainMixed) &&
- "Do not expect a partial LIV with mixed operator chain");
- return {FCond, OpChain};
-}
-
-bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
- if (skipLoop(L))
- return false;
-
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LPM = &LPMRef;
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ void simplifyCode(std::vector<Instruction *> &Worklist, Loop *L);
+
+ /// Given that the Invariant is not equal to Val, simplify instructions
+ /// in the loop.
+ Value *simplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
+ };
+
+} // end anonymous namespace
+
+// Analyze the loop: check its size and determine whether it is possible
+// to unswitch it. Returns true if we can unswitch this loop.
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC) {
+ LoopPropsMapIt PropsIt;
+ bool Inserted;
+ std::tie(PropsIt, Inserted) =
+ LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+
+ LoopProperties &Props = PropsIt->second;
+
+ if (Inserted) {
+ // New loop.
+
+ // Limit the number of instructions to avoid causing significant code
+ // expansion, and the number of basic blocks, to avoid loops with
+ // large numbers of branches which cause loop unswitching to go crazy.
+ // This is a very ad-hoc heuristic.
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // FIXME: This is overly conservative because it does not take into
+ // consideration code simplification opportunities and code that can
+ // be shared by the resultant unswitched loops.
+ CodeMetrics Metrics;
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
+ ++I)
+ Metrics.analyzeBasicBlock(*I, TTI, EphValues);
+
+ Props.SizeEstimation = Metrics.NumInsts;
+ Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+ Props.WasUnswitchedCount = 0;
+ MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName()
+ << ", contents cannot be "
+ << "duplicated!\n");
+ return false;
+ }
+ }
+
+ // Be careful: these links are valid only until a new loop is added.
+ CurrentLoopProperties = &Props;
+ CurLoopInstructions = &Props.UnswitchedVals;
+
+ return true;
+}
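To make the quota arithmetic above concrete, here is a small, self-contained worked example with hypothetical numbers (the default -loop-unswitch-threshold of 100 and a loop whose Metrics.NumInsts is 20):

#include <cassert>

int main() {
  unsigned MaxSize = 100;        // starts at the Threshold flag's value
  unsigned SizeEstimation = 20;  // Metrics.NumInsts for the new loop
  unsigned CanBeUnswitchedCount = MaxSize / SizeEstimation;  // 100 / 20 = 5
  MaxSize -= SizeEstimation * CanBeUnswitchedCount;          // 100 - 100 = 0
  // The loop may be unswitched up to 5 times; until forgetLoop() returns the
  // quota, no budget remains for other loops processed in the meantime.
  assert(CanBeUnswitchedCount == 5 && MaxSize == 0);
  return 0;
}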
+
+// Clean all data related to given loop.
+void LUAnalysisCache::forgetLoop(const Loop *L) {
+ LoopPropsMapIt LIt = LoopsProperties.find(L);
+
+ if (LIt != LoopsProperties.end()) {
+ LoopProperties &Props = LIt->second;
+ MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
+ Props.SizeEstimation;
+ LoopsProperties.erase(LIt);
+ }
+
+ CurrentLoopProperties = nullptr;
+ CurLoopInstructions = nullptr;
+}
+
+// Mark a case value as unswitched.
+// Since a SwitchInst can be partly unswitched, keep track of all
+// unswitched values to avoid extra unswitching in cloned loops.
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
+ (*CurLoopInstructions)[SI].insert(V);
+}
+
+// Check whether this case value has already been unswitched.
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
+ return (*CurLoopInstructions)[SI].count(V);
+}
+
+bool LUAnalysisCache::costAllowsUnswitching() {
+ return CurrentLoopProperties->CanBeUnswitchedCount > 0;
+}
+
+// Clone all loop-unswitch related loop properties.
+// Redistribute unswitching quotas.
+// Note that the new loop data is stored inside the VMap.
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap) {
+ LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
+ LoopProperties &OldLoopProps = *CurrentLoopProperties;
+ UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
+
+ // Reallocate "can-be-unswitched quota"
+
+ --OldLoopProps.CanBeUnswitchedCount;
+ ++OldLoopProps.WasUnswitchedCount;
+ NewLoopProps.WasUnswitchedCount = 0;
+ unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
+ NewLoopProps.CanBeUnswitchedCount = Quota / 2;
+ OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
+
+ NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
+
+ // Clone unswitched values info:
+ // for the new loop's switches we clone info about values that were
+ // already unswitched and have redundant successors.
+ for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
+ const SwitchInst *OldInst = I->first;
+ Value *NewI = VMap.lookup(OldInst);
+ const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
+ assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
+
+ NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
+ }
+}
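Continuing the hypothetical numbers from the countLoop example, this sketch reproduces cloneData's bookkeeping when the original loop had a quota of 5 and one unswitch has just produced a clone:

#include <cassert>

int main() {
  unsigned OldCanBe = 5, OldWas = 0;
  --OldCanBe;                        // the unswitch that triggered the clone
  ++OldWas;
  unsigned Quota = OldCanBe;         // 4
  unsigned NewCanBe = Quota / 2;     // 2 for the cloned loop
  OldCanBe = Quota - Quota / 2;      // 2 remain for the original loop
  // Both loops keep the same SizeEstimation, so forgetLoop() later returns
  // (CanBeUnswitchedCount + WasUnswitchedCount) * SizeEstimation to MaxSize.
  assert(NewCanBe == 2 && OldCanBe == 2 && OldWas == 1);
  return 0;
}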
+
+char LoopUnswitch::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+
+Pass *llvm::createLoopUnswitchPass(bool Os, bool HasBranchDivergence) {
+ return new LoopUnswitch(Os, HasBranchDivergence);
+}
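A minimal, hypothetical sketch of how this factory is typically consumed with the legacy pass manager; the wrapper function below is an assumption for illustration, not code from this tree.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"  // declares createLoopUnswitchPass

static void runLegacyUnswitchSketch(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  // Os = false: permit non-trivial unswitching even when not optimizing for
  // size. HasBranchDivergence = false: skip the divergence analysis that
  // GPU-like targets would request.
  PM.add(llvm::createLoopUnswitchPass(/*Os=*/false,
                                      /*HasBranchDivergence=*/false));
  PM.run(M);
}

The legacy pass manager schedules the analyses declared in getAnalysisUsage (LoopInfo, DominatorTree, AssumptionCache, TTI, and optionally MemorySSA) automatically, so the sketch only needs to add the transform itself.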
+
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant. Otherwise, return null.
+//
+/// NOTE: findLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we can not reliably find a value which will
+/// simplify the operator chain. If the chain is AND-only or OR-only, we can use
+/// 0 or ~0 to simplify the chain.
+///
+/// NOTE: In case a partial LIV and a mixed operator chain, we may be able to
+/// simplify the condition itself to a loop variant condition, but at the
+/// cost of creating an entirely new loop.
+static Value *findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
+ DenseMap<Value *, Value *> &Cache,
+ MemorySSAUpdater *MSSAU) {
+ auto CacheIt = Cache.find(Cond);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
+
+ // We started analyzing a new instruction; increment the scanned-instruction counter.
+ ++TotalInsts;
+
+ // We can never unswitch on vector conditions.
+ if (Cond->getType()->isVectorTy())
+ return nullptr;
+
+ // Constants should be folded, not unswitched on!
+ if (isa<Constant>(Cond)) return nullptr;
+
+ // TODO: Handle: br (VARIANT|INVARIANT).
+
+ // Hoist simple values out.
+ if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) {
+ Cache[Cond] = Cond;
+ return Cond;
+ }
+
+ // Walk up the operator chain to find partial invariant conditions.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+ if (BO->getOpcode() == Instruction::And ||
+ BO->getOpcode() == Instruction::Or) {
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
+ }
+
+ // If we reach a Mixed state, we do not want to keep walking up as we can not
+ // reliably find a value that will simplify the chain. With this check, we
+ // will return null on the first sight of mixed chain and the caller will
+ // either backtrack to find partial LIV in other operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = findLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache, MSSAU)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = findLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache, MSSAU)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
+ }
+ }
+
+ Cache[Cond] = nullptr;
+ return nullptr;
+}
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain>
+findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ MemorySSAUpdater *MSSAU) {
+ DenseMap<Value *, Value *> Cache;
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = findLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
+
+ // In case we do find a LIV, it can not be obtained by walking up a mixed
+ // operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
+}
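As a hypothetical source-level illustration of what the AND-chain walk above can recover: when only one operand of the branch condition is loop-invariant, findLIVLoopCondition still returns that operand as a partial invariant, and unswitching on it lets one loop copy fold the whole chain away.

// Hypothetical example of a partial loop-invariant condition (an AND-chain
// of an invariant piece and a variant piece). `inv` never changes inside the
// loop, but `a[i] > 0` does, so only `inv` is a candidate for unswitching.
void partialLIV(bool inv, int n, int *a) {
  for (int i = 0; i < n; ++i) {
    if (inv && a[i] > 0)
      a[i] = 0;
  }
  // In the loop copy where `inv` is assumed false, the whole chain folds to
  // false and the branch disappears; the other copy keeps only the variant
  // test `a[i] > 0`.
}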
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
+ if (skipLoop(L))
+ return false;
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LPM = &LPMRef;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- assert(DT && "Cannot update MemorySSA without a valid DomTree.");
- }
- CurrentLoop = L;
- Function *F = CurrentLoop->getHeader()->getParent();
-
- SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
- if (SanitizeMemory)
- SafetyInfo.computeLoopSafetyInfo(L);
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- bool Changed = false;
- do {
- assert(CurrentLoop->isLCSSAForm(*DT));
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- RedoLoop = false;
- Changed |= processCurrentLoop();
- } while (RedoLoop);
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- return Changed;
-}
-
-// Return true if the BasicBlock BB is unreachable from the loop header.
-// Return false, otherwise.
-bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
- auto *Node = DT->getNode(BB)->getIDom();
- BasicBlock *DomBB = Node->getBlock();
- while (CurrentLoop->contains(DomBB)) {
- BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
-
- Node = DT->getNode(DomBB)->getIDom();
- DomBB = Node->getBlock();
-
- if (!BInst || !BInst->isConditional())
- continue;
-
- Value *Cond = BInst->getCondition();
- if (!isa<ConstantInt>(Cond))
- continue;
-
- BasicBlock *UnreachableSucc =
- Cond == ConstantInt::getTrue(Cond->getContext())
- ? BInst->getSuccessor(1)
- : BInst->getSuccessor(0);
-
- if (DT->dominates(UnreachableSucc, BB))
- return true;
- }
- return false;
-}
-
-/// FIXME: Remove this workaround when freeze related patches are done.
-/// LoopUnswitch and equality propagation in GVN disagree about whether a
-/// branch on undef/poison has undefined behavior. This is here to rule out
-/// some common cases where we found that discrepancy already causing
-/// problems. Details can be found in PR31652. Note that if this function
-/// returns true, the transform is unsafe; if it returns false, it is not
-/// necessarily safe.
-static bool equalityPropUnSafe(Value &LoopCond) {
- ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
- if (!CI || !CI->isEquality())
- return false;
-
- Value *LHS = CI->getOperand(0);
- Value *RHS = CI->getOperand(1);
- if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
- return true;
-
- auto HasUndefInPHI = [](PHINode &PN) {
- for (Value *Opd : PN.incoming_values()) {
- if (isa<UndefValue>(Opd))
- return true;
- }
- return false;
- };
- PHINode *LPHI = dyn_cast<PHINode>(LHS);
- PHINode *RPHI = dyn_cast<PHINode>(RHS);
- if ((LPHI && HasUndefInPHI(*LPHI)) || (RPHI && HasUndefInPHI(*RPHI)))
- return true;
-
- auto HasUndefInSelect = [](SelectInst &SI) {
- if (isa<UndefValue>(SI.getTrueValue()) ||
- isa<UndefValue>(SI.getFalseValue()))
- return true;
- return false;
- };
- SelectInst *LSI = dyn_cast<SelectInst>(LHS);
- SelectInst *RSI = dyn_cast<SelectInst>(RHS);
- if ((LSI && HasUndefInSelect(*LSI)) || (RSI && HasUndefInSelect(*RSI)))
- return true;
- return false;
-}
-
+ if (EnableMSSALoopDependency) {
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ assert(DT && "Cannot update MemorySSA without a valid DomTree.");
+ }
+ CurrentLoop = L;
+ Function *F = CurrentLoop->getHeader()->getParent();
+
+ SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
+ if (SanitizeMemory)
+ SafetyInfo.computeLoopSafetyInfo(L);
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ bool Changed = false;
+ do {
+ assert(CurrentLoop->isLCSSAForm(*DT));
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ RedoLoop = false;
+ Changed |= processCurrentLoop();
+ } while (RedoLoop);
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ return Changed;
+}
+
+// Return true if the BasicBlock BB is unreachable from the loop header.
+// Return false, otherwise.
+bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
+ auto *Node = DT->getNode(BB)->getIDom();
+ BasicBlock *DomBB = Node->getBlock();
+ while (CurrentLoop->contains(DomBB)) {
+ BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
+
+ Node = DT->getNode(DomBB)->getIDom();
+ DomBB = Node->getBlock();
+
+ if (!BInst || !BInst->isConditional())
+ continue;
+
+ Value *Cond = BInst->getCondition();
+ if (!isa<ConstantInt>(Cond))
+ continue;
+
+ BasicBlock *UnreachableSucc =
+ Cond == ConstantInt::getTrue(Cond->getContext())
+ ? BInst->getSuccessor(1)
+ : BInst->getSuccessor(0);
+
+ if (DT->dominates(UnreachableSucc, BB))
+ return true;
+ }
+ return false;
+}
+
+/// FIXME: Remove this workaround when freeze related patches are done.
+/// LoopUnswitch and equality propagation in GVN disagree about whether a
+/// branch on undef/poison has undefined behavior. This is here to rule out
+/// some common cases where we found that discrepancy already causing
+/// problems. Details can be found in PR31652. Note that if this function
+/// returns true, the transform is unsafe; if it returns false, it is not
+/// necessarily safe.
+static bool equalityPropUnSafe(Value &LoopCond) {
+ ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
+ if (!CI || !CI->isEquality())
+ return false;
+
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
+ return true;
+
+ auto HasUndefInPHI = [](PHINode &PN) {
+ for (Value *Opd : PN.incoming_values()) {
+ if (isa<UndefValue>(Opd))
+ return true;
+ }
+ return false;
+ };
+ PHINode *LPHI = dyn_cast<PHINode>(LHS);
+ PHINode *RPHI = dyn_cast<PHINode>(RHS);
+ if ((LPHI && HasUndefInPHI(*LPHI)) || (RPHI && HasUndefInPHI(*RPHI)))
+ return true;
+
+ auto HasUndefInSelect = [](SelectInst &SI) {
+ if (isa<UndefValue>(SI.getTrueValue()) ||
+ isa<UndefValue>(SI.getFalseValue()))
+ return true;
+ return false;
+ };
+ SelectInst *LSI = dyn_cast<SelectInst>(LHS);
+ SelectInst *RSI = dyn_cast<SelectInst>(RHS);
+ if ((LSI && HasUndefInSelect(*LSI)) || (RSI && HasUndefInSelect(*RSI)))
+ return true;
+ return false;
+}
+
/// Check if the loop header has a conditional branch that is not
/// loop-invariant, because it involves load instructions. If all paths from
/// either the true or false successor to the header or loop exists do not
@@ -779,205 +779,205 @@ hasPartialIVCondition(Loop *L, MemorySSA &MSSA, AAResults *AA) {
return {};
}
-/// Do actual work and unswitch loop if possible and profitable.
-bool LoopUnswitch::processCurrentLoop() {
- bool Changed = false;
-
- initLoopData();
-
- // If LoopSimplify was unable to form a preheader, don't do any unswitching.
- if (!LoopPreheader)
- return false;
-
- // Loops with indirectbr cannot be cloned.
- if (!CurrentLoop->isSafeToClone())
- return false;
-
- // Without dedicated exits, splitting the exit edge may fail.
- if (!CurrentLoop->hasDedicatedExits())
- return false;
-
- LLVMContext &Context = LoopHeader->getContext();
-
- // Analyze loop cost, and stop unswitching if loop content can not be duplicated.
- if (!BranchesInfo.countLoop(
- CurrentLoop,
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *CurrentLoop->getHeader()->getParent()),
- AC))
- return false;
-
- // Try trivial unswitching first, before looping over the loop's other basic blocks.
- if (tryTrivialLoopUnswitch(Changed)) {
- return true;
- }
-
- // Do not do non-trivial unswitch while optimizing for size.
- // FIXME: Use Function::hasOptSize().
- if (OptimizeForSize ||
- LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+/// Do actual work and unswitch loop if possible and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+ bool Changed = false;
+
+ initLoopData();
+
+ // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+ if (!LoopPreheader)
+ return false;
+
+ // Loops with indirectbr cannot be cloned.
+ if (!CurrentLoop->isSafeToClone())
+ return false;
+
+ // Without dedicated exits, splitting the exit edge may fail.
+ if (!CurrentLoop->hasDedicatedExits())
+ return false;
+
+ LLVMContext &Context = LoopHeader->getContext();
+
+ // Analyze loop cost, and stop unswitching if loop content can not be duplicated.
+ if (!BranchesInfo.countLoop(
+ CurrentLoop,
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *CurrentLoop->getHeader()->getParent()),
+ AC))
+ return false;
+
+ // Try trivial unswitching first, before looping over the loop's other basic blocks.
+ if (tryTrivialLoopUnswitch(Changed)) {
+ return true;
+ }
+
+ // Do not do non-trivial unswitch while optimizing for size.
+ // FIXME: Use Function::hasOptSize().
+ if (OptimizeForSize ||
+ LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
return Changed;
-
- // Run through the instructions in the loop, keeping track of three things:
- //
- // - That we do not unswitch loops containing convergent operations, as we
- // might be making them control dependent on the unswitch value when they
- // were not before.
- // FIXME: This could be refined to only bail if the convergent operation is
- // not already control-dependent on the unswitch value.
- //
- // - That basic blocks in the loop contain invokes whose predecessor edges we
- // cannot split.
- //
- // - The set of guard intrinsics encountered (these are non terminator
- // instructions that are also profitable to be unswitched).
-
- SmallVector<IntrinsicInst *, 4> Guards;
-
- for (const auto BB : CurrentLoop->blocks()) {
- for (auto &I : *BB) {
- auto *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- continue;
- if (CB->isConvergent())
+
+ // Run through the instructions in the loop, keeping track of three things:
+ //
+ // - That we do not unswitch loops containing convergent operations, as we
+ // might be making them control dependent on the unswitch value when they
+ // were not before.
+ // FIXME: This could be refined to only bail if the convergent operation is
+ // not already control-dependent on the unswitch value.
+ //
+ // - That basic blocks in the loop contain invokes whose predecessor edges we
+ // cannot split.
+ //
+ // - The set of guard intrinsics encountered (these are non terminator
+ // instructions that are also profitable to be unswitched).
+
+ SmallVector<IntrinsicInst *, 4> Guards;
+
+ for (const auto BB : CurrentLoop->blocks()) {
+ for (auto &I : *BB) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (CB->isConvergent())
return Changed;
- if (auto *II = dyn_cast<InvokeInst>(&I))
- if (!II->getUnwindDest()->canSplitPredecessors())
+ if (auto *II = dyn_cast<InvokeInst>(&I))
+ if (!II->getUnwindDest()->canSplitPredecessors())
return Changed;
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::experimental_guard)
- Guards.push_back(II);
- }
- }
-
- for (IntrinsicInst *Guard : Guards) {
- Value *LoopCond = findLIVLoopCondition(Guard->getOperand(0), CurrentLoop,
- Changed, MSSAU.get())
- .first;
- if (LoopCond &&
- unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
- // NB! Unswitching (if successful) could have erased some of the
- // instructions in Guards leaving dangling pointers there. This is fine
- // because we're returning now, and won't look at Guards again.
- ++NumGuards;
- return true;
- }
- }
-
- // Loop over all of the basic blocks in the loop. If we find an interior
- // block that is branching on a loop-invariant condition, we can unswitch this
- // loop.
- for (Loop::block_iterator I = CurrentLoop->block_begin(),
- E = CurrentLoop->block_end();
- I != E; ++I) {
- Instruction *TI = (*I)->getTerminator();
-
- // Unswitching on a potentially uninitialized predicate is not
- // MSan-friendly. Limit this to the cases when the original predicate is
- // guaranteed to execute, to avoid creating a use-of-uninitialized-value
- // in the code that did not have one.
- // This is a workaround for the discrepancy between LLVM IR and MSan
- // semantics. See PR28054 for more details.
- if (SanitizeMemory &&
- !SafetyInfo.isGuaranteedToExecute(*TI, DT, CurrentLoop))
- continue;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- // Some branches may be rendered unreachable because of previous
- // unswitching.
- // Unswitch only those branches that are reachable.
- if (isUnreachableDueToPreviousUnswitching(*I))
- continue;
-
- // If this isn't branching on an invariant condition, we can't unswitch
- // it.
- if (BI->isConditional()) {
- // See if this, or some part of it, is loop invariant. If so, we can
- // unswitch on it if we desire.
- Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
- if (LoopCond && !equalityPropUnSafe(*LoopCond) &&
- unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
- ++NumBranches;
- return true;
- }
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Value *SC = SI->getCondition();
- Value *LoopCond;
- OperatorChain OpChain;
- std::tie(LoopCond, OpChain) =
- findLIVLoopCondition(SC, CurrentLoop, Changed, MSSAU.get());
-
- unsigned NumCases = SI->getNumCases();
- if (LoopCond && NumCases) {
- // Find a value to unswitch on:
- // FIXME: this should choose the most expensive case!
- // FIXME: scan for a case with a non-critical edge?
- Constant *UnswitchVal = nullptr;
- // Find a case value such that at least one case value is unswitched
- // out.
- if (OpChain == OC_OpChainAnd) {
- // If the chain only has ANDs and the switch has a case value of 0,
- // dropping a 0 into the chain will unswitch out the 0 case value.
- auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
- if (BranchesInfo.isUnswitched(SI, AllZero))
- continue;
- // We are unswitching 0 out.
- UnswitchVal = AllZero;
- } else if (OpChain == OC_OpChainOr) {
- // If the chain only has ORs and the switch has a case value of ~0.
- // Dropping in a ~0 to the chain will unswitch out the ~0-casevalue.
- auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
- if (BranchesInfo.isUnswitched(SI, AllOne))
- continue;
- // We are unswitching ~0 out.
- UnswitchVal = AllOne;
- } else {
- assert(OpChain == OC_OpChainNone &&
- "Expect to unswitch on trivial chain");
- // Do not process same value again and again.
- // At this point we have some cases already unswitched and
- // some not yet unswitched. Let's find the first not yet unswitched one.
- for (auto Case : SI->cases()) {
- Constant *UnswitchValCandidate = Case.getCaseValue();
- if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
- UnswitchVal = UnswitchValCandidate;
- break;
- }
- }
- }
-
- if (!UnswitchVal)
- continue;
-
- if (unswitchIfProfitable(LoopCond, UnswitchVal)) {
- ++NumSwitches;
- // In case of a full LIV, UnswitchVal is the value we unswitched out.
- // In case of a partial LIV, we only unswitch when its an AND-chain
- // or OR-chain. In both cases switch input value simplifies to
- // UnswitchVal.
- BranchesInfo.setUnswitched(SI, UnswitchVal);
- return true;
- }
- }
- }
-
- // Scan the instructions to check for unswitchable values.
- for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
- BBI != E; ++BBI)
- if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
- Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
- if (LoopCond &&
- unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
- ++NumSelects;
- return true;
- }
- }
- }
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+ }
+ }
+
+ for (IntrinsicInst *Guard : Guards) {
+ Value *LoopCond = findLIVLoopCondition(Guard->getOperand(0), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+ if (LoopCond &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ // NB! Unswitching (if successful) could have erased some of the
+ // instructions in Guards leaving dangling pointers there. This is fine
+ // because we're returning now, and won't look at Guards again.
+ ++NumGuards;
+ return true;
+ }
+ }
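// The loop above unswitches on conditions of llvm.experimental.guard calls
// that are loop invariant. A rough source-level analog (illustrative only,
// not code from this pass; 'deoptimize' is a made-up stand-in for the guard's
// deoptimization exit, and guards may legally be evaluated speculatively,
// which is what allows hoisting the check in front of the loop):
static void deoptimize() {}

static void guardedBefore(int *a, int n, bool inv) {
  for (int i = 0; i < n; ++i) {
    if (!inv) {            // guard on a loop-invariant condition
      deoptimize();
      return;
    }
    a[i] = i;
  }
}

static void guardedAfter(int *a, int n, bool inv) {
  if (!inv) {              // checked once, in the preheader
    deoptimize();
    return;
  }
  for (int i = 0; i < n; ++i)
    a[i] = i;              // the hot loop no longer re-tests the condition
}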
+
+ // Loop over all of the basic blocks in the loop. If we find an interior
+ // block that is branching on a loop-invariant condition, we can unswitch this
+ // loop.
+ for (Loop::block_iterator I = CurrentLoop->block_begin(),
+ E = CurrentLoop->block_end();
+ I != E; ++I) {
+ Instruction *TI = (*I)->getTerminator();
+
+ // Unswitching on a potentially uninitialized predicate is not
+ // MSan-friendly. Limit this to the cases when the original predicate is
+ // guaranteed to execute, to avoid creating a use-of-uninitialized-value
+ // in the code that did not have one.
+ // This is a workaround for the discrepancy between LLVM IR and MSan
+ // semantics. See PR28054 for more details.
+ if (SanitizeMemory &&
+ !SafetyInfo.isGuaranteedToExecute(*TI, DT, CurrentLoop))
+ continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ // Some branches may be rendered unreachable because of previous
+ // unswitching.
+ // Unswitch only those branches that are reachable.
+ if (isUnreachableDueToPreviousUnswitching(*I))
+ continue;
+
+ // If this isn't branching on an invariant condition, we can't unswitch
+ // it.
+ if (BI->isConditional()) {
+ // See if this, or some part of it, is loop invariant. If so, we can
+ // unswitch on it if we desire.
+ Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+ if (LoopCond && !equalityPropUnSafe(*LoopCond) &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
+ ++NumBranches;
+ return true;
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ findLIVLoopCondition(SC, CurrentLoop, Changed, MSSAU.get());
+
+ unsigned NumCases = SI->getNumCases();
+ if (LoopCond && NumCases) {
+ // Find a value to unswitch on:
+        // FIXME: this should choose the most expensive case!
+ // FIXME: scan for a case with a non-critical edge?
+ Constant *UnswitchVal = nullptr;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+          // If the chain only has ANDs and the switch has a case value of 0,
+          // dropping a 0 into the chain will unswitch out the 0 case value.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+          // If the chain only has ORs and the switch has a case value of ~0,
+          // dropping a ~0 into the chain will unswitch out the ~0 case value.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+ // Do not process same value again and again.
+ // At this point we have some cases already unswitched and
+ // some not yet unswitched. Let's find the first not yet unswitched one.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
+ }
+ }
+
+ if (!UnswitchVal)
+ continue;
+
+ if (unswitchIfProfitable(LoopCond, UnswitchVal)) {
+ ++NumSwitches;
+ // In case of a full LIV, UnswitchVal is the value we unswitched out.
+          // In case of a partial LIV, we only unswitch when it's an AND-chain
+          // or OR-chain. In both cases the switch input value simplifies to
+ // UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
+ return true;
+ }
+ }
+ }
+
+ // Scan the instructions to check for unswitchable values.
+ for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+ BBI != E; ++BBI)
+ if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+ Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+ if (LoopCond &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ ++NumSelects;
+ return true;
+ }
+ }
+ }
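// Illustrative sketch of the AND-chain case handled above (not code from this
// pass; all names are made up): when the switch condition is a chain of ANDs
// that includes the loop-invariant value Inv, the clone of the loop built for
// Inv == 0 folds the whole chain to 0, so its 0 case is statically taken.
// Dually, an OR chain folds to all-ones in the Inv == ~0 clone.
static int andChainDemo(int Var, int Inv) {
  switch (Var & Inv) { // an OC_OpChainAnd-style condition
  case 0:
    return 1;          // taken unconditionally in the Inv == 0 loop version
  default:
    return 2;
  }
}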
  // Check if there is a header condition that is invariant along the path from
// either the true or false successors to the header. This allows unswitching
@@ -1000,102 +1000,102 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
- return Changed;
-}
-
-/// Check to see if all paths from BB exit the loop with no side effects
-/// (including infinite loops).
-///
-/// If true, we return true and set ExitBB to the block we
-/// exit through.
-///
-static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
- BasicBlock *&ExitBB,
- std::set<BasicBlock*> &Visited) {
- if (!Visited.insert(BB).second) {
- // Already visited. Without more analysis, this could indicate an infinite
- // loop.
- return false;
- }
- if (!L->contains(BB)) {
- // Otherwise, this is a loop exit, this is fine so long as this is the
- // first exit.
- if (ExitBB) return false;
- ExitBB = BB;
- return true;
- }
-
- // Otherwise, this is an unvisited intra-loop node. Check all successors.
- for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
- // Check to see if the successor is a trivial loop exit.
- if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
- return false;
- }
-
- // Okay, everything after this looks good, check to make sure that this block
- // doesn't include any side effects.
- for (Instruction &I : *BB)
- if (I.mayHaveSideEffects())
- return false;
-
- return true;
-}
-
-/// Return true if the specified block unconditionally leads to an exit from
-/// the specified loop, and has no side-effects in the process. If so, return
-/// the block that is exited to, otherwise return null.
-static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
- std::set<BasicBlock*> Visited;
- Visited.insert(L->getHeader()); // Branches to header make infinite loops.
- BasicBlock *ExitBB = nullptr;
- if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
- return ExitBB;
- return nullptr;
-}
-
-/// We have found that we can unswitch CurrentLoop when LoopCond == Val to
-/// simplify the loop. If we decide that this is profitable,
-/// unswitch the loop, reprocess the pieces, then return true.
-bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
+ return Changed;
+}
+
+/// Check to see if all paths from BB exit the loop with no side effects
+/// (including infinite loops).
+///
+/// If true, we return true and set ExitBB to the block we
+/// exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+ BasicBlock *&ExitBB,
+ std::set<BasicBlock*> &Visited) {
+ if (!Visited.insert(BB).second) {
+ // Already visited. Without more analysis, this could indicate an infinite
+ // loop.
+ return false;
+ }
+ if (!L->contains(BB)) {
+ // Otherwise, this is a loop exit, this is fine so long as this is the
+ // first exit.
+ if (ExitBB) return false;
+ ExitBB = BB;
+ return true;
+ }
+
+ // Otherwise, this is an unvisited intra-loop node. Check all successors.
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+ // Check to see if the successor is a trivial loop exit.
+ if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+ return false;
+ }
+
+ // Okay, everything after this looks good, check to make sure that this block
+ // doesn't include any side effects.
+ for (Instruction &I : *BB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ return true;
+}
+
+/// Return true if the specified block unconditionally leads to an exit from
+/// the specified loop, and has no side-effects in the process. If so, return
+/// the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+ std::set<BasicBlock*> Visited;
+ Visited.insert(L->getHeader()); // Branches to header make infinite loops.
+ BasicBlock *ExitBB = nullptr;
+ if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+ return ExitBB;
+ return nullptr;
+}
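// Illustrative sketch of the "trivial loop exit" shape the helpers above look
// for (not code from this pass): once the invariant test is taken, control
// leaves the loop through a single exit block and executes no side effects.
static void trivialExitShape(int *a, int n, bool stop) {
  for (int i = 0; i < n; ++i) {
    if (stop)    // loop-invariant; the exiting path performs no side effects
      return;    // ...and funnels into one exit block
    a[i] = 0;    // the store (a side effect) stays on the non-exiting path
  }
}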
+
+/// We have found that we can unswitch CurrentLoop when LoopCond == Val to
+/// simplify the loop. If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
Instruction *TI,
ArrayRef<Instruction *> ToDuplicate) {
- // Check to see if it would be profitable to unswitch current loop.
- if (!BranchesInfo.costAllowsUnswitching()) {
- LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
- << CurrentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Cost too high.\n");
- return false;
- }
- if (HasBranchDivergence &&
- getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
- LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
- << CurrentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Condition is divergent.\n");
- return false;
- }
-
+ // Check to see if it would be profitable to unswitch current loop.
+ if (!BranchesInfo.costAllowsUnswitching()) {
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << CurrentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Cost too high.\n");
+ return false;
+ }
+ if (HasBranchDivergence &&
+ getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << CurrentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
+ return false;
+ }
+
unswitchNontrivialCondition(LoopCond, Val, CurrentLoop, TI, ToDuplicate);
- return true;
-}
-
-/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
-/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
-/// and remove (but not erase!) it from the function.
+ return true;
+}
+
+/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
+/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
+/// and remove (but not erase!) it from the function.
void LoopUnswitch::emitPreheaderBranchOnCondition(
Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest,
BranchInst *OldBranch, Instruction *TI,
ArrayRef<Instruction *> ToDuplicate) {
- assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
- assert(TrueDest != FalseDest && "Branch targets should be different");
+ assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
+ assert(TrueDest != FalseDest && "Branch targets should be different");
- // Insert a conditional branch on LIC to the two preheaders. The original
- // code is the true version and the new code is the false version.
- Value *BranchVal = LIC;
- bool Swapped = false;
+ // Insert a conditional branch on LIC to the two preheaders. The original
+ // code is the true version and the new code is the false version.
+ Value *BranchVal = LIC;
+ bool Swapped = false;
if (!ToDuplicate.empty()) {
ValueToValueMapTy Old2New;
@@ -1141,450 +1141,450 @@ void LoopUnswitch::emitPreheaderBranchOnCondition(
std::swap(TrueDest, FalseDest);
Swapped = true;
}
- }
-
- // Old branch will be removed, so save its parent and successor to update the
- // DomTree.
- auto *OldBranchSucc = OldBranch->getSuccessor(0);
- auto *OldBranchParent = OldBranch->getParent();
-
- // Insert the new branch.
- BranchInst *BI =
- IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
- if (Swapped)
- BI->swapProfMetadata();
-
- // Remove the old branch so there is only one branch at the end. This is
- // needed to perform DomTree's internal DFS walk on the function's CFG.
- OldBranch->removeFromParent();
-
- // Inform the DT about the new branch.
- if (DT) {
- // First, add both successors.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- if (TrueDest != OldBranchSucc)
- Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
- if (FalseDest != OldBranchSucc)
- Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
- // If both of the new successors are different from the old one, inform the
- // DT that the edge was deleted.
- if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
- Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
- }
-
- if (MSSAU)
+ }
+
+ // Old branch will be removed, so save its parent and successor to update the
+ // DomTree.
+ auto *OldBranchSucc = OldBranch->getSuccessor(0);
+ auto *OldBranchParent = OldBranch->getParent();
+
+ // Insert the new branch.
+ BranchInst *BI =
+ IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
+ if (Swapped)
+ BI->swapProfMetadata();
+
+ // Remove the old branch so there is only one branch at the end. This is
+ // needed to perform DomTree's internal DFS walk on the function's CFG.
+ OldBranch->removeFromParent();
+
+ // Inform the DT about the new branch.
+ if (DT) {
+ // First, add both successors.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ if (TrueDest != OldBranchSucc)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
+ if (FalseDest != OldBranchSucc)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
+ // If both of the new successors are different from the old one, inform the
+ // DT that the edge was deleted.
+ if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
+ Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
+ }
+
+ if (MSSAU)
MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
else
DT->applyUpdates(Updates);
- }
-
- // If either edge is critical, split it. This helps preserve LoopSimplify
- // form for enclosing loops.
- auto Options =
- CriticalEdgeSplittingOptions(DT, LI, MSSAU.get()).setPreserveLCSSA();
- SplitCriticalEdge(BI, 0, Options);
- SplitCriticalEdge(BI, 1, Options);
-}
-
-/// Given a loop that has a trivial unswitchable condition in it (a cond branch
-/// from its header block to its latch block, where the path through the loop
-/// that doesn't execute its body has no side-effects), unswitch it. This
-/// doesn't involve any code duplication, just moving the conditional branch
-/// outside of the loop and updating loop info.
-void LoopUnswitch::unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
- BasicBlock *ExitBlock,
- Instruction *TI) {
- LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
- << LoopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function "
- << L->getHeader()->getParent()->getName()
- << " on cond: " << *Val << " == " << *Cond << "\n");
- // We are going to make essential changes to CFG. This may invalidate cached
- // information for L or one of its parent loops in SCEV.
- if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
- SEWP->getSE().forgetTopmostLoop(L);
-
- // First step, split the preheader, so that we know that there is a safe place
- // to insert the conditional branch. We will change LoopPreheader to have a
- // conditional branch on Cond.
- BasicBlock *NewPH = SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
-
- // Now that we have a place to insert the conditional branch, create a place
- // to branch to: this is the exit block out of the loop that we should
- // short-circuit to.
-
- // Split this block now, so that the loop maintains its exit block, and so
- // that the jump from the preheader can execute the contents of the exit block
- // without actually branching to it (the exit block should be dominated by the
- // loop header, not the preheader).
- assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
- BasicBlock *NewExit =
- SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI, MSSAU.get());
-
- // Okay, now we have a position to branch from and a position to branch to,
- // insert the new conditional branch.
- auto *OldBranch = dyn_cast<BranchInst>(LoopPreheader->getTerminator());
- assert(OldBranch && "Failed to split the preheader");
- emitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
-
- // emitPreheaderBranchOnCondition removed the OldBranch from the function.
- // Delete it, as it is no longer needed.
- delete OldBranch;
-
- // We need to reprocess this loop, it could be unswitched again.
- RedoLoop = true;
-
- // Now that we know that the loop is never entered when this condition is a
- // particular value, rewrite the loop with this info. We know that this will
- // at least eliminate the old branch.
- rewriteLoopBodyWithConditionConstant(L, Cond, Val, /*IsEqual=*/false);
-
- ++NumTrivial;
-}
-
-/// Check if the first non-constant condition starting from the loop header is
-/// a trivial unswitch condition: that is, a condition controls whether or not
-/// the loop does anything at all. If it is a trivial condition, unswitching
-/// produces no code duplications (equivalently, it produces a simpler loop and
-/// a new empty loop, which gets deleted). Therefore always unswitch trivial
-/// condition.
-bool LoopUnswitch::tryTrivialLoopUnswitch(bool &Changed) {
- BasicBlock *CurrentBB = CurrentLoop->getHeader();
- Instruction *CurrentTerm = CurrentBB->getTerminator();
- LLVMContext &Context = CurrentBB->getContext();
-
- // If loop header has only one reachable successor (currently via an
- // unconditional branch or constant foldable conditional branch, but
- // should also consider adding constant foldable switch instruction in
- // future), we should keep looking for trivial condition candidates in
- // the successor as well. An alternative is to constant fold conditions
- // and merge successors into loop header (then we only need to check header's
- // terminator). The reason for not doing this in LoopUnswitch pass is that
- // it could potentially break LoopPassManager's invariants. Folding dead
- // branches could either eliminate the current loop or make other loops
- // unreachable. LCSSA form might also not be preserved after deleting
- // branches. The following code keeps traversing loop header's successors
- // until it finds the trivial condition candidate (condition that is not a
- // constant). Since unswitching generates branches with constant conditions,
- // this scenario could be very common in practice.
- SmallPtrSet<BasicBlock*, 8> Visited;
-
- while (true) {
-    // If we exit the loop or reach a previously visited block, then
-    // we cannot reach any trivial condition candidates (unfoldable
- // branch instructions or switch instructions) and no unswitch
- // can happen. Exit and return false.
- if (!CurrentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
- return false;
-
- // Check if this loop will execute any side-effecting instructions (e.g.
- // stores, calls, volatile loads) in the part of the loop that the code
- // *would* execute. Check the header first.
- for (Instruction &I : *CurrentBB)
- if (I.mayHaveSideEffects())
- return false;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
- if (BI->isUnconditional()) {
- CurrentBB = BI->getSuccessor(0);
- } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
- CurrentBB = BI->getSuccessor(0);
- } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
- CurrentBB = BI->getSuccessor(1);
- } else {
- // Found a trivial condition candidate: non-foldable conditional branch.
- break;
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
- // At this point, any constant-foldable instructions should have probably
- // been folded.
- ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
- if (!Cond)
- break;
- // Find the target block we are definitely going to.
- CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
- } else {
- // We do not understand these terminator instructions.
- break;
- }
-
- CurrentTerm = CurrentBB->getTerminator();
- }
-
-  // CondVal is the condition value that triggers the trivial exit.
-  // LoopExitBB is the BasicBlock the loop exits to when the trivial
-  // condition is met.
- Constant *CondVal = nullptr;
- BasicBlock *LoopExitBB = nullptr;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
- // If this isn't branching on an invariant condition, we can't unswitch it.
- if (!BI->isConditional())
- return false;
-
- Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
-
- // Unswitch only if the trivial condition itself is an LIV (not
- // partial LIV which could occur in and/or)
- if (!LoopCond || LoopCond != BI->getCondition())
- return false;
-
- // Check to see if a successor of the branch is guaranteed to
- // exit through a unique exit block without having any
- // side-effects. If so, determine the value of Cond that causes
- // it to do this.
- if ((LoopExitBB =
- isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(0)))) {
- CondVal = ConstantInt::getTrue(Context);
- } else if ((LoopExitBB =
- isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(1)))) {
- CondVal = ConstantInt::getFalse(Context);
- }
-
- // If we didn't find a single unique LoopExit block, or if the loop exit
- // block contains phi nodes, this isn't trivial.
- if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
- return false; // Can't handle this.
-
- if (equalityPropUnSafe(*LoopCond))
- return false;
-
- unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
- CurrentTerm);
- ++NumBranches;
- return true;
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
- // If this isn't switching on an invariant condition, we can't unswitch it.
- Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
-
- // Unswitch only if the trivial condition itself is an LIV (not
- // partial LIV which could occur in and/or)
- if (!LoopCond || LoopCond != SI->getCondition())
- return false;
-
- // Check to see if a successor of the switch is guaranteed to go to the
-    // latch block or exit through a single exit block without having any
- // side-effects. If so, determine the value of Cond that causes it to do
- // this.
- // Note that we can't trivially unswitch on the default case or
- // on already unswitched cases.
- for (auto Case : SI->cases()) {
- BasicBlock *LoopExitCandidate;
- if ((LoopExitCandidate =
- isTrivialLoopExitBlock(CurrentLoop, Case.getCaseSuccessor()))) {
- // Okay, we found a trivial case, remember the value that is trivial.
- ConstantInt *CaseVal = Case.getCaseValue();
-
-        // Check that it was not unswitched before, since already-unswitched
-        // trivial values look trivial too.
- if (BranchesInfo.isUnswitched(SI, CaseVal))
- continue;
- LoopExitBB = LoopExitCandidate;
- CondVal = CaseVal;
- break;
- }
- }
-
- // If we didn't find a single unique LoopExit block, or if the loop exit
- // block contains phi nodes, this isn't trivial.
- if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
- return false; // Can't handle this.
-
- unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
- nullptr);
-
- // We are only unswitching full LIV.
- BranchesInfo.setUnswitched(SI, CondVal);
- ++NumSwitches;
- return true;
- }
- return false;
-}
-
-/// Split all of the edges from inside the loop to their exit blocks.
-/// Update the appropriate Phi nodes as we do so.
-void LoopUnswitch::splitExitEdges(
- Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
-
- for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
- BasicBlock *ExitBlock = ExitBlocks[I];
- SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
- pred_end(ExitBlock));
-
- // Although SplitBlockPredecessors doesn't preserve loop-simplify in
- // general, if we call it on all predecessors of all exits then it does.
- SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, MSSAU.get(),
- /*PreserveLCSSA*/ true);
- }
-}
-
-/// We determined that the loop is profitable to unswitch when LIC equals Val.
-/// Split it into loop versions and test the condition outside of either loop.
-/// Return the loops created as Out1/Out2.
+ }
+
+ // If either edge is critical, split it. This helps preserve LoopSimplify
+ // form for enclosing loops.
+ auto Options =
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU.get()).setPreserveLCSSA();
+ SplitCriticalEdge(BI, 0, Options);
+ SplitCriticalEdge(BI, 1, Options);
+}
+
+/// Given a loop that has a trivial unswitchable condition in it (a cond branch
+/// from its header block to its latch block, where the path through the loop
+/// that doesn't execute its body has no side-effects), unswitch it. This
+/// doesn't involve any code duplication, just moving the conditional branch
+/// outside of the loop and updating loop info.
+void LoopUnswitch::unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock,
+ Instruction *TI) {
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << LoopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName()
+ << " on cond: " << *Val << " == " << *Cond << "\n");
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetTopmostLoop(L);
+
+ // First step, split the preheader, so that we know that there is a safe place
+ // to insert the conditional branch. We will change LoopPreheader to have a
+ // conditional branch on Cond.
+ BasicBlock *NewPH = SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we should
+ // short-circuit to.
+
+ // Split this block now, so that the loop maintains its exit block, and so
+ // that the jump from the preheader can execute the contents of the exit block
+ // without actually branching to it (the exit block should be dominated by the
+ // loop header, not the preheader).
+ assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+ BasicBlock *NewExit =
+ SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI, MSSAU.get());
+
+ // Okay, now we have a position to branch from and a position to branch to,
+ // insert the new conditional branch.
+ auto *OldBranch = dyn_cast<BranchInst>(LoopPreheader->getTerminator());
+ assert(OldBranch && "Failed to split the preheader");
+ emitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
+
+ // emitPreheaderBranchOnCondition removed the OldBranch from the function.
+ // Delete it, as it is no longer needed.
+ delete OldBranch;
+
+ // We need to reprocess this loop, it could be unswitched again.
+ RedoLoop = true;
+
+ // Now that we know that the loop is never entered when this condition is a
+ // particular value, rewrite the loop with this info. We know that this will
+ // at least eliminate the old branch.
+ rewriteLoopBodyWithConditionConstant(L, Cond, Val, /*IsEqual=*/false);
+
+ ++NumTrivial;
+}
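// Minimal source-level sketch of the trivial case handled above (illustrative
// only, not code from this pass): the invariant condition decides whether the
// loop does any work at all, so the test moves to the preheader and no code
// is duplicated.
static void trivialBefore(int *a, int n, bool skip) {
  for (int i = 0; i < n; ++i) {
    if (skip)   // loop-invariant; the exiting path has no side effects
      return;
    a[i] = 0;
  }
}

static void trivialAfter(int *a, int n, bool skip) {
  if (skip)     // tested once, in front of the loop
    return;
  for (int i = 0; i < n; ++i)
    a[i] = 0;
}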
+
+/// Check if the first non-constant condition starting from the loop header is
+/// a trivial unswitch condition: that is, a condition controls whether or not
+/// the loop does anything at all. If it is a trivial condition, unswitching
+/// produces no code duplications (equivalently, it produces a simpler loop and
+/// a new empty loop, which gets deleted). Therefore always unswitch trivial
+/// condition.
+bool LoopUnswitch::tryTrivialLoopUnswitch(bool &Changed) {
+ BasicBlock *CurrentBB = CurrentLoop->getHeader();
+ Instruction *CurrentTerm = CurrentBB->getTerminator();
+ LLVMContext &Context = CurrentBB->getContext();
+
+ // If loop header has only one reachable successor (currently via an
+ // unconditional branch or constant foldable conditional branch, but
+ // should also consider adding constant foldable switch instruction in
+ // future), we should keep looking for trivial condition candidates in
+ // the successor as well. An alternative is to constant fold conditions
+ // and merge successors into loop header (then we only need to check header's
+ // terminator). The reason for not doing this in LoopUnswitch pass is that
+ // it could potentially break LoopPassManager's invariants. Folding dead
+ // branches could either eliminate the current loop or make other loops
+ // unreachable. LCSSA form might also not be preserved after deleting
+ // branches. The following code keeps traversing loop header's successors
+ // until it finds the trivial condition candidate (condition that is not a
+ // constant). Since unswitching generates branches with constant conditions,
+ // this scenario could be very common in practice.
+ SmallPtrSet<BasicBlock*, 8> Visited;
+
+ while (true) {
+    // If we exit the loop or reach a previously visited block, then
+    // we cannot reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch
+ // can happen. Exit and return false.
+ if (!CurrentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
+ return false;
+
+ // Check if this loop will execute any side-effecting instructions (e.g.
+ // stores, calls, volatile loads) in the part of the loop that the code
+ // *would* execute. Check the header first.
+ for (Instruction &I : *CurrentBB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ if (BI->isUnconditional()) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
+ CurrentBB = BI->getSuccessor(1);
+ } else {
+ // Found a trivial condition candidate: non-foldable conditional branch.
+ break;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // At this point, any constant-foldable instructions should have probably
+ // been folded.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
+ } else {
+ // We do not understand these terminator instructions.
+ break;
+ }
+
+ CurrentTerm = CurrentBB->getTerminator();
+ }
+
+  // CondVal is the condition value that triggers the trivial exit.
+  // LoopExitBB is the BasicBlock the loop exits to when the trivial
+  // condition is met.
+ Constant *CondVal = nullptr;
+ BasicBlock *LoopExitBB = nullptr;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ // If this isn't branching on an invariant condition, we can't unswitch it.
+ if (!BI->isConditional())
+ return false;
+
+ Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+
+ // Unswitch only if the trivial condition itself is an LIV (not
+ // partial LIV which could occur in and/or)
+ if (!LoopCond || LoopCond != BI->getCondition())
+ return false;
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ if ((LoopExitBB =
+ isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(0)))) {
+ CondVal = ConstantInt::getTrue(Context);
+ } else if ((LoopExitBB =
+ isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(1)))) {
+ CondVal = ConstantInt::getFalse(Context);
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ if (equalityPropUnSafe(*LoopCond))
+ return false;
+
+ unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
+ CurrentTerm);
+ ++NumBranches;
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+
+ // Unswitch only if the trivial condition itself is an LIV (not
+ // partial LIV which could occur in and/or)
+ if (!LoopCond || LoopCond != SI->getCondition())
+ return false;
+
+ // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a single exit block without having any
+ // side-effects. If so, determine the value of Cond that causes it to do
+ // this.
+ // Note that we can't trivially unswitch on the default case or
+ // on already unswitched cases.
+ for (auto Case : SI->cases()) {
+ BasicBlock *LoopExitCandidate;
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(CurrentLoop, Case.getCaseSuccessor()))) {
+ // Okay, we found a trivial case, remember the value that is trivial.
+ ConstantInt *CaseVal = Case.getCaseValue();
+
+        // Check that it was not unswitched before, since already-unswitched
+        // trivial values look trivial too.
+ if (BranchesInfo.isUnswitched(SI, CaseVal))
+ continue;
+ LoopExitBB = LoopExitCandidate;
+ CondVal = CaseVal;
+ break;
+ }
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
+ nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
+ ++NumSwitches;
+ return true;
+ }
+ return false;
+}
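// Sketch of the switch variant of a trivial unswitch (illustrative only, not
// code from this pass): one case of a loop-invariant switch leaves the loop
// immediately and side-effect free, so that case can be tested once up front.
static int switchTrivialBefore(const int *a, int n, int mode /* invariant */) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    switch (mode) {
    case 0:
      return sum; // trivial exit: no side effects on this path
    default:
      sum += a[i];
    }
  }
  return sum;
}

static int switchTrivialAfter(const int *a, int n, int mode) {
  if (mode == 0)  // hoisted test for the trivial case; sum is still 0 here
    return 0;
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i];
  return sum;
}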
+
+/// Split all of the edges from inside the loop to their exit blocks.
+/// Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::splitExitEdges(
+ Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+
+ for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
+ BasicBlock *ExitBlock = ExitBlocks[I];
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
+ pred_end(ExitBlock));
+
+ // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+ // general, if we call it on all predecessors of all exits then it does.
+ SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, MSSAU.get(),
+ /*PreserveLCSSA*/ true);
+ }
+}
+
+/// We determined that the loop is profitable to unswitch when LIC equals Val.
+/// Split it into loop versions and test the condition outside of either loop.
+/// Return the loops created as Out1/Out2.
void LoopUnswitch::unswitchNontrivialCondition(
Value *LIC, Constant *Val, Loop *L, Instruction *TI,
ArrayRef<Instruction *> ToDuplicate) {
- Function *F = LoopHeader->getParent();
- LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
- << LoopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function " << F->getName() << " when '"
- << *Val << "' == " << *LIC << "\n");
-
- // We are going to make essential changes to CFG. This may invalidate cached
- // information for L or one of its parent loops in SCEV.
- if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
- SEWP->getSE().forgetTopmostLoop(L);
-
- LoopBlocks.clear();
- NewBlocks.clear();
-
- if (MSSAU && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- // First step, split the preheader and exit blocks, and add these blocks to
- // the LoopBlocks list.
- BasicBlock *NewPreheader =
- SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
- LoopBlocks.push_back(NewPreheader);
-
- // We want the loop to come after the preheader, but before the exit blocks.
+ Function *F = LoopHeader->getParent();
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << LoopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName() << " when '"
+ << *Val << "' == " << *LIC << "\n");
+
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetTopmostLoop(L);
+
+ LoopBlocks.clear();
+ NewBlocks.clear();
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ // First step, split the preheader and exit blocks, and add these blocks to
+ // the LoopBlocks list.
+ BasicBlock *NewPreheader =
+ SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
+ LoopBlocks.push_back(NewPreheader);
+
+ // We want the loop to come after the preheader, but before the exit blocks.
llvm::append_range(LoopBlocks, L->blocks());
-
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- // Split all of the edges from inside the loop to their exit blocks. Update
- // the appropriate Phi nodes as we do so.
- splitExitEdges(L, ExitBlocks);
-
- // The exit blocks may have been changed due to edge splitting, recompute.
- ExitBlocks.clear();
- L->getUniqueExitBlocks(ExitBlocks);
-
- // Add exit blocks to the loop blocks.
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Split all of the edges from inside the loop to their exit blocks. Update
+ // the appropriate Phi nodes as we do so.
+ splitExitEdges(L, ExitBlocks);
+
+ // The exit blocks may have been changed due to edge splitting, recompute.
+ ExitBlocks.clear();
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Add exit blocks to the loop blocks.
llvm::append_range(LoopBlocks, ExitBlocks);
-
- // Next step, clone all of the basic blocks that make up the loop (including
- // the loop preheader and exit blocks), keeping track of the mapping between
- // the instructions and blocks.
- NewBlocks.reserve(LoopBlocks.size());
- ValueToValueMapTy VMap;
- for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I) {
- BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[I], VMap, ".us", F);
-
- NewBlocks.push_back(NewBB);
- VMap[LoopBlocks[I]] = NewBB; // Keep the BB mapping.
- }
-
- // Splice the newly inserted blocks into the function right before the
- // original preheader.
- F->getBasicBlockList().splice(NewPreheader->getIterator(),
- F->getBasicBlockList(),
- NewBlocks[0]->getIterator(), F->end());
-
- // Now we create the new Loop object for the versioned loop.
- Loop *NewLoop = cloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
-
- // Recalculate unswitching quota, inherit simplified switches info for NewBB,
- // Probably clone more loop-unswitch related loop properties.
- BranchesInfo.cloneData(NewLoop, L, VMap);
-
- Loop *ParentLoop = L->getParentLoop();
- if (ParentLoop) {
- // Make sure to add the cloned preheader and exit blocks to the parent loop
- // as well.
- ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
- }
-
- for (unsigned EBI = 0, EBE = ExitBlocks.size(); EBI != EBE; ++EBI) {
- BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[EBI]]);
- // The new exit block should be in the same loop as the old one.
- if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[EBI]))
- ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
-
- assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
- "Exit block should have been split to have one successor!");
- BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
-
- // If the successor of the exit block had PHI nodes, add an entry for
- // NewExit.
- for (PHINode &PN : ExitSucc->phis()) {
- Value *V = PN.getIncomingValueForBlock(ExitBlocks[EBI]);
- ValueToValueMapTy::iterator It = VMap.find(V);
- if (It != VMap.end()) V = It->second;
- PN.addIncoming(V, NewExit);
- }
-
- if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
- PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
- &*ExitSucc->getFirstInsertionPt());
-
- for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
- I != E; ++I) {
- BasicBlock *BB = *I;
- LandingPadInst *LPI = BB->getLandingPadInst();
- LPI->replaceAllUsesWith(PN);
- PN->addIncoming(LPI, BB);
- }
- }
- }
-
- // Rewrite the code to refer to itself.
- for (unsigned NBI = 0, NBE = NewBlocks.size(); NBI != NBE; ++NBI) {
- for (Instruction &I : *NewBlocks[NBI]) {
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
-
- // Rewrite the original preheader to select between versions of the loop.
- BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator());
- assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
- "Preheader splitting did not work correctly!");
-
- if (MSSAU) {
- // Update MemorySSA after cloning, and before splitting to unreachables,
- // since that invalidates the 1:1 mapping of clones in VMap.
- LoopBlocksRPO LBRPO(L);
- LBRPO.perform(LI);
- MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, VMap);
- }
-
- // Emit the new branch that selects between the two versions of this loop.
- emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
+
+ // Next step, clone all of the basic blocks that make up the loop (including
+ // the loop preheader and exit blocks), keeping track of the mapping between
+ // the instructions and blocks.
+ NewBlocks.reserve(LoopBlocks.size());
+ ValueToValueMapTy VMap;
+ for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I) {
+ BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[I], VMap, ".us", F);
+
+ NewBlocks.push_back(NewBB);
+ VMap[LoopBlocks[I]] = NewBB; // Keep the BB mapping.
+ }
+
+ // Splice the newly inserted blocks into the function right before the
+ // original preheader.
+ F->getBasicBlockList().splice(NewPreheader->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F->end());
+
+ // Now we create the new Loop object for the versioned loop.
+ Loop *NewLoop = cloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+ // Recalculate unswitching quota, inherit simplified switches info for NewBB,
+ // Probably clone more loop-unswitch related loop properties.
+ BranchesInfo.cloneData(NewLoop, L, VMap);
+
+ Loop *ParentLoop = L->getParentLoop();
+ if (ParentLoop) {
+ // Make sure to add the cloned preheader and exit blocks to the parent loop
+ // as well.
+ ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
+ }
+
+ for (unsigned EBI = 0, EBE = ExitBlocks.size(); EBI != EBE; ++EBI) {
+ BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[EBI]]);
+ // The new exit block should be in the same loop as the old one.
+ if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[EBI]))
+ ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
+
+ assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+ // If the successor of the exit block had PHI nodes, add an entry for
+ // NewExit.
+ for (PHINode &PN : ExitSucc->phis()) {
+ Value *V = PN.getIncomingValueForBlock(ExitBlocks[EBI]);
+ ValueToValueMapTy::iterator It = VMap.find(V);
+ if (It != VMap.end()) V = It->second;
+ PN.addIncoming(V, NewExit);
+ }
+
+ if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+ PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+ &*ExitSucc->getFirstInsertionPt());
+
+ for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ LandingPadInst *LPI = BB->getLandingPadInst();
+ LPI->replaceAllUsesWith(PN);
+ PN->addIncoming(LPI, BB);
+ }
+ }
+ }
+
+ // Rewrite the code to refer to itself.
+ for (unsigned NBI = 0, NBE = NewBlocks.size(); NBI != NBE; ++NBI) {
+ for (Instruction &I : *NewBlocks[NBI]) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Rewrite the original preheader to select between versions of the loop.
+ BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+ "Preheader splitting did not work correctly!");
+
+ if (MSSAU) {
+ // Update MemorySSA after cloning, and before splitting to unreachables,
+ // since that invalidates the 1:1 mapping of clones in VMap.
+ LoopBlocksRPO LBRPO(L);
+ LBRPO.perform(LI);
+ MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, VMap);
+ }
+
+ // Emit the new branch that selects between the two versions of this loop.
+ emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
TI, ToDuplicate);
- if (MSSAU) {
- // Update MemoryPhis in Exit blocks.
- MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT);
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- }
-
- // The OldBr was replaced by a new one and removed (but not erased) by
- // emitPreheaderBranchOnCondition. It is no longer needed, so delete it.
- delete OldBR;
-
- LoopProcessWorklist.push_back(NewLoop);
- RedoLoop = true;
-
-  // Keep a WeakTrackingVH holding onto LIC. If the first call to
-  // RewriteLoopBody deletes the instruction (for example by simplifying a PHI
-  // that feeds into the condition that we're unswitching on), we don't
-  // rewrite the second iteration.
- WeakTrackingVH LICHandle(LIC);
-
+ if (MSSAU) {
+ // Update MemoryPhis in Exit blocks.
+ MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+
+ // The OldBr was replaced by a new one and removed (but not erased) by
+ // emitPreheaderBranchOnCondition. It is no longer needed, so delete it.
+ delete OldBR;
+
+ LoopProcessWorklist.push_back(NewLoop);
+ RedoLoop = true;
+
+  // Keep a WeakTrackingVH holding onto LIC. If the first call to
+  // RewriteLoopBody deletes the instruction (for example by simplifying a PHI
+  // that feeds into the condition that we're unswitching on), we don't
+  // rewrite the second iteration.
+ WeakTrackingVH LICHandle(LIC);
+
if (ToDuplicate.empty()) {
// Now we rewrite the original code to know that the condition is true and
// the new code to know that the condition is false.
rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/false);
-
+
// It's possible that simplifying one loop could cause the other to be
    // changed to another value or a constant. If it's a constant, don't
// simplify it.
@@ -1601,7 +1601,7 @@ void LoopUnswitch::unswitchNontrivialCondition(
/*IsEqual=*/true);
} else
rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/true);
-
+
// Mark the new loop as partially unswitched, to avoid unswitching on the
// same condition again.
auto &Context = NewLoop->getHeader()->getContext();
@@ -1613,270 +1613,270 @@ void LoopUnswitch::unswitchNontrivialCondition(
NewLoop->setLoopID(NewLoopID);
}
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-}
-
-/// Remove all instances of I from the worklist vector specified.
-static void removeFromWorklist(Instruction *I,
- std::vector<Instruction *> &Worklist) {
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+}
+
+/// Remove all instances of I from the worklist vector specified.
+static void removeFromWorklist(Instruction *I,
+ std::vector<Instruction *> &Worklist) {
llvm::erase_value(Worklist, I);
-}
-
-/// When we find that I really equals V, remove I from the
-/// program, replacing all uses with V and update the worklist.
-static void replaceUsesOfWith(Instruction *I, Value *V,
- std::vector<Instruction *> &Worklist, Loop *L,
- LPPassManager *LPM, MemorySSAUpdater *MSSAU) {
- LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
-
- // Add uses to the worklist, which may be dead now.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
- Worklist.push_back(Use);
-
- // Add users to the worklist which may be simplified now.
- for (User *U : I->users())
- Worklist.push_back(cast<Instruction>(U));
- removeFromWorklist(I, Worklist);
- I->replaceAllUsesWith(V);
- if (!I->mayHaveSideEffects()) {
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
- I->eraseFromParent();
- }
- ++NumSimplify;
-}
-
-/// We know either that the value LIC has the value specified by Val in the
-/// specified loop, or we know it does NOT have that value.
-/// Rewrite any uses of LIC or of properties correlated to it.
-void LoopUnswitch::rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
- Constant *Val,
- bool IsEqual) {
- assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
-
- // FIXME: Support correlated properties, like:
- // for (...)
- // if (li1 < li2)
- // ...
- // if (li1 > li2)
- // ...
-
- // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
- // selects, switches.
- std::vector<Instruction*> Worklist;
- LLVMContext &Context = Val->getContext();
-
- // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
- // in the loop with the appropriate one directly.
- if (IsEqual || (isa<ConstantInt>(Val) &&
- Val->getType()->isIntegerTy(1))) {
- Value *Replacement;
- if (IsEqual)
- Replacement = Val;
- else
- Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
- !cast<ConstantInt>(Val)->getZExtValue());
-
- for (User *U : LIC->users()) {
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || !L->contains(UI))
- continue;
- Worklist.push_back(UI);
- }
-
- for (Instruction *UI : Worklist)
- UI->replaceUsesOfWith(LIC, Replacement);
-
- simplifyCode(Worklist, L);
- return;
- }
-
- // Otherwise, we don't know the precise value of LIC, but we do know that it
- // is certainly NOT "Val". As such, simplify any uses in the loop that we
- // can. This case occurs when we unswitch switch statements.
- for (User *U : LIC->users()) {
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || !L->contains(UI))
- continue;
-
- // At this point, we know LIC is definitely not Val. Try to use some simple
- // logic to simplify the user w.r.t. to the context.
- if (Value *Replacement = simplifyInstructionWithNotEqual(UI, LIC, Val)) {
- if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
- // This in-loop instruction has been simplified w.r.t. its context,
- // i.e. LIC != Val, make sure we propagate its replacement value to
- // all its users.
- //
-        // We cannot delete UI, the LIC user, yet, because that would
-        // invalidate the LIC->users() iterator. However, we can make this
-        // instruction dead by replacing all its users and pushing it onto
-        // the worklist so that it can be properly deleted and its operands
-        // simplified.
- UI->replaceAllUsesWith(Replacement);
- }
- }
-
- // This is a LIC user, push it into the worklist so that simplifyCode can
- // attempt to simplify it.
- Worklist.push_back(UI);
-
- // If we know that LIC is not Val, use this info to simplify code.
- SwitchInst *SI = dyn_cast<SwitchInst>(UI);
- if (!SI || !isa<ConstantInt>(Val)) continue;
-
- // NOTE: if a case value for the switch is unswitched out, we record it
- // after the unswitch finishes. We can not record it here as the switch
- // is not a direct user of the partial LIV.
- SwitchInst::CaseHandle DeadCase =
- *SI->findCaseValue(cast<ConstantInt>(Val));
- // Default case is live for multiple values.
- if (DeadCase == *SI->case_default())
- continue;
-
- // Found a dead case value. Don't remove PHI nodes in the
- // successor if they become single-entry, those PHI nodes may
- // be in the Users list.
-
- BasicBlock *Switch = SI->getParent();
- BasicBlock *SISucc = DeadCase.getCaseSuccessor();
- BasicBlock *Latch = L->getLoopLatch();
-
- if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
- // If the DeadCase successor dominates the loop latch, then the
- // transformation isn't safe since it will delete the sole predecessor edge
- // to the latch.
- if (Latch && DT->dominates(SISucc, Latch))
- continue;
-
- // FIXME: This is a hack. We need to keep the successor around
- // and hooked up so as to preserve the loop structure, because
- // trying to update it is complicated. So instead we preserve the
- // loop structure and put the block on a dead code path.
- SplitEdge(Switch, SISucc, DT, LI, MSSAU.get());
- // Compute the successors instead of relying on the return value
- // of SplitEdge, since it may have split the switch successor
- // after PHI nodes.
- BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
- BasicBlock *OldSISucc = *succ_begin(NewSISucc);
- // Create an "unreachable" destination.
- BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
- Switch->getParent(),
- OldSISucc);
- new UnreachableInst(Context, Abort);
- // Force the new case destination to branch to the "unreachable"
- // block while maintaining a (dead) CFG edge to the old block.
- NewSISucc->getTerminator()->eraseFromParent();
- BranchInst::Create(Abort, OldSISucc,
- ConstantInt::getTrue(Context), NewSISucc);
- // Release the PHI operands for this edge.
- for (PHINode &PN : NewSISucc->phis())
- PN.setIncomingValueForBlock(Switch, UndefValue::get(PN.getType()));
- // Tell the domtree about the new block. We don't fully update the
- // domtree here -- instead we force it to do a full recomputation
- // after the pass is complete -- but we do need to inform it of
- // new blocks.
- DT->addNewBlock(Abort, NewSISucc);
- }
-
- simplifyCode(Worklist, L);
-}
-
-/// Now that we have simplified some instructions in the loop, walk over it and
-/// constant prop, dce, and fold control flow where possible. Note that this is
-/// effectively a very simple loop-structure-aware optimizer. During processing
-/// of this loop, L could very well be deleted, so it must not be used.
-///
-/// FIXME: When the loop optimizer is more mature, separate this out to a new
-/// pass.
-///
-void LoopUnswitch::simplifyCode(std::vector<Instruction *> &Worklist, Loop *L) {
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- while (!Worklist.empty()) {
- Instruction *I = Worklist.back();
- Worklist.pop_back();
-
- // Simple DCE.
- if (isInstructionTriviallyDead(I)) {
- LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
-
- // Add uses to the worklist, which may be dead now.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
- Worklist.push_back(Use);
- removeFromWorklist(I, Worklist);
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
- I->eraseFromParent();
- ++NumSimplify;
- continue;
- }
-
- // See if instruction simplification can hack this up. This is common for
- // things like "select false, X, Y" after unswitching made the condition be
- // 'false'. TODO: update the domtree properly so we can pass it here.
- if (Value *V = SimplifyInstruction(I, DL))
- if (LI->replacementPreservesLCSSAForm(I, V)) {
- replaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get());
- continue;
- }
-
- // Special case hacks that appear commonly in unswitched code.
- if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (BI->isUnconditional()) {
- // If BI's parent is the only pred of the successor, fold the two blocks
- // together.
- BasicBlock *Pred = BI->getParent();
- (void)Pred;
- BasicBlock *Succ = BI->getSuccessor(0);
- BasicBlock *SinglePred = Succ->getSinglePredecessor();
- if (!SinglePred) continue; // Nothing to do.
- assert(SinglePred == Pred && "CFG broken");
-
- // Make the LPM and Worklist updates specific to LoopUnswitch.
- removeFromWorklist(BI, Worklist);
- auto SuccIt = Succ->begin();
- while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) {
- for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It)
- if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It)))
- Worklist.push_back(Use);
- for (User *U : PN->users())
- Worklist.push_back(cast<Instruction>(U));
- removeFromWorklist(PN, Worklist);
- ++NumSimplify;
- }
- // Merge the block and make the remaining analyses updates.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get());
- ++NumSimplify;
- continue;
- }
-
- continue;
- }
- }
-}
-
-/// Simple simplifications we can do given the information that Cond is
-/// definitely not equal to Val.
-Value *LoopUnswitch::simplifyInstructionWithNotEqual(Instruction *Inst,
- Value *Invariant,
- Constant *Val) {
- // icmp eq cond, val -> false
- ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
- if (CI && CI->isEquality()) {
- Value *Op0 = CI->getOperand(0);
- Value *Op1 = CI->getOperand(1);
- if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
- LLVMContext &Ctx = Inst->getContext();
- if (CI->getPredicate() == CmpInst::ICMP_EQ)
- return ConstantInt::getFalse(Ctx);
- else
- return ConstantInt::getTrue(Ctx);
- }
- }
-
- // FIXME: there may be other opportunities, e.g. comparison with floating
- // point, or Invariant - Val != 0, etc.
- return nullptr;
-}
+}
+
+/// When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void replaceUsesOfWith(Instruction *I, Value *V,
+ std::vector<Instruction *> &Worklist, Loop *L,
+ LPPassManager *LPM, MemorySSAUpdater *MSSAU) {
+ LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+
+ // Add users to the worklist which may be simplified now.
+ for (User *U : I->users())
+ Worklist.push_back(cast<Instruction>(U));
+ removeFromWorklist(I, Worklist);
+ I->replaceAllUsesWith(V);
+ if (!I->mayHaveSideEffects()) {
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+ I->eraseFromParent();
+ }
+ ++NumSimplify;
+}
+
+/// We know either that the value LIC has the value specified by Val in the
+/// specified loop, or we know it does NOT have that value.
+/// Rewrite any uses of LIC or of properties correlated to it.
+void LoopUnswitch::rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val,
+ bool IsEqual) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // FIXME: Support correlated properties, like:
+ // for (...)
+ // if (li1 < li2)
+ // ...
+ // if (li1 > li2)
+ // ...
+
+ // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
+ // selects, switches.
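+  // Illustrative sketch, not part of the original source: suppose we
+  // unswitched on a boolean %lic and IsEqual is true with Val == i1 true.
+  // Then an in-loop use such as
+  //   %c = and i1 %x, %lic
+  //   br i1 %c, label %then, label %else
+  // becomes "and i1 %x, true" after the replacement below, and
+  // simplifyCode() folds the branch condition down to %x.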
+ std::vector<Instruction*> Worklist;
+ LLVMContext &Context = Val->getContext();
+
+ // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+ // in the loop with the appropriate one directly.
+ if (IsEqual || (isa<ConstantInt>(Val) &&
+ Val->getType()->isIntegerTy(1))) {
+ Value *Replacement;
+ if (IsEqual)
+ Replacement = Val;
+ else
+ Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
+ !cast<ConstantInt>(Val)->getZExtValue());
+
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+ Worklist.push_back(UI);
+ }
+
+ for (Instruction *UI : Worklist)
+ UI->replaceUsesOfWith(LIC, Replacement);
+
+ simplifyCode(Worklist, L);
+ return;
+ }
+
+ // Otherwise, we don't know the precise value of LIC, but we do know that it
+ // is certainly NOT "Val". As such, simplify any uses in the loop that we
+ // can. This case occurs when we unswitch switch statements.
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+
+ // At this point, we know LIC is definitely not Val. Try to use some simple
+    // logic to simplify the user w.r.t. the context.
+ if (Value *Replacement = simplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+ // This in-loop instruction has been simplified w.r.t. its context,
+        // i.e. LIC != Val; make sure we propagate its replacement value to
+ // all its users.
+ //
+        // We cannot delete UI, the LIC user, yet, because that would
+        // invalidate the LIC->users() iterator. However, we can make this
+        // instruction dead by replacing all of its uses and pushing it onto
+        // the worklist so that it can be properly deleted and its operands
+        // simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
+
+ // This is a LIC user, push it into the worklist so that simplifyCode can
+ // attempt to simplify it.
+ Worklist.push_back(UI);
+
+ // If we know that LIC is not Val, use this info to simplify code.
+ SwitchInst *SI = dyn_cast<SwitchInst>(UI);
+ if (!SI || !isa<ConstantInt>(Val)) continue;
+
+ // NOTE: if a case value for the switch is unswitched out, we record it
+    // after the unswitch finishes. We cannot record it here as the switch
+ // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
+ // Default case is live for multiple values.
+ if (DeadCase == *SI->case_default())
+ continue;
+
+ // Found a dead case value. Don't remove PHI nodes in the
+    // successor if they become single-entry; those PHI nodes may
+ // be in the Users list.
+
+ BasicBlock *Switch = SI->getParent();
+ BasicBlock *SISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *Latch = L->getLoopLatch();
+
+ if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
+ // If the DeadCase successor dominates the loop latch, then the
+ // transformation isn't safe since it will delete the sole predecessor edge
+ // to the latch.
+ if (Latch && DT->dominates(SISucc, Latch))
+ continue;
+
+ // FIXME: This is a hack. We need to keep the successor around
+ // and hooked up so as to preserve the loop structure, because
+ // trying to update it is complicated. So instead we preserve the
+ // loop structure and put the block on a dead code path.
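+    // Illustrative sketch, not part of the original source: after the
+    // SplitEdge and rewiring below, the dead case roughly becomes
+    //   NewSISucc: br i1 true, label %us-unreachable, label %OldSISucc
+    // so the old successor is kept reachable only through a never-taken
+    // branch edge, preserving the loop structure for later passes.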
+ SplitEdge(Switch, SISucc, DT, LI, MSSAU.get());
+ // Compute the successors instead of relying on the return value
+ // of SplitEdge, since it may have split the switch successor
+ // after PHI nodes.
+ BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *OldSISucc = *succ_begin(NewSISucc);
+ // Create an "unreachable" destination.
+ BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
+ Switch->getParent(),
+ OldSISucc);
+ new UnreachableInst(Context, Abort);
+ // Force the new case destination to branch to the "unreachable"
+ // block while maintaining a (dead) CFG edge to the old block.
+ NewSISucc->getTerminator()->eraseFromParent();
+ BranchInst::Create(Abort, OldSISucc,
+ ConstantInt::getTrue(Context), NewSISucc);
+ // Release the PHI operands for this edge.
+ for (PHINode &PN : NewSISucc->phis())
+ PN.setIncomingValueForBlock(Switch, UndefValue::get(PN.getType()));
+ // Tell the domtree about the new block. We don't fully update the
+ // domtree here -- instead we force it to do a full recomputation
+ // after the pass is complete -- but we do need to inform it of
+ // new blocks.
+ DT->addNewBlock(Abort, NewSISucc);
+ }
+
+ simplifyCode(Worklist, L);
+}
+
+/// Now that we have simplified some instructions in the loop, walk over it and
+/// constant prop, dce, and fold control flow where possible. Note that this is
+/// effectively a very simple loop-structure-aware optimizer. During processing
+/// of this loop, L could very well be deleted, so it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::simplifyCode(std::vector<Instruction *> &Worklist, Loop *L) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ // Simple DCE.
+ if (isInstructionTriviallyDead(I)) {
+ LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+ removeFromWorklist(I, Worklist);
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+ I->eraseFromParent();
+ ++NumSimplify;
+ continue;
+ }
+
+ // See if instruction simplification can hack this up. This is common for
+ // things like "select false, X, Y" after unswitching made the condition be
+ // 'false'. TODO: update the domtree properly so we can pass it here.
+ if (Value *V = SimplifyInstruction(I, DL))
+ if (LI->replacementPreservesLCSSAForm(I, V)) {
+ replaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get());
+ continue;
+ }
+
+ // Special case hacks that appear commonly in unswitched code.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (BI->isUnconditional()) {
+ // If BI's parent is the only pred of the successor, fold the two blocks
+ // together.
+ BasicBlock *Pred = BI->getParent();
+ (void)Pred;
+ BasicBlock *Succ = BI->getSuccessor(0);
+ BasicBlock *SinglePred = Succ->getSinglePredecessor();
+ if (!SinglePred) continue; // Nothing to do.
+ assert(SinglePred == Pred && "CFG broken");
+
+ // Make the LPM and Worklist updates specific to LoopUnswitch.
+ removeFromWorklist(BI, Worklist);
+ auto SuccIt = Succ->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) {
+ for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It)
+ if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It)))
+ Worklist.push_back(Use);
+ for (User *U : PN->users())
+ Worklist.push_back(cast<Instruction>(U));
+ removeFromWorklist(PN, Worklist);
+ ++NumSimplify;
+ }
+ // Merge the block and make the remaining analyses updates.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get());
+ ++NumSimplify;
+ continue;
+ }
+
+ continue;
+ }
+ }
+}
+
+/// Simple simplifications we can do given the information that Cond is
+/// definitely not equal to Val.
+Value *LoopUnswitch::simplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq cond, val -> false
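+  // Illustrative example, not part of the original source: knowing that
+  // %cond != 7,
+  //   icmp eq i32 %cond, 7  -->  i1 false
+  //   icmp ne i32 %cond, 7  -->  i1 true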
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index c8cd007438..2ff1e84807 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -1,147 +1,147 @@
-//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// When alias analysis is uncertain about the aliasing between any two accesses,
-// it will return MayAlias. This uncertainty from alias analysis restricts LICM
-// from proceeding further. In cases where alias analysis is uncertain we might
-// use loop versioning as an alternative.
-//
-// Loop Versioning will create a version of the loop with aggressive aliasing
-// assumptions in addition to the original with conservative (default) aliasing
-// assumptions. The version of the loop making aggressive aliasing assumptions
-// will have all the memory accesses marked as no-alias. These two versions of
-// loop will be preceded by a memory runtime check. This runtime check consists
-// of bound checks for all unique memory accessed in loop, and it ensures the
-// lack of memory aliasing. The result of the runtime check determines which of
-// the loop versions is executed: If the runtime check detects any memory
-// aliasing, then the original loop is executed. Otherwise, the version with
-// aggressive aliasing assumptions is used.
-//
-// Following are the top level steps:
-//
-// a) Perform LoopVersioningLICM's feasibility check.
-// b) If loop is a candidate for versioning then create a memory bound check,
-// by considering all the memory accesses in loop body.
-// c) Clone original loop and set all memory accesses as no-alias in new loop.
-// d) Set original loop & versioned loop as a branch target of the runtime check
-// result.
-//
-// It transforms loop as shown below:
-//
-// +----------------+
-// |Runtime Memcheck|
-// +----------------+
-// |
-// +----------+----------------+----------+
-// | |
-// +---------+----------+ +-----------+----------+
-// |Orig Loop Preheader | |Cloned Loop Preheader |
-// +--------------------+ +----------------------+
-// | |
-// +--------------------+ +----------------------+
-// |Orig Loop Body | |Cloned Loop Body |
-// +--------------------+ +----------------------+
-// | |
-// +--------------------+ +----------------------+
-// |Orig Loop Exit Block| |Cloned Loop Exit Block|
-// +--------------------+ +-----------+----------+
-// | |
-// +----------+--------------+-----------+
-// |
-// +-----+----+
-// |Join Block|
-// +----------+
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// When alias analysis is uncertain about the aliasing between any two accesses,
+// it will return MayAlias. This uncertainty from alias analysis restricts LICM
+// from proceeding further. In cases where alias analysis is uncertain we might
+// use loop versioning as an alternative.
+//
+// Loop Versioning will create a version of the loop with aggressive aliasing
+// assumptions in addition to the original with conservative (default) aliasing
+// assumptions. The version of the loop making aggressive aliasing assumptions
+// will have all the memory accesses marked as no-alias. These two versions of
+// loop will be preceded by a memory runtime check. This runtime check consists
+// of bound checks for all unique memory accessed in loop, and it ensures the
+// lack of memory aliasing. The result of the runtime check determines which of
+// the loop versions is executed: If the runtime check detects any memory
+// aliasing, then the original loop is executed. Otherwise, the version with
+// aggressive aliasing assumptions is used.
+//
+// Following are the top level steps:
+//
+// a) Perform LoopVersioningLICM's feasibility check.
+// b) If loop is a candidate for versioning then create a memory bound check,
+// by considering all the memory accesses in loop body.
+// c) Clone original loop and set all memory accesses as no-alias in new loop.
+// d) Set original loop & versioned loop as a branch target of the runtime check
+// result.
+//
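+// As an illustrative example (not part of the original header), for a loop
+// such as
+//
+//   for (int i = 0; i < n; ++i)
+//     a[i] += *b;            // static AA only knows 'a[i]' and '*b' MayAlias
+//
+// the pass emits a runtime check that the accessed ranges of 'a' and 'b' do
+// not overlap; in the no-alias version a later LICM run can hoist the load
+// of '*b' out of the loop.
+//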
+// It transforms loop as shown below:
+//
+// +----------------+
+// |Runtime Memcheck|
+// +----------------+
+// |
+// +----------+----------------+----------+
+// | |
+// +---------+----------+ +-----------+----------+
+// |Orig Loop Preheader | |Cloned Loop Preheader |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Body | |Cloned Loop Body |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Exit Block| |Cloned Loop Exit Block|
+// +--------------------+ +-----------+----------+
+// | |
+// +----------+--------------+-----------+
+// |
+// +-----+----+
+// |Join Block|
+// +----------+
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/LoopVersioningLICM.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include <cassert>
-#include <memory>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-versioning-licm"
-
-static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
-
-/// Threshold minimum allowed percentage for possible
-/// invariant instructions in a loop.
-static cl::opt<float>
- LVInvarThreshold("licm-versioning-invariant-threshold",
- cl::desc("LoopVersioningLICM's minimum allowed percentage"
- "of possible invariant instructions per loop"),
- cl::init(25), cl::Hidden);
-
-/// Threshold for maximum allowed loop nest/depth
-static cl::opt<unsigned> LVLoopDepthThreshold(
- "licm-versioning-max-depth-threshold",
- cl::desc(
- "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
- cl::init(2), cl::Hidden);
-
-namespace {
-
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <cassert>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-versioning-licm"
+
+static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
+
+/// Threshold minimum allowed percentage for possible
+/// invariant instructions in a loop.
+static cl::opt<float>
+ LVInvarThreshold("licm-versioning-invariant-threshold",
+ cl::desc("LoopVersioningLICM's minimum allowed percentage"
+ "of possible invariant instructions per loop"),
+ cl::init(25), cl::Hidden);
+
+/// Threshold for maximum allowed loop nest/depth
+static cl::opt<unsigned> LVLoopDepthThreshold(
+ "licm-versioning-max-depth-threshold",
+ cl::desc(
+ "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
+ cl::init(2), cl::Hidden);
+
+namespace {
+
struct LoopVersioningLICMLegacyPass : public LoopPass {
- static char ID;
-
+ static char ID;
+
LoopVersioningLICMLegacyPass() : LoopPass(ID) {
initializeLoopVersioningLICMLegacyPassPass(
*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
StringRef getPassName() const override { return "Loop Versioning for LICM"; }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequiredID(LCSSAID);
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
};
-
+
struct LoopVersioningLICM {
// We don't explicitly pass in LoopAccessInfo to the constructor since the
// loop versioning might return early due to instructions that are not safe
@@ -153,429 +153,429 @@ struct LoopVersioningLICM {
: AA(AA), SE(SE), GetLAI(GetLAI),
LoopDepthThreshold(LVLoopDepthThreshold),
InvariantThreshold(LVInvarThreshold), ORE(ORE) {}
-
+
bool runOnLoop(Loop *L, LoopInfo *LI, DominatorTree *DT);
- void reset() {
- AA = nullptr;
- SE = nullptr;
- CurLoop = nullptr;
- LoadAndStoreCounter = 0;
- InvariantCounter = 0;
- IsReadOnlyLoop = true;
- ORE = nullptr;
- CurAST.reset();
- }
-
- class AutoResetter {
- public:
- AutoResetter(LoopVersioningLICM &LVLICM) : LVLICM(LVLICM) {}
- ~AutoResetter() { LVLICM.reset(); }
-
- private:
- LoopVersioningLICM &LVLICM;
- };
-
-private:
- // Current AliasAnalysis information
- AliasAnalysis *AA = nullptr;
-
- // Current ScalarEvolution
- ScalarEvolution *SE = nullptr;
-
- // Current Loop's LoopAccessInfo
- const LoopAccessInfo *LAI = nullptr;
-
+ void reset() {
+ AA = nullptr;
+ SE = nullptr;
+ CurLoop = nullptr;
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ ORE = nullptr;
+ CurAST.reset();
+ }
+
+ class AutoResetter {
+ public:
+ AutoResetter(LoopVersioningLICM &LVLICM) : LVLICM(LVLICM) {}
+ ~AutoResetter() { LVLICM.reset(); }
+
+ private:
+ LoopVersioningLICM &LVLICM;
+ };
+
+private:
+ // Current AliasAnalysis information
+ AliasAnalysis *AA = nullptr;
+
+ // Current ScalarEvolution
+ ScalarEvolution *SE = nullptr;
+
+ // Current Loop's LoopAccessInfo
+ const LoopAccessInfo *LAI = nullptr;
+
// Proxy for retrieving LoopAccessInfo.
function_ref<const LoopAccessInfo &(Loop *)> GetLAI;
- // The current loop we are working on.
- Loop *CurLoop = nullptr;
-
- // AliasSet information for the current loop.
- std::unique_ptr<AliasSetTracker> CurAST;
-
- // Maximum loop nest threshold
- unsigned LoopDepthThreshold;
-
- // Minimum invariant threshold
- float InvariantThreshold;
-
- // Counter to track num of load & store
- unsigned LoadAndStoreCounter = 0;
-
- // Counter to track num of invariant
- unsigned InvariantCounter = 0;
-
- // Read only loop marker.
- bool IsReadOnlyLoop = true;
-
- // OptimizationRemarkEmitter
- OptimizationRemarkEmitter *ORE;
-
- bool isLegalForVersioning();
- bool legalLoopStructure();
- bool legalLoopInstructions();
- bool legalLoopMemoryAccesses();
- bool isLoopAlreadyVisited();
- void setNoAliasToLoop(Loop *VerLoop);
- bool instructionSafeForVersioning(Instruction *I);
-};
-
-} // end anonymous namespace
-
-/// Check loop structure and confirms it's good for LoopVersioningLICM.
-bool LoopVersioningLICM::legalLoopStructure() {
- // Loop must be in loop simplify form.
- if (!CurLoop->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n");
- return false;
- }
- // Loop should be innermost loop, if not return false.
- if (!CurLoop->getSubLoops().empty()) {
- LLVM_DEBUG(dbgs() << " loop is not innermost\n");
- return false;
- }
- // Loop should have a single backedge, if not return false.
- if (CurLoop->getNumBackEdges() != 1) {
- LLVM_DEBUG(dbgs() << " loop has multiple backedges\n");
- return false;
- }
- // Loop must have a single exiting block, if not return false.
- if (!CurLoop->getExitingBlock()) {
- LLVM_DEBUG(dbgs() << " loop has multiple exiting block\n");
- return false;
- }
- // We only handle bottom-tested loop, i.e. loop in which the condition is
- // checked at the end of each iteration. With that we can assume that all
- // instructions in the loop are executed the same number of times.
- if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << " loop is not bottom tested\n");
- return false;
- }
- // Parallel loops must not have aliasing loop-invariant memory accesses.
- // Hence we don't need to version anything in this case.
- if (CurLoop->isAnnotatedParallel()) {
- LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
- return false;
- }
- // Loop depth more then LoopDepthThreshold are not allowed
- if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
- LLVM_DEBUG(dbgs() << " loop depth is more then threshold\n");
- return false;
- }
- // We need to be able to compute the loop trip count in order
- // to generate the bound checks.
- const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
+ // The current loop we are working on.
+ Loop *CurLoop = nullptr;
+
+ // AliasSet information for the current loop.
+ std::unique_ptr<AliasSetTracker> CurAST;
+
+ // Maximum loop nest threshold
+ unsigned LoopDepthThreshold;
+
+ // Minimum invariant threshold
+ float InvariantThreshold;
+
+ // Counter to track num of load & store
+ unsigned LoadAndStoreCounter = 0;
+
+ // Counter to track num of invariant
+ unsigned InvariantCounter = 0;
+
+ // Read only loop marker.
+ bool IsReadOnlyLoop = true;
+
+ // OptimizationRemarkEmitter
+ OptimizationRemarkEmitter *ORE;
+
+ bool isLegalForVersioning();
+ bool legalLoopStructure();
+ bool legalLoopInstructions();
+ bool legalLoopMemoryAccesses();
+ bool isLoopAlreadyVisited();
+ void setNoAliasToLoop(Loop *VerLoop);
+ bool instructionSafeForVersioning(Instruction *I);
+};
+
+} // end anonymous namespace
+
+/// Check the loop structure and confirm it's good for LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopStructure() {
+ // Loop must be in loop simplify form.
+ if (!CurLoop->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n");
+ return false;
+ }
+ // Loop should be innermost loop, if not return false.
+ if (!CurLoop->getSubLoops().empty()) {
+ LLVM_DEBUG(dbgs() << " loop is not innermost\n");
+ return false;
+ }
+ // Loop should have a single backedge, if not return false.
+ if (CurLoop->getNumBackEdges() != 1) {
+ LLVM_DEBUG(dbgs() << " loop has multiple backedges\n");
+ return false;
+ }
+ // Loop must have a single exiting block, if not return false.
+ if (!CurLoop->getExitingBlock()) {
+ LLVM_DEBUG(dbgs() << " loop has multiple exiting block\n");
+ return false;
+ }
+ // We only handle bottom-tested loop, i.e. loop in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
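+  // For example (illustrative, not from the original source): a rotated,
+  // do-while style loop whose latch block both tests the condition and
+  // exits satisfies this; a loop that can exit from its header before the
+  // body runs does not.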
+ if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << " loop is not bottom tested\n");
+ return false;
+ }
+ // Parallel loops must not have aliasing loop-invariant memory accesses.
+ // Hence we don't need to version anything in this case.
+ if (CurLoop->isAnnotatedParallel()) {
+ LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ return false;
+ }
+  // Loop depth greater than LoopDepthThreshold is not allowed.
+  if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
+    LLVM_DEBUG(dbgs() << "    loop depth is more than the threshold\n");
+ return false;
+ }
+ // We need to be able to compute the loop trip count in order
+ // to generate the bound checks.
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
if (isa<SCEVCouldNotCompute>(ExitCount)) {
- LLVM_DEBUG(dbgs() << " loop does not has trip count\n");
- return false;
- }
- return true;
-}
-
-/// Check memory accesses in loop and confirms it's good for
-/// LoopVersioningLICM.
-bool LoopVersioningLICM::legalLoopMemoryAccesses() {
- bool HasMayAlias = false;
- bool TypeSafety = false;
- bool HasMod = false;
- // Memory check:
- // Transform phase will generate a versioned loop and also a runtime check to
- // ensure the pointers are independent and they don’t alias.
- // In version variant of loop, alias meta data asserts that all access are
- // mutually independent.
- //
- // Pointers aliasing in alias domain are avoided because with multiple
- // aliasing domains we may not be able to hoist potential loop invariant
- // access out of the loop.
- //
- // Iterate over alias tracker sets, and confirm AliasSets doesn't have any
- // must alias set.
- for (const auto &I : *CurAST) {
- const AliasSet &AS = I;
- // Skip Forward Alias Sets, as this should be ignored as part of
- // the AliasSetTracker object.
- if (AS.isForwardingAliasSet())
- continue;
- // With MustAlias its not worth adding runtime bound check.
- if (AS.isMustAlias())
- return false;
- Value *SomePtr = AS.begin()->getValue();
- bool TypeCheck = true;
- // Check for Mod & MayAlias
- HasMayAlias |= AS.isMayAlias();
- HasMod |= AS.isMod();
- for (const auto &A : AS) {
- Value *Ptr = A.getValue();
- // Alias tracker should have pointers of same data type.
- TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
- }
- // At least one alias tracker should have pointers of same data type.
- TypeSafety |= TypeCheck;
- }
- // Ensure types should be of same type.
- if (!TypeSafety) {
- LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n");
- return false;
- }
- // Ensure loop body shouldn't be read only.
- if (!HasMod) {
- LLVM_DEBUG(dbgs() << " No memory modified in loop body\n");
- return false;
- }
- // Make sure alias set has may alias case.
- // If there no alias memory ambiguity, return false.
- if (!HasMayAlias) {
- LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n");
- return false;
- }
- return true;
-}
-
-/// Check loop instructions safe for Loop versioning.
-/// It returns true if it's safe else returns false.
-/// Consider following:
-/// 1) Check all load store in loop body are non atomic & non volatile.
-/// 2) Check function call safety, by ensuring its not accessing memory.
-/// 3) Loop body shouldn't have any may throw instruction.
-/// 4) Loop body shouldn't have any convergent or noduplicate instructions.
-bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
- assert(I != nullptr && "Null instruction found!");
- // Check function call safety
- if (auto *Call = dyn_cast<CallBase>(I)) {
- if (Call->isConvergent() || Call->cannotDuplicate()) {
- LLVM_DEBUG(dbgs() << " Convergent call site found.\n");
- return false;
- }
-
- if (!AA->doesNotAccessMemory(Call)) {
- LLVM_DEBUG(dbgs() << " Unsafe call site found.\n");
- return false;
- }
- }
-
- // Avoid loops with possiblity of throw
- if (I->mayThrow()) {
- LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n");
- return false;
- }
- // If current instruction is load instructions
- // make sure it's a simple load (non atomic & non volatile)
- if (I->mayReadFromMemory()) {
- LoadInst *Ld = dyn_cast<LoadInst>(I);
- if (!Ld || !Ld->isSimple()) {
- LLVM_DEBUG(dbgs() << " Found a non-simple load.\n");
- return false;
- }
- LoadAndStoreCounter++;
- Value *Ptr = Ld->getPointerOperand();
- // Check loop invariant.
- if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
- InvariantCounter++;
- }
- // If current instruction is store instruction
- // make sure it's a simple store (non atomic & non volatile)
- else if (I->mayWriteToMemory()) {
- StoreInst *St = dyn_cast<StoreInst>(I);
- if (!St || !St->isSimple()) {
- LLVM_DEBUG(dbgs() << " Found a non-simple store.\n");
- return false;
- }
- LoadAndStoreCounter++;
- Value *Ptr = St->getPointerOperand();
- // Check loop invariant.
- if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
- InvariantCounter++;
-
- IsReadOnlyLoop = false;
- }
- return true;
-}
-
-/// Check loop instructions and confirms it's good for
-/// LoopVersioningLICM.
-bool LoopVersioningLICM::legalLoopInstructions() {
- // Resetting counters.
- LoadAndStoreCounter = 0;
- InvariantCounter = 0;
- IsReadOnlyLoop = true;
- using namespace ore;
- // Iterate over loop blocks and instructions of each block and check
- // instruction safety.
- for (auto *Block : CurLoop->getBlocks())
- for (auto &Inst : *Block) {
- // If instruction is unsafe just return false.
- if (!instructionSafeForVersioning(&Inst)) {
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst)
- << " Unsafe Loop Instruction";
- });
- return false;
- }
- }
+ LLVM_DEBUG(dbgs() << " loop does not has trip count\n");
+ return false;
+ }
+ return true;
+}
+
+/// Check the loop's memory accesses and confirm they are good for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopMemoryAccesses() {
+ bool HasMayAlias = false;
+ bool TypeSafety = false;
+ bool HasMod = false;
+ // Memory check:
+ // Transform phase will generate a versioned loop and also a runtime check to
+  // ensure the pointers are independent and don't alias.
+  // In the versioned variant of the loop, alias metadata asserts that all
+  // accesses are mutually independent.
+ //
+ // Pointers aliasing in alias domain are avoided because with multiple
+ // aliasing domains we may not be able to hoist potential loop invariant
+ // access out of the loop.
+ //
+ // Iterate over alias tracker sets, and confirm AliasSets doesn't have any
+ // must alias set.
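+  // Illustrative sketch, not part of the original source: two pointer
+  // arguments 'p' and 'q' that static AA can only classify as MayAlias end
+  // up in a single may-alias AliasSet here; that ambiguity is what the
+  // runtime overlap check later resolves. A MustAlias set (e.g. two uses of
+  // the same pointer) makes versioning pointless, so we bail out below.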
+ for (const auto &I : *CurAST) {
+ const AliasSet &AS = I;
+ // Skip Forward Alias Sets, as this should be ignored as part of
+ // the AliasSetTracker object.
+ if (AS.isForwardingAliasSet())
+ continue;
+    // With MustAlias it's not worth adding a runtime bound check.
+ if (AS.isMustAlias())
+ return false;
+ Value *SomePtr = AS.begin()->getValue();
+ bool TypeCheck = true;
+ // Check for Mod & MayAlias
+ HasMayAlias |= AS.isMayAlias();
+ HasMod |= AS.isMod();
+ for (const auto &A : AS) {
+ Value *Ptr = A.getValue();
+ // Alias tracker should have pointers of same data type.
+ TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
+ }
+ // At least one alias tracker should have pointers of same data type.
+ TypeSafety |= TypeCheck;
+ }
+  // Ensure at least one alias set had pointers of the same type.
+ if (!TypeSafety) {
+ LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ return false;
+ }
+  // Ensure the loop body is not read-only.
+ if (!HasMod) {
+ LLVM_DEBUG(dbgs() << " No memory modified in loop body\n");
+ return false;
+ }
+  // Make sure at least one alias set has a may-alias case.
+  // If there is no memory-aliasing ambiguity, return false.
+ if (!HasMayAlias) {
+ LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ return false;
+ }
+ return true;
+}
+
+/// Check whether an instruction is safe for loop versioning.
+/// Returns true if it is safe, false otherwise.
+/// The following is considered:
+/// 1) All loads and stores in the loop body must be non-atomic and
+///    non-volatile.
+/// 2) Function calls must not access memory.
+/// 3) The loop body must not contain any instruction that may throw.
+/// 4) The loop body must not contain any convergent or noduplicate
+///    instructions.
+bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
+ assert(I != nullptr && "Null instruction found!");
+ // Check function call safety
+ if (auto *Call = dyn_cast<CallBase>(I)) {
+ if (Call->isConvergent() || Call->cannotDuplicate()) {
+ LLVM_DEBUG(dbgs() << " Convergent call site found.\n");
+ return false;
+ }
+
+ if (!AA->doesNotAccessMemory(Call)) {
+ LLVM_DEBUG(dbgs() << " Unsafe call site found.\n");
+ return false;
+ }
+ }
+
+  // Avoid loops that may throw.
+ if (I->mayThrow()) {
+ LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ return false;
+ }
+  // If the current instruction is a load, make sure it is a simple load
+  // (non-atomic and non-volatile).
+ if (I->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ if (!Ld || !Ld->isSimple()) {
+ LLVM_DEBUG(dbgs() << " Found a non-simple load.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = Ld->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+ }
+  // If the current instruction is a store, make sure it is a simple store
+  // (non-atomic and non-volatile).
+ else if (I->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St || !St->isSimple()) {
+ LLVM_DEBUG(dbgs() << " Found a non-simple store.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = St->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+
+ IsReadOnlyLoop = false;
+ }
+ return true;
+}
+
+/// Check the loop's instructions and confirm they are good for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopInstructions() {
+ // Resetting counters.
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ using namespace ore;
+ // Iterate over loop blocks and instructions of each block and check
+ // instruction safety.
+ for (auto *Block : CurLoop->getBlocks())
+ for (auto &Inst : *Block) {
+ // If instruction is unsafe just return false.
+ if (!instructionSafeForVersioning(&Inst)) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst)
+ << " Unsafe Loop Instruction";
+ });
+ return false;
+ }
+ }
// Get LoopAccessInfo from current loop via the proxy.
LAI = &GetLAI(CurLoop);
- // Check LoopAccessInfo for need of runtime check.
- if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
- LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
- return false;
- }
- // Number of runtime-checks should be less then RuntimeMemoryCheckThreshold
- if (LAI->getNumRuntimePointerChecks() >
- VectorizerParams::RuntimeMemoryCheckThreshold) {
- LLVM_DEBUG(
- dbgs() << " LAA: Runtime checks are more than threshold !!\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << "Number of runtime checks "
- << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks())
- << " exceeds threshold "
- << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold);
- });
- return false;
- }
- // Loop should have at least one invariant load or store instruction.
- if (!InvariantCounter) {
- LLVM_DEBUG(dbgs() << " Invariant not found !!\n");
- return false;
- }
- // Read only loop not allowed.
- if (IsReadOnlyLoop) {
- LLVM_DEBUG(dbgs() << " Found a read-only loop!\n");
- return false;
- }
- // Profitablity check:
- // Check invariant threshold, should be in limit.
- if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
- LLVM_DEBUG(
- dbgs()
- << " Invariant load & store are less then defined threshold\n");
- LLVM_DEBUG(dbgs() << " Invariant loads & stores: "
- << ((InvariantCounter * 100) / LoadAndStoreCounter)
- << "%\n");
- LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: "
- << InvariantThreshold << "%\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << "Invariant load & store "
- << NV("LoadAndStoreCounter",
- ((InvariantCounter * 100) / LoadAndStoreCounter))
- << " are less then defined threshold "
- << NV("Threshold", InvariantThreshold);
- });
- return false;
- }
- return true;
-}
-
-/// It checks loop is already visited or not.
-/// check loop meta data, if loop revisited return true
-/// else false.
-bool LoopVersioningLICM::isLoopAlreadyVisited() {
- // Check LoopVersioningLICM metadata into loop
- if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
- return true;
- }
- return false;
-}
-
-/// Checks legality for LoopVersioningLICM by considering following:
-/// a) loop structure legality b) loop instruction legality
-/// c) loop memory access legality.
-/// Return true if legal else returns false.
-bool LoopVersioningLICM::isLegalForVersioning() {
- using namespace ore;
- LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop);
- // Make sure not re-visiting same loop again.
- if (isLoopAlreadyVisited()) {
- LLVM_DEBUG(
- dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
- return false;
- }
- // Check loop structure leagality.
- if (!legalLoopStructure()) {
- LLVM_DEBUG(
- dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << " Unsafe Loop structure";
- });
- return false;
- }
- // Check loop instruction leagality.
- if (!legalLoopInstructions()) {
- LLVM_DEBUG(
- dbgs()
- << " Loop instructions not suitable for LoopVersioningLICM\n\n");
- return false;
- }
- // Check loop memory access leagality.
- if (!legalLoopMemoryAccesses()) {
- LLVM_DEBUG(
- dbgs()
- << " Loop memory access not suitable for LoopVersioningLICM\n\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << " Unsafe Loop memory access";
- });
- return false;
- }
- // Loop versioning is feasible, return true.
- LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning",
- CurLoop->getStartLoc(), CurLoop->getHeader())
- << " Versioned loop for LICM."
- << " Number of runtime checks we had to insert "
- << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks());
- });
- return true;
-}
-
-/// Update loop with aggressive aliasing assumptions.
-/// It marks no-alias to any pairs of memory operations by assuming
-/// loop should not have any must-alias memory accesses pairs.
-/// During LoopVersioningLICM legality we ignore loops having must
-/// aliasing memory accesses.
-void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
- // Get latch terminator instruction.
- Instruction *I = VerLoop->getLoopLatch()->getTerminator();
- // Create alias scope domain.
- MDBuilder MDB(I->getContext());
- MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
- StringRef Name = "LVAliasScope";
- MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ // Check LoopAccessInfo for need of runtime check.
+ if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
+ LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ return false;
+ }
+  // Number of runtime checks should be less than RuntimeMemoryCheckThreshold.
+ if (LAI->getNumRuntimePointerChecks() >
+ VectorizerParams::RuntimeMemoryCheckThreshold) {
+ LLVM_DEBUG(
+ dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Number of runtime checks "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks())
+ << " exceeds threshold "
+ << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold);
+ });
+ return false;
+ }
+ // Loop should have at least one invariant load or store instruction.
+ if (!InvariantCounter) {
+ LLVM_DEBUG(dbgs() << " Invariant not found !!\n");
+ return false;
+ }
+ // Read only loop not allowed.
+ if (IsReadOnlyLoop) {
+ LLVM_DEBUG(dbgs() << " Found a read-only loop!\n");
+ return false;
+ }
+  // Profitability check:
+  // Check that the invariant ratio meets the threshold.
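+  // Worked example (illustrative, assuming the default 25% threshold): with
+  // 8 loads/stores, 2 invariant accesses pass, since 2 * 100 = 200 is not
+  // less than 25 * 8 = 200, while a single invariant access fails because
+  // 1 * 100 = 100 < 200.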
+ if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Invariant load & store are less then defined threshold\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter)
+ << "%\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Invariant load & store "
+ << NV("LoadAndStoreCounter",
+ ((InvariantCounter * 100) / LoadAndStoreCounter))
+ << " are less then defined threshold "
+ << NV("Threshold", InvariantThreshold);
+ });
+ return false;
+ }
+ return true;
+}
+
+/// Check whether the loop has already been visited, based on its metadata;
+/// return true if it has been, false otherwise.
+bool LoopVersioningLICM::isLoopAlreadyVisited() {
+  // Check for LoopVersioningLICM metadata on the loop.
+ if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
+ return true;
+ }
+ return false;
+}
+
+/// Checks legality for LoopVersioningLICM by considering following:
+/// a) loop structure legality b) loop instruction legality
+/// c) loop memory access legality.
+/// Return true if legal else returns false.
+bool LoopVersioningLICM::isLegalForVersioning() {
+ using namespace ore;
+ LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop);
+ // Make sure not re-visiting same loop again.
+ if (isLoopAlreadyVisited()) {
+ LLVM_DEBUG(
+ dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
+ return false;
+ }
+  // Check loop structure legality.
+ if (!legalLoopStructure()) {
+ LLVM_DEBUG(
+ dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop structure";
+ });
+ return false;
+ }
+  // Check loop instruction legality.
+ if (!legalLoopInstructions()) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+  // Check loop memory access legality.
+ if (!legalLoopMemoryAccesses()) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop memory access";
+ });
+ return false;
+ }
+ // Loop versioning is feasible, return true.
+ LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning",
+ CurLoop->getStartLoc(), CurLoop->getHeader())
+ << " Versioned loop for LICM."
+ << " Number of runtime checks we had to insert "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks());
+ });
+ return true;
+}
+
+/// Update the loop with aggressive aliasing assumptions.
+/// It marks all pairs of memory operations as no-alias, relying on the
+/// assumption that the loop has no must-alias memory access pairs;
+/// loops with must-aliasing accesses are already rejected during the
+/// LoopVersioningLICM legality check.
+void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
+ // Get latch terminator instruction.
+ Instruction *I = VerLoop->getLoopLatch()->getTerminator();
+ // Create alias scope domain.
+ MDBuilder MDB(I->getContext());
+ MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
+ StringRef Name = "LVAliasScope";
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
SmallVector<Metadata *, 4> Scopes{NewScope}, NoAliases{NewScope};
- // Iterate over each instruction of loop.
- // set no-alias for all load & store instructions.
- for (auto *Block : CurLoop->getBlocks()) {
- for (auto &Inst : *Block) {
- // Only interested in instruction that may modify or read memory.
- if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
- continue;
- // Set no-alias for current instruction.
- Inst.setMetadata(
- LLVMContext::MD_noalias,
- MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
- MDNode::get(Inst.getContext(), NoAliases)));
- // set alias-scope for current instruction.
- Inst.setMetadata(
- LLVMContext::MD_alias_scope,
- MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
- MDNode::get(Inst.getContext(), Scopes)));
- }
- }
-}
-
+  // Iterate over each instruction of the loop and set no-alias metadata
+  // on all load and store instructions.
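+  // Illustrative sketch, not part of the original source: after this runs, a
+  // load in the versioned loop carries metadata roughly of the form
+  //   %v = load i32, i32* %p, !alias.scope !1, !noalias !1
+  // where !1 refers to the single anonymous scope created above, so alias
+  // analysis treats the annotated accesses in this loop as pairwise
+  // no-alias.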
+ for (auto *Block : CurLoop->getBlocks()) {
+ for (auto &Inst : *Block) {
+ // Only interested in instruction that may modify or read memory.
+ if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
+ continue;
+ // Set no-alias for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(Inst.getContext(), NoAliases)));
+ // set alias-scope for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Inst.getContext(), Scopes)));
+ }
+ }
+}
+
bool LoopVersioningLICMLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
return false;
@@ -595,68 +595,68 @@ bool LoopVersioningLICMLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
}
bool LoopVersioningLICM::runOnLoop(Loop *L, LoopInfo *LI, DominatorTree *DT) {
- // This will automatically release all resources hold by the current
- // LoopVersioningLICM object.
- AutoResetter Resetter(*this);
-
- // Do not do the transformation if disabled by metadata.
- if (hasLICMVersioningTransformation(L) & TM_Disable)
- return false;
-
- // Set Current Loop
- CurLoop = L;
- CurAST.reset(new AliasSetTracker(*AA));
-
- // Loop over the body of this loop, construct AST.
- for (auto *Block : L->getBlocks()) {
- if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
- CurAST->add(*Block); // Incorporate the specified basic block
- }
-
- bool Changed = false;
-
- // Check feasiblity of LoopVersioningLICM.
- // If versioning found to be feasible and beneficial then proceed
- // else simply return, by cleaning up memory.
- if (isLegalForVersioning()) {
- // Do loop versioning.
- // Create memcheck for memory accessed inside loop.
- // Clone original loop, and set blocks properly.
+  // This will automatically release all resources held by the current
+ // LoopVersioningLICM object.
+ AutoResetter Resetter(*this);
+
+ // Do not do the transformation if disabled by metadata.
+ if (hasLICMVersioningTransformation(L) & TM_Disable)
+ return false;
+
+ // Set Current Loop
+ CurLoop = L;
+ CurAST.reset(new AliasSetTracker(*AA));
+
+ // Loop over the body of this loop, construct AST.
+ for (auto *Block : L->getBlocks()) {
+ if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
+ CurAST->add(*Block); // Incorporate the specified basic block
+ }
+
+ bool Changed = false;
+
+  // Check the feasibility of LoopVersioningLICM.
+  // If versioning is found to be feasible and beneficial, proceed;
+  // otherwise simply return, cleaning up memory.
+ if (isLegalForVersioning()) {
+ // Do loop versioning.
+ // Create memcheck for memory accessed inside loop.
+ // Clone original loop, and set blocks properly.
LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
CurLoop, LI, DT, SE);
- LVer.versionLoop();
- // Set Loop Versioning metaData for original loop.
- addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
- // Set Loop Versioning metaData for version loop.
- addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
- // Set "llvm.mem.parallel_loop_access" metaData to versioned loop.
- // FIXME: "llvm.mem.parallel_loop_access" annotates memory access
- // instructions, not loops.
- addStringMetadataToLoop(LVer.getVersionedLoop(),
- "llvm.mem.parallel_loop_access");
- // Update version loop with aggressive aliasing assumption.
- setNoAliasToLoop(LVer.getVersionedLoop());
- Changed = true;
- }
- return Changed;
-}
-
+ LVer.versionLoop();
+ // Set Loop Versioning metaData for original loop.
+ addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
+ // Set Loop Versioning metaData for version loop.
+ addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
+ // Set "llvm.mem.parallel_loop_access" metaData to versioned loop.
+ // FIXME: "llvm.mem.parallel_loop_access" annotates memory access
+ // instructions, not loops.
+ addStringMetadataToLoop(LVer.getVersionedLoop(),
+ "llvm.mem.parallel_loop_access");
+ // Update version loop with aggressive aliasing assumption.
+ setNoAliasToLoop(LVer.getVersionedLoop());
+ Changed = true;
+ }
+ return Changed;
+}
+
char LoopVersioningLICMLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(LoopVersioningLICMLegacyPass, "loop-versioning-licm",
- "Loop Versioning For LICM", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+ "Loop Versioning For LICM", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLICMLegacyPass, "loop-versioning-licm",
- "Loop Versioning For LICM", false, false)
-
+ "Loop Versioning For LICM", false, false)
+
Pass *llvm::createLoopVersioningLICMPass() {
return new LoopVersioningLICMLegacyPass();
}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp
index d9904a58a0..d1f67b355b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -1,177 +1,177 @@
-//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers atomic intrinsics to non-atomic form for use in a known
-// non-preemptible environment.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerAtomic.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loweratomic"
-
-static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
- IRBuilder<> Builder(CXI);
- Value *Ptr = CXI->getPointerOperand();
- Value *Cmp = CXI->getCompareOperand();
- Value *Val = CXI->getNewValOperand();
-
- LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
- Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
- Value *Res = Builder.CreateSelect(Equal, Val, Orig);
- Builder.CreateStore(Res, Ptr);
-
- Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
- Res = Builder.CreateInsertValue(Res, Equal, 1);
-
- CXI->replaceAllUsesWith(Res);
- CXI->eraseFromParent();
- return true;
-}
-
-static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
- IRBuilder<> Builder(RMWI);
- Value *Ptr = RMWI->getPointerOperand();
- Value *Val = RMWI->getValOperand();
-
- LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
- Value *Res = nullptr;
-
- switch (RMWI->getOperation()) {
- default: llvm_unreachable("Unexpected RMW operation");
- case AtomicRMWInst::Xchg:
- Res = Val;
- break;
- case AtomicRMWInst::Add:
- Res = Builder.CreateAdd(Orig, Val);
- break;
- case AtomicRMWInst::Sub:
- Res = Builder.CreateSub(Orig, Val);
- break;
- case AtomicRMWInst::And:
- Res = Builder.CreateAnd(Orig, Val);
- break;
- case AtomicRMWInst::Nand:
- Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
- break;
- case AtomicRMWInst::Or:
- Res = Builder.CreateOr(Orig, Val);
- break;
- case AtomicRMWInst::Xor:
- Res = Builder.CreateXor(Orig, Val);
- break;
- case AtomicRMWInst::Max:
- Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
- Val, Orig);
- break;
- case AtomicRMWInst::Min:
- Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
- Orig, Val);
- break;
- case AtomicRMWInst::UMax:
- Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
- Val, Orig);
- break;
- case AtomicRMWInst::UMin:
- Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
- Orig, Val);
- break;
- case AtomicRMWInst::FAdd:
- Res = Builder.CreateFAdd(Orig, Val);
- break;
- case AtomicRMWInst::FSub:
- Res = Builder.CreateFSub(Orig, Val);
- break;
- }
- Builder.CreateStore(Res, Ptr);
- RMWI->replaceAllUsesWith(Orig);
- RMWI->eraseFromParent();
- return true;
-}
-
-static bool LowerFenceInst(FenceInst *FI) {
- FI->eraseFromParent();
- return true;
-}
-
-static bool LowerLoadInst(LoadInst *LI) {
- LI->setAtomic(AtomicOrdering::NotAtomic);
- return true;
-}
-
-static bool LowerStoreInst(StoreInst *SI) {
- SI->setAtomic(AtomicOrdering::NotAtomic);
- return true;
-}
-
-static bool runOnBasicBlock(BasicBlock &BB) {
- bool Changed = false;
- for (Instruction &Inst : make_early_inc_range(BB)) {
- if (FenceInst *FI = dyn_cast<FenceInst>(&Inst))
- Changed |= LowerFenceInst(FI);
- else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst))
- Changed |= LowerAtomicCmpXchgInst(CXI);
- else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst))
- Changed |= LowerAtomicRMWInst(RMWI);
- else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
- if (LI->isAtomic())
- LowerLoadInst(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
- if (SI->isAtomic())
- LowerStoreInst(SI);
- }
- }
- return Changed;
-}
-
-static bool lowerAtomics(Function &F) {
- bool Changed = false;
- for (BasicBlock &BB : F) {
- Changed |= runOnBasicBlock(BB);
- }
- return Changed;
-}
-
-PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
- if (lowerAtomics(F))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-namespace {
-class LowerAtomicLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- LowerAtomicLegacyPass() : FunctionPass(ID) {
- initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- // Don't skip optnone functions; atomics still need to be lowered.
- FunctionAnalysisManager DummyFAM;
- auto PA = Impl.run(F, DummyFAM);
- return !PA.areAllPreserved();
- }
-
-private:
- LowerAtomicPass Impl;
- };
-}
-
-char LowerAtomicLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
- "Lower atomic intrinsics to non-atomic form", false, false)
-
-Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loweratomic"
+
+static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+ IRBuilder<> Builder(CXI);
+ Value *Ptr = CXI->getPointerOperand();
+ Value *Cmp = CXI->getCompareOperand();
+ Value *Val = CXI->getNewValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
+ Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+ Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+ Builder.CreateStore(Res, Ptr);
+
+ Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
+ Res = Builder.CreateInsertValue(Res, Equal, 1);
+
+ CXI->replaceAllUsesWith(Res);
+ CXI->eraseFromParent();
+ return true;
+}
+
+static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+ IRBuilder<> Builder(RMWI);
+ Value *Ptr = RMWI->getPointerOperand();
+ Value *Val = RMWI->getValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
+ Value *Res = nullptr;
+
+ switch (RMWI->getOperation()) {
+ default: llvm_unreachable("Unexpected RMW operation");
+ case AtomicRMWInst::Xchg:
+ Res = Val;
+ break;
+ case AtomicRMWInst::Add:
+ Res = Builder.CreateAdd(Orig, Val);
+ break;
+ case AtomicRMWInst::Sub:
+ Res = Builder.CreateSub(Orig, Val);
+ break;
+ case AtomicRMWInst::And:
+ Res = Builder.CreateAnd(Orig, Val);
+ break;
+ case AtomicRMWInst::Nand:
+ Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
+ break;
+ case AtomicRMWInst::Or:
+ Res = Builder.CreateOr(Orig, Val);
+ break;
+ case AtomicRMWInst::Xor:
+ Res = Builder.CreateXor(Orig, Val);
+ break;
+ case AtomicRMWInst::Max:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::Min:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Orig, Val);
+ break;
+ case AtomicRMWInst::UMax:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::UMin:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Orig, Val);
+ break;
+ case AtomicRMWInst::FAdd:
+ Res = Builder.CreateFAdd(Orig, Val);
+ break;
+ case AtomicRMWInst::FSub:
+ Res = Builder.CreateFSub(Orig, Val);
+ break;
+ }
+ Builder.CreateStore(Res, Ptr);
+ RMWI->replaceAllUsesWith(Orig);
+ RMWI->eraseFromParent();
+ return true;
+}
+
+static bool LowerFenceInst(FenceInst *FI) {
+ FI->eraseFromParent();
+ return true;
+}
+
+static bool LowerLoadInst(LoadInst *LI) {
+ LI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool LowerStoreInst(StoreInst *SI) {
+ SI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool runOnBasicBlock(BasicBlock &BB) {
+ bool Changed = false;
+ for (Instruction &Inst : make_early_inc_range(BB)) {
+ if (FenceInst *FI = dyn_cast<FenceInst>(&Inst))
+ Changed |= LowerFenceInst(FI);
+ else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst))
+ Changed |= LowerAtomicCmpXchgInst(CXI);
+ else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst))
+ Changed |= LowerAtomicRMWInst(RMWI);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
+ if (LI->isAtomic())
+ LowerLoadInst(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
+ if (SI->isAtomic())
+ LowerStoreInst(SI);
+ }
+ }
+ return Changed;
+}
+
+static bool lowerAtomics(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F) {
+ Changed |= runOnBasicBlock(BB);
+ }
+ return Changed;
+}
+
+PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
+ if (lowerAtomics(F))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class LowerAtomicLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerAtomicLegacyPass() : FunctionPass(ID) {
+ initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ // Don't skip optnone functions; atomics still need to be lowered.
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ LowerAtomicPass Impl;
+ };
+}
+
+char LowerAtomicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
+ "Lower atomic intrinsics to non-atomic form", false, false)
+
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
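The file above defines both a new-pass-manager LowerAtomicPass and a legacy wrapper. As orientation for readers of this diff, here is a minimal sketch of how the new-PM pass could be scheduled over a whole module; it assumes LLVM 12 headers, and the driver name runLowerAtomicOnModule is illustrative, not part of this tree.

#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"

using namespace llvm;

// Illustrative driver: schedule LowerAtomicPass on every function in M.
static void runLowerAtomicOnModule(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  // Wrap the function pass so the module pass manager can run it per function.
  MPM.addPass(createModuleToFunctionPassAdaptor(LowerAtomicPass()));
  MPM.run(M, MAM);
}

Legacy-pass-manager clients instead go through createLowerAtomicPass(), shown at the end of the file.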
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index 4ca96ec1f6..bb30c48127 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -1,181 +1,181 @@
-//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic calls
-// and provides constant propagation and basic CFG cleanup on the result.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "lower-is-constant-intrinsic"
-
-STATISTIC(IsConstantIntrinsicsHandled,
- "Number of 'is.constant' intrinsic calls handled");
-STATISTIC(ObjectSizeIntrinsicsHandled,
- "Number of 'objectsize' intrinsic calls handled");
-
-static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) {
+//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic calls
+// and provides constant propagation and basic CFG cleanup on the result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "lower-is-constant-intrinsic"
+
+STATISTIC(IsConstantIntrinsicsHandled,
+ "Number of 'is.constant' intrinsic calls handled");
+STATISTIC(ObjectSizeIntrinsicsHandled,
+ "Number of 'objectsize' intrinsic calls handled");
+
+static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) {
if (auto *C = dyn_cast<Constant>(II->getOperand(0)))
if (C->isManifestConstant())
return ConstantInt::getTrue(II->getType());
return ConstantInt::getFalse(II->getType());
-}
-
-static bool replaceConditionalBranchesOnConstant(Instruction *II,
- Value *NewValue) {
- bool HasDeadBlocks = false;
- SmallSetVector<Instruction *, 8> Worklist;
- replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr,
- &Worklist);
- for (auto I : Worklist) {
- BranchInst *BI = dyn_cast<BranchInst>(I);
- if (!BI)
- continue;
- if (BI->isUnconditional())
- continue;
-
- BasicBlock *Target, *Other;
- if (match(BI->getOperand(0), m_Zero())) {
- Target = BI->getSuccessor(1);
- Other = BI->getSuccessor(0);
- } else if (match(BI->getOperand(0), m_One())) {
- Target = BI->getSuccessor(0);
- Other = BI->getSuccessor(1);
- } else {
- Target = nullptr;
- Other = nullptr;
- }
- if (Target && Target != Other) {
- BasicBlock *Source = BI->getParent();
- Other->removePredecessor(Source);
- BI->eraseFromParent();
- BranchInst::Create(Target, Source);
+}
+
+static bool replaceConditionalBranchesOnConstant(Instruction *II,
+ Value *NewValue) {
+ bool HasDeadBlocks = false;
+ SmallSetVector<Instruction *, 8> Worklist;
+ replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr,
+ &Worklist);
+ for (auto I : Worklist) {
+ BranchInst *BI = dyn_cast<BranchInst>(I);
+ if (!BI)
+ continue;
+ if (BI->isUnconditional())
+ continue;
+
+ BasicBlock *Target, *Other;
+ if (match(BI->getOperand(0), m_Zero())) {
+ Target = BI->getSuccessor(1);
+ Other = BI->getSuccessor(0);
+ } else if (match(BI->getOperand(0), m_One())) {
+ Target = BI->getSuccessor(0);
+ Other = BI->getSuccessor(1);
+ } else {
+ Target = nullptr;
+ Other = nullptr;
+ }
+ if (Target && Target != Other) {
+ BasicBlock *Source = BI->getParent();
+ Other->removePredecessor(Source);
+ BI->eraseFromParent();
+ BranchInst::Create(Target, Source);
if (pred_empty(Other))
- HasDeadBlocks = true;
- }
- }
- return HasDeadBlocks;
-}
-
-static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) {
- bool HasDeadBlocks = false;
- const auto &DL = F.getParent()->getDataLayout();
- SmallVector<WeakTrackingVH, 8> Worklist;
-
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (BasicBlock *BB : RPOT) {
- for (Instruction &I: *BB) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (!II)
- continue;
- switch (II->getIntrinsicID()) {
- default:
- break;
- case Intrinsic::is_constant:
- case Intrinsic::objectsize:
- Worklist.push_back(WeakTrackingVH(&I));
- break;
- }
- }
- }
- for (WeakTrackingVH &VH: Worklist) {
- // Items on the worklist can be mutated by earlier recursive replaces.
- // This can remove the intrinsic as dead (VH == null), but also replace
- // the intrinsic in place.
- if (!VH)
- continue;
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
- if (!II)
- continue;
- Value *NewValue;
- switch (II->getIntrinsicID()) {
- default:
- continue;
- case Intrinsic::is_constant:
- NewValue = lowerIsConstantIntrinsic(II);
- IsConstantIntrinsicsHandled++;
- break;
- case Intrinsic::objectsize:
- NewValue = lowerObjectSizeCall(II, DL, TLI, true);
- ObjectSizeIntrinsicsHandled++;
- break;
- }
- HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
- }
- if (HasDeadBlocks)
- removeUnreachableBlocks(F);
- return !Worklist.empty();
-}
-
-PreservedAnalyses
-LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (lowerConstantIntrinsics(F,
- AM.getCachedResult<TargetLibraryAnalysis>(F))) {
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
- }
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-/// Legacy pass for lowering is.constant intrinsics out of the IR.
-///
-/// When this pass is run over a function it converts is.constant intrinsics
-/// into 'true' or 'false'. This complements the normal constant folding
-/// to 'true' as part of Instruction Simplify passes.
-class LowerConstantIntrinsics : public FunctionPass {
-public:
- static char ID;
- LowerConstantIntrinsics() : FunctionPass(ID) {
- initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- return lowerConstantIntrinsics(F, TLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-} // namespace
-
-char LowerConstantIntrinsics::ID = 0;
-INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics",
- "Lower constant intrinsics", false, false)
-
-FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
- return new LowerConstantIntrinsics();
-}
+ HasDeadBlocks = true;
+ }
+ }
+ return HasDeadBlocks;
+}
+
+static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) {
+ bool HasDeadBlocks = false;
+ const auto &DL = F.getParent()->getDataLayout();
+ SmallVector<WeakTrackingVH, 8> Worklist;
+
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I: *BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::is_constant:
+ case Intrinsic::objectsize:
+ Worklist.push_back(WeakTrackingVH(&I));
+ break;
+ }
+ }
+ }
+ for (WeakTrackingVH &VH: Worklist) {
+ // Items on the worklist can be mutated by earlier recursive replaces.
+ // This can remove the intrinsic as dead (VH == null), but also replace
+ // the intrinsic in place.
+ if (!VH)
+ continue;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
+ if (!II)
+ continue;
+ Value *NewValue;
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::is_constant:
+ NewValue = lowerIsConstantIntrinsic(II);
+ IsConstantIntrinsicsHandled++;
+ break;
+ case Intrinsic::objectsize:
+ NewValue = lowerObjectSizeCall(II, DL, TLI, true);
+ ObjectSizeIntrinsicsHandled++;
+ break;
+ }
+ HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
+ }
+ if (HasDeadBlocks)
+ removeUnreachableBlocks(F);
+ return !Worklist.empty();
+}
+
+PreservedAnalyses
+LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ if (lowerConstantIntrinsics(F,
+ AM.getCachedResult<TargetLibraryAnalysis>(F))) {
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering is.constant intrinsics out of the IR.
+///
+/// When this pass is run over a function it converts is.constant intrinsics
+/// into 'true' or 'false'. This complements the normal constant folding
+/// to 'true' as part of Instruction Simplify passes.
+class LowerConstantIntrinsics : public FunctionPass {
+public:
+ static char ID;
+ LowerConstantIntrinsics() : FunctionPass(ID) {
+ initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ return lowerConstantIntrinsics(F, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+char LowerConstantIntrinsics::ID = 0;
+INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics",
+ "Lower constant intrinsics", false, false)
+
+FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
+ return new LowerConstantIntrinsics();
+}
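For completeness, the factory createLowerConstantIntrinsicsPass() defined above is what legacy-pass-manager clients consume. A hypothetical usage sketch follows; the helper name lowerConstantIntrinsicsLegacy is not part of this tree.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

using namespace llvm;

// Illustrative legacy-PM driver: folds llvm.is.constant and llvm.objectsize
// in every defined function of M using the wrapper pass above.
static void lowerConstantIntrinsicsLegacy(Module &M) {
  legacy::FunctionPassManager FPM(&M);
  FPM.add(createLowerConstantIntrinsicsPass());
  FPM.doInitialization();
  for (Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F);
  FPM.doFinalization();
}

New-pass-manager users reach the same lowering through LowerConstantIntrinsicsPass::run, shown earlier in the file.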
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 7911d1db70..da13075dfe 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -1,420 +1,420 @@
-//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the 'expect' intrinsic to LLVM metadata.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "lower-expect-intrinsic"
-
-STATISTIC(ExpectIntrinsicsHandled,
- "Number of 'expect' intrinsic instructions handled");
-
-// These default values are chosen to represent an extremely skewed outcome for
-// a condition, but they leave some room for interpretation by later passes.
-//
-// If the documentation for __builtin_expect() made it explicit that it should
-// only be used in extreme cases, we could make this ratio higher. As it stands,
-// programmers may be using __builtin_expect() / llvm.expect to annotate that a
-// branch is likely or unlikely to be taken.
-//
-// There is a known dependency on this ratio in CodeGenPrepare when transforming
-// 'select' instructions. It may be worthwhile to hoist these values to some
-// shared space, so they can be used directly by other passes.
-
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+ "Number of 'expect' intrinsic instructions handled");
+
+// These default values are chosen to represent an extremely skewed outcome for
+// a condition, but they leave some room for interpretation by later passes.
+//
+// If the documentation for __builtin_expect() made it explicit that it should
+// only be used in extreme cases, we could make this ratio higher. As it stands,
+// programmers may be using __builtin_expect() / llvm.expect to annotate that a
+// branch is likely or unlikely to be taken.
+//
+// There is a known dependency on this ratio in CodeGenPrepare when transforming
+// 'select' instructions. It may be worthwhile to hoist these values to some
+// shared space, so they can be used directly by other passes.
+
cl::opt<uint32_t> llvm::LikelyBranchWeight(
- "likely-branch-weight", cl::Hidden, cl::init(2000),
- cl::desc("Weight of the branch likely to be taken (default = 2000)"));
+ "likely-branch-weight", cl::Hidden, cl::init(2000),
+ cl::desc("Weight of the branch likely to be taken (default = 2000)"));
cl::opt<uint32_t> llvm::UnlikelyBranchWeight(
- "unlikely-branch-weight", cl::Hidden, cl::init(1),
- cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
-
-static std::tuple<uint32_t, uint32_t>
-getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
- if (IntrinsicID == Intrinsic::expect) {
- // __builtin_expect
- return std::make_tuple(LikelyBranchWeight.getValue(),
- UnlikelyBranchWeight.getValue());
- } else {
- // __builtin_expect_with_probability
- assert(CI->getNumOperands() >= 3 &&
- "expect with probability must have 3 arguments");
- ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2));
- double TrueProb = Confidence->getValueAPF().convertToDouble();
- assert((TrueProb >= 0.0 && TrueProb <= 1.0) &&
- "probability value must be in the range [0.0, 1.0]");
- double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
- uint32_t LikelyBW = ceil((TrueProb * (double)(INT32_MAX - 1)) + 1.0);
- uint32_t UnlikelyBW = ceil((FalseProb * (double)(INT32_MAX - 1)) + 1.0);
- return std::make_tuple(LikelyBW, UnlikelyBW);
- }
-}
-
-static bool handleSwitchExpect(SwitchInst &SI) {
- CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
- if (!CI)
- return false;
-
- Function *Fn = CI->getCalledFunction();
- if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
- Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
- return false;
-
- Value *ArgValue = CI->getArgOperand(0);
- ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!ExpectedValue)
- return false;
-
- SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
- unsigned n = SI.getNumCases(); // +1 for default case.
- uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
- std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
- getBranchWeight(Fn->getIntrinsicID(), CI, n + 1);
-
- SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeightVal);
-
- uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
- Weights[Index] = LikelyBranchWeightVal;
-
- SI.setCondition(ArgValue);
-
- SI.setMetadata(LLVMContext::MD_prof,
- MDBuilder(CI->getContext()).createBranchWeights(Weights));
-
- return true;
-}
-
-/// Handler for PHINodes that define the value argument to an
-/// @llvm.expect call.
-///
-/// If the operand of the phi has a constant value and it 'contradicts'
-/// the expected value of the phi def, then the corresponding incoming
-/// edge of the phi is unlikely to be taken. Using that information,
-/// the branch probability info for the originating branch can be inferred.
-static void handlePhiDef(CallInst *Expect) {
- Value &Arg = *Expect->getArgOperand(0);
- ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
- if (!ExpectedValue)
- return;
- const APInt &ExpectedPhiValue = ExpectedValue->getValue();
-
- // Walk backward up the list of instructions that
- // have 'copy' semantics by 'stripping' the copies
- // until a PHI node or an instruction of unknown kind
- // is reached. Negation via xor is also handled.
- //
- // C = PHI(...);
- // B = C;
- // A = B;
- // D = __builtin_expect(A, 0);
- //
- Value *V = &Arg;
- SmallVector<Instruction *, 4> Operations;
- while (!isa<PHINode>(V)) {
- if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
- V = ZExt->getOperand(0);
- Operations.push_back(ZExt);
- continue;
- }
-
- if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
- V = SExt->getOperand(0);
- Operations.push_back(SExt);
- continue;
- }
-
- BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
- if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
- return;
-
- ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
- if (!CInt)
- return;
-
- V = BinOp->getOperand(0);
- Operations.push_back(BinOp);
- }
-
- // Executes the recorded operations on input 'Value'.
- auto ApplyOperations = [&](const APInt &Value) {
- APInt Result = Value;
- for (auto Op : llvm::reverse(Operations)) {
- switch (Op->getOpcode()) {
- case Instruction::Xor:
- Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
- break;
- case Instruction::ZExt:
- Result = Result.zext(Op->getType()->getIntegerBitWidth());
- break;
- case Instruction::SExt:
- Result = Result.sext(Op->getType()->getIntegerBitWidth());
- break;
- default:
- llvm_unreachable("Unexpected operation");
- }
- }
- return Result;
- };
-
- auto *PhiDef = cast<PHINode>(V);
-
- // Get the first dominating conditional branch of the operand
- // i's incoming block.
- auto GetDomConditional = [&](unsigned i) -> BranchInst * {
- BasicBlock *BB = PhiDef->getIncomingBlock(i);
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (BI && BI->isConditional())
- return BI;
- BB = BB->getSinglePredecessor();
- if (!BB)
- return nullptr;
- BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || BI->isUnconditional())
- return nullptr;
- return BI;
- };
-
- // Now walk through all Phi operands to find the operands with values
- // conflicting with the expected phi output value. Any such operand
- // indicates the incoming edge to that operand is unlikely.
- for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
-
- Value *PhiOpnd = PhiDef->getIncomingValue(i);
- ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
- if (!CI)
- continue;
-
- // Not an interesting case -- we cannot infer anything useful when the
- // operand value matches the expected phi output; skip this incoming
- // value.
- if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
- continue;
-
- BranchInst *BI = GetDomConditional(i);
- if (!BI)
- continue;
-
- MDBuilder MDB(PhiDef->getContext());
-
- // There are two situations in which an operand of the PhiDef comes
- // from a given successor of a branch instruction BI.
- // 1) When the incoming block of the operand is the successor block;
- // 2) When the incoming block is BI's enclosing block and the
- // successor is the PhiDef's enclosing block.
- //
- // Returns true if the operand which comes from OpndIncomingBB
- // comes from outgoing edge of BI that leads to Succ block.
- auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
- auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
- if (OpndIncomingBB == Succ)
- // If this successor is the incoming block for this
- // Phi operand, then this successor does lead to the Phi.
- return true;
- if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
- // Otherwise, if the edge is directly from the branch
- // to the Phi, this successor is the one feeding this
- // Phi operand.
- return true;
- return false;
- };
- uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
- std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) = getBranchWeight(
- Expect->getCalledFunction()->getIntrinsicID(), Expect, 2);
-
- if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
- BI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(LikelyBranchWeightVal,
- UnlikelyBranchWeightVal));
- else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
- BI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(UnlikelyBranchWeightVal,
- LikelyBranchWeightVal));
- }
-}
-
-// Handle both BranchInst and SelectInst.
-template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
-
- // Handle non-optimized IR code like:
- // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
- // %tobool = icmp ne i64 %expval, 0
- // br i1 %tobool, label %if.then, label %if.end
- //
- // Or the following simpler case:
- // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
- // br i1 %expval, label %if.then, label %if.end
-
- CallInst *CI;
-
- ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
- CmpInst::Predicate Predicate;
- ConstantInt *CmpConstOperand = nullptr;
- if (!CmpI) {
- CI = dyn_cast<CallInst>(BSI.getCondition());
- Predicate = CmpInst::ICMP_NE;
- } else {
- Predicate = CmpI->getPredicate();
- if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
- return false;
-
- CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
- if (!CmpConstOperand)
- return false;
- CI = dyn_cast<CallInst>(CmpI->getOperand(0));
- }
-
- if (!CI)
- return false;
-
- uint64_t ValueComparedTo = 0;
- if (CmpConstOperand) {
- if (CmpConstOperand->getBitWidth() > 64)
- return false;
- ValueComparedTo = CmpConstOperand->getZExtValue();
- }
-
- Function *Fn = CI->getCalledFunction();
- if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
- Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
- return false;
-
- Value *ArgValue = CI->getArgOperand(0);
- ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!ExpectedValue)
- return false;
-
- MDBuilder MDB(CI->getContext());
- MDNode *Node;
-
- uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
- std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
- getBranchWeight(Fn->getIntrinsicID(), CI, 2);
-
- if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
- (Predicate == CmpInst::ICMP_EQ)) {
- Node =
- MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
- } else {
- Node =
- MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
- }
-
- if (CmpI)
- CmpI->setOperand(0, ArgValue);
- else
- BSI.setCondition(ArgValue);
-
- BSI.setMetadata(LLVMContext::MD_prof, Node);
-
- return true;
-}
-
-static bool handleBranchExpect(BranchInst &BI) {
- if (BI.isUnconditional())
- return false;
-
- return handleBrSelExpect<BranchInst>(BI);
-}
-
-static bool lowerExpectIntrinsic(Function &F) {
- bool Changed = false;
-
- for (BasicBlock &BB : F) {
- // Create "block_weights" metadata.
- if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
- if (handleBranchExpect(*BI))
- ExpectIntrinsicsHandled++;
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
- if (handleSwitchExpect(*SI))
- ExpectIntrinsicsHandled++;
- }
-
- // Remove llvm.expect intrinsics. Iterate backwards in order
- // to process select instructions before the intrinsic gets
- // removed.
- for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
- Instruction *Inst = &*BI++;
- CallInst *CI = dyn_cast<CallInst>(Inst);
- if (!CI) {
- if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
- if (handleBrSelExpect(*SI))
- ExpectIntrinsicsHandled++;
- }
- continue;
- }
-
- Function *Fn = CI->getCalledFunction();
- if (Fn && (Fn->getIntrinsicID() == Intrinsic::expect ||
- Fn->getIntrinsicID() == Intrinsic::expect_with_probability)) {
- // Before erasing the llvm.expect, walk backward to find the
- // phi that defines llvm.expect's first arg, and
- // infer branch probability:
- handlePhiDef(CI);
- Value *Exp = CI->getArgOperand(0);
- CI->replaceAllUsesWith(Exp);
- CI->eraseFromParent();
- Changed = true;
- }
- }
- }
-
- return Changed;
-}
-
-PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
- FunctionAnalysisManager &) {
- if (lowerExpectIntrinsic(F))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-/// Legacy pass for lowering expect intrinsics out of the IR.
-///
-/// When this pass is run over a function it uses expect intrinsics which feed
-/// branches and switches to provide branch weight metadata for those
-/// terminators. It then removes the expect intrinsics from the IR so the rest
-/// of the optimizer can ignore them.
-class LowerExpectIntrinsic : public FunctionPass {
-public:
- static char ID;
- LowerExpectIntrinsic() : FunctionPass(ID) {
- initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
-};
-}
-
-char LowerExpectIntrinsic::ID = 0;
-INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
- "Lower 'expect' Intrinsics", false, false)
-
-FunctionPass *llvm::createLowerExpectIntrinsicPass() {
- return new LowerExpectIntrinsic();
-}
+ "unlikely-branch-weight", cl::Hidden, cl::init(1),
+ cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
+
+static std::tuple<uint32_t, uint32_t>
+getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
+ if (IntrinsicID == Intrinsic::expect) {
+ // __builtin_expect
+ return std::make_tuple(LikelyBranchWeight.getValue(),
+ UnlikelyBranchWeight.getValue());
+ } else {
+ // __builtin_expect_with_probability
+ assert(CI->getNumOperands() >= 3 &&
+ "expect with probability must have 3 arguments");
+ ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2));
+ double TrueProb = Confidence->getValueAPF().convertToDouble();
+ assert((TrueProb >= 0.0 && TrueProb <= 1.0) &&
+ "probability value must be in the range [0.0, 1.0]");
+ double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
+ uint32_t LikelyBW = ceil((TrueProb * (double)(INT32_MAX - 1)) + 1.0);
+ uint32_t UnlikelyBW = ceil((FalseProb * (double)(INT32_MAX - 1)) + 1.0);
+ return std::make_tuple(LikelyBW, UnlikelyBW);
+ }
+}
+
+static bool handleSwitchExpect(SwitchInst &SI) {
+ CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+ if (!CI)
+ return false;
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
+ Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
+ unsigned n = SI.getNumCases(); // +1 for default case.
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
+ getBranchWeight(Fn->getIntrinsicID(), CI, n + 1);
+
+ SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeightVal);
+
+ uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
+ Weights[Index] = LikelyBranchWeightVal;
+
+ SI.setCondition(ArgValue);
+
+ SI.setMetadata(LLVMContext::MD_prof,
+ MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+ return true;
+}
+
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If the operand of the phi has a constant value and it 'contradicts'
+/// the expected value of the phi def, then the corresponding incoming
+/// edge of the phi is unlikely to be taken. Using that information,
+/// the branch probability info for the originating branch can be inferred.
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
+ if (!ExpectedValue)
+ return;
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+ // Walk backward up the list of instructions that
+ // have 'copy' semantics by 'stripping' the copies
+ // until a PHI node or an instruction of unknown kind
+ // is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = cast<PHINode>(V);
+
+ // Get the first dominating conditional branch of the operand
+ // i's incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+ // Now walk through all Phi operands to find the operands with values
+ // conflicting with the expected phi output value. Any such operand
+ // indicates the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+ // Not an interesting case -- we cannot infer anything useful when the
+ // operand value matches the expected phi output; skip this incoming
+ // value.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+ // Returns true if the operand which comes from OpndIncomingBB
+ // comes from outgoing edge of BI that leads to Succ block.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) = getBranchWeight(
+ Expect->getCalledFunction()->getIntrinsicID(), Expect, 2);
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeightVal,
+ UnlikelyBranchWeightVal));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeightVal,
+ LikelyBranchWeightVal));
+ }
+}
+
+// Handle both BranchInst and SelectInst.
+template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
+
+ // Handle non-optimized IR code like:
+ // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+ // %tobool = icmp ne i64 %expval, 0
+ // br i1 %tobool, label %if.then, label %if.end
+ //
+ // Or the following simpler case:
+ // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+ // br i1 %expval, label %if.then, label %if.end
+
+ CallInst *CI;
+
+ ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
+ CmpInst::Predicate Predicate;
+ ConstantInt *CmpConstOperand = nullptr;
+ if (!CmpI) {
+ CI = dyn_cast<CallInst>(BSI.getCondition());
+ Predicate = CmpInst::ICMP_NE;
+ } else {
+ Predicate = CmpI->getPredicate();
+ if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
+ return false;
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+ if (!CmpConstOperand)
+ return false;
+ CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+ }
+
+ if (!CI)
+ return false;
+
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
+ Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ MDBuilder MDB(CI->getContext());
+ MDNode *Node;
+
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
+ getBranchWeight(Fn->getIntrinsicID(), CI, 2);
+
+ if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
+ (Predicate == CmpInst::ICMP_EQ)) {
+ Node =
+ MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
+ } else {
+ Node =
+ MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
+ }
+
+ if (CmpI)
+ CmpI->setOperand(0, ArgValue);
+ else
+ BSI.setCondition(ArgValue);
+
+ BSI.setMetadata(LLVMContext::MD_prof, Node);
+
+ return true;
+}
+
+static bool handleBranchExpect(BranchInst &BI) {
+ if (BI.isUnconditional())
+ return false;
+
+ return handleBrSelExpect<BranchInst>(BI);
+}
+
+static bool lowerExpectIntrinsic(Function &F) {
+ bool Changed = false;
+
+ for (BasicBlock &BB : F) {
+ // Create "block_weights" metadata.
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (handleBranchExpect(*BI))
+ ExpectIntrinsicsHandled++;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+ if (handleSwitchExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+
+ // Remove llvm.expect intrinsics. Iterate backwards in order
+ // to process select instructions before the intrinsic gets
+ // removed.
+ for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
+ Instruction *Inst = &*BI++;
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
+ if (handleBrSelExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+ continue;
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (Fn && (Fn->getIntrinsicID() == Intrinsic::expect ||
+ Fn->getIntrinsicID() == Intrinsic::expect_with_probability)) {
+ // Before erasing the llvm.expect, walk backward to find the
+ // phi that defines llvm.expect's first arg, and
+ // infer branch probability:
+ handlePhiDef(CI);
+ Value *Exp = CI->getArgOperand(0);
+ CI->replaceAllUsesWith(Exp);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ if (lowerExpectIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering expect intrinsics out of the IR.
+///
+/// When this pass is run over a function it uses expect intrinsics which feed
+/// branches and switches to provide branch weight metadata for those
+/// terminators. It then removes the expect intrinsics from the IR so the rest
+/// of the optimizer can ignore them.
+class LowerExpectIntrinsic : public FunctionPass {
+public:
+ static char ID;
+ LowerExpectIntrinsic() : FunctionPass(ID) {
+ initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
+};
+}
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
+ "Lower 'expect' Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+ return new LowerExpectIntrinsic();
+}
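The weight selection in getBranchWeight above has two modes: plain llvm.expect uses the likely-branch-weight / unlikely-branch-weight options (2000 and 1 by default), while llvm.expect.with.probability derives the weights from the supplied probability. Below is a standalone restatement of that arithmetic with a worked example; the helper name weightsForProbability is illustrative only.

#include <cmath>
#include <cstdint>
#include <utility>

// For a two-way branch with TrueProb = 0.9, this yields roughly
// LikelyBW = 1932735283 and UnlikelyBW = 214748366, i.e. about a 9:1 ratio
// scaled into the 32-bit branch-weight range.
static std::pair<uint32_t, uint32_t> weightsForProbability(double TrueProb,
                                                           int BranchCount) {
  double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
  uint32_t LikelyBW =
      (uint32_t)std::ceil(TrueProb * (double)(INT32_MAX - 1) + 1.0);
  uint32_t UnlikelyBW =
      (uint32_t)std::ceil(FalseProb * (double)(INT32_MAX - 1) + 1.0);
  return {LikelyBW, UnlikelyBW};
}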
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index a431205777..45f5929e3b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -1,90 +1,90 @@
-//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the llvm.experimental.guard intrinsic to a conditional call
-// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer
-// be widened.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-
-using namespace llvm;
-
-namespace {
-struct LowerGuardIntrinsicLegacyPass : public FunctionPass {
- static char ID;
- LowerGuardIntrinsicLegacyPass() : FunctionPass(ID) {
- initializeLowerGuardIntrinsicLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-}
-
-static bool lowerGuardIntrinsic(Function &F) {
- // Check if we can cheaply rule out the possibility of not having any work to
- // do.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- if (!GuardDecl || GuardDecl->use_empty())
- return false;
-
- SmallVector<CallInst *, 8> ToLower;
- for (auto &I : instructions(F))
- if (isGuard(&I))
- ToLower.push_back(cast<CallInst>(&I));
-
- if (ToLower.empty())
- return false;
-
- auto *DeoptIntrinsic = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
- DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
-
- for (auto *CI : ToLower) {
- makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false);
- CI->eraseFromParent();
- }
-
- return true;
-}
-
-bool LowerGuardIntrinsicLegacyPass::runOnFunction(Function &F) {
- return lowerGuardIntrinsic(F);
-}
-
-char LowerGuardIntrinsicLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerGuardIntrinsicLegacyPass, "lower-guard-intrinsic",
- "Lower the guard intrinsic to normal control flow", false,
- false)
-
-Pass *llvm::createLowerGuardIntrinsicPass() {
- return new LowerGuardIntrinsicLegacyPass();
-}
-
-PreservedAnalyses LowerGuardIntrinsicPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (lowerGuardIntrinsic(F))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
+//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the llvm.experimental.guard intrinsic to a conditional call
+// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer
+// be widened.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct LowerGuardIntrinsicLegacyPass : public FunctionPass {
+ static char ID;
+ LowerGuardIntrinsicLegacyPass() : FunctionPass(ID) {
+ initializeLowerGuardIntrinsicLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static bool lowerGuardIntrinsic(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ SmallVector<CallInst *, 8> ToLower;
+ for (auto &I : instructions(F))
+ if (isGuard(&I))
+ ToLower.push_back(cast<CallInst>(&I));
+
+ if (ToLower.empty())
+ return false;
+
+ auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+ DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+ for (auto *CI : ToLower) {
+ makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false);
+ CI->eraseFromParent();
+ }
+
+ return true;
+}
+
+bool LowerGuardIntrinsicLegacyPass::runOnFunction(Function &F) {
+ return lowerGuardIntrinsic(F);
+}
+
+char LowerGuardIntrinsicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerGuardIntrinsicLegacyPass, "lower-guard-intrinsic",
+ "Lower the guard intrinsic to normal control flow", false,
+ false)
+
+Pass *llvm::createLowerGuardIntrinsicPass() {
+ return new LowerGuardIntrinsicLegacyPass();
+}
+
+PreservedAnalyses LowerGuardIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (lowerGuardIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
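LowerGuardIntrinsicPass::run above never queries the analysis manager it is handed, so it can also be invoked directly on a single function, mirroring the DummyFAM pattern used by LowerAtomicLegacyPass earlier in this diff. A hypothetical convenience helper in that spirit; the name lowerGuardsIn is illustrative, not part of this tree.

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"

using namespace llvm;

// Illustrative helper: lower all llvm.experimental.guard calls in F to
// explicit branches to llvm.experimental.deoptimize.
static bool lowerGuardsIn(Function &F) {
  // Safe only because LowerGuardIntrinsicPass::run does not use the
  // analysis manager argument.
  FunctionAnalysisManager DummyFAM;
  auto PA = LowerGuardIntrinsicPass().run(F, DummyFAM);
  return !PA.areAllPreserved();
}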
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 78e926254e..8e251ca940 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1,1214 +1,1214 @@
-//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Lower matrix intrinsics to vector operations.
-//
-// TODO:
-// * Improve fusion:
-// * Support more cases, e.g. multiply-add, multiply-sub, operands/results
-// transposed.
-// * Improve cost-modeling, e.g. choose a different number of rows/columns
-// for tiles, consider cost of copies on alias.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Alignment.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower matrix intrinsics to vector operations.
+//
+// TODO:
+// * Improve fusion:
+// * Support more cases, e.g. multiply-add, multiply-sub, operands/results
+// transposed.
+// * Improve cost-modeling, e.g. choose a different number of rows/columns
+// for tiles, consider cost of copies on alias.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/MatrixUtils.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "lower-matrix-intrinsics"
-
-static cl::opt<bool> EnableShapePropagation(
- "matrix-propagate-shape", cl::init(true), cl::Hidden,
- cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
- "instructions."));
-
-static cl::opt<bool>
- FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
- cl::desc("Enable/disable fusing matrix instructions."));
-// TODO: Allow and use non-square tiles.
-static cl::opt<unsigned> TileSize(
- "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
- cl::desc(
- "Tile size for matrix instruction fusion using square-shaped tiles."));
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-matrix-intrinsics"
+
+static cl::opt<bool> EnableShapePropagation(
+ "matrix-propagate-shape", cl::init(true), cl::Hidden,
+ cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
+ "instructions."));
+
+static cl::opt<bool>
+ FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
+ cl::desc("Enable/disable fusing matrix instructions."));
+// TODO: Allow and use non-square tiles.
+static cl::opt<unsigned> TileSize(
+ "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
+ cl::desc(
+ "Tile size for matrix instruction fusion using square-shaped tiles."));
static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
cl::Hidden,
cl::desc("Generate loop nest for tiling."));
-static cl::opt<bool> ForceFusion(
- "force-fuse-matrix", cl::init(false), cl::Hidden,
- cl::desc("Force matrix instruction fusion even if not profitable."));
-static cl::opt<bool> AllowContractEnabled(
- "matrix-allow-contract", cl::init(false), cl::Hidden,
- cl::desc("Allow the use of FMAs if available and profitable. This may "
- "result in different results, due to less rounding error."));
-
-enum class MatrixLayoutTy { ColumnMajor, RowMajor };
-
-static cl::opt<MatrixLayoutTy> MatrixLayout(
- "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
- cl::desc("Sets the default matrix layout"),
- cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
- "Use column-major layout"),
- clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
- "Use row-major layout")));
-
-/// Helper function to either return Scope, if it is a subprogram, or the
-/// attached subprogram for a local scope.
-static DISubprogram *getSubprogram(DIScope *Scope) {
- if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
- return Subprogram;
- return cast<DILocalScope>(Scope)->getSubprogram();
-}
-
-namespace {
-
-// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
-// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
-// assuming \p Stride elements between the starts of two consecutive vectors.
-// \p Stride must be >= \p NumElements.
-// For column-major matrixes, the function computes the address of a column
-// vector and \p NumElements must be set to the number of elements in a column
-// (= number of rows of the matrix). For row-major matrixes, the function
-// computes the address of a row vector and \p NumElements must be set to the
-// number of elements in a row (= number of columns of the matrix).
-//
-// Consider a 4x4 matrix in column-major layout like below
-//
-// 0 1 2 3
-// 0 v_0_0 v_0_1 v_0_2 v_0_3
-// 1 v_1_0 v_1_1 v_1_2 v_1_3
-// 2 v_2_0 v_2_1 v_2_2 v_2_3
-// 3 v_3_0 v_3_1 v_3_2 v_3_3
-
-// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
-// we need a pointer to the first element of the submatrix as base pointer.
-// Then we can use computeVectorAddr to compute the addresses for the columns
-// of the sub-matrix.
-//
-// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
-// -> just returns Base
-// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
-// -> returns Base + (1 * 4)
-// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
-// -> returns Base + (2 * 4)
-//
-// The graphic below illustrates the number of elements in a column (marked
-// with |) and the number of skipped elements (marked with {).
-//
-// v_0_0 v_0_1 {v_0_2 {v_0_3
-// Base Col 1 Col 2
-// | | |
-// v_1_0 |v_1_1 |v_1_2 |v_1_3
-// v_2_0 |v_2_1 |v_2_2 |v_2_3
-// v_3_0 {v_3_1 {v_3_2 v_3_3
-//
-Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
- unsigned NumElements, Type *EltType,
- IRBuilder<> &Builder) {
-
- assert((!isa<ConstantInt>(Stride) ||
- cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
- "Stride must be >= the number of elements in the result vector.");
- unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
-
- // Compute the start of the vector with index VecIdx as VecIdx * Stride.
- Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");
-
- // Get pointer to the start of the selected vector. Skip GEP creation,
- // if we select vector 0.
- if (isa<ConstantInt>(VecStart) && cast<ConstantInt>(VecStart)->isZero())
- VecStart = BasePtr;
- else
- VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");
-
- // Cast elementwise vector start pointer to a pointer to a vector
- // (EltType x NumElements)*.
- auto *VecType = FixedVectorType::get(EltType, NumElements);
- Type *VecPtrType = PointerType::get(VecType, AS);
- return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast");
-}
-
-/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
-///
-/// Currently, the lowering for each matrix intrinsic is done as follows:
-/// 1. Propagate the shape information from intrinsics to connected
-/// instructions.
-/// 2. Lower instructions with shape information (assuming column-major layout).
-/// The lowering works similarly using row-major layout.
-/// 2.1. Get column vectors for each argument. If we already lowered the
-/// definition of an argument, use the produced column vectors directly.
-///      If not, split the operand vector containing an embedded matrix into
-///      a set of column vectors.
-/// 2.2. Lower the instruction in terms of column major operations, which
-///      yields a set of column vectors containing the result matrix. Note
-///      that we lower all instructions that have shape information. Besides
-///      the intrinsics, this includes stores for example.
-/// 2.3. Update uses of the lowered instruction. If we have shape information
-/// for a user, there is nothing to do, as we will look up the result
-/// column matrix when lowering the user. For other uses, we embed the
-/// result matrix in a flat vector and update the use.
-/// 2.4. Cache the result column matrix for the instruction we lowered.
-/// 3. After we lowered all instructions in a function, remove the now
-/// obsolete instructions.
-///
-class LowerMatrixIntrinsics {
- Function &Func;
- const DataLayout &DL;
- const TargetTransformInfo &TTI;
+static cl::opt<bool> ForceFusion(
+ "force-fuse-matrix", cl::init(false), cl::Hidden,
+ cl::desc("Force matrix instruction fusion even if not profitable."));
+static cl::opt<bool> AllowContractEnabled(
+ "matrix-allow-contract", cl::init(false), cl::Hidden,
+ cl::desc("Allow the use of FMAs if available and profitable. This may "
+ "result in different results, due to less rounding error."));
+
+enum class MatrixLayoutTy { ColumnMajor, RowMajor };
+
+static cl::opt<MatrixLayoutTy> MatrixLayout(
+ "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
+ cl::desc("Sets the default matrix layout"),
+ cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
+ "Use column-major layout"),
+ clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
+ "Use row-major layout")));
+
+/// Helper function to either return Scope, if it is a subprogram, or the
+/// attached subprogram for a local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+ if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+ return Subprogram;
+ return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
+namespace {
+
+// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
+// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
+// assuming \p Stride elements between the starts of two consecutive vectors.
+// \p Stride must be >= \p NumElements.
+// For column-major matrixes, the function computes the address of a column
+// vector and \p NumElements must be set to the number of elements in a column
+// (= number of rows of the matrix). For row-major matrixes, the function
+// computes the address of a row vector and \p NumElements must be set to the
+// number of elements in a row (= number of columns of the matrix).
+//
+// Consider a 4x4 matrix in column-major layout like below
+//
+// 0 1 2 3
+// 0 v_0_0 v_0_1 v_0_2 v_0_3
+// 1 v_1_0 v_1_1 v_1_2 v_1_3
+// 2 v_2_0 v_2_1 v_2_2 v_2_3
+// 3 v_3_0 v_3_1 v_3_2 v_3_3
+
+// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
+// we need a pointer to the first element of the submatrix as base pointer.
+// Then we can use computeVectorAddr to compute the addresses for the columns
+// of the sub-matrix.
+//
+// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
+// -> just returns Base
+// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (1 * 4)
+// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (2 * 4)
+//
+// The graphic below illustrates the number of elements in a column (marked
+// with |) and the number of skipped elements (marked with {).
+//
+// v_0_0 v_0_1 {v_0_2 {v_0_3
+// Base Col 1 Col 2
+// | | |
+// v_1_0 |v_1_1 |v_1_2 |v_1_3
+// v_2_0 |v_2_1 |v_2_2 |v_2_3
+// v_3_0 {v_3_1 {v_3_2 v_3_3
+//
+Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
+ unsigned NumElements, Type *EltType,
+ IRBuilder<> &Builder) {
+
+ assert((!isa<ConstantInt>(Stride) ||
+ cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
+ "Stride must be >= the number of elements in the result vector.");
+ unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+
+ // Compute the start of the vector with index VecIdx as VecIdx * Stride.
+ Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");
+
+ // Get pointer to the start of the selected vector. Skip GEP creation,
+ // if we select vector 0.
+ if (isa<ConstantInt>(VecStart) && cast<ConstantInt>(VecStart)->isZero())
+ VecStart = BasePtr;
+ else
+ VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");
+
+ // Cast elementwise vector start pointer to a pointer to a vector
+ // (EltType x NumElements)*.
+ auto *VecType = FixedVectorType::get(EltType, NumElements);
+ Type *VecPtrType = PointerType::get(VecType, AS);
+ return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast");
+}
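// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A minimal standalone sketch of the index arithmetic behind
// computeVectorAddr for the column-major 4x4 example above. All names are
// invented for illustration; only the C++ standard library is used.
#include <cstdio>

// Element offset (relative to the base pointer) of column vector VecIdx when
// consecutive vectors start Stride elements apart.
static unsigned vectorStartIndex(unsigned VecIdx, unsigned Stride) {
  return VecIdx * Stride; // mirrors VecStart = VecIdx * Stride above
}

int main() {
  const unsigned Stride = 4; // rows of the enclosing 4x4 matrix
  // Columns of a 2x3 sub-matrix start 0, 4 and 8 elements past the base,
  // matching the "Column 0/1/2" walk-through in the comment above.
  for (unsigned Col = 0; Col < 3; ++Col)
    std::printf("column %u starts at Base + %u\n", Col,
                vectorStartIndex(Col, Stride));
  return 0;
}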
+
+/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
+///
+/// Currently, the lowering for each matrix intrinsic is done as follows:
+/// 1. Propagate the shape information from intrinsics to connected
+/// instructions.
+/// 2. Lower instructions with shape information (assuming column-major layout).
+/// The lowering works similarly using row-major layout.
+/// 2.1. Get column vectors for each argument. If we already lowered the
+/// definition of an argument, use the produced column vectors directly.
+///      If not, split the operand vector containing an embedded matrix into
+///      a set of column vectors.
+/// 2.2. Lower the instruction in terms of column major operations, which
+///      yields a set of column vectors containing the result matrix. Note
+///      that we lower all instructions that have shape information. Besides
+///      the intrinsics, this includes stores for example.
+/// 2.3. Update uses of the lowered instruction. If we have shape information
+/// for a user, there is nothing to do, as we will look up the result
+/// column matrix when lowering the user. For other uses, we embed the
+/// result matrix in a flat vector and update the use.
+/// 2.4. Cache the result column matrix for the instruction we lowered.
+/// 3. After we lowered all instructions in a function, remove the now
+/// obsolete instructions.
+///
+class LowerMatrixIntrinsics {
+ Function &Func;
+ const DataLayout &DL;
+ const TargetTransformInfo &TTI;
AliasAnalysis *AA;
DominatorTree *DT;
LoopInfo *LI;
OptimizationRemarkEmitter *ORE;
-
-  /// Contains estimates of the number of operations (loads, stores, compute)
-  /// required to lower a matrix operation.
- struct OpInfoTy {
- /// Number of stores emitted to generate this matrix.
- unsigned NumStores = 0;
- /// Number of loads emitted to generate this matrix.
- unsigned NumLoads = 0;
- /// Number of compute operations emitted to generate this matrix.
- unsigned NumComputeOps = 0;
-
- OpInfoTy &operator+=(const OpInfoTy &RHS) {
- NumStores += RHS.NumStores;
- NumLoads += RHS.NumLoads;
- NumComputeOps += RHS.NumComputeOps;
- return *this;
- }
- };
-
- /// Wrapper class representing a matrix as a set of vectors, either in row or
- /// column major layout. All vectors must have the same vector type.
- class MatrixTy {
- SmallVector<Value *, 16> Vectors;
-
- OpInfoTy OpInfo;
-
- bool IsColumnMajor = true;
-
- public:
- MatrixTy()
- : Vectors(),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
- MatrixTy(ArrayRef<Value *> Vectors)
- : Vectors(Vectors.begin(), Vectors.end()),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
- MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
- : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {
-
- unsigned D = isColumnMajor() ? NumColumns : NumRows;
- for (unsigned J = 0; J < D; ++J)
- addVector(UndefValue::get(FixedVectorType::get(
- EltTy, isColumnMajor() ? NumRows : NumColumns)));
- }
-
- Value *getVector(unsigned i) const { return Vectors[i]; }
- Value *getColumn(unsigned i) const {
- assert(isColumnMajor() && "only supported for column-major matrixes");
- return Vectors[i];
- }
- Value *getRow(unsigned i) const {
- assert(!isColumnMajor() && "only supported for row-major matrixes");
- return Vectors[i];
- }
-
- void setVector(unsigned i, Value *V) { Vectors[i] = V; }
-
+
+  /// Contains estimates of the number of operations (loads, stores, compute)
+  /// required to lower a matrix operation.
+ struct OpInfoTy {
+ /// Number of stores emitted to generate this matrix.
+ unsigned NumStores = 0;
+ /// Number of loads emitted to generate this matrix.
+ unsigned NumLoads = 0;
+ /// Number of compute operations emitted to generate this matrix.
+ unsigned NumComputeOps = 0;
+
+ OpInfoTy &operator+=(const OpInfoTy &RHS) {
+ NumStores += RHS.NumStores;
+ NumLoads += RHS.NumLoads;
+ NumComputeOps += RHS.NumComputeOps;
+ return *this;
+ }
+ };
+
+ /// Wrapper class representing a matrix as a set of vectors, either in row or
+ /// column major layout. All vectors must have the same vector type.
+ class MatrixTy {
+ SmallVector<Value *, 16> Vectors;
+
+ OpInfoTy OpInfo;
+
+ bool IsColumnMajor = true;
+
+ public:
+ MatrixTy()
+ : Vectors(),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy(ArrayRef<Value *> Vectors)
+ : Vectors(Vectors.begin(), Vectors.end()),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
+ : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {
+
+ unsigned D = isColumnMajor() ? NumColumns : NumRows;
+ for (unsigned J = 0; J < D; ++J)
+ addVector(UndefValue::get(FixedVectorType::get(
+ EltTy, isColumnMajor() ? NumRows : NumColumns)));
+ }
+
+ Value *getVector(unsigned i) const { return Vectors[i]; }
+ Value *getColumn(unsigned i) const {
+ assert(isColumnMajor() && "only supported for column-major matrixes");
+ return Vectors[i];
+ }
+ Value *getRow(unsigned i) const {
+ assert(!isColumnMajor() && "only supported for row-major matrixes");
+ return Vectors[i];
+ }
+
+ void setVector(unsigned i, Value *V) { Vectors[i] = V; }
+
Type *getElementType() const { return getVectorTy()->getElementType(); }
-
- unsigned getNumVectors() const {
- if (isColumnMajor())
- return getNumColumns();
- return getNumRows();
- }
-
- unsigned getNumColumns() const {
- if (isColumnMajor())
- return Vectors.size();
- else {
-      assert(Vectors.size() > 0 && "Cannot call getNumColumns without vectors");
- return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
- }
- }
- unsigned getNumRows() const {
- if (isColumnMajor()) {
- assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
- return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
- } else
- return Vectors.size();
- }
-
- void addVector(Value *V) { Vectors.push_back(V); }
- VectorType *getColumnTy() {
- assert(isColumnMajor() && "only supported for column-major matrixes");
- return getVectorTy();
- }
-
+
+ unsigned getNumVectors() const {
+ if (isColumnMajor())
+ return getNumColumns();
+ return getNumRows();
+ }
+
+ unsigned getNumColumns() const {
+ if (isColumnMajor())
+ return Vectors.size();
+ else {
+      assert(Vectors.size() > 0 && "Cannot call getNumColumns without vectors");
+ return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
+ }
+ }
+ unsigned getNumRows() const {
+ if (isColumnMajor()) {
+ assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
+ return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
+ } else
+ return Vectors.size();
+ }
+
+ void addVector(Value *V) { Vectors.push_back(V); }
+ VectorType *getColumnTy() {
+ assert(isColumnMajor() && "only supported for column-major matrixes");
+ return getVectorTy();
+ }
+
VectorType *getVectorTy() const {
- return cast<VectorType>(Vectors[0]->getType());
- }
-
- iterator_range<SmallVector<Value *, 8>::iterator> columns() {
- assert(isColumnMajor() &&
- "columns() only supported for column-major matrixes");
- return make_range(Vectors.begin(), Vectors.end());
- }
-
- iterator_range<SmallVector<Value *, 8>::iterator> vectors() {
- return make_range(Vectors.begin(), Vectors.end());
- }
-
- /// Embed the vectors of the matrix into a flat vector by concatenating
- /// them.
- Value *embedInVector(IRBuilder<> &Builder) const {
- return Vectors.size() == 1 ? Vectors[0]
- : concatenateVectors(Builder, Vectors);
- }
-
- MatrixTy &addNumLoads(unsigned N) {
- OpInfo.NumLoads += N;
- return *this;
- }
-
- void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }
-
- MatrixTy &addNumStores(unsigned N) {
- OpInfo.NumStores += N;
- return *this;
- }
-
- MatrixTy &addNumComputeOps(unsigned N) {
- OpInfo.NumComputeOps += N;
- return *this;
- }
-
- unsigned getNumStores() const { return OpInfo.NumStores; }
- unsigned getNumLoads() const { return OpInfo.NumLoads; }
- unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }
-
- const OpInfoTy &getOpInfo() const { return OpInfo; }
-
- bool isColumnMajor() const { return IsColumnMajor; }
-
- unsigned getStride() const {
- if (isColumnMajor())
- return getNumRows();
- return getNumColumns();
- }
-
- /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the
- /// matrix is column-major, the result vector is extracted from a column
- /// vector, otherwise from a row vector.
- Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
- IRBuilder<> &Builder) const {
- Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
- return Builder.CreateShuffleVector(
+ return cast<VectorType>(Vectors[0]->getType());
+ }
+
+ iterator_range<SmallVector<Value *, 8>::iterator> columns() {
+ assert(isColumnMajor() &&
+ "columns() only supported for column-major matrixes");
+ return make_range(Vectors.begin(), Vectors.end());
+ }
+
+ iterator_range<SmallVector<Value *, 8>::iterator> vectors() {
+ return make_range(Vectors.begin(), Vectors.end());
+ }
+
+ /// Embed the vectors of the matrix into a flat vector by concatenating
+ /// them.
+ Value *embedInVector(IRBuilder<> &Builder) const {
+ return Vectors.size() == 1 ? Vectors[0]
+ : concatenateVectors(Builder, Vectors);
+ }
+
+ MatrixTy &addNumLoads(unsigned N) {
+ OpInfo.NumLoads += N;
+ return *this;
+ }
+
+ void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }
+
+ MatrixTy &addNumStores(unsigned N) {
+ OpInfo.NumStores += N;
+ return *this;
+ }
+
+ MatrixTy &addNumComputeOps(unsigned N) {
+ OpInfo.NumComputeOps += N;
+ return *this;
+ }
+
+ unsigned getNumStores() const { return OpInfo.NumStores; }
+ unsigned getNumLoads() const { return OpInfo.NumLoads; }
+ unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }
+
+ const OpInfoTy &getOpInfo() const { return OpInfo; }
+
+ bool isColumnMajor() const { return IsColumnMajor; }
+
+ unsigned getStride() const {
+ if (isColumnMajor())
+ return getNumRows();
+ return getNumColumns();
+ }
+
+ /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the
+ /// matrix is column-major, the result vector is extracted from a column
+ /// vector, otherwise from a row vector.
+ Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
+ IRBuilder<> &Builder) const {
+ Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
+ return Builder.CreateShuffleVector(
Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
- "block");
- }
- };
-
- struct ShapeInfo {
- unsigned NumRows;
- unsigned NumColumns;
-
- bool IsColumnMajor;
-
- ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
- : NumRows(NumRows), NumColumns(NumColumns),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
-
- ShapeInfo(Value *NumRows, Value *NumColumns)
- : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
- cast<ConstantInt>(NumColumns)->getZExtValue()) {}
-
- bool operator==(const ShapeInfo &other) {
- return NumRows == other.NumRows && NumColumns == other.NumColumns;
- }
- bool operator!=(const ShapeInfo &other) { return !(*this == other); }
-
- /// Returns true if shape-information is defined, meaning both dimensions
- /// are != 0.
- operator bool() const {
- assert(NumRows == 0 || NumColumns != 0);
- return NumRows != 0;
- }
-
- unsigned getStride() const {
- if (IsColumnMajor)
- return NumRows;
- return NumColumns;
- }
-
- unsigned getNumVectors() const {
- if (IsColumnMajor)
- return NumColumns;
- return NumRows;
- }
- };
-
-  /// Maps instructions to their shape information. The shape information
-  /// describes the shape to be used while lowering. This matches the shape of
-  /// the result value of the instruction, with the only exceptions being store
-  /// instructions and the matrix_column_major_store intrinsics. For those, the
-  /// shape information describes the matrix value being stored and indicates
-  /// that those instructions should be lowered using shape information as well.
- DenseMap<Value *, ShapeInfo> ShapeMap;
-
-  /// List of instructions to remove. While lowering, we do not replace all
-  /// users of a lowered instruction if shape information is available; those
-  /// instructions need to be removed after lowering has finished.
- SmallVector<Instruction *, 16> ToRemove;
-
- /// Map from instructions to their produced column matrix.
- MapVector<Value *, MatrixTy> Inst2ColumnMatrix;
-
-public:
- LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
+ "block");
+ }
+ };
+
+ struct ShapeInfo {
+ unsigned NumRows;
+ unsigned NumColumns;
+
+ bool IsColumnMajor;
+
+ ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
+ : NumRows(NumRows), NumColumns(NumColumns),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+
+ ShapeInfo(Value *NumRows, Value *NumColumns)
+ : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
+ cast<ConstantInt>(NumColumns)->getZExtValue()) {}
+
+ bool operator==(const ShapeInfo &other) {
+ return NumRows == other.NumRows && NumColumns == other.NumColumns;
+ }
+ bool operator!=(const ShapeInfo &other) { return !(*this == other); }
+
+ /// Returns true if shape-information is defined, meaning both dimensions
+ /// are != 0.
+ operator bool() const {
+ assert(NumRows == 0 || NumColumns != 0);
+ return NumRows != 0;
+ }
+
+ unsigned getStride() const {
+ if (IsColumnMajor)
+ return NumRows;
+ return NumColumns;
+ }
+
+ unsigned getNumVectors() const {
+ if (IsColumnMajor)
+ return NumColumns;
+ return NumRows;
+ }
+ };
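// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of the shape rules the propagation code below relies
// on, using a trimmed-down stand-in for ShapeInfo. Names are invented for
// illustration.
#include <cassert>
#include <cstdio>

struct Shape { unsigned Rows = 0, Cols = 0; };

// matrix_multiply: an M x N matrix times an N x K matrix yields M x K.
static Shape multiplyShape(Shape A, Shape B) {
  assert(A.Cols == B.Rows && "inner dimensions must match");
  return {A.Rows, B.Cols};
}

// matrix_transpose: an M x N matrix becomes N x M.
static Shape transposeShape(Shape A) { return {A.Cols, A.Rows}; }

int main() {
  Shape C = multiplyShape({2, 8}, {8, 4}); // 2x8 * 8x4 -> 2x4
  Shape T = transposeShape(C);             // 2x4 -> 4x2
  std::printf("%ux%u %ux%u\n", C.Rows, C.Cols, T.Rows, T.Cols);
  return 0;
}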
+
+  /// Maps instructions to their shape information. The shape information
+  /// describes the shape to be used while lowering. This matches the shape of
+  /// the result value of the instruction, with the only exceptions being store
+  /// instructions and the matrix_column_major_store intrinsics. For those, the
+  /// shape information describes the matrix value being stored and indicates
+  /// that those instructions should be lowered using shape information as well.
+ DenseMap<Value *, ShapeInfo> ShapeMap;
+
+  /// List of instructions to remove. While lowering, we do not replace all
+  /// users of a lowered instruction if shape information is available; those
+  /// instructions need to be removed after lowering has finished.
+ SmallVector<Instruction *, 16> ToRemove;
+
+ /// Map from instructions to their produced column matrix.
+ MapVector<Value *, MatrixTy> Inst2ColumnMatrix;
+
+public:
+ LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
OptimizationRemarkEmitter *ORE)
- : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
- LI(LI), ORE(ORE) {}
-
- unsigned getNumOps(Type *VT) {
- assert(isa<VectorType>(VT) && "Expected vector type");
- return getNumOps(VT->getScalarType(),
- cast<FixedVectorType>(VT)->getNumElements());
- }
-
- //
- /// Return the estimated number of vector ops required for an operation on
- /// \p VT * N.
- unsigned getNumOps(Type *ST, unsigned N) {
- return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedSize() /
- double(TTI.getRegisterBitWidth(true)));
- }
-
- /// Return the set of vectors that a matrix value is lowered to.
- ///
-  /// If we lowered \p MatrixVal, just return the cached result matrix. Otherwise
- /// split the flat vector \p MatrixVal containing a matrix with shape \p SI
- /// into vectors.
- MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
- IRBuilder<> &Builder) {
- VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
- assert(VType && "MatrixVal must be a vector type");
- assert(cast<FixedVectorType>(VType)->getNumElements() ==
- SI.NumRows * SI.NumColumns &&
- "The vector size must match the number of matrix elements");
-
- // Check if we lowered MatrixVal using shape information. In that case,
- // return the existing matrix, if it matches the requested shape
- // information. If there is a mis-match, embed the result in a flat
- // vector and split it later.
- auto Found = Inst2ColumnMatrix.find(MatrixVal);
- if (Found != Inst2ColumnMatrix.end()) {
- MatrixTy &M = Found->second;
- // Return the found matrix, if its shape matches the requested shape
- // information
- if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
- return M;
-
- MatrixVal = M.embedInVector(Builder);
- }
-
- // Otherwise split MatrixVal.
- SmallVector<Value *, 16> SplitVecs;
- for (unsigned MaskStart = 0;
- MaskStart < cast<FixedVectorType>(VType)->getNumElements();
- MaskStart += SI.getStride()) {
- Value *V = Builder.CreateShuffleVector(
+ : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
+ LI(LI), ORE(ORE) {}
+
+ unsigned getNumOps(Type *VT) {
+ assert(isa<VectorType>(VT) && "Expected vector type");
+ return getNumOps(VT->getScalarType(),
+ cast<FixedVectorType>(VT)->getNumElements());
+ }
+
+ //
+ /// Return the estimated number of vector ops required for an operation on
+ /// \p VT * N.
+ unsigned getNumOps(Type *ST, unsigned N) {
+ return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedSize() /
+ double(TTI.getRegisterBitWidth(true)));
+ }
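// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of the op-count estimate used by getNumOps above,
// with the register width passed in explicitly instead of being queried
// from TargetTransformInfo. Names are invented for illustration.
#include <cmath>
#include <cstdio>

static unsigned estimateNumOps(unsigned ElementBits, unsigned NumElements,
                               unsigned RegisterBits) {
  return (unsigned)std::ceil(double(ElementBits) * NumElements /
                             double(RegisterBits));
}

int main() {
  // 16 doubles (64 bits each) on a target with 256-bit vector registers
  // need an estimated 4 vector operations.
  std::printf("%u\n", estimateNumOps(64, 16, 256));
  return 0;
}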
+
+ /// Return the set of vectors that a matrix value is lowered to.
+ ///
+  /// If we lowered \p MatrixVal, just return the cached result matrix. Otherwise
+ /// split the flat vector \p MatrixVal containing a matrix with shape \p SI
+ /// into vectors.
+ MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
+ IRBuilder<> &Builder) {
+ VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
+ assert(VType && "MatrixVal must be a vector type");
+ assert(cast<FixedVectorType>(VType)->getNumElements() ==
+ SI.NumRows * SI.NumColumns &&
+ "The vector size must match the number of matrix elements");
+
+ // Check if we lowered MatrixVal using shape information. In that case,
+ // return the existing matrix, if it matches the requested shape
+ // information. If there is a mis-match, embed the result in a flat
+ // vector and split it later.
+ auto Found = Inst2ColumnMatrix.find(MatrixVal);
+ if (Found != Inst2ColumnMatrix.end()) {
+ MatrixTy &M = Found->second;
+ // Return the found matrix, if its shape matches the requested shape
+ // information
+ if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
+ return M;
+
+ MatrixVal = M.embedInVector(Builder);
+ }
+
+ // Otherwise split MatrixVal.
+ SmallVector<Value *, 16> SplitVecs;
+ for (unsigned MaskStart = 0;
+ MaskStart < cast<FixedVectorType>(VType)->getNumElements();
+ MaskStart += SI.getStride()) {
+ Value *V = Builder.CreateShuffleVector(
MatrixVal, createSequentialMask(MaskStart, SI.getStride(), 0),
- "split");
- SplitVecs.push_back(V);
- }
-
- return {SplitVecs};
- }
-
- /// If \p V already has a known shape return false. Otherwise set the shape
- /// for instructions that support it.
- bool setShapeInfo(Value *V, ShapeInfo Shape) {
- assert(Shape && "Shape not set");
- if (isa<UndefValue>(V) || !supportsShapeInfo(V))
- return false;
-
- auto SIter = ShapeMap.find(V);
- if (SIter != ShapeMap.end()) {
- LLVM_DEBUG(dbgs() << " not overriding existing shape: "
- << SIter->second.NumRows << " "
- << SIter->second.NumColumns << " for " << *V << "\n");
- return false;
- }
-
- ShapeMap.insert({V, Shape});
- LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns
- << " for " << *V << "\n");
- return true;
- }
-
- bool isUniformShape(Value *V) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return true;
-
- switch (I->getOpcode()) {
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul: // Scalar multiply.
+ "split");
+ SplitVecs.push_back(V);
+ }
+
+ return {SplitVecs};
+ }
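// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of what getMatrix's "split the flat vector" step
// means for a column-major matrix, using std::vector instead of IR values.
// Names are invented for illustration.
#include <cstdio>
#include <vector>

static std::vector<std::vector<double>>
splitColumns(const std::vector<double> &Flat, unsigned Rows, unsigned Cols) {
  std::vector<std::vector<double>> Columns(Cols);
  for (unsigned C = 0; C < Cols; ++C)
    Columns[C].assign(Flat.begin() + C * Rows, Flat.begin() + (C + 1) * Rows);
  return Columns;
}

int main() {
  // A 2x3 matrix embedded column-major in a flat vector of 6 elements.
  std::vector<double> Flat = {1, 2, 3, 4, 5, 6};
  for (const auto &Col : splitColumns(Flat, 2, 3))
    std::printf("[%g %g] ", Col[0], Col[1]); // [1 2] [3 4] [5 6]
  std::printf("\n");
  return 0;
}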
+
+ /// If \p V already has a known shape return false. Otherwise set the shape
+ /// for instructions that support it.
+ bool setShapeInfo(Value *V, ShapeInfo Shape) {
+ assert(Shape && "Shape not set");
+ if (isa<UndefValue>(V) || !supportsShapeInfo(V))
+ return false;
+
+ auto SIter = ShapeMap.find(V);
+ if (SIter != ShapeMap.end()) {
+ LLVM_DEBUG(dbgs() << " not overriding existing shape: "
+ << SIter->second.NumRows << " "
+ << SIter->second.NumColumns << " for " << *V << "\n");
+ return false;
+ }
+
+ ShapeMap.insert({V, Shape});
+ LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns
+ << " for " << *V << "\n");
+ return true;
+ }
+
+ bool isUniformShape(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+
+ switch (I->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul: // Scalar multiply.
case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::Sub:
- return true;
- default:
- return false;
- }
- }
-
- /// Returns true if shape information can be used for \p V. The supported
- /// instructions must match the instructions that can be lowered by this pass.
- bool supportsShapeInfo(Value *V) {
- Instruction *Inst = dyn_cast<Instruction>(V);
- if (!Inst)
- return false;
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (II)
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- case Intrinsic::matrix_transpose:
- case Intrinsic::matrix_column_major_load:
- case Intrinsic::matrix_column_major_store:
- return true;
- default:
- return false;
- }
- return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
- }
-
- /// Propagate the shape information of instructions to their users.
- /// The work list contains instructions for which we can compute the shape,
- /// either based on the information provided by matrix intrinsics or known
- /// shapes of operands.
- SmallVector<Instruction *, 32>
- propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
- SmallVector<Instruction *, 32> NewWorkList;
-    // Pop an element for which we are guaranteed to have at least one of the
-    // operand shapes. Add the shape for this instruction and then add its
-    // users to the work list.
- LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
- while (!WorkList.empty()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::Sub:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /// Returns true if shape information can be used for \p V. The supported
+ /// instructions must match the instructions that can be lowered by this pass.
+ bool supportsShapeInfo(Value *V) {
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return false;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+ if (II)
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ case Intrinsic::matrix_transpose:
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
+ return true;
+ default:
+ return false;
+ }
+ return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
+ }
+
+ /// Propagate the shape information of instructions to their users.
+ /// The work list contains instructions for which we can compute the shape,
+ /// either based on the information provided by matrix intrinsics or known
+ /// shapes of operands.
+ SmallVector<Instruction *, 32>
+ propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
+ SmallVector<Instruction *, 32> NewWorkList;
+    // Pop an element for which we are guaranteed to have at least one of the
+    // operand shapes. Add the shape for this instruction and then add its
+    // users to the work list.
+ LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
+ while (!WorkList.empty()) {
Instruction *Inst = WorkList.pop_back_val();
-
- // New entry, set the value and insert operands
- bool Propagate = false;
-
- Value *MatrixA;
- Value *MatrixB;
- Value *M;
- Value *N;
- Value *K;
- if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>(
- m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
- m_Value(N), m_Value(K)))) {
- Propagate = setShapeInfo(Inst, {M, K});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>(
- m_Value(MatrixA), m_Value(M), m_Value(N)))) {
- // Flip dimensions.
- Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
- m_Value(MatrixA), m_Value(), m_Value(),
- m_Value(), m_Value(M), m_Value(N)))) {
- Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
- m_Value(), m_Value(), m_Value(), m_Value(M),
- m_Value(N)))) {
- Propagate = setShapeInfo(Inst, {M, N});
- } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
- auto OpShape = ShapeMap.find(MatrixA);
- if (OpShape != ShapeMap.end())
- setShapeInfo(Inst, OpShape->second);
- continue;
- } else if (isUniformShape(Inst)) {
- // Find the first operand that has a known shape and use that.
- for (auto &Op : Inst->operands()) {
- auto OpShape = ShapeMap.find(Op.get());
- if (OpShape != ShapeMap.end()) {
- Propagate |= setShapeInfo(Inst, OpShape->second);
- break;
- }
- }
- }
-
- if (Propagate) {
- NewWorkList.push_back(Inst);
- for (auto *User : Inst->users())
- if (ShapeMap.count(User) == 0)
- WorkList.push_back(cast<Instruction>(User));
- }
- }
-
- return NewWorkList;
- }
-
-  /// Propagate the shape to operands of instructions with shape information.
-  /// \p WorkList contains the instructions for which we already know the shape.
- SmallVector<Instruction *, 32>
- propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
- SmallVector<Instruction *, 32> NewWorkList;
-
- auto pushInstruction = [](Value *V,
- SmallVectorImpl<Instruction *> &WorkList) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (I)
- WorkList.push_back(I);
- };
-    // Pop an element with known shape. Traverse the operands; if an operand's
-    // shape derives from the result shape and is unknown, set it and add the
-    // operand to the worklist.
- LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
- while (!WorkList.empty()) {
+
+ // New entry, set the value and insert operands
+ bool Propagate = false;
+
+ Value *MatrixA;
+ Value *MatrixB;
+ Value *M;
+ Value *N;
+ Value *K;
+ if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>(
+ m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
+ m_Value(N), m_Value(K)))) {
+ Propagate = setShapeInfo(Inst, {M, K});
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>(
+ m_Value(MatrixA), m_Value(M), m_Value(N)))) {
+ // Flip dimensions.
+ Propagate = setShapeInfo(Inst, {N, M});
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+ m_Value(MatrixA), m_Value(), m_Value(),
+ m_Value(), m_Value(M), m_Value(N)))) {
+ Propagate = setShapeInfo(Inst, {N, M});
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
+ m_Value(), m_Value(), m_Value(), m_Value(M),
+ m_Value(N)))) {
+ Propagate = setShapeInfo(Inst, {M, N});
+ } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
+ auto OpShape = ShapeMap.find(MatrixA);
+ if (OpShape != ShapeMap.end())
+ setShapeInfo(Inst, OpShape->second);
+ continue;
+ } else if (isUniformShape(Inst)) {
+ // Find the first operand that has a known shape and use that.
+ for (auto &Op : Inst->operands()) {
+ auto OpShape = ShapeMap.find(Op.get());
+ if (OpShape != ShapeMap.end()) {
+ Propagate |= setShapeInfo(Inst, OpShape->second);
+ break;
+ }
+ }
+ }
+
+ if (Propagate) {
+ NewWorkList.push_back(Inst);
+ for (auto *User : Inst->users())
+ if (ShapeMap.count(User) == 0)
+ WorkList.push_back(cast<Instruction>(User));
+ }
+ }
+
+ return NewWorkList;
+ }
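// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of the worklist pattern used by propagateShapeForward
// above, reduced to a toy def-use graph in which each user adopts the shape
// of the value it was reached from; as in setShapeInfo, an existing shape is
// never overridden. Names are invented for illustration.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Shape { unsigned Rows, Cols; };

int main() {
  // "mul" is seeded (like a matrix intrinsic); "add" and "store" only learn
  // their shape through forward propagation.
  std::map<std::string, std::vector<std::string>> Users = {
      {"mul", {"add"}}, {"add", {"store"}}, {"store", {}}};
  std::map<std::string, Shape> ShapeMap = {{"mul", {2, 4}}};

  std::vector<std::string> WorkList = {"mul"};
  while (!WorkList.empty()) {
    std::string V = WorkList.back();
    WorkList.pop_back();
    for (const std::string &User : Users[V])
      if (ShapeMap.insert({User, ShapeMap[V]}).second) // do not override
        WorkList.push_back(User);
  }
  for (const auto &Entry : ShapeMap)
    std::printf("%s: %ux%u\n", Entry.first.c_str(), Entry.second.Rows,
                Entry.second.Cols);
  return 0;
}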
+
+  /// Propagate the shape to operands of instructions with shape information.
+  /// \p WorkList contains the instructions for which we already know the shape.
+ SmallVector<Instruction *, 32>
+ propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
+ SmallVector<Instruction *, 32> NewWorkList;
+
+ auto pushInstruction = [](Value *V,
+ SmallVectorImpl<Instruction *> &WorkList) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I)
+ WorkList.push_back(I);
+ };
+    // Pop an element with known shape. Traverse the operands; if an operand's
+    // shape derives from the result shape and is unknown, set it and add the
+    // operand to the worklist.
+ LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
+ while (!WorkList.empty()) {
Value *V = WorkList.pop_back_val();
-
- size_t BeforeProcessingV = WorkList.size();
- if (!isa<Instruction>(V))
- continue;
-
- Value *MatrixA;
- Value *MatrixB;
- Value *M;
- Value *N;
- Value *K;
- if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
- m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
- m_Value(N), m_Value(K)))) {
- if (setShapeInfo(MatrixA, {M, N}))
- pushInstruction(MatrixA, WorkList);
-
- if (setShapeInfo(MatrixB, {N, K}))
- pushInstruction(MatrixB, WorkList);
-
- } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
- m_Value(MatrixA), m_Value(M), m_Value(N)))) {
- // Flip dimensions.
- if (setShapeInfo(MatrixA, {M, N}))
- pushInstruction(MatrixA, WorkList);
- } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
- m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
- m_Value(M), m_Value(N)))) {
- if (setShapeInfo(MatrixA, {M, N})) {
- pushInstruction(MatrixA, WorkList);
- }
- } else if (isa<LoadInst>(V) ||
- match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
- // Nothing to do, no matrix input.
- } else if (isa<StoreInst>(V)) {
- // Nothing to do. We forward-propagated to this so we would just
- // backward propagate to an instruction with an already known shape.
- } else if (isUniformShape(V)) {
- // Propagate to all operands.
- ShapeInfo Shape = ShapeMap[V];
- for (Use &U : cast<Instruction>(V)->operands()) {
- if (setShapeInfo(U.get(), Shape))
- pushInstruction(U.get(), WorkList);
- }
- }
- // After we discovered new shape info for new instructions in the
- // worklist, we use their users as seeds for the next round of forward
- // propagation.
- for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
- for (User *U : WorkList[I]->users())
- if (isa<Instruction>(U) && V != U)
- NewWorkList.push_back(cast<Instruction>(U));
- }
- return NewWorkList;
- }
-
- bool Visit() {
- if (EnableShapePropagation) {
- SmallVector<Instruction *, 32> WorkList;
-
- // Initially only the shape of matrix intrinsics is known.
- // Initialize the work list with ops carrying shape information.
- for (BasicBlock &BB : Func)
- for (Instruction &Inst : BB) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
- if (!II)
- continue;
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- case Intrinsic::matrix_transpose:
- case Intrinsic::matrix_column_major_load:
- case Intrinsic::matrix_column_major_store:
- WorkList.push_back(&Inst);
- break;
- default:
- break;
- }
- }
- // Propagate shapes until nothing changes any longer.
- while (!WorkList.empty()) {
- WorkList = propagateShapeForward(WorkList);
- WorkList = propagateShapeBackward(WorkList);
- }
- }
-
- bool Changed = false;
- SmallVector<CallInst *, 16> MaybeFusableInsts;
- SmallVector<Instruction *, 16> MatrixInsts;
-
- // First, collect all instructions with shape information and candidates for
- // fusion (currently only matrix multiplies).
- ReversePostOrderTraversal<Function *> RPOT(&Func);
- for (auto *BB : RPOT)
- for (Instruction &I : *BB) {
- if (ShapeMap.find(&I) == ShapeMap.end())
- continue;
- if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
- MaybeFusableInsts.push_back(cast<CallInst>(&I));
- MatrixInsts.push_back(&I);
- }
-
- // Second, try to fuse candidates.
- SmallPtrSet<Instruction *, 16> FusedInsts;
- for (CallInst *CI : MaybeFusableInsts)
- LowerMatrixMultiplyFused(CI, FusedInsts);
- Changed = !FusedInsts.empty();
-
- // Third, lower remaining instructions with shape information.
- for (Instruction *Inst : MatrixInsts) {
- if (FusedInsts.count(Inst))
- continue;
-
- IRBuilder<> Builder(Inst);
-
- if (CallInst *CInst = dyn_cast<CallInst>(Inst))
- Changed |= VisitCallInst(CInst);
-
- Value *Op1;
- Value *Op2;
- if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
- Changed |= VisitBinaryOperator(BinOp);
+
+ size_t BeforeProcessingV = WorkList.size();
+ if (!isa<Instruction>(V))
+ continue;
+
+ Value *MatrixA;
+ Value *MatrixB;
+ Value *M;
+ Value *N;
+ Value *K;
+ if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
+ m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
+ m_Value(N), m_Value(K)))) {
+ if (setShapeInfo(MatrixA, {M, N}))
+ pushInstruction(MatrixA, WorkList);
+
+ if (setShapeInfo(MatrixB, {N, K}))
+ pushInstruction(MatrixB, WorkList);
+
+ } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
+ m_Value(MatrixA), m_Value(M), m_Value(N)))) {
+ // Flip dimensions.
+ if (setShapeInfo(MatrixA, {M, N}))
+ pushInstruction(MatrixA, WorkList);
+ } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+ m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
+ m_Value(M), m_Value(N)))) {
+ if (setShapeInfo(MatrixA, {M, N})) {
+ pushInstruction(MatrixA, WorkList);
+ }
+ } else if (isa<LoadInst>(V) ||
+ match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
+ // Nothing to do, no matrix input.
+ } else if (isa<StoreInst>(V)) {
+ // Nothing to do. We forward-propagated to this so we would just
+ // backward propagate to an instruction with an already known shape.
+ } else if (isUniformShape(V)) {
+ // Propagate to all operands.
+ ShapeInfo Shape = ShapeMap[V];
+ for (Use &U : cast<Instruction>(V)->operands()) {
+ if (setShapeInfo(U.get(), Shape))
+ pushInstruction(U.get(), WorkList);
+ }
+ }
+ // After we discovered new shape info for new instructions in the
+ // worklist, we use their users as seeds for the next round of forward
+ // propagation.
+ for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
+ for (User *U : WorkList[I]->users())
+ if (isa<Instruction>(U) && V != U)
+ NewWorkList.push_back(cast<Instruction>(U));
+ }
+ return NewWorkList;
+ }
+
+ bool Visit() {
+ if (EnableShapePropagation) {
+ SmallVector<Instruction *, 32> WorkList;
+
+ // Initially only the shape of matrix intrinsics is known.
+ // Initialize the work list with ops carrying shape information.
+ for (BasicBlock &BB : Func)
+ for (Instruction &Inst : BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
+ if (!II)
+ continue;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ case Intrinsic::matrix_transpose:
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
+ WorkList.push_back(&Inst);
+ break;
+ default:
+ break;
+ }
+ }
+ // Propagate shapes until nothing changes any longer.
+ while (!WorkList.empty()) {
+ WorkList = propagateShapeForward(WorkList);
+ WorkList = propagateShapeBackward(WorkList);
+ }
+ }
+
+ bool Changed = false;
+ SmallVector<CallInst *, 16> MaybeFusableInsts;
+ SmallVector<Instruction *, 16> MatrixInsts;
+
+ // First, collect all instructions with shape information and candidates for
+ // fusion (currently only matrix multiplies).
+ ReversePostOrderTraversal<Function *> RPOT(&Func);
+ for (auto *BB : RPOT)
+ for (Instruction &I : *BB) {
+ if (ShapeMap.find(&I) == ShapeMap.end())
+ continue;
+ if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
+ MaybeFusableInsts.push_back(cast<CallInst>(&I));
+ MatrixInsts.push_back(&I);
+ }
+
+ // Second, try to fuse candidates.
+ SmallPtrSet<Instruction *, 16> FusedInsts;
+ for (CallInst *CI : MaybeFusableInsts)
+ LowerMatrixMultiplyFused(CI, FusedInsts);
+ Changed = !FusedInsts.empty();
+
+ // Third, lower remaining instructions with shape information.
+ for (Instruction *Inst : MatrixInsts) {
+ if (FusedInsts.count(Inst))
+ continue;
+
+ IRBuilder<> Builder(Inst);
+
+ if (CallInst *CInst = dyn_cast<CallInst>(Inst))
+ Changed |= VisitCallInst(CInst);
+
+ Value *Op1;
+ Value *Op2;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
+ Changed |= VisitBinaryOperator(BinOp);
if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
Changed |= VisitUnaryOperator(UnOp);
- if (match(Inst, m_Load(m_Value(Op1))))
- Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
- else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
- Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
- }
-
+ if (match(Inst, m_Load(m_Value(Op1))))
+ Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
+ else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
+ Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
+ }
+
if (ORE) {
RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
RemarkGen.emitRemarks();
}
-
- for (Instruction *Inst : reverse(ToRemove))
- Inst->eraseFromParent();
-
- return Changed;
- }
-
- /// Turns \p BasePtr into an elementwise pointer to \p EltType.
- Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
- unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
- Type *EltPtrType = PointerType::get(EltType, AS);
- return Builder.CreatePointerCast(BasePtr, EltPtrType);
- }
-
- /// Replace intrinsic calls
- bool VisitCallInst(CallInst *Inst) {
- if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic())
- return false;
-
- switch (Inst->getCalledFunction()->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- LowerMultiply(Inst);
- break;
- case Intrinsic::matrix_transpose:
- LowerTranspose(Inst);
- break;
- case Intrinsic::matrix_column_major_load:
- LowerColumnMajorLoad(Inst);
- break;
- case Intrinsic::matrix_column_major_store:
- LowerColumnMajorStore(Inst);
- break;
- default:
- return false;
- }
- return true;
- }
-
- /// Compute the alignment for a column/row \p Idx with \p Stride between them.
- /// The address at \p Idx == 0 has alignment \p A. If \p Stride is a
- /// ConstantInt, reduce the initial alignment based on the byte offset. For
- /// non-ConstantInt strides, return the common alignment of the initial
- /// alignment and the element size in bytes.
- Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
- MaybeAlign A) const {
- Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
- if (Idx == 0)
- return InitialAlign;
-
- TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
- if (auto *ConstStride = dyn_cast<ConstantInt>(Stride)) {
- uint64_t StrideInBytes =
- ConstStride->getZExtValue() * ElementSizeInBits / 8;
- return commonAlignment(InitialAlign, Idx * StrideInBytes);
- }
- return commonAlignment(InitialAlign, ElementSizeInBits / 8);
- }
-
- /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
- /// vectors.
- MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
- bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
- auto VType = cast<VectorType>(Ty);
- Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
- MatrixTy Result;
- for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
- Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
- Shape.getStride(), VType->getElementType(),
- Builder);
- Value *Vector = Builder.CreateAlignedLoad(
- GEP, getAlignForIndex(I, Stride, VType->getElementType(), MAlign),
- IsVolatile, "col.load");
-
- Result.addVector(Vector);
- }
- return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
- Result.getNumVectors());
- }
-
- /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix,
- /// starting at \p MatrixPtr[I][J].
- MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
- ShapeInfo MatrixShape, Value *I, Value *J,
- ShapeInfo ResultShape, Type *EltTy,
- IRBuilder<> &Builder) {
-
- Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
-
- unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
- Value *EltPtr =
- Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
- Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
- auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
- ResultShape.NumColumns);
- Type *TilePtrTy = PointerType::get(TileTy, AS);
- Value *TilePtr =
- Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
-
- return loadMatrix(TileTy, TilePtr, Align,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile,
- ResultShape, Builder);
- }
-
- /// Lower a load instruction with shape information.
- void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
- bool IsVolatile, ShapeInfo Shape) {
- IRBuilder<> Builder(Inst);
- finalizeLowering(Inst,
- loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile,
- Shape, Builder),
- Builder);
- }
-
- /// Lowers llvm.matrix.column.major.load.
- ///
- /// The intrinsic loads a matrix from memory using a stride between columns.
- void LowerColumnMajorLoad(CallInst *Inst) {
- assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
- "Intrinsic only supports column-major layout!");
- Value *Ptr = Inst->getArgOperand(0);
- Value *Stride = Inst->getArgOperand(1);
- LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
- cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
- {Inst->getArgOperand(3), Inst->getArgOperand(4)});
- }
-
- /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
- /// MatrixPtr[I][J].
- void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
- MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
- Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
- Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
-
- unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
- Value *EltPtr =
- Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
- Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
- auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
- StoreVal.getNumColumns());
- Type *TilePtrTy = PointerType::get(TileTy, AS);
- Value *TilePtr =
- Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
-
- storeMatrix(TileTy, StoreVal, TilePtr, MAlign,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
- }
-
- /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
- /// vectors.
- MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
- MaybeAlign MAlign, Value *Stride, bool IsVolatile,
- IRBuilder<> &Builder) {
- auto VType = cast<VectorType>(Ty);
- Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
- for (auto Vec : enumerate(StoreVal.vectors())) {
- Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
- Stride, StoreVal.getStride(),
- VType->getElementType(), Builder);
- Builder.CreateAlignedStore(Vec.value(), GEP,
- getAlignForIndex(Vec.index(), Stride,
- VType->getElementType(),
- MAlign),
- IsVolatile);
- }
- return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
- StoreVal.getNumVectors());
- }
-
- /// Lower a store instruction with shape information.
- void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
- Value *Stride, bool IsVolatile, ShapeInfo Shape) {
- IRBuilder<> Builder(Inst);
- auto StoreVal = getMatrix(Matrix, Shape, Builder);
- finalizeLowering(Inst,
- storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
- IsVolatile, Builder),
- Builder);
- }
-
- /// Lowers llvm.matrix.column.major.store.
- ///
-  /// The intrinsic stores a matrix back to memory, using a stride between
-  /// columns.
- void LowerColumnMajorStore(CallInst *Inst) {
- assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
- "Intrinsic only supports column-major layout!");
- Value *Matrix = Inst->getArgOperand(0);
- Value *Ptr = Inst->getArgOperand(1);
- Value *Stride = Inst->getArgOperand(2);
- LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
- cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
- {Inst->getArgOperand(4), Inst->getArgOperand(5)});
- }
-
- // Set elements I..I+NumElts-1 to Block
- Value *insertVector(Value *Col, unsigned I, Value *Block,
- IRBuilder<> &Builder) {
-
- // First, bring Block to the same size as Col
- unsigned BlockNumElts =
- cast<FixedVectorType>(Block->getType())->getNumElements();
- unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
- assert(NumElts >= BlockNumElts && "Too few elements for current block");
-
- Block = Builder.CreateShuffleVector(
+
+ for (Instruction *Inst : reverse(ToRemove))
+ Inst->eraseFromParent();
+
+ return Changed;
+ }
+
+ /// Turns \p BasePtr into an elementwise pointer to \p EltType.
+ Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
+ unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+ Type *EltPtrType = PointerType::get(EltType, AS);
+ return Builder.CreatePointerCast(BasePtr, EltPtrType);
+ }
+
+ /// Replace intrinsic calls
+ bool VisitCallInst(CallInst *Inst) {
+ if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic())
+ return false;
+
+ switch (Inst->getCalledFunction()->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ LowerMultiply(Inst);
+ break;
+ case Intrinsic::matrix_transpose:
+ LowerTranspose(Inst);
+ break;
+ case Intrinsic::matrix_column_major_load:
+ LowerColumnMajorLoad(Inst);
+ break;
+ case Intrinsic::matrix_column_major_store:
+ LowerColumnMajorStore(Inst);
+ break;
+ default:
+ return false;
+ }
+ return true;
+ }
+
+ /// Compute the alignment for a column/row \p Idx with \p Stride between them.
+ /// The address at \p Idx == 0 has alignment \p A. If \p Stride is a
+ /// ConstantInt, reduce the initial alignment based on the byte offset. For
+ /// non-ConstantInt strides, return the common alignment of the initial
+ /// alignment and the element size in bytes.
+ Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
+ MaybeAlign A) const {
+ Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
+ if (Idx == 0)
+ return InitialAlign;
+
+ TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
+ if (auto *ConstStride = dyn_cast<ConstantInt>(Stride)) {
+ uint64_t StrideInBytes =
+ ConstStride->getZExtValue() * ElementSizeInBits / 8;
+ return commonAlignment(InitialAlign, Idx * StrideInBytes);
+ }
+ return commonAlignment(InitialAlign, ElementSizeInBits / 8);
+ }
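+  // Worked example (illustrative): with double elements (8 bytes), an initial
+  // alignment of 16 and a constant stride of 5 elements, the byte stride is
+  // 5 * 8 == 40, so the vector at Idx == 1 starts at byte offset 40 and
+  // commonAlignment(Align(16), 40) reduces the alignment to 8.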
+
+ /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
+ /// vectors.
+ MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
+ bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
+ auto VType = cast<VectorType>(Ty);
+ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+ MatrixTy Result;
+ for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
+ Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
+ Shape.getStride(), VType->getElementType(),
+ Builder);
+ Value *Vector = Builder.CreateAlignedLoad(
+ GEP, getAlignForIndex(I, Stride, VType->getElementType(), MAlign),
+ IsVolatile, "col.load");
+
+ Result.addVector(Vector);
+ }
+ return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors());
+ }
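+  // Example (illustrative): a column-major 4 x 3 load produces three "col.load"
+  // vector loads of four elements each, with consecutive loads starting Stride
+  // elements apart in memory.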
+
+ /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix,
+ /// starting at \p MatrixPtr[I][J].
+ MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
+ ShapeInfo MatrixShape, Value *I, Value *J,
+ ShapeInfo ResultShape, Type *EltTy,
+ IRBuilder<> &Builder) {
+
+ Value *Offset = Builder.CreateAdd(
+ Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+
+ unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
+ Value *EltPtr =
+ Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
+ Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+ auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
+ ResultShape.NumColumns);
+ Type *TilePtrTy = PointerType::get(TileTy, AS);
+ Value *TilePtr =
+ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
+
+ return loadMatrix(TileTy, TilePtr, Align,
+ Builder.getInt64(MatrixShape.getStride()), IsVolatile,
+ ResultShape, Builder);
+ }
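+  // Example (illustrative): for a column-major 8 x 8 matrix, the tile anchored
+  // at I == 2, J == 3 starts at linear element offset J * Stride + I ==
+  // 3 * 8 + 2 == 26, which is the Offset computed above before the tile is
+  // loaded with the outer matrix stride.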
+
+ /// Lower a load instruction with shape information.
+ void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
+ bool IsVolatile, ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ finalizeLowering(Inst,
+ loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile,
+ Shape, Builder),
+ Builder);
+ }
+
+ /// Lowers llvm.matrix.column.major.load.
+ ///
+ /// The intrinsic loads a matrix from memory using a stride between columns.
+ void LowerColumnMajorLoad(CallInst *Inst) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Intrinsic only supports column-major layout!");
+ Value *Ptr = Inst->getArgOperand(0);
+ Value *Stride = Inst->getArgOperand(1);
+ LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
+ cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
+ {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+ }
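+  // For reference, the operand order unpacked above is (illustrative):
+  //   llvm.matrix.column.major.load(Ptr, Stride, IsVolatile, Rows, Columns)
+  // e.g. a 4 x 4 load with stride 6 reads four columns of four elements that
+  // start six elements apart in memory.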
+
+ /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
+ /// MatrixPtr[I][J].
+ void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
+ MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
+ Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
+ Value *Offset = Builder.CreateAdd(
+ Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+
+ unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
+ Value *EltPtr =
+ Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
+ Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+ auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
+ StoreVal.getNumColumns());
+ Type *TilePtrTy = PointerType::get(TileTy, AS);
+ Value *TilePtr =
+ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
+
+ storeMatrix(TileTy, StoreVal, TilePtr, MAlign,
+ Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
+ }
+
+ /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
+ /// vectors.
+ MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
+ MaybeAlign MAlign, Value *Stride, bool IsVolatile,
+ IRBuilder<> &Builder) {
+ auto VType = cast<VectorType>(Ty);
+ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+ for (auto Vec : enumerate(StoreVal.vectors())) {
+ Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
+ Stride, StoreVal.getStride(),
+ VType->getElementType(), Builder);
+ Builder.CreateAlignedStore(Vec.value(), GEP,
+ getAlignForIndex(Vec.index(), Stride,
+ VType->getElementType(),
+ MAlign),
+ IsVolatile);
+ }
+ return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
+ StoreVal.getNumVectors());
+ }
+
+ /// Lower a store instruction with shape information.
+ void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
+ Value *Stride, bool IsVolatile, ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ auto StoreVal = getMatrix(Matrix, Shape, Builder);
+ finalizeLowering(Inst,
+ storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
+ IsVolatile, Builder),
+ Builder);
+ }
+
+ /// Lowers llvm.matrix.column.major.store.
+ ///
+  /// The intrinsic stores a matrix to memory using a stride between columns.
+ void LowerColumnMajorStore(CallInst *Inst) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Intrinsic only supports column-major layout!");
+ Value *Matrix = Inst->getArgOperand(0);
+ Value *Ptr = Inst->getArgOperand(1);
+ Value *Stride = Inst->getArgOperand(2);
+ LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
+ cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
+ {Inst->getArgOperand(4), Inst->getArgOperand(5)});
+ }
+
+ // Set elements I..I+NumElts-1 to Block
+ Value *insertVector(Value *Col, unsigned I, Value *Block,
+ IRBuilder<> &Builder) {
+
+ // First, bring Block to the same size as Col
+ unsigned BlockNumElts =
+ cast<FixedVectorType>(Block->getType())->getNumElements();
+ unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
+ assert(NumElts >= BlockNumElts && "Too few elements for current block");
+
+ Block = Builder.CreateShuffleVector(
Block, createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts));
-
- // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
- // 8, 4, 5, 6
- SmallVector<int, 16> Mask;
- unsigned i;
- for (i = 0; i < I; i++)
- Mask.push_back(i);
-
- unsigned VecNumElts =
- cast<FixedVectorType>(Col->getType())->getNumElements();
- for (; i < I + BlockNumElts; i++)
- Mask.push_back(i - I + VecNumElts);
-
- for (; i < VecNumElts; i++)
- Mask.push_back(i);
-
- return Builder.CreateShuffleVector(Col, Block, Mask);
- }
-
- Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
- IRBuilder<> &Builder, bool AllowContraction,
- unsigned &NumComputeOps) {
- NumComputeOps += getNumOps(A->getType());
- if (!Sum)
- return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
-
- if (UseFPOp) {
- if (AllowContraction) {
- // Use fmuladd for floating point operations and let the backend decide
- // if that's profitable.
- Function *FMulAdd = Intrinsic::getDeclaration(
- Func.getParent(), Intrinsic::fmuladd, A->getType());
- return Builder.CreateCall(FMulAdd, {A, B, Sum});
- }
- NumComputeOps += getNumOps(A->getType());
- Value *Mul = Builder.CreateFMul(A, B);
- return Builder.CreateFAdd(Sum, Mul);
- }
-
- NumComputeOps += getNumOps(A->getType());
- Value *Mul = Builder.CreateMul(A, B);
- return Builder.CreateAdd(Sum, Mul);
- }
-
- /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
-  /// users with shape information, there's nothing to do: they will use the
- /// cached value when they are lowered. For other users, \p Matrix is
- /// flattened and the uses are updated to use it. Also marks \p Inst for
- /// deletion.
- void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
- IRBuilder<> &Builder) {
- Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
-
- ToRemove.push_back(Inst);
- Value *Flattened = nullptr;
- for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
- Use &U = *I++;
- if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
- if (!Flattened)
- Flattened = Matrix.embedInVector(Builder);
- U.set(Flattened);
- }
- }
- }
-
- /// Compute \p Result += \p A * \p B for input matrices with left-associating
- /// addition.
- void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
- const MatrixTy &B, bool AllowContraction,
- IRBuilder<> &Builder, bool isTiled) {
- const unsigned VF = std::max<unsigned>(
- TTI.getRegisterBitWidth(true) /
- Result.getElementType()->getPrimitiveSizeInBits().getFixedSize(),
- 1U);
- unsigned R = Result.getNumRows();
- unsigned C = Result.getNumColumns();
- unsigned M = A.getNumColumns();
-
- bool IsFP = Result.getElementType()->isFloatingPointTy();
- assert(A.isColumnMajor() == B.isColumnMajor() &&
- Result.isColumnMajor() == A.isColumnMajor() &&
- "operands must agree on matrix layout");
- unsigned NumComputeOps = 0;
- if (A.isColumnMajor()) {
- // Multiply columns from the first operand with scalars from the second
-      // operand. Then move along the K axis and accumulate the columns. With
- // this the adds can be vectorized without reassociation.
- for (unsigned J = 0; J < C; ++J) {
- unsigned BlockSize = VF;
- // If Result is zero, we don't need to accumulate in the K==0 iteration.
- bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
-
- for (unsigned I = 0; I < R; I += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (I + BlockSize > R)
- BlockSize /= 2;
-
- Value *Sum = isTiled ? Result.extractVector(I, J, BlockSize, Builder)
- : nullptr;
- for (unsigned K = 0; K < M; ++K) {
- Value *L = A.extractVector(I, K, BlockSize, Builder);
- Value *RH = Builder.CreateExtractElement(B.getColumn(J), K);
- Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
- Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
- Result.getElementType()->isFloatingPointTy(),
- Builder, AllowContraction, NumComputeOps);
- }
- Result.setVector(J,
- insertVector(Result.getVector(J), I, Sum, Builder));
- }
- }
- } else {
- // Multiply rows from the second operand with scalars from the first
-      // operand. Then move along the K axis and accumulate the rows. With this
- // the adds can be vectorized without reassociation.
- for (unsigned I = 0; I < R; ++I) {
- unsigned BlockSize = VF;
- bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
- for (unsigned J = 0; J < C; J += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (J + BlockSize > C)
- BlockSize /= 2;
-
- Value *Sum = nullptr;
- for (unsigned K = 0; K < M; ++K) {
- Value *R = B.extractVector(K, J, BlockSize, Builder);
- Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
- Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
- Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
- IsFP, Builder, AllowContraction, NumComputeOps);
- }
- Result.setVector(I,
- insertVector(Result.getVector(I), J, Sum, Builder));
- }
- }
- }
- Result.addNumComputeOps(NumComputeOps);
- }
-
- /// Ensure that the memory in \p Load does not alias \p Store by potentially
-  /// copying it to a new location. The new location, or otherwise the original
-  /// one, is returned.
- Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
- CallInst *MatMul) {
- MemoryLocation StoreLoc = MemoryLocation::get(Store);
- MemoryLocation LoadLoc = MemoryLocation::get(Load);
-
+
+ // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
+ // 8, 4, 5, 6
+ SmallVector<int, 16> Mask;
+ unsigned i;
+ for (i = 0; i < I; i++)
+ Mask.push_back(i);
+
+ unsigned VecNumElts =
+ cast<FixedVectorType>(Col->getType())->getNumElements();
+ for (; i < I + BlockNumElts; i++)
+ Mask.push_back(i - I + VecNumElts);
+
+ for (; i < VecNumElts; i++)
+ Mask.push_back(i);
+
+ return Builder.CreateShuffleVector(Col, Block, Mask);
+ }
+
+ Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
+ IRBuilder<> &Builder, bool AllowContraction,
+ unsigned &NumComputeOps) {
+ NumComputeOps += getNumOps(A->getType());
+ if (!Sum)
+ return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+
+ if (UseFPOp) {
+ if (AllowContraction) {
+ // Use fmuladd for floating point operations and let the backend decide
+ // if that's profitable.
+ Function *FMulAdd = Intrinsic::getDeclaration(
+ Func.getParent(), Intrinsic::fmuladd, A->getType());
+ return Builder.CreateCall(FMulAdd, {A, B, Sum});
+ }
+ NumComputeOps += getNumOps(A->getType());
+ Value *Mul = Builder.CreateFMul(A, B);
+ return Builder.CreateFAdd(Sum, Mul);
+ }
+
+ NumComputeOps += getNumOps(A->getType());
+ Value *Mul = Builder.CreateMul(A, B);
+ return Builder.CreateAdd(Sum, Mul);
+ }
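+  // Note (illustrative): with contraction allowed, Sum + A * B becomes a single
+  // llvm.fmuladd call that the backend may lower to a fused multiply-add;
+  // otherwise the separate fmul/fadd (or mul/add) pair above is emitted.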
+
+ /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
+ /// users with shape information, there's nothing to do: the will use the
+ /// cached value when they are lowered. For other users, \p Matrix is
+ /// flattened and the uses are updated to use it. Also marks \p Inst for
+ /// deletion.
+ void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
+ IRBuilder<> &Builder) {
+ Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
+
+ ToRemove.push_back(Inst);
+ Value *Flattened = nullptr;
+ for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
+ Use &U = *I++;
+ if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
+ if (!Flattened)
+ Flattened = Matrix.embedInVector(Builder);
+ U.set(Flattened);
+ }
+ }
+ }
+
+ /// Compute \p Result += \p A * \p B for input matrices with left-associating
+ /// addition.
+ void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
+ const MatrixTy &B, bool AllowContraction,
+ IRBuilder<> &Builder, bool isTiled) {
+ const unsigned VF = std::max<unsigned>(
+ TTI.getRegisterBitWidth(true) /
+ Result.getElementType()->getPrimitiveSizeInBits().getFixedSize(),
+ 1U);
+ unsigned R = Result.getNumRows();
+ unsigned C = Result.getNumColumns();
+ unsigned M = A.getNumColumns();
+
+ bool IsFP = Result.getElementType()->isFloatingPointTy();
+ assert(A.isColumnMajor() == B.isColumnMajor() &&
+ Result.isColumnMajor() == A.isColumnMajor() &&
+ "operands must agree on matrix layout");
+ unsigned NumComputeOps = 0;
+ if (A.isColumnMajor()) {
+ // Multiply columns from the first operand with scalars from the second
+      // operand. Then move along the K axis and accumulate the columns. With
+ // this the adds can be vectorized without reassociation.
+ for (unsigned J = 0; J < C; ++J) {
+ unsigned BlockSize = VF;
+ // If Result is zero, we don't need to accumulate in the K==0 iteration.
+ bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
+
+ for (unsigned I = 0; I < R; I += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (I + BlockSize > R)
+ BlockSize /= 2;
+
+ Value *Sum = isTiled ? Result.extractVector(I, J, BlockSize, Builder)
+ : nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *L = A.extractVector(I, K, BlockSize, Builder);
+ Value *RH = Builder.CreateExtractElement(B.getColumn(J), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
+ Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
+ Result.getElementType()->isFloatingPointTy(),
+ Builder, AllowContraction, NumComputeOps);
+ }
+ Result.setVector(J,
+ insertVector(Result.getVector(J), I, Sum, Builder));
+ }
+ }
+ } else {
+ // Multiply rows from the second operand with scalars from the first
+      // operand. Then move along the K axis and accumulate the rows. With this
+ // the adds can be vectorized without reassociation.
+ for (unsigned I = 0; I < R; ++I) {
+ unsigned BlockSize = VF;
+ bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
+ for (unsigned J = 0; J < C; J += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (J + BlockSize > C)
+ BlockSize /= 2;
+
+ Value *Sum = nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *R = B.extractVector(K, J, BlockSize, Builder);
+ Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
+ Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
+ IsFP, Builder, AllowContraction, NumComputeOps);
+ }
+ Result.setVector(I,
+ insertVector(Result.getVector(I), J, Sum, Builder));
+ }
+ }
+ }
+ Result.addNumComputeOps(NumComputeOps);
+ }
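+  // Sketch (illustrative) of the column-major path above for a 2 x 2 product:
+  //   Result.col(j) = A.col(0) * splat(B[0][j]) + A.col(1) * splat(B[1][j])
+  // so every add combines whole vectors and no reassociation is needed.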
+
+ /// Ensure that the memory in \p Load does not alias \p Store by potentially
+  /// copying it to a new location. The new location, or otherwise the original
+  /// one, is returned.
+ Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
+ CallInst *MatMul) {
+ MemoryLocation StoreLoc = MemoryLocation::get(Store);
+ MemoryLocation LoadLoc = MemoryLocation::get(Load);
+
AliasResult LdAliased = AA->alias(LoadLoc, StoreLoc);
-
- // If we can statically determine noalias we're good.
- if (!LdAliased)
- return Load->getPointerOperand();
-
- // Create code to check if the memory locations of the Load and Store
- // overlap and if they do, copy Load's operand to a new buffer.
-
-    // First, create new blocks for the 2nd part of the check and the copy.
- BasicBlock *Check0 = MatMul->getParent();
- // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
- // DT. Manually collect dominator tree updates, to avoid unnecessary work,
- // as we adjust Check0 and Check1's branches.
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- for (BasicBlock *Succ : successors(Check0))
+
+ // If we can statically determine noalias we're good.
+ if (!LdAliased)
+ return Load->getPointerOperand();
+
+ // Create code to check if the memory locations of the Load and Store
+ // overlap and if they do, copy Load's operand to a new buffer.
+
+    // First, create new blocks for the 2nd part of the check and the copy.
+ BasicBlock *Check0 = MatMul->getParent();
+ // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
+ // DT. Manually collect dominator tree updates, to avoid unnecessary work,
+ // as we adjust Check0 and Check1's branches.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (BasicBlock *Succ : successors(Check0))
DTUpdates.push_back({DT->Delete, Check0, Succ});
-
+
BasicBlock *Check1 =
SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
nullptr, "alias_cont");
- BasicBlock *Copy =
+ BasicBlock *Copy =
SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
nullptr, "copy");
BasicBlock *Fusion =
SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
nullptr, "no_alias");
-
- // Check if the loaded memory location begins before the end of the store
- // location. If the condition holds, they might overlap, otherwise they are
- // guaranteed to not overlap.
- IRBuilder<> Builder(MatMul);
- Check0->getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(Check0);
- Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
- Value *StoreBegin = Builder.CreatePtrToInt(
- const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
- Value *StoreEnd = Builder.CreateAdd(
- StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
- "store.end", true, true);
- Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
- IntPtrTy, "load.begin");
- Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
- Fusion);
-
- // Check if the store begins before the end of the load location. If the
- // condition holds, they alias, otherwise they are guaranteed to not
- // overlap.
- Check1->getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(Check1, Check1->begin());
- Value *LoadEnd = Builder.CreateAdd(
- LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
- "load.end", true, true);
- Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
- Fusion);
-
- // Copy load operand to new alloca.
- Builder.SetInsertPoint(Copy, Copy->begin());
- AllocaInst *NewLd =
- Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
- Builder.CreateMemCpy(NewLd, NewLd->getAlign(),
- Load->getPointerOperand(), Load->getAlign(),
- LoadLoc.Size.getValue());
- Builder.SetInsertPoint(Fusion, Fusion->begin());
- PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
- PHI->addIncoming(Load->getPointerOperand(), Check0);
- PHI->addIncoming(Load->getPointerOperand(), Check1);
- PHI->addIncoming(NewLd, Copy);
-
- // Adjust DT.
+
+ // Check if the loaded memory location begins before the end of the store
+ // location. If the condition holds, they might overlap, otherwise they are
+ // guaranteed to not overlap.
+ IRBuilder<> Builder(MatMul);
+ Check0->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(Check0);
+ Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
+ Value *StoreBegin = Builder.CreatePtrToInt(
+ const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
+ Value *StoreEnd = Builder.CreateAdd(
+ StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
+ "store.end", true, true);
+ Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
+ IntPtrTy, "load.begin");
+ Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
+ Fusion);
+
+ // Check if the store begins before the end of the load location. If the
+ // condition holds, they alias, otherwise they are guaranteed to not
+ // overlap.
+ Check1->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(Check1, Check1->begin());
+ Value *LoadEnd = Builder.CreateAdd(
+ LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
+ "load.end", true, true);
+ Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
+ Fusion);
+
+ // Copy load operand to new alloca.
+ Builder.SetInsertPoint(Copy, Copy->begin());
+ AllocaInst *NewLd =
+ Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
+ Builder.CreateMemCpy(NewLd, NewLd->getAlign(),
+ Load->getPointerOperand(), Load->getAlign(),
+ LoadLoc.Size.getValue());
+ Builder.SetInsertPoint(Fusion, Fusion->begin());
+ PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
+ PHI->addIncoming(Load->getPointerOperand(), Check0);
+ PHI->addIncoming(Load->getPointerOperand(), Check1);
+ PHI->addIncoming(NewLd, Copy);
+
+ // Adjust DT.
DTUpdates.push_back({DT->Insert, Check0, Check1});
DTUpdates.push_back({DT->Insert, Check0, Fusion});
DTUpdates.push_back({DT->Insert, Check1, Copy});
DTUpdates.push_back({DT->Insert, Check1, Fusion});
DT->applyUpdates(DTUpdates);
- return PHI;
- }
-
- bool isFusionProfitable(CallInst *MatMul) {
- if (ForceFusion)
- return true;
-
- ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
- ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
-
- const unsigned R = LShape.NumRows;
- const unsigned C = RShape.NumColumns;
- const unsigned M = LShape.NumColumns;
- auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
-
- const unsigned VF =
- std::max<unsigned>(TTI.getRegisterBitWidth(true) /
- EltType->getPrimitiveSizeInBits().getFixedSize(),
- 1U);
-
- // Cost model for tiling
- //
- // For tiling to be beneficial, we need reuse either along the R or
- // the C axis. We vectorize along the R axis so that means at least
- // 3 elements.
- // TODO: Also consider cost of copying if operands alias.
- if (R <= VF && C == 1)
- return false;
- // Then we need enough elements to exceed the number of vector
- // registers we have. Note that this is an oversimplification since
- // fusing also takes some extra loads which may exceed the number of
- // reloads necessary.
- unsigned Op0Regs = (R + VF - 1) / VF * M;
- unsigned Op1Regs = (M + VF - 1) / VF * C;
- return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
- }
-
- MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
- MatrixTy Res;
- auto *ColumType = FixedVectorType::get(EltType, R);
- for (unsigned I = 0; I < C; ++I)
- Res.addVector(ConstantAggregateZero::get(ColumType));
- return Res;
- }
-
+ return PHI;
+ }
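+  // The runtime check emitted above is the standard interval-overlap test
+  // (illustrative): [LoadBegin, LoadEnd) and [StoreBegin, StoreEnd) overlap iff
+  // LoadBegin < StoreEnd && StoreBegin < LoadEnd; only when both comparisons
+  // hold is the loaded region copied into a fresh alloca.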
+
+ bool isFusionProfitable(CallInst *MatMul) {
+ if (ForceFusion)
+ return true;
+
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ const unsigned M = LShape.NumColumns;
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+ const unsigned VF =
+ std::max<unsigned>(TTI.getRegisterBitWidth(true) /
+ EltType->getPrimitiveSizeInBits().getFixedSize(),
+ 1U);
+
+ // Cost model for tiling
+ //
+ // For tiling to be beneficial, we need reuse either along the R or
+ // the C axis. We vectorize along the R axis so that means at least
+ // 3 elements.
+ // TODO: Also consider cost of copying if operands alias.
+ if (R <= VF && C == 1)
+ return false;
+ // Then we need enough elements to exceed the number of vector
+ // registers we have. Note that this is an oversimplification since
+ // fusing also takes some extra loads which may exceed the number of
+ // reloads necessary.
+ unsigned Op0Regs = (R + VF - 1) / VF * M;
+ unsigned Op1Regs = (M + VF - 1) / VF * C;
+ return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
+ }
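+  // Worked example (illustrative) of the estimate above: for R == C == M == 8
+  // with 256-bit vectors of double (VF == 4), Op0Regs == Op1Regs ==
+  // (8 + 3) / 4 * 8 == 16, so 32 vector registers in total, which exceeds a
+  // typical 16-register file and makes fusion look profitable.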
+
+ MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
+ MatrixTy Res;
+ auto *ColumType = FixedVectorType::get(EltType, R);
+ for (unsigned I = 0; I < C; ++I)
+ Res.addVector(ConstantAggregateZero::get(ColumType));
+ return Res;
+ }
+
void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
Value *RPtr, ShapeInfo RShape, StoreInst *Store,
bool AllowContract) {
@@ -1266,28 +1266,28 @@ public:
"llvm.loop.unroll.count", InnerLoopUnrollCount);
}
- void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
- StoreInst *Store,
- SmallPtrSetImpl<Instruction *> &FusedInsts) {
- assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
- "Tiling only supported for column-major matrixes at the moment!");
- if (!isFusionProfitable(MatMul))
- return;
-
- ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
- ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
-
- const unsigned R = LShape.NumRows;
- const unsigned C = RShape.NumColumns;
- const unsigned M = LShape.NumColumns;
- auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
-
- Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
- Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
- Value *CPtr = Store->getPointerOperand();
-
- bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
- MatMul->hasAllowContract());
+ void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
+ StoreInst *Store,
+ SmallPtrSetImpl<Instruction *> &FusedInsts) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Tiling only supported for column-major matrixes at the moment!");
+ if (!isFusionProfitable(MatMul))
+ return;
+
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ const unsigned M = LShape.NumColumns;
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+ Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
+ Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
+ Value *CPtr = Store->getPointerOperand();
+
+ bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+ MatMul->hasAllowContract());
if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store,
AllowContract);
@@ -1298,7 +1298,7 @@ public:
const unsigned TileR = std::min(R - I, unsigned(TileSize));
const unsigned TileC = std::min(C - J, unsigned(TileSize));
MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
-
+
for (unsigned K = 0; K < M; K += TileSize) {
const unsigned TileM = std::min(M - K, unsigned(TileSize));
MatrixTy A =
@@ -1314,192 +1314,192 @@ public:
storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
Builder.getInt64(I), Builder.getInt64(J), EltType,
Builder);
- }
+ }
+ }
+
+ // Mark eliminated instructions as fused and remove them.
+ FusedInsts.insert(Store);
+ FusedInsts.insert(MatMul);
+ Store->eraseFromParent();
+ MatMul->eraseFromParent();
+ if (LoadOp0->hasNUses(0)) {
+ FusedInsts.insert(LoadOp0);
+ LoadOp0->eraseFromParent();
}
-
- // Mark eliminated instructions as fused and remove them.
- FusedInsts.insert(Store);
- FusedInsts.insert(MatMul);
- Store->eraseFromParent();
- MatMul->eraseFromParent();
- if (LoadOp0->hasNUses(0)) {
- FusedInsts.insert(LoadOp0);
- LoadOp0->eraseFromParent();
- }
- if (LoadOp1->hasNUses(0)) {
- FusedInsts.insert(LoadOp1);
- LoadOp1->eraseFromParent();
- }
- }
-
- /// Try to lower matrix multiply chains by fusing operations.
- ///
- /// Currently we only lower {ld, ld} -> matmul -> st chains.
-  ///
- /// No need to return a MatrixTy object for the result of the operation, since
- /// the single store user will be lowered as part of this. Instructions that
- /// are completely eliminated by fusion are added to \p FusedInsts.
- void LowerMatrixMultiplyFused(CallInst *MatMul,
- SmallPtrSetImpl<Instruction *> &FusedInsts) {
- if (!FuseMatrix || !MatMul->hasOneUse() ||
+ if (LoadOp1->hasNUses(0)) {
+ FusedInsts.insert(LoadOp1);
+ LoadOp1->eraseFromParent();
+ }
+ }
+
+ /// Try to lower matrix multiply chains by fusing operations.
+ ///
+ /// Currently we only lower {ld, ld} -> matmul -> st chains.
+  ///
+ /// No need to return a MatrixTy object for the result of the operation, since
+ /// the single store user will be lowered as part of this. Instructions that
+ /// are completely eliminated by fusion are added to \p FusedInsts.
+ void LowerMatrixMultiplyFused(CallInst *MatMul,
+ SmallPtrSetImpl<Instruction *> &FusedInsts) {
+ if (!FuseMatrix || !MatMul->hasOneUse() ||
MatrixLayout != MatrixLayoutTy::ColumnMajor || !DT)
- return;
-
+ return;
+
assert(AA && LI && "Analyses should be available");
- auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
- auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
- auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
- if (LoadOp0 && LoadOp1 && Store) {
- // The store address must dominate the MatMul instruction, otherwise
- // we create invalid IR.
- // FIXME: See if we can hoist the store address computation.
- auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
+ auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
+ auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
+ auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
+ if (LoadOp0 && LoadOp1 && Store) {
+ // The store address must dominate the MatMul instruction, otherwise
+ // we create invalid IR.
+ // FIXME: See if we can hoist the store address computation.
+ auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
if (AddrI && (!DT->dominates(AddrI, MatMul)))
- return;
-
- emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
- return;
- }
- }
-
- /// Lowers llvm.matrix.multiply.
- void LowerMultiply(CallInst *MatMul) {
- IRBuilder<> Builder(MatMul);
- auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
- ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
- ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
-
- const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
- const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
+ return;
+
+ emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
+ return;
+ }
+ }
+
+ /// Lowers llvm.matrix.multiply.
+ void LowerMultiply(CallInst *MatMul) {
+ IRBuilder<> Builder(MatMul);
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
+ const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
assert(Lhs.getElementType() == Rhs.getElementType() &&
"Matrix multiply argument element types do not match.");
-
- const unsigned R = LShape.NumRows;
- const unsigned C = RShape.NumColumns;
- assert(LShape.NumColumns == RShape.NumRows);
-
- // Initialize the output
- MatrixTy Result(R, C, EltType);
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ assert(LShape.NumColumns == RShape.NumRows);
+
+ // Initialize the output
+ MatrixTy Result(R, C, EltType);
assert(Lhs.getElementType() == Result.getElementType() &&
"Matrix multiply result element type does not match arguments.");
-
- bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
- MatMul->hasAllowContract());
- emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
- finalizeLowering(MatMul, Result, Builder);
- }
-
- /// Lowers llvm.matrix.transpose.
- void LowerTranspose(CallInst *Inst) {
- MatrixTy Result;
- IRBuilder<> Builder(Inst);
- Value *InputVal = Inst->getArgOperand(0);
- VectorType *VectorTy = cast<VectorType>(InputVal->getType());
- ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
- MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
-
- const unsigned NewNumVecs =
- InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns;
- const unsigned NewNumElts =
- InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows;
-
- for (unsigned I = 0; I < NewNumVecs; ++I) {
- // Build a single result vector. First initialize it.
- Value *ResultVector = UndefValue::get(
- FixedVectorType::get(VectorTy->getElementType(), NewNumElts));
-      // Go through the old elements and insert them into the resulting vector.
- for (auto J : enumerate(InputMatrix.vectors())) {
- Value *Elt = Builder.CreateExtractElement(J.value(), I);
- // Row and column indices are transposed.
- ResultVector =
- Builder.CreateInsertElement(ResultVector, Elt, J.index());
- }
- Result.addVector(ResultVector);
- }
-
- // TODO: Improve estimate of operations needed for transposes. Currently we
- // just count the insertelement/extractelement instructions, but do not
- // account for later simplifications/combines.
- finalizeLowering(
- Inst,
- Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns),
- Builder);
- }
-
- /// Lower load instructions, if shape information is available.
- bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
- auto I = ShapeMap.find(Inst);
- if (I == ShapeMap.end())
- return false;
-
- LowerLoad(Inst, Ptr, Inst->getAlign(),
- Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
- I->second);
- return true;
- }
-
- bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
- IRBuilder<> &Builder) {
- auto I = ShapeMap.find(StoredVal);
- if (I == ShapeMap.end())
- return false;
-
- LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
- Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
- I->second);
- return true;
- }
-
- /// Lower binary operators, if shape information is available.
- bool VisitBinaryOperator(BinaryOperator *Inst) {
- auto I = ShapeMap.find(Inst);
- if (I == ShapeMap.end())
- return false;
-
- Value *Lhs = Inst->getOperand(0);
- Value *Rhs = Inst->getOperand(1);
-
- IRBuilder<> Builder(Inst);
- ShapeInfo &Shape = I->second;
-
- MatrixTy Result;
- MatrixTy A = getMatrix(Lhs, Shape, Builder);
- MatrixTy B = getMatrix(Rhs, Shape, Builder);
- assert(A.isColumnMajor() == B.isColumnMajor() &&
- Result.isColumnMajor() == A.isColumnMajor() &&
- "operands must agree on matrix layout");
-
- // Helper to perform binary op on vectors.
- auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) {
- switch (Inst->getOpcode()) {
- case Instruction::Add:
- return Builder.CreateAdd(LHS, RHS);
- case Instruction::Mul:
- return Builder.CreateMul(LHS, RHS);
- case Instruction::Sub:
- return Builder.CreateSub(LHS, RHS);
- case Instruction::FAdd:
- return Builder.CreateFAdd(LHS, RHS);
- case Instruction::FMul:
- return Builder.CreateFMul(LHS, RHS);
- case Instruction::FSub:
- return Builder.CreateFSub(LHS, RHS);
- default:
- llvm_unreachable("Unsupported binary operator for matrix");
- }
- };
-
- for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
- Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I)));
-
- finalizeLowering(Inst,
- Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
- Result.getNumVectors()),
- Builder);
- return true;
- }
-
+
+ bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+ MatMul->hasAllowContract());
+ emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
+ finalizeLowering(MatMul, Result, Builder);
+ }
+
+ /// Lowers llvm.matrix.transpose.
+ void LowerTranspose(CallInst *Inst) {
+ MatrixTy Result;
+ IRBuilder<> Builder(Inst);
+ Value *InputVal = Inst->getArgOperand(0);
+ VectorType *VectorTy = cast<VectorType>(InputVal->getType());
+ ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
+ MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
+
+ const unsigned NewNumVecs =
+ InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns;
+ const unsigned NewNumElts =
+ InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows;
+
+ for (unsigned I = 0; I < NewNumVecs; ++I) {
+ // Build a single result vector. First initialize it.
+ Value *ResultVector = UndefValue::get(
+ FixedVectorType::get(VectorTy->getElementType(), NewNumElts));
+      // Go through the old elements and insert them into the resulting vector.
+ for (auto J : enumerate(InputMatrix.vectors())) {
+ Value *Elt = Builder.CreateExtractElement(J.value(), I);
+ // Row and column indices are transposed.
+ ResultVector =
+ Builder.CreateInsertElement(ResultVector, Elt, J.index());
+ }
+ Result.addVector(ResultVector);
+ }
+
+ // TODO: Improve estimate of operations needed for transposes. Currently we
+ // just count the insertelement/extractelement instructions, but do not
+ // account for later simplifications/combines.
+ finalizeLowering(
+ Inst,
+ Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns),
+ Builder);
+ }
+
+ /// Lower load instructions, if shape information is available.
+ bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerLoad(Inst, Ptr, Inst->getAlign(),
+ Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
+ I->second);
+ return true;
+ }
+
+ bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
+ IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(StoredVal);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
+ Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
+ I->second);
+ return true;
+ }
+
+ /// Lower binary operators, if shape information is available.
+ bool VisitBinaryOperator(BinaryOperator *Inst) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ Value *Lhs = Inst->getOperand(0);
+ Value *Rhs = Inst->getOperand(1);
+
+ IRBuilder<> Builder(Inst);
+ ShapeInfo &Shape = I->second;
+
+ MatrixTy Result;
+ MatrixTy A = getMatrix(Lhs, Shape, Builder);
+ MatrixTy B = getMatrix(Rhs, Shape, Builder);
+ assert(A.isColumnMajor() == B.isColumnMajor() &&
+ Result.isColumnMajor() == A.isColumnMajor() &&
+ "operands must agree on matrix layout");
+
+ // Helper to perform binary op on vectors.
+ auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) {
+ switch (Inst->getOpcode()) {
+ case Instruction::Add:
+ return Builder.CreateAdd(LHS, RHS);
+ case Instruction::Mul:
+ return Builder.CreateMul(LHS, RHS);
+ case Instruction::Sub:
+ return Builder.CreateSub(LHS, RHS);
+ case Instruction::FAdd:
+ return Builder.CreateFAdd(LHS, RHS);
+ case Instruction::FMul:
+ return Builder.CreateFMul(LHS, RHS);
+ case Instruction::FSub:
+ return Builder.CreateFSub(LHS, RHS);
+ default:
+ llvm_unreachable("Unsupported binary operator for matrix");
+ }
+ };
+
+ for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
+ Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I)));
+
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors()),
+ Builder);
+ return true;
+ }
+
/// Lower unary operators, if shape information is available.
bool VisitUnaryOperator(UnaryOperator *Inst) {
auto I = ShapeMap.find(Inst);
@@ -1534,449 +1534,449 @@ public:
return true;
}
- /// Helper to linearize a matrix expression tree into a string. Currently
-  /// matrix expressions are linearized by starting at an expression leaf and
- /// linearizing bottom up.
- struct ExprLinearizer {
- unsigned LengthToBreak = 100;
- std::string Str;
- raw_string_ostream Stream;
- unsigned LineLength = 0;
- const DataLayout &DL;
-
- /// Mapping from instructions to matrixes. It is used to identify
- /// matrix instructions.
- const MapVector<Value *, MatrixTy> &Inst2Matrix;
-
- /// Mapping from values to the leaves of all expressions that the value is
- /// part of.
- const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
-
- /// Set of matrix expressions in the scope of a given DISubprogram.
- const SmallSetVector<Value *, 32> &ExprsInSubprogram;
-
- /// Leaf node of the expression to linearize.
- Value *Leaf;
-
- /// Used to keep track of sub-expressions that get reused while linearizing
- /// the expression. Re-used sub-expressions are marked as (reused).
- SmallPtrSet<Value *, 8> ReusedExprs;
-
- ExprLinearizer(const DataLayout &DL,
- const MapVector<Value *, MatrixTy> &Inst2Matrix,
- const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- Value *Leaf)
- : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
- ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
-
- void indent(unsigned N) {
- LineLength += N;
- for (unsigned i = 0; i < N; i++)
- Stream << " ";
- }
-
- void lineBreak() {
- Stream << "\n";
- LineLength = 0;
- }
-
- void maybeIndent(unsigned Indent) {
- if (LineLength >= LengthToBreak)
- lineBreak();
-
- if (LineLength == 0)
- indent(Indent);
- }
-
- void write(StringRef S) {
- LineLength += S.size();
- Stream << S;
- }
-
- Value *getUnderlyingObjectThroughLoads(Value *V) {
- if (Value *Ptr = getPointerOperand(V))
- return getUnderlyingObjectThroughLoads(Ptr);
- else if (V->getType()->isPointerTy())
+ /// Helper to linearize a matrix expression tree into a string. Currently
+  /// matrix expressions are linearized by starting at an expression leaf and
+ /// linearizing bottom up.
+ struct ExprLinearizer {
+ unsigned LengthToBreak = 100;
+ std::string Str;
+ raw_string_ostream Stream;
+ unsigned LineLength = 0;
+ const DataLayout &DL;
+
+ /// Mapping from instructions to matrixes. It is used to identify
+ /// matrix instructions.
+ const MapVector<Value *, MatrixTy> &Inst2Matrix;
+
+ /// Mapping from values to the leaves of all expressions that the value is
+ /// part of.
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
+
+ /// Set of matrix expressions in the scope of a given DISubprogram.
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram;
+
+ /// Leaf node of the expression to linearize.
+ Value *Leaf;
+
+ /// Used to keep track of sub-expressions that get reused while linearizing
+ /// the expression. Re-used sub-expressions are marked as (reused).
+ SmallPtrSet<Value *, 8> ReusedExprs;
+
+ ExprLinearizer(const DataLayout &DL,
+ const MapVector<Value *, MatrixTy> &Inst2Matrix,
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ Value *Leaf)
+ : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
+ ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
+
+ void indent(unsigned N) {
+ LineLength += N;
+ for (unsigned i = 0; i < N; i++)
+ Stream << " ";
+ }
+
+ void lineBreak() {
+ Stream << "\n";
+ LineLength = 0;
+ }
+
+ void maybeIndent(unsigned Indent) {
+ if (LineLength >= LengthToBreak)
+ lineBreak();
+
+ if (LineLength == 0)
+ indent(Indent);
+ }
+
+ void write(StringRef S) {
+ LineLength += S.size();
+ Stream << S;
+ }
+
+ Value *getUnderlyingObjectThroughLoads(Value *V) {
+ if (Value *Ptr = getPointerOperand(V))
+ return getUnderlyingObjectThroughLoads(Ptr);
+ else if (V->getType()->isPointerTy())
return getUnderlyingObject(V);
- return V;
- }
-
- /// Returns true if \p V is a matrix value in the given subprogram.
- bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
-
-    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
- /// \p SS.
- void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
- auto M = Inst2Matrix.find(V);
- if (M == Inst2Matrix.end())
- SS << "unknown";
- else {
- SS << M->second.getNumRows();
- SS << "x";
- SS << M->second.getNumColumns();
- }
- }
-
- /// Write the called function name. Handles calls to llvm.matrix.*
- /// specially: we write the name, followed by the dimensions of the input
- /// matrixes, followed by the scalar type name.
- void writeFnName(CallInst *CI) {
- if (!CI->getCalledFunction())
- write("<no called fn>");
- else {
- StringRef Name = CI->getCalledFunction()->getName();
- if (!Name.startswith("llvm.matrix")) {
- write(Name);
- return;
- }
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
- write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
- .drop_front(StringRef("llvm.matrix.").size()));
- write(".");
+ return V;
+ }
+
+ /// Returns true if \p V is a matrix value in the given subprogram.
+ bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
+
+    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
+ /// \p SS.
+ void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
+ auto M = Inst2Matrix.find(V);
+ if (M == Inst2Matrix.end())
+ SS << "unknown";
+ else {
+ SS << M->second.getNumRows();
+ SS << "x";
+ SS << M->second.getNumColumns();
+ }
+ }
+
+ /// Write the called function name. Handles calls to llvm.matrix.*
+ /// specially: we write the name, followed by the dimensions of the input
+ /// matrixes, followed by the scalar type name.
+ void writeFnName(CallInst *CI) {
+ if (!CI->getCalledFunction())
+ write("<no called fn>");
+ else {
+ StringRef Name = CI->getCalledFunction()->getName();
+ if (!Name.startswith("llvm.matrix")) {
+ write(Name);
+ return;
+ }
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
+ .drop_front(StringRef("llvm.matrix.").size()));
+ write(".");
std::string Tmp;
- raw_string_ostream SS(Tmp);
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- prettyPrintMatrixType(II->getOperand(0), SS);
- SS << ".";
- prettyPrintMatrixType(II->getOperand(1), SS);
- SS << "." << *II->getType()->getScalarType();
- break;
- case Intrinsic::matrix_transpose:
- prettyPrintMatrixType(II->getOperand(0), SS);
- SS << "." << *II->getType()->getScalarType();
- break;
- case Intrinsic::matrix_column_major_load:
- prettyPrintMatrixType(II, SS);
- SS << "." << *II->getType()->getScalarType();
- break;
- case Intrinsic::matrix_column_major_store:
- prettyPrintMatrixType(II->getOperand(0), SS);
- SS << "." << *II->getOperand(0)->getType()->getScalarType();
- break;
- default:
- llvm_unreachable("Unhandled case");
- }
- SS.flush();
- write(Tmp);
- }
- }
-
- unsigned getNumShapeArgs(CallInst *CI) const {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- return 3;
- case Intrinsic::matrix_transpose:
- return 2;
- case Intrinsic::matrix_column_major_load:
- case Intrinsic::matrix_column_major_store:
- return 3;
- default:
- return 0;
- }
- }
- return 0;
- }
-
-    /// Special printing for values: for pointers, we print whether they refer
-    /// to an (function) external address or a stack address; for other values
-    /// we either print the constant or "scalar"/"matrix".
- void write(Value *V) {
- V = getUnderlyingObjectThroughLoads(V);
- if (V->getType()->isPointerTy()) {
- if (isa<AllocaInst>(V)) {
- Stream << "stack addr";
- LineLength += StringRef("stack addr").size();
- } else {
- Stream << "addr";
- LineLength += StringRef("addr").size();
- }
- if (!V->getName().empty()) {
- Stream << " %" << V->getName() << "";
- LineLength += V->getName().size() + 2;
- }
- return;
- }
-
- std::string Tmp;
- raw_string_ostream TmpStream(Tmp);
-
- if (auto *CI = dyn_cast<ConstantInt>(V))
- TmpStream << CI->getValue();
- else if (isa<Constant>(V))
- TmpStream << "constant";
- else {
- if (isMatrix(V))
- TmpStream << "matrix";
- else
- TmpStream << "scalar";
- }
- TmpStream.flush();
- Tmp = std::string(StringRef(Tmp).trim());
- LineLength += Tmp.size();
- Stream << Tmp;
- }
-
- /// Linearize expression \p Expr starting at an indentation of \p Indent.
- /// Expressions that are re-used multiple times are prefixed with (reused)
- /// at the re-used root instruction.
- void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
- bool ParentShared) {
- auto *I = cast<Instruction>(Expr);
- maybeIndent(Indent);
- SmallVector<Value *, 8> Ops;
-
- // Is Expr shared with other expression leaves?
- bool ExprShared = false;
-
- // Deal with shared subtrees. Mark them as shared, if required.
- if (!ParentShared) {
- auto SI = Shared.find(Expr);
- assert(SI != Shared.end() && SI->second.count(Leaf));
-
- for (Value *S : SI->second) {
- if (S == Leaf)
- continue;
- DebugLoc DL = cast<Instruction>(S)->getDebugLoc();
- write("shared with remark at line " + std::to_string(DL.getLine()) +
- " column " + std::to_string(DL.getCol()) + " (");
- }
- ExprShared = SI->second.size() > 1;
- }
-
- bool Reused = !ReusedExprs.insert(Expr).second;
- if (Reused && !ParentReused)
- write("(reused) ");
-
- if (auto *CI = dyn_cast<CallInst>(I)) {
- writeFnName(CI);
-
- Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
- } else if (isa<BitCastInst>(Expr)) {
- // Special case bitcasts, which are used to materialize matrixes from
- // non-matrix ops.
- write("matrix");
- return;
- } else {
- Ops.append(I->value_op_begin(), I->value_op_end());
- write(std::string(I->getOpcodeName()));
- }
-
- write(std::string("("));
-
- unsigned NumOpsToBreak = 1;
- if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
- NumOpsToBreak = 2;
-
- for (Value *Op : Ops) {
- if (Ops.size() > NumOpsToBreak)
- lineBreak();
-
- maybeIndent(Indent + 1);
- if (isMatrix(Op))
- linearizeExpr(Op, Indent + 1, Reused, ExprShared);
- else
- write(Op);
- if (Op != Ops.back())
- write(", ");
- }
-
- write(")");
- }
-
- const std::string &getResult() {
- Stream.flush();
- return Str;
- }
- };
-
- /// Generate remarks for matrix operations in a function. To generate remarks
- /// for matrix expressions, the following approach is used:
- /// 1. Use the inlined-at debug information to group matrix operations to the
- /// DISubprograms they are contained in.
- /// 2. Collect leaves of matrix expressions (done in
- /// RemarkGenerator::getExpressionLeaves) for each subprogram - expression
-  ///    mapping. Leaves are lowered matrix instructions without other matrix
-  ///    users (like stores) in the current subprogram.
-  /// 3. For each leaf, create a remark containing a linearized version of the
- /// matrix expression. The expression is linearized by a recursive
- /// bottom-up traversal of the matrix operands, starting at a leaf. Note
- /// that multiple leaves can share sub-expressions. Shared subexpressions
- /// are explicitly marked as shared().
- struct RemarkGenerator {
- const MapVector<Value *, MatrixTy> &Inst2Matrix;
- OptimizationRemarkEmitter &ORE;
- Function &Func;
- const DataLayout &DL;
-
- RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,
- OptimizationRemarkEmitter &ORE, Function &Func)
- : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),
- DL(Func.getParent()->getDataLayout()) {}
-
- /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
- /// instructions in Inst2Matrix returning void or without any users in
- /// \p ExprsInSubprogram. Currently that should only include stores.
- SmallVector<Value *, 4>
- getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
- SmallVector<Value *, 4> Leaves;
- for (auto *Expr : ExprsInSubprogram)
- if (Expr->getType()->isVoidTy() ||
- !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
- return ExprsInSubprogram.count(U);
- }))
- Leaves.push_back(Expr);
- return Leaves;
- }
-
- /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
- /// to all visited expressions in \p Shared. Limit the matrix operations to
- /// the ones in \p ExprsInSubprogram.
- void collectSharedInfo(Value *Leaf, Value *V,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
-
- if (!ExprsInSubprogram.count(V))
- return;
-
- auto I = Shared.insert({V, {}});
- I.first->second.insert(Leaf);
-
- for (Value *Op : cast<Instruction>(V)->operand_values())
- collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
- }
-
- /// Calculate the number of exclusive and shared op counts for expression
- /// starting at \p V. Expressions used multiple times are counted once.
- /// Limit the matrix operations to the ones in \p ExprsInSubprogram.
- std::pair<OpInfoTy, OpInfoTy>
- sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
- if (!ExprsInSubprogram.count(Root))
- return {};
-
- // Already counted this expression. Stop.
- if (!ReusedExprs.insert(Root).second)
- return {};
-
- OpInfoTy SharedCount;
- OpInfoTy Count;
-
- auto I = Shared.find(Root);
- auto CM = Inst2Matrix.find(Root);
- if (I->second.size() == 1)
- Count = CM->second.getOpInfo();
- else
- SharedCount = CM->second.getOpInfo();
-
- for (Value *Op : cast<Instruction>(Root)->operand_values()) {
- auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
- Count += C.first;
- SharedCount += C.second;
- }
- return {Count, SharedCount};
- }
-
- void emitRemarks() {
- if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
- return;
-
-      // Map matrix operations to their containing subprograms, by traversing
- // the inlinedAt chain. If the function does not have a DISubprogram, we
- // only map them to the containing function.
- MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
- for (auto &KV : Inst2Matrix) {
- if (Func.getSubprogram()) {
- auto *I = cast<Instruction>(KV.first);
- DILocation *Context = I->getDebugLoc();
- while (Context) {
- auto I =
- Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
- I.first->second.push_back(KV.first);
- Context = DebugLoc(Context).getInlinedAt();
- }
- } else {
- auto I = Subprog2Exprs.insert({nullptr, {}});
- I.first->second.push_back(KV.first);
- }
- }
- for (auto &KV : Subprog2Exprs) {
- SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
- KV.second.end());
- auto Leaves = getExpressionLeaves(ExprsInSubprogram);
-
- DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
- for (Value *Leaf : Leaves)
- collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);
-
- // Generate remarks for each leaf.
- for (auto *L : Leaves) {
-
- DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
- DILocation *Context = cast<Instruction>(L)->getDebugLoc();
- while (Context) {
- if (getSubprogram(Context->getScope()) == KV.first) {
- Loc = Context;
- break;
- }
- Context = DebugLoc(Context).getInlinedAt();
- }
-
- SmallPtrSet<Value *, 8> ReusedExprs;
- OpInfoTy Counts, SharedCounts;
- std::tie(Counts, SharedCounts) =
- sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);
-
- OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
- cast<Instruction>(L)->getParent());
-
- Rem << "Lowered with ";
- Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
- << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
- << ore::NV("NumComputeOps", Counts.NumComputeOps)
- << " compute ops";
-
- if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
- SharedCounts.NumComputeOps > 0) {
- Rem << ",\nadditionally "
- << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
- << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
- << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
- << " compute ops"
- << " are shared with other expressions";
- }
-
- Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
- ORE.emit(Rem);
- }
- }
- }
-
- std::string
- linearize(Value *L,
- const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- const DataLayout &DL) {
- ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
- Lin.linearizeExpr(L, 0, false, false);
- return Lin.getResult();
- }
- };
-};
-} // namespace
-
-PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ raw_string_ostream SS(Tmp);
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << ".";
+ prettyPrintMatrixType(II->getOperand(1), SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_transpose:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_column_major_load:
+ prettyPrintMatrixType(II, SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_column_major_store:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << "." << *II->getOperand(0)->getType()->getScalarType();
+ break;
+ default:
+ llvm_unreachable("Unhandled case");
+ }
+ SS.flush();
+ write(Tmp);
+ }
+ }
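+ // Illustrative sketch (shapes made up, not taken from the source): for a
+ // multiply of a 2x6 by a 6x2 matrix of doubles, the suffix built above is
+ // "2x6.6x2.double", yielding a printed name along the lines of
+ // multiply.2x6.6x2.double.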
+
+ unsigned getNumShapeArgs(CallInst *CI) const {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ return 3;
+ case Intrinsic::matrix_transpose:
+ return 2;
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
+ return 3;
+ default:
+ return 0;
+ }
+ }
+ return 0;
+ }
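+ // For example, llvm.matrix.multiply takes its two operands followed by
+ // three i32 shape arguments (rows, inner dimension, columns), and
+ // llvm.matrix.transpose takes two (rows, columns); these trailing shape
+ // arguments are dropped when the call's operands are linearized below.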
+
+ /// Special printing for values: for pointers, we print whether they refer
+ /// to a function-external address or a stack address; for other values we
+ /// print either the constant or "scalar"/"matrix".
+ void write(Value *V) {
+ V = getUnderlyingObjectThroughLoads(V);
+ if (V->getType()->isPointerTy()) {
+ if (isa<AllocaInst>(V)) {
+ Stream << "stack addr";
+ LineLength += StringRef("stack addr").size();
+ } else {
+ Stream << "addr";
+ LineLength += StringRef("addr").size();
+ }
+ if (!V->getName().empty()) {
+ Stream << " %" << V->getName() << "";
+ LineLength += V->getName().size() + 2;
+ }
+ return;
+ }
+
+ std::string Tmp;
+ raw_string_ostream TmpStream(Tmp);
+
+ if (auto *CI = dyn_cast<ConstantInt>(V))
+ TmpStream << CI->getValue();
+ else if (isa<Constant>(V))
+ TmpStream << "constant";
+ else {
+ if (isMatrix(V))
+ TmpStream << "matrix";
+ else
+ TmpStream << "scalar";
+ }
+ TmpStream.flush();
+ Tmp = std::string(StringRef(Tmp).trim());
+ LineLength += Tmp.size();
+ Stream << Tmp;
+ }
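+ // Hypothetical outputs of the cases above: a named alloca %A prints as
+ // "stack addr %A", any other pointer as "addr %p", a ConstantInt as its
+ // value (e.g. "8"), other constants as "constant", and remaining values as
+ // "matrix" or "scalar" depending on isMatrix().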
+
+ /// Linearize expression \p Expr starting at an indentation of \p Indent.
+ /// Expressions that are re-used multiple times are prefixed with (reused)
+ /// at the re-used root instruction.
+ void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
+ bool ParentShared) {
+ auto *I = cast<Instruction>(Expr);
+ maybeIndent(Indent);
+ SmallVector<Value *, 8> Ops;
+
+ // Is Expr shared with other expression leaves?
+ bool ExprShared = false;
+
+ // Deal with shared subtrees. Mark them as shared, if required.
+ if (!ParentShared) {
+ auto SI = Shared.find(Expr);
+ assert(SI != Shared.end() && SI->second.count(Leaf));
+
+ for (Value *S : SI->second) {
+ if (S == Leaf)
+ continue;
+ DebugLoc DL = cast<Instruction>(S)->getDebugLoc();
+ write("shared with remark at line " + std::to_string(DL.getLine()) +
+ " column " + std::to_string(DL.getCol()) + " (");
+ }
+ ExprShared = SI->second.size() > 1;
+ }
+
+ bool Reused = !ReusedExprs.insert(Expr).second;
+ if (Reused && !ParentReused)
+ write("(reused) ");
+
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ writeFnName(CI);
+
+ Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
+ } else if (isa<BitCastInst>(Expr)) {
+ // Special case bitcasts, which are used to materialize matrices from
+ // non-matrix ops.
+ write("matrix");
+ return;
+ } else {
+ Ops.append(I->value_op_begin(), I->value_op_end());
+ write(std::string(I->getOpcodeName()));
+ }
+
+ write(std::string("("));
+
+ unsigned NumOpsToBreak = 1;
+ if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
+ NumOpsToBreak = 2;
+
+ for (Value *Op : Ops) {
+ if (Ops.size() > NumOpsToBreak)
+ lineBreak();
+
+ maybeIndent(Indent + 1);
+ if (isMatrix(Op))
+ linearizeExpr(Op, Indent + 1, Reused, ExprShared);
+ else
+ write(Op);
+ if (Op != Ops.back())
+ write(", ");
+ }
+
+ write(")");
+ }
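+ // An illustrative rendering (shapes and value names are made up): a leaf
+ // store of a 2x6 * 6x2 double multiply might linearize to
+ //   store(
+ //    multiply.2x6.6x2.double(
+ //     load(addr %A),
+ //     load(addr %B)),
+ //    addr %C)
+ // with "(reused)" and "shared with remark at line ... column ..." markers
+ // prepended where sub-expressions are re-used or shared between leaves.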
+
+ const std::string &getResult() {
+ Stream.flush();
+ return Str;
+ }
+ };
+
+ /// Generate remarks for matrix operations in a function. To generate remarks
+ /// for matrix expressions, the following approach is used:
+ /// 1. Use the inlined-at debug information to group matrix operations to the
+ /// DISubprograms they are contained in.
+ /// 2. Collect leaves of matrix expressions (done in
+ /// RemarkGenerator::getExpressionLeaves) for each subprogram-to-expressions
+ /// mapping. Leaves are lowered matrix instructions without other matrix
+ /// users (like stores) in the current subprogram.
+ /// 3. For each leaf, create a remark containing a linearized version of the
+ /// matrix expression. The expression is linearized by a recursive
+ /// bottom-up traversal of the matrix operands, starting at a leaf. Note
+ /// that multiple leaves can share sub-expressions. Shared subexpressions
+ /// are explicitly marked as shared().
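+ /// For illustration (not taken from the source), an emitted remark looks
+ /// roughly like
+ ///   remark: matmul.cpp:35:42: Lowered with 6 stores, 6 loads, 24 compute ops
+ /// followed by the linearized expression; counts for sub-expressions shared
+ /// with other leaves are reported separately in an "additionally ... are
+ /// shared with other expressions" suffix.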
+ struct RemarkGenerator {
+ const MapVector<Value *, MatrixTy> &Inst2Matrix;
+ OptimizationRemarkEmitter &ORE;
+ Function &Func;
+ const DataLayout &DL;
+
+ RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,
+ OptimizationRemarkEmitter &ORE, Function &Func)
+ : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),
+ DL(Func.getParent()->getDataLayout()) {}
+
+ /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
+ /// instructions in Inst2Matrix returning void or without any users in
+ /// \p ExprsInSubprogram. Currently that should only include stores.
+ SmallVector<Value *, 4>
+ getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
+ SmallVector<Value *, 4> Leaves;
+ for (auto *Expr : ExprsInSubprogram)
+ if (Expr->getType()->isVoidTy() ||
+ !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
+ return ExprsInSubprogram.count(U);
+ }))
+ Leaves.push_back(Expr);
+ return Leaves;
+ }
+
+ /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
+ /// to all visited expressions in \p Shared. Limit the matrix operations to
+ /// the ones in \p ExprsInSubprogram.
+ void collectSharedInfo(Value *Leaf, Value *V,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
+
+ if (!ExprsInSubprogram.count(V))
+ return;
+
+ auto I = Shared.insert({V, {}});
+ I.first->second.insert(Leaf);
+
+ for (Value *Op : cast<Instruction>(V)->operand_values())
+ collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
+ }
+
+ /// Calculate the number of exclusive and shared op counts for expression
+ /// starting at \p Root. Expressions used multiple times are counted once.
+ /// Limit the matrix operations to the ones in \p ExprsInSubprogram.
+ std::pair<OpInfoTy, OpInfoTy>
+ sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
+ if (!ExprsInSubprogram.count(Root))
+ return {};
+
+ // Already counted this expression. Stop.
+ if (!ReusedExprs.insert(Root).second)
+ return {};
+
+ OpInfoTy SharedCount;
+ OpInfoTy Count;
+
+ auto I = Shared.find(Root);
+ auto CM = Inst2Matrix.find(Root);
+ if (I->second.size() == 1)
+ Count = CM->second.getOpInfo();
+ else
+ SharedCount = CM->second.getOpInfo();
+
+ for (Value *Op : cast<Instruction>(Root)->operand_values()) {
+ auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
+ Count += C.first;
+ SharedCount += C.second;
+ }
+ return {Count, SharedCount};
+ }
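+ // Worked example (hypothetical): a transpose whose result feeds multiplies
+ // belonging to two different leaves has two entries in Shared, so its op
+ // counts are attributed to SharedCount (reported as shared with other
+ // expressions) rather than to the exclusive Count of either leaf.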
+
+ void emitRemarks() {
+ if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
+ return;
+
+ // Map matrix operations to their containing subprograms, by traversing
+ // the inlinedAt chain. If the function does not have a DISubprogram, we
+ // only map them to the containing function.
+ MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
+ for (auto &KV : Inst2Matrix) {
+ if (Func.getSubprogram()) {
+ auto *I = cast<Instruction>(KV.first);
+ DILocation *Context = I->getDebugLoc();
+ while (Context) {
+ auto I =
+ Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
+ I.first->second.push_back(KV.first);
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+ } else {
+ auto I = Subprog2Exprs.insert({nullptr, {}});
+ I.first->second.push_back(KV.first);
+ }
+ }
+ for (auto &KV : Subprog2Exprs) {
+ SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
+ KV.second.end());
+ auto Leaves = getExpressionLeaves(ExprsInSubprogram);
+
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
+ for (Value *Leaf : Leaves)
+ collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);
+
+ // Generate remarks for each leaf.
+ for (auto *L : Leaves) {
+
+ DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
+ DILocation *Context = cast<Instruction>(L)->getDebugLoc();
+ while (Context) {
+ if (getSubprogram(Context->getScope()) == KV.first) {
+ Loc = Context;
+ break;
+ }
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+
+ SmallPtrSet<Value *, 8> ReusedExprs;
+ OpInfoTy Counts, SharedCounts;
+ std::tie(Counts, SharedCounts) =
+ sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);
+
+ OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
+ cast<Instruction>(L)->getParent());
+
+ Rem << "Lowered with ";
+ Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
+ << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
+ << ore::NV("NumComputeOps", Counts.NumComputeOps)
+ << " compute ops";
+
+ if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
+ SharedCounts.NumComputeOps > 0) {
+ Rem << ",\nadditionally "
+ << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
+ << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
+ << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
+ << " compute ops"
+ << " are shared with other expressions";
+ }
+
+ Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
+ ORE.emit(Rem);
+ }
+ }
+ }
+
+ std::string
+ linearize(Value *L,
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ const DataLayout &DL) {
+ ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
+ Lin.linearizeExpr(L, 0, false, false);
+ return Lin.getResult();
+ }
+ };
+};
+} // namespace
+
+PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
OptimizationRemarkEmitter *ORE = nullptr;
AAResults *AA = nullptr;
DominatorTree *DT = nullptr;
LoopInfo *LI = nullptr;
-
+
if (!Minimal) {
ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
AA = &AM.getResult<AAManager>(F);
@@ -1984,66 +1984,66 @@ PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
LI = &AM.getResult<LoopAnalysis>(F);
}
- LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
- if (LMT.Visit()) {
- PreservedAnalyses PA;
+ LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
+ if (LMT.Visit()) {
+ PreservedAnalyses PA;
if (!Minimal) {
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
}
- return PA;
- }
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
- initializeLowerMatrixIntrinsicsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
+ initializeLowerMatrixIntrinsicsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
LowerMatrixIntrinsics LMT(F, TTI, &AA, &DT, &LI, &ORE);
- bool C = LMT.Visit();
- return C;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- }
-};
-} // namespace
-
-static const char pass_name[] = "Lower the matrix intrinsics";
-char LowerMatrixIntrinsicsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
- false, false)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
- false, false)
-
-Pass *llvm::createLowerMatrixIntrinsicsPass() {
- return new LowerMatrixIntrinsicsLegacyPass();
-}
+ bool C = LMT.Visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+};
+} // namespace
+
+static const char pass_name[] = "Lower the matrix intrinsics";
+char LowerMatrixIntrinsicsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
+ false, false)
+
+Pass *llvm::createLowerMatrixIntrinsicsPass() {
+ return new LowerMatrixIntrinsicsLegacyPass();
+}
namespace {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp
index c1cc1c28b9..73b2cd06fa 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp
@@ -1,86 +1,86 @@
-//===- LowerWidenableCondition.cpp - Lower the guard intrinsic ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the llvm.widenable.condition intrinsic to default value
-// which is i1 true.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerWidenableCondition.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-
-using namespace llvm;
-
-namespace {
-struct LowerWidenableConditionLegacyPass : public FunctionPass {
- static char ID;
- LowerWidenableConditionLegacyPass() : FunctionPass(ID) {
- initializeLowerWidenableConditionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-}
-
-static bool lowerWidenableCondition(Function &F) {
- // Check if we can cheaply rule out the possibility of not having any work to
- // do.
- auto *WCDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_widenable_condition));
- if (!WCDecl || WCDecl->use_empty())
- return false;
-
- using namespace llvm::PatternMatch;
- SmallVector<CallInst *, 8> ToLower;
- for (auto &I : instructions(F))
- if (match(&I, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
- ToLower.push_back(cast<CallInst>(&I));
-
- if (ToLower.empty())
- return false;
-
- for (auto *CI : ToLower) {
- CI->replaceAllUsesWith(ConstantInt::getTrue(CI->getContext()));
- CI->eraseFromParent();
- }
- return true;
-}
-
-bool LowerWidenableConditionLegacyPass::runOnFunction(Function &F) {
- return lowerWidenableCondition(F);
-}
-
-char LowerWidenableConditionLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerWidenableConditionLegacyPass, "lower-widenable-condition",
- "Lower the widenable condition to default true value", false,
- false)
-
-Pass *llvm::createLowerWidenableConditionPass() {
- return new LowerWidenableConditionLegacyPass();
-}
-
-PreservedAnalyses LowerWidenableConditionPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (lowerWidenableCondition(F))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
+//===- LowerWidenableCondition.cpp - Lower widenable conditions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the llvm.widenable.condition intrinsic to its default
+// value, which is i1 true.
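+//
+// For example (an illustrative sketch, not taken from a test):
+//
+//   %wc = call i1 @llvm.experimental.widenable.condition()
+//   br i1 %wc, label %guarded, label %deopt
+//
+// becomes, once all uses of %wc are replaced with the default value,
+//
+//   br i1 true, label %guarded, label %deopt
+//
+// and later simplification folds away the now-trivial branch.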
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerWidenableCondition.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct LowerWidenableConditionLegacyPass : public FunctionPass {
+ static char ID;
+ LowerWidenableConditionLegacyPass() : FunctionPass(ID) {
+ initializeLowerWidenableConditionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static bool lowerWidenableCondition(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *WCDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+ if (!WCDecl || WCDecl->use_empty())
+ return false;
+
+ using namespace llvm::PatternMatch;
+ SmallVector<CallInst *, 8> ToLower;
+ for (auto &I : instructions(F))
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
+ ToLower.push_back(cast<CallInst>(&I));
+
+ if (ToLower.empty())
+ return false;
+
+ for (auto *CI : ToLower) {
+ CI->replaceAllUsesWith(ConstantInt::getTrue(CI->getContext()));
+ CI->eraseFromParent();
+ }
+ return true;
+}
+
+bool LowerWidenableConditionLegacyPass::runOnFunction(Function &F) {
+ return lowerWidenableCondition(F);
+}
+
+char LowerWidenableConditionLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerWidenableConditionLegacyPass, "lower-widenable-condition",
+ "Lower the widenable condition to default true value", false,
+ false)
+
+Pass *llvm::createLowerWidenableConditionPass() {
+ return new LowerWidenableConditionLegacyPass();
+}
+
+PreservedAnalyses LowerWidenableConditionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (lowerWidenableCondition(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
index 760d6b198b..5ffae128f5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
@@ -1,108 +1,108 @@
-//===- MakeGuardsExplicit.cpp - Turn guard intrinsics into guard branches -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the @llvm.experimental.guard intrinsic to the new form of
-// guard represented as widenable explicit branch to the deopt block. The
-// difference between this pass and LowerGuardIntrinsic is that after this pass
-// the guard represented as intrinsic:
-//
-// call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ]
-//
-// transforms to a guard represented as widenable explicit branch:
-//
-// %widenable_cond = call i1 @llvm.experimental.widenable.condition()
-// br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt
-//
-// Here:
-// - The semantics of @llvm.experimental.widenable.condition allows to replace
-// %widenable_cond with the construction (%widenable_cond & %any_other_cond)
-// without loss of correctness;
-// - %guarded is the lower part of old guard intrinsic's parent block split by
-// the intrinsic call;
-// - %deopt is a block containing a sole call to @llvm.experimental.deoptimize
-// intrinsic.
-//
-// Therefore, this branch preserves the property of widenability.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-
-using namespace llvm;
-
-namespace {
-struct MakeGuardsExplicitLegacyPass : public FunctionPass {
- static char ID;
- MakeGuardsExplicitLegacyPass() : FunctionPass(ID) {
- initializeMakeGuardsExplicitLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-}
-
-static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) {
- // Replace the guard with an explicit branch (just like in GuardWidening).
- BasicBlock *OriginalBB = Guard->getParent();
- (void)OriginalBB;
- makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true);
- assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold");
-
- Guard->eraseFromParent();
-}
-
-static bool explicifyGuards(Function &F) {
- // Check if we can cheaply rule out the possibility of not having any work to
- // do.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- if (!GuardDecl || GuardDecl->use_empty())
- return false;
-
- SmallVector<CallInst *, 8> GuardIntrinsics;
- for (auto &I : instructions(F))
- if (isGuard(&I))
- GuardIntrinsics.push_back(cast<CallInst>(&I));
-
- if (GuardIntrinsics.empty())
- return false;
-
- auto *DeoptIntrinsic = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
- DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
-
- for (auto *Guard : GuardIntrinsics)
- turnToExplicitForm(Guard, DeoptIntrinsic);
-
- return true;
-}
-
-bool MakeGuardsExplicitLegacyPass::runOnFunction(Function &F) {
- return explicifyGuards(F);
-}
-
-char MakeGuardsExplicitLegacyPass::ID = 0;
-INITIALIZE_PASS(MakeGuardsExplicitLegacyPass, "make-guards-explicit",
- "Lower the guard intrinsic to explicit control flow form",
- false, false)
-
-PreservedAnalyses MakeGuardsExplicitPass::run(Function &F,
- FunctionAnalysisManager &) {
- if (explicifyGuards(F))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
+//===- MakeGuardsExplicit.cpp - Turn guard intrinsics into guard branches -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the @llvm.experimental.guard intrinsic to the new form of
+// guard: a widenable explicit branch to the deopt block. The difference
+// between this pass and LowerGuardIntrinsic is that after this pass the guard
+// represented as the intrinsic:
+//
+// call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ]
+//
+// is transformed into a guard represented as a widenable explicit branch:
+//
+// %widenable_cond = call i1 @llvm.experimental.widenable.condition()
+// br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt
+//
+// Here:
+// - The semantics of @llvm.experimental.widenable.condition allow replacing
+// %widenable_cond with the conjunction (%widenable_cond & %any_other_cond)
+// without loss of correctness;
+// - %guarded is the lower part of old guard intrinsic's parent block split by
+// the intrinsic call;
+// - %deopt is a block containing a sole call to @llvm.experimental.deoptimize
+// intrinsic.
+//
+// Therefore, this branch preserves the property of widenability.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct MakeGuardsExplicitLegacyPass : public FunctionPass {
+ static char ID;
+ MakeGuardsExplicitLegacyPass() : FunctionPass(ID) {
+ initializeMakeGuardsExplicitLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) {
+ // Replace the guard with an explicit branch (just like in GuardWidening).
+ BasicBlock *OriginalBB = Guard->getParent();
+ (void)OriginalBB;
+ makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true);
+ assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold");
+
+ Guard->eraseFromParent();
+}
+
+static bool explicifyGuards(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ SmallVector<CallInst *, 8> GuardIntrinsics;
+ for (auto &I : instructions(F))
+ if (isGuard(&I))
+ GuardIntrinsics.push_back(cast<CallInst>(&I));
+
+ if (GuardIntrinsics.empty())
+ return false;
+
+ auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+ DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+ for (auto *Guard : GuardIntrinsics)
+ turnToExplicitForm(Guard, DeoptIntrinsic);
+
+ return true;
+}
+
+bool MakeGuardsExplicitLegacyPass::runOnFunction(Function &F) {
+ return explicifyGuards(F);
+}
+
+char MakeGuardsExplicitLegacyPass::ID = 0;
+INITIALIZE_PASS(MakeGuardsExplicitLegacyPass, "make-guards-explicit",
+ "Lower the guard intrinsic to explicit control flow form",
+ false, false)
+
+PreservedAnalyses MakeGuardsExplicitPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ if (explicifyGuards(F))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0583e27906..a4e695497f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1,316 +1,316 @@
-//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs various transformations related to eliminating memcpy
-// calls, or transforming sets of stores into memset's.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memsets.
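+//
+// For example (illustrative only), a run of adjacent zero stores such as
+//
+//   for (int i = 0; i != 16; ++i) A[i] = 0;   // unrolled into 16 i8 stores
+//
+// can be collapsed into a single memset(A, 0, 16), and a memcpy reading from
+// a buffer that was itself filled by an earlier memcpy can be redirected to
+// copy from the original source.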
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "memcpyopt"
-
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "memcpyopt"
+
static cl::opt<bool>
EnableMemorySSA("enable-memcpyopt-memoryssa", cl::init(false), cl::Hidden,
cl::desc("Use MemorySSA-backed MemCpyOpt."));
-STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
-STATISTIC(NumMemSetInfer, "Number of memsets inferred");
-STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
-STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
-
-namespace {
-
-/// Represents a range of memset'd bytes with the ByteVal value.
-/// This allows us to analyze stores like:
-/// store 0 -> P+1
-/// store 0 -> P+0
-/// store 0 -> P+3
-/// store 0 -> P+2
-/// which sometimes happens with stores to arrays of structs etc. When we see
-/// the first store, we make a range [1, 2). The second store extends the range
-/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
-/// two ranges into [0, 3) which is memset'able.
-struct MemsetRange {
- // Start/End - A semi range that describes the span that this range covers.
- // The range is closed at the start and open at the end: [Start, End).
- int64_t Start, End;
-
- /// StartPtr - The getelementptr instruction that points to the start of the
- /// range.
- Value *StartPtr;
-
- /// Alignment - The known alignment of the first store.
- unsigned Alignment;
-
- /// TheStores - The actual stores that make up this range.
- SmallVector<Instruction*, 16> TheStores;
-
- bool isProfitableToUseMemset(const DataLayout &DL) const;
-};
-
-} // end anonymous namespace
-
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
- // If we found more than 4 stores to merge or 16 bytes, use memset.
- if (TheStores.size() >= 4 || End-Start >= 16) return true;
-
- // If there is nothing to merge, don't do anything.
- if (TheStores.size() < 2) return false;
-
- // If any of the stores are a memset, then it is always good to extend the
- // memset.
- for (Instruction *SI : TheStores)
- if (!isa<StoreInst>(SI))
- return true;
-
- // Assume that the code generator is capable of merging pairs of stores
- // together if it wants to.
- if (TheStores.size() == 2) return false;
-
- // If we have fewer than 8 stores, it can still be worthwhile to do this.
- // For example, merging 4 i8 stores into an i32 store is useful almost always.
- // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
- // memset will be split into 2 32-bit stores anyway) and doing so can
- // pessimize the llvm optimizer.
- //
- // Since we don't have perfect knowledge here, make some assumptions: assume
- // the maximum GPR width is the same size as the largest legal integer
- // size. If so, check to see whether we will end up actually reducing the
- // number of stores used.
- unsigned Bytes = unsigned(End-Start);
- unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
- if (MaxIntSize == 0)
- MaxIntSize = 1;
- unsigned NumPointerStores = Bytes / MaxIntSize;
-
- // Assume the remaining bytes if any are done a byte at a time.
- unsigned NumByteStores = Bytes % MaxIntSize;
-
- // If we will reduce the # stores (according to this heuristic), do the
- // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
- // etc.
- return TheStores.size() > NumPointerStores+NumByteStores;
-}
-
-namespace {
-
-class MemsetRanges {
- using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
-
- /// A sorted list of the memset ranges.
- SmallVector<MemsetRange, 8> Ranges;
-
- const DataLayout &DL;
-
-public:
- MemsetRanges(const DataLayout &DL) : DL(DL) {}
-
- using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
-
- const_iterator begin() const { return Ranges.begin(); }
- const_iterator end() const { return Ranges.end(); }
- bool empty() const { return Ranges.empty(); }
-
- void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- addStore(OffsetFromFirst, SI);
- else
- addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
- }
-
- void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
- int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
-
- addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
- SI->getAlign().value(), SI);
- }
-
- void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
- int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
- }
-
- void addRange(int64_t Start, int64_t Size, Value *Ptr,
- unsigned Alignment, Instruction *Inst);
-};
-
-} // end anonymous namespace
-
-/// Add a new store to the MemsetRanges data structure. This adds a
-/// new range for the specified store at the specified offset, merging into
-/// existing ranges as appropriate.
-void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
- unsigned Alignment, Instruction *Inst) {
- int64_t End = Start+Size;
-
- range_iterator I = partition_point(
- Ranges, [=](const MemsetRange &O) { return O.End < Start; });
-
- // We now know that I == E, in which case we didn't find anything to merge
- // with, or that Start <= I->End. If End < I->Start or I == E, then we need
- // to insert a new range. Handle this now.
- if (I == Ranges.end() || End < I->Start) {
- MemsetRange &R = *Ranges.insert(I, MemsetRange());
- R.Start = Start;
- R.End = End;
- R.StartPtr = Ptr;
- R.Alignment = Alignment;
- R.TheStores.push_back(Inst);
- return;
- }
-
- // This store overlaps with I, add it.
- I->TheStores.push_back(Inst);
-
- // At this point, we may have an interval that completely contains our store.
- // If so, just add it to the interval and return.
- if (I->Start <= Start && I->End >= End)
- return;
-
- // Now we know that Start <= I->End and End >= I->Start so the range overlaps
- // but is not entirely contained within the range.
-
- // See if the range extends the start of the range. In this case, it couldn't
- // possibly cause it to join the prior range, because otherwise we would have
- // stopped on *it*.
- if (Start < I->Start) {
- I->Start = Start;
- I->StartPtr = Ptr;
- I->Alignment = Alignment;
- }
-
- // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
- // is in or right at the end of I), and that End >= I->Start. Extend I out to
- // End.
- if (End > I->End) {
- I->End = End;
- range_iterator NextI = I;
- while (++NextI != Ranges.end() && End >= NextI->Start) {
- // Merge the range in.
- I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
- if (NextI->End > I->End)
- I->End = NextI->End;
- Ranges.erase(NextI);
- NextI = I;
- }
- }
-}
-
-//===----------------------------------------------------------------------===//
-// MemCpyOptLegacyPass Pass
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
-private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
+
+namespace {
+
+/// Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+/// store 0 -> P+1
+/// store 0 -> P+0
+/// store 0 -> P+3
+/// store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc. When we see
+/// the first store, we make a range [1, 2). The second store extends the range
+/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+struct MemsetRange {
+ // Start/End - A semi range that describes the span that this range covers.
+ // The range is closed at the start and open at the end: [Start, End).
+ int64_t Start, End;
+
+ /// StartPtr - The getelementptr instruction that points to the start of the
+ /// range.
+ Value *StartPtr;
+
+ /// Alignment - The known alignment of the first store.
+ unsigned Alignment;
+
+ /// TheStores - The actual stores that make up this range.
+ SmallVector<Instruction*, 16> TheStores;
+
+ bool isProfitableToUseMemset(const DataLayout &DL) const;
+};
+
+} // end anonymous namespace
+
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+ // If we found more than 4 stores to merge or 16 bytes, use memset.
+ if (TheStores.size() >= 4 || End-Start >= 16) return true;
+
+ // If there is nothing to merge, don't do anything.
+ if (TheStores.size() < 2) return false;
+
+ // If any of the stores are a memset, then it is always good to extend the
+ // memset.
+ for (Instruction *SI : TheStores)
+ if (!isa<StoreInst>(SI))
+ return true;
+
+ // Assume that the code generator is capable of merging pairs of stores
+ // together if it wants to.
+ if (TheStores.size() == 2) return false;
+
+ // If we have fewer than 8 stores, it can still be worthwhile to do this.
+ // For example, merging 4 i8 stores into an i32 store is useful almost always.
+ // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+ // memset will be split into 2 32-bit stores anyway) and doing so can
+ // pessimize the llvm optimizer.
+ //
+ // Since we don't have perfect knowledge here, make some assumptions: assume
+ // the maximum GPR width is the same size as the largest legal integer
+ // size. If so, check to see whether we will end up actually reducing the
+ // number of stores used.
+ unsigned Bytes = unsigned(End-Start);
+ unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
+ if (MaxIntSize == 0)
+ MaxIntSize = 1;
+ unsigned NumPointerStores = Bytes / MaxIntSize;
+
+ // Assume the remaining bytes if any are done a byte at a time.
+ unsigned NumByteStores = Bytes % MaxIntSize;
+
+ // If we will reduce the # stores (according to this heuristic), do the
+ // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+ // etc.
+ return TheStores.size() > NumPointerStores+NumByteStores;
+}
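+// Worked example, assuming a target whose largest legal integer type is 64
+// bits (MaxIntSize == 8): three stores covering exactly 8 contiguous bytes
+// (say an i32 and two i16s) give NumPointerStores = 1 and NumByteStores = 0,
+// so 3 > 1 holds and the range is turned into a memset. Three i8 stores
+// covering 3 bytes give 0 + 3 = 3, the check 3 > 3 fails, and they are left
+// as individual stores.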
+
+namespace {
+
+class MemsetRanges {
+ using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
+
+ /// A sorted list of the memset ranges.
+ SmallVector<MemsetRange, 8> Ranges;
+
+ const DataLayout &DL;
+
+public:
+ MemsetRanges(const DataLayout &DL) : DL(DL) {}
+
+ using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
+
+ const_iterator begin() const { return Ranges.begin(); }
+ const_iterator end() const { return Ranges.end(); }
+ bool empty() const { return Ranges.empty(); }
+
+ void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ addStore(OffsetFromFirst, SI);
+ else
+ addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
+ }
+
+ void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
+ int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+ addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+ SI->getAlign().value(), SI);
+ }
+
+ void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
+ int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
+ }
+
+ void addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst);
+};
+
+} // end anonymous namespace
+
+/// Add a new store to the MemsetRanges data structure. This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst) {
+ int64_t End = Start+Size;
+
+ range_iterator I = partition_point(
+ Ranges, [=](const MemsetRange &O) { return O.End < Start; });
+
+ // We now know that I == E, in which case we didn't find anything to merge
+ // with, or that Start <= I->End. If End < I->Start or I == E, then we need
+ // to insert a new range. Handle this now.
+ if (I == Ranges.end() || End < I->Start) {
+ MemsetRange &R = *Ranges.insert(I, MemsetRange());
+ R.Start = Start;
+ R.End = End;
+ R.StartPtr = Ptr;
+ R.Alignment = Alignment;
+ R.TheStores.push_back(Inst);
+ return;
+ }
+
+ // This store overlaps with I, add it.
+ I->TheStores.push_back(Inst);
+
+ // At this point, we may have an interval that completely contains our store.
+ // If so, just add it to the interval and return.
+ if (I->Start <= Start && I->End >= End)
+ return;
+
+ // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+ // but is not entirely contained within the range.
+
+ // See if the new range extends the start of the existing range. In that
+ // case it couldn't possibly cause it to join the prior range, because
+ // otherwise we would have stopped on *it*.
+ if (Start < I->Start) {
+ I->Start = Start;
+ I->StartPtr = Ptr;
+ I->Alignment = Alignment;
+ }
+
+ // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+ // is in or right at the end of I), and that End >= I->Start. Extend I out to
+ // End.
+ if (End > I->End) {
+ I->End = End;
+ range_iterator NextI = I;
+ while (++NextI != Ranges.end() && End >= NextI->Start) {
+ // Merge the range in.
+ I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+ if (NextI->End > I->End)
+ I->End = NextI->End;
+ Ranges.erase(NextI);
+ NextI = I;
+ }
+ }
+}
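+// Worked example (hypothetical offsets): with existing ranges [0, 2) and
+// [4, 8), adding a 2-byte store at offset 2 selects [0, 2) via
+// partition_point, extends it to [0, 4), and the loop above then folds the
+// neighbouring [4, 8) in as well, leaving a single [0, 8) range whose
+// TheStores list contains every contributing instruction.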
+
+//===----------------------------------------------------------------------===//
+// MemCpyOptLegacyPass Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // This transformation requires dominator postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
if (!EnableMemorySSA)
AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
if (EnableMemorySSA)
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char MemCpyOptLegacyPass::ID = 0;
-
-/// The public interface to this file...
-FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
-
-INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
-
+ }
+};
+
+} // end anonymous namespace
+
+char MemCpyOptLegacyPass::ID = 0;
+
+/// The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+
// Check that V is either not accessible by the caller, or unwinding cannot
// occur between Start and End.
static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
@@ -361,22 +361,22 @@ static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc,
return !MSSA->dominates(Clobber, Start);
}
-/// When scanning forward over instructions, we look for some other patterns to
-/// fold away. In particular, this looks for stores to neighboring locations of
-/// memory. If it sees enough consecutive ones, it attempts to merge them
-/// together into a memcpy/memset.
-Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
- Value *StartPtr,
- Value *ByteVal) {
- const DataLayout &DL = StartInst->getModule()->getDataLayout();
-
- // Okay, so we now have a single store that can be splatable. Scan to find
- // all subsequent stores of the same value to offset from the same pointer.
- // Join these together into ranges, so we can decide whether contiguous blocks
- // are stored.
- MemsetRanges Ranges(DL);
-
- BasicBlock::iterator BI(StartInst);
+/// When scanning forward over instructions, we look for some other patterns to
+/// fold away. In particular, this looks for stores to neighboring locations of
+/// memory. If it sees enough consecutive ones, it attempts to merge them
+/// together into a memcpy/memset.
+Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
+ Value *StartPtr,
+ Value *ByteVal) {
+ const DataLayout &DL = StartInst->getModule()->getDataLayout();
+
+ // Okay, so we now have a single store that can be splatable. Scan to find
+ // all subsequent stores of the same value to offset from the same pointer.
+ // Join these together into ranges, so we can decide whether contiguous blocks
+ // are stored.
+ MemsetRanges Ranges(DL);
+
+ BasicBlock::iterator BI(StartInst);
// Keeps track of the last memory use or def before the insertion point for
// the new memset. The new MemoryDef for the inserted memsets will be inserted
@@ -387,7 +387,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// for the new memset. This will become the defining access of the inserted
// memsets.
MemoryDef *LastMemDef = nullptr;
- for (++BI; !BI->isTerminator(); ++BI) {
+ for (++BI; !BI->isTerminator(); ++BI) {
if (MSSAU) {
auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
@@ -398,19 +398,19 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
}
}
- if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
- // If the instruction is readnone, ignore it, otherwise bail out. We
- // don't even allow readonly here because we don't want something like:
- // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
- if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
- break;
- continue;
- }
-
- if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
- // If this is a store, see if we can merge it in.
- if (!NextStore->isSimple()) break;
-
+ if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+ // If the instruction is readnone, ignore it, otherwise bail out. We
+ // don't even allow readonly here because we don't want something like:
+ // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ // If this is a store, see if we can merge it in.
+ if (!NextStore->isSimple()) break;
+
Value *StoredVal = NextStore->getValueOperand();
// Don't convert stores of non-integral pointer types to memsets (which
@@ -418,74 +418,74 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
break;
- // Check to see if this stored value is of the same byte-splattable value.
+ // Check to see if this stored value is of the same byte-splattable value.
Value *StoredByte = isBytewiseValue(StoredVal, DL);
- if (isa<UndefValue>(ByteVal) && StoredByte)
- ByteVal = StoredByte;
- if (ByteVal != StoredByte)
- break;
-
- // Check to see if this store is to a constant offset from the start ptr.
- Optional<int64_t> Offset =
- isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
- if (!Offset)
- break;
-
- Ranges.addStore(*Offset, NextStore);
- } else {
- MemSetInst *MSI = cast<MemSetInst>(BI);
-
- if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
- !isa<ConstantInt>(MSI->getLength()))
- break;
-
- // Check to see if this store is to a constant offset from the start ptr.
- Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
- if (!Offset)
- break;
-
- Ranges.addMemSet(*Offset, MSI);
- }
- }
-
- // If we have no ranges, then we just had a single store with nothing that
- // could be merged in. This is a very common case of course.
- if (Ranges.empty())
- return nullptr;
-
- // If we had at least one store that could be merged in, add the starting
- // store as well. We try to avoid this unless there is at least something
- // interesting as a small compile-time optimization.
- Ranges.addInst(0, StartInst);
-
- // If we create any memsets, we put it right before the first instruction that
- // isn't part of the memset block. This ensure that the memset is dominated
- // by any addressing instruction needed by the start of the block.
- IRBuilder<> Builder(&*BI);
-
- // Now that we have full information about ranges, loop over the ranges and
- // emit memset's for anything big enough to be worthwhile.
- Instruction *AMemSet = nullptr;
- for (const MemsetRange &Range : Ranges) {
- if (Range.TheStores.size() == 1) continue;
-
- // If it is profitable to lower this range to memset, do so now.
- if (!Range.isProfitableToUseMemset(DL))
- continue;
-
- // Otherwise, we do want to transform this! Create a new memset.
- // Get the starting pointer of the block.
- StartPtr = Range.StartPtr;
-
- AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
- MaybeAlign(Range.Alignment));
- LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
- : Range.TheStores) dbgs()
- << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
- if (!Range.TheStores.empty())
- AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
-
+ if (isa<UndefValue>(ByteVal) && StoredByte)
+ ByteVal = StoredByte;
+ if (ByteVal != StoredByte)
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset =
+ isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
+ if (!Offset)
+ break;
+
+ Ranges.addStore(*Offset, NextStore);
+ } else {
+ MemSetInst *MSI = cast<MemSetInst>(BI);
+
+ if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
+ !isa<ConstantInt>(MSI->getLength()))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
+ if (!Offset)
+ break;
+
+ Ranges.addMemSet(*Offset, MSI);
+ }
+ }
+
+ // If we have no ranges, then we just had a single store with nothing that
+ // could be merged in. This is a very common case of course.
+ if (Ranges.empty())
+ return nullptr;
+
+ // If we had at least one store that could be merged in, add the starting
+ // store as well. We try to avoid this unless there is at least something
+ // interesting as a small compile-time optimization.
+ Ranges.addInst(0, StartInst);
+
+  // If we create any memsets, we put them right before the first instruction
+  // that isn't part of the memset block. This ensures that the memset is
+  // dominated by any addressing instruction needed by the start of the block.
+ IRBuilder<> Builder(&*BI);
+
+ // Now that we have full information about ranges, loop over the ranges and
+  // emit memsets for anything big enough to be worthwhile.
+ Instruction *AMemSet = nullptr;
+ for (const MemsetRange &Range : Ranges) {
+ if (Range.TheStores.size() == 1) continue;
+
+ // If it is profitable to lower this range to memset, do so now.
+ if (!Range.isProfitableToUseMemset(DL))
+ continue;
+
+ // Otherwise, we do want to transform this! Create a new memset.
+ // Get the starting pointer of the block.
+ StartPtr = Range.StartPtr;
+
+ AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
+ MaybeAlign(Range.Alignment));
+ LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
+ : Range.TheStores) dbgs()
+ << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
+ if (!Range.TheStores.empty())
+ AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
if (MSSAU) {
assert(LastMemDef && MemInsertPoint &&
"Both LastMemDef and MemInsertPoint need to be set");
@@ -500,105 +500,105 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
MemInsertPoint = NewDef;
}
- // Zap all the stores.
+ // Zap all the stores.
for (Instruction *SI : Range.TheStores)
eraseInstruction(SI);
- ++NumMemSetInfer;
- }
-
- return AMemSet;
-}
-
-// This method tries to lift a store instruction before position P.
-// It will lift the store and its argument, plus anything that
-// may alias with these.
-// The method returns true if it was successful.
+ ++NumMemSetInfer;
+ }
+
+ return AMemSet;
+}
+
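To make the merging loop above concrete, here is a rough C++ source-level sketch of the pattern tryMergingIntoMemset targets; the function names and sizes below are invented for illustration and are not taken from the LLVM tree.

#include <cstdint>
#include <cstring>

// Adjacent byte-splattable stores at constant offsets from the same base...
void zero_header(uint8_t *Buf) {
  Buf[0] = 0;
  Buf[1] = 0;
  Buf[2] = 0;
  Buf[3] = 0;
}

// ...are collected into one contiguous MemsetRange and, when profitable,
// emitted as a single memset, conceptually:
void zero_header_merged(uint8_t *Buf) {
  std::memset(Buf, 0, 4);
}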
+// This method tries to lift a store instruction before position P.
+// It will lift the store and its argument, plus anything that
+// may alias with these.
+// The method returns true if it was successful.
bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
-  // If the store aliases this position, bail out early.
- MemoryLocation StoreLoc = MemoryLocation::get(SI);
+  // If the store aliases this position, bail out early.
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
if (isModOrRefSet(AA->getModRefInfo(P, StoreLoc)))
- return false;
-
-  // Keep track of the arguments of all instructions we plan to lift
- // so we can make sure to lift them as well if appropriate.
- DenseSet<Instruction*> Args;
- if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
- if (Ptr->getParent() == SI->getParent())
- Args.insert(Ptr);
-
-  // Instructions to lift before P.
+ return false;
+
+  // Keep track of the arguments of all instructions we plan to lift
+ // so we can make sure to lift them as well if appropriate.
+ DenseSet<Instruction*> Args;
+ if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
+ if (Ptr->getParent() == SI->getParent())
+ Args.insert(Ptr);
+
+  // Instructions to lift before P.
SmallVector<Instruction *, 8> ToLift{SI};
-
- // Memory locations of lifted instructions.
- SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
-
- // Lifted calls.
- SmallVector<const CallBase *, 8> Calls;
-
- const MemoryLocation LoadLoc = MemoryLocation::get(LI);
-
- for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
- auto *C = &*I;
-
+
+ // Memory locations of lifted instructions.
+ SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
+
+ // Lifted calls.
+ SmallVector<const CallBase *, 8> Calls;
+
+ const MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
+ auto *C = &*I;
+
// Make sure hoisting does not perform a store that was not guaranteed to
// happen.
if (!isGuaranteedToTransferExecutionToSuccessor(C))
return false;
-
+
bool MayAlias = isModOrRefSet(AA->getModRefInfo(C, None));
- bool NeedLift = false;
- if (Args.erase(C))
- NeedLift = true;
- else if (MayAlias) {
+ bool NeedLift = false;
+ if (Args.erase(C))
+ NeedLift = true;
+ else if (MayAlias) {
NeedLift = llvm::any_of(MemLocs, [C, this](const MemoryLocation &ML) {
return isModOrRefSet(AA->getModRefInfo(C, ML));
- });
-
- if (!NeedLift)
+ });
+
+ if (!NeedLift)
NeedLift = llvm::any_of(Calls, [C, this](const CallBase *Call) {
return isModOrRefSet(AA->getModRefInfo(C, Call));
- });
- }
-
- if (!NeedLift)
- continue;
-
- if (MayAlias) {
- // Since LI is implicitly moved downwards past the lifted instructions,
- // none of them may modify its source.
+ });
+ }
+
+ if (!NeedLift)
+ continue;
+
+ if (MayAlias) {
+ // Since LI is implicitly moved downwards past the lifted instructions,
+ // none of them may modify its source.
if (isModSet(AA->getModRefInfo(C, LoadLoc)))
- return false;
- else if (const auto *Call = dyn_cast<CallBase>(C)) {
- // If we can't lift this before P, it's game over.
+ return false;
+ else if (const auto *Call = dyn_cast<CallBase>(C)) {
+ // If we can't lift this before P, it's game over.
if (isModOrRefSet(AA->getModRefInfo(P, Call)))
- return false;
-
- Calls.push_back(Call);
- } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
- // If we can't lift this before P, it's game over.
- auto ML = MemoryLocation::get(C);
+ return false;
+
+ Calls.push_back(Call);
+ } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
+ // If we can't lift this before P, it's game over.
+ auto ML = MemoryLocation::get(C);
if (isModOrRefSet(AA->getModRefInfo(P, ML)))
- return false;
-
- MemLocs.push_back(ML);
- } else
- // We don't know how to lift this instruction.
- return false;
- }
-
- ToLift.push_back(C);
- for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
- if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
- if (A->getParent() == SI->getParent()) {
- // Cannot hoist user of P above P
- if(A == P) return false;
- Args.insert(A);
- }
- }
- }
-
+ return false;
+
+ MemLocs.push_back(ML);
+ } else
+ // We don't know how to lift this instruction.
+ return false;
+ }
+
+ ToLift.push_back(C);
+ for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
+ if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
+ if (A->getParent() == SI->getParent()) {
+ // Cannot hoist user of P above P
+ if(A == P) return false;
+ Args.insert(A);
+ }
+ }
+ }
+
// Find MSSA insertion point. Normally P will always have a corresponding
// memory access before which we can insert. However, with non-standard AA
// pipelines, there may be a mismatch between AA and MSSA, in which case we
@@ -623,9 +623,9 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
}
// We made it, we need to lift.
- for (auto *I : llvm::reverse(ToLift)) {
- LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
- I->moveBefore(P);
+ for (auto *I : llvm::reverse(ToLift)) {
+ LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ I->moveBefore(P);
if (MSSAU) {
assert(MemInsertPoint && "Must have found insert point");
if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) {
@@ -633,25 +633,25 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
MemInsertPoint = MA;
}
}
- }
-
- return true;
-}
-
-bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
- if (!SI->isSimple()) return false;
-
- // Avoid merging nontemporal stores since the resulting
- // memcpy/memset would not be able to preserve the nontemporal hint.
- // In theory we could teach how to propagate the !nontemporal metadata to
- // memset calls. However, that change would force the backend to
- // conservatively expand !nontemporal memset calls back to sequences of
- // store instructions (effectively undoing the merging).
- if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return false;
-
- const DataLayout &DL = SI->getModule()->getDataLayout();
-
+ }
+
+ return true;
+}
+
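As a loose illustration of why moveUp exists (names invented, not from the LLVM sources): to promote an aggregate load/store pair at an earlier position P, the store and its address computation must be hoisted above P, which is only legal if none of the hoisted instructions read or write memory that P touches.

#include <cstdint>

struct Pair { uint64_t A, B; };

void copy_with_clobber(Pair *Dst, const Pair *Src, uint64_t *Unrelated) {
  Pair Tmp = *Src;  // aggregate load
  *Unrelated = 1;   // P: a clobber between the load and the store
  *Dst = Tmp;       // store that moveUp would hoist above P, so the pair can
                    // be promoted to a memcpy at P instead of at the store
}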
+bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+ if (!SI->isSimple()) return false;
+
+ // Avoid merging nontemporal stores since the resulting
+ // memcpy/memset would not be able to preserve the nontemporal hint.
+ // In theory we could teach how to propagate the !nontemporal metadata to
+ // memset calls. However, that change would force the backend to
+ // conservatively expand !nontemporal memset calls back to sequences of
+ // store instructions (effectively undoing the merging).
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return false;
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+
Value *StoredVal = SI->getValueOperand();
// Not all the transforms below are correct for non-integral pointers, bail
@@ -659,63 +659,63 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
return false;
- // Load to store forwarding can be interpreted as memcpy.
+ // Load to store forwarding can be interpreted as memcpy.
if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
- if (LI->isSimple() && LI->hasOneUse() &&
- LI->getParent() == SI->getParent()) {
-
- auto *T = LI->getType();
- if (T->isAggregateType()) {
- MemoryLocation LoadLoc = MemoryLocation::get(LI);
-
- // We use alias analysis to check if an instruction may store to
- // the memory we load from in between the load and the store. If
- // such an instruction is found, we try to promote there instead
- // of at the store position.
+ if (LI->isSimple() && LI->hasOneUse() &&
+ LI->getParent() == SI->getParent()) {
+
+ auto *T = LI->getType();
+ if (T->isAggregateType()) {
+ MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ // We use alias analysis to check if an instruction may store to
+ // the memory we load from in between the load and the store. If
+ // such an instruction is found, we try to promote there instead
+ // of at the store position.
// TODO: Can use MSSA for this.
- Instruction *P = SI;
- for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
+ Instruction *P = SI;
+ for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
- P = &I;
- break;
- }
- }
-
- // We found an instruction that may write to the loaded memory.
- // We can try to promote at this position instead of the store
-        // position if nothing aliases the store memory after this and the store
- // destination is not in the range.
- if (P && P != SI) {
+ P = &I;
+ break;
+ }
+ }
+
+ // We found an instruction that may write to the loaded memory.
+ // We can try to promote at this position instead of the store
+        // position if nothing aliases the store memory after this and the store
+ // destination is not in the range.
+ if (P && P != SI) {
if (!moveUp(SI, P, LI))
- P = nullptr;
- }
-
- // If a valid insertion position is found, then we can promote
- // the load/store pair to a memcpy.
- if (P) {
- // If we load from memory that may alias the memory we store to,
-          // memmove must be used to preserve semantics. If not, memcpy can
- // be used.
- bool UseMemMove = false;
+ P = nullptr;
+ }
+
+ // If a valid insertion position is found, then we can promote
+ // the load/store pair to a memcpy.
+ if (P) {
+ // If we load from memory that may alias the memory we store to,
+          // memmove must be used to preserve semantics. If not, memcpy can
+ // be used.
+ bool UseMemMove = false;
if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc))
- UseMemMove = true;
-
- uint64_t Size = DL.getTypeStoreSize(T);
-
- IRBuilder<> Builder(P);
- Instruction *M;
- if (UseMemMove)
- M = Builder.CreateMemMove(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
- else
- M = Builder.CreateMemCpy(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
-
- LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
- << *M << "\n");
-
+ UseMemMove = true;
+
+ uint64_t Size = DL.getTypeStoreSize(T);
+
+ IRBuilder<> Builder(P);
+ Instruction *M;
+ if (UseMemMove)
+ M = Builder.CreateMemMove(
+ SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(), Size);
+ else
+ M = Builder.CreateMemCpy(
+ SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(), Size);
+
+ LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
+ << *M << "\n");
+
if (MSSAU) {
auto *LastDef =
cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
@@ -726,18 +726,18 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
eraseInstruction(SI);
eraseInstruction(LI);
- ++NumMemCpyInstr;
-
- // Make sure we do not invalidate the iterator.
- BBI = M->getIterator();
- return true;
- }
- }
-
- // Detect cases where we're performing call slot forwarding, but
- // happen to be using a load-store pair to implement it, rather than
- // a memcpy.
- CallInst *C = nullptr;
+ ++NumMemCpyInstr;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ // Detect cases where we're performing call slot forwarding, but
+ // happen to be using a load-store pair to implement it, rather than
+ // a memcpy.
+ CallInst *C = nullptr;
if (EnableMemorySSA) {
if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
@@ -751,15 +751,15 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
C = dyn_cast<CallInst>(ldep.getInst());
}
-
- if (C) {
- // Check that nothing touches the dest of the "copy" between
- // the call and the store.
- MemoryLocation StoreLoc = MemoryLocation::get(SI);
+
+ if (C) {
+ // Check that nothing touches the dest of the "copy" between
+ // the call and the store.
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
if (EnableMemorySSA) {
if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
MSSA->getMemoryAccess(SI)))
- C = nullptr;
+ C = nullptr;
} else {
for (BasicBlock::iterator I = --SI->getIterator(),
E = C->getIterator();
@@ -768,52 +768,52 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
C = nullptr;
break;
}
- }
- }
- }
-
- if (C) {
- bool changed = performCallSlotOptzn(
+ }
+ }
+ }
+
+ if (C) {
+ bool changed = performCallSlotOptzn(
LI, SI, SI->getPointerOperand()->stripPointerCasts(),
- LI->getPointerOperand()->stripPointerCasts(),
- DL.getTypeStoreSize(SI->getOperand(0)->getType()),
- commonAlignment(SI->getAlign(), LI->getAlign()), C);
- if (changed) {
+ LI->getPointerOperand()->stripPointerCasts(),
+ DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+ commonAlignment(SI->getAlign(), LI->getAlign()), C);
+ if (changed) {
eraseInstruction(SI);
eraseInstruction(LI);
- ++NumMemCpyInstr;
- return true;
- }
- }
- }
- }
-
- // There are two cases that are interesting for this code to handle: memcpy
- // and memset. Right now we only handle memset.
-
-  // Ensure that the value being stored is something that can be memset a
-  // byte at a time, like "0" or "-1" of any width, as well as things like
-  // 0xA0A0A0A0 and 0.0.
- auto *V = SI->getOperand(0);
- if (Value *ByteVal = isBytewiseValue(V, DL)) {
- if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
- ByteVal)) {
- BBI = I->getIterator(); // Don't invalidate iterator.
- return true;
- }
-
- // If we have an aggregate, we try to promote it to memset regardless
- // of opportunity for merging as it can expose optimization opportunities
- // in subsequent passes.
- auto *T = V->getType();
- if (T->isAggregateType()) {
- uint64_t Size = DL.getTypeStoreSize(T);
- IRBuilder<> Builder(SI);
- auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
- SI->getAlign());
-
- LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
-
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+ }
+ }
+
+ // There are two cases that are interesting for this code to handle: memcpy
+ // and memset. Right now we only handle memset.
+
+  // Ensure that the value being stored is something that can be memset a
+  // byte at a time, like "0" or "-1" of any width, as well as things like
+  // 0xA0A0A0A0 and 0.0.
+ auto *V = SI->getOperand(0);
+ if (Value *ByteVal = isBytewiseValue(V, DL)) {
+ if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
+ ByteVal)) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+
+ // If we have an aggregate, we try to promote it to memset regardless
+ // of opportunity for merging as it can expose optimization opportunities
+ // in subsequent passes.
+ auto *T = V->getType();
+ if (T->isAggregateType()) {
+ uint64_t Size = DL.getTypeStoreSize(T);
+ IRBuilder<> Builder(SI);
+ auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
+ SI->getAlign());
+
+ LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+
if (MSSAU) {
assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)));
auto *LastDef =
@@ -823,78 +823,78 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
}
eraseInstruction(SI);
- NumMemSetInfer++;
-
- // Make sure we do not invalidate the iterator.
- BBI = M->getIterator();
- return true;
- }
- }
-
- return false;
-}
-
-bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
- // See if there is another memset or store neighboring this memset which
- // allows us to widen out the memset to do a single larger store.
- if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
- if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
- MSI->getValue())) {
- BBI = I->getIterator(); // Don't invalidate iterator.
- return true;
- }
- return false;
-}
-
-/// Takes a memcpy and a call that it depends on,
-/// and checks for the possibility of a call slot optimization by having
-/// the call write its result directly into the destination of the memcpy.
+ NumMemSetInfer++;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ return false;
+}
+
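A minimal C++-level sketch of the two store patterns processStore handles above; the struct name and size are illustrative only.

#include <cstring>

struct Big { char Data[64]; };

// An aggregate load/store pair is promoted to a memcpy (or a memmove when the
// source and destination may alias):
void copy_big(Big *Dst, const Big *Src) {
  *Dst = *Src;        // becomes memcpy(Dst, Src, 64)
}

// A store of a byte-splattable aggregate value is promoted to a memset:
void clear_big(Big *Dst) {
  *Dst = Big{};       // becomes memset(Dst, 0, 64)
}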
+bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+ // See if there is another memset or store neighboring this memset which
+ // allows us to widen out the memset to do a single larger store.
+ if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
+ if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
+ MSI->getValue())) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+ return false;
+}
+
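A small sketch of the widening that processMemSet enables through tryMergingIntoMemset; the sizes are invented for illustration.

#include <cstdint>
#include <cstring>

// A memset followed by neighbouring stores of the same splat byte...
void clear_with_tail(uint8_t *Buf) {
  std::memset(Buf, 0, 14);
  Buf[14] = 0;
  Buf[15] = 0;
}

// ...is widened into one larger memset:
void clear_with_tail_widened(uint8_t *Buf) {
  std::memset(Buf, 0, 16);
}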
+/// Takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
Instruction *cpyStore, Value *cpyDest,
- Value *cpySrc, uint64_t cpyLen,
- Align cpyAlign, CallInst *C) {
- // The general transformation to keep in mind is
- //
- // call @func(..., src, ...)
- // memcpy(dest, src, ...)
- //
- // ->
- //
- // memcpy(dest, src, ...)
- // call @func(..., dest, ...)
- //
- // Since moving the memcpy is technically awkward, we additionally check that
- // src only holds uninitialized values at the moment of the call, meaning that
- // the memcpy can be discarded rather than moved.
-
- // Lifetime marks shouldn't be operated on.
- if (Function *F = C->getCalledFunction())
- if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
- return false;
-
- // Require that src be an alloca. This simplifies the reasoning considerably.
- AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
- if (!srcAlloca)
- return false;
-
- ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
- if (!srcArraySize)
- return false;
-
+ Value *cpySrc, uint64_t cpyLen,
+ Align cpyAlign, CallInst *C) {
+ // The general transformation to keep in mind is
+ //
+ // call @func(..., src, ...)
+ // memcpy(dest, src, ...)
+ //
+ // ->
+ //
+ // memcpy(dest, src, ...)
+ // call @func(..., dest, ...)
+ //
+ // Since moving the memcpy is technically awkward, we additionally check that
+ // src only holds uninitialized values at the moment of the call, meaning that
+ // the memcpy can be discarded rather than moved.
+
+ // Lifetime marks shouldn't be operated on.
+ if (Function *F = C->getCalledFunction())
+ if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
+ return false;
+
+ // Require that src be an alloca. This simplifies the reasoning considerably.
+ AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ if (!srcAlloca)
+ return false;
+
+ ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+ if (!srcArraySize)
+ return false;
+
const DataLayout &DL = cpyLoad->getModule()->getDataLayout();
- uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
- srcArraySize->getZExtValue();
-
- if (cpyLen < srcSize)
- return false;
-
- // Check that accessing the first srcSize bytes of dest will not cause a
- // trap. Otherwise the transform is invalid since it might cause a trap
- // to occur earlier than it otherwise would.
+ uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+ srcArraySize->getZExtValue();
+
+ if (cpyLen < srcSize)
+ return false;
+
+ // Check that accessing the first srcSize bytes of dest will not cause a
+ // trap. Otherwise the transform is invalid since it might cause a trap
+ // to occur earlier than it otherwise would.
if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpyLen),
DL, C, DT))
return false;
-
+
// Make sure that nothing can observe cpyDest being written early. There are
// a number of cases to consider:
// 1. cpyDest cannot be accessed between C and cpyStore as a precondition of
@@ -910,51 +910,51 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// renders accesses from other threads undefined.
// TODO: This is currently not checked.
if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore))
- return false;
-
- // Check that dest points to memory that is at least as aligned as src.
- Align srcAlign = srcAlloca->getAlign();
- bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
- // If dest is not aligned enough and we can't increase its alignment then
- // bail out.
- if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
- return false;
-
- // Check that src is not accessed except via the call and the memcpy. This
- // guarantees that it holds only undefined values when passed in (so the final
- // memcpy can be dropped), that it is not read or written between the call and
- // the memcpy, and that writing beyond the end of it is undefined.
+ return false;
+
+ // Check that dest points to memory that is at least as aligned as src.
+ Align srcAlign = srcAlloca->getAlign();
+ bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+ // If dest is not aligned enough and we can't increase its alignment then
+ // bail out.
+ if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+ return false;
+
+ // Check that src is not accessed except via the call and the memcpy. This
+ // guarantees that it holds only undefined values when passed in (so the final
+ // memcpy can be dropped), that it is not read or written between the call and
+ // the memcpy, and that writing beyond the end of it is undefined.
SmallVector<User *, 8> srcUseList(srcAlloca->users());
- while (!srcUseList.empty()) {
- User *U = srcUseList.pop_back_val();
-
- if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
+ while (!srcUseList.empty()) {
+ User *U = srcUseList.pop_back_val();
+
+ if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
append_range(srcUseList, U->users());
- continue;
- }
- if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
- if (!G->hasAllZeroIndices())
- return false;
-
+ continue;
+ }
+ if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (!G->hasAllZeroIndices())
+ return false;
+
append_range(srcUseList, U->users());
- continue;
- }
- if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
- if (IT->isLifetimeStartOrEnd())
- continue;
-
+ continue;
+ }
+ if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (IT->isLifetimeStartOrEnd())
+ continue;
+
if (U != C && U != cpyLoad)
- return false;
- }
-
- // Check that src isn't captured by the called function since the
- // transformation can cause aliasing issues in that case.
- for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
- if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
- return false;
-
- // Since we're changing the parameter to the callsite, we need to make sure
- // that what would be the new parameter dominates the callsite.
+ return false;
+ }
+
+ // Check that src isn't captured by the called function since the
+ // transformation can cause aliasing issues in that case.
+ for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
+ if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
+ return false;
+
+ // Since we're changing the parameter to the callsite, we need to make sure
+ // that what would be the new parameter dominates the callsite.
if (!DT->dominates(cpyDest, C)) {
// Support moving a constant index GEP before the call.
auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
@@ -962,107 +962,107 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
DT->dominates(GEP->getPointerOperand(), C))
GEP->moveBefore(C);
else
- return false;
+ return false;
}
-
- // In addition to knowing that the call does not access src in some
- // unexpected manner, for example via a global, which we deduce from
- // the use analysis, we also need to know that it does not sneakily
- // access dest. We rely on AA to figure this out for us.
+
+ // In addition to knowing that the call does not access src in some
+ // unexpected manner, for example via a global, which we deduce from
+ // the use analysis, we also need to know that it does not sneakily
+ // access dest. We rely on AA to figure this out for us.
ModRefInfo MR = AA->getModRefInfo(C, cpyDest, LocationSize::precise(srcSize));
- // If necessary, perform additional analysis.
- if (isModOrRefSet(MR))
+ // If necessary, perform additional analysis.
+ if (isModOrRefSet(MR))
MR = AA->callCapturesBefore(C, cpyDest, LocationSize::precise(srcSize), DT);
- if (isModOrRefSet(MR))
- return false;
-
- // We can't create address space casts here because we don't know if they're
- // safe for the target.
- if (cpySrc->getType()->getPointerAddressSpace() !=
- cpyDest->getType()->getPointerAddressSpace())
- return false;
- for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
- if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
- cpySrc->getType()->getPointerAddressSpace() !=
- C->getArgOperand(ArgI)->getType()->getPointerAddressSpace())
- return false;
-
- // All the checks have passed, so do the transformation.
- bool changedArgument = false;
- for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
- if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
- Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
- : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
- cpyDest->getName(), C);
- changedArgument = true;
- if (C->getArgOperand(ArgI)->getType() == Dest->getType())
- C->setArgOperand(ArgI, Dest);
- else
- C->setArgOperand(ArgI, CastInst::CreatePointerCast(
- Dest, C->getArgOperand(ArgI)->getType(),
- Dest->getName(), C));
- }
-
- if (!changedArgument)
- return false;
-
- // If the destination wasn't sufficiently aligned then increase its alignment.
- if (!isDestSufficientlyAligned) {
- assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
- cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
- }
-
- // Drop any cached information about the call, because we may have changed
- // its dependence information by changing its parameter.
+ if (isModOrRefSet(MR))
+ return false;
+
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
+ if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ C->getArgOperand(ArgI)->getType()->getPointerAddressSpace())
+ return false;
+
+ // All the checks have passed, so do the transformation.
+ bool changedArgument = false;
+ for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
+ if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
+ Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
+ : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+ cpyDest->getName(), C);
+ changedArgument = true;
+ if (C->getArgOperand(ArgI)->getType() == Dest->getType())
+ C->setArgOperand(ArgI, Dest);
+ else
+ C->setArgOperand(ArgI, CastInst::CreatePointerCast(
+ Dest, C->getArgOperand(ArgI)->getType(),
+ Dest->getName(), C));
+ }
+
+ if (!changedArgument)
+ return false;
+
+ // If the destination wasn't sufficiently aligned then increase its alignment.
+ if (!isDestSufficientlyAligned) {
+ assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+ cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+ }
+
+ // Drop any cached information about the call, because we may have changed
+ // its dependence information by changing its parameter.
if (MD)
MD->removeInstruction(C);
-
- // Update AA metadata
- // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
- // handled here, but combineMetadata doesn't support them yet
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_access_group};
+
+ // Update AA metadata
+ // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+ // handled here, but combineMetadata doesn't support them yet
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_access_group};
combineMetadata(C, cpyLoad, KnownIDs, true);
-
+
++NumCallSlot;
- return true;
-}
-
-/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
-/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
- MemCpyInst *MDep) {
-  // We can only transform memcpys where the dest of one is the source of the
-  // other.
- if (M->getSource() != MDep->getDest() || MDep->isVolatile())
- return false;
-
-  // If the dep instruction is reading from our current input, then it is a noop
- // transfer and substituting the input won't change this instruction. Just
- // ignore the input and let someone else zap MDep. This handles cases like:
- // memcpy(a <- a)
- // memcpy(b <- a)
- if (M->getSource() == MDep->getSource())
- return false;
-
-  // Second, the lengths of the memcpys must be the same, or the preceding one
- // must be larger than the following one.
- ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
- ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
- if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
- return false;
-
- // Verify that the copied-from memory doesn't change in between the two
- // transfers. For example, in:
- // memcpy(a <- b)
- // *b = 42;
- // memcpy(c <- a)
- // It would be invalid to transform the second memcpy into memcpy(c <- b).
- //
- // TODO: If the code between M and MDep is transparent to the destination "c",
- // then we could still perform the xform by moving M up to the first memcpy.
+ return true;
+}
+
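A rough source-level picture of the call slot optimization whose legality checks the function above implements; produce() is a hypothetical callee that is assumed not to capture or otherwise publish its argument.

#include <cstring>

struct Result { char Bytes[32]; };

void produce(Result *Out);  // hypothetical; assumed not to capture Out

// The callee fills a local temporary that is then copied into Dest...
void call_then_copy(Result *Dest) {
  Result Tmp;
  produce(&Tmp);
  std::memcpy(Dest, &Tmp, sizeof(Result));
}

// ...once all the checks pass, the callee can write straight into Dest and
// both the temporary and the memcpy disappear:
void call_into_dest(Result *Dest) {
  produce(Dest);
}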
+/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
+/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
+bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
+ MemCpyInst *MDep) {
+  // We can only transform memcpys where the dest of one is the source of the
+  // other.
+ if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+ return false;
+
+  // If the dep instruction is reading from our current input, then it is a noop
+ // transfer and substituting the input won't change this instruction. Just
+ // ignore the input and let someone else zap MDep. This handles cases like:
+ // memcpy(a <- a)
+ // memcpy(b <- a)
+ if (M->getSource() == MDep->getSource())
+ return false;
+
+  // Second, the lengths of the memcpys must be the same, or the preceding one
+ // must be larger than the following one.
+ ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+ if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+ return false;
+
+ // Verify that the copied-from memory doesn't change in between the two
+ // transfers. For example, in:
+ // memcpy(a <- b)
+ // *b = 42;
+ // memcpy(c <- a)
+ // It would be invalid to transform the second memcpy into memcpy(c <- b).
+ //
+ // TODO: If the code between M and MDep is transparent to the destination "c",
+ // then we could still perform the xform by moving M up to the first memcpy.
if (EnableMemorySSA) {
// TODO: It would be sufficient to check the MDep source up to the memcpy
// size of M, rather than MDep.
@@ -1078,32 +1078,32 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
}
-
- // If the dest of the second might alias the source of the first, then the
- // source and dest might overlap. We still want to eliminate the intermediate
- // value, but we have to generate a memmove instead of memcpy.
- bool UseMemMove = false;
+
+ // If the dest of the second might alias the source of the first, then the
+ // source and dest might overlap. We still want to eliminate the intermediate
+ // value, but we have to generate a memmove instead of memcpy.
+ bool UseMemMove = false;
if (!AA->isNoAlias(MemoryLocation::getForDest(M),
MemoryLocation::getForSource(MDep)))
- UseMemMove = true;
-
- // If all checks passed, then we can transform M.
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
- << *MDep << '\n' << *M << '\n');
-
- // TODO: Is this worth it if we're creating a less aligned memcpy? For
- // example we could be moving from movaps -> movq on x86.
- IRBuilder<> Builder(M);
+ UseMemMove = true;
+
+ // If all checks passed, then we can transform M.
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
+ << *MDep << '\n' << *M << '\n');
+
+ // TODO: Is this worth it if we're creating a less aligned memcpy? For
+ // example we could be moving from movaps -> movq on x86.
+ IRBuilder<> Builder(M);
Instruction *NewM;
- if (UseMemMove)
+ if (UseMemMove)
NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
- else
+ else
NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
-
+
if (MSSAU) {
assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
@@ -1111,40 +1111,40 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
}
- // Remove the instruction we're replacing.
+ // Remove the instruction we're replacing.
eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
-}
-
-/// We've found that the (upward scanning) memory dependence of \p MemCpy is
-/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
-/// weren't copied over by \p MemCpy.
-///
-/// In other words, transform:
-/// \code
-/// memset(dst, c, dst_size);
-/// memcpy(dst, src, src_size);
-/// \endcode
-/// into:
-/// \code
-/// memcpy(dst, src, src_size);
-/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
-/// \endcode
-bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
- // We can only transform memset/memcpy with the same destination.
- if (MemSet->getDest() != MemCpy->getDest())
- return false;
-
+ ++NumMemCpyInstr;
+ return true;
+}
+
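For intuition, here is the memcpy-memcpy forwarding performed above, shown at the C++ source level with invented names; it assumes B is not modified between the two copies, mirroring the clobber check in the code.

#include <cstddef>
#include <cstring>

// memcpy(b <- a); memcpy(c <- b), with b unchanged in between...
void chain(char *C, char *B, const char *A, size_t N) {
  std::memcpy(B, A, N);
  std::memcpy(C, B, N);
}

// ...the second copy can read from the original source instead (a memmove is
// emitted if C might overlap A):
void chain_forwarded(char *C, char *B, const char *A, size_t N) {
  std::memcpy(B, A, N);
  std::memcpy(C, A, N);
}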
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+/// memset(dst, c, dst_size);
+/// memcpy(dst, src, src_size);
+/// \endcode
+/// into:
+/// \code
+/// memcpy(dst, src, src_size);
+/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// \endcode
+bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+ // We can only transform memset/memcpy with the same destination.
+ if (MemSet->getDest() != MemCpy->getDest())
+ return false;
+
// Check that src and dst of the memcpy aren't the same. While memcpy
// operands cannot partially overlap, exact equality is allowed.
if (!AA->isNoAlias(MemoryLocation(MemCpy->getSource(),
LocationSize::precise(1)),
MemoryLocation(MemCpy->getDest(),
LocationSize::precise(1))))
- return false;
-
+ return false;
+
if (EnableMemorySSA) {
// We know that dst up to src_size is not written. We now need to make sure
// that dst up to dst_size is not accessed. (If we did not move the memset,
@@ -1164,44 +1164,44 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
return false;
}
- // Use the same i8* dest as the memcpy, killing the memset dest if different.
- Value *Dest = MemCpy->getRawDest();
- Value *DestSize = MemSet->getLength();
- Value *SrcSize = MemCpy->getLength();
-
+ // Use the same i8* dest as the memcpy, killing the memset dest if different.
+ Value *Dest = MemCpy->getRawDest();
+ Value *DestSize = MemSet->getLength();
+ Value *SrcSize = MemCpy->getLength();
+
if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy))
return false;
- // By default, create an unaligned memset.
- unsigned Align = 1;
- // If Dest is aligned, and SrcSize is constant, use the minimum alignment
- // of the sum.
- const unsigned DestAlign =
- std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
- if (DestAlign > 1)
- if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
- Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
-
- IRBuilder<> Builder(MemCpy);
-
- // If the sizes have different types, zext the smaller one.
- if (DestSize->getType() != SrcSize->getType()) {
- if (DestSize->getType()->getIntegerBitWidth() >
- SrcSize->getType()->getIntegerBitWidth())
- SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
- else
- DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
- }
-
- Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
- Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
- Value *MemsetLen = Builder.CreateSelect(
- Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
+ // By default, create an unaligned memset.
+ unsigned Align = 1;
+ // If Dest is aligned, and SrcSize is constant, use the minimum alignment
+ // of the sum.
+ const unsigned DestAlign =
+ std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
+ if (DestAlign > 1)
+ if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+ Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+ IRBuilder<> Builder(MemCpy);
+
+ // If the sizes have different types, zext the smaller one.
+ if (DestSize->getType() != SrcSize->getType()) {
+ if (DestSize->getType()->getIntegerBitWidth() >
+ SrcSize->getType()->getIntegerBitWidth())
+ SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
+ else
+ DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
+ }
+
+ Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
+ Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
+ Value *MemsetLen = Builder.CreateSelect(
+ Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
Instruction *NewMemSet = Builder.CreateMemSet(
- Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
- SrcSize),
- MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));
-
+ Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
+ SrcSize),
+ MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));
+
if (MSSAU) {
assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
"MemCpy must be a MemoryDef");
@@ -1216,24 +1216,24 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
}
eraseInstruction(MemSet);
- return true;
-}
-
-/// Determine whether the instruction has undefined content for the given Size,
-/// either because it was freshly alloca'd or started its lifetime.
-static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
- if (isa<AllocaInst>(I))
- return true;
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
- if (LTSize->getZExtValue() >= Size->getZExtValue())
- return true;
-
- return false;
-}
-
+ return true;
+}
+
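A minimal sketch of the memset trimming described in the doc comment above, with invented sizes and assuming the buffers satisfy the overlap checks the code performs.

#include <cstring>

// memset(dst, c, dst_size) followed by memcpy(dst, src, src_size)...
void set_then_copy(char *Dst, const char *Src) {
  std::memset(Dst, 0, 64);
  std::memcpy(Dst, Src, 16);
}

// ...only the trailing dst_size - src_size bytes still need the memset:
void set_then_copy_trimmed(char *Dst, const char *Src) {
  std::memcpy(Dst, Src, 16);
  std::memset(Dst + 16, 0, 64 - 16);
}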
+/// Determine whether the instruction has undefined content for the given Size,
+/// either because it was freshly alloca'd or started its lifetime.
+static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
+ if (isa<AllocaInst>(I))
+ return true;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+ if (LTSize->getZExtValue() >= Size->getZExtValue())
+ return true;
+
+ return false;
+}
+
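For intuition only (not from the LLVM sources): a copy whose source is a freshly created local that was never written carries nothing but undefined bytes, which is what lets the caller delete such a memcpy outright.

#include <cstring>

void copy_from_undef(char *Dst) {
  char Tmp[32];               // freshly alloca'd, never initialized
  std::memcpy(Dst, Tmp, 32);  // reads only undefined content; removable
}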
static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
MemoryDef *Def, ConstantInt *Size) {
if (MSSA->isLiveOnEntryDef(Def))
@@ -1252,41 +1252,41 @@ static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
return false;
}
-/// Transform memcpy to memset when its source was just memset.
-/// In other words, turn:
-/// \code
-/// memset(dst1, c, dst1_size);
-/// memcpy(dst2, dst1, dst2_size);
-/// \endcode
-/// into:
-/// \code
-/// memset(dst1, c, dst1_size);
-/// memset(dst2, c, dst2_size);
-/// \endcode
-/// When dst2_size <= dst1_size.
-///
-/// The \p MemCpy must have a Constant length.
-bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
-  // Make sure this is memcpy(..., memset(...), ...), that is, we are memsetting
-  // and memcpying from the same address. Otherwise it is hard to reason about.
+/// Transform memcpy to memset when its source was just memset.
+/// In other words, turn:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memcpy(dst2, dst1, dst2_size);
+/// \endcode
+/// into:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memset(dst2, c, dst2_size);
+/// \endcode
+/// When dst2_size <= dst1_size.
+///
+/// The \p MemCpy must have a Constant length.
+bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+  // Make sure this is memcpy(..., memset(...), ...), that is, we are memsetting
+  // and memcpying from the same address. Otherwise it is hard to reason about.
if (!AA->isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
- return false;
-
- // A known memset size is required.
- ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
- if (!MemSetSize)
- return false;
-
- // Make sure the memcpy doesn't read any more than what the memset wrote.
- // Don't worry about sizes larger than i64.
- ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
- if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
- // If the memcpy is larger than the memset, but the memory was undef prior
- // to the memset, we can just ignore the tail. Technically we're only
- // interested in the bytes from MemSetSize..CopySize here, but as we can't
- // easily represent this location, we use the full 0..CopySize range.
- MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
+ return false;
+
+ // A known memset size is required.
+ ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+ if (!MemSetSize)
+ return false;
+
+ // Make sure the memcpy doesn't read any more than what the memset wrote.
+ // Don't worry about sizes larger than i64.
+ ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+ if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
+ // If the memcpy is larger than the memset, but the memory was undef prior
+ // to the memset, we can just ignore the tail. Technically we're only
+ // interested in the bytes from MemSetSize..CopySize here, but as we can't
+ // easily represent this location, we use the full 0..CopySize range.
+ MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
bool CanReduceSize = false;
if (EnableMemorySSA) {
MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
@@ -1303,11 +1303,11 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
}
if (!CanReduceSize)
- return false;
+ return false;
CopySize = MemSetSize;
- }
-
- IRBuilder<> Builder(MemCpy);
+ }
+
+ IRBuilder<> Builder(MemCpy);
Instruction *NewM =
Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
CopySize, MaybeAlign(MemCpy->getDestAlignment()));
@@ -1318,31 +1318,31 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
}
- return true;
-}
-
-/// Perform simplification of memcpys. If we have memcpy A
-/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
-/// B to be a memcpy from X to Z (or potentially a memmove, depending on
-/// circumstances). This allows later passes to remove the first memcpy
-/// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
- // We can only optimize non-volatile memcpy's.
- if (M->isVolatile()) return false;
-
- // If the source and destination of the memcpy are the same, then zap it.
- if (M->getSource() == M->getDest()) {
- ++BBI;
+ return true;
+}
+
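A brief source-level sketch of the transform above, with invented buffer names and sizes, assuming Dst1 and Dst2 do not overlap:

#include <cstring>

// A memcpy that reads only bytes written by a preceding memset...
void memset_then_copy(char *Dst2, char *Dst1) {
  std::memset(Dst1, 0xAA, 64);
  std::memcpy(Dst2, Dst1, 32);
}

// ...can store the splat byte directly:
void memset_then_copy_folded(char *Dst2, char *Dst1) {
  std::memset(Dst1, 0xAA, 64);
  std::memset(Dst2, 0xAA, 32);
}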
+/// Perform simplification of memcpys. If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
+ // We can only optimize non-volatile memcpy's.
+ if (M->isVolatile()) return false;
+
+ // If the source and destination of the memcpy are the same, then zap it.
+ if (M->getSource() == M->getDest()) {
+ ++BBI;
eraseInstruction(M);
- return true;
- }
-
- // If copying from a constant, try to turn the memcpy into a memset.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
- if (GV->isConstant() && GV->hasDefinitiveInitializer())
- if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
- M->getModule()->getDataLayout())) {
- IRBuilder<> Builder(M);
+ return true;
+ }
+
+ // If copying from a constant, try to turn the memcpy into a memset.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
+ M->getModule()->getDataLayout())) {
+ IRBuilder<> Builder(M);
Instruction *NewM =
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
MaybeAlign(M->getDestAlignment()), false);
@@ -1355,17 +1355,17 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
eraseInstruction(M);
- ++NumCpyToSet;
- return true;
- }
-
+ ++NumCpyToSet;
+ return true;
+ }
+
if (EnableMemorySSA) {
MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
MemoryLocation DestLoc = MemoryLocation::getForDest(M);
const MemoryAccess *DestClobber =
MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
-
+
// Try to turn a partially redundant memset + memcpy into
// memcpy + smaller memset. We don't need the memcpy size for this.
     // The memcpy must post-dominate the memset, so limit this to the same basic
@@ -1375,11 +1375,11 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
if (DestClobber->getBlock() == M->getParent())
if (processMemSetMemCpyDependence(M, MDep))
return true;
-
+
// The optimizations after this point require the memcpy size.
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
-
+
MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
AnyClobber, MemoryLocation::getForSource(M));
@@ -1431,19 +1431,19 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
eraseInstruction(M);
++NumMemCpyInstr;
- return true;
- }
- }
+ return true;
+ }
+ }
} else {
MemDepResult DepInfo = MD->getDependency(M);
-
+
// Try to turn a partially redundant memset + memcpy into
// memcpy + smaller memset. We don't need the memcpy size for this.
if (DepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
if (processMemSetMemCpyDependence(M, MDep))
return true;
-
+
// The optimizations after this point require the memcpy size.
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
@@ -1468,8 +1468,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
return true;
}
}
- }
-
+ }
+
MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());
@@ -1481,10 +1481,10 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
eraseInstruction(M);
++NumMemCpyInstr;
- return true;
- }
+ return true;
+ }
}
-
+
if (SrcDepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
if (performMemCpyToMemSetOptzn(M, MDep)) {
@@ -1494,49 +1494,49 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
}
- return false;
-}
-
-/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
-/// not to alias.
-bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
- if (!TLI->has(LibFunc_memmove))
- return false;
-
- // See if the pointers alias.
+ return false;
+}
+
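One of the simpler cases handled by processMemCpy above is copying from a constant global whose initializer splats to a single byte; a rough illustration with invented names:

#include <cstring>

static const char Zeros[32] = {};   // constant, all-zero initializer

void init_from_global(char *Dst) {
  std::memcpy(Dst, Zeros, sizeof(Zeros));
}

// ...is rewritten as a memset of the splat byte:
void init_from_global_folded(char *Dst) {
  std::memset(Dst, 0, sizeof(Zeros));
}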
+/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
+/// not to alias.
+bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+ if (!TLI->has(LibFunc_memmove))
+ return false;
+
+ // See if the pointers alias.
if (!AA->isNoAlias(MemoryLocation::getForDest(M),
MemoryLocation::getForSource(M)))
- return false;
-
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
- << "\n");
-
- // If not, then we know we can transform this.
- Type *ArgTys[3] = { M->getRawDest()->getType(),
- M->getRawSource()->getType(),
- M->getLength()->getType() };
- M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
- Intrinsic::memcpy, ArgTys));
-
+ return false;
+
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
+
+ // If not, then we know we can transform this.
+ Type *ArgTys[3] = { M->getRawDest()->getType(),
+ M->getRawSource()->getType(),
+ M->getLength()->getType() };
+ M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
+ Intrinsic::memcpy, ArgTys));
+
// For MemorySSA nothing really changes (except that memcpy may imply stricter
// aliasing guarantees).
-  // MemDep may have overly conservative information about this instruction;
-  // just conservatively flush it from the cache.
+  // MemDep may have overly conservative information about this instruction;
+  // just conservatively flush it from the cache.
if (MD)
MD->removeInstruction(M);
-
- ++NumMoveToCpy;
- return true;
-}
-
-/// This is called on every byval argument in call sites.
-bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
- const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
- // Find out what feeds this byval argument.
- Value *ByValArg = CB.getArgOperand(ArgNo);
- Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
- uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+
+ ++NumMoveToCpy;
+ return true;
+}
+
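A tiny sketch of the memmove relaxation above (illustrative names only): two distinct local buffers provably cannot overlap, so alias analysis lets the memmove become the cheaper memcpy intrinsic.

#include <cstring>

void move_disjoint(char *Out) {
  char A[16] = {1};
  char B[16];
  std::memmove(B, A, sizeof(A));   // -> memcpy(B, A, 16)
  std::memcpy(Out, B, sizeof(B));  // keep B observable
}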
+/// This is called on every byval argument in call sites.
+bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
+ const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
+ // Find out what feeds this byval argument.
+ Value *ByValArg = CB.getArgOperand(ArgNo);
+ Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
+ uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
MemCpyInst *MDep = nullptr;
if (EnableMemorySSA) {
@@ -1552,43 +1552,43 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
return false;
MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
}
-
- // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
- // a memcpy, see if we can byval from the source of the memcpy instead of the
- // result.
- if (!MDep || MDep->isVolatile() ||
- ByValArg->stripPointerCasts() != MDep->getDest())
- return false;
-
-  // The length of the memcpy must be larger than or equal to the size of the byval.
- ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
- if (!C1 || C1->getValue().getZExtValue() < ByValSize)
- return false;
-
- // Get the alignment of the byval. If the call doesn't specify the alignment,
- // then it is some target specific value that we can't know.
- MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
- if (!ByValAlign) return false;
-
- // If it is greater than the memcpy, then we check to see if we can force the
- // source of the memcpy to the alignment we need. If we fail, we bail out.
- MaybeAlign MemDepAlign = MDep->getSourceAlign();
- if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
+
+ // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
+ // a memcpy, see if we can byval from the source of the memcpy instead of the
+ // result.
+ if (!MDep || MDep->isVolatile() ||
+ ByValArg->stripPointerCasts() != MDep->getDest())
+ return false;
+
+  // The length of the memcpy must be larger than or equal to the size of the byval.
+ ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+ return false;
+
+ // Get the alignment of the byval. If the call doesn't specify the alignment,
+ // then it is some target specific value that we can't know.
+ MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
+ if (!ByValAlign) return false;
+
+ // If it is greater than the memcpy, then we check to see if we can force the
+ // source of the memcpy to the alignment we need. If we fail, we bail out.
+ MaybeAlign MemDepAlign = MDep->getSourceAlign();
+ if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, AC,
DT) < *ByValAlign)
- return false;
-
- // The address space of the memcpy source must match the byval argument
- if (MDep->getSource()->getType()->getPointerAddressSpace() !=
- ByValArg->getType()->getPointerAddressSpace())
- return false;
-
- // Verify that the copied-from memory doesn't change in between the memcpy and
- // the byval call.
- // memcpy(a <- b)
- // *b = 42;
- // foo(*a)
- // It would be invalid to transform the second memcpy into foo(*b).
+ return false;
+
+ // The address space of the memcpy source must match the byval argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
+ // Verify that the copied-from memory doesn't change in between the memcpy and
+ // the byval call.
+ // memcpy(a <- b)
+ // *b = 42;
+ // foo(*a)
+ // It would be invalid to transform the second memcpy into foo(*b).
if (EnableMemorySSA) {
if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
@@ -1602,144 +1602,144 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
}
-
- Value *TmpCast = MDep->getSource();
- if (MDep->getSource()->getType() != ByValArg->getType()) {
- BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
- "tmpcast", &CB);
- // Set the tmpcast's DebugLoc to MDep's
- TmpBitCast->setDebugLoc(MDep->getDebugLoc());
- TmpCast = TmpBitCast;
- }
-
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
- << " " << *MDep << "\n"
- << " " << CB << "\n");
-
- // Otherwise we're good! Update the byval argument.
- CB.setArgOperand(ArgNo, TmpCast);
- ++NumMemCpyInstr;
- return true;
-}
-
-/// Executes one iteration of MemCpyOptPass.
-bool MemCpyOptPass::iterateOnFunction(Function &F) {
- bool MadeChange = false;
-
- // Walk all instruction in the function.
- for (BasicBlock &BB : F) {
- // Skip unreachable blocks. For example processStore assumes that an
- // instruction in a BB can't be dominated by a later instruction in the
- // same BB (which is a scenario that can happen for an unreachable BB that
- // has itself as a predecessor).
+
+ Value *TmpCast = MDep->getSource();
+ if (MDep->getSource()->getType() != ByValArg->getType()) {
+    BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(),
+                                              ByValArg->getType(),
+                                              "tmpcast", &CB);
+ // Set the tmpcast's DebugLoc to MDep's
+ TmpBitCast->setDebugLoc(MDep->getDebugLoc());
+ TmpCast = TmpBitCast;
+ }
+
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << CB << "\n");
+
+ // Otherwise we're good! Update the byval argument.
+ CB.setArgOperand(ArgNo, TmpCast);
+ ++NumMemCpyInstr;
+ return true;
+}
+
+/// Executes one iteration of MemCpyOptPass.
+bool MemCpyOptPass::iterateOnFunction(Function &F) {
+ bool MadeChange = false;
+
+  // Walk all instructions in the function.
+ for (BasicBlock &BB : F) {
+ // Skip unreachable blocks. For example processStore assumes that an
+ // instruction in a BB can't be dominated by a later instruction in the
+ // same BB (which is a scenario that can happen for an unreachable BB that
+ // has itself as a predecessor).
if (!DT->isReachableFromEntry(&BB))
- continue;
-
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- // Avoid invalidating the iterator.
- Instruction *I = &*BI++;
-
- bool RepeatInstruction = false;
-
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- MadeChange |= processStore(SI, BI);
- else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
- RepeatInstruction = processMemSet(M, BI);
- else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
- RepeatInstruction = processMemCpy(M, BI);
- else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
- RepeatInstruction = processMemMove(M);
- else if (auto *CB = dyn_cast<CallBase>(I)) {
- for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
- if (CB->isByValArgument(i))
- MadeChange |= processByValArgument(*CB, i);
- }
-
- // Reprocess the instruction if desired.
- if (RepeatInstruction) {
- if (BI != BB.begin())
- --BI;
- MadeChange = true;
- }
- }
- }
-
- return MadeChange;
-}
-
-PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
+ continue;
+
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+ // Avoid invalidating the iterator.
+ Instruction *I = &*BI++;
+
+ bool RepeatInstruction = false;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ MadeChange |= processStore(SI, BI);
+ else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+ RepeatInstruction = processMemSet(M, BI);
+ else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+ RepeatInstruction = processMemCpy(M, BI);
+ else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+ RepeatInstruction = processMemMove(M);
+ else if (auto *CB = dyn_cast<CallBase>(I)) {
+ for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+ if (CB->isByValArgument(i))
+ MadeChange |= processByValArgument(*CB, i);
+ }
+
+ // Reprocess the instruction if desired.
+ if (RepeatInstruction) {
+ if (BI != BB.begin())
+ --BI;
+ MadeChange = true;
+ }
+ }
+ }
+
+ return MadeChange;
+}
+
+PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *MD = !EnableMemorySSA ? &AM.getResult<MemoryDependenceAnalysis>(F)
: AM.getCachedResult<MemoryDependenceAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F)
: AM.getCachedResult<MemorySSAAnalysis>(F);
-
+
bool MadeChange =
runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
- if (!MadeChange)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
+ if (!MadeChange)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
if (MD)
PA.preserve<MemoryDependenceAnalysis>();
if (MSSA)
PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
+ return PA;
+}
+
bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
AssumptionCache *AC_, DominatorTree *DT_,
MemorySSA *MSSA_) {
- bool MadeChange = false;
- MD = MD_;
- TLI = TLI_;
+ bool MadeChange = false;
+ MD = MD_;
+ TLI = TLI_;
AA = AA_;
AC = AC_;
DT = DT_;
MSSA = MSSA_;
MemorySSAUpdater MSSAU_(MSSA_);
MSSAU = MSSA_ ? &MSSAU_ : nullptr;
- // If we don't have at least memset and memcpy, there is little point of doing
- // anything here. These are required by a freestanding implementation, so if
- // even they are disabled, there is no point in trying hard.
- if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
- return false;
-
- while (true) {
- if (!iterateOnFunction(F))
- break;
- MadeChange = true;
- }
-
+  // If we don't have at least memset and memcpy, there is little point in doing
+ // anything here. These are required by a freestanding implementation, so if
+ // even they are disabled, there is no point in trying hard.
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
+ return false;
+
+ while (true) {
+ if (!iterateOnFunction(F))
+ break;
+ MadeChange = true;
+ }
+
if (MSSA_ && VerifyMemorySSA)
MSSA_->verifyMemorySSA();
- MD = nullptr;
- return MadeChange;
-}
-
-/// This is the main transformation entry point for a function.
-bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
+ MD = nullptr;
+ return MadeChange;
+}
+
+/// This is the main transformation entry point for a function.
+bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
auto *MDWP = !EnableMemorySSA
? &getAnalysis<MemoryDependenceWrapperPass>()
: getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *MSSAWP = EnableMemorySSA
? &getAnalysis<MemorySSAWrapperPass>()
: getAnalysisIfAvailable<MemorySSAWrapperPass>();
-
+
return Impl.runImpl(F, MDWP ? & MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
MSSAWP ? &MSSAWP->getMSSA() : nullptr);
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp
index 2d9c612494..7f8b75ac88 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp
@@ -1,629 +1,629 @@
-//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass turns chains of integer comparisons into memcmp (the memcmp is
-// later typically inlined as a chain of efficient hardware comparisons). This
-// typically benefits c++ member or nonmember operator==().
-//
-// The basic idea is to replace a longer chain of integer comparisons loaded
-// from contiguous memory locations into a shorter chain of larger integer
-// comparisons. Benefits are double:
-// - There are less jumps, and therefore less opportunities for mispredictions
-// and I-cache misses.
-// - Code size is smaller, both because jumps are removed and because the
-// encoding of a 2*n byte compare is smaller than that of two n-byte
-// compares.
-//
-// Example:
-//
-// struct S {
-// int a;
-// char b;
-// char c;
-// uint16_t d;
-// bool operator==(const S& o) const {
-// return a == o.a && b == o.b && c == o.c && d == o.d;
-// }
-// };
-//
-// Is optimized as :
-//
-// bool S::operator==(const S& o) const {
-// return memcmp(this, &o, 8) == 0;
-// }
-//
-// Which will later be expanded (ExpandMemCmp) as a single 8-bytes icmp.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MergeICmps.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include <algorithm>
-#include <numeric>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-namespace {
-
-#define DEBUG_TYPE "mergeicmps"
-
-// Returns true if the instruction is a simple load or a simple store
-static bool isSimpleLoadOrStore(const Instruction *I) {
- if (const LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isSimple();
- if (const StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isSimple();
- return false;
-}
-
-// A BCE atom "Binary Compare Expression Atom" represents an integer load
-// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
-// at the top.
-struct BCEAtom {
- BCEAtom() = default;
- BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
- : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}
-
- BCEAtom(const BCEAtom &) = delete;
- BCEAtom &operator=(const BCEAtom &) = delete;
-
- BCEAtom(BCEAtom &&that) = default;
- BCEAtom &operator=(BCEAtom &&that) {
- if (this == &that)
- return *this;
- GEP = that.GEP;
- LoadI = that.LoadI;
- BaseId = that.BaseId;
- Offset = std::move(that.Offset);
- return *this;
- }
-
- // We want to order BCEAtoms by (Base, Offset). However we cannot use
- // the pointer values for Base because these are non-deterministic.
- // To make sure that the sort order is stable, we first assign to each atom
- // base value an index based on its order of appearance in the chain of
- // comparisons. We call this index `BaseOrdering`. For example, for:
- // b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
- // | block 1 | | block 2 | | block 3 |
- // b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
- // which is before block 2.
- // We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
- bool operator<(const BCEAtom &O) const {
- return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt(O.Offset);
- }
-
- GetElementPtrInst *GEP = nullptr;
- LoadInst *LoadI = nullptr;
- unsigned BaseId = 0;
- APInt Offset;
-};
-
-// A class that assigns increasing ids to values in the order in which they are
-// seen. See comment in `BCEAtom::operator<()``.
-class BaseIdentifier {
-public:
- // Returns the id for value `Base`, after assigning one if `Base` has not been
- // seen before.
- int getBaseId(const Value *Base) {
- assert(Base && "invalid base");
- const auto Insertion = BaseToIndex.try_emplace(Base, Order);
- if (Insertion.second)
- ++Order;
- return Insertion.first->second;
- }
-
-private:
- unsigned Order = 1;
- DenseMap<const Value*, int> BaseToIndex;
-};
-
-// If this value is a load from a constant offset w.r.t. a base address, and
-// there are no other users of the load or address, returns the base address and
-// the offset.
-BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
- auto *const LoadI = dyn_cast<LoadInst>(Val);
- if (!LoadI)
- return {};
- LLVM_DEBUG(dbgs() << "load\n");
- if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- LLVM_DEBUG(dbgs() << "used outside of block\n");
- return {};
- }
- // Do not optimize atomic loads to non-atomic memcmp
- if (!LoadI->isSimple()) {
- LLVM_DEBUG(dbgs() << "volatile or atomic\n");
- return {};
- }
- Value *const Addr = LoadI->getOperand(0);
- auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
- if (!GEP)
- return {};
- LLVM_DEBUG(dbgs() << "GEP\n");
- if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
- LLVM_DEBUG(dbgs() << "used outside of block\n");
- return {};
- }
- const auto &DL = GEP->getModule()->getDataLayout();
- if (!isDereferenceablePointer(GEP, LoadI->getType(), DL)) {
- LLVM_DEBUG(dbgs() << "not dereferenceable\n");
- // We need to make sure that we can do comparison in any order, so we
- // require memory to be unconditionnally dereferencable.
- return {};
- }
- APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
- if (!GEP->accumulateConstantOffset(DL, Offset))
- return {};
- return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()),
- Offset);
-}
-
-// A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
-// example at the top.
-// The block might do extra work besides the atom comparison, in which case
-// doesOtherWork() returns true. Under some conditions, the block can be
-// split into the atom comparison part and the "other work" part
-// (see canSplit()).
-// Note: the terminology is misleading: the comparison is symmetric, so there
-// is no real {l/r}hs. What we want though is to have the same base on the
-// left (resp. right), so that we can detect consecutive loads. To ensure this
-// we put the smallest atom on the left.
-class BCECmpBlock {
- public:
- BCECmpBlock() {}
-
- BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
- : Lhs_(std::move(L)), Rhs_(std::move(R)), SizeBits_(SizeBits) {
- if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
- }
-
- bool IsValid() const { return Lhs_.BaseId != 0 && Rhs_.BaseId != 0; }
-
- // Assert the block is consistent: If valid, it should also have
- // non-null members besides Lhs_ and Rhs_.
- void AssertConsistent() const {
- if (IsValid()) {
- assert(BB);
- assert(CmpI);
- assert(BranchI);
- }
- }
-
- const BCEAtom &Lhs() const { return Lhs_; }
- const BCEAtom &Rhs() const { return Rhs_; }
- int SizeBits() const { return SizeBits_; }
-
- // Returns true if the block does other works besides comparison.
- bool doesOtherWork() const;
-
- // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
- // instructions in the block.
- bool canSplit(AliasAnalysis &AA) const;
-
- // Return true if this all the relevant instructions in the BCE-cmp-block can
- // be sunk below this instruction. By doing this, we know we can separate the
- // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
- // block.
- bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
- AliasAnalysis &AA) const;
-
- // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
- // instructions. Split the old block and move all non-BCE-cmp-insts into the
- // new parent block.
- void split(BasicBlock *NewParent, AliasAnalysis &AA) const;
-
- // The basic block where this comparison happens.
- BasicBlock *BB = nullptr;
- // The ICMP for this comparison.
- ICmpInst *CmpI = nullptr;
- // The terminating branch.
- BranchInst *BranchI = nullptr;
- // The block requires splitting.
- bool RequireSplit = false;
-
-private:
- BCEAtom Lhs_;
- BCEAtom Rhs_;
- int SizeBits_ = 0;
-};
-
-bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
- DenseSet<Instruction *> &BlockInsts,
- AliasAnalysis &AA) const {
- // If this instruction has side effects and its in middle of the BCE cmp block
- // instructions, then bail for now.
- if (Inst->mayHaveSideEffects()) {
- // Bail if this is not a simple load or store
- if (!isSimpleLoadOrStore(Inst))
- return false;
- // Disallow stores that might alias the BCE operands
- MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
- MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
- if (isModSet(AA.getModRefInfo(Inst, LLoc)) ||
- isModSet(AA.getModRefInfo(Inst, RLoc)))
- return false;
- }
- // Make sure this instruction does not use any of the BCE cmp block
- // instructions as operand.
- for (auto BI : BlockInsts) {
- if (is_contained(Inst->operands(), BI))
- return false;
- }
- return true;
-}
-
-void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const {
- DenseSet<Instruction *> BlockInsts(
- {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
- llvm::SmallVector<Instruction *, 4> OtherInsts;
- for (Instruction &Inst : *BB) {
- if (BlockInsts.count(&Inst))
- continue;
- assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
- "Split unsplittable block");
- // This is a non-BCE-cmp-block instruction. And it can be separated
- // from the BCE-cmp-block instruction.
- OtherInsts.push_back(&Inst);
- }
-
- // Do the actual spliting.
- for (Instruction *Inst : reverse(OtherInsts)) {
- Inst->moveBefore(&*NewParent->begin());
- }
-}
-
-bool BCECmpBlock::canSplit(AliasAnalysis &AA) const {
- DenseSet<Instruction *> BlockInsts(
- {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
- for (Instruction &Inst : *BB) {
- if (!BlockInsts.count(&Inst)) {
- if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
- return false;
- }
- }
- return true;
-}
-
-bool BCECmpBlock::doesOtherWork() const {
- AssertConsistent();
- // All the instructions we care about in the BCE cmp block.
- DenseSet<Instruction *> BlockInsts(
- {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
- // TODO(courbet): Can we allow some other things ? This is very conservative.
- // We might be able to get away with anything does not have any side
- // effects outside of the basic block.
- // Note: The GEPs and/or loads are not necessarily in the same block.
- for (const Instruction &Inst : *BB) {
- if (!BlockInsts.count(&Inst))
- return true;
- }
- return false;
-}
-
-// Visit the given comparison. If this is a comparison between two valid
-// BCE atoms, returns the comparison.
-BCECmpBlock visitICmp(const ICmpInst *const CmpI,
- const ICmpInst::Predicate ExpectedPredicate,
- BaseIdentifier &BaseId) {
- // The comparison can only be used once:
- // - For intermediate blocks, as a branch condition.
- // - For the final block, as an incoming value for the Phi.
- // If there are any other uses of the comparison, we cannot merge it with
- // other comparisons as we would create an orphan use of the value.
- if (!CmpI->hasOneUse()) {
- LLVM_DEBUG(dbgs() << "cmp has several uses\n");
- return {};
- }
- if (CmpI->getPredicate() != ExpectedPredicate)
- return {};
- LLVM_DEBUG(dbgs() << "cmp "
- << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
- << "\n");
- auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId);
- if (!Lhs.BaseId)
- return {};
- auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId);
- if (!Rhs.BaseId)
- return {};
- const auto &DL = CmpI->getModule()->getDataLayout();
- return BCECmpBlock(std::move(Lhs), std::move(Rhs),
- DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
-}
-
-// Visit the given comparison block. If this is a comparison between two valid
-// BCE atoms, returns the comparison.
-BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
- const BasicBlock *const PhiBlock,
- BaseIdentifier &BaseId) {
- if (Block->empty()) return {};
- auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
- if (!BranchI) return {};
- LLVM_DEBUG(dbgs() << "branch\n");
- if (BranchI->isUnconditional()) {
- // In this case, we expect an incoming value which is the result of the
- // comparison. This is the last link in the chain of comparisons (note
- // that this does not mean that this is the last incoming value, blocks
- // can be reordered).
- auto *const CmpI = dyn_cast<ICmpInst>(Val);
- if (!CmpI) return {};
- LLVM_DEBUG(dbgs() << "icmp\n");
- auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ, BaseId);
- Result.CmpI = CmpI;
- Result.BranchI = BranchI;
- return Result;
- } else {
- // In this case, we expect a constant incoming value (the comparison is
- // chained).
+//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits C++ member or nonmember operator==().
+//
+// The basic idea is to replace a longer chain of integer comparisons loaded
+// from contiguous memory locations with a shorter chain of larger integer
+// comparisons. The benefits are twofold:
+//  - There are fewer jumps, and therefore fewer opportunities for
+//    mispredictions and I-cache misses.
+//  - Code size is smaller, both because jumps are removed and because the
+//    encoding of a 2*n byte compare is smaller than that of two n-byte
+//    compares.
+//
+// Example:
+//
+// struct S {
+// int a;
+// char b;
+// char c;
+// uint16_t d;
+// bool operator==(const S& o) const {
+// return a == o.a && b == o.b && c == o.c && d == o.d;
+// }
+// };
+//
+// Is optimized as:
+//
+// bool S::operator==(const S& o) const {
+// return memcmp(this, &o, 8) == 0;
+// }
+//
+// Which will later be expanded (ExpandMemCmp) as a single 8-byte icmp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MergeICmps.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+// Returns true if the instruction is a simple load or a simple store
+static bool isSimpleLoadOrStore(const Instruction *I) {
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ return false;
+}
+
+// A BCE atom "Binary Compare Expression Atom" represents an integer load
+// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
+// at the top.
+struct BCEAtom {
+ BCEAtom() = default;
+ BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
+ : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}
+
+ BCEAtom(const BCEAtom &) = delete;
+ BCEAtom &operator=(const BCEAtom &) = delete;
+
+ BCEAtom(BCEAtom &&that) = default;
+ BCEAtom &operator=(BCEAtom &&that) {
+ if (this == &that)
+ return *this;
+ GEP = that.GEP;
+ LoadI = that.LoadI;
+ BaseId = that.BaseId;
+ Offset = std::move(that.Offset);
+ return *this;
+ }
+
+ // We want to order BCEAtoms by (Base, Offset). However we cannot use
+ // the pointer values for Base because these are non-deterministic.
+ // To make sure that the sort order is stable, we first assign to each atom
+ // base value an index based on its order of appearance in the chain of
+ // comparisons. We call this index `BaseOrdering`. For example, for:
+ // b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
+ // | block 1 | | block 2 | | block 3 |
+ // b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
+ // which is before block 2.
+ // We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
+ bool operator<(const BCEAtom &O) const {
+ return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt(O.Offset);
+ }
+
+ GetElementPtrInst *GEP = nullptr;
+ LoadInst *LoadI = nullptr;
+ unsigned BaseId = 0;
+ APInt Offset;
+};
+
+// A class that assigns increasing ids to values in the order in which they are
+// seen. See comment in `BCEAtom::operator<()`.
+class BaseIdentifier {
+public:
+ // Returns the id for value `Base`, after assigning one if `Base` has not been
+ // seen before.
+ int getBaseId(const Value *Base) {
+ assert(Base && "invalid base");
+ const auto Insertion = BaseToIndex.try_emplace(Base, Order);
+ if (Insertion.second)
+ ++Order;
+ return Insertion.first->second;
+ }
+
+private:
+ unsigned Order = 1;
+ DenseMap<const Value*, int> BaseToIndex;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or address, returns the base address and
+// the offset.
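+//
+// Illustrative sketch (assumed IR shape, not from the original source):
+//   %gep  = getelementptr inbounds %struct.S, %struct.S* %base, i64 0, i32 2
+//   %load = load i8, i8* %gep
+// yields a BCEAtom whose base is %base and whose Offset is the constant byte
+// offset of field 2 within %struct.S.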
+BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
+ auto *const LoadI = dyn_cast<LoadInst>(Val);
+ if (!LoadI)
+ return {};
+ LLVM_DEBUG(dbgs() << "load\n");
+ if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ // Do not optimize atomic loads to non-atomic memcmp
+ if (!LoadI->isSimple()) {
+ LLVM_DEBUG(dbgs() << "volatile or atomic\n");
+ return {};
+ }
+ Value *const Addr = LoadI->getOperand(0);
+ auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
+ if (!GEP)
+ return {};
+ LLVM_DEBUG(dbgs() << "GEP\n");
+ if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ const auto &DL = GEP->getModule()->getDataLayout();
+ if (!isDereferenceablePointer(GEP, LoadI->getType(), DL)) {
+ LLVM_DEBUG(dbgs() << "not dereferenceable\n");
+    // We need to make sure that we can do the comparison in any order, so we
+    // require the memory to be unconditionally dereferenceable.
+ return {};
+ }
+ APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+ if (!GEP->accumulateConstantOffset(DL, Offset))
+ return {};
+ return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()),
+ Offset);
+}
+
+// A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
+// example at the top.
+// The block might do extra work besides the atom comparison, in which case
+// doesOtherWork() returns true. Under some conditions, the block can be
+// split into the atom comparison part and the "other work" part
+// (see canSplit()).
+// Note: the terminology is misleading: the comparison is symmetric, so there
+// is no real {l/r}hs. What we want though is to have the same base on the
+// left (resp. right), so that we can detect consecutive loads. To ensure this
+// we put the smallest atom on the left.
+class BCECmpBlock {
+ public:
+ BCECmpBlock() {}
+
+ BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+ : Lhs_(std::move(L)), Rhs_(std::move(R)), SizeBits_(SizeBits) {
+ if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
+ }
+
+ bool IsValid() const { return Lhs_.BaseId != 0 && Rhs_.BaseId != 0; }
+
+ // Assert the block is consistent: If valid, it should also have
+ // non-null members besides Lhs_ and Rhs_.
+ void AssertConsistent() const {
+ if (IsValid()) {
+ assert(BB);
+ assert(CmpI);
+ assert(BranchI);
+ }
+ }
+
+ const BCEAtom &Lhs() const { return Lhs_; }
+ const BCEAtom &Rhs() const { return Rhs_; }
+ int SizeBits() const { return SizeBits_; }
+
+  // Returns true if the block does other work besides the comparison.
+ bool doesOtherWork() const;
+
+ // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
+ // instructions in the block.
+ bool canSplit(AliasAnalysis &AA) const;
+
+  // Return true if all the relevant instructions in the BCE-cmp-block can
+ // be sunk below this instruction. By doing this, we know we can separate the
+ // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
+ // block.
+ bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
+ AliasAnalysis &AA) const;
+
+ // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
+ // instructions. Split the old block and move all non-BCE-cmp-insts into the
+ // new parent block.
+ void split(BasicBlock *NewParent, AliasAnalysis &AA) const;
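+  //
+  // Illustrative sketch (not from the original source): if the block also
+  // contains
+  //   store i32 3, i32* @some_value   ; "other work" that aliases neither load
+  // split() moves that store into NewParent, leaving only the GEPs, loads,
+  // compare and branch behind.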
+
+ // The basic block where this comparison happens.
+ BasicBlock *BB = nullptr;
+ // The ICMP for this comparison.
+ ICmpInst *CmpI = nullptr;
+ // The terminating branch.
+ BranchInst *BranchI = nullptr;
+ // The block requires splitting.
+ bool RequireSplit = false;
+
+private:
+ BCEAtom Lhs_;
+ BCEAtom Rhs_;
+ int SizeBits_ = 0;
+};
+
+bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
+ DenseSet<Instruction *> &BlockInsts,
+ AliasAnalysis &AA) const {
+  // If this instruction has side effects and it's in the middle of the BCE
+  // cmp block instructions, then bail for now.
+ if (Inst->mayHaveSideEffects()) {
+ // Bail if this is not a simple load or store
+ if (!isSimpleLoadOrStore(Inst))
+ return false;
+ // Disallow stores that might alias the BCE operands
+ MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
+ MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
+ if (isModSet(AA.getModRefInfo(Inst, LLoc)) ||
+ isModSet(AA.getModRefInfo(Inst, RLoc)))
+ return false;
+ }
+ // Make sure this instruction does not use any of the BCE cmp block
+  // instructions as an operand.
+ for (auto BI : BlockInsts) {
+ if (is_contained(Inst->operands(), BI))
+ return false;
+ }
+ return true;
+}
+
+void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ llvm::SmallVector<Instruction *, 4> OtherInsts;
+ for (Instruction &Inst : *BB) {
+ if (BlockInsts.count(&Inst))
+ continue;
+ assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
+ "Split unsplittable block");
+    // This is a non-BCE-cmp-block instruction, and it can be separated
+    // from the BCE-cmp-block instructions.
+ OtherInsts.push_back(&Inst);
+ }
+
+  // Do the actual splitting.
+ for (Instruction *Inst : reverse(OtherInsts)) {
+ Inst->moveBefore(&*NewParent->begin());
+ }
+}
+
+bool BCECmpBlock::canSplit(AliasAnalysis &AA) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ for (Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst)) {
+ if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool BCECmpBlock::doesOtherWork() const {
+ AssertConsistent();
+ // All the instructions we care about in the BCE cmp block.
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+  // TODO(courbet): Can we allow some other things? This is very conservative.
+  // We might be able to get away with anything that does not have any side
+ // effects outside of the basic block.
+ // Note: The GEPs and/or loads are not necessarily in the same block.
+ for (const Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst))
+ return true;
+ }
+ return false;
+}
+
+// Visit the given comparison. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+ const ICmpInst::Predicate ExpectedPredicate,
+ BaseIdentifier &BaseId) {
+ // The comparison can only be used once:
+ // - For intermediate blocks, as a branch condition.
+ // - For the final block, as an incoming value for the Phi.
+ // If there are any other uses of the comparison, we cannot merge it with
+ // other comparisons as we would create an orphan use of the value.
+ if (!CmpI->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << "cmp has several uses\n");
+ return {};
+ }
+ if (CmpI->getPredicate() != ExpectedPredicate)
+ return {};
+ LLVM_DEBUG(dbgs() << "cmp "
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+ << "\n");
+ auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId);
+ if (!Lhs.BaseId)
+ return {};
+ auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId);
+ if (!Rhs.BaseId)
+ return {};
+ const auto &DL = CmpI->getModule()->getDataLayout();
+ return BCECmpBlock(std::move(Lhs), std::move(Rhs),
+ DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
+}
+
+// Visit the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
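+//
+// Illustrative sketch (assumed IR shape, not from the original source):
+//   ; intermediate block: the compare feeds a conditional branch, and the phi
+//   ; receives the constant 'false' from this block
+//   %cmp = icmp eq i32 %lhs, %rhs
+//   br i1 %cmp, label %next_cmp_block, label %phi_block
+//   ; last block: the compare itself is the phi's incoming value
+//   %cmp.last = icmp eq i32 %lhs.last, %rhs.last
+//   br label %phi_block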
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+ const BasicBlock *const PhiBlock,
+ BaseIdentifier &BaseId) {
+ if (Block->empty()) return {};
+ auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+ if (!BranchI) return {};
+ LLVM_DEBUG(dbgs() << "branch\n");
+ if (BranchI->isUnconditional()) {
+ // In this case, we expect an incoming value which is the result of the
+ // comparison. This is the last link in the chain of comparisons (note
+ // that this does not mean that this is the last incoming value, blocks
+ // can be reordered).
+ auto *const CmpI = dyn_cast<ICmpInst>(Val);
+ if (!CmpI) return {};
+ LLVM_DEBUG(dbgs() << "icmp\n");
+ auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ, BaseId);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ } else {
+ // In this case, we expect a constant incoming value (the comparison is
+ // chained).
const auto *const Const = cast<ConstantInt>(Val);
- LLVM_DEBUG(dbgs() << "const\n");
- if (!Const->isZero()) return {};
- LLVM_DEBUG(dbgs() << "false\n");
- auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
- if (!CmpI) return {};
- LLVM_DEBUG(dbgs() << "icmp\n");
- assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
- BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
- auto Result = visitICmp(
- CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- BaseId);
- Result.CmpI = CmpI;
- Result.BranchI = BranchI;
- return Result;
- }
- return {};
-}
-
-static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
- BCECmpBlock &&Comparison) {
- LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
- << "': Found cmp of " << Comparison.SizeBits()
- << " bits between " << Comparison.Lhs().BaseId << " + "
- << Comparison.Lhs().Offset << " and "
- << Comparison.Rhs().BaseId << " + "
- << Comparison.Rhs().Offset << "\n");
- LLVM_DEBUG(dbgs() << "\n");
- Comparisons.push_back(std::move(Comparison));
-}
-
-// A chain of comparisons.
-class BCECmpChain {
- public:
- BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
- AliasAnalysis &AA);
-
- int size() const { return Comparisons_.size(); }
-
-#ifdef MERGEICMPS_DOT_ON
- void dump() const;
-#endif // MERGEICMPS_DOT_ON
-
- bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU);
-
-private:
- static bool IsContiguous(const BCECmpBlock &First,
- const BCECmpBlock &Second) {
- return First.Lhs().BaseId == Second.Lhs().BaseId &&
- First.Rhs().BaseId == Second.Rhs().BaseId &&
- First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
- First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
- }
-
- PHINode &Phi_;
- std::vector<BCECmpBlock> Comparisons_;
- // The original entry block (before sorting);
- BasicBlock *EntryBlock_;
-};
-
-BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
- AliasAnalysis &AA)
- : Phi_(Phi) {
- assert(!Blocks.empty() && "a chain should have at least one block");
- // Now look inside blocks to check for BCE comparisons.
- std::vector<BCECmpBlock> Comparisons;
- BaseIdentifier BaseId;
- for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
- BasicBlock *const Block = Blocks[BlockIdx];
- assert(Block && "invalid block");
- BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
- Block, Phi.getParent(), BaseId);
- Comparison.BB = Block;
- if (!Comparison.IsValid()) {
- LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
- return;
- }
- if (Comparison.doesOtherWork()) {
- LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
- << "' does extra work besides compare\n");
- if (Comparisons.empty()) {
- // This is the initial block in the chain, in case this block does other
- // work, we can try to split the block and move the irrelevant
- // instructions to the predecessor.
- //
- // If this is not the initial block in the chain, splitting it wont
- // work.
- //
- // As once split, there will still be instructions before the BCE cmp
- // instructions that do other work in program order, i.e. within the
- // chain before sorting. Unless we can abort the chain at this point
- // and start anew.
- //
- // NOTE: we only handle blocks a with single predecessor for now.
- if (Comparison.canSplit(AA)) {
- LLVM_DEBUG(dbgs()
- << "Split initial block '" << Comparison.BB->getName()
- << "' that does extra work besides compare\n");
- Comparison.RequireSplit = true;
- enqueueBlock(Comparisons, std::move(Comparison));
- } else {
- LLVM_DEBUG(dbgs()
- << "ignoring initial block '" << Comparison.BB->getName()
- << "' that does extra work besides compare\n");
- }
- continue;
- }
- // TODO(courbet): Right now we abort the whole chain. We could be
- // merging only the blocks that don't do other work and resume the
- // chain from there. For example:
- // if (a[0] == b[0]) { // bb1
- // if (a[1] == b[1]) { // bb2
- // some_value = 3; //bb3
- // if (a[2] == b[2]) { //bb3
- // do a ton of stuff //bb4
- // }
- // }
- // }
- //
- // This is:
- //
- // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
- // \ \ \ \
- // ne ne ne \
- // \ \ \ v
- // +------------+-----------+----------> bb_phi
- //
- // We can only merge the first two comparisons, because bb3* does
- // "other work" (setting some_value to 3).
- // We could still merge bb1 and bb2 though.
- return;
- }
- enqueueBlock(Comparisons, std::move(Comparison));
- }
-
- // It is possible we have no suitable comparison to merge.
- if (Comparisons.empty()) {
- LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
- return;
- }
- EntryBlock_ = Comparisons[0].BB;
- Comparisons_ = std::move(Comparisons);
-#ifdef MERGEICMPS_DOT_ON
- errs() << "BEFORE REORDERING:\n\n";
- dump();
-#endif // MERGEICMPS_DOT_ON
- // Reorder blocks by LHS. We can do that without changing the
- // semantics because we are only accessing dereferencable memory.
- llvm::sort(Comparisons_,
- [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
- return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) <
- std::tie(RhsBlock.Lhs(), RhsBlock.Rhs());
- });
-#ifdef MERGEICMPS_DOT_ON
- errs() << "AFTER REORDERING:\n\n";
- dump();
-#endif // MERGEICMPS_DOT_ON
-}
-
-#ifdef MERGEICMPS_DOT_ON
-void BCECmpChain::dump() const {
- errs() << "digraph dag {\n";
- errs() << " graph [bgcolor=transparent];\n";
- errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
- errs() << " edge [color=black];\n";
- for (size_t I = 0; I < Comparisons_.size(); ++I) {
- const auto &Comparison = Comparisons_[I];
- errs() << " \"" << I << "\" [label=\"%"
- << Comparison.Lhs().Base()->getName() << " + "
- << Comparison.Lhs().Offset << " == %"
- << Comparison.Rhs().Base()->getName() << " + "
- << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
- << " bytes)\"];\n";
- const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
- if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
- errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
- }
- errs() << " \"Phi\" [label=\"Phi\"];\n";
- errs() << "}\n\n";
-}
-#endif // MERGEICMPS_DOT_ON
-
-namespace {
-
-// A class to compute the name of a set of merged basic blocks.
-// This is optimized for the common case of no block names.
-class MergedBlockName {
- // Storage for the uncommon case of several named blocks.
- SmallString<16> Scratch;
-
-public:
- explicit MergedBlockName(ArrayRef<BCECmpBlock> Comparisons)
- : Name(makeName(Comparisons)) {}
- const StringRef Name;
-
-private:
- StringRef makeName(ArrayRef<BCECmpBlock> Comparisons) {
- assert(!Comparisons.empty() && "no basic block");
- // Fast path: only one block, or no names at all.
- if (Comparisons.size() == 1)
- return Comparisons[0].BB->getName();
- const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
- [](int i, const BCECmpBlock &Cmp) {
- return i + Cmp.BB->getName().size();
- });
- if (size == 0)
- return StringRef("", 0);
-
- // Slow path: at least two blocks, at least one block with a name.
- Scratch.clear();
- // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for
- // separators.
- Scratch.reserve(size + Comparisons.size() - 1);
- const auto append = [this](StringRef str) {
- Scratch.append(str.begin(), str.end());
- };
- append(Comparisons[0].BB->getName());
- for (int I = 1, E = Comparisons.size(); I < E; ++I) {
- const BasicBlock *const BB = Comparisons[I].BB;
- if (!BB->getName().empty()) {
- append("+");
- append(BB->getName());
- }
- }
- return StringRef(Scratch);
- }
-};
-} // namespace
-
-// Merges the given contiguous comparison blocks into one memcmp block.
-static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
- BasicBlock *const InsertBefore,
- BasicBlock *const NextCmpBlock,
- PHINode &Phi, const TargetLibraryInfo &TLI,
- AliasAnalysis &AA, DomTreeUpdater &DTU) {
- assert(!Comparisons.empty() && "merging zero comparisons");
- LLVMContext &Context = NextCmpBlock->getContext();
- const BCECmpBlock &FirstCmp = Comparisons[0];
-
- // Create a new cmp block before next cmp block.
- BasicBlock *const BB =
- BasicBlock::Create(Context, MergedBlockName(Comparisons).Name,
- NextCmpBlock->getParent(), InsertBefore);
- IRBuilder<> Builder(BB);
- // Add the GEPs from the first BCECmpBlock.
- Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone());
- Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone());
-
- Value *IsEqual = nullptr;
- LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "const\n");
+ if (!Const->isZero()) return {};
+ LLVM_DEBUG(dbgs() << "false\n");
+ auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+ if (!CmpI) return {};
+ LLVM_DEBUG(dbgs() << "icmp\n");
+ assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+ BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+ auto Result = visitICmp(
+ CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ BaseId);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ }
+ return {};
+}
+
+static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
+ BCECmpBlock &&Comparison) {
+ LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
+ << "': Found cmp of " << Comparison.SizeBits()
+ << " bits between " << Comparison.Lhs().BaseId << " + "
+ << Comparison.Lhs().Offset << " and "
+ << Comparison.Rhs().BaseId << " + "
+ << Comparison.Rhs().Offset << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
+ Comparisons.push_back(std::move(Comparison));
+}
+
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+ BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+ AliasAnalysis &AA);
+
+ int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+ void dump() const;
+#endif // MERGEICMPS_DOT_ON
+
+ bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
+ DomTreeUpdater &DTU);
+
+private:
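+  // Two comparisons are contiguous when the second one starts exactly where
+  // the first one ends, on both sides. Illustrative sketch (not from the
+  // original source): a 32-bit compare of (a+0, b+0) followed by a 16-bit
+  // compare of (a+4, b+4) is contiguous, because 0 + 32/8 == 4 on both sides;
+  // a follow-up at offset 6 would not be.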
+ static bool IsContiguous(const BCECmpBlock &First,
+ const BCECmpBlock &Second) {
+ return First.Lhs().BaseId == Second.Lhs().BaseId &&
+ First.Rhs().BaseId == Second.Rhs().BaseId &&
+ First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+ First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+ }
+
+ PHINode &Phi_;
+ std::vector<BCECmpBlock> Comparisons_;
+  // The original entry block (before sorting).
+ BasicBlock *EntryBlock_;
+};
+
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+ AliasAnalysis &AA)
+ : Phi_(Phi) {
+ assert(!Blocks.empty() && "a chain should have at least one block");
+ // Now look inside blocks to check for BCE comparisons.
+ std::vector<BCECmpBlock> Comparisons;
+ BaseIdentifier BaseId;
+ for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
+ BasicBlock *const Block = Blocks[BlockIdx];
+ assert(Block && "invalid block");
+ BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+ Block, Phi.getParent(), BaseId);
+ Comparison.BB = Block;
+ if (!Comparison.IsValid()) {
+ LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
+ return;
+ }
+ if (Comparison.doesOtherWork()) {
+ LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
+ << "' does extra work besides compare\n");
+ if (Comparisons.empty()) {
+        // This is the initial block in the chain; in case this block does
+        // other work, we can try to split the block and move the irrelevant
+        // instructions to the predecessor.
+        //
+        // If this is not the initial block in the chain, splitting it won't
+        // work: once split, there would still be instructions that do other
+        // work before the BCE cmp instructions in program order, i.e. within
+        // the chain before sorting, unless we could abort the chain at this
+        // point and start anew.
+        //
+        // NOTE: we only handle blocks with a single predecessor for now.
+ if (Comparison.canSplit(AA)) {
+ LLVM_DEBUG(dbgs()
+ << "Split initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ Comparison.RequireSplit = true;
+ enqueueBlock(Comparisons, std::move(Comparison));
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "ignoring initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ }
+ continue;
+ }
+ // TODO(courbet): Right now we abort the whole chain. We could be
+ // merging only the blocks that don't do other work and resume the
+ // chain from there. For example:
+ // if (a[0] == b[0]) { // bb1
+ // if (a[1] == b[1]) { // bb2
+ // some_value = 3; //bb3
+ // if (a[2] == b[2]) { //bb3
+ // do a ton of stuff //bb4
+ // }
+ // }
+ // }
+ //
+ // This is:
+ //
+ // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // We can only merge the first two comparisons, because bb3* does
+ // "other work" (setting some_value to 3).
+ // We could still merge bb1 and bb2 though.
+ return;
+ }
+ enqueueBlock(Comparisons, std::move(Comparison));
+ }
+
+ // It is possible we have no suitable comparison to merge.
+ if (Comparisons.empty()) {
+ LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
+ return;
+ }
+ EntryBlock_ = Comparisons[0].BB;
+ Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "BEFORE REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+ // Reorder blocks by LHS. We can do that without changing the
+  // semantics because we are only accessing dereferenceable memory.
+ llvm::sort(Comparisons_,
+ [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
+ return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) <
+ std::tie(RhsBlock.Lhs(), RhsBlock.Rhs());
+ });
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "AFTER REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+}
+
+#ifdef MERGEICMPS_DOT_ON
+void BCECmpChain::dump() const {
+ errs() << "digraph dag {\n";
+ errs() << " graph [bgcolor=transparent];\n";
+ errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+ errs() << " edge [color=black];\n";
+ for (size_t I = 0; I < Comparisons_.size(); ++I) {
+ const auto &Comparison = Comparisons_[I];
+ errs() << " \"" << I << "\" [label=\"%"
+ << Comparison.Lhs().Base()->getName() << " + "
+ << Comparison.Lhs().Offset << " == %"
+ << Comparison.Rhs().Base()->getName() << " + "
+ << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
+ << " bytes)\"];\n";
+ const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
+ if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
+ errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
+ }
+ errs() << " \"Phi\" [label=\"Phi\"];\n";
+ errs() << "}\n\n";
+}
+#endif // MERGEICMPS_DOT_ON
+
+namespace {
+
+// A class to compute the name of a set of merged basic blocks.
+// This is optimized for the common case of no block names.
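+// Illustrative sketch (not from the original source): merging blocks named
+// "land.lhs.true", "" and "land.rhs" yields "land.lhs.true+land.rhs"; unnamed
+// blocks contribute nothing to the merged name.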
+class MergedBlockName {
+ // Storage for the uncommon case of several named blocks.
+ SmallString<16> Scratch;
+
+public:
+ explicit MergedBlockName(ArrayRef<BCECmpBlock> Comparisons)
+ : Name(makeName(Comparisons)) {}
+ const StringRef Name;
+
+private:
+ StringRef makeName(ArrayRef<BCECmpBlock> Comparisons) {
+ assert(!Comparisons.empty() && "no basic block");
+ // Fast path: only one block, or no names at all.
+ if (Comparisons.size() == 1)
+ return Comparisons[0].BB->getName();
+ const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
+ [](int i, const BCECmpBlock &Cmp) {
+ return i + Cmp.BB->getName().size();
+ });
+ if (size == 0)
+ return StringRef("", 0);
+
+ // Slow path: at least two blocks, at least one block with a name.
+ Scratch.clear();
+ // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for
+ // separators.
+ Scratch.reserve(size + Comparisons.size() - 1);
+ const auto append = [this](StringRef str) {
+ Scratch.append(str.begin(), str.end());
+ };
+ append(Comparisons[0].BB->getName());
+ for (int I = 1, E = Comparisons.size(); I < E; ++I) {
+ const BasicBlock *const BB = Comparisons[I].BB;
+ if (!BB->getName().empty()) {
+ append("+");
+ append(BB->getName());
+ }
+ }
+ return StringRef(Scratch);
+ }
+};
+} // namespace
+
+// Merges the given contiguous comparison blocks into one memcmp block.
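+// Illustrative sketch (not from the original source): two contiguous 32-bit
+// compares of (a+0, b+0) and (a+4, b+4) become one block that computes
+//   memcmp(a, b, 8) == 0
+// which ExpandMemCmp can later lower to a single wide integer compare.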
+static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const InsertBefore,
+ BasicBlock *const NextCmpBlock,
+ PHINode &Phi, const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA, DomTreeUpdater &DTU) {
+ assert(!Comparisons.empty() && "merging zero comparisons");
+ LLVMContext &Context = NextCmpBlock->getContext();
+ const BCECmpBlock &FirstCmp = Comparisons[0];
+
+ // Create a new cmp block before next cmp block.
+ BasicBlock *const BB =
+ BasicBlock::Create(Context, MergedBlockName(Comparisons).Name,
+ NextCmpBlock->getParent(), InsertBefore);
+ IRBuilder<> Builder(BB);
+ // Add the GEPs from the first BCECmpBlock.
+ Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone());
+ Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone());
+
+ Value *IsEqual = nullptr;
+ LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> "
+ << BB->getName() << "\n");
// If there is one block that requires splitting, we do it now, i.e.
// just before we know we will collapse the chain. The instructions
@@ -635,312 +635,312 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
ToSplit->split(BB, AA);
}
- if (Comparisons.size() == 1) {
- LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
- Value *const LhsLoad =
- Builder.CreateLoad(FirstCmp.Lhs().LoadI->getType(), Lhs);
- Value *const RhsLoad =
- Builder.CreateLoad(FirstCmp.Rhs().LoadI->getType(), Rhs);
- // There are no blocks to merge, just do the comparison.
- IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
- } else {
- const unsigned TotalSizeBits = std::accumulate(
- Comparisons.begin(), Comparisons.end(), 0u,
- [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); });
-
- // Create memcmp() == 0.
- const auto &DL = Phi.getModule()->getDataLayout();
- Value *const MemCmpCall = emitMemCmp(
- Lhs, Rhs,
- ConstantInt::get(DL.getIntPtrType(Context), TotalSizeBits / 8), Builder,
- DL, &TLI);
- IsEqual = Builder.CreateICmpEQ(
- MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
- }
-
- BasicBlock *const PhiBB = Phi.getParent();
- // Add a branch to the next basic block in the chain.
- if (NextCmpBlock == PhiBB) {
- // Continue to phi, passing it the comparison result.
- Builder.CreateBr(PhiBB);
- Phi.addIncoming(IsEqual, BB);
- DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
- } else {
- // Continue to next block if equal, exit to phi else.
- Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
- Phi.addIncoming(ConstantInt::getFalse(Context), BB);
- DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
- {DominatorTree::Insert, BB, PhiBB}});
- }
- return BB;
-}
-
-bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU) {
- assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain");
- // First pass to check if there is at least one merge. If not, we don't do
- // anything and we keep analysis passes intact.
- const auto AtLeastOneMerged = [this]() {
- for (size_t I = 1; I < Comparisons_.size(); ++I) {
- if (IsContiguous(Comparisons_[I - 1], Comparisons_[I]))
- return true;
- }
- return false;
- };
- if (!AtLeastOneMerged())
- return false;
-
- LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block "
- << EntryBlock_->getName() << "\n");
-
- // Effectively merge blocks. We go in the reverse direction from the phi block
- // so that the next block is always available to branch to.
- const auto mergeRange = [this, &TLI, &AA, &DTU](int I, int Num,
- BasicBlock *InsertBefore,
- BasicBlock *Next) {
- return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num),
- InsertBefore, Next, Phi_, TLI, AA, DTU);
- };
- int NumMerged = 1;
- BasicBlock *NextCmpBlock = Phi_.getParent();
- for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; --I) {
- if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) {
- LLVM_DEBUG(dbgs() << "Merging block " << Comparisons_[I].BB->getName()
- << " into " << Comparisons_[I + 1].BB->getName()
- << "\n");
- ++NumMerged;
- } else {
- NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock, NextCmpBlock);
- NumMerged = 1;
- }
- }
- // Insert the entry block for the new chain before the old entry block.
- // If the old entry block was the function entry, this ensures that the new
- // entry can become the function entry.
- NextCmpBlock = mergeRange(0, NumMerged, EntryBlock_, NextCmpBlock);
-
- // Replace the original cmp chain with the new cmp chain by pointing all
- // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp
- // blocks in the old chain unreachable.
- while (!pred_empty(EntryBlock_)) {
- BasicBlock* const Pred = *pred_begin(EntryBlock_);
- LLVM_DEBUG(dbgs() << "Updating jump into old chain from " << Pred->getName()
- << "\n");
- Pred->getTerminator()->replaceUsesOfWith(EntryBlock_, NextCmpBlock);
- DTU.applyUpdates({{DominatorTree::Delete, Pred, EntryBlock_},
- {DominatorTree::Insert, Pred, NextCmpBlock}});
- }
-
- // If the old cmp chain was the function entry, we need to update the function
- // entry.
- const bool ChainEntryIsFnEntry =
- (EntryBlock_ == &EntryBlock_->getParent()->getEntryBlock());
- if (ChainEntryIsFnEntry && DTU.hasDomTree()) {
- LLVM_DEBUG(dbgs() << "Changing function entry from "
- << EntryBlock_->getName() << " to "
- << NextCmpBlock->getName() << "\n");
- DTU.getDomTree().setNewRoot(NextCmpBlock);
- DTU.applyUpdates({{DominatorTree::Delete, NextCmpBlock, EntryBlock_}});
- }
- EntryBlock_ = nullptr;
-
- // Delete merged blocks. This also removes incoming values in phi.
- SmallVector<BasicBlock *, 16> DeadBlocks;
- for (auto &Cmp : Comparisons_) {
- LLVM_DEBUG(dbgs() << "Deleting merged block " << Cmp.BB->getName() << "\n");
- DeadBlocks.push_back(Cmp.BB);
- }
- DeleteDeadBlocks(DeadBlocks, &DTU);
-
- Comparisons_.clear();
- return true;
-}
-
-std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
- BasicBlock *const LastBlock,
- int NumBlocks) {
- // Walk up from the last block to find other blocks.
- std::vector<BasicBlock *> Blocks(NumBlocks);
- assert(LastBlock && "invalid last block");
- BasicBlock *CurBlock = LastBlock;
- for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
- if (CurBlock->hasAddressTaken()) {
- // Somebody is jumping to the block through an address, all bets are
- // off.
- LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has its address taken\n");
- return {};
- }
- Blocks[BlockIndex] = CurBlock;
- auto *SinglePredecessor = CurBlock->getSinglePredecessor();
- if (!SinglePredecessor) {
- // The block has two or more predecessors.
- LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has two or more predecessors\n");
- return {};
- }
- if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
- // The block does not link back to the phi.
- LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
- << " does not link back to the phi\n");
- return {};
- }
- CurBlock = SinglePredecessor;
- }
- Blocks[0] = CurBlock;
- return Blocks;
-}
-
-bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU) {
- LLVM_DEBUG(dbgs() << "processPhi()\n");
- if (Phi.getNumIncomingValues() <= 1) {
- LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
- return false;
- }
- // We are looking for something that has the following structure:
- // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
- // \ \ \ \
- // ne ne ne \
- // \ \ \ v
- // +------------+-----------+----------> bb_phi
- //
- // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
- // It's the only block that contributes a non-constant value to the Phi.
- // - All other blocks (b1, b2, b3) must have exactly two successors, one of
- // them being the phi block.
- // - All intermediate blocks (bb2, bb3) must have only one predecessor.
- // - Blocks cannot do other work besides the comparison, see doesOtherWork()
-
- // The blocks are not necessarily ordered in the phi, so we start from the
- // last block and reconstruct the order.
- BasicBlock *LastBlock = nullptr;
- for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
- if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
- if (LastBlock) {
- // There are several non-constant values.
- LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
- return false;
- }
- if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
- cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
- Phi.getIncomingBlock(I)) {
- // Non-constant incoming value is not from a cmp instruction or not
- // produced by the last block. We could end up processing the value
- // producing block more than once.
- //
- // This is an uncommon case, so we bail.
- LLVM_DEBUG(
- dbgs()
- << "skip: non-constant value not from cmp or not from last block.\n");
- return false;
- }
- LastBlock = Phi.getIncomingBlock(I);
- }
- if (!LastBlock) {
- // There is no non-constant block.
- LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
- return false;
- }
- if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
- LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
- return false;
- }
-
- const auto Blocks =
- getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
- if (Blocks.empty()) return false;
- BCECmpChain CmpChain(Blocks, Phi, AA);
-
- if (CmpChain.size() < 2) {
- LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
- return false;
- }
-
- return CmpChain.simplify(TLI, AA, DTU);
-}
-
-static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
- const TargetTransformInfo &TTI, AliasAnalysis &AA,
- DominatorTree *DT) {
- LLVM_DEBUG(dbgs() << "MergeICmpsLegacyPass: " << F.getName() << "\n");
-
- // We only try merging comparisons if the target wants to expand memcmp later.
- // The rationale is to avoid turning small chains into memcmp calls.
- if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
- return false;
-
-  // If we don't have memcmp available, we can't emit calls to it.
- if (!TLI.has(LibFunc_memcmp))
- return false;
-
- DomTreeUpdater DTU(DT, /*PostDominatorTree*/ nullptr,
- DomTreeUpdater::UpdateStrategy::Eager);
-
- bool MadeChange = false;
-
- for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
- // A Phi operation is always first in a basic block.
- if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
- MadeChange |= processPhi(*Phi, TLI, AA, DTU);
- }
-
- return MadeChange;
-}
-
-class MergeICmpsLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- MergeICmpsLegacyPass() : FunctionPass(ID) {
- initializeMergeICmpsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F)) return false;
- const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- // MergeICmps does not need the DominatorTree, but we update it if it's
- // already available.
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- return runImpl(F, TLI, TTI, AA, DTWP ? &DTWP->getDomTree() : nullptr);
- }
-
- private:
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-};
-
-} // namespace
-
-char MergeICmpsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(MergeICmpsLegacyPass, "mergeicmps",
- "Merge contiguous icmps into a memcmp", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MergeICmpsLegacyPass, "mergeicmps",
- "Merge contiguous icmps into a memcmp", false, false)
-
-Pass *llvm::createMergeICmpsLegacyPass() { return new MergeICmpsLegacyPass(); }
-
-PreservedAnalyses MergeICmpsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- const bool MadeChanges = runImpl(F, TLI, TTI, AA, DT);
- if (!MadeChanges)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
+ if (Comparisons.size() == 1) {
+ LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ Value *const LhsLoad =
+ Builder.CreateLoad(FirstCmp.Lhs().LoadI->getType(), Lhs);
+ Value *const RhsLoad =
+ Builder.CreateLoad(FirstCmp.Rhs().LoadI->getType(), Rhs);
+ // There are no blocks to merge, just do the comparison.
+ IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
+ } else {
+ const unsigned TotalSizeBits = std::accumulate(
+ Comparisons.begin(), Comparisons.end(), 0u,
+ [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); });
+
+ // Create memcmp() == 0.
+ const auto &DL = Phi.getModule()->getDataLayout();
+ Value *const MemCmpCall = emitMemCmp(
+ Lhs, Rhs,
+ ConstantInt::get(DL.getIntPtrType(Context), TotalSizeBits / 8), Builder,
+ DL, &TLI);
+ IsEqual = Builder.CreateICmpEQ(
+ MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
+ }
+
+ BasicBlock *const PhiBB = Phi.getParent();
+ // Add a branch to the next basic block in the chain.
+ if (NextCmpBlock == PhiBB) {
+ // Continue to phi, passing it the comparison result.
+ Builder.CreateBr(PhiBB);
+ Phi.addIncoming(IsEqual, BB);
+ DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
+ } else {
+ // Continue to next block if equal, exit to phi else.
+ Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
+ {DominatorTree::Insert, BB, PhiBB}});
+ }
+ return BB;
+}
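For reference, the merged form produced here is equivalent to the following plain C++ sketch (not taken from this file; the function name is made up). The chain's total width in bits is divided by 8 and passed to memcmp, and the call's result is compared against zero, mirroring the emitMemCmp plus CreateICmpEQ sequence above.

#include <cstring>

// Equivalent of the merged comparison: one memcmp over the contiguous
// region covered by the chain, then an equality test against 0.
bool mergedEqual(const void *Lhs, const void *Rhs, unsigned TotalSizeBits) {
  return std::memcmp(Lhs, Rhs, TotalSizeBits / 8) == 0;
}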
+
+bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
+ DomTreeUpdater &DTU) {
+ assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain");
+ // First pass to check if there is at least one merge. If not, we don't do
+ // anything and we keep analysis passes intact.
+ const auto AtLeastOneMerged = [this]() {
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (IsContiguous(Comparisons_[I - 1], Comparisons_[I]))
+ return true;
+ }
+ return false;
+ };
+ if (!AtLeastOneMerged())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block "
+ << EntryBlock_->getName() << "\n");
+
+ // Effectively merge blocks. We go in the reverse direction from the phi block
+ // so that the next block is always available to branch to.
+ const auto mergeRange = [this, &TLI, &AA, &DTU](int I, int Num,
+ BasicBlock *InsertBefore,
+ BasicBlock *Next) {
+ return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num),
+ InsertBefore, Next, Phi_, TLI, AA, DTU);
+ };
+ int NumMerged = 1;
+ BasicBlock *NextCmpBlock = Phi_.getParent();
+ for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; --I) {
+ if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) {
+ LLVM_DEBUG(dbgs() << "Merging block " << Comparisons_[I].BB->getName()
+ << " into " << Comparisons_[I + 1].BB->getName()
+ << "\n");
+ ++NumMerged;
+ } else {
+ NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock, NextCmpBlock);
+ NumMerged = 1;
+ }
+ }
+ // Insert the entry block for the new chain before the old entry block.
+ // If the old entry block was the function entry, this ensures that the new
+ // entry can become the function entry.
+ NextCmpBlock = mergeRange(0, NumMerged, EntryBlock_, NextCmpBlock);
+
+ // Replace the original cmp chain with the new cmp chain by pointing all
+ // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp
+ // blocks in the old chain unreachable.
+ while (!pred_empty(EntryBlock_)) {
+ BasicBlock* const Pred = *pred_begin(EntryBlock_);
+ LLVM_DEBUG(dbgs() << "Updating jump into old chain from " << Pred->getName()
+ << "\n");
+ Pred->getTerminator()->replaceUsesOfWith(EntryBlock_, NextCmpBlock);
+ DTU.applyUpdates({{DominatorTree::Delete, Pred, EntryBlock_},
+ {DominatorTree::Insert, Pred, NextCmpBlock}});
+ }
+
+ // If the old cmp chain was the function entry, we need to update the function
+ // entry.
+ const bool ChainEntryIsFnEntry =
+ (EntryBlock_ == &EntryBlock_->getParent()->getEntryBlock());
+ if (ChainEntryIsFnEntry && DTU.hasDomTree()) {
+ LLVM_DEBUG(dbgs() << "Changing function entry from "
+ << EntryBlock_->getName() << " to "
+ << NextCmpBlock->getName() << "\n");
+ DTU.getDomTree().setNewRoot(NextCmpBlock);
+ DTU.applyUpdates({{DominatorTree::Delete, NextCmpBlock, EntryBlock_}});
+ }
+ EntryBlock_ = nullptr;
+
+ // Delete merged blocks. This also removes incoming values in phi.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (auto &Cmp : Comparisons_) {
+ LLVM_DEBUG(dbgs() << "Deleting merged block " << Cmp.BB->getName() << "\n");
+ DeadBlocks.push_back(Cmp.BB);
+ }
+ DeleteDeadBlocks(DeadBlocks, &DTU);
+
+ Comparisons_.clear();
+ return true;
+}
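The scheduling in the loop above can be read on its own; the standalone sketch below (illustrative only, with invented names) mirrors it: walking the comparisons backwards, contiguous neighbours accumulate into the current range, a range is emitted whenever contiguity breaks, and a final range covering the chain's head is emitted at the end, in the same back-to-front order in which mergeRange is invoked.

#include <vector>

struct MergeRange {
  int First; // index of the first comparison in the range
  int Num;   // number of comparisons folded into it
};

// ContiguousWithNext[I] is true when comparison I is contiguous with I + 1.
std::vector<MergeRange>
planMerges(const std::vector<bool> &ContiguousWithNext) {
  const int NumComparisons = static_cast<int>(ContiguousWithNext.size()) + 1;
  std::vector<MergeRange> Ranges;
  int NumMerged = 1;
  for (int I = NumComparisons - 2; I >= 0; --I) {
    if (ContiguousWithNext[I]) {
      ++NumMerged; // keep growing the current range
    } else {
      Ranges.push_back({I + 1, NumMerged}); // contiguity broke: emit the range
      NumMerged = 1;
    }
  }
  Ranges.push_back({0, NumMerged}); // the range that contains the entry block
  return Ranges;
}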
+
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+ BasicBlock *const LastBlock,
+ int NumBlocks) {
+ // Walk up from the last block to find other blocks.
+ std::vector<BasicBlock *> Blocks(NumBlocks);
+ assert(LastBlock && "invalid last block");
+ BasicBlock *CurBlock = LastBlock;
+ for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
+ if (CurBlock->hasAddressTaken()) {
+ // Somebody is jumping to the block through an address, all bets are
+ // off.
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has its address taken\n");
+ return {};
+ }
+ Blocks[BlockIndex] = CurBlock;
+ auto *SinglePredecessor = CurBlock->getSinglePredecessor();
+ if (!SinglePredecessor) {
+ // The block has two or more predecessors.
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has two or more predecessors\n");
+ return {};
+ }
+ if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
+ // The block does not link back to the phi.
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " does not link back to the phi\n");
+ return {};
+ }
+ CurBlock = SinglePredecessor;
+ }
+ Blocks[0] = CurBlock;
+ return Blocks;
+}
+
+bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
+ DomTreeUpdater &DTU) {
+ LLVM_DEBUG(dbgs() << "processPhi()\n");
+ if (Phi.getNumIncomingValues() <= 1) {
+ LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+ return false;
+ }
+ // We are looking for something that has the following structure:
+ // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+ // It's the only block that contributes a non-constant value to the Phi.
+ // - All other blocks (b1, b2, b3) must have exactly two successors, one of
+ // them being the phi block.
+ // - All intermediate blocks (bb2, bb3) must have only one predecessor.
+ // - Blocks cannot do other work besides the comparison, see doesOtherWork()
+
+ // The blocks are not necessarily ordered in the phi, so we start from the
+ // last block and reconstruct the order.
+ BasicBlock *LastBlock = nullptr;
+ for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
+ if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
+ if (LastBlock) {
+ // There are several non-constant values.
+ LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
+ return false;
+ }
+ if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
+ cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
+ Phi.getIncomingBlock(I)) {
+ // Non-constant incoming value is not from a cmp instruction or not
+ // produced by the last block. We could end up processing the value
+ // producing block more than once.
+ //
+ // This is an uncommon case, so we bail.
+ LLVM_DEBUG(
+ dbgs()
+ << "skip: non-constant value not from cmp or not from last block.\n");
+ return false;
+ }
+ LastBlock = Phi.getIncomingBlock(I);
+ }
+ if (!LastBlock) {
+ // There is no non-constant block.
+ LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
+ return false;
+ }
+ if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
+ LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
+ return false;
+ }
+
+ const auto Blocks =
+ getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
+ if (Blocks.empty()) return false;
+ BCECmpChain CmpChain(Blocks, Phi, AA);
+
+ if (CmpChain.size() < 2) {
+ LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
+ return false;
+ }
+
+ return CmpChain.simplify(TLI, AA, DTU);
+}
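A typical source pattern that produces this exact shape is a field-by-field equality test over a struct, as in the minimal sketch below (names are made up, and whether the chain actually forms depends on the frontend and earlier passes). Each && clause usually lowers to its own block that loads one field from each side and feeds an icmp eq into the phi; when the fields are contiguous in memory, the whole chain can collapse into a single memcmp of the compared bytes (12 here, assuming a 4-byte int).

struct Point {
  int X, Y, Z;
};

// Each comparison typically becomes load/load/icmp in its own block,
// chained by conditional branches into one phi -- the shape described above.
bool pointsEqual(const Point &A, const Point &B) {
  return A.X == B.X && A.Y == B.Y && A.Z == B.Z;
}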
+
+static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, AliasAnalysis &AA,
+ DominatorTree *DT) {
+ LLVM_DEBUG(dbgs() << "MergeICmpsLegacyPass: " << F.getName() << "\n");
+
+ // We only try merging comparisons if the target wants to expand memcmp later.
+ // The rationale is to avoid turning small chains into memcmp calls.
+ if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
+ return false;
+
+  // If we don't have memcmp available, we can't emit calls to it.
+ if (!TLI.has(LibFunc_memcmp))
+ return false;
+
+ DomTreeUpdater DTU(DT, /*PostDominatorTree*/ nullptr,
+ DomTreeUpdater::UpdateStrategy::Eager);
+
+ bool MadeChange = false;
+
+ for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
+ // A Phi operation is always first in a basic block.
+ if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
+ MadeChange |= processPhi(*Phi, TLI, AA, DTU);
+ }
+
+ return MadeChange;
+}
+
+class MergeICmpsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ MergeICmpsLegacyPass() : FunctionPass(ID) {
+ initializeMergeICmpsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F)) return false;
+ const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ // MergeICmps does not need the DominatorTree, but we update it if it's
+ // already available.
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ return runImpl(F, TLI, TTI, AA, DTWP ? &DTWP->getDomTree() : nullptr);
+ }
+
+ private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+
+} // namespace
+
+char MergeICmpsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(MergeICmpsLegacyPass, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MergeICmpsLegacyPass, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+
+Pass *llvm::createMergeICmpsLegacyPass() { return new MergeICmpsLegacyPass(); }
+
+PreservedAnalyses MergeICmpsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ const bool MadeChanges = runImpl(F, TLI, TTI, AA, DT);
+ if (!MadeChanges)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index ba6dac8ae8..69aa0cebe1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -1,423 +1,423 @@
-//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//! \file
-//! This pass performs merges of loads and stores on both sides of a
-// diamond (hammock). It hoists the loads and sinks the stores.
-//
-// The algorithm iteratively hoists two loads to the same address out of a
-// diamond (hammock) and merges them into a single load in the header.
-// Similarly, it sinks and merges two stores to the tail block (footer). The
-// algorithm iterates over the instructions of one side of the diamond and
-// attempts to find a matching load/store on the other side. A new tail/footer
-// block may be inserted if the tail/footer block has more predecessors (not
-// only the two predecessors that form the diamond). It hoists / sinks when it
-// thinks it is safe to do so. This optimization helps with e.g. hiding load
-// latencies, triggering if-conversion, and reducing static code size.
-//
-// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-// Example:
-// Diamond shaped code before merge:
-//
-// header:
-// br %cond, label %if.then, label %if.else
-// + +
-// + +
-// + +
-// if.then: if.else:
-// %lt = load %addr_l %le = load %addr_l
-// <use %lt> <use %le>
-// <...> <...>
-// store %st, %addr_s store %se, %addr_s
-// br label %if.end br label %if.end
-// + +
-// + +
-// + +
-// if.end ("footer"):
-// <...>
-//
-// Diamond shaped code after merge:
-//
-// header:
-// %l = load %addr_l
-// br %cond, label %if.then, label %if.else
-// + +
-// + +
-// + +
-// if.then: if.else:
-// <use %l> <use %l>
-// <...> <...>
-// br label %if.end br label %if.end
-// + +
-// + +
-// + +
-// if.end ("footer"):
-// %s.sink = phi [%st, if.then], [%se, if.else]
-// <...>
-// store %s.sink, %addr_s
-// <...>
-//
-//
-//===----------------------- TODO -----------------------------------------===//
-//
-// 1) Generalize to regions other than diamonds
-// 2) Be more aggressive merging memory operations
-// Note that both changes require register pressure control
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mldst-motion"
-
-namespace {
-//===----------------------------------------------------------------------===//
-// MergedLoadStoreMotion Pass
-//===----------------------------------------------------------------------===//
-class MergedLoadStoreMotion {
- AliasAnalysis *AA = nullptr;
-
- // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
- // where Size0 and Size1 are the #instructions on the two sides of
- // the diamond. The constant chosen here is arbitrary. Compiler Time
- // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
- const int MagicCompileTimeControl = 250;
-
- const bool SplitFooterBB;
-public:
- MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {}
- bool run(Function &F, AliasAnalysis &AA);
-
-private:
- BasicBlock *getDiamondTail(BasicBlock *BB);
- bool isDiamondHead(BasicBlock *BB);
- // Routines for sinking stores
- StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
- PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
- bool isStoreSinkBarrierInRange(const Instruction &Start,
- const Instruction &End, MemoryLocation Loc);
- bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const;
- void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand,
- StoreInst *ElseInst);
- bool mergeStores(BasicBlock *BB);
-};
-} // end anonymous namespace
-
-///
-/// Return tail block of a diamond.
-///
-BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
- assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
- return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor();
-}
-
-///
-/// True when BB is the head of a diamond (hammock)
-///
-bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
- if (!BB)
- return false;
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional())
- return false;
-
- BasicBlock *Succ0 = BI->getSuccessor(0);
- BasicBlock *Succ1 = BI->getSuccessor(1);
-
- if (!Succ0->getSinglePredecessor())
- return false;
- if (!Succ1->getSinglePredecessor())
- return false;
-
- BasicBlock *Succ0Succ = Succ0->getSingleSuccessor();
- BasicBlock *Succ1Succ = Succ1->getSingleSuccessor();
- // Ignore triangles.
- if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ)
- return false;
- return true;
-}
-
-
-///
-/// True when instruction is a sink barrier for a store
-/// located in Loc
-///
-/// Whenever an instruction could possibly read or modify the
-/// value being stored or protect against the store from
-/// happening it is considered a sink barrier.
-///
-bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
- const Instruction &End,
- MemoryLocation Loc) {
- for (const Instruction &Inst :
- make_range(Start.getIterator(), End.getIterator()))
- if (Inst.mayThrow())
- return true;
- return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef);
-}
-
-///
-/// Check if \p BB1 contains a store to the same address as \p Store0
-///
-/// \return The store in \p BB1 when it is safe to sink. Otherwise return nullptr.
-///
-StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
- StoreInst *Store0) {
- LLVM_DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
- BasicBlock *BB0 = Store0->getParent();
- for (Instruction &Inst : reverse(*BB1)) {
- auto *Store1 = dyn_cast<StoreInst>(&Inst);
- if (!Store1)
- continue;
-
- MemoryLocation Loc0 = MemoryLocation::get(Store0);
- MemoryLocation Loc1 = MemoryLocation::get(Store1);
- if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
- !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) &&
- !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) {
- return Store1;
- }
- }
- return nullptr;
-}
-
-///
-/// Create a PHI node in BB for the operands of S0 and S1
-///
-PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
- StoreInst *S1) {
- // Create a phi if the values mismatch.
- Value *Opd1 = S0->getValueOperand();
- Value *Opd2 = S1->getValueOperand();
- if (Opd1 == Opd2)
- return nullptr;
-
- auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
- &BB->front());
- NewPN->applyMergedLocation(S0->getDebugLoc(), S1->getDebugLoc());
- NewPN->addIncoming(Opd1, S0->getParent());
- NewPN->addIncoming(Opd2, S1->getParent());
- return NewPN;
-}
-
-///
-/// Check if 2 stores can be sunk together with corresponding GEPs
-///
-bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0,
- StoreInst *S1) const {
- auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
- return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
- (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
- (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0);
-}
-
-///
-/// Merge two stores to same address and sink into \p BB
-///
-/// Also sinks GEP instruction computing the store address
-///
-void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0,
- StoreInst *S1) {
- // Only one definition?
- auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
- LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
- // Hoist the instruction.
- BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
- // Intersect optional metadata.
- S0->andIRFlags(S1);
- S0->dropUnknownNonDebugMetadata();
-
- // Create the new store to be inserted at the join point.
- StoreInst *SNew = cast<StoreInst>(S0->clone());
- Instruction *ANew = A0->clone();
- SNew->insertBefore(&*InsertPt);
- ANew->insertBefore(SNew);
-
- assert(S0->getParent() == A0->getParent());
- assert(S1->getParent() == A1->getParent());
-
- // New PHI operand? Use it.
- if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
- SNew->setOperand(0, NewPN);
- S0->eraseFromParent();
- S1->eraseFromParent();
- A0->replaceAllUsesWith(ANew);
- A0->eraseFromParent();
- A1->replaceAllUsesWith(ANew);
- A1->eraseFromParent();
-}
-
-///
-/// True when two stores are equivalent and can sink into the footer
-///
-/// Starting from a diamond head block, iterate over the instructions in one
-/// successor block and try to match a store in the second successor.
-///
-bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) {
-
- bool MergedStores = false;
- BasicBlock *TailBB = getDiamondTail(HeadBB);
- BasicBlock *SinkBB = TailBB;
- assert(SinkBB && "Footer of a diamond cannot be empty");
-
- succ_iterator SI = succ_begin(HeadBB);
- assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors");
- BasicBlock *Pred0 = *SI;
- ++SI;
- assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor");
- BasicBlock *Pred1 = *SI;
- // tail block of a diamond/hammock?
- if (Pred0 == Pred1)
- return false; // No.
- // bail out early if we can not merge into the footer BB
- if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3))
- return false;
- // #Instructions in Pred1 for Compile Time Control
- auto InstsNoDbg = Pred1->instructionsWithoutDebug();
- int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
- int NStores = 0;
-
- for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
- RBI != RBE;) {
-
- Instruction *I = &*RBI;
- ++RBI;
-
- // Don't sink non-simple (atomic, volatile) stores.
- auto *S0 = dyn_cast<StoreInst>(I);
- if (!S0 || !S0->isSimple())
- continue;
-
- ++NStores;
- if (NStores * Size1 >= MagicCompileTimeControl)
- break;
- if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
- if (!canSinkStoresAndGEPs(S0, S1))
-        // Don't attempt to sink below stores that had to stick around.
-        // But after removal of a store and some of its feeding
-        // instructions, search again from the beginning since the iterator
-        // is likely stale at this point.
- break;
-
- if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) {
- // We have more than 2 predecessors. Insert a new block
- // postdominating 2 predecessors we're going to sink from.
- SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split");
- if (!SinkBB)
- break;
- }
-
- MergedStores = true;
- sinkStoresAndGEPs(SinkBB, S0, S1);
- RBI = Pred0->rbegin();
- RBE = Pred0->rend();
- LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
- }
- }
- return MergedStores;
-}
-
-bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
- this->AA = &AA;
-
- bool Changed = false;
- LLVM_DEBUG(dbgs() << "Instruction Merger\n");
-
- // Merge unconditional branches, allowing PRE to catch more
- // optimization opportunities.
- // This loop doesn't care about newly inserted/split blocks
- // since they never will be diamond heads.
- for (BasicBlock &BB : make_early_inc_range(F))
- // Hoist equivalent loads and sink stores
- // outside diamonds when possible
- if (isDiamondHead(&BB))
- Changed |= mergeStores(&BB);
- return Changed;
-}
-
-namespace {
-class MergedLoadStoreMotionLegacyPass : public FunctionPass {
- const bool SplitFooterBB;
-public:
- static char ID; // Pass identification, replacement for typeid
- MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false)
- : FunctionPass(ID), SplitFooterBB(SplitFooterBB) {
- initializeMergedLoadStoreMotionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- ///
- /// Run the transformation for each function
- ///
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- MergedLoadStoreMotion Impl(SplitFooterBB);
- return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
- }
-
-private:
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (!SplitFooterBB)
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-char MergedLoadStoreMotionLegacyPass::ID = 0;
-} // anonymous namespace
-
-///
-/// createMergedLoadStoreMotionPass - The public interface to this file.
-///
-FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) {
- return new MergedLoadStoreMotionLegacyPass(SplitFooterBB);
-}
-
-INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
- "MergedLoadStoreMotion", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
- "MergedLoadStoreMotion", false, false)
-
-PreservedAnalyses
-MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
- MergedLoadStoreMotion Impl(Options.SplitFooterBB);
- auto &AA = AM.getResult<AAManager>(F);
- if (!Impl.run(F, AA))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- if (!Options.SplitFooterBB)
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! This pass performs merges of loads and stores on both sides of a
+// diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header.
+// Similarly, it sinks and merges two stores to the tail block (footer). The
+// algorithm iterates over the instructions of one side of the diamond and
+// attempts to find a matching load/store on the other side. A new tail/footer
+// block may be inserted if the tail/footer block has more predecessors (not
+// only the two predecessors that form the diamond). It hoists / sinks when it
+// thinks it is safe to do so. This optimization helps with e.g. hiding load
+// latencies, triggering if-conversion, and reducing static code size.
+//
+// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Example:
+// Diamond shaped code before merge:
+//
+// header:
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// %lt = load %addr_l %le = load %addr_l
+// <use %lt> <use %le>
+// <...> <...>
+// store %st, %addr_s store %se, %addr_s
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// <...>
+//
+// Diamond shaped code after merge:
+//
+// header:
+// %l = load %addr_l
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// <use %l> <use %l>
+// <...> <...>
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// %s.sink = phi [%st, if.then], [%se, if.else]
+// <...>
+// store %s.sink, %addr_s
+// <...>
+//
+//
+//===----------------------- TODO -----------------------------------------===//
+//
+// 1) Generalize to regions other than diamonds
+// 2) Be more aggressive merging memory operations
+// Note that both changes require register pressure control
+//
+//===----------------------------------------------------------------------===//
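A minimal C++ shape that lowers to the diamond sketched above (illustrative only; whether the sink actually happens also depends on alias analysis and the legality checks implemented below): both branches store to the same address, so a single store can be kept in the footer, fed by a phi that selects between the two values.

// Both sides of the if/else store to *Out; the stores can be merged into a
// single store in the join block, with a phi choosing the stored value.
void selectStore(bool Cond, int *Out, int A, int B) {
  if (Cond)
    *Out = A + 1;
  else
    *Out = B - 1;
}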
+
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mldst-motion"
+
+namespace {
+//===----------------------------------------------------------------------===//
+// MergedLoadStoreMotion Pass
+//===----------------------------------------------------------------------===//
+class MergedLoadStoreMotion {
+ AliasAnalysis *AA = nullptr;
+
+ // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+ // where Size0 and Size1 are the #instructions on the two sides of
+ // the diamond. The constant chosen here is arbitrary. Compiler Time
+ // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
+ const int MagicCompileTimeControl = 250;
+
+ const bool SplitFooterBB;
+public:
+ MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {}
+ bool run(Function &F, AliasAnalysis &AA);
+
+private:
+ BasicBlock *getDiamondTail(BasicBlock *BB);
+ bool isDiamondHead(BasicBlock *BB);
+ // Routines for sinking stores
+ StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
+ PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
+ bool isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End, MemoryLocation Loc);
+ bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const;
+ void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand,
+ StoreInst *ElseInst);
+ bool mergeStores(BasicBlock *BB);
+};
+} // end anonymous namespace
+
+///
+/// Return tail block of a diamond.
+///
+BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
+ assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
+ return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor();
+}
+
+///
+/// True when BB is the head of a diamond (hammock)
+///
+bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
+ if (!BB)
+ return false;
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *Succ0 = BI->getSuccessor(0);
+ BasicBlock *Succ1 = BI->getSuccessor(1);
+
+ if (!Succ0->getSinglePredecessor())
+ return false;
+ if (!Succ1->getSinglePredecessor())
+ return false;
+
+ BasicBlock *Succ0Succ = Succ0->getSingleSuccessor();
+ BasicBlock *Succ1Succ = Succ1->getSingleSuccessor();
+ // Ignore triangles.
+ if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ)
+ return false;
+ return true;
+}
+
+
+///
+/// True when instruction is a sink barrier for a store
+/// located in Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored or protect against the store from
+/// happening it is considered a sink barrier.
+///
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End,
+ MemoryLocation Loc) {
+ for (const Instruction &Inst :
+ make_range(Start.getIterator(), End.getIterator()))
+ if (Inst.mayThrow())
+ return true;
+ return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef);
+}
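A concrete illustration of a barrier, assuming nothing is known about aliasing between the two pointers (the sketch is not from this file; the names are invented): the read through Q after the store in the then-branch may touch the stored location, so the range query above reports a possible reference and the store cannot be sunk past it into the footer.

// The load through Q between the store and the end of the block is a sink
// barrier whenever P and Q may alias; the store to *P has to stay put.
int barrierExample(bool Cond, int *P, int *Q, int A) {
  int R = 0;
  if (Cond) {
    *P = A;
    R = *Q; // may observe the value just stored through P
  } else {
    *P = A + 1;
  }
  return R;
}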
+
+///
+/// Check if \p BB1 contains a store to the same address as \p Store0
+///
+/// \return The store in \p BB1 when it is safe to sink. Otherwise return nullptr.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+ StoreInst *Store0) {
+ LLVM_DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ BasicBlock *BB0 = Store0->getParent();
+ for (Instruction &Inst : reverse(*BB1)) {
+ auto *Store1 = dyn_cast<StoreInst>(&Inst);
+ if (!Store1)
+ continue;
+
+ MemoryLocation Loc0 = MemoryLocation::get(Store0);
+ MemoryLocation Loc1 = MemoryLocation::get(Store1);
+ if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+ !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) &&
+ !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) {
+ return Store1;
+ }
+ }
+ return nullptr;
+}
+
+///
+/// Create a PHI node in BB for the operands of S0 and S1
+///
+PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Create a phi if the values mismatch.
+ Value *Opd1 = S0->getValueOperand();
+ Value *Opd2 = S1->getValueOperand();
+ if (Opd1 == Opd2)
+ return nullptr;
+
+ auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
+ &BB->front());
+ NewPN->applyMergedLocation(S0->getDebugLoc(), S1->getDebugLoc());
+ NewPN->addIncoming(Opd1, S0->getParent());
+ NewPN->addIncoming(Opd2, S1->getParent());
+ return NewPN;
+}
+
+///
+/// Check if 2 stores can be sunk together with corresponding GEPs
+///
+bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0,
+ StoreInst *S1) const {
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0);
+}
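The single-use, identical-GEP requirement corresponds to source like the following sketch (hypothetical names; actual applicability depends on how the frontend emits the address computation): both branches compute the same field address and store through it, so the address instruction and the store can be sunk into the footer together.

struct Pair {
  int First;
  int Second;
};

// The address of P->Second is the same GEP on both sides and is used only by
// its store, so the checks above are typically satisfied.
void setSecond(bool Cond, Pair *P, int A, int B) {
  if (Cond)
    P->Second = A;
  else
    P->Second = B;
}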
+
+///
+/// Merge two stores to same address and sink into \p BB
+///
+/// Also sinks GEP instruction computing the store address
+///
+void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Only one definition?
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ // Hoist the instruction.
+ BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+ // Intersect optional metadata.
+ S0->andIRFlags(S1);
+ S0->dropUnknownNonDebugMetadata();
+
+ // Create the new store to be inserted at the join point.
+ StoreInst *SNew = cast<StoreInst>(S0->clone());
+ Instruction *ANew = A0->clone();
+ SNew->insertBefore(&*InsertPt);
+ ANew->insertBefore(SNew);
+
+ assert(S0->getParent() == A0->getParent());
+ assert(S1->getParent() == A1->getParent());
+
+ // New PHI operand? Use it.
+ if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
+ SNew->setOperand(0, NewPN);
+ S0->eraseFromParent();
+ S1->eraseFromParent();
+ A0->replaceAllUsesWith(ANew);
+ A0->eraseFromParent();
+ A1->replaceAllUsesWith(ANew);
+ A1->eraseFromParent();
+}
+
+///
+/// True when two stores are equivalent and can sink into the footer
+///
+/// Starting from a diamond head block, iterate over the instructions in one
+/// successor block and try to match a store in the second successor.
+///
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) {
+
+ bool MergedStores = false;
+ BasicBlock *TailBB = getDiamondTail(HeadBB);
+ BasicBlock *SinkBB = TailBB;
+ assert(SinkBB && "Footer of a diamond cannot be empty");
+
+ succ_iterator SI = succ_begin(HeadBB);
+ assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors");
+ BasicBlock *Pred0 = *SI;
+ ++SI;
+ assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor");
+ BasicBlock *Pred1 = *SI;
+ // tail block of a diamond/hammock?
+ if (Pred0 == Pred1)
+ return false; // No.
+ // bail out early if we can not merge into the footer BB
+ if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3))
+ return false;
+ // #Instructions in Pred1 for Compile Time Control
+ auto InstsNoDbg = Pred1->instructionsWithoutDebug();
+ int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
+ int NStores = 0;
+
+ for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+ RBI != RBE;) {
+
+ Instruction *I = &*RBI;
+ ++RBI;
+
+ // Don't sink non-simple (atomic, volatile) stores.
+ auto *S0 = dyn_cast<StoreInst>(I);
+ if (!S0 || !S0->isSimple())
+ continue;
+
+ ++NStores;
+ if (NStores * Size1 >= MagicCompileTimeControl)
+ break;
+ if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+ if (!canSinkStoresAndGEPs(S0, S1))
+        // Don't attempt to sink below stores that had to stick around.
+        // But after removal of a store and some of its feeding
+        // instructions, search again from the beginning since the iterator
+        // is likely stale at this point.
+ break;
+
+ if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) {
+ // We have more than 2 predecessors. Insert a new block
+ // postdominating 2 predecessors we're going to sink from.
+ SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split");
+ if (!SinkBB)
+ break;
+ }
+
+ MergedStores = true;
+ sinkStoresAndGEPs(SinkBB, S0, S1);
+ RBI = Pred0->rbegin();
+ RBE = Pred0->rend();
+ LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ }
+ }
+ return MergedStores;
+}
+
+bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
+ this->AA = &AA;
+
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "Instruction Merger\n");
+
+ // Merge unconditional branches, allowing PRE to catch more
+ // optimization opportunities.
+ // This loop doesn't care about newly inserted/split blocks
+ // since they never will be diamond heads.
+ for (BasicBlock &BB : make_early_inc_range(F))
+ // Hoist equivalent loads and sink stores
+ // outside diamonds when possible
+ if (isDiamondHead(&BB))
+ Changed |= mergeStores(&BB);
+ return Changed;
+}
+
+namespace {
+class MergedLoadStoreMotionLegacyPass : public FunctionPass {
+ const bool SplitFooterBB;
+public:
+ static char ID; // Pass identification, replacement for typeid
+ MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false)
+ : FunctionPass(ID), SplitFooterBB(SplitFooterBB) {
+ initializeMergedLoadStoreMotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ ///
+ /// Run the transformation for each function
+ ///
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ MergedLoadStoreMotion Impl(SplitFooterBB);
+ return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (!SplitFooterBB)
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+char MergedLoadStoreMotionLegacyPass::ID = 0;
+} // anonymous namespace
+
+///
+/// createMergedLoadStoreMotionPass - The public interface to this file.
+///
+FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) {
+ return new MergedLoadStoreMotionLegacyPass(SplitFooterBB);
+}
+
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+
+PreservedAnalyses
+MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
+ MergedLoadStoreMotion Impl(Options.SplitFooterBB);
+ auto &AA = AM.getResult<AAManager>(F);
+ if (!Impl.run(F, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ if (!Options.SplitFooterBB)
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp
index bb49b06b35..32bb62129e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -1,228 +1,228 @@
-//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass reassociates n-ary add expressions and eliminates the redundancy
-// exposed by the reassociation.
-//
-// A motivating example:
-//
-// void foo(int a, int b) {
-// bar(a + b);
-// bar((a + 2) + b);
-// }
-//
-// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
-// the above code to
-//
-// int t = a + b;
-// bar(t);
-// bar(t + 2);
-//
-// However, the Reassociate pass is unable to do that because it processes each
-// instruction individually and believes (a + 2) + b is the best form according
-// to its rank system.
-//
-// To address this limitation, NaryReassociate reassociates an expression in a
-// form that reuses existing instructions. As a result, NaryReassociate can
-// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
-// (a + b) is computed before.
-//
-// NaryReassociate works as follows. For every instruction in the form of (a +
-// b) + c, it checks whether a + c or b + c is already computed by a dominating
-// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
-// c) + a and removes the redundancy accordingly. To efficiently look up whether
-// an expression is computed before, we store each instruction seen and its SCEV
-// into an SCEV-to-instruction map.
-//
-// Although the algorithm pattern-matches only ternary additions, it
-// automatically handles many >3-ary expressions by walking through the function
-// in the depth-first order. For example, given
-//
-// (a + c) + d
-// ((a + b) + c) + d
-//
-// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
-// ((a + c) + b) + d into ((a + c) + d) + b.
-//
-// Finally, the above dominator-based algorithm may need to be run multiple
-// iterations before emitting optimal code. One source of this need is that we
-// only split an operand when it is used only once. The above algorithm can
-// eliminate an instruction and decrease the usage count of its operands. As a
-// result, an instruction that previously had multiple uses may become a
-// single-use instruction and thus eligible for split consideration. For
-// example,
-//
-// ac = a + c
-// ab = a + b
-// abc = ab + c
-// ab2 = ab + b
-// ab2c = ab2 + c
-//
-// In the first iteration, we cannot reassociate abc to ac+b because ab is used
-// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
-// result, ab2 becomes dead and ab will be used only once in the second
-// iteration.
-//
-// Limitations and TODO items:
-//
-// 1) We only consider n-ary adds and muls for now. This should be extended
-// and generalized.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/NaryReassociate.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "nary-reassociate"
-
-namespace {
-
-class NaryReassociateLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- NaryReassociateLegacyPass() : FunctionPass(ID) {
- initializeNaryReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override {
- return false;
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<TargetLibraryInfoWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- }
-
-private:
- NaryReassociatePass Impl;
-};
-
-} // end anonymous namespace
-
-char NaryReassociateLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
- "Nary reassociation", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
- "Nary reassociation", false, false)
-
-FunctionPass *llvm::createNaryReassociatePass() {
- return new NaryReassociateLegacyPass();
-}
-
-bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
-}
-
-PreservedAnalyses NaryReassociatePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto *AC = &AM.getResult<AssumptionAnalysis>(F);
- auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
- auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
- auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
-
- if (!runImpl(F, AC, DT, SE, TLI, TTI))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<ScalarEvolutionAnalysis>();
- return PA;
-}
-
-bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
- DominatorTree *DT_, ScalarEvolution *SE_,
- TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_) {
- AC = AC_;
- DT = DT_;
- SE = SE_;
- TLI = TLI_;
- TTI = TTI_;
- DL = &F.getParent()->getDataLayout();
-
- bool Changed = false, ChangedInThisIteration;
- do {
- ChangedInThisIteration = doOneIteration(F);
- Changed |= ChangedInThisIteration;
- } while (ChangedInThisIteration);
- return Changed;
-}
-
-bool NaryReassociatePass::doOneIteration(Function &F) {
- bool Changed = false;
- SeenExprs.clear();
- // Process the basic blocks in a depth first traversal of the dominator
- // tree. This order ensures that all bases of a candidate are in Candidates
- // when we process it.
+//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates n-ary add expressions and eliminates the redundancy
+// exposed by the reassociation.
+//
+// A motivating example:
+//
+// void foo(int a, int b) {
+// bar(a + b);
+// bar((a + 2) + b);
+// }
+//
+// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
+// the above code to
+//
+// int t = a + b;
+// bar(t);
+// bar(t + 2);
+//
+// However, the Reassociate pass is unable to do that because it processes each
+// instruction individually and believes (a + 2) + b is the best form according
+// to its rank system.
+//
+// To address this limitation, NaryReassociate reassociates an expression in a
+// form that reuses existing instructions. As a result, NaryReassociate can
+// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
+// (a + b) is computed before.
+//
+// NaryReassociate works as follows. For every instruction in the form of (a +
+// b) + c, it checks whether a + c or b + c is already computed by a dominating
+// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
+// c) + a and removes the redundancy accordingly. To efficiently look up whether
+// an expression is computed before, we store each instruction seen and its SCEV
+// into an SCEV-to-instruction map.
+//
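+// A rough sketch of that lookup, in hypothetical C-like pseudocode (the value
+// names are illustrative and not taken from the pass):
+//
+//   t = a + c;        // seen earlier; SeenExprs[SCEV(a + c)] records t
+//   ...
+//   x = (a + b) + c;  // ask SeenExprs for SCEV(a + c) and SCEV(b + c);
+//                     // the hit on t lets x be rewritten to t + b
+//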
+// Although the algorithm pattern-matches only ternary additions, it
+// automatically handles many >3-ary expressions by walking through the function
+// in depth-first order. For example, given
+//
+// (a + c) + d
+// ((a + b) + c) + d
+//
+// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
+// ((a + c) + b) + d into ((a + c) + d) + b.
+//
+// Finally, the above dominator-based algorithm may need to be run for multiple
+// iterations before emitting optimal code. One source of this need is that we
+// only split an operand when it is used only once. The above algorithm can
+// eliminate an instruction and decrease the usage count of its operands. As a
+// result, an instruction that previously had multiple uses may become a
+// single-use instruction and thus eligible for split consideration. For
+// example,
+//
+// ac = a + c
+// ab = a + b
+// abc = ab + c
+// ab2 = ab + b
+// ab2c = ab2 + c
+//
+// In the first iteration, we cannot reassociate abc to ac+b because ab is used
+// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
+// result, ab2 becomes dead and ab will be used only once in the second
+// iteration.
+//
+// Limitations and TODO items:
+//
+// 1) We only consider n-ary adds and muls for now. This should be extended
+// and generalized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/NaryReassociate.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "nary-reassociate"
+
+namespace {
+
+class NaryReassociateLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ NaryReassociateLegacyPass() : FunctionPass(ID) {
+ initializeNaryReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+private:
+ NaryReassociatePass Impl;
+};
+
+} // end anonymous namespace
+
+char NaryReassociateLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+
+FunctionPass *llvm::createNaryReassociatePass() {
+ return new NaryReassociateLegacyPass();
+}
+
+bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+}
+
+PreservedAnalyses NaryReassociatePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ return PA;
+}
+
+bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
+ DominatorTree *DT_, ScalarEvolution *SE_,
+ TargetLibraryInfo *TLI_,
+ TargetTransformInfo *TTI_) {
+ AC = AC_;
+ DT = DT_;
+ SE = SE_;
+ TLI = TLI_;
+ TTI = TTI_;
+ DL = &F.getParent()->getDataLayout();
+
+ bool Changed = false, ChangedInThisIteration;
+ do {
+ ChangedInThisIteration = doOneIteration(F);
+ Changed |= ChangedInThisIteration;
+ } while (ChangedInThisIteration);
+ return Changed;
+}
+
+bool NaryReassociatePass::doOneIteration(Function &F) {
+ bool Changed = false;
+ SeenExprs.clear();
+ // Process the basic blocks in a depth first traversal of the dominator
+ // tree. This order ensures that all bases of a candidate are in Candidates
+ // when we process it.
SmallVector<WeakTrackingVH, 16> DeadInsts;
- for (const auto Node : depth_first(DT)) {
- BasicBlock *BB = Node->getBlock();
- for (auto I = BB->begin(); I != BB->end(); ++I) {
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ++I) {
Instruction *OrigI = &*I;
const SCEV *OrigSCEV = nullptr;
if (Instruction *NewI = tryReassociate(OrigI, OrigSCEV)) {
@@ -236,307 +236,307 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
const SCEV *NewSCEV = SE->getSCEV(NewI);
SeenExprs[NewSCEV].push_back(WeakTrackingVH(NewI));
- // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
- // is equivalent to I. However, ScalarEvolution::getSCEV may
+ // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+ // is equivalent to I. However, ScalarEvolution::getSCEV may
// weaken nsw causing NewSCEV not to equal OldSCEV. For example,
// suppose we reassociate
- // I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
- // to
- // NewI = &a[sext(i)] + sext(j).
- //
- // ScalarEvolution computes
- // getSCEV(I) = a + 4 * sext(i + j)
- // getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
- // which are different SCEVs.
- //
- // To alleviate this issue of ScalarEvolution not always capturing
- // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
- // map both SCEV before and after tryReassociate(I) to I.
- //
+ // I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+ // to
+ // NewI = &a[sext(i)] + sext(j).
+ //
+ // ScalarEvolution computes
+ // getSCEV(I) = a + 4 * sext(i + j)
+ // getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
+ // which are different SCEVs.
+ //
+ // To alleviate this issue of ScalarEvolution not always capturing
+ // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
+ // map both SCEV before and after tryReassociate(I) to I.
+ //
// This improvement is exercised in @reassociate_gep_nsw in
// nary-gep.ll.
if (NewSCEV != OrigSCEV)
SeenExprs[OrigSCEV].push_back(WeakTrackingVH(NewI));
} else if (OrigSCEV)
SeenExprs[OrigSCEV].push_back(WeakTrackingVH(OrigI));
- }
- }
+ }
+ }
// Delete all dead instructions from 'DeadInsts'.
// Please note ScalarEvolution is updated along the way.
RecursivelyDeleteTriviallyDeadInstructionsPermissive(
DeadInsts, TLI, nullptr, [this](Value *V) { SE->forgetValue(V); });
- return Changed;
-}
-
+ return Changed;
+}
+
Instruction *NaryReassociatePass::tryReassociate(Instruction * I,
const SCEV *&OrigSCEV) {
if (!SE->isSCEVable(I->getType()))
return nullptr;
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Mul:
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
OrigSCEV = SE->getSCEV(I);
- return tryReassociateBinaryOp(cast<BinaryOperator>(I));
- case Instruction::GetElementPtr:
+ return tryReassociateBinaryOp(cast<BinaryOperator>(I));
+ case Instruction::GetElementPtr:
OrigSCEV = SE->getSCEV(I);
- return tryReassociateGEP(cast<GetElementPtrInst>(I));
- default:
+ return tryReassociateGEP(cast<GetElementPtrInst>(I));
+ default:
return nullptr;
- }
+ }
llvm_unreachable("should not be reached");
return nullptr;
-}
-
-static bool isGEPFoldable(GetElementPtrInst *GEP,
- const TargetTransformInfo *TTI) {
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
SmallVector<const Value *, 4> Indices(GEP->indices());
- return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
- Indices) == TargetTransformInfo::TCC_Free;
-}
-
-Instruction *NaryReassociatePass::tryReassociateGEP(GetElementPtrInst *GEP) {
- // Not worth reassociating GEP if it is foldable.
- if (isGEPFoldable(GEP, TTI))
- return nullptr;
-
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1,
- GTI.getIndexedType())) {
- return NewGEP;
- }
- }
- }
- return nullptr;
-}
-
-bool NaryReassociatePass::requiresSignExtension(Value *Index,
- GetElementPtrInst *GEP) {
- unsigned PointerSizeInBits =
- DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
- return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
-}
-
-GetElementPtrInst *
-NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
- unsigned I, Type *IndexedType) {
- Value *IndexToSplit = GEP->getOperand(I + 1);
- if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
- IndexToSplit = SExt->getOperand(0);
- } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
- // zext can be treated as sext if the source is non-negative.
- if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
- IndexToSplit = ZExt->getOperand(0);
- }
-
- if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
- // If the I-th index needs sext and the underlying add is not equipped with
- // nsw, we cannot split the add because
- // sext(LHS + RHS) != sext(LHS) + sext(RHS).
- if (requiresSignExtension(IndexToSplit, GEP) &&
- computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
- OverflowResult::NeverOverflows)
- return nullptr;
-
- Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
- // IndexToSplit = LHS + RHS.
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
- return NewGEP;
- // Symmetrically, try IndexToSplit = RHS + LHS.
- if (LHS != RHS) {
- if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
- return NewGEP;
- }
- }
- return nullptr;
-}
-
-GetElementPtrInst *
-NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
- unsigned I, Value *LHS,
- Value *RHS, Type *IndexedType) {
- // Look for GEP's closest dominator that has the same SCEV as GEP except that
- // the I-th index is replaced with LHS.
- SmallVector<const SCEV *, 4> IndexExprs;
- for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
- IndexExprs.push_back(SE->getSCEV(*Index));
- // Replace the I-th index with LHS.
- IndexExprs[I] = SE->getSCEV(LHS);
- if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+Instruction *NaryReassociatePass::tryReassociateGEP(GetElementPtrInst *GEP) {
+ // Not worth reassociating GEP if it is foldable.
+ if (isGEPFoldable(GEP, TTI))
+ return nullptr;
+
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1,
+ GTI.getIndexedType())) {
+ return NewGEP;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool NaryReassociatePass::requiresSignExtension(Value *Index,
+ GetElementPtrInst *GEP) {
+ unsigned PointerSizeInBits =
+ DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
+ return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Type *IndexedType) {
+ Value *IndexToSplit = GEP->getOperand(I + 1);
+ if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
+ IndexToSplit = SExt->getOperand(0);
+ } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
+ // zext can be treated as sext if the source is non-negative.
+ if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
+ IndexToSplit = ZExt->getOperand(0);
+ }
+
+ if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
+ // If the I-th index needs sext and the underlying add is not equipped with
+ // nsw, we cannot split the add because
+ // sext(LHS + RHS) != sext(LHS) + sext(RHS).
+ if (requiresSignExtension(IndexToSplit, GEP) &&
+ computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
+ OverflowResult::NeverOverflows)
+ return nullptr;
+
+ Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
+ // IndexToSplit = LHS + RHS.
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ // Symmetrically, try IndexToSplit = RHS + LHS.
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ }
+ return nullptr;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Value *LHS,
+ Value *RHS, Type *IndexedType) {
+ // Look for GEP's closest dominator that has the same SCEV as GEP except that
+ // the I-th index is replaced with LHS.
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+ IndexExprs.push_back(SE->getSCEV(*Index));
+ // Replace the I-th index with LHS.
+ IndexExprs[I] = SE->getSCEV(LHS);
+ if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
DL->getTypeSizeInBits(LHS->getType()).getFixedSize() <
DL->getTypeSizeInBits(GEP->getOperand(I)->getType()).getFixedSize()) {
- // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
- // zext if the source operand is proved non-negative. We should do that
- // consistently so that CandidateExpr is more likely to appear before. See
- // @reassociate_gep_assume for an example of this canonicalization.
- IndexExprs[I] =
- SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
- }
- const SCEV *CandidateExpr = SE->getGEPExpr(cast<GEPOperator>(GEP),
- IndexExprs);
-
- Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
- if (Candidate == nullptr)
- return nullptr;
-
- IRBuilder<> Builder(GEP);
- // Candidate does not necessarily have the same pointer type as GEP. Use
- // bitcast or pointer cast to make sure they have the same type, so that the
- // later RAUW doesn't complain.
- Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
- assert(Candidate->getType() == GEP->getType());
-
- // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
- uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
- Type *ElementType = GEP->getResultElementType();
- uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
- // Another less rare case: because I is not necessarily the last index of the
- // GEP, the size of the type at the I-th index (IndexedSize) is not
- // necessarily divisible by ElementSize. For example,
- //
- // #pragma pack(1)
- // struct S {
- // int a[3];
- // int64 b[8];
- // };
- // #pragma pack()
- //
- // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
- //
- // TODO: bail out on this case for now. We could emit uglygep.
- if (IndexedSize % ElementSize != 0)
- return nullptr;
-
- // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))];
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
- if (RHS->getType() != IntPtrTy)
- RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
- if (IndexedSize != ElementSize) {
- RHS = Builder.CreateMul(
- RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
- }
- GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(
- Builder.CreateGEP(GEP->getResultElementType(), Candidate, RHS));
- NewGEP->setIsInBounds(GEP->isInBounds());
- NewGEP->takeName(GEP);
- return NewGEP;
-}
-
-Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
- // There is no need to reassociate 0.
- if (SE->getSCEV(I)->isZero())
- return nullptr;
- if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
- return NewI;
- if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
- return NewI;
- return nullptr;
-}
-
-Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
- BinaryOperator *I) {
- Value *A = nullptr, *B = nullptr;
- // To be conservative, we reassociate I only when it is the only user of (A op
- // B).
- if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) {
- // I = (A op B) op RHS
- // = (A op RHS) op B or (B op RHS) op A
- const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
- const SCEV *RHSExpr = SE->getSCEV(RHS);
- if (BExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
- return NewI;
- }
- if (AExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
- return NewI;
- }
- }
- return nullptr;
-}
-
-Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr,
- Value *RHS,
- BinaryOperator *I) {
- // Look for the closest dominator LHS of I that computes LHSExpr, and replace
- // I with LHS op RHS.
- auto *LHS = findClosestMatchingDominator(LHSExpr, I);
- if (LHS == nullptr)
- return nullptr;
-
- Instruction *NewI = nullptr;
- switch (I->getOpcode()) {
- case Instruction::Add:
- NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
- break;
- case Instruction::Mul:
- NewI = BinaryOperator::CreateMul(LHS, RHS, "", I);
- break;
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- NewI->takeName(I);
- return NewI;
-}
-
-bool NaryReassociatePass::matchTernaryOp(BinaryOperator *I, Value *V,
- Value *&Op1, Value *&Op2) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
- case Instruction::Mul:
- return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- return false;
-}
-
-const SCEV *NaryReassociatePass::getBinarySCEV(BinaryOperator *I,
- const SCEV *LHS,
- const SCEV *RHS) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- return SE->getAddExpr(LHS, RHS);
- case Instruction::Mul:
- return SE->getMulExpr(LHS, RHS);
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- return nullptr;
-}
-
-Instruction *
-NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
- Instruction *Dominatee) {
- auto Pos = SeenExprs.find(CandidateExpr);
- if (Pos == SeenExprs.end())
- return nullptr;
-
- auto &Candidates = Pos->second;
- // Because we process the basic blocks in pre-order of the dominator tree, a
- // candidate that doesn't dominate the current instruction won't dominate any
- // future instruction either. Therefore, we pop it out of the stack. This
- // optimization makes the algorithm O(n).
- while (!Candidates.empty()) {
- // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
- // removed during rewriting.
- if (Value *Candidate = Candidates.back()) {
- Instruction *CandidateInstruction = cast<Instruction>(Candidate);
- if (DT->dominates(CandidateInstruction, Dominatee))
- return CandidateInstruction;
- }
- Candidates.pop_back();
- }
- return nullptr;
-}
+ // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
+ // zext if the source operand is proved non-negative. We should do that
+ // consistently so that CandidateExpr is more likely to appear before. See
+ // @reassociate_gep_assume for an example of this canonicalization.
+ IndexExprs[I] =
+ SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
+ }
+ const SCEV *CandidateExpr = SE->getGEPExpr(cast<GEPOperator>(GEP),
+ IndexExprs);
+
+ Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
+ if (Candidate == nullptr)
+ return nullptr;
+
+ IRBuilder<> Builder(GEP);
+ // Candidate does not necessarily have the same pointer type as GEP. Use
+ // bitcast or pointer cast to make sure they have the same type, so that the
+ // later RAUW doesn't complain.
+ Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
+ assert(Candidate->getType() == GEP->getType());
+
+ // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
+ uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
+ Type *ElementType = GEP->getResultElementType();
+ uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
+ // Another less rare case: because I is not necessarily the last index of the
+ // GEP, the size of the type at the I-th index (IndexedSize) is not
+ // necessarily divisible by ElementSize. For example,
+ //
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
+ //
+ // TODO: bail out on this case for now. We could emit uglygep.
+ if (IndexedSize % ElementSize != 0)
+ return nullptr;
+
+ // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))];
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (RHS->getType() != IntPtrTy)
+ RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
+ if (IndexedSize != ElementSize) {
+ RHS = Builder.CreateMul(
+ RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
+ }
+ GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(
+ Builder.CreateGEP(GEP->getResultElementType(), Candidate, RHS));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ NewGEP->takeName(GEP);
+ return NewGEP;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ // There is no need to reassociate 0.
+ if (SE->getSCEV(I)->isZero())
+ return nullptr;
+ if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
+ return NewI;
+ if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
+ return NewI;
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
+ BinaryOperator *I) {
+ Value *A = nullptr, *B = nullptr;
+ // To be conservative, we reassociate I only when it is the only user of (A op
+ // B).
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) {
+ // I = (A op B) op RHS
+ // = (A op RHS) op B or (B op RHS) op A
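+ // A concrete (illustrative) case: for I = (a + b) + c, if some dominating
+ // instruction t = a + c is already recorded in SeenExprs, I is rewritten
+ // to t + b below.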
+ const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
+ const SCEV *RHSExpr = SE->getSCEV(RHS);
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
+ }
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr,
+ Value *RHS,
+ BinaryOperator *I) {
+ // Look for the closest dominator LHS of I that computes LHSExpr, and replace
+ // I with LHS op RHS.
+ auto *LHS = findClosestMatchingDominator(LHSExpr, I);
+ if (LHS == nullptr)
+ return nullptr;
+
+ Instruction *NewI = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
+ break;
+ case Instruction::Mul:
+ NewI = BinaryOperator::CreateMul(LHS, RHS, "", I);
+ break;
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ NewI->takeName(I);
+ return NewI;
+}
+
+bool NaryReassociatePass::matchTernaryOp(BinaryOperator *I, Value *V,
+ Value *&Op1, Value *&Op2) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
+ case Instruction::Mul:
+ return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return false;
+}
+
+const SCEV *NaryReassociatePass::getBinarySCEV(BinaryOperator *I,
+ const SCEV *LHS,
+ const SCEV *RHS) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return SE->getAddExpr(LHS, RHS);
+ case Instruction::Mul:
+ return SE->getMulExpr(LHS, RHS);
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return nullptr;
+}
+
+Instruction *
+NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
+ Instruction *Dominatee) {
+ auto Pos = SeenExprs.find(CandidateExpr);
+ if (Pos == SeenExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
+ while (!Candidates.empty()) {
+ // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
+ // removed during rewriting.
+ if (Value *Candidate = Candidates.back()) {
+ Instruction *CandidateInstruction = cast<Instruction>(Candidate);
+ if (DT->dominates(CandidateInstruction, Dominatee))
+ return CandidateInstruction;
+ }
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp
index 330f3e9509..281d47c862 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp
@@ -1,1564 +1,1564 @@
-//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file implements LLVM's new Global Value Numbering pass.
-/// GVN partitions values computed by a function into congruence classes.
-/// Values ending up in the same congruence class are guaranteed to be the same
-/// for every execution of the program. In that respect, congruency is a
-/// compile-time approximation of equivalence of values at runtime.
-/// The algorithm implemented here uses a sparse formulation and it's based
-/// on the ideas described in the paper:
-/// "A Sparse Algorithm for Predicated Global Value Numbering" from
-/// Karthik Gargi.
-///
-/// A brief overview of the algorithm: The algorithm is essentially the same as
-/// the standard RPO value numbering algorithm (a good reference is the paper
-/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
-/// The RPO algorithm proceeds, on every iteration, to process every reachable
-/// block and every instruction in that block. This is because the standard RPO
-/// algorithm does not track what things have the same value number, it only
-/// tracks what the value number of a given operation is (the mapping is
-/// operation -> value number). Thus, when a value number of an operation
-/// changes, it must reprocess everything to ensure all uses of a value number
-/// get updated properly. In contrast, the sparse algorithm we use *also*
-/// tracks what operations have a given value number (IE it also tracks the
-/// reverse mapping from value number -> operations with that value number), so
-/// that it only needs to reprocess the instructions that are affected when
-/// something's value number changes. The vast majority of complexity and code
-/// in this file is devoted to tracking what value numbers could change for what
-/// instructions when various things happen. The rest of the algorithm is
-/// devoted to performing symbolic evaluation, forward propagation, and
-/// simplification of operations based on the value numbers deduced so far.
-///
-/// In order to make the GVN mostly-complete, we use a technique derived from
-/// "Detection of Redundant Expressions: A Complete and Polynomial-time
-/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
-/// based GVN algorithms is related to their inability to detect equivalence
-/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
-/// We resolve this issue by generating the equivalent "phi of ops" form for
-/// each op of phis we see, in a way that only takes polynomial time to resolve.
-///
-/// We also do not perform elimination by using any published algorithm. All
-/// published algorithms are O(Instructions). Instead, we use a technique that
-/// is O(number of operations with the same value number), enabling us to skip
-/// trying to eliminate things that have unique value numbers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/NewGVN.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFGPrinter.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/ArrayRecycler.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/PointerLikeTypeTraits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PredicateInfo.h"
-#include "llvm/Transforms/Utils/VNCoercion.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::GVNExpression;
-using namespace llvm::VNCoercion;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "newgvn"
-
-STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
-STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
-STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
-STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
-STATISTIC(NumGVNMaxIterations,
- "Maximum Number of iterations it took to converge GVN");
-STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
-STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
-STATISTIC(NumGVNAvoidedSortedLeaderChanges,
- "Number of avoided sorted leader changes");
-STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
-STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
-STATISTIC(NumGVNPHIOfOpsEliminations,
- "Number of things eliminated using PHI of ops");
-DEBUG_COUNTER(VNCounter, "newgvn-vn",
- "Controls which instructions are value numbered");
-DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
- "Controls which instructions we create phi of ops for");
-// Currently store defining access refinement is too slow due to basicaa being
-// egregiously slow. This flag lets us keep it working while we work on this
-// issue.
-static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
- cl::init(false), cl::Hidden);
-
-/// Currently, the generation of "phi of ops" can result in correctness issues.
-static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
- cl::Hidden);
-
-//===----------------------------------------------------------------------===//
-// GVN Pass
-//===----------------------------------------------------------------------===//
-
-// Anchor methods.
-namespace llvm {
-namespace GVNExpression {
-
-Expression::~Expression() = default;
-BasicExpression::~BasicExpression() = default;
-CallExpression::~CallExpression() = default;
-LoadExpression::~LoadExpression() = default;
-StoreExpression::~StoreExpression() = default;
-AggregateValueExpression::~AggregateValueExpression() = default;
-PHIExpression::~PHIExpression() = default;
-
-} // end namespace GVNExpression
-} // end namespace llvm
-
-namespace {
-
-// Tarjan's SCC finding algorithm with Nuutila's improvements
-// SCCIterator is actually fairly complex for the simple thing we want.
-// It also wants to hand us SCC's that are unrelated to the phi node we ask
-// about, and have us process them there or risk redoing work.
-// Graph traits over a filter iterator also doesn't work that well here.
-// This SCC finder is specialized to walk use-def chains, and only follows
-// instructions, not generic values (arguments, etc).
-struct TarjanSCC {
- TarjanSCC() : Components(1) {}
-
- void Start(const Instruction *Start) {
- if (Root.lookup(Start) == 0)
- FindSCC(Start);
- }
-
- const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
- unsigned ComponentID = ValueToComponent.lookup(V);
-
- assert(ComponentID > 0 &&
- "Asking for a component for a value we never processed");
- return Components[ComponentID];
- }
-
-private:
- void FindSCC(const Instruction *I) {
- Root[I] = ++DFSNum;
- // Store the DFS Number we had before it possibly gets incremented.
- unsigned int OurDFS = DFSNum;
- for (auto &Op : I->operands()) {
- if (auto *InstOp = dyn_cast<Instruction>(Op)) {
- if (Root.lookup(Op) == 0)
- FindSCC(InstOp);
- if (!InComponent.count(Op))
- Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
- }
- }
- // See if we really were the root of a component, by seeing if we still have
- // our DFSNumber. If we do, we are the root of the component, and we have
- // completed a component. If we do not, we are not the root of a component,
- // and belong on the component stack.
- if (Root.lookup(I) == OurDFS) {
- unsigned ComponentID = Components.size();
- Components.resize(Components.size() + 1);
- auto &Component = Components.back();
- Component.insert(I);
- LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n");
- InComponent.insert(I);
- ValueToComponent[I] = ComponentID;
- // Pop a component off the stack and label it.
- while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
- auto *Member = Stack.back();
- LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n");
- Component.insert(Member);
- InComponent.insert(Member);
- ValueToComponent[Member] = ComponentID;
- Stack.pop_back();
- }
- } else {
- // Part of a component, push to stack
- Stack.push_back(I);
- }
- }
-
- unsigned int DFSNum = 1;
- SmallPtrSet<const Value *, 8> InComponent;
- DenseMap<const Value *, unsigned int> Root;
- SmallVector<const Value *, 8> Stack;
-
- // Store the components as vector of ptr sets, because we need the topo order
- // of SCC's, but not individual member order
- SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
-
- DenseMap<const Value *, unsigned> ValueToComponent;
-};
-
-// Congruence classes represent the set of expressions/instructions
-// that are all the same *during some scope in the function*.
-// That is, because of the way we perform equality propagation, and
-// because of memory value numbering, it is not correct to assume
-// you can willy-nilly replace any member with any other at any
-// point in the function.
-//
-// For any Value in the Member set, it is valid to replace any dominated member
-// with that Value.
-//
-// Every congruence class has a leader, and the leader is used to symbolize
-// instructions in a canonical way (IE every operand of an instruction that is a
-// member of the same congruence class will always be replaced with leader
-// during symbolization). To simplify symbolization, we keep the leader as a
-// constant if the class can be proved to be a constant value. Otherwise, the
-// leader is the member of the value set with the smallest DFS number. Each
-// congruence class also has a defining expression, though the expression may be
-// null. If it exists, it can be used for forward propagation and reassociation
-// of values.
-
-// For memory, we also track a representative MemoryAccess, and a set of memory
-// members for MemoryPhis (which have no real instructions). Note that for
-// memory, it seems tempting to try to split the memory members into a
-// MemoryCongruenceClass or something. Unfortunately, this does not work
-// easily. The value numbering of a given memory expression depends on the
-// leader of the memory congruence class, and the leader of memory congruence
-// class depends on the value numbering of a given memory expression. This
-// leads to wasted propagation, and in some cases, missed optimization. For
-// example: If we had value numbered two stores together before, but now do not,
-// we move them to a new value congruence class. This in turn will move at one
-// of the memorydefs to a new memory congruence class. Which in turn, affects
-// the value numbering of the stores we just value numbered (because the memory
-// congruence class is part of the value number). So while theoretically
-// possible to split them up, it turns out to be *incredibly* complicated to get
-// it to work right, because of the interdependency. While structurally
-// slightly messier, it is algorithmically much simpler and faster to do what we
-// do here, and track them both at once in the same class.
-// Note: The default iterators for this class iterate over values
-class CongruenceClass {
-public:
- using MemberType = Value;
- using MemberSet = SmallPtrSet<MemberType *, 4>;
- using MemoryMemberType = MemoryPhi;
- using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
-
- explicit CongruenceClass(unsigned ID) : ID(ID) {}
- CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
- : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
-
- unsigned getID() const { return ID; }
-
- // True if this class has no members left. This is mainly used for assertion
- // purposes, and for skipping empty classes.
- bool isDead() const {
- // If it's both dead from a value perspective, and dead from a memory
- // perspective, it's really dead.
- return empty() && memory_empty();
- }
-
- // Leader functions
- Value *getLeader() const { return RepLeader; }
- void setLeader(Value *Leader) { RepLeader = Leader; }
- const std::pair<Value *, unsigned int> &getNextLeader() const {
- return NextLeader;
- }
- void resetNextLeader() { NextLeader = {nullptr, ~0}; }
- void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
- if (LeaderPair.second < NextLeader.second)
- NextLeader = LeaderPair;
- }
-
- Value *getStoredValue() const { return RepStoredValue; }
- void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
- const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
- void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
-
- // Forward propagation info
- const Expression *getDefiningExpr() const { return DefiningExpr; }
-
- // Value member set
- bool empty() const { return Members.empty(); }
- unsigned size() const { return Members.size(); }
- MemberSet::const_iterator begin() const { return Members.begin(); }
- MemberSet::const_iterator end() const { return Members.end(); }
- void insert(MemberType *M) { Members.insert(M); }
- void erase(MemberType *M) { Members.erase(M); }
- void swap(MemberSet &Other) { Members.swap(Other); }
-
- // Memory member set
- bool memory_empty() const { return MemoryMembers.empty(); }
- unsigned memory_size() const { return MemoryMembers.size(); }
- MemoryMemberSet::const_iterator memory_begin() const {
- return MemoryMembers.begin();
- }
- MemoryMemberSet::const_iterator memory_end() const {
- return MemoryMembers.end();
- }
- iterator_range<MemoryMemberSet::const_iterator> memory() const {
- return make_range(memory_begin(), memory_end());
- }
-
- void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
- void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
-
- // Store count
- unsigned getStoreCount() const { return StoreCount; }
- void incStoreCount() { ++StoreCount; }
- void decStoreCount() {
- assert(StoreCount != 0 && "Store count went negative");
- --StoreCount;
- }
-
- // True if this class has no memory members.
- bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
-
- // Return true if two congruence classes are equivalent to each other. This
- // means that every field but the ID number and the dead field are equivalent.
- bool isEquivalentTo(const CongruenceClass *Other) const {
- if (!Other)
- return false;
- if (this == Other)
- return true;
-
- if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
- std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
- Other->RepMemoryAccess))
- return false;
- if (DefiningExpr != Other->DefiningExpr)
- if (!DefiningExpr || !Other->DefiningExpr ||
- *DefiningExpr != *Other->DefiningExpr)
- return false;
-
- if (Members.size() != Other->Members.size())
- return false;
-
- return all_of(Members,
- [&](const Value *V) { return Other->Members.count(V); });
- }
-
-private:
- unsigned ID;
-
- // Representative leader.
- Value *RepLeader = nullptr;
-
- // The most dominating leader after our current leader, because the member set
- // is not sorted and is expensive to keep sorted all the time.
- std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
-
- // If this is represented by a store, the value of the store.
- Value *RepStoredValue = nullptr;
-
- // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
- // access.
- const MemoryAccess *RepMemoryAccess = nullptr;
-
- // Defining Expression.
- const Expression *DefiningExpr = nullptr;
-
- // Actual members of this class.
- MemberSet Members;
-
- // This is the set of MemoryPhis that exist in the class. MemoryDefs and
- // MemoryUses have real instructions representing them, so we only need to
- // track MemoryPhis here.
- MemoryMemberSet MemoryMembers;
-
- // Number of stores in this congruence class.
- // This is used so we can detect store equivalence changes properly.
- int StoreCount = 0;
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-struct ExactEqualsExpression {
- const Expression &E;
-
- explicit ExactEqualsExpression(const Expression &E) : E(E) {}
-
- hash_code getComputedHash() const { return E.getComputedHash(); }
-
- bool operator==(const Expression &Other) const {
- return E.exactlyEquals(Other);
- }
-};
-
-template <> struct DenseMapInfo<const Expression *> {
- static const Expression *getEmptyKey() {
- auto Val = static_cast<uintptr_t>(-1);
- Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
- return reinterpret_cast<const Expression *>(Val);
- }
-
- static const Expression *getTombstoneKey() {
- auto Val = static_cast<uintptr_t>(~1U);
- Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
- return reinterpret_cast<const Expression *>(Val);
- }
-
- static unsigned getHashValue(const Expression *E) {
- return E->getComputedHash();
- }
-
- static unsigned getHashValue(const ExactEqualsExpression &E) {
- return E.getComputedHash();
- }
-
- static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
- if (RHS == getTombstoneKey() || RHS == getEmptyKey())
- return false;
- return LHS == *RHS;
- }
-
- static bool isEqual(const Expression *LHS, const Expression *RHS) {
- if (LHS == RHS)
- return true;
- if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
- LHS == getEmptyKey() || RHS == getEmptyKey())
- return false;
- // Compare hashes before equality. This is *not* what the hashtable does,
- // since it is computing it modulo the number of buckets, whereas we are
- // using the full hash keyspace. Since the hashes are precomputed, this
- // check is *much* faster than equality.
- if (LHS->getComputedHash() != RHS->getComputedHash())
- return false;
- return *LHS == *RHS;
- }
-};
-
-} // end namespace llvm
-
-namespace {
-
-class NewGVN {
- Function &F;
- DominatorTree *DT = nullptr;
- const TargetLibraryInfo *TLI = nullptr;
- AliasAnalysis *AA = nullptr;
- MemorySSA *MSSA = nullptr;
- MemorySSAWalker *MSSAWalker = nullptr;
- AssumptionCache *AC = nullptr;
- const DataLayout &DL;
- std::unique_ptr<PredicateInfo> PredInfo;
-
- // These are the only two things the create* functions should have
- // side-effects on due to allocating memory.
- mutable BumpPtrAllocator ExpressionAllocator;
- mutable ArrayRecycler<Value *> ArgRecycler;
- mutable TarjanSCC SCCFinder;
- const SimplifyQuery SQ;
-
- // Number of function arguments, used by ranking
- unsigned int NumFuncArgs = 0;
-
- // RPOOrdering of basic blocks
- DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
-
- // Congruence class info.
-
- // This class is called INITIAL in the paper. It is the class everything
- // starts out in, and represents any value. Being an optimistic analysis,
- // anything in the TOP class has the value TOP, which is indeterminate and
- // equivalent to everything.
- CongruenceClass *TOPClass = nullptr;
- std::vector<CongruenceClass *> CongruenceClasses;
- unsigned NextCongruenceNum = 0;
-
- // Value Mappings.
- DenseMap<Value *, CongruenceClass *> ValueToClass;
- DenseMap<Value *, const Expression *> ValueToExpression;
-
- // Value PHI handling, used to make equivalence between phi(op, op) and
- // op(phi, phi).
- // These mappings just store various data that would normally be part of the
- // IR.
- SmallPtrSet<const Instruction *, 8> PHINodeUses;
-
- DenseMap<const Value *, bool> OpSafeForPHIOfOps;
-
- // Map a temporary instruction we created to a parent block.
- DenseMap<const Value *, BasicBlock *> TempToBlock;
-
- // Map between the already in-program instructions and the temporary phis we
- // created that they are known equivalent to.
- DenseMap<const Value *, PHINode *> RealToTemp;
-
- // In order to know when we should re-process instructions that have
- // phi-of-ops, we track the set of expressions that they needed as
- // leaders. When we discover new leaders for those expressions, we process the
- // associated phi-of-op instructions again in case they have changed. The
- // other way they may change is if they had leaders, and those leaders
- // disappear. However, at the point they have leaders, there are uses of the
- // relevant operands in the created phi node, and so they will get reprocessed
- // through the normal user marking we perform.
- mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
- DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
- ExpressionToPhiOfOps;
-
- // Map from temporary operation to MemoryAccess.
- DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
-
- // Set of all temporary instructions we created.
- // Note: This will include instructions that were just created during value
- // numbering. The way to test if something is using them is to check
- // RealToTemp.
- DenseSet<Instruction *> AllTempInstructions;
-
- // This is the set of instructions to revisit on a reachability change. At
- // the end of the main iteration loop it will contain at least all the phi of
- // ops instructions that will be changed to phis, as well as regular phis.
- // During the iteration loop, it may contain other things, such as phi of ops
- // instructions that used edge reachability to reach a result, and so need to
- // be revisited when the edge changes, independent of whether the phi they
- // depended on changes.
- DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange;
-
- // Mapping from predicate info we used to the instructions we used it with.
- // In order to correctly ensure propagation, we must keep track of what
- // comparisons we used, so that when the values of the comparisons change, we
- // propagate the information to the places we used the comparison.
- mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
- PredicateToUsers;
-
- // The same reasoning as PredicateToUsers applies: when we skip MemoryAccesses
- // for stores, we can no longer rely solely on the def-use chains of MemorySSA.
- mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
- MemoryToUsers;
-
- // A table storing which memorydefs/phis represent a memory state provably
- // equivalent to another memory state.
- // We could use the congruence class machinery, but the MemoryAccess's are
- // abstract memory states, so they can only ever be equivalent to each other,
- // and not to constants, etc.
- DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
-
- // We could, if we wanted, build MemoryPhiExpressions and
- // MemoryVariableExpressions, etc, and value number them the same way we value
- // number phi expressions. For the moment, this seems like overkill. They
- // can only exist in one of three states: they can be TOP (equal to
- // everything), Equivalent to something else, or unique. Because we do not
- // create expressions for them, we need to simulate leader change not just
- // when they change class, but when they change state. Note: We can do the
- // same thing for phis, and avoid having phi expressions if we wanted. We
- // should eventually unify in one direction or the other, so this is a little
- // bit of an experiment to see which turns out easier to maintain.
- enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
- DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
-
- enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
- mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
-
- // Expression to class mapping.
- using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
- ExpressionClassMap ExpressionToClass;
-
- // We have a single expression that represents currently DeadExpressions.
- // For dead expressions we can prove will stay dead, we mark them with
- // DFS number zero. However, it's possible in the case of phi nodes
- // for us to assume/prove all arguments are dead during fixpointing.
- // We use DeadExpression for that case.
- DeadExpression *SingletonDeadExpression = nullptr;
-
- // Which values have changed as a result of leader changes.
- SmallPtrSet<Value *, 8> LeaderChanges;
-
- // Reachability info.
- using BlockEdge = BasicBlockEdge;
- DenseSet<BlockEdge> ReachableEdges;
- SmallPtrSet<const BasicBlock *, 8> ReachableBlocks;
-
- // This is a bitvector because, on larger functions, we may have
- // thousands of touched instructions at once (entire blocks,
- // instructions with hundreds of uses, etc). Even with optimization
- // for when we mark whole blocks as touched, when this was a
- // SmallPtrSet or DenseSet, for some functions, we spent >20% of all
- // the time in GVN just managing this list. The bitvector, on the
- // other hand, efficiently supports test/set/clear of both
- // individual bits and ranges, as well as "find next element". This
- // enables us to use it as a worklist with essentially 0 cost.
- BitVector TouchedInstructions;
-
- DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
-
-#ifndef NDEBUG
- // Debugging for how many times each block and instruction got processed.
- DenseMap<const Value *, unsigned> ProcessedCount;
-#endif
-
- // DFS info.
- // This contains a mapping from Instructions to DFS numbers.
- // The numbering starts at 1. An instruction with DFS number zero
- // means that the instruction is dead.
- DenseMap<const Value *, unsigned> InstrDFS;
-
- // This contains the mapping DFS numbers to instructions.
- SmallVector<Value *, 32> DFSToInstr;
-
- // Deletion info.
- SmallPtrSet<Instruction *, 8> InstructionsToErase;
-
-public:
- NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
- const DataLayout &DL)
- : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), AC(AC), DL(DL),
- PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)),
+//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements LLVM's new Global Value Numbering pass.
+/// GVN partitions values computed by a function into congruence classes.
+/// Values ending up in the same congruence class are guaranteed to be the same
+/// for every execution of the program. In that respect, congruency is a
+/// compile-time approximation of equivalence of values at runtime.
+/// The algorithm implemented here uses a sparse formulation and it's based
+/// on the ideas described in the paper:
+/// "A Sparse Algorithm for Predicated Global Value Numbering" from
+/// Karthik Gargi.
+///
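+/// As a small illustration (hypothetical C-like source, not taken from this
+/// pass), given
+///
+///   u = a + b;
+///   v = a + b;
+///   w = u * 2;
+///   x = v * 2;
+///
+/// u and v land in one congruence class and, once that is known, w and x land
+/// in another, so a dominated x can be replaced by w (and v by u).
+///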
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number, it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The vast majority of complexity and code
+/// in this file is devoted to tracking what value numbers could change for what
+/// instructions when various things happen. The rest of the algorithm is
+/// devoted to performing symbolic evaluation, forward propagation, and
+/// simplification of operations based on the value numbers deduced so far.
+///
+/// In order to make the GVN mostly-complete, we use a technique derived from
+/// "Detection of Redundant Expressions: A Complete and Polynomial-time
+/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
+/// based GVN algorithms is related to their inability to detect equivalence
+/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
+/// We resolve this issue by generating the equivalent "phi of ops" form for
+/// each op of phis we see, in a way that only takes polynomial time to resolve.
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
+//
+//===----------------------------------------------------------------------===//
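+//
+// As a concrete illustration of the op-of-phis / phi-of-ops equivalence
+// described above, consider a minimal source-level sketch (the variable names
+// here are hypothetical and used only for illustration):
+//
+//   int t;
+//   if (c) t = a + b; else t = x + y; // t is phi(a+b, x+y)       ("phi of ops")
+//   int p = c ? a : x;                // p is phi(a, x)
+//   int q = c ? b : y;                // q is phi(b, y)
+//   int u = p + q;                    // u is phi(a,x) + phi(b,y) ("op of phis")
+//
+// Generating the phi-of-ops form for u lets the pass prove t == u and remove
+// the redundant computation.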
+
+#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::GVNExpression;
+using namespace llvm::VNCoercion;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "newgvn"
+
+STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
+STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
+STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
+STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
+STATISTIC(NumGVNMaxIterations,
+ "Maximum Number of iterations it took to converge GVN");
+STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
+STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
+STATISTIC(NumGVNAvoidedSortedLeaderChanges,
+ "Number of avoided sorted leader changes");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
+STATISTIC(NumGVNPHIOfOpsEliminations,
+ "Number of things eliminated using PHI of ops");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered");
+DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
+ "Controls which instructions we create phi of ops for");
+// Currently, store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
+
+/// Currently, the generation of "phi of ops" can result in correctness issues.
+static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
+ cl::Hidden);
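+
+// Both flags above are ordinary cl::opt switches, so they can be toggled from
+// the opt driver when this pass is run explicitly. A minimal sketch of such an
+// invocation (the file names are hypothetical):
+//
+//   opt -passes=newgvn -enable-phi-of-ops=false input.ll -S -o output.ll
+//   opt -passes=newgvn -enable-store-refinement input.ll -S -o output.ll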
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+// Anchor methods.
+namespace llvm {
+namespace GVNExpression {
+
+Expression::~Expression() = default;
+BasicExpression::~BasicExpression() = default;
+CallExpression::~CallExpression() = default;
+LoadExpression::~LoadExpression() = default;
+StoreExpression::~StoreExpression() = default;
+AggregateValueExpression::~AggregateValueExpression() = default;
+PHIExpression::~PHIExpression() = default;
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
+namespace {
+
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions, not generic values (arguments, etc).
+struct TarjanSCC {
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+ // See if we really were the root of a component, by seeing if we still have
+ // our DFSNumber. If we do, we are the root of the component, and we have
+ // completed a component. If we do not, we are not the root of a component,
+ // and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+
+ // Store the components as a vector of ptr sets, because we need the topo
+ // order of SCC's, but not individual member order.
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
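+
+// A minimal usage sketch for the SCC finder above (here 'I' stands for some
+// hypothetical instruction reached through phi use-def chains):
+//
+//   TarjanSCC SCCFinder;
+//   SCCFinder.Start(I);                               // compute components
+//   const auto &Comp = SCCFinder.getComponentFor(I);  // I's component
+//   bool PartOfCycle = Comp.size() > 1;               // non-trivial SCC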
+
+// Congruence classes represent the set of expressions/instructions
+// that are all the same *during some scope in the function*.
+// That is, because of the way we perform equality propagation, and
+// because of memory value numbering, it is not correct to assume
+// you can willy-nilly replace any member with any other at any
+// point in the function.
+//
+// For any Value in the Member set, it is valid to replace any dominated member
+// with that Value.
+//
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with the leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if the class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at least
+// one of the memorydefs to a new memory congruence class, which in turn affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number). So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency. While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values.
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+
+ unsigned getID() const { return ID; }
+
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+ void resetNextLeader() { NextLeader = {nullptr, ~0}; }
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+ if (LeaderPair.second < NextLeader.second)
+ NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
+ // Return true if two congruence classes are equivalent to each other. This
+ // means that every field but the ID number and the dead field is equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+ if (!Other)
+ return false;
+ if (this == Other)
+ return true;
+
+ if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+ std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+ Other->RepMemoryAccess))
+ return false;
+ if (DefiningExpr != Other->DefiningExpr)
+ if (!DefiningExpr || !Other->DefiningExpr ||
+ *DefiningExpr != *Other->DefiningExpr)
+ return false;
+
+ if (Members.size() != Other->Members.size())
+ return false;
+
+ return all_of(Members,
+ [&](const Value *V) { return Other->Members.count(V); });
+ }
+
+private:
+ unsigned ID;
+
+ // Representative leader.
+ Value *RepLeader = nullptr;
+
+ // The most dominating leader after our current leader, because the member set
+ // is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
+
+ // Defining Expression.
+ const Expression *DefiningExpr = nullptr;
+
+ // Actual members of this class.
+ MemberSet Members;
+
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
+
+ // Number of stores in this congruence class.
+ // This is used so we can detect store equivalence changes properly.
+ int StoreCount = 0;
+};
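+
+// A minimal sketch of how the pass drives this class (Leader, E, I and DFSNum
+// are hypothetical stand-ins for a leader value, a defining expression, a
+// member instruction and its DFS number):
+//
+//   auto *CC = new CongruenceClass(/*ID=*/1, Leader, E);
+//   CC->insert(I);                          // I now shares Leader's value
+//   CC->addPossibleNextLeader({I, DFSNum}); // fallback if Leader disappears
+//   if (CC->isDead())
+//     ; // empty classes are skipped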
+
+} // end anonymous namespace
+
+namespace llvm {
+
+struct ExactEqualsExpression {
+ const Expression &E;
+
+ explicit ExactEqualsExpression(const Expression &E) : E(E) {}
+
+ hash_code getComputedHash() const { return E.getComputedHash(); }
+
+ bool operator==(const Expression &Other) const {
+ return E.exactlyEquals(Other);
+ }
+};
+
+template <> struct DenseMapInfo<const Expression *> {
+ static const Expression *getEmptyKey() {
+ auto Val = static_cast<uintptr_t>(-1);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+
+ static const Expression *getTombstoneKey() {
+ auto Val = static_cast<uintptr_t>(~1U);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+
+ static unsigned getHashValue(const Expression *E) {
+ return E->getComputedHash();
+ }
+
+ static unsigned getHashValue(const ExactEqualsExpression &E) {
+ return E.getComputedHash();
+ }
+
+ static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
+ if (RHS == getTombstoneKey() || RHS == getEmptyKey())
+ return false;
+ return LHS == *RHS;
+ }
+
+ static bool isEqual(const Expression *LHS, const Expression *RHS) {
+ if (LHS == RHS)
+ return true;
+ if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
+ LHS == getEmptyKey() || RHS == getEmptyKey())
+ return false;
+ // Compare hashes before equality. This is *not* what the hashtable does,
+ // since it is computing it modulo the number of buckets, whereas we are
+ // using the full hash keyspace. Since the hashes are precomputed, this
+ // check is *much* faster than equality.
+ if (LHS->getComputedHash() != RHS->getComputedHash())
+ return false;
+ return *LHS == *RHS;
+ }
+};
+
+} // end namespace llvm
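+
+// With the DenseMapInfo specialization above, tables keyed on
+// 'const Expression *' also support heterogeneous lookup via
+// ExactEqualsExpression, which demands exact rather than semantic equality.
+// A sketch (the names 'Table' and 'E' are hypothetical):
+//
+//   DenseMap<const Expression *, CongruenceClass *> Table;
+//   auto It = Table.find_as(ExactEqualsExpression(*E));
+//   if (It != Table.end()) { /* exact structural match found */ }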
+
+namespace {
+
+class NewGVN {
+ Function &F;
+ DominatorTree *DT = nullptr;
+ const TargetLibraryInfo *TLI = nullptr;
+ AliasAnalysis *AA = nullptr;
+ MemorySSA *MSSA = nullptr;
+ MemorySSAWalker *MSSAWalker = nullptr;
+ AssumptionCache *AC = nullptr;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
+ const SimplifyQuery SQ;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs = 0;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
+
+ // Congruence class info.
+
+ // This class is called INITIAL in the paper. It is the class everything
+ // starts out in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass = nullptr;
+ std::vector<CongruenceClass *> CongruenceClasses;
+ unsigned NextCongruenceNum = 0;
+
+ // Value Mappings.
+ DenseMap<Value *, CongruenceClass *> ValueToClass;
+ DenseMap<Value *, const Expression *> ValueToExpression;
+
+ // Value PHI handling, used to make equivalence between phi(op, op) and
+ // op(phi, phi).
+ // These mappings just store various data that would normally be part of the
+ // IR.
+ SmallPtrSet<const Instruction *, 8> PHINodeUses;
+
+ DenseMap<const Value *, bool> OpSafeForPHIOfOps;
+
+ // Map a temporary instruction we created to a parent block.
+ DenseMap<const Value *, BasicBlock *> TempToBlock;
+
+ // Map between the already in-program instructions and the temporary phis we
+ // created that they are known equivalent to.
+ DenseMap<const Value *, PHINode *> RealToTemp;
+
+ // In order to know when we should re-process instructions that have
+ // phi-of-ops, we track the set of expressions that they needed as
+ // leaders. When we discover new leaders for those expressions, we process the
+ // associated phi-of-op instructions again in case they have changed. The
+ // other way they may change is if they had leaders, and those leaders
+ // disappear. However, at the point they have leaders, there are uses of the
+ // relevant operands in the created phi node, and so they will get reprocessed
+ // through the normal user marking we perform.
+ mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
+ DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
+ ExpressionToPhiOfOps;
+
+ // Map from temporary operation to MemoryAccess.
+ DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+
+ // Set of all temporary instructions we created.
+ // Note: This will include instructions that were just created during value
+ // numbering. The way to test if something is using them is to check
+ // RealToTemp.
+ DenseSet<Instruction *> AllTempInstructions;
+
+ // This is the set of instructions to revisit on a reachability change. At
+ // the end of the main iteration loop it will contain at least all the phi of
+ // ops instructions that will be changed to phis, as well as regular phis.
+ // During the iteration loop, it may contain other things, such as phi of ops
+ // instructions that used edge reachability to reach a result, and so need to
+ // be revisited when the edge changes, independent of whether the phi they
+ // depended on changes.
+ DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange;
+
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
+
+ // For the same reasons as PredicateToUsers: when we skip MemoryAccesses for
+ // stores, we can no longer rely solely on the def-use chains of MemorySSA.
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
+
+ // A table storing which memorydefs/phis represent a memory state provably
+ // equivalent to another memory state.
+ // We could use the congruence class machinery, but the MemoryAccess's are
+ // abstract memory states, so they can only ever be equivalent to each other,
+ // and not to constants, etc.
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+ // when they change class, but when they change state. Note: We can do the
+ // same thing for phis, and avoid having phi expressions if we wanted. We
+ // should eventually unify in one direction or the other, so this is a little
+ // bit of an experiment in which approach turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
+ mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
+
+ // Expression to class mapping.
+ using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
+ ExpressionClassMap ExpressionToClass;
+
+ // We have a single expression that currently represents dead expressions.
+ // For dead expressions we can prove will stay dead, we mark them with
+ // DFS number zero. However, it's possible in the case of phi nodes
+ // for us to assume/prove all arguments are dead during fixpointing.
+ // We use DeadExpression for that case.
+ DeadExpression *SingletonDeadExpression = nullptr;
+
+ // Which values have changed as a result of leader changes.
+ SmallPtrSet<Value *, 8> LeaderChanges;
+
+ // Reachability info.
+ using BlockEdge = BasicBlockEdge;
+ DenseSet<BlockEdge> ReachableEdges;
+ SmallPtrSet<const BasicBlock *, 8> ReachableBlocks;
+
+ // This is a bitvector because, on larger functions, we may have
+ // thousands of touched instructions at once (entire blocks,
+ // instructions with hundreds of uses, etc). Even with optimization
+ // for when we mark whole blocks as touched, when this was a
+ // SmallPtrSet or DenseSet, for some functions, we spent >20% of all
+ // the time in GVN just managing this list. The bitvector, on the
+ // other hand, efficiently supports test/set/clear of both
+ // individual bits and ranges, as well as "find next element". This
+ // enables us to use it as a worklist with essentially 0 cost.
+ BitVector TouchedInstructions;
+
+ DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+
+#ifndef NDEBUG
+ // Debugging for how many times each block and instruction got processed.
+ DenseMap<const Value *, unsigned> ProcessedCount;
+#endif
+
+ // DFS info.
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
+ DenseMap<const Value *, unsigned> InstrDFS;
+
+ // This contains the mapping from DFS numbers to instructions.
+ SmallVector<Value *, 32> DFSToInstr;
+
+ // Deletion info.
+ SmallPtrSet<Instruction *, 8> InstructionsToErase;
+
+public:
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), AC(AC), DL(DL),
+ PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)),
SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false,
/*CanUseUndef=*/false) {}
-
- bool runGVN();
-
-private:
- // Expression handling.
- const Expression *createExpression(Instruction *) const;
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
- Instruction *) const;
-
- // Our canonical form for phi arguments is a pair of incoming value, incoming
- // basic block.
- using ValPair = std::pair<Value *, BasicBlock *>;
-
- PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *,
- BasicBlock *, bool &HasBackEdge,
- bool &OriginalOpsConstant) const;
- const DeadExpression *createDeadExpression() const;
- const VariableExpression *createVariableExpression(Value *) const;
- const ConstantExpression *createConstantExpression(Constant *) const;
- const Expression *createVariableOrConstant(Value *V) const;
- const UnknownExpression *createUnknownExpression(Instruction *) const;
- const StoreExpression *createStoreExpression(StoreInst *,
- const MemoryAccess *) const;
- LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- const MemoryAccess *) const;
- const CallExpression *createCallExpression(CallInst *,
- const MemoryAccess *) const;
- const AggregateValueExpression *
- createAggregateValueExpression(Instruction *) const;
- bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
-
- // Congruence class handling.
- CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
- auto *result = new CongruenceClass(NextCongruenceNum++, Leader, E);
- CongruenceClasses.emplace_back(result);
- return result;
- }
-
- CongruenceClass *createMemoryClass(MemoryAccess *MA) {
- auto *CC = createCongruenceClass(nullptr, nullptr);
- CC->setMemoryLeader(MA);
- return CC;
- }
-
- CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
- auto *CC = getMemoryClass(MA);
- if (CC->getMemoryLeader() != MA)
- CC = createMemoryClass(MA);
- return CC;
- }
-
- CongruenceClass *createSingletonCongruenceClass(Value *Member) {
- CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
- CClass->insert(Member);
- ValueToClass[Member] = CClass;
- return CClass;
- }
-
- void initializeCongruenceClasses(Function &F);
- const Expression *makePossiblePHIOfOps(Instruction *,
- SmallPtrSetImpl<Value *> &);
- Value *findLeaderForInst(Instruction *ValueOp,
- SmallPtrSetImpl<Value *> &Visited,
- MemoryAccess *MemAccess, Instruction *OrigInst,
- BasicBlock *PredBB);
- bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &Visited,
- SmallVectorImpl<Instruction *> &Worklist);
- bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &);
- void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
- void removePhiOfOps(Instruction *I, PHINode *PHITemp);
-
- // Value number an Instruction or MemoryPhi.
- void valueNumberMemoryPhi(MemoryPhi *);
- void valueNumberInstruction(Instruction *);
-
- // Symbolic evaluation.
- const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *) const;
- const Expression *performSymbolicEvaluation(Value *,
- SmallPtrSetImpl<Value *> &) const;
- const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
- Instruction *,
- MemoryAccess *) const;
- const Expression *performSymbolicLoadEvaluation(Instruction *) const;
- const Expression *performSymbolicStoreEvaluation(Instruction *) const;
- const Expression *performSymbolicCallEvaluation(Instruction *) const;
- void sortPHIOps(MutableArrayRef<ValPair> Ops) const;
- const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>,
- Instruction *I,
- BasicBlock *PHIBlock) const;
- const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
- const Expression *performSymbolicCmpEvaluation(Instruction *) const;
- const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
-
- // Congruence finding.
- bool someEquivalentDominates(const Instruction *, const Instruction *) const;
- Value *lookupOperandLeader(Value *) const;
- CongruenceClass *getClassForExpression(const Expression *E) const;
- void performCongruenceFinding(Instruction *, const Expression *);
- void moveValueToNewCongruenceClass(Instruction *, const Expression *,
- CongruenceClass *, CongruenceClass *);
- void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
- CongruenceClass *, CongruenceClass *);
- Value *getNextValueLeader(CongruenceClass *) const;
- const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
- bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
- CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
- const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
- bool isMemoryAccessTOP(const MemoryAccess *) const;
-
- // Ranking
- unsigned int getRank(const Value *) const;
- bool shouldSwapOperands(const Value *, const Value *) const;
-
- // Reachability handling.
- void updateReachableEdge(BasicBlock *, BasicBlock *);
- void processOutgoingEdges(Instruction *, BasicBlock *);
- Value *findConditionEquivalence(Value *) const;
-
- // Elimination.
- struct ValueDFS;
- void convertClassToDFSOrdered(const CongruenceClass &,
- SmallVectorImpl<ValueDFS> &,
- DenseMap<const Value *, unsigned int> &,
- SmallPtrSetImpl<Instruction *> &) const;
- void convertClassToLoadsAndStores(const CongruenceClass &,
- SmallVectorImpl<ValueDFS> &) const;
-
- bool eliminateInstructions(Function &);
- void replaceInstruction(Instruction *, Value *);
- void markInstructionForDeletion(Instruction *);
- void deleteInstructionsInBlock(BasicBlock *);
- Value *findPHIOfOpsLeader(const Expression *, const Instruction *,
- const BasicBlock *) const;
-
- // Various instruction touch utilities
- template <typename Map, typename KeyType>
- void touchAndErase(Map &, const KeyType &);
- void markUsersTouched(Value *);
- void markMemoryUsersTouched(const MemoryAccess *);
- void markMemoryDefTouched(const MemoryAccess *);
- void markPredicateUsersTouched(Instruction *);
- void markValueLeaderChangeTouched(CongruenceClass *CC);
- void markMemoryLeaderChangeTouched(CongruenceClass *CC);
- void markPhiOfOpsChanged(const Expression *E);
- void addPredicateUsers(const PredicateBase *, Instruction *) const;
- void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
- void addAdditionalUsers(Value *To, Value *User) const;
-
- // Main loop of value numbering
- void iterateTouchedInstructions();
-
- // Utilities.
- void cleanupTables();
- std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
- void updateProcessedCount(const Value *V);
- void verifyMemoryCongruency() const;
- void verifyIterationSettled(Function &F);
- void verifyStoreExpressions() const;
- bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
- const MemoryAccess *, const MemoryAccess *) const;
- BasicBlock *getBlockForValue(Value *V) const;
- void deleteExpression(const Expression *E) const;
- MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
- MemoryPhi *getMemoryAccess(const BasicBlock *) const;
- template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
-
- unsigned InstrToDFSNum(const Value *V) const {
- assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
- return InstrDFS.lookup(V);
- }
-
- unsigned InstrToDFSNum(const MemoryAccess *MA) const {
- return MemoryToDFSNum(MA);
- }
-
- Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
-
- // Given a MemoryAccess, return the relevant instruction DFS number. Note:
- // This deliberately takes a value so it can be used with Use's, which will
- // auto-convert to Value's but not to MemoryAccess's.
- unsigned MemoryToDFSNum(const Value *MA) const {
- assert(isa<MemoryAccess>(MA) &&
- "This should not be used with instructions");
- return isa<MemoryUseOrDef>(MA)
- ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
- : InstrDFS.lookup(MA);
- }
-
- bool isCycleFree(const Instruction *) const;
- bool isBackedge(BasicBlock *From, BasicBlock *To) const;
-
- // Debug counter info. When verifying, we have to reset the value numbering
- // debug counter to the same state it started in to get the same results.
- int64_t StartingVNCounter = 0;
-};
-
-} // end anonymous namespace
-
-template <typename T>
-static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
- if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
- return false;
- return LHS.MemoryExpression::equals(RHS);
-}
-
-bool LoadExpression::equals(const Expression &Other) const {
- return equalsLoadStoreHelper(*this, Other);
-}
-
-bool StoreExpression::equals(const Expression &Other) const {
- if (!equalsLoadStoreHelper(*this, Other))
- return false;
- // Make sure that store vs store includes the value operand.
- if (const auto *S = dyn_cast<StoreExpression>(&Other))
- if (getStoredValue() != S->getStoredValue())
- return false;
- return true;
-}
-
-// Determine if the edge From->To is a backedge
-bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
- return From == To ||
- RPOOrdering.lookup(DT->getNode(From)) >=
- RPOOrdering.lookup(DT->getNode(To));
-}
-
-#ifndef NDEBUG
-static std::string getBlockName(const BasicBlock *B) {
- return DOTGraphTraits<DOTFuncInfo *>::getSimpleNodeLabel(B, nullptr);
-}
-#endif
-
-// Get a MemoryAccess for an instruction, fake or real.
-MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
- auto *Result = MSSA->getMemoryAccess(I);
- return Result ? Result : TempToMemory.lookup(I);
-}
-
-// Get a MemoryPhi for a basic block. These are all real.
-MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
- return MSSA->getMemoryAccess(BB);
-}
-
-// Get the basic block from an instruction/memory value.
-BasicBlock *NewGVN::getBlockForValue(Value *V) const {
- if (auto *I = dyn_cast<Instruction>(V)) {
- auto *Parent = I->getParent();
- if (Parent)
- return Parent;
- Parent = TempToBlock.lookup(V);
- assert(Parent && "Every fake instruction should have a block");
- return Parent;
- }
-
- auto *MP = dyn_cast<MemoryPhi>(V);
- assert(MP && "Should have been an instruction or a MemoryPhi");
- return MP->getBlock();
-}
-
-// Delete a definitely dead expression, so it can be reused by the expression
-// allocator. Some of these are not in creation functions, so we have to accept
-// const versions.
-void NewGVN::deleteExpression(const Expression *E) const {
- assert(isa<BasicExpression>(E));
- auto *BE = cast<BasicExpression>(E);
- const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
-}
-
-// If V is a predicateinfo copy, get the thing it is a copy of.
-static Value *getCopyOf(const Value *V) {
- if (auto *II = dyn_cast<IntrinsicInst>(V))
- if (II->getIntrinsicID() == Intrinsic::ssa_copy)
- return II->getOperand(0);
- return nullptr;
-}
-
-// Return true if V is really PN, even accounting for predicateinfo copies.
-static bool isCopyOfPHI(const Value *V, const PHINode *PN) {
- return V == PN || getCopyOf(V) == PN;
-}
-
-static bool isCopyOfAPHI(const Value *V) {
- auto *CO = getCopyOf(V);
- return CO && isa<PHINode>(CO);
-}
-
-// Sort PHI Operands into a canonical order. What we use here is an RPO
-// order. The BlockInstRange numbers are generated in an RPO walk of the basic
-// blocks.
-void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
- llvm::sort(Ops, [&](const ValPair &P1, const ValPair &P2) {
- return BlockInstRange.lookup(P1.second).first <
- BlockInstRange.lookup(P2.second).first;
- });
-}
-
-// Return true if V is a value that will always be available (IE can
-// be placed anywhere) in the function. We don't do globals here
-// because they are often worse to put in place.
-static bool alwaysAvailable(Value *V) {
- return isa<Constant>(V) || isa<Argument>(V);
-}
-
-// Create a PHIExpression from an array of {incoming value, incoming block}
-// pairs. I is the original instruction we are creating a PHIExpression for
-// (but may not be a phi node). We require, as an invariant, that all the
-// PHIOperands in the same block are sorted the same way. sortPHIOps will sort
-// them into a canonical order.
-PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands,
- const Instruction *I,
- BasicBlock *PHIBlock,
- bool &HasBackedge,
- bool &OriginalOpsConstant) const {
- unsigned NumOps = PHIOperands.size();
- auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock);
-
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- E->setType(PHIOperands.begin()->first->getType());
- E->setOpcode(Instruction::PHI);
-
- // Filter out unreachable phi operands.
- auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) {
- auto *BB = P.second;
- if (auto *PHIOp = dyn_cast<PHINode>(I))
- if (isCopyOfPHI(P.first, PHIOp))
- return false;
- if (!ReachableEdges.count({BB, PHIBlock}))
- return false;
- // Things in TOPClass are equivalent to everything.
- if (ValueToClass.lookup(P.first) == TOPClass)
- return false;
- OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first);
- HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
- return lookupOperandLeader(P.first) != I;
- });
- std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const ValPair &P) -> Value * {
- return lookupOperandLeader(P.first);
- });
- return E;
-}
-
-// Set basic expression info (Arguments, type, opcode) for Expression
-// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
- bool AllConstant = true;
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
- E->setType(GEP->getSourceElementType());
- else
- E->setType(I->getType());
- E->setOpcode(I->getOpcode());
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
-
- // Transform the operand array into an operand leader array, and keep track of
- // whether all members are constant.
- std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
- auto Operand = lookupOperandLeader(O);
- AllConstant = AllConstant && isa<Constant>(Operand);
- return Operand;
- });
-
- return AllConstant;
-}
-
-const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2,
- Instruction *I) const {
- auto *E = new (ExpressionAllocator) BasicExpression(2);
-
- E->setType(T);
- E->setOpcode(Opcode);
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- if (Instruction::isCommutative(Opcode)) {
- // Ensure that commutative instructions that only differ by a permutation
- // of their operands get the same value number by sorting the operand value
- // numbers. Since all commutative instructions have two operands it is more
- // efficient to sort by hand rather than using, say, std::sort.
- if (shouldSwapOperands(Arg1, Arg2))
- std::swap(Arg1, Arg2);
- }
- E->op_push_back(lookupOperandLeader(Arg1));
- E->op_push_back(lookupOperandLeader(Arg2));
-
- Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- return E;
-}
-
-// Take a Value returned by simplification of Expression E/Instruction
-// I, and see if it resulted in a simpler expression. If so, return
-// that expression.
-const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I,
- Value *V) const {
- if (!V)
- return nullptr;
- if (auto *C = dyn_cast<Constant>(V)) {
- if (I)
- LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
- << " constant " << *C << "\n");
- NumGVNOpsSimplified++;
- assert(isa<BasicExpression>(E) &&
- "We should always have had a basic expression here");
- deleteExpression(E);
- return createConstantExpression(C);
- } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
- if (I)
- LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
- << " variable " << *V << "\n");
- deleteExpression(E);
- return createVariableExpression(V);
- }
-
- CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC) {
- if (CC->getLeader() && CC->getLeader() != I) {
- // If we simplified to something else, we need to communicate
- // that we're users of the value we simplified to.
- if (I != V) {
- // Don't add temporary instructions to the user lists.
- if (!AllTempInstructions.count(I))
- addAdditionalUsers(V, I);
- }
- return createVariableOrConstant(CC->getLeader());
- }
- if (CC->getDefiningExpr()) {
- // If we simplified to something else, we need to communicate
- // that we're users of the value we simplified to.
- if (I != V) {
- // Don't add temporary instructions to the user lists.
- if (!AllTempInstructions.count(I))
- addAdditionalUsers(V, I);
- }
-
- if (I)
- LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *CC->getDefiningExpr() << "\n");
- NumGVNOpsSimplified++;
- deleteExpression(E);
- return CC->getDefiningExpr();
- }
- }
-
- return nullptr;
-}
-
-// Create a value expression from the instruction I, replacing operands with
-// their leaders.
-
-const Expression *NewGVN::createExpression(Instruction *I) const {
- auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
-
- bool AllConstant = setBasicExpressionInfo(I, E);
-
- if (I->isCommutative()) {
- // Ensure that commutative instructions that only differ by a permutation
- // of their operands get the same value number by sorting the operand value
- // numbers. Since all commutative instructions have two operands it is more
- // efficient to sort by hand rather than using, say, std::sort.
- assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
- if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
- E->swapOperands(0, 1);
- }
- // Perform simplification.
- if (auto *CI = dyn_cast<CmpInst>(I)) {
- // Sort the operand value numbers so x<y and y>x get the same value
- // number.
- CmpInst::Predicate Predicate = CI->getPredicate();
- if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
- E->swapOperands(0, 1);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
- }
- E->setOpcode((CI->getOpcode() << 8) | Predicate);
- // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
- assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
- "Wrong types on cmp instruction");
- assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
- E->getOperand(1)->getType() == I->getOperand(1)->getType()));
- Value *V =
- SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (isa<SelectInst>(I)) {
- if (isa<Constant>(E->getOperand(0)) ||
- E->getOperand(1) == E->getOperand(2)) {
- assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
- E->getOperand(2)->getType() == I->getOperand(2)->getType());
- Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
- E->getOperand(2), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
- } else if (I->isBinaryOp()) {
- Value *V =
- SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- Value *V =
- SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (isa<GetElementPtrInst>(I)) {
- Value *V = SimplifyGEPInst(
- E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (AllConstant) {
- // We don't bother trying to simplify unless all of the operands
- // were constant.
- // TODO: There are a lot of Simplify*'s we could call here, if we
- // wanted to. The original motivating case for this code was a
- // zext i1 false to i8, which we don't have an interface to
- // simplify (IE there is no SimplifyZExt).
-
- SmallVector<Constant *, 8> C;
- for (Value *Arg : E->operands())
- C.emplace_back(cast<Constant>(Arg));
-
- if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
- return E;
-}
-
-const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I) const {
- if (auto *II = dyn_cast<InsertValueInst>(I)) {
- auto *E = new (ExpressionAllocator)
- AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
- setBasicExpressionInfo(I, E);
- E->allocateIntOperands(ExpressionAllocator);
- std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
- return E;
- } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
- auto *E = new (ExpressionAllocator)
- AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
- setBasicExpressionInfo(EI, E);
- E->allocateIntOperands(ExpressionAllocator);
- std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
- return E;
- }
- llvm_unreachable("Unhandled type of aggregate value operation");
-}
-
-const DeadExpression *NewGVN::createDeadExpression() const {
- // DeadExpression has no arguments and all DeadExpression's are the same,
- // so we only need one of them.
- return SingletonDeadExpression;
-}
-
-const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
- auto *E = new (ExpressionAllocator) VariableExpression(V);
- E->setOpcode(V->getValueID());
- return E;
-}
-
-const Expression *NewGVN::createVariableOrConstant(Value *V) const {
- if (auto *C = dyn_cast<Constant>(V))
- return createConstantExpression(C);
- return createVariableExpression(V);
-}
-
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
- auto *E = new (ExpressionAllocator) ConstantExpression(C);
- E->setOpcode(C->getValueID());
- return E;
-}
-
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
- auto *E = new (ExpressionAllocator) UnknownExpression(I);
- E->setOpcode(I->getOpcode());
- return E;
-}
-
-const CallExpression *
-NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
- // FIXME: Add operand bundles for calls.
+
+ bool runGVN();
+
+private:
+ // Expression handling.
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
+ Instruction *) const;
+
+ // Our canonical form for phi arguments is a pair of incoming value, incoming
+ // basic block.
+ using ValPair = std::pair<Value *, BasicBlock *>;
+
+ PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *,
+ BasicBlock *, bool &HasBackEdge,
+ bool &OriginalOpsConstant) const;
+ const DeadExpression *createDeadExpression() const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *) const;
+ LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
+
+ // Congruence class handling.
+ CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
+ auto *result = new CongruenceClass(NextCongruenceNum++, Leader, E);
+ CongruenceClasses.emplace_back(result);
+ return result;
+ }
+
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+ auto *CC = createCongruenceClass(nullptr, nullptr);
+ CC->setMemoryLeader(MA);
+ return CC;
+ }
+
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+ auto *CC = getMemoryClass(MA);
+ if (CC->getMemoryLeader() != MA)
+ CC = createMemoryClass(MA);
+ return CC;
+ }
+
+ CongruenceClass *createSingletonCongruenceClass(Value *Member) {
+ CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
+ CClass->insert(Member);
+ ValueToClass[Member] = CClass;
+ return CClass;
+ }
+
+ void initializeCongruenceClasses(Function &F);
+ const Expression *makePossiblePHIOfOps(Instruction *,
+ SmallPtrSetImpl<Value *> &);
+ Value *findLeaderForInst(Instruction *ValueOp,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB);
+ bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist);
+ bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &);
+ void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
+ void removePhiOfOps(Instruction *I, PHINode *PHITemp);
+
+ // Value number an Instruction or MemoryPhi.
+ void valueNumberMemoryPhi(MemoryPhi *);
+ void valueNumberInstruction(Instruction *);
+
+ // Symbolic evaluation.
+ const Expression *checkSimplificationResults(Expression *, Instruction *,
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *,
+ SmallPtrSetImpl<Value *> &) const;
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ void sortPHIOps(MutableArrayRef<ValPair> Ops) const;
+ const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>,
+ Instruction *I,
+ BasicBlock *PHIBlock) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
+
+ // Congruence finding.
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
+ CongruenceClass *getClassForExpression(const Expression *E) const;
+ void performCongruenceFinding(Instruction *, const Expression *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTOP(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
+ // Reachability handling.
+ void updateReachableEdge(BasicBlock *, BasicBlock *);
+ void processOutgoingEdges(Instruction *, BasicBlock *);
+ Value *findConditionEquivalence(Value *) const;
+
+ // Elimination.
+ struct ValueDFS;
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
+
+ bool eliminateInstructions(Function &);
+ void replaceInstruction(Instruction *, Value *);
+ void markInstructionForDeletion(Instruction *);
+ void deleteInstructionsInBlock(BasicBlock *);
+ Value *findPHIOfOpsLeader(const Expression *, const Instruction *,
+ const BasicBlock *) const;
+
+ // Various instruction touch utilities
+ template <typename Map, typename KeyType>
+ void touchAndErase(Map &, const KeyType &);
+ void markUsersTouched(Value *);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void markPhiOfOpsChanged(const Expression *E);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
+ void addAdditionalUsers(Value *To, Value *User) const;
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
+
+ // Utilities.
+ void cleanupTables();
+ std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+ void updateProcessedCount(const Value *V);
+ void verifyMemoryCongruency() const;
+ void verifyIterationSettled(Function &F);
+ void verifyStoreExpressions() const;
+ bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
+ const MemoryAccess *, const MemoryAccess *) const;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E) const;
+ MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
+ MemoryPhi *getMemoryAccess(const BasicBlock *) const;
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
+
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a value so it can be used with Use's, which will
+ // auto-convert to Value's but not to MemoryAccess's.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+ assert(isa<MemoryAccess>(MA) &&
+ "This should not be used with instructions");
+ return isa<MemoryUseOrDef>(MA)
+ ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+ : InstrDFS.lookup(MA);
+ }
+
+ bool isCycleFree(const Instruction *) const;
+ bool isBackedge(BasicBlock *From, BasicBlock *To) const;
+
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ int64_t StartingVNCounter = 0;
+};
+
+} // end anonymous namespace
+
+template <typename T>
+static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
+ return false;
+ return LHS.MemoryExpression::equals(RHS);
+}
+
+bool LoadExpression::equals(const Expression &Other) const {
+ return equalsLoadStoreHelper(*this, Other);
+}
+
+bool StoreExpression::equals(const Expression &Other) const {
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
+}
+
+// Determine if the edge From->To is a backedge
+bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
+ return From == To ||
+ RPOOrdering.lookup(DT->getNode(From)) >=
+ RPOOrdering.lookup(DT->getNode(To));
+}
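The check above never consults dominance directly: an edge is treated as a back edge exactly when its target does not come strictly later in the RPO numbering. The following standalone sketch (not LLVM code; block names and RPO numbers are invented) shows the same comparison on a tiny loop:

#include <string>
#include <unordered_map>

// Hypothetical RPO numbers for a tiny loop: entry(0) -> header(1) -> body(2),
// with body branching back to header.
static const std::unordered_map<std::string, unsigned> RPONumber = {
    {"entry", 0}, {"header", 1}, {"body", 2}};

// Same test as the isBackedge above: self edges and edges that do not move
// strictly forward in RPO count as back edges.
static bool isBackedgeSketch(const std::string &From, const std::string &To) {
  return From == To || RPONumber.at(From) >= RPONumber.at(To);
}

int main() {
  bool BodyToHeader = isBackedgeSketch("body", "header");   // true: 2 >= 1
  bool EntryToHeader = isBackedgeSketch("entry", "header"); // false: 0 < 1
  return (BodyToHeader && !EntryToHeader) ? 0 : 1;
}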
+
+#ifndef NDEBUG
+static std::string getBlockName(const BasicBlock *B) {
+ return DOTGraphTraits<DOTFuncInfo *>::getSimpleNodeLabel(B, nullptr);
+}
+#endif
+
+// Get a MemoryAccess for an instruction, fake or real.
+MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
+ auto *Result = MSSA->getMemoryAccess(I);
+ return Result ? Result : TempToMemory.lookup(I);
+}
+
+// Get a MemoryPhi for a basic block. These are all real.
+MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
+ return MSSA->getMemoryAccess(BB);
+}
+
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto *Parent = I->getParent();
+ if (Parent)
+ return Parent;
+ Parent = TempToBlock.lookup(V);
+ assert(Parent && "Every fake instruction should have a block");
+ return Parent;
+ }
+
+ auto *MP = dyn_cast<MemoryPhi>(V);
+ assert(MP && "Should have been an instruction or a MemoryPhi");
+ return MP->getBlock();
+}
+
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) const {
+ assert(isa<BasicExpression>(E));
+ auto *BE = cast<BasicExpression>(E);
+ const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+}
+
+// If V is a predicateinfo copy, get the thing it is a copy of.
+static Value *getCopyOf(const Value *V) {
+ if (auto *II = dyn_cast<IntrinsicInst>(V))
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ return II->getOperand(0);
+ return nullptr;
+}
+
+// Return true if V is really PN, even accounting for predicateinfo copies.
+static bool isCopyOfPHI(const Value *V, const PHINode *PN) {
+ return V == PN || getCopyOf(V) == PN;
+}
+
+static bool isCopyOfAPHI(const Value *V) {
+ auto *CO = getCopyOf(V);
+ return CO && isa<PHINode>(CO);
+}
+
+// Sort PHI Operands into a canonical order. What we use here is an RPO
+// order. The BlockInstRange numbers are generated in an RPO walk of the basic
+// blocks.
+void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
+ llvm::sort(Ops, [&](const ValPair &P1, const ValPair &P2) {
+ return BlockInstRange.lookup(P1.second).first <
+ BlockInstRange.lookup(P2.second).first;
+ });
+}
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
+}
+
+// Create a PHIExpression from an array of {incoming edge, value} pairs. I is
+// the original instruction we are creating a PHIExpression for (but may not be
+// a phi node). We require, as an invariant, that all the PHIOperands in the
+// same block are sorted the same way. sortPHIOps will sort them into a
+// canonical order.
+PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands,
+ const Instruction *I,
+ BasicBlock *PHIBlock,
+ bool &HasBackedge,
+ bool &OriginalOpsConstant) const {
+ unsigned NumOps = PHIOperands.size();
+ auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock);
+
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(PHIOperands.begin()->first->getType());
+ E->setOpcode(Instruction::PHI);
+
+ // Filter out unreachable phi operands.
+ auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) {
+ auto *BB = P.second;
+ if (auto *PHIOp = dyn_cast<PHINode>(I))
+ if (isCopyOfPHI(P.first, PHIOp))
+ return false;
+ if (!ReachableEdges.count({BB, PHIBlock}))
+ return false;
+ // Things in TOPClass are equivalent to everything.
+ if (ValueToClass.lookup(P.first) == TOPClass)
+ return false;
+ OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first);
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ return lookupOperandLeader(P.first) != I;
+ });
+ std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
+ [&](const ValPair &P) -> Value * {
+ return lookupOperandLeader(P.first);
+ });
+ return E;
+}
+
+// Set basic expression info (Arguments, type, opcode) for Expression
+// E from Instruction I in block B.
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
+ bool AllConstant = true;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ E->setType(GEP->getSourceElementType());
+ else
+ E->setType(I->getType());
+ E->setOpcode(I->getOpcode());
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+
+ // Transform the operand array into an operand leader array, and keep track of
+ // whether all members are constant.
+ std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
+ auto Operand = lookupOperandLeader(O);
+ AllConstant = AllConstant && isa<Constant>(Operand);
+ return Operand;
+ });
+
+ return AllConstant;
+}
+
+const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
+ Value *Arg1, Value *Arg2,
+ Instruction *I) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(2);
+
+ E->setType(T);
+ E->setOpcode(Opcode);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ if (Instruction::isCommutative(Opcode)) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ if (shouldSwapOperands(Arg1, Arg2))
+ std::swap(Arg1, Arg2);
+ }
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
+
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ return E;
+}
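The operand swap above is what makes a+b and b+a land in the same congruence class: after canonical ordering, both produce an identical (opcode, operand-leader) key. A minimal self-contained sketch of that idea, with invented integer value numbers standing in for operand leaders and the expression allocator:

#include <cassert>
#include <tuple>
#include <utility>

// Toy expression key: (opcode, value number of op0, value number of op1).
using ExprKey = std::tuple<unsigned, unsigned, unsigned>;

// Stand-in for shouldSwapOperands: order commutative operands by value number
// so permutations of the same operands build the same key.
static ExprKey makeCommutativeKey(unsigned Opcode, unsigned VN0, unsigned VN1) {
  if (VN0 > VN1)
    std::swap(VN0, VN1);
  return {Opcode, VN0, VN1};
}

int main() {
  const unsigned Add = 13; // arbitrary opcode id for this sketch
  // a+b and b+a produce the same key once operands are canonically ordered.
  assert(makeCommutativeKey(Add, /*a=*/1, /*b=*/2) ==
         makeCommutativeKey(Add, /*b=*/2, /*a=*/1));
  return 0;
}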
+
+// Take a Value returned by simplification of Expression E/Instruction
+// I, and see if it resulted in a simpler expression. If so, return
+// that expression.
+const Expression *NewGVN::checkSimplificationResults(Expression *E,
+ Instruction *I,
+ Value *V) const {
+ if (!V)
+ return nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (I)
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
+ NumGVNOpsSimplified++;
+ assert(isa<BasicExpression>(E) &&
+ "We should always have had a basic expression here");
+ deleteExpression(E);
+ return createConstantExpression(C);
+ } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ if (I)
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
+ deleteExpression(E);
+ return createVariableExpression(V);
+ }
+
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC) {
+ if (CC->getLeader() && CC->getLeader() != I) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+ return createVariableOrConstant(CC->getLeader());
+ }
+ if (CC->getDefiningExpr()) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+
+ if (I)
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
+ NumGVNOpsSimplified++;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
+ }
+ }
+
+ return nullptr;
+}
+
+// Create a value expression from the instruction I, replacing operands with
+// their leaders.
+
+const Expression *NewGVN::createExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
+
+ bool AllConstant = setBasicExpressionInfo(I, E);
+
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
+ E->swapOperands(0, 1);
+ }
+ // Perform simplification.
+ if (auto *CI = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value
+ // number.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
+ E->swapOperands(0, 1);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ E->setOpcode((CI->getOpcode() << 8) | Predicate);
+ // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
+ assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
+ "Wrong types on cmp instruction");
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V =
+ SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<SelectInst>(I)) {
+ if (isa<Constant>(E->getOperand(0)) ||
+ E->getOperand(1) == E->getOperand(2)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
+ Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
+ E->getOperand(2), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ } else if (I->isBinaryOp()) {
+ Value *V =
+ SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ Value *V =
+ SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<GetElementPtrInst>(I)) {
+ Value *V = SimplifyGEPInst(
+ E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (AllConstant) {
+ // We don't bother trying to simplify unless all of the operands
+ // were constant.
+ // TODO: There are a lot of Simplify*'s we could call here, if we
+ // wanted to. The original motivating case for this code was a
+ // zext i1 false to i8, which we don't have an interface to
+ // simplify (IE there is no SimplifyZExt).
+
+ SmallVector<Constant *, 8> C;
+ for (Value *Arg : E->operands())
+ C.emplace_back(cast<Constant>(Arg));
+
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ return E;
+}
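Comparisons get the same canonicalization, but the predicate has to be flipped along with the operands so that x < y and y > x share one expression, and the packed (opcode << 8) | predicate field keeps different predicates distinct. A rough standalone sketch with an invented predicate enum (not LLVM's CmpInst::Predicate) and an arbitrary opcode constant:

#include <cassert>
#include <tuple>
#include <utility>

// Invented predicate encoding, only for illustration.
enum Pred : unsigned { LT, GT, LE, GE, EQ, NE };

static Pred swappedPredicate(Pred P) {
  switch (P) {
  case LT: return GT;
  case GT: return LT;
  case LE: return GE;
  case GE: return LE;
  default: return P; // EQ and NE are symmetric.
  }
}

// Mirrors the (CmpOpcode << 8) | Predicate packing used above; 53 is an
// arbitrary stand-in for the compare opcode.
static std::tuple<unsigned, unsigned, unsigned>
makeCmpKey(Pred P, unsigned VN0, unsigned VN1) {
  if (VN0 > VN1) { // stand-in for shouldSwapOperands
    std::swap(VN0, VN1);
    P = swappedPredicate(P);
  }
  return {(53u << 8) | P, VN0, VN1};
}

int main() {
  // "x < y" (value numbers 1, 2) and "y > x" (2, 1) get identical keys.
  assert(makeCmpKey(LT, 1, 2) == makeCmpKey(GT, 2, 1));
  return 0;
}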
+
+const AggregateValueExpression *
+NewGVN::createAggregateValueExpression(Instruction *I) const {
+ if (auto *II = dyn_cast<InsertValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
+ setBasicExpressionInfo(I, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
+ return E;
+ } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
+ setBasicExpressionInfo(EI, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
+ return E;
+ }
+ llvm_unreachable("Unhandled type of aggregate value operation");
+}
+
+const DeadExpression *NewGVN::createDeadExpression() const {
+ // DeadExpression has no arguments and all DeadExpression's are the same,
+ // so we only need one of them.
+ return SingletonDeadExpression;
+}
+
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
+ auto *E = new (ExpressionAllocator) VariableExpression(V);
+ E->setOpcode(V->getValueID());
+ return E;
+}
+
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
+ if (auto *C = dyn_cast<Constant>(V))
+ return createConstantExpression(C);
+ return createVariableExpression(V);
+}
+
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
+ auto *E = new (ExpressionAllocator) ConstantExpression(C);
+ E->setOpcode(C->getValueID());
+ return E;
+}
+
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) UnknownExpression(I);
+ E->setOpcode(I->getOpcode());
+ return E;
+}
+
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
+ // FIXME: Add operand bundles for calls.
// FIXME: Allow commutative matching for intrinsics.
- auto *E =
- new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
- setBasicExpressionInfo(CI, E);
- return E;
-}
-
-// Return true if some equivalent of instruction Inst dominates instruction U.
-bool NewGVN::someEquivalentDominates(const Instruction *Inst,
- const Instruction *U) const {
- auto *CC = ValueToClass.lookup(Inst);
- // This must be an instruction because we are only called from phi nodes
- // in the case that the value it needs to check against is an instruction.
-
- // The most likely candidates for dominance are the leader and the next leader.
- // The leader or nextleader will dominate in all cases where there is an
- // equivalent that is higher up in the dom tree.
- // We can't *only* check them, however, because the
- // dominator tree could have an infinite number of non-dominating siblings
- // with instructions that are in the right congruence class.
- // A
- // B C D E F G
- // |
- // H
- // Instruction U could be in H, with equivalents in every other sibling.
- // Depending on the rpo order picked, the leader could be the equivalent in
- // any of these siblings.
- if (!CC)
- return false;
- if (alwaysAvailable(CC->getLeader()))
- return true;
- if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
- return true;
- if (CC->getNextLeader().first &&
- DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
- return true;
- return llvm::any_of(*CC, [&](const Value *Member) {
- return Member != CC->getLeader() &&
- DT->dominates(cast<Instruction>(Member), U);
- });
-}
-
-// See if we have a congruence class and leader for this operand, and if so,
-// return it. Otherwise, return the operand itself.
-Value *NewGVN::lookupOperandLeader(Value *V) const {
- CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC) {
- // Everything in TOP is represented by undef, as it can be any value.
- // We do have to make sure we get the type right though, so we can't set the
- // RepLeader to undef.
- if (CC == TOPClass)
- return UndefValue::get(V->getType());
- return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
- }
-
- return V;
-}
-
-const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
- auto *CC = getMemoryClass(MA);
- assert(CC->getMemoryLeader() &&
- "Every MemoryAccess should be mapped to a congruence class with a "
- "representative memory access");
- return CC->getMemoryLeader();
-}
-
-// Return true if the MemoryAccess is really equivalent to everything. This is
-// equivalent to the lattice value "TOP" in most lattices. This is the initial
-// state of all MemoryAccesses.
-bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
- return getMemoryClass(MA) == TOPClass;
-}
-
-LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
- LoadInst *LI,
- const MemoryAccess *MA) const {
- auto *E =
- new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- E->setType(LoadType);
-
- // Give stores and loads the same opcode so they value number together.
- E->setOpcode(0);
- E->op_push_back(PointerOp);
-
- // TODO: Value number heap versions. We may be able to discover
- // things alias analysis can't on its own (IE that a store and a
- // load have the same value, and thus, it isn't clobbering the load).
- return E;
-}
-
-const StoreExpression *
-NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
- auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
- auto *E = new (ExpressionAllocator)
- StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- E->setType(SI->getValueOperand()->getType());
-
- // Give stores and loads the same opcode so they value number together.
- E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
-
- // TODO: Value number heap versions. We may be able to discover
- // things alias analysis can't on its own (IE that a store and a
- // load have the same value, and thus, it isn't clobbering the load).
- return E;
-}
-
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
- // Unlike loads, we never try to eliminate stores, so we do not check if they
- // are simple and avoid value numbering them.
- auto *SI = cast<StoreInst>(I);
- auto *StoreAccess = getMemoryAccess(SI);
- // Get the expression, if any, for the RHS of the MemoryDef.
- const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
- if (EnableStoreRefinement)
- StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
- // If we bypassed the use-def chains, make sure we add a use.
- StoreRHS = lookupMemoryLeader(StoreRHS);
- if (StoreRHS != StoreAccess->getDefiningAccess())
- addMemoryUsers(StoreRHS, StoreAccess);
- // If we are defined by ourselves, use the live on entry def.
- if (StoreRHS == StoreAccess)
- StoreRHS = MSSA->getLiveOnEntryDef();
-
- if (SI->isSimple()) {
- // See if we are defined by a previous store expression, it already has a
- // value, and it's the same value as our current store. FIXME: Right now, we
- // only do this for simple stores, we should expand to cover memcpys, etc.
- const auto *LastStore = createStoreExpression(SI, StoreRHS);
- const auto *LastCC = ExpressionToClass.lookup(LastStore);
- // We really want to check whether the expression we matched was a store. No
- // easy way to do that. However, we can check that the class we found has a
- // store, which, assuming the value numbering state is not corrupt, is
- // sufficient, because we must also be equivalent to that store's expression
- // for it to be in the same class as the load.
- if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
- return LastStore;
- // Also check if our value operand is defined by a load of the same memory
- // location, and the memory state is the same as it was then (otherwise, it
- // could have been overwritten later. See test32 in
- // transforms/DeadStoreElimination/simple.ll).
- if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
- if ((lookupOperandLeader(LI->getPointerOperand()) ==
- LastStore->getOperand(0)) &&
- (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
- StoreRHS))
- return LastStore;
- deleteExpression(LastStore);
- }
-
- // If the store is not equivalent to anything, value number it as a store that
- // produces a unique memory state (instead of using its MemoryUse, we use
- // its MemoryDef).
- return createStoreExpression(SI, StoreAccess);
-}
-
-// See if we can extract the value of a loaded pointer from a load, a store, or
-// a memory instruction.
-const Expression *
-NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
- LoadInst *LI, Instruction *DepInst,
- MemoryAccess *DefiningAccess) const {
- assert((!LI || LI->isSimple()) && "Not a simple load");
- if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
- // Can't forward from non-atomic to atomic without violating memory model.
- // Also don't need to coerce if they are the same type, we will just
- // propagate.
- if (LI->isAtomic() > DepSI->isAtomic() ||
- LoadType == DepSI->getValueOperand()->getType())
- return nullptr;
- int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
- if (Offset >= 0) {
- if (auto *C = dyn_cast<Constant>(
- lookupOperandLeader(DepSI->getValueOperand()))) {
- LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI
- << " to constant " << *C << "\n");
- return createConstantExpression(
- getConstantStoreValueForLoad(C, Offset, LoadType, DL));
- }
- }
- } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
- // Can't forward from non-atomic to atomic without violating memory model.
- if (LI->isAtomic() > DepLI->isAtomic())
- return nullptr;
- int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
- if (Offset >= 0) {
- // We can coerce a constant load into a load.
- if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
- if (auto *PossibleConstant =
- getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
- LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
- << " to constant " << *PossibleConstant << "\n");
- return createConstantExpression(PossibleConstant);
- }
- }
- } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
- int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
- if (Offset >= 0) {
- if (auto *PossibleConstant =
- getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
- LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
- << " to constant " << *PossibleConstant << "\n");
- return createConstantExpression(PossibleConstant);
- }
- }
- }
-
- // All of the below are only true if the loaded pointer is produced
- // by the dependent instruction.
- if (LoadPtr != lookupOperandLeader(DepInst) &&
- !AA->isMustAlias(LoadPtr, DepInst))
- return nullptr;
- // If this load really doesn't depend on anything, then we must be loading an
- // undef value. This can happen when loading for a fresh allocation with no
- // intervening stores, for example. Note that this is only true in the case
- // that the result of the allocation is pointer equal to the load ptr.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI)) {
- return createConstantExpression(UndefValue::get(LoadType));
- }
- // If this load occurs right after a lifetime begin,
- // then the loaded value is undefined.
- else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- return createConstantExpression(UndefValue::get(LoadType));
- }
- // If this load follows a calloc (which zero-initializes memory),
- // then the loaded value is zero.
- else if (isCallocLikeFn(DepInst, TLI)) {
- return createConstantExpression(Constant::getNullValue(LoadType));
- }
-
- return nullptr;
-}
-
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
- auto *LI = cast<LoadInst>(I);
-
- // We can eliminate in favor of non-simple loads, but we won't be able to
- // eliminate the loads themselves.
- if (!LI->isSimple())
- return nullptr;
-
- Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
- // Load of undef is undef.
- if (isa<UndefValue>(LoadAddressLeader))
- return createConstantExpression(UndefValue::get(LI->getType()));
- MemoryAccess *OriginalAccess = getMemoryAccess(I);
- MemoryAccess *DefiningAccess =
- MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
-
- if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
- if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
- Instruction *DefiningInst = MD->getMemoryInst();
- // If the defining instruction is not reachable, replace with undef.
- if (!ReachableBlocks.count(DefiningInst->getParent()))
- return createConstantExpression(UndefValue::get(LI->getType()));
- // This will handle stores and memory insts. We only do this if the
- // defining access has a different type, or it is a pointer produced by
- // certain memory operations that cause the memory to have a fixed value
- // (IE things like calloc).
- if (const auto *CoercionResult =
- performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
- DefiningInst, DefiningAccess))
- return CoercionResult;
- }
- }
-
- const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
- DefiningAccess);
- // If our MemoryLeader is not our defining access, add a use to the
- // MemoryLeader, so that we get reprocessed when it changes.
- if (LE->getMemoryLeader() != DefiningAccess)
- addMemoryUsers(LE->getMemoryLeader(), OriginalAccess);
- return LE;
-}
-
-const Expression *
-NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
- auto *PI = PredInfo->getPredicateInfoFor(I);
- if (!PI)
- return nullptr;
-
- LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
-
+ auto *E =
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
+ return E;
+}
+
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+ // The most likely candidates for dominance are the leader and the next leader.
+ // The leader or nextleader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (alwaysAvailable(CC->getLeader()))
+ return true;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
+// See if we have a congruence class and leader for this operand, and if so,
+// return it. Otherwise, return the operand itself.
+Value *NewGVN::lookupOperandLeader(Value *V) const {
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC) {
+ // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
+ return V;
+}
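Leader lookup is the workhorse of the whole pass: every operand is replaced by the leader of its congruence class before an expression is built, so congruent operands produce identical expressions. A toy version of the lookup, ignoring the TOP/undef and stored-value special cases above (names here are illustrative, not NewGVN's):

#include <string>
#include <unordered_map>

// Minimal stand-in for a congruence class: just a representative leader.
struct ToyClass {
  std::string Leader;
};

static std::unordered_map<std::string, const ToyClass *> ValueToClassSketch;

// Same shape as lookupOperandLeader: fall back to the value itself when it
// has not been placed in a class yet.
static std::string lookupLeaderSketch(const std::string &V) {
  auto It = ValueToClassSketch.find(V);
  return It != ValueToClassSketch.end() ? It->second->Leader : V;
}

int main() {
  static const ToyClass C{"a"}; // class whose leader is "a"
  ValueToClassSketch["b"] = &C; // "b" was proven congruent to "a"
  bool OK = lookupLeaderSketch("b") == "a" && lookupLeaderSketch("c") == "c";
  return OK ? 0 : 1;
}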
+
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
+}
+
+LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
+ LoadInst *LI,
+ const MemoryAccess *MA) const {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(LoadType);
+
+ // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(PointerOp);
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(SI->getValueOperand()->getType());
+
+ // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
+ // Unlike loads, we never try to eliminate stores, so we do not check if they
+ // are simple and avoid value numbering them.
+ auto *SI = cast<StoreInst>(I);
+ auto *StoreAccess = getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
+ if (SI->isSimple()) {
+ // See if we are defined by a previous store expression, it already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores, we should expand to cover memcpys, etc.
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
+ // We really want to check whether the expression we matched was a store. No
+ // easy way to do that. However, we can check that the class we found has a
+ // store, which, assuming the value numbering state is not corrupt, is
+ // sufficient, because we must also be equivalent to that store's expression
+ // for it to be in the same class as the load.
+ if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
+ return LastStore;
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later. See test32 in
+ // transforms/DeadStoreElimination/simple.ll).
+ if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ LastStore->getOperand(0)) &&
+ (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return LastStore;
+ deleteExpression(LastStore);
+ }
+
+ // If the store is not equivalent to anything, value number it as a store that
+ // produces a unique memory state (instead of using its MemoryUse, we use
+ // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
+}
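In effect, the code above makes a store congruent to an earlier store when the pointer leader, the stored-value leader, and the incoming memory state all match; otherwise the store defines a fresh memory state. The sketch below models just that keying decision with plain strings and an integer memory-state version, a deliberate simplification of MemorySSA:

#include <map>
#include <string>
#include <tuple>

// Key: (pointer leader, memory-state version, stored-value leader).
using StoreKey = std::tuple<std::string, unsigned, std::string>;

static std::map<StoreKey, int> KnownStores;

// Returns the id of an equivalent earlier store, or records NewId when this
// store defines a genuinely new memory state (the fallback to the store's
// own MemoryDef above).
static int valueNumberStoreSketch(const std::string &PtrLeader,
                                  unsigned MemVersion,
                                  const std::string &ValLeader, int NewId) {
  StoreKey K{PtrLeader, MemVersion, ValLeader};
  auto It = KnownStores.find(K);
  if (It != KnownStores.end())
    return It->second; // congruent to an earlier store
  KnownStores[K] = NewId;
  return NewId;
}

int main() {
  int S1 = valueNumberStoreSketch("p", 0, "x", 1); // first store of x to p
  int S2 = valueNumberStoreSketch("p", 0, "x", 2); // same value, same memory
  int S3 = valueNumberStoreSketch("p", 1, "x", 3); // memory changed in between
  return (S1 == 1 && S2 == 1 && S3 == 3) ? 0 : 1;
}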
+
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) const {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ // Also don't need to coerce if they are the same type, we will just
+ // propagate.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI
+ << " to constant " << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+ } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+ // We can coerce a constant load into a load.
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading for a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ isAlignedAllocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load occurs right after a lifetime begin,
+ // then the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero-initializes memory),
+ // then the loaded value is zero.
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
+ }
+
+ return nullptr;
+}
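The coercion path answers a load from a clobbering store of a constant even when the types differ, by slicing the stored bits at the offset computed by analyzeLoadFromClobberingStore. A tiny model of that slicing, assuming a little-endian host and a 32-bit stored constant (the real getConstantStoreValueForLoad handles far more cases):

#include <cstdint>
#include <cstring>

// Extract LoadBytes bytes at ByteOffset from a 32-bit stored constant,
// assuming a little-endian host (an explicit simplification).
static uint32_t sliceStoredConstant(uint32_t Stored, unsigned ByteOffset,
                                    unsigned LoadBytes) {
  unsigned char Bytes[4];
  std::memcpy(Bytes, &Stored, sizeof(Bytes));
  uint32_t Result = 0;
  std::memcpy(&Result, Bytes + ByteOffset, LoadBytes);
  return Result;
}

int main() {
  // store i32 0x11223344 to %p; an i8 load at %p+1 sees 0x33 on little-endian.
  return sliceStoredConstant(0x11223344u, 1, 1) == 0x33 ? 0 : 1;
}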
+
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
+ auto *LI = cast<LoadInst>(I);
+
+ // We can eliminate in favor of non-simple loads, but we won't be able to
+ // eliminate the loads themselves.
+ if (!LI->isSimple())
+ return nullptr;
+
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
+ // Load of undef is undef.
+ if (isa<UndefValue>(LoadAddressLeader))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ MemoryAccess *OriginalAccess = getMemoryAccess(I);
+ MemoryAccess *DefiningAccess =
+ MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
+
+ if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
+ if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
+ Instruction *DefiningInst = MD->getMemoryInst();
+ // If the defining instruction is not reachable, replace with undef.
+ if (!ReachableBlocks.count(DefiningInst->getParent()))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ // This will handle stores and memory insts. We only do this if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
+ }
+ }
+
+ const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
+ DefiningAccess);
+ // If our MemoryLeader is not our defining access, add a use to the
+ // MemoryLeader, so that we get reprocessed when it changes.
+ if (LE->getMemoryLeader() != DefiningAccess)
+ addMemoryUsers(LE->getMemoryLeader(), OriginalAccess);
+ return LE;
+}
+
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
const Optional<PredicateConstraint> &Constraint = PI->getConstraint();
if (!Constraint)
- return nullptr;
-
+ return nullptr;
+
CmpInst::Predicate Predicate = Constraint->Predicate;
Value *CmpOp0 = I->getOperand(0);
Value *CmpOp1 = Constraint->OtherOp;
-
+
Value *FirstOp = lookupOperandLeader(CmpOp0);
Value *SecondOp = lookupOperandLeader(CmpOp1);
Value *AdditionallyUsedValue = CmpOp0;
-
- // Sort the ops.
- if (shouldSwapOperands(FirstOp, SecondOp)) {
- std::swap(FirstOp, SecondOp);
+
+ // Sort the ops.
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
Predicate = CmpInst::getSwappedPredicate(Predicate);
AdditionallyUsedValue = CmpOp1;
- }
-
+ }
+
if (Predicate == CmpInst::ICMP_EQ) {
addPredicateUsers(PI, I);
addAdditionalUsers(AdditionallyUsedValue, I);
return createVariableOrConstant(FirstOp);
- }
+ }
// Handle the special case of floating point.
if (Predicate == CmpInst::FCMP_OEQ && isa<ConstantFP>(FirstOp) &&
@@ -1566,2616 +1566,2616 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
addPredicateUsers(PI, I);
addAdditionalUsers(AdditionallyUsedValue, I);
return createConstantExpression(cast<Constant>(FirstOp));
- }
-
- return nullptr;
-}
-
-// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
- auto *CI = cast<CallInst>(I);
- if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- // Intrinsics with the returned attribute are copies of arguments.
- if (auto *ReturnedValue = II->getReturnedArgOperand()) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy)
- if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
- return Result;
- return createVariableOrConstant(ReturnedValue);
- }
- }
- if (AA->doesNotAccessMemory(CI)) {
- return createCallExpression(CI, TOPClass->getMemoryLeader());
- } else if (AA->onlyReadsMemory(CI)) {
- if (auto *MA = MSSA->getMemoryAccess(CI)) {
- auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA);
- return createCallExpression(CI, DefiningAccess);
- } else // MSSA determined that CI does not access memory.
- return createCallExpression(CI, TOPClass->getMemoryLeader());
- }
- return nullptr;
-}
-
-// Retrieve the memory class for a given MemoryAccess.
-CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
- auto *Result = MemoryAccessToClass.lookup(MA);
- assert(Result && "Should have found memory class");
- return Result;
-}
-
-// Update the MemoryAccess equivalence table to say that From is equal to To,
-// and return true if this is different from what already existed in the table.
-bool NewGVN::setMemoryClass(const MemoryAccess *From,
- CongruenceClass *NewClass) {
- assert(NewClass &&
- "Every MemoryAccess should be getting mapped to a non-null class");
- LLVM_DEBUG(dbgs() << "Setting " << *From);
- LLVM_DEBUG(dbgs() << " equivalent to congruence class ");
- LLVM_DEBUG(dbgs() << NewClass->getID()
- << " with current MemoryAccess leader ");
- LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
-
- auto LookupResult = MemoryAccessToClass.find(From);
- bool Changed = false;
- // If it's already in the table, see if the value changed.
- if (LookupResult != MemoryAccessToClass.end()) {
- auto *OldClass = LookupResult->second;
- if (OldClass != NewClass) {
- // If this is a phi, we have to handle memory member updates.
- if (auto *MP = dyn_cast<MemoryPhi>(From)) {
- OldClass->memory_erase(MP);
- NewClass->memory_insert(MP);
- // This may have killed the class if it had no non-memory members
- if (OldClass->getMemoryLeader() == From) {
- if (OldClass->definesNoMemory()) {
- OldClass->setMemoryLeader(nullptr);
- } else {
- OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- LLVM_DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of a memory member " << *From
- << "\n");
- markMemoryLeaderChangeTouched(OldClass);
- }
- }
- }
- // It wasn't equivalent before, and now it is.
- LookupResult->second = NewClass;
- Changed = true;
- }
- }
-
- return Changed;
-}
-
- // Determine if an instruction is cycle-free. That means the values in the
-// instruction don't depend on any expressions that can change value as a result
-// of the instruction. For example, a non-cycle free instruction would be v =
-// phi(0, v+1).
-bool NewGVN::isCycleFree(const Instruction *I) const {
- // In order to compute cycle-freeness, we do SCC finding on the instruction,
- // and see what kind of SCC it ends up in. If it is a singleton, it is
- // cycle-free. If it is not in a singleton, it is only cycle free if the
- // other members are all phi nodes (as they do not compute anything, they are
- // copies).
- auto ICS = InstCycleState.lookup(I);
- if (ICS == ICS_Unknown) {
- SCCFinder.Start(I);
- auto &SCC = SCCFinder.getComponentFor(I);
- // It's cycle free if it's size 1 or the SCC is *only* phi nodes.
- if (SCC.size() == 1)
- InstCycleState.insert({I, ICS_CycleFree});
- else {
- bool AllPhis = llvm::all_of(SCC, [](const Value *V) {
- return isa<PHINode>(V) || isCopyOfAPHI(V);
- });
- ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
- for (auto *Member : SCC)
- if (auto *MemberPhi = dyn_cast<PHINode>(Member))
- InstCycleState.insert({MemberPhi, ICS});
- }
- }
- if (ICS == ICS_Cycle)
- return false;
- return true;
-}
-
-// Evaluate PHI nodes symbolically and create an expression result.
-const Expression *
-NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
- Instruction *I,
- BasicBlock *PHIBlock) const {
- // True if one of the incoming phi edges is a backedge.
- bool HasBackedge = false;
- // OriginalOpsConstant tracks whether all the *original* phi operands were
- // constant. This is really shorthand for "this phi cannot cycle due to
- // forward propagation", as any change in value of the phi is guaranteed not
- // to later change the value of the phi. IE it can't be v = phi(undef, v+1)
- bool OriginalOpsConstant = true;
- auto *E = cast<PHIExpression>(createPHIExpression(
- PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant));
- // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
- // See if all arguments are the same.
- // We track if any were undef because they need special handling.
- bool HasUndef = false;
- auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
- if (isa<UndefValue>(Arg)) {
- HasUndef = true;
- return false;
- }
- return true;
- });
- // If we are left with no operands, it's dead.
- if (Filtered.empty()) {
- // If it has undef at this point, it means there are no non-undef arguments,
- // and thus, the value of the phi node must be undef.
- if (HasUndef) {
- LLVM_DEBUG(
- dbgs() << "PHI Node " << *I
- << " has no non-undef arguments, valuing it as undef\n");
- return createConstantExpression(UndefValue::get(I->getType()));
- }
-
- LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
- deleteExpression(E);
- return createDeadExpression();
- }
- Value *AllSameValue = *(Filtered.begin());
- ++Filtered.begin();
- // Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) {
- // In LLVM's non-standard representation of phi nodes, it's possible to have
- // phi nodes with cycles (IE dependent on other phis that are .... dependent
- // on the original phi node), especially in weird CFG's where some arguments
- // are unreachable, or uninitialized along certain paths. This can cause
- // infinite loops during evaluation. We work around this by not trying to
- // really evaluate them independently, but instead using a variable
- // expression to say if one is equivalent to the other.
- // We also special case undef, so that if we have an undef, we can't use the
- // common value unless it dominates the phi block.
- if (HasUndef) {
- // If we have undef and at least one other value, this is really a
- // multivalued phi, and we need to know if it's cycle free in order to
- // evaluate whether we can ignore the undef. The other parts of this are
- // just shortcuts. If there is no backedge, or all operands are
- // constants, it also must be cycle free.
- if (HasBackedge && !OriginalOpsConstant &&
- !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
- return E;
-
- // Only have to check for instructions
- if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
- if (!someEquivalentDominates(AllSameInst, I))
- return E;
- }
- // Can't simplify to something that comes later in the iteration.
- // Otherwise, when and if it changes congruence class, we will never catch
- // up. We will always be a class behind it.
- if (isa<Instruction>(AllSameValue) &&
- InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
- return E;
- NumGVNPhisAllSame++;
- LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
- << "\n");
- deleteExpression(E);
- return createVariableOrConstant(AllSameValue);
- }
- return E;
-}
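Once the unreachable, self, and TOP operands have been filtered out, the rule above reduces to: if every remaining non-undef incoming leader is the same value, the phi is that value. A compact sketch of that filter-and-compare step, using an empty string as a stand-in for undef and leaving out the cycle-freeness and dominance safeguards:

#include <optional>
#include <string>
#include <vector>

// Returns the single value all non-"undef" incoming leaders agree on, if any.
// The empty string plays the role of undef here; the real code additionally
// checks cycle-freeness and dominance before trusting this answer.
static std::optional<std::string>
simplifyPhiSketch(const std::vector<std::string> &IncomingLeaders) {
  std::optional<std::string> Common;
  for (const std::string &V : IncomingLeaders) {
    if (V.empty())
      continue; // skip undef operands
    if (!Common)
      Common = V;
    else if (*Common != V)
      return std::nullopt; // genuinely multivalued phi
  }
  return Common;
}

int main() {
  bool AllSame = simplifyPhiSketch({"x", "", "x"}) ==
                 std::optional<std::string>("x");
  bool Differ = simplifyPhiSketch({"x", "y"}) == std::nullopt;
  return (AllSame && Differ) ? 0 : 1;
}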
-
-const Expression *
-NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
- if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
- auto *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
- if (WO && EI->getNumIndices() == 1 && *EI->idx_begin() == 0)
- // EI is an extract from one of our with.overflow intrinsics. Synthesize
- // a semantically equivalent expression instead of an extract value
- // expression.
- return createBinaryExpression(WO->getBinaryOp(), EI->getType(),
- WO->getLHS(), WO->getRHS(), I);
- }
-
- return createAggregateValueExpression(I);
-}
-
-const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
- assert(isa<CmpInst>(I) && "Expected a cmp instruction.");
-
- auto *CI = cast<CmpInst>(I);
- // See if our operands are equal to those of a previous predicate, and if so,
- // if it implies true or false.
- auto Op0 = lookupOperandLeader(CI->getOperand(0));
- auto Op1 = lookupOperandLeader(CI->getOperand(1));
- auto OurPredicate = CI->getPredicate();
- if (shouldSwapOperands(Op0, Op1)) {
- std::swap(Op0, Op1);
- OurPredicate = CI->getSwappedPredicate();
- }
-
- // Avoid processing the same info twice.
- const PredicateBase *LastPredInfo = nullptr;
- // See if we know something about the comparison itself, like it is the target
- // of an assume.
- auto *CmpPI = PredInfo->getPredicateInfoFor(I);
- if (dyn_cast_or_null<PredicateAssume>(CmpPI))
- return createConstantExpression(ConstantInt::getTrue(CI->getType()));
-
- if (Op0 == Op1) {
- // This condition does not depend on predicates, no need to add users
- if (CI->isTrueWhenEqual())
- return createConstantExpression(ConstantInt::getTrue(CI->getType()));
- else if (CI->isFalseWhenEqual())
- return createConstantExpression(ConstantInt::getFalse(CI->getType()));
- }
-
- // NOTE: Because we are comparing both operands here and below, and using
- // previous comparisons, we rely on fact that predicateinfo knows to mark
- // comparisons that use renamed operands as users of the earlier comparisons.
- // It is *not* enough to just mark predicateinfo renamed operands as users of
- // the earlier comparisons, because the *other* operand may have changed in a
- // previous iteration.
- // Example:
- // icmp slt %a, %b
- // %b.0 = ssa.copy(%b)
- // false branch:
- // icmp slt %c, %b.0
-
- // %c and %a may start out equal, and thus, the code below will say the second
- // icmp is false. %c may become equal to something else, and in that case the
- // second icmp *must* be reexamined, but would not be if only the renamed
- // operands are considered users of the icmp.
-
- // *Currently* we only check one level of comparisons back, and only mark one
- // level back as touched when changes happen. If you modify this code to look
- // back farther through comparisons, you *must* mark the appropriate
- // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
- // we know something just from the operands themselves
-
- // See if our operands have predicate info, so that we may be able to derive
- // something from a previous comparison.
- for (const auto &Op : CI->operands()) {
- auto *PI = PredInfo->getPredicateInfoFor(Op);
- if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
- if (PI == LastPredInfo)
- continue;
- LastPredInfo = PI;
- // In phi of ops cases, we may have predicate info that we are evaluating
- // in a different context.
- if (!DT->dominates(PBranch->To, getBlockForValue(I)))
- continue;
- // TODO: Along the false edge, we may know more things too, like icmp of
- // same operands is false.
- // TODO: We only handle actual comparison conditions below, not
- // and/or.
- auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
- if (!BranchCond)
- continue;
- auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
- auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
- auto BranchPredicate = BranchCond->getPredicate();
- if (shouldSwapOperands(BranchOp0, BranchOp1)) {
- std::swap(BranchOp0, BranchOp1);
- BranchPredicate = BranchCond->getSwappedPredicate();
- }
- if (BranchOp0 == Op0 && BranchOp1 == Op1) {
- if (PBranch->TrueEdge) {
- // If we know the previous predicate is true and we are in the true
- // edge then we may be implied true or false.
- if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
- OurPredicate)) {
- addPredicateUsers(PI, I);
- return createConstantExpression(
- ConstantInt::getTrue(CI->getType()));
- }
-
- if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
- OurPredicate)) {
- addPredicateUsers(PI, I);
- return createConstantExpression(
- ConstantInt::getFalse(CI->getType()));
- }
- } else {
- // Just handle the ne and eq cases, where if we have the same
- // operands, we may know something.
- if (BranchPredicate == OurPredicate) {
- addPredicateUsers(PI, I);
- // Same predicate, same ops, we know it was false, so this is false.
- return createConstantExpression(
- ConstantInt::getFalse(CI->getType()));
- } else if (BranchPredicate ==
- CmpInst::getInversePredicate(OurPredicate)) {
- addPredicateUsers(PI, I);
- // Inverse predicate, we know the other was false, so this is true.
- return createConstantExpression(
- ConstantInt::getTrue(CI->getType()));
- }
- }
- }
- }
- }
- // createExpression will take care of SimplifyCmpInst.
- return createExpression(I);
-}
-
-// Substitute and symbolize the value before value numbering.
-const Expression *
-NewGVN::performSymbolicEvaluation(Value *V,
- SmallPtrSetImpl<Value *> &Visited) const {
- const Expression *E = nullptr;
- if (auto *C = dyn_cast<Constant>(V))
- E = createConstantExpression(C);
- else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
- E = createVariableExpression(V);
- } else {
- // TODO: memory intrinsics.
- // TODO: Some day, we should do the forward propagation and reassociation
- // parts of the algorithm.
- auto *I = cast<Instruction>(V);
- switch (I->getOpcode()) {
- case Instruction::ExtractValue:
- case Instruction::InsertValue:
- E = performSymbolicAggrValueEvaluation(I);
- break;
- case Instruction::PHI: {
- SmallVector<ValPair, 3> Ops;
- auto *PN = cast<PHINode>(I);
- for (unsigned i = 0; i < PN->getNumOperands(); ++i)
- Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)});
- // Sort to ensure the invariant createPHIExpression requires is met.
- sortPHIOps(Ops);
- E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
- } break;
- case Instruction::Call:
- E = performSymbolicCallEvaluation(I);
- break;
- case Instruction::Store:
- E = performSymbolicStoreEvaluation(I);
- break;
- case Instruction::Load:
- E = performSymbolicLoadEvaluation(I);
- break;
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- E = createExpression(I);
- break;
- case Instruction::ICmp:
- case Instruction::FCmp:
- E = performSymbolicCmpEvaluation(I);
- break;
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::Select:
- case Instruction::ExtractElement:
- case Instruction::InsertElement:
- case Instruction::GetElementPtr:
- E = createExpression(I);
- break;
- case Instruction::ShuffleVector:
- // FIXME: Add support for shufflevector to createExpression.
- return nullptr;
- default:
- return nullptr;
- }
- }
- return E;
-}
-
-// Look up a container of values/instructions in a map, and touch all the
-// instructions in the container. Then erase value from the map.
-template <typename Map, typename KeyType>
-void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
- const auto Result = M.find_as(Key);
- if (Result != M.end()) {
- for (const typename Map::mapped_type::value_type Mapped : Result->second)
- TouchedInstructions.set(InstrToDFSNum(Mapped));
- M.erase(Result);
- }
-}
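touchAndErase is the re-propagation hook: when a value changes congruence class, every instruction recorded as depending on it has its DFS slot set in TouchedInstructions so the main loop revisits it, and the dependency entry is then dropped. A small sketch of that pattern with a plain user map and a boolean touched vector (names are illustrative, not NewGVN's):

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

static std::unordered_map<std::string, std::unordered_set<unsigned>> UsersOf;
static std::vector<bool> TouchedSketch(16, false); // indexed by DFS number

// Record that the instruction with DFS number UserDFS depends on value V.
static void addUserSketch(const std::string &V, unsigned UserDFS) {
  UsersOf[V].insert(UserDFS);
}

// When V changes congruence class, mark every recorded user for revisiting
// and drop the entry, the same shape as touchAndErase above.
static void touchUsersAndEraseSketch(const std::string &V) {
  auto It = UsersOf.find(V);
  if (It == UsersOf.end())
    return;
  for (unsigned DFS : It->second)
    TouchedSketch[DFS] = true;
  UsersOf.erase(It);
}

int main() {
  addUserSketch("a", 3);
  addUserSketch("a", 7);
  touchUsersAndEraseSketch("a");
  return (TouchedSketch[3] && TouchedSketch[7] && !UsersOf.count("a")) ? 0 : 1;
}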
-
-void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
- assert(User && To != User);
- if (isa<Instruction>(To))
- AdditionalUsers[To].insert(User);
-}
-
-void NewGVN::markUsersTouched(Value *V) {
- // Now mark the users as touched.
- for (auto *User : V->users()) {
- assert(isa<Instruction>(User) && "Use of value not within an instruction?");
- TouchedInstructions.set(InstrToDFSNum(User));
- }
- touchAndErase(AdditionalUsers, V);
-}
-
-void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
- LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
- MemoryToUsers[To].insert(U);
-}
-
-void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
- TouchedInstructions.set(MemoryToDFSNum(MA));
-}
-
-void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
- if (isa<MemoryUse>(MA))
- return;
- for (auto U : MA->users())
- TouchedInstructions.set(MemoryToDFSNum(U));
- touchAndErase(MemoryToUsers, MA);
-}
-
-// Add I to the set of users of a given predicate.
-void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
- // Don't add temporary instructions to the user lists.
- if (AllTempInstructions.count(I))
- return;
-
- if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
- PredicateToUsers[PBranch->Condition].insert(I);
- else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
- PredicateToUsers[PAssume->Condition].insert(I);
-}
-
-// Touch all the predicates that depend on this instruction.
-void NewGVN::markPredicateUsersTouched(Instruction *I) {
- touchAndErase(PredicateToUsers, I);
-}
-
-// Mark users affected by a memory leader change.
-void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : CC->memory())
- markMemoryDefTouched(M);
-}
-
-// Touch the instructions that need to be updated after a congruence class has a
-// leader change, and mark changed values.
-void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : *CC) {
- if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrToDFSNum(I));
- LeaderChanges.insert(M);
- }
-}
-
-// Given a range of things that have instruction DFS numbers, this will return
-// the member of the range with the smallest DFS number.
-template <class T, class Range>
-T *NewGVN::getMinDFSOfRange(const Range &R) const {
- std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
- for (const auto X : R) {
- auto DFSNum = InstrToDFSNum(X);
- if (DFSNum < MinDFS.second)
- MinDFS = {X, DFSNum};
- }
- return MinDFS.first;
-}
-
-// This function returns the MemoryAccess that should be the next leader of
-// congruence class CC, under the assumption that the current leader is going to
-// disappear.
-const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
- // TODO: If this ends up too slow, we can maintain a next memory leader like we
- // do for regular leaders.
- // Make sure there will be a leader to find.
- assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
- if (CC->getStoreCount() > 0) {
- if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
- return getMemoryAccess(NL);
- // Find the store with the minimum DFS number.
- auto *V = getMinDFSOfRange<Value>(make_filter_range(
- *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
- return getMemoryAccess(cast<StoreInst>(V));
- }
- assert(CC->getStoreCount() == 0);
-
- // Given our assertion, hitting this part must mean
- // !OldClass->memory_empty()
- if (CC->memory_size() == 1)
- return *CC->memory_begin();
- return getMinDFSOfRange<const MemoryPhi>(CC->memory());
-}
-
-// This function returns the next value leader of a congruence class, under the
-// assumption that the current leader is going away. This should end up being
-// the next most dominating member.
-Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
- // We don't need to sort members if there is only 1, and we don't care about
- // sorting the TOP class because everything either gets out of it or is
- // unreachable.
-
- if (CC->size() == 1 || CC == TOPClass) {
- return *(CC->begin());
- } else if (CC->getNextLeader().first) {
- ++NumGVNAvoidedSortedLeaderChanges;
- return CC->getNextLeader().first;
- } else {
- ++NumGVNSortedLeaderChanges;
- // NOTE: If this ends up too slow, we can maintain a dual structure for
- // member testing/insertion, or keep things mostly sorted, and sort only
- // here, or use SparseBitVector or ....
- return getMinDFSOfRange<Value>(*CC);
- }
-}
-
-// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
-// the memory members, etc for the move.
-//
-// The invariants of this function are:
-//
-// - I must be moving to NewClass from OldClass
-// - The StoreCount of OldClass and NewClass is expected to have been updated
-// for I already if it is a store.
-// - The OldClass memory leader has not been updated yet if I was the leader.
-void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
- MemoryAccess *InstMA,
- CongruenceClass *OldClass,
- CongruenceClass *NewClass) {
- // If the leader is I, and we had a representative MemoryAccess, it should
- // be the MemoryAccess of OldClass.
- assert((!InstMA || !OldClass->getMemoryLeader() ||
- OldClass->getLeader() != I ||
- MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
- MemoryAccessToClass.lookup(InstMA)) &&
- "Representative MemoryAccess mismatch");
- // First, see what happens to the new class
- if (!NewClass->getMemoryLeader()) {
- // Should be a new class, or a store becoming a leader of a new class.
- assert(NewClass->size() == 1 ||
- (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
- NewClass->setMemoryLeader(InstMA);
- // Mark it touched if we didn't just create a singleton
- LLVM_DEBUG(dbgs() << "Memory class leader change for class "
- << NewClass->getID()
- << " due to new memory instruction becoming leader\n");
- markMemoryLeaderChangeTouched(NewClass);
- }
- setMemoryClass(InstMA, NewClass);
- // Now, fixup the old class if necessary
- if (OldClass->getMemoryLeader() == InstMA) {
- if (!OldClass->definesNoMemory()) {
- OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- LLVM_DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of old leader " << *InstMA << "\n");
- markMemoryLeaderChangeTouched(OldClass);
- } else
- OldClass->setMemoryLeader(nullptr);
- }
-}
-
-// Move a value, currently in OldClass, to be part of NewClass
-// Update OldClass and NewClass for the move (including changing leaders, etc).
-void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
- CongruenceClass *OldClass,
- CongruenceClass *NewClass) {
- if (I == OldClass->getNextLeader().first)
- OldClass->resetNextLeader();
-
- OldClass->erase(I);
- NewClass->insert(I);
-
- if (NewClass->getLeader() != I)
- NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
- // Handle our special casing of stores.
- if (auto *SI = dyn_cast<StoreInst>(I)) {
- OldClass->decStoreCount();
- // Okay, so when do we want to make a store a leader of a class?
- // If we have a store defined by an earlier load, we want the earlier load
- // to lead the class.
- // If we have a store defined by something else, we want the store to lead
- // the class so everything else gets the "something else" as a value.
- // If we have a store as the single member of the class, we want the store
- // as the leader
- if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
- // If it's a store expression we are using, it means we are not equivalent
- // to something earlier.
- if (auto *SE = dyn_cast<StoreExpression>(E)) {
- NewClass->setStoredValue(SE->getStoredValue());
- markValueLeaderChangeTouched(NewClass);
- // Shift the new class leader to be the store
- LLVM_DEBUG(dbgs() << "Changing leader of congruence class "
- << NewClass->getID() << " from "
- << *NewClass->getLeader() << " to " << *SI
- << " because store joined class\n");
- // If we changed the leader, we have to mark it changed because we don't
- // know what it will do to symbolic evaluation.
- NewClass->setLeader(SI);
- }
- // We rely on the code below handling the MemoryAccess change.
- }
- NewClass->incStoreCount();
- }
- // True if there are no memory instructions left in a class that had memory
- // instructions before.
-
- // If it's not a memory use, set the MemoryAccess equivalence
- auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
- if (InstMA)
- moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
- ValueToClass[I] = NewClass;
- // See if we destroyed the class or need to swap leaders.
- if (OldClass->empty() && OldClass != TOPClass) {
- if (OldClass->getDefiningExpr()) {
- LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
- << " from table\n");
- // We erase it as an exact expression to make sure we don't just erase an
- // equivalent one.
- auto Iter = ExpressionToClass.find_as(
- ExactEqualsExpression(*OldClass->getDefiningExpr()));
- if (Iter != ExpressionToClass.end())
- ExpressionToClass.erase(Iter);
-#ifdef EXPENSIVE_CHECKS
- assert(
- (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
- "We erased the expression we just inserted, which should not happen");
-#endif
- }
- } else if (OldClass->getLeader() == I) {
- // When the leader changes, the value numbering of
- // everything may change due to symbolization changes, so we need to
- // reprocess.
- LLVM_DEBUG(dbgs() << "Value class leader change for class "
- << OldClass->getID() << "\n");
- ++NumGVNLeaderChanges;
- // Destroy the stored value if there are no more stores to represent it.
- // Note that this is basically clean up for the expression removal that
- // happens below. If we remove stores from a class, we may leave it as a
- // class of equivalent memory phis.
- if (OldClass->getStoreCount() == 0) {
- if (OldClass->getStoredValue())
- OldClass->setStoredValue(nullptr);
- }
- OldClass->setLeader(getNextValueLeader(OldClass));
- OldClass->resetNextLeader();
- markValueLeaderChangeTouched(OldClass);
- }
-}
-
-// For a given expression, mark the phi of ops instructions that could have
-// changed as a result.
-void NewGVN::markPhiOfOpsChanged(const Expression *E) {
- touchAndErase(ExpressionToPhiOfOps, E);
-}
-
-// Perform congruence finding on a given value numbering expression.
-void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
- // This is guaranteed to return something, since it will at least find
- // TOP.
-
- CongruenceClass *IClass = ValueToClass.lookup(I);
- assert(IClass && "Should have found an IClass");
- // Dead classes should have been eliminated from the mapping.
- assert(!IClass->isDead() && "Found a dead class");
-
- CongruenceClass *EClass = nullptr;
- if (const auto *VE = dyn_cast<VariableExpression>(E)) {
- EClass = ValueToClass.lookup(VE->getVariableValue());
- } else if (isa<DeadExpression>(E)) {
- EClass = TOPClass;
- }
- if (!EClass) {
- auto lookupResult = ExpressionToClass.insert({E, nullptr});
-
- // If it's not in the value table, create a new congruence class.
- if (lookupResult.second) {
- CongruenceClass *NewClass = createCongruenceClass(nullptr, E);
- auto place = lookupResult.first;
- place->second = NewClass;
-
- // Constants and variables should always be made the leader.
- if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- NewClass->setLeader(CE->getConstantValue());
- } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
- StoreInst *SI = SE->getStoreInst();
- NewClass->setLeader(SI);
- NewClass->setStoredValue(SE->getStoredValue());
- // The RepMemoryAccess field will be filled in properly by the
- // moveValueToNewCongruenceClass call.
- } else {
- NewClass->setLeader(I);
- }
- assert(!isa<VariableExpression>(E) &&
- "VariableExpression should have been handled already");
-
- EClass = NewClass;
- LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at "
- << NewClass->getID() << " and leader "
- << *(NewClass->getLeader()));
- if (NewClass->getStoredValue())
- LLVM_DEBUG(dbgs() << " and stored value "
- << *(NewClass->getStoredValue()));
- LLVM_DEBUG(dbgs() << "\n");
- } else {
- EClass = lookupResult.first->second;
- if (isa<ConstantExpression>(E))
- assert((isa<Constant>(EClass->getLeader()) ||
- (EClass->getStoredValue() &&
- isa<Constant>(EClass->getStoredValue()))) &&
- "Any class with a constant expression should have a "
- "constant leader");
-
- assert(EClass && "Somehow don't have an eclass");
-
- assert(!EClass->isDead() && "We accidentally looked up a dead class");
- }
- }
- bool ClassChanged = IClass != EClass;
- bool LeaderChanged = LeaderChanges.erase(I);
- if (ClassChanged || LeaderChanged) {
- LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression "
- << *E << "\n");
- if (ClassChanged) {
- moveValueToNewCongruenceClass(I, E, IClass, EClass);
- markPhiOfOpsChanged(E);
- }
-
- markUsersTouched(I);
- if (MemoryAccess *MA = getMemoryAccess(I))
- markMemoryUsersTouched(MA);
- if (auto *CI = dyn_cast<CmpInst>(I))
- markPredicateUsersTouched(CI);
- }
- // If we changed the class of the store, we want to ensure nothing finds the
- // old store expression. In particular, loads do not compare against stored
- // value, so they will find old store expressions (and associated class
- // mappings) if we leave them in the table.
- if (ClassChanged && isa<StoreInst>(I)) {
- auto *OldE = ValueToExpression.lookup(I);
- // It could just be that the old class died. We don't want to erase it if we
- // just moved classes.
- if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
- // Erase this as an exact expression to ensure we don't erase expressions
- // equivalent to it.
- auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
- if (Iter != ExpressionToClass.end())
- ExpressionToClass.erase(Iter);
- }
- }
- ValueToExpression[I] = E;
-}
-
-// Process the fact that Edge (from, to) is reachable, including marking
-// any newly reachable blocks and instructions for processing.
-void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
- // Check if the Edge was reachable before.
- if (ReachableEdges.insert({From, To}).second) {
- // If this block wasn't reachable before, all instructions are touched.
- if (ReachableBlocks.insert(To).second) {
- LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
- << " marked reachable\n");
- const auto &InstRange = BlockInstRange.lookup(To);
- TouchedInstructions.set(InstRange.first, InstRange.second);
- } else {
- LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
- << " was reachable, but new edge {"
- << getBlockName(From) << "," << getBlockName(To)
- << "} to it found\n");
-
- // We've made an edge reachable to an existing block, which may
- // impact predicates. Otherwise, only mark the phi nodes as touched, as
- // they are the only things that depend on new edges. Anything using their
- // values will get propagated to if necessary.
- if (MemoryAccess *MemPhi = getMemoryAccess(To))
- TouchedInstructions.set(InstrToDFSNum(MemPhi));
-
- // FIXME: We should just add a union op on a Bitvector and
- // SparseBitVector. We can do it word by word faster than we are doing it
- // here.
- for (auto InstNum : RevisitOnReachabilityChange[To])
- TouchedInstructions.set(InstNum);
- }
- }
-}
-
-// Given a predicate condition (from a switch, cmp, or whatever) and a block,
-// see if we know some constant value for it already.
-Value *NewGVN::findConditionEquivalence(Value *Cond) const {
- auto Result = lookupOperandLeader(Cond);
- return isa<Constant>(Result) ? Result : nullptr;
-}
-
-// Process the outgoing edges of a block for reachability.
-void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
- // Evaluate reachability of terminator instruction.
- Value *Cond;
- BasicBlock *TrueSucc, *FalseSucc;
- if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) {
- Value *CondEvaluated = findConditionEquivalence(Cond);
- if (!CondEvaluated) {
- if (auto *I = dyn_cast<Instruction>(Cond)) {
- const Expression *E = createExpression(I);
- if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- CondEvaluated = CE->getConstantValue();
- }
- } else if (isa<ConstantInt>(Cond)) {
- CondEvaluated = Cond;
- }
- }
- ConstantInt *CI;
- if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
- if (CI->isOne()) {
- LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to true\n");
- updateReachableEdge(B, TrueSucc);
- } else if (CI->isZero()) {
- LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to false\n");
- updateReachableEdge(B, FalseSucc);
- }
- } else {
- updateReachableEdge(B, TrueSucc);
- updateReachableEdge(B, FalseSucc);
- }
- } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- // For switches, propagate the case values into the case
- // destinations.
-
- Value *SwitchCond = SI->getCondition();
- Value *CondEvaluated = findConditionEquivalence(SwitchCond);
- // See if we were able to turn this switch statement into a constant.
- if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
- auto *CondVal = cast<ConstantInt>(CondEvaluated);
- // We should be able to get the case value for this.
- auto Case = *SI->findCaseValue(CondVal);
- if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
- // We proved the value is outside of the range of the case.
- // We can't do anything other than mark the default dest as reachable,
- // and go home.
- updateReachableEdge(B, SI->getDefaultDest());
- return;
- }
- // Now get where it goes and mark it reachable.
- BasicBlock *TargetBlock = Case.getCaseSuccessor();
- updateReachableEdge(B, TargetBlock);
- } else {
- for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
- BasicBlock *TargetBlock = SI->getSuccessor(i);
- updateReachableEdge(B, TargetBlock);
- }
- }
- } else {
- // Otherwise this is either unconditional, or a type we have no
- // idea about. Just mark successors as reachable.
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
- BasicBlock *TargetBlock = TI->getSuccessor(i);
- updateReachableEdge(B, TargetBlock);
- }
-
- // This also may be a memory defining terminator, in which case, set it
- // equivalent only to itself.
- //
- auto *MA = getMemoryAccess(TI);
- if (MA && !isa<MemoryUse>(MA)) {
- auto *CC = ensureLeaderOfMemoryClass(MA);
- if (setMemoryClass(MA, CC))
- markMemoryUsersTouched(MA);
- }
- }
-}
-
-// Remove the PHI of Ops PHI for I
-void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
- InstrDFS.erase(PHITemp);
- // It's still a temp instruction. We keep it in the array so it gets erased.
- // However, it's no longer used by I, or in the block
- TempToBlock.erase(PHITemp);
- RealToTemp.erase(I);
- // We don't remove the users from the phi node uses. This wastes a little
- // time, but such is life. We could use two sets to track which were there
- // at the start of NewGVN and which were added, but right now the cost of
- // tracking is more than the cost of checking for more phi of ops.
-}
-
-// Add PHI Op in BB as a PHI of operations version of ExistingValue.
-void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
- Instruction *ExistingValue) {
- InstrDFS[Op] = InstrToDFSNum(ExistingValue);
- AllTempInstructions.insert(Op);
- TempToBlock[Op] = BB;
- RealToTemp[ExistingValue] = Op;
- // Add all users to phi node use, as they are now uses of the phi of ops phis
- // and may themselves be phi of ops.
- for (auto *U : ExistingValue->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- PHINodeUses.insert(UI);
-}
-
-static bool okayForPHIOfOps(const Instruction *I) {
- if (!EnablePhiOfOps)
- return false;
- return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
- isa<LoadInst>(I);
-}
-
-bool NewGVN::OpIsSafeForPHIOfOpsHelper(
- Value *V, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &Visited,
- SmallVectorImpl<Instruction *> &Worklist) {
-
- if (!isa<Instruction>(V))
- return true;
- auto OISIt = OpSafeForPHIOfOps.find(V);
- if (OISIt != OpSafeForPHIOfOps.end())
- return OISIt->second;
-
- // Keep walking until we either dominate the phi block, or hit a phi, or run
- // out of things to check.
- if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
- OpSafeForPHIOfOps.insert({V, true});
- return true;
- }
- // PHI in the same block.
- if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
- OpSafeForPHIOfOps.insert({V, false});
- return false;
- }
-
- auto *OrigI = cast<Instruction>(V);
- for (auto *Op : OrigI->operand_values()) {
- if (!isa<Instruction>(Op))
- continue;
- // Stop now if we find an unsafe operand.
- auto OISIt = OpSafeForPHIOfOps.find(OrigI);
- if (OISIt != OpSafeForPHIOfOps.end()) {
- if (!OISIt->second) {
- OpSafeForPHIOfOps.insert({V, false});
- return false;
- }
- continue;
- }
- if (!Visited.insert(Op).second)
- continue;
- Worklist.push_back(cast<Instruction>(Op));
- }
- return true;
-}
-
-// Return true if this operand will be safe to use for phi of ops.
-//
-// The reason some operands are unsafe is that we are not trying to recursively
-// translate everything back through phi nodes. We actually expect some lookups
-// of expressions to fail. In particular, a lookup may fail when the expression
-// cannot exist in the predecessor. This is true even if the expression, as
-// shown, can be determined to be constant.
-bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &Visited) {
- SmallVector<Instruction *, 4> Worklist;
- if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
- return false;
- while (!Worklist.empty()) {
- auto *I = Worklist.pop_back_val();
- if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
- return false;
- }
- OpSafeForPHIOfOps.insert({V, true});
- return true;
-}
-
-// Try to find a leader for instruction TransInst, which is a phi translated
-// version of something in our original program. Visited is used to ensure we
-// don't infinite loop during translations of cycles. OrigInst is the
-// instruction in the original program, and PredBB is the predecessor we
-// translated it through.
-Value *NewGVN::findLeaderForInst(Instruction *TransInst,
- SmallPtrSetImpl<Value *> &Visited,
- MemoryAccess *MemAccess, Instruction *OrigInst,
- BasicBlock *PredBB) {
- unsigned IDFSNum = InstrToDFSNum(OrigInst);
- // Make sure it's marked as a temporary instruction.
- AllTempInstructions.insert(TransInst);
- // and make sure anything that tries to add its DFS number is
- // redirected to the instruction we are making a phi of ops
- // for.
- TempToBlock.insert({TransInst, PredBB});
- InstrDFS.insert({TransInst, IDFSNum});
-
- const Expression *E = performSymbolicEvaluation(TransInst, Visited);
- InstrDFS.erase(TransInst);
- AllTempInstructions.erase(TransInst);
- TempToBlock.erase(TransInst);
- if (MemAccess)
- TempToMemory.erase(TransInst);
- if (!E)
- return nullptr;
- auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
- if (!FoundVal) {
- ExpressionToPhiOfOps[E].insert(OrigInst);
- LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
- << " in block " << getBlockName(PredBB) << "\n");
- return nullptr;
- }
- if (auto *SI = dyn_cast<StoreInst>(FoundVal))
- FoundVal = SI->getValueOperand();
- return FoundVal;
-}
-
-// When we see an instruction that is an op of phis, generate the equivalent phi
-// of ops form.
-const Expression *
-NewGVN::makePossiblePHIOfOps(Instruction *I,
- SmallPtrSetImpl<Value *> &Visited) {
- if (!okayForPHIOfOps(I))
- return nullptr;
-
- if (!Visited.insert(I).second)
- return nullptr;
- // For now, we require the instruction be cycle free because we don't
- // *always* create a phi of ops for instructions that could be done as phi
- // of ops, we only do it if we think it is useful. If we did do it all the
- // time, we could remove the cycle free check.
- if (!isCycleFree(I))
- return nullptr;
-
- SmallPtrSet<const Value *, 8> ProcessedPHIs;
- // TODO: We don't do phi translation on memory accesses because it's
- // complicated. For a load, we'd need to be able to simulate a new memoryuse,
- // which we don't have a good way of doing ATM.
- auto *MemAccess = getMemoryAccess(I);
- // If the memory operation is defined by a memory operation in this block that
- // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
- // can't help, as it would still be killed by that memory operation.
- if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
- MemAccess->getDefiningAccess()->getBlock() == I->getParent())
- return nullptr;
-
- // Convert op of phis to phi of ops
- SmallPtrSet<const Value *, 10> VisitedOps;
- SmallVector<Value *, 4> Ops(I->operand_values());
- BasicBlock *SamePHIBlock = nullptr;
- PHINode *OpPHI = nullptr;
- if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
- return nullptr;
- for (auto *Op : Ops) {
- if (!isa<PHINode>(Op)) {
- auto *ValuePHI = RealToTemp.lookup(Op);
- if (!ValuePHI)
- continue;
- LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n");
- Op = ValuePHI;
- }
- OpPHI = cast<PHINode>(Op);
- if (!SamePHIBlock) {
- SamePHIBlock = getBlockForValue(OpPHI);
- } else if (SamePHIBlock != getBlockForValue(OpPHI)) {
- LLVM_DEBUG(
- dbgs()
- << "PHIs for operands are not all in the same block, aborting\n");
- return nullptr;
- }
- // No point in doing this for one-operand phis.
- if (OpPHI->getNumOperands() == 1) {
- OpPHI = nullptr;
- continue;
- }
- }
-
- if (!OpPHI)
- return nullptr;
-
- SmallVector<ValPair, 4> PHIOps;
- SmallPtrSet<Value *, 4> Deps;
- auto *PHIBlock = getBlockForValue(OpPHI);
- RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
- for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
- auto *PredBB = OpPHI->getIncomingBlock(PredNum);
- Value *FoundVal = nullptr;
- SmallPtrSet<Value *, 4> CurrentDeps;
- // We could just skip unreachable edges entirely but it's tricky to do
- // with rewriting existing phi nodes.
- if (ReachableEdges.count({PredBB, PHIBlock})) {
- // Clone the instruction, create an expression from it that is
- // translated back into the predecessor, and see if we have a leader.
- Instruction *ValueOp = I->clone();
- if (MemAccess)
- TempToMemory.insert({ValueOp, MemAccess});
- bool SafeForPHIOfOps = true;
- VisitedOps.clear();
- for (auto &Op : ValueOp->operands()) {
- auto *OrigOp = &*Op;
- // When these operands change, that could change whether there is a
- // leader for us or not, so we have to add additional users.
- if (isa<PHINode>(Op)) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- if (Op != OrigOp && Op != I)
- CurrentDeps.insert(Op);
- } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
- if (getBlockForValue(ValuePHI) == PHIBlock)
- Op = ValuePHI->getIncomingValueForBlock(PredBB);
- }
- // If we phi-translated the op, it must be safe.
- SafeForPHIOfOps =
- SafeForPHIOfOps &&
- (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
- }
- // FIXME: For those things that are not safe we could generate
- // expressions all the way down, and see if this comes out to a
- // constant. For anything where that is true, and unsafe, we should
- // have made a phi-of-ops (or value numbered it equivalent to something)
- // for the pieces already.
- FoundVal = !SafeForPHIOfOps ? nullptr
- : findLeaderForInst(ValueOp, Visited,
- MemAccess, I, PredBB);
- ValueOp->deleteValue();
- if (!FoundVal) {
- // We failed to find a leader for the current ValueOp, but this might
- // change in case the translated operands change.
- if (SafeForPHIOfOps)
- for (auto Dep : CurrentDeps)
- addAdditionalUsers(Dep, I);
-
- return nullptr;
- }
- Deps.insert(CurrentDeps.begin(), CurrentDeps.end());
- } else {
- LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
- << getBlockName(PredBB)
- << " because the block is unreachable\n");
- FoundVal = UndefValue::get(I->getType());
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
- }
-
- PHIOps.push_back({FoundVal, PredBB});
- LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
- << getBlockName(PredBB) << "\n");
- }
- for (auto Dep : Deps)
- addAdditionalUsers(Dep, I);
- sortPHIOps(PHIOps);
- auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock);
- if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
- LLVM_DEBUG(
- dbgs()
- << "Not creating real PHI of ops because it simplified to existing "
- "value or constant\n");
- return E;
- }
- auto *ValuePHI = RealToTemp.lookup(I);
- bool NewPHI = false;
- if (!ValuePHI) {
- ValuePHI =
- PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
- addPhiOfOps(ValuePHI, PHIBlock, I);
- NewPHI = true;
- NumGVNPHIOfOpsCreated++;
- }
- if (NewPHI) {
- for (auto PHIOp : PHIOps)
- ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
- } else {
- TempToBlock[ValuePHI] = PHIBlock;
- unsigned int i = 0;
- for (auto PHIOp : PHIOps) {
- ValuePHI->setIncomingValue(i, PHIOp.first);
- ValuePHI->setIncomingBlock(i, PHIOp.second);
- ++i;
- }
- }
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
- LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
- << "\n");
-
- return E;
-}
-
-// The algorithm initially places the values of the routine in the TOP
-// congruence class. The leader of TOP is the undetermined value `undef`.
-// When the algorithm has finished, values still in TOP are unreachable.
-void NewGVN::initializeCongruenceClasses(Function &F) {
- NextCongruenceNum = 0;
-
- // Note that even though we use the live on entry def as a representative
- // MemoryAccess, it is *not* the same as the actual live on entry def. We
- // have no real equivalent to undef for MemoryAccesses, and so we really
- // should be checking whether the MemoryAccess is top if we want to know if it
- // is equivalent to everything. Otherwise, what this really signifies is that
- // the access reaches all the way back to the beginning of the function.
-
- // Initialize all other instructions to be in TOP class.
- TOPClass = createCongruenceClass(nullptr, nullptr);
- TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
- // The live on entry def gets put into its own class
- MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
- createMemoryClass(MSSA->getLiveOnEntryDef());
-
- for (auto DTN : nodes(DT)) {
- BasicBlock *BB = DTN->getBlock();
- // All MemoryAccesses are equivalent to live on entry to start. They must
- // be initialized to something so that initial changes are noticed. For
- // the maximal answer, we initialize them all to be the same as
- // liveOnEntry.
- auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
- if (MemoryBlockDefs)
- for (const auto &Def : *MemoryBlockDefs) {
- MemoryAccessToClass[&Def] = TOPClass;
- auto *MD = dyn_cast<MemoryDef>(&Def);
- // Insert the memory phis into the member list.
- if (!MD) {
- const MemoryPhi *MP = cast<MemoryPhi>(&Def);
- TOPClass->memory_insert(MP);
- MemoryPhiState.insert({MP, MPS_TOP});
- }
-
- if (MD && isa<StoreInst>(MD->getMemoryInst()))
- TOPClass->incStoreCount();
- }
-
- // FIXME: This is trying to discover which instructions are uses of phi
- // nodes. We should move this into one of the myriad of places that walk
- // all the operands already.
- for (auto &I : *BB) {
- if (isa<PHINode>(&I))
- for (auto *U : I.users())
- if (auto *UInst = dyn_cast<Instruction>(U))
- if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
- PHINodeUses.insert(UInst);
- // Don't insert void terminators into the class. We don't value number
- // them, and they just end up sitting in TOP.
- if (I.isTerminator() && I.getType()->isVoidTy())
- continue;
- TOPClass->insert(&I);
- ValueToClass[&I] = TOPClass;
- }
- }
-
- // Initialize arguments to be in their own unique congruence classes
- for (auto &FA : F.args())
- createSingletonCongruenceClass(&FA);
-}
-
-void NewGVN::cleanupTables() {
- for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
- << " has " << CongruenceClasses[i]->size()
- << " members\n");
- // Make sure we delete the congruence class (probably worth switching to
- // a unique_ptr at some point).
- delete CongruenceClasses[i];
- CongruenceClasses[i] = nullptr;
- }
-
- // Destroy the value expressions
- SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
- AllTempInstructions.end());
- AllTempInstructions.clear();
-
- // We have to drop all references for everything first, so there are no uses
- // left as we delete them.
- for (auto *I : TempInst) {
- I->dropAllReferences();
- }
-
- while (!TempInst.empty()) {
+ }
+
+ return nullptr;
+}
+
+// Evaluate read only and pure calls, and create an expression result.
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
+ auto *CI = cast<CallInst>(I);
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ // Intrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
+ if (auto *MA = MSSA->getMemoryAccess(CI)) {
+ auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA);
+ return createCallExpression(CI, DefiningAccess);
+ } else // MSSA determined that CI does not access memory.
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ }
+ return nullptr;
+}
+
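+// [Editorial sketch, not part of the NewGVN sources] A minimal illustration of
+// why the reaching memory state belongs in a call's value number only when the
+// call reads memory, as in performSymbolicCallEvaluation above. The ToyCallKey
+// tuple and the "memory version" counter are invented for this example; the
+// real pass uses CallExpression and MemorySSA instead.
+#include <string>
+#include <tuple>
+#include <vector>
+
+using ToyCallKey = std::tuple<std::string, std::vector<int>, unsigned>;
+
+static ToyCallKey toyCallKey(const std::string &Callee,
+                             const std::vector<int> &ArgLeaders,
+                             bool ReadsMemory, unsigned MemoryVersion) {
+  // Calls that never touch memory all share one "don't care" memory state,
+  // mirroring the use of the TOP memory leader above; read-only calls must
+  // also agree on the memory state that reaches them to share a key.
+  return {Callee, ArgLeaders, ReadsMemory ? MemoryVersion : 0u};
+}
+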
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+ auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From is equal to To,
+// and return true if this is different from what already existed in the table.
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ LLVM_DEBUG(dbgs() << "Setting " << *From);
+ LLVM_DEBUG(dbgs() << " equivalent to congruence class ");
+ LLVM_DEBUG(dbgs() << NewClass->getID()
+ << " with current MemoryAccess leader ");
+ LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+
+ auto LookupResult = MemoryAccessToClass.find(From);
+ bool Changed = false;
+ // If it's already in the table, see if the value changed.
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+ // This may have killed the class if it had no non-memory members
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
+ // It wasn't equivalent before, and now it is.
+ LookupResult->second = NewClass;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+// Determine if an instruction is cycle-free. That means the values in the
+// instruction don't depend on any expressions that can change value as a result
+// of the instruction. For example, a non-cycle free instruction would be v =
+// phi(0, v+1).
+bool NewGVN::isCycleFree(const Instruction *I) const {
+ // In order to compute cycle-freeness, we do SCC finding on the instruction,
+ // and see what kind of SCC it ends up in. If it is a singleton, it is
+ // cycle-free. If it is not in a singleton, it is only cycle free if the
+ // other members are all phi nodes (as they do not compute anything, they are
+ // copies).
+ auto ICS = InstCycleState.lookup(I);
+ if (ICS == ICS_Unknown) {
+ SCCFinder.Start(I);
+ auto &SCC = SCCFinder.getComponentFor(I);
+ // It's cycle free if it's size 1 or the SCC is *only* phi nodes.
+ if (SCC.size() == 1)
+ InstCycleState.insert({I, ICS_CycleFree});
+ else {
+ bool AllPhis = llvm::all_of(SCC, [](const Value *V) {
+ return isa<PHINode>(V) || isCopyOfAPHI(V);
+ });
+ ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
+ for (auto *Member : SCC)
+ if (auto *MemberPhi = dyn_cast<PHINode>(Member))
+ InstCycleState.insert({MemberPhi, ICS});
+ }
+ }
+ if (ICS == ICS_Cycle)
+ return false;
+ return true;
+}
+
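+// [Editorial sketch, not part of the NewGVN sources] The cycle-freeness notion
+// above, restated over a toy def-use graph of integer value ids: a value is
+// cycle-free if it cannot reach itself through its operands. The ToyGraph
+// encoding and names are invented; the real pass uses an SCC finder over
+// instructions and also treats all-phi SCCs as cycle-free.
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using ToyGraph = std::unordered_map<int, std::vector<int>>; // value -> operands
+
+static bool toyReaches(const ToyGraph &G, int From, int Target,
+                       std::unordered_set<int> &Seen) {
+  if (!Seen.insert(From).second)
+    return false;                      // already explored this value
+  auto It = G.find(From);
+  if (It == G.end())
+    return false;                      // no operands (constant or argument)
+  for (int Op : It->second)
+    if (Op == Target || toyReaches(G, Op, Target, Seen))
+      return true;
+  return false;
+}
+
+// v = phi(0, v + 1) is *not* cycle-free: a graph {1 -> 2, 2 -> 1} models v
+// feeding the add that feeds v. A straight-line chain would return true.
+static bool toyIsCycleFree(const ToyGraph &G, int V) {
+  std::unordered_set<int> Seen;
+  return !toyReaches(G, V, V, Seen);
+}
+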
+// Evaluate PHI nodes symbolically and create an expression result.
+const Expression *
+NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
+ Instruction *I,
+ BasicBlock *PHIBlock) const {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+ // All constant tracks the state of whether all the *original* phi operands
+ // were constant. This is really shorthand for "this phi cannot cycle due to
+ // forward propagation", since then any change in the value of the phi is
+ // guaranteed not to later change the value of the phi.
+ // IE it can't be v = phi(undef, v+1)
+ bool OriginalOpsConstant = true;
+ auto *E = cast<PHIExpression>(createPHIExpression(
+ PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant));
+ // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+ // See if all arguments are the same.
+ // We track if any were undef because they need special handling.
+ bool HasUndef = false;
+ auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+ if (isa<UndefValue>(Arg)) {
+ HasUndef = true;
+ return false;
+ }
+ return true;
+ });
+ // If we are left with no operands, it's dead.
+ if (Filtered.empty()) {
+ // If it has undef at this point, it means there are no non-undef arguments,
+ // and thus, the value of the phi node must be undef.
+ if (HasUndef) {
+ LLVM_DEBUG(
+ dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
+ return createConstantExpression(UndefValue::get(I->getType()));
+ }
+
+ LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ deleteExpression(E);
+ return createDeadExpression();
+ }
+ Value *AllSameValue = *(Filtered.begin());
+ ++Filtered.begin();
+ // Can't use std::equal here, sadly, because filter.begin moves.
+ if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) {
+ // In LLVM's non-standard representation of phi nodes, it's possible to have
+ // phi nodes with cycles (IE dependent on other phis that are .... dependent
+ // on the original phi node), especially in weird CFGs where some arguments
+ // are unreachable, or uninitialized along certain paths. This can cause
+ // infinite loops during evaluation. We work around this by not trying to
+ // really evaluate them independently, but instead using a variable
+ // expression to say if one is equivalent to the other.
+ // We also special case undef, so that if we have an undef, we can't use the
+ // common value unless it dominates the phi block.
+ if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, it also must be cycle free.
+ if (HasBackedge && !OriginalOpsConstant &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
+ return E;
+
+ // Only have to check for instructions
+ if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+ if (!someEquivalentDominates(AllSameInst, I))
+ return E;
+ }
+ // Can't simplify to something that comes later in the iteration.
+ // Otherwise, when and if it changes congruence class, we will never catch
+ // up. We will always be a class behind it.
+ if (isa<Instruction>(AllSameValue) &&
+ InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
+ return E;
+ NumGVNPhisAllSame++;
+ LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
+ }
+ return E;
+}
+
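+// [Editorial sketch, not part of the NewGVN sources] The core of the phi
+// simplification above on toy data: drop undef incoming values, and if every
+// remaining operand is the same value, the phi can be valued as that value.
+// std::optional<int> stands in for a possibly-undef incoming value; the
+// dominance and cycle-freeness checks the real code performs are omitted.
+#include <optional>
+#include <vector>
+
+enum class ToyPhiFold { AllUndef, SingleValue, NoFold };
+
+static ToyPhiFold foldToyPhi(const std::vector<std::optional<int>> &Incoming,
+                             int &FoldedValue) {
+  bool SawValue = false;
+  for (const std::optional<int> &Op : Incoming) {
+    if (!Op)
+      continue;                        // undef operand: filtered out, as above
+    if (!SawValue) {
+      SawValue = true;
+      FoldedValue = *Op;               // first real value becomes the candidate
+    } else if (*Op != FoldedValue) {
+      return ToyPhiFold::NoFold;       // two distinct real values: keep the phi
+    }
+  }
+  return SawValue ? ToyPhiFold::SingleValue : ToyPhiFold::AllUndef;
+}
+
+// foldToyPhi({7, std::nullopt, 7}, V) yields SingleValue with V == 7,
+// mirroring "phi(x, undef, x) == x" subject to the caveats above.
+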
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
+ if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
+ if (WO && EI->getNumIndices() == 1 && *EI->idx_begin() == 0)
+ // EI is an extract from one of our with.overflow intrinsics. Synthesize
+ // a semantically equivalent expression instead of an extract value
+ // expression.
+ return createBinaryExpression(WO->getBinaryOp(), EI->getType(),
+ WO->getLHS(), WO->getRHS(), I);
+ }
+
+ return createAggregateValueExpression(I);
+}
+
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
+ assert(isa<CmpInst>(I) && "Expected a cmp instruction.");
+
+ auto *CI = cast<CmpInst>(I);
+ // See if our operands are equal to those of a previous predicate, and if so,
+ // if it implies true or false.
+ auto Op0 = lookupOperandLeader(CI->getOperand(0));
+ auto Op1 = lookupOperandLeader(CI->getOperand(1));
+ auto OurPredicate = CI->getPredicate();
+ if (shouldSwapOperands(Op0, Op1)) {
+ std::swap(Op0, Op1);
+ OurPredicate = CI->getSwappedPredicate();
+ }
+
+ // Avoid processing the same info twice.
+ const PredicateBase *LastPredInfo = nullptr;
+ // See if we know something about the comparison itself, like it is the target
+ // of an assume.
+ auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+ if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+ if (Op0 == Op1) {
+ // This condition does not depend on predicates, no need to add users
+ if (CI->isTrueWhenEqual())
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+ else if (CI->isFalseWhenEqual())
+ return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+ }
+
+ // NOTE: Because we are comparing both operands here and below, and using
+ // previous comparisons, we rely on the fact that predicateinfo knows to mark
+ // comparisons that use renamed operands as users of the earlier comparisons.
+ // It is *not* enough to just mark predicateinfo renamed operands as users of
+ // the earlier comparisons, because the *other* operand may have changed in a
+ // previous iteration.
+ // Example:
+ // icmp slt %a, %b
+ // %b.0 = ssa.copy(%b)
+ // false branch:
+ // icmp slt %c, %b.0
+
+ // %c and %a may start out equal, and thus, the code below will say the second
+ // %icmp is false. %c may become equal to something else, and in that case the
+ // second icmp *must* be reexamined, but would not be if only the renamed
+ // operands are considered users of the icmp.
+
+ // *Currently* we only check one level of comparisons back, and only mark one
+ // level back as touched when changes happen. If you modify this code to look
+ // back farther through comparisons, you *must* mark the appropriate
+ // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
+ // we know something just from the operands themselves
+
+ // See if our operands have predicate info, so that we may be able to derive
+ // something from a previous comparison.
+ for (const auto &Op : CI->operands()) {
+ auto *PI = PredInfo->getPredicateInfoFor(Op);
+ if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+ if (PI == LastPredInfo)
+ continue;
+ LastPredInfo = PI;
+ // In phi of ops cases, we may have predicate info that we are evaluating
+ // in a different context.
+ if (!DT->dominates(PBranch->To, getBlockForValue(I)))
+ continue;
+ // TODO: Along the false edge, we may know more things too, like
+ // icmp of same operands is false.
+ // TODO: We only handle actual comparison conditions below, not and/or.
+ auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+ if (!BranchCond)
+ continue;
+ auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+ auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+ auto BranchPredicate = BranchCond->getPredicate();
+ if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+ std::swap(BranchOp0, BranchOp1);
+ BranchPredicate = BranchCond->getSwappedPredicate();
+ }
+ if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+ if (PBranch->TrueEdge) {
+ // If we know the previous predicate is true and we are in the true
+ // edge then we may be implied true or false.
+ if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+
+ if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ }
+ } else {
+ // Just handle the ne and eq cases, where if we have the same
+ // operands, we may know something.
+ if (BranchPredicate == OurPredicate) {
+ addPredicateUsers(PI, I);
+ // Same predicate, same ops, we know it was false, so this is false.
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ } else if (BranchPredicate ==
+ CmpInst::getInversePredicate(OurPredicate)) {
+ addPredicateUsers(PI, I);
+ // Inverse predicate, we know the other was false, so this is true.
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+ }
+ }
+ }
+ }
+ // createExpression will take care of simplifyCmpInst.
+ return createExpression(I);
+}
+
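+// [Editorial sketch, not part of the NewGVN sources] The false-edge rule used
+// above, restated over a toy predicate enum: if we reached this block on the
+// *false* edge of "x BranchPred y", a compare of the same operands with the
+// same predicate is false and one with the inverse predicate is true. The
+// enum and inverse table are invented; CmpInst supplies the real predicates.
+#include <optional>
+
+enum class ToyPred { EQ, NE, SLT, SGE };
+
+static ToyPred toyInverse(ToyPred P) {
+  switch (P) {
+  case ToyPred::EQ:  return ToyPred::NE;
+  case ToyPred::NE:  return ToyPred::EQ;
+  case ToyPred::SLT: return ToyPred::SGE;
+  case ToyPred::SGE: return ToyPred::SLT;
+  }
+  return P; // unreachable, keeps compilers quiet
+}
+
+static std::optional<bool> toyFoldOnFalseEdge(ToyPred BranchPred,
+                                              ToyPred OurPred) {
+  if (OurPred == BranchPred)
+    return false;                      // same compare already known false
+  if (OurPred == toyInverse(BranchPred))
+    return true;                       // inverse of a false compare is true
+  return std::nullopt;                 // nothing is implied
+}
+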
+// Substitute and symbolize the value before value numbering.
+const Expression *
+NewGVN::performSymbolicEvaluation(Value *V,
+ SmallPtrSetImpl<Value *> &Visited) const {
+ const Expression *E = nullptr;
+ if (auto *C = dyn_cast<Constant>(V))
+ E = createConstantExpression(C);
+ else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ E = createVariableExpression(V);
+ } else {
+ // TODO: memory intrinsics.
+ // TODO: Some day, we should do the forward propagation and reassociation
+ // parts of the algorithm.
+ auto *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+ E = performSymbolicAggrValueEvaluation(I);
+ break;
+ case Instruction::PHI: {
+ SmallVector<ValPair, 3> Ops;
+ auto *PN = cast<PHINode>(I);
+ for (unsigned i = 0; i < PN->getNumOperands(); ++i)
+ Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)});
+ // Sort to ensure the invariant createPHIExpression requires is met.
+ sortPHIOps(Ops);
+ E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
+ } break;
+ case Instruction::Call:
+ E = performSymbolicCallEvaluation(I);
+ break;
+ case Instruction::Store:
+ E = performSymbolicStoreEvaluation(I);
+ break;
+ case Instruction::Load:
+ E = performSymbolicLoadEvaluation(I);
+ break;
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ E = createExpression(I);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ E = performSymbolicCmpEvaluation(I);
+ break;
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::GetElementPtr:
+ E = createExpression(I);
+ break;
+ case Instruction::ShuffleVector:
+ // FIXME: Add support for shufflevector to createExpression.
+ return nullptr;
+ default:
+ return nullptr;
+ }
+ }
+ return E;
+}
+
+// Look up a container of values/instructions in a map, and touch all the
+// instructions in the container. Then erase the value from the map.
+template <typename Map, typename KeyType>
+void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end()) {
+ for (const typename Map::mapped_type::value_type Mapped : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(Mapped));
+ M.erase(Result);
+ }
+}
+
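+// [Editorial sketch, not part of the NewGVN sources] touchAndErase in
+// miniature: mark every DFS number recorded under a key as touched, then drop
+// the entry, since the dependency list has been consumed. std::map and
+// std::vector<bool> stand in for the pass's DenseMap and SparseBitVector.
+#include <map>
+#include <vector>
+
+static void toyTouchAndErase(std::map<int, std::vector<unsigned>> &M, int Key,
+                             std::vector<bool> &Touched) {
+  auto It = M.find(Key);
+  if (It == M.end())
+    return;
+  for (unsigned DFSNum : It->second) {
+    if (DFSNum >= Touched.size())
+      Touched.resize(DFSNum + 1, false);
+    Touched[DFSNum] = true;            // schedule that instruction for a revisit
+  }
+  M.erase(It);
+}
+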
+void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
+ assert(User && To != User);
+ if (isa<Instruction>(To))
+ AdditionalUsers[To].insert(User);
+}
+
+void NewGVN::markUsersTouched(Value *V) {
+ // Now mark the users as touched.
+ for (auto *User : V->users()) {
+ assert(isa<Instruction>(User) && "Use of value not within an instruction?");
+ TouchedInstructions.set(InstrToDFSNum(User));
+ }
+ touchAndErase(AdditionalUsers, V);
+}
+
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
+ LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+ TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+ if (isa<MemoryUse>(MA))
+ return;
+ for (auto U : MA->users())
+ TouchedInstructions.set(MemoryToDFSNum(U));
+ touchAndErase(MemoryToUsers, MA);
+}
+
+// Add I to the set of users of a given predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
+ // Don't add temporary instructions to the user lists.
+ if (AllTempInstructions.count(I))
+ return;
+
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PBranch->Condition].insert(I);
+ else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
+ PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ touchAndErase(PredicateToUsers, I);
+}
+
+// Mark users affected by a memory leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->memory())
+ markMemoryDefTouched(M);
+}
+
+// Touch the instructions that need to be updated after a congruence class has a
+// leader change, and mark changed values.
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
+ if (auto *I = dyn_cast<Instruction>(M))
+ TouchedInstructions.set(InstrToDFSNum(I));
+ LeaderChanges.insert(M);
+ }
+}
+
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+ std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : R) {
+ auto DFSNum = InstrToDFSNum(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ return MinDFS.first;
+}
+
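+// [Editorial sketch, not part of the NewGVN sources] The same "member with the
+// smallest DFS number" selection via std::min_element. The DFS numbering map
+// is an assumption standing in for InstrToDFSNum, and Members is assumed to be
+// non-empty with every member numbered, matching the helper's preconditions.
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+static int toyGetMinDFS(const std::vector<int> &Members,
+                        const std::unordered_map<int, unsigned> &DFSNum) {
+  return *std::min_element(Members.begin(), Members.end(),
+                           [&](int A, int B) {
+                             return DFSNum.at(A) < DFSNum.at(B);
+                           });
+}
+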
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+ // TODO: If this ends up too slow, we can maintain a next memory leader like we
+ // do for regular leaders.
+ // Make sure there will be a leader to find.
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
+ if (CC->getStoreCount() > 0) {
+ if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+ return getMemoryAccess(NL);
+ // Find the store with the minimum DFS number.
+ auto *V = getMinDFSOfRange<Value>(make_filter_range(
+ *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+ return getMemoryAccess(cast<StoreInst>(V));
+ }
+ assert(CC->getStoreCount() == 0);
+
+ // Given our assertion, hitting this part must mean
+ // !OldClass->memory_empty()
+ if (CC->memory_size() == 1)
+ return *CC->memory_begin();
+ return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away. This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the TOP class because everything either gets out of it or is
+ // unreachable.
+
+ if (CC->size() == 1 || CC == TOPClass) {
+ return *(CC->begin());
+ } else if (CC->getNextLeader().first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ return CC->getNextLeader().first;
+ } else {
+ ++NumGVNSortedLeaderChanges;
+ // NOTE: If this ends up too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or use SparseBitVector or ....
+ return getMinDFSOfRange<Value>(*CC);
+ }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// - I must be moving to NewClass from OldClass
+// - The StoreCount of OldClass and NewClass is expected to have been updated
+// for I already if it is a store.
+// - The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
+ MemoryAccessToClass.lookup(InstMA)) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched if we didn't just create a singleton
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (!OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
+
+// Move a value, currently in OldClass, to be part of NewClass
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+ // as the leader
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ NewClass->setStoredValue(SE->getStoredValue());
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ LLVM_DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from "
+ << *NewClass->getLeader() << " to " << *SI
+ << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
+ }
+ // True if there are no memory instructions left in a class that had memory
+ // instructions before.
+
+ // If it's not a memory use, set the MemoryAccess equivalence
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
+ ValueToClass[I] = NewClass;
+ // See if we destroyed the class or need to swap leaders.
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
+ << " from table\n");
+ // We erase it as an exact expression to make sure we don't just erase an
+ // equivalent one.
+ auto Iter = ExpressionToClass.find_as(
+ ExactEqualsExpression(*OldClass->getDefiningExpr()));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+#ifdef EXPENSIVE_CHECKS
+ assert(
+ (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
+ "We erased the expression we just inserted, which should not happen");
+#endif
+ }
+ } else if (OldClass->getLeader() == I) {
+ // When the leader changes, the value numbering of
+ // everything may change due to symbolization changes, so we need to
+ // reprocess.
+ LLVM_DEBUG(dbgs() << "Value class leader change for class "
+ << OldClass->getID() << "\n");
+ ++NumGVNLeaderChanges;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
+ }
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
+ }
+}
+
+// For a given expression, mark the phi of ops instructions that could have
+// changed as a result.
+void NewGVN::markPhiOfOpsChanged(const Expression *E) {
+ touchAndErase(ExpressionToPhiOfOps, E);
+}
+
+// Perform congruence finding on a given value numbering expression.
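+// The flow is: map E to a congruence class (creating a new class keyed by E if
+// none exists), move I into that class if it differs from I's current one, and
+// touch I's users (plus memory and predicate users) when the class or leader
+// changed.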
+void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
+ // This is guaranteed to return something, since it will at least find
+ // TOP.
+
+ CongruenceClass *IClass = ValueToClass.lookup(I);
+ assert(IClass && "Should have found a IClass");
+ // Dead classes should have been eliminated from the mapping.
+ assert(!IClass->isDead() && "Found a dead class");
+
+ CongruenceClass *EClass = nullptr;
+ if (const auto *VE = dyn_cast<VariableExpression>(E)) {
+ EClass = ValueToClass.lookup(VE->getVariableValue());
+ } else if (isa<DeadExpression>(E)) {
+ EClass = TOPClass;
+ }
+ if (!EClass) {
+ auto lookupResult = ExpressionToClass.insert({E, nullptr});
+
+ // If it's not in the value table, create a new congruence class.
+ if (lookupResult.second) {
+ CongruenceClass *NewClass = createCongruenceClass(nullptr, E);
+ auto place = lookupResult.first;
+ place->second = NewClass;
+
+ // Constants and variables should always be made the leader.
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ NewClass->setLeader(CE->getConstantValue());
+ } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
+ StoreInst *SI = SE->getStoreInst();
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(SE->getStoredValue());
+ // The RepMemoryAccess field will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
+ } else {
+ NewClass->setLeader(I);
+ }
+ assert(!isa<VariableExpression>(E) &&
+ "VariableExpression should have been handled already");
+
+ EClass = NewClass;
+ LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at "
+ << NewClass->getID() << " and leader "
+ << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ LLVM_DEBUG(dbgs() << " and stored value "
+ << *(NewClass->getStoredValue()));
+ LLVM_DEBUG(dbgs() << "\n");
+ } else {
+ EClass = lookupResult.first->second;
+ if (isa<ConstantExpression>(E))
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
+ "Any class with a constant expression should have a "
+ "constant leader");
+
+ assert(EClass && "Somehow don't have an eclass");
+
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
+ }
+ }
+ bool ClassChanged = IClass != EClass;
+ bool LeaderChanged = LeaderChanges.erase(I);
+ if (ClassChanged || LeaderChanged) {
+ LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression "
+ << *E << "\n");
+ if (ClassChanged) {
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
+ markPhiOfOpsChanged(E);
+ }
+
+ markUsersTouched(I);
+ if (MemoryAccess *MA = getMemoryAccess(I))
+ markMemoryUsersTouched(MA);
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
+ }
+ // If we changed the class of the store, we want to ensure nothing finds the
+ // old store expression. In particular, loads do not compare against the stored
+ // value, so they will find old store expressions (and associated class
+ // mappings) if we leave them in the table.
+ if (ClassChanged && isa<StoreInst>(I)) {
+ auto *OldE = ValueToExpression.lookup(I);
+ // It could just be that the old class died. We don't want to erase it if we
+ // just moved classes.
+ if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
+ // Erase this as an exact expression to ensure we don't erase expressions
+ // equivalent to it.
+ auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+ }
+ }
+ ValueToExpression[I] = E;
+}
+
+// Process the fact that Edge (from, to) is reachable, including marking
+// any newly reachable blocks and instructions for processing.
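+// If the destination block becomes reachable for the first time, every
+// instruction in it is touched; if only the edge is new, just the MemoryPhi
+// (if any) and the instructions registered in RevisitOnReachabilityChange are
+// touched.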
+void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
+ // Check if the Edge was reachable before.
+ if (ReachableEdges.insert({From, To}).second) {
+ // If this block wasn't reachable before, all instructions are touched.
+ if (ReachableBlocks.insert(To).second) {
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " marked reachable\n");
+ const auto &InstRange = BlockInstRange.lookup(To);
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ } else {
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {"
+ << getBlockName(From) << "," << getBlockName(To)
+ << "} to it found\n");
+
+ // We've made an edge reachable to an existing block, which may
+ // impact predicates. Otherwise, only mark the phi nodes as touched, as
+ // they are the only things that depend on new edges. Anything using their
+ // values will have changes propagated to it if necessary.
+ if (MemoryAccess *MemPhi = getMemoryAccess(To))
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
+
+ // FIXME: We should just add a union op on a Bitvector and
+ // SparseBitVector. We can do it word by word faster than we are doing it
+ // here.
+ for (auto InstNum : RevisitOnReachabilityChange[To])
+ TouchedInstructions.set(InstNum);
+ }
+ }
+}
+
+// Given a predicate condition (from a switch, cmp, or whatever) and a block,
+// see if we know some constant value for it already.
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
+ return isa<Constant>(Result) ? Result : nullptr;
+}
+
+// Process the outgoing edges of a block for reachability.
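+// Branches and switches whose condition value-numbers to a ConstantInt only
+// get the taken edge marked reachable; everything else conservatively marks
+// every successor edge reachable.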
+void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
+ // Evaluate reachability of terminator instruction.
+ Value *Cond;
+ BasicBlock *TrueSucc, *FalseSucc;
+ if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) {
+ Value *CondEvaluated = findConditionEquivalence(Cond);
+ if (!CondEvaluated) {
+ if (auto *I = dyn_cast<Instruction>(Cond)) {
+ const Expression *E = createExpression(I);
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ CondEvaluated = CE->getConstantValue();
+ }
+ } else if (isa<ConstantInt>(Cond)) {
+ CondEvaluated = Cond;
+ }
+ }
+ ConstantInt *CI;
+ if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
+ if (CI->isOne()) {
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
+ updateReachableEdge(B, TrueSucc);
+ } else if (CI->isZero()) {
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else {
+ updateReachableEdge(B, TrueSucc);
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ // For switches, propagate the case values into the case
+ // destinations.
+
+ Value *SwitchCond = SI->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
+ // See if we were able to turn this switch statement into a constant.
+ if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
+ auto *CondVal = cast<ConstantInt>(CondEvaluated);
+ // We should be able to get the case value for this.
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
+ // We proved the value is outside of the range of the case.
+ // We can't do anything other than mark the default dest as reachable,
+ // and go home.
+ updateReachableEdge(B, SI->getDefaultDest());
+ return;
+ }
+ // Now get where it goes and mark it reachable.
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
+ updateReachableEdge(B, TargetBlock);
+ } else {
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ updateReachableEdge(B, TargetBlock);
+ }
+ }
+ } else {
+ // Otherwise this is either unconditional, or a type we have no
+ // idea about. Just mark successors as reachable.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = TI->getSuccessor(i);
+ updateReachableEdge(B, TargetBlock);
+ }
+
+ // This also may be a memory defining terminator, in which case, set it
+ // equivalent only to itself.
+ //
+ auto *MA = getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
+ }
+}
+
+// Remove the PHI of Ops PHI for I
+void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
+ InstrDFS.erase(PHITemp);
+ // It's still a temp instruction. We keep it in the array so it gets erased.
+ // However, it's no longer used by I, or in the block
+ TempToBlock.erase(PHITemp);
+ RealToTemp.erase(I);
+ // We don't remove the users from the phi node uses. This wastes a little
+ // time, but such is life. We could use two sets to track which were there
+ // at the start of NewGVN, and which were added, but right now the cost of
+ // tracking is more than the cost of checking for more phi of ops.
+}
+
+// Add PHI Op in BB as a PHI of operations version of ExistingValue.
+void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
+ Instruction *ExistingValue) {
+ InstrDFS[Op] = InstrToDFSNum(ExistingValue);
+ AllTempInstructions.insert(Op);
+ TempToBlock[Op] = BB;
+ RealToTemp[ExistingValue] = Op;
+ // Add all users to phi node use, as they are now uses of the phi of ops phis
+ // and may themselves be phi of ops.
+ for (auto *U : ExistingValue->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ PHINodeUses.insert(UI);
+}
+
+static bool okayForPHIOfOps(const Instruction *I) {
+ if (!EnablePhiOfOps)
+ return false;
+ return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
+ isa<LoadInst>(I);
+}
+
+bool NewGVN::OpIsSafeForPHIOfOpsHelper(
+ Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist) {
+
+ if (!isa<Instruction>(V))
+ return true;
+ auto OISIt = OpSafeForPHIOfOps.find(V);
+ if (OISIt != OpSafeForPHIOfOps.end())
+ return OISIt->second;
+
+ // Keep walking until we either dominate the phi block, or hit a phi, or run
+ // out of things to check.
+ if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+ }
+ // PHI in the same block.
+ if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+
+ auto *OrigI = cast<Instruction>(V);
+ for (auto *Op : OrigI->operand_values()) {
+ if (!isa<Instruction>(Op))
+ continue;
+ // Stop now if we find an unsafe operand.
+ auto OISIt = OpSafeForPHIOfOps.find(OrigI);
+ if (OISIt != OpSafeForPHIOfOps.end()) {
+ if (!OISIt->second) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+ continue;
+ }
+ if (!Visited.insert(Op).second)
+ continue;
+ Worklist.push_back(cast<Instruction>(Op));
+ }
+ return true;
+}
+
+// Return true if this operand will be safe to use for phi of ops.
+//
+// The reason some operands are unsafe is that we are not trying to recursively
+// translate everything back through phi nodes. We actually expect some lookups
+// of expressions to fail. In particular, a lookup can fail where the expression
+// cannot exist in the predecessor. This is true even if the expression, as shown, can
+// be determined to be constant.
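+// Concretely, an operand is unsafe when it (or something it transitively
+// depends on) is a PHI in PHIBlock itself, since such a value cannot be
+// translated into the predecessors; operands defined in blocks that properly
+// dominate PHIBlock are always safe.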
+bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ SmallVector<Instruction *, 4> Worklist;
+ if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
+ return false;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+ if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
+ return false;
+ }
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+}
+
+// Try to find a leader for instruction TransInst, which is a phi translated
+// version of something in our original program. Visited is used to ensure we
+// don't infinite loop during translations of cycles. OrigInst is the
+// instruction in the original program, and PredBB is the predecessor we
+// translated it through.
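+// The translated clone is temporarily registered in AllTempInstructions,
+// TempToBlock and InstrDFS so symbolic evaluation treats it like a real
+// instruction; those entries (and any TempToMemory entry) are removed again
+// before returning.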
+Value *NewGVN::findLeaderForInst(Instruction *TransInst,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB) {
+ unsigned IDFSNum = InstrToDFSNum(OrigInst);
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(TransInst);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ TempToBlock.insert({TransInst, PredBB});
+ InstrDFS.insert({TransInst, IDFSNum});
+
+ const Expression *E = performSymbolicEvaluation(TransInst, Visited);
+ InstrDFS.erase(TransInst);
+ AllTempInstructions.erase(TransInst);
+ TempToBlock.erase(TransInst);
+ if (MemAccess)
+ TempToMemory.erase(TransInst);
+ if (!E)
+ return nullptr;
+ auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(OrigInst);
+ LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
+ << " in block " << getBlockName(PredBB) << "\n");
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ return FoundVal;
+}
+
+// When we see an instruction that is an op of phis, generate the equivalent phi
+// of ops form.
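+// That is, for an instruction whose operands are PHIs in some block, we clone
+// it into each reachable predecessor (phi-translating the operands), look up a
+// leader for each clone, and build a new PHI over those leaders.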
+const Expression *
+NewGVN::makePossiblePHIOfOps(Instruction *I,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!okayForPHIOfOps(I))
+ return nullptr;
+
+ if (!Visited.insert(I).second)
+ return nullptr;
+ // For now, we require the instruction be cycle free because we don't
+ // *always* create a phi of ops for instructions that could be done as phi
+ // of ops; we only do it if we think it is useful. If we did do it all the
+ // time, we could remove the cycle free check.
+ if (!isCycleFree(I))
+ return nullptr;
+
+ SmallPtrSet<const Value *, 8> ProcessedPHIs;
+ // TODO: We don't do phi translation on memory accesses because it's
+ // complicated. For a load, we'd need to be able to simulate a new memoryuse,
+ // which we don't have a good way of doing ATM.
+ auto *MemAccess = getMemoryAccess(I);
+ // If the memory operation is defined by a memory operation in this block that
+ // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
+ // can't help, as it would still be killed by that memory operation.
+ if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
+ MemAccess->getDefiningAccess()->getBlock() == I->getParent())
+ return nullptr;
+
+ // Convert op of phis to phi of ops
+ SmallPtrSet<const Value *, 10> VisitedOps;
+ SmallVector<Value *, 4> Ops(I->operand_values());
+ BasicBlock *SamePHIBlock = nullptr;
+ PHINode *OpPHI = nullptr;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ for (auto *Op : Ops) {
+ if (!isa<PHINode>(Op)) {
+ auto *ValuePHI = RealToTemp.lookup(Op);
+ if (!ValuePHI)
+ continue;
+ LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n");
+ Op = ValuePHI;
+ }
+ OpPHI = cast<PHINode>(Op);
+ if (!SamePHIBlock) {
+ SamePHIBlock = getBlockForValue(OpPHI);
+ } else if (SamePHIBlock != getBlockForValue(OpPHI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "PHIs for operands are not all in the same block, aborting\n");
+ return nullptr;
+ }
+ // No point in doing this for one-operand phis.
+ if (OpPHI->getNumOperands() == 1) {
+ OpPHI = nullptr;
+ continue;
+ }
+ }
+
+ if (!OpPHI)
+ return nullptr;
+
+ SmallVector<ValPair, 4> PHIOps;
+ SmallPtrSet<Value *, 4> Deps;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
+ for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
+ auto *PredBB = OpPHI->getIncomingBlock(PredNum);
+ Value *FoundVal = nullptr;
+ SmallPtrSet<Value *, 4> CurrentDeps;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it that is
+ // translated back into the predecessor, and see if we have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
+ for (auto &Op : ValueOp->operands()) {
+ auto *OrigOp = &*Op;
+ // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ CurrentDeps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
+ }
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
+ }
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
+ ValueOp->deleteValue();
+ if (!FoundVal) {
+ // We failed to find a leader for the current ValueOp, but this might
+ // change if the translated operands change.
+ if (SafeForPHIOfOps)
+ for (auto Dep : CurrentDeps)
+ addAdditionalUsers(Dep, I);
+
+ return nullptr;
+ }
+ Deps.insert(CurrentDeps.begin(), CurrentDeps.end());
+ } else {
+ LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ }
+
+ PHIOps.push_back({FoundVal, PredBB});
+ LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(PHIOps);
+ auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
+ return E;
+ }
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : PHIOps)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ TempToBlock[ValuePHI] = PHIBlock;
+ unsigned int i = 0;
+ for (auto PHIOp : PHIOps) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+
+ return E;
+}
+
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
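+// Function arguments are the exception: each one starts in its own singleton
+// congruence class, since nothing can be assumed about their equivalence.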
+void NewGVN::initializeCongruenceClasses(Function &F) {
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is top if we want to know if it
+ // is equivalent to everything. Otherwise, what this really signifies is that
+ // the access reaches all the way back to the beginning of the function.
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class.
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
+ for (auto DTN : nodes(DT)) {
+ BasicBlock *BB = DTN->getBlock();
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
+
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
+ }
+
+ // FIXME: This is trying to discover which instructions are uses of phi
+ // nodes. We should move this into one of the myriad of places that walk
+ // all the operands already.
+ for (auto &I : *BB) {
+ if (isa<PHINode>(&I))
+ for (auto *U : I.users())
+ if (auto *UInst = dyn_cast<Instruction>(U))
+ if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
+ PHINodeUses.insert(UInst);
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (I.isTerminator() && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
+ }
+ }
+
+ // Initialize arguments to be in their own unique congruence classes
+ for (auto &FA : F.args())
+ createSingletonCongruenceClass(&FA);
+}
+
+void NewGVN::cleanupTables() {
+ for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
+ LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size()
+ << " members\n");
+ // Make sure we delete the congruence class (probably worth switching to
+ // a unique_ptr at some point).
+ delete CongruenceClasses[i];
+ CongruenceClasses[i] = nullptr;
+ }
+
+ // Destroy the value expressions
+ SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
+ AllTempInstructions.end());
+ AllTempInstructions.clear();
+
+ // We have to drop all references for everything first, so there are no uses
+ // left as we delete them.
+ for (auto *I : TempInst) {
+ I->dropAllReferences();
+ }
+
+ while (!TempInst.empty()) {
auto *I = TempInst.pop_back_val();
- I->deleteValue();
- }
-
- ValueToClass.clear();
- ArgRecycler.clear(ExpressionAllocator);
- ExpressionAllocator.Reset();
- CongruenceClasses.clear();
- ExpressionToClass.clear();
- ValueToExpression.clear();
- RealToTemp.clear();
- AdditionalUsers.clear();
- ExpressionToPhiOfOps.clear();
- TempToBlock.clear();
- TempToMemory.clear();
- PHINodeUses.clear();
- OpSafeForPHIOfOps.clear();
- ReachableBlocks.clear();
- ReachableEdges.clear();
-#ifndef NDEBUG
- ProcessedCount.clear();
-#endif
- InstrDFS.clear();
- InstructionsToErase.clear();
- DFSToInstr.clear();
- BlockInstRange.clear();
- TouchedInstructions.clear();
- MemoryAccessToClass.clear();
- PredicateToUsers.clear();
- MemoryToUsers.clear();
- RevisitOnReachabilityChange.clear();
-}
-
-// Assign local DFS number mapping to instructions, and leave space for Value
-// PHI's.
-std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
- unsigned Start) {
- unsigned End = Start;
- if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
- InstrDFS[MemPhi] = End++;
- DFSToInstr.emplace_back(MemPhi);
- }
-
- // Then the real block goes next.
- for (auto &I : *B) {
- // There's no need to call isInstructionTriviallyDead more than once on
- // an instruction. Therefore, once we know that an instruction is dead
- // we change its DFS number so that it doesn't get value numbered.
- if (isInstructionTriviallyDead(&I, TLI)) {
- InstrDFS[&I] = 0;
- LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
- markInstructionForDeletion(&I);
- continue;
- }
- if (isa<PHINode>(&I))
- RevisitOnReachabilityChange[B].set(End);
- InstrDFS[&I] = End++;
- DFSToInstr.emplace_back(&I);
- }
-
- // All of the range functions take half-open ranges (open on the end side).
- // So we do not subtract one from count, because at this point it is one
- // greater than the last instruction.
- return std::make_pair(Start, End);
-}
-
-void NewGVN::updateProcessedCount(const Value *V) {
-#ifndef NDEBUG
- if (ProcessedCount.count(V) == 0) {
- ProcessedCount.insert({V, 1});
- } else {
- ++ProcessedCount[V];
- assert(ProcessedCount[V] < 100 &&
- "Seem to have processed the same Value a lot");
- }
-#endif
-}
-
-// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
-void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
- // If all the arguments are the same, the MemoryPhi has the same value as the
- // argument. Filter out unreachable blocks and self phis from our operands.
- // TODO: We could do cycle-checking on the memory phis to allow valueizing for
- // self-phi checking.
- const BasicBlock *PHIBlock = MP->getBlock();
- auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
- return cast<MemoryAccess>(U) != MP &&
- !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
- ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
- });
- // If all that is left is nothing, our memoryphi is undef. We keep it as
- // TOPClass. Note: The only case this should happen is if we have at
- // least one self-argument.
- if (Filtered.begin() == Filtered.end()) {
- if (setMemoryClass(MP, TOPClass))
- markMemoryUsersTouched(MP);
- return;
- }
-
- // Transform the remaining operands into operand leaders.
- // FIXME: mapped_iterator should have a range version.
- auto LookupFunc = [&](const Use &U) {
- return lookupMemoryLeader(cast<MemoryAccess>(U));
- };
- auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
- auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
-
- // and now check if all the elements are equal.
- // Sadly, we can't use std::equals since these are random access iterators.
- const auto *AllSameValue = *MappedBegin;
- ++MappedBegin;
- bool AllEqual = std::all_of(
- MappedBegin, MappedEnd,
- [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
-
- if (AllEqual)
- LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue
- << "\n");
- else
- LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
- // If it's equal to something, it's in that class. Otherwise, it has to be in
- // a class where it is the leader (other things may be equivalent to it, but
- // it needs to start off in its own class, which means it must have been the
- // leader, and it can't have stopped being the leader because it was never
- // removed).
- CongruenceClass *CC =
- AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
- auto OldState = MemoryPhiState.lookup(MP);
- assert(OldState != MPS_Invalid && "Invalid memory phi state");
- auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
- MemoryPhiState[MP] = NewState;
- if (setMemoryClass(MP, CC) || OldState != NewState)
- markMemoryUsersTouched(MP);
-}
-
-// Value number a single instruction, symbolically evaluating, performing
-// congruence finding, and updating mappings.
-void NewGVN::valueNumberInstruction(Instruction *I) {
- LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n");
- if (!I->isTerminator()) {
- const Expression *Symbolized = nullptr;
- SmallPtrSet<Value *, 2> Visited;
- if (DebugCounter::shouldExecute(VNCounter)) {
- Symbolized = performSymbolicEvaluation(I, Visited);
- // Make a phi of ops if necessary
- if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
- !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
- auto *PHIE = makePossiblePHIOfOps(I, Visited);
- // If we created a phi of ops, use it.
- // If we couldn't create one, make sure we don't leave one lying around
- if (PHIE) {
- Symbolized = PHIE;
- } else if (auto *Op = RealToTemp.lookup(I)) {
- removePhiOfOps(I, Op);
- }
- }
- } else {
- // Mark the instruction as unused so we don't value number it again.
- InstrDFS[I] = 0;
- }
- // If we couldn't come up with a symbolic expression, use the unknown
- // expression
- if (Symbolized == nullptr)
- Symbolized = createUnknownExpression(I);
- performCongruenceFinding(I, Symbolized);
- } else {
- // Handle terminators that return values. All of them produce values we
- // don't currently understand. We don't place non-value producing
- // terminators in a class.
- if (!I->getType()->isVoidTy()) {
- auto *Symbolized = createUnknownExpression(I);
- performCongruenceFinding(I, Symbolized);
- }
- processOutgoingEdges(I, I->getParent());
- }
-}
-
-// Check if there is a path, using single or equal argument phi nodes, from
-// First to Second.
-bool NewGVN::singleReachablePHIPath(
- SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
- const MemoryAccess *Second) const {
- if (First == Second)
- return true;
- if (MSSA->isLiveOnEntryDef(First))
- return false;
-
- // This is not perfect, but as we're just verifying here, we can live with
- // the loss of precision. The real solution would be that of doing strongly
- // connected component finding in this routine, and it's probably not worth
- // the complexity for the time being. So, we just keep a set of visited
- // MemoryAccess and return true when we hit a cycle.
- if (Visited.count(First))
- return true;
- Visited.insert(First);
-
- const auto *EndDef = First;
- for (auto *ChainDef : optimized_def_chain(First)) {
- if (ChainDef == Second)
- return true;
- if (MSSA->isLiveOnEntryDef(ChainDef))
- return false;
- EndDef = ChainDef;
- }
- auto *MP = cast<MemoryPhi>(EndDef);
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
- };
- auto FilteredPhiArgs =
- make_filter_range(MP->operands(), ReachableOperandPred);
- SmallVector<const Value *, 32> OperandList;
- llvm::copy(FilteredPhiArgs, std::back_inserter(OperandList));
- bool Okay = is_splat(OperandList);
- if (Okay)
- return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
- Second);
- return false;
-}
-
-// Verify that the memory equivalence table makes sense relative to the
-// congruence classes. Note that this checking is not perfect, and is currently
-// subject to very rare false negatives. It is only useful for
-// testing/debugging.
-void NewGVN::verifyMemoryCongruency() const {
-#ifndef NDEBUG
- // Verify that the memory table equivalence and memory member set match
- for (const auto *CC : CongruenceClasses) {
- if (CC == TOPClass || CC->isDead())
- continue;
- if (CC->getStoreCount() != 0) {
- assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
- "Any class with a store as a leader should have a "
- "representative stored value");
- assert(CC->getMemoryLeader() &&
- "Any congruence class with a store should have a "
- "representative access");
- }
-
- if (CC->getMemoryLeader())
- assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
- "Representative MemoryAccess does not appear to be reverse "
- "mapped properly");
- for (auto M : CC->memory())
- assert(MemoryAccessToClass.lookup(M) == CC &&
- "Memory member does not appear to be reverse mapped properly");
- }
-
- // Anything equivalent in the MemoryAccess table should be in the same
- // congruence class.
-
- // Filter out the unreachable and trivially dead entries, because they may
- // never have been updated if the instructions were not processed.
- auto ReachableAccessPred =
- [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
- bool Result = ReachableBlocks.count(Pair.first->getBlock());
- if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
- MemoryToDFSNum(Pair.first) == 0)
- return false;
- if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
- return !isInstructionTriviallyDead(MemDef->getMemoryInst());
-
- // We could have phi nodes whose operands are all trivially dead,
- // so we don't process them.
- if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
- for (auto &U : MemPHI->incoming_values()) {
- if (auto *I = dyn_cast<Instruction>(&*U)) {
- if (!isInstructionTriviallyDead(I))
- return true;
- }
- }
- return false;
- }
-
- return true;
- };
-
- auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
- for (auto KV : Filtered) {
- if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
- auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
- if (FirstMUD && SecondMUD) {
- SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
- assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
- ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
- ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
- "The instructions for these memory operations should have "
- "been in the same congruence class or reachable through"
- "a single argument phi");
- }
- } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
- // We can only sanely verify that MemoryDefs in the operand list all have
- // the same class.
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableEdges.count(
- {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
- isa<MemoryDef>(U);
-
- };
- // All arguments should be in the same class, ignoring unreachable arguments
- auto FilteredPhiArgs =
- make_filter_range(FirstMP->operands(), ReachableOperandPred);
- SmallVector<const CongruenceClass *, 16> PhiOpClasses;
- std::transform(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
- std::back_inserter(PhiOpClasses), [&](const Use &U) {
- const MemoryDef *MD = cast<MemoryDef>(U);
- return ValueToClass.lookup(MD->getMemoryInst());
- });
- assert(is_splat(PhiOpClasses) &&
- "All MemoryPhi arguments should be in the same class");
- }
- }
-#endif
-}
-
-// Verify that the sparse propagation we did actually found the maximal fixpoint
-// We do this by storing the value to class mapping, touching all instructions,
-// and redoing the iteration to see if anything changed.
-void NewGVN::verifyIterationSettled(Function &F) {
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "Beginning iteration verification\n");
- if (DebugCounter::isCounterSet(VNCounter))
- DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
-
- // Note that we have to store the actual classes, as we may change existing
- // classes during iteration. This is because our memory iteration propagation
- // is not perfect, and so may waste a little work. But it should generate
- // exactly the same congruence classes we have now, with different IDs.
- std::map<const Value *, CongruenceClass> BeforeIteration;
-
- for (auto &KV : ValueToClass) {
- if (auto *I = dyn_cast<Instruction>(KV.first))
- // Skip unused/dead instructions.
- if (InstrToDFSNum(I) == 0)
- continue;
- BeforeIteration.insert({KV.first, *KV.second});
- }
-
- TouchedInstructions.set();
- TouchedInstructions.reset(0);
- iterateTouchedInstructions();
- DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
- EqualClasses;
- for (const auto &KV : ValueToClass) {
- if (auto *I = dyn_cast<Instruction>(KV.first))
- // Skip unused/dead instructions.
- if (InstrToDFSNum(I) == 0)
- continue;
- // We could sink these uses, but I think this adds a bit of clarity here as
- // to what we are comparing.
- auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
- auto *AfterCC = KV.second;
- // Note that the classes can't change at this point, so we memoize the set
- // that are equal.
- if (!EqualClasses.count({BeforeCC, AfterCC})) {
- assert(BeforeCC->isEquivalentTo(AfterCC) &&
- "Value number changed after main loop completed!");
- EqualClasses.insert({BeforeCC, AfterCC});
- }
- }
-#endif
-}
-
-// Verify that for each store expression in the expression to class mapping,
-// only the latest appears, and multiple ones do not appear.
-// Because loads do not use the stored value when doing equality with stores,
-// if we don't erase the old store expressions from the table, a load can find
-// a no-longer valid StoreExpression.
-void NewGVN::verifyStoreExpressions() const {
-#ifndef NDEBUG
- // This is the only use of this, and it's not worth defining a complicated
- // DenseMapInfo hash/equality function for it.
- std::set<
- std::pair<const Value *,
- std::tuple<const Value *, const CongruenceClass *, Value *>>>
- StoreExpressionSet;
- for (const auto &KV : ExpressionToClass) {
- if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
- // Make sure a version that will conflict with loads is not already there
- auto Res = StoreExpressionSet.insert(
- {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
- SE->getStoredValue())});
- bool Okay = Res.second;
- // It's okay to have the same expression already in there if it is
- // identical in nature.
- // This can happen when the leader of the stored value changes over time.
- if (!Okay)
- Okay = (std::get<1>(Res.first->second) == KV.second) &&
- (lookupOperandLeader(std::get<2>(Res.first->second)) ==
- lookupOperandLeader(SE->getStoredValue()));
- assert(Okay && "Stored expression conflict exists in expression table");
- auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
- assert(ValueExpr && ValueExpr->equals(*SE) &&
- "StoreExpression in ExpressionToClass is not latest "
- "StoreExpression for value");
- }
- }
-#endif
-}
-
-// This is the main value numbering loop, it iterates over the initial touched
-// instruction set, propagating value numbers, marking things touched, etc,
-// until the set of touched instructions is completely empty.
-void NewGVN::iterateTouchedInstructions() {
- unsigned int Iterations = 0;
- // Figure out where TouchedInstructions starts
- int FirstInstr = TouchedInstructions.find_first();
- // Nothing set, nothing to iterate, just return.
- if (FirstInstr == -1)
- return;
- const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
- while (TouchedInstructions.any()) {
- ++Iterations;
- // Walk through all the instructions in all the blocks in RPO.
- // TODO: As we hit a new block, we should push and pop equalities into a
- // table lookupOperandLeader can use, to catch things PredicateInfo
- // might miss, like edge-only equivalences.
- for (unsigned InstrNum : TouchedInstructions.set_bits()) {
-
- // This instruction was found to be dead. We don't bother looking
- // at it again.
- if (InstrNum == 0) {
- TouchedInstructions.reset(InstrNum);
- continue;
- }
-
- Value *V = InstrFromDFSNum(InstrNum);
- const BasicBlock *CurrBlock = getBlockForValue(V);
-
- // If we hit a new block, do reachability processing.
- if (CurrBlock != LastBlock) {
- LastBlock = CurrBlock;
- bool BlockReachable = ReachableBlocks.count(CurrBlock);
- const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
-
- // If it's not reachable, erase any touched instructions and move on.
- if (!BlockReachable) {
- TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- LLVM_DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
- continue;
- }
- updateProcessedCount(CurrBlock);
- }
- // Reset after processing (because we may mark ourselves as touched when
- // we propagate equalities).
- TouchedInstructions.reset(InstrNum);
-
- if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
- valueNumberMemoryPhi(MP);
- } else if (auto *I = dyn_cast<Instruction>(V)) {
- valueNumberInstruction(I);
- } else {
- llvm_unreachable("Should have been a MemoryPhi or Instruction");
- }
- updateProcessedCount(V);
- }
- }
- NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
-}
-
-// This is the main transformation entry point.
-bool NewGVN::runGVN() {
- if (DebugCounter::isCounterSet(VNCounter))
- StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
- bool Changed = false;
- NumFuncArgs = F.arg_size();
- MSSAWalker = MSSA->getWalker();
- SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
-
- // Count number of instructions for sizing of hash tables, and come
- // up with a global dfs numbering for instructions.
- unsigned ICount = 1;
- // Add an empty instruction to account for the fact that we start at 1
- DFSToInstr.emplace_back(nullptr);
- // Note: We want ideal RPO traversal of the blocks, which is not quite the
- // same as dominator tree order, particularly with regard to whether backedges
- // get visited first or second, given a block with multiple successors.
- // If we visit in the wrong order, we will end up performing N times as many
- // iterations.
- // The dominator tree does guarantee that, for a given dom tree node, its
- // parent must occur before it in the RPO ordering. Thus, we only need to sort
- // the siblings.
- ReversePostOrderTraversal<Function *> RPOT(&F);
- unsigned Counter = 0;
- for (auto &B : RPOT) {
- auto *Node = DT->getNode(B);
- assert(Node && "RPO and Dominator tree should have same reachability");
- RPOOrdering[Node] = ++Counter;
- }
- // Sort dominator tree children arrays into RPO.
- for (auto &B : RPOT) {
- auto *Node = DT->getNode(B);
- if (Node->getNumChildren() > 1)
+ I->deleteValue();
+ }
+
+ ValueToClass.clear();
+ ArgRecycler.clear(ExpressionAllocator);
+ ExpressionAllocator.Reset();
+ CongruenceClasses.clear();
+ ExpressionToClass.clear();
+ ValueToExpression.clear();
+ RealToTemp.clear();
+ AdditionalUsers.clear();
+ ExpressionToPhiOfOps.clear();
+ TempToBlock.clear();
+ TempToMemory.clear();
+ PHINodeUses.clear();
+ OpSafeForPHIOfOps.clear();
+ ReachableBlocks.clear();
+ ReachableEdges.clear();
+#ifndef NDEBUG
+ ProcessedCount.clear();
+#endif
+ InstrDFS.clear();
+ InstructionsToErase.clear();
+ DFSToInstr.clear();
+ BlockInstRange.clear();
+ TouchedInstructions.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
+ RevisitOnReachabilityChange.clear();
+}
+
+// Assign local DFS number mapping to instructions, and leave space for Value
+// PHI's.
+std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
+ unsigned Start) {
+ unsigned End = Start;
+ if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
+ InstrDFS[MemPhi] = End++;
+ DFSToInstr.emplace_back(MemPhi);
+ }
+
+ // Then the real block goes next.
+ for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
+ if (isa<PHINode>(&I))
+ RevisitOnReachabilityChange[B].set(End);
+ InstrDFS[&I] = End++;
+ DFSToInstr.emplace_back(&I);
+ }
+
+ // All of the range functions take half-open ranges (open on the end side).
+ // So we do not subtract one from count, because at this point it is one
+ // greater than the last instruction.
+ return std::make_pair(Start, End);
+}
+
+void NewGVN::updateProcessedCount(const Value *V) {
+#ifndef NDEBUG
+ if (ProcessedCount.count(V) == 0) {
+ ProcessedCount.insert({V, 1});
+ } else {
+ ++ProcessedCount[V];
+ assert(ProcessedCount[V] < 100 &&
+ "Seem to have processed the same Value a lot");
+ }
+#endif
+}
+
+// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
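+// A MemoryPhi whose reachable, non-self, non-TOP operands all share the same
+// leader is congruent to that leader; otherwise it becomes the leader of its
+// own memory congruence class.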
+void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
+ // If all the arguments are the same, the MemoryPhi has the same value as the
+ // argument. Filter out unreachable blocks and self phis from our operands.
+ // TODO: We could do cycle-checking on the memory phis to allow valueizing for
+ // self-phi checking.
+ const BasicBlock *PHIBlock = MP->getBlock();
+ auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
+ return cast<MemoryAccess>(U) != MP &&
+ !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
+ });
+ // If all that is left is nothing, our memoryphi is undef. We keep it as
+ // TOPClass. Note: The only case this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
+
+ // Transform the remaining operands into operand leaders.
+ // FIXME: mapped_iterator should have a range version.
+ auto LookupFunc = [&](const Use &U) {
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
+ };
+ auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
+ auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
+
+ // and now check if all the elements are equal.
+ // Sadly, we can't use std::equals since these are random access iterators.
+ const auto *AllSameValue = *MappedBegin;
+ ++MappedBegin;
+ bool AllEqual = std::all_of(
+ MappedBegin, MappedEnd,
+ [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
+
+ if (AllEqual)
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue
+ << "\n");
+ else
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
+ markMemoryUsersTouched(MP);
+}
+
+// Value number a single instruction, symbolically evaluating, performing
+// congruence finding, and updating mappings.
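+// Non-terminators are symbolically evaluated (possibly via a phi of ops);
+// terminators that produce a value get an unknown expression, and terminator
+// outgoing edges are always processed for reachability.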
+void NewGVN::valueNumberInstruction(Instruction *I) {
+ LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ if (!I->isTerminator()) {
+ const Expression *Symbolized = nullptr;
+ SmallPtrSet<Value *, 2> Visited;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I, Visited);
+ // Make a phi of ops if necessary
+ if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
+ !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
+ auto *PHIE = makePossiblePHIOfOps(I, Visited);
+ // If we created a phi of ops, use it.
+ // If we couldn't create one, make sure we don't leave one lying around
+ if (PHIE) {
+ Symbolized = PHIE;
+ } else if (auto *Op = RealToTemp.lookup(I)) {
+ removePhiOfOps(I, Op);
+ }
+ }
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
+ // If we couldn't come up with a symbolic expression, use the unknown
+ // expression
+ if (Symbolized == nullptr)
+ Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ } else {
+ // Handle terminators that return values. All of them produce values we
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
+ if (!I->getType()->isVoidTy()) {
+ auto *Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ }
+ processOutgoingEdges(I, I->getParent());
+ }
+}
+
+// Check if there is a path, using single or equal argument phi nodes, from
+// First to Second.
+bool NewGVN::singleReachablePHIPath(
+ SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
+ const MemoryAccess *Second) const {
+ if (First == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(First))
+ return false;
+
+ // This is not perfect, but as we're just verifying here, we can live with
+ // the loss of precision. The real solution would be that of doing strongly
+ // connected component finding in this routine, and it's probably not worth
+ // the complexity for the time being. So, we just keep a set of visited
+ // MemoryAccess and return true when we hit a cycle.
+ if (Visited.count(First))
+ return true;
+ Visited.insert(First);
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
+ }
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ llvm::copy(FilteredPhiArgs, std::back_inserter(OperandList));
+ bool Okay = is_splat(OperandList);
+ if (Okay)
+ return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
+ Second);
+ return false;
+}
+
+// Verify that the memory equivalence table makes sense relative to the
+// congruence classes. Note that this checking is not perfect, and is currently
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
+void NewGVN::verifyMemoryCongruency() const {
+#ifndef NDEBUG
+ // Verify that the memory table equivalence and memory member set match
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a leader should have a "
+ "representative stored value");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should have a "
+ "representative access");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
+ // congruence class.
+
+ // Filter out the unreachable and trivially dead entries, because they may
+ // never have been updated if the instructions were not processed.
+ auto ReachableAccessPred =
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
+ bool Result = ReachableBlocks.count(Pair.first->getBlock());
+ if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
+ MemoryToDFSNum(Pair.first) == 0)
+ return false;
+ if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
+ return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (auto *I = dyn_cast<Instruction>(&*U)) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ return true;
+ };
+
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
+ for (auto KV : Filtered) {
+ if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
+ if (FirstMUD && SecondMUD) {
+ SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
+ assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
+ }
+ } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
+ // We can only sanely verify that MemoryDefs in the operand list all have
+ // the same class.
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
+ isa<MemoryDef>(U);
+
+ };
+ // All arguments should be in the same class, ignoring unreachable arguments
+ auto FilteredPhiArgs =
+ make_filter_range(FirstMP->operands(), ReachableOperandPred);
+ SmallVector<const CongruenceClass *, 16> PhiOpClasses;
+ std::transform(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(PhiOpClasses), [&](const Use &U) {
+ const MemoryDef *MD = cast<MemoryDef>(U);
+ return ValueToClass.lookup(MD->getMemoryInst());
+ });
+ assert(is_splat(PhiOpClasses) &&
+ "All MemoryPhi arguments should be in the same class");
+ }
+ }
+#endif
+}
+
+// Verify that the sparse propagation we did actually found the maximal fixpoint
+// We do this by storing the value to class mapping, touching all instructions,
+// and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the set
+ // that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
+// Verify that for each store expression in the expression to class mapping,
+// only the latest appears, and multiple ones do not appear.
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+ // This is the only use of this, and it's not worth defining a complicated
+ // DenseMapInfo hash/equality function for it.
+ std::set<
+ std::pair<const Value *,
+ std::tuple<const Value *, const CongruenceClass *, Value *>>>
+ StoreExpressionSet;
+ for (const auto &KV : ExpressionToClass) {
+ if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+ // Make sure a version that will conflict with loads is not already there
+ auto Res = StoreExpressionSet.insert(
+ {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
+ SE->getStoredValue())});
+ bool Okay = Res.second;
+ // It's okay to have the same expression already in there if it is
+ // identical in nature.
+ // This can happen when the leader of the stored value changes over time.
+ if (!Okay)
+ Okay = (std::get<1>(Res.first->second) == KV.second) &&
+ (lookupOperandLeader(std::get<2>(Res.first->second)) ==
+ lookupOperandLeader(SE->getStoredValue()));
+ assert(Okay && "Stored expression conflict exists in expression table");
+ auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+ assert(ValueExpr && ValueExpr->equals(*SE) &&
+ "StoreExpression in ExpressionToClass is not latest "
+ "StoreExpression for value");
+ }
+ }
+#endif
+}
+
+// This is the main value numbering loop, it iterates over the initial touched
+// instruction set, propagating value numbers, marking things touched, etc,
+// until the set of touched instructions is completely empty.
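+// Instructions are visited in DFS-number order (which follows RPO over the
+// blocks), and processing one instruction may touch more instructions, so this
+// loops until a fixpoint is reached.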
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (unsigned InstrNum : TouchedInstructions.set_bits()) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ const BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ LLVM_DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
+}
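
A stripped-down sketch of the fixpoint shape of iterateTouchedInstructions, using std::set in place of LLVM's bitvector and a made-up user graph; processing an element may re-touch others, and the loop runs until nothing is touched:

#include <cstdio>
#include <set>
#include <vector>

int main() {
  // Toy, acyclic "users" relation: processing node I re-touches every node in
  // Users[I], the way a changed value number re-touches its users.
  std::vector<std::vector<int>> Users = {{1, 2}, {2}, {}};
  std::set<int> Touched = {0};

  unsigned Iterations = 0;
  while (!Touched.empty()) {
    ++Iterations;
    // Drain a snapshot so anything re-touched is handled on a later pass,
    // mirroring the fixed (RPO) order in which the bitvector is walked.
    std::set<int> Current;
    Current.swap(Touched);
    for (int I : Current)
      for (int J : Users[I])
        Touched.insert(J);
  }
  std::printf("settled after %u iterations\n", Iterations);
  return 0;
}
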
+
+// This is the main transformation entry point.
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
+ bool Changed = false;
+ NumFuncArgs = F.arg_size();
+ MSSAWalker = MSSA->getWalker();
+ SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
+
+ // Count number of instructions for sizing of hash tables, and come
+ // up with a global dfs numbering for instructions.
+ unsigned ICount = 1;
+ // Add an empty instruction to account for the fact that we start at 1
+ DFSToInstr.emplace_back(nullptr);
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+  // same as dominator tree order, particularly with regard to whether backedges
+ // get visited first or second, given a block with multiple successors.
+ // If we visit in the wrong order, we will end up performing N times as many
+ // iterations.
+  // The dominator tree does guarantee that, for a given dom tree node, its
+ // parent must occur before it in the RPO ordering. Thus, we only need to sort
+ // the siblings.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ unsigned Counter = 0;
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ assert(Node && "RPO and Dominator tree should have same reachability");
+ RPOOrdering[Node] = ++Counter;
+ }
+ // Sort dominator tree children arrays into RPO.
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ if (Node->getNumChildren() > 1)
llvm::sort(*Node, [&](const DomTreeNode *A, const DomTreeNode *B) {
return RPOOrdering[A] < RPOOrdering[B];
});
- }
-
- // Now a standard depth first ordering of the domtree is equivalent to RPO.
- for (auto DTN : depth_first(DT->getRootNode())) {
- BasicBlock *B = DTN->getBlock();
- const auto &BlockRange = assignDFSNumbers(B, ICount);
- BlockInstRange.insert({B, BlockRange});
- ICount += BlockRange.second - BlockRange.first;
- }
- initializeCongruenceClasses(F);
-
- TouchedInstructions.resize(ICount);
- // Ensure we don't end up resizing the expressionToClass map, as
- // that can be quite expensive. At most, we have one expression per
- // instruction.
- ExpressionToClass.reserve(ICount);
-
- // Initialize the touched instructions to include the entry block.
- const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
- TouchedInstructions.set(InstRange.first, InstRange.second);
- LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
- << " marked reachable\n");
- ReachableBlocks.insert(&F.getEntryBlock());
-
- iterateTouchedInstructions();
- verifyMemoryCongruency();
- verifyIterationSettled(F);
- verifyStoreExpressions();
-
- Changed |= eliminateInstructions(F);
-
- // Delete all instructions marked for deletion.
- for (Instruction *ToErase : InstructionsToErase) {
- if (!ToErase->use_empty())
- ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
-
- assert(ToErase->getParent() &&
- "BB containing ToErase deleted unexpectedly!");
- ToErase->eraseFromParent();
- }
- Changed |= !InstructionsToErase.empty();
-
- // Delete all unreachable blocks.
- auto UnreachableBlockPred = [&](const BasicBlock &BB) {
- return !ReachableBlocks.count(&BB);
- };
-
- for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
- LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
- << " is unreachable\n");
- deleteInstructionsInBlock(&BB);
- Changed = true;
- }
-
- cleanupTables();
- return Changed;
-}
-
-struct NewGVN::ValueDFS {
- int DFSIn = 0;
- int DFSOut = 0;
- int LocalNum = 0;
-
- // Only one of Def and U will be set.
- // The bool in the Def tells us whether the Def is the stored value of a
- // store.
- PointerIntPair<Value *, 1, bool> Def;
- Use *U = nullptr;
-
- bool operator<(const ValueDFS &Other) const {
- // It's not enough that any given field be less than - we have sets
- // of fields that need to be evaluated together to give a proper ordering.
- // For example, if you have;
- // DFS (1, 3)
- // Val 0
- // DFS (1, 2)
- // Val 50
- // We want the second to be less than the first, but if we just go field
- // by field, we will get to Val 0 < Val 50 and say the first is less than
- // the second. We only want it to be less than if the DFS orders are equal.
- //
- // Each LLVM instruction only produces one value, and thus the lowest-level
-    // differentiator that really matters for the stack (and what we use as a
- // replacement) is the local dfs number.
- // Everything else in the structure is instruction level, and only affects
- // the order in which we will replace operands of a given instruction.
- //
- // For a given instruction (IE things with equal dfsin, dfsout, localnum),
- // the order of replacement of uses does not matter.
- // IE given,
- // a = 5
- // b = a + a
- // When you hit b, you will have two valuedfs with the same dfsin, out, and
- // localnum.
- // The .val will be the same as well.
- // The .u's will be different.
- // You will replace both, and it does not matter what order you replace them
- // in (IE whether you replace operand 2, then operand 1, or operand 1, then
- // operand 2).
- // Similarly for the case of same dfsin, dfsout, localnum, but different
- // .val's
- // a = 5
- // b = 6
- // c = a + b
-    // in c, we will have a valuedfs for a, and one for b, with everything the same
- // but .val and .u.
- // It does not matter what order we replace these operands in.
- // You will always end up with the same IR, and this is guaranteed.
- return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
- std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
- Other.U);
- }
-};
-
-// This function converts the set of members for a congruence class from values,
-// to sets of defs and uses with associated DFS info. The total number of
-// reachable uses for each value is stored in UseCounts, and instructions that
-// seem dead (have no non-dead uses) are stored in ProbablyDead.
-void NewGVN::convertClassToDFSOrdered(
- const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
- DenseMap<const Value *, unsigned int> &UseCounts,
- SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
- for (auto D : Dense) {
- // First add the value.
- BasicBlock *BB = getBlockForValue(D);
- // Constants are handled prior to ever calling this function, so
- // we should only be left with instructions as members.
- assert(BB && "Should have figured out a basic block for value");
- ValueDFS VDDef;
- DomTreeNode *DomNode = DT->getNode(BB);
- VDDef.DFSIn = DomNode->getDFSNumIn();
- VDDef.DFSOut = DomNode->getDFSNumOut();
- // If it's a store, use the leader of the value operand, if it's always
- // available, or the value operand. TODO: We could do dominance checks to
- // find a dominating leader, but not worth it ATM.
- if (auto *SI = dyn_cast<StoreInst>(D)) {
- auto Leader = lookupOperandLeader(SI->getValueOperand());
- if (alwaysAvailable(Leader)) {
- VDDef.Def.setPointer(Leader);
- } else {
- VDDef.Def.setPointer(SI->getValueOperand());
- VDDef.Def.setInt(true);
- }
- } else {
- VDDef.Def.setPointer(D);
- }
- assert(isa<Instruction>(D) &&
- "The dense set member should always be an instruction");
- Instruction *Def = cast<Instruction>(D);
- VDDef.LocalNum = InstrToDFSNum(D);
- DFSOrderedSet.push_back(VDDef);
- // If there is a phi node equivalent, add it
- if (auto *PN = RealToTemp.lookup(Def)) {
- auto *PHIE =
- dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
- if (PHIE) {
- VDDef.Def.setInt(false);
- VDDef.Def.setPointer(PN);
- VDDef.LocalNum = 0;
- DFSOrderedSet.push_back(VDDef);
- }
- }
-
- unsigned int UseCount = 0;
- // Now add the uses.
- for (auto &U : Def->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- // Don't try to replace into dead uses
- if (InstructionsToErase.count(I))
- continue;
- ValueDFS VDUse;
- // Put the phi node uses in the incoming block.
- BasicBlock *IBlock;
- if (auto *P = dyn_cast<PHINode>(I)) {
- IBlock = P->getIncomingBlock(U);
- // Make phi node users appear last in the incoming block
- // they are from.
- VDUse.LocalNum = InstrDFS.size() + 1;
- } else {
- IBlock = getBlockForValue(I);
- VDUse.LocalNum = InstrToDFSNum(I);
- }
-
- // Skip uses in unreachable blocks, as we're going
- // to delete them.
- if (ReachableBlocks.count(IBlock) == 0)
- continue;
-
- DomTreeNode *DomNode = DT->getNode(IBlock);
- VDUse.DFSIn = DomNode->getDFSNumIn();
- VDUse.DFSOut = DomNode->getDFSNumOut();
- VDUse.U = &U;
- ++UseCount;
- DFSOrderedSet.emplace_back(VDUse);
- }
- }
-
-    // If there are no uses, it's probably dead (but it may have side-effects,
-    // so it is not definitely dead). Otherwise, store the number of uses so we
-    // can track whether it becomes dead later.
- if (UseCount == 0)
- ProbablyDead.insert(Def);
- else
- UseCounts[Def] = UseCount;
- }
-}
-
-// This function converts the set of members for a congruence class from values,
-// to the set of defs for loads and stores, with associated DFS info.
-void NewGVN::convertClassToLoadsAndStores(
- const CongruenceClass &Dense,
- SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
- for (auto D : Dense) {
- if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
- continue;
-
- BasicBlock *BB = getBlockForValue(D);
- ValueDFS VD;
- DomTreeNode *DomNode = DT->getNode(BB);
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.Def.setPointer(D);
-
- // If it's an instruction, use the real local dfs number.
- if (auto *I = dyn_cast<Instruction>(D))
- VD.LocalNum = InstrToDFSNum(I);
- else
- llvm_unreachable("Should have been an instruction");
-
- LoadsAndStores.emplace_back(VD);
- }
-}
-
-static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
- patchReplacementInstruction(I, Repl);
- I->replaceAllUsesWith(Repl);
-}
-
-void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
- LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
- ++NumGVNBlocksDeleted;
-
-  // Delete the instructions backwards, as doing so reduces the number of
-  // def-use and use-def chains we have to update. Start after the terminator.
- auto StartPoint = BB->rbegin();
- ++StartPoint;
- // Note that we explicitly recalculate BB->rend() on each iteration,
- // as it may change when we remove the first instruction.
- for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
- Instruction &Inst = *I++;
- if (!Inst.use_empty())
- Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
- if (isa<LandingPadInst>(Inst))
- continue;
- salvageKnowledge(&Inst, AC);
-
- Inst.eraseFromParent();
- ++NumGVNInstrDeleted;
- }
- // Now insert something that simplifycfg will turn into an unreachable.
- Type *Int8Ty = Type::getInt8Ty(BB->getContext());
- new StoreInst(UndefValue::get(Int8Ty),
- Constant::getNullValue(Int8Ty->getPointerTo()),
- BB->getTerminator());
-}
-
-void NewGVN::markInstructionForDeletion(Instruction *I) {
- LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
- InstructionsToErase.insert(I);
-}
-
-void NewGVN::replaceInstruction(Instruction *I, Value *V) {
- LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
- patchAndReplaceAllUsesWith(I, V);
- // We save the actual erasing to avoid invalidating memory
- // dependencies until we are done with everything.
- markInstructionForDeletion(I);
-}
-
-namespace {
-
-// This is a stack that contains both the value and dfs info of where
-// that value is valid.
-class ValueDFSStack {
-public:
- Value *back() const { return ValueStack.back(); }
- std::pair<int, int> dfs_back() const { return DFSStack.back(); }
-
- void push_back(Value *V, int DFSIn, int DFSOut) {
- ValueStack.emplace_back(V);
- DFSStack.emplace_back(DFSIn, DFSOut);
- }
-
- bool empty() const { return DFSStack.empty(); }
-
- bool isInScope(int DFSIn, int DFSOut) const {
- if (empty())
- return false;
- return DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second;
- }
-
- void popUntilDFSScope(int DFSIn, int DFSOut) {
-
- // These two should always be in sync at this point.
- assert(ValueStack.size() == DFSStack.size() &&
- "Mismatch between ValueStack and DFSStack");
- while (
- !DFSStack.empty() &&
- !(DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second)) {
- DFSStack.pop_back();
- ValueStack.pop_back();
- }
- }
-
-private:
- SmallVector<Value *, 8> ValueStack;
- SmallVector<std::pair<int, int>, 8> DFSStack;
-};
-
-} // end anonymous namespace
-
-// Given an expression, get the congruence class for it.
-CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const {
- if (auto *VE = dyn_cast<VariableExpression>(E))
- return ValueToClass.lookup(VE->getVariableValue());
- else if (isa<DeadExpression>(E))
- return TOPClass;
- return ExpressionToClass.lookup(E);
-}
-
-// Given a value and a basic block we are trying to see if it is available in,
-// see if the value has a leader available in that block.
-Value *NewGVN::findPHIOfOpsLeader(const Expression *E,
- const Instruction *OrigInst,
- const BasicBlock *BB) const {
- // It would already be constant if we could make it constant
- if (auto *CE = dyn_cast<ConstantExpression>(E))
- return CE->getConstantValue();
- if (auto *VE = dyn_cast<VariableExpression>(E)) {
- auto *V = VE->getVariableValue();
- if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB))
- return VE->getVariableValue();
- }
-
- auto *CC = getClassForExpression(E);
- if (!CC)
- return nullptr;
- if (alwaysAvailable(CC->getLeader()))
- return CC->getLeader();
-
- for (auto Member : *CC) {
- auto *MemberInst = dyn_cast<Instruction>(Member);
- if (MemberInst == OrigInst)
- continue;
- // Anything that isn't an instruction is always available.
- if (!MemberInst)
- return Member;
- if (DT->dominates(getBlockForValue(MemberInst), BB))
- return Member;
- }
- return nullptr;
-}
-
-bool NewGVN::eliminateInstructions(Function &F) {
- // This is a non-standard eliminator. The normal way to eliminate is
- // to walk the dominator tree in order, keeping track of available
- // values, and eliminating them. However, this is mildly
- // pointless. It requires doing lookups on every instruction,
-  // regardless of whether we will ever eliminate it. For instructions in
-  // singleton congruence classes (which is most of them), we know we will
-  // never eliminate them.
-
- // Instead, this eliminator looks at the congruence classes directly, sorts
- // them into a DFS ordering of the dominator tree, and then we just
- // perform elimination straight on the sets by walking the congruence
- // class member uses in order, and eliminate the ones dominated by the
- // last member. This is worst case O(E log E) where E = number of
- // instructions in a single congruence class. In theory, this is all
- // instructions. In practice, it is much faster, as most instructions are
- // either in singleton congruence classes or can't possibly be eliminated
- // anyway (if there are no overlapping DFS ranges in class).
- // When we find something not dominated, it becomes the new leader
- // for elimination purposes.
- // TODO: If we wanted to be faster, We could remove any members with no
- // overlapping ranges while sorting, as we will never eliminate anything
- // with those members, as they don't dominate anything else in our set.
-
- bool AnythingReplaced = false;
-
- // Since we are going to walk the domtree anyway, and we can't guarantee the
- // DFS numbers are updated, we compute some ourselves.
- DT->updateDFSNumbers();
-
- // Go through all of our phi nodes, and kill the arguments associated with
- // unreachable edges.
- auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
- for (auto &Operand : PHI->incoming_values())
- if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
- LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
- << " for block "
- << getBlockName(PHI->getIncomingBlock(Operand))
- << " with undef due to it being unreachable\n");
- Operand.set(UndefValue::get(PHI->getType()));
- }
- };
- // Replace unreachable phi arguments.
- // At this point, RevisitOnReachabilityChange only contains:
- //
- // 1. PHIs
- // 2. Temporaries that will convert to PHIs
- // 3. Operations that are affected by an unreachable edge but do not fit into
- // 1 or 2 (rare).
- // So it is a slight overshoot of what we want. We could make it exact by
- // using two SparseBitVectors per block.
- DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
- for (auto &KV : ReachableEdges)
- ReachablePredCount[KV.getEnd()]++;
- for (auto &BBPair : RevisitOnReachabilityChange) {
- for (auto InstNum : BBPair.second) {
- auto *Inst = InstrFromDFSNum(InstNum);
- auto *PHI = dyn_cast<PHINode>(Inst);
- PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst));
- if (!PHI)
- continue;
- auto *BB = BBPair.first;
- if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues())
- ReplaceUnreachablePHIArgs(PHI, BB);
- }
- }
-
- // Map to store the use counts
- DenseMap<const Value *, unsigned int> UseCounts;
- for (auto *CC : reverse(CongruenceClasses)) {
- LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
- << "\n");
- // Track the equivalent store info so we can decide whether to try
- // dead store elimination.
- SmallVector<ValueDFS, 8> PossibleDeadStores;
- SmallPtrSet<Instruction *, 8> ProbablyDead;
- if (CC->isDead() || CC->empty())
- continue;
- // Everything still in the TOP class is unreachable or dead.
- if (CC == TOPClass) {
- for (auto M : *CC) {
- auto *VTE = ValueToExpression.lookup(M);
- if (VTE && isa<DeadExpression>(VTE))
- markInstructionForDeletion(cast<Instruction>(M));
- assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
- InstructionsToErase.count(cast<Instruction>(M))) &&
- "Everything in TOP should be unreachable or dead at this "
- "point");
- }
- continue;
- }
-
- assert(CC->getLeader() && "We should have had a leader");
- // If this is a leader that is always available, and it's a
- // constant or has no equivalences, just replace everything with
- // it. We then update the congruence class with whatever members
- // are left.
- Value *Leader =
- CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
- if (alwaysAvailable(Leader)) {
- CongruenceClass::MemberSet MembersLeft;
- for (auto M : *CC) {
- Value *Member = M;
- // Void things have no uses we can replace.
- if (Member == Leader || !isa<Instruction>(Member) ||
- Member->getType()->isVoidTy()) {
- MembersLeft.insert(Member);
- continue;
- }
- LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for "
- << *Member << "\n");
- auto *I = cast<Instruction>(Member);
- assert(Leader != I && "About to accidentally remove our leader");
- replaceInstruction(I, Leader);
- AnythingReplaced = true;
- }
- CC->swap(MembersLeft);
- } else {
- // If this is a singleton, we can skip it.
- if (CC->size() != 1 || RealToTemp.count(Leader)) {
- // This is a stack because equality replacement/etc may place
- // constants in the middle of the member list, and we want to use
- // those constant values in preference to the current leader, over
- // the scope of those constants.
- ValueDFSStack EliminationStack;
-
- // Convert the members to DFS ordered sets and then merge them.
- SmallVector<ValueDFS, 8> DFSOrderedSet;
- convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
-
- // Sort the whole thing.
- llvm::sort(DFSOrderedSet);
- for (auto &VD : DFSOrderedSet) {
- int MemberDFSIn = VD.DFSIn;
- int MemberDFSOut = VD.DFSOut;
- Value *Def = VD.Def.getPointer();
- bool FromStore = VD.Def.getInt();
- Use *U = VD.U;
- // We ignore void things because we can't get a value from them.
- if (Def && Def->getType()->isVoidTy())
- continue;
- auto *DefInst = dyn_cast_or_null<Instruction>(Def);
- if (DefInst && AllTempInstructions.count(DefInst)) {
- auto *PN = cast<PHINode>(DefInst);
-
-            // If this is a value phi and that's the expression we used, insert
-            // it into the program and remove it from the temp instruction list.
- AllTempInstructions.erase(PN);
- auto *DefBlock = getBlockForValue(Def);
- LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
- << " into block "
- << getBlockName(getBlockForValue(Def)) << "\n");
- PN->insertBefore(&DefBlock->front());
- Def = PN;
- NumGVNPHIOfOpsEliminations++;
- }
-
- if (EliminationStack.empty()) {
- LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n");
- } else {
- LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
- << EliminationStack.dfs_back().first << ","
- << EliminationStack.dfs_back().second << ")\n");
- }
-
- LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
- << MemberDFSOut << ")\n");
- // First, we see if we are out of scope or empty. If so,
-          // and there are equivalences, we try to replace the top of
- // stack with equivalences (if it's on the stack, it must
- // not have been eliminated yet).
- // Then we synchronize to our current scope, by
- // popping until we are back within a DFS scope that
- // dominates the current member.
- // Then, what happens depends on a few factors
- // If the stack is now empty, we need to push
- // If we have a constant or a local equivalence we want to
- // start using, we also push.
- // Otherwise, we walk along, processing members who are
- // dominated by this scope, and eliminate them.
- bool ShouldPush = Def && EliminationStack.empty();
- bool OutOfScope =
- !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
-
- if (OutOfScope || ShouldPush) {
- // Sync to our current scope.
- EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- bool ShouldPush = Def && EliminationStack.empty();
- if (ShouldPush) {
- EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
- }
- }
-
- // Skip the Def's, we only want to eliminate on their uses. But mark
- // dominated defs as dead.
- if (Def) {
- // For anything in this case, what and how we value number
-            // guarantees that any side-effects that would have occurred (ie
- // throwing, etc) can be proven to either still occur (because it's
- // dominated by something that has the same side-effects), or never
- // occur. Otherwise, we would not have been able to prove it value
- // equivalent to something else. For these things, we can just mark
- // it all dead. Note that this is different from the "ProbablyDead"
- // set, which may not be dominated by anything, and thus, are only
- // easy to prove dead if they are also side-effect free. Note that
- // because stores are put in terms of the stored value, we skip
- // stored values here. If the stored value is really dead, it will
- // still be marked for deletion when we process it in its own class.
- if (!EliminationStack.empty() && Def != EliminationStack.back() &&
- isa<Instruction>(Def) && !FromStore)
- markInstructionForDeletion(cast<Instruction>(Def));
- continue;
- }
- // At this point, we know it is a Use we are trying to possibly
- // replace.
-
- assert(isa<Instruction>(U->get()) &&
- "Current def should have been an instruction");
- assert(isa<Instruction>(U->getUser()) &&
- "Current user should have been an instruction");
-
- // If the thing we are replacing into is already marked to be dead,
- // this use is dead. Note that this is true regardless of whether
- // we have anything dominating the use or not. We do this here
- // because we are already walking all the uses anyway.
- Instruction *InstUse = cast<Instruction>(U->getUser());
- if (InstructionsToErase.count(InstUse)) {
- auto &UseCount = UseCounts[U->get()];
- if (--UseCount == 0) {
- ProbablyDead.insert(cast<Instruction>(U->get()));
- }
- }
-
- // If we get to this point, and the stack is empty we must have a use
- // with nothing we can use to eliminate this use, so just skip it.
- if (EliminationStack.empty())
- continue;
-
- Value *DominatingLeader = EliminationStack.back();
-
- auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
- bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
- if (isSSACopy)
- DominatingLeader = II->getOperand(0);
-
- // Don't replace our existing users with ourselves.
- if (U->get() == DominatingLeader)
- continue;
- LLVM_DEBUG(dbgs()
- << "Found replacement " << *DominatingLeader << " for "
- << *U->get() << " in " << *(U->getUser()) << "\n");
-
- // If we replaced something in an instruction, handle the patching of
- // metadata. Skip this if we are replacing predicateinfo with its
- // original operand, as we already know we can just drop it.
- auto *ReplacedInst = cast<Instruction>(U->get());
- auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
- if (!PI || DominatingLeader != PI->OriginalOp)
- patchReplacementInstruction(ReplacedInst, DominatingLeader);
- U->set(DominatingLeader);
- // This is now a use of the dominating leader, which means if the
- // dominating leader was dead, it's now live!
- auto &LeaderUseCount = UseCounts[DominatingLeader];
- // It's about to be alive again.
- if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
- ProbablyDead.erase(cast<Instruction>(DominatingLeader));
- // For copy instructions, we use their operand as a leader,
- // which means we remove a user of the copy and it may become dead.
- if (isSSACopy) {
- unsigned &IIUseCount = UseCounts[II];
- if (--IIUseCount == 0)
- ProbablyDead.insert(II);
- }
- ++LeaderUseCount;
- AnythingReplaced = true;
- }
- }
- }
-
-    // At this point, anything still in the ProbablyDead set is actually dead
-    // if it would be trivially dead.
- for (auto *I : ProbablyDead)
- if (wouldInstructionBeTriviallyDead(I))
- markInstructionForDeletion(I);
-
- // Cleanup the congruence class.
- CongruenceClass::MemberSet MembersLeft;
- for (auto *Member : *CC)
- if (!isa<Instruction>(Member) ||
- !InstructionsToErase.count(cast<Instruction>(Member)))
- MembersLeft.insert(Member);
- CC->swap(MembersLeft);
-
- // If we have possible dead stores to look at, try to eliminate them.
- if (CC->getStoreCount() > 0) {
- convertClassToLoadsAndStores(*CC, PossibleDeadStores);
- llvm::sort(PossibleDeadStores);
- ValueDFSStack EliminationStack;
- for (auto &VD : PossibleDeadStores) {
- int MemberDFSIn = VD.DFSIn;
- int MemberDFSOut = VD.DFSOut;
- Instruction *Member = cast<Instruction>(VD.Def.getPointer());
- if (EliminationStack.empty() ||
- !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
- // Sync to our current scope.
- EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- if (EliminationStack.empty()) {
- EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
- continue;
- }
- }
- // We already did load elimination, so nothing to do here.
- if (isa<LoadInst>(Member))
- continue;
- assert(!EliminationStack.empty());
- Instruction *Leader = cast<Instruction>(EliminationStack.back());
- (void)Leader;
- assert(DT->dominates(Leader->getParent(), Member->getParent()));
-        // Member is dominated by Leader, and thus dead.
- LLVM_DEBUG(dbgs() << "Marking dead store " << *Member
- << " that is dominated by " << *Leader << "\n");
- markInstructionForDeletion(Member);
- CC->erase(Member);
- ++NumGVNDeadStores;
- }
- }
- }
- return AnythingReplaced;
-}
-
-// This function provides global ranking of operations so that we can place them
-// in a canonical order. Note that rank alone is not necessarily enough for a
-// complete ordering, as constants all have the same rank. However, generally,
-// we will simplify an operation with all constants so that it doesn't matter
-// what order they appear in.
-unsigned int NewGVN::getRank(const Value *V) const {
- // Prefer constants to undef to anything else
- // Undef is a constant, have to check it first.
- // Prefer smaller constants to constantexprs
- if (isa<ConstantExpr>(V))
- return 2;
- if (isa<UndefValue>(V))
- return 1;
- if (isa<Constant>(V))
- return 0;
- else if (auto *A = dyn_cast<Argument>(V))
- return 3 + A->getArgNo();
-
- // Need to shift the instruction DFS by number of arguments + 3 to account for
- // the constant and argument ranking above.
- unsigned Result = InstrToDFSNum(V);
- if (Result > 0)
- return 4 + NumFuncArgs + Result;
- // Unreachable or something else, just return a really large number.
- return ~0;
-}
-
-// This is a function that says whether two commutative operations should
-// have their order swapped when canonicalizing.
-bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
- // Because we only care about a total ordering, and don't rewrite expressions
- // in this order, we order by rank, which will give a strict weak ordering to
- // everything but constants, and then we order by pointer address.
- return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
-}
-
-namespace {
-
-class NewGVNLegacyPass : public FunctionPass {
-public:
- // Pass identification, replacement for typeid.
- static char ID;
-
- NewGVNLegacyPass() : FunctionPass(ID) {
- initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
-private:
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-bool NewGVNLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
- F.getParent()->getDataLayout())
- .runGVN();
-}
-
-char NewGVNLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
- false)
-
-// createGVNPass - The public interface to this file.
-FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
-
-PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
-  // Apparently the order in which we get these results matters for
- // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
- // the same order here, just in case.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed =
- NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
- .runGVN();
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+ }
+
+ // Now a standard depth first ordering of the domtree is equivalent to RPO.
+ for (auto DTN : depth_first(DT->getRootNode())) {
+ BasicBlock *B = DTN->getBlock();
+ const auto &BlockRange = assignDFSNumbers(B, ICount);
+ BlockInstRange.insert({B, BlockRange});
+ ICount += BlockRange.second - BlockRange.first;
+ }
+ initializeCongruenceClasses(F);
+
+ TouchedInstructions.resize(ICount);
+ // Ensure we don't end up resizing the expressionToClass map, as
+ // that can be quite expensive. At most, we have one expression per
+ // instruction.
+ ExpressionToClass.reserve(ICount);
+
+ // Initialize the touched instructions to include the entry block.
+ const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
+ ReachableBlocks.insert(&F.getEntryBlock());
+
+ iterateTouchedInstructions();
+ verifyMemoryCongruency();
+ verifyIterationSettled(F);
+ verifyStoreExpressions();
+
+ Changed |= eliminateInstructions(F);
+
+ // Delete all instructions marked for deletion.
+ for (Instruction *ToErase : InstructionsToErase) {
+ if (!ToErase->use_empty())
+ ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+
+ assert(ToErase->getParent() &&
+ "BB containing ToErase deleted unexpectedly!");
+ ToErase->eraseFromParent();
+ }
+ Changed |= !InstructionsToErase.empty();
+
+ // Delete all unreachable blocks.
+ auto UnreachableBlockPred = [&](const BasicBlock &BB) {
+ return !ReachableBlocks.count(&BB);
+ };
+
+ for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
+ LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
+ deleteInstructionsInBlock(&BB);
+ Changed = true;
+ }
+
+ cleanupTables();
+ return Changed;
+}
+
+struct NewGVN::ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ int LocalNum = 0;
+
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
+ Use *U = nullptr;
+
+ bool operator<(const ValueDFS &Other) const {
+ // It's not enough that any given field be less than - we have sets
+ // of fields that need to be evaluated together to give a proper ordering.
+ // For example, if you have;
+ // DFS (1, 3)
+ // Val 0
+ // DFS (1, 2)
+ // Val 50
+ // We want the second to be less than the first, but if we just go field
+ // by field, we will get to Val 0 < Val 50 and say the first is less than
+ // the second. We only want it to be less than if the DFS orders are equal.
+ //
+ // Each LLVM instruction only produces one value, and thus the lowest-level
+    // differentiator that really matters for the stack (and what we use as a
+ // replacement) is the local dfs number.
+ // Everything else in the structure is instruction level, and only affects
+ // the order in which we will replace operands of a given instruction.
+ //
+ // For a given instruction (IE things with equal dfsin, dfsout, localnum),
+ // the order of replacement of uses does not matter.
+ // IE given,
+ // a = 5
+ // b = a + a
+ // When you hit b, you will have two valuedfs with the same dfsin, out, and
+ // localnum.
+ // The .val will be the same as well.
+ // The .u's will be different.
+ // You will replace both, and it does not matter what order you replace them
+ // in (IE whether you replace operand 2, then operand 1, or operand 1, then
+ // operand 2).
+ // Similarly for the case of same dfsin, dfsout, localnum, but different
+ // .val's
+ // a = 5
+ // b = 6
+ // c = a + b
+    // in c, we will have a valuedfs for a, and one for b, with everything the same
+ // but .val and .u.
+ // It does not matter what order we replace these operands in.
+ // You will always end up with the same IR, and this is guaranteed.
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
+ Other.U);
+ }
+};
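
A stand-alone illustration of why the comparison uses std::tie: it yields a lexicographic order, so LocalNum only breaks ties once both DFS numbers are equal. The struct below is a cut-down stand-in, not the LLVM one:

#include <cassert>
#include <tuple>

struct Key {
  int DFSIn, DFSOut, LocalNum;
  bool operator<(const Key &O) const {
    return std::tie(DFSIn, DFSOut, LocalNum) <
           std::tie(O.DFSIn, O.DFSOut, O.LocalNum);
  }
};

int main() {
  Key A{1, 3, 0}, B{1, 2, 50};
  // Comparing LocalNum alone would put A first, but the lexicographic rule
  // reaches DFSOut before LocalNum, so B (DFS (1,2)) sorts before A (DFS (1,3)),
  // exactly the ordering the comment above asks for.
  assert(B < A && !(A < B));
  return 0;
}
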
+
+// This function converts the set of members for a congruence class from values,
+// to sets of defs and uses with associated DFS info. The total number of
+// reachable uses for each value is stored in UseCounts, and instructions that
+// seem dead (have no non-dead uses) are stored in ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
+ for (auto D : Dense) {
+ // First add the value.
+ BasicBlock *BB = getBlockForValue(D);
+ // Constants are handled prior to ever calling this function, so
+ // we should only be left with instructions as members.
+ assert(BB && "Should have figured out a basic block for value");
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand, if it's always
+ // available, or the value operand. TODO: We could do dominance checks to
+ // find a dominating leader, but not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ Instruction *Def = cast<Instruction>(D);
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.push_back(VDDef);
+ // If there is a phi node equivalent, add it
+ if (auto *PN = RealToTemp.lookup(Def)) {
+ auto *PHIE =
+ dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
+ if (PHIE) {
+ VDDef.Def.setInt(false);
+ VDDef.Def.setPointer(PN);
+ VDDef.LocalNum = 0;
+ DFSOrderedSet.push_back(VDDef);
+ }
+ }
+
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *P = dyn_cast<PHINode>(I)) {
+ IBlock = P->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VDUse.LocalNum = InstrDFS.size() + 1;
+ } else {
+ IBlock = getBlockForValue(I);
+ VDUse.LocalNum = InstrToDFSNum(I);
+ }
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
+ }
+ }
+
+    // If there are no uses, it's probably dead (but it may have side-effects,
+    // so it is not definitely dead). Otherwise, store the number of uses so we
+    // can track whether it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
+ }
+}
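
The DFSIn/DFSOut numbers collected here encode dominance as interval nesting, which is what the elimination walk later relies on. A small sketch of that property with hand-picked numbers rather than a real dominator tree:

#include <cassert>

struct Interval { int In, Out; };

// A dominates B exactly when B's interval nests inside A's.
static bool dominates(Interval A, Interval B) {
  return A.In <= B.In && B.Out <= A.Out;
}

int main() {
  // Entry covers both children; the two siblings do not dominate each other.
  Interval Entry{1, 8}, Left{2, 5}, Right{6, 7};
  assert(dominates(Entry, Left) && dominates(Entry, Right));
  assert(!dominates(Left, Right) && !dominates(Right, Left));
  return 0;
}
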
+
+// This function converts the set of members for a congruence class from values,
+// to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
+
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
+
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
+
+ LoadsAndStores.emplace_back(VD);
+ }
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ ++NumGVNBlocksDeleted;
+
+  // Delete the instructions backwards, as doing so reduces the number of
+  // def-use and use-def chains we have to update. Start after the terminator.
+ auto StartPoint = BB->rbegin();
+ ++StartPoint;
+ // Note that we explicitly recalculate BB->rend() on each iteration,
+ // as it may change when we remove the first instruction.
+ for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
+ Instruction &Inst = *I++;
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ if (isa<LandingPadInst>(Inst))
+ continue;
+ salvageKnowledge(&Inst, AC);
+
+ Inst.eraseFromParent();
+ ++NumGVNInstrDeleted;
+ }
+ // Now insert something that simplifycfg will turn into an unreachable.
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
+}
+
+void NewGVN::markInstructionForDeletion(Instruction *I) {
+ LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ InstructionsToErase.insert(I);
+}
+
+void NewGVN::replaceInstruction(Instruction *I, Value *V) {
+ LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ patchAndReplaceAllUsesWith(I, V);
+ // We save the actual erasing to avoid invalidating memory
+ // dependencies until we are done with everything.
+ markInstructionForDeletion(I);
+}
+
+namespace {
+
+// This is a stack that contains both the value and dfs info of where
+// that value is valid.
+class ValueDFSStack {
+public:
+ Value *back() const { return ValueStack.back(); }
+ std::pair<int, int> dfs_back() const { return DFSStack.back(); }
+
+ void push_back(Value *V, int DFSIn, int DFSOut) {
+ ValueStack.emplace_back(V);
+ DFSStack.emplace_back(DFSIn, DFSOut);
+ }
+
+ bool empty() const { return DFSStack.empty(); }
+
+ bool isInScope(int DFSIn, int DFSOut) const {
+ if (empty())
+ return false;
+ return DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second;
+ }
+
+ void popUntilDFSScope(int DFSIn, int DFSOut) {
+
+ // These two should always be in sync at this point.
+ assert(ValueStack.size() == DFSStack.size() &&
+ "Mismatch between ValueStack and DFSStack");
+ while (
+ !DFSStack.empty() &&
+ !(DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second)) {
+ DFSStack.pop_back();
+ ValueStack.pop_back();
+ }
+ }
+
+private:
+ SmallVector<Value *, 8> ValueStack;
+ SmallVector<std::pair<int, int>, 8> DFSStack;
+};
+
+} // end anonymous namespace
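
A stand-alone sketch of the same scope-stack idea using std:: containers instead of SmallVector; the leader names and DFS numbers below are made up for illustration:

#include <cassert>
#include <string>
#include <utility>
#include <vector>

class ScopeStack {
  std::vector<std::string> Values;
  std::vector<std::pair<int, int>> Scopes;

public:
  void push(std::string V, int In, int Out) {
    Values.push_back(std::move(V));
    Scopes.emplace_back(In, Out);
  }
  bool empty() const { return Scopes.empty(); }
  const std::string &back() const { return Values.back(); }
  // Pop every entry whose DFS interval does not contain (In, Out).
  void popUntilScope(int In, int Out) {
    while (!Scopes.empty() &&
           !(In >= Scopes.back().first && Out <= Scopes.back().second)) {
      Scopes.pop_back();
      Values.pop_back();
    }
  }
};

int main() {
  ScopeStack S;
  S.push("%leader.entry", 1, 8); // valid over the whole function
  S.push("%leader.left", 2, 5);  // valid only in the left subtree
  S.popUntilScope(6, 7);         // the walk moves to the right subtree
  // The left-subtree leader is popped; the function-wide leader survives.
  assert(!S.empty() && S.back() == "%leader.entry");
  return 0;
}
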
+
+// Given an expression, get the congruence class for it.
+CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const {
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return ValueToClass.lookup(VE->getVariableValue());
+ else if (isa<DeadExpression>(E))
+ return TOPClass;
+ return ExpressionToClass.lookup(E);
+}
+
+// Given a value and a basic block we are trying to see if it is available in,
+// see if the value has a leader available in that block.
+Value *NewGVN::findPHIOfOpsLeader(const Expression *E,
+ const Instruction *OrigInst,
+ const BasicBlock *BB) const {
+ // It would already be constant if we could make it constant
+ if (auto *CE = dyn_cast<ConstantExpression>(E))
+ return CE->getConstantValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E)) {
+ auto *V = VE->getVariableValue();
+ if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB))
+ return VE->getVariableValue();
+ }
+
+ auto *CC = getClassForExpression(E);
+ if (!CC)
+ return nullptr;
+ if (alwaysAvailable(CC->getLeader()))
+ return CC->getLeader();
+
+ for (auto Member : *CC) {
+ auto *MemberInst = dyn_cast<Instruction>(Member);
+ if (MemberInst == OrigInst)
+ continue;
+ // Anything that isn't an instruction is always available.
+ if (!MemberInst)
+ return Member;
+ if (DT->dominates(getBlockForValue(MemberInst), BB))
+ return Member;
+ }
+ return nullptr;
+}
+
+bool NewGVN::eliminateInstructions(Function &F) {
+ // This is a non-standard eliminator. The normal way to eliminate is
+ // to walk the dominator tree in order, keeping track of available
+ // values, and eliminating them. However, this is mildly
+ // pointless. It requires doing lookups on every instruction,
+  // regardless of whether we will ever eliminate it. For instructions in
+  // singleton congruence classes (which is most of them), we know we will
+  // never eliminate them.
+
+ // Instead, this eliminator looks at the congruence classes directly, sorts
+ // them into a DFS ordering of the dominator tree, and then we just
+ // perform elimination straight on the sets by walking the congruence
+ // class member uses in order, and eliminate the ones dominated by the
+ // last member. This is worst case O(E log E) where E = number of
+ // instructions in a single congruence class. In theory, this is all
+ // instructions. In practice, it is much faster, as most instructions are
+ // either in singleton congruence classes or can't possibly be eliminated
+ // anyway (if there are no overlapping DFS ranges in class).
+ // When we find something not dominated, it becomes the new leader
+ // for elimination purposes.
+ // TODO: If we wanted to be faster, We could remove any members with no
+ // overlapping ranges while sorting, as we will never eliminate anything
+ // with those members, as they don't dominate anything else in our set.
+
+ bool AnythingReplaced = false;
+
+ // Since we are going to walk the domtree anyway, and we can't guarantee the
+ // DFS numbers are updated, we compute some ourselves.
+ DT->updateDFSNumbers();
+
+ // Go through all of our phi nodes, and kill the arguments associated with
+ // unreachable edges.
+ auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI->incoming_values())
+ if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
+ LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
+ << " for block "
+ << getBlockName(PHI->getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
+ Operand.set(UndefValue::get(PHI->getType()));
+ }
+ };
+ // Replace unreachable phi arguments.
+ // At this point, RevisitOnReachabilityChange only contains:
+ //
+ // 1. PHIs
+ // 2. Temporaries that will convert to PHIs
+ // 3. Operations that are affected by an unreachable edge but do not fit into
+ // 1 or 2 (rare).
+ // So it is a slight overshoot of what we want. We could make it exact by
+ // using two SparseBitVectors per block.
+ DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
+ for (auto &KV : ReachableEdges)
+ ReachablePredCount[KV.getEnd()]++;
+ for (auto &BBPair : RevisitOnReachabilityChange) {
+ for (auto InstNum : BBPair.second) {
+ auto *Inst = InstrFromDFSNum(InstNum);
+ auto *PHI = dyn_cast<PHINode>(Inst);
+ PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst));
+ if (!PHI)
+ continue;
+ auto *BB = BBPair.first;
+ if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues())
+ ReplaceUnreachablePHIArgs(PHI, BB);
+ }
+ }
+
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (auto *CC : reverse(CongruenceClasses)) {
+ LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+ << "\n");
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
+ continue;
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+ for (auto M : *CC) {
+ auto *VTE = ValueToExpression.lookup(M);
+ if (VTE && isa<DeadExpression>(VTE))
+ markInstructionForDeletion(cast<Instruction>(M));
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+ }
+ continue;
+ }
+
+ assert(CC->getLeader() && "We should have had a leader");
+ // If this is a leader that is always available, and it's a
+ // constant or has no equivalences, just replace everything with
+ // it. We then update the congruence class with whatever members
+ // are left.
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
+ Value *Member = M;
+ // Void things have no uses we can replace.
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
+ MembersLeft.insert(Member);
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for "
+ << *Member << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
+ }
+ CC->swap(MembersLeft);
+ } else {
+ // If this is a singleton, we can skip it.
+ if (CC->size() != 1 || RealToTemp.count(Leader)) {
+ // This is a stack because equality replacement/etc may place
+ // constants in the middle of the member list, and we want to use
+ // those constant values in preference to the current leader, over
+ // the scope of those constants.
+ ValueDFSStack EliminationStack;
+
+ // Convert the members to DFS ordered sets and then merge them.
+ SmallVector<ValueDFS, 8> DFSOrderedSet;
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
+
+ // Sort the whole thing.
+ llvm::sort(DFSOrderedSet);
+ for (auto &VD : DFSOrderedSet) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
+ auto *DefInst = dyn_cast_or_null<Instruction>(Def);
+ if (DefInst && AllTempInstructions.count(DefInst)) {
+ auto *PN = cast<PHINode>(DefInst);
+
+            // If this is a value phi and that's the expression we used, insert
+            // it into the program and remove it from the temp instruction list.
+ AllTempInstructions.erase(PN);
+ auto *DefBlock = getBlockForValue(Def);
+ LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
+ PN->insertBefore(&DefBlock->front());
+ Def = PN;
+ NumGVNPHIOfOpsEliminations++;
+ }
+
+ if (EliminationStack.empty()) {
+ LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
+ }
+
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
+ // First, we see if we are out of scope or empty. If so,
+          // and there are equivalences, we try to replace the top of
+ // stack with equivalences (if it's on the stack, it must
+ // not have been eliminated yet).
+ // Then we synchronize to our current scope, by
+ // popping until we are back within a DFS scope that
+ // dominates the current member.
+ // Then, what happens depends on a few factors
+ // If the stack is now empty, we need to push
+ // If we have a constant or a local equivalence we want to
+ // start using, we also push.
+ // Otherwise, we walk along, processing members who are
+ // dominated by this scope, and eliminate them.
+ bool ShouldPush = Def && EliminationStack.empty();
+ bool OutOfScope =
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
+
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ bool ShouldPush = Def && EliminationStack.empty();
+ if (ShouldPush) {
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Def's, we only want to eliminate on their uses. But mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+            // guarantees that any side-effects that would have occurred (ie
+ // throwing, etc) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, which may not be dominated by anything, and thus, are only
+ // easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
+ }
+ }
+
+ // If we get to this point, and the stack is empty we must have a use
+ // with nothing we can use to eliminate this use, so just skip it.
+ if (EliminationStack.empty())
+ continue;
+
+ Value *DominatingLeader = EliminationStack.back();
+
+ auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
+ bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
+ if (isSSACopy)
+ DominatingLeader = II->getOperand(0);
+
+ // Don't replace our existing users with ourselves.
+ if (U->get() == DominatingLeader)
+ continue;
+ LLVM_DEBUG(dbgs()
+ << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
+
+ // If we replaced something in an instruction, handle the patching of
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ // For copy instructions, we use their operand as a leader,
+ // which means we remove a user of the copy and it may become dead.
+ if (isSSACopy) {
+ unsigned &IIUseCount = UseCounts[II];
+ if (--IIUseCount == 0)
+ ProbablyDead.insert(II);
+ }
+ ++LeaderUseCount;
+ AnythingReplaced = true;
+ }
+ }
+ }
+
+    // At this point, anything still in the ProbablyDead set is actually dead
+    // if it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
+ // Cleanup the congruence class.
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
+ MembersLeft.insert(Member);
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ llvm::sort(PossibleDeadStores);
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
+ }
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+        // Member is dominated by Leader, and thus dead.
+ LLVM_DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
+ }
+ }
+ }
+ return AnythingReplaced;
+}
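The use-count bookkeeping above is subtle, so here is a minimal self-contained C++ sketch of the same idea using toy types (nothing here comes from the pass itself): a value whose use count drops to zero is only a candidate, and the final sweep keeps just the side-effect-free ones, mirroring the wouldInstructionBeTriviallyDead() check.

#include <unordered_map>
#include <unordered_set>
#include <vector>

// Toy stand-in for an instruction; HasSideEffects models "not trivially dead".
struct ToyInst { bool HasSideEffects; };

int main() {
  std::unordered_map<ToyInst *, unsigned> UseCounts;
  std::unordered_set<ToyInst *> ProbablyDead;
  ToyInst Pure{false}, Store{true};
  UseCounts[&Pure] = 1;
  UseCounts[&Store] = 1;

  // Pretend each value just lost its last user (e.g. that user was erased).
  for (ToyInst *I : {&Pure, &Store})
    if (--UseCounts[I] == 0)
      ProbablyDead.insert(I);

  // Final sweep: only side-effect-free candidates are actually deleted.
  std::vector<ToyInst *> ToErase;
  for (ToyInst *I : ProbablyDead)
    if (!I->HasSideEffects)
      ToErase.push_back(I);

  return static_cast<int>(ToErase.size()); // 1: only Pure is erased.
}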
+
+// This function provides global ranking of operations so that we can place them
+// in a canonical order. Note that rank alone is not necessarily enough for a
+// complete ordering, as constants all have the same rank. However, generally,
+// we will simplify an operation with all constants so that it doesn't matter
+// what order they appear in.
+unsigned int NewGVN::getRank(const Value *V) const {
+  // Prefer constants to undef to anything else.
+  // Undef is a constant, so we have to check for it first.
+  // Prefer smaller constants to constantexprs.
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+  // Need to shift the instruction DFS number past the constant and argument
+  // ranks assigned above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+// This is a function that says whether the two operands of a commutative
+// operation should have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we order by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
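As a rough, standalone illustration of the ordering that getRank() and shouldSwapOperands() impose (toy ranks only, not the pass's real DFS numbering): the comparison key is (rank, pointer), so a pair of commutative operands always ends up in the same relative order no matter how the expression was originally written.

#include <utility>

// Toy ranks in the spirit of getRank(): constants lowest, then undef and
// constant expressions; "instructions" get larger ranks from a DFS-like number.
enum ToyRank : unsigned { RankConstant = 0, RankUndef = 1, RankConstExpr = 2 };

struct ToyValue { unsigned Rank; };

static bool shouldSwap(const ToyValue *A, const ToyValue *B) {
  // Same scheme as shouldSwapOperands(): order by rank, break ties by address.
  return std::make_pair(A->Rank, A) > std::make_pair(B->Rank, B);
}

int main() {
  ToyValue C{RankConstant}; // e.g. a ConstantInt: lowest rank
  ToyValue I{42};           // e.g. an instruction: rank derived from its DFS number
  // Exactly one presentation of the pair asks for a swap, so the canonical
  // order is the same either way the operands are handed in.
  bool SwapCI = shouldSwap(&C, &I);
  bool SwapIC = shouldSwap(&I, &C);
  return (SwapCI != SwapIC) ? 0 : 1;
}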
+
+namespace {
+
+class NewGVNLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid.
+ static char ID;
+
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN();
+}
+
+char NewGVNLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+// createNewGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+  // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
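For context, a hedged sketch of how NewGVNPass might be driven under the new pass manager; the helper name runNewGVNOn and the bare-bones analysis registration are illustrative, not part of this file.

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/NewGVN.h"

using namespace llvm;

// Hypothetical helper: run NewGVN on one function and report whether it
// changed anything (i.e. whether any analyses were invalidated).
static bool runNewGVNOn(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(NewGVNPass());
  PreservedAnalyses PA = FPM.run(F, FAM);
  return !PA.areAllPreserved();
}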
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 9ae47d54e8..58763ec72e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -1,186 +1,186 @@
-//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass tries to partially inline the fast path of well-known library
-// functions, such as using square-root instructions for cases where sqrt()
-// does not need to set errno.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "partially-inline-libcalls"
-
-DEBUG_COUNTER(PILCounter, "partially-inline-libcalls-transform",
- "Controls transformations in partially-inline-libcalls");
-
-static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
- BasicBlock &CurrBB, Function::iterator &BB,
- const TargetTransformInfo *TTI) {
- // There is no need to change the IR, since backend will emit sqrt
- // instruction if the call has already been marked read-only.
- if (Call->onlyReadsMemory())
- return false;
-
- if (!DebugCounter::shouldExecute(PILCounter))
- return false;
-
- // Do the following transformation:
- //
- // (before)
- // dst = sqrt(src)
- //
- // (after)
- // v0 = sqrt_noreadmem(src) # native sqrt instruction.
- // [if (v0 is a NaN) || if (src < 0)]
- // v1 = sqrt(src) # library call.
- // dst = phi(v0, v1)
- //
-
- // Move all instructions following Call to newly created block JoinBB.
- // Create phi and replace all uses.
- BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
- IRBuilder<> Builder(JoinBB, JoinBB->begin());
- Type *Ty = Call->getType();
- PHINode *Phi = Builder.CreatePHI(Ty, 2);
- Call->replaceAllUsesWith(Phi);
-
- // Create basic block LibCallBB and insert a call to library function sqrt.
- BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
- CurrBB.getParent(), JoinBB);
- Builder.SetInsertPoint(LibCallBB);
- Instruction *LibCall = Call->clone();
- Builder.Insert(LibCall);
- Builder.CreateBr(JoinBB);
-
- // Add attribute "readnone" so that backend can use a native sqrt instruction
- // for this call. Insert a FP compare instruction and a conditional branch
- // at the end of CurrBB.
- Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
- CurrBB.getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(&CurrBB);
- Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
- ? Builder.CreateFCmpORD(Call, Call)
- : Builder.CreateFCmpOGE(Call->getOperand(0),
- ConstantFP::get(Ty, 0.0));
- Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
-
- // Add phi operands.
- Phi->addIncoming(Call, &CurrBB);
- Phi->addIncoming(LibCall, LibCallBB);
-
- BB = JoinBB->getIterator();
- return true;
-}
-
-static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI) {
- bool Changed = false;
-
- Function::iterator CurrBB;
- for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
- CurrBB = BB++;
-
- for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
- II != IE; ++II) {
- CallInst *Call = dyn_cast<CallInst>(&*II);
- Function *CalledFunc;
-
- if (!Call || !(CalledFunc = Call->getCalledFunction()))
- continue;
-
- if (Call->isNoBuiltin())
- continue;
-
- // Skip if function either has local linkage or is not a known library
- // function.
- LibFunc LF;
- if (CalledFunc->hasLocalLinkage() ||
- !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF))
- continue;
-
- switch (LF) {
- case LibFunc_sqrtf:
- case LibFunc_sqrt:
- if (TTI->haveFastSqrt(Call->getType()) &&
- optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
- break;
- continue;
- default:
- continue;
- }
-
- Changed = true;
- break;
- }
- }
-
- return Changed;
-}
-
-PreservedAnalyses
-PartiallyInlineLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-class PartiallyInlineLibCallsLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) {
- initializePartiallyInlineLibCallsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return runPartiallyInlineLibCalls(F, TLI, TTI);
- }
-};
-}
-
-char PartiallyInlineLibCallsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass,
- "partially-inline-libcalls",
- "Partially inline calls to library functions", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass,
- "partially-inline-libcalls",
- "Partially inline calls to library functions", false, false)
-
-FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
- return new PartiallyInlineLibCallsLegacyPass();
-}
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+
+DEBUG_COUNTER(PILCounter, "partially-inline-libcalls-transform",
+ "Controls transformations in partially-inline-libcalls");
+
+static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+ BasicBlock &CurrBB, Function::iterator &BB,
+ const TargetTransformInfo *TTI) {
+  // There is no need to change the IR, since the backend will emit a sqrt
+  // instruction if the call has already been marked read-only.
+ if (Call->onlyReadsMemory())
+ return false;
+
+ if (!DebugCounter::shouldExecute(PILCounter))
+ return false;
+
+ // Do the following transformation:
+ //
+ // (before)
+ // dst = sqrt(src)
+ //
+ // (after)
+ // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+ // [if (v0 is a NaN) || if (src < 0)]
+ // v1 = sqrt(src) # library call.
+ // dst = phi(v0, v1)
+ //
+
+ // Move all instructions following Call to newly created block JoinBB.
+ // Create phi and replace all uses.
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
+ IRBuilder<> Builder(JoinBB, JoinBB->begin());
+ Type *Ty = Call->getType();
+ PHINode *Phi = Builder.CreatePHI(Ty, 2);
+ Call->replaceAllUsesWith(Phi);
+
+ // Create basic block LibCallBB and insert a call to library function sqrt.
+ BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+ CurrBB.getParent(), JoinBB);
+ Builder.SetInsertPoint(LibCallBB);
+ Instruction *LibCall = Call->clone();
+ Builder.Insert(LibCall);
+ Builder.CreateBr(JoinBB);
+
+  // Add attribute "readnone" so that the backend can use a native sqrt
+  // instruction for this call. Insert an FP compare instruction and a
+  // conditional branch at the end of CurrBB.
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ CurrBB.getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(&CurrBB);
+ Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
+ ? Builder.CreateFCmpORD(Call, Call)
+ : Builder.CreateFCmpOGE(Call->getOperand(0),
+ ConstantFP::get(Ty, 0.0));
+ Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+ // Add phi operands.
+ Phi->addIncoming(Call, &CurrBB);
+ Phi->addIncoming(LibCall, LibCallBB);
+
+ BB = JoinBB->getIterator();
+ return true;
+}
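At the source level, the control flow optimizeSQRT() builds looks roughly like the sketch below; native_sqrt and libm_sqrt are placeholders for the readnone call (lowered by the backend to the hardware instruction) and the original errno-setting library call, which have no separate spelling in C++.

#include <cmath>

// Placeholder for the call marked readnone, which the backend lowers to a
// native sqrt instruction.
static double native_sqrt(double x) { return std::sqrt(x); }
// Placeholder for the original library call, which may set errno.
static double libm_sqrt(double x) { return std::sqrt(x); }

double partially_inlined_sqrt(double src) {
  double v0 = native_sqrt(src); // fast path, executed unconditionally
  if (v0 != v0)                 // NaN check (the FCmpORD form of the guard)
    return libm_sqrt(src);      // slow path: the "call.sqrt" block
  return v0;                    // join block: phi(v0, v1)
}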
+
+static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
+ bool Changed = false;
+
+ Function::iterator CurrBB;
+ for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+ CurrBB = BB++;
+
+ for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+ II != IE; ++II) {
+ CallInst *Call = dyn_cast<CallInst>(&*II);
+ Function *CalledFunc;
+
+ if (!Call || !(CalledFunc = Call->getCalledFunction()))
+ continue;
+
+ if (Call->isNoBuiltin())
+ continue;
+
+ // Skip if function either has local linkage or is not a known library
+ // function.
+ LibFunc LF;
+ if (CalledFunc->hasLocalLinkage() ||
+ !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF))
+ continue;
+
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ if (TTI->haveFastSqrt(Call->getType()) &&
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
+ break;
+ continue;
+ default:
+ continue;
+ }
+
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+PartiallyInlineLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+class PartiallyInlineLibCallsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) {
+ initializePartiallyInlineLibCallsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runPartiallyInlineLibCalls(F, TLI, TTI);
+ }
+};
+}
+
+char PartiallyInlineLibCallsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false, false)
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+ return new PartiallyInlineLibCallsLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 5d91a49723..a110f7d5c2 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -1,690 +1,690 @@
-//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Place garbage collection safepoints at appropriate locations in the IR. This
-// does not make relocation semantics or variable liveness explicit. That's
-// done by RewriteStatepointsForGC.
-//
-// Terminology:
-// - A call is said to be "parseable" if there is a stack map generated for the
-// return PC of the call. A runtime can determine where values listed in the
-// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
-// on the stack when the code is suspended inside such a call. Every parse
-// point is represented by a call wrapped in an gc.statepoint intrinsic.
-// - A "poll" is an explicit check in the generated code to determine if the
-// runtime needs the generated code to cooperate by calling a helper routine
-// and thus suspending its execution at a known state. The call to the helper
-// routine will be parseable. The (gc & runtime specific) logic of a poll is
-// assumed to be provided in a function of the name "gc.safepoint_poll".
-//
-// We aim to insert polls such that running code can quickly be brought to a
-// well defined state for inspection by the collector. In the current
-// implementation, this is done via the insertion of poll sites at method entry
-// and the backedge of most loops. We try to avoid inserting more polls than
-// are necessary to ensure a finite period between poll sites. This is not
-// because the poll itself is expensive in the generated code; it's not. Polls
-// do tend to impact the optimizer itself in negative ways; we'd like to avoid
-// perturbing the optimization of the method as much as we can.
-//
-// We also need to make most call sites parseable. The callee might execute a
-// poll (or otherwise be inspected by the GC). If so, the entire stack
-// (including the suspended frame of the current method) must be parseable.
-//
-// This pass will insert:
-// - Call parse points ("call safepoints") for any call which may need to
-// reach a safepoint during the execution of the callee function.
-// - Backedge safepoint polls and entry safepoint polls to ensure that
-// executing code reaches a safepoint poll in a finite amount of time.
-//
-// We do not currently support return statepoints, but adding them would not
-// be hard. They are not required for correctness - entry safepoints are an
-// alternative - but some GCs may prefer them. Patches welcome.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-
-#define DEBUG_TYPE "safepoint-placement"
-
-STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
-STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
-
-STATISTIC(CallInLoop,
- "Number of loops without safepoints due to calls in loop");
-STATISTIC(FiniteExecution,
- "Number of loops without safepoints finite execution");
-
-using namespace llvm;
-
-// Ignore opportunities to avoid placing safepoints on backedges, useful for
-// validation
-static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
- cl::init(false));
-
-/// How narrow does the trip count of a loop have to be to have to be considered
-/// "counted"? Counted loops do not get safepoints at backedges.
-static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
- cl::Hidden, cl::init(32));
-
-// If true, split the backedge of a loop when placing the safepoint, otherwise
-// split the latch block itself. Both are useful to support for
-// experimentation, but in practice, it looks like splitting the backedge
-// optimizes better.
-static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
- cl::init(false));
-
-namespace {
-
-/// An analysis pass whose purpose is to identify each of the backedges in
-/// the function which require a safepoint poll to be inserted.
-struct PlaceBackedgeSafepointsImpl : public FunctionPass {
- static char ID;
-
- /// The output of the pass - gives a list of each backedge (described by
- /// pointing at the branch) which need a poll inserted.
- std::vector<Instruction *> PollLocations;
-
- /// True unless we're running spp-no-calls in which case we need to disable
- /// the call-dependent placement opts.
- bool CallSafepointsEnabled;
-
- ScalarEvolution *SE = nullptr;
- DominatorTree *DT = nullptr;
- LoopInfo *LI = nullptr;
- TargetLibraryInfo *TLI = nullptr;
-
- PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
- : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
- initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *);
- void runOnLoopAndSubLoops(Loop *L) {
- // Visit all the subloops
- for (Loop *I : *L)
- runOnLoopAndSubLoops(I);
- runOnLoop(L);
- }
-
- bool runOnFunction(Function &F) override {
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- for (Loop *I : *LI) {
- runOnLoopAndSubLoops(I);
- }
- return false;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- // We no longer modify the IR at all in this pass. Thus all
- // analysis are preserved.
- AU.setPreservesAll();
- }
-};
-}
-
-static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
-static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
-static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
-
-namespace {
-struct PlaceSafepoints : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-
- PlaceSafepoints() : FunctionPass(ID) {
- initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // We modify the graph wholesale (inlining, block insertion, etc). We
- // preserve nothing at the moment. We could potentially preserve dom tree
- // if that was worth doing
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-}
-
-// Insert a safepoint poll immediately before the given instruction. Does
-// not handle the parsability of state at the runtime call, that's the
-// callers job.
-static void
-InsertSafepointPoll(Instruction *InsertBefore,
- std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
- const TargetLibraryInfo &TLI);
-
-static bool needsStatepoint(CallBase *Call, const TargetLibraryInfo &TLI) {
- if (callsGCLeafFunction(Call, TLI))
- return false;
- if (auto *CI = dyn_cast<CallInst>(Call)) {
- if (CI->isInlineAsm())
- return false;
- }
-
- return !(isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
- isa<GCResultInst>(Call));
-}
-
-/// Returns true if this loop is known to contain a call safepoint which
-/// must unconditionally execute on any iteration of the loop which returns
-/// to the loop header via an edge from Pred. Returns a conservative correct
-/// answer; i.e. false is always valid.
-static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
- BasicBlock *Pred,
- DominatorTree &DT,
- const TargetLibraryInfo &TLI) {
- // In general, we're looking for any cut of the graph which ensures
- // there's a call safepoint along every edge between Header and Pred.
- // For the moment, we look only for the 'cuts' that consist of a single call
- // instruction in a block which is dominated by the Header and dominates the
- // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
- // of such dominating blocks gets substantially more occurrences than just
- // checking the Pred and Header blocks themselves. This may be due to the
- // density of loop exit conditions caused by range and null checks.
- // TODO: structure this as an analysis pass, cache the result for subloops,
- // avoid dom tree recalculations
- assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
-
- BasicBlock *Current = Pred;
- while (true) {
- for (Instruction &I : *Current) {
- if (auto *Call = dyn_cast<CallBase>(&I))
- // Note: Technically, needing a safepoint isn't quite the right
- // condition here. We should instead be checking if the target method
- // has an
- // unconditional poll. In practice, this is only a theoretical concern
- // since we don't have any methods with conditional-only safepoint
- // polls.
- if (needsStatepoint(Call, TLI))
- return true;
- }
-
- if (Current == Header)
- break;
- Current = DT.getNode(Current)->getIDom()->getBlock();
- }
-
- return false;
-}
-
-/// Returns true if this loop is known to terminate in a finite number of
-/// iterations. Note that this function may return false for a loop which
-/// does actual terminate in a finite constant number of iterations due to
-/// conservatism in the analysis.
-static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
- BasicBlock *Pred) {
- // A conservative bound on the loop as a whole.
- const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L);
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit. That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call. A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call. Every parse
+// point is represented by a call wrapped in a gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable. The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector. In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops. We try to avoid inserting more polls than
+// are necessary to ensure a finite period between poll sites. This is not
+// because the poll itself is expensive in the generated code; it's not. Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable. The callee might execute a
+// poll (or otherwise be inspected by the GC). If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard. They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them. Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop,
+ "Number of loops without safepoints due to calls in loop");
+STATISTIC(FiniteExecution,
+ "Number of loops without safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+ cl::init(false));
+
+/// How narrow does the trip count of a loop have to be for the loop to be
+/// considered "counted"? Counted loops do not get safepoints at backedges.
+static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
+ cl::Hidden, cl::init(32));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself. Both are useful to support for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+ cl::init(false));
+
+namespace {
+
+/// An analysis pass whose purpose is to identify each of the backedges in
+/// the function which require a safepoint poll to be inserted.
+struct PlaceBackedgeSafepointsImpl : public FunctionPass {
+ static char ID;
+
+ /// The output of the pass - gives a list of each backedge (described by
+ /// pointing at the branch) which need a poll inserted.
+ std::vector<Instruction *> PollLocations;
+
+  /// True unless we're running spp-no-call, in which case we need to disable
+ /// the call-dependent placement opts.
+ bool CallSafepointsEnabled;
+
+ ScalarEvolution *SE = nullptr;
+ DominatorTree *DT = nullptr;
+ LoopInfo *LI = nullptr;
+ TargetLibraryInfo *TLI = nullptr;
+
+ PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+ : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
+ initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *);
+ void runOnLoopAndSubLoops(Loop *L) {
+ // Visit all the subloops
+ for (Loop *I : *L)
+ runOnLoopAndSubLoops(I);
+ runOnLoop(L);
+ }
+
+ bool runOnFunction(Function &F) override {
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ for (Loop *I : *LI) {
+ runOnLoopAndSubLoops(I);
+ }
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ // We no longer modify the IR at all in this pass. Thus all
+    // analyses are preserved.
+ AU.setPreservesAll();
+ }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PlaceSafepoints() : FunctionPass(ID) {
+ initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We modify the graph wholesale (inlining, block insertion, etc). We
+ // preserve nothing at the moment. We could potentially preserve dom tree
+ // if that was worth doing
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction. Does
+// not handle the parsability of state at the runtime call; that's the
+// caller's job.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI);
+
+static bool needsStatepoint(CallBase *Call, const TargetLibraryInfo &TLI) {
+ if (callsGCLeafFunction(Call, TLI))
+ return false;
+ if (auto *CI = dyn_cast<CallInst>(Call)) {
+ if (CI->isInlineAsm())
+ return false;
+ }
+
+ return !(isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+ isa<GCResultInst>(Call));
+}
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred. Returns a conservatively correct
+/// answer; i.e. false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+ BasicBlock *Pred,
+ DominatorTree &DT,
+ const TargetLibraryInfo &TLI) {
+ // In general, we're looking for any cut of the graph which ensures
+ // there's a call safepoint along every edge between Header and Pred.
+ // For the moment, we look only for the 'cuts' that consist of a single call
+ // instruction in a block which is dominated by the Header and dominates the
+ // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
+ // of such dominating blocks gets substantially more occurrences than just
+ // checking the Pred and Header blocks themselves. This may be due to the
+ // density of loop exit conditions caused by range and null checks.
+ // TODO: structure this as an analysis pass, cache the result for subloops,
+ // avoid dom tree recalculations
+ assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+ BasicBlock *Current = Pred;
+ while (true) {
+ for (Instruction &I : *Current) {
+ if (auto *Call = dyn_cast<CallBase>(&I))
+        // Note: Technically, needing a safepoint isn't quite the right
+        // condition here. We should instead be checking if the target method
+        // has an unconditional poll. In practice, this is only a theoretical
+        // concern since we don't have any methods with conditional-only
+        // safepoint polls.
+ if (needsStatepoint(Call, TLI))
+ return true;
+ }
+
+ if (Current == Header)
+ break;
+ Current = DT.getNode(Current)->getIDom()->getBlock();
+ }
+
+ return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations. Note that this function may return false for a loop which
+/// does actually terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+ BasicBlock *Pred) {
+ // A conservative bound on the loop as a whole.
+ const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L);
if (!isa<SCEVCouldNotCompute>(MaxTrips) &&
- SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
- CountedLoopTripWidth))
- return true;
-
- // If this is a conditional branch to the header with the alternate path
- // being outside the loop, we can ask questions about the execution frequency
- // of the exit block.
- if (L->isLoopExiting(Pred)) {
- // This returns an exact expression only. TODO: We really only need an
- // upper bound here, but SE doesn't expose that.
- const SCEV *MaxExec = SE->getExitCount(L, Pred);
+ SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+
+ // If this is a conditional branch to the header with the alternate path
+ // being outside the loop, we can ask questions about the execution frequency
+ // of the exit block.
+ if (L->isLoopExiting(Pred)) {
+ // This returns an exact expression only. TODO: We really only need an
+ // upper bound here, but SE doesn't expose that.
+ const SCEV *MaxExec = SE->getExitCount(L, Pred);
if (!isa<SCEVCouldNotCompute>(MaxExec) &&
- SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
- CountedLoopTripWidth))
- return true;
- }
-
- return /* not finite */ false;
-}
-
-static void scanOneBB(Instruction *Start, Instruction *End,
- std::vector<CallInst *> &Calls,
- DenseSet<BasicBlock *> &Seen,
- std::vector<BasicBlock *> &Worklist) {
- for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(),
- BBE1 = BasicBlock::iterator(End);
- BBI != BBE0 && BBI != BBE1; BBI++) {
- if (CallInst *CI = dyn_cast<CallInst>(&*BBI))
- Calls.push_back(CI);
-
- // FIXME: This code does not handle invokes
- assert(!isa<InvokeInst>(&*BBI) &&
- "support for invokes in poll code needed");
-
- // Only add the successor blocks if we reach the terminator instruction
- // without encountering end first
- if (BBI->isTerminator()) {
- BasicBlock *BB = BBI->getParent();
- for (BasicBlock *Succ : successors(BB)) {
- if (Seen.insert(Succ).second) {
- Worklist.push_back(Succ);
- }
- }
- }
- }
-}
-
-static void scanInlinedCode(Instruction *Start, Instruction *End,
- std::vector<CallInst *> &Calls,
- DenseSet<BasicBlock *> &Seen) {
- Calls.clear();
- std::vector<BasicBlock *> Worklist;
- Seen.insert(Start->getParent());
- scanOneBB(Start, End, Calls, Seen, Worklist);
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.back();
- Worklist.pop_back();
- scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist);
- }
-}
-
-bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
- // Loop through all loop latches (branches controlling backedges). We need
- // to place a safepoint on every backedge (potentially).
- // Note: In common usage, there will be only one edge due to LoopSimplify
- // having run sometime earlier in the pipeline, but this code must be correct
- // w.r.t. loops with multiple backedges.
- BasicBlock *Header = L->getHeader();
- SmallVector<BasicBlock*, 16> LoopLatches;
- L->getLoopLatches(LoopLatches);
- for (BasicBlock *Pred : LoopLatches) {
- assert(L->contains(Pred));
-
- // Make a policy decision about whether this loop needs a safepoint or
- // not. Note that this is about unburdening the optimizer in loops, not
- // avoiding the runtime cost of the actual safepoint.
- if (!AllBackedges) {
- if (mustBeFiniteCountedLoop(L, SE, Pred)) {
- LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
- FiniteExecution++;
- continue;
- }
- if (CallSafepointsEnabled &&
- containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) {
- // Note: This is only semantically legal since we won't do any further
- // IPO or inlining before the actual call insertion.. If we hadn't, we
- // might latter loose this call safepoint.
- LLVM_DEBUG(
- dbgs()
- << "skipping safepoint placement due to unconditional call\n");
- CallInLoop++;
- continue;
- }
- }
-
- // TODO: We can create an inner loop which runs a finite number of
- // iterations with an outer loop which contains a safepoint. This would
- // not help runtime performance that much, but it might help our ability to
- // optimize the inner loop.
-
- // Safepoint insertion would involve creating a new basic block (as the
- // target of the current backedge) which does the safepoint (of all live
- // variables) and branches to the true header
- Instruction *Term = Pred->getTerminator();
-
- LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
-
- PollLocations.push_back(Term);
- }
-
- return false;
-}
-
-/// Returns true if an entry safepoint is not required before this callsite in
-/// the caller function.
-static bool doesNotRequireEntrySafepointBefore(CallBase *Call) {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Call)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::experimental_gc_statepoint:
- case Intrinsic::experimental_patchpoint_void:
- case Intrinsic::experimental_patchpoint_i64:
- // The can wrap an actual call which may grow the stack by an unbounded
- // amount or run forever.
- return false;
- default:
- // Most LLVM intrinsics are things which do not expand to actual calls, or
- // at least if they do, are leaf functions that cause only finite stack
- // growth. In particular, the optimizer likes to form things like memsets
- // out of stores in the original IR. Another important example is
- // llvm.localescape which must occur in the entry block. Inserting a
- // safepoint before it is not legal since it could push the localescape
- // out of the entry block.
- return true;
- }
- }
- return false;
-}
-
-static Instruction *findLocationForEntrySafepoint(Function &F,
- DominatorTree &DT) {
-
- // Conceptually, this poll needs to be on method entry, but in
- // practice, we place it as late in the entry block as possible. We
- // can place it as late as we want as long as it dominates all calls
- // that can grow the stack. This, combined with backedge polls,
- // give us all the progress guarantees we need.
-
- // hasNextInstruction and nextInstruction are used to iterate
- // through a "straight line" execution sequence.
-
- auto HasNextInstruction = [](Instruction *I) {
- if (!I->isTerminator())
- return true;
-
- BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
- return nextBB && (nextBB->getUniquePredecessor() != nullptr);
- };
-
- auto NextInstruction = [&](Instruction *I) {
- assert(HasNextInstruction(I) &&
- "first check if there is a next instruction!");
-
- if (I->isTerminator())
- return &I->getParent()->getUniqueSuccessor()->front();
- return &*++I->getIterator();
- };
-
- Instruction *Cursor = nullptr;
- for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor);
- Cursor = NextInstruction(Cursor)) {
-
- // We need to ensure a safepoint poll occurs before any 'real' call. The
- // easiest way to ensure finite execution between safepoints in the face of
- // recursive and mutually recursive functions is to enforce that each take
- // a safepoint. Additionally, we need to ensure a poll before any call
- // which can grow the stack by an unbounded amount. This isn't required
- // for GC semantics per se, but is a common requirement for languages
- // which detect stack overflow via guard pages and then throw exceptions.
- if (auto *Call = dyn_cast<CallBase>(Cursor)) {
- if (doesNotRequireEntrySafepointBefore(Call))
- continue;
- break;
- }
- }
-
- assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) &&
- "either we stopped because of a call, or because of terminator");
-
- return Cursor;
-}
-
+ SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+ }
+
+ return /* not finite */ false;
+}
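Two illustrative source-level loops, assuming the default spp-counted-loop-trip-width of 32: the first is "counted" and needs no backedge poll, while the second cannot be bounded and does.

// Counted: ScalarEvolution can bound the backedge-taken count (1023) well
// within 32 bits, so no backedge safepoint poll is required for this loop.
void counted(int *a) {
  for (int i = 0; i < 1024; ++i)
    a[i] = 0;
}

// Not provably finite: the bound is unknown, so a backedge poll is needed to
// keep the time between safepoints finite.
void uncounted(int *a, volatile bool *stop) {
  unsigned i = 0;
  while (!*stop)
    a[i++ % 16] = 0;
}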
+
+static void scanOneBB(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen,
+ std::vector<BasicBlock *> &Worklist) {
+ for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(),
+ BBE1 = BasicBlock::iterator(End);
+ BBI != BBE0 && BBI != BBE1; BBI++) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*BBI))
+ Calls.push_back(CI);
+
+ // FIXME: This code does not handle invokes
+ assert(!isa<InvokeInst>(&*BBI) &&
+ "support for invokes in poll code needed");
+
+ // Only add the successor blocks if we reach the terminator instruction
+ // without encountering end first
+ if (BBI->isTerminator()) {
+ BasicBlock *BB = BBI->getParent();
+ for (BasicBlock *Succ : successors(BB)) {
+ if (Seen.insert(Succ).second) {
+ Worklist.push_back(Succ);
+ }
+ }
+ }
+ }
+}
+
+static void scanInlinedCode(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen) {
+ Calls.clear();
+ std::vector<BasicBlock *> Worklist;
+ Seen.insert(Start->getParent());
+ scanOneBB(Start, End, Calls, Seen, Worklist);
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.back();
+ Worklist.pop_back();
+ scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist);
+ }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
+ // Loop through all loop latches (branches controlling backedges). We need
+ // to place a safepoint on every backedge (potentially).
+ // Note: In common usage, there will be only one edge due to LoopSimplify
+ // having run sometime earlier in the pipeline, but this code must be correct
+ // w.r.t. loops with multiple backedges.
+ BasicBlock *Header = L->getHeader();
+ SmallVector<BasicBlock*, 16> LoopLatches;
+ L->getLoopLatches(LoopLatches);
+ for (BasicBlock *Pred : LoopLatches) {
+ assert(L->contains(Pred));
+
+ // Make a policy decision about whether this loop needs a safepoint or
+ // not. Note that this is about unburdening the optimizer in loops, not
+ // avoiding the runtime cost of the actual safepoint.
+ if (!AllBackedges) {
+ if (mustBeFiniteCountedLoop(L, SE, Pred)) {
+ LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
+ FiniteExecution++;
+ continue;
+ }
+ if (CallSafepointsEnabled &&
+ containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) {
+        // Note: This is only semantically legal since we won't do any further
+        // IPO or inlining before the actual call insertion. If we did, we
+        // might later lose this call safepoint.
+ LLVM_DEBUG(
+ dbgs()
+ << "skipping safepoint placement due to unconditional call\n");
+ CallInLoop++;
+ continue;
+ }
+ }
+
+ // TODO: We can create an inner loop which runs a finite number of
+ // iterations with an outer loop which contains a safepoint. This would
+ // not help runtime performance that much, but it might help our ability to
+ // optimize the inner loop.
+
+ // Safepoint insertion would involve creating a new basic block (as the
+ // target of the current backedge) which does the safepoint (of all live
+ // variables) and branches to the true header
+ Instruction *Term = Pred->getTerminator();
+
+ LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
+
+ PollLocations.push_back(Term);
+ }
+
+ return false;
+}
+
+/// Returns true if an entry safepoint is not required before this callsite in
+/// the caller function.
+static bool doesNotRequireEntrySafepointBefore(CallBase *Call) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Call)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_gc_statepoint:
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+      // These can wrap an actual call which may grow the stack by an unbounded
+ // amount or run forever.
+ return false;
+ default:
+ // Most LLVM intrinsics are things which do not expand to actual calls, or
+ // at least if they do, are leaf functions that cause only finite stack
+ // growth. In particular, the optimizer likes to form things like memsets
+ // out of stores in the original IR. Another important example is
+ // llvm.localescape which must occur in the entry block. Inserting a
+ // safepoint before it is not legal since it could push the localescape
+ // out of the entry block.
+ return true;
+ }
+ }
+ return false;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+ DominatorTree &DT) {
+
+ // Conceptually, this poll needs to be on method entry, but in
+ // practice, we place it as late in the entry block as possible. We
+ // can place it as late as we want as long as it dominates all calls
+ // that can grow the stack. This, combined with backedge polls,
+  // gives us all the progress guarantees we need.
+
+  // HasNextInstruction and NextInstruction are used to iterate
+ // through a "straight line" execution sequence.
+
+ auto HasNextInstruction = [](Instruction *I) {
+ if (!I->isTerminator())
+ return true;
+
+ BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+ return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+ };
+
+ auto NextInstruction = [&](Instruction *I) {
+ assert(HasNextInstruction(I) &&
+ "first check if there is a next instruction!");
+
+ if (I->isTerminator())
+ return &I->getParent()->getUniqueSuccessor()->front();
+ return &*++I->getIterator();
+ };
+
+ Instruction *Cursor = nullptr;
+ for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor);
+ Cursor = NextInstruction(Cursor)) {
+
+ // We need to ensure a safepoint poll occurs before any 'real' call. The
+ // easiest way to ensure finite execution between safepoints in the face of
+ // recursive and mutually recursive functions is to enforce that each take
+ // a safepoint. Additionally, we need to ensure a poll before any call
+ // which can grow the stack by an unbounded amount. This isn't required
+ // for GC semantics per se, but is a common requirement for languages
+ // which detect stack overflow via guard pages and then throw exceptions.
+ if (auto *Call = dyn_cast<CallBase>(Cursor)) {
+ if (doesNotRequireEntrySafepointBefore(Call))
+ continue;
+ break;
+ }
+ }
+
+ assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) &&
+ "either we stopped because of a call, or because of terminator");
+
+ return Cursor;
+}
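A hedged source-level picture of where findLocationForEntrySafepoint() ends up; may_grow_stack is a hypothetical call that stops the straight-line walk.

// Hypothetical "real" call: imagine it can recurse or allocate arbitrarily.
void may_grow_stack() {}

int entry_poll_example(int x) {
  int a = x * 2;  // plain straight-line code: the cursor walks past it
  int b = a + 7;
  // <- the entry safepoint poll goes here: as late as possible in the
  //    straight-line entry sequence, yet still before the first call that
  //    could grow the stack.
  may_grow_stack();
  return a + b;
}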
+
const char GCSafepointPollName[] = "gc.safepoint_poll";
-
-static bool isGCSafepointPoll(Function &F) {
- return F.getName().equals(GCSafepointPollName);
-}
-
-/// Returns true if this function should be rewritten to include safepoint
-/// polls and parseable call sites. The main point of this function is to be
-/// an extension point for custom logic.
-static bool shouldRewriteFunction(Function &F) {
- // TODO: This should check the GCStrategy
- if (F.hasGC()) {
- const auto &FunctionGCName = F.getGC();
- const StringRef StatepointExampleName("statepoint-example");
- const StringRef CoreCLRName("coreclr");
- return (StatepointExampleName == FunctionGCName) ||
- (CoreCLRName == FunctionGCName);
- } else
- return false;
-}
-
-// TODO: These should become properties of the GCStrategy, possibly with
-// command line overrides.
-static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
-static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
-static bool enableCallSafepoints(Function &F) { return !NoCall; }
-
-bool PlaceSafepoints::runOnFunction(Function &F) {
- if (F.isDeclaration() || F.empty()) {
- // This is a declaration, nothing to do. Must exit early to avoid crash in
- // dom tree calculation
- return false;
- }
-
- if (isGCSafepointPoll(F)) {
- // Given we're inlining this inside of safepoint poll insertion, this
- // doesn't make any sense. Note that we do make any contained calls
- // parseable after we inline a poll.
- return false;
- }
-
- if (!shouldRewriteFunction(F))
- return false;
-
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- bool Modified = false;
-
- // In various bits below, we rely on the fact that uses are reachable from
- // defs. When there are basic blocks unreachable from the entry, dominance
- // and reachablity queries return non-sensical results. Thus, we preprocess
- // the function to ensure these properties hold.
- Modified |= removeUnreachableBlocks(F);
-
- // STEP 1 - Insert the safepoint polling locations. We do not need to
- // actually insert parse points yet. That will be done for all polls and
- // calls in a single pass.
-
- DominatorTree DT;
- DT.recalculate(F);
-
- SmallVector<Instruction *, 16> PollsNeeded;
- std::vector<CallBase *> ParsePointNeeded;
-
- if (enableBackedgeSafepoints(F)) {
- // Construct a pass manager to run the LoopPass backedge logic. We
- // need the pass manager to handle scheduling all the loop passes
- // appropriately. Doing this by hand is painful and just not worth messing
- // with for the moment.
- legacy::FunctionPassManager FPM(F.getParent());
- bool CanAssumeCallSafepoints = enableCallSafepoints(F);
- auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
- FPM.add(PBS);
- FPM.run(F);
-
- // We preserve dominance information when inserting the poll, otherwise
- // we'd have to recalculate this on every insert
- DT.recalculate(F);
-
- auto &PollLocations = PBS->PollLocations;
-
- auto OrderByBBName = [](Instruction *a, Instruction *b) {
- return a->getParent()->getName() < b->getParent()->getName();
- };
- // We need the order of list to be stable so that naming ends up stable
- // when we split edges. This makes test cases much easier to write.
- llvm::sort(PollLocations, OrderByBBName);
-
- // We can sometimes end up with duplicate poll locations. This happens if
- // a single loop is visited more than once. The fact this happens seems
- // wrong, but it does happen for the split-backedge.ll test case.
- PollLocations.erase(std::unique(PollLocations.begin(),
- PollLocations.end()),
- PollLocations.end());
-
- // Insert a poll at each point the analysis pass identified
- // The poll location must be the terminator of a loop latch block.
- for (Instruction *Term : PollLocations) {
- // We are inserting a poll, the function is modified
- Modified = true;
-
- if (SplitBackedge) {
- // Split the backedge of the loop and insert the poll within that new
- // basic block. This creates a loop with two latches per original
- // latch (which is non-ideal), but this appears to be easier to
- // optimize in practice than inserting the poll immediately before the
- // latch test.
-
- // Since this is a latch, at least one of the successors must dominate
- // it. It's possible that we have a) duplicate edges to the same header
- // and b) edges to distinct loop headers. We need to insert polls on
- // each.
- SetVector<BasicBlock *> Headers;
- for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
- BasicBlock *Succ = Term->getSuccessor(i);
- if (DT.dominates(Succ, Term->getParent())) {
- Headers.insert(Succ);
- }
- }
- assert(!Headers.empty() && "poll location is not a loop latch?");
-
- // The split loop structure here is so that we only need to recalculate
- // the dominator tree once. Alternatively, we could just keep it up to
- // date and use a more natural merged loop.
- SetVector<BasicBlock *> SplitBackedges;
- for (BasicBlock *Header : Headers) {
- BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
- PollsNeeded.push_back(NewBB->getTerminator());
- NumBackedgeSafepoints++;
- }
- } else {
- // Split the latch block itself, right before the terminator.
- PollsNeeded.push_back(Term);
- NumBackedgeSafepoints++;
- }
- }
- }
-
- if (enableEntrySafepoints(F)) {
- if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) {
- PollsNeeded.push_back(Location);
- Modified = true;
- NumEntrySafepoints++;
- }
- // TODO: else we should assert that there was, in fact, a policy choice to
- // not insert an entry safepoint poll.
- }
-
- // Now that we've identified all the needed safepoint poll locations, insert
- // safepoint polls themselves.
- for (Instruction *PollLocation : PollsNeeded) {
- std::vector<CallBase *> RuntimeCalls;
- InsertSafepointPoll(PollLocation, RuntimeCalls, TLI);
+
+static bool isGCSafepointPoll(Function &F) {
+ return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites. The main point of this function is to be
+/// an extension point for custom logic.
+static bool shouldRewriteFunction(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+ if (F.isDeclaration() || F.empty()) {
+ // This is a declaration, nothing to do. Must exit early to avoid crash in
+ // dom tree calculation
+ return false;
+ }
+
+ if (isGCSafepointPoll(F)) {
+ // Given we're inlining this inside of safepoint poll insertion, this
+ // doesn't make any sense. Note that we do make any contained calls
+ // parseable after we inline a poll.
+ return false;
+ }
+
+ if (!shouldRewriteFunction(F))
+ return false;
+
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+ bool Modified = false;
+
+ // In various bits below, we rely on the fact that uses are reachable from
+ // defs. When there are basic blocks unreachable from the entry, dominance
+ // and reachability queries return nonsensical results. Thus, we preprocess
+ // the function to ensure these properties hold.
+ Modified |= removeUnreachableBlocks(F);
+
+ // STEP 1 - Insert the safepoint polling locations. We do not need to
+ // actually insert parse points yet. That will be done for all polls and
+ // calls in a single pass.
+
+ DominatorTree DT;
+ DT.recalculate(F);
+
+ SmallVector<Instruction *, 16> PollsNeeded;
+ std::vector<CallBase *> ParsePointNeeded;
+
+ if (enableBackedgeSafepoints(F)) {
+ // Construct a pass manager to run the LoopPass backedge logic. We
+ // need the pass manager to handle scheduling all the loop passes
+ // appropriately. Doing this by hand is painful and just not worth messing
+ // with for the moment.
+ legacy::FunctionPassManager FPM(F.getParent());
+ bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+ auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+ FPM.add(PBS);
+ FPM.run(F);
+
+ // We preserve dominance information when inserting the poll, otherwise
+ // we'd have to recalculate this on every insert
+ DT.recalculate(F);
+
+ auto &PollLocations = PBS->PollLocations;
+
+ auto OrderByBBName = [](Instruction *a, Instruction *b) {
+ return a->getParent()->getName() < b->getParent()->getName();
+ };
+ // We need the order of the list to be stable so that naming ends up stable
+ // when we split edges. This makes test cases much easier to write.
+ llvm::sort(PollLocations, OrderByBBName);
+
+ // We can sometimes end up with duplicate poll locations. This happens if
+ // a single loop is visited more than once. The fact this happens seems
+ // wrong, but it does happen for the split-backedge.ll test case.
+ PollLocations.erase(std::unique(PollLocations.begin(),
+ PollLocations.end()),
+ PollLocations.end());
+
+ // Insert a poll at each point the analysis pass identified
+ // The poll location must be the terminator of a loop latch block.
+ for (Instruction *Term : PollLocations) {
+ // We are inserting a poll, the function is modified
+ Modified = true;
+
+ if (SplitBackedge) {
+ // Split the backedge of the loop and insert the poll within that new
+ // basic block. This creates a loop with two latches per original
+ // latch (which is non-ideal), but this appears to be easier to
+ // optimize in practice than inserting the poll immediately before the
+ // latch test.
+
+ // Since this is a latch, at least one of the successors must dominate
+ // it. It's possible that we have a) duplicate edges to the same header
+ // and b) edges to distinct loop headers. We need to insert polls on
+ // each.
+ SetVector<BasicBlock *> Headers;
+ for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (DT.dominates(Succ, Term->getParent())) {
+ Headers.insert(Succ);
+ }
+ }
+ assert(!Headers.empty() && "poll location is not a loop latch?");
+
+ // The split loop structure here is so that we only need to recalculate
+ // the dominator tree once. Alternatively, we could just keep it up to
+ // date and use a more natural merged loop.
+ SetVector<BasicBlock *> SplitBackedges;
+ for (BasicBlock *Header : Headers) {
+ BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
+ PollsNeeded.push_back(NewBB->getTerminator());
+ NumBackedgeSafepoints++;
+ }
+ } else {
+ // Split the latch block itself, right before the terminator.
+ PollsNeeded.push_back(Term);
+ NumBackedgeSafepoints++;
+ }
+ }
+ }
+
+ if (enableEntrySafepoints(F)) {
+ if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) {
+ PollsNeeded.push_back(Location);
+ Modified = true;
+ NumEntrySafepoints++;
+ }
+ // TODO: else we should assert that there was, in fact, a policy choice to
+ // not insert an entry safepoint poll.
+ }
+
+ // Now that we've identified all the needed safepoint poll locations, insert
+ // safepoint polls themselves.
+ for (Instruction *PollLocation : PollsNeeded) {
+ std::vector<CallBase *> RuntimeCalls;
+ InsertSafepointPoll(PollLocation, RuntimeCalls, TLI);
llvm::append_range(ParsePointNeeded, RuntimeCalls);
- }
-
- return Modified;
-}
-
-char PlaceBackedgeSafepointsImpl::ID = 0;
-char PlaceSafepoints::ID = 0;
-
-FunctionPass *llvm::createPlaceSafepointsPass() {
- return new PlaceSafepoints();
-}
-
-INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
- "place-backedge-safepoints-impl",
- "Place Backedge Safepoints", false, false)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
- "place-backedge-safepoints-impl",
- "Place Backedge Safepoints", false, false)
-
-INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
- false, false)
-INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
- false, false)
-
-static void
-InsertSafepointPoll(Instruction *InsertBefore,
- std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
- const TargetLibraryInfo &TLI) {
- BasicBlock *OrigBB = InsertBefore->getParent();
- Module *M = InsertBefore->getModule();
- assert(M && "must be part of a module");
-
- // Inline the safepoint poll implementation - this will get all the branch,
- // control flow, etc.. Most importantly, it will introduce the actual slow
- // path call - where we need to insert a safepoint (parsepoint).
-
- auto *F = M->getFunction(GCSafepointPollName);
- assert(F && "gc.safepoint_poll function is missing");
- assert(F->getValueType() ==
- FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
- "gc.safepoint_poll declared with wrong type");
- assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
- CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
-
- // Record some information about the call site we're replacing
- BasicBlock::iterator Before(PollCall), After(PollCall);
- bool IsBegin = false;
- if (Before == OrigBB->begin())
- IsBegin = true;
- else
- Before--;
-
- After++;
- assert(After != OrigBB->end() && "must have successor");
-
- // Do the actual inlining
- InlineFunctionInfo IFI;
- bool InlineStatus = InlineFunction(*PollCall, IFI).isSuccess();
- assert(InlineStatus && "inline must succeed");
- (void)InlineStatus; // suppress warning in release-asserts
-
- // Check post-conditions
- assert(IFI.StaticAllocas.empty() && "can't have allocs");
-
- std::vector<CallInst *> Calls; // new calls
- DenseSet<BasicBlock *> BBs; // new BBs + insertee
-
- // Include only the newly inserted instructions. Note: begin may not be valid
- // if we inserted at the beginning of the basic block
- BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before);
-
- // If your poll function includes an unreachable at the end, that's not
- // valid. Bugpoint likes to create this, so check for it.
- assert(isPotentiallyReachable(&*Start, &*After) &&
- "malformed poll function");
-
- scanInlinedCode(&*Start, &*After, Calls, BBs);
- assert(!Calls.empty() && "slow path not found for safepoint poll");
-
- // Record the fact we need a parsable state at the runtime call contained in
- // the poll function. This is required so that the runtime knows how to
- // parse the last frame when we actually take the safepoint (i.e. execute
- // the slow path)
- assert(ParsePointsNeeded.empty());
- for (auto *CI : Calls) {
- // No safepoint needed or wanted
- if (!needsStatepoint(CI, TLI))
- continue;
-
- // These are likely runtime calls. Should we assert that via calling
- // convention or something?
- ParsePointsNeeded.push_back(CI);
- }
- assert(ParsePointsNeeded.size() <= Calls.size());
-}
+ }
+
+ return Modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+FunctionPass *llvm::createPlaceSafepointsPass() {
+ return new PlaceSafepoints();
+}
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI) {
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Module *M = InsertBefore->getModule();
+ assert(M && "must be part of a module");
+
+ // Inline the safepoint poll implementation - this will get all the branch,
+ // control flow, etc.. Most importantly, it will introduce the actual slow
+ // path call - where we need to insert a safepoint (parsepoint).
+
+ auto *F = M->getFunction(GCSafepointPollName);
+ assert(F && "gc.safepoint_poll function is missing");
+ assert(F->getValueType() ==
+ FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
+ "gc.safepoint_poll declared with wrong type");
+ assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+ CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
+
+ // Record some information about the call site we're replacing
+ BasicBlock::iterator Before(PollCall), After(PollCall);
+ bool IsBegin = false;
+ if (Before == OrigBB->begin())
+ IsBegin = true;
+ else
+ Before--;
+
+ After++;
+ assert(After != OrigBB->end() && "must have successor");
+
+ // Do the actual inlining
+ InlineFunctionInfo IFI;
+ bool InlineStatus = InlineFunction(*PollCall, IFI).isSuccess();
+ assert(InlineStatus && "inline must succeed");
+ (void)InlineStatus; // suppress warning in release-asserts
+
+ // Check post-conditions
+ assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+ std::vector<CallInst *> Calls; // new calls
+ DenseSet<BasicBlock *> BBs; // new BBs + insertee
+
+ // Include only the newly inserted instructions. Note: begin may not be valid
+ // if we inserted at the beginning of the basic block
+ BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before);
+
+ // If your poll function includes an unreachable at the end, that's not
+ // valid. Bugpoint likes to create this, so check for it.
+ assert(isPotentiallyReachable(&*Start, &*After) &&
+ "malformed poll function");
+
+ scanInlinedCode(&*Start, &*After, Calls, BBs);
+ assert(!Calls.empty() && "slow path not found for safepoint poll");
+
+ // Record the fact we need a parsable state at the runtime call contained in
+ // the poll function. This is required so that the runtime knows how to
+ // parse the last frame when we actually take the safepoint (i.e. execute
+ // the slow path)
+ assert(ParsePointsNeeded.empty());
+ for (auto *CI : Calls) {
+ // No safepoint needed or wanted
+ if (!needsStatepoint(CI, TLI))
+ continue;
+
+ // These are likely runtime calls. Should we assert that via calling
+ // convention or something?
+ ParsePointsNeeded.push_back(CI);
+ }
+ assert(ParsePointsNeeded.size() <= Calls.size());
+}
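A minimal sketch, assuming the standard LLVM 12 C++ API: PlaceSafepoints expects the module to already define the void() function gc.safepoint_poll, whose body InsertSafepointPoll above inlines at each poll site before turning the contained slow-path call into a parse point. The snippet below shows how a frontend might provide such a poll body; the runtime symbol do_safepoint is a hypothetical placeholder, and a production poll would normally guard the slow-path call behind a cheap flag check.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

// Emit "void @gc.safepoint_poll()" whose body calls a runtime slow path. That
// call is what the pass later rewrites into a statepoint after inlining this
// body at each poll location.
static void emitSafepointPoll(llvm::Module &M) {
  llvm::LLVMContext &Ctx = M.getContext();
  auto *VoidFnTy = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), false);

  // Hypothetical runtime entry point; substitute the GC runtime's real hook.
  llvm::FunctionCallee SlowPath = M.getOrInsertFunction("do_safepoint", VoidFnTy);

  auto *Poll = llvm::Function::Create(VoidFnTy, llvm::Function::ExternalLinkage,
                                      "gc.safepoint_poll", &M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", Poll));
  B.CreateCall(SlowPath);
  B.CreateRetVoid();
}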
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp
index 569b4b260e..dffeb7cc22 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp
@@ -1,925 +1,925 @@
-//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass reassociates commutative expressions in an order that is designed
-// to promote better constant propagation, GCSE, LICM, PRE, etc.
-//
-// For example: 4 + (x + 5) -> x + (4 + 5)
-//
-// In the implementation of this algorithm, constants are assigned rank = 0,
-// function arguments are rank = 1, and other values are assigned ranks
-// corresponding to the reverse post order traversal of the current function
-// (starting at 2), which effectively gives values in deep loops higher rank
-// than values not in loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/Reassociate.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace reassociate;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "reassociate"
-
-STATISTIC(NumChanged, "Number of insts reassociated");
-STATISTIC(NumAnnihil, "Number of expr tree annihilated");
-STATISTIC(NumFactor , "Number of multiplies factored");
-
-#ifndef NDEBUG
-/// Print out the expression identified in the Ops list.
-static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
- Module *M = I->getModule();
- dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
- << *Ops[0].Op->getType() << '\t';
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- dbgs() << "[ ";
- Ops[i].Op->printAsOperand(dbgs(), false, M);
- dbgs() << ", #" << Ops[i].Rank << "] ";
- }
-}
-#endif
-
-/// Utility class representing a non-constant Xor-operand. We classify
-/// non-constant Xor-Operands into two categories:
-/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
-/// C2)
-/// C2.1) The operand is in the form of "X | C", where C is a non-zero
-/// constant.
-/// C2.2) Any operand E which doesn't fall into C1 or C2.1; we view such an
-/// operand as "E | 0"
-class llvm::reassociate::XorOpnd {
-public:
- XorOpnd(Value *V);
-
- bool isInvalid() const { return SymbolicPart == nullptr; }
- bool isOrExpr() const { return isOr; }
- Value *getValue() const { return OrigVal; }
- Value *getSymbolicPart() const { return SymbolicPart; }
- unsigned getSymbolicRank() const { return SymbolicRank; }
- const APInt &getConstPart() const { return ConstPart; }
-
- void Invalidate() { SymbolicPart = OrigVal = nullptr; }
- void setSymbolicRank(unsigned R) { SymbolicRank = R; }
-
-private:
- Value *OrigVal;
- Value *SymbolicPart;
- APInt ConstPart;
- unsigned SymbolicRank;
- bool isOr;
-};
-
-XorOpnd::XorOpnd(Value *V) {
- assert(!isa<ConstantInt>(V) && "No ConstantInt");
- OrigVal = V;
- Instruction *I = dyn_cast<Instruction>(V);
- SymbolicRank = 0;
-
- if (I && (I->getOpcode() == Instruction::Or ||
- I->getOpcode() == Instruction::And)) {
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
- const APInt *C;
- if (match(V0, m_APInt(C)))
- std::swap(V0, V1);
-
- if (match(V1, m_APInt(C))) {
- ConstPart = *C;
- SymbolicPart = V0;
- isOr = (I->getOpcode() == Instruction::Or);
- return;
- }
- }
-
- // view the operand as "V | 0"
- SymbolicPart = V;
- ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
- isOr = true;
-}
-
-/// Return true if V is an instruction of the specified opcode and if it
-/// only has one use.
-static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
- auto *I = dyn_cast<Instruction>(V);
- if (I && I->hasOneUse() && I->getOpcode() == Opcode)
- if (!isa<FPMathOperator>(I) || I->isFast())
- return cast<BinaryOperator>(I);
- return nullptr;
-}
-
-static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
- unsigned Opcode2) {
- auto *I = dyn_cast<Instruction>(V);
- if (I && I->hasOneUse() &&
- (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2))
- if (!isa<FPMathOperator>(I) || I->isFast())
- return cast<BinaryOperator>(I);
- return nullptr;
-}
-
-void ReassociatePass::BuildRankMap(Function &F,
- ReversePostOrderTraversal<Function*> &RPOT) {
- unsigned Rank = 2;
-
- // Assign distinct ranks to function arguments.
- for (auto &Arg : F.args()) {
- ValueRankMap[&Arg] = ++Rank;
- LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
- << "\n");
- }
-
- // Traverse basic blocks in ReversePostOrder.
- for (BasicBlock *BB : RPOT) {
- unsigned BBRank = RankMap[BB] = ++Rank << 16;
-
- // Walk the basic block, adding precomputed ranks for any instructions that
- // we cannot move. This ensures that the ranks for these instructions are
- // all different in the block.
- for (Instruction &I : *BB)
- if (mayBeMemoryDependent(I))
- ValueRankMap[&I] = ++BBRank;
- }
-}
-
-unsigned ReassociatePass::getRank(Value *V) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) {
- if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
- return 0; // Otherwise it's a global or constant, rank 0.
- }
-
- if (unsigned Rank = ValueRankMap[I])
- return Rank; // Rank already known?
-
- // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
- // we can reassociate expressions for code motion! Since we do not recurse
- // for PHI nodes, we cannot have infinite recursion here, because there
- // cannot be loops in the value graph that do not go through PHI nodes.
- unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
- for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
- Rank = std::max(Rank, getRank(I->getOperand(i)));
-
- // If this is a 'not' or 'neg' instruction, do not count it for rank. This
- // assures us that X and ~X will have the same rank.
- if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
- !match(I, m_FNeg(m_Value())))
- ++Rank;
-
- LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
- << "\n");
-
- return ValueRankMap[I] = Rank;
-}
-
-// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
-void ReassociatePass::canonicalizeOperands(Instruction *I) {
- assert(isa<BinaryOperator>(I) && "Expected binary operator.");
- assert(I->isCommutative() && "Expected commutative operator.");
-
- Value *LHS = I->getOperand(0);
- Value *RHS = I->getOperand(1);
- if (LHS == RHS || isa<Constant>(RHS))
- return;
- if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS))
- cast<BinaryOperator>(I)->swapOperands();
-}
-
-static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
- Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntOrIntVectorTy())
- return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
- else {
- BinaryOperator *Res =
- BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
- Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
- return Res;
- }
-}
-
-static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
- Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntOrIntVectorTy())
- return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
- else {
- BinaryOperator *Res =
- BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
- Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
- return Res;
- }
-}
-
-static Instruction *CreateNeg(Value *S1, const Twine &Name,
- Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntOrIntVectorTy())
- return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
-
- if (auto *FMFSource = dyn_cast<Instruction>(FlagsOp))
- return UnaryOperator::CreateFNegFMF(S1, FMFSource, Name, InsertBefore);
-
- return UnaryOperator::CreateFNeg(S1, Name, InsertBefore);
-}
-
-/// Replace 0-X with X*-1.
-static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
- assert((isa<UnaryOperator>(Neg) || isa<BinaryOperator>(Neg)) &&
- "Expected a Negate!");
- // FIXME: It's not safe to lower a unary FNeg into a FMul by -1.0.
- unsigned OpNo = isa<BinaryOperator>(Neg) ? 1 : 0;
- Type *Ty = Neg->getType();
- Constant *NegOne = Ty->isIntOrIntVectorTy() ?
- ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
-
- BinaryOperator *Res = CreateMul(Neg->getOperand(OpNo), NegOne, "", Neg, Neg);
- Neg->setOperand(OpNo, Constant::getNullValue(Ty)); // Drop use of op.
- Res->takeName(Neg);
- Neg->replaceAllUsesWith(Res);
- Res->setDebugLoc(Neg->getDebugLoc());
- return Res;
-}
-
-/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
-/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
-/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
-/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
-/// even x in Bitwidth-bit arithmetic.
-static unsigned CarmichaelShift(unsigned Bitwidth) {
- if (Bitwidth < 3)
- return Bitwidth - 1;
- return Bitwidth - 2;
-}
-
-/// Add the extra weight 'RHS' to the existing weight 'LHS',
-/// reducing the combined weight using any special properties of the operation.
-/// The existing weight LHS represents the computation X op X op ... op X where
-/// X occurs LHS times. The combined weight represents X op X op ... op X with
-/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
-/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
-/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
-static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
- // If we were working with infinite precision arithmetic then the combined
- // weight would be LHS + RHS. But we are using finite precision arithmetic,
- // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
- // for nilpotent operations and addition, but not for idempotent operations
- // and multiplication), so it is important to correctly reduce the combined
- // weight back into range if wrapping would be wrong.
-
- // If RHS is zero then the weight didn't change.
- if (RHS.isMinValue())
- return;
- // If LHS is zero then the combined weight is RHS.
- if (LHS.isMinValue()) {
- LHS = RHS;
- return;
- }
- // From this point on we know that neither LHS nor RHS is zero.
-
- if (Instruction::isIdempotent(Opcode)) {
- // Idempotent means X op X === X, so any non-zero weight is equivalent to a
- // weight of 1. Keeping weights at zero or one also means that wrapping is
- // not a problem.
- assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
- return; // Return a weight of 1.
- }
- if (Instruction::isNilpotent(Opcode)) {
- // Nilpotent means X op X === 0, so reduce weights modulo 2.
- assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
- LHS = 0; // 1 + 1 === 0 modulo 2.
- return;
- }
- if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
- // TODO: Reduce the weight by exploiting nsw/nuw?
- LHS += RHS;
- return;
- }
-
- assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
- "Unknown associative operation!");
- unsigned Bitwidth = LHS.getBitWidth();
- // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
- // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
- // bit number x, since either x is odd in which case x^CM = 1, or x is even in
- // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
- // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
- // which by a happy accident means that they can always be represented using
- // Bitwidth bits.
- // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
- // the Carmichael number).
- if (Bitwidth > 3) {
- /// CM - The value of Carmichael's lambda function.
- APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
- // Any weight W >= Threshold can be replaced with W - CM.
- APInt Threshold = CM + Bitwidth;
- assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
- // For Bitwidth 4 or more the following sum does not overflow.
- LHS += RHS;
- while (LHS.uge(Threshold))
- LHS -= CM;
- } else {
- // To avoid problems with overflow do everything the same as above but using
- // a larger type.
- unsigned CM = 1U << CarmichaelShift(Bitwidth);
- unsigned Threshold = CM + Bitwidth;
- assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
- "Weights not reduced!");
- unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
- while (Total >= Threshold)
- Total -= CM;
- LHS = Total;
- }
-}
-
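A small, self-contained sketch (plain C++, no LLVM dependencies) of the Mul/FMul weight reduction performed above; both branches of IncorporateWeight compute the same thing, the APInt branch merely avoids overflow. At Bitwidth 4 the Carmichael value is CM = 2^CarmichaelShift(4) = 4, so combining weights 5 and 3 gives 8, which reduces to 4; this is sound because x^8 == x^4 for every 4-bit x (odd x has x^4 == 1 mod 16, even x has x^4 == 0).

#include <cassert>

// Keep a combined multiplication weight in [0, CM + Bitwidth), mirroring
// IncorporateWeight: repeatedly subtract CM = 2^CarmichaelShift(Bitwidth).
static unsigned reduceMulWeight(unsigned LHS, unsigned RHS, unsigned Bitwidth) {
  unsigned Shift = Bitwidth < 3 ? Bitwidth - 1 : Bitwidth - 2;
  unsigned CM = 1u << Shift;
  unsigned Threshold = CM + Bitwidth;
  unsigned Total = LHS + RHS;
  while (Total >= Threshold)
    Total -= CM;
  return Total;
}

int main() {
  // Weights 5 and 3 at bitwidth 4: 5 + 3 = 8 >= CM + Bitwidth = 8, so one
  // subtraction of CM = 4 leaves a reduced weight of 4.
  assert(reduceMulWeight(5, 3, 4) == 4);
  return 0;
}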
-using RepeatedValue = std::pair<Value*, APInt>;
-
-/// Given an associative binary expression, return the leaf
-/// nodes in Ops along with their weights (how many times the leaf occurs). The
-/// original expression is the same as
-/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
-/// op
-/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
-/// op
-/// ...
-/// op
-/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
-///
-/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
-///
-/// This routine may modify the function, in which case it returns 'true'. The
-/// changes it makes may well be destructive, changing the value computed by 'I'
-/// to something completely different. Thus if the routine returns 'true' then
-/// you MUST either replace I with a new expression computed from the Ops array,
-/// or use RewriteExprTree to put the values back in.
-///
-/// A leaf node is either not a binary operation of the same kind as the root
-/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
-/// opcode), or is the same kind of binary operator but has a use which either
-/// does not belong to the expression, or does belong to the expression but is
-/// a leaf node. Every leaf node has at least one use that is a non-leaf node
-/// of the expression, while for non-leaf nodes (except for the root 'I') every
-/// use is a non-leaf node of the expression.
-///
-/// For example:
-/// expression graph node names
-///
-/// + | I
-/// / \ |
-/// + + | A, B
-/// / \ / \ |
-/// * + * | C, D, E
-/// / \ / \ / \ |
-/// + * | F, G
-///
-/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
-/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
-///
-/// The expression is maximal: if some instruction is a binary operator of the
-/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
-/// then the instruction also belongs to the expression, is not a leaf node of
-/// it, and its operands also belong to the expression (but may be leaf nodes).
-///
-/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
-/// order to ensure that every non-root node in the expression has *exactly one*
-/// use by a non-leaf node of the expression. This destruction means that the
-/// caller MUST either replace 'I' with a new expression or use something like
-/// RewriteExprTree to put the values back in if the routine indicates that it
-/// made a change by returning 'true'.
-///
-/// In the above example either the right operand of A or the left operand of B
-/// will be replaced by undef. If it is B's operand then this gives:
-///
-/// + | I
-/// / \ |
-/// + + | A, B - operand of B replaced with undef
-/// / \ \ |
-/// * + * | C, D, E
-/// / \ / \ / \ |
-/// + * | F, G
-///
-/// Note that such undef operands can only be reached by passing through 'I'.
-/// For example, if you visit operands recursively starting from a leaf node
-/// then you will never see such an undef operand unless you get back to 'I',
-/// which requires passing through a phi node.
-///
-/// Note that this routine may also mutate binary operators of the wrong type
-/// that have all uses inside the expression (i.e. only used by non-leaf nodes
-/// of the expression) if it can turn them into binary operators of the right
-/// type and thus make the expression bigger.
-static bool LinearizeExprTree(Instruction *I,
- SmallVectorImpl<RepeatedValue> &Ops) {
- assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) &&
- "Expected a UnaryOperator or BinaryOperator!");
- LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
- unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
- unsigned Opcode = I->getOpcode();
- assert(I->isAssociative() && I->isCommutative() &&
- "Expected an associative and commutative operation!");
-
- // Visit all operands of the expression, keeping track of their weight (the
- // number of paths from the expression root to the operand, or if you like
- // the number of times that operand occurs in the linearized expression).
- // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
- // while A has weight two.
-
- // Worklist of non-leaf nodes (their operands are in the expression too) along
- // with their weights, representing a certain number of paths to the operator.
- // If an operator occurs in the worklist multiple times then we found multiple
- // ways to get to it.
- SmallVector<std::pair<Instruction*, APInt>, 8> Worklist; // (Op, Weight)
- Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
- bool Changed = false;
-
- // Leaves of the expression are values that either aren't the right kind of
- // operation (eg: a constant, or a multiply in an add tree), or are, but have
- // some uses that are not inside the expression. For example, in I = X + X,
- // X = A + B, the value X has two uses (by I) that are in the expression. If
- // X has any other uses, for example in a return instruction, then we consider
- // X to be a leaf, and won't analyze it further. When we first visit a value,
- // if it has more than one use then at first we conservatively consider it to
- // be a leaf. Later, as the expression is explored, we may discover some more
- // uses of the value from inside the expression. If all uses turn out to be
- // from within the expression (and the value is a binary operator of the right
- // kind) then the value is no longer considered to be a leaf, and its operands
- // are explored.
-
- // Leaves - Keeps track of the set of putative leaves as well as the number of
- // paths to each leaf seen so far.
- using LeafMap = DenseMap<Value *, APInt>;
- LeafMap Leaves; // Leaf -> Total weight so far.
- SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
-
-#ifndef NDEBUG
- SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme.
-#endif
- while (!Worklist.empty()) {
- std::pair<Instruction*, APInt> P = Worklist.pop_back_val();
- I = P.first; // We examine the operands of this binary operator.
-
- for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands.
- Value *Op = I->getOperand(OpIdx);
- APInt Weight = P.second; // Number of paths to this operand.
- LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
- assert(!Op->use_empty() && "No uses, so how did we get to it?!");
-
- // If this is a binary operation of the right kind with only one use then
- // add its operands to the expression.
- if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- assert(Visited.insert(Op).second && "Not first visit!");
- LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
- Worklist.push_back(std::make_pair(BO, Weight));
- continue;
- }
-
- // Appears to be a leaf. Is the operand already in the set of leaves?
- LeafMap::iterator It = Leaves.find(Op);
- if (It == Leaves.end()) {
- // Not in the leaf map. Must be the first time we saw this operand.
- assert(Visited.insert(Op).second && "Not first visit!");
- if (!Op->hasOneUse()) {
- // This value has uses not accounted for by the expression, so it is
- // not safe to modify. Mark it as being a leaf.
- LLVM_DEBUG(dbgs()
- << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
- LeafOrder.push_back(Op);
- Leaves[Op] = Weight;
- continue;
- }
- // No uses outside the expression, try morphing it.
- } else {
- // Already in the leaf map.
- assert(It != Leaves.end() && Visited.count(Op) &&
- "In leaf map but not visited!");
-
- // Update the number of paths to the leaf.
- IncorporateWeight(It->second, Weight, Opcode);
-
-#if 0 // TODO: Re-enable once PR13021 is fixed.
- // The leaf already has one use from inside the expression. As we want
- // exactly one such use, drop this new use of the leaf.
- assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
- I->setOperand(OpIdx, UndefValue::get(I->getType()));
- Changed = true;
-
- // If the leaf is a binary operation of the right kind and we now see
- // that its multiple original uses were in fact all by nodes belonging
- // to the expression, then no longer consider it to be a leaf and add
- // its operands to the expression.
- if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
- Worklist.push_back(std::make_pair(BO, It->second));
- Leaves.erase(It);
- continue;
- }
-#endif
-
- // If we still have uses that are not accounted for by the expression
- // then it is not safe to modify the value.
- if (!Op->hasOneUse())
- continue;
-
- // No uses outside the expression, try morphing it.
- Weight = It->second;
- Leaves.erase(It); // Since the value may be morphed below.
- }
-
- // At this point we have a value which, first of all, is not a binary
- // expression of the right kind, and secondly, is only used inside the
- // expression. This means that it can safely be modified. See if we
- // can usefully morph it into an expression of the right kind.
- assert((!isa<Instruction>(Op) ||
- cast<Instruction>(Op)->getOpcode() != Opcode
- || (isa<FPMathOperator>(Op) &&
- !cast<Instruction>(Op)->isFast())) &&
- "Should have been handled above!");
- assert(Op->hasOneUse() && "Has uses outside the expression tree!");
-
- // If this is a multiply expression, turn any internal negations into
- // multiplies by -1 so they can be reassociated.
- if (Instruction *Tmp = dyn_cast<Instruction>(Op))
- if ((Opcode == Instruction::Mul && match(Tmp, m_Neg(m_Value()))) ||
- (Opcode == Instruction::FMul && match(Tmp, m_FNeg(m_Value())))) {
- LLVM_DEBUG(dbgs()
- << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
- Tmp = LowerNegateToMultiply(Tmp);
- LLVM_DEBUG(dbgs() << *Tmp << '\n');
- Worklist.push_back(std::make_pair(Tmp, Weight));
- Changed = true;
- continue;
- }
-
- // Failed to morph into an expression of the right type. This really is
- // a leaf.
- LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
- assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
- LeafOrder.push_back(Op);
- Leaves[Op] = Weight;
- }
- }
-
- // The leaves, repeated according to their weights, represent the linearized
- // form of the expression.
- for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
- Value *V = LeafOrder[i];
- LeafMap::iterator It = Leaves.find(V);
- if (It == Leaves.end())
- // Node initially thought to be a leaf wasn't.
- continue;
- assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
- APInt Weight = It->second;
- if (Weight.isMinValue())
- // Leaf already output or weight reduction eliminated it.
- continue;
- // Ensure the leaf is only output once.
- It->second = 0;
- Ops.push_back(std::make_pair(V, Weight));
- }
-
- // For nilpotent operations or addition there may be no operands, for example
- // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
- // in both cases the weight reduces to 0 causing the value to be skipped.
- if (Ops.empty()) {
- Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
- assert(Identity && "Associative operation without identity!");
- Ops.emplace_back(Identity, APInt(Bitwidth, 1));
- }
-
- return Changed;
-}
-
-/// Now that the operands for this expression tree are
-/// linearized and optimized, emit them in-order.
-void ReassociatePass::RewriteExprTree(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- assert(Ops.size() > 1 && "Single values should be used directly!");
-
- // Since our optimizations should never increase the number of operations, the
- // new expression can usually be written reusing the existing binary operators
- // from the original expression tree, without creating any new instructions,
- // though the rewritten expression may have a completely different topology.
- // We take care to not change anything if the new expression will be the same
- // as the original. If more than trivial changes (like commuting operands)
- // were made then we are obliged to clear out any optional subclass data like
- // nsw flags.
-
- /// NodesToRewrite - Nodes from the original expression available for writing
- /// the new expression into.
- SmallVector<BinaryOperator*, 8> NodesToRewrite;
- unsigned Opcode = I->getOpcode();
- BinaryOperator *Op = I;
-
- /// NotRewritable - The operands being written will be the leaves of the new
- /// expression and must not be used as inner nodes (via NodesToRewrite) by
- /// mistake. Inner nodes are always reassociable, and usually leaves are not
- /// (if they were they would have been incorporated into the expression and so
- /// would not be leaves), so most of the time there is no danger of this. But
- /// in rare cases a leaf may become reassociable if an optimization kills uses
- /// of it, or it may momentarily become reassociable during rewriting (below)
-/// due to it being removed as an operand of one of its uses. Ensure that misuse
- /// of leaf nodes as inner nodes cannot occur by remembering all of the future
- /// leaves and refusing to reuse any of them as inner nodes.
- SmallPtrSet<Value*, 8> NotRewritable;
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- NotRewritable.insert(Ops[i].Op);
-
- // ExpressionChanged - Non-null if the rewritten expression differs from the
- // original in some non-trivial way, requiring the clearing of optional flags.
- // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
- BinaryOperator *ExpressionChanged = nullptr;
- for (unsigned i = 0; ; ++i) {
- // The last operation (which comes earliest in the IR) is special as both
- // operands will come from Ops, rather than just one with the other being
- // a subexpression.
- if (i+2 == Ops.size()) {
- Value *NewLHS = Ops[i].Op;
- Value *NewRHS = Ops[i+1].Op;
- Value *OldLHS = Op->getOperand(0);
- Value *OldRHS = Op->getOperand(1);
-
- if (NewLHS == OldLHS && NewRHS == OldRHS)
- // Nothing changed, leave it alone.
- break;
-
- if (NewLHS == OldRHS && NewRHS == OldLHS) {
- // The order of the operands was reversed. Swap them.
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- Op->swapOperands();
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
- MadeChange = true;
- ++NumChanged;
- break;
- }
-
- // The new operation differs non-trivially from the original. Overwrite
- // the old operands with the new ones.
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- if (NewLHS != OldLHS) {
- BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
- if (BO && !NotRewritable.count(BO))
- NodesToRewrite.push_back(BO);
- Op->setOperand(0, NewLHS);
- }
- if (NewRHS != OldRHS) {
- BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
- if (BO && !NotRewritable.count(BO))
- NodesToRewrite.push_back(BO);
- Op->setOperand(1, NewRHS);
- }
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
-
- ExpressionChanged = Op;
- MadeChange = true;
- ++NumChanged;
-
- break;
- }
-
- // Not the last operation. The left-hand side will be a sub-expression
- // while the right-hand side will be the current element of Ops.
- Value *NewRHS = Ops[i].Op;
- if (NewRHS != Op->getOperand(1)) {
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- if (NewRHS == Op->getOperand(0)) {
- // The new right-hand side was already present as the left operand. If
- // we are lucky then swapping the operands will sort out both of them.
- Op->swapOperands();
- } else {
- // Overwrite with the new right-hand side.
- BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
- if (BO && !NotRewritable.count(BO))
- NodesToRewrite.push_back(BO);
- Op->setOperand(1, NewRHS);
- ExpressionChanged = Op;
- }
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
- MadeChange = true;
- ++NumChanged;
- }
-
- // Now deal with the left-hand side. If this is already an operation node
- // from the original expression then just rewrite the rest of the expression
- // into it.
- BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
- if (BO && !NotRewritable.count(BO)) {
- Op = BO;
- continue;
- }
-
- // Otherwise, grab a spare node from the original expression and use that as
- // the left-hand side. If there are no nodes left then the optimizers made
- // an expression with more nodes than the original! This usually means that
- // they did something stupid but it might mean that the problem was just too
- // hard (finding the minimal number of multiplications needed to realize a
- // multiplication expression is NP-complete). Whatever the reason, smart or
- // stupid, create a new node if there are none left.
- BinaryOperator *NewOp;
- if (NodesToRewrite.empty()) {
- Constant *Undef = UndefValue::get(I->getType());
- NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
- Undef, Undef, "", I);
- if (NewOp->getType()->isFPOrFPVectorTy())
- NewOp->setFastMathFlags(I->getFastMathFlags());
- } else {
- NewOp = NodesToRewrite.pop_back_val();
- }
-
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- Op->setOperand(0, NewOp);
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
- ExpressionChanged = Op;
- MadeChange = true;
- ++NumChanged;
- Op = NewOp;
- }
-
- // If the expression changed non-trivially then clear out all subclass data
- // starting from the operator specified in ExpressionChanged, and compactify
- // the operators to just before the expression root to guarantee that the
- // expression tree is dominated by all of Ops.
- if (ExpressionChanged)
- do {
- // Preserve FastMathFlags.
- if (isa<FPMathOperator>(I)) {
- FastMathFlags Flags = I->getFastMathFlags();
- ExpressionChanged->clearSubclassOptionalData();
- ExpressionChanged->setFastMathFlags(Flags);
- } else
- ExpressionChanged->clearSubclassOptionalData();
-
- if (ExpressionChanged == I)
- break;
-
- // Discard any debug info related to the expressions that have changed (we
- // can leave debug info related to the root, since the result of the
- // expression tree should be the same even after reassociation).
- replaceDbgUsesWithUndef(ExpressionChanged);
-
- ExpressionChanged->moveBefore(I);
- ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
- } while (true);
-
- // Throw away any left over nodes from the original expression.
- for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
- RedoInsts.insert(NodesToRewrite[i]);
-}
-
-/// Insert instructions before the instruction pointed to by BI,
-/// that computes the negative version of the value specified. The negative
-/// version of the value is returned, and BI is left pointing at the instruction
-/// that should be processed next by the reassociation pass.
-/// Also add intermediate instructions to the redo list that are modified while
-/// pushing the negates through adds. These will be revisited to see if
-/// additional opportunities have been exposed.
-static Value *NegateValue(Value *V, Instruction *BI,
- ReassociatePass::OrderedSet &ToRedo) {
- if (auto *C = dyn_cast<Constant>(V))
- return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
- ConstantExpr::getNeg(C);
-
- // We are trying to expose opportunity for reassociation. One of the things
- // that we want to do to achieve this is to push a negation as deep into an
- // expression chain as possible, to expose the add instructions. In practice,
- // this means that we turn this:
- // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
- // so that a later expression Y = 12+X could get reassociated with the -12 to eliminate
- // the constants. We assume that instcombine will clean up the mess later if
- // we introduce tons of unnecessary negation instructions.
- //
- if (BinaryOperator *I =
- isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
- // Push the negates through the add.
- I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
- I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
- if (I->getOpcode() == Instruction::Add) {
- I->setHasNoUnsignedWrap(false);
- I->setHasNoSignedWrap(false);
- }
-
- // We must move the add instruction here, because the neg instructions do
- // not dominate the old add instruction in general. By moving it, we are
- // assured that the neg instructions we just inserted dominate the
- // instruction we are about to insert after them.
- //
- I->moveBefore(BI);
- I->setName(I->getName()+".neg");
-
- // Add the intermediate negates to the redo list as processing them later
- // could expose more reassociating opportunities.
- ToRedo.insert(I);
- return I;
- }
-
- // Okay, we need to materialize a negated version of V with an instruction.
- // Scan the use lists of V to see if we have one already.
- for (User *U : V->users()) {
- if (!match(U, m_Neg(m_Value())) && !match(U, m_FNeg(m_Value())))
- continue;
-
- // We found one! Now we have to make sure that the definition dominates
- // this use. We do this by moving it to the entry block (if it is a
- // non-instruction value) or right after the definition. These negates will
- // be zapped by reassociate later, so we don't need much finesse here.
- Instruction *TheNeg = cast<Instruction>(U);
-
- // Verify that the negate is in this function, V might be a constant expr.
- if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
- continue;
-
- bool FoundCatchSwitch = false;
-
- BasicBlock::iterator InsertPt;
- if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
- InsertPt = II->getNormalDest()->begin();
- } else {
- InsertPt = ++InstInput->getIterator();
- }
-
- const BasicBlock *BB = InsertPt->getParent();
-
- // Make sure we don't move anything before PHIs or exception
- // handling pads.
- while (InsertPt != BB->end() && (isa<PHINode>(InsertPt) ||
- InsertPt->isEHPad())) {
- if (isa<CatchSwitchInst>(InsertPt))
- // A catchswitch cannot have anything in the block except
- // itself and PHIs. We'll bail out below.
- FoundCatchSwitch = true;
- ++InsertPt;
- }
- } else {
- InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
- }
-
- // We found a catchswitch in the block where we want to move the
- // neg. We cannot move anything into that block. Bail and just
- // create the neg before BI, as if we hadn't found an existing
- // neg.
- if (FoundCatchSwitch)
- break;
-
- TheNeg->moveBefore(&*InsertPt);
- if (TheNeg->getOpcode() == Instruction::Sub) {
- TheNeg->setHasNoUnsignedWrap(false);
- TheNeg->setHasNoSignedWrap(false);
- } else {
- TheNeg->andIRFlags(BI);
- }
- ToRedo.insert(TheNeg);
- return TheNeg;
- }
-
- // Insert a 'neg' instruction that subtracts the value from zero to get the
- // negation.
- Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
- ToRedo.insert(NewNeg);
- return NewNeg;
-}
-
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE, etc.
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of the current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
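As a usage note for the pass described in the header above, reassociation can be scheduled on its own through the new pass manager. The following is a minimal sketch assuming the standard PassBuilder setup from this LLVM tree; it runs ReassociatePass over a single function so that an expression such as 4 + (x + 5) can be rewritten with the constants adjacent and folded.

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/Reassociate.h"

// Minimal driver: register the usual analyses and run reassociation once.
static void reassociateFunction(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::ReassociatePass());
  FPM.run(F, FAM);
}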
+#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace reassociate;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "reassociate"
+
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr tree annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+#ifndef NDEBUG
+/// Print out the expression identified in the Ops list.
+static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
+ Module *M = I->getModule();
+ dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
+ << *Ops[0].Op->getType() << '\t';
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ dbgs() << "[ ";
+ Ops[i].Op->printAsOperand(dbgs(), false, M);
+ dbgs() << ", #" << Ops[i].Rank << "] ";
+ }
+}
+#endif
+
+/// Utility class representing a non-constant Xor-operand. We classify
+/// non-constant Xor-Operands into two categories:
+/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
+/// C2)
+/// C2.1) The operand is in the form of "X | C", where C is a non-zero
+/// constant.
+/// C2.2) Any operand E which doesn't fall into C1 or C2.1; we view such an
+/// operand as "E | 0"
+class llvm::reassociate::XorOpnd {
+public:
+ XorOpnd(Value *V);
+
+ bool isInvalid() const { return SymbolicPart == nullptr; }
+ bool isOrExpr() const { return isOr; }
+ Value *getValue() const { return OrigVal; }
+ Value *getSymbolicPart() const { return SymbolicPart; }
+ unsigned getSymbolicRank() const { return SymbolicRank; }
+ const APInt &getConstPart() const { return ConstPart; }
+
+ void Invalidate() { SymbolicPart = OrigVal = nullptr; }
+ void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+private:
+ Value *OrigVal;
+ Value *SymbolicPart;
+ APInt ConstPart;
+ unsigned SymbolicRank;
+ bool isOr;
+};
+
+XorOpnd::XorOpnd(Value *V) {
+ assert(!isa<ConstantInt>(V) && "No ConstantInt");
+ OrigVal = V;
+ Instruction *I = dyn_cast<Instruction>(V);
+ SymbolicRank = 0;
+
+ if (I && (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And)) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ const APInt *C;
+ if (match(V0, m_APInt(C)))
+ std::swap(V0, V1);
+
+ if (match(V1, m_APInt(C))) {
+ ConstPart = *C;
+ SymbolicPart = V0;
+ isOr = (I->getOpcode() == Instruction::Or);
+ return;
+ }
+ }
+
+ // view the operand as "V | 0"
+ SymbolicPart = V;
+ ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
+ isOr = true;
+}
+
+/// Return true if V is an instruction of the specified opcode and if it
+/// only has one use.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() && I->getOpcode() == Opcode)
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
+ return nullptr;
+}
+
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+ unsigned Opcode2) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() &&
+ (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2))
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
+ return nullptr;
+}
+
+void ReassociatePass::BuildRankMap(Function &F,
+ ReversePostOrderTraversal<Function*> &RPOT) {
+ unsigned Rank = 2;
+
+ // Assign distinct ranks to function arguments.
+ for (auto &Arg : F.args()) {
+ ValueRankMap[&Arg] = ++Rank;
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
+ << "\n");
+ }
+
+ // Traverse basic blocks in ReversePostOrder.
+ for (BasicBlock *BB : RPOT) {
+ unsigned BBRank = RankMap[BB] = ++Rank << 16;
+
+ // Walk the basic block, adding precomputed ranks for any instructions that
+ // we cannot move. This ensures that the ranks for these instructions are
+ // all different in the block.
+ for (Instruction &I : *BB)
+ if (mayBeMemoryDependent(I))
+ ValueRankMap[&I] = ++BBRank;
+ }
+}
+
+unsigned ReassociatePass::getRank(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
+ return 0; // Otherwise it's a global or constant, rank 0.
+ }
+
+ if (unsigned Rank = ValueRankMap[I])
+ return Rank; // Rank already known?
+
+ // If this is an expression, return 1+MAX(rank(LHS), rank(RHS)) so that
+ // we can reassociate expressions for code motion! Since we do not recurse
+ // for PHI nodes, we cannot have infinite recursion here, because there
+ // cannot be loops in the value graph that do not go through PHI nodes.
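+ // For example, if arguments A and B have ranks 3 and 4, then "A + B" gets
+ // rank 5: one more than its highest-ranked operand.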
+ unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+ for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
+ Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+ // If this is a 'not' or 'neg' instruction, do not count it for rank. This
+ // assures us that X and ~X will have the same rank.
+ if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
+ !match(I, m_FNeg(m_Value())))
+ ++Rank;
+
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
+ << "\n");
+
+ return ValueRankMap[I] = Rank;
+}
+
+// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
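+// For example, "add i32 7, %x" is rewritten as "add i32 %x, 7".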
+void ReassociatePass::canonicalizeOperands(Instruction *I) {
+ assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+ assert(I->isCommutative() && "Expected commutative operator.");
+
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ if (LHS == RHS || isa<Constant>(RHS))
+ return;
+ if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS))
+ cast<BinaryOperator>(I)->swapOperands();
+}
+
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static Instruction *CreateNeg(Value *S1, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+
+ if (auto *FMFSource = dyn_cast<Instruction>(FlagsOp))
+ return UnaryOperator::CreateFNegFMF(S1, FMFSource, Name, InsertBefore);
+
+ return UnaryOperator::CreateFNeg(S1, Name, InsertBefore);
+}
+
+/// Replace 0-X with X*-1.
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
+ assert((isa<UnaryOperator>(Neg) || isa<BinaryOperator>(Neg)) &&
+ "Expected a Negate!");
+ // FIXME: It's not safe to lower a unary FNeg into a FMul by -1.0.
+ unsigned OpNo = isa<BinaryOperator>(Neg) ? 1 : 0;
+ Type *Ty = Neg->getType();
+ Constant *NegOne = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
+
+ BinaryOperator *Res = CreateMul(Neg->getOperand(OpNo), NegOne, "", Neg, Neg);
+ Neg->setOperand(OpNo, Constant::getNullValue(Ty)); // Drop use of op.
+ Res->takeName(Neg);
+ Neg->replaceAllUsesWith(Res);
+ Res->setDebugLoc(Neg->getDebugLoc());
+ return Res;
+}
+
+/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
+/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
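+/// For example, CarmichaelShift(32) == 30, i.e. x^(2^30) == 1 (mod 2^32) for
+/// every odd 32-bit value x.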
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+ if (Bitwidth < 3)
+ return Bitwidth - 1;
+ return Bitwidth - 2;
+}
+
+/// Add the extra weight 'RHS' to the existing weight 'LHS',
+/// reducing the combined weight using any special properties of the operation.
+/// The existing weight LHS represents the computation X op X op ... op X where
+/// X occurs LHS times. The combined weight represents X op X op ... op X with
+/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
+/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
+/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+ // If we were working with infinite precision arithmetic then the combined
+ // weight would be LHS + RHS. But we are using finite precision arithmetic,
+ // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
+ // for nilpotent operations and addition, but not for idempotent operations
+ // and multiplication), so it is important to correctly reduce the combined
+ // weight back into range if wrapping would be wrong.
+
+ // If RHS is zero then the weight didn't change.
+ if (RHS.isMinValue())
+ return;
+ // If LHS is zero then the combined weight is RHS.
+ if (LHS.isMinValue()) {
+ LHS = RHS;
+ return;
+ }
+ // From this point on we know that neither LHS nor RHS is zero.
+
+ if (Instruction::isIdempotent(Opcode)) {
+ // Idempotent means X op X === X, so any non-zero weight is equivalent to a
+ // weight of 1. Keeping weights at zero or one also means that wrapping is
+ // not a problem.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ return; // Return a weight of 1.
+ }
+ if (Instruction::isNilpotent(Opcode)) {
+ // Nilpotent means X op X === 0, so reduce weights modulo 2.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ LHS = 0; // 1 + 1 === 0 modulo 2.
+ return;
+ }
+ if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
+ // TODO: Reduce the weight by exploiting nsw/nuw?
+ LHS += RHS;
+ return;
+ }
+
+ assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+ "Unknown associative operation!");
+ unsigned Bitwidth = LHS.getBitWidth();
+ // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
+ // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
+ // bit number x, since either x is odd in which case x^CM = 1, or x is even in
+ // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
+ // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
+ // which by a happy accident means that they can always be represented using
+ // Bitwidth bits.
+ // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
+ // the Carmichael number).
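+ // For example, with Bitwidth == 8 we have CM == 64 and Threshold == 72, so a
+ // combined weight of 100 reduces to 36: x^100 == x^36 for every 8-bit x.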
+ if (Bitwidth > 3) {
+ /// CM - The value of Carmichael's lambda function.
+ APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+ // Any weight W >= Threshold can be replaced with W - CM.
+ APInt Threshold = CM + Bitwidth;
+ assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
+ // For Bitwidth 4 or more the following sum does not overflow.
+ LHS += RHS;
+ while (LHS.uge(Threshold))
+ LHS -= CM;
+ } else {
+ // To avoid problems with overflow do everything the same as above but using
+ // a larger type.
+ unsigned CM = 1U << CarmichaelShift(Bitwidth);
+ unsigned Threshold = CM + Bitwidth;
+ assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
+ "Weights not reduced!");
+ unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
+ while (Total >= Threshold)
+ Total -= CM;
+ LHS = Total;
+ }
+}
+
+using RepeatedValue = std::pair<Value*, APInt>;
+
+/// Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs). The
+/// original expression is the same as
+/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
+/// op
+/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
+/// op
+/// ...
+/// op
+/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
+///
+/// This routine may modify the function, in which case it returns 'true'. The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different. Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node. Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+/// expression graph node names
+///
+/// + | I
+/// / \ |
+/// + + | A, B
+/// / \ / \ |
+/// * + * | C, D, E
+/// / \ / \ / \ |
+/// + * | F, G
+///
+/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
+///
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
+///
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression. This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef. If it is B's operand then this gives:
+///
+/// + | I
+/// / \ |
+/// + + | A, B - operand of B replaced with undef
+/// / \ \ |
+/// * + * | C, D, E
+/// / \ / \ / \ |
+/// + * | F, G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
+static bool LinearizeExprTree(Instruction *I,
+ SmallVectorImpl<RepeatedValue> &Ops) {
+ assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) &&
+ "Expected a UnaryOperator or BinaryOperator!");
+ LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
+ unsigned Opcode = I->getOpcode();
+ assert(I->isAssociative() && I->isCommutative() &&
+ "Expected an associative and commutative operation!");
+
+ // Visit all operands of the expression, keeping track of their weight (the
+ // number of paths from the expression root to the operand, or if you like
+ // the number of times that operand occurs in the linearized expression).
+ // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+ // while A has weight two.
+
+ // Worklist of non-leaf nodes (their operands are in the expression too) along
+ // with their weights, representing a certain number of paths to the operator.
+ // If an operator occurs in the worklist multiple times then we found multiple
+ // ways to get to it.
+ SmallVector<std::pair<Instruction*, APInt>, 8> Worklist; // (Op, Weight)
+ Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+ bool Changed = false;
+
+ // Leaves of the expression are values that either aren't the right kind of
+ // operation (e.g. a constant, or a multiply in an add tree), or are, but have
+ // some uses that are not inside the expression. For example, in I = X + X,
+ // X = A + B, the value X has two uses (by I) that are in the expression. If
+ // X has any other uses, for example in a return instruction, then we consider
+ // X to be a leaf, and won't analyze it further. When we first visit a value,
+ // if it has more than one use then at first we conservatively consider it to
+ // be a leaf. Later, as the expression is explored, we may discover some more
+ // uses of the value from inside the expression. If all uses turn out to be
+ // from within the expression (and the value is a binary operator of the right
+ // kind) then the value is no longer considered to be a leaf, and its operands
+ // are explored.
+
+ // Leaves - Keeps track of the set of putative leaves as well as the number of
+ // paths to each leaf seen so far.
+ using LeafMap = DenseMap<Value *, APInt>;
+ LeafMap Leaves; // Leaf -> Total weight so far.
+ SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
+
+#ifndef NDEBUG
+ SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+ while (!Worklist.empty()) {
+ std::pair<Instruction*, APInt> P = Worklist.pop_back_val();
+ I = P.first; // We examine the operands of this binary operator.
+
+ for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands.
+ Value *Op = I->getOperand(OpIdx);
+ APInt Weight = P.second; // Number of paths to this operand.
+ LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+ // If this is a binary operation of the right kind with only one use then
+ // add its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ assert(Visited.insert(Op).second && "Not first visit!");
+ LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ Worklist.push_back(std::make_pair(BO, Weight));
+ continue;
+ }
+
+ // Appears to be a leaf. Is the operand already in the set of leaves?
+ LeafMap::iterator It = Leaves.find(Op);
+ if (It == Leaves.end()) {
+ // Not in the leaf map. Must be the first time we saw this operand.
+ assert(Visited.insert(Op).second && "Not first visit!");
+ if (!Op->hasOneUse()) {
+ // This value has uses not accounted for by the expression, so it is
+ // not safe to modify. Mark it as being a leaf.
+ LLVM_DEBUG(dbgs()
+ << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ continue;
+ }
+ // No uses outside the expression, try morphing it.
+ } else {
+ // Already in the leaf map.
+ assert(It != Leaves.end() && Visited.count(Op) &&
+ "In leaf map but not visited!");
+
+ // Update the number of paths to the leaf.
+ IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0 // TODO: Re-enable once PR13021 is fixed.
+ // The leaf already has one use from inside the expression. As we want
+ // exactly one such use, drop this new use of the leaf.
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+ I->setOperand(OpIdx, UndefValue::get(I->getType()));
+ Changed = true;
+
+ // If the leaf is a binary operation of the right kind and we now see
+ // that its multiple original uses were in fact all by nodes belonging
+ // to the expression, then no longer consider it to be a leaf and add
+ // its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ Worklist.push_back(std::make_pair(BO, It->second));
+ Leaves.erase(It);
+ continue;
+ }
+#endif
+
+ // If we still have uses that are not accounted for by the expression
+ // then it is not safe to modify the value.
+ if (!Op->hasOneUse())
+ continue;
+
+ // No uses outside the expression, try morphing it.
+ Weight = It->second;
+ Leaves.erase(It); // Since the value may be morphed below.
+ }
+
+ // At this point we have a value which, first of all, is not a binary
+ // expression of the right kind, and secondly, is only used inside the
+ // expression. This means that it can safely be modified. See if we
+ // can usefully morph it into an expression of the right kind.
+ assert((!isa<Instruction>(Op) ||
+ cast<Instruction>(Op)->getOpcode() != Opcode
+ || (isa<FPMathOperator>(Op) &&
+ !cast<Instruction>(Op)->isFast())) &&
+ "Should have been handled above!");
+ assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+ // If this is a multiply expression, turn any internal negations into
+ // multiplies by -1 so they can be reassociated.
+ if (Instruction *Tmp = dyn_cast<Instruction>(Op))
+ if ((Opcode == Instruction::Mul && match(Tmp, m_Neg(m_Value()))) ||
+ (Opcode == Instruction::FMul && match(Tmp, m_FNeg(m_Value())))) {
+ LLVM_DEBUG(dbgs()
+ << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ Tmp = LowerNegateToMultiply(Tmp);
+ LLVM_DEBUG(dbgs() << *Tmp << '\n');
+ Worklist.push_back(std::make_pair(Tmp, Weight));
+ Changed = true;
+ continue;
+ }
+
+ // Failed to morph into an expression of the right type. This really is
+ // a leaf.
+ LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ }
+ }
+
+ // The leaves, repeated according to their weights, represent the linearized
+ // form of the expression.
+ for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+ Value *V = LeafOrder[i];
+ LeafMap::iterator It = Leaves.find(V);
+ if (It == Leaves.end())
+ // Node initially thought to be a leaf wasn't.
+ continue;
+ assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+ APInt Weight = It->second;
+ if (Weight.isMinValue())
+ // Leaf already output or weight reduction eliminated it.
+ continue;
+ // Ensure the leaf is only output once.
+ It->second = 0;
+ Ops.push_back(std::make_pair(V, Weight));
+ }
+
+ // For nilpotent operations or addition there may be no operands, for example
+ // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+ // in both cases the weight reduces to 0 causing the value to be skipped.
+ if (Ops.empty()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
+ assert(Identity && "Associative operation without identity!");
+ Ops.emplace_back(Identity, APInt(Bitwidth, 1));
+ }
+
+ return Changed;
+}
+
+/// Now that the operands for this expression tree are
+/// linearized and optimized, emit them in-order.
+void ReassociatePass::RewriteExprTree(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ assert(Ops.size() > 1 && "Single values should be used directly!");
+
+ // Since our optimizations should never increase the number of operations, the
+ // new expression can usually be written reusing the existing binary operators
+ // from the original expression tree, without creating any new instructions,
+ // though the rewritten expression may have a completely different topology.
+ // We take care to not change anything if the new expression will be the same
+ // as the original. If more than trivial changes (like commuting operands)
+ // were made then we are obliged to clear out any optional subclass data like
+ // nsw flags.
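+ // For example, merely swapping the two operands of an add keeps flags such
+ // as nsw intact, whereas overwriting an operand with a different value
+ // clears the optional flags such as nsw on that operator and on every
+ // operator from it up to the expression root (fast-math flags are preserved).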
+
+ /// NodesToRewrite - Nodes from the original expression available for writing
+ /// the new expression into.
+ SmallVector<BinaryOperator*, 8> NodesToRewrite;
+ unsigned Opcode = I->getOpcode();
+ BinaryOperator *Op = I;
+
+ /// NotRewritable - The operands being written will be the leaves of the new
+ /// expression and must not be used as inner nodes (via NodesToRewrite) by
+ /// mistake. Inner nodes are always reassociable, and usually leaves are not
+ /// (if they were they would have been incorporated into the expression and so
+ /// would not be leaves), so most of the time there is no danger of this. But
+ /// in rare cases a leaf may become reassociable if an optimization kills uses
+ /// of it, or it may momentarily become reassociable during rewriting (below)
+ /// due to it being removed as an operand of one of its uses. Ensure that misuse
+ /// of leaf nodes as inner nodes cannot occur by remembering all of the future
+ /// leaves and refusing to reuse any of them as inner nodes.
+ SmallPtrSet<Value*, 8> NotRewritable;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ NotRewritable.insert(Ops[i].Op);
+
+ // ExpressionChanged - Non-null if the rewritten expression differs from the
+ // original in some non-trivial way, requiring the clearing of optional flags.
+ // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+ BinaryOperator *ExpressionChanged = nullptr;
+ for (unsigned i = 0; ; ++i) {
+ // The last operation (which comes earliest in the IR) is special as both
+ // operands will come from Ops, rather than just one with the other being
+ // a subexpression.
+ if (i+2 == Ops.size()) {
+ Value *NewLHS = Ops[i].Op;
+ Value *NewRHS = Ops[i+1].Op;
+ Value *OldLHS = Op->getOperand(0);
+ Value *OldRHS = Op->getOperand(1);
+
+ if (NewLHS == OldLHS && NewRHS == OldRHS)
+ // Nothing changed, leave it alone.
+ break;
+
+ if (NewLHS == OldRHS && NewRHS == OldLHS) {
+ // The order of the operands was reversed. Swap them.
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->swapOperands();
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ break;
+ }
+
+ // The new operation differs non-trivially from the original. Overwrite
+ // the old operands with the new ones.
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewLHS != OldLHS) {
+ BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(0, NewLHS);
+ }
+ if (NewRHS != OldRHS) {
+ BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ }
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+
+ break;
+ }
+
+ // Not the last operation. The left-hand side will be a sub-expression
+ // while the right-hand side will be the current element of Ops.
+ Value *NewRHS = Ops[i].Op;
+ if (NewRHS != Op->getOperand(1)) {
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewRHS == Op->getOperand(0)) {
+ // The new right-hand side was already present as the left operand. If
+ // we are lucky then swapping the operands will sort out both of them.
+ Op->swapOperands();
+ } else {
+ // Overwrite with the new right-hand side.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ ExpressionChanged = Op;
+ }
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ }
+
+ // Now deal with the left-hand side. If this is already an operation node
+ // from the original expression then just rewrite the rest of the expression
+ // into it.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+ if (BO && !NotRewritable.count(BO)) {
+ Op = BO;
+ continue;
+ }
+
+ // Otherwise, grab a spare node from the original expression and use that as
+ // the left-hand side. If there are no nodes left then the optimizers made
+ // an expression with more nodes than the original! This usually means that
+ // they did something stupid but it might mean that the problem was just too
+ // hard (finding the minimal number of multiplications needed to realize a
+ // multiplication expression is NP-complete). Whatever the reason, smart or
+ // stupid, create a new node if there are none left.
+ BinaryOperator *NewOp;
+ if (NodesToRewrite.empty()) {
+ Constant *Undef = UndefValue::get(I->getType());
+ NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+ Undef, Undef, "", I);
+ if (NewOp->getType()->isFPOrFPVectorTy())
+ NewOp->setFastMathFlags(I->getFastMathFlags());
+ } else {
+ NewOp = NodesToRewrite.pop_back_val();
+ }
+
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->setOperand(0, NewOp);
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+ Op = NewOp;
+ }
+
+ // If the expression changed non-trivially then clear out all subclass data
+ // starting from the operator specified in ExpressionChanged, and compactify
+ // the operators to just before the expression root to guarantee that the
+ // expression tree is dominated by all of Ops.
+ if (ExpressionChanged)
+ do {
+ // Preserve FastMathFlags.
+ if (isa<FPMathOperator>(I)) {
+ FastMathFlags Flags = I->getFastMathFlags();
+ ExpressionChanged->clearSubclassOptionalData();
+ ExpressionChanged->setFastMathFlags(Flags);
+ } else
+ ExpressionChanged->clearSubclassOptionalData();
+
+ if (ExpressionChanged == I)
+ break;
+
+ // Discard any debug info related to the expressions that have changed (we
+ // can leave debug info related to the root, since the result of the
+ // expression tree should be the same even after reassociation).
+ replaceDbgUsesWithUndef(ExpressionChanged);
+
+ ExpressionChanged->moveBefore(I);
+ ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
+ } while (true);
+
+ // Throw away any left over nodes from the original expression.
+ for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+ RedoInsts.insert(NodesToRewrite[i]);
+}
+
+/// Insert instructions before the instruction pointed to by BI that compute
+/// the negative version of the value specified. The negative
+/// version of the value is returned, and BI is left pointing at the instruction
+/// that should be processed next by the reassociation pass.
+/// Also add intermediate instructions to the redo list that are modified while
+/// pushing the negates through adds. These will be revisited to see if
+/// additional opportunities have been exposed.
+static Value *NegateValue(Value *V, Instruction *BI,
+ ReassociatePass::OrderedSet &ToRedo) {
+ if (auto *C = dyn_cast<Constant>(V))
+ return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
+ ConstantExpr::getNeg(C);
+
+ // We are trying to expose opportunity for reassociation. One of the things
+ // that we want to do to achieve this is to push a negation as deep into an
+ // expression chain as possible, to expose the add instructions. In practice,
+ // this means that we turn this:
+ // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
+ // so that later an expression such as Y = 12+X could get reassociated with the -12 to eliminate
+ // the constants. We assume that instcombine will clean up the mess later if
+ // we introduce tons of unnecessary negation instructions.
+ //
+ if (BinaryOperator *I =
+ isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
+ // Push the negates through the add.
+ I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
+ if (I->getOpcode() == Instruction::Add) {
+ I->setHasNoUnsignedWrap(false);
+ I->setHasNoSignedWrap(false);
+ }
+
+ // We must move the add instruction here, because the neg instructions do
+ // not dominate the old add instruction in general. By moving it, we are
+ // assured that the neg instructions we just inserted dominate the
+ // instruction we are about to insert after them.
+ //
+ I->moveBefore(BI);
+ I->setName(I->getName()+".neg");
+
+ // Add the intermediate negates to the redo list as processing them later
+ // could expose more reassociating opportunities.
+ ToRedo.insert(I);
+ return I;
+ }
+
+ // Okay, we need to materialize a negated version of V with an instruction.
+ // Scan the use lists of V to see if we have one already.
+ for (User *U : V->users()) {
+ if (!match(U, m_Neg(m_Value())) && !match(U, m_FNeg(m_Value())))
+ continue;
+
+ // We found one! Now we have to make sure that the definition dominates
+ // this use. We do this by moving it to the entry block (if it is a
+ // non-instruction value) or right after the definition. These negates will
+ // be zapped by reassociate later, so we don't need much finesse here.
+ Instruction *TheNeg = cast<Instruction>(U);
+
+ // Verify that the negate is in this function, V might be a constant expr.
+ if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+ continue;
+
+ bool FoundCatchSwitch = false;
+
+ BasicBlock::iterator InsertPt;
+ if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+ InsertPt = II->getNormalDest()->begin();
+ } else {
+ InsertPt = ++InstInput->getIterator();
+ }
+
+ const BasicBlock *BB = InsertPt->getParent();
+
+ // Make sure we don't move anything before PHIs or exception
+ // handling pads.
+ while (InsertPt != BB->end() && (isa<PHINode>(InsertPt) ||
+ InsertPt->isEHPad())) {
+ if (isa<CatchSwitchInst>(InsertPt))
+ // A catchswitch cannot have anything in the block except
+ // itself and PHIs. We'll bail out below.
+ FoundCatchSwitch = true;
+ ++InsertPt;
+ }
+ } else {
+ InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+ }
+
+ // We found a catchswitch in the block where we want to move the
+ // neg. We cannot move anything into that block. Bail and just
+ // create the neg before BI, as if we hadn't found an existing
+ // neg.
+ if (FoundCatchSwitch)
+ break;
+
+ TheNeg->moveBefore(&*InsertPt);
+ if (TheNeg->getOpcode() == Instruction::Sub) {
+ TheNeg->setHasNoUnsignedWrap(false);
+ TheNeg->setHasNoSignedWrap(false);
+ } else {
+ TheNeg->andIRFlags(BI);
+ }
+ ToRedo.insert(TheNeg);
+ return TheNeg;
+ }
+
+ // Insert a 'neg' instruction that subtracts the value from zero to get the
+ // negation.
+ Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+ ToRedo.insert(NewNeg);
+ return NewNeg;
+}
+
// See if this `or` looks like a load widening reduction, i.e. that it
// consists only of `or`/`shl`/`zext`/`load` nodes. Note that we don't
// ensure that the pattern is *really* a load widening reduction,
@@ -1014,1201 +1014,1201 @@ static BinaryOperator *ConvertOrWithNoCommonBitsToAdd(Instruction *Or) {
return New;
}
-/// Return true if we should break up this subtract of X-Y into (X + -Y).
-static bool ShouldBreakUpSubtract(Instruction *Sub) {
- // If this is a negation, we can't split it up!
- if (match(Sub, m_Neg(m_Value())) || match(Sub, m_FNeg(m_Value())))
- return false;
-
- // Don't breakup X - undef.
- if (isa<UndefValue>(Sub->getOperand(1)))
- return false;
-
- // Don't bother to break this up unless either the LHS is an associable add or
- // subtract or if this is only used by one.
- Value *V0 = Sub->getOperand(0);
- if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
- isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
- return true;
- Value *V1 = Sub->getOperand(1);
- if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
- isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
- return true;
- Value *VB = Sub->user_back();
- if (Sub->hasOneUse() &&
- (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
- isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
- return true;
-
- return false;
-}
-
-/// If we have (X-Y), and if either X is an add, or if this is only used by an
-/// add, transform this into (X+(0-Y)) to promote better reassociation.
-static BinaryOperator *BreakUpSubtract(Instruction *Sub,
- ReassociatePass::OrderedSet &ToRedo) {
- // Convert a subtract into an add and a neg instruction. This allows sub
- // instructions to be commuted with other add instructions.
- //
- // Calculate the negative value of Operand 1 of the sub instruction,
- // and set it as the RHS of the add instruction we just made.
- Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
- BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
- Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
- Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
- New->takeName(Sub);
-
- // Everyone now refers to the add instruction.
- Sub->replaceAllUsesWith(New);
- New->setDebugLoc(Sub->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n');
- return New;
-}
-
-/// If this is a shift of a reassociable multiply or is used by one, change
-/// this into a multiply by a constant to assist with further reassociation.
-static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
- Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
- auto *SA = cast<ConstantInt>(Shl->getOperand(1));
- MulCst = ConstantExpr::getShl(MulCst, SA);
-
- BinaryOperator *Mul =
- BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
- Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
- Mul->takeName(Shl);
-
- // Everyone now refers to the mul instruction.
- Shl->replaceAllUsesWith(Mul);
- Mul->setDebugLoc(Shl->getDebugLoc());
-
- // We can safely preserve the nuw flag in all cases. It's also safe to turn a
- // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
- // handling. It can be preserved as long as we're not left shifting by
- // bitwidth - 1.
- bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
- bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
- unsigned BitWidth = Shl->getType()->getIntegerBitWidth();
- if (NSW && (NUW || SA->getValue().ult(BitWidth - 1)))
- Mul->setHasNoSignedWrap(true);
- Mul->setHasNoUnsignedWrap(NUW);
- return Mul;
-}
-
-/// Scan backwards and forwards among values with the same rank as element i
-/// to see if X exists. If X does not exist, return i. This is useful when
-/// scanning for 'x' when we see '-x' because they both get the same rank.
-static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
- unsigned i, Value *X) {
- unsigned XRank = Ops[i].Rank;
- unsigned e = Ops.size();
- for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
- if (Ops[j].Op == X)
- return j;
- if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
- if (Instruction *I2 = dyn_cast<Instruction>(X))
- if (I1->isIdenticalTo(I2))
- return j;
- }
- // Scan backwards.
- for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
- if (Ops[j].Op == X)
- return j;
- if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
- if (Instruction *I2 = dyn_cast<Instruction>(X))
- if (I1->isIdenticalTo(I2))
- return j;
- }
- return i;
-}
-
-/// Emit a tree of add instructions, summing Ops together
-/// and returning the result. Insert the tree before I.
-static Value *EmitAddTreeOfValues(Instruction *I,
- SmallVectorImpl<WeakTrackingVH> &Ops) {
- if (Ops.size() == 1) return Ops.back();
-
+/// Return true if we should break up this subtract of X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+ // If this is a negation, we can't split it up!
+ if (match(Sub, m_Neg(m_Value())) || match(Sub, m_FNeg(m_Value())))
+ return false;
+
+ // Don't break up X - undef.
+ if (isa<UndefValue>(Sub->getOperand(1)))
+ return false;
+
+ // Don't bother to break this up unless either operand is a reassociable add
+ // or subtract, or the sub's only user is a reassociable add or subtract.
+ Value *V0 = Sub->getOperand(0);
+ if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *V1 = Sub->getOperand(1);
+ if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *VB = Sub->user_back();
+ if (Sub->hasOneUse() &&
+ (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
+ return true;
+
+ return false;
+}
+
+/// If we have (X-Y), and if either X is an add, or if this is only used by an
+/// add, transform this into (X+(0-Y)) to promote better reassociation.
+static BinaryOperator *BreakUpSubtract(Instruction *Sub,
+ ReassociatePass::OrderedSet &ToRedo) {
+ // Convert a subtract into an add and a neg instruction. This allows sub
+ // instructions to be commuted with other add instructions.
+ //
+ // Calculate the negative value of Operand 1 of the sub instruction,
+ // and set it as the RHS of the add instruction we just made.
+ Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
+ BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
+ Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ New->takeName(Sub);
+
+ // Everyone now refers to the add instruction.
+ Sub->replaceAllUsesWith(New);
+ New->setDebugLoc(Sub->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n');
+ return New;
+}
+
+/// If this is a shift of a reassociable multiply or is used by one, change
+/// this into a multiply by a constant to assist with further reassociation.
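+/// For example, "shl i32 %x, 3" becomes "mul i32 %x, 8".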
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+ Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+ auto *SA = cast<ConstantInt>(Shl->getOperand(1));
+ MulCst = ConstantExpr::getShl(MulCst, SA);
+
+ BinaryOperator *Mul =
+ BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+ Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
+ Mul->takeName(Shl);
+
+ // Everyone now refers to the mul instruction.
+ Shl->replaceAllUsesWith(Mul);
+ Mul->setDebugLoc(Shl->getDebugLoc());
+
+ // We can safely preserve the nuw flag in all cases. It's also safe to turn a
+ // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
+ // handling. It can be preserved as long as we're not left shifting by
+ // bitwidth - 1.
+ bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
+ bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
+ unsigned BitWidth = Shl->getType()->getIntegerBitWidth();
+ if (NSW && (NUW || SA->getValue().ult(BitWidth - 1)))
+ Mul->setHasNoSignedWrap(true);
+ Mul->setHasNoUnsignedWrap(NUW);
+ return Mul;
+}
+
+/// Scan backwards and forwards among values with the same rank as element i
+/// to see if X exists. If X does not exist, return i. This is useful when
+/// scanning for 'x' when we see '-x' because they both get the same rank.
+static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
+ unsigned i, Value *X) {
+ unsigned XRank = Ops[i].Rank;
+ unsigned e = Ops.size();
+ for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ // Scan backwards.
+ for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ return i;
+}
+
+/// Emit a tree of add instructions, summing Ops together
+/// and returning the result. Insert the tree before I.
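+/// For example, for Ops == {A, B, C} the emitted tree is "(A + B) + C".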
+static Value *EmitAddTreeOfValues(Instruction *I,
+ SmallVectorImpl<WeakTrackingVH> &Ops) {
+ if (Ops.size() == 1) return Ops.back();
+
Value *V1 = Ops.pop_back_val();
- Value *V2 = EmitAddTreeOfValues(I, Ops);
- return CreateAdd(V2, V1, "reass.add", I, I);
-}
-
-/// If V is an expression tree that is a multiplication sequence,
-/// and if this sequence contains a multiply by Factor,
-/// remove Factor from the tree and return the new tree.
-Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
- BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
- if (!BO)
- return nullptr;
-
- SmallVector<RepeatedValue, 8> Tree;
- MadeChange |= LinearizeExprTree(BO, Tree);
- SmallVector<ValueEntry, 8> Factors;
- Factors.reserve(Tree.size());
- for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
- RepeatedValue E = Tree[i];
- Factors.append(E.second.getZExtValue(),
- ValueEntry(getRank(E.first), E.first));
- }
-
- bool FoundFactor = false;
- bool NeedsNegate = false;
- for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
- if (Factors[i].Op == Factor) {
- FoundFactor = true;
- Factors.erase(Factors.begin()+i);
- break;
- }
-
- // If this is a negative version of this factor, remove it.
- if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
- if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
- if (FC1->getValue() == -FC2->getValue()) {
- FoundFactor = NeedsNegate = true;
- Factors.erase(Factors.begin()+i);
- break;
- }
- } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
- if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
- const APFloat &F1 = FC1->getValueAPF();
- APFloat F2(FC2->getValueAPF());
- F2.changeSign();
- if (F1 == F2) {
- FoundFactor = NeedsNegate = true;
- Factors.erase(Factors.begin() + i);
- break;
- }
- }
- }
- }
-
- if (!FoundFactor) {
- // Make sure to restore the operands to the expression tree.
- RewriteExprTree(BO, Factors);
- return nullptr;
- }
-
- BasicBlock::iterator InsertPt = ++BO->getIterator();
-
- // If this was just a single multiply, remove the multiply and return the only
- // remaining operand.
- if (Factors.size() == 1) {
- RedoInsts.insert(BO);
- V = Factors[0].Op;
- } else {
- RewriteExprTree(BO, Factors);
- V = BO;
- }
-
- if (NeedsNegate)
- V = CreateNeg(V, "neg", &*InsertPt, BO);
-
- return V;
-}
-
-/// If V is a single-use multiply, recursively add its operands as factors,
-/// otherwise add V to the list of factors.
-///
-/// Ops is the top-level list of add operands we're trying to factor.
-static void FindSingleUseMultiplyFactors(Value *V,
- SmallVectorImpl<Value*> &Factors) {
- BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
- if (!BO) {
- Factors.push_back(V);
- return;
- }
-
- // Otherwise, add the LHS and RHS to the list of factors.
- FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
- FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
-}
-
-/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
-/// This optimizes based on identities. If it can be reduced to a single Value,
-/// it is returned, otherwise the Ops list is mutated as necessary.
-static Value *OptimizeAndOrXor(unsigned Opcode,
- SmallVectorImpl<ValueEntry> &Ops) {
- // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
- // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- // First, check for X and ~X in the operand list.
- assert(i < Ops.size());
- Value *X;
- if (match(Ops[i].Op, m_Not(m_Value(X)))) { // Cannot occur for ^.
- unsigned FoundX = FindInOperandList(Ops, i, X);
- if (FoundX != i) {
- if (Opcode == Instruction::And) // ...&X&~X = 0
- return Constant::getNullValue(X->getType());
-
- if (Opcode == Instruction::Or) // ...|X|~X = -1
- return Constant::getAllOnesValue(X->getType());
- }
- }
-
- // Next, check for duplicate pairs of values, which we assume are next to
- // each other, due to our sorting criteria.
- assert(i < Ops.size());
- if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
- if (Opcode == Instruction::And || Opcode == Instruction::Or) {
- // Drop duplicate values for And and Or.
- Ops.erase(Ops.begin()+i);
- --i; --e;
- ++NumAnnihil;
- continue;
- }
-
- // Drop pairs of values for Xor.
- assert(Opcode == Instruction::Xor);
- if (e == 2)
- return Constant::getNullValue(Ops[0].Op->getType());
-
- // Y ^ X^X -> Y
- Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
- i -= 1; e -= 2;
- ++NumAnnihil;
- }
- }
- return nullptr;
-}
-
-/// Helper function of CombineXorOpnd(). It creates a bitwise-and
-/// instruction with the given two operands, and return the resulting
-/// instruction. There are two special cases: 1) if the constant operand is 0,
-/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
-/// be returned.
-static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
- const APInt &ConstOpnd) {
- if (ConstOpnd.isNullValue())
- return nullptr;
-
- if (ConstOpnd.isAllOnesValue())
- return Opnd;
-
- Instruction *I = BinaryOperator::CreateAnd(
- Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
- InsertBefore);
- I->setDebugLoc(InsertBefore->getDebugLoc());
- return I;
-}
-
-// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
-// into "R ^ C", where C would be 0, and R is a symbolic value.
-//
-// If it was successful, true is returned, and the "R" and "C" is returned
-// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
-// and both "Res" and "ConstOpnd" remain unchanged.
-bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
- APInt &ConstOpnd, Value *&Res) {
- // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
- // = ((x | c1) ^ c1) ^ (c1 ^ c2)
- // = (x & ~c1) ^ (c1 ^ c2)
- // It is useful only when c1 == c2.
- if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
- return false;
-
- if (!Opnd1->getValue()->hasOneUse())
- return false;
-
- const APInt &C1 = Opnd1->getConstPart();
- if (C1 != ConstOpnd)
- return false;
-
- Value *X = Opnd1->getSymbolicPart();
- Res = createAndInstr(I, X, ~C1);
- // ConstOpnd was C2, now C1 ^ C2.
- ConstOpnd ^= C1;
-
- if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
- RedoInsts.insert(T);
- return true;
-}
-
-// Helper function of OptimizeXor(). It tries to simplify
-// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
-// symbolic value.
-//
-// If it was successful, true is returned, and the "R" and "C" is returned
-// via "Res" and "ConstOpnd", respectively (If the entire expression is
-// evaluated to a constant, the Res is set to NULL); otherwise, false is
-// returned, and both "Res" and "ConstOpnd" remain unchanged.
-bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
- XorOpnd *Opnd2, APInt &ConstOpnd,
- Value *&Res) {
- Value *X = Opnd1->getSymbolicPart();
- if (X != Opnd2->getSymbolicPart())
- return false;
-
- // This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
- int DeadInstNum = 1;
- if (Opnd1->getValue()->hasOneUse())
- DeadInstNum++;
- if (Opnd2->getValue()->hasOneUse())
- DeadInstNum++;
-
- // Xor-Rule 2:
- // (x | c1) ^ (x & c2)
- // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
- // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
- // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
- //
- if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
- if (Opnd2->isOrExpr())
- std::swap(Opnd1, Opnd2);
-
- const APInt &C1 = Opnd1->getConstPart();
- const APInt &C2 = Opnd2->getConstPart();
- APInt C3((~C1) ^ C2);
-
- // Do not increase code size!
- if (!C3.isNullValue() && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
- if (NewInstNum > DeadInstNum)
- return false;
- }
-
- Res = createAndInstr(I, X, C3);
- ConstOpnd ^= C1;
- } else if (Opnd1->isOrExpr()) {
- // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
- //
- const APInt &C1 = Opnd1->getConstPart();
- const APInt &C2 = Opnd2->getConstPart();
- APInt C3 = C1 ^ C2;
-
- // Do not increase code size
- if (!C3.isNullValue() && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
- if (NewInstNum > DeadInstNum)
- return false;
- }
-
- Res = createAndInstr(I, X, C3);
- ConstOpnd ^= C3;
- } else {
- // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
- //
- const APInt &C1 = Opnd1->getConstPart();
- const APInt &C2 = Opnd2->getConstPart();
- APInt C3 = C1 ^ C2;
- Res = createAndInstr(I, X, C3);
- }
-
- // Put the original operands in the Redo list; hope they will be deleted
- // as dead code.
- if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
- RedoInsts.insert(T);
- if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
- RedoInsts.insert(T);
-
- return true;
-}
-
-/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
-/// to a single Value, it is returned, otherwise the Ops list is mutated as
-/// necessary.
-Value *ReassociatePass::OptimizeXor(Instruction *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
- return V;
-
- if (Ops.size() == 1)
- return nullptr;
-
- SmallVector<XorOpnd, 8> Opnds;
- SmallVector<XorOpnd*, 8> OpndPtrs;
- Type *Ty = Ops[0].Op->getType();
- APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
-
- // Step 1: Convert ValueEntry to XorOpnd
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- Value *V = Ops[i].Op;
- const APInt *C;
- // TODO: Support non-splat vectors.
- if (match(V, m_APInt(C))) {
- ConstOpnd ^= *C;
- } else {
- XorOpnd O(V);
- O.setSymbolicRank(getRank(O.getSymbolicPart()));
- Opnds.push_back(O);
- }
- }
-
- // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
- // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
- // the "OpndPtrs" as well. For the similar reason, do not fuse this loop
- // with the previous loop --- the iterator of the "Opnds" may be invalidated
- // when new elements are added to the vector.
- for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
- OpndPtrs.push_back(&Opnds[i]);
-
- // Step 2: Sort the Xor-Operands in a way such that the operands containing
- // the same symbolic value cluster together. For instance, the input operand
- // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
- // ("x | 123", "x & 789", "y & 456").
- //
- // The purpose is twofold:
- // 1) Cluster together the operands sharing the same symbolic-value.
- // 2) Operand having smaller symbolic-value-rank is permuted earlier, which
- // could potentially shorten crital path, and expose more loop-invariants.
- // Note that values' rank are basically defined in RPO order (FIXME).
- // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier
- // than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
- // "z" in the order of X-Y-Z is better than any other orders.
- llvm::stable_sort(OpndPtrs, [](XorOpnd *LHS, XorOpnd *RHS) {
- return LHS->getSymbolicRank() < RHS->getSymbolicRank();
- });
-
- // Step 3: Combine adjacent operands
- XorOpnd *PrevOpnd = nullptr;
- bool Changed = false;
- for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
- XorOpnd *CurrOpnd = OpndPtrs[i];
- // The combined value
- Value *CV;
-
- // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
- if (!ConstOpnd.isNullValue() &&
- CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
- Changed = true;
- if (CV)
- *CurrOpnd = XorOpnd(CV);
- else {
- CurrOpnd->Invalidate();
- continue;
- }
- }
-
- if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
- PrevOpnd = CurrOpnd;
- continue;
- }
-
- // step 3.2: When previous and current operands share the same symbolic
- // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
- if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
- // Remove previous operand
- PrevOpnd->Invalidate();
- if (CV) {
- *CurrOpnd = XorOpnd(CV);
- PrevOpnd = CurrOpnd;
- } else {
- CurrOpnd->Invalidate();
- PrevOpnd = nullptr;
- }
- Changed = true;
- }
- }
-
- // Step 4: Reassemble the Ops
- if (Changed) {
- Ops.clear();
- for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
- XorOpnd &O = Opnds[i];
- if (O.isInvalid())
- continue;
- ValueEntry VE(getRank(O.getValue()), O.getValue());
- Ops.push_back(VE);
- }
- if (!ConstOpnd.isNullValue()) {
- Value *C = ConstantInt::get(Ty, ConstOpnd);
- ValueEntry VE(getRank(C), C);
- Ops.push_back(VE);
- }
- unsigned Sz = Ops.size();
- if (Sz == 1)
- return Ops.back().Op;
- if (Sz == 0) {
- assert(ConstOpnd.isNullValue());
- return ConstantInt::get(Ty, ConstOpnd);
- }
- }
-
- return nullptr;
-}
-
-/// Optimize a series of operands to an 'add' instruction. This
-/// optimizes based on identities. If it can be reduced to a single Value, it
-/// is returned, otherwise the Ops list is mutated as necessary.
-Value *ReassociatePass::OptimizeAdd(Instruction *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- // Scan the operand lists looking for X and -X pairs. If we find any, we
- // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it,
- // scan for any
- // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
-
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- Value *TheOp = Ops[i].Op;
- // Check to see if we've seen this operand before. If so, we factor all
- // instances of the operand together. Due to our sorting criteria, we know
- // that these need to be next to each other in the vector.
- if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
- // Rescan the list, remove all instances of this operand from the expr.
- unsigned NumFound = 0;
- do {
- Ops.erase(Ops.begin()+i);
- ++NumFound;
- } while (i != Ops.size() && Ops[i].Op == TheOp);
-
- LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp
- << '\n');
- ++NumFactor;
-
- // Insert a new multiply.
- Type *Ty = TheOp->getType();
- Constant *C = Ty->isIntOrIntVectorTy() ?
- ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
- Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
-
- // Now that we have inserted a multiply, optimize it. This allows us to
- // handle cases that require multiple factoring steps, such as this:
- // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
- RedoInsts.insert(Mul);
-
- // If every add operand was a duplicate, return the multiply.
- if (Ops.empty())
- return Mul;
-
- // Otherwise, we had some input that didn't have the dupe, such as
- // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
- // things being added by this operation.
- Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
-
- --i;
- e = Ops.size();
- continue;
- }
-
- // Check for X and -X or X and ~X in the operand list.
- Value *X;
- if (!match(TheOp, m_Neg(m_Value(X))) && !match(TheOp, m_Not(m_Value(X))) &&
- !match(TheOp, m_FNeg(m_Value(X))))
- continue;
-
- unsigned FoundX = FindInOperandList(Ops, i, X);
- if (FoundX == i)
- continue;
-
- // Remove X and -X from the operand list.
- if (Ops.size() == 2 &&
- (match(TheOp, m_Neg(m_Value())) || match(TheOp, m_FNeg(m_Value()))))
- return Constant::getNullValue(X->getType());
-
- // Remove X and ~X from the operand list.
- if (Ops.size() == 2 && match(TheOp, m_Not(m_Value())))
- return Constant::getAllOnesValue(X->getType());
-
- Ops.erase(Ops.begin()+i);
- if (i < FoundX)
- --FoundX;
- else
- --i; // Need to back up an extra one.
- Ops.erase(Ops.begin()+FoundX);
- ++NumAnnihil;
- --i; // Revisit element.
- e -= 2; // Removed two elements.
-
- // if X and ~X we append -1 to the operand list.
- if (match(TheOp, m_Not(m_Value()))) {
- Value *V = Constant::getAllOnesValue(X->getType());
- Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
- e += 1;
- }
- }
-
- // Scan the operand list, checking to see if there are any common factors
- // between operands. Consider something like A*A+A*B*C+D. We would like to
- // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
- // To efficiently find this, we count the number of times a factor occurs
- // for any ADD operands that are MULs.
- DenseMap<Value*, unsigned> FactorOccurrences;
-
- // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
- // where they are actually the same multiply.
- unsigned MaxOcc = 0;
- Value *MaxOccVal = nullptr;
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- BinaryOperator *BOp =
- isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
- if (!BOp)
- continue;
-
- // Compute all of the factors of this added value.
- SmallVector<Value*, 8> Factors;
- FindSingleUseMultiplyFactors(BOp, Factors);
- assert(Factors.size() > 1 && "Bad linearize!");
-
- // Add one to FactorOccurrences for each unique factor in this op.
- SmallPtrSet<Value*, 8> Duplicates;
- for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
- Value *Factor = Factors[i];
- if (!Duplicates.insert(Factor).second)
- continue;
-
- unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) {
- MaxOcc = Occ;
- MaxOccVal = Factor;
- }
-
- // If Factor is a negative constant, add the negated value as a factor
- // because we can percolate the negate out. Watch for minint, which
- // cannot be positivified.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
- if (CI->isNegative() && !CI->isMinValue(true)) {
- Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
- if (!Duplicates.insert(Factor).second)
- continue;
- unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) {
- MaxOcc = Occ;
- MaxOccVal = Factor;
- }
- }
- } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
- if (CF->isNegative()) {
- APFloat F(CF->getValueAPF());
- F.changeSign();
- Factor = ConstantFP::get(CF->getContext(), F);
- if (!Duplicates.insert(Factor).second)
- continue;
- unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) {
- MaxOcc = Occ;
- MaxOccVal = Factor;
- }
- }
- }
- }
- }
-
- // If any factor occurred more than one time, we can pull it out.
- if (MaxOcc > 1) {
- LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal
- << '\n');
- ++NumFactor;
-
- // Create a new instruction that uses the MaxOccVal twice. If we don't do
- // this, we could otherwise run into situations where removing a factor
- // from an expression will drop a use of maxocc, and this can cause
- // RemoveFactorFromExpression on successive values to behave differently.
- Instruction *DummyInst =
- I->getType()->isIntOrIntVectorTy()
- ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
- : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
-
- SmallVector<WeakTrackingVH, 4> NewMulOps;
- for (unsigned i = 0; i != Ops.size(); ++i) {
- // Only try to remove factors from expressions we're allowed to.
- BinaryOperator *BOp =
- isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
- if (!BOp)
- continue;
-
- if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
- // The factorized operand may occur several times. Convert them all in
- // one fell swoop.
- for (unsigned j = Ops.size(); j != i;) {
- --j;
- if (Ops[j].Op == Ops[i].Op) {
- NewMulOps.push_back(V);
- Ops.erase(Ops.begin()+j);
- }
- }
- --i;
- }
- }
-
- // No need for extra uses anymore.
- DummyInst->deleteValue();
-
- unsigned NumAddedValues = NewMulOps.size();
- Value *V = EmitAddTreeOfValues(I, NewMulOps);
-
- // Now that we have inserted the add tree, optimize it. This allows us to
- // handle cases that require multiple factoring steps, such as this:
- // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
- assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
- (void)NumAddedValues;
- if (Instruction *VI = dyn_cast<Instruction>(V))
- RedoInsts.insert(VI);
-
- // Create the multiply.
- Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I);
-
- // Rerun associate on the multiply in case the inner expression turned into
- // a multiply. We want to make sure that we keep things in canonical form.
- RedoInsts.insert(V2);
-
- // If every add operand included the factor (e.g. "A*B + A*C"), then the
- // entire result expression is just the multiply "A*(B+C)".
- if (Ops.empty())
- return V2;
-
- // Otherwise, we had some input that didn't have the factor, such as
- // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
- // things being added by this operation.
- Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
- }
-
- return nullptr;
-}
-
-/// Build up a vector of value/power pairs factoring a product.
-///
-/// Given a series of multiplication operands, build a vector of factors and
-/// the powers each is raised to when forming the final product. Sort them in
-/// the order of descending power.
-///
-/// (x*x) -> [(x, 2)]
-/// ((x*x)*x) -> [(x, 3)]
-/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
-///
-/// \returns Whether any factors have a power greater than one.
-static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
- SmallVectorImpl<Factor> &Factors) {
- // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
- // Compute the sum of powers of simplifiable factors.
- unsigned FactorPowerSum = 0;
- for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
- Value *Op = Ops[Idx-1].Op;
-
- // Count the number of occurrences of this value.
- unsigned Count = 1;
- for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
- ++Count;
- // Track for simplification all factors which occur 2 or more times.
- if (Count > 1)
- FactorPowerSum += Count;
- }
-
- // We can only simplify factors if the sum of the powers of our simplifiable
- // factors is 4 or higher. When that is the case, we will *always* have
- // a simplification. This is an important invariant to prevent cyclically
- // trying to simplify already minimal formations.
- if (FactorPowerSum < 4)
- return false;
-
- // Now gather the simplifiable factors, removing them from Ops.
- FactorPowerSum = 0;
- for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
- Value *Op = Ops[Idx-1].Op;
-
- // Count the number of occurrences of this value.
- unsigned Count = 1;
- for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
- ++Count;
- if (Count == 1)
- continue;
- // Move an even number of occurrences to Factors.
- Count &= ~1U;
- Idx -= Count;
- FactorPowerSum += Count;
- Factors.push_back(Factor(Op, Count));
- Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
- }
-
- // None of the adjustments above should have reduced the sum of factor powers
- // below our minimum of '4'.
- assert(FactorPowerSum >= 4);
-
- llvm::stable_sort(Factors, [](const Factor &LHS, const Factor &RHS) {
- return LHS.Power > RHS.Power;
- });
- return true;
-}
-
-/// Build a tree of multiplies, computing the product of Ops.
-static Value *buildMultiplyTree(IRBuilderBase &Builder,
- SmallVectorImpl<Value*> &Ops) {
- if (Ops.size() == 1)
- return Ops.back();
-
- Value *LHS = Ops.pop_back_val();
- do {
- if (LHS->getType()->isIntOrIntVectorTy())
- LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
- else
- LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
- } while (!Ops.empty());
-
- return LHS;
-}
-
-/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
-///
-/// Given a vector of values raised to various powers, where no two values are
-/// equal and the powers are sorted in decreasing order, compute the minimal
-/// DAG of multiplies to compute the final product, and return that product
-/// value.
-Value *
-ReassociatePass::buildMinimalMultiplyDAG(IRBuilderBase &Builder,
- SmallVectorImpl<Factor> &Factors) {
- assert(Factors[0].Power);
- SmallVector<Value *, 4> OuterProduct;
- for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
- Idx < Size && Factors[Idx].Power > 0; ++Idx) {
- if (Factors[Idx].Power != Factors[LastIdx].Power) {
- LastIdx = Idx;
- continue;
- }
-
- // We want to multiply across all the factors with the same power so that
- // we can raise them to that power as a single entity. Build a mini tree
- // for that.
- SmallVector<Value *, 4> InnerProduct;
- InnerProduct.push_back(Factors[LastIdx].Base);
- do {
- InnerProduct.push_back(Factors[Idx].Base);
- ++Idx;
- } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
-
- // Reset the base value of the first factor to the new expression tree.
- // We'll remove all the factors with the same power in a second pass.
- Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
- if (Instruction *MI = dyn_cast<Instruction>(M))
- RedoInsts.insert(MI);
-
- LastIdx = Idx;
- }
- // Unique factors with equal powers -- we've folded them into the first one's
- // base.
- Factors.erase(std::unique(Factors.begin(), Factors.end(),
- [](const Factor &LHS, const Factor &RHS) {
- return LHS.Power == RHS.Power;
- }),
- Factors.end());
-
- // Iteratively collect the base of each factor with an odd power into the
- // outer product, and halve each power in preparation for squaring the
- // expression.
- for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
- if (Factors[Idx].Power & 1)
- OuterProduct.push_back(Factors[Idx].Base);
- Factors[Idx].Power >>= 1;
- }
- if (Factors[0].Power) {
- Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
- OuterProduct.push_back(SquareRoot);
- OuterProduct.push_back(SquareRoot);
- }
- if (OuterProduct.size() == 1)
- return OuterProduct.front();
-
- Value *V = buildMultiplyTree(Builder, OuterProduct);
- return V;
-}
-
-Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- // We can only optimize the multiplies when there is a chain of more than
- // three, such that a balanced tree might require fewer total multiplies.
- if (Ops.size() < 4)
- return nullptr;
-
- // Try to turn linear trees of multiplies without other uses of the
- // intermediate stages into minimal multiply DAGs with perfect sub-expression
- // re-use.
- SmallVector<Factor, 4> Factors;
- if (!collectMultiplyFactors(Ops, Factors))
- return nullptr; // All distinct factors, so nothing left for us to do.
-
- IRBuilder<> Builder(I);
- // The reassociate transformation for FP operations is performed only
- // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
- // to the newly generated operations.
- if (auto FPI = dyn_cast<FPMathOperator>(I))
- Builder.setFastMathFlags(FPI->getFastMathFlags());
-
- Value *V = buildMinimalMultiplyDAG(Builder, Factors);
- if (Ops.empty())
- return V;
-
- ValueEntry NewEntry = ValueEntry(getRank(V), V);
- Ops.insert(llvm::lower_bound(Ops, NewEntry), NewEntry);
- return nullptr;
-}
-
-Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- // Now that we have the linearized expression tree, try to optimize it.
- // Start by folding any constants that we found.
- Constant *Cst = nullptr;
- unsigned Opcode = I->getOpcode();
- while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
- Constant *C = cast<Constant>(Ops.pop_back_val().Op);
- Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
- }
- // If there was nothing but constants then we are done.
- if (Ops.empty())
- return Cst;
-
- // Put the combined constant back at the end of the operand list, except if
- // there is no point. For example, an add of 0 gets dropped here, while a
- // multiplication by zero turns the whole expression into zero.
- if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
- if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
- return Cst;
- Ops.push_back(ValueEntry(0, Cst));
- }
-
- if (Ops.size() == 1) return Ops[0].Op;
-
- // Handle destructive annihilation due to identities between elements in the
- // argument list here.
- unsigned NumOps = Ops.size();
- switch (Opcode) {
- default: break;
- case Instruction::And:
- case Instruction::Or:
- if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
- return Result;
- break;
-
- case Instruction::Xor:
- if (Value *Result = OptimizeXor(I, Ops))
- return Result;
- break;
-
- case Instruction::Add:
- case Instruction::FAdd:
- if (Value *Result = OptimizeAdd(I, Ops))
- return Result;
- break;
-
- case Instruction::Mul:
- case Instruction::FMul:
- if (Value *Result = OptimizeMul(I, Ops))
- return Result;
- break;
- }
-
- if (Ops.size() != NumOps)
- return OptimizeExpression(I, Ops);
- return nullptr;
-}
-
-// Remove dead instructions and if any operands are trivially dead add them to
-// Insts so they will be removed as well.
-void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
- OrderedSet &Insts) {
- assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ Value *V2 = EmitAddTreeOfValues(I, Ops);
+ return CreateAdd(V2, V1, "reass.add", I, I);
+}
+
+/// If V is an expression tree that is a multiplication sequence,
+/// and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
+Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO)
+ return nullptr;
+
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(BO, Tree);
+ SmallVector<ValueEntry, 8> Factors;
+ Factors.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Factors.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ bool FoundFactor = false;
+ bool NeedsNegate = false;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ if (Factors[i].Op == Factor) {
+ FoundFactor = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+
+ // If this is a negative version of this factor, remove it.
+ if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
+ if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+ if (FC1->getValue() == -FC2->getValue()) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+ } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+ if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+ const APFloat &F1 = FC1->getValueAPF();
+ APFloat F2(FC2->getValueAPF());
+ F2.changeSign();
+ if (F1 == F2) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin() + i);
+ break;
+ }
+ }
+ }
+ }
+
+ if (!FoundFactor) {
+ // Make sure to restore the operands to the expression tree.
+ RewriteExprTree(BO, Factors);
+ return nullptr;
+ }
+
+ BasicBlock::iterator InsertPt = ++BO->getIterator();
+
+ // If this was just a single multiply, remove the multiply and return the only
+ // remaining operand.
+ if (Factors.size() == 1) {
+ RedoInsts.insert(BO);
+ V = Factors[0].Op;
+ } else {
+ RewriteExprTree(BO, Factors);
+ V = BO;
+ }
+
+ if (NeedsNegate)
+ V = CreateNeg(V, "neg", &*InsertPt, BO);
+
+ return V;
+}
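As a rough standalone sketch (not part of this patch; the names are made up), the factor-removal step above boils down to deleting one occurrence of the requested factor from the linearized product, accepting a sign-flipped constant as a match and remembering that the remaining product then needs a negate:

#include <cassert>
#include <cstddef>
#include <vector>

// Sketch: remove one occurrence of `factor` from a linearized product of
// integer factors. Like RemoveFactorFromExpression, a sign-flipped constant
// also counts as a match, but then the remaining product must be negated.
static bool removeFactor(std::vector<long> &factors, long factor,
                         bool &needsNegate) {
  for (std::size_t i = 0; i < factors.size(); ++i) {
    if (factors[i] == factor || factors[i] == -factor) {
      needsNegate = (factors[i] != factor);
      factors.erase(factors.begin() + i);
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<long> f = {3, -5, 7};   // stands for 3 * -5 * 7
  bool neg = false;
  assert(removeFactor(f, 5, neg));    // asked for 5, matched -5
  assert(neg && f.size() == 2);       // remaining 3 * 7 must be negated
}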
+
+/// If V is a single-use multiply, recursively add its operands as factors,
+/// otherwise add V to the list of factors.
+///
+/// Ops is the top-level list of add operands we're trying to factor.
+static void FindSingleUseMultiplyFactors(Value *V,
+ SmallVectorImpl<Value*> &Factors) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO) {
+ Factors.push_back(V);
+ return;
+ }
+
+ // Otherwise, add the LHS and RHS to the list of factors.
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
+}
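A toy model of the recursion above (all types and names here are invented for illustration, not part of the patch): flatten a multiply tree into its leaf factors, visiting the right-hand side first just as the code does.

#include <cassert>
#include <vector>

// Toy expression node: either a leaf value or a multiply of two sub-trees.
struct Expr {
  int leaf = 0;
  const Expr *lhs = nullptr, *rhs = nullptr;
  bool isMul() const { return lhs && rhs; }
};

// Recursively flatten a multiply tree into its leaf factors,
// mirroring FindSingleUseMultiplyFactors' traversal order (RHS first).
static void collectFactors(const Expr *e, std::vector<int> &factors) {
  if (!e->isMul()) {
    factors.push_back(e->leaf);
    return;
  }
  collectFactors(e->rhs, factors);
  collectFactors(e->lhs, factors);
}

int main() {
  Expr a{2}, b{3}, c{5};
  Expr ab{0, &a, &b};          // (a * b)
  Expr abc{0, &ab, &c};        // (a * b) * c
  std::vector<int> factors;
  collectFactors(&abc, factors);
  assert((factors == std::vector<int>{5, 3, 2}));  // c, b, a: RHS first
}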
+
+/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
+/// This optimizes based on identities. If it can be reduced to a single Value,
+/// it is returned, otherwise the Ops list is mutated as necessary.
+static Value *OptimizeAndOrXor(unsigned Opcode,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+ // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ // First, check for X and ~X in the operand list.
+ assert(i < Ops.size());
+ Value *X;
+ if (match(Ops[i].Op, m_Not(m_Value(X)))) { // Cannot occur for ^.
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX != i) {
+ if (Opcode == Instruction::And) // ...&X&~X = 0
+ return Constant::getNullValue(X->getType());
+
+ if (Opcode == Instruction::Or) // ...|X|~X = -1
+ return Constant::getAllOnesValue(X->getType());
+ }
+ }
+
+ // Next, check for duplicate pairs of values, which we assume are next to
+ // each other, due to our sorting criteria.
+ assert(i < Ops.size());
+ if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+ if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+ // Drop duplicate values for And and Or.
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ ++NumAnnihil;
+ continue;
+ }
+
+ // Drop pairs of values for Xor.
+ assert(Opcode == Instruction::Xor);
+ if (e == 2)
+ return Constant::getNullValue(Ops[0].Op->getType());
+
+ // Y ^ X^X -> Y
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+ i -= 1; e -= 2;
+ ++NumAnnihil;
+ }
+ }
+ return nullptr;
+}
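The identities this routine relies on (X & ~X == 0, X | ~X == all ones, X ^ X == 0, and duplicates being redundant for and/or) can be sanity-checked with ordinary unsigned arithmetic; a minimal standalone check, included here only as an illustration:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 0x1234u, 0xFFFFFFFFu}) {
    assert((x & ~x) == 0u);                  // ...&X&~X == 0
    assert((x | ~x) == 0xFFFFFFFFu);         // ...|X|~X == -1 (all ones)
    assert((x ^ x) == 0u);                   // X^X pairs cancel under xor
    assert((x & x) == x && (x | x) == x);    // duplicates are redundant
  }
}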
+
+/// Helper function of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and returns the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
+/// be returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+ const APInt &ConstOpnd) {
+ if (ConstOpnd.isNullValue())
+ return nullptr;
+
+ if (ConstOpnd.isAllOnesValue())
+ return Opnd;
+
+ Instruction *I = BinaryOperator::CreateAnd(
+ Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
+ InsertBefore);
+ I->setDebugLoc(InsertBefore->getDebugLoc());
+ return I;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ APInt &ConstOpnd, Value *&Res) {
+ // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
+ // = ((x | c1) ^ c1) ^ (c1 ^ c2)
+ // = (x & ~c1) ^ (c1 ^ c2)
+ // It is useful only when c1 == c2.
+ if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
+ return false;
+
+ if (!Opnd1->getValue()->hasOneUse())
+ return false;
+
+ const APInt &C1 = Opnd1->getConstPart();
+ if (C1 != ConstOpnd)
+ return false;
+
+ Value *X = Opnd1->getSymbolicPart();
+ Res = createAndInstr(I, X, ~C1);
+ // ConstOpnd was C2, now C1 ^ C2.
+ ConstOpnd ^= C1;
+
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ return true;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively (if the entire expression
+// evaluates to a constant, Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ XorOpnd *Opnd2, APInt &ConstOpnd,
+ Value *&Res) {
+ Value *X = Opnd1->getSymbolicPart();
+ if (X != Opnd2->getSymbolicPart())
+ return false;
+
+ // This many instructions become dead. (At least "Opnd1 ^ Opnd2" will die.)
+ int DeadInstNum = 1;
+ if (Opnd1->getValue()->hasOneUse())
+ DeadInstNum++;
+ if (Opnd2->getValue()->hasOneUse())
+ DeadInstNum++;
+
+ // Xor-Rule 2:
+ // (x | c1) ^ (x & c2)
+ // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+ // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
+ // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
+ //
+ if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+ if (Opnd2->isOrExpr())
+ std::swap(Opnd1, Opnd2);
+
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3((~C1) ^ C2);
+
+ // Do not increase code size!
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C1;
+ } else if (Opnd1->isOrExpr()) {
+ // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+
+ // Do not increase code size
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C3;
+ } else {
+ // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+ Res = createAndInstr(I, X, C3);
+ }
+
+ // Put the original operands in the Redo list; hope they will be deleted
+ // as dead code.
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+ RedoInsts.insert(T);
+
+ return true;
+}
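The four Xor-Rules quoted in the comments above are ordinary Boolean-algebra identities; the following standalone brute-force check over 8-bit values (illustration only, not part of the patch) confirms each of them:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned a = 0; a < 256; ++a)
      for (unsigned b = 0; b < 256; ++b) {
        uint8_t X = (uint8_t)x, C1 = (uint8_t)a, C2 = (uint8_t)b;
        uint8_t NotC1 = (uint8_t)~C1;
        // Xor-Rule 1: (x | c1) ^ c2 == (x & ~c1) ^ (c1 ^ c2)
        assert((uint8_t)((X | C1) ^ C2) == (uint8_t)((X & NotC1) ^ (C1 ^ C2)));
        // Xor-Rule 2: (x | c1) ^ (x & c2) == (x & (~c1 ^ c2)) ^ c1
        assert((uint8_t)((X | C1) ^ (X & C2)) ==
               (uint8_t)((X & (uint8_t)(NotC1 ^ C2)) ^ C1));
        // Xor-Rule 3: (x | c1) ^ (x | c2) == (x & c3) ^ c3, where c3 = c1 ^ c2
        assert((uint8_t)((X | C1) ^ (X | C2)) ==
               (uint8_t)((X & (C1 ^ C2)) ^ (C1 ^ C2)));
        // Xor-Rule 4: (x & c1) ^ (x & c2) == x & (c1 ^ c2)
        assert((uint8_t)((X & C1) ^ (X & C2)) == (uint8_t)(X & (C1 ^ C2)));
      }
}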
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *ReassociatePass::OptimizeXor(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+ return V;
+
+ if (Ops.size() == 1)
+ return nullptr;
+
+ SmallVector<XorOpnd, 8> Opnds;
+ SmallVector<XorOpnd*, 8> OpndPtrs;
+ Type *Ty = Ops[0].Op->getType();
+ APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
+
+ // Step 1: Convert ValueEntry to XorOpnd
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *V = Ops[i].Op;
+ const APInt *C;
+ // TODO: Support non-splat vectors.
+ if (match(V, m_APInt(C))) {
+ ConstOpnd ^= *C;
+ } else {
+ XorOpnd O(V);
+ O.setSymbolicRank(getRank(O.getSymbolicPart()));
+ Opnds.push_back(O);
+ }
+ }
+
+ // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
+ // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
+ // the "OpndPtrs" as well. For the similar reason, do not fuse this loop
+ // with the previous loop --- the iterator of the "Opnds" may be invalidated
+ // when new elements are added to the vector.
+ for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
+ OpndPtrs.push_back(&Opnds[i]);
+
+ // Step 2: Sort the Xor-Operands in a way such that the operands containing
+ // the same symbolic value cluster together. For instance, the input operand
+ // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+ // ("x | 123", "x & 789", "y & 456").
+ //
+ // The purpose is twofold:
+ // 1) Cluster together the operands sharing the same symbolic-value.
+ // 2) Operands with a smaller symbolic-value rank are permuted earlier, which
+ //    could potentially shorten the critical path and expose more
+ //    loop-invariants. Note that values' ranks are basically defined in RPO
+ //    order (FIXME). So, if Rank(X) < Rank(Y) < Rank(Z), X is defined earlier
+ //    than Y, which is defined earlier than Z. Permuting "x | 1", "y & 2", "z"
+ //    in the order X-Y-Z is better than any other order.
+ llvm::stable_sort(OpndPtrs, [](XorOpnd *LHS, XorOpnd *RHS) {
+ return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+ });
+
+ // Step 3: Combine adjacent operands
+ XorOpnd *PrevOpnd = nullptr;
+ bool Changed = false;
+ for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd *CurrOpnd = OpndPtrs[i];
+ // The combined value
+ Value *CV;
+
+ // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+ if (!ConstOpnd.isNullValue() &&
+ CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ Changed = true;
+ if (CV)
+ *CurrOpnd = XorOpnd(CV);
+ else {
+ CurrOpnd->Invalidate();
+ continue;
+ }
+ }
+
+ if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+ PrevOpnd = CurrOpnd;
+ continue;
+ }
+
+ // Step 3.2: When previous and current operands share the same symbolic
+ // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
+ if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+ // Remove previous operand
+ PrevOpnd->Invalidate();
+ if (CV) {
+ *CurrOpnd = XorOpnd(CV);
+ PrevOpnd = CurrOpnd;
+ } else {
+ CurrOpnd->Invalidate();
+ PrevOpnd = nullptr;
+ }
+ Changed = true;
+ }
+ }
+
+ // Step 4: Reassemble the Ops
+ if (Changed) {
+ Ops.clear();
+ for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd &O = Opnds[i];
+ if (O.isInvalid())
+ continue;
+ ValueEntry VE(getRank(O.getValue()), O.getValue());
+ Ops.push_back(VE);
+ }
+ if (!ConstOpnd.isNullValue()) {
+ Value *C = ConstantInt::get(Ty, ConstOpnd);
+ ValueEntry VE(getRank(C), C);
+ Ops.push_back(VE);
+ }
+ unsigned Sz = Ops.size();
+ if (Sz == 1)
+ return Ops.back().Op;
+ if (Sz == 0) {
+ assert(ConstOpnd.isNullValue());
+ return ConstantInt::get(Ty, ConstOpnd);
+ }
+ }
+
+ return nullptr;
+}
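Step 2 above only needs a stable sort by symbolic rank so that operands sharing a symbolic value end up adjacent for Step 3; a minimal model of that clustering (invented names, not part of the patch):

#include <algorithm>
#include <cassert>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (symbolic value, rank) pairs standing in for XorOpnds such as "x | 123".
  std::vector<std::pair<std::string, unsigned>> opnds = {
      {"x", 1}, {"y", 2}, {"x", 1}, {"z", 3}};
  // A stable sort by symbolic rank clusters equal symbolic values together,
  // which is all Step 3 needs in order to combine adjacent operands.
  std::stable_sort(opnds.begin(), opnds.end(),
                   [](const auto &l, const auto &r) { return l.second < r.second; });
  assert(opnds[0].first == "x" && opnds[1].first == "x");
  assert(opnds[2].first == "y" && opnds[3].first == "z");
}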
+
+/// Optimize a series of operands to an 'add' instruction. This
+/// optimizes based on identities. If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary.
+Value *ReassociatePass::OptimizeAdd(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and -X pairs. If we find any, we
+ // can simplify expressions like X+-X == 0 and X+~X == -1. While we're at it,
+ // scan for any duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
+
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *TheOp = Ops[i].Op;
+ // Check to see if we've seen this operand before. If so, we factor all
+ // instances of the operand together. Due to our sorting criteria, we know
+ // that these need to be next to each other in the vector.
+ if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+ // Rescan the list, remove all instances of this operand from the expr.
+ unsigned NumFound = 0;
+ do {
+ Ops.erase(Ops.begin()+i);
+ ++NumFound;
+ } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp
+ << '\n');
+ ++NumFactor;
+
+ // Insert a new multiply.
+ Type *Ty = TheOp->getType();
+ Constant *C = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
+ Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
+
+ // Now that we have inserted a multiply, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+ RedoInsts.insert(Mul);
+
+ // If every add operand was a duplicate, return the multiply.
+ if (Ops.empty())
+ return Mul;
+
+ // Otherwise, we had some input that didn't have the dupe, such as
+ // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+ --i;
+ e = Ops.size();
+ continue;
+ }
+
+ // Check for X and -X or X and ~X in the operand list.
+ Value *X;
+ if (!match(TheOp, m_Neg(m_Value(X))) && !match(TheOp, m_Not(m_Value(X))) &&
+ !match(TheOp, m_FNeg(m_Value(X))))
+ continue;
+
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX == i)
+ continue;
+
+ // Remove X and -X from the operand list.
+ if (Ops.size() == 2 &&
+ (match(TheOp, m_Neg(m_Value())) || match(TheOp, m_FNeg(m_Value()))))
+ return Constant::getNullValue(X->getType());
+
+ // Remove X and ~X from the operand list.
+ if (Ops.size() == 2 && match(TheOp, m_Not(m_Value())))
+ return Constant::getAllOnesValue(X->getType());
+
+ Ops.erase(Ops.begin()+i);
+ if (i < FoundX)
+ --FoundX;
+ else
+ --i; // Need to back up an extra one.
+ Ops.erase(Ops.begin()+FoundX);
+ ++NumAnnihil;
+ --i; // Revisit element.
+ e -= 2; // Removed two elements.
+
+ // If we found X and ~X, append -1 to the operand list.
+ if (match(TheOp, m_Not(m_Value()))) {
+ Value *V = Constant::getAllOnesValue(X->getType());
+ Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+ e += 1;
+ }
+ }
+
+ // Scan the operand list, checking to see if there are any common factors
+ // between operands. Consider something like A*A+A*B*C+D. We would like to
+ // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+ // To efficiently find this, we count the number of times a factor occurs
+ // for any ADD operands that are MULs.
+ DenseMap<Value*, unsigned> FactorOccurrences;
+
+ // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+ // where they are actually the same multiply.
+ unsigned MaxOcc = 0;
+ Value *MaxOccVal = nullptr;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ // Compute all of the factors of this added value.
+ SmallVector<Value*, 8> Factors;
+ FindSingleUseMultiplyFactors(BOp, Factors);
+ assert(Factors.size() > 1 && "Bad linearize!");
+
+ // Add one to FactorOccurrences for each unique factor in this op.
+ SmallPtrSet<Value*, 8> Duplicates;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ Value *Factor = Factors[i];
+ if (!Duplicates.insert(Factor).second)
+ continue;
+
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+
+ // If Factor is a negative constant, add the negated value as a factor
+ // because we can percolate the negate out. Watch for minint, which
+ // cannot be positivified.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
+ if (CI->isNegative() && !CI->isMinValue(true)) {
+ Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+ if (CF->isNegative()) {
+ APFloat F(CF->getValueAPF());
+ F.changeSign();
+ Factor = ConstantFP::get(CF->getContext(), F);
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ }
+ }
+ }
+
+ // If any factor occurred more than one time, we can pull it out.
+ if (MaxOcc > 1) {
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal
+ << '\n');
+ ++NumFactor;
+
+ // Create a new instruction that uses the MaxOccVal twice. If we don't do
+ // this, we could otherwise run into situations where removing a factor
+ // from an expression will drop a use of maxocc, and this can cause
+ // RemoveFactorFromExpression on successive values to behave differently.
+ Instruction *DummyInst =
+ I->getType()->isIntOrIntVectorTy()
+ ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+ : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
+
+ SmallVector<WeakTrackingVH, 4> NewMulOps;
+ for (unsigned i = 0; i != Ops.size(); ++i) {
+ // Only try to remove factors from expressions we're allowed to.
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+ // The factorized operand may occur several times. Convert them all in
+ // one fell swoop.
+ for (unsigned j = Ops.size(); j != i;) {
+ --j;
+ if (Ops[j].Op == Ops[i].Op) {
+ NewMulOps.push_back(V);
+ Ops.erase(Ops.begin()+j);
+ }
+ }
+ --i;
+ }
+ }
+
+ // No need for extra uses anymore.
+ DummyInst->deleteValue();
+
+ unsigned NumAddedValues = NewMulOps.size();
+ Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+ // Now that we have inserted the add tree, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
+ assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+ (void)NumAddedValues;
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ RedoInsts.insert(VI);
+
+ // Create the multiply.
+ Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I);
+
+ // Rerun associate on the multiply in case the inner expression turned into
+ // a multiply. We want to make sure that we keep things in canonical form.
+ RedoInsts.insert(V2);
+
+ // If every add operand included the factor (e.g. "A*B + A*C"), then the
+ // entire result expression is just the multiply "A*(B+C)".
+ if (Ops.empty())
+ return V2;
+
+ // Otherwise, we had some input that didn't have the factor, such as
+ // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+ }
+
+ return nullptr;
+}
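Because the operand list is sorted, the Y+Y+Y+Z -> 3*Y+Z step reduces to a run-length count over adjacent equal entries; a small integer model of just that step (illustrative names, not part of the patch):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Collapse runs of equal operands in a sorted add-operand list into
// (value, multiplicity) pairs: Y+Y+Y+Z becomes {(Y,3), (Z,1)}, i.e. 3*Y + Z.
static std::vector<std::pair<int, unsigned>>
collapseDuplicates(const std::vector<int> &sortedOps) {
  std::vector<std::pair<int, unsigned>> result;
  for (std::size_t i = 0; i < sortedOps.size();) {
    std::size_t j = i;
    while (j < sortedOps.size() && sortedOps[j] == sortedOps[i])
      ++j;
    result.push_back({sortedOps[i], static_cast<unsigned>(j - i)});
    i = j;
  }
  return result;
}

int main() {
  auto r = collapseDuplicates({7, 7, 7, 9}); // Y=7 three times, Z=9 once
  assert(r.size() == 2 && r[0].second == 3 && r[1].second == 1);
}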
+
+/// Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power.
+///
+/// (x*x) -> [(x, 2)]
+/// ((x*x)*x) -> [(x, 3)]
+/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
+ // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+ // Compute the sum of powers of simplifiable factors.
+ unsigned FactorPowerSum = 0;
+ for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ // Track for simplification all factors which occur 2 or more times.
+ if (Count > 1)
+ FactorPowerSum += Count;
+ }
+
+ // We can only simplify factors if the sum of the powers of our simplifiable
+ // factors is 4 or higher. When that is the case, we will *always* have
+ // a simplification. This is an important invariant to prevent cyclically
+ // trying to simplify already minimal formations.
+ if (FactorPowerSum < 4)
+ return false;
+
+ // Now gather the simplifiable factors, removing them from Ops.
+ FactorPowerSum = 0;
+ for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ if (Count == 1)
+ continue;
+ // Move an even number of occurrences to Factors.
+ Count &= ~1U;
+ Idx -= Count;
+ FactorPowerSum += Count;
+ Factors.push_back(Factor(Op, Count));
+ Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+ }
+
+ // None of the adjustments above should have reduced the sum of factor powers
+ // below our minimum of '4'.
+ assert(FactorPowerSum >= 4);
+
+ llvm::stable_sort(Factors, [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power > RHS.Power;
+ });
+ return true;
+}
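A scalar sketch of the gathering rule above (names invented for illustration, not part of the patch): only an even number of occurrences of each value is moved into the factor list, and the transform is attempted only when the collected powers sum to at least 4.

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Model of collectMultiplyFactors over plain ints: pull an even number of
// occurrences of each repeated value out of a sorted multiply-operand list
// into (base, power) factors, leaving any odd remainder behind.
static bool gatherEvenPowers(std::vector<int> &ops,
                             std::vector<std::pair<int, unsigned>> &factors) {
  unsigned powerSum = 0;
  std::vector<int> rest;
  std::vector<std::pair<int, unsigned>> found;
  for (std::size_t i = 0; i < ops.size();) {
    std::size_t j = i;
    while (j < ops.size() && ops[j] == ops[i])
      ++j;
    unsigned count = static_cast<unsigned>(j - i);
    unsigned even = count & ~1u;        // keep only an even number of copies
    if (even >= 2) {
      found.push_back({ops[i], even});
      powerSum += even;
    }
    if (count & 1)                      // odd leftover stays in ops
      rest.push_back(ops[i]);
    i = j;
  }
  if (powerSum < 4)                     // same threshold as the pass
    return false;
  ops = rest;
  factors = found;
  return true;
}

int main() {
  std::vector<int> ops = {2, 2, 2, 5, 5};   // x*x*x*y*y with x=2, y=5
  std::vector<std::pair<int, unsigned>> factors;
  assert(gatherEvenPowers(ops, factors));
  assert(factors.size() == 2 && factors[0].second == 2 && factors[1].second == 2);
  assert(ops.size() == 1 && ops[0] == 2);   // the odd x stays behind
}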
+
+/// Build a tree of multiplies, computing the product of Ops.
+static Value *buildMultiplyTree(IRBuilderBase &Builder,
+ SmallVectorImpl<Value*> &Ops) {
+ if (Ops.size() == 1)
+ return Ops.back();
+
+ Value *LHS = Ops.pop_back_val();
+ do {
+ if (LHS->getType()->isIntOrIntVectorTy())
+ LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+ else
+ LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
+ } while (!Ops.empty());
+
+ return LHS;
+}
+
+/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value.
+Value *
+ReassociatePass::buildMinimalMultiplyDAG(IRBuilderBase &Builder,
+ SmallVectorImpl<Factor> &Factors) {
+ assert(Factors[0].Power);
+ SmallVector<Value *, 4> OuterProduct;
+ for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+ Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+ if (Factors[Idx].Power != Factors[LastIdx].Power) {
+ LastIdx = Idx;
+ continue;
+ }
+
+ // We want to multiply across all the factors with the same power so that
+ // we can raise them to that power as a single entity. Build a mini tree
+ // for that.
+ SmallVector<Value *, 4> InnerProduct;
+ InnerProduct.push_back(Factors[LastIdx].Base);
+ do {
+ InnerProduct.push_back(Factors[Idx].Base);
+ ++Idx;
+ } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+ // Reset the base value of the first factor to the new expression tree.
+ // We'll remove all the factors with the same power in a second pass.
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+ if (Instruction *MI = dyn_cast<Instruction>(M))
+ RedoInsts.insert(MI);
+
+ LastIdx = Idx;
+ }
+ // Unique factors with equal powers -- we've folded them into the first one's
+ // base.
+ Factors.erase(std::unique(Factors.begin(), Factors.end(),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power == RHS.Power;
+ }),
+ Factors.end());
+
+ // Iteratively collect the base of each factor with an odd power into the
+ // outer product, and halve each power in preparation for squaring the
+ // expression.
+ for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+ if (Factors[Idx].Power & 1)
+ OuterProduct.push_back(Factors[Idx].Base);
+ Factors[Idx].Power >>= 1;
+ }
+ if (Factors[0].Power) {
+ Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+ OuterProduct.push_back(SquareRoot);
+ OuterProduct.push_back(SquareRoot);
+ }
+ if (OuterProduct.size() == 1)
+ return OuterProduct.front();
+
+ Value *V = buildMultiplyTree(Builder, OuterProduct);
+ return V;
+}
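Numerically, the recursion above is square-and-multiply generalized to several bases: bases with an odd power are peeled into the outer product, every power is halved, and the recursive result is squared. A scalar sketch of that scheme (illustrative only; the real routine additionally merges bases that share a power):

#include <cassert>
#include <utility>
#include <vector>

// Compute prod(base_i ^ power_i) with the same halve-and-square recursion
// that buildMinimalMultiplyDAG uses to keep the multiply count small.
static double powerProduct(std::vector<std::pair<double, unsigned>> factors) {
  double outer = 1.0;
  bool anyPowerLeft = false;
  for (auto &f : factors) {
    if (f.second & 1)          // odd power: one copy goes to the outer product
      outer *= f.first;
    f.second >>= 1;            // halve the power before squaring
    anyPowerLeft |= (f.second != 0);
  }
  if (anyPowerLeft) {
    double sqrtPart = powerProduct(factors);
    outer *= sqrtPart * sqrtPart;   // square the recursively built product
  }
  return outer;
}

int main() {
  // x^3 * y^2 with x=2, y=3  ->  8 * 9 = 72
  assert(powerProduct({{2.0, 3}, {3.0, 2}}) == 72.0);
  assert(powerProduct({{5.0, 1}}) == 5.0);
}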
+
+Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // We can only optimize the multiplies when there is a chain of more than
+ // three, such that a balanced tree might require fewer total multiplies.
+ if (Ops.size() < 4)
+ return nullptr;
+
+ // Try to turn linear trees of multiplies without other uses of the
+ // intermediate stages into minimal multiply DAGs with perfect sub-expression
+ // re-use.
+ SmallVector<Factor, 4> Factors;
+ if (!collectMultiplyFactors(Ops, Factors))
+ return nullptr; // All distinct factors, so nothing left for us to do.
+
+ IRBuilder<> Builder(I);
+ // The reassociate transformation for FP operations is performed only
+ // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
+ // to the newly generated operations.
+ if (auto FPI = dyn_cast<FPMathOperator>(I))
+ Builder.setFastMathFlags(FPI->getFastMathFlags());
+
+ Value *V = buildMinimalMultiplyDAG(Builder, Factors);
+ if (Ops.empty())
+ return V;
+
+ ValueEntry NewEntry = ValueEntry(getRank(V), V);
+ Ops.insert(llvm::lower_bound(Ops, NewEntry), NewEntry);
+ return nullptr;
+}
+
+Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Now that we have the linearized expression tree, try to optimize it.
+ // Start by folding any constants that we found.
+ Constant *Cst = nullptr;
+ unsigned Opcode = I->getOpcode();
+ while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+ Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+ Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
+ }
+ // If there was nothing but constants then we are done.
+ if (Ops.empty())
+ return Cst;
+
+ // Put the combined constant back at the end of the operand list, except if
+ // there is no point. For example, an add of 0 gets dropped here, while a
+ // multiplication by zero turns the whole expression into zero.
+ if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+ if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+ return Cst;
+ Ops.push_back(ValueEntry(0, Cst));
+ }
+
+ if (Ops.size() == 1) return Ops[0].Op;
+
+ // Handle destructive annihilation due to identities between elements in the
+ // argument list here.
+ unsigned NumOps = Ops.size();
+ switch (Opcode) {
+ default: break;
+ case Instruction::And:
+ case Instruction::Or:
+ if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
+ return Result;
+ break;
+
+ case Instruction::Xor:
+ if (Value *Result = OptimizeXor(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ if (Value *Result = OptimizeAdd(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Mul:
+ case Instruction::FMul:
+ if (Value *Result = OptimizeMul(I, Ops))
+ return Result;
+ break;
+ }
+
+ if (Ops.size() != NumOps)
+ return OptimizeExpression(I, Ops);
+ return nullptr;
+}
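The constant pre-folding at the top of OptimizeExpression folds trailing constants, drops the identity element, and short-circuits on the absorbing element; a compact integer model for the 'mul' case (invented helper, not part of the patch):

#include <cassert>
#include <optional>
#include <vector>

// Fold trailing constants of a linearized operand list (nullopt = symbolic
// value, a number = constant), dropping the multiplicative identity and
// short-circuiting on the absorber, as OptimizeExpression does for 'mul'.
static std::optional<long>
foldMulConstants(std::vector<std::optional<long>> &ops) {
  const long Identity = 1, Absorber = 0;
  std::optional<long> cst;
  while (!ops.empty() && ops.back()) {
    long c = *ops.back();
    ops.pop_back();
    cst = cst ? c * *cst : c;
  }
  if (ops.empty())
    return cst;                 // expression was nothing but constants
  if (cst && *cst != Identity) {
    if (*cst == Absorber)
      return Absorber;          // anything * 0 == 0
    ops.push_back(cst);         // put the combined constant back at the end
  }
  return std::nullopt;          // caller keeps optimizing the mutated list
}

int main() {
  std::vector<std::optional<long>> ops = {std::nullopt, std::nullopt, 3, 4};
  assert(!foldMulConstants(ops));                 // x * y * 3 * 4
  assert(ops.size() == 3 && ops.back() == 12);    // -> x * y * 12

  std::vector<std::optional<long>> zero = {std::nullopt, 0};
  assert(foldMulConstants(zero) == 0);            // x * 0 -> 0
}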
+
+// Remove dead instructions and if any operands are trivially dead add them to
+// Insts so they will be removed as well.
+void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
+ OrderedSet &Insts) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->operands());
- ValueRankMap.erase(I);
- Insts.remove(I);
- RedoInsts.remove(I);
- llvm::salvageDebugInfo(*I);
- I->eraseFromParent();
- for (auto Op : Ops)
- if (Instruction *OpInst = dyn_cast<Instruction>(Op))
- if (OpInst->use_empty())
- Insts.insert(OpInst);
-}
-
-/// Zap the given instruction, adding interesting operands to the work list.
-void ReassociatePass::EraseInst(Instruction *I) {
- assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
- LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
-
+ ValueRankMap.erase(I);
+ Insts.remove(I);
+ RedoInsts.remove(I);
+ llvm::salvageDebugInfo(*I);
+ I->eraseFromParent();
+ for (auto Op : Ops)
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->use_empty())
+ Insts.insert(OpInst);
+}
+
+/// Zap the given instruction, adding interesting operands to the work list.
+void ReassociatePass::EraseInst(Instruction *I) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+
SmallVector<Value *, 8> Ops(I->operands());
- // Erase the dead instruction.
- ValueRankMap.erase(I);
- RedoInsts.remove(I);
- llvm::salvageDebugInfo(*I);
- I->eraseFromParent();
- // Optimize its operands.
- SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
- // If this is a node in an expression tree, climb to the expression root
- // and add that since that's where optimization actually happens.
- unsigned Opcode = Op->getOpcode();
- while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
- Visited.insert(Op).second)
- Op = Op->user_back();
-
- // The instruction we're going to push may be coming from a
- // dead block, and Reassociate skips the processing of unreachable
- // blocks because it's a waste of time and also because it can
- // lead to infinite loop due to LLVM's non-standard definition
- // of dominance.
- if (ValueRankMap.find(Op) != ValueRankMap.end())
- RedoInsts.insert(Op);
- }
-
- MadeChange = true;
-}
-
-/// Recursively analyze an expression to build a list of instructions that have
-/// negative floating-point constant operands. The caller can then transform
-/// the list to create positive constants for better reassociation and CSE.
-static void getNegatibleInsts(Value *V,
- SmallVectorImpl<Instruction *> &Candidates) {
- // Handle only one-use instructions. Combining negations does not justify
- // replicating instructions.
- Instruction *I;
- if (!match(V, m_OneUse(m_Instruction(I))))
- return;
-
- // Handle expressions of multiplications and divisions.
- // TODO: This could look through floating-point casts.
- const APFloat *C;
- switch (I->getOpcode()) {
- case Instruction::FMul:
- // Not expecting non-canonical code here. Bail out and wait.
- if (match(I->getOperand(0), m_Constant()))
- break;
-
- if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) {
- Candidates.push_back(I);
- LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n');
- }
- getNegatibleInsts(I->getOperand(0), Candidates);
- getNegatibleInsts(I->getOperand(1), Candidates);
- break;
- case Instruction::FDiv:
- // Not expecting non-canonical code here. Bail out and wait.
- if (match(I->getOperand(0), m_Constant()) &&
- match(I->getOperand(1), m_Constant()))
- break;
-
- if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) ||
- (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) {
- Candidates.push_back(I);
- LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n');
- }
- getNegatibleInsts(I->getOperand(0), Candidates);
- getNegatibleInsts(I->getOperand(1), Candidates);
- break;
- default:
- break;
- }
-}
-
-/// Given an fadd/fsub with an operand that is a one-use instruction
-/// (the fadd/fsub), try to change negative floating-point constants into
-/// positive constants to increase potential for reassociation and CSE.
-Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I,
- Instruction *Op,
- Value *OtherOp) {
- assert((I->getOpcode() == Instruction::FAdd ||
- I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub");
-
- // Collect instructions with negative FP constants from the subtree that ends
- // in Op.
- SmallVector<Instruction *, 4> Candidates;
- getNegatibleInsts(Op, Candidates);
- if (Candidates.empty())
- return nullptr;
-
- // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
- // resulting subtract will be broken up later. This can get us into an
- // infinite loop during reassociation.
- bool IsFSub = I->getOpcode() == Instruction::FSub;
- bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1;
- if (NeedsSubtract && ShouldBreakUpSubtract(I))
- return nullptr;
-
- for (Instruction *Negatible : Candidates) {
- const APFloat *C;
- if (match(Negatible->getOperand(0), m_APFloat(C))) {
- assert(!match(Negatible->getOperand(1), m_Constant()) &&
- "Expecting only 1 constant operand");
- assert(C->isNegative() && "Expected negative FP constant");
- Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C)));
- MadeChange = true;
- }
- if (match(Negatible->getOperand(1), m_APFloat(C))) {
- assert(!match(Negatible->getOperand(0), m_Constant()) &&
- "Expecting only 1 constant operand");
- assert(C->isNegative() && "Expected negative FP constant");
- Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C)));
- MadeChange = true;
- }
- }
- assert(MadeChange == true && "Negative constant candidate was not changed");
-
- // Negations cancelled out.
- if (Candidates.size() % 2 == 0)
- return I;
-
- // Negate the final operand in the expression by flipping the opcode of this
- // fadd/fsub.
- assert(Candidates.size() % 2 == 1 && "Expected odd number");
- IRBuilder<> Builder(I);
- Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I)
- : Builder.CreateFSubFMF(OtherOp, Op, I);
- I->replaceAllUsesWith(NewInst);
- RedoInsts.insert(I);
- return dyn_cast<Instruction>(NewInst);
-}
-
-/// Canonicalize expressions that contain a negative floating-point constant
-/// of the following form:
-/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree)
-/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree)
-/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree)
-///
-/// The fadd/fsub opcode may be switched to allow folding a negation into the
-/// input instruction.
-Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) {
- LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n');
- Value *X;
- Instruction *Op;
- if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op)))))
- if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
- I = R;
- if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X))))
- if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
- I = R;
- if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op)))))
- if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
- I = R;
- return I;
-}
-
-/// Inspect and optimize the given instruction. Note that erasing
-/// instructions is not allowed.
-void ReassociatePass::OptimizeInst(Instruction *I) {
- // Only consider operations that we understand.
- if (!isa<UnaryOperator>(I) && !isa<BinaryOperator>(I))
- return;
-
- if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
- // If an operand of this shift is a reassociable multiply, or if the shift
- // is used by a reassociable multiply or add, turn into a multiply.
- if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
- (I->hasOneUse() &&
- (isReassociableOp(I->user_back(), Instruction::Mul) ||
- isReassociableOp(I->user_back(), Instruction::Add)))) {
- Instruction *NI = ConvertShiftToMul(I);
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- }
-
- // Commute binary operators, to canonicalize the order of their operands.
- // This can potentially expose more CSE opportunities, and makes writing other
- // transformations simpler.
- if (I->isCommutative())
- canonicalizeOperands(I);
-
- // Canonicalize negative constants out of expressions.
- if (Instruction *Res = canonicalizeNegFPConstants(I))
- I = Res;
-
- // Don't optimize floating-point instructions unless they are 'fast'.
- if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
- return;
-
- // Do not reassociate boolean (i1) expressions. We want to preserve the
- // original order of evaluation for short-circuited comparisons that
- // SimplifyCFG has folded to AND/OR expressions. If the expression
- // is not further optimized, it is likely to be transformed back to a
- // short-circuited form for code gen, and the source order may have been
- // optimized for the most likely conditions.
- if (I->getType()->isIntegerTy(1))
- return;
-
+ // Erase the dead instruction.
+ ValueRankMap.erase(I);
+ RedoInsts.remove(I);
+ llvm::salvageDebugInfo(*I);
+ I->eraseFromParent();
+ // Optimize its operands.
+ SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
+ // If this is a node in an expression tree, climb to the expression root
+ // and add that since that's where optimization actually happens.
+ unsigned Opcode = Op->getOpcode();
+ while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
+ Visited.insert(Op).second)
+ Op = Op->user_back();
+
+ // The instruction we're going to push may be coming from a
+ // dead block, and Reassociate skips the processing of unreachable
+ // blocks because it's a waste of time and also because it can
+ // lead to an infinite loop due to LLVM's non-standard definition
+ // of dominance.
+ if (ValueRankMap.find(Op) != ValueRankMap.end())
+ RedoInsts.insert(Op);
+ }
+
+ MadeChange = true;
+}
+
+/// Recursively analyze an expression to build a list of instructions that have
+/// negative floating-point constant operands. The caller can then transform
+/// the list to create positive constants for better reassociation and CSE.
+static void getNegatibleInsts(Value *V,
+ SmallVectorImpl<Instruction *> &Candidates) {
+ // Handle only one-use instructions. Combining negations does not justify
+ // replicating instructions.
+ Instruction *I;
+ if (!match(V, m_OneUse(m_Instruction(I))))
+ return;
+
+ // Handle expressions of multiplications and divisions.
+ // TODO: This could look through floating-point casts.
+ const APFloat *C;
+ switch (I->getOpcode()) {
+ case Instruction::FMul:
+ // Not expecting non-canonical code here. Bail out and wait.
+ if (match(I->getOperand(0), m_Constant()))
+ break;
+
+ if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) {
+ Candidates.push_back(I);
+ LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n');
+ }
+ getNegatibleInsts(I->getOperand(0), Candidates);
+ getNegatibleInsts(I->getOperand(1), Candidates);
+ break;
+ case Instruction::FDiv:
+ // Not expecting non-canonical code here. Bail out and wait.
+ if (match(I->getOperand(0), m_Constant()) &&
+ match(I->getOperand(1), m_Constant()))
+ break;
+
+ if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) ||
+ (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) {
+ Candidates.push_back(I);
+ LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n');
+ }
+ getNegatibleInsts(I->getOperand(0), Candidates);
+ getNegatibleInsts(I->getOperand(1), Candidates);
+ break;
+ default:
+ break;
+ }
+}
+
+/// Given an fadd/fsub with an operand that is a one-use instruction
+/// (the fadd/fsub), try to change negative floating-point constants into
+/// positive constants to increase potential for reassociation and CSE.
+Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I,
+ Instruction *Op,
+ Value *OtherOp) {
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub");
+
+ // Collect instructions with negative FP constants from the subtree that ends
+ // in Op.
+ SmallVector<Instruction *, 4> Candidates;
+ getNegatibleInsts(Op, Candidates);
+ if (Candidates.empty())
+ return nullptr;
+
+ // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
+ // resulting subtract will be broken up later. This can get us into an
+ // infinite loop during reassociation.
+ bool IsFSub = I->getOpcode() == Instruction::FSub;
+ bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1;
+ if (NeedsSubtract && ShouldBreakUpSubtract(I))
+ return nullptr;
+
+ for (Instruction *Negatible : Candidates) {
+ const APFloat *C;
+ if (match(Negatible->getOperand(0), m_APFloat(C))) {
+ assert(!match(Negatible->getOperand(1), m_Constant()) &&
+ "Expecting only 1 constant operand");
+ assert(C->isNegative() && "Expected negative FP constant");
+ Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C)));
+ MadeChange = true;
+ }
+ if (match(Negatible->getOperand(1), m_APFloat(C))) {
+ assert(!match(Negatible->getOperand(0), m_Constant()) &&
+ "Expecting only 1 constant operand");
+ assert(C->isNegative() && "Expected negative FP constant");
+ Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C)));
+ MadeChange = true;
+ }
+ }
+ assert(MadeChange == true && "Negative constant candidate was not changed");
+
+ // Negations cancelled out.
+ if (Candidates.size() % 2 == 0)
+ return I;
+
+ // Negate the final operand in the expression by flipping the opcode of this
+ // fadd/fsub.
+ assert(Candidates.size() % 2 == 1 && "Expected odd number");
+ IRBuilder<> Builder(I);
+ Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I)
+ : Builder.CreateFSubFMF(OtherOp, Op, I);
+ I->replaceAllUsesWith(NewInst);
+ RedoInsts.insert(I);
+ return dyn_cast<Instruction>(NewInst);
+}
+
+/// Canonicalize expressions that contain a negative floating-point constant
+/// of the following form:
+/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree)
+/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree)
+/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree)
+///
+/// The fadd/fsub opcode may be switched to allow folding a negation into the
+/// input instruction.
+Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) {
+ LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n');
+ Value *X;
+ Instruction *Op;
+ if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op)))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op)))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ return I;
+}
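At the value level the canonicalization relies on x + (-c * y) == x - (c * y) (and the analogous fdiv form); a two-assert sanity check with exactly representable doubles, included only as an illustration:

#include <cassert>

int main() {
  double x = 5.0, y = 3.0, c = 2.0;
  // x + (-c * y) is rewritten as x - (c * y): the negation moves into the
  // fadd/fsub opcode so the constant itself becomes positive.
  assert(x + (-c * y) == x - (c * y));
  assert(x - (y / -c) == x + (y / c));
}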
+
+/// Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed.
+void ReassociatePass::OptimizeInst(Instruction *I) {
+ // Only consider operations that we understand.
+ if (!isa<UnaryOperator>(I) && !isa<BinaryOperator>(I))
+ return;
+
+ if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
+ // If an operand of this shift is a reassociable multiply, or if the shift
+ // is used by a reassociable multiply or add, turn into a multiply.
+ if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+ (I->hasOneUse() &&
+ (isReassociableOp(I->user_back(), Instruction::Mul) ||
+ isReassociableOp(I->user_back(), Instruction::Add)))) {
+ Instruction *NI = ConvertShiftToMul(I);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+
+ // Commute binary operators, to canonicalize the order of their operands.
+ // This can potentially expose more CSE opportunities, and makes writing other
+ // transformations simpler.
+ if (I->isCommutative())
+ canonicalizeOperands(I);
+
+ // Canonicalize negative constants out of expressions.
+ if (Instruction *Res = canonicalizeNegFPConstants(I))
+ I = Res;
+
+ // Don't optimize floating-point instructions unless they are 'fast'.
+ if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
+ return;
+
+ // Do not reassociate boolean (i1) expressions. We want to preserve the
+ // original order of evaluation for short-circuited comparisons that
+ // SimplifyCFG has folded to AND/OR expressions. If the expression
+ // is not further optimized, it is likely to be transformed back to a
+ // short-circuited form for code gen, and the source order may have been
+ // optimized for the most likely conditions.
+ if (I->getType()->isIntegerTy(1))
+ return;
+
// If this is a bitwise or instruction of operands
// with no common bits set, convert it to X+Y.
if (I->getOpcode() == Instruction::Or &&
@@ -2222,397 +2222,397 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
I = NI;
}
- // If this is a subtract instruction which is not already in negate form,
- // see if we can convert it to X+-Y.
- if (I->getOpcode() == Instruction::Sub) {
- if (ShouldBreakUpSubtract(I)) {
- Instruction *NI = BreakUpSubtract(I, RedoInsts);
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- } else if (match(I, m_Neg(m_Value()))) {
- // Otherwise, this is a negation. See if the operand is a multiply tree
- // and if this is not an inner node of a multiply tree.
- if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
- (!I->hasOneUse() ||
- !isReassociableOp(I->user_back(), Instruction::Mul))) {
- Instruction *NI = LowerNegateToMultiply(I);
- // If the negate was simplified, revisit the users to see if we can
- // reassociate further.
- for (User *U : NI->users()) {
- if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
- RedoInsts.insert(Tmp);
- }
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- }
- }
- } else if (I->getOpcode() == Instruction::FNeg ||
- I->getOpcode() == Instruction::FSub) {
- if (ShouldBreakUpSubtract(I)) {
- Instruction *NI = BreakUpSubtract(I, RedoInsts);
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- } else if (match(I, m_FNeg(m_Value()))) {
- // Otherwise, this is a negation. See if the operand is a multiply tree
- // and if this is not an inner node of a multiply tree.
- Value *Op = isa<BinaryOperator>(I) ? I->getOperand(1) :
- I->getOperand(0);
- if (isReassociableOp(Op, Instruction::FMul) &&
- (!I->hasOneUse() ||
- !isReassociableOp(I->user_back(), Instruction::FMul))) {
- // If the negate was simplified, revisit the users to see if we can
- // reassociate further.
- Instruction *NI = LowerNegateToMultiply(I);
- for (User *U : NI->users()) {
- if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
- RedoInsts.insert(Tmp);
- }
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- }
- }
- }
-
- // If this instruction is an associative binary operator, process it.
- if (!I->isAssociative()) return;
- BinaryOperator *BO = cast<BinaryOperator>(I);
-
- // If this is an interior node of a reassociable tree, ignore it until we
- // get to the root of the tree, to avoid N^2 analysis.
- unsigned Opcode = BO->getOpcode();
- if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
- // During the initial run we will get to the root of the tree.
- // But if we get here while we are redoing instructions, there is no
- // guarantee that the root will be visited. So redo it later.
- if (BO->user_back() != BO &&
- BO->getParent() == BO->user_back()->getParent())
- RedoInsts.insert(BO->user_back());
- return;
- }
-
- // If this is an add tree that is used by a sub instruction, ignore it
- // until we process the subtract.
- if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
- cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
- return;
- if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
- cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
- return;
-
- ReassociateExpression(BO);
-}
-
-void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
- // First, walk the expression tree, linearizing the tree, collecting the
- // operand information.
- SmallVector<RepeatedValue, 8> Tree;
- MadeChange |= LinearizeExprTree(I, Tree);
- SmallVector<ValueEntry, 8> Ops;
- Ops.reserve(Tree.size());
- for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
- RepeatedValue E = Tree[i];
- Ops.append(E.second.getZExtValue(),
- ValueEntry(getRank(E.first), E.first));
- }
-
- LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
-
- // Now that we have linearized the tree to a list and have gathered all of
- // the operands and their ranks, sort the operands by their rank. Use a
- // stable_sort so that values with equal ranks will have their relative
- // positions maintained (and so the compiler is deterministic). Note that
- // this sorts so that the highest ranking values end up at the beginning of
- // the vector.
- llvm::stable_sort(Ops);
-
- // Now that we have the expression tree in a convenient
- // sorted form, optimize it globally if possible.
- if (Value *V = OptimizeExpression(I, Ops)) {
- if (V == I)
- // Self-referential expression in unreachable code.
- return;
- // This expression tree simplified to something that isn't a tree,
- // eliminate it.
- LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
- I->replaceAllUsesWith(V);
- if (Instruction *VI = dyn_cast<Instruction>(V))
- if (I->getDebugLoc())
- VI->setDebugLoc(I->getDebugLoc());
- RedoInsts.insert(I);
- ++NumAnnihil;
- return;
- }
-
- // We want to sink immediates as deeply as possible except in the case where
- // this is a multiply tree used only by an add, and the immediate is a -1.
- // In this case we reassociate to put the negation on the outside so that we
- // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
- if (I->hasOneUse()) {
- if (I->getOpcode() == Instruction::Mul &&
- cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
- isa<ConstantInt>(Ops.back().Op) &&
- cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
- ValueEntry Tmp = Ops.pop_back_val();
- Ops.insert(Ops.begin(), Tmp);
- } else if (I->getOpcode() == Instruction::FMul &&
- cast<Instruction>(I->user_back())->getOpcode() ==
- Instruction::FAdd &&
- isa<ConstantFP>(Ops.back().Op) &&
- cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
- ValueEntry Tmp = Ops.pop_back_val();
- Ops.insert(Ops.begin(), Tmp);
- }
- }
-
- LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
-
- if (Ops.size() == 1) {
- if (Ops[0].Op == I)
- // Self-referential expression in unreachable code.
- return;
-
- // This expression tree simplified to something that isn't a tree,
- // eliminate it.
- I->replaceAllUsesWith(Ops[0].Op);
- if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
- OI->setDebugLoc(I->getDebugLoc());
- RedoInsts.insert(I);
- return;
- }
-
- if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) {
- // Find the pair with the highest count in the pairmap and move it to the
- // back of the list so that it can later be CSE'd.
- // example:
- // a*b*c*d*e
- // if c*e is the most "popular" pair, we can express this as
- // (((c*e)*d)*b)*a
- unsigned Max = 1;
- unsigned BestRank = 0;
- std::pair<unsigned, unsigned> BestPair;
- unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin;
- for (unsigned i = 0; i < Ops.size() - 1; ++i)
- for (unsigned j = i + 1; j < Ops.size(); ++j) {
- unsigned Score = 0;
- Value *Op0 = Ops[i].Op;
- Value *Op1 = Ops[j].Op;
- if (std::less<Value *>()(Op1, Op0))
- std::swap(Op0, Op1);
- auto it = PairMap[Idx].find({Op0, Op1});
- if (it != PairMap[Idx].end()) {
- // Functions like BreakUpSubtract() can erase the Values we're using
- // as keys and create new Values after we built the PairMap. There's a
- // small chance that the new nodes can have the same address as
- // something already in the table. We shouldn't accumulate the stored
- // score in that case as it refers to the wrong Value.
- if (it->second.isValid())
- Score += it->second.Score;
- }
-
- unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank);
- if (Score > Max || (Score == Max && MaxRank < BestRank)) {
- BestPair = {i, j};
- Max = Score;
- BestRank = MaxRank;
- }
- }
- if (Max > 1) {
- auto Op0 = Ops[BestPair.first];
- auto Op1 = Ops[BestPair.second];
- Ops.erase(&Ops[BestPair.second]);
- Ops.erase(&Ops[BestPair.first]);
- Ops.push_back(Op0);
- Ops.push_back(Op1);
- }
- }
- // Now that we ordered and optimized the expressions, splat them back into
- // the expression tree, removing any unneeded nodes.
- RewriteExprTree(I, Ops);
-}
-
-void
-ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) {
- // Make a "pairmap" of how often each operand pair occurs.
- for (BasicBlock *BI : RPOT) {
- for (Instruction &I : *BI) {
- if (!I.isAssociative())
- continue;
-
- // Ignore nodes that aren't at the root of trees.
- if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode())
- continue;
-
- // Collect all operands in a single reassociable expression.
- // Since Reassociate has already been run once, we can assume things
- // are already canonical according to Reassociation's regime.
- SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) };
- SmallVector<Value *, 8> Ops;
- while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) {
- Value *Op = Worklist.pop_back_val();
- Instruction *OpI = dyn_cast<Instruction>(Op);
- if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) {
- Ops.push_back(Op);
- continue;
- }
- // Be paranoid about self-referencing expressions in unreachable code.
- if (OpI->getOperand(0) != OpI)
- Worklist.push_back(OpI->getOperand(0));
- if (OpI->getOperand(1) != OpI)
- Worklist.push_back(OpI->getOperand(1));
- }
- // Skip extremely long expressions.
- if (Ops.size() > GlobalReassociateLimit)
- continue;
-
- // Add all pairwise combinations of operands to the pair map.
- unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin;
- SmallSet<std::pair<Value *, Value*>, 32> Visited;
- for (unsigned i = 0; i < Ops.size() - 1; ++i) {
- for (unsigned j = i + 1; j < Ops.size(); ++j) {
- // Canonicalize operand orderings.
- Value *Op0 = Ops[i];
- Value *Op1 = Ops[j];
- if (std::less<Value *>()(Op1, Op0))
- std::swap(Op0, Op1);
- if (!Visited.insert({Op0, Op1}).second)
- continue;
- auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, {Op0, Op1, 1}});
- if (!res.second) {
- // If either key value has been erased then we've got the same
- // address by coincidence. That can't happen here because nothing is
- // erasing values but it can happen by the time we're querying the
- // map.
- assert(res.first->second.isValid() && "WeakVH invalidated");
- ++res.first->second.Score;
- }
- }
- }
- }
- }
-}
-
-PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
- // Get the function's basic blocks in reverse post order. This order is used by
- // BuildRankMap to pre-calculate ranks correctly. It also excludes dead basic
- // blocks (it has been seen that the analysis in this pass could hang when
- // analysing dead basic blocks).
- ReversePostOrderTraversal<Function *> RPOT(&F);
-
- // Calculate the rank map for F.
- BuildRankMap(F, RPOT);
-
- // Build the pair map before running reassociate.
- // Technically this would be more accurate if we did it after one round
- // of reassociation, but in practice it doesn't seem to help much on
- // real-world code, so don't waste the compile time running reassociate
- // twice.
- // If a user wants, they could explicitly run reassociate twice in their
- // pass pipeline for further potential gains.
- // It might also be possible to update the pair map at runtime, but the
- // overhead of that may be large if there are many reassociable chains.
- BuildPairMap(RPOT);
-
- MadeChange = false;
-
- // Traverse the same blocks that were analysed by BuildRankMap.
- for (BasicBlock *BI : RPOT) {
- assert(RankMap.count(&*BI) && "BB should be ranked.");
- // Optimize every instruction in the basic block.
- for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
- if (isInstructionTriviallyDead(&*II)) {
- EraseInst(&*II++);
- } else {
- OptimizeInst(&*II);
- assert(II->getParent() == &*BI && "Moved to a different block!");
- ++II;
- }
-
- // Make a copy of all the instructions to be redone so we can remove dead
- // instructions.
- OrderedSet ToRedo(RedoInsts);
- // Iterate over all instructions to be reevaluated and remove trivially dead
- // instructions. If any operand of the trivially dead instruction becomes
- // dead mark it for deletion as well. Continue this process until all
- // trivially dead instructions have been removed.
- while (!ToRedo.empty()) {
- Instruction *I = ToRedo.pop_back_val();
- if (isInstructionTriviallyDead(I)) {
- RecursivelyEraseDeadInsts(I, ToRedo);
- MadeChange = true;
- }
- }
-
- // Now that we have removed dead instructions, we can reoptimize the
- // remaining instructions.
- while (!RedoInsts.empty()) {
- Instruction *I = RedoInsts.front();
- RedoInsts.erase(RedoInsts.begin());
- if (isInstructionTriviallyDead(I))
- EraseInst(I);
- else
- OptimizeInst(I);
- }
- }
-
- // We are done with the rank map and pair map.
- RankMap.clear();
- ValueRankMap.clear();
- for (auto &Entry : PairMap)
- Entry.clear();
-
- if (MadeChange) {
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- return PA;
- }
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-
- class ReassociateLegacyPass : public FunctionPass {
- ReassociatePass Impl;
-
- public:
- static char ID; // Pass identification, replacement for typeid
-
- ReassociateLegacyPass() : FunctionPass(ID) {
- initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- FunctionAnalysisManager DummyFAM;
- auto PA = Impl.run(F, DummyFAM);
- return !PA.areAllPreserved();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- };
-
-} // end anonymous namespace
-
-char ReassociateLegacyPass::ID = 0;
-
-INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
- "Reassociate expressions", false, false)
-
-// Public interface to the Reassociate pass
-FunctionPass *llvm::createReassociatePass() {
- return new ReassociateLegacyPass();
-}
+ // If this is a subtract instruction which is not already in negate form,
+ // see if we can convert it to X+-Y.
+ if (I->getOpcode() == Instruction::Sub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (match(I, m_Neg(m_Value()))) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::Mul))) {
+ Instruction *NI = LowerNegateToMultiply(I);
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ } else if (I->getOpcode() == Instruction::FNeg ||
+ I->getOpcode() == Instruction::FSub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (match(I, m_FNeg(m_Value()))) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ Value *Op = isa<BinaryOperator>(I) ? I->getOperand(1) :
+ I->getOperand(0);
+ if (isReassociableOp(Op, Instruction::FMul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::FMul))) {
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ Instruction *NI = LowerNegateToMultiply(I);
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ }
+
+ // If this instruction is an associative binary operator, process it.
+ if (!I->isAssociative()) return;
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+
+ // If this is an interior node of a reassociable tree, ignore it until we
+ // get to the root of the tree, to avoid N^2 analysis.
+ unsigned Opcode = BO->getOpcode();
+ if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
+ // During the initial run we will get to the root of the tree.
+ // But if we get here while we are redoing instructions, there is no
+ // guarantee that the root will be visited. So redo it later.
+ if (BO->user_back() != BO &&
+ BO->getParent() == BO->user_back()->getParent())
+ RedoInsts.insert(BO->user_back());
+ return;
+ }
+
+ // If this is an add tree that is used by a sub instruction, ignore it
+ // until we process the subtract.
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
+ return;
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+ return;
+
+ ReassociateExpression(BO);
+}
+
+void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
+ // First, walk the expression tree, linearizing the tree, collecting the
+ // operand information.
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(I, Tree);
+ SmallVector<ValueEntry, 8> Ops;
+ Ops.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Ops.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ // Now that we have linearized the tree to a list and have gathered all of
+ // the operands and their ranks, sort the operands by their rank. Use a
+ // stable_sort so that values with equal ranks will have their relative
+ // positions maintained (and so the compiler is deterministic). Note that
+ // this sorts so that the highest ranking values end up at the beginning of
+ // the vector.
+ llvm::stable_sort(Ops);
+
+ // Now that we have the expression tree in a convenient
+ // sorted form, optimize it globally if possible.
+ if (Value *V = OptimizeExpression(I, Ops)) {
+ if (V == I)
+ // Self-referential expression in unreachable code.
+ return;
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+ I->replaceAllUsesWith(V);
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ if (I->getDebugLoc())
+ VI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ ++NumAnnihil;
+ return;
+ }
+
+ // We want to sink immediates as deeply as possible except in the case where
+ // this is a multiply tree used only by an add, and the immediate is a -1.
+ // In this case we reassociate to put the negation on the outside so that we
+ // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+ if (I->hasOneUse()) {
+ if (I->getOpcode() == Instruction::Mul &&
+ cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(Ops.back().Op) &&
+ cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ } else if (I->getOpcode() == Instruction::FMul &&
+ cast<Instruction>(I->user_back())->getOpcode() ==
+ Instruction::FAdd &&
+ isa<ConstantFP>(Ops.back().Op) &&
+ cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ if (Ops.size() == 1) {
+ if (Ops[0].Op == I)
+ // Self-referential expression in unreachable code.
+ return;
+
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ I->replaceAllUsesWith(Ops[0].Op);
+ if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+ OI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ return;
+ }
+
+ if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) {
+ // Find the pair with the highest count in the pairmap and move it to the
+ // back of the list so that it can later be CSE'd.
+ // example:
+ // a*b*c*d*e
+ // if c*e is the most "popular" pair, we can express this as
+ // (((c*e)*d)*b)*a
+ unsigned Max = 1;
+ unsigned BestRank = 0;
+ std::pair<unsigned, unsigned> BestPair;
+ unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i)
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ unsigned Score = 0;
+ Value *Op0 = Ops[i].Op;
+ Value *Op1 = Ops[j].Op;
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ auto it = PairMap[Idx].find({Op0, Op1});
+ if (it != PairMap[Idx].end()) {
+ // Functions like BreakUpSubtract() can erase the Values we're using
+ // as keys and create new Values after we built the PairMap. There's a
+ // small chance that the new nodes can have the same address as
+ // something already in the table. We shouldn't accumulate the stored
+ // score in that case as it refers to the wrong Value.
+ if (it->second.isValid())
+ Score += it->second.Score;
+ }
+
+ unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank);
+ if (Score > Max || (Score == Max && MaxRank < BestRank)) {
+ BestPair = {i, j};
+ Max = Score;
+ BestRank = MaxRank;
+ }
+ }
+ if (Max > 1) {
+ auto Op0 = Ops[BestPair.first];
+ auto Op1 = Ops[BestPair.second];
+ Ops.erase(&Ops[BestPair.second]);
+ Ops.erase(&Ops[BestPair.first]);
+ Ops.push_back(Op0);
+ Ops.push_back(Op1);
+ }
+ }
+ // Now that we ordered and optimized the expressions, splat them back into
+ // the expression tree, removing any unneeded nodes.
+ RewriteExprTree(I, Ops);
+}
+
+void
+ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) {
+ // Make a "pairmap" of how often each operand pair occurs.
+ for (BasicBlock *BI : RPOT) {
+ for (Instruction &I : *BI) {
+ if (!I.isAssociative())
+ continue;
+
+ // Ignore nodes that aren't at the root of trees.
+ if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode())
+ continue;
+
+ // Collect all operands in a single reassociable expression.
+ // Since Reassociate has already been run once, we can assume things
+ // are already canonical according to Reassociation's regime.
+ SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) };
+ SmallVector<Value *, 8> Ops;
+ while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) {
+ Value *Op = Worklist.pop_back_val();
+ Instruction *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) {
+ Ops.push_back(Op);
+ continue;
+ }
+ // Be paranoid about self-referencing expressions in unreachable code.
+ if (OpI->getOperand(0) != OpI)
+ Worklist.push_back(OpI->getOperand(0));
+ if (OpI->getOperand(1) != OpI)
+ Worklist.push_back(OpI->getOperand(1));
+ }
+ // Skip extremely long expressions.
+ if (Ops.size() > GlobalReassociateLimit)
+ continue;
+
+ // Add all pairwise combinations of operands to the pair map.
+ unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin;
+ SmallSet<std::pair<Value *, Value*>, 32> Visited;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i) {
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ // Canonicalize operand orderings.
+ Value *Op0 = Ops[i];
+ Value *Op1 = Ops[j];
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ if (!Visited.insert({Op0, Op1}).second)
+ continue;
+ auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, {Op0, Op1, 1}});
+ if (!res.second) {
+ // If either key value has been erased then we've got the same
+ // address by coincidence. That can't happen here because nothing is
+ // erasing values but it can happen by the time we're querying the
+ // map.
+ assert(res.first->second.isValid() && "WeakVH invalidated");
+ ++res.first->second.Score;
+ }
+ }
+ }
+ }
+ }
+}
+
+PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
+ // Get the function's basic blocks in reverse post order. This order is used by
+ // BuildRankMap to pre-calculate ranks correctly. It also excludes dead basic
+ // blocks (it has been seen that the analysis in this pass could hang when
+ // analysing dead basic blocks).
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // Calculate the rank map for F.
+ BuildRankMap(F, RPOT);
+
+ // Build the pair map before running reassociate.
+ // Technically this would be more accurate if we did it after one round
+ // of reassociation, but in practice it doesn't seem to help much on
+ // real-world code, so don't waste the compile time running reassociate
+ // twice.
+ // If a user wants, they could explicitly run reassociate twice in their
+ // pass pipeline for further potential gains.
+ // It might also be possible to update the pair map at runtime, but the
+ // overhead of that may be large if there are many reassociable chains.
+ BuildPairMap(RPOT);
+
+ MadeChange = false;
+
+ // Traverse the same blocks that were analysed by BuildRankMap.
+ for (BasicBlock *BI : RPOT) {
+ assert(RankMap.count(&*BI) && "BB should be ranked.");
+ // Optimize every instruction in the basic block.
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
+ if (isInstructionTriviallyDead(&*II)) {
+ EraseInst(&*II++);
+ } else {
+ OptimizeInst(&*II);
+ assert(II->getParent() == &*BI && "Moved to a different block!");
+ ++II;
+ }
+
+ // Make a copy of all the instructions to be redone so we can remove dead
+ // instructions.
+ OrderedSet ToRedo(RedoInsts);
+ // Iterate over all instructions to be reevaluated and remove trivially dead
+ // instructions. If any operand of the trivially dead instruction becomes
+ // dead mark it for deletion as well. Continue this process until all
+ // trivially dead instructions have been removed.
+ while (!ToRedo.empty()) {
+ Instruction *I = ToRedo.pop_back_val();
+ if (isInstructionTriviallyDead(I)) {
+ RecursivelyEraseDeadInsts(I, ToRedo);
+ MadeChange = true;
+ }
+ }
+
+ // Now that we have removed dead instructions, we can reoptimize the
+ // remaining instructions.
+ while (!RedoInsts.empty()) {
+ Instruction *I = RedoInsts.front();
+ RedoInsts.erase(RedoInsts.begin());
+ if (isInstructionTriviallyDead(I))
+ EraseInst(I);
+ else
+ OptimizeInst(I);
+ }
+ }
+
+ // We are done with the rank map and pair map.
+ RankMap.clear();
+ ValueRankMap.clear();
+ for (auto &Entry : PairMap)
+ Entry.clear();
+
+ if (MadeChange) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+ class ReassociateLegacyPass : public FunctionPass {
+ ReassociatePass Impl;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ ReassociateLegacyPass() : FunctionPass(ID) {
+ initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ };
+
+} // end anonymous namespace
+
+char ReassociateLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
+ "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() {
+ return new ReassociateLegacyPass();
+}
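As a worked illustration of the pair-map heuristic in ReassociateExpression above (a hypothetical C++ sketch, not part of the diff): if c*e is the most frequently occurring pair across the function, regrouping each product so that c*e is innermost turns it into a common subexpression that later CSE can reuse.

#include <cassert>

// The "popular" pair c*e is computed once and shared by both products.
int sharedPair(int a, int b, int c, int d, int e) {
  int ce = c * e;               // CSE'd pair
  int p1 = ((ce * d) * b) * a;  // regrouped form of a*b*c*d*e
  int p2 = ce * b;              // regrouped form of b*c*e
  return p1 + p2;
}

int main() {
  int a = 2, b = 3, c = 5, d = 7, e = 11;
  assert(sharedPair(a, b, c, d, e) == a * b * c * d * e + b * c * e);
  return 0;
}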
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp
index 6d7adb2e07..a49b9ad3f6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -1,46 +1,46 @@
-//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file demotes all registers to memory references. It is intended to be
-// the inverse of PromoteMemoryToRegister. By converting to loads, the only
-// values live across basic blocks are allocas and loads before phi nodes.
-// It is intended that this should make CFG hacking much easier.
-// To make later hacking easier, the entry block is split into two, such that
-// all introduced allocas and nothing else are in the entry block.
-//
-//===----------------------------------------------------------------------===//
-
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references. It is intended to be
+// the inverse of PromoteMemoryToRegister. By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/Reg2Mem.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <list>
-using namespace llvm;
-
-#define DEBUG_TYPE "reg2mem"
-
-STATISTIC(NumRegsDemoted, "Number of registers demoted");
-STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
-
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "reg2mem"
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
static bool valueEscapes(const Instruction &Inst) {
const BasicBlock *BB = Inst.getParent();
for (const User *U : Inst.users()) {
@@ -49,51 +49,51 @@ static bool valueEscapes(const Instruction &Inst) {
return true;
}
return false;
-}
-
+}
+
static bool runPass(Function &F) {
- // Insert all new allocas into entry block.
- BasicBlock *BBEntry = &F.getEntryBlock();
- assert(pred_empty(BBEntry) &&
- "Entry block to function must not have predecessors!");
-
- // Find the first non-alloca instruction and create an insertion point. This
- // is safe if the block is well-formed: it always has a terminator; otherwise
- // we'll hit an assertion.
- BasicBlock::iterator I = BBEntry->begin();
- while (isa<AllocaInst>(I)) ++I;
-
- CastInst *AllocaInsertionPoint = new BitCastInst(
- Constant::getNullValue(Type::getInt32Ty(F.getContext())),
- Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
-
- // Find the escaped instructions. But don't create stack slots for
- // allocas in entry block.
- std::list<Instruction*> WorkList;
+ // Insert all new allocas into entry block.
+ BasicBlock *BBEntry = &F.getEntryBlock();
+ assert(pred_empty(BBEntry) &&
+ "Entry block to function must not have predecessors!");
+
+ // Find the first non-alloca instruction and create an insertion point. This
+ // is safe if the block is well-formed: it always has a terminator; otherwise
+ // we'll hit an assertion.
+ BasicBlock::iterator I = BBEntry->begin();
+ while (isa<AllocaInst>(I)) ++I;
+
+ CastInst *AllocaInsertionPoint = new BitCastInst(
+ Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+ Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
+
+ // Find the escaped instructions. But don't create stack slots for
+ // allocas in entry block.
+ std::list<Instruction*> WorkList;
for (Instruction &I : instructions(F))
if (!(isa<AllocaInst>(I) && I.getParent() == BBEntry) && valueEscapes(I))
WorkList.push_front(&I);
-
- // Demote escaped instructions
- NumRegsDemoted += WorkList.size();
+
+ // Demote escaped instructions
+ NumRegsDemoted += WorkList.size();
for (Instruction *I : WorkList)
DemoteRegToStack(*I, false, AllocaInsertionPoint);
-
- WorkList.clear();
-
- // Find all phi's
+
+ WorkList.clear();
+
+ // Find all phi's
for (BasicBlock &BB : F)
for (auto &Phi : BB.phis())
WorkList.push_front(&Phi);
-
- // Demote phi nodes
- NumPhisDemoted += WorkList.size();
+
+ // Demote phi nodes
+ NumPhisDemoted += WorkList.size();
for (Instruction *I : WorkList)
DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint);
-
- return true;
-}
-
+
+ return true;
+}
+
PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *LI = &AM.getResult<LoopAnalysis>(F);
@@ -106,7 +106,7 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserve<LoopAnalysis>();
return PA;
}
-
+
namespace {
struct RegToMemLegacy : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
@@ -134,8 +134,8 @@ INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
INITIALIZE_PASS_END(RegToMemLegacy, "reg2mem",
"Demote all values to stack slots", false, false)
-// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
char &llvm::DemoteRegisterToMemoryID = RegToMemLegacy::ID;
-FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
return new RegToMemLegacy();
-}
+}
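For orientation, a minimal before/after sketch of the demotion Reg2Mem performs, written in plain C++ as an analogy only; the names are hypothetical, and the real pass of course rewrites IR values into an entry-block alloca with stores at definitions and loads at uses rather than touching source code.

#include <cassert>

// Before: the temporary "t" is live across the branch.
int beforeDemotion(int a, int b, bool p) {
  int t = a * b;
  return p ? t + 1 : t - 1;
}

// After: "t" lives in a stack slot ("slot" stands in for the alloca
// inserted in the entry block); it is stored once and reloaded at each use.
int afterDemotion(int a, int b, bool p) {
  int slot;
  slot = a * b;          // store at the definition
  if (p)
    return slot + 1;     // reload at this use
  return slot - 1;       // reload at this use
}

int main() {
  assert(beforeDemotion(6, 7, true) == afterDemotion(6, 7, true));
  assert(beforeDemotion(6, 7, false) == afterDemotion(6, 7, false));
  return 0;
}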
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index b245d1e9d1..b7830555bf 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1,1546 +1,1546 @@
-//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Rewrite call/invoke instructions so as to make potential relocations
-// performed by the garbage collector explicit in the IR.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "rewrite-statepoints-for-gc"
-
-using namespace llvm;
-
-// Print the liveset found at the insert location
-static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
- cl::init(false));
-static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
- cl::init(false));
-
-// Print out the base pointers for debugging
-static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
- cl::init(false));
-
-// Cost threshold measuring when it is profitable to rematerialize value instead
-// of relocating it
-static cl::opt<unsigned>
-RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
- cl::init(6));
-
-#ifdef EXPENSIVE_CHECKS
-static bool ClobberNonLive = true;
-#else
-static bool ClobberNonLive = false;
-#endif
-
-static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
- cl::location(ClobberNonLive),
- cl::Hidden);
-
-static cl::opt<bool>
- AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
- cl::Hidden, cl::init(true));
-
-/// The IR fed into RewriteStatepointsForGC may have had attributes and
-/// metadata implying dereferenceability that are no longer valid/correct after
-/// RewriteStatepointsForGC has run. This is because semantically, after
-/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
-/// heap. stripNonValidData (conservatively) restores
-/// correctness by erasing all attributes in the module that externally imply
-/// dereferenceability. Similar reasoning also applies to the noalias
-/// attributes and metadata. gc.statepoint can touch the entire heap including
-/// noalias objects.
-/// Apart from attributes and metadata, we also remove instructions that imply
-/// constant physical memory: llvm.invariant.start.
-static void stripNonValidData(Module &M);
-
-static bool shouldRewriteStatepointsIn(Function &F);
-
-PreservedAnalyses RewriteStatepointsForGC::run(Module &M,
- ModuleAnalysisManager &AM) {
- bool Changed = false;
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- for (Function &F : M) {
- // Nothing to do for declarations.
- if (F.isDeclaration() || F.empty())
- continue;
-
- // Policy choice says not to rewrite - the most common reason is that we're
- // compiling code without a GCStrategy.
- if (!shouldRewriteStatepointsIn(F))
- continue;
-
- auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
- auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- Changed |= runOnFunction(F, DT, TTI, TLI);
- }
- if (!Changed)
- return PreservedAnalyses::all();
-
- // stripNonValidData asserts that shouldRewriteStatepointsIn
- // returns true for at least one function in the module. Since at least
- // one function changed, we know that the precondition is satisfied.
- stripNonValidData(M);
-
- PreservedAnalyses PA;
- PA.preserve<TargetIRAnalysis>();
- PA.preserve<TargetLibraryAnalysis>();
- return PA;
-}
-
-namespace {
-
-class RewriteStatepointsForGCLegacyPass : public ModulePass {
- RewriteStatepointsForGC Impl;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() {
- initializeRewriteStatepointsForGCLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- bool Changed = false;
- for (Function &F : M) {
- // Nothing to do for declarations.
- if (F.isDeclaration() || F.empty())
- continue;
-
- // Policy choice says not to rewrite - the most common reason is that
- // we're compiling code without a GCStrategy.
- if (!shouldRewriteStatepointsIn(F))
- continue;
-
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
-
- Changed |= Impl.runOnFunction(F, DT, TTI, TLI);
- }
-
- if (!Changed)
- return false;
-
- // stripNonValidData asserts that shouldRewriteStatepointsIn
- // returns true for at least one function in the module. Since at least
- // one function changed, we know that the precondition is satisfied.
- stripNonValidData(M);
- return true;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // We add and rewrite a bunch of instructions, but don't really do much
- // else. We could in theory preserve a lot more analyses here.
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char RewriteStatepointsForGCLegacyPass::ID = 0;
-
-ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() {
- return new RewriteStatepointsForGCLegacyPass();
-}
-
-INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass,
- "rewrite-statepoints-for-gc",
- "Make relocations explicit at statepoints", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass,
- "rewrite-statepoints-for-gc",
- "Make relocations explicit at statepoints", false, false)
-
-namespace {
-
-struct GCPtrLivenessData {
- /// Values defined in this block.
- MapVector<BasicBlock *, SetVector<Value *>> KillSet;
-
- /// Values used in this block (and thus live); does not include values
- /// killed within this block.
- MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
-
- /// Values live into this basic block (i.e. used by any
- /// instruction in this basic block or ones reachable from here)
- MapVector<BasicBlock *, SetVector<Value *>> LiveIn;
-
- /// Values live out of this basic block (i.e. live into
- /// any successor block)
- MapVector<BasicBlock *, SetVector<Value *>> LiveOut;
-};
-
-// The type of the internal cache used inside the findBasePointers family
- // of functions. From the caller's perspective, this is an opaque type and
-// should not be inspected.
-//
-// In the actual implementation this caches two relations:
-// - The base relation itself (i.e. this pointer is based on that one)
-// - The base defining value relation (i.e. before base_phi insertion)
-// Generally, after the execution of a full findBasePointer call, only the
-// base relation will remain. Internally, we add a mixture of the two
- // types, then update all entries of the second type to the first type.
-using DefiningValueMapTy = MapVector<Value *, Value *>;
-using StatepointLiveSetTy = SetVector<Value *>;
-using RematerializedValueMapTy =
- MapVector<AssertingVH<Instruction>, AssertingVH<Value>>;
-
-struct PartiallyConstructedSafepointRecord {
- /// The set of values known to be live across this safepoint
- StatepointLiveSetTy LiveSet;
-
- /// Mapping from live pointers to a base-defining-value
- MapVector<Value *, Value *> PointerToBase;
-
- /// The *new* gc.statepoint instruction itself. This produces the token
- /// that normal path gc.relocates and the gc.result are tied to.
- GCStatepointInst *StatepointToken;
-
- /// Instruction to which exceptional gc relocates are attached
- /// Makes it easier to iterate through them during relocationViaAlloca.
- Instruction *UnwindToken;
-
- /// Record live values we rematerialize instead of relocating.
- /// They are not included in the 'LiveSet' field.
- /// Maps each rematerialized copy to its original value.
- RematerializedValueMapTy RematerializedValues;
-};
-
-} // end anonymous namespace
-
-static ArrayRef<Use> GetDeoptBundleOperands(const CallBase *Call) {
- Optional<OperandBundleUse> DeoptBundle =
- Call->getOperandBundle(LLVMContext::OB_deopt);
-
- if (!DeoptBundle.hasValue()) {
- assert(AllowStatepointWithNoDeoptInfo &&
- "Found non-leaf call without deopt info!");
- return None;
- }
-
- return DeoptBundle.getValue().Inputs;
-}
-
-/// Compute the live-in set for every basic block in the function
-static void computeLiveInValues(DominatorTree &DT, Function &F,
- GCPtrLivenessData &Data);
-
-/// Given results from the dataflow liveness computation, find the set of live
-/// Values at a particular instruction.
-static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
- StatepointLiveSetTy &out);
-
-// TODO: Once we can get to the GCStrategy, this becomes
-// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
-
-static bool isGCPointerType(Type *T) {
- if (auto *PT = dyn_cast<PointerType>(T))
- // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
- // GC managed heap. We know that a pointer into this heap needs to be
- // updated and that no other pointer does.
- return PT->getAddressSpace() == 1;
- return false;
-}
-
-// Return true if this type is one which a) is a gc pointer or contains a GC
-// pointer and b) is of a type this code expects to encounter as a live value.
-// (The insertion code will assert that a type which matches (a) and not (b)
-// is not encountered.)
-static bool isHandledGCPointerType(Type *T) {
- // We fully support gc pointers
- if (isGCPointerType(T))
- return true;
- // We partially support vectors of gc pointers. The code will assert if it
- // can't handle something.
- if (auto VT = dyn_cast<VectorType>(T))
- if (isGCPointerType(VT->getElementType()))
- return true;
- return false;
-}
-
-#ifndef NDEBUG
-/// Returns true if this type contains a gc pointer whether we know how to
-/// handle that type or not.
-static bool containsGCPtrType(Type *Ty) {
- if (isGCPointerType(Ty))
- return true;
- if (VectorType *VT = dyn_cast<VectorType>(Ty))
- return isGCPointerType(VT->getScalarType());
- if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
- return containsGCPtrType(AT->getElementType());
- if (StructType *ST = dyn_cast<StructType>(Ty))
- return llvm::any_of(ST->elements(), containsGCPtrType);
- return false;
-}
-
-// Returns true if this is a type which a) is a gc pointer or contains a GC
-// pointer and b) is of a type which the code doesn't expect (i.e. first class
-// aggregates). Used to trip assertions.
-static bool isUnhandledGCPointerType(Type *Ty) {
- return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty);
-}
-#endif
-
- // Return the name of the value suffixed with the provided suffix, or, if the
- // value didn't have a name, the specified default name.
-static std::string suffixed_name_or(Value *V, StringRef Suffix,
- StringRef DefaultName) {
- return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str();
-}
-
-// Conservatively identifies any definitions which might be live at the
-// given instruction. The analysis is performed immediately before the
-// given instruction. Values defined by that instruction are not considered
-// live. Values used by that instruction are considered live.
-static void analyzeParsePointLiveness(
- DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData, CallBase *Call,
- PartiallyConstructedSafepointRecord &Result) {
- StatepointLiveSetTy LiveSet;
- findLiveSetAtInst(Call, OriginalLivenessData, LiveSet);
-
- if (PrintLiveSet) {
- dbgs() << "Live Variables:\n";
- for (Value *V : LiveSet)
- dbgs() << " " << V->getName() << " " << *V << "\n";
- }
- if (PrintLiveSetSize) {
- dbgs() << "Safepoint For: " << Call->getCalledOperand()->getName() << "\n";
- dbgs() << "Number live values: " << LiveSet.size() << "\n";
- }
- Result.LiveSet = LiveSet;
-}
-
- // Returns true if V is a knownBaseResult.
-static bool isKnownBaseResult(Value *V);
-
-// Returns true if V is a BaseResult that already exists in the IR, i.e. it is
-// not created by the findBasePointers algorithm.
-static bool isOriginalBaseResult(Value *V);
-
-namespace {
-
-/// A single base defining value - An immediate base defining value for an
-/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
-/// For instructions which have multiple pointer [vector] inputs or that
-/// transition between vector and scalar types, there is no immediate base
-/// defining value. The 'base defining value' for 'Def' is the transitive
-/// closure of this relation stopping at the first instruction which has no
-/// immediate base defining value. The b.d.v. might itself be a base pointer,
-/// but it can also be an arbitrary derived pointer.
-struct BaseDefiningValueResult {
- /// Contains the value which is the base defining value.
- Value * const BDV;
-
- /// True if the base defining value is also known to be an actual base
- /// pointer.
- const bool IsKnownBase;
-
- BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
- : BDV(BDV), IsKnownBase(IsKnownBase) {
-#ifndef NDEBUG
- // Check consistency between new and old means of checking whether a BDV is
- // a base.
- bool MustBeBase = isKnownBaseResult(BDV);
- assert(!MustBeBase || MustBeBase == IsKnownBase);
-#endif
- }
-};
-
-} // end anonymous namespace
-
-static BaseDefiningValueResult findBaseDefiningValue(Value *I);
-
-/// Return a base defining value for the 'Index' element of the given vector
-/// instruction 'I'. If Index is null, returns a BDV for the entire vector
-/// 'I'. As an optimization, this method will try to determine when the
-/// element is known to already be a base pointer. If this can be established,
-/// the second value in the returned pair will be true. Note that either a
-/// vector or a pointer typed value can be returned. For the former, the
-/// vector returned is a BDV (and possibly a base) of the entire vector 'I'.
- /// If the latter, the returned pointer is a BDV (or possibly a base) for the
-/// particular element in 'I'.
-static BaseDefiningValueResult
-findBaseDefiningValueOfVector(Value *I) {
- // Each case parallels findBaseDefiningValue below, see that code for
- // detailed motivation.
-
- if (isa<Argument>(I))
- // An incoming argument to the function is a base pointer
- return BaseDefiningValueResult(I, true);
-
- if (isa<Constant>(I))
- // Base of constant vector consists only of constant null pointers.
- // For reasoning see similar case inside 'findBaseDefiningValue' function.
- return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()),
- true);
-
- if (isa<LoadInst>(I))
- return BaseDefiningValueResult(I, true);
-
- if (isa<InsertElementInst>(I))
- // We don't know whether this vector contains entirely base pointers or
- // not. To be conservatively correct, we treat it as a BDV and will
- // duplicate code as needed to construct a parallel vector of bases.
- return BaseDefiningValueResult(I, false);
-
- if (isa<ShuffleVectorInst>(I))
- // We don't know whether this vector contains entirely base pointers or
- // not. To be conservatively correct, we treat it as a BDV and will
- // duplicate code as needed to construct a parallel vector of bases.
- // TODO: There are a number of local optimizations which could be applied here
- // for particular shufflevector patterns.
- return BaseDefiningValueResult(I, false);
-
- // The behavior of getelementptr instructions is the same for vector and
- // non-vector data types.
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
- return findBaseDefiningValue(GEP->getPointerOperand());
-
- // If the pointer comes through a bitcast of a vector of pointers to
- // a vector of another type of pointer, then look through the bitcast
- if (auto *BC = dyn_cast<BitCastInst>(I))
- return findBaseDefiningValue(BC->getOperand(0));
-
- // We assume that functions in the source language only return base
- // pointers. This should probably be generalized via attributes to support
- // both source language and internal functions.
- if (isa<CallInst>(I) || isa<InvokeInst>(I))
- return BaseDefiningValueResult(I, true);
-
- // A PHI or Select is a base defining value. The outer findBasePointer
- // algorithm is responsible for constructing a base value for this BDV.
- assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
- "unknown vector instruction - no base found for vector element");
- return BaseDefiningValueResult(I, false);
-}
-
-/// Helper function for findBasePointer - Will return a value which either a)
-/// defines the base pointer for the input, b) blocks the simple search
-/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
-/// from pointer to vector type or back.
-static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
- assert(I->getType()->isPtrOrPtrVectorTy() &&
- "Illegal to ask for the base pointer of a non-pointer type");
-
- if (I->getType()->isVectorTy())
- return findBaseDefiningValueOfVector(I);
-
- if (isa<Argument>(I))
- // An incoming argument to the function is a base pointer
- // We should never have reached here if this argument isn't a gc value.
- return BaseDefiningValueResult(I, true);
-
- if (isa<Constant>(I)) {
- // We assume that objects with a constant base (e.g. a global) can't move
- // and don't need to be reported to the collector because they are always
- // live. Besides global references, all kinds of constants (e.g. undef,
- // constant expressions, null pointers) can be introduced by the inliner or
- // the optimizer, especially on dynamically dead paths.
- // Here we treat all of them as having a single null base. By doing this we
- // are trying to avoid problems reporting various conflicts in the form of
- // "phi (const1, const2)" or "phi (const, regular gc ptr)".
- // See constant.ll file for relevant test cases.
-
- return BaseDefiningValueResult(
- ConstantPointerNull::get(cast<PointerType>(I->getType())), true);
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(I)) {
- Value *Def = CI->stripPointerCasts();
- // If stripping pointer casts changes the address space there is an
- // addrspacecast in between.
- assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
- cast<PointerType>(CI->getType())->getAddressSpace() &&
- "unsupported addrspacecast");
- // If we find a cast instruction here, it means we've found a cast which is
- // not simply a pointer cast (i.e. an inttoptr). We don't know how to
- // handle int->ptr conversion.
- assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
- return findBaseDefiningValue(Def);
- }
-
- if (isa<LoadInst>(I))
- // The value loaded is a gc base itself.
- return BaseDefiningValueResult(I, true);
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
- // The base of this GEP is the base
- return findBaseDefiningValue(GEP->getPointerOperand());
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- // fall through to general call handling
- break;
- case Intrinsic::experimental_gc_statepoint:
- llvm_unreachable("statepoints don't produce pointers");
- case Intrinsic::experimental_gc_relocate:
- // Rerunning safepoint insertion after safepoints are already
- // inserted is not supported. It could probably be made to work,
- // but why are you doing this? There's no good reason.
- llvm_unreachable("repeat safepoint insertion is not supported");
- case Intrinsic::gcroot:
- // Currently, this mechanism hasn't been extended to work with gcroot.
- // There's no reason it couldn't be, but I haven't thought about the
- // implications much.
- llvm_unreachable(
- "interaction with the gcroot mechanism is not supported");
- }
- }
- // We assume that functions in the source language only return base
- // pointers. This should probably be generalized via attributes to support
- // both source language and internal functions.
- if (isa<CallInst>(I) || isa<InvokeInst>(I))
- return BaseDefiningValueResult(I, true);
-
- // TODO: I have absolutely no idea how to implement this part yet. It's not
- // necessarily hard, I just haven't really looked at it yet.
- assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
-
- if (isa<AtomicCmpXchgInst>(I))
- // A CAS is effectively an atomic store and load combined under a
- // predicate. From the perspective of base pointers, we just treat it
- // like a load.
- return BaseDefiningValueResult(I, true);
-
- assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
- "binary ops which don't apply to pointers");
-
- // The aggregate ops. Aggregates can either be in the heap or on the
- // stack, but in either case, this is simply a field load. As a result,
- // this is a definition of the base just like a load is.
- if (isa<ExtractValueInst>(I))
- return BaseDefiningValueResult(I, true);
-
- // We should never see an insert vector since that would require we be
- // tracing back a struct value not a pointer value.
- assert(!isa<InsertValueInst>(I) &&
- "Base pointer for a struct is meaningless");
-
- // An extractelement produces a base result exactly when its input does.
- // We may need to insert a parallel instruction to extract the appropriate
- // element out of the base vector corresponding to the input. Given this,
- // it's analogous to the phi and select case even though it's not a merge.
- if (isa<ExtractElementInst>(I))
- // Note: There are a lot of obvious peephole cases here. These are deliberately
- // handled after the main base pointer inference algorithm to make writing
- // test cases to exercise that code easier.
- return BaseDefiningValueResult(I, false);
-
- // The last two cases here don't return a base pointer. Instead, they
- // return a value which dynamically selects from among several base
- // derived pointers (each with its own base potentially). It's the job of
- // the caller to resolve these.
- assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
- "missing instruction case in findBaseDefiningValing");
- return BaseDefiningValueResult(I, false);
-}
-
-/// Returns the base defining value for this value.
-static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
- Value *&Cached = Cache[I];
- if (!Cached) {
- Cached = findBaseDefiningValue(I).BDV;
- LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
- << Cached->getName() << "\n");
- }
- assert(Cache[I] != nullptr);
- return Cached;
-}
-
-/// Return a base pointer for this value if known. Otherwise, return its
-/// base defining value.
-static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
- Value *Def = findBaseDefiningValueCached(I, Cache);
- auto Found = Cache.find(Def);
- if (Found != Cache.end()) {
- // Either a base-of relation, or a self reference. Caller must check.
- return Found->second;
- }
- // Only a BDV available
- return Def;
-}
-
-/// This value is a base pointer that is not generated by RS4GC, i.e. it already
-/// exists in the code.
-static bool isOriginalBaseResult(Value *V) {
- // no recursion possible
- return !isa<PHINode>(V) && !isa<SelectInst>(V) &&
- !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
- !isa<ShuffleVectorInst>(V);
-}
-
-/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
-/// is it known to be a base pointer? Or do we need to continue searching.
-static bool isKnownBaseResult(Value *V) {
- if (isOriginalBaseResult(V))
- return true;
- if (isa<Instruction>(V) &&
- cast<Instruction>(V)->getMetadata("is_base_value")) {
- // This is a previously inserted base phi or select. We know
- // that this is a base value.
- return true;
- }
-
- // We need to keep searching
- return false;
-}
-
-// Returns true if First and Second values are both scalar or both vector.
-static bool areBothVectorOrScalar(Value *First, Value *Second) {
- return isa<VectorType>(First->getType()) ==
- isa<VectorType>(Second->getType());
-}
-
-namespace {
-
-/// Models the state of a single base defining value in the findBasePointer
-/// algorithm for determining where a new instruction is needed to propagate
-/// the base of this BDV.
-class BDVState {
-public:
- enum Status { Unknown, Base, Conflict };
-
- BDVState() : BaseValue(nullptr) {}
-
- explicit BDVState(Status Status, Value *BaseValue = nullptr)
- : Status(Status), BaseValue(BaseValue) {
- assert(Status != Base || BaseValue);
- }
-
- explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {}
-
- Status getStatus() const { return Status; }
- Value *getBaseValue() const { return BaseValue; }
-
- bool isBase() const { return getStatus() == Base; }
- bool isUnknown() const { return getStatus() == Unknown; }
- bool isConflict() const { return getStatus() == Conflict; }
-
- bool operator==(const BDVState &Other) const {
- return BaseValue == Other.BaseValue && Status == Other.Status;
- }
-
- bool operator!=(const BDVState &other) const { return !(*this == other); }
-
- LLVM_DUMP_METHOD
- void dump() const {
- print(dbgs());
- dbgs() << '\n';
- }
-
- void print(raw_ostream &OS) const {
- switch (getStatus()) {
- case Unknown:
- OS << "U";
- break;
- case Base:
- OS << "B";
- break;
- case Conflict:
- OS << "C";
- break;
- }
- OS << " (" << getBaseValue() << " - "
- << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
- }
-
-private:
- Status Status = Unknown;
- AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
-};
-
-} // end anonymous namespace
-
-#ifndef NDEBUG
-static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
- State.print(OS);
- return OS;
-}
-#endif
-
-static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
- switch (LHS.getStatus()) {
- case BDVState::Unknown:
- return RHS;
-
- case BDVState::Base:
- assert(LHS.getBaseValue() && "can't be null");
- if (RHS.isUnknown())
- return LHS;
-
- if (RHS.isBase()) {
- if (LHS.getBaseValue() == RHS.getBaseValue()) {
- assert(LHS == RHS && "equality broken!");
- return LHS;
- }
- return BDVState(BDVState::Conflict);
- }
- assert(RHS.isConflict() && "only three states!");
- return BDVState(BDVState::Conflict);
-
- case BDVState::Conflict:
- return LHS;
- }
- llvm_unreachable("only three states!");
-}
-
-// Values of type BDVState form a lattice, and this function implements the meet
-// operation.
-static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
- BDVState Result = meetBDVStateImpl(LHS, RHS);
- assert(Result == meetBDVStateImpl(RHS, LHS) &&
- "Math is wrong: meet does not commute!");
- return Result;
-}
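-
-// A condensed sketch of the meet implemented above, for reference
-// (U = Unknown, B(x) = Base with value x, C = Conflict, S = any state):
-//   meet(U, S)       = S
-//   meet(B(x), B(x)) = B(x)
-//   meet(B(x), B(y)) = C      when x != y
-//   meet(C, S)       = C
-// This is the three-level lattice sketched in findBasePointer below.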
-
-/// For a given value or instruction, figure out what base ptr it's derived from.
-/// For gc objects, this is simply itself. On success, returns a value which is
-/// the base pointer. (This is reliable and can be used for relocation.) On
-/// failure, returns nullptr.
-static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
- Value *Def = findBaseOrBDV(I, Cache);
-
- if (isKnownBaseResult(Def) && areBothVectorOrScalar(Def, I))
- return Def;
-
- // Here's the rough algorithm:
- // - For every SSA value, construct a mapping to either an actual base
- // pointer or a PHI which obscures the base pointer.
- // - Construct a mapping from PHI to unknown TOP state. Use an
- // optimistic algorithm to propagate base pointer information. Lattice
- // looks like:
- // UNKNOWN
- // b1 b2 b3 b4
- // CONFLICT
- // When algorithm terminates, all PHIs will either have a single concrete
- // base or be in a conflict state.
- // - For every conflict, insert a dummy PHI node without arguments. Add
- // these to the base[Instruction] = BasePtr mapping. For every
- // non-conflict, add the actual base.
- // - For every conflict, add arguments for the base[a] of each input
- // argument.
- //
- // Note: A simpler form of this would be to add the conflict form of all
- // PHIs without running the optimistic algorithm. This would be
- // analogous to pessimistic data flow and would likely lead to an
- // overall worse solution.
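- //
- // As a rough, hypothetical illustration (the IR and names below are
- // invented purely for exposition):
- //   entry:
- //     %d.a = getelementptr i8, i8 addrspace(1)* %a, i64 4
- //     br label %loop
- //   loop:
- //     %d = phi i8 addrspace(1)* [ %d.a, %entry ], [ %d.next, %loop ]
- //     %d.next = getelementptr i8, i8 addrspace(1)* %d, i64 8
- //     br i1 %cond, label %loop, label %exit
- // Neither %d nor %d.next is known to be a base, so the algorithm inserts
- //   %d.base = phi i8 addrspace(1)* [ %a, %entry ], [ %d.base, %loop ]
- // (tagged with !is_base_value metadata) and records base[%d] = %d.base.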
-
-#ifndef NDEBUG
- auto isExpectedBDVType = [](Value *BDV) {
- return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
- isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV) ||
- isa<ShuffleVectorInst>(BDV);
- };
-#endif
-
- // Once populated, will contain a mapping from each potentially non-base BDV
- // to a lattice value (described above) which corresponds to that BDV.
- // We use the order of insertion (DFS over the def/use graph) to provide a
- // stable deterministic ordering for visiting DenseMaps (which are unordered)
- // below. This is important for deterministic compilation.
- MapVector<Value *, BDVState> States;
-
- // Recursively fill in all base defining values reachable from the initial
- // one for which we don't already know a definite base value
- /* scope */ {
- SmallVector<Value*, 16> Worklist;
- Worklist.push_back(Def);
- States.insert({Def, BDVState()});
- while (!Worklist.empty()) {
- Value *Current = Worklist.pop_back_val();
- assert(!isOriginalBaseResult(Current) && "why did it get added?");
-
- auto visitIncomingValue = [&](Value *InVal) {
- Value *Base = findBaseOrBDV(InVal, Cache);
- if (isKnownBaseResult(Base) && areBothVectorOrScalar(Base, InVal))
- // Known bases won't need new instructions introduced and can be
- // ignored safely. However, this can only be done when InVal and Base
- // are both scalar or both vector. Otherwise, we need to find a
- // correct BDV for InVal, by creating an entry in the lattice
- // (States).
- return;
- assert(isExpectedBDVType(Base) && "the only non-base values "
- "we see should be base defining values");
- if (States.insert(std::make_pair(Base, BDVState())).second)
- Worklist.push_back(Base);
- };
- if (PHINode *PN = dyn_cast<PHINode>(Current)) {
- for (Value *InVal : PN->incoming_values())
- visitIncomingValue(InVal);
- } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) {
- visitIncomingValue(SI->getTrueValue());
- visitIncomingValue(SI->getFalseValue());
- } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
- visitIncomingValue(EE->getVectorOperand());
- } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
- visitIncomingValue(IE->getOperand(0)); // vector operand
- visitIncomingValue(IE->getOperand(1)); // scalar operand
- } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Current)) {
- visitIncomingValue(SV->getOperand(0));
- visitIncomingValue(SV->getOperand(1));
- } else {
- llvm_unreachable("Unimplemented instruction case");
- }
- }
- }
-
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "States after initialization:\n");
- for (auto Pair : States) {
- LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
- }
-#endif
-
- // Return a phi state for a base defining value. We'll generate a new
- // base state for known bases and expect to find a cached state otherwise.
- auto GetStateForBDV = [&](Value *BaseValue, Value *Input) {
- if (isKnownBaseResult(BaseValue) && areBothVectorOrScalar(BaseValue, Input))
- return BDVState(BaseValue);
- auto I = States.find(BaseValue);
- assert(I != States.end() && "lookup failed!");
- return I->second;
- };
-
- bool Progress = true;
- while (Progress) {
-#ifndef NDEBUG
- const size_t OldSize = States.size();
-#endif
- Progress = false;
- // We're only changing values in this loop, thus safe to keep iterators.
- // Since this is computing a fixed point, the order of visit does not
- // affect the result. TODO: We could use a worklist here and make this run
- // much faster.
- for (auto Pair : States) {
- Value *BDV = Pair.first;
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(BDV) ||
- !areBothVectorOrScalar(BDV, Pair.second.getBaseValue())) &&
- "why did it get added?");
-
- // Given an input value for the current instruction, return a BDVState
- // instance which represents the BDV of that value.
- auto getStateForInput = [&](Value *V) mutable {
- Value *BDV = findBaseOrBDV(V, Cache);
- return GetStateForBDV(BDV, V);
- };
-
- BDVState NewState;
- if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) {
- NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue()));
- NewState =
- meetBDVState(NewState, getStateForInput(SI->getFalseValue()));
- } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) {
- for (Value *Val : PN->incoming_values())
- NewState = meetBDVState(NewState, getStateForInput(Val));
- } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
- // The 'meet' for an extractelement is essentially trivial, but it's still
- // useful in that it drives us to conflict if our input is.
- NewState =
- meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
- } else if (auto *IE = dyn_cast<InsertElementInst>(BDV)) {
- // Given there's an inherent type mismatch between the operands, this will
- // *always* produce Conflict.
- NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
- NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
- } else {
- // The only instance this does not return a Conflict is when both the
- // vector operands are the same vector.
- auto *SV = cast<ShuffleVectorInst>(BDV);
- NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(0)));
- NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(1)));
- }
-
- BDVState OldState = States[BDV];
- if (OldState != NewState) {
- Progress = true;
- States[BDV] = NewState;
- }
- }
-
- assert(OldSize == States.size() &&
- "fixed point shouldn't be adding any new nodes to state");
- }
-
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
- for (auto Pair : States) {
- LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
- }
-#endif
-
- // Handle all instructions that have a vector BDV, but the instruction itself
- // is of scalar type.
- for (auto Pair : States) {
- Instruction *I = cast<Instruction>(Pair.first);
- BDVState State = Pair.second;
- auto *BaseValue = State.getBaseValue();
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, BaseValue)) &&
- "why did it get added?");
- assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
-
- if (!State.isBase() || !isa<VectorType>(BaseValue->getType()))
- continue;
- // extractelement instructions are a bit special in that we may need to
- // insert an extract even when we know an exact base for the instruction.
- // The problem is that we need to convert from a vector base to a scalar
- // base for the particular index we're interested in.
- if (isa<ExtractElementInst>(I)) {
- auto *EE = cast<ExtractElementInst>(I);
- // TODO: In many cases, the new instruction is just EE itself. We should
- // exploit this, but can't do it here since it would break the invariant
- // about the BDV not being known to be a base.
- auto *BaseInst = ExtractElementInst::Create(
- State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
- BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
- States[I] = BDVState(BDVState::Base, BaseInst);
- } else if (!isa<VectorType>(I->getType())) {
- // We need to handle cases that have a vector base but the instruction is
- // a scalar type (these could be phis or selects or any instruction of
- // scalar type, but the base can be a vector type). We
- // conservatively set this as conflict. Setting the base value for these
- // conflicts is handled in the next loop which traverses States.
- States[I] = BDVState(BDVState::Conflict);
- }
- }
-
- // Insert Phis for all conflicts
- // TODO: adjust naming patterns to avoid this order of iteration dependency
- for (auto Pair : States) {
- Instruction *I = cast<Instruction>(Pair.first);
- BDVState State = Pair.second;
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, State.getBaseValue())) &&
- "why did it get added?");
- assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
-
- // Since we're joining a vector and scalar base, they can never be the
- // same. As a result, we should always see insert element having reached
- // the conflict state.
- assert(!isa<InsertElementInst>(I) || State.isConflict());
-
- if (!State.isConflict())
- continue;
-
- /// Create and insert a new instruction which will represent the base of
- /// the given instruction 'I'.
- auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
- if (isa<PHINode>(I)) {
- BasicBlock *BB = I->getParent();
- int NumPreds = pred_size(BB);
- assert(NumPreds > 0 && "how did we reach here");
- std::string Name = suffixed_name_or(I, ".base", "base_phi");
- return PHINode::Create(I->getType(), NumPreds, Name, I);
- } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- // The undef will be replaced later
- UndefValue *Undef = UndefValue::get(SI->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_select");
- return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI);
- } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_ee");
- return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
- EE);
- } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
- UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_ie");
- return InsertElementInst::Create(VecUndef, ScalarUndef,
- IE->getOperand(2), Name, IE);
- } else {
- auto *SV = cast<ShuffleVectorInst>(I);
- UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_sv");
- return new ShuffleVectorInst(VecUndef, VecUndef, SV->getShuffleMask(),
- Name, SV);
- }
- };
- Instruction *BaseInst = MakeBaseInstPlaceholder(I);
- // Add metadata marking this as a base value
- BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
- States[I] = BDVState(BDVState::Conflict, BaseInst);
- }
-
- // Returns an instruction which produces the base pointer for a given
- // instruction. The instruction is assumed to be an input to one of the BDVs
- // seen in the inference algorithm above. As such, we must either already
- // know its base defining value is a base, or have inserted a new
- // instruction to propagate the base of its BDV and have entered that newly
- // introduced instruction into the state table. In either case, we are
- // assured to be able to determine an instruction which produces its base
- // pointer.
- auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
- Value *BDV = findBaseOrBDV(Input, Cache);
- Value *Base = nullptr;
- if (isKnownBaseResult(BDV) && areBothVectorOrScalar(BDV, Input)) {
- Base = BDV;
- } else {
- // Either conflict or base.
- assert(States.count(BDV));
- Base = States[BDV].getBaseValue();
- }
- assert(Base && "Can't be null");
- // The cast is needed since base traversal may strip away bitcasts
- if (Base->getType() != Input->getType() && InsertPt)
- Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
- return Base;
- };
-
- // Fix up all the inputs of the new PHIs. Visit order needs to be
- // deterministic and predictable because we're naming newly created
- // instructions.
- for (auto Pair : States) {
- Instruction *BDV = cast<Instruction>(Pair.first);
- BDVState State = Pair.second;
-
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(BDV) ||
- !areBothVectorOrScalar(BDV, State.getBaseValue())) &&
- "why did it get added?");
- assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
- if (!State.isConflict())
- continue;
-
- if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) {
- PHINode *PN = cast<PHINode>(BDV);
- unsigned NumPHIValues = PN->getNumIncomingValues();
- for (unsigned i = 0; i < NumPHIValues; i++) {
- Value *InVal = PN->getIncomingValue(i);
- BasicBlock *InBB = PN->getIncomingBlock(i);
-
- // If we've already seen InBB, add the same incoming value
- // we added for it earlier. The IR verifier requires phi
- // nodes with multiple entries from the same basic block
- // to have the same incoming value for each of those
- // entries. If we don't do this check here and basephi
- // has a different type than base, we'll end up adding two
- // bitcasts (and hence two distinct values) as incoming
- // values for the same basic block.
-
- int BlockIndex = BasePHI->getBasicBlockIndex(InBB);
- if (BlockIndex != -1) {
- Value *OldBase = BasePHI->getIncomingValue(BlockIndex);
- BasePHI->addIncoming(OldBase, InBB);
-
-#ifndef NDEBUG
- Value *Base = getBaseForInput(InVal, nullptr);
- // In essence this assert states: the only way two values
- // incoming from the same basic block may be different is by
- // being different bitcasts of the same value. A cleanup
- // that remains TODO is changing findBaseOrBDV to return an
- // llvm::Value of the correct type (and still remain pure).
- // This will remove the need to add bitcasts.
- assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
- "Sanity -- findBaseOrBDV should be pure!");
-#endif
- continue;
- }
-
- // Find the instruction which produces the base for each input. We may
- // need to insert a bitcast in the incoming block.
- // TODO: Need to split critical edges if insertion is needed
- Value *Base = getBaseForInput(InVal, InBB->getTerminator());
- BasePHI->addIncoming(Base, InBB);
- }
- assert(BasePHI->getNumIncomingValues() == NumPHIValues);
- } else if (SelectInst *BaseSI =
- dyn_cast<SelectInst>(State.getBaseValue())) {
- SelectInst *SI = cast<SelectInst>(BDV);
-
- // Find the instruction which produces the base for each input.
- // We may need to insert a bitcast.
- BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI));
- BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI));
- } else if (auto *BaseEE =
- dyn_cast<ExtractElementInst>(State.getBaseValue())) {
- Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
- // Find the instruction which produces the base for each input. We may
- // need to insert a bitcast.
- BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
- } else if (auto *BaseIE = dyn_cast<InsertElementInst>(State.getBaseValue())) {
- auto *BdvIE = cast<InsertElementInst>(BDV);
- auto UpdateOperand = [&](int OperandIdx) {
- Value *InVal = BdvIE->getOperand(OperandIdx);
- Value *Base = getBaseForInput(InVal, BaseIE);
- BaseIE->setOperand(OperandIdx, Base);
- };
- UpdateOperand(0); // vector operand
- UpdateOperand(1); // scalar operand
- } else {
- auto *BaseSV = cast<ShuffleVectorInst>(State.getBaseValue());
- auto *BdvSV = cast<ShuffleVectorInst>(BDV);
- auto UpdateOperand = [&](int OperandIdx) {
- Value *InVal = BdvSV->getOperand(OperandIdx);
- Value *Base = getBaseForInput(InVal, BaseSV);
- BaseSV->setOperand(OperandIdx, Base);
- };
- UpdateOperand(0); // vector operand
- UpdateOperand(1); // vector operand
- }
- }
-
- // Cache all of our results so we can cheaply reuse them
- // NOTE: This is actually two caches: one of the base defining value
- // relation and one of the base pointer relation! FIXME
- for (auto Pair : States) {
- auto *BDV = Pair.first;
- Value *Base = Pair.second.getBaseValue();
- assert(BDV && Base);
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(BDV) || !areBothVectorOrScalar(BDV, Base)) &&
- "why did it get added?");
-
- LLVM_DEBUG(
- dbgs() << "Updating base value cache"
- << " for: " << BDV->getName() << " from: "
- << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
- << " to: " << Base->getName() << "\n");
-
- if (Cache.count(BDV)) {
- assert(isKnownBaseResult(Base) &&
- "must be something we 'know' is a base pointer");
- // Once we transition from the BDV relation being stored in the Cache to
- // the base relation being stored, it must be stable
- assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) &&
- "base relation should be stable");
- }
- Cache[BDV] = Base;
- }
- assert(Cache.count(Def));
- return Cache[Def];
-}
-
-// For a set of live pointers (base and/or derived), identify the base
-// pointer of the object which they are derived from. This routine will
-// mutate the IR graph as needed to make the 'base' pointer live at the
-// definition site of 'derived'. This ensures that any use of 'derived' can
-// also use 'base'. This may involve the insertion of a number of
-// additional PHI nodes.
-//
-// preconditions: live is a set of pointer type Values
-//
-// side effects: may insert PHI nodes into the existing CFG, will preserve
-// CFG, will not remove or mutate any existing nodes
-//
-// post condition: PointerToBase contains one (derived, base) pair for every
-// pointer in live. Note that derived can be equal to base if the original
-// pointer was a base pointer.
-static void
-findBasePointers(const StatepointLiveSetTy &live,
- MapVector<Value *, Value *> &PointerToBase,
- DominatorTree *DT, DefiningValueMapTy &DVCache) {
- for (Value *ptr : live) {
- Value *base = findBasePointer(ptr, DVCache);
- assert(base && "failed to find base pointer");
- PointerToBase[ptr] = base;
- assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
- DT->dominates(cast<Instruction>(base)->getParent(),
- cast<Instruction>(ptr)->getParent())) &&
- "The base we found better dominate the derived pointer");
- }
-}
-
-/// Find the required base pointers (and adjust the live set) for the given
-/// parse point.
-static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
- CallBase *Call,
- PartiallyConstructedSafepointRecord &result) {
- MapVector<Value *, Value *> PointerToBase;
- findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
-
- if (PrintBasePointers) {
- errs() << "Base Pairs (w/o Relocation):\n";
- for (auto &Pair : PointerToBase) {
- errs() << " derived ";
- Pair.first->printAsOperand(errs(), false);
- errs() << " base ";
- Pair.second->printAsOperand(errs(), false);
- errs() << "\n";
- }
- }
-
- result.PointerToBase = PointerToBase;
-}
-
-/// Given an updated version of the dataflow liveness results, update the
-/// liveset and base pointer maps for the call site CS.
-static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
- CallBase *Call,
- PartiallyConstructedSafepointRecord &result);
-
-static void recomputeLiveInValues(
- Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
- MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
- // TODO-PERF: reuse the original liveness, then simply run the dataflow
- // again. The old values are still live and will help it stabilize quickly.
- GCPtrLivenessData RevisedLivenessData;
- computeLiveInValues(DT, F, RevisedLivenessData);
- for (size_t i = 0; i < records.size(); i++) {
- struct PartiallyConstructedSafepointRecord &info = records[i];
- recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info);
- }
-}
-
-// When inserting gc.relocate and gc.result calls, we need to ensure there are
-// no uses of the original value / return value between the gc.statepoint and
-// the gc.relocate / gc.result call. One case which can arise is a phi node
-// starting one of the successor blocks. We also need to be able to insert the
-// gc.relocates only on the path which goes through the statepoint. We might
-// need to split an edge to make this possible.
-static BasicBlock *
-normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
- DominatorTree &DT) {
- BasicBlock *Ret = BB;
- if (!BB->getUniquePredecessor())
- Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);
-
- // Now that 'Ret' has a unique predecessor we can safely remove all phi nodes
- // from it
- FoldSingleEntryPHINodes(Ret);
- assert(!isa<PHINode>(Ret->begin()) &&
- "All PHI nodes should have been removed!");
-
- // At this point, we can safely insert a gc.relocate or gc.result as the first
- // instruction in Ret if needed.
- return Ret;
-}
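-
-// As a rough sketch (block names invented): if the invoke's normal dest
-// %normal has other predecessors besides the invoke block, the CFG becomes
-//   invoke ... to label %normal.split unwind label %lpad
-//   normal.split:                ; sole predecessor is the invoke block
-//     ; gc.relocates / gc.result are inserted here
-//     br label %normal
-// so the relocations only exist on the path which goes through the statepoint.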
-
-// Create a new attribute set containing only attributes which can be
-// transferred from the original call to the safepoint.
-static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
- AttributeList AL) {
- if (AL.isEmpty())
- return AL;
-
- // Remove the readonly, readnone, and statepoint function attributes.
- AttrBuilder FnAttrs = AL.getFnAttributes();
- FnAttrs.removeAttribute(Attribute::ReadNone);
- FnAttrs.removeAttribute(Attribute::ReadOnly);
- for (Attribute A : AL.getFnAttributes()) {
- if (isStatepointDirectiveAttr(A))
- FnAttrs.remove(A);
- }
-
- // Just skip parameter and return attributes for now
- return AttributeList::get(Ctx, AttributeList::FunctionIndex,
- AttributeSet::get(Ctx, FnAttrs));
-}
-
-/// Helper function to place all gc relocates necessary for the given
-/// statepoint.
-/// Inputs:
-/// liveVariables - list of variables to be relocated.
-/// basePtrs - base pointers.
-/// statepointToken - statepoint instruction to which relocates should be
-/// bound.
-/// Builder - LLVM IR builder to be used to construct new calls.
-static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
- ArrayRef<Value *> BasePtrs,
- Instruction *StatepointToken,
- IRBuilder<> &Builder) {
- if (LiveVariables.empty())
- return;
-
- auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
- auto ValIt = llvm::find(LiveVec, Val);
- assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
- size_t Index = std::distance(LiveVec.begin(), ValIt);
- assert(Index < LiveVec.size() && "Bug in std::find?");
- return Index;
- };
- Module *M = StatepointToken->getModule();
-
- // All gc_relocate calls are generated as i8 addrspace(1)* (or a vector type whose
- // element type is i8 addrspace(1)*). We originally generated unique
- // declarations for each pointer type, but this proved problematic because
- // the intrinsic mangling code is incomplete and fragile. Since we're moving
- // towards a single unified pointer type anyways, we can just cast everything
- // to an i8* of the right address space. A bitcast is added later to convert
- // gc_relocate to the actual value's type.
- auto getGCRelocateDecl = [&] (Type *Ty) {
- assert(isHandledGCPointerType(Ty));
- auto AS = Ty->getScalarType()->getPointerAddressSpace();
- Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
- if (auto *VT = dyn_cast<VectorType>(Ty))
- NewTy = FixedVectorType::get(NewTy,
- cast<FixedVectorType>(VT)->getNumElements());
- return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
- {NewTy});
- };
-
- // Lazily populated map from input types to the canonicalized form mentioned
- // in the comment above. This should probably be cached somewhere more
- // broadly.
- DenseMap<Type *, Function *> TypeToDeclMap;
-
- for (unsigned i = 0; i < LiveVariables.size(); i++) {
- // Generate the gc.relocate call and save the result
- Value *BaseIdx = Builder.getInt32(FindIndex(LiveVariables, BasePtrs[i]));
- Value *LiveIdx = Builder.getInt32(i);
-
- Type *Ty = LiveVariables[i]->getType();
- if (!TypeToDeclMap.count(Ty))
- TypeToDeclMap[Ty] = getGCRelocateDecl(Ty);
- Function *GCRelocateDecl = TypeToDeclMap[Ty];
-
- // only specify a debug name if we can give a useful one
- CallInst *Reloc = Builder.CreateCall(
- GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
- suffixed_name_or(LiveVariables[i], ".relocated", ""));
- // Trick CodeGen into thinking there are lots of free registers at this
- // fake call.
- Reloc->setCallingConv(CallingConv::Cold);
- }
-}
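-
-// For reference, each call emitted above looks roughly like (names invented,
-// exact mangling comes from getGCRelocateDecl):
-//   %p.relocated = call coldcc i8 addrspace(1)*
-//       @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 9)
-// where the two i32 constants are the indices of the base and of the live
-// (derived) value, and a bitcast back to the original pointer type is added
-// later as described above.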
-
-namespace {
-
-/// This struct is used to defer RAUWs and `eraseFromParent`s. Using this
-/// avoids having to worry about keeping around dangling pointers to Values.
-class DeferredReplacement {
- AssertingVH<Instruction> Old;
- AssertingVH<Instruction> New;
- bool IsDeoptimize = false;
-
- DeferredReplacement() = default;
-
-public:
- static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
- assert(Old != New && Old && New &&
- "Cannot RAUW equal values or to / from null!");
-
- DeferredReplacement D;
- D.Old = Old;
- D.New = New;
- return D;
- }
-
- static DeferredReplacement createDelete(Instruction *ToErase) {
- DeferredReplacement D;
- D.Old = ToErase;
- return D;
- }
-
- static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) {
-#ifndef NDEBUG
- auto *F = cast<CallInst>(Old)->getCalledFunction();
- assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize &&
- "Only way to construct a deoptimize deferred replacement");
-#endif
- DeferredReplacement D;
- D.Old = Old;
- D.IsDeoptimize = true;
- return D;
- }
-
- /// Does the task represented by this instance.
- void doReplacement() {
- Instruction *OldI = Old;
- Instruction *NewI = New;
-
- assert(OldI != NewI && "Disallowed at construction?!");
- assert((!IsDeoptimize || !New) &&
- "Deoptimize intrinsics are not replaced!");
-
- Old = nullptr;
- New = nullptr;
-
- if (NewI)
- OldI->replaceAllUsesWith(NewI);
-
- if (IsDeoptimize) {
- // Note: we've inserted instructions, so the call to llvm.deoptimize may
- // not necessarily be followed by the matching return.
- auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
- new UnreachableInst(RI->getContext(), RI);
- RI->eraseFromParent();
- }
-
- OldI->eraseFromParent();
- }
-};
-
-} // end anonymous namespace
-
-static StringRef getDeoptLowering(CallBase *Call) {
- const char *DeoptLowering = "deopt-lowering";
- if (Call->hasFnAttr(DeoptLowering)) {
- // FIXME: Calls have a *really* confusing interface around attributes
- // with values.
- const AttributeList &CSAS = Call->getAttributes();
- if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
- return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
- .getValueAsString();
- Function *F = Call->getCalledFunction();
- assert(F && F->hasFnAttribute(DeoptLowering));
- return F->getFnAttribute(DeoptLowering).getValueAsString();
- }
- return "live-through";
-}
-
-static void
-makeStatepointExplicitImpl(CallBase *Call, /* to replace */
- const SmallVectorImpl<Value *> &BasePtrs,
- const SmallVectorImpl<Value *> &LiveVariables,
- PartiallyConstructedSafepointRecord &Result,
- std::vector<DeferredReplacement> &Replacements) {
- assert(BasePtrs.size() == LiveVariables.size());
-
- // Then go ahead and use the builder to actually do the inserts. We insert
- // immediately before the previous instruction under the assumption that all
- // arguments will be available here. We can't insert afterwards since we may
- // be replacing a terminator.
- IRBuilder<> Builder(Call);
-
- ArrayRef<Value *> GCArgs(LiveVariables);
- uint64_t StatepointID = StatepointDirectives::DefaultStatepointID;
- uint32_t NumPatchBytes = 0;
- uint32_t Flags = uint32_t(StatepointFlags::None);
-
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
+ cl::init(false));
+
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
+ cl::init(false));
+
+// Cost threshold measuring when it is profitable to rematerialize a value
+// instead
+// of relocating it
+static cl::opt<unsigned>
+RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
+ cl::init(6));
+
+#ifdef EXPENSIVE_CHECKS
+static bool ClobberNonLive = true;
+#else
+static bool ClobberNonLive = false;
+#endif
+
+static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
+ cl::location(ClobberNonLive),
+ cl::Hidden);
+
+static cl::opt<bool>
+ AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
+ cl::Hidden, cl::init(true));
+
+/// The IR fed into RewriteStatepointsForGC may have had attributes and
+/// metadata implying dereferenceability that are no longer valid/correct after
+/// RewriteStatepointsForGC has run. This is because semantically, after
+/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
+/// heap. stripNonValidData (conservatively) restores
+/// correctness by erasing all attributes in the module that externally imply
+/// dereferenceability. Similar reasoning also applies to the noalias
+/// attributes and metadata. gc.statepoint can touch the entire heap including
+/// noalias objects.
+/// Apart from attributes and metadata, we also remove instructions that imply
+/// constant physical memory: llvm.invariant.start.
+static void stripNonValidData(Module &M);
+
+static bool shouldRewriteStatepointsIn(Function &F);
+
+PreservedAnalyses RewriteStatepointsForGC::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ bool Changed = false;
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
+
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ Changed |= runOnFunction(F, DT, TTI, TLI);
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+
+ PreservedAnalyses PA;
+ PA.preserve<TargetIRAnalysis>();
+ PA.preserve<TargetLibraryAnalysis>();
+ return PA;
+}
+
+namespace {
+
+class RewriteStatepointsForGCLegacyPass : public ModulePass {
+ RewriteStatepointsForGC Impl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() {
+ initializeRewriteStatepointsForGCLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ bool Changed = false;
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that
+ // we're compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
+
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+
+ Changed |= Impl.runOnFunction(F, DT, TTI, TLI);
+ }
+
+ if (!Changed)
+ return false;
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We add and rewrite a bunch of instructions, but don't really do much
+ // else. We could in theory preserve a lot more analyses here.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char RewriteStatepointsForGCLegacyPass::ID = 0;
+
+ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() {
+ return new RewriteStatepointsForGCLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+
+namespace {
+
+struct GCPtrLivenessData {
+ /// Values defined in this block.
+ MapVector<BasicBlock *, SetVector<Value *>> KillSet;
+
+ /// Values used in this block (and thus live); does not include values
+ /// killed within this block.
+ MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
+
+ /// Values live into this basic block (i.e. used by any
+ /// instruction in this basic block or ones reachable from here)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveIn;
+
+ /// Values live out of this basic block (i.e. live into
+ /// any successor block)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveOut;
+};
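+
+// Roughly, computeLiveInValues solves the usual backward dataflow equations
+// over these sets (a sketch, not a restatement of the exact implementation):
+//   LiveOut(BB) = union over successors S of LiveIn(S)
+//   LiveIn(BB)  = LiveSet(BB) | (LiveOut(BB) - KillSet(BB))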
+
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the caller's perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add a mixture of the two
+// types, then update all entries of the second type to the first type
+using DefiningValueMapTy = MapVector<Value *, Value *>;
+using StatepointLiveSetTy = SetVector<Value *>;
+using RematerializedValueMapTy =
+ MapVector<AssertingVH<Instruction>, AssertingVH<Value>>;
+
+struct PartiallyConstructedSafepointRecord {
+ /// The set of values known to be live across this safepoint
+ StatepointLiveSetTy LiveSet;
+
+ /// Mapping from live pointers to a base-defining-value
+ MapVector<Value *, Value *> PointerToBase;
+
+ /// The *new* gc.statepoint instruction itself. This produces the token
+ /// that normal path gc.relocates and the gc.result are tied to.
+ GCStatepointInst *StatepointToken;
+
+ /// Instruction to which exceptional gc relocates are attached
+ /// Makes it easier to iterate through them during relocationViaAlloca.
+ Instruction *UnwindToken;
+
+ /// Record live values we rematerialize instead of relocating.
+ /// They are not included in the 'LiveSet' field.
+ /// Maps a rematerialized copy to its original value.
+ RematerializedValueMapTy RematerializedValues;
+};
+
+} // end anonymous namespace
+
+static ArrayRef<Use> GetDeoptBundleOperands(const CallBase *Call) {
+ Optional<OperandBundleUse> DeoptBundle =
+ Call->getOperandBundle(LLVMContext::OB_deopt);
+
+ if (!DeoptBundle.hasValue()) {
+ assert(AllowStatepointWithNoDeoptInfo &&
+ "Found non-leaf call without deopt info!");
+ return None;
+ }
+
+ return DeoptBundle.getValue().Inputs;
+}
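+
+// For reference, a call carrying deopt state looks roughly like
+//   call void @foo() [ "deopt"(i32 0, i64 %frame_slot) ]
+// and the Inputs returned above are the operands of that bundle (the operand
+// values here are made up for illustration).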
+
+/// Compute the live-in set for every basic block in the function
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data);
+
+/// Given results from the dataflow liveness computation, find the set of live
+/// Values at a particular instruction.
+static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &out);
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+
+static bool isGCPointerType(Type *T) {
+ if (auto *PT = dyn_cast<PointerType>(T))
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does.
+ return PT->getAddressSpace() == 1;
+ return false;
+}
+
+// Return true if this type is one which a) is a gc pointer or contains a GC
+// pointer and b) is of a type this code expects to encounter as a live value.
+// (The insertion code will assert that a type which matches (a) and not (b)
+// is not encountered.)
+static bool isHandledGCPointerType(Type *T) {
+ // We fully support gc pointers
+ if (isGCPointerType(T))
+ return true;
+ // We partially support vectors of gc pointers. The code will assert if it
+ // can't handle something.
+ if (auto VT = dyn_cast<VectorType>(T))
+ if (isGCPointerType(VT->getElementType()))
+ return true;
+ return false;
+}
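+
+// For example, i8 addrspace(1)* and <4 x i8 addrspace(1)*> are handled here,
+// while a first class aggregate such as { i8 addrspace(1)*, i64 } is not and
+// would trip the isUnhandledGCPointerType assertions below.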
+
+#ifndef NDEBUG
+/// Returns true if this type contains a gc pointer whether we know how to
+/// handle that type or not.
+static bool containsGCPtrType(Type *Ty) {
+ if (isGCPointerType(Ty))
+ return true;
+ if (VectorType *VT = dyn_cast<VectorType>(Ty))
+ return isGCPointerType(VT->getScalarType());
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+ return containsGCPtrType(AT->getElementType());
+ if (StructType *ST = dyn_cast<StructType>(Ty))
+ return llvm::any_of(ST->elements(), containsGCPtrType);
+ return false;
+}
+
+// Returns true if this is a type which a) is a gc pointer or contains a GC
+// pointer and b) is of a type which the code doesn't expect (i.e. first class
+// aggregates). Used to trip assertions.
+static bool isUnhandledGCPointerType(Type *Ty) {
+ return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty);
+}
+#endif
+
+// Return the name of the value suffixed with the provided suffix, or, if the
+// value didn't have a name, the specified default.
+static std::string suffixed_name_or(Value *V, StringRef Suffix,
+ StringRef DefaultName) {
+ return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str();
+}
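+
+// E.g. for a value named "obj" and Suffix ".relocated" this yields
+// "obj.relocated"; for an unnamed value it yields DefaultName.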
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live. Values used by that instruction are considered live.
+static void analyzeParsePointLiveness(
+ DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData, CallBase *Call,
+ PartiallyConstructedSafepointRecord &Result) {
+ StatepointLiveSetTy LiveSet;
+ findLiveSetAtInst(Call, OriginalLivenessData, LiveSet);
+
+ if (PrintLiveSet) {
+ dbgs() << "Live Variables:\n";
+ for (Value *V : LiveSet)
+ dbgs() << " " << V->getName() << " " << *V << "\n";
+ }
+ if (PrintLiveSetSize) {
+ dbgs() << "Safepoint For: " << Call->getCalledOperand()->getName() << "\n";
+ dbgs() << "Number live values: " << LiveSet.size() << "\n";
+ }
+ Result.LiveSet = LiveSet;
+}
+
+// Returns true if V is a knownBaseResult.
+static bool isKnownBaseResult(Value *V);
+
+// Returns true if V is a BaseResult that already exists in the IR, i.e. it is
+// not created by the findBasePointers algorithm.
+static bool isOriginalBaseResult(Value *V);
+
+namespace {
+
+/// A single base defining value - An immediate base defining value for an
+/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
+/// For instructions which have multiple pointer [vector] inputs or that
+/// transition between vector and scalar types, there is no immediate base
+/// defining value. The 'base defining value' for 'Def' is the transitive
+/// closure of this relation stopping at the first instruction which has no
+/// immediate base defining value. The b.d.v. might itself be a base pointer,
+/// but it can also be an arbitrary derived pointer.
+struct BaseDefiningValueResult {
+ /// Contains the value which is the base defining value.
+ Value * const BDV;
+
+ /// True if the base defining value is also known to be an actual base
+ /// pointer.
+ const bool IsKnownBase;
+
+ BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
+ : BDV(BDV), IsKnownBase(IsKnownBase) {
+#ifndef NDEBUG
+ // Check consistency between new and old means of checking whether a BDV is
+ // a base.
+ bool MustBeBase = isKnownBaseResult(BDV);
+ assert(!MustBeBase || MustBeBase == IsKnownBase);
+#endif
+ }
+};
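+
+// As a hypothetical example: for
+//   %d = getelementptr i8, i8 addrspace(1)* %p, i64 16
+// the BDV of %d is %p, and IsKnownBase is true when %p is e.g. a function
+// argument; for a gep off a phi of two such pointers, the BDV is the phi
+// itself and IsKnownBase is false.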
+
+} // end anonymous namespace
+
+static BaseDefiningValueResult findBaseDefiningValue(Value *I);
+
+/// Return a base defining value for the 'Index' element of the given vector
+/// instruction 'I'. If Index is null, returns a BDV for the entire vector
+/// 'I'. As an optimization, this method will try to determine when the
+/// element is known to already be a base pointer. If this can be established,
+/// the second value in the returned pair will be true. Note that either a
+/// vector or a pointer typed value can be returned. For the former, the
+/// vector returned is a BDV (and possibly a base) of the entire vector 'I'.
+/// If the latter, the returned pointer is a BDV (or possibly a base) for the
+/// particular element in 'I'.
+static BaseDefiningValueResult
+findBaseDefiningValueOfVector(Value *I) {
+ // Each case parallels findBaseDefiningValue below, see that code for
+ // detailed motivation.
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I))
+ // Base of constant vector consists only of constant null pointers.
+ // For reasoning see similar case inside 'findBaseDefiningValue' function.
+ return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()),
+ true);
+
+ if (isa<LoadInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<InsertElementInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+ return BaseDefiningValueResult(I, false);
+
+ if (isa<ShuffleVectorInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+ // TODO: There are a number of local optimizations which could be applied here
+ // for particular shufflevector patterns.
+ return BaseDefiningValueResult(I, false);
+
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ // If the pointer comes through a bitcast of a vector of pointers to
+ // a vector of another type of pointer, then look through the bitcast
+ if (auto *BC = dyn_cast<BitCastInst>(I))
+ return findBaseDefiningValue(BC->getOperand(0));
+
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // A PHI or Select is a base defining value. The outer findBasePointer
+ // algorithm is responsible for constructing a base value for this BDV.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "unknown vector instruction - no base found for vector element");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input, b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
+/// from pointer to vector type or back.
+static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
+ assert(I->getType()->isPtrOrPtrVectorTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ if (I->getType()->isVectorTy())
+ return findBaseDefiningValueOfVector(I);
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ // We should never have reached here if this argument isn't a gc value
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I)) {
+ // We assume that objects with a constant base (e.g. a global) can't move
+ // and don't need to be reported to the collector because they are always
+ // live. Besides global references, all kinds of constants (e.g. undef,
+ // constant expressions, null pointers) can be introduced by the inliner or
+ // the optimizer, especially on dynamically dead paths.
+ // Here we treat all of them as having a single null base. By doing this we
+ // try to avoid problems reporting various conflicts in the form of
+ // "phi (const1, const2)" or "phi (const, regular gc ptr)".
+ // See constant.ll file for relevant test cases.
+
+ return BaseDefiningValueResult(
+ ConstantPointerNull::get(cast<PointerType>(I->getType())), true);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Def = CI->stripPointerCasts();
+ // If stripping pointer casts changes the address space there is an
+ // addrspacecast in between.
+ assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
+ cast<PointerType>(CI->getType())->getAddressSpace() &&
+ "unsupported addrspacecast");
+ // If we find a cast instruction here, it means we've found a cast which is
+ // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+ // handle int->ptr conversion.
+ assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
+ return findBaseDefiningValue(Def);
+ }
+
+ if (isa<LoadInst>(I))
+ // The value loaded is a gc base itself
+ return BaseDefiningValueResult(I, true);
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+ // The base of this GEP is the base
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ // fall through to general call handling
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ llvm_unreachable("statepoints don't produce pointers");
+ case Intrinsic::experimental_gc_relocate:
+ // Rerunning safepoint insertion after safepoints are already
+ // inserted is not supported. It could probably be made to work,
+ // but why are you doing this? There's no good reason.
+ llvm_unreachable("repeat safepoint insertion is not supported");
+ case Intrinsic::gcroot:
+ // Currently, this mechanism hasn't been extended to work with gcroot.
+ // There's no reason it couldn't be, but I haven't thought about the
+ // implications much.
+ llvm_unreachable(
+ "interaction with the gcroot mechanism is not supported");
+ }
+ }
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // TODO: I have absolutely no idea how to implement this part yet. It's not
+ // necessarily hard, I just haven't really looked at it yet.
+ assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+ if (isa<AtomicCmpXchgInst>(I))
+ // A CAS is effectively an atomic store and load combined under a
+ // predicate. From the perspective of base pointers, we just treat it
+ // like a load.
+ return BaseDefiningValueResult(I, true);
+
+ assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
+ "binary ops which don't apply to pointers");
+
+ // The aggregate ops. Aggregates can either be in the heap or on the
+ // stack, but in either case, this is simply a field load. As a result,
+ // this defines the base just like a load does.
+ if (isa<ExtractValueInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // We should never see an insert vector since that would require we be
+ // tracing back a struct value not a pointer value.
+ assert(!isa<InsertValueInst>(I) &&
+ "Base pointer for a struct is meaningless");
+
+ // An extractelement produces a base result exactly when its input does.
+ // We may need to insert a parallel instruction to extract the appropriate
+ // element out of the base vector corresponding to the input. Given this,
+ // it's analogous to the phi and select case even though it's not a merge.
+ if (isa<ExtractElementInst>(I))
+ // Note: There are a lot of obvious peephole cases here. These are deliberately
+ // handled after the main base pointer inference algorithm to make writing
+ // test cases to exercise that code easier.
+ return BaseDefiningValueResult(I, false);
+
+ // The last two cases here don't return a base pointer. Instead, they
+ // return a value which dynamically selects from among several base
+ // derived pointers (each potentially with its own base). It's the job of
+ // the caller to resolve these.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "missing instruction case in findBaseDefiningValing");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
+ Value *&Cached = Cache[I];
+ if (!Cached) {
+ Cached = findBaseDefiningValue(I).BDV;
+ LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
+ }
+ assert(Cache[I] != nullptr);
+ return Cached;
+}
+
+/// Return a base pointer for this value if known. Otherwise, return its
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseDefiningValueCached(I, Cache);
+ auto Found = Cache.find(Def);
+ if (Found != Cache.end()) {
+ // Either a base-of relation, or a self reference. Caller must check.
+ return Found->second;
+ }
+ // Only a BDV available
+ return Def;
+}
+
+/// This value is a base pointer that is not generated by RS4GC, i.e. it already
+/// exists in the code.
+static bool isOriginalBaseResult(Value *V) {
+ // no recursion possible
+ return !isa<PHINode>(V) && !isa<SelectInst>(V) &&
+ !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
+ !isa<ShuffleVectorInst>(V);
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer? Or do we need to continue searching.
+static bool isKnownBaseResult(Value *V) {
+ if (isOriginalBaseResult(V))
+ return true;
+ if (isa<Instruction>(V) &&
+ cast<Instruction>(V)->getMetadata("is_base_value")) {
+ // This is a previously inserted base phi or select. We know
+ // that this is a base value.
+ return true;
+ }
+
+ // We need to keep searching
+ return false;
+}
+
+// Returns true if First and Second values are both scalar or both vector.
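+// (Used below when deciding whether a known base can be taken as-is: e.g. a
+// vector BDV paired with a scalar derived value must still get a lattice
+// entry even if the vector itself is already a known base.)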
+static bool areBothVectorOrScalar(Value *First, Value *Second) {
+ return isa<VectorType>(First->getType()) ==
+ isa<VectorType>(Second->getType());
+}
+
+namespace {
+
+/// Models the state of a single base defining value in the findBasePointer
+/// algorithm for determining where a new instruction is needed to propagate
+/// the base of this BDV.
+class BDVState {
+public:
+ enum Status { Unknown, Base, Conflict };
+
+ BDVState() : BaseValue(nullptr) {}
+
+ explicit BDVState(Status Status, Value *BaseValue = nullptr)
+ : Status(Status), BaseValue(BaseValue) {
+ assert(Status != Base || BaseValue);
+ }
+
+ explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {}
+
+ Status getStatus() const { return Status; }
+ Value *getBaseValue() const { return BaseValue; }
+
+ bool isBase() const { return getStatus() == Base; }
+ bool isUnknown() const { return getStatus() == Unknown; }
+ bool isConflict() const { return getStatus() == Conflict; }
+
+ bool operator==(const BDVState &Other) const {
+ return BaseValue == Other.BaseValue && Status == Other.Status;
+ }
+
+ bool operator!=(const BDVState &other) const { return !(*this == other); }
+
+ LLVM_DUMP_METHOD
+ void dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+ }
+
+ void print(raw_ostream &OS) const {
+ switch (getStatus()) {
+ case Unknown:
+ OS << "U";
+ break;
+ case Base:
+ OS << "B";
+ break;
+ case Conflict:
+ OS << "C";
+ break;
+ }
+ OS << " (" << getBaseValue() << " - "
+ << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
+ }
+
+private:
+ Status Status = Unknown;
+ AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
+};
+
+} // end anonymous namespace
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
+ State.print(OS);
+ return OS;
+}
+#endif
+
+static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
+ switch (LHS.getStatus()) {
+ case BDVState::Unknown:
+ return RHS;
+
+ case BDVState::Base:
+ assert(LHS.getBaseValue() && "can't be null");
+ if (RHS.isUnknown())
+ return LHS;
+
+ if (RHS.isBase()) {
+ if (LHS.getBaseValue() == RHS.getBaseValue()) {
+ assert(LHS == RHS && "equality broken!");
+ return LHS;
+ }
+ return BDVState(BDVState::Conflict);
+ }
+ assert(RHS.isConflict() && "only three states!");
+ return BDVState(BDVState::Conflict);
+
+ case BDVState::Conflict:
+ return LHS;
+ }
+ llvm_unreachable("only three states!");
+}
+
+// Values of type BDVState form a lattice, and this function implements the meet
+// operation.
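+// As a rough illustration: meet(Unknown, X) == X, meet(Base(b), Base(b)) ==
+// Base(b), meet(Base(b1), Base(b2)) == Conflict for b1 != b2, and Conflict
+// absorbs everything else.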
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
+ BDVState Result = meetBDVStateImpl(LHS, RHS);
+ assert(Result == meetBDVStateImpl(RHS, LHS) &&
+ "Math is wrong: meet does not commute!");
+ return Result;
+}
+
+/// For a given value or instruction, figure out what base ptr its derived from.
+/// For gc objects, this is simply itself. On success, returns a value which is
+/// the base pointer. (This is reliable and can be used for relocation.) On
+/// failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseOrBDV(I, Cache);
+
+ if (isKnownBaseResult(Def) && areBothVectorOrScalar(Def, I))
+ return Def;
+
+ // Here's the rough algorithm:
+ // - For every SSA value, construct a mapping to either an actual base
+ // pointer or a PHI which obscures the base pointer.
+ // - Construct a mapping from PHI to unknown TOP state. Use an
+ // optimistic algorithm to propagate base pointer information. Lattice
+ // looks like:
+ // UNKNOWN
+ // b1 b2 b3 b4
+ // CONFLICT
+ // When algorithm terminates, all PHIs will either have a single concrete
+ // base or be in a conflict state.
+ // - For every conflict, insert a dummy PHI node without arguments. Add
+ // these to the base[Instruction] = BasePtr mapping. For every
+ // non-conflict, add the actual base.
+ // - For every conflict, add arguments for the base[a] of each input
+ // arguments.
+ //
+ // Note: A simpler form of this would be to add the conflict form of all
+ // PHIs without running the optimistic algorithm. This would be
+ // analogous to pessimistic data flow and would likely lead to an
+ // overall worse solution.
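+ //
+ // As a small illustrative example: for
+ //   %merge = phi i8 addrspace(1)* [ %gep1, %left ], [ %gep2, %right ]
+ // where %gep1 and %gep2 derive from distinct bases %b1 and %b2, the lattice
+ // drives %merge to Conflict and we materialize a parallel
+ //   %merge.base = phi i8 addrspace(1)* [ %b1, %left ], [ %b2, %right ]
+ // tagged with !is_base_value metadata.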
+
+#ifndef NDEBUG
+ auto isExpectedBDVType = [](Value *BDV) {
+ return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
+ isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV) ||
+ isa<ShuffleVectorInst>(BDV);
+ };
+#endif
+
+ // Once populated, will contain a mapping from each potentially non-base BDV
+ // to a lattice value (described above) which corresponds to that BDV.
+ // We use the order of insertion (DFS over the def/use graph) to provide a
+ // stable deterministic ordering for visiting DenseMaps (which are unordered)
+ // below. This is important for deterministic compilation.
+ MapVector<Value *, BDVState> States;
+
+ // Recursively fill in all base defining values reachable from the initial
+ // one for which we don't already know a definite base value for
+ /* scope */ {
+ SmallVector<Value*, 16> Worklist;
+ Worklist.push_back(Def);
+ States.insert({Def, BDVState()});
+ while (!Worklist.empty()) {
+ Value *Current = Worklist.pop_back_val();
+ assert(!isOriginalBaseResult(Current) && "why did it get added?");
+
+ auto visitIncomingValue = [&](Value *InVal) {
+ Value *Base = findBaseOrBDV(InVal, Cache);
+ if (isKnownBaseResult(Base) && areBothVectorOrScalar(Base, InVal))
+ // Known bases won't need new instructions introduced and can be
+ // ignored safely. However, this can only be done when InVal and Base
+ // are both scalar or both vector. Otherwise, we need to find a
+ // correct BDV for InVal, by creating an entry in the lattice
+ // (States).
+ return;
+ assert(isExpectedBDVType(Base) && "the only non-base values "
+ "we see should be base defining values");
+ if (States.insert(std::make_pair(Base, BDVState())).second)
+ Worklist.push_back(Base);
+ };
+ if (PHINode *PN = dyn_cast<PHINode>(Current)) {
+ for (Value *InVal : PN->incoming_values())
+ visitIncomingValue(InVal);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) {
+ visitIncomingValue(SI->getTrueValue());
+ visitIncomingValue(SI->getFalseValue());
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
+ visitIncomingValue(EE->getVectorOperand());
+ } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
+ visitIncomingValue(IE->getOperand(0)); // vector operand
+ visitIncomingValue(IE->getOperand(1)); // scalar operand
+ } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Current)) {
+ visitIncomingValue(SV->getOperand(0));
+ visitIncomingValue(SV->getOperand(1));
+ } else {
+ llvm_unreachable("Unimplemented instruction case");
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "States after initialization:\n");
+ for (auto Pair : States) {
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Return a phi state for a base defining value. We'll generate a new
+ // base state for known bases and expect to find a cached state otherwise.
+ auto GetStateForBDV = [&](Value *BaseValue, Value *Input) {
+ if (isKnownBaseResult(BaseValue) && areBothVectorOrScalar(BaseValue, Input))
+ return BDVState(BaseValue);
+ auto I = States.find(BaseValue);
+ assert(I != States.end() && "lookup failed!");
+ return I->second;
+ };
+
+ bool Progress = true;
+ while (Progress) {
+#ifndef NDEBUG
+ const size_t OldSize = States.size();
+#endif
+ Progress = false;
+ // We're only changing values in this loop, thus safe to keep iterators.
+ // Since this is computing a fixed point, the order of visit does not
+ // effect the result. TODO: We could use a worklist here and make this run
+ // much faster.
+ for (auto Pair : States) {
+ Value *BDV = Pair.first;
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) ||
+ !areBothVectorOrScalar(BDV, Pair.second.getBaseValue())) &&
+ "why did it get added?");
+
+ // Given an input value for the current instruction, return a BDVState
+ // instance which represents the BDV of that value.
+ auto getStateForInput = [&](Value *V) mutable {
+ Value *BDV = findBaseOrBDV(V, Cache);
+ return GetStateForBDV(BDV, V);
+ };
+
+ BDVState NewState;
+ if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) {
+ NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue()));
+ NewState =
+ meetBDVState(NewState, getStateForInput(SI->getFalseValue()));
+ } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) {
+ for (Value *Val : PN->incoming_values())
+ NewState = meetBDVState(NewState, getStateForInput(Val));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
+ // The 'meet' for an extractelement is slightly trivial, but it's still
+ // useful in that it drives us to conflict if our input is.
+ NewState =
+ meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
+ } else if (auto *IE = dyn_cast<InsertElementInst>(BDV)) {
+ // Given there's an inherent type mismatch between the operands, this will
+ // *always* produce Conflict.
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
+ } else {
+ // The only instance this does not return a Conflict is when both the
+ // vector operands are the same vector.
+ auto *SV = cast<ShuffleVectorInst>(BDV);
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(1)));
+ }
+
+ BDVState OldState = States[BDV];
+ if (OldState != NewState) {
+ Progress = true;
+ States[BDV] = NewState;
+ }
+ }
+
+ assert(OldSize == States.size() &&
+ "fixed point shouldn't be adding any new nodes to state");
+ }
+
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
+ for (auto Pair : States) {
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Handle all instructions that have a vector BDV, but the instruction itself
+ // is of scalar type.
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ auto *BaseValue = State.getBaseValue();
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, BaseValue)) &&
+ "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+ if (!State.isBase() || !isa<VectorType>(BaseValue->getType()))
+ continue;
+ // extractelement instructions are a bit special in that we may need to
+ // insert an extract even when we know an exact base for the instruction.
+ // The problem is that we need to convert from a vector base to a scalar
+ // base for the particular index we're interested in.
+ if (isa<ExtractElementInst>(I)) {
+ auto *EE = cast<ExtractElementInst>(I);
+ // TODO: In many cases, the new instruction is just EE itself. We should
+ // exploit this, but can't do it here since it would break the invariant
+ // about the BDV not being known to be a base.
+ auto *BaseInst = ExtractElementInst::Create(
+ State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Base, BaseInst);
+ } else if (!isa<VectorType>(I->getType())) {
+ // We need to handle cases that have a vector base but the instruction is
+ // of scalar type (these could be phis or selects or any instruction of
+ // scalar type whose base can be a vector type). We
+ // conservatively set this as conflict. Setting the base value for these
+ // conflicts is handled in the next loop which traverses States.
+ States[I] = BDVState(BDVState::Conflict);
+ }
+ }
+
+ // Insert Phis for all conflicts
+ // TODO: adjust naming patterns to avoid this order of iteration dependency
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, State.getBaseValue())) &&
+ "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+ // Since we're joining a vector and scalar base, they can never be the
+ // same. As a result, we should always see insert element having reached
+ // the conflict state.
+ assert(!isa<InsertElementInst>(I) || State.isConflict());
+
+ if (!State.isConflict())
+ continue;
+
+ /// Create and insert a new instruction which will represent the base of
+ /// the given instruction 'I'.
+ auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
+ if (isa<PHINode>(I)) {
+ BasicBlock *BB = I->getParent();
+ int NumPreds = pred_size(BB);
+ assert(NumPreds > 0 && "how did we reach here");
+ std::string Name = suffixed_name_or(I, ".base", "base_phi");
+ return PHINode::Create(I->getType(), NumPreds, Name, I);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // The undef will be replaced later
+ UndefValue *Undef = UndefValue::get(SI->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_select");
+ return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI);
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ee");
+ return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
+ EE);
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
+ UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ie");
+ return InsertElementInst::Create(VecUndef, ScalarUndef,
+ IE->getOperand(2), Name, IE);
+ } else {
+ auto *SV = cast<ShuffleVectorInst>(I);
+ UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_sv");
+ return new ShuffleVectorInst(VecUndef, VecUndef, SV->getShuffleMask(),
+ Name, SV);
+ }
+ };
+ Instruction *BaseInst = MakeBaseInstPlaceholder(I);
+ // Add metadata marking this as a base value
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Conflict, BaseInst);
+ }
+
+ // Returns an instruction which produces the base pointer for a given
+ // instruction. The instruction is assumed to be an input to one of the BDVs
+ // seen in the inference algorithm above. As such, we must either already
+ // know its base defining value is a base, or have inserted a new
+ // instruction to propagate the base of its BDV and have entered that newly
+ // introduced instruction into the state table. In either case, we are
+ // assured to be able to determine an instruction which produces its base
+ // pointer.
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
+ Value *BDV = findBaseOrBDV(Input, Cache);
+ Value *Base = nullptr;
+ if (isKnownBaseResult(BDV) && areBothVectorOrScalar(BDV, Input)) {
+ Base = BDV;
+ } else {
+ // Either conflict or base.
+ assert(States.count(BDV));
+ Base = States[BDV].getBaseValue();
+ }
+ assert(Base && "Can't be null");
+ // The cast is needed since base traversal may strip away bitcasts
+ if (Base->getType() != Input->getType() && InsertPt)
+ Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
+ return Base;
+ };
+
+ // Fixup all the inputs of the new PHIs. Visit order needs to be
+ // deterministic and predictable because we're naming newly created
+ // instructions.
+ for (auto Pair : States) {
+ Instruction *BDV = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) ||
+ !areBothVectorOrScalar(BDV, State.getBaseValue())) &&
+ "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (!State.isConflict())
+ continue;
+
+ if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) {
+ PHINode *PN = cast<PHINode>(BDV);
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ for (unsigned i = 0; i < NumPHIValues; i++) {
+ Value *InVal = PN->getIncomingValue(i);
+ BasicBlock *InBB = PN->getIncomingBlock(i);
+
+ // If we've already seen InBB, add the same incoming value
+ // we added for it earlier. The IR verifier requires phi
+ // nodes with multiple entries from the same basic block
+ // to have the same incoming value for each of those
+ // entries. If we don't do this check here and basephi
+ // has a different type than base, we'll end up adding two
+ // bitcasts (and hence two distinct values) as incoming
+ // values for the same basic block.
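+ // (E.g. a switch with two cases branching to the same successor yields a
+ // phi with two entries for that block; the base phi must then reuse the
+ // exact same incoming value for both entries.)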
+
+ int BlockIndex = BasePHI->getBasicBlockIndex(InBB);
+ if (BlockIndex != -1) {
+ Value *OldBase = BasePHI->getIncomingValue(BlockIndex);
+ BasePHI->addIncoming(OldBase, InBB);
+
+#ifndef NDEBUG
+ Value *Base = getBaseForInput(InVal, nullptr);
+ // In essence this assert states: the only way two values
+ // incoming from the same basic block may be different is by
+ // being different bitcasts of the same value. A cleanup
+ // that remains TODO is changing findBaseOrBDV to return an
+ // llvm::Value of the correct type (and still remain pure).
+ // This will remove the need to add bitcasts.
+ assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
+ "Sanity -- findBaseOrBDV should be pure!");
+#endif
+ continue;
+ }
+
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast in the incoming block.
+ // TODO: Need to split critical edges if insertion is needed
+ Value *Base = getBaseForInput(InVal, InBB->getTerminator());
+ BasePHI->addIncoming(Base, InBB);
+ }
+ assert(BasePHI->getNumIncomingValues() == NumPHIValues);
+ } else if (SelectInst *BaseSI =
+ dyn_cast<SelectInst>(State.getBaseValue())) {
+ SelectInst *SI = cast<SelectInst>(BDV);
+
+ // Find the instruction which produces the base for each input.
+ // We may need to insert a bitcast.
+ BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI));
+ BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI));
+ } else if (auto *BaseEE =
+ dyn_cast<ExtractElementInst>(State.getBaseValue())) {
+ Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast.
+ BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
+ } else if (auto *BaseIE = dyn_cast<InsertElementInst>(State.getBaseValue())) {
+ auto *BdvIE = cast<InsertElementInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvIE->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseIE);
+ BaseIE->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // scalar operand
+ } else {
+ auto *BaseSV = cast<ShuffleVectorInst>(State.getBaseValue());
+ auto *BdvSV = cast<ShuffleVectorInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvSV->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseSV);
+ BaseSV->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // vector operand
+ }
+ }
+
+ // Cache all of our results so we can cheaply reuse them
+ // NOTE: This is actually two caches: one of the base defining value
+ // relation and one of the base pointer relation! FIXME
+ for (auto Pair : States) {
+ auto *BDV = Pair.first;
+ Value *Base = Pair.second.getBaseValue();
+ assert(BDV && Base);
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) || !areBothVectorOrScalar(BDV, Base)) &&
+ "why did it get added?");
+
+ LLVM_DEBUG(
+ dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
+
+ if (Cache.count(BDV)) {
+ assert(isKnownBaseResult(Base) &&
+ "must be something we 'know' is a base pointer");
+ // Once we transition from the BDV relation being stored in the Cache to
+ // the base relation being stored, it must be stable
+ assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) &&
+ "base relation should be stable");
+ }
+ Cache[BDV] = Base;
+ }
+ assert(Cache.count(Def));
+ return Cache[Def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from. This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'. This ensures that any use of 'derived' can
+// also use 'base'. This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live. Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void
+findBasePointers(const StatepointLiveSetTy &live,
+ MapVector<Value *, Value *> &PointerToBase,
+ DominatorTree *DT, DefiningValueMapTy &DVCache) {
+ for (Value *ptr : live) {
+ Value *base = findBasePointer(ptr, DVCache);
+ assert(base && "failed to find base pointer");
+ PointerToBase[ptr] = base;
+ assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+ DT->dominates(cast<Instruction>(base)->getParent(),
+ cast<Instruction>(ptr)->getParent())) &&
+ "The base we found better dominate the derived pointer");
+ }
+}
+
+/// Find the required base pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+ CallBase *Call,
+ PartiallyConstructedSafepointRecord &result) {
+ MapVector<Value *, Value *> PointerToBase;
+ findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
+
+ if (PrintBasePointers) {
+ errs() << "Base Pairs (w/o Relocation):\n";
+ for (auto &Pair : PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+
+ result.PointerToBase = PointerToBase;
+}
+
+/// Given an updated version of the dataflow liveness results, update the
+/// liveset and base pointer maps for the call site CS.
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallBase *Call,
+ PartiallyConstructedSafepointRecord &result);
+
+static void recomputeLiveInValues(
+ Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ // TODO-PERF: reuse the original liveness, then simply run the dataflow
+ // again. The old values are still live and will help it stabilize quickly.
+ GCPtrLivenessData RevisedLivenessData;
+ computeLiveInValues(DT, F, RevisedLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info);
+ }
+}
+
+// When inserting gc.relocate and gc.result calls, we need to ensure there are
+// no uses of the original value / return value between the gc.statepoint and
+// the gc.relocate / gc.result call. One case which can arise is a phi node at
+// the start of one of the successor blocks. We also need to be able to insert the
+// gc.relocates only on the path which goes through the statepoint. We might
+// need to split an edge to make this possible.
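+// For example (sketch): if the invoke's normal destination has other
+// predecessors as well, SplitBlockPredecessors gives the invoke a dedicated
+// successor block, and the resulting single-entry phis are folded away so a
+// gc.result / gc.relocate can safely become the first instruction there.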
+static BasicBlock *
+normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
+ DominatorTree &DT) {
+ BasicBlock *Ret = BB;
+ if (!BB->getUniquePredecessor())
+ Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);
+
+ // Now that 'Ret' has unique predecessor we can safely remove all phi nodes
+ // from it
+ FoldSingleEntryPHINodes(Ret);
+ assert(!isa<PHINode>(Ret->begin()) &&
+ "All PHI nodes should have been removed!");
+
+ // At this point, we can safely insert a gc.relocate or gc.result as the first
+ // instruction in Ret if needed.
+ return Ret;
+}
+
+// Create new attribute set containing only attributes which can be transferred
+// from original call to the safepoint.
+static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
+ AttributeList AL) {
+ if (AL.isEmpty())
+ return AL;
+
+ // Remove the readonly, readnone, and statepoint function attributes.
+ AttrBuilder FnAttrs = AL.getFnAttributes();
+ FnAttrs.removeAttribute(Attribute::ReadNone);
+ FnAttrs.removeAttribute(Attribute::ReadOnly);
+ for (Attribute A : AL.getFnAttributes()) {
+ if (isStatepointDirectiveAttr(A))
+ FnAttrs.remove(A);
+ }
+
+ // Just skip parameter and return attributes for now
+ return AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ AttributeSet::get(Ctx, FnAttrs));
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+/// liveVariables - list of variables to be relocated.
+/// basePtrs - base pointers.
+/// statepointToken - statepoint instruction to which relocates should be
+/// bound.
+/// Builder - LLVM IR builder to be used to construct new calls.
+static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
+ ArrayRef<Value *> BasePtrs,
+ Instruction *StatepointToken,
+ IRBuilder<> &Builder) {
+ if (LiveVariables.empty())
+ return;
+
+ auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
+ auto ValIt = llvm::find(LiveVec, Val);
+ assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
+ size_t Index = std::distance(LiveVec.begin(), ValIt);
+ assert(Index < LiveVec.size() && "Bug in std::find?");
+ return Index;
+ };
+ Module *M = StatepointToken->getModule();
+
+ // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose
+ // element type is i8 addrspace(1)*). We originally generated unique
+ // declarations for each pointer type, but this proved problematic because
+ // the intrinsic mangling code is incomplete and fragile. Since we're moving
+ // towards a single unified pointer type anyways, we can just cast everything
+ // to an i8* of the right address space. A bitcast is added later to convert
+ // gc_relocate to the actual value's type.
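+ // Roughly, each live value V ends up with a relocation shaped like
+ //   %V.relocated = call coldcc i8 addrspace(1)*
+ //       @llvm.experimental.gc.relocate.p1i8(token %tok, i32 base, i32 derived)
+ // (illustrative only; the suffix depends on the address space and on whether
+ // the live value is a vector).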
+ auto getGCRelocateDecl = [&] (Type *Ty) {
+ assert(isHandledGCPointerType(Ty));
+ auto AS = Ty->getScalarType()->getPointerAddressSpace();
+ Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
+ if (auto *VT = dyn_cast<VectorType>(Ty))
+ NewTy = FixedVectorType::get(NewTy,
+ cast<FixedVectorType>(VT)->getNumElements());
+ return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
+ {NewTy});
+ };
+
+ // Lazily populated map from input types to the canonicalized form mentioned
+ // in the comment above. This should probably be cached somewhere more
+ // broadly.
+ DenseMap<Type *, Function *> TypeToDeclMap;
+
+ for (unsigned i = 0; i < LiveVariables.size(); i++) {
+ // Generate the gc.relocate call and save the result
+ Value *BaseIdx = Builder.getInt32(FindIndex(LiveVariables, BasePtrs[i]));
+ Value *LiveIdx = Builder.getInt32(i);
+
+ Type *Ty = LiveVariables[i]->getType();
+ if (!TypeToDeclMap.count(Ty))
+ TypeToDeclMap[Ty] = getGCRelocateDecl(Ty);
+ Function *GCRelocateDecl = TypeToDeclMap[Ty];
+
+ // only specify a debug name if we can give a useful one
+ CallInst *Reloc = Builder.CreateCall(
+ GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
+ suffixed_name_or(LiveVariables[i], ".relocated", ""));
+ // Trick CodeGen into thinking there are lots of free registers at this
+ // fake call.
+ Reloc->setCallingConv(CallingConv::Cold);
+ }
+}
+
+namespace {
+
+/// This struct is used to defer RAUWs and `eraseFromParent`s. Using this
+/// avoids having to worry about keeping around dangling pointers to Values.
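+/// A typical pattern in this file (sketch): collect entries while rewriting,
+/// e.g. via DeferredReplacement::createRAUW(Call, GCResult), and only perform
+/// doReplacement() on each of them once the live sets have been made explicit
+/// in the IR and no raw pointers to the old instructions remain.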
+class DeferredReplacement {
+ AssertingVH<Instruction> Old;
+ AssertingVH<Instruction> New;
+ bool IsDeoptimize = false;
+
+ DeferredReplacement() = default;
+
+public:
+ static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
+ assert(Old != New && Old && New &&
+ "Cannot RAUW equal values or to / from null!");
+
+ DeferredReplacement D;
+ D.Old = Old;
+ D.New = New;
+ return D;
+ }
+
+ static DeferredReplacement createDelete(Instruction *ToErase) {
+ DeferredReplacement D;
+ D.Old = ToErase;
+ return D;
+ }
+
+ static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) {
+#ifndef NDEBUG
+ auto *F = cast<CallInst>(Old)->getCalledFunction();
+ assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize &&
+ "Only way to construct a deoptimize deferred replacement");
+#endif
+ DeferredReplacement D;
+ D.Old = Old;
+ D.IsDeoptimize = true;
+ return D;
+ }
+
+ /// Does the task represented by this instance.
+ void doReplacement() {
+ Instruction *OldI = Old;
+ Instruction *NewI = New;
+
+ assert(OldI != NewI && "Disallowed at construction?!");
+ assert((!IsDeoptimize || !New) &&
+ "Deoptimize intrinsics are not replaced!");
+
+ Old = nullptr;
+ New = nullptr;
+
+ if (NewI)
+ OldI->replaceAllUsesWith(NewI);
+
+ if (IsDeoptimize) {
+ // Note: we've inserted instructions, so the call to llvm.deoptimize may
+ // not necessarily be followed by the matching return.
+ auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
+ new UnreachableInst(RI->getContext(), RI);
+ RI->eraseFromParent();
+ }
+
+ OldI->eraseFromParent();
+ }
+};
+
+} // end anonymous namespace
+
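+// Reads the "deopt-lowering" attribute from the call site or the callee; only
+// the values "live-in" and "live-through" are expected (see the check in
+// makeStatepointExplicitImpl below), with "live-through" as the default.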
+static StringRef getDeoptLowering(CallBase *Call) {
+ const char *DeoptLowering = "deopt-lowering";
+ if (Call->hasFnAttr(DeoptLowering)) {
+ // FIXME: Calls have a *really* confusing interface around attributes
+ // with values.
+ const AttributeList &CSAS = Call->getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
+ Function *F = Call->getCalledFunction();
+ assert(F && F->hasFnAttribute(DeoptLowering));
+ return F->getFnAttribute(DeoptLowering).getValueAsString();
+ }
+ return "live-through";
+}
+
+static void
+makeStatepointExplicitImpl(CallBase *Call, /* to replace */
+ const SmallVectorImpl<Value *> &BasePtrs,
+ const SmallVectorImpl<Value *> &LiveVariables,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ assert(BasePtrs.size() == LiveVariables.size());
+
+ // Then go ahead and use the builder to actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
+ IRBuilder<> Builder(Call);
+
+ ArrayRef<Value *> GCArgs(LiveVariables);
+ uint64_t StatepointID = StatepointDirectives::DefaultStatepointID;
+ uint32_t NumPatchBytes = 0;
+ uint32_t Flags = uint32_t(StatepointFlags::None);
+
SmallVector<Value *, 8> CallArgs(Call->args());
- Optional<ArrayRef<Use>> DeoptArgs;
- if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_deopt))
- DeoptArgs = Bundle->Inputs;
- Optional<ArrayRef<Use>> TransitionArgs;
- if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
- TransitionArgs = Bundle->Inputs;
- // TODO: This flag no longer serves a purpose and can be removed later
- Flags |= uint32_t(StatepointFlags::GCTransition);
- }
-
- // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
- // with a return value, we lower them as never returning calls to
- // __llvm_deoptimize that are followed by unreachable to get better codegen.
- bool IsDeoptimize = false;
-
- StatepointDirectives SD =
- parseStatepointDirectivesFromAttrs(Call->getAttributes());
- if (SD.NumPatchBytes)
- NumPatchBytes = *SD.NumPatchBytes;
- if (SD.StatepointID)
- StatepointID = *SD.StatepointID;
-
- // Pass through the requested lowering if any. The default is live-through.
- StringRef DeoptLowering = getDeoptLowering(Call);
- if (DeoptLowering.equals("live-in"))
- Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
- else {
- assert(DeoptLowering.equals("live-through") && "Unsupported value!");
- }
-
- Value *CallTarget = Call->getCalledOperand();
- if (Function *F = dyn_cast<Function>(CallTarget)) {
+ Optional<ArrayRef<Use>> DeoptArgs;
+ if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_deopt))
+ DeoptArgs = Bundle->Inputs;
+ Optional<ArrayRef<Use>> TransitionArgs;
+ if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
+ TransitionArgs = Bundle->Inputs;
+ // TODO: This flag no longer serves a purpose and can be removed later
+ Flags |= uint32_t(StatepointFlags::GCTransition);
+ }
+
+ // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
+ // with a return value, we lower them as never returning calls to
+ // __llvm_deoptimize that are followed by unreachable to get better codegen.
+ bool IsDeoptimize = false;
+
+ StatepointDirectives SD =
+ parseStatepointDirectivesFromAttrs(Call->getAttributes());
+ if (SD.NumPatchBytes)
+ NumPatchBytes = *SD.NumPatchBytes;
+ if (SD.StatepointID)
+ StatepointID = *SD.StatepointID;
+
+ // Pass through the requested lowering if any. The default is live-through.
+ StringRef DeoptLowering = getDeoptLowering(Call);
+ if (DeoptLowering.equals("live-in"))
+ Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
+ else {
+ assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+ }
+
+ Value *CallTarget = Call->getCalledOperand();
+ if (Function *F = dyn_cast<Function>(CallTarget)) {
auto IID = F->getIntrinsicID();
if (IID == Intrinsic::experimental_deoptimize) {
- // Calls to llvm.experimental.deoptimize are lowered to calls to the
- // __llvm_deoptimize symbol. We want to resolve this now, since the
- // verifier does not allow taking the address of an intrinsic function.
-
- SmallVector<Type *, 8> DomainTy;
- for (Value *Arg : CallArgs)
- DomainTy.push_back(Arg->getType());
- auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
- /* isVarArg = */ false);
-
- // Note: CallTarget can be a bitcast instruction of a symbol if there are
- // calls to @llvm.experimental.deoptimize with different argument types in
- // the same module. This is fine -- we assume the frontend knew what it
- // was doing when generating this kind of IR.
- CallTarget = F->getParent()
- ->getOrInsertFunction("__llvm_deoptimize", FTy)
- .getCallee();
-
- IsDeoptimize = true;
+ // Calls to llvm.experimental.deoptimize are lowered to calls to the
+ // __llvm_deoptimize symbol. We want to resolve this now, since the
+ // verifier does not allow taking the address of an intrinsic function.
+
+ SmallVector<Type *, 8> DomainTy;
+ for (Value *Arg : CallArgs)
+ DomainTy.push_back(Arg->getType());
+ auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
+ /* isVarArg = */ false);
+
+ // Note: CallTarget can be a bitcast instruction of a symbol if there are
+ // calls to @llvm.experimental.deoptimize with different argument types in
+ // the same module. This is fine -- we assume the frontend knew what it
+ // was doing when generating this kind of IR.
+ CallTarget = F->getParent()
+ ->getOrInsertFunction("__llvm_deoptimize", FTy)
+ .getCallee();
+
+ IsDeoptimize = true;
} else if (IID == Intrinsic::memcpy_element_unordered_atomic ||
IID == Intrinsic::memmove_element_unordered_atomic) {
// Unordered atomic memcpy and memmove intrinsics which are not explicitly
@@ -1636,1045 +1636,1045 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
F->getParent()
->getOrInsertFunction(GetFunctionName(IID, ElementSizeCI), FTy)
.getCallee();
- }
- }
-
- // Create the statepoint given all the arguments
- GCStatepointInst *Token = nullptr;
- if (auto *CI = dyn_cast<CallInst>(Call)) {
- CallInst *SPCall = Builder.CreateGCStatepointCall(
- StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
- TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
-
- SPCall->setTailCallKind(CI->getTailCallKind());
- SPCall->setCallingConv(CI->getCallingConv());
-
- // Currently we will fail on parameter attributes and on certain
- // function attributes. If we can handle this set of attributes, set up
- // function attrs directly on the statepoint and return attrs later for the
- // gc_result intrinsic.
- SPCall->setAttributes(
- legalizeCallAttributes(CI->getContext(), CI->getAttributes()));
-
- Token = cast<GCStatepointInst>(SPCall);
-
- // Put the following gc_result and gc_relocate calls immediately after
- // the old call (which we're about to delete).
- assert(CI->getNextNode() && "Not a terminator, must have next!");
- Builder.SetInsertPoint(CI->getNextNode());
- Builder.SetCurrentDebugLocation(CI->getNextNode()->getDebugLoc());
- } else {
- auto *II = cast<InvokeInst>(Call);
-
- // Insert the new invoke into the old block. We'll remove the old one in a
- // moment at which point this will become the new terminator for the
- // original block.
- InvokeInst *SPInvoke = Builder.CreateGCStatepointInvoke(
- StatepointID, NumPatchBytes, CallTarget, II->getNormalDest(),
- II->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs,
- "statepoint_token");
-
- SPInvoke->setCallingConv(II->getCallingConv());
-
- // Currently we will fail on parameter attributes and on certain
- // function attributes. If we can handle this set of attributes, set up
- // function attrs directly on the statepoint and return attrs later for the
- // gc_result intrinsic.
- SPInvoke->setAttributes(
- legalizeCallAttributes(II->getContext(), II->getAttributes()));
-
- Token = cast<GCStatepointInst>(SPInvoke);
-
- // Generate gc relocates in exceptional path
- BasicBlock *UnwindBlock = II->getUnwindDest();
- assert(!isa<PHINode>(UnwindBlock->begin()) &&
- UnwindBlock->getUniquePredecessor() &&
- "can't safely insert in this block!");
-
- Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
- Builder.SetCurrentDebugLocation(II->getDebugLoc());
-
- // Attach exceptional gc relocates to the landingpad.
- Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
- Result.UnwindToken = ExceptionalToken;
-
- CreateGCRelocates(LiveVariables, BasePtrs, ExceptionalToken, Builder);
-
- // Generate gc relocates and returns for normal block
- BasicBlock *NormalDest = II->getNormalDest();
- assert(!isa<PHINode>(NormalDest->begin()) &&
- NormalDest->getUniquePredecessor() &&
- "can't safely insert in this block!");
-
- Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
-
- // gc relocates will be generated later as if it were regular call
- // statepoint
- }
- assert(Token && "Should be set in one of the above branches!");
-
- if (IsDeoptimize) {
- // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we
- // transform the tail-call like structure to a call to a void function
- // followed by unreachable to get better codegen.
- Replacements.push_back(
- DeferredReplacement::createDeoptimizeReplacement(Call));
- } else {
- Token->setName("statepoint_token");
- if (!Call->getType()->isVoidTy() && !Call->use_empty()) {
- StringRef Name = Call->hasName() ? Call->getName() : "";
- CallInst *GCResult = Builder.CreateGCResult(Token, Call->getType(), Name);
- GCResult->setAttributes(
- AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
- Call->getAttributes().getRetAttributes()));
-
- // We cannot RAUW or delete CS.getInstruction() because it could be in the
- // live set of some other safepoint, in which case that safepoint's
- // PartiallyConstructedSafepointRecord will hold a raw pointer to this
- // llvm::Instruction. Instead, we defer the replacement and deletion to
- // after the live sets have been made explicit in the IR, and we no longer
- // have raw pointers to worry about.
- Replacements.emplace_back(
- DeferredReplacement::createRAUW(Call, GCResult));
- } else {
- Replacements.emplace_back(DeferredReplacement::createDelete(Call));
- }
- }
-
- Result.StatepointToken = Token;
-
- // Second, create a gc.relocate for every live variable
- CreateGCRelocates(LiveVariables, BasePtrs, Token, Builder);
-}
-
-// Replace an existing gc.statepoint with a new one and a set of gc.relocates
-// which make the relocations happening at this safepoint explicit.
-//
-// WARNING: Does not do any fixup to adjust users of the original live
-// values. That's the caller's responsibility.
-static void
-makeStatepointExplicit(DominatorTree &DT, CallBase *Call,
- PartiallyConstructedSafepointRecord &Result,
- std::vector<DeferredReplacement> &Replacements) {
- const auto &LiveSet = Result.LiveSet;
- const auto &PointerToBase = Result.PointerToBase;
-
- // Convert to vector for efficient cross referencing.
- SmallVector<Value *, 64> BaseVec, LiveVec;
- LiveVec.reserve(LiveSet.size());
- BaseVec.reserve(LiveSet.size());
- for (Value *L : LiveSet) {
- LiveVec.push_back(L);
- assert(PointerToBase.count(L));
- Value *Base = PointerToBase.find(L)->second;
- BaseVec.push_back(Base);
- }
- assert(LiveVec.size() == BaseVec.size());
-
- // Do the actual rewriting and delete the old statepoint
- makeStatepointExplicitImpl(Call, BaseVec, LiveVec, Result, Replacements);
-}
-
-// Helper function for the relocationViaAlloca.
-//
-// It receives an iterator to the statepoint gc relocates and emits a store to
-// the assigned location (via allocaMap) for each one of them. It adds the
-// visited values into the visitedLiveValues set, which we will later use
-// for sanity checking.
-static void
-insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
- DenseMap<Value *, AllocaInst *> &AllocaMap,
- DenseSet<Value *> &VisitedLiveValues) {
- for (User *U : GCRelocs) {
- GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
- if (!Relocate)
- continue;
-
- Value *OriginalValue = Relocate->getDerivedPtr();
- assert(AllocaMap.count(OriginalValue));
- Value *Alloca = AllocaMap[OriginalValue];
-
- // Emit store into the related alloca
- // All gc_relocates are i8 addrspace(1)* typed, so they must be bitcast to
- // the correct type according to the alloca.
- assert(Relocate->getNextNode() &&
- "Should always have one since it's not a terminator");
- IRBuilder<> Builder(Relocate->getNextNode());
- Value *CastedRelocatedValue =
- Builder.CreateBitCast(Relocate,
- cast<AllocaInst>(Alloca)->getAllocatedType(),
- suffixed_name_or(Relocate, ".casted", ""));
-
- new StoreInst(CastedRelocatedValue, Alloca,
- cast<Instruction>(CastedRelocatedValue)->getNextNode());
-
-#ifndef NDEBUG
- VisitedLiveValues.insert(OriginalValue);
-#endif
- }
-}
-
-// Helper function for the "relocationViaAlloca". Similar to the
-// "insertRelocationStores" but works for rematerialized values.
-static void insertRematerializationStores(
- const RematerializedValueMapTy &RematerializedValues,
- DenseMap<Value *, AllocaInst *> &AllocaMap,
- DenseSet<Value *> &VisitedLiveValues) {
- for (auto RematerializedValuePair: RematerializedValues) {
- Instruction *RematerializedValue = RematerializedValuePair.first;
- Value *OriginalValue = RematerializedValuePair.second;
-
- assert(AllocaMap.count(OriginalValue) &&
- "Can not find alloca for rematerialized value");
- Value *Alloca = AllocaMap[OriginalValue];
-
- new StoreInst(RematerializedValue, Alloca,
- RematerializedValue->getNextNode());
-
-#ifndef NDEBUG
- VisitedLiveValues.insert(OriginalValue);
-#endif
- }
-}
-
-/// Do all the relocation update via allocas and mem2reg
-static void relocationViaAlloca(
- Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
- ArrayRef<PartiallyConstructedSafepointRecord> Records) {
-#ifndef NDEBUG
- // record initial number of (static) allocas; we'll check we have the same
- // number when we get done.
- int InitialAllocaNum = 0;
- for (Instruction &I : F.getEntryBlock())
- if (isa<AllocaInst>(I))
- InitialAllocaNum++;
-#endif
-
- // TODO-PERF: change data structures, reserve
- DenseMap<Value *, AllocaInst *> AllocaMap;
- SmallVector<AllocaInst *, 200> PromotableAllocas;
- // Used later to check that we have enough allocas to store all values
- std::size_t NumRematerializedValues = 0;
- PromotableAllocas.reserve(Live.size());
-
- // Emit alloca for "LiveValue" and record it in "allocaMap" and
- // "PromotableAllocas"
- const DataLayout &DL = F.getParent()->getDataLayout();
- auto emitAllocaFor = [&](Value *LiveValue) {
- AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
- DL.getAllocaAddrSpace(), "",
- F.getEntryBlock().getFirstNonPHI());
- AllocaMap[LiveValue] = Alloca;
- PromotableAllocas.push_back(Alloca);
- };
-
- // Emit alloca for each live gc pointer
- for (Value *V : Live)
- emitAllocaFor(V);
-
- // Emit allocas for rematerialized values
- for (const auto &Info : Records)
- for (auto RematerializedValuePair : Info.RematerializedValues) {
- Value *OriginalValue = RematerializedValuePair.second;
- if (AllocaMap.count(OriginalValue) != 0)
- continue;
-
- emitAllocaFor(OriginalValue);
- ++NumRematerializedValues;
- }
-
- // The next two loops are part of the same conceptual operation. We need to
- // insert a store to the alloca after the original def and at each
- // redefinition. We need to insert a load before each use. These are split
- // into distinct loops for performance reasons.
-
- // Update gc pointer after each statepoint: either store a relocated value or
- // null (if no relocated value was found for this gc pointer and it is not a
- // gc_result). This must happen before we update the statepoint with load of
- // alloca otherwise we lose the link between statepoint and old def.
- for (const auto &Info : Records) {
- Value *Statepoint = Info.StatepointToken;
-
- // This will be used for consistency check
- DenseSet<Value *> VisitedLiveValues;
-
- // Insert stores for normal statepoint gc relocates
- insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
-
- // If it was an invoke statepoint, we will insert stores for the
- // exceptional path gc relocates.
- if (isa<InvokeInst>(Statepoint)) {
- insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
- VisitedLiveValues);
- }
-
- // Do similar thing with rematerialized values
- insertRematerializationStores(Info.RematerializedValues, AllocaMap,
- VisitedLiveValues);
-
- if (ClobberNonLive) {
- // As a debugging aid, pretend that an unrelocated pointer becomes null at
- // the gc.statepoint. This will turn some subtle GC problems into
- // slightly easier to debug SEGVs. Note that on large IR files with
- // lots of gc.statepoints this is extremely costly both memory and time
- // wise.
- SmallVector<AllocaInst *, 64> ToClobber;
- for (auto Pair : AllocaMap) {
- Value *Def = Pair.first;
- AllocaInst *Alloca = Pair.second;
-
- // This value was relocated
- if (VisitedLiveValues.count(Def)) {
- continue;
- }
- ToClobber.push_back(Alloca);
- }
-
- auto InsertClobbersAt = [&](Instruction *IP) {
- for (auto *AI : ToClobber) {
- auto PT = cast<PointerType>(AI->getAllocatedType());
- Constant *CPN = ConstantPointerNull::get(PT);
- new StoreInst(CPN, AI, IP);
- }
- };
-
- // Insert the clobbering stores. These may get intermixed with the
- // gc.results and gc.relocates, but that's fine.
- if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
- InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
- InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
- } else {
- InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
- }
- }
- }
-
- // Update use with load allocas and add store for gc_relocated.
- for (auto Pair : AllocaMap) {
- Value *Def = Pair.first;
- AllocaInst *Alloca = Pair.second;
-
- // We pre-record the uses of allocas so that we don't have to worry about
- // a later update that changes the user information.
-
- SmallVector<Instruction *, 20> Uses;
- // PERF: trade a linear scan for repeated reallocation
- Uses.reserve(Def->getNumUses());
- for (User *U : Def->users()) {
- if (!isa<ConstantExpr>(U)) {
- // If the def has a ConstantExpr use, then the def is either a
- // ConstantExpr use itself or null. In either case
- // (recursively in the first, directly in the second), the oop
- // it is ultimately dependent on is null and this particular
- // use does not need to be fixed up.
- Uses.push_back(cast<Instruction>(U));
- }
- }
-
- llvm::sort(Uses);
- auto Last = std::unique(Uses.begin(), Uses.end());
- Uses.erase(Last, Uses.end());
-
- for (Instruction *Use : Uses) {
- if (isa<PHINode>(Use)) {
- PHINode *Phi = cast<PHINode>(Use);
- for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
- if (Def == Phi->getIncomingValue(i)) {
- LoadInst *Load =
- new LoadInst(Alloca->getAllocatedType(), Alloca, "",
- Phi->getIncomingBlock(i)->getTerminator());
- Phi->setIncomingValue(i, Load);
- }
- }
- } else {
- LoadInst *Load =
- new LoadInst(Alloca->getAllocatedType(), Alloca, "", Use);
- Use->replaceUsesOfWith(Def, Load);
- }
- }
-
- // Emit store for the initial gc value. Store must be inserted after load,
- // otherwise store will be in alloca's use list and an extra load will be
- // inserted before it.
- StoreInst *Store = new StoreInst(Def, Alloca, /*volatile*/ false,
- DL.getABITypeAlign(Def->getType()));
- if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
- if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
- // InvokeInst is a terminator so the store needs to be inserted into its
- // normal destination block.
- BasicBlock *NormalDest = Invoke->getNormalDest();
- Store->insertBefore(NormalDest->getFirstNonPHI());
- } else {
- assert(!Inst->isTerminator() &&
- "The only terminator that can produce a value is "
- "InvokeInst which is handled above.");
- Store->insertAfter(Inst);
- }
- } else {
- assert(isa<Argument>(Def));
- Store->insertAfter(cast<Instruction>(Alloca));
- }
- }
-
- assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
- "we must have the same allocas with lives");
- if (!PromotableAllocas.empty()) {
- // Apply mem2reg to promote alloca to SSA
- PromoteMemToReg(PromotableAllocas, DT);
- }
-
-#ifndef NDEBUG
- for (auto &I : F.getEntryBlock())
- if (isa<AllocaInst>(I))
- InitialAllocaNum--;
- assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
-#endif
-}
-
-/// Implement a unique function which doesn't require we sort the input
-/// vector. Doing so has the effect of changing the output of a couple of
-/// tests in ways which make them less useful in testing fused safepoints.
-template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
- SmallSet<T, 8> Seen;
+ }
+ }
+
+ // Create the statepoint given all the arguments
+ GCStatepointInst *Token = nullptr;
+ if (auto *CI = dyn_cast<CallInst>(Call)) {
+ CallInst *SPCall = Builder.CreateGCStatepointCall(
+ StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
+ TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
+
+ SPCall->setTailCallKind(CI->getTailCallKind());
+ SPCall->setCallingConv(CI->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes. If we can handle this set of attributes, set up the
+ // function attrs directly on the statepoint and return the attrs later for
+ // the gc_result intrinsic.
+ SPCall->setAttributes(
+ legalizeCallAttributes(CI->getContext(), CI->getAttributes()));
+
+ Token = cast<GCStatepointInst>(SPCall);
+
+ // Put the following gc_result and gc_relocate calls immediately after the
+ // old call (which we're about to delete).
+ assert(CI->getNextNode() && "Not a terminator, must have next!");
+ Builder.SetInsertPoint(CI->getNextNode());
+ Builder.SetCurrentDebugLocation(CI->getNextNode()->getDebugLoc());
+ } else {
+ auto *II = cast<InvokeInst>(Call);
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *SPInvoke = Builder.CreateGCStatepointInvoke(
+ StatepointID, NumPatchBytes, CallTarget, II->getNormalDest(),
+ II->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs,
+ "statepoint_token");
+
+ SPInvoke->setCallingConv(II->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes. If we can handle this set of attributes, set up the
+ // function attrs directly on the statepoint and return the attrs later for
+ // the gc_result intrinsic.
+ SPInvoke->setAttributes(
+ legalizeCallAttributes(II->getContext(), II->getAttributes()));
+
+ Token = cast<GCStatepointInst>(SPInvoke);
+
+ // Generate gc relocates in exceptional path
+ BasicBlock *UnwindBlock = II->getUnwindDest();
+ assert(!isa<PHINode>(UnwindBlock->begin()) &&
+ UnwindBlock->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(II->getDebugLoc());
+
+ // Attach exceptional gc relocates to the landingpad.
+ Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+ Result.UnwindToken = ExceptionalToken;
+
+ CreateGCRelocates(LiveVariables, BasePtrs, ExceptionalToken, Builder);
+
+ // Generate gc relocates and returns for normal block
+ BasicBlock *NormalDest = II->getNormalDest();
+ assert(!isa<PHINode>(NormalDest->begin()) &&
+ NormalDest->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
+
+ // gc relocates will be generated later as if it were a regular call
+ // statepoint.
+ }
+ assert(Token && "Should be set in one of the above branches!");
+
+ if (IsDeoptimize) {
+ // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we
+ // transform the tail-call like structure to a call to a void function
+ // followed by unreachable to get better codegen.
+ Replacements.push_back(
+ DeferredReplacement::createDeoptimizeReplacement(Call));
+ } else {
+ Token->setName("statepoint_token");
+ if (!Call->getType()->isVoidTy() && !Call->use_empty()) {
+ StringRef Name = Call->hasName() ? Call->getName() : "";
+ CallInst *GCResult = Builder.CreateGCResult(Token, Call->getType(), Name);
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ Call->getAttributes().getRetAttributes()));
+
+ // We cannot RAUW or delete the original call because it could be in the
+ // live set of some other safepoint, in which case that safepoint's
+ // PartiallyConstructedSafepointRecord will hold a raw pointer to this
+ // llvm::Instruction. Instead, we defer the replacement and deletion to
+ // after the live sets have been made explicit in the IR, and we no longer
+ // have raw pointers to worry about.
+ Replacements.emplace_back(
+ DeferredReplacement::createRAUW(Call, GCResult));
+ } else {
+ Replacements.emplace_back(DeferredReplacement::createDelete(Call));
+ }
+ }
+
+ Result.StatepointToken = Token;
+
+ // Second, create a gc.relocate for every live variable
+ CreateGCRelocates(LiveVariables, BasePtrs, Token, Builder);
+}
+
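+// Illustrative sketch (made-up names; exact operand lists elided): assuming an
+// original call
+//   %ret = call i8 addrspace(1)* @foo(i8 addrspace(1)* %obj)
+// with %obj live across it, the rewrite above roughly produces
+//   %tok = call token @llvm.experimental.gc.statepoint...(..., @foo, ...)
+//   %ret = call i8 addrspace(1)* @llvm.experimental.gc.result...(token %tok)
+// and the relocations for %obj are attached to %tok by CreateGCRelocates.
+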
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+//
+// WARNING: Does not do any fixup to adjust users of the original live
+// values. That's the caller's responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, CallBase *Call,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ const auto &LiveSet = Result.LiveSet;
+ const auto &PointerToBase = Result.PointerToBase;
+
+ // Convert to vector for efficient cross referencing.
+ SmallVector<Value *, 64> BaseVec, LiveVec;
+ LiveVec.reserve(LiveSet.size());
+ BaseVec.reserve(LiveSet.size());
+ for (Value *L : LiveSet) {
+ LiveVec.push_back(L);
+ assert(PointerToBase.count(L));
+ Value *Base = PointerToBase.find(L)->second;
+ BaseVec.push_back(Base);
+ }
+ assert(LiveVec.size() == BaseVec.size());
+
+ // Do the actual rewriting and delete the old statepoint
+ makeStatepointExplicitImpl(Call, BaseVec, LiveVec, Result, Replacements);
+}
+
+// Helper function for the relocationViaAlloca.
+//
+// It receives an iterator range over the statepoint's gc relocates and emits
+// a store to the assigned location (via AllocaMap) for each one of them. It
+// adds the visited values into the VisitedLiveValues set, which we will later
+// use for sanity checking.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
+ DenseMap<Value *, AllocaInst *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+ for (User *U : GCRelocs) {
+ GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
+ if (!Relocate)
+ continue;
+
+ Value *OriginalValue = Relocate->getDerivedPtr();
+ assert(AllocaMap.count(OriginalValue));
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ // Emit store into the related alloca
+ // All gc_relocates are i8 addrspace(1)* typed, so each must be bitcast to
+ // the correct type according to its alloca.
+ assert(Relocate->getNextNode() &&
+ "Should always have one since it's not a terminator");
+ IRBuilder<> Builder(Relocate->getNextNode());
+ Value *CastedRelocatedValue =
+ Builder.CreateBitCast(Relocate,
+ cast<AllocaInst>(Alloca)->getAllocatedType(),
+ suffixed_name_or(Relocate, ".casted", ""));
+
+ new StoreInst(CastedRelocatedValue, Alloca,
+ cast<Instruction>(CastedRelocatedValue)->getNextNode());
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
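+// Illustrative sketch (made-up names): for a gc.relocate %p.rel of a live
+// value %p whose assigned slot is %p.alloca, the helper above emits roughly
+//   %p.rel.casted = bitcast i8 addrspace(1)* %p.rel to <alloca type>
+//   store <alloca type> %p.rel.casted, <alloca type>* %p.alloca
+// immediately after the gc.relocate call.
+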
+// Helper function for the "relocationViaAlloca". Similar to the
+// "insertRelocationStores" but works for rematerialized values.
+static void insertRematerializationStores(
+ const RematerializedValueMapTy &RematerializedValues,
+ DenseMap<Value *, AllocaInst *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+ for (auto RematerializedValuePair: RematerializedValues) {
+ Instruction *RematerializedValue = RematerializedValuePair.first;
+ Value *OriginalValue = RematerializedValuePair.second;
+
+ assert(AllocaMap.count(OriginalValue) &&
+ "Can not find alloca for rematerialized value");
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ new StoreInst(RematerializedValue, Alloca,
+ RematerializedValue->getNextNode());
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
+/// Do all the relocation update via allocas and mem2reg
+static void relocationViaAlloca(
+ Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+ ArrayRef<PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+ // record initial number of (static) allocas; we'll check we have the same
+ // number when we get done.
+ int InitialAllocaNum = 0;
+ for (Instruction &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum++;
+#endif
+
+ // TODO-PERF: change data structures, reserve
+ DenseMap<Value *, AllocaInst *> AllocaMap;
+ SmallVector<AllocaInst *, 200> PromotableAllocas;
+ // Used later to check that we have enough allocas to store all values
+ std::size_t NumRematerializedValues = 0;
+ PromotableAllocas.reserve(Live.size());
+
+ // Emit alloca for "LiveValue" and record it in "allocaMap" and
+ // "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto emitAllocaFor = [&](Value *LiveValue) {
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
+ F.getEntryBlock().getFirstNonPHI());
+ AllocaMap[LiveValue] = Alloca;
+ PromotableAllocas.push_back(Alloca);
+ };
+
+ // Emit alloca for each live gc pointer
+ for (Value *V : Live)
+ emitAllocaFor(V);
+
+ // Emit allocas for rematerialized values
+ for (const auto &Info : Records)
+ for (auto RematerializedValuePair : Info.RematerializedValues) {
+ Value *OriginalValue = RematerializedValuePair.second;
+ if (AllocaMap.count(OriginalValue) != 0)
+ continue;
+
+ emitAllocaFor(OriginalValue);
+ ++NumRematerializedValues;
+ }
+
+ // The next two loops are part of the same conceptual operation. We need to
+ // insert a store to the alloca after the original def and at each
+ // redefinition. We need to insert a load before each use. These are split
+ // into distinct loops for performance reasons.
+
+ // Update gc pointer after each statepoint: either store a relocated value or
+ // null (if no relocated value was found for this gc pointer and it is not a
+ // gc_result). This must happen before we update the statepoint with a load of
+ // the alloca, otherwise we lose the link between the statepoint and the old def.
+ for (const auto &Info : Records) {
+ Value *Statepoint = Info.StatepointToken;
+
+ // This will be used for consistency check
+ DenseSet<Value *> VisitedLiveValues;
+
+ // Insert stores for normal statepoint gc relocates
+ insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
+
+ // If it was an invoke statepoint, we will also insert
+ // stores for the exceptional-path gc relocates.
+ if (isa<InvokeInst>(Statepoint)) {
+ insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
+ VisitedLiveValues);
+ }
+
+ // Do the same thing for the rematerialized values
+ insertRematerializationStores(Info.RematerializedValues, AllocaMap,
+ VisitedLiveValues);
+
+ if (ClobberNonLive) {
+ // As a debugging aid, pretend that an unrelocated pointer becomes null at
+ // the gc.statepoint. This will turn some subtle GC problems into
+ // slightly easier-to-debug SEGVs. Note that on large IR files with
+ // lots of gc.statepoints this is extremely costly in both memory and
+ // time.
+ SmallVector<AllocaInst *, 64> ToClobber;
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = Pair.second;
+
+ // This value was relocated
+ if (VisitedLiveValues.count(Def)) {
+ continue;
+ }
+ ToClobber.push_back(Alloca);
+ }
+
+ auto InsertClobbersAt = [&](Instruction *IP) {
+ for (auto *AI : ToClobber) {
+ auto PT = cast<PointerType>(AI->getAllocatedType());
+ Constant *CPN = ConstantPointerNull::get(PT);
+ new StoreInst(CPN, AI, IP);
+ }
+ };
+
+ // Insert the clobbering stores. These may get intermixed with the
+ // gc.results and gc.relocates, but that's fine.
+ if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+ InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
+ InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
+ } else {
+ InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
+ }
+ }
+ }
+
+ // Update use with load allocas and add store for gc_relocated.
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = Pair.second;
+
+ // We pre-record the uses of allocas so that we don't have to worry about
+ // a later update that changes the user information.
+
+ SmallVector<Instruction *, 20> Uses;
+ // PERF: trade a linear scan for repeated reallocation
+ Uses.reserve(Def->getNumUses());
+ for (User *U : Def->users()) {
+ if (!isa<ConstantExpr>(U)) {
+ // If the def has a ConstantExpr use, then the def is either a
+ // ConstantExpr use itself or null. In either case
+ // (recursively in the first, directly in the second), the oop
+ // it is ultimately dependent on is null and this particular
+ // use does not need to be fixed up.
+ Uses.push_back(cast<Instruction>(U));
+ }
+ }
+
+ llvm::sort(Uses);
+ auto Last = std::unique(Uses.begin(), Uses.end());
+ Uses.erase(Last, Uses.end());
+
+ for (Instruction *Use : Uses) {
+ if (isa<PHINode>(Use)) {
+ PHINode *Phi = cast<PHINode>(Use);
+ for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
+ if (Def == Phi->getIncomingValue(i)) {
+ LoadInst *Load =
+ new LoadInst(Alloca->getAllocatedType(), Alloca, "",
+ Phi->getIncomingBlock(i)->getTerminator());
+ Phi->setIncomingValue(i, Load);
+ }
+ }
+ } else {
+ LoadInst *Load =
+ new LoadInst(Alloca->getAllocatedType(), Alloca, "", Use);
+ Use->replaceUsesOfWith(Def, Load);
+ }
+ }
+
+ // Emit store for the initial gc value. Store must be inserted after load,
+ // otherwise store will be in alloca's use list and an extra load will be
+ // inserted before it.
+ StoreInst *Store = new StoreInst(Def, Alloca, /*volatile*/ false,
+ DL.getABITypeAlign(Def->getType()));
+ if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
+ // InvokeInst is a terminator so the store needs to be inserted into its
+ // normal destination block.
+ BasicBlock *NormalDest = Invoke->getNormalDest();
+ Store->insertBefore(NormalDest->getFirstNonPHI());
+ } else {
+ assert(!Inst->isTerminator() &&
+ "The only terminator that can produce a value is "
+ "InvokeInst which is handled above.");
+ Store->insertAfter(Inst);
+ }
+ } else {
+ assert(isa<Argument>(Def));
+ Store->insertAfter(cast<Instruction>(Alloca));
+ }
+ }
+
+ assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
+ "we must have the same allocas with lives");
+ if (!PromotableAllocas.empty()) {
+ // Apply mem2reg to promote alloca to SSA
+ PromoteMemToReg(PromotableAllocas, DT);
+ }
+
+#ifndef NDEBUG
+ for (auto &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum--;
+ assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
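+// Illustrative sketch of the round trip performed above (made-up names): a
+// live pointer %p that survives a statepoint is handled roughly as
+//   entry:            %p.alloca = alloca i8 addrspace(1)*
+//   after def of %p:  store i8 addrspace(1)* %p, i8 addrspace(1)** %p.alloca
+//   after statepoint: store i8 addrspace(1)* %p.relocated, i8 addrspace(1)** %p.alloca
+//   before each use:  %p.reload = load i8 addrspace(1)*, i8 addrspace(1)** %p.alloca
+// PromoteMemToReg then folds the allocas back into SSA form, leaving the
+// relocations explicit.
+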
+/// Implement a unique function which doesn't require that we sort the input
+/// vector. Sorting has the effect of changing the output of a couple of
+/// tests in ways which make them less useful for testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+ SmallSet<T, 8> Seen;
erase_if(Vec, [&](const T &V) { return !Seen.insert(V).second; });
-}
-
-/// Insert holders so that each Value is obviously live through the entire
-/// lifetime of the call.
-static void insertUseHolderAfter(CallBase *Call, const ArrayRef<Value *> Values,
- SmallVectorImpl<CallInst *> &Holders) {
- if (Values.empty())
- // No values to hold live, might as well not insert the empty holder
- return;
-
- Module *M = Call->getModule();
- // Use a dummy vararg function to actually hold the values live
- FunctionCallee Func = M->getOrInsertFunction(
- "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true));
- if (isa<CallInst>(Call)) {
- // For call safepoints insert dummy calls right after safepoint
- Holders.push_back(
- CallInst::Create(Func, Values, "", &*++Call->getIterator()));
- return;
- }
- // For invoke safepoints insert dummy calls in both the normal and
- // exceptional destination blocks.
- auto *II = cast<InvokeInst>(Call);
- Holders.push_back(CallInst::Create(
- Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
- Holders.push_back(CallInst::Create(
- Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
-}
-
-static void findLiveReferences(
- Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
- MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
- GCPtrLivenessData OriginalLivenessData;
- computeLiveInValues(DT, F, OriginalLivenessData);
- for (size_t i = 0; i < records.size(); i++) {
- struct PartiallyConstructedSafepointRecord &info = records[i];
- analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info);
- }
-}
-
-// Helper function for the "rematerializeLiveValues". It walks the use chain
-// starting from "CurrentValue" until it reaches the root of the chain, i.e.
-// the base or a value it cannot process. Only "simple" values are processed
-// (currently GEPs and no-op casts). The returned root is examined by the
-// callers of findRematerializableChainToBasePointer. Fills the "ChainToBase"
-// array with all visited values.
-static Value* findRematerializableChainToBasePointer(
- SmallVectorImpl<Instruction*> &ChainToBase,
- Value *CurrentValue) {
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
- ChainToBase.push_back(GEP);
- return findRematerializableChainToBasePointer(ChainToBase,
- GEP->getPointerOperand());
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
- if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
- return CI;
-
- ChainToBase.push_back(CI);
- return findRematerializableChainToBasePointer(ChainToBase,
- CI->getOperand(0));
- }
-
- // We have reached the root of the chain, which is either equal to the base or
- // is the first unsupported value along the use chain.
- return CurrentValue;
-}
-
-// Helper function for the "rematerializeLiveValues". Compute cost of the use
-// chain we are going to rematerialize.
+}
+
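+// For example, applied to {b, a, b, c, a} the function above leaves {b, a, c}:
+// duplicates are dropped but the first-seen order is preserved, unlike a
+// sort-then-unique approach.
+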
+/// Insert holders so that each Value is obviously live through the entire
+/// lifetime of the call.
+static void insertUseHolderAfter(CallBase *Call, const ArrayRef<Value *> Values,
+ SmallVectorImpl<CallInst *> &Holders) {
+ if (Values.empty())
+ // No values to hold live, might as well not insert the empty holder
+ return;
+
+ Module *M = Call->getModule();
+ // Use a dummy vararg function to actually hold the values live
+ FunctionCallee Func = M->getOrInsertFunction(
+ "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true));
+ if (isa<CallInst>(Call)) {
+ // For call safepoints insert dummy calls right after safepoint
+ Holders.push_back(
+ CallInst::Create(Func, Values, "", &*++Call->getIterator()));
+ return;
+ }
+ // For invoke safepoints insert dummy calls in both the normal and
+ // exceptional destination blocks.
+ auto *II = cast<InvokeInst>(Call);
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
+}
+
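+// Illustrative sketch (made-up values): for a call safepoint whose deopt state
+// references %a and %b, the helper above inserts, right after the call,
+//   call void (...) @__tmp_use(i8 addrspace(1)* %a, i8 addrspace(1)* %b)
+// The holder is deleted again once liveness has been recomputed.
+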
+static void findLiveReferences(
+ Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ GCPtrLivenessData OriginalLivenessData;
+ computeLiveInValues(DT, F, OriginalLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info);
+ }
+}
+
+// Helper function for the "rematerializeLiveValues". It walks the use chain
+// starting from "CurrentValue" until it reaches the root of the chain, i.e.
+// the base or a value it cannot process. Only "simple" values are processed
+// (currently GEPs and no-op casts). The returned root is examined by the
+// callers of findRematerializableChainToBasePointer. Fills the "ChainToBase"
+// array with all visited values.
+static Value* findRematerializableChainToBasePointer(
+ SmallVectorImpl<Instruction*> &ChainToBase,
+ Value *CurrentValue) {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
+ ChainToBase.push_back(GEP);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ GEP->getPointerOperand());
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
+ if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
+ return CI;
+
+ ChainToBase.push_back(CI);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ CI->getOperand(0));
+ }
+
+ // We have reached the root of the chain, which is either equal to the base or
+ // is the first unsupported value along the use chain.
+ return CurrentValue;
+}
+
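+// Illustrative example (made-up names): given
+//   %c = bitcast i8 addrspace(1)* %base to i32 addrspace(1)*
+//   %d = getelementptr i32, i32 addrspace(1)* %c, i64 4
+// a walk starting from the live value %d fills ChainToBase with {%d, %c} and
+// returns %base as the root of the chain.
+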
+// Helper function for the "rematerializeLiveValues". Compute cost of the use
+// chain we are going to rematerialize.
static InstructionCost
chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain,
- TargetTransformInfo &TTI) {
+ TargetTransformInfo &TTI) {
InstructionCost Cost = 0;
-
- for (Instruction *Instr : Chain) {
- if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
- assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
- "non noop cast is found during rematerialization");
-
- Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy,
+
+ for (Instruction *Instr : Chain) {
+ if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
+ assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
+ "non noop cast is found during rematerialization");
+
+ Type *SrcTy = CI->getOperand(0)->getType();
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy,
TTI::getCastContextHint(CI),
TargetTransformInfo::TCK_SizeAndLatency, CI);
-
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
- // Cost of the address calculation
- Type *ValTy = GEP->getSourceElementType();
- Cost += TTI.getAddressComputationCost(ValTy);
-
- // And cost of the GEP itself
- // TODO: Use TTI->getGEPCost here (it exists, but appears to not be
- // allowed for external usage)
- if (!GEP->hasAllConstantIndices())
- Cost += 2;
-
- } else {
- llvm_unreachable("unsupported instruction type during rematerialization");
- }
- }
-
- return Cost;
-}
-
-static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
- unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
- if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
- OrigRootPhi.getParent() != AlternateRootPhi.getParent())
- return false;
- // Map of incoming values and their corresponding basic blocks of
- // OrigRootPhi.
- SmallDenseMap<Value *, BasicBlock *, 8> CurrentIncomingValues;
- for (unsigned i = 0; i < PhiNum; i++)
- CurrentIncomingValues[OrigRootPhi.getIncomingValue(i)] =
- OrigRootPhi.getIncomingBlock(i);
-
- // Both current and base PHIs should have same incoming values and
- // the same basic blocks corresponding to the incoming values.
- for (unsigned i = 0; i < PhiNum; i++) {
- auto CIVI =
- CurrentIncomingValues.find(AlternateRootPhi.getIncomingValue(i));
- if (CIVI == CurrentIncomingValues.end())
- return false;
- BasicBlock *CurrentIncomingBB = CIVI->second;
- if (CurrentIncomingBB != AlternateRootPhi.getIncomingBlock(i))
- return false;
- }
- return true;
-}
-
-// From the statepoint live set pick values that are cheaper to recompute than
-// to relocate. Remove these values from the live set, rematerialize them after
-// the statepoint, and record them in the "Info" structure. Note that, as with
-// relocated values, we don't do any user adjustments here.
-static void rematerializeLiveValues(CallBase *Call,
- PartiallyConstructedSafepointRecord &Info,
- TargetTransformInfo &TTI) {
- const unsigned int ChainLengthThreshold = 10;
-
- // Record values we are going to delete from this statepoint live set.
- // We cannot do this in the following loop due to iterator invalidation.
- SmallVector<Value *, 32> LiveValuesToBeDeleted;
-
- for (Value *LiveValue: Info.LiveSet) {
- // For each live pointer find its defining chain
- SmallVector<Instruction *, 3> ChainToBase;
- assert(Info.PointerToBase.count(LiveValue));
- Value *RootOfChain =
- findRematerializableChainToBasePointer(ChainToBase,
- LiveValue);
-
- // Nothing to do, or chain is too long
- if ( ChainToBase.size() == 0 ||
- ChainToBase.size() > ChainLengthThreshold)
- continue;
-
- // Handle the scenario where the RootOfChain is not equal to the
- // Base Value, but they are essentially the same phi values.
- if (RootOfChain != Info.PointerToBase[LiveValue]) {
- PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
- PHINode *AlternateRootPhi = dyn_cast<PHINode>(Info.PointerToBase[LiveValue]);
- if (!OrigRootPhi || !AlternateRootPhi)
- continue;
- // PHI nodes that have the same incoming values and belong to the same
- // basic block are essentially the same SSA value. When the original phi
- // has incoming values with different base pointers, the original phi is
- // marked as a conflict, and an additional `AlternateRootPhi` with the same
- // incoming values gets generated by the findBasePointer function. We need
- // to check that the newly generated AlternateRootPhi (the .base version of
- // the phi) and RootOfChain (the original phi node itself) are the same, so
- // that we can rematerialize the gep and casts. This is a workaround for a
- // deficiency in the findBasePointer algorithm.
- if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
- continue;
- // Now that the phi nodes are proved to be the same, assert that
- // findBasePointer's newly generated AlternateRootPhi is present in the
- // liveset of the call.
- assert(Info.LiveSet.count(AlternateRootPhi));
- }
- // Compute cost of this chain
+
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+ // Cost of the address calculation
+ Type *ValTy = GEP->getSourceElementType();
+ Cost += TTI.getAddressComputationCost(ValTy);
+
+ // And cost of the GEP itself
+ // TODO: Use TTI->getGEPCost here (it exists, but appears to not be
+ // allowed for external usage)
+ if (!GEP->hasAllConstantIndices())
+ Cost += 2;
+
+ } else {
+ llvm_unreachable("unsupported instruction type during rematerialization");
+ }
+ }
+
+ return Cost;
+}
+
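+// Illustrative example: for a chain {no-op bitcast, GEP with a non-constant
+// index}, the cost computed above is the target's cast cost (typically free)
+// plus the address computation cost plus 2 for the GEP itself; the exact
+// numbers are target-dependent.
+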
+static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
+ unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
+ if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
+ OrigRootPhi.getParent() != AlternateRootPhi.getParent())
+ return false;
+ // Map of incoming values and their corresponding basic blocks of
+ // OrigRootPhi.
+ SmallDenseMap<Value *, BasicBlock *, 8> CurrentIncomingValues;
+ for (unsigned i = 0; i < PhiNum; i++)
+ CurrentIncomingValues[OrigRootPhi.getIncomingValue(i)] =
+ OrigRootPhi.getIncomingBlock(i);
+
+ // Both current and base PHIs should have same incoming values and
+ // the same basic blocks corresponding to the incoming values.
+ for (unsigned i = 0; i < PhiNum; i++) {
+ auto CIVI =
+ CurrentIncomingValues.find(AlternateRootPhi.getIncomingValue(i));
+ if (CIVI == CurrentIncomingValues.end())
+ return false;
+ BasicBlock *CurrentIncomingBB = CIVI->second;
+ if (CurrentIncomingBB != AlternateRootPhi.getIncomingBlock(i))
+ return false;
+ }
+ return true;
+}
+
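+// Illustrative example (made-up names): the check above accepts the pair
+//   %p      = phi i8 addrspace(1)* [ %a, %left ], [ %b, %right ]
+//   %p.base = phi i8 addrspace(1)* [ %a, %left ], [ %b, %right ]
+// since both phis live in the same block and carry identical incoming
+// value/block pairs.
+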
+// From the statepoint live set pick values that are cheaper to recompute than
+// to relocate. Remove these values from the live set, rematerialize them after
+// the statepoint, and record them in the "Info" structure. Note that, as with
+// relocated values, we don't do any user adjustments here.
+static void rematerializeLiveValues(CallBase *Call,
+ PartiallyConstructedSafepointRecord &Info,
+ TargetTransformInfo &TTI) {
+ const unsigned int ChainLengthThreshold = 10;
+
+ // Record values we are going to delete from this statepoint live set.
+ // We cannot do this in the following loop due to iterator invalidation.
+ SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+ for (Value *LiveValue: Info.LiveSet) {
+ // For each live pointer find its defining chain
+ SmallVector<Instruction *, 3> ChainToBase;
+ assert(Info.PointerToBase.count(LiveValue));
+ Value *RootOfChain =
+ findRematerializableChainToBasePointer(ChainToBase,
+ LiveValue);
+
+ // Nothing to do, or chain is too long
+ if ( ChainToBase.size() == 0 ||
+ ChainToBase.size() > ChainLengthThreshold)
+ continue;
+
+ // Handle the scenario where the RootOfChain is not equal to the
+ // Base Value, but they are essentially the same phi values.
+ if (RootOfChain != Info.PointerToBase[LiveValue]) {
+ PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
+ PHINode *AlternateRootPhi = dyn_cast<PHINode>(Info.PointerToBase[LiveValue]);
+ if (!OrigRootPhi || !AlternateRootPhi)
+ continue;
+ // PHI nodes that have the same incoming values and belong to the same
+ // basic block are essentially the same SSA value. When the original phi
+ // has incoming values with different base pointers, the original phi is
+ // marked as a conflict, and an additional `AlternateRootPhi` with the same
+ // incoming values gets generated by the findBasePointer function. We need
+ // to check that the newly generated AlternateRootPhi (the .base version of
+ // the phi) and RootOfChain (the original phi node itself) are the same, so
+ // that we can rematerialize the gep and casts. This is a workaround for a
+ // deficiency in the findBasePointer algorithm.
+ if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
+ continue;
+ // Now that the phi nodes are proved to be the same, assert that
+ // findBasePointer's newly generated AlternateRootPhi is present in the
+ // liveset of the call.
+ assert(Info.LiveSet.count(AlternateRootPhi));
+ }
+ // Compute cost of this chain
InstructionCost Cost = chainToBasePointerCost(ChainToBase, TTI);
- // TODO: We can also account for cases when we will be able to remove some
- // of the rematerialized values by later optimization passes. I.e if
- // we rematerialized several intersecting chains. Or if original values
- // don't have any uses besides this statepoint.
-
- // For invokes we need to rematerialize each chain twice - for normal and
- // for unwind basic blocks. Model this by multiplying cost by two.
- if (isa<InvokeInst>(Call)) {
- Cost *= 2;
- }
- // If it's too expensive - skip it
- if (Cost >= RematerializationThreshold)
- continue;
-
- // Remove value from the live set
- LiveValuesToBeDeleted.push_back(LiveValue);
-
- // Clone instructions and record them inside "Info" structure
-
- // Walk backwards to visit top-most instructions first
- std::reverse(ChainToBase.begin(), ChainToBase.end());
-
- // Utility function which clones all instructions from "ChainToBase"
- // and inserts them before "InsertBefore". Returns rematerialized value
- // which should be used after statepoint.
- auto rematerializeChain = [&ChainToBase](
- Instruction *InsertBefore, Value *RootOfChain, Value *AlternateLiveBase) {
- Instruction *LastClonedValue = nullptr;
- Instruction *LastValue = nullptr;
- for (Instruction *Instr: ChainToBase) {
- // Only GEP's and casts are supported as we need to be careful to not
- // introduce any new uses of pointers not in the liveset.
- // Note that it's fine to introduce new uses of pointers which were
- // otherwise not used after this statepoint.
- assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
-
- Instruction *ClonedValue = Instr->clone();
- ClonedValue->insertBefore(InsertBefore);
- ClonedValue->setName(Instr->getName() + ".remat");
-
- // If it is not first instruction in the chain then it uses previously
- // cloned value. We should update it to use cloned value.
- if (LastClonedValue) {
- assert(LastValue);
- ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
-#ifndef NDEBUG
- for (auto OpValue : ClonedValue->operand_values()) {
- // Assert that cloned instruction does not use any instructions from
- // this chain other than LastClonedValue
- assert(!is_contained(ChainToBase, OpValue) &&
- "incorrect use in rematerialization chain");
- // Assert that the cloned instruction does not use the RootOfChain
- // or the AlternateLiveBase.
- assert(OpValue != RootOfChain && OpValue != AlternateLiveBase);
- }
-#endif
- } else {
- // For the first instruction, replace the use of unrelocated base i.e.
- // RootOfChain/OrigRootPhi, with the corresponding PHI present in the
- // live set. They have been proved to be the same PHI nodes. Note
- // that the *only* use of the RootOfChain in the ChainToBase list is
- // the first Value in the list.
- if (RootOfChain != AlternateLiveBase)
- ClonedValue->replaceUsesOfWith(RootOfChain, AlternateLiveBase);
- }
-
- LastClonedValue = ClonedValue;
- LastValue = Instr;
- }
- assert(LastClonedValue);
- return LastClonedValue;
- };
-
- // Different cases for calls and invokes. For invokes we need to clone
- // instructions both on normal and unwind path.
- if (isa<CallInst>(Call)) {
- Instruction *InsertBefore = Call->getNextNode();
- assert(InsertBefore);
- Instruction *RematerializedValue = rematerializeChain(
- InsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
- Info.RematerializedValues[RematerializedValue] = LiveValue;
- } else {
- auto *Invoke = cast<InvokeInst>(Call);
-
- Instruction *NormalInsertBefore =
- &*Invoke->getNormalDest()->getFirstInsertionPt();
- Instruction *UnwindInsertBefore =
- &*Invoke->getUnwindDest()->getFirstInsertionPt();
-
- Instruction *NormalRematerializedValue = rematerializeChain(
- NormalInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
- Instruction *UnwindRematerializedValue = rematerializeChain(
- UnwindInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
-
- Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
- Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
- }
- }
-
- // Remove rematerialized values from the live set
- for (auto LiveValue: LiveValuesToBeDeleted) {
- Info.LiveSet.remove(LiveValue);
- }
-}
-
-static bool insertParsePoints(Function &F, DominatorTree &DT,
- TargetTransformInfo &TTI,
- SmallVectorImpl<CallBase *> &ToUpdate) {
-#ifndef NDEBUG
- // sanity check the input
- std::set<CallBase *> Uniqued;
- Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
- assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
-
- for (CallBase *Call : ToUpdate)
- assert(Call->getFunction() == &F);
-#endif
-
- // When inserting gc.relocates for invokes, we need to be able to insert at
- // the top of the successor blocks. See the comment on
- // normalizeForInvokeSafepoint for exactly what is needed. Note that this step
- // may restructure the CFG.
- for (CallBase *Call : ToUpdate) {
- auto *II = dyn_cast<InvokeInst>(Call);
- if (!II)
- continue;
- normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
- normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
- }
-
- // A list of dummy calls added to the IR to keep various values obviously
- // live in the IR. We'll remove all of these when done.
- SmallVector<CallInst *, 64> Holders;
-
- // Insert a dummy call with all of the deopt operands we'll need for the
- // actual safepoint insertion as arguments. This ensures reference operands
- // in the deopt argument list are considered live through the safepoint (and
- // thus makes sure they get relocated.)
- for (CallBase *Call : ToUpdate) {
- SmallVector<Value *, 64> DeoptValues;
-
- for (Value *Arg : GetDeoptBundleOperands(Call)) {
- assert(!isUnhandledGCPointerType(Arg->getType()) &&
- "support for FCA unimplemented");
- if (isHandledGCPointerType(Arg->getType()))
- DeoptValues.push_back(Arg);
- }
-
- insertUseHolderAfter(Call, DeoptValues, Holders);
- }
-
- SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
-
- // A) Identify all gc pointers which are statically live at the given call
- // site.
- findLiveReferences(F, DT, ToUpdate, Records);
-
- // B) Find the base pointers for each live pointer
- /* scope for caching */ {
- // Cache the 'defining value' relation used in the computation and
- // insertion of base phis and selects. This ensures that we don't insert
- // large numbers of duplicate base_phis.
- DefiningValueMapTy DVCache;
-
- for (size_t i = 0; i < Records.size(); i++) {
- PartiallyConstructedSafepointRecord &info = Records[i];
- findBasePointers(DT, DVCache, ToUpdate[i], info);
- }
- } // end of cache scope
-
- // The base phi insertion logic (for any safepoint) may have inserted new
- // instructions which are now live at some safepoint. The simplest such
- // example is:
- // loop:
- // phi a <-- will be a new base_phi here
- // safepoint 1 <-- that needs to be live here
- // gep a + 1
- // safepoint 2
- // br loop
- // We insert some dummy calls after each safepoint to definitely hold live
- // the base pointers which were identified for that safepoint. We'll then
- // ask liveness for _every_ base inserted to see what is now live. Then we
- // remove the dummy calls.
- Holders.reserve(Holders.size() + Records.size());
- for (size_t i = 0; i < Records.size(); i++) {
- PartiallyConstructedSafepointRecord &Info = Records[i];
-
- SmallVector<Value *, 128> Bases;
- for (auto Pair : Info.PointerToBase)
- Bases.push_back(Pair.second);
-
- insertUseHolderAfter(ToUpdate[i], Bases, Holders);
- }
-
- // By selecting base pointers, we've effectively inserted new uses. Thus, we
- // need to rerun liveness. We may *also* have inserted new defs, but that's
- // not the key issue.
- recomputeLiveInValues(F, DT, ToUpdate, Records);
-
- if (PrintBasePointers) {
- for (auto &Info : Records) {
- errs() << "Base Pairs: (w/Relocation)\n";
- for (auto Pair : Info.PointerToBase) {
- errs() << " derived ";
- Pair.first->printAsOperand(errs(), false);
- errs() << " base ";
- Pair.second->printAsOperand(errs(), false);
- errs() << "\n";
- }
- }
- }
-
- // It is possible that non-constant live variables have a constant base. For
- // example, a GEP with a variable offset from a global. In this case we can
- // remove it from the liveset. We already don't add constants to the liveset
- // because we assume they won't move at runtime and the GC doesn't need to be
- // informed about them. The same reasoning applies if the base is constant.
- // Note that the relocation placement code relies on this filtering for
- // correctness as it expects the base to be in the liveset, which isn't true
- // if the base is constant.
- for (auto &Info : Records)
- for (auto &BasePair : Info.PointerToBase)
- if (isa<Constant>(BasePair.second))
- Info.LiveSet.remove(BasePair.first);
-
- for (CallInst *CI : Holders)
- CI->eraseFromParent();
-
- Holders.clear();
-
- // In order to reduce the live set of a statepoint we might choose to rematerialize
- // some values instead of relocating them. This is purely an optimization and
- // does not influence correctness.
- for (size_t i = 0; i < Records.size(); i++)
- rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
-
- // We need this to safely RAUW and delete call or invoke return values that
- // may themselves be live over a statepoint. For details, please see usage in
- // makeStatepointExplicitImpl.
- std::vector<DeferredReplacement> Replacements;
-
- // Now run through and replace the existing statepoints with new ones with
- // the live variables listed. We do not yet update uses of the values being
- // relocated. We have references to live variables that need to
- // survive to the last iteration of this loop. (By construction, the
- // previous statepoint cannot be a live variable, thus we can remove
- // the old statepoint calls as we go.)
- for (size_t i = 0; i < Records.size(); i++)
- makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
-
- ToUpdate.clear(); // prevent accidental use of invalid calls.
-
- for (auto &PR : Replacements)
- PR.doReplacement();
-
- Replacements.clear();
-
- for (auto &Info : Records) {
- // These live sets may contain stale Value pointers, since we replaced calls
- // with operand bundles with calls wrapped in gc.statepoint, and some of
- // those calls may have been def'ing live gc pointers. Clear these out to
- // avoid accidentally using them.
- //
- // TODO: We should create a separate data structure that does not contain
- // these live sets, and migrate to using that data structure from this point
- // onward.
- Info.LiveSet.clear();
- Info.PointerToBase.clear();
- }
-
- // Do all the fixups of the original live variables to their relocated selves
- SmallVector<Value *, 128> Live;
- for (size_t i = 0; i < Records.size(); i++) {
- PartiallyConstructedSafepointRecord &Info = Records[i];
-
- // We can't simply save the live set from the original insertion. One of
- // the live values might be the result of a call which needs a safepoint.
- // That Value* no longer exists and we need to use the new gc_result.
- // Thankfully, the live set is embedded in the statepoint (and updated), so
- // we just grab that.
+ // TODO: We can also account for cases when we will be able to remove some
+ // of the rematerialized values by later optimization passes. I.e if
+ // we rematerialized several intersecting chains. Or if original values
+ // don't have any uses besides this statepoint.
+
+ // For invokes we need to rematerialize each chain twice - for normal and
+ // for unwind basic blocks. Model this by multiplying cost by two.
+ if (isa<InvokeInst>(Call)) {
+ Cost *= 2;
+ }
+ // If it's too expensive - skip it
+ if (Cost >= RematerializationThreshold)
+ continue;
+
+ // Remove value from the live set
+ LiveValuesToBeDeleted.push_back(LiveValue);
+
+ // Clone instructions and record them inside "Info" structure
+
+ // Walk backwards to visit top-most instructions first
+ std::reverse(ChainToBase.begin(), ChainToBase.end());
+
+ // Utility function which clones all instructions from "ChainToBase"
+ // and inserts them before "InsertBefore". Returns rematerialized value
+ // which should be used after statepoint.
+ auto rematerializeChain = [&ChainToBase](
+ Instruction *InsertBefore, Value *RootOfChain, Value *AlternateLiveBase) {
+ Instruction *LastClonedValue = nullptr;
+ Instruction *LastValue = nullptr;
+ for (Instruction *Instr: ChainToBase) {
+ // Only GEP's and casts are supported as we need to be careful to not
+ // introduce any new uses of pointers not in the liveset.
+ // Note that it's fine to introduce new uses of pointers which were
+ // otherwise not used after this statepoint.
+ assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
+
+ Instruction *ClonedValue = Instr->clone();
+ ClonedValue->insertBefore(InsertBefore);
+ ClonedValue->setName(Instr->getName() + ".remat");
+
+ // If it is not first instruction in the chain then it uses previously
+ // cloned value. We should update it to use cloned value.
+ if (LastClonedValue) {
+ assert(LastValue);
+ ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
+#ifndef NDEBUG
+ for (auto OpValue : ClonedValue->operand_values()) {
+ // Assert that cloned instruction does not use any instructions from
+ // this chain other than LastClonedValue
+ assert(!is_contained(ChainToBase, OpValue) &&
+ "incorrect use in rematerialization chain");
+ // Assert that the cloned instruction does not use the RootOfChain
+ // or the AlternateLiveBase.
+ assert(OpValue != RootOfChain && OpValue != AlternateLiveBase);
+ }
+#endif
+ } else {
+ // For the first instruction, replace the use of unrelocated base i.e.
+ // RootOfChain/OrigRootPhi, with the corresponding PHI present in the
+ // live set. They have been proved to be the same PHI nodes. Note
+ // that the *only* use of the RootOfChain in the ChainToBase list is
+ // the first Value in the list.
+ if (RootOfChain != AlternateLiveBase)
+ ClonedValue->replaceUsesOfWith(RootOfChain, AlternateLiveBase);
+ }
+
+ LastClonedValue = ClonedValue;
+ LastValue = Instr;
+ }
+ assert(LastClonedValue);
+ return LastClonedValue;
+ };
+
+ // Different cases for calls and invokes. For invokes we need to clone
+ // instructions both on normal and unwind path.
+ if (isa<CallInst>(Call)) {
+ Instruction *InsertBefore = Call->getNextNode();
+ assert(InsertBefore);
+ Instruction *RematerializedValue = rematerializeChain(
+ InsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Info.RematerializedValues[RematerializedValue] = LiveValue;
+ } else {
+ auto *Invoke = cast<InvokeInst>(Call);
+
+ Instruction *NormalInsertBefore =
+ &*Invoke->getNormalDest()->getFirstInsertionPt();
+ Instruction *UnwindInsertBefore =
+ &*Invoke->getUnwindDest()->getFirstInsertionPt();
+
+ Instruction *NormalRematerializedValue = rematerializeChain(
+ NormalInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Instruction *UnwindRematerializedValue = rematerializeChain(
+ UnwindInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+
+ Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
+ Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
+ }
+ }
+
+ // Remove rematerialized values from the live set
+ for (auto LiveValue: LiveValuesToBeDeleted) {
+ Info.LiveSet.remove(LiveValue);
+ }
+}
+
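+// Illustrative sketch (made-up names): if %d = getelementptr ... %base, i64 4
+// is live across a call-form statepoint, the logic above clones it right after
+// the statepoint as %d.remat, records %d.remat -> %d in RematerializedValues,
+// and drops %d from the live set; the later alloca rewrite then feeds the
+// clone the relocated base.
+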
+static bool insertParsePoints(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ SmallVectorImpl<CallBase *> &ToUpdate) {
+#ifndef NDEBUG
+ // sanity check the input
+ std::set<CallBase *> Uniqued;
+ Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
+ assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
+
+ for (CallBase *Call : ToUpdate)
+ assert(Call->getFunction() == &F);
+#endif
+
+ // When inserting gc.relocates for invokes, we need to be able to insert at
+ // the top of the successor blocks. See the comment on
+ // normalizeForInvokeSafepoint for exactly what is needed. Note that this step
+ // may restructure the CFG.
+ for (CallBase *Call : ToUpdate) {
+ auto *II = dyn_cast<InvokeInst>(Call);
+ if (!II)
+ continue;
+ normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
+ normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
+ }
+
+ // A list of dummy calls added to the IR to keep various values obviously
+ // live in the IR. We'll remove all of these when done.
+ SmallVector<CallInst *, 64> Holders;
+
+ // Insert a dummy call with all of the deopt operands we'll need for the
+ // actual safepoint insertion as arguments. This ensures reference operands
+ // in the deopt argument list are considered live through the safepoint (and
+ // thus makes sure they get relocated.)
+ for (CallBase *Call : ToUpdate) {
+ SmallVector<Value *, 64> DeoptValues;
+
+ for (Value *Arg : GetDeoptBundleOperands(Call)) {
+ assert(!isUnhandledGCPointerType(Arg->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(Arg->getType()))
+ DeoptValues.push_back(Arg);
+ }
+
+ insertUseHolderAfter(Call, DeoptValues, Holders);
+ }
+
+ SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
+
+ // A) Identify all gc pointers which are statically live at the given call
+ // site.
+ findLiveReferences(F, DT, ToUpdate, Records);
+
+ // B) Find the base pointers for each live pointer
+ /* scope for caching */ {
+ // Cache the 'defining value' relation used in the computation and
+ // insertion of base phis and selects. This ensures that we don't insert
+ // large numbers of duplicate base_phis.
+ DefiningValueMapTy DVCache;
+
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &info = Records[i];
+ findBasePointers(DT, DVCache, ToUpdate[i], info);
+ }
+ } // end of cache scope
+
+ // The base phi insertion logic (for any safepoint) may have inserted new
+ // instructions which are now live at some safepoint. The simplest such
+ // example is:
+ // loop:
+ // phi a <-- will be a new base_phi here
+ // safepoint 1 <-- that needs to be live here
+ // gep a + 1
+ // safepoint 2
+ // br loop
+ // We insert some dummy calls after each safepoint to definitely hold live
+ // the base pointers which were identified for that safepoint. We'll then
+ // ask liveness for _every_ base inserted to see what is now live. Then we
+ // remove the dummy calls.
+ Holders.reserve(Holders.size() + Records.size());
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ SmallVector<Value *, 128> Bases;
+ for (auto Pair : Info.PointerToBase)
+ Bases.push_back(Pair.second);
+
+ insertUseHolderAfter(ToUpdate[i], Bases, Holders);
+ }
+
+ // By selecting base pointers, we've effectively inserted new uses. Thus, we
+ // need to rerun liveness. We may *also* have inserted new defs, but that's
+ // not the key issue.
+ recomputeLiveInValues(F, DT, ToUpdate, Records);
+
+ if (PrintBasePointers) {
+ for (auto &Info : Records) {
+ errs() << "Base Pairs: (w/Relocation)\n";
+ for (auto Pair : Info.PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+ }
+
+ // It is possible that non-constant live variables have a constant base. For
+ // example, a GEP with a variable offset from a global. In this case we can
+ // remove it from the liveset. We already don't add constants to the liveset
+ // because we assume they won't move at runtime and the GC doesn't need to be
+ // informed about them. The same reasoning applies if the base is constant.
+ // Note that the relocation placement code relies on this filtering for
+ // correctness as it expects the base to be in the liveset, which isn't true
+ // if the base is constant.
+ for (auto &Info : Records)
+ for (auto &BasePair : Info.PointerToBase)
+ if (isa<Constant>(BasePair.second))
+ Info.LiveSet.remove(BasePair.first);
+
+ for (CallInst *CI : Holders)
+ CI->eraseFromParent();
+
+ Holders.clear();
+
+ // In order to reduce the live set of a statepoint we might choose to rematerialize
+ // some values instead of relocating them. This is purely an optimization and
+ // does not influence correctness.
+ for (size_t i = 0; i < Records.size(); i++)
+ rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
+
+ // We need this to safely RAUW and delete call or invoke return values that
+ // may themselves be live over a statepoint. For details, please see usage in
+ // makeStatepointExplicitImpl.
+ std::vector<DeferredReplacement> Replacements;
+
+ // Now run through and replace the existing statepoints with new ones with
+ // the live variables listed. We do not yet update uses of the values being
+ // relocated. We have references to live variables that need to
+ // survive to the last iteration of this loop. (By construction, the
+ // previous statepoint cannot be a live variable, thus we can remove
+ // the old statepoint calls as we go.)
+ for (size_t i = 0; i < Records.size(); i++)
+ makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
+
+ ToUpdate.clear(); // prevent accidental use of invalid calls.
+
+ for (auto &PR : Replacements)
+ PR.doReplacement();
+
+ Replacements.clear();
+
+ for (auto &Info : Records) {
+ // These live sets may contain stale Value pointers, since we replaced calls
+ // with operand bundles with calls wrapped in gc.statepoint, and some of
+ // those calls may have been def'ing live gc pointers. Clear these out to
+ // avoid accidentally using them.
+ //
+ // TODO: We should create a separate data structure that does not contain
+ // these live sets, and migrate to using that data structure from this point
+ // onward.
+ Info.LiveSet.clear();
+ Info.PointerToBase.clear();
+ }
+
+ // Do all the fixups of the original live variables to their relocated selves
+ SmallVector<Value *, 128> Live;
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ // We can't simply save the live set from the original insertion. One of
+ // the live values might be the result of a call which needs a safepoint.
+ // That Value* no longer exists and we need to use the new gc_result.
+ // Thankfully, the live set is embedded in the statepoint (and updated), so
+ // we just grab that.
llvm::append_range(Live, Info.StatepointToken->gc_args());
-#ifndef NDEBUG
- // Do some basic sanity checks on our liveness results before performing
- // relocation. Relocation can and will turn mistakes in liveness results
- // into nonsensical code which is much harder to debug.
- // TODO: It would be nice to test consistency as well
- assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
- "statepoint must be reachable or liveness is meaningless");
- for (Value *V : Info.StatepointToken->gc_args()) {
- if (!isa<Instruction>(V))
- // Non-instruction values trivially dominate all possible uses
- continue;
- auto *LiveInst = cast<Instruction>(V);
- assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
- "unreachable values should never be live");
- assert(DT.dominates(LiveInst, Info.StatepointToken) &&
- "basic SSA liveness expectation violated by liveness analysis");
- }
-#endif
- }
- unique_unsorted(Live);
-
-#ifndef NDEBUG
- // sanity check
- for (auto *Ptr : Live)
- assert(isHandledGCPointerType(Ptr->getType()) &&
- "must be a gc pointer type");
-#endif
-
- relocationViaAlloca(F, DT, Live, Records);
- return !Records.empty();
-}
-
-// Handles both return values and arguments for Functions and calls.
-template <typename AttrHolder>
-static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
- unsigned Index) {
- AttrBuilder R;
- if (AH.getDereferenceableBytes(Index))
- R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
- AH.getDereferenceableBytes(Index)));
- if (AH.getDereferenceableOrNullBytes(Index))
- R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
- AH.getDereferenceableOrNullBytes(Index)));
- if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
- R.addAttribute(Attribute::NoAlias);
-
- if (!R.empty())
- AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
-}
-
-static void stripNonValidAttributesFromPrototype(Function &F) {
- LLVMContext &Ctx = F.getContext();
-
- for (Argument &A : F.args())
- if (isa<PointerType>(A.getType()))
- RemoveNonValidAttrAtIndex(Ctx, F,
- A.getArgNo() + AttributeList::FirstArgIndex);
-
- if (isa<PointerType>(F.getReturnType()))
- RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
-}
-
-/// Certain metadata on instructions are invalid after running RS4GC.
-/// Optimizations that run after RS4GC can incorrectly use this metadata to
-/// optimize functions. We drop such metadata on the instruction.
-static void stripInvalidMetadataFromInstruction(Instruction &I) {
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
- return;
- // These are the metadata kinds that are still valid on loads and stores
- // after RS4GC.
- // The metadata implying dereferenceability and noalias are (conservatively)
- // dropped. This is because semantically, after RewriteStatepointsForGC runs,
- // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
- // touch the entire heap including noalias objects. Note: The reasoning is
- // same as stripping the dereferenceability and noalias attributes that are
- // analogous to the metadata counterparts.
- // We also drop the invariant.load metadata on the load because that metadata
- // implies the address operand to the load points to memory that is never
- // changed once it became dereferenceable. This is no longer true after RS4GC.
- // Similar reasoning applies to invariant.group metadata, which applies to
- // loads within a group.
- unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_alias_scope,
- LLVMContext::MD_nontemporal,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_align,
- LLVMContext::MD_type};
-
- // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
- I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
-}
-
-static void stripNonValidDataFromBody(Function &F) {
- if (F.empty())
- return;
-
- LLVMContext &Ctx = F.getContext();
- MDBuilder Builder(Ctx);
-
- // Set of invariant.start instructions that we need to remove.
- // Use this to avoid invalidating the instruction iterator.
- SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
-
- for (Instruction &I : instructions(F)) {
- // invariant.start on memory location implies that the referenced memory
- // location is constant and unchanging. This is no longer true after
- // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
- // which frees the entire heap and the presence of invariant.start allows
- // the optimizer to sink the load of a memory location past a statepoint,
- // which is incorrect.
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::invariant_start) {
- InvariantStartInstructions.push_back(II);
- continue;
- }
-
- if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) {
- MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag);
- I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
- }
-
- stripInvalidMetadataFromInstruction(I);
-
- if (auto *Call = dyn_cast<CallBase>(&I)) {
- for (int i = 0, e = Call->arg_size(); i != e; i++)
- if (isa<PointerType>(Call->getArgOperand(i)->getType()))
- RemoveNonValidAttrAtIndex(Ctx, *Call,
- i + AttributeList::FirstArgIndex);
- if (isa<PointerType>(Call->getType()))
- RemoveNonValidAttrAtIndex(Ctx, *Call, AttributeList::ReturnIndex);
- }
- }
-
- // Delete the invariant.start instructions and RAUW undef.
- for (auto *II : InvariantStartInstructions) {
- II->replaceAllUsesWith(UndefValue::get(II->getType()));
- II->eraseFromParent();
- }
-}
-
-/// Returns true if this function should be rewritten by this pass. The main
-/// point of this function is as an extension point for custom logic.
-static bool shouldRewriteStatepointsIn(Function &F) {
- // TODO: This should check the GCStrategy
- if (F.hasGC()) {
- const auto &FunctionGCName = F.getGC();
- const StringRef StatepointExampleName("statepoint-example");
- const StringRef CoreCLRName("coreclr");
- return (StatepointExampleName == FunctionGCName) ||
- (CoreCLRName == FunctionGCName);
- } else
- return false;
-}
-
-static void stripNonValidData(Module &M) {
-#ifndef NDEBUG
- assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
-#endif
-
- for (Function &F : M)
- stripNonValidAttributesFromPrototype(F);
-
- for (Function &F : M)
- stripNonValidDataFromBody(F);
-}
-
-bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
- TargetTransformInfo &TTI,
- const TargetLibraryInfo &TLI) {
- assert(!F.isDeclaration() && !F.empty() &&
- "need function body to rewrite statepoints in");
- assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision");
-
- auto NeedsRewrite = [&TLI](Instruction &I) {
+#ifndef NDEBUG
+ // Do some basic sanity checks on our liveness results before performing
+ // relocation. Relocation can and will turn mistakes in liveness results
+ // into nonsensical code which is much harder to debug.
+ // TODO: It would be nice to test consistency as well
+ assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
+ "statepoint must be reachable or liveness is meaningless");
+ for (Value *V : Info.StatepointToken->gc_args()) {
+ if (!isa<Instruction>(V))
+ // Non-instruction values trivially dominate all possible uses
+ continue;
+ auto *LiveInst = cast<Instruction>(V);
+ assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
+ "unreachable values should never be live");
+ assert(DT.dominates(LiveInst, Info.StatepointToken) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+#endif
+ }
+ unique_unsorted(Live);
+
+#ifndef NDEBUG
+ // sanity check
+ for (auto *Ptr : Live)
+ assert(isHandledGCPointerType(Ptr->getType()) &&
+ "must be a gc pointer type");
+#endif
+
+ relocationViaAlloca(F, DT, Live, Records);
+ return !Records.empty();
+}
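
The unique_unsorted call above deduplicates the Live vector while keeping the original order of the entries. A minimal standalone sketch of that pattern (an illustrative helper, not the one used by the pass):

#include <unordered_set>
#include <vector>

// Drop duplicate elements from Vec, keeping the first occurrence of each and
// preserving the original relative order.
template <typename T>
void uniqueUnsorted(std::vector<T> &Vec) {
  std::unordered_set<T> Seen;
  std::vector<T> Result;
  Result.reserve(Vec.size());
  for (const T &V : Vec)
    if (Seen.insert(V).second) // insert() reports whether V was new
      Result.push_back(V);
  Vec = std::move(Result);
}
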
+
+// Handles both return values and arguments for Functions and calls.
+template <typename AttrHolder>
+static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
+ unsigned Index) {
+ AttrBuilder R;
+ if (AH.getDereferenceableBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
+ AH.getDereferenceableBytes(Index)));
+ if (AH.getDereferenceableOrNullBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
+ AH.getDereferenceableOrNullBytes(Index)));
+ if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
+ R.addAttribute(Attribute::NoAlias);
+
+ if (!R.empty())
+ AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
+}
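
The helper above first gathers everything to drop into an AttrBuilder and rewrites the attribute list once, only if something was collected. A simplified standalone sketch of the same build-then-apply pattern over a plain bitmask (the Attr enum and AttrSet alias are illustrative, not LLVM's attribute API):

#include <cstdint>

// Illustrative attribute kinds, mirroring the three the pass strips.
enum Attr : uint32_t {
  Dereferenceable       = 1u << 0,
  DereferenceableOrNull = 1u << 1,
  NoAlias               = 1u << 2,
};

using AttrSet = uint32_t; // bitmask of Attr values

// Drop the GC-invalid attributes from AS, rewriting it only if needed.
inline void removeNonValidAttrs(AttrSet &AS) {
  AttrSet ToRemove = 0;
  if (AS & Dereferenceable)
    ToRemove |= Dereferenceable;
  if (AS & DereferenceableOrNull)
    ToRemove |= DereferenceableOrNull;
  if (AS & NoAlias)
    ToRemove |= NoAlias;

  if (ToRemove) // mirrors the "if (!R.empty())" guard above
    AS &= ~ToRemove;
}
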
+
+static void stripNonValidAttributesFromPrototype(Function &F) {
+ LLVMContext &Ctx = F.getContext();
+
+ for (Argument &A : F.args())
+ if (isa<PointerType>(A.getType()))
+ RemoveNonValidAttrAtIndex(Ctx, F,
+ A.getArgNo() + AttributeList::FirstArgIndex);
+
+ if (isa<PointerType>(F.getReturnType()))
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
+}
+
+/// Certain metadata on instructions are invalid after running RS4GC.
+/// Optimizations that run after RS4GC can incorrectly use this metadata to
+/// optimize functions. We drop such metadata on the instruction.
+static void stripInvalidMetadataFromInstruction(Instruction &I) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return;
+ // These are the metadata kinds that are still valid on loads and stores after
+ // RS4GC.
+ // The metadata implying dereferenceability and noalias are (conservatively)
+ // dropped. This is because semantically, after RewriteStatepointsForGC runs,
+ // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
+ // touch the entire heap including noalias objects. Note: The reasoning is
+ // the same as for stripping the dereferenceability and noalias attributes
+ // that are analogous to the metadata counterparts.
+ // We also drop the invariant.load metadata on the load because that metadata
+ // implies the address operand to the load points to memory that is never
+ // changed once it became dereferenceable. This is no longer true after RS4GC.
+ // Similar reasoning applies to invariant.group metadata, which applies to
+ // loads within a group.
+ unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_nontemporal,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_type};
+
+ // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
+}
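
The function above works from a small keep-list of metadata kinds and drops everything else. A standalone sketch of that keep-list filtering over an ordinary map keyed by metadata kind ID (the MetadataMap alias is illustrative only):

#include <algorithm>
#include <array>
#include <map>
#include <string>

using MetadataMap = std::map<unsigned, std::string>; // kind ID -> payload

// Erase every metadata entry whose kind ID is not in the keep list.
inline void dropUnknownMetadata(MetadataMap &MD,
                                const std::array<unsigned, 7> &Keep) {
  for (auto It = MD.begin(); It != MD.end();) {
    bool Kept = std::find(Keep.begin(), Keep.end(), It->first) != Keep.end();
    It = Kept ? std::next(It) : MD.erase(It); // erase() returns the next entry
  }
}
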
+
+static void stripNonValidDataFromBody(Function &F) {
+ if (F.empty())
+ return;
+
+ LLVMContext &Ctx = F.getContext();
+ MDBuilder Builder(Ctx);
+
+ // Set of invariant.start instructions that we need to remove.
+ // Use this to avoid invalidating the instruction iterator.
+ SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
+
+ for (Instruction &I : instructions(F)) {
+ // invariant.start on memory location implies that the referenced memory
+ // location is constant and unchanging. This is no longer true after
+ // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+ // which frees the entire heap and the presence of invariant.start allows
+ // the optimizer to sink the load of a memory location past a statepoint,
+ // which is incorrect.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ InvariantStartInstructions.push_back(II);
+ continue;
+ }
+
+ if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) {
+ MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag);
+ I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
+ }
+
+ stripInvalidMetadataFromInstruction(I);
+
+ if (auto *Call = dyn_cast<CallBase>(&I)) {
+ for (int i = 0, e = Call->arg_size(); i != e; i++)
+ if (isa<PointerType>(Call->getArgOperand(i)->getType()))
+ RemoveNonValidAttrAtIndex(Ctx, *Call,
+ i + AttributeList::FirstArgIndex);
+ if (isa<PointerType>(Call->getType()))
+ RemoveNonValidAttrAtIndex(Ctx, *Call, AttributeList::ReturnIndex);
+ }
+ }
+
+ // Delete the invariant.start instructions and RAUW undef.
+ for (auto *II : InvariantStartInstructions) {
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->eraseFromParent();
+ }
+}
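
Note that the loop above only records the invariant.start calls and erases them after the walk over the instructions has finished, so the traversal never deletes the element it is standing on. A minimal standalone sketch of that collect-then-erase pattern:

#include <list>
#include <vector>

// Remove every negative value from L. As in the pass above, the elements to
// delete are collected during the walk and erased in a second pass.
inline void eraseNegatives(std::list<int> &L) {
  std::vector<std::list<int>::iterator> ToErase;
  for (auto It = L.begin(); It != L.end(); ++It)
    if (*It < 0)
      ToErase.push_back(It);

  for (auto It : ToErase)
    L.erase(It); // std::list::erase invalidates only the erased node
}
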
+
+/// Returns true if this function should be rewritten by this pass. The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
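
The check above keys purely off the function's GC strategy name, currently "statepoint-example" and "coreclr". A hypothetical standalone helper with the same shape, written over an explicit allow-list so additional strategy names are easy to add:

#include <string>
#include <unordered_set>

// Returns true if the named GC strategy is one this rewrite understands.
inline bool usesSupportedGC(const std::string &GCName) {
  static const std::unordered_set<std::string> Supported = {
      "statepoint-example", "coreclr"};
  return Supported.count(GCName) != 0;
}
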
+
+static void stripNonValidData(Module &M) {
+#ifndef NDEBUG
+ assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
+#endif
+
+ for (Function &F : M)
+ stripNonValidAttributesFromPrototype(F);
+
+ for (Function &F : M)
+ stripNonValidDataFromBody(F);
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ const TargetLibraryInfo &TLI) {
+ assert(!F.isDeclaration() && !F.empty() &&
+ "need function body to rewrite statepoints in");
+ assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision");
+
+ auto NeedsRewrite = [&TLI](Instruction &I) {
if (const auto *Call = dyn_cast<CallBase>(&I)) {
if (isa<GCStatepointInst>(Call))
return false;
@@ -2696,322 +2696,322 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
}
return true;
}
- return false;
- };
-
- // Delete any unreachable statepoints so that we don't have unrewritten
- // statepoints surviving this pass. This makes testing easier and the
- // resulting IR less confusing to human readers.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- bool MadeChange = removeUnreachableBlocks(F, &DTU);
- // Flush the Dominator Tree.
- DTU.getDomTree();
-
- // Gather all the statepoints which need to be rewritten. Be careful to only
- // consider those in reachable code since we need to ask dominance queries
- // when rewriting. We'll delete the unreachable ones in a moment.
- SmallVector<CallBase *, 64> ParsePointNeeded;
- for (Instruction &I : instructions(F)) {
- // TODO: only the ones with the flag set!
- if (NeedsRewrite(I)) {
- // NOTE removeUnreachableBlocks() is stronger than
- // DominatorTree::isReachableFromEntry(). In other words
- // removeUnreachableBlocks can remove some blocks for which
- // isReachableFromEntry() returns true.
- assert(DT.isReachableFromEntry(I.getParent()) &&
- "no unreachable blocks expected");
- ParsePointNeeded.push_back(cast<CallBase>(&I));
- }
- }
-
- // Return early if no work to do.
- if (ParsePointNeeded.empty())
- return MadeChange;
-
- // As a prepass, go ahead and aggressively destroy single entry phi nodes.
- // These are created by LCSSA. They have the effect of increasing the size
- // of liveness sets for no good reason. It may be harder to do this post
- // insertion since relocations and base phis can confuse things.
- for (BasicBlock &BB : F)
+ return false;
+ };
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ bool MadeChange = removeUnreachableBlocks(F, &DTU);
+ // Flush the Dominator Tree.
+ DTU.getDomTree();
+
+ // Gather all the statepoints which need to be rewritten. Be careful to only
+ // consider those in reachable code since we need to ask dominance queries
+ // when rewriting. We'll delete the unreachable ones in a moment.
+ SmallVector<CallBase *, 64> ParsePointNeeded;
+ for (Instruction &I : instructions(F)) {
+ // TODO: only the ones with the flag set!
+ if (NeedsRewrite(I)) {
+ // NOTE removeUnreachableBlocks() is stronger than
+ // DominatorTree::isReachableFromEntry(). In other words
+ // removeUnreachableBlocks can remove some blocks for which
+ // isReachableFromEntry() returns true.
+ assert(DT.isReachableFromEntry(I.getParent()) &&
+ "no unreachable blocks expected");
+ ParsePointNeeded.push_back(cast<CallBase>(&I));
+ }
+ }
+
+ // Return early if no work to do.
+ if (ParsePointNeeded.empty())
+ return MadeChange;
+
+ // As a prepass, go ahead and aggressively destroy single entry phi nodes.
+ // These are created by LCSSA. They have the effect of increasing the size
+ // of liveness sets for no good reason. It may be harder to do this post
+ // insertion since relocations and base phis can confuse things.
+ for (BasicBlock &BB : F)
if (BB.getUniquePredecessor())
MadeChange |= FoldSingleEntryPHINodes(&BB);
-
- // Before we start introducing relocations, we want to tweak the IR a bit to
- // avoid unfortunate code generation effects. The main example is that we
- // want to try to make sure the comparison feeding a branch is after any
- // safepoints. Otherwise, we end up with a comparison of pre-relocation
- // values feeding a branch after relocation. This is semantically correct,
- // but results in extra register pressure since both the pre-relocation and
- // post-relocation copies must be available in registers. For code without
- // relocations this is handled elsewhere, but teaching the scheduler to
- // reverse the transform we're about to do would be slightly complex.
- // Note: This may extend the live range of the inputs to the icmp and thus
- // increase the liveset of any statepoint we move over. This is profitable
- // as long as all statepoints are in rare blocks. If we had in-register
- // lowering for live values this would be a much safer transform.
- auto getConditionInst = [](Instruction *TI) -> Instruction * {
- if (auto *BI = dyn_cast<BranchInst>(TI))
- if (BI->isConditional())
- return dyn_cast<Instruction>(BI->getCondition());
- // TODO: Extend this to handle switches
- return nullptr;
- };
- for (BasicBlock &BB : F) {
- Instruction *TI = BB.getTerminator();
- if (auto *Cond = getConditionInst(TI))
- // TODO: Handle more than just ICmps here. We should be able to move
- // most instructions without side effects or memory access.
- if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
- MadeChange = true;
- Cond->moveBefore(TI);
- }
- }
-
- // Nasty workaround - The base computation code in the main algorithm doesn't
- // consider the fact that a GEP can be used to convert a scalar to a vector.
- // The right fix for this is to integrate GEPs into the base rewriting
- // algorithm properly; this is just a short-term workaround to prevent
- // crashes by canonicalizing such GEPs into fully vector GEPs.
- for (Instruction &I : instructions(F)) {
- if (!isa<GetElementPtrInst>(I))
- continue;
-
- unsigned VF = 0;
- for (unsigned i = 0; i < I.getNumOperands(); i++)
- if (auto *OpndVTy = dyn_cast<VectorType>(I.getOperand(i)->getType())) {
- assert(VF == 0 ||
- VF == cast<FixedVectorType>(OpndVTy)->getNumElements());
- VF = cast<FixedVectorType>(OpndVTy)->getNumElements();
- }
-
- // It's the vector to scalar traversal through the pointer operand which
- // confuses base pointer rewriting, so limit ourselves to that case.
- if (!I.getOperand(0)->getType()->isVectorTy() && VF != 0) {
- IRBuilder<> B(&I);
- auto *Splat = B.CreateVectorSplat(VF, I.getOperand(0));
- I.setOperand(0, Splat);
- MadeChange = true;
- }
- }
-
- MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
- return MadeChange;
-}
-
-// liveness computation via standard dataflow
-// -------------------------------------------------------------------
-
-// TODO: Consider using bitvectors for liveness, the set of potentially
-// interesting values should be small and easy to pre-compute.
-
-/// Compute the live-in set for the location rbegin starting from
-/// the live-out set of the basic block
-static void computeLiveInValues(BasicBlock::reverse_iterator Begin,
- BasicBlock::reverse_iterator End,
- SetVector<Value *> &LiveTmp) {
- for (auto &I : make_range(Begin, End)) {
- // KILL/Def - Remove this definition from LiveIn
- LiveTmp.remove(&I);
-
- // Don't consider *uses* in PHI nodes; we handle their contribution to
- // predecessor blocks when we seed the LiveOut sets
- if (isa<PHINode>(I))
- continue;
-
- // USE - Add to the LiveIn set for this instruction
- for (Value *V : I.operands()) {
- assert(!isUnhandledGCPointerType(V->getType()) &&
- "support for FCA unimplemented");
- if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
- // The choice to exclude all things constant here is slightly subtle.
- // There are two independent reasons:
- // - We assume that things which are constant (from LLVM's definition)
- // do not move at runtime. For example, the address of a global
- // variable is fixed, even though its contents may not be.
- // - Second, we can't disallow arbitrary inttoptr constants even
- // if the language frontend does. Optimization passes are free to
- // locally exploit facts without respect to global reachability. This
- // can create sections of code which are dynamically unreachable and
- // contain just about anything. (see constants.ll in tests)
- LiveTmp.insert(V);
- }
- }
- }
-}
-
-static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) {
- for (BasicBlock *Succ : successors(BB)) {
- for (auto &I : *Succ) {
- PHINode *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- Value *V = PN->getIncomingValueForBlock(BB);
- assert(!isUnhandledGCPointerType(V->getType()) &&
- "support for FCA unimplemented");
- if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V))
- LiveTmp.insert(V);
- }
- }
-}
-
-static SetVector<Value *> computeKillSet(BasicBlock *BB) {
- SetVector<Value *> KillSet;
- for (Instruction &I : *BB)
- if (isHandledGCPointerType(I.getType()))
- KillSet.insert(&I);
- return KillSet;
-}
-
-#ifndef NDEBUG
-/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
-/// sanity check for the liveness computation.
-static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
- Instruction *TI, bool TermOkay = false) {
- for (Value *V : Live) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- // The terminator can be a member of the LiveOut set. LLVM's definition
- // of instruction dominance states that V does not dominate itself. As
- // such, we need to special case this to allow it.
- if (TermOkay && TI == I)
- continue;
- assert(DT.dominates(I, TI) &&
- "basic SSA liveness expectation violated by liveness analysis");
- }
- }
-}
-
-/// Check that all the liveness sets used during the computation of liveness
-/// obey basic SSA properties. This is useful for finding cases where we miss
-/// a def.
-static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
- BasicBlock &BB) {
- checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
- checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
- checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
-}
-#endif
-
-static void computeLiveInValues(DominatorTree &DT, Function &F,
- GCPtrLivenessData &Data) {
- SmallSetVector<BasicBlock *, 32> Worklist;
-
- // Seed the liveness for each individual block
- for (BasicBlock &BB : F) {
- Data.KillSet[&BB] = computeKillSet(&BB);
- Data.LiveSet[&BB].clear();
- computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
-
-#ifndef NDEBUG
- for (Value *Kill : Data.KillSet[&BB])
- assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
-#endif
-
- Data.LiveOut[&BB] = SetVector<Value *>();
- computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
- Data.LiveIn[&BB] = Data.LiveSet[&BB];
- Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
- Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
- if (!Data.LiveIn[&BB].empty())
- Worklist.insert(pred_begin(&BB), pred_end(&BB));
- }
-
- // Propagate that liveness until stable
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.pop_back_val();
-
- // Compute our new liveout set, then exit early if it hasn't changed despite
- // the contribution of our successor.
- SetVector<Value *> LiveOut = Data.LiveOut[BB];
- const auto OldLiveOutSize = LiveOut.size();
- for (BasicBlock *Succ : successors(BB)) {
- assert(Data.LiveIn.count(Succ));
- LiveOut.set_union(Data.LiveIn[Succ]);
- }
- // assert OldLiveOut is a subset of LiveOut
- if (OldLiveOutSize == LiveOut.size()) {
- // If the sets are the same size, then we didn't actually add anything
- // when unioning our successors LiveIn. Thus, the LiveIn of this block
- // hasn't changed.
- continue;
- }
- Data.LiveOut[BB] = LiveOut;
-
- // Apply the effects of this basic block
- SetVector<Value *> LiveTmp = LiveOut;
- LiveTmp.set_union(Data.LiveSet[BB]);
- LiveTmp.set_subtract(Data.KillSet[BB]);
-
- assert(Data.LiveIn.count(BB));
- const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB];
- // assert: OldLiveIn is a subset of LiveTmp
- if (OldLiveIn.size() != LiveTmp.size()) {
- Data.LiveIn[BB] = LiveTmp;
- Worklist.insert(pred_begin(BB), pred_end(BB));
- }
- } // while (!Worklist.empty())
-
-#ifndef NDEBUG
- // Sanity check our output against SSA properties. This helps catch any
- // missing kills during the above iteration.
- for (BasicBlock &BB : F)
- checkBasicSSA(DT, Data, BB);
-#endif
-}
-
-static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
- StatepointLiveSetTy &Out) {
- BasicBlock *BB = Inst->getParent();
-
- // Note: The copy is intentional and required
- assert(Data.LiveOut.count(BB));
- SetVector<Value *> LiveOut = Data.LiveOut[BB];
-
- // We want to handle the statepoint itself oddly. Its
- // call result is not live (normal), nor are its arguments
- // (unless they're used again later). This adjustment is
- // specifically what we need to relocate
- computeLiveInValues(BB->rbegin(), ++Inst->getIterator().getReverse(),
- LiveOut);
- LiveOut.remove(Inst);
- Out.insert(LiveOut.begin(), LiveOut.end());
-}
-
-static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
- CallBase *Call,
- PartiallyConstructedSafepointRecord &Info) {
- StatepointLiveSetTy Updated;
- findLiveSetAtInst(Call, RevisedLivenessData, Updated);
-
- // We may have base pointers which are now live that weren't before. We need
- // to update the PointerToBase structure to reflect this.
- for (auto V : Updated)
- if (Info.PointerToBase.insert({V, V}).second) {
- assert(isKnownBaseResult(V) &&
- "Can't find base for unexpected live value!");
- continue;
- }
-
-#ifndef NDEBUG
- for (auto V : Updated)
- assert(Info.PointerToBase.count(V) &&
- "Must be able to find base for live value!");
-#endif
-
- // Remove any stale base mappings - this can happen since our liveness is
- // more precise than the one inherent in the base pointer analysis.
- DenseSet<Value *> ToErase;
- for (auto KVPair : Info.PointerToBase)
- if (!Updated.count(KVPair.first))
- ToErase.insert(KVPair.first);
-
- for (auto *V : ToErase)
- Info.PointerToBase.erase(V);
-
-#ifndef NDEBUG
- for (auto KVPair : Info.PointerToBase)
- assert(Updated.count(KVPair.first) && "record for non-live value");
-#endif
-
- Info.LiveSet = Updated;
-}
+
+ // Before we start introducing relocations, we want to tweak the IR a bit to
+ // avoid unfortunate code generation effects. The main example is that we
+ // want to try to make sure the comparison feeding a branch is after any
+ // safepoints. Otherwise, we end up with a comparison of pre-relocation
+ // values feeding a branch after relocation. This is semantically correct,
+ // but results in extra register pressure since both the pre-relocation and
+ // post-relocation copies must be available in registers. For code without
+ // relocations this is handled elsewhere, but teaching the scheduler to
+ // reverse the transform we're about to do would be slightly complex.
+ // Note: This may extend the live range of the inputs to the icmp and thus
+ // increase the liveset of any statepoint we move over. This is profitable
+ // as long as all statepoints are in rare blocks. If we had in-register
+ // lowering for live values this would be a much safer transform.
+ auto getConditionInst = [](Instruction *TI) -> Instruction * {
+ if (auto *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional())
+ return dyn_cast<Instruction>(BI->getCondition());
+ // TODO: Extend this to handle switches
+ return nullptr;
+ };
+ for (BasicBlock &BB : F) {
+ Instruction *TI = BB.getTerminator();
+ if (auto *Cond = getConditionInst(TI))
+ // TODO: Handle more than just ICmps here. We should be able to move
+ // most instructions without side effects or memory access.
+ if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
+ MadeChange = true;
+ Cond->moveBefore(TI);
+ }
+ }
+
+ // Nasty workaround - The base computation code in the main algorithm doesn't
+ // consider the fact that a GEP can be used to convert a scalar to a vector.
+ // The right fix for this is to integrate GEPs into the base rewriting
+ // algorithm properly; this is just a short-term workaround to prevent
+ // crashes by canonicalizing such GEPs into fully vector GEPs.
+ for (Instruction &I : instructions(F)) {
+ if (!isa<GetElementPtrInst>(I))
+ continue;
+
+ unsigned VF = 0;
+ for (unsigned i = 0; i < I.getNumOperands(); i++)
+ if (auto *OpndVTy = dyn_cast<VectorType>(I.getOperand(i)->getType())) {
+ assert(VF == 0 ||
+ VF == cast<FixedVectorType>(OpndVTy)->getNumElements());
+ VF = cast<FixedVectorType>(OpndVTy)->getNumElements();
+ }
+
+ // It's the vector to scalar traversal through the pointer operand which
+ // confuses base pointer rewriting, so limit ourselves to that case.
+ if (!I.getOperand(0)->getType()->isVectorTy() && VF != 0) {
+ IRBuilder<> B(&I);
+ auto *Splat = B.CreateVectorSplat(VF, I.getOperand(0));
+ I.setOperand(0, Splat);
+ MadeChange = true;
+ }
+ }
+
+ MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
+ return MadeChange;
+}
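
The GEP canonicalization above splats a scalar pointer operand out to the vector factor VF so that base pointer rewriting only ever sees fully vector GEPs. A toy standalone illustration of what a splat produces, over plain values instead of IR:

#include <vector>

// Replicate one scalar VF times, mirroring what CreateVectorSplat does for
// the scalar operand in the canonicalization above.
template <typename T>
std::vector<T> splat(const T &Scalar, unsigned VF) {
  return std::vector<T>(VF, Scalar);
}

// Example: splat(42, 4) yields {42, 42, 42, 42}.
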
+
+// liveness computation via standard dataflow
+// -------------------------------------------------------------------
+
+// TODO: Consider using bitvectors for liveness, the set of potentially
+// interesting values should be small and easy to pre-compute.
+
+/// Compute the live-in set for the location rbegin starting from
+/// the live-out set of the basic block
+static void computeLiveInValues(BasicBlock::reverse_iterator Begin,
+ BasicBlock::reverse_iterator End,
+ SetVector<Value *> &LiveTmp) {
+ for (auto &I : make_range(Begin, End)) {
+ // KILL/Def - Remove this definition from LiveIn
+ LiveTmp.remove(&I);
+
+ // Don't consider *uses* in PHI nodes; we handle their contribution to
+ // predecessor blocks when we seed the LiveOut sets
+ if (isa<PHINode>(I))
+ continue;
+
+ // USE - Add to the LiveIn set for this instruction
+ for (Value *V : I.operands()) {
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+ // The choice to exclude all things constant here is slightly subtle.
+ // There are two independent reasons:
+ // - We assume that things which are constant (from LLVM's definition)
+ // do not move at runtime. For example, the address of a global
+ // variable is fixed, even though its contents may not be.
+ // - Second, we can't disallow arbitrary inttoptr constants even
+ // if the language frontend does. Optimization passes are free to
+ // locally exploit facts without respect to global reachability. This
+ // can create sections of code which are dynamically unreachable and
+ // contain just about anything. (see constants.ll in tests)
+ LiveTmp.insert(V);
+ }
+ }
+ }
+}
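
The routine above walks a block backwards, first killing the value defined by each instruction and then adding the tracked values it uses. A standalone sketch of that kill-then-use backward scan over a toy instruction record (the Inst struct is illustrative, not LLVM IR):

#include <set>
#include <string>
#include <vector>

// A toy instruction: one optional definition plus the values it reads.
struct Inst {
  std::string Def;               // value defined here ("" if none)
  std::vector<std::string> Uses; // values read here
};

// Given the live-out set of a block, compute its live-in set by scanning the
// instructions in reverse: kill the definition, then add the uses.
std::set<std::string> liveIn(const std::vector<Inst> &Block,
                             std::set<std::string> Live /* live-out */) {
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (!It->Def.empty())
      Live.erase(It->Def);       // KILL/Def
    for (const std::string &U : It->Uses)
      Live.insert(U);            // USE
  }
  return Live;                   // live-in of the block
}
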
+
+static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) {
+ for (BasicBlock *Succ : successors(BB)) {
+ for (auto &I : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *V = PN->getIncomingValueForBlock(BB);
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V))
+ LiveTmp.insert(V);
+ }
+ }
+}
+
+static SetVector<Value *> computeKillSet(BasicBlock *BB) {
+ SetVector<Value *> KillSet;
+ for (Instruction &I : *BB)
+ if (isHandledGCPointerType(I.getType()))
+ KillSet.insert(&I);
+ return KillSet;
+}
+
+#ifndef NDEBUG
+/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
+/// sanity check for the liveness computation.
+static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
+ Instruction *TI, bool TermOkay = false) {
+ for (Value *V : Live) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // The terminator can be a member of the LiveOut set. LLVM's definition
+ // of instruction dominance states that V does not dominate itself. As
+ // such, we need to special case this to allow it.
+ if (TermOkay && TI == I)
+ continue;
+ assert(DT.dominates(I, TI) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+ }
+}
+
+/// Check that all the liveness sets used during the computation of liveness
+/// obey basic SSA properties. This is useful for finding cases where we miss
+/// a def.
+static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
+ BasicBlock &BB) {
+ checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
+ checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
+ checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
+}
+#endif
+
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data) {
+ SmallSetVector<BasicBlock *, 32> Worklist;
+
+ // Seed the liveness for each individual block
+ for (BasicBlock &BB : F) {
+ Data.KillSet[&BB] = computeKillSet(&BB);
+ Data.LiveSet[&BB].clear();
+ computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
+
+#ifndef NDEBUG
+ for (Value *Kill : Data.KillSet[&BB])
+ assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
+#endif
+
+ Data.LiveOut[&BB] = SetVector<Value *>();
+ computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
+ Data.LiveIn[&BB] = Data.LiveSet[&BB];
+ Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
+ Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
+ if (!Data.LiveIn[&BB].empty())
+ Worklist.insert(pred_begin(&BB), pred_end(&BB));
+ }
+
+ // Propagate that liveness until stable
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+
+ // Compute our new liveout set, then exit early if it hasn't changed despite
+ // the contribution of our successor.
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+ const auto OldLiveOutSize = LiveOut.size();
+ for (BasicBlock *Succ : successors(BB)) {
+ assert(Data.LiveIn.count(Succ));
+ LiveOut.set_union(Data.LiveIn[Succ]);
+ }
+ // assert OldLiveOut is a subset of LiveOut
+ if (OldLiveOutSize == LiveOut.size()) {
+ // If the sets are the same size, then we didn't actually add anything
+ // when unioning our successors LiveIn. Thus, the LiveIn of this block
+ // hasn't changed.
+ continue;
+ }
+ Data.LiveOut[BB] = LiveOut;
+
+ // Apply the effects of this basic block
+ SetVector<Value *> LiveTmp = LiveOut;
+ LiveTmp.set_union(Data.LiveSet[BB]);
+ LiveTmp.set_subtract(Data.KillSet[BB]);
+
+ assert(Data.LiveIn.count(BB));
+ const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB];
+ // assert: OldLiveIn is a subset of LiveTmp
+ if (OldLiveIn.size() != LiveTmp.size()) {
+ Data.LiveIn[BB] = LiveTmp;
+ Worklist.insert(pred_begin(BB), pred_end(BB));
+ }
+ } // while (!Worklist.empty())
+
+#ifndef NDEBUG
+ // Sanity check our output against SSA properties. This helps catch any
+ // missing kills during the above iteration.
+ for (BasicBlock &BB : F)
+ checkBasicSSA(DT, Data, BB);
+#endif
+}
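
The propagation loop above is a standard backward dataflow fixed point: LiveOut of a block is the union of its successors' LiveIn, LiveIn is the block's own upward-exposed uses plus everything in LiveOut that the block does not define, and a block whose LiveIn grows pushes its predecessors back onto the worklist. A compact standalone sketch of the same iteration over a toy CFG keyed by integer block IDs (assuming every block has an entry in each of the maps):

#include <map>
#include <set>
#include <string>
#include <vector>

using Block = int;
using ValueSet = std::set<std::string>;

struct CFG {
  std::map<Block, std::vector<Block>> Succs, Preds;
  std::map<Block, ValueSet> Gen, Kill; // per-block use/def summaries
};

// Iterate LiveIn/LiveOut to a fixed point with a simple worklist.
void computeLiveness(const CFG &G, std::map<Block, ValueSet> &LiveIn,
                     std::map<Block, ValueSet> &LiveOut) {
  std::vector<Block> Worklist;
  for (const auto &KV : G.Succs)
    Worklist.push_back(KV.first);

  while (!Worklist.empty()) {
    Block B = Worklist.back();
    Worklist.pop_back();

    ValueSet Out; // union of the successors' live-in sets
    for (Block S : G.Succs.at(B))
      Out.insert(LiveIn[S].begin(), LiveIn[S].end());

    ValueSet In = Out;                 // apply this block's effects:
    for (const std::string &V : G.Kill.at(B))
      In.erase(V);                     // remove values the block defines
    In.insert(G.Gen.at(B).begin(), G.Gen.at(B).end()); // add upward-exposed uses

    LiveOut[B] = Out;
    if (In != LiveIn[B]) {             // grew: revisit the predecessors
      LiveIn[B] = std::move(In);
      for (Block P : G.Preds.at(B))
        Worklist.push_back(P);
    }
  }
}
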
+
+static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &Out) {
+ BasicBlock *BB = Inst->getParent();
+
+ // Note: The copy is intentional and required
+ assert(Data.LiveOut.count(BB));
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+
+ // We want to handle the statepoint itself oddly. Its
+ // call result is not live (normal), nor are its arguments
+ // (unless they're used again later). This adjustment is
+ // specifically what we need to relocate
+ computeLiveInValues(BB->rbegin(), ++Inst->getIterator().getReverse(),
+ LiveOut);
+ LiveOut.remove(Inst);
+ Out.insert(LiveOut.begin(), LiveOut.end());
+}
+
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallBase *Call,
+ PartiallyConstructedSafepointRecord &Info) {
+ StatepointLiveSetTy Updated;
+ findLiveSetAtInst(Call, RevisedLivenessData, Updated);
+
+ // We may have base pointers which are now live that weren't before. We need
+ // to update the PointerToBase structure to reflect this.
+ for (auto V : Updated)
+ if (Info.PointerToBase.insert({V, V}).second) {
+ assert(isKnownBaseResult(V) &&
+ "Can't find base for unexpected live value!");
+ continue;
+ }
+
+#ifndef NDEBUG
+ for (auto V : Updated)
+ assert(Info.PointerToBase.count(V) &&
+ "Must be able to find base for live value!");
+#endif
+
+ // Remove any stale base mappings - this can happen since our liveness is
+ // more precise than the one inherent in the base pointer analysis.
+ DenseSet<Value *> ToErase;
+ for (auto KVPair : Info.PointerToBase)
+ if (!Updated.count(KVPair.first))
+ ToErase.insert(KVPair.first);
+
+ for (auto *V : ToErase)
+ Info.PointerToBase.erase(V);
+
+#ifndef NDEBUG
+ for (auto KVPair : Info.PointerToBase)
+ assert(Updated.count(KVPair.first) && "record for non-live value");
+#endif
+
+ Info.LiveSet = Updated;
+}
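
The cleanup above collects the stale PointerToBase keys into ToErase first and only then erases them, rather than mutating the map while walking it. A standalone sketch of pruning a map down to an allowed key set in the same two steps:

#include <map>
#include <set>
#include <string>
#include <vector>

// Erase every entry of M whose key is not in Keep. The stale keys are
// collected first so the map is never modified while being iterated.
inline void pruneToKeys(std::map<std::string, std::string> &M,
                        const std::set<std::string> &Keep) {
  std::vector<std::string> ToErase;
  for (const auto &KV : M)
    if (!Keep.count(KV.first))
      ToErase.push_back(KV.first);

  for (const std::string &K : ToErase)
    M.erase(K);
}
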
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp
index 8dba00e11b..8feed9e9eb 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp
@@ -1,665 +1,665 @@
-//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements sparse conditional constant propagation and merging:
-//
-// Specifically, this:
-// * Assumes values are constant unless proven otherwise
-// * Assumes BasicBlocks are dead unless proven otherwise
-// * Proves values to be constant, and replaces them with constants
-// * Proves conditional branches to be unconditional
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SCCP.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+// * Assumes values are constant unless proven otherwise
+// * Assumes BasicBlocks are dead unless proven otherwise
+// * Proves values to be constant, and replaces them with constants
+// * Proves conditional branches to be unconditional
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SCCP.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueLattice.h"
-#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueLattice.h"
+#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PredicateInfo.h"
-#include <cassert>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "sccp"
-
-STATISTIC(NumInstRemoved, "Number of instructions removed");
-STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
-STATISTIC(NumInstReplaced,
- "Number of instructions replaced with (simpler) instruction");
-
-STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
-STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
-STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
-STATISTIC(
- IPNumInstReplaced,
- "Number of instructions replaced with (simpler) instruction by IPSCCP");
-
-// The maximum number of range extensions allowed for operations requiring
-// widening.
-static const unsigned MaxNumRangeExtensions = 10;
-
-/// Returns MergeOptions with MaxWidenSteps set to MaxNumRangeExtensions.
-static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
- return ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- MaxNumRangeExtensions);
-}
-namespace {
-
-// Helper to check if \p LV is either a constant or a constant
-// range with a single element. This should cover exactly the same cases as the
-// old ValueLatticeElement::isConstant() and is intended to be used in the
-// transition to ValueLatticeElement.
-bool isConstant(const ValueLatticeElement &LV) {
- return LV.isConstant() ||
- (LV.isConstantRange() && LV.getConstantRange().isSingleElement());
-}
-
-// Helper to check if \p LV is either overdefined or a constant range with more
-// than a single element. This should cover exactly the same cases as the old
-// ValueLatticeElement::isOverdefined() and is intended to be used in the
-// transition to ValueLatticeElement.
-bool isOverdefined(const ValueLatticeElement &LV) {
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sccp"
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+STATISTIC(NumInstReplaced,
+ "Number of instructions replaced with (simpler) instruction");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+STATISTIC(
+ IPNumInstReplaced,
+ "Number of instructions replaced with (simpler) instruction by IPSCCP");
+
+// The maximum number of range extensions allowed for operations requiring
+// widening.
+static const unsigned MaxNumRangeExtensions = 10;
+
+/// Returns MergeOptions with MaxWidenSteps set to MaxNumRangeExtensions.
+static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
+ return ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ MaxNumRangeExtensions);
+}
+namespace {
+
+// Helper to check if \p LV is either a constant or a constant
+// range with a single element. This should cover exactly the same cases as the
+// old ValueLatticeElement::isConstant() and is intended to be used in the
+// transition to ValueLatticeElement.
+bool isConstant(const ValueLatticeElement &LV) {
+ return LV.isConstant() ||
+ (LV.isConstantRange() && LV.getConstantRange().isSingleElement());
+}
+
+// Helper to check if \p LV is either overdefined or a constant range with more
+// than a single element. This should cover exactly the same cases as the old
+// ValueLatticeElement::isOverdefined() and is intended to be used in the
+// transition to ValueLatticeElement.
+bool isOverdefined(const ValueLatticeElement &LV) {
return !LV.isUnknownOrUndef() && !isConstant(LV);
-}
-
-//===----------------------------------------------------------------------===//
-//
-/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
-/// Constant Propagation.
-///
-class SCCPSolver : public InstVisitor<SCCPSolver> {
- const DataLayout &DL;
- std::function<const TargetLibraryInfo &(Function &)> GetTLI;
- SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
- DenseMap<Value *, ValueLatticeElement>
- ValueState; // The state each value is in.
-
- /// StructValueState - This maintains ValueState for values that have
- /// StructType, for example for formal arguments, calls, insertelement, etc.
- DenseMap<std::pair<Value *, unsigned>, ValueLatticeElement> StructValueState;
-
- /// GlobalValue - If we are tracking any values for the contents of a global
- /// variable, we keep a mapping from the constant accessor to the element of
- /// the global, to the currently known value. If the value becomes
- /// overdefined, its entry is simply removed from this map.
- DenseMap<GlobalVariable *, ValueLatticeElement> TrackedGlobals;
-
- /// TrackedRetVals - If we are tracking arguments into and the return
- /// value out of a function, it will have an entry in this map, indicating
- /// what the known return value for the function is.
- MapVector<Function *, ValueLatticeElement> TrackedRetVals;
-
- /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
- /// that return multiple values.
- MapVector<std::pair<Function *, unsigned>, ValueLatticeElement>
- TrackedMultipleRetVals;
-
- /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
- /// represented here for efficient lookup.
- SmallPtrSet<Function *, 16> MRVFunctionsTracked;
-
- /// MustTailFunctions - Each function here is a callee of non-removable
- /// musttail call site.
- SmallPtrSet<Function *, 16> MustTailCallees;
-
- /// TrackingIncomingArguments - This is the set of functions for whose
- /// arguments we make optimistic assumptions about and try to prove as
- /// constants.
- SmallPtrSet<Function *, 16> TrackingIncomingArguments;
-
- /// The reason for two worklists is that overdefined is the lowest state
- /// on the lattice, and moving things to overdefined as fast as possible
- /// makes SCCP converge much faster.
- ///
- /// By having a separate worklist, we accomplish this because everything
- /// possibly overdefined will become overdefined at the soonest possible
- /// point.
- SmallVector<Value *, 64> OverdefinedInstWorkList;
- SmallVector<Value *, 64> InstWorkList;
-
- // The BasicBlock work list
- SmallVector<BasicBlock *, 64> BBWorkList;
-
- /// KnownFeasibleEdges - Entries in this set are edges which have already had
- /// PHI nodes retriggered.
- using Edge = std::pair<BasicBlock *, BasicBlock *>;
- DenseSet<Edge> KnownFeasibleEdges;
-
- DenseMap<Function *, AnalysisResultsForFn> AnalysisResults;
- DenseMap<Value *, SmallPtrSet<User *, 2>> AdditionalUsers;
-
- LLVMContext &Ctx;
-
-public:
- void addAnalysis(Function &F, AnalysisResultsForFn A) {
- AnalysisResults.insert({&F, std::move(A)});
- }
-
- const PredicateBase *getPredicateInfoFor(Instruction *I) {
- auto A = AnalysisResults.find(I->getParent()->getParent());
- if (A == AnalysisResults.end())
- return nullptr;
- return A->second.PredInfo->getPredicateInfoFor(I);
- }
-
- DomTreeUpdater getDTU(Function &F) {
- auto A = AnalysisResults.find(&F);
- assert(A != AnalysisResults.end() && "Need analysis results for function.");
- return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy};
- }
-
- SCCPSolver(const DataLayout &DL,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI,
- LLVMContext &Ctx)
- : DL(DL), GetTLI(std::move(GetTLI)), Ctx(Ctx) {}
-
- /// MarkBlockExecutable - This method can be used by clients to mark all of
- /// the blocks that are known to be intrinsically live in the processed unit.
- ///
- /// This returns true if the block was not considered live before.
- bool MarkBlockExecutable(BasicBlock *BB) {
- if (!BBExecutable.insert(BB).second)
- return false;
- LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
- BBWorkList.push_back(BB); // Add the block to the work list!
- return true;
- }
-
- /// TrackValueOfGlobalVariable - Clients can use this method to
- /// inform the SCCPSolver that it should track loads and stores to the
- /// specified global variable if it can. This is only legal to call if
- /// performing Interprocedural SCCP.
- void TrackValueOfGlobalVariable(GlobalVariable *GV) {
- // We only track the contents of scalar globals.
- if (GV->getValueType()->isSingleValueType()) {
- ValueLatticeElement &IV = TrackedGlobals[GV];
- if (!isa<UndefValue>(GV->getInitializer()))
- IV.markConstant(GV->getInitializer());
- }
- }
-
- /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
- /// and out of the specified function (which cannot have its address taken),
- /// this method must be called.
- void AddTrackedFunction(Function *F) {
- // Add an entry, F -> undef.
- if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
- MRVFunctionsTracked.insert(F);
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- TrackedMultipleRetVals.insert(
- std::make_pair(std::make_pair(F, i), ValueLatticeElement()));
+}
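
The two helpers above describe positions on the SCCP value lattice: a value is unknown, a constant (possibly a single-element range), or overdefined. A deliberately simplified standalone model of such a three-level lattice and its merge rule (a toy over plain ints, not ValueLatticeElement):

// A toy SCCP-style lattice over int constants:
//   Unknown  <  Constant(c)  <  Overdefined
struct ToyLattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  int C = 0; // meaningful only when K == Constant

  // Merge another element into this one; returns true if this element changed.
  bool mergeIn(const ToyLattice &Other) {
    if (Other.K == Unknown || K == Overdefined)
      return false;                 // nothing new to learn
    if (K == Unknown) {             // adopt Other wholesale
      *this = Other;
      return true;
    }
    // Here K == Constant and Other is Constant or Overdefined.
    if (Other.K == Constant && Other.C == C)
      return false;                 // agreeing constants stay constant
    K = Overdefined;                // disagreement falls to Overdefined
    return true;
  }
};
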
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+ const DataLayout &DL;
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI;
+ SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
+ DenseMap<Value *, ValueLatticeElement>
+ ValueState; // The state each value is in.
+
+ /// StructValueState - This maintains ValueState for values that have
+ /// StructType, for example for formal arguments, calls, insertelement, etc.
+ DenseMap<std::pair<Value *, unsigned>, ValueLatticeElement> StructValueState;
+
+ /// GlobalValue - If we are tracking any values for the contents of a global
+ /// variable, we keep a mapping from the constant accessor to the element of
+ /// the global, to the currently known value. If the value becomes
+ /// overdefined, its entry is simply removed from this map.
+ DenseMap<GlobalVariable *, ValueLatticeElement> TrackedGlobals;
+
+ /// TrackedRetVals - If we are tracking arguments into and the return
+ /// value out of a function, it will have an entry in this map, indicating
+ /// what the known return value for the function is.
+ MapVector<Function *, ValueLatticeElement> TrackedRetVals;
+
+ /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+ /// that return multiple values.
+ MapVector<std::pair<Function *, unsigned>, ValueLatticeElement>
+ TrackedMultipleRetVals;
+
+ /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
+ /// represented here for efficient lookup.
+ SmallPtrSet<Function *, 16> MRVFunctionsTracked;
+
+ /// MustTailFunctions - Each function here is a callee of non-removable
+ /// musttail call site.
+ SmallPtrSet<Function *, 16> MustTailCallees;
+
+ /// TrackingIncomingArguments - This is the set of functions for whose
+ /// arguments we make optimistic assumptions about and try to prove as
+ /// constants.
+ SmallPtrSet<Function *, 16> TrackingIncomingArguments;
+
+ /// The reason for two worklists is that overdefined is the lowest state
+ /// on the lattice, and moving things to overdefined as fast as possible
+ /// makes SCCP converge much faster.
+ ///
+ /// By having a separate worklist, we accomplish this because everything
+ /// possibly overdefined will become overdefined at the soonest possible
+ /// point.
+ SmallVector<Value *, 64> OverdefinedInstWorkList;
+ SmallVector<Value *, 64> InstWorkList;
+
+ // The BasicBlock work list
+ SmallVector<BasicBlock *, 64> BBWorkList;
+
+ /// KnownFeasibleEdges - Entries in this set are edges which have already had
+ /// PHI nodes retriggered.
+ using Edge = std::pair<BasicBlock *, BasicBlock *>;
+ DenseSet<Edge> KnownFeasibleEdges;
+
+ DenseMap<Function *, AnalysisResultsForFn> AnalysisResults;
+ DenseMap<Value *, SmallPtrSet<User *, 2>> AdditionalUsers;
+
+ LLVMContext &Ctx;
+
+public:
+ void addAnalysis(Function &F, AnalysisResultsForFn A) {
+ AnalysisResults.insert({&F, std::move(A)});
+ }
+
+ const PredicateBase *getPredicateInfoFor(Instruction *I) {
+ auto A = AnalysisResults.find(I->getParent()->getParent());
+ if (A == AnalysisResults.end())
+ return nullptr;
+ return A->second.PredInfo->getPredicateInfoFor(I);
+ }
+
+ DomTreeUpdater getDTU(Function &F) {
+ auto A = AnalysisResults.find(&F);
+ assert(A != AnalysisResults.end() && "Need analysis results for function.");
+ return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy};
+ }
+
+ SCCPSolver(const DataLayout &DL,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
+ LLVMContext &Ctx)
+ : DL(DL), GetTLI(std::move(GetTLI)), Ctx(Ctx) {}
+
+ /// MarkBlockExecutable - This method can be used by clients to mark all of
+ /// the blocks that are known to be intrinsically live in the processed unit.
+ ///
+ /// This returns true if the block was not considered live before.
+ bool MarkBlockExecutable(BasicBlock *BB) {
+ if (!BBExecutable.insert(BB).second)
+ return false;
+ LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+ BBWorkList.push_back(BB); // Add the block to the work list!
+ return true;
+ }
+
+ /// TrackValueOfGlobalVariable - Clients can use this method to
+ /// inform the SCCPSolver that it should track loads and stores to the
+ /// specified global variable if it can. This is only legal to call if
+ /// performing Interprocedural SCCP.
+ void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+ // We only track the contents of scalar globals.
+ if (GV->getValueType()->isSingleValueType()) {
+ ValueLatticeElement &IV = TrackedGlobals[GV];
+ if (!isa<UndefValue>(GV->getInitializer()))
+ IV.markConstant(GV->getInitializer());
+ }
+ }
+
+ /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+ /// and out of the specified function (which cannot have its address taken),
+ /// this method must be called.
+ void AddTrackedFunction(Function *F) {
+ // Add an entry, F -> undef.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ MRVFunctionsTracked.insert(F);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ TrackedMultipleRetVals.insert(
+ std::make_pair(std::make_pair(F, i), ValueLatticeElement()));
} else if (!F->getReturnType()->isVoidTy())
- TrackedRetVals.insert(std::make_pair(F, ValueLatticeElement()));
- }
-
- /// AddMustTailCallee - If the SCCP solver finds that this function is called
- /// from non-removable musttail call site.
- void AddMustTailCallee(Function *F) {
- MustTailCallees.insert(F);
- }
-
- /// Returns true if the given function is called from non-removable musttail
- /// call site.
- bool isMustTailCallee(Function *F) {
- return MustTailCallees.count(F);
- }
-
- void AddArgumentTrackedFunction(Function *F) {
- TrackingIncomingArguments.insert(F);
- }
-
- /// Returns true if the given function is in the solver's set of
- /// argument-tracked functions.
- bool isArgumentTrackedFunction(Function *F) {
- return TrackingIncomingArguments.count(F);
- }
-
- /// Solve - Solve for constants and executable blocks.
- void Solve();
-
- /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
- /// that branches on undef values cannot reach any of their successors.
- /// However, this is not a safe assumption. After we solve dataflow, this
- /// method should be used to handle this. If this returns true, the solver
- /// should be rerun.
- bool ResolvedUndefsIn(Function &F);
-
- bool isBlockExecutable(BasicBlock *BB) const {
- return BBExecutable.count(BB);
- }
-
- // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
- // block to the 'To' basic block is currently feasible.
+ TrackedRetVals.insert(std::make_pair(F, ValueLatticeElement()));
+ }
+
+ /// AddMustTailCallee - If the SCCP solver finds that this function is called
+ /// from non-removable musttail call site.
+ void AddMustTailCallee(Function *F) {
+ MustTailCallees.insert(F);
+ }
+
+ /// Returns true if the given function is called from non-removable musttail
+ /// call site.
+ bool isMustTailCallee(Function *F) {
+ return MustTailCallees.count(F);
+ }
+
+ void AddArgumentTrackedFunction(Function *F) {
+ TrackingIncomingArguments.insert(F);
+ }
+
+ /// Returns true if the given function is in the solver's set of
+ /// argument-tracked functions.
+ bool isArgumentTrackedFunction(Function *F) {
+ return TrackingIncomingArguments.count(F);
+ }
+
+ /// Solve - Solve for constants and executable blocks.
+ void Solve();
+
+ /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+ /// that branches on undef values cannot reach any of their successors.
+ /// However, this is not a safe assumption. After we solve dataflow, this
+ /// method should be used to handle this. If this returns true, the solver
+ /// should be rerun.
+ bool ResolvedUndefsIn(Function &F);
+
+ bool isBlockExecutable(BasicBlock *BB) const {
+ return BBExecutable.count(BB);
+ }
+
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
bool isEdgeFeasible(BasicBlock *From, BasicBlock *To) const;
-
- std::vector<ValueLatticeElement> getStructLatticeValueFor(Value *V) const {
- std::vector<ValueLatticeElement> StructValues;
- auto *STy = dyn_cast<StructType>(V->getType());
- assert(STy && "getStructLatticeValueFor() can be called only on structs");
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- auto I = StructValueState.find(std::make_pair(V, i));
- assert(I != StructValueState.end() && "Value not in valuemap!");
- StructValues.push_back(I->second);
- }
- return StructValues;
- }
-
- void removeLatticeValueFor(Value *V) { ValueState.erase(V); }
-
- const ValueLatticeElement &getLatticeValueFor(Value *V) const {
- assert(!V->getType()->isStructTy() &&
- "Should use getStructLatticeValueFor");
- DenseMap<Value *, ValueLatticeElement>::const_iterator I =
- ValueState.find(V);
- assert(I != ValueState.end() &&
- "V not found in ValueState nor Paramstate map!");
- return I->second;
- }
-
- /// getTrackedRetVals - Get the inferred return value map.
- const MapVector<Function *, ValueLatticeElement> &getTrackedRetVals() {
- return TrackedRetVals;
- }
-
- /// getTrackedGlobals - Get and return the set of inferred initializers for
- /// global variables.
- const DenseMap<GlobalVariable *, ValueLatticeElement> &getTrackedGlobals() {
- return TrackedGlobals;
- }
-
- /// getMRVFunctionsTracked - Get the set of functions which return multiple
- /// values tracked by the pass.
- const SmallPtrSet<Function *, 16> getMRVFunctionsTracked() {
- return MRVFunctionsTracked;
- }
-
- /// getMustTailCallees - Get the set of functions which are called
- /// from non-removable musttail call sites.
- const SmallPtrSet<Function *, 16> getMustTailCallees() {
- return MustTailCallees;
- }
-
- /// markOverdefined - Mark the specified value overdefined. This
- /// works with both scalars and structs.
- void markOverdefined(Value *V) {
- if (auto *STy = dyn_cast<StructType>(V->getType()))
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- markOverdefined(getStructValueState(V, i), V);
- else
- markOverdefined(ValueState[V], V);
- }
-
- // isStructLatticeConstant - Return true if all the lattice values
- // corresponding to elements of the structure are constants,
- // false otherwise.
- bool isStructLatticeConstant(Function *F, StructType *STy) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
- assert(It != TrackedMultipleRetVals.end());
- ValueLatticeElement LV = It->second;
- if (!isConstant(LV))
- return false;
- }
- return true;
- }
-
- /// Helper to return a Constant if \p LV is either a constant or a constant
- /// range with a single element.
- Constant *getConstant(const ValueLatticeElement &LV) const {
- if (LV.isConstant())
- return LV.getConstant();
-
- if (LV.isConstantRange()) {
- auto &CR = LV.getConstantRange();
- if (CR.getSingleElement())
- return ConstantInt::get(Ctx, *CR.getSingleElement());
- }
- return nullptr;
- }
-
-private:
- ConstantInt *getConstantInt(const ValueLatticeElement &IV) const {
- return dyn_cast_or_null<ConstantInt>(getConstant(IV));
- }
-
- // pushToWorkList - Helper for markConstant/markOverdefined
- void pushToWorkList(ValueLatticeElement &IV, Value *V) {
- if (IV.isOverdefined())
- return OverdefinedInstWorkList.push_back(V);
- InstWorkList.push_back(V);
- }
-
- // Helper to push \p V to the worklist, after updating it to \p IV. Also
- // prints a debug message with the updated value.
- void pushToWorkListMsg(ValueLatticeElement &IV, Value *V) {
- LLVM_DEBUG(dbgs() << "updated " << IV << ": " << *V << '\n');
- pushToWorkList(IV, V);
- }
-
- // markConstant - Make a value be marked as "constant". If the value
- // is not already a constant, add it to the instruction work list so that
- // the users of the instruction are updated later.
- bool markConstant(ValueLatticeElement &IV, Value *V, Constant *C,
- bool MayIncludeUndef = false) {
- if (!IV.markConstant(C, MayIncludeUndef))
- return false;
- LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
- pushToWorkList(IV, V);
- return true;
- }
-
- bool markConstant(Value *V, Constant *C) {
- assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
- return markConstant(ValueState[V], V, C);
- }
-
- // markOverdefined - Make a value be marked as "overdefined". If the
- // value is not already overdefined, add it to the overdefined instruction
- // work list so that the users of the instruction are updated later.
- bool markOverdefined(ValueLatticeElement &IV, Value *V) {
- if (!IV.markOverdefined()) return false;
-
- LLVM_DEBUG(dbgs() << "markOverdefined: ";
- if (auto *F = dyn_cast<Function>(V)) dbgs()
- << "Function '" << F->getName() << "'\n";
- else dbgs() << *V << '\n');
- // Only instructions go on the work list
- pushToWorkList(IV, V);
- return true;
- }
-
- /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV
- /// changes.
- bool mergeInValue(ValueLatticeElement &IV, Value *V,
- ValueLatticeElement MergeWithV,
- ValueLatticeElement::MergeOptions Opts = {
- /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
- if (IV.mergeIn(MergeWithV, Opts)) {
- pushToWorkList(IV, V);
- LLVM_DEBUG(dbgs() << "Merged " << MergeWithV << " into " << *V << " : "
- << IV << "\n");
- return true;
- }
- return false;
- }
-
- bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
- ValueLatticeElement::MergeOptions Opts = {
- /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
- assert(!V->getType()->isStructTy() &&
- "non-structs should use markConstant");
- return mergeInValue(ValueState[V], V, MergeWithV, Opts);
- }
-
- /// getValueState - Return the ValueLatticeElement object that corresponds to
- /// the value. This function handles the case when the value hasn't been seen
- /// yet by properly seeding constants etc.
- ValueLatticeElement &getValueState(Value *V) {
- assert(!V->getType()->isStructTy() && "Should use getStructValueState");
-
- auto I = ValueState.insert(std::make_pair(V, ValueLatticeElement()));
- ValueLatticeElement &LV = I.first->second;
-
- if (!I.second)
- return LV; // Common case, already in the map.
-
- if (auto *C = dyn_cast<Constant>(V))
- LV.markConstant(C); // Constants are constant
-
- // All others are unknown by default.
- return LV;
- }
-
- /// getStructValueState - Return the ValueLatticeElement object that
- /// corresponds to the value/field pair. This function handles the case when
- /// the value hasn't been seen yet by properly seeding constants etc.
- ValueLatticeElement &getStructValueState(Value *V, unsigned i) {
- assert(V->getType()->isStructTy() && "Should use getValueState");
- assert(i < cast<StructType>(V->getType())->getNumElements() &&
- "Invalid element #");
-
- auto I = StructValueState.insert(
- std::make_pair(std::make_pair(V, i), ValueLatticeElement()));
- ValueLatticeElement &LV = I.first->second;
-
- if (!I.second)
- return LV; // Common case, already in the map.
-
- if (auto *C = dyn_cast<Constant>(V)) {
- Constant *Elt = C->getAggregateElement(i);
-
- if (!Elt)
- LV.markOverdefined(); // Unknown sort of constant.
- else if (isa<UndefValue>(Elt))
- ; // Undef values remain unknown.
- else
- LV.markConstant(Elt); // Constants are constant.
- }
-
- // All others are underdefined by default.
- return LV;
- }
-
- /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
- /// work list if it is not already executable.
- bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
- if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
- return false; // This edge is already known to be executable!
-
- if (!MarkBlockExecutable(Dest)) {
- // If the destination is already executable, we just made an *edge*
- // feasible that wasn't before. Revisit the PHI nodes in the block
- // because they have potentially new operands.
- LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
- << " -> " << Dest->getName() << '\n');
-
- for (PHINode &PN : Dest->phis())
- visitPHINode(PN);
- }
- return true;
- }
-
- // getFeasibleSuccessors - Return a vector of booleans to indicate which
- // successors are reachable from a given terminator instruction.
- void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs);
-
- // OperandChangedState - This method is invoked on all of the users of an
- // instruction that was just changed state somehow. Based on this
- // information, we need to update the specified user of this instruction.
- void OperandChangedState(Instruction *I) {
- if (BBExecutable.count(I->getParent())) // Inst is executable?
- visit(*I);
- }
-
- // Add U as additional user of V.
- void addAdditionalUser(Value *V, User *U) {
- auto Iter = AdditionalUsers.insert({V, {}});
- Iter.first->second.insert(U);
- }
-
- // Mark I's users as changed, including AdditionalUsers.
- void markUsersAsChanged(Value *I) {
- // Functions include their arguments in the use-list. Changed function
- // values mean that the result of the function changed. We only need to
- // update the call sites with the new function result and do not have to
- // propagate the call arguments.
- if (isa<Function>(I)) {
- for (User *U : I->users()) {
- if (auto *CB = dyn_cast<CallBase>(U))
- handleCallResult(*CB);
- }
- } else {
- for (User *U : I->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- OperandChangedState(UI);
- }
-
- auto Iter = AdditionalUsers.find(I);
- if (Iter != AdditionalUsers.end()) {
+
+ std::vector<ValueLatticeElement> getStructLatticeValueFor(Value *V) const {
+ std::vector<ValueLatticeElement> StructValues;
+ auto *STy = dyn_cast<StructType>(V->getType());
+ assert(STy && "getStructLatticeValueFor() can be called only on structs");
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ auto I = StructValueState.find(std::make_pair(V, i));
+ assert(I != StructValueState.end() && "Value not in valuemap!");
+ StructValues.push_back(I->second);
+ }
+ return StructValues;
+ }
+
+ void removeLatticeValueFor(Value *V) { ValueState.erase(V); }
+
+ const ValueLatticeElement &getLatticeValueFor(Value *V) const {
+ assert(!V->getType()->isStructTy() &&
+ "Should use getStructLatticeValueFor");
+ DenseMap<Value *, ValueLatticeElement>::const_iterator I =
+ ValueState.find(V);
+ assert(I != ValueState.end() &&
+ "V not found in ValueState nor Paramstate map!");
+ return I->second;
+ }
+
+ /// getTrackedRetVals - Get the inferred return value map.
+ const MapVector<Function *, ValueLatticeElement> &getTrackedRetVals() {
+ return TrackedRetVals;
+ }
+
+ /// getTrackedGlobals - Get and return the set of inferred initializers for
+ /// global variables.
+ const DenseMap<GlobalVariable *, ValueLatticeElement> &getTrackedGlobals() {
+ return TrackedGlobals;
+ }
+
+ /// getMRVFunctionsTracked - Get the set of functions which return multiple
+ /// values tracked by the pass.
+ const SmallPtrSet<Function *, 16> getMRVFunctionsTracked() {
+ return MRVFunctionsTracked;
+ }
+
+ /// getMustTailCallees - Get the set of functions which are called
+ /// from non-removable musttail call sites.
+ const SmallPtrSet<Function *, 16> getMustTailCallees() {
+ return MustTailCallees;
+ }
+
+ /// markOverdefined - Mark the specified value overdefined. This
+ /// works with both scalars and structs.
+ void markOverdefined(Value *V) {
+ if (auto *STy = dyn_cast<StructType>(V->getType()))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ markOverdefined(getStructValueState(V, i), V);
+ else
+ markOverdefined(ValueState[V], V);
+ }
+
+ // isStructLatticeConstant - Return true if all the lattice values
+ // corresponding to elements of the structure are constants,
+ // false otherwise.
+ bool isStructLatticeConstant(Function *F, StructType *STy) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+ assert(It != TrackedMultipleRetVals.end());
+ ValueLatticeElement LV = It->second;
+ if (!isConstant(LV))
+ return false;
+ }
+ return true;
+ }
+
+ /// Helper to return a Constant if \p LV is either a constant or a constant
+ /// range with a single element.
+ Constant *getConstant(const ValueLatticeElement &LV) const {
+ if (LV.isConstant())
+ return LV.getConstant();
+
+ if (LV.isConstantRange()) {
+ auto &CR = LV.getConstantRange();
+ if (CR.getSingleElement())
+ return ConstantInt::get(Ctx, *CR.getSingleElement());
+ }
+ return nullptr;
+ }
+
+private:
+ ConstantInt *getConstantInt(const ValueLatticeElement &IV) const {
+ return dyn_cast_or_null<ConstantInt>(getConstant(IV));
+ }
+
+ // pushToWorkList - Helper for markConstant/markOverdefined
+ void pushToWorkList(ValueLatticeElement &IV, Value *V) {
+ if (IV.isOverdefined())
+ return OverdefinedInstWorkList.push_back(V);
+ InstWorkList.push_back(V);
+ }
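
pushToWorkList above keeps two queues so that values which have already hit overdefined are separated from values that might still improve, and a solve loop can drain the overdefined work first instead of repeatedly revisiting users with weaker information. A simplified sketch of that scheme with toy types (the names are illustrative, not the solver's):

#include <deque>
#include <iostream>
#include <string>

// Toy "value" whose lattice state is reduced to a single flag.
struct Value { std::string Name; bool Overdefined = false; };

std::deque<Value *> InstWork;        // values that may still improve
std::deque<Value *> OverdefinedWork; // values whose state is final

void pushToWorkList(Value *V) {
  (V->Overdefined ? OverdefinedWork : InstWork).push_back(V);
}

void drain() {
  // Prefer overdefined work: those states cannot change any more, so
  // propagating them early saves revisiting their users later.
  while (!OverdefinedWork.empty() || !InstWork.empty()) {
    std::deque<Value *> &Q =
        !OverdefinedWork.empty() ? OverdefinedWork : InstWork;
    Value *V = Q.front();
    Q.pop_front();
    std::cout << "visit users of " << V->Name << "\n";
  }
}

int main() {
  Value A{"a"}, B{"b", true};
  pushToWorkList(&A);
  pushToWorkList(&B);
  drain(); // processes b (overdefined) before a
}
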
+
+ // Helper to push \p V to the worklist, after updating it to \p IV. Also
+ // prints a debug message with the updated value.
+ void pushToWorkListMsg(ValueLatticeElement &IV, Value *V) {
+ LLVM_DEBUG(dbgs() << "updated " << IV << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ }
+
+ // markConstant - Make a value be marked as "constant". If the value
+ // is not already a constant, add it to the instruction work list so that
+ // the users of the instruction are updated later.
+ bool markConstant(ValueLatticeElement &IV, Value *V, Constant *C,
+ bool MayIncludeUndef = false) {
+ if (!IV.markConstant(C, MayIncludeUndef))
+ return false;
+ LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ return true;
+ }
+
+ bool markConstant(Value *V, Constant *C) {
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
+ return markConstant(ValueState[V], V, C);
+ }
+
+ // markOverdefined - Make a value be marked as "overdefined". If the
+ // value is not already overdefined, add it to the overdefined instruction
+ // work list so that the users of the instruction are updated later.
+ bool markOverdefined(ValueLatticeElement &IV, Value *V) {
+ if (!IV.markOverdefined()) return false;
+
+ LLVM_DEBUG(dbgs() << "markOverdefined: ";
+ if (auto *F = dyn_cast<Function>(V)) dbgs()
+ << "Function '" << F->getName() << "'\n";
+ else dbgs() << *V << '\n');
+ // Only instructions go on the work list
+ pushToWorkList(IV, V);
+ return true;
+ }
+
+ /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV
+ /// changes.
+ bool mergeInValue(ValueLatticeElement &IV, Value *V,
+ ValueLatticeElement MergeWithV,
+ ValueLatticeElement::MergeOptions Opts = {
+ /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
+ if (IV.mergeIn(MergeWithV, Opts)) {
+ pushToWorkList(IV, V);
+ LLVM_DEBUG(dbgs() << "Merged " << MergeWithV << " into " << *V << " : "
+ << IV << "\n");
+ return true;
+ }
+ return false;
+ }
+
+ bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
+ ValueLatticeElement::MergeOptions Opts = {
+ /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
+ assert(!V->getType()->isStructTy() &&
+ "non-structs should use markConstant");
+ return mergeInValue(ValueState[V], V, MergeWithV, Opts);
+ }
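
markConstant, markOverdefined and mergeInValue above only ever move a value up a small lattice (unknown, then constant, then overdefined) and report whether the state actually changed, which is the caller's cue to push the value back onto the worklist. A toy three-state lattice with that monotonic mergeIn behaviour, ignoring constant ranges and undef:

#include <cassert>
#include <optional>

// Three-state toy lattice: Unknown < Constant(c) < Overdefined.
struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;

  bool markConstant(int V) {
    if (K == Overdefined) return false;
    if (K == Constant) {
      if (*C == V) return false;                 // same constant: no change
      K = Overdefined; C.reset(); return true;   // conflicting constants
    }
    K = Constant; C = V; return true;
  }
  bool markOverdefined() {
    if (K == Overdefined) return false;
    K = Overdefined; C.reset(); return true;
  }
  // Merge another lattice value in; returning true is the caller's cue to
  // push the value back onto the worklist.
  bool mergeIn(const Lattice &O) {
    if (O.K == Unknown) return false;
    if (O.K == Overdefined) return markOverdefined();
    return markConstant(*O.C);
  }
};

int main() {
  Lattice X, Five, Seven;
  Five.markConstant(5);
  Seven.markConstant(7);
  bool Changed = X.mergeIn(Five);   // unknown -> constant 5: changed
  Changed = X.mergeIn(Five);        // same constant again: no change
  Changed = X.mergeIn(Seven);       // conflicting constants -> overdefined
  assert(X.K == Lattice::Overdefined);
  (void)Changed;
}
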
+
+ /// getValueState - Return the ValueLatticeElement object that corresponds to
+ /// the value. This function handles the case when the value hasn't been seen
+ /// yet by properly seeding constants etc.
+ ValueLatticeElement &getValueState(Value *V) {
+ assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+ auto I = ValueState.insert(std::make_pair(V, ValueLatticeElement()));
+ ValueLatticeElement &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V))
+ LV.markConstant(C); // Constants are constant
+
+ // All others are unknown by default.
+ return LV;
+ }
+
+ /// getStructValueState - Return the ValueLatticeElement object that
+ /// corresponds to the value/field pair. This function handles the case when
+ /// the value hasn't been seen yet by properly seeding constants etc.
+ ValueLatticeElement &getStructValueState(Value *V, unsigned i) {
+ assert(V->getType()->isStructTy() && "Should use getValueState");
+ assert(i < cast<StructType>(V->getType())->getNumElements() &&
+ "Invalid element #");
+
+ auto I = StructValueState.insert(
+ std::make_pair(std::make_pair(V, i), ValueLatticeElement()));
+ ValueLatticeElement &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ Constant *Elt = C->getAggregateElement(i);
+
+ if (!Elt)
+ LV.markOverdefined(); // Unknown sort of constant.
+ else if (isa<UndefValue>(Elt))
+ ; // Undef values remain unknown.
+ else
+ LV.markConstant(Elt); // Constants are constant.
+ }
+
+ // All others are underdefined by default.
+ return LV;
+ }
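
getValueState and getStructValueState above create a lattice entry lazily on first lookup and seed it from the IR, so plain constants start out in the Constant state rather than Unknown. A condensed sketch of that lazy seeding with an ordinary std::map and made-up Value/Lattice types:

#include <map>
#include <optional>
#include <string>

struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;
};

// Toy "IR value": it may or may not be a literal constant.
struct Value { std::string Name; std::optional<int> Literal; };

std::map<const Value *, Lattice> ValueState;

Lattice &getValueState(const Value *V) {
  auto [It, Inserted] = ValueState.try_emplace(V);
  Lattice &LV = It->second;
  if (!Inserted)
    return LV;                       // common case: already in the map
  if (V->Literal) {                  // seed literals as Constant on first use
    LV.K = Lattice::Constant;
    LV.C = *V->Literal;
  }
  return LV;                         // everything else starts out Unknown
}

int main() {
  Value C42{"c42", 42}, X{"x", std::nullopt};
  Lattice &A = getValueState(&C42);  // seeded as the constant 42
  Lattice &B = getValueState(&X);    // Unknown until something merges in
  (void)A; (void)B;
}
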
+
+ /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+ /// work list if it is not already executable.
+ bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+ return false; // This edge is already known to be executable!
+
+ if (!MarkBlockExecutable(Dest)) {
+ // If the destination is already executable, we just made an *edge*
+ // feasible that wasn't before. Revisit the PHI nodes in the block
+ // because they have potentially new operands.
+ LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << '\n');
+
+ for (PHINode &PN : Dest->phis())
+ visitPHINode(PN);
+ }
+ return true;
+ }
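
markEdgeExecutable above records each CFG edge as feasible at most once; when the destination block was already executable, only its PHI nodes are revisited, because a newly feasible edge can change nothing else in that block. A compact sketch of the same bookkeeping with plain containers and hypothetical block types:

#include <iostream>
#include <set>
#include <utility>
#include <vector>

struct Block { const char *Name; std::vector<int> PhiIds; };
using Edge = std::pair<const Block *, const Block *>;

std::set<Edge> KnownFeasibleEdges;
std::set<const Block *> Executable;

void revisitPhi(int Id) { std::cout << "revisit phi #" << Id << "\n"; }

bool markBlockExecutable(const Block *B) { return Executable.insert(B).second; }

bool markEdgeExecutable(const Block *Src, const Block *Dst) {
  if (!KnownFeasibleEdges.insert({Src, Dst}).second)
    return false;                 // edge already known feasible
  if (!markBlockExecutable(Dst))  // block was live already: only a new *edge*
    for (int Id : Dst->PhiIds)    // became feasible, so re-evaluate its phis
      revisitPhi(Id);
  return true;
}

int main() {
  Block A{"a", {}}, B{"b", {1, 2}};
  markBlockExecutable(&B);        // pretend b was reached some other way
  markEdgeExecutable(&A, &B);     // revisits phi #1 and #2
}
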
+
+ // getFeasibleSuccessors - Return a vector of booleans to indicate which
+ // successors are reachable from a given terminator instruction.
+ void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs);
+
+ // OperandChangedState - This method is invoked on all of the users of an
+ // instruction that was just changed state somehow. Based on this
+ // information, we need to update the specified user of this instruction.
+ void OperandChangedState(Instruction *I) {
+ if (BBExecutable.count(I->getParent())) // Inst is executable?
+ visit(*I);
+ }
+
+ // Add U as additional user of V.
+ void addAdditionalUser(Value *V, User *U) {
+ auto Iter = AdditionalUsers.insert({V, {}});
+ Iter.first->second.insert(U);
+ }
+
+ // Mark I's users as changed, including AdditionalUsers.
+ void markUsersAsChanged(Value *I) {
+ // Functions include their arguments in the use-list. Changed function
+ // values mean that the result of the function changed. We only need to
+ // update the call sites with the new function result and do not have to
+ // propagate the call arguments.
+ if (isa<Function>(I)) {
+ for (User *U : I->users()) {
+ if (auto *CB = dyn_cast<CallBase>(U))
+ handleCallResult(*CB);
+ }
+ } else {
+ for (User *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ OperandChangedState(UI);
+ }
+
+ auto Iter = AdditionalUsers.find(I);
+ if (Iter != AdditionalUsers.end()) {
// Copy additional users before notifying them of changes, because new
// users may be added, potentially invalidating the iterator.
SmallVector<Instruction *, 2> ToNotify;
- for (User *U : Iter->second)
- if (auto *UI = dyn_cast<Instruction>(U))
+ for (User *U : Iter->second)
+ if (auto *UI = dyn_cast<Instruction>(U))
ToNotify.push_back(UI);
for (Instruction *UI : ToNotify)
OperandChangedState(UI);
- }
- }
- void handleCallOverdefined(CallBase &CB);
- void handleCallResult(CallBase &CB);
- void handleCallArguments(CallBase &CB);
-
-private:
- friend class InstVisitor<SCCPSolver>;
-
- // visit implementations - Something changed in this instruction. Either an
- // operand made a transition, or the instruction is newly executable. Change
- // the value type of I to reflect these changes if appropriate.
- void visitPHINode(PHINode &I);
-
- // Terminators
-
- void visitReturnInst(ReturnInst &I);
- void visitTerminator(Instruction &TI);
-
- void visitCastInst(CastInst &I);
- void visitSelectInst(SelectInst &I);
- void visitUnaryOperator(Instruction &I);
- void visitBinaryOperator(Instruction &I);
- void visitCmpInst(CmpInst &I);
- void visitExtractValueInst(ExtractValueInst &EVI);
- void visitInsertValueInst(InsertValueInst &IVI);
-
- void visitCatchSwitchInst(CatchSwitchInst &CPI) {
- markOverdefined(&CPI);
- visitTerminator(CPI);
- }
-
- // Instructions that cannot be folded away.
-
- void visitStoreInst (StoreInst &I);
- void visitLoadInst (LoadInst &I);
- void visitGetElementPtrInst(GetElementPtrInst &I);
-
- void visitCallInst (CallInst &I) {
- visitCallBase(I);
- }
-
- void visitInvokeInst (InvokeInst &II) {
- visitCallBase(II);
- visitTerminator(II);
- }
-
- void visitCallBrInst (CallBrInst &CBI) {
- visitCallBase(CBI);
- visitTerminator(CBI);
- }
-
- void visitCallBase (CallBase &CB);
- void visitResumeInst (ResumeInst &I) { /*returns void*/ }
- void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ }
- void visitFenceInst (FenceInst &I) { /*returns void*/ }
-
- void visitInstruction(Instruction &I) {
- // All the instructions we don't do any special handling for just
- // go to overdefined.
- LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
- markOverdefined(&I);
- }
-};
-
-} // end anonymous namespace
-
-// getFeasibleSuccessors - Return a vector of booleans to indicate which
-// successors are reachable from a given terminator instruction.
-void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
- SmallVectorImpl<bool> &Succs) {
- Succs.resize(TI.getNumSuccessors());
- if (auto *BI = dyn_cast<BranchInst>(&TI)) {
- if (BI->isUnconditional()) {
- Succs[0] = true;
- return;
- }
-
- ValueLatticeElement BCValue = getValueState(BI->getCondition());
- ConstantInt *CI = getConstantInt(BCValue);
- if (!CI) {
- // Overdefined condition variables, and branches on unfoldable constant
- // conditions, mean the branch could go either way.
- if (!BCValue.isUnknownOrUndef())
- Succs[0] = Succs[1] = true;
- return;
- }
-
- // Constant condition variables mean the branch can only go a single way.
- Succs[CI->isZero()] = true;
- return;
- }
-
-  // Unwinding instructions' successors are always executable.
- if (TI.isExceptionalTerminator()) {
- Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
- if (!SI->getNumCases()) {
- Succs[0] = true;
- return;
- }
+ }
+ }
+ void handleCallOverdefined(CallBase &CB);
+ void handleCallResult(CallBase &CB);
+ void handleCallArguments(CallBase &CB);
+
+private:
+ friend class InstVisitor<SCCPSolver>;
+
+ // visit implementations - Something changed in this instruction. Either an
+ // operand made a transition, or the instruction is newly executable. Change
+ // the value type of I to reflect these changes if appropriate.
+ void visitPHINode(PHINode &I);
+
+ // Terminators
+
+ void visitReturnInst(ReturnInst &I);
+ void visitTerminator(Instruction &TI);
+
+ void visitCastInst(CastInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitUnaryOperator(Instruction &I);
+ void visitBinaryOperator(Instruction &I);
+ void visitCmpInst(CmpInst &I);
+ void visitExtractValueInst(ExtractValueInst &EVI);
+ void visitInsertValueInst(InsertValueInst &IVI);
+
+ void visitCatchSwitchInst(CatchSwitchInst &CPI) {
+ markOverdefined(&CPI);
+ visitTerminator(CPI);
+ }
+
+ // Instructions that cannot be folded away.
+
+ void visitStoreInst (StoreInst &I);
+ void visitLoadInst (LoadInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+
+ void visitCallInst (CallInst &I) {
+ visitCallBase(I);
+ }
+
+ void visitInvokeInst (InvokeInst &II) {
+ visitCallBase(II);
+ visitTerminator(II);
+ }
+
+ void visitCallBrInst (CallBrInst &CBI) {
+ visitCallBase(CBI);
+ visitTerminator(CBI);
+ }
+
+ void visitCallBase (CallBase &CB);
+ void visitResumeInst (ResumeInst &I) { /*returns void*/ }
+ void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ }
+ void visitFenceInst (FenceInst &I) { /*returns void*/ }
+
+ void visitInstruction(Instruction &I) {
+ // All the instructions we don't do any special handling for just
+ // go to overdefined.
+ LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+ markOverdefined(&I);
+ }
+};
+
+} // end anonymous namespace
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
+ SmallVectorImpl<bool> &Succs) {
+ Succs.resize(TI.getNumSuccessors());
+ if (auto *BI = dyn_cast<BranchInst>(&TI)) {
+ if (BI->isUnconditional()) {
+ Succs[0] = true;
+ return;
+ }
+
+ ValueLatticeElement BCValue = getValueState(BI->getCondition());
+ ConstantInt *CI = getConstantInt(BCValue);
+ if (!CI) {
+ // Overdefined condition variables, and branches on unfoldable constant
+ // conditions, mean the branch could go either way.
+ if (!BCValue.isUnknownOrUndef())
+ Succs[0] = Succs[1] = true;
+ return;
+ }
+
+ // Constant condition variables mean the branch can only go a single way.
+ Succs[CI->isZero()] = true;
+ return;
+ }
+
+  // Unwinding instructions' successors are always executable.
+ if (TI.isExceptionalTerminator()) {
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
+ if (!SI->getNumCases()) {
+ Succs[0] = true;
+ return;
+ }
const ValueLatticeElement &SCValue = getValueState(SI->getCondition());
if (ConstantInt *CI = getConstantInt(SCValue)) {
Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
return;
}
-
+
// TODO: Switch on undef is UB. Stop passing false once the rest of LLVM
// is ready.
if (SCValue.isConstantRange(/*UndefAllowed=*/false)) {
@@ -672,182 +672,182 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
// TODO: Determine whether default case is reachable.
Succs[SI->case_default()->getSuccessorIndex()] = true;
- return;
- }
-
+ return;
+ }
+
// Overdefined or unknown condition? All destinations are executable!
if (!SCValue.isUnknownOrUndef())
Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
-  // If this is an indirect branch whose address is a blockaddress, we mark
-  // the target as executable.
- if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
- // Casts are folded by visitCastInst.
- ValueLatticeElement IBRValue = getValueState(IBR->getAddress());
- BlockAddress *Addr = dyn_cast_or_null<BlockAddress>(getConstant(IBRValue));
- if (!Addr) { // Overdefined or unknown condition?
- // All destinations are executable!
- if (!IBRValue.isUnknownOrUndef())
- Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
- BasicBlock* T = Addr->getBasicBlock();
- assert(Addr->getFunction() == T->getParent() &&
- "Block address of a different function ?");
- for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
- // This is the target.
- if (IBR->getDestination(i) == T) {
- Succs[i] = true;
- return;
- }
- }
-
- // If we didn't find our destination in the IBR successor list, then we
-    // have undefined behavior. It's ok to assume no successor is executable.
- return;
- }
-
- // In case of callbr, we pessimistically assume that all successors are
- // feasible.
- if (isa<CallBrInst>(&TI)) {
- Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
- LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
- llvm_unreachable("SCCP: Don't know how to handle this terminator!");
-}
-
-// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
-// block to the 'To' basic block is currently feasible.
+ return;
+ }
+
+  // If this is an indirect branch whose address is a blockaddress, we mark
+  // the target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ ValueLatticeElement IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = dyn_cast_or_null<BlockAddress>(getConstant(IBRValue));
+ if (!Addr) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!IBRValue.isUnknownOrUndef())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock* T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function ?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+    // have undefined behavior. It's ok to assume no successor is executable.
+ return;
+ }
+
+ // In case of callbr, we pessimistically assume that all successors are
+ // feasible.
+ if (isa<CallBrInst>(&TI)) {
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
+ llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
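
getFeasibleSuccessors above turns the lattice state of a terminator's condition into a per-successor feasibility bitmap: an unknown condition marks nothing yet, a known constant selects exactly one edge, and anything else makes every successor reachable. A reduced sketch of just the two-way branch case, with a toy condition lattice in place of the LLVM types:

#include <iostream>
#include <optional>
#include <vector>

enum class State { Unknown, Constant, Overdefined };
struct Cond { State S; std::optional<bool> Value; }; // lattice of an i1 condition

// Succs[0] is the "true" successor and Succs[1] the "false" successor,
// mirroring how a constant condition above selects Succs[CI->isZero()].
std::vector<bool> feasibleSuccessors(const Cond &C) {
  std::vector<bool> Succs(2, false);
  if (C.S == State::Unknown)
    return Succs;                       // don't mark anything yet
  if (C.S == State::Constant && C.Value) {
    Succs[*C.Value ? 0 : 1] = true;     // exactly one arm is reachable
    return Succs;
  }
  Succs[0] = Succs[1] = true;           // overdefined: either way is possible
  return Succs;
}

int main() {
  auto S = feasibleSuccessors({State::Constant, false});
  std::cout << S[0] << " " << S[1] << "\n"; // prints "0 1": only the false arm
}
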
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible.
bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
- // Check if we've called markEdgeExecutable on the edge yet. (We could
- // be more aggressive and try to consider edges which haven't been marked
- // yet, but there isn't any need.)
- return KnownFeasibleEdges.count(Edge(From, To));
-}
-
-// visit Implementations - Something changed in this instruction, either an
-// operand made a transition, or the instruction is newly executable. Change
-// the value type of I to reflect these changes if appropriate. This method
-// makes sure to do the following actions:
-//
-// 1. If a phi node merges two constants in, and has conflicting values coming
-// from different branches, or if the PHI node merges in an overdefined
-// value, then the PHI node becomes overdefined.
-// 2. If a phi node merges only constants in, and they all agree on value, the
-// PHI node becomes a constant value equal to that.
-// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
-// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
-// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
-// 6. If a conditional branch has a value that is constant, make the selected
-// destination executable
-// 7. If a conditional branch has a value that is overdefined, make all
-// successors executable.
-void SCCPSolver::visitPHINode(PHINode &PN) {
- // If this PN returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (PN.getType()->isStructTy())
- return (void)markOverdefined(&PN);
-
- if (getValueState(&PN).isOverdefined())
- return; // Quick exit
-
- // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
- // and slow us down a lot. Just mark them overdefined.
- if (PN.getNumIncomingValues() > 64)
- return (void)markOverdefined(&PN);
-
- unsigned NumActiveIncoming = 0;
-
- // Look at all of the executable operands of the PHI node. If any of them
- // are overdefined, the PHI becomes overdefined as well. If they are all
- // constant, and they agree with each other, the PHI becomes the identical
- // constant. If they are constant and don't agree, the PHI is a constant
- // range. If there are no executable operands, the PHI remains unknown.
- ValueLatticeElement PhiState = getValueState(&PN);
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
- continue;
-
- ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
- PhiState.mergeIn(IV);
- NumActiveIncoming++;
- if (PhiState.isOverdefined())
- break;
- }
-
- // We allow up to 1 range extension per active incoming value and one
- // additional extension. Note that we manually adjust the number of range
- // extensions to match the number of active incoming values. This helps to
- // limit multiple extensions caused by the same incoming value, if other
- // incoming values are equal.
- mergeInValue(&PN, PhiState,
- ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- NumActiveIncoming + 1));
- ValueLatticeElement &PhiStateRef = getValueState(&PN);
- PhiStateRef.setNumRangeExtensions(
- std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
-}
-
-void SCCPSolver::visitReturnInst(ReturnInst &I) {
- if (I.getNumOperands() == 0) return; // ret void
-
- Function *F = I.getParent()->getParent();
- Value *ResultOp = I.getOperand(0);
-
- // If we are tracking the return value of this function, merge it in.
- if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
- auto TFRVI = TrackedRetVals.find(F);
- if (TFRVI != TrackedRetVals.end()) {
- mergeInValue(TFRVI->second, F, getValueState(ResultOp));
- return;
- }
- }
-
- // Handle functions that return multiple values.
- if (!TrackedMultipleRetVals.empty()) {
- if (auto *STy = dyn_cast<StructType>(ResultOp->getType()))
- if (MRVFunctionsTracked.count(F))
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
- getStructValueState(ResultOp, i));
- }
-}
-
-void SCCPSolver::visitTerminator(Instruction &TI) {
- SmallVector<bool, 16> SuccFeasible;
- getFeasibleSuccessors(TI, SuccFeasible);
-
- BasicBlock *BB = TI.getParent();
-
- // Mark all feasible successors executable.
- for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
- if (SuccFeasible[i])
- markEdgeExecutable(BB, TI.getSuccessor(i));
-}
-
-void SCCPSolver::visitCastInst(CastInst &I) {
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&I].isOverdefined())
- return;
-
- ValueLatticeElement OpSt = getValueState(I.getOperand(0));
- if (Constant *OpC = getConstant(OpSt)) {
- // Fold the constant as we build.
- Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL);
- if (isa<UndefValue>(C))
- return;
- // Propagate constant value
- markConstant(&I, C);
- } else if (OpSt.isConstantRange() && I.getDestTy()->isIntegerTy()) {
- auto &LV = getValueState(&I);
- ConstantRange OpRange = OpSt.getConstantRange();
- Type *DestTy = I.getDestTy();
+ // Check if we've called markEdgeExecutable on the edge yet. (We could
+ // be more aggressive and try to consider edges which haven't been marked
+ // yet, but there isn't any need.)
+ return KnownFeasibleEdges.count(Edge(From, To));
+}
+
+// visit Implementations - Something changed in this instruction, either an
+// operand made a transition, or the instruction is newly executable. Change
+// the value type of I to reflect these changes if appropriate. This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting values coming
+// from different branches, or if the PHI node merges in an overdefined
+// value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+// PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+// destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+// successors executable.
+void SCCPSolver::visitPHINode(PHINode &PN) {
+ // If this PN returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (PN.getType()->isStructTy())
+ return (void)markOverdefined(&PN);
+
+ if (getValueState(&PN).isOverdefined())
+ return; // Quick exit
+
+ // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+ // and slow us down a lot. Just mark them overdefined.
+ if (PN.getNumIncomingValues() > 64)
+ return (void)markOverdefined(&PN);
+
+ unsigned NumActiveIncoming = 0;
+
+ // Look at all of the executable operands of the PHI node. If any of them
+ // are overdefined, the PHI becomes overdefined as well. If they are all
+ // constant, and they agree with each other, the PHI becomes the identical
+ // constant. If they are constant and don't agree, the PHI is a constant
+ // range. If there are no executable operands, the PHI remains unknown.
+ ValueLatticeElement PhiState = getValueState(&PN);
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+
+ ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ NumActiveIncoming++;
+ if (PhiState.isOverdefined())
+ break;
+ }
+
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ mergeInValue(&PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ NumActiveIncoming + 1));
+ ValueLatticeElement &PhiStateRef = getValueState(&PN);
+ PhiStateRef.setNumRangeExtensions(
+ std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
+}
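
visitPHINode above merges only those incoming values whose edges are currently feasible and stops as soon as the result is overdefined; the per-incoming widening limit on ranges is omitted here. A stripped-down sketch of that merge with a self-contained toy lattice:

#include <optional>
#include <vector>

struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;
  bool mergeIn(const Lattice &O) {
    if (O.K == Unknown || K == Overdefined) return false;
    if (O.K == Overdefined || (K == Constant && O.C != C)) {
      K = Overdefined; C.reset(); return true;
    }
    if (K == Unknown) { K = Constant; C = O.C; return true; }
    return false;                    // merging in the same constant
  }
};

struct Incoming { Lattice Val; bool EdgeFeasible; };

// Merge the feasible incoming values of a phi, bailing out early once the
// result can no longer change, just as the code above does.
Lattice mergePhi(const std::vector<Incoming> &Ins) {
  Lattice Phi;
  for (const Incoming &In : Ins) {
    if (!In.EdgeFeasible)
      continue;                      // dead predecessors contribute nothing
    Phi.mergeIn(In.Val);
    if (Phi.K == Lattice::Overdefined)
      break;
  }
  return Phi;                        // stays Unknown if no predecessor is live
}

int main() {
  Lattice Five{Lattice::Constant, 5}, Seven{Lattice::Constant, 7};
  Lattice A = mergePhi({{Five, true}, {Seven, false}}); // constant 5
  Lattice B = mergePhi({{Five, true}, {Seven, true}});  // overdefined
  (void)A; (void)B;
}
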
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+ if (I.getNumOperands() == 0) return; // ret void
+
+ Function *F = I.getParent()->getParent();
+ Value *ResultOp = I.getOperand(0);
+
+ // If we are tracking the return value of this function, merge it in.
+ if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
+ auto TFRVI = TrackedRetVals.find(F);
+ if (TFRVI != TrackedRetVals.end()) {
+ mergeInValue(TFRVI->second, F, getValueState(ResultOp));
+ return;
+ }
+ }
+
+ // Handle functions that return multiple values.
+ if (!TrackedMultipleRetVals.empty()) {
+ if (auto *STy = dyn_cast<StructType>(ResultOp->getType()))
+ if (MRVFunctionsTracked.count(F))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
+ getStructValueState(ResultOp, i));
+ }
+}
+
+void SCCPSolver::visitTerminator(Instruction &TI) {
+ SmallVector<bool, 16> SuccFeasible;
+ getFeasibleSuccessors(TI, SuccFeasible);
+
+ BasicBlock *BB = TI.getParent();
+
+ // Mark all feasible successors executable.
+ for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+ if (SuccFeasible[i])
+ markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return;
+
+ ValueLatticeElement OpSt = getValueState(I.getOperand(0));
+ if (Constant *OpC = getConstant(OpSt)) {
+ // Fold the constant as we build.
+ Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL);
+ if (isa<UndefValue>(C))
+ return;
+ // Propagate constant value
+ markConstant(&I, C);
+ } else if (OpSt.isConstantRange() && I.getDestTy()->isIntegerTy()) {
+ auto &LV = getValueState(&I);
+ ConstantRange OpRange = OpSt.getConstantRange();
+ Type *DestTy = I.getDestTy();
// Vectors where all elements have the same known constant range are treated
// as a single constant range in the lattice. When bitcasting such vectors,
// there is a mis-match between the width of the lattice value (single
@@ -858,456 +858,456 @@ void SCCPSolver::visitCastInst(CastInst &I) {
OpRange.getBitWidth() < DL.getTypeSizeInBits(DestTy))
return (void)markOverdefined(&I);
- ConstantRange Res =
- OpRange.castOp(I.getOpcode(), DL.getTypeSizeInBits(DestTy));
- mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
- } else if (!OpSt.isUnknownOrUndef())
- markOverdefined(&I);
-}
-
-void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
- // If this returns a struct, mark all elements over defined, we don't track
- // structs in structs.
- if (EVI.getType()->isStructTy())
- return (void)markOverdefined(&EVI);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&EVI].isOverdefined())
- return (void)markOverdefined(&EVI);
-
- // If this is extracting from more than one level of struct, we don't know.
- if (EVI.getNumIndices() != 1)
- return (void)markOverdefined(&EVI);
-
- Value *AggVal = EVI.getAggregateOperand();
- if (AggVal->getType()->isStructTy()) {
- unsigned i = *EVI.idx_begin();
- ValueLatticeElement EltVal = getStructValueState(AggVal, i);
- mergeInValue(getValueState(&EVI), &EVI, EltVal);
- } else {
- // Otherwise, must be extracting from an array.
- return (void)markOverdefined(&EVI);
- }
-}
-
-void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
- auto *STy = dyn_cast<StructType>(IVI.getType());
- if (!STy)
- return (void)markOverdefined(&IVI);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (isOverdefined(ValueState[&IVI]))
- return (void)markOverdefined(&IVI);
-
- // If this has more than one index, we can't handle it, drive all results to
- // undef.
- if (IVI.getNumIndices() != 1)
- return (void)markOverdefined(&IVI);
-
- Value *Aggr = IVI.getAggregateOperand();
- unsigned Idx = *IVI.idx_begin();
-
- // Compute the result based on what we're inserting.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- // This passes through all values that aren't the inserted element.
- if (i != Idx) {
- ValueLatticeElement EltVal = getStructValueState(Aggr, i);
- mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
- continue;
- }
-
- Value *Val = IVI.getInsertedValueOperand();
- if (Val->getType()->isStructTy())
- // We don't track structs in structs.
- markOverdefined(getStructValueState(&IVI, i), &IVI);
- else {
- ValueLatticeElement InVal = getValueState(Val);
- mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
- }
- }
-}
-
-void SCCPSolver::visitSelectInst(SelectInst &I) {
- // If this select returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (I.getType()->isStructTy())
- return (void)markOverdefined(&I);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&I].isOverdefined())
- return (void)markOverdefined(&I);
-
- ValueLatticeElement CondValue = getValueState(I.getCondition());
- if (CondValue.isUnknownOrUndef())
- return;
-
- if (ConstantInt *CondCB = getConstantInt(CondValue)) {
- Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
- mergeInValue(&I, getValueState(OpVal));
- return;
- }
-
- // Otherwise, the condition is overdefined or a constant we can't evaluate.
- // See if we can produce something better than overdefined based on the T/F
- // value.
- ValueLatticeElement TVal = getValueState(I.getTrueValue());
- ValueLatticeElement FVal = getValueState(I.getFalseValue());
-
- bool Changed = ValueState[&I].mergeIn(TVal);
- Changed |= ValueState[&I].mergeIn(FVal);
- if (Changed)
- pushToWorkListMsg(ValueState[&I], &I);
-}
-
-// Handle Unary Operators.
-void SCCPSolver::visitUnaryOperator(Instruction &I) {
- ValueLatticeElement V0State = getValueState(I.getOperand(0));
-
- ValueLatticeElement &IV = ValueState[&I];
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (isOverdefined(IV))
- return (void)markOverdefined(&I);
-
- if (isConstant(V0State)) {
- Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V0State));
-
- // op Y -> undef.
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(IV, &I, C);
- }
-
- // If something is undef, wait for it to resolve.
- if (!isOverdefined(V0State))
- return;
-
- markOverdefined(&I);
-}
-
-// Handle Binary Operators.
-void SCCPSolver::visitBinaryOperator(Instruction &I) {
- ValueLatticeElement V1State = getValueState(I.getOperand(0));
- ValueLatticeElement V2State = getValueState(I.getOperand(1));
-
- ValueLatticeElement &IV = ValueState[&I];
- if (IV.isOverdefined())
- return;
-
- // If something is undef, wait for it to resolve.
- if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef())
- return;
-
- if (V1State.isOverdefined() && V2State.isOverdefined())
- return (void)markOverdefined(&I);
-
- // If either of the operands is a constant, try to fold it to a constant.
- // TODO: Use information from notconstant better.
- if ((V1State.isConstant() || V2State.isConstant())) {
- Value *V1 = isConstant(V1State) ? getConstant(V1State) : I.getOperand(0);
- Value *V2 = isConstant(V2State) ? getConstant(V2State) : I.getOperand(1);
- Value *R = SimplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL));
- auto *C = dyn_cast_or_null<Constant>(R);
- if (C) {
- // X op Y -> undef.
- if (isa<UndefValue>(C))
- return;
- // Conservatively assume that the result may be based on operands that may
- // be undef. Note that we use mergeInValue to combine the constant with
- // the existing lattice value for I, as different constants might be found
-      // after one of the operands goes to overdefined, e.g. due to one operand
- // being a special floating value.
- ValueLatticeElement NewV;
- NewV.markConstant(C, /*MayIncludeUndef=*/true);
- return (void)mergeInValue(&I, NewV);
- }
- }
-
- // Only use ranges for binary operators on integers.
- if (!I.getType()->isIntegerTy())
- return markOverdefined(&I);
-
- // Try to simplify to a constant range.
- ConstantRange A = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
- ConstantRange B = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
- if (V1State.isConstantRange())
- A = V1State.getConstantRange();
- if (V2State.isConstantRange())
- B = V2State.getConstantRange();
-
- ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B);
- mergeInValue(&I, ValueLatticeElement::getRange(R));
-
- // TODO: Currently we do not exploit special values that produce something
- // better than overdefined with an overdefined operand for vector or floating
- // point types, like and <4 x i32> overdefined, zeroinitializer.
-}
-
-// Handle ICmpInst instruction.
-void SCCPSolver::visitCmpInst(CmpInst &I) {
- // Do not cache this lookup, getValueState calls later in the function might
- // invalidate the reference.
- if (isOverdefined(ValueState[&I]))
- return (void)markOverdefined(&I);
-
- Value *Op1 = I.getOperand(0);
- Value *Op2 = I.getOperand(1);
-
- // For parameters, use ParamState which includes constant range info if
- // available.
- auto V1State = getValueState(Op1);
- auto V2State = getValueState(Op2);
-
- Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
- if (C) {
- if (isa<UndefValue>(C))
- return;
- ValueLatticeElement CV;
- CV.markConstant(C);
- mergeInValue(&I, CV);
- return;
- }
-
-  // If operands are still unknown, wait for them to resolve.
- if ((V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) &&
- !isConstant(ValueState[&I]))
- return;
-
- markOverdefined(&I);
-}
-
-// Handle getelementptr instructions. If all operands are constants then we
-// can turn this into a getelementptr ConstantExpr.
-void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
- if (isOverdefined(ValueState[&I]))
- return (void)markOverdefined(&I);
-
- SmallVector<Constant*, 8> Operands;
- Operands.reserve(I.getNumOperands());
-
- for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
- ValueLatticeElement State = getValueState(I.getOperand(i));
- if (State.isUnknownOrUndef())
- return; // Operands are not resolved yet.
-
- if (isOverdefined(State))
- return (void)markOverdefined(&I);
-
- if (Constant *C = getConstant(State)) {
- Operands.push_back(C);
- continue;
- }
-
- return (void)markOverdefined(&I);
- }
-
- Constant *Ptr = Operands[0];
- auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
- Constant *C =
- ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices);
- if (isa<UndefValue>(C))
- return;
- markConstant(&I, C);
-}
-
-void SCCPSolver::visitStoreInst(StoreInst &SI) {
- // If this store is of a struct, ignore it.
- if (SI.getOperand(0)->getType()->isStructTy())
- return;
-
- if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
- return;
-
- GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
- auto I = TrackedGlobals.find(GV);
- if (I == TrackedGlobals.end())
- return;
-
- // Get the value we are storing into the global, then merge it.
- mergeInValue(I->second, GV, getValueState(SI.getOperand(0)),
- ValueLatticeElement::MergeOptions().setCheckWiden(false));
- if (I->second.isOverdefined())
- TrackedGlobals.erase(I); // No need to keep tracking this!
-}
-
-static ValueLatticeElement getValueFromMetadata(const Instruction *I) {
- if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range))
- if (I->getType()->isIntegerTy())
- return ValueLatticeElement::getRange(
- getConstantRangeFromMetadata(*Ranges));
+ ConstantRange Res =
+ OpRange.castOp(I.getOpcode(), DL.getTypeSizeInBits(DestTy));
+ mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
+ } else if (!OpSt.isUnknownOrUndef())
+ markOverdefined(&I);
+}
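
The range path of visitCastInst above defers to ConstantRange::castOp; the intuition for the simplest cases is that a zero-extension keeps the same unsigned bounds at a wider width, while a truncation is only exact when the range already fits. A toy unsigned-interval version of those two casts (an illustration only, not LLVM's ConstantRange):

#include <cassert>
#include <cstdint>

// Toy unsigned interval [Lo, Hi] tracked together with its bit width.
struct Range { unsigned Bits; uint64_t Lo, Hi; };

uint64_t maxValue(unsigned Bits) {
  return Bits >= 64 ? ~0ULL : (1ULL << Bits) - 1;
}

// zext: the numeric bounds are unchanged, only the width grows.
Range zext(const Range &R, unsigned NewBits) {
  assert(NewBits > R.Bits);
  return {NewBits, R.Lo, R.Hi};
}

// trunc: keep the bounds if they still fit, otherwise fall back to the
// full (conservative) range at the narrower width.
Range trunc(const Range &R, unsigned NewBits) {
  assert(NewBits < R.Bits);
  if (R.Hi <= maxValue(NewBits))
    return {NewBits, R.Lo, R.Hi};
  return {NewBits, 0, maxValue(NewBits)};
}

int main() {
  Range R{8, 10, 20};                     // an i8 known to be in [10, 20]
  Range W = zext(R, 32);                  // still [10, 20], now as an i32
  Range N = trunc(Range{32, 0, 300}, 8);  // does not fit: becomes [0, 255]
  (void)W; (void)N;
}
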
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+ // If this returns a struct, mark all elements over defined, we don't track
+ // structs in structs.
+ if (EVI.getType()->isStructTy())
+ return (void)markOverdefined(&EVI);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&EVI].isOverdefined())
+ return (void)markOverdefined(&EVI);
+
+ // If this is extracting from more than one level of struct, we don't know.
+ if (EVI.getNumIndices() != 1)
+ return (void)markOverdefined(&EVI);
+
+ Value *AggVal = EVI.getAggregateOperand();
+ if (AggVal->getType()->isStructTy()) {
+ unsigned i = *EVI.idx_begin();
+ ValueLatticeElement EltVal = getStructValueState(AggVal, i);
+ mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ } else {
+ // Otherwise, must be extracting from an array.
+ return (void)markOverdefined(&EVI);
+ }
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+ auto *STy = dyn_cast<StructType>(IVI.getType());
+ if (!STy)
+ return (void)markOverdefined(&IVI);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (isOverdefined(ValueState[&IVI]))
+ return (void)markOverdefined(&IVI);
+
+ // If this has more than one index, we can't handle it, drive all results to
+ // undef.
+ if (IVI.getNumIndices() != 1)
+ return (void)markOverdefined(&IVI);
+
+ Value *Aggr = IVI.getAggregateOperand();
+ unsigned Idx = *IVI.idx_begin();
+
+ // Compute the result based on what we're inserting.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ // This passes through all values that aren't the inserted element.
+ if (i != Idx) {
+ ValueLatticeElement EltVal = getStructValueState(Aggr, i);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
+ continue;
+ }
+
+ Value *Val = IVI.getInsertedValueOperand();
+ if (Val->getType()->isStructTy())
+ // We don't track structs in structs.
+ markOverdefined(getStructValueState(&IVI, i), &IVI);
+ else {
+ ValueLatticeElement InVal = getValueState(Val);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
+ }
+ }
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+ // If this select returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (I.getType()->isStructTy())
+ return (void)markOverdefined(&I);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return (void)markOverdefined(&I);
+
+ ValueLatticeElement CondValue = getValueState(I.getCondition());
+ if (CondValue.isUnknownOrUndef())
+ return;
+
+ if (ConstantInt *CondCB = getConstantInt(CondValue)) {
+ Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
+ mergeInValue(&I, getValueState(OpVal));
+ return;
+ }
+
+ // Otherwise, the condition is overdefined or a constant we can't evaluate.
+ // See if we can produce something better than overdefined based on the T/F
+ // value.
+ ValueLatticeElement TVal = getValueState(I.getTrueValue());
+ ValueLatticeElement FVal = getValueState(I.getFalseValue());
+
+ bool Changed = ValueState[&I].mergeIn(TVal);
+ Changed |= ValueState[&I].mergeIn(FVal);
+ if (Changed)
+ pushToWorkListMsg(ValueState[&I], &I);
+}
+
+// Handle Unary Operators.
+void SCCPSolver::visitUnaryOperator(Instruction &I) {
+ ValueLatticeElement V0State = getValueState(I.getOperand(0));
+
+ ValueLatticeElement &IV = ValueState[&I];
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (isOverdefined(IV))
+ return (void)markOverdefined(&I);
+
+ if (isConstant(V0State)) {
+ Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V0State));
+
+ // op Y -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return (void)markConstant(IV, &I, C);
+ }
+
+ // If something is undef, wait for it to resolve.
+ if (!isOverdefined(V0State))
+ return;
+
+ markOverdefined(&I);
+}
+
+// Handle Binary Operators.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+ ValueLatticeElement V1State = getValueState(I.getOperand(0));
+ ValueLatticeElement V2State = getValueState(I.getOperand(1));
+
+ ValueLatticeElement &IV = ValueState[&I];
+ if (IV.isOverdefined())
+ return;
+
+ // If something is undef, wait for it to resolve.
+ if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef())
+ return;
+
+ if (V1State.isOverdefined() && V2State.isOverdefined())
+ return (void)markOverdefined(&I);
+
+ // If either of the operands is a constant, try to fold it to a constant.
+ // TODO: Use information from notconstant better.
+ if ((V1State.isConstant() || V2State.isConstant())) {
+ Value *V1 = isConstant(V1State) ? getConstant(V1State) : I.getOperand(0);
+ Value *V2 = isConstant(V2State) ? getConstant(V2State) : I.getOperand(1);
+ Value *R = SimplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL));
+ auto *C = dyn_cast_or_null<Constant>(R);
+ if (C) {
+ // X op Y -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ // Conservatively assume that the result may be based on operands that may
+ // be undef. Note that we use mergeInValue to combine the constant with
+ // the existing lattice value for I, as different constants might be found
+      // after one of the operands goes to overdefined, e.g. due to one operand
+ // being a special floating value.
+ ValueLatticeElement NewV;
+ NewV.markConstant(C, /*MayIncludeUndef=*/true);
+ return (void)mergeInValue(&I, NewV);
+ }
+ }
+
+ // Only use ranges for binary operators on integers.
+ if (!I.getType()->isIntegerTy())
+ return markOverdefined(&I);
+
+ // Try to simplify to a constant range.
+ ConstantRange A = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
+ ConstantRange B = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
+ if (V1State.isConstantRange())
+ A = V1State.getConstantRange();
+ if (V2State.isConstantRange())
+ B = V2State.getConstantRange();
+
+ ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B);
+ mergeInValue(&I, ValueLatticeElement::getRange(R));
+
+ // TODO: Currently we do not exploit special values that produce something
+ // better than overdefined with an overdefined operand for vector or floating
+ // point types, like and <4 x i32> overdefined, zeroinitializer.
+}
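
The fallback path of visitBinaryOperator above combines the operand ranges with ConstantRange::binaryOp. For an unsigned add the underlying idea is plain interval arithmetic that gives up and returns the full range whenever the sum might wrap, which is always a sound (if imprecise) answer. A toy version under those assumptions, with both operands at the same width:

#include <cstdint>
#include <iostream>

// Toy unsigned interval [Lo, Hi] at a fixed width.
struct Range { unsigned Bits; uint64_t Lo, Hi; };

uint64_t maxValue(unsigned Bits) {
  return Bits >= 64 ? ~0ULL : (1ULL << Bits) - 1;
}

// Unsigned add of two intervals: exact when the upper bounds cannot wrap,
// otherwise return the full range, which is always sound.
Range addRanges(const Range &A, const Range &B) {
  uint64_t Max = maxValue(A.Bits);
  if (A.Hi <= Max - B.Hi)                      // Hi + Hi stays in range
    return {A.Bits, A.Lo + B.Lo, A.Hi + B.Hi};
  return {A.Bits, 0, Max};                     // possible wrap: give up
}

int main() {
  Range R = addRanges({8, 1, 10}, {8, 2, 20}); // -> [3, 30]
  std::cout << "[" << R.Lo << ", " << R.Hi << "]\n";
}
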
+
+// Handle ICmpInst instruction.
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+ // Do not cache this lookup, getValueState calls later in the function might
+ // invalidate the reference.
+ if (isOverdefined(ValueState[&I]))
+ return (void)markOverdefined(&I);
+
+ Value *Op1 = I.getOperand(0);
+ Value *Op2 = I.getOperand(1);
+
+ // For parameters, use ParamState which includes constant range info if
+ // available.
+ auto V1State = getValueState(Op1);
+ auto V2State = getValueState(Op2);
+
+ Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
+ if (C) {
+ if (isa<UndefValue>(C))
+ return;
+ ValueLatticeElement CV;
+ CV.markConstant(C);
+ mergeInValue(&I, CV);
+ return;
+ }
+
+  // If operands are still unknown, wait for them to resolve.
+ if ((V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) &&
+ !isConstant(ValueState[&I]))
+ return;
+
+ markOverdefined(&I);
+}
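
getCompare in visitCmpInst above can fold a comparison outright when the operand lattice values are precise enough; with unsigned ranges the interesting outcomes are "always true", "always false", or no answer. A toy fold of ult over such intervals (illustrative names and types):

#include <cstdint>
#include <iostream>
#include <optional>

struct Range { uint64_t Lo, Hi; };   // toy unsigned interval [Lo, Hi]

// Fold "A ult B": definitely true if every value of A is below every value
// of B, definitely false if no value of A can be, otherwise no constant.
std::optional<bool> foldULT(const Range &A, const Range &B) {
  if (A.Hi < B.Lo) return true;
  if (A.Lo >= B.Hi) return false;
  return std::nullopt;
}

int main() {
  auto T = foldULT({0, 9}, {10, 20});   // true: 9 is still below 10
  auto F = foldULT({10, 20}, {0, 10});  // false: lhs >= 10, rhs <= 10
  auto U = foldULT({0, 15}, {10, 20});  // overlapping ranges: unknown
  std::cout << T.has_value() << F.has_value() << U.has_value() << "\n";
}
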
+
+// Handle getelementptr instructions. If all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+ if (isOverdefined(ValueState[&I]))
+ return (void)markOverdefined(&I);
+
+ SmallVector<Constant*, 8> Operands;
+ Operands.reserve(I.getNumOperands());
+
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ ValueLatticeElement State = getValueState(I.getOperand(i));
+ if (State.isUnknownOrUndef())
+ return; // Operands are not resolved yet.
+
+ if (isOverdefined(State))
+ return (void)markOverdefined(&I);
+
+ if (Constant *C = getConstant(State)) {
+ Operands.push_back(C);
+ continue;
+ }
+
+ return (void)markOverdefined(&I);
+ }
+
+ Constant *Ptr = Operands[0];
+ auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
+ Constant *C =
+ ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices);
+ if (isa<UndefValue>(C))
+ return;
+ markConstant(&I, C);
+}
+
+void SCCPSolver::visitStoreInst(StoreInst &SI) {
+ // If this store is of a struct, ignore it.
+ if (SI.getOperand(0)->getType()->isStructTy())
+ return;
+
+ if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+ return;
+
+ GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+ auto I = TrackedGlobals.find(GV);
+ if (I == TrackedGlobals.end())
+ return;
+
+ // Get the value we are storing into the global, then merge it.
+ mergeInValue(I->second, GV, getValueState(SI.getOperand(0)),
+ ValueLatticeElement::MergeOptions().setCheckWiden(false));
+ if (I->second.isOverdefined())
+ TrackedGlobals.erase(I); // No need to keep tracking this!
+}
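
visitStoreInst above folds each store into the lattice value kept for a tracked global and drops the global from the map once it becomes overdefined, since no further store can improve the result. A condensed sketch of that bookkeeping with a toy lattice keyed by name:

#include <map>
#include <optional>
#include <string>

struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;
  void mergeIn(int V) {
    if (K == Overdefined) return;
    if (K == Constant && *C != V) { K = Overdefined; C.reset(); return; }
    K = Constant; C = V;
  }
};

// Globals for which we are still trying to prove a single constant value.
std::map<std::string, Lattice> TrackedGlobals;

void visitStore(const std::string &Global, int StoredValue) {
  auto It = TrackedGlobals.find(Global);
  if (It == TrackedGlobals.end())
    return;                         // not a global we track
  It->second.mergeIn(StoredValue);
  if (It->second.K == Lattice::Overdefined)
    TrackedGlobals.erase(It);       // conflicting stores: stop tracking
}

int main() {
  TrackedGlobals["g"] = {};
  visitStore("g", 1);               // g is provisionally the constant 1
  visitStore("g", 2);               // conflict: g is dropped from the map
  return TrackedGlobals.count("g"); // 0
}
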
+
+static ValueLatticeElement getValueFromMetadata(const Instruction *I) {
+ if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range))
+ if (I->getType()->isIntegerTy())
+ return ValueLatticeElement::getRange(
+ getConstantRangeFromMetadata(*Ranges));
if (I->hasMetadata(LLVMContext::MD_nonnull))
return ValueLatticeElement::getNot(
ConstantPointerNull::get(cast<PointerType>(I->getType())));
- return ValueLatticeElement::getOverdefined();
-}
-
-// Handle load instructions. If the operand is a constant pointer to a constant
-// global, we can replace the load with the loaded constant value!
-void SCCPSolver::visitLoadInst(LoadInst &I) {
- // If this load is of a struct or the load is volatile, just mark the result
- // as overdefined.
- if (I.getType()->isStructTy() || I.isVolatile())
- return (void)markOverdefined(&I);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&I].isOverdefined())
- return (void)markOverdefined(&I);
-
- ValueLatticeElement PtrVal = getValueState(I.getOperand(0));
- if (PtrVal.isUnknownOrUndef())
- return; // The pointer is not resolved yet!
-
- ValueLatticeElement &IV = ValueState[&I];
-
- if (isConstant(PtrVal)) {
- Constant *Ptr = getConstant(PtrVal);
-
- // load null is undefined.
- if (isa<ConstantPointerNull>(Ptr)) {
- if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
- return (void)markOverdefined(IV, &I);
- else
- return;
- }
-
- // Transform load (constant global) into the value loaded.
- if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
- if (!TrackedGlobals.empty()) {
- // If we are tracking this global, merge in the known value for it.
- auto It = TrackedGlobals.find(GV);
- if (It != TrackedGlobals.end()) {
- mergeInValue(IV, &I, It->second, getMaxWidenStepsOpts());
- return;
- }
- }
- }
-
- // Transform load from a constant into a constant if possible.
- if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(IV, &I, C);
- }
- }
-
- // Fall back to metadata.
- mergeInValue(&I, getValueFromMetadata(&I));
-}
-
-void SCCPSolver::visitCallBase(CallBase &CB) {
- handleCallResult(CB);
- handleCallArguments(CB);
-}
-
-void SCCPSolver::handleCallOverdefined(CallBase &CB) {
- Function *F = CB.getCalledFunction();
-
- // Void return and not tracking callee, just bail.
- if (CB.getType()->isVoidTy())
- return;
-
- // Always mark struct return as overdefined.
- if (CB.getType()->isStructTy())
- return (void)markOverdefined(&CB);
-
- // Otherwise, if we have a single return value case, and if the function is
- // a declaration, maybe we can constant fold it.
- if (F && F->isDeclaration() && canConstantFoldCallTo(&CB, F)) {
- SmallVector<Constant *, 8> Operands;
- for (auto AI = CB.arg_begin(), E = CB.arg_end(); AI != E; ++AI) {
- if (AI->get()->getType()->isStructTy())
- return markOverdefined(&CB); // Can't handle struct args.
- ValueLatticeElement State = getValueState(*AI);
-
- if (State.isUnknownOrUndef())
- return; // Operands are not resolved yet.
- if (isOverdefined(State))
- return (void)markOverdefined(&CB);
- assert(isConstant(State) && "Unknown state!");
- Operands.push_back(getConstant(State));
- }
-
- if (isOverdefined(getValueState(&CB)))
- return (void)markOverdefined(&CB);
-
- // If we can constant fold this, mark the result of the call as a
- // constant.
- if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) {
- // call -> undef.
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(&CB, C);
- }
- }
-
- // Fall back to metadata.
- mergeInValue(&CB, getValueFromMetadata(&CB));
-}
-
-void SCCPSolver::handleCallArguments(CallBase &CB) {
- Function *F = CB.getCalledFunction();
- // If this is a local function that doesn't have its address taken, mark its
- // entry block executable and merge in the actual arguments to the call into
- // the formal arguments of the function.
- if (!TrackingIncomingArguments.empty() &&
- TrackingIncomingArguments.count(F)) {
- MarkBlockExecutable(&F->front());
-
- // Propagate information from this call site into the callee.
- auto CAI = CB.arg_begin();
- for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
- ++AI, ++CAI) {
- // If this argument is byval, and if the function is not readonly, there
- // will be an implicit copy formed of the input aggregate.
- if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
- markOverdefined(&*AI);
- continue;
- }
-
- if (auto *STy = dyn_cast<StructType>(AI->getType())) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- ValueLatticeElement CallArg = getStructValueState(*CAI, i);
- mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
- getMaxWidenStepsOpts());
- }
- } else
- mergeInValue(&*AI, getValueState(*CAI), getMaxWidenStepsOpts());
- }
- }
-}
-
-void SCCPSolver::handleCallResult(CallBase &CB) {
- Function *F = CB.getCalledFunction();
-
- if (auto *II = dyn_cast<IntrinsicInst>(&CB)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
- if (ValueState[&CB].isOverdefined())
- return;
-
- Value *CopyOf = CB.getOperand(0);
- ValueLatticeElement CopyOfVal = getValueState(CopyOf);
- auto *PI = getPredicateInfoFor(&CB);
- assert(PI && "Missing predicate info for ssa.copy");
-
+ return ValueLatticeElement::getOverdefined();
+}
+
+// Handle load instructions. If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+ // If this load is of a struct or the load is volatile, just mark the result
+ // as overdefined.
+ if (I.getType()->isStructTy() || I.isVolatile())
+ return (void)markOverdefined(&I);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return (void)markOverdefined(&I);
+
+ ValueLatticeElement PtrVal = getValueState(I.getOperand(0));
+ if (PtrVal.isUnknownOrUndef())
+ return; // The pointer is not resolved yet!
+
+ ValueLatticeElement &IV = ValueState[&I];
+
+ if (isConstant(PtrVal)) {
+ Constant *Ptr = getConstant(PtrVal);
+
+ // load null is undefined.
+ if (isa<ConstantPointerNull>(Ptr)) {
+ if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
+ return (void)markOverdefined(IV, &I);
+ else
+ return;
+ }
+
+ // Transform load (constant global) into the value loaded.
+ if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ if (!TrackedGlobals.empty()) {
+ // If we are tracking this global, merge in the known value for it.
+ auto It = TrackedGlobals.find(GV);
+ if (It != TrackedGlobals.end()) {
+ mergeInValue(IV, &I, It->second, getMaxWidenStepsOpts());
+ return;
+ }
+ }
+ }
+
+ // Transform load from a constant into a constant if possible.
+ if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
+ if (isa<UndefValue>(C))
+ return;
+ return (void)markConstant(IV, &I, C);
+ }
+ }
+
+ // Fall back to metadata.
+ mergeInValue(&I, getValueFromMetadata(&I));
+}
+
+void SCCPSolver::visitCallBase(CallBase &CB) {
+ handleCallResult(CB);
+ handleCallArguments(CB);
+}
+
+void SCCPSolver::handleCallOverdefined(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+
+ // Void return and not tracking callee, just bail.
+ if (CB.getType()->isVoidTy())
+ return;
+
+ // Always mark struct return as overdefined.
+ if (CB.getType()->isStructTy())
+ return (void)markOverdefined(&CB);
+
+ // Otherwise, if we have a single return value case, and if the function is
+ // a declaration, maybe we can constant fold it.
+ if (F && F->isDeclaration() && canConstantFoldCallTo(&CB, F)) {
+ SmallVector<Constant *, 8> Operands;
+ for (auto AI = CB.arg_begin(), E = CB.arg_end(); AI != E; ++AI) {
+ if (AI->get()->getType()->isStructTy())
+ return markOverdefined(&CB); // Can't handle struct args.
+ ValueLatticeElement State = getValueState(*AI);
+
+ if (State.isUnknownOrUndef())
+ return; // Operands are not resolved yet.
+ if (isOverdefined(State))
+ return (void)markOverdefined(&CB);
+ assert(isConstant(State) && "Unknown state!");
+ Operands.push_back(getConstant(State));
+ }
+
+ if (isOverdefined(getValueState(&CB)))
+ return (void)markOverdefined(&CB);
+
+ // If we can constant fold this, mark the result of the call as a
+ // constant.
+ if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) {
+ // call -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return (void)markConstant(&CB, C);
+ }
+ }
+
+ // Fall back to metadata.
+ mergeInValue(&CB, getValueFromMetadata(&CB));
+}
+
+void SCCPSolver::handleCallArguments(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+ // If this is a local function that doesn't have its address taken, mark its
+ // entry block executable and merge in the actual arguments to the call into
+ // the formal arguments of the function.
+ if (!TrackingIncomingArguments.empty() &&
+ TrackingIncomingArguments.count(F)) {
+ MarkBlockExecutable(&F->front());
+
+ // Propagate information from this call site into the callee.
+ auto CAI = CB.arg_begin();
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++CAI) {
+ // If this argument is byval, and if the function is not readonly, there
+ // will be an implicit copy formed of the input aggregate.
+ if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+ markOverdefined(&*AI);
+ continue;
+ }
+
+ if (auto *STy = dyn_cast<StructType>(AI->getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement CallArg = getStructValueState(*CAI, i);
+ mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
+ getMaxWidenStepsOpts());
+ }
+ } else
+ mergeInValue(&*AI, getValueState(*CAI), getMaxWidenStepsOpts());
+ }
+ }
+}
+
+void SCCPSolver::handleCallResult(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+
+ if (auto *II = dyn_cast<IntrinsicInst>(&CB)) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+ if (ValueState[&CB].isOverdefined())
+ return;
+
+ Value *CopyOf = CB.getOperand(0);
+ ValueLatticeElement CopyOfVal = getValueState(CopyOf);
+ auto *PI = getPredicateInfoFor(&CB);
+ assert(PI && "Missing predicate info for ssa.copy");
+
const Optional<PredicateConstraint> &Constraint = PI->getConstraint();
if (!Constraint) {
- mergeInValue(ValueState[&CB], &CB, CopyOfVal);
- return;
- }
-
+ mergeInValue(ValueState[&CB], &CB, CopyOfVal);
+ return;
+ }
+
CmpInst::Predicate Pred = Constraint->Predicate;
Value *OtherOp = Constraint->OtherOp;
-
+
// Wait until OtherOp is resolved.
if (getValueState(OtherOp).isUnknown()) {
addAdditionalUser(OtherOp, &CB);
- return;
- }
-
+ return;
+ }
+
       // TODO: Actually flip MayIncludeUndef for the created range to false,
       // once most places in the optimizer respect the rule that branches on
       // undef/poison are UB. The reason why the new range cannot be
@@ -1318,42 +1318,42 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
// i32, %a, i32_max). For the latter overdefined/empty range will be
// inferred, but the branch will get folded accordingly anyways.
bool MayIncludeUndef = !isa<PredicateAssume>(PI);
-
+
ValueLatticeElement CondVal = getValueState(OtherOp);
- ValueLatticeElement &IV = ValueState[&CB];
- if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
- auto ImposedCR =
- ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
-
- // Get the range imposed by the condition.
- if (CondVal.isConstantRange())
- ImposedCR = ConstantRange::makeAllowedICmpRegion(
- Pred, CondVal.getConstantRange());
-
- // Combine range info for the original value with the new range from the
- // condition.
- auto CopyOfCR = CopyOfVal.isConstantRange()
- ? CopyOfVal.getConstantRange()
- : ConstantRange::getFull(
- DL.getTypeSizeInBits(CopyOf->getType()));
- auto NewCR = ImposedCR.intersectWith(CopyOfCR);
- // If the existing information is != x, do not use the information from
- // a chained predicate, as the != x information is more likely to be
- // helpful in practice.
- if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
- NewCR = CopyOfCR;
-
+ ValueLatticeElement &IV = ValueState[&CB];
+ if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
+ auto ImposedCR =
+ ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
+
+ // Get the range imposed by the condition.
+ if (CondVal.isConstantRange())
+ ImposedCR = ConstantRange::makeAllowedICmpRegion(
+ Pred, CondVal.getConstantRange());
+
+ // Combine range info for the original value with the new range from the
+ // condition.
+ auto CopyOfCR = CopyOfVal.isConstantRange()
+ ? CopyOfVal.getConstantRange()
+ : ConstantRange::getFull(
+ DL.getTypeSizeInBits(CopyOf->getType()));
+ auto NewCR = ImposedCR.intersectWith(CopyOfCR);
+ // If the existing information is != x, do not use the information from
+ // a chained predicate, as the != x information is more likely to be
+ // helpful in practice.
+ if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
+ NewCR = CopyOfCR;
+
addAdditionalUser(OtherOp, &CB);
- mergeInValue(
- IV, &CB,
+ mergeInValue(
+ IV, &CB,
ValueLatticeElement::getRange(NewCR, MayIncludeUndef));
- return;
- } else if (Pred == CmpInst::ICMP_EQ && CondVal.isConstant()) {
- // For non-integer values or integer constant expressions, only
- // propagate equal constants.
+ return;
+ } else if (Pred == CmpInst::ICMP_EQ && CondVal.isConstant()) {
+ // For non-integer values or integer constant expressions, only
+ // propagate equal constants.
addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB, CondVal);
- return;
+ mergeInValue(IV, &CB, CondVal);
+ return;
} else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant() &&
!MayIncludeUndef) {
// Propagate inequalities.
@@ -1361,10 +1361,10 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
mergeInValue(IV, &CB,
ValueLatticeElement::getNot(CondVal.getConstant()));
return;
- }
-
- return (void)mergeInValue(IV, &CB, CopyOfVal);
- }
+ }
+
+ return (void)mergeInValue(IV, &CB, CopyOfVal);
+ }
if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) {
// Compute result range for intrinsics supported by ConstantRange.
@@ -1384,492 +1384,492 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges);
return (void)mergeInValue(II, ValueLatticeElement::getRange(Result));
}
- }
-
- // The common case is that we aren't tracking the callee, either because we
- // are not doing interprocedural analysis or the callee is indirect, or is
- // external. Handle these cases first.
- if (!F || F->isDeclaration())
- return handleCallOverdefined(CB);
-
- // If this is a single/zero retval case, see if we're tracking the function.
- if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
- if (!MRVFunctionsTracked.count(F))
- return handleCallOverdefined(CB); // Not tracking this callee.
-
- // If we are tracking this callee, propagate the result of the function
- // into this call site.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- mergeInValue(getStructValueState(&CB, i), &CB,
- TrackedMultipleRetVals[std::make_pair(F, i)],
- getMaxWidenStepsOpts());
- } else {
- auto TFRVI = TrackedRetVals.find(F);
- if (TFRVI == TrackedRetVals.end())
- return handleCallOverdefined(CB); // Not tracking this callee.
-
- // If so, propagate the return value of the callee into this call result.
- mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
- }
-}
-
-void SCCPSolver::Solve() {
- // Process the work lists until they are empty!
- while (!BBWorkList.empty() || !InstWorkList.empty() ||
- !OverdefinedInstWorkList.empty()) {
- // Process the overdefined instruction's work list first, which drives other
- // things to overdefined more quickly.
- while (!OverdefinedInstWorkList.empty()) {
- Value *I = OverdefinedInstWorkList.pop_back_val();
-
- LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
-
- // "I" got into the work list because it either made the transition from
- // bottom to constant, or to overdefined.
- //
- // Anything on this worklist that is overdefined need not be visited
-      // since all of its users will have already been marked as overdefined.
- // Update all of the users of this instruction's value.
- //
- markUsersAsChanged(I);
- }
-
- // Process the instruction work list.
- while (!InstWorkList.empty()) {
- Value *I = InstWorkList.pop_back_val();
-
- LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
-
- // "I" got into the work list because it made the transition from undef to
- // constant.
- //
- // Anything on this worklist that is overdefined need not be visited
- // since all of its users will have already been marked as overdefined.
- // Update all of the users of this instruction's value.
- //
- if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
- markUsersAsChanged(I);
- }
-
- // Process the basic block work list.
- while (!BBWorkList.empty()) {
+ }
+
+ // The common case is that we aren't tracking the callee, either because we
+ // are not doing interprocedural analysis or the callee is indirect, or is
+ // external. Handle these cases first.
+ if (!F || F->isDeclaration())
+ return handleCallOverdefined(CB);
+
+ // If this is a single/zero retval case, see if we're tracking the function.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (!MRVFunctionsTracked.count(F))
+ return handleCallOverdefined(CB); // Not tracking this callee.
+
+ // If we are tracking this callee, propagate the result of the function
+ // into this call site.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(getStructValueState(&CB, i), &CB,
+ TrackedMultipleRetVals[std::make_pair(F, i)],
+ getMaxWidenStepsOpts());
+ } else {
+ auto TFRVI = TrackedRetVals.find(F);
+ if (TFRVI == TrackedRetVals.end())
+ return handleCallOverdefined(CB); // Not tracking this callee.
+
+ // If so, propagate the return value of the callee into this call result.
+ mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
+ }
+}
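
handleCallResult's ssa.copy branch refines a value's known range by intersecting it with the region implied by the dominating predicate. A minimal standalone version of that intersection step, using a hypothetical closed-interval Interval type instead of LLVM's ConstantRange, might look like this:

    // Intersect the range already known for a value with the range implied by
    // a dominating predicate, e.g. "x < 50" on a value known to lie in [0, 100].
    // Closed intervals; Lo > Hi means the interval is empty.
    #include <algorithm>
    #include <climits>
    #include <cstdio>

    struct Interval {
      long Lo, Hi; // inclusive bounds
      bool empty() const { return Lo > Hi; }
      Interval intersectWith(Interval O) const {
        return {std::max(Lo, O.Lo), std::min(Hi, O.Hi)};
      }
    };

    // Region allowed by a signed "v < C" comparison (rough analogue of
    // makeAllowedICmpRegion for ICMP_SLT; assumes C > LONG_MIN).
    static Interval allowedSlt(long C) { return {LONG_MIN, C - 1}; }

    int main() {
      Interval Known{0, 100};            // range known before the predicate
      Interval Imposed = allowedSlt(50); // branch condition: v < 50
      Interval Refined = Known.intersectWith(Imposed);
      std::printf("[%ld, %ld]\n", Refined.Lo, Refined.Hi); // prints [0, 49]
    }
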
+
+void SCCPSolver::Solve() {
+ // Process the work lists until they are empty!
+ while (!BBWorkList.empty() || !InstWorkList.empty() ||
+ !OverdefinedInstWorkList.empty()) {
+ // Process the overdefined instruction's work list first, which drives other
+ // things to overdefined more quickly.
+ while (!OverdefinedInstWorkList.empty()) {
+ Value *I = OverdefinedInstWorkList.pop_back_val();
+
+ LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+
+ // "I" got into the work list because it either made the transition from
+ // bottom to constant, or to overdefined.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ markUsersAsChanged(I);
+ }
+
+ // Process the instruction work list.
+ while (!InstWorkList.empty()) {
+ Value *I = InstWorkList.pop_back_val();
+
+ LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+
+ // "I" got into the work list because it made the transition from undef to
+ // constant.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+ // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
+ markUsersAsChanged(I);
+ }
+
+ // Process the basic block work list.
+ while (!BBWorkList.empty()) {
BasicBlock *BB = BBWorkList.pop_back_val();
-
- LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
-
- // Notify all instructions in this basic block that they are newly
- // executable.
- visit(BB);
- }
- }
-}
-
-/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
-/// that branches on undef values cannot reach any of their successors.
-/// However, this is not a safe assumption. After we solve dataflow, this
-/// method should be used to handle this. If this returns true, the solver
-/// should be rerun.
-///
-/// This method handles this by finding an unresolved branch and marking one
-/// of the edges from the block as feasible, even though the condition
-/// doesn't say it would otherwise be. This allows SCCP to find the rest of the
-/// CFG and only slightly pessimizes the analysis results (by marking one,
-/// potentially infeasible, edge feasible). This cannot usefully modify the
-/// constraints on the condition of the branch, as that would impact other users
-/// of the value.
-///
-/// This scan also checks for values that use undefs. It conservatively marks
-/// them as overdefined.
-bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+
+ LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+
+ // Notify all instructions in this basic block that they are newly
+ // executable.
+ visit(BB);
+ }
+ }
+}
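
Solve() is a classic worklist fixpoint: pop a value whose lattice state changed, revisit its users, and repeat until all three worklists drain. Reduced to a toy graph of "copy" nodes (hypothetical Node layout, no LLVM types), the shape of that loop is roughly:

    // Toy worklist solver: every node copies the value of its single input.
    // When a node's state changes, its users are pushed back on the worklist,
    // mirroring what markUsersAsChanged() does for the real solver.
    #include <cstdio>
    #include <optional>
    #include <vector>

    struct Node {
      std::optional<int> Val;  // nullopt == value not known yet
      int Input = -1;          // index of the node we copy from; -1 = source
      std::vector<int> Users;  // nodes that read this one
    };

    int main() {
      // n0 := 42; n1 := copy n0; n2 := copy n1
      std::vector<Node> G(3);
      G[0].Val = 42;  G[0].Users = {1};
      G[1].Input = 0; G[1].Users = {2};
      G[2].Input = 1;

      std::vector<int> Work = {0}; // seed with the node whose state is known
      while (!Work.empty()) {
        int N = Work.back();
        Work.pop_back();
        for (int U : G[N].Users) {
          std::optional<int> New = G[G[U].Input].Val; // re-evaluate the user
          if (New && New != G[U].Val) {               // state changed
            G[U].Val = New;
            Work.push_back(U);                        // requeue its users next
          }
        }
      }
      std::printf("n2 = %d\n", *G[2].Val); // prints n2 = 42
    }
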
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption. After we solve dataflow, this
+/// method should be used to handle this. If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking one
+/// of the edges from the block as feasible, even though the condition
+/// doesn't say it would otherwise be. This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible). This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs. It conservatively marks
+/// them as overdefined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
bool MadeChange = false;
- for (BasicBlock &BB : F) {
- if (!BBExecutable.count(&BB))
- continue;
-
- for (Instruction &I : BB) {
- // Look for instructions which produce undef values.
- if (I.getType()->isVoidTy()) continue;
-
- if (auto *STy = dyn_cast<StructType>(I.getType())) {
- // Only a few things that can be structs matter for undef.
-
- // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (Function *F = CB->getCalledFunction())
- if (MRVFunctionsTracked.count(F))
- continue;
-
- // extractvalue and insertvalue don't need to be marked; they are
- // tracked as precisely as their operands.
- if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
- continue;
- // Send the results of everything else to overdefined. We could be
- // more precise than this but it isn't worth bothering.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- ValueLatticeElement &LV = getStructValueState(&I, i);
+ for (BasicBlock &BB : F) {
+ if (!BBExecutable.count(&BB))
+ continue;
+
+ for (Instruction &I : BB) {
+ // Look for instructions which produce undef values.
+ if (I.getType()->isVoidTy()) continue;
+
+ if (auto *STy = dyn_cast<StructType>(I.getType())) {
+ // Only a few things that can be structs matter for undef.
+
+ // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *F = CB->getCalledFunction())
+ if (MRVFunctionsTracked.count(F))
+ continue;
+
+ // extractvalue and insertvalue don't need to be marked; they are
+ // tracked as precisely as their operands.
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+ continue;
+ // Send the results of everything else to overdefined. We could be
+ // more precise than this but it isn't worth bothering.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement &LV = getStructValueState(&I, i);
if (LV.isUnknownOrUndef()) {
- markOverdefined(LV, &I);
+ markOverdefined(LV, &I);
MadeChange = true;
}
- }
- continue;
- }
-
- ValueLatticeElement &LV = getValueState(&I);
- if (!LV.isUnknownOrUndef())
- continue;
-
- // There are two reasons a call can have an undef result
- // 1. It could be tracked.
- // 2. It could be constant-foldable.
- // Because of the way we solve return values, tracked calls must
- // never be marked overdefined in ResolvedUndefsIn.
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (Function *F = CB->getCalledFunction())
- if (TrackedRetVals.count(F))
- continue;
-
- if (isa<LoadInst>(I)) {
- // A load here means one of two things: a load of undef from a global,
-        // or a load from an unknown pointer. Either way, having it return undef
- // is okay.
- continue;
- }
-
- markOverdefined(&I);
+ }
+ continue;
+ }
+
+ ValueLatticeElement &LV = getValueState(&I);
+ if (!LV.isUnknownOrUndef())
+ continue;
+
+ // There are two reasons a call can have an undef result
+ // 1. It could be tracked.
+ // 2. It could be constant-foldable.
+ // Because of the way we solve return values, tracked calls must
+ // never be marked overdefined in ResolvedUndefsIn.
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *F = CB->getCalledFunction())
+ if (TrackedRetVals.count(F))
+ continue;
+
+ if (isa<LoadInst>(I)) {
+ // A load here means one of two things: a load of undef from a global,
+        // or a load from an unknown pointer. Either way, having it return undef
+ // is okay.
+ continue;
+ }
+
+ markOverdefined(&I);
MadeChange = true;
- }
-
- // Check to see if we have a branch or switch on an undefined value. If so
- // we force the branch to go one way or the other to make the successor
- // values live. It doesn't really matter which way we force it.
- Instruction *TI = BB.getTerminator();
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
- if (!BI->isConditional()) continue;
- if (!getValueState(BI->getCondition()).isUnknownOrUndef())
- continue;
-
- // If the input to SCCP is actually branch on undef, fix the undef to
- // false.
- if (isa<UndefValue>(BI->getCondition())) {
- BI->setCondition(ConstantInt::getFalse(BI->getContext()));
- markEdgeExecutable(&BB, TI->getSuccessor(1));
+ }
+
+ // Check to see if we have a branch or switch on an undefined value. If so
+ // we force the branch to go one way or the other to make the successor
+ // values live. It doesn't really matter which way we force it.
+ Instruction *TI = BB.getTerminator();
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional()) continue;
+ if (!getValueState(BI->getCondition()).isUnknownOrUndef())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // false.
+ if (isa<UndefValue>(BI->getCondition())) {
+ BI->setCondition(ConstantInt::getFalse(BI->getContext()));
+ markEdgeExecutable(&BB, TI->getSuccessor(1));
MadeChange = true;
continue;
- }
-
- // Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Make sure some edge is executable, so a
- // branch on "undef" always flows somewhere.
- // FIXME: Distinguish between dead code and an LLVM "undef" value.
- BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
- if (markEdgeExecutable(&BB, DefaultSuccessor))
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
MadeChange = true;
-
- continue;
- }
-
- if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
-      // Indirect branch with no successors? It's ok to assume it branches
- // to no target.
- if (IBR->getNumSuccessors() < 1)
- continue;
-
- if (!getValueState(IBR->getAddress()).isUnknownOrUndef())
- continue;
-
- // If the input to SCCP is actually branch on undef, fix the undef to
- // the first successor of the indirect branch.
- if (isa<UndefValue>(IBR->getAddress())) {
- IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
- markEdgeExecutable(&BB, IBR->getSuccessor(0));
+
+ continue;
+ }
+
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+      // Indirect branch with no successors? It's ok to assume it branches
+ // to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknownOrUndef())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
MadeChange = true;
continue;
- }
-
- // Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Make sure some edge is executable, so a
- // branch on "undef" always flows somewhere.
- // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere:
- // we can assume the branch has undefined behavior instead.
- BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
- if (markEdgeExecutable(&BB, DefaultSuccessor))
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere:
+ // we can assume the branch has undefined behavior instead.
+ BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
MadeChange = true;
-
- continue;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- if (!SI->getNumCases() ||
- !getValueState(SI->getCondition()).isUnknownOrUndef())
- continue;
-
- // If the input to SCCP is actually switch on undef, fix the undef to
- // the first constant.
- if (isa<UndefValue>(SI->getCondition())) {
- SI->setCondition(SI->case_begin()->getCaseValue());
- markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
+
+ continue;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ if (!SI->getNumCases() ||
+ !getValueState(SI->getCondition()).isUnknownOrUndef())
+ continue;
+
+ // If the input to SCCP is actually switch on undef, fix the undef to
+ // the first constant.
+ if (isa<UndefValue>(SI->getCondition())) {
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
MadeChange = true;
continue;
- }
-
- // Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Make sure some edge is executable, so a
- // branch on "undef" always flows somewhere.
- // FIXME: Distinguish between dead code and an LLVM "undef" value.
- BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
- if (markEdgeExecutable(&BB, DefaultSuccessor))
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
MadeChange = true;
-
- continue;
- }
- }
-
+
+ continue;
+ }
+ }
+
return MadeChange;
-}
-
-static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
- Constant *Const = nullptr;
- if (V->getType()->isStructTy()) {
- std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V);
- if (any_of(IVs,
- [](const ValueLatticeElement &LV) { return isOverdefined(LV); }))
- return false;
- std::vector<Constant *> ConstVals;
- auto *ST = cast<StructType>(V->getType());
- for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
- ValueLatticeElement V = IVs[i];
- ConstVals.push_back(isConstant(V)
- ? Solver.getConstant(V)
- : UndefValue::get(ST->getElementType(i)));
- }
- Const = ConstantStruct::get(ST, ConstVals);
- } else {
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
- if (isOverdefined(IV))
- return false;
-
- Const =
- isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType());
- }
- assert(Const && "Constant is nullptr here!");
-
-  // Replacing `musttail` instructions with a constant breaks the `musttail`
-  // invariant unless the call itself can be removed.
- CallInst *CI = dyn_cast<CallInst>(V);
- if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
- Function *F = CI->getCalledFunction();
-
- // Don't zap returns of the callee
- if (F)
- Solver.AddMustTailCallee(F);
-
- LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
- << " as a constant\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
-
- // Replaces all of the uses of a variable with uses of the constant.
- V->replaceAllUsesWith(Const);
- return true;
-}
-
-static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB,
- SmallPtrSetImpl<Value *> &InsertedValues,
- Statistic &InstRemovedStat,
- Statistic &InstReplacedStat) {
- bool MadeChanges = false;
- for (Instruction &Inst : make_early_inc_range(BB)) {
- if (Inst.getType()->isVoidTy())
- continue;
- if (tryToReplaceWithConstant(Solver, &Inst)) {
- if (Inst.isSafeToRemove())
- Inst.eraseFromParent();
- // Hey, we just changed something!
- MadeChanges = true;
- ++InstRemovedStat;
- } else if (isa<SExtInst>(&Inst)) {
- Value *ExtOp = Inst.getOperand(0);
- if (isa<Constant>(ExtOp) || InsertedValues.count(ExtOp))
- continue;
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(ExtOp);
- if (!IV.isConstantRange(/*UndefAllowed=*/false))
- continue;
- if (IV.getConstantRange().isAllNonNegative()) {
- auto *ZExt = new ZExtInst(ExtOp, Inst.getType(), "", &Inst);
- InsertedValues.insert(ZExt);
- Inst.replaceAllUsesWith(ZExt);
- Solver.removeLatticeValueFor(&Inst);
- Inst.eraseFromParent();
- InstReplacedStat++;
- MadeChanges = true;
- }
- }
- }
- return MadeChanges;
-}
-
-// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
-// and return true if the function was modified.
-static bool runSCCP(Function &F, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- SCCPSolver Solver(
- DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; },
- F.getContext());
-
- // Mark the first block of the function as being executable.
- Solver.MarkBlockExecutable(&F.front());
-
- // Mark all arguments to the function as being overdefined.
- for (Argument &AI : F.args())
- Solver.markOverdefined(&AI);
-
- // Solve for constants.
- bool ResolvedUndefs = true;
- while (ResolvedUndefs) {
- Solver.Solve();
- LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n");
- ResolvedUndefs = Solver.ResolvedUndefsIn(F);
- }
-
- bool MadeChanges = false;
-
- // If we decided that there are basic blocks that are dead in this function,
- // delete their contents now. Note that we cannot actually delete the blocks,
- // as we cannot modify the CFG of the function.
-
- SmallPtrSet<Value *, 32> InsertedValues;
- for (BasicBlock &BB : F) {
- if (!Solver.isBlockExecutable(&BB)) {
- LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
-
- ++NumDeadBlocks;
+}
+
+static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
+ Constant *Const = nullptr;
+ if (V->getType()->isStructTy()) {
+ std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V);
+ if (any_of(IVs,
+ [](const ValueLatticeElement &LV) { return isOverdefined(LV); }))
+ return false;
+ std::vector<Constant *> ConstVals;
+ auto *ST = cast<StructType>(V->getType());
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ ValueLatticeElement V = IVs[i];
+ ConstVals.push_back(isConstant(V)
+ ? Solver.getConstant(V)
+ : UndefValue::get(ST->getElementType(i)));
+ }
+ Const = ConstantStruct::get(ST, ConstVals);
+ } else {
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ if (isOverdefined(IV))
+ return false;
+
+ Const =
+ isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType());
+ }
+ assert(Const && "Constant is nullptr here!");
+
+  // Replacing `musttail` instructions with a constant breaks the `musttail`
+  // invariant unless the call itself can be removed.
+ CallInst *CI = dyn_cast<CallInst>(V);
+ if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
+ Function *F = CI->getCalledFunction();
+
+ // Don't zap returns of the callee
+ if (F)
+ Solver.AddMustTailCallee(F);
+
+ LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
+ << " as a constant\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
+
+ // Replaces all of the uses of a variable with uses of the constant.
+ V->replaceAllUsesWith(Const);
+ return true;
+}
+
+static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB,
+ SmallPtrSetImpl<Value *> &InsertedValues,
+ Statistic &InstRemovedStat,
+ Statistic &InstReplacedStat) {
+ bool MadeChanges = false;
+ for (Instruction &Inst : make_early_inc_range(BB)) {
+ if (Inst.getType()->isVoidTy())
+ continue;
+ if (tryToReplaceWithConstant(Solver, &Inst)) {
+ if (Inst.isSafeToRemove())
+ Inst.eraseFromParent();
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++InstRemovedStat;
+ } else if (isa<SExtInst>(&Inst)) {
+ Value *ExtOp = Inst.getOperand(0);
+ if (isa<Constant>(ExtOp) || InsertedValues.count(ExtOp))
+ continue;
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(ExtOp);
+ if (!IV.isConstantRange(/*UndefAllowed=*/false))
+ continue;
+ if (IV.getConstantRange().isAllNonNegative()) {
+ auto *ZExt = new ZExtInst(ExtOp, Inst.getType(), "", &Inst);
+ InsertedValues.insert(ZExt);
+ Inst.replaceAllUsesWith(ZExt);
+ Solver.removeLatticeValueFor(&Inst);
+ Inst.eraseFromParent();
+ InstReplacedStat++;
+ MadeChanges = true;
+ }
+ }
+ }
+ return MadeChanges;
+}
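
The second branch of simplifyInstsInBlock rewrites a sext whose operand has a provably non-negative range into a zext; that is sound because sign- and zero-extension agree whenever the top bit of the source is clear. A quick standalone check of the equivalence over all non-negative 8-bit values:

    // For a non-negative 8-bit value, sign extension and zero extension to
    // 32 bits agree, which is what justifies rewriting sext -> zext when the
    // operand's range is known to be non-negative.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      for (int v = 0; v <= 127; ++v) { // every non-negative int8_t value
        int8_t Narrow = static_cast<int8_t>(v);
        int32_t SExt = static_cast<int32_t>(Narrow);                        // sign-extend
        int32_t ZExt = static_cast<int32_t>(static_cast<uint8_t>(Narrow));  // zero-extend
        assert(SExt == ZExt && SExt == v);
      }
      std::puts("sext == zext for all non-negative 8-bit values");
    }
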
+
+// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+static bool runSCCP(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+ SCCPSolver Solver(
+ DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; },
+ F.getContext());
+
+ // Mark the first block of the function as being executable.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Mark all arguments to the function as being overdefined.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+ ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+ }
+
+ bool MadeChanges = false;
+
+ // If we decided that there are basic blocks that are dead in this function,
+ // delete their contents now. Note that we cannot actually delete the blocks,
+ // as we cannot modify the CFG of the function.
+
+ SmallPtrSet<Value *, 32> InsertedValues;
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+
+ ++NumDeadBlocks;
NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB).first;
-
- MadeChanges = true;
- continue;
- }
-
- MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
- NumInstRemoved, NumInstReplaced);
- }
-
- return MadeChanges;
-}
-
-PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- if (!runSCCP(F, DL, &TLI))
- return PreservedAnalyses::all();
-
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
-
-//===--------------------------------------------------------------------===//
-//
-/// SCCP Class - This class uses the SCCPSolver to implement a per-function
-/// Sparse Conditional Constant Propagator.
-///
-class SCCPLegacyPass : public FunctionPass {
-public:
- // Pass identification, replacement for typeid
- static char ID;
-
- SCCPLegacyPass() : FunctionPass(ID) {
- initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-
- // runOnFunction - Run the Sparse Conditional Constant Propagation
- // algorithm, and return true if the function was modified.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- const DataLayout &DL = F.getParent()->getDataLayout();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- return runSCCP(F, DL, TLI);
- }
-};
-
-} // end anonymous namespace
-
-char SCCPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
- "Sparse Conditional Constant Propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
- "Sparse Conditional Constant Propagation", false, false)
-
-// createSCCPPass - This is the public interface to this file.
-FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
-
-static void findReturnsToZap(Function &F,
- SmallVector<ReturnInst *, 8> &ReturnsToZap,
- SCCPSolver &Solver) {
- // We can only do this if we know that nothing else can call the function.
- if (!Solver.isArgumentTrackedFunction(&F))
- return;
-
- // There is a non-removable musttail call site of this function. Zapping
- // returns is not allowed.
- if (Solver.isMustTailCallee(&F)) {
- LLVM_DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
- << " due to present musttail call of it\n");
- return;
- }
-
- assert(
- all_of(F.users(),
- [&Solver](User *U) {
- if (isa<Instruction>(U) &&
- !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
- return true;
- // Non-callsite uses are not impacted by zapping. Also, constant
-               // uses (like blockaddresses) could stick around, without being
- // used in the underlying IR, meaning we do not have lattice
- // values for them.
- if (!isa<CallBase>(U))
- return true;
- if (U->getType()->isStructTy()) {
- return all_of(Solver.getStructLatticeValueFor(U),
- [](const ValueLatticeElement &LV) {
- return !isOverdefined(LV);
- });
- }
- return !isOverdefined(Solver.getLatticeValueFor(U));
- }) &&
- "We can only zap functions where all live users have a concrete value");
-
- for (BasicBlock &BB : F) {
- if (CallInst *CI = BB.getTerminatingMustTailCall()) {
- LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present "
- << "musttail call : " << *CI << "\n");
- (void)CI;
- return;
- }
-
- if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
- if (!isa<UndefValue>(RI->getOperand(0)))
- ReturnsToZap.push_back(RI);
- }
-}
-
+
+ MadeChanges = true;
+ continue;
+ }
+
+ MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
+ NumInstRemoved, NumInstReplaced);
+ }
+
+ return MadeChanges;
+}
+
+PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!runSCCP(F, DL, &TLI))
+ return PreservedAnalyses::all();
+
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// SCCP Class - This class uses the SCCPSolver to implement a per-function
+/// Sparse Conditional Constant Propagator.
+///
+class SCCPLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ SCCPLegacyPass() : FunctionPass(ID) {
+ initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ // runOnFunction - Run the Sparse Conditional Constant Propagation
+ // algorithm, and return true if the function was modified.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return runSCCP(F, DL, TLI);
+ }
+};
+
+} // end anonymous namespace
+
+char SCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+
+// createSCCPPass - This is the public interface to this file.
+FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
+
+static void findReturnsToZap(Function &F,
+ SmallVector<ReturnInst *, 8> &ReturnsToZap,
+ SCCPSolver &Solver) {
+ // We can only do this if we know that nothing else can call the function.
+ if (!Solver.isArgumentTrackedFunction(&F))
+ return;
+
+ // There is a non-removable musttail call site of this function. Zapping
+ // returns is not allowed.
+ if (Solver.isMustTailCallee(&F)) {
+ LLVM_DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
+ << " due to present musttail call of it\n");
+ return;
+ }
+
+ assert(
+ all_of(F.users(),
+ [&Solver](User *U) {
+ if (isa<Instruction>(U) &&
+ !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
+ return true;
+ // Non-callsite uses are not impacted by zapping. Also, constant
+               // uses (like blockaddresses) could stick around, without being
+ // used in the underlying IR, meaning we do not have lattice
+ // values for them.
+ if (!isa<CallBase>(U))
+ return true;
+ if (U->getType()->isStructTy()) {
+ return all_of(Solver.getStructLatticeValueFor(U),
+ [](const ValueLatticeElement &LV) {
+ return !isOverdefined(LV);
+ });
+ }
+ return !isOverdefined(Solver.getLatticeValueFor(U));
+ }) &&
+ "We can only zap functions where all live users have a concrete value");
+
+ for (BasicBlock &BB : F) {
+ if (CallInst *CI = BB.getTerminatingMustTailCall()) {
+ LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present "
+ << "musttail call : " << *CI << "\n");
+ (void)CI;
+ return;
+ }
+
+ if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ if (!isa<UndefValue>(RI->getOperand(0)))
+ ReturnsToZap.push_back(RI);
+ }
+}
+
static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
DomTreeUpdater &DTU) {
SmallPtrSet<BasicBlock *, 8> FeasibleSuccessors;
@@ -1906,7 +1906,7 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
Succ->removePredecessor(BB);
Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
+ }
BranchInst::Create(OnlyFeasibleSuccessor, BB);
TI->eraseFromParent();
@@ -1925,92 +1925,92 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
Updates.push_back({DominatorTree::Delete, BB, Succ});
SI.removeCase(CI);
// Don't increment CI, as we removed a case.
- }
+ }
DTU.applyUpdatesPermissive(Updates);
- } else {
+ } else {
llvm_unreachable("Must have at least one feasible successor");
- }
+ }
return true;
-}
-
-bool llvm::runIPSCCP(
- Module &M, const DataLayout &DL,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<AnalysisResultsForFn(Function &)> getAnalysis) {
- SCCPSolver Solver(DL, GetTLI, M.getContext());
-
- // Loop over all functions, marking arguments to those with their addresses
- // taken or that are external as overdefined.
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
-
- Solver.addAnalysis(F, getAnalysis(F));
-
- // Determine if we can track the function's return values. If so, add the
- // function to the solver's set of return-tracked functions.
- if (canTrackReturnsInterprocedurally(&F))
- Solver.AddTrackedFunction(&F);
-
- // Determine if we can track the function's arguments. If so, add the
- // function to the solver's set of argument-tracked functions.
- if (canTrackArgumentsInterprocedurally(&F)) {
- Solver.AddArgumentTrackedFunction(&F);
- continue;
- }
-
- // Assume the function is called.
- Solver.MarkBlockExecutable(&F.front());
-
- // Assume nothing about the incoming arguments.
- for (Argument &AI : F.args())
- Solver.markOverdefined(&AI);
- }
-
- // Determine if we can track any of the module's global variables. If so, add
- // the global variables we can track to the solver's set of tracked global
- // variables.
- for (GlobalVariable &G : M.globals()) {
- G.removeDeadConstantUsers();
- if (canTrackGlobalVariableInterprocedurally(&G))
- Solver.TrackValueOfGlobalVariable(&G);
- }
-
- // Solve for constants.
- bool ResolvedUndefs = true;
- Solver.Solve();
- while (ResolvedUndefs) {
- LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
- ResolvedUndefs = false;
+}
+
+bool llvm::runIPSCCP(
+ Module &M, const DataLayout &DL,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<AnalysisResultsForFn(Function &)> getAnalysis) {
+ SCCPSolver Solver(DL, GetTLI, M.getContext());
+
+ // Loop over all functions, marking arguments to those with their addresses
+ // taken or that are external as overdefined.
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ Solver.addAnalysis(F, getAnalysis(F));
+
+ // Determine if we can track the function's return values. If so, add the
+ // function to the solver's set of return-tracked functions.
+ if (canTrackReturnsInterprocedurally(&F))
+ Solver.AddTrackedFunction(&F);
+
+ // Determine if we can track the function's arguments. If so, add the
+ // function to the solver's set of argument-tracked functions.
+ if (canTrackArgumentsInterprocedurally(&F)) {
+ Solver.AddArgumentTrackedFunction(&F);
+ continue;
+ }
+
+ // Assume the function is called.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Assume nothing about the incoming arguments.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+ }
+
+ // Determine if we can track any of the module's global variables. If so, add
+ // the global variables we can track to the solver's set of tracked global
+ // variables.
+ for (GlobalVariable &G : M.globals()) {
+ G.removeDeadConstantUsers();
+ if (canTrackGlobalVariableInterprocedurally(&G))
+ Solver.TrackValueOfGlobalVariable(&G);
+ }
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ Solver.Solve();
+ while (ResolvedUndefs) {
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+ ResolvedUndefs = false;
for (Function &F : M) {
if (Solver.ResolvedUndefsIn(F))
- ResolvedUndefs = true;
+ ResolvedUndefs = true;
}
if (ResolvedUndefs)
Solver.Solve();
- }
-
- bool MadeChanges = false;
-
- // Iterate over all of the instructions in the module, replacing them with
- // constants if we have found them to be of constant values.
-
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
-
- SmallVector<BasicBlock *, 512> BlocksToErase;
-
+ }
+
+ bool MadeChanges = false;
+
+ // Iterate over all of the instructions in the module, replacing them with
+ // constants if we have found them to be of constant values.
+
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ SmallVector<BasicBlock *, 512> BlocksToErase;
+
if (Solver.isBlockExecutable(&F.front())) {
bool ReplacedPointerArg = false;
for (Argument &Arg : F.args()) {
if (!Arg.use_empty() && tryToReplaceWithConstant(Solver, &Arg)) {
ReplacedPointerArg |= Arg.getType()->isPointerTy();
- ++IPNumArgsElimed;
- }
- }
-
+ ++IPNumArgsElimed;
+ }
+ }
+
// If we replaced an argument, the argmemonly and
// inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove
// them from both the function and callsites.
@@ -2031,74 +2031,74 @@ bool llvm::runIPSCCP(
}
}
- SmallPtrSet<Value *, 32> InsertedValues;
- for (BasicBlock &BB : F) {
- if (!Solver.isBlockExecutable(&BB)) {
- LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
- ++NumDeadBlocks;
-
- MadeChanges = true;
-
- if (&BB != &F.front())
- BlocksToErase.push_back(&BB);
- continue;
- }
-
- MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
- IPNumInstRemoved, IPNumInstReplaced);
- }
-
- DomTreeUpdater DTU = Solver.getDTU(F);
- // Change dead blocks to unreachable. We do it after replacing constants
- // in all executable blocks, because changeToUnreachable may remove PHI
- // nodes in executable blocks we found values for. The function's entry
- // block is not part of BlocksToErase, so we have to handle it separately.
- for (BasicBlock *BB : BlocksToErase) {
- NumInstRemoved +=
- changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false,
- /*PreserveLCSSA=*/false, &DTU);
- }
- if (!Solver.isBlockExecutable(&F.front()))
- NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
- /*UseLLVMTrap=*/false,
- /*PreserveLCSSA=*/false, &DTU);
-
+ SmallPtrSet<Value *, 32> InsertedValues;
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+ ++NumDeadBlocks;
+
+ MadeChanges = true;
+
+ if (&BB != &F.front())
+ BlocksToErase.push_back(&BB);
+ continue;
+ }
+
+ MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
+ IPNumInstRemoved, IPNumInstReplaced);
+ }
+
+ DomTreeUpdater DTU = Solver.getDTU(F);
+ // Change dead blocks to unreachable. We do it after replacing constants
+ // in all executable blocks, because changeToUnreachable may remove PHI
+ // nodes in executable blocks we found values for. The function's entry
+ // block is not part of BlocksToErase, so we have to handle it separately.
+ for (BasicBlock *BB : BlocksToErase) {
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false,
+ /*PreserveLCSSA=*/false, &DTU);
+ }
+ if (!Solver.isBlockExecutable(&F.front()))
+ NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
+ /*UseLLVMTrap=*/false,
+ /*PreserveLCSSA=*/false, &DTU);
+
for (BasicBlock &BB : F)
MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU);
-
+
for (BasicBlock *DeadBB : BlocksToErase)
- DTU.deleteBB(DeadBB);
-
- for (BasicBlock &BB : F) {
- for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
- Instruction *Inst = &*BI++;
- if (Solver.getPredicateInfoFor(Inst)) {
- if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
- Value *Op = II->getOperand(0);
- Inst->replaceAllUsesWith(Op);
- Inst->eraseFromParent();
- }
- }
- }
- }
- }
- }
-
- // If we inferred constant or undef return values for a function, we replaced
- // all call uses with the inferred value. This means we don't need to bother
- // actually returning anything from the function. Replace all return
- // instructions with return undef.
- //
- // Do this in two stages: first identify the functions we should process, then
- // actually zap their returns. This is important because we can only do this
- // if the address of the function isn't taken. In cases where a return is the
- // last use of a function, the order of processing functions would affect
- // whether other functions are optimizable.
- SmallVector<ReturnInst*, 8> ReturnsToZap;
-
- for (const auto &I : Solver.getTrackedRetVals()) {
- Function *F = I.first;
+ DTU.deleteBB(DeadBB);
+
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
+ Instruction *Inst = &*BI++;
+ if (Solver.getPredicateInfoFor(Inst)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+ Value *Op = II->getOperand(0);
+ Inst->replaceAllUsesWith(Op);
+ Inst->eraseFromParent();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // If we inferred constant or undef return values for a function, we replaced
+ // all call uses with the inferred value. This means we don't need to bother
+ // actually returning anything from the function. Replace all return
+ // instructions with return undef.
+ //
+ // Do this in two stages: first identify the functions we should process, then
+ // actually zap their returns. This is important because we can only do this
+ // if the address of the function isn't taken. In cases where a return is the
+ // last use of a function, the order of processing functions would affect
+ // whether other functions are optimizable.
+ SmallVector<ReturnInst*, 8> ReturnsToZap;
+
+ for (const auto &I : Solver.getTrackedRetVals()) {
+ Function *F = I.first;
const ValueLatticeElement &ReturnValue = I.second;
// If there is a known constant range for the return value, add !range
@@ -2134,31 +2134,31 @@ bool llvm::runIPSCCP(
ConstantAsMetadata::get(ConstantInt::get(Context, CR.getUpper()))};
CB->setMetadata(LLVMContext::MD_range, MDNode::get(Context, RangeMD));
}
- continue;
+ continue;
}
if (F->getReturnType()->isVoidTy())
continue;
if (isConstant(ReturnValue) || ReturnValue.isUnknownOrUndef())
findReturnsToZap(*F, ReturnsToZap, Solver);
- }
-
- for (auto F : Solver.getMRVFunctionsTracked()) {
- assert(F->getReturnType()->isStructTy() &&
- "The return type should be a struct");
- StructType *STy = cast<StructType>(F->getReturnType());
- if (Solver.isStructLatticeConstant(F, STy))
- findReturnsToZap(*F, ReturnsToZap, Solver);
- }
-
- // Zap all returns which we've identified as zap to change.
+ }
+
+ for (auto F : Solver.getMRVFunctionsTracked()) {
+ assert(F->getReturnType()->isStructTy() &&
+ "The return type should be a struct");
+ StructType *STy = cast<StructType>(F->getReturnType());
+ if (Solver.isStructLatticeConstant(F, STy))
+ findReturnsToZap(*F, ReturnsToZap, Solver);
+ }
+
+ // Zap all returns which we've identified as zap to change.
SmallSetVector<Function *, 8> FuncZappedReturn;
- for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
- Function *F = ReturnsToZap[i]->getParent()->getParent();
- ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+ for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
+ Function *F = ReturnsToZap[i]->getParent()->getParent();
+ ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
// Record all functions that are zapped.
FuncZappedReturn.insert(F);
- }
-
+ }
+
// Remove the returned attribute for zapped functions and the
// corresponding call sites.
for (Function *F : FuncZappedReturn) {
@@ -2174,22 +2174,22 @@ bool llvm::runIPSCCP(
}
}
- // If we inferred constant or undef values for global variables, we can
- // delete the global and any stores that remain to it.
- for (auto &I : make_early_inc_range(Solver.getTrackedGlobals())) {
- GlobalVariable *GV = I.first;
- if (isOverdefined(I.second))
- continue;
- LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
- << "' is constant!\n");
- while (!GV->use_empty()) {
- StoreInst *SI = cast<StoreInst>(GV->user_back());
- SI->eraseFromParent();
- MadeChanges = true;
- }
- M.getGlobalList().erase(GV);
- ++IPNumGlobalConst;
- }
-
- return MadeChanges;
-}
+ // If we inferred constant or undef values for global variables, we can
+ // delete the global and any stores that remain to it.
+ for (auto &I : make_early_inc_range(Solver.getTrackedGlobals())) {
+ GlobalVariable *GV = I.first;
+ if (isOverdefined(I.second))
+ continue;
+ LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
+ << "' is constant!\n");
+ while (!GV->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(GV->user_back());
+ SI->eraseFromParent();
+ MadeChanges = true;
+ }
+ M.getGlobalList().erase(GV);
+ ++IPNumGlobalConst;
+ }
+
+ return MadeChanges;
+}
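The comment above describes a two-stage "collect candidates first, mutate afterwards" pattern so that the outcome does not depend on the order in which functions are visited. A minimal standalone sketch of that pattern follows; it is plain C++ with hypothetical names (Fn, Ret, collectReturnsToZap), not the LLVM API used in the diff.

// Standalone sketch: identify in stage 1, mutate in stage 2, so visiting
// order cannot affect which returns end up being zapped.
#include <string>
#include <vector>

struct Ret { bool Zapped = false; };
struct Fn {
  std::string Name;
  bool AddressTaken = false;
  std::vector<Ret> Returns;
};

// Stage 1: record candidate returns only; nothing is modified yet.
static void collectReturnsToZap(Fn &F, std::vector<Ret *> &Out) {
  if (F.AddressTaken)
    return; // returns of address-taken functions must be left alone
  for (Ret &R : F.Returns)
    Out.push_back(&R);
}

int main() {
  std::vector<Fn> Fns = {{"f", false, {Ret{}}}, {"g", true, {Ret{}}}};

  std::vector<Ret *> ReturnsToZap;
  for (Fn &F : Fns)
    collectReturnsToZap(F, ReturnsToZap);

  // Stage 2: apply the change in one sweep over the recorded candidates.
  for (Ret *R : ReturnsToZap)
    R->Zapped = true;
}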
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp
index 6a43dd3b17..af510f1a84 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp
@@ -1,3098 +1,3098 @@
-//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This transformation implements the well known scalar replacement of
-/// aggregates transformation. It tries to identify promotable elements of an
-/// aggregate alloca, and promote them to registers. It will also try to
-/// convert uses of an element (or set of elements) of an alloca into a vector
-/// or bitfield-style integer scalar if appropriate.
-///
-/// It works to do this with minimal slicing of the alloca so that regions
-/// which are merely transferred in and out of external memory remain unchanged
-/// and are not decomposed to scalar code.
-///
-/// Because this also performs alloca promotion, it can be thought of as also
-/// serving the purpose of SSA formation. The algorithm iterates on the
-/// function until all opportunities for promotion have been realized.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SROA.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/PtrUseVisitor.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantFolder.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <iterator>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::sroa;
-
-#define DEBUG_TYPE "sroa"
-
-STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
-STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
-STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
-STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
-STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
-STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
-STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
-STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
-STATISTIC(NumDeleted, "Number of instructions deleted");
-STATISTIC(NumVectorized, "Number of vectorized aggregates");
-
-/// Hidden option to experiment with completely strict handling of inbounds
-/// GEPs.
-static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
- cl::Hidden);
-
-namespace {
-
-/// A custom IRBuilder inserter which prefixes all names, but only in
-/// Assert builds.
-class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
- std::string Prefix;
-
- const Twine getNameWithPrefix(const Twine &Name) const {
- return Name.isTriviallyEmpty() ? Name : Prefix + Name;
- }
-
-public:
- void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
-
- void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
- BasicBlock::iterator InsertPt) const override {
- IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
- InsertPt);
- }
-};
-
-/// Provide a type for IRBuilder that drops names in release builds.
-using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-
-/// A used slice of an alloca.
-///
-/// This structure represents a slice of an alloca used by some instruction. It
-/// stores both the begin and end offsets of this use, a pointer to the use
-/// itself, and a flag indicating whether we can classify the use as splittable
-/// or not when forming partitions of the alloca.
-class Slice {
- /// The beginning offset of the range.
- uint64_t BeginOffset = 0;
-
- /// The ending offset, not included in the range.
- uint64_t EndOffset = 0;
-
- /// Storage for both the use of this slice and whether it can be
- /// split.
- PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
-
-public:
- Slice() = default;
-
- Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
- : BeginOffset(BeginOffset), EndOffset(EndOffset),
- UseAndIsSplittable(U, IsSplittable) {}
-
- uint64_t beginOffset() const { return BeginOffset; }
- uint64_t endOffset() const { return EndOffset; }
-
- bool isSplittable() const { return UseAndIsSplittable.getInt(); }
- void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
-
- Use *getUse() const { return UseAndIsSplittable.getPointer(); }
-
- bool isDead() const { return getUse() == nullptr; }
- void kill() { UseAndIsSplittable.setPointer(nullptr); }
-
- /// Support for ordering ranges.
- ///
- /// This provides an ordering over ranges such that start offsets are
- /// always increasing, and within equal start offsets, the end offsets are
- /// decreasing. Thus the spanning range comes first in a cluster with the
- /// same start position.
- bool operator<(const Slice &RHS) const {
- if (beginOffset() < RHS.beginOffset())
- return true;
- if (beginOffset() > RHS.beginOffset())
- return false;
- if (isSplittable() != RHS.isSplittable())
- return !isSplittable();
- if (endOffset() > RHS.endOffset())
- return true;
- return false;
- }
-
- /// Support comparison with a single offset to allow binary searches.
- friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
- uint64_t RHSOffset) {
- return LHS.beginOffset() < RHSOffset;
- }
- friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
- const Slice &RHS) {
- return LHSOffset < RHS.beginOffset();
- }
-
- bool operator==(const Slice &RHS) const {
- return isSplittable() == RHS.isSplittable() &&
- beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
- }
- bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
-};
-
-} // end anonymous namespace
-
-/// Representation of the alloca slices.
-///
-/// This class represents the slices of an alloca which are formed by its
-/// various uses. If a pointer escapes, we can't fully build a representation
-/// for the slices used and we reflect that in this structure. The uses are
-/// stored, sorted by increasing beginning offset and with unsplittable slices
-/// starting at a particular offset before splittable slices.
-class llvm::sroa::AllocaSlices {
-public:
- /// Construct the slices of a particular alloca.
- AllocaSlices(const DataLayout &DL, AllocaInst &AI);
-
- /// Test whether a pointer to the allocation escapes our analysis.
- ///
- /// If this is true, the slices are never fully built and should be
- /// ignored.
- bool isEscaped() const { return PointerEscapingInstr; }
-
- /// Support for iterating over the slices.
- /// @{
- using iterator = SmallVectorImpl<Slice>::iterator;
- using range = iterator_range<iterator>;
-
- iterator begin() { return Slices.begin(); }
- iterator end() { return Slices.end(); }
-
- using const_iterator = SmallVectorImpl<Slice>::const_iterator;
- using const_range = iterator_range<const_iterator>;
-
- const_iterator begin() const { return Slices.begin(); }
- const_iterator end() const { return Slices.end(); }
- /// @}
-
- /// Erase a range of slices.
- void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
-
- /// Insert new slices for this alloca.
- ///
- /// This moves the slices into the alloca's slices collection, and re-sorts
- /// everything so that the usual ordering properties of the alloca's slices
- /// hold.
- void insert(ArrayRef<Slice> NewSlices) {
- int OldSize = Slices.size();
- Slices.append(NewSlices.begin(), NewSlices.end());
- auto SliceI = Slices.begin() + OldSize;
- llvm::sort(SliceI, Slices.end());
- std::inplace_merge(Slices.begin(), SliceI, Slices.end());
- }
-
- // Forward declare the iterator and range accessor for walking the
- // partitions.
- class partition_iterator;
- iterator_range<partition_iterator> partitions();
-
- /// Access the dead users for this alloca.
- ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
-
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantFolder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::sroa;
+
+#define DEBUG_TYPE "sroa"
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
+STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+ cl::Hidden);
+
+namespace {
+
+/// A custom IRBuilder inserter which prefixes all names, but only in
+/// Assert builds.
+class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
+ std::string Prefix;
+
+ const Twine getNameWithPrefix(const Twine &Name) const {
+ return Name.isTriviallyEmpty() ? Name : Prefix + Name;
+ }
+
+public:
+ void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+ void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+ BasicBlock::iterator InsertPt) const override {
+ IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
+ InsertPt);
+ }
+};
+
+/// Provide a type for IRBuilder that drops names in release builds.
+using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
+
+/// A used slice of an alloca.
+///
+/// This structure represents a slice of an alloca used by some instruction. It
+/// stores both the begin and end offsets of this use, a pointer to the use
+/// itself, and a flag indicating whether we can classify the use as splittable
+/// or not when forming partitions of the alloca.
+class Slice {
+ /// The beginning offset of the range.
+ uint64_t BeginOffset = 0;
+
+ /// The ending offset, not included in the range.
+ uint64_t EndOffset = 0;
+
+ /// Storage for both the use of this slice and whether it can be
+ /// split.
+ PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
+
+public:
+ Slice() = default;
+
+ Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset),
+ UseAndIsSplittable(U, IsSplittable) {}
+
+ uint64_t beginOffset() const { return BeginOffset; }
+ uint64_t endOffset() const { return EndOffset; }
+
+ bool isSplittable() const { return UseAndIsSplittable.getInt(); }
+ void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
+
+ Use *getUse() const { return UseAndIsSplittable.getPointer(); }
+
+ bool isDead() const { return getUse() == nullptr; }
+ void kill() { UseAndIsSplittable.setPointer(nullptr); }
+
+ /// Support for ordering ranges.
+ ///
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing, and within equal start offsets, the end offsets are
+ /// decreasing. Thus the spanning range comes first in a cluster with the
+ /// same start position.
+ bool operator<(const Slice &RHS) const {
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
+ return false;
+ }
+
+ /// Support comparison with a single offset to allow binary searches.
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
+ uint64_t RHSOffset) {
+ return LHS.beginOffset() < RHSOffset;
+ }
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const Slice &RHS) {
+ return LHSOffset < RHS.beginOffset();
+ }
+
+ bool operator==(const Slice &RHS) const {
+ return isSplittable() == RHS.isSplittable() &&
+ beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
+ }
+ bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
+};
+
+} // end anonymous namespace
+
+/// Representation of the alloca slices.
+///
+/// This class represents the slices of an alloca which are formed by its
+/// various uses. If a pointer escapes, we can't fully build a representation
+/// for the slices used and we reflect that in this structure. The uses are
+/// stored, sorted by increasing beginning offset and with unsplittable slices
+/// starting at a particular offset before splittable slices.
+class llvm::sroa::AllocaSlices {
+public:
+ /// Construct the slices of a particular alloca.
+ AllocaSlices(const DataLayout &DL, AllocaInst &AI);
+
+ /// Test whether a pointer to the allocation escapes our analysis.
+ ///
+ /// If this is true, the slices are never fully built and should be
+ /// ignored.
+ bool isEscaped() const { return PointerEscapingInstr; }
+
+ /// Support for iterating over the slices.
+ /// @{
+ using iterator = SmallVectorImpl<Slice>::iterator;
+ using range = iterator_range<iterator>;
+
+ iterator begin() { return Slices.begin(); }
+ iterator end() { return Slices.end(); }
+
+ using const_iterator = SmallVectorImpl<Slice>::const_iterator;
+ using const_range = iterator_range<const_iterator>;
+
+ const_iterator begin() const { return Slices.begin(); }
+ const_iterator end() const { return Slices.end(); }
+ /// @}
+
+ /// Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ Slices.append(NewSlices.begin(), NewSlices.end());
+ auto SliceI = Slices.begin() + OldSize;
+ llvm::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare the iterator and range accessor for walking the
+ // partitions.
+ class partition_iterator;
+ iterator_range<partition_iterator> partitions();
+
+ /// Access the dead users for this alloca.
+ ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+
/// Access Uses that should be dropped if the alloca is promotable.
ArrayRef<Use *> getDeadUsesIfPromotable() const {
return DeadUseIfPromotable;
}
- /// Access the dead operands referring to this alloca.
- ///
- /// These are operands which cannot actually be used to refer to the
- /// alloca as they are outside its range and the user doesn't correct for
- /// that. These mostly consist of PHI node inputs and the like which we just
- /// need to replace with undef.
- ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
- void printSlice(raw_ostream &OS, const_iterator I,
- StringRef Indent = " ") const;
- void printUse(raw_ostream &OS, const_iterator I,
- StringRef Indent = " ") const;
- void print(raw_ostream &OS) const;
- void dump(const_iterator I) const;
- void dump() const;
-#endif
-
-private:
- template <typename DerivedT, typename RetT = void> class BuilderBase;
- class SliceBuilder;
-
- friend class AllocaSlices::SliceBuilder;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Handle to alloca instruction to simplify method interfaces.
- AllocaInst &AI;
-#endif
-
- /// The instruction responsible for this alloca not having a known set
- /// of slices.
- ///
- /// When an instruction (potentially) escapes the pointer to the alloca, we
- /// store a pointer to that here and abort trying to form slices of the
- /// alloca. This will be null if the alloca slices are analyzed successfully.
- Instruction *PointerEscapingInstr;
-
- /// The slices of the alloca.
- ///
- /// We store a vector of the slices formed by uses of the alloca here. This
- /// vector is sorted by increasing begin offset, and then the unsplittable
- /// slices before the splittable ones. See the Slice inner class for more
- /// details.
- SmallVector<Slice, 8> Slices;
-
- /// Instructions which will become dead if we rewrite the alloca.
- ///
- /// Note that these are not separated by slice. This is because we expect an
- /// alloca to be completely rewritten or not rewritten at all. If rewritten,
- /// all these instructions can simply be removed and replaced with undef as
- /// they come from outside of the allocated space.
- SmallVector<Instruction *, 8> DeadUsers;
-
+ /// Access the dead operands referring to this alloca.
+ ///
+ /// These are operands which cannot actually be used to refer to the
+ /// alloca as they are outside its range and the user doesn't correct for
+ /// that. These mostly consist of PHI node inputs and the like which we just
+ /// need to replace with undef.
+ ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
+ void printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void print(raw_ostream &OS) const;
+ void dump(const_iterator I) const;
+ void dump() const;
+#endif
+
+private:
+ template <typename DerivedT, typename RetT = void> class BuilderBase;
+ class SliceBuilder;
+
+ friend class AllocaSlices::SliceBuilder;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Handle to alloca instruction to simplify method interfaces.
+ AllocaInst &AI;
+#endif
+
+ /// The instruction responsible for this alloca not having a known set
+ /// of slices.
+ ///
+ /// When an instruction (potentially) escapes the pointer to the alloca, we
+ /// store a pointer to that here and abort trying to form slices of the
+ /// alloca. This will be null if the alloca slices are analyzed successfully.
+ Instruction *PointerEscapingInstr;
+
+ /// The slices of the alloca.
+ ///
+ /// We store a vector of the slices formed by uses of the alloca here. This
+ /// vector is sorted by increasing begin offset, and then the unsplittable
+ /// slices before the splittable ones. See the Slice inner class for more
+ /// details.
+ SmallVector<Slice, 8> Slices;
+
+ /// Instructions which will become dead if we rewrite the alloca.
+ ///
+ /// Note that these are not separated by slice. This is because we expect an
+ /// alloca to be completely rewritten or not rewritten at all. If rewritten,
+ /// all these instructions can simply be removed and replaced with undef as
+ /// they come from outside of the allocated space.
+ SmallVector<Instruction *, 8> DeadUsers;
+
 /// Uses which will become dead if we can promote the alloca.
SmallVector<Use *, 8> DeadUseIfPromotable;
- /// Operands which will become dead if we rewrite the alloca.
- ///
- /// These are operands that in their particular use can be replaced with
- /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
- /// to PHI nodes and the like. They aren't entirely dead (there might be
- /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
- /// want to swap this particular input for undef to simplify the use lists of
- /// the alloca.
- SmallVector<Use *, 8> DeadOperands;
-};
-
-/// A partition of the slices.
-///
-/// An ephemeral representation for a range of slices which can be viewed as
-/// a partition of the alloca. This range represents a span of the alloca's
-/// memory which cannot be split, and provides access to all of the slices
-/// overlapping some part of the partition.
-///
-/// Objects of this type are produced by traversing the alloca's slices, but
-/// are only ephemeral and not persistent.
-class llvm::sroa::Partition {
-private:
- friend class AllocaSlices;
- friend class AllocaSlices::partition_iterator;
-
- using iterator = AllocaSlices::iterator;
-
- /// The beginning and ending offsets of the alloca for this
- /// partition.
- uint64_t BeginOffset = 0, EndOffset = 0;
-
- /// The start and end iterators of this partition.
- iterator SI, SJ;
-
- /// A collection of split slice tails overlapping the partition.
- SmallVector<Slice *, 4> SplitTails;
-
- /// Raw constructor builds an empty partition starting and ending at
- /// the given iterator.
- Partition(iterator SI) : SI(SI), SJ(SI) {}
-
-public:
- /// The start offset of this partition.
- ///
- /// All of the contained slices start at or after this offset.
- uint64_t beginOffset() const { return BeginOffset; }
-
- /// The end offset of this partition.
- ///
- /// All of the contained slices end at or before this offset.
- uint64_t endOffset() const { return EndOffset; }
-
- /// The size of the partition.
- ///
- /// Note that this can never be zero.
- uint64_t size() const {
- assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
- return EndOffset - BeginOffset;
- }
-
- /// Test whether this partition contains no slices, and merely spans
- /// a region occupied by split slices.
- bool empty() const { return SI == SJ; }
-
- /// \name Iterate slices that start within the partition.
- /// These may be splittable or unsplittable. They have a begin offset >= the
- /// partition begin offset.
- /// @{
- // FIXME: We should probably define a "concat_iterator" helper and use that
- // to stitch together pointee_iterators over the split tails and the
- // contiguous iterators of the partition. That would give a much nicer
- // interface here. We could then additionally expose filtered iterators for
- // split, unsplit, and unsplittable slices based on the usage patterns.
- iterator begin() const { return SI; }
- iterator end() const { return SJ; }
- /// @}
-
- /// Get the sequence of split slice tails.
- ///
- /// These tails are of slices which start before this partition but are
- /// split and overlap into the partition. We accumulate these while forming
- /// partitions.
- ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
-};
-
-/// An iterator over partitions of the alloca's slices.
-///
-/// This iterator implements the core algorithm for partitioning the alloca's
-/// slices. It is a forward iterator as we don't support backtracking for
-/// efficiency reasons, and re-use a single storage area to maintain the
-/// current set of split slices.
-///
-/// It is templated on the slice iterator type to use so that it can operate
-/// with either const or non-const slice iterators.
-class AllocaSlices::partition_iterator
- : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
- Partition> {
- friend class AllocaSlices;
-
- /// Most of the state for walking the partitions is held in a class
- /// with a nice interface for examining them.
- Partition P;
-
- /// We need to keep the end of the slices to know when to stop.
- AllocaSlices::iterator SE;
-
- /// We also need to keep track of the maximum split end offset seen.
- /// FIXME: Do we really?
- uint64_t MaxSplitSliceEndOffset = 0;
-
- /// Sets the partition to be empty at given iterator, and sets the
- /// end iterator.
- partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
- : P(SI), SE(SE) {
- // If not already at the end, advance our state to form the initial
- // partition.
- if (SI != SE)
- advance();
- }
-
- /// Advance the iterator to the next partition.
- ///
- /// Requires that the iterator not be at the end of the slices.
- void advance() {
- assert((P.SI != SE || !P.SplitTails.empty()) &&
- "Cannot advance past the end of the slices!");
-
- // Clear out any split uses which have ended.
- if (!P.SplitTails.empty()) {
- if (P.EndOffset >= MaxSplitSliceEndOffset) {
- // If we've finished all splits, this is easy.
- P.SplitTails.clear();
- MaxSplitSliceEndOffset = 0;
- } else {
- // Remove the uses which have ended in the prior partition. This
- // cannot change the max split slice end because we just checked that
- // the prior partition ended prior to that max.
+ /// Operands which will become dead if we rewrite the alloca.
+ ///
+ /// These are operands that in their particular use can be replaced with
+ /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// to PHI nodes and the like. They aren't entirely dead (there might be
+ /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
+ /// want to swap this particular input for undef to simplify the use lists of
+ /// the alloca.
+ SmallVector<Use *, 8> DeadOperands;
+};
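The insert() member above keeps the slice vector sorted by sorting only the freshly appended tail and then merging it into the already-sorted prefix. A compact standalone illustration of that sort-then-inplace_merge idiom, using plain integers rather than slices:

// Sort the appended tail, then merge it into the sorted prefix in place.
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Slices = {1, 4, 9};      // already sorted
  std::vector<int> NewSlices = {7, 2};
  auto OldSize = Slices.size();
  Slices.insert(Slices.end(), NewSlices.begin(), NewSlices.end());
  auto SliceI = Slices.begin() + OldSize;
  std::sort(SliceI, Slices.end());          // sort just the new tail
  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  assert(std::is_sorted(Slices.begin(), Slices.end()));
}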
+
+/// A partition of the slices.
+///
+/// An ephemeral representation for a range of slices which can be viewed as
+/// a partition of the alloca. This range represents a span of the alloca's
+/// memory which cannot be split, and provides access to all of the slices
+/// overlapping some part of the partition.
+///
+/// Objects of this type are produced by traversing the alloca's slices, but
+/// are only ephemeral and not persistent.
+class llvm::sroa::Partition {
+private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+ using iterator = AllocaSlices::iterator;
+
+ /// The beginning and ending offsets of the alloca for this
+ /// partition.
+ uint64_t BeginOffset = 0, EndOffset = 0;
+
+ /// The start and end iterators of this partition.
+ iterator SI, SJ;
+
+ /// A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+public:
+ /// The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
+
+ /// The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset.
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+ // split, unsplit, and unsplittable slices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+};
+
+/// An iterator over partitions of the alloca's slices.
+///
+/// This iterator implements the core algorithm for partitioning the alloca's
+/// slices. It is a forward iterator as we don't support backtracking for
+/// efficiency reasons, and re-use a single storage area to maintain the
+/// current set of split slices.
+///
+/// It is templated on the slice iterator type to use so that it can operate
+/// with either const or non-const slice iterators.
+class AllocaSlices::partition_iterator
+ : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
+ Partition> {
+ friend class AllocaSlices;
+
+ /// Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset = 0;
+
+ /// Sets the partition to be empty at given iterator, and sets the
+ /// end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
llvm::erase_if(P.SplitTails,
[&](Slice *S) { return S->endOffset() <= P.EndOffset; });
- assert(llvm::any_of(P.SplitTails,
- [&](Slice *S) {
- return S->endOffset() == MaxSplitSliceEndOffset;
- }) &&
- "Could not find the current max split slice offset!");
- assert(llvm::all_of(P.SplitTails,
- [&](Slice *S) {
- return S->endOffset() <= MaxSplitSliceEndOffset;
- }) &&
- "Max split slice end offset is not actually the max!");
- }
- }
-
- // If P.SI is already at the end, then we've cleared the split tail and
- // now have an end iterator.
- if (P.SI == SE) {
- assert(P.SplitTails.empty() && "Failed to clear the split slices!");
- return;
- }
-
- // If we had a non-empty partition previously, set up the state for
- // subsequent partitions.
- if (P.SI != P.SJ) {
- // Accumulate all the splittable slices which started in the old
- // partition into the split list.
- for (Slice &S : P)
- if (S.isSplittable() && S.endOffset() > P.EndOffset) {
- P.SplitTails.push_back(&S);
- MaxSplitSliceEndOffset =
- std::max(S.endOffset(), MaxSplitSliceEndOffset);
- }
-
- // Start from the end of the previous partition.
- P.SI = P.SJ;
-
- // If P.SI is now at the end, we at most have a tail of split slices.
- if (P.SI == SE) {
- P.BeginOffset = P.EndOffset;
- P.EndOffset = MaxSplitSliceEndOffset;
- return;
- }
-
- // If we have split slices and the next slice is after a gap and is
- // not splittable immediately form an empty partition for the split
- // slices up until the next slice begins.
- if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
- !P.SI->isSplittable()) {
- P.BeginOffset = P.EndOffset;
- P.EndOffset = P.SI->beginOffset();
- return;
- }
- }
-
- // OK, we need to consume new slices. Set the end offset based on the
- // current slice, and step SJ past it. The beginning offset of the
- // partition is the beginning offset of the next slice unless we have
- // pre-existing split slices that are continuing, in which case we begin
- // at the prior end offset.
- P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
- P.EndOffset = P.SI->endOffset();
- ++P.SJ;
-
- // There are two strategies to form a partition based on whether the
- // partition starts with an unsplittable slice or a splittable slice.
- if (!P.SI->isSplittable()) {
- // When we're forming an unsplittable region, it must always start at
- // the first slice and will extend through its end.
- assert(P.BeginOffset == P.SI->beginOffset());
-
- // Form a partition including all of the overlapping slices with this
- // unsplittable slice.
- while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
- if (!P.SJ->isSplittable())
- P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
- ++P.SJ;
- }
-
- // We have a partition across a set of overlapping unsplittable
- // partitions.
- return;
- }
-
- // If we're starting with a splittable slice, then we need to form
- // a synthetic partition spanning it and any other overlapping splittable
- // slices.
- assert(P.SI->isSplittable() && "Forming a splittable partition!");
-
- // Collect all of the overlapping splittable slices.
- while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
- P.SJ->isSplittable()) {
- P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
- ++P.SJ;
- }
-
- // Back up P.EndOffset if we ended the span early when encountering an
- // unsplittable slice. This synthesizes the early end offset of
- // a partition spanning only splittable slices.
- if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
- assert(!P.SJ->isSplittable());
- P.EndOffset = P.SJ->beginOffset();
- }
- }
-
-public:
- bool operator==(const partition_iterator &RHS) const {
- assert(SE == RHS.SE &&
- "End iterators don't match between compared partition iterators!");
-
- // The observed positions of partitions are marked by the P.SI iterator and
- // the emptiness of the split slices. The latter is only relevant when
- // P.SI == SE, as the end iterator will additionally have an empty split
- // slices list, but the prior may have the same P.SI and a tail of split
- // slices.
- if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
- assert(P.SJ == RHS.P.SJ &&
- "Same set of slices formed two different sized partitions!");
- assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
- "Same slice position with differently sized non-empty split "
- "slice tails!");
- return true;
- }
- return false;
- }
-
- partition_iterator &operator++() {
- advance();
- return *this;
- }
-
- Partition &operator*() { return P; }
-};
-
-/// A forward range over the partitions of the alloca's slices.
-///
-/// This accesses an iterator range over the partitions of the alloca's
-/// slices. It computes these partitions on the fly based on the overlapping
-/// offsets of the slices and the ability to split them. It will visit "empty"
-/// partitions to cover regions of the alloca only accessed via split
-/// slices.
-iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
- return make_range(partition_iterator(begin(), end()),
- partition_iterator(end(), end()));
-}
-
-static Value *foldSelectInst(SelectInst &SI) {
- // If the condition being selected on is a constant or the same value is
- // being selected between, fold the select. Yes this does (rarely) happen
- // early on.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
- return SI.getOperand(1 + CI->isZero());
- if (SI.getOperand(1) == SI.getOperand(2))
- return SI.getOperand(1);
-
- return nullptr;
-}
-
-/// A helper that folds a PHI node or a select.
-static Value *foldPHINodeOrSelectInst(Instruction &I) {
- if (PHINode *PN = dyn_cast<PHINode>(&I)) {
- // If PN merges together the same value, return that value.
- return PN->hasConstantValue();
- }
- return foldSelectInst(cast<SelectInst>(I));
-}
-
-/// Builder for the alloca slices.
-///
-/// This class builds a set of alloca slices by recursively visiting the uses
-/// of an alloca and making a slice for each load and store at each offset.
-class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
- friend class PtrUseVisitor<SliceBuilder>;
- friend class InstVisitor<SliceBuilder>;
-
- using Base = PtrUseVisitor<SliceBuilder>;
-
- const uint64_t AllocSize;
- AllocaSlices &AS;
-
- SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
- SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
-
- /// Set to de-duplicate dead instructions found in the use walk.
- SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
-
-public:
- SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
- : PtrUseVisitor<SliceBuilder>(DL),
- AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
- AS(AS) {}
-
-private:
- void markAsDead(Instruction &I) {
- if (VisitedDeadInsts.insert(&I).second)
- AS.DeadUsers.push_back(&I);
- }
-
- void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
- bool IsSplittable = false) {
- // Completely skip uses which have a zero size or start either before or
- // past the end of the allocation.
- if (Size == 0 || Offset.uge(AllocSize)) {
- LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
- << Offset
- << " which has zero size or starts outside of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
- return markAsDead(I);
- }
-
- uint64_t BeginOffset = Offset.getZExtValue();
- uint64_t EndOffset = BeginOffset + Size;
-
- // Clamp the end offset to the end of the allocation. Note that this is
- // formulated to handle even the case where "BeginOffset + Size" overflows.
- // This may appear superficially to be something we could ignore entirely,
- // but that is not so! There may be widened loads or PHI-node uses where
- // some instructions are dead but not others. We can't completely ignore
- // them, and so have to record at least the information here.
- assert(AllocSize >= BeginOffset); // Established above.
- if (Size > AllocSize - BeginOffset) {
- LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
- << Offset << " to remain within the " << AllocSize
- << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
- EndOffset = AllocSize;
- }
-
- AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
- }
-
- void visitBitCastInst(BitCastInst &BC) {
- if (BC.use_empty())
- return markAsDead(BC);
-
- return Base::visitBitCastInst(BC);
- }
-
- void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
- if (ASC.use_empty())
- return markAsDead(ASC);
-
- return Base::visitAddrSpaceCastInst(ASC);
- }
-
- void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- if (GEPI.use_empty())
- return markAsDead(GEPI);
-
- if (SROAStrictInbounds && GEPI.isInBounds()) {
- // FIXME: This is a manually un-factored variant of the basic code inside
- // of GEPs with checking of the inbounds invariant specified in the
- // langref in a very strict sense. If we ever want to enable
- // SROAStrictInbounds, this code should be factored cleanly into
- // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
- // by writing out the code here where we have the underlying allocation
- // size readily available.
- APInt GEPOffset = Offset;
- const DataLayout &DL = GEPI.getModule()->getDataLayout();
- for (gep_type_iterator GTI = gep_type_begin(GEPI),
- GTE = gep_type_end(GEPI);
- GTI != GTE; ++GTI) {
- ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
- if (!OpC)
- break;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- unsigned ElementIdx = OpC->getZExtValue();
- const StructLayout *SL = DL.getStructLayout(STy);
- GEPOffset +=
- APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
- } else {
- // For array or vector indices, scale the index by the size of the
- // type.
- APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
- GEPOffset +=
- Index *
- APInt(Offset.getBitWidth(),
- DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
- }
-
- // If this index has computed an intermediate pointer which is not
- // inbounds, then the result of the GEP is a poison value and we can
- // delete it and all uses.
- if (GEPOffset.ugt(AllocSize))
- return markAsDead(GEPI);
- }
- }
-
- return Base::visitGetElementPtrInst(GEPI);
- }
-
- void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
- uint64_t Size, bool IsVolatile) {
- // We allow splitting of non-volatile loads and stores where the type is an
- // integer type. These may be used to implement 'memcpy' or other "transfer
- // of bits" patterns.
- bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
-
- insertUse(I, Offset, Size, IsSplittable);
- }
-
- void visitLoadInst(LoadInst &LI) {
- assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
- "All simple FCA loads should have been pre-split");
-
- if (!IsOffsetKnown)
- return PI.setAborted(&LI);
-
- if (LI.isVolatile() &&
- LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
- return PI.setAborted(&LI);
-
+ assert(llvm::any_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(llvm::all_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator.
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+ // If we have split slices and the next slice is after a gap and is
+ // not splittable immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+ // partition is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have a partition across a set of overlapping unsplittable
+ // partitions.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+ // slices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // Back up P.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
+
+public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+ // The observed positions of partitions are marked by the P.SI iterator and
+ // the emptiness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices.
+ if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+ assert(P.SJ == RHS.P.SJ &&
+ "Same set of slices formed two different sized partitions!");
+ assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+ "Same slice position with differently sized non-empty split "
+ "slice tails!");
+ return true;
+ }
+ return false;
+ }
+
+ partition_iterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ Partition &operator*() { return P; }
+};
+
+/// A forward range over the partitions of the alloca's slices.
+///
+/// This accesses an iterator range over the partitions of the alloca's
+/// slices. It computes these partitions on the fly based on the overlapping
+/// offsets of the slices and the ability to split them. It will visit "empty"
+/// partitions to cover regions of the alloca only accessed via split
+/// slices.
+iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
+ return make_range(partition_iterator(begin(), end()),
+ partition_iterator(end(), end()));
+}
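As a rough standalone model of what the partition iterator computes, the sketch below coalesces overlapping [begin, end) intervals (assumed sorted by begin) into maximal partitions. It deliberately ignores splittable slices, split tails, and the "empty" partitions documented above; it only illustrates the basic interval-merging idea.

// Greatly simplified model of partition formation over sorted slices.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<uint64_t, uint64_t>> Slices = {{0, 4}, {2, 8}, {10, 12}};
  std::vector<std::pair<uint64_t, uint64_t>> Partitions;
  for (auto [B, E] : Slices) {
    if (!Partitions.empty() && B < Partitions.back().second)
      Partitions.back().second = std::max(Partitions.back().second, E); // extend
    else
      Partitions.push_back({B, E});                                     // new partition
  }
  for (auto [B, E] : Partitions)
    std::printf("[%llu, %llu)\n", (unsigned long long)B, (unsigned long long)E);
  // Prints [0, 8) and [10, 12).
}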
+
+static Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1 + CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2))
+ return SI.getOperand(1);
+
+ return nullptr;
+}
+
+/// A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ // If PN merges together the same value, return that value.
+ return PN->hasConstantValue();
+ }
+ return foldSelectInst(cast<SelectInst>(I));
+}
+
+/// Builder for the alloca slices.
+///
+/// This class builds a set of alloca slices by recursively visiting the uses
+/// of an alloca and making a slice for each load and store at each offset.
+class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
+ friend class PtrUseVisitor<SliceBuilder>;
+ friend class InstVisitor<SliceBuilder>;
+
+ using Base = PtrUseVisitor<SliceBuilder>;
+
+ const uint64_t AllocSize;
+ AllocaSlices &AS;
+
+ SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
+ SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
+
+ /// Set to de-duplicate dead instructions found in the use walk.
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+ SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
+ : PtrUseVisitor<SliceBuilder>(DL),
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
+ AS(AS) {}
+
+private:
+ void markAsDead(Instruction &I) {
+ if (VisitedDeadInsts.insert(&I).second)
+ AS.DeadUsers.push_back(&I);
+ }
+
+ void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
+ bool IsSplittable = false) {
+ // Completely skip uses which have a zero size or start either before or
+ // past the end of the allocation.
+ if (Size == 0 || Offset.uge(AllocSize)) {
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
+ << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ return markAsDead(I);
+ }
+
+ uint64_t BeginOffset = Offset.getZExtValue();
+ uint64_t EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ // This may appear superficially to be something we could ignore entirely,
+ // but that is not so! There may be widened loads or PHI-node uses where
+ // some instructions are dead but not others. We can't completely ignore
+ // them, and so have to record at least the information here.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset) {
+ LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
+ << Offset << " to remain within the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ EndOffset = AllocSize;
+ }
+
+ AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+ }
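The clamping in insertUse above is phrased as "Size > AllocSize - BeginOffset" precisely so that it stays correct even when BeginOffset + Size would wrap around. A small standalone check of that overflow-safe formulation (clampedEnd is a hypothetical helper written for this sketch only):

// Overflow-safe clamp of a use's end offset to the allocation size.
#include <cassert>
#include <cstdint>

static uint64_t clampedEnd(uint64_t AllocSize, uint64_t Begin, uint64_t Size) {
  assert(Begin < AllocSize && "caller already rejected out-of-range begins");
  // Comparing against AllocSize - Begin avoids computing Begin + Size,
  // which could wrap around for huge Size values.
  if (Size > AllocSize - Begin)
    return AllocSize;
  return Begin + Size;
}

int main() {
  assert(clampedEnd(128, 16, 32) == 48);           // fits: end left untouched
  assert(clampedEnd(128, 16, UINT64_MAX) == 128);  // would overflow: clamped
}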
+
+ void visitBitCastInst(BitCastInst &BC) {
+ if (BC.use_empty())
+ return markAsDead(BC);
+
+ return Base::visitBitCastInst(BC);
+ }
+
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ if (ASC.use_empty())
+ return markAsDead(ASC);
+
+ return Base::visitAddrSpaceCastInst(ASC);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (GEPI.use_empty())
+ return markAsDead(GEPI);
+
+ if (SROAStrictInbounds && GEPI.isInBounds()) {
+ // FIXME: This is a manually un-factored variant of the basic code inside
+ // of GEPs with checking of the inbounds invariant specified in the
+ // langref in a very strict sense. If we ever want to enable
+ // SROAStrictInbounds, this code should be factored cleanly into
+ // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+ // by writing out the code here where we have the underlying allocation
+ // size readily available.
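+ // For example (illustrative): with this option enabled, an inbounds GEP
+ // indexing element 5 of a [4 x i32] alloca accumulates a 20 byte offset,
+ // which exceeds the 16 byte allocation, so the GEP is marked dead.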
+ APInt GEPOffset = Offset;
+ const DataLayout &DL = GEPI.getModule()->getDataLayout();
+ for (gep_type_iterator GTI = gep_type_begin(GEPI),
+ GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ break;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = DL.getStructLayout(STy);
+ GEPOffset +=
+ APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+ } else {
+ // For array or vector indices, scale the index by the size of the
+ // type.
+ APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+ GEPOffset +=
+ Index *
+ APInt(Offset.getBitWidth(),
+ DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
+ }
+
+ // If this index has computed an intermediate pointer which is not
+ // inbounds, then the result of the GEP is a poison value and we can
+ // delete it and all uses.
+ if (GEPOffset.ugt(AllocSize))
+ return markAsDead(GEPI);
+ }
+ }
+
+ return Base::visitGetElementPtrInst(GEPI);
+ }
+
+ void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+ uint64_t Size, bool IsVolatile) {
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
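+ // For example (illustrative): an i64 load shuttling 8 bytes of the alloca
+ // may later be split apart, whereas a volatile load or a double load is
+ // recorded as unsplittable.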
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
+
+ insertUse(I, Offset, Size, IsSplittable);
+ }
+
+ void visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&LI);
+
+ if (LI.isVolatile() &&
+ LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
+ return PI.setAborted(&LI);
+
if (isa<ScalableVectorType>(LI.getType()))
return PI.setAborted(&LI);
- uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
- return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
- }
-
- void visitStoreInst(StoreInst &SI) {
- Value *ValOp = SI.getValueOperand();
- if (ValOp == *U)
- return PI.setEscapedAndAborted(&SI);
- if (!IsOffsetKnown)
- return PI.setAborted(&SI);
-
- if (SI.isVolatile() &&
- SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
- return PI.setAborted(&SI);
-
+ uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
+ return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return PI.setEscapedAndAborted(&SI);
+ if (!IsOffsetKnown)
+ return PI.setAborted(&SI);
+
+ if (SI.isVolatile() &&
+ SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
+ return PI.setAborted(&SI);
+
if (isa<ScalableVectorType>(ValOp->getType()))
return PI.setAborted(&SI);
- uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
-
- // If this memory access can be shown to *statically* extend outside the
- // bounds of the allocation, its behavior is undefined, so simply
- // ignore it. Note that this is more strict than the generic clamping
- // behavior of insertUse. We also try to handle cases which might run the
- // risk of overflow.
- // FIXME: We should instead consider the pointer to have escaped if this
- // function is being instrumented for addressing bugs or race conditions.
- if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
- LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
- << Offset << " which extends past the end of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << SI << "\n");
- return markAsDead(SI);
- }
-
- assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
- "All simple FCA stores should have been pre-split");
- handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
- }
-
- void visitMemSetInst(MemSetInst &II) {
- assert(II.getRawDest() == *U && "Pointer use is not the destination?");
- ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- if ((Length && Length->getValue() == 0) ||
- (IsOffsetKnown && Offset.uge(AllocSize)))
- // Zero-length mem transfer intrinsics can be ignored entirely.
- return markAsDead(II);
-
- if (!IsOffsetKnown)
- return PI.setAborted(&II);
-
- // Don't replace this with a store with a different address space. TODO:
- // Use a store with the casted new alloca?
- if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
- return PI.setAborted(&II);
-
- insertUse(II, Offset, Length ? Length->getLimitedValue()
- : AllocSize - Offset.getLimitedValue(),
- (bool)Length);
- }
-
- void visitMemTransferInst(MemTransferInst &II) {
- ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- if (Length && Length->getValue() == 0)
- // Zero-length mem transfer intrinsics can be ignored entirely.
- return markAsDead(II);
-
- // Because we can visit these intrinsics twice, also check whether the
- // first visit already marked this instruction as dead. If so, skip it.
- if (VisitedDeadInsts.count(&II))
- return;
-
- if (!IsOffsetKnown)
- return PI.setAborted(&II);
-
- // Don't replace this with a load/store with a different address space.
- // TODO: Use a store with the casted new alloca?
- if (II.isVolatile() &&
- (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
- II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
- return PI.setAborted(&II);
-
- // This side of the transfer is completely out-of-bounds, and so we can
- // nuke the entire transfer. However, we also need to nuke the other side
- // if already added to our partitions.
- // FIXME: Yet another place we really should bypass this when
- // instrumenting for ASan.
- if (Offset.uge(AllocSize)) {
- SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
- MemTransferSliceMap.find(&II);
- if (MTPI != MemTransferSliceMap.end())
- AS.Slices[MTPI->second].kill();
- return markAsDead(II);
- }
-
- uint64_t RawOffset = Offset.getLimitedValue();
- uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
-
- // Check for the special case where the same exact value is used for both
- // source and dest.
- if (*U == II.getRawDest() && *U == II.getRawSource()) {
- // For non-volatile transfers this is a no-op.
- if (!II.isVolatile())
- return markAsDead(II);
-
- return insertUse(II, Offset, Size, /*IsSplittable=*/false);
- }
-
- // If we have seen both source and destination for a mem transfer, then
- // they both point to the same alloca.
- bool Inserted;
- SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
- std::tie(MTPI, Inserted) =
- MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
- unsigned PrevIdx = MTPI->second;
- if (!Inserted) {
- Slice &PrevP = AS.Slices[PrevIdx];
-
- // Check if the begin offsets match and this is a non-volatile transfer.
- // In that case, we can completely elide the transfer.
- if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
- PrevP.kill();
- return markAsDead(II);
- }
-
- // Otherwise we have an offset transfer within the same alloca. We can't
- // split those.
- PrevP.makeUnsplittable();
- }
-
- // Insert the use now that we've fixed up the splittable nature.
- insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
-
- // Check that we ended up with a valid index in the map.
- assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
- "Map index doesn't point back to a slice with this user.");
- }
-
- // Disable SRoA for any intrinsics except for lifetime invariants.
- // FIXME: What about debug intrinsics? This matches old behavior, but
- // doesn't make sense.
- void visitIntrinsicInst(IntrinsicInst &II) {
+ uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse. We also try to handle cases which might run the
+ // risk of overflow.
+ // FIXME: We should instead consider the pointer to have escaped if this
+ // function is being instrumented for addressing bugs or race conditions.
+ if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
+ << Offset << " which extends past the end of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
+ return markAsDead(SI);
+ }
+
+ assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+ "All simple FCA stores should have been pre-split");
+ handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && Offset.uge(AllocSize)))
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ // Don't replace this with a store with a different address space. TODO:
+ // Use a store with the casted new alloca?
+ if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
+ return PI.setAborted(&II);
+
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
+ (bool)Length);
+ }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (Length && Length->getValue() == 0)
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ // Because we can visit these intrinsics twice, also check whether the
+ // first visit already marked this instruction as dead. If so, skip it.
+ if (VisitedDeadInsts.count(&II))
+ return;
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ // Don't replace this with a load/store with a different address space.
+ // TODO: Use a store with the casted new alloca?
+ if (II.isVolatile() &&
+ (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
+ II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
+ return PI.setAborted(&II);
+
+ // This side of the transfer is completely out-of-bounds, and so we can
+ // nuke the entire transfer. However, we also need to nuke the other side
+ // if already added to our partitions.
+ // FIXME: Yet another place we really should bypass this when
+ // instrumenting for ASan.
+ if (Offset.uge(AllocSize)) {
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
+ if (MTPI != MemTransferSliceMap.end())
+ AS.Slices[MTPI->second].kill();
+ return markAsDead(II);
+ }
+
+ uint64_t RawOffset = Offset.getLimitedValue();
+ uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
+
+ // Check for the special case where the same exact value is used for both
+ // source and dest.
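+ // (Illustrative: a non-volatile memcpy from the alloca onto itself copies
+ // bytes in place, so it is simply dropped; a volatile copy is kept as a
+ // single unsplittable slice.)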
+ if (*U == II.getRawDest() && *U == II.getRawSource()) {
+ // For non-volatile transfers this is a no-op.
+ if (!II.isVolatile())
+ return markAsDead(II);
+
+ return insertUse(II, Offset, Size, /*IsSplittable=*/false);
+ }
+
+ // If we have seen both source and destination for a mem transfer, then
+ // they both point to the same alloca.
+ bool Inserted;
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
+ std::tie(MTPI, Inserted) =
+ MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
+ unsigned PrevIdx = MTPI->second;
+ if (!Inserted) {
+ Slice &PrevP = AS.Slices[PrevIdx];
+
+ // Check if the begin offsets match and this is a non-volatile transfer.
+ // In that case, we can completely elide the transfer.
+ if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
+ PrevP.kill();
+ return markAsDead(II);
+ }
+
+ // Otherwise we have an offset transfer within the same alloca. We can't
+ // split those.
+ PrevP.makeUnsplittable();
+ }
+
+ // Insert the use now that we've fixed up the splittable nature.
+ insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
+
+ // Check that we ended up with a valid index in the map.
+ assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
+ "Map index doesn't point back to a slice with this user.");
+ }
+
+ // Disable SRoA for any intrinsics except for lifetime invariants.
+ // FIXME: What about debug intrinsics? This matches old behavior, but
+ // doesn't make sense.
+ void visitIntrinsicInst(IntrinsicInst &II) {
if (II.isDroppable()) {
AS.DeadUseIfPromotable.push_back(U);
return;
}
- if (!IsOffsetKnown)
- return PI.setAborted(&II);
-
- if (II.isLifetimeStartOrEnd()) {
- ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
- uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
- Length->getLimitedValue());
- insertUse(II, Offset, Size, true);
- return;
- }
-
- Base::visitIntrinsicInst(II);
- }
-
- Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
- // We consider any PHI or select that results in a direct load or store of
- // the same offset to be a viable use for slicing purposes. These uses
- // are considered unsplittable and the size is the maximum loaded or stored
- // size.
- SmallPtrSet<Instruction *, 4> Visited;
- SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
- Visited.insert(Root);
- Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
- const DataLayout &DL = Root->getModule()->getDataLayout();
- // If there are no loads or stores, the access is dead. We mark that as
- // a size zero access.
- Size = 0;
- do {
- Instruction *I, *UsedI;
- std::tie(UsedI, I) = Uses.pop_back_val();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- Size = std::max(Size,
- DL.getTypeStoreSize(LI->getType()).getFixedSize());
- continue;
- }
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- Value *Op = SI->getOperand(0);
- if (Op == UsedI)
- return SI;
- Size = std::max(Size,
- DL.getTypeStoreSize(Op->getType()).getFixedSize());
- continue;
- }
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
- if (!GEP->hasAllZeroIndices())
- return GEP;
- } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
- !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
- return I;
- }
-
- for (User *U : I->users())
- if (Visited.insert(cast<Instruction>(U)).second)
- Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
- } while (!Uses.empty());
-
- return nullptr;
- }
-
- void visitPHINodeOrSelectInst(Instruction &I) {
- assert(isa<PHINode>(I) || isa<SelectInst>(I));
- if (I.use_empty())
- return markAsDead(I);
-
- // TODO: We could use SimplifyInstruction here to fold PHINodes and
- // SelectInsts. However, doing so requires changing the current
- // dead-operand-tracking mechanism. For instance, suppose neither loading
- // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
- // trap either. However, if we simply replace %U with undef using the
- // current dead-operand-tracking mechanism, "load (select undef, undef,
- // %other)" may trap because the select may return the first operand
- // "undef".
- if (Value *Result = foldPHINodeOrSelectInst(I)) {
- if (Result == *U)
- // If the result of the constant fold will be the pointer, recurse
- // through the PHI/select as if we had RAUW'ed it.
- enqueueUsers(I);
- else
- // Otherwise the operand to the PHI/select is dead, and we can replace
- // it with undef.
- AS.DeadOperands.push_back(U);
-
- return;
- }
-
- if (!IsOffsetKnown)
- return PI.setAborted(&I);
-
- // See if we already have computed info on this node.
- uint64_t &Size = PHIOrSelectSizes[&I];
- if (!Size) {
- // This is a new PHI/Select, check for an unsafe use of it.
- if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
- return PI.setAborted(UnsafeI);
- }
-
- // For PHI and select operands outside the alloca, we can't nuke the entire
- // phi or select -- the other side might still be relevant, so we special
- // case them here and use a separate structure to track the operands
- // themselves which should be replaced with undef.
- // FIXME: This should instead be escaped in the event we're instrumenting
- // for address sanitization.
- if (Offset.uge(AllocSize)) {
- AS.DeadOperands.push_back(U);
- return;
- }
-
- insertUse(I, Offset, Size);
- }
-
- void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
-
- void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
-
- /// Disable SROA entirely if there are unhandled users of the alloca.
- void visitInstruction(Instruction &I) { PI.setAborted(&I); }
-};
-
-AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
- :
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- AI(AI),
-#endif
- PointerEscapingInstr(nullptr) {
- SliceBuilder PB(DL, AI, *this);
- SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
- if (PtrI.isEscaped() || PtrI.isAborted()) {
- // FIXME: We should sink the escape vs. abort info into the caller nicely,
- // possibly by just storing the PtrInfo in the AllocaSlices.
- PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
- : PtrI.getAbortingInst();
- assert(PointerEscapingInstr && "Did not track a bad instruction");
- return;
- }
-
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ if (II.isLifetimeStartOrEnd()) {
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+ Length->getLimitedValue());
+ insertUse(II, Offset, Size, true);
+ return;
+ }
+
+ Base::visitIntrinsicInst(II);
+ }
+
+ Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+ // We consider any PHI or select that results in a direct load or store of
+ // the same offset to be a viable use for slicing purposes. These uses
+ // are considered unsplittable and the size is the maximum loaded or stored
+ // size.
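+ // For example (illustrative): if %p = select i1 %c, i32* %a, i32* %b is
+ // only ever used by i32 loads, Size becomes 4; storing %p itself or
+ // indexing it with a non-zero GEP makes the use unsafe.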
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+ Visited.insert(Root);
+ Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ const DataLayout &DL = Root->getModule()->getDataLayout();
+ // If there are no loads or stores, the access is dead. We mark that as
+ // a size zero access.
+ Size = 0;
+ do {
+ Instruction *I, *UsedI;
+ std::tie(UsedI, I) = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Size = std::max(Size,
+ DL.getTypeStoreSize(LI->getType()).getFixedSize());
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Op = SI->getOperand(0);
+ if (Op == UsedI)
+ return SI;
+ Size = std::max(Size,
+ DL.getTypeStoreSize(Op->getType()).getFixedSize());
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllZeroIndices())
+ return GEP;
+ } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+ !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
+ return I;
+ }
+
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
+ } while (!Uses.empty());
+
+ return nullptr;
+ }
+
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ if (I.use_empty())
+ return markAsDead(I);
+
+ // TODO: We could use SimplifyInstruction here to fold PHINodes and
+ // SelectInsts. However, doing so requires changing the current
+ // dead-operand-tracking mechanism. For instance, suppose neither loading
+ // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+ // trap either. However, if we simply replace %U with undef using the
+ // current dead-operand-tracking mechanism, "load (select undef, undef,
+ // %other)" may trap because the select may return the first operand
+ // "undef".
+ if (Value *Result = foldPHINodeOrSelectInst(I)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the PHI/select as if we had RAUW'ed it.
+ enqueueUsers(I);
+ else
+ // Otherwise the operand to the PHI/select is dead, and we can replace
+ // it with undef.
+ AS.DeadOperands.push_back(U);
+
+ return;
+ }
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&I);
+
+ // See if we already have computed info on this node.
+ uint64_t &Size = PHIOrSelectSizes[&I];
+ if (!Size) {
+ // This is a new PHI/Select, check for an unsafe use of it.
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
+ return PI.setAborted(UnsafeI);
+ }
+
+ // For PHI and select operands outside the alloca, we can't nuke the entire
+ // phi or select -- the other side might still be relevant, so we special
+ // case them here and use a separate structure to track the operands
+ // themselves which should be replaced with undef.
+ // FIXME: This should instead be escaped in the event we're instrumenting
+ // for address sanitization.
+ if (Offset.uge(AllocSize)) {
+ AS.DeadOperands.push_back(U);
+ return;
+ }
+
+ insertUse(I, Offset, Size);
+ }
+
+ void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
+
+ void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
+
+ /// Disable SROA entirely if there are unhandled users of the alloca.
+ void visitInstruction(Instruction &I) { PI.setAborted(&I); }
+};
+
+AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
+ :
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ AI(AI),
+#endif
+ PointerEscapingInstr(nullptr) {
+ SliceBuilder PB(DL, AI, *this);
+ SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+ if (PtrI.isEscaped() || PtrI.isAborted()) {
+ // FIXME: We should sink the escape vs. abort info into the caller nicely,
+ // possibly by just storing the PtrInfo in the AllocaSlices.
+ PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+ : PtrI.getAbortingInst();
+ assert(PointerEscapingInstr && "Did not track a bad instruction");
+ return;
+ }
+
llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
-
- // Sort the uses. This arranges for the offsets to be in ascending order,
- // and the sizes to be in descending order.
+
+ // Sort the uses. This arranges for the offsets to be in ascending order,
+ // and the sizes to be in descending order.
llvm::stable_sort(Slices);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-
-void AllocaSlices::print(raw_ostream &OS, const_iterator I,
- StringRef Indent) const {
- printSlice(OS, I, Indent);
- OS << "\n";
- printUse(OS, I, Indent);
-}
-
-void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
- StringRef Indent) const {
- OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
- << " slice #" << (I - begin())
- << (I->isSplittable() ? " (splittable)" : "");
-}
-
-void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
- StringRef Indent) const {
- OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
-}
-
-void AllocaSlices::print(raw_ostream &OS) const {
- if (PointerEscapingInstr) {
- OS << "Can't analyze slices for alloca: " << AI << "\n"
- << " A pointer to this alloca escaped by:\n"
- << " " << *PointerEscapingInstr << "\n";
- return;
- }
-
- OS << "Slices of alloca: " << AI << "\n";
- for (const_iterator I = begin(), E = end(); I != E; ++I)
- print(OS, I);
-}
-
-LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
- print(dbgs(), I);
-}
-LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
-
-#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-
-/// Walk the range of a partitioning looking for a common type to cover this
-/// sequence of slices.
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaSlices::print(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ printSlice(OS, I, Indent);
+ OS << "\n";
+ printUse(OS, I, Indent);
+}
+
+void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
+ << " slice #" << (I - begin())
+ << (I->isSplittable() ? " (splittable)" : "");
+}
+
+void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
+}
+
+void AllocaSlices::print(raw_ostream &OS) const {
+ if (PointerEscapingInstr) {
+ OS << "Can't analyze slices for alloca: " << AI << "\n"
+ << " A pointer to this alloca escaped by:\n"
+ << " " << *PointerEscapingInstr << "\n";
+ return;
+ }
+
+ OS << "Slices of alloca: " << AI << "\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ print(OS, I);
+}
+
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+ print(dbgs(), I);
+}
+LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Walk the range of a partitioning looking for a common type to cover this
+/// sequence of slices.
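+/// For example (illustrative): if every slice spanning the whole partition is
+/// accessed as float, float is the common type; if one such slice is used as
+/// i32 and another as float, no common type exists and only the integer
+/// fallback (i32) is reported.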
static std::pair<Type *, IntegerType *>
findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
uint64_t EndOffset) {
- Type *Ty = nullptr;
- bool TyIsCommon = true;
- IntegerType *ITy = nullptr;
-
- // Note that we need to look at *every* alloca slice's Use to ensure we
- // always get consistent results regardless of the order of slices.
- for (AllocaSlices::const_iterator I = B; I != E; ++I) {
- Use *U = I->getUse();
- if (isa<IntrinsicInst>(*U->getUser()))
- continue;
- if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
- continue;
-
- Type *UserTy = nullptr;
- if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
- UserTy = LI->getType();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
- UserTy = SI->getValueOperand()->getType();
- }
-
- if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
- // If the type is larger than the partition, skip it. We only encounter
- // this for split integer operations where we want to use the type of the
- // entity causing the split. Also skip if the type is not a byte width
- // multiple.
- if (UserITy->getBitWidth() % 8 != 0 ||
- UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
- continue;
-
- // Track the largest bitwidth integer type used in this way in case there
- // is no common type.
- if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
- ITy = UserITy;
- }
-
- // To avoid depending on the order of slices, Ty and TyIsCommon must not
- // depend on types skipped above.
- if (!UserTy || (Ty && Ty != UserTy))
- TyIsCommon = false; // Give up on anything but an iN type.
- else
- Ty = UserTy;
- }
-
+ Type *Ty = nullptr;
+ bool TyIsCommon = true;
+ IntegerType *ITy = nullptr;
+
+ // Note that we need to look at *every* alloca slice's Use to ensure we
+ // always get consistent results regardless of the order of slices.
+ for (AllocaSlices::const_iterator I = B; I != E; ++I) {
+ Use *U = I->getUse();
+ if (isa<IntrinsicInst>(*U->getUser()))
+ continue;
+ if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
+ continue;
+
+ Type *UserTy = nullptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ UserTy = LI->getType();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ UserTy = SI->getValueOperand()->getType();
+ }
+
+ if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
+ // If the type is larger than the partition, skip it. We only encounter
+ // this for split integer operations where we want to use the type of the
+ // entity causing the split. Also skip if the type is not a byte width
+ // multiple.
+ if (UserITy->getBitWidth() % 8 != 0 ||
+ UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+ continue;
+
+ // Track the largest bitwidth integer type used in this way in case there
+ // is no common type.
+ if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+ ITy = UserITy;
+ }
+
+ // To avoid depending on the order of slices, Ty and TyIsCommon must not
+ // depend on types skipped above.
+ if (!UserTy || (Ty && Ty != UserTy))
+ TyIsCommon = false; // Give up on anything but an iN type.
+ else
+ Ty = UserTy;
+ }
+
return {TyIsCommon ? Ty : nullptr, ITy};
-}
-
-/// PHI instructions that use an alloca and are subsequently loaded can be
-/// rewritten to load both input pointers in the pred blocks and then PHI the
-/// results, allowing the load of the alloca to be promoted.
-/// From this:
-/// %P2 = phi [i32* %Alloca, i32* %Other]
-/// %V = load i32* %P2
-/// to:
-/// %V1 = load i32* %Alloca -> will be mem2reg'd
-/// ...
-/// %V2 = load i32* %Other
-/// ...
-/// %V = phi [i32 %V1, i32 %V2]
-///
-/// We can do this to a select if its only uses are loads and if the operands
-/// to the select can be loaded unconditionally.
-///
-/// FIXME: This should be hoisted into a generic utility, likely in
-/// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN) {
- const DataLayout &DL = PN.getModule()->getDataLayout();
-
- // For now, we can only do this promotion if the load is in the same block
- // as the PHI, and if there are no stores between the phi and load.
- // TODO: Allow recursive phi users.
- // TODO: Allow stores.
- BasicBlock *BB = PN.getParent();
- Align MaxAlign;
- uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
- APInt MaxSize(APWidth, 0);
- bool HaveLoad = false;
- for (User *U : PN.users()) {
- LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI || !LI->isSimple())
- return false;
-
- // For now we only allow loads in the same block as the PHI. This is
- // a common case that happens when instcombine merges two loads through
- // a PHI.
- if (LI->getParent() != BB)
- return false;
-
- // Ensure that there are no instructions between the PHI and the load that
- // could store.
- for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
- if (BBI->mayWriteToMemory())
- return false;
-
- uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
- MaxAlign = std::max(MaxAlign, LI->getAlign());
- MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
- HaveLoad = true;
- }
-
- if (!HaveLoad)
- return false;
-
- // We can only transform this if it is safe to push the loads into the
- // predecessor blocks. The only thing to watch out for is that we can't put
- // a possibly trapping load in the predecessor if it is a critical edge.
- for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
- Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
- Value *InVal = PN.getIncomingValue(Idx);
-
- // If the value is produced by the terminator of the predecessor (an
- // invoke) or it has side-effects, there is no valid place to put a load
- // in the predecessor.
- if (TI == InVal || TI->mayHaveSideEffects())
- return false;
-
- // If the predecessor has a single successor, then the edge isn't
- // critical.
- if (TI->getNumSuccessors() == 1)
- continue;
-
- // If this pointer is always safe to load, or if we can prove that there
- // is already a load in the block, then we can move the load to the pred
- // block.
- if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
- continue;
-
- return false;
- }
-
- return true;
-}
-
-static void speculatePHINodeLoads(PHINode &PN) {
- LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
-
- LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
- Type *LoadTy = SomeLoad->getType();
- IRBuilderTy PHIBuilder(&PN);
- PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
- PN.getName() + ".sroa.speculated");
-
- // Get the AA tags and alignment to use from one of the loads. It does not
- // matter which one we pick or whether any of them differ.
- AAMDNodes AATags;
- SomeLoad->getAAMetadata(AATags);
- Align Alignment = SomeLoad->getAlign();
-
- // Rewrite all loads of the PN to use the new PHI.
- while (!PN.use_empty()) {
- LoadInst *LI = cast<LoadInst>(PN.user_back());
- LI->replaceAllUsesWith(NewPN);
- LI->eraseFromParent();
- }
-
- // Inject loads into all of the pred blocks.
- DenseMap<BasicBlock*, Value*> InjectedLoads;
- for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
- BasicBlock *Pred = PN.getIncomingBlock(Idx);
- Value *InVal = PN.getIncomingValue(Idx);
-
- // A PHI node is allowed to have multiple (duplicated) entries for the same
- // basic block, as long as the value is the same. So if we already injected
- // a load in the predecessor, then we should reuse the same load for all
- // duplicated entries.
- if (Value* V = InjectedLoads.lookup(Pred)) {
- NewPN->addIncoming(V, Pred);
- continue;
- }
-
- Instruction *TI = Pred->getTerminator();
- IRBuilderTy PredBuilder(TI);
-
- LoadInst *Load = PredBuilder.CreateAlignedLoad(
- LoadTy, InVal, Alignment,
- (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
- ++NumLoadsSpeculated;
- if (AATags)
- Load->setAAMetadata(AATags);
- NewPN->addIncoming(Load, Pred);
- InjectedLoads[Pred] = Load;
- }
-
- LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
- PN.eraseFromParent();
-}
-
-/// Select instructions that use an alloca and are subsequently loaded can be
-/// rewritten to load both input pointers and then select between the result,
-/// allowing the load of the alloca to be promoted.
-/// From this:
-/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
-/// %V = load i32* %P2
-/// to:
-/// %V1 = load i32* %Alloca -> will be mem2reg'd
-/// %V2 = load i32* %Other
-/// %V = select i1 %cond, i32 %V1, i32 %V2
-///
-/// We can do this to a select if its only uses are loads and if the operand
-/// to the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst &SI) {
- Value *TValue = SI.getTrueValue();
- Value *FValue = SI.getFalseValue();
- const DataLayout &DL = SI.getModule()->getDataLayout();
-
- for (User *U : SI.users()) {
- LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI || !LI->isSimple())
- return false;
-
- // Both operands to the select need to be dereferenceable, either
- // absolutely (e.g. allocas) or at this point because we can see other
- // accesses to them.
- if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
- LI->getAlign(), DL, LI))
- return false;
- if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
- LI->getAlign(), DL, LI))
- return false;
- }
-
- return true;
-}
-
-static void speculateSelectInstLoads(SelectInst &SI) {
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
-
- IRBuilderTy IRB(&SI);
- Value *TV = SI.getTrueValue();
- Value *FV = SI.getFalseValue();
- // Replace the loads of the select with a select of two loads.
- while (!SI.use_empty()) {
- LoadInst *LI = cast<LoadInst>(SI.user_back());
- assert(LI->isSimple() && "We only speculate simple loads");
-
- IRB.SetInsertPoint(LI);
- LoadInst *TL = IRB.CreateLoad(LI->getType(), TV,
- LI->getName() + ".sroa.speculate.load.true");
- LoadInst *FL = IRB.CreateLoad(LI->getType(), FV,
- LI->getName() + ".sroa.speculate.load.false");
- NumLoadsSpeculated += 2;
-
- // Transfer alignment and AA info if present.
- TL->setAlignment(LI->getAlign());
- FL->setAlignment(LI->getAlign());
-
- AAMDNodes Tags;
- LI->getAAMetadata(Tags);
- if (Tags) {
- TL->setAAMetadata(Tags);
- FL->setAAMetadata(Tags);
- }
-
- Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
- LI->getName() + ".sroa.speculated");
-
- LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
- LI->replaceAllUsesWith(V);
- LI->eraseFromParent();
- }
- SI.eraseFromParent();
-}
-
-/// Build a GEP out of a base pointer and indices.
-///
-/// This will return the BasePtr if that is valid, or build a new GEP
-/// instruction using the IRBuilder if GEP-ing is needed.
-static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+}
+
+/// PHI instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers in the pred blocks and then PHI the
+/// results, allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = phi [i32* %Alloca, i32* %Other]
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// ...
+/// %V2 = load i32* %Other
+/// ...
+/// %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operands
+/// to the select can be loaded unconditionally.
+///
+/// FIXME: This should be hoisted into a generic utility, likely in
+/// Transforms/Util/Local.h
+static bool isSafePHIToSpeculate(PHINode &PN) {
+ const DataLayout &DL = PN.getModule()->getDataLayout();
+
+ // For now, we can only do this promotion if the load is in the same block
+ // as the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN.getParent();
+ Align MaxAlign;
+ uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
+ APInt MaxSize(APWidth, 0);
+ bool HaveLoad = false;
+ for (User *U : PN.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // For now we only allow loads in the same block as the PHI. This is
+ // a common case that happens when instcombine merges two loads through
+ // a PHI.
+ if (LI->getParent() != BB)
+ return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
+ MaxAlign = std::max(MaxAlign, LI->getAlign());
+ MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
+ HaveLoad = true;
+ }
+
+ if (!HaveLoad)
+ return false;
+
+ // We can only transform this if it is safe to push the loads into the
+ // predecessor blocks. The only thing to watch out for is that we can't put
+ // a possibly trapping load in the predecessor if it is a critical edge.
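+ // For example (illustrative): if a predecessor's conditional branch also
+ // targets a block that never touches %Other, hoisting the load of %Other
+ // into that predecessor would execute a load the original program never
+ // performed on that path.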
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // If the value is produced by the terminator of the predecessor (an
+ // invoke) or it has side-effects, there is no valid place to put a load
+ // in the predecessor.
+ if (TI == InVal || TI->mayHaveSideEffects())
+ return false;
+
+ // If the predecessor has a single successor, then the edge isn't
+ // critical.
+ if (TI->getNumSuccessors() == 1)
+ continue;
+
+ // If this pointer is always safe to load, or if we can prove that there
+ // is already a load in the block, then we can move the load to the pred
+ // block.
+ if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
+ continue;
+
+ return false;
+ }
+
+ return true;
+}
+
+static void speculatePHINodeLoads(PHINode &PN) {
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
+
+ LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
+ Type *LoadTy = SomeLoad->getType();
+ IRBuilderTy PHIBuilder(&PN);
+ PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
+
+ // Get the AA tags and alignment to use from one of the loads. It does not
+ // matter which one we pick or whether any of them differ.
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
+ Align Alignment = SomeLoad->getAlign();
+
+ // Rewrite all loads of the PN to use the new PHI.
+ while (!PN.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(PN.user_back());
+ LI->replaceAllUsesWith(NewPN);
+ LI->eraseFromParent();
+ }
+
+ // Inject loads into all of the pred blocks.
+ DenseMap<BasicBlock*, Value*> InjectedLoads;
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ BasicBlock *Pred = PN.getIncomingBlock(Idx);
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // A PHI node is allowed to have multiple (duplicated) entries for the same
+ // basic block, as long as the value is the same. So if we already injected
+ // a load in the predecessor, then we should reuse the same load for all
+ // duplicated entries.
+ if (Value* V = InjectedLoads.lookup(Pred)) {
+ NewPN->addIncoming(V, Pred);
+ continue;
+ }
+
+ Instruction *TI = Pred->getTerminator();
+ IRBuilderTy PredBuilder(TI);
+
+ LoadInst *Load = PredBuilder.CreateAlignedLoad(
+ LoadTy, InVal, Alignment,
+ (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
+ ++NumLoadsSpeculated;
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ NewPN->addIncoming(Load, Pred);
+ InjectedLoads[Pred] = Load;
+ }
+
+ LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ PN.eraseFromParent();
+}
+
+/// Select instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers and then select between the result,
+/// allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// %V2 = load i32* %Other
+/// %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand
+/// to the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
+ Value *TValue = SI.getTrueValue();
+ Value *FValue = SI.getFalseValue();
+ const DataLayout &DL = SI.getModule()->getDataLayout();
+
+ for (User *U : SI.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // Both operands to the select need to be dereferenceable, either
+ // absolutely (e.g. allocas) or at this point because we can see other
+ // accesses to them.
+ if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
+ LI->getAlign(), DL, LI))
+ return false;
+ if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
+ LI->getAlign(), DL, LI))
+ return false;
+ }
+
+ return true;
+}
+
+static void speculateSelectInstLoads(SelectInst &SI) {
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+
+ IRBuilderTy IRB(&SI);
+ Value *TV = SI.getTrueValue();
+ Value *FV = SI.getFalseValue();
+ // Replace the loads of the select with a select of two loads.
+ while (!SI.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(SI.user_back());
+ assert(LI->isSimple() && "We only speculate simple loads");
+
+ IRB.SetInsertPoint(LI);
+ LoadInst *TL = IRB.CreateLoad(LI->getType(), TV,
+ LI->getName() + ".sroa.speculate.load.true");
+ LoadInst *FL = IRB.CreateLoad(LI->getType(), FV,
+ LI->getName() + ".sroa.speculate.load.false");
+ NumLoadsSpeculated += 2;
+
+ // Transfer alignment and AA info if present.
+ TL->setAlignment(LI->getAlign());
+ FL->setAlignment(LI->getAlign());
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TL->setAAMetadata(Tags);
+ FL->setAAMetadata(Tags);
+ }
+
+ Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+ LI->getName() + ".sroa.speculated");
+
+ LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LI->replaceAllUsesWith(V);
+ LI->eraseFromParent();
+ }
+ SI.eraseFromParent();
+}
+
+/// Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- if (Indices.empty())
- return BasePtr;
-
- // A single zero index is a no-op, so check for this and avoid building a GEP
- // in that case.
- if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
- return BasePtr;
-
- return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
- BasePtr, Indices, NamePrefix + "sroa_idx");
-}
-
-/// Get a natural GEP off of the BasePtr walking through Ty toward
-/// TargetTy without changing the offset of the pointer.
-///
-/// This routine assumes we've already established a properly offset GEP with
-/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
-/// zero-indices down through type layers until we find one the same as
-/// TargetTy. If we can't find one with the same type, we at least try to use
-/// one with the same size. If none of that works, we just produce the GEP as
-/// indicated by Indices to have the correct offset.
-static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
- Value *BasePtr, Type *Ty, Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
+ if (Indices.empty())
+ return BasePtr;
+
+ // A single zero index is a no-op, so check for this and avoid building a GEP
+ // in that case.
+ if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+ return BasePtr;
+
+ return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
+ BasePtr, Indices, NamePrefix + "sroa_idx");
+}
+
+/// Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
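+/// For example (illustrative): descending from { { i32, i32 } } toward an i32
+/// TargetTy appends two zero indices; if no nested type ever matches TargetTy,
+/// the speculatively appended indices are erased again.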
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *BasePtr, Type *Ty, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- if (Ty == TargetTy)
- return buildGEP(IRB, BasePtr, Indices, NamePrefix);
-
- // Offset size to use for the indices.
- unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
-
- // See if we can descend into a struct and locate a field with the correct
- // type.
- unsigned NumLayers = 0;
- Type *ElementTy = Ty;
- do {
- if (ElementTy->isPointerTy())
- break;
-
- if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
- ElementTy = ArrayTy->getElementType();
- Indices.push_back(IRB.getIntN(OffsetSize, 0));
- } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
- ElementTy = VectorTy->getElementType();
- Indices.push_back(IRB.getInt32(0));
- } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
- if (STy->element_begin() == STy->element_end())
- break; // Nothing left to descend into.
- ElementTy = *STy->element_begin();
- Indices.push_back(IRB.getInt32(0));
- } else {
- break;
- }
- ++NumLayers;
- } while (ElementTy != TargetTy);
- if (ElementTy != TargetTy)
- Indices.erase(Indices.end() - NumLayers, Indices.end());
-
- return buildGEP(IRB, BasePtr, Indices, NamePrefix);
-}
-
-/// Recursively compute indices for a natural GEP.
-///
-/// This is the recursive step for getNaturalGEPWithOffset that walks down the
-/// element types adding appropriate indices for the GEP.
-static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
- Value *Ptr, Type *Ty, APInt &Offset,
- Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
+ if (Ty == TargetTy)
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+
+ // Offset size to use for the indices.
+ unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
+
+ // See if we can descend into a struct and locate a field with the correct
+ // type.
+ unsigned NumLayers = 0;
+ Type *ElementTy = Ty;
+ do {
+ if (ElementTy->isPointerTy())
+ break;
+
+ if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
+ ElementTy = ArrayTy->getElementType();
+ Indices.push_back(IRB.getIntN(OffsetSize, 0));
+ } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
+ ElementTy = VectorTy->getElementType();
+ Indices.push_back(IRB.getInt32(0));
+ } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+ if (STy->element_begin() == STy->element_end())
+ break; // Nothing left to descend into.
+ ElementTy = *STy->element_begin();
+ Indices.push_back(IRB.getInt32(0));
+ } else {
+ break;
+ }
+ ++NumLayers;
+ } while (ElementTy != TargetTy);
+ if (ElementTy != TargetTy)
+ Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+}
+
+/// Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
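+/// For example (illustrative): a 12 byte offset into a [4 x i32] array skips
+/// three 4 byte elements, so index 3 is appended and recursion continues with
+/// a remaining offset of 0.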
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, Type *Ty, APInt &Offset,
+ Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- if (Offset == 0)
- return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
- NamePrefix);
-
- // We can't recurse through pointer types.
- if (Ty->isPointerTy())
- return nullptr;
-
- // We try to analyze GEPs over vectors here, but note that these GEPs are
- // extremely poorly defined currently. The long-term goal is to remove GEPing
- // over a vector from the IR completely.
- if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
- unsigned ElementSizeInBits =
- DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
- if (ElementSizeInBits % 8 != 0) {
- // GEPs over non-multiple of 8 size vector elements are invalid.
- return nullptr;
- }
- APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
- if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
- return nullptr;
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
- Offset, TargetTy, Indices, NamePrefix);
- }
-
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
- Type *ElementTy = ArrTy->getElementType();
- APInt ElementSize(Offset.getBitWidth(),
- DL.getTypeAllocSize(ElementTy).getFixedSize());
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
- if (NumSkippedElements.ugt(ArrTy->getNumElements()))
- return nullptr;
-
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
- }
-
- StructType *STy = dyn_cast<StructType>(Ty);
- if (!STy)
- return nullptr;
-
- const StructLayout *SL = DL.getStructLayout(STy);
- uint64_t StructOffset = Offset.getZExtValue();
- if (StructOffset >= SL->getSizeInBytes())
- return nullptr;
- unsigned Index = SL->getElementContainingOffset(StructOffset);
- Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
- Type *ElementTy = STy->getElementType(Index);
- if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
- return nullptr; // The offset points into alignment padding.
-
- Indices.push_back(IRB.getInt32(Index));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
-}
-
-/// Get a natural GEP from a base pointer to a particular offset and
-/// resulting in a particular type.
-///
-/// The goal is to produce a "natural" looking GEP that works with the existing
-/// composite types to arrive at the appropriate offset and element type for
-/// a pointer. TargetTy is the element type the returned GEP should point-to if
-/// possible. We recurse by decreasing Offset, adding the appropriate index to
-/// Indices, and setting Ty to the result subtype.
-///
-/// If no natural GEP can be constructed, this function returns null.
-static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
- Value *Ptr, APInt Offset, Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
+ if (Offset == 0)
+ return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+ NamePrefix);
+
+ // We can't recurse through pointer types.
+ if (Ty->isPointerTy())
+ return nullptr;
+
+ // We try to analyze GEPs over vectors here, but note that these GEPs are
+ // extremely poorly defined currently. The long-term goal is to remove GEPing
+ // over a vector from the IR completely.
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+ unsigned ElementSizeInBits =
+ DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
+ if (ElementSizeInBits % 8 != 0) {
+ // GEPs over non-multiple of 8 size vector elements are invalid.
+ return nullptr;
+ }
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
+ Offset, TargetTy, Indices, NamePrefix);
+ }
+
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ Type *ElementTy = ArrTy->getElementType();
+ APInt ElementSize(Offset.getBitWidth(),
+ DL.getTypeAllocSize(ElementTy).getFixedSize());
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+ return nullptr;
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ uint64_t StructOffset = Offset.getZExtValue();
+ if (StructOffset >= SL->getSizeInBytes())
+ return nullptr;
+ unsigned Index = SL->getElementContainingOffset(StructOffset);
+ Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+ Type *ElementTy = STy->getElementType(Index);
+ if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
+ return nullptr; // The offset points into alignment padding.
+
+ Indices.push_back(IRB.getInt32(Index));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
+
+/// Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, APInt Offset, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- PointerType *Ty = cast<PointerType>(Ptr->getType());
-
- // Don't consider any GEPs through an i8* as natural unless the TargetTy is
- // an i8.
- if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
- return nullptr;
-
- Type *ElementTy = Ty->getElementType();
- if (!ElementTy->isSized())
- return nullptr; // We can't GEP through an unsized element.
+ PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+ // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+ // an i8.
+ if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
+ return nullptr;
+
+ Type *ElementTy = Ty->getElementType();
+ if (!ElementTy->isSized())
+ return nullptr; // We can't GEP through an unsized element.
if (isa<ScalableVectorType>(ElementTy))
return nullptr;
- APInt ElementSize(Offset.getBitWidth(),
- DL.getTypeAllocSize(ElementTy).getFixedSize());
- if (ElementSize == 0)
- return nullptr; // Zero-length arrays can't help us build a natural GEP.
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
-
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
-}
-
-/// Compute an adjusted pointer from Ptr by Offset bytes where the
-/// resulting pointer has PointerTy.
-///
-/// This tries very hard to compute a "natural" GEP which arrives at the offset
-/// and produces the pointer type desired. Where it cannot, it will try to use
-/// the natural GEP to arrive at the offset and bitcast to the type. Where that
-/// fails, it will try to use an existing i8* and GEP to the byte offset and
-/// bitcast to the type.
-///
-/// The strategy for finding the more natural GEPs is to peel off layers of the
-/// pointer, walking back through bit casts and GEPs, searching for a base
-/// pointer from which we can compute a natural GEP with the desired
-/// properties. The algorithm tries to fold as many constant indices into
-/// a single GEP as possible, thus making each GEP more independent of the
-/// surrounding code.
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+ APInt ElementSize(Offset.getBitWidth(),
+ DL.getTypeAllocSize(ElementTy).getFixedSize());
+ if (ElementSize == 0)
+ return nullptr; // Zero-length arrays can't help us build a natural GEP.
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
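// A minimal sketch (plain C++, hypothetical helper, not LLVM code) of the
// array-stepping arithmetic above: peel off whole elements of a known size
// from a byte offset and keep the remainder for the recursive call. The real
// code performs the division on APInt values; unsigned 64-bit math is used
// here for brevity, and ElementSize is assumed non-zero, matching the
// zero-size bail-out above.
#include <cstdint>
#include <utility>

// Returns {number of whole elements skipped, remaining byte offset}.
std::pair<uint64_t, uint64_t> peelArrayElements(uint64_t Offset,
                                                uint64_t ElementSize) {
  uint64_t NumSkipped = Offset / ElementSize;
  return {NumSkipped, Offset - NumSkipped * ElementSize};
}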
+
+/// Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
APInt Offset, Type *PointerTy,
const Twine &NamePrefix) {
- // Even though we don't look through PHI nodes, we could be called on an
- // instruction in an unreachable block, which may be on a cycle.
- SmallPtrSet<Value *, 4> Visited;
- Visited.insert(Ptr);
- SmallVector<Value *, 4> Indices;
-
- // We may end up computing an offset pointer that has the wrong type. If we
- // never are able to compute one directly that has the correct type, we'll
- // fall back to it, so keep it and the base it was computed from around here.
- Value *OffsetPtr = nullptr;
- Value *OffsetBasePtr;
-
- // Remember any i8 pointer we come across to re-use if we need to do a raw
- // byte offset.
- Value *Int8Ptr = nullptr;
- APInt Int8PtrOffset(Offset.getBitWidth(), 0);
-
- PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
- Type *TargetTy = TargetPtrTy->getElementType();
-
- // As `addrspacecast` is not always no-op casting, `Ptr` (the storage
- // pointer) may have a different address space from the expected `PointerTy`
- // (the pointer to be used). Adjust the pointer type based on the original
- // storage pointer.
- auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
- PointerTy = TargetTy->getPointerTo(AS);
-
- do {
- // First fold any existing GEPs into the offset.
- while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
- APInt GEPOffset(Offset.getBitWidth(), 0);
- if (!GEP->accumulateConstantOffset(DL, GEPOffset))
- break;
- Offset += GEPOffset;
- Ptr = GEP->getPointerOperand();
- if (!Visited.insert(Ptr).second)
- break;
- }
-
- // See if we can perform a natural GEP here.
- Indices.clear();
- if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
- Indices, NamePrefix)) {
- // If we have a new natural pointer at the offset, clear out any old
- // offset pointer we computed. Unless it is the base pointer or
- // a non-instruction, we built a GEP we don't need. Zap it.
- if (OffsetPtr && OffsetPtr != OffsetBasePtr)
- if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
- assert(I->use_empty() && "Built a GEP with uses somehow!");
- I->eraseFromParent();
- }
- OffsetPtr = P;
- OffsetBasePtr = Ptr;
- // If we also found a pointer of the right type, we're done.
- if (P->getType() == PointerTy)
- break;
- }
-
- // Stash this pointer if we've found an i8*.
- if (Ptr->getType()->isIntegerTy(8)) {
- Int8Ptr = Ptr;
- Int8PtrOffset = Offset;
- }
-
- // Peel off a layer of the pointer and update the offset appropriately.
- if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
- Ptr = cast<Operator>(Ptr)->getOperand(0);
- } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
- if (GA->isInterposable())
- break;
- Ptr = GA->getAliasee();
- } else {
- break;
- }
- assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
- } while (Visited.insert(Ptr).second);
-
- if (!OffsetPtr) {
- if (!Int8Ptr) {
- Int8Ptr = IRB.CreateBitCast(
- Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
- NamePrefix + "sroa_raw_cast");
- Int8PtrOffset = Offset;
- }
-
- OffsetPtr = Int8PtrOffset == 0
- ? Int8Ptr
- : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
- IRB.getInt(Int8PtrOffset),
- NamePrefix + "sroa_raw_idx");
- }
- Ptr = OffsetPtr;
-
- // On the off chance we were targeting i8*, guard the bitcast here.
- if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
- Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
- TargetPtrTy,
- NamePrefix + "sroa_cast");
- }
-
- return Ptr;
-}
-
-/// Compute the adjusted alignment for a load or store from an offset.
-static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
- return commonAlignment(getLoadStoreAlignment(I), Offset);
-}
-
-/// Test whether we can convert a value from the old to the new type.
-///
-/// This predicate should be used to guard calls to convertValue in order to
-/// ensure that we only try to convert viable values. The strategy is that we
-/// will peel off single element struct and array wrappings to get to an
-/// underlying value, and convert that value.
-static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
- if (OldTy == NewTy)
- return true;
-
- // For integer types, we can't handle any bit-width differences. This would
- // break both vector conversions with extension and introduce endianness
- // issues when in conjunction with loads and stores.
- if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
- assert(cast<IntegerType>(OldTy)->getBitWidth() !=
- cast<IntegerType>(NewTy)->getBitWidth() &&
- "We can't have the same bitwidth for different int types");
- return false;
- }
-
- if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
- DL.getTypeSizeInBits(OldTy).getFixedSize())
- return false;
- if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
- return false;
-
- // We can convert pointers to integers and vice-versa. Same for vectors
- // of pointers and integers.
- OldTy = OldTy->getScalarType();
- NewTy = NewTy->getScalarType();
- if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
- if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
- unsigned OldAS = OldTy->getPointerAddressSpace();
- unsigned NewAS = NewTy->getPointerAddressSpace();
- // Convert pointers if they are pointers from the same address space or
- // different integral (not non-integral) address spaces with the same
- // pointer size.
- return OldAS == NewAS ||
- (!DL.isNonIntegralAddressSpace(OldAS) &&
- !DL.isNonIntegralAddressSpace(NewAS) &&
- DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
- }
-
- // We can convert integers to integral pointers, but not to non-integral
- // pointers.
- if (OldTy->isIntegerTy())
- return !DL.isNonIntegralPointerType(NewTy);
-
- // We can convert integral pointers to integers, but non-integral pointers
- // need to remain pointers.
- if (!DL.isNonIntegralPointerType(OldTy))
- return NewTy->isIntegerTy();
-
- return false;
- }
-
- return true;
-}
-
-/// Generic routine to convert an SSA value to a value of a different
-/// type.
-///
-/// This will try various different casting techniques, such as bitcasts,
-/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
-/// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
- Type *NewTy) {
- Type *OldTy = V->getType();
- assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
-
- if (OldTy == NewTy)
- return V;
-
- assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
- "Integer types must be the exact same to convert.");
-
- // See if we need inttoptr for this type pair. May require additional bitcast.
- if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
- // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
- // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
- // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
- // Directly handle i64 to i8*
- return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
- NewTy);
- }
-
- // See if we need ptrtoint for this type pair. May require additional bitcast.
- if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
- // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
- // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
- // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
- // Expand i8* to i64 --> i8* to i64 to i64
- return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
- NewTy);
- }
-
- if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
- unsigned OldAS = OldTy->getPointerAddressSpace();
- unsigned NewAS = NewTy->getPointerAddressSpace();
- // To convert pointers with different address spaces (they are already
- // checked convertible, i.e. they have the same pointer size), so far we
- // cannot use `bitcast` (which has restrict on the same address space) or
- // `addrspacecast` (which is not always no-op casting). Instead, use a pair
- // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
- // size.
- if (OldAS != NewAS) {
- assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
- return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
- NewTy);
- }
- }
-
- return IRB.CreateBitCast(V, NewTy);
-}
-
-/// Test whether the given slice use can be promoted to a vector.
-///
-/// This function is called to test each entry in a partition which is slated
-/// for a single slice.
-static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
- VectorType *Ty,
- uint64_t ElementSize,
- const DataLayout &DL) {
- // First validate the slice offsets.
- uint64_t BeginOffset =
- std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
- uint64_t BeginIndex = BeginOffset / ElementSize;
- if (BeginIndex * ElementSize != BeginOffset ||
- BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
- return false;
- uint64_t EndOffset =
- std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
- uint64_t EndIndex = EndOffset / ElementSize;
- if (EndIndex * ElementSize != EndOffset ||
- EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
- return false;
-
- assert(EndIndex > BeginIndex && "Empty vector!");
- uint64_t NumElements = EndIndex - BeginIndex;
- Type *SliceTy = (NumElements == 1)
- ? Ty->getElementType()
- : FixedVectorType::get(Ty->getElementType(), NumElements);
-
- Type *SplitIntTy =
- Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
-
- Use *U = S.getUse();
-
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
- if (MI->isVolatile())
- return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(Ptr);
+ SmallVector<Value *, 4> Indices;
+
+ // We may end up computing an offset pointer that has the wrong type. If we
+ // never are able to compute one directly that has the correct type, we'll
+ // fall back to it, so keep it and the base it was computed from around here.
+ Value *OffsetPtr = nullptr;
+ Value *OffsetBasePtr;
+
+ // Remember any i8 pointer we come across to re-use if we need to do a raw
+ // byte offset.
+ Value *Int8Ptr = nullptr;
+ APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+ PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
+ Type *TargetTy = TargetPtrTy->getElementType();
+
+ // As `addrspacecast` is not always no-op casting, `Ptr` (the storage
+ // pointer) may have a different address space from the expected `PointerTy`
+ // (the pointer to be used). Adjust the pointer type based on the original
+ // storage pointer.
+ auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerTy = TargetTy->getPointerTo(AS);
+
+ do {
+ // First fold any existing GEPs into the offset.
+ while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+ break;
+ Offset += GEPOffset;
+ Ptr = GEP->getPointerOperand();
+ if (!Visited.insert(Ptr).second)
+ break;
+ }
+
+ // See if we can perform a natural GEP here.
+ Indices.clear();
+ if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
+ Indices, NamePrefix)) {
+ // If we have a new natural pointer at the offset, clear out any old
+ // offset pointer we computed. Unless it is the base pointer or
+ // a non-instruction, we built a GEP we don't need. Zap it.
+ if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+ assert(I->use_empty() && "Built a GEP with uses somehow!");
+ I->eraseFromParent();
+ }
+ OffsetPtr = P;
+ OffsetBasePtr = Ptr;
+ // If we also found a pointer of the right type, we're done.
+ if (P->getType() == PointerTy)
+ break;
+ }
+
+ // Stash this pointer if we've found an i8*.
+ if (Ptr->getType()->isIntegerTy(8)) {
+ Int8Ptr = Ptr;
+ Int8PtrOffset = Offset;
+ }
+
+ // Peel off a layer of the pointer and update the offset appropriately.
+ if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+ Ptr = cast<Operator>(Ptr)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+ if (GA->isInterposable())
+ break;
+ Ptr = GA->getAliasee();
+ } else {
+ break;
+ }
+ assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(Ptr).second);
+
+ if (!OffsetPtr) {
+ if (!Int8Ptr) {
+ Int8Ptr = IRB.CreateBitCast(
+ Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
+ NamePrefix + "sroa_raw_cast");
+ Int8PtrOffset = Offset;
+ }
+
+ OffsetPtr = Int8PtrOffset == 0
+ ? Int8Ptr
+ : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+ IRB.getInt(Int8PtrOffset),
+ NamePrefix + "sroa_raw_idx");
+ }
+ Ptr = OffsetPtr;
+
+ // On the off chance we were targeting i8*, guard the bitcast here.
+ if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
+ Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
+ TargetPtrTy,
+ NamePrefix + "sroa_cast");
+ }
+
+ return Ptr;
+}
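// A simplified, standalone sketch (plain C++, hypothetical node type, not
// LLVM code) of the pointer-peeling loop above: walk back through a chain of
// constant-offset GEPs and bitcasts, folding every constant offset into one
// running total, so a single GEP from the underlying base can be emitted.
#include <cstdint>

enum class Kind { Base, ConstGEP, BitCast };

struct PtrNode {
  Kind K;
  const PtrNode *Operand = nullptr; // pointer this one was derived from
  int64_t ByteOffset = 0;           // only meaningful for ConstGEP
};

// Returns the underlying base node; accumulates the total byte offset.
// Assumes every non-Base node has a valid Operand.
const PtrNode *foldToBase(const PtrNode *Ptr, int64_t &Offset) {
  while (Ptr->K != Kind::Base) {
    if (Ptr->K == Kind::ConstGEP)
      Offset += Ptr->ByteOffset; // fold the constant index into the offset
    Ptr = Ptr->Operand;          // peel one layer (GEP or bitcast)
  }
  return Ptr;
}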
+
+/// Compute the adjusted alignment for a load or store from an offset.
+static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
+ return commonAlignment(getLoadStoreAlignment(I), Offset);
+}
+
+/// Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+ if (OldTy == NewTy)
+ return true;
+
+ // For integer types, we can't handle any bit-width differences. This would
+ // break both vector conversions with extension and introduce endianness
+ // issues when in conjunction with loads and stores.
+ if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
+ assert(cast<IntegerType>(OldTy)->getBitWidth() !=
+ cast<IntegerType>(NewTy)->getBitWidth() &&
+ "We can't have the same bitwidth for different int types");
+ return false;
+ }
+
+ if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
+ DL.getTypeSizeInBits(OldTy).getFixedSize())
+ return false;
+ if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+ return false;
+
+ // We can convert pointers to integers and vice-versa. Same for vectors
+ // of pointers and integers.
+ OldTy = OldTy->getScalarType();
+ NewTy = NewTy->getScalarType();
+ if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+ if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
+ unsigned OldAS = OldTy->getPointerAddressSpace();
+ unsigned NewAS = NewTy->getPointerAddressSpace();
+ // Convert pointers if they are pointers from the same address space or
+ // different integral (not non-integral) address spaces with the same
+ // pointer size.
+ return OldAS == NewAS ||
+ (!DL.isNonIntegralAddressSpace(OldAS) &&
+ !DL.isNonIntegralAddressSpace(NewAS) &&
+ DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
+ }
+
+ // We can convert integers to integral pointers, but not to non-integral
+ // pointers.
+ if (OldTy->isIntegerTy())
+ return !DL.isNonIntegralPointerType(NewTy);
+
+ // We can convert integral pointers to integers, but non-integral pointers
+ // need to remain pointers.
+ if (!DL.isNonIntegralPointerType(OldTy))
+ return NewTy->isIntegerTy();
+
+ return false;
+ }
+
+ return true;
+}
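// A rough sketch (plain C++, hypothetical type descriptor, not LLVM code) of
// the shape of the convertibility predicate above. It is deliberately
// simplified: vectors-of-pointers and the non-integral address-space rules
// handled via DataLayout above are omitted, and cross-address-space pointer
// conversion is disallowed here even though the real check permits it for
// integral address spaces of equal pointer size.
#include <cstdint>

struct TyDesc {
  uint64_t SizeInBits;
  bool IsInt;
  bool IsPtr;
  unsigned AddrSpace; // only meaningful when IsPtr
};

bool canConvertSketch(const TyDesc &Old, const TyDesc &New) {
  // Integer-to-integer conversions must not change the bit width.
  if (Old.IsInt && New.IsInt)
    return Old.SizeInBits == New.SizeInBits;
  // Everything else must at least preserve the size in bits.
  if (Old.SizeInBits != New.SizeInBits)
    return false;
  // Pointers convert to pointers in the same address space, or to/from
  // same-sized integers.
  if (Old.IsPtr && New.IsPtr)
    return Old.AddrSpace == New.AddrSpace;
  return true;
}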
+
+/// Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ Type *NewTy) {
+ Type *OldTy = V->getType();
+ assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+ if (OldTy == NewTy)
+ return V;
+
+ assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+ "Integer types must be the exact same to convert.");
+
+ // See if we need inttoptr for this type pair. May require additional bitcast.
+ if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+ // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+ // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+ // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
+ // Directly handle i64 to i8*
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+ }
+
+ // See if we need ptrtoint for this type pair. May require additional bitcast.
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
+ // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+ // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+ // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
+ // Expand i8* to i64 --> i8* to i64 to i64
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+ }
+
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+ unsigned OldAS = OldTy->getPointerAddressSpace();
+ unsigned NewAS = NewTy->getPointerAddressSpace();
+ // To convert pointers with different address spaces (they are already
+ // checked convertible, i.e. they have the same pointer size), so far we
+ // cannot use `bitcast` (which has restrict on the same address space) or
+ // `addrspacecast` (which is not always no-op casting). Instead, use a pair
+ // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
+ // size.
+ if (OldAS != NewAS) {
+ assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
+ return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+ }
+ }
+
+ return IRB.CreateBitCast(V, NewTy);
+}
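// A small decision-table sketch (plain C++, hypothetical enum, not LLVM code)
// of which cast sequence convertValue emits for each category of type pair,
// assuming canConvertValue has already accepted the pair.
#include <string>

enum class TyKind { Int, Ptr };

std::string pickCastSequence(TyKind Old, TyKind New, bool SameAddrSpace) {
  if (Old == TyKind::Int && New == TyKind::Ptr)
    return "bitcast to pointer-sized int, then inttoptr";
  if (Old == TyKind::Ptr && New == TyKind::Int)
    return "ptrtoint to pointer-sized int, then bitcast";
  if (Old == TyKind::Ptr && New == TyKind::Ptr && !SameAddrSpace)
    return "ptrtoint + inttoptr pair (same pointer size, different AS)";
  return "plain bitcast";
}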
+
+/// Test whether the given slice use can be promoted to a vector.
+///
+/// This function is called to test each entry in a partition which is slated
+/// for a single slice.
+static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
+ VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
+ // First validate the slice offsets.
+ uint64_t BeginOffset =
+ std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
+ uint64_t BeginIndex = BeginOffset / ElementSize;
+ if (BeginIndex * ElementSize != BeginOffset ||
+ BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
+ return false;
+ uint64_t EndOffset =
+ std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+ uint64_t EndIndex = EndOffset / ElementSize;
+ if (EndIndex * ElementSize != EndOffset ||
+ EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
+ return false;
+
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ uint64_t NumElements = EndIndex - BeginIndex;
+ Type *SliceTy = (NumElements == 1)
+ ? Ty->getElementType()
+ : FixedVectorType::get(Ty->getElementType(), NumElements);
+
+ Type *SplitIntTy =
+ Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
+
+ Use *U = S.getUse();
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
- return false;
- } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
- // Disable vector promotion when there are loads or stores of an FCA.
- return false;
- } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
- if (LI->isVolatile())
- return false;
- Type *LTy = LI->getType();
- if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
- assert(LTy->isIntegerTy());
- LTy = SplitIntTy;
- }
- if (!canConvertValue(DL, SliceTy, LTy))
- return false;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
- if (SI->isVolatile())
- return false;
- Type *STy = SI->getValueOperand()->getType();
- if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
- assert(STy->isIntegerTy());
- STy = SplitIntTy;
- }
- if (!canConvertValue(DL, STy, SliceTy))
- return false;
- } else {
- return false;
- }
-
- return true;
-}
-
-/// Test whether the given alloca partitioning and range of slices can be
-/// promoted to a vector.
-///
-/// This is a quick test to check whether we can rewrite a particular alloca
-/// partition (and its newly formed alloca) into a vector alloca with only
-/// whole-vector loads and stores such that it could be promoted to a vector
-/// SSA value. We only can ensure this for a limited set of operations, and we
-/// don't want to do the rewrites unless we are confident that the result will
-/// be promotable, so we have an early test here.
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
- // Collect the candidate types for vector-based promotion. Also track whether
- // we have different element types.
- SmallVector<VectorType *, 4> CandidateTys;
- Type *CommonEltTy = nullptr;
- bool HaveCommonEltTy = true;
- auto CheckCandidateType = [&](Type *Ty) {
- if (auto *VTy = dyn_cast<VectorType>(Ty)) {
- // Return if bitcast to vectors is different for total size in bits.
- if (!CandidateTys.empty()) {
- VectorType *V = CandidateTys[0];
- if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
- DL.getTypeSizeInBits(V).getFixedSize()) {
- CandidateTys.clear();
- return;
- }
- }
- CandidateTys.push_back(VTy);
- if (!CommonEltTy)
- CommonEltTy = VTy->getElementType();
- else if (CommonEltTy != VTy->getElementType())
- HaveCommonEltTy = false;
- }
- };
- // Consider any loads or stores that are the exact size of the slice.
- for (const Slice &S : P)
- if (S.beginOffset() == P.beginOffset() &&
- S.endOffset() == P.endOffset()) {
- if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
- CheckCandidateType(LI->getType());
- else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
- CheckCandidateType(SI->getValueOperand()->getType());
- }
-
- // If we didn't find a vector type, nothing to do here.
- if (CandidateTys.empty())
- return nullptr;
-
- // Remove non-integer vector types if we had multiple common element types.
- // FIXME: It'd be nice to replace them with integer vector types, but we can't
- // do that until all the backends are known to produce good code for all
- // integer vector types.
- if (!HaveCommonEltTy) {
+ return false;
+ } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ Type *LTy = LI->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(LTy->isIntegerTy());
+ LTy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, SliceTy, LTy))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ Type *STy = SI->getValueOperand()->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(STy->isIntegerTy());
+ STy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, STy, SliceTy))
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
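// A standalone sketch (plain C++, hypothetical helper, not LLVM code) of the
// index computation above: a slice is only viable for vector promotion if its
// byte range lines up exactly with whole vector elements.
#include <cstdint>
#include <optional>
#include <utility>

// Returns {BeginIndex, EndIndex} when [BeginOffset, EndOffset) covers a whole
// number of elements of size ElementSize inside a vector of NumElements, and
// std::nullopt otherwise.
std::optional<std::pair<uint64_t, uint64_t>>
sliceToElementRange(uint64_t BeginOffset, uint64_t EndOffset,
                    uint64_t ElementSize, uint64_t NumElements) {
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= NumElements)
    return std::nullopt;
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset || EndIndex > NumElements)
    return std::nullopt;
  return std::make_pair(BeginIndex, EndIndex);
}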
+
+/// Test whether the given alloca partitioning and range of slices can be
+/// promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+ // Collect the candidate types for vector-based promotion. Also track whether
+ // we have different element types.
+ SmallVector<VectorType *, 4> CandidateTys;
+ Type *CommonEltTy = nullptr;
+ bool HaveCommonEltTy = true;
+ auto CheckCandidateType = [&](Type *Ty) {
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ // Return if bitcast to vectors is different for total size in bits.
+ if (!CandidateTys.empty()) {
+ VectorType *V = CandidateTys[0];
+ if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
+ DL.getTypeSizeInBits(V).getFixedSize()) {
+ CandidateTys.clear();
+ return;
+ }
+ }
+ CandidateTys.push_back(VTy);
+ if (!CommonEltTy)
+ CommonEltTy = VTy->getElementType();
+ else if (CommonEltTy != VTy->getElementType())
+ HaveCommonEltTy = false;
+ }
+ };
+ // Consider any loads or stores that are the exact size of the slice.
+ for (const Slice &S : P)
+ if (S.beginOffset() == P.beginOffset() &&
+ S.endOffset() == P.endOffset()) {
+ if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+ CheckCandidateType(LI->getType());
+ else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+ CheckCandidateType(SI->getValueOperand()->getType());
+ }
+
+ // If we didn't find a vector type, nothing to do here.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Remove non-integer vector types if we had multiple common element types.
+ // FIXME: It'd be nice to replace them with integer vector types, but we can't
+ // do that until all the backends are known to produce good code for all
+ // integer vector types.
+ if (!HaveCommonEltTy) {
llvm::erase_if(CandidateTys, [](VectorType *VTy) {
return !VTy->getElementType()->isIntegerTy();
});
-
- // If there were no integer vector types, give up.
- if (CandidateTys.empty())
- return nullptr;
-
- // Rank the remaining candidate vector types. This is easy because we know
- // they're all integer vectors. We sort by ascending number of elements.
- auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
- (void)DL;
- assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
- DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
- "Cannot have vector types of different sizes!");
- assert(RHSTy->getElementType()->isIntegerTy() &&
- "All non-integer types eliminated!");
- assert(LHSTy->getElementType()->isIntegerTy() &&
- "All non-integer types eliminated!");
- return cast<FixedVectorType>(RHSTy)->getNumElements() <
- cast<FixedVectorType>(LHSTy)->getNumElements();
- };
- llvm::sort(CandidateTys, RankVectorTypes);
- CandidateTys.erase(
- std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
- CandidateTys.end());
- } else {
-// The only way to have the same element type in every vector type is to
-// have the same vector type. Check that and remove all but one.
-#ifndef NDEBUG
- for (VectorType *VTy : CandidateTys) {
- assert(VTy->getElementType() == CommonEltTy &&
- "Unaccounted for element type!");
- assert(VTy == CandidateTys[0] &&
- "Different vector types with the same element type!");
- }
-#endif
- CandidateTys.resize(1);
- }
-
- // Try each vector type, and return the one which works.
- auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
- uint64_t ElementSize =
- DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
-
- // While the definition of LLVM vectors is bitpacked, we don't support sizes
- // that aren't byte sized.
- if (ElementSize % 8)
- return false;
- assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
- "vector size not a multiple of element size?");
- ElementSize /= 8;
-
- for (const Slice &S : P)
- if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
- return false;
-
- for (const Slice *S : P.splitSliceTails())
- if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
- return false;
-
- return true;
- };
- for (VectorType *VTy : CandidateTys)
- if (CheckVectorTypeForPromotion(VTy))
- return VTy;
-
- return nullptr;
-}
-
-/// Test whether a slice of an alloca is valid for integer widening.
-///
-/// This implements the necessary checking for the \c isIntegerWideningViable
-/// test below on a single slice of the alloca.
-static bool isIntegerWideningViableForSlice(const Slice &S,
- uint64_t AllocBeginOffset,
- Type *AllocaTy,
- const DataLayout &DL,
- bool &WholeAllocaOp) {
- uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
-
- uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
- uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
-
- // We can't reasonably handle cases where the load or store extends past
- // the end of the alloca's type and into its padding.
- if (RelEnd > Size)
- return false;
-
- Use *U = S.getUse();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
- if (LI->isVolatile())
- return false;
- // We can't handle loads that extend past the allocated memory.
- if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
- return false;
- // So far, AllocaSliceRewriter does not support widening split slice tails
- // in rewriteIntegerLoad.
- if (S.beginOffset() < AllocBeginOffset)
- return false;
- // Note that we don't count vector loads or stores as whole-alloca
- // operations which enable integer widening because we would prefer to use
- // vector widening instead.
- if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
- WholeAllocaOp = true;
- if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
- if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
- return false;
- } else if (RelBegin != 0 || RelEnd != Size ||
- !canConvertValue(DL, AllocaTy, LI->getType())) {
- // Non-integer loads need to be convertible from the alloca type so that
- // they are promotable.
- return false;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
- Type *ValueTy = SI->getValueOperand()->getType();
- if (SI->isVolatile())
- return false;
- // We can't handle stores that extend past the allocated memory.
- if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
- return false;
- // So far, AllocaSliceRewriter does not support widening split slice tails
- // in rewriteIntegerStore.
- if (S.beginOffset() < AllocBeginOffset)
- return false;
- // Note that we don't count vector loads or stores as whole-alloca
- // operations which enable integer widening because we would prefer to use
- // vector widening instead.
- if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
- WholeAllocaOp = true;
- if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
- if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
- return false;
- } else if (RelBegin != 0 || RelEnd != Size ||
- !canConvertValue(DL, ValueTy, AllocaTy)) {
- // Non-integer stores need to be convertible to the alloca type so that
- // they are promotable.
- return false;
- }
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
- if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
- return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+
+ // If there were no integer vector types, give up.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Rank the remaining candidate vector types. This is easy because we know
+ // they're all integer vectors. We sort by ascending number of elements.
+ auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
+ assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
+ DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
+ "Cannot have vector types of different sizes!");
+ assert(RHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ assert(LHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ return cast<FixedVectorType>(RHSTy)->getNumElements() <
+ cast<FixedVectorType>(LHSTy)->getNumElements();
+ };
+ llvm::sort(CandidateTys, RankVectorTypes);
+ CandidateTys.erase(
+ std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
+ CandidateTys.end());
+ } else {
+// The only way to have the same element type in every vector type is to
+// have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+ for (VectorType *VTy : CandidateTys) {
+ assert(VTy->getElementType() == CommonEltTy &&
+ "Unaccounted for element type!");
+ assert(VTy == CandidateTys[0] &&
+ "Different vector types with the same element type!");
+ }
+#endif
+ CandidateTys.resize(1);
+ }
+
+ // Try each vector type, and return the one which works.
+ auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+ uint64_t ElementSize =
+ DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
+
+ // While the definition of LLVM vectors is bitpacked, we don't support sizes
+ // that aren't byte sized.
+ if (ElementSize % 8)
+ return false;
+ assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
+ "vector size not a multiple of element size?");
+ ElementSize /= 8;
+
+ for (const Slice &S : P)
+ if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+ return false;
+
+ return true;
+ };
+ for (VectorType *VTy : CandidateTys)
+ if (CheckVectorTypeForPromotion(VTy))
+ return VTy;
+
+ return nullptr;
+}
+
+/// Test whether a slice of an alloca is valid for integer widening.
+///
+/// This implements the necessary checking for the \c isIntegerWideningViable
+/// test below on a single slice of the alloca.
+static bool isIntegerWideningViableForSlice(const Slice &S,
+ uint64_t AllocBeginOffset,
+ Type *AllocaTy,
+ const DataLayout &DL,
+ bool &WholeAllocaOp) {
+ uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
+
+ uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+ uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
+
+ // We can't reasonably handle cases where the load or store extends past
+ // the end of the alloca's type and into its padding.
+ if (RelEnd > Size)
+ return false;
+
+ Use *U = S.getUse();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ // We can't handle loads that extend past the allocated memory.
+ if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
+ return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerLoad.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, AllocaTy, LI->getType())) {
+ // Non-integer loads need to be convertible from the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ Type *ValueTy = SI->getValueOperand()->getType();
+ if (SI->isVolatile())
+ return false;
+ // We can't handle stores that extend past the allocated memory.
+ if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
+ return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerStore.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, ValueTy, AllocaTy)) {
+ // Non-integer stores need to be convertible to the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
- return false;
- } else {
- return false;
- }
-
- return true;
-}
-
-/// Test whether the given alloca partition's integer operations can be
-/// widened to promotable ones.
-///
-/// This is a quick test to check whether we can rewrite the integer loads and
-/// stores to a particular alloca into wider loads and stores and be able to
-/// promote the resulting alloca.
-static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
- const DataLayout &DL) {
- uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
- // Don't create integer types larger than the maximum bitwidth.
- if (SizeInBits > IntegerType::MAX_INT_BITS)
- return false;
-
- // Don't try to handle allocas with bit-padding.
- if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
- return false;
-
- // We need to ensure that an integer type with the appropriate bitwidth can
- // be converted to the alloca type, whatever that is. We don't want to force
- // the alloca itself to have an integer type if there is a more suitable one.
- Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
- if (!canConvertValue(DL, AllocaTy, IntTy) ||
- !canConvertValue(DL, IntTy, AllocaTy))
- return false;
-
- // While examining uses, we ensure that the alloca has a covering load or
- // store. We don't want to widen the integer operations only to fail to
- // promote due to some other unsplittable entry (which we may make splittable
- // later). However, if there are only splittable uses, go ahead and assume
- // that we cover the alloca.
- // FIXME: We shouldn't consider split slices that happen to start in the
- // partition here...
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/// Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
+ const DataLayout &DL) {
+ uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
+ // Don't create integer types larger than the maximum bitwidth.
+ if (SizeInBits > IntegerType::MAX_INT_BITS)
+ return false;
+
+ // Don't try to handle allocas with bit-padding.
+ if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
+ return false;
+
+ // We need to ensure that an integer type with the appropriate bitwidth can
+ // be converted to the alloca type, whatever that is. We don't want to force
+ // the alloca itself to have an integer type if there is a more suitable one.
+ Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+ if (!canConvertValue(DL, AllocaTy, IntTy) ||
+ !canConvertValue(DL, IntTy, AllocaTy))
+ return false;
+
+ // While examining uses, we ensure that the alloca has a covering load or
+ // store. We don't want to widen the integer operations only to fail to
+ // promote due to some other unsplittable entry (which we may make splittable
+ // later). However, if there are only splittable uses, go ahead and assume
+ // that we cover the alloca.
+ // FIXME: We shouldn't consider split slices that happen to start in the
+ // partition here...
bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
-
- for (const Slice &S : P)
- if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
- WholeAllocaOp))
- return false;
-
- for (const Slice *S : P.splitSliceTails())
- if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
- WholeAllocaOp))
- return false;
-
- return WholeAllocaOp;
-}
-
-static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
- IntegerType *Ty, uint64_t Offset,
- const Twine &Name) {
- LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
- IntegerType *IntTy = cast<IntegerType>(V->getType());
- assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
- DL.getTypeStoreSize(IntTy).getFixedSize() &&
- "Element extends past full value");
- uint64_t ShAmt = 8 * Offset;
- if (DL.isBigEndian())
- ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
- DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
- if (ShAmt) {
- V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
- LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
- }
- assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
- "Cannot extract to a larger integer!");
- if (Ty != IntTy) {
- V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
- LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
- }
- return V;
-}
-
-static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
- Value *V, uint64_t Offset, const Twine &Name) {
- IntegerType *IntTy = cast<IntegerType>(Old->getType());
- IntegerType *Ty = cast<IntegerType>(V->getType());
- assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
- "Cannot insert a larger integer!");
- LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
- if (Ty != IntTy) {
- V = IRB.CreateZExt(V, IntTy, Name + ".ext");
- LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
- }
- assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
- DL.getTypeStoreSize(IntTy).getFixedSize() &&
- "Element store outside of alloca store");
- uint64_t ShAmt = 8 * Offset;
- if (DL.isBigEndian())
- ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
- DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
- if (ShAmt) {
- V = IRB.CreateShl(V, ShAmt, Name + ".shift");
- LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
- }
-
- if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
- APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
- Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
- LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
- V = IRB.CreateOr(Old, V, Name + ".insert");
- LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
- }
- return V;
-}
-
-static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
- unsigned EndIndex, const Twine &Name) {
- auto *VecTy = cast<FixedVectorType>(V->getType());
- unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
-
- if (NumElements == VecTy->getNumElements())
- return V;
-
- if (NumElements == 1) {
- V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
- Name + ".extract");
- LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
- return V;
- }
-
- SmallVector<int, 8> Mask;
- Mask.reserve(NumElements);
- for (unsigned i = BeginIndex; i != EndIndex; ++i)
- Mask.push_back(i);
+
+ for (const Slice &S : P)
+ if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ return WholeAllocaOp;
+}
+
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ IntegerType *Ty, uint64_t Offset,
+ const Twine &Name) {
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
+ IntegerType *IntTy = cast<IntegerType>(V->getType());
+ assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
+ DL.getTypeStoreSize(IntTy).getFixedSize() &&
+ "Element extends past full value");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
+ DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
+ if (ShAmt) {
+ V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ if (Ty != IntTy) {
+ V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+ LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
+ }
+ return V;
+}
+
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
+ Value *V, uint64_t Offset, const Twine &Name) {
+ IntegerType *IntTy = cast<IntegerType>(Old->getType());
+ IntegerType *Ty = cast<IntegerType>(V->getType());
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot insert a larger integer!");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
+ if (Ty != IntTy) {
+ V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+ LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
+ }
+ assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
+ DL.getTypeStoreSize(IntTy).getFixedSize() &&
+ "Element store outside of alloca store");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
+ DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
+ if (ShAmt) {
+ V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+
+ if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+ APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+ Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+ LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
+ V = IRB.CreateOr(Old, V, Name + ".insert");
+ LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
+ }
+ return V;
+}
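// A minimal sketch (plain C++ on uint64_t, not LLVM IR) of the shift-and-mask
// arithmetic behind extractInteger/insertInteger above, including the
// big-endian shift-amount computation. Sizes are in bytes; it assumes
// WideBytes <= 8 and Offset + PartBytes <= WideBytes.
#include <cstdint>

static uint64_t lowBytesMask(unsigned Bytes) {
  return Bytes >= 8 ? ~0ULL : ((1ULL << (8 * Bytes)) - 1);
}

uint64_t extractBytes(uint64_t Wide, unsigned WideBytes, unsigned PartBytes,
                      unsigned Offset, bool BigEndian) {
  unsigned ShAmt = 8 * (BigEndian ? WideBytes - PartBytes - Offset : Offset);
  return (Wide >> ShAmt) & lowBytesMask(PartBytes); // shift down, truncate
}

uint64_t insertBytes(uint64_t Wide, unsigned WideBytes, uint64_t Part,
                     unsigned PartBytes, unsigned Offset, bool BigEndian) {
  unsigned ShAmt = 8 * (BigEndian ? WideBytes - PartBytes - Offset : Offset);
  uint64_t Mask = lowBytesMask(PartBytes);
  // Clear the destination bytes, then OR in the shifted new value.
  return (Wide & ~(Mask << ShAmt)) | ((Part & Mask) << ShAmt);
}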
+
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+ unsigned EndIndex, const Twine &Name) {
+ auto *VecTy = cast<FixedVectorType>(V->getType());
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ if (NumElements == VecTy->getNumElements())
+ return V;
+
+ if (NumElements == 1) {
+ V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+ Name + ".extract");
+ LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
+ return V;
+ }
+
+ SmallVector<int, 8> Mask;
+ Mask.reserve(NumElements);
+ for (unsigned i = BeginIndex; i != EndIndex; ++i)
+ Mask.push_back(i);
V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
- LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
- return V;
-}
-
-static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
- unsigned BeginIndex, const Twine &Name) {
- VectorType *VecTy = cast<VectorType>(Old->getType());
- assert(VecTy && "Can only insert a vector into a vector");
-
- VectorType *Ty = dyn_cast<VectorType>(V->getType());
- if (!Ty) {
- // Single element to insert.
- V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
- Name + ".insert");
- LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
- return V;
- }
-
- assert(cast<FixedVectorType>(Ty)->getNumElements() <=
- cast<FixedVectorType>(VecTy)->getNumElements() &&
- "Too many elements!");
- if (cast<FixedVectorType>(Ty)->getNumElements() ==
- cast<FixedVectorType>(VecTy)->getNumElements()) {
- assert(V->getType() == VecTy && "Vector type mismatch");
- return V;
- }
- unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
-
- // When inserting a smaller vector into the larger to store, we first
- // use a shuffle vector to widen it with undef elements, and then
- // a second shuffle vector to select between the loaded vector and the
- // incoming vector.
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ return V;
+}
+
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+ unsigned BeginIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(Old->getType());
+ assert(VecTy && "Can only insert a vector into a vector");
+
+ VectorType *Ty = dyn_cast<VectorType>(V->getType());
+ if (!Ty) {
+ // Single element to insert.
+ V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+ Name + ".insert");
+ LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
+ return V;
+ }
+
+ assert(cast<FixedVectorType>(Ty)->getNumElements() <=
+ cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
+ if (cast<FixedVectorType>(Ty)->getNumElements() ==
+ cast<FixedVectorType>(VecTy)->getNumElements()) {
+ assert(V->getType() == VecTy && "Vector type mismatch");
+ return V;
+ }
+ unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
+
+ // When inserting a smaller vector into the larger to store, we first
+ // use a shuffle vector to widen it with undef elements, and then
+ // a second shuffle vector to select between the loaded vector and the
+ // incoming vector.
SmallVector<int, 8> Mask;
- Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
- for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
- if (i >= BeginIndex && i < EndIndex)
+ Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
+ for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
Mask.push_back(i - BeginIndex);
- else
+ else
Mask.push_back(-1);
V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
- LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
-
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
+
SmallVector<Constant *, 8> Mask2;
Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
- for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
+ for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
-
+
V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
-
- LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
- return V;
-}
-
-/// Visitor to rewrite instructions using a particular slice of an alloca
-/// to use a new alloca.
-///
-/// Also implements the rewriting to vector-based accesses when the partition
-/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
-/// lives here.
-class llvm::sroa::AllocaSliceRewriter
- : public InstVisitor<AllocaSliceRewriter, bool> {
- // Befriend the base class so it can delegate to private visit methods.
- friend class InstVisitor<AllocaSliceRewriter, bool>;
-
- using Base = InstVisitor<AllocaSliceRewriter, bool>;
-
- const DataLayout &DL;
- AllocaSlices &AS;
- SROA &Pass;
- AllocaInst &OldAI, &NewAI;
- const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
- Type *NewAllocaTy;
-
- // This is a convenience and flag variable that will be null unless the new
- // alloca's integer operations should be widened to this integer type due to
- // passing isIntegerWideningViable above. If it is non-null, the desired
- // integer type will be stored here for easy access during rewriting.
- IntegerType *IntTy;
-
- // If we are rewriting an alloca partition which can be written as pure
- // vector operations, we stash extra information here. When VecTy is
- // non-null, we have some strict guarantees about the rewritten alloca:
- // - The new alloca is exactly the size of the vector type here.
- // - The accesses all either map to the entire vector or to a single
- // element.
- // - The set of accessing instructions is only one of those handled above
- // in isVectorPromotionViable. Generally these are the same access kinds
- // which are promotable via mem2reg.
- VectorType *VecTy;
- Type *ElementTy;
- uint64_t ElementSize;
-
- // The original offset of the slice currently being rewritten relative to
- // the original alloca.
- uint64_t BeginOffset = 0;
- uint64_t EndOffset = 0;
-
- // The new offsets of the slice currently being rewritten relative to the
- // original alloca.
- uint64_t NewBeginOffset = 0, NewEndOffset = 0;
-
- uint64_t SliceSize = 0;
- bool IsSplittable = false;
- bool IsSplit = false;
- Use *OldUse = nullptr;
- Instruction *OldPtr = nullptr;
-
- // Track post-rewrite users which are PHI nodes and Selects.
- SmallSetVector<PHINode *, 8> &PHIUsers;
- SmallSetVector<SelectInst *, 8> &SelectUsers;
-
- // Utility IR builder, whose name prefix is setup for each visited use, and
- // the insertion point is set to point to the user.
- IRBuilderTy IRB;
-
-public:
- AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
- AllocaInst &OldAI, AllocaInst &NewAI,
- uint64_t NewAllocaBeginOffset,
- uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
- VectorType *PromotableVecTy,
- SmallSetVector<PHINode *, 8> &PHIUsers,
- SmallSetVector<SelectInst *, 8> &SelectUsers)
- : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
- NewAllocaBeginOffset(NewAllocaBeginOffset),
- NewAllocaEndOffset(NewAllocaEndOffset),
- NewAllocaTy(NewAI.getAllocatedType()),
- IntTy(
- IsIntegerPromotable
- ? Type::getIntNTy(NewAI.getContext(),
- DL.getTypeSizeInBits(NewAI.getAllocatedType())
- .getFixedSize())
- : nullptr),
- VecTy(PromotableVecTy),
- ElementTy(VecTy ? VecTy->getElementType() : nullptr),
- ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
- : 0),
- PHIUsers(PHIUsers), SelectUsers(SelectUsers),
- IRB(NewAI.getContext(), ConstantFolder()) {
- if (VecTy) {
- assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
- "Only multiple-of-8 sized vector elements are viable");
- ++NumVectorized;
- }
- assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
- }
-
- bool visit(AllocaSlices::const_iterator I) {
- bool CanSROA = true;
- BeginOffset = I->beginOffset();
- EndOffset = I->endOffset();
- IsSplittable = I->isSplittable();
- IsSplit =
- BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
- LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
- LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
- LLVM_DEBUG(dbgs() << "\n");
-
- // Compute the intersecting offset range.
- assert(BeginOffset < NewAllocaEndOffset);
- assert(EndOffset > NewAllocaBeginOffset);
- NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
- NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
- SliceSize = NewEndOffset - NewBeginOffset;
-
- OldUse = I->getUse();
- OldPtr = cast<Instruction>(OldUse->get());
-
- Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
- IRB.SetInsertPoint(OldUserI);
- IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
- IRB.getInserter().SetNamePrefix(
- Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
-
- CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
- if (VecTy || IntTy)
- assert(CanSROA);
- return CanSROA;
- }
-
-private:
- // Make sure the other visit overloads are visible.
- using Base::visit;
-
- // Every instruction which can end up as a user must have a rewrite rule.
- bool visitInstruction(Instruction &I) {
- LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
- llvm_unreachable("No rewrite rule for this instruction!");
- }
-
- Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
- // Note that the offset computation can use BeginOffset or NewBeginOffset
- // interchangeably for unsplit slices.
- assert(IsSplit || BeginOffset == NewBeginOffset);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
-
-#ifndef NDEBUG
- StringRef OldName = OldPtr->getName();
- // Skip through the last '.sroa.' component of the name.
- size_t LastSROAPrefix = OldName.rfind(".sroa.");
- if (LastSROAPrefix != StringRef::npos) {
- OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
- // Look for an SROA slice index.
- size_t IndexEnd = OldName.find_first_not_of("0123456789");
- if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
- // Strip the index and look for the offset.
- OldName = OldName.substr(IndexEnd + 1);
- size_t OffsetEnd = OldName.find_first_not_of("0123456789");
- if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
- // Strip the offset.
- OldName = OldName.substr(OffsetEnd + 1);
- }
- }
- // Strip any SROA suffixes as well.
- OldName = OldName.substr(0, OldName.find(".sroa_"));
-#endif
-
- return getAdjustedPtr(IRB, DL, &NewAI,
- APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
- PointerTy,
-#ifndef NDEBUG
- Twine(OldName) + "."
-#else
- Twine()
-#endif
- );
- }
-
-  /// Compute a suitable alignment for accessing this slice of the *new*
-  /// alloca, based on the new alloca's alignment and the slice's byte offset
-  /// into it.
- Align getSliceAlign() {
- return commonAlignment(NewAI.getAlign(),
- NewBeginOffset - NewAllocaBeginOffset);
- }
-
- unsigned getIndex(uint64_t Offset) {
- assert(VecTy && "Can only call getIndex when rewriting a vector");
- uint64_t RelOffset = Offset - NewAllocaBeginOffset;
- assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
- uint32_t Index = RelOffset / ElementSize;
- assert(Index * ElementSize == RelOffset);
- return Index;
- }
-
- void deleteIfTriviallyDead(Value *V) {
- Instruction *I = cast<Instruction>(V);
- if (isInstructionTriviallyDead(I))
+
+ LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
+ return V;
+}
+
+/// Visitor to rewrite instructions using a particular slice of an alloca
+/// to use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class llvm::sroa::AllocaSliceRewriter
+ : public InstVisitor<AllocaSliceRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class InstVisitor<AllocaSliceRewriter, bool>;
+
+ using Base = InstVisitor<AllocaSliceRewriter, bool>;
+
+ const DataLayout &DL;
+ AllocaSlices &AS;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+ Type *NewAllocaTy;
+
+ // This is a convenience and flag variable that will be null unless the new
+ // alloca's integer operations should be widened to this integer type due to
+ // passing isIntegerWideningViable above. If it is non-null, the desired
+ // integer type will be stored here for easy access during rewriting.
+ IntegerType *IntTy;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+ // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+ // - The set of accessing instructions is only one of those handled above
+ // in isVectorPromotionViable. Generally these are the same access kinds
+ // which are promotable via mem2reg.
+ VectorType *VecTy;
+ Type *ElementTy;
+ uint64_t ElementSize;
+
+ // The original offset of the slice currently being rewritten relative to
+ // the original alloca.
+ uint64_t BeginOffset = 0;
+ uint64_t EndOffset = 0;
+
+ // The new offsets of the slice currently being rewritten relative to the
+ // original alloca.
+ uint64_t NewBeginOffset = 0, NewEndOffset = 0;
+
+ uint64_t SliceSize = 0;
+ bool IsSplittable = false;
+ bool IsSplit = false;
+ Use *OldUse = nullptr;
+ Instruction *OldPtr = nullptr;
+
+ // Track post-rewrite users which are PHI nodes and Selects.
+ SmallSetVector<PHINode *, 8> &PHIUsers;
+ SmallSetVector<SelectInst *, 8> &SelectUsers;
+
+  // Utility IR builder, whose name prefix is set up for each visited use, and
+ // the insertion point is set to point to the user.
+ IRBuilderTy IRB;
+
+public:
+ AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
+ AllocaInst &OldAI, AllocaInst &NewAI,
+ uint64_t NewAllocaBeginOffset,
+ uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+ VectorType *PromotableVecTy,
+ SmallSetVector<PHINode *, 8> &PHIUsers,
+ SmallSetVector<SelectInst *, 8> &SelectUsers)
+ : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+ NewAllocaBeginOffset(NewAllocaBeginOffset),
+ NewAllocaEndOffset(NewAllocaEndOffset),
+ NewAllocaTy(NewAI.getAllocatedType()),
+ IntTy(
+ IsIntegerPromotable
+ ? Type::getIntNTy(NewAI.getContext(),
+ DL.getTypeSizeInBits(NewAI.getAllocatedType())
+ .getFixedSize())
+ : nullptr),
+ VecTy(PromotableVecTy),
+ ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+ ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
+ : 0),
+ PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+ IRB(NewAI.getContext(), ConstantFolder()) {
+ if (VecTy) {
+ assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
+ "Only multiple-of-8 sized vector elements are viable");
+ ++NumVectorized;
+ }
+ assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
+ }
+
+ bool visit(AllocaSlices::const_iterator I) {
+ bool CanSROA = true;
+ BeginOffset = I->beginOffset();
+ EndOffset = I->endOffset();
+ IsSplittable = I->isSplittable();
+ IsSplit =
+ BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+ LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Compute the intersecting offset range.
+ assert(BeginOffset < NewAllocaEndOffset);
+ assert(EndOffset > NewAllocaBeginOffset);
+ NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+ NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+ SliceSize = NewEndOffset - NewBeginOffset;
+
+ OldUse = I->getUse();
+ OldPtr = cast<Instruction>(OldUse->get());
+
+ Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+ IRB.SetInsertPoint(OldUserI);
+ IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+ IRB.getInserter().SetNamePrefix(
+ Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+
+ CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
+ if (VecTy || IntTy)
+ assert(CanSROA);
+ return CanSROA;
+ }
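
The offset bookkeeping in visit() above reduces to clamping the slice's range into the new partition. A minimal standalone C++ sketch with invented offsets (illustration only, not part of the diff) reproduces the same arithmetic:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical partition [32, 48) of the original alloca and a slice
      // [40, 64) that extends past the partition's end.
      uint64_t NewAllocaBeginOffset = 32, NewAllocaEndOffset = 48;
      uint64_t BeginOffset = 40, EndOffset = 64;

      bool IsSplit =
          BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
      uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); // 40
      uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);       // 48
      uint64_t SliceSize = NewEndOffset - NewBeginOffset;                    // 8

      assert(IsSplit && SliceSize == 8);
      return 0;
    }
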
+
+private:
+ // Make sure the other visit overloads are visible.
+ using Base::visit;
+
+ // Every instruction which can end up as a user must have a rewrite rule.
+ bool visitInstruction(Instruction &I) {
+ LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ llvm_unreachable("No rewrite rule for this instruction!");
+ }
+
+ Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+ // Note that the offset computation can use BeginOffset or NewBeginOffset
+ // interchangeably for unsplit slices.
+ assert(IsSplit || BeginOffset == NewBeginOffset);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+ StringRef OldName = OldPtr->getName();
+ // Skip through the last '.sroa.' component of the name.
+ size_t LastSROAPrefix = OldName.rfind(".sroa.");
+ if (LastSROAPrefix != StringRef::npos) {
+ OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+ // Look for an SROA slice index.
+ size_t IndexEnd = OldName.find_first_not_of("0123456789");
+ if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+ // Strip the index and look for the offset.
+ OldName = OldName.substr(IndexEnd + 1);
+ size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+ if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+ // Strip the offset.
+ OldName = OldName.substr(OffsetEnd + 1);
+ }
+ }
+ // Strip any SROA suffixes as well.
+ OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+ return getAdjustedPtr(IRB, DL, &NewAI,
+ APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
+#ifndef NDEBUG
+ Twine(OldName) + "."
+#else
+ Twine()
+#endif
+ );
+ }
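
The NDEBUG-only stripping above only shapes debug value names. A rough std::string model (hypothetical input name, not the real StringRef code) shows how a name such as "x.sroa.0.16.copyload" collapses to "copyload" before the rewritten pointer is renamed:

    #include <cassert>
    #include <string>

    // Approximate model of the debug-name stripping performed above.
    static std::string stripSROAName(std::string Name) {
      size_t LastSROAPrefix = Name.rfind(".sroa.");
      if (LastSROAPrefix != std::string::npos) {
        Name = Name.substr(LastSROAPrefix + 6); // skip ".sroa."
        size_t IndexEnd = Name.find_first_not_of("0123456789");
        if (IndexEnd != std::string::npos && Name[IndexEnd] == '.') {
          Name = Name.substr(IndexEnd + 1);     // drop the slice index
          size_t OffsetEnd = Name.find_first_not_of("0123456789");
          if (OffsetEnd != std::string::npos && Name[OffsetEnd] == '.')
            Name = Name.substr(OffsetEnd + 1);  // drop the offset
        }
      }
      return Name.substr(0, Name.find(".sroa_")); // drop trailing SROA suffixes
    }

    int main() {
      assert(stripSROAName("x.sroa.0.16.copyload") == "copyload");
      assert(stripSROAName("plain_name") == "plain_name");
      return 0;
    }
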
+
+  /// Compute a suitable alignment for accessing this slice of the *new*
+  /// alloca, based on the new alloca's alignment and the slice's byte offset
+  /// into it.
+ Align getSliceAlign() {
+ return commonAlignment(NewAI.getAlign(),
+ NewBeginOffset - NewAllocaBeginOffset);
+ }
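
commonAlignment here acts as a greatest common power of two: the usable alignment is the largest power of two dividing both the new alloca's alignment and the slice's byte offset into it. A small arithmetic sketch (plain integers with sample values, not the LLVM Align type):

    #include <cassert>
    #include <cstdint>

    // Largest power of two dividing both A (itself a power of two) and Offset.
    // Offset == 0 keeps the full alignment A.
    static uint64_t commonAlign(uint64_t A, uint64_t Offset) {
      if (Offset == 0)
        return A;
      uint64_t LowBit = Offset & (~Offset + 1); // lowest set bit of Offset
      return A < LowBit ? A : LowBit;
    }

    int main() {
      assert(commonAlign(16, 0) == 16); // slice at the start keeps align 16
      assert(commonAlign(16, 4) == 4);  // offset 4 limits the access to align 4
      assert(commonAlign(8, 48) == 8);  // 48 is 16-aligned, so the alloca's 8 wins
      return 0;
    }
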
+
+ unsigned getIndex(uint64_t Offset) {
+ assert(VecTy && "Can only call getIndex when rewriting a vector");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+ uint32_t Index = RelOffset / ElementSize;
+ assert(Index * ElementSize == RelOffset);
+ return Index;
+ }
+
+ void deleteIfTriviallyDead(Value *V) {
+ Instruction *I = cast<Instruction>(V);
+ if (isInstructionTriviallyDead(I))
Pass.DeadInsts.push_back(I);
- }
-
- Value *rewriteVectorizedLoadInst() {
- unsigned BeginIndex = getIndex(NewBeginOffset);
- unsigned EndIndex = getIndex(NewEndOffset);
- assert(EndIndex > BeginIndex && "Empty vector!");
-
- Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
- }
-
- Value *rewriteIntegerLoad(LoadInst &LI) {
- assert(IntTy && "We cannot insert an integer to the alloca");
- assert(!LI.isVolatile());
- Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- V = convertValue(DL, IRB, V, IntTy);
- assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
- IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
- V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
- }
- // It is possible that the extracted type is not the load type. This
- // happens if there is a load past the end of the alloca, and as
- // a consequence the slice is narrower but still a candidate for integer
- // lowering. To handle this case, we just zero extend the extracted
- // integer.
- assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
- "Can only handle an extract for an overly wide load");
- if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
- V = IRB.CreateZExt(V, LI.getType());
- return V;
- }
-
- bool visitLoadInst(LoadInst &LI) {
- LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
- Value *OldOp = LI.getOperand(0);
- assert(OldOp == OldPtr);
-
- AAMDNodes AATags;
- LI.getAAMetadata(AATags);
-
- unsigned AS = LI.getPointerAddressSpace();
-
- Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
- : LI.getType();
- const bool IsLoadPastEnd =
- DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
- bool IsPtrAdjusted = false;
- Value *V;
- if (VecTy) {
- V = rewriteVectorizedLoadInst();
- } else if (IntTy && LI.getType()->isIntegerTy()) {
- V = rewriteIntegerLoad(LI);
- } else if (NewBeginOffset == NewAllocaBeginOffset &&
- NewEndOffset == NewAllocaEndOffset &&
- (canConvertValue(DL, NewAllocaTy, TargetTy) ||
- (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
- TargetTy->isIntegerTy()))) {
- LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), LI.isVolatile(),
- LI.getName());
- if (AATags)
+ }
+
+ Value *rewriteVectorizedLoadInst() {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+
+ Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
+ }
+
+ Value *rewriteIntegerLoad(LoadInst &LI) {
+ assert(IntTy && "We cannot insert an integer to the alloca");
+ assert(!LI.isVolatile());
+ Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ V = convertValue(DL, IRB, V, IntTy);
+ assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
+ IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
+ }
+ // It is possible that the extracted type is not the load type. This
+ // happens if there is a load past the end of the alloca, and as
+ // a consequence the slice is narrower but still a candidate for integer
+ // lowering. To handle this case, we just zero extend the extracted
+ // integer.
+ assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
+ "Can only handle an extract for an overly wide load");
+ if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
+ V = IRB.CreateZExt(V, LI.getType());
+ return V;
+ }
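
Assuming a little-endian layout, pulling a narrower slice out of the widened integer is a shift-and-mask, followed by a zero extension when the original load was wider than the slice. A standalone sketch with made-up values (this models the intent, not the exact extractInteger helper, and ignores the big-endian case):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Widened alloca value as an i64; a 4-byte load at byte offset 6 runs
      // past the end, so the slice is only the i16 covering bytes [6, 8).
      uint64_t AllocaVal = 0x1122334455667788ULL;
      unsigned ByteOffset = 6;

      // Little-endian: byte offset N lives at bit offset 8 * N.
      uint64_t Slice = (AllocaVal >> (8 * ByteOffset)) & 0xFFFFULL; // 0x1122

      // The original load wanted an i32, so the extracted i16 is zero-extended.
      uint32_t LoadResult = (uint32_t)Slice; // 0x00001122

      assert(Slice == 0x1122 && LoadResult == 0x00001122u);
      return 0;
    }
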
+
+ bool visitLoadInst(LoadInst &LI) {
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ Value *OldOp = LI.getOperand(0);
+ assert(OldOp == OldPtr);
+
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+
+ unsigned AS = LI.getPointerAddressSpace();
+
+ Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
+ : LI.getType();
+ const bool IsLoadPastEnd =
+ DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
+ bool IsPtrAdjusted = false;
+ Value *V;
+ if (VecTy) {
+ V = rewriteVectorizedLoadInst();
+ } else if (IntTy && LI.getType()->isIntegerTy()) {
+ V = rewriteIntegerLoad(LI);
+ } else if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+ (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
+ TargetTy->isIntegerTy()))) {
+ LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), LI.isVolatile(),
+ LI.getName());
+ if (AATags)
NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- if (NewLI->isAtomic())
- NewLI->setAlignment(LI.getAlign());
-
- // Any !nonnull metadata or !range metadata on the old load is also valid
-      // on the new load. This is true in some cases even when the loads
- // are different types, for example by mapping !nonnull metadata to
- // !range metadata by modeling the null pointer constant converted to the
- // integer type.
- // FIXME: Add support for range metadata here. Currently the utilities
- // for this don't propagate range metadata in trivial cases from one
- // integer load to another, don't handle non-addrspace-0 null pointers
- // correctly, and don't have any support for mapping ranges as the
-      // integer type becomes wider or narrower.
- if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
- copyNonnullMetadata(LI, N, *NewLI);
-
- // Try to preserve nonnull metadata
- V = NewLI;
-
- // If this is an integer load past the end of the slice (which means the
- // bytes outside the slice are undef or this load is dead) just forcibly
- // fix the integer size with correct handling of endianness.
- if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
- if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
- if (AITy->getBitWidth() < TITy->getBitWidth()) {
- V = IRB.CreateZExt(V, TITy, "load.ext");
- if (DL.isBigEndian())
- V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
- "endian_shift");
- }
- } else {
- Type *LTy = TargetTy->getPointerTo(AS);
- LoadInst *NewLI =
- IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
- getSliceAlign(), LI.isVolatile(), LI.getName());
- if (AATags)
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ if (NewLI->isAtomic())
+ NewLI->setAlignment(LI.getAlign());
+
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+      // on the new load. This is true in some cases even when the loads
+ // are different types, for example by mapping !nonnull metadata to
+ // !range metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+      // integer type becomes wider or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
+ // Try to preserve nonnull metadata
+ V = NewLI;
+
+ // If this is an integer load past the end of the slice (which means the
+ // bytes outside the slice are undef or this load is dead) just forcibly
+ // fix the integer size with correct handling of endianness.
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
+ if (AITy->getBitWidth() < TITy->getBitWidth()) {
+ V = IRB.CreateZExt(V, TITy, "load.ext");
+ if (DL.isBigEndian())
+ V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ }
+ } else {
+ Type *LTy = TargetTy->getPointerTo(AS);
+ LoadInst *NewLI =
+ IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(), LI.isVolatile(), LI.getName());
+ if (AATags)
NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
-
- V = NewLI;
- IsPtrAdjusted = true;
- }
- V = convertValue(DL, IRB, V, TargetTy);
-
- if (IsSplit) {
- assert(!LI.isVolatile());
- assert(LI.getType()->isIntegerTy() &&
- "Only integer type loads and stores are split");
- assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
- "Split load isn't smaller than original load");
- assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
- "Non-byte-multiple bit width");
- // Move the insertion point just past the load so that we can refer to it.
- IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
- // Create a placeholder value with the same type as LI to use as the
- // basis for the new value. This allows us to replace the uses of LI with
- // the computed value, and then replace the placeholder with LI, leaving
- // LI only used for this computation.
- Value *Placeholder = new LoadInst(
- LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
- false, Align(1));
- V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
- "insert");
- LI.replaceAllUsesWith(V);
- Placeholder->replaceAllUsesWith(&LI);
- Placeholder->deleteValue();
- } else {
- LI.replaceAllUsesWith(V);
- }
-
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ V = NewLI;
+ IsPtrAdjusted = true;
+ }
+ V = convertValue(DL, IRB, V, TargetTy);
+
+ if (IsSplit) {
+ assert(!LI.isVolatile());
+ assert(LI.getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
+ "Split load isn't smaller than original load");
+ assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
+ "Non-byte-multiple bit width");
+ // Move the insertion point just past the load so that we can refer to it.
+ IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
+ // Create a placeholder value with the same type as LI to use as the
+ // basis for the new value. This allows us to replace the uses of LI with
+ // the computed value, and then replace the placeholder with LI, leaving
+ // LI only used for this computation.
+ Value *Placeholder = new LoadInst(
+ LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
+ false, Align(1));
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+ "insert");
+ LI.replaceAllUsesWith(V);
+ Placeholder->replaceAllUsesWith(&LI);
+ Placeholder->deleteValue();
+ } else {
+ LI.replaceAllUsesWith(V);
+ }
+
Pass.DeadInsts.push_back(&LI);
- deleteIfTriviallyDead(OldOp);
- LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
- return !LI.isVolatile() && !IsPtrAdjusted;
- }
-
- bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
- AAMDNodes AATags) {
- if (V->getType() != VecTy) {
- unsigned BeginIndex = getIndex(NewBeginOffset);
- unsigned EndIndex = getIndex(NewEndOffset);
- assert(EndIndex > BeginIndex && "Empty vector!");
- unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
- "Too many elements!");
- Type *SliceTy = (NumElements == 1)
- ? ElementTy
- : FixedVectorType::get(ElementTy, NumElements);
- if (V->getType() != SliceTy)
- V = convertValue(DL, IRB, V, SliceTy);
-
- // Mix in the existing elements.
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- V = insertVector(IRB, Old, V, BeginIndex, "vec");
- }
- StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
- if (AATags)
+ deleteIfTriviallyDead(OldOp);
+ LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
+ return !LI.isVolatile() && !IsPtrAdjusted;
+ }
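
The endian_shift above only matters on big-endian targets, where the alloca's defined bytes must land in the most-significant end of the wider load. A worked example with assumed values (standalone C++, not LLVM IR):

    #include <cassert>
    #include <cstdint>

    int main() {
      // The new alloca holds an i16 (0xBEEF); the original load wanted an i32
      // that reads two bytes past the end of the slice.
      uint16_t AllocaVal = 0xBEEF;

      uint32_t LittleEndian = (uint32_t)AllocaVal;    // memory EF BE 00 00 -> 0x0000BEEF
      uint32_t BigEndian = (uint32_t)AllocaVal << 16; // memory BE EF 00 00 -> 0xBEEF0000

      // In both cases the two defined bytes keep the positions they had in
      // memory; only the undefined tail differs (zero-filled here).
      assert(LittleEndian == 0x0000BEEFu && BigEndian == 0xBEEF0000u);
      return 0;
    }
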
+
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
+ AAMDNodes AATags) {
+ if (V->getType() != VecTy) {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
+ Type *SliceTy = (NumElements == 1)
+ ? ElementTy
+ : FixedVectorType::get(ElementTy, NumElements);
+ if (V->getType() != SliceTy)
+ V = convertValue(DL, IRB, V, SliceTy);
+
+ // Mix in the existing elements.
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ V = insertVector(IRB, Old, V, BeginIndex, "vec");
+ }
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
+ if (AATags)
Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
Pass.DeadInsts.push_back(&SI);
-
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- return true;
- }
-
- bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
- assert(IntTy && "We cannot extract an integer from the alloca");
- assert(!SI.isVolatile());
- if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
- IntTy->getBitWidth()) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
- assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
- uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
- }
- V = convertValue(DL, IRB, V, NewAllocaTy);
- StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
- Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
- if (AATags)
+
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
+ assert(IntTy && "We cannot extract an integer from the alloca");
+ assert(!SI.isVolatile());
+ if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
+ IntTy->getBitWidth()) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
+ }
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
+ Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+ if (AATags)
Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
Pass.DeadInsts.push_back(&SI);
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- return true;
- }
-
- bool visitStoreInst(StoreInst &SI) {
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- Value *OldOp = SI.getOperand(1);
- assert(OldOp == OldPtr);
-
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
-
- Value *V = SI.getValueOperand();
-
- // Strip all inbounds GEPs and pointer casts to try to dig out any root
- // alloca that should be re-examined after promoting this alloca.
- if (V->getType()->isPointerTy())
- if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
- Pass.PostPromotionWorklist.insert(AI);
-
- if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
- assert(!SI.isVolatile());
- assert(V->getType()->isIntegerTy() &&
- "Only integer type loads and stores are split");
- assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
- "Non-byte-multiple bit width");
- IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
- V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
- "extract");
- }
-
- if (VecTy)
- return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
- if (IntTy && V->getType()->isIntegerTy())
- return rewriteIntegerStore(V, SI, AATags);
-
- const bool IsStorePastEnd =
- DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
- StoreInst *NewSI;
- if (NewBeginOffset == NewAllocaBeginOffset &&
- NewEndOffset == NewAllocaEndOffset &&
- (canConvertValue(DL, V->getType(), NewAllocaTy) ||
- (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
- V->getType()->isIntegerTy()))) {
- // If this is an integer store past the end of slice (and thus the bytes
- // past that point are irrelevant or this is unreachable), truncate the
- // value prior to storing.
- if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
- if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
- if (VITy->getBitWidth() > AITy->getBitWidth()) {
- if (DL.isBigEndian())
- V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
- "endian_shift");
- V = IRB.CreateTrunc(V, AITy, "load.trunc");
- }
-
- V = convertValue(DL, IRB, V, NewAllocaTy);
- NewSI =
- IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
- } else {
- unsigned AS = SI.getPointerAddressSpace();
- Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
- NewSI =
- IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
- }
- NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
- if (AATags)
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
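
When the store only covers part of the widened integer, the rewrite is a read-modify-write: load the old wide value, splice the narrow value in at the slice's bit offset, and store the whole thing back. A little-endian arithmetic sketch with invented values (insertInteger is the LLVM helper; this only models the masking it performs):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t OldAlloca = 0xAABBCCDD; // current widened alloca contents (i32)
      uint16_t StoredVal = 0x1234;     // i16 store into bytes [1, 3) of the alloca
      unsigned ByteOffset = 1;

      uint32_t Mask = 0xFFFFu << (8 * ByteOffset); // bits being replaced
      uint32_t NewAlloca = (OldAlloca & ~Mask) |
                           ((uint32_t)StoredVal << (8 * ByteOffset));

      assert(NewAlloca == 0xAA1234DD); // only the middle two bytes changed
      return 0;
    }
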
+
+ bool visitStoreInst(StoreInst &SI) {
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+
+ Value *V = SI.getValueOperand();
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after promoting this alloca.
+ if (V->getType()->isPointerTy())
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+ Pass.PostPromotionWorklist.insert(AI);
+
+ if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
+ assert(!SI.isVolatile());
+ assert(V->getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
+ "Non-byte-multiple bit width");
+ IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+ "extract");
+ }
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
+ if (IntTy && V->getType()->isIntegerTy())
+ return rewriteIntegerStore(V, SI, AATags);
+
+ const bool IsStorePastEnd =
+ DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
+ StoreInst *NewSI;
+ if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+ (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
+ V->getType()->isIntegerTy()))) {
+ // If this is an integer store past the end of slice (and thus the bytes
+ // past that point are irrelevant or this is unreachable), truncate the
+ // value prior to storing.
+ if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (VITy->getBitWidth() > AITy->getBitWidth()) {
+ if (DL.isBigEndian())
+ V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ V = IRB.CreateTrunc(V, AITy, "load.trunc");
+ }
+
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ NewSI =
+ IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
+ } else {
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
+ NewSI =
+ IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
+ }
+ NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+ if (AATags)
NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- if (SI.isVolatile())
- NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
- if (NewSI->isAtomic())
- NewSI->setAlignment(SI.getAlign());
+ if (SI.isVolatile())
+ NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ if (NewSI->isAtomic())
+ NewSI->setAlignment(SI.getAlign());
Pass.DeadInsts.push_back(&SI);
- deleteIfTriviallyDead(OldOp);
-
- LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
- return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
- }
-
- /// Compute an integer value from splatting an i8 across the given
- /// number of bytes.
- ///
- /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
- /// call this routine.
- /// FIXME: Heed the advice above.
- ///
- /// \param V The i8 value to splat.
- /// \param Size The number of bytes in the output (assuming i8 is one byte)
- Value *getIntegerSplat(Value *V, unsigned Size) {
- assert(Size > 0 && "Expected a positive number of bytes.");
- IntegerType *VTy = cast<IntegerType>(V->getType());
- assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
- if (Size == 1)
- return V;
-
- Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
- V = IRB.CreateMul(
- IRB.CreateZExt(V, SplatIntTy, "zext"),
- ConstantExpr::getUDiv(
- Constant::getAllOnesValue(SplatIntTy),
- ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
- SplatIntTy)),
- "isplat");
- return V;
- }
-
- /// Compute a vector splat for a given element value.
- Value *getVectorSplat(Value *V, unsigned NumElements) {
- V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
- LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
- return V;
- }
-
- bool visitMemSetInst(MemSetInst &II) {
- LLVM_DEBUG(dbgs() << " original: " << II << "\n");
- assert(II.getRawDest() == OldPtr);
-
- AAMDNodes AATags;
- II.getAAMetadata(AATags);
-
-    // If the memset has a variable size, it cannot be split; just adjust the
- // pointer to the new alloca.
- if (!isa<Constant>(II.getLength())) {
- assert(!IsSplit);
- assert(NewBeginOffset == BeginOffset);
- II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
- II.setDestAlignment(getSliceAlign());
-
- deleteIfTriviallyDead(OldPtr);
- return false;
- }
-
- // Record this instruction for deletion.
+ deleteIfTriviallyDead(OldOp);
+
+ LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ }
+
+ /// Compute an integer value from splatting an i8 across the given
+ /// number of bytes.
+ ///
+ /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+ /// call this routine.
+ /// FIXME: Heed the advice above.
+ ///
+ /// \param V The i8 value to splat.
+ /// \param Size The number of bytes in the output (assuming i8 is one byte)
+ Value *getIntegerSplat(Value *V, unsigned Size) {
+ assert(Size > 0 && "Expected a positive number of bytes.");
+ IntegerType *VTy = cast<IntegerType>(V->getType());
+ assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+ if (Size == 1)
+ return V;
+
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+ V = IRB.CreateMul(
+ IRB.CreateZExt(V, SplatIntTy, "zext"),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ "isplat");
+ return V;
+ }
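
The multiply trick above works because all-ones divided by 0xFF is the constant 0x0101...01, and multiplying a zero-extended byte by that constant replicates it into every byte. A standalone check with a sample byte:

    #include <assert.h>
    #include <stdint.h>

    // Splat one byte across Size bytes (Size <= 8), mirroring the IR above:
    // zext(V) * (allOnes(Size * 8) / 0xFF).
    static uint64_t splatByte(uint8_t V, unsigned Size) {
      uint64_t AllOnes = Size == 8 ? ~0ULL : (1ULL << (8 * Size)) - 1;
      uint64_t Replicator = AllOnes / 0xFF; // 0x0101...01 with Size bytes
      return (uint64_t)V * Replicator;
    }

    int main() {
      assert(splatByte(0xAB, 4) == 0xABABABABULL);
      assert(splatByte(0x00, 8) == 0);
      assert(splatByte(0xFF, 2) == 0xFFFF);
      return 0;
    }
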
+
+ /// Compute a vector splat for a given element value.
+ Value *getVectorSplat(Value *V, unsigned NumElements) {
+ V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+ LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
+ return V;
+ }
+
+ bool visitMemSetInst(MemSetInst &II) {
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+ assert(II.getRawDest() == OldPtr);
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
+    // If the memset has a variable size, it cannot be split; just adjust the
+ // pointer to the new alloca.
+ if (!isa<Constant>(II.getLength())) {
+ assert(!IsSplit);
+ assert(NewBeginOffset == BeginOffset);
+ II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
+ II.setDestAlignment(getSliceAlign());
+
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ // Record this instruction for deletion.
Pass.DeadInsts.push_back(&II);
-
- Type *AllocaTy = NewAI.getAllocatedType();
- Type *ScalarTy = AllocaTy->getScalarType();
-
- const bool CanContinue = [&]() {
- if (VecTy || IntTy)
- return true;
- if (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset)
- return false;
- auto *C = cast<ConstantInt>(II.getLength());
- if (C->getBitWidth() > 64)
- return false;
- const auto Len = C->getZExtValue();
- auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
- auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
- return canConvertValue(DL, SrcTy, AllocaTy) &&
- DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
- }();
-
- // If this doesn't map cleanly onto the alloca type, and that type isn't
- // a single value type, just emit a memset.
- if (!CanContinue) {
- Type *SizeTy = II.getLength()->getType();
- Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
- CallInst *New = IRB.CreateMemSet(
- getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
- MaybeAlign(getSliceAlign()), II.isVolatile());
- if (AATags)
+
+ Type *AllocaTy = NewAI.getAllocatedType();
+ Type *ScalarTy = AllocaTy->getScalarType();
+
+ const bool CanContinue = [&]() {
+ if (VecTy || IntTy)
+ return true;
+ if (BeginOffset > NewAllocaBeginOffset ||
+ EndOffset < NewAllocaEndOffset)
+ return false;
+ auto *C = cast<ConstantInt>(II.getLength());
+ if (C->getBitWidth() > 64)
+ return false;
+ const auto Len = C->getZExtValue();
+ auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
+ auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
+ return canConvertValue(DL, SrcTy, AllocaTy) &&
+ DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
+ }();
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memset.
+ if (!CanContinue) {
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+ CallInst *New = IRB.CreateMemSet(
+ getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+ MaybeAlign(getSliceAlign()), II.isVolatile());
+ if (AATags)
New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
- return false;
- }
-
- // If we can represent this as a simple value, we have to build the actual
- // value to store, which requires expanding the byte present in memset to
- // a sensible representation for the alloca type. This is essentially
- // splatting the byte to a sufficiently wide integer, splatting it across
- // any desired vector width, and bitcasting to the final type.
- Value *V;
-
- if (VecTy) {
- // If this is a memset of a vectorized alloca, insert it.
- assert(ElementTy == ScalarTy);
-
- unsigned BeginIndex = getIndex(NewBeginOffset);
- unsigned EndIndex = getIndex(NewEndOffset);
- assert(EndIndex > BeginIndex && "Empty vector!");
- unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
- "Too many elements!");
-
- Value *Splat = getIntegerSplat(
- II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
- Splat = convertValue(DL, IRB, Splat, ElementTy);
- if (NumElements > 1)
- Splat = getVectorSplat(Splat, NumElements);
-
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
- } else if (IntTy) {
- // If this is a memset on an alloca where we can widen stores, insert the
- // set integer.
- assert(!II.isVolatile());
-
- uint64_t Size = NewEndOffset - NewBeginOffset;
- V = getIntegerSplat(II.getValue(), Size);
-
- if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
- EndOffset != NewAllocaBeginOffset)) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- V = insertInteger(DL, IRB, Old, V, Offset, "insert");
- } else {
- assert(V->getType() == IntTy &&
- "Wrong type for an alloca wide integer!");
- }
- V = convertValue(DL, IRB, V, AllocaTy);
- } else {
- // Established these invariants above.
- assert(NewBeginOffset == NewAllocaBeginOffset);
- assert(NewEndOffset == NewAllocaEndOffset);
-
- V = getIntegerSplat(II.getValue(),
- DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
- if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
- V = getVectorSplat(
- V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
-
- V = convertValue(DL, IRB, V, AllocaTy);
- }
-
- StoreInst *New =
- IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
- if (AATags)
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // If we can represent this as a simple value, we have to build the actual
+ // value to store, which requires expanding the byte present in memset to
+ // a sensible representation for the alloca type. This is essentially
+ // splatting the byte to a sufficiently wide integer, splatting it across
+ // any desired vector width, and bitcasting to the final type.
+ Value *V;
+
+ if (VecTy) {
+ // If this is a memset of a vectorized alloca, insert it.
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
+
+ Value *Splat = getIntegerSplat(
+ II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
+ Splat = convertValue(DL, IRB, Splat, ElementTy);
+ if (NumElements > 1)
+ Splat = getVectorSplat(Splat, NumElements);
+
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ assert(!II.isVolatile());
+
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ V = getIntegerSplat(II.getValue(), Size);
+
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaBeginOffset)) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, V, Offset, "insert");
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ V = convertValue(DL, IRB, V, AllocaTy);
+ } else {
+ // Established these invariants above.
+ assert(NewBeginOffset == NewAllocaBeginOffset);
+ assert(NewEndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(II.getValue(),
+ DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(
+ V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
+
+ V = convertValue(DL, IRB, V, AllocaTy);
+ }
+
+ StoreInst *New =
+ IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
+ if (AATags)
New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
- return !II.isVolatile();
- }
-
- bool visitMemTransferInst(MemTransferInst &II) {
- // Rewriting of memory transfer instructions can be a bit tricky. We break
- // them into two categories: split intrinsics and unsplit intrinsics.
-
- LLVM_DEBUG(dbgs() << " original: " << II << "\n");
-
- AAMDNodes AATags;
- II.getAAMetadata(AATags);
-
- bool IsDest = &II.getRawDestUse() == OldUse;
- assert((IsDest && II.getRawDest() == OldPtr) ||
- (!IsDest && II.getRawSource() == OldPtr));
-
- MaybeAlign SliceAlign = getSliceAlign();
-
- // For unsplit intrinsics, we simply modify the source and destination
- // pointers in place. This isn't just an optimization, it is a matter of
- // correctness. With unsplit intrinsics we may be dealing with transfers
- // within a single alloca before SROA ran, or with transfers that have
- // a variable length. We may also be dealing with memmove instead of
-    // memcpy, and so simply updating the pointers is necessary for us to
- // update both source and dest of a single call.
- if (!IsSplittable) {
- Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- if (IsDest) {
- II.setDest(AdjustedPtr);
- II.setDestAlignment(SliceAlign);
- }
- else {
- II.setSource(AdjustedPtr);
- II.setSourceAlignment(SliceAlign);
- }
-
- LLVM_DEBUG(dbgs() << " to: " << II << "\n");
- deleteIfTriviallyDead(OldPtr);
- return false;
- }
- // For split transfer intrinsics we have an incredibly useful assurance:
- // the source and destination do not reside within the same alloca, and at
- // least one of them does not escape. This means that we can replace
- // memmove with memcpy, and we don't need to worry about all manner of
- // downsides to splitting and transforming the operations.
-
- // If this doesn't map cleanly onto the alloca type, and that type isn't
- // a single value type, just emit a memcpy.
- bool EmitMemCpy =
- !VecTy && !IntTy &&
- (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
- SliceSize !=
- DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
- !NewAI.getAllocatedType()->isSingleValueType());
-
- // If we're just going to emit a memcpy, the alloca hasn't changed, and the
- // size hasn't been shrunk based on analysis of the viable range, this is
- // a no-op.
- if (EmitMemCpy && &OldAI == &NewAI) {
- // Ensure the start lines up.
- assert(NewBeginOffset == BeginOffset);
-
- // Rewrite the size as needed.
- if (NewEndOffset != EndOffset)
- II.setLength(ConstantInt::get(II.getLength()->getType(),
- NewEndOffset - NewBeginOffset));
- return false;
- }
- // Record this instruction for deletion.
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
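
For the vector path, the memset becomes a read-modify-write over vector lanes: splat the byte to one element, splat that across the touched lanes, and insert those lanes into the loaded vector. A rough standalone model over a plain array (element type and lane range are invented):

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      // Model a <4 x i32> alloca; the memset covers lanes [1, 3).
      std::array<uint32_t, 4> Old = {0x11111111, 0x22222222, 0x33333333,
                                     0x44444444};
      uint8_t Byte = 0xCD;
      unsigned BeginIndex = 1, EndIndex = 3;

      uint32_t ElementSplat = 0xCDCDCDCD; // the byte splatted to one i32 element
      assert(ElementSplat == Byte * 0x01010101u);

      std::array<uint32_t, 4> New = Old;
      for (unsigned I = BeginIndex; I != EndIndex; ++I)
        New[I] = ElementSplat; // models insertVector over the touched lanes

      assert(New[0] == 0x11111111 && New[1] == 0xCDCDCDCD &&
             New[2] == 0xCDCDCDCD && New[3] == 0x44444444);
      return 0;
    }
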
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
+ bool IsDest = &II.getRawDestUse() == OldUse;
+ assert((IsDest && II.getRawDest() == OldPtr) ||
+ (!IsDest && II.getRawSource() == OldPtr));
+
+ MaybeAlign SliceAlign = getSliceAlign();
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+    // memcpy, and so simply updating the pointers is necessary for us to
+ // update both source and dest of a single call.
+ if (!IsSplittable) {
+ Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ if (IsDest) {
+ II.setDest(AdjustedPtr);
+ II.setDestAlignment(SliceAlign);
+ }
+ else {
+ II.setSource(AdjustedPtr);
+ II.setSourceAlignment(SliceAlign);
+ }
+
+ LLVM_DEBUG(dbgs() << " to: " << II << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+ // For split transfer intrinsics we have an incredibly useful assurance:
+ // the source and destination do not reside within the same alloca, and at
+ // least one of them does not escape. This means that we can replace
+ // memmove with memcpy, and we don't need to worry about all manner of
+ // downsides to splitting and transforming the operations.
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memcpy.
+ bool EmitMemCpy =
+ !VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize !=
+ DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
+ !NewAI.getAllocatedType()->isSingleValueType());
+
+ // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+ // size hasn't been shrunk based on analysis of the viable range, this is
+ // a no-op.
+ if (EmitMemCpy && &OldAI == &NewAI) {
+ // Ensure the start lines up.
+ assert(NewBeginOffset == BeginOffset);
+
+ // Rewrite the size as needed.
+ if (NewEndOffset != EndOffset)
+ II.setLength(ConstantInt::get(II.getLength()->getType(),
+ NewEndOffset - NewBeginOffset));
+ return false;
+ }
+ // Record this instruction for deletion.
Pass.DeadInsts.push_back(&II);
-
- // Strip all inbounds GEPs and pointer casts to try to dig out any root
- // alloca that should be re-examined after rewriting this instruction.
- Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
- if (AllocaInst *AI =
- dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
- assert(AI != &OldAI && AI != &NewAI &&
- "Splittable transfers cannot reach the same alloca on both ends.");
- Pass.Worklist.insert(AI);
- }
-
- Type *OtherPtrTy = OtherPtr->getType();
- unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
-
- // Compute the relative offset for the other pointer within the transfer.
- unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
- APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
- Align OtherAlign =
- (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
- OtherAlign =
- commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
-
- if (EmitMemCpy) {
- // Compute the other pointer, folding as much as possible to produce
- // a single, simple GEP in most cases.
- OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
- OtherPtr->getName() + ".");
-
- Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- Type *SizeTy = II.getLength()->getType();
- Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
-
- Value *DestPtr, *SrcPtr;
- MaybeAlign DestAlign, SrcAlign;
- // Note: IsDest is true iff we're copying into the new alloca slice
- if (IsDest) {
- DestPtr = OurPtr;
- DestAlign = SliceAlign;
- SrcPtr = OtherPtr;
- SrcAlign = OtherAlign;
- } else {
- DestPtr = OtherPtr;
- DestAlign = OtherAlign;
- SrcPtr = OurPtr;
- SrcAlign = SliceAlign;
- }
- CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
- Size, II.isVolatile());
- if (AATags)
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after rewriting this instruction.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+ assert(AI != &OldAI && AI != &NewAI &&
+ "Splittable transfers cannot reach the same alloca on both ends.");
+ Pass.Worklist.insert(AI);
+ }
+
+ Type *OtherPtrTy = OtherPtr->getType();
+ unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
+
+ // Compute the relative offset for the other pointer within the transfer.
+ unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
+ APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
+ Align OtherAlign =
+ (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
+ OtherAlign =
+ commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
+
+ if (EmitMemCpy) {
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+
+ Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+
+ Value *DestPtr, *SrcPtr;
+ MaybeAlign DestAlign, SrcAlign;
+ // Note: IsDest is true iff we're copying into the new alloca slice
+ if (IsDest) {
+ DestPtr = OurPtr;
+ DestAlign = SliceAlign;
+ SrcPtr = OtherPtr;
+ SrcAlign = OtherAlign;
+ } else {
+ DestPtr = OtherPtr;
+ DestAlign = OtherAlign;
+ SrcPtr = OurPtr;
+ SrcAlign = SliceAlign;
+ }
+ CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
+ Size, II.isVolatile());
+ if (AATags)
New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
- return false;
- }
-
- bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
- NewEndOffset == NewAllocaEndOffset;
- uint64_t Size = NewEndOffset - NewBeginOffset;
- unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
- unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
- unsigned NumElements = EndIndex - BeginIndex;
- IntegerType *SubIntTy =
- IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
-
- // Reset the other pointer type to match the register type we're going to
- // use, but using the address space of the original other pointer.
- Type *OtherTy;
- if (VecTy && !IsWholeAlloca) {
- if (NumElements == 1)
- OtherTy = VecTy->getElementType();
- else
- OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
- } else if (IntTy && !IsWholeAlloca) {
- OtherTy = SubIntTy;
- } else {
- OtherTy = NewAllocaTy;
- }
- OtherPtrTy = OtherTy->getPointerTo(OtherAS);
-
- Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
- OtherPtr->getName() + ".");
- MaybeAlign SrcAlign = OtherAlign;
- Value *DstPtr = &NewAI;
- MaybeAlign DstAlign = SliceAlign;
- if (!IsDest) {
- std::swap(SrcPtr, DstPtr);
- std::swap(SrcAlign, DstAlign);
- }
-
- Value *Src;
- if (VecTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
- } else if (IntTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- Src = convertValue(DL, IRB, Src, IntTy);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
- } else {
- LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
- II.isVolatile(), "copyload");
- if (AATags)
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset;
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
+ unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
+ unsigned NumElements = EndIndex - BeginIndex;
+ IntegerType *SubIntTy =
+ IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
+
+ // Reset the other pointer type to match the register type we're going to
+ // use, but using the address space of the original other pointer.
+ Type *OtherTy;
+ if (VecTy && !IsWholeAlloca) {
+ if (NumElements == 1)
+ OtherTy = VecTy->getElementType();
+ else
+ OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
+ } else if (IntTy && !IsWholeAlloca) {
+ OtherTy = SubIntTy;
+ } else {
+ OtherTy = NewAllocaTy;
+ }
+ OtherPtrTy = OtherTy->getPointerTo(OtherAS);
+
+ Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+ MaybeAlign SrcAlign = OtherAlign;
+ Value *DstPtr = &NewAI;
+ MaybeAlign DstAlign = SliceAlign;
+ if (!IsDest) {
+ std::swap(SrcPtr, DstPtr);
+ std::swap(SrcAlign, DstAlign);
+ }
+
+ Value *Src;
+ if (VecTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ Src = convertValue(DL, IRB, Src, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
+ } else {
+ LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
+ II.isVolatile(), "copyload");
+ if (AATags)
Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- Src = Load;
- }
-
- if (VecTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
- } else if (IntTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
- Src = convertValue(DL, IRB, Src, NewAllocaTy);
- }
-
- StoreInst *Store = cast<StoreInst>(
- IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
- if (AATags)
+ Src = Load;
+ }
+
+ if (VecTy && !IsWholeAlloca && IsDest) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && IsDest) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
+ Src = convertValue(DL, IRB, Src, NewAllocaTy);
+ }
+
+ StoreInst *Store = cast<StoreInst>(
+ IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
+ if (AATags)
Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- return !II.isVolatile();
- }
-
- bool visitIntrinsicInst(IntrinsicInst &II) {
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ return !II.isVolatile();
+ }
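
Each split piece of the transfer accesses the other pointer at OtherOffset = NewBeginOffset - BeginOffset, so a copy spanning several new partitions turns into one call per partition, each advanced into the other buffer by that amount. A tiny arithmetic sketch with assumed sizes (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Assumed: a 32-byte memcpy whose alloca side is split into two 16-byte
      // partitions, so the single slice [0, 32) is rewritten twice.
      uint64_t BeginOffset = 0;
      uint64_t PieceBegin[2] = {0, 16}; // NewBeginOffset for each partition
      uint64_t PieceEnd[2] = {16, 32};  // NewEndOffset for each partition

      for (int I = 0; I < 2; ++I) {
        uint64_t OtherOffset = PieceBegin[I] - BeginOffset; // offset into the
                                                            // other buffer
        uint64_t PieceLen = PieceEnd[I] - PieceBegin[I];    // length per call
        assert(PieceLen == 16 && OtherOffset == PieceBegin[I]);
      }
      return 0;
    }
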
+
+ bool visitIntrinsicInst(IntrinsicInst &II) {
assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
"Unexpected intrinsic!");
- LLVM_DEBUG(dbgs() << " original: " << II << "\n");
-
- // Record this instruction for deletion.
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ // Record this instruction for deletion.
Pass.DeadInsts.push_back(&II);
-
+
if (II.isDroppable()) {
assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
// TODO For now we forget assumed information, this can be improved.
@@ -3101,286 +3101,286 @@ private:
}
assert(II.getArgOperand(1) == OldPtr);
- // Lifetime intrinsics are only promotable if they cover the whole alloca.
- // Therefore, we drop lifetime intrinsics which don't cover the whole
- // alloca.
- // (In theory, intrinsics which partially cover an alloca could be
- // promoted, but PromoteMemToReg doesn't handle that case.)
- // FIXME: Check whether the alloca is promotable before dropping the
- // lifetime intrinsics?
- if (NewBeginOffset != NewAllocaBeginOffset ||
- NewEndOffset != NewAllocaEndOffset)
- return true;
-
- ConstantInt *Size =
- ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
- NewEndOffset - NewBeginOffset);
- // Lifetime intrinsics always expect an i8* so directly get such a pointer
- // for the new alloca slice.
- Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
- Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
- Value *New;
- if (II.getIntrinsicID() == Intrinsic::lifetime_start)
- New = IRB.CreateLifetimeStart(Ptr, Size);
- else
- New = IRB.CreateLifetimeEnd(Ptr, Size);
-
- (void)New;
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
-
- return true;
- }
-
- void fixLoadStoreAlign(Instruction &Root) {
- // This algorithm implements the same visitor loop as
- // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
- // or store found.
- SmallPtrSet<Instruction *, 4> Visited;
- SmallVector<Instruction *, 4> Uses;
- Visited.insert(&Root);
- Uses.push_back(&Root);
- do {
- Instruction *I = Uses.pop_back_val();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
- continue;
- }
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
- continue;
- }
-
- assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
- isa<PHINode>(I) || isa<SelectInst>(I) ||
- isa<GetElementPtrInst>(I));
- for (User *U : I->users())
- if (Visited.insert(cast<Instruction>(U)).second)
- Uses.push_back(cast<Instruction>(U));
- } while (!Uses.empty());
- }
-
- bool visitPHINode(PHINode &PN) {
- LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
- assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
- assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
-
- // We would like to compute a new pointer in only one place, but have it be
- // as local as possible to the PHI. To do that, we re-use the location of
- // the old pointer, which necessarily must be in the right position to
- // dominate the PHI.
- IRBuilderBase::InsertPointGuard Guard(IRB);
- if (isa<PHINode>(OldPtr))
- IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
- else
- IRB.SetInsertPoint(OldPtr);
- IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
-
- Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- // Replace the operands which were using the old pointer.
- std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
-
- LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
- deleteIfTriviallyDead(OldPtr);
-
- // Fix the alignment of any loads or stores using this PHI node.
- fixLoadStoreAlign(PN);
-
- // PHIs can't be promoted on their own, but often can be speculated. We
- // check the speculation outside of the rewriter so that we see the
- // fully-rewritten alloca.
- PHIUsers.insert(&PN);
- return true;
- }
-
- bool visitSelectInst(SelectInst &SI) {
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
- "Pointer isn't an operand!");
- assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
- assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
-
- Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- // Replace the operands which were using the old pointer.
- if (SI.getOperand(1) == OldPtr)
- SI.setOperand(1, NewPtr);
- if (SI.getOperand(2) == OldPtr)
- SI.setOperand(2, NewPtr);
-
- LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
- deleteIfTriviallyDead(OldPtr);
-
- // Fix the alignment of any loads or stores using this select.
- fixLoadStoreAlign(SI);
-
- // Selects can't be promoted on their own, but often can be speculated. We
- // check the speculation outside of the rewriter so that we see the
- // fully-rewritten alloca.
- SelectUsers.insert(&SI);
- return true;
- }
-};
-
-namespace {
-
-/// Visitor to rewrite aggregate loads and stores as scalar.
-///
-/// This pass aggressively rewrites all aggregate loads and stores on
-/// a particular pointer (or any pointer derived from it which we can identify)
-/// with scalar loads and stores.
-class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
- // Befriend the base class so it can delegate to private visit methods.
- friend class InstVisitor<AggLoadStoreRewriter, bool>;
-
- /// Queue of pointer uses to analyze and potentially rewrite.
- SmallVector<Use *, 8> Queue;
-
- /// Set to prevent us from cycling with phi nodes and loops.
- SmallPtrSet<User *, 8> Visited;
-
- /// The current pointer use being rewritten. This is used to dig up the used
- /// value (as opposed to the user).
- Use *U = nullptr;
-
- /// Used to calculate offsets, and hence alignment, of subobjects.
- const DataLayout &DL;
-
-public:
- AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
-
- /// Rewrite loads and stores through a pointer and all pointers derived from
- /// it.
- bool rewrite(Instruction &I) {
- LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
- enqueueUsers(I);
- bool Changed = false;
- while (!Queue.empty()) {
- U = Queue.pop_back_val();
- Changed |= visit(cast<Instruction>(U->getUser()));
- }
- return Changed;
- }
-
-private:
- /// Enqueue all the users of the given instruction for further processing.
- /// This uses a set to de-duplicate users.
- void enqueueUsers(Instruction &I) {
- for (Use &U : I.uses())
- if (Visited.insert(U.getUser()).second)
- Queue.push_back(&U);
- }
-
- // Conservative default is to not rewrite anything.
- bool visitInstruction(Instruction &I) { return false; }
-
- /// Generic recursive split emission class.
- template <typename Derived> class OpSplitter {
- protected:
- /// The builder used to form new instructions.
- IRBuilderTy IRB;
-
-    /// The indices to be used with insertvalue or extractvalue to select the
-    /// appropriate value within the aggregate.
- SmallVector<unsigned, 4> Indices;
-
- /// The indices to a GEP instruction which will move Ptr to the correct slot
- /// within the aggregate.
- SmallVector<Value *, 4> GEPIndices;
-
- /// The base pointer of the original op, used as a base for GEPing the
- /// split operations.
- Value *Ptr;
-
- /// The base pointee type being GEPed into.
- Type *BaseTy;
-
- /// Known alignment of the base pointer.
- Align BaseAlign;
-
-    /// Used to calculate the offset of each component so we can correctly
-    /// deduce alignments.
- const DataLayout &DL;
-
-    /// Initialize the splitter with an insertion point and Ptr, and start
-    /// with a single zero GEP index.
- OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- Align BaseAlign, const DataLayout &DL)
- : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
- BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
-
- public:
- /// Generic recursive split emission routine.
- ///
- /// This method recursively splits an aggregate op (load or store) into
- /// scalar or vector ops. It splits recursively until it hits a single value
- /// and emits that single value operation via the template argument.
- ///
- /// The logic of this routine relies on GEPs and insertvalue and
- /// extractvalue all operating with the same fundamental index list, merely
- /// formatted differently (GEPs need actual values).
- ///
- /// \param Ty The type being split recursively into smaller ops.
- /// \param Agg The aggregate value being built up or stored, depending on
- /// whether this is splitting a load or a store respectively.
- void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
- if (Ty->isSingleValueType()) {
- unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
- return static_cast<Derived *>(this)->emitFunc(
- Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
- }
-
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- unsigned OldSize = Indices.size();
- (void)OldSize;
- for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
- ++Idx) {
- assert(Indices.size() == OldSize && "Did not return to the old size");
- Indices.push_back(Idx);
- GEPIndices.push_back(IRB.getInt32(Idx));
- emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
- GEPIndices.pop_back();
- Indices.pop_back();
- }
- return;
- }
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- unsigned OldSize = Indices.size();
- (void)OldSize;
- for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
- ++Idx) {
- assert(Indices.size() == OldSize && "Did not return to the old size");
- Indices.push_back(Idx);
- GEPIndices.push_back(IRB.getInt32(Idx));
- emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
- GEPIndices.pop_back();
- Indices.pop_back();
- }
- return;
- }
-
- llvm_unreachable("Only arrays and structs are aggregate loadable types");
- }
- };
-
- struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
- AAMDNodes AATags;
-
- LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
- AATags(AATags) {}
-
- /// Emit a leaf load of a single value. This is called at the leaves of the
- /// recursive emission to actually load values.
- void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
- assert(Ty->isSingleValueType());
- // Load the single value and insert it using the indices.
- Value *GEP =
- IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
- LoadInst *Load =
- IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
+ // Lifetime intrinsics are only promotable if they cover the whole alloca.
+ // Therefore, we drop lifetime intrinsics which don't cover the whole
+ // alloca.
+ // (In theory, intrinsics which partially cover an alloca could be
+ // promoted, but PromoteMemToReg doesn't handle that case.)
+ // FIXME: Check whether the alloca is promotable before dropping the
+ // lifetime intrinsics?
+ if (NewBeginOffset != NewAllocaBeginOffset ||
+ NewEndOffset != NewAllocaEndOffset)
+ return true;
+
+ ConstantInt *Size =
+ ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ NewEndOffset - NewBeginOffset);
+    // Lifetime intrinsics always expect an i8*, so directly get such a pointer
+    // for the new alloca slice.
+    Type *PointerTy =
+        IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
+ Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
+ Value *New;
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+ New = IRB.CreateLifetimeStart(Ptr, Size);
+ else
+ New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+ (void)New;
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+
+ return true;
+ }
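+
+  // For example (hypothetical IR): if a 16-byte alloca is split and this
+  // slice becomes its own 8-byte alloca, a marker that covers the whole new
+  // slice, such as
+  //   call void @llvm.lifetime.start.p0i8(i64 16, i8* %old.ptr)
+  // is rebuilt above against an i8* of the new alloca with the slice's size:
+  //   call void @llvm.lifetime.start.p0i8(i64 8, i8* %new.slice.ptr)
+  // Markers that only partially cover the new alloca are dropped entirely.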
+
+ void fixLoadStoreAlign(Instruction &Root) {
+ // This algorithm implements the same visitor loop as
+ // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
+ // or store found.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<Instruction *, 4> Uses;
+ Visited.insert(&Root);
+ Uses.push_back(&Root);
+ do {
+ Instruction *I = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
+ continue;
+ }
+
+ assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
+ isa<PHINode>(I) || isa<SelectInst>(I) ||
+ isa<GetElementPtrInst>(I));
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(cast<Instruction>(U));
+ } while (!Uses.empty());
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
+ assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
+
+ // We would like to compute a new pointer in only one place, but have it be
+ // as local as possible to the PHI. To do that, we re-use the location of
+ // the old pointer, which necessarily must be in the right position to
+ // dominate the PHI.
+ IRBuilderBase::InsertPointGuard Guard(IRB);
+ if (isa<PHINode>(OldPtr))
+ IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
+ else
+ IRB.SetInsertPoint(OldPtr);
+ IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
+
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
+
+ LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // Fix the alignment of any loads or stores using this PHI node.
+ fixLoadStoreAlign(PN);
+
+ // PHIs can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ PHIUsers.insert(&PN);
+ return true;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
+ "Pointer isn't an operand!");
+ assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
+
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ if (SI.getOperand(1) == OldPtr)
+ SI.setOperand(1, NewPtr);
+ if (SI.getOperand(2) == OldPtr)
+ SI.setOperand(2, NewPtr);
+
+ LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // Fix the alignment of any loads or stores using this select.
+ fixLoadStoreAlign(SI);
+
+ // Selects can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ SelectUsers.insert(&SI);
+ return true;
+ }
+};
+
+namespace {
+
+/// Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class InstVisitor<AggLoadStoreRewriter, bool>;
+
+ /// Queue of pointer uses to analyze and potentially rewrite.
+ SmallVector<Use *, 8> Queue;
+
+ /// Set to prevent us from cycling with phi nodes and loops.
+ SmallPtrSet<User *, 8> Visited;
+
+ /// The current pointer use being rewritten. This is used to dig up the used
+ /// value (as opposed to the user).
+ Use *U = nullptr;
+
+ /// Used to calculate offsets, and hence alignment, of subobjects.
+ const DataLayout &DL;
+
+public:
+ AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
+
+ /// Rewrite loads and stores through a pointer and all pointers derived from
+ /// it.
+ bool rewrite(Instruction &I) {
+ LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ enqueueUsers(I);
+ bool Changed = false;
+ while (!Queue.empty()) {
+ U = Queue.pop_back_val();
+ Changed |= visit(cast<Instruction>(U->getUser()));
+ }
+ return Changed;
+ }
+
+private:
+ /// Enqueue all the users of the given instruction for further processing.
+ /// This uses a set to de-duplicate users.
+ void enqueueUsers(Instruction &I) {
+ for (Use &U : I.uses())
+ if (Visited.insert(U.getUser()).second)
+ Queue.push_back(&U);
+ }
+
+ // Conservative default is to not rewrite anything.
+ bool visitInstruction(Instruction &I) { return false; }
+
+ /// Generic recursive split emission class.
+ template <typename Derived> class OpSplitter {
+ protected:
+ /// The builder used to form new instructions.
+ IRBuilderTy IRB;
+
+    /// The indices to be used with insertvalue or extractvalue to select the
+    /// appropriate value within the aggregate.
+ SmallVector<unsigned, 4> Indices;
+
+ /// The indices to a GEP instruction which will move Ptr to the correct slot
+ /// within the aggregate.
+ SmallVector<Value *, 4> GEPIndices;
+
+ /// The base pointer of the original op, used as a base for GEPing the
+ /// split operations.
+ Value *Ptr;
+
+ /// The base pointee type being GEPed into.
+ Type *BaseTy;
+
+ /// Known alignment of the base pointer.
+ Align BaseAlign;
+
+    /// Used to calculate the offset of each component so we can correctly
+    /// deduce alignments.
+ const DataLayout &DL;
+
+    /// Initialize the splitter with an insertion point and Ptr, and start
+    /// with a single zero GEP index.
+ OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+ Align BaseAlign, const DataLayout &DL)
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
+ BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
+
+ public:
+ /// Generic recursive split emission routine.
+ ///
+ /// This method recursively splits an aggregate op (load or store) into
+ /// scalar or vector ops. It splits recursively until it hits a single value
+ /// and emits that single value operation via the template argument.
+ ///
+ /// The logic of this routine relies on GEPs and insertvalue and
+ /// extractvalue all operating with the same fundamental index list, merely
+ /// formatted differently (GEPs need actual values).
+ ///
+ /// \param Ty The type being split recursively into smaller ops.
+ /// \param Agg The aggregate value being built up or stored, depending on
+ /// whether this is splitting a load or a store respectively.
+ void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+ if (Ty->isSingleValueType()) {
+ unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
+ return static_cast<Derived *>(this)->emitFunc(
+ Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
+ }
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ llvm_unreachable("Only arrays and structs are aggregate loadable types");
+ }
+ };
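+
+  // For illustration of emitSplitOps above (hypothetical type): splitting an
+  // op on { i32, [2 x float] } visits three leaves with
+  //   Indices = {0},    GEPIndices = {0, 0}     -> the i32
+  //   Indices = {1, 0}, GEPIndices = {0, 1, 0}  -> the first float
+  //   Indices = {1, 1}, GEPIndices = {0, 1, 1}  -> the second float
+  // so the same index list drives both the GEPs and the insertvalue or
+  // extractvalue instructions emitted by the derived splitters below.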
+
+ struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+ AAMDNodes AATags;
+
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
+ DL),
+ AATags(AATags) {}
+
+ /// Emit a leaf load of a single value. This is called at the leaves of the
+ /// recursive emission to actually load values.
+ void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Load the single value and insert it using the indices.
+ Value *GEP =
+ IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
+ LoadInst *Load =
+ IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
APInt Offset(
DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
@@ -3388,51 +3388,51 @@ private:
GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
- Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
- LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
- }
- };
-
- bool visitLoadInst(LoadInst &LI) {
- assert(LI.getPointerOperand() == *U);
- if (!LI.isSimple() || LI.getType()->isSingleValueType())
- return false;
-
- // We have an aggregate being loaded, split it apart.
- LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
- AAMDNodes AATags;
- LI.getAAMetadata(AATags);
- LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
- getAdjustedAlignment(&LI, 0), DL);
- Value *V = UndefValue::get(LI.getType());
- Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
- Visited.erase(&LI);
- LI.replaceAllUsesWith(V);
- LI.eraseFromParent();
- return true;
- }
-
- struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
- StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
- AATags(AATags) {}
- AAMDNodes AATags;
- /// Emit a leaf store of a single value. This is called at the leaves of the
- /// recursive emission to actually produce stores.
- void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
- assert(Ty->isSingleValueType());
- // Extract the single value and store it using the indices.
- //
- // The gep and extractvalue values are factored out of the CreateStore
- // call to make the output independent of the argument evaluation order.
- Value *ExtractValue =
- IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
- Value *InBoundsGEP =
- IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
- StoreInst *Store =
- IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
+ Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+ LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
+ }
+ };
+
+ bool visitLoadInst(LoadInst &LI) {
+ assert(LI.getPointerOperand() == *U);
+ if (!LI.isSimple() || LI.getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being loaded, split it apart.
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+ LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
+ getAdjustedAlignment(&LI, 0), DL);
+ Value *V = UndefValue::get(LI.getType());
+ Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+ Visited.erase(&LI);
+ LI.replaceAllUsesWith(V);
+ LI.eraseFromParent();
+ return true;
+ }
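+
+  // For example (hypothetical IR): given %T = type { i32, float }, a simple
+  // aggregate load
+  //   %v = load %T, %T* %p
+  // is rewritten by the splitter above into per-field operations, roughly
+  //   %v.fca.0.gep = getelementptr inbounds %T, %T* %p, i32 0, i32 0
+  //   %v.fca.0.load = load i32, i32* %v.fca.0.gep
+  //   %v.fca.0.insert = insertvalue %T undef, i32 %v.fca.0.load, 0
+  //   %v.fca.1.gep = getelementptr inbounds %T, %T* %p, i32 0, i32 1
+  //   %v.fca.1.load = load float, float* %v.fca.1.gep
+  //   %v.fca.1.insert = insertvalue %T %v.fca.0.insert, float %v.fca.1.load, 1
+  // after which all uses of %v are replaced with the final insertvalue.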
+
+ struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
+ DL),
+ AATags(AATags) {}
+ AAMDNodes AATags;
+ /// Emit a leaf store of a single value. This is called at the leaves of the
+ /// recursive emission to actually produce stores.
+ void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Extract the single value and store it using the indices.
+ //
+ // The gep and extractvalue values are factored out of the CreateStore
+ // call to make the output independent of the argument evaluation order.
+ Value *ExtractValue =
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
+ Value *InBoundsGEP =
+ IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
+ StoreInst *Store =
+ IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
APInt Offset(
DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
@@ -3440,112 +3440,112 @@ private:
GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- }
- };
-
- bool visitStoreInst(StoreInst &SI) {
- if (!SI.isSimple() || SI.getPointerOperand() != *U)
- return false;
- Value *V = SI.getValueOperand();
- if (V->getType()->isSingleValueType())
- return false;
-
- // We have an aggregate being stored, split it apart.
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
- StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
- getAdjustedAlignment(&SI, 0), DL);
- Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
- Visited.erase(&SI);
- SI.eraseFromParent();
- return true;
- }
-
- bool visitBitCastInst(BitCastInst &BC) {
- enqueueUsers(BC);
- return false;
- }
-
- bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
- enqueueUsers(ASC);
- return false;
- }
-
- // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
- bool foldGEPSelect(GetElementPtrInst &GEPI) {
- if (!GEPI.hasAllConstantIndices())
- return false;
-
- SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
-
- LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
- << "\n original: " << *Sel
- << "\n " << GEPI);
-
- IRBuilderTy Builder(&GEPI);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ }
+ };
+
+ bool visitStoreInst(StoreInst &SI) {
+ if (!SI.isSimple() || SI.getPointerOperand() != *U)
+ return false;
+ Value *V = SI.getValueOperand();
+ if (V->getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being stored, split it apart.
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
+ getAdjustedAlignment(&SI, 0), DL);
+ Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+ Visited.erase(&SI);
+ SI.eraseFromParent();
+ return true;
+ }
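+
+  // The store path above mirrors the load path: for a hypothetical
+  //   store %T %agg, %T* %p
+  // with %T = type { i32, float }, each field is pulled out with an
+  // extractvalue (named "*.fca.N.extract") and written through a field GEP
+  // (named "*.fca.N.gep") as an ordinary scalar store.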
+
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC);
+ return false;
+ }
+
+ bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ enqueueUsers(ASC);
+ return false;
+ }
+
+ // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
+ bool foldGEPSelect(GetElementPtrInst &GEPI) {
+ if (!GEPI.hasAllConstantIndices())
+ return false;
+
+ SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
+
+ LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
+ << "\n original: " << *Sel
+ << "\n " << GEPI);
+
+ IRBuilderTy Builder(&GEPI);
SmallVector<Value *, 4> Index(GEPI.indices());
- bool IsInBounds = GEPI.isInBounds();
-
- Value *True = Sel->getTrueValue();
- Value *NTrue =
- IsInBounds
- ? Builder.CreateInBoundsGEP(True, Index,
- True->getName() + ".sroa.gep")
- : Builder.CreateGEP(True, Index, True->getName() + ".sroa.gep");
-
- Value *False = Sel->getFalseValue();
-
- Value *NFalse =
- IsInBounds
- ? Builder.CreateInBoundsGEP(False, Index,
- False->getName() + ".sroa.gep")
- : Builder.CreateGEP(False, Index, False->getName() + ".sroa.gep");
-
- Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
- Sel->getName() + ".sroa.sel");
- Visited.erase(&GEPI);
- GEPI.replaceAllUsesWith(NSel);
- GEPI.eraseFromParent();
- Instruction *NSelI = cast<Instruction>(NSel);
- Visited.insert(NSelI);
- enqueueUsers(*NSelI);
-
- LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
- << "\n " << *NFalse
- << "\n " << *NSel << '\n');
-
- return true;
- }
-
- // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
- bool foldGEPPhi(GetElementPtrInst &GEPI) {
- if (!GEPI.hasAllConstantIndices())
- return false;
-
- PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
- if (GEPI.getParent() != PHI->getParent() ||
- llvm::any_of(PHI->incoming_values(), [](Value *In)
- { Instruction *I = dyn_cast<Instruction>(In);
- return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
- succ_empty(I->getParent()) ||
- !I->getParent()->isLegalToHoistInto();
- }))
- return false;
-
- LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
- << "\n original: " << *PHI
- << "\n " << GEPI
- << "\n to: ");
-
+ bool IsInBounds = GEPI.isInBounds();
+
+ Value *True = Sel->getTrueValue();
+ Value *NTrue =
+ IsInBounds
+ ? Builder.CreateInBoundsGEP(True, Index,
+ True->getName() + ".sroa.gep")
+ : Builder.CreateGEP(True, Index, True->getName() + ".sroa.gep");
+
+ Value *False = Sel->getFalseValue();
+
+ Value *NFalse =
+ IsInBounds
+ ? Builder.CreateInBoundsGEP(False, Index,
+ False->getName() + ".sroa.gep")
+ : Builder.CreateGEP(False, Index, False->getName() + ".sroa.gep");
+
+ Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
+ Sel->getName() + ".sroa.sel");
+ Visited.erase(&GEPI);
+ GEPI.replaceAllUsesWith(NSel);
+ GEPI.eraseFromParent();
+ Instruction *NSelI = cast<Instruction>(NSel);
+ Visited.insert(NSelI);
+ enqueueUsers(*NSelI);
+
+ LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
+ << "\n " << *NFalse
+ << "\n " << *NSel << '\n');
+
+ return true;
+ }
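+
+  // For example (hypothetical IR): with all-constant indices, a pattern like
+  //   %sel = select i1 %c, i64* %a, i64* %b
+  //   %gep = getelementptr inbounds i64, i64* %sel, i64 1
+  // is rewritten above into
+  //   %a.sroa.gep = getelementptr inbounds i64, i64* %a, i64 1
+  //   %b.sroa.gep = getelementptr inbounds i64, i64* %b, i64 1
+  //   %sel.sroa.sel = select i1 %c, i64* %a.sroa.gep, i64* %b.sroa.gep
+  // so the select no longer stands between the alloca and its GEP users.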
+
+ // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
+ bool foldGEPPhi(GetElementPtrInst &GEPI) {
+ if (!GEPI.hasAllConstantIndices())
+ return false;
+
+ PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
+ if (GEPI.getParent() != PHI->getParent() ||
+ llvm::any_of(PHI->incoming_values(), [](Value *In)
+ { Instruction *I = dyn_cast<Instruction>(In);
+ return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
+ succ_empty(I->getParent()) ||
+ !I->getParent()->isLegalToHoistInto();
+ }))
+ return false;
+
+ LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
+ << "\n original: " << *PHI
+ << "\n " << GEPI
+ << "\n to: ");
+
SmallVector<Value *, 4> Index(GEPI.indices());
- bool IsInBounds = GEPI.isInBounds();
- IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
- PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
- PHI->getNumIncomingValues(),
- PHI->getName() + ".sroa.phi");
- for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
+ bool IsInBounds = GEPI.isInBounds();
+ IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
+ PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
+ PHI->getNumIncomingValues(),
+ PHI->getName() + ".sroa.phi");
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
BasicBlock *B = PHI->getIncomingBlock(I);
Value *NewVal = nullptr;
int Idx = NewPN->getBasicBlockIndex(B);
@@ -3553,354 +3553,354 @@ private:
NewVal = NewPN->getIncomingValue(Idx);
} else {
Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
-
+
IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
NewVal = IsInBounds
? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
: B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
}
NewPN->addIncoming(NewVal, B);
- }
-
- Visited.erase(&GEPI);
- GEPI.replaceAllUsesWith(NewPN);
- GEPI.eraseFromParent();
- Visited.insert(NewPN);
- enqueueUsers(*NewPN);
-
- LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
- dbgs() << "\n " << *In;
- dbgs() << "\n " << *NewPN << '\n');
-
- return true;
- }
-
- bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- if (isa<SelectInst>(GEPI.getPointerOperand()) &&
- foldGEPSelect(GEPI))
- return true;
-
- if (isa<PHINode>(GEPI.getPointerOperand()) &&
- foldGEPPhi(GEPI))
- return true;
-
- enqueueUsers(GEPI);
- return false;
- }
-
- bool visitPHINode(PHINode &PN) {
- enqueueUsers(PN);
- return false;
- }
-
- bool visitSelectInst(SelectInst &SI) {
- enqueueUsers(SI);
- return false;
- }
-};
-
-} // end anonymous namespace
-
-/// Strip aggregate type wrapping.
-///
-/// This removes no-op aggregate types wrapping an underlying type. It will
-/// strip as many layers of types as it can without changing either the type
-/// size or the allocated size.
-static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
- if (Ty->isSingleValueType())
- return Ty;
-
- uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
- uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
-
- Type *InnerTy;
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
- InnerTy = ArrTy->getElementType();
- } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- unsigned Index = SL->getElementContainingOffset(0);
- InnerTy = STy->getElementType(Index);
- } else {
- return Ty;
- }
-
- if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
- TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
- return Ty;
-
- return stripAggregateTypeWrapping(DL, InnerTy);
-}
-
-/// Try to find a partition of the aggregate type passed in for a given
-/// offset and size.
-///
-/// This recurses through the aggregate type and tries to compute a subtype
-/// based on the offset and size. When the offset and size span a sub-section
-/// of an array, it will even compute a new array type for that sub-section,
-/// and the same for structs.
-///
-/// Note that this routine is very strict and tries to find a partition of the
-/// type which produces the *exact* right offset and size. It is not forgiving
-/// when the size or offset causes either end of the type-based partition to
-/// be off.
-/// Also, this is a best-effort routine. It is reasonable to give up and not
-/// return a type if necessary.
-static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
- uint64_t Size) {
- if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
- return stripAggregateTypeWrapping(DL, Ty);
- if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
- (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
- return nullptr;
-
- if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
- Type *ElementTy;
- uint64_t TyNumElements;
- if (auto *AT = dyn_cast<ArrayType>(Ty)) {
- ElementTy = AT->getElementType();
- TyNumElements = AT->getNumElements();
- } else {
- // FIXME: This isn't right for vectors with non-byte-sized or
- // non-power-of-two sized elements.
- auto *VT = cast<FixedVectorType>(Ty);
- ElementTy = VT->getElementType();
- TyNumElements = VT->getNumElements();
- }
- uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
- uint64_t NumSkippedElements = Offset / ElementSize;
- if (NumSkippedElements >= TyNumElements)
- return nullptr;
- Offset -= NumSkippedElements * ElementSize;
-
- // First check if we need to recurse.
- if (Offset > 0 || Size < ElementSize) {
- // Bail if the partition ends in a different array element.
- if ((Offset + Size) > ElementSize)
- return nullptr;
- // Recurse through the element type trying to peel off offset bytes.
- return getTypePartition(DL, ElementTy, Offset, Size);
- }
- assert(Offset == 0);
-
- if (Size == ElementSize)
- return stripAggregateTypeWrapping(DL, ElementTy);
- assert(Size > ElementSize);
- uint64_t NumElements = Size / ElementSize;
- if (NumElements * ElementSize != Size)
- return nullptr;
- return ArrayType::get(ElementTy, NumElements);
- }
-
- StructType *STy = dyn_cast<StructType>(Ty);
- if (!STy)
- return nullptr;
-
- const StructLayout *SL = DL.getStructLayout(STy);
- if (Offset >= SL->getSizeInBytes())
- return nullptr;
- uint64_t EndOffset = Offset + Size;
- if (EndOffset > SL->getSizeInBytes())
- return nullptr;
-
- unsigned Index = SL->getElementContainingOffset(Offset);
- Offset -= SL->getElementOffset(Index);
-
- Type *ElementTy = STy->getElementType(Index);
- uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
- if (Offset >= ElementSize)
- return nullptr; // The offset points into alignment padding.
-
- // See if any partition must be contained by the element.
- if (Offset > 0 || Size < ElementSize) {
- if ((Offset + Size) > ElementSize)
- return nullptr;
- return getTypePartition(DL, ElementTy, Offset, Size);
- }
- assert(Offset == 0);
-
- if (Size == ElementSize)
- return stripAggregateTypeWrapping(DL, ElementTy);
-
- StructType::element_iterator EI = STy->element_begin() + Index,
- EE = STy->element_end();
- if (EndOffset < SL->getSizeInBytes()) {
- unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
- if (Index == EndIndex)
- return nullptr; // Within a single element and its padding.
-
- // Don't try to form "natural" types if the elements don't line up with the
- // expected size.
- // FIXME: We could potentially recurse down through the last element in the
- // sub-struct to find a natural end point.
- if (SL->getElementOffset(EndIndex) != EndOffset)
- return nullptr;
-
- assert(Index < EndIndex);
- EE = STy->element_begin() + EndIndex;
- }
-
- // Try to build up a sub-structure.
- StructType *SubTy =
- StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
- const StructLayout *SubSL = DL.getStructLayout(SubTy);
- if (Size != SubSL->getSizeInBytes())
- return nullptr; // The sub-struct doesn't have quite the size needed.
-
- return SubTy;
-}
-
-/// Pre-split loads and stores to simplify rewriting.
-///
-/// We want to break up the splittable load+store pairs as much as
-/// possible. This is important to do as a preprocessing step, as once we
-/// start rewriting the accesses to partitions of the alloca we lose the
-/// necessary information to correctly split apart paired loads and stores
-/// which both point into this alloca. The case to consider is something like
-/// the following:
-///
-/// %a = alloca [12 x i8]
-/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
-/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
-/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
-/// %iptr1 = bitcast i8* %gep1 to i64*
-/// %iptr2 = bitcast i8* %gep2 to i64*
-/// %fptr1 = bitcast i8* %gep1 to float*
-/// %fptr2 = bitcast i8* %gep2 to float*
-/// %fptr3 = bitcast i8* %gep3 to float*
-/// store float 0.0, float* %fptr1
-/// store float 1.0, float* %fptr2
-/// %v = load i64* %iptr1
-/// store i64 %v, i64* %iptr2
-/// %f1 = load float* %fptr2
-/// %f2 = load float* %fptr3
-///
-/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
-/// promote everything so we recover the 2 SSA values that should have been
-/// there all along.
-///
-/// \returns true if any changes are made.
-bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
- LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
-
- // Track the loads and stores which are candidates for pre-splitting here, in
- // the order they first appear during the partition scan. These give stable
- // iteration order and a basis for tracking which loads and stores we
- // actually split.
- SmallVector<LoadInst *, 4> Loads;
- SmallVector<StoreInst *, 4> Stores;
-
- // We need to accumulate the splits required of each load or store where we
- // can find them via a direct lookup. This is important to cross-check loads
- // and stores against each other. We also track the slice so that we can kill
- // all the slices that end up split.
- struct SplitOffsets {
- Slice *S;
- std::vector<uint64_t> Splits;
- };
- SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
-
- // Track loads out of this alloca which cannot, for any reason, be pre-split.
- // This is important as we also cannot pre-split stores of those loads!
- // FIXME: This is all pretty gross. It means that we can be more aggressive
- // in pre-splitting when the load feeding the store happens to come from
- // a separate alloca. Put another way, the effectiveness of SROA would be
- // decreased by a frontend which just concatenated all of its local allocas
- // into one big flat alloca. But defeating such patterns is exactly the job
-  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
-  // change store pre-splitting to actually force pre-splitting of the load
- // that feeds it *and all stores*. That makes pre-splitting much harder, but
- // maybe it would make it more principled?
- SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
-
- LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
- for (auto &P : AS.partitions()) {
- for (Slice &S : P) {
- Instruction *I = cast<Instruction>(S.getUse()->getUser());
- if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
- // If this is a load we have to track that it can't participate in any
- // pre-splitting. If this is a store of a load we have to track that
- // that load also can't participate in any pre-splitting.
- if (auto *LI = dyn_cast<LoadInst>(I))
- UnsplittableLoads.insert(LI);
- else if (auto *SI = dyn_cast<StoreInst>(I))
- if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
- UnsplittableLoads.insert(LI);
- continue;
- }
- assert(P.endOffset() > S.beginOffset() &&
- "Empty or backwards partition!");
-
- // Determine if this is a pre-splittable slice.
- if (auto *LI = dyn_cast<LoadInst>(I)) {
- assert(!LI->isVolatile() && "Cannot split volatile loads!");
-
- // The load must be used exclusively to store into other pointers for
- // us to be able to arbitrarily pre-split it. The stores must also be
- // simple to avoid changing semantics.
- auto IsLoadSimplyStored = [](LoadInst *LI) {
- for (User *LU : LI->users()) {
- auto *SI = dyn_cast<StoreInst>(LU);
- if (!SI || !SI->isSimple())
- return false;
- }
- return true;
- };
- if (!IsLoadSimplyStored(LI)) {
- UnsplittableLoads.insert(LI);
- continue;
- }
-
- Loads.push_back(LI);
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
- if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
- // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
- continue;
- auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
- if (!StoredLoad || !StoredLoad->isSimple())
- continue;
- assert(!SI->isVolatile() && "Cannot split volatile stores!");
-
- Stores.push_back(SI);
- } else {
- // Other uses cannot be pre-split.
- continue;
- }
-
- // Record the initial split.
- LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
- auto &Offsets = SplitOffsetsMap[I];
- assert(Offsets.Splits.empty() &&
- "Should not have splits the first time we see an instruction!");
- Offsets.S = &S;
- Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
- }
-
- // Now scan the already split slices, and add a split for any of them which
- // we're going to pre-split.
- for (Slice *S : P.splitSliceTails()) {
- auto SplitOffsetsMapI =
- SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
- if (SplitOffsetsMapI == SplitOffsetsMap.end())
- continue;
- auto &Offsets = SplitOffsetsMapI->second;
-
- assert(Offsets.S == S && "Found a mismatched slice!");
- assert(!Offsets.Splits.empty() &&
- "Cannot have an empty set of splits on the second partition!");
- assert(Offsets.Splits.back() ==
- P.beginOffset() - Offsets.S->beginOffset() &&
- "Previous split does not end where this one begins!");
-
- // Record each split. The last partition's end isn't needed as the size
- // of the slice dictates that.
- if (S->endOffset() > P.endOffset())
- Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
- }
- }
-
- // We may have split loads where some of their stores are split stores. For
- // such loads and stores, we can only pre-split them if their splits exactly
- // match relative to their starting offset. We have to verify this prior to
- // any rewriting.
+ }
+
+ Visited.erase(&GEPI);
+ GEPI.replaceAllUsesWith(NewPN);
+ GEPI.eraseFromParent();
+ Visited.insert(NewPN);
+ enqueueUsers(*NewPN);
+
+ LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
+ dbgs() << "\n " << *In;
+ dbgs() << "\n " << *NewPN << '\n');
+
+ return true;
+ }
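+
+  // For example (hypothetical IR): a GEP whose pointer operand is a phi in
+  // the same block, such as
+  //   %phi = phi i64* [ %a, %bb1 ], [ %b, %bb2 ]
+  //   %gep = getelementptr inbounds i64, i64* %phi, i64 1
+  // is rewritten above into per-incoming GEPs feeding a new phi, roughly
+  //   %phi.sroa.phi = phi i64* [ %a.sroa.gep, %bb1 ], [ %b.sroa.gep, %bb2 ]
+  // with each new GEP emitted immediately after its incoming value's
+  // definition.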
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (isa<SelectInst>(GEPI.getPointerOperand()) &&
+ foldGEPSelect(GEPI))
+ return true;
+
+ if (isa<PHINode>(GEPI.getPointerOperand()) &&
+ foldGEPPhi(GEPI))
+ return true;
+
+ enqueueUsers(GEPI);
+ return false;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ enqueueUsers(PN);
+ return false;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ enqueueUsers(SI);
+ return false;
+ }
+};
+
+} // end anonymous namespace
+
+/// Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+ if (Ty->isSingleValueType())
+ return Ty;
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
+ uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
+
+ Type *InnerTy;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ InnerTy = ArrTy->getElementType();
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Index = SL->getElementContainingOffset(0);
+ InnerTy = STy->getElementType(Index);
+ } else {
+ return Ty;
+ }
+
+ if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
+ TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
+ return Ty;
+
+ return stripAggregateTypeWrapping(DL, InnerTy);
+}
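+
+// For example (hypothetical types): { { i32 } } and [1 x i32] both strip down
+// to i32, because removing the wrapper changes neither the allocated size nor
+// the bit size, whereas { i32, i32 } is returned unchanged since its first
+// element is strictly smaller than the whole struct.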
+
+/// Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset causes either end of the type-based partition to
+/// be off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+ uint64_t Size) {
+ if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
+ return stripAggregateTypeWrapping(DL, Ty);
+ if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
+ (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
+ return nullptr;
+
+ if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
+ Type *ElementTy;
+ uint64_t TyNumElements;
+ if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+ ElementTy = AT->getElementType();
+ TyNumElements = AT->getNumElements();
+ } else {
+ // FIXME: This isn't right for vectors with non-byte-sized or
+ // non-power-of-two sized elements.
+ auto *VT = cast<FixedVectorType>(Ty);
+ ElementTy = VT->getElementType();
+ TyNumElements = VT->getNumElements();
+ }
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
+ uint64_t NumSkippedElements = Offset / ElementSize;
+ if (NumSkippedElements >= TyNumElements)
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+
+ // First check if we need to recurse.
+ if (Offset > 0 || Size < ElementSize) {
+ // Bail if the partition ends in a different array element.
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ // Recurse through the element type trying to peel off offset bytes.
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+ assert(Size > ElementSize);
+ uint64_t NumElements = Size / ElementSize;
+ if (NumElements * ElementSize != Size)
+ return nullptr;
+ return ArrayType::get(ElementTy, NumElements);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (Offset >= SL->getSizeInBytes())
+ return nullptr;
+ uint64_t EndOffset = Offset + Size;
+ if (EndOffset > SL->getSizeInBytes())
+ return nullptr;
+
+ unsigned Index = SL->getElementContainingOffset(Offset);
+ Offset -= SL->getElementOffset(Index);
+
+ Type *ElementTy = STy->getElementType(Index);
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
+ if (Offset >= ElementSize)
+ return nullptr; // The offset points into alignment padding.
+
+ // See if any partition must be contained by the element.
+ if (Offset > 0 || Size < ElementSize) {
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+
+ StructType::element_iterator EI = STy->element_begin() + Index,
+ EE = STy->element_end();
+ if (EndOffset < SL->getSizeInBytes()) {
+ unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+ if (Index == EndIndex)
+ return nullptr; // Within a single element and its padding.
+
+ // Don't try to form "natural" types if the elements don't line up with the
+ // expected size.
+ // FIXME: We could potentially recurse down through the last element in the
+ // sub-struct to find a natural end point.
+ if (SL->getElementOffset(EndIndex) != EndOffset)
+ return nullptr;
+
+ assert(Index < EndIndex);
+ EE = STy->element_begin() + EndIndex;
+ }
+
+ // Try to build up a sub-structure.
+ StructType *SubTy =
+ StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
+ const StructLayout *SubSL = DL.getStructLayout(SubTy);
+ if (Size != SubSL->getSizeInBytes())
+ return nullptr; // The sub-struct doesn't have quite the size needed.
+
+ return SubTy;
+}
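+
+// A worked example (hypothetical struct, typical data layout): for the type
+// { i32, i32, i64 } with fields at byte offsets 0, 4 and 8 and an alloc size
+// of 16, an (Offset, Size) of (8, 8) yields i64, (0, 8) rebuilds the leading
+// sub-struct { i32, i32 }, and (2, 4) yields nullptr because no type-based
+// boundary produces exactly that range.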
+
+/// Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ // Track loads out of this alloca which cannot, for any reason, be pre-split.
+ // This is important as we also cannot pre-split stores of those loads!
+ // FIXME: This is all pretty gross. It means that we can be more aggressive
+ // in pre-splitting when the load feeding the store happens to come from
+ // a separate alloca. Put another way, the effectiveness of SROA would be
+ // decreased by a frontend which just concatenated all of its local allocas
+ // into one big flat alloca. But defeating such patterns is exactly the job
+  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
+  // change store pre-splitting to actually force pre-splitting of the load
+ // that feeds it *and all stores*. That makes pre-splitting much harder, but
+ // maybe it would make it more principled?
+ SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+ LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
+ // If this is a load we have to track that it can't participate in any
+ // pre-splitting. If this is a store of a load we have to track that
+ // that load also can't participate in any pre-splitting.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ UnsplittableLoads.insert(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI)) {
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
// Lookup the load we are storing in our map of split
// offsets.
@@ -3909,25 +3909,25 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// and this store can't be pre-split.
if (UnsplittableLoads.count(LI))
return true;
-
+
auto LoadOffsetsI = SplitOffsetsMap.find(LI);
if (LoadOffsetsI == SplitOffsetsMap.end())
return false; // Unrelated loads are definitely safe.
auto &LoadOffsets = LoadOffsetsI->second;
-
+
// Now lookup the store's offsets.
auto &StoreOffsets = SplitOffsetsMap[SI];
-
+
// If the relative offsets of each split in the load and
// store match exactly, then we can split them and we
// don't need to remove them here.
if (LoadOffsets.Splits == StoreOffsets.Splits)
return false;
-
+
LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
<< " " << *LI << "\n"
<< " " << *SI << "\n");
-
+
// We've found a store and load that we need to split
// with mismatched relative splits. Just give up on them
// and remove both instructions from our list of
@@ -3935,330 +3935,330 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
UnsplittableLoads.insert(LI);
return true;
});
- // Now we have to go *back* through all the stores, because a later store may
- // have caused an earlier store's load to become unsplittable and if it is
- // unsplittable for the later store, then we can't rely on it being split in
- // the earlier store either.
+ // Now we have to go *back* through all the stores, because a later store may
+ // have caused an earlier store's load to become unsplittable and if it is
+ // unsplittable for the later store, then we can't rely on it being split in
+ // the earlier store either.
llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
auto *LI = cast<LoadInst>(SI->getValueOperand());
return UnsplittableLoads.count(LI);
});
- // Once we've established all the loads that can't be split for some reason,
- // filter any that made it into our list out.
+ // Once we've established all the loads that can't be split for some reason,
+ // filter any that made it into our list out.
llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
return UnsplittableLoads.count(LI);
});
-
- // If no loads or stores are left, there is no pre-splitting to be done for
- // this alloca.
- if (Loads.empty() && Stores.empty())
- return false;
-
- // From here on, we can't fail and will be building new accesses, so rig up
- // an IR builder.
- IRBuilderTy IRB(&AI);
-
- // Collect the new slices which we will merge into the alloca slices.
- SmallVector<Slice, 4> NewSlices;
-
- // Track any allocas we end up splitting loads and stores for so we iterate
- // on them.
- SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
-
- // At this point, we have collected all of the loads and stores we can
- // pre-split, and the specific splits needed for them. We actually do the
-  // splitting in a specific order so that we handle the case where one of
-  // the loads is the value operand to one of the stores.
- //
- // First, we rewrite all of the split loads, and just accumulate each split
- // load in a parallel structure. We also build the slices for them and append
- // them to the alloca slices.
- SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
- std::vector<LoadInst *> SplitLoads;
- const DataLayout &DL = AI.getModule()->getDataLayout();
- for (LoadInst *LI : Loads) {
- SplitLoads.clear();
-
- IntegerType *Ty = cast<IntegerType>(LI->getType());
- uint64_t LoadSize = Ty->getBitWidth() / 8;
- assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
-
- auto &Offsets = SplitOffsetsMap[LI];
- assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
- "Slice size should always match load size exactly!");
- uint64_t BaseOffset = Offsets.S->beginOffset();
- assert(BaseOffset + LoadSize > BaseOffset &&
- "Cannot represent alloca access size using 64-bit integers!");
-
- Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
- IRB.SetInsertPoint(LI);
-
- LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
-
- uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
- int Idx = 0, Size = Offsets.Splits.size();
- for (;;) {
- auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto AS = LI->getPointerAddressSpace();
- auto *PartPtrTy = PartTy->getPointerTo(AS);
- LoadInst *PLoad = IRB.CreateAlignedLoad(
- PartTy,
- getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- PartPtrTy, BasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset),
- /*IsVolatile*/ false, LI->getName());
- PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
-
- // Append this load onto the list of split loads so we can find it later
- // to rewrite the stores.
- SplitLoads.push_back(PLoad);
-
- // Now build a new slice for the alloca.
- NewSlices.push_back(
- Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
- &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
- /*IsSplittable*/ false));
- LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset()
- << "): " << *PLoad << "\n");
-
- // See if we've handled all the splits.
- if (Idx >= Size)
- break;
-
- // Setup the next partition.
- PartOffset = Offsets.Splits[Idx];
- ++Idx;
- PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
- }
-
- // Now that we have the split loads, do the slow walk over all uses of the
- // load and rewrite them as split stores, or save the split loads to use
- // below if the store is going to be split there anyways.
- bool DeferredStores = false;
- for (User *LU : LI->users()) {
- StoreInst *SI = cast<StoreInst>(LU);
- if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
- DeferredStores = true;
- LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
- << "\n");
- continue;
- }
-
- Value *StoreBasePtr = SI->getPointerOperand();
- IRB.SetInsertPoint(SI);
-
- LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
-
- for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
- LoadInst *PLoad = SplitLoads[Idx];
- uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
- auto *PartPtrTy =
- PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
-
- auto AS = SI->getPointerAddressSpace();
- StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad,
- getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset),
- /*IsVolatile*/ false);
- PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
- LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
- }
-
- // We want to immediately iterate on any allocas impacted by splitting
- // this store, and we have to track any promotable alloca (indicated by
- // a direct store) as needing to be resplit because it is no longer
- // promotable.
- if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
- ResplitPromotableAllocas.insert(OtherAI);
- Worklist.insert(OtherAI);
- } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
- StoreBasePtr->stripInBoundsOffsets())) {
- Worklist.insert(OtherAI);
- }
-
- // Mark the original store as dead.
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we iterate
+ // on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+  // splitting in a specific order so that we handle the case where one of
+  // the loads is the value operand to one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(LI);
+
+ LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
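+    // Worked example (hypothetical sizes): for LoadSize == 16 and
+    // Offsets.Splits == {4, 8}, the loop below emits an i32 load at +0, an
+    // i32 load at +4 and an i64 load at +8; each PartSize is the distance to
+    // the next split, or to LoadSize for the final part.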
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ PartTy,
+ getAdjustedPtr(IRB, DL, BasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ PartPtrTy, BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset),
+ /*IsVolatile*/ false, LI->getName());
+ PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PLoad << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyways.
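+    // Illustration (hypothetical case): a store with no recorded splits of
+    // its own gets one partial store per partial load at the same relative
+    // offsets, while a store that appears in SplitOffsetsMap is deferred to
+    // the dedicated store-splitting loop further down.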
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
+ << "\n");
+ continue;
+ }
+
+ Value *StoreBasePtr = SI->getPointerOperand();
+ IRB.SetInsertPoint(SI);
+
+ LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy =
+ PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset),
+ /*IsVolatile*/ false);
+ PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+ LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
DeadInsts.push_back(SI);
- }
-
- // Save the split loads if there are deferred stores among the users.
- if (DeferredStores)
- SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
-
- // Mark the original load as dead and kill the original slice.
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
DeadInsts.push_back(LI);
- Offsets.S->kill();
- }
-
- // Second, we rewrite all of the split stores. At this point, we know that
- // all loads from this alloca have been split already. For stores of such
- // loads, we can simply look up the pre-existing split loads. For stores of
- // other loads, we split those loads first and then write split stores of
- // them.
- for (StoreInst *SI : Stores) {
- auto *LI = cast<LoadInst>(SI->getValueOperand());
- IntegerType *Ty = cast<IntegerType>(LI->getType());
- uint64_t StoreSize = Ty->getBitWidth() / 8;
- assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
-
- auto &Offsets = SplitOffsetsMap[SI];
- assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
- "Slice size should always match load size exactly!");
- uint64_t BaseOffset = Offsets.S->beginOffset();
- assert(BaseOffset + StoreSize > BaseOffset &&
- "Cannot represent alloca access size using 64-bit integers!");
-
- Value *LoadBasePtr = LI->getPointerOperand();
- Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
-
- LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
-
- // Check whether we have an already split load.
- auto SplitLoadsMapI = SplitLoadsMap.find(LI);
- std::vector<LoadInst *> *SplitLoads = nullptr;
- if (SplitLoadsMapI != SplitLoadsMap.end()) {
- SplitLoads = &SplitLoadsMapI->second;
- assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
- "Too few split loads for the number of splits in the store!");
- } else {
- LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
- }
-
- uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
- int Idx = 0, Size = Offsets.Splits.size();
- for (;;) {
- auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
- auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
-
- // Either lookup a split load or create one.
- LoadInst *PLoad;
- if (SplitLoads) {
- PLoad = (*SplitLoads)[Idx];
- } else {
- IRB.SetInsertPoint(LI);
- auto AS = LI->getPointerAddressSpace();
- PLoad = IRB.CreateAlignedLoad(
- PartTy,
- getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- LoadPartPtrTy, LoadBasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset),
- /*IsVolatile*/ false, LI->getName());
- }
-
- // And store this partition.
- IRB.SetInsertPoint(SI);
- auto AS = SI->getPointerAddressSpace();
- StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad,
- getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- StorePartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset),
- /*IsVolatile*/ false);
-
- // Now build a new slice for the alloca.
- NewSlices.push_back(
- Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
- &PStore->getOperandUse(PStore->getPointerOperandIndex()),
- /*IsSplittable*/ false));
- LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset()
- << "): " << *PStore << "\n");
- if (!SplitLoads) {
- LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
- }
-
- // See if we've finished all the splits.
- if (Idx >= Size)
- break;
-
- // Setup the next partition.
- PartOffset = Offsets.Splits[Idx];
- ++Idx;
- PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
- }
-
- // We want to immediately iterate on any allocas impacted by splitting
- // this load, which is only relevant if it isn't a load of this alloca and
- // thus we didn't already split the loads above. We also have to keep track
- // of any promotable allocas we split loads on as they can no longer be
- // promoted.
- if (!SplitLoads) {
- if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
- assert(OtherAI != &AI && "We can't re-split our own alloca!");
- ResplitPromotableAllocas.insert(OtherAI);
- Worklist.insert(OtherAI);
- } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
- LoadBasePtr->stripInBoundsOffsets())) {
- assert(OtherAI != &AI && "We can't re-split our own alloca!");
- Worklist.insert(OtherAI);
- }
- }
-
- // Mark the original store as dead now that we've split it up and kill its
- // slice. Note that we leave the original load in place unless this store
- // was its only use. It may in turn be split up if it is an alloca load
- // for some other alloca, but it may be a normal load. This may introduce
- // redundant loads, but where those can be merged the rest of the optimizer
- // should handle the merging, and this uncovers SSA splits which is more
- // important. In practice, the original loads will almost always be fully
- // split and removed eventually, and the splits will be merged by any
- // trivial CSE, including instcombine.
- if (LI->hasOneUse()) {
- assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
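+  // Illustration (hypothetical case): a split store whose value comes from a
+  // load split above simply reuses the entries saved in SplitLoadsMap, while
+  // a store of some other (unsplit) load gets fresh partial loads created
+  // from that load's pointer before each partial store is emitted.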
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Value *LoadBasePtr = LI->getPointerOperand();
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(LI);
+ auto AS = LI->getPointerAddressSpace();
+ PLoad = IRB.CreateAlignedLoad(
+ PartTy,
+ getAdjustedPtr(IRB, DL, LoadBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset),
+ /*IsVolatile*/ false, LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(SI);
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset),
+ /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PStore << "\n");
+ if (!SplitLoads) {
+ LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've finished all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place unless this store
+ // was its only use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
DeadInsts.push_back(LI);
- }
+ }
DeadInsts.push_back(SI);
- Offsets.S->kill();
- }
-
-  // Remove the killed slices that have been pre-split.
+ Offsets.S->kill();
+ }
+
+  // Remove the killed slices that have been pre-split.
llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
-
- // Insert our new slices. This will sort and merge them into the sorted
- // sequence.
- AS.insert(NewSlices);
-
- LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
-#ifndef NDEBUG
- for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
- LLVM_DEBUG(AS.print(dbgs(), I, " "));
-#endif
-
-  // Finally, don't try to promote any allocas that now require re-splitting.
- // They have already been added to the worklist above.
+
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
+ AS.insert(NewSlices);
+
+ LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ LLVM_DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+  // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
return ResplitPromotableAllocas.count(AI);
});
-
- return true;
-}
-
-/// Rewrite an alloca partition's users.
-///
-/// This routine drives both of the rewriting goals of the SROA pass. It tries
-/// to rewrite uses of an alloca partition to be conducive for SSA value
-/// promotion. If the partition needs a new, more refined alloca, this will
-/// build that new alloca, preserving as much type information as possible, and
-/// rewrite the uses of the old alloca to point at the new one and have the
-/// appropriate new offsets. It also evaluates how successful the rewrite was
-/// at enabling promotion and if it was successful queues the alloca to be
-/// promoted.
-AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
- Partition &P) {
- // Try to compute a friendly type for this partition of the alloca. This
- // won't always succeed, in which case we fall back to a legal integer type
- // or an i8 array of an appropriate size.
- Type *SliceTy = nullptr;
- const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ return true;
+}
+
+/// Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ Partition &P) {
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ Type *SliceTy = nullptr;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
std::pair<Type *, IntegerType *> CommonUseTy =
findCommonType(P.begin(), P.end(), P.endOffset());
// Do all uses operate on the same type?
@@ -4266,103 +4266,103 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
SliceTy = CommonUseTy.first;
// If not, can we find an appropriate subtype in the original allocated type?
- if (!SliceTy)
- if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
- P.beginOffset(), P.size()))
- SliceTy = TypePartitionTy;
+ if (!SliceTy)
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+ P.beginOffset(), P.size()))
+ SliceTy = TypePartitionTy;
// If still not, can we use the largest bitwidth integer type used?
if (!SliceTy && CommonUseTy.second)
if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
SliceTy = CommonUseTy.second;
- if ((!SliceTy || (SliceTy->isArrayTy() &&
- SliceTy->getArrayElementType()->isIntegerTy())) &&
- DL.isLegalInteger(P.size() * 8))
- SliceTy = Type::getIntNTy(*C, P.size() * 8);
- if (!SliceTy)
- SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
- assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
-
- bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
-
- VectorType *VecTy =
- IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
- if (VecTy)
- SliceTy = VecTy;
-
- // Check for the case where we're going to rewrite to a new alloca of the
- // exact same type as the original, and with the same access offsets. In that
- // case, re-use the existing alloca, but still run through the rewriter to
- // perform phi and select speculation.
- // P.beginOffset() can be non-zero even with the same type in a case with
- // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
- AllocaInst *NewAI;
- if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
- NewAI = &AI;
- // FIXME: We should be able to bail at this point with "nothing changed".
- // FIXME: We might want to defer PHI speculation until after here.
- // FIXME: return nullptr;
- } else {
- // Make sure the alignment is compatible with P.beginOffset().
- const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
- // If we will get at least this much alignment from the type alone, leave
- // the alloca's alignment unconstrained.
- const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
- NewAI = new AllocaInst(
- SliceTy, AI.getType()->getAddressSpace(), nullptr,
- IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
- AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
- // Copy the old AI debug location over to the new one.
- NewAI->setDebugLoc(AI.getDebugLoc());
- ++NumNewAllocas;
- }
-
- LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << P.beginOffset() << "," << P.endOffset()
- << ") to: " << *NewAI << "\n");
-
- // Track the high watermark on the worklist as it is only relevant for
- // promoted allocas. We will reset it to this point if the alloca is not in
- // fact scheduled for promotion.
- unsigned PPWOldSize = PostPromotionWorklist.size();
- unsigned NumUses = 0;
- SmallSetVector<PHINode *, 8> PHIUsers;
- SmallSetVector<SelectInst *, 8> SelectUsers;
-
- AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
- P.endOffset(), IsIntegerPromotable, VecTy,
- PHIUsers, SelectUsers);
- bool Promotable = true;
- for (Slice *S : P.splitSliceTails()) {
- Promotable &= Rewriter.visit(S);
- ++NumUses;
- }
- for (Slice &S : P) {
- Promotable &= Rewriter.visit(&S);
- ++NumUses;
- }
-
- NumAllocaPartitionUses += NumUses;
- MaxUsesPerAllocaPartition.updateMax(NumUses);
-
- // Now that we've processed all the slices in the new partition, check if any
- // PHIs or Selects would block promotion.
- for (PHINode *PHI : PHIUsers)
- if (!isSafePHIToSpeculate(*PHI)) {
- Promotable = false;
- PHIUsers.clear();
- SelectUsers.clear();
- break;
- }
-
- for (SelectInst *Sel : SelectUsers)
- if (!isSafeSelectToSpeculate(*Sel)) {
- Promotable = false;
- PHIUsers.clear();
- SelectUsers.clear();
- break;
- }
-
- if (Promotable) {
+ if ((!SliceTy || (SliceTy->isArrayTy() &&
+ SliceTy->getArrayElementType()->isIntegerTy())) &&
+ DL.isLegalInteger(P.size() * 8))
+ SliceTy = Type::getIntNTy(*C, P.size() * 8);
+ if (!SliceTy)
+ SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+ assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
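+  // Roughly, the selection above tries: a type common to all uses that
+  // covers the partition, then a matching subtype of the original allocated
+  // type, then the widest integer type in use, then (also replacing an
+  // integer-array choice) a legal integer of the partition's size, and
+  // finally a [P.size() x i8] array.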
+
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+
+ VectorType *VecTy =
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+ if (VecTy)
+ SliceTy = VecTy;
+
+ // Check for the case where we're going to rewrite to a new alloca of the
+ // exact same type as the original, and with the same access offsets. In that
+ // case, re-use the existing alloca, but still run through the rewriter to
+ // perform phi and select speculation.
+ // P.beginOffset() can be non-zero even with the same type in a case with
+ // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
+ AllocaInst *NewAI;
+ if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
+ NewAI = &AI;
+ // FIXME: We should be able to bail at this point with "nothing changed".
+ // FIXME: We might want to defer PHI speculation until after here.
+ // FIXME: return nullptr;
+ } else {
+ // Make sure the alignment is compatible with P.beginOffset().
+ const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
+ // If we will get at least this much alignment from the type alone, leave
+ // the alloca's alignment unconstrained.
+ const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
+ NewAI = new AllocaInst(
+ SliceTy, AI.getType()->getAddressSpace(), nullptr,
+ IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
+ AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ // Copy the old AI debug location over to the new one.
+ NewAI->setDebugLoc(AI.getDebugLoc());
+ ++NumNewAllocas;
+ }
+
+ LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
+
+ // Track the high watermark on the worklist as it is only relevant for
+ // promoted allocas. We will reset it to this point if the alloca is not in
+ // fact scheduled for promotion.
+ unsigned PPWOldSize = PostPromotionWorklist.size();
+ unsigned NumUses = 0;
+ SmallSetVector<PHINode *, 8> PHIUsers;
+ SmallSetVector<SelectInst *, 8> SelectUsers;
+
+ AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ P.endOffset(), IsIntegerPromotable, VecTy,
+ PHIUsers, SelectUsers);
+ bool Promotable = true;
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
+ ++NumUses;
+ }
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
+ ++NumUses;
+ }
+
+ NumAllocaPartitionUses += NumUses;
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
+
+ // Now that we've processed all the slices in the new partition, check if any
+ // PHIs or Selects would block promotion.
+ for (PHINode *PHI : PHIUsers)
+ if (!isSafePHIToSpeculate(*PHI)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ for (SelectInst *Sel : SelectUsers)
+ if (!isSafeSelectToSpeculate(*Sel)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ if (Promotable) {
for (Use *U : AS.getDeadUsesIfPromotable()) {
auto *OldInst = dyn_cast<Instruction>(U->get());
Value::dropDroppableUse(*U);
@@ -4370,190 +4370,190 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (isInstructionTriviallyDead(OldInst))
DeadInsts.push_back(OldInst);
}
- if (PHIUsers.empty() && SelectUsers.empty()) {
- // Promote the alloca.
- PromotableAllocas.push_back(NewAI);
- } else {
- // If we have either PHIs or Selects to speculate, add them to those
-      // worklists and re-queue the new alloca so that we promote it on the
- // next iteration.
- for (PHINode *PHIUser : PHIUsers)
- SpeculatablePHIs.insert(PHIUser);
- for (SelectInst *SelectUser : SelectUsers)
- SpeculatableSelects.insert(SelectUser);
- Worklist.insert(NewAI);
- }
- } else {
- // Drop any post-promotion work items if promotion didn't happen.
- while (PostPromotionWorklist.size() > PPWOldSize)
- PostPromotionWorklist.pop_back();
-
-    // We couldn't promote and we didn't create a new partition, so nothing
-    // happened.
- if (NewAI == &AI)
- return nullptr;
-
- // If we can't promote the alloca, iterate on it to check for new
- // refinements exposed by splitting the current alloca. Don't iterate on an
- // alloca which didn't actually change and didn't get promoted.
- Worklist.insert(NewAI);
- }
-
- return NewAI;
-}
-
-/// Walks the slices of an alloca and forms partitions based on them,
-/// rewriting each of their uses.
-bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
- if (AS.begin() == AS.end())
- return false;
-
- unsigned NumPartitions = 0;
- bool Changed = false;
- const DataLayout &DL = AI.getModule()->getDataLayout();
-
- // First try to pre-split loads and stores.
- Changed |= presplitLoadsAndStores(AI, AS);
-
- // Now that we have identified any pre-splitting opportunities,
- // mark loads and stores unsplittable except for the following case.
- // We leave a slice splittable if all other slices are disjoint or fully
- // included in the slice, such as whole-alloca loads and stores.
- // If we fail to split these during pre-splitting, we want to force them
- // to be rewritten into a partition.
- bool IsSorted = true;
-
- uint64_t AllocaSize =
- DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
- const uint64_t MaxBitVectorSize = 1024;
- if (AllocaSize <= MaxBitVectorSize) {
- // If a byte boundary is included in any load or store, a slice starting or
- // ending at the boundary is not splittable.
- SmallBitVector SplittableOffset(AllocaSize + 1, true);
- for (Slice &S : AS)
- for (unsigned O = S.beginOffset() + 1;
- O < S.endOffset() && O < AllocaSize; O++)
- SplittableOffset.reset(O);
-
- for (Slice &S : AS) {
- if (!S.isSplittable())
- continue;
-
- if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
- (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
- continue;
-
- if (isa<LoadInst>(S.getUse()->getUser()) ||
- isa<StoreInst>(S.getUse()->getUser())) {
- S.makeUnsplittable();
- IsSorted = false;
- }
- }
-  } else {
- // We only allow whole-alloca splittable loads and stores
-    // for a large alloca to avoid creating too large a BitVector.
- for (Slice &S : AS) {
- if (!S.isSplittable())
- continue;
-
- if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
- continue;
-
- if (isa<LoadInst>(S.getUse()->getUser()) ||
- isa<StoreInst>(S.getUse()->getUser())) {
- S.makeUnsplittable();
- IsSorted = false;
- }
- }
- }
-
- if (!IsSorted)
- llvm::sort(AS);
-
- /// Describes the allocas introduced by rewritePartition in order to migrate
- /// the debug info.
- struct Fragment {
- AllocaInst *Alloca;
- uint64_t Offset;
- uint64_t Size;
- Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
- : Alloca(AI), Offset(O), Size(S) {}
- };
- SmallVector<Fragment, 4> Fragments;
-
- // Rewrite each partition.
- for (auto &P : AS.partitions()) {
- if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
- Changed = true;
- if (NewAI != &AI) {
- uint64_t SizeOfByte = 8;
- uint64_t AllocaSize =
- DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
- // Don't include any padding.
- uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
- Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
- }
- }
- ++NumPartitions;
- }
-
- NumAllocaPartitions += NumPartitions;
- MaxPartitionsPerAlloca.updateMax(NumPartitions);
-
- // Migrate debug information from the old alloca to the new alloca(s)
- // and the individual partitions.
- TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
+ if (PHIUsers.empty() && SelectUsers.empty()) {
+ // Promote the alloca.
+ PromotableAllocas.push_back(NewAI);
+ } else {
+ // If we have either PHIs or Selects to speculate, add them to those
+      // worklists and re-queue the new alloca so that we promote it on the
+ // next iteration.
+ for (PHINode *PHIUser : PHIUsers)
+ SpeculatablePHIs.insert(PHIUser);
+ for (SelectInst *SelectUser : SelectUsers)
+ SpeculatableSelects.insert(SelectUser);
+ Worklist.insert(NewAI);
+ }
+ } else {
+ // Drop any post-promotion work items if promotion didn't happen.
+ while (PostPromotionWorklist.size() > PPWOldSize)
+ PostPromotionWorklist.pop_back();
+
+    // We couldn't promote and we didn't create a new partition, so nothing
+    // happened.
+ if (NewAI == &AI)
+ return nullptr;
+
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
+ }
+
+ return NewAI;
+}
+
+/// Walks the slices of an alloca and forms partitions based on them,
+/// rewriting each of their uses.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+ if (AS.begin() == AS.end())
+ return false;
+
+ unsigned NumPartitions = 0;
+ bool Changed = false;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // First try to pre-split loads and stores.
+ Changed |= presplitLoadsAndStores(AI, AS);
+
+ // Now that we have identified any pre-splitting opportunities,
+ // mark loads and stores unsplittable except for the following case.
+ // We leave a slice splittable if all other slices are disjoint or fully
+ // included in the slice, such as whole-alloca loads and stores.
+ // If we fail to split these during pre-splitting, we want to force them
+ // to be rewritten into a partition.
+ bool IsSorted = true;
+
+ uint64_t AllocaSize =
+ DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
+ const uint64_t MaxBitVectorSize = 1024;
+ if (AllocaSize <= MaxBitVectorSize) {
+ // If a byte boundary is included in any load or store, a slice starting or
+ // ending at the boundary is not splittable.
+ SmallBitVector SplittableOffset(AllocaSize + 1, true);
+ for (Slice &S : AS)
+ for (unsigned O = S.beginOffset() + 1;
+ O < S.endOffset() && O < AllocaSize; O++)
+ SplittableOffset.reset(O);
+
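+    // Worked example (hypothetical layout): with AllocaSize == 8 and an i32
+    // access covering [0, 4), offsets 1-3 are cleared above, so a splittable
+    // load or store slice beginning or ending at one of those offsets is
+    // forced unsplittable by the loop below.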
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
+ (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+  } else {
+ // We only allow whole-alloca splittable loads and stores
+    // for a large alloca to avoid creating too large a BitVector.
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ }
+
+ if (!IsSorted)
+ llvm::sort(AS);
+
+ /// Describes the allocas introduced by rewritePartition in order to migrate
+ /// the debug info.
+ struct Fragment {
+ AllocaInst *Alloca;
+ uint64_t Offset;
+ uint64_t Size;
+ Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
+ : Alloca(AI), Offset(O), Size(S) {}
+ };
+ SmallVector<Fragment, 4> Fragments;
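+  // Illustration (hypothetical split): rewriting a 16-byte alloca into two
+  // 8-byte partitions records fragments (Offset 0, Size 64) and (Offset 64,
+  // Size 64); both fields are in bits, which is what the debug-info loop
+  // below expects.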
+
+ // Rewrite each partition.
+ for (auto &P : AS.partitions()) {
+ if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+ Changed = true;
+ if (NewAI != &AI) {
+ uint64_t SizeOfByte = 8;
+ uint64_t AllocaSize =
+ DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
+ // Don't include any padding.
+ uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+ Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
+ }
+ }
+ ++NumPartitions;
+ }
+
+ NumAllocaPartitions += NumPartitions;
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
+
+ // Migrate debug information from the old alloca to the new alloca(s)
+ // and the individual partitions.
+ TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
auto *Expr = DbgDeclare->getExpression();
- DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
- uint64_t AllocaSize =
- DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
- for (auto Fragment : Fragments) {
- // Create a fragment expression describing the new partition or reuse AI's
- // expression if there is only one partition.
- auto *FragmentExpr = Expr;
- if (Fragment.Size < AllocaSize || Expr->isFragment()) {
- // If this alloca is already a scalar replacement of a larger aggregate,
- // Fragment.Offset describes the offset inside the scalar.
- auto ExprFragment = Expr->getFragmentInfo();
- uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
- uint64_t Start = Offset + Fragment.Offset;
- uint64_t Size = Fragment.Size;
- if (ExprFragment) {
- uint64_t AbsEnd =
- ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
- if (Start >= AbsEnd)
- // No need to describe a SROAed padding.
- continue;
- Size = std::min(Size, AbsEnd - Start);
- }
- // The new, smaller fragment is stenciled out from the old fragment.
- if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
- assert(Start >= OrigFragment->OffsetInBits &&
- "new fragment is outside of original fragment");
- Start -= OrigFragment->OffsetInBits;
- }
-
- // The alloca may be larger than the variable.
+ DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
+ uint64_t AllocaSize =
+ DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
+ for (auto Fragment : Fragments) {
+ // Create a fragment expression describing the new partition or reuse AI's
+ // expression if there is only one partition.
+ auto *FragmentExpr = Expr;
+ if (Fragment.Size < AllocaSize || Expr->isFragment()) {
+ // If this alloca is already a scalar replacement of a larger aggregate,
+ // Fragment.Offset describes the offset inside the scalar.
+ auto ExprFragment = Expr->getFragmentInfo();
+ uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
+ uint64_t Start = Offset + Fragment.Offset;
+ uint64_t Size = Fragment.Size;
+ if (ExprFragment) {
+ uint64_t AbsEnd =
+ ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
+ if (Start >= AbsEnd)
+ // No need to describe a SROAed padding.
+ continue;
+ Size = std::min(Size, AbsEnd - Start);
+ }
+ // The new, smaller fragment is stenciled out from the old fragment.
+ if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
+ assert(Start >= OrigFragment->OffsetInBits &&
+ "new fragment is outside of original fragment");
+ Start -= OrigFragment->OffsetInBits;
+ }
+
+ // The alloca may be larger than the variable.
auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
- if (VarSize) {
- if (Size > *VarSize)
- Size = *VarSize;
- if (Size == 0 || Start + Size > *VarSize)
- continue;
- }
-
- // Avoid creating a fragment expression that covers the entire variable.
- if (!VarSize || *VarSize != Size) {
- if (auto E =
- DIExpression::createFragmentExpression(Expr, Start, Size))
- FragmentExpr = *E;
- else
- continue;
- }
- }
-
+ if (VarSize) {
+ if (Size > *VarSize)
+ Size = *VarSize;
+ if (Size == 0 || Start + Size > *VarSize)
+ continue;
+ }
+
+ // Avoid creating a fragment expression that covers the entire variable.
+ if (!VarSize || *VarSize != Size) {
+ if (auto E =
+ DIExpression::createFragmentExpression(Expr, Start, Size))
+ FragmentExpr = *E;
+ else
+ continue;
+ }
+ }
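+      // Worked example (hypothetical values): for a 128-bit variable, a
+      // non-fragment Expr and Fragment == (Offset 64, Size 64), the block
+      // above computes Start == 64 and Size == 64 and builds a fragment
+      // expression covering bits [64, 128) of the variable.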
+
// Remove any existing intrinsics on the new alloca describing
// the variable fragment.
for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
@@ -4566,262 +4566,262 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (SameVariableFragment(OldDII, DbgDeclare))
OldDII->eraseFromParent();
}
-
+
DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
DbgDeclare->getDebugLoc(), &AI);
- }
- }
- return Changed;
-}
-
-/// Clobber a use with undef, deleting the used value if it becomes dead.
-void SROA::clobberUse(Use &U) {
- Value *OldV = U;
- // Replace the use with an undef value.
- U = UndefValue::get(OldV->getType());
-
- // Check for this making an instruction dead. We have to garbage collect
- // all the dead instructions to ensure the uses of any alloca end up being
- // minimal.
- if (Instruction *OldI = dyn_cast<Instruction>(OldV))
- if (isInstructionTriviallyDead(OldI)) {
+ }
+ }
+ return Changed;
+}
+
+/// Clobber a use with undef, deleting the used value if it becomes dead.
+void SROA::clobberUse(Use &U) {
+ Value *OldV = U;
+ // Replace the use with an undef value.
+ U = UndefValue::get(OldV->getType());
+
+ // Check for this making an instruction dead. We have to garbage collect
+ // all the dead instructions to ensure the uses of any alloca end up being
+ // minimal.
+ if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+ if (isInstructionTriviallyDead(OldI)) {
DeadInsts.push_back(OldI);
- }
-}
-
-/// Analyze an alloca for SROA.
-///
-/// This analyzes the alloca to ensure we can reason about it, builds
-/// the slices of the alloca, and then hands it off to be split and
-/// rewritten as needed.
-bool SROA::runOnAlloca(AllocaInst &AI) {
- LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
- ++NumAllocasAnalyzed;
-
- // Special case dead allocas, as they're trivial.
- if (AI.use_empty()) {
- AI.eraseFromParent();
- return true;
- }
- const DataLayout &DL = AI.getModule()->getDataLayout();
-
- // Skip alloca forms that this analysis can't handle.
- auto *AT = AI.getAllocatedType();
- if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
- DL.getTypeAllocSize(AT).getFixedSize() == 0)
- return false;
-
- bool Changed = false;
-
- // First, split any FCA loads and stores touching this alloca to promote
- // better splitting and promotion opportunities.
- AggLoadStoreRewriter AggRewriter(DL);
- Changed |= AggRewriter.rewrite(AI);
-
- // Build the slices using a recursive instruction-visiting builder.
- AllocaSlices AS(DL, AI);
- LLVM_DEBUG(AS.print(dbgs()));
- if (AS.isEscaped())
- return Changed;
-
- // Delete all the dead users of this alloca before splitting and rewriting it.
- for (Instruction *DeadUser : AS.getDeadUsers()) {
- // Free up everything used by this instruction.
- for (Use &DeadOp : DeadUser->operands())
- clobberUse(DeadOp);
-
- // Now replace the uses of this instruction.
- DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
-
- // And mark it for deletion.
+ }
+}
+
+/// Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// the slices of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+ LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ ++NumAllocasAnalyzed;
+
+ // Special case dead allocas, as they're trivial.
+ if (AI.use_empty()) {
+ AI.eraseFromParent();
+ return true;
+ }
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // Skip alloca forms that this analysis can't handle.
+ auto *AT = AI.getAllocatedType();
+ if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
+ DL.getTypeAllocSize(AT).getFixedSize() == 0)
+ return false;
+
+ bool Changed = false;
+
+ // First, split any FCA loads and stores touching this alloca to promote
+ // better splitting and promotion opportunities.
+ AggLoadStoreRewriter AggRewriter(DL);
+ Changed |= AggRewriter.rewrite(AI);
+
+ // Build the slices using a recursive instruction-visiting builder.
+ AllocaSlices AS(DL, AI);
+ LLVM_DEBUG(AS.print(dbgs()));
+ if (AS.isEscaped())
+ return Changed;
+
+ // Delete all the dead users of this alloca before splitting and rewriting it.
+ for (Instruction *DeadUser : AS.getDeadUsers()) {
+ // Free up everything used by this instruction.
+ for (Use &DeadOp : DeadUser->operands())
+ clobberUse(DeadOp);
+
+ // Now replace the uses of this instruction.
+ DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+
+ // And mark it for deletion.
DeadInsts.push_back(DeadUser);
- Changed = true;
- }
- for (Use *DeadOp : AS.getDeadOperands()) {
- clobberUse(*DeadOp);
- Changed = true;
- }
-
- // No slices to split. Leave the dead alloca for a later pass to clean up.
- if (AS.begin() == AS.end())
- return Changed;
-
- Changed |= splitAlloca(AI, AS);
-
- LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
- while (!SpeculatablePHIs.empty())
- speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
-
- LLVM_DEBUG(dbgs() << " Speculating Selects\n");
- while (!SpeculatableSelects.empty())
- speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
-
- return Changed;
-}
-
-/// Delete the dead instructions accumulated in this run.
-///
-/// Recursively deletes the dead instructions we've accumulated. This is done
-/// at the very end to maximize locality of the recursive delete and to
-/// minimize the problems of invalidated instruction pointers as such pointers
-/// are used heavily in the intermediate stages of the algorithm.
-///
-/// We also record the alloca instructions deleted here so that they aren't
-/// subsequently handed to mem2reg to promote.
-bool SROA::deleteDeadInstructions(
- SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
- bool Changed = false;
- while (!DeadInsts.empty()) {
+ Changed = true;
+ }
+ for (Use *DeadOp : AS.getDeadOperands()) {
+ clobberUse(*DeadOp);
+ Changed = true;
+ }
+
+ // No slices to split. Leave the dead alloca for a later pass to clean up.
+ if (AS.begin() == AS.end())
+ return Changed;
+
+ Changed |= splitAlloca(AI, AS);
+
+ LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
+ while (!SpeculatablePHIs.empty())
+ speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+
+ LLVM_DEBUG(dbgs() << " Speculating Selects\n");
+ while (!SpeculatableSelects.empty())
+ speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+
+ return Changed;
+}
+
+/// Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+bool SROA::deleteDeadInstructions(
+ SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+ bool Changed = false;
+ while (!DeadInsts.empty()) {
Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
if (!I) continue;
- LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
-
- // If the instruction is an alloca, find the possible dbg.declare connected
- // to it, and remove it too. We must do this before calling RAUW or we will
- // not be able to find it.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
- DeletedAllocas.insert(AI);
- for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
- OldDII->eraseFromParent();
- }
-
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
-
- for (Use &Operand : I->operands())
- if (Instruction *U = dyn_cast<Instruction>(Operand)) {
- // Zero out the operand and see if it becomes trivially dead.
- Operand = nullptr;
- if (isInstructionTriviallyDead(U))
+ LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+ // If the instruction is an alloca, find the possible dbg.declare connected
+ // to it, and remove it too. We must do this before calling RAUW or we will
+ // not be able to find it.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ DeletedAllocas.insert(AI);
+ for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
+ OldDII->eraseFromParent();
+ }
+
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+ for (Use &Operand : I->operands())
+ if (Instruction *U = dyn_cast<Instruction>(Operand)) {
+ // Zero out the operand and see if it becomes trivially dead.
+ Operand = nullptr;
+ if (isInstructionTriviallyDead(U))
DeadInsts.push_back(U);
- }
-
- ++NumDeleted;
- I->eraseFromParent();
- Changed = true;
- }
- return Changed;
-}
-
-/// Promote the allocas, using the best available technique.
-///
-/// This attempts to promote whatever allocas have been identified as viable in
-/// the PromotableAllocas list. If that list is empty, there is nothing to do.
-/// This function returns whether any promotion occurred.
-bool SROA::promoteAllocas(Function &F) {
- if (PromotableAllocas.empty())
- return false;
-
- NumPromoted += PromotableAllocas.size();
-
- LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, AC);
- PromotableAllocas.clear();
- return true;
-}
-
-PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
- AssumptionCache &RunAC) {
- LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
- C = &F.getContext();
- DT = &RunDT;
- AC = &RunAC;
-
- BasicBlock &EntryBB = F.getEntryBlock();
- for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
- I != E; ++I) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
- if (isa<ScalableVectorType>(AI->getAllocatedType())) {
- if (isAllocaPromotable(AI))
- PromotableAllocas.push_back(AI);
- } else {
- Worklist.insert(AI);
- }
- }
- }
-
- bool Changed = false;
- // A set of deleted alloca instruction pointers which should be removed from
- // the list of promotable allocas.
- SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
-
- do {
- while (!Worklist.empty()) {
- Changed |= runOnAlloca(*Worklist.pop_back_val());
- Changed |= deleteDeadInstructions(DeletedAllocas);
-
- // Remove the deleted allocas from various lists so that we don't try to
- // continue processing them.
- if (!DeletedAllocas.empty()) {
- auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
- Worklist.remove_if(IsInSet);
- PostPromotionWorklist.remove_if(IsInSet);
+ }
+
+ ++NumDeleted;
+ I->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// This function returns whether any promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+ if (PromotableAllocas.empty())
+ return false;
+
+ NumPromoted += PromotableAllocas.size();
+
+ LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
+ PromotableAllocas.clear();
+ return true;
+}
+
+PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
+ AssumptionCache &RunAC) {
+ LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ C = &F.getContext();
+ DT = &RunDT;
+ AC = &RunAC;
+
+ BasicBlock &EntryBB = F.getEntryBlock();
+ for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
+ I != E; ++I) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ if (isa<ScalableVectorType>(AI->getAllocatedType())) {
+ if (isAllocaPromotable(AI))
+ PromotableAllocas.push_back(AI);
+ } else {
+ Worklist.insert(AI);
+ }
+ }
+ }
+
+ bool Changed = false;
+ // A set of deleted alloca instruction pointers which should be removed from
+ // the list of promotable allocas.
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+ do {
+ while (!Worklist.empty()) {
+ Changed |= runOnAlloca(*Worklist.pop_back_val());
+ Changed |= deleteDeadInstructions(DeletedAllocas);
+
+ // Remove the deleted allocas from various lists so that we don't try to
+ // continue processing them.
+ if (!DeletedAllocas.empty()) {
+ auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
+ Worklist.remove_if(IsInSet);
+ PostPromotionWorklist.remove_if(IsInSet);
llvm::erase_if(PromotableAllocas, IsInSet);
- DeletedAllocas.clear();
- }
- }
-
- Changed |= promoteAllocas(F);
-
- Worklist = PostPromotionWorklist;
- PostPromotionWorklist.clear();
- } while (!Worklist.empty());
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
- return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
- AM.getResult<AssumptionAnalysis>(F));
-}
-
-/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
-///
-/// This is in the llvm namespace purely to allow it to be a friend of the \c
-/// SROA pass.
-class llvm::sroa::SROALegacyPass : public FunctionPass {
- /// The SROA implementation.
- SROA Impl;
-
-public:
- static char ID;
-
- SROALegacyPass() : FunctionPass(ID) {
- initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto PA = Impl.runImpl(
- F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
- return !PA.areAllPreserved();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-
- StringRef getPassName() const override { return "SROA"; }
-};
-
-char SROALegacyPass::ID = 0;
-
-FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
-
-INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
- "Scalar Replacement Of Aggregates", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
- false, false)
+ DeletedAllocas.clear();
+ }
+ }
+
+ Changed |= promoteAllocas(F);
+
+ Worklist = PostPromotionWorklist;
+ PostPromotionWorklist.clear();
+ } while (!Worklist.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
+ return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getResult<AssumptionAnalysis>(F));
+}
+
+/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
+///
+/// This is in the llvm namespace purely to allow it to be a friend of the \c
+/// SROA pass.
+class llvm::sroa::SROALegacyPass : public FunctionPass {
+ /// The SROA implementation.
+ SROA Impl;
+
+public:
+ static char ID;
+
+ SROALegacyPass() : FunctionPass(ID) {
+ initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto PA = Impl.runImpl(
+ F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "SROA"; }
+};
+
+char SROALegacyPass::ID = 0;
+
+FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
+ "Scalar Replacement Of Aggregates", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
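For reference, a minimal sketch of how the SROA pass restored above can be driven from client code under LLVM 12's new pass manager. This is illustrative only and not part of the commit; runSROAOn is a hypothetical helper, and the equivalent command-line pipeline is opt -passes=sroa.

  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/Scalar/SROA.h"
  using namespace llvm;

  // Hypothetical helper: run SROA on a single function and report whether
  // it changed anything (mirrors what SROALegacyPass::runOnFunction does).
  static bool runSROAOn(Function &F) {
    FunctionAnalysisManager FAM;
    PassBuilder PB;
    // Registers the standard function analyses, including
    // DominatorTreeAnalysis and AssumptionAnalysis that SROA::run
    // fetches before delegating to runImpl.
    PB.registerFunctionAnalyses(FAM);
    FunctionPassManager FPM;
    FPM.addPass(SROA());
    PreservedAnalyses PA = FPM.run(F, FAM);
    return !PA.areAllPreserved();
  }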
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp
index 1a19157cdb..dba3dba24e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp
@@ -1,307 +1,307 @@
-//===-- Scalar.cpp --------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements common infrastructure for libLLVMScalarOpts.a, which
-// implements several scalar transformations over the LLVM intermediate
-// representation, including the C bindings for that library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/Scalar.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ScopedNoAliasAA.h"
-#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/Scalarizer.h"
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-
-using namespace llvm;
-
-/// initializeScalarOpts - Initialize all passes linked into the
-/// ScalarOpts library.
-void llvm::initializeScalarOpts(PassRegistry &Registry) {
- initializeADCELegacyPassPass(Registry);
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+
+using namespace llvm;
+
+/// initializeScalarOpts - Initialize all passes linked into the
+/// ScalarOpts library.
+void llvm::initializeScalarOpts(PassRegistry &Registry) {
+ initializeADCELegacyPassPass(Registry);
initializeAnnotationRemarksLegacyPass(Registry);
- initializeBDCELegacyPassPass(Registry);
- initializeAlignmentFromAssumptionsPass(Registry);
- initializeCallSiteSplittingLegacyPassPass(Registry);
- initializeConstantHoistingLegacyPassPass(Registry);
+ initializeBDCELegacyPassPass(Registry);
+ initializeAlignmentFromAssumptionsPass(Registry);
+ initializeCallSiteSplittingLegacyPassPass(Registry);
+ initializeConstantHoistingLegacyPassPass(Registry);
initializeConstraintEliminationPass(Registry);
- initializeCorrelatedValuePropagationPass(Registry);
- initializeDCELegacyPassPass(Registry);
- initializeDivRemPairsLegacyPassPass(Registry);
- initializeScalarizerLegacyPassPass(Registry);
- initializeDSELegacyPassPass(Registry);
- initializeGuardWideningLegacyPassPass(Registry);
- initializeLoopGuardWideningLegacyPassPass(Registry);
- initializeGVNLegacyPassPass(Registry);
- initializeNewGVNLegacyPassPass(Registry);
- initializeEarlyCSELegacyPassPass(Registry);
- initializeEarlyCSEMemSSALegacyPassPass(Registry);
- initializeMakeGuardsExplicitLegacyPassPass(Registry);
- initializeGVNHoistLegacyPassPass(Registry);
- initializeGVNSinkLegacyPassPass(Registry);
- initializeFlattenCFGPassPass(Registry);
- initializeIRCELegacyPassPass(Registry);
- initializeIndVarSimplifyLegacyPassPass(Registry);
- initializeInferAddressSpacesPass(Registry);
- initializeInstSimplifyLegacyPassPass(Registry);
- initializeJumpThreadingPass(Registry);
- initializeLegacyLICMPassPass(Registry);
- initializeLegacyLoopSinkPassPass(Registry);
- initializeLoopFuseLegacyPass(Registry);
- initializeLoopDataPrefetchLegacyPassPass(Registry);
- initializeLoopDeletionLegacyPassPass(Registry);
- initializeLoopAccessLegacyAnalysisPass(Registry);
- initializeLoopInstSimplifyLegacyPassPass(Registry);
+ initializeCorrelatedValuePropagationPass(Registry);
+ initializeDCELegacyPassPass(Registry);
+ initializeDivRemPairsLegacyPassPass(Registry);
+ initializeScalarizerLegacyPassPass(Registry);
+ initializeDSELegacyPassPass(Registry);
+ initializeGuardWideningLegacyPassPass(Registry);
+ initializeLoopGuardWideningLegacyPassPass(Registry);
+ initializeGVNLegacyPassPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
+ initializeEarlyCSELegacyPassPass(Registry);
+ initializeEarlyCSEMemSSALegacyPassPass(Registry);
+ initializeMakeGuardsExplicitLegacyPassPass(Registry);
+ initializeGVNHoistLegacyPassPass(Registry);
+ initializeGVNSinkLegacyPassPass(Registry);
+ initializeFlattenCFGPassPass(Registry);
+ initializeIRCELegacyPassPass(Registry);
+ initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
+ initializeInstSimplifyLegacyPassPass(Registry);
+ initializeJumpThreadingPass(Registry);
+ initializeLegacyLICMPassPass(Registry);
+ initializeLegacyLoopSinkPassPass(Registry);
+ initializeLoopFuseLegacyPass(Registry);
+ initializeLoopDataPrefetchLegacyPassPass(Registry);
+ initializeLoopDeletionLegacyPassPass(Registry);
+ initializeLoopAccessLegacyAnalysisPass(Registry);
+ initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangeLegacyPassPass(Registry);
initializeLoopFlattenLegacyPassPass(Registry);
- initializeLoopPredicationLegacyPassPass(Registry);
- initializeLoopRotateLegacyPassPass(Registry);
- initializeLoopStrengthReducePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
+ initializeLoopRotateLegacyPassPass(Registry);
+ initializeLoopStrengthReducePass(Registry);
initializeLoopRerollLegacyPassPass(Registry);
- initializeLoopUnrollPass(Registry);
- initializeLoopUnrollAndJamPass(Registry);
- initializeLoopUnswitchPass(Registry);
- initializeWarnMissedTransformationsLegacyPass(Registry);
+ initializeLoopUnrollPass(Registry);
+ initializeLoopUnrollAndJamPass(Registry);
+ initializeLoopUnswitchPass(Registry);
+ initializeWarnMissedTransformationsLegacyPass(Registry);
initializeLoopVersioningLICMLegacyPassPass(Registry);
- initializeLoopIdiomRecognizeLegacyPassPass(Registry);
- initializeLowerAtomicLegacyPassPass(Registry);
- initializeLowerConstantIntrinsicsPass(Registry);
- initializeLowerExpectIntrinsicPass(Registry);
- initializeLowerGuardIntrinsicLegacyPassPass(Registry);
- initializeLowerMatrixIntrinsicsLegacyPassPass(Registry);
+ initializeLoopIdiomRecognizeLegacyPassPass(Registry);
+ initializeLowerAtomicLegacyPassPass(Registry);
+ initializeLowerConstantIntrinsicsPass(Registry);
+ initializeLowerExpectIntrinsicPass(Registry);
+ initializeLowerGuardIntrinsicLegacyPassPass(Registry);
+ initializeLowerMatrixIntrinsicsLegacyPassPass(Registry);
initializeLowerMatrixIntrinsicsMinimalLegacyPassPass(Registry);
- initializeLowerWidenableConditionLegacyPassPass(Registry);
- initializeMemCpyOptLegacyPassPass(Registry);
- initializeMergeICmpsLegacyPassPass(Registry);
- initializeMergedLoadStoreMotionLegacyPassPass(Registry);
- initializeNaryReassociateLegacyPassPass(Registry);
- initializePartiallyInlineLibCallsLegacyPassPass(Registry);
- initializeReassociateLegacyPassPass(Registry);
- initializeRedundantDbgInstEliminationPass(Registry);
+ initializeLowerWidenableConditionLegacyPassPass(Registry);
+ initializeMemCpyOptLegacyPassPass(Registry);
+ initializeMergeICmpsLegacyPassPass(Registry);
+ initializeMergedLoadStoreMotionLegacyPassPass(Registry);
+ initializeNaryReassociateLegacyPassPass(Registry);
+ initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+ initializeReassociateLegacyPassPass(Registry);
+ initializeRedundantDbgInstEliminationPass(Registry);
initializeRegToMemLegacyPass(Registry);
- initializeRewriteStatepointsForGCLegacyPassPass(Registry);
+ initializeRewriteStatepointsForGCLegacyPassPass(Registry);
initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
- initializeSCCPLegacyPassPass(Registry);
- initializeSROALegacyPassPass(Registry);
- initializeCFGSimplifyPassPass(Registry);
+ initializeSCCPLegacyPassPass(Registry);
+ initializeSROALegacyPassPass(Registry);
+ initializeCFGSimplifyPassPass(Registry);
initializeStructurizeCFGLegacyPassPass(Registry);
- initializeSimpleLoopUnswitchLegacyPassPass(Registry);
- initializeSinkingLegacyPassPass(Registry);
- initializeTailCallElimPass(Registry);
+ initializeSimpleLoopUnswitchLegacyPassPass(Registry);
+ initializeSinkingLegacyPassPass(Registry);
+ initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry);
- initializeSpeculativeExecutionLegacyPassPass(Registry);
+ initializeSpeculativeExecutionLegacyPassPass(Registry);
initializeStraightLineStrengthReduceLegacyPassPass(Registry);
- initializePlaceBackedgeSafepointsImplPass(Registry);
- initializePlaceSafepointsPass(Registry);
- initializeFloat2IntLegacyPassPass(Registry);
- initializeLoopDistributeLegacyPass(Registry);
- initializeLoopLoadEliminationPass(Registry);
- initializeLoopSimplifyCFGLegacyPassPass(Registry);
+ initializePlaceBackedgeSafepointsImplPass(Registry);
+ initializePlaceSafepointsPass(Registry);
+ initializeFloat2IntLegacyPassPass(Registry);
+ initializeLoopDistributeLegacyPass(Registry);
+ initializeLoopLoadEliminationPass(Registry);
+ initializeLoopSimplifyCFGLegacyPassPass(Registry);
initializeLoopVersioningLegacyPassPass(Registry);
- initializeEntryExitInstrumenterPass(Registry);
- initializePostInlineEntryExitInstrumenterPass(Registry);
-}
-
-void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopSimplifyCFGPass());
-}
-
-void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
- initializeScalarOpts(*unwrap(R));
-}
-
-void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAggressiveDCEPass());
-}
-
-void LLVMAddDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDeadCodeEliminationPass());
-}
-
-void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createBitTrackingDCEPass());
-}
-
-void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAlignmentFromAssumptionsPass());
-}
-
-void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+ initializeEntryExitInstrumenterPass(Registry);
+ initializePostInlineEntryExitInstrumenterPass(Registry);
+}
+
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
+void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
+ initializeScalarOpts(*unwrap(R));
+}
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadCodeEliminationPass());
+}
+
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAlignmentFromAssumptionsPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
-}
-
-void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDeadStoreEliminationPass());
-}
-
-void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createScalarizerPass());
-}
-
-void LLVMAddGVNPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGVNPass());
-}
-
-void LLVMAddNewGVNPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createNewGVNPass());
-}
-
-void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createMergedLoadStoreMotionPass());
-}
-
-void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createIndVarSimplifyPass());
-}
-
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScalarizerPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddNewGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createNewGVNPass());
+}
+
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMergedLoadStoreMotionPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
void LLVMAddInstructionSimplifyPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createInstSimplifyLegacyPass());
}
-void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createJumpThreadingPass());
-}
-
-void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopSinkPass());
-}
-
-void LLVMAddLICMPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLICMPass());
-}
-
-void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopDeletionPass());
-}
-
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSinkPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopDeletionPass());
+}
+
void LLVMAddLoopFlattenPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopFlattenPass());
}
-void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopIdiomPass());
-}
-
-void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopRotatePass());
-}
-
-void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopRerollPass());
-}
-
-void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopUnrollPass());
-}
-
-void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopUnrollAndJamPass());
-}
-
-void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopUnswitchPass());
-}
-
-void LLVMAddLowerAtomicPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerAtomicPass());
-}
-
-void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createMemCpyOptPass());
-}
-
-void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPartiallyInlineLibCallsPass());
-}
-
-void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createReassociatePass());
-}
-
-void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSCCPPass());
-}
-
-void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSROAPass());
-}
-
-void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSROAPass());
-}
-
-void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
- int Threshold) {
- unwrap(PM)->add(createSROAPass());
-}
-
-void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
- // NOTE: The simplify-libcalls pass has been removed.
-}
-
-void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createTailCallEliminationPass());
-}
-
-void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDemoteRegisterToMemoryPass());
-}
-
-void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createVerifierPass());
-}
-
-void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createCorrelatedValuePropagationPass());
-}
-
-void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createEarlyCSEPass(false/*=UseMemorySSA*/));
-}
-
-void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createEarlyCSEPass(true/*=UseMemorySSA*/));
-}
-
-void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGVNHoistPass());
-}
-
-void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createTypeBasedAAWrapperPass());
-}
-
-void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createScopedNoAliasAAWrapperPass());
-}
-
-void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createBasicAAWrapperPass());
-}
-
-void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerConstantIntrinsicsPass());
-}
-
-void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerExpectIntrinsicPass());
-}
-
-void LLVMAddUnifyFunctionExitNodesPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createUnifyFunctionExitNodesPass());
-}
+void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopIdiomPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRerollPass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollAndJamPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddLowerAtomicPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerAtomicPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
+ int Threshold) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+ // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
+
+void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createVerifierPass());
+}
+
+void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCorrelatedValuePropagationPass());
+}
+
+void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(false/*=UseMemorySSA*/));
+}
+
+void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(true/*=UseMemorySSA*/));
+}
+
+void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNHoistPass());
+}
+
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTypeBasedAAWrapperPass());
+}
+
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScopedNoAliasAAWrapperPass());
+}
+
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBasicAAWrapperPass());
+}
+
+void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerConstantIntrinsicsPass());
+}
+
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerExpectIntrinsicPass());
+}
+
+void LLVMAddUnifyFunctionExitNodesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createUnifyFunctionExitNodesPass());
+}
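For reference, a minimal sketch of how the C bindings restored above are consumed from client code. This is illustrative only and not part of the commit; runScalarPasses is a hypothetical helper, and M and Fn stand for an existing LLVMModuleRef and a function within it.

  #include "llvm-c/Core.h"
  #include "llvm-c/Transforms/Scalar.h"

  static void runScalarPasses(LLVMModuleRef M, LLVMValueRef Fn) {
    // Function-level legacy pass manager tied to the module that owns Fn.
    LLVMPassManagerRef FPM = LLVMCreateFunctionPassManagerForModule(M);
    LLVMAddScalarReplAggregatesPass(FPM); // wraps createSROAPass()
    LLVMAddEarlyCSEPass(FPM);             // wraps createEarlyCSEPass(false)
    LLVMAddCFGSimplificationPass(FPM);
    LLVMInitializeFunctionPassManager(FPM);
    LLVMRunFunctionPassManager(FPM, Fn);
    LLVMFinalizeFunctionPassManager(FPM);
    LLVMDisposePassManager(FPM);
  }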
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp
index 45af72520f..c95984fe19 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp
@@ -1,974 +1,974 @@
-//===- Scalarizer.cpp - Scalarize vector operations -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass converts vector operations into scalar operations, in order
-// to expose optimization opportunities on the individual scalar operations.
-// It is mainly intended for targets that do not have vector units, but it
-// may also be useful for revectorizing code to different vector widths.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/Scalarizer.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "scalarizer"
-
-static cl::opt<bool> ScalarizeVariableInsertExtract(
- "scalarize-variable-insert-extract", cl::init(true), cl::Hidden,
- cl::desc("Allow the scalarizer pass to scalarize "
- "insertelement/extractelement with variable index"));
-
-// This is disabled by default because having separate loads and stores
-// makes it more likely that the -combiner-alias-analysis limits will be
-// reached.
-static cl::opt<bool>
- ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden,
-                       cl::desc("Allow the scalarizer pass to scalarize loads and stores"));
-
-namespace {
-
-// Used to store the scattered form of a vector.
-using ValueVector = SmallVector<Value *, 8>;
-
-// Used to map a vector Value to its scattered form. We use std::map
-// because we want iterators to persist across insertion and because the
-// values are relatively large.
-using ScatterMap = std::map<Value *, ValueVector>;
-
-// Lists Instructions that have been replaced with scalar implementations,
-// along with a pointer to their scattered forms.
-using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
-
-// Provides a very limited vector-like interface for lazily accessing one
-// component of a scattered vector or vector pointer.
-class Scatterer {
-public:
- Scatterer() = default;
-
- // Scatter V into Size components. If new instructions are needed,
- // insert them before BBI in BB. If Cache is nonnull, use it to cache
- // the results.
- Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
- ValueVector *cachePtr = nullptr);
-
- // Return component I, creating a new Value for it if necessary.
- Value *operator[](unsigned I);
-
- // Return the number of components.
- unsigned size() const { return Size; }
-
-private:
- BasicBlock *BB;
- BasicBlock::iterator BBI;
- Value *V;
- ValueVector *CachePtr;
- PointerType *PtrTy;
- ValueVector Tmp;
- unsigned Size;
-};
-
-// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
-// called Name that compares X and Y in the same way as FCI.
-struct FCmpSplitter {
- FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
- const Twine &Name) const {
- return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
- }
-
- FCmpInst &FCI;
-};
-
-// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
-// called Name that compares X and Y in the same way as ICI.
-struct ICmpSplitter {
- ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
- const Twine &Name) const {
- return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
- }
-
- ICmpInst &ICI;
-};
-
-// UnarySplitter(UO)(Builder, X, Name) uses Builder to create
-// a unary operator like UO called Name with operand X.
-struct UnarySplitter {
- UnarySplitter(UnaryOperator &uo) : UO(uo) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op, const Twine &Name) const {
- return Builder.CreateUnOp(UO.getOpcode(), Op, Name);
- }
-
- UnaryOperator &UO;
-};
-
-// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
-// a binary operator like BO called Name with operands X and Y.
-struct BinarySplitter {
- BinarySplitter(BinaryOperator &bo) : BO(bo) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
- const Twine &Name) const {
- return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
- }
-
- BinaryOperator &BO;
-};
-
-// Information about a load or store that we're scalarizing.
-struct VectorLayout {
- VectorLayout() = default;
-
- // Return the alignment of element I.
- Align getElemAlign(unsigned I) {
- return commonAlignment(VecAlign, I * ElemSize);
- }
-
- // The type of the vector.
- VectorType *VecTy = nullptr;
-
- // The type of each element.
- Type *ElemTy = nullptr;
-
- // The alignment of the vector.
- Align VecAlign;
-
- // The size of each element.
- uint64_t ElemSize = 0;
-};
-
-class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
-public:
- ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT)
- : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) {
- }
-
- bool visit(Function &F);
-
- // InstVisitor methods. They return true if the instruction was scalarized,
- // false if nothing changed.
- bool visitInstruction(Instruction &I) { return false; }
- bool visitSelectInst(SelectInst &SI);
- bool visitICmpInst(ICmpInst &ICI);
- bool visitFCmpInst(FCmpInst &FCI);
- bool visitUnaryOperator(UnaryOperator &UO);
- bool visitBinaryOperator(BinaryOperator &BO);
- bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
- bool visitCastInst(CastInst &CI);
- bool visitBitCastInst(BitCastInst &BCI);
- bool visitInsertElementInst(InsertElementInst &IEI);
- bool visitExtractElementInst(ExtractElementInst &EEI);
- bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
- bool visitPHINode(PHINode &PHI);
- bool visitLoadInst(LoadInst &LI);
- bool visitStoreInst(StoreInst &SI);
- bool visitCallInst(CallInst &ICI);
-
-private:
- Scatterer scatter(Instruction *Point, Value *V);
- void gather(Instruction *Op, const ValueVector &CV);
- bool canTransferMetadata(unsigned Kind);
- void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV);
- Optional<VectorLayout> getVectorLayout(Type *Ty, Align Alignment,
- const DataLayout &DL);
- bool finish();
-
- template<typename T> bool splitUnary(Instruction &, const T &);
- template<typename T> bool splitBinary(Instruction &, const T &);
-
- bool splitCall(CallInst &CI);
-
- ScatterMap Scattered;
- GatherList Gathered;
-
- SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs;
-
- unsigned ParallelLoopAccessMDKind;
-
- DominatorTree *DT;
-};
-
-class ScalarizerLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- ScalarizerLegacyPass() : FunctionPass(ID) {
- initializeScalarizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage& AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char ScalarizerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
- "Scalarize vector operations", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
- "Scalarize vector operations", false, false)
-
-Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
- ValueVector *cachePtr)
- : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
- Type *Ty = V->getType();
- PtrTy = dyn_cast<PointerType>(Ty);
- if (PtrTy)
- Ty = PtrTy->getElementType();
- Size = cast<FixedVectorType>(Ty)->getNumElements();
- if (!CachePtr)
- Tmp.resize(Size, nullptr);
- else if (CachePtr->empty())
- CachePtr->resize(Size, nullptr);
- else
- assert(Size == CachePtr->size() && "Inconsistent vector sizes");
-}
-
-// Return component I, creating a new Value for it if necessary.
-Value *Scatterer::operator[](unsigned I) {
- ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
- // Try to reuse a previous value.
- if (CV[I])
- return CV[I];
- IRBuilder<> Builder(BB, BBI);
- if (PtrTy) {
- Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType();
- if (!CV[0]) {
- Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace());
- CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0");
- }
- if (I != 0)
- CV[I] = Builder.CreateConstGEP1_32(ElTy, CV[0], I,
- V->getName() + ".i" + Twine(I));
- } else {
- // Search through a chain of InsertElementInsts looking for element I.
- // Record other elements in the cache. The new V is still suitable
- // for all uncached indices.
- while (true) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
- if (!Insert)
- break;
- ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
- if (!Idx)
- break;
- unsigned J = Idx->getZExtValue();
- V = Insert->getOperand(0);
- if (I == J) {
- CV[J] = Insert->getOperand(1);
- return CV[J];
- } else if (!CV[J]) {
- // Only cache the first entry we find for each index we're not actively
- // searching for. This prevents us from going too far up the chain and
- // caching incorrect entries.
- CV[J] = Insert->getOperand(1);
- }
- }
- CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
- V->getName() + ".i" + Twine(I));
- }
- return CV[I];
-}
-
-bool ScalarizerLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- Module &M = *F.getParent();
- unsigned ParallelLoopAccessMDKind =
- M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
- return Impl.visit(F);
-}
-
-FunctionPass *llvm::createScalarizerPass() {
- return new ScalarizerLegacyPass();
-}
-
-bool ScalarizerVisitor::visit(Function &F) {
- assert(Gathered.empty() && Scattered.empty());
-
- // To ensure we replace gathered components correctly we need to do an ordered
- // traversal of the basic blocks in the function.
- ReversePostOrderTraversal<BasicBlock *> RPOT(&F.getEntryBlock());
- for (BasicBlock *BB : RPOT) {
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
- Instruction *I = &*II;
- bool Done = InstVisitor::visit(I);
- ++II;
- if (Done && I->getType()->isVoidTy())
- I->eraseFromParent();
- }
- }
- return finish();
-}
-
-// Return a scattered form of V that can be accessed by Point. V must be a
-// vector or a pointer to a vector.
-Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
- if (Argument *VArg = dyn_cast<Argument>(V)) {
- // Put the scattered form of arguments in the entry block,
- // so that it can be used everywhere.
- Function *F = VArg->getParent();
- BasicBlock *BB = &F->getEntryBlock();
- return Scatterer(BB, BB->begin(), V, &Scattered[V]);
- }
- if (Instruction *VOp = dyn_cast<Instruction>(V)) {
- // When scalarizing PHI nodes we might try to examine/rewrite InsertElement
- // nodes in predecessors. If those predecessors are unreachable from entry,
- // then the IR in those blocks could have unexpected properties resulting in
- // infinite loops in Scatterer::operator[]. By simply treating values
- // originating from instructions in unreachable blocks as undef we do not
- // need to analyse them further.
- if (!DT->isReachableFromEntry(VOp->getParent()))
- return Scatterer(Point->getParent(), Point->getIterator(),
- UndefValue::get(V->getType()));
- // Put the scattered form of an instruction directly after the
- // instruction.
- BasicBlock *BB = VOp->getParent();
- return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
- V, &Scattered[V]);
- }
- // In the fallback case, just put the scattered before Point and
- // keep the result local to Point.
- return Scatterer(Point->getParent(), Point->getIterator(), V);
-}
-
-// Replace Op with the gathered form of the components in CV. Defer the
-// deletion of Op and creation of the gathered form to the end of the pass,
-// so that we can avoid creating the gathered form if all uses of Op are
-// replaced with uses of CV.
-void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
- transferMetadataAndIRFlags(Op, CV);
-
- // If we already have a scattered form of Op (created from ExtractElements
- // of Op itself), replace them with the new form.
- ValueVector &SV = Scattered[Op];
- if (!SV.empty()) {
- for (unsigned I = 0, E = SV.size(); I != E; ++I) {
- Value *V = SV[I];
- if (V == nullptr || SV[I] == CV[I])
- continue;
-
- Instruction *Old = cast<Instruction>(V);
+//===- Scalarizer.cpp - Scalarize vector operations -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts vector operations into scalar operations, in order
+// to expose optimization opportunities on the individual scalar operations.
+// It is mainly intended for targets that do not have vector units, but it
+// may also be useful for revectorizing code to different vector widths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Scalarizer.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarizer"
+
+static cl::opt<bool> ScalarizeVariableInsertExtract(
+ "scalarize-variable-insert-extract", cl::init(true), cl::Hidden,
+ cl::desc("Allow the scalarizer pass to scalarize "
+ "insertelement/extractelement with variable index"));
+
+// This is disabled by default because having separate loads and stores
+// makes it more likely that the -combiner-alias-analysis limits will be
+// reached.
+static cl::opt<bool>
+ ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden,
+                       cl::desc("Allow the scalarizer pass to scalarize loads and stores"));
+
+namespace {
+
+// Used to store the scattered form of a vector.
+using ValueVector = SmallVector<Value *, 8>;
+
+// Used to map a vector Value to its scattered form. We use std::map
+// because we want iterators to persist across insertion and because the
+// values are relatively large.
+using ScatterMap = std::map<Value *, ValueVector>;
+
+// Lists Instructions that have been replaced with scalar implementations,
+// along with a pointer to their scattered forms.
+using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+ Scatterer() = default;
+
+ // Scatter V into Size components. If new instructions are needed,
+ // insert them before BBI in BB. If Cache is nonnull, use it to cache
+ // the results.
+ Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr = nullptr);
+
+ // Return component I, creating a new Value for it if necessary.
+ Value *operator[](unsigned I);
+
+ // Return the number of components.
+ unsigned size() const { return Size; }
+
+private:
+ BasicBlock *BB;
+ BasicBlock::iterator BBI;
+ Value *V;
+ ValueVector *CachePtr;
+ PointerType *PtrTy;
+ ValueVector Tmp;
+ unsigned Size;
+};
+
+// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+ FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
+ }
+
+ FCmpInst &FCI;
+};
+
+// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+ ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
+ }
+
+ ICmpInst &ICI;
+};
+
+// UnarySplitter(UO)(Builder, X, Name) uses Builder to create
+// a unary operator like UO called Name with operand X.
+struct UnarySplitter {
+ UnarySplitter(UnaryOperator &uo) : UO(uo) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op, const Twine &Name) const {
+ return Builder.CreateUnOp(UO.getOpcode(), Op, Name);
+ }
+
+ UnaryOperator &UO;
+};
+
+// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+ BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
+ }
+
+ BinaryOperator &BO;
+};
+
+// Information about a load or store that we're scalarizing.
+struct VectorLayout {
+ VectorLayout() = default;
+
+ // Return the alignment of element I.
+ Align getElemAlign(unsigned I) {
+ return commonAlignment(VecAlign, I * ElemSize);
+ }
+
+ // The type of the vector.
+ VectorType *VecTy = nullptr;
+
+ // The type of each element.
+ Type *ElemTy = nullptr;
+
+ // The alignment of the vector.
+ Align VecAlign;
+
+ // The size of each element.
+ uint64_t ElemSize = 0;
+};
+
+class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
+public:
+ ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT)
+ : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) {
+ }
+
+ bool visit(Function &F);
+
+ // InstVisitor methods. They return true if the instruction was scalarized,
+ // false if nothing changed.
+ bool visitInstruction(Instruction &I) { return false; }
+ bool visitSelectInst(SelectInst &SI);
+ bool visitICmpInst(ICmpInst &ICI);
+ bool visitFCmpInst(FCmpInst &FCI);
+ bool visitUnaryOperator(UnaryOperator &UO);
+ bool visitBinaryOperator(BinaryOperator &BO);
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ bool visitCastInst(CastInst &CI);
+ bool visitBitCastInst(BitCastInst &BCI);
+ bool visitInsertElementInst(InsertElementInst &IEI);
+ bool visitExtractElementInst(ExtractElementInst &EEI);
+ bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ bool visitPHINode(PHINode &PHI);
+ bool visitLoadInst(LoadInst &LI);
+ bool visitStoreInst(StoreInst &SI);
+ bool visitCallInst(CallInst &ICI);
+
+private:
+ Scatterer scatter(Instruction *Point, Value *V);
+ void gather(Instruction *Op, const ValueVector &CV);
+ bool canTransferMetadata(unsigned Kind);
+ void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV);
+ Optional<VectorLayout> getVectorLayout(Type *Ty, Align Alignment,
+ const DataLayout &DL);
+ bool finish();
+
+ template<typename T> bool splitUnary(Instruction &, const T &);
+ template<typename T> bool splitBinary(Instruction &, const T &);
+
+ bool splitCall(CallInst &CI);
+
+ ScatterMap Scattered;
+ GatherList Gathered;
+
+ SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs;
+
+ unsigned ParallelLoopAccessMDKind;
+
+ DominatorTree *DT;
+};
+
+class ScalarizerLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ ScalarizerLegacyPass() : FunctionPass(ID) {
+ initializeScalarizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage& AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ScalarizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
+ "Scalarize vector operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
+ "Scalarize vector operations", false, false)
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr)
+ : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+ Type *Ty = V->getType();
+ PtrTy = dyn_cast<PointerType>(Ty);
+ if (PtrTy)
+ Ty = PtrTy->getElementType();
+ Size = cast<FixedVectorType>(Ty)->getNumElements();
+ if (!CachePtr)
+ Tmp.resize(Size, nullptr);
+ else if (CachePtr->empty())
+ CachePtr->resize(Size, nullptr);
+ else
+ assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+ ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+ // Try to reuse a previous value.
+ if (CV[I])
+ return CV[I];
+ IRBuilder<> Builder(BB, BBI);
+ if (PtrTy) {
+ Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType();
+ if (!CV[0]) {
+ Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace());
+ CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0");
+ }
+ if (I != 0)
+ CV[I] = Builder.CreateConstGEP1_32(ElTy, CV[0], I,
+ V->getName() + ".i" + Twine(I));
+ } else {
+ // Search through a chain of InsertElementInsts looking for element I.
+ // Record other elements in the cache. The new V is still suitable
+ // for all uncached indices.
+ while (true) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+ if (!Insert)
+ break;
+ ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+ if (!Idx)
+ break;
+ unsigned J = Idx->getZExtValue();
+ V = Insert->getOperand(0);
+ if (I == J) {
+ CV[J] = Insert->getOperand(1);
+ return CV[J];
+ } else if (!CV[J]) {
+ // Only cache the first entry we find for each index we're not actively
+ // searching for. This prevents us from going too far up the chain and
+ // caching incorrect entries.
+ CV[J] = Insert->getOperand(1);
+ }
+ }
+ CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+ V->getName() + ".i" + Twine(I));
+ }
+ return CV[I];
+}
+
+bool ScalarizerLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ Module &M = *F.getParent();
+ unsigned ParallelLoopAccessMDKind =
+ M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
+ return Impl.visit(F);
+}
+
+FunctionPass *llvm::createScalarizerPass() {
+ return new ScalarizerLegacyPass();
+}
+
+bool ScalarizerVisitor::visit(Function &F) {
+ assert(Gathered.empty() && Scattered.empty());
+
+ // To ensure we replace gathered components correctly we need to do an ordered
+ // traversal of the basic blocks in the function.
+ ReversePostOrderTraversal<BasicBlock *> RPOT(&F.getEntryBlock());
+ for (BasicBlock *BB : RPOT) {
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+ Instruction *I = &*II;
+ bool Done = InstVisitor::visit(I);
+ ++II;
+ if (Done && I->getType()->isVoidTy())
+ I->eraseFromParent();
+ }
+ }
+ return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point. V must be a
+// vector or a pointer to a vector.
+Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
+ if (Argument *VArg = dyn_cast<Argument>(V)) {
+ // Put the scattered form of arguments in the entry block,
+ // so that it can be used everywhere.
+ Function *F = VArg->getParent();
+ BasicBlock *BB = &F->getEntryBlock();
+ return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+ }
+ if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+ // When scalarizing PHI nodes we might try to examine/rewrite InsertElement
+ // nodes in predecessors. If those predecessors are unreachable from entry,
+ // then the IR in those blocks could have unexpected properties resulting in
+ // infinite loops in Scatterer::operator[]. By simply treating values
+ // originating from instructions in unreachable blocks as undef we do not
+ // need to analyse them further.
+ if (!DT->isReachableFromEntry(VOp->getParent()))
+ return Scatterer(Point->getParent(), Point->getIterator(),
+ UndefValue::get(V->getType()));
+ // Put the scattered form of an instruction directly after the
+ // instruction.
+ BasicBlock *BB = VOp->getParent();
+ return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
+ V, &Scattered[V]);
+ }
+ // In the fallback case, just put the scattered before Point and
+ // keep the result local to Point.
+ return Scatterer(Point->getParent(), Point->getIterator(), V);
+}
+
+// Replace Op with the gathered form of the components in CV. Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
+ transferMetadataAndIRFlags(Op, CV);
+
+ // If we already have a scattered form of Op (created from ExtractElements
+ // of Op itself), replace them with the new form.
+ ValueVector &SV = Scattered[Op];
+ if (!SV.empty()) {
+ for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+ Value *V = SV[I];
+ if (V == nullptr || SV[I] == CV[I])
+ continue;
+
+ Instruction *Old = cast<Instruction>(V);
if (isa<Instruction>(CV[I]))
CV[I]->takeName(Old);
- Old->replaceAllUsesWith(CV[I]);
- PotentiallyDeadInstrs.emplace_back(Old);
- }
- }
- SV = CV;
- Gathered.push_back(GatherList::value_type(Op, &SV));
-}
-
-// Return true if it is safe to transfer the given metadata tag from
-// vector to scalar instructions.
-bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) {
- return (Tag == LLVMContext::MD_tbaa
- || Tag == LLVMContext::MD_fpmath
- || Tag == LLVMContext::MD_tbaa_struct
- || Tag == LLVMContext::MD_invariant_load
- || Tag == LLVMContext::MD_alias_scope
- || Tag == LLVMContext::MD_noalias
- || Tag == ParallelLoopAccessMDKind
- || Tag == LLVMContext::MD_access_group);
-}
-
-// Transfer metadata from Op to the instructions in CV if it is known
-// to be safe to do so.
-void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op,
- const ValueVector &CV) {
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- Op->getAllMetadataOtherThanDebugLoc(MDs);
- for (unsigned I = 0, E = CV.size(); I != E; ++I) {
- if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
- for (const auto &MD : MDs)
- if (canTransferMetadata(MD.first))
- New->setMetadata(MD.first, MD.second);
- New->copyIRFlags(Op);
- if (Op->getDebugLoc() && !New->getDebugLoc())
- New->setDebugLoc(Op->getDebugLoc());
- }
- }
-}
-
-// Try to fill in Layout from Ty, returning true on success. Alignment is
-// the alignment of the vector, or None if the ABI default should be used.
-Optional<VectorLayout>
-ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment,
- const DataLayout &DL) {
- VectorLayout Layout;
- // Make sure we're dealing with a vector.
- Layout.VecTy = dyn_cast<VectorType>(Ty);
- if (!Layout.VecTy)
- return None;
- // Check that we're dealing with full-byte elements.
- Layout.ElemTy = Layout.VecTy->getElementType();
- if (!DL.typeSizeEqualsStoreSize(Layout.ElemTy))
- return None;
- Layout.VecAlign = Alignment;
- Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
- return Layout;
-}
-
-// Scalarize one-operand instruction I, using Split(Builder, X, Name)
-// to create an instruction like I with operand X and name Name.
-template<typename Splitter>
-bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) {
- VectorType *VT = dyn_cast<VectorType>(I.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&I);
- Scatterer Op = scatter(&I, I.getOperand(0));
- assert(Op.size() == NumElems && "Mismatched unary operation");
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned Elem = 0; Elem < NumElems; ++Elem)
- Res[Elem] = Split(Builder, Op[Elem], I.getName() + ".i" + Twine(Elem));
- gather(&I, Res);
- return true;
-}
-
-// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
-// to create an instruction like I with operands X and Y and name Name.
-template<typename Splitter>
-bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
- VectorType *VT = dyn_cast<VectorType>(I.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&I);
- Scatterer VOp0 = scatter(&I, I.getOperand(0));
- Scatterer VOp1 = scatter(&I, I.getOperand(1));
- assert(VOp0.size() == NumElems && "Mismatched binary operation");
- assert(VOp1.size() == NumElems && "Mismatched binary operation");
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
- Value *Op0 = VOp0[Elem];
- Value *Op1 = VOp1[Elem];
- Res[Elem] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Elem));
- }
- gather(&I, Res);
- return true;
-}
-
-static bool isTriviallyScalariable(Intrinsic::ID ID) {
- return isTriviallyVectorizable(ID);
-}
-
-// All of the current scalarizable intrinsics only have one mangled type.
-static Function *getScalarIntrinsicDeclaration(Module *M,
- Intrinsic::ID ID,
- VectorType *Ty) {
- return Intrinsic::getDeclaration(M, ID, { Ty->getScalarType() });
-}
-
-/// If a call to a vector typed intrinsic function, split into a scalar call per
-/// element if possible for the intrinsic.
-bool ScalarizerVisitor::splitCall(CallInst &CI) {
- VectorType *VT = dyn_cast<VectorType>(CI.getType());
- if (!VT)
- return false;
-
- Function *F = CI.getCalledFunction();
- if (!F)
- return false;
-
- Intrinsic::ID ID = F->getIntrinsicID();
- if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- unsigned NumArgs = CI.getNumArgOperands();
-
- ValueVector ScalarOperands(NumArgs);
- SmallVector<Scatterer, 8> Scattered(NumArgs);
-
- Scattered.resize(NumArgs);
-
- // Assumes that any vector type has the same number of elements as the return
- // vector type, which is true for all current intrinsics.
- for (unsigned I = 0; I != NumArgs; ++I) {
- Value *OpI = CI.getOperand(I);
- if (OpI->getType()->isVectorTy()) {
- Scattered[I] = scatter(&CI, OpI);
- assert(Scattered[I].size() == NumElems && "mismatched call operands");
- } else {
- ScalarOperands[I] = OpI;
- }
- }
-
- ValueVector Res(NumElems);
- ValueVector ScalarCallOps(NumArgs);
-
- Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, VT);
- IRBuilder<> Builder(&CI);
-
- // Perform actual scalarization, taking care to preserve any scalar operands.
- for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
- ScalarCallOps.clear();
-
- for (unsigned J = 0; J != NumArgs; ++J) {
- if (hasVectorInstrinsicScalarOpd(ID, J))
- ScalarCallOps.push_back(ScalarOperands[J]);
- else
- ScalarCallOps.push_back(Scattered[J][Elem]);
- }
-
- Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps,
- CI.getName() + ".i" + Twine(Elem));
- }
-
- gather(&CI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) {
- VectorType *VT = dyn_cast<VectorType>(SI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&SI);
- Scatterer VOp1 = scatter(&SI, SI.getOperand(1));
- Scatterer VOp2 = scatter(&SI, SI.getOperand(2));
- assert(VOp1.size() == NumElems && "Mismatched select");
- assert(VOp2.size() == NumElems && "Mismatched select");
- ValueVector Res;
- Res.resize(NumElems);
-
- if (SI.getOperand(0)->getType()->isVectorTy()) {
- Scatterer VOp0 = scatter(&SI, SI.getOperand(0));
- assert(VOp0.size() == NumElems && "Mismatched select");
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *Op0 = VOp0[I];
- Value *Op1 = VOp1[I];
- Value *Op2 = VOp2[I];
- Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
- SI.getName() + ".i" + Twine(I));
- }
- } else {
- Value *Op0 = SI.getOperand(0);
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *Op1 = VOp1[I];
- Value *Op2 = VOp2[I];
- Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
- SI.getName() + ".i" + Twine(I));
- }
- }
- gather(&SI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) {
- return splitBinary(ICI, ICmpSplitter(ICI));
-}
-
-bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) {
- return splitBinary(FCI, FCmpSplitter(FCI));
-}
-
-bool ScalarizerVisitor::visitUnaryOperator(UnaryOperator &UO) {
- return splitUnary(UO, UnarySplitter(UO));
-}
-
-bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) {
- return splitBinary(BO, BinarySplitter(BO));
-}
-
-bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
- if (!VT)
- return false;
-
- IRBuilder<> Builder(&GEPI);
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- unsigned NumIndices = GEPI.getNumIndices();
-
- // The base pointer might be scalar even if it's a vector GEP. In those cases,
- // splat the pointer into a vector value, and scatter that vector.
- Value *Op0 = GEPI.getOperand(0);
- if (!Op0->getType()->isVectorTy())
- Op0 = Builder.CreateVectorSplat(NumElems, Op0);
- Scatterer Base = scatter(&GEPI, Op0);
-
- SmallVector<Scatterer, 8> Ops;
- Ops.resize(NumIndices);
- for (unsigned I = 0; I < NumIndices; ++I) {
- Value *Op = GEPI.getOperand(I + 1);
-
- // The indices might be scalars even if it's a vector GEP. In those cases,
- // splat the scalar into a vector value, and scatter that vector.
- if (!Op->getType()->isVectorTy())
- Op = Builder.CreateVectorSplat(NumElems, Op);
-
- Ops[I] = scatter(&GEPI, Op);
- }
-
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned I = 0; I < NumElems; ++I) {
- SmallVector<Value *, 8> Indices;
- Indices.resize(NumIndices);
- for (unsigned J = 0; J < NumIndices; ++J)
- Indices[J] = Ops[J][I];
- Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
- GEPI.getName() + ".i" + Twine(I));
- if (GEPI.isInBounds())
- if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
- NewGEPI->setIsInBounds();
- }
- gather(&GEPI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitCastInst(CastInst &CI) {
- VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&CI);
- Scatterer Op0 = scatter(&CI, CI.getOperand(0));
- assert(Op0.size() == NumElems && "Mismatched cast");
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
- CI.getName() + ".i" + Twine(I));
- gather(&CI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
- VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
- VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
- if (!DstVT || !SrcVT)
- return false;
-
- unsigned DstNumElems = cast<FixedVectorType>(DstVT)->getNumElements();
- unsigned SrcNumElems = cast<FixedVectorType>(SrcVT)->getNumElements();
- IRBuilder<> Builder(&BCI);
- Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
- ValueVector Res;
- Res.resize(DstNumElems);
-
- if (DstNumElems == SrcNumElems) {
- for (unsigned I = 0; I < DstNumElems; ++I)
- Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
- BCI.getName() + ".i" + Twine(I));
- } else if (DstNumElems > SrcNumElems) {
- // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
- // individual elements to the destination.
- unsigned FanOut = DstNumElems / SrcNumElems;
- auto *MidTy = FixedVectorType::get(DstVT->getElementType(), FanOut);
- unsigned ResI = 0;
- for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
- Value *V = Op0[Op0I];
- Instruction *VI;
- // Look through any existing bitcasts before converting to <N x t2>.
- // In the best case, the resulting conversion might be a no-op.
- while ((VI = dyn_cast<Instruction>(V)) &&
- VI->getOpcode() == Instruction::BitCast)
- V = VI->getOperand(0);
- V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
- Scatterer Mid = scatter(&BCI, V);
- for (unsigned MidI = 0; MidI < FanOut; ++MidI)
- Res[ResI++] = Mid[MidI];
- }
- } else {
- // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
- unsigned FanIn = SrcNumElems / DstNumElems;
- auto *MidTy = FixedVectorType::get(SrcVT->getElementType(), FanIn);
- unsigned Op0I = 0;
- for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+ Old->replaceAllUsesWith(CV[I]);
+ PotentiallyDeadInstrs.emplace_back(Old);
+ }
+ }
+ SV = CV;
+ Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) {
+ return (Tag == LLVMContext::MD_tbaa
+ || Tag == LLVMContext::MD_fpmath
+ || Tag == LLVMContext::MD_tbaa_struct
+ || Tag == LLVMContext::MD_invariant_load
+ || Tag == LLVMContext::MD_alias_scope
+ || Tag == LLVMContext::MD_noalias
+ || Tag == ParallelLoopAccessMDKind
+ || Tag == LLVMContext::MD_access_group);
+}
+
+// Transfer metadata from Op to the instructions in CV if it is known
+// to be safe to do so.
+void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op,
+ const ValueVector &CV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ Op->getAllMetadataOtherThanDebugLoc(MDs);
+ for (unsigned I = 0, E = CV.size(); I != E; ++I) {
+ if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
+ for (const auto &MD : MDs)
+ if (canTransferMetadata(MD.first))
+ New->setMetadata(MD.first, MD.second);
+ New->copyIRFlags(Op);
+ if (Op->getDebugLoc() && !New->getDebugLoc())
+ New->setDebugLoc(Op->getDebugLoc());
+ }
+ }
+}
+
+// Try to compute the VectorLayout of Ty, returning None on failure. Alignment
+// is the alignment of the vector, or None if the ABI default should be used.
+Optional<VectorLayout>
+ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment,
+ const DataLayout &DL) {
+ VectorLayout Layout;
+ // Make sure we're dealing with a vector.
+ Layout.VecTy = dyn_cast<VectorType>(Ty);
+ if (!Layout.VecTy)
+ return None;
+ // Check that we're dealing with full-byte elements.
+ Layout.ElemTy = Layout.VecTy->getElementType();
+ if (!DL.typeSizeEqualsStoreSize(Layout.ElemTy))
+ return None;
+ Layout.VecAlign = Alignment;
+ Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
+ return Layout;
+}
+
+// Scalarize one-operand instruction I, using Split(Builder, X, Name)
+// to create an instruction like I with operand X and name Name.
+template<typename Splitter>
+bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) {
+ VectorType *VT = dyn_cast<VectorType>(I.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&I);
+ Scatterer Op = scatter(&I, I.getOperand(0));
+ assert(Op.size() == NumElems && "Mismatched unary operation");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem)
+ Res[Elem] = Split(Builder, Op[Elem], I.getName() + ".i" + Twine(Elem));
+ gather(&I, Res);
+ return true;
+}
+
+// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
+template<typename Splitter>
+bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
+ VectorType *VT = dyn_cast<VectorType>(I.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&I);
+ Scatterer VOp0 = scatter(&I, I.getOperand(0));
+ Scatterer VOp1 = scatter(&I, I.getOperand(1));
+ assert(VOp0.size() == NumElems && "Mismatched binary operation");
+ assert(VOp1.size() == NumElems && "Mismatched binary operation");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ Value *Op0 = VOp0[Elem];
+ Value *Op1 = VOp1[Elem];
+ Res[Elem] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Elem));
+ }
+ gather(&I, Res);
+ return true;
+}
+
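For orientation, here is a minimal sketch of the rewrite that splitUnary/splitBinary drive, assuming a plain fixed-width vector add; the function and the lane names (%a.i0, %sum.upto0, ...) are illustrative and merely follow the ".i"/".upto" naming conventions used in this file:

  ; Hypothetical input:
  define <4 x i32> @add4(<4 x i32> %a, <4 x i32> %b) {
    %sum = add <4 x i32> %a, %b
    ret <4 x i32> %sum
  }

  ; Roughly what splitBinary plus the later gather/finish steps produce:
  ;   %a.i0 = extractelement <4 x i32> %a, i32 0
  ;   %b.i0 = extractelement <4 x i32> %b, i32 0
  ;   %sum.i0 = add i32 %a.i0, %b.i0
  ;   ... same for lanes 1 to 3 ...
  ;   %sum.upto0 = insertelement <4 x i32> poison, i32 %sum.i0, i32 0
  ;   ... remaining insertelements rebuild the <4 x i32> return value ...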
+static bool isTriviallyScalariable(Intrinsic::ID ID) {
+ return isTriviallyVectorizable(ID);
+}
+
+// All of the current scalarizable intrinsics only have one mangled type.
+static Function *getScalarIntrinsicDeclaration(Module *M,
+ Intrinsic::ID ID,
+ VectorType *Ty) {
+ return Intrinsic::getDeclaration(M, ID, { Ty->getScalarType() });
+}
+
+/// If CI is a call to a vector-typed intrinsic function, split it into one
+/// scalar call per element when the intrinsic supports that.
+bool ScalarizerVisitor::splitCall(CallInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getType());
+ if (!VT)
+ return false;
+
+ Function *F = CI.getCalledFunction();
+ if (!F)
+ return false;
+
+ Intrinsic::ID ID = F->getIntrinsicID();
+ if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ unsigned NumArgs = CI.getNumArgOperands();
+
+ ValueVector ScalarOperands(NumArgs);
+ SmallVector<Scatterer, 8> Scattered(NumArgs);
+
+ Scattered.resize(NumArgs);
+
+ // Assumes that any vector type has the same number of elements as the return
+ // vector type, which is true for all current intrinsics.
+ for (unsigned I = 0; I != NumArgs; ++I) {
+ Value *OpI = CI.getOperand(I);
+ if (OpI->getType()->isVectorTy()) {
+ Scattered[I] = scatter(&CI, OpI);
+ assert(Scattered[I].size() == NumElems && "mismatched call operands");
+ } else {
+ ScalarOperands[I] = OpI;
+ }
+ }
+
+ ValueVector Res(NumElems);
+ ValueVector ScalarCallOps(NumArgs);
+
+ Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, VT);
+ IRBuilder<> Builder(&CI);
+
+ // Perform actual scalarization, taking care to preserve any scalar operands.
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ ScalarCallOps.clear();
+
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (hasVectorInstrinsicScalarOpd(ID, J))
+ ScalarCallOps.push_back(ScalarOperands[J]);
+ else
+ ScalarCallOps.push_back(Scattered[J][Elem]);
+ }
+
+ Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps,
+ CI.getName() + ".i" + Twine(Elem));
+ }
+
+ gather(&CI, Res);
+ return true;
+}
+
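As a hedged illustration of splitCall, consider a trivially vectorizable intrinsic such as llvm.fabs; the scalar declaration llvm.fabs.f64 is what getScalarIntrinsicDeclaration resolves to here, while the value names are only indicative:

  ; Hypothetical input:
  declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
  define <2 x double> @abs2(<2 x double> %x) {
    %r = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
    ret <2 x double> %r
  }

  ; Each lane is retargeted at the scalar declaration:
  ;   %x.i0 = extractelement <2 x double> %x, i32 0
  ;   %r.i0 = call double @llvm.fabs.f64(double %x.i0)
  ;   %x.i1 = extractelement <2 x double> %x, i32 1
  ;   %r.i1 = call double @llvm.fabs.f64(double %x.i1)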
+bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) {
+ VectorType *VT = dyn_cast<VectorType>(SI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer VOp1 = scatter(&SI, SI.getOperand(1));
+ Scatterer VOp2 = scatter(&SI, SI.getOperand(2));
+ assert(VOp1.size() == NumElems && "Mismatched select");
+ assert(VOp2.size() == NumElems && "Mismatched select");
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ if (SI.getOperand(0)->getType()->isVectorTy()) {
+ Scatterer VOp0 = scatter(&SI, SI.getOperand(0));
+ assert(VOp0.size() == NumElems && "Mismatched select");
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Op0 = VOp0[I];
+ Value *Op1 = VOp1[I];
+ Value *Op2 = VOp2[I];
+ Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
+ SI.getName() + ".i" + Twine(I));
+ }
+ } else {
+ Value *Op0 = SI.getOperand(0);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Op1 = VOp1[I];
+ Value *Op2 = VOp2[I];
+ Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
+ SI.getName() + ".i" + Twine(I));
+ }
+ }
+ gather(&SI, Res);
+ return true;
+}
+
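For visitSelectInst with a scalar condition, the single i1 is simply reused for every lane; a short sketch with illustrative names:

  ; Hypothetical input:
  define <2 x i32> @pick(i1 %c, <2 x i32> %a, <2 x i32> %b) {
    %r = select i1 %c, <2 x i32> %a, <2 x i32> %b
    ret <2 x i32> %r
  }

  ; Per-lane selects, all keyed off the same scalar condition:
  ;   %r.i0 = select i1 %c, i32 %a.i0, i32 %b.i0
  ;   %r.i1 = select i1 %c, i32 %a.i1, i32 %b.i1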
+bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) {
+ return splitBinary(ICI, ICmpSplitter(ICI));
+}
+
+bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) {
+ return splitBinary(FCI, FCmpSplitter(FCI));
+}
+
+bool ScalarizerVisitor::visitUnaryOperator(UnaryOperator &UO) {
+ return splitUnary(UO, UnarySplitter(UO));
+}
+
+bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) {
+ return splitBinary(BO, BinarySplitter(BO));
+}
+
+bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
+ if (!VT)
+ return false;
+
+ IRBuilder<> Builder(&GEPI);
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ unsigned NumIndices = GEPI.getNumIndices();
+
+ // The base pointer might be scalar even if it's a vector GEP. In those cases,
+ // splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
+
+ SmallVector<Scatterer, 8> Ops;
+ Ops.resize(NumIndices);
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In those cases,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
+
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ SmallVector<Value *, 8> Indices;
+ Indices.resize(NumIndices);
+ for (unsigned J = 0; J < NumIndices; ++J)
+ Indices[J] = Ops[J][I];
+ Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
+ GEPI.getName() + ".i" + Twine(I));
+ if (GEPI.isInBounds())
+ if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
+ NewGEPI->setIsInBounds();
+ }
+ gather(&GEPI, Res);
+ return true;
+}
+
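The splat-then-scatter handling of scalar bases and indices in visitGetElementPtrInst can be pictured with a small vector GEP; this is a hypothetical example, and the splat/extract plumbing for %p is omitted from the sketch:

  ; Hypothetical input: a vector GEP with a scalar base pointer.
  define <2 x i32*> @gep2(i32* %p, <2 x i64> %idx) {
    %g = getelementptr inbounds i32, i32* %p, <2 x i64> %idx
    ret <2 x i32*> %g
  }

  ; After splatting the base and scattering everything, each lane becomes an
  ; ordinary scalar GEP with inbounds preserved:
  ;   %g.i0 = getelementptr inbounds i32, i32* %p, i64 %idx.i0
  ;   %g.i1 = getelementptr inbounds i32, i32* %p, i64 %idx.i1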
+bool ScalarizerVisitor::visitCastInst(CastInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&CI);
+ Scatterer Op0 = scatter(&CI, CI.getOperand(0));
+ assert(Op0.size() == NumElems && "Mismatched cast");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
+ CI.getName() + ".i" + Twine(I));
+ gather(&CI, Res);
+ return true;
+}
+
+bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
+ VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+ VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+ if (!DstVT || !SrcVT)
+ return false;
+
+ unsigned DstNumElems = cast<FixedVectorType>(DstVT)->getNumElements();
+ unsigned SrcNumElems = cast<FixedVectorType>(SrcVT)->getNumElements();
+ IRBuilder<> Builder(&BCI);
+ Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+ ValueVector Res;
+ Res.resize(DstNumElems);
+
+ if (DstNumElems == SrcNumElems) {
+ for (unsigned I = 0; I < DstNumElems; ++I)
+ Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(I));
+ } else if (DstNumElems > SrcNumElems) {
+ // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
+ // individual elements to the destination.
+ unsigned FanOut = DstNumElems / SrcNumElems;
+ auto *MidTy = FixedVectorType::get(DstVT->getElementType(), FanOut);
+ unsigned ResI = 0;
+ for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+ Value *V = Op0[Op0I];
+ Instruction *VI;
+ // Look through any existing bitcasts before converting to <N x t2>.
+ // In the best case, the resulting conversion might be a no-op.
+ while ((VI = dyn_cast<Instruction>(V)) &&
+ VI->getOpcode() == Instruction::BitCast)
+ V = VI->getOperand(0);
+ V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+ Scatterer Mid = scatter(&BCI, V);
+ for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+ Res[ResI++] = Mid[MidI];
+ }
+ } else {
+ // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
+ unsigned FanIn = SrcNumElems / DstNumElems;
+ auto *MidTy = FixedVectorType::get(SrcVT->getElementType(), FanIn);
+ unsigned Op0I = 0;
+ for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
Value *V = PoisonValue::get(MidTy);
- for (unsigned MidI = 0; MidI < FanIn; ++MidI)
- V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
- BCI.getName() + ".i" + Twine(ResI)
- + ".upto" + Twine(MidI));
- Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
- BCI.getName() + ".i" + Twine(ResI));
- }
- }
- gather(&BCI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
- VectorType *VT = dyn_cast<VectorType>(IEI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&IEI);
- Scatterer Op0 = scatter(&IEI, IEI.getOperand(0));
- Value *NewElt = IEI.getOperand(1);
- Value *InsIdx = IEI.getOperand(2);
-
- ValueVector Res;
- Res.resize(NumElems);
-
- if (auto *CI = dyn_cast<ConstantInt>(InsIdx)) {
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = CI->getValue().getZExtValue() == I ? NewElt : Op0[I];
- } else {
- if (!ScalarizeVariableInsertExtract)
- return false;
-
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *ShouldReplace =
- Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I),
- InsIdx->getName() + ".is." + Twine(I));
- Value *OldElt = Op0[I];
- Res[I] = Builder.CreateSelect(ShouldReplace, NewElt, OldElt,
- IEI.getName() + ".i" + Twine(I));
- }
- }
-
- gather(&IEI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
- VectorType *VT = dyn_cast<VectorType>(EEI.getOperand(0)->getType());
- if (!VT)
- return false;
-
- unsigned NumSrcElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&EEI);
- Scatterer Op0 = scatter(&EEI, EEI.getOperand(0));
- Value *ExtIdx = EEI.getOperand(1);
-
- if (auto *CI = dyn_cast<ConstantInt>(ExtIdx)) {
- Value *Res = Op0[CI->getValue().getZExtValue()];
- gather(&EEI, {Res});
- return true;
- }
-
- if (!ScalarizeVariableInsertExtract)
- return false;
-
- Value *Res = UndefValue::get(VT->getElementType());
- for (unsigned I = 0; I < NumSrcElems; ++I) {
- Value *ShouldExtract =
- Builder.CreateICmpEQ(ExtIdx, ConstantInt::get(ExtIdx->getType(), I),
- ExtIdx->getName() + ".is." + Twine(I));
- Value *Elt = Op0[I];
- Res = Builder.CreateSelect(ShouldExtract, Elt, Res,
- EEI.getName() + ".upto" + Twine(I));
- }
- gather(&EEI, {Res});
- return true;
-}
-
-bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
- VectorType *VT = dyn_cast<VectorType>(SVI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
- Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
- ValueVector Res;
- Res.resize(NumElems);
-
- for (unsigned I = 0; I < NumElems; ++I) {
- int Selector = SVI.getMaskValue(I);
- if (Selector < 0)
- Res[I] = UndefValue::get(VT->getElementType());
- else if (unsigned(Selector) < Op0.size())
- Res[I] = Op0[Selector];
- else
- Res[I] = Op1[Selector - Op0.size()];
- }
- gather(&SVI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitPHINode(PHINode &PHI) {
- VectorType *VT = dyn_cast<VectorType>(PHI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&PHI);
- ValueVector Res;
- Res.resize(NumElems);
-
- unsigned NumOps = PHI.getNumOperands();
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
- PHI.getName() + ".i" + Twine(I));
-
- for (unsigned I = 0; I < NumOps; ++I) {
- Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
- BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
- for (unsigned J = 0; J < NumElems; ++J)
- cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
- }
- gather(&PHI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) {
- if (!ScalarizeLoadStore)
- return false;
- if (!LI.isSimple())
- return false;
-
- Optional<VectorLayout> Layout = getVectorLayout(
- LI.getType(), LI.getAlign(), LI.getModule()->getDataLayout());
- if (!Layout)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
- IRBuilder<> Builder(&LI);
- Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
- ValueVector Res;
- Res.resize(NumElems);
-
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreateAlignedLoad(Layout->VecTy->getElementType(), Ptr[I],
- Align(Layout->getElemAlign(I)),
- LI.getName() + ".i" + Twine(I));
- gather(&LI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) {
- if (!ScalarizeLoadStore)
- return false;
- if (!SI.isSimple())
- return false;
-
- Value *FullValue = SI.getValueOperand();
- Optional<VectorLayout> Layout = getVectorLayout(
- FullValue->getType(), SI.getAlign(), SI.getModule()->getDataLayout());
- if (!Layout)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
- IRBuilder<> Builder(&SI);
- Scatterer VPtr = scatter(&SI, SI.getPointerOperand());
- Scatterer VVal = scatter(&SI, FullValue);
-
- ValueVector Stores;
- Stores.resize(NumElems);
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *Val = VVal[I];
- Value *Ptr = VPtr[I];
- Stores[I] = Builder.CreateAlignedStore(Val, Ptr, Layout->getElemAlign(I));
- }
- transferMetadataAndIRFlags(&SI, Stores);
- return true;
-}
-
-bool ScalarizerVisitor::visitCallInst(CallInst &CI) {
- return splitCall(CI);
-}
-
-// Delete the instructions that we scalarized. If a full vector result
-// is still needed, recreate it using InsertElements.
-bool ScalarizerVisitor::finish() {
- // The presence of data in Gathered or Scattered indicates changes
- // made to the Function.
- if (Gathered.empty() && Scattered.empty())
- return false;
- for (const auto &GMI : Gathered) {
- Instruction *Op = GMI.first;
- ValueVector &CV = *GMI.second;
- if (!Op->use_empty()) {
- // The value is still needed, so recreate it using a series of
- // InsertElements.
+ for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+ V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+ BCI.getName() + ".i" + Twine(ResI)
+ + ".upto" + Twine(MidI));
+ Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(ResI));
+ }
+ }
+ gather(&BCI, Res);
+ return true;
+}
+
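The fan-in arm of visitBitCastInst (SrcNumElems > DstNumElems) groups source lanes into the intermediate MidTy before one final per-lane bitcast; a sketch under the assumption of a <4 x i16> to <2 x i32> cast, with illustrative names:

  ; Hypothetical input: two source lanes per destination lane.
  define <2 x i32> @narrow(<4 x i16> %v) {
    %c = bitcast <4 x i16> %v to <2 x i32>
    ret <2 x i32> %c
  }

  ; Each destination element is rebuilt from a <2 x i16> group, then bitcast:
  ;   %c.i0.upto0 = insertelement <2 x i16> poison, i16 %v.i0, i32 0
  ;   %c.i0.upto1 = insertelement <2 x i16> %c.i0.upto0, i16 %v.i1, i32 1
  ;   %c.i0 = bitcast <2 x i16> %c.i0.upto1 to i32
  ;   ... and likewise %c.i1 from %v.i2 and %v.i3 ...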
+bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
+ VectorType *VT = dyn_cast<VectorType>(IEI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&IEI);
+ Scatterer Op0 = scatter(&IEI, IEI.getOperand(0));
+ Value *NewElt = IEI.getOperand(1);
+ Value *InsIdx = IEI.getOperand(2);
+
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ if (auto *CI = dyn_cast<ConstantInt>(InsIdx)) {
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = CI->getValue().getZExtValue() == I ? NewElt : Op0[I];
+ } else {
+ if (!ScalarizeVariableInsertExtract)
+ return false;
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *ShouldReplace =
+ Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I),
+ InsIdx->getName() + ".is." + Twine(I));
+ Value *OldElt = Op0[I];
+ Res[I] = Builder.CreateSelect(ShouldReplace, NewElt, OldElt,
+ IEI.getName() + ".i" + Twine(I));
+ }
+ }
+
+ gather(&IEI, Res);
+ return true;
+}
+
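When the insertion index is not a constant, the branch above (guarded by ScalarizeVariableInsertExtract) turns the insert into one compare-and-select per lane; a sketch with hypothetical names:

  ; Hypothetical input with a variable insertion index:
  define <2 x float> @ins(<2 x float> %v, float %e, i32 %i) {
    %r = insertelement <2 x float> %v, float %e, i32 %i
    ret <2 x float> %r
  }

  ; Each lane keeps its old value unless the index matches:
  ;   %i.is.0 = icmp eq i32 %i, 0
  ;   %r.i0 = select i1 %i.is.0, float %e, float %v.i0
  ;   %i.is.1 = icmp eq i32 %i, 1
  ;   %r.i1 = select i1 %i.is.1, float %e, float %v.i1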
+bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
+ VectorType *VT = dyn_cast<VectorType>(EEI.getOperand(0)->getType());
+ if (!VT)
+ return false;
+
+ unsigned NumSrcElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&EEI);
+ Scatterer Op0 = scatter(&EEI, EEI.getOperand(0));
+ Value *ExtIdx = EEI.getOperand(1);
+
+ if (auto *CI = dyn_cast<ConstantInt>(ExtIdx)) {
+ Value *Res = Op0[CI->getValue().getZExtValue()];
+ gather(&EEI, {Res});
+ return true;
+ }
+
+ if (!ScalarizeVariableInsertExtract)
+ return false;
+
+ Value *Res = UndefValue::get(VT->getElementType());
+ for (unsigned I = 0; I < NumSrcElems; ++I) {
+ Value *ShouldExtract =
+ Builder.CreateICmpEQ(ExtIdx, ConstantInt::get(ExtIdx->getType(), I),
+ ExtIdx->getName() + ".is." + Twine(I));
+ Value *Elt = Op0[I];
+ Res = Builder.CreateSelect(ShouldExtract, Elt, Res,
+ EEI.getName() + ".upto" + Twine(I));
+ }
+ gather(&EEI, {Res});
+ return true;
+}
+
+bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ VectorType *VT = dyn_cast<VectorType>(SVI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
+ Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ int Selector = SVI.getMaskValue(I);
+ if (Selector < 0)
+ Res[I] = UndefValue::get(VT->getElementType());
+ else if (unsigned(Selector) < Op0.size())
+ Res[I] = Op0[Selector];
+ else
+ Res[I] = Op1[Selector - Op0.size()];
+ }
+ gather(&SVI, Res);
+ return true;
+}
+
+bool ScalarizerVisitor::visitPHINode(PHINode &PHI) {
+ VectorType *VT = dyn_cast<VectorType>(PHI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&PHI);
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ unsigned NumOps = PHI.getNumOperands();
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
+ PHI.getName() + ".i" + Twine(I));
+
+ for (unsigned I = 0; I < NumOps; ++I) {
+ Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
+ BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
+ for (unsigned J = 0; J < NumElems; ++J)
+ cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
+ }
+ gather(&PHI, Res);
+ return true;
+}
+
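visitPHINode creates the scalar PHIs first and wires up incoming values afterwards, so scattering an incoming value may insert extracts into predecessor or entry blocks; a hypothetical example, with only the resulting PHIs shown:

  ; Hypothetical input:
  define i32 @merge(i1 %c, <2 x i32> %a, <2 x i32> %b) {
  entry:
    br i1 %c, label %then, label %join
  then:
    br label %join
  join:
    %v = phi <2 x i32> [ %a, %entry ], [ %b, %then ]
    %e = extractelement <2 x i32> %v, i32 0
    ret i32 %e
  }

  ; The vector PHI becomes one scalar PHI per lane:
  ;   %v.i0 = phi i32 [ %a.i0, %entry ], [ %b.i0, %then ]
  ;   %v.i1 = phi i32 [ %a.i1, %entry ], [ %b.i1, %then ]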
+bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!LI.isSimple())
+ return false;
+
+ Optional<VectorLayout> Layout = getVectorLayout(
+ LI.getType(), LI.getAlign(), LI.getModule()->getDataLayout());
+ if (!Layout)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
+ IRBuilder<> Builder(&LI);
+ Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateAlignedLoad(Layout->VecTy->getElementType(), Ptr[I],
+ Align(Layout->getElemAlign(I)),
+ LI.getName() + ".i" + Twine(I));
+ gather(&LI, Res);
+ return true;
+}
+
+bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!SI.isSimple())
+ return false;
+
+ Value *FullValue = SI.getValueOperand();
+ Optional<VectorLayout> Layout = getVectorLayout(
+ FullValue->getType(), SI.getAlign(), SI.getModule()->getDataLayout());
+ if (!Layout)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer VPtr = scatter(&SI, SI.getPointerOperand());
+ Scatterer VVal = scatter(&SI, FullValue);
+
+ ValueVector Stores;
+ Stores.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Val = VVal[I];
+ Value *Ptr = VPtr[I];
+ Stores[I] = Builder.CreateAlignedStore(Val, Ptr, Layout->getElemAlign(I));
+ }
+ transferMetadataAndIRFlags(&SI, Stores);
+ return true;
+}
+
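Load/store scalarization is opt-in via the ScalarizeLoadStore option and also needs getVectorLayout to succeed (full-byte elements); under those assumptions a 16-byte-aligned <4 x float> access splits roughly as sketched, with per-element alignments from VectorLayout::getElemAlign and whitelisted metadata copied by transferMetadataAndIRFlags:

  ; Hypothetical input (assuming ScalarizeLoadStore is enabled):
  define void @copy(<4 x float>* %src, <4 x float>* %dst) {
    %v = load <4 x float>, <4 x float>* %src, align 16
    store <4 x float> %v, <4 x float>* %dst, align 16
    ret void
  }

  ; Both accesses become four float-sized accesses at byte offsets 0, 4, 8 and
  ; 12, with alignments 16, 4, 8 and 4; the scattered pointer operand supplies
  ; the per-element addresses.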
+bool ScalarizerVisitor::visitCallInst(CallInst &CI) {
+ return splitCall(CI);
+}
+
+// Delete the instructions that we scalarized. If a full vector result
+// is still needed, recreate it using InsertElements.
+bool ScalarizerVisitor::finish() {
+ // The presence of data in Gathered or Scattered indicates changes
+ // made to the Function.
+ if (Gathered.empty() && Scattered.empty())
+ return false;
+ for (const auto &GMI : Gathered) {
+ Instruction *Op = GMI.first;
+ ValueVector &CV = *GMI.second;
+ if (!Op->use_empty()) {
+ // The value is still needed, so recreate it using a series of
+ // InsertElements.
Value *Res = PoisonValue::get(Op->getType());
- if (auto *Ty = dyn_cast<VectorType>(Op->getType())) {
- BasicBlock *BB = Op->getParent();
- unsigned Count = cast<FixedVectorType>(Ty)->getNumElements();
- IRBuilder<> Builder(Op);
- if (isa<PHINode>(Op))
- Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
- for (unsigned I = 0; I < Count; ++I)
- Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
- Op->getName() + ".upto" + Twine(I));
+ if (auto *Ty = dyn_cast<VectorType>(Op->getType())) {
+ BasicBlock *BB = Op->getParent();
+ unsigned Count = cast<FixedVectorType>(Ty)->getNumElements();
+ IRBuilder<> Builder(Op);
+ if (isa<PHINode>(Op))
+ Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+ for (unsigned I = 0; I < Count; ++I)
+ Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+ Op->getName() + ".upto" + Twine(I));
Res->takeName(Op);
- } else {
- assert(CV.size() == 1 && Op->getType() == CV[0]->getType());
- Res = CV[0];
- if (Op == Res)
- continue;
- }
- Op->replaceAllUsesWith(Res);
- }
- PotentiallyDeadInstrs.emplace_back(Op);
- }
- Gathered.clear();
- Scattered.clear();
-
- RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs);
-
- return true;
-}
-
-PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) {
- Module &M = *F.getParent();
- unsigned ParallelLoopAccessMDKind =
- M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
- bool Changed = Impl.visit(F);
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- return Changed ? PA : PreservedAnalyses::all();
-}
+ } else {
+ assert(CV.size() == 1 && Op->getType() == CV[0]->getType());
+ Res = CV[0];
+ if (Op == Res)
+ continue;
+ }
+ Op->replaceAllUsesWith(Res);
+ }
+ PotentiallyDeadInstrs.emplace_back(Op);
+ }
+ Gathered.clear();
+ Scattered.clear();
+
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs);
+
+ return true;
+}
+
+PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ Module &M = *F.getParent();
+ unsigned ParallelLoopAccessMDKind =
+ M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
+ bool Changed = Impl.visit(F);
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return Changed ? PA : PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 64bdd151fb..f216956406 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1,371 +1,371 @@
-//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Loop unrolling may create many similar GEPs for array accesses.
-// e.g., a 2-level loop
-//
-// float a[32][32]; // global variable
-//
-// for (int i = 0; i < 2; ++i) {
-// for (int j = 0; j < 2; ++j) {
-// ...
-// ... = a[x + i][y + j];
-// ...
-// }
-// }
-//
-// will probably be unrolled to:
-//
-// gep %a, 0, %x, %y; load
-// gep %a, 0, %x, %y + 1; load
-// gep %a, 0, %x + 1, %y; load
-// gep %a, 0, %x + 1, %y + 1; load
-//
-// LLVM's GVN does not use partial redundancy elimination yet, and is thus
-// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
-// significant slowdown in targets with limited addressing modes. For instance,
-// because the PTX target does not support the reg+reg addressing mode, the
-// NVPTX backend emits PTX code that literally computes the pointer address of
-// each GEP, wasting tons of registers. It emits the following PTX for the
-// first load and similar PTX for other loads.
-//
-// mov.u32 %r1, %x;
-// mov.u32 %r2, %y;
-// mul.wide.u32 %rl2, %r1, 128;
-// mov.u64 %rl3, a;
-// add.s64 %rl4, %rl3, %rl2;
-// mul.wide.u32 %rl5, %r2, 4;
-// add.s64 %rl6, %rl4, %rl5;
-// ld.global.f32 %f1, [%rl6];
-//
-// To reduce the register pressure, the optimization implemented in this file
-// merges the common part of a group of GEPs, so we can compute each pointer
-// address by adding a simple offset to the common part, saving many registers.
-//
-// It works by splitting each GEP into a variadic base and a constant offset.
-// The variadic base can be computed once and reused by multiple GEPs, and the
-// constant offsets can be nicely folded into the reg+immediate addressing mode
-// (supported by most targets) without using any extra register.
-//
-// For instance, we transform the four GEPs and four loads in the above example
-// into:
-//
-// base = gep a, 0, x, y
-// load base
-// load base + 1 * sizeof(float)
-// load base + 32 * sizeof(float)
-// load base + 33 * sizeof(float)
-//
-// Given the transformed IR, a backend that supports the reg+immediate
-// addressing mode can easily fold the pointer arithmetics into the loads. For
-// example, the NVPTX backend can easily fold the pointer arithmetics into the
-// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
-//
-// mov.u32 %r1, %tid.x;
-// mov.u32 %r2, %tid.y;
-// mul.wide.u32 %rl2, %r1, 128;
-// mov.u64 %rl3, a;
-// add.s64 %rl4, %rl3, %rl2;
-// mul.wide.u32 %rl5, %r2, 4;
-// add.s64 %rl6, %rl4, %rl5;
-// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
-// ld.global.f32 %f2, [%rl6+4]; // much better
-// ld.global.f32 %f3, [%rl6+128]; // much better
-// ld.global.f32 %f4, [%rl6+132]; // much better
-//
-// Another improvement enabled by the LowerGEP flag is to lower a GEP with
-// multiple indices to either multiple GEPs with a single index or arithmetic
-// operations (depending on whether the target uses alias analysis in codegen).
-// Such a transformation can have the following benefits:
-// (1) It can always extract constants in the indices of structure type.
-// (2) After such Lowering, there are more optimization opportunities such as
-// CSE, LICM and CGP.
-//
-// E.g. The following GEPs have multiple indices:
-// BB1:
-// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
-// load %p
-// ...
-// BB2:
-// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
-// load %p2
-// ...
-//
-// We cannot CSE the common part related to index "i64 %i" in this form;
-// lowering the GEPs makes that possible.
-// If the target does not use alias analysis in codegen, this pass will
-// lower a GEP with multiple indices into arithmetic operations:
-// BB1:
-// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
-// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %3 = add i64 %1, %2 ; CSE opportunity
-// %4 = mul i64 %j1, length_of_struct
-// %5 = add i64 %3, %4
-// %6 = add i64 %3, struct_field_3 ; Constant offset
-// %p = inttoptr i64 %6 to i32*
-// load %p
-// ...
-// BB2:
-// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
-// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %9 = add i64 %7, %8 ; CSE opportunity
-// %10 = mul i64 %j2, length_of_struct
-// %11 = add i64 %9, %10
-// %12 = add i64 %11, struct_field_2 ; Constant offset
-// %p = inttoptr i64 %12 to i32*
-// load %p2
-// ...
-//
-// If the target uses alias analysis in codegen, this pass will lower a GEP
-// with multiple indices into multiple GEPs with a single index:
-// BB1:
-// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
-// %4 = mul i64 %j1, length_of_struct
-// %5 = getelementptr i8* %3, i64 %4
-// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
-// %p = bitcast i8* %6 to i32*
-// load %p
-// ...
-// BB2:
-// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
-// %10 = mul i64 %j2, length_of_struct
-// %11 = getelementptr i8* %9, i64 %10
-// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
-// %p2 = bitcast i8* %12 to i32*
-// load %p2
-// ...
-//
-// Lowering GEPs can also benefit other passes such as LICM and CGP.
-// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
-// indices if one of the indices is variant. If we lower such GEPs into invariant
-// parts and variant parts, LICM can hoist/sink those invariant parts.
-// CGP (CodeGen Prepare) tries to sink address calculations that match the
-// target's addressing modes. A GEP with multiple indices may not match and will
-// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
-// them. So we end up with a better addressing mode.
-//
-//===----------------------------------------------------------------------===//
-
+//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Loop unrolling may create many similar GEPs for array accesses.
+// e.g., a 2-level loop
+//
+// float a[32][32]; // global variable
+//
+// for (int i = 0; i < 2; ++i) {
+// for (int j = 0; j < 2; ++j) {
+// ...
+// ... = a[x + i][y + j];
+// ...
+// }
+// }
+//
+// will probably be unrolled to:
+//
+// gep %a, 0, %x, %y; load
+// gep %a, 0, %x, %y + 1; load
+// gep %a, 0, %x + 1, %y; load
+// gep %a, 0, %x + 1, %y + 1; load
+//
+// LLVM's GVN does not use partial redundancy elimination yet, and is thus
+// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
+// significant slowdown in targets with limited addressing modes. For instance,
+// because the PTX target does not support the reg+reg addressing mode, the
+// NVPTX backend emits PTX code that literally computes the pointer address of
+// each GEP, wasting tons of registers. It emits the following PTX for the
+// first load and similar PTX for other loads.
+//
+// mov.u32 %r1, %x;
+// mov.u32 %r2, %y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6];
+//
+// To reduce the register pressure, the optimization implemented in this file
+// merges the common part of a group of GEPs, so we can compute each pointer
+// address by adding a simple offset to the common part, saving many registers.
+//
+// It works by splitting each GEP into a variadic base and a constant offset.
+// The variadic base can be computed once and reused by multiple GEPs, and the
+// constant offsets can be nicely folded into the reg+immediate addressing mode
+// (supported by most targets) without using any extra register.
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+// load base + 1 * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetics into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetics into the
+// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
+//
+// mov.u32 %r1, %tid.x;
+// mov.u32 %r2, %tid.y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32 %f2, [%rl6+4]; // much better
+// ld.global.f32 %f3, [%rl6+128]; // much better
+// ld.global.f32 %f4, [%rl6+132]; // much better
+//
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such a transformation can have the following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such Lowering, there are more optimization opportunities such as
+// CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+// BB1:
+// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+// load %p
+// ...
+// BB2:
+// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
+// load %p2
+// ...
+//
+// We cannot CSE the common part related to index "i64 %i" in this form;
+// lowering the GEPs makes that possible.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+// BB1:
+// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = add i64 %1, %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = add i64 %3, %4
+// %6 = add i64 %3, struct_field_3 ; Constant offset
+// %p = inttoptr i64 %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = add i64 %7, %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = add i64 %9, %10
+// %12 = add i64 %11, struct_field_2 ; Constant offset
+// %p = inttoptr i64 %12 to i32*
+// load %p2
+// ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+// BB1:
+// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = getelementptr i8* %3, i64 %4
+// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
+// %p = bitcast i8* %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = getelementptr i8* %9, i64 %10
+// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
+// %p2 = bitcast i8* %12 to i32*
+// load %p2
+// ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
+// indices if one of the indices is variant. If we lower such GEPs into invariant
+// parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <string>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
- "disable-separate-const-offset-from-gep", cl::init(false),
- cl::desc("Do not separate the constant offset from a GEP instruction"),
- cl::Hidden);
-
-// Setting this flag may emit false positives when the input module already
-// contains dead instructions. Therefore, we set it only in unit tests that are
-// free of dead code.
-static cl::opt<bool>
- VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
- cl::desc("Verify this pass produces no dead code"),
- cl::Hidden);
-
-namespace {
-
-/// A helper class for separating a constant offset from a GEP index.
-///
-/// In real programs, a GEP index may be more complicated than a simple addition
-/// of something and a constant integer which can be trivially split. For
-/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
-/// constant offset, so that we can separate the index to (a << 3) + b and 5.
-///
-/// Therefore, this class looks into the expression that computes a given GEP
-/// index, and tries to find a constant integer that can be hoisted to the
-/// outermost level of the expression as an addition. Not every constant in an
-/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
-/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
-/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
-class ConstantOffsetExtractor {
-public:
- /// Extracts a constant offset from the given GEP index. It returns the
- /// new index representing the remainder (equal to the original index minus
- /// the constant offset), or nullptr if we cannot extract a constant offset.
- /// \p Idx The given GEP index
- /// \p GEP The given GEP
- /// \p UserChainTail Outputs the tail of UserChain so that we can
- /// garbage-collect unused instructions in UserChain.
- static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail, const DominatorTree *DT);
-
- /// Looks for a constant offset from the given GEP index without extracting
- /// it. It returns the numeric value of the extracted constant offset (0 if
- /// failed). The meaning of the arguments is the same as for Extract.
- static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
- const DominatorTree *DT);
-
-private:
- ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
- : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
- }
-
- /// Searches the expression that computes V for a non-zero constant C s.t.
- /// V can be reassociated into the form V' + C. If the search is
- /// successful, returns C and updates UserChain as a def-use chain from C to V;
- /// otherwise, UserChain is empty.
- ///
- /// \p V The given expression
- /// \p SignExtended Whether V will be sign-extended in the computation of the
- /// GEP index
- /// \p ZeroExtended Whether V will be zero-extended in the computation of the
- /// GEP index
- /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
- /// an index of an inbounds GEP is guaranteed to be
- /// non-negative. Leveraging this, we can better split
- /// inbounds GEPs.
- APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
-
- /// A helper function to look into both operands of a binary operator.
- APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
- bool ZeroExtended);
-
- /// After finding the constant offset C from the GEP index I, we build a new
- /// index I' s.t. I' + C = I. This function builds and returns the new
- /// index I' according to UserChain produced by function "find".
- ///
- /// The building conceptually takes two steps:
- /// 1) iteratively distribute s/zext towards the leaves of the expression tree
- /// that computes I
- /// 2) reassociate the expression tree to the form I' + C.
- ///
- /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
- /// sext to a, b and 5 so that we have
- /// sext(a) + (sext(b) + 5).
- /// Then, we reassociate it to
- /// (sext(a) + sext(b)) + 5.
- /// Given this form, we know I' is sext(a) + sext(b).
- Value *rebuildWithoutConstOffset();
-
- /// After the first step of rebuilding the GEP index without the constant
- /// offset, distribute s/zext to the operands of all operators in UserChain.
- /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
- /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
- ///
- /// The function also updates UserChain to point to new subexpressions after
- /// distributing s/zext. e.g., the old UserChain of the above example is
- /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
- /// and the new UserChain is
- /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
- /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
- ///
- /// \p ChainIndex The index to UserChain. ChainIndex is initially
- /// UserChain.size() - 1, and is decremented during
- /// the recursion.
- Value *distributeExtsAndCloneChain(unsigned ChainIndex);
-
- /// Reassociates the GEP index to the form I' + C and returns I'.
- Value *removeConstOffset(unsigned ChainIndex);
-
- /// A helper function to apply ExtInsts, a list of s/zext, to value V.
- /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
- /// returns "sext i32 (zext i16 V to i32) to i64".
- Value *applyExts(Value *V);
-
- /// A helper function that returns whether we can trace into the operands
- /// of binary operator BO for a constant offset.
- ///
- /// \p SignExtended Whether BO is surrounded by sext
- /// \p ZeroExtended Whether BO is surrounded by zext
- /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
- /// array index.
- bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
- bool NonNegative);
-
- /// The path from the constant offset to the old GEP index. e.g., if the GEP
- /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
- /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
- /// UserChain[2] will be the entire expression "a * b + (c + 5)".
- ///
- /// This path helps to rebuild the new GEP index.
- SmallVector<User *, 8> UserChain;
-
- /// A data structure used in rebuildWithoutConstOffset. Contains all
- /// sext/zext instructions along UserChain.
- SmallVector<CastInst *, 16> ExtInsts;
-
- /// Insertion position of cloned instructions.
- Instruction *IP;
-
- const DataLayout &DL;
- const DominatorTree *DT;
-};
-
-/// A pass that tries to split every GEP in the function into a variadic
-/// base and a constant offset. It is a FunctionPass because searching for the
-/// constant offset may inspect other basic blocks.
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
+ "disable-separate-const-offset-from-gep", cl::init(false),
+ cl::desc("Do not separate the constant offset from a GEP instruction"),
+ cl::Hidden);
+
+// Setting this flag may emit false positives when the input module already
+// contains dead instructions. Therefore, we set it only in unit tests that are
+// free of dead code.
+static cl::opt<bool>
+ VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
+ cl::desc("Verify this pass produces no dead code"),
+ cl::Hidden);
+
+namespace {
+
+/// A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer which can be trivially split. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index to (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
+/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
+/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
+class ConstantOffsetExtractor {
+public:
+ /// Extracts a constant offset from the given GEP index. It returns the
+ /// new index representing the remainder (equal to the original index minus
+ /// the constant offset), or nullptr if we cannot extract a constant offset.
+ /// \p Idx The given GEP index
+ /// \p GEP The given GEP
+ /// \p UserChainTail Outputs the tail of UserChain so that we can
+ /// garbage-collect unused instructions in UserChain.
+ static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail, const DominatorTree *DT);
+
+ /// Looks for a constant offset from the given GEP index without extracting
+ /// it. It returns the numeric value of the extracted constant offset (0 if
+ /// failed). The meaning of the arguments is the same as for Extract.
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT);
+
+private:
+ ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
+ : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
+ }
+
+ /// Searches the expression that computes V for a non-zero constant C s.t.
+ /// V can be reassociated into the form V' + C. If the search is
+ /// successful, returns C and updates UserChain as a def-use chain from C to V;
+ /// otherwise, UserChain is empty.
+ ///
+ /// \p V The given expression
+ /// \p SignExtended Whether V will be sign-extended in the computation of the
+ /// GEP index
+ /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+ /// GEP index
+ /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
+ /// an index of an inbounds GEP is guaranteed to be
+ /// non-negative. Leveraging this, we can better split
+ /// inbounds GEPs.
+ APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+
+ /// A helper function to look into both operands of a binary operator.
+ APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+ bool ZeroExtended);
+
+ /// After finding the constant offset C from the GEP index I, we build a new
+ /// index I' s.t. I' + C = I. This function builds and returns the new
+ /// index I' according to UserChain produced by function "find".
+ ///
+ /// The building conceptually takes two steps:
+ /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+ /// that computes I
+ /// 2) reassociate the expression tree to the form I' + C.
+ ///
+ /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+ /// sext to a, b and 5 so that we have
+ /// sext(a) + (sext(b) + 5).
+ /// Then, we reassociate it to
+ /// (sext(a) + sext(b)) + 5.
+ /// Given this form, we know I' is sext(a) + sext(b).
+ Value *rebuildWithoutConstOffset();
+
+ /// After the first step of rebuilding the GEP index without the constant
+ /// offset, distribute s/zext to the operands of all operators in UserChain.
+ /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+ ///
+ /// The function also updates UserChain to point to new subexpressions after
+ /// distributing s/zext. e.g., the old UserChain of the above example is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// and the new UserChain is
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+ ///
+ /// \p ChainIndex The index to UserChain. ChainIndex is initially
+ /// UserChain.size() - 1, and is decremented during
+ /// the recursion.
+ Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+
+ /// Reassociates the GEP index to the form I' + C and returns I'.
+ Value *removeConstOffset(unsigned ChainIndex);
+
+ /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+ /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// returns "sext i32 (zext i16 V to i32) to i64".
+ Value *applyExts(Value *V);
+
+ /// A helper function that returns whether we can trace into the operands
+ /// of binary operator BO for a constant offset.
+ ///
+ /// \p SignExtended Whether BO is surrounded by sext
+ /// \p ZeroExtended Whether BO is surrounded by zext
+ /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
+ /// array index.
+ bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+ bool NonNegative);
+
+ /// The path from the constant offset to the old GEP index. e.g., if the GEP
+ /// index is "a * b + (c + 5)", then after running find, UserChain[0] will
+ /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
+ /// UserChain[2] will be the entire expression "a * b + (c + 5)".
+ ///
+ /// This path helps to rebuild the new GEP index.
+ SmallVector<User *, 8> UserChain;
+
+ /// A data structure used in rebuildWithoutConstOffset. Contains all
+ /// sext/zext instructions along UserChain.
+ SmallVector<CastInst *, 16> ExtInsts;
+
+ /// Insertion position of cloned instructions.
+ Instruction *IP;
+
+ const DataLayout &DL;
+ const DominatorTree *DT;
+};
+
+/// A pass that tries to split every GEP in the function into a variadic
+/// base and a constant offset. It is a FunctionPass because searching for the
+/// constant offset may inspect other basic blocks.
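+///
+/// Illustrative sketch (added commentary, not from the original author):
+///   %b    = add i64 %a, 5
+///   %addr = getelementptr inbounds float, float* %p, i64 %b
+/// is split (see splitGEP below) into
+///   %addr2 = getelementptr float, float* %p, i64 %a
+///   %addr  = getelementptr inbounds float, float* %addr2, i64 5
+/// so that GEPs that differ only in the constant can share %addr2 via CSE.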
class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
-public:
- static char ID;
-
+public:
+ static char ID;
+
SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
- : FunctionPass(ID), LowerGEP(LowerGEP) {
+ : FunctionPass(ID), LowerGEP(LowerGEP) {
initializeSeparateConstOffsetFromGEPLegacyPassPass(
*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-
-private:
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
bool LowerGEP;
};
@@ -383,769 +383,769 @@ public:
bool run(Function &F);
private:
- /// Tries to split the given GEP into a variadic base and a constant offset,
- /// and returns true if the splitting succeeds.
- bool splitGEP(GetElementPtrInst *GEP);
-
- /// Lower a GEP with multiple indices into multiple GEPs with a single index.
- /// Function splitGEP already split the original GEP into a variadic part and
- /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
- /// variadic part into a set of GEPs with a single index and applies
- /// AccumulativeByteOffset to it.
- /// \p Variadic The variadic part of the original GEP.
- /// \p AccumulativeByteOffset The constant offset.
- void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset);
-
- /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
- /// Function splitGEP already split the original GEP into a variadic part and
- /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
- /// variadic part into a set of arithmetic operations and applies
- /// AccumulativeByteOffset to it.
- /// \p Variadic The variadic part of the original GEP.
- /// \p AccumulativeByteOffset The constant offset.
- void lowerToArithmetics(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset);
-
- /// Finds the constant offset within each index and accumulates them. If
- /// LowerGEP is true, it finds in indices of both sequential and structure
- /// types, otherwise it only finds in sequential indices. The output
- /// NeedsExtraction indicates whether we successfully find a non-zero constant
- /// offset.
- int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
-
- /// Canonicalize array indices to pointer-size integers. This helps to
- /// simplify the logic of splitting a GEP. For example, if a + b is a
- /// pointer-size integer, we have
- /// gep base, a + b = gep (gep base, a), b
- /// However, this equality may not hold if the size of a + b is smaller than
- /// the pointer size, because LLVM conceptually sign-extends GEP indices to
- /// pointer size before computing the address
- /// (http://llvm.org/docs/LangRef.html#id181).
- ///
- /// This canonicalization is very likely already done in clang and
- /// instcombine. Therefore, the program will probably remain the same.
- ///
- /// Returns true if the module changes.
- ///
- /// Verified in @i32_add in split-gep.ll
- bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
-
- /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
- /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
- /// the constant offset. After extraction, it becomes desirable to reunion the
- /// distributed sexts. For example,
- ///
- /// &a[sext(i +nsw (j +nsw 5)]
- /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
- /// => constant extraction &a[sext(i) + sext(j)] + 5
- /// => reunion &a[sext(i +nsw j)] + 5
- bool reuniteExts(Function &F);
-
- /// A helper that reunites sexts in an instruction.
- bool reuniteExts(Instruction *I);
-
- /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
- Instruction *findClosestMatchingDominator(
- const SCEV *Key, Instruction *Dominatee,
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
-
- /// Verify F is free of dead code.
- void verifyNoDeadCode(Function &F);
-
- bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
-
- // Swap the index operand of two GEP.
- void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
-
- // Check if it is safe to swap operand of two GEP.
- bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
- Loop *CurLoop);
-
- const DataLayout *DL = nullptr;
- DominatorTree *DT = nullptr;
- ScalarEvolution *SE;
- LoopInfo *LI;
- TargetLibraryInfo *TLI;
+ /// Tries to split the given GEP into a variadic base and a constant offset,
+ /// and returns true if the splitting succeeds.
+ bool splitGEP(GetElementPtrInst *GEP);
+
+ /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of GEPs with a single index and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+
+ /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of arithmetic operations and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+
+ /// Finds the constant offset within each index and accumulates them. If
+ /// LowerGEP is true, it searches indices of both sequential and structure
+ /// types; otherwise it only searches sequential indices. The output
+ /// NeedsExtraction indicates whether we successfully found a non-zero constant
+ /// offset.
+ int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+
+ /// Canonicalize array indices to pointer-size integers. This helps to
+ /// simplify the logic of splitting a GEP. For example, if a + b is a
+ /// pointer-size integer, we have
+ /// gep base, a + b = gep (gep base, a), b
+ /// However, this equality may not hold if the size of a + b is smaller than
+ /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+ /// pointer size before computing the address
+ /// (http://llvm.org/docs/LangRef.html#id181).
+ ///
+ /// This canonicalization is very likely already done in clang and
+ /// instcombine. Therefore, the program will probably remain the same.
+ ///
+ /// Returns true if the module changes.
+ ///
+ /// Verified in @i32_add in split-gep.ll
+ bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
+ /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
+ /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
+ /// the constant offset. After extraction, it becomes desirable to reunite the
+ /// distributed sexts. For example,
+ ///
+ /// &a[sext(i +nsw (j +nsw 5))]
+ /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
+ /// => constant extraction &a[sext(i) + sext(j)] + 5
+ /// => reunion &a[sext(i +nsw j)] + 5
+ bool reuniteExts(Function &F);
+
+ /// A helper that reunites sexts in an instruction.
+ bool reuniteExts(Instruction *I);
+
+ /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+ Instruction *findClosestMatchingDominator(
+ const SCEV *Key, Instruction *Dominatee,
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
+
+ /// Verify F is free of dead code.
+ void verifyNoDeadCode(Function &F);
+
+ bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+
+ // Swap the index operands of two GEPs.
+ void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+
+ // Check if it is safe to swap the operands of two GEPs.
+ bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
+ Loop *CurLoop);
+
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ TargetLibraryInfo *TLI;
// Retrieved lazily since not always used.
function_ref<TargetTransformInfo &(Function &)> GetTTI;
-
- /// Whether to lower a GEP with multiple indices into arithmetic operations or
- /// multiple GEPs with a single index.
- bool LowerGEP;
-
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
-};
-
-} // end anonymous namespace
-
+
+ /// Whether to lower a GEP with multiple indices into arithmetic operations or
+ /// multiple GEPs with a single index.
+ bool LowerGEP;
+
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
+};
+
+} // end anonymous namespace
+
char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(
+
+INITIALIZE_PASS_BEGIN(
SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
- "Split GEPs to a variadic base and a constant offset for better CSE", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
- "Split GEPs to a variadic base and a constant offset for better CSE", false,
- false)
-
-FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+
+FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
-}
-
-bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
- bool ZeroExtended,
- BinaryOperator *BO,
- bool NonNegative) {
- // We only consider ADD, SUB and OR, because a non-zero constant found in
- // expressions composed of these operations can be easily hoisted as a
- // constant offset by reassociation.
- if (BO->getOpcode() != Instruction::Add &&
- BO->getOpcode() != Instruction::Sub &&
- BO->getOpcode() != Instruction::Or) {
- return false;
- }
-
- Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
- // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
- // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
- // FIXME: this does not appear to be covered by any tests
- // (with x86/aarch64 backends at least)
- if (BO->getOpcode() == Instruction::Or &&
- !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
- return false;
-
- // In addition, tracing into BO requires that its surrounding s/zext (if
- // any) is distributable to both operands.
- //
- // Suppose BO = A op B.
- // SignExtended | ZeroExtended | Distributable?
- // --------------+--------------+----------------------------------
- // 0 | 0 | true because no s/zext exists
- // 0 | 1 | zext(BO) == zext(A) op zext(B)
- // 1 | 0 | sext(BO) == sext(A) op sext(B)
- // 1 | 1 | zext(sext(BO)) ==
- // | | zext(sext(A)) op zext(sext(B))
- if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
- // If a + b >= 0 and (a >= 0 or b >= 0), then
- // sext(a + b) = sext(a) + sext(b)
- // even if the addition is not marked nsw.
- //
- // Leveraging this invariant, we can trace into an sext'ed inbound GEP
- // index if the constant offset is non-negative.
- //
- // Verified in @sext_add in split-gep.ll.
- if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
- if (!ConstLHS->isNegative())
- return true;
- }
- if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
- if (!ConstRHS->isNegative())
- return true;
- }
- }
-
- // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
- // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
- if (BO->getOpcode() == Instruction::Add ||
- BO->getOpcode() == Instruction::Sub) {
- if (SignExtended && !BO->hasNoSignedWrap())
- return false;
- if (ZeroExtended && !BO->hasNoUnsignedWrap())
- return false;
- }
-
- return true;
-}
-
-APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
- bool SignExtended,
- bool ZeroExtended) {
- // Save off the current height of the chain, in case we need to restore it.
- size_t ChainLength = UserChain.size();
-
- // BO being non-negative does not shed light on whether its operands are
- // non-negative. Clear the NonNegative flag here.
- APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
- /* NonNegative */ false);
- // If we found a constant offset in the left operand, stop and return that.
- // This shortcut might cause us to miss opportunities of combining the
- // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
- // However, such cases are probably already handled by -instcombine,
- // given this pass runs after the standard optimizations.
- if (ConstantOffset != 0) return ConstantOffset;
-
- // Reset the chain back to where it was when we started exploring this node,
- // since visiting the LHS didn't pan out.
- UserChain.resize(ChainLength);
-
- ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
- /* NonNegative */ false);
- // If U is a sub operator, negate the constant offset found in the right
- // operand.
- if (BO->getOpcode() == Instruction::Sub)
- ConstantOffset = -ConstantOffset;
-
- // If RHS wasn't a suitable candidate either, reset the chain again.
- if (ConstantOffset == 0)
- UserChain.resize(ChainLength);
-
- return ConstantOffset;
-}
-
-APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
- bool ZeroExtended, bool NonNegative) {
- // TODO(jingyue): We could trace into integer/pointer casts, such as
- // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
- // integers because it gives good enough results for our benchmarks.
- unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
-
- // We cannot do much with Values that are not a User, such as an Argument.
- User *U = dyn_cast<User>(V);
- if (U == nullptr) return APInt(BitWidth, 0);
-
- APInt ConstantOffset(BitWidth, 0);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
- // Hooray, we found it!
- ConstantOffset = CI->getValue();
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
- // Trace into subexpressions for more hoisting opportunities.
- if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
- ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
- } else if (isa<TruncInst>(V)) {
- ConstantOffset =
- find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
- .trunc(BitWidth);
- } else if (isa<SExtInst>(V)) {
- ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
- ZeroExtended, NonNegative).sext(BitWidth);
- } else if (isa<ZExtInst>(V)) {
- // As an optimization, we can clear the SignExtended flag because
- // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
- //
- // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
- ConstantOffset =
- find(U->getOperand(0), /* SignExtended */ false,
- /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
- }
-
- // If we found a non-zero constant offset, add it to the path for
- // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
- // help this optimization.
- if (ConstantOffset != 0)
- UserChain.push_back(U);
- return ConstantOffset;
-}
-
-Value *ConstantOffsetExtractor::applyExts(Value *V) {
- Value *Current = V;
- // ExtInsts is built in the use-def order. Therefore, we apply them to V
- // in the reversed order.
- for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
- if (Constant *C = dyn_cast<Constant>(Current)) {
- // If Current is a constant, apply s/zext using ConstantExpr::getCast.
- // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
- Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
- } else {
- Instruction *Ext = (*I)->clone();
- Ext->setOperand(0, Current);
- Ext->insertBefore(IP);
- Current = Ext;
- }
- }
- return Current;
-}
-
-Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
- distributeExtsAndCloneChain(UserChain.size() - 1);
- // Remove all nullptrs (used to be s/zext) from UserChain.
- unsigned NewSize = 0;
- for (User *I : UserChain) {
- if (I != nullptr) {
- UserChain[NewSize] = I;
- NewSize++;
- }
- }
- UserChain.resize(NewSize);
- return removeConstOffset(UserChain.size() - 1);
-}
-
-Value *
-ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
- User *U = UserChain[ChainIndex];
- if (ChainIndex == 0) {
- assert(isa<ConstantInt>(U));
- // If U is a ConstantInt, applyExts will return a ConstantInt as well.
- return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
- }
-
- if (CastInst *Cast = dyn_cast<CastInst>(U)) {
- assert(
- (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
- "Only following instructions can be traced: sext, zext & trunc");
- ExtInsts.push_back(Cast);
- UserChain[ChainIndex] = nullptr;
- return distributeExtsAndCloneChain(ChainIndex - 1);
- }
-
- // Function find only trace into BinaryOperator and CastInst.
- BinaryOperator *BO = cast<BinaryOperator>(U);
- // OpNo = which operand of BO is UserChain[ChainIndex - 1]
- unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
- Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
- Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
-
- BinaryOperator *NewBO = nullptr;
- if (OpNo == 0) {
- NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
- BO->getName(), IP);
- } else {
- NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
- BO->getName(), IP);
- }
- return UserChain[ChainIndex] = NewBO;
-}
-
-Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
- if (ChainIndex == 0) {
- assert(isa<ConstantInt>(UserChain[ChainIndex]));
- return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
- }
-
- BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
- assert((BO->use_empty() || BO->hasOneUse()) &&
- "distributeExtsAndCloneChain clones each BinaryOperator in "
- "UserChain, so no one should be used more than "
- "once");
-
- unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
- assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
- Value *NextInChain = removeConstOffset(ChainIndex - 1);
- Value *TheOther = BO->getOperand(1 - OpNo);
-
- // If NextInChain is 0 and not the LHS of a sub, we can simplify the
- // sub-expression to be just TheOther.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
- if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
- }
-
- BinaryOperator::BinaryOps NewOp = BO->getOpcode();
- if (BO->getOpcode() == Instruction::Or) {
- // Rebuild "or" as "add", because "or" may be invalid for the new
- // expression.
- //
- // For instance, given
- // a | (b + 5) where a and b + 5 have no common bits,
- // we can extract 5 as the constant offset.
- //
- // However, reusing the "or" in the new index would give us
- // (a | b) + 5
- // which does not equal a | (b + 5).
- //
- // Replacing the "or" with "add" is fine, because
- // a | (b + 5) = a + (b + 5) = (a + b) + 5
- NewOp = Instruction::Add;
- }
-
- BinaryOperator *NewBO;
- if (OpNo == 0) {
- NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
- } else {
- NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
- }
- NewBO->takeName(BO);
- return NewBO;
-}
-
-Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail,
- const DominatorTree *DT) {
- ConstantOffsetExtractor Extractor(GEP, DT);
- // Find a non-zero constant offset first.
- APInt ConstantOffset =
- Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
- GEP->isInBounds());
- if (ConstantOffset == 0) {
- UserChainTail = nullptr;
- return nullptr;
- }
- // Separates the constant offset from the GEP index.
- Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
- UserChainTail = Extractor.UserChain.back();
- return IdxWithoutConstOffset;
-}
-
-int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
- const DominatorTree *DT) {
- // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
- return ConstantOffsetExtractor(GEP, DT)
- .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
- GEP->isInBounds())
- .getSExtValue();
-}
-
-bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
- GetElementPtrInst *GEP) {
- bool Changed = false;
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
- I != E; ++I, ++GTI) {
- // Skip struct member indices which must be i32.
- if (GTI.isSequential()) {
- if ((*I)->getType() != IntPtrTy) {
- *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
- Changed = true;
- }
- }
- }
- return Changed;
-}
-
-int64_t
-SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
- bool &NeedsExtraction) {
- NeedsExtraction = false;
- int64_t AccumulativeByteOffset = 0;
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- // Tries to extract a constant offset from this GEP index.
- int64_t ConstantOffset =
- ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
- if (ConstantOffset != 0) {
- NeedsExtraction = true;
- // A GEP may have multiple indices. We accumulate the extracted
- // constant offset to a byte offset, and later offset the remainder of
- // the original GEP with this byte offset.
- AccumulativeByteOffset +=
- ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
- }
- } else if (LowerGEP) {
- StructType *StTy = GTI.getStructType();
- uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
- // Skip field 0 as the offset is always 0.
- if (Field != 0) {
- NeedsExtraction = true;
- AccumulativeByteOffset +=
- DL->getStructLayout(StTy)->getElementOffset(Field);
- }
- }
- }
- return AccumulativeByteOffset;
-}
-
-void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
- GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
- IRBuilder<> Builder(Variadic);
- Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
-
- Type *I8PtrTy =
- Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
- Value *ResultPtr = Variadic->getOperand(0);
- Loop *L = LI->getLoopFor(Variadic->getParent());
- // Check if the base is not loop invariant or used more than once.
- bool isSwapCandidate =
- L && L->isLoopInvariant(ResultPtr) &&
- !hasMoreThanOneUseInLoop(ResultPtr, L);
- Value *FirstResult = nullptr;
-
- if (ResultPtr->getType() != I8PtrTy)
- ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
-
- gep_type_iterator GTI = gep_type_begin(*Variadic);
- // Create an ugly GEP for each sequential index. We don't create GEPs for
- // structure indices, as they are accumulated in the constant offset index.
- for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- Value *Idx = Variadic->getOperand(I);
- // Skip zero indices.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
- if (CI->isZero())
- continue;
-
- APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
- // Scale the index by element size.
- if (ElementSize != 1) {
- if (ElementSize.isPowerOf2()) {
- Idx = Builder.CreateShl(
- Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
- } else {
- Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
- }
- }
- // Create an ugly GEP with a single index for each index.
- ResultPtr =
- Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
- if (FirstResult == nullptr)
- FirstResult = ResultPtr;
- }
- }
-
- // Create a GEP with the constant offset index.
- if (AccumulativeByteOffset != 0) {
- Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
- ResultPtr =
- Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
- } else
- isSwapCandidate = false;
-
- // If we created a GEP with constant index, and the base is loop invariant,
- // then we swap the first one with it, so LICM can move constant GEP out
- // later.
+}
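+
+// Illustrative usage sketch (added commentary, not from the original source):
+// a backend would typically schedule the legacy pass from its pass config,
+// e.g. in a hypothetical override such as
+//
+//   void MyTargetPassConfig::addIRPasses() {
+//     // LowerGEP = true also lowers the remaining multi-index GEPs.
+//     addPass(createSeparateConstOffsetFromGEPPass(/*LowerGEP=*/true));
+//     TargetPassConfig::addIRPasses();
+//   }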
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+ bool ZeroExtended,
+ BinaryOperator *BO,
+ bool NonNegative) {
+ // We only consider ADD, SUB and OR, because a non-zero constant found in
+ // expressions composed of these operations can be easily hoisted as a
+ // constant offset by reassociation.
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Or) {
+ return false;
+ }
+
+ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+ // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+ // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ // FIXME: this does not appear to be covered by any tests
+ // (with x86/aarch64 backends at least)
+ if (BO->getOpcode() == Instruction::Or &&
+ !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
+ return false;
+
+ // In addition, tracing into BO requires that its surrounding s/zext (if
+ // any) is distributable to both operands.
+ //
+ // Suppose BO = A op B.
+ // SignExtended | ZeroExtended | Distributable?
+ // --------------+--------------+----------------------------------
+ // 0 | 0 | true because no s/zext exists
+ // 0 | 1 | zext(BO) == zext(A) op zext(B)
+ // 1 | 0 | sext(BO) == sext(A) op sext(B)
+ // 1 | 1 | zext(sext(BO)) ==
+ // | | zext(sext(A)) op zext(sext(B))
+ if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+ // If a + b >= 0 and (a >= 0 or b >= 0), then
+ // sext(a + b) = sext(a) + sext(b)
+ // even if the addition is not marked nsw.
+ //
+ // Leveraging this invariant, we can trace into an sext'ed inbound GEP
+ // index if the constant offset is non-negative.
+ //
+ // Verified in @sext_add in split-gep.ll.
+ if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+ if (!ConstLHS->isNegative())
+ return true;
+ }
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+ if (!ConstRHS->isNegative())
+ return true;
+ }
+ }
+
+ // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+ // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Sub) {
+ if (SignExtended && !BO->hasNoSignedWrap())
+ return false;
+ if (ZeroExtended && !BO->hasNoUnsignedWrap())
+ return false;
+ }
+
+ return true;
+}
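+
+// Illustrative sketch (added commentary, not from the original source): with
+// SignExtended == true, CanTraceInto accepts
+//   %i = add nsw i32 %a, 5      ; later widened by "sext i32 %i to i64"
+// because sext (add nsw A, B) == add nsw (sext A), (sext B); without nsw it
+// only succeeds through the non-negative special case above.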
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+ bool SignExtended,
+ bool ZeroExtended) {
+ // Save off the current height of the chain, in case we need to restore it.
+ size_t ChainLength = UserChain.size();
+
+ // BO being non-negative does not shed light on whether its operands are
+ // non-negative. Clear the NonNegative flag here.
+ APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If we found a constant offset in the left operand, stop and return that.
+ // This shortcut might cause us to miss opportunities of combining the
+ // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
+ // However, such cases are probably already handled by -instcombine,
+ // given this pass runs after the standard optimizations.
+ if (ConstantOffset != 0) return ConstantOffset;
+
+ // Reset the chain back to where it was when we started exploring this node,
+ // since visiting the LHS didn't pan out.
+ UserChain.resize(ChainLength);
+
+ ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If U is a sub operator, negate the constant offset found in the right
+ // operand.
+ if (BO->getOpcode() == Instruction::Sub)
+ ConstantOffset = -ConstantOffset;
+
+ // If RHS wasn't a suitable candidate either, reset the chain again.
+ if (ConstantOffset == 0)
+ UserChain.resize(ChainLength);
+
+ return ConstantOffset;
+}
+
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+ bool ZeroExtended, bool NonNegative) {
+ // TODO(jingyue): We could trace into integer/pointer casts, such as
+ // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
+ // integers because it gives good enough results for our benchmarks.
+ unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+ // We cannot do much with Values that are not a User, such as an Argument.
+ User *U = dyn_cast<User>(V);
+ if (U == nullptr) return APInt(BitWidth, 0);
+
+ APInt ConstantOffset(BitWidth, 0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ // Hooray, we found it!
+ ConstantOffset = CI->getValue();
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+ // Trace into subexpressions for more hoisting opportunities.
+ if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
+ ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<TruncInst>(V)) {
+ ConstantOffset =
+ find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
+ .trunc(BitWidth);
+ } else if (isa<SExtInst>(V)) {
+ ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+ ZeroExtended, NonNegative).sext(BitWidth);
+ } else if (isa<ZExtInst>(V)) {
+ // As an optimization, we can clear the SignExtended flag because
+ // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+ //
+ // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+ ConstantOffset =
+ find(U->getOperand(0), /* SignExtended */ false,
+ /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
+ }
+
+ // If we found a non-zero constant offset, add it to the path for
+ // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+ // help this optimization.
+ if (ConstantOffset != 0)
+ UserChain.push_back(U);
+ return ConstantOffset;
+}
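+
+// Illustrative sketch (added commentary, not from the original source): for an
+// index defined as
+//   %t = add i64 %c, 5
+// find(%t, ...) returns 5 and leaves UserChain == [5, %t], the path that
+// rebuildWithoutConstOffset later walks to produce the new index %c.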
+
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+ Value *Current = V;
+ // ExtInsts is built in use-def order. Therefore, we apply them to V
+ // in reverse order.
+ for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+ if (Constant *C = dyn_cast<Constant>(Current)) {
+ // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+ // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+ Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+ } else {
+ Instruction *Ext = (*I)->clone();
+ Ext->setOperand(0, Current);
+ Ext->insertBefore(IP);
+ Current = Ext;
+ }
+ }
+ return Current;
+}
+
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+ distributeExtsAndCloneChain(UserChain.size() - 1);
+ // Remove all nullptrs (which used to be s/zext) from UserChain.
+ unsigned NewSize = 0;
+ for (User *I : UserChain) {
+ if (I != nullptr) {
+ UserChain[NewSize] = I;
+ NewSize++;
+ }
+ }
+ UserChain.resize(NewSize);
+ return removeConstOffset(UserChain.size() - 1);
+}
+
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ User *U = UserChain[ChainIndex];
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(U));
+ // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ }
+
+ if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+ assert(
+ (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
+ "Only following instructions can be traced: sext, zext & trunc");
+ ExtInsts.push_back(Cast);
+ UserChain[ChainIndex] = nullptr;
+ return distributeExtsAndCloneChain(ChainIndex - 1);
+ }
+
+ // Function find only traces into BinaryOperator and CastInst.
+ BinaryOperator *BO = cast<BinaryOperator>(U);
+ // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+ BinaryOperator *NewBO = nullptr;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+ BO->getName(), IP);
+ } else {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+ BO->getName(), IP);
+ }
+ return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(UserChain[ChainIndex]));
+ return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+ }
+
+ BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+ assert((BO->use_empty() || BO->hasOneUse()) &&
+ "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "UserChain, so no one should be used more than "
+ "once");
+
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+ Value *NextInChain = removeConstOffset(ChainIndex - 1);
+ Value *TheOther = BO->getOperand(1 - OpNo);
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+ if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
+
+ BinaryOperator::BinaryOps NewOp = BO->getOpcode();
+ if (BO->getOpcode() == Instruction::Or) {
+ // Rebuild "or" as "add", because "or" may be invalid for the new
+ // expression.
+ //
+ // For instance, given
+ // a | (b + 5) where a and b + 5 have no common bits,
+ // we can extract 5 as the constant offset.
+ //
+ // However, reusing the "or" in the new index would give us
+ // (a | b) + 5
+ // which does not equal a | (b + 5).
+ //
+ // Replacing the "or" with "add" is fine, because
+ // a | (b + 5) = a + (b + 5) = (a + b) + 5
+ NewOp = Instruction::Add;
+ }
+
+ BinaryOperator *NewBO;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
+ } else {
+ NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
+ }
+ NewBO->takeName(BO);
+ return NewBO;
+}
+
+Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail,
+ const DominatorTree *DT) {
+ ConstantOffsetExtractor Extractor(GEP, DT);
+ // Find a non-zero constant offset first.
+ APInt ConstantOffset =
+ Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds());
+ if (ConstantOffset == 0) {
+ UserChainTail = nullptr;
+ return nullptr;
+ }
+ // Separates the constant offset from the GEP index.
+ Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
+ UserChainTail = Extractor.UserChain.back();
+ return IdxWithoutConstOffset;
+}
+
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT) {
+ // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
+ return ConstantOffsetExtractor(GEP, DT)
+ .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds())
+ .getSExtValue();
+}
+
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
+ GetElementPtrInst *GEP) {
+ bool Changed = false;
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
+ I != E; ++I, ++GTI) {
+ // Skip struct member indices which must be i32.
+ if (GTI.isSequential()) {
+ if ((*I)->getType() != IntPtrTy) {
+ *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
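+
+// Illustrative sketch (added commentary, not from the original source): on a
+// target with 64-bit pointers,
+//   %p = getelementptr inbounds float, float* %base, i32 %i
+// becomes
+//   %idxprom = sext i32 %i to i64
+//   %p = getelementptr inbounds float, float* %base, i64 %idxprom
+// via the "idxprom" integer cast created above.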
+
+int64_t
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
+ bool &NeedsExtraction) {
+ NeedsExtraction = false;
+ int64_t AccumulativeByteOffset = 0;
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Tries to extract a constant offset from this GEP index.
+ int64_t ConstantOffset =
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+ if (ConstantOffset != 0) {
+ NeedsExtraction = true;
+ // A GEP may have multiple indices. We accumulate the extracted
+ // constant offset to a byte offset, and later offset the remainder of
+ // the original GEP with this byte offset.
+ AccumulativeByteOffset +=
+ ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+ }
+ } else if (LowerGEP) {
+ StructType *StTy = GTI.getStructType();
+ uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+ // Skip field 0 as the offset is always 0.
+ if (Field != 0) {
+ NeedsExtraction = true;
+ AccumulativeByteOffset +=
+ DL->getStructLayout(StTy)->getElementOffset(Field);
+ }
+ }
+ }
+ return AccumulativeByteOffset;
+}
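+
+// Illustrative sketch (added commentary, not from the original source): for
+//   %add = add i64 %j, 2
+//   %p   = getelementptr inbounds [4 x i32], [4 x i32]* %a, i64 %i, i64 %add
+// the second index contributes ConstantOffset == 2, so AccumulativeByteOffset
+// becomes 2 * sizeof(i32) == 8 and NeedsExtraction is set.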
+
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+ GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+ Value *ResultPtr = Variadic->getOperand(0);
+ Loop *L = LI->getLoopFor(Variadic->getParent());
+ // The base is a swap candidate only if it is loop invariant and not used
+ // more than once within the loop.
+ bool isSwapCandidate =
+ L && L->isLoopInvariant(ResultPtr) &&
+ !hasMoreThanOneUseInLoop(ResultPtr, L);
+ Value *FirstResult = nullptr;
+
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create an ugly GEP for each sequential index. We don't create GEPs for
+ // structure indices, as they are accumulated in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ugly GEP with a single index for each index.
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+ if (FirstResult == nullptr)
+ FirstResult = ResultPtr;
+ }
+ }
+
+ // Create a GEP with the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
+ } else
+ isSwapCandidate = false;
+
+ // If we created a GEP with constant index, and the base is loop invariant,
+ // then we swap the first one with it, so LICM can move constant GEP out
+ // later.
auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
- if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
- swapGEPOperand(FirstGEP, SecondGEP);
-
- if (ResultPtr->getType() != Variadic->getType())
- ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
-
- Variadic->replaceAllUsesWith(ResultPtr);
- Variadic->eraseFromParent();
-}
-
-void
-SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset) {
- IRBuilder<> Builder(Variadic);
- Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
-
- Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
- gep_type_iterator GTI = gep_type_begin(*Variadic);
- // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We
- // don't create arithmetics for structure indices, as they are accumulated
- // in the constant offset index.
- for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- Value *Idx = Variadic->getOperand(I);
- // Skip zero indices.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
- if (CI->isZero())
- continue;
-
- APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
- // Scale the index by element size.
- if (ElementSize != 1) {
- if (ElementSize.isPowerOf2()) {
- Idx = Builder.CreateShl(
- Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
- } else {
- Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
- }
- }
- // Create an ADD for each index.
- ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
- }
- }
-
- // Create an ADD for the constant offset index.
- if (AccumulativeByteOffset != 0) {
- ResultPtr = Builder.CreateAdd(
- ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
- }
-
- ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
- Variadic->replaceAllUsesWith(ResultPtr);
- Variadic->eraseFromParent();
-}
-
-bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
- // Skip vector GEPs.
- if (GEP->getType()->isVectorTy())
- return false;
-
- // The backend can already nicely handle the case where all indices are
- // constant.
- if (GEP->hasAllConstantIndices())
- return false;
-
- bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
-
- bool NeedsExtraction;
- int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
-
- if (!NeedsExtraction)
- return Changed;
-
+ if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+ swapGEPOperand(FirstGEP, SecondGEP);
+
+ if (ResultPtr->getType() != Variadic->getType())
+ ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
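+
+// Illustrative sketch (added commentary, not from the original source): once
+// extraction has turned the index "%add = add i64 %i, 2" into %i with
+// AccumulativeByteOffset == 8, lowering
+//   %p = getelementptr float, float* %base, i64 %i
+// to single-index form yields roughly
+//   %0        = bitcast float* %base to i8*
+//   %idx      = shl i64 %i, 2                         ; scale by sizeof(float)
+//   %uglygep  = getelementptr i8, i8* %0, i64 %idx
+//   %uglygep1 = getelementptr i8, i8* %uglygep, i64 8 ; the constant offset
+//   %p        = bitcast i8* %uglygep1 to float*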
+
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
+ // don't create arithmetic for structure indices, as they are accumulated
+ // in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ADD for each index.
+ ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
+ }
+ }
+
+ // Create an ADD for the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ ResultPtr = Builder.CreateAdd(
+ ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+ }
+
+ ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
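+
+// Illustrative sketch (added commentary, not from the original source): the
+// same %i / offset-8 example lowered to arithmetic instead becomes roughly
+//   %0 = ptrtoint float* %base to i64
+//   %1 = shl i64 %i, 2
+//   %2 = add i64 %0, %1
+//   %3 = add i64 %2, 8
+//   %p = inttoptr i64 %3 to float*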
+
+bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
+ // Skip vector GEPs.
+ if (GEP->getType()->isVectorTy())
+ return false;
+
+ // The backend can already nicely handle the case where all indices are
+ // constant.
+ if (GEP->hasAllConstantIndices())
+ return false;
+
+ bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+ bool NeedsExtraction;
+ int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+ if (!NeedsExtraction)
+ return Changed;
+
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
-
- // If LowerGEP is disabled, before really splitting the GEP, check whether the
- // backend supports the addressing mode we are about to produce. If no, this
- // splitting probably won't be beneficial.
- // If LowerGEP is enabled, even the extracted constant offset can not match
- // the addressing mode, we can still do optimizations to other lowered parts
- // of variable indices. Therefore, we don't check for addressing modes in that
- // case.
- if (!LowerGEP) {
- unsigned AddrSpace = GEP->getPointerAddressSpace();
- if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
- /*BaseGV=*/nullptr, AccumulativeByteOffset,
- /*HasBaseReg=*/true, /*Scale=*/0,
- AddrSpace)) {
- return Changed;
- }
- }
-
- // Remove the constant offset in each sequential index. The resultant GEP
- // computes the variadic base.
- // Notice that we don't remove struct field indices here. If LowerGEP is
- // disabled, a structure index is not accumulated and we still use the old
- // one. If LowerGEP is enabled, a structure index is accumulated in the
- // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
- // handle the constant offset and won't need a new structure index.
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- // Splits this GEP index into a variadic part and a constant offset, and
- // uses the variadic part as the new index.
- Value *OldIdx = GEP->getOperand(I);
- User *UserChainTail;
- Value *NewIdx =
- ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
- if (NewIdx != nullptr) {
- // Switches to the index with the constant offset removed.
- GEP->setOperand(I, NewIdx);
- // After switching to the new index, we can garbage-collect UserChain
- // and the old index if they are not used.
- RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
- RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
- }
- }
- }
-
- // Clear the inbounds attribute because the new index may be off-bound.
- // e.g.,
- //
- // b = add i64 a, 5
- // addr = gep inbounds float, float* p, i64 b
- //
- // is transformed to:
- //
- // addr2 = gep float, float* p, i64 a ; inbounds removed
- // addr = gep inbounds float, float* addr2, i64 5
- //
- // If a is -4, although the old index b is in bounds, the new index a is
- // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
- // inbounds keyword is not present, the offsets are added to the base
- // address with silently-wrapping two's complement arithmetic".
- // Therefore, the final code will be a semantically equivalent.
- //
- // TODO(jingyue): do some range analysis to keep as many inbounds as
- // possible. GEPs with inbounds are more friendly to alias analysis.
- bool GEPWasInBounds = GEP->isInBounds();
- GEP->setIsInBounds(false);
-
- // Lowers a GEP to either GEPs with a single index or arithmetic operations.
- if (LowerGEP) {
- // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
- // arithmetic operations if the target uses alias analysis in codegen.
- if (TTI.useAA())
- lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
- else
- lowerToArithmetics(GEP, AccumulativeByteOffset);
- return true;
- }
-
- // No need to create another GEP if the accumulative byte offset is 0.
- if (AccumulativeByteOffset == 0)
- return true;
-
- // Offsets the base with the accumulative byte offset.
- //
- // %gep ; the base
- // ... %gep ...
- //
- // => add the offset
- //
- // %gep2 ; clone of %gep
- // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
- // %gep ; will be removed
- // ... %gep ...
- //
- // => replace all uses of %gep with %new.gep and remove %gep
- //
- // %gep2 ; clone of %gep
- // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
- // ... %new.gep ...
- //
- // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
- // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
- // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
- // type of %gep.
- //
- // %gep2 ; clone of %gep
- // %0 = bitcast %gep2 to i8*
- // %uglygep = gep %0, <offset>
- // %new.gep = bitcast %uglygep to <type of %gep>
- // ... %new.gep ...
- Instruction *NewGEP = GEP->clone();
- NewGEP->insertBefore(GEP);
-
- // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
- // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
- // used with unsigned integers later.
- int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
- DL->getTypeAllocSize(GEP->getResultElementType()));
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
- if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
- // Very likely. As long as %gep is naturally aligned, the byte offset we
- // extracted should be a multiple of sizeof(*%gep).
- int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
- NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
- ConstantInt::get(IntPtrTy, Index, true),
- GEP->getName(), GEP);
- NewGEP->copyMetadata(*GEP);
- // Inherit the inbounds attribute of the original GEP.
- cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
- } else {
- // Unlikely but possible. For example,
- // #pragma pack(1)
- // struct S {
- // int a[3];
- // int64 b[8];
- // };
- // #pragma pack()
- //
- // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
- // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
- // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
- // sizeof(int64).
- //
- // Emit an uglygep in this case.
- Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
- GEP->getPointerAddressSpace());
- NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
- NewGEP = GetElementPtrInst::Create(
- Type::getInt8Ty(GEP->getContext()), NewGEP,
- ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
- GEP);
- NewGEP->copyMetadata(*GEP);
- // Inherit the inbounds attribute of the original GEP.
- cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
- if (GEP->getType() != I8PtrTy)
- NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
- }
-
- GEP->replaceAllUsesWith(NewGEP);
- GEP->eraseFromParent();
-
- return true;
-}
-
+
+ // If LowerGEP is disabled, before really splitting the GEP, check whether the
+ // backend supports the addressing mode we are about to produce. If not, this
+ // splitting probably won't be beneficial.
+ // If LowerGEP is enabled, even if the extracted constant offset cannot match
+ // the addressing mode, we can still optimize the other lowered parts of the
+ // variable indices. Therefore, we don't check for addressing modes in that
+ // case.
+ if (!LowerGEP) {
+ unsigned AddrSpace = GEP->getPointerAddressSpace();
+ if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
+ /*BaseGV=*/nullptr, AccumulativeByteOffset,
+ /*HasBaseReg=*/true, /*Scale=*/0,
+ AddrSpace)) {
+ return Changed;
+ }
+ }
+
+ // Remove the constant offset in each sequential index. The resultant GEP
+ // computes the variadic base.
+ // Notice that we don't remove struct field indices here. If LowerGEP is
+ // disabled, a structure index is not accumulated and we still use the old
+ // one. If LowerGEP is enabled, a structure index is accumulated in the
+ // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
+ // handle the constant offset and won't need a new structure index.
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Splits this GEP index into a variadic part and a constant offset, and
+ // uses the variadic part as the new index.
+ Value *OldIdx = GEP->getOperand(I);
+ User *UserChainTail;
+ Value *NewIdx =
+ ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+ if (NewIdx != nullptr) {
+ // Switches to the index with the constant offset removed.
+ GEP->setOperand(I, NewIdx);
+ // After switching to the new index, we can garbage-collect UserChain
+ // and the old index if they are not used.
+ RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
+ RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
+ }
+ }
+ }
+
+ // Clear the inbounds attribute because the new index may be off-bound.
+ // e.g.,
+ //
+ // b = add i64 a, 5
+ // addr = gep inbounds float, float* p, i64 b
+ //
+ // is transformed to:
+ //
+ // addr2 = gep float, float* p, i64 a ; inbounds removed
+ // addr = gep inbounds float, float* addr2, i64 5
+ //
+ // If a is -4, although the old index b is in bounds, the new index a is
+ // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+ // inbounds keyword is not present, the offsets are added to the base
+ // address with silently-wrapping two's complement arithmetic".
+  // Therefore, the final code will be semantically equivalent.
+ //
+ // TODO(jingyue): do some range analysis to keep as many inbounds as
+ // possible. GEPs with inbounds are more friendly to alias analysis.
+ bool GEPWasInBounds = GEP->isInBounds();
+ GEP->setIsInBounds(false);
+
+ // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+ if (LowerGEP) {
+ // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
+ // arithmetic operations if the target uses alias analysis in codegen.
+ if (TTI.useAA())
+ lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+ else
+ lowerToArithmetics(GEP, AccumulativeByteOffset);
+ return true;
+ }
+
+ // No need to create another GEP if the accumulative byte offset is 0.
+ if (AccumulativeByteOffset == 0)
+ return true;
+
+ // Offsets the base with the accumulative byte offset.
+ //
+ // %gep ; the base
+ // ... %gep ...
+ //
+ // => add the offset
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // %gep ; will be removed
+ // ... %gep ...
+ //
+ // => replace all uses of %gep with %new.gep and remove %gep
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // ... %new.gep ...
+ //
+ // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+ // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+ // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+ // type of %gep.
+ //
+ // %gep2 ; clone of %gep
+ // %0 = bitcast %gep2 to i8*
+ // %uglygep = gep %0, <offset>
+ // %new.gep = bitcast %uglygep to <type of %gep>
+ // ... %new.gep ...
+ Instruction *NewGEP = GEP->clone();
+ NewGEP->insertBefore(GEP);
+
+  // Per the C standard, mixing a signed and an unsigned operand in / or %
+  // yields an unsigned result. Therefore, we cast ElementTypeSizeOfGEP to
+  // signed so that the signed AccumulativeByteOffset is not implicitly
+  // converted to unsigned in the division and modulo below.
+ int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+ DL->getTypeAllocSize(GEP->getResultElementType()));
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
+ // extracted should be a multiple of sizeof(*%gep).
+ int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
+ NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+ ConstantInt::get(IntPtrTy, Index, true),
+ GEP->getName(), GEP);
+ NewGEP->copyMetadata(*GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ } else {
+ // Unlikely but possible. For example,
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+ // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+ // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+ // sizeof(int64).
+ //
+ // Emit an uglygep in this case.
+ Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+ GEP->getPointerAddressSpace());
+ NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+ NewGEP = GetElementPtrInst::Create(
+ Type::getInt8Ty(GEP->getContext()), NewGEP,
+ ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+ GEP);
+ NewGEP->copyMetadata(*GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+ }
+
+ GEP->replaceAllUsesWith(NewGEP);
+ GEP->eraseFromParent();
+
+ return true;
+}
+
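
The divisibility check above (AccumulativeByteOffset % ElementTypeSizeOfGEP) decides between an element-typed index GEP and the byte-addressed "uglygep" fallback. As a rough standalone illustration of just that arithmetic (not part of the pass; OffsetPlan and planOffset are made-up names), a minimal C++ sketch:

    // Illustrative only: mirrors the "is the byte offset a multiple of the
    // element size" decision made at the end of splitGEP above.
    #include <cstdint>
    #include <cstdio>

    struct OffsetPlan {
      bool UseElementIndex; // true: emit gep <elem ty>, base, Index
      int64_t Index;        // element index, valid when UseElementIndex
      int64_t ByteOffset;   // raw byte offset, used for the i8 fallback
    };

    static OffsetPlan planOffset(int64_t AccumulativeByteOffset,
                                 int64_t ElementSize) {
      if (AccumulativeByteOffset % ElementSize == 0)
        return {true, AccumulativeByteOffset / ElementSize,
                AccumulativeByteOffset};
      return {false, 0, AccumulativeByteOffset};
    }

    int main() {
      // 100 bytes against an 8-byte element is not divisible, matching the
      // packed-struct example in the comment above, so the byte-wise form
      // would be chosen.
      OffsetPlan P = planOffset(100, 8);
      std::printf("element index form: %d, byte offset: %lld\n",
                  P.UseElementIndex, (long long)P.ByteOffset);
    }
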
bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
+ if (skipFunction(F))
+ return false;
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -1156,218 +1156,218 @@ bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
return Impl.run(F);
}
-
+
bool SeparateConstOffsetFromGEP::run(Function &F) {
- if (DisableSeparateConstOffsetFromGEP)
- return false;
-
+ if (DisableSeparateConstOffsetFromGEP)
+ return false;
+
DL = &F.getParent()->getDataLayout();
- bool Changed = false;
- for (BasicBlock &B : F) {
- for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
- Changed |= splitGEP(GEP);
- // No need to split GEP ConstantExprs because all its indices are constant
- // already.
- }
-
- Changed |= reuniteExts(F);
-
- if (VerifyNoDeadCode)
- verifyNoDeadCode(F);
-
- return Changed;
-}
-
-Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
- const SCEV *Key, Instruction *Dominatee,
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
- auto Pos = DominatingExprs.find(Key);
- if (Pos == DominatingExprs.end())
- return nullptr;
-
- auto &Candidates = Pos->second;
- // Because we process the basic blocks in pre-order of the dominator tree, a
- // candidate that doesn't dominate the current instruction won't dominate any
- // future instruction either. Therefore, we pop it out of the stack. This
- // optimization makes the algorithm O(n).
- while (!Candidates.empty()) {
- Instruction *Candidate = Candidates.back();
- if (DT->dominates(Candidate, Dominatee))
- return Candidate;
- Candidates.pop_back();
- }
- return nullptr;
-}
-
-bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
- if (!SE->isSCEVable(I->getType()))
- return false;
-
- // Dom: LHS+RHS
- // I: sext(LHS)+sext(RHS)
- // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
- // TODO: handle zext
- Value *LHS = nullptr, *RHS = nullptr;
- if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
- if (LHS->getType() == RHS->getType()) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
- Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
- NewSExt->takeName(I);
- I->replaceAllUsesWith(NewSExt);
- RecursivelyDeleteTriviallyDeadInstructions(I);
- return true;
- }
- }
- } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
- if (LHS->getType() == RHS->getType()) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
- Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
- NewSExt->takeName(I);
- I->replaceAllUsesWith(NewSExt);
- RecursivelyDeleteTriviallyDeadInstructions(I);
- return true;
- }
- }
- }
-
- // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
- if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
- if (programUndefinedIfPoison(I)) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- DominatingAdds[Key].push_back(I);
- }
- } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
- if (programUndefinedIfPoison(I)) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- DominatingSubs[Key].push_back(I);
- }
- }
- return false;
-}
-
-bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
- bool Changed = false;
- DominatingAdds.clear();
- DominatingSubs.clear();
- for (const auto Node : depth_first(DT)) {
- BasicBlock *BB = Node->getBlock();
- for (auto I = BB->begin(); I != BB->end(); ) {
- Instruction *Cur = &*I++;
- Changed |= reuniteExts(Cur);
- }
- }
- return Changed;
-}
-
-void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
- for (BasicBlock &B : F) {
- for (Instruction &I : B) {
- if (isInstructionTriviallyDead(&I)) {
- std::string ErrMessage;
- raw_string_ostream RSO(ErrMessage);
- RSO << "Dead instruction detected!\n" << I << "\n";
- llvm_unreachable(RSO.str().c_str());
- }
- }
- }
-}
-
-bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
- GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
- if (!FirstGEP || !FirstGEP->hasOneUse())
- return false;
-
- if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
- return false;
-
- if (FirstGEP == SecondGEP)
- return false;
-
- unsigned FirstNum = FirstGEP->getNumOperands();
- unsigned SecondNum = SecondGEP->getNumOperands();
- // Give up if the number of operands are not 2.
- if (FirstNum != SecondNum || FirstNum != 2)
- return false;
-
- Value *FirstBase = FirstGEP->getOperand(0);
- Value *SecondBase = SecondGEP->getOperand(0);
- Value *FirstOffset = FirstGEP->getOperand(1);
- // Give up if the index of the first GEP is loop invariant.
- if (CurLoop->isLoopInvariant(FirstOffset))
- return false;
-
- // Give up if base doesn't have same type.
- if (FirstBase->getType() != SecondBase->getType())
- return false;
-
- Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
-
- // Check if the second operand of first GEP has constant coefficient.
- // For an example, for the following code, we won't gain anything by
- // hoisting the second GEP out because the second GEP can be folded away.
- // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
- // %67 = shl i64 %scevgep.sum.ur159, 2
- // %uglygep160 = getelementptr i8* %65, i64 %67
- // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
-
- // Skip constant shift instruction which may be generated by Splitting GEPs.
- if (FirstOffsetDef && FirstOffsetDef->isShift() &&
- isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
- FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
-
- // Give up if FirstOffsetDef is an Add or Sub with constant.
- // Because it may not profitable at all due to constant folding.
- if (FirstOffsetDef)
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
- unsigned opc = BO->getOpcode();
- if ((opc == Instruction::Add || opc == Instruction::Sub) &&
- (isa<ConstantInt>(BO->getOperand(0)) ||
- isa<ConstantInt>(BO->getOperand(1))))
- return false;
- }
- return true;
-}
-
-bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
- int UsesInLoop = 0;
- for (User *U : V->users()) {
- if (Instruction *User = dyn_cast<Instruction>(U))
- if (L->contains(User))
- if (++UsesInLoop > 1)
- return true;
- }
- return false;
-}
-
-void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
- GetElementPtrInst *Second) {
- Value *Offset1 = First->getOperand(1);
- Value *Offset2 = Second->getOperand(1);
- First->setOperand(1, Offset2);
- Second->setOperand(1, Offset1);
-
- // We changed p+o+c to p+c+o, p+c may not be inbound anymore.
- const DataLayout &DAL = First->getModule()->getDataLayout();
- APInt Offset(DAL.getIndexSizeInBits(
- cast<PointerType>(First->getType())->getAddressSpace()),
- 0);
- Value *NewBase =
- First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
- uint64_t ObjectSize;
- if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
- Offset.ugt(ObjectSize)) {
- First->setIsInBounds(false);
- Second->setIsInBounds(false);
- } else
- First->setIsInBounds(true);
-}
+ bool Changed = false;
+ for (BasicBlock &B : F) {
+ for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+ Changed |= splitGEP(GEP);
+    // No need to split GEP ConstantExprs because all their indices are
+    // already constant.
+ }
+
+ Changed |= reuniteExts(F);
+
+ if (VerifyNoDeadCode)
+ verifyNoDeadCode(F);
+
+ return Changed;
+}
+
+Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
+ const SCEV *Key, Instruction *Dominatee,
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
+ auto Pos = DominatingExprs.find(Key);
+ if (Pos == DominatingExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
+ while (!Candidates.empty()) {
+ Instruction *Candidate = Candidates.back();
+ if (DT->dominates(Candidate, Dominatee))
+ return Candidate;
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
+
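
findClosestMatchingDominator above keeps, per SCEV key, a vector used as a stack of candidates and pops entries that no longer dominate the query; because blocks are visited in dominator-tree pre-order, a popped candidate can never be needed again. A small self-contained sketch of that pattern (Expr, Point and dominates are stand-ins invented for the example, not LLVM types):

    #include <cstdio>
    #include <map>
    #include <vector>

    struct Expr { int Point; }; // hypothetical instruction with a program point

    // Stand-in for DT->dominates: in a straight-line walk, earlier points
    // dominate later ones.
    static bool dominates(const Expr &A, const Expr &B) {
      return A.Point <= B.Point;
    }

    static const Expr *findClosestDominator(
        int Key, const Expr &Query, std::map<int, std::vector<Expr>> &Map) {
      auto It = Map.find(Key);
      if (It == Map.end())
        return nullptr;
      auto &Candidates = It->second;
      // Once a candidate stops dominating, it will not dominate any later
      // query either, so it is safe to discard it for good.
      while (!Candidates.empty()) {
        if (dominates(Candidates.back(), Query))
          return &Candidates.back();
        Candidates.pop_back();
      }
      return nullptr;
    }

    int main() {
      std::map<int, std::vector<Expr>> Map;
      Map[42] = {{1}, {5}, {9}};
      Expr Query{7};
      if (const Expr *Dom = findClosestDominator(42, Query, Map))
        std::printf("closest dominating candidate at point %d\n", Dom->Point);
    }
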
+bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Dom: LHS+RHS
+ // I: sext(LHS)+sext(RHS)
+ // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
+ // TODO: handle zext
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+ if (LHS->getType() == RHS->getType()) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
+ Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+ NewSExt->takeName(I);
+ I->replaceAllUsesWith(NewSExt);
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+ return true;
+ }
+ }
+ } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+ if (LHS->getType() == RHS->getType()) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
+ Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+ NewSExt->takeName(I);
+ I->replaceAllUsesWith(NewSExt);
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+ return true;
+ }
+ }
+ }
+
+ // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
+ if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfPoison(I)) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ DominatingAdds[Key].push_back(I);
+ }
+ } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfPoison(I)) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ DominatingSubs[Key].push_back(I);
+ }
+ }
+ return false;
+}
+
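
The rewrite in reuniteExts above is only sound because a dominating nsw add/sub guarantees the narrow addition cannot sign-overflow, in which case sext(a) + sext(b) equals sext(a + b). A small numeric illustration in plain C++ (not pass code), using i32 values widened to i64:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t A = 100000, B = 23456;            // a + b still fits in i32
      int64_t Wide = int64_t(A) + int64_t(B);   // sext(a) + sext(b)
      int64_t Narrow = int64_t(int32_t(A + B)); // sext(a + b)
      std::printf("%lld %lld equal=%d\n", (long long)Wide, (long long)Narrow,
                  int(Wide == Narrow));
    }
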
+bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
+ bool Changed = false;
+ DominatingAdds.clear();
+ DominatingSubs.clear();
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ) {
+ Instruction *Cur = &*I++;
+ Changed |= reuniteExts(Cur);
+ }
+ }
+ return Changed;
+}
+
+void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
+ if (isInstructionTriviallyDead(&I)) {
+ std::string ErrMessage;
+ raw_string_ostream RSO(ErrMessage);
+ RSO << "Dead instruction detected!\n" << I << "\n";
+ llvm_unreachable(RSO.str().c_str());
+ }
+ }
+ }
+}
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+ GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+ if (!FirstGEP || !FirstGEP->hasOneUse())
+ return false;
+
+ if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
+ return false;
+
+ if (FirstGEP == SecondGEP)
+ return false;
+
+ unsigned FirstNum = FirstGEP->getNumOperands();
+ unsigned SecondNum = SecondGEP->getNumOperands();
+  // Give up if the number of operands is not 2.
+ if (FirstNum != SecondNum || FirstNum != 2)
+ return false;
+
+ Value *FirstBase = FirstGEP->getOperand(0);
+ Value *SecondBase = SecondGEP->getOperand(0);
+ Value *FirstOffset = FirstGEP->getOperand(1);
+ // Give up if the index of the first GEP is loop invariant.
+ if (CurLoop->isLoopInvariant(FirstOffset))
+ return false;
+
+  // Give up if the bases don't have the same type.
+ if (FirstBase->getType() != SecondBase->getType())
+ return false;
+
+ Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
+
+  // Check if the second operand of the first GEP has a constant coefficient.
+  // For example, for the following code we won't gain anything by hoisting
+  // the second GEP out because the second GEP can be folded away.
+ // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+ // %67 = shl i64 %scevgep.sum.ur159, 2
+ // %uglygep160 = getelementptr i8* %65, i64 %67
+ // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+
+  // Skip a constant shift instruction, which may be generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() &&
+ isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
+ FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
+
+  // Give up if FirstOffsetDef is an Add or Sub with a constant operand,
+  // because it may not be profitable at all due to constant folding.
+ if (FirstOffsetDef)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
+ unsigned opc = BO->getOpcode();
+ if ((opc == Instruction::Add || opc == Instruction::Sub) &&
+ (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1))))
+ return false;
+ }
+ return true;
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+ int UsesInLoop = 0;
+ for (User *U : V->users()) {
+ if (Instruction *User = dyn_cast<Instruction>(U))
+ if (L->contains(User))
+ if (++UsesInLoop > 1)
+ return true;
+ }
+ return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+ GetElementPtrInst *Second) {
+ Value *Offset1 = First->getOperand(1);
+ Value *Offset2 = Second->getOperand(1);
+ First->setOperand(1, Offset2);
+ Second->setOperand(1, Offset1);
+
+  // We changed p+o+c to p+c+o; p+c may not be inbounds anymore.
+ const DataLayout &DAL = First->getModule()->getDataLayout();
+ APInt Offset(DAL.getIndexSizeInBits(
+ cast<PointerType>(First->getType())->getAddressSpace()),
+ 0);
+ Value *NewBase =
+ First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
+ uint64_t ObjectSize;
+ if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
+ Offset.ugt(ObjectSize)) {
+ First->setIsInBounds(false);
+ Second->setIsInBounds(false);
+ } else
+ First->setIsInBounds(true);
+}
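
swapGEPOperand above keeps the inbounds flags only when the accumulated constant offset provably stays inside the underlying object (getObjectSize succeeds and the offset does not pass the end); otherwise both GEPs become non-inbounds. A minimal sketch of just that decision, assuming C++17; canKeepInBounds is a made-up helper and std::optional stands in for "object size known":

    #include <cstdint>
    #include <cstdio>
    #include <optional>

    static bool canKeepInBounds(std::optional<uint64_t> ObjectSize,
                                uint64_t ConstOffset) {
      // Unknown object size, or an offset past the end: be conservative.
      return ObjectSize && ConstOffset <= *ObjectSize;
    }

    int main() {
      std::printf("%d\n", canKeepInBounds(uint64_t{64}, 16));  // 1: keep
      std::printf("%d\n", canKeepInBounds(std::nullopt, 16));  // 0: drop
      std::printf("%d\n", canKeepInBounds(uint64_t{8}, 1024)); // 0: drop
    }
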
PreservedAnalyses
SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 43ed0957ed..9d3c8d0f37 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1,1145 +1,1145 @@
-///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <numeric>
-#include <utility>
-
-#define DEBUG_TYPE "simple-loop-unswitch"
-
-using namespace llvm;
-
-STATISTIC(NumBranches, "Number of branches unswitched");
-STATISTIC(NumSwitches, "Number of switches unswitched");
-STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
-STATISTIC(NumTrivial, "Number of unswitches that are trivial");
-STATISTIC(
- NumCostMultiplierSkipped,
- "Number of unswitch candidates that had their cost multiplier skipped");
-
-static cl::opt<bool> EnableNonTrivialUnswitch(
- "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
- cl::desc("Forcibly enables non-trivial loop unswitching rather than "
- "following the configuration passed into the pass."));
-
-static cl::opt<int>
- UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
- cl::desc("The cost threshold for unswitching a loop."));
-
-static cl::opt<bool> EnableUnswitchCostMultiplier(
- "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
- cl::desc("Enable unswitch cost multiplier that prohibits exponential "
- "explosion in nontrivial unswitch."));
-static cl::opt<int> UnswitchSiblingsToplevelDiv(
- "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
- cl::desc("Toplevel siblings divisor for cost multiplier."));
-static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
- "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
- cl::desc("Number of unswitch candidates that are ignored when calculating "
- "cost multiplier."));
-static cl::opt<bool> UnswitchGuards(
- "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
- cl::desc("If enabled, simple loop unswitching will also consider "
- "llvm.experimental.guard intrinsics as unswitch candidates."));
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <numeric>
+#include <utility>
+
+#define DEBUG_TYPE "simple-loop-unswitch"
+
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
+STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+STATISTIC(
+ NumCostMultiplierSkipped,
+ "Number of unswitch candidates that had their cost multiplier skipped");
+
+static cl::opt<bool> EnableNonTrivialUnswitch(
+ "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Forcibly enables non-trivial loop unswitching rather than "
+ "following the configuration passed into the pass."));
+
+static cl::opt<int>
+ UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
+ cl::desc("The cost threshold for unswitching a loop."));
+
+static cl::opt<bool> EnableUnswitchCostMultiplier(
+ "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
+ cl::desc("Enable unswitch cost multiplier that prohibits exponential "
+ "explosion in nontrivial unswitch."));
+static cl::opt<int> UnswitchSiblingsToplevelDiv(
+ "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
+ cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
+ "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
+ cl::desc("Number of unswitch candidates that are ignored when calculating "
+ "cost multiplier."));
+static cl::opt<bool> UnswitchGuards(
+ "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
+ cl::desc("If enabled, simple loop unswitching will also consider "
+ "llvm.experimental.guard intrinsics as unswitch candidates."));
static cl::opt<bool> DropNonTrivialImplicitNullChecks(
"simple-loop-unswitch-drop-non-trivial-implicit-null-checks",
cl::init(false), cl::Hidden,
cl::desc("If enabled, drop make.implicit metadata in unswitched implicit "
"null checks to save time analyzing if we can keep it."));
-
-/// Collect all of the loop invariant input values transitively used by the
-/// homogeneous instruction graph from a given root.
-///
-/// This essentially walks from a root recursively through loop variant operands
-/// which have the exact same opcode and finds all inputs which are loop
-/// invariant. For some operations these can be re-associated and unswitched out
-/// of the loop entirely.
-static TinyPtrVector<Value *>
-collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
- LoopInfo &LI) {
- assert(!L.isLoopInvariant(&Root) &&
- "Only need to walk the graph if root itself is not invariant.");
- TinyPtrVector<Value *> Invariants;
-
- // Build a worklist and recurse through operators collecting invariants.
- SmallVector<Instruction *, 4> Worklist;
- SmallPtrSet<Instruction *, 8> Visited;
- Worklist.push_back(&Root);
- Visited.insert(&Root);
- do {
- Instruction &I = *Worklist.pop_back_val();
- for (Value *OpV : I.operand_values()) {
- // Skip constants as unswitching isn't interesting for them.
- if (isa<Constant>(OpV))
- continue;
-
- // Add it to our result if loop invariant.
- if (L.isLoopInvariant(OpV)) {
- Invariants.push_back(OpV);
- continue;
- }
-
- // If not an instruction with the same opcode, nothing we can do.
- Instruction *OpI = dyn_cast<Instruction>(OpV);
- if (!OpI || OpI->getOpcode() != Root.getOpcode())
- continue;
-
- // Visit this operand.
- if (Visited.insert(OpI).second)
- Worklist.push_back(OpI);
- }
- } while (!Worklist.empty());
-
- return Invariants;
-}
-
-static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
- Constant &Replacement) {
- assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
-
- // Replace uses of LIC in the loop with the given constant.
- for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) {
- // Grab the use and walk past it so we can clobber it in the use list.
- Use *U = &*UI++;
- Instruction *UserI = dyn_cast<Instruction>(U->getUser());
-
- // Replace this use within the loop body.
- if (UserI && L.contains(UserI))
- U->set(&Replacement);
- }
-}
-
-/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
-/// incoming values along this edge.
-static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
- BasicBlock &ExitBB) {
- for (Instruction &I : ExitBB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- // No more PHIs to check.
- return true;
-
- // If the incoming value for this edge isn't loop invariant the unswitch
- // won't be trivial.
- if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
- return false;
- }
- llvm_unreachable("Basic blocks should never be empty!");
-}
-
-/// Insert code to test a set of loop invariant values, and conditionally branch
-/// on them.
-static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
- ArrayRef<Value *> Invariants,
- bool Direction,
- BasicBlock &UnswitchedSucc,
- BasicBlock &NormalSucc) {
- IRBuilder<> IRB(&BB);
-
- Value *Cond = Direction ? IRB.CreateOr(Invariants) :
- IRB.CreateAnd(Invariants);
- IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
- Direction ? &NormalSucc : &UnswitchedSucc);
-}
-
-/// Rewrite the PHI nodes in an unswitched loop exit basic block.
-///
-/// Requires that the loop exit and unswitched basic block are the same, and
-/// that the exiting block was a unique predecessor of that block. Rewrites the
-/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
-/// PHI nodes from the old preheader that now contains the unswitched
-/// terminator.
-static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
- BasicBlock &OldExitingBB,
- BasicBlock &OldPH) {
- for (PHINode &PN : UnswitchedBB.phis()) {
- // When the loop exit is directly unswitched we just need to update the
- // incoming basic block. We loop to handle weird cases with repeated
- // incoming blocks, but expect to typically only have one operand here.
- for (auto i : seq<int>(0, PN.getNumOperands())) {
- assert(PN.getIncomingBlock(i) == &OldExitingBB &&
- "Found incoming block different from unique predecessor!");
- PN.setIncomingBlock(i, &OldPH);
- }
- }
-}
-
-/// Rewrite the PHI nodes in the loop exit basic block and the split off
-/// unswitched block.
-///
-/// Because the exit block remains an exit from the loop, this rewrites the
-/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
-/// nodes into the unswitched basic block to select between the value in the
-/// old preheader and the loop exit.
-static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
- BasicBlock &UnswitchedBB,
- BasicBlock &OldExitingBB,
- BasicBlock &OldPH,
- bool FullUnswitch) {
- assert(&ExitBB != &UnswitchedBB &&
- "Must have different loop exit and unswitched blocks!");
- Instruction *InsertPt = &*UnswitchedBB.begin();
- for (PHINode &PN : ExitBB.phis()) {
- auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
- PN.getName() + ".split", InsertPt);
-
- // Walk backwards over the old PHI node's inputs to minimize the cost of
- // removing each one. We have to do this weird loop manually so that we
- // create the same number of new incoming edges in the new PHI as we expect
- // each case-based edge to be included in the unswitched switch in some
- // cases.
- // FIXME: This is really, really gross. It would be much cleaner if LLVM
- // allowed us to create a single entry for a predecessor block without
- // having separate entries for each "edge" even though these edges are
- // required to produce identical results.
- for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
- if (PN.getIncomingBlock(i) != &OldExitingBB)
- continue;
-
- Value *Incoming = PN.getIncomingValue(i);
- if (FullUnswitch)
- // No more edge from the old exiting block to the exit block.
- PN.removeIncomingValue(i);
-
- NewPN->addIncoming(Incoming, &OldPH);
- }
-
- // Now replace the old PHI with the new one and wire the old one in as an
- // input to the new one.
- PN.replaceAllUsesWith(NewPN);
- NewPN->addIncoming(&PN, &ExitBB);
- }
-}
-
-/// Hoist the current loop up to the innermost loop containing a remaining exit.
-///
-/// Because we've removed an exit from the loop, we may have changed the set of
-/// loops reachable and need to move the current loop up the loop nest or even
-/// to an entirely separate nest.
-static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
- DominatorTree &DT, LoopInfo &LI,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE) {
- // If the loop is already at the top level, we can't hoist it anywhere.
- Loop *OldParentL = L.getParentLoop();
- if (!OldParentL)
- return;
-
- SmallVector<BasicBlock *, 4> Exits;
- L.getExitBlocks(Exits);
- Loop *NewParentL = nullptr;
- for (auto *ExitBB : Exits)
- if (Loop *ExitL = LI.getLoopFor(ExitBB))
- if (!NewParentL || NewParentL->contains(ExitL))
- NewParentL = ExitL;
-
- if (NewParentL == OldParentL)
- return;
-
- // The new parent loop (if different) should always contain the old one.
- if (NewParentL)
- assert(NewParentL->contains(OldParentL) &&
- "Can only hoist this loop up the nest!");
-
- // The preheader will need to move with the body of this loop. However,
- // because it isn't in this loop we also need to update the primary loop map.
- assert(OldParentL == LI.getLoopFor(&Preheader) &&
- "Parent loop of this loop should contain this loop's preheader!");
- LI.changeLoopFor(&Preheader, NewParentL);
-
- // Remove this loop from its old parent.
- OldParentL->removeChildLoop(&L);
-
- // Add the loop either to the new parent or as a top-level loop.
- if (NewParentL)
- NewParentL->addChildLoop(&L);
- else
- LI.addTopLevelLoop(&L);
-
- // Remove this loops blocks from the old parent and every other loop up the
- // nest until reaching the new parent. Also update all of these
- // no-longer-containing loops to reflect the nesting change.
- for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
- OldContainingL = OldContainingL->getParentLoop()) {
- llvm::erase_if(OldContainingL->getBlocksVector(),
- [&](const BasicBlock *BB) {
- return BB == &Preheader || L.contains(BB);
- });
-
- OldContainingL->getBlocksSet().erase(&Preheader);
- for (BasicBlock *BB : L.blocks())
- OldContainingL->getBlocksSet().erase(BB);
-
- // Because we just hoisted a loop out of this one, we have essentially
- // created new exit paths from it. That means we need to form LCSSA PHI
- // nodes for values used in the no-longer-nested loop.
- formLCSSA(*OldContainingL, DT, &LI, SE);
-
- // We shouldn't need to form dedicated exits because the exit introduced
- // here is the (just split by unswitching) preheader. However, after trivial
- // unswitching it is possible to get new non-dedicated exits out of parent
- // loop so let's conservatively form dedicated exit blocks and figure out
- // if we can optimize later.
- formDedicatedExitBlocks(OldContainingL, &DT, &LI, MSSAU,
- /*PreserveLCSSA*/ true);
- }
-}
-
-// Return the top-most loop containing ExitBB and having ExitBB as exiting block
-// or the loop containing ExitBB, if there is no parent loop containing ExitBB
-// as exiting block.
-static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) {
- Loop *TopMost = LI.getLoopFor(ExitBB);
- Loop *Current = TopMost;
- while (Current) {
- if (Current->isLoopExiting(ExitBB))
- TopMost = Current;
- Current = Current->getParentLoop();
- }
- return TopMost;
-}
-
-/// Unswitch a trivial branch if the condition is loop invariant.
-///
-/// This routine should only be called when loop code leading to the branch has
-/// been validated as trivial (no side effects). This routine checks if the
-/// condition is invariant and one of the successors is a loop exit. This
-/// allows us to unswitch without duplicating the loop, making it trivial.
-///
-/// If this routine fails to unswitch the branch it returns false.
-///
-/// If the branch can be unswitched, this routine splits the preheader and
-/// hoists the branch above that split. Preserves loop simplified form
-/// (splitting the exit block as necessary). It simplifies the branch within
-/// the loop to an unconditional branch but doesn't remove it entirely. Further
-/// cleanup can be done with some simplify-cfg like pass.
-///
-/// If `SE` is not null, it will be updated based on the potential loop SCEVs
-/// invalidated by this.
-static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
- LoopInfo &LI, ScalarEvolution *SE,
- MemorySSAUpdater *MSSAU) {
- assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
-
- // The loop invariant values that we want to unswitch.
- TinyPtrVector<Value *> Invariants;
-
- // When true, we're fully unswitching the branch rather than just unswitching
- // some input conditions to the branch.
- bool FullUnswitch = false;
-
- if (L.isLoopInvariant(BI.getCondition())) {
- Invariants.push_back(BI.getCondition());
- FullUnswitch = true;
- } else {
- if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
- Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
- if (Invariants.empty())
- // Couldn't find invariant inputs!
- return false;
- }
-
- // Check that one of the branch's successors exits, and which one.
- bool ExitDirection = true;
- int LoopExitSuccIdx = 0;
- auto *LoopExitBB = BI.getSuccessor(0);
- if (L.contains(LoopExitBB)) {
- ExitDirection = false;
- LoopExitSuccIdx = 1;
- LoopExitBB = BI.getSuccessor(1);
- if (L.contains(LoopExitBB))
- return false;
- }
- auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
- auto *ParentBB = BI.getParent();
- if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
- return false;
-
- // When unswitching only part of the branch's condition, we need the exit
- // block to be reached directly from the partially unswitched input. This can
- // be done when the exit block is along the true edge and the branch condition
- // is a graph of `or` operations, or the exit block is along the false edge
- // and the condition is a graph of `and` operations.
- if (!FullUnswitch) {
- if (ExitDirection) {
- if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or)
- return false;
- } else {
- if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And)
- return false;
- }
- }
-
- LLVM_DEBUG({
- dbgs() << " unswitching trivial invariant conditions for: " << BI
- << "\n";
- for (Value *Invariant : Invariants) {
- dbgs() << " " << *Invariant << " == true";
- if (Invariant != Invariants.back())
- dbgs() << " ||";
- dbgs() << "\n";
- }
- });
-
- // If we have scalar evolutions, we need to invalidate them including this
- // loop, the loop containing the exit block and the topmost parent loop
- // exiting via LoopExitBB.
- if (SE) {
- if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI))
- SE->forgetLoop(ExitL);
- else
- // Forget the entire nest as this exits the entire nest.
- SE->forgetTopmostLoop(&L);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Split the preheader, so that we know that there is a safe place to insert
- // the conditional branch. We will change the preheader to have a conditional
- // branch on LoopCond.
- BasicBlock *OldPH = L.getLoopPreheader();
- BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
-
- // Now that we have a place to insert the conditional branch, create a place
- // to branch to: this is the exit block out of the loop that we are
- // unswitching. We need to split this if there are other loop predecessors.
- // Because the loop is in simplified form, *any* other predecessor is enough.
- BasicBlock *UnswitchedBB;
- if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
- assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
- "A branch's parent isn't a predecessor!");
- UnswitchedBB = LoopExitBB;
- } else {
- UnswitchedBB =
- SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI, MSSAU);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Actually move the invariant uses into the unswitched position. If possible,
- // we do this by moving the instructions, but when doing partial unswitching
- // we do it by building a new merge of the values in the unswitched position.
- OldPH->getTerminator()->eraseFromParent();
- if (FullUnswitch) {
- // If fully unswitching, we can use the existing branch instruction.
- // Splice it into the old PH to gate reaching the new preheader and re-point
- // its successors.
- OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
- BI);
- if (MSSAU) {
- // Temporarily clone the terminator, to make MSSA update cheaper by
- // separating "insert edge" updates from "remove edge" ones.
- ParentBB->getInstList().push_back(BI.clone());
- } else {
- // Create a new unconditional branch that will continue the loop as a new
- // terminator.
- BranchInst::Create(ContinueBB, ParentBB);
- }
- BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
- BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
- } else {
- // Only unswitching a subset of inputs to the condition, so we will need to
- // build a new branch that merges the invariant inputs.
- if (ExitDirection)
- assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::Or &&
- "Must have an `or` of `i1`s for the condition!");
- else
- assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::And &&
- "Must have an `and` of `i1`s for the condition!");
- buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
- *UnswitchedBB, *NewPH);
- }
-
- // Update the dominator tree with the added edge.
- DT.insertEdge(OldPH, UnswitchedBB);
-
- // After the dominator tree was updated with the added edge, update MemorySSA
- // if available.
- if (MSSAU) {
- SmallVector<CFGUpdate, 1> Updates;
- Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB});
- MSSAU->applyInsertUpdates(Updates, DT);
- }
-
- // Finish updating dominator tree and memory ssa for full unswitch.
- if (FullUnswitch) {
- if (MSSAU) {
- // Remove the cloned branch instruction.
- ParentBB->getTerminator()->eraseFromParent();
- // Create unconditional branch now.
- BranchInst::Create(ContinueBB, ParentBB);
- MSSAU->removeEdge(ParentBB, LoopExitBB);
- }
- DT.deleteEdge(ParentBB, LoopExitBB);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Rewrite the relevant PHI nodes.
- if (UnswitchedBB == LoopExitBB)
- rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
- else
- rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
- *ParentBB, *OldPH, FullUnswitch);
-
- // The constant we can replace all of our invariants with inside the loop
- // body. If any of the invariants have a value other than this the loop won't
- // be entered.
- ConstantInt *Replacement = ExitDirection
- ? ConstantInt::getFalse(BI.getContext())
- : ConstantInt::getTrue(BI.getContext());
-
- // Since this is an i1 condition we can also trivially replace uses of it
- // within the loop with a constant.
- for (Value *Invariant : Invariants)
- replaceLoopInvariantUses(L, Invariant, *Replacement);
-
- // If this was full unswitching, we may have changed the nesting relationship
- // for this loop so hoist it to its correct parent if needed.
- if (FullUnswitch)
- hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n");
- ++NumTrivial;
- ++NumBranches;
- return true;
-}
-
-/// Unswitch a trivial switch if the condition is loop invariant.
-///
-/// This routine should only be called when loop code leading to the switch has
-/// been validated as trivial (no side effects). This routine checks if the
-/// condition is invariant and that at least one of the successors is a loop
-/// exit. This allows us to unswitch without duplicating the loop, making it
-/// trivial.
-///
-/// If this routine fails to unswitch the switch it returns false.
-///
-/// If the switch can be unswitched, this routine splits the preheader and
-/// copies the switch above that split. If the default case is one of the
-/// exiting cases, it copies the non-exiting cases and points them at the new
-/// preheader. If the default case is not exiting, it copies the exiting cases
-/// and points the default at the preheader. It preserves loop simplified form
-/// (splitting the exit blocks as necessary). It simplifies the switch within
-/// the loop by removing now-dead cases. If the default case is one of those
-/// unswitched, it replaces its destination with a new basic block containing
-/// only unreachable. Such basic blocks, while technically loop exits, are not
-/// considered for unswitching so this is a stable transform and the same
-/// switch will not be revisited. If after unswitching there is only a single
-/// in-loop successor, the switch is further simplified to an unconditional
-/// branch. Still more cleanup can be done with some simplify-cfg like pass.
-///
-/// If `SE` is not null, it will be updated based on the potential loop SCEVs
-/// invalidated by this.
-static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
- LoopInfo &LI, ScalarEvolution *SE,
- MemorySSAUpdater *MSSAU) {
- LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
- Value *LoopCond = SI.getCondition();
-
- // If this isn't switching on an invariant condition, we can't unswitch it.
- if (!L.isLoopInvariant(LoopCond))
- return false;
-
- auto *ParentBB = SI.getParent();
-
- // The same check must be used both for the default and the exit cases. We
- // should never leave edges from the switch instruction to a basic block that
- // we are unswitching, hence the condition used to determine the default case
- // needs to also be used to populate ExitCaseIndices, which is then used to
- // remove cases from the switch.
- auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) {
- // BBToCheck is not an exit block if it is inside loop L.
- if (L.contains(&BBToCheck))
- return false;
- // BBToCheck is not trivial to unswitch if its phis aren't loop invariant.
- if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, BBToCheck))
- return false;
- // We do not unswitch a block that only has an unreachable statement, as
- // it's possible this is a previously unswitched block. Only unswitch if
- // either the terminator is not unreachable, or, if it is, it's not the only
- // instruction in the block.
- auto *TI = BBToCheck.getTerminator();
- bool isUnreachable = isa<UnreachableInst>(TI);
- return !isUnreachable ||
- (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI));
- };
-
- SmallVector<int, 4> ExitCaseIndices;
- for (auto Case : SI.cases())
- if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor()))
- ExitCaseIndices.push_back(Case.getCaseIndex());
- BasicBlock *DefaultExitBB = nullptr;
- SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight =
- SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, 0);
- if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) {
- DefaultExitBB = SI.getDefaultDest();
- } else if (ExitCaseIndices.empty())
- return false;
-
- LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n");
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // We may need to invalidate SCEVs for the outermost loop reached by any of
- // the exits.
- Loop *OuterL = &L;
-
- if (DefaultExitBB) {
- // Clear out the default destination temporarily to allow accurate
- // predecessor lists to be examined below.
- SI.setDefaultDest(nullptr);
- // Check the loop containing this exit.
- Loop *ExitL = LI.getLoopFor(DefaultExitBB);
- if (!ExitL || ExitL->contains(OuterL))
- OuterL = ExitL;
- }
-
- // Store the exit cases into a separate data structure and remove them from
- // the switch.
- SmallVector<std::tuple<ConstantInt *, BasicBlock *,
- SwitchInstProfUpdateWrapper::CaseWeightOpt>,
- 4> ExitCases;
- ExitCases.reserve(ExitCaseIndices.size());
- SwitchInstProfUpdateWrapper SIW(SI);
- // We walk the case indices backwards so that we remove the last case first
- // and don't disrupt the earlier indices.
- for (unsigned Index : reverse(ExitCaseIndices)) {
- auto CaseI = SI.case_begin() + Index;
- // Compute the outer loop from this exit.
- Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
- if (!ExitL || ExitL->contains(OuterL))
- OuterL = ExitL;
- // Save the value of this case.
- auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex());
- ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W);
- // Delete the unswitched cases.
- SIW.removeCase(CaseI);
- }
-
- if (SE) {
- if (OuterL)
- SE->forgetLoop(OuterL);
- else
- SE->forgetTopmostLoop(&L);
- }
-
- // Check if after this all of the remaining cases point at the same
- // successor.
- BasicBlock *CommonSuccBB = nullptr;
- if (SI.getNumCases() > 0 &&
+
+/// Collect all of the loop invariant input values transitively used by the
+/// homogeneous instruction graph from a given root.
+///
+/// This essentially walks from a root recursively through loop variant operands
+/// which have the exact same opcode and finds all inputs which are loop
+/// invariant. For some operations these can be re-associated and unswitched out
+/// of the loop entirely.
+static TinyPtrVector<Value *>
+collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
+ LoopInfo &LI) {
+ assert(!L.isLoopInvariant(&Root) &&
+ "Only need to walk the graph if root itself is not invariant.");
+ TinyPtrVector<Value *> Invariants;
+
+ // Build a worklist and recurse through operators collecting invariants.
+ SmallVector<Instruction *, 4> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(&Root);
+ Visited.insert(&Root);
+ do {
+ Instruction &I = *Worklist.pop_back_val();
+ for (Value *OpV : I.operand_values()) {
+ // Skip constants as unswitching isn't interesting for them.
+ if (isa<Constant>(OpV))
+ continue;
+
+ // Add it to our result if loop invariant.
+ if (L.isLoopInvariant(OpV)) {
+ Invariants.push_back(OpV);
+ continue;
+ }
+
+ // If not an instruction with the same opcode, nothing we can do.
+ Instruction *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI || OpI->getOpcode() != Root.getOpcode())
+ continue;
+
+ // Visit this operand.
+ if (Visited.insert(OpI).second)
+ Worklist.push_back(OpI);
+ }
+ } while (!Worklist.empty());
+
+ return Invariants;
+}
+
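
collectHomogenousInstGraphLoopInvariants above is a plain worklist walk: it descends only through operands that share the root's opcode and records the loop-invariant inputs it reaches (the real code also skips constants; that check is omitted here for brevity). A toy standalone version of the same walk; Node, Opcode and Invariant are invented for the example:

    #include <cstdio>
    #include <set>
    #include <vector>

    struct Node {
      char Opcode;            // e.g. '&' for an and-chain
      bool Invariant;         // stand-in for L.isLoopInvariant(...)
      std::vector<Node *> Ops;
    };

    static std::vector<Node *> collectInvariants(Node &Root) {
      std::vector<Node *> Invariants;
      std::vector<Node *> Worklist{&Root};
      std::set<Node *> Visited{&Root};
      while (!Worklist.empty()) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        for (Node *Op : N->Ops) {
          if (Op->Invariant) {           // invariant input: record it
            Invariants.push_back(Op);
            continue;
          }
          if (Op->Opcode != Root.Opcode) // different opcode: stop descending
            continue;
          if (Visited.insert(Op).second) // same opcode: keep walking
            Worklist.push_back(Op);
        }
      }
      return Invariants;
    }

    int main() {
      Node A{'x', true, {}}, B{'x', true, {}}, V{'x', false, {}};
      Node Inner{'&', false, {&A, &V}};  // variant '&' feeding the root
      Node Root{'&', false, {&Inner, &B}};
      std::printf("collected %zu invariant inputs\n",
                  collectInvariants(Root).size());
    }
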
+static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
+ Constant &Replacement) {
+ assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
+
+ // Replace uses of LIC in the loop with the given constant.
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+
+ // Replace this use within the loop body.
+ if (UserI && L.contains(UserI))
+ U->set(&Replacement);
+ }
+}
+
+/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
+/// incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Insert code to test a set of loop invariant values, and conditionally branch
+/// on them.
+static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
+ ArrayRef<Value *> Invariants,
+ bool Direction,
+ BasicBlock &UnswitchedSucc,
+ BasicBlock &NormalSucc) {
+ IRBuilder<> IRB(&BB);
+
+ Value *Cond = Direction ? IRB.CreateOr(Invariants) :
+ IRB.CreateAnd(Invariants);
+ IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+ Direction ? &NormalSucc : &UnswitchedSucc);
+}
+
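
buildPartialUnswitchConditionalBranch above ORs the invariants when the exit lies on the true edge and ANDs them when it lies on the false edge, then branches to the unswitched or normal successor accordingly. A tiny boolean model of just that combination (plain C++, nothing IR-specific):

    #include <cstdio>
    #include <vector>

    static bool combineInvariants(const std::vector<bool> &Invariants,
                                  bool Direction) {
      bool Cond = Direction ? false : true; // identity of OR vs. AND
      for (bool V : Invariants)
        Cond = Direction ? (Cond || V) : (Cond && V);
      return Cond;
    }

    int main() {
      std::vector<bool> Inv{false, true, false};
      // Exit on the true edge: any true invariant takes the unswitched edge.
      std::printf("or-combined: %d\n", int(combineInvariants(Inv, true)));
      // Exit on the false edge: the unswitched exit is taken when the AND is
      // false.
      std::printf("and-combined: %d\n", int(combineInvariants(Inv, false)));
    }
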
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (PHINode &PN : UnswitchedBB.phis()) {
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but expect to typically only have one operand here.
+ for (auto i : seq<int>(0, PN.getNumOperands())) {
+ assert(PN.getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN.setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH,
+ bool FullUnswitch) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (PHINode &PN : ExitBB.phis()) {
+ auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
+ PN.getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this weird loop manually so that we
+ // create the same number of new incoming edges in the new PHI as we expect
+ // each case-based edge to be included in the unswitched switch in some
+ // cases.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN.getIncomingValue(i);
+ if (FullUnswitch)
+ // No more edge from the old exiting block to the exit block.
+ PN.removeIncomingValue(i);
+
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN.replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(&PN, &ExitBB);
+ }
+}
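+
+// Hand-written example (names invented) of the rewrite above for a full
+// unswitch. Given an exit-block PHI
+//
+//   exit:
+//     %x = phi i32 [ %in, %old.exiting ], [ %y, %other.pred ]
+//
+// the split-off unswitched block receives
+//
+//   unswitched:
+//     %x.split = phi i32 [ %in, %old.ph ], [ %x, %exit ]
+//
+// while the original PHI keeps only its remaining edge
+//
+//   exit:
+//     %x = phi i32 [ %y, %other.pred ]
+//
+// and every former user of %x now uses %x.split instead.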
+
+/// Hoist the current loop up to the innermost loop containing a remaining exit.
+///
+/// Because we've removed an exit from the loop, we may have changed the set of
+/// loops reachable and need to move the current loop up the loop nest or even
+/// to an entirely separate nest.
+static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
+ DominatorTree &DT, LoopInfo &LI,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE) {
+ // If the loop is already at the top level, we can't hoist it anywhere.
+ Loop *OldParentL = L.getParentLoop();
+ if (!OldParentL)
+ return;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L.getExitBlocks(Exits);
+ Loop *NewParentL = nullptr;
+ for (auto *ExitBB : Exits)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB))
+ if (!NewParentL || NewParentL->contains(ExitL))
+ NewParentL = ExitL;
+
+ if (NewParentL == OldParentL)
+ return;
+
+ // The new parent loop (if different) should always contain the old one.
+ if (NewParentL)
+ assert(NewParentL->contains(OldParentL) &&
+ "Can only hoist this loop up the nest!");
+
+ // The preheader will need to move with the body of this loop. However,
+ // because it isn't in this loop we also need to update the primary loop map.
+ assert(OldParentL == LI.getLoopFor(&Preheader) &&
+ "Parent loop of this loop should contain this loop's preheader!");
+ LI.changeLoopFor(&Preheader, NewParentL);
+
+ // Remove this loop from its old parent.
+ OldParentL->removeChildLoop(&L);
+
+ // Add the loop either to the new parent or as a top-level loop.
+ if (NewParentL)
+ NewParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+
+ // Remove this loop's blocks from the old parent and every other loop up the
+ // nest until reaching the new parent. Also update all of these
+ // no-longer-containing loops to reflect the nesting change.
+ for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
+ OldContainingL = OldContainingL->getParentLoop()) {
+ llvm::erase_if(OldContainingL->getBlocksVector(),
+ [&](const BasicBlock *BB) {
+ return BB == &Preheader || L.contains(BB);
+ });
+
+ OldContainingL->getBlocksSet().erase(&Preheader);
+ for (BasicBlock *BB : L.blocks())
+ OldContainingL->getBlocksSet().erase(BB);
+
+ // Because we just hoisted a loop out of this one, we have essentially
+ // created new exit paths from it. That means we need to form LCSSA PHI
+ // nodes for values used in the no-longer-nested loop.
+ formLCSSA(*OldContainingL, DT, &LI, SE);
+
+ // We shouldn't need to form dedicated exits because the exit introduced
+ // here is the (just split by unswitching) preheader. However, after trivial
+ // unswitching it is possible to get new non-dedicated exits out of the parent
+ // loop, so let's conservatively form dedicated exit blocks and figure out
+ // if we can optimize later.
+ formDedicatedExitBlocks(OldContainingL, &DT, &LI, MSSAU,
+ /*PreserveLCSSA*/ true);
+ }
+}
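+
+// A small invented example of the hoisting above: if L is nested in P and
+// unswitching removed the only exit of L that landed inside P, every remaining
+// exit of L now leaves P as well. L (and its preheader) are then re-parented
+// to the innermost loop still containing an exit, or become top-level, and P
+// gets fresh LCSSA PHIs and dedicated exit blocks for the values and edges
+// that now escape it through L.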
+
+// Return the topmost loop that contains ExitBB and has ExitBB as an exiting
+// block, or the loop containing ExitBB if no enclosing loop has ExitBB as an
+// exiting block.
+static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) {
+ Loop *TopMost = LI.getLoopFor(ExitBB);
+ Loop *Current = TopMost;
+ while (Current) {
+ if (Current->isLoopExiting(ExitBB))
+ TopMost = Current;
+ Current = Current->getParentLoop();
+ }
+ return TopMost;
+}
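+
+// Invented example: with a nest L0 > L1 and ExitBB placed in L1, the walk
+// above returns L1 when ExitBB itself branches out of L1 but not out of L0,
+// returns L0 when ExitBB also leaves L0, and otherwise returns the loop that
+// merely contains ExitBB.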
+
+/// Unswitch a trivial branch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the branch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and one of the successors is a loop exit. This
+/// allows us to unswitch without duplicating the loop, making it trivial.
+///
+/// If this routine fails to unswitch the branch it returns false.
+///
+/// If the branch can be unswitched, this routine splits the preheader and
+/// hoists the branch above that split. Preserves loop simplified form
+/// (splitting the exit block as necessary). It simplifies the branch within
+/// the loop to an unconditional branch but doesn't remove it entirely. Further
+/// cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
+static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
+ LoopInfo &LI, ScalarEvolution *SE,
+ MemorySSAUpdater *MSSAU) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+
+ // The loop invariant values that we want to unswitch.
+ TinyPtrVector<Value *> Invariants;
+
+ // When true, we're fully unswitching the branch rather than just unswitching
+ // some input conditions to the branch.
+ bool FullUnswitch = false;
+
+ if (L.isLoopInvariant(BI.getCondition())) {
+ Invariants.push_back(BI.getCondition());
+ FullUnswitch = true;
+ } else {
+ if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
+ Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
+ if (Invariants.empty())
+ // Couldn't find invariant inputs!
+ return false;
+ }
+
+ // Check that one of the branch's successors exits, and which one.
+ bool ExitDirection = true;
+ int LoopExitSuccIdx = 0;
+ auto *LoopExitBB = BI.getSuccessor(0);
+ if (L.contains(LoopExitBB)) {
+ ExitDirection = false;
+ LoopExitSuccIdx = 1;
+ LoopExitBB = BI.getSuccessor(1);
+ if (L.contains(LoopExitBB))
+ return false;
+ }
+ auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
+ return false;
+
+ // When unswitching only part of the branch's condition, we need the exit
+ // block to be reached directly from the partially unswitched input. This can
+ // be done when the exit block is along the true edge and the branch condition
+ // is a graph of `or` operations, or the exit block is along the false edge
+ // and the condition is a graph of `and` operations.
+ if (!FullUnswitch) {
+ if (ExitDirection) {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or)
+ return false;
+ } else {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And)
+ return false;
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << " unswitching trivial invariant conditions for: " << BI
+ << "\n";
+ for (Value *Invariant : Invariants) {
+ dbgs() << " " << *Invariant << " == true";
+ if (Invariant != Invariants.back())
+ dbgs() << " ||";
+ dbgs() << "\n";
+ }
+ });
+
+ // If we have scalar evolutions, we need to invalidate them including this
+ // loop, the loop containing the exit block and the topmost parent loop
+ // exiting via LoopExitBB.
+ if (SE) {
+ if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI))
+ SE->forgetLoop(ExitL);
+ else
+ // Forget the entire nest as this exits the entire nest.
+ SE->forgetTopmostLoop(&L);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we are
+ // unswitching. We need to split this if there are other loop predecessors.
+ // Because the loop is in simplified form, *any* other predecessor is enough.
+ BasicBlock *UnswitchedBB;
+ if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
+ assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
+ UnswitchedBB = LoopExitBB;
+ } else {
+ UnswitchedBB =
+ SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI, MSSAU);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Actually move the invariant uses into the unswitched position. If possible,
+ // we do this by moving the instructions, but when doing partial unswitching
+ // we do it by building a new merge of the values in the unswitched position.
+ OldPH->getTerminator()->eraseFromParent();
+ if (FullUnswitch) {
+ // If fully unswitching, we can use the existing branch instruction.
+ // Splice it into the old PH to gate reaching the new preheader and re-point
+ // its successors.
+ OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
+ BI);
+ if (MSSAU) {
+ // Temporarily clone the terminator, to make MSSA update cheaper by
+ // separating "insert edge" updates from "remove edge" ones.
+ ParentBB->getInstList().push_back(BI.clone());
+ } else {
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+ }
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+ } else {
+ // Only unswitching a subset of inputs to the condition, so we will need to
+ // build a new branch that merges the invariant inputs.
+ if (ExitDirection)
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Must have an `or` of `i1`s for the condition!");
+ else
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Must have an `and` of `i1`s for the condition!");
+ buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
+ *UnswitchedBB, *NewPH);
+ }
+
+ // Update the dominator tree with the added edge.
+ DT.insertEdge(OldPH, UnswitchedBB);
+
+ // After the dominator tree was updated with the added edge, update MemorySSA
+ // if available.
+ if (MSSAU) {
+ SmallVector<CFGUpdate, 1> Updates;
+ Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB});
+ MSSAU->applyInsertUpdates(Updates, DT);
+ }
+
+ // Finish updating dominator tree and memory ssa for full unswitch.
+ if (FullUnswitch) {
+ if (MSSAU) {
+ // Remove the cloned branch instruction.
+ ParentBB->getTerminator()->eraseFromParent();
+ // Create unconditional branch now.
+ BranchInst::Create(ContinueBB, ParentBB);
+ MSSAU->removeEdge(ParentBB, LoopExitBB);
+ }
+ DT.deleteEdge(ParentBB, LoopExitBB);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH, FullUnswitch);
+
+ // The constant we can replace all of our invariants with inside the loop
+ // body. If any of the invariants have a value other than this the loop won't
+ // be entered.
+ ConstantInt *Replacement = ExitDirection
+ ? ConstantInt::getFalse(BI.getContext())
+ : ConstantInt::getTrue(BI.getContext());
+
+ // Since this is an i1 condition we can also trivially replace uses of it
+ // within the loop with a constant.
+ for (Value *Invariant : Invariants)
+ replaceLoopInvariantUses(L, Invariant, *Replacement);
+
+ // If this was full unswitching, we may have changed the nesting relationship
+ // for this loop so hoist it to its correct parent if needed.
+ if (FullUnswitch)
+ hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n");
+ ++NumTrivial;
+ ++NumBranches;
+ return true;
+}
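+
+// Hand-written before/after sketch (all names invented) of the full trivial
+// unswitch performed above, with %inv loop invariant and the exit on the true
+// edge:
+//
+//   ; Before
+//   preheader:
+//     br label %header
+//   header:
+//     ...
+//     br i1 %inv, label %exit, label %latch
+//
+//   ; After
+//   preheader:                       ; old preheader now holds the branch
+//     br i1 %inv, label %exit, label %header.ph
+//   header.ph:                       ; new preheader created by SplitEdge
+//     br label %header
+//   header:
+//     ...
+//     br label %latch                ; in-loop branch made unconditional
+//
+// (If %exit had other predecessors it would first be split so the unswitched
+// edge gets its own block.) Every use of %inv inside the loop is then replaced
+// with false, the only value it can have once the loop is entered.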
+
+/// Unswitch a trivial switch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the switch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and that at least one of the successors is a loop
+/// exit. This allows us to unswitch without duplicating the loop, making it
+/// trivial.
+///
+/// If this routine fails to unswitch the switch it returns false.
+///
+/// If the switch can be unswitched, this routine splits the preheader and
+/// copies the switch above that split. If the default case is one of the
+/// exiting cases, it copies the non-exiting cases and points them at the new
+/// preheader. If the default case is not exiting, it copies the exiting cases
+/// and points the default at the preheader. It preserves loop simplified form
+/// (splitting the exit blocks as necessary). It simplifies the switch within
+/// the loop by removing now-dead cases. If the default case is one of those
+/// unswitched, it replaces its destination with a new basic block containing
+/// only unreachable. Such basic blocks, while technically loop exits, are not
+/// considered for unswitching so this is a stable transform and the same
+/// switch will not be revisited. If after unswitching there is only a single
+/// in-loop successor, the switch is further simplified to an unconditional
+/// branch. Still more cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
+static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
+ LoopInfo &LI, ScalarEvolution *SE,
+ MemorySSAUpdater *MSSAU) {
+ LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ Value *LoopCond = SI.getCondition();
+
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ auto *ParentBB = SI.getParent();
+
+ // The same check must be used both for the default and the exit cases. We
+ // should never leave edges from the switch instruction to a basic block that
+ // we are unswitching, hence the condition used to determine the default case
+ // needs to also be used to populate ExitCaseIndices, which is then used to
+ // remove cases from the switch.
+ auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) {
+ // BBToCheck is not an exit block if it is inside loop L.
+ if (L.contains(&BBToCheck))
+ return false;
+ // BBToCheck is not trivial to unswitch if its phis aren't loop invariant.
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, BBToCheck))
+ return false;
+ // We do not unswitch a block that only has an unreachable statement, as
+ // it's possible this is a previously unswitched block. Only unswitch if
+ // either the terminator is not unreachable, or, if it is, it's not the only
+ // instruction in the block.
+ auto *TI = BBToCheck.getTerminator();
+ bool isUnreachable = isa<UnreachableInst>(TI);
+ return !isUnreachable ||
+ (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI));
+ };
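+
+  // For example (invented IR), a destination consisting solely of
+  // `unreachable`, the marker left behind for a previously unswitched default
+  // case, is rejected here, while an exit block such as
+  //
+  //   exit:
+  //     call void @trap()
+  //     unreachable
+  //
+  // is still a candidate because the terminator is not its only instruction.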
+
+ SmallVector<int, 4> ExitCaseIndices;
+ for (auto Case : SI.cases())
+ if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor()))
+ ExitCaseIndices.push_back(Case.getCaseIndex());
+ BasicBlock *DefaultExitBB = nullptr;
+ SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight =
+ SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, 0);
+ if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) {
+ DefaultExitBB = SI.getDefaultDest();
+ } else if (ExitCaseIndices.empty())
+ return false;
+
+ LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n");
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // We may need to invalidate SCEVs for the outermost loop reached by any of
+ // the exits.
+ Loop *OuterL = &L;
+
+ if (DefaultExitBB) {
+ // Clear out the default destination temporarily to allow accurate
+ // predecessor lists to be examined below.
+ SI.setDefaultDest(nullptr);
+ // Check the loop containing this exit.
+ Loop *ExitL = LI.getLoopFor(DefaultExitBB);
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
+ }
+
+ // Store the exit cases into a separate data structure and remove them from
+ // the switch.
+ SmallVector<std::tuple<ConstantInt *, BasicBlock *,
+ SwitchInstProfUpdateWrapper::CaseWeightOpt>,
+ 4> ExitCases;
+ ExitCases.reserve(ExitCaseIndices.size());
+ SwitchInstProfUpdateWrapper SIW(SI);
+ // We walk the case indices backwards so that we remove the last case first
+ // and don't disrupt the earlier indices.
+ for (unsigned Index : reverse(ExitCaseIndices)) {
+ auto CaseI = SI.case_begin() + Index;
+ // Compute the outer loop from this exit.
+ Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
+ // Save the value of this case.
+ auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex());
+ ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W);
+ // Delete the unswitched cases.
+ SIW.removeCase(CaseI);
+ }
+
+ if (SE) {
+ if (OuterL)
+ SE->forgetLoop(OuterL);
+ else
+ SE->forgetTopmostLoop(&L);
+ }
+
+ // Check if after this all of the remaining cases point at the same
+ // successor.
+ BasicBlock *CommonSuccBB = nullptr;
+ if (SI.getNumCases() > 0 &&
all_of(drop_begin(SI.cases()), [&SI](const SwitchInst::CaseHandle &Case) {
return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor();
}))
- CommonSuccBB = SI.case_begin()->getCaseSuccessor();
- if (!DefaultExitBB) {
- // If we're not unswitching the default, it must also branch to the common
- // successor of the cases for there to be one; if there are no cases at all,
- // the default destination itself is the common successor.
- if (SI.getNumCases() == 0)
- CommonSuccBB = SI.getDefaultDest();
- else if (SI.getDefaultDest() != CommonSuccBB)
- CommonSuccBB = nullptr;
- }
-
- // Split the preheader, so that we know that there is a safe place to insert
- // the switch.
- BasicBlock *OldPH = L.getLoopPreheader();
- BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
- OldPH->getTerminator()->eraseFromParent();
-
- // Now add the unswitched switch.
- auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
- SwitchInstProfUpdateWrapper NewSIW(*NewSI);
-
- // Rewrite the IR for the unswitched basic blocks. This requires two steps.
- // First, we split any exit blocks with remaining in-loop predecessors. Then
- // we update the PHIs in one of two ways depending on if there was a split.
- // We walk in reverse so that we split in the same order as the cases
- // appeared. This is purely for convenience of reading the resulting IR, but
- // it doesn't cost anything really.
- SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
- SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
- // Handle the default exit if necessary.
- // FIXME: It'd be great if we could merge this with the loop below but LLVM's
- // ranges aren't quite powerful enough yet.
- if (DefaultExitBB) {
- if (pred_empty(DefaultExitBB)) {
- UnswitchedExitBBs.insert(DefaultExitBB);
- rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
- } else {
- auto *SplitBB =
- SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI, MSSAU);
- rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
- *ParentBB, *OldPH,
- /*FullUnswitch*/ true);
- DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
- }
- }
- // Note that we must use a reference in the for loop so that we update the
- // container.
- for (auto &ExitCase : reverse(ExitCases)) {
- // Grab a reference to the exit block in the pair so that we can update it.
- BasicBlock *ExitBB = std::get<1>(ExitCase);
-
- // If this case is the last edge into the exit block, we can simply reuse it
- // as it will no longer be a loop exit. No mapping necessary.
- if (pred_empty(ExitBB)) {
- // Only rewrite once.
- if (UnswitchedExitBBs.insert(ExitBB).second)
- rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
- continue;
- }
-
- // Otherwise we need to split the exit block so that we retain an exit
- // block from the loop and a target for the unswitched condition.
- BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
- if (!SplitExitBB) {
- // If this is the first time we see this, do the split and remember it.
- SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
- rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
- *ParentBB, *OldPH,
- /*FullUnswitch*/ true);
- }
- // Update the case pair to point to the split block.
- std::get<1>(ExitCase) = SplitExitBB;
- }
-
- // Now add the unswitched cases. We do this in reverse order as we built them
- // in reverse order.
- for (auto &ExitCase : reverse(ExitCases)) {
- ConstantInt *CaseVal = std::get<0>(ExitCase);
- BasicBlock *UnswitchedBB = std::get<1>(ExitCase);
-
- NewSIW.addCase(CaseVal, UnswitchedBB, std::get<2>(ExitCase));
- }
-
- // If the default was unswitched, re-point it and add explicit cases for
- // entering the loop.
- if (DefaultExitBB) {
- NewSIW->setDefaultDest(DefaultExitBB);
- NewSIW.setSuccessorWeight(0, DefaultCaseWeight);
-
- // We removed all the exit cases, so we just copy the cases to the
- // unswitched switch.
- for (const auto &Case : SI.cases())
- NewSIW.addCase(Case.getCaseValue(), NewPH,
- SIW.getSuccessorWeight(Case.getSuccessorIndex()));
- } else if (DefaultCaseWeight) {
- // We have to set branch weight of the default case.
- uint64_t SW = *DefaultCaseWeight;
- for (const auto &Case : SI.cases()) {
- auto W = SIW.getSuccessorWeight(Case.getSuccessorIndex());
- assert(W &&
- "case weight must be defined as default case weight is defined");
- SW += *W;
- }
- NewSIW.setSuccessorWeight(0, SW);
- }
-
- // If we ended up with a common successor for every path through the switch
- // after unswitching, rewrite it to an unconditional branch to make it easy
- // to recognize. Otherwise we potentially have to recognize the default case
- // pointing at unreachable and other complexity.
- if (CommonSuccBB) {
- BasicBlock *BB = SI.getParent();
- // We may have had multiple edges to this common successor block, so remove
- // them as predecessors. We skip the first one, either the default or the
- // actual first case.
- bool SkippedFirst = DefaultExitBB == nullptr;
- for (auto Case : SI.cases()) {
- assert(Case.getCaseSuccessor() == CommonSuccBB &&
- "Non-common successor!");
- (void)Case;
- if (!SkippedFirst) {
- SkippedFirst = true;
- continue;
- }
- CommonSuccBB->removePredecessor(BB,
- /*KeepOneInputPHIs*/ true);
- }
- // Now nuke the switch and replace it with a direct branch.
- SIW.eraseFromParent();
- BranchInst::Create(CommonSuccBB, BB);
- } else if (DefaultExitBB) {
- assert(SI.getNumCases() > 0 &&
- "If we had no cases we'd have a common successor!");
- // Move the last case to the default successor. This is valid because, if
- // the default got unswitched, it cannot be reached. This has the advantage of
- // being simple and keeping the number of edges from this switch to
- // successors the same, and avoiding any PHI update complexity.
- auto LastCaseI = std::prev(SI.case_end());
-
- SI.setDefaultDest(LastCaseI->getCaseSuccessor());
- SIW.setSuccessorWeight(
- 0, SIW.getSuccessorWeight(LastCaseI->getSuccessorIndex()));
- SIW.removeCase(LastCaseI);
- }
-
- // Walk the unswitched exit blocks and the unswitched split blocks and update
- // the dominator tree based on the CFG edits. While we are walking unordered
- // containers here, the API for applyUpdates takes an unordered list of
- // updates and requires them to not contain duplicates.
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
- DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
- DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
- }
- for (auto SplitUnswitchedPair : SplitExitBBMap) {
- DTUpdates.push_back({DT.Delete, ParentBB, SplitUnswitchedPair.first});
- DTUpdates.push_back({DT.Insert, OldPH, SplitUnswitchedPair.second});
- }
-
- if (MSSAU) {
+ CommonSuccBB = SI.case_begin()->getCaseSuccessor();
+ if (!DefaultExitBB) {
+ // If we're not unswitching the default, it must also branch to the common
+ // successor of the cases for there to be one; if there are no cases at all,
+ // the default destination itself is the common successor.
+ if (SI.getNumCases() == 0)
+ CommonSuccBB = SI.getDefaultDest();
+ else if (SI.getDefaultDest() != CommonSuccBB)
+ CommonSuccBB = nullptr;
+ }
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the switch.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
+ OldPH->getTerminator()->eraseFromParent();
+
+ // Now add the unswitched switch.
+ auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
+ SwitchInstProfUpdateWrapper NewSIW(*NewSI);
+
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
+ SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
+ // Handle the default exit if necessary.
+ // FIXME: It'd be great if we could merge this with the loop below but LLVM's
+ // ranges aren't quite powerful enough yet.
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI, MSSAU);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH,
+ /*FullUnswitch*/ true);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
+ }
+ // Note that we must use a reference in the for loop so that we update the
+ // container.
+ for (auto &ExitCase : reverse(ExitCases)) {
+ // Grab a reference to the exit block in the pair so that we can update it.
+ BasicBlock *ExitBB = std::get<1>(ExitCase);
+
+ // If this case is the last edge into the exit block, we can simply reuse it
+ // as it will no longer be a loop exit. No mapping necessary.
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
+ continue;
+ }
+
+ // Otherwise we need to split the exit block so that we retain an exit
+ // block from the loop and a target for the unswitched condition.
+ BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
+ if (!SplitExitBB) {
+ // If this is the first time we see this, do the split and remember it.
+ SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH,
+ /*FullUnswitch*/ true);
+ }
+ // Update the case pair to point to the split block.
+ std::get<1>(ExitCase) = SplitExitBB;
+ }
+
+ // Now add the unswitched cases. We do this in reverse order as we built them
+ // in reverse order.
+ for (auto &ExitCase : reverse(ExitCases)) {
+ ConstantInt *CaseVal = std::get<0>(ExitCase);
+ BasicBlock *UnswitchedBB = std::get<1>(ExitCase);
+
+ NewSIW.addCase(CaseVal, UnswitchedBB, std::get<2>(ExitCase));
+ }
+
+ // If the default was unswitched, re-point it and add explicit cases for
+ // entering the loop.
+ if (DefaultExitBB) {
+ NewSIW->setDefaultDest(DefaultExitBB);
+ NewSIW.setSuccessorWeight(0, DefaultCaseWeight);
+
+ // We removed all the exit cases, so we just copy the cases to the
+ // unswitched switch.
+ for (const auto &Case : SI.cases())
+ NewSIW.addCase(Case.getCaseValue(), NewPH,
+ SIW.getSuccessorWeight(Case.getSuccessorIndex()));
+ } else if (DefaultCaseWeight) {
+ // We have to set branch weight of the default case.
+ uint64_t SW = *DefaultCaseWeight;
+ for (const auto &Case : SI.cases()) {
+ auto W = SIW.getSuccessorWeight(Case.getSuccessorIndex());
+ assert(W &&
+ "case weight must be defined as default case weight is defined");
+ SW += *W;
+ }
+ NewSIW.setSuccessorWeight(0, SW);
+ }
+
+ // If we ended up with a common successor for every path through the switch
+ // after unswitching, rewrite it to an unconditional branch to make it easy
+ // to recognize. Otherwise we potentially have to recognize the default case
+ // pointing at unreachable and other complexity.
+ if (CommonSuccBB) {
+ BasicBlock *BB = SI.getParent();
+ // We may have had multiple edges to this common successor block, so remove
+ // them as predecessors. We skip the first one, either the default or the
+ // actual first case.
+ bool SkippedFirst = DefaultExitBB == nullptr;
+ for (auto Case : SI.cases()) {
+ assert(Case.getCaseSuccessor() == CommonSuccBB &&
+ "Non-common successor!");
+ (void)Case;
+ if (!SkippedFirst) {
+ SkippedFirst = true;
+ continue;
+ }
+ CommonSuccBB->removePredecessor(BB,
+ /*KeepOneInputPHIs*/ true);
+ }
+ // Now nuke the switch and replace it with a direct branch.
+ SIW.eraseFromParent();
+ BranchInst::Create(CommonSuccBB, BB);
+ } else if (DefaultExitBB) {
+ assert(SI.getNumCases() > 0 &&
+ "If we had no cases we'd have a common successor!");
+ // Move the last case to the default successor. This is valid because, if
+ // the default got unswitched, it cannot be reached. This has the advantage of
+ // being simple and keeping the number of edges from this switch to
+ // successors the same, and avoiding any PHI update complexity.
+ auto LastCaseI = std::prev(SI.case_end());
+
+ SI.setDefaultDest(LastCaseI->getCaseSuccessor());
+ SIW.setSuccessorWeight(
+ 0, SIW.getSuccessorWeight(LastCaseI->getSuccessorIndex()));
+ SIW.removeCase(LastCaseI);
+ }
+
+ // Walk the unswitched exit blocks and the unswitched split blocks and update
+ // the dominator tree based on the CFG edits. While we are walking unordered
+ // containers here, the API for applyUpdates takes an unordered list of
+ // updates and requires them to not contain duplicates.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
+ }
+ for (auto SplitUnswitchedPair : SplitExitBBMap) {
+ DTUpdates.push_back({DT.Delete, ParentBB, SplitUnswitchedPair.first});
+ DTUpdates.push_back({DT.Insert, OldPH, SplitUnswitchedPair.second});
+ }
+
+ if (MSSAU) {
MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
} else {
DT.applyUpdates(DTUpdates);
- }
-
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-
- // We may have changed the nesting relationship for this loop so hoist it to
- // its correct parent if needed.
- hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- ++NumTrivial;
- ++NumSwitches;
- LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n");
- return true;
-}
-
-/// This routine scans the loop to find a branch or switch which occurs before
-/// any side effects occur. These can potentially be unswitched without
-/// duplicating the loop. If a branch or switch is successfully unswitched the
-/// scanning continues to see if subsequent branches or switches have become
-/// trivial. Once all trivial candidates have been unswitched, this routine
-/// returns.
-///
-/// The return value indicates whether anything was unswitched (and therefore
-/// changed).
-///
-/// If `SE` is not null, it will be updated based on the potential loop SCEVs
-/// invalidated by this.
-static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
- LoopInfo &LI, ScalarEvolution *SE,
- MemorySSAUpdater *MSSAU) {
- bool Changed = false;
-
- // If the loop header has only one reachable successor, we should keep looking
- // for trivial condition candidates in the successor as well. An alternative is
- // to constant fold conditions and merge successors into loop header (then we
- // only need to check header's terminator). The reason for not doing this in
- // LoopUnswitch pass is that it could potentially break LoopPassManager's
- // invariants. Folding dead branches could either eliminate the current loop
- // or make other loops unreachable. LCSSA form might also not be preserved
- // after deleting branches. The following code keeps traversing loop header's
- // successors until it finds the trivial condition candidate (condition that
- // is not a constant). Since unswitching generates branches with constant
- // conditions, this scenario could be very common in practice.
- BasicBlock *CurrentBB = L.getHeader();
- SmallPtrSet<BasicBlock *, 8> Visited;
- Visited.insert(CurrentBB);
- do {
- // Check if there are any side-effecting instructions (e.g. stores, calls,
- // volatile loads) in the part of the loop that the code *would* execute
- // without unswitching.
- if (MSSAU) // Possible early exit with MSSA
- if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(CurrentBB))
- if (!isa<MemoryPhi>(*Defs->begin()) || (++Defs->begin() != Defs->end()))
- return Changed;
- if (llvm::any_of(*CurrentBB,
- [](Instruction &I) { return I.mayHaveSideEffects(); }))
- return Changed;
-
- Instruction *CurrentTerm = CurrentBB->getTerminator();
-
- if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
- // Don't bother trying to unswitch past a switch with a constant
- // condition. This should be removed prior to running this pass by
- // simplify-cfg.
- if (isa<Constant>(SI->getCondition()))
- return Changed;
-
- if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU))
- // Couldn't unswitch this one so we're done.
- return Changed;
-
- // Mark that we managed to unswitch something.
- Changed = true;
-
- // If unswitching turned the terminator into an unconditional branch then
- // we can continue. The unswitching logic specifically works to fold any
- // cases it can into an unconditional branch to make it easier to
- // recognize here.
- auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
- if (!BI || BI->isConditional())
- return Changed;
-
- CurrentBB = BI->getSuccessor(0);
- continue;
- }
-
- auto *BI = dyn_cast<BranchInst>(CurrentTerm);
- if (!BI)
- // We do not understand other terminator instructions.
- return Changed;
-
- // Don't bother trying to unswitch past an unconditional branch or a branch
- // with a constant value. These should be removed by simplify-cfg prior to
- // running this pass.
- if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
- return Changed;
-
- // Found a trivial condition candidate: non-foldable conditional branch. If
- // we fail to unswitch this, we can't do anything else that is trivial.
- if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU))
- return Changed;
-
- // Mark that we managed to unswitch something.
- Changed = true;
-
- // If we only unswitched some of the conditions feeding the branch, we won't
- // have collapsed it to a single successor.
- BI = cast<BranchInst>(CurrentBB->getTerminator());
- if (BI->isConditional())
- return Changed;
-
- // Follow the newly unconditional branch into its successor.
- CurrentBB = BI->getSuccessor(0);
-
- // When continuing, if we exit the loop or reach a previously visited block,
- // then we cannot reach any trivial condition candidates (unfoldable
- // branch instructions or switch instructions) and no unswitch can happen.
- } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
-
- return Changed;
-}
-
-/// Build the cloned blocks for an unswitched copy of the given loop.
-///
-/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
-/// after the split block (`SplitBB`) that will be used to select between the
-/// cloned and original loop.
-///
-/// This routine handles cloning all of the necessary loop blocks and exit
-/// blocks including rewriting their instructions and the relevant PHI nodes.
-/// Any loop blocks or exit blocks which are dominated by a different successor
-/// than the one for this clone of the loop blocks can be trivially skipped. We
-/// use the `DominatingSucc` map to determine whether a block satisfies that
-/// property with a simple map lookup.
-///
-/// It also correctly creates the unconditional branch in the cloned
-/// unswitched parent block to only point at the unswitched successor.
-///
-/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
-/// block splitting is correctly reflected in `LoopInfo`; essentially all of
-/// the cloned blocks (and their loops) are left without full `LoopInfo`
-/// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
-/// blocks to it but doesn't create the cloned `DominatorTree` structure and
-/// instead the caller must recompute an accurate DT. It *does* correctly
-/// update the `AssumptionCache` provided in `AC`.
-static BasicBlock *buildClonedLoopBlocks(
- Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
- ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
- BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
- const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
- ValueToValueMapTy &VMap,
- SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
- DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
- SmallVector<BasicBlock *, 4> NewBlocks;
- NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
-
- // We will need to clone a bunch of blocks, wrap up the clone operation in
- // a helper.
- auto CloneBlock = [&](BasicBlock *OldBB) {
- // Clone the basic block and insert it before the new preheader.
- BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
- NewBB->moveBefore(LoopPH);
-
- // Record this block and the mapping.
- NewBlocks.push_back(NewBB);
- VMap[OldBB] = NewBB;
-
- return NewBB;
- };
-
- // We skip cloning blocks when they have a dominating succ that is not the
- // succ we are cloning for.
- auto SkipBlock = [&](BasicBlock *BB) {
- auto It = DominatingSucc.find(BB);
- return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
- };
-
- // First, clone the preheader.
- auto *ClonedPH = CloneBlock(LoopPH);
-
- // Then clone all the loop blocks, skipping the ones that aren't necessary.
- for (auto *LoopBB : L.blocks())
- if (!SkipBlock(LoopBB))
- CloneBlock(LoopBB);
-
- // Split all the loop exit edges so that when we clone the exit blocks, if
- // any of the exit blocks are *also* a preheader for some other loop, we
- // don't create multiple predecessors entering the loop header.
- for (auto *ExitBB : ExitBlocks) {
- if (SkipBlock(ExitBB))
- continue;
-
- // When we are going to clone an exit, we don't need to clone all the
- // instructions in the exit block and we want to ensure we have an easy
- // place to merge the CFG, so split the exit first. This is always safe to
- // do because there cannot be any non-loop predecessors of a loop exit in
- // loop simplified form.
- auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
-
- // Rearrange the names to make it easier to write test cases by having the
- // exit block carry the suffix rather than the merge block carrying the
- // suffix.
- MergeBB->takeName(ExitBB);
- ExitBB->setName(Twine(MergeBB->getName()) + ".split");
-
- // Now clone the original exit block.
- auto *ClonedExitBB = CloneBlock(ExitBB);
- assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
- "Exit block should have been split to have one successor!");
- assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
- "Cloned exit block has the wrong successor!");
-
- // Remap any cloned instructions and create a merge phi node for them.
- for (auto ZippedInsts : llvm::zip_first(
- llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
- llvm::make_range(ClonedExitBB->begin(),
- std::prev(ClonedExitBB->end())))) {
- Instruction &I = std::get<0>(ZippedInsts);
- Instruction &ClonedI = std::get<1>(ZippedInsts);
-
- // The only instructions in the exit block should be PHI nodes and
- // potentially a landing pad.
- assert(
- (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
- "Bad instruction in exit block!");
- // We should have a value map between the instruction and its clone.
- assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
-
- auto *MergePN =
- PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
- &*MergeBB->getFirstInsertionPt());
- I.replaceAllUsesWith(MergePN);
- MergePN->addIncoming(&I, ExitBB);
- MergePN->addIncoming(&ClonedI, ClonedExitBB);
- }
- }
-
- // Rewrite the instructions in the cloned blocks to refer to the instructions
- // in the cloned blocks. We have to do this as a second pass so that we have
- // everything available. Also, we have inserted new instructions which may
- // include assume intrinsics, so we update the assumption cache while
- // processing this.
- for (auto *ClonedBB : NewBlocks)
- for (Instruction &I : *ClonedBB) {
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC.registerAssumption(II);
- }
-
- // Update any PHI nodes in the cloned successors of the skipped blocks to not
- // have spurious incoming values.
- for (auto *LoopBB : L.blocks())
- if (SkipBlock(LoopBB))
- for (auto *SuccBB : successors(LoopBB))
- if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
- for (PHINode &PN : ClonedSuccBB->phis())
- PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
-
- // Remove the cloned parent as a predecessor of any successor we ended up
- // cloning other than the unswitched one.
- auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
- for (auto *SuccBB : successors(ParentBB)) {
- if (SuccBB == UnswitchedSuccBB)
- continue;
-
- auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
- if (!ClonedSuccBB)
- continue;
-
- ClonedSuccBB->removePredecessor(ClonedParentBB,
- /*KeepOneInputPHIs*/ true);
- }
-
- // Replace the cloned branch with an unconditional branch to the cloned
- // unswitched successor.
- auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+ }
+
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ // We may have changed the nesting relationship for this loop so hoist it to
+ // its correct parent if needed.
+ hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ ++NumTrivial;
+ ++NumSwitches;
+ LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n");
+ return true;
+}
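+
+// Hand-written sketch (names invented) of the trivial switch unswitch above,
+// for an invariant %inv where only case 1 exits the loop and the default stays
+// inside it:
+//
+//   ; Before, inside the loop
+//   header:
+//     switch i32 %inv, label %latch [ i32 1, label %exit ]
+//
+//   ; After
+//   preheader:
+//     switch i32 %inv, label %header.ph [ i32 1, label %exit ]
+//   header.ph:
+//     br label %header
+//   header:
+//     br label %latch              ; only the non-exiting default remained
+//
+// When it is the default that exits, the new switch instead keeps the default
+// pointing at the exit and re-adds the remaining cases with %header.ph as
+// their destination, as the code above describes. Exit blocks with other
+// predecessors are split first so each unswitched edge has a dedicated target.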
+
+/// This routine scans the loop to find a branch or switch which occurs before
+/// any side effects occur. These can potentially be unswitched without
+/// duplicating the loop. If a branch or switch is successfully unswitched the
+/// scanning continues to see if subsequent branches or switches have become
+/// trivial. Once all trivial candidates have been unswitched, this routine
+/// returns.
+///
+/// The return value indicates whether anything was unswitched (and therefore
+/// changed).
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
+static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
+ LoopInfo &LI, ScalarEvolution *SE,
+ MemorySSAUpdater *MSSAU) {
+ bool Changed = false;
+
+ // If the loop header has only one reachable successor, we should keep looking
+ // for trivial condition candidates in the successor as well. An alternative is
+ // to constant fold conditions and merge successors into loop header (then we
+ // only need to check header's terminator). The reason for not doing this in
+ // LoopUnswitch pass is that it could potentially break LoopPassManager's
+ // invariants. Folding dead branches could either eliminate the current loop
+ // or make other loops unreachable. LCSSA form might also not be preserved
+ // after deleting branches. The following code keeps traversing loop header's
+ // successors until it finds the trivial condition candidate (condition that
+ // is not a constant). Since unswitching generates branches with constant
+ // conditions, this scenario could be very common in practice.
+ BasicBlock *CurrentBB = L.getHeader();
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ Visited.insert(CurrentBB);
+ do {
+ // Check if there are any side-effecting instructions (e.g. stores, calls,
+ // volatile loads) in the part of the loop that the code *would* execute
+ // without unswitching.
+ if (MSSAU) // Possible early exit with MSSA
+ if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(CurrentBB))
+ if (!isa<MemoryPhi>(*Defs->begin()) || (++Defs->begin() != Defs->end()))
+ return Changed;
+ if (llvm::any_of(*CurrentBB,
+ [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return Changed;
+
+ Instruction *CurrentTerm = CurrentBB->getTerminator();
+
+ if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // Don't bother trying to unswitch past a switch with a constant
+ // condition. This should be removed prior to running this pass by
+ // simplify-cfg.
+ if (isa<Constant>(SI->getCondition()))
+ return Changed;
+
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU))
+ // Couldn't unswitch this one so we're done.
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If unswitching turned the terminator into an unconditional branch then
+ // we can continue. The unswitching logic specifically works to fold any
+ // cases it can into an unconditional branch to make it easier to
+ // recognize here.
+ auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
+ if (!BI || BI->isConditional())
+ return Changed;
+
+ CurrentBB = BI->getSuccessor(0);
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+ if (!BI)
+ // We do not understand other terminator instructions.
+ return Changed;
+
+ // Don't bother trying to unswitch past an unconditional branch or a branch
+ // with a constant value. These should be removed by simplify-cfg prior to
+ // running this pass.
+ if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ return Changed;
+
+ // Found a trivial condition candidate: non-foldable conditional branch. If
+ // we fail to unswitch this, we can't do anything else that is trivial.
+ if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU))
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If we only unswitched some of the conditions feeding the branch, we won't
+ // have collapsed it to a single successor.
+ BI = cast<BranchInst>(CurrentBB->getTerminator());
+ if (BI->isConditional())
+ return Changed;
+
+ // Follow the newly unconditional branch into its successor.
+ CurrentBB = BI->getSuccessor(0);
+
+ // When continuing, if we exit the loop or reach a previously visited block,
+ // then we cannot reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch can happen.
+ } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
+
+ return Changed;
+}
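+
+// Invented example of the walk above: once a header branch has been trivially
+// unswitched it is left as `br label %bb1`, so the do/while loop follows that
+// edge and, provided %bb1 is still inside the loop, has not been visited, and
+// contains no side-effecting instructions or extra MemorySSA defs, considers
+// %bb1's own terminator as the next trivial unswitching candidate.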
+
+/// Build the cloned blocks for an unswitched copy of the given loop.
+///
+/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
+/// after the split block (`SplitBB`) that will be used to select between the
+/// cloned and original loop.
+///
+/// This routine handles cloning all of the necessary loop blocks and exit
+/// blocks including rewriting their instructions and the relevant PHI nodes.
+/// Any loop blocks or exit blocks which are dominated by a different successor
+/// than the one for this clone of the loop blocks can be trivially skipped. We
+/// use the `DominatingSucc` map to determine whether a block satisfies that
+/// property with a simple map lookup.
+///
+/// It also correctly creates the unconditional branch in the cloned
+/// unswitched parent block to only point at the unswitched successor.
+///
+/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
+/// block splitting is correctly reflected in `LoopInfo`; essentially all of
+/// the cloned blocks (and their loops) are left without full `LoopInfo`
+/// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
+/// blocks to it but doesn't create the cloned `DominatorTree` structure and
+/// instead the caller must recompute an accurate DT. It *does* correctly
+/// update the `AssumptionCache` provided in `AC`.
+static BasicBlock *buildClonedLoopBlocks(
+ Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
+ ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
+ BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
+ const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
+ ValueToValueMapTy &VMap,
+ SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
+ DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
+ SmallVector<BasicBlock *, 4> NewBlocks;
+ NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
+
+ // We will need to clone a bunch of blocks, wrap up the clone operation in
+ // a helper.
+ auto CloneBlock = [&](BasicBlock *OldBB) {
+ // Clone the basic block and insert it before the new preheader.
+ BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
+ NewBB->moveBefore(LoopPH);
+
+ // Record this block and the mapping.
+ NewBlocks.push_back(NewBB);
+ VMap[OldBB] = NewBB;
+
+ return NewBB;
+ };
+
+ // We skip cloning blocks when they have a dominating succ that is not the
+ // succ we are cloning for.
+ auto SkipBlock = [&](BasicBlock *BB) {
+ auto It = DominatingSucc.find(BB);
+ return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
+ };
+
+ // First, clone the preheader.
+ auto *ClonedPH = CloneBlock(LoopPH);
+
+ // Then clone all the loop blocks, skipping the ones that aren't necessary.
+ for (auto *LoopBB : L.blocks())
+ if (!SkipBlock(LoopBB))
+ CloneBlock(LoopBB);
+
+ // Split all the loop exit edges so that when we clone the exit blocks, if
+ // any of the exit blocks are *also* a preheader for some other loop, we
+ // don't create multiple predecessors entering the loop header.
+ for (auto *ExitBB : ExitBlocks) {
+ if (SkipBlock(ExitBB))
+ continue;
+
+ // When we are going to clone an exit, we don't need to clone all the
+ // instructions in the exit block and we want to ensure we have an easy
+ // place to merge the CFG, so split the exit first. This is always safe to
+ // do because there cannot be any non-loop predecessors of a loop exit in
+ // loop simplified form.
+ auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
+
+ // Rearrange the names to make it easier to write test cases by having the
+ // exit block carry the suffix rather than the merge block carrying the
+ // suffix.
+ MergeBB->takeName(ExitBB);
+ ExitBB->setName(Twine(MergeBB->getName()) + ".split");
+
+ // Now clone the original exit block.
+ auto *ClonedExitBB = CloneBlock(ExitBB);
+ assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
+ "Cloned exit block has the wrong successor!");
+
+ // Remap any cloned instructions and create a merge phi node for them.
+ for (auto ZippedInsts : llvm::zip_first(
+ llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
+ llvm::make_range(ClonedExitBB->begin(),
+ std::prev(ClonedExitBB->end())))) {
+ Instruction &I = std::get<0>(ZippedInsts);
+ Instruction &ClonedI = std::get<1>(ZippedInsts);
+
+ // The only instructions in the exit block should be PHI nodes and
+ // potentially a landing pad.
+ assert(
+ (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
+ "Bad instruction in exit block!");
+ // We should have a value map between the instruction and its clone.
+ assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
+
+ auto *MergePN =
+ PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
+ &*MergeBB->getFirstInsertionPt());
+ I.replaceAllUsesWith(MergePN);
+ MergePN->addIncoming(&I, ExitBB);
+ MergePN->addIncoming(&ClonedI, ClonedExitBB);
+ }
+ }
+
+ // Rewrite the instructions in the cloned blocks to refer to the instructions
+ // in the cloned blocks. We have to do this as a second pass so that we have
+ // everything available. Also, we have inserted new instructions which may
+ // include assume intrinsics, so we update the assumption cache while
+ // processing this.
+ for (auto *ClonedBB : NewBlocks)
+ for (Instruction &I : *ClonedBB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC.registerAssumption(II);
+ }
+
+ // Update any PHI nodes in the cloned successors of the skipped blocks to not
+ // have spurious incoming values.
+ for (auto *LoopBB : L.blocks())
+ if (SkipBlock(LoopBB))
+ for (auto *SuccBB : successors(LoopBB))
+ if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
+ for (PHINode &PN : ClonedSuccBB->phis())
+ PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+
+ // Remove the cloned parent as a predecessor of any successor we ended up
+ // cloning other than the unswitched one.
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+ for (auto *SuccBB : successors(ParentBB)) {
+ if (SuccBB == UnswitchedSuccBB)
+ continue;
+
+ auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
+ if (!ClonedSuccBB)
+ continue;
+
+ ClonedSuccBB->removePredecessor(ClonedParentBB,
+ /*KeepOneInputPHIs*/ true);
+ }
+
+ // Replace the cloned branch with an unconditional branch to the cloned
+ // unswitched successor.
+ auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
Instruction *ClonedTerminator = ClonedParentBB->getTerminator();
// Trivial Simplification. If Terminator is a conditional branch and
// condition becomes dead - erase it.
@@ -1150,946 +1150,946 @@ static BasicBlock *buildClonedLoopBlocks(
ClonedConditionToErase = SI->getCondition();
ClonedTerminator->eraseFromParent();
- BranchInst::Create(ClonedSuccBB, ClonedParentBB);
-
+ BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
if (ClonedConditionToErase)
RecursivelyDeleteTriviallyDeadInstructions(ClonedConditionToErase, nullptr,
MSSAU);
- // If there are duplicate entries in the PHI nodes because of multiple edges
- // to the unswitched successor, we need to nuke all but one as we replaced it
- // with a direct branch.
- for (PHINode &PN : ClonedSuccBB->phis()) {
- bool Found = false;
- // Loop over the incoming operands backwards so we can easily delete as we
- // go without invalidating the index.
- for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
- if (PN.getIncomingBlock(i) != ClonedParentBB)
- continue;
- if (!Found) {
- Found = true;
- continue;
- }
- PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
- }
- }
-
- // Record the domtree updates for the new blocks.
- SmallPtrSet<BasicBlock *, 4> SuccSet;
- for (auto *ClonedBB : NewBlocks) {
- for (auto *SuccBB : successors(ClonedBB))
- if (SuccSet.insert(SuccBB).second)
- DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
- SuccSet.clear();
- }
-
- return ClonedPH;
-}
-
-/// Recursively clone the specified loop and all of its children.
-///
-/// The target parent loop for the clone should be provided, or can be null if
-/// the clone is a top-level loop. While cloning, all the blocks are mapped
-/// with the provided value map. The entire original loop must be present in
-/// the value map. The cloned loop is returned.
-static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
- const ValueToValueMapTy &VMap, LoopInfo &LI) {
- auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
- assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
- ClonedL.reserveBlocks(OrigL.getNumBlocks());
- for (auto *BB : OrigL.blocks()) {
- auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
- ClonedL.addBlockEntry(ClonedBB);
- if (LI.getLoopFor(BB) == &OrigL)
- LI.changeLoopFor(ClonedBB, &ClonedL);
- }
- };
-
- // We specially handle the first loop because it may get cloned into
- // a different parent and because we most commonly are cloning leaf loops.
- Loop *ClonedRootL = LI.AllocateLoop();
- if (RootParentL)
- RootParentL->addChildLoop(ClonedRootL);
- else
- LI.addTopLevelLoop(ClonedRootL);
- AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
-
+ // If there are duplicate entries in the PHI nodes because of multiple edges
+ // to the unswitched successor, we need to nuke all but one as we replaced it
+ // with a direct branch.
+ for (PHINode &PN : ClonedSuccBB->phis()) {
+ bool Found = false;
+ // Loop over the incoming operands backwards so we can easily delete as we
+ // go without invalidating the index.
+ for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != ClonedParentBB)
+ continue;
+ if (!Found) {
+ Found = true;
+ continue;
+ }
+ PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
+ }
+ }
+
+ // Record the domtree updates for the new blocks.
+ SmallPtrSet<BasicBlock *, 4> SuccSet;
+ for (auto *ClonedBB : NewBlocks) {
+ for (auto *SuccBB : successors(ClonedBB))
+ if (SuccSet.insert(SuccBB).second)
+ DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
+ SuccSet.clear();
+ }
+
+ return ClonedPH;
+}
+
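// Illustrative only: a minimal, self-contained sketch (not LLVM code) of the
// "walk the incoming list backwards while erasing" pattern the PHI
// de-duplication loop above relies on. The PhiLike/Incoming names and the
// keepOneIncomingFrom helper are assumptions invented for this sketch.
#include <string>
#include <utility>
#include <vector>

struct PhiLike {
  // Each incoming entry pairs a predecessor name with an incoming value.
  std::vector<std::pair<std::string, int>> Incoming;
};

// Remove all but one incoming entry coming from Pred. Iterating from the back
// means erasing an element never shifts an index we still have to visit.
inline void keepOneIncomingFrom(PhiLike &PN, const std::string &Pred) {
  bool Found = false;
  for (int i = static_cast<int>(PN.Incoming.size()) - 1; i >= 0; --i) {
    if (PN.Incoming[i].first != Pred)
      continue;
    if (!Found) {
      Found = true; // Keep the first matching entry we encounter.
      continue;
    }
    PN.Incoming.erase(PN.Incoming.begin() + i);
  }
}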
+/// Recursively clone the specified loop and all of its children.
+///
+/// The target parent loop for the clone should be provided, or can be null if
+/// the clone is a top-level loop. While cloning, all the blocks are mapped
+/// with the provided value map. The entire original loop must be present in
+/// the value map. The cloned loop is returned.
+static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
+ const ValueToValueMapTy &VMap, LoopInfo &LI) {
+ auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
+ assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
+ ClonedL.reserveBlocks(OrigL.getNumBlocks());
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
+ ClonedL.addBlockEntry(ClonedBB);
+ if (LI.getLoopFor(BB) == &OrigL)
+ LI.changeLoopFor(ClonedBB, &ClonedL);
+ }
+ };
+
+ // We specially handle the first loop because it may get cloned into
+ // a different parent and because we most commonly are cloning leaf loops.
+ Loop *ClonedRootL = LI.AllocateLoop();
+ if (RootParentL)
+ RootParentL->addChildLoop(ClonedRootL);
+ else
+ LI.addTopLevelLoop(ClonedRootL);
+ AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
+
if (OrigRootL.isInnermost())
- return ClonedRootL;
-
- // If we have a nest, we can quickly clone the entire loop nest using an
- // iterative approach because it is a tree. We keep the cloned parent in the
- // data structure to avoid repeatedly querying through a map to find it.
- SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
- // Build up the loops to clone in reverse order as we'll clone them from the
- // back.
- for (Loop *ChildL : llvm::reverse(OrigRootL))
- LoopsToClone.push_back({ClonedRootL, ChildL});
- do {
- Loop *ClonedParentL, *L;
- std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
- Loop *ClonedL = LI.AllocateLoop();
- ClonedParentL->addChildLoop(ClonedL);
- AddClonedBlocksToLoop(*L, *ClonedL);
- for (Loop *ChildL : llvm::reverse(*L))
- LoopsToClone.push_back({ClonedL, ChildL});
- } while (!LoopsToClone.empty());
-
- return ClonedRootL;
-}
-
-/// Build the cloned loops of an original loop from unswitching.
-///
-/// Because unswitching simplifies the CFG of the loop, this isn't a trivial
-/// operation. We need to re-verify that there even is a loop (as the backedge
-/// may not have been cloned), and even if there are remaining backedges the
-/// backedge set may be different. However, we know that each child loop is
-/// undisturbed; we only need to find where to place each child loop within
-/// either any parent loop or within a cloned version of the original loop.
-///
-/// Because child loops may end up cloned outside of any cloned version of the
-/// original loop, multiple cloned sibling loops may be created. All of them
-/// are returned so that the newly introduced loop nest roots can be
-/// identified.
-static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
- const ValueToValueMapTy &VMap, LoopInfo &LI,
- SmallVectorImpl<Loop *> &NonChildClonedLoops) {
- Loop *ClonedL = nullptr;
-
- auto *OrigPH = OrigL.getLoopPreheader();
- auto *OrigHeader = OrigL.getHeader();
-
- auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
- auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
-
- // We need to know the loops of the cloned exit blocks to even compute the
- // accurate parent loop. If we only clone exits to some parent of the
- // original parent, we want to clone into that outer loop. We also keep track
- // of the loops that our cloned exit blocks participate in.
- Loop *ParentL = nullptr;
- SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
- SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
- ClonedExitsInLoops.reserve(ExitBlocks.size());
- for (auto *ExitBB : ExitBlocks)
- if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
- if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
- ExitLoopMap[ClonedExitBB] = ExitL;
- ClonedExitsInLoops.push_back(ClonedExitBB);
- if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
- ParentL = ExitL;
- }
- assert((!ParentL || ParentL == OrigL.getParentLoop() ||
- ParentL->contains(OrigL.getParentLoop())) &&
- "The computed parent loop should always contain (or be) the parent of "
- "the original loop.");
-
- // We build the set of blocks dominated by the cloned header from the set of
- // cloned blocks out of the original loop. While not all of these will
- // necessarily be in the cloned loop, it is enough to establish that they
- // aren't in unreachable cycles, etc.
- SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
- for (auto *BB : OrigL.blocks())
- if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
- ClonedLoopBlocks.insert(ClonedBB);
-
- // Rebuild the set of blocks that will end up in the cloned loop. We may have
- // skipped cloning some region of this loop which can in turn skip some of
- // the backedges so we have to rebuild the blocks in the loop based on the
- // backedges that remain after cloning.
- SmallVector<BasicBlock *, 16> Worklist;
- SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
- for (auto *Pred : predecessors(ClonedHeader)) {
- // The only possible non-loop header predecessor is the preheader because
- // we know we cloned the loop in simplified form.
- if (Pred == ClonedPH)
- continue;
-
- // Because the loop was in simplified form, the only non-loop predecessor
- // should be the preheader.
- assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
- "header other than the preheader "
- "that is not part of the loop!");
-
- // Insert this block into the loop set and on the first visit (and if it
- // isn't the header we're currently walking) put it into the worklist to
- // recurse through.
- if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
- Worklist.push_back(Pred);
- }
-
- // If we had any backedges then there *is* a cloned loop. Put the header into
- // the loop set and then walk the worklist backwards to find all the blocks
- // that remain within the loop after cloning.
- if (!BlocksInClonedLoop.empty()) {
- BlocksInClonedLoop.insert(ClonedHeader);
-
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.pop_back_val();
- assert(BlocksInClonedLoop.count(BB) &&
- "Didn't put block into the loop set!");
-
- // Insert any predecessors that are in the possible set into the cloned
- // set, and if the insert is successful, add them to the worklist. Note
- // that we filter on the blocks that are definitely reachable via the
- // backedge to the loop header so we may prune out dead code within the
- // cloned loop.
- for (auto *Pred : predecessors(BB))
- if (ClonedLoopBlocks.count(Pred) &&
- BlocksInClonedLoop.insert(Pred).second)
- Worklist.push_back(Pred);
- }
-
- ClonedL = LI.AllocateLoop();
- if (ParentL) {
- ParentL->addBasicBlockToLoop(ClonedPH, LI);
- ParentL->addChildLoop(ClonedL);
- } else {
- LI.addTopLevelLoop(ClonedL);
- }
- NonChildClonedLoops.push_back(ClonedL);
-
- ClonedL->reserveBlocks(BlocksInClonedLoop.size());
- // We don't want to just add the cloned loop blocks based on how we
- // discovered them. The original order of blocks was carefully built in
- // a way that doesn't rely on predecessor ordering. Rather than re-invent
- // that logic, we just re-walk the original blocks (and those of the child
- // loops) and filter them as we add them into the cloned loop.
- for (auto *BB : OrigL.blocks()) {
- auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
- if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
- continue;
-
- // Directly add the blocks that are only in this loop.
- if (LI.getLoopFor(BB) == &OrigL) {
- ClonedL->addBasicBlockToLoop(ClonedBB, LI);
- continue;
- }
-
- // We want to manually add it to this loop and parents.
- // Registering it with LoopInfo will happen when we clone the top
- // loop for this block.
- for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
- PL->addBlockEntry(ClonedBB);
- }
-
- // Now add each child loop whose header remains within the cloned loop. All
- // of the blocks within the loop must satisfy the same constraints as the
- // header so once we pass the header checks we can just clone the entire
- // child loop nest.
- for (Loop *ChildL : OrigL) {
- auto *ClonedChildHeader =
- cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
- if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
- continue;
-
-#ifndef NDEBUG
- // We should never have a cloned child loop header but fail to have
- // all of the blocks for that child loop.
- for (auto *ChildLoopBB : ChildL->blocks())
- assert(BlocksInClonedLoop.count(
- cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
- "Child cloned loop has a header within the cloned outer "
- "loop but not all of its blocks!");
-#endif
-
- cloneLoopNest(*ChildL, ClonedL, VMap, LI);
- }
- }
-
- // Now that we've handled all the components of the original loop that were
- // cloned into a new loop, we still need to handle anything from the original
- // loop that wasn't in a cloned loop.
-
- // Figure out what blocks are left to place within any loop nest containing
- // the unswitched loop. If we never formed a loop, the cloned PH is one of
- // them.
- SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
- if (BlocksInClonedLoop.empty())
- UnloopedBlockSet.insert(ClonedPH);
- for (auto *ClonedBB : ClonedLoopBlocks)
- if (!BlocksInClonedLoop.count(ClonedBB))
- UnloopedBlockSet.insert(ClonedBB);
-
- // Copy the cloned exits and sort them in ascending loop depth, we'll work
- // backwards across these to process them inside out. The order shouldn't
- // matter as we're just trying to build up the map from inside-out; we use
- // the map in a more stably ordered way below.
- auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
- llvm::sort(OrderedClonedExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
- return ExitLoopMap.lookup(LHS)->getLoopDepth() <
- ExitLoopMap.lookup(RHS)->getLoopDepth();
- });
-
- // Populate the existing ExitLoopMap with everything reachable from each
- // exit, starting from the inner most exit.
- while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
- assert(Worklist.empty() && "Didn't clear worklist!");
-
- BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
- Loop *ExitL = ExitLoopMap.lookup(ExitBB);
-
- // Walk the CFG back until we hit the cloned PH adding everything reachable
- // and in the unlooped set to this exit block's loop.
- Worklist.push_back(ExitBB);
- do {
- BasicBlock *BB = Worklist.pop_back_val();
- // We can stop recursing at the cloned preheader (if we get there).
- if (BB == ClonedPH)
- continue;
-
- for (BasicBlock *PredBB : predecessors(BB)) {
- // If this pred has already been moved to our set or is part of some
- // (inner) loop, no update needed.
- if (!UnloopedBlockSet.erase(PredBB)) {
- assert(
- (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
- "Predecessor not mapped to a loop!");
- continue;
- }
-
- // We just insert into the loop set here. We'll add these blocks to the
- // exit loop after we build up the set in an order that doesn't rely on
- // predecessor order (which in turn relies on use list order).
- bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
- (void)Inserted;
- assert(Inserted && "Should only visit an unlooped block once!");
-
- // And recurse through to its predecessors.
- Worklist.push_back(PredBB);
- }
- } while (!Worklist.empty());
- }
-
- // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
- // blocks to their outer loops, walk the cloned blocks and the cloned exits
- // in their original order adding them to the correct loop.
-
- // We need a stable insertion order. We use the order of the original loop
- // blocks and map each into the correct parent loop.
- for (auto *BB : llvm::concat<BasicBlock *const>(
- makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
- if (Loop *OuterL = ExitLoopMap.lookup(BB))
- OuterL->addBasicBlockToLoop(BB, LI);
-
-#ifndef NDEBUG
- for (auto &BBAndL : ExitLoopMap) {
- auto *BB = BBAndL.first;
- auto *OuterL = BBAndL.second;
- assert(LI.getLoopFor(BB) == OuterL &&
- "Failed to put all blocks into outer loops!");
- }
-#endif
-
- // Now that all the blocks are placed into the correct containing loop in the
- // absence of child loops, find all the potentially cloned child loops and
- // clone them into whatever outer loop we placed their header into.
- for (Loop *ChildL : OrigL) {
- auto *ClonedChildHeader =
- cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
- if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
- continue;
-
-#ifndef NDEBUG
- for (auto *ChildLoopBB : ChildL->blocks())
- assert(VMap.count(ChildLoopBB) &&
- "Cloned a child loop header but not all of that loops blocks!");
-#endif
-
- NonChildClonedLoops.push_back(cloneLoopNest(
- *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
- }
-}
-
-static void
-deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
- ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
- DominatorTree &DT, MemorySSAUpdater *MSSAU) {
- // Find all the dead clones, and remove them from their successors.
- SmallVector<BasicBlock *, 16> DeadBlocks;
- for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
- for (auto &VMap : VMaps)
- if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
- if (!DT.isReachableFromEntry(ClonedBB)) {
- for (BasicBlock *SuccBB : successors(ClonedBB))
- SuccBB->removePredecessor(ClonedBB);
- DeadBlocks.push_back(ClonedBB);
- }
-
- // Remove all MemorySSA in the dead blocks
- if (MSSAU) {
- SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(),
- DeadBlocks.end());
- MSSAU->removeBlocks(DeadBlockSet);
- }
-
- // Drop any remaining references to break cycles.
- for (BasicBlock *BB : DeadBlocks)
- BB->dropAllReferences();
- // Erase them from the IR.
- for (BasicBlock *BB : DeadBlocks)
- BB->eraseFromParent();
-}
-
-static void deleteDeadBlocksFromLoop(Loop &L,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI,
- MemorySSAUpdater *MSSAU) {
- // Find all the dead blocks tied to this loop, and remove them from their
- // successors.
- SmallSetVector<BasicBlock *, 8> DeadBlockSet;
-
- // Start with loop/exit blocks and get a transitive closure of reachable dead
- // blocks.
- SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(),
- ExitBlocks.end());
- DeathCandidates.append(L.blocks().begin(), L.blocks().end());
- while (!DeathCandidates.empty()) {
- auto *BB = DeathCandidates.pop_back_val();
- if (!DeadBlockSet.count(BB) && !DT.isReachableFromEntry(BB)) {
- for (BasicBlock *SuccBB : successors(BB)) {
- SuccBB->removePredecessor(BB);
- DeathCandidates.push_back(SuccBB);
- }
- DeadBlockSet.insert(BB);
- }
- }
-
- // Remove all MemorySSA in the dead blocks
- if (MSSAU)
- MSSAU->removeBlocks(DeadBlockSet);
-
- // Filter out the dead blocks from the exit blocks list so that it can be
- // used in the caller.
- llvm::erase_if(ExitBlocks,
- [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
-
- // Walk from this loop up through its parents removing all of the dead blocks.
- for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
- for (auto *BB : DeadBlockSet)
- ParentL->getBlocksSet().erase(BB);
- llvm::erase_if(ParentL->getBlocksVector(),
- [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
- }
-
- // Now delete the dead child loops. This raw delete will clear them
- // recursively.
- llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
- if (!DeadBlockSet.count(ChildL->getHeader()))
- return false;
-
- assert(llvm::all_of(ChildL->blocks(),
- [&](BasicBlock *ChildBB) {
- return DeadBlockSet.count(ChildBB);
- }) &&
- "If the child loop header is dead all blocks in the child loop must "
- "be dead as well!");
- LI.destroy(ChildL);
- return true;
- });
-
- // Remove the loop mappings for the dead blocks and drop all the references
- // from these blocks to others to handle cyclic references as we start
- // deleting the blocks themselves.
- for (auto *BB : DeadBlockSet) {
- // Check that the dominator tree has already been updated.
- assert(!DT.getNode(BB) && "Should already have cleared domtree!");
- LI.changeLoopFor(BB, nullptr);
- // Drop all uses of the instructions to make sure we won't have dangling
- // uses in other blocks.
- for (auto &I : *BB)
- if (!I.use_empty())
- I.replaceAllUsesWith(UndefValue::get(I.getType()));
- BB->dropAllReferences();
- }
-
- // Actually delete the blocks now that they've been fully unhooked from the
- // IR.
- for (auto *BB : DeadBlockSet)
- BB->eraseFromParent();
-}
-
-/// Recompute the set of blocks in a loop after unswitching.
-///
-/// This walks from the original header's predecessors to rebuild the loop. We
-/// take advantage of the fact that new blocks can't have been added, and so we
-/// filter by the original loop's blocks. This also handles potentially
-/// unreachable code that we don't want to explore but might be found examining
-/// the predecessors of the header.
-///
-/// If the original loop is no longer a loop, this will return an empty set. If
-/// it remains a loop, all the blocks within it will be added to the set
-/// (including those blocks in inner loops).
-static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
- LoopInfo &LI) {
- SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
-
- auto *PH = L.getLoopPreheader();
- auto *Header = L.getHeader();
-
- // A worklist to use while walking backwards from the header.
- SmallVector<BasicBlock *, 16> Worklist;
-
- // First walk the predecessors of the header to find the backedges. This will
- // form the basis of our walk.
- for (auto *Pred : predecessors(Header)) {
- // Skip the preheader.
- if (Pred == PH)
- continue;
-
- // Because the loop was in simplified form, the only non-loop predecessor
- // is the preheader.
- assert(L.contains(Pred) && "Found a predecessor of the loop header other "
- "than the preheader that is not part of the "
- "loop!");
-
- // Insert this block into the loop set and, on the first visit (and if it
- // isn't the header we're currently walking), put it into the worklist to
- // recurse through.
- if (LoopBlockSet.insert(Pred).second && Pred != Header)
- Worklist.push_back(Pred);
- }
-
- // If no backedges were found, we're done.
- if (LoopBlockSet.empty())
- return LoopBlockSet;
-
- // We found backedges, recurse through them to identify the loop blocks.
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.pop_back_val();
- assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
-
- // No need to walk past the header.
- if (BB == Header)
- continue;
-
- // Because we know the inner loop structure remains valid we can use the
- // loop structure to jump immediately across the entire nested loop.
- // Further, because it is in loop simplified form, we can directly jump
- // to its preheader afterward.
- if (Loop *InnerL = LI.getLoopFor(BB))
- if (InnerL != &L) {
- assert(L.contains(InnerL) &&
- "Should not reach a loop *outside* this loop!");
- // The preheader is the only possible predecessor of the loop so
- // insert it into the set and check whether it was already handled.
- auto *InnerPH = InnerL->getLoopPreheader();
- assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
- "but not contain the inner loop "
- "preheader!");
- if (!LoopBlockSet.insert(InnerPH).second)
- // The only way to reach the preheader is through the loop body
- // itself so if it has been visited the loop is already handled.
- continue;
-
- // Insert all of the blocks (other than those already present) into
- // the loop set. We expect at least the block that led us to find the
- // inner loop to be in the block set, but we may also have other loop
- // blocks if they were already enqueued as predecessors of some other
- // outer loop block.
- for (auto *InnerBB : InnerL->blocks()) {
- if (InnerBB == BB) {
- assert(LoopBlockSet.count(InnerBB) &&
- "Block should already be in the set!");
- continue;
- }
-
- LoopBlockSet.insert(InnerBB);
- }
-
- // Add the preheader to the worklist so we will continue past the
- // loop body.
- Worklist.push_back(InnerPH);
- continue;
- }
-
- // Insert any predecessors that were in the original loop into the new
- // set, and if the insert is successful, add them to the worklist.
- for (auto *Pred : predecessors(BB))
- if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
- Worklist.push_back(Pred);
- }
-
- assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
-
- // We've found all the blocks participating in the loop, return our completed
- // set.
- return LoopBlockSet;
-}
-
-/// Rebuild a loop after unswitching removes some subset of blocks and edges.
-///
-/// The removal may have removed some child loops entirely but cannot have
-/// disturbed any remaining child loops. However, they may need to be hoisted
-/// to the parent loop (or to be top-level loops). The original loop may be
-/// completely removed.
-///
-/// The sibling loops resulting from this update are returned. If the original
-/// loop remains a valid loop, it will be the first entry in this list with all
-/// of the newly sibling loops following it.
-///
-/// Returns true if the loop remains a loop after unswitching, and false if it
-/// is no longer a loop after unswitching (and should not continue to be
-/// referenced).
-static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
- LoopInfo &LI,
- SmallVectorImpl<Loop *> &HoistedLoops) {
- auto *PH = L.getLoopPreheader();
-
- // Compute the actual parent loop from the exit blocks. Because we may have
- // pruned some exits the loop may be different from the original parent.
- Loop *ParentL = nullptr;
- SmallVector<Loop *, 4> ExitLoops;
- SmallVector<BasicBlock *, 4> ExitsInLoops;
- ExitsInLoops.reserve(ExitBlocks.size());
- for (auto *ExitBB : ExitBlocks)
- if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
- ExitLoops.push_back(ExitL);
- ExitsInLoops.push_back(ExitBB);
- if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
- ParentL = ExitL;
- }
-
- // Recompute the blocks participating in this loop. This may be empty if it
- // is no longer a loop.
- auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
-
- // If we still have a loop, we need to re-set the loop's parent as the exit
- // block set changing may have moved it within the loop nest. Note that this
- // can only happen when this loop has a parent as it can only hoist the loop
- // *up* the nest.
- if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
- // Remove this loop's (original) blocks from all of the intervening loops.
- for (Loop *IL = L.getParentLoop(); IL != ParentL;
- IL = IL->getParentLoop()) {
- IL->getBlocksSet().erase(PH);
- for (auto *BB : L.blocks())
- IL->getBlocksSet().erase(BB);
- llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
- return BB == PH || L.contains(BB);
- });
- }
-
- LI.changeLoopFor(PH, ParentL);
- L.getParentLoop()->removeChildLoop(&L);
- if (ParentL)
- ParentL->addChildLoop(&L);
- else
- LI.addTopLevelLoop(&L);
- }
-
- // Now we update all the blocks which are no longer within the loop.
- auto &Blocks = L.getBlocksVector();
- auto BlocksSplitI =
- LoopBlockSet.empty()
- ? Blocks.begin()
- : std::stable_partition(
- Blocks.begin(), Blocks.end(),
- [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
-
- // Before we erase the list of unlooped blocks, build a set of them.
- SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
- if (LoopBlockSet.empty())
- UnloopedBlocks.insert(PH);
-
- // Now erase these blocks from the loop.
- for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
- L.getBlocksSet().erase(BB);
- Blocks.erase(BlocksSplitI, Blocks.end());
-
- // Sort the exits in ascending loop depth, we'll work backwards across these
- // to process them inside out.
- llvm::stable_sort(ExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
- return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
- });
-
- // We'll build up a set for each exit loop.
- SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
- Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
-
- auto RemoveUnloopedBlocksFromLoop =
- [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
- for (auto *BB : UnloopedBlocks)
- L.getBlocksSet().erase(BB);
- llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
- return UnloopedBlocks.count(BB);
- });
- };
-
- SmallVector<BasicBlock *, 16> Worklist;
- while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
- assert(Worklist.empty() && "Didn't clear worklist!");
- assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
-
- // Grab the next exit block, in decreasing loop depth order.
- BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
- Loop &ExitL = *LI.getLoopFor(ExitBB);
- assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
-
- // Erase all of the unlooped blocks from the loops between the previous
- // exit loop and this exit loop. This works because the ExitsInLoops list is
- // sorted in increasing order of loop depth and thus we visit loops in
- // decreasing order of loop depth.
- for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
- RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
-
- // Walk the CFG back until we hit the cloned PH adding everything reachable
- // and in the unlooped set to this exit block's loop.
- Worklist.push_back(ExitBB);
- do {
- BasicBlock *BB = Worklist.pop_back_val();
- // We can stop recursing at the cloned preheader (if we get there).
- if (BB == PH)
- continue;
-
- for (BasicBlock *PredBB : predecessors(BB)) {
- // If this pred has already been moved to our set or is part of some
- // (inner) loop, no update needed.
- if (!UnloopedBlocks.erase(PredBB)) {
- assert((NewExitLoopBlocks.count(PredBB) ||
- ExitL.contains(LI.getLoopFor(PredBB))) &&
- "Predecessor not in a nested loop (or already visited)!");
- continue;
- }
-
- // We just insert into the loop set here. We'll add these blocks to the
- // exit loop after we build up the set in a deterministic order rather
- // than the predecessor-influenced visit order.
- bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
- (void)Inserted;
- assert(Inserted && "Should only visit an unlooped block once!");
-
- // And recurse through to its predecessors.
- Worklist.push_back(PredBB);
- }
- } while (!Worklist.empty());
-
- // If blocks in this exit loop were directly part of the original loop (as
- // opposed to a child loop) update the map to point to this exit loop. This
- // just updates a map and so the fact that the order is unstable is fine.
- for (auto *BB : NewExitLoopBlocks)
- if (Loop *BBL = LI.getLoopFor(BB))
- if (BBL == &L || !L.contains(BBL))
- LI.changeLoopFor(BB, &ExitL);
-
- // We will remove the remaining unlooped blocks from this loop in the next
- // iteration or below.
- NewExitLoopBlocks.clear();
- }
-
- // Any remaining unlooped blocks are no longer part of any loop unless they
- // are part of some child loop.
- for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
- RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
- for (auto *BB : UnloopedBlocks)
- if (Loop *BBL = LI.getLoopFor(BB))
- if (BBL == &L || !L.contains(BBL))
- LI.changeLoopFor(BB, nullptr);
-
- // Sink all the child loops whose headers are no longer in the loop set to
- // the parent (or to be top level loops). We reach into the loop and directly
- // update its subloop vector to make this batch update efficient.
- auto &SubLoops = L.getSubLoopsVector();
- auto SubLoopsSplitI =
- LoopBlockSet.empty()
- ? SubLoops.begin()
- : std::stable_partition(
- SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
- return LoopBlockSet.count(SubL->getHeader());
- });
- for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
- HoistedLoops.push_back(HoistedL);
- HoistedL->setParentLoop(nullptr);
-
- // To compute the new parent of this hoisted loop we look at where we
- // placed the preheader above. We can't lookup the header itself because we
- // retained the mapping from the header to the hoisted loop. But the
- // preheader and header should have the exact same new parent computed
- // based on the set of exit blocks from the original loop as the preheader
- // is a predecessor of the header and so reached in the reverse walk. And
- // because the loops were all in simplified form the preheader of the
- // hoisted loop can't be part of some *other* loop.
- if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
- NewParentL->addChildLoop(HoistedL);
- else
- LI.addTopLevelLoop(HoistedL);
- }
- SubLoops.erase(SubLoopsSplitI, SubLoops.end());
-
- // Actually delete the loop if nothing remained within it.
- if (Blocks.empty()) {
- assert(SubLoops.empty() &&
- "Failed to remove all subloops from the original loop!");
- if (Loop *ParentL = L.getParentLoop())
- ParentL->removeChildLoop(llvm::find(*ParentL, &L));
- else
- LI.removeLoop(llvm::find(LI, &L));
- LI.destroy(&L);
- return false;
- }
-
- return true;
-}
-
-/// Helper to visit a dominator subtree, invoking a callable on each node.
-///
-/// Returning false at any point will stop walking past that node of the tree.
-template <typename CallableT>
-void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
- SmallVector<DomTreeNode *, 4> DomWorklist;
- DomWorklist.push_back(DT[BB]);
-#ifndef NDEBUG
- SmallPtrSet<DomTreeNode *, 4> Visited;
- Visited.insert(DT[BB]);
-#endif
- do {
- DomTreeNode *N = DomWorklist.pop_back_val();
-
- // Visit this node.
- if (!Callable(N->getBlock()))
- continue;
-
- // Accumulate the child nodes.
- for (DomTreeNode *ChildN : *N) {
- assert(Visited.insert(ChildN).second &&
- "Cannot visit a node twice when walking a tree!");
- DomWorklist.push_back(ChildN);
- }
- } while (!DomWorklist.empty());
-}
-
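// Illustrative only: a self-contained sketch (not LLVM code) of the pruned
// preorder walk that visitDomSubTree above performs over the dominator
// subtree: a callback returning false stops descent below that node. The Node
// type and visitSubTree helper are assumptions invented for this sketch.
#include <functional>
#include <memory>
#include <vector>

struct Node {
  int Value = 0;
  std::vector<std::unique_ptr<Node>> Children;
};

inline void visitSubTree(Node &Root,
                         const std::function<bool(Node &)> &Callable) {
  std::vector<Node *> Worklist{&Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Callable(*N))
      continue; // Prune: do not descend into this node's children.
    for (auto &Child : N->Children)
      Worklist.push_back(Child.get());
  }
}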
-static void unswitchNontrivialInvariants(
- Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
- SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
- auto *ParentBB = TI.getParent();
- BranchInst *BI = dyn_cast<BranchInst>(&TI);
- SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
-
- // We can only unswitch switches, conditional branches with an invariant
- // condition, or combining invariant conditions with an instruction.
- assert((SI || (BI && BI->isConditional())) &&
- "Can only unswitch switches and conditional branch!");
- bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
- if (FullUnswitch)
- assert(Invariants.size() == 1 &&
- "Cannot have other invariants with full unswitching!");
- else
- assert(isa<Instruction>(BI->getCondition()) &&
- "Partial unswitching requires an instruction as the condition!");
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Constant and BBs tracking the cloned and continuing successor. When we are
- // unswitching the entire condition, this can just be trivially chosen to
- // unswitch towards `true`. However, when we are unswitching a set of
- // invariants combined with `and` or `or`, the combining operation determines
- // the best direction to unswitch: we want to unswitch the direction that will
- // collapse the branch.
- bool Direction = true;
- int ClonedSucc = 0;
- if (!FullUnswitch) {
- if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) {
- assert(cast<Instruction>(BI->getCondition())->getOpcode() ==
- Instruction::And &&
- "Only `or` and `and` instructions can combine invariants being "
- "unswitched.");
- Direction = false;
- ClonedSucc = 1;
- }
- }
-
- BasicBlock *RetainedSuccBB =
- BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
- SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
- if (BI)
- UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
- else
- for (auto Case : SI->cases())
- if (Case.getCaseSuccessor() != RetainedSuccBB)
- UnswitchedSuccBBs.insert(Case.getCaseSuccessor());
-
- assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
- "Should not unswitch the same successor we are retaining!");
-
- // The branch should be in this exact loop. Any inner loop's invariant branch
- // should be handled by unswitching that inner loop. The caller of this
- // routine should filter out any candidates that remain (but were skipped for
- // whatever reason).
- assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
-
- // Compute the parent loop now before we start hacking on things.
- Loop *ParentL = L.getParentLoop();
- // Get blocks in RPO order for MSSA update, before changing the CFG.
- LoopBlocksRPO LBRPO(&L);
- if (MSSAU)
- LBRPO.perform(&LI);
-
- // Compute the outer-most loop containing one of our exit blocks. This is the
- // furthest up our loopnest which can be mutated, which we will use below to
- // update things.
- Loop *OuterExitL = &L;
- for (auto *ExitBB : ExitBlocks) {
- Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
- if (!NewOuterExitL) {
- // We exited the entire nest with this block, so we're done.
- OuterExitL = nullptr;
- break;
- }
- if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
- OuterExitL = NewOuterExitL;
- }
-
- // At this point, we're definitely going to unswitch something so invalidate
- // any cached information in ScalarEvolution for the outer most loop
- // containing an exit block and all nested loops.
- if (SE) {
- if (OuterExitL)
- SE->forgetLoop(OuterExitL);
- else
- SE->forgetTopmostLoop(&L);
- }
-
- // If the edge from this terminator to a successor dominates that successor,
- // store a map from each block in its dominator subtree to it. This lets us
- // tell when cloning for a particular successor if a block is dominated by
- // some *other* successor with a single data structure. We use this to
- // significantly reduce cloning.
- SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
- for (auto *SuccBB : llvm::concat<BasicBlock *const>(
- makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
- if (SuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
- }))
- visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
- DominatingSucc[BB] = SuccBB;
- return true;
- });
-
- // Split the preheader, so that we know that there is a safe place to insert
- // the conditional branch. We will change the preheader to have a conditional
- // branch on LoopCond. The original preheader will become the split point
- // between the unswitched versions, and we will have a new preheader for the
- // original loop.
- BasicBlock *SplitBB = L.getLoopPreheader();
- BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU);
-
- // Keep track of the dominator tree updates needed.
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
-
- // Clone the loop for each unswitched successor.
- SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
- VMaps.reserve(UnswitchedSuccBBs.size());
- SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
- for (auto *SuccBB : UnswitchedSuccBBs) {
- VMaps.emplace_back(new ValueToValueMapTy());
- ClonedPHs[SuccBB] = buildClonedLoopBlocks(
- L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
- DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU);
- }
-
+ return ClonedRootL;
+
+ // If we have a nest, we can quickly clone the entire loop nest using an
+ // iterative approach because it is a tree. We keep the cloned parent in the
+ // data structure to avoid repeatedly querying through a map to find it.
+ SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
+ // Build up the loops to clone in reverse order as we'll clone them from the
+ // back.
+ for (Loop *ChildL : llvm::reverse(OrigRootL))
+ LoopsToClone.push_back({ClonedRootL, ChildL});
+ do {
+ Loop *ClonedParentL, *L;
+ std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
+ Loop *ClonedL = LI.AllocateLoop();
+ ClonedParentL->addChildLoop(ClonedL);
+ AddClonedBlocksToLoop(*L, *ClonedL);
+ for (Loop *ChildL : llvm::reverse(*L))
+ LoopsToClone.push_back({ClonedL, ChildL});
+ } while (!LoopsToClone.empty());
+
+ return ClonedRootL;
+}
+
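// Illustrative only: a self-contained sketch (not LLVM code) of the iterative
// tree-cloning scheme cloneLoopNest uses above: pair each original child with
// its already-cloned parent on an explicit worklist instead of recursing.
// The Node type and cloneTree helper are assumptions invented for this sketch.
#include <memory>
#include <utility>
#include <vector>

struct Node {
  std::vector<std::unique_ptr<Node>> Children;
};

inline std::unique_ptr<Node> cloneTree(const Node &Root) {
  auto ClonedRoot = std::make_unique<Node>();
  // Worklist of (cloned parent, original child) pairs, pushed in reverse so
  // children are popped (and therefore cloned) in their original order.
  std::vector<std::pair<Node *, const Node *>> Worklist;
  for (auto It = Root.Children.rbegin(); It != Root.Children.rend(); ++It)
    Worklist.push_back({ClonedRoot.get(), It->get()});
  while (!Worklist.empty()) {
    auto [ClonedParent, Orig] = Worklist.back();
    Worklist.pop_back();
    ClonedParent->Children.push_back(std::make_unique<Node>());
    Node *Cloned = ClonedParent->Children.back().get();
    for (auto It = Orig->Children.rbegin(); It != Orig->Children.rend(); ++It)
      Worklist.push_back({Cloned, It->get()});
  }
  return ClonedRoot;
}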
+/// Build the cloned loops of an original loop from unswitching.
+///
+/// Because unswitching simplifies the CFG of the loop, this isn't a trivial
+/// operation. We need to re-verify that there even is a loop (as the backedge
+/// may not have been cloned), and even if there are remaining backedges the
+/// backedge set may be different. However, we know that each child loop is
+/// undisturbed; we only need to find where to place each child loop within
+/// either any parent loop or within a cloned version of the original loop.
+///
+/// Because child loops may end up cloned outside of any cloned version of the
+/// original loop, multiple cloned sibling loops may be created. All of them
+/// are returned so that the newly introduced loop nest roots can be
+/// identified.
+static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
+ const ValueToValueMapTy &VMap, LoopInfo &LI,
+ SmallVectorImpl<Loop *> &NonChildClonedLoops) {
+ Loop *ClonedL = nullptr;
+
+ auto *OrigPH = OrigL.getLoopPreheader();
+ auto *OrigHeader = OrigL.getHeader();
+
+ auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
+ auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
+
+ // We need to know the loops of the cloned exit blocks to even compute the
+ // accurate parent loop. If we only clone exits to some parent of the
+ // original parent, we want to clone into that outer loop. We also keep track
+ // of the loops that our cloned exit blocks participate in.
+ Loop *ParentL = nullptr;
+ SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
+ SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
+ ClonedExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoopMap[ClonedExitBB] = ExitL;
+ ClonedExitsInLoops.push_back(ClonedExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+ assert((!ParentL || ParentL == OrigL.getParentLoop() ||
+ ParentL->contains(OrigL.getParentLoop())) &&
+ "The computed parent loop should always contain (or be) the parent of "
+ "the original loop.");
+
+ // We build the set of blocks dominated by the cloned header from the set of
+ // cloned blocks out of the original loop. While not all of these will
+ // necessarily be in the cloned loop, it is enough to establish that they
+ // aren't in unreachable cycles, etc.
+ SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
+ for (auto *BB : OrigL.blocks())
+ if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
+ ClonedLoopBlocks.insert(ClonedBB);
+
+ // Rebuild the set of blocks that will end up in the cloned loop. We may have
+ // skipped cloning some region of this loop which can in turn skip some of
+ // the backedges so we have to rebuild the blocks in the loop based on the
+ // backedges that remain after cloning.
+ SmallVector<BasicBlock *, 16> Worklist;
+ SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
+ for (auto *Pred : predecessors(ClonedHeader)) {
+ // The only possible non-loop header predecessor is the preheader because
+ // we know we cloned the loop in simplified form.
+ if (Pred == ClonedPH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // should be the preheader.
+ assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
+ "header other than the preheader "
+ "that is not part of the loop!");
+
+ // Insert this block into the loop set and on the first visit (and if it
+ // isn't the header we're currently walking) put it into the worklist to
+ // recurse through.
+ if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
+ Worklist.push_back(Pred);
+ }
+
+ // If we had any backedges then there *is* a cloned loop. Put the header into
+ // the loop set and then walk the worklist backwards to find all the blocks
+ // that remain within the loop after cloning.
+ if (!BlocksInClonedLoop.empty()) {
+ BlocksInClonedLoop.insert(ClonedHeader);
+
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(BlocksInClonedLoop.count(BB) &&
+ "Didn't put block into the loop set!");
+
+ // Insert any predecessors that are in the possible set into the cloned
+ // set, and if the insert is successful, add them to the worklist. Note
+ // that we filter on the blocks that are definitely reachable via the
+ // backedge to the loop header so we may prune out dead code within the
+ // cloned loop.
+ for (auto *Pred : predecessors(BB))
+ if (ClonedLoopBlocks.count(Pred) &&
+ BlocksInClonedLoop.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ ClonedL = LI.AllocateLoop();
+ if (ParentL) {
+ ParentL->addBasicBlockToLoop(ClonedPH, LI);
+ ParentL->addChildLoop(ClonedL);
+ } else {
+ LI.addTopLevelLoop(ClonedL);
+ }
+ NonChildClonedLoops.push_back(ClonedL);
+
+ ClonedL->reserveBlocks(BlocksInClonedLoop.size());
+ // We don't want to just add the cloned loop blocks based on how we
+ // discovered them. The original order of blocks was carefully built in
+ // a way that doesn't rely on predecessor ordering. Rather than re-invent
+ // that logic, we just re-walk the original blocks (and those of the child
+ // loops) and filter them as we add them into the cloned loop.
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
+ if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
+ continue;
+
+ // Directly add the blocks that are only in this loop.
+ if (LI.getLoopFor(BB) == &OrigL) {
+ ClonedL->addBasicBlockToLoop(ClonedBB, LI);
+ continue;
+ }
+
+ // We want to manually add it to this loop and parents.
+ // Registering it with LoopInfo will happen when we clone the top
+ // loop for this block.
+ for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
+ PL->addBlockEntry(ClonedBB);
+ }
+
+ // Now add each child loop whose header remains within the cloned loop. All
+ // of the blocks within the loop must satisfy the same constraints as the
+ // header so once we pass the header checks we can just clone the entire
+ // child loop nest.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ // We should never have a cloned child loop header but fail to have
+ // all of the blocks for that child loop.
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(BlocksInClonedLoop.count(
+ cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
+ "Child cloned loop has a header within the cloned outer "
+ "loop but not all of its blocks!");
+#endif
+
+ cloneLoopNest(*ChildL, ClonedL, VMap, LI);
+ }
+ }
+
+ // Now that we've handled all the components of the original loop that were
+ // cloned into a new loop, we still need to handle anything from the original
+ // loop that wasn't in a cloned loop.
+
+ // Figure out what blocks are left to place within any loop nest containing
+ // the unswitched loop. If we never formed a loop, the cloned PH is one of
+ // them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
+ if (BlocksInClonedLoop.empty())
+ UnloopedBlockSet.insert(ClonedPH);
+ for (auto *ClonedBB : ClonedLoopBlocks)
+ if (!BlocksInClonedLoop.count(ClonedBB))
+ UnloopedBlockSet.insert(ClonedBB);
+
+ // Copy the cloned exits and sort them in ascending loop depth, we'll work
+ // backwards across these to process them inside out. The order shouldn't
+ // matter as we're just trying to build up the map from inside-out; we use
+ // the map in a more stably ordered way below.
+ auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
+ llvm::sort(OrderedClonedExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+ ExitLoopMap.lookup(RHS)->getLoopDepth();
+ });
+
+ // Populate the existing ExitLoopMap with everything reachable from each
+ // exit, starting from the inner most exit.
+ while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+
+ BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
+ Loop *ExitL = ExitLoopMap.lookup(ExitBB);
+
+ // Walk the CFG back until we hit the cloned PH adding everything reachable
+ // and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ // We can stop recursing at the cloned preheader (if we get there).
+ if (BB == ClonedPH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlockSet.erase(PredBB)) {
+ assert(
+ (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
+ "Predecessor not mapped to a loop!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in an order that doesn't rely on
+ // predecessor order (which in turn relies on use list order).
+ bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+ }
+
+ // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
+ // blocks to their outer loops, walk the cloned blocks and the cloned exits
+ // in their original order adding them to the correct loop.
+
+ // We need a stable insertion order. We use the order of the original loop
+ // blocks and map each into the correct parent loop.
+ for (auto *BB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
+ if (Loop *OuterL = ExitLoopMap.lookup(BB))
+ OuterL->addBasicBlockToLoop(BB, LI);
+
+#ifndef NDEBUG
+ for (auto &BBAndL : ExitLoopMap) {
+ auto *BB = BBAndL.first;
+ auto *OuterL = BBAndL.second;
+ assert(LI.getLoopFor(BB) == OuterL &&
+ "Failed to put all blocks into outer loops!");
+ }
+#endif
+
+ // Now that all the blocks are placed into the correct containing loop in the
+ // absence of child loops, find all the potentially cloned child loops and
+ // clone them into whatever outer loop we placed their header into.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(VMap.count(ChildLoopBB) &&
+ "Cloned a child loop header but not all of that loops blocks!");
+#endif
+
+ NonChildClonedLoops.push_back(cloneLoopNest(
+ *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
+ }
+}
+
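// Illustrative only: a self-contained sketch (not LLVM code) of the inside-out
// exit walk above: exits are processed from deepest loop to shallowest, and a
// backward CFG walk claims any still-unassigned block for that exit's loop.
// Block, Preds, and assignUnloopedBlocks are assumptions for this sketch.
#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

using Block = std::string;

inline std::map<Block, int>
assignUnloopedBlocks(const std::map<Block, std::vector<Block>> &Preds,
                     std::set<Block> Unassigned, const Block &Preheader,
                     std::vector<std::pair<Block, int>> ExitsWithDepth) {
  // Sort exits by loop depth ascending; pop from the back to go inside-out.
  std::sort(ExitsWithDepth.begin(), ExitsWithDepth.end(),
            [](const auto &L, const auto &R) { return L.second < R.second; });
  std::map<Block, int> BlockDepth;
  while (!Unassigned.empty() && !ExitsWithDepth.empty()) {
    auto [ExitBB, Depth] = ExitsWithDepth.back();
    ExitsWithDepth.pop_back();
    std::vector<Block> Worklist{ExitBB};
    while (!Worklist.empty()) {
      Block BB = Worklist.back();
      Worklist.pop_back();
      if (BB == Preheader)
        continue; // Stop at the preheader; nothing above it belongs here.
      auto It = Preds.find(BB);
      if (It == Preds.end())
        continue;
      for (const Block &Pred : It->second)
        if (Unassigned.erase(Pred)) { // First claim wins (innermost exit).
          BlockDepth[Pred] = Depth;
          Worklist.push_back(Pred);
        }
    }
  }
  return BlockDepth;
}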
+static void
+deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
+ DominatorTree &DT, MemorySSAUpdater *MSSAU) {
+ // Find all the dead clones, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ for (auto &VMap : VMaps)
+ if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
+ if (!DT.isReachableFromEntry(ClonedBB)) {
+ for (BasicBlock *SuccBB : successors(ClonedBB))
+ SuccBB->removePredecessor(ClonedBB);
+ DeadBlocks.push_back(ClonedBB);
+ }
+
+ // Remove all MemorySSA in the dead blocks
+ if (MSSAU) {
+ SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(),
+ DeadBlocks.end());
+ MSSAU->removeBlocks(DeadBlockSet);
+ }
+
+ // Drop any remaining references to break cycles.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->dropAllReferences();
+ // Erase them from the IR.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->eraseFromParent();
+}
+
+static void deleteDeadBlocksFromLoop(Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI,
+ MemorySSAUpdater *MSSAU) {
+ // Find all the dead blocks tied to this loop, and remove them from their
+ // successors.
+ SmallSetVector<BasicBlock *, 8> DeadBlockSet;
+
+ // Start with loop/exit blocks and get a transitive closure of reachable dead
+ // blocks.
+ SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(),
+ ExitBlocks.end());
+ DeathCandidates.append(L.blocks().begin(), L.blocks().end());
+ while (!DeathCandidates.empty()) {
+ auto *BB = DeathCandidates.pop_back_val();
+ if (!DeadBlockSet.count(BB) && !DT.isReachableFromEntry(BB)) {
+ for (BasicBlock *SuccBB : successors(BB)) {
+ SuccBB->removePredecessor(BB);
+ DeathCandidates.push_back(SuccBB);
+ }
+ DeadBlockSet.insert(BB);
+ }
+ }
+
+ // Remove all MemorySSA in the dead blocks
+ if (MSSAU)
+ MSSAU->removeBlocks(DeadBlockSet);
+
+ // Filter out the dead blocks from the exit blocks list so that it can be
+ // used in the caller.
+ llvm::erase_if(ExitBlocks,
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
+
+ // Walk from this loop up through its parents removing all of the dead blocks.
+ for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
+ for (auto *BB : DeadBlockSet)
+ ParentL->getBlocksSet().erase(BB);
+ llvm::erase_if(ParentL->getBlocksVector(),
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
+ }
+
+ // Now delete the dead child loops. This raw delete will clear them
+ // recursively.
+ llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
+ if (!DeadBlockSet.count(ChildL->getHeader()))
+ return false;
+
+ assert(llvm::all_of(ChildL->blocks(),
+ [&](BasicBlock *ChildBB) {
+ return DeadBlockSet.count(ChildBB);
+ }) &&
+ "If the child loop header is dead all blocks in the child loop must "
+ "be dead as well!");
+ LI.destroy(ChildL);
+ return true;
+ });
+
+ // Remove the loop mappings for the dead blocks and drop all the references
+ // from these blocks to others to handle cyclic references as we start
+ // deleting the blocks themselves.
+ for (auto *BB : DeadBlockSet) {
+ // Check that the dominator tree has already been updated.
+ assert(!DT.getNode(BB) && "Should already have cleared domtree!");
+ LI.changeLoopFor(BB, nullptr);
+ // Drop all uses of the instructions to make sure we won't have dangling
+ // uses in other blocks.
+ for (auto &I : *BB)
+ if (!I.use_empty())
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ BB->dropAllReferences();
+ }
+
+ // Actually delete the blocks now that they've been fully unhooked from the
+ // IR.
+ for (auto *BB : DeadBlockSet)
+ BB->eraseFromParent();
+}
+
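// Illustrative only: a self-contained sketch (not LLVM code) of the worklist
// closure above: seed with candidate blocks, and whenever a candidate turns
// out to be unreachable, enqueue its successors as new candidates. The Block
// alias and collectDeadBlocks helper are assumptions invented for this sketch.
#include <functional>
#include <map>
#include <set>
#include <string>
#include <vector>

using Block = std::string;

inline std::set<Block>
collectDeadBlocks(std::vector<Block> Candidates,
                  const std::map<Block, std::vector<Block>> &Succs,
                  const std::function<bool(const Block &)> &IsReachable) {
  std::set<Block> Dead;
  while (!Candidates.empty()) {
    Block BB = Candidates.back();
    Candidates.pop_back();
    if (Dead.count(BB) || IsReachable(BB))
      continue;
    Dead.insert(BB);
    auto It = Succs.find(BB);
    if (It != Succs.end())
      for (const Block &Succ : It->second)
        Candidates.push_back(Succ); // A successor may now be dead too.
  }
  return Dead;
}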
+/// Recompute the set of blocks in a loop after unswitching.
+///
+/// This walks from the original header's predecessors to rebuild the loop. We
+/// take advantage of the fact that new blocks can't have been added, and so we
+/// filter by the original loop's blocks. This also handles potentially
+/// unreachable code that we don't want to explore but might be found examining
+/// the predecessors of the header.
+///
+/// If the original loop is no longer a loop, this will return an empty set. If
+/// it remains a loop, all the blocks within it will be added to the set
+/// (including those blocks in inner loops).
+static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
+ LoopInfo &LI) {
+ SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
+
+ auto *PH = L.getLoopPreheader();
+ auto *Header = L.getHeader();
+
+ // A worklist to use while walking backwards from the header.
+ SmallVector<BasicBlock *, 16> Worklist;
+
+ // First walk the predecessors of the header to find the backedges. This will
+ // form the basis of our walk.
+ for (auto *Pred : predecessors(Header)) {
+ // Skip the preheader.
+ if (Pred == PH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // is the preheader.
+ assert(L.contains(Pred) && "Found a predecessor of the loop header other "
+ "than the preheader that is not part of the "
+ "loop!");
+
+ // Insert this block into the loop set and, on the first visit (and if it
+ // isn't the header we're currently walking), put it into the worklist to
+ // recurse through.
+ if (LoopBlockSet.insert(Pred).second && Pred != Header)
+ Worklist.push_back(Pred);
+ }
+
+ // If no backedges were found, we're done.
+ if (LoopBlockSet.empty())
+ return LoopBlockSet;
+
+ // We found backedges, recurse through them to identify the loop blocks.
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+
+ // No need to walk past the header.
+ if (BB == Header)
+ continue;
+
+ // Because we know the inner loop structure remains valid we can use the
+ // loop structure to jump immediately across the entire nested loop.
+ // Further, because it is in loop simplified form, we can directly jump
+ // to its preheader afterward.
+ if (Loop *InnerL = LI.getLoopFor(BB))
+ if (InnerL != &L) {
+ assert(L.contains(InnerL) &&
+ "Should not reach a loop *outside* this loop!");
+ // The preheader is the only possible predecessor of the loop so
+ // insert it into the set and check whether it was already handled.
+ auto *InnerPH = InnerL->getLoopPreheader();
+ assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
+ "but not contain the inner loop "
+ "preheader!");
+ if (!LoopBlockSet.insert(InnerPH).second)
+ // The only way to reach the preheader is through the loop body
+ // itself so if it has been visited the loop is already handled.
+ continue;
+
+ // Insert all of the blocks (other than those already present) into
+ // the loop set. We expect at least the block that led us to find the
+ // inner loop to be in the block set, but we may also have other loop
+ // blocks if they were already enqueued as predecessors of some other
+ // outer loop block.
+ for (auto *InnerBB : InnerL->blocks()) {
+ if (InnerBB == BB) {
+ assert(LoopBlockSet.count(InnerBB) &&
+ "Block should already be in the set!");
+ continue;
+ }
+
+ LoopBlockSet.insert(InnerBB);
+ }
+
+ // Add the preheader to the worklist so we will continue past the
+ // loop body.
+ Worklist.push_back(InnerPH);
+ continue;
+ }
+
+ // Insert any predecessors that were in the original loop into the new
+ // set, and if the insert is successful, add them to the worklist.
+ for (auto *Pred : predecessors(BB))
+ if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
+
+ // We've found all the blocks participating in the loop, return our completed
+ // set.
+ return LoopBlockSet;
+}
+
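// Illustrative only: a self-contained sketch (not LLVM code) of the backedge
// walk that recomputeLoopBlockSet above performs: start from the header's
// non-preheader predecessors and walk predecessor edges backwards, never
// leaving the original block set and never walking past the header. The Block
// alias and recomputeBlocks helper are assumptions invented for this sketch;
// the header is assumed to have an entry in Preds.
#include <map>
#include <set>
#include <string>
#include <vector>

using Block = std::string;

inline std::set<Block>
recomputeBlocks(const Block &Header, const Block &Preheader,
                const std::set<Block> &OriginalBlocks,
                const std::map<Block, std::vector<Block>> &Preds) {
  std::set<Block> InLoop;
  std::vector<Block> Worklist;
  // Seed with the backedge sources that survived.
  for (const Block &Pred : Preds.at(Header))
    if (Pred != Preheader && OriginalBlocks.count(Pred) &&
        InLoop.insert(Pred).second && Pred != Header)
      Worklist.push_back(Pred);
  if (InLoop.empty())
    return InLoop; // No backedges survived: this is no longer a loop.
  InLoop.insert(Header);
  while (!Worklist.empty()) {
    Block BB = Worklist.back();
    Worklist.pop_back();
    if (BB == Header)
      continue; // No need to walk past the header.
    auto It = Preds.find(BB);
    if (It == Preds.end())
      continue;
    for (const Block &Pred : It->second)
      if (OriginalBlocks.count(Pred) && InLoop.insert(Pred).second)
        Worklist.push_back(Pred);
  }
  return InLoop;
}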
+/// Rebuild a loop after unswitching removes some subset of blocks and edges.
+///
+/// The removal may have removed some child loops entirely but cannot have
+/// disturbed any remaining child loops. However, they may need to be hoisted
+/// to the parent loop (or to be top-level loops). The original loop may be
+/// completely removed.
+///
+/// The sibling loops resulting from this update are returned. If the original
+/// loop remains a valid loop, it will be the first entry in this list with all
+/// of the newly sibling loops following it.
+///
+/// Returns true if the loop remains a loop after unswitching, and false if it
+/// is no longer a loop after unswitching (and should not continue to be
+/// referenced).
+static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ LoopInfo &LI,
+ SmallVectorImpl<Loop *> &HoistedLoops) {
+ auto *PH = L.getLoopPreheader();
+
+ // Compute the actual parent loop from the exit blocks. Because we may have
+ // pruned some exits the loop may be different from the original parent.
+ Loop *ParentL = nullptr;
+ SmallVector<Loop *, 4> ExitLoops;
+ SmallVector<BasicBlock *, 4> ExitsInLoops;
+ ExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoops.push_back(ExitL);
+ ExitsInLoops.push_back(ExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+
+ // Recompute the blocks participating in this loop. This may be empty if it
+ // is no longer a loop.
+ auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
+
+ // If we still have a loop, we need to re-set the loop's parent as the exit
+ // block set changing may have moved it within the loop nest. Note that this
+ // can only happen when this loop has a parent as it can only hoist the loop
+ // *up* the nest.
+ if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
+ // Remove this loop's (original) blocks from all of the intervening loops.
+ for (Loop *IL = L.getParentLoop(); IL != ParentL;
+ IL = IL->getParentLoop()) {
+ IL->getBlocksSet().erase(PH);
+ for (auto *BB : L.blocks())
+ IL->getBlocksSet().erase(BB);
+ llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
+ return BB == PH || L.contains(BB);
+ });
+ }
+
+ LI.changeLoopFor(PH, ParentL);
+ L.getParentLoop()->removeChildLoop(&L);
+ if (ParentL)
+ ParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+ }
+
+ // Now we update all the blocks which are no longer within the loop.
+ auto &Blocks = L.getBlocksVector();
+ auto BlocksSplitI =
+ LoopBlockSet.empty()
+ ? Blocks.begin()
+ : std::stable_partition(
+ Blocks.begin(), Blocks.end(),
+ [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
+
+ // Before we erase the list of unlooped blocks, build a set of them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
+ if (LoopBlockSet.empty())
+ UnloopedBlocks.insert(PH);
+
+ // Now erase these blocks from the loop.
+ for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
+ L.getBlocksSet().erase(BB);
+ Blocks.erase(BlocksSplitI, Blocks.end());
+
+ // Sort the exits in ascending loop depth; we'll work backwards across these
+ // to process them inside out.
+ llvm::stable_sort(ExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
+ });
+
+ // We'll build up a set for each exit loop.
+ SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
+ Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
+
+ auto RemoveUnloopedBlocksFromLoop =
+ [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
+ for (auto *BB : UnloopedBlocks)
+ L.getBlocksSet().erase(BB);
+ llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
+ return UnloopedBlocks.count(BB);
+ });
+ };
+
+ SmallVector<BasicBlock *, 16> Worklist;
+ while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+ assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
+
+ // Grab the next exit block, in decreasing loop depth order.
+ BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
+ Loop &ExitL = *LI.getLoopFor(ExitBB);
+ assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
+
+ // Erase all of the unlooped blocks from the loops between the previous
+ // exit loop and this exit loop. This works because the ExitsInLoops list is
+ // sorted in increasing order of loop depth and thus we visit loops in
+ // decreasing order of loop depth.
+ for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+
+ // Walk the CFG back until we hit the cloned PH adding everything reachable
+ // and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ // We can stop recursing at the cloned preheader (if we get there).
+ if (BB == PH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlocks.erase(PredBB)) {
+ assert((NewExitLoopBlocks.count(PredBB) ||
+ ExitL.contains(LI.getLoopFor(PredBB))) &&
+ "Predecessor not in a nested loop (or already visited)!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in a deterministic order rather
+ // than the predecessor-influenced visit order.
+ bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+
+ // If blocks in this exit loop were directly part of the original loop (as
+ // opposed to a child loop) update the map to point to this exit loop. This
+ // just updates a map and so the fact that the order is unstable is fine.
+ for (auto *BB : NewExitLoopBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, &ExitL);
+
+ // We will remove the remaining unlooped blocks from this loop in the next
+ // iteration or below.
+ NewExitLoopBlocks.clear();
+ }
+
+ // Any remaining unlooped blocks are no longer part of any loop unless they
+ // are part of some child loop.
+ for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+ for (auto *BB : UnloopedBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, nullptr);
+
+ // Sink all the child loops whose headers are no longer in the loop set to
+ // the parent (or to be top level loops). We reach into the loop and directly
+ // update its subloop vector to make this batch update efficient.
+ auto &SubLoops = L.getSubLoopsVector();
+ auto SubLoopsSplitI =
+ LoopBlockSet.empty()
+ ? SubLoops.begin()
+ : std::stable_partition(
+ SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
+ return LoopBlockSet.count(SubL->getHeader());
+ });
+ for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
+ HoistedLoops.push_back(HoistedL);
+ HoistedL->setParentLoop(nullptr);
+
+ // To compute the new parent of this hoisted loop we look at where we
+ // placed the preheader above. We can't look up the header itself because we
+ // retained the mapping from the header to the hoisted loop. But the
+ // preheader and header should have the exact same new parent computed
+ // based on the set of exit blocks from the original loop as the preheader
+ // is a predecessor of the header and so reached in the reverse walk. And
+ // because the loops were all in simplified form the preheader of the
+ // hoisted loop can't be part of some *other* loop.
+ if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
+ NewParentL->addChildLoop(HoistedL);
+ else
+ LI.addTopLevelLoop(HoistedL);
+ }
+ SubLoops.erase(SubLoopsSplitI, SubLoops.end());
+
+ // Actually delete the loop if nothing remained within it.
+ if (Blocks.empty()) {
+ assert(SubLoops.empty() &&
+ "Failed to remove all subloops from the original loop!");
+ if (Loop *ParentL = L.getParentLoop())
+ ParentL->removeChildLoop(llvm::find(*ParentL, &L));
+ else
+ LI.removeLoop(llvm::find(LI, &L));
+ LI.destroy(&L);
+ return false;
+ }
+
+ return true;
+}
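+
+// A short worked example of the hoisting step above (the loop names C1 and C2
+// are hypothetical): suppose the original loop L had child loops C1 and C2,
+// and after unswitching only C1's header is still in the recomputed block
+// set. The stable_partition over the subloop vector keeps C1 as a child of L,
+// while C2 is appended to HoistedLoops, detached from L, and re-parented to
+// whatever loop now contains its preheader (or registered as a top-level loop
+// if the preheader is no longer inside any loop).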
+
+/// Helper to visit a dominator subtree, invoking a callable on each node.
+///
+/// Returning false at any point will stop walking past that node of the tree.
+template <typename CallableT>
+void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
+ SmallVector<DomTreeNode *, 4> DomWorklist;
+ DomWorklist.push_back(DT[BB]);
+#ifndef NDEBUG
+ SmallPtrSet<DomTreeNode *, 4> Visited;
+ Visited.insert(DT[BB]);
+#endif
+ do {
+ DomTreeNode *N = DomWorklist.pop_back_val();
+
+ // Visit this node.
+ if (!Callable(N->getBlock()))
+ continue;
+
+ // Accumulate the child nodes.
+ for (DomTreeNode *ChildN : *N) {
+ assert(Visited.insert(ChildN).second &&
+ "Cannot visit a node twice when walking a tree!");
+ DomWorklist.push_back(ChildN);
+ }
+ } while (!DomWorklist.empty());
+}
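+
+// A minimal usage sketch for the helper above (DT, BB and L stand in for
+// caller-provided state; `Dominated` is a hypothetical result set):
+//
+//   SmallPtrSet<BasicBlock *, 8> Dominated;
+//   visitDomSubTree(DT, BB, [&](BasicBlock *DomBB) {
+//     if (!L.contains(DomBB))
+//       return false; // Prune the walk below blocks outside the loop.
+//     Dominated.insert(DomBB);
+//     return true;
+//   });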
+
+static void unswitchNontrivialInvariants(
+ Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ auto *ParentBB = TI.getParent();
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
+
+ // We can only unswitch switches, conditional branches with an invariant
+ // condition, or combining invariant conditions with an instruction.
+ assert((SI || (BI && BI->isConditional())) &&
+ "Can only unswitch switches and conditional branch!");
+ bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
+ if (FullUnswitch)
+ assert(Invariants.size() == 1 &&
+ "Cannot have other invariants with full unswitching!");
+ else
+ assert(isa<Instruction>(BI->getCondition()) &&
+ "Partial unswitching requires an instruction as the condition!");
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Constant and BBs tracking the cloned and continuing successor. When we are
+ // unswitching the entire condition, this can just be trivially chosen to
+ // unswitch towards `true`. However, when we are unswitching a set of
+ // invariants combined with `and` or `or`, the combining operation determines
+ // the best direction to unswitch: we want to unswitch the direction that will
+ // collapse the branch.
+ bool Direction = true;
+ int ClonedSucc = 0;
+ if (!FullUnswitch) {
+ if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) {
+ assert(cast<Instruction>(BI->getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Only `or` and `and` instructions can combine invariants being "
+ "unswitched.");
+ Direction = false;
+ ClonedSucc = 1;
+ }
+ }
+
+ BasicBlock *RetainedSuccBB =
+ BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
+ SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
+ if (BI)
+ UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
+ else
+ for (auto Case : SI->cases())
+ if (Case.getCaseSuccessor() != RetainedSuccBB)
+ UnswitchedSuccBBs.insert(Case.getCaseSuccessor());
+
+ assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
+ "Should not unswitch the same successor we are retaining!");
+
+ // The branch should be in this exact loop. Any inner loop's invariant branch
+ // should be handled by unswitching that inner loop. The caller of this
+ // routine should filter out any candidates that remain (but were skipped for
+ // whatever reason).
+ assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
+
+ // Compute the parent loop now before we start hacking on things.
+ Loop *ParentL = L.getParentLoop();
+ // Get blocks in RPO order for MSSA update, before changing the CFG.
+ LoopBlocksRPO LBRPO(&L);
+ if (MSSAU)
+ LBRPO.perform(&LI);
+
+ // Compute the outer-most loop containing one of our exit blocks. This is the
+ // furthest up our loop nest which can be mutated, which we will use below to
+ // update things.
+ Loop *OuterExitL = &L;
+ for (auto *ExitBB : ExitBlocks) {
+ Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
+ if (!NewOuterExitL) {
+ // We exited the entire nest with this block, so we're done.
+ OuterExitL = nullptr;
+ break;
+ }
+ if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
+ OuterExitL = NewOuterExitL;
+ }
+
+ // At this point, we're definitely going to unswitch something so invalidate
+ // any cached information in ScalarEvolution for the outermost loop
+ // containing an exit block and all nested loops.
+ if (SE) {
+ if (OuterExitL)
+ SE->forgetLoop(OuterExitL);
+ else
+ SE->forgetTopmostLoop(&L);
+ }
+
+ // If the edge from this terminator to a successor dominates that successor,
+ // store a map from each block in its dominator subtree to it. This lets us
+ // tell when cloning for a particular successor if a block is dominated by
+ // some *other* successor with a single data structure. We use this to
+ // significantly reduce cloning.
+ SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
+ for (auto *SuccBB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
+ }))
+ visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
+ DominatingSucc[BB] = SuccBB;
+ return true;
+ });
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond. The original preheader will become the split point
+ // between the unswitched versions, and we will have a new preheader for the
+ // original loop.
+ BasicBlock *SplitBB = L.getLoopPreheader();
+ BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU);
+
+ // Keep track of the dominator tree updates needed.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+
+ // Clone the loop for each unswitched successor.
+ SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+ VMaps.reserve(UnswitchedSuccBBs.size());
+ SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
+ for (auto *SuccBB : UnswitchedSuccBBs) {
+ VMaps.emplace_back(new ValueToValueMapTy());
+ ClonedPHs[SuccBB] = buildClonedLoopBlocks(
+ L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
+ DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU);
+ }
+
// Drop metadata if we may break its semantics by moving this instr into the
// split block.
if (TI.getMetadata(LLVMContext::MD_make_implicit)) {
@@ -2107,967 +2107,967 @@ static void unswitchNontrivialInvariants(
}
}
- // The stitching of the branched code back together depends on whether we're
- // doing full unswitching or not with the exception that we always want to
- // nuke the initial terminator placed in the split block.
- SplitBB->getTerminator()->eraseFromParent();
- if (FullUnswitch) {
- // Splice the terminator from the original loop and rewrite its
- // successors.
- SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
-
- // Keep a clone of the terminator for MSSA updates.
- Instruction *NewTI = TI.clone();
- ParentBB->getInstList().push_back(NewTI);
-
- // First wire up the moved terminator to the preheaders.
- if (BI) {
- BasicBlock *ClonedPH = ClonedPHs.begin()->second;
- BI->setSuccessor(ClonedSucc, ClonedPH);
- BI->setSuccessor(1 - ClonedSucc, LoopPH);
- DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
- } else {
- assert(SI && "Must either be a branch or switch!");
-
- // Walk the cases and directly update their successors.
- assert(SI->getDefaultDest() == RetainedSuccBB &&
- "Not retaining default successor!");
- SI->setDefaultDest(LoopPH);
- for (auto &Case : SI->cases())
- if (Case.getCaseSuccessor() == RetainedSuccBB)
- Case.setSuccessor(LoopPH);
- else
- Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
-
- // We need to use the set to populate domtree updates as even when there
- // are multiple cases pointing at the same successor we only want to
- // remove and insert one edge in the domtree.
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- DTUpdates.push_back(
- {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
- }
-
- if (MSSAU) {
- DT.applyUpdates(DTUpdates);
- DTUpdates.clear();
-
- // Remove all but one edge to the retained block and all unswitched
- // blocks. This is to avoid having duplicate entries in the cloned Phis,
- // when we know we only keep a single edge for each case.
- MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB);
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB);
-
- for (auto &VMap : VMaps)
- MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
- /*IgnoreIncomingWithNoClones=*/true);
- MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
-
- // Remove all edges to unswitched blocks.
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- MSSAU->removeEdge(ParentBB, SuccBB);
- }
-
- // Now unhook the successor relationship as we'll be replacing
- // the terminator with a direct branch. This is much simpler for branches
- // than switches so we handle those first.
- if (BI) {
- // Remove the parent as a predecessor of the unswitched successor.
- assert(UnswitchedSuccBBs.size() == 1 &&
- "Only one possible unswitched block for a branch!");
- BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
- UnswitchedSuccBB->removePredecessor(ParentBB,
- /*KeepOneInputPHIs*/ true);
- DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
- } else {
- // Note that we actually want to remove the parent block as a predecessor
- // of *every* case successor. The case successor is either unswitched,
- // completely eliminating an edge from the parent to that successor, or it
- // is a duplicate edge to the retained successor as the retained successor
- // is always the default successor and as we'll replace this with a direct
- // branch we no longer need the duplicate entries in the PHI nodes.
- SwitchInst *NewSI = cast<SwitchInst>(NewTI);
- assert(NewSI->getDefaultDest() == RetainedSuccBB &&
- "Not retaining default successor!");
- for (auto &Case : NewSI->cases())
- Case.getCaseSuccessor()->removePredecessor(
- ParentBB,
- /*KeepOneInputPHIs*/ true);
-
- // We need to use the set to populate domtree updates as even when there
- // are multiple cases pointing at the same successor we only want to
- // remove and insert one edge in the domtree.
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
- }
-
- // After MSSAU update, remove the cloned terminator instruction NewTI.
- ParentBB->getTerminator()->eraseFromParent();
-
- // Create a new unconditional branch to the continuing block (as opposed to
- // the one cloned).
- BranchInst::Create(RetainedSuccBB, ParentBB);
- } else {
- assert(BI && "Only branches have partial unswitching.");
- assert(UnswitchedSuccBBs.size() == 1 &&
- "Only one possible unswitched block for a branch!");
- BasicBlock *ClonedPH = ClonedPHs.begin()->second;
- // When doing a partial unswitch, we have to do a bit more work to build up
- // the branch in the split block.
- buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
- *ClonedPH, *LoopPH);
- DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
-
- if (MSSAU) {
- DT.applyUpdates(DTUpdates);
- DTUpdates.clear();
-
- // Perform MSSA cloning updates.
- for (auto &VMap : VMaps)
- MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
- /*IgnoreIncomingWithNoClones=*/true);
- MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
- }
- }
-
- // Apply the updates accumulated above to get an up-to-date dominator tree.
- DT.applyUpdates(DTUpdates);
-
- // Now that we have an accurate dominator tree, first delete the dead cloned
- // blocks so that we can accurately build any cloned loops. It is important to
- // not delete the blocks from the original loop yet because we still want to
- // reference the original loop to understand the cloned loop's structure.
- deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU);
-
- // Build the cloned loop structure itself. This may be substantially
- // different from the original structure due to the simplified CFG. This also
- // handles inserting all the cloned blocks into the correct loops.
- SmallVector<Loop *, 4> NonChildClonedLoops;
- for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
- buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);
-
- // Now that our cloned loops have been built, we can update the original loop.
- // First we delete the dead blocks from it and then we rebuild the loop
- // structure taking these deletions into account.
- deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- SmallVector<Loop *, 4> HoistedLoops;
- bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // This transformation has a high risk of corrupting the dominator tree, and
- // the below steps to rebuild loop structures will result in hard to debug
- // errors in that case so verify that the dominator tree is sane first.
- // FIXME: Remove this when the bugs stop showing up and rely on existing
- // verification steps.
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-
- if (BI) {
- // If we unswitched a branch which collapses the condition to a known
- // constant we want to replace all the uses of the invariants within both
- // the original and cloned blocks. We do this here so that we can use the
- // now updated dominator tree to identify which side the users are on.
- assert(UnswitchedSuccBBs.size() == 1 &&
- "Only one possible unswitched block for a branch!");
- BasicBlock *ClonedPH = ClonedPHs.begin()->second;
-
- // When considering multiple partially-unswitched invariants
- // we can't just replace them with constants in both branches.
- //
- // For 'AND' we infer that true branch ("continue") means true
- // for each invariant operand.
- // For 'OR' we can infer that false branch ("continue") means false
- // for each invariant operand.
- // So it happens that in the multiple-partial case we don't replace
- // in the unswitched branch.
- bool ReplaceUnswitched = FullUnswitch || (Invariants.size() == 1);
-
- ConstantInt *UnswitchedReplacement =
- Direction ? ConstantInt::getTrue(BI->getContext())
- : ConstantInt::getFalse(BI->getContext());
- ConstantInt *ContinueReplacement =
- Direction ? ConstantInt::getFalse(BI->getContext())
- : ConstantInt::getTrue(BI->getContext());
- for (Value *Invariant : Invariants)
- for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
- UI != UE;) {
- // Grab the use and walk past it so we can clobber it in the use list.
- Use *U = &*UI++;
- Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- if (!UserI)
- continue;
-
- // Replace it with the 'continue' side if in the main loop body, and the
- // unswitched if in the cloned blocks.
- if (DT.dominates(LoopPH, UserI->getParent()))
- U->set(ContinueReplacement);
- else if (ReplaceUnswitched &&
- DT.dominates(ClonedPH, UserI->getParent()))
- U->set(UnswitchedReplacement);
- }
- }
-
- // We can change which blocks are exit blocks of all the cloned sibling
- // loops, the current loop, and any parent loops which shared exit blocks
- // with the current loop. As a consequence, we need to re-form LCSSA for
- // them. But we shouldn't need to re-form LCSSA for any child loops.
- // FIXME: This could be made more efficient by tracking which exit blocks are
- // new, and focusing on them, but that isn't likely to be necessary.
- //
- // In order to reasonably rebuild LCSSA we need to walk inside-out across the
- // loop nest and update every loop that could have had its exits changed. We
- // also need to cover any intervening loops. We add all of these loops to
- // a list and sort them by loop depth to achieve this without updating
- // unnecessary loops.
- auto UpdateLoop = [&](Loop &UpdateL) {
-#ifndef NDEBUG
- UpdateL.verifyLoop();
- for (Loop *ChildL : UpdateL) {
- ChildL->verifyLoop();
- assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
- "Perturbed a child loop's LCSSA form!");
- }
-#endif
- // First build LCSSA for this loop so that we can preserve it when
- // forming dedicated exits. We don't want to perturb some other loop's
- // LCSSA while doing that CFG edit.
- formLCSSA(UpdateL, DT, &LI, SE);
-
- // For loops reached by this loop's original exit blocks we may have
- // introduced new, non-dedicated exits. At least try to re-form dedicated
- // exits for these loops. This may fail if they couldn't have dedicated
- // exits to start with.
- formDedicatedExitBlocks(&UpdateL, &DT, &LI, MSSAU, /*PreserveLCSSA*/ true);
- };
-
- // For non-child cloned loops and hoisted loops, we just need to update LCSSA
- // and we can do it in any order as they don't nest relative to each other.
- //
- // Also check if any of the loops we have updated have become top-level loops
- // as that will necessitate widening the outer loop scope.
- for (Loop *UpdatedL :
- llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
- UpdateLoop(*UpdatedL);
+ // The stitching of the branched code back together depends on whether we're
+ // doing full unswitching or not with the exception that we always want to
+ // nuke the initial terminator placed in the split block.
+ SplitBB->getTerminator()->eraseFromParent();
+ if (FullUnswitch) {
+ // Splice the terminator from the original loop and rewrite its
+ // successors.
+ SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
+
+ // Keep a clone of the terminator for MSSA updates.
+ Instruction *NewTI = TI.clone();
+ ParentBB->getInstList().push_back(NewTI);
+
+ // First wire up the moved terminator to the preheaders.
+ if (BI) {
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ BI->setSuccessor(ClonedSucc, ClonedPH);
+ BI->setSuccessor(1 - ClonedSucc, LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ } else {
+ assert(SI && "Must either be a branch or switch!");
+
+ // Walk the cases and directly update their successors.
+ assert(SI->getDefaultDest() == RetainedSuccBB &&
+ "Not retaining default successor!");
+ SI->setDefaultDest(LoopPH);
+ for (auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == RetainedSuccBB)
+ Case.setSuccessor(LoopPH);
+ else
+ Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back(
+ {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
+ }
+
+ if (MSSAU) {
+ DT.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // Remove all but one edge to the retained block and all unswitched
+ // blocks. This is to avoid having duplicate entries in the cloned Phis,
+ // when we know we only keep a single edge for each case.
+ MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB);
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB);
+
+ for (auto &VMap : VMaps)
+ MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
+ /*IgnoreIncomingWithNoClones=*/true);
+ MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
+
+ // Remove all edges to unswitched blocks.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ MSSAU->removeEdge(ParentBB, SuccBB);
+ }
+
+ // Now unhook the successor relationship as we'll be replacing
+ // the terminator with a direct branch. This is much simpler for branches
+ // than switches so we handle those first.
+ if (BI) {
+ // Remove the parent as a predecessor of the unswitched successor.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
+ UnswitchedSuccBB->removePredecessor(ParentBB,
+ /*KeepOneInputPHIs*/ true);
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
+ } else {
+ // Note that we actually want to remove the parent block as a predecessor
+ // of *every* case successor. The case successor is either unswitched,
+ // completely eliminating an edge from the parent to that successor, or it
+ // is a duplicate edge to the retained successor as the retained successor
+ // is always the default successor and as we'll replace this with a direct
+ // branch we no longer need the duplicate entries in the PHI nodes.
+ SwitchInst *NewSI = cast<SwitchInst>(NewTI);
+ assert(NewSI->getDefaultDest() == RetainedSuccBB &&
+ "Not retaining default successor!");
+ for (auto &Case : NewSI->cases())
+ Case.getCaseSuccessor()->removePredecessor(
+ ParentBB,
+ /*KeepOneInputPHIs*/ true);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
+ }
+
+ // After MSSAU update, remove the cloned terminator instruction NewTI.
+ ParentBB->getTerminator()->eraseFromParent();
+
+ // Create a new unconditional branch to the continuing block (as opposed to
+ // the one cloned).
+ BranchInst::Create(RetainedSuccBB, ParentBB);
+ } else {
+ assert(BI && "Only branches have partial unswitching.");
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ // When doing a partial unswitch, we have to do a bit more work to build up
+ // the branch in the split block.
+ buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
+ *ClonedPH, *LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+
+ if (MSSAU) {
+ DT.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // Perform MSSA cloning updates.
+ for (auto &VMap : VMaps)
+ MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
+ /*IgnoreIncomingWithNoClones=*/true);
+ MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
+ }
+ }
+
+ // Apply the updates accumulated above to get an up-to-date dominator tree.
+ DT.applyUpdates(DTUpdates);
+
+ // Now that we have an accurate dominator tree, first delete the dead cloned
+ // blocks so that we can accurately build any cloned loops. It is important to
+ // not delete the blocks from the original loop yet because we still want to
+ // reference the original loop to understand the cloned loop's structure.
+ deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU);
+
+ // Build the cloned loop structure itself. This may be substantially
+ // different from the original structure due to the simplified CFG. This also
+ // handles inserting all the cloned blocks into the correct loops.
+ SmallVector<Loop *, 4> NonChildClonedLoops;
+ for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
+ buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);
+
+ // Now that our cloned loops have been built, we can update the original loop.
+ // First we delete the dead blocks from it and then we rebuild the loop
+ // structure taking these deletions into account.
+ deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ SmallVector<Loop *, 4> HoistedLoops;
+ bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // This transformation has a high risk of corrupting the dominator tree, and
+ // the below steps to rebuild loop structures will result in hard to debug
+ // errors in that case so verify that the dominator tree is sane first.
+ // FIXME: Remove this when the bugs stop showing up and rely on existing
+ // verification steps.
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ if (BI) {
+ // If we unswitched a branch which collapses the condition to a known
+ // constant we want to replace all the uses of the invariants within both
+ // the original and cloned blocks. We do this here so that we can use the
+ // now updated dominator tree to identify which side the users are on.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+
+ // When considering multiple partially-unswitched invariants
+ // we can't just replace them with constants in both branches.
+ //
+ // For 'AND' we infer that true branch ("continue") means true
+ // for each invariant operand.
+ // For 'OR' we can infer that false branch ("continue") means false
+ // for each invariant operand.
+ // So it happens that in the multiple-partial case we don't replace
+ // in the unswitched branch.
+ bool ReplaceUnswitched = FullUnswitch || (Invariants.size() == 1);
+
+ ConstantInt *UnswitchedReplacement =
+ Direction ? ConstantInt::getTrue(BI->getContext())
+ : ConstantInt::getFalse(BI->getContext());
+ ConstantInt *ContinueReplacement =
+ Direction ? ConstantInt::getFalse(BI->getContext())
+ : ConstantInt::getTrue(BI->getContext());
+ for (Value *Invariant : Invariants)
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
+ UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI)
+ continue;
+
+ // Replace it with the 'continue' side if in the main loop body, and the
+ // unswitched if in the cloned blocks.
+ if (DT.dominates(LoopPH, UserI->getParent()))
+ U->set(ContinueReplacement);
+ else if (ReplaceUnswitched &&
+ DT.dominates(ClonedPH, UserI->getParent()))
+ U->set(UnswitchedReplacement);
+ }
+ }
+
+ // We can change which blocks are exit blocks of all the cloned sibling
+ // loops, the current loop, and any parent loops which shared exit blocks
+ // with the current loop. As a consequence, we need to re-form LCSSA for
+ // them. But we shouldn't need to re-form LCSSA for any child loops.
+ // FIXME: This could be made more efficient by tracking which exit blocks are
+ // new, and focusing on them, but that isn't likely to be necessary.
+ //
+ // In order to reasonably rebuild LCSSA we need to walk inside-out across the
+ // loop nest and update every loop that could have had its exits changed. We
+ // also need to cover any intervening loops. We add all of these loops to
+ // a list and sort them by loop depth to achieve this without updating
+ // unnecessary loops.
+ auto UpdateLoop = [&](Loop &UpdateL) {
+#ifndef NDEBUG
+ UpdateL.verifyLoop();
+ for (Loop *ChildL : UpdateL) {
+ ChildL->verifyLoop();
+ assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
+ "Perturbed a child loop's LCSSA form!");
+ }
+#endif
+ // First build LCSSA for this loop so that we can preserve it when
+ // forming dedicated exits. We don't want to perturb some other loop's
+ // LCSSA while doing that CFG edit.
+ formLCSSA(UpdateL, DT, &LI, SE);
+
+ // For loops reached by this loop's original exit blocks we may have
+ // introduced new, non-dedicated exits. At least try to re-form dedicated
+ // exits for these loops. This may fail if they couldn't have dedicated
+ // exits to start with.
+ formDedicatedExitBlocks(&UpdateL, &DT, &LI, MSSAU, /*PreserveLCSSA*/ true);
+ };
+
+ // For non-child cloned loops and hoisted loops, we just need to update LCSSA
+ // and we can do it in any order as they don't nest relative to each other.
+ //
+ // Also check if any of the loops we have updated have become top-level loops
+ // as that will necessitate widening the outer loop scope.
+ for (Loop *UpdatedL :
+ llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
+ UpdateLoop(*UpdatedL);
if (UpdatedL->isOutermost())
- OuterExitL = nullptr;
- }
- if (IsStillLoop) {
- UpdateLoop(L);
+ OuterExitL = nullptr;
+ }
+ if (IsStillLoop) {
+ UpdateLoop(L);
if (L.isOutermost())
- OuterExitL = nullptr;
- }
-
- // If the original loop had exit blocks, walk up through the outermost loop
- // of those exit blocks to update LCSSA and form updated dedicated exits.
- if (OuterExitL != &L)
- for (Loop *OuterL = ParentL; OuterL != OuterExitL;
- OuterL = OuterL->getParentLoop())
- UpdateLoop(*OuterL);
-
-#ifndef NDEBUG
- // Verify the entire loop structure to catch any incorrect updates before we
- // progress in the pass pipeline.
- LI.verify(DT);
-#endif
-
- // Now that we've unswitched something, make callbacks to report the changes.
- // For that we need to merge together the updated loops and the cloned loops
- // and check whether the original loop survived.
- SmallVector<Loop *, 4> SibLoops;
- for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
- if (UpdatedL->getParentLoop() == ParentL)
- SibLoops.push_back(UpdatedL);
- UnswitchCB(IsStillLoop, SibLoops);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- if (BI)
- ++NumBranches;
- else
- ++NumSwitches;
-}
-
-/// Recursively compute the cost of a dominator subtree based on the per-block
-/// cost map provided.
-///
-/// The recursive computation is memoized into the provided DT-indexed cost map
-/// to allow querying it for most nodes in the domtree without it becoming
-/// quadratic.
-static int
-computeDomSubtreeCost(DomTreeNode &N,
- const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
- SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
- // Don't accumulate cost (or recurse through) blocks not in our block cost
- // map and thus not part of the duplication cost being considered.
- auto BBCostIt = BBCostMap.find(N.getBlock());
- if (BBCostIt == BBCostMap.end())
- return 0;
-
- // Lookup this node to see if we already computed its cost.
- auto DTCostIt = DTCostMap.find(&N);
- if (DTCostIt != DTCostMap.end())
- return DTCostIt->second;
-
- // If not, we have to compute it. We can't use insert above and update
- // because computing the cost may insert more things into the map.
- int Cost = std::accumulate(
- N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
- return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
- });
- bool Inserted = DTCostMap.insert({&N, Cost}).second;
- (void)Inserted;
- assert(Inserted && "Should not insert a node while visiting children!");
- return Cost;
-}
-
-/// Turns a llvm.experimental.guard intrinsic into an implicit control flow branch,
-/// making the following replacement:
-///
-/// --code before guard--
-/// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
-/// --code after guard--
-///
-/// into
-///
-/// --code before guard--
-/// br i1 %cond, label %guarded, label %deopt
-///
-/// guarded:
-/// --code after guard--
-///
-/// deopt:
-/// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
-/// unreachable
-///
-/// It also makes all relevant DT and LI updates, so that all structures are in
-/// valid state after this transform.
-static BranchInst *
-turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
- BasicBlock *CheckBB = GI->getParent();
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Remove all CheckBB's successors from DomTree. A block can be seen among
- // successors more than once, but for DomTree it should be added only once.
- SmallPtrSet<BasicBlock *, 4> Successors;
- for (auto *Succ : successors(CheckBB))
- if (Successors.insert(Succ).second)
- DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
-
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
- BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
- // SplitBlockAndInsertIfThen inserts control flow that branches to
- // DeoptBlockTerm if the condition is true. We want the opposite.
- CheckBI->swapSuccessors();
-
- BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
- GuardedBlock->setName("guarded");
- CheckBI->getSuccessor(1)->setName("deopt");
- BasicBlock *DeoptBlock = CheckBI->getSuccessor(1);
-
- // We now have a new exit block.
- ExitBlocks.push_back(CheckBI->getSuccessor(1));
-
- if (MSSAU)
- MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI);
-
- GI->moveBefore(DeoptBlockTerm);
- GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
-
- // Add new successors of CheckBB into DomTree.
- for (auto *Succ : successors(CheckBB))
- DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
-
- // Now the blocks that used to be CheckBB's successors are GuardedBlock's
- // successors.
- for (auto *Succ : Successors)
- DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
-
- // Make proper changes to DT.
- DT.applyUpdates(DTUpdates);
- // Inform LI of a new loop block.
- L.addBasicBlockToLoop(GuardedBlock, LI);
-
- if (MSSAU) {
- MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
- MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- }
-
- ++NumGuards;
- return CheckBI;
-}
-
-/// The cost multiplier is a way to limit the potentially exponential behavior
-/// of loop-unswitch. The cost is multiplied in proportion to 2^(number of
-/// unswitch candidates available). It also accounts for the number of
-/// "sibling" loops, with the idea of accounting for previous unswitches that
-/// already happened on this cluster of loops. There was an attempt to keep
-/// this formula simple, just enough to limit the worst-case behavior. Even if
-/// it is not that simple now, it is still not an attempt to provide a detailed
-/// heuristic size prediction.
-///
-/// TODO: Make a proper accounting of "explosion" effect for all kinds of
-/// unswitch candidates, making adequate predictions instead of wild guesses.
-/// That requires knowing not just the number of "remaining" candidates but
-/// also costs of unswitching for each of these candidates.
-static int CalculateUnswitchCostMultiplier(
- Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
- ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
- UnswitchCandidates) {
-
- // Guards and other exiting conditions do not contribute to exponential
- // explosion as soon as they dominate the latch (otherwise there might be
- // another path to the latch remaining that does not allow eliminating the
- // loop copy on unswitch).
- BasicBlock *Latch = L.getLoopLatch();
- BasicBlock *CondBlock = TI.getParent();
- if (DT.dominates(CondBlock, Latch) &&
- (isGuard(&TI) ||
- llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
- return L.contains(SuccBB);
- }) <= 1)) {
- NumCostMultiplierSkipped++;
- return 1;
- }
-
- auto *ParentL = L.getParentLoop();
- int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
- : std::distance(LI.begin(), LI.end()));
- // Count the number of clones that all the candidates might cause during
- // unswitching. A branch/guard counts as 1; a switch as log2 of its cases.
- int UnswitchedClones = 0;
- for (auto Candidate : UnswitchCandidates) {
- Instruction *CI = Candidate.first;
- BasicBlock *CondBlock = CI->getParent();
- bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
- if (isGuard(CI)) {
- if (!SkipExitingSuccessors)
- UnswitchedClones++;
- continue;
- }
- int NonExitingSuccessors = llvm::count_if(
- successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
- return !SkipExitingSuccessors || L.contains(SuccBB);
- });
- UnswitchedClones += Log2_32(NonExitingSuccessors);
- }
-
- // Ignore up to the "unscaled candidates" number of unswitch candidates
- // when calculating the power-of-two scaling of the cost. The main idea
- // with this control is to allow a small number of unswitches to happen
- // and rely more on siblings multiplier (see below) when the number
- // of candidates is small.
- unsigned ClonesPower =
- std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
-
- // Allowing top-level loops to spread a bit more than nested ones.
- int SiblingsMultiplier =
- std::max((ParentL ? SiblingsCount
- : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
- 1);
- // Compute the cost multiplier in a way that won't overflow by saturating
- // at an upper bound.
- int CostMultiplier;
- if (ClonesPower > Log2_32(UnswitchThreshold) ||
- SiblingsMultiplier > UnswitchThreshold)
- CostMultiplier = UnswitchThreshold;
- else
- CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
- (int)UnswitchThreshold);
-
- LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
- << " (siblings " << SiblingsMultiplier << " * clones "
- << (1 << ClonesPower) << ")"
- << " for unswitch candidate: " << TI << "\n");
- return CostMultiplier;
-}
-
-static bool
-unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, TargetTransformInfo &TTI,
- function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
- // Collect all invariant conditions within this loop (as opposed to an inner
- // loop which would be handled when visiting that inner loop).
- SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
- UnswitchCandidates;
-
- // Whether or not we should also collect guards in the loop.
- bool CollectGuards = false;
- if (UnswitchGuards) {
- auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- if (GuardDecl && !GuardDecl->use_empty())
- CollectGuards = true;
- }
-
- for (auto *BB : L.blocks()) {
- if (LI.getLoopFor(BB) != &L)
- continue;
-
- if (CollectGuards)
- for (auto &I : *BB)
- if (isGuard(&I)) {
- auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
- // TODO: Support AND, OR conditions and partial unswitching.
- if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
- UnswitchCandidates.push_back({&I, {Cond}});
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- // We can only consider fully loop-invariant switch conditions as we need
- // to completely eliminate the switch after unswitching.
- if (!isa<Constant>(SI->getCondition()) &&
- L.isLoopInvariant(SI->getCondition()) && !BB->getUniqueSuccessor())
- UnswitchCandidates.push_back({SI, {SI->getCondition()}});
- continue;
- }
-
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
- BI->getSuccessor(0) == BI->getSuccessor(1))
- continue;
-
- if (L.isLoopInvariant(BI->getCondition())) {
- UnswitchCandidates.push_back({BI, {BI->getCondition()}});
- continue;
- }
-
- Instruction &CondI = *cast<Instruction>(BI->getCondition());
- if (CondI.getOpcode() != Instruction::And &&
- CondI.getOpcode() != Instruction::Or)
- continue;
-
- TinyPtrVector<Value *> Invariants =
- collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
- if (Invariants.empty())
- continue;
-
- UnswitchCandidates.push_back({BI, std::move(Invariants)});
- }
-
- // If we didn't find any candidates, we're done.
- if (UnswitchCandidates.empty())
- return false;
-
- // Check if there are irreducible CFG cycles in this loop. If so, we cannot
- // easily unswitch non-trivial edges out of the loop. Doing so might turn the
- // irreducible control flow into reducible control flow and introduce new
- // loops "out of thin air". If we ever discover important use cases for doing
- // this, we can add support to loop unswitch, but it is a lot of complexity
- // for what seems little or no real world benefit.
- LoopBlocksRPO RPOT(&L);
- RPOT.perform(&LI);
- if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
- return false;
-
- SmallVector<BasicBlock *, 4> ExitBlocks;
- L.getUniqueExitBlocks(ExitBlocks);
-
- // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
- // don't know how to split those exit blocks.
- // FIXME: We should teach SplitBlock to handle this and remove this
- // restriction.
- for (auto *ExitBB : ExitBlocks)
- if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) {
- dbgs() << "Cannot unswitch because of cleanuppad in exit block\n";
- return false;
- }
-
- LLVM_DEBUG(
- dbgs() << "Considering " << UnswitchCandidates.size()
- << " non-trivial loop invariant conditions for unswitching.\n");
-
- // Given that unswitching these terminators will require duplicating parts of
- // the loop, we need to be able to model that cost. Compute the ephemeral
- // values and set up a data structure to hold per-BB costs. We cache each
- // block's cost so that we don't recompute this when considering different
- // subsets of the loop for duplication during unswitching.
- SmallPtrSet<const Value *, 4> EphValues;
- CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
- SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
-
- // Compute the cost of each block, as well as the total loop cost. Also, bail
- // out if we see instructions which are incompatible with loop unswitching
- // (convergent, noduplicate, or cross-basic-block tokens).
- // FIXME: We might be able to safely handle some of these in non-duplicated
- // regions.
+ OuterExitL = nullptr;
+ }
+
+ // If the original loop had exit blocks, walk up through the outermost loop
+ // of those exit blocks to update LCSSA and form updated dedicated exits.
+ if (OuterExitL != &L)
+ for (Loop *OuterL = ParentL; OuterL != OuterExitL;
+ OuterL = OuterL->getParentLoop())
+ UpdateLoop(*OuterL);
+
+#ifndef NDEBUG
+ // Verify the entire loop structure to catch any incorrect updates before we
+ // progress in the pass pipeline.
+ LI.verify(DT);
+#endif
+
+ // Now that we've unswitched something, make callbacks to report the changes.
+ // For that we need to merge together the updated loops and the cloned loops
+ // and check whether the original loop survived.
+ SmallVector<Loop *, 4> SibLoops;
+ for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+ if (UpdatedL->getParentLoop() == ParentL)
+ SibLoops.push_back(UpdatedL);
+ UnswitchCB(IsStillLoop, SibLoops);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ if (BI)
+ ++NumBranches;
+ else
+ ++NumSwitches;
+}
+
+/// Recursively compute the cost of a dominator subtree based on the per-block
+/// cost map provided.
+///
+/// The recursive computation is memoized into the provided DT-indexed cost map
+/// to allow querying it for most nodes in the domtree without it becoming
+/// quadratic.
+static int
+computeDomSubtreeCost(DomTreeNode &N,
+ const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
+ SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
+ // Don't accumulate cost (or recurse through) blocks not in our block cost
+ // map and thus not part of the duplication cost being considered.
+ auto BBCostIt = BBCostMap.find(N.getBlock());
+ if (BBCostIt == BBCostMap.end())
+ return 0;
+
+ // Lookup this node to see if we already computed its cost.
+ auto DTCostIt = DTCostMap.find(&N);
+ if (DTCostIt != DTCostMap.end())
+ return DTCostIt->second;
+
+ // If not, we have to compute it. We can't use insert above and update
+ // because computing the cost may insert more things into the map.
+ int Cost = std::accumulate(
+ N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
+ return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
+ });
+ bool Inserted = DTCostMap.insert({&N, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Should not insert a node while visiting children!");
+ return Cost;
+}
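+
+// A tiny worked example (hypothetical blocks and costs): if BBCostMap holds
+// {A: 5, B: 3, C: 2} and A's domtree node has B and C as leaf children, then
+// computeDomSubtreeCost on A's node returns 5 + 3 + 2 = 10 and memoizes 10 in
+// DTCostMap, so a later query for the same subtree is a single map lookup
+// rather than another walk over B and C. Blocks missing from BBCostMap
+// contribute 0 and are not recursed into.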
+
+/// Turns a llvm.experimental.guard intrinsic into an implicit control flow branch,
+/// making the following replacement:
+///
+/// --code before guard--
+/// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+/// --code after guard--
+///
+/// into
+///
+/// --code before guard--
+/// br i1 %cond, label %guarded, label %deopt
+///
+/// guarded:
+/// --code after guard--
+///
+/// deopt:
+/// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+/// unreachable
+///
+/// It also makes all relevant DT and LI updates, so that all structures are in
+/// valid state after this transform.
+static BranchInst *
+turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
+ BasicBlock *CheckBB = GI->getParent();
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Remove all CheckBB's successors from DomTree. A block can be seen among
+ // successors more than once, but for DomTree it should be added only once.
+ SmallPtrSet<BasicBlock *, 4> Successors;
+ for (auto *Succ : successors(CheckBB))
+ if (Successors.insert(Succ).second)
+ DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
+
+ Instruction *DeoptBlockTerm =
+ SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
+ BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+ // SplitBlockAndInsertIfThen inserts control flow that branches to
+ // DeoptBlockTerm if the condition is true. We want the opposite.
+ CheckBI->swapSuccessors();
+
+ BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
+ GuardedBlock->setName("guarded");
+ CheckBI->getSuccessor(1)->setName("deopt");
+ BasicBlock *DeoptBlock = CheckBI->getSuccessor(1);
+
+ // We now have a new exit block.
+ ExitBlocks.push_back(CheckBI->getSuccessor(1));
+
+ if (MSSAU)
+ MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI);
+
+ GI->moveBefore(DeoptBlockTerm);
+ GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
+
+ // Add new successors of CheckBB into DomTree.
+ for (auto *Succ : successors(CheckBB))
+ DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
+
+ // Now the blocks that used to be CheckBB's successors are GuardedBlock's
+ // successors.
+ for (auto *Succ : Successors)
+ DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
+
+ // Make proper changes to DT.
+ DT.applyUpdates(DTUpdates);
+ // Inform LI of a new loop block.
+ L.addBasicBlockToLoop(GuardedBlock, LI);
+
+ if (MSSAU) {
+ MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
+ MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator);
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ }
+
+ ++NumGuards;
+ return CheckBI;
+}
+
+/// The cost multiplier is a way to limit the potentially exponential behavior
+/// of loop-unswitch. The cost is multiplied in proportion to 2^(number of
+/// unswitch candidates available). It also accounts for the number of
+/// "sibling" loops, with the idea of accounting for previous unswitches that
+/// already happened on this cluster of loops. There was an attempt to keep
+/// this formula simple, just enough to limit the worst-case behavior. Even if
+/// it is not that simple now, it is still not an attempt to provide a detailed
+/// heuristic size prediction.
+///
+/// TODO: Make a proper accounting of "explosion" effect for all kinds of
+/// unswitch candidates, making adequate predictions instead of wild guesses.
+/// That requires knowing not just the number of "remaining" candidates but
+/// also costs of unswitching for each of these candidates.
+static int CalculateUnswitchCostMultiplier(
+ Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
+ ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
+ UnswitchCandidates) {
+
+ // Guards and other exiting conditions do not contribute to exponential
+ // explosion as soon as they dominate the latch (otherwise there might be
+ // another path to the latch remaining that does not allow eliminating the
+ // loop copy on unswitch).
+ BasicBlock *Latch = L.getLoopLatch();
+ BasicBlock *CondBlock = TI.getParent();
+ if (DT.dominates(CondBlock, Latch) &&
+ (isGuard(&TI) ||
+ llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
+ return L.contains(SuccBB);
+ }) <= 1)) {
+ NumCostMultiplierSkipped++;
+ return 1;
+ }
+
+ auto *ParentL = L.getParentLoop();
+ int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
+ : std::distance(LI.begin(), LI.end()));
+ // Count the number of clones that all the candidates might cause during
+ // unswitching. A branch/guard counts as 1; a switch as log2 of its cases.
+ int UnswitchedClones = 0;
+ for (auto Candidate : UnswitchCandidates) {
+ Instruction *CI = Candidate.first;
+ BasicBlock *CondBlock = CI->getParent();
+ bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
+ if (isGuard(CI)) {
+ if (!SkipExitingSuccessors)
+ UnswitchedClones++;
+ continue;
+ }
+ int NonExitingSuccessors = llvm::count_if(
+ successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
+ return !SkipExitingSuccessors || L.contains(SuccBB);
+ });
+ UnswitchedClones += Log2_32(NonExitingSuccessors);
+ }
+
+ // Ignore up to the "unscaled candidates" number of unswitch candidates
+ // when calculating the power-of-two scaling of the cost. The main idea
+ // with this control is to allow a small number of unswitches to happen
+ // and rely more on the siblings multiplier (see below) when the number
+ // of candidates is small.
+ unsigned ClonesPower =
+ std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
+
+ // Allow top-level loops to spread a bit more than nested ones.
+ int SiblingsMultiplier =
+ std::max((ParentL ? SiblingsCount
+ : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
+ 1);
+ // Compute the cost multiplier in a way that won't overflow by saturating
+ // at an upper bound.
+ int CostMultiplier;
+ if (ClonesPower > Log2_32(UnswitchThreshold) ||
+ SiblingsMultiplier > UnswitchThreshold)
+ CostMultiplier = UnswitchThreshold;
+ else
+ CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
+ (int)UnswitchThreshold);
+
+ LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
+ << " (siblings " << SiblingsMultiplier << " * clones "
+ << (1 << ClonesPower) << ")"
+ << " for unswitch candidate: " << TI << "\n");
+ return CostMultiplier;
+}
+
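A hedged, standalone sketch of the multiplier arithmetic above in plain C++; the input values and the Log2 helper are illustrative (Log2 mirrors the floor-log2 behavior of Log2_32), not the pass's real option plumbing.

#include <algorithm>

static int Log2(int V) { // floor(log2(V)) for V >= 1
  int R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

static int multiplierSketch(int UnswitchedClones, int SiblingsCount,
                            bool HasParentLoop, int UnscaledCandidates,
                            int SiblingsToplevelDiv, int Threshold) {
  // Power-of-two scaling by the clones the remaining candidates could create,
  // ignoring the first few "unscaled" candidates.
  int ClonesPower = std::max(UnswitchedClones - UnscaledCandidates, 0);
  // Top-level loops divide their sibling count, so they may spread a bit more.
  int SiblingsMultiplier = std::max(
      HasParentLoop ? SiblingsCount : SiblingsCount / SiblingsToplevelDiv, 1);
  // Saturate at the threshold so the multiplication cannot overflow.
  if (ClonesPower > Log2(Threshold) || SiblingsMultiplier > Threshold)
    return Threshold;
  return std::min(SiblingsMultiplier * (1 << ClonesPower), Threshold);
}

// e.g. multiplierSketch(/*UnswitchedClones=*/3, /*SiblingsCount=*/4,
//                       /*HasParentLoop=*/true, /*UnscaledCandidates=*/2,
//                       /*SiblingsToplevelDiv=*/2, /*Threshold=*/100) == 8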
+static bool
+unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ // Collect all invariant conditions within this loop (as opposed to an inner
+ // loop which would be handled when visiting that inner loop).
+ SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
+ UnswitchCandidates;
+
+ // Whether or not we should also collect guards in the loop.
+ bool CollectGuards = false;
+ if (UnswitchGuards) {
+ auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (GuardDecl && !GuardDecl->use_empty())
+ CollectGuards = true;
+ }
+
+ for (auto *BB : L.blocks()) {
+ if (LI.getLoopFor(BB) != &L)
+ continue;
+
+ if (CollectGuards)
+ for (auto &I : *BB)
+ if (isGuard(&I)) {
+ auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
+ // TODO: Support AND, OR conditions and partial unswitching.
+ if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
+ UnswitchCandidates.push_back({&I, {Cond}});
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ // We can only consider fully loop-invariant switch conditions as we need
+ // to completely eliminate the switch after unswitching.
+ if (!isa<Constant>(SI->getCondition()) &&
+ L.isLoopInvariant(SI->getCondition()) && !BB->getUniqueSuccessor())
+ UnswitchCandidates.push_back({SI, {SI->getCondition()}});
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
+ BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
+
+ if (L.isLoopInvariant(BI->getCondition())) {
+ UnswitchCandidates.push_back({BI, {BI->getCondition()}});
+ continue;
+ }
+
+ Instruction &CondI = *cast<Instruction>(BI->getCondition());
+ if (CondI.getOpcode() != Instruction::And &&
+ CondI.getOpcode() != Instruction::Or)
+ continue;
+
+ TinyPtrVector<Value *> Invariants =
+ collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
+ if (Invariants.empty())
+ continue;
+
+ UnswitchCandidates.push_back({BI, std::move(Invariants)});
+ }
+
+ // If we didn't find any candidates, we're done.
+ if (UnswitchCandidates.empty())
+ return false;
+
+ // Check if there are irreducible CFG cycles in this loop. If so, we cannot
+ // easily unswitch non-trivial edges out of the loop. Doing so might turn the
+ // irreducible control flow into reducible control flow and introduce new
+ // loops "out of thin air". If we ever discover important use cases for doing
+ // this, we can add support to loop unswitch, but it is a lot of complexity
+ // for what seems little or no real world benefit.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
+ if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
+ return false;
+
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L.getUniqueExitBlocks(ExitBlocks);
+
+ // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
+ // don't know how to split those exit blocks.
+ // FIXME: We should teach SplitBlock to handle this and remove this
+ // restriction.
+ for (auto *ExitBB : ExitBlocks)
+ if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) {
+ dbgs() << "Cannot unswitch because of cleanuppad in exit block\n";
+ return false;
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Considering " << UnswitchCandidates.size()
+ << " non-trivial loop invariant conditions for unswitching.\n");
+
+ // Given that unswitching these terminators will require duplicating parts of
+ // the loop, we need to be able to model that cost. Compute the ephemeral
+ // values and set up a data structure to hold per-BB costs. We cache each
+ // block's cost so that we don't recompute this when considering different
+ // subsets of the loop for duplication during unswitching.
+ SmallPtrSet<const Value *, 4> EphValues;
+ CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
+ SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
+
+ // Compute the cost of each block, as well as the total loop cost. Also, bail
+ // out if we see instructions which are incompatible with loop unswitching
+ // (convergent, noduplicate, or cross-basic-block tokens).
+ // FIXME: We might be able to safely handle some of these in non-duplicated
+ // regions.
TargetTransformInfo::TargetCostKind CostKind =
L.getHeader()->getParent()->hasMinSize()
? TargetTransformInfo::TCK_CodeSize
: TargetTransformInfo::TCK_SizeAndLatency;
- int LoopCost = 0;
- for (auto *BB : L.blocks()) {
- int Cost = 0;
- for (auto &I : *BB) {
- if (EphValues.count(&I))
- continue;
-
- if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
- return false;
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (CB->isConvergent() || CB->cannotDuplicate())
- return false;
-
+ int LoopCost = 0;
+ for (auto *BB : L.blocks()) {
+ int Cost = 0;
+ for (auto &I : *BB) {
+ if (EphValues.count(&I))
+ continue;
+
+ if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+ return false;
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->isConvergent() || CB->cannotDuplicate())
+ return false;
+
Cost += TTI.getUserCost(&I, CostKind);
- }
- assert(Cost >= 0 && "Must not have negative costs!");
- LoopCost += Cost;
- assert(LoopCost >= 0 && "Must not have negative loop costs!");
- BBCostMap[BB] = Cost;
- }
- LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
-
- // Now we find the best candidate by searching for the one with the following
- // properties in order:
- //
- // 1) An unswitching cost below the threshold
- // 2) The smallest number of duplicated unswitch candidates (to avoid
- // creating redundant subsequent unswitching)
- // 3) The smallest cost after unswitching.
- //
- // We prioritize reducing fanout of unswitch candidates provided the cost
- // remains below the threshold because this has a multiplicative effect.
- //
- // This requires memoizing each dominator subtree to avoid redundant work.
- //
- // FIXME: Need to actually do the number of candidates part above.
- SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
- // Given a terminator which might be unswitched, computes the non-duplicated
- // cost for that terminator.
- auto ComputeUnswitchedCost = [&](Instruction &TI, bool FullUnswitch) {
- BasicBlock &BB = *TI.getParent();
- SmallPtrSet<BasicBlock *, 4> Visited;
-
- int Cost = LoopCost;
- for (BasicBlock *SuccBB : successors(&BB)) {
- // Don't count successors more than once.
- if (!Visited.insert(SuccBB).second)
- continue;
-
- // If this is a partial unswitch candidate, then it must be a conditional
- // branch with a condition of either `or` or `and`. In that case, one of
- // the successors is necessarily duplicated, so don't even try to remove
- // its cost.
- if (!FullUnswitch) {
- auto &BI = cast<BranchInst>(TI);
- if (cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::And) {
- if (SuccBB == BI.getSuccessor(1))
- continue;
- } else {
- assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::Or &&
- "Only `and` and `or` conditions can result in a partial "
- "unswitch!");
- if (SuccBB == BI.getSuccessor(0))
- continue;
- }
- }
-
- // This successor's domtree will not need to be duplicated after
- // unswitching if the edge to the successor dominates it (and thus the
- // entire tree). This essentially means there is no other path into this
- // subtree and so it will end up live in only one clone of the loop.
- if (SuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
- return PredBB == &BB || DT.dominates(SuccBB, PredBB);
- })) {
- Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
- assert(Cost >= 0 &&
- "Non-duplicated cost should never exceed total loop cost!");
- }
- }
-
- // Now scale the cost by the number of unique successors minus one. We
- // subtract one because there is already at least one copy of the entire
- // loop. This is computing the new cost of unswitching a condition.
- // Note that guards always have 2 unique successors that are implicit and
- // will be materialized if we decide to unswitch it.
- int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
- assert(SuccessorsCount > 1 &&
- "Cannot unswitch a condition without multiple distinct successors!");
- return Cost * (SuccessorsCount - 1);
- };
- Instruction *BestUnswitchTI = nullptr;
- int BestUnswitchCost = 0;
- ArrayRef<Value *> BestUnswitchInvariants;
- for (auto &TerminatorAndInvariants : UnswitchCandidates) {
- Instruction &TI = *TerminatorAndInvariants.first;
- ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
- BranchInst *BI = dyn_cast<BranchInst>(&TI);
- int CandidateCost = ComputeUnswitchedCost(
- TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
- Invariants[0] == BI->getCondition()));
- // Calculate the cost multiplier, which is a tool to limit potentially
- // exponential behavior of loop-unswitch.
- if (EnableUnswitchCostMultiplier) {
- int CostMultiplier =
- CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
- assert(
- (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
- "cost multiplier needs to be in the range of 1..UnswitchThreshold");
- CandidateCost *= CostMultiplier;
- LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
- << " (multiplier: " << CostMultiplier << ")"
- << " for unswitch candidate: " << TI << "\n");
- } else {
- LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
- << " for unswitch candidate: " << TI << "\n");
- }
-
- if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
- BestUnswitchTI = &TI;
- BestUnswitchCost = CandidateCost;
- BestUnswitchInvariants = Invariants;
- }
- }
- assert(BestUnswitchTI && "Failed to find loop unswitch candidate");
-
- if (BestUnswitchCost >= UnswitchThreshold) {
- LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
- << BestUnswitchCost << "\n");
- return false;
- }
-
- // If the best candidate is a guard, turn it into a branch.
- if (isGuard(BestUnswitchTI))
- BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
- ExitBlocks, DT, LI, MSSAU);
-
- LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = "
- << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
- << "\n");
- unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
- ExitBlocks, DT, LI, AC, UnswitchCB, SE, MSSAU);
- return true;
-}
-
-/// Unswitch control flow predicated on loop invariant conditions.
-///
-/// This first hoists all branches or switches which are trivial (i.e., do not
-/// require duplicating any part of the loop) out of the loop body. It then
-/// looks at other loop invariant control flows and tries to unswitch those as
-/// well by cloning the loop if the result is small enough.
-///
-/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
-/// updated based on the unswitch.
-/// The `MSSA` analysis is also updated if valid (i.e. its use is enabled).
-///
-/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
-/// true, we will attempt to do non-trivial unswitching as well as trivial
-/// unswitching.
-///
-/// The `UnswitchCB` callback provided will be run after unswitching is
-/// complete, with the first parameter set to `true` if the provided loop
-/// remains a loop, and a list of new sibling loops created.
-///
-/// If `SE` is non-null, we will update that analysis based on the unswitching
-/// done.
-static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, TargetTransformInfo &TTI,
- bool NonTrivial,
- function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
- assert(L.isRecursivelyLCSSAForm(DT, LI) &&
- "Loops must be in LCSSA form before unswitching.");
-
- // Must be in loop simplified form: we need a preheader and dedicated exits.
- if (!L.isLoopSimplifyForm())
- return false;
-
- // Try trivial unswitch first before looping over other basic blocks in the loop.
- if (unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) {
- // If we unswitched successfully we will want to clean up the loop before
- // processing it further so just mark it as unswitched and return.
- UnswitchCB(/*CurrentLoopValid*/ true, {});
- return true;
- }
-
- // If we're not doing non-trivial unswitching, we're done. We both accept
- // a parameter and check a local flag that can be used for testing
- // and debugging.
- if (!NonTrivial && !EnableNonTrivialUnswitch)
- return false;
-
+ }
+ assert(Cost >= 0 && "Must not have negative costs!");
+ LoopCost += Cost;
+ assert(LoopCost >= 0 && "Must not have negative loop costs!");
+ BBCostMap[BB] = Cost;
+ }
+ LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
+
+ // Now we find the best candidate by searching for the one with the following
+ // properties in order:
+ //
+ // 1) An unswitching cost below the threshold
+ // 2) The smallest number of duplicated unswitch candidates (to avoid
+ // creating redundant subsequent unswitching)
+ // 3) The smallest cost after unswitching.
+ //
+ // We prioritize reducing fanout of unswitch candidates provided the cost
+ // remains below the threshold because this has a multiplicative effect.
+ //
+ // This requires memoizing each dominator subtree to avoid redundant work.
+ //
+ // FIXME: Need to actually do the number of candidates part above.
+ SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
+ // Given a terminator which might be unswitched, computes the non-duplicated
+ // cost for that terminator.
+ auto ComputeUnswitchedCost = [&](Instruction &TI, bool FullUnswitch) {
+ BasicBlock &BB = *TI.getParent();
+ SmallPtrSet<BasicBlock *, 4> Visited;
+
+ int Cost = LoopCost;
+ for (BasicBlock *SuccBB : successors(&BB)) {
+ // Don't count successors more than once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ // If this is a partial unswitch candidate, then it must be a conditional
+ // branch with a condition of either `or` or `and`. In that case, one of
+ // the successors is necessarily duplicated, so don't even try to remove
+ // its cost.
+ if (!FullUnswitch) {
+ auto &BI = cast<BranchInst>(TI);
+ if (cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And) {
+ if (SuccBB == BI.getSuccessor(1))
+ continue;
+ } else {
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Only `and` and `or` conditions can result in a partial "
+ "unswitch!");
+ if (SuccBB == BI.getSuccessor(0))
+ continue;
+ }
+ }
+
+ // This successor's domtree will not need to be duplicated after
+ // unswitching if the edge to the successor dominates it (and thus the
+ // entire tree). This essentially means there is no other path into this
+ // subtree and so it will end up live in only one clone of the loop.
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == &BB || DT.dominates(SuccBB, PredBB);
+ })) {
+ Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
+ assert(Cost >= 0 &&
+ "Non-duplicated cost should never exceed total loop cost!");
+ }
+ }
+
+ // Now scale the cost by the number of unique successors minus one. We
+ // subtract one because there is already at least one copy of the entire
+ // loop. This is computing the new cost of unswitching a condition.
+ // Note that guards always have 2 unique successors that are implicit and
+ // will be materialized if we decide to unswitch it.
+ int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
+ assert(SuccessorsCount > 1 &&
+ "Cannot unswitch a condition without multiple distinct successors!");
+ return Cost * (SuccessorsCount - 1);
+ };
+ Instruction *BestUnswitchTI = nullptr;
+ int BestUnswitchCost = 0;
+ ArrayRef<Value *> BestUnswitchInvariants;
+ for (auto &TerminatorAndInvariants : UnswitchCandidates) {
+ Instruction &TI = *TerminatorAndInvariants.first;
+ ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ int CandidateCost = ComputeUnswitchedCost(
+ TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
+ Invariants[0] == BI->getCondition()));
+ // Calculate the cost multiplier, which is a tool to limit potentially
+ // exponential behavior of loop-unswitch.
+ if (EnableUnswitchCostMultiplier) {
+ int CostMultiplier =
+ CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
+ assert(
+ (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
+ "cost multiplier needs to be in the range of 1..UnswitchThreshold");
+ CandidateCost *= CostMultiplier;
+ LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " (multiplier: " << CostMultiplier << ")"
+ << " for unswitch candidate: " << TI << "\n");
+ } else {
+ LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " for unswitch candidate: " << TI << "\n");
+ }
+
+ if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
+ BestUnswitchTI = &TI;
+ BestUnswitchCost = CandidateCost;
+ BestUnswitchInvariants = Invariants;
+ }
+ }
+ assert(BestUnswitchTI && "Failed to find loop unswitch candidate");
+
+ if (BestUnswitchCost >= UnswitchThreshold) {
+ LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
+ << BestUnswitchCost << "\n");
+ return false;
+ }
+
+ // If the best candidate is a guard, turn it into a branch.
+ if (isGuard(BestUnswitchTI))
+ BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
+ ExitBlocks, DT, LI, MSSAU);
+
+ LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = "
+ << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
+ << "\n");
+ unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
+ ExitBlocks, DT, LI, AC, UnswitchCB, SE, MSSAU);
+ return true;
+}
+
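A hedged arithmetic sketch of the ComputeUnswitchedCost scaling used in unswitchBestCondition above; the numbers are made up rather than real TTI costs.

// Made-up inputs: a loop whose blocks cost 100 in total, and a candidate
// switch with 3 distinct successors where one successor's dominator subtree
// (cost 40) is reached only through that edge and therefore ends up in a
// single clone of the loop.
static int unswitchedCostSketch() {
  int LoopCost = 100;
  int NonDuplicatedSubtreeCost = 40; // subtracted once, as in the lambda
  int Cost = LoopCost - NonDuplicatedSubtreeCost;
  int SuccessorsCount = 3;           // unique successors of the candidate
  // One copy of the loop already exists, so the added cost is scaled by
  // (SuccessorsCount - 1): here 60 * 2 = 120, compared against the threshold.
  return Cost * (SuccessorsCount - 1);
}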
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+///
+/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
+/// updated based on the unswitch.
+/// The `MSSA` analysis is also updated if valid (i.e. its use is enabled).
+///
+/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
+/// true, we will attempt to do non-trivial unswitching as well as trivial
+/// unswitching.
+///
+/// The `UnswitchCB` callback provided will be run after unswitching is
+/// complete, with the first parameter set to `true` if the provided loop
+/// remains a loop, and a list of new sibling loops created.
+///
+/// If `SE` is non-null, we will update that analysis based on the unswitching
+/// done.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ bool NonTrivial,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ assert(L.isRecursivelyLCSSAForm(DT, LI) &&
+ "Loops must be in LCSSA form before unswitching.");
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitch first before looping over other basic blocks in the loop.
+ if (unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) {
+ // If we unswitched successfully we will want to clean up the loop before
+ // processing it further so just mark it as unswitched and return.
+ UnswitchCB(/*CurrentLoopValid*/ true, {});
+ return true;
+ }
+
+ // If we're not doing non-trivial unswitching, we're done. We both accept
+ // a parameter and check a local flag that can be used for testing
+ // and debugging.
+ if (!NonTrivial && !EnableNonTrivialUnswitch)
+ return false;
+
// Skip non-trivial unswitching for optsize functions.
if (L.getHeader()->getParent()->hasOptSize())
return false;
- // For non-trivial unswitching, because it often creates new loops, we rely on
- // the pass manager to iterate on the loops rather than trying to immediately
- // reach a fixed point. There is no substantial advantage to iterating
- // internally, and if any of the new loops are simplified enough to contain
- // trivial unswitching we want to prefer those.
-
- // Try to unswitch the best invariant condition. We prefer a full unswitch to
- // a partial unswitch whenever that is possible within the cost threshold.
- if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE, MSSAU))
- return true;
-
- // No other opportunities to unswitch.
+ // For non-trivial unswitching, because it often creates new loops, we rely on
+ // the pass manager to iterate on the loops rather than trying to immediately
+ // reach a fixed point. There is no substantial advantage to iterating
+ // internally, and if any of the new loops are simplified enough to contain
+ // trivial unswitching we want to prefer those.
+
+ // Try to unswitch the best invariant condition. We prefer a full unswitch to
+ // a partial unswitch whenever that is possible within the cost threshold.
+ if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE, MSSAU))
+ return true;
+
+ // No other opportunities to unswitch.
return false;
-}
-
-PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- Function &F = *L.getHeader()->getParent();
- (void)F;
-
- LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
- << "\n");
-
- // Save the current loop name in a variable so that we can report it even
- // after it has been deleted.
- std::string LoopName = std::string(L.getName());
-
- auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
- // If we did a non-trivial unswitch, we have added new (cloned) loops.
- if (!NewLoops.empty())
- U.addSiblingLoops(NewLoops);
-
- // If the current loop remains valid, we should revisit it to catch any
- // other unswitch opportunities. Otherwise, we need to mark it as deleted.
- if (CurrentLoopValid)
- U.revisitCurrentLoop();
- else
- U.markLoopAsDeleted(L, LoopName);
- };
-
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA) {
- MSSAU = MemorySSAUpdater(AR.MSSA);
- if (VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
- }
- if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
- &AR.SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
- return PreservedAnalyses::all();
-
- if (AR.MSSA && VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
-
- // Historically this pass has had issues with the dominator tree so verify it
- // in asserts builds.
- assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-class SimpleLoopUnswitchLegacyPass : public LoopPass {
- bool NonTrivial;
-
-public:
- static char ID; // Pass ID, replacement for typeid
-
- explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
- : LoopPass(ID), NonTrivial(NonTrivial) {
- initializeSimpleLoopUnswitchLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
- return false;
-
- Function &F = *L->getHeader()->getParent();
-
- LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
- << "\n");
-
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- MemorySSA *MSSA = nullptr;
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- }
-
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- auto *SE = SEWP ? &SEWP->getSE() : nullptr;
-
- auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
- // If we did a non-trivial unswitch, we have added new (cloned) loops.
- for (auto *NewL : NewLoops)
- LPM.addLoop(*NewL);
-
- // If the current loop remains valid, re-add it to the queue. This is
- // a little wasteful as we'll finish processing the current loop as well,
- // but it is the best we can do in the old PM.
- if (CurrentLoopValid)
- LPM.addLoop(*L);
- else
- LPM.markLoopAsDeleted(*L);
- };
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- // Historically this pass has had issues with the dominator tree so verify it
- // in asserts builds.
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-
- return Changed;
-}
-
-char SimpleLoopUnswitchLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
- "Simple unswitch loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
- "Simple unswitch loops", false, false)
-
-Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
- return new SimpleLoopUnswitchLegacyPass(NonTrivial);
-}
+}
+
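For readers new to the transform documented above, a hedged source-level picture of what unswitching does; the pass works on LLVM IR, so this C++ is only an analogy.

static int sumSketch(const int *a, int n, bool flag) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    if (flag)          // loop-invariant condition tested every iteration
      s += a[i] * 2;
    else
      s += a[i];
  }
  return s;
}

// After unswitching on `flag`, the invariant test is hoisted out of the loop
// and the body is cloned once per branch direction:
static int sumUnswitched(const int *a, int n, bool flag) {
  int s = 0;
  if (flag) {
    for (int i = 0; i < n; ++i)
      s += a[i] * 2;
  } else {
    for (int i = 0; i < n; ++i)
      s += a[i];
  }
  return s;
}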
+PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function &F = *L.getHeader()->getParent();
+ (void)F;
+
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
+ << "\n");
+
+ // Save the current loop name in a variable so that we can report it even
+ // after it has been deleted.
+ std::string LoopName = std::string(L.getName());
+
+ auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ if (!NewLoops.empty())
+ U.addSiblingLoops(NewLoops);
+
+ // If the current loop remains valid, we should revisit it to catch any
+ // other unswitch opportunities. Otherwise, we need to mark it as deleted.
+ if (CurrentLoopValid)
+ U.revisitCurrentLoop();
+ else
+ U.markLoopAsDeleted(L, LoopName);
+ };
+
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA) {
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ if (VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+ }
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
+ &AR.SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
+ return PreservedAnalyses::all();
+
+ if (AR.MSSA && VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+class SimpleLoopUnswitchLegacyPass : public LoopPass {
+ bool NonTrivial;
+
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
+ : LoopPass(ID), NonTrivial(NonTrivial) {
+ initializeSimpleLoopUnswitchLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
+ << "\n");
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ MemorySSA *MSSA = nullptr;
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = MemorySSAUpdater(MSSA);
+ }
+
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ for (auto *NewL : NewLoops)
+ LPM.addLoop(*NewL);
+
+ // If the current loop remains valid, re-add it to the queue. This is
+ // a little wasteful as we'll finish processing the current loop as well,
+ // but it is the best we can do in the old PM.
+ if (CurrentLoopValid)
+ LPM.addLoop(*L);
+ else
+ LPM.markLoopAsDeleted(*L);
+ };
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ return Changed;
+}
+
+char SimpleLoopUnswitchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+
+Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
+ return new SimpleLoopUnswitchLegacyPass(NonTrivial);
+}
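A hedged usage sketch for the legacy entry point defined above; it assumes the usual declaration of createSimpleLoopUnswitchLegacyPass in llvm/Transforms/Scalar.h and is not taken from this file.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Scalar.h"

void addSimpleLoopUnswitch(llvm::legacy::PassManager &PM) {
  // Request non-trivial unswitching as well; the default is trivial-only.
  PM.add(llvm::createSimpleLoopUnswitchLegacyPass(/*NonTrivial=*/true));
}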
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index f06efd7f85..38e7109ead 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -1,145 +1,145 @@
-//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements dead code elimination and basic block merging, along
-// with a collection of other peephole control flow optimizations. For example:
-//
-// * Removes basic blocks with no predecessors.
-// * Merges a basic block into its predecessor if there is only one and the
-// predecessor only has one successor.
-// * Eliminates PHI nodes for basic blocks with a single predecessor.
-// * Eliminates a basic block that only contains an unconditional branch.
-// * Changes invoke instructions to nounwind functions to be calls.
-// * Change things like "if (x) if (y)" into "if (x&y)".
-// * etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations. For example:
+//
+// * Removes basic blocks with no predecessors.
+// * Merges a basic block into its predecessor if there is only one and the
+// predecessor only has one successor.
+// * Eliminates PHI nodes for basic blocks with a single predecessor.
+// * Eliminates a basic block that only contains an unconditional branch.
+// * Changes invoke instructions to nounwind functions to be calls.
+// * Change things like "if (x) if (y)" into "if (x&y)" (see the sketch
+//   after this header).
+// * etc.
+//
+//===----------------------------------------------------------------------===//
+
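A hedged source-level illustration of the last peephole listed in the header comment above; this is an analogy, not output of the pass.

static int nestedBranches(bool x, bool y) {
  if (x)
    if (y)
      return 1;
  return 0;
}

// After SimplifyCFG the two conditional branches are typically folded into a
// single branch on the combined predicate, roughly:
static int foldedBranches(bool x, bool y) {
  return (x & y) ? 1 : 0;
}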
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/SimplifyCFG.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
-#include <utility>
-using namespace llvm;
-
-#define DEBUG_TYPE "simplifycfg"
-
-static cl::opt<unsigned> UserBonusInstThreshold(
- "bonus-inst-threshold", cl::Hidden, cl::init(1),
- cl::desc("Control the number of bonus instructions (default = 1)"));
-
-static cl::opt<bool> UserKeepLoops(
- "keep-loops", cl::Hidden, cl::init(true),
- cl::desc("Preserve canonical loop structure (default = true)"));
-
-static cl::opt<bool> UserSwitchToLookup(
- "switch-to-lookup", cl::Hidden, cl::init(false),
- cl::desc("Convert switches to lookup tables (default = false)"));
-
-static cl::opt<bool> UserForwardSwitchCond(
- "forward-switch-cond", cl::Hidden, cl::init(false),
- cl::desc("Forward switch condition to phi ops (default = false)"));
-
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "simplifycfg"
+
+static cl::opt<unsigned> UserBonusInstThreshold(
+ "bonus-inst-threshold", cl::Hidden, cl::init(1),
+ cl::desc("Control the number of bonus instructions (default = 1)"));
+
+static cl::opt<bool> UserKeepLoops(
+ "keep-loops", cl::Hidden, cl::init(true),
+ cl::desc("Preserve canonical loop structure (default = true)"));
+
+static cl::opt<bool> UserSwitchToLookup(
+ "switch-to-lookup", cl::Hidden, cl::init(false),
+ cl::desc("Convert switches to lookup tables (default = false)"));
+
+static cl::opt<bool> UserForwardSwitchCond(
+ "forward-switch-cond", cl::Hidden, cl::init(false),
+ cl::desc("Forward switch condition to phi ops (default = false)"));
+
static cl::opt<bool> UserHoistCommonInsts(
"hoist-common-insts", cl::Hidden, cl::init(false),
cl::desc("hoist common instructions (default = false)"));
-static cl::opt<bool> UserSinkCommonInsts(
- "sink-common-insts", cl::Hidden, cl::init(false),
- cl::desc("Sink common instructions (default = false)"));
-
-
-STATISTIC(NumSimpl, "Number of blocks simplified");
-
-/// If we have more than one empty (other than phi node) return block,
-/// merge them together to promote recursive block merging.
+static cl::opt<bool> UserSinkCommonInsts(
+ "sink-common-insts", cl::Hidden, cl::init(false),
+ cl::desc("Sink common instructions (default = false)"));
+
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+/// If we have more than one empty (other than phi node) return block,
+/// merge them together to promote recursive block merging.
static bool mergeEmptyReturnBlocks(Function &F, DomTreeUpdater *DTU) {
- bool Changed = false;
-
+ bool Changed = false;
+
std::vector<DominatorTree::UpdateType> Updates;
SmallVector<BasicBlock *, 8> DeadBlocks;
- BasicBlock *RetBlock = nullptr;
-
- // Scan all the blocks in the function, looking for empty return blocks.
+ BasicBlock *RetBlock = nullptr;
+
+ // Scan all the blocks in the function, looking for empty return blocks.
for (BasicBlock &BB : make_early_inc_range(F)) {
if (DTU && DTU->isBBPendingDeletion(&BB))
continue;
-
- // Only look at return blocks.
- ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!Ret) continue;
-
- // Only look at the block if it is empty or the only other thing in it is a
- // single PHI node that is the operand to the return.
- if (Ret != &BB.front()) {
- // Check for something else in the block.
- BasicBlock::iterator I(Ret);
- --I;
- // Skip over debug info.
- while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
- --I;
- if (!isa<DbgInfoIntrinsic>(I) &&
- (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
- Ret->getOperand(0) != &*I))
- continue;
- }
-
- // If this is the first returning block, remember it and keep going.
- if (!RetBlock) {
- RetBlock = &BB;
- continue;
- }
-
- // Skip merging if this would result in a CallBr instruction with a
- // duplicate destination. FIXME: See note in CodeGenPrepare.cpp.
- bool SkipCallBr = false;
- for (pred_iterator PI = pred_begin(&BB), E = pred_end(&BB);
- PI != E && !SkipCallBr; ++PI) {
- if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
- for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
- if (RetBlock == CBI->getSuccessor(i)) {
- SkipCallBr = true;
- break;
- }
- }
- if (SkipCallBr)
- continue;
-
- // Otherwise, we found a duplicate return block. Merge the two.
- Changed = true;
-
- // The case when there is no input to the return or when the returned values
- // agree is trivial. Note that they can't agree if there are phis in the
- // blocks.
- if (Ret->getNumOperands() == 0 ||
- Ret->getOperand(0) ==
- cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+
+ // Only look at return blocks.
+ ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!Ret) continue;
+
+ // Only look at the block if it is empty or the only other thing in it is a
+ // single PHI node that is the operand to the return.
+ if (Ret != &BB.front()) {
+ // Check for something else in the block.
+ BasicBlock::iterator I(Ret);
+ --I;
+ // Skip over debug info.
+ while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+ --I;
+ if (!isa<DbgInfoIntrinsic>(I) &&
+ (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) != &*I))
+ continue;
+ }
+
+ // If this is the first returning block, remember it and keep going.
+ if (!RetBlock) {
+ RetBlock = &BB;
+ continue;
+ }
+
+ // Skip merging if this would result in a CallBr instruction with a
+ // duplicate destination. FIXME: See note in CodeGenPrepare.cpp.
+ bool SkipCallBr = false;
+ for (pred_iterator PI = pred_begin(&BB), E = pred_end(&BB);
+ PI != E && !SkipCallBr; ++PI) {
+ if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
+ for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
+ if (RetBlock == CBI->getSuccessor(i)) {
+ SkipCallBr = true;
+ break;
+ }
+ }
+ if (SkipCallBr)
+ continue;
+
+ // Otherwise, we found a duplicate return block. Merge the two.
+ Changed = true;
+
+ // The case when there is no input to the return or when the returned values
+ // agree is trivial. Note that they can't agree if there are phis in the
+ // blocks.
+ if (Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) ==
+ cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
// All predecessors of BB should now branch to RetBlock instead.
if (DTU) {
for (auto *Predecessor : predecessors(&BB)) {
@@ -150,35 +150,35 @@ static bool mergeEmptyReturnBlocks(Function &F, DomTreeUpdater *DTU) {
Updates.push_back({DominatorTree::Delete, Predecessor, &BB});
}
}
- BB.replaceAllUsesWith(RetBlock);
+ BB.replaceAllUsesWith(RetBlock);
DeadBlocks.emplace_back(&BB);
- continue;
- }
-
- // If the canonical return block has no PHI node, create one now.
- PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
- if (!RetBlockPHI) {
- Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
- pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
- RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
- std::distance(PB, PE), "merge",
- &RetBlock->front());
-
- for (pred_iterator PI = PB; PI != PE; ++PI)
- RetBlockPHI->addIncoming(InVal, *PI);
- RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
- }
-
- // Turn BB into a block that just unconditionally branches to the return
- // block. This handles the case when the two return blocks have a common
- // predecessor but return different things.
- RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
- BB.getTerminator()->eraseFromParent();
- BranchInst::Create(RetBlock, &BB);
+ continue;
+ }
+
+ // If the canonical return block has no PHI node, create one now.
+ PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+ if (!RetBlockPHI) {
+ Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+ pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+ RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+ std::distance(PB, PE), "merge",
+ &RetBlock->front());
+
+ for (pred_iterator PI = PB; PI != PE; ++PI)
+ RetBlockPHI->addIncoming(InVal, *PI);
+ RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+ }
+
+ // Turn BB into a block that just unconditionally branches to the return
+ // block. This handles the case when the two return blocks have a common
+ // predecessor but return different things.
+ RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+ BB.getTerminator()->eraseFromParent();
+ BranchInst::Create(RetBlock, &BB);
if (DTU)
Updates.push_back({DominatorTree::Insert, &BB, RetBlock});
- }
-
+ }
+
if (DTU) {
DTU->applyUpdates(Updates);
for (auto *BB : DeadBlocks)
@@ -188,31 +188,31 @@ static bool mergeEmptyReturnBlocks(Function &F, DomTreeUpdater *DTU) {
BB->eraseFromParent();
}
- return Changed;
-}
-
-/// Call SimplifyCFG on all the blocks in the function,
-/// iterating until no more changes are made.
-static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+ return Changed;
+}
+
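A hedged source-level picture of the transformation implemented above; at the IR level both returns are funneled through one canonical return block with a "merge" PHI, with no change in observable behavior.

static int twoReturnBlocks(bool c, int a, int b) {
  if (c)
    return a; // return block #1
  return b;   // return block #2
}
// After merging, the IR for this function has a single return block that
// returns phi(a, b) fed from the two predecessors.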
+/// Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
DomTreeUpdater *DTU,
- const SimplifyCFGOptions &Options) {
- bool Changed = false;
- bool LocalChange = true;
-
- SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
- FindFunctionBackedges(F, Edges);
+ const SimplifyCFGOptions &Options) {
+ bool Changed = false;
+ bool LocalChange = true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
SmallPtrSet<BasicBlock *, 16> UniqueLoopHeaders;
- for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+ for (unsigned i = 0, e = Edges.size(); i != e; ++i)
UniqueLoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second));
-
+
SmallVector<WeakVH, 16> LoopHeaders(UniqueLoopHeaders.begin(),
UniqueLoopHeaders.end());
- while (LocalChange) {
- LocalChange = false;
-
- // Loop over all of the basic blocks and remove them if they are unneeded.
- for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and remove them if they are unneeded.
+ for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
BasicBlock &BB = *BBIt++;
if (DTU) {
assert(
@@ -224,43 +224,43 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
++BBIt;
}
if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders)) {
- LocalChange = true;
- ++NumSimpl;
- }
- }
- Changed |= LocalChange;
- }
- return Changed;
-}
-
+ LocalChange = true;
+ ++NumSimpl;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI,
DominatorTree *DT,
const SimplifyCFGOptions &Options) {
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-
+
bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr);
EverChanged |= mergeEmptyReturnBlocks(F, DT ? &DTU : nullptr);
EverChanged |= iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options);
- // If neither pass changed anything, we're done.
- if (!EverChanged) return false;
-
- // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
- // removeUnreachableBlocks is needed to nuke them, which means we should
- // iterate between the two optimizations. We structure the code like this to
- // avoid rerunning iterativelySimplifyCFG if the second pass of
- // removeUnreachableBlocks doesn't do anything.
+ // If neither pass changed anything, we're done.
+ if (!EverChanged) return false;
+
+ // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
+ // removeUnreachableBlocks is needed to nuke them, which means we should
+ // iterate between the two optimizations. We structure the code like this to
+ // avoid rerunning iterativelySimplifyCFG if the second pass of
+ // removeUnreachableBlocks doesn't do anything.
if (!removeUnreachableBlocks(F, DT ? &DTU : nullptr))
- return true;
-
- do {
+ return true;
+
+ do {
EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options);
EverChanged |= removeUnreachableBlocks(F, DT ? &DTU : nullptr);
- } while (EverChanged);
-
- return true;
-}
-
+ } while (EverChanged);
+
+ return true;
+}
+
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
DominatorTree *DT,
const SimplifyCFGOptions &Options) {
@@ -277,7 +277,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return Changed;
}
-// Command-line settings override compile-time settings.
+// Command-line settings override compile-time settings.
static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
if (UserBonusInstThreshold.getNumOccurrences())
Options.BonusInstThreshold = UserBonusInstThreshold;
@@ -291,8 +291,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
Options.HoistCommonInsts = UserHoistCommonInsts;
if (UserSinkCommonInsts.getNumOccurrences())
Options.SinkCommonInsts = UserSinkCommonInsts;
-}
-
+}
+
SimplifyCFGPass::SimplifyCFGPass() : Options() {
applyCommandLineOverridesToOptions(Options);
}
@@ -302,10 +302,10 @@ SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts)
applyCommandLineOverridesToOptions(Options);
}
-PreservedAnalyses SimplifyCFGPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- Options.AC = &AM.getResult<AssumptionAnalysis>(F);
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ Options.AC = &AM.getResult<AssumptionAnalysis>(F);
DominatorTree *DT = nullptr;
if (RequireAndPreserveDomTree)
DT = &AM.getResult<DominatorTreeAnalysis>(F);
@@ -315,73 +315,73 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
Options.setSimplifyCondBranch(true).setFoldTwoEntryPHINode(true);
}
if (!simplifyFunctionCFG(F, TTI, DT, Options))
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
if (RequireAndPreserveDomTree)
PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID;
- SimplifyCFGOptions Options;
- std::function<bool(const Function &)> PredicateFtor;
-
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct CFGSimplifyPass : public FunctionPass {
+ static char ID;
+ SimplifyCFGOptions Options;
+ std::function<bool(const Function &)> PredicateFtor;
+
CFGSimplifyPass(SimplifyCFGOptions Options_ = SimplifyCFGOptions(),
- std::function<bool(const Function &)> Ftor = nullptr)
+ std::function<bool(const Function &)> Ftor = nullptr)
: FunctionPass(ID), Options(Options_), PredicateFtor(std::move(Ftor)) {
-
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
-
- // Check for command-line overrides of options for debug/customization.
+
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+
+ // Check for command-line overrides of options for debug/customization.
applyCommandLineOverridesToOptions(Options);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
- return false;
-
- Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
+ return false;
+
+ Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DominatorTree *DT = nullptr;
if (RequireAndPreserveDomTree)
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- if (F.hasFnAttribute(Attribute::OptForFuzzing)) {
- Options.setSimplifyCondBranch(false)
- .setFoldTwoEntryPHINode(false);
- } else {
- Options.setSimplifyCondBranch(true)
- .setFoldTwoEntryPHINode(true);
- }
-
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ if (F.hasFnAttribute(Attribute::OptForFuzzing)) {
+ Options.setSimplifyCondBranch(false)
+ .setFoldTwoEntryPHINode(false);
+ } else {
+ Options.setSimplifyCondBranch(true)
+ .setFoldTwoEntryPHINode(true);
+ }
+
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return simplifyFunctionCFG(F, TTI, DT, Options);
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
if (RequireAndPreserveDomTree)
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
if (RequireAndPreserveDomTree)
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-}
-
-char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-
-// Public interface to the CFGSimplification pass
-FunctionPass *
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *
llvm::createCFGSimplificationPass(SimplifyCFGOptions Options,
- std::function<bool(const Function &)> Ftor) {
+ std::function<bool(const Function &)> Ftor) {
return new CFGSimplifyPass(Options, std::move(Ftor));
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp
index 152614695d..89cfbe384b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp
@@ -1,135 +1,135 @@
-//===-- Sink.cpp - Code Sinking -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass moves instructions into successor blocks, when possible, so that
-// they aren't executed on paths where their results aren't needed.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/Sink.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "sink"
-
-STATISTIC(NumSunk, "Number of instructions sunk");
-STATISTIC(NumSinkIter, "Number of sinking iterations");
-
-static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &Stores) {
-
- if (Inst->mayWriteToMemory()) {
- Stores.insert(Inst);
- return false;
- }
-
- if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
- MemoryLocation Loc = MemoryLocation::get(L);
- for (Instruction *S : Stores)
- if (isModSet(AA.getModRefInfo(S, Loc)))
- return false;
- }
-
- if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() ||
- Inst->mayThrow())
- return false;
-
- if (auto *Call = dyn_cast<CallBase>(Inst)) {
- // Convergent operations cannot be made control-dependent on additional
- // values.
- if (Call->isConvergent())
- return false;
-
- for (Instruction *S : Stores)
- if (isModSet(AA.getModRefInfo(S, Call)))
- return false;
- }
-
- return true;
-}
-
-/// IsAcceptableTarget - Return true if it is possible to sink the instruction
-/// in the specified basic block.
-static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
- DominatorTree &DT, LoopInfo &LI) {
- assert(Inst && "Instruction to be sunk is null");
- assert(SuccToSinkTo && "Candidate sink target is null");
-
- // It's never legal to sink an instruction into a block which terminates in an
- // EH-pad.
- if (SuccToSinkTo->getTerminator()->isExceptionalTerminator())
- return false;
-
- // If the block has multiple predecessors, this would introduce computation
- // on different code paths. We could split the critical edge, but for now we
- // just punt.
- // FIXME: Split critical edges if not backedges.
- if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
- // We cannot sink a load across a critical edge - there may be stores in
- // other code paths.
- if (Inst->mayReadFromMemory())
- return false;
-
- // We don't want to sink across a critical edge if we don't dominate the
- // successor. We could be introducing calculations to new code paths.
- if (!DT.dominates(Inst->getParent(), SuccToSinkTo))
- return false;
-
- // Don't sink instructions into a loop.
- Loop *succ = LI.getLoopFor(SuccToSinkTo);
- Loop *cur = LI.getLoopFor(Inst->getParent());
- if (succ != nullptr && succ != cur)
- return false;
- }
-
+//===-- Sink.cpp - Code Sinking -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks, when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Sink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sink"
+
+STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
+
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
+
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
+
+ if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
+ MemoryLocation Loc = MemoryLocation::get(L);
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Loc)))
+ return false;
+ }
+
+ if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() ||
+ Inst->mayThrow())
+ return false;
+
+ if (auto *Call = dyn_cast<CallBase>(Inst)) {
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
+ if (Call->isConvergent())
+ return false;
+
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Call)))
+ return false;
+ }
+
+ return true;
+}
+
+/// IsAcceptableTarget - Return true if it is possible to sink the instruction
+/// in the specified basic block.
+static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
+ DominatorTree &DT, LoopInfo &LI) {
+ assert(Inst && "Instruction to be sunk is null");
+ assert(SuccToSinkTo && "Candidate sink target is null");
+
+ // It's never legal to sink an instruction into a block which terminates in an
+ // EH-pad.
+ if (SuccToSinkTo->getTerminator()->isExceptionalTerminator())
+ return false;
+
+ // If the block has multiple predecessors, this would introduce computation
+ // on different code paths. We could split the critical edge, but for now we
+ // just punt.
+ // FIXME: Split critical edges if not backedges.
+ if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
+ // We cannot sink a load across a critical edge - there may be stores in
+ // other code paths.
+ if (Inst->mayReadFromMemory())
+ return false;
+
+ // We don't want to sink across a critical edge if we don't dominate the
+ // successor. We could be introducing calculations to new code paths.
+ if (!DT.dominates(Inst->getParent(), SuccToSinkTo))
+ return false;
+
+ // Don't sink instructions into a loop.
+ Loop *succ = LI.getLoopFor(SuccToSinkTo);
+ Loop *cur = LI.getLoopFor(Inst->getParent());
+ if (succ != nullptr && succ != cur)
+ return false;
+ }
+
return true;
-}
-
-/// SinkInstruction - Determine whether it is safe to sink the specified
-/// instruction out of its current block into a successor.
-static bool SinkInstruction(Instruction *Inst,
- SmallPtrSetImpl<Instruction *> &Stores,
- DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
-
- // Don't sink static alloca instructions. CodeGen assumes allocas outside the
- // entry block are dynamically sized stack objects.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
- if (AI->isStaticAlloca())
- return false;
-
- // Check if it's safe to move the instruction.
- if (!isSafeToMove(Inst, AA, Stores))
- return false;
-
- // FIXME: This should include support for sinking instructions within the
- // block they are currently in to shorten the live ranges. We often get
- // instructions sunk into the top of a large block, but it would be better to
- // also sink them down before their first use in the block. This xform has to
- // be careful not to *increase* register pressure though, e.g. sinking
- // "x = y + z" down if it kills y and z would increase the live ranges of y
- // and z and only shrink the live range of x.
-
- // SuccToSinkTo - This is the successor to sink this instruction to, once we
- // decide.
- BasicBlock *SuccToSinkTo = nullptr;
-
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified
+/// instruction out of its current block into a successor.
+static bool SinkInstruction(Instruction *Inst,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
+
+ // Don't sink static alloca instructions. CodeGen assumes allocas outside the
+ // entry block are dynamically sized stack objects.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+ if (AI->isStaticAlloca())
+ return false;
+
+ // Check if it's safe to move the instruction.
+ if (!isSafeToMove(Inst, AA, Stores))
+ return false;
+
+ // FIXME: This should include support for sinking instructions within the
+ // block they are currently in to shorten the live ranges. We often get
+ // instructions sunk into the top of a large block, but it would be better to
+ // also sink them down before their first use in the block. This xform has to
+ // be careful not to *increase* register pressure though, e.g. sinking
+ // "x = y + z" down if it kills y and z would increase the live ranges of y
+ // and z and only shrink the live range of x.
+
+ // SuccToSinkTo - This is the successor to sink this instruction to, once we
+ // decide.
+ BasicBlock *SuccToSinkTo = nullptr;
+
// Find the nearest common dominator of all users as the candidate.
BasicBlock *BB = Inst->getParent();
for (Use &U : Inst->uses()) {
@@ -151,8 +151,8 @@ static bool SinkInstruction(Instruction *Inst,
// The current basic block needs to dominate the candidate.
if (!DT.dominates(BB, SuccToSinkTo))
return false;
- }
-
+ }
+
if (SuccToSinkTo) {
// The nearest common dominator may be in a parent loop of BB, which may not
// be beneficial. Find an ancestor.
@@ -161,124 +161,124 @@ static bool SinkInstruction(Instruction *Inst,
SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
if (SuccToSinkTo == BB)
SuccToSinkTo = nullptr;
- }
-
- // If we couldn't find a block to sink to, ignore this instruction.
- if (!SuccToSinkTo)
- return false;
-
- LLVM_DEBUG(dbgs() << "Sink" << *Inst << " (";
- Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> ";
- SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n");
-
- // Move the instruction.
- Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
- return true;
-}
-
-static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
- AAResults &AA) {
- // Can't sink anything out of a block that has less than two successors.
- if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
-
- // Don't bother sinking code out of unreachable blocks. In addition to being
- // unprofitable, it can also lead to infinite looping, because in an
- // unreachable loop there may be nowhere to stop.
- if (!DT.isReachableFromEntry(&BB)) return false;
-
- bool MadeChange = false;
-
- // Walk the basic block bottom-up. Remember if we saw a store.
- BasicBlock::iterator I = BB.end();
- --I;
- bool ProcessedBegin = false;
- SmallPtrSet<Instruction *, 8> Stores;
- do {
- Instruction *Inst = &*I; // The instruction to sink.
-
- // Predecrement I (if it's not begin) so that it isn't invalidated by
- // sinking.
- ProcessedBegin = I == BB.begin();
- if (!ProcessedBegin)
- --I;
-
- if (isa<DbgInfoIntrinsic>(Inst))
- continue;
-
- if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
- ++NumSunk;
- MadeChange = true;
- }
-
- // If we just processed the first instruction in the block, we're done.
- } while (!ProcessedBegin);
-
- return MadeChange;
-}
-
-static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
- LoopInfo &LI, AAResults &AA) {
- bool MadeChange, EverMadeChange = false;
-
- do {
- MadeChange = false;
- LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
- // Process all basic blocks.
- for (BasicBlock &I : F)
- MadeChange |= ProcessBlock(I, DT, LI, AA);
- EverMadeChange |= MadeChange;
- NumSinkIter++;
- } while (MadeChange);
-
- return EverMadeChange;
-}
-
-PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
-
- if (!iterativelySinkInstructions(F, DT, LI, AA))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
- class SinkingLegacyPass : public FunctionPass {
- public:
- static char ID; // Pass identification
- SinkingLegacyPass() : FunctionPass(ID) {
- initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
-
- return iterativelySinkInstructions(F, DT, LI, AA);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- }
- };
-} // end anonymous namespace
-
-char SinkingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
-
-FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); }
+ }
+
+ // If we couldn't find a block to sink to, ignore this instruction.
+ if (!SuccToSinkTo)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Sink" << *Inst << " (";
+ Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> ";
+ SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n");
+
+ // Move the instruction.
+ Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
+ return true;
+}
+
+static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
+ AAResults &AA) {
+ // Can't sink anything out of a block that has less than two successors.
+ if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
+ if (!DT.isReachableFromEntry(&BB)) return false;
+
+ bool MadeChange = false;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ BasicBlock::iterator I = BB.end();
+ --I;
+ bool ProcessedBegin = false;
+ SmallPtrSet<Instruction *, 8> Stores;
+ do {
+ Instruction *Inst = &*I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == BB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+
+ if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
+static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
+ LoopInfo &LI, AAResults &AA) {
+ bool MadeChange, EverMadeChange = false;
+
+ do {
+ MadeChange = false;
+ LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ // Process all basic blocks.
+ for (BasicBlock &I : F)
+ MadeChange |= ProcessBlock(I, DT, LI, AA);
+ EverMadeChange |= MadeChange;
+ NumSinkIter++;
+ } while (MadeChange);
+
+ return EverMadeChange;
+}
+
+PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
+ if (!iterativelySinkInstructions(F, DT, LI, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+ class SinkingLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification
+ SinkingLegacyPass() : FunctionPass(ID) {
+ initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ return iterativelySinkInstructions(F, DT, LI, AA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+ };
+} // end anonymous namespace
+
+char SinkingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
+
+FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); }
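
As a usage note for the two pass interfaces restored above (the new-PM SinkingPass and the legacy SinkingLegacyPass), here is a minimal, hypothetical sketch of driving SinkingPass through the new pass manager. The helper name and the PassBuilder boilerplate are assumptions for illustration; SinkingPass::run itself pulls DominatorTree, LoopInfo, and alias analysis results from the FunctionAnalysisManager exactly as shown in the diff.

// Illustrative sketch (not part of this commit): run the sinking pass
// defined above on every function in a module via the new pass manager.
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/Sink.h"

using namespace llvm;

static void runSinkOnModule(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the standard analyses (DominatorTreeAnalysis, LoopAnalysis,
  // AAManager, ...) that SinkingPass::run requests from the FAM.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(SinkingPass());
  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}

For the legacy pipeline, createSinkingPass() (last line of the diff above) returns the SinkingLegacyPass wrapper, which the INITIALIZE_PASS macros register under the name "sink".
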
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index f5529f9e4f..9b18c945d9 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -1,830 +1,830 @@
-//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "spec-phis"
-
-STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
-STATISTIC(NumEdgesSplit,
- "Number of critical edges which were split for speculation");
-STATISTIC(NumSpeculatedInstructions,
- "Number of instructions we speculated around the PHI nodes");
-STATISTIC(NumNewRedundantInstructions,
- "Number of new, redundant instructions inserted");
-
-/// Check whether speculating the users of a PHI node around the PHI
-/// will be safe.
-///
-/// This checks both that all of the users are safe and also that all of their
-/// operands are either recursively safe or already available along an incoming
-/// edge to the PHI.
-///
-/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
-/// and the chain of nodes that definitively reach any unsafe node in
-/// `UnsafeSet`. By preserving these between repeated calls to this routine for
-/// PHIs in the same basic block, the exploration here can be reused. However,
-/// these caches must not be reused for PHIs in a different basic block as they
-/// reflect what is available along incoming edges.
-static bool
-isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
- SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- SmallPtrSetImpl<Instruction *> &UnsafeSet) {
- auto *PhiBB = PN.getParent();
- SmallPtrSet<Instruction *, 4> Visited;
- SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
-
- // Walk each user of the PHI node.
- for (Use &U : PN.uses()) {
- auto *UI = cast<Instruction>(U.getUser());
-
- // Ensure the use post-dominates the PHI node. This ensures that, in the
- // absence of unwinding, the use will actually be reached.
- // FIXME: We use a blunt hammer of requiring them to be in the same basic
- // block. We should consider using actual post-dominance here in the
- // future.
- if (UI->getParent() != PhiBB) {
- LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
- return false;
- }
-
- if (const auto *CS = dyn_cast<CallBase>(UI)) {
- if (CS->isConvergent() || CS->cannotDuplicate()) {
- LLVM_DEBUG(dbgs() << " Unsafe: convergent "
- "callsite cannot be duplicated: " << *UI << '\n');
- return false;
- }
- }
-
- // FIXME: This check is much too conservative. We're not going to move these
- // instructions onto new dynamic paths through the program unless there is
- // a call instruction between the use and the PHI node. And memory isn't
- // changing unless there is a store in that same sequence. We should
- // probably change this to do at least a limited scan of the intervening
- // instructions and allow handling stores in easily proven safe cases.
- if (mayBeMemoryDependent(*UI)) {
- LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
- return false;
- }
-
- // Now do a depth-first search of everything these users depend on to make
- // sure they are transitively safe. This is a depth-first search, but we
- // check nodes in preorder to minimize the amount of checking.
- Visited.insert(UI);
- DFSStack.push_back({UI, UI->value_op_begin()});
- do {
- User::value_op_iterator OpIt;
- std::tie(UI, OpIt) = DFSStack.pop_back_val();
-
- while (OpIt != UI->value_op_end()) {
- auto *OpI = dyn_cast<Instruction>(*OpIt);
- // Increment to the next operand for whenever we continue.
- ++OpIt;
- // No need to visit non-instructions, which can't form dependencies.
- if (!OpI)
- continue;
-
- // Now do the main pre-order checks that this operand is a viable
- // dependency of something we want to speculate.
-
- // First do a few checks for instructions that won't require
- // speculation at all because they are trivially available on the
- // incoming edge (either through dominance or through an incoming value
- // to a PHI).
- //
- // The cases in the current block will be trivially dominated by the
- // edge.
- auto *ParentBB = OpI->getParent();
- if (ParentBB == PhiBB) {
- if (isa<PHINode>(OpI)) {
- // We can trivially map through phi nodes in the same block.
- continue;
- }
- } else if (DT.dominates(ParentBB, PhiBB)) {
- // Instructions from dominating blocks are already available.
- continue;
- }
-
- // Once we know that we're considering speculating the operand, check
- // if we've already explored this subgraph and found it to be safe.
- if (PotentialSpecSet.count(OpI))
- continue;
-
- // If we've already explored this subgraph and found it unsafe, bail.
- // If when we directly test whether this is safe it fails, bail.
- if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
- mayBeMemoryDependent(*OpI)) {
- LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
- << *OpI << "\n");
- // Record the stack of instructions which reach this node as unsafe
- // so we prune subsequent searches.
- UnsafeSet.insert(OpI);
- for (auto &StackPair : DFSStack) {
- Instruction *I = StackPair.first;
- UnsafeSet.insert(I);
- }
- return false;
- }
-
- // Skip any operands we're already recursively checking.
- if (!Visited.insert(OpI).second)
- continue;
-
- // Push onto the stack and descend. We can directly continue this
- // loop when ascending.
- DFSStack.push_back({UI, OpIt});
- UI = OpI;
- OpIt = OpI->value_op_begin();
- }
-
- // This node and all its operands are safe. Go ahead and cache that for
- // reuse later.
- PotentialSpecSet.insert(UI);
-
- // Continue with the next node on the stack.
- } while (!DFSStack.empty());
- }
-
-#ifndef NDEBUG
- // Every visited operand should have been marked as safe for speculation at
- // this point. Verify this and return success.
- for (auto *I : Visited)
- assert(PotentialSpecSet.count(I) &&
- "Failed to mark a visited instruction as safe!");
-#endif
- return true;
-}
-
-/// Check whether, in isolation, a given PHI node is both safe and profitable
-/// to speculate users around.
-///
-/// This handles checking whether there are any constant operands to a PHI
-/// which could represent a useful speculation candidate, whether the users of
-/// the PHI are safe to speculate including all their transitive dependencies,
-/// and whether after speculation there will be some cost savings (profit) to
-/// folding the operands into the users of the PHI node. Returns true if both
-/// safe and profitable with relevant cost savings updated in the map and with
-/// an update to the `PotentialSpecSet`. Returns false if either safety or
-/// profitability are absent. Some new entries may be made to the
-/// `PotentialSpecSet` even when this routine returns false, but they remain
-/// conservatively correct.
-///
-/// The profitability check here is a local one, but it checks this in an
-/// interesting way. Beyond checking that the total cost of materializing the
-/// constants will be less than the cost of folding them into their users, it
-/// also checks that no one incoming constant will have a higher cost when
-/// folded into its users rather than materialized. This higher cost could
-/// result in a dynamic *path* that is more expensive even when the total cost
-/// is lower. Currently, all of the interesting cases where this optimization
-/// should fire are ones where it is a no-loss operation in this sense. If we
-/// ever want to be more aggressive here, we would need to balance the
-/// different incoming edges' cost by looking at their respective
-/// probabilities.
-static bool isSafeAndProfitableToSpeculateAroundPHI(
- PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
- SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
- TargetTransformInfo &TTI) {
- // First see whether there is any cost savings to speculating around this
- // PHI, and build up a map of the constant inputs to how many times they
- // occur.
- bool NonFreeMat = false;
- struct CostsAndCount {
- int MatCost = TargetTransformInfo::TCC_Free;
- int FoldedCost = TargetTransformInfo::TCC_Free;
- int Count = 0;
- };
- SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
- SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
- for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
- auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
- if (!IncomingC)
- continue;
-
- // Only visit each incoming edge with a constant input once.
- if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
- continue;
-
- auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
- // Count how many edges share a given incoming constant.
- ++InsertResult.first->second.Count;
- // Only compute the cost the first time we see a particular constant.
- if (!InsertResult.second)
- continue;
-
- int &MatCost = InsertResult.first->second.MatCost;
- MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
- NonFreeMat |= MatCost != TTI.TCC_Free;
- }
- if (!NonFreeMat) {
- LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
- // No profit in free materialization.
- return false;
- }
-
- // Now check that the uses of this PHI can actually be speculated,
- // otherwise we'll still have to materialize the PHI value.
- if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
- LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
- return false;
- }
-
- // Compute how much (if any) savings are available by speculating around this
- // PHI.
- for (Use &U : PN.uses()) {
- auto *UserI = cast<Instruction>(U.getUser());
- // Now check whether there is any savings to folding the incoming constants
- // into this use.
- unsigned Idx = U.getOperandNo();
-
- // If we have a binary operator that is commutative, an actual constant
- // operand would end up on the RHS, so pretend the use of the PHI is on the
- // RHS.
- //
- // Technically, this is a bit weird if *both* operands are PHIs we're
- // speculating. But if that is the case, giving an "optimistic" cost isn't
- // a bad thing because after speculation it will constant fold. And
- // moreover, such cases should likely have been constant folded already by
- // some other pass, so we shouldn't worry about "modeling" them terribly
- // accurately here. Similarly, if the other operand is a constant, it still
- // seems fine to be "optimistic" in our cost modeling, because when the
- // incoming operand from the PHI node is also a constant, we will end up
- // constant folding.
- if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
- // Assume we will commute the constant to the RHS to be canonical.
- Idx = 1;
-
- // Get the intrinsic ID if this user is an intrinsic.
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
- if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
- IID = UserII->getIntrinsicID();
-
- for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
- ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
- int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
- int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
- if (IID)
- FoldedCost +=
- TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
- IncomingC->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
- else
- FoldedCost +=
- TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
- IncomingC->getValue(), IncomingC->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
-
- // If we accumulate more folded cost for this incoming constant than
- // materialized cost, then we'll regress any edge with this constant so
- // just bail. We're only interested in cases where folding the incoming
- // constants is at least break-even on all paths.
- if (FoldedCost > MatCost) {
- LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
- << "\n"
- " Materializing cost: "
- << MatCost
- << "\n"
- " Accumulated folded cost: "
- << FoldedCost << "\n");
- return false;
- }
- }
- }
-
- // Compute the total cost savings afforded by this PHI node.
- int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
- for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
- int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
- int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
- int Count = IncomingConstantAndCostsAndCount.second.Count;
-
- TotalMatCost += MatCost * Count;
- TotalFoldedCost += FoldedCost * Count;
- }
- assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
- "less than its materialized cost, "
- "the sum must be as well.");
-
- LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
- << ": " << PN << "\n");
- CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
- return true;
-}
-
-/// Simple helper to walk all the users of a list of phis depth first, and call
-/// a visit function on each one in post-order.
-///
-/// All of the PHIs should be in the same basic block, and this is primarily
-/// used to make a single depth-first walk across their collective users
-/// without revisiting any subgraphs. Callers should provide a fast, idempotent
-/// callable to test whether a node has been visited and the more important
-/// callable to actually visit a particular node.
-///
-/// Depth-first and postorder here refer to the *operand* graph -- we start
-/// from a collection of users of PHI nodes and walk "up" the operands
-/// depth-first.
-template <typename IsVisitedT, typename VisitT>
-static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
- IsVisitedT IsVisited,
- VisitT Visit) {
- SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
- for (auto *PN : PNs)
- for (Use &U : PN->uses()) {
- auto *UI = cast<Instruction>(U.getUser());
- if (IsVisited(UI))
- // Already visited this user, continue across the roots.
- continue;
-
- // Otherwise, walk the operand graph depth-first and visit each
- // dependency in postorder.
- DFSStack.push_back({UI, UI->value_op_begin()});
- do {
- User::value_op_iterator OpIt;
- std::tie(UI, OpIt) = DFSStack.pop_back_val();
- while (OpIt != UI->value_op_end()) {
- auto *OpI = dyn_cast<Instruction>(*OpIt);
- // Increment to the next operand for whenever we continue.
- ++OpIt;
- // No need to visit non-instructions, which can't form dependencies,
- // or instructions outside of our potential dependency set that we
- // were given. Finally, if we've already visited the node, continue
- // to the next.
- if (!OpI || IsVisited(OpI))
- continue;
-
- // Push onto the stack and descend. We can directly continue this
- // loop when ascending.
- DFSStack.push_back({UI, OpIt});
- UI = OpI;
- OpIt = OpI->value_op_begin();
- }
-
- // Finished visiting children, visit this node.
- assert(!IsVisited(UI) && "Should not have already visited a node!");
- Visit(UI);
- } while (!DFSStack.empty());
- }
-}
-
-/// Find profitable PHIs to speculate.
-///
-/// For a PHI node to be profitable, we need the cost of speculating its users
-/// (and their dependencies) to not exceed the savings of folding the PHI's
-/// constant operands into the speculated users.
-///
-/// Computing this is surprisingly challenging. Because users of two different
-/// PHI nodes can depend on each other or on common other instructions, it may
-/// be profitable to speculate two PHI nodes together even though neither one
-/// in isolation is profitable. The straightforward way to find all the
-/// profitable PHIs would be to check each combination of PHIs' cost, but this
-/// is exponential in complexity.
-///
-/// Even if we assume that we only care about cases where we can consider each
-/// PHI node in isolation (rather than considering cases where none are
-/// profitable in isolation but some subset are profitable as a set), we still
-/// have a challenge. The obvious way to find all individually profitable PHIs
-/// is to iterate until reaching a fixed point, but this will be quadratic in
-/// complexity. =/
-///
-/// This code currently uses a linear-to-compute order for a greedy approach.
-/// It won't find cases where a set of PHIs must be considered together, but it
-/// handles most cases of order dependence without quadratic iteration. The
-/// specific order used is the post-order across the operand DAG. When the last
-/// user of a PHI is visited in this postorder walk, we check it for
-/// profitability.
-///
-/// There is an orthogonal extra complexity to all of this: computing the cost
-/// itself can easily become a linear computation making everything again (at
-/// best) quadratic. Using a postorder over the operand graph makes it
-/// particularly easy to avoid this through dynamic programming. As we do the
-/// postorder walk, we build the transitive cost of that subgraph. It is also
-/// straightforward to then update these costs when we mark a PHI for
-/// speculation so that subsequent PHIs don't re-pay the cost of already
-/// speculated instructions.
-static SmallVector<PHINode *, 16>
-findProfitablePHIs(ArrayRef<PHINode *> PNs,
- const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
- const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
- SmallVector<PHINode *, 16> SpecPNs;
-
- // First, establish a reverse mapping from immediate users of the PHI nodes
- // to the nodes themselves, and count how many users each PHI node has in
- // a way we can update while processing them.
- SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
- SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
- SmallPtrSet<Instruction *, 16> UserSet;
- for (auto *PN : PNs) {
- assert(UserSet.empty() && "Must start with an empty user set!");
- for (Use &U : PN->uses())
- UserSet.insert(cast<Instruction>(U.getUser()));
- PNUserCountMap[PN] = UserSet.size();
- for (auto *UI : UserSet)
- UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
- UserSet.clear();
- }
-
- // Now do a DFS across the operand graph of the users, computing cost as we
- // go and when all costs for a given PHI are known, checking that PHI for
- // profitability.
- SmallDenseMap<Instruction *, int, 16> SpecCostMap;
- visitPHIUsersAndDepsInPostOrder(
- PNs,
- /*IsVisited*/
- [&](Instruction *I) {
- // We consider anything that isn't potentially speculated to be
- // "visited" as it is already handled. Similarly, anything that *is*
- // potentially speculated but for which we have an entry in our cost
- // map, we're done.
- return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
- },
- /*Visit*/
- [&](Instruction *I) {
- // We've fully visited the operands, so sum their cost with this node
- // and update the cost map.
- int Cost = TTI.TCC_Free;
- for (Value *OpV : I->operand_values())
- if (auto *OpI = dyn_cast<Instruction>(OpV)) {
- auto CostMapIt = SpecCostMap.find(OpI);
- if (CostMapIt != SpecCostMap.end())
- Cost += CostMapIt->second;
- }
- Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
- bool Inserted = SpecCostMap.insert({I, Cost}).second;
- (void)Inserted;
- assert(Inserted && "Must not re-insert a cost during the DFS!");
-
- // Now check if this node had a corresponding PHI node using it. If so,
- // we need to decrement the outstanding user count for it.
- auto UserPNsIt = UserToPNMap.find(I);
- if (UserPNsIt == UserToPNMap.end())
- return;
- auto &UserPNs = UserPNsIt->second;
- auto UserPNsSplitIt = std::stable_partition(
- UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
- int &PNUserCount = PNUserCountMap.find(UserPN)->second;
- assert(
- PNUserCount > 0 &&
- "Should never re-visit a PN after its user count hits zero!");
- --PNUserCount;
- return PNUserCount != 0;
- });
-
- // FIXME: Rather than one at a time, we should sum the savings as the
- // cost will be completely shared.
- SmallVector<Instruction *, 16> SpecWorklist;
- for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
- int SpecCost = TTI.TCC_Free;
- for (Use &U : PN->uses())
- SpecCost +=
- SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
- SpecCost *= (NumPreds - 1);
- // When the user count of a PHI node hits zero, we should check its
- // profitability. If profitable, we should mark it for speculation
- // and zero out the cost of everything it depends on.
- int CostSavings = CostSavingsMap.find(PN)->second;
- if (SpecCost > CostSavings) {
- LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
- << "\n"
- " Cost savings: "
- << CostSavings
- << "\n"
- " Speculation cost: "
- << SpecCost << "\n");
- continue;
- }
-
- // We're going to speculate this user-associated PHI. Copy it out and
- // add its users to the worklist to update their cost.
- SpecPNs.push_back(PN);
- for (Use &U : PN->uses()) {
- auto *UI = cast<Instruction>(U.getUser());
- auto CostMapIt = SpecCostMap.find(UI);
- if (CostMapIt->second == 0)
- continue;
- // Zero out this cost entry to avoid duplicates.
- CostMapIt->second = 0;
- SpecWorklist.push_back(UI);
- }
- }
-
- // Now walk all the operands of the users in the worklist transitively
- // to zero out all the memoized costs.
- while (!SpecWorklist.empty()) {
- Instruction *SpecI = SpecWorklist.pop_back_val();
- assert(SpecCostMap.find(SpecI)->second == 0 &&
- "Didn't zero out a cost!");
-
- // Walk the operands recursively to zero out their cost as well.
- for (auto *OpV : SpecI->operand_values()) {
- auto *OpI = dyn_cast<Instruction>(OpV);
- if (!OpI)
- continue;
- auto CostMapIt = SpecCostMap.find(OpI);
- if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
- continue;
- CostMapIt->second = 0;
- SpecWorklist.push_back(OpI);
- }
- }
- });
-
- return SpecPNs;
-}
-
-/// Speculate users around a set of PHI nodes.
-///
-/// This routine does the actual speculation around a set of PHI nodes where we
-/// have determined this to be both safe and profitable.
-///
-/// This routine handles any splitting of critical edges necessary to create
-/// a safe block to speculate into as well as cloning the instructions and
-/// rewriting all uses.
-static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
- SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- SmallSetVector<BasicBlock *, 16> &PredSet,
- DominatorTree &DT) {
- LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
- NumPHIsSpeculated += SpecPNs.size();
-
- // Split any critical edges so that we have a block to hoist into.
- auto *ParentBB = SpecPNs[0]->getParent();
- SmallVector<BasicBlock *, 16> SpecPreds;
- SpecPreds.reserve(PredSet.size());
- for (auto *PredBB : PredSet) {
- auto *NewPredBB = SplitCriticalEdge(
- PredBB, ParentBB,
- CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
- if (NewPredBB) {
- ++NumEdgesSplit;
- LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
- << "\n");
- SpecPreds.push_back(NewPredBB);
- } else {
- assert(PredBB->getSingleSuccessor() == ParentBB &&
- "We need a non-critical predecessor to speculate into.");
- assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
- "Cannot have a non-critical invoke!");
-
- // Already non-critical, use existing pred.
- SpecPreds.push_back(PredBB);
- }
- }
-
- SmallPtrSet<Instruction *, 16> SpecSet;
- SmallVector<Instruction *, 16> SpecList;
- visitPHIUsersAndDepsInPostOrder(SpecPNs,
- /*IsVisited*/
- [&](Instruction *I) {
- // This is visited if we don't need to
- // speculate it or we already have
- // speculated it.
- return !PotentialSpecSet.count(I) ||
- SpecSet.count(I);
- },
- /*Visit*/
- [&](Instruction *I) {
- // All operands scheduled, schedule this
- // node.
- SpecSet.insert(I);
- SpecList.push_back(I);
- });
-
- int NumSpecInsts = SpecList.size() * SpecPreds.size();
- int NumRedundantInsts = NumSpecInsts - SpecList.size();
- LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
- << " speculated instructions, " << NumRedundantInsts
- << " redundancies\n");
- NumSpeculatedInstructions += NumSpecInsts;
- NumNewRedundantInstructions += NumRedundantInsts;
-
- // Each predecessor is numbered by its index in `SpecPreds`, so for each
- // instruction we speculate, the speculated instruction is stored in that
- // index of the vector associated with the original instruction. We also
- // store the incoming values for each predecessor from any PHIs used.
- SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
-
- // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
- // value. This handles both the PHIs we are speculating around and any other
- // PHIs that happen to be used.
- for (auto *OrigI : SpecList)
- for (auto *OpV : OrigI->operand_values()) {
- auto *OpPN = dyn_cast<PHINode>(OpV);
- if (!OpPN || OpPN->getParent() != ParentBB)
- continue;
-
- auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
- if (!InsertResult.second)
- continue;
-
- auto &SpeculatedVals = InsertResult.first->second;
-
- // Populating our structure for mapping is particularly annoying because
- // finding an incoming value for a particular predecessor block in a PHI
- // node is a linear time operation! To avoid quadratic behavior, we build
- // a map for this PHI node's incoming values and then translate it into
- // the more compact representation used below.
- SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
- for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
- IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
-
- for (auto *PredBB : SpecPreds)
- SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
- }
-
- // Speculate into each predecessor.
- for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
- auto *PredBB = SpecPreds[PredIdx];
- assert(PredBB->getSingleSuccessor() == ParentBB &&
- "We need a non-critical predecessor to speculate into.");
-
- for (auto *OrigI : SpecList) {
- auto *NewI = OrigI->clone();
- NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
- NewI->insertBefore(PredBB->getTerminator());
-
- // Rewrite all the operands to the previously speculated instructions.
- // Because we're walking in-order, the defs must precede the uses and we
- // should already have these mappings.
- for (Use &U : NewI->operands()) {
- auto *OpI = dyn_cast<Instruction>(U.get());
- if (!OpI)
- continue;
- auto MapIt = SpeculatedValueMap.find(OpI);
- if (MapIt == SpeculatedValueMap.end())
- continue;
- const auto &SpeculatedVals = MapIt->second;
- assert(SpeculatedVals[PredIdx] &&
- "Must have a speculated value for this predecessor!");
- assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
- "Speculated value has the wrong type!");
-
- // Rewrite the use to this predecessor's speculated instruction.
- U.set(SpeculatedVals[PredIdx]);
- }
-
- // Commute instructions which now have a constant in the LHS but not the
- // RHS.
- if (NewI->isBinaryOp() && NewI->isCommutative() &&
- isa<Constant>(NewI->getOperand(0)) &&
- !isa<Constant>(NewI->getOperand(1)))
- NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
-
- SpeculatedValueMap[OrigI].push_back(NewI);
- assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
- "Mismatched speculated instruction index!");
- }
- }
-
- // Walk the speculated instruction list and if they have uses, insert a PHI
- // for them from the speculated versions, and replace the uses with the PHI.
- // Then erase the instructions as they have been fully speculated. The walk
- // needs to be in reverse so that we don't think there are users when we'll
- // actually eventually remove them later.
- IRBuilder<> IRB(SpecPNs[0]);
- for (auto *OrigI : llvm::reverse(SpecList)) {
- // Check if we need a PHI for any remaining users and if so, insert it.
- if (!OrigI->use_empty()) {
- auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
- Twine(OrigI->getName()) + ".phi");
- // Add the incoming values we speculated.
- auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
- for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
- SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
-
- // And replace the uses with the PHI node.
- OrigI->replaceAllUsesWith(SpecIPN);
- }
-
- // It is important to immediately erase this so that it stops using other
- // instructions. This avoids inserting needless PHIs of them.
- OrigI->eraseFromParent();
- }
-
- // All of the uses of the speculated phi nodes should be removed at this
- // point, so erase them.
- for (auto *SpecPN : SpecPNs) {
- assert(SpecPN->use_empty() && "All users should have been speculated!");
- SpecPN->eraseFromParent();
- }
-}
-
-/// Try to speculate around a series of PHIs from a single basic block.
-///
-/// This routine checks whether any of these PHIs are profitable to speculate
-/// users around. If safe and profitable, it does the speculation. It returns
-/// true when at least some speculation occurs.
-static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
- DominatorTree &DT, TargetTransformInfo &TTI) {
- LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
-
- // Savings in cost from speculating around a PHI node.
- SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
-
- // Remember the set of instructions that are candidates for speculation so
- // that we can quickly walk things within that space. This prunes out
- // instructions already available along edges, etc.
- SmallPtrSet<Instruction *, 16> PotentialSpecSet;
-
- // Remember the set of instructions that are (transitively) unsafe to
- // speculate into the incoming edges of this basic block. This avoids
- // recomputing them for each PHI node we check. This set is specific to this
- // block though as things are pruned out of it based on what is available
- // along incoming edges.
- SmallPtrSet<Instruction *, 16> UnsafeSet;
-
- // For each PHI node in this block, check whether there are immediate folding
- // opportunities from speculation, and whether that speculation will be
- // valid. This determines the set of safe PHIs to speculate.
+//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spec-phis"
+
+STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
+STATISTIC(NumEdgesSplit,
+ "Number of critical edges which were split for speculation");
+STATISTIC(NumSpeculatedInstructions,
+ "Number of instructions we speculated around the PHI nodes");
+STATISTIC(NumNewRedundantInstructions,
+ "Number of new, redundant instructions inserted");
+
+/// Check whether speculating the users of a PHI node around the PHI
+/// will be safe.
+///
+/// This checks both that all of the users are safe and also that all of their
+/// operands are either recursively safe or already available along an incoming
+/// edge to the PHI.
+///
+/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
+/// and the chain of nodes that definitively reach any unsafe node in
+/// `UnsafeSet`. By preserving these between repeated calls to this routine for
+/// PHIs in the same basic block, the exploration here can be reused. However,
+/// these caches must not be reused for PHIs in a different basic block as they
+/// reflect what is available along incoming edges.
+static bool
+isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet) {
+ auto *PhiBB = PN.getParent();
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+
+ // Walk each user of the PHI node.
+ for (Use &U : PN.uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+
+ // Ensure the use post-dominates the PHI node. This ensures that, in the
+ // absence of unwinding, the use will actually be reached.
+ // FIXME: We use a blunt hammer of requiring them to be in the same basic
+ // block. We should consider using actual post-dominance here in the
+ // future.
+ if (UI->getParent() != PhiBB) {
+ LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
+ return false;
+ }
+
+ if (const auto *CS = dyn_cast<CallBase>(UI)) {
+ if (CS->isConvergent() || CS->cannotDuplicate()) {
+ LLVM_DEBUG(dbgs() << " Unsafe: convergent "
+ "callsite cannot be duplicated: " << *UI << '\n');
+ return false;
+ }
+ }
+
+ // FIXME: This check is much too conservative. We're not going to move these
+ // instructions onto new dynamic paths through the program unless there is
+ // a call instruction between the use and the PHI node. And memory isn't
+ // changing unless there is a store in that same sequence. We should
+ // probably change this to do at least a limited scan of the intervening
+ // instructions and allow handling stores in easily proven safe cases.
+ if (mayBeMemoryDependent(*UI)) {
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
+ return false;
+ }
+
+ // Now do a depth-first search of everything these users depend on to make
+ // sure they are transitively safe. This is a depth-first search, but we
+ // check nodes in preorder to minimize the amount of checking.
+ Visited.insert(UI);
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies.
+ if (!OpI)
+ continue;
+
+ // Now do the main pre-order checks that this operand is a viable
+ // dependency of something we want to speculate.
+
+ // First do a few checks for instructions that won't require
+ // speculation at all because they are trivially available on the
+ // incoming edge (either through dominance or through an incoming value
+ // to a PHI).
+ //
+ // The cases in the current block will be trivially dominated by the
+ // edge.
+ auto *ParentBB = OpI->getParent();
+ if (ParentBB == PhiBB) {
+ if (isa<PHINode>(OpI)) {
+ // We can trivially map through phi nodes in the same block.
+ continue;
+ }
+ } else if (DT.dominates(ParentBB, PhiBB)) {
+ // Instructions from dominating blocks are already available.
+ continue;
+ }
+
+ // Once we know that we're considering speculating the operand, check
+ // if we've already explored this subgraph and found it to be safe.
+ if (PotentialSpecSet.count(OpI))
+ continue;
+
+ // If we've already explored this subgraph and found it unsafe, bail.
+ // If when we directly test whether this is safe it fails, bail.
+ if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
+ mayBeMemoryDependent(*OpI)) {
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
+ << *OpI << "\n");
+ // Record the stack of instructions which reach this node as unsafe
+ // so we prune subsequent searches.
+ UnsafeSet.insert(OpI);
+ for (auto &StackPair : DFSStack) {
+ Instruction *I = StackPair.first;
+ UnsafeSet.insert(I);
+ }
+ return false;
+ }
+
+ // Skip any operands we're already recursively checking.
+ if (!Visited.insert(OpI).second)
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // This node and all its operands are safe. Go ahead and cache that for
+ // reuse later.
+ PotentialSpecSet.insert(UI);
+
+ // Continue with the next node on the stack.
+ } while (!DFSStack.empty());
+ }
+
+#ifndef NDEBUG
+ // Every visited operand should have been marked as safe for speculation at
+ // this point. Verify this and return success.
+ for (auto *I : Visited)
+ assert(PotentialSpecSet.count(I) &&
+ "Failed to mark a visited instruction as safe!");
+#endif
+ return true;
+}
+
+/// Check whether, in isolation, a given PHI node is both safe and profitable
+/// to speculate users around.
+///
+/// This handles checking whether there are any constant operands to a PHI
+/// which could represent a useful speculation candidate, whether the users of
+/// the PHI are safe to speculate including all their transitive dependencies,
+/// and whether after speculation there will be some cost savings (profit) to
+/// folding the operands into the users of the PHI node. Returns true if both
+/// safe and profitable with relevant cost savings updated in the map and with
+/// an update to the `PotentialSpecSet`. Returns false if either safety or
+/// profitability are absent. Some new entries may be made to the
+/// `PotentialSpecSet` even when this routine returns false, but they remain
+/// conservatively correct.
+///
+/// The profitability check here is a local one, but it checks this in an
+/// interesting way. Beyond checking that the total cost of materializing the
+/// constants will be less than the cost of folding them into their users, it
+/// also checks that no one incoming constant will have a higher cost when
+/// folded into its users rather than materialized. This higher cost could
+/// result in a dynamic *path* that is more expensive even when the total cost
+/// is lower. Currently, all of the interesting cases where this optimization
+/// should fire are ones where it is a no-loss operation in this sense. If we
+/// ever want to be more aggressive here, we would need to balance the
+/// different incoming edges' cost by looking at their respective
+/// probabilities.
+static bool isSafeAndProfitableToSpeculateAroundPHI(
+ PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
+ TargetTransformInfo &TTI) {
+ // First see whether there is any cost savings to speculating around this
+ // PHI, and build up a map of the constant inputs to how many times they
+ // occur.
+ bool NonFreeMat = false;
+ struct CostsAndCount {
+ int MatCost = TargetTransformInfo::TCC_Free;
+ int FoldedCost = TargetTransformInfo::TCC_Free;
+ int Count = 0;
+ };
+ SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
+ SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
+ for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
+ auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
+ if (!IncomingC)
+ continue;
+
+ // Only visit each incoming edge with a constant input once.
+ if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
+ continue;
+
+ auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
+ // Count how many edges share a given incoming constant.
+ ++InsertResult.first->second.Count;
+ // Only compute the cost the first time we see a particular constant.
+ if (!InsertResult.second)
+ continue;
+
+ int &MatCost = InsertResult.first->second.MatCost;
+ MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+ NonFreeMat |= MatCost != TTI.TCC_Free;
+ }
+ if (!NonFreeMat) {
+ LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
+ // No profit in free materialization.
+ return false;
+ }
+
+ // Now check that the uses of this PHI can actually be speculated,
+ // otherwise we'll still have to materialize the PHI value.
+ if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
+ LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
+ return false;
+ }
+
+ // Compute how much (if any) savings are available by speculating around this
+ // PHI.
+ for (Use &U : PN.uses()) {
+ auto *UserI = cast<Instruction>(U.getUser());
+ // Now check whether there is any savings to folding the incoming constants
+ // into this use.
+ unsigned Idx = U.getOperandNo();
+
+ // If we have a binary operator that is commutative, an actual constant
+ // operand would end up on the RHS, so pretend the use of the PHI is on the
+ // RHS.
+ //
+ // Technically, this is a bit weird if *both* operands are PHIs we're
+ // speculating. But if that is the case, giving an "optimistic" cost isn't
+ // a bad thing because after speculation it will constant fold. And
+ // moreover, such cases should likely have been constant folded already by
+ // some other pass, so we shouldn't worry about "modeling" them terribly
+ // accurately here. Similarly, if the other operand is a constant, it still
+ // seems fine to be "optimistic" in our cost modeling, because when the
+ // incoming operand from the PHI node is also a constant, we will end up
+ // constant folding.
+ if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
+ // Assume we will commute the constant to the RHS to be canonical.
+ Idx = 1;
+
+ // Get the intrinsic ID if this user is an intrinsic.
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
+ IID = UserII->getIntrinsicID();
+
+ for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ if (IID)
+ FoldedCost +=
+ TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
+ IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+ else
+ FoldedCost +=
+ TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
+ IncomingC->getValue(), IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+
+ // If we accumulate more folded cost for this incoming constant than
+ // materialized cost, then we'll regress any edge with this constant so
+ // just bail. We're only interested in cases where folding the incoming
+ // constants is at least break-even on all paths.
+ if (FoldedCost > MatCost) {
+ LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
+ << "\n"
+ " Materializing cost: "
+ << MatCost
+ << "\n"
+ " Accumulated folded cost: "
+ << FoldedCost << "\n");
+ return false;
+ }
+ }
+ }
+
+ // Compute the total cost savings afforded by this PHI node.
+ int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
+ for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ int Count = IncomingConstantAndCostsAndCount.second.Count;
+
+ TotalMatCost += MatCost * Count;
+ TotalFoldedCost += FoldedCost * Count;
+ }
+ assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
+ "no more than its materialized cost, "
+ "the sum must be as well.");
+
+ LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+ << ": " << PN << "\n");
+ CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
+ return true;
+}
+
+/// Simple helper to walk all the users of a list of phis depth first, and call
+/// a visit function on each one in post-order.
+///
+/// All of the PHIs should be in the same basic block, and this is primarily
+/// used to make a single depth-first walk across their collective users
+/// without revisiting any subgraphs. Callers should provide a fast, idempotent
+/// callable to test whether a node has been visited and the more important
+/// callable to actually visit a particular node.
+///
+/// Depth-first and postorder here refer to the *operand* graph -- we start
+/// from a collection of users of PHI nodes and walk "up" the operands
+/// depth-first.
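+///
+/// A tiny illustration (hypothetical IR, assuming both users are in the
+/// potential speculation set):
+///
+///   %q = add i32 %p, 1
+///   %m = mul i32 %p, %q
+///
+/// Starting from the uses of the PHI %p, Visit(%q) is always called before
+/// Visit(%m): either %q is reached first as a root, or the DFS from %m
+/// descends through its %q operand before %m itself is visited.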
+template <typename IsVisitedT, typename VisitT>
+static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
+ IsVisitedT IsVisited,
+ VisitT Visit) {
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+ for (auto *PN : PNs)
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ if (IsVisited(UI))
+ // Already visited this user, continue across the roots.
+ continue;
+
+ // Otherwise, walk the operand graph depth-first and visit each
+ // dependency in postorder.
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies,
+ // or instructions outside of our potential dependency set that we
+ // were given. Finally, if we've already visited the node, continue
+ // to the next.
+ if (!OpI || IsVisited(OpI))
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // Finished visiting children, visit this node.
+ assert(!IsVisited(UI) && "Should not have already visited a node!");
+ Visit(UI);
+ } while (!DFSStack.empty());
+ }
+}
+
+/// Find profitable PHIs to speculate.
+///
+/// For a PHI node to be profitable, we need the cost of speculating its users
+/// (and their dependencies) to not exceed the savings of folding the PHI's
+/// constant operands into the speculated users.
+///
+/// Computing this is surprisingly challenging. Because users of two different
+/// PHI nodes can depend on each other or on common other instructions, it may
+/// be profitable to speculate two PHI nodes together even though neither one
+/// in isolation is profitable. The straightforward way to find all the
+/// profitable PHIs would be to check each combination of PHIs' cost, but this
+/// is exponential in complexity.
+///
+/// Even if we assume that we only care about cases where we can consider each
+/// PHI node in isolation (rather than considering cases where none are
+/// profitable in isolation but some subset are profitable as a set), we still
+/// have a challenge. The obvious way to find all individually profitable PHIs
+/// is to iterate until reaching a fixed point, but this will be quadratic in
+/// complexity. =/
+///
+/// This code currently uses a linear-to-compute order for a greedy approach.
+/// It won't find cases where a set of PHIs must be considered together, but it
+/// handles most cases of order dependence without quadratic iteration. The
+/// specific order used is the post-order across the operand DAG. When the last
+/// user of a PHI is visited in this postorder walk, we check it for
+/// profitability.
+///
+/// There is an orthogonal extra complexity to all of this: computing the cost
+/// itself can easily become a linear computation making everything again (at
+/// best) quadratic. Using a postorder over the operand graph makes it
+/// particularly easy to avoid this through dynamic programming. As we do the
+/// postorder walk, we build the transitive cost of that subgraph. It is also
+/// straightforward to then update these costs when we mark a PHI for
+/// speculation so that subsequent PHIs don't re-pay the cost of already
+/// speculated instructions.
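+///
+/// A rough sketch of the memoization (hypothetical IR, assuming both users
+/// are safe to speculate):
+///
+///   %x = add i32 %p1, 1
+///   %y = mul i32 %x, %p2
+///
+/// The postorder walk records the cost of %x when it is first visited. If the
+/// PHI %p1 is accepted for speculation at that point, the memoized cost of %x
+/// is zeroed, so when %y is later costed on behalf of %p2 it only pays for
+/// the `mul`, not for the `add` that will already be speculated.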
+static SmallVector<PHINode *, 16>
+findProfitablePHIs(ArrayRef<PHINode *> PNs,
+ const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
+ SmallVector<PHINode *, 16> SpecPNs;
+
+ // First, establish a reverse mapping from immediate users of the PHI nodes
+ // to the nodes themselves, and count how many users each PHI node has in
+ // a way we can update while processing them.
+ SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
+ SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
+ SmallPtrSet<Instruction *, 16> UserSet;
+ for (auto *PN : PNs) {
+ assert(UserSet.empty() && "Must start with an empty user set!");
+ for (Use &U : PN->uses())
+ UserSet.insert(cast<Instruction>(U.getUser()));
+ PNUserCountMap[PN] = UserSet.size();
+ for (auto *UI : UserSet)
+ UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
+ UserSet.clear();
+ }
+
+ // Now do a DFS across the operand graph of the users, computing cost as we
+ // go and when all costs for a given PHI are known, checking that PHI for
+ // profitability.
+ SmallDenseMap<Instruction *, int, 16> SpecCostMap;
+ visitPHIUsersAndDepsInPostOrder(
+ PNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // We consider anything that isn't potentially speculated to be
+ // "visited" as it is already handled. Similarly, anything that *is*
+ // potentially speculated but for which we have an entry in our cost
+ // map, we're done.
+ return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // We've fully visited the operands, so sum their cost with this node
+ // and update the cost map.
+ int Cost = TTI.TCC_Free;
+ for (Value *OpV : I->operand_values())
+ if (auto *OpI = dyn_cast<Instruction>(OpV)) {
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt != SpecCostMap.end())
+ Cost += CostMapIt->second;
+ }
+ Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+ bool Inserted = SpecCostMap.insert({I, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Must not re-insert a cost during the DFS!");
+
+ // Now check if this node had a corresponding PHI node using it. If so,
+ // we need to decrement the outstanding user count for it.
+ auto UserPNsIt = UserToPNMap.find(I);
+ if (UserPNsIt == UserToPNMap.end())
+ return;
+ auto &UserPNs = UserPNsIt->second;
+ auto UserPNsSplitIt = std::stable_partition(
+ UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
+ int &PNUserCount = PNUserCountMap.find(UserPN)->second;
+ assert(
+ PNUserCount > 0 &&
+ "Should never re-visit a PN after its user count hits zero!");
+ --PNUserCount;
+ return PNUserCount != 0;
+ });
+
+ // FIXME: Rather than one at a time, we should sum the savings as the
+ // cost will be completely shared.
+ SmallVector<Instruction *, 16> SpecWorklist;
+ for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
+ int SpecCost = TTI.TCC_Free;
+ for (Use &U : PN->uses())
+ SpecCost +=
+ SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
+ SpecCost *= (NumPreds - 1);
+ // When the user count of a PHI node hits zero, we should check its
+ // profitability. If profitable, we should mark it for speculation
+ // and zero out the cost of everything it depends on.
+ int CostSavings = CostSavingsMap.find(PN)->second;
+ if (SpecCost > CostSavings) {
+ LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
+ << "\n"
+ " Cost savings: "
+ << CostSavings
+ << "\n"
+ " Speculation cost: "
+ << SpecCost << "\n");
+ continue;
+ }
+
+ // We're going to speculate this user-associated PHI. Copy it out and
+ // add its users to the worklist to update their cost.
+ SpecPNs.push_back(PN);
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ auto CostMapIt = SpecCostMap.find(UI);
+ if (CostMapIt->second == 0)
+ continue;
+ // Zero out this cost entry to avoid duplicates.
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(UI);
+ }
+ }
+
+ // Now walk all the operands of the users in the worklist transitively
+ // to zero out all the memoized costs.
+ while (!SpecWorklist.empty()) {
+ Instruction *SpecI = SpecWorklist.pop_back_val();
+ assert(SpecCostMap.find(SpecI)->second == 0 &&
+ "Didn't zero out a cost!");
+
+ // Walk the operands recursively to zero out their cost as well.
+ for (auto *OpV : SpecI->operand_values()) {
+ auto *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI)
+ continue;
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
+ continue;
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(OpI);
+ }
+ }
+ });
+
+ return SpecPNs;
+}
+
+/// Speculate users around a set of PHI nodes.
+///
+/// This routine does the actual speculation around a set of PHI nodes where we
+/// have determined this to be both safe and profitable.
+///
+/// This routine handles any splitting of critical edges necessary to create
+/// a safe block to speculate into as well as cloning the instructions and
+/// rewriting all uses.
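+///
+/// An illustrative before/after (hypothetical IR, single speculated user, no
+/// critical edges to split):
+///
+///   a:
+///     br label %merge
+///   b:
+///     br label %merge
+///   merge:
+///     %p = phi i32 [ 7, %a ], [ 42, %b ]
+///     %sum = add i32 %x, %p
+///
+/// becomes
+///
+///   a:
+///     %sum.0 = add i32 %x, 7
+///     br label %merge
+///   b:
+///     %sum.1 = add i32 %x, 42
+///     br label %merge
+///   merge:
+///     %sum.phi = phi i32 [ %sum.0, %a ], [ %sum.1, %b ]
+///
+/// Any remaining users of %sum are rewritten to use %sum.phi, after which
+/// %sum and %p are erased.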
+static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallSetVector<BasicBlock *, 16> &PredSet,
+ DominatorTree &DT) {
+ LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+ NumPHIsSpeculated += SpecPNs.size();
+
+ // Split any critical edges so that we have a block to hoist into.
+ auto *ParentBB = SpecPNs[0]->getParent();
+ SmallVector<BasicBlock *, 16> SpecPreds;
+ SpecPreds.reserve(PredSet.size());
+ for (auto *PredBB : PredSet) {
+ auto *NewPredBB = SplitCriticalEdge(
+ PredBB, ParentBB,
+ CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
+ if (NewPredBB) {
+ ++NumEdgesSplit;
+ LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+ << "\n");
+ SpecPreds.push_back(NewPredBB);
+ } else {
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+ assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
+ "Cannot have a non-critical invoke!");
+
+ // Already non-critical, use existing pred.
+ SpecPreds.push_back(PredBB);
+ }
+ }
+
+ SmallPtrSet<Instruction *, 16> SpecSet;
+ SmallVector<Instruction *, 16> SpecList;
+ visitPHIUsersAndDepsInPostOrder(SpecPNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // This is visited if we don't need to
+ // speculate it or we already have
+ // speculated it.
+ return !PotentialSpecSet.count(I) ||
+ SpecSet.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // All operands scheduled, schedule this
+ // node.
+ SpecSet.insert(I);
+ SpecList.push_back(I);
+ });
+
+ int NumSpecInsts = SpecList.size() * SpecPreds.size();
+ int NumRedundantInsts = NumSpecInsts - SpecList.size();
+ LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
+ << " speculated instructions, " << NumRedundantInsts
+ << " redundancies\n");
+ NumSpeculatedInstructions += NumSpecInsts;
+ NumNewRedundantInstructions += NumRedundantInsts;
+
+ // Each predecessor is numbered by its index in `SpecPreds`, so for each
+ // instruction we speculate, the speculated instruction is stored in that
+ // index of the vector associated with the original instruction. We also
+ // store the incoming values for each predecessor from any PHIs used.
+ SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
+
+ // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
+ // value. This handles both the PHIs we are speculating around and any other
+ // PHIs that happen to be used.
+ for (auto *OrigI : SpecList)
+ for (auto *OpV : OrigI->operand_values()) {
+ auto *OpPN = dyn_cast<PHINode>(OpV);
+ if (!OpPN || OpPN->getParent() != ParentBB)
+ continue;
+
+ auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
+ if (!InsertResult.second)
+ continue;
+
+ auto &SpeculatedVals = InsertResult.first->second;
+
+ // Populating our structure for mapping is particularly annoying because
+ // finding an incoming value for a particular predecessor block in a PHI
+ // node is a linear time operation! To avoid quadratic behavior, we build
+ // a map for this PHI node's incoming values and then translate it into
+ // the more compact representation used below.
+ SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
+ for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
+ IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
+
+ for (auto *PredBB : SpecPreds)
+ SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
+ }
+
+ // Speculate into each predecessor.
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
+ auto *PredBB = SpecPreds[PredIdx];
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+
+ for (auto *OrigI : SpecList) {
+ auto *NewI = OrigI->clone();
+ NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
+ NewI->insertBefore(PredBB->getTerminator());
+
+ // Rewrite all the operands to the previously speculated instructions.
+ // Because we're walking in-order, the defs must precede the uses and we
+ // should already have these mappings.
+ for (Use &U : NewI->operands()) {
+ auto *OpI = dyn_cast<Instruction>(U.get());
+ if (!OpI)
+ continue;
+ auto MapIt = SpeculatedValueMap.find(OpI);
+ if (MapIt == SpeculatedValueMap.end())
+ continue;
+ const auto &SpeculatedVals = MapIt->second;
+ assert(SpeculatedVals[PredIdx] &&
+ "Must have a speculated value for this predecessor!");
+ assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
+ "Speculated value has the wrong type!");
+
+ // Rewrite the use to this predecessor's speculated instruction.
+ U.set(SpeculatedVals[PredIdx]);
+ }
+
+ // Commute instructions which now have a constant in the LHS but not the
+ // RHS.
+ if (NewI->isBinaryOp() && NewI->isCommutative() &&
+ isa<Constant>(NewI->getOperand(0)) &&
+ !isa<Constant>(NewI->getOperand(1)))
+ NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
+
+ SpeculatedValueMap[OrigI].push_back(NewI);
+ assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
+ "Mismatched speculated instruction index!");
+ }
+ }
+
+ // Walk the speculated instruction list and if they have uses, insert a PHI
+ // for them from the speculated versions, and replace the uses with the PHI.
+ // Then erase the instructions as they have been fully speculated. The walk
+ // needs to be in reverse so that uses coming from instructions that we will
+ // erase later in this walk are already gone when we check for users.
+ IRBuilder<> IRB(SpecPNs[0]);
+ for (auto *OrigI : llvm::reverse(SpecList)) {
+ // Check if we need a PHI for any remaining users and if so, insert it.
+ if (!OrigI->use_empty()) {
+ auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
+ Twine(OrigI->getName()) + ".phi");
+ // Add the incoming values we speculated.
+ auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
+ SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
+
+ // And replace the uses with the PHI node.
+ OrigI->replaceAllUsesWith(SpecIPN);
+ }
+
+ // It is important to immediately erase this so that it stops using other
+ // instructions. This avoids inserting needless PHIs of them.
+ OrigI->eraseFromParent();
+ }
+
+ // All of the uses of the speculated phi nodes should be removed at this
+ // point, so erase them.
+ for (auto *SpecPN : SpecPNs) {
+ assert(SpecPN->use_empty() && "All users should have been speculated!");
+ SpecPN->eraseFromParent();
+ }
+}
+
+/// Try to speculate around a series of PHIs from a single basic block.
+///
+/// This routine checks whether any of these PHIs are profitable to speculate
+/// users around. If safe and profitable, it does the speculation. It returns
+/// true when at least some speculation occurs.
+static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
+ DominatorTree &DT, TargetTransformInfo &TTI) {
+ LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+
+ // Savings in cost from speculating around a PHI node.
+ SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
+
+ // Remember the set of instructions that are candidates for speculation so
+ // that we can quickly walk things within that space. This prunes out
+ // instructions already available along edges, etc.
+ SmallPtrSet<Instruction *, 16> PotentialSpecSet;
+
+ // Remember the set of instructions that are (transitively) unsafe to
+ // speculate into the incoming edges of this basic block. This avoids
+ // recomputing them for each PHI node we check. This set is specific to this
+ // block though as things are pruned out of it based on what is available
+ // along incoming edges.
+ SmallPtrSet<Instruction *, 16> UnsafeSet;
+
+ // For each PHI node in this block, check whether there are immediate folding
+ // opportunities from speculation, and whether that speculation will be
+ // valid. This determines the set of safe PHIs to speculate.
llvm::erase_if(PNs, [&](PHINode *PN) {
return !isSafeAndProfitableToSpeculateAroundPHI(
*PN, CostSavingsMap, PotentialSpecSet, UnsafeSet, DT, TTI);
});
- // If no PHIs were profitable, skip.
- if (PNs.empty()) {
- LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
- return false;
- }
-
- // We need to know how much speculation will cost which is determined by how
- // many incoming edges will need a copy of each speculated instruction.
- SmallSetVector<BasicBlock *, 16> PredSet;
- for (auto *PredBB : PNs[0]->blocks()) {
- if (!PredSet.insert(PredBB))
- continue;
-
- // We cannot speculate when a predecessor is an indirect branch.
- // FIXME: We also can't reliably create a non-critical edge block for
- // speculation if the predecessor is an invoke. This doesn't seem
- // fundamental and we should probably be splitting critical edges
- // differently.
- const auto *TermInst = PredBB->getTerminator();
- if (isa<IndirectBrInst>(TermInst) ||
- isa<InvokeInst>(TermInst) ||
- isa<CallBrInst>(TermInst)) {
- LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
- << PredBB->getName() << "\n");
- return false;
- }
- }
- if (PredSet.size() < 2) {
- LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
- return false;
- }
-
- SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
- PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
- if (SpecPNs.empty())
- // Nothing to do.
- return false;
-
- speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
- return true;
-}
-
-PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
-
- bool Changed = false;
- for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
- SmallVector<PHINode *, 16> PNs;
- auto BBI = BB->begin();
- while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
- PNs.push_back(PN);
- ++BBI;
- }
-
- if (PNs.empty())
- continue;
-
- Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- return PA;
-}
+ // If no PHIs were profitable, skip.
+ if (PNs.empty()) {
+ LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+ return false;
+ }
+
+ // We need to know how much speculation will cost which is determined by how
+ // many incoming edges will need a copy of each speculated instruction.
+ SmallSetVector<BasicBlock *, 16> PredSet;
+ for (auto *PredBB : PNs[0]->blocks()) {
+ if (!PredSet.insert(PredBB))
+ continue;
+
+ // We cannot speculate when a predecessor is an indirect branch.
+ // FIXME: We also can't reliably create a non-critical edge block for
+ // speculation if the predecessor is an invoke. This doesn't seem
+ // fundamental and we should probably be splitting critical edges
+ // differently.
+ const auto *TermInst = PredBB->getTerminator();
+ if (isa<IndirectBrInst>(TermInst) ||
+ isa<InvokeInst>(TermInst) ||
+ isa<CallBrInst>(TermInst)) {
+ LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
+ << PredBB->getName() << "\n");
+ return false;
+ }
+ }
+ if (PredSet.size() < 2) {
+ LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+ return false;
+ }
+
+ SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
+ PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
+ if (SpecPNs.empty())
+ // Nothing to do.
+ return false;
+
+ speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
+ return true;
+}
+
+PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = false;
+ for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
+ SmallVector<PHINode *, 16> PNs;
+ auto BBI = BB->begin();
+ while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
+ PNs.push_back(PN);
+ ++BBI;
+ }
+
+ if (PNs.empty())
+ continue;
+
+ Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a1fc58d8f3..c78185f2a6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -1,250 +1,250 @@
-//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass hoists instructions to enable speculative execution on
-// targets where branches are expensive. This is aimed at GPUs. It
-// currently works on simple if-then and if-then-else
-// patterns.
-//
-// Removing branches is not the only motivation for this
-// pass. E.g. consider this code and assume that there is no
-// addressing mode for multiplying by sizeof(*a):
-//
-// if (b > 0)
-// c = a[i + 1]
-// if (d > 0)
-// e = a[i + 2]
-//
-// turns into
-//
-// p = &a[i + 1];
-// if (b > 0)
-// c = *p;
-// q = &a[i + 2];
-// if (d > 0)
-// e = *q;
-//
-// which could later be optimized to
-//
-// r = &a[i];
-// if (b > 0)
-// c = r[1];
-// if (d > 0)
-// e = r[2];
-//
-// Later passes sink back much of the speculated code that did not enable
-// further optimization.
-//
-// This pass is more aggressive than the function SpeculativeyExecuteBB in
-// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
-// it will speculate at most one instruction. It also will not speculate if
-// there is a value defined in the if-block that is only used in the then-block.
-// These restrictions make sense since the speculation in SimplifyCFG seems
-// aimed at introducing cheap selects, while this pass is intended to do more
-// aggressive speculation while counting on later passes to either capitalize on
-// that or clean it up.
-//
-// If the pass was created by calling
-// createSpeculativeExecutionIfHasBranchDivergencePass or the
-// -spec-exec-only-if-divergent-target option is present, this pass only has an
-// effect on targets where TargetTransformInfo::hasBranchDivergence() is true;
-// on other targets, it is a nop.
-//
-// This lets you include this pass unconditionally in the IR pass pipeline, but
-// only enable it for relevant targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "speculative-execution"
-
-// The risk that speculation will not pay off increases with the
-// number of instructions speculated, so we put a limit on that.
-static cl::opt<unsigned> SpecExecMaxSpeculationCost(
- "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
- cl::desc("Speculative execution is not applied to basic blocks where "
- "the cost of the instructions to speculatively execute "
- "exceeds this limit."));
-
-// Speculating just a few instructions from a larger block tends not
-// to be profitable and this limit prevents that. A reason for that is
-// that small basic blocks are more likely to be candidates for
-// further optimization.
-static cl::opt<unsigned> SpecExecMaxNotHoisted(
- "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden,
- cl::desc("Speculative execution is not applied to basic blocks where the "
- "number of instructions that would not be speculatively executed "
- "exceeds this limit."));
-
-static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
- "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden,
- cl::desc("Speculative execution is applied only to targets with divergent "
- "branches, even if the pass was configured to apply only to all "
- "targets."));
-
-namespace {
-
-class SpeculativeExecutionLegacyPass : public FunctionPass {
-public:
- static char ID;
- explicit SpeculativeExecutionLegacyPass(bool OnlyIfDivergentTarget = false)
- : FunctionPass(ID), OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
- SpecExecOnlyIfDivergentTarget),
- Impl(OnlyIfDivergentTarget) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-
- StringRef getPassName() const override {
- if (OnlyIfDivergentTarget)
- return "Speculatively execute instructions if target has divergent "
- "branches";
- return "Speculatively execute instructions";
- }
-
-private:
- // Variable preserved purely for correct name printing.
- const bool OnlyIfDivergentTarget;
-
- SpeculativeExecutionPass Impl;
-};
-} // namespace
-
-char SpeculativeExecutionLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(SpeculativeExecutionLegacyPass, "speculative-execution",
- "Speculatively execute instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
- "Speculatively execute instructions", false, false)
-
-void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
-}
-
-bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return Impl.runImpl(F, TTI);
-}
-
-namespace llvm {
-
-bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
- if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
- LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
- "TTI->hasBranchDivergence() is false.\n");
- return false;
- }
-
- this->TTI = TTI;
- bool Changed = false;
- for (auto& B : F) {
- Changed |= runOnBasicBlock(B);
- }
- return Changed;
-}
-
-bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
- BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
- if (BI == nullptr)
- return false;
-
- if (BI->getNumSuccessors() != 2)
- return false;
- BasicBlock &Succ0 = *BI->getSuccessor(0);
- BasicBlock &Succ1 = *BI->getSuccessor(1);
-
- if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) {
- return false;
- }
-
- // Hoist from if-then (triangle).
- if (Succ0.getSinglePredecessor() != nullptr &&
- Succ0.getSingleSuccessor() == &Succ1) {
- return considerHoistingFromTo(Succ0, B);
- }
-
- // Hoist from if-else (triangle).
- if (Succ1.getSinglePredecessor() != nullptr &&
- Succ1.getSingleSuccessor() == &Succ0) {
- return considerHoistingFromTo(Succ1, B);
- }
-
- // Hoist from if-then-else (diamond), but only if it is equivalent to
- // an if-else or if-then due to one of the branches doing nothing.
- if (Succ0.getSinglePredecessor() != nullptr &&
- Succ1.getSinglePredecessor() != nullptr &&
- Succ1.getSingleSuccessor() != nullptr &&
- Succ1.getSingleSuccessor() != &B &&
- Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) {
- // If a block has only one instruction, then that is a terminator
- // instruction so that the block does nothing. This does happen.
- if (Succ1.size() == 1) // equivalent to if-then
- return considerHoistingFromTo(Succ0, B);
- if (Succ0.size() == 1) // equivalent to if-else
- return considerHoistingFromTo(Succ1, B);
- }
-
- return false;
-}
-
-static unsigned ComputeSpeculationCost(const Instruction *I,
- const TargetTransformInfo &TTI) {
- switch (Operator::getOpcode(I)) {
- case Instruction::GetElementPtr:
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Select:
- case Instruction::Shl:
- case Instruction::Sub:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::Xor:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::Call:
- case Instruction::BitCast:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::AddrSpaceCast:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPExt:
- case Instruction::FPTrunc:
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::FNeg:
- case Instruction::ICmp:
- case Instruction::FCmp:
+//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists instructions to enable speculative execution on
+// targets where branches are expensive. This is aimed at GPUs. It
+// currently works on simple if-then and if-then-else
+// patterns.
+//
+// Removing branches is not the only motivation for this
+// pass. E.g. consider this code and assume that there is no
+// addressing mode for multiplying by sizeof(*a):
+//
+// if (b > 0)
+// c = a[i + 1]
+// if (d > 0)
+// e = a[i + 2]
+//
+// turns into
+//
+// p = &a[i + 1];
+// if (b > 0)
+// c = *p;
+// q = &a[i + 2];
+// if (d > 0)
+// e = *q;
+//
+// which could later be optimized to
+//
+// r = &a[i];
+// if (b > 0)
+// c = r[1];
+// if (d > 0)
+// e = r[2];
+//
+// Later passes sink back much of the speculated code that did not enable
+// further optimization.
+//
+// This pass is more aggressive than the function SpeculativelyExecuteBB in
+// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
+// it will speculate at most one instruction. It also will not speculate if
+// there is a value defined in the if-block that is only used in the then-block.
+// These restrictions make sense since the speculation in SimplifyCFG seems
+// aimed at introducing cheap selects, while this pass is intended to do more
+// aggressive speculation while counting on later passes to either capitalize on
+// that or clean it up.
+//
+// If the pass was created by calling
+// createSpeculativeExecutionIfHasBranchDivergencePass or the
+// -spec-exec-only-if-divergent-target option is present, this pass only has an
+// effect on targets where TargetTransformInfo::hasBranchDivergence() is true;
+// on other targets, it is a nop.
+//
+// This lets you include this pass unconditionally in the IR pass pipeline, but
+// only enable it for relevant targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "speculative-execution"
+
+// The risk that speculation will not pay off increases with the
+// number of instructions speculated, so we put a limit on that.
+static cl::opt<unsigned> SpecExecMaxSpeculationCost(
+ "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where "
+ "the cost of the instructions to speculatively execute "
+ "exceeds this limit."));
+
+// Speculating just a few instructions from a larger block tends not
+// to be profitable and this limit prevents that. A reason for that is
+// that small basic blocks are more likely to be candidates for
+// further optimization.
+static cl::opt<unsigned> SpecExecMaxNotHoisted(
+ "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where the "
+ "number of instructions that would not be speculatively executed "
+ "exceeds this limit."));
+
+static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
+ "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden,
+ cl::desc("Speculative execution is applied only to targets with divergent "
+ "branches, even if the pass was configured to apply to all "
+ "targets."));
+
+namespace {
+
+class SpeculativeExecutionLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ explicit SpeculativeExecutionLegacyPass(bool OnlyIfDivergentTarget = false)
+ : FunctionPass(ID), OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget),
+ Impl(OnlyIfDivergentTarget) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ if (OnlyIfDivergentTarget)
+ return "Speculatively execute instructions if target has divergent "
+ "branches";
+ return "Speculatively execute instructions";
+ }
+
+private:
+ // Variable preserved purely for correct name printing.
+ const bool OnlyIfDivergentTarget;
+
+ SpeculativeExecutionPass Impl;
+};
+} // namespace
+
+char SpeculativeExecutionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+
+void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+}
+
+bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return Impl.runImpl(F, TTI);
+}
+
+namespace llvm {
+
+bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
+ if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
+ LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
+ return false;
+ }
+
+ this->TTI = TTI;
+ bool Changed = false;
+ for (auto& B : F) {
+ Changed |= runOnBasicBlock(B);
+ }
+ return Changed;
+}
+
+bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
+ BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
+ if (BI == nullptr)
+ return false;
+
+ if (BI->getNumSuccessors() != 2)
+ return false;
+ BasicBlock &Succ0 = *BI->getSuccessor(0);
+ BasicBlock &Succ1 = *BI->getSuccessor(1);
+
+ if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) {
+ return false;
+ }
+
+ // Hoist from if-then (triangle).
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ0.getSingleSuccessor() == &Succ1) {
+ return considerHoistingFromTo(Succ0, B);
+ }
+
+ // Hoist from if-else (triangle).
+ if (Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() == &Succ0) {
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ // Hoist from if-then-else (diamond), but only if it is equivalent to
+ // an if-else or if-then due to one of the branches doing nothing.
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() != nullptr &&
+ Succ1.getSingleSuccessor() != &B &&
+ Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) {
+ // If a block has only one instruction, then that is a terminator
+ // instruction, so the block does nothing. This does happen.
+ if (Succ1.size() == 1) // equivalent to if-then
+ return considerHoistingFromTo(Succ0, B);
+ if (Succ0.size() == 1) // equivalent to if-else
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ return false;
+}
+
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::FNeg:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
case Instruction::Trunc:
case Instruction::Freeze:
case Instruction::ExtractElement:
@@ -252,96 +252,96 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
case Instruction::ShuffleVector:
case Instruction::ExtractValue:
case Instruction::InsertValue:
- return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
-
- default:
- return UINT_MAX; // Disallow anything not explicitly listed.
- }
-}
-
-bool SpeculativeExecutionPass::considerHoistingFromTo(
- BasicBlock &FromBlock, BasicBlock &ToBlock) {
- SmallPtrSet<const Instruction *, 8> NotHoisted;
- const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](const User *U) {
- // Debug variable has special operand to check it's not hoisted.
- if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) {
- if (const auto *I =
- dyn_cast_or_null<Instruction>(DVI->getVariableLocation()))
- if (NotHoisted.count(I) == 0)
- return true;
- return false;
- }
-
- // Usially debug label instrinsic corresponds to label in LLVM IR. In these
- // cases we should not move it here.
- // TODO: Possible special processing needed to detect it is related to a
- // hoisted instruction.
- if (isa<DbgLabelInst>(U))
- return false;
-
- for (const Value *V : U->operand_values()) {
- if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+
+ default:
+ return UINT_MAX; // Disallow anything not explicitly listed.
+ }
+}
+
+bool SpeculativeExecutionPass::considerHoistingFromTo(
+ BasicBlock &FromBlock, BasicBlock &ToBlock) {
+ SmallPtrSet<const Instruction *, 8> NotHoisted;
+ const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](const User *U) {
+ // Debug variable intrinsics have a special location operand; only hoist
+ // them when that operand is an instruction that is not in NotHoisted.
+ if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) {
+ if (const auto *I =
+ dyn_cast_or_null<Instruction>(DVI->getVariableLocation()))
+ if (NotHoisted.count(I) == 0)
+ return true;
+ return false;
+ }
+
+ // Usually a debug label intrinsic corresponds to a label in LLVM IR. In
+ // these cases we should not move it here.
+ // TODO: Possible special processing needed to detect it is related to a
+ // hoisted instruction.
+ if (isa<DbgLabelInst>(U))
+ return false;
+
+ for (const Value *V : U->operand_values()) {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
if (NotHoisted.contains(I))
- return false;
- }
- }
- return true;
- };
-
- unsigned TotalSpeculationCost = 0;
- unsigned NotHoistedInstCount = 0;
- for (const auto &I : FromBlock) {
- const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
- if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
- AllPrecedingUsesFromBlockHoisted(&I)) {
- TotalSpeculationCost += Cost;
- if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
- return false; // too much to hoist
- } else {
- // Debug info instrinsics should not be counted for threshold.
- if (!isa<DbgInfoIntrinsic>(I))
- NotHoistedInstCount++;
- if (NotHoistedInstCount > SpecExecMaxNotHoisted)
- return false; // too much left behind
- NotHoisted.insert(&I);
- }
- }
-
- for (auto I = FromBlock.begin(); I != FromBlock.end();) {
- // We have to increment I before moving Current as moving Current
- // changes the list that I is iterating through.
- auto Current = I;
- ++I;
- if (!NotHoisted.count(&*Current)) {
- Current->moveBefore(ToBlock.getTerminator());
- }
- }
- return true;
-}
-
-FunctionPass *createSpeculativeExecutionPass() {
- return new SpeculativeExecutionLegacyPass();
-}
-
-FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
- return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
-}
-
-SpeculativeExecutionPass::SpeculativeExecutionPass(bool OnlyIfDivergentTarget)
- : OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
- SpecExecOnlyIfDivergentTarget) {}
-
-PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
-
- bool Changed = runImpl(F, TTI);
-
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-} // namespace llvm
+ return false;
+ }
+ }
+ return true;
+ };
+
+ unsigned TotalSpeculationCost = 0;
+ unsigned NotHoistedInstCount = 0;
+ for (const auto &I : FromBlock) {
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
+ if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
+ AllPrecedingUsesFromBlockHoisted(&I)) {
+ TotalSpeculationCost += Cost;
+ if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
+ return false; // too much to hoist
+ } else {
+ // Debug info intrinsics should not be counted towards the threshold.
+ if (!isa<DbgInfoIntrinsic>(I))
+ NotHoistedInstCount++;
+ if (NotHoistedInstCount > SpecExecMaxNotHoisted)
+ return false; // too much left behind
+ NotHoisted.insert(&I);
+ }
+ }
+
+ for (auto I = FromBlock.begin(); I != FromBlock.end();) {
+ // We have to increment I before moving Current as moving Current
+ // changes the list that I is iterating through.
+ auto Current = I;
+ ++I;
+ if (!NotHoisted.count(&*Current)) {
+ Current->moveBefore(ToBlock.getTerminator());
+ }
+ }
+ return true;
+}
+
+FunctionPass *createSpeculativeExecutionPass() {
+ return new SpeculativeExecutionLegacyPass();
+}
+
+FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+ return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
+}
+
+SpeculativeExecutionPass::SpeculativeExecutionPass(bool OnlyIfDivergentTarget)
+ : OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget) {}
+
+PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = runImpl(F, TTI);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 9aa2663941..577992ccb5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -1,105 +1,105 @@
-//===- StraightLineStrengthReduce.cpp - -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements straight-line strength reduction (SLSR). Unlike loop
-// strength reduction, this algorithm is designed to reduce arithmetic
-// redundancy in straight-line code instead of loops. It has proven to be
-// effective in simplifying arithmetic statements derived from an unrolled loop.
-// It can also simplify the logic of SeparateConstOffsetFromGEP.
-//
-// There are many optimizations we can perform in the domain of SLSR. This file
-// for now contains only an initial step. Specifically, we look for strength
-// reduction candidates in the following forms:
-//
-// Form 1: B + i * S
-// Form 2: (B + i) * S
-// Form 3: &B[i * S]
-//
-// where S is an integer variable, and i is a constant integer. If we found two
-// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
-// in a simpler way with respect to S1. For example,
-//
-// S1: X = B + i * S
-// S2: Y = B + i' * S => X + (i' - i) * S
-//
-// S1: X = (B + i) * S
-// S2: Y = (B + i') * S => X + (i' - i) * S
-//
-// S1: X = &B[i * S]
-// S2: Y = &B[i' * S] => &X[(i' - i) * S]
-//
-// Note: (i' - i) * S is folded to the extent possible.
-//
-// This rewriting is in general a good idea. The code patterns we focus on
-// usually come from loop unrolling, so (i' - i) * S is likely the same
-// across iterations and can be reused. When that happens, the optimized form
-// takes only one add starting from the second iteration.
-//
-// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
-// multiple bases, we choose to rewrite S2 with respect to its "immediate"
-// basis, the basis that is the closest ancestor in the dominator tree.
-//
-// TODO:
-//
-// - Floating point arithmetics when fast math is enabled.
-//
-// - SLSR may decrease ILP at the architecture level. Targets that are very
-// sensitive to ILP may want to disable it. Having SLSR to consider ILP is
-// left as future work.
-//
-// - When (i' - i) is constant but i and i' are not, we could still perform
-// SLSR.
-
+//===- StraightLineStrengthReduce.cpp - -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
+//
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
+//
+// where S is an integer variable, and i is a constant integer. If we find two
+// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
+// in a simpler way with respect to S1. For example,
+//
+// S1: X = B + i * S
+// S2: Y = B + i' * S => X + (i' - i) * S
+//
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S => X + (i' - i) * S
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S] => &X[(i' - i) * S]
+//
+// Note: (i' - i) * S is folded to the extent possible.
+//
+// This rewriting is in general a good idea. The code patterns we focus on
+// usually come from loop unrolling, so (i' - i) * S is likely the same
+// across iterations and can be reused. When that happens, the optimized form
+// takes only one add starting from the second iteration.
+//
+// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
+// multiple bases, we choose to rewrite S2 with respect to its "immediate"
+// basis, the basis that is the closest ancestor in the dominator tree.
+//
+// TODO:
+//
+// - Floating point arithmetic when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Having SLSR consider ILP is
+// left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+// SLSR.
+
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <list>
-#include <vector>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-static const unsigned UnknownAddressSpace =
- std::numeric_limits<unsigned>::max();
-
-namespace {
-
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <list>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+namespace {
+
class StraightLineStrengthReduceLegacyPass : public FunctionPass {
const DataLayout *DL = nullptr;
-public:
+public:
static char ID;
StraightLineStrengthReduceLegacyPass() : FunctionPass(ID) {
@@ -129,596 +129,596 @@ public:
ScalarEvolution *SE, TargetTransformInfo *TTI)
: DL(DL), DT(DT), SE(SE), TTI(TTI) {}
- // SLSR candidate. Such a candidate must be in one of the forms described in
- // the header comments.
- struct Candidate {
- enum Kind {
- Invalid, // reserved for the default constructor
- Add, // B + i * S
- Mul, // (B + i) * S
- GEP, // &B[..][i * S][..]
- };
-
- Candidate() = default;
- Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
- Instruction *I)
- : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
-
- Kind CandidateKind = Invalid;
-
- const SCEV *Base = nullptr;
-
- // Note that Index and Stride of a GEP candidate do not necessarily have the
- // same integer type. In that case, during rewriting, Stride will be
- // sign-extended or truncated to Index's type.
- ConstantInt *Index = nullptr;
-
- Value *Stride = nullptr;
-
- // The instruction this candidate corresponds to. It helps us to rewrite a
- // candidate with respect to its immediate basis. Note that one instruction
- // can correspond to multiple candidates depending on how you associate the
- // expression. For instance,
- //
- // (a + 1) * (b + 2)
- //
- // can be treated as
- //
- // <Base: a, Index: 1, Stride: b + 2>
- //
- // or
- //
- // <Base: b, Index: 2, Stride: a + 1>
- Instruction *Ins = nullptr;
-
- // Points to the immediate basis of this candidate, or nullptr if we cannot
- // find any basis for this candidate.
- Candidate *Basis = nullptr;
- };
-
+ // SLSR candidate. Such a candidate must be in one of the forms described in
+ // the header comments.
+ struct Candidate {
+ enum Kind {
+ Invalid, // reserved for the default constructor
+ Add, // B + i * S
+ Mul, // (B + i) * S
+ GEP, // &B[..][i * S][..]
+ };
+
+ Candidate() = default;
+ Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
+
+ Kind CandidateKind = Invalid;
+
+ const SCEV *Base = nullptr;
+
+ // Note that Index and Stride of a GEP candidate do not necessarily have the
+ // same integer type. In that case, during rewriting, Stride will be
+ // sign-extended or truncated to Index's type.
+ ConstantInt *Index = nullptr;
+
+ Value *Stride = nullptr;
+
+ // The instruction this candidate corresponds to. It helps us to rewrite a
+ // candidate with respect to its immediate basis. Note that one instruction
+ // can correspond to multiple candidates depending on how you associate the
+ // expression. For instance,
+ //
+ // (a + 1) * (b + 2)
+ //
+ // can be treated as
+ //
+ // <Base: a, Index: 1, Stride: b + 2>
+ //
+ // or
+ //
+ // <Base: b, Index: 2, Stride: a + 1>
+ Instruction *Ins = nullptr;
+
+ // Points to the immediate basis of this candidate, or nullptr if we cannot
+ // find any basis for this candidate.
+ Candidate *Basis = nullptr;
+ };
+
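For illustration, a standalone C++ sketch (not LLVM code; the ToyCandidate type and its fields are invented here) of how a single multiply yields two Mul candidates, mirroring the (a + 1) * (b + 2) example in the comment above:

#include <cstdint>
#include <iostream>
#include <string>

// Plain-value stand-in for the pass's Candidate: Base, Index and Stride are
// kept as strings and integers instead of SCEVs and ConstantInts.
struct ToyCandidate {
  enum Kind { Add, Mul, GEP } CandidateKind;
  std::string Base;   // symbolic base B
  int64_t Index;      // constant index i
  std::string Stride; // symbolic stride S
};

int main() {
  // (a + 1) * (b + 2) can be associated either way, so the same instruction
  // produces two Mul candidates of the form (B + i) * S.
  ToyCandidate C1{ToyCandidate::Mul, "a", 1, "b + 2"};
  ToyCandidate C2{ToyCandidate::Mul, "b", 2, "a + 1"};
  std::cout << "(" << C1.Base << " + " << C1.Index << ") * (" << C1.Stride << ")\n";
  std::cout << "(" << C2.Base << " + " << C2.Index << ") * (" << C2.Stride << ")\n";
  return 0;
}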
bool runOnFunction(Function &F);
-
-private:
- // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
- // share the same base and stride.
- bool isBasisFor(const Candidate &Basis, const Candidate &C);
-
- // Returns whether the candidate can be folded into an addressing mode.
- bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
- const DataLayout *DL);
-
- // Returns true if C is already in a simplest form and not worth being
- // rewritten.
- bool isSimplestForm(const Candidate &C);
-
- // Checks whether I is in a candidate form. If so, adds all the matching forms
- // to Candidates, and tries to find the immediate basis for each of them.
- void allocateCandidatesAndFindBasis(Instruction *I);
-
- // Allocate candidates and find bases for Add instructions.
- void allocateCandidatesAndFindBasisForAdd(Instruction *I);
-
- // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
- // candidate.
- void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
- Instruction *I);
- // Allocate candidates and find bases for Mul instructions.
- void allocateCandidatesAndFindBasisForMul(Instruction *I);
-
- // Splits LHS into Base + Index and, if it succeeds, calls
- // allocateCandidatesAndFindBasis.
- void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
- Instruction *I);
-
- // Allocate candidates and find bases for GetElementPtr instructions.
- void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
-
- // A helper function that scales Idx with ElementSize before invoking
- // allocateCandidatesAndFindBasis.
- void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
- Value *S, uint64_t ElementSize,
- Instruction *I);
-
- // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
- // basis.
- void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
- ConstantInt *Idx, Value *S,
- Instruction *I);
-
- // Rewrites candidate C with respect to Basis.
- void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
-
- // A helper function that factors ArrayIdx into a product of a stride and a
- // constant index, and invokes allocateCandidatesAndFindBasis with the
- // factorings.
- void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
- GetElementPtrInst *GEP);
-
- // Emit code that computes the "bump" from Basis to C. If the candidate is a
- // GEP and the bump is not divisible by the element size of the GEP, this
- // function sets the BumpWithUglyGEP flag to notify its caller to bump the
- // basis using an ugly GEP.
- static Value *emitBump(const Candidate &Basis, const Candidate &C,
- IRBuilder<> &Builder, const DataLayout *DL,
- bool &BumpWithUglyGEP);
-
- const DataLayout *DL = nullptr;
- DominatorTree *DT = nullptr;
- ScalarEvolution *SE;
- TargetTransformInfo *TTI = nullptr;
- std::list<Candidate> Candidates;
-
- // Temporarily holds all instructions that are unlinked (but not deleted) by
- // rewriteCandidateWithBasis. These instructions will be actually removed
- // rewriteCandidateWithBasis. These instructions will actually be removed
- std::vector<Instruction *> UnlinkedInstructions;
-};
-
-} // end anonymous namespace
-
+
+private:
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base and stride.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+
+ // Returns whether the candidate can be folded into an addressing mode.
+ bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+ const DataLayout *DL);
+
+ // Returns true if C is already in a simplest form and not worth being
+ // rewritten.
+ bool isSimplestForm(const Candidate &C);
+
+ // Checks whether I is in a candidate form. If so, adds all the matching forms
+ // to Candidates, and tries to find the immediate basis for each of them.
+ void allocateCandidatesAndFindBasis(Instruction *I);
+
+ // Allocate candidates and find bases for Add instructions.
+ void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+
+ // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
+ // candidate.
+ void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for Mul instructions.
+ void allocateCandidatesAndFindBasisForMul(Instruction *I);
+
+ // Splits LHS into Base + Index and, if it succeeds, calls
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
+ Instruction *I);
+
+ // Allocate candidates and find bases for GetElementPtr instructions.
+ void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+
+ // A helper function that scales Idx with ElementSize before invoking
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+
+ // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+ // basis.
+ void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
+ ConstantInt *Idx, Value *S,
+ Instruction *I);
+
+ // Rewrites candidate C with respect to Basis.
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+ // A helper function that factors ArrayIdx into a product of a stride and a
+ // constant index, and invokes allocateCandidatesAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
+
+ // Emit code that computes the "bump" from Basis to C. If the candidate is a
+ // GEP and the bump is not divisible by the element size of the GEP, this
+ // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+ // basis using an ugly GEP.
+ static Value *emitBump(const Candidate &Basis, const Candidate &C,
+ IRBuilder<> &Builder, const DataLayout *DL,
+ bool &BumpWithUglyGEP);
+
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI = nullptr;
+ std::list<Candidate> Candidates;
+
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will be actually removed
+ // rewriteCandidateWithBasis. These instructions will actually be removed
+ std::vector<Instruction *> UnlinkedInstructions;
+};
+
+} // end anonymous namespace
+
char StraightLineStrengthReduceLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(StraightLineStrengthReduceLegacyPass, "slsr",
- "Straight line strength reduction", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+ "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(StraightLineStrengthReduceLegacyPass, "slsr",
- "Straight line strength reduction", false, false)
-
-FunctionPass *llvm::createStraightLineStrengthReducePass() {
+ "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
return new StraightLineStrengthReduceLegacyPass();
-}
-
-bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
- const Candidate &C) {
- return (Basis.Ins != C.Ins && // skip the same instruction
- // They must have the same type too. Basis.Base == C.Base doesn't
- // guarantee their types are the same (PR23975).
- Basis.Ins->getType() == C.Ins->getType() &&
- // Basis must dominate C in order to rewrite C with respect to Basis.
- DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
- // They share the same base, stride, and candidate kind.
- Basis.Base == C.Base && Basis.Stride == C.Stride &&
- Basis.CandidateKind == C.CandidateKind);
-}
-
-static bool isGEPFoldable(GetElementPtrInst *GEP,
- const TargetTransformInfo *TTI) {
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+ const Candidate &C) {
+ return (Basis.Ins != C.Ins && // skip the same instruction
+ // They must have the same type too. Basis.Base == C.Base doesn't
+ // guarantee their types are the same (PR23975).
+ Basis.Ins->getType() == C.Ins->getType() &&
+ // Basis must dominate C in order to rewrite C with respect to Basis.
+ DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+ // They share the same base, stride, and candidate kind.
+ Basis.Base == C.Base && Basis.Stride == C.Stride &&
+ Basis.CandidateKind == C.CandidateKind);
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
SmallVector<const Value *, 4> Indices(GEP->indices());
- return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
- Indices) == TargetTransformInfo::TCC_Free;
-}
-
-// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
-static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
- TargetTransformInfo *TTI) {
- // Index->getSExtValue() may crash if Index is wider than 64-bit.
- return Index->getBitWidth() <= 64 &&
- TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
- Index->getSExtValue(), UnknownAddressSpace);
-}
-
-bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
- TargetTransformInfo *TTI,
- const DataLayout *DL) {
- if (C.CandidateKind == Candidate::Add)
- return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
- if (C.CandidateKind == Candidate::GEP)
- return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI);
- return false;
-}
-
-// Returns true if GEP has zero or one non-zero index.
-static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
- unsigned NumNonZeroIndices = 0;
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
- ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
- if (ConstIdx == nullptr || !ConstIdx->isZero())
- ++NumNonZeroIndices;
- }
- return NumNonZeroIndices <= 1;
-}
-
-bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
- if (C.CandidateKind == Candidate::Add) {
- // B + 1 * S or B + (-1) * S
- return C.Index->isOne() || C.Index->isMinusOne();
- }
- if (C.CandidateKind == Candidate::Mul) {
- // (B + 0) * S
- return C.Index->isZero();
- }
- if (C.CandidateKind == Candidate::GEP) {
- // (char*)B + S or (char*)B - S
- return ((C.Index->isOne() || C.Index->isMinusOne()) &&
- hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
- }
- return false;
-}
-
-// TODO: We currently implement an algorithm whose time complexity is linear in
-// the number of existing candidates. However, we could do better by using
-// ScopedHashTable. Specifically, while traversing the dominator tree, we could
-// maintain all the candidates that dominate the basic block being traversed in
-// a ScopedHashTable. This hash table is indexed by the base and the stride of
-// a candidate. Therefore, finding the immediate basis of a candidate boils down
-// to one hash-table look up.
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
- Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
- Instruction *I) {
- Candidate C(CT, B, Idx, S, I);
- // SLSR can complicate an instruction in two cases:
- //
- // 1. If we can fold I into an addressing mode, computing I is likely free or
- // takes only one instruction.
- //
- // 2. I is already in a simplest form. For example, when
- // X = B + 8 * S
- // Y = B + S,
- // rewriting Y to X - 7 * S is probably a bad idea.
- //
- // In the above cases, we still add I to the candidate list so that I can be
- // the basis of other candidates, but we leave I's basis blank so that I
- // won't be rewritten.
- if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
- // Try to compute the immediate basis of C.
- unsigned NumIterations = 0;
- // Limit the scan radius to avoid running in quadratic time.
- static const unsigned MaxNumIterations = 50;
- for (auto Basis = Candidates.rbegin();
- Basis != Candidates.rend() && NumIterations < MaxNumIterations;
- ++Basis, ++NumIterations) {
- if (isBasisFor(*Basis, C)) {
- C.Basis = &(*Basis);
- break;
- }
- }
- }
- // Regardless of whether we find a basis for C, we need to push C to the
- // candidate list so that it can be the basis of other candidates.
- Candidates.push_back(C);
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
- Instruction *I) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- allocateCandidatesAndFindBasisForAdd(I);
- break;
- case Instruction::Mul:
- allocateCandidatesAndFindBasisForMul(I);
- break;
- case Instruction::GetElementPtr:
- allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
- break;
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
- Instruction *I) {
- // Try matching B + i * S.
- if (!isa<IntegerType>(I->getType()))
- return;
-
- assert(I->getNumOperands() == 2 && "isn't I an add?");
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
- allocateCandidatesAndFindBasisForAdd(LHS, RHS, I);
- if (LHS != RHS)
- allocateCandidatesAndFindBasisForAdd(RHS, LHS, I);
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
- Value *LHS, Value *RHS, Instruction *I) {
- Value *S = nullptr;
- ConstantInt *Idx = nullptr;
- if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) {
- // I = LHS + RHS = LHS + Idx * S
- allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
- } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) {
- // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx)
- APInt One(Idx->getBitWidth(), 1);
- Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue());
- allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
- } else {
- // At least, I = LHS + 1 * RHS
- ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
- allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS,
- I);
- }
-}
-
-// Returns true if A matches B + C where C is constant.
-static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
- return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
- match(A, m_Add(m_ConstantInt(C), m_Value(B))));
-}
-
-// Returns true if A matches B | C where C is constant.
-static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
- return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
- match(A, m_Or(m_ConstantInt(C), m_Value(B))));
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
- Value *LHS, Value *RHS, Instruction *I) {
- Value *B = nullptr;
- ConstantInt *Idx = nullptr;
- if (matchesAdd(LHS, B, Idx)) {
- // If LHS is in the form of "Base + Index", then I is in the form of
- // "(Base + Index) * RHS".
- allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
- } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) {
- // If LHS is in the form of "Base | Index" and Base and Index have no common
- // bits set, then
- // Base | Index = Base + Index
- // and I is thus in the form of "(Base + Index) * RHS".
- allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
- } else {
- // Otherwise, at least try the form (LHS + 0) * RHS.
- ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
- allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
- I);
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
- Instruction *I) {
- // Try matching (B + i) * S.
- // TODO: we could extend SLSR to float and vector types.
- if (!isa<IntegerType>(I->getType()))
- return;
-
- assert(I->getNumOperands() == 2 && "isn't I a mul?");
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
- allocateCandidatesAndFindBasisForMul(LHS, RHS, I);
- if (LHS != RHS) {
- // Symmetrically, try to split RHS to Base + Index.
- allocateCandidatesAndFindBasisForMul(RHS, LHS, I);
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
- const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
- Instruction *I) {
- // I = B + sext(Idx *nsw S) * ElementSize
- // = B + (sext(Idx) * sext(S)) * ElementSize
- // = B + (sext(Idx) * ElementSize) * sext(S)
- // Casting to IntegerType is safe because we skipped vector GEPs.
- IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
- ConstantInt *ScaledIdx = ConstantInt::get(
- IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
- allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
-}
-
-void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
- const SCEV *Base,
- uint64_t ElementSize,
- GetElementPtrInst *GEP) {
- // At least, ArrayIdx = ArrayIdx *nsw 1.
- allocateCandidatesAndFindBasisForGEP(
- Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
- ArrayIdx, ElementSize, GEP);
- Value *LHS = nullptr;
- ConstantInt *RHS = nullptr;
- // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
- // itself. This would allow us to handle the shl case for free. However,
- // matching SCEVs has two issues:
- //
- // 1. this would complicate rewriting because the rewriting procedure
- // would have to translate SCEVs back to IR instructions. This translation
- // is difficult when LHS is further evaluated to a composite SCEV.
- //
- // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
- // to strip nsw/nuw flags which are critical for SLSR to trace into
- // sext'ed multiplication.
- if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
- // SLSR is currently unsafe if i * S may overflow.
- // GEP = Base + sext(LHS *nsw RHS) * ElementSize
- allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
- } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
- // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
- // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
- APInt One(RHS->getBitWidth(), 1);
- ConstantInt *PowerOf2 =
- ConstantInt::get(RHS->getContext(), One << RHS->getValue());
- allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
- GetElementPtrInst *GEP) {
- // TODO: handle vector GEPs
- if (GEP->getType()->isVectorTy())
- return;
-
- SmallVector<const SCEV *, 4> IndexExprs;
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
- IndexExprs.push_back(SE->getSCEV(*I));
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isStruct())
- continue;
-
- const SCEV *OrigIndexExpr = IndexExprs[I - 1];
- IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());
-
- // The base of this candidate is GEP's base plus the offsets of all
- // indices except this current one.
- const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
- Value *ArrayIdx = GEP->getOperand(I);
- uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
- if (ArrayIdx->getType()->getIntegerBitWidth() <=
- DL->getPointerSizeInBits(GEP->getAddressSpace())) {
- // Skip factoring if ArrayIdx is wider than the pointer size, because
- // ArrayIdx is implicitly truncated to the pointer size.
- factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
- }
- // When ArrayIdx is the sext of a value, we try to factor that value as
- // well. Handling this case is important because array indices are
- // typically sign-extended to the pointer size.
- Value *TruncatedArrayIdx = nullptr;
- if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) &&
- TruncatedArrayIdx->getType()->getIntegerBitWidth() <=
- DL->getPointerSizeInBits(GEP->getAddressSpace())) {
- // Skip factoring if TruncatedArrayIdx is wider than the pointer size,
- // because TruncatedArrayIdx is implicitly truncated to the pointer size.
- factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
- }
-
- IndexExprs[I - 1] = OrigIndexExpr;
- }
-}
-
-// A helper function that unifies the bitwidth of A and B.
-static void unifyBitWidth(APInt &A, APInt &B) {
- if (A.getBitWidth() < B.getBitWidth())
- A = A.sext(B.getBitWidth());
- else if (A.getBitWidth() > B.getBitWidth())
- B = B.sext(A.getBitWidth());
-}
-
-Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
- const Candidate &C,
- IRBuilder<> &Builder,
- const DataLayout *DL,
- bool &BumpWithUglyGEP) {
- APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
- unifyBitWidth(Idx, BasisIdx);
- APInt IndexOffset = Idx - BasisIdx;
-
- BumpWithUglyGEP = false;
- if (Basis.CandidateKind == Candidate::GEP) {
- APInt ElementSize(
- IndexOffset.getBitWidth(),
- DL->getTypeAllocSize(
- cast<GetElementPtrInst>(Basis.Ins)->getResultElementType()));
- APInt Q, R;
- APInt::sdivrem(IndexOffset, ElementSize, Q, R);
- if (R == 0)
- IndexOffset = Q;
- else
- BumpWithUglyGEP = true;
- }
-
- // Compute Bump = C - Basis = (i' - i) * S.
- // Common case 1: if (i' - i) is 1, Bump = S.
- if (IndexOffset == 1)
- return C.Stride;
- // Common case 2: if (i' - i) is -1, Bump = -S.
- if (IndexOffset.isAllOnesValue())
- return Builder.CreateNeg(C.Stride);
-
- // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
- // have different bit widths.
- IntegerType *DeltaType =
- IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
- Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
- if (IndexOffset.isPowerOf2()) {
- // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
- ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
- return Builder.CreateShl(ExtendedStride, Exponent);
- }
- if ((-IndexOffset).isPowerOf2()) {
- // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i - i').
- ConstantInt *Exponent =
- ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
- return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
- }
- Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
- return Builder.CreateMul(ExtendedStride, Delta);
-}
-
-void StraightLineStrengthReduce::rewriteCandidateWithBasis(
- const Candidate &C, const Candidate &Basis) {
- assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
- C.Stride == Basis.Stride);
- // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
- // basis of a candidate cannot be unlinked before the candidate.
- assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
-
- // An instruction can correspond to multiple candidates. Therefore, instead of
- // simply deleting an instruction when we rewrite it, we mark its parent as
- // nullptr (i.e. unlink it) so that we can skip the candidates whose
- // instruction is already rewritten.
- if (!C.Ins->getParent())
- return;
-
- IRBuilder<> Builder(C.Ins);
- bool BumpWithUglyGEP;
- Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
- Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
- switch (C.CandidateKind) {
- case Candidate::Add:
- case Candidate::Mul: {
- // C = Basis + Bump
- Value *NegBump;
- if (match(Bump, m_Neg(m_Value(NegBump)))) {
- // If Bump is a neg instruction, emit C = Basis - (-Bump).
- Reduced = Builder.CreateSub(Basis.Ins, NegBump);
- // We only use the negative argument of Bump, and Bump itself may be
- // trivially dead.
- RecursivelyDeleteTriviallyDeadInstructions(Bump);
- } else {
- // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
- // usually unsound, e.g.,
- //
- // X = (-2 +nsw 1) *nsw INT_MAX
- // Y = (-2 +nsw 3) *nsw INT_MAX
- // =>
- // Y = X + 2 * INT_MAX
- //
- // Neither the + nor the * in the resultant expression is nsw.
- Reduced = Builder.CreateAdd(Basis.Ins, Bump);
- }
- break;
- }
- case Candidate::GEP:
- {
- Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
- bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
- if (BumpWithUglyGEP) {
- // C = (char *)Basis + Bump
- unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
- Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
- Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
- if (InBounds)
- Reduced =
- Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
- else
- Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
- Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
- } else {
- // C = gep Basis, Bump
- // Canonicalize bump to pointer size.
- Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
- if (InBounds)
- Reduced = Builder.CreateInBoundsGEP(
- cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
- Basis.Ins, Bump);
- else
- Reduced = Builder.CreateGEP(
- cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
- Basis.Ins, Bump);
- }
- break;
- }
- default:
- llvm_unreachable("C.CandidateKind is invalid");
- };
- Reduced->takeName(C.Ins);
- C.Ins->replaceAllUsesWith(Reduced);
- // Unlink C.Ins so that we can skip other candidates also corresponding to
- // C.Ins. The actual deletion is postponed to the end of runOnFunction.
- C.Ins->removeFromParent();
- UnlinkedInstructions.push_back(C.Ins);
-}
-
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
+static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
+ TargetTransformInfo *TTI) {
+ // Index->getSExtValue() may crash if Index is wider than 64-bit.
+ return Index->getBitWidth() <= 64 &&
+ TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
+ Index->getSExtValue(), UnknownAddressSpace);
+}
+
+bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
+ TargetTransformInfo *TTI,
+ const DataLayout *DL) {
+ if (C.CandidateKind == Candidate::Add)
+ return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
+ if (C.CandidateKind == Candidate::GEP)
+ return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI);
+ return false;
+}
+
+// Returns true if GEP has zero or one non-zero index.
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+ unsigned NumNonZeroIndices = 0;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+ ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
+ if (ConstIdx == nullptr || !ConstIdx->isZero())
+ ++NumNonZeroIndices;
+ }
+ return NumNonZeroIndices <= 1;
+}
+
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+ if (C.CandidateKind == Candidate::Add) {
+ // B + 1 * S or B + (-1) * S
+ return C.Index->isOne() || C.Index->isMinusOne();
+ }
+ if (C.CandidateKind == Candidate::Mul) {
+ // (B + 0) * S
+ return C.Index->isZero();
+ }
+ if (C.CandidateKind == Candidate::GEP) {
+ // (char*)B + S or (char*)B - S
+ return ((C.Index->isOne() || C.Index->isMinusOne()) &&
+ hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
+ }
+ return false;
+}
+
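A quick standalone check (plain C++; the loop bounds are arbitrary) of why the simplest forms above are left alone: rewriting Y = B + S against X = B + 8 * S would only replace one addition with a multiply and a subtraction.

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t B = -8; B <= 8; ++B) {
    for (int64_t S = -8; S <= 8; ++S) {
      int64_t X = B + 8 * S; // a candidate that may be rewritten later
      int64_t Y = B + S;     // simplest Add form: Index == 1
      // The identity holds, but Y already costs a single addition, so the
      // rewrite Y = X - 7 * S would be a pessimization.
      assert(Y == X - 7 * S);
    }
  }
  return 0;
}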
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table look up.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I) {
+ Candidate C(CT, B, Idx, S, I);
+ // SLSR can complicate an instruction in two cases:
+ //
+ // 1. If we can fold I into an addressing mode, computing I is likely free or
+ // takes only one instruction.
+ //
+ // 2. I is already in a simplest form. For example, when
+ // X = B + 8 * S
+ // Y = B + S,
+ // rewriting Y to X - 7 * S is probably a bad idea.
+ //
+ // In the above cases, we still add I to the candidate list so that I can be
+ // the basis of other candidates, but we leave I's basis blank so that I
+ // won't be rewritten.
+ if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
+ // Try to compute the immediate basis of C.
+ unsigned NumIterations = 0;
+ // Limit the scan radius to avoid running in quadratic time.
+ static const unsigned MaxNumIterations = 50;
+ for (auto Basis = Candidates.rbegin();
+ Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+ ++Basis, ++NumIterations) {
+ if (isBasisFor(*Basis, C)) {
+ C.Basis = &(*Basis);
+ break;
+ }
+ }
+ }
+ // Regardless of whether we find a basis for C, we need to push C to the
+ // candidate list so that it can be the basis of other candidates.
+ Candidates.push_back(C);
+}
+
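A sketch of the hash-keyed lookup that the TODO above suggests, using std::unordered_map with a string key purely for illustration; the real pass would key on the base SCEV, the stride and the candidate kind, scope the table to the dominator tree, and still verify dominance as isBasisFor does.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyCandidate {
  std::string Base;
  std::string Stride;
  int Kind;
  int64_t Index;
  const ToyCandidate *Basis = nullptr;
};

// A basis must share base, stride and kind with the candidate, so those three
// fields form the lookup key; only the index differs.
static std::string keyOf(const ToyCandidate &C) {
  return C.Base + '|' + C.Stride + '|' + std::to_string(C.Kind);
}

// Instead of scanning up to MaxNumIterations previous candidates, remember the
// most recent candidate per key and use it as the immediate basis (dominance
// checks are omitted in this sketch).
static void findBases(std::vector<ToyCandidate> &Candidates) {
  std::unordered_map<std::string, const ToyCandidate *> LastWithKey;
  for (ToyCandidate &C : Candidates) {
    auto It = LastWithKey.find(keyOf(C));
    if (It != LastWithKey.end())
      C.Basis = It->second;
    LastWithKey[keyOf(C)] = &C;
  }
}

int main() {
  std::vector<ToyCandidate> Candidates = {
      {"b", "s", /*Kind=*/0, /*Index=*/4},
      {"b", "s", /*Kind=*/0, /*Index=*/5},
  };
  findBases(Candidates);
  return Candidates[1].Basis == &Candidates[0] ? 0 : 1;
}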
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ allocateCandidatesAndFindBasisForAdd(I);
+ break;
+ case Instruction::Mul:
+ allocateCandidatesAndFindBasisForMul(I);
+ break;
+ case Instruction::GetElementPtr:
+ allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+ break;
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Instruction *I) {
+ // Try matching B + i * S.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I an add?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForAdd(LHS, RHS, I);
+ if (LHS != RHS)
+ allocateCandidatesAndFindBasisForAdd(RHS, LHS, I);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *S = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + Idx * S
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx)
+ APInt One(Idx->getBitWidth(), 1);
+ Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue());
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else {
+ // At least, I = LHS + 1 * RHS
+ ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS,
+ I);
+ }
+}
+
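The shl branch above records LHS + (S << c) as the candidate <Base: LHS, Index: 1 << c, Stride: S>; a standalone check of the identity it relies on (plain C++, values chosen arbitrarily):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t LHS = 100, S = 7;
  for (int64_t C = 0; C < 16; ++C) {
    int64_t Index = int64_t(1) << C;
    // LHS + (S << C) == LHS + S * (1 << C), so the shift folds into the index.
    assert(LHS + (S << C) == LHS + Index * S);
  }
  return 0;
}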
+// Returns true if A matches B + C where C is constant.
+static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Add(m_ConstantInt(C), m_Value(B))));
+}
+
+// Returns true if A matches B | C where C is constant.
+static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Or(m_ConstantInt(C), m_Value(B))));
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *B = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (matchesAdd(LHS, B, Idx)) {
+ // If LHS is in the form of "Base + Index", then I is in the form of
+ // "(Base + Index) * RHS".
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) {
+ // If LHS is in the form of "Base | Index" and Base and Index have no common
+ // bits set, then
+ // Base | Index = Base + Index
+ // and I is thus in the form of "(Base + Index) * RHS".
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else {
+ // Otherwise, at least try the form (LHS + 0) * RHS.
+ ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
+ I);
+ }
+}
+
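The matchesOr branch above depends on Base | Index being the same as Base + Index when the two operands share no set bits, which is what haveNoCommonBitsSet verifies; a small standalone check of that identity (plain C++; the multiplier 8 and constant 5 are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t X = 0; X < 1024; ++X) {
    uint64_t Base = X * 8;  // low three bits are always clear
    uint64_t Index = 5;     // fits entirely in the low three bits
    assert((Base & Index) == 0);            // no common bits set
    assert((Base | Index) == Base + Index); // so OR and ADD coincide
  }
  return 0;
}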
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Instruction *I) {
+ // Try matching (B + i) * S.
+ // TODO: we could extend SLSR to float and vector types.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I a mul?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForMul(LHS, RHS, I);
+ if (LHS != RHS) {
+ // Symmetrically, try to split RHS to Base + Index.
+ allocateCandidatesAndFindBasisForMul(RHS, LHS, I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+ Instruction *I) {
+ // I = B + sext(Idx *nsw S) * ElementSize
+ // = B + (sext(Idx) * sext(S)) * ElementSize
+ // = B + (sext(Idx) * ElementSize) * sext(S)
+ // Casting to IntegerType is safe because we skipped vector GEPs.
+ IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+ ConstantInt *ScaledIdx = ConstantInt::get(
+ IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+ allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
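The helper above folds the element size into the constant index, so a GEP candidate has the byte-level shape B + (Idx * ElementSize) * S; a standalone pointer-arithmetic check (plain C++; the array and constants are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  int32_t Arr[64] = {};
  const int64_t Idx = 2, S = 3;                // array index is Idx * S
  const int64_t ElementSize = sizeof(int32_t); // 4 bytes per element

  char *Base = reinterpret_cast<char *>(Arr);
  // &Arr[Idx * S] is Base + (Idx * ElementSize) * S bytes, which is the
  // <B, ScaledIdx, S> form that the GEP candidate records.
  char *ByteAddr = Base + (Idx * ElementSize) * S;
  assert(reinterpret_cast<int32_t *>(ByteAddr) == &Arr[Idx * S]);
  return 0;
}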
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+ const SCEV *Base,
+ uint64_t ElementSize,
+ GetElementPtrInst *GEP) {
+ // At least, ArrayIdx = ArrayIdx *nsw 1.
+ allocateCandidatesAndFindBasisForGEP(
+ Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+ ArrayIdx, ElementSize, GEP);
+ Value *LHS = nullptr;
+ ConstantInt *RHS = nullptr;
+ // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+ // itself. This would allow us to handle the shl case for free. However,
+ // matching SCEVs has two issues:
+ //
+ // 1. this would complicate rewriting because the rewriting procedure
+ // would have to translate SCEVs back to IR instructions. This translation
+ // is difficult when LHS is further evaluated to a composite SCEV.
+ //
+ // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+ // to strip nsw/nuw flags which are critical for SLSR to trace into
+ // sext'ed multiplication.
+ if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // SLSR is currently unsafe if i * S may overflow.
+ // GEP = Base + sext(LHS *nsw RHS) * ElementSize
+ allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+ } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
+ // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
+ APInt One(RHS->getBitWidth(), 1);
+ ConstantInt *PowerOf2 =
+ ConstantInt::get(RHS->getContext(), One << RHS->getValue());
+ allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ GetElementPtrInst *GEP) {
+ // TODO: handle vector GEPs
+ if (GEP->getType()->isVectorTy())
+ return;
+
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ IndexExprs.push_back(SE->getSCEV(*I));
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isStruct())
+ continue;
+
+ const SCEV *OrigIndexExpr = IndexExprs[I - 1];
+ IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());
+
+ // The base of this candidate is GEP's base plus the offsets of all
+ // indices except this current one.
+ const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
+ Value *ArrayIdx = GEP->getOperand(I);
+ uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+ if (ArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if ArrayIdx is wider than the pointer size, because
+ // ArrayIdx is implicitly truncated to the pointer size.
+ factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+ // When ArrayIdx is the sext of a value, we try to factor that value as
+ // well. Handling this case is important because array indices are
+ // typically sign-extended to the pointer size.
+ Value *TruncatedArrayIdx = nullptr;
+ if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) &&
+ TruncatedArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if TruncatedArrayIdx is wider than the pointer size,
+ // because TruncatedArrayIdx is implicitly truncated to the pointer size.
+ factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+
+ IndexExprs[I - 1] = OrigIndexExpr;
+ }
+}
+
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+ if (A.getBitWidth() < B.getBitWidth())
+ A = A.sext(B.getBitWidth());
+ else if (A.getBitWidth() > B.getBitWidth())
+ B = B.sext(A.getBitWidth());
+}
+
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+ const Candidate &C,
+ IRBuilder<> &Builder,
+ const DataLayout *DL,
+ bool &BumpWithUglyGEP) {
+ APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+ unifyBitWidth(Idx, BasisIdx);
+ APInt IndexOffset = Idx - BasisIdx;
+
+ BumpWithUglyGEP = false;
+ if (Basis.CandidateKind == Candidate::GEP) {
+ APInt ElementSize(
+ IndexOffset.getBitWidth(),
+ DL->getTypeAllocSize(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType()));
+ APInt Q, R;
+ APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+ if (R == 0)
+ IndexOffset = Q;
+ else
+ BumpWithUglyGEP = true;
+ }
+
+ // Compute Bump = C - Basis = (i' - i) * S.
+ // Common case 1: if (i' - i) is 1, Bump = S.
+ if (IndexOffset == 1)
+ return C.Stride;
+ // Common case 2: if (i' - i) is -1, Bump = -S.
+ if (IndexOffset.isAllOnesValue())
+ return Builder.CreateNeg(C.Stride);
+
+ // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+ // have different bit widths.
+ IntegerType *DeltaType =
+ IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+ Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+ if (IndexOffset.isPowerOf2()) {
+ // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+ ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+ return Builder.CreateShl(ExtendedStride, Exponent);
+ }
+ if ((-IndexOffset).isPowerOf2()) {
+ // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i - i').
+ ConstantInt *Exponent =
+ ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
+ return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+ }
+ Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+ return Builder.CreateMul(ExtendedStride, Delta);
+}
+
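A standalone sketch of emitBump's decision ladder on plain 64-bit integers (toyEmitBump is invented here; it covers non-negative strides only, and the negative power-of-two case is handled analogously in the real code):

#include <cassert>
#include <cstdint>

// Bump = (i' - i) * S, with the cheap special cases picked off first.
static int64_t toyEmitBump(int64_t Idx, int64_t BasisIdx, int64_t S) {
  int64_t IndexOffset = Idx - BasisIdx;
  if (IndexOffset == 1)
    return S;  // common case 1: Bump = S
  if (IndexOffset == -1)
    return -S; // common case 2: Bump = -S
  if (IndexOffset > 0 && (IndexOffset & (IndexOffset - 1)) == 0) {
    int Log = 0;
    while ((int64_t(1) << Log) != IndexOffset)
      ++Log;
    return S << Log; // power-of-two delta: a single shift of the stride
  }
  return IndexOffset * S; // general case: one multiply
}

int main() {
  for (int64_t S = 0; S <= 8; ++S) {
    assert(toyEmitBump(9, 8, S) == S);
    assert(toyEmitBump(8, 9, S) == -S);
    assert(toyEmitBump(12, 4, S) == 8 * S);
    assert(toyEmitBump(10, 3, S) == 7 * S);
  }
  return 0;
}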
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+ const Candidate &C, const Candidate &Basis) {
+ assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+ C.Stride == Basis.Stride);
+ // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
+ // basis of a candidate cannot be unlinked before the candidate.
+ assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+ // An instruction can correspond to multiple candidates. Therefore, instead of
+ // simply deleting an instruction when we rewrite it, we mark its parent as
+ // nullptr (i.e. unlink it) so that we can skip the candidates whose
+ // instruction is already rewritten.
+ if (!C.Ins->getParent())
+ return;
+
+ IRBuilder<> Builder(C.Ins);
+ bool BumpWithUglyGEP;
+ Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+ Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+ switch (C.CandidateKind) {
+ case Candidate::Add:
+ case Candidate::Mul: {
+ // C = Basis + Bump
+ Value *NegBump;
+ if (match(Bump, m_Neg(m_Value(NegBump)))) {
+ // If Bump is a neg instruction, emit C = Basis - (-Bump).
+ Reduced = Builder.CreateSub(Basis.Ins, NegBump);
+ // We only use the negative argument of Bump, and Bump itself may be
+ // trivially dead.
+ RecursivelyDeleteTriviallyDeadInstructions(Bump);
+ } else {
+ // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+ // usually unsound, e.g.,
+ //
+ // X = (-2 +nsw 1) *nsw INT_MAX
+ // Y = (-2 +nsw 3) *nsw INT_MAX
+ // =>
+ // Y = X + 2 * INT_MAX
+ //
+ // Neither the + nor the * in the resultant expression is nsw.
+ Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+ }
+ break;
+ }
+ case Candidate::GEP:
+ {
+ Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+ bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+ if (BumpWithUglyGEP) {
+ // C = (char *)Basis + Bump
+ unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+ Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+ Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+ if (InBounds)
+ Reduced =
+ Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
+ else
+ Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
+ Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+ } else {
+ // C = gep Basis, Bump
+ // Canonicalize bump to pointer size.
+ Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+ if (InBounds)
+ Reduced = Builder.CreateInBoundsGEP(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
+ Basis.Ins, Bump);
+ else
+ Reduced = Builder.CreateGEP(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
+ Basis.Ins, Bump);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("C.CandidateKind is invalid");
+ };
+ Reduced->takeName(C.Ins);
+ C.Ins->replaceAllUsesWith(Reduced);
+ // Unlink C.Ins so that we can skip other candidates also corresponding to
+ // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+ C.Ins->removeFromParent();
+ UnlinkedInstructions.push_back(C.Ins);
+}
+
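What the rewrite above amounts to for the Add and Mul kinds, shown on plain integers (values arbitrary): the candidate becomes Basis + Bump, or Basis - (-Bump) when the bump is a negation.

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t B = -4; B <= 4; ++B) {
    for (int64_t S = -4; S <= 4; ++S) {
      int64_t X = B + 1 * S; // basis:     <Base: B, Index: 1, Stride: S>
      int64_t Y = B + 3 * S; // candidate: <Base: B, Index: 3, Stride: S>
      // emitBump yields (3 - 1) * S, so Y is re-expressed as X plus the bump.
      assert(Y == X + 2 * S);
      // With the roles swapped the bump is negative, and the rewrite emits a
      // subtraction of the negated bump instead: X == Y - 2 * S.
      assert(X == Y - 2 * S);
    }
  }
  return 0;
}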
bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
+ if (skipFunction(F))
+ return false;
+
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -726,35 +726,35 @@ bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
}
bool StraightLineStrengthReduce::runOnFunction(Function &F) {
- // Traverse the dominator tree in the depth-first order. This order makes sure
- // all bases of a candidate are in Candidates when we process it.
- for (const auto Node : depth_first(DT))
- for (auto &I : *(Node->getBlock()))
- allocateCandidatesAndFindBasis(&I);
-
- // Rewrite candidates in the reverse depth-first order. This order makes sure
- // a candidate being rewritten is not a basis for any other candidate.
- while (!Candidates.empty()) {
- const Candidate &C = Candidates.back();
- if (C.Basis != nullptr) {
- rewriteCandidateWithBasis(C, *C.Basis);
- }
- Candidates.pop_back();
- }
-
- // Delete all unlinked instructions.
- for (auto *UnlinkedInst : UnlinkedInstructions) {
- for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
- Value *Op = UnlinkedInst->getOperand(I);
- UnlinkedInst->setOperand(I, nullptr);
- RecursivelyDeleteTriviallyDeadInstructions(Op);
- }
- UnlinkedInst->deleteValue();
- }
- bool Ret = !UnlinkedInstructions.empty();
- UnlinkedInstructions.clear();
- return Ret;
-}
+ // Traverse the dominator tree in the depth-first order. This order makes sure
+ // all bases of a candidate are in Candidates when we process it.
+ for (const auto Node : depth_first(DT))
+ for (auto &I : *(Node->getBlock()))
+ allocateCandidatesAndFindBasis(&I);
+
+ // Rewrite candidates in the reverse depth-first order. This order makes sure
+ // a candidate being rewritten is not a basis for any other candidate.
+ while (!Candidates.empty()) {
+ const Candidate &C = Candidates.back();
+ if (C.Basis != nullptr) {
+ rewriteCandidateWithBasis(C, *C.Basis);
+ }
+ Candidates.pop_back();
+ }
+
+ // Delete all unlinked instructions.
+ for (auto *UnlinkedInst : UnlinkedInstructions) {
+ for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
+ Value *Op = UnlinkedInst->getOperand(I);
+ UnlinkedInst->setOperand(I, nullptr);
+ RecursivelyDeleteTriviallyDeadInstructions(Op);
+ }
+ UnlinkedInst->deleteValue();
+ }
+ bool Ret = !UnlinkedInstructions.empty();
+ UnlinkedInstructions.clear();
+ return Ret;
+}
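Taken together, the driver above performs the source-level transformation sketched below (applied by hand, plain C++): later values in a run of B + i * S expressions are rebased on earlier ones, so each costs one addition instead of a multiply plus an addition.

#include <cassert>
#include <cstdint>

// Before SLSR: three independent multiply-and-add computations.
static int64_t before(int64_t B, int64_t S) {
  int64_t X = B + 4 * S;
  int64_t Y = B + 5 * S;
  int64_t Z = B + 6 * S;
  return X ^ Y ^ Z;
}

// After SLSR (hand-applied for illustration): Y and Z are rewritten against
// their immediate bases, leaving a single multiply.
static int64_t after(int64_t B, int64_t S) {
  int64_t X = B + 4 * S;
  int64_t Y = X + S; // basis X, bump 1 * S
  int64_t Z = Y + S; // basis Y, bump 1 * S
  return X ^ Y ^ Z;
}

int main() {
  for (int64_t B = -8; B <= 8; ++B)
    for (int64_t S = -8; S <= 8; ++S)
      assert(before(B, S) == after(B, S));
  return 0;
}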
namespace llvm {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5fd33b57e3..3e15cad5f3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -1,315 +1,315 @@
-//===- StructurizeCFG.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
+//===- StructurizeCFG.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/RegionInfo.h"
-#include "llvm/Analysis/RegionIterator.h"
-#include "llvm/Analysis/RegionPass.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include <algorithm>
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "structurizecfg"
-
-// The name for newly created blocks.
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "structurizecfg"
+
+// The name for newly created blocks.
const char FlowBlockName[] = "Flow";
-
-namespace {
-
-static cl::opt<bool> ForceSkipUniformRegions(
- "structurizecfg-skip-uniform-regions",
- cl::Hidden,
- cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
- cl::init(false));
-
-static cl::opt<bool>
- RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden,
- cl::desc("Allow relaxed uniform region checks"),
- cl::init(true));
-
-// Definition of the complex types used in this pass.
-
-using BBValuePair = std::pair<BasicBlock *, Value *>;
-
-using RNVector = SmallVector<RegionNode *, 8>;
-using BBVector = SmallVector<BasicBlock *, 8>;
-using BranchVector = SmallVector<BranchInst *, 8>;
-using BBValueVector = SmallVector<BBValuePair, 2>;
-
-using BBSet = SmallPtrSet<BasicBlock *, 8>;
-
-using PhiMap = MapVector<PHINode *, BBValueVector>;
-using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
-
-using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
-using BBPredicates = DenseMap<BasicBlock *, Value *>;
-using PredMap = DenseMap<BasicBlock *, BBPredicates>;
-using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
-
-// A traits type that is intended to be used in graph algorithms. The graph
-// traits starts at an entry node, and traverses the RegionNodes that are in
-// the Nodes set.
-struct SubGraphTraits {
- using NodeRef = std::pair<RegionNode *, SmallDenseSet<RegionNode *> *>;
- using BaseSuccIterator = GraphTraits<RegionNode *>::ChildIteratorType;
-
- // This wraps a set of Nodes into the iterator, so we know which edges to
- // filter out.
- class WrappedSuccIterator
- : public iterator_adaptor_base<
- WrappedSuccIterator, BaseSuccIterator,
- typename std::iterator_traits<BaseSuccIterator>::iterator_category,
- NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
- SmallDenseSet<RegionNode *> *Nodes;
-
- public:
- WrappedSuccIterator(BaseSuccIterator It, SmallDenseSet<RegionNode *> *Nodes)
- : iterator_adaptor_base(It), Nodes(Nodes) {}
-
- NodeRef operator*() const { return {*I, Nodes}; }
- };
-
- static bool filterAll(const NodeRef &N) { return true; }
- static bool filterSet(const NodeRef &N) { return N.second->count(N.first); }
-
- using ChildIteratorType =
- filter_iterator<WrappedSuccIterator, bool (*)(const NodeRef &)>;
-
- static NodeRef getEntryNode(Region *R) {
- return {GraphTraits<Region *>::getEntryNode(R), nullptr};
- }
-
- static NodeRef getEntryNode(NodeRef N) { return N; }
-
- static iterator_range<ChildIteratorType> children(const NodeRef &N) {
- auto *filter = N.second ? &filterSet : &filterAll;
- return make_filter_range(
- make_range<WrappedSuccIterator>(
- {GraphTraits<RegionNode *>::child_begin(N.first), N.second},
- {GraphTraits<RegionNode *>::child_end(N.first), N.second}),
- filter);
- }
-
- static ChildIteratorType child_begin(const NodeRef &N) {
- return children(N).begin();
- }
-
- static ChildIteratorType child_end(const NodeRef &N) {
- return children(N).end();
- }
-};
-
-/// Finds the nearest common dominator of a set of BasicBlocks.
-///
-/// For every BB you add to the set, you can specify whether we "remember" the
-/// block. When you get the common dominator, you can also ask whether it's one
-/// of the blocks we remembered.
-class NearestCommonDominator {
- DominatorTree *DT;
- BasicBlock *Result = nullptr;
- bool ResultIsRemembered = false;
-
- /// Add BB to the resulting dominator.
- void addBlock(BasicBlock *BB, bool Remember) {
- if (!Result) {
- Result = BB;
- ResultIsRemembered = Remember;
- return;
- }
-
- BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
- if (NewResult != Result)
- ResultIsRemembered = false;
- if (NewResult == BB)
- ResultIsRemembered |= Remember;
- Result = NewResult;
- }
-
-public:
- explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}
-
- void addBlock(BasicBlock *BB) {
- addBlock(BB, /* Remember = */ false);
- }
-
- void addAndRememberBlock(BasicBlock *BB) {
- addBlock(BB, /* Remember = */ true);
- }
-
- /// Get the nearest common dominator of all the BBs added via addBlock() and
- /// addAndRememberBlock().
- BasicBlock *result() { return Result; }
-
- /// Is the BB returned by getResult() one of the blocks we added to the set
- /// with addAndRememberBlock()?
- bool resultIsRememberedBlock() { return ResultIsRemembered; }
-};
-
-/// Transforms the control flow graph on one single entry/exit region
-/// at a time.
-///
-/// After the transform all "If"/"Then"/"Else" style control flow looks like
-/// this:
-///
-/// \verbatim
-/// 1
-/// ||
-/// | |
-/// 2 |
-/// | /
-/// |/
-/// 3
-/// || Where:
-/// | | 1 = "If" block, calculates the condition
-/// 4 | 2 = "Then" subregion, runs if the condition is true
-/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
-/// |/ 4 = "Else" optional subregion, runs if the condition is false
-/// 5 5 = "End" block, also rejoins the control flow
-/// \endverbatim
-///
-/// Control flow is expressed as a branch where the true exit goes into the
-/// "Then"/"Else" region, while the false exit skips the region
-/// The condition for the optional "Else" region is expressed as a PHI node.
-/// The incoming values of the PHI node are true for the "If" edge and false
-/// for the "Then" edge.
-///
-/// Additionally to that even complicated loops look like this:
-///
-/// \verbatim
-/// 1
-/// ||
-/// | |
-/// 2 ^ Where:
-/// | / 1 = "Entry" block
-/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
-/// 3 3 = "Flow" block, with back edge to entry block
-/// |
-/// \endverbatim
-///
-/// The back edge of the "Flow" block is always on the false side of the branch
-/// while the true side continues the general flow. So the loop condition
-/// consist of a network of PHI nodes where the true incoming values expresses
-/// breaks and the false values expresses continue states.
-
+
+namespace {
+
+static cl::opt<bool> ForceSkipUniformRegions(
+ "structurizecfg-skip-uniform-regions",
+ cl::Hidden,
+ cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
+ cl::init(false));
+
+static cl::opt<bool>
+ RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden,
+ cl::desc("Allow relaxed uniform region checks"),
+ cl::init(true));
+
+// Definition of the complex types used in this pass.
+
+using BBValuePair = std::pair<BasicBlock *, Value *>;
+
+using RNVector = SmallVector<RegionNode *, 8>;
+using BBVector = SmallVector<BasicBlock *, 8>;
+using BranchVector = SmallVector<BranchInst *, 8>;
+using BBValueVector = SmallVector<BBValuePair, 2>;
+
+using BBSet = SmallPtrSet<BasicBlock *, 8>;
+
+using PhiMap = MapVector<PHINode *, BBValueVector>;
+using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
+
+using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
+using BBPredicates = DenseMap<BasicBlock *, Value *>;
+using PredMap = DenseMap<BasicBlock *, BBPredicates>;
+using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
+
+// A traits type that is intended to be used in graph algorithms. The
+// traversal starts at an entry node and visits only the RegionNodes that are
+// in the Nodes set.
+struct SubGraphTraits {
+ using NodeRef = std::pair<RegionNode *, SmallDenseSet<RegionNode *> *>;
+ using BaseSuccIterator = GraphTraits<RegionNode *>::ChildIteratorType;
+
+ // This wraps a set of Nodes into the iterator, so we know which edges to
+ // filter out.
+ class WrappedSuccIterator
+ : public iterator_adaptor_base<
+ WrappedSuccIterator, BaseSuccIterator,
+ typename std::iterator_traits<BaseSuccIterator>::iterator_category,
+ NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
+ SmallDenseSet<RegionNode *> *Nodes;
+
+ public:
+ WrappedSuccIterator(BaseSuccIterator It, SmallDenseSet<RegionNode *> *Nodes)
+ : iterator_adaptor_base(It), Nodes(Nodes) {}
+
+ NodeRef operator*() const { return {*I, Nodes}; }
+ };
+
+ static bool filterAll(const NodeRef &N) { return true; }
+ static bool filterSet(const NodeRef &N) { return N.second->count(N.first); }
+
+ using ChildIteratorType =
+ filter_iterator<WrappedSuccIterator, bool (*)(const NodeRef &)>;
+
+ static NodeRef getEntryNode(Region *R) {
+ return {GraphTraits<Region *>::getEntryNode(R), nullptr};
+ }
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static iterator_range<ChildIteratorType> children(const NodeRef &N) {
+ auto *filter = N.second ? &filterSet : &filterAll;
+ return make_filter_range(
+ make_range<WrappedSuccIterator>(
+ {GraphTraits<RegionNode *>::child_begin(N.first), N.second},
+ {GraphTraits<RegionNode *>::child_end(N.first), N.second}),
+ filter);
+ }
+
+ static ChildIteratorType child_begin(const NodeRef &N) {
+ return children(N).begin();
+ }
+
+ static ChildIteratorType child_end(const NodeRef &N) {
+ return children(N).end();
+ }
+};
+
+/// Finds the nearest common dominator of a set of BasicBlocks.
+///
+/// For every BB you add to the set, you can specify whether we "remember" the
+/// block. When you get the common dominator, you can also ask whether it's one
+/// of the blocks we remembered.
+class NearestCommonDominator {
+ DominatorTree *DT;
+ BasicBlock *Result = nullptr;
+ bool ResultIsRemembered = false;
+
+ /// Add BB to the resulting dominator.
+ void addBlock(BasicBlock *BB, bool Remember) {
+ if (!Result) {
+ Result = BB;
+ ResultIsRemembered = Remember;
+ return;
+ }
+
+ BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
+ if (NewResult != Result)
+ ResultIsRemembered = false;
+ if (NewResult == BB)
+ ResultIsRemembered |= Remember;
+ Result = NewResult;
+ }
+
+public:
+ explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}
+
+ void addBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ false);
+ }
+
+ void addAndRememberBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ true);
+ }
+
+ /// Get the nearest common dominator of all the BBs added via addBlock() and
+ /// addAndRememberBlock().
+ BasicBlock *result() { return Result; }
+
+  /// Is the BB returned by result() one of the blocks we added to the set
+ /// with addAndRememberBlock()?
+ bool resultIsRememberedBlock() { return ResultIsRemembered; }
+};
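A minimal usage sketch of this helper (hypothetical code, shown only to illustrate the add/remember protocol that insertConditions() and setPhiValues() below rely on; it assumes the snippet sits in this same file so the anonymous-namespace class and LLVM's DominatorTree/ArrayRef types are visible):

// Find a block that dominates the branch's parent and every predicate block,
// and report whether the result is itself one of the predicate blocks.
static BasicBlock *findPredicateDominator(DominatorTree &DT, BasicBlock *Parent,
                                          ArrayRef<BasicBlock *> PredBlocks,
                                          bool &DominatorIsAPred) {
  NearestCommonDominator Dominator(&DT);
  Dominator.addBlock(Parent);               // seed, but do not "remember" it
  for (BasicBlock *BB : PredBlocks)
    Dominator.addAndRememberBlock(BB);      // remember real predicate blocks
  DominatorIsAPred = Dominator.resultIsRememberedBlock();
  return Dominator.result();
}

When DominatorIsAPred comes back false, callers such as insertConditions() still have to seed a default value at the returned block before asking SSAUpdater for a value.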
+
+/// Transforms the control flow graph on one single entry/exit region
+/// at a time.
+///
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
+/// this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 |
+/// | /
+/// |/
+/// 3
+/// || Where:
+/// | | 1 = "If" block, calculates the condition
+/// 4 | 2 = "Then" subregion, runs if the condition is true
+/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
+/// |/ 4 = "Else" optional subregion, runs if the condition is false
+/// 5 5 = "End" block, also rejoins the control flow
+/// \endverbatim
+///
+/// Control flow is expressed as a branch where the true exit goes into the
+/// "Then"/"Else" region, while the false exit skips the region
+/// The condition for the optional "Else" region is expressed as a PHI node.
+/// The incoming values of the PHI node are true for the "If" edge and false
+/// for the "Then" edge.
+///
+/// In addition, even complicated loops look like this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 ^ Where:
+/// | / 1 = "Entry" block
+/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
+/// 3 3 = "Flow" block, with back edge to entry block
+/// |
+/// \endverbatim
+///
+/// The back edge of the "Flow" block is always on the false side of the branch
+/// while the true side continues the general flow. So the loop condition
+/// consists of a network of PHI nodes where the true incoming values express
+/// breaks and the false values express continue states.
+
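As a rough source-level analogy (illustrative only; the pass rewrites LLVM IR, and the helper names below are made up), the "If"/"Then"/"Else" shape above corresponds to replacing one two-sided branch with two single-sided ones driven by a single predicate:

void then_part();
void else_part();
void end_part();

// Before structurization: one block ends in a two-way divergent branch.
void before(bool c) {
  if (c)
    then_part();
  else
    else_part();
  end_part();
}

// After structurization: each conditional branch either enters a subregion or
// skips it, and the "Flow" join decides whether the "Else" part still runs.
void after(bool c) {
  bool run_else = true;   // the PHI value on the edge straight from "If"
  if (c) {                // "If" block: true edge enters the "Then" subregion
    then_part();
    run_else = false;     // the PHI value on the edge coming out of "Then"
  }
  if (run_else)           // "Flow" block: true edge enters the "Else" subregion
    else_part();
  end_part();             // "End" block rejoins the control flow
}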
class StructurizeCFG {
- Type *Boolean;
- ConstantInt *BoolTrue;
- ConstantInt *BoolFalse;
- UndefValue *BoolUndef;
-
- Function *Func;
- Region *ParentRegion;
-
+ Type *Boolean;
+ ConstantInt *BoolTrue;
+ ConstantInt *BoolFalse;
+ UndefValue *BoolUndef;
+
+ Function *Func;
+ Region *ParentRegion;
+
LegacyDivergenceAnalysis *DA = nullptr;
- DominatorTree *DT;
-
- SmallVector<RegionNode *, 8> Order;
- BBSet Visited;
-
- SmallVector<WeakVH, 8> AffectedPhis;
- BBPhiMap DeletedPhis;
- BB2BBVecMap AddedPhis;
-
- PredMap Predicates;
- BranchVector Conditions;
-
- BB2BBMap Loops;
- PredMap LoopPreds;
- BranchVector LoopConds;
-
- RegionNode *PrevNode;
-
- void orderNodes();
-
- void analyzeLoops(RegionNode *N);
-
- Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
-
- void gatherPredicates(RegionNode *N);
-
- void collectInfos();
-
- void insertConditions(bool Loops);
-
- void delPhiValues(BasicBlock *From, BasicBlock *To);
-
- void addPhiValues(BasicBlock *From, BasicBlock *To);
-
- void setPhiValues();
-
- void simplifyAffectedPhis();
-
- void killTerminator(BasicBlock *BB);
-
- void changeExit(RegionNode *Node, BasicBlock *NewExit,
- bool IncludeDominator);
-
- BasicBlock *getNextFlow(BasicBlock *Dominator);
-
- BasicBlock *needPrefix(bool NeedEmpty);
-
- BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
-
- void setPrevNode(BasicBlock *BB);
-
- bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
-
- bool isPredictableTrue(RegionNode *Node);
-
- void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
-
- void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
-
- void createFlow();
-
- void rebuildSSA();
-
-public:
+ DominatorTree *DT;
+
+ SmallVector<RegionNode *, 8> Order;
+ BBSet Visited;
+
+ SmallVector<WeakVH, 8> AffectedPhis;
+ BBPhiMap DeletedPhis;
+ BB2BBVecMap AddedPhis;
+
+ PredMap Predicates;
+ BranchVector Conditions;
+
+ BB2BBMap Loops;
+ PredMap LoopPreds;
+ BranchVector LoopConds;
+
+ RegionNode *PrevNode;
+
+ void orderNodes();
+
+ void analyzeLoops(RegionNode *N);
+
+ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+
+ void gatherPredicates(RegionNode *N);
+
+ void collectInfos();
+
+ void insertConditions(bool Loops);
+
+ void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void setPhiValues();
+
+ void simplifyAffectedPhis();
+
+ void killTerminator(BasicBlock *BB);
+
+ void changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator);
+
+ BasicBlock *getNextFlow(BasicBlock *Dominator);
+
+ BasicBlock *needPrefix(bool NeedEmpty);
+
+ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
+
+ void setPrevNode(BasicBlock *BB);
+
+ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
+
+ bool isPredictableTrue(RegionNode *Node);
+
+ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void createFlow();
+
+ void rebuildSSA();
+
+public:
void init(Region *R);
bool run(Region *R, DominatorTree *DT);
bool makeUniformRegion(Region *R, LegacyDivergenceAnalysis *DA);
@@ -319,15 +319,15 @@ class StructurizeCFGLegacyPass : public RegionPass {
bool SkipUniformRegions;
public:
- static char ID;
-
+ static char ID;
+
explicit StructurizeCFGLegacyPass(bool SkipUniformRegions_ = false)
: RegionPass(ID), SkipUniformRegions(SkipUniformRegions_) {
- if (ForceSkipUniformRegions.getNumOccurrences())
- SkipUniformRegions = ForceSkipUniformRegions.getValue();
+ if (ForceSkipUniformRegions.getNumOccurrences())
+ SkipUniformRegions = ForceSkipUniformRegions.getValue();
initializeStructurizeCFGLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
+ }
+
bool runOnRegion(Region *R, RGPassManager &RGM) override {
StructurizeCFG SCFG;
SCFG.init(R);
@@ -339,675 +339,675 @@ public:
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
return SCFG.run(R, DT);
}
-
- StringRef getPassName() const override { return "Structurize control flow"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (SkipUniformRegions)
- AU.addRequired<LegacyDivergenceAnalysis>();
- AU.addRequiredID(LowerSwitchID);
- AU.addRequired<DominatorTreeWrapperPass>();
-
- AU.addPreserved<DominatorTreeWrapperPass>();
- RegionPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
+
+ StringRef getPassName() const override { return "Structurize control flow"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (SkipUniformRegions)
+ AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ RegionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
char StructurizeCFGLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)
-
-/// Build up the general order of nodes by performing a topological sort of the
-/// parent region's nodes, while ensuring that there is no outer cycle node
-/// between any two inner cycle nodes.
-void StructurizeCFG::orderNodes() {
- Order.resize(std::distance(GraphTraits<Region *>::nodes_begin(ParentRegion),
- GraphTraits<Region *>::nodes_end(ParentRegion)));
- if (Order.empty())
- return;
-
- SmallDenseSet<RegionNode *> Nodes;
- auto EntryNode = SubGraphTraits::getEntryNode(ParentRegion);
-
- // A list of range indices of SCCs in Order, to be processed.
- SmallVector<std::pair<unsigned, unsigned>, 8> WorkList;
- unsigned I = 0, E = Order.size();
- while (true) {
- // Run through all the SCCs in the subgraph starting with Entry.
- for (auto SCCI =
- scc_iterator<SubGraphTraits::NodeRef, SubGraphTraits>::begin(
- EntryNode);
- !SCCI.isAtEnd(); ++SCCI) {
- auto &SCC = *SCCI;
-
-      // An SCC of size up to 2 can be reduced to an entry (the last node) and
-      // possibly one additional node. Therefore it is already in order, and
- // there is no need to add it to the work-list.
- unsigned Size = SCC.size();
- if (Size > 2)
- WorkList.emplace_back(I, I + Size);
-
- // Add the SCC nodes to the Order array.
- for (auto &N : SCC) {
- assert(I < E && "SCC size mismatch!");
- Order[I++] = N.first;
- }
- }
- assert(I == E && "SCC size mismatch!");
-
- // If there are no more SCCs to order, then we are done.
- if (WorkList.empty())
- break;
-
- std::tie(I, E) = WorkList.pop_back_val();
-
- // Collect the set of nodes in the SCC's subgraph. These are only the
- // possible child nodes; we do not add the entry (last node) otherwise we
- // will have the same exact SCC all over again.
- Nodes.clear();
- Nodes.insert(Order.begin() + I, Order.begin() + E - 1);
-
- // Update the entry node.
- EntryNode.first = Order[E - 1];
- EntryNode.second = &Nodes;
- }
-}
-
-/// Determine the end of the loops
-void StructurizeCFG::analyzeLoops(RegionNode *N) {
- if (N->isSubRegion()) {
- // Test for exit as back edge
- BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
- if (Visited.count(Exit))
- Loops[Exit] = N->getEntry();
-
- } else {
- // Test for successors as back edge
- BasicBlock *BB = N->getNodeAs<BasicBlock>();
- BranchInst *Term = cast<BranchInst>(BB->getTerminator());
-
- for (BasicBlock *Succ : Term->successors())
- if (Visited.count(Succ))
- Loops[Succ] = BB;
- }
-}
-
-/// Build the condition for one edge
-Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
- bool Invert) {
- Value *Cond = Invert ? BoolFalse : BoolTrue;
- if (Term->isConditional()) {
- Cond = Term->getCondition();
-
- if (Idx != (unsigned)Invert)
- Cond = invertCondition(Cond);
- }
- return Cond;
-}
-
-/// Analyze the predecessors of each block and build up predicates
-void StructurizeCFG::gatherPredicates(RegionNode *N) {
- RegionInfo *RI = ParentRegion->getRegionInfo();
- BasicBlock *BB = N->getEntry();
- BBPredicates &Pred = Predicates[BB];
- BBPredicates &LPred = LoopPreds[BB];
-
- for (BasicBlock *P : predecessors(BB)) {
- // Ignore it if it's a branch from outside into our region entry
- if (!ParentRegion->contains(P))
- continue;
-
- Region *R = RI->getRegionFor(P);
- if (R == ParentRegion) {
- // It's a top level block in our region
- BranchInst *Term = cast<BranchInst>(P->getTerminator());
- for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
- BasicBlock *Succ = Term->getSuccessor(i);
- if (Succ != BB)
- continue;
-
- if (Visited.count(P)) {
- // Normal forward edge
- if (Term->isConditional()) {
- // Try to treat it like an ELSE block
- BasicBlock *Other = Term->getSuccessor(!i);
- if (Visited.count(Other) && !Loops.count(Other) &&
- !Pred.count(Other) && !Pred.count(P)) {
-
- Pred[Other] = BoolFalse;
- Pred[P] = BoolTrue;
- continue;
- }
- }
- Pred[P] = buildCondition(Term, i, false);
- } else {
- // Back edge
- LPred[P] = buildCondition(Term, i, true);
- }
- }
- } else {
- // It's an exit from a sub region
- while (R->getParent() != ParentRegion)
- R = R->getParent();
-
- // Edge from inside a subregion to its entry, ignore it
- if (*R == *N)
- continue;
-
- BasicBlock *Entry = R->getEntry();
- if (Visited.count(Entry))
- Pred[Entry] = BoolTrue;
- else
- LPred[Entry] = BoolFalse;
- }
- }
-}
-
-/// Collect various loop and predicate infos
-void StructurizeCFG::collectInfos() {
- // Reset predicate
- Predicates.clear();
-
- // and loop infos
- Loops.clear();
- LoopPreds.clear();
-
- // Reset the visited nodes
- Visited.clear();
-
- for (RegionNode *RN : reverse(Order)) {
- LLVM_DEBUG(dbgs() << "Visiting: "
- << (RN->isSubRegion() ? "SubRegion with entry: " : "")
- << RN->getEntry()->getName() << "\n");
-
- // Analyze all the conditions leading to a node
- gatherPredicates(RN);
-
- // Remember that we've seen this node
- Visited.insert(RN->getEntry());
-
- // Find the last back edges
- analyzeLoops(RN);
- }
-}
-
-/// Insert the missing branch conditions
-void StructurizeCFG::insertConditions(bool Loops) {
- BranchVector &Conds = Loops ? LoopConds : Conditions;
- Value *Default = Loops ? BoolTrue : BoolFalse;
- SSAUpdater PhiInserter;
-
- for (BranchInst *Term : Conds) {
- assert(Term->isConditional());
-
- BasicBlock *Parent = Term->getParent();
- BasicBlock *SuccTrue = Term->getSuccessor(0);
- BasicBlock *SuccFalse = Term->getSuccessor(1);
-
- PhiInserter.Initialize(Boolean, "");
- PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
- PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
-
- BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
-
- NearestCommonDominator Dominator(DT);
- Dominator.addBlock(Parent);
-
- Value *ParentValue = nullptr;
- for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
- BasicBlock *BB = BBAndPred.first;
- Value *Pred = BBAndPred.second;
-
- if (BB == Parent) {
- ParentValue = Pred;
- break;
- }
- PhiInserter.AddAvailableValue(BB, Pred);
- Dominator.addAndRememberBlock(BB);
- }
-
- if (ParentValue) {
- Term->setCondition(ParentValue);
- } else {
- if (!Dominator.resultIsRememberedBlock())
- PhiInserter.AddAvailableValue(Dominator.result(), Default);
-
- Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
- }
- }
-}
-
-/// Remove all PHI values coming from "From" into "To" and remember
-/// them in DeletedPhis
-void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
- PhiMap &Map = DeletedPhis[To];
- for (PHINode &Phi : To->phis()) {
- bool Recorded = false;
- while (Phi.getBasicBlockIndex(From) != -1) {
- Value *Deleted = Phi.removeIncomingValue(From, false);
- Map[&Phi].push_back(std::make_pair(From, Deleted));
- if (!Recorded) {
- AffectedPhis.push_back(&Phi);
- Recorded = true;
- }
- }
- }
-}
-
-/// Add a dummy PHI value as soon as we know the new predecessor
-void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
- for (PHINode &Phi : To->phis()) {
- Value *Undef = UndefValue::get(Phi.getType());
- Phi.addIncoming(Undef, From);
- }
- AddedPhis[To].push_back(From);
-}
-
-/// Add the real PHI value as soon as everything is set up
-void StructurizeCFG::setPhiValues() {
- SmallVector<PHINode *, 8> InsertedPhis;
- SSAUpdater Updater(&InsertedPhis);
- for (const auto &AddedPhi : AddedPhis) {
- BasicBlock *To = AddedPhi.first;
- const BBVector &From = AddedPhi.second;
-
- if (!DeletedPhis.count(To))
- continue;
-
- PhiMap &Map = DeletedPhis[To];
- for (const auto &PI : Map) {
- PHINode *Phi = PI.first;
- Value *Undef = UndefValue::get(Phi->getType());
- Updater.Initialize(Phi->getType(), "");
- Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
- Updater.AddAvailableValue(To, Undef);
-
- NearestCommonDominator Dominator(DT);
- Dominator.addBlock(To);
- for (const auto &VI : PI.second) {
- Updater.AddAvailableValue(VI.first, VI.second);
- Dominator.addAndRememberBlock(VI.first);
- }
-
- if (!Dominator.resultIsRememberedBlock())
- Updater.AddAvailableValue(Dominator.result(), Undef);
-
- for (BasicBlock *FI : From)
- Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
- AffectedPhis.push_back(Phi);
- }
-
- DeletedPhis.erase(To);
- }
- assert(DeletedPhis.empty());
-
- AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
-}
-
-void StructurizeCFG::simplifyAffectedPhis() {
- bool Changed;
- do {
- Changed = false;
- SimplifyQuery Q(Func->getParent()->getDataLayout());
- Q.DT = DT;
- for (WeakVH VH : AffectedPhis) {
- if (auto Phi = dyn_cast_or_null<PHINode>(VH)) {
- if (auto NewValue = SimplifyInstruction(Phi, Q)) {
- Phi->replaceAllUsesWith(NewValue);
- Phi->eraseFromParent();
- Changed = true;
- }
- }
- }
- } while (Changed);
-}
-
-/// Remove phi values from all successors and then remove the terminator.
-void StructurizeCFG::killTerminator(BasicBlock *BB) {
- Instruction *Term = BB->getTerminator();
- if (!Term)
- return;
-
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
- SI != SE; ++SI)
- delPhiValues(BB, *SI);
-
- if (DA)
- DA->removeValue(Term);
- Term->eraseFromParent();
-}
-
-/// Let node exit(s) point to NewExit
-void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
- bool IncludeDominator) {
- if (Node->isSubRegion()) {
- Region *SubRegion = Node->getNodeAs<Region>();
- BasicBlock *OldExit = SubRegion->getExit();
- BasicBlock *Dominator = nullptr;
-
- // Find all the edges from the sub region to the exit
- for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
-      // Increment BBI before mucking with BB's terminator.
- BasicBlock *BB = *BBI++;
-
- if (!SubRegion->contains(BB))
- continue;
-
- // Modify the edges to point to the new exit
- delPhiValues(BB, OldExit);
- BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
- addPhiValues(BB, NewExit);
-
- // Find the new dominator (if requested)
- if (IncludeDominator) {
- if (!Dominator)
- Dominator = BB;
- else
- Dominator = DT->findNearestCommonDominator(Dominator, BB);
- }
- }
-
- // Change the dominator (if requested)
- if (Dominator)
- DT->changeImmediateDominator(NewExit, Dominator);
-
- // Update the region info
- SubRegion->replaceExit(NewExit);
- } else {
- BasicBlock *BB = Node->getNodeAs<BasicBlock>();
- killTerminator(BB);
- BranchInst::Create(NewExit, BB);
- addPhiValues(BB, NewExit);
- if (IncludeDominator)
- DT->changeImmediateDominator(NewExit, BB);
- }
-}
-
-/// Create a new flow node and update dominator tree and region info
-BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
- LLVMContext &Context = Func->getContext();
- BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
- Order.back()->getEntry();
- BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
- Func, Insert);
- DT->addNewBlock(Flow, Dominator);
- ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
- return Flow;
-}
-
-/// Create a new or reuse the previous node as flow node
-BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
- BasicBlock *Entry = PrevNode->getEntry();
-
- if (!PrevNode->isSubRegion()) {
- killTerminator(Entry);
- if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
- return Entry;
- }
-
- // create a new flow node
- BasicBlock *Flow = getNextFlow(Entry);
-
- // and wire it up
- changeExit(PrevNode, Flow, true);
- PrevNode = ParentRegion->getBBNode(Flow);
- return Flow;
-}
-
-/// Returns the region exit if possible, otherwise just a new flow node
-BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
- bool ExitUseAllowed) {
- if (!Order.empty() || !ExitUseAllowed)
- return getNextFlow(Flow);
-
- BasicBlock *Exit = ParentRegion->getExit();
- DT->changeImmediateDominator(Exit, Flow);
- addPhiValues(Flow, Exit);
- return Exit;
-}
-
-/// Set the previous node
-void StructurizeCFG::setPrevNode(BasicBlock *BB) {
- PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
- : nullptr;
-}
-
-/// Does BB dominate all the predicates of Node?
-bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
- BBPredicates &Preds = Predicates[Node->getEntry()];
- return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
- return DT->dominates(BB, Pred.first);
- });
-}
-
-/// Can we predict that this node will always be called?
-bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
- BBPredicates &Preds = Predicates[Node->getEntry()];
- bool Dominated = false;
-
-  // The region entry is always true
- if (!PrevNode)
- return true;
-
- for (std::pair<BasicBlock*, Value*> Pred : Preds) {
- BasicBlock *BB = Pred.first;
- Value *V = Pred.second;
-
- if (V != BoolTrue)
- return false;
-
- if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
- Dominated = true;
- }
-
- // TODO: The dominator check is too strict
- return Dominated;
-}
-
-/// Take one node from the order vector and wire it up
-void StructurizeCFG::wireFlow(bool ExitUseAllowed,
- BasicBlock *LoopEnd) {
- RegionNode *Node = Order.pop_back_val();
- Visited.insert(Node->getEntry());
-
- if (isPredictableTrue(Node)) {
- // Just a linear flow
- if (PrevNode) {
- changeExit(PrevNode, Node->getEntry(), true);
- }
- PrevNode = Node;
- } else {
- // Insert extra prefix node (or reuse last one)
- BasicBlock *Flow = needPrefix(false);
-
- // Insert extra postfix node (or use exit instead)
- BasicBlock *Entry = Node->getEntry();
- BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
-
- // let it point to entry and next block
- Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
- addPhiValues(Flow, Entry);
- DT->changeImmediateDominator(Entry, Flow);
-
- PrevNode = Node;
- while (!Order.empty() && !Visited.count(LoopEnd) &&
- dominatesPredicates(Entry, Order.back())) {
- handleLoops(false, LoopEnd);
- }
-
- changeExit(PrevNode, Next, false);
- setPrevNode(Next);
- }
-}
-
-void StructurizeCFG::handleLoops(bool ExitUseAllowed,
- BasicBlock *LoopEnd) {
- RegionNode *Node = Order.back();
- BasicBlock *LoopStart = Node->getEntry();
-
- if (!Loops.count(LoopStart)) {
- wireFlow(ExitUseAllowed, LoopEnd);
- return;
- }
-
- if (!isPredictableTrue(Node))
- LoopStart = needPrefix(true);
-
- LoopEnd = Loops[Node->getEntry()];
- wireFlow(false, LoopEnd);
- while (!Visited.count(LoopEnd)) {
- handleLoops(false, LoopEnd);
- }
-
- // If the start of the loop is the entry block, we can't branch to it so
- // insert a new dummy entry block.
- Function *LoopFunc = LoopStart->getParent();
- if (LoopStart == &LoopFunc->getEntryBlock()) {
- LoopStart->setName("entry.orig");
-
- BasicBlock *NewEntry =
- BasicBlock::Create(LoopStart->getContext(),
- "entry",
- LoopFunc,
- LoopStart);
- BranchInst::Create(LoopStart, NewEntry);
- DT->setNewRoot(NewEntry);
- }
-
- // Create an extra loop end node
- LoopEnd = needPrefix(false);
- BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
- LoopConds.push_back(BranchInst::Create(Next, LoopStart,
- BoolUndef, LoopEnd));
- addPhiValues(LoopEnd, LoopStart);
- setPrevNode(Next);
-}
-
-/// After this function control flow looks like it should be, but
-/// branches and PHI nodes only have undefined conditions.
-void StructurizeCFG::createFlow() {
- BasicBlock *Exit = ParentRegion->getExit();
- bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
-
- AffectedPhis.clear();
- DeletedPhis.clear();
- AddedPhis.clear();
- Conditions.clear();
- LoopConds.clear();
-
- PrevNode = nullptr;
- Visited.clear();
-
- while (!Order.empty()) {
- handleLoops(EntryDominatesExit, nullptr);
- }
-
- if (PrevNode)
- changeExit(PrevNode, Exit, EntryDominatesExit);
- else
- assert(EntryDominatesExit);
-}
-
-/// Handle a rare case where the disintegrated nodes' instructions
-/// no longer dominate all their uses. Not sure if this is really necessary
-void StructurizeCFG::rebuildSSA() {
- SSAUpdater Updater;
- for (BasicBlock *BB : ParentRegion->blocks())
- for (Instruction &I : *BB) {
- bool Initialized = false;
- // We may modify the use list as we iterate over it, so be careful to
- // compute the next element in the use list at the top of the loop.
- for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
- Use &U = *UI++;
- Instruction *User = cast<Instruction>(U.getUser());
- if (User->getParent() == BB) {
- continue;
- } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
- if (UserPN->getIncomingBlock(U) == BB)
- continue;
- }
-
- if (DT->dominates(&I, User))
- continue;
-
- if (!Initialized) {
- Value *Undef = UndefValue::get(I.getType());
- Updater.Initialize(I.getType(), "");
- Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
- Updater.AddAvailableValue(BB, &I);
- Initialized = true;
- }
- Updater.RewriteUseAfterInsertions(U);
- }
- }
-}
-
-static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
- const LegacyDivergenceAnalysis &DA) {
- // Bool for if all sub-regions are uniform.
- bool SubRegionsAreUniform = true;
- // Count of how many direct children are conditional.
- unsigned ConditionalDirectChildren = 0;
-
- for (auto E : R->elements()) {
- if (!E->isSubRegion()) {
- auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
- if (!Br || !Br->isConditional())
- continue;
-
- if (!DA.isUniform(Br))
- return false;
-
- // One of our direct children is conditional.
- ConditionalDirectChildren++;
-
- LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
- << " has uniform terminator\n");
- } else {
- // Explicitly refuse to treat regions as uniform if they have non-uniform
- // subregions. We cannot rely on DivergenceAnalysis for branches in
- // subregions because those branches may have been removed and re-created,
- // so we look for our metadata instead.
- //
- // Warning: It would be nice to treat regions as uniform based only on
- // their direct child basic blocks' terminators, regardless of whether
- // subregions are uniform or not. However, this requires a very careful
- // look at SIAnnotateControlFlow to make sure nothing breaks there.
- for (auto BB : E->getNodeAs<Region>()->blocks()) {
- auto Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br || !Br->isConditional())
- continue;
-
- if (!Br->getMetadata(UniformMDKindID)) {
- // Early exit if we cannot have relaxed uniform regions.
- if (!RelaxedUniformRegions)
- return false;
-
- SubRegionsAreUniform = false;
- break;
- }
- }
- }
- }
-
- // Our region is uniform if:
- // 1. All conditional branches that are direct children are uniform (checked
- // above).
- // 2. And either:
- // a. All sub-regions are uniform.
-  //     b. There is at most one conditional branch among the direct children.
- return SubRegionsAreUniform || (ConditionalDirectChildren <= 1);
-}
-
+
+/// Build up the general order of nodes by performing a topological sort of the
+/// parent region's nodes, while ensuring that there is no outer cycle node
+/// between any two inner cycle nodes.
+void StructurizeCFG::orderNodes() {
+ Order.resize(std::distance(GraphTraits<Region *>::nodes_begin(ParentRegion),
+ GraphTraits<Region *>::nodes_end(ParentRegion)));
+ if (Order.empty())
+ return;
+
+ SmallDenseSet<RegionNode *> Nodes;
+ auto EntryNode = SubGraphTraits::getEntryNode(ParentRegion);
+
+ // A list of range indices of SCCs in Order, to be processed.
+ SmallVector<std::pair<unsigned, unsigned>, 8> WorkList;
+ unsigned I = 0, E = Order.size();
+ while (true) {
+ // Run through all the SCCs in the subgraph starting with Entry.
+ for (auto SCCI =
+ scc_iterator<SubGraphTraits::NodeRef, SubGraphTraits>::begin(
+ EntryNode);
+ !SCCI.isAtEnd(); ++SCCI) {
+ auto &SCC = *SCCI;
+
+      // An SCC of size up to 2 can be reduced to an entry (the last node) and
+      // possibly one additional node. Therefore it is already in order, and
+ // there is no need to add it to the work-list.
+ unsigned Size = SCC.size();
+ if (Size > 2)
+ WorkList.emplace_back(I, I + Size);
+
+ // Add the SCC nodes to the Order array.
+ for (auto &N : SCC) {
+ assert(I < E && "SCC size mismatch!");
+ Order[I++] = N.first;
+ }
+ }
+ assert(I == E && "SCC size mismatch!");
+
+ // If there are no more SCCs to order, then we are done.
+ if (WorkList.empty())
+ break;
+
+ std::tie(I, E) = WorkList.pop_back_val();
+
+ // Collect the set of nodes in the SCC's subgraph. These are only the
+ // possible child nodes; we do not add the entry (last node) otherwise we
+ // will have the same exact SCC all over again.
+ Nodes.clear();
+ Nodes.insert(Order.begin() + I, Order.begin() + E - 1);
+
+ // Update the entry node.
+ EntryNode.first = Order[E - 1];
+ EntryNode.second = &Nodes;
+ }
+}
+
+/// Determine the end of the loops
+void StructurizeCFG::analyzeLoops(RegionNode *N) {
+ if (N->isSubRegion()) {
+ // Test for exit as back edge
+ BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+ if (Visited.count(Exit))
+ Loops[Exit] = N->getEntry();
+
+ } else {
+ // Test for successors as back edge
+ BasicBlock *BB = N->getNodeAs<BasicBlock>();
+ BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+ for (BasicBlock *Succ : Term->successors())
+ if (Visited.count(Succ))
+ Loops[Succ] = BB;
+ }
+}
+
+/// Build the condition for one edge
+Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+ bool Invert) {
+ Value *Cond = Invert ? BoolFalse : BoolTrue;
+ if (Term->isConditional()) {
+ Cond = Term->getCondition();
+
+ if (Idx != (unsigned)Invert)
+ Cond = invertCondition(Cond);
+ }
+ return Cond;
+}
+
+/// Analyze the predecessors of each block and build up predicates
+void StructurizeCFG::gatherPredicates(RegionNode *N) {
+ RegionInfo *RI = ParentRegion->getRegionInfo();
+ BasicBlock *BB = N->getEntry();
+ BBPredicates &Pred = Predicates[BB];
+ BBPredicates &LPred = LoopPreds[BB];
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // Ignore it if it's a branch from outside into our region entry
+ if (!ParentRegion->contains(P))
+ continue;
+
+ Region *R = RI->getRegionFor(P);
+ if (R == ParentRegion) {
+ // It's a top level block in our region
+ BranchInst *Term = cast<BranchInst>(P->getTerminator());
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (Succ != BB)
+ continue;
+
+ if (Visited.count(P)) {
+ // Normal forward edge
+ if (Term->isConditional()) {
+ // Try to treat it like an ELSE block
+ BasicBlock *Other = Term->getSuccessor(!i);
+ if (Visited.count(Other) && !Loops.count(Other) &&
+ !Pred.count(Other) && !Pred.count(P)) {
+
+ Pred[Other] = BoolFalse;
+ Pred[P] = BoolTrue;
+ continue;
+ }
+ }
+ Pred[P] = buildCondition(Term, i, false);
+ } else {
+ // Back edge
+ LPred[P] = buildCondition(Term, i, true);
+ }
+ }
+ } else {
+ // It's an exit from a sub region
+ while (R->getParent() != ParentRegion)
+ R = R->getParent();
+
+ // Edge from inside a subregion to its entry, ignore it
+ if (*R == *N)
+ continue;
+
+ BasicBlock *Entry = R->getEntry();
+ if (Visited.count(Entry))
+ Pred[Entry] = BoolTrue;
+ else
+ LPred[Entry] = BoolFalse;
+ }
+ }
+}
+
+/// Collect various loop and predicate infos
+void StructurizeCFG::collectInfos() {
+ // Reset predicate
+ Predicates.clear();
+
+ // and loop infos
+ Loops.clear();
+ LoopPreds.clear();
+
+ // Reset the visited nodes
+ Visited.clear();
+
+ for (RegionNode *RN : reverse(Order)) {
+ LLVM_DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << "\n");
+
+ // Analyze all the conditions leading to a node
+ gatherPredicates(RN);
+
+ // Remember that we've seen this node
+ Visited.insert(RN->getEntry());
+
+ // Find the last back edges
+ analyzeLoops(RN);
+ }
+}
+
+/// Insert the missing branch conditions
+void StructurizeCFG::insertConditions(bool Loops) {
+ BranchVector &Conds = Loops ? LoopConds : Conditions;
+ Value *Default = Loops ? BoolTrue : BoolFalse;
+ SSAUpdater PhiInserter;
+
+ for (BranchInst *Term : Conds) {
+ assert(Term->isConditional());
+
+ BasicBlock *Parent = Term->getParent();
+ BasicBlock *SuccTrue = Term->getSuccessor(0);
+ BasicBlock *SuccFalse = Term->getSuccessor(1);
+
+ PhiInserter.Initialize(Boolean, "");
+ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
+ PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+
+ BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(Parent);
+
+ Value *ParentValue = nullptr;
+ for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
+ BasicBlock *BB = BBAndPred.first;
+ Value *Pred = BBAndPred.second;
+
+ if (BB == Parent) {
+ ParentValue = Pred;
+ break;
+ }
+ PhiInserter.AddAvailableValue(BB, Pred);
+ Dominator.addAndRememberBlock(BB);
+ }
+
+ if (ParentValue) {
+ Term->setCondition(ParentValue);
+ } else {
+ if (!Dominator.resultIsRememberedBlock())
+ PhiInserter.AddAvailableValue(Dominator.result(), Default);
+
+ Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+ }
+ }
+}
+
+/// Remove all PHI values coming from "From" into "To" and remember
+/// them in DeletedPhis
+void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+ PhiMap &Map = DeletedPhis[To];
+ for (PHINode &Phi : To->phis()) {
+ bool Recorded = false;
+ while (Phi.getBasicBlockIndex(From) != -1) {
+ Value *Deleted = Phi.removeIncomingValue(From, false);
+ Map[&Phi].push_back(std::make_pair(From, Deleted));
+ if (!Recorded) {
+ AffectedPhis.push_back(&Phi);
+ Recorded = true;
+ }
+ }
+ }
+}
+
+/// Add a dummy PHI value as soon as we know the new predecessor
+void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+ for (PHINode &Phi : To->phis()) {
+ Value *Undef = UndefValue::get(Phi.getType());
+ Phi.addIncoming(Undef, From);
+ }
+ AddedPhis[To].push_back(From);
+}
+
+/// Add the real PHI value as soon as everything is set up
+void StructurizeCFG::setPhiValues() {
+ SmallVector<PHINode *, 8> InsertedPhis;
+ SSAUpdater Updater(&InsertedPhis);
+ for (const auto &AddedPhi : AddedPhis) {
+ BasicBlock *To = AddedPhi.first;
+ const BBVector &From = AddedPhi.second;
+
+ if (!DeletedPhis.count(To))
+ continue;
+
+ PhiMap &Map = DeletedPhis[To];
+ for (const auto &PI : Map) {
+ PHINode *Phi = PI.first;
+ Value *Undef = UndefValue::get(Phi->getType());
+ Updater.Initialize(Phi->getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(To, Undef);
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(To);
+ for (const auto &VI : PI.second) {
+ Updater.AddAvailableValue(VI.first, VI.second);
+ Dominator.addAndRememberBlock(VI.first);
+ }
+
+ if (!Dominator.resultIsRememberedBlock())
+ Updater.AddAvailableValue(Dominator.result(), Undef);
+
+ for (BasicBlock *FI : From)
+ Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
+ AffectedPhis.push_back(Phi);
+ }
+
+ DeletedPhis.erase(To);
+ }
+ assert(DeletedPhis.empty());
+
+ AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
+}
+
+void StructurizeCFG::simplifyAffectedPhis() {
+ bool Changed;
+ do {
+ Changed = false;
+ SimplifyQuery Q(Func->getParent()->getDataLayout());
+ Q.DT = DT;
+ for (WeakVH VH : AffectedPhis) {
+ if (auto Phi = dyn_cast_or_null<PHINode>(VH)) {
+ if (auto NewValue = SimplifyInstruction(Phi, Q)) {
+ Phi->replaceAllUsesWith(NewValue);
+ Phi->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+ } while (Changed);
+}
+
+/// Remove phi values from all successors and then remove the terminator.
+void StructurizeCFG::killTerminator(BasicBlock *BB) {
+ Instruction *Term = BB->getTerminator();
+ if (!Term)
+ return;
+
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+ SI != SE; ++SI)
+ delPhiValues(BB, *SI);
+
+ if (DA)
+ DA->removeValue(Term);
+ Term->eraseFromParent();
+}
+
+/// Let node exit(s) point to NewExit
+void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator) {
+ if (Node->isSubRegion()) {
+ Region *SubRegion = Node->getNodeAs<Region>();
+ BasicBlock *OldExit = SubRegion->getExit();
+ BasicBlock *Dominator = nullptr;
+
+ // Find all the edges from the sub region to the exit
+ for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
+      // Increment BBI before mucking with BB's terminator.
+ BasicBlock *BB = *BBI++;
+
+ if (!SubRegion->contains(BB))
+ continue;
+
+ // Modify the edges to point to the new exit
+ delPhiValues(BB, OldExit);
+ BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
+ addPhiValues(BB, NewExit);
+
+ // Find the new dominator (if requested)
+ if (IncludeDominator) {
+ if (!Dominator)
+ Dominator = BB;
+ else
+ Dominator = DT->findNearestCommonDominator(Dominator, BB);
+ }
+ }
+
+ // Change the dominator (if requested)
+ if (Dominator)
+ DT->changeImmediateDominator(NewExit, Dominator);
+
+ // Update the region info
+ SubRegion->replaceExit(NewExit);
+ } else {
+ BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+ killTerminator(BB);
+ BranchInst::Create(NewExit, BB);
+ addPhiValues(BB, NewExit);
+ if (IncludeDominator)
+ DT->changeImmediateDominator(NewExit, BB);
+ }
+}
+
+/// Create a new flow node and update dominator tree and region info
+BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+ LLVMContext &Context = Func->getContext();
+ BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
+ Order.back()->getEntry();
+ BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
+ Func, Insert);
+ DT->addNewBlock(Flow, Dominator);
+ ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+ return Flow;
+}
+
+/// Create a new or reuse the previous node as flow node
+BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
+ BasicBlock *Entry = PrevNode->getEntry();
+
+ if (!PrevNode->isSubRegion()) {
+ killTerminator(Entry);
+ if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
+ return Entry;
+ }
+
+ // create a new flow node
+ BasicBlock *Flow = getNextFlow(Entry);
+
+ // and wire it up
+ changeExit(PrevNode, Flow, true);
+ PrevNode = ParentRegion->getBBNode(Flow);
+ return Flow;
+}
+
+/// Returns the region exit if possible, otherwise just a new flow node
+BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
+ bool ExitUseAllowed) {
+ if (!Order.empty() || !ExitUseAllowed)
+ return getNextFlow(Flow);
+
+ BasicBlock *Exit = ParentRegion->getExit();
+ DT->changeImmediateDominator(Exit, Flow);
+ addPhiValues(Flow, Exit);
+ return Exit;
+}
+
+/// Set the previous node
+void StructurizeCFG::setPrevNode(BasicBlock *BB) {
+ PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
+ : nullptr;
+}
+
+/// Does BB dominate all the predicates of Node?
+bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
+ return DT->dominates(BB, Pred.first);
+ });
+}
+
+/// Can we predict that this node will always be called?
+bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ bool Dominated = false;
+
+  // The region entry is always true
+ if (!PrevNode)
+ return true;
+
+ for (std::pair<BasicBlock*, Value*> Pred : Preds) {
+ BasicBlock *BB = Pred.first;
+ Value *V = Pred.second;
+
+ if (V != BoolTrue)
+ return false;
+
+ if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
+ Dominated = true;
+ }
+
+ // TODO: The dominator check is too strict
+ return Dominated;
+}
+
+/// Take one node from the order vector and wire it up
+void StructurizeCFG::wireFlow(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.pop_back_val();
+ Visited.insert(Node->getEntry());
+
+ if (isPredictableTrue(Node)) {
+ // Just a linear flow
+ if (PrevNode) {
+ changeExit(PrevNode, Node->getEntry(), true);
+ }
+ PrevNode = Node;
+ } else {
+ // Insert extra prefix node (or reuse last one)
+ BasicBlock *Flow = needPrefix(false);
+
+ // Insert extra postfix node (or use exit instead)
+ BasicBlock *Entry = Node->getEntry();
+ BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
+
+ // let it point to entry and next block
+ Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+ addPhiValues(Flow, Entry);
+ DT->changeImmediateDominator(Entry, Flow);
+
+ PrevNode = Node;
+ while (!Order.empty() && !Visited.count(LoopEnd) &&
+ dominatesPredicates(Entry, Order.back())) {
+ handleLoops(false, LoopEnd);
+ }
+
+ changeExit(PrevNode, Next, false);
+ setPrevNode(Next);
+ }
+}
+
+void StructurizeCFG::handleLoops(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.back();
+ BasicBlock *LoopStart = Node->getEntry();
+
+ if (!Loops.count(LoopStart)) {
+ wireFlow(ExitUseAllowed, LoopEnd);
+ return;
+ }
+
+ if (!isPredictableTrue(Node))
+ LoopStart = needPrefix(true);
+
+ LoopEnd = Loops[Node->getEntry()];
+ wireFlow(false, LoopEnd);
+ while (!Visited.count(LoopEnd)) {
+ handleLoops(false, LoopEnd);
+ }
+
+ // If the start of the loop is the entry block, we can't branch to it so
+ // insert a new dummy entry block.
+ Function *LoopFunc = LoopStart->getParent();
+ if (LoopStart == &LoopFunc->getEntryBlock()) {
+ LoopStart->setName("entry.orig");
+
+ BasicBlock *NewEntry =
+ BasicBlock::Create(LoopStart->getContext(),
+ "entry",
+ LoopFunc,
+ LoopStart);
+ BranchInst::Create(LoopStart, NewEntry);
+ DT->setNewRoot(NewEntry);
+ }
+
+ // Create an extra loop end node
+ LoopEnd = needPrefix(false);
+ BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
+ LoopConds.push_back(BranchInst::Create(Next, LoopStart,
+ BoolUndef, LoopEnd));
+ addPhiValues(LoopEnd, LoopStart);
+ setPrevNode(Next);
+}
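Viewed as a source-level analogy (again illustrative only; the helper functions are invented), the loop handling above funnels every divergent exit through a single flow point so the loop keeps exactly one back edge, matching the "true = break, false = continue" predicate network described in the class comment:

void work(int i);
bool want_break(int i);
bool want_continue(int i);

// Before: a loop whose body can leave through several divergent exits.
void before(int n) {
  for (int i = 0; i < n; ++i) {
    if (want_break(i))
      break;
    if (want_continue(i))
      continue;
    work(i);
  }
}

// After: one predicate decides at the single "flow" point whether to take the
// back edge; continues simply skip the remaining work for this iteration.
void after(int n) {
  int i = 0;
  bool leave;
  do {
    leave = !(i < n) || want_break(i);
    if (!leave) {
      if (!want_continue(i))
        work(i);
      ++i;
    }
  } while (!leave); // the loop's only back edge, taken while "leave" is false
}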
+
+/// After this function control flow looks like it should be, but
+/// branches and PHI nodes only have undefined conditions.
+void StructurizeCFG::createFlow() {
+ BasicBlock *Exit = ParentRegion->getExit();
+ bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
+
+ AffectedPhis.clear();
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Conditions.clear();
+ LoopConds.clear();
+
+ PrevNode = nullptr;
+ Visited.clear();
+
+ while (!Order.empty()) {
+ handleLoops(EntryDominatesExit, nullptr);
+ }
+
+ if (PrevNode)
+ changeExit(PrevNode, Exit, EntryDominatesExit);
+ else
+ assert(EntryDominatesExit);
+}
+
+/// Handle a rare case where the disintegrated nodes' instructions
+/// no longer dominate all their uses. Not sure if this is really necessary
+void StructurizeCFG::rebuildSSA() {
+ SSAUpdater Updater;
+ for (BasicBlock *BB : ParentRegion->blocks())
+ for (Instruction &I : *BB) {
+ bool Initialized = false;
+ // We may modify the use list as we iterate over it, so be careful to
+ // compute the next element in the use list at the top of the loop.
+ for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
+ Use &U = *UI++;
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (User->getParent() == BB) {
+ continue;
+ } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ }
+
+ if (DT->dominates(&I, User))
+ continue;
+
+ if (!Initialized) {
+ Value *Undef = UndefValue::get(I.getType());
+ Updater.Initialize(I.getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(BB, &I);
+ Initialized = true;
+ }
+ Updater.RewriteUseAfterInsertions(U);
+ }
+ }
+}
+
+static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
+ const LegacyDivergenceAnalysis &DA) {
+ // Bool for if all sub-regions are uniform.
+ bool SubRegionsAreUniform = true;
+ // Count of how many direct children are conditional.
+ unsigned ConditionalDirectChildren = 0;
+
+ for (auto E : R->elements()) {
+ if (!E->isSubRegion()) {
+ auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!DA.isUniform(Br))
+ return false;
+
+ // One of our direct children is conditional.
+ ConditionalDirectChildren++;
+
+ LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
+ << " has uniform terminator\n");
+ } else {
+ // Explicitly refuse to treat regions as uniform if they have non-uniform
+ // subregions. We cannot rely on DivergenceAnalysis for branches in
+ // subregions because those branches may have been removed and re-created,
+ // so we look for our metadata instead.
+ //
+ // Warning: It would be nice to treat regions as uniform based only on
+ // their direct child basic blocks' terminators, regardless of whether
+ // subregions are uniform or not. However, this requires a very careful
+ // look at SIAnnotateControlFlow to make sure nothing breaks there.
+ for (auto BB : E->getNodeAs<Region>()->blocks()) {
+ auto Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!Br->getMetadata(UniformMDKindID)) {
+ // Early exit if we cannot have relaxed uniform regions.
+ if (!RelaxedUniformRegions)
+ return false;
+
+ SubRegionsAreUniform = false;
+ break;
+ }
+ }
+ }
+ }
+
+ // Our region is uniform if:
+ // 1. All conditional branches that are direct children are uniform (checked
+ // above).
+ // 2. And either:
+ // a. All sub-regions are uniform.
+  //     b. There is at most one conditional branch among the direct children.
+ return SubRegionsAreUniform || (ConditionalDirectChildren <= 1);
+}
+
void StructurizeCFG::init(Region *R) {
LLVMContext &Context = R->getEntry()->getContext();
@@ -1021,9 +1021,9 @@ void StructurizeCFG::init(Region *R) {
bool StructurizeCFG::makeUniformRegion(Region *R,
LegacyDivergenceAnalysis *DA) {
- if (R->isTopLevelRegion())
- return false;
-
+ if (R->isTopLevelRegion())
+ return false;
+
this->DA = DA;
// TODO: We could probably be smarter here with how we handle sub-regions.
// We currently rely on the fact that metadata is set by earlier invocations
@@ -1031,11 +1031,11 @@ bool StructurizeCFG::makeUniformRegion(Region *R,
// but we shouldn't rely on metadata for correctness!
unsigned UniformMDKindID =
R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
-
+
if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
<< '\n');
-
+
// Mark all direct child block terminators as having been treated as
// uniform. To account for a possible future in which non-uniform
// sub-regions are treated more cleverly, indirect children are not
@@ -1044,16 +1044,16 @@ bool StructurizeCFG::makeUniformRegion(Region *R,
for (RegionNode *E : R->elements()) {
if (E->isSubRegion())
continue;
-
+
if (Instruction *Term = E->getEntry()->getTerminator())
Term->setMetadata(UniformMDKindID, MD);
}
-
+
return true;
- }
+ }
return false;
}
-
+
/// Run the transformation for each region found
bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
if (R->isTopLevelRegion())
@@ -1061,35 +1061,35 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
this->DT = DT;
- Func = R->getEntry()->getParent();
- ParentRegion = R;
-
- orderNodes();
- collectInfos();
- createFlow();
- insertConditions(false);
- insertConditions(true);
- setPhiValues();
- simplifyAffectedPhis();
- rebuildSSA();
-
- // Cleanup
- Order.clear();
- Visited.clear();
- DeletedPhis.clear();
- AddedPhis.clear();
- Predicates.clear();
- Conditions.clear();
- Loops.clear();
- LoopPreds.clear();
- LoopConds.clear();
-
- return true;
-}
-
-Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
+ Func = R->getEntry()->getParent();
+ ParentRegion = R;
+
+ orderNodes();
+ collectInfos();
+ createFlow();
+ insertConditions(false);
+ insertConditions(true);
+ setPhiValues();
+ simplifyAffectedPhis();
+ rebuildSSA();
+
+ // Cleanup
+ Order.clear();
+ Visited.clear();
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Predicates.clear();
+ Conditions.clear();
+ Loops.clear();
+ LoopPreds.clear();
+ LoopConds.clear();
+
+ return true;
+}
+
+Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
return new StructurizeCFGLegacyPass(SkipUniformRegions);
-}
+}
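A small usage sketch for this factory function (a guess at typical driver code, not taken from this repository; it assumes LLVM 12's legacy pass manager headers and an already-built Module):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

// Structurize every region of every function in a module. The legacy pass
// manager is expected to pull in the declared prerequisites (LowerSwitch,
// dominator tree, region info) on its own.
static void structurizeModule(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createStructurizeCFGPass(/*SkipUniformRegions=*/false));
  PM.run(M);
}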
static void addRegionIntoQueue(Region &R, std::vector<Region *> &Regions) {
Regions.push_back(&R);
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp
index cfd50023b4..9e7cccc884 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -1,752 +1,752 @@
-//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file transforms calls of the current function (self recursion) followed
-// by a return instruction with a branch to the entry of the function, creating
-// a loop. This pass also implements the following extensions to the basic
-// algorithm:
-//
-// 1. Trivial instructions between the call and return do not prevent the
-// transformation from taking place, though currently the analysis cannot
-// support moving any really useful instructions (only dead ones).
-// 2. This pass transforms functions that are prevented from being tail
-// recursive by an associative and commutative expression to use an
-// accumulator variable, thus compiling the typical naive factorial or
-// 'fib' implementation into efficient code.
-// 3. TRE is performed if the function returns void, if the return
-// returns the result returned by the call, or if the function returns a
-// run-time constant on all exits from the function. It is possible, though
-// unlikely, that the return returns something else (like constant 0), and
-// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
-// the function return the exact same value.
-// 4. If it can prove that callees do not access their caller stack frame,
-// they are marked as eligible for tail call elimination (by the code
-// generator).
-//
-// There are several improvements that could be made:
-//
-// 1. If the function has any alloca instructions, these instructions will be
-// moved out of the entry block of the function, causing them to be
-// evaluated each time through the tail recursion. Safely keeping allocas
-//    in the entry block requires analysis to prove that the tail-called
-// function does not read or write the stack object.
-// 2. Tail recursion is only performed if the call immediately precedes the
-// return instruction. It's possible that there could be a jump between
-// the call and the return.
-// 3. There can be intervening operations between the call and the return that
-// prevent the TRE from occurring. For example, there could be GEP's and
-// stores to memory that will not be read or written by the call. This
-// requires some substantial analysis (such as with DSA) to prove safe to
-// move ahead of the call, but doing so could allow many more TREs to be
-// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
-// 4. The algorithm we use to detect if callees access their caller stack
-// frames is very primitive.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "tailcallelim"
-
-STATISTIC(NumEliminated, "Number of tail calls removed");
-STATISTIC(NumRetDuped, "Number of return duplicated");
-STATISTIC(NumAccumAdded, "Number of accumulators introduced");
-
-/// Scan the specified function for alloca instructions.
-/// If it contains any dynamic allocas, returns false.
-static bool canTRE(Function &F) {
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop. This pass also implements the following extensions to the basic
+// algorithm:
+//
+// 1. Trivial instructions between the call and return do not prevent the
+// transformation from taking place, though currently the analysis cannot
+// support moving any really useful instructions (only dead ones).
+// 2. This pass transforms functions that are prevented from being tail
+// recursive by an associative and commutative expression to use an
+// accumulator variable, thus compiling the typical naive factorial or
+// 'fib' implementation into efficient code.
+// 3. TRE is performed if the function returns void, if the return
+// returns the result returned by the call, or if the function returns a
+// run-time constant on all exits from the function. It is possible, though
+// unlikely, that the return returns something else (like constant 0), and
+// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
+// the function return the exact same value.
+// 4. If it can prove that callees do not access their caller stack frame,
+// they are marked as eligible for tail call elimination (by the code
+// generator).
+//
+// There are several improvements that could be made:
+//
+// 1. If the function has any alloca instructions, these instructions will be
+// moved out of the entry block of the function, causing them to be
+// evaluated each time through the tail recursion. Safely keeping allocas
+//    in the entry block requires analysis to prove that the tail-called
+// function does not read or write the stack object.
+// 2. Tail recursion is only performed if the call immediately precedes the
+// return instruction. It's possible that there could be a jump between
+// the call and the return.
+// 3. There can be intervening operations between the call and the return that
+// prevent the TRE from occurring. For example, there could be GEP's and
+// stores to memory that will not be read or written by the call. This
+// requires some substantial analysis (such as with DSA) to prove safe to
+// move ahead of the call, but doing so could allow many more TREs to be
+// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+// 4. The algorithm we use to detect if callees access their caller stack
+// frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
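+
+// Illustrative sketch (not part of the pass itself): a source-level analogue
+// of extensions 2 and 3 above. The pass rewrites IR, so the functions shown
+// here are hypothetical illustrations only.
+//
+//   // Before: the multiply after the recursive call blocks a plain tail call.
+//   long factorial(long N) { return N <= 1 ? 1 : N * factorial(N - 1); }
+//
+//   // After, conceptually: an accumulator seeded with the identity of '*'
+//   // replaces the recursive call with a branch back to the entry.
+//   long factorial(long N) {
+//     long Acc = 1;
+//     for (; N > 1; --N)
+//       Acc *= N;
+//     return Acc;
+//   }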
+
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailcallelim"
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumRetDuped, "Number of return duplicated");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+/// Scan the specified function for alloca instructions.
+/// If it contains any dynamic allocas, returns false.
+static bool canTRE(Function &F) {
// FIXME: The code generator produces really bad code when an 'escaping
// alloca' is changed from being a static alloca to being a dynamic alloca.
// Until this is resolved, disable this transformation if that would ever
// happen. This bug is PR962.
- return llvm::all_of(instructions(F), [](Instruction &I) {
- auto *AI = dyn_cast<AllocaInst>(&I);
- return !AI || AI->isStaticAlloca();
- });
-}
-
-namespace {
-struct AllocaDerivedValueTracker {
- // Start at a root value and walk its use-def chain to mark calls that use the
- // value or a derived value in AllocaUsers, and places where it may escape in
- // EscapePoints.
- void walk(Value *Root) {
- SmallVector<Use *, 32> Worklist;
- SmallPtrSet<Use *, 32> Visited;
-
- auto AddUsesToWorklist = [&](Value *V) {
- for (auto &U : V->uses()) {
- if (!Visited.insert(&U).second)
- continue;
- Worklist.push_back(&U);
- }
- };
-
- AddUsesToWorklist(Root);
-
- while (!Worklist.empty()) {
- Use *U = Worklist.pop_back_val();
- Instruction *I = cast<Instruction>(U->getUser());
-
- switch (I->getOpcode()) {
- case Instruction::Call:
- case Instruction::Invoke: {
- auto &CB = cast<CallBase>(*I);
- // If the alloca-derived argument is passed byval it is not an escape
- // point, or a use of an alloca. Calling with byval copies the contents
- // of the alloca into argument registers or stack slots, which exist
- // beyond the lifetime of the current frame.
- if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
- continue;
- bool IsNocapture =
- CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
- callUsesLocalStack(CB, IsNocapture);
- if (IsNocapture) {
- // If the alloca-derived argument is passed in as nocapture, then it
- // can't propagate to the call's return. That would be capturing.
- continue;
- }
- break;
- }
- case Instruction::Load: {
- // The result of a load is not alloca-derived (unless an alloca has
- // otherwise escaped, but this is a local analysis).
- continue;
- }
- case Instruction::Store: {
- if (U->getOperandNo() == 0)
- EscapePoints.insert(I);
- continue; // Stores have no users to analyze.
- }
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::PHI:
- case Instruction::Select:
- case Instruction::AddrSpaceCast:
- break;
- default:
- EscapePoints.insert(I);
- break;
- }
-
- AddUsesToWorklist(I);
- }
- }
-
- void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
- // Add it to the list of alloca users.
- AllocaUsers.insert(&CB);
-
- // If it's nocapture then it can't capture this alloca.
- if (IsNocapture)
- return;
-
- // If it can write to memory, it can leak the alloca value.
- if (!CB.onlyReadsMemory())
- EscapePoints.insert(&CB);
- }
-
- SmallPtrSet<Instruction *, 32> AllocaUsers;
- SmallPtrSet<Instruction *, 32> EscapePoints;
-};
-}
-
-static bool markTails(Function &F, bool &AllCallsAreTailCalls,
- OptimizationRemarkEmitter *ORE) {
- if (F.callsFunctionThatReturnsTwice())
- return false;
- AllCallsAreTailCalls = true;
-
- // The local stack holds all alloca instructions and all byval arguments.
- AllocaDerivedValueTracker Tracker;
- for (Argument &Arg : F.args()) {
- if (Arg.hasByValAttr())
- Tracker.walk(&Arg);
- }
- for (auto &BB : F) {
- for (auto &I : BB)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
- Tracker.walk(AI);
- }
-
- bool Modified = false;
-
- // Track whether a block is reachable after an alloca has escaped. Blocks that
- // contain the escaping instruction will be marked as being visited without an
- // escaped alloca, since that is how the block began.
- enum VisitType {
- UNVISITED,
- UNESCAPED,
- ESCAPED
- };
- DenseMap<BasicBlock *, VisitType> Visited;
-
- // We propagate the fact that an alloca has escaped from block to successor.
- // Visit the blocks that are propagating the escapedness first. To do this, we
- // maintain two worklists.
- SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
-
- // We may enter a block and visit it thinking that no alloca has escaped yet,
- // then see an escape point and go back around a loop edge and come back to
- // the same block twice. Because of this, we defer setting tail on calls when
- // we first encounter them in a block. Every entry in this list does not
- // statically use an alloca via use-def chain analysis, but may find an alloca
- // through other means if the block turns out to be reachable after an escape
- // point.
- SmallVector<CallInst *, 32> DeferredTails;
-
- BasicBlock *BB = &F.getEntryBlock();
- VisitType Escaped = UNESCAPED;
- do {
- for (auto &I : *BB) {
- if (Tracker.EscapePoints.count(&I))
- Escaped = ESCAPED;
-
- CallInst *CI = dyn_cast<CallInst>(&I);
+ return llvm::all_of(instructions(F), [](Instruction &I) {
+ auto *AI = dyn_cast<AllocaInst>(&I);
+ return !AI || AI->isStaticAlloca();
+ });
+}
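+
+// Illustrative sketch (not part of the pass itself): a source-level picture of
+// the distinction canTRE() draws. Names are hypothetical.
+namespace tre_example {
+// A fixed-size local lowers to a static alloca in the entry block; canTRE()
+// stays true for the enclosing function.
+static int useStaticAlloca(int N) {
+  int Buf[8] = {0};
+  Buf[N & 7] = N;
+  return Buf[N & 7];
+}
+// A runtime-sized buffer lowers to a dynamic alloca; canTRE() returns false
+// for the whole function (see the PR962 FIXME above).
+static int useDynamicAlloca(unsigned N) {
+  int *Buf = static_cast<int *>(__builtin_alloca((N + 1) * sizeof(int)));
+  Buf[0] = static_cast<int>(N);
+  return Buf[0];
+}
+} // namespace tre_example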
+
+namespace {
+struct AllocaDerivedValueTracker {
+ // Start at a root value and walk its use-def chain to mark calls that use the
+ // value or a derived value in AllocaUsers, and places where it may escape in
+ // EscapePoints.
+ void walk(Value *Root) {
+ SmallVector<Use *, 32> Worklist;
+ SmallPtrSet<Use *, 32> Visited;
+
+ auto AddUsesToWorklist = [&](Value *V) {
+ for (auto &U : V->uses()) {
+ if (!Visited.insert(&U).second)
+ continue;
+ Worklist.push_back(&U);
+ }
+ };
+
+ AddUsesToWorklist(Root);
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ auto &CB = cast<CallBase>(*I);
+ // If the alloca-derived argument is passed byval it is not an escape
+ // point, or a use of an alloca. Calling with byval copies the contents
+ // of the alloca into argument registers or stack slots, which exist
+ // beyond the lifetime of the current frame.
+ if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
+ continue;
+ bool IsNocapture =
+ CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
+ callUsesLocalStack(CB, IsNocapture);
+ if (IsNocapture) {
+ // If the alloca-derived argument is passed in as nocapture, then it
+ // can't propagate to the call's return. That would be capturing.
+ continue;
+ }
+ break;
+ }
+ case Instruction::Load: {
+ // The result of a load is not alloca-derived (unless an alloca has
+ // otherwise escaped, but this is a local analysis).
+ continue;
+ }
+ case Instruction::Store: {
+ if (U->getOperandNo() == 0)
+ EscapePoints.insert(I);
+ continue; // Stores have no users to analyze.
+ }
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ break;
+ default:
+ EscapePoints.insert(I);
+ break;
+ }
+
+ AddUsesToWorklist(I);
+ }
+ }
+
+ void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
+ // Add it to the list of alloca users.
+ AllocaUsers.insert(&CB);
+
+ // If it's nocapture then it can't capture this alloca.
+ if (IsNocapture)
+ return;
+
+ // If it can write to memory, it can leak the alloca value.
+ if (!CB.onlyReadsMemory())
+ EscapePoints.insert(&CB);
+ }
+
+ SmallPtrSet<Instruction *, 32> AllocaUsers;
+ SmallPtrSet<Instruction *, 32> EscapePoints;
+};
+}
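+
+// Illustrative sketch (not part of the pass itself): what the tracker above
+// records, expressed at the source level. Names are hypothetical.
+namespace tre_example {
+static int *CapturedPtr; // hypothetical global sink
+static int trackerDemo(int N) {
+  int Local[4] = {0};
+  Local[0] = N;        // writes through the alloca: a use, not an escape
+  CapturedPtr = Local; // stores the pointer itself (store operand 0): escape
+  return Local[0];
+}
+} // namespace tre_example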
+
+static bool markTails(Function &F, bool &AllCallsAreTailCalls,
+ OptimizationRemarkEmitter *ORE) {
+ if (F.callsFunctionThatReturnsTwice())
+ return false;
+ AllCallsAreTailCalls = true;
+
+ // The local stack holds all alloca instructions and all byval arguments.
+ AllocaDerivedValueTracker Tracker;
+ for (Argument &Arg : F.args()) {
+ if (Arg.hasByValAttr())
+ Tracker.walk(&Arg);
+ }
+ for (auto &BB : F) {
+ for (auto &I : BB)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Tracker.walk(AI);
+ }
+
+ bool Modified = false;
+
+ // Track whether a block is reachable after an alloca has escaped. Blocks that
+ // contain the escaping instruction will be marked as being visited without an
+ // escaped alloca, since that is how the block began.
+ enum VisitType {
+ UNVISITED,
+ UNESCAPED,
+ ESCAPED
+ };
+ DenseMap<BasicBlock *, VisitType> Visited;
+
+ // We propagate the fact that an alloca has escaped from block to successor.
+ // Visit the blocks that are propagating the escapedness first. To do this, we
+ // maintain two worklists.
+ SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+ // We may enter a block and visit it thinking that no alloca has escaped yet,
+ // then see an escape point and go back around a loop edge and come back to
+ // the same block twice. Because of this, we defer setting tail on calls when
+ // we first encounter them in a block. Every entry in this list does not
+ // statically use an alloca via use-def chain analysis, but may find an alloca
+ // through other means if the block turns out to be reachable after an escape
+ // point.
+ SmallVector<CallInst *, 32> DeferredTails;
+
+ BasicBlock *BB = &F.getEntryBlock();
+ VisitType Escaped = UNESCAPED;
+ do {
+ for (auto &I : *BB) {
+ if (Tracker.EscapePoints.count(&I))
+ Escaped = ESCAPED;
+
+ CallInst *CI = dyn_cast<CallInst>(&I);
// A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
// considered accessing memory and will be marked as a tail call if we
// don't bail out here.
if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
isa<PseudoProbeInst>(&I))
- continue;
-
- bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
-
- if (!IsNoTail && CI->doesNotAccessMemory()) {
- // A call to a readnone function whose arguments are all things computed
- // outside this function can be marked tail. Even if you stored the
- // alloca address into a global, a readnone function can't load the
- // global anyhow.
- //
- // Note that this runs whether we know an alloca has escaped or not. If
- // it has, then we can't trust Tracker.AllocaUsers to be accurate.
- bool SafeToTail = true;
- for (auto &Arg : CI->arg_operands()) {
- if (isa<Constant>(Arg.getUser()))
- continue;
- if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
- if (!A->hasByValAttr())
- continue;
- SafeToTail = false;
- break;
- }
- if (SafeToTail) {
- using namespace ore;
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
- << "marked as tail call candidate (readnone)";
- });
- CI->setTailCall();
- Modified = true;
- continue;
- }
- }
-
- if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
- DeferredTails.push_back(CI);
- } else {
- AllCallsAreTailCalls = false;
- }
- }
-
+ continue;
+
+ bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
+
+ if (!IsNoTail && CI->doesNotAccessMemory()) {
+ // A call to a readnone function whose arguments are all things computed
+ // outside this function can be marked tail. Even if you stored the
+ // alloca address into a global, a readnone function can't load the
+ // global anyhow.
+ //
+ // Note that this runs whether we know an alloca has escaped or not. If
+ // it has, then we can't trust Tracker.AllocaUsers to be accurate.
+ bool SafeToTail = true;
+ for (auto &Arg : CI->arg_operands()) {
+ if (isa<Constant>(Arg.getUser()))
+ continue;
+ if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
+ if (!A->hasByValAttr())
+ continue;
+ SafeToTail = false;
+ break;
+ }
+ if (SafeToTail) {
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
+ << "marked as tail call candidate (readnone)";
+ });
+ CI->setTailCall();
+ Modified = true;
+ continue;
+ }
+ }
+
+ if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+ DeferredTails.push_back(CI);
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
for (auto *SuccBB : successors(BB)) {
- auto &State = Visited[SuccBB];
- if (State < Escaped) {
- State = Escaped;
- if (State == ESCAPED)
- WorklistEscaped.push_back(SuccBB);
- else
- WorklistUnescaped.push_back(SuccBB);
- }
- }
-
- if (!WorklistEscaped.empty()) {
- BB = WorklistEscaped.pop_back_val();
- Escaped = ESCAPED;
- } else {
- BB = nullptr;
- while (!WorklistUnescaped.empty()) {
- auto *NextBB = WorklistUnescaped.pop_back_val();
- if (Visited[NextBB] == UNESCAPED) {
- BB = NextBB;
- Escaped = UNESCAPED;
- break;
- }
- }
- }
- } while (BB);
-
- for (CallInst *CI : DeferredTails) {
- if (Visited[CI->getParent()] != ESCAPED) {
- // If the escape point was part way through the block, calls after the
- // escape point wouldn't have been put into DeferredTails.
- LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
- CI->setTailCall();
- Modified = true;
- } else {
- AllCallsAreTailCalls = false;
- }
- }
-
- return Modified;
-}
-
-/// Return true if it is safe to move the specified
-/// instruction from after the call to before the call, assuming that all
-/// instructions between the call and this instruction are movable.
-///
-static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
- // FIXME: We can move load/store/call/free instructions above the call if the
- // call does not mod/ref the memory location being processed.
- if (I->mayHaveSideEffects()) // This also handles volatile loads.
- return false;
-
- if (LoadInst *L = dyn_cast<LoadInst>(I)) {
- // Loads may always be moved above calls without side effects.
- if (CI->mayHaveSideEffects()) {
- // Non-volatile loads may be moved above a call with side effects if it
- // does not write to memory and the load provably won't trap.
- // Writes to memory only matter if they may alias the pointer
- // being loaded from.
- const DataLayout &DL = L->getModule()->getDataLayout();
- if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
- !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
- L->getAlign(), DL, L))
- return false;
- }
- }
-
- // Otherwise, if this is a side-effect free instruction, check to make sure
- // that it does not use the return value of the call. If it doesn't use the
- // return value of the call, it must only use things that are defined before
- // the call, or movable instructions between the call and the instruction
- // itself.
- return !is_contained(I->operands(), CI);
-}
-
-static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
- if (!I->isAssociative() || !I->isCommutative())
- return false;
-
- assert(I->getNumOperands() == 2 &&
- "Associative/commutative operations should have 2 args!");
-
- // Exactly one operand should be the result of the call instruction.
- if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
- (I->getOperand(0) != CI && I->getOperand(1) != CI))
- return false;
-
- // The only user of this instruction we allow is a single return instruction.
- if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
- return false;
-
- return true;
-}
-
-static Instruction *firstNonDbg(BasicBlock::iterator I) {
- while (isa<DbgInfoIntrinsic>(I))
- ++I;
- return &*I;
-}
-
-namespace {
-class TailRecursionEliminator {
- Function &F;
- const TargetTransformInfo *TTI;
- AliasAnalysis *AA;
- OptimizationRemarkEmitter *ORE;
- DomTreeUpdater &DTU;
-
- // The below are shared state we want to have available when eliminating any
-  // calls in the function. These values should be populated by
- // createTailRecurseLoopHeader the first time we find a call we can eliminate.
- BasicBlock *HeaderBB = nullptr;
- SmallVector<PHINode *, 8> ArgumentPHIs;
- bool RemovableCallsMustBeMarkedTail = false;
-
- // PHI node to store our return value.
- PHINode *RetPN = nullptr;
-
- // i1 PHI node to track if we have a valid return value stored in RetPN.
- PHINode *RetKnownPN = nullptr;
-
-  // Vector of select instructions we inserted. These selects use RetKnownPN
- // to either propagate RetPN or select a new return value.
- SmallVector<SelectInst *, 8> RetSelects;
-
- // The below are shared state needed when performing accumulator recursion.
-  // These values should be populated by insertAccumulator the first time we
- // find an elimination that requires an accumulator.
-
- // PHI node to store our current accumulated value.
- PHINode *AccPN = nullptr;
-
- // The instruction doing the accumulating.
- Instruction *AccumulatorRecursionInstr = nullptr;
-
- TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
- AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
- DomTreeUpdater &DTU)
- : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
-
+ auto &State = Visited[SuccBB];
+ if (State < Escaped) {
+ State = Escaped;
+ if (State == ESCAPED)
+ WorklistEscaped.push_back(SuccBB);
+ else
+ WorklistUnescaped.push_back(SuccBB);
+ }
+ }
+
+ if (!WorklistEscaped.empty()) {
+ BB = WorklistEscaped.pop_back_val();
+ Escaped = ESCAPED;
+ } else {
+ BB = nullptr;
+ while (!WorklistUnescaped.empty()) {
+ auto *NextBB = WorklistUnescaped.pop_back_val();
+ if (Visited[NextBB] == UNESCAPED) {
+ BB = NextBB;
+ Escaped = UNESCAPED;
+ break;
+ }
+ }
+ }
+ } while (BB);
+
+ for (CallInst *CI : DeferredTails) {
+ if (Visited[CI->getParent()] != ESCAPED) {
+ // If the escape point was part way through the block, calls after the
+ // escape point wouldn't have been put into DeferredTails.
+ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+ CI->setTailCall();
+ Modified = true;
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
+ return Modified;
+}
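+
+// Illustrative sketch (not part of the pass itself): the split markTails()
+// makes, at the source level. Callee names are hypothetical.
+namespace tre_example {
+static int consumeValue(int X) { return X + 1; }        // never sees caller stack
+static int consumePointer(const int *P) { return *P; }  // may touch caller frame
+static int markTailsDemo(int N) {
+  int Local = N;
+  int A = consumeValue(N);        // no alloca reaches it: may be marked 'tail'
+  int B = consumePointer(&Local); // passes an alloca address: left unmarked
+  return A + B;
+}
+} // namespace tre_example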
+
+/// Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
+ // FIXME: We can move load/store/call/free instructions above the call if the
+ // call does not mod/ref the memory location being processed.
+ if (I->mayHaveSideEffects()) // This also handles volatile loads.
+ return false;
+
+ if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ // Loads may always be moved above calls without side effects.
+ if (CI->mayHaveSideEffects()) {
+ // Non-volatile loads may be moved above a call with side effects if it
+ // does not write to memory and the load provably won't trap.
+ // Writes to memory only matter if they may alias the pointer
+ // being loaded from.
+ const DataLayout &DL = L->getModule()->getDataLayout();
+ if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
+ !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
+ L->getAlign(), DL, L))
+ return false;
+ }
+ }
+
+ // Otherwise, if this is a side-effect free instruction, check to make sure
+ // that it does not use the return value of the call. If it doesn't use the
+ // return value of the call, it must only use things that are defined before
+ // the call, or movable instructions between the call and the instruction
+ // itself.
+ return !is_contained(I->operands(), CI);
+}
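+
+// Illustrative sketch (not part of the pass itself): the question
+// canMoveAboveCall() answers. Names are hypothetical.
+namespace tre_example {
+static int GlobalBias = 3;
+static int sumWithBias(int N) {
+  if (N == 0)
+    return 0;
+  // The load of GlobalBias sits between the recursive call and the return; it
+  // may only be hoisted above the call if alias analysis shows the callee
+  // cannot modify GlobalBias and the load is safe to execute unconditionally.
+  return sumWithBias(N - 1) + GlobalBias;
+}
+} // namespace tre_example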
+
+static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
+ if (!I->isAssociative() || !I->isCommutative())
+ return false;
+
+ assert(I->getNumOperands() == 2 &&
+ "Associative/commutative operations should have 2 args!");
+
+ // Exactly one operand should be the result of the call instruction.
+ if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+ (I->getOperand(0) != CI && I->getOperand(1) != CI))
+ return false;
+
+ // The only user of this instruction we allow is a single return instruction.
+ if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
+ return false;
+
+ return true;
+}
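+
+// Illustrative sketch (not part of the pass itself): which trailing operations
+// the check above accepts. Names are hypothetical.
+namespace tre_example {
+// 'add' is associative and commutative and its only user is the return, so it
+// qualifies as an accumulator recursion instruction.
+static int sumDown(int N) { return N == 0 ? 0 : N + sumDown(N - 1); }
+// 'sub' is neither associative nor commutative, so the check rejects it and
+// this recursion is not turned into a loop by the accumulator extension.
+static int diffDown(int N) { return N == 0 ? 0 : N - diffDown(N - 1); }
+} // namespace tre_example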
+
+static Instruction *firstNonDbg(BasicBlock::iterator I) {
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ return &*I;
+}
+
+namespace {
+class TailRecursionEliminator {
+ Function &F;
+ const TargetTransformInfo *TTI;
+ AliasAnalysis *AA;
+ OptimizationRemarkEmitter *ORE;
+ DomTreeUpdater &DTU;
+
+ // The below are shared state we want to have available when eliminating any
+  // calls in the function. These values should be populated by
+ // createTailRecurseLoopHeader the first time we find a call we can eliminate.
+ BasicBlock *HeaderBB = nullptr;
+ SmallVector<PHINode *, 8> ArgumentPHIs;
+ bool RemovableCallsMustBeMarkedTail = false;
+
+ // PHI node to store our return value.
+ PHINode *RetPN = nullptr;
+
+ // i1 PHI node to track if we have a valid return value stored in RetPN.
+ PHINode *RetKnownPN = nullptr;
+
+  // Vector of select instructions we inserted. These selects use RetKnownPN
+ // to either propagate RetPN or select a new return value.
+ SmallVector<SelectInst *, 8> RetSelects;
+
+ // The below are shared state needed when performing accumulator recursion.
+  // These values should be populated by insertAccumulator the first time we
+ // find an elimination that requires an accumulator.
+
+ // PHI node to store our current accumulated value.
+ PHINode *AccPN = nullptr;
+
+ // The instruction doing the accumulating.
+ Instruction *AccumulatorRecursionInstr = nullptr;
+
+ TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU)
+ : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
+
CallInst *findTRECandidate(BasicBlock *BB,
- bool CannotTailCallElimCallsMarkedTail);
-
- void createTailRecurseLoopHeader(CallInst *CI);
-
- void insertAccumulator(Instruction *AccRecInstr);
-
- bool eliminateCall(CallInst *CI);
-
+ bool CannotTailCallElimCallsMarkedTail);
+
+ void createTailRecurseLoopHeader(CallInst *CI);
+
+ void insertAccumulator(Instruction *AccRecInstr);
+
+ bool eliminateCall(CallInst *CI);
+
void cleanupAndFinalize();
-
+
bool processBlock(BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail);
-
-public:
- static bool eliminate(Function &F, const TargetTransformInfo *TTI,
- AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
- DomTreeUpdater &DTU);
-};
-} // namespace
-
-CallInst *TailRecursionEliminator::findTRECandidate(
+
+public:
+ static bool eliminate(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU);
+};
+} // namespace
+
+CallInst *TailRecursionEliminator::findTRECandidate(
BasicBlock *BB, bool CannotTailCallElimCallsMarkedTail) {
Instruction *TI = BB->getTerminator();
-
- if (&BB->front() == TI) // Make sure there is something before the terminator.
- return nullptr;
-
- // Scan backwards from the return, checking to see if there is a tail call in
- // this block. If so, set CI to it.
- CallInst *CI = nullptr;
- BasicBlock::iterator BBI(TI);
- while (true) {
- CI = dyn_cast<CallInst>(BBI);
- if (CI && CI->getCalledFunction() == &F)
- break;
-
- if (BBI == BB->begin())
- return nullptr; // Didn't find a potential tail call.
- --BBI;
- }
-
- // If this call is marked as a tail call, and if there are dynamic allocas in
- // the function, we cannot perform this optimization.
- if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
- return nullptr;
-
- // As a special case, detect code like this:
- // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
- // and disable this xform in this case, because the code generator will
- // lower the call to fabs into inline code.
- if (BB == &F.getEntryBlock() &&
- firstNonDbg(BB->front().getIterator()) == CI &&
- firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
- !TTI->isLoweredToCall(CI->getCalledFunction())) {
- // A single-block function with just a call and a return. Check that
- // the arguments match.
- auto I = CI->arg_begin(), E = CI->arg_end();
- Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
- for (; I != E && FI != FE; ++I, ++FI)
- if (*I != &*FI) break;
- if (I == E && FI == FE)
- return nullptr;
- }
-
- return CI;
-}
-
-void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
- HeaderBB = &F.getEntryBlock();
- BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
- NewEntry->takeName(HeaderBB);
- HeaderBB->setName("tailrecurse");
- BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
- BI->setDebugLoc(CI->getDebugLoc());
-
- // If this function has self recursive calls in the tail position where some
- // are marked tail and some are not, only transform one flavor or another.
- // We have to choose whether we move allocas in the entry block to the new
- // entry block or not, so we can't make a good choice for both. We make this
- // decision here based on whether the first call we found to remove is
- // marked tail.
- // NOTE: We could do slightly better here in the case that the function has
- // no entry block allocas.
- RemovableCallsMustBeMarkedTail = CI->isTailCall();
-
- // If this tail call is marked 'tail' and if there are any allocas in the
- // entry block, move them up to the new entry block.
- if (RemovableCallsMustBeMarkedTail)
- // Move all fixed sized allocas from HeaderBB to NewEntry.
- for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
- NEBI = NewEntry->begin();
- OEBI != E;)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
- if (isa<ConstantInt>(AI->getArraySize()))
- AI->moveBefore(&*NEBI);
-
- // Now that we have created a new block, which jumps to the entry
- // block, insert a PHI node for each argument of the function.
- // For now, we initialize each PHI to only have the real arguments
- // which are passed in.
- Instruction *InsertPos = &HeaderBB->front();
- for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
- PHINode *PN =
- PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos);
- I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
- PN->addIncoming(&*I, NewEntry);
- ArgumentPHIs.push_back(PN);
- }
-
-  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
- // nodes to track our return value. We initialize RetPN with undef and
- // RetKnownPN with false since we can't know our return value at function
- // entry.
- Type *RetType = F.getReturnType();
- if (!RetType->isVoidTy()) {
- Type *BoolType = Type::getInt1Ty(F.getContext());
- RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
- RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);
-
- RetPN->addIncoming(UndefValue::get(RetType), NewEntry);
- RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
- }
-
- // The entry block was changed from HeaderBB to NewEntry.
- // The forward DominatorTree needs to be recalculated when the EntryBB is
- // changed. In this corner-case we recalculate the entire tree.
- DTU.recalculate(*NewEntry->getParent());
-}
-
-void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
- assert(!AccPN && "Trying to insert multiple accumulators");
-
- AccumulatorRecursionInstr = AccRecInstr;
-
- // Start by inserting a new PHI node for the accumulator.
- pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
- AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
- "accumulator.tr", &HeaderBB->front());
-
- // Loop over all of the predecessors of the tail recursion block. For the
- // real entry into the function we seed the PHI with the identity constant for
- // the accumulation operation. For any other existing branches to this block
- // (due to other tail recursions eliminated) the accumulator is not modified.
- // Because we haven't added the branch in the current block to HeaderBB yet,
- // it will not show up as a predecessor.
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if (P == &F.getEntryBlock()) {
- Constant *Identity = ConstantExpr::getBinOpIdentity(
- AccRecInstr->getOpcode(), AccRecInstr->getType());
- AccPN->addIncoming(Identity, P);
- } else {
- AccPN->addIncoming(AccPN, P);
- }
- }
-
- ++NumAccumAdded;
-}
-
-bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
- ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
-
- // Ok, we found a potential tail call. We can currently only transform the
- // tail call if all of the instructions between the call and the return are
- // movable to above the call itself, leaving the call next to the return.
- // Check that this is the case now.
- Instruction *AccRecInstr = nullptr;
- BasicBlock::iterator BBI(CI);
- for (++BBI; &*BBI != Ret; ++BBI) {
- if (canMoveAboveCall(&*BBI, CI, AA))
- continue;
-
- // If we can't move the instruction above the call, it might be because it
- // is an associative and commutative operation that could be transformed
- // using accumulator recursion elimination. Check to see if this is the
- // case, and if so, remember which instruction accumulates for later.
- if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
- return false; // We cannot eliminate the tail recursion!
-
- // Yes, this is accumulator recursion. Remember which instruction
- // accumulates.
- AccRecInstr = &*BBI;
- }
-
- BasicBlock *BB = Ret->getParent();
-
- using namespace ore;
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
- << "transforming tail recursion into loop";
- });
-
- // OK! We can transform this tail call. If this is the first one found,
- // create the new entry block, allowing us to branch back to the old entry.
- if (!HeaderBB)
- createTailRecurseLoopHeader(CI);
-
- if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
- return false;
-
- // Ok, now that we know we have a pseudo-entry block WITH all of the
- // required PHI nodes, add entries into the PHI node for the actual
- // parameters passed into the tail-recursive call.
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
- ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
-
- if (AccRecInstr) {
- insertAccumulator(AccRecInstr);
-
- // Rewrite the accumulator recursion instruction so that it does not use
- // the result of the call anymore, instead, use the PHI node we just
- // inserted.
- AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
- }
-
- // Update our return value tracking
- if (RetPN) {
- if (Ret->getReturnValue() == CI || AccRecInstr) {
- // Defer selecting a return value
- RetPN->addIncoming(RetPN, BB);
- RetKnownPN->addIncoming(RetKnownPN, BB);
- } else {
- // We found a return value we want to use, insert a select instruction to
- // select it if we don't already know what our return value will be and
- // store the result in our return value PHI node.
- SelectInst *SI = SelectInst::Create(
- RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
- RetSelects.push_back(SI);
-
- RetPN->addIncoming(SI, BB);
- RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
- }
-
- if (AccPN)
- AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
- }
-
- // Now that all of the PHI nodes are in place, remove the call and
- // ret instructions, replacing them with an unconditional branch.
- BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
- NewBI->setDebugLoc(CI->getDebugLoc());
-
- BB->getInstList().erase(Ret); // Remove return.
- BB->getInstList().erase(CI); // Remove call.
- DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
- ++NumEliminated;
- return true;
-}
-
-void TailRecursionEliminator::cleanupAndFinalize() {
- // If we eliminated any tail recursions, it's possible that we inserted some
- // silly PHI nodes which just merge an initial value (the incoming operand)
- // with themselves. Check to see if we did and clean up our mess if so. This
- // occurs when a function passes an argument straight through to its tail
- // call.
- for (PHINode *PN : ArgumentPHIs) {
- // If the PHI Node is a dynamic constant, replace it with the value it is.
- if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
- PN->replaceAllUsesWith(PNV);
- PN->eraseFromParent();
- }
- }
-
- if (RetPN) {
- if (RetSelects.empty()) {
- // If we didn't insert any select instructions, then we know we didn't
- // store a return value and we can remove the PHI nodes we inserted.
- RetPN->dropAllReferences();
- RetPN->eraseFromParent();
-
- RetKnownPN->dropAllReferences();
- RetKnownPN->eraseFromParent();
-
- if (AccPN) {
- // We need to insert a copy of our accumulator instruction before any
- // return in the function, and return its result instead.
- Instruction *AccRecInstr = AccumulatorRecursionInstr;
- for (BasicBlock &BB : F) {
- ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!RI)
- continue;
-
- Instruction *AccRecInstrNew = AccRecInstr->clone();
- AccRecInstrNew->setName("accumulator.ret.tr");
- AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
- RI->getOperand(0));
- AccRecInstrNew->insertBefore(RI);
- RI->setOperand(0, AccRecInstrNew);
- }
- }
- } else {
- // We need to insert a select instruction before any return left in the
- // function to select our stored return value if we have one.
- for (BasicBlock &BB : F) {
- ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!RI)
- continue;
-
- SelectInst *SI = SelectInst::Create(
- RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
- RetSelects.push_back(SI);
- RI->setOperand(0, SI);
- }
-
- if (AccPN) {
- // We need to insert a copy of our accumulator instruction before any
- // of the selects we inserted, and select its result instead.
- Instruction *AccRecInstr = AccumulatorRecursionInstr;
- for (SelectInst *SI : RetSelects) {
- Instruction *AccRecInstrNew = AccRecInstr->clone();
- AccRecInstrNew->setName("accumulator.ret.tr");
- AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
- SI->getFalseValue());
- AccRecInstrNew->insertBefore(SI);
- SI->setFalseValue(AccRecInstrNew);
- }
- }
- }
- }
-}
-
+
+ if (&BB->front() == TI) // Make sure there is something before the terminator.
+ return nullptr;
+
+ // Scan backwards from the return, checking to see if there is a tail call in
+ // this block. If so, set CI to it.
+ CallInst *CI = nullptr;
+ BasicBlock::iterator BBI(TI);
+ while (true) {
+ CI = dyn_cast<CallInst>(BBI);
+ if (CI && CI->getCalledFunction() == &F)
+ break;
+
+ if (BBI == BB->begin())
+ return nullptr; // Didn't find a potential tail call.
+ --BBI;
+ }
+
+ // If this call is marked as a tail call, and if there are dynamic allocas in
+ // the function, we cannot perform this optimization.
+ if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+ return nullptr;
+
+ // As a special case, detect code like this:
+ // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+ // and disable this xform in this case, because the code generator will
+ // lower the call to fabs into inline code.
+ if (BB == &F.getEntryBlock() &&
+ firstNonDbg(BB->front().getIterator()) == CI &&
+ firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+ !TTI->isLoweredToCall(CI->getCalledFunction())) {
+ // A single-block function with just a call and a return. Check that
+ // the arguments match.
+ auto I = CI->arg_begin(), E = CI->arg_end();
+ Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
+ for (; I != E && FI != FE; ++I, ++FI)
+ if (*I != &*FI) break;
+ if (I == E && FI == FE)
+ return nullptr;
+ }
+
+ return CI;
+}
+
+void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
+ HeaderBB = &F.getEntryBlock();
+ BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
+ NewEntry->takeName(HeaderBB);
+ HeaderBB->setName("tailrecurse");
+ BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
+ BI->setDebugLoc(CI->getDebugLoc());
+
+ // If this function has self recursive calls in the tail position where some
+ // are marked tail and some are not, only transform one flavor or another.
+ // We have to choose whether we move allocas in the entry block to the new
+ // entry block or not, so we can't make a good choice for both. We make this
+ // decision here based on whether the first call we found to remove is
+ // marked tail.
+ // NOTE: We could do slightly better here in the case that the function has
+ // no entry block allocas.
+ RemovableCallsMustBeMarkedTail = CI->isTailCall();
+
+ // If this tail call is marked 'tail' and if there are any allocas in the
+ // entry block, move them up to the new entry block.
+ if (RemovableCallsMustBeMarkedTail)
+ // Move all fixed sized allocas from HeaderBB to NewEntry.
+ for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
+ NEBI = NewEntry->begin();
+ OEBI != E;)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ AI->moveBefore(&*NEBI);
+
+ // Now that we have created a new block, which jumps to the entry
+ // block, insert a PHI node for each argument of the function.
+ // For now, we initialize each PHI to only have the real arguments
+ // which are passed in.
+ Instruction *InsertPos = &HeaderBB->front();
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+ PHINode *PN =
+ PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos);
+ I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+ PN->addIncoming(&*I, NewEntry);
+ ArgumentPHIs.push_back(PN);
+ }
+
+  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
+ // nodes to track our return value. We initialize RetPN with undef and
+ // RetKnownPN with false since we can't know our return value at function
+ // entry.
+ Type *RetType = F.getReturnType();
+ if (!RetType->isVoidTy()) {
+ Type *BoolType = Type::getInt1Ty(F.getContext());
+ RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
+ RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);
+
+ RetPN->addIncoming(UndefValue::get(RetType), NewEntry);
+ RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
+ }
+
+ // The entry block was changed from HeaderBB to NewEntry.
+ // The forward DominatorTree needs to be recalculated when the EntryBB is
+ // changed. In this corner-case we recalculate the entire tree.
+ DTU.recalculate(*NewEntry->getParent());
+}
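+
+// Illustrative sketch (not part of the pass itself): a source-level picture of
+// the argument PHIs created above. Each formal parameter becomes a loop-carried
+// value that is either the incoming argument (from the new entry block) or the
+// operand of an eliminated recursive call. Names are hypothetical.
+namespace tre_example {
+static int sumHelper(int N, int Acc) {
+  // Roughly: tailrecurse: N.tr = phi(N, N - 1), Acc.tr = phi(Acc, Acc + N.tr)
+  for (;;) {
+    if (N == 0)
+      return Acc;
+    Acc = Acc + N; // value the Acc PHI receives along the loop edge
+    N = N - 1;     // value the N PHI receives along the loop edge
+  }
+}
+} // namespace tre_example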
+
+void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
+ assert(!AccPN && "Trying to insert multiple accumulators");
+
+ AccumulatorRecursionInstr = AccRecInstr;
+
+ // Start by inserting a new PHI node for the accumulator.
+ pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
+ AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
+ "accumulator.tr", &HeaderBB->front());
+
+ // Loop over all of the predecessors of the tail recursion block. For the
+ // real entry into the function we seed the PHI with the identity constant for
+ // the accumulation operation. For any other existing branches to this block
+ // (due to other tail recursions eliminated) the accumulator is not modified.
+ // Because we haven't added the branch in the current block to HeaderBB yet,
+ // it will not show up as a predecessor.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == &F.getEntryBlock()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(
+ AccRecInstr->getOpcode(), AccRecInstr->getType());
+ AccPN->addIncoming(Identity, P);
+ } else {
+ AccPN->addIncoming(AccPN, P);
+ }
+ }
+
+ ++NumAccumAdded;
+}
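+
+// Illustrative sketch (not part of the pass itself): the identity constant that
+// seeds the accumulator PHI depends on the accumulating opcode, mirroring
+// ConstantExpr::getBinOpIdentity above. Names are hypothetical.
+namespace tre_example {
+static int sumTo(int N) { return N == 0 ? 0 : N + sumTo(N - 1); }   // seed: 0
+static int prodTo(int N) { return N <= 1 ? 1 : N * prodTo(N - 1); } // seed: 1
+} // namespace tre_example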
+
+bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
+ ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
+
+ // Ok, we found a potential tail call. We can currently only transform the
+ // tail call if all of the instructions between the call and the return are
+ // movable to above the call itself, leaving the call next to the return.
+ // Check that this is the case now.
+ Instruction *AccRecInstr = nullptr;
+ BasicBlock::iterator BBI(CI);
+ for (++BBI; &*BBI != Ret; ++BBI) {
+ if (canMoveAboveCall(&*BBI, CI, AA))
+ continue;
+
+ // If we can't move the instruction above the call, it might be because it
+ // is an associative and commutative operation that could be transformed
+ // using accumulator recursion elimination. Check to see if this is the
+ // case, and if so, remember which instruction accumulates for later.
+ if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
+ return false; // We cannot eliminate the tail recursion!
+
+ // Yes, this is accumulator recursion. Remember which instruction
+ // accumulates.
+ AccRecInstr = &*BBI;
+ }
+
+ BasicBlock *BB = Ret->getParent();
+
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
+ << "transforming tail recursion into loop";
+ });
+
+ // OK! We can transform this tail call. If this is the first one found,
+ // create the new entry block, allowing us to branch back to the old entry.
+ if (!HeaderBB)
+ createTailRecurseLoopHeader(CI);
+
+ if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
+ return false;
+
+ // Ok, now that we know we have a pseudo-entry block WITH all of the
+ // required PHI nodes, add entries into the PHI node for the actual
+ // parameters passed into the tail-recursive call.
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+ if (AccRecInstr) {
+ insertAccumulator(AccRecInstr);
+
+ // Rewrite the accumulator recursion instruction so that it does not use
+ // the result of the call anymore, instead, use the PHI node we just
+ // inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ }
+
+ // Update our return value tracking
+ if (RetPN) {
+ if (Ret->getReturnValue() == CI || AccRecInstr) {
+ // Defer selecting a return value
+ RetPN->addIncoming(RetPN, BB);
+ RetKnownPN->addIncoming(RetKnownPN, BB);
+ } else {
+ // We found a return value we want to use, insert a select instruction to
+ // select it if we don't already know what our return value will be and
+ // store the result in our return value PHI node.
+ SelectInst *SI = SelectInst::Create(
+ RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
+ RetSelects.push_back(SI);
+
+ RetPN->addIncoming(SI, BB);
+ RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
+ }
+
+ if (AccPN)
+ AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
+ }
+
+ // Now that all of the PHI nodes are in place, remove the call and
+ // ret instructions, replacing them with an unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
+ NewBI->setDebugLoc(CI->getDebugLoc());
+
+ BB->getInstList().erase(Ret); // Remove return.
+ BB->getInstList().erase(CI); // Remove call.
+ DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
+ ++NumEliminated;
+ return true;
+}
+
+void TailRecursionEliminator::cleanupAndFinalize() {
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ for (PHINode *PN : ArgumentPHIs) {
+ // If the PHI Node is a dynamic constant, replace it with the value it is.
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+
+ if (RetPN) {
+ if (RetSelects.empty()) {
+ // If we didn't insert any select instructions, then we know we didn't
+ // store a return value and we can remove the PHI nodes we inserted.
+ RetPN->dropAllReferences();
+ RetPN->eraseFromParent();
+
+ RetKnownPN->dropAllReferences();
+ RetKnownPN->eraseFromParent();
+
+ if (AccPN) {
+ // We need to insert a copy of our accumulator instruction before any
+ // return in the function, and return its result instead.
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ for (BasicBlock &BB : F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI)
+ continue;
+
+ Instruction *AccRecInstrNew = AccRecInstr->clone();
+ AccRecInstrNew->setName("accumulator.ret.tr");
+ AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+ RI->getOperand(0));
+ AccRecInstrNew->insertBefore(RI);
+ RI->setOperand(0, AccRecInstrNew);
+ }
+ }
+ } else {
+ // We need to insert a select instruction before any return left in the
+ // function to select our stored return value if we have one.
+ for (BasicBlock &BB : F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI)
+ continue;
+
+ SelectInst *SI = SelectInst::Create(
+ RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
+ RetSelects.push_back(SI);
+ RI->setOperand(0, SI);
+ }
+
+ if (AccPN) {
+ // We need to insert a copy of our accumulator instruction before any
+ // of the selects we inserted, and select its result instead.
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ for (SelectInst *SI : RetSelects) {
+ Instruction *AccRecInstrNew = AccRecInstr->clone();
+ AccRecInstrNew->setName("accumulator.ret.tr");
+ AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+ SI->getFalseValue());
+ AccRecInstrNew->insertBefore(SI);
+ SI->setFalseValue(AccRecInstrNew);
+ }
+ }
+ }
+ }
+}
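+
+// Illustrative sketch (not part of the pass itself): the PHI cleanup above
+// fires for arguments that are passed straight through the recursive call,
+// such as 'Base' here; its PHI merges the incoming argument with itself and
+// SimplifyInstruction folds it away. Names are hypothetical.
+namespace tre_example {
+static int addNTimes(int N, int Base) {
+  return N == 0 ? Base : 1 + addNTimes(N - 1, Base);
+}
+} // namespace tre_example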
+
bool TailRecursionEliminator::processBlock(
BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail) {
Instruction *TI = BB.getTerminator();
@@ -791,110 +791,110 @@ bool TailRecursionEliminator::processBlock(
return false;
}
-bool TailRecursionEliminator::eliminate(Function &F,
- const TargetTransformInfo *TTI,
- AliasAnalysis *AA,
- OptimizationRemarkEmitter *ORE,
- DomTreeUpdater &DTU) {
- if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
- return false;
-
- bool MadeChange = false;
- bool AllCallsAreTailCalls = false;
- MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
- if (!AllCallsAreTailCalls)
- return MadeChange;
-
- // If this function is a varargs function, we won't be able to PHI the args
- // right, so don't even try to convert it...
- if (F.getFunctionType()->isVarArg())
- return MadeChange;
-
- // If false, we cannot perform TRE on tail calls marked with the 'tail'
- // attribute, because doing so would cause the stack size to increase (real
- // TRE would deallocate variable sized allocas, TRE doesn't).
- bool CanTRETailMarkedCall = canTRE(F);
-
+bool TailRecursionEliminator::eliminate(Function &F,
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU) {
+ if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+ return false;
+
+ bool MadeChange = false;
+ bool AllCallsAreTailCalls = false;
+ MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
+ if (!AllCallsAreTailCalls)
+ return MadeChange;
+
+ // If this function is a varargs function, we won't be able to PHI the args
+ // right, so don't even try to convert it...
+ if (F.getFunctionType()->isVarArg())
+ return MadeChange;
+
+ // If false, we cannot perform TRE on tail calls marked with the 'tail'
+ // attribute, because doing so would cause the stack size to increase (real
+ // TRE would deallocate variable sized allocas, TRE doesn't).
+ bool CanTRETailMarkedCall = canTRE(F);
+
// Change any tail recursive calls to loops.
- TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
-
+ TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
+
for (BasicBlock &BB : F)
MadeChange |= TRE.processBlock(BB, !CanTRETailMarkedCall);
-
- TRE.cleanupAndFinalize();
-
- return MadeChange;
-}
-
-namespace {
-struct TailCallElim : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- TailCallElim() : FunctionPass(ID) {
- initializeTailCallElimPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
-    // There is no noticeable performance difference here between Lazy and Eager
- // UpdateStrategy based on some test results. It is feasible to switch the
- // UpdateStrategy to Lazy if we find it profitable later.
- DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
-
- return TailRecursionEliminator::eliminate(
- F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
- }
-};
-}
-
-char TailCallElim::ID = 0;
-INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
- false, false)
-
-// Public interface to the TailCallElimination pass
-FunctionPass *llvm::createTailCallEliminationPass() {
- return new TailCallElim();
-}
-
-PreservedAnalyses TailCallElimPass::run(Function &F,
- FunctionAnalysisManager &AM) {
-
- TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
-  // There is no noticeable performance difference here between Lazy and Eager
- // UpdateStrategy based on some test results. It is feasible to switch the
- // UpdateStrategy to Lazy if we find it profitable later.
- DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
- bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);
-
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- return PA;
-}
+
+ TRE.cleanupAndFinalize();
+
+ return MadeChange;
+}
+
+namespace {
+struct TailCallElim : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ TailCallElim() : FunctionPass(ID) {
+ initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+    // There is no noticeable performance difference here between Lazy and Eager
+ // UpdateStrategy based on some test results. It is feasible to switch the
+ // UpdateStrategy to Lazy if we find it profitable later.
+ DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+
+ return TailRecursionEliminator::eliminate(
+ F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
+ }
+};
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+ return new TailCallElim();
+}
+
+PreservedAnalyses TailCallElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  // There is no noticeable performance difference here between Lazy and Eager
+ // UpdateStrategy based on some test results. It is feasible to switch the
+ // UpdateStrategy to Lazy if we find it profitable later.
+ DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+ bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ return PA;
+}
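+
+// Usage note (assuming a stock LLVM 12 build; not part of the upstream
+// sources): the pass is registered under the name "tailcallelim" and can be
+// exercised in isolation with, for example:
+//   opt -passes=tailcallelim input.ll -S   (new pass manager)
+//   opt -tailcallelim input.ll -S          (legacy pass manager)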
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp
index 464eb20d8f..80a7d3a43a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -1,150 +1,150 @@
-//===- LoopTransformWarning.cpp - ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Emit warnings if forced code transformations have not been performed.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "transform-warning"
-
-/// Emit warnings for forced (i.e. user-defined) loop transformations which have
-/// still not been performed.
-static void warnAboutLeftoverTransformations(Loop *L,
- OptimizationRemarkEmitter *ORE) {
- if (hasUnrollTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover unroll transformation\n");
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedUnrolling",
- L->getStartLoc(), L->getHeader())
- << "loop not unrolled: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled or "
- "specified as part of an unsupported transformation ordering");
- }
-
- if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover unroll-and-jam transformation\n");
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedUnrollAndJamming",
- L->getStartLoc(), L->getHeader())
- << "loop not unroll-and-jammed: the optimizer was unable to perform "
- "the requested transformation; the transformation might be disabled "
- "or specified as part of an unsupported transformation ordering");
- }
-
- if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
+//===- LoopTransformWarning.cpp - ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Emit warnings if forced code transformations have not been performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "transform-warning"
+
+/// Emit warnings for forced (i.e. user-defined) loop transformations which have
+/// still not been performed.
+static void warnAboutLeftoverTransformations(Loop *L,
+ OptimizationRemarkEmitter *ORE) {
+ if (hasUnrollTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover unroll transformation\n");
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedUnrolling",
+ L->getStartLoc(), L->getHeader())
+ << "loop not unrolled: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled or "
+ "specified as part of an unsupported transformation ordering");
+ }
+
+ if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover unroll-and-jam transformation\n");
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedUnrollAndJamming",
+ L->getStartLoc(), L->getHeader())
+ << "loop not unroll-and-jammed: the optimizer was unable to perform "
+ "the requested transformation; the transformation might be disabled "
+ "or specified as part of an unsupported transformation ordering");
+ }
+
+ if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
Optional<ElementCount> VectorizeWidth =
getOptionalElementCountLoopAttribute(L);
- Optional<int> InterleaveCount =
- getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
-
+ Optional<int> InterleaveCount =
+ getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+
if (!VectorizeWidth || VectorizeWidth->isVector())
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedVectorization",
- L->getStartLoc(), L->getHeader())
- << "loop not vectorized: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled "
- "or specified as part of an unsupported transformation ordering");
- else if (InterleaveCount.getValueOr(0) != 1)
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedInterleaving",
- L->getStartLoc(), L->getHeader())
- << "loop not interleaved: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled "
- "or specified as part of an unsupported transformation ordering");
- }
-
- if (hasDistributeTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover distribute transformation\n");
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedDistribution",
- L->getStartLoc(), L->getHeader())
- << "loop not distributed: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled or "
- "specified as part of an unsupported transformation ordering");
- }
-}
-
-static void warnAboutLeftoverTransformations(Function *F, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE) {
- for (auto *L : LI->getLoopsInPreorder())
- warnAboutLeftoverTransformations(L, ORE);
-}
-
-// New pass manager boilerplate
-PreservedAnalyses
-WarnMissedTransformationsPass::run(Function &F, FunctionAnalysisManager &AM) {
- // Do not warn about not applied transformations if optimizations are
- // disabled.
- if (F.hasOptNone())
- return PreservedAnalyses::all();
-
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
-
- warnAboutLeftoverTransformations(&F, &LI, &ORE);
-
- return PreservedAnalyses::all();
-}
-
-// Legacy pass manager boilerplate
-namespace {
-class WarnMissedTransformationsLegacy : public FunctionPass {
-public:
- static char ID;
-
- explicit WarnMissedTransformationsLegacy() : FunctionPass(ID) {
- initializeWarnMissedTransformationsLegacyPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-
- warnAboutLeftoverTransformations(&F, &LI, &ORE);
- return false;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
-
- AU.setPreservesAll();
- }
-};
-} // end anonymous namespace
-
-char WarnMissedTransformationsLegacy::ID = 0;
-
-INITIALIZE_PASS_BEGIN(WarnMissedTransformationsLegacy, "transform-warning",
- "Warn about non-applied transformations", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(WarnMissedTransformationsLegacy, "transform-warning",
- "Warn about non-applied transformations", false, false)
-
-Pass *llvm::createWarnMissedTransformationsPass() {
- return new WarnMissedTransformationsLegacy();
-}
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedVectorization",
+ L->getStartLoc(), L->getHeader())
+ << "loop not vectorized: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled "
+ "or specified as part of an unsupported transformation ordering");
+ else if (InterleaveCount.getValueOr(0) != 1)
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedInterleaving",
+ L->getStartLoc(), L->getHeader())
+ << "loop not interleaved: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled "
+ "or specified as part of an unsupported transformation ordering");
+ }
+
+ if (hasDistributeTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover distribute transformation\n");
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedDistribution",
+ L->getStartLoc(), L->getHeader())
+ << "loop not distributed: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled or "
+ "specified as part of an unsupported transformation ordering");
+ }
+}
+
+static void warnAboutLeftoverTransformations(Function *F, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE) {
+ for (auto *L : LI->getLoopsInPreorder())
+ warnAboutLeftoverTransformations(L, ORE);
+}
+
+// New pass manager boilerplate
+PreservedAnalyses
+WarnMissedTransformationsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ // Do not warn about not applied transformations if optimizations are
+ // disabled.
+ if (F.hasOptNone())
+ return PreservedAnalyses::all();
+
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+
+ warnAboutLeftoverTransformations(&F, &LI, &ORE);
+
+ return PreservedAnalyses::all();
+}
+
+// Legacy pass manager boilerplate
+namespace {
+class WarnMissedTransformationsLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ explicit WarnMissedTransformationsLegacy() : FunctionPass(ID) {
+ initializeWarnMissedTransformationsLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ warnAboutLeftoverTransformations(&F, &LI, &ORE);
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+
+ AU.setPreservesAll();
+ }
+};
+} // end anonymous namespace
+
+char WarnMissedTransformationsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(WarnMissedTransformationsLegacy, "transform-warning",
+ "Warn about non-applied transformations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(WarnMissedTransformationsLegacy, "transform-warning",
+ "Warn about non-applied transformations", false, false)
+
+Pass *llvm::createWarnMissedTransformationsPass() {
+ return new WarnMissedTransformationsLegacy();
+}
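The pass above only inspects loop metadata that earlier passes left behind, so the warning is easiest to see from source that forces a transformation the optimizer cannot honour. The function below is my own illustration, not part of the diff: the pragma attaches llvm.loop.vectorize metadata, and if the loop-carried dependence defeats the vectorizer, the leftover metadata makes transform-warning emit the "loop not vectorized" diagnostic (clang normally reports it under -Wpass-failed).

// Sketch only: forced vectorization that is expected to fail.
void scale(float *a, const float *b, int n) {
#pragma clang loop vectorize(enable) vectorize_width(8)
  for (int i = 1; i < n; ++i)
    a[i] = a[i - 1] * b[i];  // a[i] depends on a[i - 1], distance 1
}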
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make b/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make
index beb88625e7..75501ae81a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make
@@ -1,17 +1,17 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
@@ -20,95 +20,95 @@ PEERDIR(
contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine
contrib/libs/llvm12/lib/Transforms/InstCombine
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Scalar
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- ADCE.cpp
- AlignmentFromAssumptions.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ ADCE.cpp
+ AlignmentFromAssumptions.cpp
AnnotationRemarks.cpp
- BDCE.cpp
- CallSiteSplitting.cpp
- ConstantHoisting.cpp
+ BDCE.cpp
+ CallSiteSplitting.cpp
+ ConstantHoisting.cpp
ConstraintElimination.cpp
- CorrelatedValuePropagation.cpp
- DCE.cpp
- DeadStoreElimination.cpp
- DivRemPairs.cpp
- EarlyCSE.cpp
- FlattenCFGPass.cpp
- Float2Int.cpp
- GVN.cpp
- GVNHoist.cpp
- GVNSink.cpp
- GuardWidening.cpp
- IVUsersPrinter.cpp
- IndVarSimplify.cpp
- InductiveRangeCheckElimination.cpp
- InferAddressSpaces.cpp
- InstSimplifyPass.cpp
- JumpThreading.cpp
- LICM.cpp
- LoopAccessAnalysisPrinter.cpp
- LoopDataPrefetch.cpp
- LoopDeletion.cpp
- LoopDistribute.cpp
+ CorrelatedValuePropagation.cpp
+ DCE.cpp
+ DeadStoreElimination.cpp
+ DivRemPairs.cpp
+ EarlyCSE.cpp
+ FlattenCFGPass.cpp
+ Float2Int.cpp
+ GVN.cpp
+ GVNHoist.cpp
+ GVNSink.cpp
+ GuardWidening.cpp
+ IVUsersPrinter.cpp
+ IndVarSimplify.cpp
+ InductiveRangeCheckElimination.cpp
+ InferAddressSpaces.cpp
+ InstSimplifyPass.cpp
+ JumpThreading.cpp
+ LICM.cpp
+ LoopAccessAnalysisPrinter.cpp
+ LoopDataPrefetch.cpp
+ LoopDeletion.cpp
+ LoopDistribute.cpp
LoopFlatten.cpp
- LoopFuse.cpp
- LoopIdiomRecognize.cpp
- LoopInstSimplify.cpp
- LoopInterchange.cpp
- LoopLoadElimination.cpp
- LoopPassManager.cpp
- LoopPredication.cpp
- LoopRerollPass.cpp
- LoopRotation.cpp
- LoopSimplifyCFG.cpp
- LoopSink.cpp
- LoopStrengthReduce.cpp
- LoopUnrollAndJamPass.cpp
- LoopUnrollPass.cpp
- LoopUnswitch.cpp
- LoopVersioningLICM.cpp
- LowerAtomic.cpp
- LowerConstantIntrinsics.cpp
- LowerExpectIntrinsic.cpp
- LowerGuardIntrinsic.cpp
- LowerMatrixIntrinsics.cpp
- LowerWidenableCondition.cpp
- MakeGuardsExplicit.cpp
- MemCpyOptimizer.cpp
- MergeICmps.cpp
- MergedLoadStoreMotion.cpp
- NaryReassociate.cpp
- NewGVN.cpp
- PartiallyInlineLibCalls.cpp
- PlaceSafepoints.cpp
- Reassociate.cpp
- Reg2Mem.cpp
- RewriteStatepointsForGC.cpp
- SCCP.cpp
- SROA.cpp
- Scalar.cpp
+ LoopFuse.cpp
+ LoopIdiomRecognize.cpp
+ LoopInstSimplify.cpp
+ LoopInterchange.cpp
+ LoopLoadElimination.cpp
+ LoopPassManager.cpp
+ LoopPredication.cpp
+ LoopRerollPass.cpp
+ LoopRotation.cpp
+ LoopSimplifyCFG.cpp
+ LoopSink.cpp
+ LoopStrengthReduce.cpp
+ LoopUnrollAndJamPass.cpp
+ LoopUnrollPass.cpp
+ LoopUnswitch.cpp
+ LoopVersioningLICM.cpp
+ LowerAtomic.cpp
+ LowerConstantIntrinsics.cpp
+ LowerExpectIntrinsic.cpp
+ LowerGuardIntrinsic.cpp
+ LowerMatrixIntrinsics.cpp
+ LowerWidenableCondition.cpp
+ MakeGuardsExplicit.cpp
+ MemCpyOptimizer.cpp
+ MergeICmps.cpp
+ MergedLoadStoreMotion.cpp
+ NaryReassociate.cpp
+ NewGVN.cpp
+ PartiallyInlineLibCalls.cpp
+ PlaceSafepoints.cpp
+ Reassociate.cpp
+ Reg2Mem.cpp
+ RewriteStatepointsForGC.cpp
+ SCCP.cpp
+ SROA.cpp
+ Scalar.cpp
ScalarizeMaskedMemIntrin.cpp
- Scalarizer.cpp
- SeparateConstOffsetFromGEP.cpp
- SimpleLoopUnswitch.cpp
- SimplifyCFGPass.cpp
- Sink.cpp
- SpeculateAroundPHIs.cpp
- SpeculativeExecution.cpp
- StraightLineStrengthReduce.cpp
- StructurizeCFG.cpp
- TailRecursionElimination.cpp
- WarnMissedTransforms.cpp
-)
-
-END()
+ Scalarizer.cpp
+ SeparateConstOffsetFromGEP.cpp
+ SimpleLoopUnswitch.cpp
+ SimplifyCFGPass.cpp
+ Sink.cpp
+ SpeculateAroundPHIs.cpp
+ SpeculativeExecution.cpp
+ StraightLineStrengthReduce.cpp
+ StructurizeCFG.cpp
+ TailRecursionElimination.cpp
+ WarnMissedTransforms.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index 3692462855..ccdcf7cbce 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -1,243 +1,243 @@
-//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utility function to lower a printf call into a series of device
-// library calls on the AMDGPU target.
-//
-// WARNING: This file knows about certain library functions. It recognizes them
-// by name, and hardwires knowledge of their semantics.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/Analysis/ValueTracking.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-emit-printf"
-
-static bool isCString(const Value *Arg) {
- auto Ty = Arg->getType();
- auto PtrTy = dyn_cast<PointerType>(Ty);
- if (!PtrTy)
- return false;
-
- auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
- if (!IntTy)
- return false;
-
- return IntTy->getBitWidth() == 8;
-}
-
-static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
- auto Int64Ty = Builder.getInt64Ty();
- auto Ty = Arg->getType();
-
- if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
- switch (IntTy->getBitWidth()) {
- case 32:
- return Builder.CreateZExt(Arg, Int64Ty);
- case 64:
- return Arg;
- }
- }
-
- if (Ty->getTypeID() == Type::DoubleTyID) {
- return Builder.CreateBitCast(Arg, Int64Ty);
- }
-
- if (isa<PointerType>(Ty)) {
- return Builder.CreatePtrToInt(Arg, Int64Ty);
- }
-
- llvm_unreachable("unexpected type");
-}
-
-static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
- auto Int64Ty = Builder.getInt64Ty();
- auto M = Builder.GetInsertBlock()->getModule();
- auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty);
- return Builder.CreateCall(Fn, Version);
-}
-
-static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
- Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3,
- Value *Arg4, Value *Arg5, Value *Arg6,
- bool IsLast) {
- auto Int64Ty = Builder.getInt64Ty();
- auto Int32Ty = Builder.getInt32Ty();
- auto M = Builder.GetInsertBlock()->getModule();
- auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty,
- Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty,
- Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty);
- auto IsLastValue = Builder.getInt32(IsLast);
- auto NumArgsValue = Builder.getInt32(NumArgs);
- return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3,
- Arg4, Arg5, Arg6, IsLastValue});
-}
-
-static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool IsLast) {
- auto Arg0 = fitArgInto64Bits(Builder, Arg);
- auto Zero = Builder.getInt64(0);
- return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
- Zero, IsLast);
-}
-
-// The device library does not provide strlen, so we build our own loop
-// here. While we are at it, we also include the terminating null in the length.
-static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
- auto *Prev = Builder.GetInsertBlock();
- Module *M = Prev->getModule();
-
- auto CharZero = Builder.getInt8(0);
- auto One = Builder.getInt64(1);
- auto Zero = Builder.getInt64(0);
- auto Int64Ty = Builder.getInt64Ty();
-
- // The length is either zero for a null pointer, or the computed value for an
- // actual string. We need a join block for a phi that represents the final
- // value.
- //
- // Strictly speaking, the zero does not matter since
- // __ockl_printf_append_string_n ignores the length if the pointer is null.
- BasicBlock *Join = nullptr;
- if (Prev->getTerminator()) {
- Join = Prev->splitBasicBlock(Builder.GetInsertPoint(),
- "strlen.join");
- Prev->getTerminator()->eraseFromParent();
- } else {
- Join = BasicBlock::Create(M->getContext(), "strlen.join",
- Prev->getParent());
- }
- BasicBlock *While =
- BasicBlock::Create(M->getContext(), "strlen.while",
- Prev->getParent(), Join);
- BasicBlock *WhileDone = BasicBlock::Create(
- M->getContext(), "strlen.while.done",
- Prev->getParent(), Join);
-
- // Emit an early return for when the pointer is null.
- Builder.SetInsertPoint(Prev);
- auto CmpNull =
- Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
- BranchInst::Create(Join, While, CmpNull, Prev);
-
- // Entry to the while loop.
- Builder.SetInsertPoint(While);
-
- auto PtrPhi = Builder.CreatePHI(Str->getType(), 2);
- PtrPhi->addIncoming(Str, Prev);
- auto PtrNext = Builder.CreateGEP(PtrPhi, One);
- PtrPhi->addIncoming(PtrNext, While);
-
- // Condition for the while loop.
- auto Data = Builder.CreateLoad(PtrPhi);
- auto Cmp = Builder.CreateICmpEQ(Data, CharZero);
- Builder.CreateCondBr(Cmp, WhileDone, While);
-
- // Add one to the computed length.
- Builder.SetInsertPoint(WhileDone, WhileDone->begin());
- auto Begin = Builder.CreatePtrToInt(Str, Int64Ty);
- auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty);
- auto Len = Builder.CreateSub(End, Begin);
- Len = Builder.CreateAdd(Len, One);
-
- // Final join.
- BranchInst::Create(Join, WhileDone);
- Builder.SetInsertPoint(Join, Join->begin());
- auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
- LenPhi->addIncoming(Len, WhileDone);
- LenPhi->addIncoming(Zero, Prev);
-
- return LenPhi;
-}
-
-static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
- Value *Length, bool isLast) {
- auto Int64Ty = Builder.getInt64Ty();
- auto CharPtrTy = Builder.getInt8PtrTy();
- auto Int32Ty = Builder.getInt32Ty();
- auto M = Builder.GetInsertBlock()->getModule();
- auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty,
- Int64Ty, CharPtrTy, Int64Ty, Int32Ty);
- auto IsLastInt32 = Builder.getInt32(isLast);
- return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32});
-}
-
-static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool IsLast) {
- auto Length = getStrlenWithNull(Builder, Arg);
- return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
-}
-
-static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool SpecIsCString, bool IsLast) {
- if (SpecIsCString && isCString(Arg)) {
- return appendString(Builder, Desc, Arg, IsLast);
- }
- // If the format specifies a string but the argument is not, the frontend will
- // have printed a warning. We just rely on undefined behaviour and send the
- // argument anyway.
- return appendArg(Builder, Desc, Arg, IsLast);
-}
-
-// Scan the format string to locate all specifiers, and mark the ones that
-// specify a string, i.e, the "%s" specifier with optional '*' characters.
-static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) {
- StringRef Str;
- if (!getConstantStringInfo(Fmt, Str) || Str.empty())
- return;
-
- static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
- size_t SpecPos = 0;
- // Skip the first argument, the format string.
- unsigned ArgIdx = 1;
-
- while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) {
- if (Str[SpecPos + 1] == '%') {
- SpecPos += 2;
- continue;
- }
- auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
- if (SpecEnd == StringRef::npos)
- return;
- auto Spec = Str.slice(SpecPos, SpecEnd + 1);
- ArgIdx += Spec.count('*');
- if (Str[SpecEnd] == 's') {
- BV.set(ArgIdx);
- }
- SpecPos = SpecEnd + 1;
- ++ArgIdx;
- }
-}
-
-Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder,
- ArrayRef<Value *> Args) {
- auto NumOps = Args.size();
- assert(NumOps >= 1);
-
- auto Fmt = Args[0];
- SparseBitVector<8> SpecIsCString;
- locateCStrings(SpecIsCString, Fmt);
-
- auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0));
- Desc = appendString(Builder, Desc, Fmt, NumOps == 1);
-
- // FIXME: This invokes hostcall once for each argument. We can pack up to
- // seven scalar printf arguments in a single hostcall. See the signature of
- // callAppendArgs().
- for (unsigned int i = 1; i != NumOps; ++i) {
- bool IsLast = i == NumOps - 1;
- bool IsCString = SpecIsCString.test(i);
- Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
- }
-
- return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
-}
+//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility function to lower a printf call into a series of device
+// library calls on the AMDGPU target.
+//
+// WARNING: This file knows about certain library functions. It recognizes them
+// by name, and hardwires knowledge of their semantics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/ValueTracking.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-emit-printf"
+
+static bool isCString(const Value *Arg) {
+ auto Ty = Arg->getType();
+ auto PtrTy = dyn_cast<PointerType>(Ty);
+ if (!PtrTy)
+ return false;
+
+ auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
+ if (!IntTy)
+ return false;
+
+ return IntTy->getBitWidth() == 8;
+}
+
+static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Ty = Arg->getType();
+
+ if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
+ switch (IntTy->getBitWidth()) {
+ case 32:
+ return Builder.CreateZExt(Arg, Int64Ty);
+ case 64:
+ return Arg;
+ }
+ }
+
+ if (Ty->getTypeID() == Type::DoubleTyID) {
+ return Builder.CreateBitCast(Arg, Int64Ty);
+ }
+
+ if (isa<PointerType>(Ty)) {
+ return Builder.CreatePtrToInt(Arg, Int64Ty);
+ }
+
+ llvm_unreachable("unexpected type");
+}
+
+static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty);
+ return Builder.CreateCall(Fn, Version);
+}
+
+static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
+ Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3,
+ Value *Arg4, Value *Arg5, Value *Arg6,
+ bool IsLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty,
+ Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty,
+ Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty);
+ auto IsLastValue = Builder.getInt32(IsLast);
+ auto NumArgsValue = Builder.getInt32(NumArgs);
+ return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3,
+ Arg4, Arg5, Arg6, IsLastValue});
+}
+
+static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Arg0 = fitArgInto64Bits(Builder, Arg);
+ auto Zero = Builder.getInt64(0);
+ return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
+ Zero, IsLast);
+}
+
+// The device library does not provide strlen, so we build our own loop
+// here. While we are at it, we also include the terminating null in the length.
+static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
+ auto *Prev = Builder.GetInsertBlock();
+ Module *M = Prev->getModule();
+
+ auto CharZero = Builder.getInt8(0);
+ auto One = Builder.getInt64(1);
+ auto Zero = Builder.getInt64(0);
+ auto Int64Ty = Builder.getInt64Ty();
+
+ // The length is either zero for a null pointer, or the computed value for an
+ // actual string. We need a join block for a phi that represents the final
+ // value.
+ //
+ // Strictly speaking, the zero does not matter since
+ // __ockl_printf_append_string_n ignores the length if the pointer is null.
+ BasicBlock *Join = nullptr;
+ if (Prev->getTerminator()) {
+ Join = Prev->splitBasicBlock(Builder.GetInsertPoint(),
+ "strlen.join");
+ Prev->getTerminator()->eraseFromParent();
+ } else {
+ Join = BasicBlock::Create(M->getContext(), "strlen.join",
+ Prev->getParent());
+ }
+ BasicBlock *While =
+ BasicBlock::Create(M->getContext(), "strlen.while",
+ Prev->getParent(), Join);
+ BasicBlock *WhileDone = BasicBlock::Create(
+ M->getContext(), "strlen.while.done",
+ Prev->getParent(), Join);
+
+ // Emit an early return for when the pointer is null.
+ Builder.SetInsertPoint(Prev);
+ auto CmpNull =
+ Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
+ BranchInst::Create(Join, While, CmpNull, Prev);
+
+ // Entry to the while loop.
+ Builder.SetInsertPoint(While);
+
+ auto PtrPhi = Builder.CreatePHI(Str->getType(), 2);
+ PtrPhi->addIncoming(Str, Prev);
+ auto PtrNext = Builder.CreateGEP(PtrPhi, One);
+ PtrPhi->addIncoming(PtrNext, While);
+
+ // Condition for the while loop.
+ auto Data = Builder.CreateLoad(PtrPhi);
+ auto Cmp = Builder.CreateICmpEQ(Data, CharZero);
+ Builder.CreateCondBr(Cmp, WhileDone, While);
+
+ // Add one to the computed length.
+ Builder.SetInsertPoint(WhileDone, WhileDone->begin());
+ auto Begin = Builder.CreatePtrToInt(Str, Int64Ty);
+ auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty);
+ auto Len = Builder.CreateSub(End, Begin);
+ Len = Builder.CreateAdd(Len, One);
+
+ // Final join.
+ BranchInst::Create(Join, WhileDone);
+ Builder.SetInsertPoint(Join, Join->begin());
+ auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
+ LenPhi->addIncoming(Len, WhileDone);
+ LenPhi->addIncoming(Zero, Prev);
+
+ return LenPhi;
+}
+
+static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
+ Value *Length, bool isLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto CharPtrTy = Builder.getInt8PtrTy();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty,
+ Int64Ty, CharPtrTy, Int64Ty, Int32Ty);
+ auto IsLastInt32 = Builder.getInt32(isLast);
+ return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32});
+}
+
+static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Length = getStrlenWithNull(Builder, Arg);
+ return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
+}
+
+static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool SpecIsCString, bool IsLast) {
+ if (SpecIsCString && isCString(Arg)) {
+ return appendString(Builder, Desc, Arg, IsLast);
+ }
+ // If the format specifies a string but the argument is not, the frontend will
+ // have printed a warning. We just rely on undefined behaviour and send the
+ // argument anyway.
+ return appendArg(Builder, Desc, Arg, IsLast);
+}
+
+// Scan the format string to locate all specifiers, and mark the ones that
+// specify a string, i.e., the "%s" specifier with optional '*' characters.
+static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) {
+ StringRef Str;
+ if (!getConstantStringInfo(Fmt, Str) || Str.empty())
+ return;
+
+ static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
+ size_t SpecPos = 0;
+ // Skip the first argument, the format string.
+ unsigned ArgIdx = 1;
+
+ while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) {
+ if (Str[SpecPos + 1] == '%') {
+ SpecPos += 2;
+ continue;
+ }
+ auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
+ if (SpecEnd == StringRef::npos)
+ return;
+ auto Spec = Str.slice(SpecPos, SpecEnd + 1);
+ ArgIdx += Spec.count('*');
+ if (Str[SpecEnd] == 's') {
+ BV.set(ArgIdx);
+ }
+ SpecPos = SpecEnd + 1;
+ ++ArgIdx;
+ }
+}
+
+Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder,
+ ArrayRef<Value *> Args) {
+ auto NumOps = Args.size();
+ assert(NumOps >= 1);
+
+ auto Fmt = Args[0];
+ SparseBitVector<8> SpecIsCString;
+ locateCStrings(SpecIsCString, Fmt);
+
+ auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0));
+ Desc = appendString(Builder, Desc, Fmt, NumOps == 1);
+
+ // FIXME: This invokes hostcall once for each argument. We can pack up to
+ // seven scalar printf arguments in a single hostcall. See the signature of
+ // callAppendArgs().
+ for (unsigned int i = 1; i != NumOps; ++i) {
+ bool IsLast = i == NumOps - 1;
+ bool IsCString = SpecIsCString.test(i);
+ Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
+ }
+
+ return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
+}
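Taken together, the helpers above turn a printf call into a straight-line chain of device-library calls. The pseudo-C below is my own illustration of the shape of the lowered code for a two-argument format string; the __ockl_* names and the length-includes-null convention come from this file, but the expansion itself is a sketch rather than literal output.

// Conceptual expansion (sketch) of: printf("x=%d s=%s\n", x, s);
//
//   uint64_t desc = __ockl_printf_begin(0);
//   desc = __ockl_printf_append_string_n(desc, fmt, strlen(fmt) + 1, /*IsLast=*/0);
//   desc = __ockl_printf_append_args(desc, /*NumArgs=*/1, (uint64_t)x,
//                                    0, 0, 0, 0, 0, 0, /*IsLast=*/0);
//   desc = __ockl_printf_append_string_n(desc, s, strlen(s) + 1, /*IsLast=*/1);
//   return (int)desc;
//
// locateCStrings() marks the second argument because of the "%s" specifier,
// so it is routed through appendString() and getStrlenWithNull() rather than
// appendArg(), and the final descriptor is truncated back to the i32 result.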
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp
index 0834298936..0191229732 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -1,152 +1,152 @@
-//===-- ASanStackFrameLayout.cpp - helper for AddressSanitizer ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Definition of ComputeASanStackFrameLayout (see ASanStackFrameLayout.h).
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-
-namespace llvm {
-
-// We sort the stack variables by alignment (largest first) to minimize
-// unnecessary large gaps due to alignment.
-// It is tempting to also sort variables by size so that larger variables
-// have larger redzones at both ends. But reordering will make report analysis
-// harder, especially when temporary unnamed variables are present.
-// So, until we can provide more information (type, line number, etc)
-// for the stack variables we avoid reordering them too much.
-static inline bool CompareVars(const ASanStackVariableDescription &a,
- const ASanStackVariableDescription &b) {
- return a.Alignment > b.Alignment;
-}
-
-// We also force minimal alignment for all vars to kMinAlignment so that vars
-// with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
-static const size_t kMinAlignment = 16;
-
-// We want to add a full redzone after every variable.
-// The larger the variable Size the larger is the redzone.
-// The resulting frame size is a multiple of Alignment.
-static size_t VarAndRedzoneSize(size_t Size, size_t Granularity,
- size_t Alignment) {
- size_t Res = 0;
- if (Size <= 4) Res = 16;
- else if (Size <= 16) Res = 32;
- else if (Size <= 128) Res = Size + 32;
- else if (Size <= 512) Res = Size + 64;
- else if (Size <= 4096) Res = Size + 128;
- else Res = Size + 256;
- return alignTo(std::max(Res, 2 * Granularity), Alignment);
-}
-
-ASanStackFrameLayout
-ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
- size_t Granularity, size_t MinHeaderSize) {
- assert(Granularity >= 8 && Granularity <= 64 &&
- (Granularity & (Granularity - 1)) == 0);
- assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 &&
- MinHeaderSize >= Granularity);
- const size_t NumVars = Vars.size();
- assert(NumVars > 0);
- for (size_t i = 0; i < NumVars; i++)
- Vars[i].Alignment = std::max(Vars[i].Alignment, kMinAlignment);
-
- llvm::stable_sort(Vars, CompareVars);
-
- ASanStackFrameLayout Layout;
- Layout.Granularity = Granularity;
- Layout.FrameAlignment = std::max(Granularity, Vars[0].Alignment);
- size_t Offset = std::max(std::max(MinHeaderSize, Granularity),
- Vars[0].Alignment);
- assert((Offset % Granularity) == 0);
- for (size_t i = 0; i < NumVars; i++) {
- bool IsLast = i == NumVars - 1;
- size_t Alignment = std::max(Granularity, Vars[i].Alignment);
- (void)Alignment; // Used only in asserts.
- size_t Size = Vars[i].Size;
- assert((Alignment & (Alignment - 1)) == 0);
- assert(Layout.FrameAlignment >= Alignment);
- assert((Offset % Alignment) == 0);
- assert(Size > 0);
- size_t NextAlignment = IsLast ? Granularity
- : std::max(Granularity, Vars[i + 1].Alignment);
- size_t SizeWithRedzone = VarAndRedzoneSize(Size, Granularity,
- NextAlignment);
- Vars[i].Offset = Offset;
- Offset += SizeWithRedzone;
- }
- if (Offset % MinHeaderSize) {
- Offset += MinHeaderSize - (Offset % MinHeaderSize);
- }
- Layout.FrameSize = Offset;
- assert((Layout.FrameSize % MinHeaderSize) == 0);
- return Layout;
-}
-
-SmallString<64> ComputeASanStackFrameDescription(
- const SmallVectorImpl<ASanStackVariableDescription> &Vars) {
- SmallString<2048> StackDescriptionStorage;
- raw_svector_ostream StackDescription(StackDescriptionStorage);
- StackDescription << Vars.size();
-
- for (const auto &Var : Vars) {
- std::string Name = Var.Name;
- if (Var.Line) {
- Name += ":";
- Name += to_string(Var.Line);
- }
- StackDescription << " " << Var.Offset << " " << Var.Size << " "
- << Name.size() << " " << Name;
- }
- return StackDescription.str();
-}
-
-SmallVector<uint8_t, 64>
-GetShadowBytes(const SmallVectorImpl<ASanStackVariableDescription> &Vars,
- const ASanStackFrameLayout &Layout) {
- assert(Vars.size() > 0);
- SmallVector<uint8_t, 64> SB;
- SB.clear();
- const size_t Granularity = Layout.Granularity;
- SB.resize(Vars[0].Offset / Granularity, kAsanStackLeftRedzoneMagic);
- for (const auto &Var : Vars) {
- SB.resize(Var.Offset / Granularity, kAsanStackMidRedzoneMagic);
-
- SB.resize(SB.size() + Var.Size / Granularity, 0);
- if (Var.Size % Granularity)
- SB.push_back(Var.Size % Granularity);
- }
- SB.resize(Layout.FrameSize / Granularity, kAsanStackRightRedzoneMagic);
- return SB;
-}
-
-SmallVector<uint8_t, 64> GetShadowBytesAfterScope(
- const SmallVectorImpl<ASanStackVariableDescription> &Vars,
- const ASanStackFrameLayout &Layout) {
- SmallVector<uint8_t, 64> SB = GetShadowBytes(Vars, Layout);
- const size_t Granularity = Layout.Granularity;
-
- for (const auto &Var : Vars) {
- assert(Var.LifetimeSize <= Var.Size);
- const size_t LifetimeShadowSize =
- (Var.LifetimeSize + Granularity - 1) / Granularity;
- const size_t Offset = Var.Offset / Granularity;
- std::fill(SB.begin() + Offset, SB.begin() + Offset + LifetimeShadowSize,
- kAsanStackUseAfterScopeMagic);
- }
-
- return SB;
-}
-
-} // llvm namespace
+//===-- ASanStackFrameLayout.cpp - helper for AddressSanitizer ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of ComputeASanStackFrameLayout (see ASanStackFrameLayout.h).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+namespace llvm {
+
+// We sort the stack variables by alignment (largest first) to minimize
+// unnecessary large gaps due to alignment.
+// It is tempting to also sort variables by size so that larger variables
+// have larger redzones at both ends. But reordering will make report analysis
+// harder, especially when temporary unnamed variables are present.
+// So, until we can provide more information (type, line number, etc)
+// for the stack variables we avoid reordering them too much.
+static inline bool CompareVars(const ASanStackVariableDescription &a,
+ const ASanStackVariableDescription &b) {
+ return a.Alignment > b.Alignment;
+}
+
+// We also force minimal alignment for all vars to kMinAlignment so that vars
+// with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
+static const size_t kMinAlignment = 16;
+
+// We want to add a full redzone after every variable.
+// The larger the variable Size the larger is the redzone.
+// The resulting frame size is a multiple of Alignment.
+static size_t VarAndRedzoneSize(size_t Size, size_t Granularity,
+ size_t Alignment) {
+ size_t Res = 0;
+ if (Size <= 4) Res = 16;
+ else if (Size <= 16) Res = 32;
+ else if (Size <= 128) Res = Size + 32;
+ else if (Size <= 512) Res = Size + 64;
+ else if (Size <= 4096) Res = Size + 128;
+ else Res = Size + 256;
+ return alignTo(std::max(Res, 2 * Granularity), Alignment);
+}
+
+ASanStackFrameLayout
+ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ size_t Granularity, size_t MinHeaderSize) {
+ assert(Granularity >= 8 && Granularity <= 64 &&
+ (Granularity & (Granularity - 1)) == 0);
+ assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 &&
+ MinHeaderSize >= Granularity);
+ const size_t NumVars = Vars.size();
+ assert(NumVars > 0);
+ for (size_t i = 0; i < NumVars; i++)
+ Vars[i].Alignment = std::max(Vars[i].Alignment, kMinAlignment);
+
+ llvm::stable_sort(Vars, CompareVars);
+
+ ASanStackFrameLayout Layout;
+ Layout.Granularity = Granularity;
+ Layout.FrameAlignment = std::max(Granularity, Vars[0].Alignment);
+ size_t Offset = std::max(std::max(MinHeaderSize, Granularity),
+ Vars[0].Alignment);
+ assert((Offset % Granularity) == 0);
+ for (size_t i = 0; i < NumVars; i++) {
+ bool IsLast = i == NumVars - 1;
+ size_t Alignment = std::max(Granularity, Vars[i].Alignment);
+ (void)Alignment; // Used only in asserts.
+ size_t Size = Vars[i].Size;
+ assert((Alignment & (Alignment - 1)) == 0);
+ assert(Layout.FrameAlignment >= Alignment);
+ assert((Offset % Alignment) == 0);
+ assert(Size > 0);
+ size_t NextAlignment = IsLast ? Granularity
+ : std::max(Granularity, Vars[i + 1].Alignment);
+ size_t SizeWithRedzone = VarAndRedzoneSize(Size, Granularity,
+ NextAlignment);
+ Vars[i].Offset = Offset;
+ Offset += SizeWithRedzone;
+ }
+ if (Offset % MinHeaderSize) {
+ Offset += MinHeaderSize - (Offset % MinHeaderSize);
+ }
+ Layout.FrameSize = Offset;
+ assert((Layout.FrameSize % MinHeaderSize) == 0);
+ return Layout;
+}
+
+SmallString<64> ComputeASanStackFrameDescription(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars) {
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ StackDescription << Vars.size();
+
+ for (const auto &Var : Vars) {
+ std::string Name = Var.Name;
+ if (Var.Line) {
+ Name += ":";
+ Name += to_string(Var.Line);
+ }
+ StackDescription << " " << Var.Offset << " " << Var.Size << " "
+ << Name.size() << " " << Name;
+ }
+ return StackDescription.str();
+}
+
+SmallVector<uint8_t, 64>
+GetShadowBytes(const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ assert(Vars.size() > 0);
+ SmallVector<uint8_t, 64> SB;
+ SB.clear();
+ const size_t Granularity = Layout.Granularity;
+ SB.resize(Vars[0].Offset / Granularity, kAsanStackLeftRedzoneMagic);
+ for (const auto &Var : Vars) {
+ SB.resize(Var.Offset / Granularity, kAsanStackMidRedzoneMagic);
+
+ SB.resize(SB.size() + Var.Size / Granularity, 0);
+ if (Var.Size % Granularity)
+ SB.push_back(Var.Size % Granularity);
+ }
+ SB.resize(Layout.FrameSize / Granularity, kAsanStackRightRedzoneMagic);
+ return SB;
+}
+
+SmallVector<uint8_t, 64> GetShadowBytesAfterScope(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ SmallVector<uint8_t, 64> SB = GetShadowBytes(Vars, Layout);
+ const size_t Granularity = Layout.Granularity;
+
+ for (const auto &Var : Vars) {
+ assert(Var.LifetimeSize <= Var.Size);
+ const size_t LifetimeShadowSize =
+ (Var.LifetimeSize + Granularity - 1) / Granularity;
+ const size_t Offset = Var.Offset / Granularity;
+ std::fill(SB.begin() + Offset, SB.begin() + Offset + LifetimeShadowSize,
+ kAsanStackUseAfterScopeMagic);
+ }
+
+ return SB;
+}
+
+} // llvm namespace
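To make the redzone sizing above concrete, here is a hand-worked run of ComputeASanStackFrameLayout. The parameters Granularity = 8 and MinHeaderSize = 16 are chosen only to satisfy the asserts in the function and are not taken from any particular AddressSanitizer configuration.

// Worked sketch for two variables a (Size = 4) and b (Size = 20):
//   kMinAlignment raises both alignments to 16, so FrameAlignment = 16.
//   Offset starts at max(max(MinHeaderSize, Granularity), Vars[0].Alignment) = 16.
//   a: Offset = 16, VarAndRedzoneSize(4, 8, /*NextAlignment=*/16) = 16, Offset becomes 32.
//   b: Offset = 32, VarAndRedzoneSize(20, 8, /*NextAlignment=*/8) = 56, Offset becomes 88.
//   88 is not a multiple of MinHeaderSize, so FrameSize is rounded up to 96.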
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp
index e2ad63143f..0908b361a4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -1,277 +1,277 @@
-//===- AddDiscriminators.cpp - Insert DWARF path discriminators -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file adds DWARF discriminators to the IR. Path discriminators are
-// used to decide what CFG path was taken inside sub-graphs whose instructions
-// share the same line and column number information.
-//
-// The main user of this is the sample profiler. Instruction samples are
-// mapped to line number information. Since a single line may be spread
-// out over several basic blocks, discriminators add more precise location
-// for the samples.
-//
-// For example,
-//
-// 1 #define ASSERT(P)
-// 2 if (!(P))
-// 3 abort()
-// ...
-// 100 while (true) {
-// 101 ASSERT (sum < 0);
-// 102 ...
-// 130 }
-//
-// when converted to IR, this snippet looks something like:
-//
-// while.body: ; preds = %entry, %if.end
-// %0 = load i32* %sum, align 4, !dbg !15
-// %cmp = icmp slt i32 %0, 0, !dbg !15
-// br i1 %cmp, label %if.end, label %if.then, !dbg !15
-//
-// if.then: ; preds = %while.body
-// call void @abort(), !dbg !15
-// br label %if.end, !dbg !15
-//
-// Notice that all the instructions in blocks 'while.body' and 'if.then'
-// have exactly the same debug information. When this program is sampled
-// at runtime, the profiler will assume that all these instructions are
-// equally frequent. This, in turn, will consider the edge while.body->if.then
-// to be frequently taken (which is incorrect).
-//
-// By adding a discriminator value to the instructions in block 'if.then',
-// we can distinguish instructions at line 101 with discriminator 0 from
-// the instructions at line 101 with discriminator 1.
-//
-// For more details about DWARF discriminators, please visit
-// http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/AddDiscriminators.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "add-discriminators"
-
-// Command line option to disable discriminator generation even in the
-// presence of debug information. This is only needed when debugging
-// debug info generation issues.
-static cl::opt<bool> NoDiscriminators(
- "no-discriminators", cl::init(false),
- cl::desc("Disable generation of discriminator information."));
-
-namespace {
-
-// The legacy pass of AddDiscriminators.
-struct AddDiscriminatorsLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-
- AddDiscriminatorsLegacyPass() : FunctionPass(ID) {
- initializeAddDiscriminatorsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-
-} // end anonymous namespace
-
-char AddDiscriminatorsLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AddDiscriminatorsLegacyPass, "add-discriminators",
- "Add DWARF path discriminators", false, false)
-INITIALIZE_PASS_END(AddDiscriminatorsLegacyPass, "add-discriminators",
- "Add DWARF path discriminators", false, false)
-
-// Create the legacy AddDiscriminatorsPass.
-FunctionPass *llvm::createAddDiscriminatorsPass() {
- return new AddDiscriminatorsLegacyPass();
-}
-
-static bool shouldHaveDiscriminator(const Instruction *I) {
- return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
-}
-
-/// Assign DWARF discriminators.
-///
-/// To assign discriminators, we examine the boundaries of every
-/// basic block and its successors. Suppose there is a basic block B1
-/// with successor B2. The last instruction I1 in B1 and the first
-/// instruction I2 in B2 are located at the same file and line number.
-/// This situation is illustrated in the following code snippet:
-///
-/// if (i < 10) x = i;
-///
-/// entry:
-/// br i1 %cmp, label %if.then, label %if.end, !dbg !10
-/// if.then:
-/// %1 = load i32* %i.addr, align 4, !dbg !10
-/// store i32 %1, i32* %x, align 4, !dbg !10
-/// br label %if.end, !dbg !10
-/// if.end:
-/// ret void, !dbg !12
-///
-/// Notice how the branch instruction in block 'entry' and all the
-/// instructions in block 'if.then' have the exact same debug location
-/// information (!dbg !10).
-///
-/// To distinguish instructions in block 'entry' from instructions in
-/// block 'if.then', we generate a new lexical block for all the
-/// instruction in block 'if.then' that share the same file and line
-/// location with the last instruction of block 'entry'.
-///
-/// This new lexical block will have the same location information as
-/// the previous one, but with a new DWARF discriminator value.
-///
-/// One of the main uses of this discriminator value is in runtime
-/// sample profilers. It allows the profiler to distinguish instructions
-/// at location !dbg !10 that execute on different basic blocks. This is
-/// important because while the predicate 'if (x < 10)' may have been
-/// executed millions of times, the assignment 'x = i' may have only
-/// executed a handful of times (meaning that the entry->if.then edge is
-/// seldom taken).
-///
-/// If we did not have discriminator information, the profiler would
-/// assign the same weight to both blocks 'entry' and 'if.then', which
-/// in turn will make it conclude that the entry->if.then edge is very
-/// hot.
-///
-/// To decide where to create new discriminator values, this function
-/// traverses the CFG and examines instruction at basic block boundaries.
-/// If the last instruction I1 of a block B1 is at the same file and line
-/// location as instruction I2 of successor B2, then it creates a new
-/// lexical block for I2 and all the instruction in B2 that share the same
-/// file and line location as I2. This new lexical block will have a
-/// different discriminator number than I1.
-static bool addDiscriminators(Function &F) {
- // If the function has debug information, but the user has disabled
- // discriminators, do nothing.
- // Simlarly, if the function has no debug info, do nothing.
- if (NoDiscriminators || !F.getSubprogram())
- return false;
-
- bool Changed = false;
-
- using Location = std::pair<StringRef, unsigned>;
- using BBSet = DenseSet<const BasicBlock *>;
- using LocationBBMap = DenseMap<Location, BBSet>;
- using LocationDiscriminatorMap = DenseMap<Location, unsigned>;
- using LocationSet = DenseSet<Location>;
-
- LocationBBMap LBM;
- LocationDiscriminatorMap LDM;
-
- // Traverse all instructions in the function. If the source line location
- // of the instruction appears in other basic block, assign a new
- // discriminator for this instruction.
- for (BasicBlock &B : F) {
- for (auto &I : B.getInstList()) {
- // Not all intrinsic calls should have a discriminator.
- // We want to avoid a non-deterministic assignment of discriminators at
- // different debug levels. We still allow discriminators on memory
- // intrinsic calls because those can be early expanded by SROA into
- // pairs of loads and stores, and the expanded load/store instructions
- // should have a valid discriminator.
- if (!shouldHaveDiscriminator(&I))
- continue;
- const DILocation *DIL = I.getDebugLoc();
- if (!DIL)
- continue;
- Location L = std::make_pair(DIL->getFilename(), DIL->getLine());
- auto &BBMap = LBM[L];
- auto R = BBMap.insert(&B);
- if (BBMap.size() == 1)
- continue;
- // If we could insert more than one block with the same line+file, a
- // discriminator is needed to distinguish both instructions.
- // Only the lowest 7 bits are used to represent a discriminator to fit
- // it in 1 byte ULEB128 representation.
- unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
- auto NewDIL = DIL->cloneWithBaseDiscriminator(Discriminator);
- if (!NewDIL) {
- LLVM_DEBUG(dbgs() << "Could not encode discriminator: "
- << DIL->getFilename() << ":" << DIL->getLine() << ":"
- << DIL->getColumn() << ":" << Discriminator << " "
- << I << "\n");
- } else {
- I.setDebugLoc(NewDIL.getValue());
- LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
- << DIL->getColumn() << ":" << Discriminator << " " << I
- << "\n");
- }
- Changed = true;
- }
- }
-
- // Traverse all instructions and assign new discriminators to call
- // instructions with the same lineno that are in the same basic block.
- // Sample base profile needs to distinguish different function calls within
- // a same source line for correct profile annotation.
- for (BasicBlock &B : F) {
- LocationSet CallLocations;
- for (auto &I : B.getInstList()) {
- // We bypass intrinsic calls for the following two reasons:
- // 1) We want to avoid a non-deterministic assignment of
- // discriminators.
- // 2) We want to minimize the number of base discriminators used.
- if (!isa<InvokeInst>(I) && (!isa<CallInst>(I) || isa<IntrinsicInst>(I)))
- continue;
-
- DILocation *CurrentDIL = I.getDebugLoc();
- if (!CurrentDIL)
- continue;
- Location L =
- std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
- if (!CallLocations.insert(L).second) {
- unsigned Discriminator = ++LDM[L];
- auto NewDIL = CurrentDIL->cloneWithBaseDiscriminator(Discriminator);
- if (!NewDIL) {
- LLVM_DEBUG(dbgs()
- << "Could not encode discriminator: "
- << CurrentDIL->getFilename() << ":"
- << CurrentDIL->getLine() << ":" << CurrentDIL->getColumn()
- << ":" << Discriminator << " " << I << "\n");
- } else {
- I.setDebugLoc(NewDIL.getValue());
- Changed = true;
- }
- }
- }
- }
- return Changed;
-}
-
-bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
- return addDiscriminators(F);
-}
-
-PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (!addDiscriminators(F))
- return PreservedAnalyses::all();
-
- // FIXME: should be all()
- return PreservedAnalyses::none();
-}
+//===- AddDiscriminators.cpp - Insert DWARF path discriminators -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file adds DWARF discriminators to the IR. Path discriminators are
+// used to decide what CFG path was taken inside sub-graphs whose instructions
+// share the same line and column number information.
+//
+// The main user of this is the sample profiler. Instruction samples are
+// mapped to line number information. Since a single line may be spread
+// out over several basic blocks, discriminators add more precise location
+// for the samples.
+//
+// For example,
+//
+// 1 #define ASSERT(P)
+// 2 if (!(P))
+// 3 abort()
+// ...
+// 100 while (true) {
+// 101 ASSERT (sum < 0);
+// 102 ...
+// 130 }
+//
+// when converted to IR, this snippet looks something like:
+//
+// while.body: ; preds = %entry, %if.end
+// %0 = load i32* %sum, align 4, !dbg !15
+// %cmp = icmp slt i32 %0, 0, !dbg !15
+// br i1 %cmp, label %if.end, label %if.then, !dbg !15
+//
+// if.then: ; preds = %while.body
+// call void @abort(), !dbg !15
+// br label %if.end, !dbg !15
+//
+// Notice that all the instructions in blocks 'while.body' and 'if.then'
+// have exactly the same debug information. When this program is sampled
+// at runtime, the profiler will assume that all these instructions are
+// equally frequent. This, in turn, will consider the edge while.body->if.then
+// to be frequently taken (which is incorrect).
+//
+// By adding a discriminator value to the instructions in block 'if.then',
+// we can distinguish instructions at line 101 with discriminator 0 from
+// the instructions at line 101 with discriminator 1.
+//
+// For more details about DWARF discriminators, please visit
+// http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AddDiscriminators.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "add-discriminators"
+
+// Command line option to disable discriminator generation even in the
+// presence of debug information. This is only needed when debugging
+// debug info generation issues.
+static cl::opt<bool> NoDiscriminators(
+ "no-discriminators", cl::init(false),
+ cl::desc("Disable generation of discriminator information."));
+
+namespace {
+
+// The legacy pass of AddDiscriminators.
+struct AddDiscriminatorsLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ AddDiscriminatorsLegacyPass() : FunctionPass(ID) {
+ initializeAddDiscriminatorsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char AddDiscriminatorsLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AddDiscriminatorsLegacyPass, "add-discriminators",
+ "Add DWARF path discriminators", false, false)
+INITIALIZE_PASS_END(AddDiscriminatorsLegacyPass, "add-discriminators",
+ "Add DWARF path discriminators", false, false)
+
+// Create the legacy AddDiscriminatorsPass.
+FunctionPass *llvm::createAddDiscriminatorsPass() {
+ return new AddDiscriminatorsLegacyPass();
+}
+
+static bool shouldHaveDiscriminator(const Instruction *I) {
+ return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
+}
+
+/// Assign DWARF discriminators.
+///
+/// To assign discriminators, we examine the boundaries of every
+/// basic block and its successors. Suppose there is a basic block B1
+/// with successor B2. The last instruction I1 in B1 and the first
+/// instruction I2 in B2 are located at the same file and line number.
+/// This situation is illustrated in the following code snippet:
+///
+/// if (i < 10) x = i;
+///
+/// entry:
+/// br i1 %cmp, label %if.then, label %if.end, !dbg !10
+/// if.then:
+/// %1 = load i32* %i.addr, align 4, !dbg !10
+/// store i32 %1, i32* %x, align 4, !dbg !10
+/// br label %if.end, !dbg !10
+/// if.end:
+/// ret void, !dbg !12
+///
+/// Notice how the branch instruction in block 'entry' and all the
+/// instructions in block 'if.then' have the exact same debug location
+/// information (!dbg !10).
+///
+/// To distinguish instructions in block 'entry' from instructions in
+/// block 'if.then', we generate a new lexical block for all the
+/// instructions in block 'if.then' that share the same file and line
+/// location with the last instruction of block 'entry'.
+///
+/// This new lexical block will have the same location information as
+/// the previous one, but with a new DWARF discriminator value.
+///
+/// One of the main uses of this discriminator value is in runtime
+/// sample profilers. It allows the profiler to distinguish instructions
+/// at location !dbg !10 that execute on different basic blocks. This is
+/// important because while the predicate 'if (x < 10)' may have been
+/// executed millions of times, the assignment 'x = i' may have only
+/// executed a handful of times (meaning that the entry->if.then edge is
+/// seldom taken).
+///
+/// If we did not have discriminator information, the profiler would
+/// assign the same weight to both blocks 'entry' and 'if.then', which
+/// in turn will make it conclude that the entry->if.then edge is very
+/// hot.
+///
+/// To decide where to create new discriminator values, this function
+/// traverses the CFG and examines instructions at basic block boundaries.
+/// If the last instruction I1 of a block B1 is at the same file and line
+/// location as instruction I2 of successor B2, then it creates a new
+/// lexical block for I2 and all the instructions in B2 that share the same
+/// file and line location as I2. This new lexical block will have a
+/// different discriminator number than I1.
+static bool addDiscriminators(Function &F) {
+ // If the function has debug information, but the user has disabled
+ // discriminators, do nothing.
+ // Similarly, if the function has no debug info, do nothing.
+ if (NoDiscriminators || !F.getSubprogram())
+ return false;
+
+ bool Changed = false;
+
+ using Location = std::pair<StringRef, unsigned>;
+ using BBSet = DenseSet<const BasicBlock *>;
+ using LocationBBMap = DenseMap<Location, BBSet>;
+ using LocationDiscriminatorMap = DenseMap<Location, unsigned>;
+ using LocationSet = DenseSet<Location>;
+
+ LocationBBMap LBM;
+ LocationDiscriminatorMap LDM;
+
+ // Traverse all instructions in the function. If the source line location
+ // of the instruction appears in another basic block, assign a new
+ // discriminator for this instruction.
+ for (BasicBlock &B : F) {
+ for (auto &I : B.getInstList()) {
+ // Not all intrinsic calls should have a discriminator.
+ // We want to avoid a non-deterministic assignment of discriminators at
+ // different debug levels. We still allow discriminators on memory
+ // intrinsic calls because those can be early expanded by SROA into
+ // pairs of loads and stores, and the expanded load/store instructions
+ // should have a valid discriminator.
+ if (!shouldHaveDiscriminator(&I))
+ continue;
+ const DILocation *DIL = I.getDebugLoc();
+ if (!DIL)
+ continue;
+ Location L = std::make_pair(DIL->getFilename(), DIL->getLine());
+ auto &BBMap = LBM[L];
+ auto R = BBMap.insert(&B);
+ if (BBMap.size() == 1)
+ continue;
+ // If more than one block has been inserted for the same line+file, a
+ // discriminator is needed to distinguish the instructions.
+ // Only the lowest 7 bits are used to represent a discriminator to fit
+ // it in 1 byte ULEB128 representation.
+ unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
+ auto NewDIL = DIL->cloneWithBaseDiscriminator(Discriminator);
+ if (!NewDIL) {
+ LLVM_DEBUG(dbgs() << "Could not encode discriminator: "
+ << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " "
+ << I << "\n");
+ } else {
+ I.setDebugLoc(NewDIL.getValue());
+ LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " " << I
+ << "\n");
+ }
+ Changed = true;
+ }
+ }
+
+ // Traverse all instructions and assign new discriminators to call
+ // instructions with the same lineno that are in the same basic block.
+ // Sample-based profiles need to distinguish different function calls within
+ // the same source line for correct profile annotation.
+ for (BasicBlock &B : F) {
+ LocationSet CallLocations;
+ for (auto &I : B.getInstList()) {
+ // We bypass intrinsic calls for the following two reasons:
+ // 1) We want to avoid a non-deterministic assignment of
+ // discriminators.
+ // 2) We want to minimize the number of base discriminators used.
+ if (!isa<InvokeInst>(I) && (!isa<CallInst>(I) || isa<IntrinsicInst>(I)))
+ continue;
+
+ DILocation *CurrentDIL = I.getDebugLoc();
+ if (!CurrentDIL)
+ continue;
+ Location L =
+ std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
+ if (!CallLocations.insert(L).second) {
+ unsigned Discriminator = ++LDM[L];
+ auto NewDIL = CurrentDIL->cloneWithBaseDiscriminator(Discriminator);
+ if (!NewDIL) {
+ LLVM_DEBUG(dbgs()
+ << "Could not encode discriminator: "
+ << CurrentDIL->getFilename() << ":"
+ << CurrentDIL->getLine() << ":" << CurrentDIL->getColumn()
+ << ":" << Discriminator << " " << I << "\n");
+ } else {
+ I.setDebugLoc(NewDIL.getValue());
+ Changed = true;
+ }
+ }
+ }
+ }
+ return Changed;
+}
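+
+// Illustrative sketch (not part of the pass): once addDiscriminators has run,
+// a consumer such as the sample profile loader can tell same-line
+// instructions apart by their base discriminator. This is a hedged example;
+// it assumes the caller has llvm/IR/InstIterator.h available.
+//
+//   for (Instruction &I : instructions(F))
+//     if (const DILocation *Loc = I.getDebugLoc())
+//       dbgs() << Loc->getLine() << ":" << Loc->getBaseDiscriminator() << "\n";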
+
+bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
+ return addDiscriminators(F);
+}
+
+PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!addDiscriminators(F))
+ return PreservedAnalyses::all();
+
+ // FIXME: should be all()
+ return PreservedAnalyses::none();
+}
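+
+// Hedged usage sketch (illustrative, not from this file): with the new pass
+// manager the pass can be added to a function pipeline directly; "FAM" below
+// is assumed to be an already-configured FunctionAnalysisManager.
+//
+//   FunctionPassManager FPM;
+//   FPM.addPass(AddDiscriminatorsPass());
+//   FPM.run(F, FAM);
+//
+// The same pass is also exposed to opt as "add-discriminators".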
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp
index a9d283aeeb..3daff3b443 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp
@@ -1,619 +1,619 @@
-//===- AssumeBundleBuilder.cpp - tools to preserve informations -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "assume-builder"
-
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-cl::opt<bool> ShouldPreserveAllAttributes(
- "assume-preserve-all", cl::init(false), cl::Hidden,
- cl::desc("enable preservation of all attrbitues. even those that are "
- "unlikely to be usefull"));
-
-cl::opt<bool> EnableKnowledgeRetention(
- "enable-knowledge-retention", cl::init(false), cl::Hidden,
- cl::desc(
- "enable preservation of attributes throughout code transformation"));
-
-STATISTIC(NumAssumeBuilt, "Number of assume built by the assume builder");
-STATISTIC(NumBundlesInAssumes, "Total number of Bundles in the assume built");
-STATISTIC(NumAssumesMerged,
- "Number of assume merged by the assume simplify pass");
-STATISTIC(NumAssumesRemoved,
- "Number of assume removed by the assume simplify pass");
-
-DEBUG_COUNTER(BuildAssumeCounter, "assume-builder-counter",
- "Controls which assumes gets created");
-
-namespace {
-
-bool isUsefullToPreserve(Attribute::AttrKind Kind) {
- switch (Kind) {
- case Attribute::NonNull:
+//===- AssumeBundleBuilder.cpp - tools to preserve informations -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "assume-builder"
+
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+cl::opt<bool> ShouldPreserveAllAttributes(
+ "assume-preserve-all", cl::init(false), cl::Hidden,
+ cl::desc("enable preservation of all attrbitues. even those that are "
+ "unlikely to be usefull"));
+
+cl::opt<bool> EnableKnowledgeRetention(
+ "enable-knowledge-retention", cl::init(false), cl::Hidden,
+ cl::desc(
+ "enable preservation of attributes throughout code transformation"));
+
+STATISTIC(NumAssumeBuilt, "Number of assume built by the assume builder");
+STATISTIC(NumBundlesInAssumes, "Total number of Bundles in the assume built");
+STATISTIC(NumAssumesMerged,
+ "Number of assume merged by the assume simplify pass");
+STATISTIC(NumAssumesRemoved,
+ "Number of assume removed by the assume simplify pass");
+
+DEBUG_COUNTER(BuildAssumeCounter, "assume-builder-counter",
+ "Controls which assumes gets created");
+
+namespace {
+
+bool isUsefullToPreserve(Attribute::AttrKind Kind) {
+ switch (Kind) {
+ case Attribute::NonNull:
case Attribute::NoUndef:
- case Attribute::Alignment:
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull:
- case Attribute::Cold:
- return true;
- default:
- return false;
- }
-}
-
-/// This function will try to transform the given knowledge into a more
-/// canonical one. The canonical knowledge may be the given one.
-RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, Module *M) {
- switch (RK.AttrKind) {
- default:
- return RK;
- case Attribute::NonNull:
+ case Attribute::Alignment:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::Cold:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// This function will try to transform the given knowledge into a more
+/// canonical one. The canonical knowledge may be the given one.
+RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, Module *M) {
+ switch (RK.AttrKind) {
+ default:
+ return RK;
+ case Attribute::NonNull:
RK.WasOn = getUnderlyingObject(RK.WasOn);
- return RK;
- case Attribute::Alignment: {
- Value *V = RK.WasOn->stripInBoundsOffsets([&](const Value *Strip) {
- if (auto *GEP = dyn_cast<GEPOperator>(Strip))
- RK.ArgValue =
- MinAlign(RK.ArgValue,
- GEP->getMaxPreservedAlignment(M->getDataLayout()).value());
- });
- RK.WasOn = V;
- return RK;
- }
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull: {
- int64_t Offset = 0;
- Value *V = GetPointerBaseWithConstantOffset(
- RK.WasOn, Offset, M->getDataLayout(), /*AllowNonInBounds*/ false);
- if (Offset < 0)
- return RK;
- RK.ArgValue = RK.ArgValue + Offset;
- RK.WasOn = V;
- }
- }
- return RK;
-}
-
-/// This class contains all knowledge that has been gathered while building an
-/// llvm.assume and the functions to manipulate it.
-struct AssumeBuilderState {
- Module *M;
-
- using MapKey = std::pair<Value *, Attribute::AttrKind>;
- SmallMapVector<MapKey, unsigned, 8> AssumedKnowledgeMap;
- Instruction *InstBeingRemoved = nullptr;
- AssumptionCache* AC = nullptr;
- DominatorTree* DT = nullptr;
-
- AssumeBuilderState(Module *M, Instruction *I = nullptr,
- AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr)
- : M(M), InstBeingRemoved(I), AC(AC), DT(DT) {}
-
- bool tryToPreserveWithoutAddingAssume(RetainedKnowledge RK) {
- if (!InstBeingRemoved || !RK.WasOn)
- return false;
- bool HasBeenPreserved = false;
- Use* ToUpdate = nullptr;
- getKnowledgeForValue(
- RK.WasOn, {RK.AttrKind}, AC,
- [&](RetainedKnowledge RKOther, Instruction *Assume,
- const CallInst::BundleOpInfo *Bundle) {
- if (!isValidAssumeForContext(Assume, InstBeingRemoved, DT))
- return false;
- if (RKOther.ArgValue >= RK.ArgValue) {
- HasBeenPreserved = true;
- return true;
- } else if (isValidAssumeForContext(InstBeingRemoved, Assume,
- DT)) {
- HasBeenPreserved = true;
- IntrinsicInst *Intr = cast<IntrinsicInst>(Assume);
- ToUpdate = &Intr->op_begin()[Bundle->Begin + ABA_Argument];
- return true;
- }
- return false;
- });
- if (ToUpdate)
- ToUpdate->set(
- ConstantInt::get(Type::getInt64Ty(M->getContext()), RK.ArgValue));
- return HasBeenPreserved;
- }
-
- bool isKnowledgeWorthPreserving(RetainedKnowledge RK) {
- if (!RK)
- return false;
- if (!RK.WasOn)
- return true;
- if (RK.WasOn->getType()->isPointerTy()) {
+ return RK;
+ case Attribute::Alignment: {
+ Value *V = RK.WasOn->stripInBoundsOffsets([&](const Value *Strip) {
+ if (auto *GEP = dyn_cast<GEPOperator>(Strip))
+ RK.ArgValue =
+ MinAlign(RK.ArgValue,
+ GEP->getMaxPreservedAlignment(M->getDataLayout()).value());
+ });
+ RK.WasOn = V;
+ return RK;
+ }
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull: {
+ int64_t Offset = 0;
+ Value *V = GetPointerBaseWithConstantOffset(
+ RK.WasOn, Offset, M->getDataLayout(), /*AllowNonInBounds*/ false);
+ if (Offset < 0)
+ return RK;
+ RK.ArgValue = RK.ArgValue + Offset;
+ RK.WasOn = V;
+ }
+ }
+ return RK;
+}
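+
+// Worked example (illustrative comment, not from the original source): given
+// the hypothetical knowledge {Dereferenceable, 8, %q} where
+//   %q = getelementptr inbounds i8, i8* %p, i64 4
+// the code above folds the constant offset into the argument and re-attaches
+// the knowledge to the base pointer, yielding {Dereferenceable, 12, %p}.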
+
+/// This class contains all knowledge that has been gathered while building an
+/// llvm.assume and the functions to manipulate it.
+struct AssumeBuilderState {
+ Module *M;
+
+ using MapKey = std::pair<Value *, Attribute::AttrKind>;
+ SmallMapVector<MapKey, unsigned, 8> AssumedKnowledgeMap;
+ Instruction *InstBeingRemoved = nullptr;
+ AssumptionCache* AC = nullptr;
+ DominatorTree* DT = nullptr;
+
+ AssumeBuilderState(Module *M, Instruction *I = nullptr,
+ AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr)
+ : M(M), InstBeingRemoved(I), AC(AC), DT(DT) {}
+
+ bool tryToPreserveWithoutAddingAssume(RetainedKnowledge RK) {
+ if (!InstBeingRemoved || !RK.WasOn)
+ return false;
+ bool HasBeenPreserved = false;
+ Use* ToUpdate = nullptr;
+ getKnowledgeForValue(
+ RK.WasOn, {RK.AttrKind}, AC,
+ [&](RetainedKnowledge RKOther, Instruction *Assume,
+ const CallInst::BundleOpInfo *Bundle) {
+ if (!isValidAssumeForContext(Assume, InstBeingRemoved, DT))
+ return false;
+ if (RKOther.ArgValue >= RK.ArgValue) {
+ HasBeenPreserved = true;
+ return true;
+ } else if (isValidAssumeForContext(InstBeingRemoved, Assume,
+ DT)) {
+ HasBeenPreserved = true;
+ IntrinsicInst *Intr = cast<IntrinsicInst>(Assume);
+ ToUpdate = &Intr->op_begin()[Bundle->Begin + ABA_Argument];
+ return true;
+ }
+ return false;
+ });
+ if (ToUpdate)
+ ToUpdate->set(
+ ConstantInt::get(Type::getInt64Ty(M->getContext()), RK.ArgValue));
+ return HasBeenPreserved;
+ }
+
+ bool isKnowledgeWorthPreserving(RetainedKnowledge RK) {
+ if (!RK)
+ return false;
+ if (!RK.WasOn)
+ return true;
+ if (RK.WasOn->getType()->isPointerTy()) {
Value *UnderlyingPtr = getUnderlyingObject(RK.WasOn);
- if (isa<AllocaInst>(UnderlyingPtr) || isa<GlobalValue>(UnderlyingPtr))
- return false;
- }
- if (auto *Arg = dyn_cast<Argument>(RK.WasOn)) {
- if (Arg->hasAttribute(RK.AttrKind) &&
- (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
- Arg->getAttribute(RK.AttrKind).getValueAsInt() >= RK.ArgValue))
- return false;
- return true;
- }
- if (auto *Inst = dyn_cast<Instruction>(RK.WasOn))
- if (wouldInstructionBeTriviallyDead(Inst)) {
- if (RK.WasOn->use_empty())
- return false;
- Use *SingleUse = RK.WasOn->getSingleUndroppableUse();
- if (SingleUse && SingleUse->getUser() == InstBeingRemoved)
- return false;
- }
- return true;
- }
-
- void addKnowledge(RetainedKnowledge RK) {
- RK = canonicalizedKnowledge(RK, M);
-
- if (!isKnowledgeWorthPreserving(RK))
- return;
-
- if (tryToPreserveWithoutAddingAssume(RK))
- return;
- MapKey Key{RK.WasOn, RK.AttrKind};
- auto Lookup = AssumedKnowledgeMap.find(Key);
- if (Lookup == AssumedKnowledgeMap.end()) {
- AssumedKnowledgeMap[Key] = RK.ArgValue;
- return;
- }
- assert(((Lookup->second == 0 && RK.ArgValue == 0) ||
- (Lookup->second != 0 && RK.ArgValue != 0)) &&
- "inconsistent argument value");
-
- /// This is only desirable because for all attributes taking an argument
- /// higher is better.
- Lookup->second = std::max(Lookup->second, RK.ArgValue);
- }
-
- void addAttribute(Attribute Attr, Value *WasOn) {
- if (Attr.isTypeAttribute() || Attr.isStringAttribute() ||
- (!ShouldPreserveAllAttributes &&
- !isUsefullToPreserve(Attr.getKindAsEnum())))
- return;
- unsigned AttrArg = 0;
- if (Attr.isIntAttribute())
- AttrArg = Attr.getValueAsInt();
- addKnowledge({Attr.getKindAsEnum(), AttrArg, WasOn});
- }
-
- void addCall(const CallBase *Call) {
- auto addAttrList = [&](AttributeList AttrList) {
- for (unsigned Idx = AttributeList::FirstArgIndex;
- Idx < AttrList.getNumAttrSets(); Idx++)
- for (Attribute Attr : AttrList.getAttributes(Idx))
- addAttribute(Attr, Call->getArgOperand(Idx - 1));
- for (Attribute Attr : AttrList.getFnAttributes())
- addAttribute(Attr, nullptr);
- };
- addAttrList(Call->getAttributes());
- if (Function *Fn = Call->getCalledFunction())
- addAttrList(Fn->getAttributes());
- }
-
- IntrinsicInst *build() {
- if (AssumedKnowledgeMap.empty())
- return nullptr;
- if (!DebugCounter::shouldExecute(BuildAssumeCounter))
- return nullptr;
- Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume);
- LLVMContext &C = M->getContext();
- SmallVector<OperandBundleDef, 8> OpBundle;
- for (auto &MapElem : AssumedKnowledgeMap) {
- SmallVector<Value *, 2> Args;
- if (MapElem.first.first)
- Args.push_back(MapElem.first.first);
-
- /// This is only valid because, for all attributes that currently exist, a
- /// value of 0 is useless and should not be preserved.
- if (MapElem.second)
- Args.push_back(ConstantInt::get(Type::getInt64Ty(M->getContext()),
- MapElem.second));
- OpBundle.push_back(OperandBundleDefT<Value *>(
- std::string(Attribute::getNameFromAttrKind(MapElem.first.second)),
- Args));
- NumBundlesInAssumes++;
- }
- NumAssumeBuilt++;
- return cast<IntrinsicInst>(CallInst::Create(
- FnAssume, ArrayRef<Value *>({ConstantInt::getTrue(C)}), OpBundle));
- }
-
- void addAccessedPtr(Instruction *MemInst, Value *Pointer, Type *AccType,
- MaybeAlign MA) {
- unsigned DerefSize = MemInst->getModule()
- ->getDataLayout()
- .getTypeStoreSize(AccType)
- .getKnownMinSize();
- if (DerefSize != 0) {
- addKnowledge({Attribute::Dereferenceable, DerefSize, Pointer});
- if (!NullPointerIsDefined(MemInst->getFunction(),
- Pointer->getType()->getPointerAddressSpace()))
- addKnowledge({Attribute::NonNull, 0u, Pointer});
- }
- if (MA.valueOrOne() > 1)
- addKnowledge(
- {Attribute::Alignment, unsigned(MA.valueOrOne().value()), Pointer});
- }
-
- void addInstruction(Instruction *I) {
- if (auto *Call = dyn_cast<CallBase>(I))
- return addCall(Call);
- if (auto *Load = dyn_cast<LoadInst>(I))
- return addAccessedPtr(I, Load->getPointerOperand(), Load->getType(),
- Load->getAlign());
- if (auto *Store = dyn_cast<StoreInst>(I))
- return addAccessedPtr(I, Store->getPointerOperand(),
- Store->getValueOperand()->getType(),
- Store->getAlign());
- // TODO: Add support for the other Instructions.
- // TODO: Maybe we should look around and merge with other llvm.assume.
- }
-};
-
-} // namespace
-
-IntrinsicInst *llvm::buildAssumeFromInst(Instruction *I) {
- if (!EnableKnowledgeRetention)
- return nullptr;
- AssumeBuilderState Builder(I->getModule());
- Builder.addInstruction(I);
- return Builder.build();
-}
-
-void llvm::salvageKnowledge(Instruction *I, AssumptionCache *AC,
- DominatorTree *DT) {
- if (!EnableKnowledgeRetention || I->isTerminator())
- return;
- AssumeBuilderState Builder(I->getModule(), I, AC, DT);
- Builder.addInstruction(I);
- if (IntrinsicInst *Intr = Builder.build()) {
- Intr->insertBefore(I);
- if (AC)
- AC->registerAssumption(Intr);
- }
-}
-
-namespace {
-
-struct AssumeSimplify {
- Function &F;
- AssumptionCache &AC;
- DominatorTree *DT;
- LLVMContext &C;
- SmallDenseSet<IntrinsicInst *> CleanupToDo;
- StringMapEntry<uint32_t> *IgnoreTag;
- SmallDenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 4>, 8> BBToAssume;
- bool MadeChange = false;
-
- AssumeSimplify(Function &F, AssumptionCache &AC, DominatorTree *DT,
- LLVMContext &C)
- : F(F), AC(AC), DT(DT), C(C),
- IgnoreTag(C.getOrInsertBundleTag(IgnoreBundleTag)) {}
-
- void buildMapping(bool FilterBooleanArgument) {
- BBToAssume.clear();
- for (Value *V : AC.assumptions()) {
- if (!V)
- continue;
- IntrinsicInst *Assume = cast<IntrinsicInst>(V);
- if (FilterBooleanArgument) {
- auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
- if (!Arg || Arg->isZero())
- continue;
- }
- BBToAssume[Assume->getParent()].push_back(Assume);
- }
-
- for (auto &Elem : BBToAssume) {
- llvm::sort(Elem.second,
- [](const IntrinsicInst *LHS, const IntrinsicInst *RHS) {
- return LHS->comesBefore(RHS);
- });
- }
- }
-
- /// Remove all assumes in CleanupToDo if their boolean argument is true and
- /// ForceCleanup is set or the assume doesn't hold valuable knowledge.
- void RunCleanup(bool ForceCleanup) {
- for (IntrinsicInst *Assume : CleanupToDo) {
- auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
- if (!Arg || Arg->isZero() ||
- (!ForceCleanup && !isAssumeWithEmptyBundle(*Assume)))
- continue;
- MadeChange = true;
- if (ForceCleanup)
- NumAssumesMerged++;
- else
- NumAssumesRemoved++;
- Assume->eraseFromParent();
- }
- CleanupToDo.clear();
- }
-
- /// Remove knowledge stored in an assume when it is already known by an
- /// attribute or another assume. When valid, this can update existing
- /// knowledge in an attribute or another assume.
- void dropRedundantKnowledge() {
- struct MapValue {
- IntrinsicInst *Assume;
- unsigned ArgValue;
- CallInst::BundleOpInfo *BOI;
- };
- buildMapping(false);
- SmallDenseMap<std::pair<Value *, Attribute::AttrKind>,
- SmallVector<MapValue, 2>, 16>
- Knowledge;
- for (BasicBlock *BB : depth_first(&F))
- for (Value *V : BBToAssume[BB]) {
- if (!V)
- continue;
- IntrinsicInst *Assume = cast<IntrinsicInst>(V);
- for (CallInst::BundleOpInfo &BOI : Assume->bundle_op_infos()) {
- auto RemoveFromAssume = [&]() {
- CleanupToDo.insert(Assume);
- if (BOI.Begin != BOI.End) {
- Use *U = &Assume->op_begin()[BOI.Begin + ABA_WasOn];
- U->set(UndefValue::get(U->get()->getType()));
- }
- BOI.Tag = IgnoreTag;
- };
- if (BOI.Tag == IgnoreTag) {
- CleanupToDo.insert(Assume);
- continue;
- }
- RetainedKnowledge RK = getKnowledgeFromBundle(*Assume, BOI);
- if (auto *Arg = dyn_cast_or_null<Argument>(RK.WasOn)) {
- bool HasSameKindAttr = Arg->hasAttribute(RK.AttrKind);
- if (HasSameKindAttr)
- if (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
- Arg->getAttribute(RK.AttrKind).getValueAsInt() >=
- RK.ArgValue) {
- RemoveFromAssume();
- continue;
- }
- if (isValidAssumeForContext(
- Assume, &*F.getEntryBlock().getFirstInsertionPt()) ||
- Assume == &*F.getEntryBlock().getFirstInsertionPt()) {
- if (HasSameKindAttr)
- Arg->removeAttr(RK.AttrKind);
- Arg->addAttr(Attribute::get(C, RK.AttrKind, RK.ArgValue));
- MadeChange = true;
- RemoveFromAssume();
- continue;
- }
- }
- auto &Lookup = Knowledge[{RK.WasOn, RK.AttrKind}];
- for (MapValue &Elem : Lookup) {
- if (!isValidAssumeForContext(Elem.Assume, Assume, DT))
- continue;
- if (Elem.ArgValue >= RK.ArgValue) {
- RemoveFromAssume();
- continue;
- } else if (isValidAssumeForContext(Assume, Elem.Assume, DT)) {
- Elem.Assume->op_begin()[Elem.BOI->Begin + ABA_Argument].set(
- ConstantInt::get(Type::getInt64Ty(C), RK.ArgValue));
- MadeChange = true;
- RemoveFromAssume();
- continue;
- }
- }
- Lookup.push_back({Assume, RK.ArgValue, &BOI});
- }
- }
- }
-
- using MergeIterator = SmallVectorImpl<IntrinsicInst *>::iterator;
-
- /// Merge all assumes from Begin to End and insert the resulting assume as
- /// high as possible in the basic block.
- void mergeRange(BasicBlock *BB, MergeIterator Begin, MergeIterator End) {
- if (Begin == End || std::next(Begin) == End)
- return;
- /// Provide no additional information so that AssumeBuilderState doesn't
- /// try to do any punning since it already has been done better.
- AssumeBuilderState Builder(F.getParent());
-
- /// For now it is initialized to the best value it could have
- Instruction *InsertPt = BB->getFirstNonPHI();
- if (isa<LandingPadInst>(InsertPt))
- InsertPt = InsertPt->getNextNode();
- for (IntrinsicInst *I : make_range(Begin, End)) {
- CleanupToDo.insert(I);
- for (CallInst::BundleOpInfo &BOI : I->bundle_op_infos()) {
- RetainedKnowledge RK = getKnowledgeFromBundle(*I, BOI);
- if (!RK)
- continue;
- Builder.addKnowledge(RK);
- if (auto *I = dyn_cast_or_null<Instruction>(RK.WasOn))
- if (I->getParent() == InsertPt->getParent() &&
- (InsertPt->comesBefore(I) || InsertPt == I))
- InsertPt = I->getNextNode();
- }
- }
-
- /// Adjust InsertPt if it is before Begin, since mergeAssumes only
- /// guarantees we can place the resulting assume between Begin and End.
- if (InsertPt->comesBefore(*Begin))
- for (auto It = (*Begin)->getIterator(), E = InsertPt->getIterator();
- It != E; --It)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
- InsertPt = It->getNextNode();
- break;
- }
- IntrinsicInst *MergedAssume = Builder.build();
- if (!MergedAssume)
- return;
- MadeChange = true;
- MergedAssume->insertBefore(InsertPt);
- AC.registerAssumption(MergedAssume);
- }
-
- /// Merge assumes when they are in the same BasicBlock and for all instructions
- /// between them isGuaranteedToTransferExecutionToSuccessor returns true.
- void mergeAssumes() {
- buildMapping(true);
-
- SmallVector<MergeIterator, 4> SplitPoints;
- for (auto &Elem : BBToAssume) {
- SmallVectorImpl<IntrinsicInst *> &AssumesInBB = Elem.second;
- if (AssumesInBB.size() < 2)
- continue;
- /// AssumesInBB is already sorted by order in the block.
-
- BasicBlock::iterator It = AssumesInBB.front()->getIterator();
- BasicBlock::iterator E = AssumesInBB.back()->getIterator();
- SplitPoints.push_back(AssumesInBB.begin());
- MergeIterator LastSplit = AssumesInBB.begin();
- for (; It != E; ++It)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
- for (; (*LastSplit)->comesBefore(&*It); ++LastSplit)
- ;
- if (SplitPoints.back() != LastSplit)
- SplitPoints.push_back(LastSplit);
- }
- SplitPoints.push_back(AssumesInBB.end());
- for (auto SplitIt = SplitPoints.begin();
- SplitIt != std::prev(SplitPoints.end()); SplitIt++) {
- mergeRange(Elem.first, *SplitIt, *(SplitIt + 1));
- }
- SplitPoints.clear();
- }
- }
-};
-
-bool simplifyAssumes(Function &F, AssumptionCache *AC, DominatorTree *DT) {
- AssumeSimplify AS(F, *AC, DT, F.getContext());
-
- /// Remove knowledge that is already known from another dominating assume or
- /// an attribute.
- AS.dropRedundantKnowledge();
-
- /// Remove assumes that are empty.
- AS.RunCleanup(false);
-
- /// Merge assumes in the same basic block when possible.
- AS.mergeAssumes();
-
- /// Remove assumes that were merged.
- AS.RunCleanup(true);
- return AS.MadeChange;
-}
-
-} // namespace
-
-PreservedAnalyses AssumeSimplifyPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (!EnableKnowledgeRetention)
- return PreservedAnalyses::all();
- simplifyAssumes(F, &AM.getResult<AssumptionAnalysis>(F),
- AM.getCachedResult<DominatorTreeAnalysis>(F));
- return PreservedAnalyses::all();
-}
-
-namespace {
-class AssumeSimplifyPassLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- AssumeSimplifyPassLegacyPass() : FunctionPass(ID) {
- initializeAssumeSimplifyPassLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override {
- if (skipFunction(F) || !EnableKnowledgeRetention)
- return false;
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- return simplifyAssumes(F, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
-
- AU.setPreservesAll();
- }
-};
-} // namespace
-
-char AssumeSimplifyPassLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AssumeSimplifyPassLegacyPass, "assume-simplify",
- "Assume Simplify", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(AssumeSimplifyPassLegacyPass, "assume-simplify",
- "Assume Simplify", false, false)
-
-FunctionPass *llvm::createAssumeSimplifyPass() {
- return new AssumeSimplifyPassLegacyPass();
-}
-
-PreservedAnalyses AssumeBuilderPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- DominatorTree* DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- for (Instruction &I : instructions(F))
- salvageKnowledge(&I, AC, DT);
- return PreservedAnalyses::all();
-}
-
-namespace {
-class AssumeBuilderPassLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- AssumeBuilderPassLegacyPass() : FunctionPass(ID) {
- initializeAssumeBuilderPassLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override {
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- for (Instruction &I : instructions(F))
- salvageKnowledge(&I, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
- return true;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
-
- AU.setPreservesAll();
- }
-};
-} // namespace
-
-char AssumeBuilderPassLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AssumeBuilderPassLegacyPass, "assume-builder",
- "Assume Builder", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(AssumeBuilderPassLegacyPass, "assume-builder",
- "Assume Builder", false, false)
+ if (isa<AllocaInst>(UnderlyingPtr) || isa<GlobalValue>(UnderlyingPtr))
+ return false;
+ }
+ if (auto *Arg = dyn_cast<Argument>(RK.WasOn)) {
+ if (Arg->hasAttribute(RK.AttrKind) &&
+ (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
+ Arg->getAttribute(RK.AttrKind).getValueAsInt() >= RK.ArgValue))
+ return false;
+ return true;
+ }
+ if (auto *Inst = dyn_cast<Instruction>(RK.WasOn))
+ if (wouldInstructionBeTriviallyDead(Inst)) {
+ if (RK.WasOn->use_empty())
+ return false;
+ Use *SingleUse = RK.WasOn->getSingleUndroppableUse();
+ if (SingleUse && SingleUse->getUser() == InstBeingRemoved)
+ return false;
+ }
+ return true;
+ }
+
+ void addKnowledge(RetainedKnowledge RK) {
+ RK = canonicalizedKnowledge(RK, M);
+
+ if (!isKnowledgeWorthPreserving(RK))
+ return;
+
+ if (tryToPreserveWithoutAddingAssume(RK))
+ return;
+ MapKey Key{RK.WasOn, RK.AttrKind};
+ auto Lookup = AssumedKnowledgeMap.find(Key);
+ if (Lookup == AssumedKnowledgeMap.end()) {
+ AssumedKnowledgeMap[Key] = RK.ArgValue;
+ return;
+ }
+ assert(((Lookup->second == 0 && RK.ArgValue == 0) ||
+ (Lookup->second != 0 && RK.ArgValue != 0)) &&
+ "inconsistent argument value");
+
+ /// This is only desirable because for all attributes taking an argument
+ /// higher is better.
+ Lookup->second = std::max(Lookup->second, RK.ArgValue);
+ }
+
+ void addAttribute(Attribute Attr, Value *WasOn) {
+ if (Attr.isTypeAttribute() || Attr.isStringAttribute() ||
+ (!ShouldPreserveAllAttributes &&
+ !isUsefullToPreserve(Attr.getKindAsEnum())))
+ return;
+ unsigned AttrArg = 0;
+ if (Attr.isIntAttribute())
+ AttrArg = Attr.getValueAsInt();
+ addKnowledge({Attr.getKindAsEnum(), AttrArg, WasOn});
+ }
+
+ void addCall(const CallBase *Call) {
+ auto addAttrList = [&](AttributeList AttrList) {
+ for (unsigned Idx = AttributeList::FirstArgIndex;
+ Idx < AttrList.getNumAttrSets(); Idx++)
+ for (Attribute Attr : AttrList.getAttributes(Idx))
+ addAttribute(Attr, Call->getArgOperand(Idx - 1));
+ for (Attribute Attr : AttrList.getFnAttributes())
+ addAttribute(Attr, nullptr);
+ };
+ addAttrList(Call->getAttributes());
+ if (Function *Fn = Call->getCalledFunction())
+ addAttrList(Fn->getAttributes());
+ }
+
+ IntrinsicInst *build() {
+ if (AssumedKnowledgeMap.empty())
+ return nullptr;
+ if (!DebugCounter::shouldExecute(BuildAssumeCounter))
+ return nullptr;
+ Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume);
+ LLVMContext &C = M->getContext();
+ SmallVector<OperandBundleDef, 8> OpBundle;
+ for (auto &MapElem : AssumedKnowledgeMap) {
+ SmallVector<Value *, 2> Args;
+ if (MapElem.first.first)
+ Args.push_back(MapElem.first.first);
+
+ /// This is only valid because, for all attributes that currently exist, a
+ /// value of 0 is useless and should not be preserved.
+ if (MapElem.second)
+ Args.push_back(ConstantInt::get(Type::getInt64Ty(M->getContext()),
+ MapElem.second));
+ OpBundle.push_back(OperandBundleDefT<Value *>(
+ std::string(Attribute::getNameFromAttrKind(MapElem.first.second)),
+ Args));
+ NumBundlesInAssumes++;
+ }
+ NumAssumeBuilt++;
+ return cast<IntrinsicInst>(CallInst::Create(
+ FnAssume, ArrayRef<Value *>({ConstantInt::getTrue(C)}), OpBundle));
+ }
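+
+ // Illustrative result (assumed value names): for the knowledge entries
+ //   {NonNull, 0, %p} and {Alignment, 8, %p}
+ // build() would return an intrinsic of the form
+ //   call void @llvm.assume(i1 true) ["nonnull"(i8* %p), "align"(i8* %p, i64 8)]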
+
+ void addAccessedPtr(Instruction *MemInst, Value *Pointer, Type *AccType,
+ MaybeAlign MA) {
+ unsigned DerefSize = MemInst->getModule()
+ ->getDataLayout()
+ .getTypeStoreSize(AccType)
+ .getKnownMinSize();
+ if (DerefSize != 0) {
+ addKnowledge({Attribute::Dereferenceable, DerefSize, Pointer});
+ if (!NullPointerIsDefined(MemInst->getFunction(),
+ Pointer->getType()->getPointerAddressSpace()))
+ addKnowledge({Attribute::NonNull, 0u, Pointer});
+ }
+ if (MA.valueOrOne() > 1)
+ addKnowledge(
+ {Attribute::Alignment, unsigned(MA.valueOrOne().value()), Pointer});
+ }
+
+ void addInstruction(Instruction *I) {
+ if (auto *Call = dyn_cast<CallBase>(I))
+ return addCall(Call);
+ if (auto *Load = dyn_cast<LoadInst>(I))
+ return addAccessedPtr(I, Load->getPointerOperand(), Load->getType(),
+ Load->getAlign());
+ if (auto *Store = dyn_cast<StoreInst>(I))
+ return addAccessedPtr(I, Store->getPointerOperand(),
+ Store->getValueOperand()->getType(),
+ Store->getAlign());
+ // TODO: Add support for the other Instructions.
+ // TODO: Maybe we should look around and merge with other llvm.assume.
+ }
+};
+
+} // namespace
+
+IntrinsicInst *llvm::buildAssumeFromInst(Instruction *I) {
+ if (!EnableKnowledgeRetention)
+ return nullptr;
+ AssumeBuilderState Builder(I->getModule());
+ Builder.addInstruction(I);
+ return Builder.build();
+}
+
+void llvm::salvageKnowledge(Instruction *I, AssumptionCache *AC,
+ DominatorTree *DT) {
+ if (!EnableKnowledgeRetention || I->isTerminator())
+ return;
+ AssumeBuilderState Builder(I->getModule(), I, AC, DT);
+ Builder.addInstruction(I);
+ if (IntrinsicInst *Intr = Builder.build()) {
+ Intr->insertBefore(I);
+ if (AC)
+ AC->registerAssumption(Intr);
+ }
+}
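+
+// Hedged usage sketch (illustrative): a transform that is about to erase an
+// instruction can first preserve what that instruction implied. "DeadLoad",
+// "AC" and "DT" are assumed names.
+//
+//   salvageKnowledge(DeadLoad, &AC, &DT); // no-op unless -enable-knowledge-retention
+//   DeadLoad->eraseFromParent();
+//
+// The emitted intrinsic carries the facts as operand bundles, e.g.
+//   call void @llvm.assume(i1 true)
+//       ["dereferenceable"(i32* %p, i64 4), "align"(i32* %p, i64 4)]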
+
+namespace {
+
+struct AssumeSimplify {
+ Function &F;
+ AssumptionCache &AC;
+ DominatorTree *DT;
+ LLVMContext &C;
+ SmallDenseSet<IntrinsicInst *> CleanupToDo;
+ StringMapEntry<uint32_t> *IgnoreTag;
+ SmallDenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 4>, 8> BBToAssume;
+ bool MadeChange = false;
+
+ AssumeSimplify(Function &F, AssumptionCache &AC, DominatorTree *DT,
+ LLVMContext &C)
+ : F(F), AC(AC), DT(DT), C(C),
+ IgnoreTag(C.getOrInsertBundleTag(IgnoreBundleTag)) {}
+
+ void buildMapping(bool FilterBooleanArgument) {
+ BBToAssume.clear();
+ for (Value *V : AC.assumptions()) {
+ if (!V)
+ continue;
+ IntrinsicInst *Assume = cast<IntrinsicInst>(V);
+ if (FilterBooleanArgument) {
+ auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
+ if (!Arg || Arg->isZero())
+ continue;
+ }
+ BBToAssume[Assume->getParent()].push_back(Assume);
+ }
+
+ for (auto &Elem : BBToAssume) {
+ llvm::sort(Elem.second,
+ [](const IntrinsicInst *LHS, const IntrinsicInst *RHS) {
+ return LHS->comesBefore(RHS);
+ });
+ }
+ }
+
+ /// Remove all assumes in CleanupToDo if their boolean argument is true and
+ /// ForceCleanup is set or the assume doesn't hold valuable knowledge.
+ void RunCleanup(bool ForceCleanup) {
+ for (IntrinsicInst *Assume : CleanupToDo) {
+ auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
+ if (!Arg || Arg->isZero() ||
+ (!ForceCleanup && !isAssumeWithEmptyBundle(*Assume)))
+ continue;
+ MadeChange = true;
+ if (ForceCleanup)
+ NumAssumesMerged++;
+ else
+ NumAssumesRemoved++;
+ Assume->eraseFromParent();
+ }
+ CleanupToDo.clear();
+ }
+
+ /// Remove knowledge stored in an assume when it is already known by an
+ /// attribute or another assume. When valid, this can update existing
+ /// knowledge in an attribute or another assume.
+ void dropRedundantKnowledge() {
+ struct MapValue {
+ IntrinsicInst *Assume;
+ unsigned ArgValue;
+ CallInst::BundleOpInfo *BOI;
+ };
+ buildMapping(false);
+ SmallDenseMap<std::pair<Value *, Attribute::AttrKind>,
+ SmallVector<MapValue, 2>, 16>
+ Knowledge;
+ for (BasicBlock *BB : depth_first(&F))
+ for (Value *V : BBToAssume[BB]) {
+ if (!V)
+ continue;
+ IntrinsicInst *Assume = cast<IntrinsicInst>(V);
+ for (CallInst::BundleOpInfo &BOI : Assume->bundle_op_infos()) {
+ auto RemoveFromAssume = [&]() {
+ CleanupToDo.insert(Assume);
+ if (BOI.Begin != BOI.End) {
+ Use *U = &Assume->op_begin()[BOI.Begin + ABA_WasOn];
+ U->set(UndefValue::get(U->get()->getType()));
+ }
+ BOI.Tag = IgnoreTag;
+ };
+ if (BOI.Tag == IgnoreTag) {
+ CleanupToDo.insert(Assume);
+ continue;
+ }
+ RetainedKnowledge RK = getKnowledgeFromBundle(*Assume, BOI);
+ if (auto *Arg = dyn_cast_or_null<Argument>(RK.WasOn)) {
+ bool HasSameKindAttr = Arg->hasAttribute(RK.AttrKind);
+ if (HasSameKindAttr)
+ if (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
+ Arg->getAttribute(RK.AttrKind).getValueAsInt() >=
+ RK.ArgValue) {
+ RemoveFromAssume();
+ continue;
+ }
+ if (isValidAssumeForContext(
+ Assume, &*F.getEntryBlock().getFirstInsertionPt()) ||
+ Assume == &*F.getEntryBlock().getFirstInsertionPt()) {
+ if (HasSameKindAttr)
+ Arg->removeAttr(RK.AttrKind);
+ Arg->addAttr(Attribute::get(C, RK.AttrKind, RK.ArgValue));
+ MadeChange = true;
+ RemoveFromAssume();
+ continue;
+ }
+ }
+ auto &Lookup = Knowledge[{RK.WasOn, RK.AttrKind}];
+ for (MapValue &Elem : Lookup) {
+ if (!isValidAssumeForContext(Elem.Assume, Assume, DT))
+ continue;
+ if (Elem.ArgValue >= RK.ArgValue) {
+ RemoveFromAssume();
+ continue;
+ } else if (isValidAssumeForContext(Assume, Elem.Assume, DT)) {
+ Elem.Assume->op_begin()[Elem.BOI->Begin + ABA_Argument].set(
+ ConstantInt::get(Type::getInt64Ty(C), RK.ArgValue));
+ MadeChange = true;
+ RemoveFromAssume();
+ continue;
+ }
+ }
+ Lookup.push_back({Assume, RK.ArgValue, &BOI});
+ }
+ }
+ }
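+
+ // Worked example (illustrative): if a dominating assume already records
+ // "align"(i8* %p, i64 16) and a later assume records "align"(i8* %p, i64 8),
+ // the weaker bundle is tagged with IgnoreBundleTag and the later assume is
+ // queued in CleanupToDo for removal.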
+
+ using MergeIterator = SmallVectorImpl<IntrinsicInst *>::iterator;
+
+ /// Merge all assumes from Begin to End and insert the resulting assume as
+ /// high as possible in the basic block.
+ void mergeRange(BasicBlock *BB, MergeIterator Begin, MergeIterator End) {
+ if (Begin == End || std::next(Begin) == End)
+ return;
+ /// Provide no additional information so that AssumeBuilderState doesn't
+ /// try to do any punning since it already has been done better.
+ AssumeBuilderState Builder(F.getParent());
+
+ /// For now it is initialized to the best value it could have
+ Instruction *InsertPt = BB->getFirstNonPHI();
+ if (isa<LandingPadInst>(InsertPt))
+ InsertPt = InsertPt->getNextNode();
+ for (IntrinsicInst *I : make_range(Begin, End)) {
+ CleanupToDo.insert(I);
+ for (CallInst::BundleOpInfo &BOI : I->bundle_op_infos()) {
+ RetainedKnowledge RK = getKnowledgeFromBundle(*I, BOI);
+ if (!RK)
+ continue;
+ Builder.addKnowledge(RK);
+ if (auto *I = dyn_cast_or_null<Instruction>(RK.WasOn))
+ if (I->getParent() == InsertPt->getParent() &&
+ (InsertPt->comesBefore(I) || InsertPt == I))
+ InsertPt = I->getNextNode();
+ }
+ }
+
+ /// Adjust InsertPt if it is before Begin, since mergeAssumes only
+ /// guarantees we can place the resulting assume between Begin and End.
+ if (InsertPt->comesBefore(*Begin))
+ for (auto It = (*Begin)->getIterator(), E = InsertPt->getIterator();
+ It != E; --It)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
+ InsertPt = It->getNextNode();
+ break;
+ }
+ IntrinsicInst *MergedAssume = Builder.build();
+ if (!MergedAssume)
+ return;
+ MadeChange = true;
+ MergedAssume->insertBefore(InsertPt);
+ AC.registerAssumption(MergedAssume);
+ }
+
+ /// Merge assumes when they are in the same BasicBlock and for all instructions
+ /// between them isGuaranteedToTransferExecutionToSuccessor returns true.
+ void mergeAssumes() {
+ buildMapping(true);
+
+ SmallVector<MergeIterator, 4> SplitPoints;
+ for (auto &Elem : BBToAssume) {
+ SmallVectorImpl<IntrinsicInst *> &AssumesInBB = Elem.second;
+ if (AssumesInBB.size() < 2)
+ continue;
+ /// AssumesInBB is already sorted by order in the block.
+
+ BasicBlock::iterator It = AssumesInBB.front()->getIterator();
+ BasicBlock::iterator E = AssumesInBB.back()->getIterator();
+ SplitPoints.push_back(AssumesInBB.begin());
+ MergeIterator LastSplit = AssumesInBB.begin();
+ for (; It != E; ++It)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
+ for (; (*LastSplit)->comesBefore(&*It); ++LastSplit)
+ ;
+ if (SplitPoints.back() != LastSplit)
+ SplitPoints.push_back(LastSplit);
+ }
+ SplitPoints.push_back(AssumesInBB.end());
+ for (auto SplitIt = SplitPoints.begin();
+ SplitIt != std::prev(SplitPoints.end()); SplitIt++) {
+ mergeRange(Elem.first, *SplitIt, *(SplitIt + 1));
+ }
+ SplitPoints.clear();
+ }
+ }
+};
+
+bool simplifyAssumes(Function &F, AssumptionCache *AC, DominatorTree *DT) {
+ AssumeSimplify AS(F, *AC, DT, F.getContext());
+
+ /// Remove knowledge that is already known from another dominating assume or
+ /// an attribute.
+ AS.dropRedundantKnowledge();
+
+ /// Remove assumes that are empty.
+ AS.RunCleanup(false);
+
+ /// Merge assumes in the same basic block when possible.
+ AS.mergeAssumes();
+
+ /// Remove assumes that were merged.
+ AS.RunCleanup(true);
+ return AS.MadeChange;
+}
+
+} // namespace
+
+PreservedAnalyses AssumeSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!EnableKnowledgeRetention)
+ return PreservedAnalyses::all();
+ simplifyAssumes(F, &AM.getResult<AssumptionAnalysis>(F),
+ AM.getCachedResult<DominatorTreeAnalysis>(F));
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class AssumeSimplifyPassLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ AssumeSimplifyPassLegacyPass() : FunctionPass(ID) {
+ initializeAssumeSimplifyPassLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F) || !EnableKnowledgeRetention)
+ return false;
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ return simplifyAssumes(F, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ AU.setPreservesAll();
+ }
+};
+} // namespace
+
+char AssumeSimplifyPassLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AssumeSimplifyPassLegacyPass, "assume-simplify",
+ "Assume Simplify", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(AssumeSimplifyPassLegacyPass, "assume-simplify",
+ "Assume Simplify", false, false)
+
+FunctionPass *llvm::createAssumeSimplifyPass() {
+ return new AssumeSimplifyPassLegacyPass();
+}
+
+PreservedAnalyses AssumeBuilderPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ DominatorTree* DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ for (Instruction &I : instructions(F))
+ salvageKnowledge(&I, AC, DT);
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class AssumeBuilderPassLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ AssumeBuilderPassLegacyPass() : FunctionPass(ID) {
+ initializeAssumeBuilderPassLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ for (Instruction &I : instructions(F))
+ salvageKnowledge(&I, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ AU.setPreservesAll();
+ }
+};
+} // namespace
+
+char AssumeBuilderPassLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AssumeBuilderPassLegacyPass, "assume-builder",
+ "Assume Builder", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(AssumeBuilderPassLegacyPass, "assume-builder",
+ "Assume Builder", false, false)
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp
index 414d6044ff..6bcd42c4c6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1,547 +1,547 @@
-//===- BasicBlockUtils.cpp - BasicBlock Utilities --------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions performs manipulations on basic blocks, and
-// instructions contained within basic blocks.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "basicblock-utils"
-
-void llvm::DetatchDeadBlocks(
- ArrayRef<BasicBlock *> BBs,
- SmallVectorImpl<DominatorTree::UpdateType> *Updates,
- bool KeepOneInputPHIs) {
- for (auto *BB : BBs) {
- // Loop through all of our successors and make sure they know that one
- // of their predecessors is going away.
- SmallPtrSet<BasicBlock *, 4> UniqueSuccessors;
- for (BasicBlock *Succ : successors(BB)) {
- Succ->removePredecessor(BB, KeepOneInputPHIs);
- if (Updates && UniqueSuccessors.insert(Succ).second)
- Updates->push_back({DominatorTree::Delete, BB, Succ});
- }
-
- // Zap all the instructions in the block.
- while (!BB->empty()) {
- Instruction &I = BB->back();
- // If this instruction is used, replace uses with an arbitrary value.
- // Because control flow can't get here, we don't care what we replace the
- // value with. Note that since this block is unreachable, and all values
- // contained within it must dominate their uses, all uses will
- // eventually be removed (they are themselves dead).
- if (!I.use_empty())
- I.replaceAllUsesWith(UndefValue::get(I.getType()));
- BB->getInstList().pop_back();
- }
- new UnreachableInst(BB->getContext(), BB);
- assert(BB->getInstList().size() == 1 &&
- isa<UnreachableInst>(BB->getTerminator()) &&
- "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
- }
-}
-
-void llvm::DeleteDeadBlock(BasicBlock *BB, DomTreeUpdater *DTU,
- bool KeepOneInputPHIs) {
- DeleteDeadBlocks({BB}, DTU, KeepOneInputPHIs);
-}
-
-void llvm::DeleteDeadBlocks(ArrayRef <BasicBlock *> BBs, DomTreeUpdater *DTU,
- bool KeepOneInputPHIs) {
-#ifndef NDEBUG
- // Make sure that all predecessors of each dead block are also dead.
- SmallPtrSet<BasicBlock *, 4> Dead(BBs.begin(), BBs.end());
- assert(Dead.size() == BBs.size() && "Duplicating blocks?");
- for (auto *BB : Dead)
- for (BasicBlock *Pred : predecessors(BB))
- assert(Dead.count(Pred) && "All predecessors must be dead!");
-#endif
-
- SmallVector<DominatorTree::UpdateType, 4> Updates;
- DetatchDeadBlocks(BBs, DTU ? &Updates : nullptr, KeepOneInputPHIs);
-
- if (DTU)
+//===- BasicBlockUtils.cpp - BasicBlock Utilities --------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs manipulations on basic blocks, and
+// instructions contained within basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "basicblock-utils"
+
+void llvm::DetatchDeadBlocks(
+ ArrayRef<BasicBlock *> BBs,
+ SmallVectorImpl<DominatorTree::UpdateType> *Updates,
+ bool KeepOneInputPHIs) {
+ for (auto *BB : BBs) {
+ // Loop through all of our successors and make sure they know that one
+ // of their predecessors is going away.
+ SmallPtrSet<BasicBlock *, 4> UniqueSuccessors;
+ for (BasicBlock *Succ : successors(BB)) {
+ Succ->removePredecessor(BB, KeepOneInputPHIs);
+ if (Updates && UniqueSuccessors.insert(Succ).second)
+ Updates->push_back({DominatorTree::Delete, BB, Succ});
+ }
+
+ // Zap all the instructions in the block.
+ while (!BB->empty()) {
+ Instruction &I = BB->back();
+ // If this instruction is used, replace uses with an arbitrary value.
+ // Because control flow can't get here, we don't care what we replace the
+ // value with. Note that since this block is unreachable, and all values
+ // contained within it must dominate their uses, all uses will
+ // eventually be removed (they are themselves dead).
+ if (!I.use_empty())
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ BB->getInstList().pop_back();
+ }
+ new UnreachableInst(BB->getContext(), BB);
+ assert(BB->getInstList().size() == 1 &&
+ isa<UnreachableInst>(BB->getTerminator()) &&
+ "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
+ }
+}
+
+void llvm::DeleteDeadBlock(BasicBlock *BB, DomTreeUpdater *DTU,
+ bool KeepOneInputPHIs) {
+ DeleteDeadBlocks({BB}, DTU, KeepOneInputPHIs);
+}
+
+void llvm::DeleteDeadBlocks(ArrayRef <BasicBlock *> BBs, DomTreeUpdater *DTU,
+ bool KeepOneInputPHIs) {
+#ifndef NDEBUG
+ // Make sure that all predecessors of each dead block are also dead.
+ SmallPtrSet<BasicBlock *, 4> Dead(BBs.begin(), BBs.end());
+ assert(Dead.size() == BBs.size() && "Duplicating blocks?");
+ for (auto *BB : Dead)
+ for (BasicBlock *Pred : predecessors(BB))
+ assert(Dead.count(Pred) && "All predecessors must be dead!");
+#endif
+
+ SmallVector<DominatorTree::UpdateType, 4> Updates;
+ DetatchDeadBlocks(BBs, DTU ? &Updates : nullptr, KeepOneInputPHIs);
+
+ if (DTU)
DTU->applyUpdates(Updates);
-
- for (BasicBlock *BB : BBs)
- if (DTU)
- DTU->deleteBB(BB);
- else
- BB->eraseFromParent();
-}
-
-bool llvm::EliminateUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
- bool KeepOneInputPHIs) {
- df_iterator_default_set<BasicBlock*> Reachable;
-
- // Mark all reachable blocks.
- for (BasicBlock *BB : depth_first_ext(&F, Reachable))
- (void)BB/* Mark all reachable blocks */;
-
- // Collect all dead blocks.
- std::vector<BasicBlock*> DeadBlocks;
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
- if (!Reachable.count(&*I)) {
- BasicBlock *BB = &*I;
- DeadBlocks.push_back(BB);
- }
-
- // Delete the dead blocks.
- DeleteDeadBlocks(DeadBlocks, DTU, KeepOneInputPHIs);
-
- return !DeadBlocks.empty();
-}
-
+
+ for (BasicBlock *BB : BBs)
+ if (DTU)
+ DTU->deleteBB(BB);
+ else
+ BB->eraseFromParent();
+}
+
+bool llvm::EliminateUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
+ bool KeepOneInputPHIs) {
+ df_iterator_default_set<BasicBlock*> Reachable;
+
+ // Mark all reachable blocks.
+ for (BasicBlock *BB : depth_first_ext(&F, Reachable))
+ (void)BB/* Mark all reachable blocks */;
+
+ // Collect all dead blocks.
+ std::vector<BasicBlock*> DeadBlocks;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ if (!Reachable.count(&*I)) {
+ BasicBlock *BB = &*I;
+ DeadBlocks.push_back(BB);
+ }
+
+ // Delete the dead blocks.
+ DeleteDeadBlocks(DeadBlocks, DTU, KeepOneInputPHIs);
+
+ return !DeadBlocks.empty();
+}
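+
+// Hedged usage sketch (illustrative): callers that hold a DominatorTree
+// usually route the deletions through a lazy DomTreeUpdater so the tree and
+// the IR stay consistent. "DT" is an assumed DominatorTree.
+//
+//   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+//   bool Changed = EliminateUnreachableBlocks(F, &DTU);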
+
bool llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
- MemoryDependenceResults *MemDep) {
+ MemoryDependenceResults *MemDep) {
if (!isa<PHINode>(BB->begin()))
return false;
-
- while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
- if (PN->getIncomingValue(0) != PN)
- PN->replaceAllUsesWith(PN->getIncomingValue(0));
- else
- PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
-
- if (MemDep)
- MemDep->removeInstruction(PN); // Memdep updates AA itself.
-
- PN->eraseFromParent();
- }
+
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+
+ if (MemDep)
+ MemDep->removeInstruction(PN); // Memdep updates AA itself.
+
+ PN->eraseFromParent();
+ }
return true;
-}
-
-bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI,
- MemorySSAUpdater *MSSAU) {
- // Recursively deleting a PHI may cause multiple PHIs to be deleted
- // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
- SmallVector<WeakTrackingVH, 8> PHIs;
- for (PHINode &PN : BB->phis())
- PHIs.push_back(&PN);
-
- bool Changed = false;
- for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
- if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
- Changed |= RecursivelyDeleteDeadPHINode(PN, TLI, MSSAU);
-
- return Changed;
-}
-
-bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
- LoopInfo *LI, MemorySSAUpdater *MSSAU,
- MemoryDependenceResults *MemDep,
- bool PredecessorWithTwoSuccessors) {
- if (BB->hasAddressTaken())
- return false;
-
- // Can't merge if there are multiple predecessors, or no predecessors.
- BasicBlock *PredBB = BB->getUniquePredecessor();
- if (!PredBB) return false;
-
- // Don't break self-loops.
- if (PredBB == BB) return false;
- // Don't break unwinding instructions.
- if (PredBB->getTerminator()->isExceptionalTerminator())
- return false;
-
- // Can't merge if there are multiple distinct successors.
- if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB)
- return false;
-
- // Currently only allow PredBB to have two successors, one of them being BB.
- // Update BI to branch to BB's only successor instead of BB.
- BranchInst *PredBB_BI;
- BasicBlock *NewSucc = nullptr;
- unsigned FallThruPath;
- if (PredecessorWithTwoSuccessors) {
- if (!(PredBB_BI = dyn_cast<BranchInst>(PredBB->getTerminator())))
- return false;
- BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BB_JmpI || !BB_JmpI->isUnconditional())
- return false;
- NewSucc = BB_JmpI->getSuccessor(0);
- FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1;
- }
-
- // Can't merge if there is a PHI loop.
- for (PHINode &PN : BB->phis())
- for (Value *IncValue : PN.incoming_values())
- if (IncValue == &PN)
- return false;
-
- LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
- << PredBB->getName() << "\n");
-
- // Begin by getting rid of unneeded PHIs.
- SmallVector<AssertingVH<Value>, 4> IncomingValues;
- if (isa<PHINode>(BB->front())) {
- for (PHINode &PN : BB->phis())
- if (!isa<PHINode>(PN.getIncomingValue(0)) ||
- cast<PHINode>(PN.getIncomingValue(0))->getParent() != BB)
- IncomingValues.push_back(PN.getIncomingValue(0));
- FoldSingleEntryPHINodes(BB, MemDep);
- }
-
- // DTU update: Collect all the edges that exit BB.
- // These dominator edges will be redirected from Pred.
- std::vector<DominatorTree::UpdateType> Updates;
- if (DTU) {
+}
+
+bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI,
+ MemorySSAUpdater *MSSAU) {
+ // Recursively deleting a PHI may cause multiple PHIs to be deleted
+ // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
+ SmallVector<WeakTrackingVH, 8> PHIs;
+ for (PHINode &PN : BB->phis())
+ PHIs.push_back(&PN);
+
+ bool Changed = false;
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
+ Changed |= RecursivelyDeleteDeadPHINode(PN, TLI, MSSAU);
+
+ return Changed;
+}
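
A hedged usage sketch (names invented here, not from this patch): sweep a whole function with DeleteDeadPHIs after a transformation that may have left PHIs without live users; TargetLibraryInfo is optional.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper; TLI may be nullptr if no TargetLibraryInfo is at hand.
static bool deleteDeadPHIsInFunction(llvm::Function &F,
                                     const llvm::TargetLibraryInfo *TLI) {
  bool Changed = false;
  for (llvm::BasicBlock &BB : F)
    Changed |= llvm::DeleteDeadPHIs(&BB, TLI);
  return Changed;
}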
+
+bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
+ LoopInfo *LI, MemorySSAUpdater *MSSAU,
+ MemoryDependenceResults *MemDep,
+ bool PredecessorWithTwoSuccessors) {
+ if (BB->hasAddressTaken())
+ return false;
+
+ // Can't merge if there are multiple predecessors, or no predecessors.
+ BasicBlock *PredBB = BB->getUniquePredecessor();
+ if (!PredBB) return false;
+
+ // Don't break self-loops.
+ if (PredBB == BB) return false;
+ // Don't break unwinding instructions.
+ if (PredBB->getTerminator()->isExceptionalTerminator())
+ return false;
+
+ // Can't merge if there are multiple distinct successors.
+ if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB)
+ return false;
+
+ // Currently only allow PredBB to have two successors, one of them being BB.
+ // Update BI to branch to BB's only successor instead of BB.
+ BranchInst *PredBB_BI;
+ BasicBlock *NewSucc = nullptr;
+ unsigned FallThruPath;
+ if (PredecessorWithTwoSuccessors) {
+ if (!(PredBB_BI = dyn_cast<BranchInst>(PredBB->getTerminator())))
+ return false;
+ BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BB_JmpI || !BB_JmpI->isUnconditional())
+ return false;
+ NewSucc = BB_JmpI->getSuccessor(0);
+ FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1;
+ }
+
+ // Can't merge if there is a PHI loop.
+ for (PHINode &PN : BB->phis())
+ for (Value *IncValue : PN.incoming_values())
+ if (IncValue == &PN)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
+ << PredBB->getName() << "\n");
+
+ // Begin by getting rid of unneeded PHIs.
+ SmallVector<AssertingVH<Value>, 4> IncomingValues;
+ if (isa<PHINode>(BB->front())) {
+ for (PHINode &PN : BB->phis())
+ if (!isa<PHINode>(PN.getIncomingValue(0)) ||
+ cast<PHINode>(PN.getIncomingValue(0))->getParent() != BB)
+ IncomingValues.push_back(PN.getIncomingValue(0));
+ FoldSingleEntryPHINodes(BB, MemDep);
+ }
+
+ // DTU update: Collect all the edges that exit BB.
+ // These dominator edges will be redirected from Pred.
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DTU) {
SmallSetVector<BasicBlock *, 2> UniqueSuccessors(succ_begin(BB),
succ_end(BB));
Updates.reserve(1 + (2 * UniqueSuccessors.size()));
- // Add insert edges first. Experimentally, for the particular case of two
- // blocks that can be merged, with a single successor and single predecessor
- // respectively, it is beneficial to have all insert updates first. Deleting
- // edges first may lead to unreachable blocks, followed by inserting edges
- // making the blocks reachable again. Such DT updates lead to high compile
- // times. We add inserts before deletes here to reduce compile time.
+ // Add insert edges first. Experimentally, for the particular case of two
+ // blocks that can be merged, with a single successor and single predecessor
+ // respectively, it is beneficial to have all insert updates first. Deleting
+ // edges first may lead to unreachable blocks, followed by inserting edges
+ // making the blocks reachable again. Such DT updates lead to high compile
+ // times. We add inserts before deletes here to reduce compile time.
for (BasicBlock *UniqueSuccessor : UniqueSuccessors)
- // This successor of BB may already have PredBB as a predecessor.
+ // This successor of BB may already have PredBB as a predecessor.
if (!llvm::is_contained(successors(PredBB), UniqueSuccessor))
Updates.push_back({DominatorTree::Insert, PredBB, UniqueSuccessor});
for (BasicBlock *UniqueSuccessor : UniqueSuccessors)
Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
- Updates.push_back({DominatorTree::Delete, PredBB, BB});
- }
-
- Instruction *PTI = PredBB->getTerminator();
- Instruction *STI = BB->getTerminator();
- Instruction *Start = &*BB->begin();
- // If there's nothing to move, mark the starting instruction as the last
- // instruction in the block. Terminator instruction is handled separately.
- if (Start == STI)
- Start = PTI;
-
- // Move all definitions in the successor to the predecessor...
- PredBB->getInstList().splice(PTI->getIterator(), BB->getInstList(),
- BB->begin(), STI->getIterator());
-
- if (MSSAU)
- MSSAU->moveAllAfterMergeBlocks(BB, PredBB, Start);
-
- // Make all PHI nodes that referred to BB now refer to Pred as their
- // source...
- BB->replaceAllUsesWith(PredBB);
-
- if (PredecessorWithTwoSuccessors) {
- // Delete the unconditional branch from BB.
- BB->getInstList().pop_back();
-
- // Update branch in the predecessor.
- PredBB_BI->setSuccessor(FallThruPath, NewSucc);
- } else {
- // Delete the unconditional branch from the predecessor.
- PredBB->getInstList().pop_back();
-
- // Move terminator instruction.
- PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
-
- // Terminator may be a memory accessing instruction too.
- if (MSSAU)
- if (MemoryUseOrDef *MUD = cast_or_null<MemoryUseOrDef>(
- MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator())))
- MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End);
- }
- // Add unreachable to now empty BB.
- new UnreachableInst(BB->getContext(), BB);
-
- // If the predecessor has no name, inherit BB's name.
- if (!PredBB->hasName())
- PredBB->takeName(BB);
-
- if (LI)
- LI->removeBlock(BB);
-
- if (MemDep)
- MemDep->invalidateCachedPredecessors();
-
- // Finally, erase the old block and update dominator info.
- if (DTU) {
- assert(BB->getInstList().size() == 1 &&
- isa<UnreachableInst>(BB->getTerminator()) &&
- "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
+ }
+
+ Instruction *PTI = PredBB->getTerminator();
+ Instruction *STI = BB->getTerminator();
+ Instruction *Start = &*BB->begin();
+ // If there's nothing to move, mark the starting instruction as the last
+ // instruction in the block. Terminator instruction is handled separately.
+ if (Start == STI)
+ Start = PTI;
+
+ // Move all definitions in the successor to the predecessor...
+ PredBB->getInstList().splice(PTI->getIterator(), BB->getInstList(),
+ BB->begin(), STI->getIterator());
+
+ if (MSSAU)
+ MSSAU->moveAllAfterMergeBlocks(BB, PredBB, Start);
+
+ // Make all PHI nodes that referred to BB now refer to Pred as their
+ // source...
+ BB->replaceAllUsesWith(PredBB);
+
+ if (PredecessorWithTwoSuccessors) {
+ // Delete the unconditional branch from BB.
+ BB->getInstList().pop_back();
+
+ // Update branch in the predecessor.
+ PredBB_BI->setSuccessor(FallThruPath, NewSucc);
+ } else {
+ // Delete the unconditional branch from the predecessor.
+ PredBB->getInstList().pop_back();
+
+ // Move terminator instruction.
+ PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
+
+ // Terminator may be a memory accessing instruction too.
+ if (MSSAU)
+ if (MemoryUseOrDef *MUD = cast_or_null<MemoryUseOrDef>(
+ MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator())))
+ MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End);
+ }
+ // Add unreachable to now empty BB.
+ new UnreachableInst(BB->getContext(), BB);
+
+ // If the predecessor has no name, inherit BB's name.
+ if (!PredBB->hasName())
+ PredBB->takeName(BB);
+
+ if (LI)
+ LI->removeBlock(BB);
+
+ if (MemDep)
+ MemDep->invalidateCachedPredecessors();
+
+ // Finally, erase the old block and update dominator info.
+ if (DTU) {
+ assert(BB->getInstList().size() == 1 &&
+ isa<UnreachableInst>(BB->getTerminator()) &&
+ "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
DTU->applyUpdates(Updates);
- DTU->deleteBB(BB);
- } else {
- BB->eraseFromParent(); // Nuke BB if DTU is nullptr.
- }
-
- return true;
-}
-
-bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
- SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L, DomTreeUpdater *DTU,
- LoopInfo *LI) {
- assert(!MergeBlocks.empty() && "MergeBlocks should not be empty");
-
- bool BlocksHaveBeenMerged = false;
- while (!MergeBlocks.empty()) {
- BasicBlock *BB = *MergeBlocks.begin();
- BasicBlock *Dest = BB->getSingleSuccessor();
- if (Dest && (!L || L->contains(Dest))) {
- BasicBlock *Fold = Dest->getUniquePredecessor();
- (void)Fold;
- if (MergeBlockIntoPredecessor(Dest, DTU, LI)) {
- assert(Fold == BB &&
- "Expecting BB to be unique predecessor of the Dest block");
- MergeBlocks.erase(Dest);
- BlocksHaveBeenMerged = true;
- } else
- MergeBlocks.erase(BB);
- } else
- MergeBlocks.erase(BB);
- }
- return BlocksHaveBeenMerged;
-}
-
-/// Remove redundant instructions within sequences of consecutive dbg.value
-/// instructions. This is done using a backward scan to keep the last dbg.value
-/// describing a specific variable/fragment.
-///
-/// BackwardScan strategy:
-/// ----------------------
-/// Given a sequence of consecutive DbgValueInst like this
-///
-/// dbg.value ..., "x", FragmentX1 (*)
-/// dbg.value ..., "y", FragmentY1
-/// dbg.value ..., "x", FragmentX2
-/// dbg.value ..., "x", FragmentX1 (**)
-///
-/// then the instruction marked with (*) can be removed (it is guaranteed to be
-/// obsoleted by the instruction marked with (**) as the latter instruction is
-/// describing the same variable using the same fragment info).
-///
-/// Possible improvements:
-/// - Check fully overlapping fragments and not only identical fragments.
-/// - Support dbg.addr, dbg.declare, dbg.label, and possibly other meta
-/// instructions being part of the sequence of consecutive instructions.
-static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
- SmallVector<DbgValueInst *, 8> ToBeRemoved;
- SmallDenseSet<DebugVariable> VariableSet;
- for (auto &I : reverse(*BB)) {
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
- DebugVariable Key(DVI->getVariable(),
- DVI->getExpression(),
- DVI->getDebugLoc()->getInlinedAt());
- auto R = VariableSet.insert(Key);
- // If the same variable fragment is described more than once it is enough
- // to keep the last one (i.e. the first one found, since we iterate in
- // reverse).
- if (!R.second)
- ToBeRemoved.push_back(DVI);
- continue;
- }
- // Sequence with consecutive dbg.value instrs ended. Clear the set to
- // restart identifying redundant instructions in case we find another
- // dbg.value sequence.
- VariableSet.clear();
- }
-
- for (auto &Instr : ToBeRemoved)
- Instr->eraseFromParent();
-
- return !ToBeRemoved.empty();
-}
-
-/// Remove redundant dbg.value instructions using a forward scan. This can
-/// remove a dbg.value instruction that is redundant due to indicating that a
-/// variable has the same value as already being indicated by an earlier
-/// dbg.value.
-///
-/// ForwardScan strategy:
-/// ---------------------
-/// Given two identical dbg.value instructions, separated by a block of
-/// instructions that isn't describing the same variable, like this
-///
-/// dbg.value X1, "x", FragmentX1 (**)
-/// <block of instructions, none being "dbg.value ..., "x", ...">
-/// dbg.value X1, "x", FragmentX1 (*)
-///
-/// then the instruction marked with (*) can be removed. Variable "x" is already
-/// described as being mapped to the SSA value X1.
-///
-/// Possible improvements:
-/// - Keep track of non-overlapping fragments.
-static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
- SmallVector<DbgValueInst *, 8> ToBeRemoved;
- DenseMap<DebugVariable, std::pair<Value *, DIExpression *> > VariableMap;
- for (auto &I : *BB) {
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
- DebugVariable Key(DVI->getVariable(),
- NoneType(),
- DVI->getDebugLoc()->getInlinedAt());
- auto VMI = VariableMap.find(Key);
- // Update the map if we found a new value/expression describing the
- // variable, or if the variable wasn't mapped already.
- if (VMI == VariableMap.end() ||
- VMI->second.first != DVI->getValue() ||
- VMI->second.second != DVI->getExpression()) {
- VariableMap[Key] = { DVI->getValue(), DVI->getExpression() };
- continue;
- }
- // Found an identical mapping. Remember the instruction for later removal.
- ToBeRemoved.push_back(DVI);
- }
- }
-
- for (auto &Instr : ToBeRemoved)
- Instr->eraseFromParent();
-
- return !ToBeRemoved.empty();
-}
-
-bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) {
- bool MadeChanges = false;
- // By using the "backward scan" strategy before the "forward scan" strategy we
- // can remove both dbg.value (2) and (3) in a situation like this:
- //
- // (1) dbg.value V1, "x", DIExpression()
- // ...
- // (2) dbg.value V2, "x", DIExpression()
- // (3) dbg.value V1, "x", DIExpression()
- //
- // The backward scan will remove (2) because it is made obsolete by (3). After
- // getting (2) out of the way, the forward scan will remove (3) since "x"
- // is already described as having the value V1 at (1).
- MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB);
- MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB);
-
- if (MadeChanges)
- LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: "
- << BB->getName() << "\n");
- return MadeChanges;
-}
-
-void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
- BasicBlock::iterator &BI, Value *V) {
- Instruction &I = *BI;
- // Replace all of the uses of the instruction with the value.
- I.replaceAllUsesWith(V);
-
- // Make sure to propagate a name if there is one already.
- if (I.hasName() && !V->hasName())
- V->takeName(&I);
-
- // Delete the unnecessary instruction now...
- BI = BIL.erase(BI);
-}
-
-void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
- BasicBlock::iterator &BI, Instruction *I) {
- assert(I->getParent() == nullptr &&
- "ReplaceInstWithInst: Instruction already inserted into basic block!");
-
- // Copy debug location to newly added instruction, if it wasn't already set
- // by the caller.
- if (!I->getDebugLoc())
- I->setDebugLoc(BI->getDebugLoc());
-
- // Insert the new instruction into the basic block...
- BasicBlock::iterator New = BIL.insert(BI, I);
-
- // Replace all uses of the old instruction, and delete it.
- ReplaceInstWithValue(BIL, BI, I);
-
- // Move BI back to point to the newly inserted instruction
- BI = New;
-}
-
-void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
- BasicBlock::iterator BI(From);
- ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
-}
-
-BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
+ DTU->deleteBB(BB);
+ } else {
+ BB->eraseFromParent(); // Nuke BB if DTU is nullptr.
+ }
+
+ return true;
+}
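
A hedged sketch of the usual calling pattern (the helper name is made up): walk the function once and let MergeBlockIntoPredecessor decide, per block, whether folding into the unique predecessor is legal; make_early_inc_range keeps the iteration valid while merged blocks are erased.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper; DTU and LI may be nullptr when those analyses are not
// being preserved.
static bool mergeTrivialBlocks(llvm::Function &F, llvm::DomTreeUpdater *DTU,
                               llvm::LoopInfo *LI) {
  bool Changed = false;
  for (llvm::BasicBlock &BB : llvm::make_early_inc_range(F))
    Changed |= llvm::MergeBlockIntoPredecessor(&BB, DTU, LI);
  return Changed;
}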
+
+bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
+ SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L, DomTreeUpdater *DTU,
+ LoopInfo *LI) {
+ assert(!MergeBlocks.empty() && "MergeBlocks should not be empty");
+
+ bool BlocksHaveBeenMerged = false;
+ while (!MergeBlocks.empty()) {
+ BasicBlock *BB = *MergeBlocks.begin();
+ BasicBlock *Dest = BB->getSingleSuccessor();
+ if (Dest && (!L || L->contains(Dest))) {
+ BasicBlock *Fold = Dest->getUniquePredecessor();
+ (void)Fold;
+ if (MergeBlockIntoPredecessor(Dest, DTU, LI)) {
+ assert(Fold == BB &&
+ "Expecting BB to be unique predecessor of the Dest block");
+ MergeBlocks.erase(Dest);
+ BlocksHaveBeenMerged = true;
+ } else
+ MergeBlocks.erase(BB);
+ } else
+ MergeBlocks.erase(BB);
+ }
+ return BlocksHaveBeenMerged;
+}
+
+/// Remove redundant instructions within sequences of consecutive dbg.value
+/// instructions. This is done using a backward scan to keep the last dbg.value
+/// describing a specific variable/fragment.
+///
+/// BackwardScan strategy:
+/// ----------------------
+/// Given a sequence of consecutive DbgValueInst like this
+///
+/// dbg.value ..., "x", FragmentX1 (*)
+/// dbg.value ..., "y", FragmentY1
+/// dbg.value ..., "x", FragmentX2
+/// dbg.value ..., "x", FragmentX1 (**)
+///
+/// then the instruction marked with (*) can be removed (it is guaranteed to be
+/// obsoleted by the instruction marked with (**) as the latter instruction is
+/// describing the same variable using the same fragment info).
+///
+/// Possible improvements:
+/// - Check fully overlapping fragments and not only identical fragments.
+/// - Support dbg.addr, dbg.declare, dbg.label, and possibly other meta
+/// instructions being part of the sequence of consecutive instructions.
+static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
+ SmallVector<DbgValueInst *, 8> ToBeRemoved;
+ SmallDenseSet<DebugVariable> VariableSet;
+ for (auto &I : reverse(*BB)) {
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
+ DebugVariable Key(DVI->getVariable(),
+ DVI->getExpression(),
+ DVI->getDebugLoc()->getInlinedAt());
+ auto R = VariableSet.insert(Key);
+ // If the same variable fragment is described more than once it is enough
+ // to keep the last one (i.e. the first one found, since we iterate in
+ // reverse).
+ if (!R.second)
+ ToBeRemoved.push_back(DVI);
+ continue;
+ }
+ // Sequence with consecutive dbg.value instrs ended. Clear the set to
+ // restart identifying redundant instructions in case we find another
+ // dbg.value sequence.
+ VariableSet.clear();
+ }
+
+ for (auto &Instr : ToBeRemoved)
+ Instr->eraseFromParent();
+
+ return !ToBeRemoved.empty();
+}
+
+/// Remove redundant dbg.value instructions using a forward scan. This can
+/// remove a dbg.value instruction that is redundant due to indicating that a
+/// variable has the same value as already being indicated by an earlier
+/// dbg.value.
+///
+/// ForwardScan strategy:
+/// ---------------------
+/// Given two identical dbg.value instructions, separated by a block of
+/// instructions that isn't describing the same variable, like this
+///
+/// dbg.value X1, "x", FragmentX1 (**)
+/// <block of instructions, none being "dbg.value ..., "x", ...">
+/// dbg.value X1, "x", FragmentX1 (*)
+///
+/// then the instruction marked with (*) can be removed. Variable "x" is already
+/// described as being mapped to the SSA value X1.
+///
+/// Possible improvements:
+/// - Keep track of non-overlapping fragments.
+static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
+ SmallVector<DbgValueInst *, 8> ToBeRemoved;
+ DenseMap<DebugVariable, std::pair<Value *, DIExpression *> > VariableMap;
+ for (auto &I : *BB) {
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
+ DebugVariable Key(DVI->getVariable(),
+ NoneType(),
+ DVI->getDebugLoc()->getInlinedAt());
+ auto VMI = VariableMap.find(Key);
+ // Update the map if we found a new value/expression describing the
+ // variable, or if the variable wasn't mapped already.
+ if (VMI == VariableMap.end() ||
+ VMI->second.first != DVI->getValue() ||
+ VMI->second.second != DVI->getExpression()) {
+ VariableMap[Key] = { DVI->getValue(), DVI->getExpression() };
+ continue;
+ }
+ // Found an identical mapping. Remember the instruction for later removal.
+ ToBeRemoved.push_back(DVI);
+ }
+ }
+
+ for (auto &Instr : ToBeRemoved)
+ Instr->eraseFromParent();
+
+ return !ToBeRemoved.empty();
+}
+
+bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) {
+ bool MadeChanges = false;
+ // By using the "backward scan" strategy before the "forward scan" strategy we
+ // can remove both dbg.value (2) and (3) in a situation like this:
+ //
+ // (1) dbg.value V1, "x", DIExpression()
+ // ...
+ // (2) dbg.value V2, "x", DIExpression()
+ // (3) dbg.value V1, "x", DIExpression()
+ //
+ // The backward scan will remove (2) because it is made obsolete by (3). After
+ // getting (2) out of the way, the forward scan will remove (3) since "x"
+ // is already described as having the value V1 at (1).
+ MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB);
+ MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB);
+
+ if (MadeChanges)
+ LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: "
+ << BB->getName() << "\n");
+ return MadeChanges;
+}
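
A hedged sketch (names invented here) of how a pass that duplicates or sinks many dbg.value intrinsics might use this as a cheap post-pass cleanup, running both scans on every block:

#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper: returns true if any redundant debug intrinsic was
// removed anywhere in F.
static bool stripRedundantDbgValues(llvm::Function &F) {
  bool Changed = false;
  for (llvm::BasicBlock &BB : F)
    Changed |= llvm::RemoveRedundantDbgInstrs(&BB);
  return Changed;
}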
+
+void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Value *V) {
+ Instruction &I = *BI;
+ // Replace all of the uses of the instruction with the value.
+ I.replaceAllUsesWith(V);
+
+ // Make sure to propagate a name if there is one already.
+ if (I.hasName() && !V->hasName())
+ V->takeName(&I);
+
+ // Delete the unnecessary instruction now...
+ BI = BIL.erase(BI);
+}
+
+void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Instruction *I) {
+ assert(I->getParent() == nullptr &&
+ "ReplaceInstWithInst: Instruction already inserted into basic block!");
+
+ // Copy debug location to newly added instruction, if it wasn't already set
+ // by the caller.
+ if (!I->getDebugLoc())
+ I->setDebugLoc(BI->getDebugLoc());
+
+ // Insert the new instruction into the basic block...
+ BasicBlock::iterator New = BIL.insert(BI, I);
+
+ // Replace all uses of the old instruction, and delete it.
+ ReplaceInstWithValue(BIL, BI, I);
+
+ // Move BI back to point to the newly inserted instruction
+ BI = New;
+}
+
+void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
+ BasicBlock::iterator BI(From);
+ ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
+}
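
A hedged sketch of a common use of the two-argument ReplaceInstWithInst overload: swapping a block's terminator for a freshly created, not-yet-inserted instruction. BB, ThenBB, ElseBB, and Cond are assumed to exist in the caller; the caller also remains responsible for updating PHIs in any successors that change.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper: replace BB's terminator with a conditional branch.
// ReplaceInstWithInst inserts NewBr, copies the old terminator's debug
// location if NewBr has none, and erases the old terminator.
static void replaceTerminatorWithCondBr(llvm::BasicBlock *BB,
                                        llvm::BasicBlock *ThenBB,
                                        llvm::BasicBlock *ElseBB,
                                        llvm::Value *Cond) {
  llvm::Instruction *NewBr = llvm::BranchInst::Create(ThenBB, ElseBB, Cond);
  llvm::ReplaceInstWithInst(BB->getTerminator(), NewBr);
}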
+
+BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
LoopInfo *LI, MemorySSAUpdater *MSSAU,
const Twine &BBName) {
- unsigned SuccNum = GetSuccessorNumber(BB, Succ);
-
- // If this is a critical edge, let SplitCriticalEdge do it.
- Instruction *LatchTerm = BB->getTerminator();
- if (SplitCriticalEdge(
- LatchTerm, SuccNum,
+ unsigned SuccNum = GetSuccessorNumber(BB, Succ);
+
+ // If this is a critical edge, let SplitCriticalEdge do it.
+ Instruction *LatchTerm = BB->getTerminator();
+ if (SplitCriticalEdge(
+ LatchTerm, SuccNum,
CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA(),
BBName))
- return LatchTerm->getSuccessor(SuccNum);
-
- // If the edge isn't critical, then BB has a single successor or Succ has a
- // single pred. Split the block.
- if (BasicBlock *SP = Succ->getSinglePredecessor()) {
- // If the successor only has a single pred, split the top of the successor
- // block.
- assert(SP == BB && "CFG broken");
- SP = nullptr;
+ return LatchTerm->getSuccessor(SuccNum);
+
+ // If the edge isn't critical, then BB has a single successor or Succ has a
+ // single pred. Split the block.
+ if (BasicBlock *SP = Succ->getSinglePredecessor()) {
+ // If the successor only has a single pred, split the top of the successor
+ // block.
+ assert(SP == BB && "CFG broken");
+ SP = nullptr;
return SplitBlock(Succ, &Succ->front(), DT, LI, MSSAU, BBName,
/*Before=*/true);
- }
-
- // Otherwise, if BB has a single successor, split it at the bottom of the
- // block.
- assert(BB->getTerminator()->getNumSuccessors() == 1 &&
- "Should have a single succ!");
+ }
+
+ // Otherwise, if BB has a single successor, split it at the bottom of the
+ // block.
+ assert(BB->getTerminator()->getNumSuccessors() == 1 &&
+ "Should have a single succ!");
return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName);
-}
-
-unsigned
-llvm::SplitAllCriticalEdges(Function &F,
- const CriticalEdgeSplittingOptions &Options) {
- unsigned NumBroken = 0;
- for (BasicBlock &BB : F) {
- Instruction *TI = BB.getTerminator();
- if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) &&
- !isa<CallBrInst>(TI))
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (SplitCriticalEdge(TI, i, Options))
- ++NumBroken;
- }
- return NumBroken;
-}
-
+}
+
+unsigned
+llvm::SplitAllCriticalEdges(Function &F,
+ const CriticalEdgeSplittingOptions &Options) {
+ unsigned NumBroken = 0;
+ for (BasicBlock &BB : F) {
+ Instruction *TI = BB.getTerminator();
+ if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) &&
+ !isa<CallBrInst>(TI))
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (SplitCriticalEdge(TI, i, Options))
+ ++NumBroken;
+ }
+ return NumBroken;
+}
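
A hedged sketch (the wrapper name is illustrative) of invoking SplitAllCriticalEdges while keeping DominatorTree and LoopInfo up to date via CriticalEdgeSplittingOptions:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper: returns the number of critical edges that were split.
static unsigned breakAllCriticalEdges(llvm::Function &F,
                                      llvm::DominatorTree *DT,
                                      llvm::LoopInfo *LI) {
  return llvm::SplitAllCriticalEdges(
      F, llvm::CriticalEdgeSplittingOptions(DT, LI));
}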
+
static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
DomTreeUpdater *DTU, DominatorTree *DT,
LoopInfo *LI, MemorySSAUpdater *MSSAU,
@@ -552,22 +552,22 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
DTU ? DTU : (DT ? &LocalDTU : nullptr), LI, MSSAU,
BBName);
}
- BasicBlock::iterator SplitIt = SplitPt->getIterator();
- while (isa<PHINode>(SplitIt) || SplitIt->isEHPad())
- ++SplitIt;
- std::string Name = BBName.str();
- BasicBlock *New = Old->splitBasicBlock(
- SplitIt, Name.empty() ? Old->getName() + ".split" : Name);
-
- // The new block lives in whichever loop the old one did. This preserves
- // LCSSA as well, because we force the split point to be after any PHI nodes.
- if (LI)
- if (Loop *L = LI->getLoopFor(Old))
- L->addBasicBlockToLoop(New, *LI);
-
+ BasicBlock::iterator SplitIt = SplitPt->getIterator();
+ while (isa<PHINode>(SplitIt) || SplitIt->isEHPad())
+ ++SplitIt;
+ std::string Name = BBName.str();
+ BasicBlock *New = Old->splitBasicBlock(
+ SplitIt, Name.empty() ? Old->getName() + ".split" : Name);
+
+ // The new block lives in whichever loop the old one did. This preserves
+ // LCSSA as well, because we force the split point to be after any PHI nodes.
+ if (LI)
+ if (Loop *L = LI->getLoopFor(Old))
+ L->addBasicBlockToLoop(New, *LI);
+
if (DTU) {
SmallVector<DominatorTree::UpdateType, 8> Updates;
- // Old dominates New. New node dominates all other nodes dominated by Old.
+ // Old dominates New. New node dominates all other nodes dominated by Old.
SmallSetVector<BasicBlock *, 8> UniqueSuccessorsOfOld(succ_begin(New),
succ_end(New));
Updates.push_back({DominatorTree::Insert, Old, New});
@@ -580,22 +580,22 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
DTU->applyUpdates(Updates);
} else if (DT)
// Old dominates New. New node dominates all other nodes dominated by Old.
- if (DomTreeNode *OldNode = DT->getNode(Old)) {
- std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
-
- DomTreeNode *NewNode = DT->addNewBlock(New, Old);
- for (DomTreeNode *I : Children)
- DT->changeImmediateDominator(I, NewNode);
- }
-
- // Move MemoryAccesses still tracked in Old, but part of New now.
- // Update accesses in successor blocks accordingly.
- if (MSSAU)
- MSSAU->moveAllAfterSpliceBlocks(Old, New, &*(New->begin()));
-
- return New;
-}
-
+ if (DomTreeNode *OldNode = DT->getNode(Old)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(New, Old);
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
+ }
+
+ // Move MemoryAccesses still tracked in Old, but part of New now.
+ // Update accesses in successor blocks accordingly.
+ if (MSSAU)
+ MSSAU->moveAllAfterSpliceBlocks(Old, New, &*(New->begin()));
+
+ return New;
+}
+
BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
DominatorTree *DT, LoopInfo *LI,
MemorySSAUpdater *MSSAU, const Twine &BBName,
@@ -656,13 +656,13 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, Instruction *SplitPt,
return New;
}
-/// Update DominatorTree, LoopInfo, and LCSSA analysis information.
-static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
- ArrayRef<BasicBlock *> Preds,
+/// Update DominatorTree, LoopInfo, and LCSSA analysis information.
+static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock *> Preds,
DomTreeUpdater *DTU, DominatorTree *DT,
LoopInfo *LI, MemorySSAUpdater *MSSAU,
- bool PreserveLCSSA, bool &HasLoopExit) {
- // Update dominator tree if available.
+ bool PreserveLCSSA, bool &HasLoopExit) {
+ // Update dominator tree if available.
if (DTU) {
// Recalculation of DomTree is needed when updating a forward DomTree and
// the Entry BB is replaced.
@@ -684,158 +684,158 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
DTU->applyUpdates(Updates);
}
} else if (DT) {
- if (OldBB == DT->getRootNode()->getBlock()) {
- assert(NewBB == &NewBB->getParent()->getEntryBlock());
- DT->setNewRoot(NewBB);
- } else {
- // Split block expects NewBB to have a non-empty set of predecessors.
- DT->splitBlock(NewBB);
- }
- }
-
- // Update MemoryPhis after split if MemorySSA is available
- if (MSSAU)
- MSSAU->wireOldPredecessorsToNewImmediatePredecessor(OldBB, NewBB, Preds);
-
- // The rest of the logic is only relevant for updating the loop structures.
- if (!LI)
- return;
-
+ if (OldBB == DT->getRootNode()->getBlock()) {
+ assert(NewBB == &NewBB->getParent()->getEntryBlock());
+ DT->setNewRoot(NewBB);
+ } else {
+ // Split block expects NewBB to have a non-empty set of predecessors.
+ DT->splitBlock(NewBB);
+ }
+ }
+
+ // Update MemoryPhis after split if MemorySSA is available
+ if (MSSAU)
+ MSSAU->wireOldPredecessorsToNewImmediatePredecessor(OldBB, NewBB, Preds);
+
+ // The rest of the logic is only relevant for updating the loop structures.
+ if (!LI)
+ return;
+
if (DTU && DTU->hasDomTree())
DT = &DTU->getDomTree();
- assert(DT && "DT should be available to update LoopInfo!");
- Loop *L = LI->getLoopFor(OldBB);
-
- // If we need to preserve loop analyses, collect some information about how
- // this split will affect loops.
- bool IsLoopEntry = !!L;
- bool SplitMakesNewLoopHeader = false;
- for (BasicBlock *Pred : Preds) {
- // Preds that are not reachable from entry should not be used to identify if
- // OldBB is a loop entry or if SplitMakesNewLoopHeader. Unreachable blocks
- // are not within any loops, so we would otherwise incorrectly mark
- // SplitMakesNewLoopHeader as true and make NewBB the header of some loop,
- // which breaks LI.
- if (!DT->isReachableFromEntry(Pred))
- continue;
- // If we need to preserve LCSSA, determine if any of the preds is a loop
- // exit.
- if (PreserveLCSSA)
- if (Loop *PL = LI->getLoopFor(Pred))
- if (!PL->contains(OldBB))
- HasLoopExit = true;
-
- // If we need to preserve LoopInfo, note whether any of the preds crosses
- // an interesting loop boundary.
- if (!L)
- continue;
- if (L->contains(Pred))
- IsLoopEntry = false;
- else
- SplitMakesNewLoopHeader = true;
- }
-
- // Unless we have a loop for OldBB, nothing else to do here.
- if (!L)
- return;
-
- if (IsLoopEntry) {
- // Add the new block to the nearest enclosing loop (and not an adjacent
- // loop). To find this, examine each of the predecessors and determine which
- // loops enclose them, and select the most-nested loop which contains the
- // loop containing the block being split.
- Loop *InnermostPredLoop = nullptr;
- for (BasicBlock *Pred : Preds) {
- if (Loop *PredLoop = LI->getLoopFor(Pred)) {
- // Seek a loop which actually contains the block being split (to avoid
- // adjacent loops).
- while (PredLoop && !PredLoop->contains(OldBB))
- PredLoop = PredLoop->getParentLoop();
-
- // Select the most-nested of these loops which contains the block.
- if (PredLoop && PredLoop->contains(OldBB) &&
- (!InnermostPredLoop ||
- InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
- InnermostPredLoop = PredLoop;
- }
- }
-
- if (InnermostPredLoop)
- InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI);
- } else {
- L->addBasicBlockToLoop(NewBB, *LI);
- if (SplitMakesNewLoopHeader)
- L->moveToHeader(NewBB);
- }
-}
-
-/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
-/// This also updates AliasAnalysis, if available.
-static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
- ArrayRef<BasicBlock *> Preds, BranchInst *BI,
- bool HasLoopExit) {
- // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
- SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
- for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
- PHINode *PN = cast<PHINode>(I++);
-
- // Check to see if all of the values coming in are the same. If so, we
- // don't need to create a new PHI node, unless it's needed for LCSSA.
- Value *InVal = nullptr;
- if (!HasLoopExit) {
- InVal = PN->getIncomingValueForBlock(Preds[0]);
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- if (!PredSet.count(PN->getIncomingBlock(i)))
- continue;
- if (!InVal)
- InVal = PN->getIncomingValue(i);
- else if (InVal != PN->getIncomingValue(i)) {
- InVal = nullptr;
- break;
- }
- }
- }
-
- if (InVal) {
- // If all incoming values for the new PHI would be the same, just don't
- // make a new PHI. Instead, just remove the incoming values from the old
- // PHI.
-
- // NOTE! This loop walks backwards for a reason! First off, this minimizes
- // the cost of removal if we end up removing a large number of values, and
- // second off, this ensures that the indices for the incoming values
- // aren't invalidated when we remove one.
- for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i)
- if (PredSet.count(PN->getIncomingBlock(i)))
- PN->removeIncomingValue(i, false);
-
- // Add an incoming value to the PHI node in the loop for the preheader
- // edge.
- PN->addIncoming(InVal, NewBB);
- continue;
- }
-
- // If the values coming into the block are not the same, we need a new
- // PHI.
- // Create the new PHI node, insert it into NewBB at the end of the block
- PHINode *NewPHI =
- PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
-
- // NOTE! This loop walks backwards for a reason! First off, this minimizes
- // the cost of removal if we end up removing a large number of values, and
- // second off, this ensures that the indices for the incoming values aren't
- // invalidated when we remove one.
- for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
- BasicBlock *IncomingBB = PN->getIncomingBlock(i);
- if (PredSet.count(IncomingBB)) {
- Value *V = PN->removeIncomingValue(i, false);
- NewPHI->addIncoming(V, IncomingBB);
- }
- }
-
- PN->addIncoming(NewPHI, NewBB);
- }
-}
-
+ assert(DT && "DT should be available to update LoopInfo!");
+ Loop *L = LI->getLoopFor(OldBB);
+
+ // If we need to preserve loop analyses, collect some information about how
+ // this split will affect loops.
+ bool IsLoopEntry = !!L;
+ bool SplitMakesNewLoopHeader = false;
+ for (BasicBlock *Pred : Preds) {
+ // Preds that are not reachable from entry should not be used to identify if
+ // OldBB is a loop entry or if SplitMakesNewLoopHeader. Unreachable blocks
+ // are not within any loops, so we would otherwise incorrectly mark
+ // SplitMakesNewLoopHeader as true and make NewBB the header of some loop,
+ // which breaks LI.
+ if (!DT->isReachableFromEntry(Pred))
+ continue;
+ // If we need to preserve LCSSA, determine if any of the preds is a loop
+ // exit.
+ if (PreserveLCSSA)
+ if (Loop *PL = LI->getLoopFor(Pred))
+ if (!PL->contains(OldBB))
+ HasLoopExit = true;
+
+ // If we need to preserve LoopInfo, note whether any of the preds crosses
+ // an interesting loop boundary.
+ if (!L)
+ continue;
+ if (L->contains(Pred))
+ IsLoopEntry = false;
+ else
+ SplitMakesNewLoopHeader = true;
+ }
+
+ // Unless we have a loop for OldBB, nothing else to do here.
+ if (!L)
+ return;
+
+ if (IsLoopEntry) {
+ // Add the new block to the nearest enclosing loop (and not an adjacent
+ // loop). To find this, examine each of the predecessors and determine which
+ // loops enclose them, and select the most-nested loop which contains the
+ // loop containing the block being split.
+ Loop *InnermostPredLoop = nullptr;
+ for (BasicBlock *Pred : Preds) {
+ if (Loop *PredLoop = LI->getLoopFor(Pred)) {
+ // Seek a loop which actually contains the block being split (to avoid
+ // adjacent loops).
+ while (PredLoop && !PredLoop->contains(OldBB))
+ PredLoop = PredLoop->getParentLoop();
+
+ // Select the most-nested of these loops which contains the block.
+ if (PredLoop && PredLoop->contains(OldBB) &&
+ (!InnermostPredLoop ||
+ InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
+ InnermostPredLoop = PredLoop;
+ }
+ }
+
+ if (InnermostPredLoop)
+ InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else {
+ L->addBasicBlockToLoop(NewBB, *LI);
+ if (SplitMakesNewLoopHeader)
+ L->moveToHeader(NewBB);
+ }
+}
+
+/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
+/// This also updates AliasAnalysis, if available.
+static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock *> Preds, BranchInst *BI,
+ bool HasLoopExit) {
+ // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
+ SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
+ for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I++);
+
+ // Check to see if all of the values coming in are the same. If so, we
+ // don't need to create a new PHI node, unless it's needed for LCSSA.
+ Value *InVal = nullptr;
+ if (!HasLoopExit) {
+ InVal = PN->getIncomingValueForBlock(Preds[0]);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (!PredSet.count(PN->getIncomingBlock(i)))
+ continue;
+ if (!InVal)
+ InVal = PN->getIncomingValue(i);
+ else if (InVal != PN->getIncomingValue(i)) {
+ InVal = nullptr;
+ break;
+ }
+ }
+ }
+
+ if (InVal) {
+ // If all incoming values for the new PHI would be the same, just don't
+ // make a new PHI. Instead, just remove the incoming values from the old
+ // PHI.
+
+ // NOTE! This loop walks backwards for a reason! First off, this minimizes
+ // the cost of removal if we end up removing a large number of values, and
+ // second off, this ensures that the indices for the incoming values
+ // aren't invalidated when we remove one.
+ for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i)
+ if (PredSet.count(PN->getIncomingBlock(i)))
+ PN->removeIncomingValue(i, false);
+
+ // Add an incoming value to the PHI node in the loop for the preheader
+ // edge.
+ PN->addIncoming(InVal, NewBB);
+ continue;
+ }
+
+ // If the values coming into the block are not the same, we need a new
+ // PHI.
+ // Create the new PHI node, insert it into NewBB at the end of the block
+ PHINode *NewPHI =
+ PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
+
+ // NOTE! This loop walks backwards for a reason! First off, this minimizes
+ // the cost of removal if we end up removing a large number of values, and
+ // second off, this ensures that the indices for the incoming values aren't
+ // invalidated when we remove one.
+ for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ BasicBlock *IncomingBB = PN->getIncomingBlock(i);
+ if (PredSet.count(IncomingBB)) {
+ Value *V = PN->removeIncomingValue(i, false);
+ NewPHI->addIncoming(V, IncomingBB);
+ }
+ }
+
+ PN->addIncoming(NewPHI, NewBB);
+ }
+}
+
static void SplitLandingPadPredecessorsImpl(
BasicBlock *OrigBB, ArrayRef<BasicBlock *> Preds, const char *Suffix1,
const char *Suffix2, SmallVectorImpl<BasicBlock *> &NewBBs,
@@ -847,35 +847,35 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
const char *Suffix, DomTreeUpdater *DTU,
DominatorTree *DT, LoopInfo *LI,
MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- // Do not attempt to split that which cannot be split.
- if (!BB->canSplitPredecessors())
- return nullptr;
-
- // For the landingpads we need to act a bit differently.
- // Delegate this work to the SplitLandingPadPredecessors.
- if (BB->isLandingPad()) {
- SmallVector<BasicBlock*, 2> NewBBs;
- std::string NewName = std::string(Suffix) + ".split-lp";
-
+ // Do not attempt to split that which cannot be split.
+ if (!BB->canSplitPredecessors())
+ return nullptr;
+
+ // For the landingpads we need to act a bit differently.
+ // Delegate this work to the SplitLandingPadPredecessors.
+ if (BB->isLandingPad()) {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ std::string NewName = std::string(Suffix) + ".split-lp";
+
SplitLandingPadPredecessorsImpl(BB, Preds, Suffix, NewName.c_str(), NewBBs,
DTU, DT, LI, MSSAU, PreserveLCSSA);
- return NewBBs[0];
- }
-
- // Create new basic block, insert right before the original block.
- BasicBlock *NewBB = BasicBlock::Create(
- BB->getContext(), BB->getName() + Suffix, BB->getParent(), BB);
-
- // The new block unconditionally branches to the old block.
- BranchInst *BI = BranchInst::Create(BB, NewBB);
+ return NewBBs[0];
+ }
+
+ // Create new basic block, insert right before the original block.
+ BasicBlock *NewBB = BasicBlock::Create(
+ BB->getContext(), BB->getName() + Suffix, BB->getParent(), BB);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI = BranchInst::Create(BB, NewBB);
Loop *L = nullptr;
BasicBlock *OldLatch = nullptr;
- // Splitting the predecessors of a loop header creates a preheader block.
+ // Splitting the predecessors of a loop header creates a preheader block.
if (LI && LI->isLoopHeader(BB)) {
L = LI->getLoopFor(BB);
- // Using the loop start line number prevents debuggers stepping into the
- // loop body for this instruction.
+ // Using the loop start line number prevents debuggers stepping into the
+ // loop body for this instruction.
BI->setDebugLoc(L->getStartLoc());
// If BB is the header of the Loop, it is possible that the loop is
@@ -884,40 +884,40 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
// to be applied to the new latch.
OldLatch = L->getLoopLatch();
} else
- BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
-
- // Move the edges from Preds to point to NewBB instead of BB.
- for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
- // This is slightly more strict than necessary; the minimum requirement
- // is that there be no more than one indirectbr branching to BB. And
- // all BlockAddress uses would need to be updated.
- assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
- assert(!isa<CallBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from a CallBrInst");
- Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
- }
-
- // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
- // node becomes an incoming value for BB's phi node. However, if the Preds
- // list is empty, we need to insert dummy entries into the PHI nodes in BB to
- // account for the newly created predecessor.
- if (Preds.empty()) {
- // Insert dummy values as the incoming value.
- for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
- cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
- }
-
- // Update DominatorTree, LoopInfo, and LCSSA analysis information.
- bool HasLoopExit = false;
+ BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
+
+ // Move the edges from Preds to point to NewBB instead of BB.
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ assert(!isa<CallBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from a CallBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+ }
+
+ // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
+ // node becomes an incoming value for BB's phi node. However, if the Preds
+ // list is empty, we need to insert dummy entries into the PHI nodes in BB to
+ // account for the newly created predecessor.
+ if (Preds.empty()) {
+ // Insert dummy values as the incoming value.
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
+ cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+ }
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ bool HasLoopExit = false;
UpdateAnalysisInformation(BB, NewBB, Preds, DTU, DT, LI, MSSAU, PreserveLCSSA,
- HasLoopExit);
-
- if (!Preds.empty()) {
- // Update the PHI nodes in BB with the values coming from NewBB.
- UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
- }
-
+ HasLoopExit);
+
+ if (!Preds.empty()) {
+ // Update the PHI nodes in BB with the values coming from NewBB.
+ UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
+ }
+
if (OldLatch) {
BasicBlock *NewLatch = L->getLoopLatch();
if (NewLatch != OldLatch) {
@@ -927,9 +927,9 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
}
}
- return NewBB;
-}
-
+ return NewBB;
+}
+
BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix, DominatorTree *DT,
@@ -953,103 +953,103 @@ static void SplitLandingPadPredecessorsImpl(
const char *Suffix2, SmallVectorImpl<BasicBlock *> &NewBBs,
DomTreeUpdater *DTU, DominatorTree *DT, LoopInfo *LI,
MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
-
- // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
- // it right before the original block.
- BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(),
- OrigBB->getName() + Suffix1,
- OrigBB->getParent(), OrigBB);
- NewBBs.push_back(NewBB1);
-
- // The new block unconditionally branches to the old block.
- BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
- BI1->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
-
- // Move the edges from Preds to point to NewBB1 instead of OrigBB.
- for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
- // This is slightly more strict than necessary; the minimum requirement
- // is that there be no more than one indirectbr branching to BB. And
- // all BlockAddress uses would need to be updated.
- assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
- Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
- }
-
- bool HasLoopExit = false;
+ assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
+
+ // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
+ // it right before the original block.
+ BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix1,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB1);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
+ BI1->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
+
+ // Move the edges from Preds to point to NewBB1 instead of OrigBB.
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
+ }
+
+ bool HasLoopExit = false;
UpdateAnalysisInformation(OrigBB, NewBB1, Preds, DTU, DT, LI, MSSAU,
PreserveLCSSA, HasLoopExit);
-
- // Update the PHI nodes in OrigBB with the values coming from NewBB1.
- UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit);
-
- // Move the remaining edges from OrigBB to point to NewBB2.
- SmallVector<BasicBlock*, 8> NewBB2Preds;
- for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB);
- i != e; ) {
- BasicBlock *Pred = *i++;
- if (Pred == NewBB1) continue;
- assert(!isa<IndirectBrInst>(Pred->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
- NewBB2Preds.push_back(Pred);
- e = pred_end(OrigBB);
- }
-
- BasicBlock *NewBB2 = nullptr;
- if (!NewBB2Preds.empty()) {
- // Create another basic block for the rest of OrigBB's predecessors.
- NewBB2 = BasicBlock::Create(OrigBB->getContext(),
- OrigBB->getName() + Suffix2,
- OrigBB->getParent(), OrigBB);
- NewBBs.push_back(NewBB2);
-
- // The new block unconditionally branches to the old block.
- BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
- BI2->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
-
- // Move the remaining edges from OrigBB to point to NewBB2.
- for (BasicBlock *NewBB2Pred : NewBB2Preds)
- NewBB2Pred->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
-
- // Update DominatorTree, LoopInfo, and LCSSA analysis information.
- HasLoopExit = false;
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB1.
+ UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit);
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ SmallVector<BasicBlock*, 8> NewBB2Preds;
+ for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB);
+ i != e; ) {
+ BasicBlock *Pred = *i++;
+ if (Pred == NewBB1) continue;
+ assert(!isa<IndirectBrInst>(Pred->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ NewBB2Preds.push_back(Pred);
+ e = pred_end(OrigBB);
+ }
+
+ BasicBlock *NewBB2 = nullptr;
+ if (!NewBB2Preds.empty()) {
+ // Create another basic block for the rest of OrigBB's predecessors.
+ NewBB2 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix2,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB2);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
+ BI2->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ for (BasicBlock *NewBB2Pred : NewBB2Preds)
+ NewBB2Pred->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ HasLoopExit = false;
UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, DTU, DT, LI, MSSAU,
- PreserveLCSSA, HasLoopExit);
-
- // Update the PHI nodes in OrigBB with the values coming from NewBB2.
- UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit);
- }
-
- LandingPadInst *LPad = OrigBB->getLandingPadInst();
- Instruction *Clone1 = LPad->clone();
- Clone1->setName(Twine("lpad") + Suffix1);
- NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1);
-
- if (NewBB2) {
- Instruction *Clone2 = LPad->clone();
- Clone2->setName(Twine("lpad") + Suffix2);
- NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2);
-
- // Create a PHI node for the two cloned landingpad instructions only
- // if the original landingpad instruction has some uses.
- if (!LPad->use_empty()) {
- assert(!LPad->getType()->isTokenTy() &&
- "Split cannot be applied if LPad is token type. Otherwise an "
- "invalid PHINode of token type would be created.");
- PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
- PN->addIncoming(Clone1, NewBB1);
- PN->addIncoming(Clone2, NewBB2);
- LPad->replaceAllUsesWith(PN);
- }
- LPad->eraseFromParent();
- } else {
- // There is no second clone. Just replace the landing pad with the first
- // clone.
- LPad->replaceAllUsesWith(Clone1);
- LPad->eraseFromParent();
- }
-}
-
+ PreserveLCSSA, HasLoopExit);
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB2.
+ UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit);
+ }
+
+ LandingPadInst *LPad = OrigBB->getLandingPadInst();
+ Instruction *Clone1 = LPad->clone();
+ Clone1->setName(Twine("lpad") + Suffix1);
+ NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1);
+
+ if (NewBB2) {
+ Instruction *Clone2 = LPad->clone();
+ Clone2->setName(Twine("lpad") + Suffix2);
+ NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2);
+
+ // Create a PHI node for the two cloned landingpad instructions only
+ // if the original landingpad instruction has some uses.
+ if (!LPad->use_empty()) {
+ assert(!LPad->getType()->isTokenTy() &&
+ "Split cannot be applied if LPad is token type. Otherwise an "
+ "invalid PHINode of token type would be created.");
+ PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
+ PN->addIncoming(Clone1, NewBB1);
+ PN->addIncoming(Clone2, NewBB2);
+ LPad->replaceAllUsesWith(PN);
+ }
+ LPad->eraseFromParent();
+ } else {
+ // There is no second clone. Just replace the landing pad with the first
+ // clone.
+ LPad->replaceAllUsesWith(Clone1);
+ LPad->eraseFromParent();
+ }
+}
+
void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix1, const char *Suffix2,
@@ -1073,73 +1073,73 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
PreserveLCSSA);
}
-ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
- BasicBlock *Pred,
- DomTreeUpdater *DTU) {
- Instruction *UncondBranch = Pred->getTerminator();
- // Clone the return and add it to the end of the predecessor.
- Instruction *NewRet = RI->clone();
- Pred->getInstList().push_back(NewRet);
-
- // If the return instruction returns a value, and if the value was a
- // PHI node in "BB", propagate the right value into the return.
- for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
- i != e; ++i) {
- Value *V = *i;
- Instruction *NewBC = nullptr;
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
- // Return value might be bitcasted. Clone and insert it before the
- // return instruction.
- V = BCI->getOperand(0);
- NewBC = BCI->clone();
- Pred->getInstList().insert(NewRet->getIterator(), NewBC);
- *i = NewBC;
- }
-
- Instruction *NewEV = nullptr;
- if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
- V = EVI->getOperand(0);
- NewEV = EVI->clone();
- if (NewBC) {
- NewBC->setOperand(0, NewEV);
- Pred->getInstList().insert(NewBC->getIterator(), NewEV);
- } else {
- Pred->getInstList().insert(NewRet->getIterator(), NewEV);
- *i = NewEV;
- }
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(V)) {
- if (PN->getParent() == BB) {
- if (NewEV) {
- NewEV->setOperand(0, PN->getIncomingValueForBlock(Pred));
- } else if (NewBC)
- NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
- else
- *i = PN->getIncomingValueForBlock(Pred);
- }
- }
- }
-
- // Update any PHI nodes in the returning block to realize that we no
- // longer branch to them.
- BB->removePredecessor(Pred);
- UncondBranch->eraseFromParent();
-
- if (DTU)
- DTU->applyUpdates({{DominatorTree::Delete, Pred, BB}});
-
- return cast<ReturnInst>(NewRet);
-}
-
+ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
+ BasicBlock *Pred,
+ DomTreeUpdater *DTU) {
+ Instruction *UncondBranch = Pred->getTerminator();
+ // Clone the return and add it to the end of the predecessor.
+ Instruction *NewRet = RI->clone();
+ Pred->getInstList().push_back(NewRet);
+
+ // If the return instruction returns a value, and if the value was a
+ // PHI node in "BB", propagate the right value into the return.
+ for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
+ i != e; ++i) {
+ Value *V = *i;
+ Instruction *NewBC = nullptr;
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
+ // Return value might be bitcasted. Clone and insert it before the
+ // return instruction.
+ V = BCI->getOperand(0);
+ NewBC = BCI->clone();
+ Pred->getInstList().insert(NewRet->getIterator(), NewBC);
+ *i = NewBC;
+ }
+
+ Instruction *NewEV = nullptr;
+ if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+ V = EVI->getOperand(0);
+ NewEV = EVI->clone();
+ if (NewBC) {
+ NewBC->setOperand(0, NewEV);
+ Pred->getInstList().insert(NewBC->getIterator(), NewEV);
+ } else {
+ Pred->getInstList().insert(NewRet->getIterator(), NewEV);
+ *i = NewEV;
+ }
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (PN->getParent() == BB) {
+ if (NewEV) {
+ NewEV->setOperand(0, PN->getIncomingValueForBlock(Pred));
+ } else if (NewBC)
+ NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
+ else
+ *i = PN->getIncomingValueForBlock(Pred);
+ }
+ }
+ }
+
+ // Update any PHI nodes in the returning block to realize that we no
+ // longer branch to them.
+ BB->removePredecessor(Pred);
+ UncondBranch->eraseFromParent();
+
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Delete, Pred, BB}});
+
+ return cast<ReturnInst>(NewRet);
+}
+
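// Illustrative usage sketch (an assumption, not part of the original file):
// fold a return back into a predecessor that unconditionally branches to the
// returning block.  "RetBB" and "Pred" are hypothetical caller-provided
// blocks; Pred must end in an unconditional branch to RetBB, and RetBB's
// terminator must be a ReturnInst.
static ReturnInst *foldReturnExample(BasicBlock *RetBB, BasicBlock *Pred,
                                     DomTreeUpdater *DTU) {
  ReturnInst *RI = cast<ReturnInst>(RetBB->getTerminator());
  // Clones RI into Pred, rewrites any PHI-carried return value and erases the
  // unconditional branch; the Pred->RetBB dominator edge is deleted via DTU.
  return FoldReturnIntoUncondBranch(RI, RetBB, Pred, DTU);
}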
static Instruction *
SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore,
bool Unreachable, MDNode *BranchWeights,
DomTreeUpdater *DTU, DominatorTree *DT,
LoopInfo *LI, BasicBlock *ThenBlock) {
SmallVector<DominatorTree::UpdateType, 8> Updates;
- BasicBlock *Head = SplitBefore->getParent();
- BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
+ BasicBlock *Head = SplitBefore->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
if (DTU) {
SmallSetVector<BasicBlock *, 8> UniqueSuccessorsOfHead(succ_begin(Tail),
succ_end(Tail));
@@ -1150,57 +1150,57 @@ SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore,
Updates.push_back({DominatorTree::Delete, Head, UniqueSuccessorOfHead});
}
}
- Instruction *HeadOldTerm = Head->getTerminator();
- LLVMContext &C = Head->getContext();
- Instruction *CheckTerm;
- bool CreateThenBlock = (ThenBlock == nullptr);
- if (CreateThenBlock) {
- ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
- if (Unreachable)
- CheckTerm = new UnreachableInst(C, ThenBlock);
+ Instruction *HeadOldTerm = Head->getTerminator();
+ LLVMContext &C = Head->getContext();
+ Instruction *CheckTerm;
+ bool CreateThenBlock = (ThenBlock == nullptr);
+ if (CreateThenBlock) {
+ ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ if (Unreachable)
+ CheckTerm = new UnreachableInst(C, ThenBlock);
else {
- CheckTerm = BranchInst::Create(Tail, ThenBlock);
+ CheckTerm = BranchInst::Create(Tail, ThenBlock);
if (DTU)
Updates.push_back({DominatorTree::Insert, ThenBlock, Tail});
}
- CheckTerm->setDebugLoc(SplitBefore->getDebugLoc());
- } else
- CheckTerm = ThenBlock->getTerminator();
- BranchInst *HeadNewTerm =
+ CheckTerm->setDebugLoc(SplitBefore->getDebugLoc());
+ } else
+ CheckTerm = ThenBlock->getTerminator();
+ BranchInst *HeadNewTerm =
BranchInst::Create(/*ifTrue*/ ThenBlock, /*ifFalse*/ Tail, Cond);
if (DTU)
Updates.push_back({DominatorTree::Insert, Head, ThenBlock});
- HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
- ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
-
+ HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+ ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+
if (DTU)
DTU->applyUpdates(Updates);
else if (DT) {
- if (DomTreeNode *OldNode = DT->getNode(Head)) {
- std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
-
- DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
- for (DomTreeNode *Child : Children)
- DT->changeImmediateDominator(Child, NewNode);
-
- // Head dominates ThenBlock.
- if (CreateThenBlock)
- DT->addNewBlock(ThenBlock, Head);
- else
- DT->changeImmediateDominator(ThenBlock, Head);
- }
- }
-
- if (LI) {
- if (Loop *L = LI->getLoopFor(Head)) {
- L->addBasicBlockToLoop(ThenBlock, *LI);
- L->addBasicBlockToLoop(Tail, *LI);
- }
- }
-
- return CheckTerm;
-}
-
+ if (DomTreeNode *OldNode = DT->getNode(Head)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
+ for (DomTreeNode *Child : Children)
+ DT->changeImmediateDominator(Child, NewNode);
+
+ // Head dominates ThenBlock.
+ if (CreateThenBlock)
+ DT->addNewBlock(ThenBlock, Head);
+ else
+ DT->changeImmediateDominator(ThenBlock, Head);
+ }
+ }
+
+ if (LI) {
+ if (Loop *L = LI->getLoopFor(Head)) {
+ L->addBasicBlockToLoop(ThenBlock, *LI);
+ L->addBasicBlockToLoop(Tail, *LI);
+ }
+ }
+
+ return CheckTerm;
+}
+
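// Illustrative usage sketch (an assumption, not part of the original file):
// a typical client of the SplitBlockAndInsertIfThen wrapper defined below,
// emitting a call to a hypothetical no-argument function "TraceFn" that only
// runs when the caller-provided "Flag" is true.
static void insertGuardedCallExample(Value *Flag, Instruction *InsertPt,
                                     Function *TraceFn) {
  // Splits the block at InsertPt, branches on Flag into a fresh then-block
  // that falls through to the tail, and returns that block's terminator.
  Instruction *CheckTerm =
      SplitBlockAndInsertIfThen(Flag, InsertPt, /*Unreachable=*/false);
  CallInst::Create(TraceFn, "", CheckTerm);
}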
Instruction *llvm::SplitBlockAndInsertIfThen(Value *Cond,
Instruction *SplitBefore,
bool Unreachable,
@@ -1222,358 +1222,358 @@ Instruction *llvm::SplitBlockAndInsertIfThen(Value *Cond,
ThenBlock);
}
-void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
- Instruction **ThenTerm,
- Instruction **ElseTerm,
- MDNode *BranchWeights) {
- BasicBlock *Head = SplitBefore->getParent();
- BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
- Instruction *HeadOldTerm = Head->getTerminator();
- LLVMContext &C = Head->getContext();
- BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
- BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
- *ThenTerm = BranchInst::Create(Tail, ThenBlock);
- (*ThenTerm)->setDebugLoc(SplitBefore->getDebugLoc());
- *ElseTerm = BranchInst::Create(Tail, ElseBlock);
- (*ElseTerm)->setDebugLoc(SplitBefore->getDebugLoc());
- BranchInst *HeadNewTerm =
- BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/ElseBlock, Cond);
- HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
- ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
-}
-
-Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
- BasicBlock *&IfFalse) {
- PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
- BasicBlock *Pred1 = nullptr;
- BasicBlock *Pred2 = nullptr;
-
- if (SomePHI) {
- if (SomePHI->getNumIncomingValues() != 2)
- return nullptr;
- Pred1 = SomePHI->getIncomingBlock(0);
- Pred2 = SomePHI->getIncomingBlock(1);
- } else {
- pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
- if (PI == PE) // No predecessor
- return nullptr;
- Pred1 = *PI++;
- if (PI == PE) // Only one predecessor
- return nullptr;
- Pred2 = *PI++;
- if (PI != PE) // More than two predecessors
- return nullptr;
- }
-
- // We can only handle branches. Other control flow will be lowered to
- // branches if possible anyway.
- BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
- BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
- if (!Pred1Br || !Pred2Br)
- return nullptr;
-
- // Eliminate code duplication by ensuring that Pred1Br is conditional if
- // either is.
- if (Pred2Br->isConditional()) {
- // If both branches are conditional, we don't have an "if statement". In
- // reality, we could transform this case, but since the condition will be
- // required anyway, we stand no chance of eliminating it, so the xform is
- // probably not profitable.
- if (Pred1Br->isConditional())
- return nullptr;
-
- std::swap(Pred1, Pred2);
- std::swap(Pred1Br, Pred2Br);
- }
-
- if (Pred1Br->isConditional()) {
- // The only thing we have to watch out for here is to make sure that Pred2
- // doesn't have incoming edges from other blocks. If it does, the condition
- // doesn't dominate BB.
- if (!Pred2->getSinglePredecessor())
- return nullptr;
-
- // If we found a conditional branch predecessor, make sure that it branches
- // to BB and Pred2Br. If it doesn't, this isn't an "if statement".
- if (Pred1Br->getSuccessor(0) == BB &&
- Pred1Br->getSuccessor(1) == Pred2) {
- IfTrue = Pred1;
- IfFalse = Pred2;
- } else if (Pred1Br->getSuccessor(0) == Pred2 &&
- Pred1Br->getSuccessor(1) == BB) {
- IfTrue = Pred2;
- IfFalse = Pred1;
- } else {
- // We know that one arm of the conditional goes to BB, so the other must
- // go somewhere unrelated, and this must not be an "if statement".
- return nullptr;
- }
-
- return Pred1Br->getCondition();
- }
-
- // Ok, if we got here, both predecessors end with an unconditional branch to
- // BB. Don't panic! If both blocks only have a single (identical)
- // predecessor, and THAT is a conditional branch, then we're all ok!
- BasicBlock *CommonPred = Pred1->getSinglePredecessor();
- if (CommonPred == nullptr || CommonPred != Pred2->getSinglePredecessor())
- return nullptr;
-
- // Otherwise, if this is a conditional branch, then we can use it!
- BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
- if (!BI) return nullptr;
-
- assert(BI->isConditional() && "Two successors but not conditional?");
- if (BI->getSuccessor(0) == Pred1) {
- IfTrue = Pred1;
- IfFalse = Pred2;
- } else {
- IfTrue = Pred2;
- IfFalse = Pred1;
- }
- return BI->getCondition();
-}
-
-// After creating a control flow hub, the operands of PHINodes in an outgoing
-// block Out no longer match the predecessors of that block. Predecessors of Out
-// that are incoming blocks to the hub are now replaced by just one edge from
-// the hub. To match this new control flow, the corresponding values from each
-// PHINode must now be moved to a new PHINode in the first guard block of the hub.
-//
-// This operation cannot be performed with SSAUpdater, because it involves one
-// new use: If the block Out is in the list of Incoming blocks, then the newly
-// created PHI in the Hub will use itself along that edge from Out to Hub.
-static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
- const SetVector<BasicBlock *> &Incoming,
- BasicBlock *FirstGuardBlock) {
- auto I = Out->begin();
- while (I != Out->end() && isa<PHINode>(I)) {
- auto Phi = cast<PHINode>(I);
- auto NewPhi =
- PHINode::Create(Phi->getType(), Incoming.size(),
- Phi->getName() + ".moved", &FirstGuardBlock->back());
- for (auto In : Incoming) {
- Value *V = UndefValue::get(Phi->getType());
- if (In == Out) {
- V = NewPhi;
- } else if (Phi->getBasicBlockIndex(In) != -1) {
- V = Phi->removeIncomingValue(In, false);
- }
- NewPhi->addIncoming(V, In);
- }
- assert(NewPhi->getNumIncomingValues() == Incoming.size());
- if (Phi->getNumOperands() == 0) {
- Phi->replaceAllUsesWith(NewPhi);
- I = Phi->eraseFromParent();
- continue;
- }
- Phi->addIncoming(NewPhi, GuardBlock);
- ++I;
- }
-}
-
-using BBPredicates = DenseMap<BasicBlock *, PHINode *>;
-using BBSetVector = SetVector<BasicBlock *>;
-
-// Redirects the terminator of the incoming block to the first guard
-// block in the hub. The condition of the original terminator (if it
-// was conditional) and its original successors are returned as a
-// tuple <condition, succ0, succ1>. The function additionally filters
-// out successors that are not in the set of outgoing blocks.
-//
-// - condition is non-null iff the branch is conditional.
-// - Succ0 is non-null iff the sole/taken target is an outgoing block.
-// - Succ1 is non-null iff condition is non-null and the fallthrough
-// target is an outgoing block.
-static std::tuple<Value *, BasicBlock *, BasicBlock *>
-redirectToHub(BasicBlock *BB, BasicBlock *FirstGuardBlock,
- const BBSetVector &Outgoing) {
- auto Branch = cast<BranchInst>(BB->getTerminator());
- auto Condition = Branch->isConditional() ? Branch->getCondition() : nullptr;
-
- BasicBlock *Succ0 = Branch->getSuccessor(0);
- BasicBlock *Succ1 = nullptr;
- Succ0 = Outgoing.count(Succ0) ? Succ0 : nullptr;
-
- if (Branch->isUnconditional()) {
- Branch->setSuccessor(0, FirstGuardBlock);
- assert(Succ0);
- } else {
- Succ1 = Branch->getSuccessor(1);
- Succ1 = Outgoing.count(Succ1) ? Succ1 : nullptr;
- assert(Succ0 || Succ1);
- if (Succ0 && !Succ1) {
- Branch->setSuccessor(0, FirstGuardBlock);
- } else if (Succ1 && !Succ0) {
- Branch->setSuccessor(1, FirstGuardBlock);
- } else {
- Branch->eraseFromParent();
- BranchInst::Create(FirstGuardBlock, BB);
- }
- }
-
- assert(Succ0 || Succ1);
- return std::make_tuple(Condition, Succ0, Succ1);
-}
-
-// Capture the existing control flow as guard predicates, and redirect
-// control flow from every incoming block to the first guard block in
-// the hub.
-//
-// There is one guard predicate for each outgoing block OutBB. The
-// predicate is a PHINode with one input for each InBB which
-// represents whether the hub should transfer control flow to OutBB if
-// it arrived from InBB. These predicates are NOT ORTHOGONAL. The Hub
-// evaluates them in the same order as the Outgoing set-vector, and
-// control branches to the first outgoing block whose predicate
-// evaluates to true.
-static void convertToGuardPredicates(
- BasicBlock *FirstGuardBlock, BBPredicates &GuardPredicates,
- SmallVectorImpl<WeakVH> &DeletionCandidates, const BBSetVector &Incoming,
- const BBSetVector &Outgoing) {
- auto &Context = Incoming.front()->getContext();
- auto BoolTrue = ConstantInt::getTrue(Context);
- auto BoolFalse = ConstantInt::getFalse(Context);
-
- // The predicate for the last outgoing is trivially true, and so we
- // process only the first N-1 successors.
- for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
- auto Out = Outgoing[i];
- LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n");
- auto Phi =
- PHINode::Create(Type::getInt1Ty(Context), Incoming.size(),
- StringRef("Guard.") + Out->getName(), FirstGuardBlock);
- GuardPredicates[Out] = Phi;
- }
-
- for (auto In : Incoming) {
- Value *Condition;
- BasicBlock *Succ0;
- BasicBlock *Succ1;
- std::tie(Condition, Succ0, Succ1) =
- redirectToHub(In, FirstGuardBlock, Outgoing);
-
- // Optimization: Consider an incoming block A with both successors
- // Succ0 and Succ1 in the set of outgoing blocks. The predicates
- // for Succ0 and Succ1 complement each other. If Succ0 is visited
- // first in the loop below, control will branch to Succ0 using the
- // corresponding predicate. But if that branch is not taken, then
- // control must reach Succ1, which means that the predicate for
- // Succ1 is always true.
- bool OneSuccessorDone = false;
- for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
- auto Out = Outgoing[i];
- auto Phi = GuardPredicates[Out];
- if (Out != Succ0 && Out != Succ1) {
- Phi->addIncoming(BoolFalse, In);
- continue;
- }
- // Optimization: When only one successor is an outgoing block,
- // the predicate is always true.
- if (!Succ0 || !Succ1 || OneSuccessorDone) {
- Phi->addIncoming(BoolTrue, In);
- continue;
- }
- assert(Succ0 && Succ1);
- OneSuccessorDone = true;
- if (Out == Succ0) {
- Phi->addIncoming(Condition, In);
- continue;
- }
- auto Inverted = invertCondition(Condition);
- DeletionCandidates.push_back(Condition);
- Phi->addIncoming(Inverted, In);
- }
- }
-}
-
-// For each outgoing block OutBB, create a guard block in the Hub. The
-// first guard block was already created outside, and available as the
-// first element in the vector of guard blocks.
-//
-// Each guard block terminates in a conditional branch that transfers
-// control to the corresponding outgoing block or the next guard
-// block. The last guard block has two outgoing blocks as successors
-// since the condition for the final outgoing block is trivially
-// true. So we create one less block (including the first guard block)
-// than the number of outgoing blocks.
-static void createGuardBlocks(SmallVectorImpl<BasicBlock *> &GuardBlocks,
- Function *F, const BBSetVector &Outgoing,
- BBPredicates &GuardPredicates, StringRef Prefix) {
- for (int i = 0, e = Outgoing.size() - 2; i != e; ++i) {
- GuardBlocks.push_back(
- BasicBlock::Create(F->getContext(), Prefix + ".guard", F));
- }
- assert(GuardBlocks.size() == GuardPredicates.size());
-
- // To help keep the loop simple, temporarily append the last
- // outgoing block to the list of guard blocks.
- GuardBlocks.push_back(Outgoing.back());
-
- for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) {
- auto Out = Outgoing[i];
- assert(GuardPredicates.count(Out));
- BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out],
- GuardBlocks[i]);
- }
-
- // Remove the last block from the guard list.
- GuardBlocks.pop_back();
-}
-
-BasicBlock *llvm::CreateControlFlowHub(
- DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
- const BBSetVector &Incoming, const BBSetVector &Outgoing,
- const StringRef Prefix) {
- auto F = Incoming.front()->getParent();
- auto FirstGuardBlock =
- BasicBlock::Create(F->getContext(), Prefix + ".guard", F);
-
- SmallVector<DominatorTree::UpdateType, 16> Updates;
- if (DTU) {
- for (auto In : Incoming) {
+void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
+ Instruction **ThenTerm,
+ Instruction **ElseTerm,
+ MDNode *BranchWeights) {
+ BasicBlock *Head = SplitBefore->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
+ Instruction *HeadOldTerm = Head->getTerminator();
+ LLVMContext &C = Head->getContext();
+ BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ *ThenTerm = BranchInst::Create(Tail, ThenBlock);
+ (*ThenTerm)->setDebugLoc(SplitBefore->getDebugLoc());
+ *ElseTerm = BranchInst::Create(Tail, ElseBlock);
+ (*ElseTerm)->setDebugLoc(SplitBefore->getDebugLoc());
+ BranchInst *HeadNewTerm =
+ BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/ElseBlock, Cond);
+ HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+ ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+}
+
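// Illustrative usage sketch (an assumption, not part of the original file):
// build a diamond around SplitBefore and merge two caller-provided values,
// "TrueV" and "FalseV", with a PHI in the tail block.
static PHINode *insertDiamondExample(Value *Cond, Instruction *SplitBefore,
                                     Value *TrueV, Value *FalseV) {
  Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  SplitBlockAndInsertIfThenElse(Cond, SplitBefore, &ThenTerm, &ElseTerm);
  // SplitBefore is now the first instruction of the tail block, whose only
  // predecessors are the new then- and else-blocks.
  PHINode *PN = PHINode::Create(TrueV->getType(), 2, "merge", SplitBefore);
  PN->addIncoming(TrueV, ThenTerm->getParent());
  PN->addIncoming(FalseV, ElseTerm->getParent());
  return PN;
}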
+Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+ BasicBlock *&IfFalse) {
+ PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
+ BasicBlock *Pred1 = nullptr;
+ BasicBlock *Pred2 = nullptr;
+
+ if (SomePHI) {
+ if (SomePHI->getNumIncomingValues() != 2)
+ return nullptr;
+ Pred1 = SomePHI->getIncomingBlock(0);
+ Pred2 = SomePHI->getIncomingBlock(1);
+ } else {
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE) // No predecessor
+ return nullptr;
+ Pred1 = *PI++;
+ if (PI == PE) // Only one predecessor
+ return nullptr;
+ Pred2 = *PI++;
+ if (PI != PE) // More than two predecessors
+ return nullptr;
+ }
+
+ // We can only handle branches. Other control flow will be lowered to
+ // branches if possible anyway.
+ BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
+ BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
+ if (!Pred1Br || !Pred2Br)
+ return nullptr;
+
+ // Eliminate code duplication by ensuring that Pred1Br is conditional if
+ // either is.
+ if (Pred2Br->isConditional()) {
+ // If both branches are conditional, we don't have an "if statement". In
+ // reality, we could transform this case, but since the condition will be
+ // required anyway, we stand no chance of eliminating it, so the xform is
+ // probably not profitable.
+ if (Pred1Br->isConditional())
+ return nullptr;
+
+ std::swap(Pred1, Pred2);
+ std::swap(Pred1Br, Pred2Br);
+ }
+
+ if (Pred1Br->isConditional()) {
+ // The only thing we have to watch out for here is to make sure that Pred2
+ // doesn't have incoming edges from other blocks. If it does, the condition
+ // doesn't dominate BB.
+ if (!Pred2->getSinglePredecessor())
+ return nullptr;
+
+ // If we found a conditional branch predecessor, make sure that it branches
+ // to BB and Pred2Br. If it doesn't, this isn't an "if statement".
+ if (Pred1Br->getSuccessor(0) == BB &&
+ Pred1Br->getSuccessor(1) == Pred2) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+ Pred1Br->getSuccessor(1) == BB) {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ } else {
+ // We know that one arm of the conditional goes to BB, so the other must
+ // go somewhere unrelated, and this must not be an "if statement".
+ return nullptr;
+ }
+
+ return Pred1Br->getCondition();
+ }
+
+ // Ok, if we got here, both predecessors end with an unconditional branch to
+ // BB. Don't panic! If both blocks only have a single (identical)
+ // predecessor, and THAT is a conditional branch, then we're all ok!
+ BasicBlock *CommonPred = Pred1->getSinglePredecessor();
+ if (CommonPred == nullptr || CommonPred != Pred2->getSinglePredecessor())
+ return nullptr;
+
+ // Otherwise, if this is a conditional branch, then we can use it!
+ BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
+ if (!BI) return nullptr;
+
+ assert(BI->isConditional() && "Two successors but not conditional?");
+ if (BI->getSuccessor(0) == Pred1) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ }
+ return BI->getCondition();
+}
+
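// Illustrative usage sketch (an assumption, not part of the original file):
// detect whether a caller-provided "MergeBB" is the join point of an
// if-diamond and, if so, report the controlling condition and the two
// guarded predecessors.
static Value *matchIfDiamondExample(BasicBlock *MergeBB) {
  BasicBlock *IfTrue = nullptr, *IfFalse = nullptr;
  if (Value *Cond = GetIfCondition(MergeBB, IfTrue, IfFalse)) {
    LLVM_DEBUG(dbgs() << "condition " << *Cond << " selects between "
                      << IfTrue->getName() << " and " << IfFalse->getName()
                      << "\n");
    return Cond;
  }
  return nullptr;
}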
+// After creating a control flow hub, the operands of PHINodes in an outgoing
+// block Out no longer match the predecessors of that block. Predecessors of Out
+// that are incoming blocks to the hub are now replaced by just one edge from
+// the hub. To match this new control flow, the corresponding values from each
+// PHINode must now be moved to a new PHINode in the first guard block of the hub.
+//
+// This operation cannot be performed with SSAUpdater, because it involves one
+// new use: If the block Out is in the list of Incoming blocks, then the newly
+// created PHI in the Hub will use itself along that edge from Out to Hub.
+static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
+ const SetVector<BasicBlock *> &Incoming,
+ BasicBlock *FirstGuardBlock) {
+ auto I = Out->begin();
+ while (I != Out->end() && isa<PHINode>(I)) {
+ auto Phi = cast<PHINode>(I);
+ auto NewPhi =
+ PHINode::Create(Phi->getType(), Incoming.size(),
+ Phi->getName() + ".moved", &FirstGuardBlock->back());
+ for (auto In : Incoming) {
+ Value *V = UndefValue::get(Phi->getType());
+ if (In == Out) {
+ V = NewPhi;
+ } else if (Phi->getBasicBlockIndex(In) != -1) {
+ V = Phi->removeIncomingValue(In, false);
+ }
+ NewPhi->addIncoming(V, In);
+ }
+ assert(NewPhi->getNumIncomingValues() == Incoming.size());
+ if (Phi->getNumOperands() == 0) {
+ Phi->replaceAllUsesWith(NewPhi);
+ I = Phi->eraseFromParent();
+ continue;
+ }
+ Phi->addIncoming(NewPhi, GuardBlock);
+ ++I;
+ }
+}
+
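// Worked example (illustrative, with assumed names): if Out begins with
//   %p = phi i32 [ %v, %A ], [ %w, %B ]
// and only %A is rerouted through the hub, then %p.moved is created in the
// first guard block with the entry [ %v, %A ] (and undef for any other
// incoming block), %A's entry is removed from %p, and %p gains an entry for
// the guard block: %p = phi i32 [ %w, %B ], [ %p.moved, %GuardBlock ].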
+using BBPredicates = DenseMap<BasicBlock *, PHINode *>;
+using BBSetVector = SetVector<BasicBlock *>;
+
+// Redirects the terminator of the incoming block to the first guard
+// block in the hub. The condition of the original terminator (if it
+// was conditional) and its original successors are returned as a
+// tuple <condition, succ0, succ1>. The function additionally filters
+// out successors that are not in the set of outgoing blocks.
+//
+// - condition is non-null iff the branch is conditional.
+// - Succ0 is non-null iff the sole/taken target is an outgoing block.
+// - Succ1 is non-null iff condition is non-null and the fallthrough
+// target is an outgoing block.
+static std::tuple<Value *, BasicBlock *, BasicBlock *>
+redirectToHub(BasicBlock *BB, BasicBlock *FirstGuardBlock,
+ const BBSetVector &Outgoing) {
+ auto Branch = cast<BranchInst>(BB->getTerminator());
+ auto Condition = Branch->isConditional() ? Branch->getCondition() : nullptr;
+
+ BasicBlock *Succ0 = Branch->getSuccessor(0);
+ BasicBlock *Succ1 = nullptr;
+ Succ0 = Outgoing.count(Succ0) ? Succ0 : nullptr;
+
+ if (Branch->isUnconditional()) {
+ Branch->setSuccessor(0, FirstGuardBlock);
+ assert(Succ0);
+ } else {
+ Succ1 = Branch->getSuccessor(1);
+ Succ1 = Outgoing.count(Succ1) ? Succ1 : nullptr;
+ assert(Succ0 || Succ1);
+ if (Succ0 && !Succ1) {
+ Branch->setSuccessor(0, FirstGuardBlock);
+ } else if (Succ1 && !Succ0) {
+ Branch->setSuccessor(1, FirstGuardBlock);
+ } else {
+ Branch->eraseFromParent();
+ BranchInst::Create(FirstGuardBlock, BB);
+ }
+ }
+
+ assert(Succ0 || Succ1);
+ return std::make_tuple(Condition, Succ0, Succ1);
+}
+
+// Capture the existing control flow as guard predicates, and redirect
+// control flow from every incoming block to the first guard block in
+// the hub.
+//
+// There is one guard predicate for each outgoing block OutBB. The
+// predicate is a PHINode with one input for each InBB which
+// represents whether the hub should transfer control flow to OutBB if
+// it arrived from InBB. These predicates are NOT ORTHOGONAL. The Hub
+// evaluates them in the same order as the Outgoing set-vector, and
+// control branches to the first outgoing block whose predicate
+// evaluates to true.
+static void convertToGuardPredicates(
+ BasicBlock *FirstGuardBlock, BBPredicates &GuardPredicates,
+ SmallVectorImpl<WeakVH> &DeletionCandidates, const BBSetVector &Incoming,
+ const BBSetVector &Outgoing) {
+ auto &Context = Incoming.front()->getContext();
+ auto BoolTrue = ConstantInt::getTrue(Context);
+ auto BoolFalse = ConstantInt::getFalse(Context);
+
+ // The predicate for the last outgoing is trivially true, and so we
+ // process only the first N-1 successors.
+ for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n");
+ auto Phi =
+ PHINode::Create(Type::getInt1Ty(Context), Incoming.size(),
+ StringRef("Guard.") + Out->getName(), FirstGuardBlock);
+ GuardPredicates[Out] = Phi;
+ }
+
+ for (auto In : Incoming) {
+ Value *Condition;
+ BasicBlock *Succ0;
+ BasicBlock *Succ1;
+ std::tie(Condition, Succ0, Succ1) =
+ redirectToHub(In, FirstGuardBlock, Outgoing);
+
+ // Optimization: Consider an incoming block A with both successors
+ // Succ0 and Succ1 in the set of outgoing blocks. The predicates
+ // for Succ0 and Succ1 complement each other. If Succ0 is visited
+ // first in the loop below, control will branch to Succ0 using the
+ // corresponding predicate. But if that branch is not taken, then
+ // control must reach Succ1, which means that the predicate for
+ // Succ1 is always true.
+ bool OneSuccessorDone = false;
+ for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ auto Phi = GuardPredicates[Out];
+ if (Out != Succ0 && Out != Succ1) {
+ Phi->addIncoming(BoolFalse, In);
+ continue;
+ }
+ // Optimization: When only one successor is an outgoing block,
+ // the predicate is always true.
+ if (!Succ0 || !Succ1 || OneSuccessorDone) {
+ Phi->addIncoming(BoolTrue, In);
+ continue;
+ }
+ assert(Succ0 && Succ1);
+ OneSuccessorDone = true;
+ if (Out == Succ0) {
+ Phi->addIncoming(Condition, In);
+ continue;
+ }
+ auto Inverted = invertCondition(Condition);
+ DeletionCandidates.push_back(Condition);
+ Phi->addIncoming(Inverted, In);
+ }
+ }
+}
+
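// Worked example (illustrative, with assumed names): for Incoming = {A, B}
// and Outgoing = {X, Y, Z}, guard PHIs Guard.X and Guard.Y are created (Z is
// the trivially-true last successor).  If A ends in "br i1 %c, %X, %Y", then
// Guard.X receives %c and Guard.Y receives true along the edge from A (the
// complement optimization above), while an unconditional "br %X" in B
// contributes true to Guard.X and false to Guard.Y.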
+// For each outgoing block OutBB, create a guard block in the Hub. The
+// first guard block was already created outside, and available as the
+// first element in the vector of guard blocks.
+//
+// Each guard block terminates in a conditional branch that transfers
+// control to the corresponding outgoing block or the next guard
+// block. The last guard block has two outgoing blocks as successors
+// since the condition for the final outgoing block is trivially
+// true. So we create one less block (including the first guard block)
+// than the number of outgoing blocks.
+static void createGuardBlocks(SmallVectorImpl<BasicBlock *> &GuardBlocks,
+ Function *F, const BBSetVector &Outgoing,
+ BBPredicates &GuardPredicates, StringRef Prefix) {
+ for (int i = 0, e = Outgoing.size() - 2; i != e; ++i) {
+ GuardBlocks.push_back(
+ BasicBlock::Create(F->getContext(), Prefix + ".guard", F));
+ }
+ assert(GuardBlocks.size() == GuardPredicates.size());
+
+ // To help keep the loop simple, temporarily append the last
+ // outgoing block to the list of guard blocks.
+ GuardBlocks.push_back(Outgoing.back());
+
+ for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ assert(GuardPredicates.count(Out));
+ BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out],
+ GuardBlocks[i]);
+ }
+
+ // Remove the last block from the guard list.
+ GuardBlocks.pop_back();
+}
+
+BasicBlock *llvm::CreateControlFlowHub(
+ DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
+ const BBSetVector &Incoming, const BBSetVector &Outgoing,
+ const StringRef Prefix) {
+ auto F = Incoming.front()->getParent();
+ auto FirstGuardBlock =
+ BasicBlock::Create(F->getContext(), Prefix + ".guard", F);
+
+ SmallVector<DominatorTree::UpdateType, 16> Updates;
+ if (DTU) {
+ for (auto In : Incoming) {
Updates.push_back({DominatorTree::Insert, In, FirstGuardBlock});
- for (auto Succ : successors(In)) {
- if (Outgoing.count(Succ))
- Updates.push_back({DominatorTree::Delete, In, Succ});
- }
- }
- }
-
- BBPredicates GuardPredicates;
- SmallVector<WeakVH, 8> DeletionCandidates;
- convertToGuardPredicates(FirstGuardBlock, GuardPredicates, DeletionCandidates,
- Incoming, Outgoing);
-
- GuardBlocks.push_back(FirstGuardBlock);
- createGuardBlocks(GuardBlocks, F, Outgoing, GuardPredicates, Prefix);
-
- // Update the PHINodes in each outgoing block to match the new control flow.
- for (int i = 0, e = GuardBlocks.size(); i != e; ++i) {
- reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock);
- }
- reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock);
-
- if (DTU) {
- int NumGuards = GuardBlocks.size();
- assert((int)Outgoing.size() == NumGuards + 1);
- for (int i = 0; i != NumGuards - 1; ++i) {
- Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]});
- Updates.push_back(
- {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]});
- }
- Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
- Outgoing[NumGuards - 1]});
- Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
- Outgoing[NumGuards]});
- DTU->applyUpdates(Updates);
- }
-
- for (auto I : DeletionCandidates) {
- if (I->use_empty())
- if (auto Inst = dyn_cast_or_null<Instruction>(I))
- Inst->eraseFromParent();
- }
-
- return FirstGuardBlock;
-}
+ for (auto Succ : successors(In)) {
+ if (Outgoing.count(Succ))
+ Updates.push_back({DominatorTree::Delete, In, Succ});
+ }
+ }
+ }
+
+ BBPredicates GuardPredicates;
+ SmallVector<WeakVH, 8> DeletionCandidates;
+ convertToGuardPredicates(FirstGuardBlock, GuardPredicates, DeletionCandidates,
+ Incoming, Outgoing);
+
+ GuardBlocks.push_back(FirstGuardBlock);
+ createGuardBlocks(GuardBlocks, F, Outgoing, GuardPredicates, Prefix);
+
+ // Update the PHINodes in each outgoing block to match the new control flow.
+ for (int i = 0, e = GuardBlocks.size(); i != e; ++i) {
+ reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock);
+ }
+ reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock);
+
+ if (DTU) {
+ int NumGuards = GuardBlocks.size();
+ assert((int)Outgoing.size() == NumGuards + 1);
+ for (int i = 0; i != NumGuards - 1; ++i) {
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]});
+ Updates.push_back(
+ {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]});
+ }
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
+ Outgoing[NumGuards - 1]});
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
+ Outgoing[NumGuards]});
+ DTU->applyUpdates(Updates);
+ }
+
+ for (auto I : DeletionCandidates) {
+ if (I->use_empty())
+ if (auto Inst = dyn_cast_or_null<Instruction>(I))
+ Inst->eraseFromParent();
+ }
+
+ return FirstGuardBlock;
+}
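// Illustrative usage sketch (an assumption, not part of the original file):
// route two exiting blocks through a single hub, the way a structurizer-style
// client might.  "Exiting1"/"Exiting2" and "ExitA"/"ExitB" are hypothetical
// caller-provided blocks.
static BasicBlock *buildHubExample(DomTreeUpdater *DTU, BasicBlock *Exiting1,
                                   BasicBlock *Exiting2, BasicBlock *ExitA,
                                   BasicBlock *ExitB) {
  BBSetVector Incoming, Outgoing;
  Incoming.insert(Exiting1);
  Incoming.insert(Exiting2);
  Outgoing.insert(ExitA);
  Outgoing.insert(ExitB);
  SmallVector<BasicBlock *, 8> GuardBlocks;
  // Every edge from an incoming to an outgoing block now passes through the
  // returned block, which dispatches onward via the guard predicates.
  return CreateControlFlowHub(DTU, GuardBlocks, Incoming, Outgoing, "hub");
}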
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp
index bb1438e94b..939a1a3a86 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -1,203 +1,203 @@
-//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
-// inserting a dummy basic block. This pass may be "required" by passes that
-// cannot deal with critical edges. For this usage, the structure type is
-// forward declared. This pass obviously invalidates the CFG, but can update
-// dominator trees.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "break-crit-edges"
-
-STATISTIC(NumBroken, "Number of blocks inserted");
-
-namespace {
- struct BreakCriticalEdges : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BreakCriticalEdges() : FunctionPass(ID) {
- initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
-
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
- auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- unsigned N =
- SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
- NumBroken += N;
- return N > 0;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
-
- // No loop canonicalization guarantees are broken by this pass.
- AU.addPreservedID(LoopSimplifyID);
- }
- };
-}
-
-char BreakCriticalEdges::ID = 0;
-INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
- "Break critical edges in CFG", false, false)
-
-// Publicly exposed interface to pass...
-char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
-FunctionPass *llvm::createBreakCriticalEdgesPass() {
- return new BreakCriticalEdges();
-}
-
-PreservedAnalyses BreakCriticalEdgesPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- auto *LI = AM.getCachedResult<LoopAnalysis>(F);
- unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
- NumBroken += N;
- if (N == 0)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- return PA;
-}
-
-//===----------------------------------------------------------------------===//
-// Implementation of the external critical edge manipulation functions
-//===----------------------------------------------------------------------===//
-
-/// When a loop exit edge is split, LCSSA form may require new PHIs in the new
-/// exit block. This function inserts the new PHIs, as needed. Preds is a list
-/// of preds inside the loop, SplitBB is the new loop exit block, and DestBB is
-/// the old loop exit, now the successor of SplitBB.
-static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
- BasicBlock *SplitBB,
- BasicBlock *DestBB) {
- // SplitBB shouldn't have anything non-trivial in it yet.
- assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() ||
- SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");
-
- // For each PHI in the destination block.
- for (PHINode &PN : DestBB->phis()) {
- unsigned Idx = PN.getBasicBlockIndex(SplitBB);
- Value *V = PN.getIncomingValue(Idx);
-
- // If the input is a PHI which already satisfies LCSSA, don't create
- // a new one.
- if (const PHINode *VP = dyn_cast<PHINode>(V))
- if (VP->getParent() == SplitBB)
- continue;
-
- // Otherwise a new PHI is needed. Create one and populate it.
- PHINode *NewPN = PHINode::Create(
- PN.getType(), Preds.size(), "split",
- SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());
- for (unsigned i = 0, e = Preds.size(); i != e; ++i)
- NewPN->addIncoming(V, Preds[i]);
-
- // Update the original PHI.
- PN.setIncomingValue(Idx, NewPN);
- }
-}
-
+//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
+// inserting a dummy basic block. This pass may be "required" by passes that
+// cannot deal with critical edges. For this usage, the structure type is
+// forward declared. This pass obviously invalidates the CFG, but can update
+// dominator trees.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "break-crit-edges"
+
+STATISTIC(NumBroken, "Number of blocks inserted");
+
+namespace {
+ struct BreakCriticalEdges : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BreakCriticalEdges() : FunctionPass(ID) {
+ initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ unsigned N =
+ SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
+ NumBroken += N;
+ return N > 0;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // No loop canonicalization guarantees are broken by this pass.
+ AU.addPreservedID(LoopSimplifyID);
+ }
+ };
+}
+
+char BreakCriticalEdges::ID = 0;
+INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
+ "Break critical edges in CFG", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
+FunctionPass *llvm::createBreakCriticalEdgesPass() {
+ return new BreakCriticalEdges();
+}
+
+PreservedAnalyses BreakCriticalEdgesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
+ NumBroken += N;
+ if (N == 0)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
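// Illustrative usage sketch (an assumption, not part of the original file):
// running the pass in a new-pass-manager function pipeline.  The caller is
// assumed to have registered the function analyses in the usual way (for
// example via PassBuilder::registerFunctionAnalyses).
static PreservedAnalyses runBreakCritEdgesExample(Function &F,
                                                  FunctionAnalysisManager &FAM) {
  FunctionPassManager FPM;
  FPM.addPass(BreakCriticalEdgesPass());
  return FPM.run(F, FAM);
}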
+//===----------------------------------------------------------------------===//
+// Implementation of the external critical edge manipulation functions
+//===----------------------------------------------------------------------===//
+
+/// When a loop exit edge is split, LCSSA form may require new PHIs in the new
+/// exit block. This function inserts the new PHIs, as needed. Preds is a list
+/// of preds inside the loop, SplitBB is the new loop exit block, and DestBB is
+/// the old loop exit, now the successor of SplitBB.
+static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
+ BasicBlock *SplitBB,
+ BasicBlock *DestBB) {
+ // SplitBB shouldn't have anything non-trivial in it yet.
+ assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() ||
+ SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");
+
+ // For each PHI in the destination block.
+ for (PHINode &PN : DestBB->phis()) {
+ unsigned Idx = PN.getBasicBlockIndex(SplitBB);
+ Value *V = PN.getIncomingValue(Idx);
+
+ // If the input is a PHI which already satisfies LCSSA, don't create
+ // a new one.
+ if (const PHINode *VP = dyn_cast<PHINode>(V))
+ if (VP->getParent() == SplitBB)
+ continue;
+
+ // Otherwise a new PHI is needed. Create one and populate it.
+ PHINode *NewPN = PHINode::Create(
+ PN.getType(), Preds.size(), "split",
+ SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i)
+ NewPN->addIncoming(V, Preds[i]);
+
+ // Update the original PHI.
+ PN.setIncomingValue(Idx, NewPN);
+ }
+}
+
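// Illustrative usage sketch (an assumption, not part of the original file):
// a loop pass calling the SplitCriticalEdge implementation below while asking
// it to keep the dominator tree, LoopInfo and LCSSA form up to date.
static BasicBlock *splitEdgeExample(Instruction *TI, unsigned SuccNum,
                                    DominatorTree *DT, LoopInfo *LI) {
  CriticalEdgeSplittingOptions Options(DT, LI);
  Options.setPreserveLCSSA();
  // Returns the inserted block, or nullptr if the edge is not critical or
  // cannot be split here (e.g. the destination is an EH pad).
  return SplitCriticalEdge(TI, SuccNum, Options);
}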
BasicBlock *llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
const CriticalEdgeSplittingOptions &Options,
const Twine &BBName) {
- if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
- return nullptr;
-
- assert(!isa<IndirectBrInst>(TI) &&
- "Cannot split critical edge from IndirectBrInst");
-
- BasicBlock *TIBB = TI->getParent();
- BasicBlock *DestBB = TI->getSuccessor(SuccNum);
-
- // Splitting the critical edge to a pad block is non-trivial. Don't do
- // it in this generic function.
- if (DestBB->isEHPad()) return nullptr;
-
- if (Options.IgnoreUnreachableDests &&
- isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
- return nullptr;
-
- auto *LI = Options.LI;
- SmallVector<BasicBlock *, 4> LoopPreds;
- // Check if extra modifications will be required to preserve loop-simplify
- // form after splitting. If it would require splitting blocks with IndirectBr
+ if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
+ return nullptr;
+
+ assert(!isa<IndirectBrInst>(TI) &&
+ "Cannot split critical edge from IndirectBrInst");
+
+ BasicBlock *TIBB = TI->getParent();
+ BasicBlock *DestBB = TI->getSuccessor(SuccNum);
+
+ // Splitting the critical edge to a pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (DestBB->isEHPad()) return nullptr;
+
+ if (Options.IgnoreUnreachableDests &&
+ isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
+ return nullptr;
+
+ auto *LI = Options.LI;
+ SmallVector<BasicBlock *, 4> LoopPreds;
+ // Check if extra modifications will be required to preserve loop-simplify
+ // form after splitting. If it would require splitting blocks with IndirectBr
// or CallBr terminators, bail out if preserving loop-simplify form is
// requested.
- if (LI) {
- if (Loop *TIL = LI->getLoopFor(TIBB)) {
-
+ if (LI) {
+ if (Loop *TIL = LI->getLoopFor(TIBB)) {
+
// The only way that we can break LoopSimplify form by splitting a
// critical edge is if after the split there exists some edge from TIL to
// DestBB *and* the only edge into DestBB from outside of TIL is that of
- // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB
- // is the new exit block and it has no non-loop predecessors. If the
- // second isn't true, then DestBB was not in LoopSimplify form prior to
- // the split as it had a non-loop predecessor. In both of these cases,
- // the predecessor must be directly in TIL, not in a subloop, or again
- // LoopSimplify doesn't hold.
+ // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB
+ // is the new exit block and it has no non-loop predecessors. If the
+ // second isn't true, then DestBB was not in LoopSimplify form prior to
+ // the split as it had a non-loop predecessor. In both of these cases,
+ // the predecessor must be directly in TIL, not in a subloop, or again
+ // LoopSimplify doesn't hold.
for (BasicBlock *P : predecessors(DestBB)) {
- if (P == TIBB)
- continue; // The new block is known.
- if (LI->getLoopFor(P) != TIL) {
- // No need to re-simplify, it wasn't to start with.
- LoopPreds.clear();
- break;
- }
- LoopPreds.push_back(P);
- }
- // Loop-simplify form can be preserved, if we can split all in-loop
- // predecessors.
- if (any_of(LoopPreds, [](BasicBlock *Pred) {
+ if (P == TIBB)
+ continue; // The new block is known.
+ if (LI->getLoopFor(P) != TIL) {
+ // No need to re-simplify, it wasn't to start with.
+ LoopPreds.clear();
+ break;
+ }
+ LoopPreds.push_back(P);
+ }
+ // Loop-simplify form can be preserved, if we can split all in-loop
+ // predecessors.
+ if (any_of(LoopPreds, [](BasicBlock *Pred) {
const Instruction *T = Pred->getTerminator();
if (const auto *CBR = dyn_cast<CallBrInst>(T))
return CBR->getDefaultDest() != Pred;
return isa<IndirectBrInst>(T);
- })) {
- if (Options.PreserveLoopSimplify)
- return nullptr;
- LoopPreds.clear();
- }
- }
- }
-
- // Create a new basic block, linking it into the CFG.
+ })) {
+ if (Options.PreserveLoopSimplify)
+ return nullptr;
+ LoopPreds.clear();
+ }
+ }
+ }
+
+ // Create a new basic block, linking it into the CFG.
BasicBlock *NewBB = nullptr;
if (BBName.str() != "")
NewBB = BasicBlock::Create(TI->getContext(), BBName);
@@ -205,297 +205,297 @@ BasicBlock *llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
NewBB = BasicBlock::Create(TI->getContext(), TIBB->getName() + "." +
DestBB->getName() +
"_crit_edge");
- // Create our unconditional branch.
- BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
- NewBI->setDebugLoc(TI->getDebugLoc());
-
- // Insert the block into the function... right after the block TI lives in.
- Function &F = *TIBB->getParent();
- Function::iterator FBBI = TIBB->getIterator();
- F.getBasicBlockList().insert(++FBBI, NewBB);
-
- // Branch to the new block, breaking the edge.
- TI->setSuccessor(SuccNum, NewBB);
-
- // If there are any PHI nodes in DestBB, we need to update them so that they
- // merge incoming values from NewBB instead of from TIBB.
- {
- unsigned BBIdx = 0;
- for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
- // We no longer enter through TIBB, now we come in through NewBB.
- // Revector exactly one entry in the PHI node that used to come from
- // TIBB to come from NewBB.
- PHINode *PN = cast<PHINode>(I);
-
- // Reuse the previous value of BBIdx if it lines up. In cases where we
- // have multiple phi nodes with *lots* of predecessors, this is a speed
- // win because we don't have to scan the PHI looking for TIBB. This
- // happens because the BB list of PHI nodes are usually in the same
- // order.
- if (PN->getIncomingBlock(BBIdx) != TIBB)
- BBIdx = PN->getBasicBlockIndex(TIBB);
- PN->setIncomingBlock(BBIdx, NewBB);
- }
- }
-
- // If there are any other edges from TIBB to DestBB, update those to go
- // through the split block, making those edges non-critical as well (and
- // reducing the number of phi entries in the DestBB if relevant).
- if (Options.MergeIdenticalEdges) {
- for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
- if (TI->getSuccessor(i) != DestBB) continue;
-
- // Remove an entry for TIBB from DestBB phi nodes.
- DestBB->removePredecessor(TIBB, Options.KeepOneInputPHIs);
-
- // We found another edge to DestBB, go to NewBB instead.
- TI->setSuccessor(i, NewBB);
- }
- }
-
- // If we have nothing to update, just return.
- auto *DT = Options.DT;
- auto *PDT = Options.PDT;
- auto *MSSAU = Options.MSSAU;
- if (MSSAU)
- MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
- DestBB, NewBB, {TIBB}, Options.MergeIdenticalEdges);
-
- if (!DT && !PDT && !LI)
- return NewBB;
-
- if (DT || PDT) {
- // Update the DominatorTree.
- // ---> NewBB -----\
- // / V
- // TIBB -------\\------> DestBB
- //
- // First, inform the DT about the new path from TIBB to DestBB via NewBB,
- // then delete the old edge from TIBB to DestBB. By doing this in that order
- // DestBB stays reachable in the DT the whole time and its subtree doesn't
- // get disconnected.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, TIBB, NewBB});
- Updates.push_back({DominatorTree::Insert, NewBB, DestBB});
+ // Create our unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
+ NewBI->setDebugLoc(TI->getDebugLoc());
+
+ // Insert the block into the function... right after the block TI lives in.
+ Function &F = *TIBB->getParent();
+ Function::iterator FBBI = TIBB->getIterator();
+ F.getBasicBlockList().insert(++FBBI, NewBB);
+
+ // Branch to the new block, breaking the edge.
+ TI->setSuccessor(SuccNum, NewBB);
+
+ // If there are any PHI nodes in DestBB, we need to update them so that they
+ // merge incoming values from NewBB instead of from TIBB.
+ {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ // We no longer enter through TIBB, now we come in through NewBB.
+ // Revector exactly one entry in the PHI node that used to come from
+ // TIBB to come from NewBB.
+ PHINode *PN = cast<PHINode>(I);
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+ // happens because the BB list of PHI nodes are usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != TIBB)
+ BBIdx = PN->getBasicBlockIndex(TIBB);
+ PN->setIncomingBlock(BBIdx, NewBB);
+ }
+ }
+
+ // If there are any other edges from TIBB to DestBB, update those to go
+ // through the split block, making those edges non-critical as well (and
+ // reducing the number of phi entries in the DestBB if relevant).
+ if (Options.MergeIdenticalEdges) {
+ for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
+ if (TI->getSuccessor(i) != DestBB) continue;
+
+ // Remove an entry for TIBB from DestBB phi nodes.
+ DestBB->removePredecessor(TIBB, Options.KeepOneInputPHIs);
+
+ // We found another edge to DestBB, go to NewBB instead.
+ TI->setSuccessor(i, NewBB);
+ }
+ }
+
+ // If we have nothing to update, just return.
+ auto *DT = Options.DT;
+ auto *PDT = Options.PDT;
+ auto *MSSAU = Options.MSSAU;
+ if (MSSAU)
+ MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
+ DestBB, NewBB, {TIBB}, Options.MergeIdenticalEdges);
+
+ if (!DT && !PDT && !LI)
+ return NewBB;
+
+ if (DT || PDT) {
+ // Update the DominatorTree.
+ // ---> NewBB -----\
+ // / V
+ // TIBB -------\\------> DestBB
+ //
+ // First, inform the DT about the new path from TIBB to DestBB via NewBB,
+ // then delete the old edge from TIBB to DestBB. By doing this in that order
+ // DestBB stays reachable in the DT the whole time and its subtree doesn't
+ // get disconnected.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, TIBB, NewBB});
+ Updates.push_back({DominatorTree::Insert, NewBB, DestBB});
if (!llvm::is_contained(successors(TIBB), DestBB))
- Updates.push_back({DominatorTree::Delete, TIBB, DestBB});
-
- if (DT)
- DT->applyUpdates(Updates);
- if (PDT)
- PDT->applyUpdates(Updates);
- }
-
- // Update LoopInfo if it is around.
- if (LI) {
- if (Loop *TIL = LI->getLoopFor(TIBB)) {
- // If one or the other blocks were not in a loop, the new block is not
- // either, and thus LI doesn't need to be updated.
- if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
- if (TIL == DestLoop) {
- // Both in the same loop, the NewBB joins loop.
- DestLoop->addBasicBlockToLoop(NewBB, *LI);
- } else if (TIL->contains(DestLoop)) {
- // Edge from an outer loop to an inner loop. Add to the outer loop.
- TIL->addBasicBlockToLoop(NewBB, *LI);
- } else if (DestLoop->contains(TIL)) {
- // Edge from an inner loop to an outer loop. Add to the outer loop.
- DestLoop->addBasicBlockToLoop(NewBB, *LI);
- } else {
- // Edge from two loops with no containment relation. Because these
- // are natural loops, we know that the destination block must be the
- // header of its loop (adding a branch into a loop elsewhere would
- // create an irreducible loop).
- assert(DestLoop->getHeader() == DestBB &&
- "Should not create irreducible loops!");
- if (Loop *P = DestLoop->getParentLoop())
- P->addBasicBlockToLoop(NewBB, *LI);
- }
- }
-
- // If TIBB is in a loop and DestBB is outside of that loop, we may need
- // to update LoopSimplify form and LCSSA form.
- if (!TIL->contains(DestBB)) {
- assert(!TIL->contains(NewBB) &&
- "Split point for loop exit is contained in loop!");
-
- // Update LCSSA form in the newly created exit block.
- if (Options.PreserveLCSSA) {
- createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
- }
-
- if (!LoopPreds.empty()) {
- assert(!DestBB->isEHPad() && "We don't split edges to EH pads!");
- BasicBlock *NewExitBB = SplitBlockPredecessors(
- DestBB, LoopPreds, "split", DT, LI, MSSAU, Options.PreserveLCSSA);
- if (Options.PreserveLCSSA)
- createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB);
- }
- }
- }
- }
-
- return NewBB;
-}
-
-// Return the unique indirectbr predecessor of a block. This may return null
-// even if such a predecessor exists, if it's not useful for splitting.
-// If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
-// predecessors of BB.
-static BasicBlock *
-findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
- // If the block doesn't have any PHIs, we don't care about it, since there's
- // no point in splitting it.
- PHINode *PN = dyn_cast<PHINode>(BB->begin());
- if (!PN)
- return nullptr;
-
- // Verify we have exactly one IBR predecessor.
- // Conservatively bail out if one of the other predecessors is not a "regular"
- // terminator (that is, not a switch or a br).
- BasicBlock *IBB = nullptr;
- for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
- BasicBlock *PredBB = PN->getIncomingBlock(Pred);
- Instruction *PredTerm = PredBB->getTerminator();
- switch (PredTerm->getOpcode()) {
- case Instruction::IndirectBr:
- if (IBB)
- return nullptr;
- IBB = PredBB;
- break;
- case Instruction::Br:
- case Instruction::Switch:
- OtherPreds.push_back(PredBB);
- continue;
- default:
- return nullptr;
- }
- }
-
- return IBB;
-}
-
-bool llvm::SplitIndirectBrCriticalEdges(Function &F,
- BranchProbabilityInfo *BPI,
- BlockFrequencyInfo *BFI) {
- // Check whether the function has any indirectbrs, and collect which blocks
- // they may jump to. Since most functions don't have indirect branches,
- // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
- SmallSetVector<BasicBlock *, 16> Targets;
- for (auto &BB : F) {
- auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
- if (!IBI)
- continue;
-
- for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
- Targets.insert(IBI->getSuccessor(Succ));
- }
-
- if (Targets.empty())
- return false;
-
- bool ShouldUpdateAnalysis = BPI && BFI;
- bool Changed = false;
- for (BasicBlock *Target : Targets) {
- SmallVector<BasicBlock *, 16> OtherPreds;
- BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
-    // If we did not find an indirectbr, or the indirectbr is the only
- // incoming edge, this isn't the kind of edge we're looking for.
- if (!IBRPred || OtherPreds.empty())
- continue;
-
- // Don't even think about ehpads/landingpads.
- Instruction *FirstNonPHI = Target->getFirstNonPHI();
- if (FirstNonPHI->isEHPad() || Target->isLandingPad())
- continue;
-
- // Remember edge probabilities if needed.
- SmallVector<BranchProbability, 4> EdgeProbabilities;
- if (ShouldUpdateAnalysis) {
- EdgeProbabilities.reserve(Target->getTerminator()->getNumSuccessors());
- for (unsigned I = 0, E = Target->getTerminator()->getNumSuccessors();
- I < E; ++I)
- EdgeProbabilities.emplace_back(BPI->getEdgeProbability(Target, I));
- BPI->eraseBlock(Target);
- }
-
- BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
- if (ShouldUpdateAnalysis) {
- // Copy the BFI/BPI from Target to BodyBlock.
- BPI->setEdgeProbability(BodyBlock, EdgeProbabilities);
- BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target).getFrequency());
- }
- // It's possible Target was its own successor through an indirectbr.
- // In this case, the indirectbr now comes from BodyBlock.
- if (IBRPred == Target)
- IBRPred = BodyBlock;
-
- // At this point Target only has PHIs, and BodyBlock has the rest of the
- // block's body. Create a copy of Target that will be used by the "direct"
- // preds.
- ValueToValueMapTy VMap;
- BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
-
- BlockFrequency BlockFreqForDirectSucc;
- for (BasicBlock *Pred : OtherPreds) {
- // If the target is a loop to itself, then the terminator of the split
- // block (BodyBlock) needs to be updated.
- BasicBlock *Src = Pred != Target ? Pred : BodyBlock;
- Src->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
- if (ShouldUpdateAnalysis)
- BlockFreqForDirectSucc += BFI->getBlockFreq(Src) *
- BPI->getEdgeProbability(Src, DirectSucc);
- }
- if (ShouldUpdateAnalysis) {
- BFI->setBlockFreq(DirectSucc, BlockFreqForDirectSucc.getFrequency());
- BlockFrequency NewBlockFreqForTarget =
- BFI->getBlockFreq(Target) - BlockFreqForDirectSucc;
- BFI->setBlockFreq(Target, NewBlockFreqForTarget.getFrequency());
- }
-
- // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
-    // they are clones, so the number of PHIs is the same.
- // (a) Remove the edge coming from IBRPred from the "Direct" PHI
- // (b) Leave that as the only edge in the "Indirect" PHI.
- // (c) Merge the two in the body block.
- BasicBlock::iterator Indirect = Target->begin(),
- End = Target->getFirstNonPHI()->getIterator();
- BasicBlock::iterator Direct = DirectSucc->begin();
- BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
-
- assert(&*End == Target->getTerminator() &&
- "Block was expected to only contain PHIs");
-
- while (Indirect != End) {
- PHINode *DirPHI = cast<PHINode>(Direct);
- PHINode *IndPHI = cast<PHINode>(Indirect);
-
- // Now, clean up - the direct block shouldn't get the indirect value,
- // and vice versa.
- DirPHI->removeIncomingValue(IBRPred);
- Direct++;
-
- // Advance the pointer here, to avoid invalidation issues when the old
- // PHI is erased.
- Indirect++;
-
- PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
- NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
- IBRPred);
-
- // Create a PHI in the body block, to merge the direct and indirect
- // predecessors.
- PHINode *MergePHI =
- PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
- MergePHI->addIncoming(NewIndPHI, Target);
- MergePHI->addIncoming(DirPHI, DirectSucc);
-
- IndPHI->replaceAllUsesWith(MergePHI);
- IndPHI->eraseFromParent();
- }
-
- Changed = true;
- }
-
- return Changed;
-}
+ Updates.push_back({DominatorTree::Delete, TIBB, DestBB});
+
+ if (DT)
+ DT->applyUpdates(Updates);
+ if (PDT)
+ PDT->applyUpdates(Updates);
+ }
+
+ // Update LoopInfo if it is around.
+ if (LI) {
+ if (Loop *TIL = LI->getLoopFor(TIBB)) {
+ // If one or the other blocks were not in a loop, the new block is not
+ // either, and thus LI doesn't need to be updated.
+ if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
+ if (TIL == DestLoop) {
+          // Both are in the same loop; NewBB joins that loop.
+ DestLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else if (TIL->contains(DestLoop)) {
+ // Edge from an outer loop to an inner loop. Add to the outer loop.
+ TIL->addBasicBlockToLoop(NewBB, *LI);
+ } else if (DestLoop->contains(TIL)) {
+ // Edge from an inner loop to an outer loop. Add to the outer loop.
+ DestLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else {
+ // Edge from two loops with no containment relation. Because these
+ // are natural loops, we know that the destination block must be the
+ // header of its loop (adding a branch into a loop elsewhere would
+ // create an irreducible loop).
+ assert(DestLoop->getHeader() == DestBB &&
+ "Should not create irreducible loops!");
+ if (Loop *P = DestLoop->getParentLoop())
+ P->addBasicBlockToLoop(NewBB, *LI);
+ }
+ }
+
+ // If TIBB is in a loop and DestBB is outside of that loop, we may need
+ // to update LoopSimplify form and LCSSA form.
+ if (!TIL->contains(DestBB)) {
+ assert(!TIL->contains(NewBB) &&
+ "Split point for loop exit is contained in loop!");
+
+ // Update LCSSA form in the newly created exit block.
+ if (Options.PreserveLCSSA) {
+ createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
+ }
+
+ if (!LoopPreds.empty()) {
+ assert(!DestBB->isEHPad() && "We don't split edges to EH pads!");
+ BasicBlock *NewExitBB = SplitBlockPredecessors(
+ DestBB, LoopPreds, "split", DT, LI, MSSAU, Options.PreserveLCSSA);
+ if (Options.PreserveLCSSA)
+ createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB);
+ }
+ }
+ }
+ }
+
+ return NewBB;
+}
+
+// Return the unique indirectbr predecessor of a block. This may return null
+// even if such a predecessor exists, if it's not useful for splitting.
+// If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
+// predecessors of BB.
+static BasicBlock *
+findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
+ // If the block doesn't have any PHIs, we don't care about it, since there's
+ // no point in splitting it.
+ PHINode *PN = dyn_cast<PHINode>(BB->begin());
+ if (!PN)
+ return nullptr;
+
+ // Verify we have exactly one IBR predecessor.
+ // Conservatively bail out if one of the other predecessors is not a "regular"
+ // terminator (that is, not a switch or a br).
+ BasicBlock *IBB = nullptr;
+ for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
+ BasicBlock *PredBB = PN->getIncomingBlock(Pred);
+ Instruction *PredTerm = PredBB->getTerminator();
+ switch (PredTerm->getOpcode()) {
+ case Instruction::IndirectBr:
+ if (IBB)
+ return nullptr;
+ IBB = PredBB;
+ break;
+ case Instruction::Br:
+ case Instruction::Switch:
+ OtherPreds.push_back(PredBB);
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ return IBB;
+}
+
+bool llvm::SplitIndirectBrCriticalEdges(Function &F,
+ BranchProbabilityInfo *BPI,
+ BlockFrequencyInfo *BFI) {
+ // Check whether the function has any indirectbrs, and collect which blocks
+ // they may jump to. Since most functions don't have indirect branches,
+ // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
+ SmallSetVector<BasicBlock *, 16> Targets;
+ for (auto &BB : F) {
+ auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
+ if (!IBI)
+ continue;
+
+ for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
+ Targets.insert(IBI->getSuccessor(Succ));
+ }
+
+ if (Targets.empty())
+ return false;
+
+ bool ShouldUpdateAnalysis = BPI && BFI;
+ bool Changed = false;
+ for (BasicBlock *Target : Targets) {
+ SmallVector<BasicBlock *, 16> OtherPreds;
+ BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
+    // If we did not find an indirectbr, or the indirectbr is the only
+ // incoming edge, this isn't the kind of edge we're looking for.
+ if (!IBRPred || OtherPreds.empty())
+ continue;
+
+ // Don't even think about ehpads/landingpads.
+ Instruction *FirstNonPHI = Target->getFirstNonPHI();
+ if (FirstNonPHI->isEHPad() || Target->isLandingPad())
+ continue;
+
+ // Remember edge probabilities if needed.
+ SmallVector<BranchProbability, 4> EdgeProbabilities;
+ if (ShouldUpdateAnalysis) {
+ EdgeProbabilities.reserve(Target->getTerminator()->getNumSuccessors());
+ for (unsigned I = 0, E = Target->getTerminator()->getNumSuccessors();
+ I < E; ++I)
+ EdgeProbabilities.emplace_back(BPI->getEdgeProbability(Target, I));
+ BPI->eraseBlock(Target);
+ }
+
+ BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
+ if (ShouldUpdateAnalysis) {
+ // Copy the BFI/BPI from Target to BodyBlock.
+ BPI->setEdgeProbability(BodyBlock, EdgeProbabilities);
+ BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target).getFrequency());
+ }
+ // It's possible Target was its own successor through an indirectbr.
+ // In this case, the indirectbr now comes from BodyBlock.
+ if (IBRPred == Target)
+ IBRPred = BodyBlock;
+
+ // At this point Target only has PHIs, and BodyBlock has the rest of the
+ // block's body. Create a copy of Target that will be used by the "direct"
+ // preds.
+ ValueToValueMapTy VMap;
+ BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
+
+ BlockFrequency BlockFreqForDirectSucc;
+ for (BasicBlock *Pred : OtherPreds) {
+ // If the target is a loop to itself, then the terminator of the split
+ // block (BodyBlock) needs to be updated.
+ BasicBlock *Src = Pred != Target ? Pred : BodyBlock;
+ Src->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
+ if (ShouldUpdateAnalysis)
+ BlockFreqForDirectSucc += BFI->getBlockFreq(Src) *
+ BPI->getEdgeProbability(Src, DirectSucc);
+ }
+ if (ShouldUpdateAnalysis) {
+ BFI->setBlockFreq(DirectSucc, BlockFreqForDirectSucc.getFrequency());
+ BlockFrequency NewBlockFreqForTarget =
+ BFI->getBlockFreq(Target) - BlockFreqForDirectSucc;
+ BFI->setBlockFreq(Target, NewBlockFreqForTarget.getFrequency());
+ }
+
+ // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
+    // they are clones, so the number of PHIs is the same.
+ // (a) Remove the edge coming from IBRPred from the "Direct" PHI
+ // (b) Leave that as the only edge in the "Indirect" PHI.
+ // (c) Merge the two in the body block.
+ BasicBlock::iterator Indirect = Target->begin(),
+ End = Target->getFirstNonPHI()->getIterator();
+ BasicBlock::iterator Direct = DirectSucc->begin();
+ BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
+
+ assert(&*End == Target->getTerminator() &&
+ "Block was expected to only contain PHIs");
+
+ while (Indirect != End) {
+ PHINode *DirPHI = cast<PHINode>(Direct);
+ PHINode *IndPHI = cast<PHINode>(Indirect);
+
+ // Now, clean up - the direct block shouldn't get the indirect value,
+ // and vice versa.
+ DirPHI->removeIncomingValue(IBRPred);
+ Direct++;
+
+ // Advance the pointer here, to avoid invalidation issues when the old
+ // PHI is erased.
+ Indirect++;
+
+ PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
+ NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
+ IBRPred);
+
+ // Create a PHI in the body block, to merge the direct and indirect
+ // predecessors.
+ PHINode *MergePHI =
+ PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
+ MergePHI->addIncoming(NewIndPHI, Target);
+ MergePHI->addIncoming(DirPHI, DirectSucc);
+
+ IndPHI->replaceAllUsesWith(MergePHI);
+ IndPHI->eraseFromParent();
+ }
+
+ Changed = true;
+ }
+
+ return Changed;
+}
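
As a usage sketch (hypothetical, not code from this tree): SplitIndirectBrCriticalEdges above is normally driven from a pass that already has BranchProbabilityInfo and BlockFrequencyInfo at hand, so the blocks produced by the split keep consistent profile data. The pass class below is invented for illustration; only the SplitIndirectBrCriticalEdges(F, &BPI, &BFI) call itself comes from the code above, via its declaration in llvm/Transforms/Utils/BasicBlockUtils.h.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

namespace {
// Hypothetical driver pass (illustration only): splits every critical edge
// leaving an indirectbr so that later passes can sink or duplicate code into
// the per-predecessor copies created by the utility above.
struct SplitIBRCriticalEdgesSketch : public FunctionPass {
  static char ID;
  SplitIBRCriticalEdgesSketch() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // BPI/BFI are optional arguments; passing them lets the utility keep
    // edge probabilities and block frequencies consistent across the split.
    AU.addRequired<BranchProbabilityInfoWrapperPass>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    auto &BPI = getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
    auto &BFI = getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    return SplitIndirectBrCriticalEdges(F, &BPI, &BFI);
  }
};
} // end anonymous namespace

char SplitIBRCriticalEdgesSketch::ID = 0;
static RegisterPass<SplitIBRCriticalEdgesSketch>
    X("split-ibr-critical-edges-sketch", "Split indirectbr critical edges (sketch)");

Under the new pass manager the same call would be made with analyses obtained from a FunctionAnalysisManager instead of the legacy wrapper passes.
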
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp
index 205ea1b9fd..dba5403f27 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1,61 +1,61 @@
-//===- BuildLibCalls.cpp - Utility builder for libcalls -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements some functions that will create standard C libcalls.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "build-libcalls"
-
-//- Infer Attributes ---------------------------------------------------------//
-
-STATISTIC(NumReadNone, "Number of functions inferred as readnone");
+//===- BuildLibCalls.cpp - Utility builder for libcalls -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some functions that will create standard C libcalls.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "build-libcalls"
+
+//- Infer Attributes ---------------------------------------------------------//
+
+STATISTIC(NumReadNone, "Number of functions inferred as readnone");
STATISTIC(NumInaccessibleMemOnly,
"Number of functions inferred as inaccessiblememonly");
-STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
-STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
+STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
STATISTIC(NumInaccessibleMemOrArgMemOnly,
"Number of functions inferred as inaccessiblemem_or_argmemonly");
-STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
-STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
+STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
+STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
STATISTIC(NumWriteOnlyArg, "Number of arguments inferred as writeonly");
STATISTIC(NumSExtArg, "Number of arguments inferred as signext");
-STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
-STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
+STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
+STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns");
-STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
-STATISTIC(NumReturnedArg, "Number of arguments inferred as returned");
+STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
+STATISTIC(NumReturnedArg, "Number of arguments inferred as returned");
STATISTIC(NumWillReturn, "Number of functions inferred as willreturn");
-
-static bool setDoesNotAccessMemory(Function &F) {
- if (F.doesNotAccessMemory())
- return false;
- F.setDoesNotAccessMemory();
- ++NumReadNone;
- return true;
-}
-
+
+static bool setDoesNotAccessMemory(Function &F) {
+ if (F.doesNotAccessMemory())
+ return false;
+ F.setDoesNotAccessMemory();
+ ++NumReadNone;
+ return true;
+}
+
static bool setOnlyAccessesInaccessibleMemory(Function &F) {
if (F.onlyAccessesInaccessibleMemory())
return false;
@@ -64,22 +64,22 @@ static bool setOnlyAccessesInaccessibleMemory(Function &F) {
return true;
}
-static bool setOnlyReadsMemory(Function &F) {
- if (F.onlyReadsMemory())
- return false;
- F.setOnlyReadsMemory();
- ++NumReadOnly;
- return true;
-}
-
-static bool setOnlyAccessesArgMemory(Function &F) {
- if (F.onlyAccessesArgMemory())
- return false;
- F.setOnlyAccessesArgMemory();
- ++NumArgMemOnly;
- return true;
-}
-
+static bool setOnlyReadsMemory(Function &F) {
+ if (F.onlyReadsMemory())
+ return false;
+ F.setOnlyReadsMemory();
+ ++NumReadOnly;
+ return true;
+}
+
+static bool setOnlyAccessesArgMemory(Function &F) {
+ if (F.onlyAccessesArgMemory())
+ return false;
+ F.setOnlyAccessesArgMemory();
+ ++NumArgMemOnly;
+ return true;
+}
+
static bool setOnlyAccessesInaccessibleMemOrArgMem(Function &F) {
if (F.onlyAccessesInaccessibleMemOrArgMem())
return false;
@@ -88,54 +88,54 @@ static bool setOnlyAccessesInaccessibleMemOrArgMem(Function &F) {
return true;
}
-static bool setDoesNotThrow(Function &F) {
- if (F.doesNotThrow())
- return false;
- F.setDoesNotThrow();
- ++NumNoUnwind;
- return true;
-}
-
-static bool setRetDoesNotAlias(Function &F) {
- if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias))
- return false;
- F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
- ++NumNoAlias;
- return true;
-}
-
-static bool setDoesNotCapture(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::NoCapture))
- return false;
- F.addParamAttr(ArgNo, Attribute::NoCapture);
- ++NumNoCapture;
- return true;
-}
-
-static bool setDoesNotAlias(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::NoAlias))
- return false;
- F.addParamAttr(ArgNo, Attribute::NoAlias);
- ++NumNoAlias;
- return true;
-}
-
-static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly))
- return false;
- F.addParamAttr(ArgNo, Attribute::ReadOnly);
- ++NumReadOnlyArg;
- return true;
-}
-
+static bool setDoesNotThrow(Function &F) {
+ if (F.doesNotThrow())
+ return false;
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ return true;
+}
+
+static bool setRetDoesNotAlias(Function &F) {
+ if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias))
+ return false;
+ F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ ++NumNoAlias;
+ return true;
+}
+
+static bool setDoesNotCapture(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::NoCapture))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::NoCapture);
+ ++NumNoCapture;
+ return true;
+}
+
+static bool setDoesNotAlias(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::NoAlias))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::NoAlias);
+ ++NumNoAlias;
+ return true;
+}
+
+static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::ReadOnly);
+ ++NumReadOnlyArg;
+ return true;
+}
+
static bool setOnlyWritesMemory(Function &F, unsigned ArgNo) {
if (F.hasParamAttribute(ArgNo, Attribute::WriteOnly))
- return false;
+ return false;
F.addParamAttr(ArgNo, Attribute::WriteOnly);
++NumWriteOnlyArg;
- return true;
-}
-
+ return true;
+}
+
static bool setSignExtendedArg(Function &F, unsigned ArgNo) {
if (F.hasParamAttribute(ArgNo, Attribute::SExt))
return false;
@@ -170,28 +170,28 @@ static bool setRetAndArgsNoUndef(Function &F) {
return setRetNoUndef(F) | setArgsNoUndef(F);
}
-static bool setReturnedArg(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::Returned))
- return false;
- F.addParamAttr(ArgNo, Attribute::Returned);
- ++NumReturnedArg;
- return true;
-}
-
-static bool setNonLazyBind(Function &F) {
- if (F.hasFnAttribute(Attribute::NonLazyBind))
- return false;
- F.addFnAttr(Attribute::NonLazyBind);
- return true;
-}
-
-static bool setDoesNotFreeMemory(Function &F) {
- if (F.hasFnAttribute(Attribute::NoFree))
- return false;
- F.addFnAttr(Attribute::NoFree);
- return true;
-}
-
+static bool setReturnedArg(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::Returned))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::Returned);
+ ++NumReturnedArg;
+ return true;
+}
+
+static bool setNonLazyBind(Function &F) {
+ if (F.hasFnAttribute(Attribute::NonLazyBind))
+ return false;
+ F.addFnAttr(Attribute::NonLazyBind);
+ return true;
+}
+
+static bool setDoesNotFreeMemory(Function &F) {
+ if (F.hasFnAttribute(Attribute::NoFree))
+ return false;
+ F.addFnAttr(Attribute::NoFree);
+ return true;
+}
+
static bool setWillReturn(Function &F) {
if (F.hasFnAttribute(Attribute::WillReturn))
return false;
@@ -200,84 +200,84 @@ static bool setWillReturn(Function &F) {
return true;
}
-bool llvm::inferLibFuncAttributes(Module *M, StringRef Name,
- const TargetLibraryInfo &TLI) {
- Function *F = M->getFunction(Name);
- if (!F)
- return false;
- return inferLibFuncAttributes(*F, TLI);
-}
-
-bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
- LibFunc TheLibFunc;
- if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
- return false;
-
- bool Changed = false;
-
- if(!isLibFreeFunction(&F, TheLibFunc) && !isReallocLikeFn(&F, &TLI))
- Changed |= setDoesNotFreeMemory(F);
-
- if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT())
- Changed |= setNonLazyBind(F);
-
- switch (TheLibFunc) {
- case LibFunc_strlen:
- case LibFunc_wcslen:
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyAccessesArgMemory(F);
+bool llvm::inferLibFuncAttributes(Module *M, StringRef Name,
+ const TargetLibraryInfo &TLI) {
+ Function *F = M->getFunction(Name);
+ if (!F)
+ return false;
+ return inferLibFuncAttributes(*F, TLI);
+}
+
+bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
+ LibFunc TheLibFunc;
+ if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
+ return false;
+
+ bool Changed = false;
+
+ if(!isLibFreeFunction(&F, TheLibFunc) && !isReallocLikeFn(&F, &TLI))
+ Changed |= setDoesNotFreeMemory(F);
+
+ if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT())
+ Changed |= setNonLazyBind(F);
+
+ switch (TheLibFunc) {
+ case LibFunc_strlen:
+ case LibFunc_wcslen:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_strchr:
- case LibFunc_strrchr:
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_strchr:
+ case LibFunc_strrchr:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_strtol:
- case LibFunc_strtod:
- case LibFunc_strtof:
- case LibFunc_strtoul:
- case LibFunc_strtoll:
- case LibFunc_strtold:
- case LibFunc_strtoull:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
Changed |= setWillReturn(F);
- Changed |= setReturnedArg(F, 0);
- LLVM_FALLTHROUGH;
- case LibFunc_stpcpy:
- case LibFunc_stpncpy:
+ Changed |= setReturnedArg(F, 0);
+ LLVM_FALLTHROUGH;
+ case LibFunc_stpcpy:
+ case LibFunc_stpncpy:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotAlias(F, 1);
- return Changed;
- case LibFunc_strxfrm:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_strxfrm:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_strcmp: // 0,1
- case LibFunc_strspn: // 0,1
- case LibFunc_strncmp: // 0,1
- case LibFunc_strcspn: // 0,1
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_strcmp: // 0,1
+ case LibFunc_strspn: // 0,1
+ case LibFunc_strncmp: // 0,1
+ case LibFunc_strcspn: // 0,1
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
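
As a usage sketch (hypothetical, not from this tree): the inferLibFuncAttributes(Function &, const TargetLibraryInfo &) entry point restored in this hunk is intended to be run over library-function prototypes; the helper below is an invented illustration of that loop, and only the inferLibFuncAttributes call itself comes from the llvm/Transforms/Utils/BuildLibCalls.h interface shown above.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"

using namespace llvm;

// Hypothetical helper (illustration only): ask inferLibFuncAttributes to
// annotate every declaration in M. It is safe to call on arbitrary
// declarations because the entry point itself bails out unless TLI
// recognizes the function as an available library routine. Functions with
// bodies are left to the regular attribute-inference passes.
static bool annotateKnownLibCalls(Module &M, const TargetLibraryInfo &TLI) {
  bool Changed = false;
  for (Function &F : M)
    if (F.isDeclaration())
      Changed |= inferLibFuncAttributes(F, TLI);
  return Changed;
}
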
@@ -286,325 +286,325 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
return Changed;
case LibFunc_strcoll:
- case LibFunc_strcasecmp: // 0,1
- case LibFunc_strncasecmp: //
+ case LibFunc_strcasecmp: // 0,1
+ case LibFunc_strncasecmp: //
// Those functions may depend on the locale, which may be accessed through
// global memory.
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_strstr:
- case LibFunc_strpbrk:
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_strstr:
+ case LibFunc_strpbrk:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_strtok:
- case LibFunc_strtok_r:
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_strtok:
+ case LibFunc_strtok_r:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_scanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_setbuf:
- case LibFunc_setvbuf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_strdup:
- case LibFunc_strndup:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_scanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_setbuf:
+ case LibFunc_setvbuf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_strdup:
+ case LibFunc_strndup:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_stat:
- case LibFunc_statvfs:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_sscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_sprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_stat:
+ case LibFunc_statvfs:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_sscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_sprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_snprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_snprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc_setitimer:
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc_setitimer:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_system:
- // May throw; "system" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_malloc:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_system:
+ // May throw; "system" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_malloc:
case LibFunc_vec_malloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_memcmp:
+ return Changed;
+ case LibFunc_memcmp:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_memchr:
- case LibFunc_memrchr:
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_memchr:
+ case LibFunc_memrchr:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
+ Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_modf:
- case LibFunc_modff:
- case LibFunc_modfl:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_modf:
+ case LibFunc_modff:
+ case LibFunc_modfl:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_memcpy:
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_memcpy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setReturnedArg(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotAlias(F, 1);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_memmove:
+ Changed |= setDoesNotAlias(F, 1);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_memmove:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setReturnedArg(F, 0);
+ Changed |= setReturnedArg(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_mempcpy:
- case LibFunc_memccpy:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_mempcpy:
+ case LibFunc_memccpy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotAlias(F, 1);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_memcpy_chk:
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc_memalign:
+ Changed |= setDoesNotAlias(F, 1);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_memcpy_chk:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_memalign:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_mkdir:
+ return Changed;
+ case LibFunc_mkdir:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_mktime:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_mktime:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_realloc:
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_realloc:
case LibFunc_vec_realloc:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
case LibFunc_reallocf:
Changed |= setRetNoUndef(F);
Changed |= setWillReturn(F);
return Changed;
- case LibFunc_read:
- // May throw; "read" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_rewind:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_rmdir:
- case LibFunc_remove:
- case LibFunc_realpath:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_rename:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_readlink:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_write:
- // May throw; "write" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_aligned_alloc:
+ case LibFunc_read:
+ // May throw; "read" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_rewind:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_rmdir:
+ case LibFunc_remove:
+ case LibFunc_realpath:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_rename:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_readlink:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_write:
+ // May throw; "write" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_aligned_alloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_bcopy:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_bcopy:
+ Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyWritesMemory(F, 1);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_bcmp:
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_bcmp:
+ Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
+ Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_bzero:
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_bzero:
+ Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- return Changed;
- case LibFunc_calloc:
+ return Changed;
+ case LibFunc_calloc:
case LibFunc_vec_calloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_chmod:
- case LibFunc_chown:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_ctermid:
- case LibFunc_clearerr:
- case LibFunc_closedir:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_atoi:
- case LibFunc_atol:
- case LibFunc_atof:
- case LibFunc_atoll:
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
+ return Changed;
+ case LibFunc_chmod:
+ case LibFunc_chown:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_ctermid:
+ case LibFunc_clearerr:
+ case LibFunc_closedir:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atof:
+ case LibFunc_atoll:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_access:
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_access:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_fopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_fopen:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fdopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fdopen:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_feof:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_feof:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc_free:
+ case LibFunc_free:
case LibFunc_vec_free:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setArgsNoUndef(F);
@@ -612,411 +612,411 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc_fseek:
- case LibFunc_ftell:
- case LibFunc_fgetc:
- case LibFunc_fgetc_unlocked:
- case LibFunc_fseeko:
- case LibFunc_ftello:
- case LibFunc_fileno:
- case LibFunc_fflush:
- case LibFunc_fclose:
- case LibFunc_fsetpos:
- case LibFunc_flockfile:
- case LibFunc_funlockfile:
- case LibFunc_ftrylockfile:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_ferror:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F);
- return Changed;
- case LibFunc_fputc:
- case LibFunc_fputc_unlocked:
- case LibFunc_fstat:
+ case LibFunc_fseek:
+ case LibFunc_ftell:
+ case LibFunc_fgetc:
+ case LibFunc_fgetc_unlocked:
+ case LibFunc_fseeko:
+ case LibFunc_ftello:
+ case LibFunc_fileno:
+ case LibFunc_fflush:
+ case LibFunc_fclose:
+ case LibFunc_fsetpos:
+ case LibFunc_flockfile:
+ case LibFunc_funlockfile:
+ case LibFunc_ftrylockfile:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_ferror:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F);
+ return Changed;
+ case LibFunc_fputc:
+ case LibFunc_fputc_unlocked:
+ case LibFunc_fstat:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc_frexp:
- case LibFunc_frexpf:
- case LibFunc_frexpl:
+ case LibFunc_frexp:
+ case LibFunc_frexpf:
+ case LibFunc_frexpl:
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc_fstatvfs:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_fgets:
- case LibFunc_fgets_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc_fread:
- case LibFunc_fread_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 3);
- return Changed;
- case LibFunc_fwrite:
- case LibFunc_fwrite_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 3);
- // FIXME: readonly #1?
- return Changed;
- case LibFunc_fputs:
- case LibFunc_fputs_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_fscanf:
- case LibFunc_fprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fgetpos:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_getc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_getlogin_r:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_getc_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_getenv:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_gets:
- case LibFunc_getchar:
- case LibFunc_getchar_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc_getitimer:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_getpwnam:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_ungetc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_uname:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_unlink:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_unsetenv:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_utime:
- case LibFunc_utimes:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_putc:
- case LibFunc_putc_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_puts:
- case LibFunc_printf:
- case LibFunc_perror:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_pread:
- // May throw; "pread" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_pwrite:
- // May throw; "pwrite" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_putchar:
- case LibFunc_putchar_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc_popen:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_pclose:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_vscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_vsscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_vfscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_valloc:
+ case LibFunc_fstatvfs:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_fgets:
+ case LibFunc_fgets_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc_fread:
+ case LibFunc_fread_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc_fwrite:
+ case LibFunc_fwrite_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 3);
+ // FIXME: readonly #1?
+ return Changed;
+ case LibFunc_fputs:
+ case LibFunc_fputs_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_fscanf:
+ case LibFunc_fprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fgetpos:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_getc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_getlogin_r:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_getc_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_getenv:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_gets:
+ case LibFunc_getchar:
+ case LibFunc_getchar_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_getitimer:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_getpwnam:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_ungetc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_uname:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_unlink:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_unsetenv:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_utime:
+ case LibFunc_utimes:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_putc:
+ case LibFunc_putc_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_puts:
+ case LibFunc_printf:
+ case LibFunc_perror:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_pread:
+ // May throw; "pread" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_pwrite:
+ // May throw; "pwrite" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_putchar:
+ case LibFunc_putchar_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_popen:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_pclose:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_vscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_vsscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_vfscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_valloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_vprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_vfprintf:
- case LibFunc_vsprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_vsnprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc_open:
- // May throw; "open" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_opendir:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_tmpfile:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- return Changed;
- case LibFunc_times:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_htonl:
- case LibFunc_htons:
- case LibFunc_ntohl:
- case LibFunc_ntohs:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAccessMemory(F);
- return Changed;
- case LibFunc_lstat:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_lchown:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_qsort:
- // May throw; places call through function pointer.
+ return Changed;
+ case LibFunc_vprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_vfprintf:
+ case LibFunc_vsprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_vsnprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc_open:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_opendir:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_tmpfile:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_times:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_htonl:
+ case LibFunc_htons:
+ case LibFunc_ntohl:
+ case LibFunc_ntohs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAccessMemory(F);
+ return Changed;
+ case LibFunc_lstat:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_lchown:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_qsort:
+ // May throw; places call through function pointer.
// Cannot give undef pointer/size
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 3);
- return Changed;
- case LibFunc_dunder_strdup:
- case LibFunc_dunder_strndup:
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc_dunder_strdup:
+ case LibFunc_dunder_strndup:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_dunder_strtok_r:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_under_IO_getc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_under_IO_putc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_dunder_isoc99_scanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_stat64:
- case LibFunc_lstat64:
- case LibFunc_statvfs64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_dunder_isoc99_sscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fopen64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fseeko64:
- case LibFunc_ftello64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_tmpfile64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- return Changed;
- case LibFunc_fstat64:
- case LibFunc_fstatvfs64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_open64:
- // May throw; "open" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_gettimeofday:
- // Currently some platforms have the restrict keyword on the arguments to
- // gettimeofday. To be conservative, do not add noalias to gettimeofday's
- // arguments.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- // TODO: add LibFunc entries for:
- // case LibFunc_memset_pattern4:
- // case LibFunc_memset_pattern8:
- case LibFunc_memset_pattern16:
- Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_dunder_strtok_r:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_under_IO_getc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_under_IO_putc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_dunder_isoc99_scanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_stat64:
+ case LibFunc_lstat64:
+ case LibFunc_statvfs64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_dunder_isoc99_sscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fopen64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fseeko64:
+ case LibFunc_ftello64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_tmpfile64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_fstat64:
+ case LibFunc_fstatvfs64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_open64:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_gettimeofday:
+ // Currently some platforms have the restrict keyword on the arguments to
+ // gettimeofday. To be conservative, do not add noalias to gettimeofday's
+ // arguments.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ // TODO: add LibFunc entries for:
+ // case LibFunc_memset_pattern4:
+ // case LibFunc_memset_pattern8:
+ case LibFunc_memset_pattern16:
+ Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
case LibFunc_memset:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotThrow(F);
Changed |= setOnlyWritesMemory(F, 0);
return Changed;
- // int __nvvm_reflect(const char *)
- case LibFunc_nvvm_reflect:
+ // int __nvvm_reflect(const char *)
+ case LibFunc_nvvm_reflect:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotAccessMemory(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
+ Changed |= setDoesNotAccessMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
case LibFunc_ldexp:
case LibFunc_ldexpf:
case LibFunc_ldexpl:
@@ -1150,154 +1150,154 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotFreeMemory(F);
Changed |= setWillReturn(F);
return Changed;
- default:
- // FIXME: It'd be really nice to cover all the library functions we're
- // aware of here.
- return false;
- }
-}
-
-bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) {
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- return false;
- case Type::FloatTyID:
- return TLI->has(FloatFn);
- case Type::DoubleTyID:
- return TLI->has(DoubleFn);
- default:
- return TLI->has(LongDoubleFn);
- }
-}
-
-StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn) {
- assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
- "Cannot get name for unavailable function!");
-
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- llvm_unreachable("No name for HalfTy!");
- case Type::FloatTyID:
- return TLI->getName(FloatFn);
- case Type::DoubleTyID:
- return TLI->getName(DoubleFn);
- default:
- return TLI->getName(LongDoubleFn);
- }
-}
-
-//- Emit LibCalls ------------------------------------------------------------//
-
-Value *llvm::castToCStr(Value *V, IRBuilderBase &B) {
- unsigned AS = V->getType()->getPointerAddressSpace();
- return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
-}
-
-static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType,
- ArrayRef<Type *> ParamTypes,
- ArrayRef<Value *> Operands, IRBuilderBase &B,
- const TargetLibraryInfo *TLI,
- bool IsVaArgs = false) {
- if (!TLI->has(TheLibFunc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FuncName = TLI->getName(TheLibFunc);
- FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs);
- FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType);
- inferLibFuncAttributes(M, FuncName, *TLI);
- CallInst *CI = B.CreateCall(Callee, Operands, FuncName);
- if (const Function *F =
- dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(LibFunc_strlen, DL.getIntPtrType(Context),
- B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI);
-}
-
-Value *llvm::emitStrDup(Value *Ptr, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(),
- castToCStr(Ptr, B), B, TLI);
-}
-
-Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- Type *I32Ty = B.getInt32Ty();
- return emitLibCall(LibFunc_strchr, I8Ptr, {I8Ptr, I32Ty},
- {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, B, TLI);
-}
-
-Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_strncmp, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
-}
-
-Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_strcpy, I8Ptr, {I8Ptr, I8Ptr},
- {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
-}
-
-Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_stpcpy, I8Ptr, {I8Ptr, I8Ptr},
- {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
-}
-
-Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_strncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
- {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
-}
-
-Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_stpncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
- {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
-}
-
-Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilderBase &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_memcpy_chk))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- AttributeList AS;
- AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- FunctionCallee MemCpy = M->getOrInsertFunction(
- "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
- B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
- DL.getIntPtrType(Context));
- Dst = castToCStr(Dst, B);
- Src = castToCStr(Src, B);
- CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
- if (const Function *F =
- dyn_cast<Function>(MemCpy.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
+ default:
+ // FIXME: It'd be really nice to cover all the library functions we're
+ // aware of here.
+ return false;
+ }
+}
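// Illustrative sketch of the effect of the inference above: for the unlink()
// case handled earlier in this switch, the declaration ends up annotated
// roughly as follows in LLVM 12 textual IR (the exact attribute spelling may
// vary between releases):
//
//   declare noundef i32 @unlink(i8* nocapture noundef readonly) nounwind
//
// i.e. setRetAndArgsNoUndef adds noundef to the return value and arguments,
// setDoesNotThrow adds nounwind, setDoesNotCapture(F, 0) adds nocapture to
// argument 0, and setOnlyReadsMemory(F, 0) adds readonly to argument 0.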
+
+bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) {
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ return false;
+ case Type::FloatTyID:
+ return TLI->has(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->has(DoubleFn);
+ default:
+ return TLI->has(LongDoubleFn);
+ }
+}
+
+StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn) {
+ assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
+ "Cannot get name for unavailable function!");
+
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ llvm_unreachable("No name for HalfTy!");
+ case Type::FloatTyID:
+ return TLI->getName(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->getName(DoubleFn);
+ default:
+ return TLI->getName(LongDoubleFn);
+ }
+}
+
+//- Emit LibCalls ------------------------------------------------------------//
+
+Value *llvm::castToCStr(Value *V, IRBuilderBase &B) {
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
+}
+
+static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType,
+ ArrayRef<Type *> ParamTypes,
+ ArrayRef<Value *> Operands, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI,
+ bool IsVaArgs = false) {
+ if (!TLI->has(TheLibFunc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FuncName = TLI->getName(TheLibFunc);
+ FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs);
+ FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType);
+ inferLibFuncAttributes(M, FuncName, *TLI);
+ CallInst *CI = B.CreateCall(Callee, Operands, FuncName);
+ if (const Function *F =
+ dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(LibFunc_strlen, DL.getIntPtrType(Context),
+ B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI);
+}
+
+Value *llvm::emitStrDup(Value *Ptr, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(),
+ castToCStr(Ptr, B), B, TLI);
+}
+
+Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ Type *I32Ty = B.getInt32Ty();
+ return emitLibCall(LibFunc_strchr, I8Ptr, {I8Ptr, I32Ty},
+ {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, B, TLI);
+}
+
+Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_strncmp, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
+}
+
+Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_strcpy, I8Ptr, {I8Ptr, I8Ptr},
+ {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
+}
+
+Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_stpcpy, I8Ptr, {I8Ptr, I8Ptr},
+ {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
+}
+
+Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_strncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
+ {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
+}
+
+Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_stpncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
+ {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
+}
+
+Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
+ IRBuilderBase &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_memcpy_chk))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ AttributeList AS;
+ AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ FunctionCallee MemCpy = M->getOrInsertFunction(
+ "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
+ DL.getIntPtrType(Context));
+ Dst = castToCStr(Dst, B);
+ Src = castToCStr(Src, B);
+ CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
+ if (const Function *F =
+ dyn_cast<Function>(MemCpy.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
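// For reference, the call emitted above targets the usual fortify-source
// entry point, commonly declared in C as
//
//   void *__memcpy_chk(void *dest, const void *src, size_t len, size_t destlen);
//
// which returns dest and aborts at runtime if len exceeds destlen.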
+
Value *llvm::emitMemPCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
LLVMContext &Context = B.GetInsertBlock()->getContext();
@@ -1307,351 +1307,351 @@ Value *llvm::emitMemPCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
{Dst, Src, Len}, B, TLI);
}
-Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_memchr, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr, B), Val, Len}, B, TLI);
-}
-
-Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_memcmp, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
-}
-
-Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_bcmp, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
-}
-
-Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
- IRBuilderBase &B, const TargetLibraryInfo *TLI) {
- return emitLibCall(
- LibFunc_memccpy, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), Len->getType()},
- {Ptr1, Ptr2, Val, Len}, B, TLI);
-}
-
-Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt,
- ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- SmallVector<Value *, 8> Args{castToCStr(Dest, B), Size, castToCStr(Fmt, B)};
+Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_memchr, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr, B), Val, Len}, B, TLI);
+}
+
+Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_memcmp, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
+}
+
+Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_bcmp, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
+}
+
+Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
+ return emitLibCall(
+ LibFunc_memccpy, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), Len->getType()},
+ {Ptr1, Ptr2, Val, Len}, B, TLI);
+}
+
+Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt,
+ ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<Value *, 8> Args{castToCStr(Dest, B), Size, castToCStr(Fmt, B)};
llvm::append_range(Args, VariadicArgs);
- return emitLibCall(LibFunc_snprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy()},
- Args, B, TLI, /*IsVaArgs=*/true);
-}
-
-Value *llvm::emitSPrintf(Value *Dest, Value *Fmt,
- ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- SmallVector<Value *, 8> Args{castToCStr(Dest, B), castToCStr(Fmt, B)};
+ return emitLibCall(LibFunc_snprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy()},
+ Args, B, TLI, /*IsVaArgs=*/true);
+}
+
+Value *llvm::emitSPrintf(Value *Dest, Value *Fmt,
+ ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<Value *, 8> Args{castToCStr(Dest, B), castToCStr(Fmt, B)};
llvm::append_range(Args, VariadicArgs);
- return emitLibCall(LibFunc_sprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy()}, Args, B, TLI,
- /*IsVaArgs=*/true);
-}
-
-Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strcat, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt8PtrTy()},
- {castToCStr(Dest, B), castToCStr(Src, B)}, B, TLI);
-}
-
-Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strlcpy, Size->getType(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
- {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
-}
-
-Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strlcat, Size->getType(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
- {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
-}
-
-Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strncat, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
- {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
-}
-
-Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList,
- IRBuilderBase &B, const TargetLibraryInfo *TLI) {
- return emitLibCall(
- LibFunc_vsnprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy(), VAList->getType()},
- {castToCStr(Dest, B), Size, castToCStr(Fmt, B), VAList}, B, TLI);
-}
-
-Value *llvm::emitVSPrintf(Value *Dest, Value *Fmt, Value *VAList,
- IRBuilderBase &B, const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_vsprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), VAList->getType()},
- {castToCStr(Dest, B), castToCStr(Fmt, B), VAList}, B, TLI);
-}
-
-/// Append a suffix to the function name according to the type of 'Op'.
-static void appendTypeSuffix(Value *Op, StringRef &Name,
- SmallString<20> &NameBuffer) {
- if (!Op->getType()->isDoubleTy()) {
- NameBuffer += Name;
-
- if (Op->getType()->isFloatTy())
- NameBuffer += 'f';
- else
- NameBuffer += 'l';
-
- Name = NameBuffer;
- }
-}
-
-static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
- IRBuilderBase &B,
- const AttributeList &Attrs) {
- assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
-
- Module *M = B.GetInsertBlock()->getModule();
- FunctionCallee Callee =
- M->getOrInsertFunction(Name, Op->getType(), Op->getType());
- CallInst *CI = B.CreateCall(Callee, Op, Name);
-
- // The incoming attribute set may have come from a speculatable intrinsic, but
- // is being replaced with a library call which is not allowed to be
- // speculatable.
- CI->setAttributes(Attrs.removeAttribute(B.getContext(),
- AttributeList::FunctionIndex,
- Attribute::Speculatable));
- if (const Function *F =
- dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
-Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B,
- const AttributeList &Attrs) {
- SmallString<20> NameBuffer;
- appendTypeSuffix(Op, Name, NameBuffer);
-
- return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
-}
-
-Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn, IRBuilderBase &B,
- const AttributeList &Attrs) {
- // Get the name of the function according to TLI.
- StringRef Name = getFloatFnName(TLI, Op->getType(),
- DoubleFn, FloatFn, LongDoubleFn);
-
- return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
-}
-
-static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
- StringRef Name, IRBuilderBase &B,
+ return emitLibCall(LibFunc_sprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy()}, Args, B, TLI,
+ /*IsVaArgs=*/true);
+}
+
+Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strcat, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy()},
+ {castToCStr(Dest, B), castToCStr(Src, B)}, B, TLI);
+}
+
+Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strlcpy, Size->getType(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
+ {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
+}
+
+Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strlcat, Size->getType(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
+ {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
+}
+
+Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strncat, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
+ {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
+}
+
+Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList,
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
+ return emitLibCall(
+ LibFunc_vsnprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy(), VAList->getType()},
+ {castToCStr(Dest, B), Size, castToCStr(Fmt, B), VAList}, B, TLI);
+}
+
+Value *llvm::emitVSPrintf(Value *Dest, Value *Fmt, Value *VAList,
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_vsprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), VAList->getType()},
+ {castToCStr(Dest, B), castToCStr(Fmt, B), VAList}, B, TLI);
+}
+
+/// Append a suffix to the function name according to the type of 'Op'.
+static void appendTypeSuffix(Value *Op, StringRef &Name,
+ SmallString<20> &NameBuffer) {
+ if (!Op->getType()->isDoubleTy()) {
+ NameBuffer += Name;
+
+ if (Op->getType()->isFloatTy())
+ NameBuffer += 'f';
+ else
+ NameBuffer += 'l';
+
+ Name = NameBuffer;
+ }
+}
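// Example of the suffixing above: with Name == "tan", a float operand yields
// "tanf", a double operand leaves the name as "tan", and any other type
// (e.g. long double) yields "tanl".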
+
+static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
+ IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
+
+ Module *M = B.GetInsertBlock()->getModule();
+ FunctionCallee Callee =
+ M->getOrInsertFunction(Name, Op->getType(), Op->getType());
+ CallInst *CI = B.CreateCall(Callee, Op, Name);
+
+ // The incoming attribute set may have come from a speculatable intrinsic, but
+ // is being replaced with a library call which is not allowed to be
+ // speculatable.
+ CI->setAttributes(Attrs.removeAttribute(B.getContext(),
+ AttributeList::FunctionIndex,
+ Attribute::Speculatable));
+ if (const Function *F =
+ dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ SmallString<20> NameBuffer;
+ appendTypeSuffix(Op, Name, NameBuffer);
+
+ return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
+Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn, IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ // Get the name of the function according to TLI.
+ StringRef Name = getFloatFnName(TLI, Op->getType(),
+ DoubleFn, FloatFn, LongDoubleFn);
+
+ return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
+static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
+ StringRef Name, IRBuilderBase &B,
const AttributeList &Attrs,
const TargetLibraryInfo *TLI = nullptr) {
- assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
-
- Module *M = B.GetInsertBlock()->getModule();
- FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(),
- Op1->getType(), Op2->getType());
+ assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
+
+ Module *M = B.GetInsertBlock()->getModule();
+ FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(),
+ Op1->getType(), Op2->getType());
if (TLI != nullptr)
inferLibFuncAttributes(M, Name, *TLI);
- CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name);
-
- // The incoming attribute set may have come from a speculatable intrinsic, but
- // is being replaced with a library call which is not allowed to be
- // speculatable.
- CI->setAttributes(Attrs.removeAttribute(B.getContext(),
- AttributeList::FunctionIndex,
- Attribute::Speculatable));
- if (const Function *F =
- dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
-Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
- IRBuilderBase &B,
- const AttributeList &Attrs) {
- assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
-
- SmallString<20> NameBuffer;
- appendTypeSuffix(Op1, Name, NameBuffer);
-
- return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs);
-}
-
-Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
- const TargetLibraryInfo *TLI,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn, IRBuilderBase &B,
- const AttributeList &Attrs) {
- // Get the name of the function according to TLI.
- StringRef Name = getFloatFnName(TLI, Op1->getType(),
- DoubleFn, FloatFn, LongDoubleFn);
-
+ CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name);
+
+ // The incoming attribute set may have come from a speculatable intrinsic, but
+ // is being replaced with a library call which is not allowed to be
+ // speculatable.
+ CI->setAttributes(Attrs.removeAttribute(B.getContext(),
+ AttributeList::FunctionIndex,
+ Attribute::Speculatable));
+ if (const Function *F =
+ dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
+ IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
+
+ SmallString<20> NameBuffer;
+ appendTypeSuffix(Op1, Name, NameBuffer);
+
+ return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs);
+}
+
+Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
+ const TargetLibraryInfo *TLI,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn, IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ // Get the name of the function according to TLI.
+ StringRef Name = getFloatFnName(TLI, Op1->getType(),
+ DoubleFn, FloatFn, LongDoubleFn);
+
return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs, TLI);
-}
-
-Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_putchar))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef PutCharName = TLI->getName(LibFunc_putchar);
- FunctionCallee PutChar =
- M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty());
- inferLibFuncAttributes(M, PutCharName, *TLI);
- CallInst *CI = B.CreateCall(PutChar,
- B.CreateIntCast(Char,
- B.getInt32Ty(),
- /*isSigned*/true,
- "chari"),
- PutCharName);
-
- if (const Function *F =
- dyn_cast<Function>(PutChar.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_puts))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef PutsName = TLI->getName(LibFunc_puts);
- FunctionCallee PutS =
- M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy());
- inferLibFuncAttributes(M, PutsName, *TLI);
- CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName);
- if (const Function *F =
- dyn_cast<Function>(PutS.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutcName = TLI->getName(LibFunc_fputc);
- FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(),
- B.getInt32Ty(), File->getType());
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutcName, *TLI);
- Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
- "chari");
- CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputs))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutsName = TLI->getName(LibFunc_fputs);
- FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
- B.getInt8PtrTy(), File->getType());
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutsName, *TLI);
- CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fwrite))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- StringRef FWriteName = TLI->getName(LibFunc_fwrite);
- FunctionCallee F = M->getOrInsertFunction(
- FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
-
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FWriteName, *TLI);
- CallInst *CI =
- B.CreateCall(F, {castToCStr(Ptr, B), Size,
- ConstantInt::get(DL.getIntPtrType(Context), 1), File});
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_malloc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef MallocName = TLI->getName(LibFunc_malloc);
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(),
- DL.getIntPtrType(Context));
- inferLibFuncAttributes(M, MallocName, *TLI);
- CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
-
- if (const Function *F =
- dyn_cast<Function>(Malloc.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
-Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
- IRBuilderBase &B, const TargetLibraryInfo &TLI) {
- if (!TLI.has(LibFunc_calloc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef CallocName = TLI.getName(LibFunc_calloc);
- const DataLayout &DL = M->getDataLayout();
- IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
- FunctionCallee Calloc = M->getOrInsertFunction(
- CallocName, Attrs, B.getInt8PtrTy(), PtrType, PtrType);
- inferLibFuncAttributes(M, CallocName, TLI);
- CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
-
- if (const auto *F =
- dyn_cast<Function>(Calloc.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
+}
+
+Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_putchar))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef PutCharName = TLI->getName(LibFunc_putchar);
+ FunctionCallee PutChar =
+ M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty());
+ inferLibFuncAttributes(M, PutCharName, *TLI);
+ CallInst *CI = B.CreateCall(PutChar,
+ B.CreateIntCast(Char,
+ B.getInt32Ty(),
+ /*isSigned*/true,
+ "chari"),
+ PutCharName);
+
+ if (const Function *F =
+ dyn_cast<Function>(PutChar.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_puts))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef PutsName = TLI->getName(LibFunc_puts);
+ FunctionCallee PutS =
+ M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy());
+ inferLibFuncAttributes(M, PutsName, *TLI);
+ CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName);
+ if (const Function *F =
+ dyn_cast<Function>(PutS.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FPutcName = TLI->getName(LibFunc_fputc);
+ FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(),
+ B.getInt32Ty(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(M, FPutcName, *TLI);
+ Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
+ "chari");
+ CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName);
+
+ if (const Function *Fn =
+ dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputs))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FPutsName = TLI->getName(LibFunc_fputs);
+ FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
+ B.getInt8PtrTy(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(M, FPutsName, *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName);
+
+ if (const Function *Fn =
+ dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fwrite))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ StringRef FWriteName = TLI->getName(LibFunc_fwrite);
+ FunctionCallee F = M->getOrInsertFunction(
+ FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(M, FWriteName, *TLI);
+ CallInst *CI =
+ B.CreateCall(F, {castToCStr(Ptr, B), Size,
+ ConstantInt::get(DL.getIntPtrType(Context), 1), File});
+
+ if (const Function *Fn =
+ dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_malloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef MallocName = TLI->getName(LibFunc_malloc);
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(),
+ DL.getIntPtrType(Context));
+ inferLibFuncAttributes(M, MallocName, *TLI);
+ CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
+
+ if (const Function *F =
+ dyn_cast<Function>(Malloc.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
+ IRBuilderBase &B, const TargetLibraryInfo &TLI) {
+ if (!TLI.has(LibFunc_calloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef CallocName = TLI.getName(LibFunc_calloc);
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
+ FunctionCallee Calloc = M->getOrInsertFunction(
+ CallocName, Attrs, B.getInt8PtrTy(), PtrType, PtrType);
+ inferLibFuncAttributes(M, CallocName, TLI);
+ CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
+
+ if (const auto *F =
+ dyn_cast<Function>(Calloc.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
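// Minimal usage sketch for the emitters in this file, assuming an IRBuilderBase
// B positioned at the insertion point and DL/TLI supplied by the surrounding
// pass (the variable names here are illustrative):
//
//   if (Value *Len = emitStrLen(Str, B, DL, &TLI))
//     ... use Len (the emitted strlen() call) ...
//
// Each emitter returns nullptr when TargetLibraryInfo reports the library
// routine as unavailable, so callers must be prepared to bail out.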
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp
index 4299153e7b..833d042106 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -1,482 +1,482 @@
-//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains an optimization for div and rem on architectures that
-// execute short instructions significantly faster than longer instructions.
-// For example, on Intel Atom 32-bit divides are slow enough that during
-// runtime it is profitable to check the value of the operands, and if they are
-// positive and less than 256 use an unsigned 8-bit divide.
-//
-//===----------------------------------------------------------------------===//
-
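// A minimal sketch of the shape this pass aims to produce, written as C for
// clarity and assuming a 32-bit division bypassed through an 8-bit type:
//
//   uint32_t div(uint32_t a, uint32_t b) {
//     if (((a | b) & ~0xFFu) == 0)        // both operands fit in 8 bits
//       return (uint8_t)a / (uint8_t)b;   // cheap narrow divide
//     return a / b;                       // full-width divide
//   }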
-#include "llvm/Transforms/Utils/BypassSlowDivision.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/KnownBits.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "bypass-slow-division"
-
-namespace {
-
- struct QuotRemPair {
- Value *Quotient;
- Value *Remainder;
-
- QuotRemPair(Value *InQuotient, Value *InRemainder)
- : Quotient(InQuotient), Remainder(InRemainder) {}
- };
-
- /// A quotient and remainder, plus a BB from which they logically "originate".
- /// If you use Quotient or Remainder in a Phi node, you should use BB as its
- /// corresponding predecessor.
- struct QuotRemWithBB {
- BasicBlock *BB = nullptr;
- Value *Quotient = nullptr;
- Value *Remainder = nullptr;
- };
-
-using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
-using BypassWidthsTy = DenseMap<unsigned, unsigned>;
-using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
-
-enum ValueRange {
- /// Operand definitely fits into BypassType. No runtime checks are needed.
- VALRNG_KNOWN_SHORT,
- /// A runtime check is required, as value range is unknown.
- VALRNG_UNKNOWN,
- /// Operand is unlikely to fit into BypassType. The bypassing should be
- /// disabled.
- VALRNG_LIKELY_LONG
-};
-
-class FastDivInsertionTask {
- bool IsValidTask = false;
- Instruction *SlowDivOrRem = nullptr;
- IntegerType *BypassType = nullptr;
- BasicBlock *MainBB = nullptr;
-
- bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
- ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
- QuotRemWithBB createSlowBB(BasicBlock *Successor);
- QuotRemWithBB createFastBB(BasicBlock *Successor);
- QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
- BasicBlock *PhiBB);
- Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
- Optional<QuotRemPair> insertFastDivAndRem();
-
- bool isSignedOp() {
- return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
- SlowDivOrRem->getOpcode() == Instruction::SRem;
- }
-
- bool isDivisionOp() {
- return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
- SlowDivOrRem->getOpcode() == Instruction::UDiv;
- }
-
- Type *getSlowType() { return SlowDivOrRem->getType(); }
-
-public:
- FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
-
- Value *getReplacement(DivCacheTy &Cache);
-};
-
-} // end anonymous namespace
-
-FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
- const BypassWidthsTy &BypassWidths) {
- switch (I->getOpcode()) {
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- SlowDivOrRem = I;
- break;
- default:
- // I is not a div/rem operation.
- return;
- }
-
- // Skip division on vector types. Only optimize integer instructions.
- IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
- if (!SlowType)
- return;
-
- // Skip if this bitwidth is not bypassed.
- auto BI = BypassWidths.find(SlowType->getBitWidth());
- if (BI == BypassWidths.end())
- return;
-
- // Get type for div/rem instruction with bypass bitwidth.
- IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
- BypassType = BT;
-
- // The original basic block.
- MainBB = I->getParent();
-
- // The instruction is indeed a slow div or rem operation.
- IsValidTask = true;
-}
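// Illustrative example: with BypassWidths = {{32, 8}}, a 32-bit udiv/sdiv/
// urem/srem becomes a valid task and BypassType is set to i8, while a 64-bit
// division is skipped because 64 is not a key in the map.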
-
-/// Reuses the previously-computed quotient or remainder from the current BB if
-/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
-/// perform the optimization and caches the resulting quotient and remainder.
-/// If no replacement can be generated, nullptr is returned.
-Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
- // First, make sure that the task is valid.
- if (!IsValidTask)
- return nullptr;
-
- // Then, look for a value in Cache.
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
- DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
- auto CacheI = Cache.find(Key);
-
- if (CacheI == Cache.end()) {
- // If previous instance does not exist, try to insert fast div.
- Optional<QuotRemPair> OptResult = insertFastDivAndRem();
- // Bail out if insertFastDivAndRem has failed.
- if (!OptResult)
- return nullptr;
- CacheI = Cache.insert({Key, *OptResult}).first;
- }
-
- QuotRemPair &Value = CacheI->second;
- return isDivisionOp() ? Value.Quotient : Value.Remainder;
-}
-
-/// Check if a value looks like a hash.
-///
-/// The routine is expected to detect values computed using the most common hash
-/// algorithms. Typically, hash computations end with one of the following
-/// instructions:
-///
-/// 1) MUL with a constant wider than BypassType
-/// 2) XOR instruction
-///
-/// And even if we are wrong and the value is not a hash, it is still quite
-/// unlikely that such values will fit into BypassType.
-///
-/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
-/// It is implemented as a depth-first search for values that look neither long
-/// nor hash-like.
-bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- switch (I->getOpcode()) {
- case Instruction::Xor:
- return true;
- case Instruction::Mul: {
- // After Constant Hoisting pass, long constants may be represented as
- // bitcast instructions. As a result, some constants may look like an
- // instruction at first, and an additional check is necessary to find out if
- // an operand is actually a constant.
- Value *Op1 = I->getOperand(1);
- ConstantInt *C = dyn_cast<ConstantInt>(Op1);
- if (!C && isa<BitCastInst>(Op1))
- C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
- return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
- }
- case Instruction::PHI:
-    // Stop IR traversal in case of crazy input code. This limits recursion
- // depth.
- if (Visited.size() >= 16)
- return false;
- // Do not visit nodes that have been visited already. We return true because
- // it means that we couldn't find any value that doesn't look hash-like.
- if (!Visited.insert(I).second)
- return true;
- return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
- // Ignore undef values as they probably don't affect the division
- // operands.
- return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
- isa<UndefValue>(V);
- });
- default:
- return false;
- }
-}
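// Illustrative example of a value this heuristic flags: an FNV-style hash such
// as h = (h ^ c) * 16777619u ends in a multiply by a constant wider than an
// 8- or 16-bit BypassType, so any division it later feeds is classified as
// VALRNG_LIKELY_LONG and left on the slow path.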
-
-/// Check if an integer value fits into our bypass type.
-ValueRange FastDivInsertionTask::getValueRange(Value *V,
- VisitedSetTy &Visited) {
- unsigned ShortLen = BypassType->getBitWidth();
- unsigned LongLen = V->getType()->getIntegerBitWidth();
-
- assert(LongLen > ShortLen && "Value type must be wider than BypassType");
- unsigned HiBits = LongLen - ShortLen;
-
- const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
- KnownBits Known(LongLen);
-
- computeKnownBits(V, Known, DL);
-
- if (Known.countMinLeadingZeros() >= HiBits)
- return VALRNG_KNOWN_SHORT;
-
- if (Known.countMaxLeadingZeros() < HiBits)
- return VALRNG_LIKELY_LONG;
-
- // Long integer divisions are often used in hashtable implementations. It's
- // not worth bypassing such divisions because hash values are extremely
- // unlikely to have enough leading zeros. The call below tries to detect
- // values that are unlikely to fit BypassType (including hashes).
- if (isHashLikeValue(V, Visited))
- return VALRNG_LIKELY_LONG;
-
- return VALRNG_UNKNOWN;
-}
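// Worked example: for a 32-bit operand and an 8-bit BypassType, HiBits is
// 32 - 8 = 24. A value with at least 24 known leading zero bits (e.g. one
// produced by zext i8 ... to i32) is VALRNG_KNOWN_SHORT; a value known to have
// a set bit somewhere in those top 24 bits is VALRNG_LIKELY_LONG; everything
// else falls back to VALRNG_UNKNOWN and gets the runtime check.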
-
-/// Add new basic block for slow div and rem operations and put it before
-/// SuccessorBB.
-QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
- QuotRemWithBB DivRemPair;
- DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
- MainBB->getParent(), SuccessorBB);
- IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
-
- if (isSignedOp()) {
- DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
- DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
- } else {
- DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
- DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
- }
-
- Builder.CreateBr(SuccessorBB);
- return DivRemPair;
-}
-
-/// Add new basic block for fast div and rem operations and put it before
-/// SuccessorBB.
-QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
- QuotRemWithBB DivRemPair;
- DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
- MainBB->getParent(), SuccessorBB);
- IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
- Value *ShortDivisorV =
- Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
- Value *ShortDividendV =
- Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
-
- // udiv/urem because this optimization only handles positive numbers.
- Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
- Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
- DivRemPair.Quotient =
- Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
- DivRemPair.Remainder =
- Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
- Builder.CreateBr(SuccessorBB);
-
- return DivRemPair;
-}
-
-/// Creates Phi nodes for result of Div and Rem.
-QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
- QuotRemWithBB &RHS,
- BasicBlock *PhiBB) {
- IRBuilder<> Builder(PhiBB, PhiBB->begin());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
- PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
- QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
- QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
- PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
- RemPhi->addIncoming(LHS.Remainder, LHS.BB);
- RemPhi->addIncoming(RHS.Remainder, RHS.BB);
- return QuotRemPair(QuoPhi, RemPhi);
-}
-
-/// Creates a runtime check to test whether both the divisor and dividend fit
-/// into BypassType. The check is inserted at the end of MainBB. True return
-/// value means that the operands fit. Either of the operands may be NULL if it
-/// doesn't need a runtime check.
-Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
- assert((Op1 || Op2) && "Nothing to check");
- IRBuilder<> Builder(MainBB, MainBB->end());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- Value *OrV;
- if (Op1 && Op2)
- OrV = Builder.CreateOr(Op1, Op2);
- else
- OrV = Op1 ? Op1 : Op2;
-
- // BitMask is inverted to check if the operands are
- // larger than the bypass type
- uint64_t BitMask = ~BypassType->getBitMask();
- Value *AndV = Builder.CreateAnd(OrV, BitMask);
-
- // Compare operand values
- Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
- return Builder.CreateICmpEQ(AndV, ZeroV);
-}
-
-/// Substitutes the div/rem instruction with code that checks the value of the
-/// operands and uses a shorter-faster div/rem instruction when possible.
-Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
-
- VisitedSetTy SetL;
- ValueRange DividendRange = getValueRange(Dividend, SetL);
- if (DividendRange == VALRNG_LIKELY_LONG)
- return None;
-
- VisitedSetTy SetR;
- ValueRange DivisorRange = getValueRange(Divisor, SetR);
- if (DivisorRange == VALRNG_LIKELY_LONG)
- return None;
-
- bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
- bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
-
- if (DividendShort && DivisorShort) {
- // If both operands are known to be short then just replace the long
- // division with a short one in-place. Since we're not introducing control
- // flow in this case, narrowing the division is always a win, even if the
- // divisor is a constant (and will later get replaced by a multiplication).
-
- IRBuilder<> Builder(SlowDivOrRem);
- Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
- Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
- Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
- Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
- Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
- Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
- return QuotRemPair(ExtDiv, ExtRem);
- }
-
- if (isa<ConstantInt>(Divisor)) {
- // If the divisor is a constant, DAGCombiner will convert the division into
- // a multiplication by a magic constant. It isn't clear if it is worth
- // introducing control flow to get a narrower multiply.
- return None;
- }
-
- // After Constant Hoisting pass, long constants may be represented as
- // bitcast instructions. As a result, some constants may look like an
- // instruction at first, and an additional check is necessary to find out if
- // an operand is actually a constant.
- if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
- if (BCI->getParent() == SlowDivOrRem->getParent() &&
- isa<ConstantInt>(BCI->getOperand(0)))
- return None;
-
- IRBuilder<> Builder(MainBB, MainBB->end());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- if (DividendShort && !isSignedOp()) {
- // If the division is unsigned and Dividend is known to be short, then
- // either
- // 1) Divisor is less or equal to Dividend, and the result can be computed
- // with a short division.
- // 2) Divisor is greater than Dividend. In this case, no division is needed
- // at all: The quotient is 0 and the remainder is equal to Dividend.
- //
- // So instead of checking at runtime whether Divisor fits into BypassType,
- // we emit a runtime check to differentiate between these two cases. This
- // lets us entirely avoid a long div.
-
- // Split the basic block before the div/rem.
- BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
- // Remove the unconditional branch from MainBB to SuccessorBB.
- MainBB->getInstList().back().eraseFromParent();
- QuotRemWithBB Long;
- Long.BB = MainBB;
- Long.Quotient = ConstantInt::get(getSlowType(), 0);
- Long.Remainder = Dividend;
- QuotRemWithBB Fast = createFastBB(SuccessorBB);
- QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
- Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
- Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
- return Result;
- } else {
- // General case. Create both slow and fast div/rem pairs and choose one of
- // them at runtime.
-
- // Split the basic block before the div/rem.
- BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
- // Remove the unconditional branch from MainBB to SuccessorBB.
- MainBB->getInstList().back().eraseFromParent();
- QuotRemWithBB Fast = createFastBB(SuccessorBB);
- QuotRemWithBB Slow = createSlowBB(SuccessorBB);
- QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
- Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
- DivisorShort ? nullptr : Divisor);
- Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
- return Result;
- }
-}
-
-/// This optimization identifies DIV/REM instructions in a BB that can be
-/// profitably bypassed and carried out with a shorter, faster divide.
-bool llvm::bypassSlowDivision(BasicBlock *BB,
- const BypassWidthsTy &BypassWidths) {
- DivCacheTy PerBBDivCache;
-
- bool MadeChange = false;
- Instruction *Next = &*BB->begin();
- while (Next != nullptr) {
- // We may add instructions immediately after I, but we want to skip over
- // them.
- Instruction *I = Next;
- Next = Next->getNextNode();
-
- // Ignore dead code to save time and avoid bugs.
- if (I->hasNUses(0))
- continue;
-
- FastDivInsertionTask Task(I, BypassWidths);
- if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
- I->replaceAllUsesWith(Replacement);
- I->eraseFromParent();
- MadeChange = true;
- }
- }
-
- // Above we eagerly create divs and rems, as pairs, so that we can efficiently
- // create divrem machine instructions. Now erase any unused divs / rems so we
- // don't leave extra instructions sitting around.
- for (auto &KV : PerBBDivCache)
- for (Value *V : {KV.second.Quotient, KV.second.Remainder})
- RecursivelyDeleteTriviallyDeadInstructions(V);
-
- return MadeChange;
-}
+//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom 32-bit divides are slow enough that it is
+// profitable to check the values of the operands at runtime and, if both are
+// positive and less than 256, use an unsigned 8-bit divide instead.
+//
+//===----------------------------------------------------------------------===//
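As a rough, standalone illustration (not part of this file) of the control flow the pass emits, the following C++ sketch mirrors the runtime check and the two paths for a 32-bit unsigned division with an 8-bit bypass width; the function name and the 0xFF mask are assumptions chosen for the example:

static unsigned bypassedUDiv32(unsigned Dividend, unsigned Divisor) {
  // Fast path: both operands fit into 8 bits, so an 8-bit divide suffices.
  if (((Dividend | Divisor) & ~0xFFu) == 0)
    return (unsigned char)Dividend / (unsigned char)Divisor;
  // Slow path: fall back to the full-width divide.
  return Dividend / Divisor;
}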
+
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/KnownBits.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bypass-slow-division"
+
+namespace {
+
+ struct QuotRemPair {
+ Value *Quotient;
+ Value *Remainder;
+
+ QuotRemPair(Value *InQuotient, Value *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+
+ /// A quotient and remainder, plus a BB from which they logically "originate".
+ /// If you use Quotient or Remainder in a Phi node, you should use BB as its
+ /// corresponding predecessor.
+ struct QuotRemWithBB {
+ BasicBlock *BB = nullptr;
+ Value *Quotient = nullptr;
+ Value *Remainder = nullptr;
+ };
+
+using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
+using BypassWidthsTy = DenseMap<unsigned, unsigned>;
+using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
+
+enum ValueRange {
+ /// Operand definitely fits into BypassType. No runtime checks are needed.
+ VALRNG_KNOWN_SHORT,
+ /// A runtime check is required, as value range is unknown.
+ VALRNG_UNKNOWN,
+ /// Operand is unlikely to fit into BypassType. The bypassing should be
+ /// disabled.
+ VALRNG_LIKELY_LONG
+};
+
+class FastDivInsertionTask {
+ bool IsValidTask = false;
+ Instruction *SlowDivOrRem = nullptr;
+ IntegerType *BypassType = nullptr;
+ BasicBlock *MainBB = nullptr;
+
+ bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
+ ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
+ QuotRemWithBB createSlowBB(BasicBlock *Successor);
+ QuotRemWithBB createFastBB(BasicBlock *Successor);
+ QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
+ BasicBlock *PhiBB);
+ Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
+ Optional<QuotRemPair> insertFastDivAndRem();
+
+ bool isSignedOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::SRem;
+ }
+
+ bool isDivisionOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::UDiv;
+ }
+
+ Type *getSlowType() { return SlowDivOrRem->getType(); }
+
+public:
+ FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+
+ Value *getReplacement(DivCacheTy &Cache);
+};
+
+} // end anonymous namespace
+
+FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
+ const BypassWidthsTy &BypassWidths) {
+ switch (I->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ SlowDivOrRem = I;
+ break;
+ default:
+ // I is not a div/rem operation.
+ return;
+ }
+
+ // Skip division on vector types. Only optimize integer instructions.
+ IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
+ if (!SlowType)
+ return;
+
+ // Skip if this bitwidth is not bypassed.
+ auto BI = BypassWidths.find(SlowType->getBitWidth());
+ if (BI == BypassWidths.end())
+ return;
+
+ // Get type for div/rem instruction with bypass bitwidth.
+ IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
+ BypassType = BT;
+
+ // The original basic block.
+ MainBB = I->getParent();
+
+ // The instruction is indeed a slow div or rem operation.
+ IsValidTask = true;
+}
+
+/// Reuses a previously-computed quotient or remainder from the current BB if
+/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
+/// perform the optimization and caches the resulting quotient and remainder.
+/// If no replacement can be generated, nullptr is returned.
+Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
+ // First, make sure that the task is valid.
+ if (!IsValidTask)
+ return nullptr;
+
+ // Then, look for a value in Cache.
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
+ auto CacheI = Cache.find(Key);
+
+ if (CacheI == Cache.end()) {
+ // If previous instance does not exist, try to insert fast div.
+ Optional<QuotRemPair> OptResult = insertFastDivAndRem();
+ // Bail out if insertFastDivAndRem has failed.
+ if (!OptResult)
+ return nullptr;
+ CacheI = Cache.insert({Key, *OptResult}).first;
+ }
+
+ QuotRemPair &Value = CacheI->second;
+ return isDivisionOp() ? Value.Quotient : Value.Remainder;
+}
+
+/// Check if a value looks like a hash.
+///
+/// The routine is expected to detect values computed using the most common hash
+/// algorithms. Typically, hash computations end with one of the following
+/// instructions:
+///
+/// 1) MUL with a constant wider than BypassType
+/// 2) XOR instruction
+///
+/// And even if we are wrong and the value is not a hash, it is still quite
+/// unlikely that such values will fit into BypassType.
+///
+/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
+/// It is implemented as a depth-first search for values that look neither long
+/// nor hash-like.
+bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Xor:
+ return true;
+ case Instruction::Mul: {
+ // After Constant Hoisting pass, long constants may be represented as
+ // bitcast instructions. As a result, some constants may look like an
+ // instruction at first, and an additional check is necessary to find out if
+ // an operand is actually a constant.
+ Value *Op1 = I->getOperand(1);
+ ConstantInt *C = dyn_cast<ConstantInt>(Op1);
+ if (!C && isa<BitCastInst>(Op1))
+ C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
+ return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
+ }
+ case Instruction::PHI:
+ // Stop IR traversal in case of a crazy input code. This limits recursion
+ // depth.
+ if (Visited.size() >= 16)
+ return false;
+ // Do not visit nodes that have been visited already. We return true because
+ // it means that we couldn't find any value that doesn't look hash-like.
+ if (!Visited.insert(I).second)
+ return true;
+ return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
+ // Ignore undef values as they probably don't affect the division
+ // operands.
+ return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
+ isa<UndefValue>(V);
+ });
+ default:
+ return false;
+ }
+}
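For intuition about what this heuristic catches, here is a self-contained example (hypothetical input, not part of this file): a classic 32-bit FNV-1a string hash. The value that later feeds a urem ends in a multiply by 16777619, a constant wider than any typical bypass type, which is exactly the "MUL with a constant wider than BypassType" case handled above:

#include <cstddef>
#include <cstdint>

static uint32_t fnv1a(const char *S, std::size_t N) {
  uint32_t H = 2166136261u;            // FNV offset basis
  for (std::size_t I = 0; I < N; ++I) {
    H ^= static_cast<unsigned char>(S[I]);
    H *= 16777619u;                    // FNV prime, wider than i8/i16
  }
  return H;
}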
+
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V,
+ VisitedSetTy &Visited) {
+ unsigned ShortLen = BypassType->getBitWidth();
+ unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+ assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+ unsigned HiBits = LongLen - ShortLen;
+
+ const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+ KnownBits Known(LongLen);
+
+ computeKnownBits(V, Known, DL);
+
+ if (Known.countMinLeadingZeros() >= HiBits)
+ return VALRNG_KNOWN_SHORT;
+
+ if (Known.countMaxLeadingZeros() < HiBits)
+ return VALRNG_LIKELY_LONG;
+
+ // Long integer divisions are often used in hashtable implementations. It's
+ // not worth bypassing such divisions because hash values are extremely
+ // unlikely to have enough leading zeros. The call below tries to detect
+ // values that are unlikely to fit BypassType (including hashes).
+ if (isHashLikeValue(V, Visited))
+ return VALRNG_LIKELY_LONG;
+
+ return VALRNG_UNKNOWN;
+}
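Outside the pass, the KNOWN_SHORT classification is just a known-bits query. A minimal sketch, assuming an integer-typed llvm::Value and the ValueTracking/KnownBits headers already pulled in above; the helper name is illustrative:

static bool knownToFitBypassType(const llvm::Value *V, unsigned ShortLen,
                                 const llvm::DataLayout &DL) {
  // True iff the top (LongLen - ShortLen) bits of V are known zero, i.e. the
  // value would be classified VALRNG_KNOWN_SHORT by getValueRange().
  unsigned LongLen = V->getType()->getIntegerBitWidth();
  llvm::KnownBits Known = llvm::computeKnownBits(V, DL);
  return LongLen > ShortLen &&
         Known.countMinLeadingZeros() >= LongLen - ShortLen;
}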
+
+/// Add new basic block for slow div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isSignedOp()) {
+ DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
+ } else {
+ DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
+ }
+
+ Builder.CreateBr(SuccessorBB);
+ return DivRemPair;
+}
+
+/// Add new basic block for fast div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ Value *ShortDivisorV =
+ Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
+ Value *ShortDividendV =
+ Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
+
+ // udiv/urem because this optimization only handles positive numbers.
+ Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
+ Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
+ DivRemPair.Quotient =
+ Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
+ DivRemPair.Remainder =
+ Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
+ Builder.CreateBr(SuccessorBB);
+
+ return DivRemPair;
+}
+
+/// Creates Phi nodes for result of Div and Rem.
+QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
+ QuotRemWithBB &RHS,
+ BasicBlock *PhiBB) {
+ IRBuilder<> Builder(PhiBB, PhiBB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+ PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
+ QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
+ QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
+ PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
+ RemPhi->addIncoming(LHS.Remainder, LHS.BB);
+ RemPhi->addIncoming(RHS.Remainder, RHS.BB);
+ return QuotRemPair(QuoPhi, RemPhi);
+}
+
+/// Creates a runtime check to test whether both the divisor and dividend fit
+/// into BypassType. The check is inserted at the end of MainBB. True return
+/// value means that the operands fit. Either of the operands may be NULL if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+ assert((Op1 || Op2) && "Nothing to check");
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ Value *OrV;
+ if (Op1 && Op2)
+ OrV = Builder.CreateOr(Op1, Op2);
+ else
+ OrV = Op1 ? Op1 : Op2;
+
+ // BitMask is inverted to check if the operands are
+ // larger than the bypass type
+ uint64_t BitMask = ~BypassType->getBitMask();
+ Value *AndV = Builder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values
+ Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
+ return Builder.CreateICmpEQ(AndV, ZeroV);
+}
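For concreteness, with an i32 slow type and an i8 bypass type, getBitMask() is 0xFF and the emitted IR reduces to the predicate below (a hedged C++ restatement, not code from this file):

#include <cstdint>

static bool operandsFitIn8Bits(uint32_t Op1, uint32_t Op2) {
  // ~0xFF truncated to 32 bits is 0xFFFFFF00: true iff the upper 24 bits of
  // both operands are clear, so the 8-bit fast path is safe to take.
  return ((Op1 | Op2) & 0xFFFFFF00u) == 0;
}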
+
+/// Substitutes the div/rem instruction with code that checks the value of the
+/// operands and uses a shorter-faster div/rem instruction when possible.
+Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ VisitedSetTy SetL;
+ ValueRange DividendRange = getValueRange(Dividend, SetL);
+ if (DividendRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ VisitedSetTy SetR;
+ ValueRange DivisorRange = getValueRange(Divisor, SetR);
+ if (DivisorRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
+ bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
+
+ if (DividendShort && DivisorShort) {
+ // If both operands are known to be short then just replace the long
+ // division with a short one in-place. Since we're not introducing control
+ // flow in this case, narrowing the division is always a win, even if the
+ // divisor is a constant (and will later get replaced by a multiplication).
+
+ IRBuilder<> Builder(SlowDivOrRem);
+ Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+ Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+ Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+ Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+ Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+ Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+ return QuotRemPair(ExtDiv, ExtRem);
+ }
+
+ if (isa<ConstantInt>(Divisor)) {
+ // If the divisor is a constant, DAGCombiner will convert the division into
+ // a multiplication by a magic constant. It isn't clear if it is worth
+ // introducing control flow to get a narrower multiply.
+ return None;
+ }
+
+ // After Constant Hoisting pass, long constants may be represented as
+ // bitcast instructions. As a result, some constants may look like an
+ // instruction at first, and an additional check is necessary to find out if
+ // an operand is actually a constant.
+ if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
+ if (BCI->getParent() == SlowDivOrRem->getParent() &&
+ isa<ConstantInt>(BCI->getOperand(0)))
+ return None;
+
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ if (DividendShort && !isSignedOp()) {
+ // If the division is unsigned and Dividend is known to be short, then
+ // either
+ // 1) Divisor is less or equal to Dividend, and the result can be computed
+ // with a short division.
+ // 2) Divisor is greater than Dividend. In this case, no division is needed
+ // at all: The quotient is 0 and the remainder is equal to Dividend.
+ //
+ // So instead of checking at runtime whether Divisor fits into BypassType,
+ // we emit a runtime check to differentiate between these two cases. This
+ // lets us entirely avoid a long div.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Long;
+ Long.BB = MainBB;
+ Long.Quotient = ConstantInt::get(getSlowType(), 0);
+ Long.Remainder = Dividend;
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+ Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+ return Result;
+ } else {
+ // General case. Create both slow and fast div/rem pairs and choose one of
+ // them at runtime.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+ Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+ DivisorShort ? nullptr : Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+ return Result;
+ }
+}
+
+/// This optimization identifies DIV/REM instructions in a BB that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(BasicBlock *BB,
+ const BypassWidthsTy &BypassWidths) {
+ DivCacheTy PerBBDivCache;
+
+ bool MadeChange = false;
+ Instruction *Next = &*BB->begin();
+ while (Next != nullptr) {
+ // We may add instructions immediately after I, but we want to skip over
+ // them.
+ Instruction *I = Next;
+ Next = Next->getNextNode();
+
+ // Ignore dead code to save time and avoid bugs.
+ if (I->hasNUses(0))
+ continue;
+
+ FastDivInsertionTask Task(I, BypassWidths);
+ if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+
+ // Above we eagerly create divs and rems, as pairs, so that we can efficiently
+ // create divrem machine instructions. Now erase any unused divs / rems so we
+ // don't leave extra instructions sitting around.
+ for (auto &KV : PerBBDivCache)
+ for (Value *V : {KV.second.Quotient, KV.second.Remainder})
+ RecursivelyDeleteTriviallyDeadInstructions(V);
+
+ return MadeChange;
+}
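A hedged sketch of how a caller is expected to drive this entry point; the block-iteration pattern mirrors CodeGenPrepare-style users, while the 32 -> 8 mapping and the wrapper name are assumptions made for the example:

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"

static bool bypassSlowDivisionInFunction(llvm::Function &F) {
  if (F.empty())
    return false;
  llvm::DenseMap<unsigned, unsigned> BypassWidths;
  BypassWidths[32] = 8; // bypass 32-bit div/rem with 8-bit udiv/urem
  bool Changed = false;
  llvm::BasicBlock *BB = &*F.begin();
  while (BB) {
    // bypassSlowDivision may split BB and insert new blocks; remember the
    // next original block first so the newly created ones are not revisited.
    llvm::BasicBlock *Next = BB->getNextNode();
    Changed |= llvm::bypassSlowDivision(BB, BypassWidths);
    BB = Next;
  }
  return Changed;
}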
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp
index f3facac06f..b2763900e1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp
@@ -1,168 +1,168 @@
-//===- CallGraphUpdater.cpp - A (lazy) call graph update helper -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file provides interfaces used to manipulate a call graph, regardless
-/// of whether it is an "old style" CallGraph or a "new style" LazyCallGraph.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-bool CallGraphUpdater::finalize() {
- if (!DeadFunctionsInComdats.empty()) {
- filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(),
- DeadFunctionsInComdats);
- DeadFunctions.append(DeadFunctionsInComdats.begin(),
- DeadFunctionsInComdats.end());
- }
-
- if (CG) {
- // First remove all references, e.g., outgoing via called functions. This is
- // necessary as we can delete functions that have circular references.
- for (Function *DeadFn : DeadFunctions) {
- DeadFn->removeDeadConstantUsers();
- CallGraphNode *DeadCGN = (*CG)[DeadFn];
- DeadCGN->removeAllCalledFunctions();
- CG->getExternalCallingNode()->removeAnyCallEdgeTo(DeadCGN);
- DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
- }
-
- // Then remove the node and function from the module.
- for (Function *DeadFn : DeadFunctions) {
- CallGraphNode *DeadCGN = CG->getOrInsertFunction(DeadFn);
- assert(DeadCGN->getNumReferences() == 0 &&
- "References should have been handled by now");
- delete CG->removeFunctionFromModule(DeadCGN);
- }
- } else {
- // This is the code path for the new lazy call graph and for the case where
- // no call graph was provided.
- for (Function *DeadFn : DeadFunctions) {
- DeadFn->removeDeadConstantUsers();
- DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
-
- if (LCG && !ReplacedFunctions.count(DeadFn)) {
- // Taken mostly from the inliner:
- LazyCallGraph::Node &N = LCG->get(*DeadFn);
- auto *DeadSCC = LCG->lookupSCC(N);
- assert(DeadSCC && DeadSCC->size() == 1 &&
- &DeadSCC->begin()->getFunction() == DeadFn);
- auto &DeadRC = DeadSCC->getOuterRefSCC();
-
- FunctionAnalysisManager &FAM =
- AM->getResult<FunctionAnalysisManagerCGSCCProxy>(*DeadSCC, *LCG)
- .getManager();
-
- FAM.clear(*DeadFn, DeadFn->getName());
- AM->clear(*DeadSCC, DeadSCC->getName());
- LCG->removeDeadFunction(*DeadFn);
-
- // Mark the relevant parts of the call graph as invalid so we don't
- // visit them.
- UR->InvalidatedSCCs.insert(DeadSCC);
- UR->InvalidatedRefSCCs.insert(&DeadRC);
- }
-
- // The function is now really dead and detached from everything.
- DeadFn->eraseFromParent();
- }
- }
-
- bool Changed = !DeadFunctions.empty();
- DeadFunctionsInComdats.clear();
- DeadFunctions.clear();
- return Changed;
-}
-
-void CallGraphUpdater::reanalyzeFunction(Function &Fn) {
- if (CG) {
- CallGraphNode *OldCGN = CG->getOrInsertFunction(&Fn);
- OldCGN->removeAllCalledFunctions();
- CG->populateCallGraphNode(OldCGN);
- } else if (LCG) {
- LazyCallGraph::Node &N = LCG->get(Fn);
- LazyCallGraph::SCC *C = LCG->lookupSCC(N);
- updateCGAndAnalysisManagerForCGSCCPass(*LCG, *C, N, *AM, *UR, *FAM);
- }
-}
-
+//===- CallGraphUpdater.cpp - A (lazy) call graph update helper -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides interfaces used to manipulate a call graph, regardless
+/// of whether it is an "old style" CallGraph or a "new style" LazyCallGraph.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+bool CallGraphUpdater::finalize() {
+ if (!DeadFunctionsInComdats.empty()) {
+ filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(),
+ DeadFunctionsInComdats);
+ DeadFunctions.append(DeadFunctionsInComdats.begin(),
+ DeadFunctionsInComdats.end());
+ }
+
+ if (CG) {
+ // First remove all references, e.g., outgoing via called functions. This is
+ // necessary as we can delete functions that have circular references.
+ for (Function *DeadFn : DeadFunctions) {
+ DeadFn->removeDeadConstantUsers();
+ CallGraphNode *DeadCGN = (*CG)[DeadFn];
+ DeadCGN->removeAllCalledFunctions();
+ CG->getExternalCallingNode()->removeAnyCallEdgeTo(DeadCGN);
+ DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
+ }
+
+ // Then remove the node and function from the module.
+ for (Function *DeadFn : DeadFunctions) {
+ CallGraphNode *DeadCGN = CG->getOrInsertFunction(DeadFn);
+ assert(DeadCGN->getNumReferences() == 0 &&
+ "References should have been handled by now");
+ delete CG->removeFunctionFromModule(DeadCGN);
+ }
+ } else {
+ // This is the code path for the new lazy call graph and for the case where
+ // no call graph was provided.
+ for (Function *DeadFn : DeadFunctions) {
+ DeadFn->removeDeadConstantUsers();
+ DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
+
+ if (LCG && !ReplacedFunctions.count(DeadFn)) {
+ // Taken mostly from the inliner:
+ LazyCallGraph::Node &N = LCG->get(*DeadFn);
+ auto *DeadSCC = LCG->lookupSCC(N);
+ assert(DeadSCC && DeadSCC->size() == 1 &&
+ &DeadSCC->begin()->getFunction() == DeadFn);
+ auto &DeadRC = DeadSCC->getOuterRefSCC();
+
+ FunctionAnalysisManager &FAM =
+ AM->getResult<FunctionAnalysisManagerCGSCCProxy>(*DeadSCC, *LCG)
+ .getManager();
+
+ FAM.clear(*DeadFn, DeadFn->getName());
+ AM->clear(*DeadSCC, DeadSCC->getName());
+ LCG->removeDeadFunction(*DeadFn);
+
+ // Mark the relevant parts of the call graph as invalid so we don't
+ // visit them.
+ UR->InvalidatedSCCs.insert(DeadSCC);
+ UR->InvalidatedRefSCCs.insert(&DeadRC);
+ }
+
+ // The function is now really dead and detached from everything.
+ DeadFn->eraseFromParent();
+ }
+ }
+
+ bool Changed = !DeadFunctions.empty();
+ DeadFunctionsInComdats.clear();
+ DeadFunctions.clear();
+ return Changed;
+}
+
+void CallGraphUpdater::reanalyzeFunction(Function &Fn) {
+ if (CG) {
+ CallGraphNode *OldCGN = CG->getOrInsertFunction(&Fn);
+ OldCGN->removeAllCalledFunctions();
+ CG->populateCallGraphNode(OldCGN);
+ } else if (LCG) {
+ LazyCallGraph::Node &N = LCG->get(Fn);
+ LazyCallGraph::SCC *C = LCG->lookupSCC(N);
+ updateCGAndAnalysisManagerForCGSCCPass(*LCG, *C, N, *AM, *UR, *FAM);
+ }
+}
+
void CallGraphUpdater::registerOutlinedFunction(Function &OriginalFn,
Function &NewFn) {
- if (CG)
- CG->addToCallGraph(&NewFn);
- else if (LCG)
+ if (CG)
+ CG->addToCallGraph(&NewFn);
+ else if (LCG)
LCG->addSplitFunction(OriginalFn, NewFn);
-}
-
-void CallGraphUpdater::removeFunction(Function &DeadFn) {
- DeadFn.deleteBody();
- DeadFn.setLinkage(GlobalValue::ExternalLinkage);
- if (DeadFn.hasComdat())
- DeadFunctionsInComdats.push_back(&DeadFn);
- else
- DeadFunctions.push_back(&DeadFn);
-
- // For the old call graph we remove the function from the SCC right away.
- if (CG && !ReplacedFunctions.count(&DeadFn)) {
- CallGraphNode *DeadCGN = (*CG)[&DeadFn];
- DeadCGN->removeAllCalledFunctions();
- CGSCC->DeleteNode(DeadCGN);
- }
-}
-
-void CallGraphUpdater::replaceFunctionWith(Function &OldFn, Function &NewFn) {
- OldFn.removeDeadConstantUsers();
- ReplacedFunctions.insert(&OldFn);
- if (CG) {
- // Update the call graph for the newly promoted function.
- CallGraphNode *OldCGN = (*CG)[&OldFn];
- CallGraphNode *NewCGN = CG->getOrInsertFunction(&NewFn);
- NewCGN->stealCalledFunctionsFrom(OldCGN);
- CG->ReplaceExternalCallEdge(OldCGN, NewCGN);
-
- // And update the SCC we're iterating as well.
- CGSCC->ReplaceNode(OldCGN, NewCGN);
- } else if (LCG) {
- // Directly substitute the functions in the call graph.
- LazyCallGraph::Node &OldLCGN = LCG->get(OldFn);
- SCC->getOuterRefSCC().replaceNodeFunction(OldLCGN, NewFn);
- }
- removeFunction(OldFn);
-}
-
-bool CallGraphUpdater::replaceCallSite(CallBase &OldCS, CallBase &NewCS) {
- // This is only necessary in the (old) CG.
- if (!CG)
- return true;
-
- Function *Caller = OldCS.getCaller();
- CallGraphNode *NewCalleeNode =
- CG->getOrInsertFunction(NewCS.getCalledFunction());
- CallGraphNode *CallerNode = (*CG)[Caller];
- if (llvm::none_of(*CallerNode, [&OldCS](const CallGraphNode::CallRecord &CR) {
- return CR.first && *CR.first == &OldCS;
- }))
- return false;
- CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
- return true;
-}
-
-void CallGraphUpdater::removeCallSite(CallBase &CS) {
- // This is only necessary in the (old) CG.
- if (!CG)
- return;
-
- Function *Caller = CS.getCaller();
- CallGraphNode *CallerNode = (*CG)[Caller];
- CallerNode->removeCallEdgeFor(CS);
-}
+}
+
+void CallGraphUpdater::removeFunction(Function &DeadFn) {
+ DeadFn.deleteBody();
+ DeadFn.setLinkage(GlobalValue::ExternalLinkage);
+ if (DeadFn.hasComdat())
+ DeadFunctionsInComdats.push_back(&DeadFn);
+ else
+ DeadFunctions.push_back(&DeadFn);
+
+ // For the old call graph we remove the function from the SCC right away.
+ if (CG && !ReplacedFunctions.count(&DeadFn)) {
+ CallGraphNode *DeadCGN = (*CG)[&DeadFn];
+ DeadCGN->removeAllCalledFunctions();
+ CGSCC->DeleteNode(DeadCGN);
+ }
+}
+
+void CallGraphUpdater::replaceFunctionWith(Function &OldFn, Function &NewFn) {
+ OldFn.removeDeadConstantUsers();
+ ReplacedFunctions.insert(&OldFn);
+ if (CG) {
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *OldCGN = (*CG)[&OldFn];
+ CallGraphNode *NewCGN = CG->getOrInsertFunction(&NewFn);
+ NewCGN->stealCalledFunctionsFrom(OldCGN);
+ CG->ReplaceExternalCallEdge(OldCGN, NewCGN);
+
+ // And update the SCC we're iterating as well.
+ CGSCC->ReplaceNode(OldCGN, NewCGN);
+ } else if (LCG) {
+ // Directly substitute the functions in the call graph.
+ LazyCallGraph::Node &OldLCGN = LCG->get(OldFn);
+ SCC->getOuterRefSCC().replaceNodeFunction(OldLCGN, NewFn);
+ }
+ removeFunction(OldFn);
+}
+
+bool CallGraphUpdater::replaceCallSite(CallBase &OldCS, CallBase &NewCS) {
+ // This is only necessary in the (old) CG.
+ if (!CG)
+ return true;
+
+ Function *Caller = OldCS.getCaller();
+ CallGraphNode *NewCalleeNode =
+ CG->getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = (*CG)[Caller];
+ if (llvm::none_of(*CallerNode, [&OldCS](const CallGraphNode::CallRecord &CR) {
+ return CR.first && *CR.first == &OldCS;
+ }))
+ return false;
+ CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
+ return true;
+}
+
+void CallGraphUpdater::removeCallSite(CallBase &CS) {
+ // This is only necessary in the (old) CG.
+ if (!CG)
+ return;
+
+ Function *Caller = CS.getCaller();
+ CallGraphNode *CallerNode = (*CG)[Caller];
+ CallerNode->removeCallEdgeFor(CS);
+}
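For context, a minimal sketch of the intended usage from a new-pass-manager CGSCC pass; the run() signature and the initialize() overload follow the public header, while the pass name and the dead-function test are hypothetical stand-ins for real pass logic:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"

struct HypotheticalCleanupPass
    : llvm::PassInfoMixin<HypotheticalCleanupPass> {
  llvm::PreservedAnalyses run(llvm::LazyCallGraph::SCC &C,
                              llvm::CGSCCAnalysisManager &AM,
                              llvm::LazyCallGraph &CG,
                              llvm::CGSCCUpdateResult &UR) {
    llvm::CallGraphUpdater CGUpdater;
    CGUpdater.initialize(CG, C, AM, UR); // bind to the lazy call graph
    bool Changed = false;
    for (llvm::LazyCallGraph::Node &N : C) {
      llvm::Function &F = N.getFunction();
      if (F.isDefTriviallyDead()) {      // hypothetical pass-specific check
        CGUpdater.removeFunction(F);     // queued; erased in finalize()
        Changed = true;
      }
    }
    Changed |= CGUpdater.finalize();     // erase queued functions, update CG
    return Changed ? llvm::PreservedAnalyses::none()
                   : llvm::PreservedAnalyses::all();
  }
};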
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp
index 9478516f98..bf08bf2747 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -1,595 +1,595 @@
-//===- CallPromotionUtils.cpp - Utilities for call promotion ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements utilities useful for promoting indirect call sites to
-// direct call sites.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "call-promotion-utils"
-
-/// Fix-up phi nodes in an invoke instruction's normal destination.
-///
-/// After versioning an invoke instruction, values coming from the original
-/// block will now be coming from the "merge" block. For example, in the code
-/// below:
-///
-/// then_bb:
-/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// else_bb:
-/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// merge_bb:
-/// %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ]
-/// br %normal_dst
-///
-/// normal_dst:
-/// %t3 = phi i32 [ %x, %orig_bb ], ...
-///
-/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in
-/// "normal_dst" must be fixed to refer to "merge_bb":
-///
-/// normal_dst:
-/// %t3 = phi i32 [ %x, %merge_bb ], ...
-///
-static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
- BasicBlock *MergeBlock) {
- for (PHINode &Phi : Invoke->getNormalDest()->phis()) {
- int Idx = Phi.getBasicBlockIndex(OrigBlock);
- if (Idx == -1)
- continue;
- Phi.setIncomingBlock(Idx, MergeBlock);
- }
-}
-
-/// Fix-up phi nodes in an invoke instruction's unwind destination.
-///
-/// After versioning an invoke instruction, values coming from the original
-/// block will now be coming from either the "then" block or the "else" block.
-/// For example, in the code below:
-///
-/// then_bb:
-/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// else_bb:
-/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// unwind_dst:
-/// %t3 = phi i32 [ %x, %orig_bb ], ...
-///
-/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in
-/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb":
-///
-/// unwind_dst:
-/// %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ...
-///
-static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
- BasicBlock *ThenBlock,
- BasicBlock *ElseBlock) {
- for (PHINode &Phi : Invoke->getUnwindDest()->phis()) {
- int Idx = Phi.getBasicBlockIndex(OrigBlock);
- if (Idx == -1)
- continue;
- auto *V = Phi.getIncomingValue(Idx);
- Phi.setIncomingBlock(Idx, ThenBlock);
- Phi.addIncoming(V, ElseBlock);
- }
-}
-
-/// Create a phi node for the returned value of a call or invoke instruction.
-///
-/// After versioning a call or invoke instruction that returns a value, we have
-/// to merge the value of the original and new instructions. We do this by
-/// creating a phi node and replacing uses of the original instruction with this
-/// phi node.
-///
-/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is
-/// defined in "then_bb", we create the following phi node:
-///
-/// ; Uses of the original instruction are replaced by uses of the phi node.
-/// %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ],
-///
-static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst,
- BasicBlock *MergeBlock, IRBuilder<> &Builder) {
-
- if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty())
- return;
-
- Builder.SetInsertPoint(&MergeBlock->front());
- PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0);
+//===- CallPromotionUtils.cpp - Utilities for call promotion ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities useful for promoting indirect call sites to
+// direct call sites.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "call-promotion-utils"
+
+/// Fix-up phi nodes in an invoke instruction's normal destination.
+///
+/// After versioning an invoke instruction, values coming from the original
+/// block will now be coming from the "merge" block. For example, in the code
+/// below:
+///
+/// then_bb:
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// merge_bb:
+/// %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ]
+/// br %normal_dst
+///
+/// normal_dst:
+/// %t3 = phi i32 [ %x, %orig_bb ], ...
+///
+/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in
+/// "normal_dst" must be fixed to refer to "merge_bb":
+///
+/// normal_dst:
+/// %t3 = phi i32 [ %x, %merge_bb ], ...
+///
+static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
+ BasicBlock *MergeBlock) {
+ for (PHINode &Phi : Invoke->getNormalDest()->phis()) {
+ int Idx = Phi.getBasicBlockIndex(OrigBlock);
+ if (Idx == -1)
+ continue;
+ Phi.setIncomingBlock(Idx, MergeBlock);
+ }
+}
+
+/// Fix-up phi nodes in an invoke instruction's unwind destination.
+///
+/// After versioning an invoke instruction, values coming from the original
+/// block will now be coming from either the "then" block or the "else" block.
+/// For example, in the code below:
+///
+/// then_bb:
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// unwind_dst:
+/// %t3 = phi i32 [ %x, %orig_bb ], ...
+///
+/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in
+/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb":
+///
+/// unwind_dst:
+/// %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ...
+///
+static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
+ BasicBlock *ThenBlock,
+ BasicBlock *ElseBlock) {
+ for (PHINode &Phi : Invoke->getUnwindDest()->phis()) {
+ int Idx = Phi.getBasicBlockIndex(OrigBlock);
+ if (Idx == -1)
+ continue;
+ auto *V = Phi.getIncomingValue(Idx);
+ Phi.setIncomingBlock(Idx, ThenBlock);
+ Phi.addIncoming(V, ElseBlock);
+ }
+}
+
+/// Create a phi node for the returned value of a call or invoke instruction.
+///
+/// After versioning a call or invoke instruction that returns a value, we have
+/// to merge the value of the original and new instructions. We do this by
+/// creating a phi node and replacing uses of the original instruction with this
+/// phi node.
+///
+/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is
+/// defined in "then_bb", we create the following phi node:
+///
+/// ; Uses of the original instruction are replaced by uses of the phi node.
+/// %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ],
+///
+static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst,
+ BasicBlock *MergeBlock, IRBuilder<> &Builder) {
+
+ if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty())
+ return;
+
+ Builder.SetInsertPoint(&MergeBlock->front());
+ PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0);
SmallVector<User *, 16> UsersToUpdate(OrigInst->users());
- for (User *U : UsersToUpdate)
- U->replaceUsesOfWith(OrigInst, Phi);
- Phi->addIncoming(OrigInst, OrigInst->getParent());
- Phi->addIncoming(NewInst, NewInst->getParent());
-}
-
-/// Cast a call or invoke instruction to the given type.
-///
-/// When promoting a call site, the return type of the call site might not match
-/// that of the callee. If this is the case, we have to cast the returned value
-/// to the correct type. The location of the cast depends on if we have a call
-/// or invoke instruction.
-///
-/// For example, if the call instruction below requires a bitcast after
-/// promotion:
-///
-/// orig_bb:
-/// %t0 = call i32 @func()
-/// ...
-///
-/// The bitcast is placed after the call instruction:
-///
-/// orig_bb:
-/// ; Uses of the original return value are replaced by uses of the bitcast.
-/// %t0 = call i32 @func()
-/// %t1 = bitcast i32 %t0 to ...
-/// ...
-///
-/// A similar transformation is performed for invoke instructions. However,
-/// since invokes are terminating, a new block is created for the bitcast. For
-/// example, if the invoke instruction below requires a bitcast after promotion:
-///
-/// orig_bb:
-/// %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst
-///
-/// The edge between the original block and the invoke's normal destination is
-/// split, and the bitcast is placed there:
-///
-/// orig_bb:
-/// %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst
-///
-/// split_bb:
-/// ; Uses of the original return value are replaced by uses of the bitcast.
-/// %t1 = bitcast i32 %t0 to ...
-/// br label %normal_dst
-///
-static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
-
- // Save the users of the calling instruction. These uses will be changed to
- // use the bitcast after we create it.
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(OrigInst, Phi);
+ Phi->addIncoming(OrigInst, OrigInst->getParent());
+ Phi->addIncoming(NewInst, NewInst->getParent());
+}
+
+/// Cast a call or invoke instruction to the given type.
+///
+/// When promoting a call site, the return type of the call site might not match
+/// that of the callee. If this is the case, we have to cast the returned value
+/// to the correct type. The location of the cast depends on if we have a call
+/// or invoke instruction.
+///
+/// For example, if the call instruction below requires a bitcast after
+/// promotion:
+///
+/// orig_bb:
+/// %t0 = call i32 @func()
+/// ...
+///
+/// The bitcast is placed after the call instruction:
+///
+/// orig_bb:
+/// ; Uses of the original return value are replaced by uses of the bitcast.
+/// %t0 = call i32 @func()
+/// %t1 = bitcast i32 %t0 to ...
+/// ...
+///
+/// A similar transformation is performed for invoke instructions. However,
+/// since invokes are terminating, a new block is created for the bitcast. For
+/// example, if the invoke instruction below requires a bitcast after promotion:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst
+///
+/// The edge between the original block and the invoke's normal destination is
+/// split, and the bitcast is placed there:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst
+///
+/// split_bb:
+/// ; Uses of the original return value are replaced by uses of the bitcast.
+/// %t1 = bitcast i32 %t0 to ...
+/// br label %normal_dst
+///
+static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
+
+ // Save the users of the calling instruction. These uses will be changed to
+ // use the bitcast after we create it.
SmallVector<User *, 16> UsersToUpdate(CB.users());
-
- // Determine an appropriate location to create the bitcast for the return
- // value. The location depends on if we have a call or invoke instruction.
- Instruction *InsertBefore = nullptr;
- if (auto *Invoke = dyn_cast<InvokeInst>(&CB))
- InsertBefore =
- &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
- else
- InsertBefore = &*std::next(CB.getIterator());
-
- // Bitcast the return value to the correct type.
- auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore);
- if (RetBitCast)
- *RetBitCast = Cast;
-
- // Replace all the original uses of the calling instruction with the bitcast.
- for (User *U : UsersToUpdate)
- U->replaceUsesOfWith(&CB, Cast);
-}
-
-/// Predicate and clone the given call site.
-///
-/// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition compares the call site's called value to the given
-/// callee. The original call site is moved into the "else" block, and a clone
-/// of the call site is placed in the "then" block. The cloned instruction is
-/// returned.
-///
-/// For example, the call instruction below:
-///
-/// orig_bb:
-/// %t0 = call i32 %ptr()
-/// ...
-///
-/// Is replaced by the following:
-///
-/// orig_bb:
-/// %cond = icmp eq i32 ()* %ptr, @func
-/// br i1 %cond, %then_bb, %else_bb
-///
-/// then_bb:
-/// ; The clone of the original call instruction is placed in the "then"
-/// ; block. It is not yet promoted.
-/// %t1 = call i32 %ptr()
-/// br merge_bb
-///
-/// else_bb:
-/// ; The original call instruction is moved to the "else" block.
-/// %t0 = call i32 %ptr()
-/// br merge_bb
-///
-/// merge_bb:
-/// ; Uses of the original call instruction are replaced by uses of the phi
-/// ; node.
-/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
-/// ...
-///
-/// A similar transformation is performed for invoke instructions. However,
-/// since invokes are terminating, more work is required. For example, the
-/// invoke instruction below:
-///
-/// orig_bb:
-/// %t0 = invoke %ptr() to label %normal_dst unwind label %unwind_dst
-///
-/// Is replaced by the following:
-///
-/// orig_bb:
-/// %cond = icmp eq i32 ()* %ptr, @func
-/// br i1 %cond, %then_bb, %else_bb
-///
-/// then_bb:
-/// ; The clone of the original invoke instruction is placed in the "then"
-/// ; block, and its normal destination is set to the "merge" block. It is
-/// ; not yet promoted.
-/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// else_bb:
-/// ; The original invoke instruction is moved into the "else" block, and
-/// ; its normal destination is set to the "merge" block.
-/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// merge_bb:
-/// ; Uses of the original invoke instruction are replaced by uses of the
-/// ; phi node, and the merge block branches to the normal destination.
-/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
-/// br %normal_dst
-///
-/// An indirect musttail call is processed slightly differently in that:
-/// 1. No merge block is needed for the original and the cloned call site, since
-/// either one ends the flow. No phi node is needed either.
-/// 2. The return statement following the original call site is duplicated too
-/// and placed immediately after the cloned call site per the IR convention.
-///
-/// For example, the musttail call instruction below:
-///
-/// orig_bb:
-/// %t0 = musttail call i32 %ptr()
-/// ...
-///
-/// Is replaced by the following:
-///
-/// cond_bb:
-/// %cond = icmp eq i32 ()* %ptr, @func
-/// br i1 %cond, %then_bb, %orig_bb
-///
-/// then_bb:
-/// ; The clone of the original call instruction is placed in the "then"
-/// ; block. It is not yet promoted.
-/// %t1 = musttail call i32 %ptr()
-/// ret %t1
-///
-/// orig_bb:
-/// ; The original call instruction stays in its original block.
-/// %t0 = musttail call i32 %ptr()
-/// ret %t0
-static CallBase &versionCallSite(CallBase &CB, Value *Callee,
- MDNode *BranchWeights) {
-
- IRBuilder<> Builder(&CB);
- CallBase *OrigInst = &CB;
- BasicBlock *OrigBlock = OrigInst->getParent();
-
- // Create the compare. The called value and callee must have the same type to
- // be compared.
- if (CB.getCalledOperand()->getType() != Callee->getType())
- Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
- auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
-
- if (OrigInst->isMustTailCall()) {
- // Create an if-then structure. The original instruction stays in its block,
- // and a clone of the original instruction is placed in the "then" block.
- Instruction *ThenTerm =
- SplitBlockAndInsertIfThen(Cond, &CB, false, BranchWeights);
- BasicBlock *ThenBlock = ThenTerm->getParent();
- ThenBlock->setName("if.true.direct_targ");
- CallBase *NewInst = cast<CallBase>(OrigInst->clone());
- NewInst->insertBefore(ThenTerm);
-
- // Place a clone of the optional bitcast after the new call site.
- Value *NewRetVal = NewInst;
- auto Next = OrigInst->getNextNode();
- if (auto *BitCast = dyn_cast_or_null<BitCastInst>(Next)) {
- assert(BitCast->getOperand(0) == OrigInst &&
- "bitcast following musttail call must use the call");
- auto NewBitCast = BitCast->clone();
- NewBitCast->replaceUsesOfWith(OrigInst, NewInst);
- NewBitCast->insertBefore(ThenTerm);
- NewRetVal = NewBitCast;
- Next = BitCast->getNextNode();
- }
-
- // Place a clone of the return instruction after the new call site.
- ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next);
- assert(Ret && "musttail call must precede a ret with an optional bitcast");
- auto NewRet = Ret->clone();
- if (Ret->getReturnValue())
- NewRet->replaceUsesOfWith(Ret->getReturnValue(), NewRetVal);
- NewRet->insertBefore(ThenTerm);
-
- // A return instruction is terminating, so we don't need the terminator
- // instruction just created.
- ThenTerm->eraseFromParent();
-
- return *NewInst;
- }
-
- // Create an if-then-else structure. The original instruction is moved into
- // the "else" block, and a clone of the original instruction is placed in the
- // "then" block.
- Instruction *ThenTerm = nullptr;
- Instruction *ElseTerm = nullptr;
- SplitBlockAndInsertIfThenElse(Cond, &CB, &ThenTerm, &ElseTerm, BranchWeights);
- BasicBlock *ThenBlock = ThenTerm->getParent();
- BasicBlock *ElseBlock = ElseTerm->getParent();
- BasicBlock *MergeBlock = OrigInst->getParent();
-
- ThenBlock->setName("if.true.direct_targ");
- ElseBlock->setName("if.false.orig_indirect");
- MergeBlock->setName("if.end.icp");
-
- CallBase *NewInst = cast<CallBase>(OrigInst->clone());
- OrigInst->moveBefore(ElseTerm);
- NewInst->insertBefore(ThenTerm);
-
- // If the original call site is an invoke instruction, we have extra work to
- // do since invoke instructions are terminating. We have to fix-up phi nodes
- // in the invoke's normal and unwind destinations.
- if (auto *OrigInvoke = dyn_cast<InvokeInst>(OrigInst)) {
- auto *NewInvoke = cast<InvokeInst>(NewInst);
-
- // Invoke instructions are terminating, so we don't need the terminator
- // instructions that were just created.
- ThenTerm->eraseFromParent();
- ElseTerm->eraseFromParent();
-
- // Branch from the "merge" block to the original normal destination.
- Builder.SetInsertPoint(MergeBlock);
- Builder.CreateBr(OrigInvoke->getNormalDest());
-
- // Fix-up phi nodes in the original invoke's normal and unwind destinations.
- fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock);
- fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock);
-
- // Now set the normal destinations of the invoke instructions to be the
- // "merge" block.
- OrigInvoke->setNormalDest(MergeBlock);
- NewInvoke->setNormalDest(MergeBlock);
- }
-
- // Create a phi node for the returned value of the call site.
- createRetPHINode(OrigInst, NewInst, MergeBlock, Builder);
-
- return *NewInst;
-}
-
-bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee,
- const char **FailureReason) {
- assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
-
- auto &DL = Callee->getParent()->getDataLayout();
-
- // Check the return type. The callee's return value type must be bitcast
- // compatible with the call site's type.
- Type *CallRetTy = CB.getType();
- Type *FuncRetTy = Callee->getReturnType();
- if (CallRetTy != FuncRetTy)
- if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
- if (FailureReason)
- *FailureReason = "Return type mismatch";
- return false;
- }
-
- // The number of formal arguments of the callee.
- unsigned NumParams = Callee->getFunctionType()->getNumParams();
-
- // The number of actual arguments in the call.
- unsigned NumArgs = CB.arg_size();
-
- // Check the number of arguments. The callee and call site must agree on the
- // number of arguments.
- if (NumArgs != NumParams && !Callee->isVarArg()) {
- if (FailureReason)
- *FailureReason = "The number of arguments mismatch";
- return false;
- }
-
- // Check the argument types. The callee's formal argument types must be
- // bitcast compatible with the corresponding actual argument types of the call
- // site.
- unsigned I = 0;
- for (; I < NumParams; ++I) {
- Type *FormalTy = Callee->getFunctionType()->getFunctionParamType(I);
- Type *ActualTy = CB.getArgOperand(I)->getType();
- if (FormalTy == ActualTy)
- continue;
- if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
- if (FailureReason)
- *FailureReason = "Argument type mismatch";
- return false;
- }
- }
- for (; I < NumArgs; I++) {
+
+  // Determine an appropriate location to create the bitcast for the return
+  // value. The location depends on whether we have a call or an invoke
+  // instruction.
+ Instruction *InsertBefore = nullptr;
+ if (auto *Invoke = dyn_cast<InvokeInst>(&CB))
+ InsertBefore =
+ &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
+ else
+ InsertBefore = &*std::next(CB.getIterator());
+
+ // Bitcast the return value to the correct type.
+ auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore);
+ if (RetBitCast)
+ *RetBitCast = Cast;
+
+ // Replace all the original uses of the calling instruction with the bitcast.
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(&CB, Cast);
+}
+
+/// Predicate and clone the given call site.
+///
+/// This function creates an if-then-else structure at the location of the call
+/// site. The "if" condition compares the call site's called value to the given
+/// callee. The original call site is moved into the "else" block, and a clone
+/// of the call site is placed in the "then" block. The cloned instruction is
+/// returned.
+///
+/// For example, the call instruction below:
+///
+/// orig_bb:
+/// %t0 = call i32 %ptr()
+/// ...
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %else_bb
+///
+/// then_bb:
+/// ; The clone of the original call instruction is placed in the "then"
+/// ; block. It is not yet promoted.
+/// %t1 = call i32 %ptr()
+/// br merge_bb
+///
+/// else_bb:
+/// ; The original call instruction is moved to the "else" block.
+/// %t0 = call i32 %ptr()
+/// br merge_bb
+///
+/// merge_bb:
+/// ; Uses of the original call instruction are replaced by uses of the phi
+/// ; node.
+/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
+/// ...
+///
+/// A similar transformation is performed for invoke instructions. However,
+/// since invokes are terminating, more work is required. For example, the
+/// invoke instruction below:
+///
+/// orig_bb:
+/// %t0 = invoke %ptr() to label %normal_dst unwind label %unwind_dst
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %else_bb
+///
+/// then_bb:
+/// ; The clone of the original invoke instruction is placed in the "then"
+/// ; block, and its normal destination is set to the "merge" block. It is
+/// ; not yet promoted.
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// ; The original invoke instruction is moved into the "else" block, and
+/// ; its normal destination is set to the "merge" block.
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// merge_bb:
+/// ; Uses of the original invoke instruction are replaced by uses of the
+/// ; phi node, and the merge block branches to the normal destination.
+/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
+/// br %normal_dst
+///
+/// An indirect musttail call is processed slightly differently in that:
+/// 1. No merge block is needed for the original and the cloned call sites,
+///    since either one ends the flow. No phi node is needed either.
+/// 2. The return statement following the original call site is duplicated too
+/// and placed immediately after the cloned call site per the IR convention.
+///
+/// For example, the musttail call instruction below:
+///
+/// orig_bb:
+/// %t0 = musttail call i32 %ptr()
+/// ...
+///
+/// Is replaced by the following:
+///
+/// cond_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %orig_bb
+///
+/// then_bb:
+/// ; The clone of the original call instruction is placed in the "then"
+/// ; block. It is not yet promoted.
+/// %t1 = musttail call i32 %ptr()
+/// ret %t1
+///
+/// orig_bb:
+/// ; The original call instruction stays in its original block.
+/// %t0 = musttail call i32 %ptr()
+/// ret %t0
+static CallBase &versionCallSite(CallBase &CB, Value *Callee,
+ MDNode *BranchWeights) {
+
+ IRBuilder<> Builder(&CB);
+ CallBase *OrigInst = &CB;
+ BasicBlock *OrigBlock = OrigInst->getParent();
+
+ // Create the compare. The called value and callee must have the same type to
+ // be compared.
+ if (CB.getCalledOperand()->getType() != Callee->getType())
+ Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
+ auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
+
+ if (OrigInst->isMustTailCall()) {
+ // Create an if-then structure. The original instruction stays in its block,
+ // and a clone of the original instruction is placed in the "then" block.
+ Instruction *ThenTerm =
+ SplitBlockAndInsertIfThen(Cond, &CB, false, BranchWeights);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ ThenBlock->setName("if.true.direct_targ");
+ CallBase *NewInst = cast<CallBase>(OrigInst->clone());
+ NewInst->insertBefore(ThenTerm);
+
+ // Place a clone of the optional bitcast after the new call site.
+ Value *NewRetVal = NewInst;
+ auto Next = OrigInst->getNextNode();
+ if (auto *BitCast = dyn_cast_or_null<BitCastInst>(Next)) {
+ assert(BitCast->getOperand(0) == OrigInst &&
+ "bitcast following musttail call must use the call");
+ auto NewBitCast = BitCast->clone();
+ NewBitCast->replaceUsesOfWith(OrigInst, NewInst);
+ NewBitCast->insertBefore(ThenTerm);
+ NewRetVal = NewBitCast;
+ Next = BitCast->getNextNode();
+ }
+
+ // Place a clone of the return instruction after the new call site.
+ ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next);
+ assert(Ret && "musttail call must precede a ret with an optional bitcast");
+ auto NewRet = Ret->clone();
+ if (Ret->getReturnValue())
+ NewRet->replaceUsesOfWith(Ret->getReturnValue(), NewRetVal);
+ NewRet->insertBefore(ThenTerm);
+
+    // A return instruction is terminating, so we don't need the terminator
+ // instruction just created.
+ ThenTerm->eraseFromParent();
+
+ return *NewInst;
+ }
+
+ // Create an if-then-else structure. The original instruction is moved into
+ // the "else" block, and a clone of the original instruction is placed in the
+ // "then" block.
+ Instruction *ThenTerm = nullptr;
+ Instruction *ElseTerm = nullptr;
+ SplitBlockAndInsertIfThenElse(Cond, &CB, &ThenTerm, &ElseTerm, BranchWeights);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ BasicBlock *ElseBlock = ElseTerm->getParent();
+ BasicBlock *MergeBlock = OrigInst->getParent();
+
+ ThenBlock->setName("if.true.direct_targ");
+ ElseBlock->setName("if.false.orig_indirect");
+ MergeBlock->setName("if.end.icp");
+
+ CallBase *NewInst = cast<CallBase>(OrigInst->clone());
+ OrigInst->moveBefore(ElseTerm);
+ NewInst->insertBefore(ThenTerm);
+
+ // If the original call site is an invoke instruction, we have extra work to
+ // do since invoke instructions are terminating. We have to fix-up phi nodes
+ // in the invoke's normal and unwind destinations.
+ if (auto *OrigInvoke = dyn_cast<InvokeInst>(OrigInst)) {
+ auto *NewInvoke = cast<InvokeInst>(NewInst);
+
+ // Invoke instructions are terminating, so we don't need the terminator
+ // instructions that were just created.
+ ThenTerm->eraseFromParent();
+ ElseTerm->eraseFromParent();
+
+ // Branch from the "merge" block to the original normal destination.
+ Builder.SetInsertPoint(MergeBlock);
+ Builder.CreateBr(OrigInvoke->getNormalDest());
+
+ // Fix-up phi nodes in the original invoke's normal and unwind destinations.
+ fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock);
+ fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock);
+
+ // Now set the normal destinations of the invoke instructions to be the
+ // "merge" block.
+ OrigInvoke->setNormalDest(MergeBlock);
+ NewInvoke->setNormalDest(MergeBlock);
+ }
+
+ // Create a phi node for the returned value of the call site.
+ createRetPHINode(OrigInst, NewInst, MergeBlock, Builder);
+
+ return *NewInst;
+}
+
+bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee,
+ const char **FailureReason) {
+ assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
+
+ auto &DL = Callee->getParent()->getDataLayout();
+
+ // Check the return type. The callee's return value type must be bitcast
+ // compatible with the call site's type.
+ Type *CallRetTy = CB.getType();
+ Type *FuncRetTy = Callee->getReturnType();
+ if (CallRetTy != FuncRetTy)
+ if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
+ if (FailureReason)
+ *FailureReason = "Return type mismatch";
+ return false;
+ }
+
+ // The number of formal arguments of the callee.
+ unsigned NumParams = Callee->getFunctionType()->getNumParams();
+
+ // The number of actual arguments in the call.
+ unsigned NumArgs = CB.arg_size();
+
+ // Check the number of arguments. The callee and call site must agree on the
+ // number of arguments.
+ if (NumArgs != NumParams && !Callee->isVarArg()) {
+ if (FailureReason)
+ *FailureReason = "The number of arguments mismatch";
+ return false;
+ }
+
+ // Check the argument types. The callee's formal argument types must be
+ // bitcast compatible with the corresponding actual argument types of the call
+ // site.
+ unsigned I = 0;
+ for (; I < NumParams; ++I) {
+ Type *FormalTy = Callee->getFunctionType()->getFunctionParamType(I);
+ Type *ActualTy = CB.getArgOperand(I)->getType();
+ if (FormalTy == ActualTy)
+ continue;
+ if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
+ if (FailureReason)
+ *FailureReason = "Argument type mismatch";
+ return false;
+ }
+ }
+ for (; I < NumArgs; I++) {
// Vararg functions can have more arguments than parameters.
- assert(Callee->isVarArg());
- if (CB.paramHasAttr(I, Attribute::StructRet)) {
+ assert(Callee->isVarArg());
+ if (CB.paramHasAttr(I, Attribute::StructRet)) {
if (FailureReason)
*FailureReason = "SRet arg to vararg function";
- return false;
- }
- }
-
- return true;
-}
-
-CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
- CastInst **RetBitCast) {
- assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
-
- // Set the called function of the call site to be the given callee (but don't
- // change the type).
- CB.setCalledOperand(Callee);
-
- // Since the call site will no longer be direct, we must clear metadata that
- // is only appropriate for indirect calls. This includes !prof and !callees
- // metadata.
- CB.setMetadata(LLVMContext::MD_prof, nullptr);
- CB.setMetadata(LLVMContext::MD_callees, nullptr);
-
- // If the function type of the call site matches that of the callee, no
- // additional work is required.
- if (CB.getFunctionType() == Callee->getFunctionType())
- return CB;
-
- // Save the return types of the call site and callee.
- Type *CallSiteRetTy = CB.getType();
- Type *CalleeRetTy = Callee->getReturnType();
-
- // Change the function type of the call site the match that of the callee.
- CB.mutateFunctionType(Callee->getFunctionType());
-
- // Inspect the arguments of the call site. If an argument's type doesn't
- // match the corresponding formal argument's type in the callee, bitcast it
- // to the correct type.
- auto CalleeType = Callee->getFunctionType();
- auto CalleeParamNum = CalleeType->getNumParams();
-
- LLVMContext &Ctx = Callee->getContext();
- const AttributeList &CallerPAL = CB.getAttributes();
- // The new list of argument attributes.
- SmallVector<AttributeSet, 4> NewArgAttrs;
- bool AttributeChanged = false;
-
- for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
- auto *Arg = CB.getArgOperand(ArgNo);
- Type *FormalTy = CalleeType->getParamType(ArgNo);
- Type *ActualTy = Arg->getType();
- if (FormalTy != ActualTy) {
- auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB);
- CB.setArgOperand(ArgNo, Cast);
-
- // Remove any incompatible attributes for the argument.
- AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo));
- ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
-
- // If byval is used, this must be a pointer type, and the byval type must
- // match the element type. Update it if present.
- if (ArgAttrs.getByValType()) {
- Type *NewTy = Callee->getParamByValType(ArgNo);
- ArgAttrs.addByValAttr(
- NewTy ? NewTy : cast<PointerType>(FormalTy)->getElementType());
- }
-
- NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs));
- AttributeChanged = true;
- } else
- NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo));
- }
-
- // If the return type of the call site doesn't match that of the callee, cast
- // the returned value to the appropriate type.
- // Remove any incompatible return value attribute.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
- if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
- createRetBitCast(CB, CallSiteRetTy, RetBitCast);
- RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
- AttributeChanged = true;
- }
-
- // Set the new callsite attribute.
- if (AttributeChanged)
- CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
- AttributeSet::get(Ctx, RAttrs),
- NewArgAttrs));
-
- return CB;
-}
-
-CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
- MDNode *BranchWeights) {
-
- // Version the indirect call site. If the called value is equal to the given
- // callee, 'NewInst' will be executed, otherwise the original call site will
- // be executed.
- CallBase &NewInst = versionCallSite(CB, Callee, BranchWeights);
-
- // Promote 'NewInst' so that it directly calls the desired function.
- return promoteCall(NewInst, Callee);
-}
-
-bool llvm::tryPromoteCall(CallBase &CB) {
- assert(!CB.getCalledFunction());
- Module *M = CB.getCaller()->getParent();
- const DataLayout &DL = M->getDataLayout();
- Value *Callee = CB.getCalledOperand();
-
- LoadInst *VTableEntryLoad = dyn_cast<LoadInst>(Callee);
- if (!VTableEntryLoad)
- return false; // Not a vtable entry load.
- Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand();
- APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0);
- Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets(
- DL, VTableOffset, /* AllowNonInbounds */ true);
- LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr);
- if (!VTablePtrLoad)
- return false; // Not a vtable load.
- Value *Object = VTablePtrLoad->getPointerOperand();
- APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0);
- Value *ObjectBase = Object->stripAndAccumulateConstantOffsets(
- DL, ObjectOffset, /* AllowNonInbounds */ true);
- if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0))
- // Not an Alloca or the offset isn't zero.
- return false;
-
- // Look for the vtable pointer store into the object by the ctor.
- BasicBlock::iterator BBI(VTablePtrLoad);
- Value *VTablePtr = FindAvailableLoadedValue(
- VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr);
- if (!VTablePtr)
- return false; // No vtable found.
- APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0);
- Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets(
- DL, VTableOffsetGVBase, /* AllowNonInbounds */ true);
- GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase);
- if (!(GV && GV->isConstant() && GV->hasDefinitiveInitializer()))
- // Not in the form of a global constant variable with an initializer.
- return false;
-
- Constant *VTableGVInitializer = GV->getInitializer();
- APInt VTableGVOffset = VTableOffsetGVBase + VTableOffset;
- if (!(VTableGVOffset.getActiveBits() <= 64))
- return false; // Out of range.
- Constant *Ptr = getPointerAtOffset(VTableGVInitializer,
- VTableGVOffset.getZExtValue(),
- *M);
- if (!Ptr)
- return false; // No constant (function) pointer found.
- Function *DirectCallee = dyn_cast<Function>(Ptr->stripPointerCasts());
- if (!DirectCallee)
- return false; // No function pointer found.
-
- if (!isLegalToPromote(CB, DirectCallee))
- return false;
-
- // Success.
- promoteCall(CB, DirectCallee);
- return true;
-}
-
-#undef DEBUG_TYPE
+ return false;
+ }
+ }
+
+ return true;
+}
+
+CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
+ CastInst **RetBitCast) {
+ assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
+
+ // Set the called function of the call site to be the given callee (but don't
+ // change the type).
+ CB.setCalledOperand(Callee);
+
+  // Since the call site will no longer be indirect, we must clear metadata
+  // that is only appropriate for indirect calls. This includes !prof and
+  // !callees metadata.
+ CB.setMetadata(LLVMContext::MD_prof, nullptr);
+ CB.setMetadata(LLVMContext::MD_callees, nullptr);
+
+ // If the function type of the call site matches that of the callee, no
+ // additional work is required.
+ if (CB.getFunctionType() == Callee->getFunctionType())
+ return CB;
+
+ // Save the return types of the call site and callee.
+ Type *CallSiteRetTy = CB.getType();
+ Type *CalleeRetTy = Callee->getReturnType();
+
+  // Change the function type of the call site to match that of the callee.
+ CB.mutateFunctionType(Callee->getFunctionType());
+
+ // Inspect the arguments of the call site. If an argument's type doesn't
+ // match the corresponding formal argument's type in the callee, bitcast it
+ // to the correct type.
+ auto CalleeType = Callee->getFunctionType();
+ auto CalleeParamNum = CalleeType->getNumParams();
+
+ LLVMContext &Ctx = Callee->getContext();
+ const AttributeList &CallerPAL = CB.getAttributes();
+ // The new list of argument attributes.
+ SmallVector<AttributeSet, 4> NewArgAttrs;
+ bool AttributeChanged = false;
+
+ for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
+ auto *Arg = CB.getArgOperand(ArgNo);
+ Type *FormalTy = CalleeType->getParamType(ArgNo);
+ Type *ActualTy = Arg->getType();
+ if (FormalTy != ActualTy) {
+ auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB);
+ CB.setArgOperand(ArgNo, Cast);
+
+ // Remove any incompatible attributes for the argument.
+ AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo));
+ ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
+
+ // If byval is used, this must be a pointer type, and the byval type must
+ // match the element type. Update it if present.
+ if (ArgAttrs.getByValType()) {
+ Type *NewTy = Callee->getParamByValType(ArgNo);
+ ArgAttrs.addByValAttr(
+ NewTy ? NewTy : cast<PointerType>(FormalTy)->getElementType());
+ }
+
+ NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs));
+ AttributeChanged = true;
+ } else
+ NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo));
+ }
+
+ // If the return type of the call site doesn't match that of the callee, cast
+ // the returned value to the appropriate type.
+ // Remove any incompatible return value attribute.
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
+ createRetBitCast(CB, CallSiteRetTy, RetBitCast);
+ RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
+ AttributeChanged = true;
+ }
+
+ // Set the new callsite attribute.
+ if (AttributeChanged)
+ CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
+ AttributeSet::get(Ctx, RAttrs),
+ NewArgAttrs));
+
+ return CB;
+}
+
+CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
+ MDNode *BranchWeights) {
+
+ // Version the indirect call site. If the called value is equal to the given
+ // callee, 'NewInst' will be executed, otherwise the original call site will
+ // be executed.
+ CallBase &NewInst = versionCallSite(CB, Callee, BranchWeights);
+
+ // Promote 'NewInst' so that it directly calls the desired function.
+ return promoteCall(NewInst, Callee);
+}
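// A minimal usage sketch, not part of this patch: a caller that already has a
// likely target would typically pair isLegalToPromote() with
// promoteCallWithIfThenElse() as below. The helper name 'promoteIfLegal' and
// the 1000:1 branch weights are assumptions for the example.
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"

static bool promoteIfLegal(llvm::CallBase &CB, llvm::Function *LikelyCallee) {
  const char *Reason = nullptr;
  if (!llvm::isLegalToPromote(CB, LikelyCallee, &Reason))
    return false; // Reason names the problem, e.g. "Argument type mismatch".
  llvm::MDBuilder MDB(CB.getContext());
  llvm::MDNode *Weights =
      MDB.createBranchWeights(/*TrueWeight=*/1000, /*FalseWeight=*/1);
  // Version the call site and promote the hot arm to a direct call.
  llvm::promoteCallWithIfThenElse(CB, LikelyCallee, Weights);
  return true;
}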
+
+bool llvm::tryPromoteCall(CallBase &CB) {
+ assert(!CB.getCalledFunction());
+ Module *M = CB.getCaller()->getParent();
+ const DataLayout &DL = M->getDataLayout();
+ Value *Callee = CB.getCalledOperand();
+
+ LoadInst *VTableEntryLoad = dyn_cast<LoadInst>(Callee);
+ if (!VTableEntryLoad)
+ return false; // Not a vtable entry load.
+ Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand();
+ APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0);
+ Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets(
+ DL, VTableOffset, /* AllowNonInbounds */ true);
+ LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr);
+ if (!VTablePtrLoad)
+ return false; // Not a vtable load.
+ Value *Object = VTablePtrLoad->getPointerOperand();
+ APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0);
+ Value *ObjectBase = Object->stripAndAccumulateConstantOffsets(
+ DL, ObjectOffset, /* AllowNonInbounds */ true);
+ if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0))
+ // Not an Alloca or the offset isn't zero.
+ return false;
+
+ // Look for the vtable pointer store into the object by the ctor.
+ BasicBlock::iterator BBI(VTablePtrLoad);
+ Value *VTablePtr = FindAvailableLoadedValue(
+ VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr);
+ if (!VTablePtr)
+ return false; // No vtable found.
+ APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0);
+ Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets(
+ DL, VTableOffsetGVBase, /* AllowNonInbounds */ true);
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase);
+ if (!(GV && GV->isConstant() && GV->hasDefinitiveInitializer()))
+ // Not in the form of a global constant variable with an initializer.
+ return false;
+
+ Constant *VTableGVInitializer = GV->getInitializer();
+ APInt VTableGVOffset = VTableOffsetGVBase + VTableOffset;
+ if (!(VTableGVOffset.getActiveBits() <= 64))
+ return false; // Out of range.
+ Constant *Ptr = getPointerAtOffset(VTableGVInitializer,
+ VTableGVOffset.getZExtValue(),
+ *M);
+ if (!Ptr)
+ return false; // No constant (function) pointer found.
+ Function *DirectCallee = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!DirectCallee)
+ return false; // No function pointer found.
+
+ if (!isLegalToPromote(CB, DirectCallee))
+ return false;
+
+ // Success.
+ promoteCall(CB, DirectCallee);
+ return true;
+}
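// A minimal usage sketch, not part of this patch ('devirtualizeLocalCalls' is
// an assumed helper name): offer every indirect call site in a function to
// tryPromoteCall(), which pattern-matches the vtable loads handled above.
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"

static bool devirtualizeLocalCalls(llvm::Function &F) {
  bool Changed = false;
  for (llvm::Instruction &I : llvm::instructions(F))
    if (auto *CB = llvm::dyn_cast<llvm::CallBase>(&I))
      if (!CB->getCalledFunction()) // Only indirect call sites qualify.
        Changed |= llvm::tryPromoteCall(*CB);
  return Changed;
}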
+
+#undef DEBUG_TYPE
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp
index 295fc67108..6b01c0c71d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp
@@ -1,105 +1,105 @@
-//===- CanonicalizeAliases.cpp - ThinLTO Support: Canonicalize Aliases ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Currently this file implements partial alias canonicalization, to
-// flatten chains of aliases (also done by GlobalOpt, but not on for
-// O0 compiles). E.g.
-// @a = alias i8, i8 *@b
-// @b = alias i8, i8 *@g
-//
-// will be converted to:
-// @a = alias i8, i8 *@g <-- @a is now an alias to base object @g
-// @b = alias i8, i8 *@g
-//
-// Eventually this file will implement full alias canonicalation, so that
-// all aliasees are private anonymous values. E.g.
-// @a = alias i8, i8 *@g
-// @g = global i8 0
-//
-// will be converted to:
-// @0 = private global
-// @a = alias i8, i8* @0
-// @g = alias i8, i8* @0
-//
-// This simplifies optimization and ThinLTO linking of the original symbols.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace {
-
-static Constant *canonicalizeAlias(Constant *C, bool &Changed) {
- if (auto *GA = dyn_cast<GlobalAlias>(C)) {
- auto *NewAliasee = canonicalizeAlias(GA->getAliasee(), Changed);
- if (NewAliasee != GA->getAliasee()) {
- GA->setAliasee(NewAliasee);
- Changed = true;
- }
- return NewAliasee;
- }
-
- auto *CE = dyn_cast<ConstantExpr>(C);
- if (!CE)
- return C;
-
- std::vector<Constant *> Ops;
- for (Use &U : CE->operands())
- Ops.push_back(canonicalizeAlias(cast<Constant>(U), Changed));
- return CE->getWithOperands(Ops);
-}
-
-/// Convert aliases to canonical form.
-static bool canonicalizeAliases(Module &M) {
- bool Changed = false;
- for (auto &GA : M.aliases())
- canonicalizeAlias(&GA, Changed);
- return Changed;
-}
-
-// Legacy pass that canonicalizes aliases.
-class CanonicalizeAliasesLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Canonicalize Aliases"; }
-
- explicit CanonicalizeAliasesLegacyPass() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override { return canonicalizeAliases(M); }
-};
-char CanonicalizeAliasesLegacyPass::ID = 0;
-
-} // anonymous namespace
-
-PreservedAnalyses CanonicalizeAliasesPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (!canonicalizeAliases(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-INITIALIZE_PASS_BEGIN(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
- "Canonicalize aliases", false, false)
-INITIALIZE_PASS_END(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
- "Canonicalize aliases", false, false)
-
-namespace llvm {
-ModulePass *createCanonicalizeAliasesPass() {
- return new CanonicalizeAliasesLegacyPass();
-}
-} // namespace llvm
+//===- CanonicalizeAliases.cpp - ThinLTO Support: Canonicalize Aliases ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Currently this file implements partial alias canonicalization, to
+// flatten chains of aliases (also done by GlobalOpt, but not enabled for
+// O0 compiles). E.g.
+// @a = alias i8, i8 *@b
+// @b = alias i8, i8 *@g
+//
+// will be converted to:
+// @a = alias i8, i8 *@g <-- @a is now an alias to base object @g
+// @b = alias i8, i8 *@g
+//
+// Eventually this file will implement full alias canonicalization, so that
+// all aliasees are private anonymous values. E.g.
+// @a = alias i8, i8 *@g
+// @g = global i8 0
+//
+// will be converted to:
+// @0 = private global
+// @a = alias i8, i8* @0
+// @g = alias i8, i8* @0
+//
+// This simplifies optimization and ThinLTO linking of the original symbols.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+static Constant *canonicalizeAlias(Constant *C, bool &Changed) {
+ if (auto *GA = dyn_cast<GlobalAlias>(C)) {
+ auto *NewAliasee = canonicalizeAlias(GA->getAliasee(), Changed);
+ if (NewAliasee != GA->getAliasee()) {
+ GA->setAliasee(NewAliasee);
+ Changed = true;
+ }
+ return NewAliasee;
+ }
+
+ auto *CE = dyn_cast<ConstantExpr>(C);
+ if (!CE)
+ return C;
+
+ std::vector<Constant *> Ops;
+ for (Use &U : CE->operands())
+ Ops.push_back(canonicalizeAlias(cast<Constant>(U), Changed));
+ return CE->getWithOperands(Ops);
+}
+
+/// Convert aliases to canonical form.
+static bool canonicalizeAliases(Module &M) {
+ bool Changed = false;
+ for (auto &GA : M.aliases())
+ canonicalizeAlias(&GA, Changed);
+ return Changed;
+}
+
+// Legacy pass that canonicalizes aliases.
+class CanonicalizeAliasesLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Canonicalize Aliases"; }
+
+ explicit CanonicalizeAliasesLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override { return canonicalizeAliases(M); }
+};
+char CanonicalizeAliasesLegacyPass::ID = 0;
+
+} // anonymous namespace
+
+PreservedAnalyses CanonicalizeAliasesPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!canonicalizeAliases(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
+ "Canonicalize aliases", false, false)
+INITIALIZE_PASS_END(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
+ "Canonicalize aliases", false, false)
+
+namespace llvm {
+ModulePass *createCanonicalizeAliasesPass() {
+ return new CanonicalizeAliasesLegacyPass();
+}
+} // namespace llvm
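// A minimal driver sketch, not part of this file: running the new-PM pass over
// a module on its own (the function name 'flattenAliases' is an assumption).
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"

static void flattenAliases(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM); // Also registers pass instrumentation.
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::CanonicalizeAliasesPass());
  MPM.run(M, MAM);
}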
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index 611efd8243..1f649fe6c7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -1,250 +1,250 @@
-//==- CanonicalizeFreezeInLoops - Canonicalize freezes in a loop-*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass canonicalizes freeze instructions in a loop by pushing them out to
-// the preheader.
-//
-// loop:
-// i = phi init, i.next
-// i.next = add nsw i, 1
-// i.next.fr = freeze i.next // push this out of this loop
-// use(i.next.fr)
-// br i1 (i.next <= N), loop, exit
-// =>
-// init.fr = freeze init
-// loop:
-// i = phi init.fr, i.next
-// i.next = add i, 1 // nsw is dropped here
-// use(i.next)
-// br i1 (i.next <= N), loop, exit
-//
-// Removing freezes from these chains help scalar evolution successfully analyze
-// expressions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/IVUsers.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "canon-freeze"
-
-namespace {
-
-class CanonicalizeFreezeInLoops : public LoopPass {
-public:
- static char ID;
-
- CanonicalizeFreezeInLoops();
-
-private:
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-};
-
-class CanonicalizeFreezeInLoopsImpl {
- Loop *L;
- ScalarEvolution &SE;
- DominatorTree &DT;
-
- struct FrozenIndPHIInfo {
- // A freeze instruction that uses an induction phi
- FreezeInst *FI = nullptr;
- // The induction phi, step instruction, the operand idx of StepInst which is
- // a step value
- PHINode *PHI;
- BinaryOperator *StepInst;
- unsigned StepValIdx = 0;
-
- FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst)
- : PHI(PHI), StepInst(StepInst) {}
- };
-
- // Can freeze instruction be pushed into operands of I?
- // In order to do this, I should not create a poison after I's flags are
- // stripped.
- bool canHandleInst(const Instruction *I) {
- auto Opc = I->getOpcode();
- // If add/sub/mul, drop nsw/nuw flags.
- return Opc == Instruction::Add || Opc == Instruction::Sub ||
- Opc == Instruction::Mul;
- }
-
- void InsertFreezeAndForgetFromSCEV(Use &U);
-
-public:
- CanonicalizeFreezeInLoopsImpl(Loop *L, ScalarEvolution &SE, DominatorTree &DT)
- : L(L), SE(SE), DT(DT) {}
- bool run();
-};
-
-} // anonymous namespace
-
-// Given U = (value, user), replace value with freeze(value), and let
-// SCEV forget user. The inserted freeze is placed in the preheader.
-void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
- auto *PH = L->getLoopPreheader();
-
- auto *UserI = cast<Instruction>(U.getUser());
- auto *ValueToFr = U.get();
- assert(L->contains(UserI->getParent()) &&
- "Should not process an instruction that isn't inside the loop");
+//==- CanonicalizeFreezeInLoops - Canonicalize freezes in a loop-*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass canonicalizes freeze instructions in a loop by pushing them out to
+// the preheader.
+//
+// loop:
+// i = phi init, i.next
+// i.next = add nsw i, 1
+// i.next.fr = freeze i.next // push this out of this loop
+// use(i.next.fr)
+// br i1 (i.next <= N), loop, exit
+// =>
+// init.fr = freeze init
+// loop:
+// i = phi init.fr, i.next
+// i.next = add i, 1 // nsw is dropped here
+// use(i.next)
+// br i1 (i.next <= N), loop, exit
+//
+// Removing freezes from these chains helps scalar evolution successfully
+// analyze expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "canon-freeze"
+
+namespace {
+
+class CanonicalizeFreezeInLoops : public LoopPass {
+public:
+ static char ID;
+
+ CanonicalizeFreezeInLoops();
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+class CanonicalizeFreezeInLoopsImpl {
+ Loop *L;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+
+ struct FrozenIndPHIInfo {
+ // A freeze instruction that uses an induction phi
+ FreezeInst *FI = nullptr;
+    // The induction phi, the step instruction, and the operand index of
+    // StepInst that holds the step value.
+ PHINode *PHI;
+ BinaryOperator *StepInst;
+ unsigned StepValIdx = 0;
+
+ FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst)
+ : PHI(PHI), StepInst(StepInst) {}
+ };
+
+  // Can a freeze instruction be pushed into the operands of I?
+  // This is only legal if I cannot create poison once its poison-generating
+  // flags are stripped.
+ bool canHandleInst(const Instruction *I) {
+ auto Opc = I->getOpcode();
+ // If add/sub/mul, drop nsw/nuw flags.
+ return Opc == Instruction::Add || Opc == Instruction::Sub ||
+ Opc == Instruction::Mul;
+ }
+
+ void InsertFreezeAndForgetFromSCEV(Use &U);
+
+public:
+ CanonicalizeFreezeInLoopsImpl(Loop *L, ScalarEvolution &SE, DominatorTree &DT)
+ : L(L), SE(SE), DT(DT) {}
+ bool run();
+};
+
+} // anonymous namespace
+
+// Given U = (value, user), replace value with freeze(value), and let
+// SCEV forget user. The inserted freeze is placed in the preheader.
+void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
+ auto *PH = L->getLoopPreheader();
+
+ auto *UserI = cast<Instruction>(U.getUser());
+ auto *ValueToFr = U.get();
+ assert(L->contains(UserI->getParent()) &&
+ "Should not process an instruction that isn't inside the loop");
if (isGuaranteedNotToBeUndefOrPoison(ValueToFr, nullptr, UserI, &DT))
- return;
-
- LLVM_DEBUG(dbgs() << "canonfr: inserting freeze:\n");
- LLVM_DEBUG(dbgs() << "\tUser: " << *U.getUser() << "\n");
- LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n");
-
- U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen",
- PH->getTerminator()));
-
- SE.forgetValue(UserI);
-}
-
-bool CanonicalizeFreezeInLoopsImpl::run() {
- // The loop should be in LoopSimplify form.
- if (!L->isLoopSimplifyForm())
- return false;
-
- SmallVector<FrozenIndPHIInfo, 4> Candidates;
-
- for (auto &PHI : L->getHeader()->phis()) {
- InductionDescriptor ID;
- if (!InductionDescriptor::isInductionPHI(&PHI, L, &SE, ID))
- continue;
-
- LLVM_DEBUG(dbgs() << "canonfr: PHI: " << PHI << "\n");
- FrozenIndPHIInfo Info(&PHI, ID.getInductionBinOp());
- if (!Info.StepInst || !canHandleInst(Info.StepInst)) {
- // The stepping instruction has unknown form.
- // Ignore this PHI.
- continue;
- }
-
- Info.StepValIdx = Info.StepInst->getOperand(0) == &PHI;
- Value *StepV = Info.StepInst->getOperand(Info.StepValIdx);
- if (auto *StepI = dyn_cast<Instruction>(StepV)) {
- if (L->contains(StepI->getParent())) {
- // The step value is inside the loop. Freezing step value will introduce
- // another freeze into the loop, so skip this PHI.
- continue;
- }
- }
-
- auto Visit = [&](User *U) {
- if (auto *FI = dyn_cast<FreezeInst>(U)) {
- LLVM_DEBUG(dbgs() << "canonfr: found: " << *FI << "\n");
- Info.FI = FI;
- Candidates.push_back(Info);
- }
- };
- for_each(PHI.users(), Visit);
- for_each(Info.StepInst->users(), Visit);
- }
-
- if (Candidates.empty())
- return false;
-
- SmallSet<PHINode *, 8> ProcessedPHIs;
- for (const auto &Info : Candidates) {
- PHINode *PHI = Info.PHI;
- if (!ProcessedPHIs.insert(Info.PHI).second)
- continue;
-
- BinaryOperator *StepI = Info.StepInst;
- assert(StepI && "Step instruction should have been found");
-
- // Drop flags from the step instruction.
+ return;
+
+ LLVM_DEBUG(dbgs() << "canonfr: inserting freeze:\n");
+ LLVM_DEBUG(dbgs() << "\tUser: " << *U.getUser() << "\n");
+ LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n");
+
+ U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen",
+ PH->getTerminator()));
+
+ SE.forgetValue(UserI);
+}
+
+bool CanonicalizeFreezeInLoopsImpl::run() {
+ // The loop should be in LoopSimplify form.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ SmallVector<FrozenIndPHIInfo, 4> Candidates;
+
+ for (auto &PHI : L->getHeader()->phis()) {
+ InductionDescriptor ID;
+ if (!InductionDescriptor::isInductionPHI(&PHI, L, &SE, ID))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "canonfr: PHI: " << PHI << "\n");
+ FrozenIndPHIInfo Info(&PHI, ID.getInductionBinOp());
+ if (!Info.StepInst || !canHandleInst(Info.StepInst)) {
+      // The step instruction has an unknown form; ignore this PHI.
+ continue;
+ }
+
+ Info.StepValIdx = Info.StepInst->getOperand(0) == &PHI;
+ Value *StepV = Info.StepInst->getOperand(Info.StepValIdx);
+ if (auto *StepI = dyn_cast<Instruction>(StepV)) {
+ if (L->contains(StepI->getParent())) {
+ // The step value is inside the loop. Freezing step value will introduce
+ // another freeze into the loop, so skip this PHI.
+ continue;
+ }
+ }
+
+ auto Visit = [&](User *U) {
+ if (auto *FI = dyn_cast<FreezeInst>(U)) {
+ LLVM_DEBUG(dbgs() << "canonfr: found: " << *FI << "\n");
+ Info.FI = FI;
+ Candidates.push_back(Info);
+ }
+ };
+ for_each(PHI.users(), Visit);
+ for_each(Info.StepInst->users(), Visit);
+ }
+
+ if (Candidates.empty())
+ return false;
+
+ SmallSet<PHINode *, 8> ProcessedPHIs;
+ for (const auto &Info : Candidates) {
+ PHINode *PHI = Info.PHI;
+ if (!ProcessedPHIs.insert(Info.PHI).second)
+ continue;
+
+ BinaryOperator *StepI = Info.StepInst;
+ assert(StepI && "Step instruction should have been found");
+
+ // Drop flags from the step instruction.
if (!isGuaranteedNotToBeUndefOrPoison(StepI, nullptr, StepI, &DT)) {
- LLVM_DEBUG(dbgs() << "canonfr: drop flags: " << *StepI << "\n");
- StepI->dropPoisonGeneratingFlags();
- SE.forgetValue(StepI);
- }
-
- InsertFreezeAndForgetFromSCEV(StepI->getOperandUse(Info.StepValIdx));
-
- unsigned OperandIdx =
- PHI->getOperandNumForIncomingValue(PHI->getIncomingValue(0) == StepI);
- InsertFreezeAndForgetFromSCEV(PHI->getOperandUse(OperandIdx));
- }
-
- // Finally, remove the old freeze instructions.
- for (const auto &Item : Candidates) {
- auto *FI = Item.FI;
- LLVM_DEBUG(dbgs() << "canonfr: removing " << *FI << "\n");
- SE.forgetValue(FI);
- FI->replaceAllUsesWith(FI->getOperand(0));
- FI->eraseFromParent();
- }
-
- return true;
-}
-
-CanonicalizeFreezeInLoops::CanonicalizeFreezeInLoops() : LoopPass(ID) {
- initializeCanonicalizeFreezeInLoopsPass(*PassRegistry::getPassRegistry());
-}
-
-void CanonicalizeFreezeInLoops::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
-}
-
-bool CanonicalizeFreezeInLoops::runOnLoop(Loop *L, LPPassManager &) {
- if (skipLoop(L))
- return false;
-
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return CanonicalizeFreezeInLoopsImpl(L, SE, DT).run();
-}
-
-PreservedAnalyses
-CanonicalizeFreezeInLoopsPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- if (!CanonicalizeFreezeInLoopsImpl(&L, AR.SE, AR.DT).run())
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
-
-INITIALIZE_PASS_BEGIN(CanonicalizeFreezeInLoops, "canon-freeze",
- "Canonicalize Freeze Instructions in Loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_END(CanonicalizeFreezeInLoops, "canon-freeze",
- "Canonicalize Freeze Instructions in Loops", false, false)
-
-Pass *llvm::createCanonicalizeFreezeInLoopsPass() {
- return new CanonicalizeFreezeInLoops();
-}
-
-char CanonicalizeFreezeInLoops::ID = 0;
+ LLVM_DEBUG(dbgs() << "canonfr: drop flags: " << *StepI << "\n");
+ StepI->dropPoisonGeneratingFlags();
+ SE.forgetValue(StepI);
+ }
+
+ InsertFreezeAndForgetFromSCEV(StepI->getOperandUse(Info.StepValIdx));
+
+ unsigned OperandIdx =
+ PHI->getOperandNumForIncomingValue(PHI->getIncomingValue(0) == StepI);
+ InsertFreezeAndForgetFromSCEV(PHI->getOperandUse(OperandIdx));
+ }
+
+ // Finally, remove the old freeze instructions.
+ for (const auto &Item : Candidates) {
+ auto *FI = Item.FI;
+ LLVM_DEBUG(dbgs() << "canonfr: removing " << *FI << "\n");
+ SE.forgetValue(FI);
+ FI->replaceAllUsesWith(FI->getOperand(0));
+ FI->eraseFromParent();
+ }
+
+ return true;
+}
+
+CanonicalizeFreezeInLoops::CanonicalizeFreezeInLoops() : LoopPass(ID) {
+ initializeCanonicalizeFreezeInLoopsPass(*PassRegistry::getPassRegistry());
+}
+
+void CanonicalizeFreezeInLoops::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+bool CanonicalizeFreezeInLoops::runOnLoop(Loop *L, LPPassManager &) {
+ if (skipLoop(L))
+ return false;
+
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return CanonicalizeFreezeInLoopsImpl(L, SE, DT).run();
+}
+
+PreservedAnalyses
+CanonicalizeFreezeInLoopsPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ if (!CanonicalizeFreezeInLoopsImpl(&L, AR.SE, AR.DT).run())
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+INITIALIZE_PASS_BEGIN(CanonicalizeFreezeInLoops, "canon-freeze",
+ "Canonicalize Freeze Instructions in Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(CanonicalizeFreezeInLoops, "canon-freeze",
+ "Canonicalize Freeze Instructions in Loops", false, false)
+
+Pass *llvm::createCanonicalizeFreezeInLoopsPass() {
+ return new CanonicalizeFreezeInLoops();
+}
+
+char CanonicalizeFreezeInLoops::ID = 0;
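// A minimal driver sketch, not part of this file: the new-PM pass is a loop
// pass, so the easiest way to run it standalone is by name through a textual
// pipeline, assuming the usual "canon-freeze" registration. The helper name
// 'canonicalizeFreezes' is an assumption.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"

static void canonicalizeFreezes(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::ModulePassManager MPM;
  // "function(loop(...))" wraps the loop pass in the usual adaptors.
  llvm::cantFail(PB.parsePassPipeline(MPM, "function(loop(canon-freeze))"));
  MPM.run(M, MAM);
}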
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp
index 7f34784c6f..6ab061510a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp
@@ -1,198 +1,198 @@
-//===- CloneFunction.cpp - Clone a function into another function ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the CloneFunctionInto interface, which is used as the
-// low-level function cloner. This is used by the CloneFunction and function
-// inliner to do the dirty work of copying the body of a function around.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+//===- CloneFunction.cpp - Clone a function into another function ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneFunctionInto interface, which is used as the
+// low-level function cloner. This is used by the CloneFunction and function
+// inliner to do the dirty work of copying the body of a function around.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <map>
-using namespace llvm;
-
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+using namespace llvm;
+
#define DEBUG_TYPE "clone-function"
-/// See comments in Cloning.h.
-BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
- const Twine &NameSuffix, Function *F,
- ClonedCodeInfo *CodeInfo,
- DebugInfoFinder *DIFinder) {
- DenseMap<const MDNode *, MDNode *> Cache;
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
- if (BB->hasName())
- NewBB->setName(BB->getName() + NameSuffix);
-
- bool hasCalls = false, hasDynamicAllocas = false;
- Module *TheModule = F ? F->getParent() : nullptr;
-
- // Loop over all instructions, and copy them over.
- for (const Instruction &I : *BB) {
- if (DIFinder && TheModule)
- DIFinder->processInstruction(*TheModule, I);
-
- Instruction *NewInst = I.clone();
- if (I.hasName())
- NewInst->setName(I.getName() + NameSuffix);
- NewBB->getInstList().push_back(NewInst);
- VMap[&I] = NewInst; // Add instruction map to value.
-
- hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
- if (!AI->isStaticAlloca()) {
- hasDynamicAllocas = true;
- }
- }
- }
-
- if (CodeInfo) {
- CodeInfo->ContainsCalls |= hasCalls;
- CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
- }
- return NewBB;
-}
-
-// Clone OldFunc into NewFunc, transforming the old arguments into references to
-// VMap values.
-//
-void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
- ValueToValueMapTy &VMap,
- bool ModuleLevelChanges,
- SmallVectorImpl<ReturnInst*> &Returns,
- const char *NameSuffix, ClonedCodeInfo *CodeInfo,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- assert(NameSuffix && "NameSuffix cannot be null!");
-
-#ifndef NDEBUG
- for (const Argument &I : OldFunc->args())
- assert(VMap.count(&I) && "No mapping from source argument specified!");
-#endif
-
- // Copy all attributes other than those stored in the AttributeList. We need
- // to remap the parameter indices of the AttributeList.
- AttributeList NewAttrs = NewFunc->getAttributes();
- NewFunc->copyAttributesFrom(OldFunc);
- NewFunc->setAttributes(NewAttrs);
-
- // Fix up the personality function that got copied over.
- if (OldFunc->hasPersonalityFn())
- NewFunc->setPersonalityFn(
- MapValue(OldFunc->getPersonalityFn(), VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer));
-
- SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
- AttributeList OldAttrs = OldFunc->getAttributes();
-
- // Clone any argument attributes that are present in the VMap.
- for (const Argument &OldArg : OldFunc->args()) {
- if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
- NewArgAttrs[NewArg->getArgNo()] =
- OldAttrs.getParamAttributes(OldArg.getArgNo());
- }
- }
-
- NewFunc->setAttributes(
- AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
- OldAttrs.getRetAttributes(), NewArgAttrs));
-
- bool MustCloneSP =
- OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
- DISubprogram *SP = OldFunc->getSubprogram();
- if (SP) {
- assert(!MustCloneSP || ModuleLevelChanges);
- // Add mappings for some DebugInfo nodes that we don't want duplicated
- // even if they're distinct.
- auto &MD = VMap.MD();
- MD[SP->getUnit()].reset(SP->getUnit());
- MD[SP->getType()].reset(SP->getType());
- MD[SP->getFile()].reset(SP->getFile());
- // If we're not cloning into the same module, no need to clone the
- // subprogram
- if (!MustCloneSP)
- MD[SP].reset(SP);
- }
-
+/// See comments in Cloning.h.
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, Function *F,
+ ClonedCodeInfo *CodeInfo,
+ DebugInfoFinder *DIFinder) {
+ DenseMap<const MDNode *, MDNode *> Cache;
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
+ if (BB->hasName())
+ NewBB->setName(BB->getName() + NameSuffix);
+
+ bool hasCalls = false, hasDynamicAllocas = false;
+ Module *TheModule = F ? F->getParent() : nullptr;
+
+ // Loop over all instructions, and copy them over.
+ for (const Instruction &I : *BB) {
+ if (DIFinder && TheModule)
+ DIFinder->processInstruction(*TheModule, I);
+
+ Instruction *NewInst = I.clone();
+ if (I.hasName())
+ NewInst->setName(I.getName() + NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ VMap[&I] = NewInst; // Add instruction map to value.
+
+ hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ if (!AI->isStaticAlloca()) {
+ hasDynamicAllocas = true;
+ }
+ }
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ }
+ return NewBB;
+}
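// A minimal usage sketch, not part of this patch ('cloneAndRemap' is an
// assumed helper name): clone one block into its own function, then remap the
// clone's operands through VMap so intra-block references point at the cloned
// instructions. Values defined outside BB are left untouched thanks to
// RF_IgnoreMissingLocals; branch targets and successor phi nodes still need
// fixing before the clone is wired into the CFG.
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

static llvm::BasicBlock *cloneAndRemap(llvm::BasicBlock *BB) {
  llvm::ValueToValueMapTy VMap;
  llvm::BasicBlock *Copy =
      llvm::CloneBasicBlock(BB, VMap, ".clone", BB->getParent());
  for (llvm::Instruction &I : *Copy)
    llvm::RemapInstruction(&I, VMap,
                           llvm::RF_NoModuleLevelChanges |
                               llvm::RF_IgnoreMissingLocals);
  return Copy;
}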
+
+// Clone OldFunc into NewFunc, transforming the old arguments into references to
+// VMap values.
+//
+void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst*> &Returns,
+ const char *NameSuffix, ClonedCodeInfo *CodeInfo,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+#ifndef NDEBUG
+ for (const Argument &I : OldFunc->args())
+ assert(VMap.count(&I) && "No mapping from source argument specified!");
+#endif
+
+ // Copy all attributes other than those stored in the AttributeList. We need
+ // to remap the parameter indices of the AttributeList.
+ AttributeList NewAttrs = NewFunc->getAttributes();
+ NewFunc->copyAttributesFrom(OldFunc);
+ NewFunc->setAttributes(NewAttrs);
+
+ // Fix up the personality function that got copied over.
+ if (OldFunc->hasPersonalityFn())
+ NewFunc->setPersonalityFn(
+ MapValue(OldFunc->getPersonalityFn(), VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer));
+
+ SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
+ AttributeList OldAttrs = OldFunc->getAttributes();
+
+ // Clone any argument attributes that are present in the VMap.
+ for (const Argument &OldArg : OldFunc->args()) {
+ if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
+ NewArgAttrs[NewArg->getArgNo()] =
+ OldAttrs.getParamAttributes(OldArg.getArgNo());
+ }
+ }
+
+ NewFunc->setAttributes(
+ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
+ OldAttrs.getRetAttributes(), NewArgAttrs));
+
+ bool MustCloneSP =
+ OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
+ DISubprogram *SP = OldFunc->getSubprogram();
+ if (SP) {
+ assert(!MustCloneSP || ModuleLevelChanges);
+ // Add mappings for some DebugInfo nodes that we don't want duplicated
+ // even if they're distinct.
+ auto &MD = VMap.MD();
+ MD[SP->getUnit()].reset(SP->getUnit());
+ MD[SP->getType()].reset(SP->getType());
+ MD[SP->getFile()].reset(SP->getFile());
+ // If we're not cloning into the same module, no need to clone the
+ // subprogram
+ if (!MustCloneSP)
+ MD[SP].reset(SP);
+ }
+
// Everything else beyond this point deals with function instructions,
// so if we are dealing with a function declaration, we're done.
if (OldFunc->isDeclaration())
return;
-
- // When we remap instructions, we want to avoid duplicating inlined
- // DISubprograms, so record all subprograms we find as we duplicate
- // instructions and then freeze them in the MD map.
- // We also record information about dbg.value and dbg.declare to avoid
- // duplicating the types.
- DebugInfoFinder DIFinder;
-
- // Loop over all of the basic blocks in the function, cloning them as
- // appropriate. Note that we save BE this way in order to handle cloning of
- // recursive functions into themselves.
- for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
- BI != BE; ++BI) {
- const BasicBlock &BB = *BI;
-
- // Create a new basic block and copy instructions into it!
- BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
- ModuleLevelChanges ? &DIFinder : nullptr);
-
- // Add basic block mapping.
- VMap[&BB] = CBB;
-
- // It is only legal to clone a function if a block address within that
- // function is never referenced outside of the function. Given that, we
- // want to map block addresses from the old function to block addresses in
- // the clone. (This is different from the generic ValueMapper
- // implementation, which generates an invalid blockaddress when
- // cloning a function.)
- if (BB.hasAddressTaken()) {
- Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
- const_cast<BasicBlock*>(&BB));
- VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);
- }
-
- // Note return instructions for the caller.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
- Returns.push_back(RI);
- }
-
- for (DISubprogram *ISP : DIFinder.subprograms())
- if (ISP != SP)
- VMap.MD()[ISP].reset(ISP);
-
- for (DICompileUnit *CU : DIFinder.compile_units())
- VMap.MD()[CU].reset(CU);
-
- for (DIType *Type : DIFinder.types())
- VMap.MD()[Type].reset(Type);
-
+
+ // When we remap instructions, we want to avoid duplicating inlined
+ // DISubprograms, so record all subprograms we find as we duplicate
+ // instructions and then freeze them in the MD map.
+ // We also record information about dbg.value and dbg.declare to avoid
+ // duplicating the types.
+ DebugInfoFinder DIFinder;
+
+ // Loop over all of the basic blocks in the function, cloning them as
+ // appropriate. Note that we save BE this way in order to handle cloning of
+ // recursive functions into themselves.
+ for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+ BI != BE; ++BI) {
+ const BasicBlock &BB = *BI;
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
+ ModuleLevelChanges ? &DIFinder : nullptr);
+
+ // Add basic block mapping.
+ VMap[&BB] = CBB;
+
+ // It is only legal to clone a function if a block address within that
+ // function is never referenced outside of the function. Given that, we
+ // want to map block addresses from the old function to block addresses in
+ // the clone. (This is different from the generic ValueMapper
+ // implementation, which generates an invalid blockaddress when
+ // cloning a function.)
+ if (BB.hasAddressTaken()) {
+ Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+ const_cast<BasicBlock*>(&BB));
+ VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);
+ }
+
+ // Note return instructions for the caller.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+ Returns.push_back(RI);
+ }
+
+ for (DISubprogram *ISP : DIFinder.subprograms())
+ if (ISP != SP)
+ VMap.MD()[ISP].reset(ISP);
+
+ for (DICompileUnit *CU : DIFinder.compile_units())
+ VMap.MD()[CU].reset(CU);
+
+ for (DIType *Type : DIFinder.types())
+ VMap.MD()[Type].reset(Type);
+
// Duplicate the metadata that is attached to the cloned function.
// Subprograms/CUs/types that were already mapped to themselves won't be
// duplicated.
@@ -206,684 +206,684 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
TypeMapper, Materializer));
}
- // Loop over all of the instructions in the function, fixing up operand
- // references as we go. This uses VMap to do all the hard work.
- for (Function::iterator BB =
- cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(),
- BE = NewFunc->end();
- BB != BE; ++BB)
- // Loop over all instructions, fixing each one as we find it...
- for (Instruction &II : *BB)
- RemapInstruction(&II, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer);
-
- // Register all DICompileUnits of the old parent module in the new parent module
- auto* OldModule = OldFunc->getParent();
- auto* NewModule = NewFunc->getParent();
- if (OldModule && NewModule && OldModule != NewModule && DIFinder.compile_unit_count()) {
- auto* NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
- // Avoid multiple insertions of the same DICompileUnit to NMD.
- SmallPtrSet<const void*, 8> Visited;
- for (auto* Operand : NMD->operands())
- Visited.insert(Operand);
- for (auto* Unit : DIFinder.compile_units())
- // VMap.MD()[Unit] == Unit
- if (Visited.insert(Unit).second)
- NMD->addOperand(Unit);
- }
-}
-
-/// Return a copy of the specified function and add it to that function's
-/// module. Also, any references specified in the VMap are changed to refer to
-/// their mapped value instead of the original one. If any of the arguments to
-/// the function are in the VMap, the arguments are deleted from the resultant
-/// function. The VMap is updated to include mappings from all of the
-/// instructions and basic blocks in the function from their old to new values.
-///
-Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
- ClonedCodeInfo *CodeInfo) {
- std::vector<Type*> ArgTypes;
-
- // The user might be deleting arguments to the function by specifying them in
- // the VMap. If so, we need to not add the arguments to the arg ty vector
- //
- for (const Argument &I : F->args())
- if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet?
- ArgTypes.push_back(I.getType());
-
- // Create a new function type...
- FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
- ArgTypes, F->getFunctionType()->isVarArg());
-
- // Create the new function...
- Function *NewF = Function::Create(FTy, F->getLinkage(), F->getAddressSpace(),
- F->getName(), F->getParent());
-
- // Loop over the arguments, copying the names of the mapped arguments over...
- Function::arg_iterator DestI = NewF->arg_begin();
- for (const Argument & I : F->args())
- if (VMap.count(&I) == 0) { // Is this argument preserved?
- DestI->setName(I.getName()); // Copy the name over...
- VMap[&I] = &*DestI++; // Add mapping to VMap
- }
-
- SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "",
- CodeInfo);
-
- return NewF;
-}
-
-
-
-namespace {
- /// This is a private class used to implement CloneAndPruneFunctionInto.
- struct PruningFunctionCloner {
- Function *NewFunc;
- const Function *OldFunc;
- ValueToValueMapTy &VMap;
- bool ModuleLevelChanges;
- const char *NameSuffix;
- ClonedCodeInfo *CodeInfo;
-
- public:
- PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
- ValueToValueMapTy &valueMap, bool moduleLevelChanges,
- const char *nameSuffix, ClonedCodeInfo *codeInfo)
- : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap),
- ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix),
- CodeInfo(codeInfo) {}
-
- /// The specified block is found to be reachable, clone it and
- /// anything that it can reach.
- void CloneBlock(const BasicBlock *BB,
- BasicBlock::const_iterator StartingInst,
- std::vector<const BasicBlock*> &ToClone);
- };
-}
-
-/// The specified block is found to be reachable, clone it and
-/// anything that it can reach.
-void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
- BasicBlock::const_iterator StartingInst,
- std::vector<const BasicBlock*> &ToClone){
- WeakTrackingVH &BBEntry = VMap[BB];
-
- // Have we already cloned this block?
- if (BBEntry) return;
-
- // Nope, clone it now.
- BasicBlock *NewBB;
- BBEntry = NewBB = BasicBlock::Create(BB->getContext());
- if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
-
- // It is only legal to clone a function if a block address within that
- // function is never referenced outside of the function. Given that, we
- // want to map block addresses from the old function to block addresses in
- // the clone. (This is different from the generic ValueMapper
- // implementation, which generates an invalid blockaddress when
- // cloning a function.)
- //
- // Note that we don't need to fix the mapping for unreachable blocks;
- // the default mapping there is safe.
- if (BB->hasAddressTaken()) {
- Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
- const_cast<BasicBlock*>(BB));
- VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB);
- }
-
- bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
-
- // Loop over all instructions, and copy them over, DCE'ing as we go. This
- // loop doesn't include the terminator.
- for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end();
- II != IE; ++II) {
-
- Instruction *NewInst = II->clone();
-
- // Eagerly remap operands to the newly cloned instruction, except for PHI
- // nodes for which we defer processing until we update the CFG.
- if (!isa<PHINode>(NewInst)) {
- RemapInstruction(NewInst, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
-
- // If we can simplify this instruction to some other value, simply add
- // a mapping to that value rather than inserting a new instruction into
- // the basic block.
- if (Value *V =
- SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
- // On the off-chance that this simplifies to an instruction in the old
- // function, map it back into the new function.
- if (NewFunc != OldFunc)
- if (Value *MappedV = VMap.lookup(V))
- V = MappedV;
-
- if (!NewInst->mayHaveSideEffects()) {
- VMap[&*II] = V;
- NewInst->deleteValue();
- continue;
- }
- }
- }
-
- if (II->hasName())
- NewInst->setName(II->getName()+NameSuffix);
- VMap[&*II] = NewInst; // Add instruction map to value.
- NewBB->getInstList().push_back(NewInst);
- hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
-
- if (CodeInfo)
- if (auto *CB = dyn_cast<CallBase>(&*II))
- if (CB->hasOperandBundles())
- CodeInfo->OperandBundleCallSites.push_back(NewInst);
-
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
- if (isa<ConstantInt>(AI->getArraySize()))
- hasStaticAllocas = true;
- else
- hasDynamicAllocas = true;
- }
- }
-
- // Finally, clone over the terminator.
- const Instruction *OldTI = BB->getTerminator();
- bool TerminatorDone = false;
- if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
- if (BI->isConditional()) {
- // If the condition was a known constant in the callee...
- ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
- // Or is a known constant in the caller...
- if (!Cond) {
- Value *V = VMap.lookup(BI->getCondition());
- Cond = dyn_cast_or_null<ConstantInt>(V);
- }
-
- // Constant fold to uncond branch!
- if (Cond) {
- BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
- VMap[OldTI] = BranchInst::Create(Dest, NewBB);
- ToClone.push_back(Dest);
- TerminatorDone = true;
- }
- }
- } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
- // If switching on a value known constant in the caller.
- ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
- if (!Cond) { // Or known constant after constant prop in the callee...
- Value *V = VMap.lookup(SI->getCondition());
- Cond = dyn_cast_or_null<ConstantInt>(V);
- }
- if (Cond) { // Constant fold to uncond branch!
- SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
- BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
- VMap[OldTI] = BranchInst::Create(Dest, NewBB);
- ToClone.push_back(Dest);
- TerminatorDone = true;
- }
- }
-
- if (!TerminatorDone) {
- Instruction *NewInst = OldTI->clone();
- if (OldTI->hasName())
- NewInst->setName(OldTI->getName()+NameSuffix);
- NewBB->getInstList().push_back(NewInst);
- VMap[OldTI] = NewInst; // Add instruction map to value.
-
- if (CodeInfo)
- if (auto *CB = dyn_cast<CallBase>(OldTI))
- if (CB->hasOperandBundles())
- CodeInfo->OperandBundleCallSites.push_back(NewInst);
-
- // Recursively clone any reachable successor blocks.
+ // Loop over all of the instructions in the function, fixing up operand
+ // references as we go. This uses VMap to do all the hard work.
+ for (Function::iterator BB =
+ cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(),
+ BE = NewFunc->end();
+ BB != BE; ++BB)
+ // Loop over all instructions, fixing each one as we find it...
+ for (Instruction &II : *BB)
+ RemapInstruction(&II, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+
+ // Register all DICompileUnits of the old parent module in the new parent module
+ auto* OldModule = OldFunc->getParent();
+ auto* NewModule = NewFunc->getParent();
+ if (OldModule && NewModule && OldModule != NewModule && DIFinder.compile_unit_count()) {
+ auto* NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
+ // Avoid multiple insertions of the same DICompileUnit to NMD.
+ SmallPtrSet<const void*, 8> Visited;
+ for (auto* Operand : NMD->operands())
+ Visited.insert(Operand);
+ for (auto* Unit : DIFinder.compile_units())
+ // VMap.MD()[Unit] == Unit
+ if (Visited.insert(Unit).second)
+ NMD->addOperand(Unit);
+ }
+}
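
Usage sketch (illustrative, not taken from this commit): the calling convention CloneFunctionInto expects is that the destination function already exists and every source argument is pre-mapped in the VMap. The helper name copyFunctionVerbatim is an assumption; ModuleLevelChanges mirrors the F->getSubprogram() check used by CloneFunction below.

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical helper: make a same-module, same-signature copy of F by
// driving CloneFunctionInto directly.
static Function *copyFunctionVerbatim(Function &F, const Twine &NewName) {
  Function *NewF = Function::Create(F.getFunctionType(), F.getLinkage(),
                                    F.getAddressSpace(), NewName, F.getParent());
  ValueToValueMapTy VMap;
  auto NewArgIt = NewF->arg_begin();
  for (Argument &Arg : F.args()) {
    NewArgIt->setName(Arg.getName());
    VMap[&Arg] = &*NewArgIt++; // Required: map every source argument.
  }
  SmallVector<ReturnInst *, 8> Returns;
  CloneFunctionInto(NewF, &F, VMap,
                    /*ModuleLevelChanges=*/F.getSubprogram() != nullptr,
                    Returns, /*NameSuffix=*/"");
  return NewF;
}
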
+
+/// Return a copy of the specified function and add it to that function's
+/// module. Also, any references specified in the VMap are changed to refer to
+/// their mapped value instead of the original one. If any of the arguments to
+/// the function are in the VMap, the arguments are deleted from the resultant
+/// function. The VMap is updated to include mappings from all of the
+/// instructions and basic blocks in the function from their old to new values.
+///
+Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
+ ClonedCodeInfo *CodeInfo) {
+ std::vector<Type*> ArgTypes;
+
+ // The user might be deleting arguments to the function by specifying them in
+ // the VMap. If so, we need to not add the arguments to the arg ty vector
+ //
+ for (const Argument &I : F->args())
+ if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet?
+ ArgTypes.push_back(I.getType());
+
+ // Create a new function type...
+ FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
+ ArgTypes, F->getFunctionType()->isVarArg());
+
+ // Create the new function...
+ Function *NewF = Function::Create(FTy, F->getLinkage(), F->getAddressSpace(),
+ F->getName(), F->getParent());
+
+ // Loop over the arguments, copying the names of the mapped arguments over...
+ Function::arg_iterator DestI = NewF->arg_begin();
+ for (const Argument & I : F->args())
+ if (VMap.count(&I) == 0) { // Is this argument preserved?
+ DestI->setName(I.getName()); // Copy the name over...
+ VMap[&I] = &*DestI++; // Add mapping to VMap
+ }
+
+ SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
+ CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "",
+ CodeInfo);
+
+ return NewF;
+}
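
Usage sketch (illustrative, not part of this diff) of the argument-dropping behavior the doc comment above describes: pre-seeding the VMap with a constant for one parameter makes CloneFunction emit a copy without that parameter. The helper name specializeOnConstant is an assumption.

#include "llvm/IR/Constant.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: specialize F on a constant value for its ArgNo-th parameter.
// Because that argument already appears in VMap, the clone is created
// without it, and its uses are rewritten to C.
static Function *specializeOnConstant(Function &F, unsigned ArgNo, Constant *C) {
  ValueToValueMapTy VMap;
  VMap[F.getArg(ArgNo)] = C; // Pre-mapped arguments are dropped from the clone.
  return CloneFunction(&F, VMap);
}
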
+
+
+
+namespace {
+ /// This is a private class used to implement CloneAndPruneFunctionInto.
+ struct PruningFunctionCloner {
+ Function *NewFunc;
+ const Function *OldFunc;
+ ValueToValueMapTy &VMap;
+ bool ModuleLevelChanges;
+ const char *NameSuffix;
+ ClonedCodeInfo *CodeInfo;
+
+ public:
+ PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
+ ValueToValueMapTy &valueMap, bool moduleLevelChanges,
+ const char *nameSuffix, ClonedCodeInfo *codeInfo)
+ : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap),
+ ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix),
+ CodeInfo(codeInfo) {}
+
+ /// The specified block is found to be reachable, clone it and
+ /// anything that it can reach.
+ void CloneBlock(const BasicBlock *BB,
+ BasicBlock::const_iterator StartingInst,
+ std::vector<const BasicBlock*> &ToClone);
+ };
+}
+
+/// The specified block is found to be reachable, clone it and
+/// anything that it can reach.
+void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
+ BasicBlock::const_iterator StartingInst,
+ std::vector<const BasicBlock*> &ToClone){
+ WeakTrackingVH &BBEntry = VMap[BB];
+
+ // Have we already cloned this block?
+ if (BBEntry) return;
+
+ // Nope, clone it now.
+ BasicBlock *NewBB;
+ BBEntry = NewBB = BasicBlock::Create(BB->getContext());
+ if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+ // It is only legal to clone a function if a block address within that
+ // function is never referenced outside of the function. Given that, we
+ // want to map block addresses from the old function to block addresses in
+ // the clone. (This is different from the generic ValueMapper
+ // implementation, which generates an invalid blockaddress when
+ // cloning a function.)
+ //
+ // Note that we don't need to fix the mapping for unreachable blocks;
+ // the default mapping there is safe.
+ if (BB->hasAddressTaken()) {
+ Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+ const_cast<BasicBlock*>(BB));
+ VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB);
+ }
+
+ bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+
+ // Loop over all instructions, and copy them over, DCE'ing as we go. This
+ // loop doesn't include the terminator.
+ for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end();
+ II != IE; ++II) {
+
+ Instruction *NewInst = II->clone();
+
+ // Eagerly remap operands to the newly cloned instruction, except for PHI
+ // nodes for which we defer processing until we update the CFG.
+ if (!isa<PHINode>(NewInst)) {
+ RemapInstruction(NewInst, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+
+ // If we can simplify this instruction to some other value, simply add
+ // a mapping to that value rather than inserting a new instruction into
+ // the basic block.
+ if (Value *V =
+ SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
+ // On the off-chance that this simplifies to an instruction in the old
+ // function, map it back into the new function.
+ if (NewFunc != OldFunc)
+ if (Value *MappedV = VMap.lookup(V))
+ V = MappedV;
+
+ if (!NewInst->mayHaveSideEffects()) {
+ VMap[&*II] = V;
+ NewInst->deleteValue();
+ continue;
+ }
+ }
+ }
+
+ if (II->hasName())
+ NewInst->setName(II->getName()+NameSuffix);
+ VMap[&*II] = NewInst; // Add instruction map to value.
+ NewBB->getInstList().push_back(NewInst);
+ hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+
+ if (CodeInfo)
+ if (auto *CB = dyn_cast<CallBase>(&*II))
+ if (CB->hasOperandBundles())
+ CodeInfo->OperandBundleCallSites.push_back(NewInst);
+
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (isa<ConstantInt>(AI->getArraySize()))
+ hasStaticAllocas = true;
+ else
+ hasDynamicAllocas = true;
+ }
+ }
+
+ // Finally, clone over the terminator.
+ const Instruction *OldTI = BB->getTerminator();
+ bool TerminatorDone = false;
+ if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
+ if (BI->isConditional()) {
+ // If the condition was a known constant in the callee...
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ // Or is a known constant in the caller...
+ if (!Cond) {
+ Value *V = VMap.lookup(BI->getCondition());
+ Cond = dyn_cast_or_null<ConstantInt>(V);
+ }
+
+ // Constant fold to uncond branch!
+ if (Cond) {
+ BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
+ VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+ } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
+ // If switching on a value known constant in the caller.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond) { // Or known constant after constant prop in the callee...
+ Value *V = VMap.lookup(SI->getCondition());
+ Cond = dyn_cast_or_null<ConstantInt>(V);
+ }
+ if (Cond) { // Constant fold to uncond branch!
+ SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
+ BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
+ VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+
+ if (!TerminatorDone) {
+ Instruction *NewInst = OldTI->clone();
+ if (OldTI->hasName())
+ NewInst->setName(OldTI->getName()+NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ VMap[OldTI] = NewInst; // Add instruction map to value.
+
+ if (CodeInfo)
+ if (auto *CB = dyn_cast<CallBase>(OldTI))
+ if (CB->hasOperandBundles())
+ CodeInfo->OperandBundleCallSites.push_back(NewInst);
+
+ // Recursively clone any reachable successor blocks.
append_range(ToClone, successors(BB->getTerminator()));
- }
-
- if (CodeInfo) {
- CodeInfo->ContainsCalls |= hasCalls;
- CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
- CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
- BB != &BB->getParent()->front();
- }
-}
-
-/// This works like CloneAndPruneFunctionInto, except that it does not clone the
-/// entire function. Instead it starts at an instruction provided by the caller
-/// and copies (and prunes) only the code reachable from that instruction.
-void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
- const Instruction *StartingInst,
- ValueToValueMapTy &VMap,
- bool ModuleLevelChanges,
- SmallVectorImpl<ReturnInst *> &Returns,
- const char *NameSuffix,
- ClonedCodeInfo *CodeInfo) {
- assert(NameSuffix && "NameSuffix cannot be null!");
-
- ValueMapTypeRemapper *TypeMapper = nullptr;
- ValueMaterializer *Materializer = nullptr;
-
-#ifndef NDEBUG
- // If the cloning starts at the beginning of the function, verify that
- // the function arguments are mapped.
- if (!StartingInst)
- for (const Argument &II : OldFunc->args())
- assert(VMap.count(&II) && "No mapping from source argument specified!");
-#endif
-
- PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
- NameSuffix, CodeInfo);
- const BasicBlock *StartingBB;
- if (StartingInst)
- StartingBB = StartingInst->getParent();
- else {
- StartingBB = &OldFunc->getEntryBlock();
- StartingInst = &StartingBB->front();
- }
-
- // Clone the entry block, and anything recursively reachable from it.
- std::vector<const BasicBlock*> CloneWorklist;
- PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
- while (!CloneWorklist.empty()) {
- const BasicBlock *BB = CloneWorklist.back();
- CloneWorklist.pop_back();
- PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
- }
-
- // Loop over all of the basic blocks in the old function. If the block was
- // reachable, we have cloned it and the old block is now in the value map:
- // insert it into the new function in the right order. If not, ignore it.
- //
- // Defer PHI resolution until rest of function is resolved.
- SmallVector<const PHINode*, 16> PHIToResolve;
- for (const BasicBlock &BI : *OldFunc) {
- Value *V = VMap.lookup(&BI);
- BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
- if (!NewBB) continue; // Dead block.
-
- // Add the new block to the new function.
- NewFunc->getBasicBlockList().push_back(NewBB);
-
- // Handle PHI nodes specially, as we have to remove references to dead
- // blocks.
- for (const PHINode &PN : BI.phis()) {
- // PHI nodes may have been remapped to non-PHI nodes by the caller or
- // during the cloning process.
- if (isa<PHINode>(VMap[&PN]))
- PHIToResolve.push_back(&PN);
- else
- break;
- }
-
- // Finally, remap the terminator instructions, as those can't be remapped
- // until all BBs are mapped.
- RemapInstruction(NewBB->getTerminator(), VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer);
- }
-
- // Defer PHI resolution until rest of function is resolved, PHI resolution
- // requires the CFG to be up-to-date.
- for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
- const PHINode *OPN = PHIToResolve[phino];
- unsigned NumPreds = OPN->getNumIncomingValues();
- const BasicBlock *OldBB = OPN->getParent();
- BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);
-
- // Map operands for blocks that are live and remove operands for blocks
- // that are dead.
- for (; phino != PHIToResolve.size() &&
- PHIToResolve[phino]->getParent() == OldBB; ++phino) {
- OPN = PHIToResolve[phino];
- PHINode *PN = cast<PHINode>(VMap[OPN]);
- for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
- Value *V = VMap.lookup(PN->getIncomingBlock(pred));
- if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
- Value *InVal = MapValue(PN->getIncomingValue(pred),
- VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
- assert(InVal && "Unknown input value?");
- PN->setIncomingValue(pred, InVal);
- PN->setIncomingBlock(pred, MappedBlock);
- } else {
- PN->removeIncomingValue(pred, false);
- --pred; // Revisit the next entry.
- --e;
- }
- }
- }
-
- // The loop above has removed PHI entries for those blocks that are dead
- // and has updated others. However, if a block is live (i.e. copied over)
- // but its terminator has been changed to not go to this block, then our
- // phi nodes will have invalid entries. Update the PHI nodes in this
- // case.
- PHINode *PN = cast<PHINode>(NewBB->begin());
- NumPreds = pred_size(NewBB);
- if (NumPreds != PN->getNumIncomingValues()) {
- assert(NumPreds < PN->getNumIncomingValues());
- // Count how many times each predecessor comes to this block.
- std::map<BasicBlock*, unsigned> PredCount;
- for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
- PI != E; ++PI)
- --PredCount[*PI];
-
- // Figure out how many entries to remove from each PHI.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- ++PredCount[PN->getIncomingBlock(i)];
-
- // At this point, the excess predecessor entries are positive in the
- // map. Loop over all of the PHIs and remove excess predecessor
- // entries.
- BasicBlock::iterator I = NewBB->begin();
- for (; (PN = dyn_cast<PHINode>(I)); ++I) {
- for (const auto &PCI : PredCount) {
- BasicBlock *Pred = PCI.first;
- for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
- PN->removeIncomingValue(Pred, false);
- }
- }
- }
-
- // If the loops above have made these phi nodes have 0 or 1 operand,
- // replace them with undef or the input value. We must do this for
- // correctness, because 0-operand phis are not valid.
- PN = cast<PHINode>(NewBB->begin());
- if (PN->getNumIncomingValues() == 0) {
- BasicBlock::iterator I = NewBB->begin();
- BasicBlock::const_iterator OldI = OldBB->begin();
- while ((PN = dyn_cast<PHINode>(I++))) {
- Value *NV = UndefValue::get(PN->getType());
- PN->replaceAllUsesWith(NV);
- assert(VMap[&*OldI] == PN && "VMap mismatch");
- VMap[&*OldI] = NV;
- PN->eraseFromParent();
- ++OldI;
- }
- }
- }
-
- // Make a second pass over the PHINodes now that all of them have been
- // remapped into the new function, simplifying the PHINode and performing any
- // recursive simplifications exposed. This will transparently update the
- // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
- // two PHINodes, the iteration over the old PHIs remains valid, and the
- // mapping will just map us to the new node (which may not even be a PHI
- // node).
- const DataLayout &DL = NewFunc->getParent()->getDataLayout();
- SmallSetVector<const Value *, 8> Worklist;
- for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
- if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
- Worklist.insert(PHIToResolve[Idx]);
-
- // Note that we must test the size on each iteration, the worklist can grow.
- for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
- const Value *OrigV = Worklist[Idx];
- auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
- if (!I)
- continue;
-
- // Skip over non-intrinsic callsites, we don't want to remove any nodes from
- // the CGSCC.
- CallBase *CB = dyn_cast<CallBase>(I);
- if (CB && CB->getCalledFunction() &&
- !CB->getCalledFunction()->isIntrinsic())
- continue;
-
- // See if this instruction simplifies.
- Value *SimpleV = SimplifyInstruction(I, DL);
- if (!SimpleV)
- continue;
-
- // Stash away all the uses of the old instruction so we can check them for
- // recursive simplifications after a RAUW. This is cheaper than checking all
- // uses of To on the recursive step in most cases.
- for (const User *U : OrigV->users())
- Worklist.insert(cast<Instruction>(U));
-
- // Replace the instruction with its simplified value.
- I->replaceAllUsesWith(SimpleV);
-
- // If the original instruction had no side effects, remove it.
- if (isInstructionTriviallyDead(I))
- I->eraseFromParent();
- else
- VMap[OrigV] = I;
- }
-
- // Now that the inlined function body has been fully constructed, go through
- // and zap unconditional fall-through branches. This happens all the time when
- // specializing code: code specialization turns conditional branches into
- // uncond branches, and this code folds them.
- Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
- Function::iterator I = Begin;
- while (I != NewFunc->end()) {
- // We need to simplify conditional branches and switches with a constant
- // operand. We try to prune these out when cloning, but if the
- // simplification required looking through PHI nodes, those are only
- // available after forming the full basic block. That may leave some here,
- // and we still want to prune the dead code as early as possible.
- //
- // Do the folding before we check if the block is dead since we want code
- // like
- // bb:
- // br i1 undef, label %bb, label %bb
- // to be simplified to
- // bb:
- // br label %bb
- // before we call I->getSinglePredecessor().
- ConstantFoldTerminator(&*I);
-
- // Check if this block has become dead during inlining or other
- // simplifications. Note that the first block will appear dead, as it has
- // not yet been wired up properly.
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ BB != &BB->getParent()->front();
+ }
+}
+
+/// This works like CloneAndPruneFunctionInto, except that it does not clone the
+/// entire function. Instead it starts at an instruction provided by the caller
+/// and copies (and prunes) only the code reachable from that instruction.
+void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
+ const Instruction *StartingInst,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst *> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+ ValueMapTypeRemapper *TypeMapper = nullptr;
+ ValueMaterializer *Materializer = nullptr;
+
+#ifndef NDEBUG
+ // If the cloning starts at the beginning of the function, verify that
+ // the function arguments are mapped.
+ if (!StartingInst)
+ for (const Argument &II : OldFunc->args())
+ assert(VMap.count(&II) && "No mapping from source argument specified!");
+#endif
+
+ PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
+ NameSuffix, CodeInfo);
+ const BasicBlock *StartingBB;
+ if (StartingInst)
+ StartingBB = StartingInst->getParent();
+ else {
+ StartingBB = &OldFunc->getEntryBlock();
+ StartingInst = &StartingBB->front();
+ }
+
+ // Clone the entry block, and anything recursively reachable from it.
+ std::vector<const BasicBlock*> CloneWorklist;
+ PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
+ while (!CloneWorklist.empty()) {
+ const BasicBlock *BB = CloneWorklist.back();
+ CloneWorklist.pop_back();
+ PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
+ }
+
+ // Loop over all of the basic blocks in the old function. If the block was
+ // reachable, we have cloned it and the old block is now in the value map:
+ // insert it into the new function in the right order. If not, ignore it.
+ //
+ // Defer PHI resolution until rest of function is resolved.
+ SmallVector<const PHINode*, 16> PHIToResolve;
+ for (const BasicBlock &BI : *OldFunc) {
+ Value *V = VMap.lookup(&BI);
+ BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
+ if (!NewBB) continue; // Dead block.
+
+ // Add the new block to the new function.
+ NewFunc->getBasicBlockList().push_back(NewBB);
+
+ // Handle PHI nodes specially, as we have to remove references to dead
+ // blocks.
+ for (const PHINode &PN : BI.phis()) {
+ // PHI nodes may have been remapped to non-PHI nodes by the caller or
+ // during the cloning process.
+ if (isa<PHINode>(VMap[&PN]))
+ PHIToResolve.push_back(&PN);
+ else
+ break;
+ }
+
+ // Finally, remap the terminator instructions, as those can't be remapped
+ // until all BBs are mapped.
+ RemapInstruction(NewBB->getTerminator(), VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+ }
+
+ // Defer PHI resolution until rest of function is resolved, PHI resolution
+ // requires the CFG to be up-to-date.
+ for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
+ const PHINode *OPN = PHIToResolve[phino];
+ unsigned NumPreds = OPN->getNumIncomingValues();
+ const BasicBlock *OldBB = OPN->getParent();
+ BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);
+
+ // Map operands for blocks that are live and remove operands for blocks
+ // that are dead.
+ for (; phino != PHIToResolve.size() &&
+ PHIToResolve[phino]->getParent() == OldBB; ++phino) {
+ OPN = PHIToResolve[phino];
+ PHINode *PN = cast<PHINode>(VMap[OPN]);
+ for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
+ Value *V = VMap.lookup(PN->getIncomingBlock(pred));
+ if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
+ Value *InVal = MapValue(PN->getIncomingValue(pred),
+ VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+ assert(InVal && "Unknown input value?");
+ PN->setIncomingValue(pred, InVal);
+ PN->setIncomingBlock(pred, MappedBlock);
+ } else {
+ PN->removeIncomingValue(pred, false);
+ --pred; // Revisit the next entry.
+ --e;
+ }
+ }
+ }
+
+ // The loop above has removed PHI entries for those blocks that are dead
+ // and has updated others. However, if a block is live (i.e. copied over)
+ // but its terminator has been changed to not go to this block, then our
+ // phi nodes will have invalid entries. Update the PHI nodes in this
+ // case.
+ PHINode *PN = cast<PHINode>(NewBB->begin());
+ NumPreds = pred_size(NewBB);
+ if (NumPreds != PN->getNumIncomingValues()) {
+ assert(NumPreds < PN->getNumIncomingValues());
+ // Count how many times each predecessor comes to this block.
+ std::map<BasicBlock*, unsigned> PredCount;
+ for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
+ PI != E; ++PI)
+ --PredCount[*PI];
+
+ // Figure out how many entries to remove from each PHI.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ ++PredCount[PN->getIncomingBlock(i)];
+
+ // At this point, the excess predecessor entries are positive in the
+ // map. Loop over all of the PHIs and remove excess predecessor
+ // entries.
+ BasicBlock::iterator I = NewBB->begin();
+ for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+ for (const auto &PCI : PredCount) {
+ BasicBlock *Pred = PCI.first;
+ for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
+ PN->removeIncomingValue(Pred, false);
+ }
+ }
+ }
+
+ // If the loops above have made these phi nodes have 0 or 1 operand,
+ // replace them with undef or the input value. We must do this for
+ // correctness, because 0-operand phis are not valid.
+ PN = cast<PHINode>(NewBB->begin());
+ if (PN->getNumIncomingValues() == 0) {
+ BasicBlock::iterator I = NewBB->begin();
+ BasicBlock::const_iterator OldI = OldBB->begin();
+ while ((PN = dyn_cast<PHINode>(I++))) {
+ Value *NV = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NV);
+ assert(VMap[&*OldI] == PN && "VMap mismatch");
+ VMap[&*OldI] = NV;
+ PN->eraseFromParent();
+ ++OldI;
+ }
+ }
+ }
+
+ // Make a second pass over the PHINodes now that all of them have been
+ // remapped into the new function, simplifying the PHINode and performing any
+ // recursive simplifications exposed. This will transparently update the
+ // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
+ // two PHINodes, the iteration over the old PHIs remains valid, and the
+ // mapping will just map us to the new node (which may not even be a PHI
+ // node).
+ const DataLayout &DL = NewFunc->getParent()->getDataLayout();
+ SmallSetVector<const Value *, 8> Worklist;
+ for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
+ if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
+ Worklist.insert(PHIToResolve[Idx]);
+
+ // Note that we must test the size on each iteration, the worklist can grow.
+ for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+ const Value *OrigV = Worklist[Idx];
+ auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
+ if (!I)
+ continue;
+
+ // Skip over non-intrinsic callsites, we don't want to remove any nodes from
+ // the CGSCC.
+ CallBase *CB = dyn_cast<CallBase>(I);
+ if (CB && CB->getCalledFunction() &&
+ !CB->getCalledFunction()->isIntrinsic())
+ continue;
+
+ // See if this instruction simplifies.
+ Value *SimpleV = SimplifyInstruction(I, DL);
+ if (!SimpleV)
+ continue;
+
+ // Stash away all the uses of the old instruction so we can check them for
+ // recursive simplifications after a RAUW. This is cheaper than checking all
+ // uses of To on the recursive step in most cases.
+ for (const User *U : OrigV->users())
+ Worklist.insert(cast<Instruction>(U));
+
+ // Replace the instruction with its simplified value.
+ I->replaceAllUsesWith(SimpleV);
+
+ // If the original instruction had no side effects, remove it.
+ if (isInstructionTriviallyDead(I))
+ I->eraseFromParent();
+ else
+ VMap[OrigV] = I;
+ }
+
+ // Now that the inlined function body has been fully constructed, go through
+ // and zap unconditional fall-through branches. This happens all the time when
+ // specializing code: code specialization turns conditional branches into
+ // uncond branches, and this code folds them.
+ Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
+ Function::iterator I = Begin;
+ while (I != NewFunc->end()) {
+ // We need to simplify conditional branches and switches with a constant
+ // operand. We try to prune these out when cloning, but if the
+ // simplification required looking through PHI nodes, those are only
+ // available after forming the full basic block. That may leave some here,
+ // and we still want to prune the dead code as early as possible.
+ //
+ // Do the folding before we check if the block is dead since we want code
+ // like
+ // bb:
+ // br i1 undef, label %bb, label %bb
+ // to be simplified to
+ // bb:
+ // br label %bb
+ // before we call I->getSinglePredecessor().
+ ConstantFoldTerminator(&*I);
+
+ // Check if this block has become dead during inlining or other
+ // simplifications. Note that the first block will appear dead, as it has
+ // not yet been wired up properly.
if (I != Begin && (pred_empty(&*I) || I->getSinglePredecessor() == &*I)) {
- BasicBlock *DeadBB = &*I++;
- DeleteDeadBlock(DeadBB);
- continue;
- }
-
- BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
- if (!BI || BI->isConditional()) { ++I; continue; }
-
- BasicBlock *Dest = BI->getSuccessor(0);
- if (!Dest->getSinglePredecessor()) {
- ++I; continue;
- }
-
- // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
- // above should have zapped all of them..
- assert(!isa<PHINode>(Dest->begin()));
-
- // We know all single-entry PHI nodes in the inlined function have been
- // removed, so we just need to splice the blocks.
- BI->eraseFromParent();
-
- // Make all PHI nodes that referred to Dest now refer to I as their source.
- Dest->replaceAllUsesWith(&*I);
-
- // Move all the instructions in the succ to the pred.
- I->getInstList().splice(I->end(), Dest->getInstList());
-
- // Remove the dest block.
- Dest->eraseFromParent();
-
- // Do not increment I, iteratively merge all things this block branches to.
- }
-
- // Make a final pass over the basic blocks from the old function to gather
- // any return instructions which survived folding. We have to do this here
- // because we can iteratively remove and merge returns above.
- for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(),
- E = NewFunc->end();
- I != E; ++I)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
- Returns.push_back(RI);
-}
-
-
-/// This works exactly like CloneFunctionInto,
-/// except that it does some simple constant prop and DCE on the fly. The
-/// effect of this is to copy significantly less code in cases where (for
-/// example) a function call with constant arguments is inlined, and those
-/// constant arguments cause a significant amount of code in the callee to be
-/// dead. Since this doesn't produce an exact copy of the input, it can't be
-/// used for things like CloneFunction or CloneModule.
-void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
- ValueToValueMapTy &VMap,
- bool ModuleLevelChanges,
- SmallVectorImpl<ReturnInst*> &Returns,
- const char *NameSuffix,
- ClonedCodeInfo *CodeInfo,
- Instruction *TheCall) {
- CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
- ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
-}
-
-/// Remaps instructions in \p Blocks using the mapping in \p VMap.
-void llvm::remapInstructionsInBlocks(
- const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
- // Rewrite the code to refer to itself.
- for (auto *BB : Blocks)
- for (auto &Inst : *BB)
- RemapInstruction(&Inst, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-}
-
-/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
-/// Blocks.
-///
-/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
-/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
-Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
- Loop *OrigLoop, ValueToValueMapTy &VMap,
- const Twine &NameSuffix, LoopInfo *LI,
- DominatorTree *DT,
- SmallVectorImpl<BasicBlock *> &Blocks) {
- Function *F = OrigLoop->getHeader()->getParent();
- Loop *ParentLoop = OrigLoop->getParentLoop();
- DenseMap<Loop *, Loop *> LMap;
-
- Loop *NewLoop = LI->AllocateLoop();
- LMap[OrigLoop] = NewLoop;
- if (ParentLoop)
- ParentLoop->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
-
- BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
- assert(OrigPH && "No preheader");
- BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
- // To rename the loop PHIs.
- VMap[OrigPH] = NewPH;
- Blocks.push_back(NewPH);
-
- // Update LoopInfo.
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewPH, *LI);
-
- // Update DominatorTree.
- DT->addNewBlock(NewPH, LoopDomBB);
-
- for (Loop *CurLoop : OrigLoop->getLoopsInPreorder()) {
- Loop *&NewLoop = LMap[CurLoop];
- if (!NewLoop) {
- NewLoop = LI->AllocateLoop();
-
- // Establish the parent/child relationship.
- Loop *OrigParent = CurLoop->getParentLoop();
- assert(OrigParent && "Could not find the original parent loop");
- Loop *NewParentLoop = LMap[OrigParent];
- assert(NewParentLoop && "Could not find the new parent loop");
-
- NewParentLoop->addChildLoop(NewLoop);
- }
- }
-
- for (BasicBlock *BB : OrigLoop->getBlocks()) {
- Loop *CurLoop = LI->getLoopFor(BB);
- Loop *&NewLoop = LMap[CurLoop];
- assert(NewLoop && "Expecting new loop to be allocated");
-
- BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
- VMap[BB] = NewBB;
-
- // Update LoopInfo.
- NewLoop->addBasicBlockToLoop(NewBB, *LI);
-
- // Add DominatorTree node. After seeing all blocks, update to correct
- // IDom.
- DT->addNewBlock(NewBB, NewPH);
-
- Blocks.push_back(NewBB);
- }
-
- for (BasicBlock *BB : OrigLoop->getBlocks()) {
- // Update loop headers.
- Loop *CurLoop = LI->getLoopFor(BB);
- if (BB == CurLoop->getHeader())
- LMap[CurLoop]->moveToHeader(cast<BasicBlock>(VMap[BB]));
-
- // Update DominatorTree.
- BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
- DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]),
- cast<BasicBlock>(VMap[IDomBB]));
- }
-
- // Move them physically from the end of the block list.
- F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
- NewPH);
- F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
- NewLoop->getHeader()->getIterator(), F->end());
-
- return NewLoop;
-}
-
-/// Duplicate non-Phi instructions from the beginning of block up to
-/// StopAt instruction into a split block between BB and its predecessor.
-BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
- BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt,
- ValueToValueMapTy &ValueMapping, DomTreeUpdater &DTU) {
-
- assert(count(successors(PredBB), BB) == 1 &&
- "There must be a single edge between PredBB and BB!");
- // We are going to have to map operands from the original BB block to the new
- // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
- // account for entry from PredBB.
- BasicBlock::iterator BI = BB->begin();
- for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
-
- BasicBlock *NewBB = SplitEdge(PredBB, BB);
- NewBB->setName(PredBB->getName() + ".split");
- Instruction *NewTerm = NewBB->getTerminator();
-
- // FIXME: SplitEdge does not yet take a DTU, so we include the split edge
- // in the update set here.
- DTU.applyUpdates({{DominatorTree::Delete, PredBB, BB},
- {DominatorTree::Insert, PredBB, NewBB},
- {DominatorTree::Insert, NewBB, BB}});
-
- // Clone the non-phi instructions of BB into NewBB, keeping track of the
- // mapping and using it to remap operands in the cloned instructions.
- // Stop once we see the terminator too. This covers the case where BB's
- // terminator gets replaced and StopAt == BB's terminator.
- for (; StopAt != &*BI && BB->getTerminator() != &*BI; ++BI) {
- Instruction *New = BI->clone();
- New->setName(BI->getName());
- New->insertBefore(NewTerm);
- ValueMapping[&*BI] = New;
-
- // Remap operands to patch up intra-block references.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- auto I = ValueMapping.find(Inst);
- if (I != ValueMapping.end())
- New->setOperand(i, I->second);
- }
- }
-
- return NewBB;
-}
+ BasicBlock *DeadBB = &*I++;
+ DeleteDeadBlock(DeadBB);
+ continue;
+ }
+
+ BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
+ if (!BI || BI->isConditional()) { ++I; continue; }
+
+ BasicBlock *Dest = BI->getSuccessor(0);
+ if (!Dest->getSinglePredecessor()) {
+ ++I; continue;
+ }
+
+ // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
+ // above should have zapped all of them..
+ assert(!isa<PHINode>(Dest->begin()));
+
+ // We know all single-entry PHI nodes in the inlined function have been
+ // removed, so we just need to splice the blocks.
+ BI->eraseFromParent();
+
+ // Make all PHI nodes that referred to Dest now refer to I as their source.
+ Dest->replaceAllUsesWith(&*I);
+
+ // Move all the instructions in the succ to the pred.
+ I->getInstList().splice(I->end(), Dest->getInstList());
+
+ // Remove the dest block.
+ Dest->eraseFromParent();
+
+ // Do not increment I, iteratively merge all things this block branches to.
+ }
+
+ // Make a final pass over the basic blocks from the old function to gather
+ // any return instructions which survived folding. We have to do this here
+ // because we can iteratively remove and merge returns above.
+ for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(),
+ E = NewFunc->end();
+ I != E; ++I)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
+ Returns.push_back(RI);
+}
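
Usage sketch (illustrative, not part of this diff): starting a pruned clone at an arbitrary instruction rather than at the entry block, the pattern used by partial inlining. The helper name cloneFromInstruction and the ".part" suffix are assumptions.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: clone only the code reachable from Start into NewFunc.
// VMap must already map any arguments of OldFunc that the reachable code
// uses, otherwise remapping will fail.
static void cloneFromInstruction(Function *NewFunc, const Function *OldFunc,
                                 const Instruction *Start,
                                 ValueToValueMapTy &VMap) {
  SmallVector<ReturnInst *, 4> Returns;
  CloneAndPruneIntoFromInst(NewFunc, OldFunc, Start, VMap,
                            /*ModuleLevelChanges=*/false, Returns, ".part",
                            /*CodeInfo=*/nullptr);
}
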
+
+
+/// This works exactly like CloneFunctionInto,
+/// except that it does some simple constant prop and DCE on the fly. The
+/// effect of this is to copy significantly less code in cases where (for
+/// example) a function call with constant arguments is inlined, and those
+/// constant arguments cause a significant amount of code in the callee to be
+/// dead. Since this doesn't produce an exact copy of the input, it can't be
+/// used for things like CloneFunction or CloneModule.
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst*> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo,
+ Instruction *TheCall) {
+ CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
+ ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
+}
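
Usage sketch (illustrative, not part of this diff) of why the pruning variant exists: when the arguments are bound to constants in the VMap, branches are folded during cloning and unreachable blocks are never copied. The helper name clonePrunedWithConstants and the pre-created empty NewFunc are assumptions.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: clone Callee into the pre-created, empty NewFunc with each
// argument bound to a constant; the cloner constant-folds terminators
// against these values and prunes the dead code on the fly.
static void clonePrunedWithConstants(Function *NewFunc, const Function *Callee,
                                     ArrayRef<Constant *> ConstArgs) {
  ValueToValueMapTy VMap;
  unsigned Idx = 0;
  for (const Argument &Arg : Callee->args())
    VMap[&Arg] = ConstArgs[Idx++];
  SmallVector<ReturnInst *, 4> Returns;
  CloneAndPruneFunctionInto(NewFunc, Callee, VMap,
                            /*ModuleLevelChanges=*/false, Returns, ".i",
                            /*CodeInfo=*/nullptr, /*TheCall=*/nullptr);
  // Returns now holds the surviving return instructions of the clone.
}
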
+
+/// Remaps instructions in \p Blocks using the mapping in \p VMap.
+void llvm::remapInstructionsInBlocks(
+ const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
+ // Rewrite the code to refer to itself.
+ for (auto *BB : Blocks)
+ for (auto &Inst : *BB)
+ RemapInstruction(&Inst, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+}
+
+/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
+Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
+ Loop *OrigLoop, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, LoopInfo *LI,
+ DominatorTree *DT,
+ SmallVectorImpl<BasicBlock *> &Blocks) {
+ Function *F = OrigLoop->getHeader()->getParent();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+ DenseMap<Loop *, Loop *> LMap;
+
+ Loop *NewLoop = LI->AllocateLoop();
+ LMap[OrigLoop] = NewLoop;
+ if (ParentLoop)
+ ParentLoop->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
+ assert(OrigPH && "No preheader");
+ BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
+ // To rename the loop PHIs.
+ VMap[OrigPH] = NewPH;
+ Blocks.push_back(NewPH);
+
+ // Update LoopInfo.
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewPH, *LI);
+
+ // Update DominatorTree.
+ DT->addNewBlock(NewPH, LoopDomBB);
+
+ for (Loop *CurLoop : OrigLoop->getLoopsInPreorder()) {
+ Loop *&NewLoop = LMap[CurLoop];
+ if (!NewLoop) {
+ NewLoop = LI->AllocateLoop();
+
+ // Establish the parent/child relationship.
+ Loop *OrigParent = CurLoop->getParentLoop();
+ assert(OrigParent && "Could not find the original parent loop");
+ Loop *NewParentLoop = LMap[OrigParent];
+ assert(NewParentLoop && "Could not find the new parent loop");
+
+ NewParentLoop->addChildLoop(NewLoop);
+ }
+ }
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ Loop *CurLoop = LI->getLoopFor(BB);
+ Loop *&NewLoop = LMap[CurLoop];
+ assert(NewLoop && "Expecting new loop to be allocated");
+
+ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+ VMap[BB] = NewBB;
+
+ // Update LoopInfo.
+ NewLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ // Add DominatorTree node. After seeing all blocks, update to correct
+ // IDom.
+ DT->addNewBlock(NewBB, NewPH);
+
+ Blocks.push_back(NewBB);
+ }
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ // Update loop headers.
+ Loop *CurLoop = LI->getLoopFor(BB);
+ if (BB == CurLoop->getHeader())
+ LMap[CurLoop]->moveToHeader(cast<BasicBlock>(VMap[BB]));
+
+ // Update DominatorTree.
+ BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+ DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]),
+ cast<BasicBlock>(VMap[IDomBB]));
+ }
+
+ // Move them physically from the end of the block list.
+ F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
+ NewPH);
+ F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
+ NewLoop->getHeader()->getIterator(), F->end());
+
+ return NewLoop;
+}
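
Usage sketch (illustrative, not part of this diff) of the usual pairing: cloneLoopWithPreheader copies the blocks, then the caller runs remapInstructionsInBlocks so the copies refer to each other instead of to the original loop. The helper name cloneAndRemapLoop, the ".clone" suffix, and the choice of the original preheader as the dominating block are assumptions.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: clone L (including a fresh preheader) before InsertBefore,
// then rewrite the cloned instructions to use the cloned definitions.
static Loop *cloneAndRemapLoop(Loop *L, BasicBlock *InsertBefore,
                               LoopInfo *LI, DominatorTree *DT,
                               ValueToValueMapTy &VMap) {
  SmallVector<BasicBlock *, 8> Blocks;
  Loop *NewLoop = cloneLoopWithPreheader(InsertBefore, L->getLoopPreheader(),
                                         L, VMap, ".clone", LI, DT, Blocks);
  remapInstructionsInBlocks(Blocks, VMap); // Point cloned uses at cloned defs.
  return NewLoop;
}
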
+
+/// Duplicate non-Phi instructions from the beginning of block up to
+/// StopAt instruction into a split block between BB and its predecessor.
+BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
+ BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt,
+ ValueToValueMapTy &ValueMapping, DomTreeUpdater &DTU) {
+
+ assert(count(successors(PredBB), BB) == 1 &&
+ "There must be a single edge between PredBB and BB!");
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+ // account for entry from PredBB.
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ BasicBlock *NewBB = SplitEdge(PredBB, BB);
+ NewBB->setName(PredBB->getName() + ".split");
+ Instruction *NewTerm = NewBB->getTerminator();
+
+ // FIXME: SplitEdge does not yet take a DTU, so we include the split edge
+ // in the update set here.
+ DTU.applyUpdates({{DominatorTree::Delete, PredBB, BB},
+ {DominatorTree::Insert, PredBB, NewBB},
+ {DominatorTree::Insert, NewBB, BB}});
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ // Stop once we see the terminator too. This covers the case where BB's
+ // terminator gets replaced and StopAt == BB's terminator.
+ for (; StopAt != &*BI && BB->getTerminator() != &*BI; ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ New->insertBefore(NewTerm);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ auto I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ return NewBB;
+}
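
Usage sketch (illustrative, not part of this diff) in the jump-threading style: duplicate BB's leading non-PHI instructions, up to but not including StopAt, into a new block on the PredBB to BB edge. The helper name duplicatePrefixOnEdge and the lazy DomTreeUpdater strategy are assumptions; pending dominator-tree updates flush when the updater goes out of scope.

#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: split the PredBB -> BB edge and copy BB's prefix into the
// split block, collecting the old-to-new instruction mapping.
static BasicBlock *duplicatePrefixOnEdge(BasicBlock *BB, BasicBlock *PredBB,
                                         Instruction *StopAt,
                                         DominatorTree &DT) {
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  ValueToValueMapTy Mapping; // Receives old -> new instruction mappings.
  return DuplicateInstructionsInSplitBetween(BB, PredBB, StopAt, Mapping, DTU);
}
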
void llvm::cloneNoAliasScopes(
ArrayRef<MDNode *> NoAliasDeclScopes,
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp
index ae1c463b08..a6327bbf21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp
@@ -1,122 +1,122 @@
-//===- CloneModule.cpp - Clone an entire module ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the CloneModule interface which makes a copy of an
-// entire module.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
- const Comdat *SC = Src->getComdat();
- if (!SC)
- return;
- Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
- DC->setSelectionKind(SC->getSelectionKind());
- Dst->setComdat(DC);
-}
-
-/// This is not as easy as it might seem because we have to worry about making
-/// copies of global variables and functions, and making their (initializers and
-/// references, respectively) refer to the right globals.
-///
-std::unique_ptr<Module> llvm::CloneModule(const Module &M) {
- // Create the value map that maps things from the old module over to the new
- // module.
- ValueToValueMapTy VMap;
- return CloneModule(M, VMap);
-}
-
-std::unique_ptr<Module> llvm::CloneModule(const Module &M,
- ValueToValueMapTy &VMap) {
- return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });
-}
-
-std::unique_ptr<Module> llvm::CloneModule(
- const Module &M, ValueToValueMapTy &VMap,
- function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
- // First off, we need to create the new module.
- std::unique_ptr<Module> New =
- std::make_unique<Module>(M.getModuleIdentifier(), M.getContext());
- New->setSourceFileName(M.getSourceFileName());
- New->setDataLayout(M.getDataLayout());
- New->setTargetTriple(M.getTargetTriple());
- New->setModuleInlineAsm(M.getModuleInlineAsm());
-
- // Loop over all of the global variables, making corresponding globals in the
- // new module. Here we add them to the VMap and to the new Module. We
- // don't worry about attributes or initializers, they will come later.
- //
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
- GlobalVariable *GV = new GlobalVariable(*New,
- I->getValueType(),
- I->isConstant(), I->getLinkage(),
- (Constant*) nullptr, I->getName(),
- (GlobalVariable*) nullptr,
- I->getThreadLocalMode(),
- I->getType()->getAddressSpace());
- GV->copyAttributesFrom(&*I);
- VMap[&*I] = GV;
- }
-
- // Loop over the functions in the module, making external functions as before
- for (const Function &I : M) {
- Function *NF =
- Function::Create(cast<FunctionType>(I.getValueType()), I.getLinkage(),
- I.getAddressSpace(), I.getName(), New.get());
- NF->copyAttributesFrom(&I);
- VMap[&I] = NF;
- }
-
- // Loop over the aliases in the module
- for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I) {
- if (!ShouldCloneDefinition(&*I)) {
- // An alias cannot act as an external reference, so we need to create
- // either a function or a global variable depending on the value type.
- // FIXME: Once pointee types are gone we can probably pick one or the
- // other.
- GlobalValue *GV;
- if (I->getValueType()->isFunctionTy())
- GV = Function::Create(cast<FunctionType>(I->getValueType()),
- GlobalValue::ExternalLinkage,
- I->getAddressSpace(), I->getName(), New.get());
- else
- GV = new GlobalVariable(
- *New, I->getValueType(), false, GlobalValue::ExternalLinkage,
- nullptr, I->getName(), nullptr,
- I->getThreadLocalMode(), I->getType()->getAddressSpace());
- VMap[&*I] = GV;
- // We do not copy attributes (mainly because copying between different
- // kinds of globals is forbidden), but this is generally not required for
- // correctness.
- continue;
- }
- auto *GA = GlobalAlias::create(I->getValueType(),
- I->getType()->getPointerAddressSpace(),
- I->getLinkage(), I->getName(), New.get());
- GA->copyAttributesFrom(&*I);
- VMap[&*I] = GA;
- }
-
- // Now that all of the things that a global variable initializer can refer to
- // have been created, loop through and copy the global variable initializers
- // over... We also set the attributes on the global now.
- //
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
+//===- CloneModule.cpp - Clone an entire module ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneModule interface which makes a copy of an
+// entire module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
+ const Comdat *SC = Src->getComdat();
+ if (!SC)
+ return;
+ Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SC->getSelectionKind());
+ Dst->setComdat(DC);
+}
+
+/// This is not as easy as it might seem because we have to worry about making
+/// copies of global variables and functions, and making their (initializers and
+/// references, respectively) refer to the right globals.
+///
+std::unique_ptr<Module> llvm::CloneModule(const Module &M) {
+ // Create the value map that maps things from the old module over to the new
+ // module.
+ ValueToValueMapTy VMap;
+ return CloneModule(M, VMap);
+}
+
+std::unique_ptr<Module> llvm::CloneModule(const Module &M,
+ ValueToValueMapTy &VMap) {
+ return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });
+}
+
+std::unique_ptr<Module> llvm::CloneModule(
+ const Module &M, ValueToValueMapTy &VMap,
+ function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
+ // First off, we need to create the new module.
+ std::unique_ptr<Module> New =
+ std::make_unique<Module>(M.getModuleIdentifier(), M.getContext());
+ New->setSourceFileName(M.getSourceFileName());
+ New->setDataLayout(M.getDataLayout());
+ New->setTargetTriple(M.getTargetTriple());
+ New->setModuleInlineAsm(M.getModuleInlineAsm());
+
+ // Loop over all of the global variables, making corresponding globals in the
+ // new module. Here we add them to the VMap and to the new Module. We
+ // don't worry about attributes or initializers, they will come later.
+ //
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = new GlobalVariable(*New,
+ I->getValueType(),
+ I->isConstant(), I->getLinkage(),
+ (Constant*) nullptr, I->getName(),
+ (GlobalVariable*) nullptr,
+ I->getThreadLocalMode(),
+ I->getType()->getAddressSpace());
+ GV->copyAttributesFrom(&*I);
+ VMap[&*I] = GV;
+ }
+
+ // Loop over the functions in the module, making external functions as before
+ for (const Function &I : M) {
+ Function *NF =
+ Function::Create(cast<FunctionType>(I.getValueType()), I.getLinkage(),
+ I.getAddressSpace(), I.getName(), New.get());
+ NF->copyAttributesFrom(&I);
+ VMap[&I] = NF;
+ }
+
+ // Loop over the aliases in the module
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I) {
+ if (!ShouldCloneDefinition(&*I)) {
+ // An alias cannot act as an external reference, so we need to create
+ // either a function or a global variable depending on the value type.
+ // FIXME: Once pointee types are gone we can probably pick one or the
+ // other.
+ GlobalValue *GV;
+ if (I->getValueType()->isFunctionTy())
+ GV = Function::Create(cast<FunctionType>(I->getValueType()),
+ GlobalValue::ExternalLinkage,
+ I->getAddressSpace(), I->getName(), New.get());
+ else
+ GV = new GlobalVariable(
+ *New, I->getValueType(), false, GlobalValue::ExternalLinkage,
+ nullptr, I->getName(), nullptr,
+ I->getThreadLocalMode(), I->getType()->getAddressSpace());
+ VMap[&*I] = GV;
+ // We do not copy attributes (mainly because copying between different
+ // kinds of globals is forbidden), but this is generally not required for
+ // correctness.
+ continue;
+ }
+ auto *GA = GlobalAlias::create(I->getValueType(),
+ I->getType()->getPointerAddressSpace(),
+ I->getLinkage(), I->getName(), New.get());
+ GA->copyAttributesFrom(&*I);
+ VMap[&*I] = GA;
+ }
+
+ // Now that all of the things that a global variable initializer can refer to
+ // have been created, loop through and copy the global variable initializers
+ // over... We also set the attributes on the global now.
+ //
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]);
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
@@ -125,91 +125,91 @@ std::unique_ptr<Module> llvm::CloneModule(
GV->addMetadata(MD.first,
*MapMetadata(MD.second, VMap, RF_MoveDistinctMDs));
- if (I->isDeclaration())
- continue;
-
- if (!ShouldCloneDefinition(&*I)) {
- // Skip after setting the correct linkage for an external reference.
- GV->setLinkage(GlobalValue::ExternalLinkage);
- continue;
- }
- if (I->hasInitializer())
- GV->setInitializer(MapValue(I->getInitializer(), VMap));
-
- copyComdat(GV, &*I);
- }
-
- // Similarly, copy over function bodies now...
- //
- for (const Function &I : M) {
- if (I.isDeclaration())
- continue;
-
- Function *F = cast<Function>(VMap[&I]);
- if (!ShouldCloneDefinition(&I)) {
- // Skip after setting the correct linkage for an external reference.
- F->setLinkage(GlobalValue::ExternalLinkage);
- // Personality function is not valid on a declaration.
- F->setPersonalityFn(nullptr);
- continue;
- }
-
- Function::arg_iterator DestI = F->arg_begin();
- for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end();
- ++J) {
- DestI->setName(J->getName());
- VMap[&*J] = &*DestI++;
- }
-
- SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns);
-
- if (I.hasPersonalityFn())
- F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
-
- copyComdat(F, &I);
- }
-
- // And aliases
- for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I) {
- // We already dealt with undefined aliases above.
- if (!ShouldCloneDefinition(&*I))
- continue;
- GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]);
- if (const Constant *C = I->getAliasee())
- GA->setAliasee(MapValue(C, VMap));
- }
-
- // And named metadata....
- const auto* LLVM_DBG_CU = M.getNamedMetadata("llvm.dbg.cu");
- for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
- E = M.named_metadata_end();
- I != E; ++I) {
- const NamedMDNode &NMD = *I;
- NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
- if (&NMD == LLVM_DBG_CU) {
- // Do not insert duplicate operands.
- SmallPtrSet<const void*, 8> Visited;
- for (const auto* Operand : NewNMD->operands())
- Visited.insert(Operand);
- for (const auto* Operand : NMD.operands()) {
- auto* MappedOperand = MapMetadata(Operand, VMap);
- if (Visited.insert(MappedOperand).second)
- NewNMD->addOperand(MappedOperand);
- }
- } else
- for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
- NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
- }
-
- return New;
-}
-
-extern "C" {
-
-LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
- return wrap(CloneModule(*unwrap(M)).release());
-}
-
-}
+ if (I->isDeclaration())
+ continue;
+
+ if (!ShouldCloneDefinition(&*I)) {
+ // Skip after setting the correct linkage for an external reference.
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ continue;
+ }
+ if (I->hasInitializer())
+ GV->setInitializer(MapValue(I->getInitializer(), VMap));
+
+ copyComdat(GV, &*I);
+ }
+
+ // Similarly, copy over function bodies now...
+ //
+ for (const Function &I : M) {
+ if (I.isDeclaration())
+ continue;
+
+ Function *F = cast<Function>(VMap[&I]);
+ if (!ShouldCloneDefinition(&I)) {
+ // Skip after setting the correct linkage for an external reference.
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ // Personality function is not valid on a declaration.
+ F->setPersonalityFn(nullptr);
+ continue;
+ }
+
+ Function::arg_iterator DestI = F->arg_begin();
+ for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end();
+ ++J) {
+ DestI->setName(J->getName());
+ VMap[&*J] = &*DestI++;
+ }
+
+ SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
+ CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ if (I.hasPersonalityFn())
+ F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
+
+ copyComdat(F, &I);
+ }
+
+ // And aliases
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I) {
+ // We already dealt with undefined aliases above.
+ if (!ShouldCloneDefinition(&*I))
+ continue;
+ GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]);
+ if (const Constant *C = I->getAliasee())
+ GA->setAliasee(MapValue(C, VMap));
+ }
+
+ // And named metadata....
+ const auto* LLVM_DBG_CU = M.getNamedMetadata("llvm.dbg.cu");
+ for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end();
+ I != E; ++I) {
+ const NamedMDNode &NMD = *I;
+ NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
+ if (&NMD == LLVM_DBG_CU) {
+ // Do not insert duplicate operands.
+ SmallPtrSet<const void*, 8> Visited;
+ for (const auto* Operand : NewNMD->operands())
+ Visited.insert(Operand);
+ for (const auto* Operand : NMD.operands()) {
+ auto* MappedOperand = MapMetadata(Operand, VMap);
+ if (Visited.insert(MappedOperand).second)
+ NewNMD->addOperand(MappedOperand);
+ }
+ } else
+ for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
+ NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
+ }
+
+ return New;
+}
+
+extern "C" {
+
+LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
+ return wrap(CloneModule(*unwrap(M)).release());
+}
+
+}
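
For context on the overload set above: the three-argument CloneModule takes a ShouldCloneDefinition predicate, and globals filtered out by it keep only an external declaration in the copy. A minimal sketch of that use, not part of this commit; the "keep_" prefix is purely illustrative:

// Sketch only: clone M, keeping bodies/initializers only for globals whose
// names start with "keep_"; everything else is demoted to an external
// declaration by the !ShouldCloneDefinition paths above.
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <memory>
using namespace llvm;

std::unique_ptr<Module> cloneKeepingPrefixed(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [](const GlobalValue *GV) {
    return GV->getName().startswith("keep_");
  });
}
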
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp
index 03a371c093..390925a03b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1,540 +1,540 @@
-//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the interface to tear out a code region, such as an
-// individual loop or a parallel section, into a new function, replacing it with
-// a call to the new function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <set>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-using ProfileCount = Function::ProfileCount;
-
-#define DEBUG_TYPE "code-extractor"
-
-// Provide a command-line option to aggregate function arguments into a struct
-// for functions produced by the code extractor. This is useful when converting
-// extracted functions to pthread-based code, as only one argument (void*) can
-// be passed in to pthread_create().
-static cl::opt<bool>
-AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
- cl::desc("Aggregate arguments to code-extracted functions"));
-
-/// Test whether a block is valid for extraction.
-static bool isBlockValidForExtraction(const BasicBlock &BB,
- const SetVector<BasicBlock *> &Result,
- bool AllowVarArgs, bool AllowAlloca) {
- // taking the address of a basic block moved to another function is illegal
- if (BB.hasAddressTaken())
- return false;
-
- // don't hoist code that uses another basicblock address, as it's likely to
- // lead to unexpected behavior, like cross-function jumps
- SmallPtrSet<User const *, 16> Visited;
- SmallVector<User const *, 16> ToVisit;
-
- for (Instruction const &Inst : BB)
- ToVisit.push_back(&Inst);
-
- while (!ToVisit.empty()) {
- User const *Curr = ToVisit.pop_back_val();
- if (!Visited.insert(Curr).second)
- continue;
- if (isa<BlockAddress const>(Curr))
- return false; // even a reference to self is likely to be not compatible
-
- if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB)
- continue;
-
- for (auto const &U : Curr->operands()) {
- if (auto *UU = dyn_cast<User>(U))
- ToVisit.push_back(UU);
- }
- }
-
- // If explicitly requested, allow vastart and alloca. For invoke instructions
- // verify that extraction is valid.
- for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
- if (isa<AllocaInst>(I)) {
- if (!AllowAlloca)
- return false;
- continue;
- }
-
- if (const auto *II = dyn_cast<InvokeInst>(I)) {
- // Unwind destination (either a landingpad, catchswitch, or cleanuppad)
- // must be a part of the subgraph which is being extracted.
- if (auto *UBB = II->getUnwindDest())
- if (!Result.count(UBB))
- return false;
- continue;
- }
-
- // All catch handlers of a catchswitch instruction as well as the unwind
- // destination must be in the subgraph.
- if (const auto *CSI = dyn_cast<CatchSwitchInst>(I)) {
- if (auto *UBB = CSI->getUnwindDest())
- if (!Result.count(UBB))
- return false;
- for (auto *HBB : CSI->handlers())
- if (!Result.count(const_cast<BasicBlock*>(HBB)))
- return false;
- continue;
- }
-
- // Make sure that entire catch handler is within subgraph. It is sufficient
- // to check that catch return's block is in the list.
- if (const auto *CPI = dyn_cast<CatchPadInst>(I)) {
- for (const auto *U : CPI->users())
- if (const auto *CRI = dyn_cast<CatchReturnInst>(U))
- if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
- return false;
- continue;
- }
-
- // And do similar checks for cleanup handler - the entire handler must be
- // in subgraph which is going to be extracted. For cleanup return should
- // additionally check that the unwind destination is also in the subgraph.
- if (const auto *CPI = dyn_cast<CleanupPadInst>(I)) {
- for (const auto *U : CPI->users())
- if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
- if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
- return false;
- continue;
- }
- if (const auto *CRI = dyn_cast<CleanupReturnInst>(I)) {
- if (auto *UBB = CRI->getUnwindDest())
- if (!Result.count(UBB))
- return false;
- continue;
- }
-
- if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- if (const Function *F = CI->getCalledFunction()) {
- auto IID = F->getIntrinsicID();
- if (IID == Intrinsic::vastart) {
- if (AllowVarArgs)
- continue;
- else
- return false;
- }
-
- // Currently, we miscompile outlined copies of eh_typeid_for. There are
- // proposals for fixing this in llvm.org/PR39545.
- if (IID == Intrinsic::eh_typeid_for)
- return false;
- }
- }
- }
-
- return true;
-}
-
-/// Build a set of blocks to extract if the input blocks are viable.
-static SetVector<BasicBlock *>
-buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
- bool AllowVarArgs, bool AllowAlloca) {
- assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
- SetVector<BasicBlock *> Result;
-
- // Loop over the blocks, adding them to our set-vector, and aborting with an
- // empty set if we encounter invalid blocks.
- for (BasicBlock *BB : BBs) {
- // If this block is dead, don't process it.
- if (DT && !DT->isReachableFromEntry(BB))
- continue;
-
- if (!Result.insert(BB))
- llvm_unreachable("Repeated basic blocks in extraction input");
- }
-
- LLVM_DEBUG(dbgs() << "Region front block: " << Result.front()->getName()
- << '\n');
-
- for (auto *BB : Result) {
- if (!isBlockValidForExtraction(*BB, Result, AllowVarArgs, AllowAlloca))
- return {};
-
- // Make sure that the first block is not a landing pad.
- if (BB == Result.front()) {
- if (BB->isEHPad()) {
- LLVM_DEBUG(dbgs() << "The first block cannot be an unwind block\n");
- return {};
- }
- continue;
- }
-
- // All blocks other than the first must not have predecessors outside of
- // the subgraph which is being extracted.
- for (auto *PBB : predecessors(BB))
- if (!Result.count(PBB)) {
- LLVM_DEBUG(dbgs() << "No blocks in this region may have entries from "
- "outside the region except for the first block!\n"
- << "Problematic source BB: " << BB->getName() << "\n"
- << "Problematic destination BB: " << PBB->getName()
- << "\n");
- return {};
- }
- }
-
- return Result;
-}
-
-CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
- bool AggregateArgs, BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI, AssumptionCache *AC,
- bool AllowVarArgs, bool AllowAlloca,
- std::string Suffix)
- : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), AC(AC), AllowVarArgs(AllowVarArgs),
- Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
- Suffix(Suffix) {}
-
-CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
- BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI, AssumptionCache *AC,
- std::string Suffix)
- : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), AC(AC), AllowVarArgs(false),
- Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
- /* AllowVarArgs */ false,
- /* AllowAlloca */ false)),
- Suffix(Suffix) {}
-
-/// definedInRegion - Return true if the specified value is defined in the
-/// extracted region.
-static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (Blocks.count(I->getParent()))
- return true;
- return false;
-}
-
-/// definedInCaller - Return true if the specified value is defined in the
-/// function being code extracted, but not in the region being extracted.
-/// These values must be passed in as live-ins to the function.
-static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
- if (isa<Argument>(V)) return true;
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (!Blocks.count(I->getParent()))
- return true;
- return false;
-}
-
-static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) {
- BasicBlock *CommonExitBlock = nullptr;
- auto hasNonCommonExitSucc = [&](BasicBlock *Block) {
- for (auto *Succ : successors(Block)) {
- // Internal edges, ok.
- if (Blocks.count(Succ))
- continue;
- if (!CommonExitBlock) {
- CommonExitBlock = Succ;
- continue;
- }
- if (CommonExitBlock != Succ)
- return true;
- }
- return false;
- };
-
- if (any_of(Blocks, hasNonCommonExitSucc))
- return nullptr;
-
- return CommonExitBlock;
-}
-
-CodeExtractorAnalysisCache::CodeExtractorAnalysisCache(Function &F) {
- for (BasicBlock &BB : F) {
- for (Instruction &II : BB.instructionsWithoutDebug())
- if (auto *AI = dyn_cast<AllocaInst>(&II))
- Allocas.push_back(AI);
-
- findSideEffectInfoForBlock(BB);
- }
-}
-
-void CodeExtractorAnalysisCache::findSideEffectInfoForBlock(BasicBlock &BB) {
- for (Instruction &II : BB.instructionsWithoutDebug()) {
- unsigned Opcode = II.getOpcode();
- Value *MemAddr = nullptr;
- switch (Opcode) {
- case Instruction::Store:
- case Instruction::Load: {
- if (Opcode == Instruction::Store) {
- StoreInst *SI = cast<StoreInst>(&II);
- MemAddr = SI->getPointerOperand();
- } else {
- LoadInst *LI = cast<LoadInst>(&II);
- MemAddr = LI->getPointerOperand();
- }
- // A global variable cannot be aliased with locals.
- if (dyn_cast<Constant>(MemAddr))
- break;
- Value *Base = MemAddr->stripInBoundsConstantOffsets();
- if (!isa<AllocaInst>(Base)) {
- SideEffectingBlocks.insert(&BB);
- return;
- }
- BaseMemAddrs[&BB].insert(Base);
- break;
- }
- default: {
- IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II);
- if (IntrInst) {
- if (IntrInst->isLifetimeStartOrEnd())
- break;
- SideEffectingBlocks.insert(&BB);
- return;
- }
- // Treat all the other cases conservatively if it has side effects.
- if (II.mayHaveSideEffects()) {
- SideEffectingBlocks.insert(&BB);
- return;
- }
- }
- }
- }
-}
-
-bool CodeExtractorAnalysisCache::doesBlockContainClobberOfAddr(
- BasicBlock &BB, AllocaInst *Addr) const {
- if (SideEffectingBlocks.count(&BB))
- return true;
- auto It = BaseMemAddrs.find(&BB);
- if (It != BaseMemAddrs.end())
- return It->second.count(Addr);
- return false;
-}
-
-bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers(
- const CodeExtractorAnalysisCache &CEAC, Instruction *Addr) const {
- AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets());
- Function *Func = (*Blocks.begin())->getParent();
- for (BasicBlock &BB : *Func) {
- if (Blocks.count(&BB))
- continue;
- if (CEAC.doesBlockContainClobberOfAddr(BB, AI))
- return false;
- }
- return true;
-}
-
-BasicBlock *
-CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
- BasicBlock *SinglePredFromOutlineRegion = nullptr;
- assert(!Blocks.count(CommonExitBlock) &&
- "Expect a block outside the region!");
- for (auto *Pred : predecessors(CommonExitBlock)) {
- if (!Blocks.count(Pred))
- continue;
- if (!SinglePredFromOutlineRegion) {
- SinglePredFromOutlineRegion = Pred;
- } else if (SinglePredFromOutlineRegion != Pred) {
- SinglePredFromOutlineRegion = nullptr;
- break;
- }
- }
-
- if (SinglePredFromOutlineRegion)
- return SinglePredFromOutlineRegion;
-
-#ifndef NDEBUG
- auto getFirstPHI = [](BasicBlock *BB) {
- BasicBlock::iterator I = BB->begin();
- PHINode *FirstPhi = nullptr;
- while (I != BB->end()) {
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (!Phi)
- break;
- if (!FirstPhi) {
- FirstPhi = Phi;
- break;
- }
- }
- return FirstPhi;
- };
- // If there are any phi nodes, the single pred either exists or has already
- // been created before code extraction.
- assert(!getFirstPHI(CommonExitBlock) && "Phi not expected");
-#endif
-
- BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock(
- CommonExitBlock->getFirstNonPHI()->getIterator());
-
- for (auto PI = pred_begin(CommonExitBlock), PE = pred_end(CommonExitBlock);
- PI != PE;) {
- BasicBlock *Pred = *PI++;
- if (Blocks.count(Pred))
- continue;
- Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock);
- }
- // Now add the old exit block to the outline region.
- Blocks.insert(CommonExitBlock);
- return CommonExitBlock;
-}
-
-// Find the pair of lifetime markers for address 'Addr' that are either
-// defined inside the outline region or can legally be shrinkwrapped into the
-// outline region. If there are no other untracked uses of the address, return
-// the pair of markers if found; otherwise return a pair of nullptr.
-CodeExtractor::LifetimeMarkerInfo
-CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC,
- Instruction *Addr,
- BasicBlock *ExitBlock) const {
- LifetimeMarkerInfo Info;
-
- for (User *U : Addr->users()) {
- IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
- if (IntrInst) {
- // We don't model addresses with multiple start/end markers, but the
- // markers do not need to be in the region.
- if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
- if (Info.LifeStart)
- return {};
- Info.LifeStart = IntrInst;
- continue;
- }
- if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
- if (Info.LifeEnd)
- return {};
- Info.LifeEnd = IntrInst;
- continue;
- }
- // At this point, permit debug uses outside of the region.
- // This is fixed in a later call to fixupDebugInfoPostExtraction().
- if (isa<DbgInfoIntrinsic>(IntrInst))
- continue;
- }
- // Find untracked uses of the address, bail.
- if (!definedInRegion(Blocks, U))
- return {};
- }
-
- if (!Info.LifeStart || !Info.LifeEnd)
- return {};
-
- Info.SinkLifeStart = !definedInRegion(Blocks, Info.LifeStart);
- Info.HoistLifeEnd = !definedInRegion(Blocks, Info.LifeEnd);
- // Do legality check.
- if ((Info.SinkLifeStart || Info.HoistLifeEnd) &&
- !isLegalToShrinkwrapLifetimeMarkers(CEAC, Addr))
- return {};
-
- // Check to see if we have a place to do hoisting, if not, bail.
- if (Info.HoistLifeEnd && !ExitBlock)
- return {};
-
- return Info;
-}
-
-void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
- ValueSet &SinkCands, ValueSet &HoistCands,
- BasicBlock *&ExitBlock) const {
- Function *Func = (*Blocks.begin())->getParent();
- ExitBlock = getCommonExitBlock(Blocks);
-
- auto moveOrIgnoreLifetimeMarkers =
- [&](const LifetimeMarkerInfo &LMI) -> bool {
- if (!LMI.LifeStart)
- return false;
- if (LMI.SinkLifeStart) {
- LLVM_DEBUG(dbgs() << "Sinking lifetime.start: " << *LMI.LifeStart
- << "\n");
- SinkCands.insert(LMI.LifeStart);
- }
- if (LMI.HoistLifeEnd) {
- LLVM_DEBUG(dbgs() << "Hoisting lifetime.end: " << *LMI.LifeEnd << "\n");
- HoistCands.insert(LMI.LifeEnd);
- }
- return true;
- };
-
- // Look up allocas in the original function in CodeExtractorAnalysisCache, as
- // this is much faster than walking all the instructions.
- for (AllocaInst *AI : CEAC.getAllocas()) {
- BasicBlock *BB = AI->getParent();
- if (Blocks.count(BB))
- continue;
-
- // As a prior call to extractCodeRegion() may have shrinkwrapped the alloca,
- // check whether it is actually still in the original function.
- Function *AIFunc = BB->getParent();
- if (AIFunc != Func)
- continue;
-
- LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(CEAC, AI, ExitBlock);
- bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo);
- if (Moved) {
- LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n");
- SinkCands.insert(AI);
- continue;
- }
-
+//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface to tear out a code region, such as an
+// individual loop or a parallel section, into a new function, replacing it with
+// a call to the new function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "code-extractor"
+
+// Provide a command-line option to aggregate function arguments into a struct
+// for functions produced by the code extractor. This is useful when converting
+// extracted functions to pthread-based code, as only one argument (void*) can
+// be passed in to pthread_create().
+static cl::opt<bool>
+AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
+ cl::desc("Aggregate arguments to code-extracted functions"));
+
+/// Test whether a block is valid for extraction.
+static bool isBlockValidForExtraction(const BasicBlock &BB,
+ const SetVector<BasicBlock *> &Result,
+ bool AllowVarArgs, bool AllowAlloca) {
+ // taking the address of a basic block moved to another function is illegal
+ if (BB.hasAddressTaken())
+ return false;
+
+ // don't hoist code that uses another basicblock address, as it's likely to
+ // lead to unexpected behavior, like cross-function jumps
+ SmallPtrSet<User const *, 16> Visited;
+ SmallVector<User const *, 16> ToVisit;
+
+ for (Instruction const &Inst : BB)
+ ToVisit.push_back(&Inst);
+
+ while (!ToVisit.empty()) {
+ User const *Curr = ToVisit.pop_back_val();
+ if (!Visited.insert(Curr).second)
+ continue;
+ if (isa<BlockAddress const>(Curr))
+ return false; // even a reference to self is likely to be not compatible
+
+ if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB)
+ continue;
+
+ for (auto const &U : Curr->operands()) {
+ if (auto *UU = dyn_cast<User>(U))
+ ToVisit.push_back(UU);
+ }
+ }
+
+ // If explicitly requested, allow vastart and alloca. For invoke instructions
+ // verify that extraction is valid.
+ for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+ if (isa<AllocaInst>(I)) {
+ if (!AllowAlloca)
+ return false;
+ continue;
+ }
+
+ if (const auto *II = dyn_cast<InvokeInst>(I)) {
+ // Unwind destination (either a landingpad, catchswitch, or cleanuppad)
+ // must be a part of the subgraph which is being extracted.
+ if (auto *UBB = II->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
+ // All catch handlers of a catchswitch instruction as well as the unwind
+ // destination must be in the subgraph.
+ if (const auto *CSI = dyn_cast<CatchSwitchInst>(I)) {
+ if (auto *UBB = CSI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ for (auto *HBB : CSI->handlers())
+ if (!Result.count(const_cast<BasicBlock*>(HBB)))
+ return false;
+ continue;
+ }
+
+ // Make sure that entire catch handler is within subgraph. It is sufficient
+ // to check that catch return's block is in the list.
+ if (const auto *CPI = dyn_cast<CatchPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CatchReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+
+ // And do similar checks for cleanup handler - the entire handler must be
+ // in subgraph which is going to be extracted. For cleanup return should
+ // additionally check that the unwind destination is also in the subgraph.
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(I)) {
+ if (auto *UBB = CRI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (const Function *F = CI->getCalledFunction()) {
+ auto IID = F->getIntrinsicID();
+ if (IID == Intrinsic::vastart) {
+ if (AllowVarArgs)
+ continue;
+ else
+ return false;
+ }
+
+ // Currently, we miscompile outlined copies of eh_typeid_for. There are
+ // proposals for fixing this in llvm.org/PR39545.
+ if (IID == Intrinsic::eh_typeid_for)
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/// Build a set of blocks to extract if the input blocks are viable.
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
+ bool AllowVarArgs, bool AllowAlloca) {
+ assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
+ SetVector<BasicBlock *> Result;
+
+ // Loop over the blocks, adding them to our set-vector, and aborting with an
+ // empty set if we encounter invalid blocks.
+ for (BasicBlock *BB : BBs) {
+ // If this block is dead, don't process it.
+ if (DT && !DT->isReachableFromEntry(BB))
+ continue;
+
+ if (!Result.insert(BB))
+ llvm_unreachable("Repeated basic blocks in extraction input");
+ }
+
+ LLVM_DEBUG(dbgs() << "Region front block: " << Result.front()->getName()
+ << '\n');
+
+ for (auto *BB : Result) {
+ if (!isBlockValidForExtraction(*BB, Result, AllowVarArgs, AllowAlloca))
+ return {};
+
+ // Make sure that the first block is not a landing pad.
+ if (BB == Result.front()) {
+ if (BB->isEHPad()) {
+ LLVM_DEBUG(dbgs() << "The first block cannot be an unwind block\n");
+ return {};
+ }
+ continue;
+ }
+
+ // All blocks other than the first must not have predecessors outside of
+ // the subgraph which is being extracted.
+ for (auto *PBB : predecessors(BB))
+ if (!Result.count(PBB)) {
+ LLVM_DEBUG(dbgs() << "No blocks in this region may have entries from "
+ "outside the region except for the first block!\n"
+ << "Problematic source BB: " << BB->getName() << "\n"
+ << "Problematic destination BB: " << PBB->getName()
+ << "\n");
+ return {};
+ }
+ }
+
+ return Result;
+}
+
+CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
+ bool AggregateArgs, BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI, AssumptionCache *AC,
+ bool AllowVarArgs, bool AllowAlloca,
+ std::string Suffix)
+ : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), AC(AC), AllowVarArgs(AllowVarArgs),
+ Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
+ Suffix(Suffix) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
+ BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI, AssumptionCache *AC,
+ std::string Suffix)
+ : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), AC(AC), AllowVarArgs(false),
+ Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
+ /* AllowVarArgs */ false,
+ /* AllowAlloca */ false)),
+ Suffix(Suffix) {}
+
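
For orientation, the constructors above are typically driven as follows; this is a sketch only, not part of this commit: F, L, DT and AC are placeholders for analyses the caller already holds, and extractCodeRegion / CodeExtractorAnalysisCache are the public entry points declared in CodeExtractor.h.

// Sketch only: outline the blocks of loop L from function F into a new
// function; returns the outlined function, or nullptr if extraction fails.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;

Function *outlineLoop(Function &F, DominatorTree &DT, Loop &L,
                      AssumptionCache *AC) {
  CodeExtractor CE(DT, L, /*AggregateArgs=*/false, /*BFI=*/nullptr,
                   /*BPI=*/nullptr, AC, /*Suffix=*/"outlined");
  if (!CE.isEligible())
    return nullptr;
  CodeExtractorAnalysisCache CEAC(F);
  return CE.extractCodeRegion(CEAC);
}
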
+/// definedInRegion - Return true if the specified value is defined in the
+/// extracted region.
+static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
+
+/// definedInCaller - Return true if the specified value is defined in the
+/// function being code extracted, but not in the region being extracted.
+/// These values must be passed in as live-ins to the function.
+static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (isa<Argument>(V)) return true;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (!Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
+
+static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) {
+ BasicBlock *CommonExitBlock = nullptr;
+ auto hasNonCommonExitSucc = [&](BasicBlock *Block) {
+ for (auto *Succ : successors(Block)) {
+ // Internal edges, ok.
+ if (Blocks.count(Succ))
+ continue;
+ if (!CommonExitBlock) {
+ CommonExitBlock = Succ;
+ continue;
+ }
+ if (CommonExitBlock != Succ)
+ return true;
+ }
+ return false;
+ };
+
+ if (any_of(Blocks, hasNonCommonExitSucc))
+ return nullptr;
+
+ return CommonExitBlock;
+}
+
+CodeExtractorAnalysisCache::CodeExtractorAnalysisCache(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &II : BB.instructionsWithoutDebug())
+ if (auto *AI = dyn_cast<AllocaInst>(&II))
+ Allocas.push_back(AI);
+
+ findSideEffectInfoForBlock(BB);
+ }
+}
+
+void CodeExtractorAnalysisCache::findSideEffectInfoForBlock(BasicBlock &BB) {
+ for (Instruction &II : BB.instructionsWithoutDebug()) {
+ unsigned Opcode = II.getOpcode();
+ Value *MemAddr = nullptr;
+ switch (Opcode) {
+ case Instruction::Store:
+ case Instruction::Load: {
+ if (Opcode == Instruction::Store) {
+ StoreInst *SI = cast<StoreInst>(&II);
+ MemAddr = SI->getPointerOperand();
+ } else {
+ LoadInst *LI = cast<LoadInst>(&II);
+ MemAddr = LI->getPointerOperand();
+ }
+ // A global variable cannot be aliased with locals.
+ if (dyn_cast<Constant>(MemAddr))
+ break;
+ Value *Base = MemAddr->stripInBoundsConstantOffsets();
+ if (!isa<AllocaInst>(Base)) {
+ SideEffectingBlocks.insert(&BB);
+ return;
+ }
+ BaseMemAddrs[&BB].insert(Base);
+ break;
+ }
+ default: {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II);
+ if (IntrInst) {
+ if (IntrInst->isLifetimeStartOrEnd())
+ break;
+ SideEffectingBlocks.insert(&BB);
+ return;
+ }
+ // Treat all the other cases conservatively if it has side effects.
+ if (II.mayHaveSideEffects()) {
+ SideEffectingBlocks.insert(&BB);
+ return;
+ }
+ }
+ }
+ }
+}
+
+bool CodeExtractorAnalysisCache::doesBlockContainClobberOfAddr(
+ BasicBlock &BB, AllocaInst *Addr) const {
+ if (SideEffectingBlocks.count(&BB))
+ return true;
+ auto It = BaseMemAddrs.find(&BB);
+ if (It != BaseMemAddrs.end())
+ return It->second.count(Addr);
+ return false;
+}
+
+bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers(
+ const CodeExtractorAnalysisCache &CEAC, Instruction *Addr) const {
+ AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets());
+ Function *Func = (*Blocks.begin())->getParent();
+ for (BasicBlock &BB : *Func) {
+ if (Blocks.count(&BB))
+ continue;
+ if (CEAC.doesBlockContainClobberOfAddr(BB, AI))
+ return false;
+ }
+ return true;
+}
+
+BasicBlock *
+CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
+ BasicBlock *SinglePredFromOutlineRegion = nullptr;
+ assert(!Blocks.count(CommonExitBlock) &&
+ "Expect a block outside the region!");
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (!Blocks.count(Pred))
+ continue;
+ if (!SinglePredFromOutlineRegion) {
+ SinglePredFromOutlineRegion = Pred;
+ } else if (SinglePredFromOutlineRegion != Pred) {
+ SinglePredFromOutlineRegion = nullptr;
+ break;
+ }
+ }
+
+ if (SinglePredFromOutlineRegion)
+ return SinglePredFromOutlineRegion;
+
+#ifndef NDEBUG
+ auto getFirstPHI = [](BasicBlock *BB) {
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+ // If there are any phi nodes, the single pred either exists or has already
+ // been created before code extraction.
+ assert(!getFirstPHI(CommonExitBlock) && "Phi not expected");
+#endif
+
+ BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock(
+ CommonExitBlock->getFirstNonPHI()->getIterator());
+
+ for (auto PI = pred_begin(CommonExitBlock), PE = pred_end(CommonExitBlock);
+ PI != PE;) {
+ BasicBlock *Pred = *PI++;
+ if (Blocks.count(Pred))
+ continue;
+ Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock);
+ }
+ // Now add the old exit block to the outline region.
+ Blocks.insert(CommonExitBlock);
+ return CommonExitBlock;
+}
+
+// Find the pair of lifetime markers for address 'Addr' that are either
+// defined inside the outline region or can legally be shrinkwrapped into the
+// outline region. If there are no other untracked uses of the address, return
+// the pair of markers if found; otherwise return a pair of nullptr.
+CodeExtractor::LifetimeMarkerInfo
+CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC,
+ Instruction *Addr,
+ BasicBlock *ExitBlock) const {
+ LifetimeMarkerInfo Info;
+
+ for (User *U : Addr->users()) {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
+ if (IntrInst) {
+ // We don't model addresses with multiple start/end markers, but the
+ // markers do not need to be in the region.
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
+ if (Info.LifeStart)
+ return {};
+ Info.LifeStart = IntrInst;
+ continue;
+ }
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
+ if (Info.LifeEnd)
+ return {};
+ Info.LifeEnd = IntrInst;
+ continue;
+ }
+ // At this point, permit debug uses outside of the region.
+ // This is fixed in a later call to fixupDebugInfoPostExtraction().
+ if (isa<DbgInfoIntrinsic>(IntrInst))
+ continue;
+ }
+ // Find untracked uses of the address, bail.
+ if (!definedInRegion(Blocks, U))
+ return {};
+ }
+
+ if (!Info.LifeStart || !Info.LifeEnd)
+ return {};
+
+ Info.SinkLifeStart = !definedInRegion(Blocks, Info.LifeStart);
+ Info.HoistLifeEnd = !definedInRegion(Blocks, Info.LifeEnd);
+ // Do legality check.
+ if ((Info.SinkLifeStart || Info.HoistLifeEnd) &&
+ !isLegalToShrinkwrapLifetimeMarkers(CEAC, Addr))
+ return {};
+
+ // Check to see if we have a place to do hoisting, if not, bail.
+ if (Info.HoistLifeEnd && !ExitBlock)
+ return {};
+
+ return Info;
+}
+
+void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
+ ValueSet &SinkCands, ValueSet &HoistCands,
+ BasicBlock *&ExitBlock) const {
+ Function *Func = (*Blocks.begin())->getParent();
+ ExitBlock = getCommonExitBlock(Blocks);
+
+ auto moveOrIgnoreLifetimeMarkers =
+ [&](const LifetimeMarkerInfo &LMI) -> bool {
+ if (!LMI.LifeStart)
+ return false;
+ if (LMI.SinkLifeStart) {
+ LLVM_DEBUG(dbgs() << "Sinking lifetime.start: " << *LMI.LifeStart
+ << "\n");
+ SinkCands.insert(LMI.LifeStart);
+ }
+ if (LMI.HoistLifeEnd) {
+ LLVM_DEBUG(dbgs() << "Hoisting lifetime.end: " << *LMI.LifeEnd << "\n");
+ HoistCands.insert(LMI.LifeEnd);
+ }
+ return true;
+ };
+
+ // Look up allocas in the original function in CodeExtractorAnalysisCache, as
+ // this is much faster than walking all the instructions.
+ for (AllocaInst *AI : CEAC.getAllocas()) {
+ BasicBlock *BB = AI->getParent();
+ if (Blocks.count(BB))
+ continue;
+
+ // As a prior call to extractCodeRegion() may have shrinkwrapped the alloca,
+ // check whether it is actually still in the original function.
+ Function *AIFunc = BB->getParent();
+ if (AIFunc != Func)
+ continue;
+
+ LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(CEAC, AI, ExitBlock);
+ bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo);
+ if (Moved) {
+ LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n");
+ SinkCands.insert(AI);
+ continue;
+ }
+
// Find bitcasts in the outlined region that have lifetime marker users
// outside that region. Replace the lifetime marker use with an
// outside region bitcast to avoid unnecessary alloca/reload instructions
@@ -575,1235 +575,1235 @@ void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
I->replaceUsesOfWith(I->getOperand(1), CastI);
}
- // Follow any bitcasts.
- SmallVector<Instruction *, 2> Bitcasts;
- SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo;
- for (User *U : AI->users()) {
- if (U->stripInBoundsConstantOffsets() == AI) {
- Instruction *Bitcast = cast<Instruction>(U);
- LifetimeMarkerInfo LMI = getLifetimeMarkers(CEAC, Bitcast, ExitBlock);
- if (LMI.LifeStart) {
- Bitcasts.push_back(Bitcast);
- BitcastLifetimeInfo.push_back(LMI);
- continue;
- }
- }
-
- // Found unknown use of AI.
- if (!definedInRegion(Blocks, U)) {
- Bitcasts.clear();
- break;
- }
- }
-
- // Either no bitcasts reference the alloca or there are unknown uses.
- if (Bitcasts.empty())
- continue;
-
- LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n");
- SinkCands.insert(AI);
- for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) {
- Instruction *BitcastAddr = Bitcasts[I];
- const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I];
- assert(LMI.LifeStart &&
- "Unsafe to sink bitcast without lifetime markers");
- moveOrIgnoreLifetimeMarkers(LMI);
- if (!definedInRegion(Blocks, BitcastAddr)) {
- LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr
- << "\n");
- SinkCands.insert(BitcastAddr);
- }
- }
- }
-}
-
-bool CodeExtractor::isEligible() const {
- if (Blocks.empty())
- return false;
- BasicBlock *Header = *Blocks.begin();
- Function *F = Header->getParent();
-
- // For functions with varargs, check that varargs handling is only done in the
- // outlined function, i.e. vastart and vaend are only used in outlined blocks.
- if (AllowVarArgs && F->getFunctionType()->isVarArg()) {
- auto containsVarArgIntrinsic = [](const Instruction &I) {
- if (const CallInst *CI = dyn_cast<CallInst>(&I))
- if (const Function *Callee = CI->getCalledFunction())
- return Callee->getIntrinsicID() == Intrinsic::vastart ||
- Callee->getIntrinsicID() == Intrinsic::vaend;
- return false;
- };
-
- for (auto &BB : *F) {
- if (Blocks.count(&BB))
- continue;
- if (llvm::any_of(BB, containsVarArgIntrinsic))
- return false;
- }
- }
- return true;
-}
-
-void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
- const ValueSet &SinkCands) const {
- for (BasicBlock *BB : Blocks) {
- // If a used value is defined outside the region, it's an input. If an
- // instruction is used outside the region, it's an output.
- for (Instruction &II : *BB) {
- for (auto &OI : II.operands()) {
- Value *V = OI;
- if (!SinkCands.count(V) && definedInCaller(Blocks, V))
- Inputs.insert(V);
- }
-
- for (User *U : II.users())
- if (!definedInRegion(Blocks, U)) {
- Outputs.insert(&II);
- break;
- }
- }
- }
-}
-
-/// severSplitPHINodesOfEntry - If a PHI node has multiple inputs from outside
-/// of the region, we need to split the entry block of the region so that the
-/// PHI node is easier to deal with.
-void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) {
- unsigned NumPredsFromRegion = 0;
- unsigned NumPredsOutsideRegion = 0;
-
- if (Header != &Header->getParent()->getEntryBlock()) {
- PHINode *PN = dyn_cast<PHINode>(Header->begin());
- if (!PN) return; // No PHI nodes.
-
- // If the header node contains any PHI nodes, check to see if there is more
- // than one entry from outside the region. If so, we need to sever the
- // header block into two.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (Blocks.count(PN->getIncomingBlock(i)))
- ++NumPredsFromRegion;
- else
- ++NumPredsOutsideRegion;
-
- // If there is one (or fewer) predecessor from outside the region, we don't
- // need to do anything special.
- if (NumPredsOutsideRegion <= 1) return;
- }
-
- // Otherwise, we need to split the header block into two pieces: one
- // containing PHI nodes merging values from outside of the region, and a
- // second that contains all of the code for the block and merges back any
- // incoming values from inside of the region.
- BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHI(), DT);
-
- // We only want to code extract the second block now, and it becomes the new
- // header of the region.
- BasicBlock *OldPred = Header;
- Blocks.remove(OldPred);
- Blocks.insert(NewBB);
- Header = NewBB;
-
- // Okay, now we need to adjust the PHI nodes and any branches from within the
- // region to go to the new header block instead of the old header block.
- if (NumPredsFromRegion) {
- PHINode *PN = cast<PHINode>(OldPred->begin());
- // Loop over all of the predecessors of OldPred that are in the region,
- // changing them to branch to NewBB instead.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (Blocks.count(PN->getIncomingBlock(i))) {
- Instruction *TI = PN->getIncomingBlock(i)->getTerminator();
- TI->replaceUsesOfWith(OldPred, NewBB);
- }
-
- // Okay, everything within the region is now branching to the right block, we
- // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
- BasicBlock::iterator AfterPHIs;
- for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
- PHINode *PN = cast<PHINode>(AfterPHIs);
- // Create a new PHI node in the new region, which has an incoming value
- // from OldPred of PN.
- PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion,
- PN->getName() + ".ce", &NewBB->front());
- PN->replaceAllUsesWith(NewPN);
- NewPN->addIncoming(PN, OldPred);
-
- // Loop over all of the incoming value in PN, moving them to NewPN if they
- // are from the extracted region.
- for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
- if (Blocks.count(PN->getIncomingBlock(i))) {
- NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
- PN->removeIncomingValue(i);
- --i;
- }
- }
- }
- }
-}
-
-/// severSplitPHINodesOfExits - if PHI nodes in exit blocks have inputs from
-/// the outlined region, we split these PHIs in two: one with inputs from the
-/// region and another with the remaining incoming blocks; the first PHIs are
-/// then placed in the outlined region.
-void CodeExtractor::severSplitPHINodesOfExits(
- const SmallPtrSetImpl<BasicBlock *> &Exits) {
- for (BasicBlock *ExitBB : Exits) {
- BasicBlock *NewBB = nullptr;
-
- for (PHINode &PN : ExitBB->phis()) {
- // Find all incoming values from the outlining region.
- SmallVector<unsigned, 2> IncomingVals;
- for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
- if (Blocks.count(PN.getIncomingBlock(i)))
- IncomingVals.push_back(i);
-
- // Do not process PHI if there is one (or fewer) predecessor from region.
- // If PHI has exactly one predecessor from region, only this one incoming
- // will be replaced on codeRepl block, so it should be safe to skip PHI.
- if (IncomingVals.size() <= 1)
- continue;
-
- // Create block for new PHIs and add it to the list of outlined if it
- // wasn't done before.
- if (!NewBB) {
- NewBB = BasicBlock::Create(ExitBB->getContext(),
- ExitBB->getName() + ".split",
- ExitBB->getParent(), ExitBB);
+ // Follow any bitcasts.
+ SmallVector<Instruction *, 2> Bitcasts;
+ SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo;
+ for (User *U : AI->users()) {
+ if (U->stripInBoundsConstantOffsets() == AI) {
+ Instruction *Bitcast = cast<Instruction>(U);
+ LifetimeMarkerInfo LMI = getLifetimeMarkers(CEAC, Bitcast, ExitBlock);
+ if (LMI.LifeStart) {
+ Bitcasts.push_back(Bitcast);
+ BitcastLifetimeInfo.push_back(LMI);
+ continue;
+ }
+ }
+
+ // Found unknown use of AI.
+ if (!definedInRegion(Blocks, U)) {
+ Bitcasts.clear();
+ break;
+ }
+ }
+
+ // Either no bitcasts reference the alloca or there are unknown uses.
+ if (Bitcasts.empty())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n");
+ SinkCands.insert(AI);
+ for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) {
+ Instruction *BitcastAddr = Bitcasts[I];
+ const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I];
+ assert(LMI.LifeStart &&
+ "Unsafe to sink bitcast without lifetime markers");
+ moveOrIgnoreLifetimeMarkers(LMI);
+ if (!definedInRegion(Blocks, BitcastAddr)) {
+ LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr
+ << "\n");
+ SinkCands.insert(BitcastAddr);
+ }
+ }
+ }
+}
+
+bool CodeExtractor::isEligible() const {
+ if (Blocks.empty())
+ return false;
+ BasicBlock *Header = *Blocks.begin();
+ Function *F = Header->getParent();
+
+ // For functions with varargs, check that varargs handling is only done in the
+ // outlined function, i.e. vastart and vaend are only used in outlined blocks.
+ if (AllowVarArgs && F->getFunctionType()->isVarArg()) {
+ auto containsVarArgIntrinsic = [](const Instruction &I) {
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (const Function *Callee = CI->getCalledFunction())
+ return Callee->getIntrinsicID() == Intrinsic::vastart ||
+ Callee->getIntrinsicID() == Intrinsic::vaend;
+ return false;
+ };
+
+ for (auto &BB : *F) {
+ if (Blocks.count(&BB))
+ continue;
+ if (llvm::any_of(BB, containsVarArgIntrinsic))
+ return false;
+ }
+ }
+ return true;
+}
+
+void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
+ const ValueSet &SinkCands) const {
+ for (BasicBlock *BB : Blocks) {
+ // If a used value is defined outside the region, it's an input. If an
+ // instruction is used outside the region, it's an output.
+ for (Instruction &II : *BB) {
+ for (auto &OI : II.operands()) {
+ Value *V = OI;
+ if (!SinkCands.count(V) && definedInCaller(Blocks, V))
+ Inputs.insert(V);
+ }
+
+ for (User *U : II.users())
+ if (!definedInRegion(Blocks, U)) {
+ Outputs.insert(&II);
+ break;
+ }
+ }
+ }
+}
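// Usage sketch (illustrative only, not part of the original source): the
// eligibility check and input/output discovery above are normally driven by a
// CodeExtractor client roughly as follows. This assumes the LLVM 12
// CodeExtractor interface declared in llvm/Transforms/Utils/CodeExtractor.h;
// the helper name outlineRegion is hypothetical.
static Function *outlineRegion(ArrayRef<BasicBlock *> Region, DominatorTree &DT,
                               AssumptionCache *AC) {
  // Flat (non-aggregated) arguments, no profile info.
  CodeExtractor CE(Region, &DT, /*AggregateArgs=*/false, /*BFI=*/nullptr,
                   /*BPI=*/nullptr, AC);
  if (!CE.isEligible())
    return nullptr;
  // The analysis cache is computed once per caller function and reused across
  // extractions from that function.
  CodeExtractorAnalysisCache CEAC(*Region.front()->getParent());
  // extractCodeRegion() runs findInputsOutputs() and the PHI-severing steps
  // internally and returns the newly outlined function.
  return CE.extractCodeRegion(CEAC);
}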
+
+/// severSplitPHINodesOfEntry - If a PHI node has multiple inputs from outside
+/// of the region, we need to split the entry block of the region so that the
+/// PHI node is easier to deal with.
+void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) {
+ unsigned NumPredsFromRegion = 0;
+ unsigned NumPredsOutsideRegion = 0;
+
+ if (Header != &Header->getParent()->getEntryBlock()) {
+ PHINode *PN = dyn_cast<PHINode>(Header->begin());
+ if (!PN) return; // No PHI nodes.
+
+ // If the header node contains any PHI nodes, check to see if there is more
+ // than one entry from outside the region. If so, we need to sever the
+ // header block into two.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (Blocks.count(PN->getIncomingBlock(i)))
+ ++NumPredsFromRegion;
+ else
+ ++NumPredsOutsideRegion;
+
+ // If there is one (or fewer) predecessor from outside the region, we don't
+ // need to do anything special.
+ if (NumPredsOutsideRegion <= 1) return;
+ }
+
+ // Otherwise, we need to split the header block into two pieces: one
+ // containing PHI nodes merging values from outside of the region, and a
+ // second that contains all of the code for the block and merges back any
+ // incoming values from inside of the region.
+ BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHI(), DT);
+
+ // We only want to code extract the second block now, and it becomes the new
+ // header of the region.
+ BasicBlock *OldPred = Header;
+ Blocks.remove(OldPred);
+ Blocks.insert(NewBB);
+ Header = NewBB;
+
+ // Okay, now we need to adjust the PHI nodes and any branches from within the
+ // region to go to the new header block instead of the old header block.
+ if (NumPredsFromRegion) {
+ PHINode *PN = cast<PHINode>(OldPred->begin());
+ // Loop over all of the predecessors of OldPred that are in the region,
+ // changing them to branch to NewBB instead.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (Blocks.count(PN->getIncomingBlock(i))) {
+ Instruction *TI = PN->getIncomingBlock(i)->getTerminator();
+ TI->replaceUsesOfWith(OldPred, NewBB);
+ }
+
+ // Okay, everything within the region is now branching to the right block; we
+ // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
+ BasicBlock::iterator AfterPHIs;
+ for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
+ PHINode *PN = cast<PHINode>(AfterPHIs);
+ // Create a new PHI node in the new region, which has an incoming value
+ // from OldPred of PN.
+ PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion,
+ PN->getName() + ".ce", &NewBB->front());
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, OldPred);
+
+ // Loop over all of the incoming values in PN, moving them to NewPN if they
+ // are from the extracted region.
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (Blocks.count(PN->getIncomingBlock(i))) {
+ NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
+ PN->removeIncomingValue(i);
+ --i;
+ }
+ }
+ }
+ }
+}
+
+/// severSplitPHINodesOfExits - if PHI nodes in exit blocks have inputs from the
+/// outlined region, split each such PHI in two: one with the inputs from the
+/// region and another with the remaining incoming blocks; the first PHI is then
+/// placed inside the outlined region.
+void CodeExtractor::severSplitPHINodesOfExits(
+ const SmallPtrSetImpl<BasicBlock *> &Exits) {
+ for (BasicBlock *ExitBB : Exits) {
+ BasicBlock *NewBB = nullptr;
+
+ for (PHINode &PN : ExitBB->phis()) {
+ // Find all incoming values from the outlining region.
+ SmallVector<unsigned, 2> IncomingVals;
+ for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
+ if (Blocks.count(PN.getIncomingBlock(i)))
+ IncomingVals.push_back(i);
+
+ // Do not process the PHI if it has one (or fewer) predecessors from the
+ // region. If the PHI has exactly one predecessor from the region, only that
+ // incoming block will be replaced by the codeRepl block, so it is safe to
+ // skip the PHI.
+ if (IncomingVals.size() <= 1)
+ continue;
+
+ // Create a block for the new PHIs and add it to the list of outlined blocks
+ // if that hasn't been done already.
+ if (!NewBB) {
+ NewBB = BasicBlock::Create(ExitBB->getContext(),
+ ExitBB->getName() + ".split",
+ ExitBB->getParent(), ExitBB);
SmallVector<BasicBlock *, 4> Preds(predecessors(ExitBB));
- for (BasicBlock *PredBB : Preds)
- if (Blocks.count(PredBB))
- PredBB->getTerminator()->replaceUsesOfWith(ExitBB, NewBB);
- BranchInst::Create(ExitBB, NewBB);
- Blocks.insert(NewBB);
- }
-
- // Split this PHI.
- PHINode *NewPN =
- PHINode::Create(PN.getType(), IncomingVals.size(),
- PN.getName() + ".ce", NewBB->getFirstNonPHI());
- for (unsigned i : IncomingVals)
- NewPN->addIncoming(PN.getIncomingValue(i), PN.getIncomingBlock(i));
- for (unsigned i : reverse(IncomingVals))
- PN.removeIncomingValue(i, false);
- PN.addIncoming(NewPN, NewBB);
- }
- }
-}
-
-void CodeExtractor::splitReturnBlocks() {
- for (BasicBlock *Block : Blocks)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) {
- BasicBlock *New =
- Block->splitBasicBlock(RI->getIterator(), Block->getName() + ".ret");
- if (DT) {
- // Old dominates New. New node dominates all other nodes dominated
- // by Old.
- DomTreeNode *OldNode = DT->getNode(Block);
- SmallVector<DomTreeNode *, 8> Children(OldNode->begin(),
- OldNode->end());
-
- DomTreeNode *NewNode = DT->addNewBlock(New, Block);
-
- for (DomTreeNode *I : Children)
- DT->changeImmediateDominator(I, NewNode);
- }
- }
-}
-
-/// constructFunction - make a function based on inputs and outputs, as follows:
-/// f(in0, ..., inN, out0, ..., outN)
-Function *CodeExtractor::constructFunction(const ValueSet &inputs,
- const ValueSet &outputs,
- BasicBlock *header,
- BasicBlock *newRootNode,
- BasicBlock *newHeader,
- Function *oldFunction,
- Module *M) {
- LLVM_DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
- LLVM_DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
-
- // This function returns unsigned, outputs will go back by reference.
- switch (NumExitBlocks) {
- case 0:
- case 1: RetTy = Type::getVoidTy(header->getContext()); break;
- case 2: RetTy = Type::getInt1Ty(header->getContext()); break;
- default: RetTy = Type::getInt16Ty(header->getContext()); break;
- }
-
- std::vector<Type *> paramTy;
-
- // Add the types of the input values to the function's argument list
- for (Value *value : inputs) {
- LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
- paramTy.push_back(value->getType());
- }
-
- // Add the types of the output values to the function's argument list.
- for (Value *output : outputs) {
- LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
- if (AggregateArgs)
- paramTy.push_back(output->getType());
- else
- paramTy.push_back(PointerType::getUnqual(output->getType()));
- }
-
- LLVM_DEBUG({
- dbgs() << "Function type: " << *RetTy << " f(";
- for (Type *i : paramTy)
- dbgs() << *i << ", ";
- dbgs() << ")\n";
- });
-
- StructType *StructTy = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
- StructTy = StructType::get(M->getContext(), paramTy);
- paramTy.clear();
- paramTy.push_back(PointerType::getUnqual(StructTy));
- }
- FunctionType *funcType =
- FunctionType::get(RetTy, paramTy,
- AllowVarArgs && oldFunction->isVarArg());
-
- std::string SuffixToUse =
- Suffix.empty()
- ? (header->getName().empty() ? "extracted" : header->getName().str())
- : Suffix;
- // Create the new function
- Function *newFunction = Function::Create(
- funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
- oldFunction->getName() + "." + SuffixToUse, M);
- // If the old function is no-throw, so is the new one.
- if (oldFunction->doesNotThrow())
- newFunction->setDoesNotThrow();
-
- // Inherit the uwtable attribute if we need to.
- if (oldFunction->hasUWTable())
- newFunction->setHasUWTable();
-
- // Inherit all of the target dependent attributes and white-listed
- // target independent attributes.
- // (e.g. If the extracted region contains a call to an x86.sse
- // instruction we need to make sure that the extracted region has the
- // "target-features" attribute allowing it to be lowered.
- // FIXME: This should be changed to check to see if a specific
- // attribute can not be inherited.
- for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) {
- if (Attr.isStringAttribute()) {
- if (Attr.getKindAsString() == "thunk")
- continue;
- } else
- switch (Attr.getKindAsEnum()) {
- // Those attributes cannot be propagated safely. Explicitly list them
- // here so we get a warning if new attributes are added. This list also
- // includes non-function attributes.
- case Attribute::Alignment:
- case Attribute::AllocSize:
- case Attribute::ArgMemOnly:
- case Attribute::Builtin:
- case Attribute::ByVal:
- case Attribute::Convergent:
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull:
- case Attribute::InAlloca:
- case Attribute::InReg:
- case Attribute::InaccessibleMemOnly:
- case Attribute::InaccessibleMemOrArgMemOnly:
- case Attribute::JumpTable:
- case Attribute::Naked:
- case Attribute::Nest:
- case Attribute::NoAlias:
- case Attribute::NoBuiltin:
- case Attribute::NoCapture:
- case Attribute::NoMerge:
- case Attribute::NoReturn:
- case Attribute::NoSync:
- case Attribute::NoUndef:
- case Attribute::None:
- case Attribute::NonNull:
- case Attribute::Preallocated:
- case Attribute::ReadNone:
- case Attribute::ReadOnly:
- case Attribute::Returned:
- case Attribute::ReturnsTwice:
- case Attribute::SExt:
- case Attribute::Speculatable:
- case Attribute::StackAlignment:
- case Attribute::StructRet:
- case Attribute::SwiftError:
- case Attribute::SwiftSelf:
- case Attribute::WillReturn:
- case Attribute::WriteOnly:
- case Attribute::ZExt:
- case Attribute::ImmArg:
+ for (BasicBlock *PredBB : Preds)
+ if (Blocks.count(PredBB))
+ PredBB->getTerminator()->replaceUsesOfWith(ExitBB, NewBB);
+ BranchInst::Create(ExitBB, NewBB);
+ Blocks.insert(NewBB);
+ }
+
+ // Split this PHI.
+ PHINode *NewPN =
+ PHINode::Create(PN.getType(), IncomingVals.size(),
+ PN.getName() + ".ce", NewBB->getFirstNonPHI());
+ for (unsigned i : IncomingVals)
+ NewPN->addIncoming(PN.getIncomingValue(i), PN.getIncomingBlock(i));
+ for (unsigned i : reverse(IncomingVals))
+ PN.removeIncomingValue(i, false);
+ PN.addIncoming(NewPN, NewBB);
+ }
+ }
+}
+
+void CodeExtractor::splitReturnBlocks() {
+ for (BasicBlock *Block : Blocks)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) {
+ BasicBlock *New =
+ Block->splitBasicBlock(RI->getIterator(), Block->getName() + ".ret");
+ if (DT) {
+ // Old dominates New. New node dominates all other nodes dominated
+ // by Old.
+ DomTreeNode *OldNode = DT->getNode(Block);
+ SmallVector<DomTreeNode *, 8> Children(OldNode->begin(),
+ OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(New, Block);
+
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
+ }
+ }
+}
+
+/// constructFunction - make a function based on inputs and outputs, as follows:
+/// f(in0, ..., inN, out0, ..., outN)
+Function *CodeExtractor::constructFunction(const ValueSet &inputs,
+ const ValueSet &outputs,
+ BasicBlock *header,
+ BasicBlock *newRootNode,
+ BasicBlock *newHeader,
+ Function *oldFunction,
+ Module *M) {
+ LLVM_DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
+ LLVM_DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
+
+ // This function returns an unsigned value; outputs will go back by reference.
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: RetTy = Type::getVoidTy(header->getContext()); break;
+ case 2: RetTy = Type::getInt1Ty(header->getContext()); break;
+ default: RetTy = Type::getInt16Ty(header->getContext()); break;
+ }
+
+ std::vector<Type *> paramTy;
+
+ // Add the types of the input values to the function's argument list
+ for (Value *value : inputs) {
+ LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
+ paramTy.push_back(value->getType());
+ }
+
+ // Add the types of the output values to the function's argument list.
+ for (Value *output : outputs) {
+ LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
+ if (AggregateArgs)
+ paramTy.push_back(output->getType());
+ else
+ paramTy.push_back(PointerType::getUnqual(output->getType()));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Function type: " << *RetTy << " f(";
+ for (Type *i : paramTy)
+ dbgs() << *i << ", ";
+ dbgs() << ")\n";
+ });
+
+ StructType *StructTy = nullptr;
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ StructTy = StructType::get(M->getContext(), paramTy);
+ paramTy.clear();
+ paramTy.push_back(PointerType::getUnqual(StructTy));
+ }
+ FunctionType *funcType =
+ FunctionType::get(RetTy, paramTy,
+ AllowVarArgs && oldFunction->isVarArg());
+
+ std::string SuffixToUse =
+ Suffix.empty()
+ ? (header->getName().empty() ? "extracted" : header->getName().str())
+ : Suffix;
+ // Create the new function
+ Function *newFunction = Function::Create(
+ funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
+ oldFunction->getName() + "." + SuffixToUse, M);
+ // If the old function is no-throw, so is the new one.
+ if (oldFunction->doesNotThrow())
+ newFunction->setDoesNotThrow();
+
+ // Inherit the uwtable attribute if we need to.
+ if (oldFunction->hasUWTable())
+ newFunction->setHasUWTable();
+
+ // Inherit all of the target-dependent attributes and white-listed
+ // target-independent attributes.
+ // (E.g. if the extracted region contains a call to an x86.sse
+ // instruction, we need to make sure that the extracted region has the
+ // "target-features" attribute allowing it to be lowered.)
+ // FIXME: This should be changed to check whether a specific
+ // attribute cannot be inherited.
+ for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) {
+ if (Attr.isStringAttribute()) {
+ if (Attr.getKindAsString() == "thunk")
+ continue;
+ } else
+ switch (Attr.getKindAsEnum()) {
+ // Those attributes cannot be propagated safely. Explicitly list them
+ // here so we get a warning if new attributes are added. This list also
+ // includes non-function attributes.
+ case Attribute::Alignment:
+ case Attribute::AllocSize:
+ case Attribute::ArgMemOnly:
+ case Attribute::Builtin:
+ case Attribute::ByVal:
+ case Attribute::Convergent:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::InAlloca:
+ case Attribute::InReg:
+ case Attribute::InaccessibleMemOnly:
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ case Attribute::JumpTable:
+ case Attribute::Naked:
+ case Attribute::Nest:
+ case Attribute::NoAlias:
+ case Attribute::NoBuiltin:
+ case Attribute::NoCapture:
+ case Attribute::NoMerge:
+ case Attribute::NoReturn:
+ case Attribute::NoSync:
+ case Attribute::NoUndef:
+ case Attribute::None:
+ case Attribute::NonNull:
+ case Attribute::Preallocated:
+ case Attribute::ReadNone:
+ case Attribute::ReadOnly:
+ case Attribute::Returned:
+ case Attribute::ReturnsTwice:
+ case Attribute::SExt:
+ case Attribute::Speculatable:
+ case Attribute::StackAlignment:
+ case Attribute::StructRet:
+ case Attribute::SwiftError:
+ case Attribute::SwiftSelf:
+ case Attribute::WillReturn:
+ case Attribute::WriteOnly:
+ case Attribute::ZExt:
+ case Attribute::ImmArg:
case Attribute::ByRef:
- case Attribute::EndAttrKinds:
- case Attribute::EmptyKey:
- case Attribute::TombstoneKey:
- continue;
- // Those attributes should be safe to propagate to the extracted function.
- case Attribute::AlwaysInline:
- case Attribute::Cold:
+ case Attribute::EndAttrKinds:
+ case Attribute::EmptyKey:
+ case Attribute::TombstoneKey:
+ continue;
+ // Those attributes should be safe to propagate to the extracted function.
+ case Attribute::AlwaysInline:
+ case Attribute::Cold:
case Attribute::Hot:
- case Attribute::NoRecurse:
- case Attribute::InlineHint:
- case Attribute::MinSize:
+ case Attribute::NoRecurse:
+ case Attribute::InlineHint:
+ case Attribute::MinSize:
case Attribute::NoCallback:
- case Attribute::NoDuplicate:
- case Attribute::NoFree:
- case Attribute::NoImplicitFloat:
- case Attribute::NoInline:
- case Attribute::NonLazyBind:
- case Attribute::NoRedZone:
- case Attribute::NoUnwind:
- case Attribute::NullPointerIsValid:
- case Attribute::OptForFuzzing:
- case Attribute::OptimizeNone:
- case Attribute::OptimizeForSize:
- case Attribute::SafeStack:
- case Attribute::ShadowCallStack:
- case Attribute::SanitizeAddress:
- case Attribute::SanitizeMemory:
- case Attribute::SanitizeThread:
- case Attribute::SanitizeHWAddress:
- case Attribute::SanitizeMemTag:
- case Attribute::SpeculativeLoadHardening:
- case Attribute::StackProtect:
- case Attribute::StackProtectReq:
- case Attribute::StackProtectStrong:
- case Attribute::StrictFP:
- case Attribute::UWTable:
- case Attribute::NoCfCheck:
+ case Attribute::NoDuplicate:
+ case Attribute::NoFree:
+ case Attribute::NoImplicitFloat:
+ case Attribute::NoInline:
+ case Attribute::NonLazyBind:
+ case Attribute::NoRedZone:
+ case Attribute::NoUnwind:
+ case Attribute::NullPointerIsValid:
+ case Attribute::OptForFuzzing:
+ case Attribute::OptimizeNone:
+ case Attribute::OptimizeForSize:
+ case Attribute::SafeStack:
+ case Attribute::ShadowCallStack:
+ case Attribute::SanitizeAddress:
+ case Attribute::SanitizeMemory:
+ case Attribute::SanitizeThread:
+ case Attribute::SanitizeHWAddress:
+ case Attribute::SanitizeMemTag:
+ case Attribute::SpeculativeLoadHardening:
+ case Attribute::StackProtect:
+ case Attribute::StackProtectReq:
+ case Attribute::StackProtectStrong:
+ case Attribute::StrictFP:
+ case Attribute::UWTable:
+ case Attribute::NoCfCheck:
case Attribute::MustProgress:
case Attribute::NoProfile:
- break;
- }
-
- newFunction->addFnAttr(Attr);
- }
- newFunction->getBasicBlockList().push_back(newRootNode);
-
- // Create an iterator to name all of the arguments we inserted.
- Function::arg_iterator AI = newFunction->arg_begin();
-
- // Rewrite all users of the inputs in the extracted region to use the
- // arguments (or appropriate addressing into struct) instead.
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
- Value *RewriteVal;
- if (AggregateArgs) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
- Instruction *TI = newFunction->begin()->getTerminator();
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
- RewriteVal = new LoadInst(StructTy->getElementType(i), GEP,
- "loadgep_" + inputs[i]->getName(), TI);
- } else
- RewriteVal = &*AI++;
-
- std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
- for (User *use : Users)
- if (Instruction *inst = dyn_cast<Instruction>(use))
- if (Blocks.count(inst->getParent()))
- inst->replaceUsesOfWith(inputs[i], RewriteVal);
- }
-
- // Set names for input and output arguments.
- if (!AggregateArgs) {
- AI = newFunction->arg_begin();
- for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
- AI->setName(inputs[i]->getName());
- for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
- AI->setName(outputs[i]->getName()+".out");
- }
-
- // Rewrite branches to basic blocks outside of the loop to new dummy blocks
- // within the new function. This must be done before we lose track of which
- // blocks were originally in the code region.
- std::vector<User *> Users(header->user_begin(), header->user_end());
- for (auto &U : Users)
- // The BasicBlock which contains the branch is not in the region
- // modify the branch target to a new block
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->isTerminator() && I->getFunction() == oldFunction &&
- !Blocks.count(I->getParent()))
- I->replaceUsesOfWith(header, newHeader);
-
- return newFunction;
-}
-
-/// Erase lifetime.start markers which reference inputs to the extraction
-/// region, and insert the referenced memory into \p LifetimesStart.
-///
-/// The extraction region is defined by a set of blocks (\p Blocks), and a set
-/// of allocas which will be moved from the caller function into the extracted
-/// function (\p SunkAllocas).
-static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
- const SetVector<Value *> &SunkAllocas,
- SetVector<Value *> &LifetimesStart) {
- for (BasicBlock *BB : Blocks) {
- for (auto It = BB->begin(), End = BB->end(); It != End;) {
- auto *II = dyn_cast<IntrinsicInst>(&*It);
- ++It;
- if (!II || !II->isLifetimeStartOrEnd())
- continue;
-
- // Get the memory operand of the lifetime marker. If the underlying
- // object is a sunk alloca, or is otherwise defined in the extraction
- // region, the lifetime marker must not be erased.
- Value *Mem = II->getOperand(1)->stripInBoundsOffsets();
- if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem))
- continue;
-
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- LifetimesStart.insert(Mem);
- II->eraseFromParent();
- }
- }
-}
-
-/// Insert lifetime start/end markers surrounding the call to the new function
-/// for objects defined in the caller.
-static void insertLifetimeMarkersSurroundingCall(
- Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd,
- CallInst *TheCall) {
- LLVMContext &Ctx = M->getContext();
- auto Int8PtrTy = Type::getInt8PtrTy(Ctx);
- auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1);
- Instruction *Term = TheCall->getParent()->getTerminator();
-
- // The memory argument to a lifetime marker must be a i8*. Cache any bitcasts
- // needed to satisfy this requirement so they may be reused.
- DenseMap<Value *, Value *> Bitcasts;
-
- // Emit lifetime markers for the pointers given in \p Objects. Insert the
- // markers before the call if \p InsertBefore, and after the call otherwise.
- auto insertMarkers = [&](Function *MarkerFunc, ArrayRef<Value *> Objects,
- bool InsertBefore) {
- for (Value *Mem : Objects) {
- assert((!isa<Instruction>(Mem) || cast<Instruction>(Mem)->getFunction() ==
- TheCall->getFunction()) &&
- "Input memory not defined in original function");
- Value *&MemAsI8Ptr = Bitcasts[Mem];
- if (!MemAsI8Ptr) {
- if (Mem->getType() == Int8PtrTy)
- MemAsI8Ptr = Mem;
- else
- MemAsI8Ptr =
- CastInst::CreatePointerCast(Mem, Int8PtrTy, "lt.cast", TheCall);
- }
-
- auto Marker = CallInst::Create(MarkerFunc, {NegativeOne, MemAsI8Ptr});
- if (InsertBefore)
- Marker->insertBefore(TheCall);
- else
- Marker->insertBefore(Term);
- }
- };
-
- if (!LifetimesStart.empty()) {
- auto StartFn = llvm::Intrinsic::getDeclaration(
- M, llvm::Intrinsic::lifetime_start, Int8PtrTy);
- insertMarkers(StartFn, LifetimesStart, /*InsertBefore=*/true);
- }
-
- if (!LifetimesEnd.empty()) {
- auto EndFn = llvm::Intrinsic::getDeclaration(
- M, llvm::Intrinsic::lifetime_end, Int8PtrTy);
- insertMarkers(EndFn, LifetimesEnd, /*InsertBefore=*/false);
- }
-}
-
-/// emitCallAndSwitchStatement - This method sets up the caller side by adding
-/// the call instruction, splitting any PHI nodes in the header block as
-/// necessary.
-CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
- BasicBlock *codeReplacer,
- ValueSet &inputs,
- ValueSet &outputs) {
- // Emit a call to the new function, passing in: *pointer to struct (if
- // aggregating parameters), or plan inputs and allocated memory for outputs
- std::vector<Value *> params, StructValues, ReloadOutputs, Reloads;
-
- Module *M = newFunction->getParent();
- LLVMContext &Context = M->getContext();
- const DataLayout &DL = M->getDataLayout();
- CallInst *call = nullptr;
-
- // Add inputs as params, or to be filled into the struct
- unsigned ArgNo = 0;
- SmallVector<unsigned, 1> SwiftErrorArgs;
- for (Value *input : inputs) {
- if (AggregateArgs)
- StructValues.push_back(input);
- else {
- params.push_back(input);
- if (input->isSwiftError())
- SwiftErrorArgs.push_back(ArgNo);
- }
- ++ArgNo;
- }
-
- // Create allocas for the outputs
- for (Value *output : outputs) {
- if (AggregateArgs) {
- StructValues.push_back(output);
- } else {
- AllocaInst *alloca =
- new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
- nullptr, output->getName() + ".loc",
- &codeReplacer->getParent()->front().front());
- ReloadOutputs.push_back(alloca);
- params.push_back(alloca);
- }
- }
-
- StructType *StructArgTy = nullptr;
- AllocaInst *Struct = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
- std::vector<Type *> ArgTypes;
- for (ValueSet::iterator v = StructValues.begin(),
- ve = StructValues.end(); v != ve; ++v)
- ArgTypes.push_back((*v)->getType());
-
- // Allocate a struct at the beginning of this function
- StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
- Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
- "structArg",
- &codeReplacer->getParent()->front().front());
- params.push_back(Struct);
-
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
- codeReplacer->getInstList().push_back(GEP);
- new StoreInst(StructValues[i], GEP, codeReplacer);
- }
- }
-
- // Emit the call to the function
- call = CallInst::Create(newFunction, params,
- NumExitBlocks > 1 ? "targetBlock" : "");
- // Add debug location to the new call, if the original function has debug
- // info. In that case, the terminator of the entry block of the extracted
- // function contains the first debug location of the extracted function,
- // set in extractCodeRegion.
- if (codeReplacer->getParent()->getSubprogram()) {
- if (auto DL = newFunction->getEntryBlock().getTerminator()->getDebugLoc())
- call->setDebugLoc(DL);
- }
- codeReplacer->getInstList().push_back(call);
-
- // Set swifterror parameter attributes.
- for (unsigned SwiftErrArgNo : SwiftErrorArgs) {
- call->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
- newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
- }
-
- Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
- unsigned FirstOut = inputs.size();
- if (!AggregateArgs)
- std::advance(OutputArgBegin, inputs.size());
-
- // Reload the outputs passed in by reference.
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
- Value *Output = nullptr;
- if (AggregateArgs) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
- codeReplacer->getInstList().push_back(GEP);
- Output = GEP;
- } else {
- Output = ReloadOutputs[i];
- }
- LoadInst *load = new LoadInst(outputs[i]->getType(), Output,
- outputs[i]->getName() + ".reload",
- codeReplacer);
- Reloads.push_back(load);
- std::vector<User *> Users(outputs[i]->user_begin(), outputs[i]->user_end());
- for (unsigned u = 0, e = Users.size(); u != e; ++u) {
- Instruction *inst = cast<Instruction>(Users[u]);
- if (!Blocks.count(inst->getParent()))
- inst->replaceUsesOfWith(outputs[i], load);
- }
- }
-
- // Now we can emit a switch statement using the call as a value.
- SwitchInst *TheSwitch =
- SwitchInst::Create(Constant::getNullValue(Type::getInt16Ty(Context)),
- codeReplacer, 0, codeReplacer);
-
- // Since there may be multiple exits from the original region, make the new
- // function return an unsigned, switch on that number. This loop iterates
- // over all of the blocks in the extracted region, updating any terminator
- // instructions in the to-be-extracted region that branch to blocks that are
- // not in the region to be extracted.
- std::map<BasicBlock *, BasicBlock *> ExitBlockMap;
-
- unsigned switchVal = 0;
- for (BasicBlock *Block : Blocks) {
- Instruction *TI = Block->getTerminator();
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (!Blocks.count(TI->getSuccessor(i))) {
- BasicBlock *OldTarget = TI->getSuccessor(i);
- // add a new basic block which returns the appropriate value
- BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
- if (!NewTarget) {
- // If we don't already have an exit stub for this non-extracted
- // destination, create one now!
- NewTarget = BasicBlock::Create(Context,
- OldTarget->getName() + ".exitStub",
- newFunction);
- unsigned SuccNum = switchVal++;
-
- Value *brVal = nullptr;
- switch (NumExitBlocks) {
- case 0:
- case 1: break; // No value needed.
- case 2: // Conditional branch, return a bool
- brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
- break;
- default:
- brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
- break;
- }
-
- ReturnInst::Create(Context, brVal, NewTarget);
-
- // Update the switch instruction.
- TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
- SuccNum),
- OldTarget);
- }
-
- // rewrite the original branch instruction with this new target
- TI->setSuccessor(i, NewTarget);
- }
- }
-
- // Store the arguments right after the definition of output value.
- // This should be proceeded after creating exit stubs to be ensure that invoke
- // result restore will be placed in the outlined function.
- Function::arg_iterator OAI = OutputArgBegin;
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
- auto *OutI = dyn_cast<Instruction>(outputs[i]);
- if (!OutI)
- continue;
-
- // Find proper insertion point.
- BasicBlock::iterator InsertPt;
- // In case OutI is an invoke, we insert the store at the beginning in the
- // 'normal destination' BB. Otherwise we insert the store right after OutI.
- if (auto *InvokeI = dyn_cast<InvokeInst>(OutI))
- InsertPt = InvokeI->getNormalDest()->getFirstInsertionPt();
- else if (auto *Phi = dyn_cast<PHINode>(OutI))
- InsertPt = Phi->getParent()->getFirstInsertionPt();
- else
- InsertPt = std::next(OutI->getIterator());
-
- Instruction *InsertBefore = &*InsertPt;
- assert((InsertBefore->getFunction() == newFunction ||
- Blocks.count(InsertBefore->getParent())) &&
- "InsertPt should be in new function");
- assert(OAI != newFunction->arg_end() &&
- "Number of output arguments should match "
- "the amount of defined values");
- if (AggregateArgs) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(),
- InsertBefore);
- new StoreInst(outputs[i], GEP, InsertBefore);
- // Since there should be only one struct argument aggregating
- // all the output values, we shouldn't increment OAI, which always
- // points to the struct argument, in this case.
- } else {
- new StoreInst(outputs[i], &*OAI, InsertBefore);
- ++OAI;
- }
- }
-
- // Now that we've done the deed, simplify the switch instruction.
- Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
- switch (NumExitBlocks) {
- case 0:
- // There are no successors (the block containing the switch itself), which
- // means that previously this was the last part of the function, and hence
- // this should be rewritten as a `ret'
-
- // Check if the function should return a value
- if (OldFnRetTy->isVoidTy()) {
- ReturnInst::Create(Context, nullptr, TheSwitch); // Return void
- } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
- // return what we have
- ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
- } else {
- // Otherwise we must have code extracted an unwind or something, just
- // return whatever we want.
- ReturnInst::Create(Context,
- Constant::getNullValue(OldFnRetTy), TheSwitch);
- }
-
- TheSwitch->eraseFromParent();
- break;
- case 1:
- // Only a single destination, change the switch into an unconditional
- // branch.
- BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
- TheSwitch->eraseFromParent();
- break;
- case 2:
- BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
- call, TheSwitch);
- TheSwitch->eraseFromParent();
- break;
- default:
- // Otherwise, make the default destination of the switch instruction be one
- // of the other successors.
- TheSwitch->setCondition(call);
- TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
- // Remove redundant case
- TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
- break;
- }
-
- // Insert lifetime markers around the reloads of any output values. The
- // allocas output values are stored in are only in-use in the codeRepl block.
- insertLifetimeMarkersSurroundingCall(M, ReloadOutputs, ReloadOutputs, call);
-
- return call;
-}
-
-void CodeExtractor::moveCodeToFunction(Function *newFunction) {
- Function *oldFunc = (*Blocks.begin())->getParent();
- Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
- Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
-
- for (BasicBlock *Block : Blocks) {
- // Delete the basic block from the old function, and the list of blocks
- oldBlocks.remove(Block);
-
- // Insert this basic block into the new function
- newBlocks.push_back(Block);
- }
-}
-
-void CodeExtractor::calculateNewCallTerminatorWeights(
- BasicBlock *CodeReplacer,
- DenseMap<BasicBlock *, BlockFrequency> &ExitWeights,
- BranchProbabilityInfo *BPI) {
- using Distribution = BlockFrequencyInfoImplBase::Distribution;
- using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
-
- // Update the branch weights for the exit block.
- Instruction *TI = CodeReplacer->getTerminator();
- SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
-
- // Block Frequency distribution with dummy node.
- Distribution BranchDist;
-
- SmallVector<BranchProbability, 4> EdgeProbabilities(
- TI->getNumSuccessors(), BranchProbability::getUnknown());
-
- // Add each of the frequencies of the successors.
- for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
- BlockNode ExitNode(i);
- uint64_t ExitFreq = ExitWeights[TI->getSuccessor(i)].getFrequency();
- if (ExitFreq != 0)
- BranchDist.addExit(ExitNode, ExitFreq);
- else
- EdgeProbabilities[i] = BranchProbability::getZero();
- }
-
- // Check for no total weight.
- if (BranchDist.Total == 0) {
- BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
- return;
- }
-
- // Normalize the distribution so that they can fit in unsigned.
- BranchDist.normalize();
-
- // Create normalized branch weights and set the metadata.
- for (unsigned I = 0, E = BranchDist.Weights.size(); I < E; ++I) {
- const auto &Weight = BranchDist.Weights[I];
-
- // Get the weight and update the current BFI.
- BranchWeights[Weight.TargetNode.Index] = Weight.Amount;
- BranchProbability BP(Weight.Amount, BranchDist.Total);
- EdgeProbabilities[Weight.TargetNode.Index] = BP;
- }
- BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
- TI->setMetadata(
- LLVMContext::MD_prof,
- MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
-}
-
-/// Erase debug info intrinsics which refer to values in \p F but aren't in
-/// \p F.
-static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
- for (Instruction &I : instructions(F)) {
- SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
- findDbgUsers(DbgUsers, &I);
- for (DbgVariableIntrinsic *DVI : DbgUsers)
- if (DVI->getFunction() != &F)
- DVI->eraseFromParent();
- }
-}
-
-/// Fix up the debug info in the old and new functions by pointing line
-/// locations and debug intrinsics to the new subprogram scope, and by deleting
-/// intrinsics which point to values outside of the new function.
-static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
- CallInst &TheCall) {
- DISubprogram *OldSP = OldFunc.getSubprogram();
- LLVMContext &Ctx = OldFunc.getContext();
-
- if (!OldSP) {
- // Erase any debug info the new function contains.
- stripDebugInfo(NewFunc);
- // Make sure the old function doesn't contain any non-local metadata refs.
- eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
- return;
- }
-
- // Create a subprogram for the new function. Leave out a description of the
- // function arguments, as the parameters don't correspond to anything at the
- // source level.
- assert(OldSP->getUnit() && "Missing compile unit for subprogram");
+ break;
+ }
+
+ newFunction->addFnAttr(Attr);
+ }
+ newFunction->getBasicBlockList().push_back(newRootNode);
+
+ // Create an iterator to name all of the arguments we inserted.
+ Function::arg_iterator AI = newFunction->arg_begin();
+
+ // Rewrite all users of the inputs in the extracted region to use the
+ // arguments (or appropriate addressing into struct) instead.
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *RewriteVal;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
+ Instruction *TI = newFunction->begin()->getTerminator();
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
+ RewriteVal = new LoadInst(StructTy->getElementType(i), GEP,
+ "loadgep_" + inputs[i]->getName(), TI);
+ } else
+ RewriteVal = &*AI++;
+
+ std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
+ for (User *use : Users)
+ if (Instruction *inst = dyn_cast<Instruction>(use))
+ if (Blocks.count(inst->getParent()))
+ inst->replaceUsesOfWith(inputs[i], RewriteVal);
+ }
+
+ // Set names for input and output arguments.
+ if (!AggregateArgs) {
+ AI = newFunction->arg_begin();
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
+ AI->setName(inputs[i]->getName());
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
+ AI->setName(outputs[i]->getName()+".out");
+ }
+
+ // Rewrite branches to basic blocks outside of the loop to new dummy blocks
+ // within the new function. This must be done before we lose track of which
+ // blocks were originally in the code region.
+ std::vector<User *> Users(header->user_begin(), header->user_end());
+ for (auto &U : Users)
+ // If the BasicBlock which contains the branch is not in the region,
+ // modify the branch target to the new block.
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->isTerminator() && I->getFunction() == oldFunction &&
+ !Blocks.count(I->getParent()))
+ I->replaceUsesOfWith(header, newHeader);
+
+ return newFunction;
+}
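// A minimal sketch (not part of the original source, assuming the usual
// llvm/IR/DerivedTypes.h declarations) of the two signature shapes that
// constructFunction builds for one i32 input and one i32 output with a single
// exit block; the helper name buildToySignatures is hypothetical.
static void buildToySignatures(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Type *RetTy = Type::getVoidTy(Ctx); // NumExitBlocks <= 1 -> void return

  // Flat form (AggregateArgs == false): void f(i32 %in, i32* %out.out)
  FunctionType *FlatTy = FunctionType::get(
      RetTy, {I32, PointerType::getUnqual(I32)}, /*isVarArg=*/false);

  // Aggregate form (AggregateArgs == true): void f({ i32, i32 }* %structArg)
  StructType *STy = StructType::get(Ctx, {I32, I32});
  FunctionType *AggrTy = FunctionType::get(
      RetTy, {PointerType::getUnqual(STy)}, /*isVarArg=*/false);

  (void)FlatTy;
  (void)AggrTy;
}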
+
+/// Erase lifetime.start markers which reference inputs to the extraction
+/// region, and insert the referenced memory into \p LifetimesStart.
+///
+/// The extraction region is defined by a set of blocks (\p Blocks), and a set
+/// of allocas which will be moved from the caller function into the extracted
+/// function (\p SunkAllocas).
+static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
+ const SetVector<Value *> &SunkAllocas,
+ SetVector<Value *> &LifetimesStart) {
+ for (BasicBlock *BB : Blocks) {
+ for (auto It = BB->begin(), End = BB->end(); It != End;) {
+ auto *II = dyn_cast<IntrinsicInst>(&*It);
+ ++It;
+ if (!II || !II->isLifetimeStartOrEnd())
+ continue;
+
+ // Get the memory operand of the lifetime marker. If the underlying
+ // object is a sunk alloca, or is otherwise defined in the extraction
+ // region, the lifetime marker must not be erased.
+ Value *Mem = II->getOperand(1)->stripInBoundsOffsets();
+ if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem))
+ continue;
+
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ LifetimesStart.insert(Mem);
+ II->eraseFromParent();
+ }
+ }
+}
+
+/// Insert lifetime start/end markers surrounding the call to the new function
+/// for objects defined in the caller.
+static void insertLifetimeMarkersSurroundingCall(
+ Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd,
+ CallInst *TheCall) {
+ LLVMContext &Ctx = M->getContext();
+ auto Int8PtrTy = Type::getInt8PtrTy(Ctx);
+ auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1);
+ Instruction *Term = TheCall->getParent()->getTerminator();
+
+ // The memory argument to a lifetime marker must be an i8*. Cache any bitcasts
+ // needed to satisfy this requirement so they may be reused.
+ DenseMap<Value *, Value *> Bitcasts;
+
+ // Emit lifetime markers for the pointers given in \p Objects. Insert the
+ // markers before the call if \p InsertBefore, and after the call otherwise.
+ auto insertMarkers = [&](Function *MarkerFunc, ArrayRef<Value *> Objects,
+ bool InsertBefore) {
+ for (Value *Mem : Objects) {
+ assert((!isa<Instruction>(Mem) || cast<Instruction>(Mem)->getFunction() ==
+ TheCall->getFunction()) &&
+ "Input memory not defined in original function");
+ Value *&MemAsI8Ptr = Bitcasts[Mem];
+ if (!MemAsI8Ptr) {
+ if (Mem->getType() == Int8PtrTy)
+ MemAsI8Ptr = Mem;
+ else
+ MemAsI8Ptr =
+ CastInst::CreatePointerCast(Mem, Int8PtrTy, "lt.cast", TheCall);
+ }
+
+ auto Marker = CallInst::Create(MarkerFunc, {NegativeOne, MemAsI8Ptr});
+ if (InsertBefore)
+ Marker->insertBefore(TheCall);
+ else
+ Marker->insertBefore(Term);
+ }
+ };
+
+ if (!LifetimesStart.empty()) {
+ auto StartFn = llvm::Intrinsic::getDeclaration(
+ M, llvm::Intrinsic::lifetime_start, Int8PtrTy);
+ insertMarkers(StartFn, LifetimesStart, /*InsertBefore=*/true);
+ }
+
+ if (!LifetimesEnd.empty()) {
+ auto EndFn = llvm::Intrinsic::getDeclaration(
+ M, llvm::Intrinsic::lifetime_end, Int8PtrTy);
+ insertMarkers(EndFn, LifetimesEnd, /*InsertBefore=*/false);
+ }
+}
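// Illustrative sketch (not part of the original source): the same bracketing
// effect for a single alloca can be expressed with IRBuilder, assuming the
// standard llvm/IR/IRBuilder.h interface; the helper name
// bracketCallWithLifetime is hypothetical.
static void bracketCallWithLifetime(AllocaInst *AI, CallInst *TheCall) {
  IRBuilder<> B(TheCall);
  // lifetime.start right before the call; passing no size emits -1, matching
  // the NegativeOne constant used above.
  B.CreateLifetimeStart(AI);
  // lifetime.end after the call, just before the block terminator.
  B.SetInsertPoint(TheCall->getParent()->getTerminator());
  B.CreateLifetimeEnd(AI);
}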
+
+/// emitCallAndSwitchStatement - This method sets up the caller side by adding
+/// the call instruction, splitting any PHI nodes in the header block as
+/// necessary.
+CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
+ BasicBlock *codeReplacer,
+ ValueSet &inputs,
+ ValueSet &outputs) {
+ // Emit a call to the new function, passing in: a pointer to the struct (if
+ // aggregating parameters), or plain inputs and allocated memory for outputs.
+ std::vector<Value *> params, StructValues, ReloadOutputs, Reloads;
+
+ Module *M = newFunction->getParent();
+ LLVMContext &Context = M->getContext();
+ const DataLayout &DL = M->getDataLayout();
+ CallInst *call = nullptr;
+
+ // Add inputs as params, or to be filled into the struct
+ unsigned ArgNo = 0;
+ SmallVector<unsigned, 1> SwiftErrorArgs;
+ for (Value *input : inputs) {
+ if (AggregateArgs)
+ StructValues.push_back(input);
+ else {
+ params.push_back(input);
+ if (input->isSwiftError())
+ SwiftErrorArgs.push_back(ArgNo);
+ }
+ ++ArgNo;
+ }
+
+ // Create allocas for the outputs
+ for (Value *output : outputs) {
+ if (AggregateArgs) {
+ StructValues.push_back(output);
+ } else {
+ AllocaInst *alloca =
+ new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
+ nullptr, output->getName() + ".loc",
+ &codeReplacer->getParent()->front().front());
+ ReloadOutputs.push_back(alloca);
+ params.push_back(alloca);
+ }
+ }
+
+ StructType *StructArgTy = nullptr;
+ AllocaInst *Struct = nullptr;
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ std::vector<Type *> ArgTypes;
+ for (ValueSet::iterator v = StructValues.begin(),
+ ve = StructValues.end(); v != ve; ++v)
+ ArgTypes.push_back((*v)->getType());
+
+ // Allocate a struct at the beginning of this function
+ StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
+ Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
+ "structArg",
+ &codeReplacer->getParent()->front().front());
+ params.push_back(Struct);
+
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ new StoreInst(StructValues[i], GEP, codeReplacer);
+ }
+ }
+
+ // Emit the call to the function
+ call = CallInst::Create(newFunction, params,
+ NumExitBlocks > 1 ? "targetBlock" : "");
+ // Add debug location to the new call, if the original function has debug
+ // info. In that case, the terminator of the entry block of the extracted
+ // function contains the first debug location of the extracted function,
+ // set in extractCodeRegion.
+ if (codeReplacer->getParent()->getSubprogram()) {
+ if (auto DL = newFunction->getEntryBlock().getTerminator()->getDebugLoc())
+ call->setDebugLoc(DL);
+ }
+ codeReplacer->getInstList().push_back(call);
+
+ // Set swifterror parameter attributes.
+ for (unsigned SwiftErrArgNo : SwiftErrorArgs) {
+ call->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
+ newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
+ }
+
+ Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
+ unsigned FirstOut = inputs.size();
+ if (!AggregateArgs)
+ std::advance(OutputArgBegin, inputs.size());
+
+ // Reload the outputs passed in by reference.
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ Value *Output = nullptr;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ Output = GEP;
+ } else {
+ Output = ReloadOutputs[i];
+ }
+ LoadInst *load = new LoadInst(outputs[i]->getType(), Output,
+ outputs[i]->getName() + ".reload",
+ codeReplacer);
+ Reloads.push_back(load);
+ std::vector<User *> Users(outputs[i]->user_begin(), outputs[i]->user_end());
+ for (unsigned u = 0, e = Users.size(); u != e; ++u) {
+ Instruction *inst = cast<Instruction>(Users[u]);
+ if (!Blocks.count(inst->getParent()))
+ inst->replaceUsesOfWith(outputs[i], load);
+ }
+ }
+
+ // Now we can emit a switch statement using the call as a value.
+ SwitchInst *TheSwitch =
+ SwitchInst::Create(Constant::getNullValue(Type::getInt16Ty(Context)),
+ codeReplacer, 0, codeReplacer);
+
+ // Since there may be multiple exits from the original region, make the new
+ // function return an unsigned and switch on that number. This loop iterates
+ // over all of the blocks in the extracted region, updating any terminator
+ // instructions in the to-be-extracted region that branch to blocks that are
+ // not in the region to be extracted.
+ std::map<BasicBlock *, BasicBlock *> ExitBlockMap;
+
+ unsigned switchVal = 0;
+ for (BasicBlock *Block : Blocks) {
+ Instruction *TI = Block->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (!Blocks.count(TI->getSuccessor(i))) {
+ BasicBlock *OldTarget = TI->getSuccessor(i);
+ // add a new basic block which returns the appropriate value
+ BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
+ if (!NewTarget) {
+ // If we don't already have an exit stub for this non-extracted
+ // destination, create one now!
+ NewTarget = BasicBlock::Create(Context,
+ OldTarget->getName() + ".exitStub",
+ newFunction);
+ unsigned SuccNum = switchVal++;
+
+ Value *brVal = nullptr;
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: break; // No value needed.
+ case 2: // Conditional branch, return a bool
+ brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
+ break;
+ default:
+ brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
+ break;
+ }
+
+ ReturnInst::Create(Context, brVal, NewTarget);
+
+ // Update the switch instruction.
+ TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
+ SuccNum),
+ OldTarget);
+ }
+
+ // rewrite the original branch instruction with this new target
+ TI->setSuccessor(i, NewTarget);
+ }
+ }
+
+ // Store the arguments right after the definition of the output value.
+ // This must happen after the exit stubs are created to ensure that the
+ // invoke result restore is placed inside the outlined function.
+ Function::arg_iterator OAI = OutputArgBegin;
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ auto *OutI = dyn_cast<Instruction>(outputs[i]);
+ if (!OutI)
+ continue;
+
+ // Find proper insertion point.
+ BasicBlock::iterator InsertPt;
+ // In case OutI is an invoke, we insert the store at the beginning in the
+ // 'normal destination' BB. Otherwise we insert the store right after OutI.
+ if (auto *InvokeI = dyn_cast<InvokeInst>(OutI))
+ InsertPt = InvokeI->getNormalDest()->getFirstInsertionPt();
+ else if (auto *Phi = dyn_cast<PHINode>(OutI))
+ InsertPt = Phi->getParent()->getFirstInsertionPt();
+ else
+ InsertPt = std::next(OutI->getIterator());
+
+ Instruction *InsertBefore = &*InsertPt;
+ assert((InsertBefore->getFunction() == newFunction ||
+ Blocks.count(InsertBefore->getParent())) &&
+ "InsertPt should be in new function");
+ assert(OAI != newFunction->arg_end() &&
+ "Number of output arguments should match "
+ "the amount of defined values");
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(),
+ InsertBefore);
+ new StoreInst(outputs[i], GEP, InsertBefore);
+ // Since there should be only one struct argument aggregating
+ // all the output values, we shouldn't increment OAI, which always
+ // points to the struct argument, in this case.
+ } else {
+ new StoreInst(outputs[i], &*OAI, InsertBefore);
+ ++OAI;
+ }
+ }
+
+ // Now that we've done the deed, simplify the switch instruction.
+ Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
+ switch (NumExitBlocks) {
+ case 0:
+ // There are no successors (other than the block containing the switch
+ // itself), which means that previously this was the last part of the
+ // function, and hence this should be rewritten as a `ret'.
+
+ // Check if the function should return a value
+ if (OldFnRetTy->isVoidTy()) {
+ ReturnInst::Create(Context, nullptr, TheSwitch); // Return void
+ } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
+ // return what we have
+ ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
+ } else {
+ // Otherwise we must have code extracted an unwind or something; just
+ // return whatever we want.
+ ReturnInst::Create(Context,
+ Constant::getNullValue(OldFnRetTy), TheSwitch);
+ }
+
+ TheSwitch->eraseFromParent();
+ break;
+ case 1:
+ // Only a single destination, change the switch into an unconditional
+ // branch.
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ case 2:
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
+ call, TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ default:
+ // Otherwise, make the default destination of the switch instruction be one
+ // of the other successors.
+ TheSwitch->setCondition(call);
+ TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
+ // Remove redundant case
+ TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
+ break;
+ }
+
+ // Insert lifetime markers around the reloads of any output values. The
+ // allocas that the output values are stored in are only in use in the
+ // codeRepl block.
+ insertLifetimeMarkersSurroundingCall(M, ReloadOutputs, ReloadOutputs, call);
+
+ return call;
+}
+
+void CodeExtractor::moveCodeToFunction(Function *newFunction) {
+ Function *oldFunc = (*Blocks.begin())->getParent();
+ Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
+ Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
+
+ for (BasicBlock *Block : Blocks) {
+ // Delete the basic block from the old function, and the list of blocks
+ oldBlocks.remove(Block);
+
+ // Insert this basic block into the new function
+ newBlocks.push_back(Block);
+ }
+}
+
+void CodeExtractor::calculateNewCallTerminatorWeights(
+ BasicBlock *CodeReplacer,
+ DenseMap<BasicBlock *, BlockFrequency> &ExitWeights,
+ BranchProbabilityInfo *BPI) {
+ using Distribution = BlockFrequencyInfoImplBase::Distribution;
+ using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
+
+ // Update the branch weights for the exit block.
+ Instruction *TI = CodeReplacer->getTerminator();
+ SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
+
+ // Block Frequency distribution with dummy node.
+ Distribution BranchDist;
+
+ SmallVector<BranchProbability, 4> EdgeProbabilities(
+ TI->getNumSuccessors(), BranchProbability::getUnknown());
+
+ // Add each of the frequencies of the successors.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
+ BlockNode ExitNode(i);
+ uint64_t ExitFreq = ExitWeights[TI->getSuccessor(i)].getFrequency();
+ if (ExitFreq != 0)
+ BranchDist.addExit(ExitNode, ExitFreq);
+ else
+ EdgeProbabilities[i] = BranchProbability::getZero();
+ }
+
+ // Check for no total weight.
+ if (BranchDist.Total == 0) {
+ BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
+ return;
+ }
+
+ // Normalize the distribution so that the weights can fit in unsigned integers.
+ BranchDist.normalize();
+
+ // Create normalized branch weights and set the metadata.
+ for (unsigned I = 0, E = BranchDist.Weights.size(); I < E; ++I) {
+ const auto &Weight = BranchDist.Weights[I];
+
+ // Get the weight and update the current BFI.
+ BranchWeights[Weight.TargetNode.Index] = Weight.Amount;
+ BranchProbability BP(Weight.Amount, BranchDist.Total);
+ EdgeProbabilities[Weight.TargetNode.Index] = BP;
+ }
+ BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
+}
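// Standalone arithmetic sketch (not part of the original source; it does not
// use the BlockFrequencyInfoImplBase::Distribution class and assumes <vector>
// and <cstdint>): the normalization above boils down to scaling raw exit
// frequencies into weights over a common total. The function name
// normalizeExitFreqs and the scale constant are illustrative.
static std::vector<uint32_t>
normalizeExitFreqs(const std::vector<uint64_t> &Freqs) {
  const uint64_t Scale = 1u << 20; // arbitrary fixed-point scale for the sketch
  uint64_t Total = 0;
  for (uint64_t F : Freqs)
    Total += F;
  std::vector<uint32_t> Weights(Freqs.size(), 0);
  if (Total == 0)
    return Weights; // mirrors the "no total weight" early-out above
  for (size_t I = 0, E = Freqs.size(); I != E; ++I)
    // Assumes Freqs[I] * Scale fits in 64 bits; a production version would
    // guard against overflow.
    Weights[I] = static_cast<uint32_t>((Freqs[I] * Scale) / Total);
  return Weights;
}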
+
+/// Erase debug info intrinsics which refer to values in \p F but are not
+/// themselves located in \p F.
+static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
+ for (Instruction &I : instructions(F)) {
+ SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ for (DbgVariableIntrinsic *DVI : DbgUsers)
+ if (DVI->getFunction() != &F)
+ DVI->eraseFromParent();
+ }
+}
+
+/// Fix up the debug info in the old and new functions by pointing line
+/// locations and debug intrinsics to the new subprogram scope, and by deleting
+/// intrinsics which point to values outside of the new function.
+static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
+ CallInst &TheCall) {
+ DISubprogram *OldSP = OldFunc.getSubprogram();
+ LLVMContext &Ctx = OldFunc.getContext();
+
+ if (!OldSP) {
+ // Erase any debug info the new function contains.
+ stripDebugInfo(NewFunc);
+ // Make sure the old function doesn't contain any non-local metadata refs.
+ eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
+ return;
+ }
+
+ // Create a subprogram for the new function. Leave out a description of the
+ // function arguments, as the parameters don't correspond to anything at the
+ // source level.
+ assert(OldSP->getUnit() && "Missing compile unit for subprogram");
DIBuilder DIB(*OldFunc.getParent(), /*AllowUnresolved=*/false,
- OldSP->getUnit());
- auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
- DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
- DISubprogram::SPFlagOptimized |
- DISubprogram::SPFlagLocalToUnit;
- auto NewSP = DIB.createFunction(
- OldSP->getUnit(), NewFunc.getName(), NewFunc.getName(), OldSP->getFile(),
- /*LineNo=*/0, SPType, /*ScopeLine=*/0, DINode::FlagZero, SPFlags);
- NewFunc.setSubprogram(NewSP);
-
- // Debug intrinsics in the new function need to be updated in one of two
- // ways:
- // 1) They need to be deleted, because they describe a value in the old
- // function.
- // 2) They need to point to fresh metadata, e.g. because they currently
- // point to a variable in the wrong scope.
- SmallDenseMap<DINode *, DINode *> RemappedMetadata;
- SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
- for (Instruction &I : instructions(NewFunc)) {
- auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
- if (!DII)
- continue;
-
- // Point the intrinsic to a fresh label within the new function.
- if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
- DILabel *OldLabel = DLI->getLabel();
- DINode *&NewLabel = RemappedMetadata[OldLabel];
- if (!NewLabel)
- NewLabel = DILabel::get(Ctx, NewSP, OldLabel->getName(),
- OldLabel->getFile(), OldLabel->getLine());
- DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel));
- continue;
- }
-
- // If the location isn't a constant or an instruction, delete the
- // intrinsic.
- auto *DVI = cast<DbgVariableIntrinsic>(DII);
- Value *Location = DVI->getVariableLocation();
- if (!Location ||
- (!isa<Constant>(Location) && !isa<Instruction>(Location))) {
- DebugIntrinsicsToDelete.push_back(DVI);
- continue;
- }
-
- // If the variable location is an instruction but isn't in the new
- // function, delete the intrinsic.
- Instruction *LocationInst = dyn_cast<Instruction>(Location);
- if (LocationInst && LocationInst->getFunction() != &NewFunc) {
- DebugIntrinsicsToDelete.push_back(DVI);
- continue;
- }
-
- // Point the intrinsic to a fresh variable within the new function.
- DILocalVariable *OldVar = DVI->getVariable();
- DINode *&NewVar = RemappedMetadata[OldVar];
- if (!NewVar)
- NewVar = DIB.createAutoVariable(
- NewSP, OldVar->getName(), OldVar->getFile(), OldVar->getLine(),
- OldVar->getType(), /*AlwaysPreserve=*/false, DINode::FlagZero,
- OldVar->getAlignInBits());
- DVI->setArgOperand(1, MetadataAsValue::get(Ctx, NewVar));
- }
- for (auto *DII : DebugIntrinsicsToDelete)
- DII->eraseFromParent();
- DIB.finalizeSubprogram(NewSP);
-
- // Fix up the scope information attached to the line locations in the new
- // function.
- for (Instruction &I : instructions(NewFunc)) {
- if (const DebugLoc &DL = I.getDebugLoc())
+ OldSP->getUnit());
+ auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
+ DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
+ DISubprogram::SPFlagOptimized |
+ DISubprogram::SPFlagLocalToUnit;
+ auto NewSP = DIB.createFunction(
+ OldSP->getUnit(), NewFunc.getName(), NewFunc.getName(), OldSP->getFile(),
+ /*LineNo=*/0, SPType, /*ScopeLine=*/0, DINode::FlagZero, SPFlags);
+ NewFunc.setSubprogram(NewSP);
+
+ // Debug intrinsics in the new function need to be updated in one of two
+ // ways:
+ // 1) They need to be deleted, because they describe a value in the old
+ // function.
+ // 2) They need to point to fresh metadata, e.g. because they currently
+ // point to a variable in the wrong scope.
+ SmallDenseMap<DINode *, DINode *> RemappedMetadata;
+ SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
+ for (Instruction &I : instructions(NewFunc)) {
+ auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
+ if (!DII)
+ continue;
+
+ // Point the intrinsic to a fresh label within the new function.
+ if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
+ DILabel *OldLabel = DLI->getLabel();
+ DINode *&NewLabel = RemappedMetadata[OldLabel];
+ if (!NewLabel)
+ NewLabel = DILabel::get(Ctx, NewSP, OldLabel->getName(),
+ OldLabel->getFile(), OldLabel->getLine());
+ DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel));
+ continue;
+ }
+
+ // If the location isn't a constant or an instruction, delete the
+ // intrinsic.
+ auto *DVI = cast<DbgVariableIntrinsic>(DII);
+ Value *Location = DVI->getVariableLocation();
+ if (!Location ||
+ (!isa<Constant>(Location) && !isa<Instruction>(Location))) {
+ DebugIntrinsicsToDelete.push_back(DVI);
+ continue;
+ }
+
+ // If the variable location is an instruction but isn't in the new
+ // function, delete the intrinsic.
+ Instruction *LocationInst = dyn_cast<Instruction>(Location);
+ if (LocationInst && LocationInst->getFunction() != &NewFunc) {
+ DebugIntrinsicsToDelete.push_back(DVI);
+ continue;
+ }
+
+ // Point the intrinsic to a fresh variable within the new function.
+ DILocalVariable *OldVar = DVI->getVariable();
+ DINode *&NewVar = RemappedMetadata[OldVar];
+ if (!NewVar)
+ NewVar = DIB.createAutoVariable(
+ NewSP, OldVar->getName(), OldVar->getFile(), OldVar->getLine(),
+ OldVar->getType(), /*AlwaysPreserve=*/false, DINode::FlagZero,
+ OldVar->getAlignInBits());
+ DVI->setArgOperand(1, MetadataAsValue::get(Ctx, NewVar));
+ }
+ for (auto *DII : DebugIntrinsicsToDelete)
+ DII->eraseFromParent();
+ DIB.finalizeSubprogram(NewSP);
+
+ // Fix up the scope information attached to the line locations in the new
+ // function.
+ for (Instruction &I : instructions(NewFunc)) {
+ if (const DebugLoc &DL = I.getDebugLoc())
I.setDebugLoc(DILocation::get(Ctx, DL.getLine(), DL.getCol(), NewSP));
-
- // Loop info metadata may contain line locations. Fix them up.
- auto updateLoopInfoLoc = [&Ctx,
- NewSP](const DILocation &Loc) -> DILocation * {
- return DILocation::get(Ctx, Loc.getLine(), Loc.getColumn(), NewSP,
- nullptr);
- };
- updateLoopMetadataDebugLocations(I, updateLoopInfoLoc);
- }
- if (!TheCall.getDebugLoc())
+
+ // Loop info metadata may contain line locations. Fix them up.
+ auto updateLoopInfoLoc = [&Ctx,
+ NewSP](const DILocation &Loc) -> DILocation * {
+ return DILocation::get(Ctx, Loc.getLine(), Loc.getColumn(), NewSP,
+ nullptr);
+ };
+ updateLoopMetadataDebugLocations(I, updateLoopInfoLoc);
+ }
+ if (!TheCall.getDebugLoc())
TheCall.setDebugLoc(DILocation::get(Ctx, 0, 0, OldSP));
-
- eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
-}
-
-Function *
-CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
- if (!isEligible())
- return nullptr;
-
- // Assumption: this is a single-entry code region, and the header is the first
- // block in the region.
- BasicBlock *header = *Blocks.begin();
- Function *oldFunction = header->getParent();
-
- // Calculate the entry frequency of the new function before we change the root
- // block.
- BlockFrequency EntryFreq;
- if (BFI) {
- assert(BPI && "Both BPI and BFI are required to preserve profile info");
- for (BasicBlock *Pred : predecessors(header)) {
- if (Blocks.count(Pred))
- continue;
- EntryFreq +=
- BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, header);
- }
- }
-
- // Remove @llvm.assume calls that will be moved to the new function from the
- // old function's assumption cache.
- for (BasicBlock *Block : Blocks) {
- for (auto It = Block->begin(), End = Block->end(); It != End;) {
- Instruction *I = &*It;
- ++It;
-
- if (match(I, m_Intrinsic<Intrinsic::assume>())) {
- if (AC)
- AC->unregisterAssumption(cast<CallInst>(I));
- I->eraseFromParent();
- }
- }
- }
-
- // If we have any return instructions in the region, split those blocks so
- // that the return is not in the region.
- splitReturnBlocks();
-
- // Calculate the exit blocks for the extracted region and the total exit
- // weights for each of those blocks.
- DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
- SmallPtrSet<BasicBlock *, 1> ExitBlocks;
- for (BasicBlock *Block : Blocks) {
- for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
- ++SI) {
- if (!Blocks.count(*SI)) {
- // Update the branch weight for this successor.
- if (BFI) {
- BlockFrequency &BF = ExitWeights[*SI];
- BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
- }
- ExitBlocks.insert(*SI);
- }
- }
- }
- NumExitBlocks = ExitBlocks.size();
-
- // If we have to split PHI nodes of the entry or exit blocks, do so now.
- severSplitPHINodesOfEntry(header);
- severSplitPHINodesOfExits(ExitBlocks);
-
-  // This takes the place of the original loop
- BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(),
- "codeRepl", oldFunction,
- header);
-
- // The new function needs a root node because other nodes can branch to the
- // head of the region, but the entry node of a function cannot have preds.
- BasicBlock *newFuncRoot = BasicBlock::Create(header->getContext(),
- "newFuncRoot");
- auto *BranchI = BranchInst::Create(header);
- // If the original function has debug info, we have to add a debug location
- // to the new branch instruction from the artificial entry block.
- // We use the debug location of the first instruction in the extracted
- // blocks, as there is no other equivalent line in the source code.
- if (oldFunction->getSubprogram()) {
- any_of(Blocks, [&BranchI](const BasicBlock *BB) {
- return any_of(*BB, [&BranchI](const Instruction &I) {
- if (!I.getDebugLoc())
- return false;
- BranchI->setDebugLoc(I.getDebugLoc());
- return true;
- });
- });
- }
- newFuncRoot->getInstList().push_back(BranchI);
-
- ValueSet inputs, outputs, SinkingCands, HoistingCands;
- BasicBlock *CommonExit = nullptr;
- findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
- assert(HoistingCands.empty() || CommonExit);
-
- // Find inputs to, outputs from the code region.
- findInputsOutputs(inputs, outputs, SinkingCands);
-
- // Now sink all instructions which only have non-phi uses inside the region.
- // Group the allocas at the start of the block, so that any bitcast uses of
- // the allocas are well-defined.
- AllocaInst *FirstSunkAlloca = nullptr;
- for (auto *II : SinkingCands) {
- if (auto *AI = dyn_cast<AllocaInst>(II)) {
- AI->moveBefore(*newFuncRoot, newFuncRoot->getFirstInsertionPt());
- if (!FirstSunkAlloca)
- FirstSunkAlloca = AI;
- }
- }
- assert((SinkingCands.empty() || FirstSunkAlloca) &&
- "Did not expect a sink candidate without any allocas");
- for (auto *II : SinkingCands) {
- if (!isa<AllocaInst>(II)) {
- cast<Instruction>(II)->moveAfter(FirstSunkAlloca);
- }
- }
-
- if (!HoistingCands.empty()) {
- auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit);
- Instruction *TI = HoistToBlock->getTerminator();
- for (auto *II : HoistingCands)
- cast<Instruction>(II)->moveBefore(TI);
- }
-
- // Collect objects which are inputs to the extraction region and also
- // referenced by lifetime start markers within it. The effects of these
- // markers must be replicated in the calling function to prevent the stack
- // coloring pass from merging slots which store input objects.
- ValueSet LifetimesStart;
- eraseLifetimeMarkersOnInputs(Blocks, SinkingCands, LifetimesStart);
-
- // Construct new function based on inputs/outputs & add allocas for all defs.
- Function *newFunction =
- constructFunction(inputs, outputs, header, newFuncRoot, codeReplacer,
- oldFunction, oldFunction->getParent());
-
- // Update the entry count of the function.
- if (BFI) {
- auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
- if (Count.hasValue())
- newFunction->setEntryCount(
- ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME
- BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
- }
-
- CallInst *TheCall =
- emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
-
- moveCodeToFunction(newFunction);
-
- // Replicate the effects of any lifetime start/end markers which referenced
- // input objects in the extraction region by placing markers around the call.
- insertLifetimeMarkersSurroundingCall(
- oldFunction->getParent(), LifetimesStart.getArrayRef(), {}, TheCall);
-
- // Propagate personality info to the new function if there is one.
- if (oldFunction->hasPersonalityFn())
- newFunction->setPersonalityFn(oldFunction->getPersonalityFn());
-
- // Update the branch weights for the exit block.
- if (BFI && NumExitBlocks > 1)
- calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
-
- // Loop over all of the PHI nodes in the header and exit blocks, and change
- // any references to the old incoming edge to be the new incoming edge.
- for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (!Blocks.count(PN->getIncomingBlock(i)))
- PN->setIncomingBlock(i, newFuncRoot);
- }
-
- for (BasicBlock *ExitBB : ExitBlocks)
- for (PHINode &PN : ExitBB->phis()) {
- Value *IncomingCodeReplacerVal = nullptr;
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- // Ignore incoming values from outside of the extracted region.
- if (!Blocks.count(PN.getIncomingBlock(i)))
- continue;
-
- // Ensure that there is only one incoming value from codeReplacer.
- if (!IncomingCodeReplacerVal) {
- PN.setIncomingBlock(i, codeReplacer);
- IncomingCodeReplacerVal = PN.getIncomingValue(i);
- } else
- assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) &&
-                 "PHI has two incompatible incoming values from codeRepl");
- }
- }
-
- fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall);
-
- // Mark the new function `noreturn` if applicable. Terminators which resume
- // exception propagation are treated as returning instructions. This is to
- // avoid inserting traps after calls to outlined functions which unwind.
- bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) {
- const Instruction *Term = BB.getTerminator();
- return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
- });
- if (doesNotReturn)
- newFunction->setDoesNotReturn();
-
- LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) {
- newFunction->dump();
- report_fatal_error("verification of newFunction failed!");
- });
- LLVM_DEBUG(if (verifyFunction(*oldFunction))
- report_fatal_error("verification of oldFunction failed!"));
- LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, *newFunction, AC))
-                 report_fatal_error("Stale Assumption cache for old Function!"));
- return newFunction;
-}
-
-bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc,
- const Function &NewFunc,
- AssumptionCache *AC) {
- for (auto AssumeVH : AC->assumptions()) {
+
+ eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
+}
+
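The RemappedMetadata map above implements a look-up-or-create cache, so each old variable or label gets exactly one replacement node in the new scope. Here is a small standalone sketch of that memoization pattern with an invented Node type, not the real DILocalVariable/DILabel classes.

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for a debug metadata node; the type and field names are invented.
struct Node {
  std::string Name;
  std::string Scope;
};

int main() {
  Node VarX{"x", "old.fn"}, VarY{"y", "old.fn"};
  // Two intrinsics refer to "x" and one to "y", mirroring repeated users.
  std::vector<Node *> OldRefs = {&VarX, &VarY, &VarX};

  std::unordered_map<Node *, std::unique_ptr<Node>> Remapped;
  std::vector<Node *> NewRefs;
  for (Node *Old : OldRefs) {
    // Create the replacement node lazily, once per distinct old node, the
    // same memoization pattern as the RemappedMetadata map.
    std::unique_ptr<Node> &Slot = Remapped[Old];
    if (!Slot)
      Slot = std::make_unique<Node>(Node{Old->Name, "new.fn"});
    NewRefs.push_back(Slot.get());
  }

  for (Node *N : NewRefs)
    std::cout << N->Name << " rescoped to " << N->Scope << "\n";
  // Both users of "x" now share one freshly created node in the new scope.
  return 0;
}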
+Function *
+CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
+ if (!isEligible())
+ return nullptr;
+
+ // Assumption: this is a single-entry code region, and the header is the first
+ // block in the region.
+ BasicBlock *header = *Blocks.begin();
+ Function *oldFunction = header->getParent();
+
+ // Calculate the entry frequency of the new function before we change the root
+ // block.
+ BlockFrequency EntryFreq;
+ if (BFI) {
+ assert(BPI && "Both BPI and BFI are required to preserve profile info");
+ for (BasicBlock *Pred : predecessors(header)) {
+ if (Blocks.count(Pred))
+ continue;
+ EntryFreq +=
+ BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, header);
+ }
+ }
+
+ // Remove @llvm.assume calls that will be moved to the new function from the
+ // old function's assumption cache.
+ for (BasicBlock *Block : Blocks) {
+ for (auto It = Block->begin(), End = Block->end(); It != End;) {
+ Instruction *I = &*It;
+ ++It;
+
+ if (match(I, m_Intrinsic<Intrinsic::assume>())) {
+ if (AC)
+ AC->unregisterAssumption(cast<CallInst>(I));
+ I->eraseFromParent();
+ }
+ }
+ }
+
+ // If we have any return instructions in the region, split those blocks so
+ // that the return is not in the region.
+ splitReturnBlocks();
+
+ // Calculate the exit blocks for the extracted region and the total exit
+ // weights for each of those blocks.
+ DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
+ SmallPtrSet<BasicBlock *, 1> ExitBlocks;
+ for (BasicBlock *Block : Blocks) {
+ for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
+ ++SI) {
+ if (!Blocks.count(*SI)) {
+ // Update the branch weight for this successor.
+ if (BFI) {
+ BlockFrequency &BF = ExitWeights[*SI];
+ BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
+ }
+ ExitBlocks.insert(*SI);
+ }
+ }
+ }
+ NumExitBlocks = ExitBlocks.size();
+
+ // If we have to split PHI nodes of the entry or exit blocks, do so now.
+ severSplitPHINodesOfEntry(header);
+ severSplitPHINodesOfExits(ExitBlocks);
+
+  // This takes the place of the original loop
+ BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(),
+ "codeRepl", oldFunction,
+ header);
+
+ // The new function needs a root node because other nodes can branch to the
+ // head of the region, but the entry node of a function cannot have preds.
+ BasicBlock *newFuncRoot = BasicBlock::Create(header->getContext(),
+ "newFuncRoot");
+ auto *BranchI = BranchInst::Create(header);
+ // If the original function has debug info, we have to add a debug location
+ // to the new branch instruction from the artificial entry block.
+ // We use the debug location of the first instruction in the extracted
+ // blocks, as there is no other equivalent line in the source code.
+ if (oldFunction->getSubprogram()) {
+ any_of(Blocks, [&BranchI](const BasicBlock *BB) {
+ return any_of(*BB, [&BranchI](const Instruction &I) {
+ if (!I.getDebugLoc())
+ return false;
+ BranchI->setDebugLoc(I.getDebugLoc());
+ return true;
+ });
+ });
+ }
+ newFuncRoot->getInstList().push_back(BranchI);
+
+ ValueSet inputs, outputs, SinkingCands, HoistingCands;
+ BasicBlock *CommonExit = nullptr;
+ findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+ assert(HoistingCands.empty() || CommonExit);
+
+ // Find inputs to, outputs from the code region.
+ findInputsOutputs(inputs, outputs, SinkingCands);
+
+ // Now sink all instructions which only have non-phi uses inside the region.
+ // Group the allocas at the start of the block, so that any bitcast uses of
+ // the allocas are well-defined.
+ AllocaInst *FirstSunkAlloca = nullptr;
+ for (auto *II : SinkingCands) {
+ if (auto *AI = dyn_cast<AllocaInst>(II)) {
+ AI->moveBefore(*newFuncRoot, newFuncRoot->getFirstInsertionPt());
+ if (!FirstSunkAlloca)
+ FirstSunkAlloca = AI;
+ }
+ }
+ assert((SinkingCands.empty() || FirstSunkAlloca) &&
+ "Did not expect a sink candidate without any allocas");
+ for (auto *II : SinkingCands) {
+ if (!isa<AllocaInst>(II)) {
+ cast<Instruction>(II)->moveAfter(FirstSunkAlloca);
+ }
+ }
+
+ if (!HoistingCands.empty()) {
+ auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit);
+ Instruction *TI = HoistToBlock->getTerminator();
+ for (auto *II : HoistingCands)
+ cast<Instruction>(II)->moveBefore(TI);
+ }
+
+ // Collect objects which are inputs to the extraction region and also
+ // referenced by lifetime start markers within it. The effects of these
+ // markers must be replicated in the calling function to prevent the stack
+ // coloring pass from merging slots which store input objects.
+ ValueSet LifetimesStart;
+ eraseLifetimeMarkersOnInputs(Blocks, SinkingCands, LifetimesStart);
+
+ // Construct new function based on inputs/outputs & add allocas for all defs.
+ Function *newFunction =
+ constructFunction(inputs, outputs, header, newFuncRoot, codeReplacer,
+ oldFunction, oldFunction->getParent());
+
+ // Update the entry count of the function.
+ if (BFI) {
+ auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
+ if (Count.hasValue())
+ newFunction->setEntryCount(
+ ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME
+ BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
+ }
+
+ CallInst *TheCall =
+ emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
+
+ moveCodeToFunction(newFunction);
+
+ // Replicate the effects of any lifetime start/end markers which referenced
+ // input objects in the extraction region by placing markers around the call.
+ insertLifetimeMarkersSurroundingCall(
+ oldFunction->getParent(), LifetimesStart.getArrayRef(), {}, TheCall);
+
+ // Propagate personality info to the new function if there is one.
+ if (oldFunction->hasPersonalityFn())
+ newFunction->setPersonalityFn(oldFunction->getPersonalityFn());
+
+ // Update the branch weights for the exit block.
+ if (BFI && NumExitBlocks > 1)
+ calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
+
+ // Loop over all of the PHI nodes in the header and exit blocks, and change
+ // any references to the old incoming edge to be the new incoming edge.
+ for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!Blocks.count(PN->getIncomingBlock(i)))
+ PN->setIncomingBlock(i, newFuncRoot);
+ }
+
+ for (BasicBlock *ExitBB : ExitBlocks)
+ for (PHINode &PN : ExitBB->phis()) {
+ Value *IncomingCodeReplacerVal = nullptr;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ // Ignore incoming values from outside of the extracted region.
+ if (!Blocks.count(PN.getIncomingBlock(i)))
+ continue;
+
+ // Ensure that there is only one incoming value from codeReplacer.
+ if (!IncomingCodeReplacerVal) {
+ PN.setIncomingBlock(i, codeReplacer);
+ IncomingCodeReplacerVal = PN.getIncomingValue(i);
+ } else
+ assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) &&
+                 "PHI has two incompatible incoming values from codeRepl");
+ }
+ }
+
+ fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall);
+
+ // Mark the new function `noreturn` if applicable. Terminators which resume
+ // exception propagation are treated as returning instructions. This is to
+ // avoid inserting traps after calls to outlined functions which unwind.
+ bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) {
+ const Instruction *Term = BB.getTerminator();
+ return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
+ });
+ if (doesNotReturn)
+ newFunction->setDoesNotReturn();
+
+ LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) {
+ newFunction->dump();
+ report_fatal_error("verification of newFunction failed!");
+ });
+ LLVM_DEBUG(if (verifyFunction(*oldFunction))
+ report_fatal_error("verification of oldFunction failed!"));
+ LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, *newFunction, AC))
+                 report_fatal_error("Stale Assumption cache for old Function!"));
+ return newFunction;
+}
+
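The entry frequency computed at the top of extractCodeRegion is the sum, over predecessors outside the region, of the predecessor's block frequency times the probability of its edge into the header. A standalone numeric sketch with made-up frequencies and probabilities follows; PredInfo and its fields are invented stand-ins for the BFI/BPI queries.

#include <iostream>
#include <string>
#include <vector>

// BlockFreq plays the role of BFI->getBlockFreq(Pred) and EdgeProb the role
// of BPI->getEdgeProbability(Pred, header); both values are fabricated.
struct PredInfo {
  std::string Name;
  bool InRegion;
  double BlockFreq;
  double EdgeProb;
};

int main() {
  std::vector<PredInfo> Preds = {
      {"entry", false, 100.0, 0.25},
      {"loop.latch", true, 900.0, 1.00}, // inside the extracted region: skipped
      {"if.else", false, 40.0, 0.50},
  };

  double EntryFreq = 0.0;
  for (const PredInfo &P : Preds) {
    if (P.InRegion)
      continue; // only edges coming from outside the region feed the call
    EntryFreq += P.BlockFreq * P.EdgeProb;
  }

  // 100 * 0.25 + 40 * 0.5 = 45
  std::cout << "entry frequency of the extracted function ~= " << EntryFreq
            << "\n";
  return 0;
}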
+bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc,
+ const Function &NewFunc,
+ AssumptionCache *AC) {
+ for (auto AssumeVH : AC->assumptions()) {
auto *I = dyn_cast_or_null<CallInst>(AssumeVH);
- if (!I)
- continue;
-
- // There shouldn't be any llvm.assume intrinsics in the new function.
- if (I->getFunction() != &OldFunc)
- return true;
-
- // There shouldn't be any stale affected values in the assumption cache
- // that were previously in the old function, but that have now been moved
- // to the new function.
- for (auto AffectedValVH : AC->assumptionsFor(I->getOperand(0))) {
+ if (!I)
+ continue;
+
+ // There shouldn't be any llvm.assume intrinsics in the new function.
+ if (I->getFunction() != &OldFunc)
+ return true;
+
+ // There shouldn't be any stale affected values in the assumption cache
+ // that were previously in the old function, but that have now been moved
+ // to the new function.
+ for (auto AffectedValVH : AC->assumptionsFor(I->getOperand(0))) {
auto *AffectedCI = dyn_cast_or_null<CallInst>(AffectedValVH);
- if (!AffectedCI)
- continue;
- if (AffectedCI->getFunction() != &OldFunc)
- return true;
+ if (!AffectedCI)
+ continue;
+ if (AffectedCI->getFunction() != &OldFunc)
+ return true;
auto *AssumedInst = cast<Instruction>(AffectedCI->getOperand(0));
- if (AssumedInst->getFunction() != &OldFunc)
- return true;
- }
- }
- return false;
-}
+ if (AssumedInst->getFunction() != &OldFunc)
+ return true;
+ }
+ }
+ return false;
+}
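
verifyAssumptionCache reduces to an ownership check over cached handles: every surviving cached call, and everything it affects, must still live in the old function. A minimal sketch of that shape with invented types, not the real AssumptionCache API:

#include <iostream>
#include <string>
#include <vector>

// A cached call plus the name of the function it currently lives in.
struct CachedCall {
  std::string Owner;
};

// Any surviving entry owned by a function other than the expected one means
// the cache has gone stale, mirroring the checks in verifyAssumptionCache.
static bool hasStaleEntries(const std::vector<const CachedCall *> &Cache,
                            const std::string &ExpectedOwner) {
  for (const CachedCall *C : Cache) {
    if (!C)
      continue; // dropped (null) handles are harmless, as in the code above
    if (C->Owner != ExpectedOwner)
      return true;
  }
  return false;
}

int main() {
  CachedCall InOld{"old.fn"}, Moved{"extracted.fn"};
  std::vector<const CachedCall *> Cache = {&InOld, nullptr, &Moved};
  std::cout << (hasStaleEntries(Cache, "old.fn") ? "stale" : "ok") << "\n";
  return 0;
}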
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp
index 0e2f64b1d4..ce982c7403 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp
@@ -1,364 +1,364 @@
-//===- CodeMoverUtils.cpp - CodeMover Utilities ----------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions performs movements on basic blocks and instructions
-// contained within a function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CodeMoverUtils.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "codemover-utils"
-
-STATISTIC(HasDependences,
-          "Cannot move across instructions that have memory dependences");
-STATISTIC(MayThrowException, "Cannot move across instructions that may throw");
-STATISTIC(NotControlFlowEquivalent,
- "Instructions are not control flow equivalent");
-STATISTIC(NotMovedPHINode, "Movement of PHINodes is not supported");
-STATISTIC(NotMovedTerminator, "Movement of Terminators is not supported");
-
-namespace {
-/// Represent a control condition. A control condition is a condition of a
-/// terminator to decide which successors to execute. The pointer field
-/// represents the address of the condition of the terminator. The integer field
-/// is a bool; it is true when the basic block is executed when V is true. For
-/// example, in `br %cond, bb0, bb1`, %cond is a control condition of bb0 with
-/// the integer field equal to true, while %cond is a control condition of bb1
-/// with the integer field equal to false.
-using ControlCondition = PointerIntPair<Value *, 1, bool>;
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const ControlCondition &C) {
- OS << "[" << *C.getPointer() << ", " << (C.getInt() ? "true" : "false")
- << "]";
- return OS;
-}
-#endif
-
-/// Represent a set of control conditions required to execute ToBB from FromBB.
-class ControlConditions {
- using ConditionVectorTy = SmallVector<ControlCondition, 6>;
-
- /// A SmallVector of control conditions.
- ConditionVectorTy Conditions;
-
-public:
- /// Return a ControlConditions which stores all conditions required to execute
- /// \p BB from \p Dominator. If \p MaxLookup is non-zero, it limits the
- /// number of conditions to collect. Return None if not all conditions are
- /// collected successfully, or we hit the limit.
- static const Optional<ControlConditions>
- collectControlConditions(const BasicBlock &BB, const BasicBlock &Dominator,
- const DominatorTree &DT,
- const PostDominatorTree &PDT,
- unsigned MaxLookup = 6);
-
-  /// Return true if there are no control conditions required to execute ToBB
- /// from FromBB.
- bool isUnconditional() const { return Conditions.empty(); }
-
- /// Return a constant reference of Conditions.
- const ConditionVectorTy &getControlConditions() const { return Conditions; }
-
-  /// Add \p C to Conditions if no equivalent condition is already present.
-  /// Return true if it was inserted.
- bool addControlCondition(ControlCondition C);
-
- /// Return true if for all control conditions in Conditions, there exists an
- /// equivalent control condition in \p Other.Conditions.
- bool isEquivalent(const ControlConditions &Other) const;
-
- /// Return true if \p C1 and \p C2 are equivalent.
- static bool isEquivalent(const ControlCondition &C1,
- const ControlCondition &C2);
-
-private:
- ControlConditions() = default;
-
- static bool isEquivalent(const Value &V1, const Value &V2);
- static bool isInverse(const Value &V1, const Value &V2);
-};
-} // namespace
-
-static bool domTreeLevelBefore(DominatorTree *DT, const Instruction *InstA,
- const Instruction *InstB) {
- // Use ordered basic block in case the 2 instructions are in the same
- // block.
- if (InstA->getParent() == InstB->getParent())
- return InstA->comesBefore(InstB);
-
- DomTreeNode *DA = DT->getNode(InstA->getParent());
- DomTreeNode *DB = DT->getNode(InstB->getParent());
- return DA->getLevel() < DB->getLevel();
-}
-
-const Optional<ControlConditions> ControlConditions::collectControlConditions(
- const BasicBlock &BB, const BasicBlock &Dominator, const DominatorTree &DT,
- const PostDominatorTree &PDT, unsigned MaxLookup) {
- assert(DT.dominates(&Dominator, &BB) && "Expecting Dominator to dominate BB");
-
- ControlConditions Conditions;
- unsigned NumConditions = 0;
-
-  // BB is executed unconditionally from itself.
- if (&Dominator == &BB)
- return Conditions;
-
- const BasicBlock *CurBlock = &BB;
- // Walk up the dominator tree from the associated DT node for BB to the
- // associated DT node for Dominator.
- do {
- assert(DT.getNode(CurBlock) && "Expecting a valid DT node for CurBlock");
- BasicBlock *IDom = DT.getNode(CurBlock)->getIDom()->getBlock();
- assert(DT.dominates(&Dominator, IDom) &&
- "Expecting Dominator to dominate IDom");
-
- // Limitation: can only handle branch instruction currently.
- const BranchInst *BI = dyn_cast<BranchInst>(IDom->getTerminator());
- if (!BI)
- return None;
-
- bool Inserted = false;
- if (PDT.dominates(CurBlock, IDom)) {
- LLVM_DEBUG(dbgs() << CurBlock->getName()
- << " is executed unconditionally from "
- << IDom->getName() << "\n");
- } else if (PDT.dominates(CurBlock, BI->getSuccessor(0))) {
- LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
- << *BI->getCondition() << "\" is true from "
- << IDom->getName() << "\n");
- Inserted = Conditions.addControlCondition(
- ControlCondition(BI->getCondition(), true));
- } else if (PDT.dominates(CurBlock, BI->getSuccessor(1))) {
- LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
- << *BI->getCondition() << "\" is false from "
- << IDom->getName() << "\n");
- Inserted = Conditions.addControlCondition(
- ControlCondition(BI->getCondition(), false));
- } else
- return None;
-
- if (Inserted)
- ++NumConditions;
-
- if (MaxLookup != 0 && NumConditions > MaxLookup)
- return None;
-
- CurBlock = IDom;
- } while (CurBlock != &Dominator);
-
- return Conditions;
-}
-
-bool ControlConditions::addControlCondition(ControlCondition C) {
- bool Inserted = false;
- if (none_of(Conditions, [&](ControlCondition &Exists) {
- return ControlConditions::isEquivalent(C, Exists);
- })) {
- Conditions.push_back(C);
- Inserted = true;
- }
-
- LLVM_DEBUG(dbgs() << (Inserted ? "Inserted " : "Not inserted ") << C << "\n");
- return Inserted;
-}
-
-bool ControlConditions::isEquivalent(const ControlConditions &Other) const {
- if (Conditions.empty() && Other.Conditions.empty())
- return true;
-
- if (Conditions.size() != Other.Conditions.size())
- return false;
-
- return all_of(Conditions, [&](const ControlCondition &C) {
- return any_of(Other.Conditions, [&](const ControlCondition &OtherC) {
- return ControlConditions::isEquivalent(C, OtherC);
- });
- });
-}
-
-bool ControlConditions::isEquivalent(const ControlCondition &C1,
- const ControlCondition &C2) {
- if (C1.getInt() == C2.getInt()) {
- if (isEquivalent(*C1.getPointer(), *C2.getPointer()))
- return true;
- } else if (isInverse(*C1.getPointer(), *C2.getPointer()))
- return true;
-
- return false;
-}
-
-// FIXME: Use SCEV and reuse GVN/CSE logic to check for equivalence between
-// Values.
-// Currently, isEquivalent relies on other passes to ensure equivalent
-// conditions have the same value, e.g. GVN.
-bool ControlConditions::isEquivalent(const Value &V1, const Value &V2) {
- return &V1 == &V2;
-}
-
-bool ControlConditions::isInverse(const Value &V1, const Value &V2) {
- if (const CmpInst *Cmp1 = dyn_cast<CmpInst>(&V1))
- if (const CmpInst *Cmp2 = dyn_cast<CmpInst>(&V2)) {
- if (Cmp1->getPredicate() == Cmp2->getInversePredicate() &&
- Cmp1->getOperand(0) == Cmp2->getOperand(0) &&
- Cmp1->getOperand(1) == Cmp2->getOperand(1))
- return true;
-
- if (Cmp1->getPredicate() ==
- CmpInst::getSwappedPredicate(Cmp2->getInversePredicate()) &&
- Cmp1->getOperand(0) == Cmp2->getOperand(1) &&
- Cmp1->getOperand(1) == Cmp2->getOperand(0))
- return true;
- }
- return false;
-}
-
-bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1,
- const DominatorTree &DT,
- const PostDominatorTree &PDT) {
- return isControlFlowEquivalent(*I0.getParent(), *I1.getParent(), DT, PDT);
-}
-
-bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1,
- const DominatorTree &DT,
- const PostDominatorTree &PDT) {
- if (&BB0 == &BB1)
- return true;
-
- if ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) ||
- (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0)))
- return true;
-
- // If the set of conditions required to execute BB0 and BB1 from their common
- // dominator are the same, then BB0 and BB1 are control flow equivalent.
- const BasicBlock *CommonDominator = DT.findNearestCommonDominator(&BB0, &BB1);
- LLVM_DEBUG(dbgs() << "The nearest common dominator of " << BB0.getName()
- << " and " << BB1.getName() << " is "
- << CommonDominator->getName() << "\n");
-
- const Optional<ControlConditions> BB0Conditions =
- ControlConditions::collectControlConditions(BB0, *CommonDominator, DT,
- PDT);
- if (BB0Conditions == None)
- return false;
-
- const Optional<ControlConditions> BB1Conditions =
- ControlConditions::collectControlConditions(BB1, *CommonDominator, DT,
- PDT);
- if (BB1Conditions == None)
- return false;
-
- return BB0Conditions->isEquivalent(*BB1Conditions);
-}
-
-static bool reportInvalidCandidate(const Instruction &I,
- llvm::Statistic &Stat) {
- ++Stat;
- LLVM_DEBUG(dbgs() << "Unable to move instruction: " << I << ". "
- << Stat.getDesc());
- return false;
-}
-
-/// Collect all instructions in between \p StartInst and \p EndInst, and store
-/// them in \p InBetweenInsts.
-static void
-collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst,
- SmallPtrSetImpl<Instruction *> &InBetweenInsts) {
- assert(InBetweenInsts.empty() && "Expecting InBetweenInsts to be empty");
-
- /// Get the next instructions of \p I, and push them to \p WorkList.
- auto getNextInsts = [](Instruction &I,
- SmallPtrSetImpl<Instruction *> &WorkList) {
- if (Instruction *NextInst = I.getNextNode())
- WorkList.insert(NextInst);
- else {
- assert(I.isTerminator() && "Expecting a terminator instruction");
- for (BasicBlock *Succ : successors(&I))
- WorkList.insert(&Succ->front());
- }
- };
-
- SmallPtrSet<Instruction *, 10> WorkList;
- getNextInsts(StartInst, WorkList);
- while (!WorkList.empty()) {
- Instruction *CurInst = *WorkList.begin();
- WorkList.erase(CurInst);
-
- if (CurInst == &EndInst)
- continue;
-
- if (!InBetweenInsts.insert(CurInst).second)
- continue;
-
- getNextInsts(*CurInst, WorkList);
- }
-}
-
-bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
- DominatorTree &DT, const PostDominatorTree *PDT,
- DependenceInfo *DI) {
- // Skip tests when we don't have PDT or DI
- if (!PDT || !DI)
- return false;
-
- // Cannot move itself before itself.
- if (&I == &InsertPoint)
- return false;
-
- // Not moved.
- if (I.getNextNode() == &InsertPoint)
- return true;
-
- if (isa<PHINode>(I) || isa<PHINode>(InsertPoint))
- return reportInvalidCandidate(I, NotMovedPHINode);
-
- if (I.isTerminator())
- return reportInvalidCandidate(I, NotMovedTerminator);
-
- // TODO remove this limitation.
- if (!isControlFlowEquivalent(I, InsertPoint, DT, *PDT))
- return reportInvalidCandidate(I, NotControlFlowEquivalent);
-
- if (!DT.dominates(&InsertPoint, &I))
- for (const Use &U : I.uses())
- if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
- if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U))
- return false;
- if (!DT.dominates(&I, &InsertPoint))
- for (const Value *Op : I.operands())
- if (auto *OpInst = dyn_cast<Instruction>(Op))
- if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint))
- return false;
-
- DT.updateDFSNumbers();
- const bool MoveForward = domTreeLevelBefore(&DT, &I, &InsertPoint);
- Instruction &StartInst = (MoveForward ? I : InsertPoint);
- Instruction &EndInst = (MoveForward ? InsertPoint : I);
- SmallPtrSet<Instruction *, 10> InstsToCheck;
- collectInstructionsInBetween(StartInst, EndInst, InstsToCheck);
- if (!MoveForward)
- InstsToCheck.insert(&InsertPoint);
-
-  // Check if there exist instructions which may throw, may synchronize, or may
- // never return, from I to InsertPoint.
- if (!isSafeToSpeculativelyExecute(&I))
+//===- CodeMoverUtils.cpp - CodeMover Utilities ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs movements on basic blocks and instructions
+// contained within a function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "codemover-utils"
+
+STATISTIC(HasDependences,
+          "Cannot move across instructions that have memory dependences");
+STATISTIC(MayThrowException, "Cannot move across instructions that may throw");
+STATISTIC(NotControlFlowEquivalent,
+ "Instructions are not control flow equivalent");
+STATISTIC(NotMovedPHINode, "Movement of PHINodes is not supported");
+STATISTIC(NotMovedTerminator, "Movement of Terminators is not supported");
+
+namespace {
+/// Represent a control condition. A control condition is a condition of a
+/// terminator to decide which successors to execute. The pointer field
+/// represents the address of the condition of the terminator. The integer field
+/// is a bool; it is true when the basic block is executed when V is true. For
+/// example, in `br %cond, bb0, bb1`, %cond is a control condition of bb0 with
+/// the integer field equal to true, while %cond is a control condition of bb1
+/// with the integer field equal to false.
+using ControlCondition = PointerIntPair<Value *, 1, bool>;
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const ControlCondition &C) {
+ OS << "[" << *C.getPointer() << ", " << (C.getInt() ? "true" : "false")
+ << "]";
+ return OS;
+}
+#endif
+
+/// Represent a set of control conditions required to execute ToBB from FromBB.
+class ControlConditions {
+ using ConditionVectorTy = SmallVector<ControlCondition, 6>;
+
+ /// A SmallVector of control conditions.
+ ConditionVectorTy Conditions;
+
+public:
+ /// Return a ControlConditions which stores all conditions required to execute
+ /// \p BB from \p Dominator. If \p MaxLookup is non-zero, it limits the
+ /// number of conditions to collect. Return None if not all conditions are
+ /// collected successfully, or we hit the limit.
+ static const Optional<ControlConditions>
+ collectControlConditions(const BasicBlock &BB, const BasicBlock &Dominator,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ unsigned MaxLookup = 6);
+
+  /// Return true if there are no control conditions required to execute ToBB
+ /// from FromBB.
+ bool isUnconditional() const { return Conditions.empty(); }
+
+ /// Return a constant reference of Conditions.
+ const ConditionVectorTy &getControlConditions() const { return Conditions; }
+
+  /// Add \p C to Conditions if no equivalent condition is already present.
+  /// Return true if it was inserted.
+ bool addControlCondition(ControlCondition C);
+
+ /// Return true if for all control conditions in Conditions, there exists an
+ /// equivalent control condition in \p Other.Conditions.
+ bool isEquivalent(const ControlConditions &Other) const;
+
+ /// Return true if \p C1 and \p C2 are equivalent.
+ static bool isEquivalent(const ControlCondition &C1,
+ const ControlCondition &C2);
+
+private:
+ ControlConditions() = default;
+
+ static bool isEquivalent(const Value &V1, const Value &V2);
+ static bool isInverse(const Value &V1, const Value &V2);
+};
+} // namespace
+
+static bool domTreeLevelBefore(DominatorTree *DT, const Instruction *InstA,
+ const Instruction *InstB) {
+ // Use ordered basic block in case the 2 instructions are in the same
+ // block.
+ if (InstA->getParent() == InstB->getParent())
+ return InstA->comesBefore(InstB);
+
+ DomTreeNode *DA = DT->getNode(InstA->getParent());
+ DomTreeNode *DB = DT->getNode(InstB->getParent());
+ return DA->getLevel() < DB->getLevel();
+}
+
+const Optional<ControlConditions> ControlConditions::collectControlConditions(
+ const BasicBlock &BB, const BasicBlock &Dominator, const DominatorTree &DT,
+ const PostDominatorTree &PDT, unsigned MaxLookup) {
+ assert(DT.dominates(&Dominator, &BB) && "Expecting Dominator to dominate BB");
+
+ ControlConditions Conditions;
+ unsigned NumConditions = 0;
+
+  // BB is executed unconditionally from itself.
+ if (&Dominator == &BB)
+ return Conditions;
+
+ const BasicBlock *CurBlock = &BB;
+ // Walk up the dominator tree from the associated DT node for BB to the
+ // associated DT node for Dominator.
+ do {
+ assert(DT.getNode(CurBlock) && "Expecting a valid DT node for CurBlock");
+ BasicBlock *IDom = DT.getNode(CurBlock)->getIDom()->getBlock();
+ assert(DT.dominates(&Dominator, IDom) &&
+ "Expecting Dominator to dominate IDom");
+
+ // Limitation: can only handle branch instruction currently.
+ const BranchInst *BI = dyn_cast<BranchInst>(IDom->getTerminator());
+ if (!BI)
+ return None;
+
+ bool Inserted = false;
+ if (PDT.dominates(CurBlock, IDom)) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName()
+ << " is executed unconditionally from "
+ << IDom->getName() << "\n");
+ } else if (PDT.dominates(CurBlock, BI->getSuccessor(0))) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
+ << *BI->getCondition() << "\" is true from "
+ << IDom->getName() << "\n");
+ Inserted = Conditions.addControlCondition(
+ ControlCondition(BI->getCondition(), true));
+ } else if (PDT.dominates(CurBlock, BI->getSuccessor(1))) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
+ << *BI->getCondition() << "\" is false from "
+ << IDom->getName() << "\n");
+ Inserted = Conditions.addControlCondition(
+ ControlCondition(BI->getCondition(), false));
+ } else
+ return None;
+
+ if (Inserted)
+ ++NumConditions;
+
+ if (MaxLookup != 0 && NumConditions > MaxLookup)
+ return None;
+
+ CurBlock = IDom;
+ } while (CurBlock != &Dominator);
+
+ return Conditions;
+}
+
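collectControlConditions walks up the immediate-dominator chain and records, for each step that is not post-dominated, which branch condition must be true or false. The sketch below models that walk with a heavily simplified block structure; every type and helper in it is invented, and post-dominance is assumed to be precomputed into the ReachedOn field.

#include <algorithm>
#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Each block records its immediate dominator and, when the idom ends in a
// conditional branch, which arm must be taken to reach this block.
enum class Arm { Always, TrueArm, FalseArm };

struct Block {
  std::string Name;
  const Block *IDom = nullptr;
  std::string IDomCond; // name of the branch condition in the idom, if any
  Arm ReachedOn = Arm::Always;
};

using Condition = std::pair<std::string, bool>; // (value, required truth)

static std::optional<std::vector<Condition>>
collectConditions(const Block *BB, const Block *Dominator) {
  std::vector<Condition> Conds;
  for (const Block *Cur = BB; Cur != Dominator; Cur = Cur->IDom) {
    if (!Cur->IDom)
      return std::nullopt; // walked off the top without reaching Dominator
    if (Cur->ReachedOn == Arm::Always)
      continue; // executed unconditionally from its idom
    Condition C{Cur->IDomCond, Cur->ReachedOn == Arm::TrueArm};
    if (std::find(Conds.begin(), Conds.end(), C) == Conds.end())
      Conds.push_back(C); // deduplicate, like addControlCondition
  }
  return Conds;
}

int main() {
  Block Entry{"entry"};
  Block Then{"then", &Entry, "%cond", Arm::TrueArm};
  Block Inner{"inner", &Then, "%flag", Arm::FalseArm};

  if (auto Conds = collectConditions(&Inner, &Entry))
    for (const Condition &C : *Conds)
      std::cout << C.first << " must be " << (C.second ? "true" : "false")
                << "\n"; // %flag must be false, %cond must be true
  return 0;
}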
+bool ControlConditions::addControlCondition(ControlCondition C) {
+ bool Inserted = false;
+ if (none_of(Conditions, [&](ControlCondition &Exists) {
+ return ControlConditions::isEquivalent(C, Exists);
+ })) {
+ Conditions.push_back(C);
+ Inserted = true;
+ }
+
+ LLVM_DEBUG(dbgs() << (Inserted ? "Inserted " : "Not inserted ") << C << "\n");
+ return Inserted;
+}
+
+bool ControlConditions::isEquivalent(const ControlConditions &Other) const {
+ if (Conditions.empty() && Other.Conditions.empty())
+ return true;
+
+ if (Conditions.size() != Other.Conditions.size())
+ return false;
+
+ return all_of(Conditions, [&](const ControlCondition &C) {
+ return any_of(Other.Conditions, [&](const ControlCondition &OtherC) {
+ return ControlConditions::isEquivalent(C, OtherC);
+ });
+ });
+}
+
+bool ControlConditions::isEquivalent(const ControlCondition &C1,
+ const ControlCondition &C2) {
+ if (C1.getInt() == C2.getInt()) {
+ if (isEquivalent(*C1.getPointer(), *C2.getPointer()))
+ return true;
+ } else if (isInverse(*C1.getPointer(), *C2.getPointer()))
+ return true;
+
+ return false;
+}
+
+// FIXME: Use SCEV and reuse GVN/CSE logic to check for equivalence between
+// Values.
+// Currently, isEquivalent relies on other passes to ensure equivalent
+// conditions have the same value, e.g. GVN.
+bool ControlConditions::isEquivalent(const Value &V1, const Value &V2) {
+ return &V1 == &V2;
+}
+
+bool ControlConditions::isInverse(const Value &V1, const Value &V2) {
+ if (const CmpInst *Cmp1 = dyn_cast<CmpInst>(&V1))
+ if (const CmpInst *Cmp2 = dyn_cast<CmpInst>(&V2)) {
+ if (Cmp1->getPredicate() == Cmp2->getInversePredicate() &&
+ Cmp1->getOperand(0) == Cmp2->getOperand(0) &&
+ Cmp1->getOperand(1) == Cmp2->getOperand(1))
+ return true;
+
+ if (Cmp1->getPredicate() ==
+ CmpInst::getSwappedPredicate(Cmp2->getInversePredicate()) &&
+ Cmp1->getOperand(0) == Cmp2->getOperand(1) &&
+ Cmp1->getOperand(1) == Cmp2->getOperand(0))
+ return true;
+ }
+ return false;
+}
+
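isInverse recognizes two shapes: the same operands under the inverse predicate, or swapped operands under the swapped inverse predicate. Here is a standalone sketch with a toy predicate enum (not CmpInst::Predicate) showing both cases.

#include <iostream>
#include <string>

// A toy subset of comparison predicates; purely illustrative.
enum class Pred { SLT, SGT, SLE, SGE };

static Pred inverseOf(Pred P) {
  switch (P) {
  case Pred::SLT: return Pred::SGE;
  case Pred::SGE: return Pred::SLT;
  case Pred::SGT: return Pred::SLE;
  case Pred::SLE: return Pred::SGT;
  }
  return P;
}

static Pred swappedOf(Pred P) {
  switch (P) {
  case Pred::SLT: return Pred::SGT;
  case Pred::SGT: return Pred::SLT;
  case Pred::SLE: return Pred::SGE;
  case Pred::SGE: return Pred::SLE;
  }
  return P;
}

struct Cmp {
  Pred P;
  std::string LHS, RHS;
};

// Mirrors the two cases checked in isInverse above.
static bool isInverse(const Cmp &A, const Cmp &B) {
  if (A.P == inverseOf(B.P) && A.LHS == B.LHS && A.RHS == B.RHS)
    return true;
  if (A.P == swappedOf(inverseOf(B.P)) && A.LHS == B.RHS && A.RHS == B.LHS)
    return true;
  return false;
}

int main() {
  Cmp A{Pred::SLT, "x", "y"}; // x < y
  Cmp B{Pred::SGE, "x", "y"}; // x >= y, inverse of A with the same operands
  Cmp C{Pred::SLE, "y", "x"}; // y <= x, inverse of A in the swapped form
  std::cout << isInverse(A, B) << " " << isInverse(A, C) << "\n"; // 1 1
  return 0;
}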
+bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT) {
+ return isControlFlowEquivalent(*I0.getParent(), *I1.getParent(), DT, PDT);
+}
+
+bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT) {
+ if (&BB0 == &BB1)
+ return true;
+
+ if ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) ||
+ (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0)))
+ return true;
+
+ // If the set of conditions required to execute BB0 and BB1 from their common
+ // dominator are the same, then BB0 and BB1 are control flow equivalent.
+ const BasicBlock *CommonDominator = DT.findNearestCommonDominator(&BB0, &BB1);
+ LLVM_DEBUG(dbgs() << "The nearest common dominator of " << BB0.getName()
+ << " and " << BB1.getName() << " is "
+ << CommonDominator->getName() << "\n");
+
+ const Optional<ControlConditions> BB0Conditions =
+ ControlConditions::collectControlConditions(BB0, *CommonDominator, DT,
+ PDT);
+ if (BB0Conditions == None)
+ return false;
+
+ const Optional<ControlConditions> BB1Conditions =
+ ControlConditions::collectControlConditions(BB1, *CommonDominator, DT,
+ PDT);
+ if (BB1Conditions == None)
+ return false;
+
+ return BB0Conditions->isEquivalent(*BB1Conditions);
+}
+
+static bool reportInvalidCandidate(const Instruction &I,
+ llvm::Statistic &Stat) {
+ ++Stat;
+ LLVM_DEBUG(dbgs() << "Unable to move instruction: " << I << ". "
+ << Stat.getDesc());
+ return false;
+}
+
+/// Collect all instructions in between \p StartInst and \p EndInst, and store
+/// them in \p InBetweenInsts.
+static void
+collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst,
+ SmallPtrSetImpl<Instruction *> &InBetweenInsts) {
+ assert(InBetweenInsts.empty() && "Expecting InBetweenInsts to be empty");
+
+ /// Get the next instructions of \p I, and push them to \p WorkList.
+ auto getNextInsts = [](Instruction &I,
+ SmallPtrSetImpl<Instruction *> &WorkList) {
+ if (Instruction *NextInst = I.getNextNode())
+ WorkList.insert(NextInst);
+ else {
+ assert(I.isTerminator() && "Expecting a terminator instruction");
+ for (BasicBlock *Succ : successors(&I))
+ WorkList.insert(&Succ->front());
+ }
+ };
+
+ SmallPtrSet<Instruction *, 10> WorkList;
+ getNextInsts(StartInst, WorkList);
+ while (!WorkList.empty()) {
+ Instruction *CurInst = *WorkList.begin();
+ WorkList.erase(CurInst);
+
+ if (CurInst == &EndInst)
+ continue;
+
+ if (!InBetweenInsts.insert(CurInst).second)
+ continue;
+
+ getNextInsts(*CurInst, WorkList);
+ }
+}
+
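collectInstructionsInBetween is a plain worklist walk that stops at the end instruction and deduplicates through the result set. The same traversal over a toy successor map, with invented names:

#include <iostream>
#include <map>
#include <set>
#include <vector>

// Toy program points: each point lists the points reachable in one step
// (the next instruction, or the first point of each successor block).
using Point = int;

static void collectInBetween(Point Start, Point End,
                             const std::map<Point, std::vector<Point>> &Next,
                             std::set<Point> &InBetween) {
  std::set<Point> WorkList(Next.at(Start).begin(), Next.at(Start).end());
  while (!WorkList.empty()) {
    Point Cur = *WorkList.begin();
    WorkList.erase(WorkList.begin());
    if (Cur == End)
      continue;                       // stop at the end point
    if (!InBetween.insert(Cur).second)
      continue;                       // already visited
    auto It = Next.find(Cur);
    if (It != Next.end())
      WorkList.insert(It->second.begin(), It->second.end());
  }
}

int main() {
  // 1 -> 2 -> {3,4}; 3 -> 5; 4 -> 5
  std::map<Point, std::vector<Point>> Next = {
      {1, {2}}, {2, {3, 4}}, {3, {5}}, {4, {5}}, {5, {}}};
  std::set<Point> InBetween;
  collectInBetween(1, 5, Next, InBetween);
  for (Point P : InBetween)
    std::cout << P << " "; // 2 3 4
  std::cout << "\n";
  return 0;
}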
+bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
+ DominatorTree &DT, const PostDominatorTree *PDT,
+ DependenceInfo *DI) {
+ // Skip tests when we don't have PDT or DI
+ if (!PDT || !DI)
+ return false;
+
+ // Cannot move itself before itself.
+ if (&I == &InsertPoint)
+ return false;
+
+ // Not moved.
+ if (I.getNextNode() == &InsertPoint)
+ return true;
+
+ if (isa<PHINode>(I) || isa<PHINode>(InsertPoint))
+ return reportInvalidCandidate(I, NotMovedPHINode);
+
+ if (I.isTerminator())
+ return reportInvalidCandidate(I, NotMovedTerminator);
+
+ // TODO remove this limitation.
+ if (!isControlFlowEquivalent(I, InsertPoint, DT, *PDT))
+ return reportInvalidCandidate(I, NotControlFlowEquivalent);
+
+ if (!DT.dominates(&InsertPoint, &I))
+ for (const Use &U : I.uses())
+ if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
+ if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U))
+ return false;
+ if (!DT.dominates(&I, &InsertPoint))
+ for (const Value *Op : I.operands())
+ if (auto *OpInst = dyn_cast<Instruction>(Op))
+ if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint))
+ return false;
+
+ DT.updateDFSNumbers();
+ const bool MoveForward = domTreeLevelBefore(&DT, &I, &InsertPoint);
+ Instruction &StartInst = (MoveForward ? I : InsertPoint);
+ Instruction &EndInst = (MoveForward ? InsertPoint : I);
+ SmallPtrSet<Instruction *, 10> InstsToCheck;
+ collectInstructionsInBetween(StartInst, EndInst, InstsToCheck);
+ if (!MoveForward)
+ InstsToCheck.insert(&InsertPoint);
+
+  // Check if there exist instructions which may throw, may synchronize, or may
+ // never return, from I to InsertPoint.
+ if (!isSafeToSpeculativelyExecute(&I))
if (llvm::any_of(InstsToCheck, [](Instruction *I) {
if (I->mayThrow())
return true;
-
+
const CallBase *CB = dyn_cast<CallBase>(I);
if (!CB)
return false;
@@ -366,14 +366,14 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
return true;
if (!CB->hasFnAttr(Attribute::NoSync))
return true;
-
+
return false;
})) {
- return reportInvalidCandidate(I, MayThrowException);
- }
-
- // Check if I has any output/flow/anti dependences with instructions from \p
- // StartInst to \p EndInst.
+ return reportInvalidCandidate(I, MayThrowException);
+ }
+
+ // Check if I has any output/flow/anti dependences with instructions from \p
+ // StartInst to \p EndInst.
if (llvm::any_of(InstsToCheck, [&DI, &I](Instruction *CurInst) {
auto DepResult = DI->depends(&I, CurInst, true);
if (DepResult && (DepResult->isOutput() || DepResult->isFlow() ||
@@ -381,45 +381,45 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
return true;
return false;
}))
- return reportInvalidCandidate(I, HasDependences);
-
- return true;
-}
-
-bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint,
- DominatorTree &DT, const PostDominatorTree *PDT,
- DependenceInfo *DI) {
- return llvm::all_of(BB, [&](Instruction &I) {
- if (BB.getTerminator() == &I)
- return true;
-
- return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI);
- });
-}
-
-void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB,
- DominatorTree &DT,
- const PostDominatorTree &PDT,
- DependenceInfo &DI) {
- for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) {
- Instruction *MovePos = ToBB.getFirstNonPHIOrDbg();
- Instruction &I = *It;
- // Increment the iterator before modifying FromBB.
- ++It;
-
- if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
- I.moveBefore(MovePos);
- }
-}
-
-void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB,
- DominatorTree &DT,
- const PostDominatorTree &PDT,
- DependenceInfo &DI) {
- Instruction *MovePos = ToBB.getTerminator();
- while (FromBB.size() > 1) {
- Instruction &I = FromBB.front();
- if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
- I.moveBefore(MovePos);
- }
-}
+ return reportInvalidCandidate(I, HasDependences);
+
+ return true;
+}
+
+bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint,
+ DominatorTree &DT, const PostDominatorTree *PDT,
+ DependenceInfo *DI) {
+ return llvm::all_of(BB, [&](Instruction &I) {
+ if (BB.getTerminator() == &I)
+ return true;
+
+ return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI);
+ });
+}
+
+void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB,
+ DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ DependenceInfo &DI) {
+ for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) {
+ Instruction *MovePos = ToBB.getFirstNonPHIOrDbg();
+ Instruction &I = *It;
+ // Increment the iterator before modifying FromBB.
+ ++It;
+
+ if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
+ I.moveBefore(MovePos);
+ }
+}
+
+void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB,
+ DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ DependenceInfo &DI) {
+ Instruction *MovePos = ToBB.getTerminator();
+ while (FromBB.size() > 1) {
+ Instruction &I = FromBB.front();
+ if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
+ I.moveBefore(MovePos);
+ }
+}
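
moveInstructionsToTheBeginning advances its iterator before it moves the current instruction, so the move can never invalidate the iterator still in use. The same discipline is shown below on a plain std::list, simplified to a forward walk instead of LLVM's reverse ilist iteration; the Safe predicate is a made-up stand-in for isSafeToMoveBefore.

#include <iostream>
#include <iterator>
#include <list>

int main() {
  // Pretend the last element of From is the block terminator and stays put.
  std::list<int> From = {1, 2, 3, 4};
  std::list<int> To = {100};

  for (auto It = From.begin(); It != std::prev(From.end());) {
    auto Cur = It;
    ++It;                             // advance before mutating the list
    bool Safe = (*Cur % 2 == 1);      // stand-in for the safety check
    if (Safe)
      To.splice(To.end(), From, Cur); // the move cannot invalidate It
  }

  for (int V : From) std::cout << V << ' '; // 2 4
  std::cout << "| ";
  for (int V : To) std::cout << V << ' ';   // 100 1 3
  std::cout << '\n';
  return 0;
}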
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp
index b973b7709b..069a86f6ab 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp
@@ -1,159 +1,159 @@
-//===- CtorUtils.cpp - Helpers for working with global_ctors ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines functions that are used to process llvm.global_ctors.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CtorUtils.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define DEBUG_TYPE "ctor_utils"
-
-using namespace llvm;
-
-/// Given a specified llvm.global_ctors list, remove the listed elements.
-static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
- // Filter out the initializer elements to remove.
- ConstantArray *OldCA = cast<ConstantArray>(GCL->getInitializer());
- SmallVector<Constant *, 10> CAList;
- for (unsigned I = 0, E = OldCA->getNumOperands(); I < E; ++I)
- if (!CtorsToRemove.test(I))
- CAList.push_back(OldCA->getOperand(I));
-
- // Create the new array initializer.
- ArrayType *ATy =
- ArrayType::get(OldCA->getType()->getElementType(), CAList.size());
- Constant *CA = ConstantArray::get(ATy, CAList);
-
- // If we didn't change the number of elements, don't create a new GV.
- if (CA->getType() == OldCA->getType()) {
- GCL->setInitializer(CA);
- return;
- }
-
- // Create the new global and insert it next to the existing list.
- GlobalVariable *NGV =
- new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(),
- CA, "", GCL->getThreadLocalMode());
- GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV);
- NGV->takeName(GCL);
-
- // Nuke the old list, replacing any uses with the new one.
- if (!GCL->use_empty()) {
- Constant *V = NGV;
- if (V->getType() != GCL->getType())
- V = ConstantExpr::getBitCast(V, GCL->getType());
- GCL->replaceAllUsesWith(V);
- }
- GCL->eraseFromParent();
-}
-
-/// Given a llvm.global_ctors list that we can understand,
-/// return a list of the functions and null terminator as a vector.
-static std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
- if (GV->getInitializer()->isNullValue())
- return std::vector<Function *>();
- ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
- std::vector<Function *> Result;
- Result.reserve(CA->getNumOperands());
- for (auto &V : CA->operands()) {
- ConstantStruct *CS = cast<ConstantStruct>(V);
- Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
- }
- return Result;
-}
-
-/// Find the llvm.global_ctors list, verifying that all initializers have an
-/// init priority of 65535.
-static GlobalVariable *findGlobalCtors(Module &M) {
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return nullptr;
-
- // Verify that the initializer is simple enough for us to handle. We are
- // only allowed to optimize the initializer if it is unique.
- if (!GV->hasUniqueInitializer())
- return nullptr;
-
- if (isa<ConstantAggregateZero>(GV->getInitializer()))
- return GV;
- ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
-
- for (auto &V : CA->operands()) {
- if (isa<ConstantAggregateZero>(V))
- continue;
- ConstantStruct *CS = cast<ConstantStruct>(V);
- if (isa<ConstantPointerNull>(CS->getOperand(1)))
- continue;
-
- // Must have a function or null ptr.
- if (!isa<Function>(CS->getOperand(1)))
- return nullptr;
-
- // Init priority must be standard.
- ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0));
- if (CI->getZExtValue() != 65535)
- return nullptr;
- }
-
- return GV;
-}
-
-/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the
-/// entries for which it returns true. Return true if anything changed.
-bool llvm::optimizeGlobalCtorsList(
- Module &M, function_ref<bool(Function *)> ShouldRemove) {
- GlobalVariable *GlobalCtors = findGlobalCtors(M);
- if (!GlobalCtors)
- return false;
-
- std::vector<Function *> Ctors = parseGlobalCtors(GlobalCtors);
- if (Ctors.empty())
- return false;
-
- bool MadeChange = false;
-
- // Loop over global ctors, optimizing them when we can.
- unsigned NumCtors = Ctors.size();
- BitVector CtorsToRemove(NumCtors);
- for (unsigned i = 0; i != Ctors.size() && NumCtors > 0; ++i) {
- Function *F = Ctors[i];
- // Found a null terminator in the middle of the list, prune off the rest of
- // the list.
- if (!F)
- continue;
-
- LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
-
- // We cannot simplify external ctor functions.
- if (F->empty())
- continue;
-
- // If we can evaluate the ctor at compile time, do.
- if (ShouldRemove(F)) {
- Ctors[i] = nullptr;
- CtorsToRemove.set(i);
- NumCtors--;
- MadeChange = true;
- continue;
- }
- }
-
- if (!MadeChange)
- return false;
-
- removeGlobalCtors(GlobalCtors, CtorsToRemove);
- return true;
-}
+//===- CtorUtils.cpp - Helpers for working with global_ctors ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that are used to process llvm.global_ctors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "ctor_utils"
+
+using namespace llvm;
+
+/// Given a specified llvm.global_ctors list, remove the listed elements.
+static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
+ // Filter out the initializer elements to remove.
+ ConstantArray *OldCA = cast<ConstantArray>(GCL->getInitializer());
+ SmallVector<Constant *, 10> CAList;
+ for (unsigned I = 0, E = OldCA->getNumOperands(); I < E; ++I)
+ if (!CtorsToRemove.test(I))
+ CAList.push_back(OldCA->getOperand(I));
+
+ // Create the new array initializer.
+ ArrayType *ATy =
+ ArrayType::get(OldCA->getType()->getElementType(), CAList.size());
+ Constant *CA = ConstantArray::get(ATy, CAList);
+
+ // If we didn't change the number of elements, don't create a new GV.
+ if (CA->getType() == OldCA->getType()) {
+ GCL->setInitializer(CA);
+ return;
+ }
+
+ // Create the new global and insert it next to the existing list.
+ GlobalVariable *NGV =
+ new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(),
+ CA, "", GCL->getThreadLocalMode());
+ GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV);
+ NGV->takeName(GCL);
+
+ // Nuke the old list, replacing any uses with the new one.
+ if (!GCL->use_empty()) {
+ Constant *V = NGV;
+ if (V->getType() != GCL->getType())
+ V = ConstantExpr::getBitCast(V, GCL->getType());
+ GCL->replaceAllUsesWith(V);
+ }
+ GCL->eraseFromParent();
+}
+
+/// Given a llvm.global_ctors list that we can understand,
+/// return a list of the functions and null terminator as a vector.
+static std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
+ if (GV->getInitializer()->isNullValue())
+ return std::vector<Function *>();
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+ std::vector<Function *> Result;
+ Result.reserve(CA->getNumOperands());
+ for (auto &V : CA->operands()) {
+ ConstantStruct *CS = cast<ConstantStruct>(V);
+ Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+ }
+ return Result;
+}
+
+/// Find the llvm.global_ctors list, verifying that all initializers have an
+/// init priority of 65535.
+static GlobalVariable *findGlobalCtors(Module &M) {
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return nullptr;
+
+ // Verify that the initializer is simple enough for us to handle. We are
+ // only allowed to optimize the initializer if it is unique.
+ if (!GV->hasUniqueInitializer())
+ return nullptr;
+
+ if (isa<ConstantAggregateZero>(GV->getInitializer()))
+ return GV;
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+
+ for (auto &V : CA->operands()) {
+ if (isa<ConstantAggregateZero>(V))
+ continue;
+ ConstantStruct *CS = cast<ConstantStruct>(V);
+ if (isa<ConstantPointerNull>(CS->getOperand(1)))
+ continue;
+
+ // Must have a function or null ptr.
+ if (!isa<Function>(CS->getOperand(1)))
+ return nullptr;
+
+ // Init priority must be standard.
+ ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0));
+ if (CI->getZExtValue() != 65535)
+ return nullptr;
+ }
+
+ return GV;
+}
+
+/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the
+/// entries for which it returns true. Return true if anything changed.
+bool llvm::optimizeGlobalCtorsList(
+ Module &M, function_ref<bool(Function *)> ShouldRemove) {
+ GlobalVariable *GlobalCtors = findGlobalCtors(M);
+ if (!GlobalCtors)
+ return false;
+
+ std::vector<Function *> Ctors = parseGlobalCtors(GlobalCtors);
+ if (Ctors.empty())
+ return false;
+
+ bool MadeChange = false;
+
+ // Loop over global ctors, optimizing them when we can.
+ unsigned NumCtors = Ctors.size();
+ BitVector CtorsToRemove(NumCtors);
+ for (unsigned i = 0; i != Ctors.size() && NumCtors > 0; ++i) {
+ Function *F = Ctors[i];
+ // Found a null terminator in the middle of the list, prune off the rest of
+ // the list.
+ if (!F)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
+
+ // We cannot simplify external ctor functions.
+ if (F->empty())
+ continue;
+
+ // If we can evaluate the ctor at compile time, do.
+ if (ShouldRemove(F)) {
+ Ctors[i] = nullptr;
+ CtorsToRemove.set(i);
+ NumCtors--;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ if (!MadeChange)
+ return false;
+
+ removeGlobalCtors(GlobalCtors, CtorsToRemove);
+ return true;
+}
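A minimal caller sketch for the CtorUtils entry point above, assuming only the optimizeGlobalCtorsList signature from llvm/Transforms/Utils/CtorUtils.h as included by this file; the dropTrivialCtors helper and its "single ret block" predicate are illustrative and not part of this commit.

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/Utils/CtorUtils.h"

  // Remove global constructors that provably do nothing: a single entry block
  // whose only instruction is a return. Returns true if llvm.global_ctors was
  // rewritten (entries pruned or the array replaced).
  static bool dropTrivialCtors(llvm::Module &M) {
    return llvm::optimizeGlobalCtorsList(M, [](llvm::Function *F) {
      // ShouldRemove callback: keep external and multi-block ctors.
      if (F->empty() || F->size() != 1)
        return false;
      const llvm::BasicBlock &Entry = F->getEntryBlock();
      return Entry.size() == 1 && llvm::isa<llvm::ReturnInst>(Entry.front());
    });
  }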
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp
index 9cbea67a55..3e4d53c10d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp
@@ -1,204 +1,204 @@
-//===- Debugify.cpp - Attach synthetic debug info to everything -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file This pass attaches synthetic debug info to everything. It can be used
-/// to create targeted tests for debug info preservation.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Debugify.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+//===- Debugify.cpp - Attach synthetic debug info to everything -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This pass attaches synthetic debug info to everything. It can be used
+/// to create targeted tests for debug info preservation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Debugify.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassInstrumentation.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-namespace {
-
-cl::opt<bool> Quiet("debugify-quiet",
- cl::desc("Suppress verbose debugify output"));
-
-enum class Level {
- Locations,
- LocationsAndVariables
-};
-cl::opt<Level> DebugifyLevel(
- "debugify-level", cl::desc("Kind of debug info to add"),
- cl::values(clEnumValN(Level::Locations, "locations", "Locations only"),
- clEnumValN(Level::LocationsAndVariables, "location+variables",
- "Locations and Variables")),
- cl::init(Level::LocationsAndVariables));
-
-raw_ostream &dbg() { return Quiet ? nulls() : errs(); }
-
-uint64_t getAllocSizeInBits(Module &M, Type *Ty) {
- return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0;
-}
-
-bool isFunctionSkipped(Function &F) {
- return F.isDeclaration() || !F.hasExactDefinition();
-}
-
-/// Find the basic block's terminating instruction.
-///
-/// Special care is needed to handle musttail and deopt calls, as these behave
-/// like (but are in fact not) terminators.
-Instruction *findTerminatingInstruction(BasicBlock &BB) {
- if (auto *I = BB.getTerminatingMustTailCall())
- return I;
- if (auto *I = BB.getTerminatingDeoptimizeCall())
- return I;
- return BB.getTerminator();
-}
-} // end anonymous namespace
-
-bool llvm::applyDebugifyMetadata(
- Module &M, iterator_range<Module::iterator> Functions, StringRef Banner,
- std::function<bool(DIBuilder &DIB, Function &F)> ApplyToMF) {
- // Skip modules with debug info.
- if (M.getNamedMetadata("llvm.dbg.cu")) {
- dbg() << Banner << "Skipping module with debug info\n";
- return false;
- }
-
- DIBuilder DIB(M);
- LLVMContext &Ctx = M.getContext();
- auto *Int32Ty = Type::getInt32Ty(Ctx);
-
- // Get a DIType which corresponds to Ty.
- DenseMap<uint64_t, DIType *> TypeCache;
- auto getCachedDIType = [&](Type *Ty) -> DIType * {
- uint64_t Size = getAllocSizeInBits(M, Ty);
- DIType *&DTy = TypeCache[Size];
- if (!DTy) {
- std::string Name = "ty" + utostr(Size);
- DTy = DIB.createBasicType(Name, Size, dwarf::DW_ATE_unsigned);
- }
- return DTy;
- };
-
- unsigned NextLine = 1;
- unsigned NextVar = 1;
- auto File = DIB.createFile(M.getName(), "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify",
- /*isOptimized=*/true, "", 0);
-
- // Visit each instruction.
- for (Function &F : Functions) {
- if (isFunctionSkipped(F))
- continue;
-
- bool InsertedDbgVal = false;
- auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
- DISubprogram::DISPFlags SPFlags =
- DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized;
- if (F.hasPrivateLinkage() || F.hasInternalLinkage())
- SPFlags |= DISubprogram::SPFlagLocalToUnit;
- auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine,
- SPType, NextLine, DINode::FlagZero, SPFlags);
- F.setSubprogram(SP);
-
- // Helper that inserts a dbg.value before \p InsertBefore, copying the
- // location (and possibly the type, if it's non-void) from \p TemplateInst.
- auto insertDbgVal = [&](Instruction &TemplateInst,
- Instruction *InsertBefore) {
- std::string Name = utostr(NextVar++);
- Value *V = &TemplateInst;
- if (TemplateInst.getType()->isVoidTy())
- V = ConstantInt::get(Int32Ty, 0);
- const DILocation *Loc = TemplateInst.getDebugLoc().get();
- auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(),
- getCachedDIType(V->getType()),
- /*AlwaysPreserve=*/true);
- DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc,
- InsertBefore);
- };
-
- for (BasicBlock &BB : F) {
- // Attach debug locations.
- for (Instruction &I : BB)
- I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
-
- if (DebugifyLevel < Level::LocationsAndVariables)
- continue;
-
- // Inserting debug values into EH pads can break IR invariants.
- if (BB.isEHPad())
- continue;
-
- // Find the terminating instruction, after which no debug values are
- // attached.
- Instruction *LastInst = findTerminatingInstruction(BB);
- assert(LastInst && "Expected basic block with a terminator");
-
- // Maintain an insertion point which can't be invalidated when updates
- // are made.
- BasicBlock::iterator InsertPt = BB.getFirstInsertionPt();
- assert(InsertPt != BB.end() && "Expected to find an insertion point");
- Instruction *InsertBefore = &*InsertPt;
-
- // Attach debug values.
- for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) {
- // Skip void-valued instructions.
- if (I->getType()->isVoidTy())
- continue;
-
- // Phis and EH pads must be grouped at the beginning of the block.
- // Only advance the insertion point when we finish visiting these.
- if (!isa<PHINode>(I) && !I->isEHPad())
- InsertBefore = I->getNextNode();
-
- insertDbgVal(*I, InsertBefore);
- InsertedDbgVal = true;
- }
- }
- // Make sure we emit at least one dbg.value, otherwise MachineDebugify may
- // not have anything to work with as it goes about inserting DBG_VALUEs.
- // (It's common for MIR tests to be written containing skeletal IR with
- // empty functions -- we're still interested in debugifying the MIR within
- // those tests, and this helps with that.)
- if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) {
- auto *Term = findTerminatingInstruction(F.getEntryBlock());
- insertDbgVal(*Term, Term);
- }
- if (ApplyToMF)
- ApplyToMF(DIB, F);
- DIB.finalizeSubprogram(SP);
- }
- DIB.finalize();
-
- // Track the number of distinct lines and variables.
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify");
- auto addDebugifyOperand = [&](unsigned N) {
- NMD->addOperand(MDNode::get(
- Ctx, ValueAsMetadata::getConstant(ConstantInt::get(Int32Ty, N))));
- };
- addDebugifyOperand(NextLine - 1); // Original number of lines.
- addDebugifyOperand(NextVar - 1); // Original number of variables.
- assert(NMD->getNumOperands() == 2 &&
- "llvm.debugify should have exactly 2 operands!");
-
- // Claim that this synthetic debug info is valid.
- StringRef DIVersionKey = "Debug Info Version";
- if (!M.getModuleFlag(DIVersionKey))
- M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION);
-
- return true;
-}
-
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt<bool> Quiet("debugify-quiet",
+ cl::desc("Suppress verbose debugify output"));
+
+enum class Level {
+ Locations,
+ LocationsAndVariables
+};
+cl::opt<Level> DebugifyLevel(
+ "debugify-level", cl::desc("Kind of debug info to add"),
+ cl::values(clEnumValN(Level::Locations, "locations", "Locations only"),
+ clEnumValN(Level::LocationsAndVariables, "location+variables",
+ "Locations and Variables")),
+ cl::init(Level::LocationsAndVariables));
+
+raw_ostream &dbg() { return Quiet ? nulls() : errs(); }
+
+uint64_t getAllocSizeInBits(Module &M, Type *Ty) {
+ return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0;
+}
+
+bool isFunctionSkipped(Function &F) {
+ return F.isDeclaration() || !F.hasExactDefinition();
+}
+
+/// Find the basic block's terminating instruction.
+///
+/// Special care is needed to handle musttail and deopt calls, as these behave
+/// like (but are in fact not) terminators.
+Instruction *findTerminatingInstruction(BasicBlock &BB) {
+ if (auto *I = BB.getTerminatingMustTailCall())
+ return I;
+ if (auto *I = BB.getTerminatingDeoptimizeCall())
+ return I;
+ return BB.getTerminator();
+}
+} // end anonymous namespace
+
+bool llvm::applyDebugifyMetadata(
+ Module &M, iterator_range<Module::iterator> Functions, StringRef Banner,
+ std::function<bool(DIBuilder &DIB, Function &F)> ApplyToMF) {
+ // Skip modules with debug info.
+ if (M.getNamedMetadata("llvm.dbg.cu")) {
+ dbg() << Banner << "Skipping module with debug info\n";
+ return false;
+ }
+
+ DIBuilder DIB(M);
+ LLVMContext &Ctx = M.getContext();
+ auto *Int32Ty = Type::getInt32Ty(Ctx);
+
+ // Get a DIType which corresponds to Ty.
+ DenseMap<uint64_t, DIType *> TypeCache;
+ auto getCachedDIType = [&](Type *Ty) -> DIType * {
+ uint64_t Size = getAllocSizeInBits(M, Ty);
+ DIType *&DTy = TypeCache[Size];
+ if (!DTy) {
+ std::string Name = "ty" + utostr(Size);
+ DTy = DIB.createBasicType(Name, Size, dwarf::DW_ATE_unsigned);
+ }
+ return DTy;
+ };
+
+ unsigned NextLine = 1;
+ unsigned NextVar = 1;
+ auto File = DIB.createFile(M.getName(), "/");
+ auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify",
+ /*isOptimized=*/true, "", 0);
+
+ // Visit each instruction.
+ for (Function &F : Functions) {
+ if (isFunctionSkipped(F))
+ continue;
+
+ bool InsertedDbgVal = false;
+ auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
+ DISubprogram::DISPFlags SPFlags =
+ DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized;
+ if (F.hasPrivateLinkage() || F.hasInternalLinkage())
+ SPFlags |= DISubprogram::SPFlagLocalToUnit;
+ auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine,
+ SPType, NextLine, DINode::FlagZero, SPFlags);
+ F.setSubprogram(SP);
+
+ // Helper that inserts a dbg.value before \p InsertBefore, copying the
+ // location (and possibly the type, if it's non-void) from \p TemplateInst.
+ auto insertDbgVal = [&](Instruction &TemplateInst,
+ Instruction *InsertBefore) {
+ std::string Name = utostr(NextVar++);
+ Value *V = &TemplateInst;
+ if (TemplateInst.getType()->isVoidTy())
+ V = ConstantInt::get(Int32Ty, 0);
+ const DILocation *Loc = TemplateInst.getDebugLoc().get();
+ auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(),
+ getCachedDIType(V->getType()),
+ /*AlwaysPreserve=*/true);
+ DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc,
+ InsertBefore);
+ };
+
+ for (BasicBlock &BB : F) {
+ // Attach debug locations.
+ for (Instruction &I : BB)
+ I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
+
+ if (DebugifyLevel < Level::LocationsAndVariables)
+ continue;
+
+ // Inserting debug values into EH pads can break IR invariants.
+ if (BB.isEHPad())
+ continue;
+
+ // Find the terminating instruction, after which no debug values are
+ // attached.
+ Instruction *LastInst = findTerminatingInstruction(BB);
+ assert(LastInst && "Expected basic block with a terminator");
+
+ // Maintain an insertion point which can't be invalidated when updates
+ // are made.
+ BasicBlock::iterator InsertPt = BB.getFirstInsertionPt();
+ assert(InsertPt != BB.end() && "Expected to find an insertion point");
+ Instruction *InsertBefore = &*InsertPt;
+
+ // Attach debug values.
+ for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) {
+ // Skip void-valued instructions.
+ if (I->getType()->isVoidTy())
+ continue;
+
+ // Phis and EH pads must be grouped at the beginning of the block.
+ // Only advance the insertion point when we finish visiting these.
+ if (!isa<PHINode>(I) && !I->isEHPad())
+ InsertBefore = I->getNextNode();
+
+ insertDbgVal(*I, InsertBefore);
+ InsertedDbgVal = true;
+ }
+ }
+ // Make sure we emit at least one dbg.value, otherwise MachineDebugify may
+ // not have anything to work with as it goes about inserting DBG_VALUEs.
+ // (It's common for MIR tests to be written containing skeletal IR with
+ // empty functions -- we're still interested in debugifying the MIR within
+ // those tests, and this helps with that.)
+ if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) {
+ auto *Term = findTerminatingInstruction(F.getEntryBlock());
+ insertDbgVal(*Term, Term);
+ }
+ if (ApplyToMF)
+ ApplyToMF(DIB, F);
+ DIB.finalizeSubprogram(SP);
+ }
+ DIB.finalize();
+
+ // Track the number of distinct lines and variables.
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify");
+ auto addDebugifyOperand = [&](unsigned N) {
+ NMD->addOperand(MDNode::get(
+ Ctx, ValueAsMetadata::getConstant(ConstantInt::get(Int32Ty, N))));
+ };
+ addDebugifyOperand(NextLine - 1); // Original number of lines.
+ addDebugifyOperand(NextVar - 1); // Original number of variables.
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.debugify should have exactly 2 operands!");
+
+ // Claim that this synthetic debug info is valid.
+ StringRef DIVersionKey = "Debug Info Version";
+ if (!M.getModuleFlag(DIVersionKey))
+ M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION);
+
+ return true;
+}
+
static bool applyDebugify(Function &F) {
Module &M = *F.getParent();
auto FuncIt = F.getIterator();
@@ -211,270 +211,270 @@ static bool applyDebugify(Module &M) {
"ModuleDebugify: ", /*ApplyToMF=*/nullptr);
}
-bool llvm::stripDebugifyMetadata(Module &M) {
- bool Changed = false;
-
- // Remove the llvm.debugify module-level named metadata.
- NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify");
- if (DebugifyMD) {
- M.eraseNamedMetadata(DebugifyMD);
- Changed = true;
- }
-
- // Strip out all debug intrinsics and supporting metadata (subprograms, types,
- // variables, etc).
- Changed |= StripDebugInfo(M);
-
- // Strip out the dead dbg.value prototype.
- Function *DbgValF = M.getFunction("llvm.dbg.value");
- if (DbgValF) {
- assert(DbgValF->isDeclaration() && DbgValF->use_empty() &&
- "Not all debug info stripped?");
- DbgValF->eraseFromParent();
- Changed = true;
- }
-
- // Strip out the module-level Debug Info Version metadata.
- // FIXME: There must be an easier way to remove an operand from a NamedMDNode.
- NamedMDNode *NMD = M.getModuleFlagsMetadata();
- if (!NMD)
- return Changed;
+bool llvm::stripDebugifyMetadata(Module &M) {
+ bool Changed = false;
+
+ // Remove the llvm.debugify module-level named metadata.
+ NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify");
+ if (DebugifyMD) {
+ M.eraseNamedMetadata(DebugifyMD);
+ Changed = true;
+ }
+
+ // Strip out all debug intrinsics and supporting metadata (subprograms, types,
+ // variables, etc).
+ Changed |= StripDebugInfo(M);
+
+ // Strip out the dead dbg.value prototype.
+ Function *DbgValF = M.getFunction("llvm.dbg.value");
+ if (DbgValF) {
+ assert(DbgValF->isDeclaration() && DbgValF->use_empty() &&
+ "Not all debug info stripped?");
+ DbgValF->eraseFromParent();
+ Changed = true;
+ }
+
+ // Strip out the module-level Debug Info Version metadata.
+ // FIXME: There must be an easier way to remove an operand from a NamedMDNode.
+ NamedMDNode *NMD = M.getModuleFlagsMetadata();
+ if (!NMD)
+ return Changed;
SmallVector<MDNode *, 4> Flags(NMD->operands());
- NMD->clearOperands();
- for (MDNode *Flag : Flags) {
- MDString *Key = dyn_cast_or_null<MDString>(Flag->getOperand(1));
- if (Key->getString() == "Debug Info Version") {
- Changed = true;
- continue;
- }
- NMD->addOperand(Flag);
- }
- // If we left it empty we might as well remove it.
- if (NMD->getNumOperands() == 0)
- NMD->eraseFromParent();
-
- return Changed;
-}
-
-namespace {
-/// Return true if a mis-sized diagnostic is issued for \p DVI.
-bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) {
- // The size of a dbg.value's value operand should match the size of the
- // variable it corresponds to.
- //
- // TODO: This, along with a check for non-null value operands, should be
- // promoted to verifier failures.
- Value *V = DVI->getValue();
- if (!V)
- return false;
-
- // For now, don't try to interpret anything more complicated than an empty
- // DIExpression. Eventually we should try to handle OP_deref and fragments.
- if (DVI->getExpression()->getNumElements())
- return false;
-
- Type *Ty = V->getType();
- uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty);
- Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits();
- if (!ValueOperandSize || !DbgVarSize)
- return false;
-
- bool HasBadSize = false;
- if (Ty->isIntegerTy()) {
- auto Signedness = DVI->getVariable()->getSignedness();
- if (Signedness && *Signedness == DIBasicType::Signedness::Signed)
- HasBadSize = ValueOperandSize < *DbgVarSize;
- } else {
- HasBadSize = ValueOperandSize != *DbgVarSize;
- }
-
- if (HasBadSize) {
- dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize
- << ", but its variable has size " << *DbgVarSize << ": ";
- DVI->print(dbg());
- dbg() << "\n";
- }
- return HasBadSize;
-}
-
-bool checkDebugifyMetadata(Module &M,
- iterator_range<Module::iterator> Functions,
- StringRef NameOfWrappedPass, StringRef Banner,
- bool Strip, DebugifyStatsMap *StatsMap) {
- // Skip modules without debugify metadata.
- NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify");
- if (!NMD) {
- dbg() << Banner << ": Skipping module without debugify metadata\n";
- return false;
- }
-
- auto getDebugifyOperand = [&](unsigned Idx) -> unsigned {
- return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0))
- ->getZExtValue();
- };
- assert(NMD->getNumOperands() == 2 &&
- "llvm.debugify should have exactly 2 operands!");
- unsigned OriginalNumLines = getDebugifyOperand(0);
- unsigned OriginalNumVars = getDebugifyOperand(1);
- bool HasErrors = false;
-
- // Track debug info loss statistics if able.
- DebugifyStatistics *Stats = nullptr;
- if (StatsMap && !NameOfWrappedPass.empty())
- Stats = &StatsMap->operator[](NameOfWrappedPass);
-
- BitVector MissingLines{OriginalNumLines, true};
- BitVector MissingVars{OriginalNumVars, true};
- for (Function &F : Functions) {
- if (isFunctionSkipped(F))
- continue;
-
- // Find missing lines.
- for (Instruction &I : instructions(F)) {
- if (isa<DbgValueInst>(&I) || isa<PHINode>(&I))
- continue;
-
- auto DL = I.getDebugLoc();
- if (DL && DL.getLine() != 0) {
- MissingLines.reset(DL.getLine() - 1);
- continue;
- }
-
- if (!DL) {
- dbg() << "WARNING: Instruction with empty DebugLoc in function ";
- dbg() << F.getName() << " --";
- I.print(dbg());
- dbg() << "\n";
- }
- }
-
- // Find missing variables and mis-sized debug values.
- for (Instruction &I : instructions(F)) {
- auto *DVI = dyn_cast<DbgValueInst>(&I);
- if (!DVI)
- continue;
-
- unsigned Var = ~0U;
- (void)to_integer(DVI->getVariable()->getName(), Var, 10);
- assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable");
- bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI);
- if (!HasBadSize)
- MissingVars.reset(Var - 1);
- HasErrors |= HasBadSize;
- }
- }
-
- // Print the results.
- for (unsigned Idx : MissingLines.set_bits())
- dbg() << "WARNING: Missing line " << Idx + 1 << "\n";
-
- for (unsigned Idx : MissingVars.set_bits())
- dbg() << "WARNING: Missing variable " << Idx + 1 << "\n";
-
- // Update DI loss statistics.
- if (Stats) {
- Stats->NumDbgLocsExpected += OriginalNumLines;
- Stats->NumDbgLocsMissing += MissingLines.count();
- Stats->NumDbgValuesExpected += OriginalNumVars;
- Stats->NumDbgValuesMissing += MissingVars.count();
- }
-
- dbg() << Banner;
- if (!NameOfWrappedPass.empty())
- dbg() << " [" << NameOfWrappedPass << "]";
- dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n';
-
- // Strip debugify metadata if required.
- if (Strip)
- return stripDebugifyMetadata(M);
-
- return false;
-}
-
-/// ModulePass for attaching synthetic debug info to everything, used with the
-/// legacy module pass manager.
-struct DebugifyModulePass : public ModulePass {
+ NMD->clearOperands();
+ for (MDNode *Flag : Flags) {
+ MDString *Key = dyn_cast_or_null<MDString>(Flag->getOperand(1));
+ if (Key->getString() == "Debug Info Version") {
+ Changed = true;
+ continue;
+ }
+ NMD->addOperand(Flag);
+ }
+ // If we left it empty we might as well remove it.
+ if (NMD->getNumOperands() == 0)
+ NMD->eraseFromParent();
+
+ return Changed;
+}
+
+namespace {
+/// Return true if a mis-sized diagnostic is issued for \p DVI.
+bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) {
+ // The size of a dbg.value's value operand should match the size of the
+ // variable it corresponds to.
+ //
+ // TODO: This, along with a check for non-null value operands, should be
+ // promoted to verifier failures.
+ Value *V = DVI->getValue();
+ if (!V)
+ return false;
+
+ // For now, don't try to interpret anything more complicated than an empty
+ // DIExpression. Eventually we should try to handle OP_deref and fragments.
+ if (DVI->getExpression()->getNumElements())
+ return false;
+
+ Type *Ty = V->getType();
+ uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty);
+ Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits();
+ if (!ValueOperandSize || !DbgVarSize)
+ return false;
+
+ bool HasBadSize = false;
+ if (Ty->isIntegerTy()) {
+ auto Signedness = DVI->getVariable()->getSignedness();
+ if (Signedness && *Signedness == DIBasicType::Signedness::Signed)
+ HasBadSize = ValueOperandSize < *DbgVarSize;
+ } else {
+ HasBadSize = ValueOperandSize != *DbgVarSize;
+ }
+
+ if (HasBadSize) {
+ dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize
+ << ", but its variable has size " << *DbgVarSize << ": ";
+ DVI->print(dbg());
+ dbg() << "\n";
+ }
+ return HasBadSize;
+}
+
+bool checkDebugifyMetadata(Module &M,
+ iterator_range<Module::iterator> Functions,
+ StringRef NameOfWrappedPass, StringRef Banner,
+ bool Strip, DebugifyStatsMap *StatsMap) {
+ // Skip modules without debugify metadata.
+ NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify");
+ if (!NMD) {
+ dbg() << Banner << ": Skipping module without debugify metadata\n";
+ return false;
+ }
+
+ auto getDebugifyOperand = [&](unsigned Idx) -> unsigned {
+ return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0))
+ ->getZExtValue();
+ };
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.debugify should have exactly 2 operands!");
+ unsigned OriginalNumLines = getDebugifyOperand(0);
+ unsigned OriginalNumVars = getDebugifyOperand(1);
+ bool HasErrors = false;
+
+ // Track debug info loss statistics if able.
+ DebugifyStatistics *Stats = nullptr;
+ if (StatsMap && !NameOfWrappedPass.empty())
+ Stats = &StatsMap->operator[](NameOfWrappedPass);
+
+ BitVector MissingLines{OriginalNumLines, true};
+ BitVector MissingVars{OriginalNumVars, true};
+ for (Function &F : Functions) {
+ if (isFunctionSkipped(F))
+ continue;
+
+ // Find missing lines.
+ for (Instruction &I : instructions(F)) {
+ if (isa<DbgValueInst>(&I) || isa<PHINode>(&I))
+ continue;
+
+ auto DL = I.getDebugLoc();
+ if (DL && DL.getLine() != 0) {
+ MissingLines.reset(DL.getLine() - 1);
+ continue;
+ }
+
+ if (!DL) {
+ dbg() << "WARNING: Instruction with empty DebugLoc in function ";
+ dbg() << F.getName() << " --";
+ I.print(dbg());
+ dbg() << "\n";
+ }
+ }
+
+ // Find missing variables and mis-sized debug values.
+ for (Instruction &I : instructions(F)) {
+ auto *DVI = dyn_cast<DbgValueInst>(&I);
+ if (!DVI)
+ continue;
+
+ unsigned Var = ~0U;
+ (void)to_integer(DVI->getVariable()->getName(), Var, 10);
+ assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable");
+ bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI);
+ if (!HasBadSize)
+ MissingVars.reset(Var - 1);
+ HasErrors |= HasBadSize;
+ }
+ }
+
+ // Print the results.
+ for (unsigned Idx : MissingLines.set_bits())
+ dbg() << "WARNING: Missing line " << Idx + 1 << "\n";
+
+ for (unsigned Idx : MissingVars.set_bits())
+ dbg() << "WARNING: Missing variable " << Idx + 1 << "\n";
+
+ // Update DI loss statistics.
+ if (Stats) {
+ Stats->NumDbgLocsExpected += OriginalNumLines;
+ Stats->NumDbgLocsMissing += MissingLines.count();
+ Stats->NumDbgValuesExpected += OriginalNumVars;
+ Stats->NumDbgValuesMissing += MissingVars.count();
+ }
+
+ dbg() << Banner;
+ if (!NameOfWrappedPass.empty())
+ dbg() << " [" << NameOfWrappedPass << "]";
+ dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n';
+
+ // Strip debugify metadata if required.
+ if (Strip)
+ return stripDebugifyMetadata(M);
+
+ return false;
+}
+
+/// ModulePass for attaching synthetic debug info to everything, used with the
+/// legacy module pass manager.
+struct DebugifyModulePass : public ModulePass {
bool runOnModule(Module &M) override { return applyDebugify(M); }
-
- DebugifyModulePass() : ModulePass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-};
-
-/// FunctionPass for attaching synthetic debug info to instructions within a
-/// single function, used with the legacy module pass manager.
-struct DebugifyFunctionPass : public FunctionPass {
+
+ DebugifyModulePass() : ModulePass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+};
+
+/// FunctionPass for attaching synthetic debug info to instructions within a
+/// single function, used with the legacy module pass manager.
+struct DebugifyFunctionPass : public FunctionPass {
bool runOnFunction(Function &F) override { return applyDebugify(F); }
-
- DebugifyFunctionPass() : FunctionPass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-};
-
-/// ModulePass for checking debug info inserted by -debugify, used with the
-/// legacy module pass manager.
-struct CheckDebugifyModulePass : public ModulePass {
- bool runOnModule(Module &M) override {
- return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass,
- "CheckModuleDebugify", Strip, StatsMap);
- }
-
- CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "",
- DebugifyStatsMap *StatsMap = nullptr)
- : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
- StatsMap(StatsMap) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-
-private:
- bool Strip;
- StringRef NameOfWrappedPass;
- DebugifyStatsMap *StatsMap;
-};
-
-/// FunctionPass for checking debug info inserted by -debugify-function, used
-/// with the legacy module pass manager.
-struct CheckDebugifyFunctionPass : public FunctionPass {
- bool runOnFunction(Function &F) override {
- Module &M = *F.getParent();
- auto FuncIt = F.getIterator();
- return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)),
- NameOfWrappedPass, "CheckFunctionDebugify",
- Strip, StatsMap);
- }
-
- CheckDebugifyFunctionPass(bool Strip = false,
- StringRef NameOfWrappedPass = "",
- DebugifyStatsMap *StatsMap = nullptr)
- : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
- StatsMap(StatsMap) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-
-private:
- bool Strip;
- StringRef NameOfWrappedPass;
- DebugifyStatsMap *StatsMap;
-};
-
-} // end anonymous namespace
-
+
+ DebugifyFunctionPass() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+};
+
+/// ModulePass for checking debug info inserted by -debugify, used with the
+/// legacy module pass manager.
+struct CheckDebugifyModulePass : public ModulePass {
+ bool runOnModule(Module &M) override {
+ return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass,
+ "CheckModuleDebugify", Strip, StatsMap);
+ }
+
+ CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr)
+ : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
+ StatsMap(StatsMap) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+
+private:
+ bool Strip;
+ StringRef NameOfWrappedPass;
+ DebugifyStatsMap *StatsMap;
+};
+
+/// FunctionPass for checking debug info inserted by -debugify-function, used
+/// with the legacy module pass manager.
+struct CheckDebugifyFunctionPass : public FunctionPass {
+ bool runOnFunction(Function &F) override {
+ Module &M = *F.getParent();
+ auto FuncIt = F.getIterator();
+ return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)),
+ NameOfWrappedPass, "CheckFunctionDebugify",
+ Strip, StatsMap);
+ }
+
+ CheckDebugifyFunctionPass(bool Strip = false,
+ StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr)
+ : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
+ StatsMap(StatsMap) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+
+private:
+ bool Strip;
+ StringRef NameOfWrappedPass;
+ DebugifyStatsMap *StatsMap;
+};
+
+} // end anonymous namespace
+
void llvm::exportDebugifyStats(StringRef Path, const DebugifyStatsMap &Map) {
std::error_code EC;
raw_fd_ostream OS{Path, EC};
@@ -482,7 +482,7 @@ void llvm::exportDebugifyStats(StringRef Path, const DebugifyStatsMap &Map) {
errs() << "Could not open file: " << EC.message() << ", " << Path << '\n';
return;
}
-
+
OS << "Pass Name" << ',' << "# of missing debug values" << ','
<< "# of missing locations" << ',' << "Missing/Expected value ratio" << ','
<< "Missing/Expected location ratio" << '\n';
@@ -501,34 +501,34 @@ ModulePass *llvm::createDebugifyModulePass() {
}
FunctionPass *llvm::createDebugifyFunctionPass() {
- return new DebugifyFunctionPass();
-}
-
-PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) {
- applyDebugifyMetadata(M, M.functions(),
- "ModuleDebugify: ", /*ApplyToMF*/ nullptr);
- return PreservedAnalyses::all();
-}
-
+ return new DebugifyFunctionPass();
+}
+
+PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) {
+ applyDebugifyMetadata(M, M.functions(),
+ "ModuleDebugify: ", /*ApplyToMF*/ nullptr);
+ return PreservedAnalyses::all();
+}
+
ModulePass *llvm::createCheckDebugifyModulePass(bool Strip,
StringRef NameOfWrappedPass,
DebugifyStatsMap *StatsMap) {
- return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap);
-}
-
+ return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap);
+}
+
FunctionPass *
llvm::createCheckDebugifyFunctionPass(bool Strip, StringRef NameOfWrappedPass,
DebugifyStatsMap *StatsMap) {
- return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap);
-}
-
-PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M,
- ModuleAnalysisManager &) {
- checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false,
- nullptr);
- return PreservedAnalyses::all();
-}
-
+ return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap);
+}
+
+PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false,
+ nullptr);
+ return PreservedAnalyses::all();
+}
+
static bool isIgnoredPass(StringRef PassID) {
return isSpecialPass(PassID, {"PassManager", "PassAdaptor",
"AnalysisManagerProxy", "PrintFunctionPass",
@@ -564,18 +564,18 @@ void DebugifyEachInstrumentation::registerCallbacks(
});
}
-char DebugifyModulePass::ID = 0;
-static RegisterPass<DebugifyModulePass> DM("debugify",
- "Attach debug info to everything");
-
-char CheckDebugifyModulePass::ID = 0;
-static RegisterPass<CheckDebugifyModulePass>
- CDM("check-debugify", "Check debug info from -debugify");
-
-char DebugifyFunctionPass::ID = 0;
-static RegisterPass<DebugifyFunctionPass> DF("debugify-function",
- "Attach debug info to a function");
-
-char CheckDebugifyFunctionPass::ID = 0;
-static RegisterPass<CheckDebugifyFunctionPass>
- CDF("check-debugify-function", "Check debug info from -debugify-function");
+char DebugifyModulePass::ID = 0;
+static RegisterPass<DebugifyModulePass> DM("debugify",
+ "Attach debug info to everything");
+
+char CheckDebugifyModulePass::ID = 0;
+static RegisterPass<CheckDebugifyModulePass>
+ CDM("check-debugify", "Check debug info from -debugify");
+
+char DebugifyFunctionPass::ID = 0;
+static RegisterPass<DebugifyFunctionPass> DF("debugify-function",
+ "Attach debug info to a function");
+
+char CheckDebugifyFunctionPass::ID = 0;
+static RegisterPass<CheckDebugifyFunctionPass>
+ CDF("check-debugify-function", "Check debug info from -debugify-function");
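A minimal harness sketch for the legacy-PM entry points above (createDebugifyModulePass and createCheckDebugifyModulePass, declared in llvm/Transforms/Utils/Debugify.h); runWithDebugify and the "pass-under-test" label are illustrative and not part of this commit.

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Pass.h"
  #include "llvm/Transforms/Utils/Debugify.h"

  // Bracket a pass with -debugify / -check-debugify so that the debug
  // locations and variables it drops get reported; Strip=true removes the
  // synthetic debug info afterwards, and per-pass loss counters accumulate
  // in Stats under the wrapped-pass name.
  static void runWithDebugify(llvm::Module &M, llvm::ModulePass *PassUnderTest,
                              llvm::DebugifyStatsMap &Stats) {
    llvm::legacy::PassManager PM; // takes ownership of the passes added below
    PM.add(llvm::createDebugifyModulePass());
    PM.add(PassUnderTest);
    PM.add(llvm::createCheckDebugifyModulePass(/*Strip=*/true,
                                               /*NameOfWrappedPass=*/"pass-under-test",
                                               &Stats));
    PM.run(M);
  }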
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp
index fb9db4033c..5f53d794fe 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -1,153 +1,153 @@
-//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-/// DemoteRegToStack - This function takes a virtual register computed by an
-/// Instruction and replaces it with a slot in the stack frame, allocated via
-/// alloca. This allows the CFG to be changed around without fear of
-/// invalidating the SSA information for the value. It returns the pointer to
-/// the alloca inserted to create a stack slot for I.
-AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
- Instruction *AllocaPoint) {
- if (I.use_empty()) {
- I.eraseFromParent();
- return nullptr;
- }
-
- Function *F = I.getParent()->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- // Create a stack slot to hold the value.
- AllocaInst *Slot;
- if (AllocaPoint) {
- Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
- I.getName()+".reg2mem", AllocaPoint);
- } else {
- Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
- I.getName() + ".reg2mem", &F->getEntryBlock().front());
- }
-
- // We cannot demote invoke instructions to the stack if their normal edge
- // is critical. Therefore, split the critical edge and create a basic block
- // into which the store can be inserted.
- if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
- if (!II->getNormalDest()->getSinglePredecessor()) {
- unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest());
- assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!");
- BasicBlock *BB = SplitCriticalEdge(II, SuccNum);
- assert(BB && "Unable to split critical edge.");
- (void)BB;
- }
- }
-
- // Change all of the users of the instruction to read from the stack slot.
- while (!I.use_empty()) {
- Instruction *U = cast<Instruction>(I.user_back());
- if (PHINode *PN = dyn_cast<PHINode>(U)) {
- // If this is a PHI node, we can't insert a load of the value before the
- // use. Instead insert the load in the predecessor block corresponding
- // to the incoming value.
- //
- // Note that if there are multiple edges from a basic block to this PHI
- // node that we cannot have multiple loads. The problem is that the
- // resulting PHI node will have multiple values (from each load) coming in
- // from the same block, which is illegal SSA form. For this reason, we
- // keep track of and reuse loads we insert.
- DenseMap<BasicBlock*, Value*> Loads;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == &I) {
- Value *&V = Loads[PN->getIncomingBlock(i)];
- if (!V) {
- // Insert the load into the predecessor block
- V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
- VolatileLoads,
- PN->getIncomingBlock(i)->getTerminator());
- }
- PN->setIncomingValue(i, V);
- }
-
- } else {
- // If this is a normal instruction, just insert a load.
- Value *V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
- VolatileLoads, U);
- U->replaceUsesOfWith(&I, V);
- }
- }
-
- // Insert stores of the computed value into the stack slot. We have to be
- // careful if I is an invoke instruction, because we can't insert the store
- // AFTER the terminator instruction.
- BasicBlock::iterator InsertPt;
- if (!I.isTerminator()) {
- InsertPt = ++I.getIterator();
- for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
- /* empty */; // Don't insert before PHI nodes or landingpad instrs.
- } else {
- InvokeInst &II = cast<InvokeInst>(I);
- InsertPt = II.getNormalDest()->getFirstInsertionPt();
- }
-
- new StoreInst(&I, Slot, &*InsertPt);
- return Slot;
-}
-
-/// DemotePHIToStack - This function takes a virtual register computed by a PHI
-/// node and replaces it with a slot in the stack frame allocated via alloca.
-/// The PHI node is deleted. It returns the pointer to the alloca inserted.
-AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
- if (P->use_empty()) {
- P->eraseFromParent();
- return nullptr;
- }
-
- const DataLayout &DL = P->getModule()->getDataLayout();
-
- // Create a stack slot to hold the value.
- AllocaInst *Slot;
- if (AllocaPoint) {
- Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
- P->getName()+".reg2mem", AllocaPoint);
- } else {
- Function *F = P->getParent()->getParent();
- Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
- P->getName() + ".reg2mem",
- &F->getEntryBlock().front());
- }
-
- // Iterate over each operand inserting a store in each predecessor.
- for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
- assert(II->getParent() != P->getIncomingBlock(i) &&
- "Invoke edge not supported yet"); (void)II;
- }
- new StoreInst(P->getIncomingValue(i), Slot,
- P->getIncomingBlock(i)->getTerminator());
- }
-
- // Insert a load in place of the PHI and replace all uses.
- BasicBlock::iterator InsertPt = P->getIterator();
-
- for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
- /* empty */; // Don't insert before PHI nodes or landingpad instrs.
-
- Value *V =
- new LoadInst(P->getType(), Slot, P->getName() + ".reload", &*InsertPt);
- P->replaceAllUsesWith(V);
-
- // Delete PHI.
- P->eraseFromParent();
- return Slot;
-}
+//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+/// DemoteRegToStack - This function takes a virtual register computed by an
+/// Instruction and replaces it with a slot in the stack frame, allocated via
+/// alloca. This allows the CFG to be changed around without fear of
+/// invalidating the SSA information for the value. It returns the pointer to
+/// the alloca inserted to create a stack slot for I.
+AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
+ Instruction *AllocaPoint) {
+ if (I.use_empty()) {
+ I.eraseFromParent();
+ return nullptr;
+ }
+
+ Function *F = I.getParent()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName()+".reg2mem", AllocaPoint);
+ } else {
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName() + ".reg2mem", &F->getEntryBlock().front());
+ }
+
+ // We cannot demote invoke instructions to the stack if their normal edge
+ // is critical. Therefore, split the critical edge and create a basic block
+ // into which the store can be inserted.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ if (!II->getNormalDest()->getSinglePredecessor()) {
+ unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest());
+ assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!");
+ BasicBlock *BB = SplitCriticalEdge(II, SuccNum);
+ assert(BB && "Unable to split critical edge.");
+ (void)BB;
+ }
+ }
+
+ // Change all of the users of the instruction to read from the stack slot.
+ while (!I.use_empty()) {
+ Instruction *U = cast<Instruction>(I.user_back());
+ if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // If this is a PHI node, we can't insert a load of the value before the
+ // use. Instead insert the load in the predecessor block corresponding
+ // to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this PHI
+ // node that we cannot have multiple loads. The problem is that the
+ // resulting PHI node will have multiple values (from each load) coming in
+ // from the same block, which is illegal SSA form. For this reason, we
+ // keep track of and reuse loads we insert.
+ DenseMap<BasicBlock*, Value*> Loads;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &I) {
+ Value *&V = Loads[PN->getIncomingBlock(i)];
+ if (!V) {
+ // Insert the load into the predecessor block
+ V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
+ VolatileLoads,
+ PN->getIncomingBlock(i)->getTerminator());
+ }
+ PN->setIncomingValue(i, V);
+ }
+
+ } else {
+ // If this is a normal instruction, just insert a load.
+ Value *V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
+ VolatileLoads, U);
+ U->replaceUsesOfWith(&I, V);
+ }
+ }
+
+ // Insert stores of the computed value into the stack slot. We have to be
+ // careful if I is an invoke instruction, because we can't insert the store
+ // AFTER the terminator instruction.
+ BasicBlock::iterator InsertPt;
+ if (!I.isTerminator()) {
+ InsertPt = ++I.getIterator();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ /* empty */; // Don't insert before PHI nodes or landingpad instrs.
+ } else {
+ InvokeInst &II = cast<InvokeInst>(I);
+ InsertPt = II.getNormalDest()->getFirstInsertionPt();
+ }
+
+ new StoreInst(&I, Slot, &*InsertPt);
+ return Slot;
+}
+
+/// DemotePHIToStack - This function takes a virtual register computed by a PHI
+/// node and replaces it with a slot in the stack frame allocated via alloca.
+/// The PHI node is deleted. It returns the pointer to the alloca inserted.
+AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+ if (P->use_empty()) {
+ P->eraseFromParent();
+ return nullptr;
+ }
+
+ const DataLayout &DL = P->getModule()->getDataLayout();
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName()+".reg2mem", AllocaPoint);
+ } else {
+ Function *F = P->getParent()->getParent();
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName() + ".reg2mem",
+ &F->getEntryBlock().front());
+ }
+
+ // Iterate over each operand inserting a store in each predecessor.
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
+ assert(II->getParent() != P->getIncomingBlock(i) &&
+ "Invoke edge not supported yet"); (void)II;
+ }
+ new StoreInst(P->getIncomingValue(i), Slot,
+ P->getIncomingBlock(i)->getTerminator());
+ }
+
+ // Insert a load in place of the PHI and replace all uses.
+ BasicBlock::iterator InsertPt = P->getIterator();
+
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ /* empty */; // Don't insert before PHI nodes or landingpad instrs.
+
+ Value *V =
+ new LoadInst(P->getType(), Slot, P->getName() + ".reload", &*InsertPt);
+ P->replaceAllUsesWith(V);
+
+ // Delete PHI.
+ P->eraseFromParent();
+ return Slot;
+}
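A minimal usage sketch for the two demotion helpers above (declared in llvm/Transforms/Utils/Local.h, which this file includes); demoteAllPHIs is an illustrative reg2mem-style helper, not part of this commit.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Transforms/Utils/Local.h"

  // Rewrite every PHI node in F through a stack slot. The PHIs are collected
  // first because DemotePHIToStack erases the node it is handed.
  static void demoteAllPHIs(llvm::Function &F) {
    llvm::SmallVector<llvm::PHINode *, 16> Phis;
    for (llvm::BasicBlock &BB : F)
      for (llvm::Instruction &I : BB)
        if (auto *PN = llvm::dyn_cast<llvm::PHINode>(&I))
          Phis.push_back(PN);
    for (llvm::PHINode *PN : Phis)
      llvm::DemotePHIToStack(PN, /*AllocaPoint=*/nullptr);
  }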
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 8171eb6b2c..26f8e21952 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -1,168 +1,168 @@
-//===- EntryExitInstrumenter.cpp - Function Entry/Exit Instrumentation ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
-static void insertCall(Function &CurFn, StringRef Func,
- Instruction *InsertionPt, DebugLoc DL) {
- Module &M = *InsertionPt->getParent()->getParent()->getParent();
- LLVMContext &C = InsertionPt->getParent()->getContext();
-
- if (Func == "mcount" ||
- Func == ".mcount" ||
- Func == "llvm.arm.gnu.eabi.mcount" ||
- Func == "\01_mcount" ||
- Func == "\01mcount" ||
- Func == "__mcount" ||
- Func == "_mcount" ||
- Func == "__cyg_profile_func_enter_bare") {
- FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C));
- CallInst *Call = CallInst::Create(Fn, "", InsertionPt);
- Call->setDebugLoc(DL);
- return;
- }
-
- if (Func == "__cyg_profile_func_enter" || Func == "__cyg_profile_func_exit") {
- Type *ArgTypes[] = {Type::getInt8PtrTy(C), Type::getInt8PtrTy(C)};
-
- FunctionCallee Fn = M.getOrInsertFunction(
- Func, FunctionType::get(Type::getVoidTy(C), ArgTypes, false));
-
- Instruction *RetAddr = CallInst::Create(
- Intrinsic::getDeclaration(&M, Intrinsic::returnaddress),
- ArrayRef<Value *>(ConstantInt::get(Type::getInt32Ty(C), 0)), "",
- InsertionPt);
- RetAddr->setDebugLoc(DL);
-
- Value *Args[] = {ConstantExpr::getBitCast(&CurFn, Type::getInt8PtrTy(C)),
- RetAddr};
-
- CallInst *Call =
- CallInst::Create(Fn, ArrayRef<Value *>(Args), "", InsertionPt);
- Call->setDebugLoc(DL);
- return;
- }
-
- // We only know how to call a fixed set of instrumentation functions, because
- // they all expect different arguments, etc.
- report_fatal_error(Twine("Unknown instrumentation function: '") + Func + "'");
-}
-
-static bool runOnFunction(Function &F, bool PostInlining) {
- StringRef EntryAttr = PostInlining ? "instrument-function-entry-inlined"
- : "instrument-function-entry";
-
- StringRef ExitAttr = PostInlining ? "instrument-function-exit-inlined"
- : "instrument-function-exit";
-
- StringRef EntryFunc = F.getFnAttribute(EntryAttr).getValueAsString();
- StringRef ExitFunc = F.getFnAttribute(ExitAttr).getValueAsString();
-
- bool Changed = false;
-
- // If the attribute is specified, insert instrumentation and then "consume"
- // the attribute so that it's not inserted again if the pass should happen to
- // run later for some reason.
-
- if (!EntryFunc.empty()) {
- DebugLoc DL;
- if (auto SP = F.getSubprogram())
+//===- EntryExitInstrumenter.cpp - Function Entry/Exit Instrumentation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
+static void insertCall(Function &CurFn, StringRef Func,
+ Instruction *InsertionPt, DebugLoc DL) {
+ Module &M = *InsertionPt->getParent()->getParent()->getParent();
+ LLVMContext &C = InsertionPt->getParent()->getContext();
+
+ if (Func == "mcount" ||
+ Func == ".mcount" ||
+ Func == "llvm.arm.gnu.eabi.mcount" ||
+ Func == "\01_mcount" ||
+ Func == "\01mcount" ||
+ Func == "__mcount" ||
+ Func == "_mcount" ||
+ Func == "__cyg_profile_func_enter_bare") {
+ FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C));
+ CallInst *Call = CallInst::Create(Fn, "", InsertionPt);
+ Call->setDebugLoc(DL);
+ return;
+ }
+
+ if (Func == "__cyg_profile_func_enter" || Func == "__cyg_profile_func_exit") {
+ Type *ArgTypes[] = {Type::getInt8PtrTy(C), Type::getInt8PtrTy(C)};
+
+ FunctionCallee Fn = M.getOrInsertFunction(
+ Func, FunctionType::get(Type::getVoidTy(C), ArgTypes, false));
+
+ Instruction *RetAddr = CallInst::Create(
+ Intrinsic::getDeclaration(&M, Intrinsic::returnaddress),
+ ArrayRef<Value *>(ConstantInt::get(Type::getInt32Ty(C), 0)), "",
+ InsertionPt);
+ RetAddr->setDebugLoc(DL);
+
+ Value *Args[] = {ConstantExpr::getBitCast(&CurFn, Type::getInt8PtrTy(C)),
+ RetAddr};
+
+ CallInst *Call =
+ CallInst::Create(Fn, ArrayRef<Value *>(Args), "", InsertionPt);
+ Call->setDebugLoc(DL);
+ return;
+ }
+
+ // We only know how to call a fixed set of instrumentation functions, because
+ // they all expect different arguments, etc.
+ report_fatal_error(Twine("Unknown instrumentation function: '") + Func + "'");
+}
+
+static bool runOnFunction(Function &F, bool PostInlining) {
+ StringRef EntryAttr = PostInlining ? "instrument-function-entry-inlined"
+ : "instrument-function-entry";
+
+ StringRef ExitAttr = PostInlining ? "instrument-function-exit-inlined"
+ : "instrument-function-exit";
+
+ StringRef EntryFunc = F.getFnAttribute(EntryAttr).getValueAsString();
+ StringRef ExitFunc = F.getFnAttribute(ExitAttr).getValueAsString();
+
+ bool Changed = false;
+
+ // If the attribute is specified, insert instrumentation and then "consume"
+ // the attribute so that it's not inserted again if the pass should happen to
+ // run later for some reason.
+
+ if (!EntryFunc.empty()) {
+ DebugLoc DL;
+ if (auto SP = F.getSubprogram())
DL = DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
-
- insertCall(F, EntryFunc, &*F.begin()->getFirstInsertionPt(), DL);
- Changed = true;
- F.removeAttribute(AttributeList::FunctionIndex, EntryAttr);
- }
-
- if (!ExitFunc.empty()) {
- for (BasicBlock &BB : F) {
- Instruction *T = BB.getTerminator();
- if (!isa<ReturnInst>(T))
- continue;
-
- // If T is preceded by a musttail call, that's the real terminator.
+
+ insertCall(F, EntryFunc, &*F.begin()->getFirstInsertionPt(), DL);
+ Changed = true;
+ F.removeAttribute(AttributeList::FunctionIndex, EntryAttr);
+ }
+
+ if (!ExitFunc.empty()) {
+ for (BasicBlock &BB : F) {
+ Instruction *T = BB.getTerminator();
+ if (!isa<ReturnInst>(T))
+ continue;
+
+ // If T is preceded by a musttail call, that's the real terminator.
if (CallInst *CI = BB.getTerminatingMustTailCall())
T = CI;
-
- DebugLoc DL;
- if (DebugLoc TerminatorDL = T->getDebugLoc())
- DL = TerminatorDL;
- else if (auto SP = F.getSubprogram())
+
+ DebugLoc DL;
+ if (DebugLoc TerminatorDL = T->getDebugLoc())
+ DL = TerminatorDL;
+ else if (auto SP = F.getSubprogram())
DL = DILocation::get(SP->getContext(), 0, 0, SP);
-
- insertCall(F, ExitFunc, T, DL);
- Changed = true;
- }
- F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);
- }
-
- return Changed;
-}
-
-namespace {
-struct EntryExitInstrumenter : public FunctionPass {
- static char ID;
- EntryExitInstrumenter() : FunctionPass(ID) {
- initializeEntryExitInstrumenterPass(*PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- bool runOnFunction(Function &F) override { return ::runOnFunction(F, false); }
-};
-char EntryExitInstrumenter::ID = 0;
-
-struct PostInlineEntryExitInstrumenter : public FunctionPass {
- static char ID;
- PostInlineEntryExitInstrumenter() : FunctionPass(ID) {
- initializePostInlineEntryExitInstrumenterPass(
- *PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- bool runOnFunction(Function &F) override { return ::runOnFunction(F, true); }
-};
-char PostInlineEntryExitInstrumenter::ID = 0;
-}
-
-INITIALIZE_PASS(
- EntryExitInstrumenter, "ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() (pre inlining)",
- false, false)
-INITIALIZE_PASS(PostInlineEntryExitInstrumenter, "post-inline-ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() "
- "(post inlining)",
- false, false)
-
-FunctionPass *llvm::createEntryExitInstrumenterPass() {
- return new EntryExitInstrumenter();
-}
-
-FunctionPass *llvm::createPostInlineEntryExitInstrumenterPass() {
- return new PostInlineEntryExitInstrumenter();
-}
-
-PreservedAnalyses
-llvm::EntryExitInstrumenterPass::run(Function &F, FunctionAnalysisManager &AM) {
- runOnFunction(F, PostInlining);
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+
+ insertCall(F, ExitFunc, T, DL);
+ Changed = true;
+ }
+ F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);
+ }
+
+ return Changed;
+}
+
+namespace {
+struct EntryExitInstrumenter : public FunctionPass {
+ static char ID;
+ EntryExitInstrumenter() : FunctionPass(ID) {
+ initializeEntryExitInstrumenterPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ bool runOnFunction(Function &F) override { return ::runOnFunction(F, false); }
+};
+char EntryExitInstrumenter::ID = 0;
+
+struct PostInlineEntryExitInstrumenter : public FunctionPass {
+ static char ID;
+ PostInlineEntryExitInstrumenter() : FunctionPass(ID) {
+ initializePostInlineEntryExitInstrumenterPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ bool runOnFunction(Function &F) override { return ::runOnFunction(F, true); }
+};
+char PostInlineEntryExitInstrumenter::ID = 0;
+}
+
+INITIALIZE_PASS(
+ EntryExitInstrumenter, "ee-instrument",
+ "Instrument function entry/exit with calls to e.g. mcount() (pre inlining)",
+ false, false)
+INITIALIZE_PASS(PostInlineEntryExitInstrumenter, "post-inline-ee-instrument",
+ "Instrument function entry/exit with calls to e.g. mcount() "
+ "(post inlining)",
+ false, false)
+
+FunctionPass *llvm::createEntryExitInstrumenterPass() {
+ return new EntryExitInstrumenter();
+}
+
+FunctionPass *llvm::createPostInlineEntryExitInstrumenterPass() {
+ return new PostInlineEntryExitInstrumenter();
+}
+
+PreservedAnalyses
+llvm::EntryExitInstrumenterPass::run(Function &F, FunctionAnalysisManager &AM) {
+ runOnFunction(F, PostInlining);
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
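
A note on how the pass above is triggered: insertCall only runs for functions carrying the string attributes that runOnFunction reads and then strips. Below is a minimal sketch, not part of the patch, of how a caller might tag a function so the pre-inlining "ee-instrument" pass emits an mcount() call at entry; the helper name requestMcountInstrumentation is hypothetical.

#include "llvm/IR/Function.h"

// Hypothetical helper: mark F so that EntryExitInstrumenter later inserts a
// call to mcount() at the first insertion point of the entry block. The
// attribute value must be one of the hooks insertCall recognizes.
static void requestMcountInstrumentation(llvm::Function &F) {
  F.addFnAttr("instrument-function-entry", "mcount");
}
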
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp
index d57669834a..accedd5b4e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -1,98 +1,98 @@
-//===- EscapeEnumerator.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Defines a helper class that enumerates all possible exits from a function,
-// including exception handling.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/EscapeEnumerator.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-static FunctionCallee getDefaultPersonalityFn(Module *M) {
- LLVMContext &C = M->getContext();
- Triple T(M->getTargetTriple());
- EHPersonality Pers = getDefaultEHPersonality(T);
- return M->getOrInsertFunction(getEHPersonalityName(Pers),
- FunctionType::get(Type::getInt32Ty(C), true));
-}
-
-IRBuilder<> *EscapeEnumerator::Next() {
- if (Done)
- return nullptr;
-
- // Find all 'return', 'resume', and 'unwind' instructions.
- while (StateBB != StateE) {
- BasicBlock *CurBB = &*StateBB++;
-
- // Branches and invokes do not escape, only unwind, resume, and return
- // do.
- Instruction *TI = CurBB->getTerminator();
- if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
- continue;
-
+//===- EscapeEnumerator.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines a helper class that enumerates all possible exits from a function,
+// including exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+static FunctionCallee getDefaultPersonalityFn(Module *M) {
+ LLVMContext &C = M->getContext();
+ Triple T(M->getTargetTriple());
+ EHPersonality Pers = getDefaultEHPersonality(T);
+ return M->getOrInsertFunction(getEHPersonalityName(Pers),
+ FunctionType::get(Type::getInt32Ty(C), true));
+}
+
+IRBuilder<> *EscapeEnumerator::Next() {
+ if (Done)
+ return nullptr;
+
+ // Find all 'return', 'resume', and 'unwind' instructions.
+ while (StateBB != StateE) {
+ BasicBlock *CurBB = &*StateBB++;
+
+ // Branches and invokes do not escape, only unwind, resume, and return
+ // do.
+ Instruction *TI = CurBB->getTerminator();
+ if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+ continue;
+
if (CallInst *CI = CurBB->getTerminatingMustTailCall())
TI = CI;
- Builder.SetInsertPoint(TI);
- return &Builder;
- }
-
- Done = true;
-
- if (!HandleExceptions)
- return nullptr;
-
- if (F.doesNotThrow())
- return nullptr;
-
- // Find all 'call' instructions that may throw.
+ Builder.SetInsertPoint(TI);
+ return &Builder;
+ }
+
+ Done = true;
+
+ if (!HandleExceptions)
+ return nullptr;
+
+ if (F.doesNotThrow())
+ return nullptr;
+
+ // Find all 'call' instructions that may throw.
   // We cannot transform calls with musttail tag.
- SmallVector<Instruction *, 16> Calls;
- for (BasicBlock &BB : F)
- for (Instruction &II : BB)
- if (CallInst *CI = dyn_cast<CallInst>(&II))
+ SmallVector<Instruction *, 16> Calls;
+ for (BasicBlock &BB : F)
+ for (Instruction &II : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&II))
if (!CI->doesNotThrow() && !CI->isMustTailCall())
- Calls.push_back(CI);
-
- if (Calls.empty())
- return nullptr;
-
- // Create a cleanup block.
- LLVMContext &C = F.getContext();
- BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
- Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
- if (!F.hasPersonalityFn()) {
- FunctionCallee PersFn = getDefaultPersonalityFn(F.getParent());
- F.setPersonalityFn(cast<Constant>(PersFn.getCallee()));
- }
-
- if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
- report_fatal_error("Scoped EH not supported");
- }
-
- LandingPadInst *LPad =
- LandingPadInst::Create(ExnTy, 1, "cleanup.lpad", CleanupBB);
- LPad->setCleanup(true);
- ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
-
- // Transform the 'call' instructions into 'invoke's branching to the
- // cleanup block. Go in reverse order to make prettier BB names.
- SmallVector<Value *, 16> Args;
- for (unsigned I = Calls.size(); I != 0;) {
- CallInst *CI = cast<CallInst>(Calls[--I]);
- changeToInvokeAndSplitBasicBlock(CI, CleanupBB);
- }
-
- Builder.SetInsertPoint(RI);
- return &Builder;
-}
+ Calls.push_back(CI);
+
+ if (Calls.empty())
+ return nullptr;
+
+ // Create a cleanup block.
+ LLVMContext &C = F.getContext();
+ BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
+ if (!F.hasPersonalityFn()) {
+ FunctionCallee PersFn = getDefaultPersonalityFn(F.getParent());
+ F.setPersonalityFn(cast<Constant>(PersFn.getCallee()));
+ }
+
+ if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
+ report_fatal_error("Scoped EH not supported");
+ }
+
+ LandingPadInst *LPad =
+ LandingPadInst::Create(ExnTy, 1, "cleanup.lpad", CleanupBB);
+ LPad->setCleanup(true);
+ ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+ // Transform the 'call' instructions into 'invoke's branching to the
+ // cleanup block. Go in reverse order to make prettier BB names.
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = Calls.size(); I != 0;) {
+ CallInst *CI = cast<CallInst>(Calls[--I]);
+ changeToInvokeAndSplitBasicBlock(CI, CleanupBB);
+ }
+
+ Builder.SetInsertPoint(RI);
+ return &Builder;
+}
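
For orientation, EscapeEnumerator::Next() above is normally driven in a loop until it returns null; each iteration hands back the member IRBuilder positioned at one function exit. A minimal usage sketch follows, not part of the patch; the wrapper visitEveryExit and the cleanup-block name are hypothetical.

#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/EscapeEnumerator.h"

// Hypothetical driver: visit every return/resume, plus the synthesized
// cleanup landing pad created for calls that may throw.
static void visitEveryExit(llvm::Function &F) {
  llvm::EscapeEnumerator EE(F, "my.cleanup", /*HandleExceptions=*/true);
  while (llvm::IRBuilder<> *B = EE.Next()) {
    // B is already positioned before the exit instruction; emit per-exit
    // instrumentation here, e.g. via B->CreateCall(...).
    (void)B;
  }
}
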
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp
index 4a7167069b..732b00635e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp
@@ -1,728 +1,728 @@
-//===- Evaluator.cpp - LLVM IR evaluator ----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Function evaluator for LLVM IR.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Evaluator.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <iterator>
-
-#define DEBUG_TYPE "evaluator"
-
-using namespace llvm;
-
-static inline bool
-isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL);
-
-/// Return true if the specified constant can be handled by the code generator.
-/// We don't want to generate something like:
-/// void *X = &X/42;
-/// because the code generator doesn't have a relocation that can handle that.
-///
-/// This function should be called if C was not found (but just got inserted)
-/// in SimpleConstants to avoid having to rescan the same constants all the
-/// time.
-static bool
-isSimpleEnoughValueToCommitHelper(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL) {
- // Simple global addresses are supported, do not allow dllimport or
- // thread-local globals.
- if (auto *GV = dyn_cast<GlobalValue>(C))
- return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
-
- // Simple integer, undef, constant aggregate zero, etc are all supported.
- if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
- return true;
-
- // Aggregate values are safe if all their elements are.
- if (isa<ConstantAggregate>(C)) {
- for (Value *Op : C->operands())
- if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL))
- return false;
- return true;
- }
-
- // We don't know exactly what relocations are allowed in constant expressions,
- // so we allow &global+constantoffset, which is safe and uniformly supported
- // across targets.
- ConstantExpr *CE = cast<ConstantExpr>(C);
- switch (CE->getOpcode()) {
- case Instruction::BitCast:
- // Bitcast is fine if the casted value is fine.
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- case Instruction::IntToPtr:
- case Instruction::PtrToInt:
- // int <=> ptr is fine if the int type is the same size as the
- // pointer type.
- if (DL.getTypeSizeInBits(CE->getType()) !=
- DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- // GEP is fine if it is simple + constant offset.
- case Instruction::GetElementPtr:
- for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
- if (!isa<ConstantInt>(CE->getOperand(i)))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- case Instruction::Add:
- // We allow simple+cst.
- if (!isa<ConstantInt>(CE->getOperand(1)))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
- }
- return false;
-}
-
-static inline bool
-isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL) {
- // If we already checked this constant, we win.
- if (!SimpleConstants.insert(C).second)
- return true;
- // Check the constant.
- return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
-}
-
-/// Return true if this constant is simple enough for us to understand. In
-/// particular, if it is a cast to anything other than from one pointer type to
-/// another pointer type, we punt. We basically just support direct accesses to
-/// globals and GEP's of globals. This should be kept up to date with
-/// CommitValueTo.
-static bool isSimpleEnoughPointerToCommit(Constant *C) {
- // Conservatively, avoid aggregate types. This is because we don't
- // want to worry about them partially overlapping other stores.
- if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
- return false;
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
- // Do not allow weak/*_odr/linkonce linkage or external globals.
- return GV->hasUniqueInitializer();
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- // Handle a constantexpr gep.
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0)) &&
- cast<GEPOperator>(CE)->isInBounds()) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- if (!GV->hasUniqueInitializer())
- return false;
-
- // The first index must be zero.
- ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
- if (!CI || !CI->isZero()) return false;
-
- // The remaining indices must be compile-time known integers within the
- // notional bounds of the corresponding static array types.
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
-
- return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
-
- // A constantexpr bitcast from a pointer to another pointer is a no-op,
- // and we know how to evaluate it by moving the bitcast from the pointer
- // operand to the value operand.
- } else if (CE->getOpcode() == Instruction::BitCast &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
- }
- }
-
- return false;
-}
-
-/// Apply 'Func' to Ptr. If this returns nullptr, introspect the pointer's
-/// type and walk down through the initial elements to obtain additional
-/// pointers to try. Returns the first non-null return value from Func, or
-/// nullptr if the type can't be introspected further.
-static Constant *
-evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- std::function<Constant *(Constant *)> Func) {
- Constant *Val;
- while (!(Val = Func(Ptr))) {
+//===- Evaluator.cpp - LLVM IR evaluator ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Function evaluator for LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <iterator>
+
+#define DEBUG_TYPE "evaluator"
+
+using namespace llvm;
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL);
+
+/// Return true if the specified constant can be handled by the code generator.
+/// We don't want to generate something like:
+/// void *X = &X/42;
+/// because the code generator doesn't have a relocation that can handle that.
+///
+/// This function should be called if C was not found (but just got inserted)
+/// in SimpleConstants to avoid having to rescan the same constants all the
+/// time.
+static bool
+isSimpleEnoughValueToCommitHelper(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // Simple global addresses are supported, do not allow dllimport or
+ // thread-local globals.
+ if (auto *GV = dyn_cast<GlobalValue>(C))
+ return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
+
+ // Simple integer, undef, constant aggregate zero, etc are all supported.
+ if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
+ return true;
+
+ // Aggregate values are safe if all their elements are.
+ if (isa<ConstantAggregate>(C)) {
+ for (Value *Op : C->operands())
+ if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL))
+ return false;
+ return true;
+ }
+
+ // We don't know exactly what relocations are allowed in constant expressions,
+ // so we allow &global+constantoffset, which is safe and uniformly supported
+ // across targets.
+ ConstantExpr *CE = cast<ConstantExpr>(C);
+ switch (CE->getOpcode()) {
+ case Instruction::BitCast:
+ // Bitcast is fine if the casted value is fine.
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ // int <=> ptr is fine if the int type is the same size as the
+ // pointer type.
+ if (DL.getTypeSizeInBits(CE->getType()) !=
+ DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ // GEP is fine if it is simple + constant offset.
+ case Instruction::GetElementPtr:
+ for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+ if (!isa<ConstantInt>(CE->getOperand(i)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::Add:
+ // We allow simple+cst.
+ if (!isa<ConstantInt>(CE->getOperand(1)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+ }
+ return false;
+}
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // If we already checked this constant, we win.
+ if (!SimpleConstants.insert(C).second)
+ return true;
+ // Check the constant.
+ return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
+}
+
+/// Return true if this constant is simple enough for us to understand. In
+/// particular, if it is a cast to anything other than from one pointer type to
+/// another pointer type, we punt. We basically just support direct accesses to
+/// globals and GEP's of globals. This should be kept up to date with
+/// CommitValueTo.
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+ // Conservatively, avoid aggregate types. This is because we don't
+ // want to worry about them partially overlapping other stores.
+ if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
+ return false;
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ // Do not allow weak/*_odr/linkonce linkage or external globals.
+ return GV->hasUniqueInitializer();
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ // Handle a constantexpr gep.
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0)) &&
+ cast<GEPOperator>(CE)->isInBounds()) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ if (!GV->hasUniqueInitializer())
+ return false;
+
+ // The first index must be zero.
+ ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
+ if (!CI || !CI->isZero()) return false;
+
+ // The remaining indices must be compile-time known integers within the
+ // notional bounds of the corresponding static array types.
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+
+ // A constantexpr bitcast from a pointer to another pointer is a no-op,
+ // and we know how to evaluate it by moving the bitcast from the pointer
+ // operand to the value operand.
+ } else if (CE->getOpcode() == Instruction::BitCast &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
+ }
+ }
+
+ return false;
+}
+
+/// Apply 'Func' to Ptr. If this returns nullptr, introspect the pointer's
+/// type and walk down through the initial elements to obtain additional
+/// pointers to try. Returns the first non-null return value from Func, or
+/// nullptr if the type can't be introspected further.
+static Constant *
+evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ std::function<Constant *(Constant *)> Func) {
+ Constant *Val;
+ while (!(Val = Func(Ptr))) {
// If Ty is a non-opaque struct, we can convert the pointer to the struct
- // into a pointer to its first member.
- // FIXME: This could be extended to support arrays as well.
- Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
+ // into a pointer to its first member.
+ // FIXME: This could be extended to support arrays as well.
+ Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isOpaque())
- break;
-
- IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32);
- Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
- Constant *const IdxList[] = {IdxZero, IdxZero};
-
- Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList);
- Ptr = ConstantFoldConstant(Ptr, DL, TLI);
- }
- return Val;
-}
-
-static Constant *getInitializer(Constant *C) {
- auto *GV = dyn_cast<GlobalVariable>(C);
- return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
-}
-
-/// Return the value that would be computed by a load from P after the stores
-/// reflected by 'memory' have been performed. If we can't decide, return null.
-Constant *Evaluator::ComputeLoadResult(Constant *P) {
- // If this memory location has been recently stored, use the stored value: it
- // is the most up-to-date.
+ break;
+
+ IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32);
+ Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
+ Constant *const IdxList[] = {IdxZero, IdxZero};
+
+ Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList);
+ Ptr = ConstantFoldConstant(Ptr, DL, TLI);
+ }
+ return Val;
+}
+
+static Constant *getInitializer(Constant *C) {
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
+}
+
+/// Return the value that would be computed by a load from P after the stores
+/// reflected by 'memory' have been performed. If we can't decide, return null.
+Constant *Evaluator::ComputeLoadResult(Constant *P) {
+ // If this memory location has been recently stored, use the stored value: it
+ // is the most up-to-date.
auto findMemLoc = [this](Constant *Ptr) { return MutatedMemory.lookup(Ptr); };
-
- if (Constant *Val = findMemLoc(P))
- return Val;
-
- // Access it.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
- if (GV->hasDefinitiveInitializer())
- return GV->getInitializer();
- return nullptr;
- }
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
- switch (CE->getOpcode()) {
- // Handle a constantexpr getelementptr.
- case Instruction::GetElementPtr:
- if (auto *I = getInitializer(CE->getOperand(0)))
- return ConstantFoldLoadThroughGEPConstantExpr(I, CE);
- break;
- // Handle a constantexpr bitcast.
- case Instruction::BitCast:
- // We're evaluating a load through a pointer that was bitcast to a
- // different type. See if the "from" pointer has recently been stored.
- // If it hasn't, we may still be able to find a stored pointer by
- // introspecting the type.
- Constant *Val =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, findMemLoc);
- if (!Val)
- Val = getInitializer(CE->getOperand(0));
- if (Val)
- return ConstantFoldLoadThroughBitcast(
- Val, P->getType()->getPointerElementType(), DL);
- break;
- }
- }
-
- return nullptr; // don't know how to evaluate.
-}
-
-static Function *getFunction(Constant *C) {
- if (auto *Fn = dyn_cast<Function>(C))
- return Fn;
-
- if (auto *Alias = dyn_cast<GlobalAlias>(C))
- if (auto *Fn = dyn_cast<Function>(Alias->getAliasee()))
- return Fn;
- return nullptr;
-}
-
-Function *
-Evaluator::getCalleeWithFormalArgs(CallBase &CB,
- SmallVectorImpl<Constant *> &Formals) {
- auto *V = CB.getCalledOperand();
- if (auto *Fn = getFunction(getVal(V)))
- return getFormalParams(CB, Fn, Formals) ? Fn : nullptr;
-
- auto *CE = dyn_cast<ConstantExpr>(V);
- if (!CE || CE->getOpcode() != Instruction::BitCast ||
- !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals))
- return nullptr;
-
- return dyn_cast<Function>(
- ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
-}
-
-bool Evaluator::getFormalParams(CallBase &CB, Function *F,
- SmallVectorImpl<Constant *> &Formals) {
- if (!F)
- return false;
-
- auto *FTy = F->getFunctionType();
- if (FTy->getNumParams() > CB.getNumArgOperands()) {
- LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
- return false;
- }
-
- auto ArgI = CB.arg_begin();
- for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE;
- ++ParI) {
- auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL);
- if (!ArgC) {
- LLVM_DEBUG(dbgs() << "Can not convert function argument.\n");
- return false;
- }
- Formals.push_back(ArgC);
- ++ArgI;
- }
- return true;
-}
-
-/// If call expression contains bitcast then we may need to cast
-/// evaluated return value to a type of the call expression.
-Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
- ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
- if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
- return RV;
-
- if (auto *FT =
- dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
- RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
- if (!RV)
- LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
- }
- return RV;
-}
-
-/// Evaluate all instructions in block BB, returning true if successful, false
-/// if we can't evaluate it. NextBB returns the next BB that control flows into,
-/// or null upon return.
-bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
- BasicBlock *&NextBB) {
- // This is the main evaluation loop.
- while (true) {
- Constant *InstResult = nullptr;
-
- LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
-
- if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
- if (!SI->isSimple()) {
- LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
- return false; // no volatile/atomic accesses.
- }
- Constant *Ptr = getVal(SI->getOperand(1));
- Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
- if (Ptr != FoldedPtr) {
- LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
- Ptr = FoldedPtr;
- LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
- }
- if (!isSimpleEnoughPointerToCommit(Ptr)) {
- // If this is too complex for us to commit, reject it.
- LLVM_DEBUG(
- dbgs() << "Pointer is too complex for us to evaluate store.");
- return false;
- }
-
- Constant *Val = getVal(SI->getOperand(0));
-
- // If this might be too difficult for the backend to handle (e.g. the addr
- // of one global variable divided by another) then we can't commit it.
- if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
- LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
- << *Val << "\n");
- return false;
- }
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- if (CE->getOpcode() == Instruction::BitCast) {
- LLVM_DEBUG(dbgs()
- << "Attempting to resolve bitcast on constant ptr.\n");
- // If we're evaluating a store through a bitcast, then we need
- // to pull the bitcast off the pointer type and push it onto the
- // stored value. In order to push the bitcast onto the stored value,
- // a bitcast from the pointer's element type to Val's type must be
- // legal. If it's not, we can try introspecting the type to find a
- // legal conversion.
-
- auto castValTy = [&](Constant *P) -> Constant * {
- Type *Ty = cast<PointerType>(P->getType())->getElementType();
- if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, Ty, DL)) {
- Ptr = P;
- return FV;
- }
- return nullptr;
- };
-
- Constant *NewVal =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, castValTy);
- if (!NewVal) {
- LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
- "evaluate.\n");
- return false;
- }
-
- Val = NewVal;
- LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
- }
- }
-
- MutatedMemory[Ptr] = Val;
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
- InstResult = ConstantExpr::get(BO->getOpcode(),
- getVal(BO->getOperand(0)),
- getVal(BO->getOperand(1)));
- LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: "
- << *InstResult << "\n");
- } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
- InstResult = ConstantExpr::getCompare(CI->getPredicate(),
- getVal(CI->getOperand(0)),
- getVal(CI->getOperand(1)));
- LLVM_DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
- << "\n");
- } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
- InstResult = ConstantExpr::getCast(CI->getOpcode(),
- getVal(CI->getOperand(0)),
- CI->getType());
- LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
- << "\n");
- } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
- InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
- getVal(SI->getOperand(1)),
- getVal(SI->getOperand(2)));
- LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
- << "\n");
- } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
- InstResult = ConstantExpr::getExtractValue(
- getVal(EVI->getAggregateOperand()), EVI->getIndices());
- LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: "
- << *InstResult << "\n");
- } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
- InstResult = ConstantExpr::getInsertValue(
- getVal(IVI->getAggregateOperand()),
- getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
- LLVM_DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: "
- << *InstResult << "\n");
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
- Constant *P = getVal(GEP->getOperand(0));
- SmallVector<Constant*, 8> GEPOps;
- for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
- i != e; ++i)
- GEPOps.push_back(getVal(*i));
- InstResult =
- ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
- cast<GEPOperator>(GEP)->isInBounds());
- LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n");
- } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
- if (!LI->isSimple()) {
- LLVM_DEBUG(
- dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
- return false; // no volatile/atomic accesses.
- }
-
- Constant *Ptr = getVal(LI->getOperand(0));
- Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
- if (Ptr != FoldedPtr) {
- Ptr = FoldedPtr;
- LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant "
- "folding: "
- << *Ptr << "\n");
- }
- InstResult = ComputeLoadResult(Ptr);
- if (!InstResult) {
- LLVM_DEBUG(
- dbgs() << "Failed to compute load result. Can not evaluate load."
- "\n");
- return false; // Could not evaluate load.
- }
-
- LLVM_DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
- } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
- if (AI->isArrayAllocation()) {
- LLVM_DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
- return false; // Cannot handle array allocs.
- }
- Type *Ty = AI->getAllocatedType();
- AllocaTmps.push_back(std::make_unique<GlobalVariable>(
- Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty),
- AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal,
- AI->getType()->getPointerAddressSpace()));
- InstResult = AllocaTmps.back().get();
- LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
- } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
- CallBase &CB = *cast<CallBase>(&*CurInst);
-
- // Debug info can safely be ignored here.
- if (isa<DbgInfoIntrinsic>(CB)) {
- LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
- ++CurInst;
- continue;
- }
-
- // Cannot handle inline asm.
- if (CB.isInlineAsm()) {
- LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
- return false;
- }
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CB)) {
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
- if (MSI->isVolatile()) {
- LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset "
- << "intrinsic.\n");
- return false;
- }
- Constant *Ptr = getVal(MSI->getDest());
- Constant *Val = getVal(MSI->getValue());
- Constant *DestVal = ComputeLoadResult(getVal(Ptr));
- if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
- // This memset is a no-op.
- LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n");
- ++CurInst;
- continue;
- }
- }
-
- if (II->isLifetimeStartOrEnd()) {
- LLVM_DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
- ++CurInst;
- continue;
- }
-
- if (II->getIntrinsicID() == Intrinsic::invariant_start) {
- // We don't insert an entry into Values, as it doesn't have a
- // meaningful return value.
- if (!II->use_empty()) {
- LLVM_DEBUG(dbgs()
- << "Found unused invariant_start. Can't evaluate.\n");
- return false;
- }
- ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
- Value *PtrArg = getVal(II->getArgOperand(1));
- Value *Ptr = PtrArg->stripPointerCasts();
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
- Type *ElemTy = GV->getValueType();
- if (!Size->isMinusOne() &&
- Size->getValue().getLimitedValue() >=
- DL.getTypeStoreSize(ElemTy)) {
- Invariants.insert(GV);
- LLVM_DEBUG(dbgs() << "Found a global var that is an invariant: "
- << *GV << "\n");
- } else {
- LLVM_DEBUG(dbgs()
- << "Found a global var, but can not treat it as an "
- "invariant.\n");
- }
- }
- // Continue even if we do nothing.
- ++CurInst;
- continue;
- } else if (II->getIntrinsicID() == Intrinsic::assume) {
- LLVM_DEBUG(dbgs() << "Skipping assume intrinsic.\n");
- ++CurInst;
- continue;
- } else if (II->getIntrinsicID() == Intrinsic::sideeffect) {
- LLVM_DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
- ++CurInst;
- continue;
+
+ if (Constant *Val = findMemLoc(P))
+ return Val;
+
+ // Access it.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (GV->hasDefinitiveInitializer())
+ return GV->getInitializer();
+ return nullptr;
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
+ switch (CE->getOpcode()) {
+ // Handle a constantexpr getelementptr.
+ case Instruction::GetElementPtr:
+ if (auto *I = getInitializer(CE->getOperand(0)))
+ return ConstantFoldLoadThroughGEPConstantExpr(I, CE);
+ break;
+ // Handle a constantexpr bitcast.
+ case Instruction::BitCast:
+ // We're evaluating a load through a pointer that was bitcast to a
+ // different type. See if the "from" pointer has recently been stored.
+ // If it hasn't, we may still be able to find a stored pointer by
+ // introspecting the type.
+ Constant *Val =
+ evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, findMemLoc);
+ if (!Val)
+ Val = getInitializer(CE->getOperand(0));
+ if (Val)
+ return ConstantFoldLoadThroughBitcast(
+ Val, P->getType()->getPointerElementType(), DL);
+ break;
+ }
+ }
+
+ return nullptr; // don't know how to evaluate.
+}
+
+static Function *getFunction(Constant *C) {
+ if (auto *Fn = dyn_cast<Function>(C))
+ return Fn;
+
+ if (auto *Alias = dyn_cast<GlobalAlias>(C))
+ if (auto *Fn = dyn_cast<Function>(Alias->getAliasee()))
+ return Fn;
+ return nullptr;
+}
+
+Function *
+Evaluator::getCalleeWithFormalArgs(CallBase &CB,
+ SmallVectorImpl<Constant *> &Formals) {
+ auto *V = CB.getCalledOperand();
+ if (auto *Fn = getFunction(getVal(V)))
+ return getFormalParams(CB, Fn, Formals) ? Fn : nullptr;
+
+ auto *CE = dyn_cast<ConstantExpr>(V);
+ if (!CE || CE->getOpcode() != Instruction::BitCast ||
+ !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals))
+ return nullptr;
+
+ return dyn_cast<Function>(
+ ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
+}
+
+bool Evaluator::getFormalParams(CallBase &CB, Function *F,
+ SmallVectorImpl<Constant *> &Formals) {
+ if (!F)
+ return false;
+
+ auto *FTy = F->getFunctionType();
+ if (FTy->getNumParams() > CB.getNumArgOperands()) {
+ LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
+ return false;
+ }
+
+ auto ArgI = CB.arg_begin();
+ for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE;
+ ++ParI) {
+ auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL);
+ if (!ArgC) {
+ LLVM_DEBUG(dbgs() << "Can not convert function argument.\n");
+ return false;
+ }
+ Formals.push_back(ArgC);
+ ++ArgI;
+ }
+ return true;
+}
+
+/// If call expression contains bitcast then we may need to cast
+/// evaluated return value to a type of the call expression.
+Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
+ if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
+ return RV;
+
+ if (auto *FT =
+ dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
+ RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
+ if (!RV)
+ LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
+ }
+ return RV;
+}
+
+/// Evaluate all instructions in block BB, returning true if successful, false
+/// if we can't evaluate it. NextBB returns the next BB that control flows into,
+/// or null upon return.
+bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
+ BasicBlock *&NextBB) {
+ // This is the main evaluation loop.
+ while (true) {
+ Constant *InstResult = nullptr;
+
+ LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+ if (!SI->isSimple()) {
+ LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+ Constant *Ptr = getVal(SI->getOperand(1));
+ Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
+ if (Ptr != FoldedPtr) {
+ LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
+ Ptr = FoldedPtr;
+ LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+ }
+ if (!isSimpleEnoughPointerToCommit(Ptr)) {
+ // If this is too complex for us to commit, reject it.
+ LLVM_DEBUG(
+ dbgs() << "Pointer is too complex for us to evaluate store.");
+ return false;
+ }
+
+ Constant *Val = getVal(SI->getOperand(0));
+
+ // If this might be too difficult for the backend to handle (e.g. the addr
+ // of one global variable divided by another) then we can't commit it.
+ if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
+ LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
+ << *Val << "\n");
+ return false;
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ if (CE->getOpcode() == Instruction::BitCast) {
+ LLVM_DEBUG(dbgs()
+ << "Attempting to resolve bitcast on constant ptr.\n");
+ // If we're evaluating a store through a bitcast, then we need
+ // to pull the bitcast off the pointer type and push it onto the
+ // stored value. In order to push the bitcast onto the stored value,
+ // a bitcast from the pointer's element type to Val's type must be
+ // legal. If it's not, we can try introspecting the type to find a
+ // legal conversion.
+
+ auto castValTy = [&](Constant *P) -> Constant * {
+ Type *Ty = cast<PointerType>(P->getType())->getElementType();
+ if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, Ty, DL)) {
+ Ptr = P;
+ return FV;
+ }
+ return nullptr;
+ };
+
+ Constant *NewVal =
+ evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, castValTy);
+ if (!NewVal) {
+ LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+ "evaluate.\n");
+ return false;
+ }
+
+ Val = NewVal;
+ LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
+ }
+ }
+
+ MutatedMemory[Ptr] = Val;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+ InstResult = ConstantExpr::get(BO->getOpcode(),
+ getVal(BO->getOperand(0)),
+ getVal(BO->getOperand(1)));
+ LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: "
+ << *InstResult << "\n");
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+ InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+ getVal(CI->getOperand(0)),
+ getVal(CI->getOperand(1)));
+ LLVM_DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+ InstResult = ConstantExpr::getCast(CI->getOpcode(),
+ getVal(CI->getOperand(0)),
+ CI->getType());
+ LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+ << "\n");
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+ InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
+ getVal(SI->getOperand(1)),
+ getVal(SI->getOperand(2)));
+ LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+ << "\n");
+ } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getExtractValue(
+ getVal(EVI->getAggregateOperand()), EVI->getIndices());
+ LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: "
+ << *InstResult << "\n");
+ } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getInsertValue(
+ getVal(IVI->getAggregateOperand()),
+ getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
+ LLVM_DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: "
+ << *InstResult << "\n");
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+ Constant *P = getVal(GEP->getOperand(0));
+ SmallVector<Constant*, 8> GEPOps;
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+ i != e; ++i)
+ GEPOps.push_back(getVal(*i));
+ InstResult =
+ ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
+ cast<GEPOperator>(GEP)->isInBounds());
+ LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n");
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+ if (!LI->isSimple()) {
+ LLVM_DEBUG(
+ dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+
+ Constant *Ptr = getVal(LI->getOperand(0));
+ Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
+ if (Ptr != FoldedPtr) {
+ Ptr = FoldedPtr;
+ LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant "
+ "folding: "
+ << *Ptr << "\n");
+ }
+ InstResult = ComputeLoadResult(Ptr);
+ if (!InstResult) {
+ LLVM_DEBUG(
+ dbgs() << "Failed to compute load result. Can not evaluate load."
+ "\n");
+ return false; // Could not evaluate load.
+ }
+
+ LLVM_DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
+ } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+ if (AI->isArrayAllocation()) {
+ LLVM_DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+ return false; // Cannot handle array allocs.
+ }
+ Type *Ty = AI->getAllocatedType();
+ AllocaTmps.push_back(std::make_unique<GlobalVariable>(
+ Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty),
+ AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal,
+ AI->getType()->getPointerAddressSpace()));
+ InstResult = AllocaTmps.back().get();
+ LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
+ } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
+ CallBase &CB = *cast<CallBase>(&*CurInst);
+
+ // Debug info can safely be ignored here.
+ if (isa<DbgInfoIntrinsic>(CB)) {
+ LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
+ ++CurInst;
+ continue;
+ }
+
+ // Cannot handle inline asm.
+ if (CB.isInlineAsm()) {
+ LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+ return false;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CB)) {
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
+ if (MSI->isVolatile()) {
+ LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset "
+ << "intrinsic.\n");
+ return false;
+ }
+ Constant *Ptr = getVal(MSI->getDest());
+ Constant *Val = getVal(MSI->getValue());
+ Constant *DestVal = ComputeLoadResult(getVal(Ptr));
+ if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+ // This memset is a no-op.
+ LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n");
+ ++CurInst;
+ continue;
+ }
+ }
+
+ if (II->isLifetimeStartOrEnd()) {
+ LLVM_DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
+ ++CurInst;
+ continue;
+ }
+
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ // We don't insert an entry into Values, as it doesn't have a
+ // meaningful return value.
+ if (!II->use_empty()) {
+ LLVM_DEBUG(dbgs()
+ << "Found unused invariant_start. Can't evaluate.\n");
+ return false;
+ }
+ ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
+ Value *PtrArg = getVal(II->getArgOperand(1));
+ Value *Ptr = PtrArg->stripPointerCasts();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ Type *ElemTy = GV->getValueType();
+ if (!Size->isMinusOne() &&
+ Size->getValue().getLimitedValue() >=
+ DL.getTypeStoreSize(ElemTy)) {
+ Invariants.insert(GV);
+ LLVM_DEBUG(dbgs() << "Found a global var that is an invariant: "
+ << *GV << "\n");
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "Found a global var, but can not treat it as an "
+ "invariant.\n");
+ }
+ }
+ // Continue even if we do nothing.
+ ++CurInst;
+ continue;
+ } else if (II->getIntrinsicID() == Intrinsic::assume) {
+ LLVM_DEBUG(dbgs() << "Skipping assume intrinsic.\n");
+ ++CurInst;
+ continue;
+ } else if (II->getIntrinsicID() == Intrinsic::sideeffect) {
+ LLVM_DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
+ ++CurInst;
+ continue;
} else if (II->getIntrinsicID() == Intrinsic::pseudoprobe) {
LLVM_DEBUG(dbgs() << "Skipping pseudoprobe intrinsic.\n");
++CurInst;
continue;
- }
-
- LLVM_DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
- return false;
- }
-
- // Resolve function pointers.
- SmallVector<Constant *, 8> Formals;
- Function *Callee = getCalleeWithFormalArgs(CB, Formals);
- if (!Callee || Callee->isInterposable()) {
- LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");
- return false; // Cannot resolve.
- }
-
- if (Callee->isDeclaration()) {
- // If this is a function we can constant fold, do it.
- if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C);
- if (!InstResult)
- return false;
- LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
- << *InstResult << "\n");
- } else {
- LLVM_DEBUG(dbgs() << "Can not constant fold function call.\n");
- return false;
- }
- } else {
- if (Callee->getFunctionType()->isVarArg()) {
- LLVM_DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
- return false;
- }
-
- Constant *RetVal = nullptr;
-      // Execute the call; if successful, use the return value.
- ValueStack.emplace_back();
- if (!EvaluateFunction(Callee, RetVal, Formals)) {
- LLVM_DEBUG(dbgs() << "Failed to evaluate function.\n");
- return false;
- }
- ValueStack.pop_back();
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal);
- if (RetVal && !InstResult)
- return false;
-
- if (InstResult) {
- LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: "
- << *InstResult << "\n\n");
- } else {
- LLVM_DEBUG(dbgs()
- << "Successfully evaluated function. Result: 0\n\n");
- }
- }
- } else if (CurInst->isTerminator()) {
- LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");
-
- if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
- if (BI->isUnconditional()) {
- NextBB = BI->getSuccessor(0);
- } else {
- ConstantInt *Cond =
- dyn_cast<ConstantInt>(getVal(BI->getCondition()));
- if (!Cond) return false; // Cannot determine.
-
- NextBB = BI->getSuccessor(!Cond->getZExtValue());
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
- ConstantInt *Val =
- dyn_cast<ConstantInt>(getVal(SI->getCondition()));
- if (!Val) return false; // Cannot determine.
- NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
- } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
- Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
- if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
- NextBB = BA->getBasicBlock();
- else
- return false; // Cannot determine.
- } else if (isa<ReturnInst>(CurInst)) {
- NextBB = nullptr;
- } else {
- // invoke, unwind, resume, unreachable.
- LLVM_DEBUG(dbgs() << "Can not handle terminator.");
- return false; // Cannot handle this terminator.
- }
-
- // We succeeded at evaluating this block!
- LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n");
- return true;
- } else {
- // Did not know how to evaluate this!
- LLVM_DEBUG(
- dbgs() << "Failed to evaluate block due to unhandled instruction."
- "\n");
- return false;
- }
-
- if (!CurInst->use_empty()) {
- InstResult = ConstantFoldConstant(InstResult, DL, TLI);
- setVal(&*CurInst, InstResult);
- }
-
- // If we just processed an invoke, we finished evaluating the block.
- if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
- NextBB = II->getNormalDest();
- LLVM_DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
- return true;
- }
-
- // Advance program counter.
- ++CurInst;
- }
-}
-
-/// Evaluate a call to function F, returning true if successful, false if we
-/// can't evaluate it. ActualArgs contains the actual argument values for the
-/// function.
-bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
- const SmallVectorImpl<Constant*> &ActualArgs) {
- // Check to see if this function is already executing (recursion). If so,
- // bail out. TODO: we might want to accept limited recursion.
- if (is_contained(CallStack, F))
- return false;
-
- CallStack.push_back(F);
-
- // Initialize arguments to the incoming values specified.
- unsigned ArgNo = 0;
- for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
- ++AI, ++ArgNo)
- setVal(&*AI, ActualArgs[ArgNo]);
-
- // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
- // we can only evaluate any one basic block at most once. This set keeps
- // track of what we have executed so we can detect recursive cases etc.
- SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
-
- // CurBB - The current basic block we're evaluating.
- BasicBlock *CurBB = &F->front();
-
- BasicBlock::iterator CurInst = CurBB->begin();
-
- while (true) {
- BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
- LLVM_DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
-
- if (!EvaluateBlock(CurInst, NextBB))
- return false;
-
- if (!NextBB) {
- // Successfully running until there's no next block means that we found
-      // the return. Fill in the return value and pop the call stack.
- ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
- if (RI->getNumOperands())
- RetVal = getVal(RI->getOperand(0));
- CallStack.pop_back();
- return true;
- }
-
- // Okay, we succeeded in evaluating this control flow. See if we have
- // executed the new block before. If so, we have a looping function,
- // which we cannot evaluate in reasonable time.
- if (!ExecutedBlocks.insert(NextBB).second)
- return false; // looped!
-
- // Okay, we have never been in this block before. Check to see if there
- // are any PHI nodes. If so, evaluate them with information about where
- // we came from.
- PHINode *PN = nullptr;
- for (CurInst = NextBB->begin();
- (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
- setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
-
- // Advance to the next block.
- CurBB = NextBB;
- }
-}
+ }
+
+ LLVM_DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
+ return false;
+ }
+
+ // Resolve function pointers.
+ SmallVector<Constant *, 8> Formals;
+ Function *Callee = getCalleeWithFormalArgs(CB, Formals);
+ if (!Callee || Callee->isInterposable()) {
+ LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");
+ return false; // Cannot resolve.
+ }
+
+ if (Callee->isDeclaration()) {
+ // If this is a function we can constant fold, do it.
+ if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
+ InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C);
+ if (!InstResult)
+ return false;
+ LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
+ << *InstResult << "\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Can not constant fold function call.\n");
+ return false;
+ }
+ } else {
+ if (Callee->getFunctionType()->isVarArg()) {
+ LLVM_DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
+ return false;
+ }
+
+ Constant *RetVal = nullptr;
+ // Execute the call, if successful, use the return value.
+ ValueStack.emplace_back();
+ if (!EvaluateFunction(Callee, RetVal, Formals)) {
+ LLVM_DEBUG(dbgs() << "Failed to evaluate function.\n");
+ return false;
+ }
+ ValueStack.pop_back();
+ InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal);
+ if (RetVal && !InstResult)
+ return false;
+
+ if (InstResult) {
+ LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: "
+ << *InstResult << "\n\n");
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "Successfully evaluated function. Result: 0\n\n");
+ }
+ }
+ } else if (CurInst->isTerminator()) {
+ LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+ if (BI->isUnconditional()) {
+ NextBB = BI->getSuccessor(0);
+ } else {
+ ConstantInt *Cond =
+ dyn_cast<ConstantInt>(getVal(BI->getCondition()));
+ if (!Cond) return false; // Cannot determine.
+
+ NextBB = BI->getSuccessor(!Cond->getZExtValue());
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+ ConstantInt *Val =
+ dyn_cast<ConstantInt>(getVal(SI->getCondition()));
+ if (!Val) return false; // Cannot determine.
+ NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
+ } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
+ Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
+ NextBB = BA->getBasicBlock();
+ else
+ return false; // Cannot determine.
+ } else if (isa<ReturnInst>(CurInst)) {
+ NextBB = nullptr;
+ } else {
+ // invoke, unwind, resume, unreachable.
+ LLVM_DEBUG(dbgs() << "Can not handle terminator.");
+ return false; // Cannot handle this terminator.
+ }
+
+ // We succeeded at evaluating this block!
+ LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n");
+ return true;
+ } else {
+ // Did not know how to evaluate this!
+ LLVM_DEBUG(
+ dbgs() << "Failed to evaluate block due to unhandled instruction."
+ "\n");
+ return false;
+ }
+
+ if (!CurInst->use_empty()) {
+ InstResult = ConstantFoldConstant(InstResult, DL, TLI);
+ setVal(&*CurInst, InstResult);
+ }
+
+ // If we just processed an invoke, we finished evaluating the block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
+ NextBB = II->getNormalDest();
+ LLVM_DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
+ return true;
+ }
+
+ // Advance program counter.
+ ++CurInst;
+ }
+}
+
+/// Evaluate a call to function F, returning true if successful, false if we
+/// can't evaluate it. ActualArgs contains the actual arguments passed to the
+/// function.
+bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
+ const SmallVectorImpl<Constant*> &ActualArgs) {
+ // Check to see if this function is already executing (recursion). If so,
+ // bail out. TODO: we might want to accept limited recursion.
+ if (is_contained(CallStack, F))
+ return false;
+
+ CallStack.push_back(F);
+
+ // Initialize arguments to the incoming values specified.
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++ArgNo)
+ setVal(&*AI, ActualArgs[ArgNo]);
+
+  // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
+  // we evaluate any one basic block at most once. This set tracks the blocks
+  // we have already executed so we can detect loops and bail out.
+ SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+
+ // CurBB - The current basic block we're evaluating.
+ BasicBlock *CurBB = &F->front();
+
+ BasicBlock::iterator CurInst = CurBB->begin();
+
+ while (true) {
+ BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
+ LLVM_DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+
+ if (!EvaluateBlock(CurInst, NextBB))
+ return false;
+
+ if (!NextBB) {
+ // Successfully running until there's no next block means that we found
+      // the return. Fill in the return value and pop the call stack.
+ ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
+ if (RI->getNumOperands())
+ RetVal = getVal(RI->getOperand(0));
+ CallStack.pop_back();
+ return true;
+ }
+
+ // Okay, we succeeded in evaluating this control flow. See if we have
+ // executed the new block before. If so, we have a looping function,
+ // which we cannot evaluate in reasonable time.
+ if (!ExecutedBlocks.insert(NextBB).second)
+ return false; // looped!
+
+ // Okay, we have never been in this block before. Check to see if there
+ // are any PHI nodes. If so, evaluate them with information about where
+ // we came from.
+ PHINode *PN = nullptr;
+ for (CurInst = NextBB->begin();
+ (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+ setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
+
+ // Advance to the next block.
+ CurBB = NextBB;
+ }
+}
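For orientation, here is a minimal sketch of how the EvaluateFunction entry point above is typically driven by a caller such as GlobalOpt when it tries to fold a static constructor at compile time. The helper below is hypothetical, and the Evaluator constructor arguments (DataLayout, TargetLibraryInfo) are assumed from the class's usual interface rather than shown in this hunk.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Evaluator.h"

// Sketch: returns true if F could be fully evaluated at compile time; on
// success RetVal holds the folded return value (null for a void return).
static bool tryFoldAtCompileTime(llvm::Function &F, const llvm::DataLayout &DL,
                                 const llvm::TargetLibraryInfo *TLI) {
  llvm::Evaluator Eval(DL, TLI);                 // assumed constructor signature
  llvm::Constant *RetVal = nullptr;
  llvm::SmallVector<llvm::Constant *, 0> NoArgs; // evaluate with no arguments
  return Eval.EvaluateFunction(&F, RetVal, NoArgs);
}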
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp
index 29fa7f12d7..44af95eef6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp
@@ -1,336 +1,336 @@
-//===- FixIrreducible.cpp - Convert irreducible control-flow into loops ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// An irreducible SCC is one which has multiple "header" blocks, i.e., blocks
-// with control-flow edges incident from outside the SCC. This pass converts an
-// irreducible SCC into a natural loop by applying the following transformation:
-//
-// 1. Collect the set of headers H of the SCC.
-// 2. Collect the set of predecessors P of these headers. These may be inside as
-// well as outside the SCC.
-// 3. Create block N and redirect every edge from set P to set H through N.
-//
-// This converts the SCC into a natural loop with N as the header: N is the only
-// block with edges incident from outside the SCC, and all backedges in the SCC
-// are incident on N, i.e., for every backedge, the head now dominates the tail.
-//
-// INPUT CFG: The blocks A and B form an irreducible loop with two headers.
-//
-// Entry
-// / \
-// v v
-// A ----> B
-// ^ /|
-// `----' |
-// v
-// Exit
-//
-// OUTPUT CFG: Edges incident on A and B are now redirected through a
-// new block N, forming a natural loop consisting of N, A and B.
-//
-// Entry
-// |
-// v
-// .---> N <---.
-// / / \ \
-// | / \ |
-// \ v v /
-// `-- A B --'
-// |
-// v
-// Exit
-//
-// The transformation is applied to every maximal SCC that is not already
-// recognized as a loop. The pass operates on all maximal SCCs found in the
-// function body outside of any loop, as well as those found inside each loop,
-// including inside any newly created loops. This ensures that any SCC hidden
-// inside a maximal SCC is also transformed.
-//
-// The actual transformation is handled by function CreateControlFlowHub, which
-// takes a set of incoming blocks (the predecessors) and outgoing blocks (the
-// headers). The function also moves every PHINode in an outgoing block to the
-// hub. Since the hub dominates all the outgoing blocks, each such PHINode
-// continues to dominate its uses. Since every header in an SCC has at least two
-// predecessors, every value used in the header (or later) but defined in a
-// predecessor (or earlier) is represented by a PHINode in a header. Hence the
-// above handling of PHINodes is sufficient and no further processing is
-// required to restore SSA.
-//
-// Limitation: The pass cannot handle switch statements and indirect
-// branches. Both must be lowered to plain branches first.
-//
-//===----------------------------------------------------------------------===//
-
+//===- FixIrreducible.cpp - Convert irreducible control-flow into loops ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An irreducible SCC is one which has multiple "header" blocks, i.e., blocks
+// with control-flow edges incident from outside the SCC. This pass converts an
+// irreducible SCC into a natural loop by applying the following transformation:
+//
+// 1. Collect the set of headers H of the SCC.
+// 2. Collect the set of predecessors P of these headers. These may be inside as
+// well as outside the SCC.
+// 3. Create block N and redirect every edge from set P to set H through N.
+//
+// This converts the SCC into a natural loop with N as the header: N is the only
+// block with edges incident from outside the SCC, and all backedges in the SCC
+// are incident on N, i.e., for every backedge, the head now dominates the tail.
+//
+// INPUT CFG: The blocks A and B form an irreducible loop with two headers.
+//
+// Entry
+// / \
+// v v
+// A ----> B
+// ^ /|
+// `----' |
+// v
+// Exit
+//
+// OUTPUT CFG: Edges incident on A and B are now redirected through a
+// new block N, forming a natural loop consisting of N, A and B.
+//
+// Entry
+// |
+// v
+// .---> N <---.
+// / / \ \
+// | / \ |
+// \ v v /
+// `-- A B --'
+// |
+// v
+// Exit
+//
+// The transformation is applied to every maximal SCC that is not already
+// recognized as a loop. The pass operates on all maximal SCCs found in the
+// function body outside of any loop, as well as those found inside each loop,
+// including inside any newly created loops. This ensures that any SCC hidden
+// inside a maximal SCC is also transformed.
+//
+// The actual transformation is handled by function CreateControlFlowHub, which
+// takes a set of incoming blocks (the predecessors) and outgoing blocks (the
+// headers). The function also moves every PHINode in an outgoing block to the
+// hub. Since the hub dominates all the outgoing blocks, each such PHINode
+// continues to dominate its uses. Since every header in an SCC has at least two
+// predecessors, every value used in the header (or later) but defined in a
+// predecessor (or earlier) is represented by a PHINode in a header. Hence the
+// above handling of PHINodes is sufficient and no further processing is
+// required to restore SSA.
+//
+// Limitation: The pass cannot handle switch statements and indirect
+// branches. Both must be lowered to plain branches first.
+//
+//===----------------------------------------------------------------------===//
+
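Given the limitation noted above, a hand-assembled pipeline must lower switch statements before this pass runs; getAnalysisUsage below expresses the same constraint through LowerSwitchID. The following is a hedged sketch using the legacy pass manager: the wrapper function is hypothetical, and the two create* entry points are assumed to be the ones declared in llvm/Transforms/Utils.h.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Utils.h"

// Sketch: run LowerSwitch first so FixIrreducible only ever sees plain
// conditional and unconditional branches.
static void addIrreducibleFixup(llvm::legacy::FunctionPassManager &FPM) {
  FPM.add(llvm::createLowerSwitchPass());
  FPM.add(llvm::createFixIrreduciblePass());
}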
#include "llvm/Transforms/Utils/FixIrreducible.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-#define DEBUG_TYPE "fix-irreducible"
-
-using namespace llvm;
-
-namespace {
-struct FixIrreducible : public FunctionPass {
- static char ID;
- FixIrreducible() : FunctionPass(ID) {
- initializeFixIrreduciblePass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LowerSwitchID);
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreservedID(LowerSwitchID);
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-};
-} // namespace
-
-char FixIrreducible::ID = 0;
-
-FunctionPass *llvm::createFixIrreduciblePass() { return new FixIrreducible(); }
-
-INITIALIZE_PASS_BEGIN(FixIrreducible, "fix-irreducible",
- "Convert irreducible control-flow into natural loops",
- false /* Only looks at CFG */, false /* Analysis Pass */)
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "fix-irreducible"
+
+using namespace llvm;
+
+namespace {
+struct FixIrreducible : public FunctionPass {
+ static char ID;
+ FixIrreducible() : FunctionPass(ID) {
+ initializeFixIrreduciblePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char FixIrreducible::ID = 0;
+
+FunctionPass *llvm::createFixIrreduciblePass() { return new FixIrreducible(); }
+
+INITIALIZE_PASS_BEGIN(FixIrreducible, "fix-irreducible",
+ "Convert irreducible control-flow into natural loops",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(FixIrreducible, "fix-irreducible",
- "Convert irreducible control-flow into natural loops",
- false /* Only looks at CFG */, false /* Analysis Pass */)
-
-// When a new loop is created, existing children of the parent loop may now be
-// fully inside the new loop. Reconnect these as children of the new loop.
-static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
- auto &CandidateLoops = ParentLoop ? ParentLoop->getSubLoopsVector()
- : LI.getTopLevelLoopsVector();
- // The new loop cannot be its own child, and any candidate is a
- // child iff its header is owned by the new loop. Move all the
- // children to a new vector.
- auto FirstChild = std::partition(
- CandidateLoops.begin(), CandidateLoops.end(), [&](Loop *L) {
- return L == NewLoop || Blocks.count(L->getHeader()) == 0;
- });
- SmallVector<Loop *, 8> ChildLoops(FirstChild, CandidateLoops.end());
- CandidateLoops.erase(FirstChild, CandidateLoops.end());
-
- for (auto II = ChildLoops.begin(), IE = ChildLoops.end(); II != IE; ++II) {
- auto Child = *II;
- LLVM_DEBUG(dbgs() << "child loop: " << Child->getHeader()->getName()
- << "\n");
- // TODO: A child loop whose header is also a header in the current
- // SCC gets destroyed since its backedges are removed. That may
- // not be necessary if we can retain such backedges.
- if (Headers.count(Child->getHeader())) {
- for (auto BB : Child->blocks()) {
- LI.changeLoopFor(BB, NewLoop);
- LLVM_DEBUG(dbgs() << "moved block from child: " << BB->getName()
- << "\n");
- }
- LI.destroy(Child);
- LLVM_DEBUG(dbgs() << "subsumed child loop (common header)\n");
- continue;
- }
-
- Child->setParentLoop(nullptr);
- NewLoop->addChildLoop(Child);
- LLVM_DEBUG(dbgs() << "added child loop to new loop\n");
- }
-}
-
-// Given a set of blocks and headers in an irreducible SCC, convert it into a
-// natural loop. Also insert this new loop at its appropriate place in the
-// hierarchy of loops.
-static void createNaturalLoopInternal(LoopInfo &LI, DominatorTree &DT,
- Loop *ParentLoop,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
-#ifndef NDEBUG
- // All headers are part of the SCC
- for (auto H : Headers) {
- assert(Blocks.count(H));
- }
-#endif
-
- SetVector<BasicBlock *> Predecessors;
- for (auto H : Headers) {
- for (auto P : predecessors(H)) {
- Predecessors.insert(P);
- }
- }
-
- LLVM_DEBUG(
- dbgs() << "Found predecessors:";
- for (auto P : Predecessors) {
- dbgs() << " " << P->getName();
- }
- dbgs() << "\n");
-
- // Redirect all the backedges through a "hub" consisting of a series
- // of guard blocks that manage the flow of control from the
- // predecessors to the headers.
- SmallVector<BasicBlock *, 8> GuardBlocks;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- CreateControlFlowHub(&DTU, GuardBlocks, Predecessors, Headers, "irr");
-#if defined(EXPENSIVE_CHECKS)
- assert(DT.verify(DominatorTree::VerificationLevel::Full));
-#else
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-#endif
-
- // Create a new loop from the now-transformed cycle
- auto NewLoop = LI.AllocateLoop();
- if (ParentLoop) {
- ParentLoop->addChildLoop(NewLoop);
- } else {
- LI.addTopLevelLoop(NewLoop);
- }
-
- // Add the guard blocks to the new loop. The first guard block is
- // the head of all the backedges, and it is the first to be inserted
- // in the loop. This ensures that it is recognized as the
- // header. Since the new loop is already in LoopInfo, the new blocks
- // are also propagated up the chain of parent loops.
- for (auto G : GuardBlocks) {
- LLVM_DEBUG(dbgs() << "added guard block: " << G->getName() << "\n");
- NewLoop->addBasicBlockToLoop(G, LI);
- }
-
- // Add the SCC blocks to the new loop.
- for (auto BB : Blocks) {
- NewLoop->addBlockEntry(BB);
- if (LI.getLoopFor(BB) == ParentLoop) {
- LLVM_DEBUG(dbgs() << "moved block from parent: " << BB->getName()
- << "\n");
- LI.changeLoopFor(BB, NewLoop);
- } else {
- LLVM_DEBUG(dbgs() << "added block from child: " << BB->getName() << "\n");
- }
- }
- LLVM_DEBUG(dbgs() << "header for new loop: "
- << NewLoop->getHeader()->getName() << "\n");
-
- reconnectChildLoops(LI, ParentLoop, NewLoop, Blocks, Headers);
-
- NewLoop->verifyLoop();
- if (ParentLoop) {
- ParentLoop->verifyLoop();
- }
-#if defined(EXPENSIVE_CHECKS)
- LI.verify(DT);
-#endif // EXPENSIVE_CHECKS
-}
-
-namespace llvm {
-// Enable the graph traits required for traversing a Loop body.
-template <> struct GraphTraits<Loop> : LoopBodyTraits {};
-} // namespace llvm
-
-// Overloaded wrappers to go with the function template below.
-static BasicBlock *unwrapBlock(BasicBlock *B) { return B; }
-static BasicBlock *unwrapBlock(LoopBodyTraits::NodeRef &N) { return N.second; }
-
-static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Function *F,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
- createNaturalLoopInternal(LI, DT, nullptr, Blocks, Headers);
-}
-
-static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Loop &L,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
- createNaturalLoopInternal(LI, DT, &L, Blocks, Headers);
-}
-
-// Convert irreducible SCCs; Graph G may be a Function* or a Loop&.
-template <class Graph>
-static bool makeReducible(LoopInfo &LI, DominatorTree &DT, Graph &&G) {
- bool Changed = false;
- for (auto Scc = scc_begin(G); !Scc.isAtEnd(); ++Scc) {
- if (Scc->size() < 2)
- continue;
- SetVector<BasicBlock *> Blocks;
- LLVM_DEBUG(dbgs() << "Found SCC:");
- for (auto N : *Scc) {
- auto BB = unwrapBlock(N);
- LLVM_DEBUG(dbgs() << " " << BB->getName());
- Blocks.insert(BB);
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- // Minor optimization: The SCC blocks are usually discovered in an order
- // that is the opposite of the order in which these blocks appear as branch
- // targets. This results in a lot of condition inversions in the control
- // flow out of the new ControlFlowHub, which can be mitigated if the orders
- // match. So we discover the headers using the reverse of the block order.
- SetVector<BasicBlock *> Headers;
- LLVM_DEBUG(dbgs() << "Found headers:");
- for (auto BB : reverse(Blocks)) {
- for (const auto P : predecessors(BB)) {
- // Skip unreachable predecessors.
- if (!DT.isReachableFromEntry(P))
- continue;
- if (!Blocks.count(P)) {
- LLVM_DEBUG(dbgs() << " " << BB->getName());
- Headers.insert(BB);
- break;
- }
- }
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- if (Headers.size() == 1) {
- assert(LI.isLoopHeader(Headers.front()));
- LLVM_DEBUG(dbgs() << "Natural loop with a single header: skipped\n");
- continue;
- }
- createNaturalLoop(LI, DT, G, Blocks, Headers);
- Changed = true;
- }
- return Changed;
-}
-
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(FixIrreducible, "fix-irreducible",
+ "Convert irreducible control-flow into natural loops",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+
+// When a new loop is created, existing children of the parent loop may now be
+// fully inside the new loop. Reconnect these as children of the new loop.
+static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ auto &CandidateLoops = ParentLoop ? ParentLoop->getSubLoopsVector()
+ : LI.getTopLevelLoopsVector();
+ // The new loop cannot be its own child, and any candidate is a
+ // child iff its header is owned by the new loop. Move all the
+ // children to a new vector.
+ auto FirstChild = std::partition(
+ CandidateLoops.begin(), CandidateLoops.end(), [&](Loop *L) {
+ return L == NewLoop || Blocks.count(L->getHeader()) == 0;
+ });
+ SmallVector<Loop *, 8> ChildLoops(FirstChild, CandidateLoops.end());
+ CandidateLoops.erase(FirstChild, CandidateLoops.end());
+
+ for (auto II = ChildLoops.begin(), IE = ChildLoops.end(); II != IE; ++II) {
+ auto Child = *II;
+ LLVM_DEBUG(dbgs() << "child loop: " << Child->getHeader()->getName()
+ << "\n");
+ // TODO: A child loop whose header is also a header in the current
+ // SCC gets destroyed since its backedges are removed. That may
+ // not be necessary if we can retain such backedges.
+ if (Headers.count(Child->getHeader())) {
+ for (auto BB : Child->blocks()) {
+ LI.changeLoopFor(BB, NewLoop);
+ LLVM_DEBUG(dbgs() << "moved block from child: " << BB->getName()
+ << "\n");
+ }
+ LI.destroy(Child);
+ LLVM_DEBUG(dbgs() << "subsumed child loop (common header)\n");
+ continue;
+ }
+
+ Child->setParentLoop(nullptr);
+ NewLoop->addChildLoop(Child);
+ LLVM_DEBUG(dbgs() << "added child loop to new loop\n");
+ }
+}
+
+// Given a set of blocks and headers in an irreducible SCC, convert it into a
+// natural loop. Also insert this new loop at its appropriate place in the
+// hierarchy of loops.
+static void createNaturalLoopInternal(LoopInfo &LI, DominatorTree &DT,
+ Loop *ParentLoop,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+#ifndef NDEBUG
+ // All headers are part of the SCC
+ for (auto H : Headers) {
+ assert(Blocks.count(H));
+ }
+#endif
+
+ SetVector<BasicBlock *> Predecessors;
+ for (auto H : Headers) {
+ for (auto P : predecessors(H)) {
+ Predecessors.insert(P);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Found predecessors:";
+ for (auto P : Predecessors) {
+ dbgs() << " " << P->getName();
+ }
+ dbgs() << "\n");
+
+ // Redirect all the backedges through a "hub" consisting of a series
+ // of guard blocks that manage the flow of control from the
+ // predecessors to the headers.
+ SmallVector<BasicBlock *, 8> GuardBlocks;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ CreateControlFlowHub(&DTU, GuardBlocks, Predecessors, Headers, "irr");
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full));
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+#endif
+
+ // Create a new loop from the now-transformed cycle
+ auto NewLoop = LI.AllocateLoop();
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(NewLoop);
+ } else {
+ LI.addTopLevelLoop(NewLoop);
+ }
+
+ // Add the guard blocks to the new loop. The first guard block is
+ // the head of all the backedges, and it is the first to be inserted
+ // in the loop. This ensures that it is recognized as the
+ // header. Since the new loop is already in LoopInfo, the new blocks
+ // are also propagated up the chain of parent loops.
+ for (auto G : GuardBlocks) {
+ LLVM_DEBUG(dbgs() << "added guard block: " << G->getName() << "\n");
+ NewLoop->addBasicBlockToLoop(G, LI);
+ }
+
+ // Add the SCC blocks to the new loop.
+ for (auto BB : Blocks) {
+ NewLoop->addBlockEntry(BB);
+ if (LI.getLoopFor(BB) == ParentLoop) {
+ LLVM_DEBUG(dbgs() << "moved block from parent: " << BB->getName()
+ << "\n");
+ LI.changeLoopFor(BB, NewLoop);
+ } else {
+ LLVM_DEBUG(dbgs() << "added block from child: " << BB->getName() << "\n");
+ }
+ }
+ LLVM_DEBUG(dbgs() << "header for new loop: "
+ << NewLoop->getHeader()->getName() << "\n");
+
+ reconnectChildLoops(LI, ParentLoop, NewLoop, Blocks, Headers);
+
+ NewLoop->verifyLoop();
+ if (ParentLoop) {
+ ParentLoop->verifyLoop();
+ }
+#if defined(EXPENSIVE_CHECKS)
+ LI.verify(DT);
+#endif // EXPENSIVE_CHECKS
+}
+
+namespace llvm {
+// Enable the graph traits required for traversing a Loop body.
+template <> struct GraphTraits<Loop> : LoopBodyTraits {};
+} // namespace llvm
+
+// Overloaded wrappers to go with the function template below.
+static BasicBlock *unwrapBlock(BasicBlock *B) { return B; }
+static BasicBlock *unwrapBlock(LoopBodyTraits::NodeRef &N) { return N.second; }
+
+static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Function *F,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ createNaturalLoopInternal(LI, DT, nullptr, Blocks, Headers);
+}
+
+static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Loop &L,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ createNaturalLoopInternal(LI, DT, &L, Blocks, Headers);
+}
+
+// Convert irreducible SCCs; Graph G may be a Function* or a Loop&.
+template <class Graph>
+static bool makeReducible(LoopInfo &LI, DominatorTree &DT, Graph &&G) {
+ bool Changed = false;
+ for (auto Scc = scc_begin(G); !Scc.isAtEnd(); ++Scc) {
+ if (Scc->size() < 2)
+ continue;
+ SetVector<BasicBlock *> Blocks;
+ LLVM_DEBUG(dbgs() << "Found SCC:");
+ for (auto N : *Scc) {
+ auto BB = unwrapBlock(N);
+ LLVM_DEBUG(dbgs() << " " << BB->getName());
+ Blocks.insert(BB);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Minor optimization: The SCC blocks are usually discovered in an order
+ // that is the opposite of the order in which these blocks appear as branch
+ // targets. This results in a lot of condition inversions in the control
+ // flow out of the new ControlFlowHub, which can be mitigated if the orders
+ // match. So we discover the headers using the reverse of the block order.
+ SetVector<BasicBlock *> Headers;
+ LLVM_DEBUG(dbgs() << "Found headers:");
+ for (auto BB : reverse(Blocks)) {
+ for (const auto P : predecessors(BB)) {
+ // Skip unreachable predecessors.
+ if (!DT.isReachableFromEntry(P))
+ continue;
+ if (!Blocks.count(P)) {
+ LLVM_DEBUG(dbgs() << " " << BB->getName());
+ Headers.insert(BB);
+ break;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ if (Headers.size() == 1) {
+ assert(LI.isLoopHeader(Headers.front()));
+ LLVM_DEBUG(dbgs() << "Natural loop with a single header: skipped\n");
+ continue;
+ }
+ createNaturalLoop(LI, DT, G, Blocks, Headers);
+ Changed = true;
+ }
+ return Changed;
+}
+
static bool FixIrreducibleImpl(Function &F, LoopInfo &LI, DominatorTree &DT) {
- LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
- << F.getName() << "\n");
-
- bool Changed = false;
- SmallVector<Loop *, 8> WorkList;
-
- LLVM_DEBUG(dbgs() << "visiting top-level\n");
- Changed |= makeReducible(LI, DT, &F);
-
- // Any SCCs reduced are now already in the list of top-level loops, so simply
- // add them all to the worklist.
+ LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
+ << F.getName() << "\n");
+
+ bool Changed = false;
+ SmallVector<Loop *, 8> WorkList;
+
+ LLVM_DEBUG(dbgs() << "visiting top-level\n");
+ Changed |= makeReducible(LI, DT, &F);
+
+ // Any SCCs reduced are now already in the list of top-level loops, so simply
+ // add them all to the worklist.
append_range(WorkList, LI);
-
- while (!WorkList.empty()) {
+
+ while (!WorkList.empty()) {
auto L = WorkList.pop_back_val();
- LLVM_DEBUG(dbgs() << "visiting loop with header "
- << L->getHeader()->getName() << "\n");
- Changed |= makeReducible(LI, DT, *L);
- // Any SCCs reduced are now already in the list of child loops, so simply
- // add them all to the worklist.
- WorkList.append(L->begin(), L->end());
- }
-
- return Changed;
-}
+ LLVM_DEBUG(dbgs() << "visiting loop with header "
+ << L->getHeader()->getName() << "\n");
+ Changed |= makeReducible(LI, DT, *L);
+ // Any SCCs reduced are now already in the list of child loops, so simply
+ // add them all to the worklist.
+ WorkList.append(L->begin(), L->end());
+ }
+
+ return Changed;
+}
bool FixIrreducible::runOnFunction(Function &F) {
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp
index a24f9f8fd5..0098dcaeb0 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp
@@ -1,545 +1,545 @@
-//===- FlattenCFG.cpp - Code to perform CFG flattening --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Reduce conditional branches in CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "flattencfg"
-
-namespace {
-
-class FlattenCFGOpt {
- AliasAnalysis *AA;
-
- /// Use parallel-and or parallel-or to generate conditions for
- /// conditional branches.
- bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);
-
- /// If \param BB is the merge block of an if-region, attempt to merge
- /// the if-region with an adjacent if-region upstream if two if-regions
- /// contain identical instructions.
- bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
-
- /// Compare a pair of blocks: \p Block1 and \p Block2, which
- /// are from two if-regions, where \p Head2 is the entry block of the 2nd
- /// if-region. \returns true if \p Block1 and \p Block2 contain identical
- /// instructions, and have no memory reference alias with \p Head2.
- /// This is used as a legality check for merging if-regions.
- bool CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
- BasicBlock *Head2);
-
-public:
- FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {}
-
- bool run(BasicBlock *BB);
-};
-
-} // end anonymous namespace
-
-/// If \param [in] BB has more than one predecessor that is a conditional
-/// branch, attempt to use parallel and/or for the branch condition. \returns
-/// true on success.
-///
-/// Before:
-/// ......
-/// %cmp10 = fcmp une float %tmp1, %tmp2
-/// br i1 %cmp10, label %if.then, label %lor.rhs
-///
-/// lor.rhs:
-/// ......
-/// %cmp11 = fcmp une float %tmp3, %tmp4
-/// br i1 %cmp11, label %if.then, label %ifend
-///
-/// if.end: // the merge block
-/// ......
-///
-/// if.then: // has two predecessors, both of them contains conditional branch.
-/// ......
-/// br label %if.end;
-///
-/// After:
-/// ......
-/// %cmp10 = fcmp une float %tmp1, %tmp2
-/// ......
-/// %cmp11 = fcmp une float %tmp3, %tmp4
-/// %cmp12 = or i1 %cmp10, %cmp11 // parallel-or mode.
-/// br i1 %cmp12, label %if.then, label %ifend
-///
-/// if.end:
-/// ......
-///
-/// if.then:
-/// ......
-/// br label %if.end;
-///
-/// Current implementation handles two cases.
-/// Case 1: BB is on the else-path.
-///
-/// BB1
-/// / |
-/// BB2 |
-/// / \ |
-/// BB3 \ | where, BB1, BB2 contain conditional branches.
-/// \ | / BB3 contains unconditional branch.
-/// \ | / BB4 corresponds to BB which is also the merge.
-/// BB => BB4
-///
-///
-/// Corresponding source code:
-///
-/// if (a == b && c == d)
-/// statement; // BB3
-///
-/// Case 2: BB is on the then-path.
-///
-/// BB1
-/// / |
-/// | BB2
-/// \ / | where BB1, BB2 contain conditional branches.
-/// BB => BB3 | BB3 contains unconditiona branch and corresponds
-/// \ / to BB. BB4 is the merge.
-/// BB4
-///
-/// Corresponding source code:
-///
-/// if (a == b || c == d)
-/// statement; // BB3
-///
-/// In both cases, BB is the common successor of conditional branches.
-/// In Case 1, BB (BB4) has an unconditional branch (BB3) as
-/// its predecessor. In Case 2, BB (BB3) only has conditional branches
-/// as its predecessors.
-bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
- PHINode *PHI = dyn_cast<PHINode>(BB->begin());
- if (PHI)
- return false; // For simplicity, avoid cases containing PHI nodes.
-
- BasicBlock *LastCondBlock = nullptr;
- BasicBlock *FirstCondBlock = nullptr;
- BasicBlock *UnCondBlock = nullptr;
- int Idx = -1;
-
- // Check predecessors of \param BB.
- SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
- for (SmallPtrSetIterator<BasicBlock *> PI = Preds.begin(), PE = Preds.end();
- PI != PE; ++PI) {
- BasicBlock *Pred = *PI;
- BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator());
-
- // All predecessors should terminate with a branch.
- if (!PBI)
- return false;
-
- BasicBlock *PP = Pred->getSinglePredecessor();
-
- if (PBI->isUnconditional()) {
-      // Case 1: Pred (BB3) is an unconditional block; it should
-      // have a single predecessor (BB2) that is also a predecessor
-      // of \param BB (BB4) and should not have its address taken.
- // There should exist only one such unconditional
- // branch among the predecessors.
- if (UnCondBlock || !PP || (Preds.count(PP) == 0) ||
- Pred->hasAddressTaken())
- return false;
-
- UnCondBlock = Pred;
- continue;
- }
-
- // Only conditional branches are allowed beyond this point.
- assert(PBI->isConditional());
-
- // Condition's unique use should be the branch instruction.
- Value *PC = PBI->getCondition();
- if (!PC || !PC->hasOneUse())
- return false;
-
- if (PP && Preds.count(PP)) {
- // These are internal condition blocks to be merged from, e.g.,
- // BB2 in both cases.
- // Should not be address-taken.
- if (Pred->hasAddressTaken())
- return false;
-
- // Instructions in the internal condition blocks should be safe
- // to hoist up.
- for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator();
- BI != BE;) {
- Instruction *CI = &*BI++;
- if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI))
- return false;
- }
- } else {
- // This is the condition block to be merged into, e.g. BB1 in
- // both cases.
- if (FirstCondBlock)
- return false;
- FirstCondBlock = Pred;
- }
-
- // Find whether BB is uniformly on the true (or false) path
- // for all of its predecessors.
- BasicBlock *PS1 = PBI->getSuccessor(0);
- BasicBlock *PS2 = PBI->getSuccessor(1);
- BasicBlock *PS = (PS1 == BB) ? PS2 : PS1;
- int CIdx = (PS1 == BB) ? 0 : 1;
-
- if (Idx == -1)
- Idx = CIdx;
- else if (CIdx != Idx)
- return false;
-
- // PS is the successor which is not BB. Check successors to identify
- // the last conditional branch.
- if (Preds.count(PS) == 0) {
- // Case 2.
- LastCondBlock = Pred;
- } else {
- // Case 1
- BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator());
- if (BPS && BPS->isUnconditional()) {
- // Case 1: PS(BB3) should be an unconditional branch.
- LastCondBlock = Pred;
- }
- }
- }
-
- if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
- return false;
-
- Instruction *TBB = LastCondBlock->getTerminator();
- BasicBlock *PS1 = TBB->getSuccessor(0);
- BasicBlock *PS2 = TBB->getSuccessor(1);
- BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
- BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator());
-
- // If PS1 does not jump into PS2, but PS2 jumps into PS1,
- // attempt branch inversion.
- if (!PBI1 || !PBI1->isUnconditional() ||
- (PS1->getTerminator()->getSuccessor(0) != PS2)) {
- // Check whether PS2 jumps into PS1.
- if (!PBI2 || !PBI2->isUnconditional() ||
- (PS2->getTerminator()->getSuccessor(0) != PS1))
- return false;
-
- // Do branch inversion.
- BasicBlock *CurrBlock = LastCondBlock;
- bool EverChanged = false;
- for (; CurrBlock != FirstCondBlock;
- CurrBlock = CurrBlock->getSinglePredecessor()) {
- auto *BI = cast<BranchInst>(CurrBlock->getTerminator());
- auto *CI = dyn_cast<CmpInst>(BI->getCondition());
- if (!CI)
- continue;
-
- CmpInst::Predicate Predicate = CI->getPredicate();
- // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq
- if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) {
- CI->setPredicate(ICmpInst::getInversePredicate(Predicate));
- BI->swapSuccessors();
- EverChanged = true;
- }
- }
- return EverChanged;
- }
-
-  // PS1 must end in an unconditional branch.
- if (!PBI1 || !PBI1->isUnconditional())
- return false;
-
-  // PS2 should not contain a PHI node.
- PHI = dyn_cast<PHINode>(PS2->begin());
- if (PHI)
- return false;
-
- // Do the transformation.
- BasicBlock *CB;
- BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
- bool Iteration = true;
- IRBuilder<>::InsertPointGuard Guard(Builder);
- Value *PC = PBI->getCondition();
-
- do {
- CB = PBI->getSuccessor(1 - Idx);
- // Delete the conditional branch.
- FirstCondBlock->getInstList().pop_back();
- FirstCondBlock->getInstList()
- .splice(FirstCondBlock->end(), CB->getInstList());
- PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
- Value *CC = PBI->getCondition();
- // Merge conditions.
- Builder.SetInsertPoint(PBI);
- Value *NC;
- if (Idx == 0)
- // Case 2, use parallel or.
- NC = Builder.CreateOr(PC, CC);
- else
- // Case 1, use parallel and.
- NC = Builder.CreateAnd(PC, CC);
-
- PBI->replaceUsesOfWith(CC, NC);
- PC = NC;
- if (CB == LastCondBlock)
- Iteration = false;
- // Remove internal conditional branches.
- CB->dropAllReferences();
-    // Make CB unreachable and let downstream delete the block.
- new UnreachableInst(CB->getContext(), CB);
- } while (Iteration);
-
- LLVM_DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
- return true;
-}
-
-/// Compare blocks from two if-regions, where \param Head2 is the entry of the
-/// 2nd if-region. \param Block1 is a block in the 1st if-region to compare.
-/// \param Block2 is a block in the 2nd if-region to compare. \returns true if
-/// Block1 and Block2 have identical instructions and do not have
-/// memory reference alias with Head2.
-bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
- BasicBlock *Head2) {
- Instruction *PTI2 = Head2->getTerminator();
- Instruction *PBI2 = &Head2->front();
-
- // Check whether instructions in Block1 and Block2 are identical
- // and do not alias with instructions in Head2.
- BasicBlock::iterator iter1 = Block1->begin();
- BasicBlock::iterator end1 = Block1->getTerminator()->getIterator();
- BasicBlock::iterator iter2 = Block2->begin();
- BasicBlock::iterator end2 = Block2->getTerminator()->getIterator();
-
- while (true) {
- if (iter1 == end1) {
- if (iter2 != end2)
- return false;
- break;
- }
-
- if (!iter1->isIdenticalTo(&*iter2))
- return false;
-
- // Illegal to remove instructions with side effects except
- // non-volatile stores.
- if (iter1->mayHaveSideEffects()) {
- Instruction *CurI = &*iter1;
- StoreInst *SI = dyn_cast<StoreInst>(CurI);
- if (!SI || SI->isVolatile())
- return false;
- }
-
-    // For simplicity and speed, the data-dependency check can be
-    // skipped when the instruction does not read from memory.
- if (iter1->mayReadFromMemory())
- return false;
-
- if (iter1->mayWriteToMemory()) {
- for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
- if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) {
- // Check alias with Head2.
- if (!AA || AA->alias(&*iter1, &*BI))
- return false;
- }
- }
- }
- ++iter1;
- ++iter2;
- }
-
- return true;
-}
-
-/// Check whether \param BB is the merge block of an if-region. If yes, check
-/// whether there exists an adjacent if-region upstream, the two if-regions
-/// contain identical instructions and can be legally merged. \returns true if
-/// the two if-regions are merged.
-///
-/// From:
-/// if (a)
-/// statement;
-/// if (b)
-/// statement;
-///
-/// To:
-/// if (a || b)
-/// statement;
-///
-///
-/// And from:
-/// if (a)
-/// ;
-/// else
-/// statement;
-/// if (b)
-/// ;
-/// else
-/// statement;
-///
-/// To:
-/// if (a && b)
-/// ;
-/// else
-/// statement;
-///
-/// We always take the form of the first if-region. This means that if the
-/// statement in the first if-region is in the "then-path", while in the second
-/// if-region it is in the "else-path", then we convert the second to the first
-/// form, by inverting the condition and the branch successors. The same
-/// approach goes for the opposite case.
-bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
- BasicBlock *IfTrue2, *IfFalse2;
- Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
- Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2);
- if (!CInst2)
- return false;
-
- BasicBlock *SecondEntryBlock = CInst2->getParent();
- if (SecondEntryBlock->hasAddressTaken())
- return false;
-
- BasicBlock *IfTrue1, *IfFalse1;
- Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
- Instruction *CInst1 = dyn_cast_or_null<Instruction>(IfCond1);
- if (!CInst1)
- return false;
-
- BasicBlock *FirstEntryBlock = CInst1->getParent();
-
- // Either then-path or else-path should be empty.
- bool InvertCond2 = false;
- BinaryOperator::BinaryOps CombineOp;
- if (IfFalse1 == FirstEntryBlock) {
- // The else-path is empty, so we must use "or" operation to combine the
- // conditions.
- CombineOp = BinaryOperator::Or;
- if (IfFalse2 != SecondEntryBlock) {
- if (IfTrue2 != SecondEntryBlock)
- return false;
-
- InvertCond2 = true;
- std::swap(IfTrue2, IfFalse2);
- }
-
- if (!CompareIfRegionBlock(IfTrue1, IfTrue2, SecondEntryBlock))
- return false;
- } else if (IfTrue1 == FirstEntryBlock) {
- // The then-path is empty, so we must use "and" operation to combine the
- // conditions.
- CombineOp = BinaryOperator::And;
- if (IfTrue2 != SecondEntryBlock) {
- if (IfFalse2 != SecondEntryBlock)
- return false;
-
- InvertCond2 = true;
- std::swap(IfTrue2, IfFalse2);
- }
-
- if (!CompareIfRegionBlock(IfFalse1, IfFalse2, SecondEntryBlock))
- return false;
- } else
- return false;
-
- Instruction *PTI2 = SecondEntryBlock->getTerminator();
- Instruction *PBI2 = &SecondEntryBlock->front();
-
-  // Check that \param SecondEntryBlock is free of side effects and that its
-  // instructions are safe to speculate.
- for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
- Instruction *CI = &*BI;
- if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||
- !isSafeToSpeculativelyExecute(CI))
- return false;
- }
-
- // Merge \param SecondEntryBlock into \param FirstEntryBlock.
- FirstEntryBlock->getInstList().pop_back();
- FirstEntryBlock->getInstList()
- .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
- BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator());
- assert(PBI->getCondition() == IfCond2);
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
- Builder.SetInsertPoint(PBI);
- if (InvertCond2) {
- // If this is a "cmp" instruction, only used for branching (and nowhere
- // else), then we can simply invert the predicate.
- auto Cmp2 = dyn_cast<CmpInst>(CInst2);
- if (Cmp2 && Cmp2->hasOneUse())
- Cmp2->setPredicate(Cmp2->getInversePredicate());
- else
- CInst2 = cast<Instruction>(Builder.CreateNot(CInst2));
- PBI->swapSuccessors();
- }
- Value *NC = Builder.CreateBinOp(CombineOp, CInst1, CInst2);
- PBI->replaceUsesOfWith(IfCond2, NC);
- Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
-
- // Handle PHI node to replace its predecessors to FirstEntryBlock.
- for (BasicBlock *Succ : successors(PBI)) {
- for (PHINode &Phi : Succ->phis()) {
- for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) {
- if (Phi.getIncomingBlock(i) == SecondEntryBlock)
- Phi.setIncomingBlock(i, FirstEntryBlock);
- }
- }
- }
-
- // Remove IfTrue1
- if (IfTrue1 != FirstEntryBlock) {
- IfTrue1->dropAllReferences();
- IfTrue1->eraseFromParent();
- }
-
- // Remove IfFalse1
- if (IfFalse1 != FirstEntryBlock) {
- IfFalse1->dropAllReferences();
- IfFalse1->eraseFromParent();
- }
-
- // Remove \param SecondEntryBlock
- SecondEntryBlock->dropAllReferences();
- SecondEntryBlock->eraseFromParent();
- LLVM_DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
- return true;
-}
-
-bool FlattenCFGOpt::run(BasicBlock *BB) {
- assert(BB && BB->getParent() && "Block not embedded in function!");
- assert(BB->getTerminator() && "Degenerate basic block encountered!");
-
- IRBuilder<> Builder(BB);
-
- if (FlattenParallelAndOr(BB, Builder) || MergeIfRegion(BB, Builder))
- return true;
- return false;
-}
-
-/// FlattenCFG - This function is used to flatten a CFG. For
-/// example, it uses parallel-and and parallel-or mode to collapse
-/// if-conditions and merge if-regions with identical statements.
-bool llvm::FlattenCFG(BasicBlock *BB, AAResults *AA) {
- return FlattenCFGOpt(AA).run(BB);
-}
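As a usage illustration for the FlattenCFG entry point just above, the driver below is a hypothetical sketch (the in-tree FlattenCFG pass under Transforms/Scalar owns the real iteration strategy): it rescans the function from the top after every successful flattening, because MergeIfRegion can erase blocks while FlattenParallelAndOr only strands them as unreachable.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Local.h"

// Sketch: keep applying FlattenCFG until no block changes. Restarting the
// block scan after each change avoids touching blocks that were just erased.
static bool flattenAllBlocks(llvm::Function &F, llvm::AAResults *AA) {
  bool EverChanged = false;
  bool LocalChange = true;
  while (LocalChange) {
    LocalChange = false;
    for (llvm::BasicBlock &BB : F) {
      if (llvm::FlattenCFG(&BB, AA)) {
        LocalChange = true;
        EverChanged = true;
        break; // the CFG changed; rescan from the entry block
      }
    }
  }
  return EverChanged;
}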
+//===- FlattenCFG.cpp - Code to perform CFG flattening --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Reduce conditional branches in CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+
+class FlattenCFGOpt {
+ AliasAnalysis *AA;
+
+ /// Use parallel-and or parallel-or to generate conditions for
+ /// conditional branches.
+ bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);
+
+ /// If \param BB is the merge block of an if-region, attempt to merge
+ /// the if-region with an adjacent if-region upstream if two if-regions
+ /// contain identical instructions.
+ bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
+
+ /// Compare a pair of blocks: \p Block1 and \p Block2, which
+ /// are from two if-regions, where \p Head2 is the entry block of the 2nd
+ /// if-region. \returns true if \p Block1 and \p Block2 contain identical
+ /// instructions, and have no memory reference alias with \p Head2.
+ /// This is used as a legality check for merging if-regions.
+ bool CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
+ BasicBlock *Head2);
+
+public:
+ FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {}
+
+ bool run(BasicBlock *BB);
+};
+
+} // end anonymous namespace
+
+/// If \param [in] BB has more than one predecessor that is a conditional
+/// branch, attempt to use parallel and/or for the branch condition. \returns
+/// true on success.
+///
+/// Before:
+/// ......
+/// %cmp10 = fcmp une float %tmp1, %tmp2
+/// br i1 %cmp10, label %if.then, label %lor.rhs
+///
+/// lor.rhs:
+/// ......
+/// %cmp11 = fcmp une float %tmp3, %tmp4
+/// br i1 %cmp11, label %if.then, label %ifend
+///
+/// if.end: // the merge block
+/// ......
+///
+/// if.then: // has two predecessors, both of them contains conditional branch.
+/// ......
+/// br label %if.end;
+///
+/// After:
+/// ......
+/// %cmp10 = fcmp une float %tmp1, %tmp2
+/// ......
+/// %cmp11 = fcmp une float %tmp3, %tmp4
+/// %cmp12 = or i1 %cmp10, %cmp11 // parallel-or mode.
+/// br i1 %cmp12, label %if.then, label %ifend
+///
+/// if.end:
+/// ......
+///
+/// if.then:
+/// ......
+/// br label %if.end;
+///
+/// Current implementation handles two cases.
+/// Case 1: BB is on the else-path.
+///
+/// BB1
+/// / |
+/// BB2 |
+/// / \ |
+/// BB3 \ | where, BB1, BB2 contain conditional branches.
+/// \ | / BB3 contains unconditional branch.
+/// \ | / BB4 corresponds to BB which is also the merge.
+/// BB => BB4
+///
+///
+/// Corresponding source code:
+///
+/// if (a == b && c == d)
+/// statement; // BB3
+///
+/// Case 2: BB is on the then-path.
+///
+/// BB1
+/// / |
+/// | BB2
+/// \ / | where BB1, BB2 contain conditional branches.
+/// BB => BB3 | BB3 contains unconditiona branch and corresponds
+/// \ / to BB. BB4 is the merge.
+/// BB4
+///
+/// Corresponding source code:
+///
+/// if (a == b || c == d)
+/// statement; // BB3
+///
+/// In both cases, BB is the common successor of conditional branches.
+/// In Case 1, BB (BB4) has an unconditional branch (BB3) as
+/// its predecessor. In Case 2, BB (BB3) only has conditional branches
+/// as its predecessors.
+bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
+ PHINode *PHI = dyn_cast<PHINode>(BB->begin());
+ if (PHI)
+ return false; // For simplicity, avoid cases containing PHI nodes.
+
+ BasicBlock *LastCondBlock = nullptr;
+ BasicBlock *FirstCondBlock = nullptr;
+ BasicBlock *UnCondBlock = nullptr;
+ int Idx = -1;
+
+ // Check predecessors of \param BB.
+ SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
+ for (SmallPtrSetIterator<BasicBlock *> PI = Preds.begin(), PE = Preds.end();
+ PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator());
+
+ // All predecessors should terminate with a branch.
+ if (!PBI)
+ return false;
+
+ BasicBlock *PP = Pred->getSinglePredecessor();
+
+ if (PBI->isUnconditional()) {
+      // Case 1: Pred (BB3) is an unconditional block; it should
+      // have a single predecessor (BB2) that is also a predecessor
+      // of \param BB (BB4) and should not have its address taken.
+ // There should exist only one such unconditional
+ // branch among the predecessors.
+ if (UnCondBlock || !PP || (Preds.count(PP) == 0) ||
+ Pred->hasAddressTaken())
+ return false;
+
+ UnCondBlock = Pred;
+ continue;
+ }
+
+ // Only conditional branches are allowed beyond this point.
+ assert(PBI->isConditional());
+
+ // Condition's unique use should be the branch instruction.
+ Value *PC = PBI->getCondition();
+ if (!PC || !PC->hasOneUse())
+ return false;
+
+ if (PP && Preds.count(PP)) {
+ // These are internal condition blocks to be merged from, e.g.,
+ // BB2 in both cases.
+ // Should not be address-taken.
+ if (Pred->hasAddressTaken())
+ return false;
+
+ // Instructions in the internal condition blocks should be safe
+ // to hoist up.
+ for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator();
+ BI != BE;) {
+ Instruction *CI = &*BI++;
+ if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI))
+ return false;
+ }
+ } else {
+ // This is the condition block to be merged into, e.g. BB1 in
+ // both cases.
+ if (FirstCondBlock)
+ return false;
+ FirstCondBlock = Pred;
+ }
+
+ // Find whether BB is uniformly on the true (or false) path
+ // for all of its predecessors.
+ BasicBlock *PS1 = PBI->getSuccessor(0);
+ BasicBlock *PS2 = PBI->getSuccessor(1);
+ BasicBlock *PS = (PS1 == BB) ? PS2 : PS1;
+ int CIdx = (PS1 == BB) ? 0 : 1;
+
+ if (Idx == -1)
+ Idx = CIdx;
+ else if (CIdx != Idx)
+ return false;
+
+ // PS is the successor which is not BB. Check successors to identify
+ // the last conditional branch.
+ if (Preds.count(PS) == 0) {
+ // Case 2.
+ LastCondBlock = Pred;
+ } else {
+ // Case 1
+ BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator());
+ if (BPS && BPS->isUnconditional()) {
+ // Case 1: PS(BB3) should be an unconditional branch.
+ LastCondBlock = Pred;
+ }
+ }
+ }
+
+ if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
+ return false;
+
+ Instruction *TBB = LastCondBlock->getTerminator();
+ BasicBlock *PS1 = TBB->getSuccessor(0);
+ BasicBlock *PS2 = TBB->getSuccessor(1);
+ BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
+ BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator());
+
+ // If PS1 does not jump into PS2, but PS2 jumps into PS1,
+ // attempt branch inversion.
+ if (!PBI1 || !PBI1->isUnconditional() ||
+ (PS1->getTerminator()->getSuccessor(0) != PS2)) {
+ // Check whether PS2 jumps into PS1.
+ if (!PBI2 || !PBI2->isUnconditional() ||
+ (PS2->getTerminator()->getSuccessor(0) != PS1))
+ return false;
+
+ // Do branch inversion.
+ BasicBlock *CurrBlock = LastCondBlock;
+ bool EverChanged = false;
+ for (; CurrBlock != FirstCondBlock;
+ CurrBlock = CurrBlock->getSinglePredecessor()) {
+ auto *BI = cast<BranchInst>(CurrBlock->getTerminator());
+ auto *CI = dyn_cast<CmpInst>(BI->getCondition());
+ if (!CI)
+ continue;
+
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq
+ if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) {
+ CI->setPredicate(ICmpInst::getInversePredicate(Predicate));
+ BI->swapSuccessors();
+ EverChanged = true;
+ }
+ }
+ return EverChanged;
+ }
+
+  // PS1 must end in an unconditional branch.
+ if (!PBI1 || !PBI1->isUnconditional())
+ return false;
+
+  // PS2 should not contain a PHI node.
+ PHI = dyn_cast<PHINode>(PS2->begin());
+ if (PHI)
+ return false;
+
+ // Do the transformation.
+ BasicBlock *CB;
+ BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+ bool Iteration = true;
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Value *PC = PBI->getCondition();
+
+ do {
+ CB = PBI->getSuccessor(1 - Idx);
+ // Delete the conditional branch.
+ FirstCondBlock->getInstList().pop_back();
+ FirstCondBlock->getInstList()
+ .splice(FirstCondBlock->end(), CB->getInstList());
+ PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+ Value *CC = PBI->getCondition();
+ // Merge conditions.
+ Builder.SetInsertPoint(PBI);
+ Value *NC;
+ if (Idx == 0)
+ // Case 2, use parallel or.
+ NC = Builder.CreateOr(PC, CC);
+ else
+ // Case 1, use parallel and.
+ NC = Builder.CreateAnd(PC, CC);
+
+ PBI->replaceUsesOfWith(CC, NC);
+ PC = NC;
+ if (CB == LastCondBlock)
+ Iteration = false;
+ // Remove internal conditional branches.
+ CB->dropAllReferences();
+    // Make CB unreachable and let downstream delete the block.
+ new UnreachableInst(CB->getContext(), CB);
+ } while (Iteration);
+
+ LLVM_DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
+ return true;
+}
+
+/// Compare blocks from two if-regions, where \param Head2 is the entry of the
+/// 2nd if-region. \param Block1 is a block in the 1st if-region to compare.
+/// \param Block2 is a block in the 2nd if-region to compare. \returns true if
+/// Block1 and Block2 have identical instructions and do not have
+/// memory reference alias with Head2.
+bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
+ BasicBlock *Head2) {
+ Instruction *PTI2 = Head2->getTerminator();
+ Instruction *PBI2 = &Head2->front();
+
+ // Check whether instructions in Block1 and Block2 are identical
+ // and do not alias with instructions in Head2.
+ BasicBlock::iterator iter1 = Block1->begin();
+ BasicBlock::iterator end1 = Block1->getTerminator()->getIterator();
+ BasicBlock::iterator iter2 = Block2->begin();
+ BasicBlock::iterator end2 = Block2->getTerminator()->getIterator();
+
+ while (true) {
+ if (iter1 == end1) {
+ if (iter2 != end2)
+ return false;
+ break;
+ }
+
+ if (!iter1->isIdenticalTo(&*iter2))
+ return false;
+
+ // Illegal to remove instructions with side effects except
+ // non-volatile stores.
+ if (iter1->mayHaveSideEffects()) {
+ Instruction *CurI = &*iter1;
+ StoreInst *SI = dyn_cast<StoreInst>(CurI);
+ if (!SI || SI->isVolatile())
+ return false;
+ }
+
+ // For simplicity and speed, the data dependency check can be
+ // skipped when the block contains no reads from memory.
+ if (iter1->mayReadFromMemory())
+ return false;
+
+ if (iter1->mayWriteToMemory()) {
+ for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
+ if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) {
+ // Check alias with Head2.
+ if (!AA || AA->alias(&*iter1, &*BI))
+ return false;
+ }
+ }
+ }
+ ++iter1;
+ ++iter2;
+ }
+
+ return true;
+}
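+
+// As a hypothetical example of the rules above: two candidate blocks that each
+// consist of the same non-volatile store, say `store i32 1, i32* @g`, compare
+// equal here, provided alias analysis is available and the second region's
+// entry block contains no memory access that may alias @g. A load anywhere in
+// Block1, or a volatile store, causes the comparison to fail.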
+
+/// Check whether \param BB is the merge block of an if-region. If yes, check
+/// whether there exists an adjacent if-region upstream such that the two
+/// if-regions contain identical instructions and can be legally merged.
+/// \returns true if the two if-regions are merged.
+///
+/// From:
+/// if (a)
+/// statement;
+/// if (b)
+/// statement;
+///
+/// To:
+/// if (a || b)
+/// statement;
+///
+///
+/// And from:
+/// if (a)
+/// ;
+/// else
+/// statement;
+/// if (b)
+/// ;
+/// else
+/// statement;
+///
+/// To:
+/// if (a && b)
+/// ;
+/// else
+/// statement;
+///
+/// We always take the form of the first if-region. This means that if the
+/// statement in the first if-region is in the "then-path" while in the second
+/// if-region it is in the "else-path", then we convert the second to the first
+/// form by inverting the condition and the branch successors. The same
+/// approach goes for the opposite case.
+bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
+ BasicBlock *IfTrue2, *IfFalse2;
+ Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
+ Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2);
+ if (!CInst2)
+ return false;
+
+ BasicBlock *SecondEntryBlock = CInst2->getParent();
+ if (SecondEntryBlock->hasAddressTaken())
+ return false;
+
+ BasicBlock *IfTrue1, *IfFalse1;
+ Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
+ Instruction *CInst1 = dyn_cast_or_null<Instruction>(IfCond1);
+ if (!CInst1)
+ return false;
+
+ BasicBlock *FirstEntryBlock = CInst1->getParent();
+
+ // Either then-path or else-path should be empty.
+ bool InvertCond2 = false;
+ BinaryOperator::BinaryOps CombineOp;
+ if (IfFalse1 == FirstEntryBlock) {
+ // The else-path is empty, so we must use "or" operation to combine the
+ // conditions.
+ CombineOp = BinaryOperator::Or;
+ if (IfFalse2 != SecondEntryBlock) {
+ if (IfTrue2 != SecondEntryBlock)
+ return false;
+
+ InvertCond2 = true;
+ std::swap(IfTrue2, IfFalse2);
+ }
+
+ if (!CompareIfRegionBlock(IfTrue1, IfTrue2, SecondEntryBlock))
+ return false;
+ } else if (IfTrue1 == FirstEntryBlock) {
+ // The then-path is empty, so we must use "and" operation to combine the
+ // conditions.
+ CombineOp = BinaryOperator::And;
+ if (IfTrue2 != SecondEntryBlock) {
+ if (IfFalse2 != SecondEntryBlock)
+ return false;
+
+ InvertCond2 = true;
+ std::swap(IfTrue2, IfFalse2);
+ }
+
+ if (!CompareIfRegionBlock(IfFalse1, IfFalse2, SecondEntryBlock))
+ return false;
+ } else
+ return false;
+
+ Instruction *PTI2 = SecondEntryBlock->getTerminator();
+ Instruction *PBI2 = &SecondEntryBlock->front();
+
+ // Check that \param SecondEntryBlock has no side effects and is safe to
+ // speculatively execute.
+ for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
+ Instruction *CI = &*BI;
+ if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||
+ !isSafeToSpeculativelyExecute(CI))
+ return false;
+ }
+
+ // Merge \param SecondEntryBlock into \param FirstEntryBlock.
+ FirstEntryBlock->getInstList().pop_back();
+ FirstEntryBlock->getInstList()
+ .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
+ BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator());
+ assert(PBI->getCondition() == IfCond2);
+ BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+ BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ Builder.SetInsertPoint(PBI);
+ if (InvertCond2) {
+ // If this is a "cmp" instruction, only used for branching (and nowhere
+ // else), then we can simply invert the predicate.
+ auto Cmp2 = dyn_cast<CmpInst>(CInst2);
+ if (Cmp2 && Cmp2->hasOneUse())
+ Cmp2->setPredicate(Cmp2->getInversePredicate());
+ else
+ CInst2 = cast<Instruction>(Builder.CreateNot(CInst2));
+ PBI->swapSuccessors();
+ }
+ Value *NC = Builder.CreateBinOp(CombineOp, CInst1, CInst2);
+ PBI->replaceUsesOfWith(IfCond2, NC);
+ Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
+
+ // Update PHI nodes in the successors, replacing the incoming block
+ // SecondEntryBlock with FirstEntryBlock.
+ for (BasicBlock *Succ : successors(PBI)) {
+ for (PHINode &Phi : Succ->phis()) {
+ for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) {
+ if (Phi.getIncomingBlock(i) == SecondEntryBlock)
+ Phi.setIncomingBlock(i, FirstEntryBlock);
+ }
+ }
+ }
+
+ // Remove IfTrue1
+ if (IfTrue1 != FirstEntryBlock) {
+ IfTrue1->dropAllReferences();
+ IfTrue1->eraseFromParent();
+ }
+
+ // Remove IfFalse1
+ if (IfFalse1 != FirstEntryBlock) {
+ IfFalse1->dropAllReferences();
+ IfFalse1->eraseFromParent();
+ }
+
+ // Remove \param SecondEntryBlock
+ SecondEntryBlock->dropAllReferences();
+ SecondEntryBlock->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
+ return true;
+}
+
+bool FlattenCFGOpt::run(BasicBlock *BB) {
+ assert(BB && BB->getParent() && "Block not embedded in function!");
+ assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+ IRBuilder<> Builder(BB);
+
+ if (FlattenParallelAndOr(BB, Builder) || MergeIfRegion(BB, Builder))
+ return true;
+ return false;
+}
+
+/// FlattenCFG - Flatten a CFG. For example, it uses parallel-and and
+/// parallel-or patterns to collapse if-conditions and merges if-regions with
+/// identical statements.
+bool llvm::FlattenCFG(BasicBlock *BB, AAResults *AA) {
+ return FlattenCFGOpt(AA).run(BB);
+}
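+
+// A minimal driver sketch (illustrative only: it assumes a Function &F and an
+// AAResults *AA are in scope, and it is not part of this file). Restarting
+// after every change is the conservative choice, since a successful flatten
+// may erase neighbouring blocks:
+//
+//   bool Changed = true;
+//   while (Changed) {
+//     Changed = false;
+//     for (BasicBlock &BB : F)
+//       if (llvm::FlattenCFG(&BB, AA)) {
+//         Changed = true;
+//         break;
+//       }
+//   }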
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp
index df90e972b1..2696557a71 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp
@@ -1,499 +1,499 @@
-//===- FunctionComparator.cpp - Function Comparator -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the FunctionComparator and GlobalNumberState classes
-// which are used by the MergeFunctions pass for comparing functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/FunctionComparator.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "functioncomparator"
-
-int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
- if (L < R)
- return -1;
- if (L > R)
- return 1;
- return 0;
-}
-
-int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
- if ((int)L < (int)R)
- return -1;
- if ((int)L > (int)R)
- return 1;
- return 0;
-}
-
-int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
- if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
- return Res;
- if (L.ugt(R))
- return 1;
- if (R.ugt(L))
- return -1;
- return 0;
-}
-
-int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
- // Floats are ordered first by semantics (i.e. float, double, half, etc.),
- // then by value interpreted as a bitstring (aka APInt).
- const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics();
- if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL),
- APFloat::semanticsPrecision(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL),
- APFloat::semanticsMaxExponent(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL),
- APFloat::semanticsMinExponent(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL),
- APFloat::semanticsSizeInBits(SR)))
- return Res;
- return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
-}
-
-int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
- // Prevent heavy comparison, compare sizes first.
- if (int Res = cmpNumbers(L.size(), R.size()))
- return Res;
-
- // Compare strings lexicographically only when it is necessary: only when
- // strings are equal in size.
- return L.compare(R);
-}
-
-int FunctionComparator::cmpAttrs(const AttributeList L,
- const AttributeList R) const {
- if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets()))
- return Res;
-
- for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) {
- AttributeSet LAS = L.getAttributes(i);
- AttributeSet RAS = R.getAttributes(i);
- AttributeSet::iterator LI = LAS.begin(), LE = LAS.end();
- AttributeSet::iterator RI = RAS.begin(), RE = RAS.end();
- for (; LI != LE && RI != RE; ++LI, ++RI) {
- Attribute LA = *LI;
- Attribute RA = *RI;
- if (LA.isTypeAttribute() && RA.isTypeAttribute()) {
- if (LA.getKindAsEnum() != RA.getKindAsEnum())
- return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
-
- Type *TyL = LA.getValueAsType();
- Type *TyR = RA.getValueAsType();
+//===- FunctionComparator.cpp - Function Comparator -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionComparator and GlobalNumberState classes
+// which are used by the MergeFunctions pass for comparing functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "functioncomparator"
+
+int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
+ if (L < R)
+ return -1;
+ if (L > R)
+ return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
+ if ((int)L < (int)R)
+ return -1;
+ if ((int)L > (int)R)
+ return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
+ if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
+ return Res;
+ if (L.ugt(R))
+ return 1;
+ if (R.ugt(L))
+ return -1;
+ return 0;
+}
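+
+// Note that the ordering above is unsigned: for two hypothetical i32 constants,
+// the bit pattern of -1 (0xFFFFFFFF) compares greater than that of 1, so -1
+// orders after 1 here.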
+
+int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
+ // Floats are ordered first by semantics (i.e. float, double, half, etc.),
+ // then by value interpreted as a bitstring (aka APInt).
+ const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics();
+ if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL),
+ APFloat::semanticsPrecision(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL),
+ APFloat::semanticsMaxExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL),
+ APFloat::semanticsMinExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL),
+ APFloat::semanticsSizeInBits(SR)))
+ return Res;
+ return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
+}
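+
+// Under this ordering (illustrative), an APFloat with `float` semantics
+// compares less than one with `double` semantics, because the smaller
+// precision decides the first check, and two values with identical semantics
+// fall back to an unsigned comparison of their IEEE bit patterns via cmpAPInts.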
+
+int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
+ // Prevent heavy comparison, compare sizes first.
+ if (int Res = cmpNumbers(L.size(), R.size()))
+ return Res;
+
+ // Compare strings lexicographically only when it is necessary: only when
+ // strings are equal in size.
+ return L.compare(R);
+}
+
+int FunctionComparator::cmpAttrs(const AttributeList L,
+ const AttributeList R) const {
+ if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets()))
+ return Res;
+
+ for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) {
+ AttributeSet LAS = L.getAttributes(i);
+ AttributeSet RAS = R.getAttributes(i);
+ AttributeSet::iterator LI = LAS.begin(), LE = LAS.end();
+ AttributeSet::iterator RI = RAS.begin(), RE = RAS.end();
+ for (; LI != LE && RI != RE; ++LI, ++RI) {
+ Attribute LA = *LI;
+ Attribute RA = *RI;
+ if (LA.isTypeAttribute() && RA.isTypeAttribute()) {
+ if (LA.getKindAsEnum() != RA.getKindAsEnum())
+ return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
+
+ Type *TyL = LA.getValueAsType();
+ Type *TyR = RA.getValueAsType();
if (TyL && TyR) {
if (int Res = cmpTypes(TyL, TyR))
return Res;
continue;
}
-
- // Two pointers, at least one null, so the comparison result is
- // independent of the value of a real pointer.
+
+ // Two pointers, at least one null, so the comparison result is
+ // independent of the value of a real pointer.
if (int Res = cmpNumbers((uint64_t)TyL, (uint64_t)TyR))
return Res;
continue;
- }
- if (LA < RA)
- return -1;
- if (RA < LA)
- return 1;
- }
- if (LI != LE)
- return 1;
- if (RI != RE)
- return -1;
- }
- return 0;
-}
-
-int FunctionComparator::cmpRangeMetadata(const MDNode *L,
- const MDNode *R) const {
- if (L == R)
- return 0;
- if (!L)
- return -1;
- if (!R)
- return 1;
- // Range metadata is a sequence of numbers. Make sure they are the same
- // sequence.
- // TODO: Note that as this is metadata, it is possible to drop and/or merge
- // this data when considering functions to merge. Thus this comparison would
- // return 0 (i.e. equivalent), but merging would become more complicated
- // because the ranges would need to be unioned. It is not likely that
- // functions differ ONLY in this metadata if they are actually the same
- // function semantically.
- if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
- return Res;
- for (size_t I = 0; I < L->getNumOperands(); ++I) {
- ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
- ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
- if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
- return Res;
- }
- return 0;
-}
-
-int FunctionComparator::cmpOperandBundlesSchema(const CallBase &LCS,
- const CallBase &RCS) const {
- assert(LCS.getOpcode() == RCS.getOpcode() && "Can't compare otherwise!");
-
- if (int Res =
- cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
- return Res;
-
- for (unsigned I = 0, E = LCS.getNumOperandBundles(); I != E; ++I) {
- auto OBL = LCS.getOperandBundleAt(I);
- auto OBR = RCS.getOperandBundleAt(I);
-
- if (int Res = OBL.getTagName().compare(OBR.getTagName()))
- return Res;
-
- if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size()))
- return Res;
- }
-
- return 0;
-}
-
-/// Constants comparison:
-/// 1. Check whether type of L constant could be losslessly bitcasted to R
-/// type.
-/// 2. Compare constant contents.
-/// For more details see declaration comments.
-int FunctionComparator::cmpConstants(const Constant *L,
- const Constant *R) const {
- Type *TyL = L->getType();
- Type *TyR = R->getType();
-
- // Check whether types are bitcastable. This part is just re-factored
- // Type::canLosslesslyBitCastTo method, but instead of returning true/false,
- // we also pack into result which type is "less" for us.
- int TypesRes = cmpTypes(TyL, TyR);
- if (TypesRes != 0) {
- // Types are different, but check whether we can bitcast them.
- if (!TyL->isFirstClassType()) {
- if (TyR->isFirstClassType())
- return -1;
- // Neither TyL nor TyR are values of first class type. Return the result
- // of comparing the types
- return TypesRes;
- }
- if (!TyR->isFirstClassType()) {
- if (TyL->isFirstClassType())
- return 1;
- return TypesRes;
- }
-
- // Vector -> Vector conversions are always lossless if the two vector types
- // have the same size, otherwise not.
- unsigned TyLWidth = 0;
- unsigned TyRWidth = 0;
-
- if (auto *VecTyL = dyn_cast<VectorType>(TyL))
- TyLWidth = VecTyL->getPrimitiveSizeInBits().getFixedSize();
- if (auto *VecTyR = dyn_cast<VectorType>(TyR))
- TyRWidth = VecTyR->getPrimitiveSizeInBits().getFixedSize();
-
- if (TyLWidth != TyRWidth)
- return cmpNumbers(TyLWidth, TyRWidth);
-
- // Zero bit-width means neither TyL nor TyR are vectors.
- if (!TyLWidth) {
- PointerType *PTyL = dyn_cast<PointerType>(TyL);
- PointerType *PTyR = dyn_cast<PointerType>(TyR);
- if (PTyL && PTyR) {
- unsigned AddrSpaceL = PTyL->getAddressSpace();
- unsigned AddrSpaceR = PTyR->getAddressSpace();
- if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
- return Res;
- }
- if (PTyL)
- return 1;
- if (PTyR)
- return -1;
-
- // TyL and TyR aren't vectors, nor pointers. We don't know how to
- // bitcast them.
- return TypesRes;
- }
- }
-
- // OK, types are bitcastable, now check constant contents.
-
- if (L->isNullValue() && R->isNullValue())
- return TypesRes;
- if (L->isNullValue() && !R->isNullValue())
- return 1;
- if (!L->isNullValue() && R->isNullValue())
- return -1;
-
- auto GlobalValueL = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(L));
- auto GlobalValueR = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(R));
- if (GlobalValueL && GlobalValueR) {
- return cmpGlobalValues(GlobalValueL, GlobalValueR);
- }
-
- if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
- return Res;
-
- if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
- const auto *SeqR = cast<ConstantDataSequential>(R);
- // This handles ConstantDataArray and ConstantDataVector. Note that we
- // compare the two raw data arrays, which might differ depending on the host
- // endianness. This isn't a problem though, because the endianness of a module
- // will affect the order of the constants, but this order is the same
- // for a given input module and host platform.
- return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
- }
-
- switch (L->getValueID()) {
- case Value::UndefValueVal:
+ }
+ if (LA < RA)
+ return -1;
+ if (RA < LA)
+ return 1;
+ }
+ if (LI != LE)
+ return 1;
+ if (RI != RE)
+ return -1;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpRangeMetadata(const MDNode *L,
+ const MDNode *R) const {
+ if (L == R)
+ return 0;
+ if (!L)
+ return -1;
+ if (!R)
+ return 1;
+ // Range metadata is a sequence of numbers. Make sure they are the same
+ // sequence.
+ // TODO: Note that as this is metadata, it is possible to drop and/or merge
+ // this data when considering functions to merge. Thus this comparison would
+ // return 0 (i.e. equivalent), but merging would become more complicated
+ // because the ranges would need to be unioned. It is not likely that
+ // functions differ ONLY in this metadata if they are actually the same
+ // function semantically.
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+ for (size_t I = 0; I < L->getNumOperands(); ++I) {
+ ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
+ ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
+ if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
+ return Res;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpOperandBundlesSchema(const CallBase &LCS,
+ const CallBase &RCS) const {
+ assert(LCS.getOpcode() == RCS.getOpcode() && "Can't compare otherwise!");
+
+ if (int Res =
+ cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
+ return Res;
+
+ for (unsigned I = 0, E = LCS.getNumOperandBundles(); I != E; ++I) {
+ auto OBL = LCS.getOperandBundleAt(I);
+ auto OBR = RCS.getOperandBundleAt(I);
+
+ if (int Res = OBL.getTagName().compare(OBR.getTagName()))
+ return Res;
+
+ if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size()))
+ return Res;
+ }
+
+ return 0;
+}
+
+/// Constants comparison:
+/// 1. Check whether type of L constant could be losslessly bitcasted to R
+/// type.
+/// 2. Compare constant contents.
+/// For more details see declaration comments.
+int FunctionComparator::cmpConstants(const Constant *L,
+ const Constant *R) const {
+ Type *TyL = L->getType();
+ Type *TyR = R->getType();
+
+ // Check whether types are bitcastable. This part is just re-factored
+ // Type::canLosslesslyBitCastTo method, but instead of returning true/false,
+ // we also pack into result which type is "less" for us.
+ int TypesRes = cmpTypes(TyL, TyR);
+ if (TypesRes != 0) {
+ // Types are different, but check whether we can bitcast them.
+ if (!TyL->isFirstClassType()) {
+ if (TyR->isFirstClassType())
+ return -1;
+ // Neither TyL nor TyR are values of first class type. Return the result
+ // of comparing the types
+ return TypesRes;
+ }
+ if (!TyR->isFirstClassType()) {
+ if (TyL->isFirstClassType())
+ return 1;
+ return TypesRes;
+ }
+
+ // Vector -> Vector conversions are always lossless if the two vector types
+ // have the same size, otherwise not.
+ unsigned TyLWidth = 0;
+ unsigned TyRWidth = 0;
+
+ if (auto *VecTyL = dyn_cast<VectorType>(TyL))
+ TyLWidth = VecTyL->getPrimitiveSizeInBits().getFixedSize();
+ if (auto *VecTyR = dyn_cast<VectorType>(TyR))
+ TyRWidth = VecTyR->getPrimitiveSizeInBits().getFixedSize();
+
+ if (TyLWidth != TyRWidth)
+ return cmpNumbers(TyLWidth, TyRWidth);
+
+ // Zero bit-width means neither TyL nor TyR are vectors.
+ if (!TyLWidth) {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+ if (PTyL && PTyR) {
+ unsigned AddrSpaceL = PTyL->getAddressSpace();
+ unsigned AddrSpaceR = PTyR->getAddressSpace();
+ if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
+ return Res;
+ }
+ if (PTyL)
+ return 1;
+ if (PTyR)
+ return -1;
+
+ // TyL and TyR aren't vectors, nor pointers. We don't know how to
+ // bitcast them.
+ return TypesRes;
+ }
+ }
+
+ // OK, types are bitcastable, now check constant contents.
+
+ if (L->isNullValue() && R->isNullValue())
+ return TypesRes;
+ if (L->isNullValue() && !R->isNullValue())
+ return 1;
+ if (!L->isNullValue() && R->isNullValue())
+ return -1;
+
+ auto GlobalValueL = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(L));
+ auto GlobalValueR = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(R));
+ if (GlobalValueL && GlobalValueR) {
+ return cmpGlobalValues(GlobalValueL, GlobalValueR);
+ }
+
+ if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
+ return Res;
+
+ if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
+ const auto *SeqR = cast<ConstantDataSequential>(R);
+ // This handles ConstantDataArray and ConstantDataVector. Note that we
+ // compare the two raw data arrays, which might differ depending on the host
+ // endianness. This isn't a problem though, because the endianness of a module
+ // will affect the order of the constants, but this order is the same
+ // for a given input module and host platform.
+ return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
+ }
+
+ switch (L->getValueID()) {
+ case Value::UndefValueVal:
case Value::PoisonValueVal:
- case Value::ConstantTokenNoneVal:
- return TypesRes;
- case Value::ConstantIntVal: {
- const APInt &LInt = cast<ConstantInt>(L)->getValue();
- const APInt &RInt = cast<ConstantInt>(R)->getValue();
- return cmpAPInts(LInt, RInt);
- }
- case Value::ConstantFPVal: {
- const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
- const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
- return cmpAPFloats(LAPF, RAPF);
- }
- case Value::ConstantArrayVal: {
- const ConstantArray *LA = cast<ConstantArray>(L);
- const ConstantArray *RA = cast<ConstantArray>(R);
- uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements();
- uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (uint64_t i = 0; i < NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)),
- cast<Constant>(RA->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantStructVal: {
- const ConstantStruct *LS = cast<ConstantStruct>(L);
- const ConstantStruct *RS = cast<ConstantStruct>(R);
- unsigned NumElementsL = cast<StructType>(TyL)->getNumElements();
- unsigned NumElementsR = cast<StructType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (unsigned i = 0; i != NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)),
- cast<Constant>(RS->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantVectorVal: {
- const ConstantVector *LV = cast<ConstantVector>(L);
- const ConstantVector *RV = cast<ConstantVector>(R);
- unsigned NumElementsL = cast<FixedVectorType>(TyL)->getNumElements();
- unsigned NumElementsR = cast<FixedVectorType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (uint64_t i = 0; i < NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)),
- cast<Constant>(RV->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantExprVal: {
- const ConstantExpr *LE = cast<ConstantExpr>(L);
- const ConstantExpr *RE = cast<ConstantExpr>(R);
- unsigned NumOperandsL = LE->getNumOperands();
- unsigned NumOperandsR = RE->getNumOperands();
- if (int Res = cmpNumbers(NumOperandsL, NumOperandsR))
- return Res;
- for (unsigned i = 0; i < NumOperandsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)),
- cast<Constant>(RE->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::BlockAddressVal: {
- const BlockAddress *LBA = cast<BlockAddress>(L);
- const BlockAddress *RBA = cast<BlockAddress>(R);
- if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
- return Res;
- if (LBA->getFunction() == RBA->getFunction()) {
- // They are BBs in the same function. Order by which comes first in the
- // BB order of the function. This order is deterministic.
- Function *F = LBA->getFunction();
- BasicBlock *LBB = LBA->getBasicBlock();
- BasicBlock *RBB = RBA->getBasicBlock();
- if (LBB == RBB)
- return 0;
- for (BasicBlock &BB : F->getBasicBlockList()) {
- if (&BB == LBB) {
- assert(&BB != RBB);
- return -1;
- }
- if (&BB == RBB)
- return 1;
- }
- llvm_unreachable("Basic Block Address does not point to a basic block in "
- "its function.");
- return -1;
- } else {
- // cmpValues said the functions are the same. So because they aren't
- // literally the same pointer, they must respectively be the left and
- // right functions.
- assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
- // cmpValues will tell us if these are equivalent BasicBlocks, in the
- // context of their respective functions.
- return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
- }
- }
- default: // Unknown constant, abort.
- LLVM_DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
- llvm_unreachable("Constant ValueID not recognized.");
- return -1;
- }
-}
-
-int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
- uint64_t LNumber = GlobalNumbers->getNumber(L);
- uint64_t RNumber = GlobalNumbers->getNumber(R);
- return cmpNumbers(LNumber, RNumber);
-}
-
-/// cmpType - compares two types,
-/// defines total ordering among the types set.
-/// See method declaration comments for more details.
-int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
- PointerType *PTyL = dyn_cast<PointerType>(TyL);
- PointerType *PTyR = dyn_cast<PointerType>(TyR);
-
- const DataLayout &DL = FnL->getParent()->getDataLayout();
- if (PTyL && PTyL->getAddressSpace() == 0)
- TyL = DL.getIntPtrType(TyL);
- if (PTyR && PTyR->getAddressSpace() == 0)
- TyR = DL.getIntPtrType(TyR);
-
- if (TyL == TyR)
- return 0;
-
- if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID()))
- return Res;
-
- switch (TyL->getTypeID()) {
- default:
- llvm_unreachable("Unknown type!");
- case Type::IntegerTyID:
- return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
- cast<IntegerType>(TyR)->getBitWidth());
- // TyL == TyR would have returned true earlier, because types are uniqued.
- case Type::VoidTyID:
- case Type::FloatTyID:
- case Type::DoubleTyID:
- case Type::X86_FP80TyID:
- case Type::FP128TyID:
- case Type::PPC_FP128TyID:
- case Type::LabelTyID:
- case Type::MetadataTyID:
- case Type::TokenTyID:
- return 0;
-
- case Type::PointerTyID:
- assert(PTyL && PTyR && "Both types must be pointers here.");
- return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace());
-
- case Type::StructTyID: {
- StructType *STyL = cast<StructType>(TyL);
- StructType *STyR = cast<StructType>(TyR);
- if (STyL->getNumElements() != STyR->getNumElements())
- return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
-
- if (STyL->isPacked() != STyR->isPacked())
- return cmpNumbers(STyL->isPacked(), STyR->isPacked());
-
- for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
- if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
- return Res;
- }
- return 0;
- }
-
- case Type::FunctionTyID: {
- FunctionType *FTyL = cast<FunctionType>(TyL);
- FunctionType *FTyR = cast<FunctionType>(TyR);
- if (FTyL->getNumParams() != FTyR->getNumParams())
- return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams());
-
- if (FTyL->isVarArg() != FTyR->isVarArg())
- return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
-
- if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
- return Res;
-
- for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
- if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
- return Res;
- }
- return 0;
- }
-
- case Type::ArrayTyID: {
- auto *STyL = cast<ArrayType>(TyL);
- auto *STyR = cast<ArrayType>(TyR);
- if (STyL->getNumElements() != STyR->getNumElements())
- return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
- return cmpTypes(STyL->getElementType(), STyR->getElementType());
- }
- case Type::FixedVectorTyID:
- case Type::ScalableVectorTyID: {
- auto *STyL = cast<VectorType>(TyL);
- auto *STyR = cast<VectorType>(TyR);
+ case Value::ConstantTokenNoneVal:
+ return TypesRes;
+ case Value::ConstantIntVal: {
+ const APInt &LInt = cast<ConstantInt>(L)->getValue();
+ const APInt &RInt = cast<ConstantInt>(R)->getValue();
+ return cmpAPInts(LInt, RInt);
+ }
+ case Value::ConstantFPVal: {
+ const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
+ const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
+ return cmpAPFloats(LAPF, RAPF);
+ }
+ case Value::ConstantArrayVal: {
+ const ConstantArray *LA = cast<ConstantArray>(L);
+ const ConstantArray *RA = cast<ConstantArray>(R);
+ uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements();
+ uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)),
+ cast<Constant>(RA->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantStructVal: {
+ const ConstantStruct *LS = cast<ConstantStruct>(L);
+ const ConstantStruct *RS = cast<ConstantStruct>(R);
+ unsigned NumElementsL = cast<StructType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<StructType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (unsigned i = 0; i != NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)),
+ cast<Constant>(RS->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantVectorVal: {
+ const ConstantVector *LV = cast<ConstantVector>(L);
+ const ConstantVector *RV = cast<ConstantVector>(R);
+ unsigned NumElementsL = cast<FixedVectorType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<FixedVectorType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)),
+ cast<Constant>(RV->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantExprVal: {
+ const ConstantExpr *LE = cast<ConstantExpr>(L);
+ const ConstantExpr *RE = cast<ConstantExpr>(R);
+ unsigned NumOperandsL = LE->getNumOperands();
+ unsigned NumOperandsR = RE->getNumOperands();
+ if (int Res = cmpNumbers(NumOperandsL, NumOperandsR))
+ return Res;
+ for (unsigned i = 0; i < NumOperandsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)),
+ cast<Constant>(RE->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::BlockAddressVal: {
+ const BlockAddress *LBA = cast<BlockAddress>(L);
+ const BlockAddress *RBA = cast<BlockAddress>(R);
+ if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
+ return Res;
+ if (LBA->getFunction() == RBA->getFunction()) {
+ // They are BBs in the same function. Order by which comes first in the
+ // BB order of the function. This order is deterministic.
+ Function *F = LBA->getFunction();
+ BasicBlock *LBB = LBA->getBasicBlock();
+ BasicBlock *RBB = RBA->getBasicBlock();
+ if (LBB == RBB)
+ return 0;
+ for (BasicBlock &BB : F->getBasicBlockList()) {
+ if (&BB == LBB) {
+ assert(&BB != RBB);
+ return -1;
+ }
+ if (&BB == RBB)
+ return 1;
+ }
+ llvm_unreachable("Basic Block Address does not point to a basic block in "
+ "its function.");
+ return -1;
+ } else {
+ // cmpValues said the functions are the same. So because they aren't
+ // literally the same pointer, they must respectively be the left and
+ // right functions.
+ assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
+ // cmpValues will tell us if these are equivalent BasicBlocks, in the
+ // context of their respective functions.
+ return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
+ }
+ }
+ default: // Unknown constant, abort.
+ LLVM_DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
+ llvm_unreachable("Constant ValueID not recognized.");
+ return -1;
+ }
+}
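+
+// A couple of illustrative (hypothetical) data points for cmpConstants: with
+// identical types, `i32 2` vs `i32 7` falls through to the ConstantIntVal case
+// and yields -1 from the unsigned APInt comparison, while a null constant
+// compares greater than any non-null constant of a bitcastable type because of
+// the isNullValue() checks above.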
+
+int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
+ uint64_t LNumber = GlobalNumbers->getNumber(L);
+ uint64_t RNumber = GlobalNumbers->getNumber(R);
+ return cmpNumbers(LNumber, RNumber);
+}
+
+/// cmpType - compares two types,
+/// defines total ordering among the types set.
+/// See method declaration comments for more details.
+int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ if (PTyL && PTyL->getAddressSpace() == 0)
+ TyL = DL.getIntPtrType(TyL);
+ if (PTyR && PTyR->getAddressSpace() == 0)
+ TyR = DL.getIntPtrType(TyR);
+
+ if (TyL == TyR)
+ return 0;
+
+ if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID()))
+ return Res;
+
+ switch (TyL->getTypeID()) {
+ default:
+ llvm_unreachable("Unknown type!");
+ case Type::IntegerTyID:
+ return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
+ cast<IntegerType>(TyR)->getBitWidth());
+ // TyL == TyR would have returned true earlier, because types are uniqued.
+ case Type::VoidTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ case Type::LabelTyID:
+ case Type::MetadataTyID:
+ case Type::TokenTyID:
+ return 0;
+
+ case Type::PointerTyID:
+ assert(PTyL && PTyR && "Both types must be pointers here.");
+ return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace());
+
+ case Type::StructTyID: {
+ StructType *STyL = cast<StructType>(TyL);
+ StructType *STyR = cast<StructType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+
+ if (STyL->isPacked() != STyR->isPacked())
+ return cmpNumbers(STyL->isPacked(), STyR->isPacked());
+
+ for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
+ if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::FunctionTyID: {
+ FunctionType *FTyL = cast<FunctionType>(TyL);
+ FunctionType *FTyR = cast<FunctionType>(TyR);
+ if (FTyL->getNumParams() != FTyR->getNumParams())
+ return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams());
+
+ if (FTyL->isVarArg() != FTyR->isVarArg())
+ return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
+
+ if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
+ return Res;
+
+ for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
+ if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::ArrayTyID: {
+ auto *STyL = cast<ArrayType>(TyL);
+ auto *STyR = cast<ArrayType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+ return cmpTypes(STyL->getElementType(), STyR->getElementType());
+ }
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
+ auto *STyL = cast<VectorType>(TyL);
+ auto *STyR = cast<VectorType>(TyR);
if (STyL->getElementCount().isScalable() !=
STyR->getElementCount().isScalable())
return cmpNumbers(STyL->getElementCount().isScalable(),
@@ -501,476 +501,476 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
if (STyL->getElementCount() != STyR->getElementCount())
return cmpNumbers(STyL->getElementCount().getKnownMinValue(),
STyR->getElementCount().getKnownMinValue());
- return cmpTypes(STyL->getElementType(), STyR->getElementType());
- }
- }
-}
-
-// Determine whether the two operations are the same except that pointer-to-A
-// and pointer-to-B are equivalent. This should be kept in sync with
-// Instruction::isSameOperationAs.
-// Read method declaration comments for more details.
-int FunctionComparator::cmpOperations(const Instruction *L,
- const Instruction *R,
- bool &needToCmpOperands) const {
- needToCmpOperands = true;
- if (int Res = cmpValues(L, R))
- return Res;
-
- // Differences from Instruction::isSameOperationAs:
- // * replace type comparison with calls to cmpTypes.
- // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
- // * because of the above, we don't test for the tail bit on calls later on.
- if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
- return Res;
-
- if (const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(L)) {
- needToCmpOperands = false;
- const GetElementPtrInst *GEPR = cast<GetElementPtrInst>(R);
- if (int Res =
- cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
- return Res;
- return cmpGEPs(GEPL, GEPR);
- }
-
- if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
- return Res;
-
- if (int Res = cmpTypes(L->getType(), R->getType()))
- return Res;
-
- if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
- R->getRawSubclassOptionalData()))
- return Res;
-
- // We have two instructions of identical opcode and #operands. Check to see
- // if all operands are the same type
- for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
- if (int Res =
- cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
- return Res;
- }
-
- // Check special state that is a part of some instructions.
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
- if (int Res = cmpTypes(AI->getAllocatedType(),
- cast<AllocaInst>(R)->getAllocatedType()))
- return Res;
- return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
- }
- if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
- if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
- return Res;
- if (int Res =
- cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
- return Res;
- if (int Res = cmpNumbers(LI->getSyncScopeID(),
- cast<LoadInst>(R)->getSyncScopeID()))
- return Res;
- return cmpRangeMetadata(
- LI->getMetadata(LLVMContext::MD_range),
- cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
- }
- if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
- if (int Res =
- cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
- return Res;
- if (int Res =
- cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(SI->getSyncScopeID(),
- cast<StoreInst>(R)->getSyncScopeID());
- }
- if (const CmpInst *CI = dyn_cast<CmpInst>(L))
- return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
- if (auto *CBL = dyn_cast<CallBase>(L)) {
- auto *CBR = cast<CallBase>(R);
- if (int Res = cmpNumbers(CBL->getCallingConv(), CBR->getCallingConv()))
- return Res;
- if (int Res = cmpAttrs(CBL->getAttributes(), CBR->getAttributes()))
- return Res;
- if (int Res = cmpOperandBundlesSchema(*CBL, *CBR))
- return Res;
- if (const CallInst *CI = dyn_cast<CallInst>(L))
- if (int Res = cmpNumbers(CI->getTailCallKind(),
- cast<CallInst>(R)->getTailCallKind()))
- return Res;
- return cmpRangeMetadata(L->getMetadata(LLVMContext::MD_range),
- R->getMetadata(LLVMContext::MD_range));
- }
- if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
- ArrayRef<unsigned> LIndices = IVI->getIndices();
- ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices();
- if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
- return Res;
- for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
- return Res;
- }
- return 0;
- }
- if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
- ArrayRef<unsigned> LIndices = EVI->getIndices();
- ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices();
- if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
- return Res;
- for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
- return Res;
- }
- }
- if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
- if (int Res =
- cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(FI->getSyncScopeID(),
- cast<FenceInst>(R)->getSyncScopeID());
- }
- if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
- if (int Res = cmpNumbers(CXI->isVolatile(),
- cast<AtomicCmpXchgInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(CXI->isWeak(), cast<AtomicCmpXchgInst>(R)->isWeak()))
- return Res;
- if (int Res =
- cmpOrderings(CXI->getSuccessOrdering(),
- cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
- return Res;
- if (int Res =
- cmpOrderings(CXI->getFailureOrdering(),
- cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
- return Res;
- return cmpNumbers(CXI->getSyncScopeID(),
- cast<AtomicCmpXchgInst>(R)->getSyncScopeID());
- }
- if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
- if (int Res = cmpNumbers(RMWI->getOperation(),
- cast<AtomicRMWInst>(R)->getOperation()))
- return Res;
- if (int Res = cmpNumbers(RMWI->isVolatile(),
- cast<AtomicRMWInst>(R)->isVolatile()))
- return Res;
- if (int Res = cmpOrderings(RMWI->getOrdering(),
- cast<AtomicRMWInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(RMWI->getSyncScopeID(),
- cast<AtomicRMWInst>(R)->getSyncScopeID());
- }
- if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(L)) {
- ArrayRef<int> LMask = SVI->getShuffleMask();
- ArrayRef<int> RMask = cast<ShuffleVectorInst>(R)->getShuffleMask();
- if (int Res = cmpNumbers(LMask.size(), RMask.size()))
- return Res;
- for (size_t i = 0, e = LMask.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LMask[i], RMask[i]))
- return Res;
- }
- }
- if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
- const PHINode *PNR = cast<PHINode>(R);
- // Ensure that in addition to the incoming values being identical
- // (checked by the caller of this function), the incoming blocks
- // are also identical.
- for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
- if (int Res =
- cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
- return Res;
- }
- }
- return 0;
-}
-
-// Determine whether two GEP operations perform the same underlying arithmetic.
-// Read method declaration comments for more details.
-int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
- const GEPOperator *GEPR) const {
- unsigned int ASL = GEPL->getPointerAddressSpace();
- unsigned int ASR = GEPR->getPointerAddressSpace();
-
- if (int Res = cmpNumbers(ASL, ASR))
- return Res;
-
- // When we have target data, we can reduce the GEP down to the value in bytes
- // added to the address.
- const DataLayout &DL = FnL->getParent()->getDataLayout();
- unsigned BitWidth = DL.getPointerSizeInBits(ASL);
- APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
- if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
- GEPR->accumulateConstantOffset(DL, OffsetR))
- return cmpAPInts(OffsetL, OffsetR);
- if (int Res =
- cmpTypes(GEPL->getSourceElementType(), GEPR->getSourceElementType()))
- return Res;
-
- if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
- return Res;
-
- for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) {
- if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i)))
- return Res;
- }
-
- return 0;
-}
-
-int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
- const InlineAsm *R) const {
- // InlineAsm's are uniqued. If they are the same pointer, obviously they are
- // the same, otherwise compare the fields.
- if (L == R)
- return 0;
- if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType()))
- return Res;
- if (int Res = cmpMem(L->getAsmString(), R->getAsmString()))
- return Res;
- if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString()))
- return Res;
- if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects()))
- return Res;
- if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack()))
- return Res;
- if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
- return Res;
- assert(L->getFunctionType() != R->getFunctionType());
- return 0;
-}
-
-/// Compare two values used by the two functions under pair-wise comparison. If
-/// this is the first time the values are seen, they're added to the mapping so
-/// that we will detect mismatches on next use.
-/// See comments in declaration for more details.
-int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
- // Catch self-reference case.
- if (L == FnL) {
- if (R == FnR)
- return 0;
- return -1;
- }
- if (R == FnR) {
- if (L == FnL)
- return 0;
- return 1;
- }
-
- const Constant *ConstL = dyn_cast<Constant>(L);
- const Constant *ConstR = dyn_cast<Constant>(R);
- if (ConstL && ConstR) {
- if (L == R)
- return 0;
- return cmpConstants(ConstL, ConstR);
- }
-
- if (ConstL)
- return 1;
- if (ConstR)
- return -1;
-
- const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L);
- const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);
-
- if (InlineAsmL && InlineAsmR)
- return cmpInlineAsm(InlineAsmL, InlineAsmR);
- if (InlineAsmL)
- return 1;
- if (InlineAsmR)
- return -1;
-
- auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())),
- RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size()));
-
- return cmpNumbers(LeftSN.first->second, RightSN.first->second);
-}
-
-// Test whether two basic blocks have equivalent behaviour.
-int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
- const BasicBlock *BBR) const {
- BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
- BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
-
- do {
- bool needToCmpOperands = true;
- if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
- return Res;
- if (needToCmpOperands) {
- assert(InstL->getNumOperands() == InstR->getNumOperands());
-
- for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
- Value *OpL = InstL->getOperand(i);
- Value *OpR = InstR->getOperand(i);
- if (int Res = cmpValues(OpL, OpR))
- return Res;
- // cmpValues should ensure this is true.
- assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);
- }
- }
-
- ++InstL;
- ++InstR;
- } while (InstL != InstLE && InstR != InstRE);
-
- if (InstL != InstLE && InstR == InstRE)
- return 1;
- if (InstL == InstLE && InstR != InstRE)
- return -1;
- return 0;
-}
-
-int FunctionComparator::compareSignature() const {
- if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
- return Res;
-
- if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
- return Res;
-
- if (FnL->hasGC()) {
- if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))
- return Res;
- }
-
- if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
- return Res;
-
- if (FnL->hasSection()) {
- if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))
- return Res;
- }
-
- if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
- return Res;
-
- // TODO: if it's internal and only used in direct calls, we could handle this
- // case too.
- if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
- return Res;
-
- if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
- return Res;
-
- assert(FnL->arg_size() == FnR->arg_size() &&
- "Identically typed functions have different numbers of args!");
-
- // Visit the arguments so that they get enumerated in the order they're
- // passed in.
- for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
- ArgRI = FnR->arg_begin(),
- ArgLE = FnL->arg_end();
- ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
- if (cmpValues(&*ArgLI, &*ArgRI) != 0)
- llvm_unreachable("Arguments repeat!");
- }
- return 0;
-}
-
-// Test whether the two functions have equivalent behaviour.
-int FunctionComparator::compare() {
- beginCompare();
-
- if (int Res = compareSignature())
- return Res;
-
- // We do a CFG-ordered walk since the actual ordering of the blocks in the
- // linked list is immaterial. Our walk starts at the entry block for both
- // functions, then takes each block from each terminator in order. As an
- // artifact, this also means that unreachable blocks are ignored.
- SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
- SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
-
- FnLBBs.push_back(&FnL->getEntryBlock());
- FnRBBs.push_back(&FnR->getEntryBlock());
-
- VisitedBBs.insert(FnLBBs[0]);
- while (!FnLBBs.empty()) {
- const BasicBlock *BBL = FnLBBs.pop_back_val();
- const BasicBlock *BBR = FnRBBs.pop_back_val();
-
- if (int Res = cmpValues(BBL, BBR))
- return Res;
-
- if (int Res = cmpBasicBlocks(BBL, BBR))
- return Res;
-
- const Instruction *TermL = BBL->getTerminator();
- const Instruction *TermR = BBR->getTerminator();
-
- assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
- for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
- if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
- continue;
-
- FnLBBs.push_back(TermL->getSuccessor(i));
- FnRBBs.push_back(TermR->getSuccessor(i));
- }
- }
- return 0;
-}
-
-namespace {
-
-// Accumulate the hash of a sequence of 64-bit integers. This is similar to a
-// hash of a sequence of 64bit ints, but the entire input does not need to be
-// available at once. This interface is necessary for functionHash because it
-// needs to accumulate the hash as the structure of the function is traversed
-// without saving these values to an intermediate buffer. This form of hashing
-// is not often needed, as usually the object to hash is just read from a
-// buffer.
-class HashAccumulator64 {
- uint64_t Hash;
-
-public:
- // Initialize to random constant, so the state isn't zero.
- HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
-
- void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
-
- // No finishing is required, because the entire hash value is used.
- uint64_t getHash() { return Hash; }
-};
-
-} // end anonymous namespace
-
-// A function hash is calculated by considering only the number of arguments and
-// whether a function is varargs, the order of basic blocks (given by the
-// successors of each basic block in depth first order), and the order of
-// opcodes of each instruction within each of these basic blocks. This mirrors
-// the strategy compare() uses to compare functions by walking the BBs in depth
-// first order and comparing each instruction in sequence. Because this hash
-// does not look at the operands, it is insensitive to things such as the
-// target of calls and the constants used in the function, which makes it useful
-// when possibly merging functions which are the same modulo constants and call
-// targets.
-FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
- HashAccumulator64 H;
- H.add(F.isVarArg());
- H.add(F.arg_size());
-
- SmallVector<const BasicBlock *, 8> BBs;
- SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
-
- // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
- // accumulating the hash of the function "structure." (BB and opcode sequence)
- BBs.push_back(&F.getEntryBlock());
- VisitedBBs.insert(BBs[0]);
- while (!BBs.empty()) {
- const BasicBlock *BB = BBs.pop_back_val();
- // This random value acts as a block header, as otherwise the partition of
- // opcodes into BBs wouldn't affect the hash, only the order of the opcodes
- H.add(45798);
- for (auto &Inst : *BB) {
- H.add(Inst.getOpcode());
- }
- const Instruction *Term = BB->getTerminator();
- for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
- if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
- continue;
- BBs.push_back(Term->getSuccessor(i));
- }
- }
- return H.getHash();
-}
+ return cmpTypes(STyL->getElementType(), STyR->getElementType());
+ }
+ }
+}
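+
+// One practical effect of the address-space handling at the top of cmpTypes
+// (assuming a DataLayout with 64-bit pointers): two distinct pointer types in
+// address space 0, say i8* and i32*, are both rewritten to the integer pointer
+// type i64 before comparison, so they are treated as equal here.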
+
+// Determine whether the two operations are the same except that pointer-to-A
+// and pointer-to-B are equivalent. This should be kept in sync with
+// Instruction::isSameOperationAs.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpOperations(const Instruction *L,
+ const Instruction *R,
+ bool &needToCmpOperands) const {
+ needToCmpOperands = true;
+ if (int Res = cmpValues(L, R))
+ return Res;
+
+ // Differences from Instruction::isSameOperationAs:
+ // * replace type comparison with calls to cmpTypes.
+ // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
+ // * because of the above, we don't test for the tail bit on calls later on.
+ if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
+ return Res;
+
+ if (const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(L)) {
+ needToCmpOperands = false;
+ const GetElementPtrInst *GEPR = cast<GetElementPtrInst>(R);
+ if (int Res =
+ cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
+ return Res;
+ return cmpGEPs(GEPL, GEPR);
+ }
+
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+
+ if (int Res = cmpTypes(L->getType(), R->getType()))
+ return Res;
+
+ if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
+ R->getRawSubclassOptionalData()))
+ return Res;
+
+ // We have two instructions of identical opcode and #operands. Check to see
+ // if all operands are the same type
+ for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
+ if (int Res =
+ cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
+ return Res;
+ }
+
+ // Check special state that is a part of some instructions.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
+ if (int Res = cmpTypes(AI->getAllocatedType(),
+ cast<AllocaInst>(R)->getAllocatedType()))
+ return Res;
+ return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
+ if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
+ return Res;
+ if (int Res = cmpNumbers(LI->getSyncScopeID(),
+ cast<LoadInst>(R)->getSyncScopeID()))
+ return Res;
+ return cmpRangeMetadata(
+ LI->getMetadata(LLVMContext::MD_range),
+ cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
+ if (int Res =
+ cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(SI->getSyncScopeID(),
+ cast<StoreInst>(R)->getSyncScopeID());
+ }
+ if (const CmpInst *CI = dyn_cast<CmpInst>(L))
+ return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
+ if (auto *CBL = dyn_cast<CallBase>(L)) {
+ auto *CBR = cast<CallBase>(R);
+ if (int Res = cmpNumbers(CBL->getCallingConv(), CBR->getCallingConv()))
+ return Res;
+ if (int Res = cmpAttrs(CBL->getAttributes(), CBR->getAttributes()))
+ return Res;
+ if (int Res = cmpOperandBundlesSchema(*CBL, *CBR))
+ return Res;
+ if (const CallInst *CI = dyn_cast<CallInst>(L))
+ if (int Res = cmpNumbers(CI->getTailCallKind(),
+ cast<CallInst>(R)->getTailCallKind()))
+ return Res;
+ return cmpRangeMetadata(L->getMetadata(LLVMContext::MD_range),
+ R->getMetadata(LLVMContext::MD_range));
+ }
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = IVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ return 0;
+ }
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = EVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ }
+ if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
+ if (int Res =
+ cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(FI->getSyncScopeID(),
+ cast<FenceInst>(R)->getSyncScopeID());
+ }
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
+ if (int Res = cmpNumbers(CXI->isVolatile(),
+ cast<AtomicCmpXchgInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(CXI->isWeak(), cast<AtomicCmpXchgInst>(R)->isWeak()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getSuccessOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getFailureOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
+ return Res;
+ return cmpNumbers(CXI->getSyncScopeID(),
+ cast<AtomicCmpXchgInst>(R)->getSyncScopeID());
+ }
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
+ if (int Res = cmpNumbers(RMWI->getOperation(),
+ cast<AtomicRMWInst>(R)->getOperation()))
+ return Res;
+ if (int Res = cmpNumbers(RMWI->isVolatile(),
+ cast<AtomicRMWInst>(R)->isVolatile()))
+ return Res;
+ if (int Res = cmpOrderings(RMWI->getOrdering(),
+ cast<AtomicRMWInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(RMWI->getSyncScopeID(),
+ cast<AtomicRMWInst>(R)->getSyncScopeID());
+ }
+ if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(L)) {
+ ArrayRef<int> LMask = SVI->getShuffleMask();
+ ArrayRef<int> RMask = cast<ShuffleVectorInst>(R)->getShuffleMask();
+ if (int Res = cmpNumbers(LMask.size(), RMask.size()))
+ return Res;
+ for (size_t i = 0, e = LMask.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LMask[i], RMask[i]))
+ return Res;
+ }
+ }
+ if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
+ const PHINode *PNR = cast<PHINode>(R);
+ // Ensure that in addition to the incoming values being identical
+ // (checked by the caller of this function), the incoming blocks
+ // are also identical.
+ for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
+ if (int Res =
+ cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
+ return Res;
+ }
+ }
+ return 0;
+}
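// Editor's aside, not part of the diff above: each cmpXxx helper returns a
// three-way result (negative/zero/positive), and the repeated
// "if (int Res = ...) return Res;" chains compose those results
// lexicographically, so the first differing field decides the ordering. A
// stand-alone sketch of the same idiom (hypothetical names, assumes <cstdint>):
static int threeWay(uint64_t L, uint64_t R) { return (L > R) - (L < R); }
static int cmpLexicographic(uint64_t AL, uint64_t AR,
                            uint64_t BL, uint64_t BR) {
  if (int Res = threeWay(AL, AR))
    return Res;            // the first key differs and decides the order
  return threeWay(BL, BR); // otherwise the second key decides
}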
+
+// Determine whether two GEP operations perform the same underlying arithmetic.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
+ const GEPOperator *GEPR) const {
+ unsigned int ASL = GEPL->getPointerAddressSpace();
+ unsigned int ASR = GEPR->getPointerAddressSpace();
+
+ if (int Res = cmpNumbers(ASL, ASR))
+ return Res;
+
+ // When we have target data, we can reduce the GEP down to the value in bytes
+ // added to the address.
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ unsigned BitWidth = DL.getPointerSizeInBits(ASL);
+ APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
+ if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
+ GEPR->accumulateConstantOffset(DL, OffsetR))
+ return cmpAPInts(OffsetL, OffsetR);
+ if (int Res =
+ cmpTypes(GEPL->getSourceElementType(), GEPR->getSourceElementType()))
+ return Res;
+
+ if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
+ return Res;
+
+ for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) {
+ if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i)))
+ return Res;
+ }
+
+ return 0;
+}
+
+int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
+ const InlineAsm *R) const {
+ // InlineAsm's are uniqued. If they are the same pointer, obviously they are
+ // the same, otherwise compare the fields.
+ if (L == R)
+ return 0;
+ if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType()))
+ return Res;
+ if (int Res = cmpMem(L->getAsmString(), R->getAsmString()))
+ return Res;
+ if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString()))
+ return Res;
+ if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects()))
+ return Res;
+ if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack()))
+ return Res;
+ if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
+ return Res;
+ assert(L->getFunctionType() != R->getFunctionType());
+ return 0;
+}
+
+/// Compare two values used by the two functions under pair-wise comparison. If
+/// this is the first time the values are seen, they're added to the mapping so
+/// that we will detect mismatches on next use.
+/// See comments in declaration for more details.
+int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
+ // Catch self-reference case.
+ if (L == FnL) {
+ if (R == FnR)
+ return 0;
+ return -1;
+ }
+ if (R == FnR) {
+ if (L == FnL)
+ return 0;
+ return 1;
+ }
+
+ const Constant *ConstL = dyn_cast<Constant>(L);
+ const Constant *ConstR = dyn_cast<Constant>(R);
+ if (ConstL && ConstR) {
+ if (L == R)
+ return 0;
+ return cmpConstants(ConstL, ConstR);
+ }
+
+ if (ConstL)
+ return 1;
+ if (ConstR)
+ return -1;
+
+ const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L);
+ const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);
+
+ if (InlineAsmL && InlineAsmR)
+ return cmpInlineAsm(InlineAsmL, InlineAsmR);
+ if (InlineAsmL)
+ return 1;
+ if (InlineAsmR)
+ return -1;
+
+ auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())),
+ RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size()));
+
+ return cmpNumbers(LeftSN.first->second, RightSN.first->second);
+}
+
+// Test whether two basic blocks have equivalent behaviour.
+int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
+ const BasicBlock *BBR) const {
+ BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
+ BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
+
+ do {
+ bool needToCmpOperands = true;
+ if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
+ return Res;
+ if (needToCmpOperands) {
+ assert(InstL->getNumOperands() == InstR->getNumOperands());
+
+ for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
+ Value *OpL = InstL->getOperand(i);
+ Value *OpR = InstR->getOperand(i);
+ if (int Res = cmpValues(OpL, OpR))
+ return Res;
+ // cmpValues should ensure this is true.
+ assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);
+ }
+ }
+
+ ++InstL;
+ ++InstR;
+ } while (InstL != InstLE && InstR != InstRE);
+
+ if (InstL != InstLE && InstR == InstRE)
+ return 1;
+ if (InstL == InstLE && InstR != InstRE)
+ return -1;
+ return 0;
+}
+
+int FunctionComparator::compareSignature() const {
+ if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
+ return Res;
+
+ if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
+ return Res;
+
+ if (FnL->hasGC()) {
+ if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
+ return Res;
+
+ if (FnL->hasSection()) {
+ if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
+ return Res;
+
+ // TODO: if it's internal and only used in direct calls, we could handle this
+ // case too.
+ if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
+ return Res;
+
+ if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
+ return Res;
+
+ assert(FnL->arg_size() == FnR->arg_size() &&
+ "Identically typed functions have different numbers of args!");
+
+ // Visit the arguments so that they get enumerated in the order they're
+ // passed in.
+ for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
+ ArgRI = FnR->arg_begin(),
+ ArgLE = FnL->arg_end();
+ ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
+ if (cmpValues(&*ArgLI, &*ArgRI) != 0)
+ llvm_unreachable("Arguments repeat!");
+ }
+ return 0;
+}
+
+// Test whether the two functions have equivalent behaviour.
+int FunctionComparator::compare() {
+ beginCompare();
+
+ if (int Res = compareSignature())
+ return Res;
+
+ // We do a CFG-ordered walk since the actual ordering of the blocks in the
+ // linked list is immaterial. Our walk starts at the entry block for both
+ // functions, then takes each block from each terminator in order. As an
+ // artifact, this also means that unreachable blocks are ignored.
+ SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
+ SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
+
+ FnLBBs.push_back(&FnL->getEntryBlock());
+ FnRBBs.push_back(&FnR->getEntryBlock());
+
+ VisitedBBs.insert(FnLBBs[0]);
+ while (!FnLBBs.empty()) {
+ const BasicBlock *BBL = FnLBBs.pop_back_val();
+ const BasicBlock *BBR = FnRBBs.pop_back_val();
+
+ if (int Res = cmpValues(BBL, BBR))
+ return Res;
+
+ if (int Res = cmpBasicBlocks(BBL, BBR))
+ return Res;
+
+ const Instruction *TermL = BBL->getTerminator();
+ const Instruction *TermR = BBR->getTerminator();
+
+ assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
+ for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
+ continue;
+
+ FnLBBs.push_back(TermL->getSuccessor(i));
+ FnRBBs.push_back(TermR->getSuccessor(i));
+ }
+ }
+ return 0;
+}
+
+namespace {
+
+// Accumulate the hash of a sequence of 64-bit integers. This is similar to a
+// hash of a sequence of 64-bit ints, but the entire input does not need to be
+// available at once. This interface is necessary for functionHash because it
+// needs to accumulate the hash as the structure of the function is traversed
+// without saving these values to an intermediate buffer. This form of hashing
+// is not often needed, as usually the object to hash is just read from a
+// buffer.
+class HashAccumulator64 {
+ uint64_t Hash;
+
+public:
+ // Initialize to random constant, so the state isn't zero.
+ HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
+
+ void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
+
+ // No finishing is required, because the entire hash value is used.
+ uint64_t getHash() { return Hash; }
+};
+
+} // end anonymous namespace
+
+// A function hash is calculated by considering only the number of arguments,
+// whether the function is varargs, the order of basic blocks (given by the
+// successors of each basic block in depth-first order), and the order of the
+// opcodes of the instructions within each of these basic blocks. This mirrors
+// the strategy compare() uses to compare functions by walking the BBs in
+// depth-first order and comparing each instruction in sequence. Because this
+// hash does not look at the operands, it is insensitive to things such as the
+// targets of calls and the constants used in the function, which makes it
+// useful when possibly merging functions that are the same modulo constants
+// and call targets.
+FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
+ HashAccumulator64 H;
+ H.add(F.isVarArg());
+ H.add(F.arg_size());
+
+ SmallVector<const BasicBlock *, 8> BBs;
+ SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
+
+ // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
+  // accumulating the hash of the function "structure" (BB and opcode sequence).
+ BBs.push_back(&F.getEntryBlock());
+ VisitedBBs.insert(BBs[0]);
+ while (!BBs.empty()) {
+ const BasicBlock *BB = BBs.pop_back_val();
+ // This random value acts as a block header, as otherwise the partition of
+    // opcodes into BBs wouldn't affect the hash, only the order of the opcodes.
+ H.add(45798);
+ for (auto &Inst : *BB) {
+ H.add(Inst.getOpcode());
+ }
+ const Instruction *Term = BB->getTerminator();
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
+ continue;
+ BBs.push_back(Term->getSuccessor(i));
+ }
+ }
+ return H.getHash();
+}
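Editor's aside on usage (not part of the diff): a MergeFunctions-style caller
would typically bucket candidates by the cheap functionHash() above and only
run the full structural compare() within a bucket. A minimal sketch, assuming
the FunctionComparator/GlobalNumberState API from
llvm/Transforms/Utils/FunctionComparator.h:

    #include "llvm/Transforms/Utils/FunctionComparator.h"
    using namespace llvm;

    // Returns true iff the two function bodies are structurally equivalent.
    static bool areFunctionsEquivalent(Function &F1, Function &F2,
                                       GlobalNumberState &GN) {
      // Cheap reject: equivalent bodies always hash equally, so different
      // hashes can never compare equal.
      if (FunctionComparator::functionHash(F1) !=
          FunctionComparator::functionHash(F2))
        return false;
      // Full three-way comparison; 0 means "equivalent".
      return FunctionComparator(&F1, &F2, &GN).compare() == 0;
    }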
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp
index 28f69a8b6a..8df7ae9563 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -1,334 +1,334 @@
-//===- lib/Transforms/Utils/FunctionImportUtils.cpp - Importing utilities -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the FunctionImportGlobalProcessing class, used
-// to perform the necessary global value handling for function importing.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/FunctionImportUtils.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstIterator.h"
-using namespace llvm;
-
-/// Checks if we should import SGV as a definition, otherwise import as a
-/// declaration.
-bool FunctionImportGlobalProcessing::doImportAsDefinition(
- const GlobalValue *SGV) {
- if (!isPerformingImport())
- return false;
-
- // Only import the globals requested for importing.
- if (!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
- return false;
-
- assert(!isa<GlobalAlias>(SGV) &&
- "Unexpected global alias in the import list.");
-
- // Otherwise yes.
- return true;
-}
-
-bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
- const GlobalValue *SGV, ValueInfo VI) {
- assert(SGV->hasLocalLinkage());
- // Both the imported references and the original local variable must
- // be promoted.
- if (!isPerformingImport() && !isModuleExporting())
- return false;
-
- if (isPerformingImport()) {
- assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
- !isNonRenamableLocal(*SGV)) &&
- "Attempting to promote non-renamable local");
- // We don't know for sure yet if we are importing this value (as either
- // a reference or a def), since we are simply walking all values in the
- // module. But by necessity if we end up importing it and it is local,
- // it must be promoted, so unconditionally promote all values in the
- // importing module.
- return true;
- }
-
- // When exporting, consult the index. We can have more than one local
- // with the same GUID, in the case of same-named locals in different but
- // same-named source files that were compiled in their respective directories
- // (so the source file name and resulting GUID is the same). Find the one
- // in this module.
- auto Summary = ImportIndex.findSummaryInModule(
- VI, SGV->getParent()->getModuleIdentifier());
- assert(Summary && "Missing summary for global value when exporting");
- auto Linkage = Summary->linkage();
- if (!GlobalValue::isLocalLinkage(Linkage)) {
- assert(!isNonRenamableLocal(*SGV) &&
- "Attempting to promote non-renamable local");
- return true;
- }
-
- return false;
-}
-
-#ifndef NDEBUG
-bool FunctionImportGlobalProcessing::isNonRenamableLocal(
- const GlobalValue &GV) const {
- if (!GV.hasLocalLinkage())
- return false;
- // This needs to stay in sync with the logic in buildModuleSummaryIndex.
- if (GV.hasSection())
- return true;
- if (Used.count(const_cast<GlobalValue *>(&GV)))
- return true;
- return false;
-}
-#endif
-
-std::string
-FunctionImportGlobalProcessing::getPromotedName(const GlobalValue *SGV) {
- assert(SGV->hasLocalLinkage());
- // For locals that must be promoted to global scope, ensure that
- // the promoted name uniquely identifies the copy in the original module,
- // using the ID assigned during combined index creation.
- return ModuleSummaryIndex::getGlobalNameForLocal(
- SGV->getName(),
- ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier()));
-}
-
-GlobalValue::LinkageTypes
-FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
- bool DoPromote) {
- // Any local variable that is referenced by an exported function needs
- // to be promoted to global scope. Since we don't currently know which
- // functions reference which local variables/functions, we must treat
- // all as potentially exported if this module is exporting anything.
- if (isModuleExporting()) {
- if (SGV->hasLocalLinkage() && DoPromote)
- return GlobalValue::ExternalLinkage;
- return SGV->getLinkage();
- }
-
- // Otherwise, if we aren't importing, no linkage change is needed.
- if (!isPerformingImport())
- return SGV->getLinkage();
-
- switch (SGV->getLinkage()) {
- case GlobalValue::LinkOnceODRLinkage:
- case GlobalValue::ExternalLinkage:
- // External and linkonce definitions are converted to available_externally
- // definitions upon import, so that they are available for inlining
- // and/or optimization, but are turned into declarations later
- // during the EliminateAvailableExternally pass.
- if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- // An imported external declaration stays external.
- return SGV->getLinkage();
-
- case GlobalValue::AvailableExternallyLinkage:
- // An imported available_externally definition converts
- // to external if imported as a declaration.
- if (!doImportAsDefinition(SGV))
- return GlobalValue::ExternalLinkage;
- // An imported available_externally declaration stays that way.
- return SGV->getLinkage();
-
- case GlobalValue::LinkOnceAnyLinkage:
- case GlobalValue::WeakAnyLinkage:
- // Can't import linkonce_any/weak_any definitions correctly, or we might
- // change the program semantics, since the linker will pick the first
- // linkonce_any/weak_any definition and importing would change the order
- // they are seen by the linker. The module linking caller needs to enforce
- // this.
- assert(!doImportAsDefinition(SGV));
- // If imported as a declaration, it becomes external_weak.
- return SGV->getLinkage();
-
- case GlobalValue::WeakODRLinkage:
- // For weak_odr linkage, there is a guarantee that all copies will be
- // equivalent, so the issue described above for weak_any does not exist,
- // and the definition can be imported. It can be treated similarly
- // to an imported externally visible global value.
- if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- else
- return GlobalValue::ExternalLinkage;
-
- case GlobalValue::AppendingLinkage:
- // It would be incorrect to import an appending linkage variable,
- // since it would cause global constructors/destructors to be
- // executed multiple times. This should have already been handled
- // by linkIfNeeded, and we will assert in shouldLinkFromSource
- // if we try to import, so we simply return AppendingLinkage.
- return GlobalValue::AppendingLinkage;
-
- case GlobalValue::InternalLinkage:
- case GlobalValue::PrivateLinkage:
- // If we are promoting the local to global scope, it is handled
- // similarly to a normal externally visible global.
- if (DoPromote) {
- if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- else
- return GlobalValue::ExternalLinkage;
- }
- // A non-promoted imported local definition stays local.
- // The ThinLTO pass will eventually force-import their definitions.
- return SGV->getLinkage();
-
- case GlobalValue::ExternalWeakLinkage:
- // External weak doesn't apply to definitions, must be a declaration.
- assert(!doImportAsDefinition(SGV));
- // Linkage stays external_weak.
- return SGV->getLinkage();
-
- case GlobalValue::CommonLinkage:
- // Linkage stays common on definitions.
- // The ThinLTO pass will eventually force-import their definitions.
- return SGV->getLinkage();
- }
-
- llvm_unreachable("unknown linkage type");
-}
-
-void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
-
- ValueInfo VI;
- if (GV.hasName()) {
- VI = ImportIndex.getValueInfo(GV.getGUID());
- // Set synthetic function entry counts.
- if (VI && ImportIndex.hasSyntheticEntryCounts()) {
- if (Function *F = dyn_cast<Function>(&GV)) {
- if (!F->isDeclaration()) {
- for (auto &S : VI.getSummaryList()) {
- auto *FS = cast<FunctionSummary>(S->getBaseObject());
- if (FS->modulePath() == M.getModuleIdentifier()) {
- F->setEntryCount(Function::ProfileCount(FS->entryCount(),
- Function::PCT_Synthetic));
- break;
- }
- }
- }
- }
- }
- }
-
- // We should always have a ValueInfo (i.e. GV in index) for definitions when
- // we are exporting, and also when importing that value.
- assert(VI || GV.isDeclaration() ||
- (isPerformingImport() && !doImportAsDefinition(&GV)));
-
- // Mark read/write-only variables which can be imported with specific
- // attribute. We can't internalize them now because IRMover will fail
- // to link variable definitions to their external declarations during
- // ThinLTO import. We'll internalize read-only variables later, after
- // import is finished. See internalizeGVsAfterImport.
- //
- // If global value dead stripping is not enabled in summary then
- // propagateConstants hasn't been run. We can't internalize GV
- // in such case.
- if (!GV.isDeclaration() && VI && ImportIndex.withAttributePropagation()) {
- if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
- // We can have more than one local with the same GUID, in the case of
- // same-named locals in different but same-named source files that were
- // compiled in their respective directories (so the source file name
- // and resulting GUID is the same). Find the one in this module.
- // Handle the case where there is no summary found in this module. That
- // can happen in the distributed ThinLTO backend, because the index only
- // contains summaries from the source modules if they are being imported.
- // We might have a non-null VI and get here even in that case if the name
- // matches one in this module (e.g. weak or appending linkage).
- auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
- ImportIndex.findSummaryInModule(VI, M.getModuleIdentifier()));
- if (GVS &&
- (ImportIndex.isReadOnly(GVS) || ImportIndex.isWriteOnly(GVS))) {
- V->addAttribute("thinlto-internalize");
- // Objects referenced by writeonly GV initializer should not be
- // promoted, because there is no any kind of read access to them
- // on behalf of this writeonly GV. To avoid promotion we convert
- // GV initializer to 'zeroinitializer'. This effectively drops
- // references in IR module (not in combined index), so we can
- // ignore them when computing import. We do not export references
- // of writeonly object. See computeImportForReferencedGlobals
- if (ImportIndex.isWriteOnly(GVS))
- V->setInitializer(Constant::getNullValue(V->getValueType()));
- }
- }
- }
-
- if (GV.hasLocalLinkage() && shouldPromoteLocalToGlobal(&GV, VI)) {
- // Save the original name string before we rename GV below.
- auto Name = GV.getName().str();
- GV.setName(getPromotedName(&GV));
- GV.setLinkage(getLinkage(&GV, /* DoPromote */ true));
- assert(!GV.hasLocalLinkage());
- GV.setVisibility(GlobalValue::HiddenVisibility);
-
- // If we are renaming a COMDAT leader, ensure that we record the COMDAT
- // for later renaming as well. This is required for COFF.
- if (const auto *C = GV.getComdat())
- if (C->getName() == Name)
- RenamedComdats.try_emplace(C, M.getOrInsertComdat(GV.getName()));
- } else
- GV.setLinkage(getLinkage(&GV, /* DoPromote */ false));
-
- // When ClearDSOLocalOnDeclarations is true, clear dso_local if GV is
- // converted to a declaration, to disable direct access. Don't do this if GV
- // is implicitly dso_local due to a non-default visibility.
- if (ClearDSOLocalOnDeclarations && GV.isDeclarationForLinker() &&
- !GV.isImplicitDSOLocal()) {
- GV.setDSOLocal(false);
- } else if (VI && VI.isDSOLocal()) {
- // If all summaries are dso_local, symbol gets resolved to a known local
- // definition.
- GV.setDSOLocal(true);
- if (GV.hasDLLImportStorageClass())
- GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
- }
-
- // Remove functions imported as available externally defs from comdats,
- // as this is a declaration for the linker, and will be dropped eventually.
- // It is illegal for comdats to contain declarations.
- auto *GO = dyn_cast<GlobalObject>(&GV);
- if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
- // The IRMover should not have placed any imported declarations in
- // a comdat, so the only declaration that should be in a comdat
- // at this point would be a definition imported as available_externally.
- assert(GO->hasAvailableExternallyLinkage() &&
- "Expected comdat on definition (possibly available external)");
- GO->setComdat(nullptr);
- }
-}
-
-void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
- for (GlobalVariable &GV : M.globals())
- processGlobalForThinLTO(GV);
- for (Function &SF : M)
- processGlobalForThinLTO(SF);
- for (GlobalAlias &GA : M.aliases())
- processGlobalForThinLTO(GA);
-
- // Replace any COMDATS that required renaming (because the COMDAT leader was
- // promoted and renamed).
- if (!RenamedComdats.empty())
- for (auto &GO : M.global_objects())
- if (auto *C = GO.getComdat()) {
- auto Replacement = RenamedComdats.find(C);
- if (Replacement != RenamedComdats.end())
- GO.setComdat(Replacement->second);
- }
-}
-
-bool FunctionImportGlobalProcessing::run() {
- processGlobalsForThinLTO();
- return false;
-}
-
-bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
- bool ClearDSOLocalOnDeclarations,
- SetVector<GlobalValue *> *GlobalsToImport) {
- FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
- ClearDSOLocalOnDeclarations);
- return ThinLTOProcessing.run();
-}
+//===- lib/Transforms/Utils/FunctionImportUtils.cpp - Importing utilities -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionImportGlobalProcessing class, used
+// to perform the necessary global value handling for function importing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+using namespace llvm;
+
+/// Checks if we should import SGV as a definition, otherwise import as a
+/// declaration.
+bool FunctionImportGlobalProcessing::doImportAsDefinition(
+ const GlobalValue *SGV) {
+ if (!isPerformingImport())
+ return false;
+
+ // Only import the globals requested for importing.
+ if (!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
+ return false;
+
+ assert(!isa<GlobalAlias>(SGV) &&
+ "Unexpected global alias in the import list.");
+
+ // Otherwise yes.
+ return true;
+}
+
+bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
+ const GlobalValue *SGV, ValueInfo VI) {
+ assert(SGV->hasLocalLinkage());
+ // Both the imported references and the original local variable must
+ // be promoted.
+ if (!isPerformingImport() && !isModuleExporting())
+ return false;
+
+ if (isPerformingImport()) {
+ assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
+ !isNonRenamableLocal(*SGV)) &&
+ "Attempting to promote non-renamable local");
+ // We don't know for sure yet if we are importing this value (as either
+ // a reference or a def), since we are simply walking all values in the
+ // module. But by necessity if we end up importing it and it is local,
+ // it must be promoted, so unconditionally promote all values in the
+ // importing module.
+ return true;
+ }
+
+ // When exporting, consult the index. We can have more than one local
+ // with the same GUID, in the case of same-named locals in different but
+ // same-named source files that were compiled in their respective directories
+ // (so the source file name and resulting GUID is the same). Find the one
+ // in this module.
+ auto Summary = ImportIndex.findSummaryInModule(
+ VI, SGV->getParent()->getModuleIdentifier());
+ assert(Summary && "Missing summary for global value when exporting");
+ auto Linkage = Summary->linkage();
+ if (!GlobalValue::isLocalLinkage(Linkage)) {
+ assert(!isNonRenamableLocal(*SGV) &&
+ "Attempting to promote non-renamable local");
+ return true;
+ }
+
+ return false;
+}
+
+#ifndef NDEBUG
+bool FunctionImportGlobalProcessing::isNonRenamableLocal(
+ const GlobalValue &GV) const {
+ if (!GV.hasLocalLinkage())
+ return false;
+ // This needs to stay in sync with the logic in buildModuleSummaryIndex.
+ if (GV.hasSection())
+ return true;
+ if (Used.count(const_cast<GlobalValue *>(&GV)))
+ return true;
+ return false;
+}
+#endif
+
+std::string
+FunctionImportGlobalProcessing::getPromotedName(const GlobalValue *SGV) {
+ assert(SGV->hasLocalLinkage());
+ // For locals that must be promoted to global scope, ensure that
+ // the promoted name uniquely identifies the copy in the original module,
+ // using the ID assigned during combined index creation.
+ return ModuleSummaryIndex::getGlobalNameForLocal(
+ SGV->getName(),
+ ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier()));
+}
+
+GlobalValue::LinkageTypes
+FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
+ bool DoPromote) {
+ // Any local variable that is referenced by an exported function needs
+ // to be promoted to global scope. Since we don't currently know which
+ // functions reference which local variables/functions, we must treat
+ // all as potentially exported if this module is exporting anything.
+ if (isModuleExporting()) {
+ if (SGV->hasLocalLinkage() && DoPromote)
+ return GlobalValue::ExternalLinkage;
+ return SGV->getLinkage();
+ }
+
+ // Otherwise, if we aren't importing, no linkage change is needed.
+ if (!isPerformingImport())
+ return SGV->getLinkage();
+
+ switch (SGV->getLinkage()) {
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::ExternalLinkage:
+ // External and linkonce definitions are converted to available_externally
+ // definitions upon import, so that they are available for inlining
+ // and/or optimization, but are turned into declarations later
+ // during the EliminateAvailableExternally pass.
+ if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ // An imported external declaration stays external.
+ return SGV->getLinkage();
+
+ case GlobalValue::AvailableExternallyLinkage:
+ // An imported available_externally definition converts
+ // to external if imported as a declaration.
+ if (!doImportAsDefinition(SGV))
+ return GlobalValue::ExternalLinkage;
+ // An imported available_externally declaration stays that way.
+ return SGV->getLinkage();
+
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ // Can't import linkonce_any/weak_any definitions correctly, or we might
+ // change the program semantics, since the linker will pick the first
+ // linkonce_any/weak_any definition and importing would change the order
+ // they are seen by the linker. The module linking caller needs to enforce
+ // this.
+ assert(!doImportAsDefinition(SGV));
+ // If imported as a declaration, it becomes external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::WeakODRLinkage:
+ // For weak_odr linkage, there is a guarantee that all copies will be
+ // equivalent, so the issue described above for weak_any does not exist,
+ // and the definition can be imported. It can be treated similarly
+ // to an imported externally visible global value.
+ if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+
+ case GlobalValue::AppendingLinkage:
+ // It would be incorrect to import an appending linkage variable,
+ // since it would cause global constructors/destructors to be
+ // executed multiple times. This should have already been handled
+ // by linkIfNeeded, and we will assert in shouldLinkFromSource
+ // if we try to import, so we simply return AppendingLinkage.
+ return GlobalValue::AppendingLinkage;
+
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ // If we are promoting the local to global scope, it is handled
+ // similarly to a normal externally visible global.
+ if (DoPromote) {
+ if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+ }
+ // A non-promoted imported local definition stays local.
+    // The ThinLTO pass will eventually force-import its definition.
+ return SGV->getLinkage();
+
+ case GlobalValue::ExternalWeakLinkage:
+ // External weak doesn't apply to definitions, must be a declaration.
+ assert(!doImportAsDefinition(SGV));
+ // Linkage stays external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::CommonLinkage:
+ // Linkage stays common on definitions.
+ // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage();
+ }
+
+ llvm_unreachable("unknown linkage type");
+}
+
+void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+
+ ValueInfo VI;
+ if (GV.hasName()) {
+ VI = ImportIndex.getValueInfo(GV.getGUID());
+ // Set synthetic function entry counts.
+ if (VI && ImportIndex.hasSyntheticEntryCounts()) {
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ if (!F->isDeclaration()) {
+ for (auto &S : VI.getSummaryList()) {
+ auto *FS = cast<FunctionSummary>(S->getBaseObject());
+ if (FS->modulePath() == M.getModuleIdentifier()) {
+ F->setEntryCount(Function::ProfileCount(FS->entryCount(),
+ Function::PCT_Synthetic));
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // We should always have a ValueInfo (i.e. GV in index) for definitions when
+ // we are exporting, and also when importing that value.
+ assert(VI || GV.isDeclaration() ||
+ (isPerformingImport() && !doImportAsDefinition(&GV)));
+
+ // Mark read/write-only variables which can be imported with specific
+ // attribute. We can't internalize them now because IRMover will fail
+ // to link variable definitions to their external declarations during
+ // ThinLTO import. We'll internalize read-only variables later, after
+ // import is finished. See internalizeGVsAfterImport.
+ //
+ // If global value dead stripping is not enabled in summary then
+ // propagateConstants hasn't been run. We can't internalize GV
+ // in such case.
+ if (!GV.isDeclaration() && VI && ImportIndex.withAttributePropagation()) {
+ if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ // We can have more than one local with the same GUID, in the case of
+ // same-named locals in different but same-named source files that were
+ // compiled in their respective directories (so the source file name
+ // and resulting GUID is the same). Find the one in this module.
+ // Handle the case where there is no summary found in this module. That
+ // can happen in the distributed ThinLTO backend, because the index only
+ // contains summaries from the source modules if they are being imported.
+ // We might have a non-null VI and get here even in that case if the name
+ // matches one in this module (e.g. weak or appending linkage).
+ auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
+ ImportIndex.findSummaryInModule(VI, M.getModuleIdentifier()));
+ if (GVS &&
+ (ImportIndex.isReadOnly(GVS) || ImportIndex.isWriteOnly(GVS))) {
+ V->addAttribute("thinlto-internalize");
+        // Objects referenced by a writeonly GV initializer should not be
+        // promoted, because there is no read access to them on behalf of
+        // this writeonly GV. To avoid promotion we convert the GV
+        // initializer to 'zeroinitializer'. This effectively drops the
+        // references in the IR module (not in the combined index), so we can
+        // ignore them when computing the import. We do not export references
+        // of a writeonly object. See computeImportForReferencedGlobals.
+ if (ImportIndex.isWriteOnly(GVS))
+ V->setInitializer(Constant::getNullValue(V->getValueType()));
+ }
+ }
+ }
+
+ if (GV.hasLocalLinkage() && shouldPromoteLocalToGlobal(&GV, VI)) {
+ // Save the original name string before we rename GV below.
+ auto Name = GV.getName().str();
+ GV.setName(getPromotedName(&GV));
+ GV.setLinkage(getLinkage(&GV, /* DoPromote */ true));
+ assert(!GV.hasLocalLinkage());
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+
+ // If we are renaming a COMDAT leader, ensure that we record the COMDAT
+ // for later renaming as well. This is required for COFF.
+ if (const auto *C = GV.getComdat())
+ if (C->getName() == Name)
+ RenamedComdats.try_emplace(C, M.getOrInsertComdat(GV.getName()));
+ } else
+ GV.setLinkage(getLinkage(&GV, /* DoPromote */ false));
+
+ // When ClearDSOLocalOnDeclarations is true, clear dso_local if GV is
+ // converted to a declaration, to disable direct access. Don't do this if GV
+ // is implicitly dso_local due to a non-default visibility.
+ if (ClearDSOLocalOnDeclarations && GV.isDeclarationForLinker() &&
+ !GV.isImplicitDSOLocal()) {
+ GV.setDSOLocal(false);
+ } else if (VI && VI.isDSOLocal()) {
+ // If all summaries are dso_local, symbol gets resolved to a known local
+ // definition.
+ GV.setDSOLocal(true);
+ if (GV.hasDLLImportStorageClass())
+ GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
+ }
+
+ // Remove functions imported as available externally defs from comdats,
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
+ // The IRMover should not have placed any imported declarations in
+ // a comdat, so the only declaration that should be in a comdat
+ // at this point would be a definition imported as available_externally.
+ assert(GO->hasAvailableExternallyLinkage() &&
+ "Expected comdat on definition (possibly available external)");
+ GO->setComdat(nullptr);
+ }
+}
+
+void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
+ for (GlobalVariable &GV : M.globals())
+ processGlobalForThinLTO(GV);
+ for (Function &SF : M)
+ processGlobalForThinLTO(SF);
+ for (GlobalAlias &GA : M.aliases())
+ processGlobalForThinLTO(GA);
+
+ // Replace any COMDATS that required renaming (because the COMDAT leader was
+ // promoted and renamed).
+ if (!RenamedComdats.empty())
+ for (auto &GO : M.global_objects())
+ if (auto *C = GO.getComdat()) {
+ auto Replacement = RenamedComdats.find(C);
+ if (Replacement != RenamedComdats.end())
+ GO.setComdat(Replacement->second);
+ }
+}
+
+bool FunctionImportGlobalProcessing::run() {
+ processGlobalsForThinLTO();
+ return false;
+}
+
+bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+ bool ClearDSOLocalOnDeclarations,
+ SetVector<GlobalValue *> *GlobalsToImport) {
+ FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
+ ClearDSOLocalOnDeclarations);
+ return ThinLTOProcessing.run();
+}
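Editor's aside on usage (not part of the diff): renameModuleForThinLTO() above
is the entry point a ThinLTO backend calls to promote and rename locals before
importing or exporting. A minimal hedged sketch; the Module and combined
summary index are assumed to come from the surrounding LTO driver:

    #include "llvm/Transforms/Utils/FunctionImportUtils.h"

    // Prepare a module for ThinLTO: promote locals that may be referenced
    // across module boundaries. Passing nullptr for GlobalsToImport means we
    // are not importing into this module, only preparing it for export.
    static void prepareForThinLTO(llvm::Module &M,
                                  const llvm::ModuleSummaryIndex &Index) {
      llvm::renameModuleForThinLTO(M, Index,
                                   /*ClearDSOLocalOnDeclarations=*/false,
                                   /*GlobalsToImport=*/nullptr);
    }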
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp
index 7220a86d3e..f782396be7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp
@@ -1,194 +1,194 @@
-//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include <algorithm>
-#include <cassert>
-
-using namespace llvm;
-
-/// Return the stronger of the two ordering. If the two orderings are acquire
-/// and release, then return AcquireRelease.
-///
-static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
- if ((X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release) ||
- (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release))
- return AtomicOrdering::AcquireRelease;
- return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y);
-}
-
-/// It is safe to destroy a constant iff it is only used by constants itself.
-/// Note that constants cannot be cyclic, so this test is pretty easy to
-/// implement recursively.
-///
-bool llvm::isSafeToDestroyConstant(const Constant *C) {
- if (isa<GlobalValue>(C))
- return false;
-
- if (isa<ConstantData>(C))
- return false;
-
- for (const User *U : C->users())
- if (const Constant *CU = dyn_cast<Constant>(U)) {
- if (!isSafeToDestroyConstant(CU))
- return false;
- } else
- return false;
- return true;
-}
-
-static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
- SmallPtrSetImpl<const Value *> &VisitedUsers) {
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
- if (GV->isExternallyInitialized())
- GS.StoredType = GlobalStatus::StoredOnce;
-
- for (const Use &U : V->uses()) {
- const User *UR = U.getUser();
- if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
- GS.HasNonInstructionUser = true;
-
- // If the result of the constantexpr isn't pointer type, then we won't
- // know to expect it in various places. Just reject early.
- if (!isa<PointerType>(CE->getType()))
- return true;
-
- // FIXME: Do we need to add constexpr selects to VisitedUsers?
- if (analyzeGlobalAux(CE, GS, VisitedUsers))
- return true;
- } else if (const Instruction *I = dyn_cast<Instruction>(UR)) {
- if (!GS.HasMultipleAccessingFunctions) {
- const Function *F = I->getParent()->getParent();
- if (!GS.AccessingFunction)
- GS.AccessingFunction = F;
- else if (GS.AccessingFunction != F)
- GS.HasMultipleAccessingFunctions = true;
- }
- if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
- GS.IsLoaded = true;
- // Don't hack on volatile loads.
- if (LI->isVolatile())
- return true;
- GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Don't allow a store OF the address, only stores TO the address.
- if (SI->getOperand(0) == V)
- return true;
-
- // Don't hack on volatile stores.
- if (SI->isVolatile())
- return true;
-
- GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
-
- // If this is a direct store to the global (i.e., the global is a scalar
- // value, not an aggregate), keep more specific information about
- // stores.
- if (GS.StoredType != GlobalStatus::Stored) {
- if (const GlobalVariable *GV =
- dyn_cast<GlobalVariable>(SI->getOperand(1))) {
- Value *StoredVal = SI->getOperand(0);
-
- if (Constant *C = dyn_cast<Constant>(StoredVal)) {
- if (C->isThreadDependent()) {
- // The stored value changes between threads; don't track it.
- return true;
- }
- }
-
- if (GV->hasInitializer() && StoredVal == GV->getInitializer()) {
- if (GS.StoredType < GlobalStatus::InitializerStored)
- GS.StoredType = GlobalStatus::InitializerStored;
- } else if (isa<LoadInst>(StoredVal) &&
- cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
- if (GS.StoredType < GlobalStatus::InitializerStored)
- GS.StoredType = GlobalStatus::InitializerStored;
- } else if (GS.StoredType < GlobalStatus::StoredOnce) {
- GS.StoredType = GlobalStatus::StoredOnce;
- GS.StoredOnceValue = StoredVal;
- } else if (GS.StoredType == GlobalStatus::StoredOnce &&
- GS.StoredOnceValue == StoredVal) {
- // noop.
- } else {
- GS.StoredType = GlobalStatus::Stored;
- }
- } else {
- GS.StoredType = GlobalStatus::Stored;
- }
- }
+//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+/// Return the stronger of the two orderings. If the two orderings are acquire
+/// and release, then return AcquireRelease.
+///
+static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+ if ((X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release) ||
+ (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release))
+ return AtomicOrdering::AcquireRelease;
+ return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y);
+}
+
+/// It is safe to destroy a constant iff it is itself only used by constants.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool llvm::isSafeToDestroyConstant(const Constant *C) {
+ if (isa<GlobalValue>(C))
+ return false;
+
+ if (isa<ConstantData>(C))
+ return false;
+
+ for (const User *U : C->users())
+ if (const Constant *CU = dyn_cast<Constant>(U)) {
+ if (!isSafeToDestroyConstant(CU))
+ return false;
+ } else
+ return false;
+ return true;
+}
+
+static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
+ SmallPtrSetImpl<const Value *> &VisitedUsers) {
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->isExternallyInitialized())
+ GS.StoredType = GlobalStatus::StoredOnce;
+
+ for (const Use &U : V->uses()) {
+ const User *UR = U.getUser();
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
+ GS.HasNonInstructionUser = true;
+
+ // If the result of the constantexpr isn't pointer type, then we won't
+ // know to expect it in various places. Just reject early.
+ if (!isa<PointerType>(CE->getType()))
+ return true;
+
+ // FIXME: Do we need to add constexpr selects to VisitedUsers?
+ if (analyzeGlobalAux(CE, GS, VisitedUsers))
+ return true;
+ } else if (const Instruction *I = dyn_cast<Instruction>(UR)) {
+ if (!GS.HasMultipleAccessingFunctions) {
+ const Function *F = I->getParent()->getParent();
+ if (!GS.AccessingFunction)
+ GS.AccessingFunction = F;
+ else if (GS.AccessingFunction != F)
+ GS.HasMultipleAccessingFunctions = true;
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ GS.IsLoaded = true;
+ // Don't hack on volatile loads.
+ if (LI->isVolatile())
+ return true;
+ GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Don't allow a store OF the address, only stores TO the address.
+ if (SI->getOperand(0) == V)
+ return true;
+
+ // Don't hack on volatile stores.
+ if (SI->isVolatile())
+ return true;
+
+ GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
+
+ // If this is a direct store to the global (i.e., the global is a scalar
+ // value, not an aggregate), keep more specific information about
+ // stores.
+ if (GS.StoredType != GlobalStatus::Stored) {
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(SI->getOperand(1))) {
+ Value *StoredVal = SI->getOperand(0);
+
+ if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+ if (C->isThreadDependent()) {
+ // The stored value changes between threads; don't track it.
+ return true;
+ }
+ }
+
+ if (GV->hasInitializer() && StoredVal == GV->getInitializer()) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (isa<LoadInst>(StoredVal) &&
+ cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (GS.StoredType < GlobalStatus::StoredOnce) {
+ GS.StoredType = GlobalStatus::StoredOnce;
+ GS.StoredOnceValue = StoredVal;
+ } else if (GS.StoredType == GlobalStatus::StoredOnce &&
+ GS.StoredOnceValue == StoredVal) {
+ // noop.
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ }
} else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) ||
isa<AddrSpaceCastInst>(I)) {
- // Skip over bitcasts and GEPs; we don't care about the type or offset
- // of the pointer.
- if (analyzeGlobalAux(I, GS, VisitedUsers))
- return true;
- } else if (isa<SelectInst>(I) || isa<PHINode>(I)) {
- // Look through selects and PHIs to find if the pointer is
- // conditionally accessed. Make sure we only visit an instruction
- // once; otherwise, we can get infinite recursion or exponential
- // compile time.
- if (VisitedUsers.insert(I).second)
- if (analyzeGlobalAux(I, GS, VisitedUsers))
- return true;
- } else if (isa<CmpInst>(I)) {
- GS.IsCompared = true;
- } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
- if (MTI->isVolatile())
- return true;
- if (MTI->getArgOperand(0) == V)
- GS.StoredType = GlobalStatus::Stored;
- if (MTI->getArgOperand(1) == V)
- GS.IsLoaded = true;
- } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
- assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
- if (MSI->isVolatile())
- return true;
- GS.StoredType = GlobalStatus::Stored;
- } else if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (!CB->isCallee(&U))
- return true;
- GS.IsLoaded = true;
- } else {
- return true; // Any other non-load instruction might take address!
- }
- } else if (const Constant *C = dyn_cast<Constant>(UR)) {
- GS.HasNonInstructionUser = true;
- // We might have a dead and dangling constant hanging off of here.
- if (!isSafeToDestroyConstant(C))
- return true;
- } else {
- GS.HasNonInstructionUser = true;
- // Otherwise must be some other user.
- return true;
- }
- }
-
- return false;
-}
-
-GlobalStatus::GlobalStatus() = default;
-
-bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
- SmallPtrSet<const Value *, 16> VisitedUsers;
- return analyzeGlobalAux(V, GS, VisitedUsers);
-}
+ // Skip over bitcasts and GEPs; we don't care about the type or offset
+ // of the pointer.
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
+ return true;
+ } else if (isa<SelectInst>(I) || isa<PHINode>(I)) {
+ // Look through selects and PHIs to find if the pointer is
+ // conditionally accessed. Make sure we only visit an instruction
+ // once; otherwise, we can get infinite recursion or exponential
+ // compile time.
+ if (VisitedUsers.insert(I).second)
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
+ return true;
+ } else if (isa<CmpInst>(I)) {
+ GS.IsCompared = true;
+ } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+ if (MTI->isVolatile())
+ return true;
+ if (MTI->getArgOperand(0) == V)
+ GS.StoredType = GlobalStatus::Stored;
+ if (MTI->getArgOperand(1) == V)
+ GS.IsLoaded = true;
+ } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+ assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+ if (MSI->isVolatile())
+ return true;
+ GS.StoredType = GlobalStatus::Stored;
+ } else if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (!CB->isCallee(&U))
+ return true;
+ GS.IsLoaded = true;
+ } else {
+ return true; // Any other non-load instruction might take address!
+ }
+ } else if (const Constant *C = dyn_cast<Constant>(UR)) {
+ GS.HasNonInstructionUser = true;
+ // We might have a dead and dangling constant hanging off of here.
+ if (!isSafeToDestroyConstant(C))
+ return true;
+ } else {
+ GS.HasNonInstructionUser = true;
+ // Otherwise must be some other user.
+ return true;
+ }
+ }
+
+ return false;
+}
+
+GlobalStatus::GlobalStatus() = default;
+
+bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
+ SmallPtrSet<const Value *, 16> VisitedUsers;
+ return analyzeGlobalAux(V, GS, VisitedUsers);
+}
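Editor's aside on usage (not part of the diff): analyzeGlobal() returns true
when some user defeats the analysis, and false when GS has been filled in. A
minimal sketch of the GlobalOpt-style query, assuming the GlobalStatus API
from llvm/Transforms/Utils/GlobalStatus.h:

    #include "llvm/Transforms/Utils/GlobalStatus.h"
    using namespace llvm;

    // True iff the analysis succeeded and the global is stored to exactly
    // once beyond its initializer.
    static bool isStoredExactlyOnce(const GlobalVariable &GV) {
      GlobalStatus GS;
      if (GlobalStatus::analyzeGlobal(&GV, GS))
        return false; // address escapes or some user we cannot reason about
      return GS.StoredType == GlobalStatus::StoredOnce;
    }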
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp
index 13f22440bb..4dbcbf80d3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp
@@ -1,126 +1,126 @@
-//===-- GuardUtils.cpp - Utils for work with guards -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Utils that are used to perform transformations related to guards and their
-// conditions.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<uint32_t> PredicatePassBranchWeight(
- "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20),
- cl::desc("The probability of a guard failing is assumed to be the "
- "reciprocal of this value (default = 1 << 20)"));
-
-void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
- CallInst *Guard, bool UseWC) {
- OperandBundleDef DeoptOB(*Guard->getOperandBundle(LLVMContext::OB_deopt));
+//===-- GuardUtils.cpp - Utils for work with guards -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Utils that are used to perform transformations related to guards and their
+// conditions.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<uint32_t> PredicatePassBranchWeight(
+ "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20),
+ cl::desc("The probability of a guard failing is assumed to be the "
+ "reciprocal of this value (default = 1 << 20)"));
+
+void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
+ CallInst *Guard, bool UseWC) {
+ OperandBundleDef DeoptOB(*Guard->getOperandBundle(LLVMContext::OB_deopt));
SmallVector<Value *, 4> Args(drop_begin(Guard->args()));
-
- auto *CheckBB = Guard->getParent();
- auto *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(Guard->getArgOperand(0), Guard, true);
-
- auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
-
- // SplitBlockAndInsertIfThen inserts control flow that branches to
- // DeoptBlockTerm if the condition is true. We want the opposite.
- CheckBI->swapSuccessors();
-
- CheckBI->getSuccessor(0)->setName("guarded");
- CheckBI->getSuccessor(1)->setName("deopt");
-
- if (auto *MD = Guard->getMetadata(LLVMContext::MD_make_implicit))
- CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD);
-
- MDBuilder MDB(Guard->getContext());
- CheckBI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(PredicatePassBranchWeight, 1));
-
- IRBuilder<> B(DeoptBlockTerm);
- auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, "");
-
- if (DeoptIntrinsic->getReturnType()->isVoidTy()) {
- B.CreateRetVoid();
- } else {
- DeoptCall->setName("deoptcall");
- B.CreateRet(DeoptCall);
- }
-
- DeoptCall->setCallingConv(Guard->getCallingConv());
- DeoptBlockTerm->eraseFromParent();
-
- if (UseWC) {
- // We want the guard to be expressed as explicit control flow, but still be
- // widenable. For that, we add Widenable Condition intrinsic call to the
- // guard's condition.
- IRBuilder<> B(CheckBI);
- auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition,
- {}, {}, nullptr, "widenable_cond");
- CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC,
- "exiplicit_guard_cond"));
- assert(isWidenableBranch(CheckBI) && "sanity check");
- }
-}
-
-
-void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) {
- assert(isWidenableBranch(WidenableBR) && "precondition");
-
-  // The temptingly trivial option is to produce something like this:
- // br (and oldcond, newcond) where oldcond is assumed to contain a widenable
- // condition, but that doesn't match the pattern parseWidenableBranch expects
- // so we have to be more sophisticated.
-
- Use *C, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
- if (!C) {
- // br (wc()), ... form
- IRBuilder<> B(WidenableBR);
- WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
- } else {
- // br (wc & C), ... form
- IRBuilder<> B(WidenableBR);
- C->set(B.CreateAnd(NewCond, C->get()));
- Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
- // Condition is only guaranteed to dominate branch
- WCAnd->moveBefore(WidenableBR);
- }
- assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
-}
-
-void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) {
- assert(isWidenableBranch(WidenableBR) && "precondition");
-
- Use *C, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
- if (!C) {
- // br (wc()), ... form
- IRBuilder<> B(WidenableBR);
- WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
- } else {
- // br (wc & C), ... form
- Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
- // Condition is only guaranteed to dominate branch
- WCAnd->moveBefore(WidenableBR);
- C->set(NewCond);
- }
- assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
-}
+
+ auto *CheckBB = Guard->getParent();
+ auto *DeoptBlockTerm =
+ SplitBlockAndInsertIfThen(Guard->getArgOperand(0), Guard, true);
+
+ auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+
+ // SplitBlockAndInsertIfThen inserts control flow that branches to
+ // DeoptBlockTerm if the condition is true. We want the opposite.
+ CheckBI->swapSuccessors();
+
+ CheckBI->getSuccessor(0)->setName("guarded");
+ CheckBI->getSuccessor(1)->setName("deopt");
+
+ if (auto *MD = Guard->getMetadata(LLVMContext::MD_make_implicit))
+ CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD);
+
+ MDBuilder MDB(Guard->getContext());
+ CheckBI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(PredicatePassBranchWeight, 1));
+
+ IRBuilder<> B(DeoptBlockTerm);
+ auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, "");
+
+ if (DeoptIntrinsic->getReturnType()->isVoidTy()) {
+ B.CreateRetVoid();
+ } else {
+ DeoptCall->setName("deoptcall");
+ B.CreateRet(DeoptCall);
+ }
+
+ DeoptCall->setCallingConv(Guard->getCallingConv());
+ DeoptBlockTerm->eraseFromParent();
+
+ if (UseWC) {
+ // We want the guard to be expressed as explicit control flow, but still be
+ // widenable. For that, we add Widenable Condition intrinsic call to the
+ // guard's condition.
+ IRBuilder<> B(CheckBI);
+ auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition,
+ {}, {}, nullptr, "widenable_cond");
+ CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC,
+ "exiplicit_guard_cond"));
+ assert(isWidenableBranch(CheckBI) && "sanity check");
+ }
+}
+
+
+void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) {
+ assert(isWidenableBranch(WidenableBR) && "precondition");
+
+  // The temptingly trivial option is to produce something like this:
+ // br (and oldcond, newcond) where oldcond is assumed to contain a widenable
+ // condition, but that doesn't match the pattern parseWidenableBranch expects
+ // so we have to be more sophisticated.
+
+ Use *C, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
+ if (!C) {
+ // br (wc()), ... form
+ IRBuilder<> B(WidenableBR);
+ WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
+ } else {
+ // br (wc & C), ... form
+ IRBuilder<> B(WidenableBR);
+ C->set(B.CreateAnd(NewCond, C->get()));
+ Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
+ // Condition is only guaranteed to dominate branch
+ WCAnd->moveBefore(WidenableBR);
+ }
+ assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
+}
+
+void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) {
+ assert(isWidenableBranch(WidenableBR) && "precondition");
+
+ Use *C, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
+ if (!C) {
+ // br (wc()), ... form
+ IRBuilder<> B(WidenableBR);
+ WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
+ } else {
+ // br (wc & C), ... form
+ Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
+ // Condition is only guaranteed to dominate branch
+ WCAnd->moveBefore(WidenableBR);
+ C->set(NewCond);
+ }
+ assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
+}
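
A minimal usage sketch of the branch-widening utility above, assuming the
declarations from llvm/Analysis/GuardUtils.h and
llvm/Transforms/Utils/GuardUtils.h; the wrapper and the ExtraCheck value are
hypothetical.

#include "llvm/Analysis/GuardUtils.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/GuardUtils.h"

using namespace llvm;

// Hypothetical caller: fold one more runtime check into an existing
// widenable branch so its false (deopt) successor is taken when the check
// fails.
static void addCheckToWidenableBranch(BranchInst *BI, Value *ExtraCheck) {
  if (!isWidenableBranch(BI))
    return; // Only branches already using llvm.experimental.widenable.condition.
  // ANDs ExtraCheck into the condition while preserving the shape that
  // parseWidenableBranch recognizes.
  widenWidenableBranch(BI, ExtraCheck);
}
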
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp
index 2626ebb942..a2b72e4e7f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -1,166 +1,166 @@
-//===- InjectTLIMappings.cpp - TLI to VFABI attribute injection ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Populates the VFABI attribute with the scalar-to-vector mappings
-// from the TargetLibraryInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/InjectTLIMappings.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+//===- InjectTLIMappings.cpp - TLI to VFABI attribute injection ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Populates the VFABI attribute with the scalar-to-vector mappings
+// from the TargetLibraryInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inject-tli-mappings"
-
-STATISTIC(NumCallInjected,
- "Number of calls in which the mappings have been injected.");
-
-STATISTIC(NumVFDeclAdded,
- "Number of function declarations that have been added.");
-STATISTIC(NumCompUsedAdded,
- "Number of `@llvm.compiler.used` operands that have been added.");
-
-/// A helper function that adds the vector function declaration that
-/// vectorizes the CallInst CI with a vectorization factor of VF
-/// lanes. The TLI assumes that all parameters and the return type of
-/// CI (other than void) need to be widened to a VectorType of VF
-/// lanes.
-static void addVariantDeclaration(CallInst &CI, const unsigned VF,
- const StringRef VFName) {
- Module *M = CI.getModule();
-
- // Add function declaration.
- Type *RetTy = ToVectorTy(CI.getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI.arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
- assert(!CI.getFunctionType()->isVarArg() &&
- "VarArg functions are not supported.");
- FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
- Function *VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFName, M);
- VectorF->copyAttributesFrom(CI.getCalledFunction());
- ++NumVFDeclAdded;
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName
- << "` of type " << *(VectorF->getType()) << "\n");
-
- // Make function declaration (without a body) "sticky" in the IR by
- // listing it in the @llvm.compiler.used intrinsic.
- assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` "
- "only on declarations.");
- appendToCompilerUsed(*M, {VectorF});
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName
- << "` to `@llvm.compiler.used`.\n");
- ++NumCompUsedAdded;
-}
-
-static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
- // This is needed to make sure we don't query the TLI for calls to
- // bitcast of function pointers, like `%call = call i32 (i32*, ...)
- // bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i)`,
- // as such calls make the `isFunctionVectorizable` raise an
- // exception.
- if (CI.isNoBuiltin() || !CI.getCalledFunction())
- return;
-
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inject-tli-mappings"
+
+STATISTIC(NumCallInjected,
+ "Number of calls in which the mappings have been injected.");
+
+STATISTIC(NumVFDeclAdded,
+ "Number of function declarations that have been added.");
+STATISTIC(NumCompUsedAdded,
+ "Number of `@llvm.compiler.used` operands that have been added.");
+
+/// A helper function that adds the vector function declaration that
+/// vectorizes the CallInst CI with a vectorization factor of VF
+/// lanes. The TLI assumes that all parameters and the return type of
+/// CI (other than void) need to be widened to a VectorType of VF
+/// lanes.
+static void addVariantDeclaration(CallInst &CI, const unsigned VF,
+ const StringRef VFName) {
+ Module *M = CI.getModule();
+
+ // Add function declaration.
+ Type *RetTy = ToVectorTy(CI.getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI.arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+ assert(!CI.getFunctionType()->isVarArg() &&
+ "VarArg functions are not supported.");
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
+ Function *VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFName, M);
+ VectorF->copyAttributesFrom(CI.getCalledFunction());
+ ++NumVFDeclAdded;
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName
+ << "` of type " << *(VectorF->getType()) << "\n");
+
+ // Make function declaration (without a body) "sticky" in the IR by
+ // listing it in the @llvm.compiler.used intrinsic.
+ assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` "
+ "only on declarations.");
+ appendToCompilerUsed(*M, {VectorF});
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName
+ << "` to `@llvm.compiler.used`.\n");
+ ++NumCompUsedAdded;
+}
+
+static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
+ // This is needed to make sure we don't query the TLI for calls to
+ // bitcast of function pointers, like `%call = call i32 (i32*, ...)
+ // bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i)`,
+ // as such calls make the `isFunctionVectorizable` raise an
+ // exception.
+ if (CI.isNoBuiltin() || !CI.getCalledFunction())
+ return;
+
StringRef ScalarName = CI.getCalledFunction()->getName();
- // Nothing to be done if the TLI thinks the function is not
- // vectorizable.
- if (!TLI.isFunctionVectorizable(ScalarName))
- return;
- SmallVector<std::string, 8> Mappings;
- VFABI::getVectorVariantNames(CI, Mappings);
- Module *M = CI.getModule();
- const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
- Mappings.end());
- // All VFs in the TLI are powers of 2.
- for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF;
- VF *= 2) {
- const std::string TLIName =
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
- if (!TLIName.empty()) {
- std::string MangledName = VFABI::mangleTLIVectorName(
- TLIName, ScalarName, CI.getNumArgOperands(), VF);
- if (!OriginalSetOfMappings.count(MangledName)) {
- Mappings.push_back(MangledName);
- ++NumCallInjected;
- }
- Function *VariantF = M->getFunction(TLIName);
- if (!VariantF)
- addVariantDeclaration(CI, VF, TLIName);
- }
- }
-
- VFABI::setVectorVariantNames(&CI, Mappings);
-}
-
-static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
- for (auto &I : instructions(F))
- if (auto CI = dyn_cast<CallInst>(&I))
- addMappingsFromTLI(TLI, *CI);
- // Even if the pass adds IR attributes, the analyses are preserved.
- return false;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// New pass manager implementation.
-////////////////////////////////////////////////////////////////////////////////
-PreservedAnalyses InjectTLIMappings::run(Function &F,
- FunctionAnalysisManager &AM) {
- const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- runImpl(TLI, F);
- // Even if the pass adds IR attributes, the analyses are preserved.
- return PreservedAnalyses::all();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Legacy PM Implementation.
-////////////////////////////////////////////////////////////////////////////////
-bool InjectTLIMappingsLegacy::runOnFunction(Function &F) {
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- return runImpl(TLI, F);
-}
-
-void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<LoopAccessLegacyAnalysis>();
- AU.addPreserved<DemandedBitsWrapperPass>();
- AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Legacy Pass manager initialization
-////////////////////////////////////////////////////////////////////////////////
-char InjectTLIMappingsLegacy::ID = 0;
-
-INITIALIZE_PASS_BEGIN(InjectTLIMappingsLegacy, DEBUG_TYPE,
- "Inject TLI Mappings", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(InjectTLIMappingsLegacy, DEBUG_TYPE, "Inject TLI Mappings",
- false, false)
-
-FunctionPass *llvm::createInjectTLIMappingsLegacyPass() {
- return new InjectTLIMappingsLegacy();
-}
+ // Nothing to be done if the TLI thinks the function is not
+ // vectorizable.
+ if (!TLI.isFunctionVectorizable(ScalarName))
+ return;
+ SmallVector<std::string, 8> Mappings;
+ VFABI::getVectorVariantNames(CI, Mappings);
+ Module *M = CI.getModule();
+ const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
+ Mappings.end());
+ // All VFs in the TLI are powers of 2.
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF;
+ VF *= 2) {
+ const std::string TLIName =
+ std::string(TLI.getVectorizedFunction(ScalarName, VF));
+ if (!TLIName.empty()) {
+ std::string MangledName = VFABI::mangleTLIVectorName(
+ TLIName, ScalarName, CI.getNumArgOperands(), VF);
+ if (!OriginalSetOfMappings.count(MangledName)) {
+ Mappings.push_back(MangledName);
+ ++NumCallInjected;
+ }
+ Function *VariantF = M->getFunction(TLIName);
+ if (!VariantF)
+ addVariantDeclaration(CI, VF, TLIName);
+ }
+ }
+
+ VFABI::setVectorVariantNames(&CI, Mappings);
+}
+
+static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
+ for (auto &I : instructions(F))
+ if (auto CI = dyn_cast<CallInst>(&I))
+ addMappingsFromTLI(TLI, *CI);
+ // Even if the pass adds IR attributes, the analyses are preserved.
+ return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// New pass manager implementation.
+////////////////////////////////////////////////////////////////////////////////
+PreservedAnalyses InjectTLIMappings::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ runImpl(TLI, F);
+ // Even if the pass adds IR attributes, the analyses are preserved.
+ return PreservedAnalyses::all();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Legacy PM Implementation.
+////////////////////////////////////////////////////////////////////////////////
+bool InjectTLIMappingsLegacy::runOnFunction(Function &F) {
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return runImpl(TLI, F);
+}
+
+void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<LoopAccessLegacyAnalysis>();
+ AU.addPreserved<DemandedBitsWrapperPass>();
+ AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Legacy Pass manager initialization
+////////////////////////////////////////////////////////////////////////////////
+char InjectTLIMappingsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(InjectTLIMappingsLegacy, DEBUG_TYPE,
+ "Inject TLI Mappings", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(InjectTLIMappingsLegacy, DEBUG_TYPE, "Inject TLI Mappings",
+ false, false)
+
+FunctionPass *llvm::createInjectTLIMappingsLegacyPass() {
+ return new InjectTLIMappingsLegacy();
+}
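
A minimal driver sketch for the new-PM pass defined above. The boilerplate is
standard pass-manager setup; the SVML vector library is only an example, and
the helper function itself is hypothetical. Without some vector library
configured in the TargetLibraryInfo, isFunctionVectorizable() never returns
true and the pass is effectively a no-op.

#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"

using namespace llvm;

// Hypothetical helper: attach VFABI variant names to the calls in F.
static void injectMappings(Function &F) {
  // Teach the TLI about a vector math library so calls such as sinf/expf
  // are reported as vectorizable.
  TargetLibraryInfoImpl TLII(Triple(F.getParent()->getTargetTriple()));
  TLII.addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML);

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the customized TLI first; the default registration below does
  // not overwrite an analysis that is already present.
  FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); });

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(InjectTLIMappings());
  FPM.run(F, FAM);
}
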
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp
index 1c5604f8a3..fb271a2118 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp
@@ -1,782 +1,782 @@
-//===- InlineFunction.cpp - Code to perform function inlining -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements inlining of a function into a call site, resolving
-// parameters and the return value as appropriate.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <limits>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using ProfileCount = Function::ProfileCount;
-
-static cl::opt<bool>
-EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
- cl::Hidden,
- cl::desc("Convert noalias attributes to metadata during inlining."));
-
+//===- InlineFunction.cpp - Code to perform function inlining -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inlining of a function into a call site, resolving
+// parameters and the return value as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using ProfileCount = Function::ProfileCount;
+
+static cl::opt<bool>
+EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
+ cl::Hidden,
+ cl::desc("Convert noalias attributes to metadata during inlining."));
+
static cl::opt<bool>
UseNoAliasIntrinsic("use-noalias-intrinsic-during-inlining", cl::Hidden,
cl::ZeroOrMore, cl::init(true),
cl::desc("Use the llvm.experimental.noalias.scope.decl "
"intrinsic during inlining."));
-// Disabled by default, because the added alignment assumptions may increase
-// compile-time and block optimizations. This option is not suitable for use
-// with frontends that emit comprehensive parameter alignment annotations.
-static cl::opt<bool>
-PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",
- cl::init(false), cl::Hidden,
- cl::desc("Convert align attributes to assumptions during inlining."));
-
-static cl::opt<bool> UpdateReturnAttributes(
- "update-return-attrs", cl::init(true), cl::Hidden,
- cl::desc("Update return attributes on calls within inlined body"));
-
-static cl::opt<unsigned> InlinerAttributeWindow(
- "max-inst-checked-for-throw-during-inlining", cl::Hidden,
- cl::desc("the maximum number of instructions analyzed for may throw during "
- "attribute inference in inlined body"),
- cl::init(4));
-
-namespace {
-
- /// A class for recording information about inlining a landing pad.
- class LandingPadInliningInfo {
- /// Destination of the invoke's unwind.
- BasicBlock *OuterResumeDest;
-
- /// Destination for the callee's resume.
- BasicBlock *InnerResumeDest = nullptr;
-
- /// LandingPadInst associated with the invoke.
- LandingPadInst *CallerLPad = nullptr;
-
- /// PHI for EH values from landingpad insts.
- PHINode *InnerEHValuesPHI = nullptr;
-
- SmallVector<Value*, 8> UnwindDestPHIValues;
-
- public:
- LandingPadInliningInfo(InvokeInst *II)
- : OuterResumeDest(II->getUnwindDest()) {
- // If there are PHI nodes in the unwind destination block, we need to keep
- // track of which values came into them from the invoke before removing
- // the edge from this block.
- BasicBlock *InvokeBB = II->getParent();
- BasicBlock::iterator I = OuterResumeDest->begin();
- for (; isa<PHINode>(I); ++I) {
- // Save the value to use for this edge.
- PHINode *PHI = cast<PHINode>(I);
- UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
- }
-
- CallerLPad = cast<LandingPadInst>(I);
- }
-
- /// The outer unwind destination is the target of
- /// unwind edges introduced for calls within the inlined function.
- BasicBlock *getOuterResumeDest() const {
- return OuterResumeDest;
- }
-
- BasicBlock *getInnerResumeDest();
-
- LandingPadInst *getLandingPadInst() const { return CallerLPad; }
-
- /// Forward the 'resume' instruction to the caller's landing pad block.
- /// When the landing pad block has only one predecessor, this is
- /// a simple branch. When there is more than one predecessor, we need to
- /// split the landing pad block after the landingpad instruction and jump
- /// to there.
- void forwardResume(ResumeInst *RI,
- SmallPtrSetImpl<LandingPadInst*> &InlinedLPads);
-
- /// Add incoming-PHI values to the unwind destination block for the given
- /// basic block, using the values for the original invoke's source block.
- void addIncomingPHIValuesFor(BasicBlock *BB) const {
- addIncomingPHIValuesForInto(BB, OuterResumeDest);
- }
-
- void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
- BasicBlock::iterator I = dest->begin();
- for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
- PHINode *phi = cast<PHINode>(I);
- phi->addIncoming(UnwindDestPHIValues[i], src);
- }
- }
- };
-
-} // end anonymous namespace
-
-/// Get or create a target for the branch from ResumeInsts.
-BasicBlock *LandingPadInliningInfo::getInnerResumeDest() {
- if (InnerResumeDest) return InnerResumeDest;
-
- // Split the landing pad.
- BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator();
- InnerResumeDest =
- OuterResumeDest->splitBasicBlock(SplitPoint,
- OuterResumeDest->getName() + ".body");
-
- // The number of incoming edges we expect to the inner landing pad.
- const unsigned PHICapacity = 2;
-
- // Create corresponding new PHIs for all the PHIs in the outer landing pad.
- Instruction *InsertPoint = &InnerResumeDest->front();
- BasicBlock::iterator I = OuterResumeDest->begin();
- for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
- PHINode *OuterPHI = cast<PHINode>(I);
- PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity,
- OuterPHI->getName() + ".lpad-body",
- InsertPoint);
- OuterPHI->replaceAllUsesWith(InnerPHI);
- InnerPHI->addIncoming(OuterPHI, OuterResumeDest);
- }
-
- // Create a PHI for the exception values.
- InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity,
- "eh.lpad-body", InsertPoint);
- CallerLPad->replaceAllUsesWith(InnerEHValuesPHI);
- InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest);
-
- // All done.
- return InnerResumeDest;
-}
-
-/// Forward the 'resume' instruction to the caller's landing pad block.
-/// When the landing pad block has only one predecessor, this is a simple
-/// branch. When there is more than one predecessor, we need to split the
-/// landing pad block after the landingpad instruction and jump to there.
-void LandingPadInliningInfo::forwardResume(
- ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) {
- BasicBlock *Dest = getInnerResumeDest();
- BasicBlock *Src = RI->getParent();
-
- BranchInst::Create(Dest, Src);
-
- // Update the PHIs in the destination. They were inserted in an order which
- // makes this work.
- addIncomingPHIValuesForInto(Src, Dest);
-
- InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
- RI->eraseFromParent();
-}
-
-/// Helper for getUnwindDestToken/getUnwindDestTokenHelper.
-static Value *getParentPad(Value *EHPad) {
- if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad))
- return FPI->getParentPad();
- return cast<CatchSwitchInst>(EHPad)->getParentPad();
-}
-
-using UnwindDestMemoTy = DenseMap<Instruction *, Value *>;
-
-/// Helper for getUnwindDestToken that does the descendant-ward part of
-/// the search.
-static Value *getUnwindDestTokenHelper(Instruction *EHPad,
- UnwindDestMemoTy &MemoMap) {
- SmallVector<Instruction *, 8> Worklist(1, EHPad);
-
- while (!Worklist.empty()) {
- Instruction *CurrentPad = Worklist.pop_back_val();
- // We only put pads on the worklist that aren't in the MemoMap. When
- // we find an unwind dest for a pad we may update its ancestors, but
- // the queue only ever contains uncles/great-uncles/etc. of CurrentPad,
- // so they should never get updated while queued on the worklist.
- assert(!MemoMap.count(CurrentPad));
- Value *UnwindDestToken = nullptr;
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(CurrentPad)) {
- if (CatchSwitch->hasUnwindDest()) {
- UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI();
- } else {
- // Catchswitch doesn't have a 'nounwind' variant, and one might be
- // annotated as "unwinds to caller" when really it's nounwind (see
- // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the
- // parent's unwind dest from this. We can check its catchpads'
- // descendants, since they might include a cleanuppad with an
- // "unwinds to caller" cleanupret, which can be trusted.
- for (auto HI = CatchSwitch->handler_begin(),
- HE = CatchSwitch->handler_end();
- HI != HE && !UnwindDestToken; ++HI) {
- BasicBlock *HandlerBlock = *HI;
- auto *CatchPad = cast<CatchPadInst>(HandlerBlock->getFirstNonPHI());
- for (User *Child : CatchPad->users()) {
- // Intentionally ignore invokes here -- since the catchswitch is
- // marked "unwind to caller", it would be a verifier error if it
- // contained an invoke which unwinds out of it, so any invoke we'd
- // encounter must unwind to some child of the catch.
- if (!isa<CleanupPadInst>(Child) && !isa<CatchSwitchInst>(Child))
- continue;
-
- Instruction *ChildPad = cast<Instruction>(Child);
- auto Memo = MemoMap.find(ChildPad);
- if (Memo == MemoMap.end()) {
- // Haven't figured out this child pad yet; queue it.
- Worklist.push_back(ChildPad);
- continue;
- }
- // We've already checked this child, but might have found that
- // it offers no proof either way.
- Value *ChildUnwindDestToken = Memo->second;
- if (!ChildUnwindDestToken)
- continue;
- // We already know the child's unwind dest, which can either
- // be ConstantTokenNone to indicate unwind to caller, or can
- // be another child of the catchpad. Only the former indicates
- // the unwind dest of the catchswitch.
- if (isa<ConstantTokenNone>(ChildUnwindDestToken)) {
- UnwindDestToken = ChildUnwindDestToken;
- break;
- }
- assert(getParentPad(ChildUnwindDestToken) == CatchPad);
- }
- }
- }
- } else {
- auto *CleanupPad = cast<CleanupPadInst>(CurrentPad);
- for (User *U : CleanupPad->users()) {
- if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
- if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest())
- UnwindDestToken = RetUnwindDest->getFirstNonPHI();
- else
- UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext());
- break;
- }
- Value *ChildUnwindDestToken;
- if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
- ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI();
- } else if (isa<CleanupPadInst>(U) || isa<CatchSwitchInst>(U)) {
- Instruction *ChildPad = cast<Instruction>(U);
- auto Memo = MemoMap.find(ChildPad);
- if (Memo == MemoMap.end()) {
- // Haven't resolved this child yet; queue it and keep searching.
- Worklist.push_back(ChildPad);
- continue;
- }
- // We've checked this child, but still need to ignore it if it
- // had no proof either way.
- ChildUnwindDestToken = Memo->second;
- if (!ChildUnwindDestToken)
- continue;
- } else {
- // Not a relevant user of the cleanuppad
- continue;
- }
- // In a well-formed program, the child/invoke must either unwind to
- // an(other) child of the cleanup, or exit the cleanup. In the
- // first case, continue searching.
- if (isa<Instruction>(ChildUnwindDestToken) &&
- getParentPad(ChildUnwindDestToken) == CleanupPad)
- continue;
- UnwindDestToken = ChildUnwindDestToken;
- break;
- }
- }
- // If we haven't found an unwind dest for CurrentPad, we may have queued its
- // children, so move on to the next in the worklist.
- if (!UnwindDestToken)
- continue;
-
- // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits
- // any ancestors of CurrentPad up to but not including UnwindDestToken's
- // parent pad. Record this in the memo map, and check to see if the
- // original EHPad being queried is one of the ones exited.
- Value *UnwindParent;
- if (auto *UnwindPad = dyn_cast<Instruction>(UnwindDestToken))
- UnwindParent = getParentPad(UnwindPad);
- else
- UnwindParent = nullptr;
- bool ExitedOriginalPad = false;
- for (Instruction *ExitedPad = CurrentPad;
- ExitedPad && ExitedPad != UnwindParent;
- ExitedPad = dyn_cast<Instruction>(getParentPad(ExitedPad))) {
- // Skip over catchpads since they just follow their catchswitches.
- if (isa<CatchPadInst>(ExitedPad))
- continue;
- MemoMap[ExitedPad] = UnwindDestToken;
- ExitedOriginalPad |= (ExitedPad == EHPad);
- }
-
- if (ExitedOriginalPad)
- return UnwindDestToken;
-
- // Continue the search.
- }
-
- // No definitive information is contained within this funclet.
- return nullptr;
-}
-
-/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad,
-/// return that pad instruction. If it unwinds to caller, return
-/// ConstantTokenNone. If it does not have a definitive unwind destination,
-/// return nullptr.
-///
-/// This routine gets invoked for calls in funclets in inlinees when inlining
-/// an invoke. Since many funclets don't have calls inside them, it's queried
-/// on-demand rather than building a map of pads to unwind dests up front.
-/// Determining a funclet's unwind dest may require recursively searching its
-/// descendants, and also ancestors and cousins if the descendants don't provide
-/// an answer. Since most funclets will have their unwind dest immediately
-/// available as the unwind dest of a catchswitch or cleanupret, this routine
-/// searches top-down from the given pad and then up. To avoid worst-case
-/// quadratic run-time given that approach, it uses a memo map to avoid
-/// re-processing funclet trees. The callers that rewrite the IR as they go
-/// take advantage of this, for correctness, by checking/forcing rewritten
-/// pads' entries to match the original callee view.
-static Value *getUnwindDestToken(Instruction *EHPad,
- UnwindDestMemoTy &MemoMap) {
- // Catchpads unwind to the same place as their catchswitch;
-  // redirect any queries on catchpads so the code below can
- // deal with just catchswitches and cleanuppads.
- if (auto *CPI = dyn_cast<CatchPadInst>(EHPad))
- EHPad = CPI->getCatchSwitch();
-
- // Check if we've already determined the unwind dest for this pad.
- auto Memo = MemoMap.find(EHPad);
- if (Memo != MemoMap.end())
- return Memo->second;
-
- // Search EHPad and, if necessary, its descendants.
- Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap);
- assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0));
- if (UnwindDestToken)
- return UnwindDestToken;
-
- // No information is available for this EHPad from itself or any of its
- // descendants. An unwind all the way out to a pad in the caller would
- // need also to agree with the unwind dest of the parent funclet, so
- // search up the chain to try to find a funclet with information. Put
- // null entries in the memo map to avoid re-processing as we go up.
- MemoMap[EHPad] = nullptr;
-#ifndef NDEBUG
- SmallPtrSet<Instruction *, 4> TempMemos;
- TempMemos.insert(EHPad);
-#endif
- Instruction *LastUselessPad = EHPad;
- Value *AncestorToken;
- for (AncestorToken = getParentPad(EHPad);
- auto *AncestorPad = dyn_cast<Instruction>(AncestorToken);
- AncestorToken = getParentPad(AncestorToken)) {
- // Skip over catchpads since they just follow their catchswitches.
- if (isa<CatchPadInst>(AncestorPad))
- continue;
- // If the MemoMap had an entry mapping AncestorPad to nullptr, since we
- // haven't yet called getUnwindDestTokenHelper for AncestorPad in this
- // call to getUnwindDestToken, that would mean that AncestorPad had no
- // information in itself, its descendants, or its ancestors. If that
- // were the case, then we should also have recorded the lack of information
- // for the descendant that we're coming from. So assert that we don't
- // find a null entry in the MemoMap for AncestorPad.
- assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]);
- auto AncestorMemo = MemoMap.find(AncestorPad);
- if (AncestorMemo == MemoMap.end()) {
- UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap);
- } else {
- UnwindDestToken = AncestorMemo->second;
- }
- if (UnwindDestToken)
- break;
- LastUselessPad = AncestorPad;
- MemoMap[LastUselessPad] = nullptr;
-#ifndef NDEBUG
- TempMemos.insert(LastUselessPad);
-#endif
- }
-
- // We know that getUnwindDestTokenHelper was called on LastUselessPad and
- // returned nullptr (and likewise for EHPad and any of its ancestors up to
- // LastUselessPad), so LastUselessPad has no information from below. Since
- // getUnwindDestTokenHelper must investigate all downward paths through
- // no-information nodes to prove that a node has no information like this,
- // and since any time it finds information it records it in the MemoMap for
- // not just the immediately-containing funclet but also any ancestors also
- // exited, it must be the case that, walking downward from LastUselessPad,
- // visiting just those nodes which have not been mapped to an unwind dest
- // by getUnwindDestTokenHelper (the nullptr TempMemos notwithstanding, since
- // they are just used to keep getUnwindDestTokenHelper from repeating work),
- // any node visited must have been exhaustively searched with no information
- // for it found.
- SmallVector<Instruction *, 8> Worklist(1, LastUselessPad);
- while (!Worklist.empty()) {
- Instruction *UselessPad = Worklist.pop_back_val();
- auto Memo = MemoMap.find(UselessPad);
- if (Memo != MemoMap.end() && Memo->second) {
- // Here the name 'UselessPad' is a bit of a misnomer, because we've found
- // that it is a funclet that does have information about unwinding to
- // a particular destination; its parent was a useless pad.
- // Since its parent has no information, the unwind edge must not escape
- // the parent, and must target a sibling of this pad. This local unwind
- // gives us no information about EHPad. Leave it and the subtree rooted
- // at it alone.
- assert(getParentPad(Memo->second) == getParentPad(UselessPad));
- continue;
- }
-    // We know we don't have information for UselessPad.  If it has an entry in
- // the MemoMap (mapping it to nullptr), it must be one of the TempMemos
- // added on this invocation of getUnwindDestToken; if a previous invocation
- // recorded nullptr, it would have had to prove that the ancestors of
- // UselessPad, which include LastUselessPad, had no information, and that
- // in turn would have required proving that the descendants of
-    // LastUselessPad, which include EHPad, have no information about
- // LastUselessPad, which would imply that EHPad was mapped to nullptr in
- // the MemoMap on that invocation, which isn't the case if we got here.
- assert(!MemoMap.count(UselessPad) || TempMemos.count(UselessPad));
- // Assert as we enumerate users that 'UselessPad' doesn't have any unwind
- // information that we'd be contradicting by making a map entry for it
- // (which is something that getUnwindDestTokenHelper must have proved for
-    // us to get here). Just assert on its direct users here; the checks in
- // this downward walk at its descendants will verify that they don't have
- // any unwind edges that exit 'UselessPad' either (i.e. they either have no
- // unwind edges or unwind to a sibling).
- MemoMap[UselessPad] = UnwindDestToken;
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) {
- assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad");
- for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) {
- auto *CatchPad = HandlerBlock->getFirstNonPHI();
- for (User *U : CatchPad->users()) {
- assert(
- (!isa<InvokeInst>(U) ||
- (getParentPad(
- cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
- CatchPad)) &&
- "Expected useless pad");
- if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
- Worklist.push_back(cast<Instruction>(U));
- }
- }
- } else {
- assert(isa<CleanupPadInst>(UselessPad));
- for (User *U : UselessPad->users()) {
- assert(!isa<CleanupReturnInst>(U) && "Expected useless pad");
- assert((!isa<InvokeInst>(U) ||
- (getParentPad(
- cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
- UselessPad)) &&
- "Expected useless pad");
- if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
- Worklist.push_back(cast<Instruction>(U));
- }
- }
- }
-
- return UnwindDestToken;
-}
-
-/// When we inline a basic block into an invoke,
-/// we have to turn all of the calls that can throw into invokes.
-/// This function analyzes BB to see if there are any calls, and if so,
-/// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI
-/// nodes in that block with the values specified in InvokeDestPHIValues.
-static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
- BasicBlock *BB, BasicBlock *UnwindEdge,
- UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
- Instruction *I = &*BBI++;
-
- // We only need to check for function calls: inlined invoke
- // instructions require no special handling.
- CallInst *CI = dyn_cast<CallInst>(I);
-
- if (!CI || CI->doesNotThrow() || CI->isInlineAsm())
- continue;
-
- // We do not need to (and in fact, cannot) convert possibly throwing calls
- // to @llvm.experimental_deoptimize (resp. @llvm.experimental.guard) into
- // invokes. The caller's "segment" of the deoptimization continuation
- // attached to the newly inlined @llvm.experimental_deoptimize
- // (resp. @llvm.experimental.guard) call should contain the exception
- // handling logic, if any.
- if (auto *F = CI->getCalledFunction())
- if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize ||
- F->getIntrinsicID() == Intrinsic::experimental_guard)
- continue;
-
- if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
- // This call is nested inside a funclet. If that funclet has an unwind
- // destination within the inlinee, then unwinding out of this call would
- // be UB. Rewriting this call to an invoke which targets the inlined
- // invoke's unwind dest would give the call's parent funclet multiple
- // unwind destinations, which is something that subsequent EH table
-      // generation can't handle and that the verifier rejects. So when we
- // see such a call, leave it as a call.
- auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
- Value *UnwindDestToken =
- getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
- if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
- continue;
-#ifndef NDEBUG
- Instruction *MemoKey;
- if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
- MemoKey = CatchPad->getCatchSwitch();
- else
- MemoKey = FuncletPad;
- assert(FuncletUnwindMap->count(MemoKey) &&
- (*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
- "must get memoized to avoid confusing later searches");
-#endif // NDEBUG
- }
-
- changeToInvokeAndSplitBasicBlock(CI, UnwindEdge);
- return BB;
- }
- return nullptr;
-}
-
-/// If we inlined an invoke site, we need to convert calls
-/// in the body of the inlined function into invokes.
-///
-/// II is the invoke instruction being inlined. FirstNewBlock is the first
-/// block of the inlined code (the last block is the end of the function),
-/// and InlineCodeInfo is information about the code that got inlined.
-static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock,
- ClonedCodeInfo &InlinedCodeInfo) {
- BasicBlock *InvokeDest = II->getUnwindDest();
-
- Function *Caller = FirstNewBlock->getParent();
-
- // The inlined code is currently at the end of the function, scan from the
- // start of the inlined code to its end, checking for stuff we need to
- // rewrite.
- LandingPadInliningInfo Invoke(II);
-
- // Get all of the inlined landing pad instructions.
- SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
- for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end();
- I != E; ++I)
- if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
- InlinedLPads.insert(II->getLandingPadInst());
-
- // Append the clauses from the outer landing pad instruction into the inlined
- // landing pad instructions.
- LandingPadInst *OuterLPad = Invoke.getLandingPadInst();
- for (LandingPadInst *InlinedLPad : InlinedLPads) {
- unsigned OuterNum = OuterLPad->getNumClauses();
- InlinedLPad->reserveClauses(OuterNum);
- for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx)
- InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
- if (OuterLPad->isCleanup())
- InlinedLPad->setCleanup(true);
- }
-
- for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
- BB != E; ++BB) {
- if (InlinedCodeInfo.ContainsCalls)
- if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
- &*BB, Invoke.getOuterResumeDest()))
- // Update any PHI nodes in the exceptional block to indicate that there
- // is now a new entry in them.
- Invoke.addIncomingPHIValuesFor(NewBB);
-
- // Forward any resumes that are remaining here.
- if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
- Invoke.forwardResume(RI, InlinedLPads);
- }
-
- // Now that everything is happy, we have one final detail. The PHI nodes in
- // the exception destination block still have entries due to the original
- // invoke instruction. Eliminate these entries (which might even delete the
- // PHI node) now.
- InvokeDest->removePredecessor(II->getParent());
-}
-
-/// If we inlined an invoke site, we need to convert calls
-/// in the body of the inlined function into invokes.
-///
-/// II is the invoke instruction being inlined. FirstNewBlock is the first
-/// block of the inlined code (the last block is the end of the function),
-/// and InlineCodeInfo is information about the code that got inlined.
-static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
- ClonedCodeInfo &InlinedCodeInfo) {
- BasicBlock *UnwindDest = II->getUnwindDest();
- Function *Caller = FirstNewBlock->getParent();
-
- assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!");
-
- // If there are PHI nodes in the unwind destination block, we need to keep
- // track of which values came into them from the invoke before removing the
- // edge from this block.
- SmallVector<Value *, 8> UnwindDestPHIValues;
- BasicBlock *InvokeBB = II->getParent();
- for (Instruction &I : *UnwindDest) {
- // Save the value to use for this edge.
- PHINode *PHI = dyn_cast<PHINode>(&I);
- if (!PHI)
- break;
- UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
- }
-
- // Add incoming-PHI values to the unwind destination block for the given basic
- // block, using the values for the original invoke's source block.
- auto UpdatePHINodes = [&](BasicBlock *Src) {
- BasicBlock::iterator I = UnwindDest->begin();
- for (Value *V : UnwindDestPHIValues) {
- PHINode *PHI = cast<PHINode>(I);
- PHI->addIncoming(V, Src);
- ++I;
- }
- };
-
- // This connects all the instructions which 'unwind to caller' to the invoke
- // destination.
- UnwindDestMemoTy FuncletUnwindMap;
- for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
- BB != E; ++BB) {
- if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
- if (CRI->unwindsToCaller()) {
- auto *CleanupPad = CRI->getCleanupPad();
- CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
- CRI->eraseFromParent();
- UpdatePHINodes(&*BB);
- // Finding a cleanupret with an unwind destination would confuse
- // subsequent calls to getUnwindDestToken, so map the cleanuppad
- // to short-circuit any such calls and recognize this as an "unwind
- // to caller" cleanup.
- assert(!FuncletUnwindMap.count(CleanupPad) ||
- isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
- FuncletUnwindMap[CleanupPad] =
- ConstantTokenNone::get(Caller->getContext());
- }
- }
-
- Instruction *I = BB->getFirstNonPHI();
- if (!I->isEHPad())
- continue;
-
- Instruction *Replacement = nullptr;
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
- if (CatchSwitch->unwindsToCaller()) {
- Value *UnwindDestToken;
- if (auto *ParentPad =
- dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
- // This catchswitch is nested inside another funclet. If that
- // funclet has an unwind destination within the inlinee, then
- // unwinding out of this catchswitch would be UB. Rewriting this
- // catchswitch to unwind to the inlined invoke's unwind dest would
- // give the parent funclet multiple unwind destinations, which is
- // something that subsequent EH table generation can't handle and
-          // that the verifier rejects. So when we see such a call, leave it
- // as "unwind to caller".
- UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
- if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
- continue;
- } else {
- // This catchswitch has no parent to inherit constraints from, and
- // none of its descendants can have an unwind edge that exits it and
- // targets another funclet in the inlinee. It may or may not have a
- // descendant that definitively has an unwind to caller. In either
- // case, we'll have to assume that any unwinds out of it may need to
- // be routed to the caller, so treat it as though it has a definitive
- // unwind to caller.
- UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
- }
- auto *NewCatchSwitch = CatchSwitchInst::Create(
- CatchSwitch->getParentPad(), UnwindDest,
- CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
- CatchSwitch);
- for (BasicBlock *PadBB : CatchSwitch->handlers())
- NewCatchSwitch->addHandler(PadBB);
- // Propagate info for the old catchswitch over to the new one in
- // the unwind map. This also serves to short-circuit any subsequent
- // checks for the unwind dest of this catchswitch, which would get
- // confused if they found the outer handler in the callee.
- FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
- Replacement = NewCatchSwitch;
- }
- } else if (!isa<FuncletPadInst>(I)) {
- llvm_unreachable("unexpected EHPad!");
- }
-
- if (Replacement) {
- Replacement->takeName(I);
- I->replaceAllUsesWith(Replacement);
- I->eraseFromParent();
- UpdatePHINodes(&*BB);
- }
- }
-
- if (InlinedCodeInfo.ContainsCalls)
- for (Function::iterator BB = FirstNewBlock->getIterator(),
- E = Caller->end();
- BB != E; ++BB)
- if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
- &*BB, UnwindDest, &FuncletUnwindMap))
- // Update any PHI nodes in the exceptional block to indicate that there
- // is now a new entry in them.
- UpdatePHINodes(NewBB);
-
- // Now that everything is happy, we have one final detail. The PHI nodes in
- // the exception destination block still have entries due to the original
- // invoke instruction. Eliminate these entries (which might even delete the
- // PHI node) now.
- UnwindDest->removePredecessor(InvokeBB);
-}
-
+// Disabled by default, because the added alignment assumptions may increase
+// compile-time and block optimizations. This option is not suitable for use
+// with frontends that emit comprehensive parameter alignment annotations.
+static cl::opt<bool>
+PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",
+ cl::init(false), cl::Hidden,
+ cl::desc("Convert align attributes to assumptions during inlining."));
+
+static cl::opt<bool> UpdateReturnAttributes(
+ "update-return-attrs", cl::init(true), cl::Hidden,
+ cl::desc("Update return attributes on calls within inlined body"));
+
+static cl::opt<unsigned> InlinerAttributeWindow(
+ "max-inst-checked-for-throw-during-inlining", cl::Hidden,
+ cl::desc("the maximum number of instructions analyzed for may throw during "
+ "attribute inference in inlined body"),
+ cl::init(4));
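The three options above are ordinary cl::opt flags, so they can be toggled from the opt command line when experimenting with the inliner. A possible invocation (file names and the chosen values are placeholders, not taken from this patch):

    opt -passes='cgscc(inline)' \
        -update-return-attrs=0 \
        -max-inst-checked-for-throw-during-inlining=8 \
        -preserve-alignment-assumptions-during-inlining=1 \
        -S input.ll -o inlined.ll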
+
+namespace {
+
+ /// A class for recording information about inlining a landing pad.
+ class LandingPadInliningInfo {
+ /// Destination of the invoke's unwind.
+ BasicBlock *OuterResumeDest;
+
+ /// Destination for the callee's resume.
+ BasicBlock *InnerResumeDest = nullptr;
+
+ /// LandingPadInst associated with the invoke.
+ LandingPadInst *CallerLPad = nullptr;
+
+ /// PHI for EH values from landingpad insts.
+ PHINode *InnerEHValuesPHI = nullptr;
+
+ SmallVector<Value*, 8> UnwindDestPHIValues;
+
+ public:
+ LandingPadInliningInfo(InvokeInst *II)
+ : OuterResumeDest(II->getUnwindDest()) {
+ // If there are PHI nodes in the unwind destination block, we need to keep
+ // track of which values came into them from the invoke before removing
+ // the edge from this block.
+ BasicBlock *InvokeBB = II->getParent();
+ BasicBlock::iterator I = OuterResumeDest->begin();
+ for (; isa<PHINode>(I); ++I) {
+ // Save the value to use for this edge.
+ PHINode *PHI = cast<PHINode>(I);
+ UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+ }
+
+ CallerLPad = cast<LandingPadInst>(I);
+ }
+
+ /// The outer unwind destination is the target of
+ /// unwind edges introduced for calls within the inlined function.
+ BasicBlock *getOuterResumeDest() const {
+ return OuterResumeDest;
+ }
+
+ BasicBlock *getInnerResumeDest();
+
+ LandingPadInst *getLandingPadInst() const { return CallerLPad; }
+
+ /// Forward the 'resume' instruction to the caller's landing pad block.
+ /// When the landing pad block has only one predecessor, this is
+ /// a simple branch. When there is more than one predecessor, we need to
+ /// split the landing pad block after the landingpad instruction and jump
+ /// to there.
+ void forwardResume(ResumeInst *RI,
+ SmallPtrSetImpl<LandingPadInst*> &InlinedLPads);
+
+ /// Add incoming-PHI values to the unwind destination block for the given
+ /// basic block, using the values for the original invoke's source block.
+ void addIncomingPHIValuesFor(BasicBlock *BB) const {
+ addIncomingPHIValuesForInto(BB, OuterResumeDest);
+ }
+
+ void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
+ BasicBlock::iterator I = dest->begin();
+ for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+ PHINode *phi = cast<PHINode>(I);
+ phi->addIncoming(UnwindDestPHIValues[i], src);
+ }
+ }
+ };
+
+} // end anonymous namespace
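A condensed sketch of how this helper is typically driven once the callee's blocks have been cloned into the caller; it mirrors the loop in HandleInlinedLandingPad further below, assuming II, FirstNewBlock, Caller and InlinedLPads are the corresponding local variables there:

    LandingPadInliningInfo Invoke(II);
    for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
         BB != E; ++BB)
      if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
        Invoke.forwardResume(RI, InlinedLPads); // turn 'resume' into a branch
                                                // to the caller's landing pad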
+
+/// Get or create a target for the branch from ResumeInsts.
+BasicBlock *LandingPadInliningInfo::getInnerResumeDest() {
+ if (InnerResumeDest) return InnerResumeDest;
+
+ // Split the landing pad.
+ BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator();
+ InnerResumeDest =
+ OuterResumeDest->splitBasicBlock(SplitPoint,
+ OuterResumeDest->getName() + ".body");
+
+ // The number of incoming edges we expect to the inner landing pad.
+ const unsigned PHICapacity = 2;
+
+ // Create corresponding new PHIs for all the PHIs in the outer landing pad.
+ Instruction *InsertPoint = &InnerResumeDest->front();
+ BasicBlock::iterator I = OuterResumeDest->begin();
+ for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+ PHINode *OuterPHI = cast<PHINode>(I);
+ PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity,
+ OuterPHI->getName() + ".lpad-body",
+ InsertPoint);
+ OuterPHI->replaceAllUsesWith(InnerPHI);
+ InnerPHI->addIncoming(OuterPHI, OuterResumeDest);
+ }
+
+ // Create a PHI for the exception values.
+ InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity,
+ "eh.lpad-body", InsertPoint);
+ CallerLPad->replaceAllUsesWith(InnerEHValuesPHI);
+ InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest);
+
+ // All done.
+ return InnerResumeDest;
+}
+
+/// Forward the 'resume' instruction to the caller's landing pad block.
+/// When the landing pad block has only one predecessor, this is a simple
+/// branch. When there is more than one predecessor, we need to split the
+/// landing pad block after the landingpad instruction and jump to there.
+void LandingPadInliningInfo::forwardResume(
+ ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) {
+ BasicBlock *Dest = getInnerResumeDest();
+ BasicBlock *Src = RI->getParent();
+
+ BranchInst::Create(Dest, Src);
+
+ // Update the PHIs in the destination. They were inserted in an order which
+ // makes this work.
+ addIncomingPHIValuesForInto(Src, Dest);
+
+ InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
+ RI->eraseFromParent();
+}
+
+/// Helper for getUnwindDestToken/getUnwindDestTokenHelper.
+static Value *getParentPad(Value *EHPad) {
+ if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad))
+ return FPI->getParentPad();
+ return cast<CatchSwitchInst>(EHPad)->getParentPad();
+}
+
+using UnwindDestMemoTy = DenseMap<Instruction *, Value *>;
+
+/// Helper for getUnwindDestToken that does the descendant-ward part of
+/// the search.
+static Value *getUnwindDestTokenHelper(Instruction *EHPad,
+ UnwindDestMemoTy &MemoMap) {
+ SmallVector<Instruction *, 8> Worklist(1, EHPad);
+
+ while (!Worklist.empty()) {
+ Instruction *CurrentPad = Worklist.pop_back_val();
+ // We only put pads on the worklist that aren't in the MemoMap. When
+ // we find an unwind dest for a pad we may update its ancestors, but
+ // the queue only ever contains uncles/great-uncles/etc. of CurrentPad,
+ // so they should never get updated while queued on the worklist.
+ assert(!MemoMap.count(CurrentPad));
+ Value *UnwindDestToken = nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(CurrentPad)) {
+ if (CatchSwitch->hasUnwindDest()) {
+ UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI();
+ } else {
+ // Catchswitch doesn't have a 'nounwind' variant, and one might be
+ // annotated as "unwinds to caller" when really it's nounwind (see
+ // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the
+ // parent's unwind dest from this. We can check its catchpads'
+ // descendants, since they might include a cleanuppad with an
+ // "unwinds to caller" cleanupret, which can be trusted.
+ for (auto HI = CatchSwitch->handler_begin(),
+ HE = CatchSwitch->handler_end();
+ HI != HE && !UnwindDestToken; ++HI) {
+ BasicBlock *HandlerBlock = *HI;
+ auto *CatchPad = cast<CatchPadInst>(HandlerBlock->getFirstNonPHI());
+ for (User *Child : CatchPad->users()) {
+ // Intentionally ignore invokes here -- since the catchswitch is
+ // marked "unwind to caller", it would be a verifier error if it
+ // contained an invoke which unwinds out of it, so any invoke we'd
+ // encounter must unwind to some child of the catch.
+ if (!isa<CleanupPadInst>(Child) && !isa<CatchSwitchInst>(Child))
+ continue;
+
+ Instruction *ChildPad = cast<Instruction>(Child);
+ auto Memo = MemoMap.find(ChildPad);
+ if (Memo == MemoMap.end()) {
+ // Haven't figured out this child pad yet; queue it.
+ Worklist.push_back(ChildPad);
+ continue;
+ }
+ // We've already checked this child, but might have found that
+ // it offers no proof either way.
+ Value *ChildUnwindDestToken = Memo->second;
+ if (!ChildUnwindDestToken)
+ continue;
+ // We already know the child's unwind dest, which can either
+ // be ConstantTokenNone to indicate unwind to caller, or can
+ // be another child of the catchpad. Only the former indicates
+ // the unwind dest of the catchswitch.
+ if (isa<ConstantTokenNone>(ChildUnwindDestToken)) {
+ UnwindDestToken = ChildUnwindDestToken;
+ break;
+ }
+ assert(getParentPad(ChildUnwindDestToken) == CatchPad);
+ }
+ }
+ }
+ } else {
+ auto *CleanupPad = cast<CleanupPadInst>(CurrentPad);
+ for (User *U : CleanupPad->users()) {
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
+ if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest())
+ UnwindDestToken = RetUnwindDest->getFirstNonPHI();
+ else
+ UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext());
+ break;
+ }
+ Value *ChildUnwindDestToken;
+ if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
+ ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI();
+ } else if (isa<CleanupPadInst>(U) || isa<CatchSwitchInst>(U)) {
+ Instruction *ChildPad = cast<Instruction>(U);
+ auto Memo = MemoMap.find(ChildPad);
+ if (Memo == MemoMap.end()) {
+ // Haven't resolved this child yet; queue it and keep searching.
+ Worklist.push_back(ChildPad);
+ continue;
+ }
+ // We've checked this child, but still need to ignore it if it
+ // had no proof either way.
+ ChildUnwindDestToken = Memo->second;
+ if (!ChildUnwindDestToken)
+ continue;
+ } else {
+ // Not a relevant user of the cleanuppad
+ continue;
+ }
+ // In a well-formed program, the child/invoke must either unwind to
+ // an(other) child of the cleanup, or exit the cleanup. In the
+ // first case, continue searching.
+ if (isa<Instruction>(ChildUnwindDestToken) &&
+ getParentPad(ChildUnwindDestToken) == CleanupPad)
+ continue;
+ UnwindDestToken = ChildUnwindDestToken;
+ break;
+ }
+ }
+ // If we haven't found an unwind dest for CurrentPad, we may have queued its
+ // children, so move on to the next in the worklist.
+ if (!UnwindDestToken)
+ continue;
+
+ // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits
+ // any ancestors of CurrentPad up to but not including UnwindDestToken's
+ // parent pad. Record this in the memo map, and check to see if the
+ // original EHPad being queried is one of the ones exited.
+ Value *UnwindParent;
+ if (auto *UnwindPad = dyn_cast<Instruction>(UnwindDestToken))
+ UnwindParent = getParentPad(UnwindPad);
+ else
+ UnwindParent = nullptr;
+ bool ExitedOriginalPad = false;
+ for (Instruction *ExitedPad = CurrentPad;
+ ExitedPad && ExitedPad != UnwindParent;
+ ExitedPad = dyn_cast<Instruction>(getParentPad(ExitedPad))) {
+ // Skip over catchpads since they just follow their catchswitches.
+ if (isa<CatchPadInst>(ExitedPad))
+ continue;
+ MemoMap[ExitedPad] = UnwindDestToken;
+ ExitedOriginalPad |= (ExitedPad == EHPad);
+ }
+
+ if (ExitedOriginalPad)
+ return UnwindDestToken;
+
+ // Continue the search.
+ }
+
+ // No definitive information is contained within this funclet.
+ return nullptr;
+}
+
+/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad,
+/// return that pad instruction. If it unwinds to caller, return
+/// ConstantTokenNone. If it does not have a definitive unwind destination,
+/// return nullptr.
+///
+/// This routine gets invoked for calls in funclets in inlinees when inlining
+/// an invoke. Since many funclets don't have calls inside them, it's queried
+/// on-demand rather than building a map of pads to unwind dests up front.
+/// Determining a funclet's unwind dest may require recursively searching its
+/// descendants, and also ancestors and cousins if the descendants don't provide
+/// an answer. Since most funclets will have their unwind dest immediately
+/// available as the unwind dest of a catchswitch or cleanupret, this routine
+/// searches top-down from the given pad and then up. To avoid worst-case
+/// quadratic run-time given that approach, it uses a memo map to avoid
+/// re-processing funclet trees. The callers that rewrite the IR as they go
+/// take advantage of this, for correctness, by checking/forcing rewritten
+/// pads' entries to match the original callee view.
+static Value *getUnwindDestToken(Instruction *EHPad,
+ UnwindDestMemoTy &MemoMap) {
+ // Catchpads unwind to the same place as their catchswitch;
+  // redirect any queries on catchpads so the code below can
+ // deal with just catchswitches and cleanuppads.
+ if (auto *CPI = dyn_cast<CatchPadInst>(EHPad))
+ EHPad = CPI->getCatchSwitch();
+
+ // Check if we've already determined the unwind dest for this pad.
+ auto Memo = MemoMap.find(EHPad);
+ if (Memo != MemoMap.end())
+ return Memo->second;
+
+ // Search EHPad and, if necessary, its descendants.
+ Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap);
+ assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0));
+ if (UnwindDestToken)
+ return UnwindDestToken;
+
+ // No information is available for this EHPad from itself or any of its
+ // descendants. An unwind all the way out to a pad in the caller would
+ // need also to agree with the unwind dest of the parent funclet, so
+ // search up the chain to try to find a funclet with information. Put
+ // null entries in the memo map to avoid re-processing as we go up.
+ MemoMap[EHPad] = nullptr;
+#ifndef NDEBUG
+ SmallPtrSet<Instruction *, 4> TempMemos;
+ TempMemos.insert(EHPad);
+#endif
+ Instruction *LastUselessPad = EHPad;
+ Value *AncestorToken;
+ for (AncestorToken = getParentPad(EHPad);
+ auto *AncestorPad = dyn_cast<Instruction>(AncestorToken);
+ AncestorToken = getParentPad(AncestorToken)) {
+ // Skip over catchpads since they just follow their catchswitches.
+ if (isa<CatchPadInst>(AncestorPad))
+ continue;
+ // If the MemoMap had an entry mapping AncestorPad to nullptr, since we
+ // haven't yet called getUnwindDestTokenHelper for AncestorPad in this
+ // call to getUnwindDestToken, that would mean that AncestorPad had no
+ // information in itself, its descendants, or its ancestors. If that
+ // were the case, then we should also have recorded the lack of information
+ // for the descendant that we're coming from. So assert that we don't
+ // find a null entry in the MemoMap for AncestorPad.
+ assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]);
+ auto AncestorMemo = MemoMap.find(AncestorPad);
+ if (AncestorMemo == MemoMap.end()) {
+ UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap);
+ } else {
+ UnwindDestToken = AncestorMemo->second;
+ }
+ if (UnwindDestToken)
+ break;
+ LastUselessPad = AncestorPad;
+ MemoMap[LastUselessPad] = nullptr;
+#ifndef NDEBUG
+ TempMemos.insert(LastUselessPad);
+#endif
+ }
+
+ // We know that getUnwindDestTokenHelper was called on LastUselessPad and
+ // returned nullptr (and likewise for EHPad and any of its ancestors up to
+ // LastUselessPad), so LastUselessPad has no information from below. Since
+ // getUnwindDestTokenHelper must investigate all downward paths through
+ // no-information nodes to prove that a node has no information like this,
+ // and since any time it finds information it records it in the MemoMap for
+ // not just the immediately-containing funclet but also any ancestors also
+ // exited, it must be the case that, walking downward from LastUselessPad,
+ // visiting just those nodes which have not been mapped to an unwind dest
+ // by getUnwindDestTokenHelper (the nullptr TempMemos notwithstanding, since
+ // they are just used to keep getUnwindDestTokenHelper from repeating work),
+ // any node visited must have been exhaustively searched with no information
+ // for it found.
+ SmallVector<Instruction *, 8> Worklist(1, LastUselessPad);
+ while (!Worklist.empty()) {
+ Instruction *UselessPad = Worklist.pop_back_val();
+ auto Memo = MemoMap.find(UselessPad);
+ if (Memo != MemoMap.end() && Memo->second) {
+ // Here the name 'UselessPad' is a bit of a misnomer, because we've found
+ // that it is a funclet that does have information about unwinding to
+ // a particular destination; its parent was a useless pad.
+ // Since its parent has no information, the unwind edge must not escape
+ // the parent, and must target a sibling of this pad. This local unwind
+ // gives us no information about EHPad. Leave it and the subtree rooted
+ // at it alone.
+ assert(getParentPad(Memo->second) == getParentPad(UselessPad));
+ continue;
+ }
+    // We know we don't have information for UselessPad. If it has an entry in
+ // the MemoMap (mapping it to nullptr), it must be one of the TempMemos
+ // added on this invocation of getUnwindDestToken; if a previous invocation
+ // recorded nullptr, it would have had to prove that the ancestors of
+ // UselessPad, which include LastUselessPad, had no information, and that
+ // in turn would have required proving that the descendants of
+    // LastUselessPad, which include EHPad, have no information about
+ // LastUselessPad, which would imply that EHPad was mapped to nullptr in
+ // the MemoMap on that invocation, which isn't the case if we got here.
+ assert(!MemoMap.count(UselessPad) || TempMemos.count(UselessPad));
+ // Assert as we enumerate users that 'UselessPad' doesn't have any unwind
+ // information that we'd be contradicting by making a map entry for it
+ // (which is something that getUnwindDestTokenHelper must have proved for
+    // us to get here). Just assert on its direct users here; the checks in
+ // this downward walk at its descendants will verify that they don't have
+ // any unwind edges that exit 'UselessPad' either (i.e. they either have no
+ // unwind edges or unwind to a sibling).
+ MemoMap[UselessPad] = UnwindDestToken;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) {
+ assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad");
+ for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) {
+ auto *CatchPad = HandlerBlock->getFirstNonPHI();
+ for (User *U : CatchPad->users()) {
+ assert(
+ (!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ CatchPad)) &&
+ "Expected useless pad");
+ if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
+ Worklist.push_back(cast<Instruction>(U));
+ }
+ }
+ } else {
+ assert(isa<CleanupPadInst>(UselessPad));
+ for (User *U : UselessPad->users()) {
+ assert(!isa<CleanupReturnInst>(U) && "Expected useless pad");
+ assert((!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ UselessPad)) &&
+ "Expected useless pad");
+ if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
+ Worklist.push_back(cast<Instruction>(U));
+ }
+ }
+ }
+
+ return UnwindDestToken;
+}
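A short usage sketch of the contract described above; SomePad is a hypothetical funclet pad, and the memo map is shared across queries so repeated lookups stay cheap:

    UnwindDestMemoTy FuncletUnwindMap;  // kept alive across all queries
    if (Value *Tok = getUnwindDestToken(SomePad, FuncletUnwindMap)) {
      if (isa<ConstantTokenNone>(Tok)) {
        // SomePad definitively unwinds to the caller.
      } else {
        // Tok is the EH pad (first non-PHI) of SomePad's unwind destination.
      }
    } else {
      // No definitive unwind destination could be determined.
    }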
+
+/// When we inline a basic block into an invoke,
+/// we have to turn all of the calls that can throw into invokes.
+/// This function analyzes BB to see if there are any calls, and if so,
+/// it rewrites them to be invokes that unwind to UnwindEdge; callers then
+/// add the matching PHI entries in UnwindEdge for the new edges.
+static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
+ BasicBlock *BB, BasicBlock *UnwindEdge,
+ UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+ Instruction *I = &*BBI++;
+
+ // We only need to check for function calls: inlined invoke
+ // instructions require no special handling.
+ CallInst *CI = dyn_cast<CallInst>(I);
+
+ if (!CI || CI->doesNotThrow() || CI->isInlineAsm())
+ continue;
+
+ // We do not need to (and in fact, cannot) convert possibly throwing calls
+ // to @llvm.experimental_deoptimize (resp. @llvm.experimental.guard) into
+ // invokes. The caller's "segment" of the deoptimization continuation
+ // attached to the newly inlined @llvm.experimental_deoptimize
+ // (resp. @llvm.experimental.guard) call should contain the exception
+ // handling logic, if any.
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize ||
+ F->getIntrinsicID() == Intrinsic::experimental_guard)
+ continue;
+
+ if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
+ // This call is nested inside a funclet. If that funclet has an unwind
+ // destination within the inlinee, then unwinding out of this call would
+ // be UB. Rewriting this call to an invoke which targets the inlined
+ // invoke's unwind dest would give the call's parent funclet multiple
+ // unwind destinations, which is something that subsequent EH table
+      // generation can't handle and that the verifier rejects. So when we
+ // see such a call, leave it as a call.
+ auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
+ Value *UnwindDestToken =
+ getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
+ if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+ continue;
+#ifndef NDEBUG
+ Instruction *MemoKey;
+ if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
+ MemoKey = CatchPad->getCatchSwitch();
+ else
+ MemoKey = FuncletPad;
+ assert(FuncletUnwindMap->count(MemoKey) &&
+ (*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
+ "must get memoized to avoid confusing later searches");
+#endif // NDEBUG
+ }
+
+ changeToInvokeAndSplitBasicBlock(CI, UnwindEdge);
+ return BB;
+ }
+ return nullptr;
+}
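A minimal driver sketch for the return value: a non-null result means a call in BB was promoted to an invoke (and BB was split), so the caller must add PHI entries for the new unwind edge; the instructions that followed the promoted call now live in the split-off successor and are revisited when the outer loop reaches that block. UpdatePHIs stands for whatever PHI-updating callback the caller uses (see the real loops in the two Handle* functions below):

    for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
         BB != E; ++BB)
      if (BasicBlock *Split = HandleCallsInBlockInlinedThroughInvoke(
              &*BB, UnwindDest, &FuncletUnwindMap))
        UpdatePHIs(Split);  // the new invoke in 'Split' unwinds to UnwindDest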
+
+/// If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes.
+///
+/// II is the invoke instruction being inlined. FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock,
+ ClonedCodeInfo &InlinedCodeInfo) {
+ BasicBlock *InvokeDest = II->getUnwindDest();
+
+ Function *Caller = FirstNewBlock->getParent();
+
+ // The inlined code is currently at the end of the function, scan from the
+ // start of the inlined code to its end, checking for stuff we need to
+ // rewrite.
+ LandingPadInliningInfo Invoke(II);
+
+ // Get all of the inlined landing pad instructions.
+ SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
+ for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end();
+ I != E; ++I)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
+ InlinedLPads.insert(II->getLandingPadInst());
+
+ // Append the clauses from the outer landing pad instruction into the inlined
+ // landing pad instructions.
+ LandingPadInst *OuterLPad = Invoke.getLandingPadInst();
+ for (LandingPadInst *InlinedLPad : InlinedLPads) {
+ unsigned OuterNum = OuterLPad->getNumClauses();
+ InlinedLPad->reserveClauses(OuterNum);
+ for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx)
+ InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
+ if (OuterLPad->isCleanup())
+ InlinedLPad->setCleanup(true);
+ }
+
+ for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
+ BB != E; ++BB) {
+ if (InlinedCodeInfo.ContainsCalls)
+ if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+ &*BB, Invoke.getOuterResumeDest()))
+ // Update any PHI nodes in the exceptional block to indicate that there
+ // is now a new entry in them.
+ Invoke.addIncomingPHIValuesFor(NewBB);
+
+ // Forward any resumes that are remaining here.
+ if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
+ Invoke.forwardResume(RI, InlinedLPads);
+ }
+
+ // Now that everything is happy, we have one final detail. The PHI nodes in
+ // the exception destination block still have entries due to the original
+ // invoke instruction. Eliminate these entries (which might even delete the
+ // PHI node) now.
+ InvokeDest->removePredecessor(II->getParent());
+}
+
+/// If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes.
+///
+/// II is the invoke instruction being inlined. FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
+ ClonedCodeInfo &InlinedCodeInfo) {
+ BasicBlock *UnwindDest = II->getUnwindDest();
+ Function *Caller = FirstNewBlock->getParent();
+
+ assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!");
+
+ // If there are PHI nodes in the unwind destination block, we need to keep
+ // track of which values came into them from the invoke before removing the
+ // edge from this block.
+ SmallVector<Value *, 8> UnwindDestPHIValues;
+ BasicBlock *InvokeBB = II->getParent();
+ for (Instruction &I : *UnwindDest) {
+ // Save the value to use for this edge.
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ break;
+ UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+ }
+
+ // Add incoming-PHI values to the unwind destination block for the given basic
+ // block, using the values for the original invoke's source block.
+ auto UpdatePHINodes = [&](BasicBlock *Src) {
+ BasicBlock::iterator I = UnwindDest->begin();
+ for (Value *V : UnwindDestPHIValues) {
+ PHINode *PHI = cast<PHINode>(I);
+ PHI->addIncoming(V, Src);
+ ++I;
+ }
+ };
+
+ // This connects all the instructions which 'unwind to caller' to the invoke
+ // destination.
+ UnwindDestMemoTy FuncletUnwindMap;
+ for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
+ BB != E; ++BB) {
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
+ if (CRI->unwindsToCaller()) {
+ auto *CleanupPad = CRI->getCleanupPad();
+ CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
+ CRI->eraseFromParent();
+ UpdatePHINodes(&*BB);
+ // Finding a cleanupret with an unwind destination would confuse
+ // subsequent calls to getUnwindDestToken, so map the cleanuppad
+ // to short-circuit any such calls and recognize this as an "unwind
+ // to caller" cleanup.
+ assert(!FuncletUnwindMap.count(CleanupPad) ||
+ isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
+ FuncletUnwindMap[CleanupPad] =
+ ConstantTokenNone::get(Caller->getContext());
+ }
+ }
+
+ Instruction *I = BB->getFirstNonPHI();
+ if (!I->isEHPad())
+ continue;
+
+ Instruction *Replacement = nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
+ if (CatchSwitch->unwindsToCaller()) {
+ Value *UnwindDestToken;
+ if (auto *ParentPad =
+ dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
+ // This catchswitch is nested inside another funclet. If that
+ // funclet has an unwind destination within the inlinee, then
+ // unwinding out of this catchswitch would be UB. Rewriting this
+ // catchswitch to unwind to the inlined invoke's unwind dest would
+ // give the parent funclet multiple unwind destinations, which is
+ // something that subsequent EH table generation can't handle and
+          // that the verifier rejects. So when we see such a call, leave it
+ // as "unwind to caller".
+ UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
+ if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+ continue;
+ } else {
+ // This catchswitch has no parent to inherit constraints from, and
+ // none of its descendants can have an unwind edge that exits it and
+ // targets another funclet in the inlinee. It may or may not have a
+ // descendant that definitively has an unwind to caller. In either
+ // case, we'll have to assume that any unwinds out of it may need to
+ // be routed to the caller, so treat it as though it has a definitive
+ // unwind to caller.
+ UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
+ }
+ auto *NewCatchSwitch = CatchSwitchInst::Create(
+ CatchSwitch->getParentPad(), UnwindDest,
+ CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
+ CatchSwitch);
+ for (BasicBlock *PadBB : CatchSwitch->handlers())
+ NewCatchSwitch->addHandler(PadBB);
+ // Propagate info for the old catchswitch over to the new one in
+ // the unwind map. This also serves to short-circuit any subsequent
+ // checks for the unwind dest of this catchswitch, which would get
+ // confused if they found the outer handler in the callee.
+ FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
+ Replacement = NewCatchSwitch;
+ }
+ } else if (!isa<FuncletPadInst>(I)) {
+ llvm_unreachable("unexpected EHPad!");
+ }
+
+ if (Replacement) {
+ Replacement->takeName(I);
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ UpdatePHINodes(&*BB);
+ }
+ }
+
+ if (InlinedCodeInfo.ContainsCalls)
+ for (Function::iterator BB = FirstNewBlock->getIterator(),
+ E = Caller->end();
+ BB != E; ++BB)
+ if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+ &*BB, UnwindDest, &FuncletUnwindMap))
+ // Update any PHI nodes in the exceptional block to indicate that there
+ // is now a new entry in them.
+ UpdatePHINodes(NewBB);
+
+ // Now that everything is happy, we have one final detail. The PHI nodes in
+ // the exception destination block still have entries due to the original
+ // invoke instruction. Eliminate these entries (which might even delete the
+ // PHI node) now.
+ UnwindDest->removePredecessor(InvokeBB);
+}
+
/// When inlining a call site that has !llvm.mem.parallel_loop_access,
/// !llvm.access.group, !alias.scope or !noalias metadata, that metadata should
/// be propagated to all memory-accessing cloned instructions.
@@ -788,14 +788,14 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
MDNode *AliasScope = CB.getMetadata(LLVMContext::MD_alias_scope);
MDNode *NoAlias = CB.getMetadata(LLVMContext::MD_noalias);
if (!MemParallelLoopAccess && !AccessGroup && !AliasScope && !NoAlias)
- return;
-
+ return;
+
for (BasicBlock &BB : make_range(FStart, FEnd)) {
for (Instruction &I : BB) {
// This metadata is only relevant for instructions that access memory.
if (!I.mayReadOrWriteMemory())
continue;
-
+
if (MemParallelLoopAccess) {
        // TODO: This probably should not overwrite MemParallelLoopAccess.
MemParallelLoopAccess = MDNode::concatenate(
@@ -804,7 +804,7 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
I.setMetadata(LLVMContext::MD_mem_parallel_loop_access,
MemParallelLoopAccess);
}
-
+
if (AccessGroup)
I.setMetadata(LLVMContext::MD_access_group, uniteAccessGroups(
I.getMetadata(LLVMContext::MD_access_group), AccessGroup));
@@ -817,22 +817,22 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
I.setMetadata(LLVMContext::MD_noalias, MDNode::concatenate(
I.getMetadata(LLVMContext::MD_noalias), NoAlias));
}
- }
-}
-
+ }
+}
+
/// Utility for cloning !noalias and !alias.scope metadata. When a code region
/// using scoped alias metadata is inlined, the aliasing relationships may not
 /// hold between the two versions. It is necessary to create a deep clone of the
/// metadata, putting the two versions in separate scope domains.
class ScopedAliasMetadataDeepCloner {
using MetadataMap = DenseMap<const MDNode *, TrackingMDNodeRef>;
- SetVector<const MDNode *> MD;
+ SetVector<const MDNode *> MD;
MetadataMap MDMap;
void addRecursiveMetadataUses();
-
+
public:
ScopedAliasMetadataDeepCloner(const Function *F);
-
+
/// Create a new clone of the scoped alias metadata, which will be used by
/// subsequent remap() calls.
void clone();
@@ -847,59 +847,59 @@ ScopedAliasMetadataDeepCloner::ScopedAliasMetadataDeepCloner(
for (const BasicBlock &BB : *F) {
for (const Instruction &I : BB) {
if (const MDNode *M = I.getMetadata(LLVMContext::MD_alias_scope))
- MD.insert(M);
+ MD.insert(M);
if (const MDNode *M = I.getMetadata(LLVMContext::MD_noalias))
- MD.insert(M);
+ MD.insert(M);
// We also need to clone the metadata in noalias intrinsics.
if (const auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
MD.insert(Decl->getScopeList());
- }
+ }
}
addRecursiveMetadataUses();
}
-
+
void ScopedAliasMetadataDeepCloner::addRecursiveMetadataUses() {
- SmallVector<const Metadata *, 16> Queue(MD.begin(), MD.end());
- while (!Queue.empty()) {
- const MDNode *M = cast<MDNode>(Queue.pop_back_val());
+ SmallVector<const Metadata *, 16> Queue(MD.begin(), MD.end());
+ while (!Queue.empty()) {
+ const MDNode *M = cast<MDNode>(Queue.pop_back_val());
for (const Metadata *Op : M->operands())
if (const MDNode *OpMD = dyn_cast<MDNode>(Op))
if (MD.insert(OpMD))
Queue.push_back(OpMD);
- }
+ }
}
-
+
void ScopedAliasMetadataDeepCloner::clone() {
assert(MDMap.empty() && "clone() already called ?");
- SmallVector<TempMDTuple, 16> DummyNodes;
- for (const MDNode *I : MD) {
+ SmallVector<TempMDTuple, 16> DummyNodes;
+ for (const MDNode *I : MD) {
DummyNodes.push_back(MDTuple::getTemporary(I->getContext(), None));
- MDMap[I].reset(DummyNodes.back().get());
- }
-
- // Create new metadata nodes to replace the dummy nodes, replacing old
- // metadata references with either a dummy node or an already-created new
- // node.
+ MDMap[I].reset(DummyNodes.back().get());
+ }
+
+ // Create new metadata nodes to replace the dummy nodes, replacing old
+ // metadata references with either a dummy node or an already-created new
+ // node.
SmallVector<Metadata *, 4> NewOps;
- for (const MDNode *I : MD) {
+ for (const MDNode *I : MD) {
for (const Metadata *Op : I->operands()) {
if (const MDNode *M = dyn_cast<MDNode>(Op))
- NewOps.push_back(MDMap[M]);
- else
+ NewOps.push_back(MDMap[M]);
+ else
NewOps.push_back(const_cast<Metadata *>(Op));
- }
-
+ }
+
MDNode *NewM = MDNode::get(I->getContext(), NewOps);
- MDTuple *TempM = cast<MDTuple>(MDMap[I]);
- assert(TempM->isTemporary() && "Expected temporary node");
-
- TempM->replaceAllUsesWith(NewM);
+ MDTuple *TempM = cast<MDTuple>(MDMap[I]);
+ assert(TempM->isTemporary() && "Expected temporary node");
+
+ TempM->replaceAllUsesWith(NewM);
NewOps.clear();
- }
+ }
}
-
+
void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
Function::iterator FEnd) {
if (MDMap.empty())
@@ -912,71 +912,71 @@ void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
if (MDNode *M = I.getMetadata(LLVMContext::MD_alias_scope))
if (MDNode *MNew = MDMap.lookup(M))
I.setMetadata(LLVMContext::MD_alias_scope, MNew);
-
+
if (MDNode *M = I.getMetadata(LLVMContext::MD_noalias))
if (MDNode *MNew = MDMap.lookup(M))
I.setMetadata(LLVMContext::MD_noalias, MNew);
-
+
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
if (MDNode *MNew = MDMap.lookup(Decl->getScopeList()))
Decl->setScopeList(MNew);
}
- }
-}
-
-/// If the inlined function has noalias arguments,
-/// then add new alias scopes for each noalias argument, tag the mapped noalias
-/// parameters with noalias metadata specifying the new scope, and tag all
-/// non-derived loads, stores and memory intrinsics with the new alias scopes.
-static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
- const DataLayout &DL, AAResults *CalleeAAR) {
- if (!EnableNoAliasConversion)
- return;
-
- const Function *CalledFunc = CB.getCalledFunction();
- SmallVector<const Argument *, 4> NoAliasArgs;
-
- for (const Argument &Arg : CalledFunc->args())
- if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
- NoAliasArgs.push_back(&Arg);
-
- if (NoAliasArgs.empty())
- return;
-
- // To do a good job, if a noalias variable is captured, we need to know if
- // the capture point dominates the particular use we're considering.
- DominatorTree DT;
- DT.recalculate(const_cast<Function&>(*CalledFunc));
-
- // noalias indicates that pointer values based on the argument do not alias
- // pointer values which are not based on it. So we add a new "scope" for each
- // noalias function argument. Accesses using pointers based on that argument
- // become part of that alias scope, accesses using pointers not based on that
- // argument are tagged as noalias with that scope.
-
- DenseMap<const Argument *, MDNode *> NewScopes;
- MDBuilder MDB(CalledFunc->getContext());
-
- // Create a new scope domain for this function.
- MDNode *NewDomain =
- MDB.createAnonymousAliasScopeDomain(CalledFunc->getName());
- for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) {
- const Argument *A = NoAliasArgs[i];
-
- std::string Name = std::string(CalledFunc->getName());
- if (A->hasName()) {
- Name += ": %";
- Name += A->getName();
- } else {
- Name += ": argument ";
- Name += utostr(i);
- }
-
- // Note: We always create a new anonymous root here. This is true regardless
- // of the linkage of the callee because the aliasing "scope" is not just a
- // property of the callee, but also all control dependencies in the caller.
- MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
- NewScopes.insert(std::make_pair(A, NewScope));
+ }
+}
+
+/// If the inlined function has noalias arguments,
+/// then add new alias scopes for each noalias argument, tag the mapped noalias
+/// parameters with noalias metadata specifying the new scope, and tag all
+/// non-derived loads, stores and memory intrinsics with the new alias scopes.
+static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+ const DataLayout &DL, AAResults *CalleeAAR) {
+ if (!EnableNoAliasConversion)
+ return;
+
+ const Function *CalledFunc = CB.getCalledFunction();
+ SmallVector<const Argument *, 4> NoAliasArgs;
+
+ for (const Argument &Arg : CalledFunc->args())
+ if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
+ NoAliasArgs.push_back(&Arg);
+
+ if (NoAliasArgs.empty())
+ return;
+
+ // To do a good job, if a noalias variable is captured, we need to know if
+ // the capture point dominates the particular use we're considering.
+ DominatorTree DT;
+ DT.recalculate(const_cast<Function&>(*CalledFunc));
+
+ // noalias indicates that pointer values based on the argument do not alias
+ // pointer values which are not based on it. So we add a new "scope" for each
+ // noalias function argument. Accesses using pointers based on that argument
+ // become part of that alias scope, accesses using pointers not based on that
+ // argument are tagged as noalias with that scope.
+
+ DenseMap<const Argument *, MDNode *> NewScopes;
+ MDBuilder MDB(CalledFunc->getContext());
+
+ // Create a new scope domain for this function.
+ MDNode *NewDomain =
+ MDB.createAnonymousAliasScopeDomain(CalledFunc->getName());
+ for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) {
+ const Argument *A = NoAliasArgs[i];
+
+ std::string Name = std::string(CalledFunc->getName());
+ if (A->hasName()) {
+ Name += ": %";
+ Name += A->getName();
+ } else {
+ Name += ": argument ";
+ Name += utostr(i);
+ }
+
+ // Note: We always create a new anonymous root here. This is true regardless
+ // of the linkage of the callee because the aliasing "scope" is not just a
+ // property of the callee, but also all control dependencies in the caller.
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ NewScopes.insert(std::make_pair(A, NewScope));
if (UseNoAliasIntrinsic) {
// Introduce a llvm.experimental.noalias.scope.decl for the noalias
@@ -988,803 +988,803 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// llvm.noalias intrinsic is introduced.
(void)NoAliasDecl;
}
- }
-
- // Iterate over all new instructions in the map; for all memory-access
- // instructions, add the alias scope metadata.
- for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
- VMI != VMIE; ++VMI) {
- if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
- if (!VMI->second)
- continue;
-
- Instruction *NI = dyn_cast<Instruction>(VMI->second);
- if (!NI)
- continue;
-
- bool IsArgMemOnlyCall = false, IsFuncCall = false;
- SmallVector<const Value *, 2> PtrArgs;
-
- if (const LoadInst *LI = dyn_cast<LoadInst>(I))
- PtrArgs.push_back(LI->getPointerOperand());
- else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
- PtrArgs.push_back(SI->getPointerOperand());
- else if (const VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
- PtrArgs.push_back(VAAI->getPointerOperand());
- else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
- PtrArgs.push_back(CXI->getPointerOperand());
- else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
- PtrArgs.push_back(RMWI->getPointerOperand());
- else if (const auto *Call = dyn_cast<CallBase>(I)) {
- // If we know that the call does not access memory, then we'll still
- // know that about the inlined clone of this call site, and we don't
- // need to add metadata.
- if (Call->doesNotAccessMemory())
- continue;
-
- IsFuncCall = true;
- if (CalleeAAR) {
- FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(Call);
- if (AAResults::onlyAccessesArgPointees(MRB))
- IsArgMemOnlyCall = true;
- }
-
- for (Value *Arg : Call->args()) {
- // We need to check the underlying objects of all arguments, not just
- // the pointer arguments, because we might be passing pointers as
- // integers, etc.
- // However, if we know that the call only accesses pointer arguments,
- // then we only need to check the pointer arguments.
- if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy())
- continue;
-
- PtrArgs.push_back(Arg);
- }
- }
-
- // If we found no pointers, then this instruction is not suitable for
- // pairing with an instruction to receive aliasing metadata.
-      // However, if this is a call, then we might just alias with none of the
- // noalias arguments.
- if (PtrArgs.empty() && !IsFuncCall)
- continue;
-
- // It is possible that there is only one underlying object, but you
- // need to go through several PHIs to see it, and thus could be
- // repeated in the Objects list.
- SmallPtrSet<const Value *, 4> ObjSet;
- SmallVector<Metadata *, 4> Scopes, NoAliases;
-
- SmallSetVector<const Argument *, 4> NAPtrArgs;
- for (const Value *V : PtrArgs) {
- SmallVector<const Value *, 4> Objects;
+ }
+
+ // Iterate over all new instructions in the map; for all memory-access
+ // instructions, add the alias scope metadata.
+ for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+ VMI != VMIE; ++VMI) {
+ if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
+ if (!VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI)
+ continue;
+
+ bool IsArgMemOnlyCall = false, IsFuncCall = false;
+ SmallVector<const Value *, 2> PtrArgs;
+
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ PtrArgs.push_back(LI->getPointerOperand());
+ else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ PtrArgs.push_back(SI->getPointerOperand());
+ else if (const VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+ PtrArgs.push_back(VAAI->getPointerOperand());
+ else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
+ PtrArgs.push_back(CXI->getPointerOperand());
+ else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
+ PtrArgs.push_back(RMWI->getPointerOperand());
+ else if (const auto *Call = dyn_cast<CallBase>(I)) {
+ // If we know that the call does not access memory, then we'll still
+ // know that about the inlined clone of this call site, and we don't
+ // need to add metadata.
+ if (Call->doesNotAccessMemory())
+ continue;
+
+ IsFuncCall = true;
+ if (CalleeAAR) {
+ FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(Call);
+ if (AAResults::onlyAccessesArgPointees(MRB))
+ IsArgMemOnlyCall = true;
+ }
+
+ for (Value *Arg : Call->args()) {
+ // We need to check the underlying objects of all arguments, not just
+ // the pointer arguments, because we might be passing pointers as
+ // integers, etc.
+ // However, if we know that the call only accesses pointer arguments,
+ // then we only need to check the pointer arguments.
+ if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy())
+ continue;
+
+ PtrArgs.push_back(Arg);
+ }
+ }
+
+ // If we found no pointers, then this instruction is not suitable for
+ // pairing with an instruction to receive aliasing metadata.
+      // However, if this is a call, then we might just alias with none of the
+ // noalias arguments.
+ if (PtrArgs.empty() && !IsFuncCall)
+ continue;
+
+ // It is possible that there is only one underlying object, but you
+ // need to go through several PHIs to see it, and thus could be
+ // repeated in the Objects list.
+ SmallPtrSet<const Value *, 4> ObjSet;
+ SmallVector<Metadata *, 4> Scopes, NoAliases;
+
+ SmallSetVector<const Argument *, 4> NAPtrArgs;
+ for (const Value *V : PtrArgs) {
+ SmallVector<const Value *, 4> Objects;
getUnderlyingObjects(V, Objects, /* LI = */ nullptr);
-
- for (const Value *O : Objects)
- ObjSet.insert(O);
- }
-
- // Figure out if we're derived from anything that is not a noalias
- // argument.
- bool CanDeriveViaCapture = false, UsesAliasingPtr = false;
- for (const Value *V : ObjSet) {
- // Is this value a constant that cannot be derived from any pointer
- // value (we need to exclude constant expressions, for example, that
- // are formed from arithmetic on global symbols).
- bool IsNonPtrConst = isa<ConstantInt>(V) || isa<ConstantFP>(V) ||
- isa<ConstantPointerNull>(V) ||
- isa<ConstantDataVector>(V) || isa<UndefValue>(V);
- if (IsNonPtrConst)
- continue;
-
- // If this is anything other than a noalias argument, then we cannot
- // completely describe the aliasing properties using alias.scope
- // metadata (and, thus, won't add any).
- if (const Argument *A = dyn_cast<Argument>(V)) {
- if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
- UsesAliasingPtr = true;
- } else {
- UsesAliasingPtr = true;
- }
-
- // If this is not some identified function-local object (which cannot
- // directly alias a noalias argument), or some other argument (which,
- // by definition, also cannot alias a noalias argument), then we could
-        // alias a noalias argument that has been captured.
- if (!isa<Argument>(V) &&
- !isIdentifiedFunctionLocal(const_cast<Value*>(V)))
- CanDeriveViaCapture = true;
- }
-
- // A function call can always get captured noalias pointers (via other
- // parameters, globals, etc.).
- if (IsFuncCall && !IsArgMemOnlyCall)
- CanDeriveViaCapture = true;
-
- // First, we want to figure out all of the sets with which we definitely
-      // don't alias. Iterate over all noalias sets, and add those for which:
- // 1. The noalias argument is not in the set of objects from which we
- // definitely derive.
- // 2. The noalias argument has not yet been captured.
- // An arbitrary function that might load pointers could see captured
- // noalias arguments via other noalias arguments or globals, and so we
- // must always check for prior capture.
- for (const Argument *A : NoAliasArgs) {
- if (!ObjSet.count(A) && (!CanDeriveViaCapture ||
- // It might be tempting to skip the
- // PointerMayBeCapturedBefore check if
- // A->hasNoCaptureAttr() is true, but this is
- // incorrect because nocapture only guarantees
- // that no copies outlive the function, not
- // that the value cannot be locally captured.
- !PointerMayBeCapturedBefore(A,
- /* ReturnCaptures */ false,
- /* StoreCaptures */ false, I, &DT)))
- NoAliases.push_back(NewScopes[A]);
- }
-
- if (!NoAliases.empty())
- NI->setMetadata(LLVMContext::MD_noalias,
- MDNode::concatenate(
- NI->getMetadata(LLVMContext::MD_noalias),
- MDNode::get(CalledFunc->getContext(), NoAliases)));
-
- // Next, we want to figure out all of the sets to which we might belong.
- // We might belong to a set if the noalias argument is in the set of
- // underlying objects. If there is some non-noalias argument in our list
- // of underlying objects, then we cannot add a scope because the fact
- // that some access does not alias with any set of our noalias arguments
- // cannot itself guarantee that it does not alias with this access
- // (because there is some pointer of unknown origin involved and the
- // other access might also depend on this pointer). We also cannot add
- // scopes to arbitrary functions unless we know they don't access any
- // non-parameter pointer-values.
- bool CanAddScopes = !UsesAliasingPtr;
- if (CanAddScopes && IsFuncCall)
- CanAddScopes = IsArgMemOnlyCall;
-
- if (CanAddScopes)
- for (const Argument *A : NoAliasArgs) {
- if (ObjSet.count(A))
- Scopes.push_back(NewScopes[A]);
- }
-
- if (!Scopes.empty())
- NI->setMetadata(
- LLVMContext::MD_alias_scope,
- MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
- MDNode::get(CalledFunc->getContext(), Scopes)));
- }
- }
-}
-
-static bool MayContainThrowingOrExitingCall(Instruction *Begin,
- Instruction *End) {
-
- assert(Begin->getParent() == End->getParent() &&
- "Expected to be in same basic block!");
- unsigned NumInstChecked = 0;
- // Check that all instructions in the range [Begin, End) are guaranteed to
- // transfer execution to successor.
- for (auto &I : make_range(Begin->getIterator(), End->getIterator()))
- if (NumInstChecked++ > InlinerAttributeWindow ||
- !isGuaranteedToTransferExecutionToSuccessor(&I))
- return true;
- return false;
-}
-
-static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
-
- AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex);
- if (AB.empty())
- return AB;
- AttrBuilder Valid;
- // Only allow these white listed attributes to be propagated back to the
- // callee. This is because other attributes may only be valid on the call
- // itself, i.e. attributes such as signext and zeroext.
- if (auto DerefBytes = AB.getDereferenceableBytes())
- Valid.addDereferenceableAttr(DerefBytes);
- if (auto DerefOrNullBytes = AB.getDereferenceableOrNullBytes())
- Valid.addDereferenceableOrNullAttr(DerefOrNullBytes);
- if (AB.contains(Attribute::NoAlias))
- Valid.addAttribute(Attribute::NoAlias);
- if (AB.contains(Attribute::NonNull))
- Valid.addAttribute(Attribute::NonNull);
- return Valid;
-}
-
-static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
- if (!UpdateReturnAttributes)
- return;
-
- AttrBuilder Valid = IdentifyValidAttributes(CB);
- if (Valid.empty())
- return;
- auto *CalledFunction = CB.getCalledFunction();
- auto &Context = CalledFunction->getContext();
-
- for (auto &BB : *CalledFunction) {
- auto *RI = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!RI || !isa<CallBase>(RI->getOperand(0)))
- continue;
- auto *RetVal = cast<CallBase>(RI->getOperand(0));
- // Sanity check that the cloned RetVal exists and is a call, otherwise we
- // cannot add the attributes on the cloned RetVal.
- // Simplification during inlining could have transformed the cloned
- // instruction.
- auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal));
- if (!NewRetVal)
- continue;
- // Backward propagation of attributes to the returned value may be incorrect
- // if it is control flow dependent.
- // Consider:
- // @callee {
- // %rv = call @foo()
- // %rv2 = call @bar()
- // if (%rv2 != null)
- // return %rv2
- // if (%rv == null)
- // exit()
- // return %rv
- // }
- // caller() {
- // %val = call nonnull @callee()
- // }
- // Here we cannot add the nonnull attribute on either foo or bar. So, we
-    // limit the check to the case where RetVal and RI are in the same basic
-    // block and there are no throwing/exiting instructions between them.
- if (RI->getParent() != RetVal->getParent() ||
- MayContainThrowingOrExitingCall(RetVal, RI))
- continue;
- // Add to the existing attributes of NewRetVal, i.e. the cloned call
- // instruction.
- // NB! When we have the same attribute already existing on NewRetVal, but
- // with a differing value, the AttributeList's merge API honours the already
- // existing attribute value (i.e. attributes such as dereferenceable,
- // dereferenceable_or_null etc). See AttrBuilder::merge for more details.
- AttributeList AL = NewRetVal->getAttributes();
- AttributeList NewAL =
- AL.addAttributes(Context, AttributeList::ReturnIndex, Valid);
- NewRetVal->setAttributes(NewAL);
- }
-}
-
-/// If the inlined function has non-byval align arguments, then
-/// add @llvm.assume-based alignment assumptions to preserve this information.
-static void AddAlignmentAssumptions(CallBase &CB, InlineFunctionInfo &IFI) {
- if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache)
- return;
-
- AssumptionCache *AC = &IFI.GetAssumptionCache(*CB.getCaller());
- auto &DL = CB.getCaller()->getParent()->getDataLayout();
-
- // To avoid inserting redundant assumptions, we should check for assumptions
- // already in the caller. To do this, we might need a DT of the caller.
- DominatorTree DT;
- bool DTCalculated = false;
-
- Function *CalledFunc = CB.getCalledFunction();
- for (Argument &Arg : CalledFunc->args()) {
- unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
+
+ for (const Value *O : Objects)
+ ObjSet.insert(O);
+ }
+
+ // Figure out if we're derived from anything that is not a noalias
+ // argument.
+ bool CanDeriveViaCapture = false, UsesAliasingPtr = false;
+ for (const Value *V : ObjSet) {
+ // Is this value a constant that cannot be derived from any pointer
+ // value (we need to exclude constant expressions, for example, that
+ // are formed from arithmetic on global symbols).
+ bool IsNonPtrConst = isa<ConstantInt>(V) || isa<ConstantFP>(V) ||
+ isa<ConstantPointerNull>(V) ||
+ isa<ConstantDataVector>(V) || isa<UndefValue>(V);
+ if (IsNonPtrConst)
+ continue;
+
+ // If this is anything other than a noalias argument, then we cannot
+ // completely describe the aliasing properties using alias.scope
+ // metadata (and, thus, won't add any).
+ if (const Argument *A = dyn_cast<Argument>(V)) {
+ if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
+ UsesAliasingPtr = true;
+ } else {
+ UsesAliasingPtr = true;
+ }
+
+ // If this is not some identified function-local object (which cannot
+ // directly alias a noalias argument), or some other argument (which,
+ // by definition, also cannot alias a noalias argument), then we could
+        // alias a noalias argument that has been captured.
+ if (!isa<Argument>(V) &&
+ !isIdentifiedFunctionLocal(const_cast<Value*>(V)))
+ CanDeriveViaCapture = true;
+ }
+
+ // A function call can always get captured noalias pointers (via other
+ // parameters, globals, etc.).
+ if (IsFuncCall && !IsArgMemOnlyCall)
+ CanDeriveViaCapture = true;
+
+ // First, we want to figure out all of the sets with which we definitely
+      // don't alias. Iterate over all noalias sets, and add those for which:
+ // 1. The noalias argument is not in the set of objects from which we
+ // definitely derive.
+ // 2. The noalias argument has not yet been captured.
+ // An arbitrary function that might load pointers could see captured
+ // noalias arguments via other noalias arguments or globals, and so we
+ // must always check for prior capture.
+ for (const Argument *A : NoAliasArgs) {
+ if (!ObjSet.count(A) && (!CanDeriveViaCapture ||
+ // It might be tempting to skip the
+ // PointerMayBeCapturedBefore check if
+ // A->hasNoCaptureAttr() is true, but this is
+ // incorrect because nocapture only guarantees
+ // that no copies outlive the function, not
+ // that the value cannot be locally captured.
+ !PointerMayBeCapturedBefore(A,
+ /* ReturnCaptures */ false,
+ /* StoreCaptures */ false, I, &DT)))
+ NoAliases.push_back(NewScopes[A]);
+ }
+
+ if (!NoAliases.empty())
+ NI->setMetadata(LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ NI->getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(CalledFunc->getContext(), NoAliases)));
+
+ // Next, we want to figure out all of the sets to which we might belong.
+ // We might belong to a set if the noalias argument is in the set of
+ // underlying objects. If there is some non-noalias argument in our list
+ // of underlying objects, then we cannot add a scope because the fact
+ // that some access does not alias with any set of our noalias arguments
+ // cannot itself guarantee that it does not alias with this access
+ // (because there is some pointer of unknown origin involved and the
+ // other access might also depend on this pointer). We also cannot add
+ // scopes to arbitrary functions unless we know they don't access any
+ // non-parameter pointer-values.
+ bool CanAddScopes = !UsesAliasingPtr;
+ if (CanAddScopes && IsFuncCall)
+ CanAddScopes = IsArgMemOnlyCall;
+
+ if (CanAddScopes)
+ for (const Argument *A : NoAliasArgs) {
+ if (ObjSet.count(A))
+ Scopes.push_back(NewScopes[A]);
+ }
+
+ if (!Scopes.empty())
+ NI->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(CalledFunc->getContext(), Scopes)));
+ }
+ }
+}
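In isolation, the MDBuilder calls used above look like the following sketch; Ctx stands for the callee's LLVMContext and the names are placeholders:

    MDBuilder MDB(Ctx);
    MDNode *Domain = MDB.createAnonymousAliasScopeDomain("callee");
    MDNode *ScopeForA = MDB.createAnonymousAliasScope(Domain, "callee: %a");
    // Accesses derived only from the noalias argument %a get tagged with
    // !alias.scope ScopeForA; accesses provably not derived from %a get
    // tagged with !noalias ScopeForA.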
+
+static bool MayContainThrowingOrExitingCall(Instruction *Begin,
+ Instruction *End) {
+
+ assert(Begin->getParent() == End->getParent() &&
+ "Expected to be in same basic block!");
+ unsigned NumInstChecked = 0;
+ // Check that all instructions in the range [Begin, End) are guaranteed to
+ // transfer execution to successor.
+ for (auto &I : make_range(Begin->getIterator(), End->getIterator()))
+ if (NumInstChecked++ > InlinerAttributeWindow ||
+ !isGuaranteedToTransferExecutionToSuccessor(&I))
+ return true;
+ return false;
+}
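+
+// A minimal sketch of what this window check guards against when it is used
+// by AddReturnAttributes below, assuming a callee body such as:
+//   %rv = call i8* @foo()
+//   call void @exit_if_null(i8* %rv)   ; may exit or unwind
+//   ret i8* %rv
+// A nonnull guarantee on the call site only covers values that actually reach
+// the caller; the intervening call may filter out null, so the attribute must
+// not be copied back onto %rv. Windows like this make the function return true.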
+
+static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
+
+ AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex);
+ if (AB.empty())
+ return AB;
+ AttrBuilder Valid;
+  // Only allow these whitelisted attributes to be propagated back to the
+ // callee. This is because other attributes may only be valid on the call
+ // itself, i.e. attributes such as signext and zeroext.
+ if (auto DerefBytes = AB.getDereferenceableBytes())
+ Valid.addDereferenceableAttr(DerefBytes);
+ if (auto DerefOrNullBytes = AB.getDereferenceableOrNullBytes())
+ Valid.addDereferenceableOrNullAttr(DerefOrNullBytes);
+ if (AB.contains(Attribute::NoAlias))
+ Valid.addAttribute(Attribute::NoAlias);
+ if (AB.contains(Attribute::NonNull))
+ Valid.addAttribute(Attribute::NonNull);
+ return Valid;
+}
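+
+// For example (illustrative only): for a call site such as
+//   %r = call noalias nonnull dereferenceable(16) i8* @callee()
+// the builder returned here carries noalias, nonnull and dereferenceable(16),
+// whereas attributes like signext or zeroext on an integer-returning call
+// site would be dropped, since they only describe the call itself.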
+
+static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
+ if (!UpdateReturnAttributes)
+ return;
+
+ AttrBuilder Valid = IdentifyValidAttributes(CB);
+ if (Valid.empty())
+ return;
+ auto *CalledFunction = CB.getCalledFunction();
+ auto &Context = CalledFunction->getContext();
+
+ for (auto &BB : *CalledFunction) {
+ auto *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI || !isa<CallBase>(RI->getOperand(0)))
+ continue;
+ auto *RetVal = cast<CallBase>(RI->getOperand(0));
+    // Sanity check that the cloned RetVal exists and is a call; otherwise we
+ // cannot add the attributes on the cloned RetVal.
+ // Simplification during inlining could have transformed the cloned
+ // instruction.
+ auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal));
+ if (!NewRetVal)
+ continue;
+ // Backward propagation of attributes to the returned value may be incorrect
+ // if it is control flow dependent.
+ // Consider:
+ // @callee {
+ // %rv = call @foo()
+ // %rv2 = call @bar()
+ // if (%rv2 != null)
+ // return %rv2
+ // if (%rv == null)
+ // exit()
+ // return %rv
+ // }
+ // caller() {
+ // %val = call nonnull @callee()
+ // }
+    // Here we cannot add the nonnull attribute on either foo or bar. So, we
+    // only add the attributes when RetVal and RI are in the same basic block
+    // and there are no throwing/exiting instructions between them.
+ if (RI->getParent() != RetVal->getParent() ||
+ MayContainThrowingOrExitingCall(RetVal, RI))
+ continue;
+ // Add to the existing attributes of NewRetVal, i.e. the cloned call
+ // instruction.
+ // NB! When we have the same attribute already existing on NewRetVal, but
+ // with a differing value, the AttributeList's merge API honours the already
+ // existing attribute value (i.e. attributes such as dereferenceable,
+ // dereferenceable_or_null etc). See AttrBuilder::merge for more details.
+ AttributeList AL = NewRetVal->getAttributes();
+ AttributeList NewAL =
+ AL.addAttributes(Context, AttributeList::ReturnIndex, Valid);
+ NewRetVal->setAttributes(NewAL);
+ }
+}
+
+/// If the inlined function has non-byval align arguments, then
+/// add @llvm.assume-based alignment assumptions to preserve this information.
+static void AddAlignmentAssumptions(CallBase &CB, InlineFunctionInfo &IFI) {
+ if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache)
+ return;
+
+ AssumptionCache *AC = &IFI.GetAssumptionCache(*CB.getCaller());
+ auto &DL = CB.getCaller()->getParent()->getDataLayout();
+
+ // To avoid inserting redundant assumptions, we should check for assumptions
+ // already in the caller. To do this, we might need a DT of the caller.
+ DominatorTree DT;
+ bool DTCalculated = false;
+
+ Function *CalledFunc = CB.getCalledFunction();
+ for (Argument &Arg : CalledFunc->args()) {
+ unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
if (Align && !Arg.hasPassPointeeByValueCopyAttr() && !Arg.hasNUses(0)) {
- if (!DTCalculated) {
- DT.recalculate(*CB.getCaller());
- DTCalculated = true;
- }
-
- // If we can already prove the asserted alignment in the context of the
- // caller, then don't bother inserting the assumption.
- Value *ArgVal = CB.getArgOperand(Arg.getArgNo());
- if (getKnownAlignment(ArgVal, DL, &CB, AC, &DT) >= Align)
- continue;
-
- CallInst *NewAsmp =
- IRBuilder<>(&CB).CreateAlignmentAssumption(DL, ArgVal, Align);
- AC->registerAssumption(NewAsmp);
- }
- }
-}
-
-/// Once we have cloned code over from a callee into the caller,
-/// update the specified callgraph to reflect the changes we made.
-/// Note that it's possible that not all code was copied over, so only
-/// some edges of the callgraph may remain.
-static void UpdateCallGraphAfterInlining(CallBase &CB,
- Function::iterator FirstNewBlock,
- ValueToValueMapTy &VMap,
- InlineFunctionInfo &IFI) {
- CallGraph &CG = *IFI.CG;
- const Function *Caller = CB.getCaller();
- const Function *Callee = CB.getCalledFunction();
- CallGraphNode *CalleeNode = CG[Callee];
- CallGraphNode *CallerNode = CG[Caller];
-
- // Since we inlined some uninlined call sites in the callee into the caller,
- // add edges from the caller to all of the callees of the callee.
- CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
-
- // Consider the case where CalleeNode == CallerNode.
- CallGraphNode::CalledFunctionsVector CallCache;
- if (CalleeNode == CallerNode) {
- CallCache.assign(I, E);
- I = CallCache.begin();
- E = CallCache.end();
- }
-
- for (; I != E; ++I) {
-    // Skip 'reference' call records.
- if (!I->first)
- continue;
-
- const Value *OrigCall = *I->first;
-
- ValueToValueMapTy::iterator VMI = VMap.find(OrigCall);
- // Only copy the edge if the call was inlined!
- if (VMI == VMap.end() || VMI->second == nullptr)
- continue;
-
- // If the call was inlined, but then constant folded, there is no edge to
- // add. Check for this case.
- auto *NewCall = dyn_cast<CallBase>(VMI->second);
- if (!NewCall)
- continue;
-
- // We do not treat intrinsic calls like real function calls because we
- // expect them to become inline code; do not add an edge for an intrinsic.
- if (NewCall->getCalledFunction() &&
- NewCall->getCalledFunction()->isIntrinsic())
- continue;
-
- // Remember that this call site got inlined for the client of
- // InlineFunction.
- IFI.InlinedCalls.push_back(NewCall);
-
- // It's possible that inlining the callsite will cause it to go from an
- // indirect to a direct call by resolving a function pointer. If this
- // happens, set the callee of the new call site to a more precise
- // destination. This can also happen if the call graph node of the caller
- // was just unnecessarily imprecise.
- if (!I->second->getFunction())
- if (Function *F = NewCall->getCalledFunction()) {
- // Indirect call site resolved to direct call.
- CallerNode->addCalledFunction(NewCall, CG[F]);
-
- continue;
- }
-
- CallerNode->addCalledFunction(NewCall, I->second);
- }
-
- // Update the call graph by deleting the edge from Callee to Caller. We must
- // do this after the loop above in case Caller and Callee are the same.
- CallerNode->removeCallEdgeFor(*cast<CallBase>(&CB));
-}
-
-static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
- BasicBlock *InsertBlock,
- InlineFunctionInfo &IFI) {
- Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
- IRBuilder<> Builder(InsertBlock, InsertBlock->begin());
-
- Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy));
-
- // Always generate a memcpy of alignment 1 here because we don't know
- // the alignment of the src pointer. Other optimizations can infer
- // better alignment.
- Builder.CreateMemCpy(Dst, /*DstAlign*/ Align(1), Src,
- /*SrcAlign*/ Align(1), Size);
-}
-
-/// When inlining a call site that has a byval argument,
-/// we have to make the implicit memcpy explicit by adding it.
-static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
- const Function *CalledFunc,
- InlineFunctionInfo &IFI,
- unsigned ByValAlignment) {
- PointerType *ArgTy = cast<PointerType>(Arg->getType());
- Type *AggTy = ArgTy->getElementType();
-
- Function *Caller = TheCall->getFunction();
- const DataLayout &DL = Caller->getParent()->getDataLayout();
-
- // If the called function is readonly, then it could not mutate the caller's
- // copy of the byval'd memory. In this case, it is safe to elide the copy and
- // temporary.
- if (CalledFunc->onlyReadsMemory()) {
- // If the byval argument has a specified alignment that is greater than the
- // passed in pointer, then we either have to round up the input pointer or
- // give up on this transformation.
- if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment.
- return Arg;
-
- AssumptionCache *AC =
- IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
-
- // If the pointer is already known to be sufficiently aligned, or if we can
- // round it up to a larger alignment, then we don't need a temporary.
- if (getOrEnforceKnownAlignment(Arg, Align(ByValAlignment), DL, TheCall,
- AC) >= ByValAlignment)
- return Arg;
-
- // Otherwise, we have to make a memcpy to get a safe alignment. This is bad
- // for code quality, but rarely happens and is required for correctness.
- }
-
- // Create the alloca. If we have DataLayout, use nice alignment.
- Align Alignment(DL.getPrefTypeAlignment(AggTy));
-
- // If the byval had an alignment specified, we *must* use at least that
- // alignment, as it is required by the byval argument (and uses of the
- // pointer inside the callee).
- Alignment = max(Alignment, MaybeAlign(ByValAlignment));
-
- Value *NewAlloca =
- new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
- Arg->getName(), &*Caller->begin()->begin());
- IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
-
- // Uses of the argument in the function should use our new alloca
- // instead.
- return NewAlloca;
-}
-
-// Check whether this Value is used by a lifetime intrinsic.
-static bool isUsedByLifetimeMarker(Value *V) {
- for (User *U : V->users())
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U))
- if (II->isLifetimeStartOrEnd())
- return true;
- return false;
-}
-
-// Check whether the given alloca already has
-// lifetime.start or lifetime.end intrinsics.
-static bool hasLifetimeMarkers(AllocaInst *AI) {
- Type *Ty = AI->getType();
- Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(),
- Ty->getPointerAddressSpace());
- if (Ty == Int8PtrTy)
- return isUsedByLifetimeMarker(AI);
-
- // Do a scan to find all the casts to i8*.
- for (User *U : AI->users()) {
- if (U->getType() != Int8PtrTy) continue;
- if (U->stripPointerCasts() != AI) continue;
- if (isUsedByLifetimeMarker(U))
- return true;
- }
- return false;
-}
-
-/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
-/// block. Allocas used in inalloca calls and allocas of dynamic array size
-/// cannot be static.
-static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) {
- return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca();
-}
-
-/// Returns a DebugLoc for a new DILocation which is a clone of \p OrigDL
-/// inlined at \p InlinedAt. \p IANodes is an inlined-at cache.
-static DebugLoc inlineDebugLoc(DebugLoc OrigDL, DILocation *InlinedAt,
- LLVMContext &Ctx,
- DenseMap<const MDNode *, MDNode *> &IANodes) {
- auto IA = DebugLoc::appendInlinedAt(OrigDL, InlinedAt, Ctx, IANodes);
+ if (!DTCalculated) {
+ DT.recalculate(*CB.getCaller());
+ DTCalculated = true;
+ }
+
+ // If we can already prove the asserted alignment in the context of the
+ // caller, then don't bother inserting the assumption.
+ Value *ArgVal = CB.getArgOperand(Arg.getArgNo());
+ if (getKnownAlignment(ArgVal, DL, &CB, AC, &DT) >= Align)
+ continue;
+
+ CallInst *NewAsmp =
+ IRBuilder<>(&CB).CreateAlignmentAssumption(DL, ArgVal, Align);
+ AC->registerAssumption(NewAsmp);
+ }
+ }
+}
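+
+// Illustrative sketch, assuming alignment assumptions are being preserved
+// (see the early return above): for a callee declared as
+//   define void @callee(i32* align 32 %p) { ... }
+// inlining a call @callee(i32* %q) emits an @llvm.assume-based alignment
+// assumption about %q in the caller, unless getKnownAlignment can already
+// prove that %q is 32-byte aligned at the call site.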
+
+/// Once we have cloned code over from a callee into the caller,
+/// update the specified callgraph to reflect the changes we made.
+/// Note that it's possible that not all code was copied over, so only
+/// some edges of the callgraph may remain.
+static void UpdateCallGraphAfterInlining(CallBase &CB,
+ Function::iterator FirstNewBlock,
+ ValueToValueMapTy &VMap,
+ InlineFunctionInfo &IFI) {
+ CallGraph &CG = *IFI.CG;
+ const Function *Caller = CB.getCaller();
+ const Function *Callee = CB.getCalledFunction();
+ CallGraphNode *CalleeNode = CG[Callee];
+ CallGraphNode *CallerNode = CG[Caller];
+
+ // Since we inlined some uninlined call sites in the callee into the caller,
+ // add edges from the caller to all of the callees of the callee.
+ CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
+
+ // Consider the case where CalleeNode == CallerNode.
+ CallGraphNode::CalledFunctionsVector CallCache;
+ if (CalleeNode == CallerNode) {
+ CallCache.assign(I, E);
+ I = CallCache.begin();
+ E = CallCache.end();
+ }
+
+ for (; I != E; ++I) {
+    // Skip 'reference' call records.
+ if (!I->first)
+ continue;
+
+ const Value *OrigCall = *I->first;
+
+ ValueToValueMapTy::iterator VMI = VMap.find(OrigCall);
+ // Only copy the edge if the call was inlined!
+ if (VMI == VMap.end() || VMI->second == nullptr)
+ continue;
+
+ // If the call was inlined, but then constant folded, there is no edge to
+ // add. Check for this case.
+ auto *NewCall = dyn_cast<CallBase>(VMI->second);
+ if (!NewCall)
+ continue;
+
+ // We do not treat intrinsic calls like real function calls because we
+ // expect them to become inline code; do not add an edge for an intrinsic.
+ if (NewCall->getCalledFunction() &&
+ NewCall->getCalledFunction()->isIntrinsic())
+ continue;
+
+ // Remember that this call site got inlined for the client of
+ // InlineFunction.
+ IFI.InlinedCalls.push_back(NewCall);
+
+ // It's possible that inlining the callsite will cause it to go from an
+ // indirect to a direct call by resolving a function pointer. If this
+ // happens, set the callee of the new call site to a more precise
+ // destination. This can also happen if the call graph node of the caller
+ // was just unnecessarily imprecise.
+ if (!I->second->getFunction())
+ if (Function *F = NewCall->getCalledFunction()) {
+ // Indirect call site resolved to direct call.
+ CallerNode->addCalledFunction(NewCall, CG[F]);
+
+ continue;
+ }
+
+ CallerNode->addCalledFunction(NewCall, I->second);
+ }
+
+ // Update the call graph by deleting the edge from Callee to Caller. We must
+ // do this after the loop above in case Caller and Callee are the same.
+ CallerNode->removeCallEdgeFor(*cast<CallBase>(&CB));
+}
+
+static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
+ BasicBlock *InsertBlock,
+ InlineFunctionInfo &IFI) {
+ Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
+ IRBuilder<> Builder(InsertBlock, InsertBlock->begin());
+
+ Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy));
+
+ // Always generate a memcpy of alignment 1 here because we don't know
+ // the alignment of the src pointer. Other optimizations can infer
+ // better alignment.
+ Builder.CreateMemCpy(Dst, /*DstAlign*/ Align(1), Src,
+ /*SrcAlign*/ Align(1), Size);
+}
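+
+// Roughly, for a byval struct argument this emits (illustrative IR only):
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst.i8,
+//                                        i8* align 1 %src.i8,
+//                                        i64 <store size>, i1 false)
+// at the top of InsertBlock, with both alignments deliberately left at 1 so
+// that later passes can infer something better, as noted above.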
+
+/// When inlining a call site that has a byval argument,
+/// we have to make the implicit memcpy explicit by adding it.
+static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
+ const Function *CalledFunc,
+ InlineFunctionInfo &IFI,
+ unsigned ByValAlignment) {
+ PointerType *ArgTy = cast<PointerType>(Arg->getType());
+ Type *AggTy = ArgTy->getElementType();
+
+ Function *Caller = TheCall->getFunction();
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
+ // If the called function is readonly, then it could not mutate the caller's
+ // copy of the byval'd memory. In this case, it is safe to elide the copy and
+ // temporary.
+ if (CalledFunc->onlyReadsMemory()) {
+ // If the byval argument has a specified alignment that is greater than the
+ // passed in pointer, then we either have to round up the input pointer or
+ // give up on this transformation.
+ if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment.
+ return Arg;
+
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
+
+ // If the pointer is already known to be sufficiently aligned, or if we can
+ // round it up to a larger alignment, then we don't need a temporary.
+ if (getOrEnforceKnownAlignment(Arg, Align(ByValAlignment), DL, TheCall,
+ AC) >= ByValAlignment)
+ return Arg;
+
+ // Otherwise, we have to make a memcpy to get a safe alignment. This is bad
+ // for code quality, but rarely happens and is required for correctness.
+ }
+
+ // Create the alloca. If we have DataLayout, use nice alignment.
+ Align Alignment(DL.getPrefTypeAlignment(AggTy));
+
+ // If the byval had an alignment specified, we *must* use at least that
+ // alignment, as it is required by the byval argument (and uses of the
+ // pointer inside the callee).
+ Alignment = max(Alignment, MaybeAlign(ByValAlignment));
+
+ Value *NewAlloca =
+ new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
+ Arg->getName(), &*Caller->begin()->begin());
+ IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
+
+ // Uses of the argument in the function should use our new alloca
+ // instead.
+ return NewAlloca;
+}
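+
+// In short: if the callee only reads memory and the incoming pointer is (or
+// can be made) sufficiently aligned, the original pointer is reused and no
+// copy is made; otherwise a static alloca is created in the caller's entry
+// block and the actual memcpy into it is emitted later through
+// HandleByValArgumentInit above. For example, a readonly callee taking
+//   %struct.S* byval(%struct.S) align 8 %s
+// needs no temporary when the passed pointer is already known to be 8-byte
+// aligned.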
+
+// Check whether this Value is used by a lifetime intrinsic.
+static bool isUsedByLifetimeMarker(Value *V) {
+ for (User *U : V->users())
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U))
+ if (II->isLifetimeStartOrEnd())
+ return true;
+ return false;
+}
+
+// Check whether the given alloca already has
+// lifetime.start or lifetime.end intrinsics.
+static bool hasLifetimeMarkers(AllocaInst *AI) {
+ Type *Ty = AI->getType();
+ Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(),
+ Ty->getPointerAddressSpace());
+ if (Ty == Int8PtrTy)
+ return isUsedByLifetimeMarker(AI);
+
+ // Do a scan to find all the casts to i8*.
+ for (User *U : AI->users()) {
+ if (U->getType() != Int8PtrTy) continue;
+ if (U->stripPointerCasts() != AI) continue;
+ if (isUsedByLifetimeMarker(U))
+ return true;
+ }
+ return false;
+}
+
+/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
+/// block. Allocas used in inalloca calls and allocas of dynamic array size
+/// cannot be static.
+static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) {
+ return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca();
+}
+
+/// Returns a DebugLoc for a new DILocation which is a clone of \p OrigDL
+/// inlined at \p InlinedAt. \p IANodes is an inlined-at cache.
+static DebugLoc inlineDebugLoc(DebugLoc OrigDL, DILocation *InlinedAt,
+ LLVMContext &Ctx,
+ DenseMap<const MDNode *, MDNode *> &IANodes) {
+ auto IA = DebugLoc::appendInlinedAt(OrigDL, InlinedAt, Ctx, IANodes);
return DILocation::get(Ctx, OrigDL.getLine(), OrigDL.getCol(),
OrigDL.getScope(), IA);
-}
-
-/// Update inlined instructions' line numbers to encode the location where
-/// these instructions are inlined.
-static void fixupLineNumbers(Function *Fn, Function::iterator FI,
- Instruction *TheCall, bool CalleeHasDebugInfo) {
- const DebugLoc &TheCallDL = TheCall->getDebugLoc();
- if (!TheCallDL)
- return;
-
- auto &Ctx = Fn->getContext();
- DILocation *InlinedAtNode = TheCallDL;
-
- // Create a unique call site, not to be confused with any other call from the
- // same location.
- InlinedAtNode = DILocation::getDistinct(
- Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(),
- InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt());
-
-  // Cache the inlined-at nodes as they're built so they are reused; without
-  // this, every instruction's inlined-at chain would become distinct from the
-  // others.
- DenseMap<const MDNode *, MDNode *> IANodes;
-
- // Check if we are not generating inline line tables and want to use
- // the call site location instead.
- bool NoInlineLineTables = Fn->hasFnAttribute("no-inline-line-tables");
-
- for (; FI != Fn->end(); ++FI) {
- for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
- BI != BE; ++BI) {
- // Loop metadata needs to be updated so that the start and end locs
- // reference inlined-at locations.
- auto updateLoopInfoLoc = [&Ctx, &InlinedAtNode, &IANodes](
- const DILocation &Loc) -> DILocation * {
- return inlineDebugLoc(&Loc, InlinedAtNode, Ctx, IANodes).get();
- };
- updateLoopMetadataDebugLocations(*BI, updateLoopInfoLoc);
-
- if (!NoInlineLineTables)
- if (DebugLoc DL = BI->getDebugLoc()) {
- DebugLoc IDL =
- inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
- BI->setDebugLoc(IDL);
- continue;
- }
-
- if (CalleeHasDebugInfo && !NoInlineLineTables)
- continue;
-
- // If the inlined instruction has no line number, or if inline info
- // is not being generated, make it look as if it originates from the call
- // location. This is important for ((__always_inline, __nodebug__))
- // functions which must use caller location for all instructions in their
- // function body.
-
- // Don't update static allocas, as they may get moved later.
- if (auto *AI = dyn_cast<AllocaInst>(BI))
- if (allocaWouldBeStaticInEntry(AI))
- continue;
-
- BI->setDebugLoc(TheCallDL);
- }
-
- // Remove debug info intrinsics if we're not keeping inline info.
- if (NoInlineLineTables) {
- BasicBlock::iterator BI = FI->begin();
- while (BI != FI->end()) {
- if (isa<DbgInfoIntrinsic>(BI)) {
- BI = BI->eraseFromParent();
- continue;
- }
- ++BI;
- }
- }
-
- }
-}
-
-/// Update the block frequencies of the caller after a callee has been inlined.
-///
-/// Each block cloned into the caller has its block frequency scaled by the
-/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
-/// callee's entry block gets the same frequency as the callsite block and the
-/// relative frequencies of all cloned blocks remain the same after cloning.
-static void updateCallerBFI(BasicBlock *CallSiteBlock,
- const ValueToValueMapTy &VMap,
- BlockFrequencyInfo *CallerBFI,
- BlockFrequencyInfo *CalleeBFI,
- const BasicBlock &CalleeEntryBlock) {
- SmallPtrSet<BasicBlock *, 16> ClonedBBs;
- for (auto Entry : VMap) {
- if (!isa<BasicBlock>(Entry.first) || !Entry.second)
- continue;
- auto *OrigBB = cast<BasicBlock>(Entry.first);
- auto *ClonedBB = cast<BasicBlock>(Entry.second);
- uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
- if (!ClonedBBs.insert(ClonedBB).second) {
- // Multiple blocks in the callee might get mapped to one cloned block in
- // the caller since we prune the callee as we clone it. When that happens,
- // we want to use the maximum among the original blocks' frequencies.
- uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
- if (NewFreq > Freq)
- Freq = NewFreq;
- }
- CallerBFI->setBlockFreq(ClonedBB, Freq);
- }
- BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
- CallerBFI->setBlockFreqAndScale(
- EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
- ClonedBBs);
-}
-
-/// Update the branch metadata for cloned call instructions.
-static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
- const ProfileCount &CalleeEntryCount,
- const CallBase &TheCall, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *CallerBFI) {
- if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
- CalleeEntryCount.getCount() < 1)
- return;
- auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
- int64_t CallCount =
+}
+
+/// Update inlined instructions' line numbers to encode the location where
+/// these instructions are inlined.
+static void fixupLineNumbers(Function *Fn, Function::iterator FI,
+ Instruction *TheCall, bool CalleeHasDebugInfo) {
+ const DebugLoc &TheCallDL = TheCall->getDebugLoc();
+ if (!TheCallDL)
+ return;
+
+ auto &Ctx = Fn->getContext();
+ DILocation *InlinedAtNode = TheCallDL;
+
+ // Create a unique call site, not to be confused with any other call from the
+ // same location.
+ InlinedAtNode = DILocation::getDistinct(
+ Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(),
+ InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt());
+
+  // Cache the inlined-at nodes as they're built so they are reused; without
+  // this, every instruction's inlined-at chain would become distinct from the
+  // others.
+ DenseMap<const MDNode *, MDNode *> IANodes;
+
+ // Check if we are not generating inline line tables and want to use
+ // the call site location instead.
+ bool NoInlineLineTables = Fn->hasFnAttribute("no-inline-line-tables");
+
+ for (; FI != Fn->end(); ++FI) {
+ for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
+ BI != BE; ++BI) {
+ // Loop metadata needs to be updated so that the start and end locs
+ // reference inlined-at locations.
+ auto updateLoopInfoLoc = [&Ctx, &InlinedAtNode, &IANodes](
+ const DILocation &Loc) -> DILocation * {
+ return inlineDebugLoc(&Loc, InlinedAtNode, Ctx, IANodes).get();
+ };
+ updateLoopMetadataDebugLocations(*BI, updateLoopInfoLoc);
+
+ if (!NoInlineLineTables)
+ if (DebugLoc DL = BI->getDebugLoc()) {
+ DebugLoc IDL =
+ inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
+ BI->setDebugLoc(IDL);
+ continue;
+ }
+
+ if (CalleeHasDebugInfo && !NoInlineLineTables)
+ continue;
+
+ // If the inlined instruction has no line number, or if inline info
+ // is not being generated, make it look as if it originates from the call
+ // location. This is important for ((__always_inline, __nodebug__))
+ // functions which must use caller location for all instructions in their
+ // function body.
+
+ // Don't update static allocas, as they may get moved later.
+ if (auto *AI = dyn_cast<AllocaInst>(BI))
+ if (allocaWouldBeStaticInEntry(AI))
+ continue;
+
+ BI->setDebugLoc(TheCallDL);
+ }
+
+ // Remove debug info intrinsics if we're not keeping inline info.
+ if (NoInlineLineTables) {
+ BasicBlock::iterator BI = FI->begin();
+ while (BI != FI->end()) {
+ if (isa<DbgInfoIntrinsic>(BI)) {
+ BI = BI->eraseFromParent();
+ continue;
+ }
+ ++BI;
+ }
+ }
+
+ }
+}
+
+/// Update the block frequencies of the caller after a callee has been inlined.
+///
+/// Each block cloned into the caller has its block frequency scaled by the
+/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
+/// callee's entry block gets the same frequency as the callsite block and the
+/// relative frequencies of all cloned blocks remain the same after cloning.
+static void updateCallerBFI(BasicBlock *CallSiteBlock,
+ const ValueToValueMapTy &VMap,
+ BlockFrequencyInfo *CallerBFI,
+ BlockFrequencyInfo *CalleeBFI,
+ const BasicBlock &CalleeEntryBlock) {
+ SmallPtrSet<BasicBlock *, 16> ClonedBBs;
+ for (auto Entry : VMap) {
+ if (!isa<BasicBlock>(Entry.first) || !Entry.second)
+ continue;
+ auto *OrigBB = cast<BasicBlock>(Entry.first);
+ auto *ClonedBB = cast<BasicBlock>(Entry.second);
+ uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
+ if (!ClonedBBs.insert(ClonedBB).second) {
+ // Multiple blocks in the callee might get mapped to one cloned block in
+ // the caller since we prune the callee as we clone it. When that happens,
+ // we want to use the maximum among the original blocks' frequencies.
+ uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
+ if (NewFreq > Freq)
+ Freq = NewFreq;
+ }
+ CallerBFI->setBlockFreq(ClonedBB, Freq);
+ }
+ BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
+ CallerBFI->setBlockFreqAndScale(
+ EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
+ ClonedBBs);
+}
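+
+// Numeric sketch of the scaling described above: if the callee's entry block
+// has frequency 8, one of its blocks has frequency 4, and the call site block
+// in the caller has frequency 16, the cloned copy of that block ends up at
+// roughly 4 * 16 / 8 = 8 once setBlockFreqAndScale rescales the clones.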
+
+/// Update the branch metadata for cloned call instructions.
+static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
+ const ProfileCount &CalleeEntryCount,
+ const CallBase &TheCall, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *CallerBFI) {
+ if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
+ CalleeEntryCount.getCount() < 1)
+ return;
+ auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
+ int64_t CallCount =
std::min(CallSiteCount.getValueOr(0), CalleeEntryCount.getCount());
- updateProfileCallee(Callee, -CallCount, &VMap);
-}
-
-void llvm::updateProfileCallee(
- Function *Callee, int64_t entryDelta,
- const ValueMap<const Value *, WeakTrackingVH> *VMap) {
- auto CalleeCount = Callee->getEntryCount();
- if (!CalleeCount.hasValue())
- return;
-
- uint64_t priorEntryCount = CalleeCount.getCount();
- uint64_t newEntryCount;
-
- // Since CallSiteCount is an estimate, it could exceed the original callee
-  // count, so clamp the new count to 0 to guard against underflow.
- if (entryDelta < 0 && static_cast<uint64_t>(-entryDelta) > priorEntryCount)
- newEntryCount = 0;
- else
- newEntryCount = priorEntryCount + entryDelta;
-
-  // During inlining?
- if (VMap) {
- uint64_t cloneEntryCount = priorEntryCount - newEntryCount;
- for (auto Entry : *VMap)
- if (isa<CallInst>(Entry.first))
- if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
- CI->updateProfWeight(cloneEntryCount, priorEntryCount);
- }
-
- if (entryDelta) {
- Callee->setEntryCount(newEntryCount);
-
- for (BasicBlock &BB : *Callee)
- // No need to update the callsite if it is pruned during inlining.
- if (!VMap || VMap->count(&BB))
- for (Instruction &I : BB)
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- CI->updateProfWeight(newEntryCount, priorEntryCount);
- }
-}
-
-/// This function inlines the called function into the basic block of the
-/// caller. This returns false if it is not possible to inline this call.
-/// The program is still in a well defined state if this occurs though.
-///
-/// Note that this only does one level of inlining. For example, if the
-/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
-/// exists in the instruction stream. Similarly this will inline a recursive
-/// function by one level.
-llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
- AAResults *CalleeAAR,
- bool InsertLifetime,
- Function *ForwardVarArgsTo) {
- assert(CB.getParent() && CB.getFunction() && "Instruction not in function!");
-
- // FIXME: we don't inline callbr yet.
- if (isa<CallBrInst>(CB))
- return InlineResult::failure("We don't inline callbr yet.");
-
- // If IFI has any state in it, zap it before we fill it in.
- IFI.reset();
-
- Function *CalledFunc = CB.getCalledFunction();
- if (!CalledFunc || // Can't inline external function or indirect
- CalledFunc->isDeclaration()) // call!
- return InlineResult::failure("external or indirect");
-
- // The inliner does not know how to inline through calls with operand bundles
- // in general ...
- if (CB.hasOperandBundles()) {
- for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) {
- uint32_t Tag = CB.getOperandBundleAt(i).getTagID();
- // ... but it knows how to inline through "deopt" operand bundles ...
- if (Tag == LLVMContext::OB_deopt)
- continue;
- // ... and "funclet" operand bundles.
- if (Tag == LLVMContext::OB_funclet)
- continue;
-
- return InlineResult::failure("unsupported operand bundle");
- }
- }
-
- // If the call to the callee cannot throw, set the 'nounwind' flag on any
- // calls that we inline.
- bool MarkNoUnwind = CB.doesNotThrow();
-
- BasicBlock *OrigBB = CB.getParent();
- Function *Caller = OrigBB->getParent();
-
- // GC poses two hazards to inlining, which only occur when the callee has GC:
- // 1. If the caller has no GC, then the callee's GC must be propagated to the
- // caller.
- // 2. If the caller has a differing GC, it is invalid to inline.
- if (CalledFunc->hasGC()) {
- if (!Caller->hasGC())
- Caller->setGC(CalledFunc->getGC());
- else if (CalledFunc->getGC() != Caller->getGC())
- return InlineResult::failure("incompatible GC");
- }
-
- // Get the personality function from the callee if it contains a landing pad.
- Constant *CalledPersonality =
- CalledFunc->hasPersonalityFn()
- ? CalledFunc->getPersonalityFn()->stripPointerCasts()
- : nullptr;
-
- // Find the personality function used by the landing pads of the caller. If it
- // exists, then check to see that it matches the personality function used in
- // the callee.
- Constant *CallerPersonality =
- Caller->hasPersonalityFn()
- ? Caller->getPersonalityFn()->stripPointerCasts()
- : nullptr;
- if (CalledPersonality) {
- if (!CallerPersonality)
- Caller->setPersonalityFn(CalledPersonality);
- // If the personality functions match, then we can perform the
- // inlining. Otherwise, we can't inline.
- // TODO: This isn't 100% true. Some personality functions are proper
- // supersets of others and can be used in place of the other.
- else if (CalledPersonality != CallerPersonality)
- return InlineResult::failure("incompatible personality");
- }
-
- // We need to figure out which funclet the callsite was in so that we may
- // properly nest the callee.
- Instruction *CallSiteEHPad = nullptr;
- if (CallerPersonality) {
- EHPersonality Personality = classifyEHPersonality(CallerPersonality);
- if (isScopedEHPersonality(Personality)) {
- Optional<OperandBundleUse> ParentFunclet =
- CB.getOperandBundle(LLVMContext::OB_funclet);
- if (ParentFunclet)
- CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
-
- // OK, the inlining site is legal. What about the target function?
-
- if (CallSiteEHPad) {
- if (Personality == EHPersonality::MSVC_CXX) {
- // The MSVC personality cannot tolerate catches getting inlined into
- // cleanup funclets.
- if (isa<CleanupPadInst>(CallSiteEHPad)) {
- // Ok, the call site is within a cleanuppad. Let's check the callee
- // for catchpads.
- for (const BasicBlock &CalledBB : *CalledFunc) {
- if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI()))
- return InlineResult::failure("catch in cleanup funclet");
- }
- }
- } else if (isAsynchronousEHPersonality(Personality)) {
-          // SEH is even less tolerant; there may not be any sort of exceptional
- // funclet in the callee.
- for (const BasicBlock &CalledBB : *CalledFunc) {
- if (CalledBB.isEHPad())
- return InlineResult::failure("SEH in cleanup funclet");
- }
- }
- }
- }
- }
-
- // Determine if we are dealing with a call in an EHPad which does not unwind
- // to caller.
- bool EHPadForCallUnwindsLocally = false;
- if (CallSiteEHPad && isa<CallInst>(CB)) {
- UnwindDestMemoTy FuncletUnwindMap;
- Value *CallSiteUnwindDestToken =
- getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
-
- EHPadForCallUnwindsLocally =
- CallSiteUnwindDestToken &&
- !isa<ConstantTokenNone>(CallSiteUnwindDestToken);
- }
-
- // Get an iterator to the last basic block in the function, which will have
- // the new function inlined after it.
- Function::iterator LastBlock = --Caller->end();
-
- // Make sure to capture all of the return instructions from the cloned
- // function.
- SmallVector<ReturnInst*, 8> Returns;
- ClonedCodeInfo InlinedFunctionInfo;
- Function::iterator FirstNewBlock;
-
- { // Scope to destroy VMap after cloning.
- ValueToValueMapTy VMap;
- // Keep a list of pair (dst, src) to emit byval initializations.
- SmallVector<std::pair<Value*, Value*>, 4> ByValInit;
-
+ updateProfileCallee(Callee, -CallCount, &VMap);
+}
+
+void llvm::updateProfileCallee(
+ Function *Callee, int64_t entryDelta,
+ const ValueMap<const Value *, WeakTrackingVH> *VMap) {
+ auto CalleeCount = Callee->getEntryCount();
+ if (!CalleeCount.hasValue())
+ return;
+
+ uint64_t priorEntryCount = CalleeCount.getCount();
+ uint64_t newEntryCount;
+
+ // Since CallSiteCount is an estimate, it could exceed the original callee
+  // count, so clamp the new count to 0 to guard against underflow.
+ if (entryDelta < 0 && static_cast<uint64_t>(-entryDelta) > priorEntryCount)
+ newEntryCount = 0;
+ else
+ newEntryCount = priorEntryCount + entryDelta;
+
+  // During inlining?
+ if (VMap) {
+ uint64_t cloneEntryCount = priorEntryCount - newEntryCount;
+ for (auto Entry : *VMap)
+ if (isa<CallInst>(Entry.first))
+ if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
+ CI->updateProfWeight(cloneEntryCount, priorEntryCount);
+ }
+
+ if (entryDelta) {
+ Callee->setEntryCount(newEntryCount);
+
+ for (BasicBlock &BB : *Callee)
+ // No need to update the callsite if it is pruned during inlining.
+ if (!VMap || VMap->count(&BB))
+ for (Instruction &I : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ CI->updateProfWeight(newEntryCount, priorEntryCount);
+ }
+}
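+
+// Worked example of the arithmetic above: with a prior entry count of 100 and
+// entryDelta = -30 (a call site accounting for 30 calls was inlined), the
+// callee's entry count becomes 70; calls cloned into the caller have their
+// profile weights scaled by 30/100 and the calls remaining in the callee by
+// 70/100, so the two copies together still roughly account for the original
+// counts.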
+
+/// This function inlines the called function into the basic block of the
+/// caller. This returns false if it is not possible to inline this call.
+/// The program is still in a well defined state if this occurs though.
+///
+/// Note that this only does one level of inlining. For example, if the
+/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+/// exists in the instruction stream. Similarly this will inline a recursive
+/// function by one level.
+llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
+ AAResults *CalleeAAR,
+ bool InsertLifetime,
+ Function *ForwardVarArgsTo) {
+ assert(CB.getParent() && CB.getFunction() && "Instruction not in function!");
+
+ // FIXME: we don't inline callbr yet.
+ if (isa<CallBrInst>(CB))
+ return InlineResult::failure("We don't inline callbr yet.");
+
+ // If IFI has any state in it, zap it before we fill it in.
+ IFI.reset();
+
+ Function *CalledFunc = CB.getCalledFunction();
+ if (!CalledFunc || // Can't inline external function or indirect
+ CalledFunc->isDeclaration()) // call!
+ return InlineResult::failure("external or indirect");
+
+ // The inliner does not know how to inline through calls with operand bundles
+ // in general ...
+ if (CB.hasOperandBundles()) {
+ for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) {
+ uint32_t Tag = CB.getOperandBundleAt(i).getTagID();
+ // ... but it knows how to inline through "deopt" operand bundles ...
+ if (Tag == LLVMContext::OB_deopt)
+ continue;
+ // ... and "funclet" operand bundles.
+ if (Tag == LLVMContext::OB_funclet)
+ continue;
+
+ return InlineResult::failure("unsupported operand bundle");
+ }
+ }
+
+ // If the call to the callee cannot throw, set the 'nounwind' flag on any
+ // calls that we inline.
+ bool MarkNoUnwind = CB.doesNotThrow();
+
+ BasicBlock *OrigBB = CB.getParent();
+ Function *Caller = OrigBB->getParent();
+
+ // GC poses two hazards to inlining, which only occur when the callee has GC:
+ // 1. If the caller has no GC, then the callee's GC must be propagated to the
+ // caller.
+ // 2. If the caller has a differing GC, it is invalid to inline.
+ if (CalledFunc->hasGC()) {
+ if (!Caller->hasGC())
+ Caller->setGC(CalledFunc->getGC());
+ else if (CalledFunc->getGC() != Caller->getGC())
+ return InlineResult::failure("incompatible GC");
+ }
+
+ // Get the personality function from the callee if it contains a landing pad.
+ Constant *CalledPersonality =
+ CalledFunc->hasPersonalityFn()
+ ? CalledFunc->getPersonalityFn()->stripPointerCasts()
+ : nullptr;
+
+ // Find the personality function used by the landing pads of the caller. If it
+ // exists, then check to see that it matches the personality function used in
+ // the callee.
+ Constant *CallerPersonality =
+ Caller->hasPersonalityFn()
+ ? Caller->getPersonalityFn()->stripPointerCasts()
+ : nullptr;
+ if (CalledPersonality) {
+ if (!CallerPersonality)
+ Caller->setPersonalityFn(CalledPersonality);
+ // If the personality functions match, then we can perform the
+ // inlining. Otherwise, we can't inline.
+ // TODO: This isn't 100% true. Some personality functions are proper
+ // supersets of others and can be used in place of the other.
+ else if (CalledPersonality != CallerPersonality)
+ return InlineResult::failure("incompatible personality");
+ }
+
+ // We need to figure out which funclet the callsite was in so that we may
+ // properly nest the callee.
+ Instruction *CallSiteEHPad = nullptr;
+ if (CallerPersonality) {
+ EHPersonality Personality = classifyEHPersonality(CallerPersonality);
+ if (isScopedEHPersonality(Personality)) {
+ Optional<OperandBundleUse> ParentFunclet =
+ CB.getOperandBundle(LLVMContext::OB_funclet);
+ if (ParentFunclet)
+ CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
+
+ // OK, the inlining site is legal. What about the target function?
+
+ if (CallSiteEHPad) {
+ if (Personality == EHPersonality::MSVC_CXX) {
+ // The MSVC personality cannot tolerate catches getting inlined into
+ // cleanup funclets.
+ if (isa<CleanupPadInst>(CallSiteEHPad)) {
+ // Ok, the call site is within a cleanuppad. Let's check the callee
+ // for catchpads.
+ for (const BasicBlock &CalledBB : *CalledFunc) {
+ if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI()))
+ return InlineResult::failure("catch in cleanup funclet");
+ }
+ }
+ } else if (isAsynchronousEHPersonality(Personality)) {
+          // SEH is even less tolerant; there may not be any sort of exceptional
+ // funclet in the callee.
+ for (const BasicBlock &CalledBB : *CalledFunc) {
+ if (CalledBB.isEHPad())
+ return InlineResult::failure("SEH in cleanup funclet");
+ }
+ }
+ }
+ }
+ }
+
+ // Determine if we are dealing with a call in an EHPad which does not unwind
+ // to caller.
+ bool EHPadForCallUnwindsLocally = false;
+ if (CallSiteEHPad && isa<CallInst>(CB)) {
+ UnwindDestMemoTy FuncletUnwindMap;
+ Value *CallSiteUnwindDestToken =
+ getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
+
+ EHPadForCallUnwindsLocally =
+ CallSiteUnwindDestToken &&
+ !isa<ConstantTokenNone>(CallSiteUnwindDestToken);
+ }
+
+ // Get an iterator to the last basic block in the function, which will have
+ // the new function inlined after it.
+ Function::iterator LastBlock = --Caller->end();
+
+ // Make sure to capture all of the return instructions from the cloned
+ // function.
+ SmallVector<ReturnInst*, 8> Returns;
+ ClonedCodeInfo InlinedFunctionInfo;
+ Function::iterator FirstNewBlock;
+
+ { // Scope to destroy VMap after cloning.
+ ValueToValueMapTy VMap;
+ // Keep a list of pair (dst, src) to emit byval initializations.
+ SmallVector<std::pair<Value*, Value*>, 4> ByValInit;
+
// When inlining a function that contains noalias scope metadata,
// this metadata needs to be cloned so that the inlined blocks
// have different "unique scopes" at every call site.
@@ -1793,732 +1793,732 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
// callee.
ScopedAliasMetadataDeepCloner SAMetadataCloner(CB.getCalledFunction());
- auto &DL = Caller->getParent()->getDataLayout();
-
- // Calculate the vector of arguments to pass into the function cloner, which
- // matches up the formal to the actual argument values.
- auto AI = CB.arg_begin();
- unsigned ArgNo = 0;
- for (Function::arg_iterator I = CalledFunc->arg_begin(),
- E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
- Value *ActualArg = *AI;
-
-      // When byval arguments are actually inlined, we need to make the copy implied
- // by them explicit. However, we don't do this if the callee is readonly
- // or readnone, because the copy would be unneeded: the callee doesn't
- // modify the struct.
- if (CB.isByValArgument(ArgNo)) {
- ActualArg = HandleByValArgument(ActualArg, &CB, CalledFunc, IFI,
- CalledFunc->getParamAlignment(ArgNo));
- if (ActualArg != *AI)
- ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
- }
-
- VMap[&*I] = ActualArg;
- }
-
- // TODO: Remove this when users have been updated to the assume bundles.
- // Add alignment assumptions if necessary. We do this before the inlined
- // instructions are actually cloned into the caller so that we can easily
- // check what will be known at the start of the inlined code.
- AddAlignmentAssumptions(CB, IFI);
-
- AssumptionCache *AC =
- IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
-
-    /// Preserve all attributes on the call and its parameters.
- salvageKnowledge(&CB, AC);
-
- // We want the inliner to prune the code as it copies. We would LOVE to
- // have no dead or constant instructions leftover after inlining occurs
- // (which can happen, e.g., because an argument was constant), but we'll be
- // happy with whatever the cloner can do.
- CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
- /*ModuleLevelChanges=*/false, Returns, ".i",
- &InlinedFunctionInfo, &CB);
- // Remember the first block that is newly cloned over.
- FirstNewBlock = LastBlock; ++FirstNewBlock;
-
- if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
- // Update the BFI of blocks cloned into the caller.
- updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
- CalledFunc->front());
-
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), CB,
- IFI.PSI, IFI.CallerBFI);
-
- // Inject byval arguments initialization.
- for (std::pair<Value*, Value*> &Init : ByValInit)
- HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
- &*FirstNewBlock, IFI);
-
- Optional<OperandBundleUse> ParentDeopt =
- CB.getOperandBundle(LLVMContext::OB_deopt);
- if (ParentDeopt) {
- SmallVector<OperandBundleDef, 2> OpDefs;
-
- for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) {
- CallBase *ICS = dyn_cast_or_null<CallBase>(VH);
- if (!ICS)
- continue; // instruction was DCE'd or RAUW'ed to undef
-
- OpDefs.clear();
-
- OpDefs.reserve(ICS->getNumOperandBundles());
-
- for (unsigned COBi = 0, COBe = ICS->getNumOperandBundles(); COBi < COBe;
- ++COBi) {
- auto ChildOB = ICS->getOperandBundleAt(COBi);
- if (ChildOB.getTagID() != LLVMContext::OB_deopt) {
- // If the inlined call has other operand bundles, let them be
- OpDefs.emplace_back(ChildOB);
- continue;
- }
-
- // It may be useful to separate this logic (of handling operand
- // bundles) out to a separate "policy" component if this gets crowded.
- // Prepend the parent's deoptimization continuation to the newly
- // inlined call's deoptimization continuation.
- std::vector<Value *> MergedDeoptArgs;
- MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() +
- ChildOB.Inputs.size());
-
+ auto &DL = Caller->getParent()->getDataLayout();
+
+ // Calculate the vector of arguments to pass into the function cloner, which
+ // matches up the formal to the actual argument values.
+ auto AI = CB.arg_begin();
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = CalledFunc->arg_begin(),
+ E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
+ Value *ActualArg = *AI;
+
+      // When byval arguments are actually inlined, we need to make the copy implied
+ // by them explicit. However, we don't do this if the callee is readonly
+ // or readnone, because the copy would be unneeded: the callee doesn't
+ // modify the struct.
+ if (CB.isByValArgument(ArgNo)) {
+ ActualArg = HandleByValArgument(ActualArg, &CB, CalledFunc, IFI,
+ CalledFunc->getParamAlignment(ArgNo));
+ if (ActualArg != *AI)
+ ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
+ }
+
+ VMap[&*I] = ActualArg;
+ }
+
+ // TODO: Remove this when users have been updated to the assume bundles.
+ // Add alignment assumptions if necessary. We do this before the inlined
+ // instructions are actually cloned into the caller so that we can easily
+ // check what will be known at the start of the inlined code.
+ AddAlignmentAssumptions(CB, IFI);
+
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
+
+    /// Preserve all attributes on the call and its parameters.
+ salvageKnowledge(&CB, AC);
+
+ // We want the inliner to prune the code as it copies. We would LOVE to
+ // have no dead or constant instructions leftover after inlining occurs
+ // (which can happen, e.g., because an argument was constant), but we'll be
+ // happy with whatever the cloner can do.
+ CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
+ /*ModuleLevelChanges=*/false, Returns, ".i",
+ &InlinedFunctionInfo, &CB);
+ // Remember the first block that is newly cloned over.
+ FirstNewBlock = LastBlock; ++FirstNewBlock;
+
+ if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
+ // Update the BFI of blocks cloned into the caller.
+ updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
+ CalledFunc->front());
+
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), CB,
+ IFI.PSI, IFI.CallerBFI);
+
+ // Inject byval arguments initialization.
+ for (std::pair<Value*, Value*> &Init : ByValInit)
+ HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
+ &*FirstNewBlock, IFI);
+
+ Optional<OperandBundleUse> ParentDeopt =
+ CB.getOperandBundle(LLVMContext::OB_deopt);
+ if (ParentDeopt) {
+ SmallVector<OperandBundleDef, 2> OpDefs;
+
+ for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) {
+ CallBase *ICS = dyn_cast_or_null<CallBase>(VH);
+ if (!ICS)
+ continue; // instruction was DCE'd or RAUW'ed to undef
+
+ OpDefs.clear();
+
+ OpDefs.reserve(ICS->getNumOperandBundles());
+
+ for (unsigned COBi = 0, COBe = ICS->getNumOperandBundles(); COBi < COBe;
+ ++COBi) {
+ auto ChildOB = ICS->getOperandBundleAt(COBi);
+ if (ChildOB.getTagID() != LLVMContext::OB_deopt) {
+ // If the inlined call has other operand bundles, let them be
+ OpDefs.emplace_back(ChildOB);
+ continue;
+ }
+
+ // It may be useful to separate this logic (of handling operand
+ // bundles) out to a separate "policy" component if this gets crowded.
+ // Prepend the parent's deoptimization continuation to the newly
+ // inlined call's deoptimization continuation.
+ std::vector<Value *> MergedDeoptArgs;
+ MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() +
+ ChildOB.Inputs.size());
+
llvm::append_range(MergedDeoptArgs, ParentDeopt->Inputs);
llvm::append_range(MergedDeoptArgs, ChildOB.Inputs);
-
- OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
- }
-
- Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS);
-
- // Note: the RAUW does the appropriate fixup in VMap, so we need to do
- // this even if the call returns void.
- ICS->replaceAllUsesWith(NewI);
-
- VH = nullptr;
- ICS->eraseFromParent();
- }
- }
-
- // Update the callgraph if requested.
- if (IFI.CG)
- UpdateCallGraphAfterInlining(CB, FirstNewBlock, VMap, IFI);
-
- // For 'nodebug' functions, the associated DISubprogram is always null.
- // Conservatively avoid propagating the callsite debug location to
- // instructions inlined from a function whose DISubprogram is not null.
- fixupLineNumbers(Caller, FirstNewBlock, &CB,
- CalledFunc->getSubprogram() != nullptr);
-
+
+ OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
+ }
+
+ Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS);
+
+ // Note: the RAUW does the appropriate fixup in VMap, so we need to do
+ // this even if the call returns void.
+ ICS->replaceAllUsesWith(NewI);
+
+ VH = nullptr;
+ ICS->eraseFromParent();
+ }
+ }
+
+ // Update the callgraph if requested.
+ if (IFI.CG)
+ UpdateCallGraphAfterInlining(CB, FirstNewBlock, VMap, IFI);
+
+ // For 'nodebug' functions, the associated DISubprogram is always null.
+ // Conservatively avoid propagating the callsite debug location to
+ // instructions inlined from a function whose DISubprogram is not null.
+ fixupLineNumbers(Caller, FirstNewBlock, &CB,
+ CalledFunc->getSubprogram() != nullptr);
+
// Now clone the inlined noalias scope metadata.
SAMetadataCloner.clone();
SAMetadataCloner.remap(FirstNewBlock, Caller->end());
-
- // Add noalias metadata if necessary.
- AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR);
-
- // Clone return attributes on the callsite into the calls within the inlined
- // function which feed into its return value.
- AddReturnAttributes(CB, VMap);
-
+
+ // Add noalias metadata if necessary.
+ AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR);
+
+ // Clone return attributes on the callsite into the calls within the inlined
+ // function which feed into its return value.
+ AddReturnAttributes(CB, VMap);
+
// Propagate metadata on the callsite if necessary.
PropagateCallSiteMetadata(CB, FirstNewBlock, Caller->end());
-
- // Register any cloned assumptions.
- if (IFI.GetAssumptionCache)
- for (BasicBlock &NewBlock :
- make_range(FirstNewBlock->getIterator(), Caller->end()))
- for (Instruction &I : NewBlock)
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- IFI.GetAssumptionCache(*Caller).registerAssumption(II);
- }
-
- // If there are any alloca instructions in the block that used to be the entry
- // block for the callee, move them to the entry block of the caller. First
- // calculate which instruction they should be inserted before. We insert the
- // instructions at the end of the current alloca list.
- {
- BasicBlock::iterator InsertPoint = Caller->begin()->begin();
- for (BasicBlock::iterator I = FirstNewBlock->begin(),
- E = FirstNewBlock->end(); I != E; ) {
- AllocaInst *AI = dyn_cast<AllocaInst>(I++);
- if (!AI) continue;
-
- // If the alloca is now dead, remove it. This often occurs due to code
- // specialization.
- if (AI->use_empty()) {
- AI->eraseFromParent();
- continue;
- }
-
- if (!allocaWouldBeStaticInEntry(AI))
- continue;
-
- // Keep track of the static allocas that we inline into the caller.
- IFI.StaticAllocas.push_back(AI);
-
- // Scan for the block of allocas that we can move over, and move them
- // all at once.
- while (isa<AllocaInst>(I) &&
- !cast<AllocaInst>(I)->use_empty() &&
- allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) {
- IFI.StaticAllocas.push_back(cast<AllocaInst>(I));
- ++I;
- }
-
- // Transfer all of the allocas over in a block. Using splice means
- // that the instructions aren't removed from the symbol table, then
- // reinserted.
- Caller->getEntryBlock().getInstList().splice(
- InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
- }
- }
-
- SmallVector<Value*,4> VarArgsToForward;
- SmallVector<AttributeSet, 4> VarArgsAttrs;
- for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
- i < CB.getNumArgOperands(); i++) {
- VarArgsToForward.push_back(CB.getArgOperand(i));
- VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i));
- }
-
- bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
- if (InlinedFunctionInfo.ContainsCalls) {
- CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None;
- if (CallInst *CI = dyn_cast<CallInst>(&CB))
- CallSiteTailKind = CI->getTailCallKind();
-
- // For inlining purposes, the "notail" marker is the same as no marker.
- if (CallSiteTailKind == CallInst::TCK_NoTail)
- CallSiteTailKind = CallInst::TCK_None;
-
- for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
- ++BB) {
- for (auto II = BB->begin(); II != BB->end();) {
- Instruction &I = *II++;
- CallInst *CI = dyn_cast<CallInst>(&I);
- if (!CI)
- continue;
-
- // Forward varargs from inlined call site to calls to the
- // ForwardVarArgsTo function, if requested, and to musttail calls.
- if (!VarArgsToForward.empty() &&
- ((ForwardVarArgsTo &&
- CI->getCalledFunction() == ForwardVarArgsTo) ||
- CI->isMustTailCall())) {
- // Collect attributes for non-vararg parameters.
- AttributeList Attrs = CI->getAttributes();
- SmallVector<AttributeSet, 8> ArgAttrs;
- if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) {
- for (unsigned ArgNo = 0;
- ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo)
- ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
- }
-
- // Add VarArg attributes.
- ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end());
- Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), ArgAttrs);
- // Add VarArgs to existing parameters.
- SmallVector<Value *, 6> Params(CI->arg_operands());
- Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
- CallInst *NewCI = CallInst::Create(
- CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI);
- NewCI->setDebugLoc(CI->getDebugLoc());
- NewCI->setAttributes(Attrs);
- NewCI->setCallingConv(CI->getCallingConv());
- CI->replaceAllUsesWith(NewCI);
- CI->eraseFromParent();
- CI = NewCI;
- }
-
- if (Function *F = CI->getCalledFunction())
- InlinedDeoptimizeCalls |=
- F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
-
- // We need to reduce the strength of any inlined tail calls. For
- // musttail, we have to avoid introducing potential unbounded stack
- // growth. For example, if functions 'f' and 'g' are mutually recursive
- // with musttail, we can inline 'g' into 'f' so long as we preserve
- // musttail on the cloned call to 'f'. If either the inlined call site
- // or the cloned call site is *not* musttail, the program already has
- // one frame of stack growth, so it's safe to remove musttail. Here is
- // a table of example transformations:
- //
- // f -> musttail g -> musttail f ==> f -> musttail f
- // f -> musttail g -> tail f ==> f -> tail f
- // f -> g -> musttail f ==> f -> f
- // f -> g -> tail f ==> f -> f
- //
- // Inlined notail calls should remain notail calls.
- CallInst::TailCallKind ChildTCK = CI->getTailCallKind();
- if (ChildTCK != CallInst::TCK_NoTail)
- ChildTCK = std::min(CallSiteTailKind, ChildTCK);
- CI->setTailCallKind(ChildTCK);
- InlinedMustTailCalls |= CI->isMustTailCall();
-
- // Calls inlined through a 'nounwind' call site should be marked
- // 'nounwind'.
- if (MarkNoUnwind)
- CI->setDoesNotThrow();
- }
- }
- }
-
-  // Leave lifetime markers for the static allocas, scoping them to the
- // function we just inlined.
- if (InsertLifetime && !IFI.StaticAllocas.empty()) {
- IRBuilder<> builder(&FirstNewBlock->front());
- for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
- AllocaInst *AI = IFI.StaticAllocas[ai];
- // Don't mark swifterror allocas. They can't have bitcast uses.
- if (AI->isSwiftError())
- continue;
-
- // If the alloca is already scoped to something smaller than the whole
- // function then there's no need to add redundant, less accurate markers.
- if (hasLifetimeMarkers(AI))
- continue;
-
- // Try to determine the size of the allocation.
- ConstantInt *AllocaSize = nullptr;
- if (ConstantInt *AIArraySize =
- dyn_cast<ConstantInt>(AI->getArraySize())) {
- auto &DL = Caller->getParent()->getDataLayout();
- Type *AllocaType = AI->getAllocatedType();
+
+ // Register any cloned assumptions.
+ if (IFI.GetAssumptionCache)
+ for (BasicBlock &NewBlock :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBlock)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ IFI.GetAssumptionCache(*Caller).registerAssumption(II);
+ }
+
+ // If there are any alloca instructions in the block that used to be the entry
+ // block for the callee, move them to the entry block of the caller. First
+ // calculate which instruction they should be inserted before. We insert the
+ // instructions at the end of the current alloca list.
+ {
+ BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+ for (BasicBlock::iterator I = FirstNewBlock->begin(),
+ E = FirstNewBlock->end(); I != E; ) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(I++);
+ if (!AI) continue;
+
+ // If the alloca is now dead, remove it. This often occurs due to code
+ // specialization.
+ if (AI->use_empty()) {
+ AI->eraseFromParent();
+ continue;
+ }
+
+ if (!allocaWouldBeStaticInEntry(AI))
+ continue;
+
+ // Keep track of the static allocas that we inline into the caller.
+ IFI.StaticAllocas.push_back(AI);
+
+ // Scan for the block of allocas that we can move over, and move them
+ // all at once.
+ while (isa<AllocaInst>(I) &&
+ !cast<AllocaInst>(I)->use_empty() &&
+ allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) {
+ IFI.StaticAllocas.push_back(cast<AllocaInst>(I));
+ ++I;
+ }
+
+ // Transfer all of the allocas over in a block. Using splice means
+ // that the instructions aren't removed from the symbol table, then
+ // reinserted.
+ Caller->getEntryBlock().getInstList().splice(
+ InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
+ }
+ }
+
+ SmallVector<Value*,4> VarArgsToForward;
+ SmallVector<AttributeSet, 4> VarArgsAttrs;
+ for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
+ i < CB.getNumArgOperands(); i++) {
+ VarArgsToForward.push_back(CB.getArgOperand(i));
+ VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i));
+ }
+
+ bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
+ if (InlinedFunctionInfo.ContainsCalls) {
+ CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None;
+ if (CallInst *CI = dyn_cast<CallInst>(&CB))
+ CallSiteTailKind = CI->getTailCallKind();
+
+ // For inlining purposes, the "notail" marker is the same as no marker.
+ if (CallSiteTailKind == CallInst::TCK_NoTail)
+ CallSiteTailKind = CallInst::TCK_None;
+
+ for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
+ ++BB) {
+ for (auto II = BB->begin(); II != BB->end();) {
+ Instruction &I = *II++;
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+
+ // Forward varargs from inlined call site to calls to the
+ // ForwardVarArgsTo function, if requested, and to musttail calls.
+ if (!VarArgsToForward.empty() &&
+ ((ForwardVarArgsTo &&
+ CI->getCalledFunction() == ForwardVarArgsTo) ||
+ CI->isMustTailCall())) {
+ // Collect attributes for non-vararg parameters.
+ AttributeList Attrs = CI->getAttributes();
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) {
+ for (unsigned ArgNo = 0;
+ ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo)
+ ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+ }
+
+ // Add VarArg attributes.
+ ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end());
+ Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), ArgAttrs);
+ // Add VarArgs to existing parameters.
+ SmallVector<Value *, 6> Params(CI->arg_operands());
+ Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
+ CallInst *NewCI = CallInst::Create(
+ CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI);
+ NewCI->setDebugLoc(CI->getDebugLoc());
+ NewCI->setAttributes(Attrs);
+ NewCI->setCallingConv(CI->getCallingConv());
+ CI->replaceAllUsesWith(NewCI);
+ CI->eraseFromParent();
+ CI = NewCI;
+ }
+
+ if (Function *F = CI->getCalledFunction())
+ InlinedDeoptimizeCalls |=
+ F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
+
+ // We need to reduce the strength of any inlined tail calls. For
+ // musttail, we have to avoid introducing potential unbounded stack
+ // growth. For example, if functions 'f' and 'g' are mutually recursive
+ // with musttail, we can inline 'g' into 'f' so long as we preserve
+ // musttail on the cloned call to 'f'. If either the inlined call site
+ // or the cloned call site is *not* musttail, the program already has
+ // one frame of stack growth, so it's safe to remove musttail. Here is
+ // a table of example transformations:
+ //
+ // f -> musttail g -> musttail f ==> f -> musttail f
+ // f -> musttail g -> tail f ==> f -> tail f
+ // f -> g -> musttail f ==> f -> f
+ // f -> g -> tail f ==> f -> f
+ //
+ // Inlined notail calls should remain notail calls.
+ CallInst::TailCallKind ChildTCK = CI->getTailCallKind();
+ if (ChildTCK != CallInst::TCK_NoTail)
+ ChildTCK = std::min(CallSiteTailKind, ChildTCK);
+ CI->setTailCallKind(ChildTCK);
+ InlinedMustTailCalls |= CI->isMustTailCall();
+
+ // Calls inlined through a 'nounwind' call site should be marked
+ // 'nounwind'.
+ if (MarkNoUnwind)
+ CI->setDoesNotThrow();
+ }
+ }
+ }
+
+  // Leave lifetime markers for the static allocas, scoping them to the
+ // function we just inlined.
+ if (InsertLifetime && !IFI.StaticAllocas.empty()) {
+ IRBuilder<> builder(&FirstNewBlock->front());
+ for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
+ AllocaInst *AI = IFI.StaticAllocas[ai];
+ // Don't mark swifterror allocas. They can't have bitcast uses.
+ if (AI->isSwiftError())
+ continue;
+
+ // If the alloca is already scoped to something smaller than the whole
+ // function then there's no need to add redundant, less accurate markers.
+ if (hasLifetimeMarkers(AI))
+ continue;
+
+ // Try to determine the size of the allocation.
+ ConstantInt *AllocaSize = nullptr;
+ if (ConstantInt *AIArraySize =
+ dyn_cast<ConstantInt>(AI->getArraySize())) {
+ auto &DL = Caller->getParent()->getDataLayout();
+ Type *AllocaType = AI->getAllocatedType();
TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
- uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
-
- // Don't add markers for zero-sized allocas.
- if (AllocaArraySize == 0)
- continue;
-
- // Check that array size doesn't saturate uint64_t and doesn't
- // overflow when it's multiplied by type size.
+ uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
+
+ // Don't add markers for zero-sized allocas.
+ if (AllocaArraySize == 0)
+ continue;
+
+ // Check that array size doesn't saturate uint64_t and doesn't
+ // overflow when it's multiplied by type size.
if (!AllocaTypeSize.isScalable() &&
AllocaArraySize != std::numeric_limits<uint64_t>::max() &&
- std::numeric_limits<uint64_t>::max() / AllocaArraySize >=
+ std::numeric_limits<uint64_t>::max() / AllocaArraySize >=
AllocaTypeSize.getFixedSize()) {
- AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
- AllocaArraySize * AllocaTypeSize);
- }
- }
-
- builder.CreateLifetimeStart(AI, AllocaSize);
- for (ReturnInst *RI : Returns) {
- // Don't insert llvm.lifetime.end calls between a musttail or deoptimize
- // call and a return. The return kills all local allocas.
- if (InlinedMustTailCalls &&
- RI->getParent()->getTerminatingMustTailCall())
- continue;
- if (InlinedDeoptimizeCalls &&
- RI->getParent()->getTerminatingDeoptimizeCall())
- continue;
- IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
- }
- }
- }
-
- // If the inlined code contained dynamic alloca instructions, wrap the inlined
- // code with llvm.stacksave/llvm.stackrestore intrinsics.
- if (InlinedFunctionInfo.ContainsDynamicAllocas) {
- Module *M = Caller->getParent();
- // Get the two intrinsics we care about.
- Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
- Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
-
- // Insert the llvm.stacksave.
- CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin())
- .CreateCall(StackSave, {}, "savedstack");
-
- // Insert a call to llvm.stackrestore before any return instructions in the
- // inlined function.
- for (ReturnInst *RI : Returns) {
- // Don't insert llvm.stackrestore calls between a musttail or deoptimize
- // call and a return. The return will restore the stack pointer.
- if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall())
- continue;
- if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall())
- continue;
- IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr);
- }
- }
-
- // If we are inlining for an invoke instruction, we must make sure to rewrite
- // any call instructions into invoke instructions. This is sensitive to which
- // funclet pads were top-level in the inlinee, so must be done before
- // rewriting the "parent pad" links.
- if (auto *II = dyn_cast<InvokeInst>(&CB)) {
- BasicBlock *UnwindDest = II->getUnwindDest();
- Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
- if (isa<LandingPadInst>(FirstNonPHI)) {
- HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
- } else {
- HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
- }
- }
-
- // Update the lexical scopes of the new funclets and callsites.
- // Anything that had 'none' as its parent is now nested inside the callsite's
- // EHPad.
-
- if (CallSiteEHPad) {
- for (Function::iterator BB = FirstNewBlock->getIterator(),
- E = Caller->end();
- BB != E; ++BB) {
- // Add bundle operands to any top-level call sites.
- SmallVector<OperandBundleDef, 1> OpBundles;
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) {
- CallBase *I = dyn_cast<CallBase>(&*BBI++);
- if (!I)
- continue;
-
- // Skip call sites which are nounwind intrinsics.
- auto *CalledFn =
- dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts());
- if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow())
- continue;
-
- // Skip call sites which already have a "funclet" bundle.
- if (I->getOperandBundle(LLVMContext::OB_funclet))
- continue;
-
- I->getOperandBundlesAsDefs(OpBundles);
- OpBundles.emplace_back("funclet", CallSiteEHPad);
-
- Instruction *NewInst = CallBase::Create(I, OpBundles, I);
- NewInst->takeName(I);
- I->replaceAllUsesWith(NewInst);
- I->eraseFromParent();
-
- OpBundles.clear();
- }
-
- // It is problematic if the inlinee has a cleanupret which unwinds to
- // caller and we inline it into a call site which doesn't unwind but into
- // an EH pad that does. Such an edge must be dynamically unreachable.
- // As such, we replace the cleanupret with unreachable.
- if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(BB->getTerminator()))
- if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally)
- changeToUnreachable(CleanupRet, /*UseLLVMTrap=*/false);
-
- Instruction *I = BB->getFirstNonPHI();
- if (!I->isEHPad())
- continue;
-
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
- if (isa<ConstantTokenNone>(CatchSwitch->getParentPad()))
- CatchSwitch->setParentPad(CallSiteEHPad);
- } else {
- auto *FPI = cast<FuncletPadInst>(I);
- if (isa<ConstantTokenNone>(FPI->getParentPad()))
- FPI->setParentPad(CallSiteEHPad);
- }
- }
- }
-
- if (InlinedDeoptimizeCalls) {
- // We need to at least remove the deoptimizing returns from the Return set,
- // so that the control flow from those returns does not get merged into the
- // caller (but terminate it instead). If the caller's return type does not
- // match the callee's return type, we also need to change the return type of
- // the intrinsic.
- if (Caller->getReturnType() == CB.getType()) {
+ AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
+ AllocaArraySize * AllocaTypeSize);
+ }
+ }
+
+ builder.CreateLifetimeStart(AI, AllocaSize);
+ for (ReturnInst *RI : Returns) {
+ // Don't insert llvm.lifetime.end calls between a musttail or deoptimize
+ // call and a return. The return kills all local allocas.
+ if (InlinedMustTailCalls &&
+ RI->getParent()->getTerminatingMustTailCall())
+ continue;
+ if (InlinedDeoptimizeCalls &&
+ RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
+ IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
+ }
+ }
+ }
+
+ // If the inlined code contained dynamic alloca instructions, wrap the inlined
+ // code with llvm.stacksave/llvm.stackrestore intrinsics.
+ if (InlinedFunctionInfo.ContainsDynamicAllocas) {
+ Module *M = Caller->getParent();
+ // Get the two intrinsics we care about.
+ Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+ Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
+
+ // Insert the llvm.stacksave.
+ CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin())
+ .CreateCall(StackSave, {}, "savedstack");
+
+ // Insert a call to llvm.stackrestore before any return instructions in the
+ // inlined function.
+ for (ReturnInst *RI : Returns) {
+ // Don't insert llvm.stackrestore calls between a musttail or deoptimize
+ // call and a return. The return will restore the stack pointer.
+ if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall())
+ continue;
+ if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
+ IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr);
+ }
+ }
+
+ // If we are inlining for an invoke instruction, we must make sure to rewrite
+ // any call instructions into invoke instructions. This is sensitive to which
+ // funclet pads were top-level in the inlinee, so must be done before
+ // rewriting the "parent pad" links.
+ if (auto *II = dyn_cast<InvokeInst>(&CB)) {
+ BasicBlock *UnwindDest = II->getUnwindDest();
+ Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
+ if (isa<LandingPadInst>(FirstNonPHI)) {
+ HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+ } else {
+ HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+ }
+ }
+
+ // Update the lexical scopes of the new funclets and callsites.
+ // Anything that had 'none' as its parent is now nested inside the callsite's
+ // EHPad.
+
+ if (CallSiteEHPad) {
+ for (Function::iterator BB = FirstNewBlock->getIterator(),
+ E = Caller->end();
+ BB != E; ++BB) {
+ // Add bundle operands to any top-level call sites.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) {
+ CallBase *I = dyn_cast<CallBase>(&*BBI++);
+ if (!I)
+ continue;
+
+ // Skip call sites which are nounwind intrinsics.
+ auto *CalledFn =
+ dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts());
+ if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow())
+ continue;
+
+ // Skip call sites which already have a "funclet" bundle.
+ if (I->getOperandBundle(LLVMContext::OB_funclet))
+ continue;
+
+ I->getOperandBundlesAsDefs(OpBundles);
+ OpBundles.emplace_back("funclet", CallSiteEHPad);
+
+ Instruction *NewInst = CallBase::Create(I, OpBundles, I);
+ NewInst->takeName(I);
+ I->replaceAllUsesWith(NewInst);
+ I->eraseFromParent();
+
+ OpBundles.clear();
+ }
+
+ // It is problematic if the inlinee has a cleanupret which unwinds to
+ // caller and we inline it into a call site which doesn't unwind but into
+ // an EH pad that does. Such an edge must be dynamically unreachable.
+ // As such, we replace the cleanupret with unreachable.
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(BB->getTerminator()))
+ if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally)
+ changeToUnreachable(CleanupRet, /*UseLLVMTrap=*/false);
+
+ Instruction *I = BB->getFirstNonPHI();
+ if (!I->isEHPad())
+ continue;
+
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
+ if (isa<ConstantTokenNone>(CatchSwitch->getParentPad()))
+ CatchSwitch->setParentPad(CallSiteEHPad);
+ } else {
+ auto *FPI = cast<FuncletPadInst>(I);
+ if (isa<ConstantTokenNone>(FPI->getParentPad()))
+ FPI->setParentPad(CallSiteEHPad);
+ }
+ }
+ }
+
+ if (InlinedDeoptimizeCalls) {
+ // We need to at least remove the deoptimizing returns from the Return set,
+ // so that the control flow from those returns does not get merged into the
+ // caller (but terminate it instead). If the caller's return type does not
+ // match the callee's return type, we also need to change the return type of
+ // the intrinsic.
+ if (Caller->getReturnType() == CB.getType()) {
llvm::erase_if(Returns, [](ReturnInst *RI) {
- return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr;
- });
- } else {
- SmallVector<ReturnInst *, 8> NormalReturns;
- Function *NewDeoptIntrinsic = Intrinsic::getDeclaration(
- Caller->getParent(), Intrinsic::experimental_deoptimize,
- {Caller->getReturnType()});
-
- for (ReturnInst *RI : Returns) {
- CallInst *DeoptCall = RI->getParent()->getTerminatingDeoptimizeCall();
- if (!DeoptCall) {
- NormalReturns.push_back(RI);
- continue;
- }
-
- // The calling convention on the deoptimize call itself may be bogus,
- // since the code we're inlining may have undefined behavior (and may
- // never actually execute at runtime); but all
- // @llvm.experimental.deoptimize declarations have to have the same
- // calling convention in a well-formed module.
- auto CallingConv = DeoptCall->getCalledFunction()->getCallingConv();
- NewDeoptIntrinsic->setCallingConv(CallingConv);
- auto *CurBB = RI->getParent();
- RI->eraseFromParent();
-
+ return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr;
+ });
+ } else {
+ SmallVector<ReturnInst *, 8> NormalReturns;
+ Function *NewDeoptIntrinsic = Intrinsic::getDeclaration(
+ Caller->getParent(), Intrinsic::experimental_deoptimize,
+ {Caller->getReturnType()});
+
+ for (ReturnInst *RI : Returns) {
+ CallInst *DeoptCall = RI->getParent()->getTerminatingDeoptimizeCall();
+ if (!DeoptCall) {
+ NormalReturns.push_back(RI);
+ continue;
+ }
+
+ // The calling convention on the deoptimize call itself may be bogus,
+ // since the code we're inlining may have undefined behavior (and may
+ // never actually execute at runtime); but all
+ // @llvm.experimental.deoptimize declarations have to have the same
+ // calling convention in a well-formed module.
+ auto CallingConv = DeoptCall->getCalledFunction()->getCallingConv();
+ NewDeoptIntrinsic->setCallingConv(CallingConv);
+ auto *CurBB = RI->getParent();
+ RI->eraseFromParent();
+
SmallVector<Value *, 4> CallArgs(DeoptCall->args());
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- DeoptCall->getOperandBundlesAsDefs(OpBundles);
- DeoptCall->eraseFromParent();
- assert(!OpBundles.empty() &&
- "Expected at least the deopt operand bundle");
-
- IRBuilder<> Builder(CurBB);
- CallInst *NewDeoptCall =
- Builder.CreateCall(NewDeoptIntrinsic, CallArgs, OpBundles);
- NewDeoptCall->setCallingConv(CallingConv);
- if (NewDeoptCall->getType()->isVoidTy())
- Builder.CreateRetVoid();
- else
- Builder.CreateRet(NewDeoptCall);
- }
-
- // Leave behind the normal returns so we can merge control flow.
- std::swap(Returns, NormalReturns);
- }
- }
-
- // Handle any inlined musttail call sites. In order for a new call site to be
- // musttail, the source of the clone and the inlined call site must have been
- // musttail. Therefore it's safe to return without merging control into the
- // phi below.
- if (InlinedMustTailCalls) {
- // Check if we need to bitcast the result of any musttail calls.
- Type *NewRetTy = Caller->getReturnType();
- bool NeedBitCast = !CB.use_empty() && CB.getType() != NewRetTy;
-
- // Handle the returns preceded by musttail calls separately.
- SmallVector<ReturnInst *, 8> NormalReturns;
- for (ReturnInst *RI : Returns) {
- CallInst *ReturnedMustTail =
- RI->getParent()->getTerminatingMustTailCall();
- if (!ReturnedMustTail) {
- NormalReturns.push_back(RI);
- continue;
- }
- if (!NeedBitCast)
- continue;
-
- // Delete the old return and any preceding bitcast.
- BasicBlock *CurBB = RI->getParent();
- auto *OldCast = dyn_cast_or_null<BitCastInst>(RI->getReturnValue());
- RI->eraseFromParent();
- if (OldCast)
- OldCast->eraseFromParent();
-
- // Insert a new bitcast and return with the right type.
- IRBuilder<> Builder(CurBB);
- Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy));
- }
-
- // Leave behind the normal returns so we can merge control flow.
- std::swap(Returns, NormalReturns);
- }
-
- // Now that all of the transforms on the inlined code have taken place but
- // before we splice the inlined code into the CFG and lose track of which
- // blocks were actually inlined, collect the call sites. We only do this if
- // call graph updates weren't requested, as those provide value handle based
- // tracking of inlined call sites instead.
- if (InlinedFunctionInfo.ContainsCalls && !IFI.CG) {
- // Otherwise just collect the raw call sites that were inlined.
- for (BasicBlock &NewBB :
- make_range(FirstNewBlock->getIterator(), Caller->end()))
- for (Instruction &I : NewBB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- IFI.InlinedCallSites.push_back(CB);
- }
-
- // If we cloned in _exactly one_ basic block, and if that block ends in a
- // return instruction, we splice the body of the inlined callee directly into
- // the calling basic block.
- if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
- // Move all of the instructions right before the call.
- OrigBB->getInstList().splice(CB.getIterator(), FirstNewBlock->getInstList(),
- FirstNewBlock->begin(), FirstNewBlock->end());
- // Remove the cloned basic block.
- Caller->getBasicBlockList().pop_back();
-
- // If the call site was an invoke instruction, add a branch to the normal
- // destination.
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB);
- NewBr->setDebugLoc(Returns[0]->getDebugLoc());
- }
-
- // If the return instruction returned a value, replace uses of the call with
- // uses of the returned value.
- if (!CB.use_empty()) {
- ReturnInst *R = Returns[0];
- if (&CB == R->getReturnValue())
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- else
- CB.replaceAllUsesWith(R->getReturnValue());
- }
- // Since we are now done with the Call/Invoke, we can delete it.
- CB.eraseFromParent();
-
- // Since we are now done with the return instruction, delete it also.
- Returns[0]->eraseFromParent();
-
- // We are now done with the inlining.
- return InlineResult::success();
- }
-
- // Otherwise, we have the normal case, of more than one block to inline or
- // multiple return sites.
-
- // We want to clone the entire callee function into the hole between the
- // "starter" and "ender" blocks. How we accomplish this depends on whether
- // this is an invoke instruction or a call instruction.
- BasicBlock *AfterCallBB;
- BranchInst *CreatedBranchToNormalDest = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
-
- // Add an unconditional branch to make this look like the CallInst case...
- CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB);
-
- // Split the basic block. This guarantees that no PHI nodes will have to be
-    // updated due to new incoming edges, and makes the invoke case more
- // symmetric to the call case.
- AfterCallBB =
- OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(),
- CalledFunc->getName() + ".exit");
-
- } else { // It's a call
- // If this is a call instruction, we need to split the basic block that
- // the call lives in.
- //
- AfterCallBB = OrigBB->splitBasicBlock(CB.getIterator(),
- CalledFunc->getName() + ".exit");
- }
-
- if (IFI.CallerBFI) {
- // Copy original BB's block frequency to AfterCallBB
- IFI.CallerBFI->setBlockFreq(
- AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
- }
-
- // Change the branch that used to go to AfterCallBB to branch to the first
- // basic block of the inlined function.
- //
- Instruction *Br = OrigBB->getTerminator();
- assert(Br && Br->getOpcode() == Instruction::Br &&
- "splitBasicBlock broken!");
- Br->setOperand(0, &*FirstNewBlock);
-
- // Now that the function is correct, make it a little bit nicer. In
- // particular, move the basic blocks inserted from the end of the function
- // into the space made by splitting the source basic block.
- Caller->getBasicBlockList().splice(AfterCallBB->getIterator(),
- Caller->getBasicBlockList(), FirstNewBlock,
- Caller->end());
-
- // Handle all of the return instructions that we just cloned in, and eliminate
- // any users of the original call/invoke instruction.
- Type *RTy = CalledFunc->getReturnType();
-
- PHINode *PHI = nullptr;
- if (Returns.size() > 1) {
- // The PHI node should go at the front of the new basic block to merge all
- // possible incoming values.
- if (!CB.use_empty()) {
- PHI = PHINode::Create(RTy, Returns.size(), CB.getName(),
- &AfterCallBB->front());
- // Anything that used the result of the function call should now use the
- // PHI node as their operand.
- CB.replaceAllUsesWith(PHI);
- }
-
- // Loop over all of the return instructions adding entries to the PHI node
- // as appropriate.
- if (PHI) {
- for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
- ReturnInst *RI = Returns[i];
- assert(RI->getReturnValue()->getType() == PHI->getType() &&
- "Ret value not consistent in function!");
- PHI->addIncoming(RI->getReturnValue(), RI->getParent());
- }
- }
-
- // Add a branch to the merge points and remove return instructions.
- DebugLoc Loc;
- for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
- ReturnInst *RI = Returns[i];
- BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
- Loc = RI->getDebugLoc();
- BI->setDebugLoc(Loc);
- RI->eraseFromParent();
- }
- // We need to set the debug location to *somewhere* inside the
- // inlined function. The line number may be nonsensical, but the
- // instruction will at least be associated with the right
- // function.
- if (CreatedBranchToNormalDest)
- CreatedBranchToNormalDest->setDebugLoc(Loc);
- } else if (!Returns.empty()) {
- // Otherwise, if there is exactly one return value, just replace anything
- // using the return value of the call with the computed value.
- if (!CB.use_empty()) {
- if (&CB == Returns[0]->getReturnValue())
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- else
- CB.replaceAllUsesWith(Returns[0]->getReturnValue());
- }
-
- // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
- BasicBlock *ReturnBB = Returns[0]->getParent();
- ReturnBB->replaceAllUsesWith(AfterCallBB);
-
- // Splice the code from the return block into the block that it will return
- // to, which contains the code that was after the call.
- AfterCallBB->getInstList().splice(AfterCallBB->begin(),
- ReturnBB->getInstList());
-
- if (CreatedBranchToNormalDest)
- CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc());
-
-    // Delete the return instruction and the now-empty ReturnBB.
- Returns[0]->eraseFromParent();
- ReturnBB->eraseFromParent();
- } else if (!CB.use_empty()) {
- // No returns, but something is using the return value of the call. Just
- // nuke the result.
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- }
-
- // Since we are now done with the Call/Invoke, we can delete it.
- CB.eraseFromParent();
-
- // If we inlined any musttail calls and the original return is now
- // unreachable, delete it. It can only contain a bitcast and ret.
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ DeoptCall->getOperandBundlesAsDefs(OpBundles);
+ DeoptCall->eraseFromParent();
+ assert(!OpBundles.empty() &&
+ "Expected at least the deopt operand bundle");
+
+ IRBuilder<> Builder(CurBB);
+ CallInst *NewDeoptCall =
+ Builder.CreateCall(NewDeoptIntrinsic, CallArgs, OpBundles);
+ NewDeoptCall->setCallingConv(CallingConv);
+ if (NewDeoptCall->getType()->isVoidTy())
+ Builder.CreateRetVoid();
+ else
+ Builder.CreateRet(NewDeoptCall);
+ }
+
+ // Leave behind the normal returns so we can merge control flow.
+ std::swap(Returns, NormalReturns);
+ }
+ }
+
+ // Handle any inlined musttail call sites. In order for a new call site to be
+ // musttail, the source of the clone and the inlined call site must have been
+ // musttail. Therefore it's safe to return without merging control into the
+ // phi below.
+ if (InlinedMustTailCalls) {
+ // Check if we need to bitcast the result of any musttail calls.
+ Type *NewRetTy = Caller->getReturnType();
+ bool NeedBitCast = !CB.use_empty() && CB.getType() != NewRetTy;
+
+ // Handle the returns preceded by musttail calls separately.
+ SmallVector<ReturnInst *, 8> NormalReturns;
+ for (ReturnInst *RI : Returns) {
+ CallInst *ReturnedMustTail =
+ RI->getParent()->getTerminatingMustTailCall();
+ if (!ReturnedMustTail) {
+ NormalReturns.push_back(RI);
+ continue;
+ }
+ if (!NeedBitCast)
+ continue;
+
+ // Delete the old return and any preceding bitcast.
+ BasicBlock *CurBB = RI->getParent();
+ auto *OldCast = dyn_cast_or_null<BitCastInst>(RI->getReturnValue());
+ RI->eraseFromParent();
+ if (OldCast)
+ OldCast->eraseFromParent();
+
+ // Insert a new bitcast and return with the right type.
+ IRBuilder<> Builder(CurBB);
+ Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy));
+ }
+
+ // Leave behind the normal returns so we can merge control flow.
+ std::swap(Returns, NormalReturns);
+ }
+
+ // Now that all of the transforms on the inlined code have taken place but
+ // before we splice the inlined code into the CFG and lose track of which
+ // blocks were actually inlined, collect the call sites. We only do this if
+ // call graph updates weren't requested, as those provide value handle based
+ // tracking of inlined call sites instead.
+ if (InlinedFunctionInfo.ContainsCalls && !IFI.CG) {
+ // Otherwise just collect the raw call sites that were inlined.
+ for (BasicBlock &NewBB :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBB)
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ IFI.InlinedCallSites.push_back(CB);
+ }
+
+ // If we cloned in _exactly one_ basic block, and if that block ends in a
+ // return instruction, we splice the body of the inlined callee directly into
+ // the calling basic block.
+ if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
+ // Move all of the instructions right before the call.
+ OrigBB->getInstList().splice(CB.getIterator(), FirstNewBlock->getInstList(),
+ FirstNewBlock->begin(), FirstNewBlock->end());
+ // Remove the cloned basic block.
+ Caller->getBasicBlockList().pop_back();
+
+ // If the call site was an invoke instruction, add a branch to the normal
+ // destination.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB);
+ NewBr->setDebugLoc(Returns[0]->getDebugLoc());
+ }
+
+ // If the return instruction returned a value, replace uses of the call with
+ // uses of the returned value.
+ if (!CB.use_empty()) {
+ ReturnInst *R = Returns[0];
+ if (&CB == R->getReturnValue())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ else
+ CB.replaceAllUsesWith(R->getReturnValue());
+ }
+ // Since we are now done with the Call/Invoke, we can delete it.
+ CB.eraseFromParent();
+
+ // Since we are now done with the return instruction, delete it also.
+ Returns[0]->eraseFromParent();
+
+ // We are now done with the inlining.
+ return InlineResult::success();
+ }
+
+ // Otherwise, we have the normal case, of more than one block to inline or
+ // multiple return sites.
+
+ // We want to clone the entire callee function into the hole between the
+ // "starter" and "ender" blocks. How we accomplish this depends on whether
+ // this is an invoke instruction or a call instruction.
+ BasicBlock *AfterCallBB;
+ BranchInst *CreatedBranchToNormalDest = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+
+ // Add an unconditional branch to make this look like the CallInst case...
+ CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB);
+
+ // Split the basic block. This guarantees that no PHI nodes will have to be
+    // updated due to new incoming edges, and makes the invoke case more
+ // symmetric to the call case.
+ AfterCallBB =
+ OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(),
+ CalledFunc->getName() + ".exit");
+
+ } else { // It's a call
+ // If this is a call instruction, we need to split the basic block that
+ // the call lives in.
+ //
+ AfterCallBB = OrigBB->splitBasicBlock(CB.getIterator(),
+ CalledFunc->getName() + ".exit");
+ }
+
+ if (IFI.CallerBFI) {
+ // Copy original BB's block frequency to AfterCallBB
+ IFI.CallerBFI->setBlockFreq(
+ AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
+ }
+
+ // Change the branch that used to go to AfterCallBB to branch to the first
+ // basic block of the inlined function.
+ //
+ Instruction *Br = OrigBB->getTerminator();
+ assert(Br && Br->getOpcode() == Instruction::Br &&
+ "splitBasicBlock broken!");
+ Br->setOperand(0, &*FirstNewBlock);
+
+ // Now that the function is correct, make it a little bit nicer. In
+ // particular, move the basic blocks inserted from the end of the function
+ // into the space made by splitting the source basic block.
+ Caller->getBasicBlockList().splice(AfterCallBB->getIterator(),
+ Caller->getBasicBlockList(), FirstNewBlock,
+ Caller->end());
+
+ // Handle all of the return instructions that we just cloned in, and eliminate
+ // any users of the original call/invoke instruction.
+ Type *RTy = CalledFunc->getReturnType();
+
+ PHINode *PHI = nullptr;
+ if (Returns.size() > 1) {
+ // The PHI node should go at the front of the new basic block to merge all
+ // possible incoming values.
+ if (!CB.use_empty()) {
+ PHI = PHINode::Create(RTy, Returns.size(), CB.getName(),
+ &AfterCallBB->front());
+ // Anything that used the result of the function call should now use the
+ // PHI node as their operand.
+ CB.replaceAllUsesWith(PHI);
+ }
+
+ // Loop over all of the return instructions adding entries to the PHI node
+ // as appropriate.
+ if (PHI) {
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ assert(RI->getReturnValue()->getType() == PHI->getType() &&
+ "Ret value not consistent in function!");
+ PHI->addIncoming(RI->getReturnValue(), RI->getParent());
+ }
+ }
+
+ // Add a branch to the merge points and remove return instructions.
+ DebugLoc Loc;
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
+ Loc = RI->getDebugLoc();
+ BI->setDebugLoc(Loc);
+ RI->eraseFromParent();
+ }
+ // We need to set the debug location to *somewhere* inside the
+ // inlined function. The line number may be nonsensical, but the
+ // instruction will at least be associated with the right
+ // function.
+ if (CreatedBranchToNormalDest)
+ CreatedBranchToNormalDest->setDebugLoc(Loc);
+ } else if (!Returns.empty()) {
+ // Otherwise, if there is exactly one return value, just replace anything
+ // using the return value of the call with the computed value.
+ if (!CB.use_empty()) {
+ if (&CB == Returns[0]->getReturnValue())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ else
+ CB.replaceAllUsesWith(Returns[0]->getReturnValue());
+ }
+
+ // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
+ BasicBlock *ReturnBB = Returns[0]->getParent();
+ ReturnBB->replaceAllUsesWith(AfterCallBB);
+
+ // Splice the code from the return block into the block that it will return
+ // to, which contains the code that was after the call.
+ AfterCallBB->getInstList().splice(AfterCallBB->begin(),
+ ReturnBB->getInstList());
+
+ if (CreatedBranchToNormalDest)
+ CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc());
+
+    // Delete the return instruction and the now-empty ReturnBB.
+ Returns[0]->eraseFromParent();
+ ReturnBB->eraseFromParent();
+ } else if (!CB.use_empty()) {
+ // No returns, but something is using the return value of the call. Just
+ // nuke the result.
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ }
+
+ // Since we are now done with the Call/Invoke, we can delete it.
+ CB.eraseFromParent();
+
+ // If we inlined any musttail calls and the original return is now
+ // unreachable, delete it. It can only contain a bitcast and ret.
if (InlinedMustTailCalls && pred_empty(AfterCallBB))
- AfterCallBB->eraseFromParent();
-
- // We should always be able to fold the entry block of the function into the
- // single predecessor of the block...
- assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
- BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
-
- // Splice the code entry block into calling block, right before the
- // unconditional branch.
- CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes
- OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList());
-
- // Remove the unconditional branch.
- OrigBB->getInstList().erase(Br);
-
- // Now we can remove the CalleeEntry block, which is now empty.
- Caller->getBasicBlockList().erase(CalleeEntry);
-
- // If we inserted a phi node, check to see if it has a single value (e.g. all
- // the entries are the same or undef). If so, remove the PHI so it doesn't
- // block other optimizations.
- if (PHI) {
- AssumptionCache *AC =
- IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
- auto &DL = Caller->getParent()->getDataLayout();
- if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) {
- PHI->replaceAllUsesWith(V);
- PHI->eraseFromParent();
- }
- }
-
- return InlineResult::success();
-}
+ AfterCallBB->eraseFromParent();
+
+ // We should always be able to fold the entry block of the function into the
+ // single predecessor of the block...
+ assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
+ BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+
+ // Splice the code entry block into calling block, right before the
+ // unconditional branch.
+ CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes
+ OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList());
+
+ // Remove the unconditional branch.
+ OrigBB->getInstList().erase(Br);
+
+ // Now we can remove the CalleeEntry block, which is now empty.
+ Caller->getBasicBlockList().erase(CalleeEntry);
+
+ // If we inserted a phi node, check to see if it has a single value (e.g. all
+ // the entries are the same or undef). If so, remove the PHI so it doesn't
+ // block other optimizations.
+ if (PHI) {
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
+ auto &DL = Caller->getParent()->getDataLayout();
+ if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) {
+ PHI->replaceAllUsesWith(V);
+ PHI->eraseFromParent();
+ }
+ }
+
+ return InlineResult::success();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp
index ad334034b0..f3499c9c8a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp
@@ -1,35 +1,35 @@
-//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a little utility pass that gives instructions names; this is mostly
-// useful when diffing the effect of an optimization because deleting an
-// unnamed instruction can change all other instruction numbering, making the
-// diff very noisy.
-//
-//===----------------------------------------------------------------------===//
-
+//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that gives instructions names; this is mostly
+// useful when diffing the effect of an optimization because deleting an
+// unnamed instruction can change all other instruction numbering, making the
+// diff very noisy.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/InstructionNamer.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+
+using namespace llvm;
-using namespace llvm;
-
-namespace {
+namespace {
void nameInstructions(Function &F) {
for (auto &Arg : F.args()) {
if (!Arg.hasName())
Arg.setName("arg");
}
-
+
for (BasicBlock &BB : F) {
if (!BB.hasName())
BB.setName("bb");
@@ -37,39 +37,39 @@ void nameInstructions(Function &F) {
for (Instruction &I : BB) {
if (!I.hasName() && !I.getType()->isVoidTy())
I.setName("i");
- }
+ }
}
}
-
+
struct InstNamer : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
InstNamer() : FunctionPass(ID) {
initializeInstNamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
}
-
+
bool runOnFunction(Function &F) override {
nameInstructions(F);
return true;
}
};
-
- char InstNamer::ID = 0;
+
+ char InstNamer::ID = 0;
} // namespace
-
-INITIALIZE_PASS(InstNamer, "instnamer",
- "Assign names to anonymous instructions", false, false)
-char &llvm::InstructionNamerID = InstNamer::ID;
-//===----------------------------------------------------------------------===//
-//
-// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
-//
-FunctionPass *llvm::createInstructionNamerPass() {
- return new InstNamer();
-}
+
+INITIALIZE_PASS(InstNamer, "instnamer",
+ "Assign names to anonymous instructions", false, false)
+char &llvm::InstructionNamerID = InstNamer::ID;
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
+//
+FunctionPass *llvm::createInstructionNamerPass() {
+ return new InstNamer();
+}
PreservedAnalyses InstructionNamerPass::run(Function &F,
FunctionAnalysisManager &FAM) {
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp
index ffb56f2fbe..9082049c82 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp
@@ -1,673 +1,673 @@
-//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains an implementation of 32bit and 64bit scalar integer
-// division for targets that don't have native support. It's largely derived
-// from compiler-rt's implementations of __udivsi3 and __udivmoddi4,
-// but hand-tuned for targets that prefer less control flow.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/IntegerDivision.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "integer-division"
-
-/// Generate code to compute the remainder of two signed integers. Returns the
-/// remainder, which will have the sign of the dividend. Builder's insert point
-/// should be pointing where the caller wants code generated, e.g. at the srem
-/// instruction. This will generate a urem in the process, and Builder's insert
-/// point will be pointing at the uren (if present, i.e. not folded), ready to
-/// be expanded if the user wishes
-static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
- ConstantInt *Shift;
-
- if (BitWidth == 64) {
- Shift = Builder.getInt64(63);
- } else {
- assert(BitWidth == 32 && "Unexpected bit width");
- Shift = Builder.getInt32(31);
- }
-
- // Following instructions are generated for both i32 (shift 31) and
- // i64 (shift 63).
-
- // ; %dividend_sgn = ashr i32 %dividend, 31
- // ; %divisor_sgn = ashr i32 %divisor, 31
- // ; %dvd_xor = xor i32 %dividend, %dividend_sgn
- // ; %dvs_xor = xor i32 %divisor, %divisor_sgn
- // ; %u_dividend = sub i32 %dvd_xor, %dividend_sgn
- // ; %u_divisor = sub i32 %dvs_xor, %divisor_sgn
- // ; %urem = urem i32 %dividend, %divisor
- // ; %xored = xor i32 %urem, %dividend_sgn
- // ; %srem = sub i32 %xored, %dividend_sgn
- Value *DividendSign = Builder.CreateAShr(Dividend, Shift);
- Value *DivisorSign = Builder.CreateAShr(Divisor, Shift);
- Value *DvdXor = Builder.CreateXor(Dividend, DividendSign);
- Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign);
- Value *UDividend = Builder.CreateSub(DvdXor, DividendSign);
- Value *UDivisor = Builder.CreateSub(DvsXor, DivisorSign);
- Value *URem = Builder.CreateURem(UDividend, UDivisor);
- Value *Xored = Builder.CreateXor(URem, DividendSign);
- Value *SRem = Builder.CreateSub(Xored, DividendSign);
-
- if (Instruction *URemInst = dyn_cast<Instruction>(URem))
- Builder.SetInsertPoint(URemInst);
-
- return SRem;
-}
-
-
-/// Generate code to compute the remainder of two unsigned integers. Returns the
-/// remainder. Builder's insert point should be pointing where the caller wants
-/// code generated, e.g. at the urem instruction. This will generate a udiv in
-/// the process, and Builder's insert point will be pointing at the udiv (if
-/// present, i.e. not folded), ready to be expanded if the user wishes
-static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- // Remainder = Dividend - Quotient*Divisor
-
- // Following instructions are generated for both i32 and i64
-
- // ; %quotient = udiv i32 %dividend, %divisor
- // ; %product = mul i32 %divisor, %quotient
- // ; %remainder = sub i32 %dividend, %product
- Value *Quotient = Builder.CreateUDiv(Dividend, Divisor);
- Value *Product = Builder.CreateMul(Divisor, Quotient);
- Value *Remainder = Builder.CreateSub(Dividend, Product);
-
- if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
- Builder.SetInsertPoint(UDiv);
-
- return Remainder;
-}
-
-/// Generate code to divide two signed integers. Returns the quotient, rounded
-/// towards 0. Builder's insert point should be pointing where the caller wants
-/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
-/// the process, and Builder's insert point will be pointing at the udiv (if
-/// present, i.e. not folded), ready to be expanded if the user wishes.
-static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- // Implementation taken from compiler-rt's __divsi3 and __divdi3
-
- unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
- ConstantInt *Shift;
-
- if (BitWidth == 64) {
- Shift = Builder.getInt64(63);
- } else {
- assert(BitWidth == 32 && "Unexpected bit width");
- Shift = Builder.getInt32(31);
- }
-
- // Following instructions are generated for both i32 (shift 31) and
- // i64 (shift 63).
-
- // ; %tmp = ashr i32 %dividend, 31
- // ; %tmp1 = ashr i32 %divisor, 31
- // ; %tmp2 = xor i32 %tmp, %dividend
- // ; %u_dvnd = sub nsw i32 %tmp2, %tmp
- // ; %tmp3 = xor i32 %tmp1, %divisor
- // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1
- // ; %q_sgn = xor i32 %tmp1, %tmp
- // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr
- // ; %tmp4 = xor i32 %q_mag, %q_sgn
- // ; %q = sub i32 %tmp4, %q_sgn
- Value *Tmp = Builder.CreateAShr(Dividend, Shift);
- Value *Tmp1 = Builder.CreateAShr(Divisor, Shift);
- Value *Tmp2 = Builder.CreateXor(Tmp, Dividend);
- Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
- Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor);
- Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
- Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp);
- Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
- Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn);
- Value *Q = Builder.CreateSub(Tmp4, Q_Sgn);
-
- if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
- Builder.SetInsertPoint(UDiv);
-
- return Q;
-}
-
-/// Generates code to divide two unsigned scalar 32-bit or 64-bit integers.
-/// Returns the quotient, rounded towards 0. Builder's insert point should
-/// point where the caller wants code generated, e.g. at the udiv instruction.
-static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- // The basic algorithm can be found in the compiler-rt project's
- // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
- // that's been hand-tuned to lessen the amount of control flow involved.
-
- // Some helper values
- IntegerType *DivTy = cast<IntegerType>(Dividend->getType());
- unsigned BitWidth = DivTy->getBitWidth();
-
- ConstantInt *Zero;
- ConstantInt *One;
- ConstantInt *NegOne;
- ConstantInt *MSB;
-
- if (BitWidth == 64) {
- Zero = Builder.getInt64(0);
- One = Builder.getInt64(1);
- NegOne = ConstantInt::getSigned(DivTy, -1);
- MSB = Builder.getInt64(63);
- } else {
- assert(BitWidth == 32 && "Unexpected bit width");
- Zero = Builder.getInt32(0);
- One = Builder.getInt32(1);
- NegOne = ConstantInt::getSigned(DivTy, -1);
- MSB = Builder.getInt32(31);
- }
-
- ConstantInt *True = Builder.getTrue();
-
- BasicBlock *IBB = Builder.GetInsertBlock();
- Function *F = IBB->getParent();
- Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
- DivTy);
-
- // Our CFG is going to look like:
- // +---------------------+
- // | special-cases |
- // | ... |
- // +---------------------+
- // | |
- // | +----------+
- // | | bb1 |
- // | | ... |
- // | +----------+
- // | | |
- // | | +------------+
- // | | | preheader |
- // | | | ... |
- // | | +------------+
- // | | |
- // | | | +---+
- // | | | | |
- // | | +------------+ |
- // | | | do-while | |
- // | | | ... | |
- // | | +------------+ |
- // | | | | |
- // | +-----------+ +---+
- // | | loop-exit |
- // | | ... |
- // | +-----------+
- // | |
- // +-------+
- // | ... |
- // | end |
- // +-------+
- BasicBlock *SpecialCases = Builder.GetInsertBlock();
- SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
- BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
- "udiv-end");
- BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(),
- "udiv-loop-exit", F, End);
- BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(),
- "udiv-do-while", F, End);
- BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
- "udiv-preheader", F, End);
- BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(),
- "udiv-bb1", F, End);
-
- // We'll be overwriting the terminator to insert our extra blocks
- SpecialCases->getTerminator()->eraseFromParent();
-
- // Same instructions are generated for both i32 (msb 31) and i64 (msb 63).
-
- // First off, check for special cases: dividend or divisor is zero, divisor
- // is greater than dividend, and divisor is 1.
- // ; special-cases:
- // ; %ret0_1 = icmp eq i32 %divisor, 0
- // ; %ret0_2 = icmp eq i32 %dividend, 0
- // ; %ret0_3 = or i1 %ret0_1, %ret0_2
- // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
- // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
- // ; %sr = sub nsw i32 %tmp0, %tmp1
- // ; %ret0_4 = icmp ugt i32 %sr, 31
- // ; %ret0 = or i1 %ret0_3, %ret0_4
- // ; %retDividend = icmp eq i32 %sr, 31
- // ; %retVal = select i1 %ret0, i32 0, i32 %dividend
- // ; %earlyRet = or i1 %ret0, %retDividend
- // ; br i1 %earlyRet, label %end, label %bb1
- Builder.SetInsertPoint(SpecialCases);
- Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero);
- Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero);
- Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2);
- Value *Tmp0 = Builder.CreateCall(CTLZ, {Divisor, True});
- Value *Tmp1 = Builder.CreateCall(CTLZ, {Dividend, True});
- Value *SR = Builder.CreateSub(Tmp0, Tmp1);
- Value *Ret0_4 = Builder.CreateICmpUGT(SR, MSB);
- Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4);
- Value *RetDividend = Builder.CreateICmpEQ(SR, MSB);
- Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend);
- Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend);
- Builder.CreateCondBr(EarlyRet, End, BB1);
-
- // ; bb1: ; preds = %special-cases
- // ; %sr_1 = add i32 %sr, 1
- // ; %tmp2 = sub i32 31, %sr
- // ; %q = shl i32 %dividend, %tmp2
- // ; %skipLoop = icmp eq i32 %sr_1, 0
- // ; br i1 %skipLoop, label %loop-exit, label %preheader
- Builder.SetInsertPoint(BB1);
- Value *SR_1 = Builder.CreateAdd(SR, One);
- Value *Tmp2 = Builder.CreateSub(MSB, SR);
- Value *Q = Builder.CreateShl(Dividend, Tmp2);
- Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
- Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
-
- // ; preheader: ; preds = %bb1
- // ; %tmp3 = lshr i32 %dividend, %sr_1
- // ; %tmp4 = add i32 %divisor, -1
- // ; br label %do-while
- Builder.SetInsertPoint(Preheader);
- Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
- Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
- Builder.CreateBr(DoWhile);
-
- // ; do-while: ; preds = %do-while, %preheader
- // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
- // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
- // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
- // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
- // ; %tmp5 = shl i32 %r_1, 1
- // ; %tmp6 = lshr i32 %q_2, 31
- // ; %tmp7 = or i32 %tmp5, %tmp6
- // ; %tmp8 = shl i32 %q_2, 1
- // ; %q_1 = or i32 %carry_1, %tmp8
- // ; %tmp9 = sub i32 %tmp4, %tmp7
- // ; %tmp10 = ashr i32 %tmp9, 31
- // ; %carry = and i32 %tmp10, 1
- // ; %tmp11 = and i32 %tmp10, %divisor
- // ; %r = sub i32 %tmp7, %tmp11
- // ; %sr_2 = add i32 %sr_3, -1
- // ; %tmp12 = icmp eq i32 %sr_2, 0
- // ; br i1 %tmp12, label %loop-exit, label %do-while
- Builder.SetInsertPoint(DoWhile);
- PHINode *Carry_1 = Builder.CreatePHI(DivTy, 2);
- PHINode *SR_3 = Builder.CreatePHI(DivTy, 2);
- PHINode *R_1 = Builder.CreatePHI(DivTy, 2);
- PHINode *Q_2 = Builder.CreatePHI(DivTy, 2);
- Value *Tmp5 = Builder.CreateShl(R_1, One);
- Value *Tmp6 = Builder.CreateLShr(Q_2, MSB);
- Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6);
- Value *Tmp8 = Builder.CreateShl(Q_2, One);
- Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8);
- Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7);
- Value *Tmp10 = Builder.CreateAShr(Tmp9, MSB);
- Value *Carry = Builder.CreateAnd(Tmp10, One);
- Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
- Value *R = Builder.CreateSub(Tmp7, Tmp11);
- Value *SR_2 = Builder.CreateAdd(SR_3, NegOne);
- Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
- Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
-
- // ; loop-exit: ; preds = %do-while, %bb1
- // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
- // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
- // ; %tmp13 = shl i32 %q_3, 1
- // ; %q_4 = or i32 %carry_2, %tmp13
- // ; br label %end
- Builder.SetInsertPoint(LoopExit);
- PHINode *Carry_2 = Builder.CreatePHI(DivTy, 2);
- PHINode *Q_3 = Builder.CreatePHI(DivTy, 2);
- Value *Tmp13 = Builder.CreateShl(Q_3, One);
- Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13);
- Builder.CreateBr(End);
-
- // ; end: ; preds = %loop-exit, %special-cases
- // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
- // ; ret i32 %q_5
- Builder.SetInsertPoint(End, End->begin());
- PHINode *Q_5 = Builder.CreatePHI(DivTy, 2);
-
- // Populate the Phis, since all values have now been created. Our Phis were:
- // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
- Carry_1->addIncoming(Zero, Preheader);
- Carry_1->addIncoming(Carry, DoWhile);
- // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
- SR_3->addIncoming(SR_1, Preheader);
- SR_3->addIncoming(SR_2, DoWhile);
- // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
- R_1->addIncoming(Tmp3, Preheader);
- R_1->addIncoming(R, DoWhile);
- // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
- Q_2->addIncoming(Q, Preheader);
- Q_2->addIncoming(Q_1, DoWhile);
- // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
- Carry_2->addIncoming(Zero, BB1);
- Carry_2->addIncoming(Carry, DoWhile);
- // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
- Q_3->addIncoming(Q, BB1);
- Q_3->addIncoming(Q_1, DoWhile);
- // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
- Q_5->addIncoming(Q_4, LoopExit);
- Q_5->addIncoming(RetVal, SpecialCases);
-
- return Q_5;
-}
-
-/// Generate code to calculate the remainder of two integers, replacing Rem with
-/// the generated code. This currently generates code using the udiv expansion,
-/// but future work includes generating more specialized code, e.g. when more
-/// information about the operands is known. Implements both 32bit and 64bit
-/// scalar division.
-///
-/// Replace Rem with generated code.
-bool llvm::expandRemainder(BinaryOperator *Rem) {
- assert((Rem->getOpcode() == Instruction::SRem ||
- Rem->getOpcode() == Instruction::URem) &&
- "Trying to expand remainder from a non-remainder function");
-
- IRBuilder<> Builder(Rem);
-
- assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported");
- assert((Rem->getType()->getIntegerBitWidth() == 32 ||
- Rem->getType()->getIntegerBitWidth() == 64) &&
- "Div of bitwidth other than 32 or 64 not supported");
-
- // First prepare the sign if it's a signed remainder
- if (Rem->getOpcode() == Instruction::SRem) {
- Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
- Rem->getOperand(1), Builder);
-
- // Check whether this is the insert point while Rem is still valid.
- bool IsInsertPoint = Rem->getIterator() == Builder.GetInsertPoint();
- Rem->replaceAllUsesWith(Remainder);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- // If we didn't actually generate an urem instruction, we're done
- // This happens for example if the input were constant. In this case the
- // Builder insertion point was unchanged
- if (IsInsertPoint)
- return true;
-
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
- Rem = BO;
- }
-
- Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
- Rem->getOperand(1),
- Builder);
-
- Rem->replaceAllUsesWith(Remainder);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- // Expand the udiv
- if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
- assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
- expandDivision(UDiv);
- }
-
- return true;
-}
-
-
-/// Generate code to divide two integers, replacing Div with the generated
-/// code. This currently generates code similarly to compiler-rt's
-/// implementations, but future work includes generating more specialized code
-/// when more information about the operands is known. Implements both
-/// 32bit and 64bit scalar division.
-///
-/// Replace Div with generated code.
-bool llvm::expandDivision(BinaryOperator *Div) {
- assert((Div->getOpcode() == Instruction::SDiv ||
- Div->getOpcode() == Instruction::UDiv) &&
- "Trying to expand division from a non-division function");
-
- IRBuilder<> Builder(Div);
-
- assert(!Div->getType()->isVectorTy() && "Div over vectors not supported");
- assert((Div->getType()->getIntegerBitWidth() == 32 ||
- Div->getType()->getIntegerBitWidth() == 64) &&
- "Div of bitwidth other than 32 or 64 not supported");
-
- // First prepare the sign if it's a signed division
- if (Div->getOpcode() == Instruction::SDiv) {
- // Lower the code to unsigned division, and reset Div to point to the udiv.
- Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
- Div->getOperand(1), Builder);
-
- // Check whether this is the insert point while Div is still valid.
- bool IsInsertPoint = Div->getIterator() == Builder.GetInsertPoint();
- Div->replaceAllUsesWith(Quotient);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- // If we didn't actually generate an udiv instruction, we're done
- // This happens for example if the input were constant. In this case the
- // Builder insertion point was unchanged
- if (IsInsertPoint)
- return true;
-
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
- Div = BO;
- }
-
- // Insert the unsigned division code
- Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
- Div->getOperand(1),
- Builder);
- Div->replaceAllUsesWith(Quotient);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- return true;
-}
-
-/// Generate code to compute the remainder of two integers of bitwidth up to
-/// 32 bits. Uses the above routines and extends the inputs/truncates the
-/// outputs to operate in 32 bits; that is, these routines are good for targets
-/// that have no or very little support for smaller than 32 bit integer
-/// arithmetic.
-///
-/// Replace Rem with emulation code.
-bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
- assert((Rem->getOpcode() == Instruction::SRem ||
- Rem->getOpcode() == Instruction::URem) &&
- "Trying to expand remainder from a non-remainder function");
-
- Type *RemTy = Rem->getType();
- assert(!RemTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
-
- assert(RemTyBitWidth <= 32 &&
- "Div of bitwidth greater than 32 not supported");
-
- if (RemTyBitWidth == 32)
- return expandRemainder(Rem);
-
- // If bitwidth smaller than 32 extend inputs, extend output and proceed
- // with 32 bit division.
- IRBuilder<> Builder(Rem);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtRem;
- Value *Trunc;
- Type *Int32Ty = Builder.getInt32Ty();
-
- if (Rem->getOpcode() == Instruction::SRem) {
- ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty);
- ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty);
- ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtRem, RemTy);
-
- Rem->replaceAllUsesWith(Trunc);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- return expandRemainder(cast<BinaryOperator>(ExtRem));
-}
-
-/// Generate code to compute the remainder of two integers of bitwidth up to
-/// 64 bits. Uses the above routines and extends the inputs/truncates the
-/// outputs to operate in 64 bits.
-///
-/// Replace Rem with emulation code.
-bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
- assert((Rem->getOpcode() == Instruction::SRem ||
- Rem->getOpcode() == Instruction::URem) &&
- "Trying to expand remainder from a non-remainder function");
-
- Type *RemTy = Rem->getType();
- assert(!RemTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
-
- assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported");
-
- if (RemTyBitWidth == 64)
- return expandRemainder(Rem);
-
- // If bitwidth smaller than 64 extend inputs, extend output and proceed
- // with 64 bit division.
- IRBuilder<> Builder(Rem);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtRem;
- Value *Trunc;
- Type *Int64Ty = Builder.getInt64Ty();
-
- if (Rem->getOpcode() == Instruction::SRem) {
- ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int64Ty);
- ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int64Ty);
- ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtRem, RemTy);
-
- Rem->replaceAllUsesWith(Trunc);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- return expandRemainder(cast<BinaryOperator>(ExtRem));
-}
-
-/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the
-/// above routines and extends the inputs/truncates the outputs to operate
-/// in 32 bits; that is, these routines are good for targets that have no
-/// or very little support for smaller than 32 bit integer arithmetic.
-///
-/// Replace Div with emulation code.
-bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
- assert((Div->getOpcode() == Instruction::SDiv ||
- Div->getOpcode() == Instruction::UDiv) &&
- "Trying to expand division from a non-division function");
-
- Type *DivTy = Div->getType();
- assert(!DivTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
-
- assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported");
-
- if (DivTyBitWidth == 32)
- return expandDivision(Div);
-
- // If bitwidth smaller than 32 extend inputs, extend output and proceed
- // with 32 bit division.
- IRBuilder<> Builder(Div);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtDiv;
- Value *Trunc;
- Type *Int32Ty = Builder.getInt32Ty();
-
- if (Div->getOpcode() == Instruction::SDiv) {
- ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty);
- ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty);
- ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
-
- Div->replaceAllUsesWith(Trunc);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- return expandDivision(cast<BinaryOperator>(ExtDiv));
-}
-
-/// Generate code to divide two integers of bitwidth up to 64 bits. Uses the
-/// above routines and extends the inputs/truncates the outputs to operate
-/// in 64 bits.
-///
-/// Replace Div with emulation code.
-bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {
- assert((Div->getOpcode() == Instruction::SDiv ||
- Div->getOpcode() == Instruction::UDiv) &&
- "Trying to expand division from a non-division function");
-
- Type *DivTy = Div->getType();
- assert(!DivTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
-
- assert(DivTyBitWidth <= 64 &&
- "Div of bitwidth greater than 64 not supported");
-
- if (DivTyBitWidth == 64)
- return expandDivision(Div);
-
- // If bitwidth smaller than 64 extend inputs, extend output and proceed
- // with 64 bit division.
- IRBuilder<> Builder(Div);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtDiv;
- Value *Trunc;
- Type *Int64Ty = Builder.getInt64Ty();
-
- if (Div->getOpcode() == Instruction::SDiv) {
- ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int64Ty);
- ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int64Ty);
- ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
-
- Div->replaceAllUsesWith(Trunc);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- return expandDivision(cast<BinaryOperator>(ExtDiv));
-}
+//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32bit and 64bit scalar integer
+// division for targets that don't have native support. It's largely derived
+// from compiler-rt's implementations of __udivsi3 and __udivmoddi4,
+// but hand-tuned for targets that prefer less control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "integer-division"
+
+/// Generate code to compute the remainder of two signed integers. Returns the
+/// remainder, which will have the sign of the dividend. Builder's insert point
+/// should be pointing where the caller wants code generated, e.g. at the srem
+/// instruction. This will generate a urem in the process, and Builder's insert
+/// point will be pointing at the urem (if present, i.e. not folded), ready to
+/// be expanded if the user wishes
+static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
+ ConstantInt *Shift;
+
+ if (BitWidth == 64) {
+ Shift = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Shift = Builder.getInt32(31);
+ }
+
+ // Following instructions are generated for both i32 (shift 31) and
+ // i64 (shift 63).
+
+ // ; %dividend_sgn = ashr i32 %dividend, 31
+ // ; %divisor_sgn = ashr i32 %divisor, 31
+ // ; %dvd_xor = xor i32 %dividend, %dividend_sgn
+ // ; %dvs_xor = xor i32 %divisor, %divisor_sgn
+ // ; %u_dividend = sub i32 %dvd_xor, %dividend_sgn
+ // ; %u_divisor = sub i32 %dvs_xor, %divisor_sgn
+ // ; %urem = urem i32 %dividend, %divisor
+ // ; %xored = xor i32 %urem, %dividend_sgn
+ // ; %srem = sub i32 %xored, %dividend_sgn
+ Value *DividendSign = Builder.CreateAShr(Dividend, Shift);
+ Value *DivisorSign = Builder.CreateAShr(Divisor, Shift);
+ Value *DvdXor = Builder.CreateXor(Dividend, DividendSign);
+ Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign);
+ Value *UDividend = Builder.CreateSub(DvdXor, DividendSign);
+ Value *UDivisor = Builder.CreateSub(DvsXor, DivisorSign);
+ Value *URem = Builder.CreateURem(UDividend, UDivisor);
+ Value *Xored = Builder.CreateXor(URem, DividendSign);
+ Value *SRem = Builder.CreateSub(Xored, DividendSign);
+
+ if (Instruction *URemInst = dyn_cast<Instruction>(URem))
+ Builder.SetInsertPoint(URemInst);
+
+ return SRem;
+}
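+
+// A worked i32 example of the sequence above (purely illustrative; nothing
+// below is emitted by this routine): for dividend = -7, divisor = 3,
+// ;   %dividend_sgn = -1,  %divisor_sgn = 0
+// ;   %u_dividend   = (-7 xor -1) - (-1) = 7,  %u_divisor = 3
+// ;   %urem         = 7 urem 3 = 1
+// ;   %srem         = (1 xor -1) - (-1) = -1
+// so -7 srem 3 == -1, carrying the sign of the dividend as documented.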
+
+
+/// Generate code to compute the remainder of two unsigned integers. Returns the
+/// remainder. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the urem instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes
+static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Remainder = Dividend - Quotient*Divisor
+
+ // Following instructions are generated for both i32 and i64
+
+ // ; %quotient = udiv i32 %dividend, %divisor
+ // ; %product = mul i32 %divisor, %quotient
+ // ; %remainder = sub i32 %dividend, %product
+ Value *Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ Value *Product = Builder.CreateMul(Divisor, Quotient);
+ Value *Remainder = Builder.CreateSub(Dividend, Product);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
+ Builder.SetInsertPoint(UDiv);
+
+ return Remainder;
+}
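+
+// For example (illustrative only): with dividend = 29 and divisor = 5 the
+// sequence above computes %quotient = 5, %product = 25 and %remainder = 4,
+// i.e. 29 urem 5 == 4; only the udiv is left for the caller to expand.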
+
+/// Generate code to divide two signed integers. Returns the quotient, rounded
+/// towards 0. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Implementation taken from compiler-rt's __divsi3 and __divdi3
+
+ unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
+ ConstantInt *Shift;
+
+ if (BitWidth == 64) {
+ Shift = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Shift = Builder.getInt32(31);
+ }
+
+ // Following instructions are generated for both i32 (shift 31) and
+ // i64 (shift 63).
+
+ // ; %tmp = ashr i32 %dividend, 31
+ // ; %tmp1 = ashr i32 %divisor, 31
+ // ; %tmp2 = xor i32 %tmp, %dividend
+ // ; %u_dvnd = sub nsw i32 %tmp2, %tmp
+ // ; %tmp3 = xor i32 %tmp1, %divisor
+ // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1
+ // ; %q_sgn = xor i32 %tmp1, %tmp
+ // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr
+ // ; %tmp4 = xor i32 %q_mag, %q_sgn
+ // ; %q = sub i32 %tmp4, %q_sgn
+ Value *Tmp = Builder.CreateAShr(Dividend, Shift);
+ Value *Tmp1 = Builder.CreateAShr(Divisor, Shift);
+ Value *Tmp2 = Builder.CreateXor(Tmp, Dividend);
+ Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
+ Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor);
+ Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
+ Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp);
+ Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
+ Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn);
+ Value *Q = Builder.CreateSub(Tmp4, Q_Sgn);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
+ Builder.SetInsertPoint(UDiv);
+
+ return Q;
+}
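+
+// A worked i32 example of the sign handling above (purely illustrative): for
+// dividend = -7, divisor = 3,
+// ;   %tmp = -1,  %tmp1 = 0
+// ;   %u_dvnd = 7,  %u_dvsr = 3,  %q_sgn = -1
+// ;   %q_mag  = 7 udiv 3 = 2
+// ;   %q      = (2 xor -1) - (-1) = -2
+// so -7 sdiv 3 == -2, rounded towards zero.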
+
+/// Generates code to divide two unsigned scalar 32-bit or 64-bit integers.
+/// Returns the quotient, rounded towards 0. Builder's insert point should
+/// point where the caller wants code generated, e.g. at the udiv instruction.
+static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // The basic algorithm can be found in the compiler-rt project's
+ // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
+ // that's been hand-tuned to lessen the amount of control flow involved.
+
+ // Some helper values
+ IntegerType *DivTy = cast<IntegerType>(Dividend->getType());
+ unsigned BitWidth = DivTy->getBitWidth();
+
+ ConstantInt *Zero;
+ ConstantInt *One;
+ ConstantInt *NegOne;
+ ConstantInt *MSB;
+
+ if (BitWidth == 64) {
+ Zero = Builder.getInt64(0);
+ One = Builder.getInt64(1);
+ NegOne = ConstantInt::getSigned(DivTy, -1);
+ MSB = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Zero = Builder.getInt32(0);
+ One = Builder.getInt32(1);
+ NegOne = ConstantInt::getSigned(DivTy, -1);
+ MSB = Builder.getInt32(31);
+ }
+
+ ConstantInt *True = Builder.getTrue();
+
+ BasicBlock *IBB = Builder.GetInsertBlock();
+ Function *F = IBB->getParent();
+ Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ DivTy);
+
+ // Our CFG is going to look like:
+ // +---------------------+
+ // | special-cases |
+ // | ... |
+ // +---------------------+
+ // | |
+ // | +----------+
+ // | | bb1 |
+ // | | ... |
+ // | +----------+
+ // | | |
+ // | | +------------+
+ // | | | preheader |
+ // | | | ... |
+ // | | +------------+
+ // | | |
+ // | | | +---+
+ // | | | | |
+ // | | +------------+ |
+ // | | | do-while | |
+ // | | | ... | |
+ // | | +------------+ |
+ // | | | | |
+ // | +-----------+ +---+
+ // | | loop-exit |
+ // | | ... |
+ // | +-----------+
+ // | |
+ // +-------+
+ // | ... |
+ // | end |
+ // +-------+
+ BasicBlock *SpecialCases = Builder.GetInsertBlock();
+ SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
+ BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
+ "udiv-end");
+ BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(),
+ "udiv-loop-exit", F, End);
+ BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(),
+ "udiv-do-while", F, End);
+ BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
+ "udiv-preheader", F, End);
+ BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(),
+ "udiv-bb1", F, End);
+
+ // We'll be overwriting the terminator to insert our extra blocks
+ SpecialCases->getTerminator()->eraseFromParent();
+
+ // Same instructions are generated for both i32 (msb 31) and i64 (msb 63).
+
+ // First off, check for special cases: dividend or divisor is zero, divisor
+ // is greater than dividend, and divisor is 1.
+ // ; special-cases:
+ // ; %ret0_1 = icmp eq i32 %divisor, 0
+ // ; %ret0_2 = icmp eq i32 %dividend, 0
+ // ; %ret0_3 = or i1 %ret0_1, %ret0_2
+ // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
+ // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
+ // ; %sr = sub nsw i32 %tmp0, %tmp1
+ // ; %ret0_4 = icmp ugt i32 %sr, 31
+ // ; %ret0 = or i1 %ret0_3, %ret0_4
+ // ; %retDividend = icmp eq i32 %sr, 31
+ // ; %retVal = select i1 %ret0, i32 0, i32 %dividend
+ // ; %earlyRet = or i1 %ret0, %retDividend
+ // ; br i1 %earlyRet, label %end, label %bb1
+ Builder.SetInsertPoint(SpecialCases);
+ Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero);
+ Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero);
+ Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2);
+ Value *Tmp0 = Builder.CreateCall(CTLZ, {Divisor, True});
+ Value *Tmp1 = Builder.CreateCall(CTLZ, {Dividend, True});
+ Value *SR = Builder.CreateSub(Tmp0, Tmp1);
+ Value *Ret0_4 = Builder.CreateICmpUGT(SR, MSB);
+ Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4);
+ Value *RetDividend = Builder.CreateICmpEQ(SR, MSB);
+ Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend);
+ Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend);
+ Builder.CreateCondBr(EarlyRet, End, BB1);
+
+ // ; bb1: ; preds = %special-cases
+ // ; %sr_1 = add i32 %sr, 1
+ // ; %tmp2 = sub i32 31, %sr
+ // ; %q = shl i32 %dividend, %tmp2
+ // ; %skipLoop = icmp eq i32 %sr_1, 0
+ // ; br i1 %skipLoop, label %loop-exit, label %preheader
+ Builder.SetInsertPoint(BB1);
+ Value *SR_1 = Builder.CreateAdd(SR, One);
+ Value *Tmp2 = Builder.CreateSub(MSB, SR);
+ Value *Q = Builder.CreateShl(Dividend, Tmp2);
+ Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
+ Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+
+ // ; preheader: ; preds = %bb1
+ // ; %tmp3 = lshr i32 %dividend, %sr_1
+ // ; %tmp4 = add i32 %divisor, -1
+ // ; br label %do-while
+ Builder.SetInsertPoint(Preheader);
+ Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
+ Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
+ Builder.CreateBr(DoWhile);
+
+ // ; do-while: ; preds = %do-while, %preheader
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ // ; %tmp5 = shl i32 %r_1, 1
+ // ; %tmp6 = lshr i32 %q_2, 31
+ // ; %tmp7 = or i32 %tmp5, %tmp6
+ // ; %tmp8 = shl i32 %q_2, 1
+ // ; %q_1 = or i32 %carry_1, %tmp8
+ // ; %tmp9 = sub i32 %tmp4, %tmp7
+ // ; %tmp10 = ashr i32 %tmp9, 31
+ // ; %carry = and i32 %tmp10, 1
+ // ; %tmp11 = and i32 %tmp10, %divisor
+ // ; %r = sub i32 %tmp7, %tmp11
+ // ; %sr_2 = add i32 %sr_3, -1
+ // ; %tmp12 = icmp eq i32 %sr_2, 0
+ // ; br i1 %tmp12, label %loop-exit, label %do-while
+ Builder.SetInsertPoint(DoWhile);
+ PHINode *Carry_1 = Builder.CreatePHI(DivTy, 2);
+ PHINode *SR_3 = Builder.CreatePHI(DivTy, 2);
+ PHINode *R_1 = Builder.CreatePHI(DivTy, 2);
+ PHINode *Q_2 = Builder.CreatePHI(DivTy, 2);
+ Value *Tmp5 = Builder.CreateShl(R_1, One);
+ Value *Tmp6 = Builder.CreateLShr(Q_2, MSB);
+ Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6);
+ Value *Tmp8 = Builder.CreateShl(Q_2, One);
+ Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8);
+ Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7);
+ Value *Tmp10 = Builder.CreateAShr(Tmp9, MSB);
+ Value *Carry = Builder.CreateAnd(Tmp10, One);
+ Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
+ Value *R = Builder.CreateSub(Tmp7, Tmp11);
+ Value *SR_2 = Builder.CreateAdd(SR_3, NegOne);
+ Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
+ Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+
+ // ; loop-exit: ; preds = %do-while, %bb1
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ // ; %tmp13 = shl i32 %q_3, 1
+ // ; %q_4 = or i32 %carry_2, %tmp13
+ // ; br label %end
+ Builder.SetInsertPoint(LoopExit);
+ PHINode *Carry_2 = Builder.CreatePHI(DivTy, 2);
+ PHINode *Q_3 = Builder.CreatePHI(DivTy, 2);
+ Value *Tmp13 = Builder.CreateShl(Q_3, One);
+ Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13);
+ Builder.CreateBr(End);
+
+ // ; end: ; preds = %loop-exit, %special-cases
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ // ; ret i32 %q_5
+ Builder.SetInsertPoint(End, End->begin());
+ PHINode *Q_5 = Builder.CreatePHI(DivTy, 2);
+
+ // Populate the Phis, since all values have now been created. Our Phis were:
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ Carry_1->addIncoming(Zero, Preheader);
+ Carry_1->addIncoming(Carry, DoWhile);
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ SR_3->addIncoming(SR_1, Preheader);
+ SR_3->addIncoming(SR_2, DoWhile);
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ R_1->addIncoming(Tmp3, Preheader);
+ R_1->addIncoming(R, DoWhile);
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ Q_2->addIncoming(Q, Preheader);
+ Q_2->addIncoming(Q_1, DoWhile);
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ Carry_2->addIncoming(Zero, BB1);
+ Carry_2->addIncoming(Carry, DoWhile);
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ Q_3->addIncoming(Q, BB1);
+ Q_3->addIncoming(Q_1, DoWhile);
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ Q_5->addIncoming(Q_4, LoopExit);
+ Q_5->addIncoming(RetVal, SpecialCases);
+
+ return Q_5;
+}
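+
+// The control flow built above is easier to follow as a plain C sketch of the
+// same shift-and-subtract algorithm (a hand-written illustration of the i32
+// case, not code taken from compiler-rt or from this file):
+//
+//   uint32_t udiv32_sketch(uint32_t n, uint32_t d) {
+//     if (n == 0 || d == 0) return 0;          // special-cases block
+//     uint32_t sr = __builtin_clz(d) - __builtin_clz(n);
+//     if (sr > 31) return 0;                   // d > n: unsigned wrap made sr huge
+//     if (sr == 31) return n;                  // d == 1 and n uses bit 31
+//     ++sr;                                    // bb1 / preheader
+//     uint32_t q = n << (32 - sr), r = n >> sr, carry = 0;
+//     for (uint32_t i = sr; i != 0; --i) {     // do-while block
+//       r = (r << 1) | (q >> 31);              // shift the r:q pair left by one
+//       q = (q << 1) | carry;
+//       carry = (r >= d);                      // quotient bit for this step
+//       if (carry) r -= d;
+//     }
+//     return (q << 1) | carry;                 // loop-exit block
+//   }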
+
+/// Generate code to calculate the remainder of two integers, replacing Rem with
+/// the generated code. This currently generates code using the udiv expansion,
+/// but future work includes generating more specialized code, e.g. when more
+/// information about the operands is known. Implements both 32bit and 64bit
+/// scalar division.
+///
+/// Replace Rem with generated code.
+bool llvm::expandRemainder(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ IRBuilder<> Builder(Rem);
+
+ assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported");
+ assert((Rem->getType()->getIntegerBitWidth() == 32 ||
+ Rem->getType()->getIntegerBitWidth() == 64) &&
+ "Div of bitwidth other than 32 or 64 not supported");
+
+ // First prepare the sign if it's a signed remainder
+ if (Rem->getOpcode() == Instruction::SRem) {
+ Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1), Builder);
+
+ // Check whether this is the insert point while Rem is still valid.
+ bool IsInsertPoint = Rem->getIterator() == Builder.GetInsertPoint();
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // If we didn't actually generate an urem instruction, we're done
+ // This happens for example if the input were constant. In this case the
+ // Builder insertion point was unchanged
+ if (IsInsertPoint)
+ return true;
+
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ Rem = BO;
+ }
+
+ Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1),
+ Builder);
+
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // Expand the udiv
+ if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
+ assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
+ expandDivision(UDiv);
+ }
+
+ return true;
+}
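+
+// A minimal usage sketch for a hypothetical caller (assumes a Function &F whose
+// remainder instructions are all scalar i32/i64, and llvm/IR/InstIterator.h for
+// instructions()):
+//
+//   SmallVector<BinaryOperator *, 8> Rems;
+//   for (Instruction &I : instructions(F))
+//     if (auto *BO = dyn_cast<BinaryOperator>(&I))
+//       if (BO->getOpcode() == Instruction::SRem ||
+//           BO->getOpcode() == Instruction::URem)
+//         Rems.push_back(BO);        // collect first; expansion erases the rem
+//   for (BinaryOperator *Rem : Rems)
+//     expandRemainder(Rem);          // Rem is replaced and deleted here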
+
+
+/// Generate code to divide two integers, replacing Div with the generated
+/// code. This currently generates code similarly to compiler-rt's
+/// implementations, but future work includes generating more specialized code
+/// when more information about the operands is known. Implements both
+/// 32bit and 64bit scalar division.
+///
+/// Replace Div with generated code.
+bool llvm::expandDivision(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ IRBuilder<> Builder(Div);
+
+ assert(!Div->getType()->isVectorTy() && "Div over vectors not supported");
+ assert((Div->getType()->getIntegerBitWidth() == 32 ||
+ Div->getType()->getIntegerBitWidth() == 64) &&
+ "Div of bitwidth other than 32 or 64 not supported");
+
+ // First prepare the sign if it's a signed division
+ if (Div->getOpcode() == Instruction::SDiv) {
+ // Lower the code to unsigned division, and reset Div to point to the udiv.
+ Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1), Builder);
+
+ // Check whether this is the insert point while Div is still valid.
+ bool IsInsertPoint = Div->getIterator() == Builder.GetInsertPoint();
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ // If we didn't actually generate an udiv instruction, we're done
+ // This happens for example if the input were constant. In this case the
+ // Builder insertion point was unchanged
+ if (IsInsertPoint)
+ return true;
+
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ Div = BO;
+ }
+
+ // Insert the unsigned division code
+ Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1),
+ Builder);
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return true;
+}
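+
+// For instance (illustrative only), a lone
+// ;   %q = sdiv i32 %a, %b
+// is rewritten into the sign-stripping sequence from generateSignedDivisionCode
+// followed by the full shift-and-subtract expansion of the resulting udiv, and
+// the original sdiv is erased.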
+
+/// Generate code to compute the remainder of two integers of bitwidth up to
+/// 32 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 32 bits; that is, these routines are good for targets
+/// that have no or very little support for smaller than 32 bit integer
+/// arithmetic.
+///
+/// Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ Type *RemTy = Rem->getType();
+ assert(!RemTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+ assert(RemTyBitWidth <= 32 &&
+ "Div of bitwidth greater than 32 not supported");
+
+ if (RemTyBitWidth == 32)
+ return expandRemainder(Rem);
+
+ // If bitwidth smaller than 32 extend inputs, extend output and proceed
+ // with 32 bit division.
+ IRBuilder<> Builder(Rem);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtRem;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Rem->getOpcode() == Instruction::SRem) {
+ ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+ Rem->replaceAllUsesWith(Trunc);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
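+
+// For example (illustrative only, with hand-picked names; the builder chooses
+// its own), an i8 remainder
+// ;   %r = urem i8 %a, %b
+// becomes
+// ;   %a32 = zext i8 %a to i32
+// ;   %b32 = zext i8 %b to i32
+// ;   %r32 = urem i32 %a32, %b32    ; subsequently expanded by expandRemainder
+// ;   %r8  = trunc i32 %r32 to i8
+// with all uses of the original %r rewritten to the truncated value.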
+
+/// Generate code to compute the remainder of two integers of bitwidth up to
+/// 64 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 64 bits.
+///
+/// Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ Type *RemTy = Rem->getType();
+ assert(!RemTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+ assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported");
+
+ if (RemTyBitWidth == 64)
+ return expandRemainder(Rem);
+
+ // If bitwidth smaller than 64 extend inputs, extend output and proceed
+ // with 64 bit division.
+ IRBuilder<> Builder(Rem);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtRem;
+ Value *Trunc;
+ Type *Int64Ty = Builder.getInt64Ty();
+
+ if (Rem->getOpcode() == Instruction::SRem) {
+ ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int64Ty);
+ ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int64Ty);
+ ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+ Rem->replaceAllUsesWith(Trunc);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
+
+/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 32 bits; that is, these routines are good for targets that have no
+/// or very little support for smaller than 32 bit integer arithmetic.
+///
+/// Replace Div with emulation code.
+bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ Type *DivTy = Div->getType();
+ assert(!DivTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+ assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported");
+
+ if (DivTyBitWidth == 32)
+ return expandDivision(Div);
+
+ // If bitwidth smaller than 32 extend inputs, extend output and proceed
+ // with 32 bit division.
+ IRBuilder<> Builder(Div);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtDiv;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Div->getOpcode() == Instruction::SDiv) {
+ ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+ Div->replaceAllUsesWith(Trunc);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
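+
+// For example (illustrative only, with hand-picked names), an i16 division
+// ;   %q = sdiv i16 %a, %b
+// becomes
+// ;   %a32 = sext i16 %a to i32
+// ;   %b32 = sext i16 %b to i32
+// ;   %q32 = sdiv i32 %a32, %b32    ; subsequently expanded by expandDivision
+// ;   %q16 = trunc i32 %q32 to i16
+// with all uses of the original %q rewritten to the truncated value.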
+
+/// Generate code to divide two integers of bitwidth up to 64 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 64 bits.
+///
+/// Replace Div with emulation code.
+bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ Type *DivTy = Div->getType();
+ assert(!DivTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+ assert(DivTyBitWidth <= 64 &&
+ "Div of bitwidth greater than 64 not supported");
+
+ if (DivTyBitWidth == 64)
+ return expandDivision(Div);
+
+ // If bitwidth smaller than 64 extend inputs, extend output and proceed
+ // with 64 bit division.
+ IRBuilder<> Builder(Div);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtDiv;
+ Value *Trunc;
+ Type *Int64Ty = Builder.getInt64Ty();
+
+ if (Div->getOpcode() == Instruction::SDiv) {
+ ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int64Ty);
+ ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int64Ty);
+ ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+ Div->replaceAllUsesWith(Trunc);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp
index c632f11e46..7437701f53 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp
@@ -1,277 +1,277 @@
-//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass transforms loops by placing phi nodes at the end of the loops for
-// all values that are live across the loop boundary. For example, it turns
-// the left into the right code:
-//
-// for (...) for (...)
-// if (c) if (c)
-// X1 = ... X1 = ...
-// else else
-// X2 = ... X2 = ...
-// X3 = phi(X1, X2) X3 = phi(X1, X2)
-// ... = X3 + 4 X4 = phi(X3)
-// ... = X4 + 4
-//
-// This is still valid LLVM; the extra phi nodes are purely redundant, and will
-// be trivially eliminated by InstCombine. The major benefit of this
-// transformation is that it makes many other loop optimizations, such as
-// LoopUnswitching, simpler.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LCSSA.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops by placing phi nodes at the end of the loops for
+// all values that are live across the loop boundary. For example, it turns
+// the left into the right code:
+//
+// for (...) for (...)
+// if (c) if (c)
+// X1 = ... X1 = ...
+// else else
+// X2 = ... X2 = ...
+// X3 = phi(X1, X2) X3 = phi(X1, X2)
+// ... = X3 + 4 X4 = phi(X3)
+// ... = X4 + 4
+//
+// This is still valid LLVM; the extra phi nodes are purely redundant, and will
+// be trivially eliminated by InstCombine. The major benefit of this
+// transformation is that it makes many other loop optimizations, such as
+// LoopUnswitching, simpler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LCSSA.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PredIteratorCache.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "lcssa"
-
-STATISTIC(NumLCSSA, "Number of live out of a loop variables");
-
-#ifdef EXPENSIVE_CHECKS
-static bool VerifyLoopLCSSA = true;
-#else
-static bool VerifyLoopLCSSA = false;
-#endif
-static cl::opt<bool, true>
- VerifyLoopLCSSAFlag("verify-loop-lcssa", cl::location(VerifyLoopLCSSA),
- cl::Hidden,
- cl::desc("Verify loop lcssa form (time consuming)"));
-
-/// Return true if the specified block is in the list.
-static bool isExitBlock(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- return is_contained(ExitBlocks, BB);
-}
-
-/// For every instruction from the worklist, check to see if it has any uses
-/// that are outside the current loop. If so, insert LCSSA PHI nodes and
-/// rewrite the uses.
-bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
- const DominatorTree &DT, const LoopInfo &LI,
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lcssa"
+
+STATISTIC(NumLCSSA, "Number of live out of a loop variables");
+
+#ifdef EXPENSIVE_CHECKS
+static bool VerifyLoopLCSSA = true;
+#else
+static bool VerifyLoopLCSSA = false;
+#endif
+static cl::opt<bool, true>
+ VerifyLoopLCSSAFlag("verify-loop-lcssa", cl::location(VerifyLoopLCSSA),
+ cl::Hidden,
+ cl::desc("Verify loop lcssa form (time consuming)"));
+
+/// Return true if the specified block is in the list.
+static bool isExitBlock(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ return is_contained(ExitBlocks, BB);
+}
+
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop. If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
+ const DominatorTree &DT, const LoopInfo &LI,
ScalarEvolution *SE, IRBuilderBase &Builder,
SmallVectorImpl<PHINode *> *PHIsToRemove) {
- SmallVector<Use *, 16> UsesToRewrite;
+ SmallVector<Use *, 16> UsesToRewrite;
SmallSetVector<PHINode *, 16> LocalPHIsToRemove;
- PredIteratorCache PredCache;
- bool Changed = false;
-
+ PredIteratorCache PredCache;
+ bool Changed = false;
+
IRBuilderBase::InsertPointGuard InsertPtGuard(Builder);
- // Cache the Loop ExitBlocks across this loop. We expect to get a lot of
- // instructions within the same loops, computing the exit blocks is
- // expensive, and we're not mutating the loop structure.
- SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
-
- while (!Worklist.empty()) {
- UsesToRewrite.clear();
-
- Instruction *I = Worklist.pop_back_val();
- assert(!I->getType()->isTokenTy() && "Tokens shouldn't be in the worklist");
- BasicBlock *InstBB = I->getParent();
- Loop *L = LI.getLoopFor(InstBB);
- assert(L && "Instruction belongs to a BB that's not part of a loop");
- if (!LoopExitBlocks.count(L))
- L->getExitBlocks(LoopExitBlocks[L]);
- assert(LoopExitBlocks.count(L));
- const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
-
- if (ExitBlocks.empty())
- continue;
-
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- BasicBlock *UserBB = User->getParent();
+ // Cache the Loop ExitBlocks across this loop. We expect to get a lot of
+ // instructions within the same loops, computing the exit blocks is
+ // expensive, and we're not mutating the loop structure.
+ SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
+
+ while (!Worklist.empty()) {
+ UsesToRewrite.clear();
+
+ Instruction *I = Worklist.pop_back_val();
+ assert(!I->getType()->isTokenTy() && "Tokens shouldn't be in the worklist");
+ BasicBlock *InstBB = I->getParent();
+ Loop *L = LI.getLoopFor(InstBB);
+ assert(L && "Instruction belongs to a BB that's not part of a loop");
+ if (!LoopExitBlocks.count(L))
+ L->getExitBlocks(LoopExitBlocks[L]);
+ assert(LoopExitBlocks.count(L));
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
+
+ if (ExitBlocks.empty())
+ continue;
+
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UserBB = User->getParent();
// For practical purposes, we consider that the use in a PHI
// occurs in the respective predecessor block. For more info,
// see the `phi` doc in LangRef and the LCSSA doc.
- if (auto *PN = dyn_cast<PHINode>(User))
- UserBB = PN->getIncomingBlock(U);
-
- if (InstBB != UserBB && !L->contains(UserBB))
- UsesToRewrite.push_back(&U);
- }
-
- // If there are no uses outside the loop, exit with no change.
- if (UsesToRewrite.empty())
- continue;
-
- ++NumLCSSA; // We are applying the transformation
-
- // Invoke instructions are special in that their result value is not
- // available along their unwind edge. The code below tests to see whether
- // DomBB dominates the value, so adjust DomBB to the normal destination
- // block, which is effectively where the value is first usable.
- BasicBlock *DomBB = InstBB;
- if (auto *Inv = dyn_cast<InvokeInst>(I))
- DomBB = Inv->getNormalDest();
-
- const DomTreeNode *DomNode = DT.getNode(DomBB);
-
- SmallVector<PHINode *, 16> AddedPHIs;
- SmallVector<PHINode *, 8> PostProcessPHIs;
-
- SmallVector<PHINode *, 4> InsertedPHIs;
- SSAUpdater SSAUpdate(&InsertedPHIs);
- SSAUpdate.Initialize(I->getType(), I->getName());
-
- // Force re-computation of I, as some users now need to use the new PHI
- // node.
- if (SE)
- SE->forgetValue(I);
-
- // Insert the LCSSA phi's into all of the exit blocks dominated by the
- // value, and add them to the Phi's map.
- for (BasicBlock *ExitBB : ExitBlocks) {
- if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
- continue;
-
- // If we already inserted something for this BB, don't reprocess it.
- if (SSAUpdate.HasValueForBlock(ExitBB))
- continue;
+ if (auto *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(U);
+
+ if (InstBB != UserBB && !L->contains(UserBB))
+ UsesToRewrite.push_back(&U);
+ }
+
+ // If there are no uses outside the loop, exit with no change.
+ if (UsesToRewrite.empty())
+ continue;
+
+ ++NumLCSSA; // We are applying the transformation
+
+ // Invoke instructions are special in that their result value is not
+ // available along their unwind edge. The code below tests to see whether
+ // DomBB dominates the value, so adjust DomBB to the normal destination
+ // block, which is effectively where the value is first usable.
+ BasicBlock *DomBB = InstBB;
+ if (auto *Inv = dyn_cast<InvokeInst>(I))
+ DomBB = Inv->getNormalDest();
+
+ const DomTreeNode *DomNode = DT.getNode(DomBB);
+
+ SmallVector<PHINode *, 16> AddedPHIs;
+ SmallVector<PHINode *, 8> PostProcessPHIs;
+
+ SmallVector<PHINode *, 4> InsertedPHIs;
+ SSAUpdater SSAUpdate(&InsertedPHIs);
+ SSAUpdate.Initialize(I->getType(), I->getName());
+
+ // Force re-computation of I, as some users now need to use the new PHI
+ // node.
+ if (SE)
+ SE->forgetValue(I);
+
+ // Insert the LCSSA phi's into all of the exit blocks dominated by the
+ // value, and add them to the Phi's map.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
+ continue;
+
+ // If we already inserted something for this BB, don't reprocess it.
+ if (SSAUpdate.HasValueForBlock(ExitBB))
+ continue;
Builder.SetInsertPoint(&ExitBB->front());
PHINode *PN = Builder.CreatePHI(I->getType(), PredCache.size(ExitBB),
I->getName() + ".lcssa");
- // Get the debug location from the original instruction.
- PN->setDebugLoc(I->getDebugLoc());
+ // Get the debug location from the original instruction.
+ PN->setDebugLoc(I->getDebugLoc());
// Add inputs from inside the loop for this PHI. This is valid
// because `I` dominates `ExitBB` (checked above). This implies
// that every incoming block/edge is dominated by `I` as well,
// i.e. we can add uses of `I` to those incoming edges/append to the incoming
// blocks without violating the SSA dominance property.
- for (BasicBlock *Pred : PredCache.get(ExitBB)) {
- PN->addIncoming(I, Pred);
-
- // If the exit block has a predecessor not within the loop, arrange for
- // the incoming value use corresponding to that predecessor to be
- // rewritten in terms of a different LCSSA PHI.
- if (!L->contains(Pred))
- UsesToRewrite.push_back(
- &PN->getOperandUse(PN->getOperandNumForIncomingValue(
- PN->getNumIncomingValues() - 1)));
- }
-
- AddedPHIs.push_back(PN);
-
- // Remember that this phi makes the value alive in this block.
- SSAUpdate.AddAvailableValue(ExitBB, PN);
-
- // LoopSimplify might fail to simplify some loops (e.g. when indirect
- // branches are involved). In such situations, it might happen that an
- // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we
- // create PHIs in such an exit block, we are also inserting PHIs into L2's
- // can also have uses outside of L2. Remember all PHIs in such situations
- // so we can revisit them later on. FIXME: Remove this if indirectbr support
- // in LoopSimplify gets improved.
- // into LoopSimplify gets improved.
- if (auto *OtherLoop = LI.getLoopFor(ExitBB))
- if (!L->contains(OtherLoop))
- PostProcessPHIs.push_back(PN);
- }
-
- // Rewrite all uses outside the loop in terms of the new PHIs we just
- // inserted.
- for (Use *UseToRewrite : UsesToRewrite) {
- Instruction *User = cast<Instruction>(UseToRewrite->getUser());
- BasicBlock *UserBB = User->getParent();
+ for (BasicBlock *Pred : PredCache.get(ExitBB)) {
+ PN->addIncoming(I, Pred);
+
+ // If the exit block has a predecessor not within the loop, arrange for
+ // the incoming value use corresponding to that predecessor to be
+ // rewritten in terms of a different LCSSA PHI.
+ if (!L->contains(Pred))
+ UsesToRewrite.push_back(
+ &PN->getOperandUse(PN->getOperandNumForIncomingValue(
+ PN->getNumIncomingValues() - 1)));
+ }
+
+ AddedPHIs.push_back(PN);
+
+ // Remember that this phi makes the value alive in this block.
+ SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+ // LoopSimplify might fail to simplify some loops (e.g. when indirect
+ // branches are involved). In such situations, it might happen that an
+ // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we
+ // create PHIs in such an exit block, we are also inserting PHIs into L2's
+ // can also have uses outside of L2. Remember all PHIs in such situations
+ // so we can revisit them later on. FIXME: Remove this if indirectbr support
+ // in LoopSimplify gets improved.
+ // into LoopSimplify gets improved.
+ if (auto *OtherLoop = LI.getLoopFor(ExitBB))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(PN);
+ }
+
+ // Rewrite all uses outside the loop in terms of the new PHIs we just
+ // inserted.
+ for (Use *UseToRewrite : UsesToRewrite) {
+ Instruction *User = cast<Instruction>(UseToRewrite->getUser());
+ BasicBlock *UserBB = User->getParent();
// For practical purposes, we consider that the use in a PHI
// occurs in the respective predecessor block. For more info,
// see the `phi` doc in LangRef and the LCSSA doc.
- if (auto *PN = dyn_cast<PHINode>(User))
- UserBB = PN->getIncomingBlock(*UseToRewrite);
-
+ if (auto *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(*UseToRewrite);
+
// If this use is in an exit block, rewrite to use the newly inserted PHI.
// This is required for correctness because SSAUpdate doesn't handle uses
// in the same block. It assumes the PHI we inserted is at the end of the
// block.
- if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
- UseToRewrite->set(&UserBB->front());
- continue;
- }
-
- // If we added a single PHI, it must dominate all uses and we can directly
- // rename it.
- if (AddedPHIs.size() == 1) {
- UseToRewrite->set(AddedPHIs[0]);
- continue;
- }
-
- // Otherwise, do full PHI insertion.
- SSAUpdate.RewriteUse(*UseToRewrite);
- }
-
- SmallVector<DbgValueInst *, 4> DbgValues;
- llvm::findDbgValues(DbgValues, I);
-
- // Update pre-existing debug value uses that reside outside the loop.
- auto &Ctx = I->getContext();
- for (auto DVI : DbgValues) {
- BasicBlock *UserBB = DVI->getParent();
- if (InstBB == UserBB || L->contains(UserBB))
- continue;
- // We currently only handle debug values residing in blocks that were
- // traversed while rewriting the uses. If we inserted just a single PHI,
- // we will handle all relevant debug values.
- Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
- : SSAUpdate.FindValueForBlock(UserBB);
- if (V)
- DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V)));
- }
-
- // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
- // to post-process them to keep LCSSA form.
- for (PHINode *InsertedPN : InsertedPHIs) {
- if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent()))
- if (!L->contains(OtherLoop))
- PostProcessPHIs.push_back(InsertedPN);
- }
-
- // Post process PHI instructions that were inserted into another disjoint
- // loop and update their exits properly.
- for (auto *PostProcessPN : PostProcessPHIs)
- if (!PostProcessPN->use_empty())
- Worklist.push_back(PostProcessPN);
-
- // Keep track of PHI nodes that we want to remove because they did not have
+ if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
+ UseToRewrite->set(&UserBB->front());
+ continue;
+ }
+
+ // If we added a single PHI, it must dominate all uses and we can directly
+ // rename it.
+ if (AddedPHIs.size() == 1) {
+ UseToRewrite->set(AddedPHIs[0]);
+ continue;
+ }
+
+ // Otherwise, do full PHI insertion.
+ SSAUpdate.RewriteUse(*UseToRewrite);
+ }
+
+ SmallVector<DbgValueInst *, 4> DbgValues;
+ llvm::findDbgValues(DbgValues, I);
+
+ // Update pre-existing debug value uses that reside outside the loop.
+ auto &Ctx = I->getContext();
+ for (auto DVI : DbgValues) {
+ BasicBlock *UserBB = DVI->getParent();
+ if (InstBB == UserBB || L->contains(UserBB))
+ continue;
+ // We currently only handle debug values residing in blocks that were
+ // traversed while rewriting the uses. If we inserted just a single PHI,
+ // we will handle all relevant debug values.
+ Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
+ : SSAUpdate.FindValueForBlock(UserBB);
+ if (V)
+ DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V)));
+ }
+
+ // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
+ // to post-process them to keep LCSSA form.
+ for (PHINode *InsertedPN : InsertedPHIs) {
+ if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent()))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(InsertedPN);
+ }
+
+ // Post process PHI instructions that were inserted into another disjoint
+ // loop and update their exits properly.
+ for (auto *PostProcessPN : PostProcessPHIs)
+ if (!PostProcessPN->use_empty())
+ Worklist.push_back(PostProcessPN);
+
+ // Keep track of PHI nodes that we want to remove because they did not have
// any uses rewritten.
- for (PHINode *PN : AddedPHIs)
- if (PN->use_empty())
+ for (PHINode *PN : AddedPHIs)
+ if (PN->use_empty())
LocalPHIsToRemove.insert(PN);
- Changed = true;
- }
+ Changed = true;
+ }
// Remove PHI nodes that did not have any uses rewritten or add them to
// PHIsToRemove, so the caller can remove them after some additional cleanup.
@@ -288,229 +288,229 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
if (PN->use_empty())
PN->eraseFromParent();
}
- return Changed;
-}
-
-// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
-static void computeBlocksDominatingExits(
- Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
- SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
- // We start from the exit blocks, as every block trivially dominates itself
- // (not strictly).
+ return Changed;
+}
+
+// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
+static void computeBlocksDominatingExits(
+ Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+ SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
+ // We start from the exit blocks, as every block trivially dominates itself
+ // (not strictly).
SmallVector<BasicBlock *, 8> BBWorklist(ExitBlocks);
-
- while (!BBWorklist.empty()) {
- BasicBlock *BB = BBWorklist.pop_back_val();
-
- // Check if this is a loop header. If this is the case, we're done.
- if (L.getHeader() == BB)
- continue;
-
- // Otherwise, add its immediate predecessor in the dominator tree to the
- // worklist, unless we visited it already.
- BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
-
- // Exit blocks can have an immediate dominator not belonging to the
- // loop. For an exit block to be immediately dominated by another block
- // outside the loop, it implies that not all paths from that dominator to
- // the exit block go through the loop.
- // Example:
- //
- // |---- A
- // | |
- // | B<--
- // | | |
- // |---> C --
- // |
- // D
- //
- // C is the exit block of the loop and it's immediately dominated by A,
- // which doesn't belong to the loop.
- if (!L.contains(IDomBB))
- continue;
-
- if (BlocksDominatingExits.insert(IDomBB))
- BBWorklist.push_back(IDomBB);
- }
-}
-
-bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
- ScalarEvolution *SE) {
- bool Changed = false;
-
-#ifdef EXPENSIVE_CHECKS
- // Verify all sub-loops are in LCSSA form already.
- for (Loop *SubLoop: L)
- assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!");
-#endif
-
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- if (ExitBlocks.empty())
- return false;
-
- SmallSetVector<BasicBlock *, 8> BlocksDominatingExits;
-
- // We want to avoid use-scanning by leveraging dominance information.
- // If a block doesn't dominate any of the loop exits, then none of the values
- // defined in the loop can be used outside.
- // We compute the set of blocks fulfilling the conditions in advance by
- // walking the dominator tree upwards until we hit a loop header.
- computeBlocksDominatingExits(L, DT, ExitBlocks, BlocksDominatingExits);
-
- SmallVector<Instruction *, 8> Worklist;
-
- // Look at all the instructions in the loop, checking to see if they have uses
- // outside the loop. If so, put them into the worklist to rewrite those uses.
- for (BasicBlock *BB : BlocksDominatingExits) {
- // Skip blocks that are part of any sub-loops; they must be in LCSSA
- // already.
- if (LI->getLoopFor(BB) != &L)
- continue;
- for (Instruction &I : *BB) {
- // Reject two common cases fast: instructions with no uses (like stores)
- // and instructions with one use that is in the same block as this.
- if (I.use_empty() ||
- (I.hasOneUse() && I.user_back()->getParent() == BB &&
- !isa<PHINode>(I.user_back())))
- continue;
-
- // Tokens cannot be used in PHI nodes, so we skip over them.
- // We can run into tokens which are live out of a loop with catchswitch
- // instructions in Windows EH if the catchswitch has one catchpad which
- // is inside the loop and another which is not.
- if (I.getType()->isTokenTy())
- continue;
-
- Worklist.push_back(&I);
- }
- }
-
+
+ while (!BBWorklist.empty()) {
+ BasicBlock *BB = BBWorklist.pop_back_val();
+
+ // Check if this is a loop header. If this is the case, we're done.
+ if (L.getHeader() == BB)
+ continue;
+
+ // Otherwise, add its immediate predecessor in the dominator tree to the
+ // worklist, unless we visited it already.
+ BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
+
+ // Exit blocks can have an immediate dominator not belonging to the
+ // loop. For an exit block to be immediately dominated by another block
+ // outside the loop, it implies that not all paths from that dominator to
+ // the exit block go through the loop.
+ // Example:
+ //
+ // |---- A
+ // | |
+ // | B<--
+ // | | |
+ // |---> C --
+ // |
+ // D
+ //
+ // C is the exit block of the loop and it's immediately dominated by A,
+ // which doesn't belong to the loop.
+ if (!L.contains(IDomBB))
+ continue;
+
+ if (BlocksDominatingExits.insert(IDomBB))
+ BBWorklist.push_back(IDomBB);
+ }
+}
+
+bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+
+#ifdef EXPENSIVE_CHECKS
+ // Verify all sub-loops are in LCSSA form already.
+ for (Loop *SubLoop: L)
+ assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!");
+#endif
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ if (ExitBlocks.empty())
+ return false;
+
+ SmallSetVector<BasicBlock *, 8> BlocksDominatingExits;
+
+ // We want to avoid use-scanning by leveraging dominance information.
+ // If a block doesn't dominate any of the loop exits, then none of the values
+ // defined in the loop can be used outside.
+ // We compute the set of blocks fulfilling the conditions in advance by
+ // walking the dominator tree upwards until we hit a loop header.
+ computeBlocksDominatingExits(L, DT, ExitBlocks, BlocksDominatingExits);
+
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Look at all the instructions in the loop, checking to see if they have uses
+ // outside the loop. If so, put them into the worklist to rewrite those uses.
+ for (BasicBlock *BB : BlocksDominatingExits) {
+ // Skip blocks that are part of any sub-loops; they must be in LCSSA
+ // already.
+ if (LI->getLoopFor(BB) != &L)
+ continue;
+ for (Instruction &I : *BB) {
+ // Reject two common cases fast: instructions with no uses (like stores)
+ // and instructions with one use that is in the same block as this.
+ if (I.use_empty() ||
+ (I.hasOneUse() && I.user_back()->getParent() == BB &&
+ !isa<PHINode>(I.user_back())))
+ continue;
+
+ // Tokens cannot be used in PHI nodes, so we skip over them.
+ // We can run into tokens which are live out of a loop with catchswitch
+ // instructions in Windows EH if the catchswitch has one catchpad which
+ // is inside the loop and another which is not.
+ if (I.getType()->isTokenTy())
+ continue;
+
+ Worklist.push_back(&I);
+ }
+ }
+
IRBuilder<> Builder(L.getHeader()->getContext());
Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE, Builder);
- // If we modified the code, remove any caches about the loop from SCEV to
- // avoid dangling entries.
- // FIXME: This is a big hammer, can we clear the cache more selectively?
- if (SE && Changed)
- SE->forgetLoop(&L);
-
- assert(L.isLCSSAForm(DT));
-
- return Changed;
-}
-
-/// Process a loop nest depth first.
-bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
- const LoopInfo *LI, ScalarEvolution *SE) {
- bool Changed = false;
-
- // Recurse depth-first through inner loops.
- for (Loop *SubLoop : L.getSubLoops())
- Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
-
- Changed |= formLCSSA(L, DT, LI, SE);
- return Changed;
-}
-
-/// Process all loops in the function, inner-most out.
-static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
- ScalarEvolution *SE) {
- bool Changed = false;
- for (auto &L : *LI)
- Changed |= formLCSSARecursively(*L, DT, LI, SE);
- return Changed;
-}
-
-namespace {
-struct LCSSAWrapperPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- LCSSAWrapperPass() : FunctionPass(ID) {
- initializeLCSSAWrapperPassPass(*PassRegistry::getPassRegistry());
- }
-
- // Cached analysis information for the current function.
- DominatorTree *DT;
- LoopInfo *LI;
- ScalarEvolution *SE;
-
- bool runOnFunction(Function &F) override;
- void verifyAnalysis() const override {
- // This check is very expensive. On loop-intensive compiles it may cause
- // up to a 10x slowdown. Currently it's disabled by default. LPPassManager
- // always does a limited form of the LCSSA verification. Similar reasoning
- // was used for the LoopInfo verifier.
- if (VerifyLoopLCSSA) {
- assert(all_of(*LI,
- [&](Loop *L) {
- return L->isRecursivelyLCSSAForm(*DT, *LI);
- }) &&
- "LCSSA form is broken!");
- }
- };
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG. It maintains both of these,
- /// as well as the CFG. It also requires dominator information.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
-
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreservedID(LoopSimplifyID);
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreserved<BranchProbabilityInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
-
- // This is needed to perform LCSSA verification inside LPPassManager
- AU.addRequired<LCSSAVerificationPass>();
- AU.addPreserved<LCSSAVerificationPass>();
- }
-};
-}
-
-char LCSSAWrapperPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAVerificationPass)
-INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
- false, false)
-
-Pass *llvm::createLCSSAPass() { return new LCSSAWrapperPass(); }
-char &llvm::LCSSAID = LCSSAWrapperPass::ID;
-
-/// Transform \p F into loop-closed SSA form.
-bool LCSSAWrapperPass::runOnFunction(Function &F) {
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- SE = SEWP ? &SEWP->getSE() : nullptr;
-
- return formLCSSAOnAllLoops(LI, *DT, SE);
-}
-
-PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
- if (!formLCSSAOnAllLoops(&LI, DT, SE))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- PA.preserve<SCEVAA>();
- PA.preserve<ScalarEvolutionAnalysis>();
- // BPI maps terminators to probabilities, since we don't modify the CFG, no
- // updates are needed to preserve it.
- PA.preserve<BranchProbabilityAnalysis>();
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
+ // If we modified the code, remove any caches about the loop from SCEV to
+ // avoid dangling entries.
+ // FIXME: This is a big hammer, can we clear the cache more selectively?
+ if (SE && Changed)
+ SE->forgetLoop(&L);
+
+ assert(L.isLCSSAForm(DT));
+
+ return Changed;
+}
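// Illustrative sketch (not taken from the diffed sources): the net effect of
// formLCSSA on the IR is that a value defined inside the loop and used after
// it gets routed through a single-operand PHI in the exit block, e.g.
//
//   loop:                                   ; before
//     %v = add i32 %x, 1
//     br i1 %cond, label %loop, label %exit
//   exit:
//     %use = mul i32 %v, 2
//
//   exit:                                   ; after
//     %v.lcssa = phi i32 [ %v, %loop ]
//     %use = mul i32 %v.lcssa, 2
//
// Only uses outside the loop are rewritten; the in-loop definition of %v is
// left untouched.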
+
+/// Process a loop nest depth first.
+bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
+ const LoopInfo *LI, ScalarEvolution *SE) {
+ bool Changed = false;
+
+ // Recurse depth-first through inner loops.
+ for (Loop *SubLoop : L.getSubLoops())
+ Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
+
+ Changed |= formLCSSA(L, DT, LI, SE);
+ return Changed;
+}
+
+/// Process all loops in the function, inner-most out.
+static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+ for (auto &L : *LI)
+ Changed |= formLCSSARecursively(*L, DT, LI, SE);
+ return Changed;
+}
+
+namespace {
+struct LCSSAWrapperPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LCSSAWrapperPass() : FunctionPass(ID) {
+ initializeLCSSAWrapperPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Cached analysis information for the current function.
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+
+ bool runOnFunction(Function &F) override;
+ void verifyAnalysis() const override {
+ // This check is very expensive. On loop-intensive compiles it may cause
+ // up to a 10x slowdown. Currently it's disabled by default. LPPassManager
+ // always does a limited form of the LCSSA verification. Similar reasoning
+ // was used for the LoopInfo verifier.
+ if (VerifyLoopLCSSA) {
+ assert(all_of(*LI,
+ [&](Loop *L) {
+ return L->isRecursivelyLCSSAForm(*DT, *LI);
+ }) &&
+ "LCSSA form is broken!");
+ }
+ };
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG. It maintains both of these,
+ /// as well as the CFG. It also requires dominator information.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreserved<BranchProbabilityInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+
+ // This is needed to perform LCSSA verification inside LPPassManager
+ AU.addRequired<LCSSAVerificationPass>();
+ AU.addPreserved<LCSSAVerificationPass>();
+ }
+};
+}
+
+char LCSSAWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAVerificationPass)
+INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
+
+Pass *llvm::createLCSSAPass() { return new LCSSAWrapperPass(); }
+char &llvm::LCSSAID = LCSSAWrapperPass::ID;
+
+/// Transform \p F into loop-closed SSA form.
+bool LCSSAWrapperPass::runOnFunction(Function &F) {
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ return formLCSSAOnAllLoops(LI, *DT, SE);
+}
+
+PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ if (!formLCSSAOnAllLoops(&LI, DT, SE))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ // BPI maps terminators to probabilities, since we don't modify the CFG, no
+ // updates are needed to preserve it.
+ PA.preserve<BranchProbabilityAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
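// Illustrative sketch (not taken from the diffed sources): under the new pass
// manager, LCSSAPass is scheduled like any other function pass; the pipeline
// wiring below is an assumption made for the example, not code from this file.
//
//   FunctionPassManager FPM;
//   FPM.addPass(LCSSAPass());
//   FPM.run(F, FAM);   // F: Function&, FAM: a populated FunctionAnalysisManager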
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 05446019c6..4c52fac6f7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -1,562 +1,562 @@
-//===-- LibCallsShrinkWrap.cpp ----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass shrink-wraps a call to a function if the result is not used.
-// The call can set errno but is otherwise side effect free. For example:
-// sqrt(val);
-// is transformed to
-// if (val < 0)
-// sqrt(val);
-// Even if the result of a library call is not being used, the compiler cannot
-// safely delete the call because the function can set errno on error
-// conditions.
-// Note that in many functions, the error condition depends solely on the
-// incoming parameter. In this optimization, we can generate the condition that
-// would lead to errno being set and use it to shrink-wrap the call. Since the
-// chances of hitting the error condition are low, the runtime call is
-// effectively eliminated.
-//
-// These partially dead calls are usually results of C++ abstraction penalty
-// exposed by inlining.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "libcalls-shrinkwrap"
-
-STATISTIC(NumWrappedOneCond, "Number of One-Condition Wrappers Inserted");
-STATISTIC(NumWrappedTwoCond, "Number of Two-Condition Wrappers Inserted");
-
-namespace {
-class LibCallsShrinkWrapLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
- explicit LibCallsShrinkWrapLegacyPass() : FunctionPass(ID) {
- initializeLibCallsShrinkWrapLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-};
-}
-
-char LibCallsShrinkWrapLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
- "Conditionally eliminate dead library calls", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
- "Conditionally eliminate dead library calls", false, false)
-
-namespace {
-class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
-public:
- LibCallsShrinkWrap(const TargetLibraryInfo &TLI, DominatorTree *DT)
- : TLI(TLI), DT(DT){};
- void visitCallInst(CallInst &CI) { checkCandidate(CI); }
- bool perform() {
- bool Changed = false;
- for (auto &CI : WorkList) {
- LLVM_DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
- << "\n");
- if (perform(CI)) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "Transformed\n");
- }
- }
- return Changed;
- }
-
-private:
- bool perform(CallInst *CI);
- void checkCandidate(CallInst &CI);
- void shrinkWrapCI(CallInst *CI, Value *Cond);
- bool performCallDomainErrorOnly(CallInst *CI, const LibFunc &Func);
- bool performCallErrors(CallInst *CI, const LibFunc &Func);
- bool performCallRangeErrorOnly(CallInst *CI, const LibFunc &Func);
- Value *generateOneRangeCond(CallInst *CI, const LibFunc &Func);
- Value *generateTwoRangeCond(CallInst *CI, const LibFunc &Func);
- Value *generateCondForPow(CallInst *CI, const LibFunc &Func);
-
- // Create an OR of two conditions.
- Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
- CmpInst::Predicate Cmp2, float Val2) {
- IRBuilder<> BBBuilder(CI);
- Value *Arg = CI->getArgOperand(0);
- auto Cond2 = createCond(BBBuilder, Arg, Cmp2, Val2);
- auto Cond1 = createCond(BBBuilder, Arg, Cmp, Val);
- return BBBuilder.CreateOr(Cond1, Cond2);
- }
-
- // Create a single condition using IRBuilder.
- Value *createCond(IRBuilder<> &BBBuilder, Value *Arg, CmpInst::Predicate Cmp,
- float Val) {
- Constant *V = ConstantFP::get(BBBuilder.getContext(), APFloat(Val));
- if (!Arg->getType()->isFloatTy())
- V = ConstantExpr::getFPExtend(V, Arg->getType());
- return BBBuilder.CreateFCmp(Cmp, Arg, V);
- }
-
- // Create a single condition.
- Value *createCond(CallInst *CI, CmpInst::Predicate Cmp, float Val) {
- IRBuilder<> BBBuilder(CI);
- Value *Arg = CI->getArgOperand(0);
- return createCond(BBBuilder, Arg, Cmp, Val);
- }
-
- const TargetLibraryInfo &TLI;
- DominatorTree *DT;
- SmallVector<CallInst *, 16> WorkList;
-};
-} // end anonymous namespace
-
-// Perform the transformation to calls with errno set by domain error.
-bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
- const LibFunc &Func) {
- Value *Cond = nullptr;
-
- switch (Func) {
- case LibFunc_acos: // DomainError: (x < -1 || x > 1)
- case LibFunc_acosf: // Same as acos
- case LibFunc_acosl: // Same as acos
- case LibFunc_asin: // DomainError: (x < -1 || x > 1)
- case LibFunc_asinf: // Same as asin
- case LibFunc_asinl: // Same as asin
- {
- ++NumWrappedTwoCond;
- Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
- break;
- }
- case LibFunc_cos: // DomainError: (x == +inf || x == -inf)
- case LibFunc_cosf: // Same as cos
- case LibFunc_cosl: // Same as cos
- case LibFunc_sin: // DomainError: (x == +inf || x == -inf)
- case LibFunc_sinf: // Same as sin
- case LibFunc_sinl: // Same as sin
- {
- ++NumWrappedTwoCond;
- Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
- -INFINITY);
- break;
- }
- case LibFunc_acosh: // DomainError: (x < 1)
- case LibFunc_acoshf: // Same as acosh
- case LibFunc_acoshl: // Same as acosh
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
- break;
- }
- case LibFunc_sqrt: // DomainError: (x < 0)
- case LibFunc_sqrtf: // Same as sqrt
- case LibFunc_sqrtl: // Same as sqrt
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
- break;
- }
- default:
- return false;
- }
- shrinkWrapCI(CI, Cond);
- return true;
-}
-
-// Perform the transformation to calls with errno set by range error.
-bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
- const LibFunc &Func) {
- Value *Cond = nullptr;
-
- switch (Func) {
- case LibFunc_cosh:
- case LibFunc_coshf:
- case LibFunc_coshl:
- case LibFunc_exp:
- case LibFunc_expf:
- case LibFunc_expl:
- case LibFunc_exp10:
- case LibFunc_exp10f:
- case LibFunc_exp10l:
- case LibFunc_exp2:
- case LibFunc_exp2f:
- case LibFunc_exp2l:
- case LibFunc_sinh:
- case LibFunc_sinhf:
- case LibFunc_sinhl: {
- Cond = generateTwoRangeCond(CI, Func);
- break;
- }
- case LibFunc_expm1: // RangeError: (709, inf)
- case LibFunc_expm1f: // RangeError: (88, inf)
- case LibFunc_expm1l: // RangeError: (11356, inf)
- {
- Cond = generateOneRangeCond(CI, Func);
- break;
- }
- default:
- return false;
- }
- shrinkWrapCI(CI, Cond);
- return true;
-}
-
-// Perform the transformation to calls with errno set by combination of errors.
-bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
- const LibFunc &Func) {
- Value *Cond = nullptr;
-
- switch (Func) {
- case LibFunc_atanh: // DomainError: (x < -1 || x > 1)
- // PoleError: (x == -1 || x == 1)
- // Overall Cond: (x <= -1 || x >= 1)
- case LibFunc_atanhf: // Same as atanh
- case LibFunc_atanhl: // Same as atanh
- {
- ++NumWrappedTwoCond;
- Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
- break;
- }
- case LibFunc_log: // DomainError: (x < 0)
- // PoleError: (x == 0)
- // Overall Cond: (x <= 0)
- case LibFunc_logf: // Same as log
- case LibFunc_logl: // Same as log
- case LibFunc_log10: // Same as log
- case LibFunc_log10f: // Same as log
- case LibFunc_log10l: // Same as log
- case LibFunc_log2: // Same as log
- case LibFunc_log2f: // Same as log
- case LibFunc_log2l: // Same as log
- case LibFunc_logb: // Same as log
- case LibFunc_logbf: // Same as log
- case LibFunc_logbl: // Same as log
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
- break;
- }
- case LibFunc_log1p: // DomainError: (x < -1)
- // PoleError: (x == -1)
- // Overall Cond: (x <= -1)
- case LibFunc_log1pf: // Same as log1p
- case LibFunc_log1pl: // Same as log1p
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
- break;
- }
- case LibFunc_pow: // DomainError: x < 0 and y is noninteger
- // PoleError: x == 0 and y < 0
- // RangeError: overflow or underflow
- case LibFunc_powf:
- case LibFunc_powl: {
- Cond = generateCondForPow(CI, Func);
- if (Cond == nullptr)
- return false;
- break;
- }
- default:
- return false;
- }
- assert(Cond && "performCallErrors should not see an empty condition");
- shrinkWrapCI(CI, Cond);
- return true;
-}
-
-// Checks if CI is a candidate for shrink-wrapping and puts it into the work
-// list if so.
-void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
- if (CI.isNoBuiltin())
- return;
- // A possible improvement is to handle calls whose return value is used. If
- // there is an API for a fast libcall implementation that does not set
- // errno, we can use the same framework to direct/wrap the call to the fast
- // API in the error-free path, and leave the original call in the slow path.
- if (!CI.use_empty())
- return;
-
- LibFunc Func;
- Function *Callee = CI.getCalledFunction();
- if (!Callee)
- return;
- if (!TLI.getLibFunc(*Callee, Func) || !TLI.has(Func))
- return;
-
- if (CI.getNumArgOperands() == 0)
- return;
- // TODO: Handle long double in other formats.
- Type *ArgType = CI.getArgOperand(0)->getType();
- if (!(ArgType->isFloatTy() || ArgType->isDoubleTy() ||
- ArgType->isX86_FP80Ty()))
- return;
-
- WorkList.push_back(&CI);
-}
-
-// Generate the upper bound condition for RangeError.
-Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
- const LibFunc &Func) {
- float UpperBound;
- switch (Func) {
- case LibFunc_expm1: // RangeError: (709, inf)
- UpperBound = 709.0f;
- break;
- case LibFunc_expm1f: // RangeError: (88, inf)
- UpperBound = 88.0f;
- break;
- case LibFunc_expm1l: // RangeError: (11356, inf)
- UpperBound = 11356.0f;
- break;
- default:
- llvm_unreachable("Unhandled library call!");
- }
-
- ++NumWrappedOneCond;
- return createCond(CI, CmpInst::FCMP_OGT, UpperBound);
-}
-
-// Generate the lower and upper bound condition for RangeError.
-Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
- const LibFunc &Func) {
- float UpperBound, LowerBound;
- switch (Func) {
- case LibFunc_cosh: // RangeError: (x < -710 || x > 710)
- case LibFunc_sinh: // Same as cosh
- LowerBound = -710.0f;
- UpperBound = 710.0f;
- break;
- case LibFunc_coshf: // RangeError: (x < -89 || x > 89)
- case LibFunc_sinhf: // Same as coshf
- LowerBound = -89.0f;
- UpperBound = 89.0f;
- break;
- case LibFunc_coshl: // RangeError: (x < -11357 || x > 11357)
- case LibFunc_sinhl: // Same as coshl
- LowerBound = -11357.0f;
- UpperBound = 11357.0f;
- break;
- case LibFunc_exp: // RangeError: (x < -745 || x > 709)
- LowerBound = -745.0f;
- UpperBound = 709.0f;
- break;
- case LibFunc_expf: // RangeError: (x < -103 || x > 88)
- LowerBound = -103.0f;
- UpperBound = 88.0f;
- break;
- case LibFunc_expl: // RangeError: (x < -11399 || x > 11356)
- LowerBound = -11399.0f;
- UpperBound = 11356.0f;
- break;
- case LibFunc_exp10: // RangeError: (x < -323 || x > 308)
- LowerBound = -323.0f;
- UpperBound = 308.0f;
- break;
- case LibFunc_exp10f: // RangeError: (x < -45 || x > 38)
- LowerBound = -45.0f;
- UpperBound = 38.0f;
- break;
- case LibFunc_exp10l: // RangeError: (x < -4950 || x > 4932)
- LowerBound = -4950.0f;
- UpperBound = 4932.0f;
- break;
- case LibFunc_exp2: // RangeError: (x < -1074 || x > 1023)
- LowerBound = -1074.0f;
- UpperBound = 1023.0f;
- break;
- case LibFunc_exp2f: // RangeError: (x < -149 || x > 127)
- LowerBound = -149.0f;
- UpperBound = 127.0f;
- break;
- case LibFunc_exp2l: // RangeError: (x < -16445 || x > 11383)
- LowerBound = -16445.0f;
- UpperBound = 11383.0f;
- break;
- default:
- llvm_unreachable("Unhandled library call!");
- }
-
- ++NumWrappedTwoCond;
- return createOrCond(CI, CmpInst::FCMP_OGT, UpperBound, CmpInst::FCMP_OLT,
- LowerBound);
-}
-
-// For pow(x,y), we only handle the following cases:
-// (1) x is a constant && (x >= 1) && (x < MaxUInt8)
-// Cond is: (y > 127)
-// (2) x is a value coming from an integer type.
-// (2.1) if x's bit_size == 8
-// Cond: (x <= 0 || y > 128)
-// (2.2) if x's bit_size is 16
-// Cond: (x <= 0 || y > 64)
-// (2.3) if x's bit_size is 32
-// Cond: (x <= 0 || y > 32)
-// Support for powl(x,y) and powf(x,y) are TBD.
-//
-// Note that the condition can be more conservative than the actual condition
-// (i.e. we might invoke calls that will not set errno).
-//
-Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
- const LibFunc &Func) {
- // FIXME: LibFunc_powf and powl TBD.
- if (Func != LibFunc_pow) {
- LLVM_DEBUG(dbgs() << "Not handled powf() and powl()\n");
- return nullptr;
- }
-
- Value *Base = CI->getArgOperand(0);
- Value *Exp = CI->getArgOperand(1);
- IRBuilder<> BBBuilder(CI);
-
- // Constant Base case.
- if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
- double D = CF->getValueAPF().convertToDouble();
- if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
- LLVM_DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
- return nullptr;
- }
-
- ++NumWrappedOneCond;
- Constant *V = ConstantFP::get(CI->getContext(), APFloat(127.0f));
- if (!Exp->getType()->isFloatTy())
- V = ConstantExpr::getFPExtend(V, Exp->getType());
- return BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
- }
-
- // If the Base value is coming from an integer type.
- Instruction *I = dyn_cast<Instruction>(Base);
- if (!I) {
- LLVM_DEBUG(dbgs() << "Not handled pow(): FP type base\n");
- return nullptr;
- }
- unsigned Opcode = I->getOpcode();
- if (Opcode == Instruction::UIToFP || Opcode == Instruction::SIToFP) {
- unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
- float UpperV = 0.0f;
- if (BW == 8)
- UpperV = 128.0f;
- else if (BW == 16)
- UpperV = 64.0f;
- else if (BW == 32)
- UpperV = 32.0f;
- else {
- LLVM_DEBUG(dbgs() << "Not handled pow(): type too wide\n");
- return nullptr;
- }
-
- ++NumWrappedTwoCond;
- Constant *V = ConstantFP::get(CI->getContext(), APFloat(UpperV));
- Constant *V0 = ConstantFP::get(CI->getContext(), APFloat(0.0f));
- if (!Exp->getType()->isFloatTy())
- V = ConstantExpr::getFPExtend(V, Exp->getType());
- if (!Base->getType()->isFloatTy())
- V0 = ConstantExpr::getFPExtend(V0, Exp->getType());
-
- Value *Cond = BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
- Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
- return BBBuilder.CreateOr(Cond0, Cond);
- }
- LLVM_DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
- return nullptr;
-}
-
-// Wrap conditions that can potentially generate errno to the library call.
-void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
- assert(Cond != nullptr && "ShrinkWrapCI is not expecting an empty call inst");
- MDNode *BranchWeights =
- MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
-
- Instruction *NewInst =
- SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
- BasicBlock *CallBB = NewInst->getParent();
- CallBB->setName("cdce.call");
- BasicBlock *SuccBB = CallBB->getSingleSuccessor();
- assert(SuccBB && "The split block should have a single successor");
- SuccBB->setName("cdce.end");
- CI->removeFromParent();
- CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
- LLVM_DEBUG(dbgs() << "== Basic Block After ==");
- LLVM_DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
- << *CallBB->getSingleSuccessor() << "\n");
-}
-
-// Perform the transformation to a single candidate.
-bool LibCallsShrinkWrap::perform(CallInst *CI) {
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- assert(Callee && "perform() should apply to a non-empty callee");
- TLI.getLibFunc(*Callee, Func);
- assert(Func && "perform() is not expecting an empty function");
-
- if (performCallDomainErrorOnly(CI, Func) || performCallRangeErrorOnly(CI, Func))
- return true;
- return performCallErrors(CI, Func);
-}
-
-void LibCallsShrinkWrapLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
-}
-
-static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
- DominatorTree *DT) {
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
- return false;
- LibCallsShrinkWrap CCDCE(TLI, DT);
- CCDCE.visit(F);
- bool Changed = CCDCE.perform();
-
- // Verify the dominator tree after we've updated it locally.
- assert(!DT || DT->verify(DominatorTree::VerificationLevel::Fast));
- return Changed;
-}
-
-bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- return runImpl(F, TLI, DT);
-}
-
-namespace llvm {
-char &LibCallsShrinkWrapPassID = LibCallsShrinkWrapLegacyPass::ID;
-
-// Public interface to LibCallsShrinkWrap pass.
-FunctionPass *createLibCallsShrinkWrapPass() {
- return new LibCallsShrinkWrapLegacyPass();
-}
-
-PreservedAnalyses LibCallsShrinkWrapPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, TLI, DT))
- return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
-}
+//===-- LibCallsShrinkWrap.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass shrink-wraps a call to a function if the result is not used.
+// The call can set errno but is otherwise side effect free. For example:
+// sqrt(val);
+// is transformed to
+// if (val < 0)
+// sqrt(val);
+// Even if the result of a library call is not being used, the compiler cannot
+// safely delete the call because the function can set errno on error
+// conditions.
+// Note that in many functions, the error condition depends solely on the
+// incoming parameter. In this optimization, we can generate the condition that
+// would lead to errno being set and use it to shrink-wrap the call. Since the
+// chances of hitting the error condition are low, the runtime call is
+// effectively eliminated.
+//
+// These partially dead calls are usually results of C++ abstraction penalty
+// exposed by inlining.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "libcalls-shrinkwrap"
+
+STATISTIC(NumWrappedOneCond, "Number of One-Condition Wrappers Inserted");
+STATISTIC(NumWrappedTwoCond, "Number of Two-Condition Wrappers Inserted");
+
+namespace {
+class LibCallsShrinkWrapLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LibCallsShrinkWrapLegacyPass() : FunctionPass(ID) {
+ initializeLibCallsShrinkWrapLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char LibCallsShrinkWrapLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false, false)
+
+namespace {
+class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
+public:
+ LibCallsShrinkWrap(const TargetLibraryInfo &TLI, DominatorTree *DT)
+ : TLI(TLI), DT(DT){};
+ void visitCallInst(CallInst &CI) { checkCandidate(CI); }
+ bool perform() {
+ bool Changed = false;
+ for (auto &CI : WorkList) {
+ LLVM_DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
+ << "\n");
+ if (perform(CI)) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Transformed\n");
+ }
+ }
+ return Changed;
+ }
+
+private:
+ bool perform(CallInst *CI);
+ void checkCandidate(CallInst &CI);
+ void shrinkWrapCI(CallInst *CI, Value *Cond);
+ bool performCallDomainErrorOnly(CallInst *CI, const LibFunc &Func);
+ bool performCallErrors(CallInst *CI, const LibFunc &Func);
+ bool performCallRangeErrorOnly(CallInst *CI, const LibFunc &Func);
+ Value *generateOneRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateTwoRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateCondForPow(CallInst *CI, const LibFunc &Func);
+
+ // Create an OR of two conditions.
+ Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
+ CmpInst::Predicate Cmp2, float Val2) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ auto Cond2 = createCond(BBBuilder, Arg, Cmp2, Val2);
+ auto Cond1 = createCond(BBBuilder, Arg, Cmp, Val);
+ return BBBuilder.CreateOr(Cond1, Cond2);
+ }
+
+ // Create a single condition using IRBuilder.
+ Value *createCond(IRBuilder<> &BBBuilder, Value *Arg, CmpInst::Predicate Cmp,
+ float Val) {
+ Constant *V = ConstantFP::get(BBBuilder.getContext(), APFloat(Val));
+ if (!Arg->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Arg->getType());
+ return BBBuilder.CreateFCmp(Cmp, Arg, V);
+ }
+
+ // Create a single condition.
+ Value *createCond(CallInst *CI, CmpInst::Predicate Cmp, float Val) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ return createCond(BBBuilder, Arg, Cmp, Val);
+ }
+
+ const TargetLibraryInfo &TLI;
+ DominatorTree *DT;
+ SmallVector<CallInst *, 16> WorkList;
+};
+} // end anonymous namespace
+
+// Perform the transformation to calls with errno set by domain error.
+bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_acos: // DomainError: (x < -1 || x > 1)
+ case LibFunc_acosf: // Same as acos
+ case LibFunc_acosl: // Same as acos
+ case LibFunc_asin: // DomainError: (x < -1 || x > 1)
+ case LibFunc_asinf: // Same as asin
+ case LibFunc_asinl: // Same as asin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
+ break;
+ }
+ case LibFunc_cos: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_cosf: // Same as cos
+ case LibFunc_cosl: // Same as cos
+ case LibFunc_sin: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_sinf: // Same as sin
+ case LibFunc_sinl: // Same as sin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
+ -INFINITY);
+ break;
+ }
+ case LibFunc_acosh: // DomainError: (x < 1)
+ case LibFunc_acoshf: // Same as acosh
+ case LibFunc_acoshl: // Same as acosh
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
+ break;
+ }
+ case LibFunc_sqrt: // DomainError: (x < 0)
+ case LibFunc_sqrtf: // Same as sqrt
+ case LibFunc_sqrtl: // Same as sqrt
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by range error.
+bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_cosh:
+ case LibFunc_coshf:
+ case LibFunc_coshl:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
+ case LibFunc_exp10:
+ case LibFunc_exp10f:
+ case LibFunc_exp10l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_sinh:
+ case LibFunc_sinhf:
+ case LibFunc_sinhl: {
+ Cond = generateTwoRangeCond(CI, Func);
+ break;
+ }
+ case LibFunc_expm1: // RangeError: (709, inf)
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ case LibFunc_expm1l: // RangeError: (11356, inf)
+ {
+ Cond = generateOneRangeCond(CI, Func);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by combination of errors.
+bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_atanh: // DomainError: (x < -1 || x > 1)
+ // PoleError: (x == -1 || x == 1)
+ // Overall Cond: (x <= -1 || x >= 1)
+ case LibFunc_atanhf: // Same as atanh
+ case LibFunc_atanhl: // Same as atanh
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
+ break;
+ }
+ case LibFunc_log: // DomainError: (x < 0)
+ // PoleError: (x == 0)
+ // Overall Cond: (x <= 0)
+ case LibFunc_logf: // Same as log
+ case LibFunc_logl: // Same as log
+ case LibFunc_log10: // Same as log
+ case LibFunc_log10f: // Same as log
+ case LibFunc_log10l: // Same as log
+ case LibFunc_log2: // Same as log
+ case LibFunc_log2f: // Same as log
+ case LibFunc_log2l: // Same as log
+ case LibFunc_logb: // Same as log
+ case LibFunc_logbf: // Same as log
+ case LibFunc_logbl: // Same as log
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
+ break;
+ }
+ case LibFunc_log1p: // DomainError: (x < -1)
+ // PoleError: (x == -1)
+ // Overall Cond: (x <= -1)
+ case LibFunc_log1pf: // Same as log1p
+ case LibFunc_log1pl: // Same as log1p
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
+ break;
+ }
+ case LibFunc_pow: // DomainError: x < 0 and y is noninteger
+ // PoleError: x == 0 and y < 0
+ // RangeError: overflow or underflow
+ case LibFunc_powf:
+ case LibFunc_powl: {
+ Cond = generateCondForPow(CI, Func);
+ if (Cond == nullptr)
+ return false;
+ break;
+ }
+ default:
+ return false;
+ }
+ assert(Cond && "performCallErrors should not see an empty condition");
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Checks if CI is a candidate for shrink-wrapping and puts it into the work
+// list if so.
+void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
+ if (CI.isNoBuiltin())
+ return;
+ // A possible improvement is to handle calls whose return value is used. If
+ // there is an API for a fast libcall implementation that does not set
+ // errno, we can use the same framework to direct/wrap the call to the fast
+ // API in the error-free path, and leave the original call in the slow path.
+ if (!CI.use_empty())
+ return;
+
+ LibFunc Func;
+ Function *Callee = CI.getCalledFunction();
+ if (!Callee)
+ return;
+ if (!TLI.getLibFunc(*Callee, Func) || !TLI.has(Func))
+ return;
+
+ if (CI.getNumArgOperands() == 0)
+ return;
+ // TODO: Handle long double in other formats.
+ Type *ArgType = CI.getArgOperand(0)->getType();
+ if (!(ArgType->isFloatTy() || ArgType->isDoubleTy() ||
+ ArgType->isX86_FP80Ty()))
+ return;
+
+ WorkList.push_back(&CI);
+}
+
+// Generate the upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
+ const LibFunc &Func) {
+ float UpperBound;
+ switch (Func) {
+ case LibFunc_expm1: // RangeError: (709, inf)
+ UpperBound = 709.0f;
+ break;
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ UpperBound = 88.0f;
+ break;
+ case LibFunc_expm1l: // RangeError: (11356, inf)
+ UpperBound = 11356.0f;
+ break;
+ default:
+ llvm_unreachable("Unhandled library call!");
+ }
+
+ ++NumWrappedOneCond;
+ return createCond(CI, CmpInst::FCMP_OGT, UpperBound);
+}
+
+// Generate the lower and upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
+ const LibFunc &Func) {
+ float UpperBound, LowerBound;
+ switch (Func) {
+ case LibFunc_cosh: // RangeError: (x < -710 || x > 710)
+ case LibFunc_sinh: // Same as cosh
+ LowerBound = -710.0f;
+ UpperBound = 710.0f;
+ break;
+ case LibFunc_coshf: // RangeError: (x < -89 || x > 89)
+ case LibFunc_sinhf: // Same as coshf
+ LowerBound = -89.0f;
+ UpperBound = 89.0f;
+ break;
+ case LibFunc_coshl: // RangeError: (x < -11357 || x > 11357)
+ case LibFunc_sinhl: // Same as coshl
+ LowerBound = -11357.0f;
+ UpperBound = 11357.0f;
+ break;
+ case LibFunc_exp: // RangeError: (x < -745 || x > 709)
+ LowerBound = -745.0f;
+ UpperBound = 709.0f;
+ break;
+ case LibFunc_expf: // RangeError: (x < -103 || x > 88)
+ LowerBound = -103.0f;
+ UpperBound = 88.0f;
+ break;
+ case LibFunc_expl: // RangeError: (x < -11399 || x > 11356)
+ LowerBound = -11399.0f;
+ UpperBound = 11356.0f;
+ break;
+ case LibFunc_exp10: // RangeError: (x < -323 || x > 308)
+ LowerBound = -323.0f;
+ UpperBound = 308.0f;
+ break;
+ case LibFunc_exp10f: // RangeError: (x < -45 || x > 38)
+ LowerBound = -45.0f;
+ UpperBound = 38.0f;
+ break;
+ case LibFunc_exp10l: // RangeError: (x < -4950 || x > 4932)
+ LowerBound = -4950.0f;
+ UpperBound = 4932.0f;
+ break;
+ case LibFunc_exp2: // RangeError: (x < -1074 || x > 1023)
+ LowerBound = -1074.0f;
+ UpperBound = 1023.0f;
+ break;
+ case LibFunc_exp2f: // RangeError: (x < -149 || x > 127)
+ LowerBound = -149.0f;
+ UpperBound = 127.0f;
+ break;
+ case LibFunc_exp2l: // RangeError: (x < -16445 || x > 11383)
+ LowerBound = -16445.0f;
+ UpperBound = 11383.0f;
+ break;
+ default:
+ llvm_unreachable("Unhandled library call!");
+ }
+
+ ++NumWrappedTwoCond;
+ return createOrCond(CI, CmpInst::FCMP_OGT, UpperBound, CmpInst::FCMP_OLT,
+ LowerBound);
+}
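// Illustrative sketch (not taken from the diffed sources): for a discarded
// call to expf, the table above gives LowerBound = -103 and UpperBound = 88,
// so the guard built by createOrCond is roughly
//
//   if (x > 88.0f || x < -103.0f)
//     (void)expf(x);   // only the error-prone inputs still reach the libcall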
+
+// For pow(x,y), we only handle the following cases:
+// (1) x is a constant && (x >= 1) && (x < MaxUInt8)
+// Cond is: (y > 127)
+// (2) x is a value coming from an integer type.
+// (2.1) if x's bit_size == 8
+// Cond: (x <= 0 || y > 128)
+// (2.2) if x's bit_size is 16
+// Cond: (x <= 0 || y > 64)
+// (2.3) if x's bit_size is 32
+// Cond: (x <= 0 || y > 32)
+// Support for powl(x,y) and powf(x,y) are TBD.
+//
+// Note that the condition can be more conservative than the actual condition
+// (i.e. we might invoke calls that will not set errno).
+//
+Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
+ const LibFunc &Func) {
+ // FIXME: LibFunc_powf and powl TBD.
+ if (Func != LibFunc_pow) {
+ LLVM_DEBUG(dbgs() << "Not handled powf() and powl()\n");
+ return nullptr;
+ }
+
+ Value *Base = CI->getArgOperand(0);
+ Value *Exp = CI->getArgOperand(1);
+ IRBuilder<> BBBuilder(CI);
+
+ // Constant Base case.
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
+ double D = CF->getValueAPF().convertToDouble();
+ if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
+ LLVM_DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
+ return nullptr;
+ }
+
+ ++NumWrappedOneCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(127.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ return BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ }
+
+ // If the Base value is coming from an integer type.
+ Instruction *I = dyn_cast<Instruction>(Base);
+ if (!I) {
+ LLVM_DEBUG(dbgs() << "Not handled pow(): FP type base\n");
+ return nullptr;
+ }
+ unsigned Opcode = I->getOpcode();
+ if (Opcode == Instruction::UIToFP || Opcode == Instruction::SIToFP) {
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ float UpperV = 0.0f;
+ if (BW == 8)
+ UpperV = 128.0f;
+ else if (BW == 16)
+ UpperV = 64.0f;
+ else if (BW == 32)
+ UpperV = 32.0f;
+ else {
+ LLVM_DEBUG(dbgs() << "Not handled pow(): type too wide\n");
+ return nullptr;
+ }
+
+ ++NumWrappedTwoCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(UpperV));
+ Constant *V0 = ConstantFP::get(CI->getContext(), APFloat(0.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ if (!Base->getType()->isFloatTy())
+ V0 = ConstantExpr::getFPExtend(V0, Exp->getType());
+
+ Value *Cond = BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
+ return BBBuilder.CreateOr(Cond0, Cond);
+ }
+ LLVM_DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
+ return nullptr;
+}
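// Illustrative sketch (not taken from the diffed sources): if the base reaches
// pow() through a sitofp/uitofp of an i16 value, case (2.2) above produces a
// guard equivalent to
//
//   if ((double)b <= 0.0 || y > 64.0)
//     (void)pow((double)b, y);
//
// i.e. the "(x <= 0 || y > 64)" condition listed in the comment above.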
+
+// Wrap conditions that can potentially generate errno to the library call.
+void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
+ assert(Cond != nullptr && "ShrinkWrapCI is not expecting an empty call inst");
+ MDNode *BranchWeights =
+ MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
+
+ Instruction *NewInst =
+ SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
+ BasicBlock *CallBB = NewInst->getParent();
+ CallBB->setName("cdce.call");
+ BasicBlock *SuccBB = CallBB->getSingleSuccessor();
+ assert(SuccBB && "The split block should have a single successor");
+ SuccBB->setName("cdce.end");
+ CI->removeFromParent();
+ CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
+ LLVM_DEBUG(dbgs() << "== Basic Block After ==");
+ LLVM_DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
+ << *CallBB->getSingleSuccessor() << "\n");
+}
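// Illustrative sketch (not taken from the diffed sources): after shrinkWrapCI,
// the CFG around a candidate such as sqrt(val) looks roughly like
//
//   entry:
//     %cond = ...                      ; built by one of the generate*Cond helpers
//     br i1 %cond, label %cdce.call, label %cdce.end   ; branch_weights 1, 2000
//   cdce.call:                         ; cold path, reached only on error inputs
//     call double @sqrt(double %val)
//     br label %cdce.end
//   cdce.end:
//     ...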
+
+// Perform the transformation to a single candidate.
+bool LibCallsShrinkWrap::perform(CallInst *CI) {
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ assert(Callee && "perform() should apply to a non-empty callee");
+ TLI.getLibFunc(*Callee, Func);
+ assert(Func && "perform() is not expecting an empty function");
+
+ if (performCallDomainErrorOnly(CI, Func) || performCallRangeErrorOnly(CI, Func))
+ return true;
+ return performCallErrors(CI, Func);
+}
+
+void LibCallsShrinkWrapLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
+ DominatorTree *DT) {
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ LibCallsShrinkWrap CCDCE(TLI, DT);
+ CCDCE.visit(F);
+ bool Changed = CCDCE.perform();
+
+ // Verify the dominator tree after we've updated it locally.
+ assert(!DT || DT->verify(DominatorTree::VerificationLevel::Fast));
+ return Changed;
+}
+
+bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ return runImpl(F, TLI, DT);
+}
+
+namespace llvm {
+char &LibCallsShrinkWrapPassID = LibCallsShrinkWrapLegacyPass::ID;
+
+// Public interface to LibCallsShrinkWrap pass.
+FunctionPass *createLibCallsShrinkWrapPass() {
+ return new LibCallsShrinkWrapLegacyPass();
+}
+
+PreservedAnalyses LibCallsShrinkWrapPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT))
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp
index 3223fd6f65..ae26058c21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp
@@ -1,98 +1,98 @@
-//===- Local.cpp - Functions to perform local transformations -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions perform various local transformations to the
-// program.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <climits>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "local"
-
-STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
+//===- Local.cpp - Functions to perform local transformations -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "local"
+
+STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
STATISTIC(NumPHICSEs, "Number of PHI's that got CSE'd");
-
+
static cl::opt<bool> PHICSEDebugHash(
"phicse-debug-hash",
#ifdef EXPENSIVE_CHECKS
@@ -110,51 +110,51 @@ static cl::opt<unsigned> PHICSENumPHISmallSize(
"When the basic block contains not more than this number of PHI nodes, "
"perform a (faster!) exhaustive search instead of set-driven one."));
-// Max recursion depth for collectBitParts used when detecting bswap and
-// bitreverse idioms
-static const unsigned BitPartRecursionMaxDepth = 64;
-
-//===----------------------------------------------------------------------===//
-// Local constant propagation.
-//
-
-/// ConstantFoldTerminator - If a terminator instruction is predicated on a
-/// constant value, convert it into an unconditional branch to the constant
-/// destination. This is a nontrivial operation because the successors of this
-/// basic block must have their PHI nodes updated.
-/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
-/// conditions and indirectbr addresses this might make dead if
-/// DeleteDeadConditions is true.
-bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
- const TargetLibraryInfo *TLI,
- DomTreeUpdater *DTU) {
- Instruction *T = BB->getTerminator();
- IRBuilder<> Builder(T);
-
- // Branch - See if we are conditional jumping on constant
- if (auto *BI = dyn_cast<BranchInst>(T)) {
- if (BI->isUnconditional()) return false; // Can't optimize uncond branch
-
- BasicBlock *Dest1 = BI->getSuccessor(0);
- BasicBlock *Dest2 = BI->getSuccessor(1);
-
- if (Dest2 == Dest1) { // Conditional branch to same location?
- // This branch matches something like this:
- // br bool %cond, label %Dest, label %Dest
- // and changes it into: br label %Dest
-
- // Let the basic block know that we are letting go of one copy of it.
- assert(BI->getParent() && "Terminator not inserted in block!");
- Dest1->removePredecessor(BI->getParent());
-
- // Replace the conditional branch with an unconditional one.
- Builder.CreateBr(Dest1);
- Value *Cond = BI->getCondition();
- BI->eraseFromParent();
- if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
- return true;
- }
+// Max recursion depth for collectBitParts used when detecting bswap and
+// bitreverse idioms
+static const unsigned BitPartRecursionMaxDepth = 64;
+
+//===----------------------------------------------------------------------===//
+// Local constant propagation.
+//
+
+/// ConstantFoldTerminator - If a terminator instruction is predicated on a
+/// constant value, convert it into an unconditional branch to the constant
+/// destination. This is a nontrivial operation because the successors of this
+/// basic block must have their PHI nodes updated.
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
+ const TargetLibraryInfo *TLI,
+ DomTreeUpdater *DTU) {
+ Instruction *T = BB->getTerminator();
+ IRBuilder<> Builder(T);
+
+ // Branch - See if we are conditional jumping on constant
+ if (auto *BI = dyn_cast<BranchInst>(T)) {
+ if (BI->isUnconditional()) return false; // Can't optimize uncond branch
+
+ BasicBlock *Dest1 = BI->getSuccessor(0);
+ BasicBlock *Dest2 = BI->getSuccessor(1);
+
+ if (Dest2 == Dest1) { // Conditional branch to same location?
+ // This branch matches something like this:
+ // br bool %cond, label %Dest, label %Dest
+ // and changes it into: br label %Dest
+
+ // Let the basic block know that we are letting go of one copy of it.
+ assert(BI->getParent() && "Terminator not inserted in block!");
+ Dest1->removePredecessor(BI->getParent());
+
+ // Replace the conditional branch with an unconditional one.
+ Builder.CreateBr(Dest1);
+ Value *Cond = BI->getCondition();
+ BI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ return true;
+ }
if (auto *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
// Are we branching on constant?
@@ -174,109 +174,109 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
return true;
}
- return false;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(T)) {
- // If we are switching on a constant, we can convert the switch to an
- // unconditional branch.
- auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
- BasicBlock *DefaultDest = SI->getDefaultDest();
- BasicBlock *TheOnlyDest = DefaultDest;
-
- // If the default is unreachable, ignore it when searching for TheOnlyDest.
- if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
- SI->getNumCases() > 0) {
- TheOnlyDest = SI->case_begin()->getCaseSuccessor();
- }
-
+ return false;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(T)) {
+ // If we are switching on a constant, we can convert the switch to an
+ // unconditional branch.
+ auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
+ BasicBlock *DefaultDest = SI->getDefaultDest();
+ BasicBlock *TheOnlyDest = DefaultDest;
+
+ // If the default is unreachable, ignore it when searching for TheOnlyDest.
+ if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
+ SI->getNumCases() > 0) {
+ TheOnlyDest = SI->case_begin()->getCaseSuccessor();
+ }
+
bool Changed = false;
- // Figure out which case it goes to.
- for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
- // Found case matching a constant operand?
- if (i->getCaseValue() == CI) {
- TheOnlyDest = i->getCaseSuccessor();
- break;
- }
-
- // Check to see if this branch is going to the same place as the default
- // dest. If so, eliminate it as an explicit compare.
- if (i->getCaseSuccessor() == DefaultDest) {
- MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
- unsigned NCases = SI->getNumCases();
- // Fold the case metadata into the default if there will be any branches
- // left, unless the metadata doesn't match the switch.
- if (NCases > 1 && MD && MD->getNumOperands() == 2 + NCases) {
- // Collect branch weights into a vector.
- SmallVector<uint32_t, 8> Weights;
- for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
- ++MD_i) {
- auto *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
- Weights.push_back(CI->getValue().getZExtValue());
- }
- // Merge weight of this case to the default weight.
- unsigned idx = i->getCaseIndex();
- Weights[0] += Weights[idx+1];
- // Remove weight for this case.
- std::swap(Weights[idx+1], Weights.back());
- Weights.pop_back();
- SI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(BB->getContext()).
- createBranchWeights(Weights));
- }
- // Remove this entry.
- BasicBlock *ParentBB = SI->getParent();
- DefaultDest->removePredecessor(ParentBB);
- i = SI->removeCase(i);
- e = SI->case_end();
+ // Figure out which case it goes to.
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
+ // Found case matching a constant operand?
+ if (i->getCaseValue() == CI) {
+ TheOnlyDest = i->getCaseSuccessor();
+ break;
+ }
+
+ // Check to see if this branch is going to the same place as the default
+ // dest. If so, eliminate it as an explicit compare.
+ if (i->getCaseSuccessor() == DefaultDest) {
+ MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
+ unsigned NCases = SI->getNumCases();
+ // Fold the case metadata into the default if there will be any branches
+ // left, unless the metadata doesn't match the switch.
+ if (NCases > 1 && MD && MD->getNumOperands() == 2 + NCases) {
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ auto *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+ // Merge weight of this case to the default weight.
+ unsigned idx = i->getCaseIndex();
+ Weights[0] += Weights[idx+1];
+ // Remove weight for this case.
+ std::swap(Weights[idx+1], Weights.back());
+ Weights.pop_back();
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(Weights));
+ }
+ // Remove this entry.
+ BasicBlock *ParentBB = SI->getParent();
+ DefaultDest->removePredecessor(ParentBB);
+ i = SI->removeCase(i);
+ e = SI->case_end();
Changed = true;
- continue;
- }
-
- // Otherwise, check to see if the switch only branches to one destination.
- // We do this by reseting "TheOnlyDest" to null when we find two non-equal
- // destinations.
- if (i->getCaseSuccessor() != TheOnlyDest)
- TheOnlyDest = nullptr;
-
- // Increment this iterator as we haven't removed the case.
- ++i;
- }
-
- if (CI && !TheOnlyDest) {
- // Branching on a constant, but not any of the cases, go to the default
- // successor.
- TheOnlyDest = SI->getDefaultDest();
- }
-
- // If we found a single destination that we can fold the switch into, do so
- // now.
- if (TheOnlyDest) {
- // Insert the new branch.
- Builder.CreateBr(TheOnlyDest);
- BasicBlock *BB = SI->getParent();
-
+ continue;
+ }
+
+ // Otherwise, check to see if the switch only branches to one destination.
+ // We do this by reseting "TheOnlyDest" to null when we find two non-equal
+ // destinations.
+ if (i->getCaseSuccessor() != TheOnlyDest)
+ TheOnlyDest = nullptr;
+
+ // Increment this iterator as we haven't removed the case.
+ ++i;
+ }
+
+ if (CI && !TheOnlyDest) {
+ // Branching on a constant, but not any of the cases, go to the default
+ // successor.
+ TheOnlyDest = SI->getDefaultDest();
+ }
+
+ // If we found a single destination that we can fold the switch into, do so
+ // now.
+ if (TheOnlyDest) {
+ // Insert the new branch.
+ Builder.CreateBr(TheOnlyDest);
+ BasicBlock *BB = SI->getParent();
+
SmallSetVector<BasicBlock *, 8> RemovedSuccessors;
- // Remove entries from PHI nodes which we no longer branch to...
+ // Remove entries from PHI nodes which we no longer branch to...
BasicBlock *SuccToKeep = TheOnlyDest;
- for (BasicBlock *Succ : successors(SI)) {
+ for (BasicBlock *Succ : successors(SI)) {
if (DTU && Succ != TheOnlyDest)
RemovedSuccessors.insert(Succ);
- // Found case matching a constant operand?
+ // Found case matching a constant operand?
if (Succ == SuccToKeep) {
SuccToKeep = nullptr; // Don't modify the first branch to TheOnlyDest
- } else {
- Succ->removePredecessor(BB);
- }
- }
-
- // Delete the old switch.
- Value *Cond = SI->getCondition();
- SI->eraseFromParent();
- if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ } else {
+ Succ->removePredecessor(BB);
+ }
+ }
+
+ // Delete the old switch.
+ Value *Cond = SI->getCondition();
+ SI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
if (DTU) {
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve(RemovedSuccessors.size());
@@ -284,86 +284,86 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
DTU->applyUpdates(Updates);
}
- return true;
- }
-
- if (SI->getNumCases() == 1) {
- // Otherwise, we can fold this switch into a conditional branch
- // instruction if it has only one non-default destination.
- auto FirstCase = *SI->case_begin();
- Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
- FirstCase.getCaseValue(), "cond");
-
- // Insert the new branch.
- BranchInst *NewBr = Builder.CreateCondBr(Cond,
- FirstCase.getCaseSuccessor(),
- SI->getDefaultDest());
- MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
- if (MD && MD->getNumOperands() == 3) {
- ConstantInt *SICase =
- mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
- ConstantInt *SIDef =
- mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
- assert(SICase && SIDef);
- // The TrueWeight should be the weight for the single case of SI.
- NewBr->setMetadata(LLVMContext::MD_prof,
- MDBuilder(BB->getContext()).
- createBranchWeights(SICase->getValue().getZExtValue(),
- SIDef->getValue().getZExtValue()));
- }
-
- // Update make.implicit metadata to the newly-created conditional branch.
- MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit);
- if (MakeImplicitMD)
- NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD);
-
- // Delete the old switch.
- SI->eraseFromParent();
- return true;
- }
+ return true;
+ }
+
+ if (SI->getNumCases() == 1) {
+ // Otherwise, we can fold this switch into a conditional branch
+ // instruction if it has only one non-default destination.
+ auto FirstCase = *SI->case_begin();
+ Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+ FirstCase.getCaseValue(), "cond");
+
+ // Insert the new branch.
+ BranchInst *NewBr = Builder.CreateCondBr(Cond,
+ FirstCase.getCaseSuccessor(),
+ SI->getDefaultDest());
+ MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
+ if (MD && MD->getNumOperands() == 3) {
+ ConstantInt *SICase =
+ mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
+ ConstantInt *SIDef =
+ mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
+ assert(SICase && SIDef);
+ // The TrueWeight should be the weight for the single case of SI.
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(SICase->getValue().getZExtValue(),
+ SIDef->getValue().getZExtValue()));
+ }
+
+ // Update make.implicit metadata to the newly-created conditional branch.
+ MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit);
+ if (MakeImplicitMD)
+ NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD);
+
+ // Delete the old switch.
+ SI->eraseFromParent();
+ return true;
+ }
return Changed;
- }
-
- if (auto *IBI = dyn_cast<IndirectBrInst>(T)) {
- // indirectbr blockaddress(@F, @BB) -> br label @BB
- if (auto *BA =
- dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
- BasicBlock *TheOnlyDest = BA->getBasicBlock();
+ }
+
+ if (auto *IBI = dyn_cast<IndirectBrInst>(T)) {
+ // indirectbr blockaddress(@F, @BB) -> br label @BB
+ if (auto *BA =
+ dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
+ BasicBlock *TheOnlyDest = BA->getBasicBlock();
SmallSetVector<BasicBlock *, 8> RemovedSuccessors;
-
- // Insert the new branch.
- Builder.CreateBr(TheOnlyDest);
-
+
+ // Insert the new branch.
+ Builder.CreateBr(TheOnlyDest);
+
BasicBlock *SuccToKeep = TheOnlyDest;
- for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+ for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
BasicBlock *DestBB = IBI->getDestination(i);
if (DTU && DestBB != TheOnlyDest)
RemovedSuccessors.insert(DestBB);
if (IBI->getDestination(i) == SuccToKeep) {
SuccToKeep = nullptr;
- } else {
+ } else {
DestBB->removePredecessor(BB);
- }
- }
- Value *Address = IBI->getAddress();
- IBI->eraseFromParent();
- if (DeleteDeadConditions)
- // Delete pointer cast instructions.
- RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
-
- // Also zap the blockaddress constant if there are no users remaining,
- // otherwise the destination is still marked as having its address taken.
- if (BA->use_empty())
- BA->destroyConstant();
-
- // If we didn't find our destination in the IBI successor list, then we
- // have undefined behavior. Replace the unconditional branch with an
- // 'unreachable' instruction.
+ }
+ }
+ Value *Address = IBI->getAddress();
+ IBI->eraseFromParent();
+ if (DeleteDeadConditions)
+ // Delete pointer cast instructions.
+ RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
+
+ // Also zap the blockaddress constant if there are no users remaining,
+ // otherwise the destination is still marked as having its address taken.
+ if (BA->use_empty())
+ BA->destroyConstant();
+
+ // If we didn't find our destination in the IBI successor list, then we
+ // have undefined behavior. Replace the unconditional branch with an
+ // 'unreachable' instruction.
if (SuccToKeep) {
- BB->getTerminator()->eraseFromParent();
- new UnreachableInst(BB->getContext(), BB);
- }
-
+ BB->getTerminator()->eraseFromParent();
+ new UnreachableInst(BB->getContext(), BB);
+ }
+
if (DTU) {
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve(RemovedSuccessors.size());
@@ -371,562 +371,562 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
DTU->applyUpdates(Updates);
}
- return true;
- }
- }
-
- return false;
-}
-
-//===----------------------------------------------------------------------===//
-// Local dead code elimination.
-//
-
-/// isInstructionTriviallyDead - Return true if the result produced by the
-/// instruction is not used, and the instruction has no side effects.
-///
-bool llvm::isInstructionTriviallyDead(Instruction *I,
- const TargetLibraryInfo *TLI) {
- if (!I->use_empty())
- return false;
- return wouldInstructionBeTriviallyDead(I, TLI);
-}
-
-bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
- const TargetLibraryInfo *TLI) {
- if (I->isTerminator())
- return false;
-
- // We don't want the landingpad-like instructions removed by anything this
- // general.
- if (I->isEHPad())
- return false;
-
- // We don't want debug info removed by anything this general, unless
- // debug info is empty.
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
- if (DDI->getAddress())
- return false;
- return true;
- }
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
- if (DVI->getValue())
- return false;
- return true;
- }
- if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(I)) {
- if (DLI->getLabel())
- return false;
- return true;
- }
-
+ return true;
+ }
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Local dead code elimination.
+//
+
+/// isInstructionTriviallyDead - Return true if the result produced by the
+/// instruction is not used, and the instruction has no side effects.
+///
+bool llvm::isInstructionTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (!I->use_empty())
+ return false;
+ return wouldInstructionBeTriviallyDead(I, TLI);
+}
+
+bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (I->isTerminator())
+ return false;
+
+ // We don't want the landingpad-like instructions removed by anything this
+ // general.
+ if (I->isEHPad())
+ return false;
+
+ // We don't want debug info removed by anything this general, unless
+ // debug info is empty.
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
+ if (DDI->getAddress())
+ return false;
+ return true;
+ }
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
+ if (DVI->getValue())
+ return false;
+ return true;
+ }
+ if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(I)) {
+ if (DLI->getLabel())
+ return false;
+ return true;
+ }
+
if (!I->willReturn())
return false;
- if (!I->mayHaveSideEffects())
- return true;
-
- // Special case intrinsics that "may have side effects" but can be deleted
- // when dead.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- // Safe to delete llvm.stacksave and launder.invariant.group if dead.
- if (II->getIntrinsicID() == Intrinsic::stacksave ||
- II->getIntrinsicID() == Intrinsic::launder_invariant_group)
- return true;
-
- if (II->isLifetimeStartOrEnd()) {
- auto *Arg = II->getArgOperand(1);
- // Lifetime intrinsics are dead when their right-hand is undef.
- if (isa<UndefValue>(Arg))
- return true;
- // If the right-hand is an alloc, global, or argument and the only uses
- // are lifetime intrinsics then the intrinsics are dead.
- if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg))
- return llvm::all_of(Arg->uses(), [](Use &Use) {
- if (IntrinsicInst *IntrinsicUse =
- dyn_cast<IntrinsicInst>(Use.getUser()))
- return IntrinsicUse->isLifetimeStartOrEnd();
- return false;
- });
- return false;
- }
-
- // Assumptions are dead if their condition is trivially true. Guards on
- // true are operationally no-ops. In the future we can consider more
- // sophisticated tradeoffs for guards considering potential for check
- // widening, but for now we keep things simple.
- if ((II->getIntrinsicID() == Intrinsic::assume &&
- isAssumeWithEmptyBundle(*II)) ||
- II->getIntrinsicID() == Intrinsic::experimental_guard) {
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
- return !Cond->isZero();
-
- return false;
- }
- }
-
- if (isAllocLikeFn(I, TLI))
- return true;
-
- if (CallInst *CI = isFreeCall(I, TLI))
- if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
- return C->isNullValue() || isa<UndefValue>(C);
-
- if (auto *Call = dyn_cast<CallBase>(I))
- if (isMathLibCallNoop(Call, TLI))
- return true;
-
- return false;
-}
-
-/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
-/// trivially dead instruction, delete it. If that makes any of its operands
-/// trivially dead, delete them too, recursively. Return true if any
-/// instructions were deleted.
-bool llvm::RecursivelyDeleteTriviallyDeadInstructions(
+ if (!I->mayHaveSideEffects())
+ return true;
+
+ // Special case intrinsics that "may have side effects" but can be deleted
+ // when dead.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // Safe to delete llvm.stacksave and launder.invariant.group if dead.
+ if (II->getIntrinsicID() == Intrinsic::stacksave ||
+ II->getIntrinsicID() == Intrinsic::launder_invariant_group)
+ return true;
+
+ if (II->isLifetimeStartOrEnd()) {
+ auto *Arg = II->getArgOperand(1);
+ // Lifetime intrinsics are dead when their right-hand is undef.
+ if (isa<UndefValue>(Arg))
+ return true;
+ // If the right-hand is an alloc, global, or argument and the only uses
+ // are lifetime intrinsics then the intrinsics are dead.
+ if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg))
+ return llvm::all_of(Arg->uses(), [](Use &Use) {
+ if (IntrinsicInst *IntrinsicUse =
+ dyn_cast<IntrinsicInst>(Use.getUser()))
+ return IntrinsicUse->isLifetimeStartOrEnd();
+ return false;
+ });
+ return false;
+ }
+
+ // Assumptions are dead if their condition is trivially true. Guards on
+ // true are operationally no-ops. In the future we can consider more
+ // sophisticated tradeoffs for guards considering potential for check
+ // widening, but for now we keep things simple.
+ if ((II->getIntrinsicID() == Intrinsic::assume &&
+ isAssumeWithEmptyBundle(*II)) ||
+ II->getIntrinsicID() == Intrinsic::experimental_guard) {
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+ return !Cond->isZero();
+
+ return false;
+ }
+ }
+
+ if (isAllocLikeFn(I, TLI))
+ return true;
+
+ if (CallInst *CI = isFreeCall(I, TLI))
+ if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
+ return C->isNullValue() || isa<UndefValue>(C);
+
+ if (auto *Call = dyn_cast<CallBase>(I))
+ if (isMathLibCallNoop(Call, TLI))
+ return true;
+
+ return false;
+}
+
+/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
+/// trivially dead instruction, delete it. If that makes any of its operands
+/// trivially dead, delete them too, recursively. Return true if any
+/// instructions were deleted.
+bool llvm::RecursivelyDeleteTriviallyDeadInstructions(
Value *V, const TargetLibraryInfo *TLI, MemorySSAUpdater *MSSAU,
std::function<void(Value *)> AboutToDeleteCallback) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || !isInstructionTriviallyDead(I, TLI))
- return false;
-
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- DeadInsts.push_back(I);
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || !isInstructionTriviallyDead(I, TLI))
+ return false;
+
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ DeadInsts.push_back(I);
RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU,
AboutToDeleteCallback);
-
- return true;
-}
-
-bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(
- SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
+
+ return true;
+}
+
+bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
MemorySSAUpdater *MSSAU,
std::function<void(Value *)> AboutToDeleteCallback) {
- unsigned S = 0, E = DeadInsts.size(), Alive = 0;
- for (; S != E; ++S) {
- auto *I = cast<Instruction>(DeadInsts[S]);
- if (!isInstructionTriviallyDead(I)) {
- DeadInsts[S] = nullptr;
- ++Alive;
- }
- }
- if (Alive == E)
- return false;
+ unsigned S = 0, E = DeadInsts.size(), Alive = 0;
+ for (; S != E; ++S) {
+ auto *I = cast<Instruction>(DeadInsts[S]);
+ if (!isInstructionTriviallyDead(I)) {
+ DeadInsts[S] = nullptr;
+ ++Alive;
+ }
+ }
+ if (Alive == E)
+ return false;
RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU,
AboutToDeleteCallback);
- return true;
-}
-
-void llvm::RecursivelyDeleteTriviallyDeadInstructions(
- SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
+ return true;
+}
+
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
MemorySSAUpdater *MSSAU,
std::function<void(Value *)> AboutToDeleteCallback) {
- // Process the dead instruction list until empty.
- while (!DeadInsts.empty()) {
- Value *V = DeadInsts.pop_back_val();
- Instruction *I = cast_or_null<Instruction>(V);
- if (!I)
- continue;
- assert(isInstructionTriviallyDead(I, TLI) &&
- "Live instruction found in dead worklist!");
- assert(I->use_empty() && "Instructions with uses are not dead.");
-
- // Don't lose the debug info while deleting the instructions.
- salvageDebugInfo(*I);
-
+ // Process the dead instruction list until empty.
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+ Instruction *I = cast_or_null<Instruction>(V);
+ if (!I)
+ continue;
+ assert(isInstructionTriviallyDead(I, TLI) &&
+ "Live instruction found in dead worklist!");
+ assert(I->use_empty() && "Instructions with uses are not dead.");
+
+ // Don't lose the debug info while deleting the instructions.
+ salvageDebugInfo(*I);
+
if (AboutToDeleteCallback)
AboutToDeleteCallback(I);
- // Null out all of the instruction's operands to see if any operand becomes
- // dead as we go.
- for (Use &OpU : I->operands()) {
- Value *OpV = OpU.get();
- OpU.set(nullptr);
-
- if (!OpV->use_empty())
- continue;
-
- // If the operand is an instruction that became dead as we nulled out the
- // operand, and if it is 'trivially' dead, delete it in a future loop
- // iteration.
- if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI, TLI))
- DeadInsts.push_back(OpI);
- }
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
-
- I->eraseFromParent();
- }
-}
-
-bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, I);
- for (auto *DII : DbgUsers) {
- Value *Undef = UndefValue::get(I->getType());
- DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
- ValueAsMetadata::get(Undef)));
- }
- return !DbgUsers.empty();
-}
-
-/// areAllUsesEqual - Check whether the uses of a value are all the same.
-/// This is similar to Instruction::hasOneUse() except this will also return
-/// true when there are no uses or multiple uses that all refer to the same
-/// value.
-static bool areAllUsesEqual(Instruction *I) {
- Value::user_iterator UI = I->user_begin();
- Value::user_iterator UE = I->user_end();
- if (UI == UE)
- return true;
-
- User *TheUse = *UI;
- for (++UI; UI != UE; ++UI) {
- if (*UI != TheUse)
- return false;
- }
- return true;
-}
-
-/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
-/// dead PHI node, due to being a def-use chain of single-use nodes that
-/// either forms a cycle or is terminated by a trivially dead instruction,
-/// delete it. If that makes any of its operands trivially dead, delete them
-/// too, recursively. Return true if a change was made.
-bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
- const TargetLibraryInfo *TLI,
- llvm::MemorySSAUpdater *MSSAU) {
- SmallPtrSet<Instruction*, 4> Visited;
- for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
- I = cast<Instruction>(*I->user_begin())) {
- if (I->use_empty())
- return RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
-
- // If we find an instruction more than once, we're on a cycle that
- // won't prove fruitful.
- if (!Visited.insert(I).second) {
- // Break the cycle and delete the instruction and its operands.
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
- return true;
- }
- }
- return false;
-}
-
-static bool
-simplifyAndDCEInstruction(Instruction *I,
- SmallSetVector<Instruction *, 16> &WorkList,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (isInstructionTriviallyDead(I, TLI)) {
- salvageDebugInfo(*I);
-
- // Null out all of the instruction's operands to see if any operand becomes
- // dead as we go.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *OpV = I->getOperand(i);
- I->setOperand(i, nullptr);
-
- if (!OpV->use_empty() || I == OpV)
- continue;
-
- // If the operand is an instruction that became dead as we nulled out the
- // operand, and if it is 'trivially' dead, delete it in a future loop
- // iteration.
- if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI, TLI))
- WorkList.insert(OpI);
- }
-
- I->eraseFromParent();
-
- return true;
- }
-
- if (Value *SimpleV = SimplifyInstruction(I, DL)) {
- // Add the users to the worklist. CAREFUL: an instruction can use itself,
- // in the case of a phi node.
- for (User *U : I->users()) {
- if (U != I) {
- WorkList.insert(cast<Instruction>(U));
- }
- }
-
- // Replace the instruction with its simplified value.
- bool Changed = false;
- if (!I->use_empty()) {
- I->replaceAllUsesWith(SimpleV);
- Changed = true;
- }
- if (isInstructionTriviallyDead(I, TLI)) {
- I->eraseFromParent();
- Changed = true;
- }
- return Changed;
- }
- return false;
-}
-
-/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
-/// simplify any instructions in it and recursively delete dead instructions.
-///
-/// This returns true if it changed the code, note that it can delete
-/// instructions in other blocks as well in this block.
-bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
- const TargetLibraryInfo *TLI) {
- bool MadeChange = false;
- const DataLayout &DL = BB->getModule()->getDataLayout();
-
-#ifndef NDEBUG
- // In debug builds, ensure that the terminator of the block is never replaced
- // or deleted by these simplifications. The idea of simplification is that it
- // cannot introduce new instructions, and there is no way to replace the
- // terminator of a block without introducing a new instruction.
- AssertingVH<Instruction> TerminatorVH(&BB->back());
-#endif
-
- SmallSetVector<Instruction *, 16> WorkList;
- // Iterate over the original function, only adding insts to the worklist
- // if they actually need to be revisited. This avoids having to pre-init
- // the worklist with the entire function's worth of instructions.
- for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end());
- BI != E;) {
- assert(!BI->isTerminator());
- Instruction *I = &*BI;
- ++BI;
-
- // We're visiting this instruction now, so make sure it's not in the
- // worklist from an earlier visit.
- if (!WorkList.count(I))
- MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
- }
-
- while (!WorkList.empty()) {
- Instruction *I = WorkList.pop_back_val();
- MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
- }
- return MadeChange;
-}
-
-//===----------------------------------------------------------------------===//
-// Control Flow Graph Restructuring.
-//
-
-void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
- DomTreeUpdater *DTU) {
-
- // If BB has single-entry PHI nodes, fold them.
- while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
- Value *NewVal = PN->getIncomingValue(0);
- // Replace self referencing PHI with undef, it must be dead.
- if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
- PN->replaceAllUsesWith(NewVal);
- PN->eraseFromParent();
- }
-
- BasicBlock *PredBB = DestBB->getSinglePredecessor();
- assert(PredBB && "Block doesn't have a single predecessor!");
-
- bool ReplaceEntryBB = false;
- if (PredBB == &DestBB->getParent()->getEntryBlock())
- ReplaceEntryBB = true;
-
- // DTU updates: Collect all the edges that enter
- // PredBB. These dominator edges will be redirected to DestBB.
- SmallVector<DominatorTree::UpdateType, 32> Updates;
-
- if (DTU) {
- for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) {
- // This predecessor of PredBB may already have DestBB as a successor.
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (Use &OpU : I->operands()) {
+ Value *OpV = OpU.get();
+ OpU.set(nullptr);
+
+ if (!OpV->use_empty())
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ DeadInsts.push_back(OpI);
+ }
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+
+ I->eraseFromParent();
+ }
+}
+
+bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
+ SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, I);
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(I->getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
+ return !DbgUsers.empty();
+}
+
+/// areAllUsesEqual - Check whether the uses of a value are all the same.
+/// This is similar to Instruction::hasOneUse() except this will also return
+/// true when there are no uses or multiple uses that all refer to the same
+/// value.
+static bool areAllUsesEqual(Instruction *I) {
+ Value::user_iterator UI = I->user_begin();
+ Value::user_iterator UE = I->user_end();
+ if (UI == UE)
+ return true;
+
+ User *TheUse = *UI;
+ for (++UI; UI != UE; ++UI) {
+ if (*UI != TheUse)
+ return false;
+ }
+ return true;
+}
+
+/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
+/// dead PHI node, due to being a def-use chain of single-use nodes that
+/// either forms a cycle or is terminated by a trivially dead instruction,
+/// delete it. If that makes any of its operands trivially dead, delete them
+/// too, recursively. Return true if a change was made.
+bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
+ const TargetLibraryInfo *TLI,
+ llvm::MemorySSAUpdater *MSSAU) {
+ SmallPtrSet<Instruction*, 4> Visited;
+ for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
+ I = cast<Instruction>(*I->user_begin())) {
+ if (I->use_empty())
+ return RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
+
+ // If we find an instruction more than once, we're on a cycle that
+ // won't prove fruitful.
+ if (!Visited.insert(I).second) {
+ // Break the cycle and delete the instruction and its operands.
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool
+simplifyAndDCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ WorkList.insert(OpI);
+ }
+
+ I->eraseFromParent();
+
+ return true;
+ }
+
+ if (Value *SimpleV = SimplifyInstruction(I, DL)) {
+ // Add the users to the worklist. CAREFUL: an instruction can use itself,
+ // in the case of a phi node.
+ for (User *U : I->users()) {
+ if (U != I) {
+ WorkList.insert(cast<Instruction>(U));
+ }
+ }
+
+ // Replace the instruction with its simplified value.
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(SimpleV);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ I->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+ }
+ return false;
+}
+
+/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
+/// simplify any instructions in it and recursively delete dead instructions.
+///
+/// This returns true if it changed the code, note that it can delete
+/// instructions in other blocks as well in this block.
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+
+#ifndef NDEBUG
+ // In debug builds, ensure that the terminator of the block is never replaced
+ // or deleted by these simplifications. The idea of simplification is that it
+ // cannot introduce new instructions, and there is no way to replace the
+ // terminator of a block without introducing a new instruction.
+ AssertingVH<Instruction> TerminatorVH(&BB->back());
+#endif
+
+ SmallSetVector<Instruction *, 16> WorkList;
+ // Iterate over the original function, only adding insts to the worklist
+ // if they actually need to be revisited. This avoids having to pre-init
+ // the worklist with the entire function's worth of instructions.
+ for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end());
+ BI != E;) {
+ assert(!BI->isTerminator());
+ Instruction *I = &*BI;
+ ++BI;
+
+ // We're visiting this instruction now, so make sure it's not in the
+ // worklist from an earlier visit.
+ if (!WorkList.count(I))
+ MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
+ }
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Graph Restructuring.
+//
+
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
+ DomTreeUpdater *DTU) {
+
+ // If BB has single-entry PHI nodes, fold them.
+ while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+ Value *NewVal = PN->getIncomingValue(0);
+ // Replace self referencing PHI with undef, it must be dead.
+ if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NewVal);
+ PN->eraseFromParent();
+ }
+
+ BasicBlock *PredBB = DestBB->getSinglePredecessor();
+ assert(PredBB && "Block doesn't have a single predecessor!");
+
+ bool ReplaceEntryBB = false;
+ if (PredBB == &DestBB->getParent()->getEntryBlock())
+ ReplaceEntryBB = true;
+
+ // DTU updates: Collect all the edges that enter
+ // PredBB. These dominator edges will be redirected to DestBB.
+ SmallVector<DominatorTree::UpdateType, 32> Updates;
+
+ if (DTU) {
+ for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) {
+ // This predecessor of PredBB may already have DestBB as a successor.
if (!llvm::is_contained(successors(*I), DestBB))
- Updates.push_back({DominatorTree::Insert, *I, DestBB});
+ Updates.push_back({DominatorTree::Insert, *I, DestBB});
Updates.push_back({DominatorTree::Delete, *I, PredBB});
- }
+ }
Updates.push_back({DominatorTree::Delete, PredBB, DestBB});
- }
-
- // Zap anything that took the address of DestBB. Not doing this will give the
- // address an invalid value.
- if (DestBB->hasAddressTaken()) {
- BlockAddress *BA = BlockAddress::get(DestBB);
- Constant *Replacement =
- ConstantInt::get(Type::getInt32Ty(BA->getContext()), 1);
- BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
- BA->getType()));
- BA->destroyConstant();
- }
-
- // Anything that branched to PredBB now branches to DestBB.
- PredBB->replaceAllUsesWith(DestBB);
-
- // Splice all the instructions from PredBB to DestBB.
- PredBB->getTerminator()->eraseFromParent();
- DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
- new UnreachableInst(PredBB->getContext(), PredBB);
-
- // If the PredBB is the entry block of the function, move DestBB up to
- // become the entry block after we erase PredBB.
- if (ReplaceEntryBB)
- DestBB->moveAfter(PredBB);
-
- if (DTU) {
- assert(PredBB->getInstList().size() == 1 &&
- isa<UnreachableInst>(PredBB->getTerminator()) &&
- "The successor list of PredBB isn't empty before "
- "applying corresponding DTU updates.");
- DTU->applyUpdatesPermissive(Updates);
- DTU->deleteBB(PredBB);
- // Recalculation of DomTree is needed when updating a forward DomTree and
- // the Entry BB is replaced.
- if (ReplaceEntryBB && DTU->hasDomTree()) {
- // The entry block was removed and there is no external interface for
- // the dominator tree to be notified of this change. In this corner-case
- // we recalculate the entire tree.
- DTU->recalculate(*(DestBB->getParent()));
- }
- }
-
- else {
- PredBB->eraseFromParent(); // Nuke BB if DTU is nullptr.
- }
-}
-
-/// Return true if we can choose one of these values to use in place of the
-/// other. Note that we will always choose the non-undef value to keep.
-static bool CanMergeValues(Value *First, Value *Second) {
- return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
-}
-
-/// Return true if we can fold BB, an almost-empty BB ending in an unconditional
-/// branch to Succ, into Succ.
-///
-/// Assumption: Succ is the single successor for BB.
-static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
- assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
-
- LLVM_DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
- << Succ->getName() << "\n");
- // Shortcut, if there is only a single predecessor it must be BB and merging
- // is always safe
- if (Succ->getSinglePredecessor()) return true;
-
- // Make a list of the predecessors of BB
- SmallPtrSet<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
-
- // Look at all the phi nodes in Succ, to see if they present a conflict when
- // merging these blocks
- for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
-
- // If the incoming value from BB is again a PHINode in
- // BB which has the same incoming value for *PI as PN does, we can
- // merge the phi nodes and then the blocks can still be merged
- PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
- if (BBPN && BBPN->getParent() == BB) {
- for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
- BasicBlock *IBB = PN->getIncomingBlock(PI);
- if (BBPreds.count(IBB) &&
- !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
- PN->getIncomingValue(PI))) {
- LLVM_DEBUG(dbgs()
- << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with "
- << BBPN->getName() << " with regard to common predecessor "
- << IBB->getName() << "\n");
- return false;
- }
- }
- } else {
- Value* Val = PN->getIncomingValueForBlock(BB);
- for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
- // See if the incoming value for the common predecessor is equal to the
- // one for BB, in which case this phi node will not prevent the merging
- // of the block.
- BasicBlock *IBB = PN->getIncomingBlock(PI);
- if (BBPreds.count(IBB) &&
- !CanMergeValues(Val, PN->getIncomingValue(PI))) {
- LLVM_DEBUG(dbgs() << "Can't fold, phi node " << PN->getName()
- << " in " << Succ->getName()
- << " is conflicting with regard to common "
- << "predecessor " << IBB->getName() << "\n");
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
-using PredBlockVector = SmallVector<BasicBlock *, 16>;
-using IncomingValueMap = DenseMap<BasicBlock *, Value *>;
-
-/// Determines the value to use as the phi node input for a block.
-///
-/// Select between \p OldVal any value that we know flows from \p BB
-/// to a particular phi on the basis of which one (if either) is not
-/// undef. Update IncomingValues based on the selected value.
-///
-/// \param OldVal The value we are considering selecting.
-/// \param BB The block that the value flows in from.
-/// \param IncomingValues A map from block-to-value for other phi inputs
-/// that we have examined.
-///
-/// \returns the selected value.
-static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
- IncomingValueMap &IncomingValues) {
- if (!isa<UndefValue>(OldVal)) {
- assert((!IncomingValues.count(BB) ||
- IncomingValues.find(BB)->second == OldVal) &&
- "Expected OldVal to match incoming value from BB!");
-
- IncomingValues.insert(std::make_pair(BB, OldVal));
- return OldVal;
- }
-
- IncomingValueMap::const_iterator It = IncomingValues.find(BB);
- if (It != IncomingValues.end()) return It->second;
-
- return OldVal;
-}
-
-/// Create a map from block to value for the operands of a
-/// given phi.
-///
-/// Create a map from block to value for each non-undef value flowing
-/// into \p PN.
-///
-/// \param PN The phi we are collecting the map for.
-/// \param IncomingValues [out] The map from block to value for this phi.
-static void gatherIncomingValuesToPhi(PHINode *PN,
- IncomingValueMap &IncomingValues) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *BB = PN->getIncomingBlock(i);
- Value *V = PN->getIncomingValue(i);
-
- if (!isa<UndefValue>(V))
- IncomingValues.insert(std::make_pair(BB, V));
- }
-}
-
-/// Replace the incoming undef values to a phi with the values
-/// from a block-to-value map.
-///
-/// \param PN The phi we are replacing the undefs in.
-/// \param IncomingValues A map from block to value.
-static void replaceUndefValuesInPhi(PHINode *PN,
- const IncomingValueMap &IncomingValues) {
+ }
+
+ // Zap anything that took the address of DestBB. Not doing this will give the
+ // address an invalid value.
+ if (DestBB->hasAddressTaken()) {
+ BlockAddress *BA = BlockAddress::get(DestBB);
+ Constant *Replacement =
+ ConstantInt::get(Type::getInt32Ty(BA->getContext()), 1);
+ BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
+ BA->getType()));
+ BA->destroyConstant();
+ }
+
+ // Anything that branched to PredBB now branches to DestBB.
+ PredBB->replaceAllUsesWith(DestBB);
+
+ // Splice all the instructions from PredBB to DestBB.
+ PredBB->getTerminator()->eraseFromParent();
+ DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+ new UnreachableInst(PredBB->getContext(), PredBB);
+
+ // If the PredBB is the entry block of the function, move DestBB up to
+ // become the entry block after we erase PredBB.
+ if (ReplaceEntryBB)
+ DestBB->moveAfter(PredBB);
+
+ if (DTU) {
+ assert(PredBB->getInstList().size() == 1 &&
+ isa<UnreachableInst>(PredBB->getTerminator()) &&
+ "The successor list of PredBB isn't empty before "
+ "applying corresponding DTU updates.");
+ DTU->applyUpdatesPermissive(Updates);
+ DTU->deleteBB(PredBB);
+ // Recalculation of DomTree is needed when updating a forward DomTree and
+ // the Entry BB is replaced.
+ if (ReplaceEntryBB && DTU->hasDomTree()) {
+ // The entry block was removed and there is no external interface for
+ // the dominator tree to be notified of this change. In this corner-case
+ // we recalculate the entire tree.
+ DTU->recalculate(*(DestBB->getParent()));
+ }
+ }
+
+ else {
+ PredBB->eraseFromParent(); // Nuke BB if DTU is nullptr.
+ }
+}
+
+/// Return true if we can choose one of these values to use in place of the
+/// other. Note that we will always choose the non-undef value to keep.
+static bool CanMergeValues(Value *First, Value *Second) {
+ return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
+}
+
+/// Return true if we can fold BB, an almost-empty BB ending in an unconditional
+/// branch to Succ, into Succ.
+///
+/// Assumption: Succ is the single successor for BB.
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+ assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+ LLVM_DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
+ << Succ->getName() << "\n");
+ // Shortcut, if there is only a single predecessor it must be BB and merging
+ // is always safe
+ if (Succ->getSinglePredecessor()) return true;
+
+ // Make a list of the predecessors of BB
+ SmallPtrSet<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Look at all the phi nodes in Succ, to see if they present a conflict when
+ // merging these blocks
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // If the incoming value from BB is again a PHINode in
+ // BB which has the same incoming value for *PI as PN does, we can
+ // merge the phi nodes and then the blocks can still be merged
+ PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+ if (BBPN && BBPN->getParent() == BB) {
+ for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+ BasicBlock *IBB = PN->getIncomingBlock(PI);
+ if (BBPreds.count(IBB) &&
+ !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
+ PN->getIncomingValue(PI))) {
+ LLVM_DEBUG(dbgs()
+ << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with "
+ << BBPN->getName() << " with regard to common predecessor "
+ << IBB->getName() << "\n");
+ return false;
+ }
+ }
+ } else {
+ Value* Val = PN->getIncomingValueForBlock(BB);
+ for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+ // See if the incoming value for the common predecessor is equal to the
+ // one for BB, in which case this phi node will not prevent the merging
+ // of the block.
+ BasicBlock *IBB = PN->getIncomingBlock(PI);
+ if (BBPreds.count(IBB) &&
+ !CanMergeValues(Val, PN->getIncomingValue(PI))) {
+ LLVM_DEBUG(dbgs() << "Can't fold, phi node " << PN->getName()
+ << " in " << Succ->getName()
+ << " is conflicting with regard to common "
+ << "predecessor " << IBB->getName() << "\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+using PredBlockVector = SmallVector<BasicBlock *, 16>;
+using IncomingValueMap = DenseMap<BasicBlock *, Value *>;
+
+/// Determines the value to use as the phi node input for a block.
+///
+/// Select between \p OldVal any value that we know flows from \p BB
+/// to a particular phi on the basis of which one (if either) is not
+/// undef. Update IncomingValues based on the selected value.
+///
+/// \param OldVal The value we are considering selecting.
+/// \param BB The block that the value flows in from.
+/// \param IncomingValues A map from block-to-value for other phi inputs
+/// that we have examined.
+///
+/// \returns the selected value.
+static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
+ IncomingValueMap &IncomingValues) {
+ if (!isa<UndefValue>(OldVal)) {
+ assert((!IncomingValues.count(BB) ||
+ IncomingValues.find(BB)->second == OldVal) &&
+ "Expected OldVal to match incoming value from BB!");
+
+ IncomingValues.insert(std::make_pair(BB, OldVal));
+ return OldVal;
+ }
+
+ IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+ if (It != IncomingValues.end()) return It->second;
+
+ return OldVal;
+}
+
+/// Create a map from block to value for the operands of a
+/// given phi.
+///
+/// Create a map from block to value for each non-undef value flowing
+/// into \p PN.
+///
+/// \param PN The phi we are collecting the map for.
+/// \param IncomingValues [out] The map from block to value for this phi.
+static void gatherIncomingValuesToPhi(PHINode *PN,
+ IncomingValueMap &IncomingValues) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *BB = PN->getIncomingBlock(i);
+ Value *V = PN->getIncomingValue(i);
+
+ if (!isa<UndefValue>(V))
+ IncomingValues.insert(std::make_pair(BB, V));
+ }
+}
+
+/// Replace the incoming undef values to a phi with the values
+/// from a block-to-value map.
+///
+/// \param PN The phi we are replacing the undefs in.
+/// \param IncomingValues A map from block to value.
+static void replaceUndefValuesInPhi(PHINode *PN,
+ const IncomingValueMap &IncomingValues) {
SmallVector<unsigned> TrueUndefOps;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *V = PN->getIncomingValue(i);
-
- if (!isa<UndefValue>(V)) continue;
-
- BasicBlock *BB = PN->getIncomingBlock(i);
- IncomingValueMap::const_iterator It = IncomingValues.find(BB);
-
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+
+ if (!isa<UndefValue>(V)) continue;
+
+ BasicBlock *BB = PN->getIncomingBlock(i);
+ IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+
// Keep track of undef/poison incoming values. Those must match, so we fix
// them up below if needed.
// Note: this is conservatively correct, but we could try harder and group
@@ -938,8 +938,8 @@ static void replaceUndefValuesInPhi(PHINode *PN,
// There is a defined value for this incoming block, so map this undef
// incoming value to the defined value.
- PN->setIncomingValue(i, It->second);
- }
+ PN->setIncomingValue(i, It->second);
+ }
// If there are both undef and poison values incoming, then convert those
// values to undef. It is invalid to have different values for the same
@@ -951,204 +951,204 @@ static void replaceUndefValuesInPhi(PHINode *PN,
for (unsigned i : TrueUndefOps)
PN->setIncomingValue(i, UndefValue::get(PN->getType()));
}
-}
-
-/// Replace a value flowing from a block to a phi with
-/// potentially multiple instances of that value flowing from the
-/// block's predecessors to the phi.
-///
-/// \param BB The block with the value flowing into the phi.
-/// \param BBPreds The predecessors of BB.
-/// \param PN The phi that we are updating.
-static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
- const PredBlockVector &BBPreds,
- PHINode *PN) {
- Value *OldVal = PN->removeIncomingValue(BB, false);
- assert(OldVal && "No entry in PHI for Pred BB!");
-
- IncomingValueMap IncomingValues;
-
- // We are merging two blocks - BB, and the block containing PN - and
- // as a result we need to redirect edges from the predecessors of BB
- // to go to the block containing PN, and update PN
- // accordingly. Since we allow merging blocks in the case where the
- // predecessor and successor blocks both share some predecessors,
- // and where some of those common predecessors might have undef
- // values flowing into PN, we want to rewrite those values to be
- // consistent with the non-undef values.
-
- gatherIncomingValuesToPhi(PN, IncomingValues);
-
- // If this incoming value is one of the PHI nodes in BB, the new entries
- // in the PHI node are the entries from the old PHI.
- if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
- PHINode *OldValPN = cast<PHINode>(OldVal);
- for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) {
- // Note that, since we are merging phi nodes and BB and Succ might
- // have common predecessors, we could end up with a phi node with
- // identical incoming branches. This will be cleaned up later (and
- // will trigger asserts if we try to clean it up now, without also
- // simplifying the corresponding conditional branch).
- BasicBlock *PredBB = OldValPN->getIncomingBlock(i);
- Value *PredVal = OldValPN->getIncomingValue(i);
- Value *Selected = selectIncomingValueForBlock(PredVal, PredBB,
- IncomingValues);
-
- // And add a new incoming value for this predecessor for the
- // newly retargeted branch.
- PN->addIncoming(Selected, PredBB);
- }
- } else {
- for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) {
- // Update existing incoming values in PN for this
- // predecessor of BB.
- BasicBlock *PredBB = BBPreds[i];
- Value *Selected = selectIncomingValueForBlock(OldVal, PredBB,
- IncomingValues);
-
- // And add a new incoming value for this predecessor for the
- // newly retargeted branch.
- PN->addIncoming(Selected, PredBB);
- }
- }
-
- replaceUndefValuesInPhi(PN, IncomingValues);
-}
-
-bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
- DomTreeUpdater *DTU) {
- assert(BB != &BB->getParent()->getEntryBlock() &&
- "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
-
- // We can't eliminate infinite loops.
- BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
- if (BB == Succ) return false;
-
- // Check to see if merging these blocks would cause conflicts for any of the
- // phi nodes in BB or Succ. If not, we can safely merge.
- if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
-
- // Check for cases where Succ has multiple predecessors and a PHI node in BB
- // has uses which will not disappear when the PHI nodes are merged. It is
- // possible to handle such cases, but difficult: it requires checking whether
- // BB dominates Succ, which is non-trivial to calculate in the case where
- // Succ has multiple predecessors. Also, it requires checking whether
- // constructing the necessary self-referential PHI node doesn't introduce any
- // conflicts; this isn't too difficult, but the previous code for doing this
- // was incorrect.
- //
- // Note that if this check finds a live use, BB dominates Succ, so BB is
- // something like a loop pre-header (or rarely, a part of an irreducible CFG);
- // folding the branch isn't profitable in that case anyway.
- if (!Succ->getSinglePredecessor()) {
- BasicBlock::iterator BBI = BB->begin();
- while (isa<PHINode>(*BBI)) {
- for (Use &U : BBI->uses()) {
- if (PHINode* PN = dyn_cast<PHINode>(U.getUser())) {
- if (PN->getIncomingBlock(U) != BB)
- return false;
- } else {
- return false;
- }
- }
- ++BBI;
- }
- }
-
- // We cannot fold the block if it's a branch to an already present callbr
- // successor because that creates duplicate successors.
- for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
- if (auto *CBI = dyn_cast<CallBrInst>((*I)->getTerminator())) {
- if (Succ == CBI->getDefaultDest())
- return false;
- for (unsigned i = 0, e = CBI->getNumIndirectDests(); i != e; ++i)
- if (Succ == CBI->getIndirectDest(i))
- return false;
- }
- }
-
- LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
-
- SmallVector<DominatorTree::UpdateType, 32> Updates;
- if (DTU) {
- // All predecessors of BB will be moved to Succ.
+}
+
+/// Replace a value flowing from a block to a phi with
+/// potentially multiple instances of that value flowing from the
+/// block's predecessors to the phi.
+///
+/// \param BB The block with the value flowing into the phi.
+/// \param BBPreds The predecessors of BB.
+/// \param PN The phi that we are updating.
+static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
+ const PredBlockVector &BBPreds,
+ PHINode *PN) {
+ Value *OldVal = PN->removeIncomingValue(BB, false);
+ assert(OldVal && "No entry in PHI for Pred BB!");
+
+ IncomingValueMap IncomingValues;
+
+ // We are merging two blocks - BB, and the block containing PN - and
+ // as a result we need to redirect edges from the predecessors of BB
+ // to go to the block containing PN, and update PN
+ // accordingly. Since we allow merging blocks in the case where the
+ // predecessor and successor blocks both share some predecessors,
+ // and where some of those common predecessors might have undef
+ // values flowing into PN, we want to rewrite those values to be
+ // consistent with the non-undef values.
+
+ gatherIncomingValuesToPhi(PN, IncomingValues);
+
+ // If this incoming value is one of the PHI nodes in BB, the new entries
+ // in the PHI node are the entries from the old PHI.
+ if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+ PHINode *OldValPN = cast<PHINode>(OldVal);
+ for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) {
+ // Note that, since we are merging phi nodes and BB and Succ might
+ // have common predecessors, we could end up with a phi node with
+ // identical incoming branches. This will be cleaned up later (and
+ // will trigger asserts if we try to clean it up now, without also
+ // simplifying the corresponding conditional branch).
+ BasicBlock *PredBB = OldValPN->getIncomingBlock(i);
+ Value *PredVal = OldValPN->getIncomingValue(i);
+ Value *Selected = selectIncomingValueForBlock(PredVal, PredBB,
+ IncomingValues);
+
+ // And add a new incoming value for this predecessor for the
+ // newly retargeted branch.
+ PN->addIncoming(Selected, PredBB);
+ }
+ } else {
+ for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) {
+ // Update existing incoming values in PN for this
+ // predecessor of BB.
+ BasicBlock *PredBB = BBPreds[i];
+ Value *Selected = selectIncomingValueForBlock(OldVal, PredBB,
+ IncomingValues);
+
+ // And add a new incoming value for this predecessor for the
+ // newly retargeted branch.
+ PN->addIncoming(Selected, PredBB);
+ }
+ }
+
+ replaceUndefValuesInPhi(PN, IncomingValues);
+}
+
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
+ DomTreeUpdater *DTU) {
+ assert(BB != &BB->getParent()->getEntryBlock() &&
+ "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
+
+ // We can't eliminate infinite loops.
+ BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
+ if (BB == Succ) return false;
+
+ // Check to see if merging these blocks would cause conflicts for any of the
+ // phi nodes in BB or Succ. If not, we can safely merge.
+ if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+
+ // Check for cases where Succ has multiple predecessors and a PHI node in BB
+ // has uses which will not disappear when the PHI nodes are merged. It is
+ // possible to handle such cases, but difficult: it requires checking whether
+ // BB dominates Succ, which is non-trivial to calculate in the case where
+ // Succ has multiple predecessors. Also, it requires checking whether
+ // constructing the necessary self-referential PHI node doesn't introduce any
+ // conflicts; this isn't too difficult, but the previous code for doing this
+ // was incorrect.
+ //
+ // Note that if this check finds a live use, BB dominates Succ, so BB is
+ // something like a loop pre-header (or rarely, a part of an irreducible CFG);
+ // folding the branch isn't profitable in that case anyway.
+ if (!Succ->getSinglePredecessor()) {
+ BasicBlock::iterator BBI = BB->begin();
+ while (isa<PHINode>(*BBI)) {
+ for (Use &U : BBI->uses()) {
+ if (PHINode* PN = dyn_cast<PHINode>(U.getUser())) {
+ if (PN->getIncomingBlock(U) != BB)
+ return false;
+ } else {
+ return false;
+ }
+ }
+ ++BBI;
+ }
+ }
+
+ // We cannot fold the block if it's a branch to an already present callbr
+ // successor because that creates duplicate successors.
+ for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ if (auto *CBI = dyn_cast<CallBrInst>((*I)->getTerminator())) {
+ if (Succ == CBI->getDefaultDest())
+ return false;
+ for (unsigned i = 0, e = CBI->getNumIndirectDests(); i != e; ++i)
+ if (Succ == CBI->getIndirectDest(i))
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+
+ SmallVector<DominatorTree::UpdateType, 32> Updates;
+ if (DTU) {
+ // All predecessors of BB will be moved to Succ.
SmallSetVector<BasicBlock *, 8> Predecessors(pred_begin(BB), pred_end(BB));
Updates.reserve(Updates.size() + 2 * Predecessors.size());
for (auto *Predecessor : Predecessors) {
- // This predecessor of BB may already have Succ as a successor.
+ // This predecessor of BB may already have Succ as a successor.
if (!llvm::is_contained(successors(Predecessor), Succ))
Updates.push_back({DominatorTree::Insert, Predecessor, Succ});
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- }
+ }
Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
-
- if (isa<PHINode>(Succ->begin())) {
- // If there is more than one pred of succ, and there are PHI nodes in
- // the successor, then we need to add incoming edges for the PHI nodes
- //
- const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB));
-
- // Loop over all of the PHI nodes in the successor of BB.
- for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
-
- redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN);
- }
- }
-
- if (Succ->getSinglePredecessor()) {
- // BB is the only predecessor of Succ, so Succ will end up with exactly
- // the same predecessors BB had.
-
- // Copy over any phi, debug or lifetime instruction.
- BB->getTerminator()->eraseFromParent();
- Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(),
- BB->getInstList());
- } else {
- while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
- // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
- assert(PN->use_empty() && "There shouldn't be any uses here!");
- PN->eraseFromParent();
- }
- }
-
- // If the unconditional branch we replaced contains llvm.loop metadata, we
- // add the metadata to the branch instructions in the predecessors.
- unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop");
- Instruction *TI = BB->getTerminator();
- if (TI)
- if (MDNode *LoopMD = TI->getMetadata(LoopMDKind))
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *Pred = *PI;
- Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD);
- }
-
- // Everything that jumped to BB now goes to Succ.
- BB->replaceAllUsesWith(Succ);
- if (!Succ->hasName()) Succ->takeName(BB);
-
- // Clear the successor list of BB to match updates applying to DTU later.
- if (BB->getTerminator())
- BB->getInstList().pop_back();
- new UnreachableInst(BB->getContext(), BB);
- assert(succ_empty(BB) && "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
-
- if (DTU) {
+ }
+
+ if (isa<PHINode>(Succ->begin())) {
+ // If there is more than one pred of succ, and there are PHI nodes in
+ // the successor, then we need to add incoming edges for the PHI nodes
+ //
+ const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Loop over all of the PHI nodes in the successor of BB.
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN);
+ }
+ }
+
+ if (Succ->getSinglePredecessor()) {
+ // BB is the only predecessor of Succ, so Succ will end up with exactly
+ // the same predecessors BB had.
+
+ // Copy over any phi, debug or lifetime instruction.
+ BB->getTerminator()->eraseFromParent();
+ Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(),
+ BB->getInstList());
+ } else {
+ while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+ // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
+ assert(PN->use_empty() && "There shouldn't be any uses here!");
+ PN->eraseFromParent();
+ }
+ }
+
+ // If the unconditional branch we replaced contains llvm.loop metadata, we
+ // add the metadata to the branch instructions in the predecessors.
+ unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop");
+ Instruction *TI = BB->getTerminator();
+ if (TI)
+ if (MDNode *LoopMD = TI->getMetadata(LoopMDKind))
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD);
+ }
+
+ // Everything that jumped to BB now goes to Succ.
+ BB->replaceAllUsesWith(Succ);
+ if (!Succ->hasName()) Succ->takeName(BB);
+
+ // Clear the successor list of BB to match updates applying to DTU later.
+ if (BB->getTerminator())
+ BB->getInstList().pop_back();
+ new UnreachableInst(BB->getContext(), BB);
+ assert(succ_empty(BB) && "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
+
+ if (DTU) {
DTU->applyUpdates(Updates);
- DTU->deleteBB(BB);
- } else {
- BB->eraseFromParent(); // Delete the old basic block.
- }
- return true;
-}
-
+ DTU->deleteBB(BB);
+ } else {
+ BB->eraseFromParent(); // Delete the old basic block.
+ }
+ return true;
+}
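+// Illustrative usage sketch (not part of the original source): a caller that
+// walks a function and folds empty forwarding blocks. It assumes each
+// candidate block consists of nothing but an unconditional branch; the
+// implementation above accepts a null DomTreeUpdater.
+//
+//   for (BasicBlock &B : llvm::make_early_inc_range(F)) {
+//     if (&B == &F.getEntryBlock())
+//       continue;
+//     auto *BI = dyn_cast<BranchInst>(B.getTerminator());
+//     if (BI && BI->isUnconditional() && &B.front() == BI)
+//       TryToSimplifyUncondBranchFromEmptyBlock(&B, /*DTU=*/nullptr);
+//   }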
+
static bool EliminateDuplicatePHINodesNaiveImpl(BasicBlock *BB) {
- // This implementation doesn't currently consider undef operands
- // specially. Theoretically, two phis which are identical except for
- // one having an undef where the other doesn't could be collapsed.
-
+ // This implementation doesn't currently consider undef operands
+ // specially. Theoretically, two phis which are identical except for
+ // one having an undef where the other doesn't could be collapsed.
+
bool Changed = false;
// Examine each PHI.
@@ -1181,15 +1181,15 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
// specially. Theoretically, two phis which are identical except for
// one having an undef where the other doesn't could be collapsed.
- struct PHIDenseMapInfo {
- static PHINode *getEmptyKey() {
- return DenseMapInfo<PHINode *>::getEmptyKey();
- }
-
- static PHINode *getTombstoneKey() {
- return DenseMapInfo<PHINode *>::getTombstoneKey();
- }
-
+ struct PHIDenseMapInfo {
+ static PHINode *getEmptyKey() {
+ return DenseMapInfo<PHINode *>::getEmptyKey();
+ }
+
+ static PHINode *getTombstoneKey() {
+ return DenseMapInfo<PHINode *>::getTombstoneKey();
+ }
+
static bool isSentinel(PHINode *PN) {
return PN == getEmptyKey() || PN == getTombstoneKey();
}
@@ -1197,14 +1197,14 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
// WARNING: this logic must be kept in sync with
// Instruction::isIdenticalToWhenDefined()!
static unsigned getHashValueImpl(PHINode *PN) {
- // Compute a hash value on the operands. Instcombine will likely have
- // sorted them, which helps expose duplicates, but we have to check all
- // the operands to be safe in case instcombine hasn't run.
- return static_cast<unsigned>(hash_combine(
- hash_combine_range(PN->value_op_begin(), PN->value_op_end()),
- hash_combine_range(PN->block_begin(), PN->block_end())));
- }
-
+ // Compute a hash value on the operands. Instcombine will likely have
+ // sorted them, which helps expose duplicates, but we have to check all
+ // the operands to be safe in case instcombine hasn't run.
+ return static_cast<unsigned>(hash_combine(
+ hash_combine_range(PN->value_op_begin(), PN->value_op_end()),
+ hash_combine_range(PN->block_begin(), PN->block_end())));
+ }
+
static unsigned getHashValue(PHINode *PN) {
#ifndef NDEBUG
// If -phicse-debug-hash was specified, return a constant -- this
@@ -1219,9 +1219,9 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
static bool isEqualImpl(PHINode *LHS, PHINode *RHS) {
if (isSentinel(LHS) || isSentinel(RHS))
- return LHS == RHS;
- return LHS->isIdenticalTo(RHS);
- }
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
static bool isEqual(PHINode *LHS, PHINode *RHS) {
// These comparisons are nontrivial, so assert that equality implies
@@ -1231,33 +1231,33 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
getHashValueImpl(LHS) == getHashValueImpl(RHS));
return Result;
}
- };
-
- // Set of unique PHINodes.
- DenseSet<PHINode *, PHIDenseMapInfo> PHISet;
+ };
+
+ // Set of unique PHINodes.
+ DenseSet<PHINode *, PHIDenseMapInfo> PHISet;
PHISet.reserve(4 * PHICSENumPHISmallSize);
-
- // Examine each PHI.
- bool Changed = false;
- for (auto I = BB->begin(); PHINode *PN = dyn_cast<PHINode>(I++);) {
- auto Inserted = PHISet.insert(PN);
- if (!Inserted.second) {
- // A duplicate. Replace this PHI with its duplicate.
+
+ // Examine each PHI.
+ bool Changed = false;
+ for (auto I = BB->begin(); PHINode *PN = dyn_cast<PHINode>(I++);) {
+ auto Inserted = PHISet.insert(PN);
+ if (!Inserted.second) {
+ // A duplicate. Replace this PHI with its duplicate.
++NumPHICSEs;
- PN->replaceAllUsesWith(*Inserted.first);
- PN->eraseFromParent();
- Changed = true;
-
- // The RAUW can change PHIs that we already visited. Start over from the
- // beginning.
- PHISet.clear();
- I = BB->begin();
- }
- }
-
- return Changed;
-}
-
+ PN->replaceAllUsesWith(*Inserted.first);
+ PN->eraseFromParent();
+ Changed = true;
+
+ // The RAUW can change PHIs that we already visited. Start over from the
+ // beginning.
+ PHISet.clear();
+ I = BB->begin();
+ }
+ }
+
+ return Changed;
+}
+
bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
if (
#ifndef NDEBUG
@@ -1267,7 +1267,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
return EliminateDuplicatePHINodesNaiveImpl(BB);
return EliminateDuplicatePHINodesSetBasedImpl(BB);
}
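+// Illustrative usage sketch (not part of the original source): run PHI CSE
+// over every block in a function and record whether anything changed.
+//
+//   bool Changed = false;
+//   for (BasicBlock &B : F)
+//     Changed |= EliminateDuplicatePHINodes(&B);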
-
+
/// If the specified pointer points to an object that we control, try to modify
/// the object's alignment to PrefAlign. Returns a minimum known alignment of
/// the value after the operation, which may be lower than PrefAlign.
@@ -1277,9 +1277,9 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
/// and allocation instructions to their preferred alignment from the beginning.
static Align tryEnforceAlignment(Value *V, Align PrefAlign,
const DataLayout &DL) {
- V = V->stripPointerCasts();
-
- if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ V = V->stripPointerCasts();
+
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
// TODO: Ideally, this function would not be called if PrefAlign is smaller
// than the current alignment, as the known bits calculation should have
// already taken it into account. However, this is not always the case,
@@ -1288,801 +1288,801 @@ static Align tryEnforceAlignment(Value *V, Align PrefAlign,
Align CurrentAlign = AI->getAlign();
if (PrefAlign <= CurrentAlign)
return CurrentAlign;
-
- // If the preferred alignment is greater than the natural stack alignment
- // then don't round up. This avoids dynamic stack realignment.
- if (DL.exceedsNaturalStackAlignment(PrefAlign))
+
+ // If the preferred alignment is greater than the natural stack alignment
+ // then don't round up. This avoids dynamic stack realignment.
+ if (DL.exceedsNaturalStackAlignment(PrefAlign))
return CurrentAlign;
- AI->setAlignment(PrefAlign);
- return PrefAlign;
- }
-
- if (auto *GO = dyn_cast<GlobalObject>(V)) {
- // TODO: as above, this shouldn't be necessary.
+ AI->setAlignment(PrefAlign);
+ return PrefAlign;
+ }
+
+ if (auto *GO = dyn_cast<GlobalObject>(V)) {
+ // TODO: as above, this shouldn't be necessary.
Align CurrentAlign = GO->getPointerAlignment(DL);
if (PrefAlign <= CurrentAlign)
return CurrentAlign;
-
- // If there is a large requested alignment and we can, bump up the alignment
- // of the global. If the memory we set aside for the global may not be the
- // memory used by the final program then it is impossible for us to reliably
- // enforce the preferred alignment.
- if (!GO->canIncreaseAlignment())
+
+ // If there is a large requested alignment and we can, bump up the alignment
+ // of the global. If the memory we set aside for the global may not be the
+ // memory used by the final program then it is impossible for us to reliably
+ // enforce the preferred alignment.
+ if (!GO->canIncreaseAlignment())
return CurrentAlign;
-
- GO->setAlignment(PrefAlign);
- return PrefAlign;
- }
-
+
+ GO->setAlignment(PrefAlign);
+ return PrefAlign;
+ }
+
return Align(1);
-}
-
-Align llvm::getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign,
- const DataLayout &DL,
- const Instruction *CxtI,
- AssumptionCache *AC,
- const DominatorTree *DT) {
- assert(V->getType()->isPointerTy() &&
- "getOrEnforceKnownAlignment expects a pointer!");
-
- KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT);
- unsigned TrailZ = Known.countMinTrailingZeros();
-
- // Avoid trouble with ridiculously large TrailZ values, such as
- // those computed from a null pointer.
- // LLVM doesn't support alignments larger than (1 << MaxAlignmentExponent).
- TrailZ = std::min(TrailZ, +Value::MaxAlignmentExponent);
-
- Align Alignment = Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
-
- if (PrefAlign && *PrefAlign > Alignment)
+}
+
+Align llvm::getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign,
+ const DataLayout &DL,
+ const Instruction *CxtI,
+ AssumptionCache *AC,
+ const DominatorTree *DT) {
+ assert(V->getType()->isPointerTy() &&
+ "getOrEnforceKnownAlignment expects a pointer!");
+
+ KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT);
+ unsigned TrailZ = Known.countMinTrailingZeros();
+
+ // Avoid trouble with ridiculously large TrailZ values, such as
+ // those computed from a null pointer.
+ // LLVM doesn't support alignments larger than (1 << MaxAlignmentExponent).
+ TrailZ = std::min(TrailZ, +Value::MaxAlignmentExponent);
+
+ Align Alignment = Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+
+ if (PrefAlign && *PrefAlign > Alignment)
Alignment = std::max(Alignment, tryEnforceAlignment(V, *PrefAlign, DL));
-
- // We don't need to make any adjustment.
- return Alignment;
-}
-
-///===---------------------------------------------------------------------===//
-/// Dbg Intrinsic utilities
-///
-
-/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
-static bool PhiHasDebugValue(DILocalVariable *DIVar,
- DIExpression *DIExpr,
- PHINode *APN) {
- // Since we can't guarantee that the original dbg.declare intrinsic
- // is removed by LowerDbgDeclare(), we need to make sure that we are
- // not inserting the same dbg.value intrinsic over and over.
- SmallVector<DbgValueInst *, 1> DbgValues;
- findDbgValues(DbgValues, APN);
- for (auto *DVI : DbgValues) {
- assert(DVI->getValue() == APN);
- if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
- return true;
- }
- return false;
-}
-
-/// Check if the alloc size of \p ValTy is large enough to cover the variable
-/// (or fragment of the variable) described by \p DII.
-///
-/// This is primarily intended as a helper for the different
-/// ConvertDebugDeclareToDebugValue functions. The dbg.declare/dbg.addr that is
-/// converted describes an alloca'd variable, so we need to use the
-/// alloc size of the value when doing the comparison. E.g. an i1 value will be
-/// identified as covering an n-bit fragment, if the store size of i1 is at
-/// least n bits.
-static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) {
- const DataLayout &DL = DII->getModule()->getDataLayout();
+
+ // We don't need to make any adjustment.
+ return Alignment;
+}
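+// Illustrative usage sketch (not part of the original source): query, and if
+// profitable raise, the alignment of a load's pointer operand to 16 bytes.
+//
+//   Align Known = getOrEnforceKnownAlignment(LI->getPointerOperand(),
+//                                            MaybeAlign(16), DL, LI,
+//                                            /*AC=*/nullptr, /*DT=*/nullptr);
+//   bool CanUseAlignedOps = Known >= Align(16);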
+
+///===---------------------------------------------------------------------===//
+/// Dbg Intrinsic utilities
+///
+
+/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
+static bool PhiHasDebugValue(DILocalVariable *DIVar,
+ DIExpression *DIExpr,
+ PHINode *APN) {
+ // Since we can't guarantee that the original dbg.declare intrinsic
+ // is removed by LowerDbgDeclare(), we need to make sure that we are
+ // not inserting the same dbg.value intrinsic over and over.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ findDbgValues(DbgValues, APN);
+ for (auto *DVI : DbgValues) {
+ assert(DVI->getValue() == APN);
+ if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
+ return true;
+ }
+ return false;
+}
+
+/// Check if the alloc size of \p ValTy is large enough to cover the variable
+/// (or fragment of the variable) described by \p DII.
+///
+/// This is primarily intended as a helper for the different
+/// ConvertDebugDeclareToDebugValue functions. The dbg.declare/dbg.addr that is
+/// converted describes an alloca'd variable, so we need to use the
+/// alloc size of the value when doing the comparison. E.g. an i1 value will be
+/// identified as covering an n-bit fragment, if the store size of i1 is at
+/// least n bits.
+static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) {
+ const DataLayout &DL = DII->getModule()->getDataLayout();
TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy);
if (Optional<uint64_t> FragmentSize = DII->getFragmentSizeInBits()) {
assert(!ValueSize.isScalable() &&
"Fragments don't work on scalable types.");
return ValueSize.getFixedSize() >= *FragmentSize;
}
- // We can't always calculate the size of the DI variable (e.g. if it is a
- // VLA). Try to use the size of the alloca that the dbg intrinsic describes
- // instead.
- if (DII->isAddressOfVariable())
- if (auto *AI = dyn_cast_or_null<AllocaInst>(DII->getVariableLocation()))
+ // We can't always calculate the size of the DI variable (e.g. if it is a
+ // VLA). Try to use the size of the alloca that the dbg intrinsic describes
+ // instead.
+ if (DII->isAddressOfVariable())
+ if (auto *AI = dyn_cast_or_null<AllocaInst>(DII->getVariableLocation()))
if (Optional<TypeSize> FragmentSize = AI->getAllocationSizeInBits(DL)) {
assert(ValueSize.isScalable() == FragmentSize->isScalable() &&
"Both sizes should agree on the scalable flag.");
return TypeSize::isKnownGE(ValueSize, *FragmentSize);
}
- // Could not determine size of variable. Conservatively return false.
- return false;
-}
-
-/// Produce a DebugLoc to use for each dbg.declare/inst pair that is promoted
-/// to a dbg.value. Because no machine insts can come from debug intrinsics,
-/// only the scope and inlinedAt are significant. Zero line numbers are used in
-/// case this DebugLoc leaks into any adjacent instructions.
-static DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII, Instruction *Src) {
- // Original dbg.declare must have a location.
- DebugLoc DeclareLoc = DII->getDebugLoc();
- MDNode *Scope = DeclareLoc.getScope();
- DILocation *InlinedAt = DeclareLoc.getInlinedAt();
- // Produce an unknown location with the correct scope / inlinedAt fields.
+ // Could not determine size of variable. Conservatively return false.
+ return false;
+}
+
+/// Produce a DebugLoc to use for each dbg.declare/inst pair that is promoted
+/// to a dbg.value. Because no machine insts can come from debug intrinsics,
+/// only the scope and inlinedAt are significant. Zero line numbers are used in
+/// case this DebugLoc leaks into any adjacent instructions.
+static DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII, Instruction *Src) {
+ // Original dbg.declare must have a location.
+ DebugLoc DeclareLoc = DII->getDebugLoc();
+ MDNode *Scope = DeclareLoc.getScope();
+ DILocation *InlinedAt = DeclareLoc.getInlinedAt();
+ // Produce an unknown location with the correct scope / inlinedAt fields.
return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt);
-}
-
-/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
-/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
- StoreInst *SI, DIBuilder &Builder) {
- assert(DII->isAddressOfVariable());
- auto *DIVar = DII->getVariable();
- assert(DIVar && "Missing variable");
- auto *DIExpr = DII->getExpression();
- Value *DV = SI->getValueOperand();
-
- DebugLoc NewLoc = getDebugValueLoc(DII, SI);
-
- if (!valueCoversEntireFragment(DV->getType(), DII)) {
- // FIXME: If storing to a part of the variable described by the dbg.declare,
- // then we want to insert a dbg.value for the corresponding fragment.
- LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
- << *DII << '\n');
- // For now, when there is a store to parts of the variable (but we do not
- // know which part) we insert a dbg.value intrinsic to indicate that we
- // know nothing about the variable's content.
- DV = UndefValue::get(DV->getType());
- Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
- return;
- }
-
- Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
-}
-
-/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
-/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
- LoadInst *LI, DIBuilder &Builder) {
- auto *DIVar = DII->getVariable();
- auto *DIExpr = DII->getExpression();
- assert(DIVar && "Missing variable");
-
- if (!valueCoversEntireFragment(LI->getType(), DII)) {
- // FIXME: If only referring to a part of the variable described by the
- // dbg.declare, then we want to insert a dbg.value for the corresponding
- // fragment.
- LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
- << *DII << '\n');
- return;
- }
-
- DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
-
- // We are now tracking the loaded value instead of the address. In the
- // future if multi-location support is added to the IR, it might be
- // preferable to keep tracking both the loaded value and the original
- // address in case the alloca can not be elided.
- Instruction *DbgValue = Builder.insertDbgValueIntrinsic(
- LI, DIVar, DIExpr, NewLoc, (Instruction *)nullptr);
- DbgValue->insertAfter(LI);
-}
-
-/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
-/// llvm.dbg.declare or llvm.dbg.addr intrinsic.
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
- PHINode *APN, DIBuilder &Builder) {
- auto *DIVar = DII->getVariable();
- auto *DIExpr = DII->getExpression();
- assert(DIVar && "Missing variable");
-
- if (PhiHasDebugValue(DIVar, DIExpr, APN))
- return;
-
- if (!valueCoversEntireFragment(APN->getType(), DII)) {
- // FIXME: If only referring to a part of the variable described by the
- // dbg.declare, then we want to insert a dbg.value for the corresponding
- // fragment.
- LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
- << *DII << '\n');
- return;
- }
-
- BasicBlock *BB = APN->getParent();
- auto InsertionPt = BB->getFirstInsertionPt();
-
- DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
-
- // The block may be a catchswitch block, which does not have a valid
- // insertion point.
- // FIXME: Insert dbg.value markers in the successors when appropriate.
- if (InsertionPt != BB->end())
- Builder.insertDbgValueIntrinsic(APN, DIVar, DIExpr, NewLoc, &*InsertionPt);
-}
-
-/// Determine whether this alloca is either a VLA or an array.
-static bool isArray(AllocaInst *AI) {
- return AI->isArrayAllocation() ||
- (AI->getAllocatedType() && AI->getAllocatedType()->isArrayTy());
-}
-
-/// Determine whether this alloca is a structure.
-static bool isStructure(AllocaInst *AI) {
- return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy();
-}
-
-/// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into an appropriate
-/// set of llvm.dbg.value intrinsics.
-bool llvm::LowerDbgDeclare(Function &F) {
- bool Changed = false;
- DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
- SmallVector<DbgDeclareInst *, 4> Dbgs;
- for (auto &FI : F)
- for (Instruction &BI : FI)
- if (auto DDI = dyn_cast<DbgDeclareInst>(&BI))
- Dbgs.push_back(DDI);
-
- if (Dbgs.empty())
- return Changed;
-
- for (auto &I : Dbgs) {
- DbgDeclareInst *DDI = I;
- AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
- // If this is an alloca for a scalar variable, insert a dbg.value
- // at each load and store to the alloca and erase the dbg.declare.
- // The dbg.values allow tracking a variable even if it is not
- // stored on the stack, while the dbg.declare can only describe
- // the stack slot (and at a lexical-scope granularity). Later
- // passes will attempt to elide the stack slot.
- if (!AI || isArray(AI) || isStructure(AI))
- continue;
-
- // A volatile load/store means that the alloca can't be elided anyway.
- if (llvm::any_of(AI->users(), [](User *U) -> bool {
- if (LoadInst *LI = dyn_cast<LoadInst>(U))
- return LI->isVolatile();
- if (StoreInst *SI = dyn_cast<StoreInst>(U))
- return SI->isVolatile();
- return false;
- }))
- continue;
-
- SmallVector<const Value *, 8> WorkList;
- WorkList.push_back(AI);
- while (!WorkList.empty()) {
- const Value *V = WorkList.pop_back_val();
- for (auto &AIUse : V->uses()) {
- User *U = AIUse.getUser();
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (AIUse.getOperandNo() == 1)
- ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
- } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
- } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
- // This is a call by-value or some other instruction that takes a
- // pointer to the variable. Insert a *value* intrinsic that describes
- // the variable by dereferencing the alloca.
- if (!CI->isLifetimeStartOrEnd()) {
- DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr);
- auto *DerefExpr =
- DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
- DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr,
- NewLoc, CI);
- }
- } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
- if (BI->getType()->isPointerTy())
- WorkList.push_back(BI);
- }
- }
- }
- DDI->eraseFromParent();
- Changed = true;
- }
-
- if (Changed)
- for (BasicBlock &BB : F)
- RemoveRedundantDbgInstrs(&BB);
-
- return Changed;
-}
-
-/// Propagate dbg.value intrinsics through the newly inserted PHIs.
-void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
- SmallVectorImpl<PHINode *> &InsertedPHIs) {
- assert(BB && "No BasicBlock to clone dbg.value(s) from.");
- if (InsertedPHIs.size() == 0)
- return;
-
- // Map existing PHI nodes to their dbg.values.
- ValueToValueMapTy DbgValueMap;
- for (auto &I : *BB) {
- if (auto DbgII = dyn_cast<DbgVariableIntrinsic>(&I)) {
- if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
- DbgValueMap.insert({Loc, DbgII});
- }
- }
- if (DbgValueMap.size() == 0)
- return;
-
- // Then iterate through the new PHIs and look to see if they use one of the
- // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
- // propagate the info through the new PHI.
- LLVMContext &C = BB->getContext();
- for (auto PHI : InsertedPHIs) {
- BasicBlock *Parent = PHI->getParent();
- // Avoid inserting an intrinsic into an EH block.
- if (Parent->getFirstNonPHI()->isEHPad())
- continue;
- auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
- for (auto VI : PHI->operand_values()) {
- auto V = DbgValueMap.find(VI);
- if (V != DbgValueMap.end()) {
- auto *DbgII = cast<DbgVariableIntrinsic>(V->second);
- Instruction *NewDbgII = DbgII->clone();
- NewDbgII->setOperand(0, PhiMAV);
- auto InsertionPt = Parent->getFirstInsertionPt();
- assert(InsertionPt != Parent->end() && "Ill-formed basic block");
- NewDbgII->insertBefore(&*InsertionPt);
- }
- }
- }
-}
-
-/// Finds all intrinsics declaring local variables as living in the memory that
-/// 'V' points to. This may include a mix of dbg.declare and
-/// dbg.addr intrinsics.
-TinyPtrVector<DbgVariableIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup.
- if (!V->isUsedByMetadata())
- return {};
- auto *L = LocalAsMetadata::getIfExists(V);
- if (!L)
- return {};
- auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L);
- if (!MDV)
- return {};
-
- TinyPtrVector<DbgVariableIntrinsic *> Declares;
- for (User *U : MDV->users()) {
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(U))
- if (DII->isAddressOfVariable())
- Declares.push_back(DII);
- }
-
- return Declares;
-}
-
-TinyPtrVector<DbgDeclareInst *> llvm::FindDbgDeclareUses(Value *V) {
- TinyPtrVector<DbgDeclareInst *> DDIs;
- for (DbgVariableIntrinsic *DVI : FindDbgAddrUses(V))
- if (auto *DDI = dyn_cast<DbgDeclareInst>(DVI))
- DDIs.push_back(DDI);
- return DDIs;
-}
-
-void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup.
- if (!V->isUsedByMetadata())
- return;
- if (auto *L = LocalAsMetadata::getIfExists(V))
- if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
- for (User *U : MDV->users())
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DbgValues.push_back(DVI);
-}
-
-void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
- Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup.
- if (!V->isUsedByMetadata())
- return;
- if (auto *L = LocalAsMetadata::getIfExists(V))
- if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
- for (User *U : MDV->users())
- if (DbgVariableIntrinsic *DII = dyn_cast<DbgVariableIntrinsic>(U))
- DbgUsers.push_back(DII);
-}
-
-bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
- DIBuilder &Builder, uint8_t DIExprFlags,
- int Offset) {
- auto DbgAddrs = FindDbgAddrUses(Address);
- for (DbgVariableIntrinsic *DII : DbgAddrs) {
- DebugLoc Loc = DII->getDebugLoc();
- auto *DIVar = DII->getVariable();
- auto *DIExpr = DII->getExpression();
- assert(DIVar && "Missing variable");
- DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset);
- // Insert llvm.dbg.declare immediately before DII, and remove old
- // llvm.dbg.declare.
- Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, DII);
- DII->eraseFromParent();
- }
- return !DbgAddrs.empty();
-}
-
-static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
- DIBuilder &Builder, int Offset) {
- DebugLoc Loc = DVI->getDebugLoc();
- auto *DIVar = DVI->getVariable();
- auto *DIExpr = DVI->getExpression();
- assert(DIVar && "Missing variable");
-
- // This is an alloca-based llvm.dbg.value. The first thing it should do with
- // the alloca pointer is dereference it. Otherwise we don't know how to handle
- // it and give up.
- if (!DIExpr || DIExpr->getNumElements() < 1 ||
- DIExpr->getElement(0) != dwarf::DW_OP_deref)
- return;
-
- // Insert the offset before the first deref.
- // We could just change the offset argument of dbg.value, but it's unsigned...
- if (Offset)
- DIExpr = DIExpression::prepend(DIExpr, 0, Offset);
-
- Builder.insertDbgValueIntrinsic(NewAddress, DIVar, DIExpr, Loc, DVI);
- DVI->eraseFromParent();
-}
-
-void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
- DIBuilder &Builder, int Offset) {
- if (auto *L = LocalAsMetadata::getIfExists(AI))
- if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
- for (auto UI = MDV->use_begin(), UE = MDV->use_end(); UI != UE;) {
- Use &U = *UI++;
- if (auto *DVI = dyn_cast<DbgValueInst>(U.getUser()))
- replaceOneDbgValueForAlloca(DVI, NewAllocaAddress, Builder, Offset);
- }
-}
-
-/// Wrap \p V in a ValueAsMetadata instance.
-static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) {
- return MetadataAsValue::get(C, ValueAsMetadata::get(V));
-}
-
-/// Salvage debug information for \p I where possible; where that is not
-/// possible, mark the debug uses of \p I as undef.
-void llvm::salvageDebugInfo(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, &I);
- salvageDebugInfoForDbgValues(I, DbgUsers);
-}
-
-void llvm::salvageDebugInfoForDbgValues(
- Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) {
- auto &Ctx = I.getContext();
- bool Salvaged = false;
- auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); };
-
- for (auto *DII : DbgUsers) {
- // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
- // are implicitly pointing out the value as a DWARF memory location
- // description.
- bool StackValue = isa<DbgValueInst>(DII);
-
- DIExpression *DIExpr =
- salvageDebugInfoImpl(I, DII->getExpression(), StackValue);
-
- // salvageDebugInfoImpl should fail either on the first element of DbgUsers
- // or on none of them.
- if (!DIExpr)
- break;
-
- DII->setOperand(0, wrapMD(I.getOperand(0)));
- DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
- LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
- Salvaged = true;
- }
-
- if (Salvaged)
- return;
-
- for (auto *DII : DbgUsers) {
- Value *Undef = UndefValue::get(I.getType());
- DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
- ValueAsMetadata::get(Undef)));
- }
-}
-
-DIExpression *llvm::salvageDebugInfoImpl(Instruction &I,
- DIExpression *SrcDIExpr,
- bool WithStackValue) {
- auto &M = *I.getModule();
- auto &DL = M.getDataLayout();
-
- // Apply a vector of opcodes to the source DIExpression.
- auto doSalvage = [&](SmallVectorImpl<uint64_t> &Ops) -> DIExpression * {
- DIExpression *DIExpr = SrcDIExpr;
- if (!Ops.empty()) {
- DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
- }
- return DIExpr;
- };
-
- // Apply the given offset to the source DIExpression.
- auto applyOffset = [&](uint64_t Offset) -> DIExpression * {
- SmallVector<uint64_t, 8> Ops;
- DIExpression::appendOffset(Ops, Offset);
- return doSalvage(Ops);
- };
-
- // initializer-list helper for applying operators to the source DIExpression.
- auto applyOps = [&](ArrayRef<uint64_t> Opcodes) -> DIExpression * {
- SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end());
- return doSalvage(Ops);
- };
-
- if (auto *CI = dyn_cast<CastInst>(&I)) {
- // No-op casts are irrelevant for debug info.
- if (CI->isNoopCast(DL))
- return SrcDIExpr;
-
- Type *Type = CI->getType();
- // Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged.
- if (Type->isVectorTy() ||
- !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I)))
- return nullptr;
-
- Value *FromValue = CI->getOperand(0);
- unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits();
- unsigned ToTypeBitSize = Type->getScalarSizeInBits();
-
- return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize,
- isa<SExtInst>(&I)));
- }
-
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- unsigned BitWidth =
- M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace());
- // Rewrite a constant GEP into a DIExpression.
- APInt Offset(BitWidth, 0);
- if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
- return applyOffset(Offset.getSExtValue());
- } else {
- return nullptr;
- }
- } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
- // Rewrite binary operations with constant integer operands.
- auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1));
- if (!ConstInt || ConstInt->getBitWidth() > 64)
- return nullptr;
-
- uint64_t Val = ConstInt->getSExtValue();
- switch (BI->getOpcode()) {
- case Instruction::Add:
- return applyOffset(Val);
- case Instruction::Sub:
- return applyOffset(-int64_t(Val));
- case Instruction::Mul:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mul});
- case Instruction::SDiv:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_div});
- case Instruction::SRem:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mod});
- case Instruction::Or:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_or});
- case Instruction::And:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_and});
- case Instruction::Xor:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_xor});
- case Instruction::Shl:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shl});
- case Instruction::LShr:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shr});
- case Instruction::AShr:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shra});
- default:
- // TODO: Salvage constants from each kind of binop we know about.
- return nullptr;
- }
- // *Not* to do: we should not attempt to salvage load instructions,
- // because the validity and lifetime of a dbg.value containing
- // DW_OP_deref becomes difficult to analyze. See PR40628 for examples.
- }
- return nullptr;
-}
-
-/// A replacement for a dbg.value expression.
-using DbgValReplacement = Optional<DIExpression *>;
-
-/// Point debug users of \p From to \p To using exprs given by \p RewriteExpr,
-/// possibly moving/undefing users to prevent use-before-def. Returns true if
-/// changes are made.
-static bool rewriteDebugUsers(
- Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
- function_ref<DbgValReplacement(DbgVariableIntrinsic &DII)> RewriteExpr) {
- // Find debug users of From.
- SmallVector<DbgVariableIntrinsic *, 1> Users;
- findDbgUsers(Users, &From);
- if (Users.empty())
- return false;
-
- // Prevent use-before-def of To.
- bool Changed = false;
- SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage;
- if (isa<Instruction>(&To)) {
- bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
-
- for (auto *DII : Users) {
- // It's common to see a debug user between From and DomPoint. Move it
- // after DomPoint to preserve the variable update without any reordering.
- if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) {
- LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n');
- DII->moveAfter(&DomPoint);
- Changed = true;
-
- // Users which otherwise aren't dominated by the replacement value must
- // be salvaged or deleted.
- } else if (!DT.dominates(&DomPoint, DII)) {
- UndefOrSalvage.insert(DII);
- }
- }
- }
-
- // Update debug users without use-before-def risk.
- for (auto *DII : Users) {
- if (UndefOrSalvage.count(DII))
- continue;
-
- LLVMContext &Ctx = DII->getContext();
- DbgValReplacement DVR = RewriteExpr(*DII);
- if (!DVR)
- continue;
-
- DII->setOperand(0, wrapValueInMetadata(Ctx, &To));
- DII->setOperand(2, MetadataAsValue::get(Ctx, *DVR));
- LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n');
- Changed = true;
- }
-
- if (!UndefOrSalvage.empty()) {
- // Try to salvage the remaining debug users.
- salvageDebugInfo(From);
- Changed = true;
- }
-
- return Changed;
-}
-
-/// Check if a bitcast from a value of type \p FromTy to type \p ToTy would
-/// losslessly preserve the bits and semantics of the value. This predicate is
-/// symmetric, i.e. swapping \p FromTy and \p ToTy should give the same result.
-///
-/// Note that Type::canLosslesslyBitCastTo is not suitable here because it
-/// allows semantically inequivalent bitcasts, such as <2 x i64> -> <4 x i32>,
-/// and also does not allow lossless pointer <-> integer conversions.
-static bool isBitCastSemanticsPreserving(const DataLayout &DL, Type *FromTy,
- Type *ToTy) {
- // Trivially compatible types.
- if (FromTy == ToTy)
- return true;
-
- // Handle compatible pointer <-> integer conversions.
- if (FromTy->isIntOrPtrTy() && ToTy->isIntOrPtrTy()) {
- bool SameSize = DL.getTypeSizeInBits(FromTy) == DL.getTypeSizeInBits(ToTy);
- bool LosslessConversion = !DL.isNonIntegralPointerType(FromTy) &&
- !DL.isNonIntegralPointerType(ToTy);
- return SameSize && LosslessConversion;
- }
-
- // TODO: This is not exhaustive.
- return false;
-}
-
-bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
- Instruction &DomPoint, DominatorTree &DT) {
- // Exit early if From has no debug users.
- if (!From.isUsedByMetadata())
- return false;
-
- assert(&From != &To && "Can't replace something with itself");
-
- Type *FromTy = From.getType();
- Type *ToTy = To.getType();
-
- auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
- return DII.getExpression();
- };
-
- // Handle no-op conversions.
- Module &M = *From.getModule();
- const DataLayout &DL = M.getDataLayout();
- if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
- return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
-
- // Handle integer-to-integer widening and narrowing.
- // FIXME: Use DW_OP_convert when it's available everywhere.
- if (FromTy->isIntegerTy() && ToTy->isIntegerTy()) {
- uint64_t FromBits = FromTy->getPrimitiveSizeInBits();
- uint64_t ToBits = ToTy->getPrimitiveSizeInBits();
- assert(FromBits != ToBits && "Unexpected no-op conversion");
-
- // When the width of the result grows, assume that a debugger will only
- // access the low `FromBits` bits when inspecting the source variable.
- if (FromBits < ToBits)
- return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
-
- // The width of the result has shrunk. Use sign/zero extension to describe
- // the source variable's high bits.
- auto SignOrZeroExt = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
- DILocalVariable *Var = DII.getVariable();
-
- // Without knowing signedness, sign/zero extension isn't possible.
- auto Signedness = Var->getSignedness();
- if (!Signedness)
- return None;
-
- bool Signed = *Signedness == DIBasicType::Signedness::Signed;
- return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits,
- Signed);
- };
- return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt);
- }
-
- // TODO: Floating-point conversions, vectors.
- return false;
-}
-
+}
+
+/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
+/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
+ StoreInst *SI, DIBuilder &Builder) {
+ assert(DII->isAddressOfVariable());
+ auto *DIVar = DII->getVariable();
+ assert(DIVar && "Missing variable");
+ auto *DIExpr = DII->getExpression();
+ Value *DV = SI->getValueOperand();
+
+ DebugLoc NewLoc = getDebugValueLoc(DII, SI);
+
+ if (!valueCoversEntireFragment(DV->getType(), DII)) {
+ // FIXME: If storing to a part of the variable described by the dbg.declare,
+ // then we want to insert a dbg.value for the corresponding fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ // For now, when there is a store to parts of the variable (but we do not
+ // know which part) we insert a dbg.value intrinsic to indicate that we
+ // know nothing about the variable's content.
+ DV = UndefValue::get(DV->getType());
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
+ return;
+ }
+
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
+}
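+// Illustrative usage sketch (not part of the original source): before removing
+// a store SI to an alloca AI described by dbg.declare/dbg.addr, emit the
+// corresponding dbg.value so the variable remains observable. DIB is assumed
+// to be an existing DIBuilder for the module.
+//
+//   for (DbgVariableIntrinsic *DII : FindDbgAddrUses(AI))
+//     ConvertDebugDeclareToDebugValue(DII, SI, DIB);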
+
+/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
+/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
+ LoadInst *LI, DIBuilder &Builder) {
+ auto *DIVar = DII->getVariable();
+ auto *DIExpr = DII->getExpression();
+ assert(DIVar && "Missing variable");
+
+ if (!valueCoversEntireFragment(LI->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
+ DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
+
+ // We are now tracking the loaded value instead of the address. In the
+ // future if multi-location support is added to the IR, it might be
+ // preferable to keep tracking both the loaded value and the original
+ // address in case the alloca can not be elided.
+ Instruction *DbgValue = Builder.insertDbgValueIntrinsic(
+ LI, DIVar, DIExpr, NewLoc, (Instruction *)nullptr);
+ DbgValue->insertAfter(LI);
+}
+
+/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
+/// llvm.dbg.declare or llvm.dbg.addr intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
+ PHINode *APN, DIBuilder &Builder) {
+ auto *DIVar = DII->getVariable();
+ auto *DIExpr = DII->getExpression();
+ assert(DIVar && "Missing variable");
+
+ if (PhiHasDebugValue(DIVar, DIExpr, APN))
+ return;
+
+ if (!valueCoversEntireFragment(APN->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
+ BasicBlock *BB = APN->getParent();
+ auto InsertionPt = BB->getFirstInsertionPt();
+
+ DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
+
+ // The block may be a catchswitch block, which does not have a valid
+ // insertion point.
+ // FIXME: Insert dbg.value markers in the successors when appropriate.
+ if (InsertionPt != BB->end())
+ Builder.insertDbgValueIntrinsic(APN, DIVar, DIExpr, NewLoc, &*InsertionPt);
+}
+
+/// Determine whether this alloca is either a VLA or an array.
+static bool isArray(AllocaInst *AI) {
+ return AI->isArrayAllocation() ||
+ (AI->getAllocatedType() && AI->getAllocatedType()->isArrayTy());
+}
+
+/// Determine whether this alloca is a structure.
+static bool isStructure(AllocaInst *AI) {
+ return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy();
+}
+
+/// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into an appropriate
+/// set of llvm.dbg.value intrinsics.
+bool llvm::LowerDbgDeclare(Function &F) {
+ bool Changed = false;
+ DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
+ SmallVector<DbgDeclareInst *, 4> Dbgs;
+ for (auto &FI : F)
+ for (Instruction &BI : FI)
+ if (auto DDI = dyn_cast<DbgDeclareInst>(&BI))
+ Dbgs.push_back(DDI);
+
+ if (Dbgs.empty())
+ return Changed;
+
+ for (auto &I : Dbgs) {
+ DbgDeclareInst *DDI = I;
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ // If this is an alloca for a scalar variable, insert a dbg.value
+ // at each load and store to the alloca and erase the dbg.declare.
+ // The dbg.values allow tracking a variable even if it is not
+ // stored on the stack, while the dbg.declare can only describe
+ // the stack slot (and at a lexical-scope granularity). Later
+ // passes will attempt to elide the stack slot.
+ if (!AI || isArray(AI) || isStructure(AI))
+ continue;
+
+ // A volatile load/store means that the alloca can't be elided anyway.
+ if (llvm::any_of(AI->users(), [](User *U) -> bool {
+ if (LoadInst *LI = dyn_cast<LoadInst>(U))
+ return LI->isVolatile();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U))
+ return SI->isVolatile();
+ return false;
+ }))
+ continue;
+
+ SmallVector<const Value *, 8> WorkList;
+ WorkList.push_back(AI);
+ while (!WorkList.empty()) {
+ const Value *V = WorkList.pop_back_val();
+ for (auto &AIUse : V->uses()) {
+ User *U = AIUse.getUser();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (AIUse.getOperandNo() == 1)
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+ } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
+ // This is a call by-value or some other instruction that takes a
+ // pointer to the variable. Insert a *value* intrinsic that describes
+ // the variable by dereferencing the alloca.
+ if (!CI->isLifetimeStartOrEnd()) {
+ DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr);
+ auto *DerefExpr =
+ DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
+ DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr,
+ NewLoc, CI);
+ }
+ } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
+ if (BI->getType()->isPointerTy())
+ WorkList.push_back(BI);
+ }
+ }
+ }
+ DDI->eraseFromParent();
+ Changed = true;
+ }
+
+ if (Changed)
+ for (BasicBlock &BB : F)
+ RemoveRedundantDbgInstrs(&BB);
+
+ return Changed;
+}
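+// Illustrative usage sketch (not part of the original source): lower every
+// dbg.declare in a module before running passes that are known to rewrite
+// allocas.
+//
+//   bool AnyLowered = false;
+//   for (Function &Fn : M)
+//     if (!Fn.isDeclaration())
+//       AnyLowered |= LowerDbgDeclare(Fn);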
+
+/// Propagate dbg.value intrinsics through the newly inserted PHIs.
+void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
+ SmallVectorImpl<PHINode *> &InsertedPHIs) {
+ assert(BB && "No BasicBlock to clone dbg.value(s) from.");
+ if (InsertedPHIs.size() == 0)
+ return;
+
+ // Map existing PHI nodes to their dbg.values.
+ ValueToValueMapTy DbgValueMap;
+ for (auto &I : *BB) {
+ if (auto DbgII = dyn_cast<DbgVariableIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+ if (DbgValueMap.size() == 0)
+ return;
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = BB->getContext();
+ for (auto PHI : InsertedPHIs) {
+ BasicBlock *Parent = PHI->getParent();
+ // Avoid inserting an intrinsic into an EH block.
+ if (Parent->getFirstNonPHI()->isEHPad())
+ continue;
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgVariableIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ NewDbgII->setOperand(0, PhiMAV);
+ auto InsertionPt = Parent->getFirstInsertionPt();
+ assert(InsertionPt != Parent->end() && "Ill-formed basic block");
+ NewDbgII->insertBefore(&*InsertionPt);
+ }
+ }
+ }
+}
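+
+// Sketch (names are illustrative): if an existing PHI %old is described by
+//
+//   call void @llvm.dbg.value(metadata i32 %old, metadata !v, metadata !DIExpression())
+//
+// and a newly inserted PHI %new takes %old as an incoming value, a clone of
+// that dbg.value pointing at %new is inserted at the first insertion point of
+// %new's block, so the variable stays described along the new path.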
+
+/// Finds all intrinsics declaring local variables as living in the memory that
+/// 'V' points to. This may include a mix of dbg.declare and
+/// dbg.addr intrinsics.
+TinyPtrVector<DbgVariableIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return {};
+ auto *L = LocalAsMetadata::getIfExists(V);
+ if (!L)
+ return {};
+ auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L);
+ if (!MDV)
+ return {};
+
+ TinyPtrVector<DbgVariableIntrinsic *> Declares;
+ for (User *U : MDV->users()) {
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(U))
+ if (DII->isAddressOfVariable())
+ Declares.push_back(DII);
+ }
+
+ return Declares;
+}
+
+TinyPtrVector<DbgDeclareInst *> llvm::FindDbgDeclareUses(Value *V) {
+ TinyPtrVector<DbgDeclareInst *> DDIs;
+ for (DbgVariableIntrinsic *DVI : FindDbgAddrUses(V))
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(DVI))
+ DDIs.push_back(DDI);
+ return DDIs;
+}
+
+void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
+ if (auto *L = LocalAsMetadata::getIfExists(V))
+ if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DbgValues.push_back(DVI);
+}
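+
+// A minimal usage sketch, assuming a value V is about to be rewritten to NewV
+// and Ctx is its LLVMContext (all three names are placeholders):
+//
+//   SmallVector<DbgValueInst *, 4> DbgValues;
+//   findDbgValues(DbgValues, V);
+//   for (DbgValueInst *DVI : DbgValues)
+//     DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(NewV)));
+//
+// findDbgUsers works the same way but also collects dbg.declare/dbg.addr users.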
+
+void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
+ Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
+ if (auto *L = LocalAsMetadata::getIfExists(V))
+ if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgVariableIntrinsic *DII = dyn_cast<DbgVariableIntrinsic>(U))
+ DbgUsers.push_back(DII);
+}
+
+bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
+ DIBuilder &Builder, uint8_t DIExprFlags,
+ int Offset) {
+ auto DbgAddrs = FindDbgAddrUses(Address);
+ for (DbgVariableIntrinsic *DII : DbgAddrs) {
+ DebugLoc Loc = DII->getDebugLoc();
+ auto *DIVar = DII->getVariable();
+ auto *DIExpr = DII->getExpression();
+ assert(DIVar && "Missing variable");
+ DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset);
+ // Insert llvm.dbg.declare immediately before DII, and remove old
+ // llvm.dbg.declare.
+ Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, DII);
+ DII->eraseFromParent();
+ }
+ return !DbgAddrs.empty();
+}
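+
+// Minimal usage sketch (OldAddress, NewAddress and M are placeholders): after
+// rewriting a variable's storage, repoint its dbg.declare/dbg.addr users at
+// the new address without adding an extra offset:
+//
+//   DIBuilder DIB(M, /*AllowUnresolved=*/false);
+//   replaceDbgDeclare(OldAddress, NewAddress, DIB, DIExpression::ApplyOffset, 0);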
+
+static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
+ DIBuilder &Builder, int Offset) {
+ DebugLoc Loc = DVI->getDebugLoc();
+ auto *DIVar = DVI->getVariable();
+ auto *DIExpr = DVI->getExpression();
+ assert(DIVar && "Missing variable");
+
+ // This is an alloca-based llvm.dbg.value. The first thing it should do with
+ // the alloca pointer is dereference it. Otherwise we don't know how to handle
+ // it and give up.
+ if (!DIExpr || DIExpr->getNumElements() < 1 ||
+ DIExpr->getElement(0) != dwarf::DW_OP_deref)
+ return;
+
+ // Insert the offset before the first deref.
+ // We could just change the offset argument of dbg.value, but it's unsigned...
+ if (Offset)
+ DIExpr = DIExpression::prepend(DIExpr, 0, Offset);
+
+ Builder.insertDbgValueIntrinsic(NewAddress, DIVar, DIExpr, Loc, DVI);
+ DVI->eraseFromParent();
+}
+
+void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+ DIBuilder &Builder, int Offset) {
+ if (auto *L = LocalAsMetadata::getIfExists(AI))
+ if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+ for (auto UI = MDV->use_begin(), UE = MDV->use_end(); UI != UE;) {
+ Use &U = *UI++;
+ if (auto *DVI = dyn_cast<DbgValueInst>(U.getUser()))
+ replaceOneDbgValueForAlloca(DVI, NewAllocaAddress, Builder, Offset);
+ }
+}
+
+/// Wrap \p V in a MetadataAsValue (via ValueAsMetadata) instance.
+static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) {
+ return MetadataAsValue::get(C, ValueAsMetadata::get(V));
+}
+
+/// Salvage debug information for the debug users of \p I where possible;
+/// debug users that cannot be salvaged are set to undef.
+void llvm::salvageDebugInfo(Instruction &I) {
+ SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ salvageDebugInfoForDbgValues(I, DbgUsers);
+}
+
+void llvm::salvageDebugInfoForDbgValues(
+ Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) {
+ auto &Ctx = I.getContext();
+ bool Salvaged = false;
+ auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); };
+
+ for (auto *DII : DbgUsers) {
+ // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
+ // already refer to the value via a DWARF memory location
+ // description.
+ bool StackValue = isa<DbgValueInst>(DII);
+
+ DIExpression *DIExpr =
+ salvageDebugInfoImpl(I, DII->getExpression(), StackValue);
+
+ // salvageDebugInfoImpl should either fail when examining the first
+ // element of DbgUsers, or it should succeed for all of them.
+ if (!DIExpr)
+ break;
+
+ DII->setOperand(0, wrapMD(I.getOperand(0)));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
+ Salvaged = true;
+ }
+
+ if (Salvaged)
+ return;
+
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(I.getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
+}
+
+DIExpression *llvm::salvageDebugInfoImpl(Instruction &I,
+ DIExpression *SrcDIExpr,
+ bool WithStackValue) {
+ auto &M = *I.getModule();
+ auto &DL = M.getDataLayout();
+
+ // Apply a vector of opcodes to the source DIExpression.
+ auto doSalvage = [&](SmallVectorImpl<uint64_t> &Ops) -> DIExpression * {
+ DIExpression *DIExpr = SrcDIExpr;
+ if (!Ops.empty()) {
+ DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
+ }
+ return DIExpr;
+ };
+
+ // Apply the given offset to the source DIExpression.
+ auto applyOffset = [&](uint64_t Offset) -> DIExpression * {
+ SmallVector<uint64_t, 8> Ops;
+ DIExpression::appendOffset(Ops, Offset);
+ return doSalvage(Ops);
+ };
+
+ // initializer-list helper for applying operators to the source DIExpression.
+ auto applyOps = [&](ArrayRef<uint64_t> Opcodes) -> DIExpression * {
+ SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end());
+ return doSalvage(Ops);
+ };
+
+ if (auto *CI = dyn_cast<CastInst>(&I)) {
+ // No-op casts are irrelevant for debug info.
+ if (CI->isNoopCast(DL))
+ return SrcDIExpr;
+
+ Type *Type = CI->getType();
+ // Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged.
+ if (Type->isVectorTy() ||
+ !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I)))
+ return nullptr;
+
+ Value *FromValue = CI->getOperand(0);
+ unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits();
+ unsigned ToTypeBitSize = Type->getScalarSizeInBits();
+
+ return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize,
+ isa<SExtInst>(&I)));
+ }
+
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ unsigned BitWidth =
+ M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace());
+ // Rewrite a constant GEP into a DIExpression.
+ APInt Offset(BitWidth, 0);
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
+ return applyOffset(Offset.getSExtValue());
+ } else {
+ return nullptr;
+ }
+ } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
+ // Rewrite binary operations with constant integer operands.
+ auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!ConstInt || ConstInt->getBitWidth() > 64)
+ return nullptr;
+
+ uint64_t Val = ConstInt->getSExtValue();
+ switch (BI->getOpcode()) {
+ case Instruction::Add:
+ return applyOffset(Val);
+ case Instruction::Sub:
+ return applyOffset(-int64_t(Val));
+ case Instruction::Mul:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mul});
+ case Instruction::SDiv:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_div});
+ case Instruction::SRem:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mod});
+ case Instruction::Or:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_or});
+ case Instruction::And:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_and});
+ case Instruction::Xor:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_xor});
+ case Instruction::Shl:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shl});
+ case Instruction::LShr:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shr});
+ case Instruction::AShr:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shra});
+ default:
+ // TODO: Salvage constants from each kind of binop we know about.
+ return nullptr;
+ }
+ // *Not* to do: we should not attempt to salvage load instructions,
+ // because the validity and lifetime of a dbg.value containing
+ // DW_OP_deref becomes difficult to analyze. See PR40628 for examples.
+ }
+ return nullptr;
+}
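+
+// Illustration of the GEP case, with hypothetical values %base and !v. Given
+//
+//   %p = getelementptr inbounds i8, i8* %base, i64 16
+//   call void @llvm.dbg.value(metadata i8* %p, metadata !v, metadata !DIExpression())
+//
+// deleting %p can be compensated for by describing !v in terms of %base with
+// the constant offset folded into the expression, roughly
+//
+//   call void @llvm.dbg.value(metadata i8* %base, metadata !v,
+//                             metadata !DIExpression(DW_OP_plus_uconst, 16, DW_OP_stack_value))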
+
+/// A replacement for a dbg.value expression.
+using DbgValReplacement = Optional<DIExpression *>;
+
+/// Point debug users of \p From to \p To using exprs given by \p RewriteExpr,
+/// possibly moving/undefing users to prevent use-before-def. Returns true if
+/// changes are made.
+static bool rewriteDebugUsers(
+ Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
+ function_ref<DbgValReplacement(DbgVariableIntrinsic &DII)> RewriteExpr) {
+ // Find debug users of From.
+ SmallVector<DbgVariableIntrinsic *, 1> Users;
+ findDbgUsers(Users, &From);
+ if (Users.empty())
+ return false;
+
+ // Prevent use-before-def of To.
+ bool Changed = false;
+ SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage;
+ if (isa<Instruction>(&To)) {
+ bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
+
+ for (auto *DII : Users) {
+ // It's common to see a debug user between From and DomPoint. Move it
+ // after DomPoint to preserve the variable update without any reordering.
+ if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) {
+ LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n');
+ DII->moveAfter(&DomPoint);
+ Changed = true;
+
+ // Users which otherwise aren't dominated by the replacement value must
+ // be salvaged or deleted.
+ } else if (!DT.dominates(&DomPoint, DII)) {
+ UndefOrSalvage.insert(DII);
+ }
+ }
+ }
+
+ // Update debug users without use-before-def risk.
+ for (auto *DII : Users) {
+ if (UndefOrSalvage.count(DII))
+ continue;
+
+ LLVMContext &Ctx = DII->getContext();
+ DbgValReplacement DVR = RewriteExpr(*DII);
+ if (!DVR)
+ continue;
+
+ DII->setOperand(0, wrapValueInMetadata(Ctx, &To));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, *DVR));
+ LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n');
+ Changed = true;
+ }
+
+ if (!UndefOrSalvage.empty()) {
+ // Try to salvage the remaining debug users.
+ salvageDebugInfo(From);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// Check if a bitcast from a value of type \p FromTy to type \p ToTy would
+/// losslessly preserve the bits and semantics of the value. This predicate is
+/// symmetric, i.e. swapping \p FromTy and \p ToTy should give the same result.
+///
+/// Note that Type::canLosslesslyBitCastTo is not suitable here because it
+/// allows semantically inequivalent bitcasts, such as <2 x i64> -> <4 x i32>,
+/// and also does not allow lossless pointer <-> integer conversions.
+static bool isBitCastSemanticsPreserving(const DataLayout &DL, Type *FromTy,
+ Type *ToTy) {
+ // Trivially compatible types.
+ if (FromTy == ToTy)
+ return true;
+
+ // Handle compatible pointer <-> integer conversions.
+ if (FromTy->isIntOrPtrTy() && ToTy->isIntOrPtrTy()) {
+ bool SameSize = DL.getTypeSizeInBits(FromTy) == DL.getTypeSizeInBits(ToTy);
+ bool LosslessConversion = !DL.isNonIntegralPointerType(FromTy) &&
+ !DL.isNonIntegralPointerType(ToTy);
+ return SameSize && LosslessConversion;
+ }
+
+ // TODO: This is not exhaustive.
+ return false;
+}
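+
+// For illustration, on a typical 64-bit DataLayout with integral pointers:
+//   i64 <-> i8*            preserved (same size, lossless ptr/int conversion)
+//   i32 <-> i64            not preserved (different sizes)
+//   <2 x i64> -> <4 x i32> not handled (vector reinterpretation)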
+
+bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
+ Instruction &DomPoint, DominatorTree &DT) {
+ // Exit early if From has no debug users.
+ if (!From.isUsedByMetadata())
+ return false;
+
+ assert(&From != &To && "Can't replace something with itself");
+
+ Type *FromTy = From.getType();
+ Type *ToTy = To.getType();
+
+ auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
+ return DII.getExpression();
+ };
+
+ // Handle no-op conversions.
+ Module &M = *From.getModule();
+ const DataLayout &DL = M.getDataLayout();
+ if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // Handle integer-to-integer widening and narrowing.
+ // FIXME: Use DW_OP_convert when it's available everywhere.
+ if (FromTy->isIntegerTy() && ToTy->isIntegerTy()) {
+ uint64_t FromBits = FromTy->getPrimitiveSizeInBits();
+ uint64_t ToBits = ToTy->getPrimitiveSizeInBits();
+ assert(FromBits != ToBits && "Unexpected no-op conversion");
+
+ // When the width of the result grows, assume that a debugger will only
+ // access the low `FromBits` bits when inspecting the source variable.
+ if (FromBits < ToBits)
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // The width of the result has shrunk. Use sign/zero extension to describe
+ // the source variable's high bits.
+ auto SignOrZeroExt = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
+ DILocalVariable *Var = DII.getVariable();
+
+ // Without knowing signedness, sign/zero extension isn't possible.
+ auto Signedness = Var->getSignedness();
+ if (!Signedness)
+ return None;
+
+ bool Signed = *Signedness == DIBasicType::Signedness::Signed;
+ return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits,
+ Signed);
+ };
+ return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt);
+ }
+
+ // TODO: Floating-point conversions, vectors.
+ return false;
+}
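+
+// Illustration of the narrowing case: replacing the debug uses of an i64 %wide
+// with an i32 %narrow (placeholder names). If the variable's type is known to
+// be signed, each dbg.value of %wide is rewritten to describe %narrow with
+// sign-extension ops appended via DIExpression::appendExt, so a debugger still
+// sees a 64-bit value; when the signedness is unknown, the rewrite callback
+// returns None and that debug use is left untouched.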
+
std::pair<unsigned, unsigned>
llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
- unsigned NumDeadInst = 0;
+ unsigned NumDeadInst = 0;
unsigned NumDeadDbgInst = 0;
- // Delete the instructions backwards, as it has a reduced likelihood of
- // having to update as many def-use and use-def chains.
- Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
- while (EndInst != &BB->front()) {
- // Delete the next to last instruction.
- Instruction *Inst = &*--EndInst->getIterator();
- if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
- EndInst = Inst;
- continue;
- }
+ // Delete the instructions backwards, as it has a reduced likelihood of
+ // having to update as many def-use and use-def chains.
+ Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+ while (EndInst != &BB->front()) {
+ // Delete the next to last instruction.
+ Instruction *Inst = &*--EndInst->getIterator();
+ if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
+ EndInst = Inst;
+ continue;
+ }
if (isa<DbgInfoIntrinsic>(Inst))
++NumDeadDbgInst;
else
- ++NumDeadInst;
- Inst->eraseFromParent();
- }
+ ++NumDeadInst;
+ Inst->eraseFromParent();
+ }
return {NumDeadInst, NumDeadDbgInst};
-}
-
-unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
- bool PreserveLCSSA, DomTreeUpdater *DTU,
- MemorySSAUpdater *MSSAU) {
- BasicBlock *BB = I->getParent();
-
- if (MSSAU)
- MSSAU->changeToUnreachable(I);
-
+}
+
+unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
+ bool PreserveLCSSA, DomTreeUpdater *DTU,
+ MemorySSAUpdater *MSSAU) {
+ BasicBlock *BB = I->getParent();
+
+ if (MSSAU)
+ MSSAU->changeToUnreachable(I);
+
SmallSetVector<BasicBlock *, 8> UniqueSuccessors;
- // Loop over all of the successors, removing BB's entry from any PHI
- // nodes.
- for (BasicBlock *Successor : successors(BB)) {
- Successor->removePredecessor(BB, PreserveLCSSA);
- if (DTU)
+ // Loop over all of the successors, removing BB's entry from any PHI
+ // nodes.
+ for (BasicBlock *Successor : successors(BB)) {
+ Successor->removePredecessor(BB, PreserveLCSSA);
+ if (DTU)
UniqueSuccessors.insert(Successor);
- }
- // Insert a call to llvm.trap right before this. This turns the undefined
- // behavior into a hard fail instead of falling through into random code.
- if (UseLLVMTrap) {
- Function *TrapFn =
- Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
- CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
- CallTrap->setDebugLoc(I->getDebugLoc());
- }
- auto *UI = new UnreachableInst(I->getContext(), I);
- UI->setDebugLoc(I->getDebugLoc());
-
- // All instructions after this are dead.
- unsigned NumInstrsRemoved = 0;
- BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();
- while (BBI != BBE) {
- if (!BBI->use_empty())
- BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
- BB->getInstList().erase(BBI++);
- ++NumInstrsRemoved;
- }
+ }
+ // Insert a call to llvm.trap right before this. This turns the undefined
+ // behavior into a hard fail instead of falling through into random code.
+ if (UseLLVMTrap) {
+ Function *TrapFn =
+ Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
+ CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+ CallTrap->setDebugLoc(I->getDebugLoc());
+ }
+ auto *UI = new UnreachableInst(I->getContext(), I);
+ UI->setDebugLoc(I->getDebugLoc());
+
+ // All instructions after this are dead.
+ unsigned NumInstrsRemoved = 0;
+ BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();
+ while (BBI != BBE) {
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BB->getInstList().erase(BBI++);
+ ++NumInstrsRemoved;
+ }
if (DTU) {
SmallVector<DominatorTree::UpdateType, 8> Updates;
Updates.reserve(UniqueSuccessors.size());
@@ -2090,892 +2090,892 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
DTU->applyUpdates(Updates);
}
- return NumInstrsRemoved;
-}
-
-CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) {
+ return NumInstrsRemoved;
+}
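+
+// Illustration: with UseLLVMTrap set, a block whose store is known to be
+// undefined behavior, e.g.
+//
+//   store i32 0, i32* null
+//   br label %next
+//
+// is rewritten from the store onwards into
+//
+//   call void @llvm.trap()
+//   unreachable
+//
+// and %next no longer lists this block as a predecessor (the corresponding
+// dominator-tree edges are deleted when a DomTreeUpdater is provided).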
+
+CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) {
SmallVector<Value *, 8> Args(II->args());
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
- CallInst *NewCall = CallInst::Create(II->getFunctionType(),
- II->getCalledOperand(), Args, OpBundles);
- NewCall->setCallingConv(II->getCallingConv());
- NewCall->setAttributes(II->getAttributes());
- NewCall->setDebugLoc(II->getDebugLoc());
- NewCall->copyMetadata(*II);
-
- // If the invoke had profile metadata, try converting them for CallInst.
- uint64_t TotalWeight;
- if (NewCall->extractProfTotalWeight(TotalWeight)) {
- // Set the total weight if it fits into i32, otherwise reset.
- MDBuilder MDB(NewCall->getContext());
- auto NewWeights = uint32_t(TotalWeight) != TotalWeight
- ? nullptr
- : MDB.createBranchWeights({uint32_t(TotalWeight)});
- NewCall->setMetadata(LLVMContext::MD_prof, NewWeights);
- }
-
- return NewCall;
-}
-
-/// changeToCall - Convert the specified invoke into a normal call.
-void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
- CallInst *NewCall = createCallMatchingInvoke(II);
- NewCall->takeName(II);
- NewCall->insertBefore(II);
- II->replaceAllUsesWith(NewCall);
-
- // Follow the call by a branch to the normal destination.
- BasicBlock *NormalDestBB = II->getNormalDest();
- BranchInst::Create(NormalDestBB, II);
-
- // Update PHI nodes in the unwind destination
- BasicBlock *BB = II->getParent();
- BasicBlock *UnwindDestBB = II->getUnwindDest();
- UnwindDestBB->removePredecessor(BB);
- II->eraseFromParent();
- if (DTU)
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ CallInst *NewCall = CallInst::Create(II->getFunctionType(),
+ II->getCalledOperand(), Args, OpBundles);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ NewCall->setDebugLoc(II->getDebugLoc());
+ NewCall->copyMetadata(*II);
+
+ // If the invoke had profile metadata, try converting them for CallInst.
+ uint64_t TotalWeight;
+ if (NewCall->extractProfTotalWeight(TotalWeight)) {
+ // Set the total weight if it fits into i32, otherwise reset.
+ MDBuilder MDB(NewCall->getContext());
+ auto NewWeights = uint32_t(TotalWeight) != TotalWeight
+ ? nullptr
+ : MDB.createBranchWeights({uint32_t(TotalWeight)});
+ NewCall->setMetadata(LLVMContext::MD_prof, NewWeights);
+ }
+
+ return NewCall;
+}
+
+/// changeToCall - Convert the specified invoke into a normal call.
+void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
+ CallInst *NewCall = createCallMatchingInvoke(II);
+ NewCall->takeName(II);
+ NewCall->insertBefore(II);
+ II->replaceAllUsesWith(NewCall);
+
+ // Follow the call by a branch to the normal destination.
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BranchInst::Create(NormalDestBB, II);
+
+ // Update PHI nodes in the unwind destination
+ BasicBlock *BB = II->getParent();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ UnwindDestBB->removePredecessor(BB);
+ II->eraseFromParent();
+ if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
-}
-
-BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
- BasicBlock *UnwindEdge) {
- BasicBlock *BB = CI->getParent();
-
- // Convert this function call into an invoke instruction. First, split the
- // basic block.
- BasicBlock *Split =
- BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
-
- // Delete the unconditional branch inserted by splitBasicBlock
- BB->getInstList().pop_back();
-
- // Create the new invoke instruction.
+}
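+
+// Illustration: changeToCall turns
+//
+//   invoke void @f() to label %normal unwind label %lpad
+//
+// into
+//
+//   call void @f()
+//   br label %normal
+//
+// and removes the block from %lpad's predecessor list (plus the matching
+// dominator-tree edge when a DomTreeUpdater is supplied). Calling convention,
+// attributes, and profile metadata are carried over by createCallMatchingInvoke.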
+
+BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
+ BasicBlock *UnwindEdge) {
+ BasicBlock *BB = CI->getParent();
+
+ // Convert this function call into an invoke instruction. First, split the
+ // basic block.
+ BasicBlock *Split =
+ BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
+
+ // Delete the unconditional branch inserted by splitBasicBlock
+ BB->getInstList().pop_back();
+
+ // Create the new invoke instruction.
SmallVector<Value *, 8> InvokeArgs(CI->args());
- SmallVector<OperandBundleDef, 1> OpBundles;
-
- CI->getOperandBundlesAsDefs(OpBundles);
-
- // Note: we're round tripping operand bundles through memory here, and that
- // can potentially be avoided with a cleverer API design that we do not have
- // as of this time.
-
- InvokeInst *II =
- InvokeInst::Create(CI->getFunctionType(), CI->getCalledOperand(), Split,
- UnwindEdge, InvokeArgs, OpBundles, CI->getName(), BB);
- II->setDebugLoc(CI->getDebugLoc());
- II->setCallingConv(CI->getCallingConv());
- II->setAttributes(CI->getAttributes());
-
- // Make sure that anything using the call now uses the invoke! This also
- // updates the CallGraph if present, because it uses a WeakTrackingVH.
- CI->replaceAllUsesWith(II);
-
- // Delete the original call
- Split->getInstList().pop_front();
- return Split;
-}
-
-static bool markAliveBlocks(Function &F,
- SmallPtrSetImpl<BasicBlock *> &Reachable,
- DomTreeUpdater *DTU = nullptr) {
- SmallVector<BasicBlock*, 128> Worklist;
- BasicBlock *BB = &F.front();
- Worklist.push_back(BB);
- Reachable.insert(BB);
- bool Changed = false;
- do {
- BB = Worklist.pop_back_val();
-
- // Do a quick scan of the basic block, turning any obviously unreachable
- // instructions into LLVM unreachable insts. The instruction combining pass
- // canonicalizes unreachable insts into stores to null or undef.
- for (Instruction &I : *BB) {
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- Value *Callee = CI->getCalledOperand();
- // Handle intrinsic calls.
- if (Function *F = dyn_cast<Function>(Callee)) {
- auto IntrinsicID = F->getIntrinsicID();
- // Assumptions that are known to be false are equivalent to
- // unreachable. Also, if the condition is undefined, then we make the
- // choice most beneficial to the optimizer, and choose that to also be
- // unreachable.
- if (IntrinsicID == Intrinsic::assume) {
- if (match(CI->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
- // Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(CI, false, false, DTU);
- Changed = true;
- break;
- }
- } else if (IntrinsicID == Intrinsic::experimental_guard) {
- // A call to the guard intrinsic bails out of the current
- // compilation unit if the predicate passed to it is false. If the
- // predicate is a constant false, then we know the guard will bail
- // out of the current compile unconditionally, so all code following
- // it is dead.
- //
- // Note: unlike in llvm.assume, it is not "obviously profitable" for
- // guards to treat `undef` as `false` since a guard on `undef` can
- // still be useful for widening.
- if (match(CI->getArgOperand(0), m_Zero()))
- if (!isa<UnreachableInst>(CI->getNextNode())) {
- changeToUnreachable(CI->getNextNode(), /*UseLLVMTrap=*/false,
- false, DTU);
- Changed = true;
- break;
- }
- }
- } else if ((isa<ConstantPointerNull>(Callee) &&
- !NullPointerIsDefined(CI->getFunction())) ||
- isa<UndefValue>(Callee)) {
- changeToUnreachable(CI, /*UseLLVMTrap=*/false, false, DTU);
- Changed = true;
- break;
- }
- if (CI->doesNotReturn() && !CI->isMustTailCall()) {
- // If we found a call to a no-return function, insert an unreachable
- // instruction after it. Make sure there isn't *already* one there
- // though.
- if (!isa<UnreachableInst>(CI->getNextNode())) {
- // Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(CI->getNextNode(), false, false, DTU);
- Changed = true;
- }
- break;
- }
- } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
- // Store to undef and store to null are undefined and used to signal
- // that they should be changed to unreachable by passes that can't
- // modify the CFG.
-
- // Don't touch volatile stores.
- if (SI->isVolatile()) continue;
-
- Value *Ptr = SI->getOperand(1);
-
- if (isa<UndefValue>(Ptr) ||
- (isa<ConstantPointerNull>(Ptr) &&
- !NullPointerIsDefined(SI->getFunction(),
- SI->getPointerAddressSpace()))) {
- changeToUnreachable(SI, true, false, DTU);
- Changed = true;
- break;
- }
- }
- }
-
- Instruction *Terminator = BB->getTerminator();
- if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
- // Turn invokes that call 'nounwind' functions into ordinary calls.
- Value *Callee = II->getCalledOperand();
- if ((isa<ConstantPointerNull>(Callee) &&
- !NullPointerIsDefined(BB->getParent())) ||
- isa<UndefValue>(Callee)) {
- changeToUnreachable(II, true, false, DTU);
- Changed = true;
- } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {
- if (II->use_empty() && II->onlyReadsMemory()) {
- // jump to the normal destination branch.
- BasicBlock *NormalDestBB = II->getNormalDest();
- BasicBlock *UnwindDestBB = II->getUnwindDest();
- BranchInst::Create(NormalDestBB, II);
- UnwindDestBB->removePredecessor(II->getParent());
- II->eraseFromParent();
- if (DTU)
+ SmallVector<OperandBundleDef, 1> OpBundles;
+
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ // Note: we're round tripping operand bundles through memory here, and that
+ // can potentially be avoided with a cleverer API design that we do not have
+ // as of this time.
+
+ InvokeInst *II =
+ InvokeInst::Create(CI->getFunctionType(), CI->getCalledOperand(), Split,
+ UnwindEdge, InvokeArgs, OpBundles, CI->getName(), BB);
+ II->setDebugLoc(CI->getDebugLoc());
+ II->setCallingConv(CI->getCallingConv());
+ II->setAttributes(CI->getAttributes());
+
+ // Make sure that anything using the call now uses the invoke! This also
+ // updates the CallGraph if present, because it uses a WeakTrackingVH.
+ CI->replaceAllUsesWith(II);
+
+ // Delete the original call
+ Split->getInstList().pop_front();
+ return Split;
+}
+
+static bool markAliveBlocks(Function &F,
+ SmallPtrSetImpl<BasicBlock *> &Reachable,
+ DomTreeUpdater *DTU = nullptr) {
+ SmallVector<BasicBlock*, 128> Worklist;
+ BasicBlock *BB = &F.front();
+ Worklist.push_back(BB);
+ Reachable.insert(BB);
+ bool Changed = false;
+ do {
+ BB = Worklist.pop_back_val();
+
+ // Do a quick scan of the basic block, turning any obviously unreachable
+ // instructions into LLVM unreachable insts. The instruction combining pass
+ // canonicalizes unreachable insts into stores to null or undef.
+ for (Instruction &I : *BB) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ Value *Callee = CI->getCalledOperand();
+ // Handle intrinsic calls.
+ if (Function *F = dyn_cast<Function>(Callee)) {
+ auto IntrinsicID = F->getIntrinsicID();
+ // Assumptions that are known to be false are equivalent to
+ // unreachable. Also, if the condition is undefined, then we make the
+ // choice most beneficial to the optimizer, and choose that to also be
+ // unreachable.
+ if (IntrinsicID == Intrinsic::assume) {
+ if (match(CI->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(CI, false, false, DTU);
+ Changed = true;
+ break;
+ }
+ } else if (IntrinsicID == Intrinsic::experimental_guard) {
+ // A call to the guard intrinsic bails out of the current
+ // compilation unit if the predicate passed to it is false. If the
+ // predicate is a constant false, then we know the guard will bail
+ // out of the current compile unconditionally, so all code following
+ // it is dead.
+ //
+ // Note: unlike in llvm.assume, it is not "obviously profitable" for
+ // guards to treat `undef` as `false` since a guard on `undef` can
+ // still be useful for widening.
+ if (match(CI->getArgOperand(0), m_Zero()))
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
+ changeToUnreachable(CI->getNextNode(), /*UseLLVMTrap=*/false,
+ false, DTU);
+ Changed = true;
+ break;
+ }
+ }
+ } else if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(CI->getFunction())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(CI, /*UseLLVMTrap=*/false, false, DTU);
+ Changed = true;
+ break;
+ }
+ if (CI->doesNotReturn() && !CI->isMustTailCall()) {
+ // If we found a call to a no-return function, insert an unreachable
+ // instruction after it. Make sure there isn't *already* one there
+ // though.
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(CI->getNextNode(), false, false, DTU);
+ Changed = true;
+ }
+ break;
+ }
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ // Store to undef and store to null are undefined and used to signal
+ // that they should be changed to unreachable by passes that can't
+ // modify the CFG.
+
+ // Don't touch volatile stores.
+ if (SI->isVolatile()) continue;
+
+ Value *Ptr = SI->getOperand(1);
+
+ if (isa<UndefValue>(Ptr) ||
+ (isa<ConstantPointerNull>(Ptr) &&
+ !NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace()))) {
+ changeToUnreachable(SI, true, false, DTU);
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ Instruction *Terminator = BB->getTerminator();
+ if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
+ // Turn invokes that call 'nounwind' functions into ordinary calls.
+ Value *Callee = II->getCalledOperand();
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(BB->getParent())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(II, true, false, DTU);
+ Changed = true;
+ } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {
+ if (II->use_empty() && II->onlyReadsMemory()) {
+ // jump to the normal destination branch.
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ BranchInst::Create(NormalDestBB, II);
+ UnwindDestBB->removePredecessor(II->getParent());
+ II->eraseFromParent();
+ if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
- } else
- changeToCall(II, DTU);
- Changed = true;
- }
- } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
- // Remove catchpads which cannot be reached.
- struct CatchPadDenseMapInfo {
- static CatchPadInst *getEmptyKey() {
- return DenseMapInfo<CatchPadInst *>::getEmptyKey();
- }
-
- static CatchPadInst *getTombstoneKey() {
- return DenseMapInfo<CatchPadInst *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(CatchPadInst *CatchPad) {
- return static_cast<unsigned>(hash_combine_range(
- CatchPad->value_op_begin(), CatchPad->value_op_end()));
- }
-
- static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) {
- if (LHS == getEmptyKey() || LHS == getTombstoneKey() ||
- RHS == getEmptyKey() || RHS == getTombstoneKey())
- return LHS == RHS;
- return LHS->isIdenticalTo(RHS);
- }
- };
-
+ } else
+ changeToCall(II, DTU);
+ Changed = true;
+ }
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
+ // Remove catchpads which cannot be reached.
+ struct CatchPadDenseMapInfo {
+ static CatchPadInst *getEmptyKey() {
+ return DenseMapInfo<CatchPadInst *>::getEmptyKey();
+ }
+
+ static CatchPadInst *getTombstoneKey() {
+ return DenseMapInfo<CatchPadInst *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(CatchPadInst *CatchPad) {
+ return static_cast<unsigned>(hash_combine_range(
+ CatchPad->value_op_begin(), CatchPad->value_op_end()));
+ }
+
+ static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) {
+ if (LHS == getEmptyKey() || LHS == getTombstoneKey() ||
+ RHS == getEmptyKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+ };
+
SmallMapVector<BasicBlock *, int, 8> NumPerSuccessorCases;
- // Set of unique CatchPads.
- SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4,
- CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>>
- HandlerSet;
- detail::DenseSetEmpty Empty;
- for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(),
- E = CatchSwitch->handler_end();
- I != E; ++I) {
- BasicBlock *HandlerBB = *I;
+ // Set of unique CatchPads.
+ SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4,
+ CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>>
+ HandlerSet;
+ detail::DenseSetEmpty Empty;
+ for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(),
+ E = CatchSwitch->handler_end();
+ I != E; ++I) {
+ BasicBlock *HandlerBB = *I;
++NumPerSuccessorCases[HandlerBB];
- auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI());
- if (!HandlerSet.insert({CatchPad, Empty}).second) {
+ auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI());
+ if (!HandlerSet.insert({CatchPad, Empty}).second) {
--NumPerSuccessorCases[HandlerBB];
- CatchSwitch->removeHandler(I);
- --I;
- --E;
- Changed = true;
- }
- }
+ CatchSwitch->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
std::vector<DominatorTree::UpdateType> Updates;
for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
if (I.second == 0)
Updates.push_back({DominatorTree::Delete, BB, I.first});
if (DTU)
DTU->applyUpdates(Updates);
- }
-
- Changed |= ConstantFoldTerminator(BB, true, nullptr, DTU);
- for (BasicBlock *Successor : successors(BB))
- if (Reachable.insert(Successor).second)
- Worklist.push_back(Successor);
- } while (!Worklist.empty());
- return Changed;
-}
-
-void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
- Instruction *TI = BB->getTerminator();
-
- if (auto *II = dyn_cast<InvokeInst>(TI)) {
- changeToCall(II, DTU);
- return;
- }
-
- Instruction *NewTI;
- BasicBlock *UnwindDest;
-
- if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
- NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI);
- UnwindDest = CRI->getUnwindDest();
- } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
- auto *NewCatchSwitch = CatchSwitchInst::Create(
- CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(),
- CatchSwitch->getName(), CatchSwitch);
- for (BasicBlock *PadBB : CatchSwitch->handlers())
- NewCatchSwitch->addHandler(PadBB);
-
- NewTI = NewCatchSwitch;
- UnwindDest = CatchSwitch->getUnwindDest();
- } else {
- llvm_unreachable("Could not find unwind successor");
- }
-
- NewTI->takeName(TI);
- NewTI->setDebugLoc(TI->getDebugLoc());
- UnwindDest->removePredecessor(BB);
- TI->replaceAllUsesWith(NewTI);
- TI->eraseFromParent();
- if (DTU)
+ }
+
+ Changed |= ConstantFoldTerminator(BB, true, nullptr, DTU);
+ for (BasicBlock *Successor : successors(BB))
+ if (Reachable.insert(Successor).second)
+ Worklist.push_back(Successor);
+ } while (!Worklist.empty());
+ return Changed;
+}
+
+void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
+ Instruction *TI = BB->getTerminator();
+
+ if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ changeToCall(II, DTU);
+ return;
+ }
+
+ Instruction *NewTI;
+ BasicBlock *UnwindDest;
+
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
+ NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI);
+ UnwindDest = CRI->getUnwindDest();
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
+ auto *NewCatchSwitch = CatchSwitchInst::Create(
+ CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(),
+ CatchSwitch->getName(), CatchSwitch);
+ for (BasicBlock *PadBB : CatchSwitch->handlers())
+ NewCatchSwitch->addHandler(PadBB);
+
+ NewTI = NewCatchSwitch;
+ UnwindDest = CatchSwitch->getUnwindDest();
+ } else {
+ llvm_unreachable("Could not find unwind successor");
+ }
+
+ NewTI->takeName(TI);
+ NewTI->setDebugLoc(TI->getDebugLoc());
+ UnwindDest->removePredecessor(BB);
+ TI->replaceAllUsesWith(NewTI);
+ TI->eraseFromParent();
+ if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDest}});
-}
-
-/// removeUnreachableBlocks - Remove blocks that are not reachable, even
-/// if they are in a dead cycle. Return true if a change was made, false
-/// otherwise.
-bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
- MemorySSAUpdater *MSSAU) {
- SmallPtrSet<BasicBlock *, 16> Reachable;
- bool Changed = markAliveBlocks(F, Reachable, DTU);
-
- // If there are unreachable blocks in the CFG...
- if (Reachable.size() == F.size())
- return Changed;
-
- assert(Reachable.size() < F.size());
-
+}
+
+/// removeUnreachableBlocks - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
+/// otherwise.
+bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
+ MemorySSAUpdater *MSSAU) {
+ SmallPtrSet<BasicBlock *, 16> Reachable;
+ bool Changed = markAliveBlocks(F, Reachable, DTU);
+
+ // If there are unreachable blocks in the CFG...
+ if (Reachable.size() == F.size())
+ return Changed;
+
+ assert(Reachable.size() < F.size());
+
// Are there any blocks left to actually delete?
SmallSetVector<BasicBlock *, 8> BlocksToRemove;
- for (BasicBlock &BB : F) {
- // Skip reachable basic blocks
- if (Reachable.count(&BB))
- continue;
+ for (BasicBlock &BB : F) {
+ // Skip reachable basic blocks
+ if (Reachable.count(&BB))
+ continue;
// Skip already-deleted blocks
if (DTU && DTU->isBBPendingDeletion(&BB))
continue;
BlocksToRemove.insert(&BB);
- }
-
+ }
+
if (BlocksToRemove.empty())
return Changed;
Changed = true;
NumRemoved += BlocksToRemove.size();
- if (MSSAU)
+ if (MSSAU)
MSSAU->removeBlocks(BlocksToRemove);
-
+
// Loop over all of the basic blocks that are up for removal, dropping all of
- // their internal references. Update DTU if available.
- std::vector<DominatorTree::UpdateType> Updates;
+ // their internal references. Update DTU if available.
+ std::vector<DominatorTree::UpdateType> Updates;
for (auto *BB : BlocksToRemove) {
SmallSetVector<BasicBlock *, 8> UniqueSuccessors;
- for (BasicBlock *Successor : successors(BB)) {
+ for (BasicBlock *Successor : successors(BB)) {
// Only remove references to BB in reachable successors of BB.
if (Reachable.count(Successor))
- Successor->removePredecessor(BB);
- if (DTU)
+ Successor->removePredecessor(BB);
+ if (DTU)
UniqueSuccessors.insert(Successor);
- }
- BB->dropAllReferences();
- if (DTU) {
- Instruction *TI = BB->getTerminator();
- assert(TI && "Basic block should have a terminator");
- // Terminators like invoke can have users. We have to replace their users,
- // before removing them.
- if (!TI->use_empty())
- TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
- TI->eraseFromParent();
- new UnreachableInst(BB->getContext(), BB);
- assert(succ_empty(BB) && "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
+ }
+ BB->dropAllReferences();
+ if (DTU) {
+ Instruction *TI = BB->getTerminator();
+ assert(TI && "Basic block should have a terminator");
+ // Terminators like invoke can have users. We have to replace their users,
+ // before removing them.
+ if (!TI->use_empty())
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ TI->eraseFromParent();
+ new UnreachableInst(BB->getContext(), BB);
+ assert(succ_empty(BB) && "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
Updates.reserve(Updates.size() + UniqueSuccessors.size());
for (auto *UniqueSuccessor : UniqueSuccessors)
Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
- }
- }
-
- if (DTU) {
+ }
+ }
+
+ if (DTU) {
DTU->applyUpdates(Updates);
for (auto *BB : BlocksToRemove)
- DTU->deleteBB(BB);
- } else {
+ DTU->deleteBB(BB);
+ } else {
for (auto *BB : BlocksToRemove)
- BB->eraseFromParent();
- }
-
+ BB->eraseFromParent();
+ }
+
return Changed;
-}
-
-void llvm::combineMetadata(Instruction *K, const Instruction *J,
- ArrayRef<unsigned> KnownIDs, bool DoesKMove) {
- SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
- K->dropUnknownNonDebugMetadata(KnownIDs);
- K->getAllMetadataOtherThanDebugLoc(Metadata);
- for (const auto &MD : Metadata) {
- unsigned Kind = MD.first;
- MDNode *JMD = J->getMetadata(Kind);
- MDNode *KMD = MD.second;
-
- switch (Kind) {
- default:
- K->setMetadata(Kind, nullptr); // Remove unknown metadata
- break;
- case LLVMContext::MD_dbg:
- llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
- case LLVMContext::MD_tbaa:
- K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
- break;
- case LLVMContext::MD_alias_scope:
- K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
- break;
- case LLVMContext::MD_noalias:
- case LLVMContext::MD_mem_parallel_loop_access:
- K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
- break;
- case LLVMContext::MD_access_group:
- K->setMetadata(LLVMContext::MD_access_group,
- intersectAccessGroups(K, J));
- break;
- case LLVMContext::MD_range:
-
- // If K does move, use most generic range. Otherwise keep the range of
- // K.
- if (DoesKMove)
- // FIXME: If K does move, we should drop the range info and nonnull.
- // Currently this function is used with DoesKMove in passes
- // doing hoisting/sinking and the current behavior of using the
- // most generic range is correct in those cases.
- K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
- break;
- case LLVMContext::MD_fpmath:
- K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
- break;
- case LLVMContext::MD_invariant_load:
- // Only set the !invariant.load if it is present in both instructions.
- K->setMetadata(Kind, JMD);
- break;
- case LLVMContext::MD_nonnull:
- // If K does move, keep nonull if it is present in both instructions.
- if (DoesKMove)
- K->setMetadata(Kind, JMD);
- break;
- case LLVMContext::MD_invariant_group:
- // Preserve !invariant.group in K.
- break;
- case LLVMContext::MD_align:
- K->setMetadata(Kind,
- MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
- break;
- case LLVMContext::MD_dereferenceable:
- case LLVMContext::MD_dereferenceable_or_null:
- K->setMetadata(Kind,
- MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
- break;
- case LLVMContext::MD_preserve_access_index:
- // Preserve !preserve.access.index in K.
- break;
- }
- }
- // Set !invariant.group from J if J has it. If both instructions have it
- // then we will just pick it from J - even when they are different.
- // Also make sure that K is load or store - f.e. combining bitcast with load
- // could produce bitcast with invariant.group metadata, which is invalid.
- // FIXME: we should try to preserve both invariant.group md if they are
- // different, but right now instruction can only have one invariant.group.
- if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group))
- if (isa<LoadInst>(K) || isa<StoreInst>(K))
- K->setMetadata(LLVMContext::MD_invariant_group, JMD);
-}
-
-void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J,
- bool KDominatesJ) {
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group, LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
- combineMetadata(K, J, KnownIDs, KDominatesJ);
-}
-
-void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
- SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- Source.getAllMetadata(MD);
- MDBuilder MDB(Dest.getContext());
- Type *NewType = Dest.getType();
- const DataLayout &DL = Source.getModule()->getDataLayout();
- for (const auto &MDPair : MD) {
- unsigned ID = MDPair.first;
- MDNode *N = MDPair.second;
- // Note, essentially every kind of metadata should be preserved here! This
- // routine is supposed to clone a load instruction changing *only its type*.
- // The only metadata it makes sense to drop is metadata which is invalidated
- // when the pointer type changes. This should essentially never be the case
- // in LLVM, but we explicitly switch over only known metadata to be
- // conservatively correct. If you are adding metadata to LLVM which pertains
- // to loads, you almost certainly want to add it here.
- switch (ID) {
- case LLVMContext::MD_dbg:
- case LLVMContext::MD_tbaa:
- case LLVMContext::MD_prof:
- case LLVMContext::MD_fpmath:
- case LLVMContext::MD_tbaa_struct:
- case LLVMContext::MD_invariant_load:
- case LLVMContext::MD_alias_scope:
- case LLVMContext::MD_noalias:
- case LLVMContext::MD_nontemporal:
- case LLVMContext::MD_mem_parallel_loop_access:
- case LLVMContext::MD_access_group:
- // All of these directly apply.
- Dest.setMetadata(ID, N);
- break;
-
- case LLVMContext::MD_nonnull:
- copyNonnullMetadata(Source, N, Dest);
- break;
-
- case LLVMContext::MD_align:
- case LLVMContext::MD_dereferenceable:
- case LLVMContext::MD_dereferenceable_or_null:
- // These only directly apply if the new type is also a pointer.
- if (NewType->isPointerTy())
- Dest.setMetadata(ID, N);
- break;
-
- case LLVMContext::MD_range:
- copyRangeMetadata(DL, Source, N, Dest);
- break;
- }
- }
-}
-
-void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
- auto *ReplInst = dyn_cast<Instruction>(Repl);
- if (!ReplInst)
- return;
-
- // Patch the replacement so that it is not more restrictive than the value
- // being replaced.
- // Note that if 'I' is a load being replaced by some operation,
- // for example, by an arithmetic operation, then andIRFlags()
- // would just erase all math flags from the original arithmetic
- // operation, which is clearly not wanted and not needed.
- if (!isa<LoadInst>(I))
- ReplInst->andIRFlags(I);
-
- // FIXME: If both the original and replacement value are part of the
- // same control-flow region (meaning that the execution of one
- // guarantees the execution of the other), then we can combine the
- // noalias scopes here and do better than the general conservative
- // answer used in combineMetadata().
-
- // In general, GVN unifies expressions over different control-flow
- // regions, and so we need a conservative combination of the noalias
- // scopes.
- static const unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group, LLVMContext::MD_nonnull,
- LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
- combineMetadata(ReplInst, I, KnownIDs, false);
-}
-
-template <typename RootType, typename DominatesFn>
-static unsigned replaceDominatedUsesWith(Value *From, Value *To,
- const RootType &Root,
- const DominatesFn &Dominates) {
- assert(From->getType() == To->getType());
-
- unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- if (!Dominates(Root, U))
- continue;
- U.set(To);
- LLVM_DEBUG(dbgs() << "Replace dominated use of '" << From->getName()
- << "' as " << *To << " in " << *U << "\n");
- ++Count;
- }
- return Count;
-}
-
-unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
- assert(From->getType() == To->getType());
- auto *BB = From->getParent();
- unsigned Count = 0;
-
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *I = cast<Instruction>(U.getUser());
- if (I->getParent() == BB)
- continue;
- U.set(To);
- ++Count;
- }
- return Count;
-}
-
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlockEdge &Root) {
- auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
- return DT.dominates(Root, U);
- };
- return ::replaceDominatedUsesWith(From, To, Root, Dominates);
-}
-
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlock *BB) {
- auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
- auto *I = cast<Instruction>(U.getUser())->getParent();
- return DT.properlyDominates(BB, I);
- };
- return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
-}
-
-bool llvm::callsGCLeafFunction(const CallBase *Call,
- const TargetLibraryInfo &TLI) {
- // Check if the function is specifically marked as a gc leaf function.
- if (Call->hasFnAttr("gc-leaf-function"))
- return true;
- if (const Function *F = Call->getCalledFunction()) {
- if (F->hasFnAttribute("gc-leaf-function"))
- return true;
-
+}
+
+void llvm::combineMetadata(Instruction *K, const Instruction *J,
+ ArrayRef<unsigned> KnownIDs, bool DoesKMove) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+ K->dropUnknownNonDebugMetadata(KnownIDs);
+ K->getAllMetadataOtherThanDebugLoc(Metadata);
+ for (const auto &MD : Metadata) {
+ unsigned Kind = MD.first;
+ MDNode *JMD = J->getMetadata(Kind);
+ MDNode *KMD = MD.second;
+
+ switch (Kind) {
+ default:
+ K->setMetadata(Kind, nullptr); // Remove unknown metadata
+ break;
+ case LLVMContext::MD_dbg:
+ llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
+ case LLVMContext::MD_tbaa:
+ K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+ break;
+ case LLVMContext::MD_alias_scope:
+ K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
+ break;
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
+ break;
+ case LLVMContext::MD_access_group:
+ K->setMetadata(LLVMContext::MD_access_group,
+ intersectAccessGroups(K, J));
+ break;
+ case LLVMContext::MD_range:
+
+ // If K does move, use most generic range. Otherwise keep the range of
+ // K.
+ if (DoesKMove)
+ // FIXME: If K does move, we should drop the range info and nonnull.
+ // Currently this function is used with DoesKMove in passes
+ // doing hoisting/sinking and the current behavior of using the
+ // most generic range is correct in those cases.
+ K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
+ break;
+ case LLVMContext::MD_fpmath:
+ K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+ break;
+ case LLVMContext::MD_invariant_load:
+ // Only set the !invariant.load if it is present in both instructions.
+ K->setMetadata(Kind, JMD);
+ break;
+ case LLVMContext::MD_nonnull:
+ // If K does move, keep nonnull if it is present in both instructions.
+ if (DoesKMove)
+ K->setMetadata(Kind, JMD);
+ break;
+ case LLVMContext::MD_invariant_group:
+ // Preserve !invariant.group in K.
+ break;
+ case LLVMContext::MD_align:
+ K->setMetadata(Kind,
+ MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
+ break;
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ K->setMetadata(Kind,
+ MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
+ break;
+ case LLVMContext::MD_preserve_access_index:
+ // Preserve !preserve.access.index in K.
+ break;
+ }
+ }
+ // Set !invariant.group from J if J has it. If both instructions have it
+ // then we will just pick it from J - even when they are different.
+ // Also make sure that K is a load or store - e.g. combining bitcast with load
+ // could produce bitcast with invariant.group metadata, which is invalid.
+ // FIXME: we should try to preserve both invariant.group md if they are
+ // different, but right now instruction can only have one invariant.group.
+ if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group))
+ if (isa<LoadInst>(K) || isa<StoreInst>(K))
+ K->setMetadata(LLVMContext::MD_invariant_group, JMD);
+}
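+
+// Illustration: when hoisting one of two equivalent loads K and J with
+// DoesKMove=true, !range annotations such as !{i32 0, i32 10} on K and
+// !{i32 5, i32 20} on J are merged into the most generic range covering both,
+// while kinds that are not known to be safely mergeable are simply dropped
+// from K (the constant ranges here are made up for the example).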
+
+void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J,
+ bool KDominatesJ) {
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group, LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
+ combineMetadata(K, J, KnownIDs, KDominatesJ);
+}
+
+void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ Source.getAllMetadata(MD);
+ MDBuilder MDB(Dest.getContext());
+ Type *NewType = Dest.getType();
+ const DataLayout &DL = Source.getModule()->getDataLayout();
+ for (const auto &MDPair : MD) {
+ unsigned ID = MDPair.first;
+ MDNode *N = MDPair.second;
+ // Note, essentially every kind of metadata should be preserved here! This
+ // routine is supposed to clone a load instruction changing *only its type*.
+ // The only metadata it makes sense to drop is metadata which is invalidated
+ // when the pointer type changes. This should essentially never be the case
+ // in LLVM, but we explicitly switch over only known metadata to be
+ // conservatively correct. If you are adding metadata to LLVM which pertains
+ // to loads, you almost certainly want to add it here.
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_prof:
+ case LLVMContext::MD_fpmath:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_invariant_load:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ case LLVMContext::MD_access_group:
+ // All of these directly apply.
+ Dest.setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_nonnull:
+ copyNonnullMetadata(Source, N, Dest);
+ break;
+
+ case LLVMContext::MD_align:
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ // These only directly apply if the new type is also a pointer.
+ if (NewType->isPointerTy())
+ Dest.setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_range:
+ copyRangeMetadata(DL, Source, N, Dest);
+ break;
+ }
+ }
+}
+
+void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group, LLVMContext::MD_nonnull,
+ LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
+ combineMetadata(ReplInst, I, KnownIDs, false);
+}
+
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
+ assert(From->getType() == To->getType());
+
+ unsigned Count = 0;
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ LLVM_DEBUG(dbgs() << "Replace dominated use of '" << From->getName()
+ << "' as " << *To << " in " << *U << "\n");
+ ++Count;
+ }
+ return Count;
+}
+
+unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
+ assert(From->getType() == To->getType());
+ auto *BB = From->getParent();
+ unsigned Count = 0;
+
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *I = cast<Instruction>(U.getUser());
+ if (I->getParent() == BB)
+ continue;
+ U.set(To);
+ ++Count;
+ }
+ return Count;
+}
+
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
+
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
+}
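For example (the helper and value names are made up for illustration), a pass that proves a value equal to a constant along one edge of a conditional branch can rewrite exactly the uses dominated by that edge and leave all other uses alone:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Sketch: on the edge Pred->Succ the branch condition proved X == C, so
    // every use dominated by that edge may use C directly (X and C must have
    // the same type).
    static unsigned propagateEdgeEquality(Value *X, Constant *C,
                                          DominatorTree &DT, BasicBlock *Pred,
                                          BasicBlock *Succ) {
      return replaceDominatedUsesWith(X, C, DT, BasicBlockEdge(Pred, Succ));
    }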
+
+bool llvm::callsGCLeafFunction(const CallBase *Call,
+ const TargetLibraryInfo &TLI) {
+ // Check if the function is specifically marked as a gc leaf function.
+ if (Call->hasFnAttr("gc-leaf-function"))
+ return true;
+ if (const Function *F = Call->getCalledFunction()) {
+ if (F->hasFnAttribute("gc-leaf-function"))
+ return true;
+
if (auto IID = F->getIntrinsicID()) {
- // Most LLVM intrinsics do not take safepoints.
- return IID != Intrinsic::experimental_gc_statepoint &&
+ // Most LLVM intrinsics do not take safepoints.
+ return IID != Intrinsic::experimental_gc_statepoint &&
IID != Intrinsic::experimental_deoptimize &&
IID != Intrinsic::memcpy_element_unordered_atomic &&
IID != Intrinsic::memmove_element_unordered_atomic;
}
- }
-
- // Lib calls can be materialized by some passes, and won't be
- // marked as 'gc-leaf-function.' All available Libcalls are
- // GC-leaf.
- LibFunc LF;
- if (TLI.getLibFunc(*Call, LF)) {
- return TLI.has(LF);
- }
-
- return false;
-}
-
-void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N,
- LoadInst &NewLI) {
- auto *NewTy = NewLI.getType();
-
- // This only directly applies if the new type is also a pointer.
- if (NewTy->isPointerTy()) {
- NewLI.setMetadata(LLVMContext::MD_nonnull, N);
- return;
- }
-
- // The only other translation we can do is to integral loads with !range
- // metadata.
- if (!NewTy->isIntegerTy())
- return;
-
- MDBuilder MDB(NewLI.getContext());
- const Value *Ptr = OldLI.getPointerOperand();
- auto *ITy = cast<IntegerType>(NewTy);
- auto *NullInt = ConstantExpr::getPtrToInt(
- ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
- auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
- NewLI.setMetadata(LLVMContext::MD_range,
- MDB.createRange(NonNullInt, NullInt));
-}
-
-void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
- MDNode *N, LoadInst &NewLI) {
- auto *NewTy = NewLI.getType();
-
- // Give up unless it is converted to a pointer where there is a single very
- // valuable mapping we can do reliably.
- // FIXME: It would be nice to propagate this in more ways, but the type
- // conversions make it hard.
- if (!NewTy->isPointerTy())
- return;
-
- unsigned BitWidth = DL.getPointerTypeSizeInBits(NewTy);
- if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
- MDNode *NN = MDNode::get(OldLI.getContext(), None);
- NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
- }
-}
-
-void llvm::dropDebugUsers(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, &I);
- for (auto *DII : DbgUsers)
- DII->eraseFromParent();
-}
-
-void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
- BasicBlock *BB) {
- // Since we are moving the instructions out of their basic block, we do not
- // retain their original debug locations (DILocations) and debug intrinsic
- // instructions.
- //
- // Doing so would degrade the debugging experience and adversely affect the
- // accuracy of profiling information.
- //
- // Currently, when hoisting the instructions, we take the following actions:
- // - Remove their debug intrinsic instructions.
- // - Set their debug locations to the values from the insertion point.
- //
- // As per PR39141 (comment #8), the more fundamental reason why the dbg.values
- // need to be deleted, is because there will not be any instructions with a
- // DILocation in either branch left after performing the transformation. We
- // can only insert a dbg.value after the two branches are joined again.
- //
- // See PR38762, PR39243 for more details.
- //
- // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to
- // encode predicated DIExpressions that yield different results on different
- // code paths.
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
- Instruction *I = &*II;
- I->dropUnknownNonDebugMetadata();
- if (I->isUsedByMetadata())
- dropDebugUsers(*I);
- if (isa<DbgInfoIntrinsic>(I)) {
- // Remove DbgInfo Intrinsics.
- II = I->eraseFromParent();
- continue;
- }
- I->setDebugLoc(InsertPt->getDebugLoc());
- ++II;
- }
- DomBlock->getInstList().splice(InsertPt->getIterator(), BB->getInstList(),
- BB->begin(),
- BB->getTerminator()->getIterator());
-}
-
-namespace {
-
-/// A potential constituent of a bitreverse or bswap expression. See
-/// collectBitParts for a fuller explanation.
-struct BitPart {
- BitPart(Value *P, unsigned BW) : Provider(P) {
- Provenance.resize(BW);
- }
-
- /// The Value that this is a bitreverse/bswap of.
- Value *Provider;
-
- /// The "provenance" of each bit. Provenance[A] = B means that bit A
- /// in Provider becomes bit B in the result of this expression.
- SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
-
- enum { Unset = -1 };
-};
-
-} // end anonymous namespace
-
-/// Analyze the specified subexpression and see if it is capable of providing
-/// pieces of a bswap or bitreverse. The subexpression provides a potential
+ }
+
+ // Lib calls can be materialized by some passes, and won't be
+ // marked as 'gc-leaf-function.' All available Libcalls are
+ // GC-leaf.
+ LibFunc LF;
+ if (TLI.getLibFunc(*Call, LF)) {
+ return TLI.has(LF);
+ }
+
+ return false;
+}
+
+void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N,
+ LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+ // This only directly applies if the new type is also a pointer.
+ if (NewTy->isPointerTy()) {
+ NewLI.setMetadata(LLVMContext::MD_nonnull, N);
+ return;
+ }
+
+ // The only other translation we can do is to integral loads with !range
+ // metadata.
+ if (!NewTy->isIntegerTy())
+ return;
+
+ MDBuilder MDB(NewLI.getContext());
+ const Value *Ptr = OldLI.getPointerOperand();
+ auto *ITy = cast<IntegerType>(NewTy);
+ auto *NullInt = ConstantExpr::getPtrToInt(
+ ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
+ auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
+ NewLI.setMetadata(LLVMContext::MD_range,
+ MDB.createRange(NonNullInt, NullInt));
+}
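Concretely, when the new load is an i64 load and the null pointer value folds to the integer 0 (as it does on common targets), the !nonnull metadata becomes the wrapping range [1, 0), i.e. "any value except zero". A standalone sketch of that node construction, assuming an available LLVMContext Ctx:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/MDBuilder.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // Builds !range !{ i64 1, i64 0 }: the half-open wrapping interval [1, 0)
    // contains every 64-bit value except 0.
    static MDNode *makeNonNullRange(LLVMContext &Ctx) {
      MDBuilder MDB(Ctx);
      Type *I64 = Type::getInt64Ty(Ctx);
      return MDB.createRange(ConstantInt::get(I64, 1),
                             ConstantInt::get(I64, 0));
    }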
+
+void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
+ MDNode *N, LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+ // Give up unless it is converted to a pointer where there is a single very
+ // valuable mapping we can do reliably.
+ // FIXME: It would be nice to propagate this in more ways, but the type
+ // conversions make it hard.
+ if (!NewTy->isPointerTy())
+ return;
+
+ unsigned BitWidth = DL.getPointerTypeSizeInBits(NewTy);
+ if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
+ MDNode *NN = MDNode::get(OldLI.getContext(), None);
+ NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
+ }
+}
+
+void llvm::dropDebugUsers(Instruction &I) {
+ SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ for (auto *DII : DbgUsers)
+ DII->eraseFromParent();
+}
+
+void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
+ BasicBlock *BB) {
+ // Since we are moving the instructions out of their basic block, we do not
+ // retain their original debug locations (DILocations) and debug intrinsic
+ // instructions.
+ //
+ // Doing so would degrade the debugging experience and adversely affect the
+ // accuracy of profiling information.
+ //
+ // Currently, when hoisting the instructions, we take the following actions:
+ // - Remove their debug intrinsic instructions.
+ // - Set their debug locations to the values from the insertion point.
+ //
+ // As per PR39141 (comment #8), the more fundamental reason why the dbg.values
+ // need to be deleted, is because there will not be any instructions with a
+ // DILocation in either branch left after performing the transformation. We
+ // can only insert a dbg.value after the two branches are joined again.
+ //
+ // See PR38762, PR39243 for more details.
+ //
+ // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to
+ // encode predicated DIExpressions that yield different results on different
+ // code paths.
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+ Instruction *I = &*II;
+ I->dropUnknownNonDebugMetadata();
+ if (I->isUsedByMetadata())
+ dropDebugUsers(*I);
+ if (isa<DbgInfoIntrinsic>(I)) {
+ // Remove DbgInfo Intrinsics.
+ II = I->eraseFromParent();
+ continue;
+ }
+ I->setDebugLoc(InsertPt->getDebugLoc());
+ ++II;
+ }
+ DomBlock->getInstList().splice(InsertPt->getIterator(), BB->getInstList(),
+ BB->begin(),
+ BB->getTerminator()->getIterator());
+}
+
+namespace {
+
+/// A potential constituent of a bitreverse or bswap expression. See
+/// collectBitParts for a fuller explanation.
+struct BitPart {
+ BitPart(Value *P, unsigned BW) : Provider(P) {
+ Provenance.resize(BW);
+ }
+
+ /// The Value that this is a bitreverse/bswap of.
+ Value *Provider;
+
+ /// The "provenance" of each bit. Provenance[A] = B means that bit A
+ /// in Provider becomes bit B in the result of this expression.
+ SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
+
+ enum { Unset = -1 };
+};
+
+} // end anonymous namespace
+
+/// Analyze the specified subexpression and see if it is capable of providing
+/// pieces of a bswap or bitreverse. The subexpression provides a potential
/// piece of a bswap or bitreverse if it can be proved that each non-zero bit in
-/// the output of the expression came from a corresponding bit in some other
-/// value. This function is recursive, and the end result is a mapping of
-/// bitnumber to bitnumber. It is the caller's responsibility to validate that
-/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
-///
-/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
-/// that the expression deposits the low byte of %X into the high byte of the
-/// result and that all other bits are zero. This expression is accepted and a
-/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
-/// [0-7].
-///
+/// the output of the expression came from a corresponding bit in some other
+/// value. This function is recursive, and the end result is a mapping of
+/// bitnumber to bitnumber. It is the caller's responsibility to validate that
+/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
+///
+/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
+/// that the expression deposits the low byte of %X into the high byte of the
+/// result and that all other bits are zero. This expression is accepted and a
+/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
+/// [0-7].
+///
/// For vector types, all analysis is performed at the per-element level. No
/// cross-element analysis is supported (shuffle/insertion/reduction), and all
/// constant masks must be splatted across all elements.
///
-/// To avoid revisiting values, the BitPart results are memoized into the
-/// provided map. To avoid unnecessary copying of BitParts, BitParts are
-/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
-/// store BitParts objects, not pointers. As we need the concept of a nullptr
-/// BitParts (Value has been analyzed and the analysis failed), we use an Optional
-/// type instead to provide the same functionality.
-///
-/// Because we pass around references into \c BPS, we must use a container that
-/// does not invalidate internal references (std::map instead of DenseMap).
-static const Optional<BitPart> &
-collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
- std::map<Value *, Optional<BitPart>> &BPS, int Depth) {
- auto I = BPS.find(V);
- if (I != BPS.end())
- return I->second;
-
- auto &Result = BPS[V] = None;
+/// To avoid revisiting values, the BitPart results are memoized into the
+/// provided map. To avoid unnecessary copying of BitParts, BitParts are
+/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
+/// store BitParts objects, not pointers. As we need the concept of a nullptr
+/// BitParts (Value has been analyzed and the analysis failed), we use an Optional
+/// type instead to provide the same functionality.
+///
+/// Because we pass around references into \c BPS, we must use a container that
+/// does not invalidate internal references (std::map instead of DenseMap).
+static const Optional<BitPart> &
+collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
+ std::map<Value *, Optional<BitPart>> &BPS, int Depth) {
+ auto I = BPS.find(V);
+ if (I != BPS.end())
+ return I->second;
+
+ auto &Result = BPS[V] = None;
auto BitWidth = V->getType()->getScalarSizeInBits();
-
- // Prevent stack overflow by limiting the recursion depth
- if (Depth == BitPartRecursionMaxDepth) {
- LLVM_DEBUG(dbgs() << "collectBitParts max recursion depth reached.\n");
- return Result;
- }
-
+
+ // Prevent stack overflow by limiting the recursion depth
+ if (Depth == BitPartRecursionMaxDepth) {
+ LLVM_DEBUG(dbgs() << "collectBitParts max recursion depth reached.\n");
+ return Result;
+ }
+
if (auto *I = dyn_cast<Instruction>(V)) {
Value *X, *Y;
const APInt *C;
- // If this is an or instruction, it may be an inner node of the bswap.
+ // If this is an or instruction, it may be an inner node of the bswap.
if (match(V, m_Or(m_Value(X), m_Value(Y)))) {
const auto &A =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
const auto &B =
collectBitParts(Y, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!A || !B)
- return Result;
-
- // Try and merge the two together.
- if (!A->Provider || A->Provider != B->Provider)
- return Result;
-
- Result = BitPart(A->Provider, BitWidth);
+ if (!A || !B)
+ return Result;
+
+ // Try and merge the two together.
+ if (!A->Provider || A->Provider != B->Provider)
+ return Result;
+
+ Result = BitPart(A->Provider, BitWidth);
for (unsigned BitIdx = 0; BitIdx < BitWidth; ++BitIdx) {
if (A->Provenance[BitIdx] != BitPart::Unset &&
B->Provenance[BitIdx] != BitPart::Unset &&
A->Provenance[BitIdx] != B->Provenance[BitIdx])
- return Result = None;
-
+ return Result = None;
+
if (A->Provenance[BitIdx] == BitPart::Unset)
Result->Provenance[BitIdx] = B->Provenance[BitIdx];
- else
+ else
Result->Provenance[BitIdx] = A->Provenance[BitIdx];
- }
-
- return Result;
- }
-
- // If this is a logical shift by a constant, recurse then shift the result.
+ }
+
+ return Result;
+ }
+
+ // If this is a logical shift by a constant, recurse then shift the result.
if (match(V, m_LogicalShift(m_Value(X), m_APInt(C)))) {
const APInt &BitShift = *C;
- // Ensure the shift amount is defined.
+ // Ensure the shift amount is defined.
if (BitShift.uge(BitWidth))
- return Result;
-
+ return Result;
+
const auto &Res =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!Res)
- return Result;
- Result = Res;
-
- // Perform the "shift" on BitProvenance.
- auto &P = Result->Provenance;
- if (I->getOpcode() == Instruction::Shl) {
+ if (!Res)
+ return Result;
+ Result = Res;
+
+ // Perform the "shift" on BitProvenance.
+ auto &P = Result->Provenance;
+ if (I->getOpcode() == Instruction::Shl) {
P.erase(std::prev(P.end(), BitShift.getZExtValue()), P.end());
P.insert(P.begin(), BitShift.getZExtValue(), BitPart::Unset);
- } else {
+ } else {
P.erase(P.begin(), std::next(P.begin(), BitShift.getZExtValue()));
P.insert(P.end(), BitShift.getZExtValue(), BitPart::Unset);
- }
-
- return Result;
- }
-
- // If this is a logical 'and' with a mask that clears bits, recurse then
- // unset the appropriate bits.
+ }
+
+ return Result;
+ }
+
+ // If this is a logical 'and' with a mask that clears bits, recurse then
+ // unset the appropriate bits.
if (match(V, m_And(m_Value(X), m_APInt(C)))) {
const APInt &AndMask = *C;
-
- // Check that the mask allows a multiple of 8 bits for a bswap, for an
- // early exit.
- unsigned NumMaskedBits = AndMask.countPopulation();
+
+ // Check that the mask allows a multiple of 8 bits for a bswap, for an
+ // early exit.
+ unsigned NumMaskedBits = AndMask.countPopulation();
if (!MatchBitReversals && (NumMaskedBits % 8) != 0)
- return Result;
-
+ return Result;
+
const auto &Res =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!Res)
- return Result;
- Result = Res;
-
+ if (!Res)
+ return Result;
+ Result = Res;
+
for (unsigned BitIdx = 0; BitIdx < BitWidth; ++BitIdx)
- // If the AndMask is zero for this bit, clear the bit.
+ // If the AndMask is zero for this bit, clear the bit.
if (AndMask[BitIdx] == 0)
Result->Provenance[BitIdx] = BitPart::Unset;
- return Result;
- }
-
- // If this is a zext instruction zero extend the result.
+ return Result;
+ }
+
+ // If this is a zext instruction zero extend the result.
if (match(V, m_ZExt(m_Value(X)))) {
const auto &Res =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!Res)
- return Result;
-
- Result = BitPart(Res->Provider, BitWidth);
+ if (!Res)
+ return Result;
+
+ Result = BitPart(Res->Provider, BitWidth);
auto NarrowBitWidth = X->getType()->getScalarSizeInBits();
for (unsigned BitIdx = 0; BitIdx < NarrowBitWidth; ++BitIdx)
Result->Provenance[BitIdx] = Res->Provenance[BitIdx];
for (unsigned BitIdx = NarrowBitWidth; BitIdx < BitWidth; ++BitIdx)
Result->Provenance[BitIdx] = BitPart::Unset;
- return Result;
- }
+ return Result;
+ }
  // BITREVERSE - most likely due to us previously matching a partial
// bitreverse.
@@ -3037,58 +3037,58 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
Result->Provenance[BitIdx] = RHS->Provenance[BitIdx + StartBitRHS];
return Result;
}
- }
-
- // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
- // the input value to the bswap/bitreverse.
- Result = BitPart(V, BitWidth);
+ }
+
+ // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
+ // the input value to the bswap/bitreverse.
+ Result = BitPart(V, BitWidth);
for (unsigned BitIdx = 0; BitIdx < BitWidth; ++BitIdx)
Result->Provenance[BitIdx] = BitIdx;
- return Result;
-}
-
-static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
- unsigned BitWidth) {
- if (From % 8 != To % 8)
- return false;
- // Convert from bit indices to byte indices and check for a byte reversal.
- From >>= 3;
- To >>= 3;
- BitWidth >>= 3;
- return From == BitWidth - To - 1;
-}
-
-static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
- unsigned BitWidth) {
- return From == BitWidth - To - 1;
-}
-
-bool llvm::recognizeBSwapOrBitReverseIdiom(
- Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
- SmallVectorImpl<Instruction *> &InsertedInsts) {
- if (Operator::getOpcode(I) != Instruction::Or)
- return false;
- if (!MatchBSwaps && !MatchBitReversals)
- return false;
+ return Result;
+}
+
+static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
+ unsigned BitWidth) {
+ if (From % 8 != To % 8)
+ return false;
+ // Convert from bit indices to byte indices and check for a byte reversal.
+ From >>= 3;
+ To >>= 3;
+ BitWidth >>= 3;
+ return From == BitWidth - To - 1;
+}
+
+static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
+ unsigned BitWidth) {
+ return From == BitWidth - To - 1;
+}
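A couple of worked data points for the two predicates above (illustrative only; both functions are file-local, so this is given as commentary rather than as a caller):

    // i32 bswap reverses bytes, so bit 0 pairs with bit 24: both are bit 0 of
    // their byte, and byte 0 maps to byte 3 of 4.
    //   bitTransformIsCorrectForBSwap(0, 24, 32)      -> true
    //   bitTransformIsCorrectForBSwap(0, 25, 32)      -> false (0 % 8 != 25 % 8)
    // i32 bitreverse mirrors single bits, so bit 0 pairs with bit 31.
    //   bitTransformIsCorrectForBitReverse(0, 31, 32) -> true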
+
+bool llvm::recognizeBSwapOrBitReverseIdiom(
+ Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+ SmallVectorImpl<Instruction *> &InsertedInsts) {
+ if (Operator::getOpcode(I) != Instruction::Or)
+ return false;
+ if (!MatchBSwaps && !MatchBitReversals)
+ return false;
Type *ITy = I->getType();
if (!ITy->isIntOrIntVectorTy() || ITy->getScalarSizeInBits() > 128)
return false; // Can't do integer/elements > 128 bits.
-
+
Type *DemandedTy = ITy;
if (I->hasOneUse())
if (auto *Trunc = dyn_cast<TruncInst>(I->user_back()))
DemandedTy = Trunc->getType();
-
- // Try to find all the pieces corresponding to the bswap.
- std::map<Value *, Optional<BitPart>> BPS;
- auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS, 0);
- if (!Res)
- return false;
+
+ // Try to find all the pieces corresponding to the bswap.
+ std::map<Value *, Optional<BitPart>> BPS;
+ auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS, 0);
+ if (!Res)
+ return false;
ArrayRef<int8_t> BitProvenance = Res->Provenance;
assert(all_of(BitProvenance,
[](int8_t I) { return I == BitPart::Unset || 0 <= I; }) &&
"Illegal bit provenance index");
-
+
// If the upper bits are zero, then attempt to perform as a truncated op.
if (BitProvenance.back() == BitPart::Unset) {
while (!BitProvenance.empty() && BitProvenance.back() == BitPart::Unset)
@@ -3105,8 +3105,8 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
if (DemandedBW > ITy->getScalarSizeInBits())
return false;
- // Now, is the bit permutation correct for a bswap or a bitreverse? We can
- // only byteswap values with an even number of bytes.
+ // Now, is the bit permutation correct for a bswap or a bitreverse? We can
+ // only byteswap values with an even number of bytes.
APInt DemandedMask = APInt::getAllOnesValue(DemandedBW);
bool OKForBSwap = MatchBSwaps && (DemandedBW % 16) == 0;
bool OKForBitReverse = MatchBitReversals;
@@ -3120,16 +3120,16 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
DemandedBW);
OKForBitReverse &= bitTransformIsCorrectForBitReverse(BitProvenance[BitIdx],
BitIdx, DemandedBW);
- }
-
- Intrinsic::ID Intrin;
+ }
+
+ Intrinsic::ID Intrin;
if (OKForBSwap)
- Intrin = Intrinsic::bswap;
+ Intrin = Intrinsic::bswap;
else if (OKForBitReverse)
- Intrin = Intrinsic::bitreverse;
- else
- return false;
-
+ Intrin = Intrinsic::bitreverse;
+ else
+ return false;
+
Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy);
Value *Provider = Res->Provider;
@@ -3153,130 +3153,130 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
// We may need to zeroextend back to the result type.
if (ITy != Result->getType()) {
auto *ExtInst = CastInst::CreateIntegerCast(Result, ITy, false, "zext", I);
- InsertedInsts.push_back(ExtInst);
- }
-
- return true;
-}
-
-// CodeGen has special handling for some string functions that may replace
-// them with target-specific intrinsics. Since that'd skip our interceptors
-// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses,
-// we mark affected calls as NoBuiltin, which will disable optimization
-// in CodeGen.
-void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
- CallInst *CI, const TargetLibraryInfo *TLI) {
- Function *F = CI->getCalledFunction();
- LibFunc Func;
- if (F && !F->hasLocalLinkage() && F->hasName() &&
- TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
- !F->doesNotAccessMemory())
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
-}
-
-bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
- // We can't have a PHI with a metadata type.
- if (I->getOperand(OpIdx)->getType()->isMetadataTy())
- return false;
-
- // Early exit.
- if (!isa<Constant>(I->getOperand(OpIdx)))
- return true;
-
- switch (I->getOpcode()) {
- default:
- return true;
- case Instruction::Call:
- case Instruction::Invoke: {
- const auto &CB = cast<CallBase>(*I);
-
- // Can't handle inline asm. Skip it.
- if (CB.isInlineAsm())
- return false;
-
- // Constant bundle operands may need to retain their constant-ness for
- // correctness.
- if (CB.isBundleOperand(OpIdx))
- return false;
-
- if (OpIdx < CB.getNumArgOperands()) {
- // Some variadic intrinsics require constants in the variadic arguments,
- // which currently aren't markable as immarg.
- if (isa<IntrinsicInst>(CB) &&
- OpIdx >= CB.getFunctionType()->getNumParams()) {
- // This is known to be OK for stackmap.
- return CB.getIntrinsicID() == Intrinsic::experimental_stackmap;
- }
-
- // gcroot is a special case, since it requires a constant argument which
- // isn't also required to be a simple ConstantInt.
- if (CB.getIntrinsicID() == Intrinsic::gcroot)
- return false;
-
- // Some intrinsic operands are required to be immediates.
- return !CB.paramHasAttr(OpIdx, Attribute::ImmArg);
- }
-
- // It is never allowed to replace the call argument to an intrinsic, but it
- // may be possible for a call.
- return !isa<IntrinsicInst>(CB);
- }
- case Instruction::ShuffleVector:
- // Shufflevector masks are constant.
- return OpIdx != 2;
- case Instruction::Switch:
- case Instruction::ExtractValue:
- // All operands apart from the first are constant.
- return OpIdx == 0;
- case Instruction::InsertValue:
- // All operands apart from the first and the second are constant.
- return OpIdx < 2;
- case Instruction::Alloca:
- // Static allocas (constant size in the entry block) are handled by
- // prologue/epilogue insertion so they're free anyway. We definitely don't
- // want to make them non-constant.
- return !cast<AllocaInst>(I)->isStaticAlloca();
- case Instruction::GetElementPtr:
- if (OpIdx == 0)
- return true;
- gep_type_iterator It = gep_type_begin(I);
- for (auto E = std::next(It, OpIdx); It != E; ++It)
- if (It.isStruct())
- return false;
- return true;
- }
-}
-
-Value *llvm::invertCondition(Value *Condition) {
- // First: Check if it's a constant
- if (Constant *C = dyn_cast<Constant>(Condition))
- return ConstantExpr::getNot(C);
-
- // Second: If the condition is already inverted, return the original value
- Value *NotCondition;
- if (match(Condition, m_Not(m_Value(NotCondition))))
- return NotCondition;
-
- BasicBlock *Parent = nullptr;
- Instruction *Inst = dyn_cast<Instruction>(Condition);
- if (Inst)
- Parent = Inst->getParent();
- else if (Argument *Arg = dyn_cast<Argument>(Condition))
- Parent = &Arg->getParent()->getEntryBlock();
- assert(Parent && "Unsupported condition to invert");
-
- // Third: Check all the users for an invert
- for (User *U : Condition->users())
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
- return I;
-
- // Last option: Create a new instruction
- auto *Inverted =
- BinaryOperator::CreateNot(Condition, Condition->getName() + ".inv");
- if (Inst && !isa<PHINode>(Inst))
- Inverted->insertAfter(Inst);
- else
- Inverted->insertBefore(&*Parent->getFirstInsertionPt());
- return Inverted;
-}
+ InsertedInsts.push_back(ExtInst);
+ }
+
+ return true;
+}
+
+// CodeGen has special handling for some string functions that may replace
+// them with target-specific intrinsics. Since that'd skip our interceptors
+// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses,
+// we mark affected calls as NoBuiltin, which will disable optimization
+// in CodeGen.
+void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
+ CallInst *CI, const TargetLibraryInfo *TLI) {
+ Function *F = CI->getCalledFunction();
+ LibFunc Func;
+ if (F && !F->hasLocalLinkage() && F->hasName() &&
+ TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
+ !F->doesNotAccessMemory())
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
+}
+
+bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
+ // We can't have a PHI with a metadata type.
+ if (I->getOperand(OpIdx)->getType()->isMetadataTy())
+ return false;
+
+ // Early exit.
+ if (!isa<Constant>(I->getOperand(OpIdx)))
+ return true;
+
+ switch (I->getOpcode()) {
+ default:
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ const auto &CB = cast<CallBase>(*I);
+
+ // Can't handle inline asm. Skip it.
+ if (CB.isInlineAsm())
+ return false;
+
+ // Constant bundle operands may need to retain their constant-ness for
+ // correctness.
+ if (CB.isBundleOperand(OpIdx))
+ return false;
+
+ if (OpIdx < CB.getNumArgOperands()) {
+ // Some variadic intrinsics require constants in the variadic arguments,
+ // which currently aren't markable as immarg.
+ if (isa<IntrinsicInst>(CB) &&
+ OpIdx >= CB.getFunctionType()->getNumParams()) {
+ // This is known to be OK for stackmap.
+ return CB.getIntrinsicID() == Intrinsic::experimental_stackmap;
+ }
+
+ // gcroot is a special case, since it requires a constant argument which
+ // isn't also required to be a simple ConstantInt.
+ if (CB.getIntrinsicID() == Intrinsic::gcroot)
+ return false;
+
+ // Some intrinsic operands are required to be immediates.
+ return !CB.paramHasAttr(OpIdx, Attribute::ImmArg);
+ }
+
+ // It is never allowed to replace the call argument to an intrinsic, but it
+ // may be possible for a call.
+ return !isa<IntrinsicInst>(CB);
+ }
+ case Instruction::ShuffleVector:
+ // Shufflevector masks are constant.
+ return OpIdx != 2;
+ case Instruction::Switch:
+ case Instruction::ExtractValue:
+ // All operands apart from the first are constant.
+ return OpIdx == 0;
+ case Instruction::InsertValue:
+ // All operands apart from the first and the second are constant.
+ return OpIdx < 2;
+ case Instruction::Alloca:
+ // Static allocas (constant size in the entry block) are handled by
+ // prologue/epilogue insertion so they're free anyway. We definitely don't
+ // want to make them non-constant.
+ return !cast<AllocaInst>(I)->isStaticAlloca();
+ case Instruction::GetElementPtr:
+ if (OpIdx == 0)
+ return true;
+ gep_type_iterator It = gep_type_begin(I);
+ for (auto E = std::next(It, OpIdx); It != E; ++It)
+ if (It.isStruct())
+ return false;
+ return true;
+ }
+}
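A small illustrative caller (hypothetical, not part of this file): a sinking-style transform that wants to merge two similar instructions first collects the operand positions it would be allowed to turn into a PHI of differing values:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Sketch: operands rejected by canReplaceOperandWithVariable (immarg
    // operands, shuffle masks, bundle operands, ...) must stay identical
    // across the instructions being merged.
    static SmallVector<unsigned, 4> mergeableOperandIndices(const Instruction *I) {
      SmallVector<unsigned, 4> Idxs;
      for (unsigned Op = 0, E = I->getNumOperands(); Op != E; ++Op)
        if (canReplaceOperandWithVariable(I, Op))
          Idxs.push_back(Op);
      return Idxs;
    }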
+
+Value *llvm::invertCondition(Value *Condition) {
+ // First: Check if it's a constant
+ if (Constant *C = dyn_cast<Constant>(Condition))
+ return ConstantExpr::getNot(C);
+
+ // Second: If the condition is already inverted, return the original value
+ Value *NotCondition;
+ if (match(Condition, m_Not(m_Value(NotCondition))))
+ return NotCondition;
+
+ BasicBlock *Parent = nullptr;
+ Instruction *Inst = dyn_cast<Instruction>(Condition);
+ if (Inst)
+ Parent = Inst->getParent();
+ else if (Argument *Arg = dyn_cast<Argument>(Condition))
+ Parent = &Arg->getParent()->getEntryBlock();
+ assert(Parent && "Unsupported condition to invert");
+
+ // Third: Check all the users for an invert
+ for (User *U : Condition->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+ return I;
+
+ // Last option: Create a new instruction
+ auto *Inverted =
+ BinaryOperator::CreateNot(Condition, Condition->getName() + ".inv");
+ if (Inst && !isa<PHINode>(Inst))
+ Inverted->insertAfter(Inst);
+ else
+ Inverted->insertBefore(&*Parent->getFirstInsertionPt());
+ return Inverted;
+}
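A minimal usage sketch for invertCondition() (the transform below is hypothetical): flipping a conditional branch can reuse an existing inverse of the condition instead of stacking fresh xor instructions, and then swap the branch targets so behaviour is unchanged:

    #include <cassert>
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Sketch: branch on the inverted condition and swap the two successors.
    static void invertBranch(BranchInst &BI) {
      assert(BI.isConditional() && "expected a conditional branch");
      BI.setCondition(invertCondition(BI.getCondition()));
      BI.swapSuccessors();
    }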
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp
index f83c968e91..b678efdc8d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -1,406 +1,406 @@
-//===----------------- LoopRotationUtils.cpp -----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides utilities to convert a loop into a loop with bottom test.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopRotationUtils.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+//===----------------- LoopRotationUtils.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utilities to convert a loop into a loop with bottom test.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-rotate"
-
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
STATISTIC(NumNotRotatedDueToHeaderSize,
"Number of loops not rotated due to the header size");
-STATISTIC(NumRotated, "Number of loops rotated");
-
-static cl::opt<bool>
- MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
- cl::desc("Allow loop rotation multiple times in order to reach "
- "a better latch exit"));
-
-namespace {
-/// A simple loop rotation transformation.
-class LoopRotate {
- const unsigned MaxHeaderSize;
- LoopInfo *LI;
- const TargetTransformInfo *TTI;
- AssumptionCache *AC;
- DominatorTree *DT;
- ScalarEvolution *SE;
- MemorySSAUpdater *MSSAU;
- const SimplifyQuery &SQ;
- bool RotationOnly;
- bool IsUtilMode;
+STATISTIC(NumRotated, "Number of loops rotated");
+
+static cl::opt<bool>
+ MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
+ cl::desc("Allow loop rotation multiple times in order to reach "
+ "a better latch exit"));
+
+namespace {
+/// A simple loop rotation transformation.
+class LoopRotate {
+ const unsigned MaxHeaderSize;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ MemorySSAUpdater *MSSAU;
+ const SimplifyQuery &SQ;
+ bool RotationOnly;
+ bool IsUtilMode;
bool PrepareForLTO;
-
-public:
- LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+
+public:
+ LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
bool PrepareForLTO)
- : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
- MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
- bool processLoop(Loop *L);
-
-private:
- bool rotateLoop(Loop *L, bool SimplifiedLatch);
- bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not
-/// previously exist in the map, and the value was inserted.
-static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) {
- bool Inserted = VM.insert({K, V}).second;
- assert(Inserted);
- (void)Inserted;
-}
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader. If there were uses of the values produced by
-/// these instructions that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values. Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
- BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap,
- SmallVectorImpl<PHINode*> *InsertedPHIs) {
- // Remove PHI node entries that are no longer live.
- BasicBlock::iterator I, E = OrigHeader->end();
- for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
- PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
- // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
- // as necessary.
- SSAUpdater SSA(InsertedPHIs);
- for (I = OrigHeader->begin(); I != E; ++I) {
- Value *OrigHeaderVal = &*I;
-
- // If there are no uses of the value (e.g. because it returns void), there
- // is nothing to rewrite.
- if (OrigHeaderVal->use_empty())
- continue;
-
- Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
- // The value now exists in two versions: the initial value in the preheader
- // and the loop "next" value in the original header.
- SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
- SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
- SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
- // Visit each use of the OrigHeader instruction.
- for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
- UE = OrigHeaderVal->use_end();
- UI != UE;) {
- // Grab the use before incrementing the iterator.
- Use &U = *UI;
-
- // Increment the iterator before removing the use from the list.
- ++UI;
-
- // SSAUpdater can't handle a non-PHI use in the same block as an
- // earlier def. We can easily handle those cases manually.
- Instruction *UserInst = cast<Instruction>(U.getUser());
- if (!isa<PHINode>(UserInst)) {
- BasicBlock *UserBB = UserInst->getParent();
-
- // The original users in the OrigHeader are already using the
- // original definitions.
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped.
- if (UserBB == OrigPreheader) {
- U = OrigPreHeaderVal;
- continue;
- }
- }
-
- // Anything else can be handled by SSAUpdater.
- SSA.RewriteUse(U);
- }
-
- // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
- // intrinsics.
- SmallVector<DbgValueInst *, 1> DbgValues;
- llvm::findDbgValues(DbgValues, OrigHeaderVal);
- for (auto &DbgValue : DbgValues) {
- // The original users in the OrigHeader are already using the original
- // definitions.
- BasicBlock *UserBB = DbgValue->getParent();
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped and anything else can be handled by
- // the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
- Value *NewVal;
- if (UserBB == OrigPreheader)
- NewVal = OrigPreHeaderVal;
- else if (SSA.HasValueForBlock(UserBB))
- NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
- else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
- DbgValue->setOperand(0,
- MetadataAsValue::get(OrigHeaderVal->getContext(),
- ValueAsMetadata::get(NewVal)));
- }
- }
-}
-
-// Assuming both header and latch are exiting, look for a phi which is only
-// used outside the loop (via a LCSSA phi) in the exit from the header.
-// This means that rotating the loop can remove the phi.
-static bool profitableToRotateLoopExitingLatch(Loop *L) {
- BasicBlock *Header = L->getHeader();
- BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());
- assert(BI && BI->isConditional() && "need header with conditional exit");
- BasicBlock *HeaderExit = BI->getSuccessor(0);
- if (L->contains(HeaderExit))
- HeaderExit = BI->getSuccessor(1);
-
- for (auto &Phi : Header->phis()) {
- // Look for uses of this phi in the loop/via exits other than the header.
- if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) {
- return cast<Instruction>(U)->getParent() != HeaderExit;
- }))
- continue;
- return true;
- }
- return false;
-}
-
-// Check that latch exit is deoptimizing (which means - very unlikely to happen)
-// and there is another exit from the loop which is non-deoptimizing.
-// If we rotate latch to that exit our loop has a better chance of being fully
-// canonical.
-//
-// It can give false positives in some rare cases.
-static bool canRotateDeoptimizingLatchExit(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "need latch");
- BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
- // Need normal exiting latch.
- if (!BI || !BI->isConditional())
- return false;
-
- BasicBlock *Exit = BI->getSuccessor(1);
- if (L->contains(Exit))
- Exit = BI->getSuccessor(0);
-
- // Latch exit is non-deoptimizing, no need to rotate.
- if (!Exit->getPostdominatingDeoptimizeCall())
- return false;
-
- SmallVector<BasicBlock *, 4> Exits;
- L->getUniqueExitBlocks(Exits);
- if (!Exits.empty()) {
- // There is at least one non-deoptimizing exit.
- //
- // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
- // as it can conservatively return false for deoptimizing exits with
- // complex enough control flow down to deoptimize call.
- //
- // That means here we can report success for a case where
- // all exits are deoptimizing but one of them has complex enough
- // control flow (e.g. with loops).
- //
- // That should be a very rare case and false positives for this function
- // have compile-time effect only.
- return any_of(Exits, [](const BasicBlock *BB) {
- return !BB->getPostdominatingDeoptimizeCall();
- });
- }
- return false;
-}
-
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-///
-/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
-/// so as to reach a suitable (non-deoptimizing) exit.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
- // If the loop has only one block then there is not much to rotate.
- if (L->getBlocks().size() == 1)
- return false;
-
- bool Rotated = false;
- do {
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return Rotated;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
- return Rotated;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
- return Rotated;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified. Or if we think it will be profitable.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
- !profitableToRotateLoopExitingLatch(L) &&
- !canRotateDeoptimizingLatchExit(L))
- return Rotated;
-
- // Check size of original header and reject loop if it is very big or we can't
- // duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
+ bool processLoop(Loop *L);
+
+private:
+ bool rotateLoop(Loop *L, bool SimplifiedLatch);
+ bool simplifyLoopLatch(Loop *L);
+};
+} // end anonymous namespace
+
+/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not
+/// previously exist in the map, and the value was inserted.
+static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) {
+ bool Inserted = VM.insert({K, V}).second;
+ assert(Inserted);
+ (void)Inserted;
+}
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader. If there were uses of the values produced by
+/// these instructions that were outside of the loop, we have to insert PHI nodes
+/// to merge the two values. Do this now.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+ BasicBlock *OrigPreheader,
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
+ // Remove PHI node entries that are no longer live.
+ BasicBlock::iterator I, E = OrigHeader->end();
+ for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+ // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+ // as necessary.
+ SSAUpdater SSA(InsertedPHIs);
+ for (I = OrigHeader->begin(); I != E; ++I) {
+ Value *OrigHeaderVal = &*I;
+
+ // If there are no uses of the value (e.g. because it returns void), there
+ // is nothing to rewrite.
+ if (OrigHeaderVal->use_empty())
+ continue;
+
+ Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
+
+ // The value now exists in two versions: the initial value in the preheader
+ // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+ SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+ // Visit each use of the OrigHeader instruction.
+ for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+ UE = OrigHeaderVal->use_end();
+ UI != UE;) {
+ // Grab the use before incrementing the iterator.
+ Use &U = *UI;
+
+ // Increment the iterator before removing the use from the list.
+ ++UI;
+
+ // SSAUpdater can't handle a non-PHI use in the same block as an
+ // earlier def. We can easily handle those cases manually.
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ if (!isa<PHINode>(UserInst)) {
+ BasicBlock *UserBB = UserInst->getParent();
+
+ // The original users in the OrigHeader are already using the
+ // original definitions.
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped.
+ if (UserBB == OrigPreheader) {
+ U = OrigPreHeaderVal;
+ continue;
+ }
+ }
+
+ // Anything else can be handled by SSAUpdater.
+ SSA.RewriteUse(U);
+ }
+
+ // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
+ // intrinsics.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigHeaderVal);
+ for (auto &DbgValue : DbgValues) {
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = DbgValue->getParent();
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB, if not substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ DbgValue->setOperand(0,
+ MetadataAsValue::get(OrigHeaderVal->getContext(),
+ ValueAsMetadata::get(NewVal)));
+ }
+ }
+}
+
+// Assuming both header and latch are exiting, look for a phi which is only
+// used outside the loop (via a LCSSA phi) in the exit from the header.
+// This means that rotating the loop can remove the phi.
+static bool profitableToRotateLoopExitingLatch(Loop *L) {
+ BasicBlock *Header = L->getHeader();
+ BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());
+ assert(BI && BI->isConditional() && "need header with conditional exit");
+ BasicBlock *HeaderExit = BI->getSuccessor(0);
+ if (L->contains(HeaderExit))
+ HeaderExit = BI->getSuccessor(1);
+
+ for (auto &Phi : Header->phis()) {
+ // Look for uses of this phi in the loop/via exits other than the header.
+ if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) {
+ return cast<Instruction>(U)->getParent() != HeaderExit;
+ }))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+// Check that latch exit is deoptimizing (which means - very unlikely to happen)
+// and there is another exit from the loop which is non-deoptimizing.
+// If we rotate latch to that exit our loop has a better chance of being fully
+// canonical.
+//
+// It can give false positives in some rare cases.
+static bool canRotateDeoptimizingLatchExit(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "need latch");
+ BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
+ // Need normal exiting latch.
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *Exit = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ Exit = BI->getSuccessor(0);
+
+ // Latch exit is non-deoptimizing, no need to rotate.
+ if (!Exit->getPostdominatingDeoptimizeCall())
+ return false;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L->getUniqueExitBlocks(Exits);
+ if (!Exits.empty()) {
+ // There is at least one non-deoptimizing exit.
+ //
+ // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
+ // as it can conservatively return false for deoptimizing exits with
+ // complex enough control flow down to deoptimize call.
+ //
+ // That means here we can report success for a case where
+ // all exits are deoptimizing but one of them has complex enough
+ // control flow (e.g. with loops).
+ //
+ // That should be a very rare case and false positives for this function
+ // have compile-time effect only.
+ return any_of(Exits, [](const BasicBlock *BB) {
+ return !BB->getPostdominatingDeoptimizeCall();
+ });
+ }
+ return false;
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
+///
+/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
+/// so as to reach a suitable (non-deoptimizing) exit.
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+ // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ bool Rotated = false;
+ do {
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return Rotated;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return Rotated;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return Rotated;
+
+ // Rotate if either the loop latch does *not* exit the loop, or if the loop
+ // latch was just simplified. Or if we think it will be profitable.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
+ !profitableToRotateLoopExitingLatch(L) &&
+ !canRotateDeoptimizingLatchExit(L))
+ return Rotated;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO);
- if (Metrics.notDuplicatable) {
- LLVM_DEBUG(
- dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.convergent) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.NumInsts > MaxHeaderSize) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
- << Metrics.NumInsts
- << " instructions, which is more than the threshold ("
- << MaxHeaderSize << " instructions): ";
- L->dump());
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(
+ dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
+ return Rotated;
+ }
+ if (Metrics.convergent) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
+ return Rotated;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
+ << Metrics.NumInsts
+ << " instructions, which is more than the threshold ("
+ << MaxHeaderSize << " instructions): ";
+ L->dump());
++NumNotRotatedDueToHeaderSize;
- return Rotated;
- }
+ return Rotated;
+ }
// When preparing for LTO, avoid rotating loops with calls that could be
// inlined during the LTO stage.
if (PrepareForLTO && Metrics.NumInlineCandidates > 0)
return Rotated;
- }
-
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
-
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader || !L->hasDedicatedExits())
- return Rotated;
-
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated. We should also invalidate
- // all outer loops because insertion and deletion of blocks that happens
- // during the rotation may violate invariants related to backedge taken
- // infos in them.
- if (SE)
- SE->forgetTopmostLoop(L);
-
- LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Find the new loop header. NewHeader is the header's one and only
- // successor that is inside the loop; the header's other successor is outside
- // the loop. Otherwise the loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- if (L->contains(Exit))
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap, ValueMapMSSA;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- InsertNewValueIntoMap(ValueMap, PN,
- PN->getIncomingValueForBlock(OrigPreheader));
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
- using DbgIntrinsicHash =
- std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
- return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
- };
- SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
- for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
- I != E; ++I) {
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
- DbgIntrinsics.insert(makeHash(DII));
- else
- break;
- }
-
+ }
+
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader || !L->hasDedicatedExits())
+ return Rotated;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated. We should also invalidate
+ // all outer loops because insertion and deletion of blocks that happens
+ // during the rotation may violate invariants related to backedge taken
+ // infos in them.
+ if (SE)
+ SE->forgetTopmostLoop(L);
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Find the new loop header. NewHeader is the header's one and only
+ // successor that is inside the loop; the header's other successor is outside
+ // the loop. Otherwise the loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap, ValueMapMSSA;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ InsertNewValueIntoMap(ValueMap, PN,
+ PN->getIncomingValueForBlock(OrigPreheader));
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
+ using DbgIntrinsicHash =
+ std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
+ return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+ };
+ SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+ for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+ I != E; ++I) {
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
+ DbgIntrinsics.insert(makeHash(DII));
+ else
+ break;
+ }
+
// Remember the local noalias scope declarations in the header. After the
// rotation, they must be duplicated and the scope must be cloned. This
// avoids unwanted interaction across iterations.
@@ -409,66 +409,66 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
NoAliasDeclInstructions.push_back(Decl);
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
- !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
- Inst->moveBefore(LoopEntryBranch);
- continue;
- }
-
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
-
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Avoid inserting the same intrinsic twice.
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
- if (DbgIntrinsics.count(makeHash(DII))) {
- C->deleteValue();
- continue;
- }
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifiable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = SimplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- InsertNewValueIntoMap(ValueMap, Inst, V);
- if (!C->mayHaveSideEffects()) {
- C->deleteValue();
- C = nullptr;
- }
- } else {
- InsertNewValueIntoMap(ValueMap, Inst, C);
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
- C->insertBefore(LoopEntryBranch);
-
- if (auto *II = dyn_cast<IntrinsicInst>(C))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- // MemorySSA cares whether the cloned instruction was inserted or not, and
- // not whether it can be remapped to a simplified value.
- if (MSSAU)
- InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
- }
- }
-
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
+
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Avoid inserting the same intrinsic twice.
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
+ if (DbgIntrinsics.count(makeHash(DII))) {
+ C->deleteValue();
+ continue;
+ }
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = SimplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ InsertNewValueIntoMap(ValueMap, Inst, V);
+ if (!C->mayHaveSideEffects()) {
+ C->deleteValue();
+ C = nullptr;
+ }
+ } else {
+ InsertNewValueIntoMap(ValueMap, Inst, C);
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ // MemorySSA cares whether the cloned instruction was inserted or not, and
+ // not whether it can be remapped to a simplified value.
+ if (MSSAU)
+ InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+ }
+ }
+
if (!NoAliasDeclInstructions.empty()) {
// There are noalias scope declarations:
// (general):
@@ -532,300 +532,300 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
}
}
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- for (BasicBlock *SuccBB : successors(OrigHeader))
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
-
- // Update MemorySSA before the rewrite call below changes the 1:1
- // instruction:cloned_instruction_or_value mapping.
- if (MSSAU) {
- InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
- MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
- ValueMapMSSA);
- }
-
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
- &InsertedPHIs);
-
- // Attach dbg.value intrinsics to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
- // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
- // the DT about the edge to the OrigHeader that got removed.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
- Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
- Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
-
- if (MSSAU) {
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ for (BasicBlock *SuccBB : successors(OrigHeader))
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+
+ // Update MemorySSA before the rewrite call below changes the 1:1
+ // instruction:cloned_instruction_or_value mapping.
+ if (MSSAU) {
+ InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
+ MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
+ ValueMapMSSA);
+ }
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+ // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
+ // the DT about the edge to the OrigHeader that got removed.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+ Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+
+ if (MSSAU) {
MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
} else {
DT->applyUpdates(Updates);
- }
- }
-
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
- // then we fold away the cond branch to an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- if (!isa<ConstantInt>(PHBI->getCondition()) ||
- PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
- NewHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit) ||
- ExitPred->getTerminator()->isIndirectTerminator())
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
- }
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- } else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
-
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
-
- // Update MSSA too, if available.
- if (MSSAU)
- MSSAU->removeEdge(OrigPreheader, Exit);
- }
-
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ }
+ }
+
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit) ||
+ ExitPred->getTerminator()->isIndirectTerminator())
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) DT->deleteEdge(OrigPreheader, Exit);
+
+ // Update MSSA too, if available.
+ if (MSSAU)
+ MSSAU->removeEdge(OrigPreheader, Exit);
+ }
+
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
if (DidMerge)
RemoveRedundantDbgInstrs(PredBB);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
-
- ++NumRotated;
-
- Rotated = true;
- SimplifiedLatch = false;
-
- // Check whether the new latch is a deoptimizing exit and, if so, repeat the rotation.
- // A deoptimizing latch exit is not the typical case, so we simply loop again.
- // TODO: if it becomes a performance bottleneck extend rotation algorithm
- // to handle multiple rotations in one go.
- } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
-
-
- return true;
-}
-
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
- BasicBlock::iterator End, Loop *L) {
- bool seenIncrement = false;
- bool MultiExitLoop = false;
-
- if (!L->getExitingBlock())
- MultiExitLoop = true;
-
- for (BasicBlock::iterator I = Begin; I != End; ++I) {
-
- if (!isSafeToSpeculativelyExecute(&*I))
- return false;
-
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::GetElementPtr:
- // GEPs are cheap if all indices are constant.
- if (!cast<GEPOperator>(I)->hasAllConstantIndices())
- return false;
- // fall-thru to increment case
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr: {
- Value *IVOpnd =
- !isa<Constant>(I->getOperand(0))
- ? I->getOperand(0)
- : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
- if (!IVOpnd)
- return false;
-
- // If increment operand is used outside of the loop, this speculation
- // could cause extra live range interference.
- if (MultiExitLoop) {
- for (User *UseI : IVOpnd->users()) {
- auto *UserInst = cast<Instruction>(UseI);
- if (!L->contains(UserInst))
- return false;
- }
- }
-
- if (seenIncrement)
- return false;
- seenIncrement = true;
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // ignore type conversions
- break;
- }
- }
- return true;
-}
-
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- if (!Latch || Latch->hasAddressTaken())
- return false;
-
- BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!Jmp || !Jmp->isUnconditional())
- return false;
-
- BasicBlock *LastExit = Latch->getSinglePredecessor();
- if (!LastExit || !L->isLoopExiting(LastExit))
- return false;
-
- BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
- if (!BI)
- return false;
-
- if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
- return false;
-
- LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
- << LastExit->getName() << "\n");
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr,
- /*PredecessorWithTwoSuccessors=*/true);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- return true;
-}
-
-/// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
- // Save the loop metadata.
- MDNode *LoopMD = L->getLoopID();
-
- bool SimplifiedLatch = false;
-
- // Simplify the loop latch before attempting to rotate the header
- // upward. Rotation may not be needed if the loop tail can be folded into the
- // loop exit.
- if (!RotationOnly)
- SimplifiedLatch = simplifyLoopLatch(L);
-
- bool MadeChange = rotateLoop(L, SimplifiedLatch);
- assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
- "Loop latch should be exiting after loop-rotate.");
-
- // Restore the loop metadata.
- // NB! We presume LoopRotation DOESN'T ADD its own metadata.
- if ((MadeChange || SimplifiedLatch) && LoopMD)
- L->setLoopID(LoopMD);
-
- return MadeChange || SimplifiedLatch;
-}
-
-
-/// The utility to convert a loop into a loop with bottom test.
-bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
- AssumptionCache *AC, DominatorTree *DT,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
- const SimplifyQuery &SQ, bool RotationOnly = true,
- unsigned Threshold = unsigned(-1),
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+ ++NumRotated;
+
+ Rotated = true;
+ SimplifiedLatch = false;
+
+ // Check whether the new latch is a deoptimizing exit and, if so, repeat the rotation.
+ // A deoptimizing latch exit is not the typical case, so we simply loop again.
+ // TODO: if it becomes a performance bottleneck extend rotation algorithm
+ // to handle multiple rotations in one go.
+ } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
+
+
+ return true;
+}
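+
+// Illustrative sketch (schematic, not taken from a test case). Before
+// rotation the loop is top-tested:
+//
+//   preheader:        br label %header
+//   header:           ...  br i1 %cond, label %body, label %exit
+//   body ... latch:   br label %header
+//
+// After rotation the header's code and exiting branch are duplicated into the
+// preheader (acting as a guard), %body becomes the new header, and the old
+// header ends up as the bottom-tested, exiting latch.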
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+ BasicBlock::iterator End, Loop *L) {
+ bool seenIncrement = false;
+ bool MultiExitLoop = false;
+
+ if (!L->getExitingBlock())
+ MultiExitLoop = true;
+
+ for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+ if (!isSafeToSpeculativelyExecute(&*I))
+ return false;
+
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::GetElementPtr:
+ // GEPs are cheap if all indices are constant.
+ if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+ return false;
+ // fall-thru to increment case
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr: {
+ Value *IVOpnd =
+ !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
+ if (!IVOpnd)
+ return false;
+
+ // If increment operand is used outside of the loop, this speculation
+ // could cause extra live range interference.
+ if (MultiExitLoop) {
+ for (User *UseI : IVOpnd->users()) {
+ auto *UserInst = cast<Instruction>(UseI);
+ if (!L->contains(UserInst))
+ return false;
+ }
+ }
+
+ if (seenIncrement)
+ return false;
+ seenIncrement = true;
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // ignore type conversions
+ break;
+ }
+ }
+ return true;
+}
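+
+// Example (schematic): a latch consisting of a single post-increment, e.g.
+//
+//   latch:
+//     %iv.next = add nuw nsw i64 %iv, 1
+//     br label %header
+//
+// passes this check. Truncs/extends are ignored and a constant-index GEP may
+// stand in for the arithmetic op, but a load, a call, or a second
+// increment-like instruction causes the range to be rejected.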
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || Latch->hasAddressTaken())
+ return false;
+
+ BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!Jmp || !Jmp->isUnconditional())
+ return false;
+
+ BasicBlock *LastExit = Latch->getSinglePredecessor();
+ if (!LastExit || !L->isLoopExiting(LastExit))
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+ if (!BI)
+ return false;
+
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+ << LastExit->getName() << "\n");
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr,
+ /*PredecessorWithTwoSuccessors=*/true);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ return true;
+}
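+
+// Illustrative sketch (schematic): for a two-block loop
+//
+//   header:  ...  br i1 %cond, label %latch, label %exit
+//   latch:   %iv.next = add i64 %iv, 1
+//            br label %header
+//
+// the latch is merged into the exiting block, so the increment is speculated
+// on the exit path and the loop collapses to a single block that both exits
+// and latches.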
+
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
+ // Save the loop metadata.
+ MDNode *LoopMD = L->getLoopID();
+
+ bool SimplifiedLatch = false;
+
+ // Simplify the loop latch before attempting to rotate the header
+ // upward. Rotation may not be needed if the loop tail can be folded into the
+ // loop exit.
+ if (!RotationOnly)
+ SimplifiedLatch = simplifyLoopLatch(L);
+
+ bool MadeChange = rotateLoop(L, SimplifiedLatch);
+ assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+ "Loop latch should be exiting after loop-rotate.");
+
+ // Restore the loop metadata.
+ // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+ if ((MadeChange || SimplifiedLatch) && LoopMD)
+ L->setLoopID(LoopMD);
+
+ return MadeChange || SimplifiedLatch;
+}
+
+
+/// The utility to convert a loop into a loop with bottom test.
+bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
+ AssumptionCache *AC, DominatorTree *DT,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+ const SimplifyQuery &SQ, bool RotationOnly = true,
+ unsigned Threshold = unsigned(-1),
bool IsUtilMode = true, bool PrepareForLTO) {
- LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
+ LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
IsUtilMode, PrepareForLTO);
- return LR.processLoop(L);
-}
+ return LR.processLoop(L);
+}
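+
+// Usage sketch (schematic; the analysis objects are assumed to be provided by
+// the caller, and SE/MSSAU may be null when unavailable):
+//
+//   bool Changed = llvm::LoopRotation(L, &LI, &TTI, &AC, &DT, &SE, &MSSAU, SQ,
+//                                     /*RotationOnly=*/false, /*Threshold=*/16,
+//                                     /*IsUtilMode=*/false,
+//                                     /*PrepareForLTO=*/false);
+//
+// With the default IsUtilMode=true and Threshold=unsigned(-1), rotation is
+// attempted even when the latch already exits and without a practical
+// header-size limit.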
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp
index bb724747a2..2e104334ad 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp
@@ -1,946 +1,946 @@
-//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs several transformations to transform natural loops into a
-// simpler form, which makes subsequent analyses and transformations simpler and
-// more effective.
-//
-// Loop pre-header insertion guarantees that there is a single, non-critical
-// entry edge from outside of the loop to the loop header. This simplifies a
-// number of analyses and transformations, such as LICM.
-//
-// Loop exit-block insertion guarantees that all exit blocks from the loop
-// (blocks which are outside of the loop that have predecessors inside of the
-// loop) only have predecessors from inside of the loop (and are thus dominated
-// by the loop header). This simplifies transformations such as store-sinking
-// that are built into LICM.
-//
-// This pass also guarantees that loops will have exactly one backedge.
-//
-// Indirectbr instructions introduce several complications. If the loop
-// contains or is entered by an indirectbr instruction, it may not be possible
-// to transform the loop and make these guarantees. Client code should check
-// that these conditions are true before relying on them.
-//
-// Similar complications arise from callbr instructions, particularly in
-// asm-goto where blockaddress expressions are used.
-//
-// Note that the simplifycfg pass will clean up blocks which are split out but
-// end up being unnecessary, so usage of this pass should not pessimize
-// generated code.
-//
-// This pass obviously modifies the CFG, but updates loop information and
-// dominator information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-simplify"
-
-STATISTIC(NumNested , "Number of nested loops split out");
-
-// If the new block is not already there, move it to right after one of the
-// 'outside' blocks. This prevents the preheader from being placed inside the
-// loop body, e.g. when the loop hasn't been rotated.
-static void placeSplitBlockCarefully(BasicBlock *NewBB,
- SmallVectorImpl<BasicBlock *> &SplitPreds,
- Loop *L) {
- // Check to see if NewBB is already well placed.
- Function::iterator BBI = --NewBB->getIterator();
- for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
- if (&*BBI == SplitPreds[i])
- return;
- }
-
- // If it isn't already after an outside block, move it after one. This is
- // always good as it makes the uncond branch from the outside block into a
- // fall-through.
-
- // Figure out *which* outside block to put this after. Prefer an outside
- // block that neighbors a BB actually in the loop.
- BasicBlock *FoundBB = nullptr;
- for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
- Function::iterator BBI = SplitPreds[i]->getIterator();
- if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) {
- FoundBB = SplitPreds[i];
- break;
- }
- }
-
- // If our heuristic for a *good* bb to place this after doesn't find
- // anything, just pick something. It's likely better than leaving it within
- // the loop.
- if (!FoundBB)
- FoundBB = SplitPreds[0];
- NewBB->moveAfter(FoundBB);
-}
-
-/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
-/// preheader, this method is called to insert one. This method has two phases:
-/// preheader insertion and analysis updating.
-///
-BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
- LoopInfo *LI, MemorySSAUpdater *MSSAU,
- bool PreserveLCSSA) {
- BasicBlock *Header = L->getHeader();
-
- // Compute the set of predecessors of the loop that are not in the loop.
- SmallVector<BasicBlock*, 8> OutsideBlocks;
- for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
- PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if (!L->contains(P)) { // Coming in from outside the loop?
- // If the loop is branched to from an indirect terminator, we won't
- // be able to fully transform the loop, because it prohibits
- // edge splitting.
- if (P->getTerminator()->isIndirectTerminator())
- return nullptr;
-
- // Keep track of it.
- OutsideBlocks.push_back(P);
- }
- }
-
- // Split out the loop pre-header.
- BasicBlock *PreheaderBB;
- PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT,
- LI, MSSAU, PreserveLCSSA);
- if (!PreheaderBB)
- return nullptr;
-
- LLVM_DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
- << PreheaderBB->getName() << "\n");
-
- // Make sure that NewBB is put someplace intelligent, which doesn't mess up
- // code layout too horribly.
- placeSplitBlockCarefully(PreheaderBB, OutsideBlocks, L);
-
- return PreheaderBB;
-}
-
-/// Add the specified block, and all of its predecessors, to the specified set,
-/// if it's not already in there. Stop predecessor traversal when we reach
-/// StopBlock.
-static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
+//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs several transformations to transform natural loops into a
+// simpler form, which makes subsequent analyses and transformations simpler and
+// more effective.
+//
+// Loop pre-header insertion guarantees that there is a single, non-critical
+// entry edge from outside of the loop to the loop header. This simplifies a
+// number of analyses and transformations, such as LICM.
+//
+// Loop exit-block insertion guarantees that all exit blocks from the loop
+// (blocks which are outside of the loop that have predecessors inside of the
+// loop) only have predecessors from inside of the loop (and are thus dominated
+// by the loop header). This simplifies transformations such as store-sinking
+// that are built into LICM.
+//
+// This pass also guarantees that loops will have exactly one backedge.
+//
+// Indirectbr instructions introduce several complications. If the loop
+// contains or is entered by an indirectbr instruction, it may not be possible
+// to transform the loop and make these guarantees. Client code should check
+// that these conditions are true before relying on them.
+//
+// Similar complications arise from callbr instructions, particularly in
+// asm-goto where blockaddress expressions are used.
+//
+// Note that the simplifycfg pass will clean up blocks which are split out but
+// end up being unnecessary, so usage of this pass should not pessimize
+// generated code.
+//
+// This pass obviously modifies the CFG, but updates loop information and
+// dominator information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplify"
+
+STATISTIC(NumNested , "Number of nested loops split out");
+
+// If the new block is not already there, move it to right after one of the
+// 'outside' blocks. This prevents the preheader from being placed inside the
+// loop body, e.g. when the loop hasn't been rotated.
+static void placeSplitBlockCarefully(BasicBlock *NewBB,
+ SmallVectorImpl<BasicBlock *> &SplitPreds,
+ Loop *L) {
+ // Check to see if NewBB is already well placed.
+ Function::iterator BBI = --NewBB->getIterator();
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ if (&*BBI == SplitPreds[i])
+ return;
+ }
+
+ // If it isn't already after an outside block, move it after one. This is
+ // always good as it makes the uncond branch from the outside block into a
+ // fall-through.
+
+ // Figure out *which* outside block to put this after. Prefer an outside
+ // block that neighbors a BB actually in the loop.
+ BasicBlock *FoundBB = nullptr;
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ Function::iterator BBI = SplitPreds[i]->getIterator();
+ if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) {
+ FoundBB = SplitPreds[i];
+ break;
+ }
+ }
+
+ // If our heuristic for a *good* bb to place this after doesn't find
+ // anything, just pick something. It's likely better than leaving it within
+ // the loop.
+ if (!FoundBB)
+ FoundBB = SplitPreds[0];
+ NewBB->moveAfter(FoundBB);
+}
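+
+// Example (schematic): given the layout
+//
+//   outside.pred, loop.header, loop.body, newbb
+//
+// where outside.pred now branches to newbb, the block just before newbb is a
+// loop block, so newbb is moved to follow outside.pred:
+//
+//   outside.pred, newbb, loop.header, loop.body
+//
+// turning outside.pred's unconditional branch into a fall-through.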
+
+/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
+/// preheader, this method is called to insert one. This method has two phases:
+/// preheader insertion and analysis updating.
+///
+BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
+ LoopInfo *LI, MemorySSAUpdater *MSSAU,
+ bool PreserveLCSSA) {
+ BasicBlock *Header = L->getHeader();
+
+ // Compute the set of predecessors of the loop that are not in the loop.
+ SmallVector<BasicBlock*, 8> OutsideBlocks;
+ for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+ PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (!L->contains(P)) { // Coming in from outside the loop?
+ // If the loop is branched to from an indirect terminator, we won't
+ // be able to fully transform the loop, because it prohibits
+ // edge splitting.
+ if (P->getTerminator()->isIndirectTerminator())
+ return nullptr;
+
+ // Keep track of it.
+ OutsideBlocks.push_back(P);
+ }
+ }
+
+ // Split out the loop pre-header.
+ BasicBlock *PreheaderBB;
+ PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT,
+ LI, MSSAU, PreserveLCSSA);
+ if (!PreheaderBB)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
+ << PreheaderBB->getName() << "\n");
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ placeSplitBlockCarefully(PreheaderBB, OutsideBlocks, L);
+
+ return PreheaderBB;
+}
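+
+// Usage sketch (schematic; DT/LI assumed available, MSSAU may be null):
+//
+//   if (!L->getLoopPreheader())
+//     if (BasicBlock *PH = InsertPreheaderForLoop(L, DT, LI, /*MSSAU=*/nullptr,
+//                                                 /*PreserveLCSSA=*/true))
+//       LLVM_DEBUG(dbgs() << "created preheader " << PH->getName() << "\n");
+//
+// A null result means the preheader could not be created, e.g. because a
+// predecessor ends in an indirect terminator that prohibits edge splitting.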
+
+/// Add the specified block, and all of its predecessors, to the specified set,
+/// if it's not already in there. Stop predecessor traversal when we reach
+/// StopBlock.
+static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
SmallPtrSetImpl<BasicBlock *> &Blocks) {
- SmallVector<BasicBlock *, 8> Worklist;
- Worklist.push_back(InputBB);
- do {
- BasicBlock *BB = Worklist.pop_back_val();
- if (Blocks.insert(BB).second && BB != StopBlock)
- // If BB is not already processed and it is not a stop block then
- // insert its predecessor in the work list
+ SmallVector<BasicBlock *, 8> Worklist;
+ Worklist.push_back(InputBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ if (Blocks.insert(BB).second && BB != StopBlock)
+ // If BB is not already processed and it is not a stop block then
+ // insert its predecessor in the work list
append_range(Worklist, predecessors(BB));
- } while (!Worklist.empty());
-}
-
-/// The first part of loop-nestification is to find a PHI node that tells
-/// us how to partition the loops.
-static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
- AssumptionCache *AC) {
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
- PHINode *PN = cast<PHINode>(I);
- ++I;
- if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
- // This is a degenerate PHI already, don't modify it!
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
- continue;
- }
-
- // Scan this PHI node looking for a use of the PHI node by itself.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == PN &&
- L->contains(PN->getIncomingBlock(i)))
- // We found something tasty to remove.
- return PN;
- }
- return nullptr;
-}
-
-/// If this loop has multiple backedges, try to pull one of them out into
-/// a nested loop.
-///
-/// This is important for code that looks like
-/// this:
-///
-/// Loop:
-/// ...
-/// br cond, Loop, Next
-/// ...
-/// br cond2, Loop, Out
-///
-/// To identify this common case, we look at the PHI nodes in the header of the
-/// loop. PHI nodes with unchanging values on one backedge correspond to values
-/// that change in the "outer" loop, but not in the "inner" loop.
-///
-/// If we are able to separate out a loop, return the new outer loop that was
-/// created.
-///
-static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
- DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, bool PreserveLCSSA,
- AssumptionCache *AC, MemorySSAUpdater *MSSAU) {
- // Don't try to separate loops without a preheader.
- if (!Preheader)
- return nullptr;
-
- // Treat the presence of convergent functions conservatively. The
- // transformation is invalid if calls to certain convergent
- // functions (like an AMDGPU barrier) get included in the resulting
- // inner loop. But blocks meant for the inner loop will be
- // identified later at a point where it's too late to abort the
- // transformation. Also, the convergent attribute is not really
- // sufficient to express the semantics of functions that are
- // affected by this transformation. So we choose to back off if such
- // a function call is present until a better alternative becomes
- // available. This is similar to the conservative treatment of
- // convergent function calls in GVNHoist and JumpThreading.
- for (auto BB : L->blocks()) {
- for (auto &II : *BB) {
- if (auto CI = dyn_cast<CallBase>(&II)) {
- if (CI->isConvergent()) {
- return nullptr;
- }
- }
- }
- }
-
- // The header is not a landing pad; preheader insertion should ensure this.
- BasicBlock *Header = L->getHeader();
- assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
-
- PHINode *PN = findPHIToPartitionLoops(L, DT, AC);
- if (!PN) return nullptr; // No known way to partition.
-
- // Pull out all predecessors that have varying values in the loop. This
- // handles the case when a PHI node has multiple instances of itself as
- // arguments.
- SmallVector<BasicBlock*, 8> OuterLoopPreds;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- if (PN->getIncomingValue(i) != PN ||
- !L->contains(PN->getIncomingBlock(i))) {
- // We can't split indirect control flow edges.
- if (PN->getIncomingBlock(i)->getTerminator()->isIndirectTerminator())
- return nullptr;
- OuterLoopPreds.push_back(PN->getIncomingBlock(i));
- }
- }
- LLVM_DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
-
- // If ScalarEvolution is around and knows anything about values in
- // this loop, tell it to forget them, because we're about to
- // substantially change it.
- if (SE)
- SE->forgetLoop(L);
-
- BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer",
- DT, LI, MSSAU, PreserveLCSSA);
-
- // Make sure that NewBB is put someplace intelligent, which doesn't mess up
- // code layout too horribly.
- placeSplitBlockCarefully(NewBB, OuterLoopPreds, L);
-
- // Create the new outer loop.
- Loop *NewOuter = LI->AllocateLoop();
-
- // Change the parent loop to use the outer loop as its child now.
- if (Loop *Parent = L->getParentLoop())
- Parent->replaceChildLoopWith(L, NewOuter);
- else
- LI->changeTopLevelLoop(L, NewOuter);
-
- // L is now a subloop of our outer loop.
- NewOuter->addChildLoop(L);
-
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I)
- NewOuter->addBlockEntry(*I);
-
- // Now reset the header in L, which had been moved by
- // SplitBlockPredecessors for the outer loop.
- L->moveToHeader(Header);
-
- // Determine which blocks should stay in L and which should be moved out to
- // the Outer loop now.
+ } while (!Worklist.empty());
+}
+
+/// The first part of loop-nestification is to find a PHI node that tells
+/// us how to partition the loops.
+static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
+ AssumptionCache *AC) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I);
+ ++I;
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
+ // This is a degenerate PHI already, don't modify it!
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Scan this PHI node looking for a use of the PHI node by itself.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == PN &&
+ L->contains(PN->getIncomingBlock(i)))
+ // We found something tasty to remove.
+ return PN;
+ }
+ return nullptr;
+}
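+
+// Example (schematic): in a header such as
+//
+//   header:
+//     %x = phi i32 [ 0, %preheader ], [ %x, %latch1 ], [ %x.next, %latch2 ]
+//
+// %x is returned: it is unchanged along the %latch1 backedge but varies along
+// %latch2, so %latch1 can stay as the inner loop's backedge while %latch2 is
+// redirected to the new outer loop.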
+
+/// If this loop has multiple backedges, try to pull one of them out into
+/// a nested loop.
+///
+/// This is important for code that looks like
+/// this:
+///
+/// Loop:
+/// ...
+/// br cond, Loop, Next
+/// ...
+/// br cond2, Loop, Out
+///
+/// To identify this common case, we look at the PHI nodes in the header of the
+/// loop. PHI nodes with unchanging values on one backedge correspond to values
+/// that change in the "outer" loop, but not in the "inner" loop.
+///
+/// If we are able to separate out a loop, return the new outer loop that was
+/// created.
+///
+static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
+ DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, bool PreserveLCSSA,
+ AssumptionCache *AC, MemorySSAUpdater *MSSAU) {
+ // Don't try to separate loops without a preheader.
+ if (!Preheader)
+ return nullptr;
+
+ // Treat the presence of convergent functions conservatively. The
+ // transformation is invalid if calls to certain convergent
+ // functions (like an AMDGPU barrier) get included in the resulting
+ // inner loop. But blocks meant for the inner loop will be
+ // identified later at a point where it's too late to abort the
+ // transformation. Also, the convergent attribute is not really
+ // sufficient to express the semantics of functions that are
+ // affected by this transformation. So we choose to back off if such
+ // a function call is present until a better alternative becomes
+ // available. This is similar to the conservative treatment of
+ // convergent function calls in GVNHoist and JumpThreading.
+ for (auto BB : L->blocks()) {
+ for (auto &II : *BB) {
+ if (auto CI = dyn_cast<CallBase>(&II)) {
+ if (CI->isConvergent()) {
+ return nullptr;
+ }
+ }
+ }
+ }
+
+ // The header is not a landing pad; preheader insertion should ensure this.
+ BasicBlock *Header = L->getHeader();
+ assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
+
+ PHINode *PN = findPHIToPartitionLoops(L, DT, AC);
+ if (!PN) return nullptr; // No known way to partition.
+
+ // Pull out all predecessors that have varying values in the loop. This
+ // handles the case when a PHI node has multiple instances of itself as
+ // arguments.
+ SmallVector<BasicBlock*, 8> OuterLoopPreds;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (PN->getIncomingValue(i) != PN ||
+ !L->contains(PN->getIncomingBlock(i))) {
+ // We can't split indirect control flow edges.
+ if (PN->getIncomingBlock(i)->getTerminator()->isIndirectTerminator())
+ return nullptr;
+ OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+ }
+ }
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
+
+ // If ScalarEvolution is around and knows anything about values in
+ // this loop, tell it to forget them, because we're about to
+ // substantially change it.
+ if (SE)
+ SE->forgetLoop(L);
+
+ BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer",
+ DT, LI, MSSAU, PreserveLCSSA);
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ placeSplitBlockCarefully(NewBB, OuterLoopPreds, L);
+
+ // Create the new outer loop.
+ Loop *NewOuter = LI->AllocateLoop();
+
+ // Change the parent loop to use the outer loop as its child now.
+ if (Loop *Parent = L->getParentLoop())
+ Parent->replaceChildLoopWith(L, NewOuter);
+ else
+ LI->changeTopLevelLoop(L, NewOuter);
+
+ // L is now a subloop of our outer loop.
+ NewOuter->addChildLoop(L);
+
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ NewOuter->addBlockEntry(*I);
+
+ // Now reset the header in L, which had been moved by
+ // SplitBlockPredecessors for the outer loop.
+ L->moveToHeader(Header);
+
+ // Determine which blocks should stay in L and which should be moved out to
+ // the Outer loop now.
SmallPtrSet<BasicBlock *, 4> BlocksInL;
for (BasicBlock *P : predecessors(Header)) {
- if (DT->dominates(Header, P))
- addBlockAndPredsToSet(P, Header, BlocksInL);
- }
-
- // Scan all of the loop children of L, moving them to OuterLoop if they are
- // not part of the inner loop.
- const std::vector<Loop*> &SubLoops = L->getSubLoops();
- for (size_t I = 0; I != SubLoops.size(); )
- if (BlocksInL.count(SubLoops[I]->getHeader()))
- ++I; // Loop remains in L
- else
- NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
-
- SmallVector<BasicBlock *, 8> OuterLoopBlocks;
- OuterLoopBlocks.push_back(NewBB);
- // Now that we know which blocks are in L and which need to be moved to
- // OuterLoop, move any blocks that need it.
- for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
- BasicBlock *BB = L->getBlocks()[i];
- if (!BlocksInL.count(BB)) {
- // Move this block to the parent, updating the exit blocks sets
- L->removeBlockFromLoop(BB);
- if ((*LI)[BB] == L) {
- LI->changeLoopFor(BB, NewOuter);
- OuterLoopBlocks.push_back(BB);
- }
- --i;
- }
- }
-
- // Split edges to exit blocks from the inner loop, if they emerged in the
- // process of separating the outer one.
- formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA);
-
- if (PreserveLCSSA) {
- // Fix LCSSA form for L. Some values, which previously were only used inside
- // L, can now be used in NewOuter loop. We need to insert phi-nodes for them
- // in corresponding exit blocks.
- // We don't need to form LCSSA recursively, because there cannot be uses
- // inside a newly created loop of defs from inner loops as those would
- // already be a use of an LCSSA phi node.
- formLCSSA(*L, *DT, LI, SE);
-
- assert(NewOuter->isRecursivelyLCSSAForm(*DT, *LI) &&
- "LCSSA is broken after separating nested loops!");
- }
-
- return NewOuter;
-}
-
-/// This method is called when the specified loop has more than one
-/// backedge in it.
-///
-/// If this occurs, revector all of these backedges to target a new basic block
-/// and have that block branch to the loop header. This ensures that loops
-/// have exactly one backedge.
-static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
- DominatorTree *DT, LoopInfo *LI,
- MemorySSAUpdater *MSSAU) {
- assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
-
- // Get information about the loop
- BasicBlock *Header = L->getHeader();
- Function *F = Header->getParent();
-
- // Unique backedge insertion currently depends on having a preheader.
- if (!Preheader)
- return nullptr;
-
- // The header is not an EH pad; preheader insertion should ensure this.
- assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
-
- // Figure out which basic blocks contain back-edges to the loop header.
- std::vector<BasicBlock*> BackedgeBlocks;
- for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){
- BasicBlock *P = *I;
-
- // Indirect edges cannot be split, so we must fail if we find one.
- if (P->getTerminator()->isIndirectTerminator())
- return nullptr;
-
- if (P != Preheader) BackedgeBlocks.push_back(P);
- }
-
- // Create and insert the new backedge block...
- BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
- Header->getName() + ".backedge", F);
- BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
- BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
- << BEBlock->getName() << "\n");
-
- // Move the new backedge block to right after the last backedge block.
- Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
- F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
-
- // Now that the block has been inserted into the function, create PHI nodes in
- // the backedge block which correspond to any PHI nodes in the header block.
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
- PN->getName()+".be", BETerminator);
-
- // Loop over the PHI node, moving all entries except the one for the
- // preheader over to the new PHI node.
- unsigned PreheaderIdx = ~0U;
- bool HasUniqueIncomingValue = true;
- Value *UniqueValue = nullptr;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *IBB = PN->getIncomingBlock(i);
- Value *IV = PN->getIncomingValue(i);
- if (IBB == Preheader) {
- PreheaderIdx = i;
- } else {
- NewPN->addIncoming(IV, IBB);
- if (HasUniqueIncomingValue) {
- if (!UniqueValue)
- UniqueValue = IV;
- else if (UniqueValue != IV)
- HasUniqueIncomingValue = false;
- }
- }
- }
-
- // Delete all of the incoming values from the old PN except the preheader's
- assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
- if (PreheaderIdx != 0) {
- PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
- PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
- }
- // Nuke all entries except the zero'th.
- for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
- PN->removeIncomingValue(e-i, false);
-
- // Finally, add the newly constructed PHI node as the entry for the BEBlock.
- PN->addIncoming(NewPN, BEBlock);
-
- // As an optimization, if all incoming values in the new PhiNode (which is a
- // subset of the incoming values of the old PHI node) have the same value,
- // eliminate the PHI Node.
- if (HasUniqueIncomingValue) {
- NewPN->replaceAllUsesWith(UniqueValue);
- BEBlock->getInstList().erase(NewPN);
- }
- }
-
- // Now that all of the PHI nodes have been inserted and adjusted, modify the
- // backedge blocks to jump to the BEBlock instead of the header.
- // If one of the backedges has llvm.loop metadata attached, we remove
- // it from the backedge and add it to BEBlock.
- unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
- MDNode *LoopMD = nullptr;
- for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
- Instruction *TI = BackedgeBlocks[i]->getTerminator();
- if (!LoopMD)
- LoopMD = TI->getMetadata(LoopMDKind);
- TI->setMetadata(LoopMDKind, nullptr);
- TI->replaceSuccessorWith(Header, BEBlock);
- }
- BEBlock->getTerminator()->setMetadata(LoopMDKind, LoopMD);
-
- //===--- Update all analyses which we must preserve now -----------------===//
-
- // Update Loop Information - we know that this block is now in the current
- // loop and all parent loops.
- L->addBasicBlockToLoop(BEBlock, *LI);
-
- // Update dominator information
- DT->splitBlock(BEBlock);
-
- if (MSSAU)
- MSSAU->updatePhisWhenInsertingUniqueBackedgeBlock(Header, Preheader,
- BEBlock);
-
- return BEBlock;
-}
-
-/// Simplify one loop and queue further loops for simplification.
-static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
- DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, AssumptionCache *AC,
- MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- bool Changed = false;
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
-ReprocessLoop:
-
- // Check to see that no blocks (other than the header) in this loop have
- // predecessors that are not in the loop. This is not valid for natural
- // loops, but can occur if the blocks are unreachable. Since they are
- // unreachable we can just shamelessly delete those CFG edges!
- for (Loop::block_iterator BB = L->block_begin(), E = L->block_end();
- BB != E; ++BB) {
- if (*BB == L->getHeader()) continue;
-
- SmallPtrSet<BasicBlock*, 4> BadPreds;
- for (pred_iterator PI = pred_begin(*BB),
- PE = pred_end(*BB); PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if (!L->contains(P))
- BadPreds.insert(P);
- }
-
- // Delete each unique out-of-loop (and thus dead) predecessor.
- for (BasicBlock *P : BadPreds) {
-
- LLVM_DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
- << P->getName() << "\n");
-
- // Zap the dead pred's terminator and replace it with unreachable.
- Instruction *TI = P->getTerminator();
- changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA,
- /*DTU=*/nullptr, MSSAU);
- Changed = true;
- }
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // If there are exiting blocks with branches on undef, resolve the undef in
- // the direction which will exit the loop. This will help simplify loop
- // trip count computations.
- SmallVector<BasicBlock*, 8> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- for (BasicBlock *ExitingBlock : ExitingBlocks)
- if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
- if (BI->isConditional()) {
- if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
-
- LLVM_DEBUG(dbgs()
- << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
- << ExitingBlock->getName() << "\n");
-
- BI->setCondition(ConstantInt::get(Cond->getType(),
- !L->contains(BI->getSuccessor(0))));
-
- Changed = true;
- }
- }
-
- // Does the loop already have a preheader? If so, don't insert one.
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) {
- Preheader = InsertPreheaderForLoop(L, DT, LI, MSSAU, PreserveLCSSA);
- if (Preheader)
- Changed = true;
- }
-
- // Next, check to make sure that all exit nodes of the loop only have
- // predecessors that are inside of the loop. This check guarantees that the
- // loop preheader/header will dominate the exit blocks. If the exit block has
- // predecessors from outside of the loop, split the edge now.
- if (formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA))
- Changed = true;
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // If the header has more than two predecessors at this point (from the
- // preheader and from multiple backedges), we must adjust the loop.
- BasicBlock *LoopLatch = L->getLoopLatch();
- if (!LoopLatch) {
- // If this is really a nested loop, rip it out into a child loop. Don't do
- // this for loops with a giant number of backedges, just factor them into a
- // common backedge instead.
- if (L->getNumBackEdges() < 8) {
- if (Loop *OuterL = separateNestedLoop(L, Preheader, DT, LI, SE,
- PreserveLCSSA, AC, MSSAU)) {
- ++NumNested;
- // Enqueue the outer loop as it should be processed next in our
- // depth-first nest walk.
- Worklist.push_back(OuterL);
-
- // This is a big restructuring change, reprocess the whole loop.
- Changed = true;
- // GCC doesn't tail recursion eliminate this.
- // FIXME: It isn't clear we can't rely on LLVM to TRE this.
- goto ReprocessLoop;
- }
- }
-
- // If we either couldn't, or didn't want to, identify nesting of the loops,
- // insert a new block that all backedges target, then make it jump to the
- // loop header.
- LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI, MSSAU);
- if (LoopLatch)
- Changed = true;
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
- // Scan over the PHI nodes in the loop header. Since they now have only two
- // incoming values (the loop is canonicalized), we may have simplified the PHI
- // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
- PHINode *PN;
- for (BasicBlock::iterator I = L->getHeader()->begin();
- (PN = dyn_cast<PHINode>(I++)); )
- if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
- if (SE) SE->forgetValue(PN);
- if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
- Changed = true;
- }
- }
-
- // If this loop has multiple exits and the exits all go to the same
- // block, attempt to merge the exits. This helps several passes, such
- // as LoopRotation, which do not support loops with multiple exits.
- // SimplifyCFG also does this (and this code uses the same utility
- // function), however this code is loop-aware, where SimplifyCFG is
- // not. That gives it the advantage of being able to hoist
- // loop-invariant instructions out of the way to open up more
- // opportunities, and the disadvantage of having the responsibility
- // to preserve dominator information.
- auto HasUniqueExitBlock = [&]() {
- BasicBlock *UniqueExit = nullptr;
- for (auto *ExitingBB : ExitingBlocks)
- for (auto *SuccBB : successors(ExitingBB)) {
- if (L->contains(SuccBB))
- continue;
-
- if (!UniqueExit)
- UniqueExit = SuccBB;
- else if (UniqueExit != SuccBB)
- return false;
- }
-
- return true;
- };
- if (HasUniqueExitBlock()) {
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- BasicBlock *ExitingBlock = ExitingBlocks[i];
- if (!ExitingBlock->getSinglePredecessor()) continue;
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (!BI || !BI->isConditional()) continue;
- CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
- if (!CI || CI->getParent() != ExitingBlock) continue;
-
- // Attempt to hoist out all instructions except for the
- // comparison and the branch.
- bool AllInvariant = true;
- bool AnyInvariant = false;
- for (auto I = ExitingBlock->instructionsWithoutDebug().begin(); &*I != BI; ) {
- Instruction *Inst = &*I++;
- if (Inst == CI)
- continue;
- if (!L->makeLoopInvariant(
- Inst, AnyInvariant,
- Preheader ? Preheader->getTerminator() : nullptr, MSSAU)) {
- AllInvariant = false;
- break;
- }
- }
- if (AnyInvariant) {
- Changed = true;
- // The loop disposition of all SCEV expressions that depend on any
- // hoisted values have also changed.
- if (SE)
- SE->forgetLoopDispositions(L);
- }
- if (!AllInvariant) continue;
-
- // The block has now been cleared of all instructions except for
- // a comparison and a conditional branch. SimplifyCFG may be able
- // to fold it now.
+ if (DT->dominates(Header, P))
+ addBlockAndPredsToSet(P, Header, BlocksInL);
+ }
+
+ // Scan all of the loop children of L, moving them to OuterLoop if they are
+ // not part of the inner loop.
+ const std::vector<Loop*> &SubLoops = L->getSubLoops();
+ for (size_t I = 0; I != SubLoops.size(); )
+ if (BlocksInL.count(SubLoops[I]->getHeader()))
+ ++I; // Loop remains in L
+ else
+ NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
+
+ SmallVector<BasicBlock *, 8> OuterLoopBlocks;
+ OuterLoopBlocks.push_back(NewBB);
+ // Now that we know which blocks are in L and which need to be moved to
+ // OuterLoop, move any blocks that need it.
+ for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
+ BasicBlock *BB = L->getBlocks()[i];
+ if (!BlocksInL.count(BB)) {
+ // Move this block to the parent, updating the exit blocks sets
+ L->removeBlockFromLoop(BB);
+ if ((*LI)[BB] == L) {
+ LI->changeLoopFor(BB, NewOuter);
+ OuterLoopBlocks.push_back(BB);
+ }
+ --i;
+ }
+ }
+
+ // Split edges to exit blocks from the inner loop, if they emerged in the
+ // process of separating the outer one.
+ formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA);
+
+ if (PreserveLCSSA) {
+ // Fix LCSSA form for L. Some values, which previously were only used inside
+ // L, can now be used in NewOuter loop. We need to insert phi-nodes for them
+ // in corresponding exit blocks.
+ // We don't need to form LCSSA recursively, because there cannot be uses
+ // inside a newly created loop of defs from inner loops as those would
+ // already be a use of an LCSSA phi node.
+ formLCSSA(*L, *DT, LI, SE);
+
+ assert(NewOuter->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA is broken after separating nested loops!");
+ }
+
+ return NewOuter;
+}
+
+/// This method is called when the specified loop has more than one
+/// backedge in it.
+///
+/// If this occurs, revector all of these backedges to target a new basic block
+/// and have that block branch to the loop header. This ensures that loops
+/// have exactly one backedge.
+static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
+ DominatorTree *DT, LoopInfo *LI,
+ MemorySSAUpdater *MSSAU) {
+ assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
+
+ // Get information about the loop
+ BasicBlock *Header = L->getHeader();
+ Function *F = Header->getParent();
+
+ // Unique backedge insertion currently depends on having a preheader.
+ if (!Preheader)
+ return nullptr;
+
+ // The header is not an EH pad; preheader insertion should ensure this.
+ assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
+
+ // Figure out which basic blocks contain back-edges to the loop header.
+ std::vector<BasicBlock*> BackedgeBlocks;
+ for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){
+ BasicBlock *P = *I;
+
+ // Indirect edges cannot be split, so we must fail if we find one.
+ if (P->getTerminator()->isIndirectTerminator())
+ return nullptr;
+
+ if (P != Preheader) BackedgeBlocks.push_back(P);
+ }
+
+ // Create and insert the new backedge block...
+ BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
+ Header->getName() + ".backedge", F);
+ BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
+ BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
+ << BEBlock->getName() << "\n");
+
+ // Move the new backedge block to right after the last backedge block.
+ Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
+ F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
+
+ // Now that the block has been inserted into the function, create PHI nodes in
+ // the backedge block which correspond to any PHI nodes in the header block.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
+ PN->getName()+".be", BETerminator);
+
+ // Loop over the PHI node, moving all entries except the one for the
+ // preheader over to the new PHI node.
+ unsigned PreheaderIdx = ~0U;
+ bool HasUniqueIncomingValue = true;
+ Value *UniqueValue = nullptr;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *IBB = PN->getIncomingBlock(i);
+ Value *IV = PN->getIncomingValue(i);
+ if (IBB == Preheader) {
+ PreheaderIdx = i;
+ } else {
+ NewPN->addIncoming(IV, IBB);
+ if (HasUniqueIncomingValue) {
+ if (!UniqueValue)
+ UniqueValue = IV;
+ else if (UniqueValue != IV)
+ HasUniqueIncomingValue = false;
+ }
+ }
+ }
+
+ // Delete all of the incoming values from the old PN except the preheader's
+ assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
+ if (PreheaderIdx != 0) {
+ PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
+ PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
+ }
+ // Nuke all entries except the zero'th.
+ for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
+ PN->removeIncomingValue(e-i, false);
+
+ // Finally, add the newly constructed PHI node as the entry for the BEBlock.
+ PN->addIncoming(NewPN, BEBlock);
+
+ // As an optimization, if all incoming values in the new PhiNode (which is a
+ // subset of the incoming values of the old PHI node) have the same value,
+ // eliminate the PHI Node.
+ if (HasUniqueIncomingValue) {
+ NewPN->replaceAllUsesWith(UniqueValue);
+ BEBlock->getInstList().erase(NewPN);
+ }
+ }
+
+ // Now that all of the PHI nodes have been inserted and adjusted, modify the
+ // backedge blocks to jump to the BEBlock instead of the header.
+ // If one of the backedges has llvm.loop metadata attached, we remove
+ // it from the backedge and add it to BEBlock.
+ unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
+ MDNode *LoopMD = nullptr;
+ for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
+ Instruction *TI = BackedgeBlocks[i]->getTerminator();
+ if (!LoopMD)
+ LoopMD = TI->getMetadata(LoopMDKind);
+ TI->setMetadata(LoopMDKind, nullptr);
+ TI->replaceSuccessorWith(Header, BEBlock);
+ }
+ BEBlock->getTerminator()->setMetadata(LoopMDKind, LoopMD);
+
+ //===--- Update all analyses which we must preserve now -----------------===//
+
+ // Update Loop Information - we know that this block is now in the current
+ // loop and all parent loops.
+ L->addBasicBlockToLoop(BEBlock, *LI);
+
+ // Update dominator information
+ DT->splitBlock(BEBlock);
+
+ if (MSSAU)
+ MSSAU->updatePhisWhenInsertingUniqueBackedgeBlock(Header, Preheader,
+ BEBlock);
+
+ return BEBlock;
+}
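For intuition, here is a source-level shape that can reach the two canonicalizations above, sketched in plain C++ (the function and label names are made up): the block at `head` ends up with two backedges, and LoopSimplify will either split a nested loop out of it via separateNestedLoop or route both edges through a single `head.backedge` block via insertUniqueBackedgeBlock.

// Illustrative only; how the header PHIs look after earlier passes decides
// which of the two canonicalizations actually fires.
int twoBackedges(int n) {
  int i = 0, s = 0;
head:
  s += i;
  if (s & 1) {
    if (++i < n)
      goto head;            // backedge #1 into 'head'
  }
  if (++i < 2 * n)
    goto head;              // backedge #2 into 'head'
  return s;
}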
+
+/// Simplify one loop and queue further loops for simplification.
+static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
+ DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, AssumptionCache *AC,
+ MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
+ bool Changed = false;
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ReprocessLoop:
+
+ // Check to see that no blocks (other than the header) in this loop have
+ // predecessors that are not in the loop. This is not valid for natural
+ // loops, but can occur if the blocks are unreachable. Since they are
+ // unreachable we can just shamelessly delete those CFG edges!
+ for (Loop::block_iterator BB = L->block_begin(), E = L->block_end();
+ BB != E; ++BB) {
+ if (*BB == L->getHeader()) continue;
+
+ SmallPtrSet<BasicBlock*, 4> BadPreds;
+ for (pred_iterator PI = pred_begin(*BB),
+ PE = pred_end(*BB); PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (!L->contains(P))
+ BadPreds.insert(P);
+ }
+
+ // Delete each unique out-of-loop (and thus dead) predecessor.
+ for (BasicBlock *P : BadPreds) {
+
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
+ << P->getName() << "\n");
+
+ // Zap the dead pred's terminator and replace it with unreachable.
+ Instruction *TI = P->getTerminator();
+ changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA,
+ /*DTU=*/nullptr, MSSAU);
+ Changed = true;
+ }
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // If there are exiting blocks with branches on undef, resolve the undef in
+ // the direction which will exit the loop. This will help simplify loop
+ // trip count computations.
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (BasicBlock *ExitingBlock : ExitingBlocks)
+ if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
+ if (BI->isConditional()) {
+ if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
+
+ LLVM_DEBUG(dbgs()
+ << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+ << ExitingBlock->getName() << "\n");
+
+ BI->setCondition(ConstantInt::get(Cond->getType(),
+ !L->contains(BI->getSuccessor(0))));
+
+ Changed = true;
+ }
+ }
+
+ // Does the loop already have a preheader? If so, don't insert one.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ Preheader = InsertPreheaderForLoop(L, DT, LI, MSSAU, PreserveLCSSA);
+ if (Preheader)
+ Changed = true;
+ }
+
+ // Next, check to make sure that all exit nodes of the loop only have
+ // predecessors that are inside of the loop. This check guarantees that the
+ // loop preheader/header will dominate the exit blocks. If the exit block has
+ // predecessors from outside of the loop, split the edge now.
+ if (formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA))
+ Changed = true;
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // If the header has more than two predecessors at this point (from the
+ // preheader and from multiple backedges), we must adjust the loop.
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
+ // If this is really a nested loop, rip it out into a child loop. Don't do
+ // this for loops with a giant number of backedges, just factor them into a
+ // common backedge instead.
+ if (L->getNumBackEdges() < 8) {
+ if (Loop *OuterL = separateNestedLoop(L, Preheader, DT, LI, SE,
+ PreserveLCSSA, AC, MSSAU)) {
+ ++NumNested;
+ // Enqueue the outer loop as it should be processed next in our
+ // depth-first nest walk.
+ Worklist.push_back(OuterL);
+
+ // This is a big restructuring change, reprocess the whole loop.
+ Changed = true;
+ // GCC doesn't tail recursion eliminate this.
+ // FIXME: It isn't clear we can't rely on LLVM to TRE this.
+ goto ReprocessLoop;
+ }
+ }
+
+ // If we either couldn't, or didn't want to, identify nesting of the loops,
+ // insert a new block that all backedges target, then make it jump to the
+ // loop header.
+ LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI, MSSAU);
+ if (LoopLatch)
+ Changed = true;
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Scan over the PHI nodes in the loop header. Since they now have only two
+ // incoming values (the loop is canonicalized), we may have simplified the PHI
+ // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
+ PHINode *PN;
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ (PN = dyn_cast<PHINode>(I++)); )
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
+ if (SE) SE->forgetValue(PN);
+ if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+ // If this loop has multiple exits and the exits all go to the same
+ // block, attempt to merge the exits. This helps several passes, such
+ // as LoopRotation, which do not support loops with multiple exits.
+ // SimplifyCFG also does this (and this code uses the same utility
+ // function), however this code is loop-aware, where SimplifyCFG is
+ // not. That gives it the advantage of being able to hoist
+ // loop-invariant instructions out of the way to open up more
+ // opportunities, and the disadvantage of having the responsibility
+ // to preserve dominator information.
+ auto HasUniqueExitBlock = [&]() {
+ BasicBlock *UniqueExit = nullptr;
+ for (auto *ExitingBB : ExitingBlocks)
+ for (auto *SuccBB : successors(ExitingBB)) {
+ if (L->contains(SuccBB))
+ continue;
+
+ if (!UniqueExit)
+ UniqueExit = SuccBB;
+ else if (UniqueExit != SuccBB)
+ return false;
+ }
+
+ return true;
+ };
+ if (HasUniqueExitBlock()) {
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitingBlock = ExitingBlocks[i];
+ if (!ExitingBlock->getSinglePredecessor()) continue;
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!BI || !BI->isConditional()) continue;
+ CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+ if (!CI || CI->getParent() != ExitingBlock) continue;
+
+ // Attempt to hoist out all instructions except for the
+ // comparison and the branch.
+ bool AllInvariant = true;
+ bool AnyInvariant = false;
+ for (auto I = ExitingBlock->instructionsWithoutDebug().begin(); &*I != BI; ) {
+ Instruction *Inst = &*I++;
+ if (Inst == CI)
+ continue;
+ if (!L->makeLoopInvariant(
+ Inst, AnyInvariant,
+ Preheader ? Preheader->getTerminator() : nullptr, MSSAU)) {
+ AllInvariant = false;
+ break;
+ }
+ }
+ if (AnyInvariant) {
+ Changed = true;
+ // The loop disposition of all SCEV expressions that depend on any
+ // hoisted values have also changed.
+ if (SE)
+ SE->forgetLoopDispositions(L);
+ }
+ if (!AllInvariant) continue;
+
+ // The block has now been cleared of all instructions except for
+ // a comparison and a conditional branch. SimplifyCFG may be able
+ // to fold it now.
if (!FoldBranchToCommonDest(BI, /*DTU=*/nullptr, MSSAU))

- continue;
-
- // Success. The block is now dead, so remove it from the loop,
- // update the dominator tree and delete it.
- LLVM_DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
- << ExitingBlock->getName() << "\n");
-
+ continue;
+
+ // Success. The block is now dead, so remove it from the loop,
+ // update the dominator tree and delete it.
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
+ << ExitingBlock->getName() << "\n");
+
assert(pred_empty(ExitingBlock));
- Changed = true;
- LI->removeBlock(ExitingBlock);
-
- DomTreeNode *Node = DT->getNode(ExitingBlock);
- while (!Node->isLeaf()) {
- DomTreeNode *Child = Node->back();
- DT->changeImmediateDominator(Child, Node->getIDom());
- }
- DT->eraseNode(ExitingBlock);
- if (MSSAU) {
- SmallSetVector<BasicBlock *, 8> ExitBlockSet;
- ExitBlockSet.insert(ExitingBlock);
- MSSAU->removeBlocks(ExitBlockSet);
- }
-
- BI->getSuccessor(0)->removePredecessor(
- ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
- BI->getSuccessor(1)->removePredecessor(
- ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
- ExitingBlock->eraseFromParent();
- }
- }
-
- // Changing exit conditions for blocks may affect exit counts of this loop and
-  // any of its parents, so we must invalidate the entire subtree if we've made
- // any changes.
- if (Changed && SE)
- SE->forgetTopmostLoop(L);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- return Changed;
-}
-
-bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, AssumptionCache *AC,
- MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- bool Changed = false;
-
-#ifndef NDEBUG
- // If we're asked to preserve LCSSA, the loop nest needs to start in LCSSA
- // form.
- if (PreserveLCSSA) {
- assert(DT && "DT not available.");
- assert(LI && "LI not available.");
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "Requested to preserve LCSSA, but it's already broken.");
- }
-#endif
-
- // Worklist maintains our depth-first queue of loops in this nest to process.
- SmallVector<Loop *, 4> Worklist;
- Worklist.push_back(L);
-
- // Walk the worklist from front to back, pushing newly found sub loops onto
- // the back. This will let us process loops from back to front in depth-first
- // order. We can use this simple process because loops form a tree.
- for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
- Loop *L2 = Worklist[Idx];
- Worklist.append(L2->begin(), L2->end());
- }
-
- while (!Worklist.empty())
- Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE,
- AC, MSSAU, PreserveLCSSA);
-
- return Changed;
-}
-
-namespace {
- struct LoopSimplify : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- LoopSimplify() : FunctionPass(ID) {
- initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
-
- // We need loop information to identify the loops...
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
-
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
-
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreservedID(LCSSAID);
- AU.addPreserved<DependenceAnalysisWrapperPass>();
- AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
- AU.addPreserved<BranchProbabilityInfoWrapperPass>();
- if (EnableMSSALoopDependency)
- AU.addPreserved<MemorySSAWrapperPass>();
- }
-
- /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
- void verifyAnalysis() const override;
- };
-}
-
-char LoopSimplify::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
- "Canonicalize natural loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
- "Canonicalize natural loops", false, false)
-
-// Publicly exposed interface to pass...
-char &llvm::LoopSimplifyID = LoopSimplify::ID;
-Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
-
-/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
-/// it in any convenient order) inserting preheaders...
-///
-bool LoopSimplify::runOnFunction(Function &F) {
- bool Changed = false;
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr;
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- MemorySSA *MSSA = nullptr;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- if (MSSAAnalysis) {
- MSSA = &MSSAAnalysis->getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
- }
-
- bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-
- // Simplify each loop nest in the function.
+ Changed = true;
+ LI->removeBlock(ExitingBlock);
+
+ DomTreeNode *Node = DT->getNode(ExitingBlock);
+ while (!Node->isLeaf()) {
+ DomTreeNode *Child = Node->back();
+ DT->changeImmediateDominator(Child, Node->getIDom());
+ }
+ DT->eraseNode(ExitingBlock);
+ if (MSSAU) {
+ SmallSetVector<BasicBlock *, 8> ExitBlockSet;
+ ExitBlockSet.insert(ExitingBlock);
+ MSSAU->removeBlocks(ExitBlockSet);
+ }
+
+ BI->getSuccessor(0)->removePredecessor(
+ ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
+ BI->getSuccessor(1)->removePredecessor(
+ ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
+ ExitingBlock->eraseFromParent();
+ }
+ }
+
+ // Changing exit conditions for blocks may affect exit counts of this loop and
+  // any of its parents, so we must invalidate the entire subtree if we've made
+ // any changes.
+ if (Changed && SE)
+ SE->forgetTopmostLoop(L);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ return Changed;
+}
+
+bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, AssumptionCache *AC,
+ MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
+ bool Changed = false;
+
+#ifndef NDEBUG
+ // If we're asked to preserve LCSSA, the loop nest needs to start in LCSSA
+ // form.
+ if (PreserveLCSSA) {
+ assert(DT && "DT not available.");
+ assert(LI && "LI not available.");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Requested to preserve LCSSA, but it's already broken.");
+ }
+#endif
+
+ // Worklist maintains our depth-first queue of loops in this nest to process.
+ SmallVector<Loop *, 4> Worklist;
+ Worklist.push_back(L);
+
+ // Walk the worklist from front to back, pushing newly found sub loops onto
+ // the back. This will let us process loops from back to front in depth-first
+ // order. We can use this simple process because loops form a tree.
+ for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+ Loop *L2 = Worklist[Idx];
+ Worklist.append(L2->begin(), L2->end());
+ }
+
+ while (!Worklist.empty())
+ Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE,
+ AC, MSSAU, PreserveLCSSA);
+
+ return Changed;
+}
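As a usage sketch (the wrapper name is ours, and it assumes the declaration from llvm/Transforms/Utils/LoopSimplify.h): utility code that wants canonical loops before running its own transform can call this entry point directly, passing null for any analyses it does not maintain.

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"

// Canonicalize one loop nest in place; returns true if anything changed.
static bool canonicalizeNest(llvm::Loop *L, llvm::DominatorTree &DT,
                             llvm::LoopInfo &LI, llvm::AssumptionCache &AC) {
  return llvm::simplifyLoop(L, &DT, &LI, /*SE=*/nullptr, &AC,
                            /*MSSAU=*/nullptr, /*PreserveLCSSA=*/false);
}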
+
+namespace {
+ struct LoopSimplify : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LoopSimplify() : FunctionPass(ID) {
+ initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ // We need loop information to identify the loops...
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
+ AU.addPreserved<BranchProbabilityInfoWrapperPass>();
+ if (EnableMSSALoopDependency)
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+
+ /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+ void verifyAnalysis() const override;
+ };
+}
+
+char LoopSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
+ "Canonicalize natural loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
+ "Canonicalize natural loops", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LoopSimplifyID = LoopSimplify::ID;
+Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
+
+/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
+/// it in any convenient order) inserting preheaders...
+///
+bool LoopSimplify::runOnFunction(Function &F) {
+ bool Changed = false;
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr;
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ MemorySSA *MSSA = nullptr;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ if (MSSAAnalysis) {
+ MSSA = &MSSAAnalysis->getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+ }
+
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ // Simplify each loop nest in the function.
for (auto *L : *LI)
Changed |= simplifyLoop(L, DT, LI, SE, AC, MSSAU.get(), PreserveLCSSA);
-
-#ifndef NDEBUG
- if (PreserveLCSSA) {
- bool InLCSSA = all_of(
- *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
- assert(InLCSSA && "LCSSA is broken after loop-simplify.");
- }
-#endif
- return Changed;
-}
-
-PreservedAnalyses LoopSimplifyPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- bool Changed = false;
- LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
- AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- auto *MSSAAnalysis = AM.getCachedResult<MemorySSAAnalysis>(F);
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (MSSAAnalysis) {
- auto *MSSA = &MSSAAnalysis->getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
-
-
- // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA
- // after simplifying the loops. MemorySSA is preserved if it exists.
+
+#ifndef NDEBUG
+ if (PreserveLCSSA) {
+ bool InLCSSA = all_of(
+ *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
+ assert(InLCSSA && "LCSSA is broken after loop-simplify.");
+ }
+#endif
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = false;
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *MSSAAnalysis = AM.getCachedResult<MemorySSAAnalysis>(F);
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSAAnalysis) {
+ auto *MSSA = &MSSAAnalysis->getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+
+
+ // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA
+ // after simplifying the loops. MemorySSA is preserved if it exists.
for (auto *L : *LI)
- Changed |=
+ Changed |=
simplifyLoop(L, DT, LI, SE, AC, MSSAU.get(), /*PreserveLCSSA*/ false);
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- PA.preserve<SCEVAA>();
- PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<DependenceAnalysis>();
- if (MSSAAnalysis)
- PA.preserve<MemorySSAAnalysis>();
- // BPI maps conditional terminators to probabilities, LoopSimplify can insert
- // blocks, but it does so only by splitting existing blocks and edges. This
- // results in the interesting property that all new terminators inserted are
- // unconditional branches which do not appear in BPI. All deletions are
- // handled via ValueHandle callbacks w/in BPI.
- PA.preserve<BranchProbabilityAnalysis>();
- return PA;
-}
-
-// FIXME: Restore this code when we re-enable verification in verifyAnalysis
-// below.
-#if 0
-static void verifyLoop(Loop *L) {
- // Verify subloops.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- verifyLoop(*I);
-
- // It used to be possible to just assert L->isLoopSimplifyForm(), however
- // with the introduction of indirectbr, there are now cases where it's
- // not possible to transform a loop as necessary. We can at least check
- // that there is an indirectbr near any time there's trouble.
-
- // Indirectbr can interfere with preheader and unique backedge insertion.
- if (!L->getLoopPreheader() || !L->getLoopLatch()) {
- bool HasIndBrPred = false;
- for (pred_iterator PI = pred_begin(L->getHeader()),
- PE = pred_end(L->getHeader()); PI != PE; ++PI)
- if (isa<IndirectBrInst>((*PI)->getTerminator())) {
- HasIndBrPred = true;
- break;
- }
- assert(HasIndBrPred &&
- "LoopSimplify has no excuse for missing loop header info!");
- (void)HasIndBrPred;
- }
-
- // Indirectbr can interfere with exit block canonicalization.
- if (!L->hasDedicatedExits()) {
- bool HasIndBrExiting = false;
- SmallVector<BasicBlock*, 8> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
- HasIndBrExiting = true;
- break;
- }
- }
-
- assert(HasIndBrExiting &&
- "LoopSimplify has no excuse for missing exit block info!");
- (void)HasIndBrExiting;
- }
-}
-#endif
-
-void LoopSimplify::verifyAnalysis() const {
- // FIXME: This routine is being called mid-way through the loop pass manager
- // as loop passes destroy this analysis. That's actually fine, but we have no
- // way of expressing that here. Once all of the passes that destroy this are
- // hoisted out of the loop pass manager we can add back verification here.
-#if 0
- for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
- verifyLoop(*I);
-#endif
-}
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<DependenceAnalysis>();
+ if (MSSAAnalysis)
+ PA.preserve<MemorySSAAnalysis>();
+ // BPI maps conditional terminators to probabilities, LoopSimplify can insert
+ // blocks, but it does so only by splitting existing blocks and edges. This
+ // results in the interesting property that all new terminators inserted are
+ // unconditional branches which do not appear in BPI. All deletions are
+ // handled via ValueHandle callbacks w/in BPI.
+ PA.preserve<BranchProbabilityAnalysis>();
+ return PA;
+}
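A hedged sketch of driving this pass from client code under the new pass manager; the analysis-manager boilerplate is the standard PassBuilder setup and nothing here is specific to this file:

#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"

void runLoopSimplifyOn(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LoopSimplifyPass());
  FPM.run(F, FAM); // LCSSA is not preserved here; run LCSSA afterwards if needed.
}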
+
+// FIXME: Restore this code when we re-enable verification in verifyAnalysis
+// below.
+#if 0
+static void verifyLoop(Loop *L) {
+ // Verify subloops.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ verifyLoop(*I);
+
+ // It used to be possible to just assert L->isLoopSimplifyForm(), however
+ // with the introduction of indirectbr, there are now cases where it's
+ // not possible to transform a loop as necessary. We can at least check
+ // that there is an indirectbr near any time there's trouble.
+
+ // Indirectbr can interfere with preheader and unique backedge insertion.
+ if (!L->getLoopPreheader() || !L->getLoopLatch()) {
+ bool HasIndBrPred = false;
+ for (pred_iterator PI = pred_begin(L->getHeader()),
+ PE = pred_end(L->getHeader()); PI != PE; ++PI)
+ if (isa<IndirectBrInst>((*PI)->getTerminator())) {
+ HasIndBrPred = true;
+ break;
+ }
+ assert(HasIndBrPred &&
+ "LoopSimplify has no excuse for missing loop header info!");
+ (void)HasIndBrPred;
+ }
+
+ // Indirectbr can interfere with exit block canonicalization.
+ if (!L->hasDedicatedExits()) {
+ bool HasIndBrExiting = false;
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
+ HasIndBrExiting = true;
+ break;
+ }
+ }
+
+ assert(HasIndBrExiting &&
+ "LoopSimplify has no excuse for missing exit block info!");
+ (void)HasIndBrExiting;
+ }
+}
+#endif
+
+void LoopSimplify::verifyAnalysis() const {
+ // FIXME: This routine is being called mid-way through the loop pass manager
+ // as loop passes destroy this analysis. That's actually fine, but we have no
+ // way of expressing that here. Once all of the passes that destroy this are
+ // hoisted out of the loop pass manager we can add back verification here.
+#if 0
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ verifyLoop(*I);
+#endif
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp
index 6426a5636f..d4cd574052 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1,367 +1,367 @@
-//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements some loop unrolling utilities. It does not define any
-// actual pass or policy, but provides a single function to perform loop
-// unrolling.
-//
-// The process of unrolling can produce extraneous basic blocks linked with
-// unconditional branches. This will be corrected in the future.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/ilist_iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
+//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities. It does not define any
+// actual pass or policy, but provides a single function to perform loop
+// unrolling.
+//
+// The process of unrolling can produce extraneous basic blocks linked with
+// unconditional branches. This will be corrected in the future.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist_iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <assert.h>
-#include <type_traits>
-#include <vector>
-
-namespace llvm {
-class DataLayout;
-class Value;
-} // namespace llvm
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll"
-
-// TODO: Should these be here or in LoopUnroll?
-STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
-STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
-STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
- "latch (completely or otherwise)");
-
-static cl::opt<bool>
-UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
- cl::desc("Allow runtime unrolled loops to be unrolled "
- "with epilog instead of prolog."));
-
-static cl::opt<bool>
-UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
- cl::desc("Verify domtree after unrolling"),
-#ifdef EXPENSIVE_CHECKS
- cl::init(true)
-#else
- cl::init(false)
-#endif
- );
-
-/// Check if unrolling created a situation where we need to insert phi nodes to
-/// preserve LCSSA form.
-/// \param Blocks is a vector of basic blocks representing unrolled loop.
-/// \param L is the outer loop.
-/// It's possible that some of the blocks are in L, and some are not. In this
-/// case, if there is a use outside L and its definition is inside L, we need to
-/// insert a phi-node, otherwise LCSSA will be broken.
-/// The function is just a helper function for llvm::UnrollLoop that returns
-/// true if this situation occurs, indicating that LCSSA needs to be fixed.
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <assert.h>
+#include <type_traits>
+#include <vector>
+
+namespace llvm {
+class DataLayout;
+class Value;
+} // namespace llvm
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+// TODO: Should these be here or in LoopUnroll?
+STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
+STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
+STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
+ "latch (completely or otherwise)");
+
+static cl::opt<bool>
+UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolled loops to be unrolled "
+ "with epilog instead of prolog."));
+
+static cl::opt<bool>
+UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
+ cl::desc("Verify domtree after unrolling"),
+#ifdef EXPENSIVE_CHECKS
+ cl::init(true)
+#else
+ cl::init(false)
+#endif
+ );
+
+/// Check if unrolling created a situation where we need to insert phi nodes to
+/// preserve LCSSA form.
+/// \param Blocks is a vector of basic blocks representing unrolled loop.
+/// \param L is the outer loop.
+/// It's possible that some of the blocks are in L, and some are not. In this
+/// case, if there is a use outside L and its definition is inside L, we need to
+/// insert a phi-node, otherwise LCSSA will be broken.
+/// The function is just a helper function for llvm::UnrollLoop that returns
+/// true if this situation occurs, indicating that LCSSA needs to be fixed.
static bool needToInsertPhisForLCSSA(Loop *L,
const std::vector<BasicBlock *> &Blocks,
- LoopInfo *LI) {
- for (BasicBlock *BB : Blocks) {
- if (LI->getLoopFor(BB) == L)
- continue;
- for (Instruction &I : *BB) {
- for (Use &U : I.operands()) {
+ LoopInfo *LI) {
+ for (BasicBlock *BB : Blocks) {
+ if (LI->getLoopFor(BB) == L)
+ continue;
+ for (Instruction &I : *BB) {
+ for (Use &U : I.operands()) {
if (const auto *Def = dyn_cast<Instruction>(U)) {
- Loop *DefLoop = LI->getLoopFor(Def->getParent());
- if (!DefLoop)
- continue;
- if (DefLoop->contains(L))
- return true;
- }
- }
- }
- }
- return false;
-}
-
-/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
-/// and adds a mapping from the original loop to the new loop to NewLoops.
-/// Returns nullptr if no new loop was created and a pointer to the
-/// original loop OriginalBB was part of otherwise.
-const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
- BasicBlock *ClonedBB, LoopInfo *LI,
- NewLoopsMap &NewLoops) {
- // Figure out which loop New is in.
- const Loop *OldLoop = LI->getLoopFor(OriginalBB);
- assert(OldLoop && "Should (at least) be in the loop being unrolled!");
-
- Loop *&NewLoop = NewLoops[OldLoop];
- if (!NewLoop) {
- // Found a new sub-loop.
- assert(OriginalBB == OldLoop->getHeader() &&
- "Header should be first in RPO");
-
- NewLoop = LI->AllocateLoop();
- Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
-
- if (NewLoopParent)
- NewLoopParent->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
-
- NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
- return OldLoop;
- } else {
- NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
- return nullptr;
- }
-}
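A hedged sketch of the calling pattern for this helper (the names and the single-iteration framing are ours; the real driver is UnrollLoop further below): each original block is cloned and immediately registered so LoopInfo stays consistent. The sketch assumes L has no subloops, so every clone is added back to L itself, and it shows only the LoopInfo bookkeeping, not the CFG and PHI rewiring that full unrolling performs.

#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

static void cloneBlocksOnce(llvm::Loop *L, llvm::LoopInfo *LI,
                            llvm::ValueToValueMapTy &VMap) {
  llvm::NewLoopsMap NewLoops;
  NewLoops[L] = L; // clones of L's own blocks are added back to L
  for (llvm::BasicBlock *BB : L->getBlocks()) {
    llvm::BasicBlock *Clone =
        llvm::CloneBasicBlock(BB, VMap, ".unroll", BB->getParent());
    llvm::addClonedBlockToLoopInfo(BB, Clone, LI, NewLoops);
  }
}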
-
-/// The function chooses which type of unroll (epilog or prolog) is more
-/// profitable.
-/// Epilog unroll is more profitable when there is a PHI that starts from a
-/// constant. In this case epilog unrolling keeps the PHI starting from a
-/// constant, whereas prolog unrolling converts it to a non-constant.
-///
-/// loop:
-/// PN = PHI [I, Latch], [CI, PreHeader]
-/// I = foo(PN)
-/// ...
-///
-/// Epilog unroll case.
-/// loop:
-/// PN = PHI [I2, Latch], [CI, PreHeader]
-/// I1 = foo(PN)
-/// I2 = foo(I1)
-/// ...
-/// Prolog unroll case.
-/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
-/// loop:
-/// PN = PHI [I2, Latch], [NewPN, PreHeader]
-/// I1 = foo(PN)
-/// I2 = foo(I1)
-/// ...
-///
-static bool isEpilogProfitable(Loop *L) {
- BasicBlock *PreHeader = L->getLoopPreheader();
- BasicBlock *Header = L->getHeader();
- assert(PreHeader && Header);
- for (const PHINode &PN : Header->phis()) {
- if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
- return true;
- }
- return false;
-}
-
-/// Perform some cleanup and simplifications on loops after unrolling. It is
-/// useful to simplify the IV's in the new loop, as well as do a quick
-/// simplify/dce pass of the instructions.
-void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
- ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC,
- const TargetTransformInfo *TTI) {
- // Simplify any new induction variables in the partially unrolled loop.
- if (SE && SimplifyIVs) {
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);
-
- // Aggressively clean up dead instructions that simplifyLoopIVs already
- // identified. Any remaining should be cleaned up below.
- while (!DeadInsts.empty()) {
- Value *V = DeadInsts.pop_back_val();
- if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
- }
- }
-
- // At this point, the code is well formed. We now do a quick sweep over the
- // inserted code, doing constant propagation and dead code elimination as we
- // go.
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- for (BasicBlock *BB : L->getBlocks()) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
-
- if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
- if (LI->replacementPreservesLCSSAForm(Inst, V))
- Inst->replaceAllUsesWith(V);
- if (isInstructionTriviallyDead(Inst))
- BB->getInstList().erase(Inst);
- }
- }
-
- // TODO: after peeling or unrolling, previously loop variant conditions are
- // likely to fold to constants, eagerly propagating those here will require
- // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
- // appropriate.
-}
-
-/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
-/// can only fail when the loop's latch block is not terminated by a conditional
-/// branch instruction. However, if the trip count (and multiple) are not known,
-/// loop unrolling will mostly produce more code that is no faster.
-///
-/// TripCount is the upper bound of the iteration on which control exits
-/// LatchBlock. Control may exit the loop prior to TripCount iterations either
-/// via an early branch in another loop block or via the LatchBlock terminator. This
-/// is relaxed from the general definition of trip count which is the number of
-/// times the loop header executes. Note that UnrollLoop assumes that the loop
-/// counter test is in LatchBlock in order to remove unnecessary instances of
-/// the test. If control can exit the loop from the LatchBlock's terminator
-/// prior to TripCount iterations, flag PreserveCondBr needs to be set.
-///
-/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
-/// needs to be preserved. It is needed when we use trip count upper bound to
-/// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
-/// conditional branch needs to be preserved.
-///
-/// Similarly, TripMultiple divides the number of times that the LatchBlock may
-/// execute without exiting the loop.
-///
-/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
-/// have a runtime (i.e. not compile time constant) trip count. Unrolling these
-/// loops requires an unroll "prologue" that runs "RuntimeTripCount % Count"
-/// iterations before branching into the unrolled loop. UnrollLoop will not
-/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
-/// AllowExpensiveTripCount is false.
-///
-/// If we want to perform PGO-based loop peeling, PeelCount is set to the
-/// number of iterations we want to peel off.
-///
-/// The LoopInfo Analysis that is passed will be kept consistent.
-///
-/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
-/// DominatorTree if they are non-null.
-///
-/// If RemainderLoop is non-null, it will receive the remainder loop (if
-/// required and not fully unrolled).
-LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
- ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC,
- const TargetTransformInfo *TTI,
- OptimizationRemarkEmitter *ORE,
- bool PreserveLCSSA, Loop **RemainderLoop) {
-
+ Loop *DefLoop = LI->getLoopFor(Def->getParent());
+ if (!DefLoop)
+ continue;
+ if (DefLoop->contains(L))
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
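+//
+// Illustrative example (editor's sketch, not from the original source): after
+// unrolling, a block that ends up outside %L may still use a value %x defined
+// in a loop that contains %L; preserving LCSSA then requires a PHI in the exit
+// block, e.g.
+//   %x.lcssa = phi i32 [ %x, %exiting.block ]
+// needToInsertPhisForLCSSA only detects that such a fix-up is needed; the
+// actual PHIs are inserted later via formLCSSARecursively.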
+
+/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
+/// and adds a mapping from the original loop to the new loop to NewLoops.
+/// Returns nullptr if no new loop was created, and otherwise returns a
+/// pointer to the original loop that OriginalBB was part of.
+const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
+ BasicBlock *ClonedBB, LoopInfo *LI,
+ NewLoopsMap &NewLoops) {
+ // Figure out which loop New is in.
+ const Loop *OldLoop = LI->getLoopFor(OriginalBB);
+ assert(OldLoop && "Should (at least) be in the loop being unrolled!");
+
+ Loop *&NewLoop = NewLoops[OldLoop];
+ if (!NewLoop) {
+ // Found a new sub-loop.
+ assert(OriginalBB == OldLoop->getHeader() &&
+ "Header should be first in RPO");
+
+ NewLoop = LI->AllocateLoop();
+ Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
+
+ if (NewLoopParent)
+ NewLoopParent->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return OldLoop;
+ } else {
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return nullptr;
+ }
+}
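+//
+// Usage sketch (editor's note, not in the original source): callers seed the
+// map with the loop being unrolled, e.g.
+//   SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+//   NewLoops[L] = L;
+// and then call addClonedBlockToLoopInfo for each cloned block in reverse
+// postorder, so a sub-loop header is always seen before the sub-loop's other
+// blocks and gets a freshly allocated Loop.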
+
+/// The function chooses which type of unroll (epilog or prolog) is more
+/// profitable.
+/// Epilog unroll is more profitable when there is a PHI that starts from a
+/// constant. In this case the epilog will leave the PHI starting from the
+/// constant, but the prolog will convert it to a non-constant.
+///
+/// loop:
+/// PN = PHI [I, Latch], [CI, PreHeader]
+/// I = foo(PN)
+/// ...
+///
+/// Epilog unroll case.
+/// loop:
+/// PN = PHI [I2, Latch], [CI, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+/// Prolog unroll case.
+/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
+/// loop:
+/// PN = PHI [I2, Latch], [NewPN, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+///
+static bool isEpilogProfitable(Loop *L) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BasicBlock *Header = L->getHeader();
+ assert(PreHeader && Header);
+ for (const PHINode &PN : Header->phis()) {
+ if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
+ return true;
+ }
+ return false;
+}
+
+/// Perform some cleanup and simplifications on loops after unrolling. It is
+/// useful to simplify the IV's in the new loop, as well as do a quick
+/// simplify/dce pass of the instructions.
+void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC,
+ const TargetTransformInfo *TTI) {
+ // Simplify any new induction variables in the partially unrolled loop.
+ if (SE && SimplifyIVs) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);
+
+ // Aggressively clean up dead instructions that simplifyLoopIVs already
+ // identified. Any remaining should be cleaned up below.
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+ if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ for (BasicBlock *BB : L->getBlocks()) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+
+ if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
+ if (LI->replacementPreservesLCSSAForm(Inst, V))
+ Inst->replaceAllUsesWith(V);
+ if (isInstructionTriviallyDead(Inst))
+ BB->getInstList().erase(Inst);
+ }
+ }
+
+ // TODO: after peeling or unrolling, previously loop variant conditions are
+ // likely to fold to constants, eagerly propagating those here will require
+ // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
+ // appropriate.
+}
+
+/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
+/// can only fail when the loop's latch block is not terminated by a conditional
+/// branch instruction. However, if the trip count (and multiple) are not known,
+/// loop unrolling will mostly produce more code that is no faster.
+///
+/// TripCount is the upper bound of the iteration on which control exits
+/// LatchBlock. Control may exit the loop prior to TripCount iterations either
+/// via an early branch in another loop block or via the LatchBlock terminator. This
+/// is relaxed from the general definition of trip count which is the number of
+/// times the loop header executes. Note that UnrollLoop assumes that the loop
+/// counter test is in LatchBlock in order to remove unnecessary instances of
+/// the test. If control can exit the loop from the LatchBlock's terminator
+/// prior to TripCount iterations, flag PreserveCondBr needs to be set.
+///
+/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
+/// needs to be preserved. It is needed when we use trip count upper bound to
+/// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
+/// conditional branch needs to be preserved.
+///
+/// Similarly, TripMultiple divides the number of times that the LatchBlock may
+/// execute without exiting the loop.
+///
+/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
+/// have a runtime (i.e. not compile time constant) trip count. Unrolling these
+/// loops requires an unroll "prologue" that runs "RuntimeTripCount % Count"
+/// iterations before branching into the unrolled loop. UnrollLoop will not
+/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
+/// AllowExpensiveTripCount is false.
+///
+/// If we want to perform PGO-based loop peeling, PeelCount is set to the
+/// number of iterations we want to peel off.
+///
+/// The LoopInfo Analysis that is passed will be kept consistent.
+///
+/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
+/// DominatorTree if they are non-null.
+///
+/// If RemainderLoop is non-null, it will receive the remainder loop (if
+/// required and not fully unrolled).
+LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC,
+ const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE,
+ bool PreserveLCSSA, Loop **RemainderLoop) {
+
if (!L->getLoopPreheader()) {
- LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
- return LoopUnrollResult::Unmodified;
- }
-
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
if (!L->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Loops with indirectbr cannot be cloned.
- if (!L->isSafeToClone()) {
- LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
- return LoopUnrollResult::Unmodified;
- }
-
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Loops with indirectbr cannot be cloned.
+ if (!L->isSafeToClone()) {
+ LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
if (L->getHeader()->hasAddressTaken()) {
- // The loop-rotate pass can be helpful to avoid this in many cases.
- LLVM_DEBUG(
- dbgs() << " Won't unroll loop: address of header block is taken.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- if (ULO.TripCount != 0)
- LLVM_DEBUG(dbgs() << " Trip Count = " << ULO.TripCount << "\n");
- if (ULO.TripMultiple != 1)
- LLVM_DEBUG(dbgs() << " Trip Multiple = " << ULO.TripMultiple << "\n");
-
- // Effectively "DCE" unrolled iterations that are beyond the tripcount
- // and will never be executed.
- if (ULO.TripCount != 0 && ULO.Count > ULO.TripCount)
- ULO.Count = ULO.TripCount;
-
- // Don't enter the unroll code if there is nothing to do.
- if (ULO.TripCount == 0 && ULO.Count < 2 && ULO.PeelCount == 0) {
- LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
- return LoopUnrollResult::Unmodified;
- }
-
- assert(ULO.Count > 0);
- assert(ULO.TripMultiple > 0);
- assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0);
-
- // Are we eliminating the loop control altogether?
- bool CompletelyUnroll = ULO.Count == ULO.TripCount;
-
- // We assume a run-time trip count if the compiler cannot
- // figure out the loop trip count and the unroll-runtime
- // flag is specified.
- bool RuntimeTripCount =
- (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime);
-
- assert((!RuntimeTripCount || !ULO.PeelCount) &&
- "Did not expect runtime trip-count unrolling "
- "and peeling for the same loop");
-
- bool Peeled = false;
- if (ULO.PeelCount) {
- Peeled = peelLoop(L, ULO.PeelCount, LI, SE, DT, AC, PreserveLCSSA);
-
- // Successful peeling may result in a change in the loop preheader/trip
- // counts. If we later unroll the loop, we want these to be updated.
- if (Peeled) {
- // According to our guards and profitability checks the only
- // meaningful exit should be latch block. Other exits go to deopt,
- // so we do not worry about them.
- BasicBlock *ExitingBlock = L->getLoopLatch();
- assert(ExitingBlock && "Loop without exiting block?");
- assert(L->isLoopExiting(ExitingBlock) && "Latch is not exiting?");
- ULO.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
- ULO.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
- }
- }
-
+ // The loop-rotate pass can be helpful to avoid this in many cases.
+ LLVM_DEBUG(
+ dbgs() << " Won't unroll loop: address of header block is taken.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ if (ULO.TripCount != 0)
+ LLVM_DEBUG(dbgs() << " Trip Count = " << ULO.TripCount << "\n");
+ if (ULO.TripMultiple != 1)
+ LLVM_DEBUG(dbgs() << " Trip Multiple = " << ULO.TripMultiple << "\n");
+
+ // Effectively "DCE" unrolled iterations that are beyond the tripcount
+ // and will never be executed.
+ if (ULO.TripCount != 0 && ULO.Count > ULO.TripCount)
+ ULO.Count = ULO.TripCount;
+
+ // Don't enter the unroll code if there is nothing to do.
+ if (ULO.TripCount == 0 && ULO.Count < 2 && ULO.PeelCount == 0) {
+ LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ assert(ULO.Count > 0);
+ assert(ULO.TripMultiple > 0);
+ assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = ULO.Count == ULO.TripCount;
+
+ // We assume a run-time trip count if the compiler cannot
+ // figure out the loop trip count and the unroll-runtime
+ // flag is specified.
+ bool RuntimeTripCount =
+ (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime);
+
+ assert((!RuntimeTripCount || !ULO.PeelCount) &&
+ "Did not expect runtime trip-count unrolling "
+ "and peeling for the same loop");
+
+ bool Peeled = false;
+ if (ULO.PeelCount) {
+ Peeled = peelLoop(L, ULO.PeelCount, LI, SE, DT, AC, PreserveLCSSA);
+
+ // Successful peeling may result in a change in the loop preheader/trip
+ // counts. If we later unroll the loop, we want these to be updated.
+ if (Peeled) {
+ // According to our guards and profitability checks the only
+ // meaningful exit should be latch block. Other exits go to deopt,
+ // so we do not worry about them.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ assert(ExitingBlock && "Loop without exiting block?");
+ assert(L->isLoopExiting(ExitingBlock) && "Latch is not exiting?");
+ ULO.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+ ULO.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+ }
+ }
+
// All these values should be taken only after peeling because they might have
// changed.
BasicBlock *Preheader = L->getLoopPreheader();
@@ -414,280 +414,280 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
dbgs() << " No single exiting block\n";
});
- // Loops containing convergent instructions must have a count that divides
- // their TripMultiple.
- LLVM_DEBUG(
- {
- bool HasConvergent = false;
- for (auto &BB : L->blocks())
- for (auto &I : *BB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- HasConvergent |= CB->isConvergent();
- assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
- "Unroll count must divide trip multiple if loop contains a "
- "convergent operation.");
- });
-
- bool EpilogProfitability =
- UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
- : isEpilogProfitable(L);
-
- if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
- !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
- EpilogProfitability, ULO.UnrollRemainder,
- ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
- PreserveLCSSA, RemainderLoop)) {
- if (ULO.Force)
- RuntimeTripCount = false;
- else {
- LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
- "generated when assuming runtime trip count\n");
- return LoopUnrollResult::Unmodified;
- }
- }
-
- // If we know the trip count, we know the multiple...
- unsigned BreakoutTrip = 0;
- if (ULO.TripCount != 0) {
- BreakoutTrip = ULO.TripCount % ULO.Count;
- ULO.TripMultiple = 0;
- } else {
- // Figure out what multiple to use.
- BreakoutTrip = ULO.TripMultiple =
- (unsigned)GreatestCommonDivisor64(ULO.Count, ULO.TripMultiple);
- }
-
- using namespace ore;
- // Report the unrolling decision.
- if (CompletelyUnroll) {
- LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
- << " with trip count " << ULO.TripCount << "!\n");
- if (ORE)
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
- L->getHeader())
- << "completely unrolled loop with "
- << NV("UnrollCount", ULO.TripCount) << " iterations";
- });
- } else if (ULO.PeelCount) {
- LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
- << " with iteration count " << ULO.PeelCount << "!\n");
- if (ORE)
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
- L->getHeader())
- << " peeled loop by " << NV("PeelCount", ULO.PeelCount)
- << " iterations";
- });
- } else {
- auto DiagBuilder = [&]() {
- OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
- L->getHeader());
- return Diag << "unrolled loop by a factor of "
- << NV("UnrollCount", ULO.Count);
- };
-
- LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
- << ULO.Count);
- if (ULO.TripMultiple == 0 || BreakoutTrip != ULO.TripMultiple) {
- LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
- if (ORE)
- ORE->emit([&]() {
- return DiagBuilder() << " with a breakout at trip "
- << NV("BreakoutTrip", BreakoutTrip);
- });
- } else if (ULO.TripMultiple != 1) {
- LLVM_DEBUG(dbgs() << " with " << ULO.TripMultiple << " trips per branch");
- if (ORE)
- ORE->emit([&]() {
- return DiagBuilder()
- << " with " << NV("TripMultiple", ULO.TripMultiple)
- << " trips per branch";
- });
- } else if (RuntimeTripCount) {
- LLVM_DEBUG(dbgs() << " with run-time trip count");
- if (ORE)
- ORE->emit(
- [&]() { return DiagBuilder() << " with run-time trip count"; });
- }
- LLVM_DEBUG(dbgs() << "!\n");
- }
-
- // We are going to make changes to this loop. SCEV may be keeping cached info
- // about it, in particular about backedge taken count. The changes we make
- // are guaranteed to invalidate this information for our loop. It is tempting
- // to only invalidate the loop being unrolled, but it is incorrect as long as
- // all exiting branches from all inner loops have impact on the outer loops,
- // and if something changes inside them then any of outer loops may also
- // change. When we forget outermost loop, we also forget all contained loops
- // and this is what we need here.
- if (SE) {
- if (ULO.ForgetAllSCEV)
- SE->forgetAllLoops();
- else
- SE->forgetTopmostLoop(L);
- }
-
- if (!LatchIsExiting)
- ++NumUnrolledNotLatch;
- Optional<bool> ContinueOnTrue = None;
- BasicBlock *LoopExit = nullptr;
- if (ExitingBI) {
- ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
- LoopExit = ExitingBI->getSuccessor(*ContinueOnTrue);
- }
-
- // For the first iteration of the loop, we should use the precloned values for
- // PHI nodes. Insert associations now.
- ValueToValueMapTy LastValueMap;
- std::vector<PHINode*> OrigPHINode;
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- OrigPHINode.push_back(cast<PHINode>(I));
- }
-
- std::vector<BasicBlock *> Headers;
- std::vector<BasicBlock *> ExitingBlocks;
- std::vector<BasicBlock *> ExitingSucc;
- std::vector<BasicBlock *> Latches;
- Headers.push_back(Header);
- Latches.push_back(LatchBlock);
- if (ExitingBI) {
- ExitingBlocks.push_back(ExitingBI->getParent());
- ExitingSucc.push_back(ExitingBI->getSuccessor(!(*ContinueOnTrue)));
- }
-
- // The current on-the-fly SSA update requires blocks to be processed in
- // reverse postorder so that LastValueMap contains the correct value at each
- // exit.
- LoopBlocksDFS DFS(L);
- DFS.perform(LI);
-
- // Stash the DFS iterators before adding blocks to the loop.
- LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
- LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
-
- std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
-
- // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
- // might break loop-simplified form for these loops (as they, e.g., would
- // share the same exit blocks). We'll keep track of loops for which we can
- // break this so that later we can re-simplify them.
- SmallSetVector<Loop *, 4> LoopsToSimplify;
- for (Loop *SubLoop : *L)
- LoopsToSimplify.insert(SubLoop);
-
- if (Header->getParent()->isDebugInfoForProfiling())
- for (BasicBlock *BB : L->getBlocks())
- for (Instruction &I : *BB)
- if (!isa<DbgInfoIntrinsic>(&I))
- if (const DILocation *DIL = I.getDebugLoc()) {
- auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count);
- if (NewDIL)
- I.setDebugLoc(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- }
-
+ // Loops containing convergent instructions must have a count that divides
+ // their TripMultiple.
+ LLVM_DEBUG(
+ {
+ bool HasConvergent = false;
+ for (auto &BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ HasConvergent |= CB->isConvergent();
+ assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
+ "Unroll count must divide trip multiple if loop contains a "
+ "convergent operation.");
+ });
+
+ bool EpilogProfitability =
+ UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
+ : isEpilogProfitable(L);
+
+ if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
+ !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
+ EpilogProfitability, ULO.UnrollRemainder,
+ ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+ PreserveLCSSA, RemainderLoop)) {
+ if (ULO.Force)
+ RuntimeTripCount = false;
+ else {
+ LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ }
+
+ // If we know the trip count, we know the multiple...
+ unsigned BreakoutTrip = 0;
+ if (ULO.TripCount != 0) {
+ BreakoutTrip = ULO.TripCount % ULO.Count;
+ ULO.TripMultiple = 0;
+ } else {
+ // Figure out what multiple to use.
+ BreakoutTrip = ULO.TripMultiple =
+ (unsigned)GreatestCommonDivisor64(ULO.Count, ULO.TripMultiple);
+ }
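+  // Worked example (editor's note, not in the original source): with
+  // TripCount = 10 and Count = 4, BreakoutTrip = 10 % 4 = 2 and TripMultiple
+  // is cleared to 0; with an unknown trip count, Count = 4 and
+  // TripMultiple = 6, both BreakoutTrip and TripMultiple become gcd(4, 6) = 2.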
+
+ using namespace ore;
+ // Report the unrolling decision.
+ if (CompletelyUnroll) {
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
+ << " with trip count " << ULO.TripCount << "!\n");
+ if (ORE)
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unrolled loop with "
+ << NV("UnrollCount", ULO.TripCount) << " iterations";
+ });
+ } else if (ULO.PeelCount) {
+ LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
+ << " with iteration count " << ULO.PeelCount << "!\n");
+ if (ORE)
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
+ L->getHeader())
+ << " peeled loop by " << NV("PeelCount", ULO.PeelCount)
+ << " iterations";
+ });
+ } else {
+ auto DiagBuilder = [&]() {
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ return Diag << "unrolled loop by a factor of "
+ << NV("UnrollCount", ULO.Count);
+ };
+
+ LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
+ << ULO.Count);
+ if (ULO.TripMultiple == 0 || BreakoutTrip != ULO.TripMultiple) {
+ LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
+ if (ORE)
+ ORE->emit([&]() {
+ return DiagBuilder() << " with a breakout at trip "
+ << NV("BreakoutTrip", BreakoutTrip);
+ });
+ } else if (ULO.TripMultiple != 1) {
+ LLVM_DEBUG(dbgs() << " with " << ULO.TripMultiple << " trips per branch");
+ if (ORE)
+ ORE->emit([&]() {
+ return DiagBuilder()
+ << " with " << NV("TripMultiple", ULO.TripMultiple)
+ << " trips per branch";
+ });
+ } else if (RuntimeTripCount) {
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
+ if (ORE)
+ ORE->emit(
+ [&]() { return DiagBuilder() << " with run-time trip count"; });
+ }
+ LLVM_DEBUG(dbgs() << "!\n");
+ }
+
+ // We are going to make changes to this loop. SCEV may be keeping cached info
+ // about it, in particular about backedge taken count. The changes we make
+ // are guaranteed to invalidate this information for our loop. It is tempting
+ // to only invalidate the loop being unrolled, but it is incorrect as long as
+ // all exiting branches from all inner loops have impact on the outer loops,
+ // and if something changes inside them then any of outer loops may also
+ // change. When we forget outermost loop, we also forget all contained loops
+ // and this is what we need here.
+ if (SE) {
+ if (ULO.ForgetAllSCEV)
+ SE->forgetAllLoops();
+ else
+ SE->forgetTopmostLoop(L);
+ }
+
+ if (!LatchIsExiting)
+ ++NumUnrolledNotLatch;
+ Optional<bool> ContinueOnTrue = None;
+ BasicBlock *LoopExit = nullptr;
+ if (ExitingBI) {
+ ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
+ LoopExit = ExitingBI->getSuccessor(*ContinueOnTrue);
+ }
+
+ // For the first iteration of the loop, we should use the precloned values for
+ // PHI nodes. Insert associations now.
+ ValueToValueMapTy LastValueMap;
+ std::vector<PHINode*> OrigPHINode;
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ OrigPHINode.push_back(cast<PHINode>(I));
+ }
+
+ std::vector<BasicBlock *> Headers;
+ std::vector<BasicBlock *> ExitingBlocks;
+ std::vector<BasicBlock *> ExitingSucc;
+ std::vector<BasicBlock *> Latches;
+ Headers.push_back(Header);
+ Latches.push_back(LatchBlock);
+ if (ExitingBI) {
+ ExitingBlocks.push_back(ExitingBI->getParent());
+ ExitingSucc.push_back(ExitingBI->getSuccessor(!(*ContinueOnTrue)));
+ }
+
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+ std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
+
+ // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
+ // might break loop-simplified form for these loops (as they, e.g., would
+ // share the same exit blocks). We'll keep track of loops for which we can
+ // break this so that later we can re-simplify them.
+ SmallSetVector<Loop *, 4> LoopsToSimplify;
+ for (Loop *SubLoop : *L)
+ LoopsToSimplify.insert(SubLoop);
+
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (!isa<DbgInfoIntrinsic>(&I))
+ if (const DILocation *DIL = I.getDebugLoc()) {
+ auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count);
+ if (NewDIL)
+ I.setDebugLoc(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
+
// Identify what noalias metadata is inside the loop: if it is inside the
// loop, the associated metadata must be cloned for each iteration.
SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
- for (unsigned It = 1; It != ULO.Count; ++It) {
- SmallVector<BasicBlock *, 8> NewBlocks;
- SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
- NewLoops[L] = L;
-
- for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
- ValueToValueMapTy VMap;
- BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
- Header->getParent()->getBasicBlockList().push_back(New);
-
- assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
- "Header should not be in a sub-loop");
- // Tell LI about New.
- const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
- if (OldLoop)
- LoopsToSimplify.insert(NewLoops[OldLoop]);
-
- if (*BB == Header)
- // Loop over all of the PHI nodes in the block, changing them to use
- // the incoming values from the previous block.
- for (PHINode *OrigPHI : OrigPHINode) {
- PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
- Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
- if (Instruction *InValI = dyn_cast<Instruction>(InVal))
- if (It > 1 && L->contains(InValI))
- InVal = LastValueMap[InValI];
- VMap[OrigPHI] = InVal;
- New->getInstList().erase(NewPHI);
- }
-
- // Update our running map of newest clones
- LastValueMap[*BB] = New;
- for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
- VI != VE; ++VI)
- LastValueMap[VI->first] = VI->second;
-
- // Add phi entries for newly created values to all exit blocks.
- for (BasicBlock *Succ : successors(*BB)) {
- if (L->contains(Succ))
- continue;
- for (PHINode &PHI : Succ->phis()) {
- Value *Incoming = PHI.getIncomingValueForBlock(*BB);
- ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
- if (It != LastValueMap.end())
- Incoming = It->second;
- PHI.addIncoming(Incoming, New);
- }
- }
- // Keep track of new headers and latches as we create them, so that
- // we can insert the proper branches later.
- if (*BB == Header)
- Headers.push_back(New);
- if (*BB == LatchBlock)
- Latches.push_back(New);
-
- // Keep track of the exiting block and its successor block contained in
- // the loop for the current iteration.
- if (ExitingBI) {
- if (*BB == ExitingBlocks[0])
- ExitingBlocks.push_back(New);
- if (*BB == ExitingSucc[0])
- ExitingSucc.push_back(New);
- }
-
- NewBlocks.push_back(New);
- UnrolledLoopBlocks.push_back(New);
-
- // Update DomTree: since we just copy the loop body, and each copy has a
- // dedicated entry block (copy of the header block), this header's copy
- // dominates all copied blocks. That means, dominance relations in the
- // copied body are the same as in the original body.
- if (DT) {
- if (*BB == Header)
- DT->addNewBlock(New, Latches[It - 1]);
- else {
- auto BBDomNode = DT->getNode(*BB);
- auto BBIDom = BBDomNode->getIDom();
- BasicBlock *OriginalBBIDom = BBIDom->getBlock();
- DT->addNewBlock(
- New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
- }
- }
- }
-
- // Remap all instructions in the most recent iteration
- remapInstructionsInBlocks(NewBlocks, LastValueMap);
- for (BasicBlock *NewBlock : NewBlocks) {
- for (Instruction &I : *NewBlock) {
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
+ for (unsigned It = 1; It != ULO.Count; ++It) {
+ SmallVector<BasicBlock *, 8> NewBlocks;
+ SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+ NewLoops[L] = L;
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ ValueToValueMapTy VMap;
+ BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
+ "Header should not be in a sub-loop");
+ // Tell LI about New.
+ const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (OldLoop)
+ LoopsToSimplify.insert(NewLoops[OldLoop]);
+
+ if (*BB == Header)
+ // Loop over all of the PHI nodes in the block, changing them to use
+ // the incoming values from the previous block.
+ for (PHINode *OrigPHI : OrigPHINode) {
+ PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
+ Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal))
+ if (It > 1 && L->contains(InValI))
+ InVal = LastValueMap[InValI];
+ VMap[OrigPHI] = InVal;
+ New->getInstList().erase(NewPHI);
+ }
+
+ // Update our running map of newest clones
+ LastValueMap[*BB] = New;
+ for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+ VI != VE; ++VI)
+ LastValueMap[VI->first] = VI->second;
+
+ // Add phi entries for newly created values to all exit blocks.
+ for (BasicBlock *Succ : successors(*BB)) {
+ if (L->contains(Succ))
+ continue;
+ for (PHINode &PHI : Succ->phis()) {
+ Value *Incoming = PHI.getIncomingValueForBlock(*BB);
+ ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
+ if (It != LastValueMap.end())
+ Incoming = It->second;
+ PHI.addIncoming(Incoming, New);
+ }
+ }
+ // Keep track of new headers and latches as we create them, so that
+ // we can insert the proper branches later.
+ if (*BB == Header)
+ Headers.push_back(New);
+ if (*BB == LatchBlock)
+ Latches.push_back(New);
+
+ // Keep track of the exiting block and its successor block contained in
+ // the loop for the current iteration.
+ if (ExitingBI) {
+ if (*BB == ExitingBlocks[0])
+ ExitingBlocks.push_back(New);
+ if (*BB == ExitingSucc[0])
+ ExitingSucc.push_back(New);
+ }
+
+ NewBlocks.push_back(New);
+ UnrolledLoopBlocks.push_back(New);
+
+ // Update DomTree: since we just copy the loop body, and each copy has a
+ // dedicated entry block (copy of the header block), this header's copy
+ // dominates all copied blocks. That means, dominance relations in the
+ // copied body are the same as in the original body.
+ if (DT) {
+ if (*BB == Header)
+ DT->addNewBlock(New, Latches[It - 1]);
+ else {
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
+ }
+
+ // Remap all instructions in the most recent iteration
+ remapInstructionsInBlocks(NewBlocks, LastValueMap);
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
{
// Identify what other metadata depends on the cloned version. After
@@ -697,282 +697,282 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
Header->getContext(), ext);
}
- }
-
- // Loop over the PHI nodes in the original block, setting incoming values.
- for (PHINode *PN : OrigPHINode) {
- if (CompletelyUnroll) {
- PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
- Header->getInstList().erase(PN);
- } else if (ULO.Count > 1) {
- Value *InVal = PN->removeIncomingValue(LatchBlock, false);
- // If this value was defined in the loop, take the value defined by the
- // last iteration of the loop.
- if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
- if (L->contains(InValI))
- InVal = LastValueMap[InVal];
- }
- assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
- PN->addIncoming(InVal, Latches.back());
- }
- }
-
- auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop,
- bool NeedConditional, Optional<bool> ContinueOnTrue,
- bool IsDestLoopExit) {
- auto *Term = cast<BranchInst>(Src->getTerminator());
- if (NeedConditional) {
- // Update the conditional branch's successor for the following
- // iteration.
- assert(ContinueOnTrue.hasValue() &&
- "Expecting valid ContinueOnTrue when NeedConditional is true");
- Term->setSuccessor(!(*ContinueOnTrue), Dest);
- } else {
- // Remove phi operands at this loop exit
- if (!IsDestLoopExit) {
- BasicBlock *BB = Src;
- for (BasicBlock *Succ : successors(BB)) {
- // Preserve the incoming value from BB if we are jumping to the block
- // in the current loop.
- if (Succ == BlockInLoop)
- continue;
- for (PHINode &Phi : Succ->phis())
- Phi.removeIncomingValue(BB, false);
- }
- }
- // Replace the conditional branch with an unconditional one.
- BranchInst::Create(Dest, Term);
- Term->eraseFromParent();
- }
- };
-
- // Connect latches of the unrolled iterations to the headers of the next
- // iteration. If the latch is also the exiting block, the conditional branch
- // may have to be preserved.
- for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
- // The branch destination.
- unsigned j = (i + 1) % e;
- BasicBlock *Dest = Headers[j];
- bool NeedConditional = LatchIsExiting;
-
- if (LatchIsExiting) {
- if (RuntimeTripCount && j != 0)
- NeedConditional = false;
-
- // For a complete unroll, make the last iteration end with a branch
- // to the exit block.
- if (CompletelyUnroll) {
- if (j == 0)
- Dest = LoopExit;
- // If using trip count upper bound to completely unroll, we need to
- // keep the conditional branch except the last one because the loop
- // may exit after any iteration.
- assert(NeedConditional &&
- "NeedCondition cannot be modified by both complete "
- "unrolling and runtime unrolling");
- NeedConditional =
- (ULO.PreserveCondBr && j && !(ULO.PreserveOnlyFirst && i != 0));
- } else if (j != BreakoutTrip &&
- (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) {
- // If we know the trip count or a multiple of it, we can safely use an
- // unconditional branch for some iterations.
- NeedConditional = false;
- }
- }
-
- setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue,
- Dest == LoopExit);
- }
-
- if (!LatchIsExiting) {
- // If the latch is not exiting, we may be able to simplify the conditional
- // branches in the unrolled exiting blocks.
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- // The branch destination.
- unsigned j = (i + 1) % e;
- bool NeedConditional = true;
-
- if (RuntimeTripCount && j != 0)
- NeedConditional = false;
-
- if (CompletelyUnroll)
- // We cannot drop the conditional branch for the last condition, as we
- // may have to execute the loop body depending on the condition.
- NeedConditional = j == 0 || ULO.PreserveCondBr;
- else if (j != BreakoutTrip &&
- (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0))
- // If we know the trip count or a multiple of it, we can safely use an
- // unconditional branch for some iterations.
- NeedConditional = false;
-
-      // Conditional branches from a non-latch exiting block have successors
- // either in the same loop iteration or outside the loop. The branches are
- // already correct.
- if (NeedConditional)
- continue;
- setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
- None, false);
- }
-
- // When completely unrolling, the last latch becomes unreachable.
- if (CompletelyUnroll) {
- BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator());
- new UnreachableInst(Term->getContext(), Term);
- Term->eraseFromParent();
- }
- }
-
- // Update dominators of blocks we might reach through exits.
- // Immediate dominator of such block might change, because we add more
- // routes which can lead to the exit: we can now reach it from the copied
- // iterations too.
- if (DT && ULO.Count > 1) {
- for (auto *BB : OriginalLoopBlocks) {
- auto *BBDomNode = DT->getNode(BB);
- SmallVector<BasicBlock *, 16> ChildrenToUpdate;
- for (auto *ChildDomNode : BBDomNode->children()) {
- auto *ChildBB = ChildDomNode->getBlock();
- if (!L->contains(ChildBB))
- ChildrenToUpdate.push_back(ChildBB);
- }
- BasicBlock *NewIDom;
- if (ExitingBI && BB == ExitingBlocks[0]) {
- // The latch is special because we emit unconditional branches in
- // some cases where the original loop contained a conditional branch.
- // Since the latch is always at the bottom of the loop, if the latch
- // dominated an exit before unrolling, the new dominator of that exit
- // must also be a latch. Specifically, the dominator is the first
- // latch which ends in a conditional branch, or the last latch if
- // there is no such latch.
-        // For loops exiting from a non-latch exiting block, we limit the
- // branch simplification to single exiting block loops.
- NewIDom = ExitingBlocks.back();
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- Instruction *Term = ExitingBlocks[i]->getTerminator();
- if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
- NewIDom =
- DT->findNearestCommonDominator(ExitingBlocks[i], Latches[i]);
- break;
- }
- }
- } else {
- // The new idom of the block will be the nearest common dominator
- // of all copies of the previous idom. This is equivalent to the
- // nearest common dominator of the previous idom and the first latch,
- // which dominates all copies of the previous idom.
- NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
- }
- for (auto *ChildBB : ChildrenToUpdate)
- DT->changeImmediateDominator(ChildBB, NewIDom);
- }
- }
-
- assert(!DT || !UnrollVerifyDomtree ||
- DT->verify(DominatorTree::VerificationLevel::Fast));
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- // Merge adjacent basic blocks, if possible.
- for (BasicBlock *Latch : Latches) {
- BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());
- assert((Term ||
- (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
- "Need a branch as terminator, except when fully unrolling with "
- "unconditional latch");
- if (Term && Term->isUnconditional()) {
- BasicBlock *Dest = Term->getSuccessor(0);
- BasicBlock *Fold = Dest->getUniquePredecessor();
- if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
- // Dest has been folded into Fold. Update our worklists accordingly.
- std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+ }
+
+ // Loop over the PHI nodes in the original block, setting incoming values.
+ for (PHINode *PN : OrigPHINode) {
+ if (CompletelyUnroll) {
+ PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
+ Header->getInstList().erase(PN);
+ } else if (ULO.Count > 1) {
+ Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+ // If this value was defined in the loop, take the value defined by the
+ // last iteration of the loop.
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+ if (L->contains(InValI))
+ InVal = LastValueMap[InVal];
+ }
+ assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
+ PN->addIncoming(InVal, Latches.back());
+ }
+ }
+
+ auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop,
+ bool NeedConditional, Optional<bool> ContinueOnTrue,
+ bool IsDestLoopExit) {
+ auto *Term = cast<BranchInst>(Src->getTerminator());
+ if (NeedConditional) {
+ // Update the conditional branch's successor for the following
+ // iteration.
+ assert(ContinueOnTrue.hasValue() &&
+ "Expecting valid ContinueOnTrue when NeedConditional is true");
+ Term->setSuccessor(!(*ContinueOnTrue), Dest);
+ } else {
+ // Remove phi operands at this loop exit
+ if (!IsDestLoopExit) {
+ BasicBlock *BB = Src;
+ for (BasicBlock *Succ : successors(BB)) {
+ // Preserve the incoming value from BB if we are jumping to the block
+ // in the current loop.
+ if (Succ == BlockInLoop)
+ continue;
+ for (PHINode &Phi : Succ->phis())
+ Phi.removeIncomingValue(BB, false);
+ }
+ }
+ // Replace the conditional branch with an unconditional one.
+ BranchInst::Create(Dest, Term);
+ Term->eraseFromParent();
+ }
+ };
+
+ // Connect latches of the unrolled iterations to the headers of the next
+ // iteration. If the latch is also the exiting block, the conditional branch
+ // may have to be preserved.
+ for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+ // The branch destination.
+ unsigned j = (i + 1) % e;
+ BasicBlock *Dest = Headers[j];
+ bool NeedConditional = LatchIsExiting;
+
+ if (LatchIsExiting) {
+ if (RuntimeTripCount && j != 0)
+ NeedConditional = false;
+
+ // For a complete unroll, make the last iteration end with a branch
+ // to the exit block.
+ if (CompletelyUnroll) {
+ if (j == 0)
+ Dest = LoopExit;
+ // If using trip count upper bound to completely unroll, we need to
+ // keep the conditional branch except the last one because the loop
+ // may exit after any iteration.
+ assert(NeedConditional &&
+ "NeedCondition cannot be modified by both complete "
+ "unrolling and runtime unrolling");
+ NeedConditional =
+ (ULO.PreserveCondBr && j && !(ULO.PreserveOnlyFirst && i != 0));
+ } else if (j != BreakoutTrip &&
+ (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) {
+ // If we know the trip count or a multiple of it, we can safely use an
+ // unconditional branch for some iterations.
+ NeedConditional = false;
+ }
+ }
+
+ setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue,
+ Dest == LoopExit);
+ }
+
+ if (!LatchIsExiting) {
+ // If the latch is not exiting, we may be able to simplify the conditional
+ // branches in the unrolled exiting blocks.
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ // The branch destination.
+ unsigned j = (i + 1) % e;
+ bool NeedConditional = true;
+
+ if (RuntimeTripCount && j != 0)
+ NeedConditional = false;
+
+ if (CompletelyUnroll)
+ // We cannot drop the conditional branch for the last condition, as we
+ // may have to execute the loop body depending on the condition.
+ NeedConditional = j == 0 || ULO.PreserveCondBr;
+ else if (j != BreakoutTrip &&
+ (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0))
+ // If we know the trip count or a multiple of it, we can safely use an
+ // unconditional branch for some iterations.
+ NeedConditional = false;
+
+      // Conditional branches from a non-latch exiting block have successors
+ // either in the same loop iteration or outside the loop. The branches are
+ // already correct.
+ if (NeedConditional)
+ continue;
+ setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
+ None, false);
+ }
+
+ // When completely unrolling, the last latch becomes unreachable.
+ if (CompletelyUnroll) {
+ BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator());
+ new UnreachableInst(Term->getContext(), Term);
+ Term->eraseFromParent();
+ }
+ }
+
+ // Update dominators of blocks we might reach through exits.
+ // Immediate dominator of such block might change, because we add more
+ // routes which can lead to the exit: we can now reach it from the copied
+ // iterations too.
+ if (DT && ULO.Count > 1) {
+ for (auto *BB : OriginalLoopBlocks) {
+ auto *BBDomNode = DT->getNode(BB);
+ SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+ for (auto *ChildDomNode : BBDomNode->children()) {
+ auto *ChildBB = ChildDomNode->getBlock();
+ if (!L->contains(ChildBB))
+ ChildrenToUpdate.push_back(ChildBB);
+ }
+ BasicBlock *NewIDom;
+ if (ExitingBI && BB == ExitingBlocks[0]) {
+ // The latch is special because we emit unconditional branches in
+ // some cases where the original loop contained a conditional branch.
+ // Since the latch is always at the bottom of the loop, if the latch
+ // dominated an exit before unrolling, the new dominator of that exit
+ // must also be a latch. Specifically, the dominator is the first
+ // latch which ends in a conditional branch, or the last latch if
+ // there is no such latch.
+        // For loops exiting from a non-latch exiting block, we limit the
+ // branch simplification to single exiting block loops.
+ NewIDom = ExitingBlocks.back();
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ Instruction *Term = ExitingBlocks[i]->getTerminator();
+ if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
+ NewIDom =
+ DT->findNearestCommonDominator(ExitingBlocks[i], Latches[i]);
+ break;
+ }
+ }
+ } else {
+ // The new idom of the block will be the nearest common dominator
+ // of all copies of the previous idom. This is equivalent to the
+ // nearest common dominator of the previous idom and the first latch,
+ // which dominates all copies of the previous idom.
+ NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
+ }
+ for (auto *ChildBB : ChildrenToUpdate)
+ DT->changeImmediateDominator(ChildBB, NewIDom);
+ }
+ }
+
+ assert(!DT || !UnrollVerifyDomtree ||
+ DT->verify(DominatorTree::VerificationLevel::Fast));
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ // Merge adjacent basic blocks, if possible.
+ for (BasicBlock *Latch : Latches) {
+ BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());
+ assert((Term ||
+ (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
+ "Need a branch as terminator, except when fully unrolling with "
+ "unconditional latch");
+ if (Term && Term->isUnconditional()) {
+ BasicBlock *Dest = Term->getSuccessor(0);
+ BasicBlock *Fold = Dest->getUniquePredecessor();
+ if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
+ // Dest has been folded into Fold. Update our worklists accordingly.
+ std::replace(Latches.begin(), Latches.end(), Dest, Fold);
llvm::erase_value(UnrolledLoopBlocks, Dest);
- }
- }
- }
- // Apply updates to the DomTree.
- DT = &DTU.getDomTree();
-
- // At this point, the code is well formed. We now simplify the unrolled loop,
- // doing constant propagation and dead code elimination as we go.
- simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
- SE, DT, AC, TTI);
-
- NumCompletelyUnrolled += CompletelyUnroll;
- ++NumUnrolled;
-
- Loop *OuterL = L->getParentLoop();
- // Update LoopInfo if the loop is completely removed.
- if (CompletelyUnroll)
- LI->erase(L);
-
- // After complete unrolling most of the blocks should be contained in OuterL.
- // However, some of them might happen to be out of OuterL (e.g. if they
- // precede a loop exit). In this case we might need to insert PHI nodes in
- // order to preserve LCSSA form.
- // We don't need to check this if we already know that we need to fix LCSSA
- // form.
- // TODO: For now we just recompute LCSSA for the outer loop in this case, but
- // it should be possible to fix it in-place.
- if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
- NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
-
- // If we have a pass and a DominatorTree we should re-simplify impacted loops
- // to ensure subsequent analyses can rely on this form. We want to simplify
- // at least one layer outside of the loop that was unrolled so that any
- // changes to the parent loop exposed by the unrolling are considered.
- if (DT) {
- if (OuterL) {
- // OuterL includes all loops for which we can break loop-simplify, so
- // it's sufficient to simplify only it (it'll recursively simplify inner
- // loops too).
- if (NeedToFixLCSSA) {
- // LCSSA must be performed on the outermost affected loop. The unrolled
- // loop's last loop latch is guaranteed to be in the outermost loop
- // after LoopInfo's been updated by LoopInfo::erase.
- Loop *LatchLoop = LI->getLoopFor(Latches.back());
- Loop *FixLCSSALoop = OuterL;
- if (!FixLCSSALoop->contains(LatchLoop))
- while (FixLCSSALoop->getParentLoop() != LatchLoop)
- FixLCSSALoop = FixLCSSALoop->getParentLoop();
-
- formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
- } else if (PreserveLCSSA) {
- assert(OuterL->isLCSSAForm(*DT) &&
- "Loops should be in LCSSA form after loop-unroll.");
- }
-
- // TODO: That potentially might be compile-time expensive. We should try
- // to fix the loop-simplified form incrementally.
- simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA);
- } else {
- // Simplify loops for which we might've broken loop-simplify form.
- for (Loop *SubLoop : LoopsToSimplify)
- simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA);
- }
- }
-
- return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
- : LoopUnrollResult::PartiallyUnrolled;
-}
-
-/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
-/// node with the given name (for example, "llvm.loop.unroll.count"). If no
-/// such metadata node exists, then nullptr is returned.
-MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (!MD)
- continue;
-
- MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- if (!S)
- continue;
-
- if (Name.equals(S->getString()))
- return MD;
- }
- return nullptr;
-}
+ }
+ }
+ }
+ // Apply updates to the DomTree.
+ DT = &DTU.getDomTree();
+
+ // At this point, the code is well formed. We now simplify the unrolled loop,
+ // doing constant propagation and dead code elimination as we go.
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
+ SE, DT, AC, TTI);
+
+ NumCompletelyUnrolled += CompletelyUnroll;
+ ++NumUnrolled;
+
+ Loop *OuterL = L->getParentLoop();
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
+ // After complete unrolling most of the blocks should be contained in OuterL.
+ // However, some of them might happen to be out of OuterL (e.g. if they
+ // precede a loop exit). In this case we might need to insert PHI nodes in
+ // order to preserve LCSSA form.
+ // We don't need to check this if we already know that we need to fix LCSSA
+ // form.
+ // TODO: For now we just recompute LCSSA for the outer loop in this case, but
+ // it should be possible to fix it in-place.
+ if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
+ NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
+
+ // If we have a pass and a DominatorTree we should re-simplify impacted loops
+ // to ensure subsequent analyses can rely on this form. We want to simplify
+ // at least one layer outside of the loop that was unrolled so that any
+ // changes to the parent loop exposed by the unrolling are considered.
+ if (DT) {
+ if (OuterL) {
+ // OuterL includes all loops for which we can break loop-simplify, so
+ // it's sufficient to simplify only it (it'll recursively simplify inner
+ // loops too).
+ if (NeedToFixLCSSA) {
+ // LCSSA must be performed on the outermost affected loop. The unrolled
+ // loop's last loop latch is guaranteed to be in the outermost loop
+ // after LoopInfo's been updated by LoopInfo::erase.
+ Loop *LatchLoop = LI->getLoopFor(Latches.back());
+ Loop *FixLCSSALoop = OuterL;
+ if (!FixLCSSALoop->contains(LatchLoop))
+ while (FixLCSSALoop->getParentLoop() != LatchLoop)
+ FixLCSSALoop = FixLCSSALoop->getParentLoop();
+
+ formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
+ } else if (PreserveLCSSA) {
+ assert(OuterL->isLCSSAForm(*DT) &&
+ "Loops should be in LCSSA form after loop-unroll.");
+ }
+
+ // TODO: That potentially might be compile-time expensive. We should try
+ // to fix the loop-simplified form incrementally.
+ simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA);
+ } else {
+ // Simplify loops for which we might've broken loop-simplify form.
+ for (Loop *SubLoop : LoopsToSimplify)
+ simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA);
+ }
+ }
+
+ return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
+ : LoopUnrollResult::PartiallyUnrolled;
+}
+
+/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
+/// node with the given name (for example, "llvm.loop.unroll.count"). If no
+/// such metadata node exists, then nullptr is returned.
+MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (Name.equals(S->getString()))
+ return MD;
+ }
+ return nullptr;
+}
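+//
+// Illustrative usage (editor's sketch, not part of the original source): for
+// loop metadata such as
+//   !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll.count", i32 4}
+// GetUnrollMetadata(!0, "llvm.loop.unroll.count") returns !1, and a caller can
+// read the count along these lines:
+//   if (MDNode *MD = GetUnrollMetadata(LoopID, "llvm.loop.unroll.count"))
+//     Count = mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();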
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 6dd14c591e..6e32a2b865 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -1,985 +1,985 @@
-//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements loop unroll and jam as a routine, much like
-// LoopUnroll.cpp implements loop unroll.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <assert.h>
-#include <memory>
-#include <type_traits>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll-and-jam"
-
-STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
-STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
-
-typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
-
-// Partition blocks in an outer/inner loop pair into blocks before and after
-// the loop
-static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
- BasicBlockSet &AftBlocks, DominatorTree &DT) {
- Loop *SubLoop = L.getSubLoops()[0];
- BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-
- for (BasicBlock *BB : L.blocks()) {
- if (!SubLoop->contains(BB)) {
- if (DT.dominates(SubLoopLatch, BB))
- AftBlocks.insert(BB);
- else
- ForeBlocks.insert(BB);
- }
- }
-
- // Check that all blocks in ForeBlocks together dominate the subloop
- // TODO: This might ideally be done better with a dominator/postdominators.
- BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
- for (BasicBlock *BB : ForeBlocks) {
- if (BB == SubLoopPreHeader)
- continue;
- Instruction *TI = BB->getTerminator();
- for (BasicBlock *Succ : successors(TI))
- if (!ForeBlocks.count(Succ))
- return false;
- }
-
- return true;
-}
-
-/// Partition blocks in a loop nest into blocks before and after each inner
-/// loop.
-static bool partitionOuterLoopBlocks(
- Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
- DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
- DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
- JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
-
- for (Loop *L : Root.getLoopsInPreorder()) {
- if (L == &JamLoop)
- break;
-
- if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
- return false;
- }
-
- return true;
-}
-
-// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
-// than 2 levels loop.
-static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
- BasicBlockSet &ForeBlocks,
- BasicBlockSet &SubLoopBlocks,
- BasicBlockSet &AftBlocks,
- DominatorTree *DT) {
- SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
- return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
-}
-
-// Looks at the phi nodes in Header for values coming from Latch. For these
-// instructions and all their operands calls Visit on them, keeping going for
-// all the operands in AftBlocks. Returns false if Visit returns false,
-// otherwise returns true. This is used to process the instructions in the
-// Aft blocks that need to be moved before the subloop. It is used in two
-// places. One to check that the required set of instructions can be moved
-// before the loop. Then to collect the instructions to actually move in
-// moveHeaderPhiOperandsToForeBlocks.
-template <typename T>
-static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
- BasicBlockSet &AftBlocks, T Visit) {
- SmallVector<Instruction *, 8> Worklist;
- for (auto &Phi : Header->phis()) {
- Value *V = Phi.getIncomingValueForBlock(Latch);
- if (Instruction *I = dyn_cast<Instruction>(V))
- Worklist.push_back(I);
- }
-
- while (!Worklist.empty()) {
+//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements loop unroll and jam as a routine, much like
+// LoopUnroll.cpp implements loop unroll.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <assert.h>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
+STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
+
+typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
+
+// Partition blocks in an outer/inner loop pair into blocks before and after
+// the loop
+static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
+ BasicBlockSet &AftBlocks, DominatorTree &DT) {
+ Loop *SubLoop = L.getSubLoops()[0];
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+
+ for (BasicBlock *BB : L.blocks()) {
+ if (!SubLoop->contains(BB)) {
+ if (DT.dominates(SubLoopLatch, BB))
+ AftBlocks.insert(BB);
+ else
+ ForeBlocks.insert(BB);
+ }
+ }
+
+ // Check that all blocks in ForeBlocks together dominate the subloop
+ // TODO: This might ideally be done better with a dominator/postdominators.
+ BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
+ for (BasicBlock *BB : ForeBlocks) {
+ if (BB == SubLoopPreHeader)
+ continue;
+ Instruction *TI = BB->getTerminator();
+ for (BasicBlock *Succ : successors(TI))
+ if (!ForeBlocks.count(Succ))
+ return false;
+ }
+
+ return true;
+}
+
+/// Partition blocks in a loop nest into blocks before and after each inner
+/// loop.
+static bool partitionOuterLoopBlocks(
+ Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
+ DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
+ JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
+
+ for (Loop *L : Root.getLoopsInPreorder()) {
+ if (L == &JamLoop)
+ break;
+
+ if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
+ return false;
+ }
+
+ return true;
+}
+
+// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
+// than 2 levels loop.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+ BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks,
+ DominatorTree *DT) {
+ SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+ return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
+}
+
+// Looks at the phi nodes in Header for values coming from Latch. For these
+// instructions and all their operands calls Visit on them, keeping going for
+// all the operands in AftBlocks. Returns false if Visit returns false,
+// otherwise returns true. This is used to process the instructions in the
+// Aft blocks that need to be moved before the subloop. It is used in two
+// places. One to check that the required set of instructions can be moved
+// before the loop. Then to collect the instructions to actually move in
+// moveHeaderPhiOperandsToForeBlocks.
+template <typename T>
+static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
+ BasicBlockSet &AftBlocks, T Visit) {
+ SmallVector<Instruction *, 8> Worklist;
+ for (auto &Phi : Header->phis()) {
+ Value *V = Phi.getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+ }
+
+ while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
- if (!Visit(I))
- return false;
-
- if (AftBlocks.count(I->getParent()))
- for (auto &U : I->operands())
- if (Instruction *II = dyn_cast<Instruction>(U))
- Worklist.push_back(II);
- }
-
- return true;
-}
-
-// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
-static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
- BasicBlock *Latch,
- Instruction *InsertLoc,
- BasicBlockSet &AftBlocks) {
- // We need to ensure we move the instructions in the correct order,
- // starting with the earliest required instruction and moving forward.
- std::vector<Instruction *> Visited;
- processHeaderPhiOperands(Header, Latch, AftBlocks,
- [&Visited, &AftBlocks](Instruction *I) {
- if (AftBlocks.count(I->getParent()))
- Visited.push_back(I);
- return true;
- });
-
- // Move all instructions in program order to before the InsertLoc
- BasicBlock *InsertLocBB = InsertLoc->getParent();
- for (Instruction *I : reverse(Visited)) {
- if (I->getParent() != InsertLocBB)
- I->moveBefore(InsertLoc);
- }
-}
-
-/*
- This method performs Unroll and Jam. For a simple loop like:
- for (i = ..)
- Fore(i)
- for (j = ..)
- SubLoop(i, j)
- Aft(i)
-
- Instead of doing normal inner or outer unrolling, we do:
- for (i = .., i+=2)
- Fore(i)
- Fore(i+1)
- for (j = ..)
- SubLoop(i, j)
- SubLoop(i+1, j)
- Aft(i)
- Aft(i+1)
-
- So the outer loop is essetially unrolled and then the inner loops are fused
- ("jammed") together into a single loop. This can increase speed when there
- are loads in SubLoop that are invariant to i, as they become shared between
- the now jammed inner loops.
-
- We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
- Fore blocks are those before the inner loop, Aft are those after. Normal
- Unroll code is used to copy each of these sets of blocks and the results are
- combined together into the final form above.
-
- isSafeToUnrollAndJam should be used prior to calling this to make sure the
- unrolling will be valid. Checking profitablility is also advisable.
-
- If EpilogueLoop is non-null, it receives the epilogue loop (if it was
- necessary to create one and not fully unrolled).
-*/
-LoopUnrollResult
-llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
- unsigned TripMultiple, bool UnrollRemainder,
- LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC, const TargetTransformInfo *TTI,
- OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
-
- // When we enter here we should have already checked that it is safe
- BasicBlock *Header = L->getHeader();
- assert(Header && "No header.");
- assert(L->getSubLoops().size() == 1);
- Loop *SubLoop = *L->begin();
-
- // Don't enter the unroll code if there is nothing to do.
- if (TripCount == 0 && Count < 2) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
- return LoopUnrollResult::Unmodified;
- }
-
- assert(Count > 0);
- assert(TripMultiple > 0);
- assert(TripCount == 0 || TripCount % TripMultiple == 0);
-
- // Are we eliminating the loop control altogether?
- bool CompletelyUnroll = (Count == TripCount);
-
- // We use the runtime remainder in cases where we don't know trip multiple
- if (TripMultiple == 1 || TripMultiple % Count != 0) {
- if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
- /*UseEpilogRemainder*/ true,
- UnrollRemainder, /*ForgetAllSCEV*/ false,
- LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
- "generated when assuming runtime trip count\n");
- return LoopUnrollResult::Unmodified;
- }
- }
-
- // Notify ScalarEvolution that the loop will be substantially changed,
- // if not outright eliminated.
- if (SE) {
- SE->forgetLoop(L);
- SE->forgetLoop(SubLoop);
- }
-
- using namespace ore;
- // Report the unrolling decision.
- if (CompletelyUnroll) {
- LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
- << Header->getName() << " with trip count " << TripCount
- << "!\n");
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
- L->getHeader())
- << "completely unroll and jammed loop with "
- << NV("UnrollCount", TripCount) << " iterations");
- } else {
- auto DiagBuilder = [&]() {
- OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
- L->getHeader());
- return Diag << "unroll and jammed loop by a factor of "
- << NV("UnrollCount", Count);
- };
-
- LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
- << " by " << Count);
- if (TripMultiple != 1) {
- LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
- ORE->emit([&]() {
- return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
- << " trips per branch";
- });
- } else {
- LLVM_DEBUG(dbgs() << " with run-time trip count");
- ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
- }
- LLVM_DEBUG(dbgs() << "!\n");
- }
-
- BasicBlock *Preheader = L->getLoopPreheader();
- BasicBlock *LatchBlock = L->getLoopLatch();
- assert(Preheader && "No preheader");
- assert(LatchBlock && "No latch block");
- BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
- assert(BI && !BI->isUnconditional());
- bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
- BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
- bool SubLoopContinueOnTrue = SubLoop->contains(
- SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
-
- // Partition blocks in an outer/inner loop pair into blocks before and after
- // the loop
- BasicBlockSet SubLoopBlocks;
- BasicBlockSet ForeBlocks;
- BasicBlockSet AftBlocks;
- partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
- DT);
-
- // We keep track of the entering/first and exiting/last block of each of
- // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
- // blocks easier.
- std::vector<BasicBlock *> ForeBlocksFirst;
- std::vector<BasicBlock *> ForeBlocksLast;
- std::vector<BasicBlock *> SubLoopBlocksFirst;
- std::vector<BasicBlock *> SubLoopBlocksLast;
- std::vector<BasicBlock *> AftBlocksFirst;
- std::vector<BasicBlock *> AftBlocksLast;
- ForeBlocksFirst.push_back(Header);
- ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
- SubLoopBlocksFirst.push_back(SubLoop->getHeader());
- SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
- AftBlocksFirst.push_back(SubLoop->getExitBlock());
- AftBlocksLast.push_back(L->getExitingBlock());
- // Maps Blocks[0] -> Blocks[It]
- ValueToValueMapTy LastValueMap;
-
- // Move any instructions from fore phi operands from AftBlocks into Fore.
- moveHeaderPhiOperandsToForeBlocks(
- Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
-
- // The current on-the-fly SSA update requires blocks to be processed in
- // reverse postorder so that LastValueMap contains the correct value at each
- // exit.
- LoopBlocksDFS DFS(L);
- DFS.perform(LI);
- // Stash the DFS iterators before adding blocks to the loop.
- LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
- LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
-
- if (Header->getParent()->isDebugInfoForProfiling())
- for (BasicBlock *BB : L->getBlocks())
- for (Instruction &I : *BB)
- if (!isa<DbgInfoIntrinsic>(&I))
- if (const DILocation *DIL = I.getDebugLoc()) {
- auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
- if (NewDIL)
- I.setDebugLoc(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- }
-
- // Copy all blocks
- for (unsigned It = 1; It != Count; ++It) {
- SmallVector<BasicBlock *, 8> NewBlocks;
- // Maps Blocks[It] -> Blocks[It-1]
- DenseMap<Value *, Value *> PrevItValueMap;
- SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
- NewLoops[L] = L;
- NewLoops[SubLoop] = SubLoop;
-
- for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
- ValueToValueMapTy VMap;
- BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
- Header->getParent()->getBasicBlockList().push_back(New);
-
- // Tell LI about New.
- addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
-
- if (ForeBlocks.count(*BB)) {
- if (*BB == ForeBlocksFirst[0])
- ForeBlocksFirst.push_back(New);
- if (*BB == ForeBlocksLast[0])
- ForeBlocksLast.push_back(New);
- } else if (SubLoopBlocks.count(*BB)) {
- if (*BB == SubLoopBlocksFirst[0])
- SubLoopBlocksFirst.push_back(New);
- if (*BB == SubLoopBlocksLast[0])
- SubLoopBlocksLast.push_back(New);
- } else if (AftBlocks.count(*BB)) {
- if (*BB == AftBlocksFirst[0])
- AftBlocksFirst.push_back(New);
- if (*BB == AftBlocksLast[0])
- AftBlocksLast.push_back(New);
- } else {
- llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
- }
-
- // Update our running maps of newest clones
- PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
- LastValueMap[*BB] = New;
- for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
- VI != VE; ++VI) {
- PrevItValueMap[VI->second] =
- const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
- LastValueMap[VI->first] = VI->second;
- }
-
- NewBlocks.push_back(New);
-
- // Update DomTree:
- if (*BB == ForeBlocksFirst[0])
- DT->addNewBlock(New, ForeBlocksLast[It - 1]);
- else if (*BB == SubLoopBlocksFirst[0])
- DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
- else if (*BB == AftBlocksFirst[0])
- DT->addNewBlock(New, AftBlocksLast[It - 1]);
- else {
- // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
- // structure.
- auto BBDomNode = DT->getNode(*BB);
- auto BBIDom = BBDomNode->getIDom();
- BasicBlock *OriginalBBIDom = BBIDom->getBlock();
- assert(OriginalBBIDom);
- assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
- DT->addNewBlock(
- New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
- }
- }
-
- // Remap all instructions in the most recent iteration
- remapInstructionsInBlocks(NewBlocks, LastValueMap);
- for (BasicBlock *NewBlock : NewBlocks) {
- for (Instruction &I : *NewBlock) {
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
-
- // Alter the ForeBlocks phi's, pointing them at the latest version of the
- // value from the previous iteration's phis
- for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
- Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
- assert(OldValue && "should have incoming edge from Aft[It]");
- Value *NewValue = OldValue;
- if (Value *PrevValue = PrevItValueMap[OldValue])
- NewValue = PrevValue;
-
- assert(Phi.getNumOperands() == 2);
- Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
- Phi.setIncomingValue(0, NewValue);
- Phi.removeIncomingValue(1);
- }
- }
-
- // Now that all the basic blocks for the unrolled iterations are in place,
- // finish up connecting the blocks and phi nodes. At this point LastValueMap
- // is the last unrolled iterations values.
-
- // Update Phis in BB from OldBB to point to NewBB and use the latest value
- // from LastValueMap
- auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
- BasicBlock *NewBB,
- ValueToValueMapTy &LastValueMap) {
- for (PHINode &Phi : BB->phis()) {
- for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
- if (Phi.getIncomingBlock(b) == OldBB) {
- Value *OldValue = Phi.getIncomingValue(b);
- if (Value *LastValue = LastValueMap[OldValue])
- Phi.setIncomingValue(b, LastValue);
- Phi.setIncomingBlock(b, NewBB);
- break;
- }
- }
- }
- };
- // Move all the phis from Src into Dest
- auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
- Instruction *insertPoint = Dest->getFirstNonPHI();
- while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
- Phi->moveBefore(insertPoint);
- };
-
- // Update the PHI values outside the loop to point to the last block
- updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
- LastValueMap);
-
- // Update ForeBlocks successors and phi nodes
- BranchInst *ForeTerm =
- cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
- assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
- ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
-
- if (CompletelyUnroll) {
- while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
- Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
- Phi->getParent()->getInstList().erase(Phi);
- }
- } else {
- // Update the PHI values to point to the last aft block
- updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
- AftBlocksLast.back(), LastValueMap);
- }
-
- for (unsigned It = 1; It != Count; It++) {
- // Remap ForeBlock successors from previous iteration to this
- BranchInst *ForeTerm =
- cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
- assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
- ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
- }
-
- // Subloop successors and phis
- BranchInst *SubTerm =
- cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
- SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
- SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
+ if (!Visit(I))
+ return false;
+
+ if (AftBlocks.count(I->getParent()))
+ for (auto &U : I->operands())
+ if (Instruction *II = dyn_cast<Instruction>(U))
+ Worklist.push_back(II);
+ }
+
+ return true;
+}
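Editor's note: a minimal sketch of how processHeaderPhiOperands is typically driven (the lambda below is illustrative only, not the exact predicate used elsewhere in this file): the visitor rejects anything in the Aft blocks that cannot safely be hoisted above the subloop.

  // Illustrative check: can every Aft-block instruction feeding the header
  // phis be moved before the subloop?
  bool CanMove = processHeaderPhiOperands(
      Header, Latch, AftBlocks, [&AftBlocks](Instruction *I) {
        if (AftBlocks.count(I->getParent()) &&
            (I->mayHaveSideEffects() || I->mayReadOrWriteMemory()))
          return false; // this instruction pins the phi operand in Aft
        return true;    // keep walking its operands
      });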
+
+// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
+static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
+ BasicBlock *Latch,
+ Instruction *InsertLoc,
+ BasicBlockSet &AftBlocks) {
+ // We need to ensure we move the instructions in the correct order,
+ // starting with the earliest required instruction and moving forward.
+ std::vector<Instruction *> Visited;
+ processHeaderPhiOperands(Header, Latch, AftBlocks,
+ [&Visited, &AftBlocks](Instruction *I) {
+ if (AftBlocks.count(I->getParent()))
+ Visited.push_back(I);
+ return true;
+ });
+
+ // Move all instructions in program order to before the InsertLoc
+ BasicBlock *InsertLocBB = InsertLoc->getParent();
+ for (Instruction *I : reverse(Visited)) {
+ if (I->getParent() != InsertLocBB)
+ I->moveBefore(InsertLoc);
+ }
+}
+
+/*
+ This method performs Unroll and Jam. For a simple loop like:
+ for (i = ..)
+ Fore(i)
+ for (j = ..)
+ SubLoop(i, j)
+ Aft(i)
+
+ Instead of doing normal inner or outer unrolling, we do:
+ for (i = .., i+=2)
+ Fore(i)
+ Fore(i+1)
+ for (j = ..)
+ SubLoop(i, j)
+ SubLoop(i+1, j)
+ Aft(i)
+ Aft(i+1)
+
+ So the outer loop is essentially unrolled and then the inner loops are fused
+ ("jammed") together into a single loop. This can increase speed when there
+ are loads in SubLoop that are invariant to i, as they become shared between
+ the now jammed inner loops.
+
+ We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
+ Fore blocks are those before the inner loop, Aft are those after. Normal
+ Unroll code is used to copy each of these sets of blocks and the results are
+ combined together into the final form above.
+
+ isSafeToUnrollAndJam should be used prior to calling this to make sure the
+ unrolling will be valid. Checking profitability is also advisable.
+
+ If EpilogueLoop is non-null, it receives the epilogue loop (if it was
+ necessary to create one and not fully unrolled).
+*/
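Editor's note: a concrete, hypothetical instance of the load-sharing effect described above, assuming plain arrays A, B and C and an even N:

  // Hypothetical C-level illustration, unroll-and-jam by 2:
  //   for (i = 0; i < N; i += 2)
  //     for (j = 0; j < M; j++) {
  //       C[i]   += A[j] * B[i][j];
  //       C[i+1] += A[j] * B[i+1][j];  // A[j] does not depend on i, so the
  //     }                              // jammed body loads it once per j and
  //                                    // shares it between iterations i and i+1.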
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, bool UnrollRemainder,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
+
+ // When we enter here we should have already checked that it is safe
+ BasicBlock *Header = L->getHeader();
+ assert(Header && "No header.");
+ assert(L->getSubLoops().size() == 1);
+ Loop *SubLoop = *L->begin();
+
+ // Don't enter the unroll code if there is nothing to do.
+ if (TripCount == 0 && Count < 2) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ assert(Count > 0);
+ assert(TripMultiple > 0);
+ assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = (Count == TripCount);
+
+ // We use the runtime remainder in cases where we don't know the trip multiple
+ if (TripMultiple == 1 || TripMultiple % Count != 0) {
+ if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
+ /*UseEpilogRemainder*/ true,
+ UnrollRemainder, /*ForgetAllSCEV*/ false,
+ LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ }
+
+ // Notify ScalarEvolution that the loop will be substantially changed,
+ // if not outright eliminated.
+ if (SE) {
+ SE->forgetLoop(L);
+ SE->forgetLoop(SubLoop);
+ }
+
+ using namespace ore;
+ // Report the unrolling decision.
+ if (CompletelyUnroll) {
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
+ << Header->getName() << " with trip count " << TripCount
+ << "!\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unroll and jammed loop with "
+ << NV("UnrollCount", TripCount) << " iterations");
+ } else {
+ auto DiagBuilder = [&]() {
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ return Diag << "unroll and jammed loop by a factor of "
+ << NV("UnrollCount", Count);
+ };
+
+ LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
+ << " by " << Count);
+ if (TripMultiple != 1) {
+ LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ ORE->emit([&]() {
+ return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
+ << " trips per branch";
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
+ ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
+ }
+ LLVM_DEBUG(dbgs() << "!\n");
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(Preheader && "No preheader");
+ assert(LatchBlock && "No latch block");
+ BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+ assert(BI && !BI->isUnconditional());
+ bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+ BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+ bool SubLoopContinueOnTrue = SubLoop->contains(
+ SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
+
+ // Partition blocks in an outer/inner loop pair into blocks before and after
+ // the loop
+ BasicBlockSet SubLoopBlocks;
+ BasicBlockSet ForeBlocks;
+ BasicBlockSet AftBlocks;
+ partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
+ DT);
+
+ // We keep track of the entering/first and exiting/last block of each of
+ // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
+ // blocks easier.
+ std::vector<BasicBlock *> ForeBlocksFirst;
+ std::vector<BasicBlock *> ForeBlocksLast;
+ std::vector<BasicBlock *> SubLoopBlocksFirst;
+ std::vector<BasicBlock *> SubLoopBlocksLast;
+ std::vector<BasicBlock *> AftBlocksFirst;
+ std::vector<BasicBlock *> AftBlocksLast;
+ ForeBlocksFirst.push_back(Header);
+ ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
+ SubLoopBlocksFirst.push_back(SubLoop->getHeader());
+ SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
+ AftBlocksFirst.push_back(SubLoop->getExitBlock());
+ AftBlocksLast.push_back(L->getExitingBlock());
+ // Maps Blocks[0] -> Blocks[It]
+ ValueToValueMapTy LastValueMap;
+
+ // Move any instructions the fore phi operands depend on from AftBlocks into Fore.
+ moveHeaderPhiOperandsToForeBlocks(
+ Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
+
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (!isa<DbgInfoIntrinsic>(&I))
+ if (const DILocation *DIL = I.getDebugLoc()) {
+ auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
+ if (NewDIL)
+ I.setDebugLoc(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
+
+ // Copy all blocks
+ for (unsigned It = 1; It != Count; ++It) {
+ SmallVector<BasicBlock *, 8> NewBlocks;
+ // Maps Blocks[It] -> Blocks[It-1]
+ DenseMap<Value *, Value *> PrevItValueMap;
+ SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+ NewLoops[L] = L;
+ NewLoops[SubLoop] = SubLoop;
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ ValueToValueMapTy VMap;
+ BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ // Tell LI about New.
+ addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+
+ if (ForeBlocks.count(*BB)) {
+ if (*BB == ForeBlocksFirst[0])
+ ForeBlocksFirst.push_back(New);
+ if (*BB == ForeBlocksLast[0])
+ ForeBlocksLast.push_back(New);
+ } else if (SubLoopBlocks.count(*BB)) {
+ if (*BB == SubLoopBlocksFirst[0])
+ SubLoopBlocksFirst.push_back(New);
+ if (*BB == SubLoopBlocksLast[0])
+ SubLoopBlocksLast.push_back(New);
+ } else if (AftBlocks.count(*BB)) {
+ if (*BB == AftBlocksFirst[0])
+ AftBlocksFirst.push_back(New);
+ if (*BB == AftBlocksLast[0])
+ AftBlocksLast.push_back(New);
+ } else {
+ llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
+ }
+
+ // Update our running maps of newest clones
+ PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
+ LastValueMap[*BB] = New;
+ for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+ VI != VE; ++VI) {
+ PrevItValueMap[VI->second] =
+ const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
+ LastValueMap[VI->first] = VI->second;
+ }
+
+ NewBlocks.push_back(New);
+
+ // Update DomTree:
+ if (*BB == ForeBlocksFirst[0])
+ DT->addNewBlock(New, ForeBlocksLast[It - 1]);
+ else if (*BB == SubLoopBlocksFirst[0])
+ DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
+ else if (*BB == AftBlocksFirst[0])
+ DT->addNewBlock(New, AftBlocksLast[It - 1]);
+ else {
+ // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
+ // structure.
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ assert(OriginalBBIDom);
+ assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
+
+ // Remap all instructions in the most recent iteration
+ remapInstructionsInBlocks(NewBlocks, LastValueMap);
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Alter the ForeBlocks phis, pointing them at the latest version of the
+ // value from the previous iteration's phis
+ for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
+ Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
+ assert(OldValue && "should have incoming edge from Aft[It]");
+ Value *NewValue = OldValue;
+ if (Value *PrevValue = PrevItValueMap[OldValue])
+ NewValue = PrevValue;
+
+ assert(Phi.getNumOperands() == 2);
+ Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
+ Phi.setIncomingValue(0, NewValue);
+ Phi.removeIncomingValue(1);
+ }
+ }
+
+ // Now that all the basic blocks for the unrolled iterations are in place,
+ // finish up connecting the blocks and phi nodes. At this point LastValueMap
+ // is the last unrolled iteration's values.
+
+ // Update Phis in BB from OldBB to point to NewBB and use the latest value
+ // from LastValueMap
+ auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ ValueToValueMapTy &LastValueMap) {
+ for (PHINode &Phi : BB->phis()) {
+ for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
+ if (Phi.getIncomingBlock(b) == OldBB) {
+ Value *OldValue = Phi.getIncomingValue(b);
+ if (Value *LastValue = LastValueMap[OldValue])
+ Phi.setIncomingValue(b, LastValue);
+ Phi.setIncomingBlock(b, NewBB);
+ break;
+ }
+ }
+ }
+ };
+ // Move all the phis from Src into Dest
+ auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
+ Instruction *insertPoint = Dest->getFirstNonPHI();
+ while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
+ Phi->moveBefore(insertPoint);
+ };
+
+ // Update the PHI values outside the loop to point to the last block
+ updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
+ LastValueMap);
+
+ // Update ForeBlocks successors and phi nodes
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
+
+ if (CompletelyUnroll) {
+ while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
+ Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
+ Phi->getParent()->getInstList().erase(Phi);
+ }
+ } else {
+ // Update the PHI values to point to the last aft block
+ updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
+ AftBlocksLast.back(), LastValueMap);
+ }
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Remap ForeBlock successors from previous iteration to this
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
+ }
+
+ // Subloop successors and phis
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
+ SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
+ SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
ForeBlocksLast.back());
SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
SubLoopBlocksLast.back());
-
- for (unsigned It = 1; It != Count; It++) {
- // Replace the conditional branch of the previous iteration subloop with an
- // unconditional one to this one
- BranchInst *SubTerm =
- cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
- BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
- SubTerm->eraseFromParent();
-
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Replace the conditional branch of the previous iteration subloop with an
+ // unconditional one to this one
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
+ SubTerm->eraseFromParent();
+
SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
ForeBlocksLast.back());
SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
SubLoopBlocksLast.back());
- movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
- }
-
- // Aft blocks successors and phis
- BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
- if (CompletelyUnroll) {
- BranchInst::Create(LoopExit, AftTerm);
- AftTerm->eraseFromParent();
- } else {
- AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
- assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
- "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
- }
+ movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
+ }
+
+ // Aft blocks successors and phis
+ BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+ if (CompletelyUnroll) {
+ BranchInst::Create(LoopExit, AftTerm);
+ AftTerm->eraseFromParent();
+ } else {
+ AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
+ "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
+ }
AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
SubLoopBlocksLast.back());
-
- for (unsigned It = 1; It != Count; It++) {
- // Replace the conditional branch of the previous iteration subloop with an
- // unconditional one to this one
- BranchInst *AftTerm =
- cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
- BranchInst::Create(AftBlocksFirst[It], AftTerm);
- AftTerm->eraseFromParent();
-
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Replace the conditional branch of the previous iteration subloop with an
+ // unconditional one to this one
+ BranchInst *AftTerm =
+ cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(AftBlocksFirst[It], AftTerm);
+ AftTerm->eraseFromParent();
+
AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
SubLoopBlocksLast.back());
- movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
- }
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
- // new ones required.
- if (Count != 1) {
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
- SubLoopBlocksFirst[0]);
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
- SubLoopBlocksLast[0], AftBlocksFirst[0]);
-
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
- ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
- SubLoopBlocksLast.back(), AftBlocksFirst[0]);
- DTU.applyUpdatesPermissive(DTUpdates);
- }
-
- // Merge adjacent basic blocks, if possible.
- SmallPtrSet<BasicBlock *, 16> MergeBlocks;
- MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
- MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
- MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
-
- MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
-
- // Apply updates to the DomTree.
- DT = &DTU.getDomTree();
-
- // At this point, the code is well formed. We now do a quick sweep over the
- // inserted code, doing constant propagation and dead code elimination as we
- // go.
- simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
- simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
- TTI);
-
- NumCompletelyUnrolledAndJammed += CompletelyUnroll;
- ++NumUnrolledAndJammed;
-
- // Update LoopInfo if the loop is completely removed.
- if (CompletelyUnroll)
- LI->erase(L);
-
-#ifndef NDEBUG
- // We shouldn't have done anything to break loop simplify form or LCSSA.
- Loop *OutestLoop = SubLoop->getParentLoop()
- ? SubLoop->getParentLoop()->getParentLoop()
- ? SubLoop->getParentLoop()->getParentLoop()
- : SubLoop->getParentLoop()
- : SubLoop;
- assert(DT->verify());
- LI->verify(*DT);
- assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
- if (!CompletelyUnroll)
- assert(L->isLoopSimplifyForm());
- assert(SubLoop->isLoopSimplifyForm());
- SE->verify();
-#endif
-
- return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
- : LoopUnrollResult::PartiallyUnrolled;
-}
-
-static bool getLoadsAndStores(BasicBlockSet &Blocks,
- SmallVector<Instruction *, 4> &MemInstr) {
- // Scan the BBs and collect legal loads and stores.
- // Returns false if non-simple loads/stores are found.
- for (BasicBlock *BB : Blocks) {
- for (Instruction &I : *BB) {
- if (auto *Ld = dyn_cast<LoadInst>(&I)) {
- if (!Ld->isSimple())
- return false;
- MemInstr.push_back(&I);
- } else if (auto *St = dyn_cast<StoreInst>(&I)) {
- if (!St->isSimple())
- return false;
- MemInstr.push_back(&I);
- } else if (I.mayReadOrWriteMemory()) {
- return false;
- }
- }
- }
- return true;
-}
-
-static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
- unsigned UnrollLevel, unsigned JamLevel,
- bool Sequentialized, Dependence *D) {
- // UnrollLevel might carry the dependency Src --> Dst
- // Does a different loop after unrolling?
- for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
- ++CurLoopDepth) {
- auto JammedDir = D->getDirection(CurLoopDepth);
- if (JammedDir == Dependence::DVEntry::LT)
- return true;
-
- if (JammedDir & Dependence::DVEntry::GT)
- return false;
- }
-
- return true;
-}
-
-static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
- unsigned UnrollLevel, unsigned JamLevel,
- bool Sequentialized, Dependence *D) {
- // UnrollLevel might carry the dependency Dst --> Src
- for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
- ++CurLoopDepth) {
- auto JammedDir = D->getDirection(CurLoopDepth);
- if (JammedDir == Dependence::DVEntry::GT)
- return true;
-
- if (JammedDir & Dependence::DVEntry::LT)
- return false;
- }
-
- // Backward dependencies are only preserved if not interleaved.
- return Sequentialized;
-}
-
-// Check whether it is semantically safe Src and Dst considering any potential
-// dependency between them.
-//
-// @param UnrollLevel The level of the loop being unrolled
-// @param JamLevel The level of the loop being jammed; if Src and Dst are on
-// different levels, the outermost common loop counts as jammed level
-//
-// @return true if is safe and false if there is a dependency violation.
-static bool checkDependency(Instruction *Src, Instruction *Dst,
- unsigned UnrollLevel, unsigned JamLevel,
- bool Sequentialized, DependenceInfo &DI) {
- assert(UnrollLevel <= JamLevel &&
- "Expecting JamLevel to be at least UnrollLevel");
-
- if (Src == Dst)
- return true;
- // Ignore Input dependencies.
- if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
- return true;
-
- // Check whether unroll-and-jam may violate a dependency.
- // By construction, every dependency will be lexicographically non-negative
- // (if it was, it would violate the current execution order), such as
- // (0,0,>,*,*)
- // Unroll-and-jam changes the GT execution of two executions to the same
- // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
- // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
- // (0,0,>=,*,*)
- // Now, the dependency is not necessarily non-negative anymore, i.e.
- // unroll-and-jam may violate correctness.
- std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
- if (!D)
- return true;
- assert(D->isOrdered() && "Expected an output, flow or anti dep.");
-
- if (D->isConfused()) {
- LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
- return false;
- }
-
- // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
- // non-equal direction, then the locations accessed in the inner levels cannot
- // overlap in memory. We assumes the indexes never overlap into neighboring
- // dimensions.
- for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
- if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
- return true;
-
- auto UnrollDirection = D->getDirection(UnrollLevel);
-
- // If the distance carried by the unrolled loop is 0, then after unrolling
- // that distance will become non-zero resulting in non-overlapping accesses in
- // the inner loops.
- if (UnrollDirection == Dependence::DVEntry::EQ)
- return true;
-
- if (UnrollDirection & Dependence::DVEntry::LT &&
- !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
- Sequentialized, D.get()))
- return false;
-
- if (UnrollDirection & Dependence::DVEntry::GT &&
- !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
- Sequentialized, D.get()))
- return false;
-
- return true;
-}
-
-static bool
-checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
- const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
- const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
- DependenceInfo &DI, LoopInfo &LI) {
- SmallVector<BasicBlockSet, 8> AllBlocks;
- for (Loop *L : Root.getLoopsInPreorder())
- if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
- AllBlocks.push_back(ForeBlocksMap.lookup(L));
- AllBlocks.push_back(SubLoopBlocks);
- for (Loop *L : Root.getLoopsInPreorder())
- if (AftBlocksMap.find(L) != AftBlocksMap.end())
- AllBlocks.push_back(AftBlocksMap.lookup(L));
-
- unsigned LoopDepth = Root.getLoopDepth();
- SmallVector<Instruction *, 4> EarlierLoadsAndStores;
- SmallVector<Instruction *, 4> CurrentLoadsAndStores;
- for (BasicBlockSet &Blocks : AllBlocks) {
- CurrentLoadsAndStores.clear();
- if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
- return false;
-
- Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
- unsigned CurLoopDepth = CurLoop->getLoopDepth();
-
- for (auto *Earlier : EarlierLoadsAndStores) {
- Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
- unsigned EarlierDepth = EarlierLoop->getLoopDepth();
- unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
- for (auto *Later : CurrentLoadsAndStores) {
- if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
- DI))
- return false;
- }
- }
-
- size_t NumInsts = CurrentLoadsAndStores.size();
- for (size_t I = 0; I < NumInsts; ++I) {
- for (size_t J = I; J < NumInsts; ++J) {
- if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
- LoopDepth, CurLoopDepth, true, DI))
- return false;
- }
- }
-
- EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
- CurrentLoadsAndStores.end());
- }
- return true;
-}
-
-static bool isEligibleLoopForm(const Loop &Root) {
- // Root must have a child.
- if (Root.getSubLoops().size() != 1)
- return false;
-
- const Loop *L = &Root;
- do {
- // All loops in Root need to be in simplify and rotated form.
- if (!L->isLoopSimplifyForm())
- return false;
-
- if (!L->isRotatedForm())
- return false;
-
- if (L->getHeader()->hasAddressTaken()) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
- return false;
- }
-
- unsigned SubLoopsSize = L->getSubLoops().size();
- if (SubLoopsSize == 0)
- return true;
-
- // Only one child is allowed.
- if (SubLoopsSize != 1)
- return false;
-
- L = L->getSubLoops()[0];
- } while (L);
-
- return true;
-}
-
-static Loop *getInnerMostLoop(Loop *L) {
- while (!L->getSubLoops().empty())
- L = L->getSubLoops()[0];
- return L;
-}
-
-bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
- DependenceInfo &DI, LoopInfo &LI) {
- if (!isEligibleLoopForm(*L)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
- return false;
- }
-
- /* We currently handle outer loops like this:
- |
- ForeFirst <------\ }
- Blocks | } ForeBlocks of L
- ForeLast | }
- | |
- ... |
- | |
- ForeFirst <----\ | }
- Blocks | | } ForeBlocks of a inner loop of L
- ForeLast | | }
- | | |
- JamLoopFirst <\ | | }
- Blocks | | | } JamLoopBlocks of the innermost loop
- JamLoopLast -/ | | }
- | | |
- AftFirst | | }
- Blocks | | } AftBlocks of a inner loop of L
- AftLast ------/ | }
- | |
- ... |
- | |
- AftFirst | }
- Blocks | } AftBlocks of L
- AftLast --------/ }
- |
-
- There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
- and AftBlocks, providing that there is one edge from Fores to SubLoops,
- one edge from SubLoops to Afts and a single outer loop exit (from Afts).
- In practice we currently limit Aft blocks to a single block, and limit
- things further in the profitablility checks of the unroll and jam pass.
-
- Because of the way we rearrange basic blocks, we also require that
- the Fore blocks of L on all unrolled iterations are safe to move before the
- blocks of the direct child of L of all iterations. So we require that the
- phi node looping operands of ForeHeader can be moved to at least the end of
- ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
- match up Phi's correctly.
-
- i.e. The old order of blocks used to be
- (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
- It needs to be safe to transform this to
- (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
-
- There are then a number of checks along the lines of no calls, no
- exceptions, inner loop IV is consistent, etc. Note that for loops requiring
- runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
- UnrollAndJamLoop if the trip count cannot be easily calculated.
- */
-
- // Split blocks into Fore/SubLoop/Aft based on dominators
- Loop *JamLoop = getInnerMostLoop(L);
- BasicBlockSet SubLoopBlocks;
- DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
- DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
- if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
- AftBlocksMap, DT)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
- return false;
- }
-
- // Aft blocks may need to move instructions to fore blocks, which becomes more
- // difficult if there are multiple (potentially conditionally executed)
- // blocks. For now we just exclude loops with multiple aft blocks.
- if (AftBlocksMap[L].size() != 1) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
- "multiple blocks after the loop\n");
- return false;
- }
-
- // Check inner loop backedge count is consistent on all iterations of the
- // outer loop
- if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
- return !hasIterationCountInvariantInParent(SubLoop, SE);
- })) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
- "not consistent on each iteration\n");
- return false;
- }
-
- // Check the loop safety info for exceptions.
- SimpleLoopSafetyInfo LSI;
- LSI.computeLoopSafetyInfo(L);
- if (LSI.anyBlockMayThrow()) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
- return false;
- }
-
- // We've ruled out the easy stuff and now need to check that there are no
- // interdependencies which may prevent us from moving the:
- // ForeBlocks before Subloop and AftBlocks.
- // Subloop before AftBlocks.
- // ForeBlock phi operands before the subloop
-
- // Make sure we can move all instructions we need to before the subloop
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlockSet AftBlocks = AftBlocksMap[L];
- Loop *SubLoop = L->getSubLoops()[0];
- if (!processHeaderPhiOperands(
- Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
- if (SubLoop->contains(I->getParent()))
- return false;
- if (AftBlocks.count(I->getParent())) {
- // If we hit a phi node in afts we know we are done (probably
- // LCSSA)
- if (isa<PHINode>(I))
- return false;
- // Can't move instructions with side effects or memory
- // reads/writes
- if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
- return false;
- }
- // Keep going
- return true;
- })) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
- "instructions after subloop to before it\n");
- return false;
- }
-
- // Check for memory dependencies which prohibit the unrolling we are doing.
- // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
- // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
- if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
- LI)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
- return false;
- }
-
- return true;
-}
+ movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
+ }
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
+ // new ones required.
+ if (Count != 1) {
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
+ SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
+ SubLoopBlocksLast[0], AftBlocksFirst[0]);
+
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ SubLoopBlocksLast.back(), AftBlocksFirst[0]);
+ DTU.applyUpdatesPermissive(DTUpdates);
+ }
+
+ // Merge adjacent basic blocks, if possible.
+ SmallPtrSet<BasicBlock *, 16> MergeBlocks;
+ MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
+ MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
+ MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
+
+ MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
+
+ // Apply updates to the DomTree.
+ DT = &DTU.getDomTree();
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
+ TTI);
+
+ NumCompletelyUnrolledAndJammed += CompletelyUnroll;
+ ++NumUnrolledAndJammed;
+
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
+#ifndef NDEBUG
+ // We shouldn't have done anything to break loop simplify form or LCSSA.
+ Loop *OutestLoop = SubLoop->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ : SubLoop->getParentLoop()
+ : SubLoop;
+ assert(DT->verify());
+ LI->verify(*DT);
+ assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
+ if (!CompletelyUnroll)
+ assert(L->isLoopSimplifyForm());
+ assert(SubLoop->isLoopSimplifyForm());
+ SE->verify();
+#endif
+
+ return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
+ : LoopUnrollResult::PartiallyUnrolled;
+}
+
+static bool getLoadsAndStores(BasicBlockSet &Blocks,
+ SmallVector<Instruction *, 4> &MemInstr) {
+ // Scan the BBs and collect legal loads and stores.
+ // Returns false if non-simple loads/stores are found.
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (I.mayReadOrWriteMemory()) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+  // The loop at UnrollLevel might carry the dependency Src --> Dst.
+  // Does a different loop carry it after unrolling?
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::LT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::GT)
+ return false;
+ }
+
+ return true;
+}
+
+static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+  // The loop at UnrollLevel might carry the dependency Dst --> Src.
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::GT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::LT)
+ return false;
+ }
+
+ // Backward dependencies are only preserved if not interleaved.
+ return Sequentialized;
+}
+
+// Check whether unroll-and-jam is semantically safe for Src and Dst, given any
+// potential dependency between them.
+//
+// @param UnrollLevel The level of the loop being unrolled
+// @param JamLevel The level of the loop being jammed; if Src and Dst are on
+// different levels, the outermost common loop counts as the jammed level
+//
+// @return true if it is safe and false if there is a dependency violation.
+static bool checkDependency(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, DependenceInfo &DI) {
+ assert(UnrollLevel <= JamLevel &&
+ "Expecting JamLevel to be at least UnrollLevel");
+
+ if (Src == Dst)
+ return true;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ return true;
+
+ // Check whether unroll-and-jam may violate a dependency.
+  // By construction, every dependency will be lexicographically non-negative
+  // (if it were not, it would violate the current execution order), such as
+ // (0,0,>,*,*)
+  // Unroll-and-jam moves two executions that were ordered GT at the chosen
+  // unroll level into the same iteration of that level. That is, a GT
+  // dependence becomes a GE dependence (or EQ, if we fully unrolled the loop)
+  // at the loop's position:
+ // (0,0,>=,*,*)
+ // Now, the dependency is not necessarily non-negative anymore, i.e.
+ // unroll-and-jam may violate correctness.
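+  // Illustrative sketch (not part of the original comment): with UnrollLevel=1
+  // and JamLevel=2, a store to A[i][j] later read as A[i-1][j+1] has direction
+  // vector (<,>). Unrolling i relaxes the first entry to <=, and once the two
+  // copies are jammed into one j-loop the second copy's read at inner index j
+  // needs the value the first copy only writes at index j+1, so the dependence
+  // is violated and must be rejected.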
+ std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
+ if (!D)
+ return true;
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+ if (D->isConfused()) {
+ LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
+ << " " << *Src << "\n"
+ << " " << *Dst << "\n");
+ return false;
+ }
+
+ // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
+ // non-equal direction, then the locations accessed in the inner levels cannot
+  // overlap in memory. We assume the indexes never overlap into neighboring
+ // dimensions.
+ for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
+ if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
+ return true;
+
+ auto UnrollDirection = D->getDirection(UnrollLevel);
+
+ // If the distance carried by the unrolled loop is 0, then after unrolling
+ // that distance will become non-zero resulting in non-overlapping accesses in
+ // the inner loops.
+ if (UnrollDirection == Dependence::DVEntry::EQ)
+ return true;
+
+ if (UnrollDirection & Dependence::DVEntry::LT &&
+ !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ if (UnrollDirection & Dependence::DVEntry::GT &&
+ !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ return true;
+}
+
+static bool
+checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
+ const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
+ DependenceInfo &DI, LoopInfo &LI) {
+ SmallVector<BasicBlockSet, 8> AllBlocks;
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
+ AllBlocks.push_back(ForeBlocksMap.lookup(L));
+ AllBlocks.push_back(SubLoopBlocks);
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (AftBlocksMap.find(L) != AftBlocksMap.end())
+ AllBlocks.push_back(AftBlocksMap.lookup(L));
+
+ unsigned LoopDepth = Root.getLoopDepth();
+ SmallVector<Instruction *, 4> EarlierLoadsAndStores;
+ SmallVector<Instruction *, 4> CurrentLoadsAndStores;
+ for (BasicBlockSet &Blocks : AllBlocks) {
+ CurrentLoadsAndStores.clear();
+ if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
+ return false;
+
+ Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
+ unsigned CurLoopDepth = CurLoop->getLoopDepth();
+
+ for (auto *Earlier : EarlierLoadsAndStores) {
+ Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
+ unsigned EarlierDepth = EarlierLoop->getLoopDepth();
+ unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
+ for (auto *Later : CurrentLoadsAndStores) {
+ if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
+ DI))
+ return false;
+ }
+ }
+
+ size_t NumInsts = CurrentLoadsAndStores.size();
+ for (size_t I = 0; I < NumInsts; ++I) {
+ for (size_t J = I; J < NumInsts; ++J) {
+ if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
+ LoopDepth, CurLoopDepth, true, DI))
+ return false;
+ }
+ }
+
+ EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
+ CurrentLoadsAndStores.end());
+ }
+ return true;
+}
+
+static bool isEligibleLoopForm(const Loop &Root) {
+  // Root must have exactly one child.
+ if (Root.getSubLoops().size() != 1)
+ return false;
+
+ const Loop *L = &Root;
+ do {
+ // All loops in Root need to be in simplify and rotated form.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ if (!L->isRotatedForm())
+ return false;
+
+ if (L->getHeader()->hasAddressTaken()) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
+ return false;
+ }
+
+ unsigned SubLoopsSize = L->getSubLoops().size();
+ if (SubLoopsSize == 0)
+ return true;
+
+ // Only one child is allowed.
+ if (SubLoopsSize != 1)
+ return false;
+
+ L = L->getSubLoops()[0];
+ } while (L);
+
+ return true;
+}
+
+static Loop *getInnerMostLoop(Loop *L) {
+ while (!L->getSubLoops().empty())
+ L = L->getSubLoops()[0];
+ return L;
+}
+
+bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+ DependenceInfo &DI, LoopInfo &LI) {
+ if (!isEligibleLoopForm(*L)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
+ return false;
+ }
+
+ /* We currently handle outer loops like this:
+ |
+ ForeFirst <------\ }
+ Blocks | } ForeBlocks of L
+ ForeLast | }
+ | |
+ ... |
+ | |
+ ForeFirst <----\ | }
+ Blocks | | } ForeBlocks of a inner loop of L
+ ForeLast | | }
+ | | |
+ JamLoopFirst <\ | | }
+ Blocks | | | } JamLoopBlocks of the innermost loop
+ JamLoopLast -/ | | }
+ | | |
+ AftFirst | | }
+ Blocks | | } AftBlocks of a inner loop of L
+ AftLast ------/ | }
+ | |
+ ... |
+ | |
+ AftFirst | }
+ Blocks | } AftBlocks of L
+ AftLast --------/ }
+ |
+
+     There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
+     and AftBlocks, provided that there is one edge from Fores to SubLoops,
+     one edge from SubLoops to Afts and a single outer loop exit (from Afts).
+     In practice we currently limit Aft blocks to a single block, and limit
+     things further in the profitability checks of the unroll and jam pass.
+
+ Because of the way we rearrange basic blocks, we also require that
+ the Fore blocks of L on all unrolled iterations are safe to move before the
+ blocks of the direct child of L of all iterations. So we require that the
+ phi node looping operands of ForeHeader can be moved to at least the end of
+ ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
+ match up Phi's correctly.
+
+ i.e. The old order of blocks used to be
+ (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
+ It needs to be safe to transform this to
+ (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
+
+ There are then a number of checks along the lines of no calls, no
+ exceptions, inner loop IV is consistent, etc. Note that for loops requiring
+ runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
+ UnrollAndJamLoop if the trip count cannot be easily calculated.
+ */
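+
+  /* For illustration only (Fore/Sub/Aft are placeholders, not names from this
+     file), the simplest shape handled here is:
+       for (i = ...) {         // L
+         Fore(i);              // ForeBlocks of L
+         for (j = ...)         // the jammed inner loop
+           Sub(i, j);
+         Aft(i);               // AftBlocks of L, currently one block only
+       }
+     Unroll-and-jam by 2 yields Fore(i); Fore(i+1); one fused j loop running
+     Sub(i, j); Sub(i+1, j); and then Aft(i); Aft(i+1). */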
+
+ // Split blocks into Fore/SubLoop/Aft based on dominators
+ Loop *JamLoop = getInnerMostLoop(L);
+ BasicBlockSet SubLoopBlocks;
+ DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
+ DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
+ if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
+ AftBlocksMap, DT)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
+ return false;
+ }
+
+ // Aft blocks may need to move instructions to fore blocks, which becomes more
+ // difficult if there are multiple (potentially conditionally executed)
+ // blocks. For now we just exclude loops with multiple aft blocks.
+ if (AftBlocksMap[L].size() != 1) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
+ "multiple blocks after the loop\n");
+ return false;
+ }
+
+  // Check that the inner loop backedge count is consistent on all iterations
+  // of the outer loop.
+ if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
+ return !hasIterationCountInvariantInParent(SubLoop, SE);
+ })) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
+ "not consistent on each iteration\n");
+ return false;
+ }
+
+ // Check the loop safety info for exceptions.
+ SimpleLoopSafetyInfo LSI;
+ LSI.computeLoopSafetyInfo(L);
+ if (LSI.anyBlockMayThrow()) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
+ return false;
+ }
+
+ // We've ruled out the easy stuff and now need to check that there are no
+ // interdependencies which may prevent us from moving the:
+ // ForeBlocks before Subloop and AftBlocks.
+ // Subloop before AftBlocks.
+ // ForeBlock phi operands before the subloop
+
+ // Make sure we can move all instructions we need to before the subloop
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlockSet AftBlocks = AftBlocksMap[L];
+ Loop *SubLoop = L->getSubLoops()[0];
+ if (!processHeaderPhiOperands(
+ Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
+ if (SubLoop->contains(I->getParent()))
+ return false;
+ if (AftBlocks.count(I->getParent())) {
+ // If we hit a phi node in afts we know we are done (probably
+ // LCSSA)
+ if (isa<PHINode>(I))
+ return false;
+ // Can't move instructions with side effects or memory
+ // reads/writes
+ if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
+ return false;
+ }
+ // Keep going
+ return true;
+ })) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
+ "instructions after subloop to before it\n");
+ return false;
+ }
+
+ // Check for memory dependencies which prohibit the unrolling we are doing.
+ // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
+ // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
+ if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
+ LI)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
+ return false;
+ }
+
+ return true;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 6f73c51db5..0abf62be15 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -1,510 +1,510 @@
-//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements some loop unrolling utilities for loops with run-time
-// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
-// trip counts.
-//
-// The functions in this file are used to generate extra code when the
-// run-time trip count modulo the unroll factor is not 0. When this is the
-// case, we need to generate code to execute these 'left over' iterations.
-//
-// The current strategy generates an if-then-else sequence prior to the
-// unrolled loop to execute the 'left over' iterations before or after the
-// unrolled loop.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
+//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities for loops with run-time
+// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
+// trip counts.
+//
+// The functions in this file are used to generate extra code when the
+// run-time trip count modulo the unroll factor is not 0. When this is the
+// case, we need to generate code to execute these 'left over' iterations.
+//
+// The current strategy generates an if-then-else sequence prior to the
+// unrolled loop to execute the 'left over' iterations before or after the
+// unrolled loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <algorithm>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll"
-
-STATISTIC(NumRuntimeUnrolled,
- "Number of loops unrolled with run-time trip counts");
-static cl::opt<bool> UnrollRuntimeMultiExit(
- "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
- cl::desc("Allow runtime unrolling for loops with multiple exits, when "
- "epilog is generated"));
-
-/// Connect the unrolling prolog code to the original loop.
-/// The unrolling prolog code contains code to execute the
-/// 'extra' iterations if the run-time trip count modulo the
-/// unroll count is non-zero.
-///
-/// This function performs the following:
-/// - Create PHI nodes at prolog end block to combine values
-/// that exit the prolog code and jump around the prolog.
-/// - Add a PHI operand to a PHI node at the loop exit block
-/// for values that exit the prolog and go around the loop.
-/// - Branch around the original loop if the trip count is less
-/// than the unroll factor.
-///
-static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
- BasicBlock *PrologExit,
- BasicBlock *OriginalLoopLatchExit,
- BasicBlock *PreHeader, BasicBlock *NewPreHeader,
- ValueToValueMapTy &VMap, DominatorTree *DT,
- LoopInfo *LI, bool PreserveLCSSA) {
- // Loop structure should be the following:
- // Preheader
- // PrologHeader
- // ...
- // PrologLatch
- // PrologExit
- // NewPreheader
- // Header
- // ...
- // Latch
- // LatchExit
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "Loop must have a latch");
- BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
-
- // Create a PHI node for each outgoing value from the original loop
- // (which means it is an outgoing value from the prolog code too).
- // The new PHI node is inserted in the prolog end basic block.
- // The new PHI node value is added as an operand of a PHI node in either
- // the loop header or the loop exit block.
- for (BasicBlock *Succ : successors(Latch)) {
- for (PHINode &PN : Succ->phis()) {
- // Add a new PHI node to the prolog end block and add the
- // appropriate incoming values.
- // TODO: This code assumes that the PrologExit (or the LatchExit block for
- // prolog loop) contains only one predecessor from the loop, i.e. the
- // PrologLatch. When supporting multiple-exiting block loops, we can have
- // two or more blocks that have the LatchExit as the target in the
- // original loop.
- PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
- PrologExit->getFirstNonPHI());
- // Adding a value to the new PHI node from the original loop preheader.
- // This is the value that skips all the prolog code.
- if (L->contains(&PN)) {
- // Succ is loop header.
- NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),
- PreHeader);
- } else {
- // Succ is LatchExit.
- NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);
- }
-
- Value *V = PN.getIncomingValueForBlock(Latch);
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (L->contains(I)) {
- V = VMap.lookup(I);
- }
- }
- // Adding a value to the new PHI node from the last prolog block
- // that was created.
- NewPN->addIncoming(V, PrologLatch);
-
- // Update the existing PHI node operand with the value from the
- // new PHI node. How this is done depends on if the existing
- // PHI node is in the original loop block, or the exit block.
- if (L->contains(&PN))
- PN.setIncomingValueForBlock(NewPreHeader, NewPN);
- else
- PN.addIncoming(NewPN, PrologExit);
- }
- }
-
-  // Make sure that the created prolog loop is in simplified form
- SmallVector<BasicBlock *, 4> PrologExitPreds;
- Loop *PrologLoop = LI->getLoopFor(PrologLatch);
- if (PrologLoop) {
- for (BasicBlock *PredBB : predecessors(PrologExit))
- if (PrologLoop->contains(PredBB))
- PrologExitPreds.push_back(PredBB);
-
- SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
- nullptr, PreserveLCSSA);
- }
-
- // Create a branch around the original loop, which is taken if there are no
- // iterations remaining to be executed after running the prologue.
- Instruction *InsertPt = PrologExit->getTerminator();
- IRBuilder<> B(InsertPt);
-
- assert(Count != 0 && "nonsensical Count!");
-
- // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
- // This means %xtraiter is (BECount + 1) and all of the iterations of this
- // loop were executed by the prologue. Note that if BECount <u (Count - 1)
- // then (BECount + 1) cannot unsigned-overflow.
- Value *BrLoopExit =
- B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
- // Split the exit to maintain loop canonicalization guarantees
- SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
- SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
- nullptr, PreserveLCSSA);
- // Add the branch to the exit block (around the unrolled loop)
- B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
- InsertPt->eraseFromParent();
- if (DT)
- DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
-}
-
-/// Connect the unrolling epilog code to the original loop.
-/// The unrolling epilog code contains code to execute the
-/// 'extra' iterations if the run-time trip count modulo the
-/// unroll count is non-zero.
-///
-/// This function performs the following:
-/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
-/// - Create PHI nodes at the unrolling loop exit to combine
-/// values that exit the unrolling loop code and jump around it.
-/// - Update PHI operands in the epilog loop by the new PHI nodes
-/// - Branch around the epilog loop if extra iters (ModVal) is zero.
-///
-static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
- BasicBlock *Exit, BasicBlock *PreHeader,
- BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
- ValueToValueMapTy &VMap, DominatorTree *DT,
- LoopInfo *LI, bool PreserveLCSSA) {
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "Loop must have a latch");
- BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
-
- // Loop structure should be the following:
- //
- // PreHeader
- // NewPreHeader
- // Header
- // ...
- // Latch
- // NewExit (PN)
- // EpilogPreHeader
- // EpilogHeader
- // ...
- // EpilogLatch
- // Exit (EpilogPN)
-
- // Update PHI nodes at NewExit and Exit.
- for (PHINode &PN : NewExit->phis()) {
- // PN should be used in another PHI located in Exit block as
- // Exit was split by SplitBlockPredecessors into Exit and NewExit
-    // Basically it should look like:
- // NewExit:
- // PN = PHI [I, Latch]
- // ...
- // Exit:
- // EpilogPN = PHI [PN, EpilogPreHeader]
- //
-    // The incoming block is EpilogPreHeader instead of NewExit, because
-    // NewExit was split one more time to get EpilogPreHeader.
- assert(PN.hasOneUse() && "The phi should have 1 use");
- PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
- assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
-
- // Add incoming PreHeader from branch around the Loop
- PN.addIncoming(UndefValue::get(PN.getType()), PreHeader);
-
- Value *V = PN.getIncomingValueForBlock(Latch);
- Instruction *I = dyn_cast<Instruction>(V);
- if (I && L->contains(I))
- // If value comes from an instruction in the loop add VMap value.
- V = VMap.lookup(I);
-    // For an instruction defined outside the loop, a constant, or an undefined
-    // value, insert the value itself.
- EpilogPN->addIncoming(V, EpilogLatch);
-
- assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
- "EpilogPN should have EpilogPreHeader incoming block");
- // Change EpilogPreHeader incoming block to NewExit.
- EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
- NewExit);
- // Now PHIs should look like:
- // NewExit:
- // PN = PHI [I, Latch], [undef, PreHeader]
- // ...
- // Exit:
- // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
- }
-
- // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
- // Update corresponding PHI nodes in epilog loop.
- for (BasicBlock *Succ : successors(Latch)) {
- // Skip this as we already updated phis in exit blocks.
- if (!L->contains(Succ))
- continue;
- for (PHINode &PN : Succ->phis()) {
- // Add new PHI nodes to the loop exit block and update epilog
- // PHIs with the new PHI values.
- PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
- NewExit->getFirstNonPHI());
- // Adding a value to the new PHI node from the unrolling loop preheader.
- NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
- // Adding a value to the new PHI node from the unrolling loop latch.
- NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
-
- // Update the existing PHI node operand with the value from the new PHI
- // node. Corresponding instruction in epilog loop should be PHI.
- PHINode *VPN = cast<PHINode>(VMap[&PN]);
- VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);
- }
- }
-
- Instruction *InsertPt = NewExit->getTerminator();
- IRBuilder<> B(InsertPt);
- Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
- assert(Exit && "Loop must have a single exit block only");
- // Split the epilogue exit to maintain loop canonicalization guarantees
- SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
- SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,
- PreserveLCSSA);
- // Add the branch to the exit block (around the unrolling loop)
- B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
- InsertPt->eraseFromParent();
- if (DT)
- DT->changeImmediateDominator(Exit, NewExit);
-
- // Split the main loop exit to maintain canonicalization guarantees.
- SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
- SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,
- PreserveLCSSA);
-}
-
-/// Create a clone of the blocks in a loop and connect them together.
-/// If CreateRemainderLoop is false, loop structure will not be cloned,
-/// otherwise a new loop will be created including all cloned blocks, and its
-/// induction variable counts NewIter down to 0.
-/// The cloned blocks should be inserted between InsertTop and InsertBot.
-/// If loop structure is cloned InsertTop should be new preheader, InsertBot
-/// new loop exit.
-/// Return the new cloned loop that is created when CreateRemainderLoop is true.
-static Loop *
-CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
- const bool UseEpilogRemainder, const bool UnrollRemainder,
- BasicBlock *InsertTop,
- BasicBlock *InsertBot, BasicBlock *Preheader,
- std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
- ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
- StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- Function *F = Header->getParent();
- LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
- LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
- Loop *ParentLoop = L->getParentLoop();
- NewLoopsMap NewLoops;
- NewLoops[ParentLoop] = ParentLoop;
- if (!CreateRemainderLoop)
- NewLoops[L] = ParentLoop;
-
- // For each block in the original loop, create a new copy,
- // and update the value map with the newly created values.
- for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
- BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
- NewBlocks.push_back(NewBB);
-
- // If we're unrolling the outermost loop, there's no remainder loop,
- // and this block isn't in a nested loop, then the new block is not
- // in any loop. Otherwise, add it to loopinfo.
- if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
- addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
-
- VMap[*BB] = NewBB;
- if (Header == *BB) {
- // For the first block, add a CFG connection to this newly
- // created block.
- InsertTop->getTerminator()->setSuccessor(0, NewBB);
- }
-
- if (DT) {
- if (Header == *BB) {
- // The header is dominated by the preheader.
- DT->addNewBlock(NewBB, InsertTop);
- } else {
- // Copy information from original loop to unrolled loop.
- BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
- DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
- }
- }
-
- if (Latch == *BB) {
- // For the last block, if CreateRemainderLoop is false, create a direct
- // jump to InsertBot. If not, create a loop back to cloned head.
- VMap.erase((*BB)->getTerminator());
- BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
- BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
- IRBuilder<> Builder(LatchBR);
- if (!CreateRemainderLoop) {
- Builder.CreateBr(InsertBot);
- } else {
- PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
- suffix + ".iter",
- FirstLoopBB->getFirstNonPHI());
- Value *IdxSub =
- Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
- NewIdx->getName() + ".sub");
- Value *IdxCmp =
- Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
- Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
- NewIdx->addIncoming(NewIter, InsertTop);
- NewIdx->addIncoming(IdxSub, NewBB);
- }
- LatchBR->eraseFromParent();
- }
- }
-
- // Change the incoming values to the ones defined in the preheader or
- // cloned loop.
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
- if (!CreateRemainderLoop) {
- if (UseEpilogRemainder) {
- unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
- NewPHI->setIncomingBlock(idx, InsertTop);
- NewPHI->removeIncomingValue(Latch, false);
- } else {
- VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
- cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
- }
- } else {
- unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
- NewPHI->setIncomingBlock(idx, InsertTop);
- BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
- idx = NewPHI->getBasicBlockIndex(Latch);
- Value *InVal = NewPHI->getIncomingValue(idx);
- NewPHI->setIncomingBlock(idx, NewLatch);
- if (Value *V = VMap.lookup(InVal))
- NewPHI->setIncomingValue(idx, V);
- }
- }
- if (CreateRemainderLoop) {
- Loop *NewLoop = NewLoops[L];
- assert(NewLoop && "L should have been cloned");
- MDNode *LoopID = NewLoop->getLoopID();
-
- // Only add loop metadata if the loop is not going to be completely
- // unrolled.
- if (UnrollRemainder)
- return NewLoop;
-
- Optional<MDNode *> NewLoopID = makeFollowupLoopID(
- LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
- if (NewLoopID.hasValue()) {
- NewLoop->setLoopID(NewLoopID.getValue());
-
- // Do not setLoopAlreadyUnrolled if loop attributes have been defined
- // explicitly.
- return NewLoop;
- }
-
- // Add unroll disable metadata to disable future unrolling for this loop.
- NewLoop->setLoopAlreadyUnrolled();
- return NewLoop;
- }
- else
- return nullptr;
-}
-
-/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
-/// is populated with all the loop exit blocks other than the LatchExit block.
-static bool canSafelyUnrollMultiExitLoop(Loop *L, BasicBlock *LatchExit,
- bool PreserveLCSSA,
- bool UseEpilogRemainder) {
-
-  // We currently have some correctness constraints in unrolling a multi-exit
- // loop. Check for these below.
-
- // We rely on LCSSA form being preserved when the exit blocks are transformed.
- if (!PreserveLCSSA)
- return false;
-
- // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
- // UnrollRuntimeMultiExit is true. This will need updating the logic in
- // connectEpilog/connectProlog.
- if (!LatchExit->getSinglePredecessor()) {
- LLVM_DEBUG(
- dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
- "predecessor.\n");
- return false;
- }
- // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
- // and L is an inner loop. This is because in presence of multiple exits, the
- // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
- // outer loop. This is automatically handled in the prolog case, so we do not
- // have that bug in prolog generation.
- if (UseEpilogRemainder && L->getParentLoop())
- return false;
-
- // All constraints have been satisfied.
- return true;
-}
-
-/// Returns true if we can profitably unroll the multi-exit loop L. Currently,
-/// we return true only if UnrollRuntimeMultiExit is set to true.
-static bool canProfitablyUnrollMultiExitLoop(
- Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
- bool PreserveLCSSA, bool UseEpilogRemainder) {
-
-#if !defined(NDEBUG)
- assert(canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
- UseEpilogRemainder) &&
- "Should be safe to unroll before checking profitability!");
-#endif
-
- // Priority goes to UnrollRuntimeMultiExit if it's supplied.
- if (UnrollRuntimeMultiExit.getNumOccurrences())
- return UnrollRuntimeMultiExit;
-
- // The main pain point with multi-exit loop unrolling is that once unrolled,
-  // we will not be able to merge all blocks into straight-line code.
- // There are branches within the unrolled loop that go to the OtherExits.
- // The second point is the increase in code size, but this is true
- // irrespective of multiple exits.
-
- // Note: Both the heuristics below are coarse grained. We are essentially
- // enabling unrolling of loops that have a single side exit other than the
- // normal LatchExit (i.e. exiting into a deoptimize block).
- // The heuristics considered are:
- // 1. low number of branches in the unrolled version.
- // 2. high predictability of these extra branches.
- // We avoid unrolling loops that have more than two exiting blocks. This
-  // limits the total number of branches in the unrolled loop to be at most
- // the unroll factor (since one of the exiting blocks is the latch block).
- SmallVector<BasicBlock*, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- if (ExitingBlocks.size() > 2)
- return false;
-
- // The second heuristic is that L has one exit other than the latchexit and
- // that exit is a deoptimize block. We know that deoptimize blocks are rarely
- // taken, which also implies the branch leading to the deoptimize block is
- // highly predictable.
- return (OtherExits.size() == 1 &&
- OtherExits[0]->getTerminatingDeoptimizeCall());
- // TODO: These can be fine-tuned further to consider code size or deopt states
- // that are captured by the deoptimize exit block.
- // Also, we can extend this to support more cases, if we actually
- // know of kinds of multiexit loops that would benefit from unrolling.
-}
-
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+STATISTIC(NumRuntimeUnrolled,
+ "Number of loops unrolled with run-time trip counts");
+static cl::opt<bool> UnrollRuntimeMultiExit(
+ "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolling for loops with multiple exits, when "
+ "epilog is generated"));
+
+/// Connect the unrolling prolog code to the original loop.
+/// The unrolling prolog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Create PHI nodes at prolog end block to combine values
+/// that exit the prolog code and jump around the prolog.
+/// - Add a PHI operand to a PHI node at the loop exit block
+/// for values that exit the prolog and go around the loop.
+/// - Branch around the original loop if the trip count is less
+/// than the unroll factor.
+///
+static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
+ BasicBlock *PrologExit,
+ BasicBlock *OriginalLoopLatchExit,
+ BasicBlock *PreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ // Loop structure should be the following:
+ // Preheader
+ // PrologHeader
+ // ...
+ // PrologLatch
+ // PrologExit
+ // NewPreheader
+ // Header
+ // ...
+ // Latch
+ // LatchExit
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "Loop must have a latch");
+ BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
+
+ // Create a PHI node for each outgoing value from the original loop
+ // (which means it is an outgoing value from the prolog code too).
+ // The new PHI node is inserted in the prolog end basic block.
+ // The new PHI node value is added as an operand of a PHI node in either
+ // the loop header or the loop exit block.
+ for (BasicBlock *Succ : successors(Latch)) {
+ for (PHINode &PN : Succ->phis()) {
+ // Add a new PHI node to the prolog end block and add the
+ // appropriate incoming values.
+ // TODO: This code assumes that the PrologExit (or the LatchExit block for
+ // prolog loop) contains only one predecessor from the loop, i.e. the
+ // PrologLatch. When supporting multiple-exiting block loops, we can have
+ // two or more blocks that have the LatchExit as the target in the
+ // original loop.
+ PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
+ PrologExit->getFirstNonPHI());
+ // Adding a value to the new PHI node from the original loop preheader.
+ // This is the value that skips all the prolog code.
+ if (L->contains(&PN)) {
+ // Succ is loop header.
+ NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),
+ PreHeader);
+ } else {
+ // Succ is LatchExit.
+ NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);
+ }
+
+ Value *V = PN.getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (L->contains(I)) {
+ V = VMap.lookup(I);
+ }
+ }
+ // Adding a value to the new PHI node from the last prolog block
+ // that was created.
+ NewPN->addIncoming(V, PrologLatch);
+
+ // Update the existing PHI node operand with the value from the
+ // new PHI node. How this is done depends on if the existing
+ // PHI node is in the original loop block, or the exit block.
+ if (L->contains(&PN))
+ PN.setIncomingValueForBlock(NewPreHeader, NewPN);
+ else
+ PN.addIncoming(NewPN, PrologExit);
+ }
+ }
+
+  // Make sure that the created prolog loop is in simplified form
+ SmallVector<BasicBlock *, 4> PrologExitPreds;
+ Loop *PrologLoop = LI->getLoopFor(PrologLatch);
+ if (PrologLoop) {
+ for (BasicBlock *PredBB : predecessors(PrologExit))
+ if (PrologLoop->contains(PredBB))
+ PrologExitPreds.push_back(PredBB);
+
+ SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
+ nullptr, PreserveLCSSA);
+ }
+
+ // Create a branch around the original loop, which is taken if there are no
+ // iterations remaining to be executed after running the prologue.
+ Instruction *InsertPt = PrologExit->getTerminator();
+ IRBuilder<> B(InsertPt);
+
+ assert(Count != 0 && "nonsensical Count!");
+
+ // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
+ // This means %xtraiter is (BECount + 1) and all of the iterations of this
+ // loop were executed by the prologue. Note that if BECount <u (Count - 1)
+ // then (BECount + 1) cannot unsigned-overflow.
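+  // For example (illustrative numbers): with Count = 4 and BECount = 2 the
+  // trip count is 3, %xtraiter is 3 % 4 == 3, the prologue runs all three
+  // iterations, and this branch skips the unrolled loop entirely.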
+ Value *BrLoopExit =
+ B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
+ // Split the exit to maintain loop canonicalization guarantees
+ SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
+ SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
+ nullptr, PreserveLCSSA);
+ // Add the branch to the exit block (around the unrolled loop)
+ B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
+ InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
+}
+
+/// Connect the unrolling epilog code to the original loop.
+/// The unrolling epilog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
+/// - Create PHI nodes at the unrolling loop exit to combine
+/// values that exit the unrolling loop code and jump around it.
+/// - Update PHI operands in the epilog loop by the new PHI nodes
+/// - Branch around the epilog loop if extra iters (ModVal) is zero.
+///
+static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
+ BasicBlock *Exit, BasicBlock *PreHeader,
+ BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "Loop must have a latch");
+ BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
+
+ // Loop structure should be the following:
+ //
+ // PreHeader
+ // NewPreHeader
+ // Header
+ // ...
+ // Latch
+ // NewExit (PN)
+ // EpilogPreHeader
+ // EpilogHeader
+ // ...
+ // EpilogLatch
+ // Exit (EpilogPN)
+
+ // Update PHI nodes at NewExit and Exit.
+ for (PHINode &PN : NewExit->phis()) {
+ // PN should be used in another PHI located in Exit block as
+ // Exit was split by SplitBlockPredecessors into Exit and NewExit
+    // Basically it should look like:
+ // NewExit:
+ // PN = PHI [I, Latch]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, EpilogPreHeader]
+ //
+    // The incoming block is EpilogPreHeader instead of NewExit, because
+    // NewExit was split one more time to get EpilogPreHeader.
+ assert(PN.hasOneUse() && "The phi should have 1 use");
+ PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
+ assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
+
+ // Add incoming PreHeader from branch around the Loop
+ PN.addIncoming(UndefValue::get(PN.getType()), PreHeader);
+
+ Value *V = PN.getIncomingValueForBlock(Latch);
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I && L->contains(I))
+ // If value comes from an instruction in the loop add VMap value.
+ V = VMap.lookup(I);
+    // For an instruction defined outside the loop, a constant, or an undefined
+    // value, insert the value itself.
+ EpilogPN->addIncoming(V, EpilogLatch);
+
+ assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
+ "EpilogPN should have EpilogPreHeader incoming block");
+ // Change EpilogPreHeader incoming block to NewExit.
+ EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
+ NewExit);
+ // Now PHIs should look like:
+ // NewExit:
+ // PN = PHI [I, Latch], [undef, PreHeader]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
+ }
+
+ // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
+ // Update corresponding PHI nodes in epilog loop.
+ for (BasicBlock *Succ : successors(Latch)) {
+ // Skip this as we already updated phis in exit blocks.
+ if (!L->contains(Succ))
+ continue;
+ for (PHINode &PN : Succ->phis()) {
+ // Add new PHI nodes to the loop exit block and update epilog
+ // PHIs with the new PHI values.
+ PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
+ NewExit->getFirstNonPHI());
+ // Adding a value to the new PHI node from the unrolling loop preheader.
+ NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
+ // Adding a value to the new PHI node from the unrolling loop latch.
+ NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
+
+ // Update the existing PHI node operand with the value from the new PHI
+ // node. Corresponding instruction in epilog loop should be PHI.
+ PHINode *VPN = cast<PHINode>(VMap[&PN]);
+ VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);
+ }
+ }
+
+ Instruction *InsertPt = NewExit->getTerminator();
+ IRBuilder<> B(InsertPt);
+ Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
+ assert(Exit && "Loop must have a single exit block only");
+ // Split the epilogue exit to maintain loop canonicalization guarantees
+ SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+ SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,
+ PreserveLCSSA);
+ // Add the branch to the exit block (around the unrolling loop)
+ B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
+ InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(Exit, NewExit);
+
+ // Split the main loop exit to maintain canonicalization guarantees.
+ SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
+ SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,
+ PreserveLCSSA);
+}
+
+/// Create a clone of the blocks in a loop and connect them together.
+/// If CreateRemainderLoop is false, loop structure will not be cloned,
+/// otherwise a new loop will be created including all cloned blocks, and its
+/// induction variable counts NewIter down to 0.
+/// The cloned blocks should be inserted between InsertTop and InsertBot.
+/// If loop structure is cloned InsertTop should be new preheader, InsertBot
+/// new loop exit.
+/// Return the new cloned loop that is created when CreateRemainderLoop is true.
+static Loop *
+CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
+ const bool UseEpilogRemainder, const bool UnrollRemainder,
+ BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Preheader,
+ std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
+ ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
+ StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ Function *F = Header->getParent();
+ LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
+ Loop *ParentLoop = L->getParentLoop();
+ NewLoopsMap NewLoops;
+ NewLoops[ParentLoop] = ParentLoop;
+ if (!CreateRemainderLoop)
+ NewLoops[L] = ParentLoop;
+
+ // For each block in the original loop, create a new copy,
+ // and update the value map with the newly created values.
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
+ NewBlocks.push_back(NewBB);
+
+ // If we're unrolling the outermost loop, there's no remainder loop,
+ // and this block isn't in a nested loop, then the new block is not
+ // in any loop. Otherwise, add it to loopinfo.
+ if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
+ addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
+
+ VMap[*BB] = NewBB;
+ if (Header == *BB) {
+ // For the first block, add a CFG connection to this newly
+ // created block.
+ InsertTop->getTerminator()->setSuccessor(0, NewBB);
+ }
+
+ if (DT) {
+ if (Header == *BB) {
+ // The header is dominated by the preheader.
+ DT->addNewBlock(NewBB, InsertTop);
+ } else {
+ // Copy information from original loop to unrolled loop.
+ BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+ }
+ }
+
+ if (Latch == *BB) {
+ // For the last block, if CreateRemainderLoop is false, create a direct
+ // jump to InsertBot. If not, create a loop back to cloned head.
+ VMap.erase((*BB)->getTerminator());
+ BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
+ BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
+ IRBuilder<> Builder(LatchBR);
+ if (!CreateRemainderLoop) {
+ Builder.CreateBr(InsertBot);
+ } else {
+ PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
+ suffix + ".iter",
+ FirstLoopBB->getFirstNonPHI());
+ Value *IdxSub =
+ Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+ NewIdx->getName() + ".sub");
+ Value *IdxCmp =
+ Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
+ Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
+ NewIdx->addIncoming(NewIter, InsertTop);
+ NewIdx->addIncoming(IdxSub, NewBB);
+ }
+ LatchBR->eraseFromParent();
+ }
+ }
+
+ // Change the incoming values to the ones defined in the preheader or
+ // cloned loop.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
+ if (!CreateRemainderLoop) {
+ if (UseEpilogRemainder) {
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ NewPHI->removeIncomingValue(Latch, false);
+ } else {
+ VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
+ cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+ }
+ } else {
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+ idx = NewPHI->getBasicBlockIndex(Latch);
+ Value *InVal = NewPHI->getIncomingValue(idx);
+ NewPHI->setIncomingBlock(idx, NewLatch);
+ if (Value *V = VMap.lookup(InVal))
+ NewPHI->setIncomingValue(idx, V);
+ }
+ }
+ if (CreateRemainderLoop) {
+ Loop *NewLoop = NewLoops[L];
+ assert(NewLoop && "L should have been cloned");
+ MDNode *LoopID = NewLoop->getLoopID();
+
+ // Only add loop metadata if the loop is not going to be completely
+ // unrolled.
+ if (UnrollRemainder)
+ return NewLoop;
+
+ Optional<MDNode *> NewLoopID = makeFollowupLoopID(
+ LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
+ if (NewLoopID.hasValue()) {
+ NewLoop->setLoopID(NewLoopID.getValue());
+
+ // Do not setLoopAlreadyUnrolled if loop attributes have been defined
+ // explicitly.
+ return NewLoop;
+ }
+
+ // Add unroll disable metadata to disable future unrolling for this loop.
+ NewLoop->setLoopAlreadyUnrolled();
+ return NewLoop;
+ }
+ else
+ return nullptr;
+}
+
+/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
+/// is populated with all the loop exit blocks other than the LatchExit block.
+static bool canSafelyUnrollMultiExitLoop(Loop *L, BasicBlock *LatchExit,
+ bool PreserveLCSSA,
+ bool UseEpilogRemainder) {
+
+  // We currently have some correctness constraints in unrolling a multi-exit
+ // loop. Check for these below.
+
+ // We rely on LCSSA form being preserved when the exit blocks are transformed.
+ if (!PreserveLCSSA)
+ return false;
+
+ // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
+ // UnrollRuntimeMultiExit is true. This will need updating the logic in
+ // connectEpilog/connectProlog.
+ if (!LatchExit->getSinglePredecessor()) {
+ LLVM_DEBUG(
+ dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
+ "predecessor.\n");
+ return false;
+ }
+ // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
+ // and L is an inner loop. This is because in presence of multiple exits, the
+ // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
+ // outer loop. This is automatically handled in the prolog case, so we do not
+ // have that bug in prolog generation.
+ if (UseEpilogRemainder && L->getParentLoop())
+ return false;
+
+ // All constraints have been satisfied.
+ return true;
+}
+
+/// Returns true if we can profitably unroll the multi-exit loop L. Currently,
+/// we return true only if UnrollRuntimeMultiExit is set to true.
+static bool canProfitablyUnrollMultiExitLoop(
+ Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
+ bool PreserveLCSSA, bool UseEpilogRemainder) {
+
+#if !defined(NDEBUG)
+ assert(canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
+ UseEpilogRemainder) &&
+ "Should be safe to unroll before checking profitability!");
+#endif
+
+ // Priority goes to UnrollRuntimeMultiExit if it's supplied.
+ if (UnrollRuntimeMultiExit.getNumOccurrences())
+ return UnrollRuntimeMultiExit;
+
+ // The main pain point with multi-exit loop unrolling is that once unrolled,
+  // we will not be able to merge all blocks into straight-line code.
+ // There are branches within the unrolled loop that go to the OtherExits.
+ // The second point is the increase in code size, but this is true
+ // irrespective of multiple exits.
+
+ // Note: Both the heuristics below are coarse grained. We are essentially
+ // enabling unrolling of loops that have a single side exit other than the
+ // normal LatchExit (i.e. exiting into a deoptimize block).
+ // The heuristics considered are:
+ // 1. low number of branches in the unrolled version.
+ // 2. high predictability of these extra branches.
+ // We avoid unrolling loops that have more than two exiting blocks. This
+  // limits the total number of branches in the unrolled loop to be at most
+ // the unroll factor (since one of the exiting blocks is the latch block).
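+  // For example (illustrative): a loop whose only non-latch exit leads to a
+  // block terminated by a call to @llvm.experimental.deoptimize passes both
+  // heuristics below and is considered profitable to unroll.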
+ SmallVector<BasicBlock*, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ if (ExitingBlocks.size() > 2)
+ return false;
+
+ // The second heuristic is that L has one exit other than the latchexit and
+ // that exit is a deoptimize block. We know that deoptimize blocks are rarely
+ // taken, which also implies the branch leading to the deoptimize block is
+ // highly predictable.
+ return (OtherExits.size() == 1 &&
+ OtherExits[0]->getTerminatingDeoptimizeCall());
+ // TODO: These can be fine-tuned further to consider code size or deopt states
+ // that are captured by the deoptimize exit block.
+ // Also, we can extend this to support more cases, if we actually
+ // know of kinds of multiexit loops that would benefit from unrolling.
+}
+
// Assign the maximum possible trip count as the back edge weight for the
// remainder loop if the original loop comes with a branch weight.
static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop,
@@ -531,459 +531,459 @@ static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop,
}
}
-/// Insert code in the prolog/epilog code when unrolling a loop with a
-/// run-time trip-count.
-///
-/// This method assumes that the loop unroll factor is total number
-/// of loop bodies in the loop after unrolling. (Some folks refer
-/// to the unroll factor as the number of *extra* copies added).
-/// We assume also that the loop unroll factor is a power-of-two. So, after
-/// unrolling the loop, the number of loop bodies executed is 2,
-/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
-/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
-/// the switch instruction is generated.
-///
-/// ***Prolog case***
-/// extraiters = tripcount % loopfactor
-/// if (extraiters == 0) jump Loop:
-/// else jump Prol:
-/// Prol: LoopBody;
-/// extraiters -= 1 // Omitted if unroll factor is 2.
-/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
-/// if (tripcount < loopfactor) jump End:
-/// Loop:
-/// ...
-/// End:
-///
-/// ***Epilog case***
-/// extraiters = tripcount % loopfactor
-/// if (tripcount < loopfactor) jump LoopExit:
-/// unroll_iters = tripcount - extraiters
-/// Loop: LoopBody; (executes unroll_iter times);
-/// unroll_iter -= 1
-/// if (unroll_iter != 0) jump Loop:
-/// LoopExit:
-/// if (extraiters == 0) jump EpilExit:
-/// Epil: LoopBody; (executes extraiters times)
-/// extraiters -= 1 // Omitted if unroll factor is 2.
-/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
-/// EpilExit:
-
-bool llvm::UnrollRuntimeLoopRemainder(
- Loop *L, unsigned Count, bool AllowExpensiveTripCount,
- bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
- LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
- const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
- LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
- LLVM_DEBUG(L->dump());
- LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
- : dbgs() << "Using prolog remainder.\n");
-
- // Make sure the loop is in canonical form.
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
- return false;
- }
-
- // Guaranteed by LoopSimplifyForm.
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlock *Header = L->getHeader();
-
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
-
- if (!LatchBR || LatchBR->isUnconditional()) {
- // The loop-rotate pass can be helpful to avoid this in many cases.
- LLVM_DEBUG(
- dbgs()
- << "Loop latch not terminated by a conditional branch.\n");
- return false;
- }
-
- unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
- BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
-
- if (L->contains(LatchExit)) {
- // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
- // targets of the Latch be an exit block out of the loop.
- LLVM_DEBUG(
- dbgs()
- << "One of the loop latch successors must be the exit block.\n");
- return false;
- }
-
- // These are exit blocks other than the target of the latch exiting block.
- SmallVector<BasicBlock *, 4> OtherExits;
- L->getUniqueNonLatchExitBlocks(OtherExits);
- bool isMultiExitUnrollingEnabled =
- canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
- UseEpilogRemainder) &&
- canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit, PreserveLCSSA,
- UseEpilogRemainder);
- // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
- if (!isMultiExitUnrollingEnabled &&
- (!L->getExitingBlock() || OtherExits.size())) {
- LLVM_DEBUG(
- dbgs()
- << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
- "enabled!\n");
- return false;
- }
- // Use Scalar Evolution to compute the trip count. This allows more loops to
- // be unrolled than relying on induction var simplification.
- if (!SE)
- return false;
-
- // Only unroll loops with a computable trip count, and the trip count needs
- // to be an int value (allowing a pointer type is a TODO item).
- // We calculate the backedge count by using getExitCount on the Latch block,
- // which is proven to be the only exiting block in this loop. This is same as
- // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
- // exiting blocks).
- const SCEV *BECountSC = SE->getExitCount(L, Latch);
- if (isa<SCEVCouldNotCompute>(BECountSC) ||
- !BECountSC->getType()->isIntegerTy()) {
- LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
- return false;
- }
-
- unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
-
- // Add 1 since the backedge count doesn't include the first loop iteration.
- const SCEV *TripCountSC =
- SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
- if (isa<SCEVCouldNotCompute>(TripCountSC)) {
- LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
- return false;
- }
-
- BasicBlock *PreHeader = L->getLoopPreheader();
- BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "loop-unroll");
- if (!AllowExpensiveTripCount &&
- Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
- TTI, PreHeaderBR)) {
- LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
- return false;
- }
-
- // This constraint lets us deal with an overflowing trip count easily; see the
- // comment on ModVal below.
- if (Log2_32(Count) > BEWidth) {
- LLVM_DEBUG(
- dbgs()
- << "Count failed constraint on overflow trip count calculation.\n");
- return false;
- }
-
- // Loop structure is the following:
- //
- // PreHeader
- // Header
- // ...
- // Latch
- // LatchExit
-
- BasicBlock *NewPreHeader;
- BasicBlock *NewExit = nullptr;
- BasicBlock *PrologExit = nullptr;
- BasicBlock *EpilogPreHeader = nullptr;
- BasicBlock *PrologPreHeader = nullptr;
-
- if (UseEpilogRemainder) {
- // If epilog remainder
- // Split PreHeader to insert a branch around loop for unrolling.
- NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
- NewPreHeader->setName(PreHeader->getName() + ".new");
- // Split LatchExit to create phi nodes from branch above.
- SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
- NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI,
- nullptr, PreserveLCSSA);
- // NewExit gets its DebugLoc from LatchExit, which is not part of the
- // original Loop.
- // Fix this by setting Loop's DebugLoc to NewExit.
- auto *NewExitTerminator = NewExit->getTerminator();
- NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
- // Split NewExit to insert epilog remainder loop.
- EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);
- EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
- } else {
- // If prolog remainder
- // Split the original preheader twice to insert prolog remainder loop
- PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
- PrologPreHeader->setName(Header->getName() + ".prol.preheader");
- PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
- DT, LI);
- PrologExit->setName(Header->getName() + ".prol.loopexit");
- // Split PrologExit to get NewPreHeader.
- NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
- NewPreHeader->setName(PreHeader->getName() + ".new");
- }
- // Loop structure should be the following:
- // Epilog Prolog
- //
- // PreHeader PreHeader
- // *NewPreHeader *PrologPreHeader
- // Header *PrologExit
- // ... *NewPreHeader
- // Latch Header
- // *NewExit ...
- // *EpilogPreHeader Latch
- // LatchExit LatchExit
-
- // Calculate conditions for branch around loop for unrolling
- // in epilog case and around prolog remainder loop in prolog case.
- // Compute the number of extra iterations required, which is:
- // extra iterations = run-time trip count % loop unroll factor
- PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
- Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
- PreHeaderBR);
- Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
- PreHeaderBR);
- IRBuilder<> B(PreHeaderBR);
- Value *ModVal;
- // Calculate ModVal = (BECount + 1) % Count.
- // Note that TripCount is BECount + 1.
- if (isPowerOf2_32(Count)) {
- // When Count is power of 2 we don't BECount for epilog case, however we'll
- // need it for a branch around unrolling loop for prolog case.
- ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
- // 1. There are no iterations to be run in the prolog/epilog loop.
- // OR
- // 2. The addition computing TripCount overflowed.
- //
- // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
- // the number of iterations that remain to be run in the original loop is a
- // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
- // explicitly check this above).
- } else {
- // As (BECount + 1) can potentially unsigned overflow we count
- // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
- Value *ModValTmp = B.CreateURem(BECount,
- ConstantInt::get(BECount->getType(),
- Count));
- Value *ModValAdd = B.CreateAdd(ModValTmp,
- ConstantInt::get(ModValTmp->getType(), 1));
- // At that point (BECount % Count) + 1 could be equal to Count.
- // To handle this case we need to take mod by Count one more time.
- ModVal = B.CreateURem(ModValAdd,
- ConstantInt::get(BECount->getType(), Count),
- "xtraiter");
- }
- Value *BranchVal =
- UseEpilogRemainder ? B.CreateICmpULT(BECount,
- ConstantInt::get(BECount->getType(),
- Count - 1)) :
- B.CreateIsNotNull(ModVal, "lcmp.mod");
- BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
- BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
- // Branch to either remainder (extra iterations) loop or unrolling loop.
- B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
- PreHeaderBR->eraseFromParent();
- if (DT) {
- if (UseEpilogRemainder)
- DT->changeImmediateDominator(NewExit, PreHeader);
- else
- DT->changeImmediateDominator(PrologExit, PreHeader);
- }
- Function *F = Header->getParent();
- // Get an ordered list of blocks in the loop to help with the ordering of the
- // cloned blocks in the prolog/epilog code
- LoopBlocksDFS LoopBlocks(L);
- LoopBlocks.perform(LI);
-
- //
- // For each extra loop iteration, create a copy of the loop's basic blocks
- // and generate a condition that branches to the copy depending on the
- // number of 'left over' iterations.
- //
- std::vector<BasicBlock *> NewBlocks;
- ValueToValueMapTy VMap;
-
- // For unroll factor 2 remainder loop will have 1 iterations.
- // Do not create 1 iteration loop.
- bool CreateRemainderLoop = (Count != 2);
-
- // Clone all the basic blocks in the loop. If Count is 2, we don't clone
- // the loop, otherwise we create a cloned loop to execute the extra
- // iterations. This function adds the appropriate CFG connections.
- BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
- BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
- Loop *remainderLoop = CloneLoopBlocks(
- L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
- InsertTop, InsertBot,
- NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
-
+/// Insert code in the prolog/epilog code when unrolling a loop with a
+/// run-time trip-count.
+///
+/// This method assumes that the loop unroll factor is the total number
+/// of loop bodies in the loop after unrolling. (Some folks refer
+/// to the unroll factor as the number of *extra* copies added).
+/// We assume also that the loop unroll factor is a power-of-two. So, after
+/// unrolling the loop, the number of loop bodies executed is 2,
+/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
+/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
+/// the switch instruction is generated.
+///
+/// ***Prolog case***
+/// extraiters = tripcount % loopfactor
+/// if (extraiters == 0) jump Loop:
+/// else jump Prol:
+/// Prol: LoopBody;
+/// extraiters -= 1 // Omitted if unroll factor is 2.
+/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
+/// if (tripcount < loopfactor) jump End:
+/// Loop:
+/// ...
+/// End:
+///
+/// ***Epilog case***
+/// extraiters = tripcount % loopfactor
+/// if (tripcount < loopfactor) jump LoopExit:
+///        unroll_iter = tripcount - extraiters
+/// Loop: LoopBody; (executes unroll_iter times);
+/// unroll_iter -= 1
+/// if (unroll_iter != 0) jump Loop:
+/// LoopExit:
+/// if (extraiters == 0) jump EpilExit:
+/// Epil: LoopBody; (executes extraiters times)
+/// extraiters -= 1 // Omitted if unroll factor is 2.
+/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
+/// EpilExit:
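For concreteness, a small illustrative sketch of the arithmetic the pseudocode above describes (not part of the patch; the names are invented for the example):

    // Illustrative only: how the remainder split works for a power-of-two factor.
    constexpr unsigned TripCount = 11, LoopFactor = 4;
    constexpr unsigned ExtraIters = TripCount % LoopFactor;  // 3
    constexpr unsigned UnrollIters = TripCount - ExtraIters; // 8
    static_assert(ExtraIters == 3 && UnrollIters == 8, "remainder split");
    // Epilog case: the unrolled loop runs 8 / 4 = 2 times, then the epilog runs
    // the body 3 more times. Prolog case: the prolog runs 3 times first, then
    // the unrolled loop runs 8 / 4 = 2 times.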
+
+bool llvm::UnrollRuntimeLoopRemainder(
+ Loop *L, unsigned Count, bool AllowExpensiveTripCount,
+ bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+ const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
+ LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
+ LLVM_DEBUG(L->dump());
+ LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
+ : dbgs() << "Using prolog remainder.\n");
+
+ // Make sure the loop is in canonical form.
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
+ return false;
+ }
+
+ // Guaranteed by LoopSimplifyForm.
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+
+ if (!LatchBR || LatchBR->isUnconditional()) {
+ // The loop-rotate pass can be helpful to avoid this in many cases.
+ LLVM_DEBUG(
+ dbgs()
+ << "Loop latch not terminated by a conditional branch.\n");
+ return false;
+ }
+
+ unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+ BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
+
+ if (L->contains(LatchExit)) {
+ // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
+ // targets of the Latch be an exit block out of the loop.
+ LLVM_DEBUG(
+ dbgs()
+ << "One of the loop latch successors must be the exit block.\n");
+ return false;
+ }
+
+ // These are exit blocks other than the target of the latch exiting block.
+ SmallVector<BasicBlock *, 4> OtherExits;
+ L->getUniqueNonLatchExitBlocks(OtherExits);
+ bool isMultiExitUnrollingEnabled =
+ canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
+ UseEpilogRemainder) &&
+ canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit, PreserveLCSSA,
+ UseEpilogRemainder);
+  // Support only single exit and exiting block unless multi-exit loop
+  // unrolling is enabled.
+ if (!isMultiExitUnrollingEnabled &&
+ (!L->getExitingBlock() || OtherExits.size())) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
+ "enabled!\n");
+ return false;
+ }
+ // Use Scalar Evolution to compute the trip count. This allows more loops to
+ // be unrolled than relying on induction var simplification.
+ if (!SE)
+ return false;
+
+ // Only unroll loops with a computable trip count, and the trip count needs
+ // to be an int value (allowing a pointer type is a TODO item).
+ // We calculate the backedge count by using getExitCount on the Latch block,
+  // which is proven to be the only exiting block in this loop. This is the
+  // same as calculating getBackedgeTakenCount on the loop (which computes
+  // SCEV for all exiting blocks).
+ const SCEV *BECountSC = SE->getExitCount(L, Latch);
+ if (isa<SCEVCouldNotCompute>(BECountSC) ||
+ !BECountSC->getType()->isIntegerTy()) {
+ LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
+ return false;
+ }
+
+ unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
+
+ // Add 1 since the backedge count doesn't include the first loop iteration.
+ const SCEV *TripCountSC =
+ SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
+ if (isa<SCEVCouldNotCompute>(TripCountSC)) {
+ LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
+ return false;
+ }
+
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "loop-unroll");
+ if (!AllowExpensiveTripCount &&
+ Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
+ TTI, PreHeaderBR)) {
+ LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
+ return false;
+ }
+
+ // This constraint lets us deal with an overflowing trip count easily; see the
+ // comment on ModVal below.
+ if (Log2_32(Count) > BEWidth) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Count failed constraint on overflow trip count calculation.\n");
+ return false;
+ }
+
+ // Loop structure is the following:
+ //
+ // PreHeader
+ // Header
+ // ...
+ // Latch
+ // LatchExit
+
+ BasicBlock *NewPreHeader;
+ BasicBlock *NewExit = nullptr;
+ BasicBlock *PrologExit = nullptr;
+ BasicBlock *EpilogPreHeader = nullptr;
+ BasicBlock *PrologPreHeader = nullptr;
+
+ if (UseEpilogRemainder) {
+ // If epilog remainder
+ // Split PreHeader to insert a branch around loop for unrolling.
+ NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ // Split LatchExit to create phi nodes from branch above.
+ SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
+ NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI,
+ nullptr, PreserveLCSSA);
+ // NewExit gets its DebugLoc from LatchExit, which is not part of the
+ // original Loop.
+    // Fix this by setting NewExit's DebugLoc to that of the loop header.
+ auto *NewExitTerminator = NewExit->getTerminator();
+ NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
+ // Split NewExit to insert epilog remainder loop.
+ EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);
+ EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
+ } else {
+ // If prolog remainder
+ // Split the original preheader twice to insert prolog remainder loop
+ PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
+ PrologPreHeader->setName(Header->getName() + ".prol.preheader");
+ PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
+ DT, LI);
+ PrologExit->setName(Header->getName() + ".prol.loopexit");
+ // Split PrologExit to get NewPreHeader.
+ NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ }
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // *NewPreHeader *PrologPreHeader
+ // Header *PrologExit
+ // ... *NewPreHeader
+ // Latch Header
+ // *NewExit ...
+ // *EpilogPreHeader Latch
+ // LatchExit LatchExit
+
+ // Calculate conditions for branch around loop for unrolling
+ // in epilog case and around prolog remainder loop in prolog case.
+ // Compute the number of extra iterations required, which is:
+ // extra iterations = run-time trip count % loop unroll factor
+ PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
+ PreHeaderBR);
+ Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
+ PreHeaderBR);
+ IRBuilder<> B(PreHeaderBR);
+ Value *ModVal;
+ // Calculate ModVal = (BECount + 1) % Count.
+ // Note that TripCount is BECount + 1.
+ if (isPowerOf2_32(Count)) {
+    // When Count is a power of 2 we don't need BECount for the epilog case;
+    // however, we'll need it for a branch around the unrolling loop in the
+    // prolog case.
+ ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
+ // 1. There are no iterations to be run in the prolog/epilog loop.
+ // OR
+ // 2. The addition computing TripCount overflowed.
+ //
+ // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
+    // the number of iterations that remain to be run in the original loop is
+    // a multiple of Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth
+    // (we explicitly check this above).
+ } else {
+    // As (BECount + 1) can potentially overflow as an unsigned value, we
+    // compute (BECount % Count) + 1 instead, which is overflow safe because
+    // BECount % Count < Count.
+ Value *ModValTmp = B.CreateURem(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count));
+ Value *ModValAdd = B.CreateAdd(ModValTmp,
+ ConstantInt::get(ModValTmp->getType(), 1));
+ // At that point (BECount % Count) + 1 could be equal to Count.
+ // To handle this case we need to take mod by Count one more time.
+ ModVal = B.CreateURem(ModValAdd,
+ ConstantInt::get(BECount->getType(), Count),
+ "xtraiter");
+ }
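As a side note, a minimal standalone sketch (assumed names, not from the patch) of why the two URem instructions built above are equivalent to (BECount + 1) % Count while staying overflow safe:

    #include <cstdint>

    // Rem < Count, so Rem + 1 cannot wrap; the second modulo folds the
    // Rem + 1 == Count case back to 0, matching (BECount + 1) % Count.
    uint64_t xtraIter(uint64_t BECount, uint64_t Count) {
      uint64_t Rem = BECount % Count;
      return (Rem + 1) % Count;
    }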
+ Value *BranchVal =
+ UseEpilogRemainder ? B.CreateICmpULT(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count - 1)) :
+ B.CreateIsNotNull(ModVal, "lcmp.mod");
+ BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
+ BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
+ // Branch to either remainder (extra iterations) loop or unrolling loop.
+ B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
+ PreHeaderBR->eraseFromParent();
+ if (DT) {
+ if (UseEpilogRemainder)
+ DT->changeImmediateDominator(NewExit, PreHeader);
+ else
+ DT->changeImmediateDominator(PrologExit, PreHeader);
+ }
+ Function *F = Header->getParent();
+ // Get an ordered list of blocks in the loop to help with the ordering of the
+ // cloned blocks in the prolog/epilog code
+ LoopBlocksDFS LoopBlocks(L);
+ LoopBlocks.perform(LI);
+
+ //
+ // For each extra loop iteration, create a copy of the loop's basic blocks
+ // and generate a condition that branches to the copy depending on the
+ // number of 'left over' iterations.
+ //
+ std::vector<BasicBlock *> NewBlocks;
+ ValueToValueMapTy VMap;
+
+  // For an unroll factor of 2 the remainder loop will have exactly 1
+  // iteration, so do not create a 1-iteration loop.
+ bool CreateRemainderLoop = (Count != 2);
+
+ // Clone all the basic blocks in the loop. If Count is 2, we don't clone
+ // the loop, otherwise we create a cloned loop to execute the extra
+ // iterations. This function adds the appropriate CFG connections.
+ BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
+ BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
+ Loop *remainderLoop = CloneLoopBlocks(
+ L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
+ InsertTop, InsertBot,
+ NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
+
// Assign the maximum possible trip count as the back edge weight for the
// remainder loop if the original loop comes with a branch weight.
if (remainderLoop && !UnrollRemainder)
updateLatchBranchWeightsForRemainderLoop(L, remainderLoop, Count);
- // Insert the cloned blocks into the function.
- F->getBasicBlockList().splice(InsertBot->getIterator(),
- F->getBasicBlockList(),
- NewBlocks[0]->getIterator(),
- F->end());
-
- // Now the loop blocks are cloned and the other exiting blocks from the
- // remainder are connected to the original Loop's exit blocks. The remaining
- // work is to update the phi nodes in the original loop, and take in the
- // values from the cloned region.
- for (auto *BB : OtherExits) {
- for (auto &II : *BB) {
-
- // Given we preserve LCSSA form, we know that the values used outside the
- // loop will be used through these phi nodes at the exit blocks that are
- // transformed below.
- if (!isa<PHINode>(II))
- break;
- PHINode *Phi = cast<PHINode>(&II);
- unsigned oldNumOperands = Phi->getNumIncomingValues();
- // Add the incoming values from the remainder code to the end of the phi
- // node.
- for (unsigned i =0; i < oldNumOperands; i++){
- Value *newVal = VMap.lookup(Phi->getIncomingValue(i));
- // newVal can be a constant or derived from values outside the loop, and
- // hence need not have a VMap value. Also, since lookup already generated
- // a default "null" VMap entry for this value, we need to populate that
- // VMap entry correctly, with the mapped entry being itself.
- if (!newVal) {
- newVal = Phi->getIncomingValue(i);
- VMap[Phi->getIncomingValue(i)] = Phi->getIncomingValue(i);
- }
- Phi->addIncoming(newVal,
- cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
- }
- }
-#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
- for (BasicBlock *SuccBB : successors(BB)) {
- assert(!(any_of(OtherExits,
- [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
- SuccBB == LatchExit) &&
- "Breaks the definition of dedicated exits!");
- }
-#endif
- }
-
- // Update the immediate dominator of the exit blocks and blocks that are
- // reachable from the exit blocks. This is needed because we now have paths
- // from both the original loop and the remainder code reaching the exit
- // blocks. While the IDom of these exit blocks were from the original loop,
- // now the IDom is the preheader (which decides whether the original loop or
- // remainder code should run).
- if (DT && !L->getExitingBlock()) {
- SmallVector<BasicBlock *, 16> ChildrenToUpdate;
- // NB! We have to examine the dom children of all loop blocks, not just
- // those which are the IDom of the exit blocks. This is because blocks
- // reachable from the exit blocks can have their IDom as the nearest common
- // dominator of the exit blocks.
- for (auto *BB : L->blocks()) {
- auto *DomNodeBB = DT->getNode(BB);
- for (auto *DomChild : DomNodeBB->children()) {
- auto *DomChildBB = DomChild->getBlock();
- if (!L->contains(LI->getLoopFor(DomChildBB)))
- ChildrenToUpdate.push_back(DomChildBB);
- }
- }
- for (auto *BB : ChildrenToUpdate)
- DT->changeImmediateDominator(BB, PreHeader);
- }
-
- // Loop structure should be the following:
- // Epilog Prolog
- //
- // PreHeader PreHeader
- // NewPreHeader PrologPreHeader
- // Header PrologHeader
- // ... ...
- // Latch PrologLatch
- // NewExit PrologExit
- // EpilogPreHeader NewPreHeader
- // EpilogHeader Header
- // ... ...
- // EpilogLatch Latch
- // LatchExit LatchExit
-
- // Rewrite the cloned instruction operands to use the values created when the
- // clone is created.
- for (BasicBlock *BB : NewBlocks) {
- for (Instruction &I : *BB) {
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- }
- }
-
- if (UseEpilogRemainder) {
- // Connect the epilog code to the original loop and update the
- // PHI functions.
- ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
- EpilogPreHeader, NewPreHeader, VMap, DT, LI,
- PreserveLCSSA);
-
- // Update counter in loop for unrolling.
- // I should be multiply of Count.
- IRBuilder<> B2(NewPreHeader->getTerminator());
- Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
- B2.SetInsertPoint(LatchBR);
- PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
- Header->getFirstNonPHI());
- Value *IdxSub =
- B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
- NewIdx->getName() + ".nsub");
- Value *IdxCmp;
- if (LatchBR->getSuccessor(0) == Header)
- IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
- else
- IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
- NewIdx->addIncoming(TestVal, NewPreHeader);
- NewIdx->addIncoming(IdxSub, Latch);
- LatchBR->setCondition(IdxCmp);
- } else {
- // Connect the prolog code to the original loop and update the
- // PHI functions.
- ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
- NewPreHeader, VMap, DT, LI, PreserveLCSSA);
- }
-
- // If this loop is nested, then the loop unroller changes the code in the any
- // of its parent loops, so the Scalar Evolution pass needs to be run again.
- SE->forgetTopmostLoop(L);
-
- // Verify that the Dom Tree is correct.
-#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
- if (DT)
- assert(DT->verify(DominatorTree::VerificationLevel::Full));
-#endif
-
- // Canonicalize to LoopSimplifyForm both original and remainder loops. We
- // cannot rely on the LoopUnrollPass to do this because it only does
- // canonicalization for parent/subloops and not the sibling loops.
- if (OtherExits.size() > 0) {
- // Generate dedicated exit blocks for the original loop, to preserve
- // LoopSimplifyForm.
- formDedicatedExitBlocks(L, DT, LI, nullptr, PreserveLCSSA);
- // Generate dedicated exit blocks for the remainder loop if one exists, to
- // preserve LoopSimplifyForm.
- if (remainderLoop)
- formDedicatedExitBlocks(remainderLoop, DT, LI, nullptr, PreserveLCSSA);
- }
-
- auto UnrollResult = LoopUnrollResult::Unmodified;
- if (remainderLoop && UnrollRemainder) {
- LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
- UnrollResult =
- UnrollLoop(remainderLoop,
- {/*Count*/ Count - 1, /*TripCount*/ Count - 1,
- /*Force*/ false, /*AllowRuntime*/ false,
- /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
- /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
- /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
- LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
- }
-
- if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
- *ResultLoop = remainderLoop;
- NumRuntimeUnrolled++;
- return true;
-}
+ // Insert the cloned blocks into the function.
+ F->getBasicBlockList().splice(InsertBot->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(),
+ F->end());
+
+ // Now the loop blocks are cloned and the other exiting blocks from the
+ // remainder are connected to the original Loop's exit blocks. The remaining
+ // work is to update the phi nodes in the original loop, and take in the
+ // values from the cloned region.
+ for (auto *BB : OtherExits) {
+ for (auto &II : *BB) {
+
+ // Given we preserve LCSSA form, we know that the values used outside the
+ // loop will be used through these phi nodes at the exit blocks that are
+ // transformed below.
+ if (!isa<PHINode>(II))
+ break;
+ PHINode *Phi = cast<PHINode>(&II);
+ unsigned oldNumOperands = Phi->getNumIncomingValues();
+ // Add the incoming values from the remainder code to the end of the phi
+ // node.
+      for (unsigned i = 0; i < oldNumOperands; i++) {
+ Value *newVal = VMap.lookup(Phi->getIncomingValue(i));
+ // newVal can be a constant or derived from values outside the loop, and
+ // hence need not have a VMap value. Also, since lookup already generated
+ // a default "null" VMap entry for this value, we need to populate that
+ // VMap entry correctly, with the mapped entry being itself.
+ if (!newVal) {
+ newVal = Phi->getIncomingValue(i);
+ VMap[Phi->getIncomingValue(i)] = Phi->getIncomingValue(i);
+ }
+ Phi->addIncoming(newVal,
+ cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
+ }
+ }
+#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
+ for (BasicBlock *SuccBB : successors(BB)) {
+ assert(!(any_of(OtherExits,
+ [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
+ SuccBB == LatchExit) &&
+ "Breaks the definition of dedicated exits!");
+ }
+#endif
+ }
+
+ // Update the immediate dominator of the exit blocks and blocks that are
+ // reachable from the exit blocks. This is needed because we now have paths
+ // from both the original loop and the remainder code reaching the exit
+ // blocks. While the IDom of these exit blocks were from the original loop,
+ // now the IDom is the preheader (which decides whether the original loop or
+ // remainder code should run).
+ if (DT && !L->getExitingBlock()) {
+ SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+ // NB! We have to examine the dom children of all loop blocks, not just
+ // those which are the IDom of the exit blocks. This is because blocks
+ // reachable from the exit blocks can have their IDom as the nearest common
+ // dominator of the exit blocks.
+ for (auto *BB : L->blocks()) {
+ auto *DomNodeBB = DT->getNode(BB);
+ for (auto *DomChild : DomNodeBB->children()) {
+ auto *DomChildBB = DomChild->getBlock();
+ if (!L->contains(LI->getLoopFor(DomChildBB)))
+ ChildrenToUpdate.push_back(DomChildBB);
+ }
+ }
+ for (auto *BB : ChildrenToUpdate)
+ DT->changeImmediateDominator(BB, PreHeader);
+ }
+
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // NewPreHeader PrologPreHeader
+ // Header PrologHeader
+ // ... ...
+ // Latch PrologLatch
+ // NewExit PrologExit
+ // EpilogPreHeader NewPreHeader
+ // EpilogHeader Header
+ // ... ...
+ // EpilogLatch Latch
+ // LatchExit LatchExit
+
+ // Rewrite the cloned instruction operands to use the values created when the
+ // clone is created.
+ for (BasicBlock *BB : NewBlocks) {
+ for (Instruction &I : *BB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ }
+ }
+
+ if (UseEpilogRemainder) {
+ // Connect the epilog code to the original loop and update the
+ // PHI functions.
+ ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
+ EpilogPreHeader, NewPreHeader, VMap, DT, LI,
+ PreserveLCSSA);
+
+    // Update the iteration counter in the unrolled loop; its value should be
+    // a multiple of Count.
+ IRBuilder<> B2(NewPreHeader->getTerminator());
+ Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ B2.SetInsertPoint(LatchBR);
+ PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
+ Header->getFirstNonPHI());
+ Value *IdxSub =
+ B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+ NewIdx->getName() + ".nsub");
+ Value *IdxCmp;
+ if (LatchBR->getSuccessor(0) == Header)
+ IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
+ else
+ IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
+ NewIdx->addIncoming(TestVal, NewPreHeader);
+ NewIdx->addIncoming(IdxSub, Latch);
+ LatchBR->setCondition(IdxCmp);
+ } else {
+ // Connect the prolog code to the original loop and update the
+ // PHI functions.
+ ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
+ NewPreHeader, VMap, DT, LI, PreserveLCSSA);
+ }
+
+  // If this loop is nested, then the loop unroller changes the code in any of
+  // its parent loops, so the Scalar Evolution pass needs to be run again.
+ SE->forgetTopmostLoop(L);
+
+ // Verify that the Dom Tree is correct.
+#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
+ if (DT)
+ assert(DT->verify(DominatorTree::VerificationLevel::Full));
+#endif
+
+ // Canonicalize to LoopSimplifyForm both original and remainder loops. We
+ // cannot rely on the LoopUnrollPass to do this because it only does
+ // canonicalization for parent/subloops and not the sibling loops.
+ if (OtherExits.size() > 0) {
+ // Generate dedicated exit blocks for the original loop, to preserve
+ // LoopSimplifyForm.
+ formDedicatedExitBlocks(L, DT, LI, nullptr, PreserveLCSSA);
+ // Generate dedicated exit blocks for the remainder loop if one exists, to
+ // preserve LoopSimplifyForm.
+ if (remainderLoop)
+ formDedicatedExitBlocks(remainderLoop, DT, LI, nullptr, PreserveLCSSA);
+ }
+
+ auto UnrollResult = LoopUnrollResult::Unmodified;
+ if (remainderLoop && UnrollRemainder) {
+ LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
+ UnrollResult =
+ UnrollLoop(remainderLoop,
+ {/*Count*/ Count - 1, /*TripCount*/ Count - 1,
+ /*Force*/ false, /*AllowRuntime*/ false,
+ /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
+ /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
+ /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
+ LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
+ }
+
+ if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
+ *ResultLoop = remainderLoop;
+ NumRuntimeUnrolled++;
+ return true;
+}
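To make the transformation above easier to picture, here is a rough source-level equivalent of the epilog form it produces, assuming an unroll factor of 4 and a trivial loop body; this sketch is illustrative only and does not appear in the patch:

    // Epilog-style runtime unrolling by 4 of "for (i = 0; i < N; ++i) A[i] += 1;"
    void unrolledWithEpilog(int *A, unsigned N) {
      unsigned XtraIter = N & 3;            // N % 4: epilog trip count
      unsigned I = 0;
      if (N >= 4) {                         // branch around the unrolled loop
        unsigned UnrollIter = N - XtraIter; // multiple of 4
        for (; I < UnrollIter; I += 4) {    // body duplicated 4 times
          A[I] += 1; A[I + 1] += 1; A[I + 2] += 1; A[I + 3] += 1;
        }
      }
      for (; I < N; ++I)                    // epilog: runs XtraIter times
        A[I] += 1;
    }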
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp
index a220f9d25a..f0f423e981 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp
@@ -1,307 +1,307 @@
-//===-- LoopUtils.cpp - Loop Utility functions -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines common loop utility functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<bool> ForceReductionIntrinsic(
- "force-reduction-intrinsics", cl::Hidden,
- cl::desc("Force creating reduction intrinsics for testing."),
- cl::init(false));
-
-#define DEBUG_TYPE "loop-utils"
-
-static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
-static const char *LLVMLoopDisableLICM = "llvm.licm.disable";
+//===-- LoopUtils.cpp - Loop Utility functions -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines common loop utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> ForceReductionIntrinsic(
+ "force-reduction-intrinsics", cl::Hidden,
+ cl::desc("Force creating reduction intrinsics for testing."),
+ cl::init(false));
+
+#define DEBUG_TYPE "loop-utils"
+
+static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
+static const char *LLVMLoopDisableLICM = "llvm.licm.disable";
static const char *LLVMLoopMustProgress = "llvm.loop.mustprogress";
-
-bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
- MemorySSAUpdater *MSSAU,
- bool PreserveLCSSA) {
- bool Changed = false;
-
- // We re-use a vector for the in-loop predecesosrs.
- SmallVector<BasicBlock *, 4> InLoopPredecessors;
-
- auto RewriteExit = [&](BasicBlock *BB) {
- assert(InLoopPredecessors.empty() &&
- "Must start with an empty predecessors list!");
- auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); });
-
- // See if there are any non-loop predecessors of this exit block and
- // keep track of the in-loop predecessors.
- bool IsDedicatedExit = true;
- for (auto *PredBB : predecessors(BB))
- if (L->contains(PredBB)) {
- if (isa<IndirectBrInst>(PredBB->getTerminator()))
- // We cannot rewrite exiting edges from an indirectbr.
- return false;
- if (isa<CallBrInst>(PredBB->getTerminator()))
- // We cannot rewrite exiting edges from a callbr.
- return false;
-
- InLoopPredecessors.push_back(PredBB);
- } else {
- IsDedicatedExit = false;
- }
-
- assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!");
-
- // Nothing to do if this is already a dedicated exit.
- if (IsDedicatedExit)
- return false;
-
- auto *NewExitBB = SplitBlockPredecessors(
- BB, InLoopPredecessors, ".loopexit", DT, LI, MSSAU, PreserveLCSSA);
-
- if (!NewExitBB)
- LLVM_DEBUG(
- dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
- << *L << "\n");
- else
- LLVM_DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
- << NewExitBB->getName() << "\n");
- return true;
- };
-
- // Walk the exit blocks directly rather than building up a data structure for
- // them, but only visit each one once.
- SmallPtrSet<BasicBlock *, 4> Visited;
- for (auto *BB : L->blocks())
- for (auto *SuccBB : successors(BB)) {
- // We're looking for exit blocks so skip in-loop successors.
- if (L->contains(SuccBB))
- continue;
-
- // Visit each exit block exactly once.
- if (!Visited.insert(SuccBB).second)
- continue;
-
- Changed |= RewriteExit(SuccBB);
- }
-
- return Changed;
-}
-
-/// Returns the instructions that use values defined in the loop.
-SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
- SmallVector<Instruction *, 8> UsedOutside;
-
- for (auto *Block : L->getBlocks())
- // FIXME: I believe that this could use copy_if if the Inst reference could
- // be adapted into a pointer.
- for (auto &Inst : *Block) {
- auto Users = Inst.users();
- if (any_of(Users, [&](User *U) {
- auto *Use = cast<Instruction>(U);
- return !L->contains(Use->getParent());
- }))
- UsedOutside.push_back(&Inst);
- }
-
- return UsedOutside;
-}
-
-void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
- // By definition, all loop passes need the LoopInfo analysis and the
- // Dominator tree it depends on. Because they all participate in the loop
- // pass manager, they must also preserve these.
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
-
- // We must also preserve LoopSimplify and LCSSA. We locally access their IDs
- // here because users shouldn't directly get them from this header.
- extern char &LoopSimplifyID;
- extern char &LCSSAID;
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addPreservedID(LCSSAID);
- // This is used in the LPPassManager to perform LCSSA verification on passes
- // which preserve lcssa form
- AU.addRequired<LCSSAVerificationPass>();
- AU.addPreserved<LCSSAVerificationPass>();
-
- // Loop passes are designed to run inside of a loop pass manager which means
- // that any function analyses they require must be required by the first loop
- // pass in the manager (so that it is computed before the loop pass manager
- // runs) and preserved by all loop pasess in the manager. To make this
- // reasonably robust, the set needed for most loop passes is maintained here.
- // If your loop pass requires an analysis not listed here, you will need to
- // carefully audit the loop pass manager nesting structure that results.
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- // FIXME: When all loop passes preserve MemorySSA, it can be required and
- // preserved here instead of the individual handling in each pass.
-}
-
-/// Manually defined generic "LoopPass" dependency initialization. This is used
-/// to initialize the exact set of passes from above in \c
-/// getLoopAnalysisUsage. It can be used within a loop pass's initialization
-/// with:
-///
-/// INITIALIZE_PASS_DEPENDENCY(LoopPass)
-///
-/// As-if "LoopPass" were a pass.
-void llvm::initializeLoopPassPass(PassRegistry &Registry) {
- INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
- INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-}
-
-/// Create MDNode for input string.
-static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {
- MDString::get(Context, Name),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
-}
-
-/// Set input string into loop metadata by keeping other values intact.
-/// If the string is already in loop metadata update value if it is
-/// different.
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
- unsigned V) {
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, retain it.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- // If it is of form key = value, try to parse it.
- if (Node->getNumOperands() == 2) {
- MDString *S = dyn_cast<MDString>(Node->getOperand(0));
- if (S && S->getString().equals(StringMD)) {
- ConstantInt *IntMD =
- mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
- if (IntMD && IntMD->getSExtValue() == V)
- // It is already in place. Do nothing.
- return;
- // We need to update the value, so just skip it here and it will
- // be added after copying other existed nodes.
- continue;
- }
- }
- MDs.push_back(Node);
- }
- }
- // Add new metadata.
- MDs.push_back(createStringMetadata(TheLoop, StringMD, V));
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- TheLoop->setLoopID(NewLoopID);
-}
-
-/// Find string metadata for loop
-///
-/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
-/// operand or null otherwise. If the string metadata is not found return
-/// Optional's not-a-value.
-Optional<const MDOperand *> llvm::findStringMetadataForLoop(const Loop *TheLoop,
- StringRef Name) {
- MDNode *MD = findOptionMDForLoop(TheLoop, Name);
- if (!MD)
- return None;
- switch (MD->getNumOperands()) {
- case 1:
- return nullptr;
- case 2:
- return &MD->getOperand(1);
- default:
- llvm_unreachable("loop metadata has 0 or 1 operand");
- }
-}
-
-static Optional<bool> getOptionalBoolLoopAttribute(const Loop *TheLoop,
- StringRef Name) {
- MDNode *MD = findOptionMDForLoop(TheLoop, Name);
- if (!MD)
- return None;
- switch (MD->getNumOperands()) {
- case 1:
- // When the value is absent it is interpreted as 'attribute set'.
- return true;
- case 2:
- if (ConstantInt *IntMD =
- mdconst::extract_or_null<ConstantInt>(MD->getOperand(1).get()))
- return IntMD->getZExtValue();
- return true;
- }
- llvm_unreachable("unexpected number of options");
-}
-
+
+bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ MemorySSAUpdater *MSSAU,
+ bool PreserveLCSSA) {
+ bool Changed = false;
+
+  // We re-use a vector for the in-loop predecessors.
+ SmallVector<BasicBlock *, 4> InLoopPredecessors;
+
+ auto RewriteExit = [&](BasicBlock *BB) {
+ assert(InLoopPredecessors.empty() &&
+ "Must start with an empty predecessors list!");
+ auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); });
+
+ // See if there are any non-loop predecessors of this exit block and
+ // keep track of the in-loop predecessors.
+ bool IsDedicatedExit = true;
+ for (auto *PredBB : predecessors(BB))
+ if (L->contains(PredBB)) {
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ // We cannot rewrite exiting edges from an indirectbr.
+ return false;
+ if (isa<CallBrInst>(PredBB->getTerminator()))
+ // We cannot rewrite exiting edges from a callbr.
+ return false;
+
+ InLoopPredecessors.push_back(PredBB);
+ } else {
+ IsDedicatedExit = false;
+ }
+
+ assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!");
+
+ // Nothing to do if this is already a dedicated exit.
+ if (IsDedicatedExit)
+ return false;
+
+ auto *NewExitBB = SplitBlockPredecessors(
+ BB, InLoopPredecessors, ".loopexit", DT, LI, MSSAU, PreserveLCSSA);
+
+ if (!NewExitBB)
+ LLVM_DEBUG(
+ dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
+ << *L << "\n");
+ else
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
+ << NewExitBB->getName() << "\n");
+ return true;
+ };
+
+ // Walk the exit blocks directly rather than building up a data structure for
+ // them, but only visit each one once.
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ for (auto *BB : L->blocks())
+ for (auto *SuccBB : successors(BB)) {
+ // We're looking for exit blocks so skip in-loop successors.
+ if (L->contains(SuccBB))
+ continue;
+
+ // Visit each exit block exactly once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ Changed |= RewriteExit(SuccBB);
+ }
+
+ return Changed;
+}
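A hypothetical call-site sketch for the helper above (the analysis objects are assumed to be available; nothing here is taken from the patch):

    // Give every exit block of L only in-loop predecessors.
    bool Changed = formDedicatedExitBlocks(L, &DT, &LI, /*MSSAU=*/nullptr,
                                           /*PreserveLCSSA=*/true);
    // Exits reached through indirectbr/callbr edges are left untouched.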
+
+/// Returns the instructions that use values defined in the loop.
+SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
+ SmallVector<Instruction *, 8> UsedOutside;
+
+ for (auto *Block : L->getBlocks())
+ // FIXME: I believe that this could use copy_if if the Inst reference could
+ // be adapted into a pointer.
+ for (auto &Inst : *Block) {
+ auto Users = Inst.users();
+ if (any_of(Users, [&](User *U) {
+ auto *Use = cast<Instruction>(U);
+ return !L->contains(Use->getParent());
+ }))
+ UsedOutside.push_back(&Inst);
+ }
+
+ return UsedOutside;
+}
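A small hypothetical consumer of the helper above, assuming the usual DEBUG_TYPE machinery of this file:

    // Values defined in L but used outside it are the ones that need LCSSA
    // phi nodes at the loop exits.
    for (Instruction *I : findDefsUsedOutsideOfLoop(L))
      LLVM_DEBUG(dbgs() << "live-out: " << *I << "\n");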
+
+void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
+ // By definition, all loop passes need the LoopInfo analysis and the
+ // Dominator tree it depends on. Because they all participate in the loop
+ // pass manager, they must also preserve these.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // We must also preserve LoopSimplify and LCSSA. We locally access their IDs
+ // here because users shouldn't directly get them from this header.
+ extern char &LoopSimplifyID;
+ extern char &LCSSAID;
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ // This is used in the LPPassManager to perform LCSSA verification on passes
+  // This is used in the LPPassManager to perform LCSSA verification on passes
+  // which preserve LCSSA form.
+ AU.addPreserved<LCSSAVerificationPass>();
+
+ // Loop passes are designed to run inside of a loop pass manager which means
+ // that any function analyses they require must be required by the first loop
+ // pass in the manager (so that it is computed before the loop pass manager
+  // runs) and preserved by all loop passes in the manager. To make this
+ // reasonably robust, the set needed for most loop passes is maintained here.
+ // If your loop pass requires an analysis not listed here, you will need to
+ // carefully audit the loop pass manager nesting structure that results.
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ // FIXME: When all loop passes preserve MemorySSA, it can be required and
+ // preserved here instead of the individual handling in each pass.
+}
+
+/// Manually defined generic "LoopPass" dependency initialization. This is used
+/// to initialize the exact set of passes from above in \c
+/// getLoopAnalysisUsage. It can be used within a loop pass's initialization
+/// with:
+///
+/// INITIALIZE_PASS_DEPENDENCY(LoopPass)
+///
+/// As-if "LoopPass" were a pass.
+void llvm::initializeLoopPassPass(PassRegistry &Registry) {
+ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+ INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+}
+
+/// Create MDNode for input string.
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+/// Set input string into loop metadata by keeping other values intact.
+/// If the string is already in the loop metadata, update its value if it
+/// differs.
+void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
+ unsigned V) {
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, retain it.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If it is of form key = value, try to parse it.
+ if (Node->getNumOperands() == 2) {
+ MDString *S = dyn_cast<MDString>(Node->getOperand(0));
+ if (S && S->getString().equals(StringMD)) {
+ ConstantInt *IntMD =
+ mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
+ if (IntMD && IntMD->getSExtValue() == V)
+ // It is already in place. Do nothing.
+ return;
+ // We need to update the value, so just skip it here and it will
+          // be added after copying the other existing nodes.
+ continue;
+ }
+ }
+ MDs.push_back(Node);
+ }
+ }
+ // Add new metadata.
+ MDs.push_back(createStringMetadata(TheLoop, StringMD, V));
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ TheLoop->setLoopID(NewLoopID);
+}
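A hypothetical usage sketch of the helper above (the attribute name and value are chosen only for illustration):

    // Creates or updates the {!"llvm.loop.unroll.count", i32 4} entry in the
    // loop's !llvm.loop metadata while keeping its other operands intact.
    addStringMetadataToLoop(L, "llvm.loop.unroll.count", 4);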
+
+/// Find string metadata for loop
+///
+/// If it has a value (e.g. {"llvm.distribute", 1}), return the value as an
+/// operand, or null otherwise. If the string metadata is not found, return
+/// Optional's not-a-value.
+Optional<const MDOperand *> llvm::findStringMetadataForLoop(const Loop *TheLoop,
+ StringRef Name) {
+ MDNode *MD = findOptionMDForLoop(TheLoop, Name);
+ if (!MD)
+ return None;
+ switch (MD->getNumOperands()) {
+ case 1:
+ return nullptr;
+ case 2:
+ return &MD->getOperand(1);
+ default:
+ llvm_unreachable("loop metadata has 0 or 1 operand");
+ }
+}
+
+static Optional<bool> getOptionalBoolLoopAttribute(const Loop *TheLoop,
+ StringRef Name) {
+ MDNode *MD = findOptionMDForLoop(TheLoop, Name);
+ if (!MD)
+ return None;
+ switch (MD->getNumOperands()) {
+ case 1:
+ // When the value is absent it is interpreted as 'attribute set'.
+ return true;
+ case 2:
+ if (ConstantInt *IntMD =
+ mdconst::extract_or_null<ConstantInt>(MD->getOperand(1).get()))
+ return IntMD->getZExtValue();
+ return true;
+ }
+ llvm_unreachable("unexpected number of options");
+}
+
bool llvm::getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name) {
- return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false);
-}
-
+ return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false);
+}
+
Optional<ElementCount>
llvm::getOptionalElementCountLoopAttribute(Loop *TheLoop) {
Optional<int> Width =
@@ -316,292 +316,292 @@ llvm::getOptionalElementCountLoopAttribute(Loop *TheLoop) {
return None;
}
-llvm::Optional<int> llvm::getOptionalIntLoopAttribute(Loop *TheLoop,
- StringRef Name) {
- const MDOperand *AttrMD =
- findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr);
- if (!AttrMD)
- return None;
-
- ConstantInt *IntMD = mdconst::extract_or_null<ConstantInt>(AttrMD->get());
- if (!IntMD)
- return None;
-
- return IntMD->getSExtValue();
-}
-
-Optional<MDNode *> llvm::makeFollowupLoopID(
- MDNode *OrigLoopID, ArrayRef<StringRef> FollowupOptions,
- const char *InheritOptionsExceptPrefix, bool AlwaysNew) {
- if (!OrigLoopID) {
- if (AlwaysNew)
- return nullptr;
- return None;
- }
-
- assert(OrigLoopID->getOperand(0) == OrigLoopID);
-
- bool InheritAllAttrs = !InheritOptionsExceptPrefix;
- bool InheritSomeAttrs =
- InheritOptionsExceptPrefix && InheritOptionsExceptPrefix[0] != '\0';
- SmallVector<Metadata *, 8> MDs;
- MDs.push_back(nullptr);
-
- bool Changed = false;
- if (InheritAllAttrs || InheritSomeAttrs) {
+llvm::Optional<int> llvm::getOptionalIntLoopAttribute(Loop *TheLoop,
+ StringRef Name) {
+ const MDOperand *AttrMD =
+ findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr);
+ if (!AttrMD)
+ return None;
+
+ ConstantInt *IntMD = mdconst::extract_or_null<ConstantInt>(AttrMD->get());
+ if (!IntMD)
+ return None;
+
+ return IntMD->getSExtValue();
+}
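A hypothetical read-back sketch pairing with the helpers above:

    // Query an integer-valued loop attribute; returns None if it is absent.
    if (Optional<int> Count =
            getOptionalIntLoopAttribute(TheLoop, "llvm.loop.unroll.count"))
      LLVM_DEBUG(dbgs() << "requested unroll count: " << *Count << "\n");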
+
+Optional<MDNode *> llvm::makeFollowupLoopID(
+ MDNode *OrigLoopID, ArrayRef<StringRef> FollowupOptions,
+ const char *InheritOptionsExceptPrefix, bool AlwaysNew) {
+ if (!OrigLoopID) {
+ if (AlwaysNew)
+ return nullptr;
+ return None;
+ }
+
+ assert(OrigLoopID->getOperand(0) == OrigLoopID);
+
+ bool InheritAllAttrs = !InheritOptionsExceptPrefix;
+ bool InheritSomeAttrs =
+ InheritOptionsExceptPrefix && InheritOptionsExceptPrefix[0] != '\0';
+ SmallVector<Metadata *, 8> MDs;
+ MDs.push_back(nullptr);
+
+ bool Changed = false;
+ if (InheritAllAttrs || InheritSomeAttrs) {
for (const MDOperand &Existing : drop_begin(OrigLoopID->operands())) {
- MDNode *Op = cast<MDNode>(Existing.get());
-
- auto InheritThisAttribute = [InheritSomeAttrs,
- InheritOptionsExceptPrefix](MDNode *Op) {
- if (!InheritSomeAttrs)
- return false;
-
- // Skip malformatted attribute metadata nodes.
- if (Op->getNumOperands() == 0)
- return true;
- Metadata *NameMD = Op->getOperand(0).get();
- if (!isa<MDString>(NameMD))
- return true;
- StringRef AttrName = cast<MDString>(NameMD)->getString();
-
- // Do not inherit excluded attributes.
- return !AttrName.startswith(InheritOptionsExceptPrefix);
- };
-
- if (InheritThisAttribute(Op))
- MDs.push_back(Op);
- else
- Changed = true;
- }
- } else {
- // Modified if we dropped at least one attribute.
- Changed = OrigLoopID->getNumOperands() > 1;
- }
-
- bool HasAnyFollowup = false;
- for (StringRef OptionName : FollowupOptions) {
- MDNode *FollowupNode = findOptionMDForLoopID(OrigLoopID, OptionName);
- if (!FollowupNode)
- continue;
-
- HasAnyFollowup = true;
+ MDNode *Op = cast<MDNode>(Existing.get());
+
+ auto InheritThisAttribute = [InheritSomeAttrs,
+ InheritOptionsExceptPrefix](MDNode *Op) {
+ if (!InheritSomeAttrs)
+ return false;
+
+ // Skip malformatted attribute metadata nodes.
+ if (Op->getNumOperands() == 0)
+ return true;
+ Metadata *NameMD = Op->getOperand(0).get();
+ if (!isa<MDString>(NameMD))
+ return true;
+ StringRef AttrName = cast<MDString>(NameMD)->getString();
+
+ // Do not inherit excluded attributes.
+ return !AttrName.startswith(InheritOptionsExceptPrefix);
+ };
+
+ if (InheritThisAttribute(Op))
+ MDs.push_back(Op);
+ else
+ Changed = true;
+ }
+ } else {
+ // Modified if we dropped at least one attribute.
+ Changed = OrigLoopID->getNumOperands() > 1;
+ }
+
+ bool HasAnyFollowup = false;
+ for (StringRef OptionName : FollowupOptions) {
+ MDNode *FollowupNode = findOptionMDForLoopID(OrigLoopID, OptionName);
+ if (!FollowupNode)
+ continue;
+
+ HasAnyFollowup = true;
for (const MDOperand &Option : drop_begin(FollowupNode->operands())) {
- MDs.push_back(Option.get());
- Changed = true;
- }
- }
-
- // Attributes of the followup loop were not specified explicitly, so signal to
- // the transformation pass to add suitable attributes.
- if (!AlwaysNew && !HasAnyFollowup)
- return None;
-
- // If no attributes were added or removed, the previous loop ID can be reused.
- if (!AlwaysNew && !Changed)
- return OrigLoopID;
-
- // No attributes is equivalent to having no !llvm.loop metadata at all.
- if (MDs.size() == 1)
- return nullptr;
-
- // Build the new loop ID.
- MDTuple *FollowupLoopID = MDNode::get(OrigLoopID->getContext(), MDs);
- FollowupLoopID->replaceOperandWith(0, FollowupLoopID);
- return FollowupLoopID;
-}
-
-bool llvm::hasDisableAllTransformsHint(const Loop *L) {
- return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced);
-}
-
-bool llvm::hasDisableLICMTransformsHint(const Loop *L) {
- return getBooleanLoopAttribute(L, LLVMLoopDisableLICM);
-}
-
+ MDs.push_back(Option.get());
+ Changed = true;
+ }
+ }
+
+ // Attributes of the followup loop were not specified explicitly, so signal to
+ // the transformation pass to add suitable attributes.
+ if (!AlwaysNew && !HasAnyFollowup)
+ return None;
+
+ // If no attributes were added or removed, the previous loop ID can be reused.
+ if (!AlwaysNew && !Changed)
+ return OrigLoopID;
+
+ // No attributes is equivalent to having no !llvm.loop metadata at all.
+ if (MDs.size() == 1)
+ return nullptr;
+
+ // Build the new loop ID.
+ MDTuple *FollowupLoopID = MDNode::get(OrigLoopID->getContext(), MDs);
+ FollowupLoopID->replaceOperandWith(0, FollowupLoopID);
+ return FollowupLoopID;
+}
+
+bool llvm::hasDisableAllTransformsHint(const Loop *L) {
+ return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced);
+}
+
+bool llvm::hasDisableLICMTransformsHint(const Loop *L) {
+ return getBooleanLoopAttribute(L, LLVMLoopDisableLICM);
+}
+
bool llvm::hasMustProgress(const Loop *L) {
return getBooleanLoopAttribute(L, LLVMLoopMustProgress);
}
-TransformationMode llvm::hasUnrollTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable"))
- return TM_SuppressedByUser;
-
- Optional<int> Count =
- getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count");
- if (Count.hasValue())
- return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable"))
- return TM_ForcedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll.full"))
- return TM_ForcedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasUnrollAndJamTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.disable"))
- return TM_SuppressedByUser;
-
- Optional<int> Count =
- getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count");
- if (Count.hasValue())
- return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable"))
- return TM_ForcedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasVectorizeTransformation(Loop *L) {
- Optional<bool> Enable =
- getOptionalBoolLoopAttribute(L, "llvm.loop.vectorize.enable");
-
- if (Enable == false)
- return TM_SuppressedByUser;
-
+TransformationMode llvm::hasUnrollTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable"))
+ return TM_SuppressedByUser;
+
+ Optional<int> Count =
+ getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count");
+ if (Count.hasValue())
+ return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable"))
+ return TM_ForcedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll.full"))
+ return TM_ForcedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasUnrollAndJamTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.disable"))
+ return TM_SuppressedByUser;
+
+ Optional<int> Count =
+ getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count");
+ if (Count.hasValue())
+ return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable"))
+ return TM_ForcedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasVectorizeTransformation(Loop *L) {
+ Optional<bool> Enable =
+ getOptionalBoolLoopAttribute(L, "llvm.loop.vectorize.enable");
+
+ if (Enable == false)
+ return TM_SuppressedByUser;
+
Optional<ElementCount> VectorizeWidth =
getOptionalElementCountLoopAttribute(L);
- Optional<int> InterleaveCount =
- getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
-
- // 'Forcing' vector width and interleave count to one effectively disables
- // this transformation.
+ Optional<int> InterleaveCount =
+ getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+
+ // 'Forcing' vector width and interleave count to one effectively disables
+ // this transformation.
if (Enable == true && VectorizeWidth && VectorizeWidth->isScalar() &&
InterleaveCount == 1)
- return TM_SuppressedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
- return TM_Disable;
-
- if (Enable == true)
- return TM_ForcedByUser;
-
+ return TM_SuppressedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
+ return TM_Disable;
+
+ if (Enable == true)
+ return TM_ForcedByUser;
+
if ((VectorizeWidth && VectorizeWidth->isScalar()) && InterleaveCount == 1)
- return TM_Disable;
-
+ return TM_Disable;
+
if ((VectorizeWidth && VectorizeWidth->isVector()) || InterleaveCount > 1)
- return TM_Enable;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasDistributeTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.distribute.enable"))
- return TM_ForcedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasLICMVersioningTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.licm_versioning.disable"))
- return TM_SuppressedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-/// Does a BFS from a given node to all of its children inside a given loop.
-/// The returned vector of nodes includes the starting point.
-SmallVector<DomTreeNode *, 16>
-llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) {
- SmallVector<DomTreeNode *, 16> Worklist;
- auto AddRegionToWorklist = [&](DomTreeNode *DTN) {
- // Only include subregions in the top level loop.
- BasicBlock *BB = DTN->getBlock();
- if (CurLoop->contains(BB))
- Worklist.push_back(DTN);
- };
-
- AddRegionToWorklist(N);
-
- for (size_t I = 0; I < Worklist.size(); I++) {
- for (DomTreeNode *Child : Worklist[I]->children())
- AddRegionToWorklist(Child);
- }
-
- return Worklist;
-}
-
-void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
- LoopInfo *LI, MemorySSA *MSSA) {
- assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!");
- auto *Preheader = L->getLoopPreheader();
- assert(Preheader && "Preheader should exist!");
-
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
-
- // Now that we know the removal is safe, remove the loop by changing the
- // branch from the preheader to go to the single exit block.
- //
- // Because we're deleting a large chunk of code at once, the sequence in which
- // we remove things is very important to avoid invalidation issues.
-
- // Tell ScalarEvolution that the loop is deleted. Do this before
- // deleting the loop so that ScalarEvolution can look at the loop
- // to determine what it needs to clean up.
- if (SE)
- SE->forgetLoop(L);
-
- auto *OldBr = dyn_cast<BranchInst>(Preheader->getTerminator());
- assert(OldBr && "Preheader must end with a branch");
- assert(OldBr->isUnconditional() && "Preheader must have a single successor");
- // Connect the preheader to the exit block. Keep the old edge to the header
- // around to perform the dominator tree update in two separate steps
- // -- #1 insertion of the edge preheader -> exit and #2 deletion of the edge
- // preheader -> header.
- //
- //
- // 0. Preheader 1. Preheader 2. Preheader
- // | | | |
- // V | V |
- // Header <--\ | Header <--\ | Header <--\
- // | | | | | | | | | | |
- // | V | | | V | | | V |
- // | Body --/ | | Body --/ | | Body --/
- // V V V V V
- // Exit Exit Exit
- //
- // By doing this in two separate steps we can perform the dominator tree
- // update without using the batch update API.
- //
- // Even when the loop is never executed, we cannot remove the edge from the
- // source block to the exit block. Consider the case where the unexecuted loop
- // branches back to an outer loop. If we deleted the loop and removed the edge
- // coming to this inner loop, this will break the outer loop structure (by
- // deleting the backedge of the outer loop). If the outer loop is indeed a
- // non-loop, it will be deleted in a future iteration of loop deletion pass.
- IRBuilder<> Builder(OldBr);
-
+ return TM_Enable;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasDistributeTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.distribute.enable"))
+ return TM_ForcedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasLICMVersioningTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.licm_versioning.disable"))
+ return TM_SuppressedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+/// Does a BFS from a given node to all of its children inside a given loop.
+/// The returned vector of nodes includes the starting point.
+SmallVector<DomTreeNode *, 16>
+llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) {
+ SmallVector<DomTreeNode *, 16> Worklist;
+ auto AddRegionToWorklist = [&](DomTreeNode *DTN) {
+ // Only include subregions in the top level loop.
+ BasicBlock *BB = DTN->getBlock();
+ if (CurLoop->contains(BB))
+ Worklist.push_back(DTN);
+ };
+
+ AddRegionToWorklist(N);
+
+ for (size_t I = 0; I < Worklist.size(); I++) {
+ for (DomTreeNode *Child : Worklist[I]->children())
+ AddRegionToWorklist(Child);
+ }
+
+ return Worklist;
+}
+
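// [Editor's note, not part of the diff] A standalone model of the worklist
// pattern used by collectChildrenInLoop above: scan the vector by index while
// appending to it, which is a recursion- and queue-free BFS. ToyNode and
// collectChildren are illustrative stand-ins for DomTreeNode and the loop
// membership test, not LLVM APIs.
#include <cstddef>
#include <vector>

struct ToyNode {
  bool InLoop;                     // plays the role of CurLoop->contains(BB)
  std::vector<ToyNode *> Children; // plays the role of DomTreeNode::children()
};

inline std::vector<ToyNode *> collectChildren(ToyNode *Root) {
  std::vector<ToyNode *> Worklist;
  auto AddIfInLoop = [&](ToyNode *N) {
    if (N->InLoop)
      Worklist.push_back(N);
  };
  AddIfInLoop(Root);
  // Indexing (not iterators) makes it safe to grow the vector mid-walk.
  for (std::size_t I = 0; I < Worklist.size(); ++I)
    for (ToyNode *Child : Worklist[I]->Children)
      AddIfInLoop(Child);
  return Worklist; // includes the starting node, as the real helper documents
}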
+void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
+ LoopInfo *LI, MemorySSA *MSSA) {
+ assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!");
+ auto *Preheader = L->getLoopPreheader();
+ assert(Preheader && "Preheader should exist!");
+
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+
+ // Now that we know the removal is safe, remove the loop by changing the
+ // branch from the preheader to go to the single exit block.
+ //
+ // Because we're deleting a large chunk of code at once, the sequence in which
+ // we remove things is very important to avoid invalidation issues.
+
+ // Tell ScalarEvolution that the loop is deleted. Do this before
+ // deleting the loop so that ScalarEvolution can look at the loop
+ // to determine what it needs to clean up.
+ if (SE)
+ SE->forgetLoop(L);
+
+ auto *OldBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+ assert(OldBr && "Preheader must end with a branch");
+ assert(OldBr->isUnconditional() && "Preheader must have a single successor");
+ // Connect the preheader to the exit block. Keep the old edge to the header
+ // around to perform the dominator tree update in two separate steps
+ // -- #1 insertion of the edge preheader -> exit and #2 deletion of the edge
+ // preheader -> header.
+ //
+ //
+ // 0. Preheader 1. Preheader 2. Preheader
+ // | | | |
+ // V | V |
+ // Header <--\ | Header <--\ | Header <--\
+ // | | | | | | | | | | |
+ // | V | | | V | | | V |
+ // | Body --/ | | Body --/ | | Body --/
+ // V V V V V
+ // Exit Exit Exit
+ //
+ // By doing this in two separate steps we can perform the dominator tree
+ // update without using the batch update API.
+ //
+ // Even when the loop is never executed, we cannot remove the edge from the
+ // source block to the exit block. Consider the case where the unexecuted loop
+ // branches back to an outer loop. If we deleted the loop and removed the edge
+ // coming to this inner loop, this will break the outer loop structure (by
+ // deleting the backedge of the outer loop). If the outer loop is indeed a
+ // non-loop, it will be deleted in a future iteration of loop deletion pass.
+ IRBuilder<> Builder(OldBr);
+
auto *ExitBlock = L->getUniqueExitBlock();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
if (ExitBlock) {
assert(ExitBlock && "Should have a unique exit block!");
assert(L->hasDedicatedExits() && "Loop should have dedicated exits!");
-
+
Builder.CreateCondBr(Builder.getFalse(), L->getHeader(), ExitBlock);
// Remove the old branch. The conditional branch becomes a new terminator.
OldBr->eraseFromParent();
-
+
// Rewrite phis in the exit block to get their inputs from the Preheader
// instead of the exiting block.
for (PHINode &P : ExitBlock->phis()) {
@@ -623,7 +623,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
assert((P.getNumIncomingValues() == 1 &&
P.getIncomingBlock(PredIndex) == Preheader) &&
"Should have exactly one value and that's from the preheader!");
- }
+ }
if (DT) {
DTU.applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock}});
@@ -647,25 +647,25 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
Builder.SetInsertPoint(OldBr);
Builder.CreateUnreachable();
Preheader->getTerminator()->eraseFromParent();
- }
-
- if (DT) {
- DTU.applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}});
- if (MSSA) {
- MSSAU->applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}},
- *DT);
- SmallSetVector<BasicBlock *, 8> DeadBlockSet(L->block_begin(),
- L->block_end());
- MSSAU->removeBlocks(DeadBlockSet);
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- }
- }
-
- // Use a set to deduplicate and a vector to guarantee deterministic ordering.
- llvm::SmallDenseSet<std::pair<DIVariable *, DIExpression *>, 4> DeadDebugSet;
- llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
-
+ }
+
+ if (DT) {
+ DTU.applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}});
+ if (MSSA) {
+ MSSAU->applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}},
+ *DT);
+ SmallSetVector<BasicBlock *, 8> DeadBlockSet(L->block_begin(),
+ L->block_end());
+ MSSAU->removeBlocks(DeadBlockSet);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+ }
+
+ // Use a set to deduplicate and a vector to guarantee deterministic ordering.
+ llvm::SmallDenseSet<std::pair<DIVariable *, DIExpression *>, 4> DeadDebugSet;
+ llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
+
if (ExitBlock) {
// Given LCSSA form is satisfied, we should not have users of instructions
// within the dead loop outside of the loop. However, LCSSA doesn't take
@@ -701,8 +701,8 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
continue;
DeadDebugSet.insert({DVI->getVariable(), DVI->getExpression()});
DeadDebugInst.push_back(DVI);
- }
-
+ }
+
// After the loop has been deleted all the values defined and modified
// inside the loop are going to be unavailable.
// Since debug values in the loop have been deleted, inserting an undef
@@ -718,49 +718,49 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
DVI->getVariable(), DVI->getExpression(),
DVI->getDebugLoc(), InsertDbgValueBefore);
}
-
- // Remove the block from the reference counting scheme, so that we can
- // delete it freely later.
- for (auto *Block : L->blocks())
- Block->dropAllReferences();
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- if (LI) {
- // Erase the instructions and the blocks without having to worry
- // about ordering because we already dropped the references.
- // NOTE: This iteration is safe because erasing the block does not remove
- // its entry from the loop's block list. We do that in the next section.
- for (Loop::block_iterator LpI = L->block_begin(), LpE = L->block_end();
- LpI != LpE; ++LpI)
- (*LpI)->eraseFromParent();
-
- // Finally, the blocks from loopinfo. This has to happen late because
- // otherwise our loop iterators won't work.
-
- SmallPtrSet<BasicBlock *, 8> blocks;
- blocks.insert(L->block_begin(), L->block_end());
- for (BasicBlock *BB : blocks)
- LI->removeBlock(BB);
-
- // The last step is to update LoopInfo now that we've eliminated this loop.
- // Note: LoopInfo::erase removes the given loop and relinks its subloops with
- // its parent, while removeLoop/removeChildLoop remove the given loop but do
- // not relink its subloops, which is what we want.
- if (Loop *ParentLoop = L->getParentLoop()) {
- Loop::iterator I = find(*ParentLoop, L);
- assert(I != ParentLoop->end() && "Couldn't find loop");
- ParentLoop->removeChildLoop(I);
- } else {
- Loop::iterator I = find(*LI, L);
- assert(I != LI->end() && "Couldn't find loop");
- LI->removeLoop(I);
- }
- LI->destroy(L);
- }
-}
-
+
+ // Remove the block from the reference counting scheme, so that we can
+ // delete it freely later.
+ for (auto *Block : L->blocks())
+ Block->dropAllReferences();
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ if (LI) {
+ // Erase the instructions and the blocks without having to worry
+ // about ordering because we already dropped the references.
+ // NOTE: This iteration is safe because erasing the block does not remove
+ // its entry from the loop's block list. We do that in the next section.
+ for (Loop::block_iterator LpI = L->block_begin(), LpE = L->block_end();
+ LpI != LpE; ++LpI)
+ (*LpI)->eraseFromParent();
+
+ // Finally, the blocks from loopinfo. This has to happen late because
+ // otherwise our loop iterators won't work.
+
+ SmallPtrSet<BasicBlock *, 8> blocks;
+ blocks.insert(L->block_begin(), L->block_end());
+ for (BasicBlock *BB : blocks)
+ LI->removeBlock(BB);
+
+ // The last step is to update LoopInfo now that we've eliminated this loop.
+ // Note: LoopInfo::erase removes the given loop and relinks its subloops with
+ // its parent, while removeLoop/removeChildLoop remove the given loop but do
+ // not relink its subloops, which is what we want.
+ if (Loop *ParentLoop = L->getParentLoop()) {
+ Loop::iterator I = find(*ParentLoop, L);
+ assert(I != ParentLoop->end() && "Couldn't find loop");
+ ParentLoop->removeChildLoop(I);
+ } else {
+ Loop::iterator I = find(*LI, L);
+ assert(I != LI->end() && "Couldn't find loop");
+ LI->removeLoop(I);
+ }
+ LI->destroy(L);
+ }
+}
+
static Loop *getOutermostLoop(Loop *L) {
while (Loop *Parent = L->getParentLoop())
L = Parent;
@@ -806,227 +806,227 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
}
-/// Checks if \p L has a single exit through the latch block, except possibly
-/// for "deoptimizing" exits. Returns the branch instruction terminating the
-/// loop latch if the above check is successful, nullptr otherwise.
-static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- if (!Latch)
- return nullptr;
-
- BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
- return nullptr;
-
- assert((LatchBR->getSuccessor(0) == L->getHeader() ||
- LatchBR->getSuccessor(1) == L->getHeader()) &&
- "At least one edge out of the latch must go to the header");
-
- SmallVector<BasicBlock *, 4> ExitBlocks;
- L->getUniqueNonLatchExitBlocks(ExitBlocks);
- if (any_of(ExitBlocks, [](const BasicBlock *EB) {
- return !EB->getTerminatingDeoptimizeCall();
- }))
- return nullptr;
-
- return LatchBR;
-}
-
-Optional<unsigned>
-llvm::getLoopEstimatedTripCount(Loop *L,
- unsigned *EstimatedLoopInvocationWeight) {
- // Support loops with an exiting latch where all other exits only
- // deoptimize.
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
- if (!LatchBranch)
- return None;
-
- // To estimate the number of times the loop body was executed, we want to
- // know the number of times the backedge was taken, vs. the number of times
- // we exited the loop.
- uint64_t BackedgeTakenWeight, LatchExitWeight;
- if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
- return None;
-
- if (LatchBranch->getSuccessor(0) != L->getHeader())
- std::swap(BackedgeTakenWeight, LatchExitWeight);
-
- if (!LatchExitWeight)
- return None;
-
- if (EstimatedLoopInvocationWeight)
- *EstimatedLoopInvocationWeight = LatchExitWeight;
-
- // The estimated backedge-taken count is the ratio of the backedge-taken weight
- // to the weight of the loop-exiting edge, rounded to the nearest integer.
- uint64_t BackedgeTakenCount =
- llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight);
- // Estimated trip count is one plus estimated backedge taken count.
- return BackedgeTakenCount + 1;
-}
-
-bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
- unsigned EstimatedloopInvocationWeight) {
- // Support loops with an exiting latch where all other exits only
- // deoptimize.
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
- if (!LatchBranch)
- return false;
-
- // Calculate taken and exit weights.
- unsigned LatchExitWeight = 0;
- unsigned BackedgeTakenWeight = 0;
-
- if (EstimatedTripCount > 0) {
- LatchExitWeight = EstimatedloopInvocationWeight;
- BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
- }
-
- // Swap the weights if the backedge is taken when the condition is "false".
- if (LatchBranch->getSuccessor(0) != L->getHeader())
- std::swap(BackedgeTakenWeight, LatchExitWeight);
-
- MDBuilder MDB(LatchBranch->getContext());
-
- // Set/Update profile metadata.
- LatchBranch->setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
-
- return true;
-}
-
-bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
- ScalarEvolution &SE) {
- Loop *OuterL = InnerLoop->getParentLoop();
- if (!OuterL)
- return true;
-
- // Get the backedge taken count for the inner loop
- BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- const SCEV *InnerLoopBECountSC = SE.getExitCount(InnerLoop, InnerLoopLatch);
- if (isa<SCEVCouldNotCompute>(InnerLoopBECountSC) ||
- !InnerLoopBECountSC->getType()->isIntegerTy())
- return false;
-
- // Get whether count is invariant to the outer loop
- ScalarEvolution::LoopDisposition LD =
- SE.getLoopDisposition(InnerLoopBECountSC, OuterL);
- if (LD != ScalarEvolution::LoopInvariant)
- return false;
-
- return true;
-}
-
+/// Checks if \p L has a single exit through the latch block, except possibly
+/// for "deoptimizing" exits. Returns the branch instruction terminating the
+/// loop latch if the above check is successful, nullptr otherwise.
+static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch)
+ return nullptr;
+
+ BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
+ return nullptr;
+
+ assert((LatchBR->getSuccessor(0) == L->getHeader() ||
+ LatchBR->getSuccessor(1) == L->getHeader()) &&
+ "At least one edge out of the latch must go to the header");
+
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L->getUniqueNonLatchExitBlocks(ExitBlocks);
+ if (any_of(ExitBlocks, [](const BasicBlock *EB) {
+ return !EB->getTerminatingDeoptimizeCall();
+ }))
+ return nullptr;
+
+ return LatchBR;
+}
+
+Optional<unsigned>
+llvm::getLoopEstimatedTripCount(Loop *L,
+ unsigned *EstimatedLoopInvocationWeight) {
+ // Support loops with an exiting latch where all other exits only
+ // deoptimize.
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return None;
+
+ // To estimate the number of times the loop body was executed, we want to
+ // know the number of times the backedge was taken, vs. the number of times
+ // we exited the loop.
+ uint64_t BackedgeTakenWeight, LatchExitWeight;
+ if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
+ return None;
+
+ if (LatchBranch->getSuccessor(0) != L->getHeader())
+ std::swap(BackedgeTakenWeight, LatchExitWeight);
+
+ if (!LatchExitWeight)
+ return None;
+
+ if (EstimatedLoopInvocationWeight)
+ *EstimatedLoopInvocationWeight = LatchExitWeight;
+
+ // The estimated backedge-taken count is the ratio of the backedge-taken weight
+ // to the weight of the loop-exiting edge, rounded to the nearest integer.
+ uint64_t BackedgeTakenCount =
+ llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight);
+ // Estimated trip count is one plus estimated backedge taken count.
+ return BackedgeTakenCount + 1;
+}
+
+bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
+ unsigned EstimatedloopInvocationWeight) {
+ // Support loops with an exiting latch where all other exits only
+ // deoptimize.
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return false;
+
+ // Calculate taken and exit weights.
+ unsigned LatchExitWeight = 0;
+ unsigned BackedgeTakenWeight = 0;
+
+ if (EstimatedTripCount > 0) {
+ LatchExitWeight = EstimatedloopInvocationWeight;
+ BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
+ }
+
+ // Swap the weights if the backedge is taken when the condition is "false".
+ if (LatchBranch->getSuccessor(0) != L->getHeader())
+ std::swap(BackedgeTakenWeight, LatchExitWeight);
+
+ MDBuilder MDB(LatchBranch->getContext());
+
+ // Set/Update profile metadata.
+ LatchBranch->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+
+ return true;
+}
+
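// [Editor's note, not part of the diff] Worked example of the branch-weight
// arithmetic in getLoopEstimatedTripCount / setLoopEstimatedTripCount above,
// on plain integers. divideNearestLocal re-implements round-to-nearest
// division locally so the snippet stands alone; it is assumed to match
// llvm::divideNearest for these small, non-overflowing values.
#include <cassert>
#include <cstdint>

inline uint64_t divideNearestLocal(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + Denominator / 2) / Denominator;
}

inline void tripCountRoundTrip() {
  // !prof weights on the latch branch: backedge taken 990 times per 10 exits.
  uint64_t BackedgeTakenWeight = 990, LatchExitWeight = 10;

  // getLoopEstimatedTripCount: trip count = round(990 / 10) + 1 = 100.
  uint64_t EstimatedTripCount =
      divideNearestLocal(BackedgeTakenWeight, LatchExitWeight) + 1;
  assert(EstimatedTripCount == 100);

  // setLoopEstimatedTripCount with trip count 100 and invocation weight 10
  // reproduces the original weights: (100 - 1) * 10 = 990 vs. 10.
  uint64_t NewLatchExitWeight = 10;
  uint64_t NewBackedgeTakenWeight =
      (EstimatedTripCount - 1) * NewLatchExitWeight;
  assert(NewBackedgeTakenWeight == 990);
  (void)NewBackedgeTakenWeight;
}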
+bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
+ ScalarEvolution &SE) {
+ Loop *OuterL = InnerLoop->getParentLoop();
+ if (!OuterL)
+ return true;
+
+ // Get the backedge taken count for the inner loop
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ const SCEV *InnerLoopBECountSC = SE.getExitCount(InnerLoop, InnerLoopLatch);
+ if (isa<SCEVCouldNotCompute>(InnerLoopBECountSC) ||
+ !InnerLoopBECountSC->getType()->isIntegerTy())
+ return false;
+
+ // Get whether count is invariant to the outer loop
+ ScalarEvolution::LoopDisposition LD =
+ SE.getLoopDisposition(InnerLoopBECountSC, OuterL);
+ if (LD != ScalarEvolution::LoopInvariant)
+ return false;
+
+ return true;
+}
+
Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
Value *Right) {
CmpInst::Predicate Pred;
- switch (RK) {
- default:
- llvm_unreachable("Unknown min/max recurrence kind");
+ switch (RK) {
+ default:
+ llvm_unreachable("Unknown min/max recurrence kind");
case RecurKind::UMin:
Pred = CmpInst::ICMP_ULT;
- break;
+ break;
case RecurKind::UMax:
Pred = CmpInst::ICMP_UGT;
- break;
+ break;
case RecurKind::SMin:
Pred = CmpInst::ICMP_SLT;
- break;
+ break;
case RecurKind::SMax:
Pred = CmpInst::ICMP_SGT;
- break;
+ break;
case RecurKind::FMin:
Pred = CmpInst::FCMP_OLT;
- break;
+ break;
case RecurKind::FMax:
Pred = CmpInst::FCMP_OGT;
- break;
- }
-
- // We only match FP sequences that are 'fast', so we can unconditionally
- // set it on any generated instructions.
- IRBuilderBase::FastMathFlagGuard FMFG(Builder);
- FastMathFlags FMF;
- FMF.setFast();
- Builder.setFastMathFlags(FMF);
+ break;
+ }
+
+ // We only match FP sequences that are 'fast', so we can unconditionally
+ // set it on any generated instructions.
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ FastMathFlags FMF;
+ FMF.setFast();
+ Builder.setFastMathFlags(FMF);
Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp");
- Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
- return Select;
-}
-
-// Helper to generate an ordered reduction.
+ Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+ return Select;
+}
+
+// Helper to generate an ordered reduction.
Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
unsigned Op, RecurKind RdxKind,
ArrayRef<Value *> RedOps) {
- unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
-
- // Extract and apply reduction ops in ascending order:
- // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
- Value *Result = Acc;
- for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
- Value *Ext =
- Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
- Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
- "bin.rdx");
- } else {
+ unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
+
+ // Extract and apply reduction ops in ascending order:
+ // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
+ Value *Result = Acc;
+ for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+ Value *Ext =
+ Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+ "bin.rdx");
+ } else {
assert(RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind) &&
- "Invalid min/max");
+ "Invalid min/max");
Result = createMinMaxOp(Builder, RdxKind, Result, Ext);
- }
-
- if (!RedOps.empty())
- propagateIRFlags(Result, RedOps);
- }
-
- return Result;
-}
-
-// Helper to generate a log2 shuffle reduction.
+ }
+
+ if (!RedOps.empty())
+ propagateIRFlags(Result, RedOps);
+ }
+
+ return Result;
+}
+
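// [Editor's note, not part of the diff] Scalar analogue of getOrderedReduction
// above: lanes are folded into the accumulator strictly in ascending index
// order, i.e. ((((Acc + Src[0]) + Src[1]) + ...) + Src[VF-1]), which preserves
// the floating-point association, unlike the log2 shuffle reduction further
// down. Illustrative sketch only, not an LLVM API.
#include <vector>

inline float orderedReduce(float Acc, const std::vector<float> &Src) {
  for (float Lane : Src)
    Acc = Acc + Lane; // one extract-and-add per lane, in order
  return Acc;
}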
+// Helper to generate a log2 shuffle reduction.
Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
unsigned Op, RecurKind RdxKind,
ArrayRef<Value *> RedOps) {
- unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = Src;
- SmallVector<int, 32> ShuffleMask(VF);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = i / 2 + j;
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
-
+ unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<int, 32> ShuffleMask(VF);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = i / 2 + j;
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
+
Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
- // The builder propagates its fast-math-flags setting.
- TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
- "bin.rdx");
- } else {
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // The builder propagates its fast-math-flags setting.
+ TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+ "bin.rdx");
+ } else {
assert(RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind) &&
- "Invalid min/max");
+ "Invalid min/max");
TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf);
- }
- if (!RedOps.empty())
- propagateIRFlags(TmpVec, RedOps);
-
- // We may compute the reassociated scalar ops in a way that does not
- // preserve nsw/nuw etc. Conservatively, drop those flags.
- if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec))
- ReductionInst->dropPoisonGeneratingFlags();
- }
- // The result is in the first element of the vector.
- return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-}
-
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+
+ // We may compute the reassociated scalar ops in a way that does not
+ // preserve nsw/nuw etc. Conservatively, drop those flags.
+ if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec))
+ ReductionInst->dropPoisonGeneratingFlags();
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
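// [Editor's note, not part of the diff] Scalar model of the log2 shuffle
// reduction in getShuffleReduction above: each round folds the upper half of
// the live lanes into the lower half, so a power-of-two VF needs log2(VF)
// rounds and the result lands in element 0. A plain vector replaces the
// shuffle; halvingReduce is an illustrative name, not an LLVM API.
#include <cassert>
#include <cstddef>
#include <vector>

inline float halvingReduce(std::vector<float> Vec) {
  const std::size_t VF = Vec.size();
  assert(VF != 0 && (VF & (VF - 1)) == 0 &&
         "pow2 vectors only, as in the helper");
  for (std::size_t Width = VF; Width != 1; Width /= 2)
    for (std::size_t J = 0; J != Width / 2; ++J)
      Vec[J] = Vec[J] + Vec[Width / 2 + J]; // lane j += lane (Width/2 + j)
  return Vec[0]; // the reduced value is in the first element
}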
Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
const TargetTransformInfo *TTI,
Value *Src, RecurKind RdxKind,
@@ -1039,7 +1039,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
if (!ForceReductionIntrinsic &&
!TTI->useReductionIntrinsic(Opcode, Src->getType(), RdxFlags))
return getShuffleReduction(Builder, Src, Opcode, RdxKind, RedOps);
-
+
auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
switch (RdxKind) {
case RecurKind::Add:
@@ -1069,656 +1069,656 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
return Builder.CreateFPMaxReduce(Src);
case RecurKind::FMin:
return Builder.CreateFPMinReduce(Src);
- default:
- llvm_unreachable("Unhandled opcode");
- }
-}
-
-Value *llvm::createTargetReduction(IRBuilderBase &B,
- const TargetTransformInfo *TTI,
+ default:
+ llvm_unreachable("Unhandled opcode");
+ }
+}
+
+Value *llvm::createTargetReduction(IRBuilderBase &B,
+ const TargetTransformInfo *TTI,
RecurrenceDescriptor &Desc, Value *Src) {
- // TODO: Support in-order reductions based on the recurrence descriptor.
- // All ops in the reduction inherit fast-math-flags from the recurrence
- // descriptor.
- IRBuilderBase::FastMathFlagGuard FMFGuard(B);
- B.setFastMathFlags(Desc.getFastMathFlags());
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ // All ops in the reduction inherit fast-math-flags from the recurrence
+ // descriptor.
+ IRBuilderBase::FastMathFlagGuard FMFGuard(B);
+ B.setFastMathFlags(Desc.getFastMathFlags());
return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind());
-}
-
-void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
- auto *VecOp = dyn_cast<Instruction>(I);
- if (!VecOp)
- return;
- auto *Intersection = (OpValue == nullptr) ? dyn_cast<Instruction>(VL[0])
- : dyn_cast<Instruction>(OpValue);
- if (!Intersection)
- return;
- const unsigned Opcode = Intersection->getOpcode();
- VecOp->copyIRFlags(Intersection);
- for (auto *V : VL) {
- auto *Instr = dyn_cast<Instruction>(V);
- if (!Instr)
- continue;
- if (OpValue == nullptr || Opcode == Instr->getOpcode())
- VecOp->andIRFlags(V);
- }
-}
-
-bool llvm::isKnownNegativeInLoop(const SCEV *S, const Loop *L,
- ScalarEvolution &SE) {
- const SCEV *Zero = SE.getZero(S->getType());
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, S, Zero);
-}
-
-bool llvm::isKnownNonNegativeInLoop(const SCEV *S, const Loop *L,
- ScalarEvolution &SE) {
- const SCEV *Zero = SE.getZero(S->getType());
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, S, Zero);
-}
-
-bool llvm::cannotBeMinInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
- bool Signed) {
- unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
- APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) :
- APInt::getMinValue(BitWidth);
- auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, Predicate, S,
- SE.getConstant(Min));
-}
-
-bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
- bool Signed) {
- unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
- APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) :
- APInt::getMaxValue(BitWidth);
- auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, Predicate, S,
- SE.getConstant(Max));
-}
-
-//===----------------------------------------------------------------------===//
-// rewriteLoopExitValues - Optimize IV users outside the loop.
-// As a side effect, reduces the amount of IV processing within the loop.
-//===----------------------------------------------------------------------===//
-
-// Return true if the SCEV expansion generated by the rewriter can replace the
-// original value. SCEV guarantees that it produces the same value, but the way
-// it is produced may be illegal IR. Ideally, this function will only be
-// called for verification.
-static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) {
- // If an SCEV expression subsumed multiple pointers, its expansion could
- // reassociate the GEP changing the base pointer. This is illegal because the
- // final address produced by a GEP chain must be inbounds relative to its
- // underlying object. Otherwise basic alias analysis, among other things,
- // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
- // producing an expression involving multiple pointers. Until then, we must
- // bail out here.
- //
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
+ auto *VecOp = dyn_cast<Instruction>(I);
+ if (!VecOp)
+ return;
+ auto *Intersection = (OpValue == nullptr) ? dyn_cast<Instruction>(VL[0])
+ : dyn_cast<Instruction>(OpValue);
+ if (!Intersection)
+ return;
+ const unsigned Opcode = Intersection->getOpcode();
+ VecOp->copyIRFlags(Intersection);
+ for (auto *V : VL) {
+ auto *Instr = dyn_cast<Instruction>(V);
+ if (!Instr)
+ continue;
+ if (OpValue == nullptr || Opcode == Instr->getOpcode())
+ VecOp->andIRFlags(V);
+ }
+}
+
+bool llvm::isKnownNegativeInLoop(const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(S->getType());
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, S, Zero);
+}
+
+bool llvm::isKnownNonNegativeInLoop(const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(S->getType());
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, S, Zero);
+}
+
+bool llvm::cannotBeMinInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
+ bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
+ APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, S,
+ SE.getConstant(Min));
+}
+
+bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
+ bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
+ APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, S,
+ SE.getConstant(Max));
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+// Return true if the SCEV expansion generated by the rewriter can replace the
+// original value. SCEV guarantees that it produces the same value, but the way
+// it is produced may be illegal IR. Ideally, this function will only be
+// called for verification.
+static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) {
+ // If an SCEV expression subsumed multiple pointers, its expansion could
+ // reassociate the GEP changing the base pointer. This is illegal because the
+ // final address produced by a GEP chain must be inbounds relative to its
+ // underlying object. Otherwise basic alias analysis, among other things,
+ // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+ // producing an expression involving multiple pointers. Until then, we must
+ // bail out here.
+ //
// Retrieve the pointer operand of the GEP. Don't use getUnderlyingObject
- // because it understands lcssa phis while SCEV does not.
- Value *FromPtr = FromVal;
- Value *ToPtr = ToVal;
- if (auto *GEP = dyn_cast<GEPOperator>(FromVal))
- FromPtr = GEP->getPointerOperand();
-
- if (auto *GEP = dyn_cast<GEPOperator>(ToVal))
- ToPtr = GEP->getPointerOperand();
-
- if (FromPtr != FromVal || ToPtr != ToVal) {
- // Quickly check the common case
- if (FromPtr == ToPtr)
- return true;
-
- // SCEV may have rewritten an expression that produces the GEP's pointer
- // operand. That's ok as long as the pointer operand has the same base
+ // because it understands lcssa phis while SCEV does not.
+ Value *FromPtr = FromVal;
+ Value *ToPtr = ToVal;
+ if (auto *GEP = dyn_cast<GEPOperator>(FromVal))
+ FromPtr = GEP->getPointerOperand();
+
+ if (auto *GEP = dyn_cast<GEPOperator>(ToVal))
+ ToPtr = GEP->getPointerOperand();
+
+ if (FromPtr != FromVal || ToPtr != ToVal) {
+ // Quickly check the common case
+ if (FromPtr == ToPtr)
+ return true;
+
+ // SCEV may have rewritten an expression that produces the GEP's pointer
+ // operand. That's ok as long as the pointer operand has the same base
// pointer. Unlike getUnderlyingObject(), getPointerBase() will find the
- // base of a recurrence. This handles the case in which SCEV expansion
- // converts a pointer type recurrence into a nonrecurrent pointer base
- // indexed by an integer recurrence.
-
- // If the GEP base pointer is a vector of pointers, abort.
- if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
- return false;
-
- const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
- const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
- if (FromBase == ToBase)
- return true;
-
- LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: GEP rewrite bail out "
- << *FromBase << " != " << *ToBase << "\n");
-
- return false;
- }
- return true;
-}
-
-static bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) {
- SmallPtrSet<const Instruction *, 8> Visited;
- SmallVector<const Instruction *, 8> WorkList;
- Visited.insert(I);
- WorkList.push_back(I);
- while (!WorkList.empty()) {
- const Instruction *Curr = WorkList.pop_back_val();
- // This use is outside the loop, nothing to do.
- if (!L->contains(Curr))
- continue;
- // Do we assume it is a "hard" use which will not be eliminated easily?
- if (Curr->mayHaveSideEffects())
- return true;
- // Otherwise, add all its users to worklist.
- for (auto U : Curr->users()) {
- auto *UI = cast<Instruction>(U);
- if (Visited.insert(UI).second)
- WorkList.push_back(UI);
- }
- }
- return false;
-}
-
-// Collect information about PHI nodes which can be transformed in
-// rewriteLoopExitValues.
-struct RewritePhi {
- PHINode *PN; // For which PHI node is this replacement?
- unsigned Ith; // For which incoming value?
- const SCEV *ExpansionSCEV; // The SCEV of the incoming value we are rewriting.
- Instruction *ExpansionPoint; // Where we'd like to expand that SCEV?
- bool HighCost; // Is this expansion a high-cost?
-
- Value *Expansion = nullptr;
- bool ValidRewrite = false;
-
- RewritePhi(PHINode *P, unsigned I, const SCEV *Val, Instruction *ExpansionPt,
- bool H)
- : PN(P), Ith(I), ExpansionSCEV(Val), ExpansionPoint(ExpansionPt),
- HighCost(H) {}
-};
-
-// Check whether it is possible to delete the loop after rewriting exit
-// value. If it is possible, ignore ReplaceExitValue and do rewriting
-// aggressively.
-static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
- BasicBlock *Preheader = L->getLoopPreheader();
- // If there is no preheader, the loop will not be deleted.
- if (!Preheader)
- return false;
-
- // In the LoopDeletion pass, a loop can be deleted even when ExitingBlocks.size() > 1.
- // We sidestep the multiple-ExitingBlocks case for simplicity.
- // TODO: If we see a testcase where a loop with multiple ExitingBlocks can be
- // deleted after exit value rewriting, we can enhance the logic here.
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
- if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
- return false;
-
- BasicBlock *ExitBlock = ExitBlocks[0];
- BasicBlock::iterator BI = ExitBlock->begin();
- while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
-
- // If the Incoming value of P is found in RewritePhiSet, we know it
- // could be rewritten to use a loop invariant value in transformation
- // phase later. Skip it in the loop invariant check below.
- bool found = false;
- for (const RewritePhi &Phi : RewritePhiSet) {
- if (!Phi.ValidRewrite)
- continue;
- unsigned i = Phi.Ith;
- if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
- found = true;
- break;
- }
- }
-
- Instruction *I;
- if (!found && (I = dyn_cast<Instruction>(Incoming)))
- if (!L->hasLoopInvariantOperands(I))
- return false;
-
- ++BI;
- }
-
- for (auto *BB : L->blocks())
- if (llvm::any_of(*BB, [](Instruction &I) {
- return I.mayHaveSideEffects();
- }))
- return false;
-
- return true;
-}
-
-int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
- ScalarEvolution *SE,
- const TargetTransformInfo *TTI,
- SCEVExpander &Rewriter, DominatorTree *DT,
- ReplaceExitVal ReplaceExitValue,
- SmallVector<WeakTrackingVH, 16> &DeadInsts) {
- // Check a pre-condition.
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "Indvars did not preserve LCSSA!");
-
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- SmallVector<RewritePhi, 8> RewritePhiSet;
- // Find all values that are computed inside the loop, but used outside of it.
- // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
- // the exit blocks of the loop to find them.
- for (BasicBlock *ExitBB : ExitBlocks) {
- // If there are no PHI nodes in this exit block, then no values defined
- // inside the loop are used on this path, skip it.
- PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
- if (!PN) continue;
-
- unsigned NumPreds = PN->getNumIncomingValues();
-
- // Iterate over all of the PHI nodes.
- BasicBlock::iterator BBI = ExitBB->begin();
- while ((PN = dyn_cast<PHINode>(BBI++))) {
- if (PN->use_empty())
- continue; // dead use, don't replace it
-
- if (!SE->isSCEVable(PN->getType()))
- continue;
-
- // It's necessary to tell ScalarEvolution about this explicitly so that
- // it can walk the def-use list and forget all SCEVs, as it may not be
- // watching the PHI itself. Once the new exit value is in place, there
- // may not be a def-use connection between the loop and every instruction
- // which got a SCEVAddRecExpr for that loop.
- SE->forgetValue(PN);
-
- // Iterate over all of the values in all the PHI nodes.
- for (unsigned i = 0; i != NumPreds; ++i) {
- // If the value being merged in is not an integer or is not defined
- // in the loop, skip it.
- Value *InVal = PN->getIncomingValue(i);
- if (!isa<Instruction>(InVal))
- continue;
-
- // If this pred is for a subloop, not L itself, skip it.
- if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
- continue; // The Block is in a subloop, skip it.
-
- // Check that InVal is defined in the loop.
- Instruction *Inst = cast<Instruction>(InVal);
- if (!L->contains(Inst))
- continue;
-
- // Okay, this instruction has a user outside of the current loop
- // and varies predictably *inside* the loop. Evaluate the value it
- // contains when the loop exits, if possible. We prefer to start with
- // expressions which are true for all exits (so as to maximize
- // expression reuse by the SCEVExpander), but resort to per-exit
- // evaluation if that fails.
- const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
- if (isa<SCEVCouldNotCompute>(ExitValue) ||
- !SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE)) {
- // TODO: This should probably be sunk into SCEV in some way; maybe a
- // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
- // most SCEV expressions and other recurrence types (e.g. shift
- // recurrences). Is there existing code we can reuse?
- const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i));
- if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
- if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst)))
- if (AddRec->getLoop() == L)
- ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
- if (isa<SCEVCouldNotCompute>(ExitValue) ||
- !SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE))
- continue;
- }
-
- // Computing the value outside of the loop brings no benefit if it is
- // definitely used inside the loop in a way which cannot be optimized
- // away. Avoid doing so unless we know we have a value which computes
- // the ExitValue already. TODO: This should be merged into the SCEV
- // expander to leverage its knowledge of existing expressions.
- if (ReplaceExitValue != AlwaysRepl && !isa<SCEVConstant>(ExitValue) &&
- !isa<SCEVUnknown>(ExitValue) && hasHardUserWithinLoop(L, Inst))
- continue;
-
- // Check if expansions of this SCEV would count as being high cost.
- bool HighCost = Rewriter.isHighCostExpansion(
- ExitValue, L, SCEVCheapExpansionBudget, TTI, Inst);
-
- // Note that we must not perform expansions until after we have queried *all*
- // the costs, because if we perform a temporary expansion in between, one that
- // we might not intend to keep, said expansion *may* affect the cost
- // calculation of the next SCEVs we query, and the next SCEV may erroneously
- // get a smaller cost.
-
- // Collect all the candidate PHINodes to be rewritten.
- RewritePhiSet.emplace_back(PN, i, ExitValue, Inst, HighCost);
- }
- }
- }
-
- // Now that we've done preliminary filtering and billed all the SCEV's,
- // we can perform the last sanity check - the expansion must be valid.
- for (RewritePhi &Phi : RewritePhiSet) {
- Phi.Expansion = Rewriter.expandCodeFor(Phi.ExpansionSCEV, Phi.PN->getType(),
- Phi.ExpansionPoint);
-
- LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = "
- << *(Phi.Expansion) << '\n'
- << " LoopVal = " << *(Phi.ExpansionPoint) << "\n");
-
- // FIXME: isValidRewrite() is a hack. it should be an assert, eventually.
- Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion);
- if (!Phi.ValidRewrite) {
- DeadInsts.push_back(Phi.Expansion);
- continue;
- }
-
-#ifndef NDEBUG
- // If we reuse an instruction from a loop which is neither L nor one of
- // its containing loops, we end up breaking LCSSA form for this loop by
- // creating a new use of its instruction.
- if (auto *ExitInsn = dyn_cast<Instruction>(Phi.Expansion))
- if (auto *EVL = LI->getLoopFor(ExitInsn->getParent()))
- if (EVL != L)
- assert(EVL->contains(L) && "LCSSA breach detected!");
-#endif
- }
-
- // TODO: after isValidRewrite() is an assertion, evaluate whether
- // it is beneficial to change how we calculate high-cost:
- // if we have SCEV 'A' which we know we will expand, should we calculate
- // the cost of other SCEV's after expanding SCEV 'A',
- // thus potentially giving cost bonus to those other SCEV's?
-
- bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
- int NumReplaced = 0;
-
- // Transformation.
- for (const RewritePhi &Phi : RewritePhiSet) {
- if (!Phi.ValidRewrite)
- continue;
-
- PHINode *PN = Phi.PN;
- Value *ExitVal = Phi.Expansion;
-
- // Only do the rewrite when the ExitValue can be expanded cheaply.
- // If LoopCanBeDel is true, rewrite exit value aggressively.
- if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
- DeadInsts.push_back(ExitVal);
- continue;
- }
-
- NumReplaced++;
- Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
- PN->setIncomingValue(Phi.Ith, ExitVal);
-
- // If this instruction is dead now, delete it. Don't do it now to avoid
- // invalidating iterators.
- if (isInstructionTriviallyDead(Inst, TLI))
- DeadInsts.push_back(Inst);
-
- // Replace PN with ExitVal if that is legal and does not break LCSSA.
- if (PN->getNumIncomingValues() == 1 &&
- LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
- PN->replaceAllUsesWith(ExitVal);
- PN->eraseFromParent();
- }
- }
-
- // The insertion point instruction may have been deleted; clear it out
- // so that the rewriter doesn't trip over it later.
- Rewriter.clearInsertPoint();
- return NumReplaced;
-}
-
-/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
-/// \p OrigLoop.
-void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
- Loop *RemainderLoop, uint64_t UF) {
- assert(UF > 0 && "Zero unrolled factor is not supported");
- assert(UnrolledLoop != RemainderLoop &&
- "Unrolled and Remainder loops are expected to be distinct");
-
- // Get number of iterations in the original scalar loop.
- unsigned OrigLoopInvocationWeight = 0;
- Optional<unsigned> OrigAverageTripCount =
- getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
- if (!OrigAverageTripCount)
- return;
-
- // Calculate number of iterations in unrolled loop.
- unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
- // Calculate number of iterations for remainder loop.
- unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
-
- setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
- OrigLoopInvocationWeight);
- setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
- OrigLoopInvocationWeight);
-}
-
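// [Editor's note, not part of the diff] Worked example of the trip-count split
// in setProfileInfoAfterUnrolling above, on plain integers: with an estimated
// trip count of 100 and unroll factor UF = 3, the unrolled loop is credited
// 100 / 3 = 33 iterations and the remainder loop 100 % 3 = 1.
#include <cassert>

inline void unrollProfileSplit() {
  unsigned OrigAverageTripCount = 100;
  unsigned UF = 3;
  unsigned UnrolledAverageTripCount = OrigAverageTripCount / UF;  // 33
  unsigned RemainderAverageTripCount = OrigAverageTripCount % UF; // 1
  assert(UnrolledAverageTripCount == 33 && RemainderAverageTripCount == 1);
  (void)UnrolledAverageTripCount;
  (void)RemainderAverageTripCount;
}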
-/// Utility that implements appending of loops onto a worklist.
-/// Loops are added in preorder (analogous to reverse postorder for trees),
-/// and the worklist is processed LIFO.
-template <typename RangeT>
-void llvm::appendReversedLoopsToWorklist(
- RangeT &&Loops, SmallPriorityWorklist<Loop *, 4> &Worklist) {
- // We use an internal worklist to build up the preorder traversal without
- // recursion.
- SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
-
- // We walk the initial sequence of loops in reverse because we generally want
- // to visit defs before uses and the worklist is LIFO.
- for (Loop *RootL : Loops) {
- assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
- assert(PreOrderWorklist.empty() &&
- "Must start with an empty preorder walk worklist.");
- PreOrderWorklist.push_back(RootL);
- do {
- Loop *L = PreOrderWorklist.pop_back_val();
- PreOrderWorklist.append(L->begin(), L->end());
- PreOrderLoops.push_back(L);
- } while (!PreOrderWorklist.empty());
-
- Worklist.insert(std::move(PreOrderLoops));
- PreOrderLoops.clear();
- }
-}
-
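// [Editor's note, not part of the diff] Standalone model of the inner loop of
// appendReversedLoopsToWorklist above: an explicit LIFO worklist flattens one
// root's nest into preorder (each loop before its subloops) without recursion.
// Because the inner worklist is LIFO, siblings come out in reverse of the
// order they were appended. ToyLoop is an illustrative stand-in for llvm::Loop.
#include <vector>

struct ToyLoop {
  std::vector<ToyLoop *> SubLoops;
};

inline std::vector<ToyLoop *> preorderFlatten(ToyLoop *Root) {
  std::vector<ToyLoop *> PreOrder, Work;
  Work.push_back(Root);
  do {
    ToyLoop *L = Work.back();
    Work.pop_back();
    Work.insert(Work.end(), L->SubLoops.begin(), L->SubLoops.end());
    PreOrder.push_back(L); // a loop is recorded before any of its subloops
  } while (!Work.empty());
  return PreOrder; // this is what gets handed to the priority worklist
}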
-template <typename RangeT>
-void llvm::appendLoopsToWorklist(RangeT &&Loops,
- SmallPriorityWorklist<Loop *, 4> &Worklist) {
- appendReversedLoopsToWorklist(reverse(Loops), Worklist);
-}
-
-template void llvm::appendLoopsToWorklist<ArrayRef<Loop *> &>(
- ArrayRef<Loop *> &Loops, SmallPriorityWorklist<Loop *, 4> &Worklist);
-
-template void
-llvm::appendLoopsToWorklist<Loop &>(Loop &L,
- SmallPriorityWorklist<Loop *, 4> &Worklist);
-
-void llvm::appendLoopsToWorklist(LoopInfo &LI,
- SmallPriorityWorklist<Loop *, 4> &Worklist) {
- appendReversedLoopsToWorklist(LI, Worklist);
-}
-
-Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
- LoopInfo *LI, LPPassManager *LPM) {
- Loop &New = *LI->AllocateLoop();
- if (PL)
- PL->addChildLoop(&New);
- else
- LI->addTopLevelLoop(&New);
-
- if (LPM)
- LPM->addLoop(New);
-
- // Add all of the blocks in L to the new loop.
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I)
- if (LI->getLoopFor(*I) == L)
- New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
-
- // Add all of the subloops to the new loop.
- for (Loop *I : *L)
- cloneLoop(I, &New, VM, LI, LPM);
-
- return &New;
-}
-
-/// IR Values for the lower and upper bounds of a pointer evolution. We
-/// need to use value-handles because SCEV expansion can invalidate previously
-/// expanded values. Thus expansion of a pointer can invalidate the bounds for
-/// a previous one.
-struct PointerBounds {
- TrackingVH<Value> Start;
- TrackingVH<Value> End;
-};
-
-/// Expand code for the lower and upper bound of the pointer group \p CG
-/// in \p TheLoop. \return the values for the bounds.
-static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
- Loop *TheLoop, Instruction *Loc,
- SCEVExpander &Exp, ScalarEvolution *SE) {
- // TODO: Add helper to retrieve pointers to CG.
- Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue;
- const SCEV *Sc = SE->getSCEV(Ptr);
-
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- LLVMContext &Ctx = Loc->getContext();
-
- // Use this type for pointer arithmetic.
- Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
-
- if (SE->isLoopInvariant(Sc, TheLoop)) {
- LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
- << *Ptr << "\n");
- // Ptr could be in the loop body. If so, expand a new one at the correct
- // location.
- Instruction *Inst = dyn_cast<Instruction>(Ptr);
- Value *NewPtr = (Inst && TheLoop->contains(Inst))
- ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
- : Ptr;
- // We must return a half-open range, which means incrementing Sc.
- const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
- Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
- return {NewPtr, NewPtrPlusOne};
- } else {
- Value *Start = nullptr, *End = nullptr;
- LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
- Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
- End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
- LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
- << "\n");
- return {Start, End};
- }
-}
-
-/// Turns a collection of checks into a collection of expanded upper and
-/// lower bounds for both pointers in the check.
-static SmallVector<std::pair<PointerBounds, PointerBounds>, 4>
-expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
- Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) {
- SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
-
- // Here we're relying on the SCEV Expander's cache to only emit code for the
- // same bounds once.
- transform(PointerChecks, std::back_inserter(ChecksWithBounds),
- [&](const RuntimePointerCheck &Check) {
- PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE),
- Second =
- expandBounds(Check.second, L, Loc, Exp, SE);
- return std::make_pair(First, Second);
- });
-
- return ChecksWithBounds;
-}
-
-std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
- Instruction *Loc, Loop *TheLoop,
- const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
- ScalarEvolution *SE) {
- // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible.
- // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible
- const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
- SCEVExpander Exp(*SE, DL, "induction");
- auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp);
-
- LLVMContext &Ctx = Loc->getContext();
- Instruction *FirstInst = nullptr;
- IRBuilder<> ChkBuilder(Loc);
- // Our instructions might fold to a constant.
- Value *MemoryRuntimeCheck = nullptr;
-
- // FIXME: this helper is currently a duplicate of the one in
- // LoopVectorize.cpp.
- auto GetFirstInst = [](Instruction *FirstInst, Value *V,
- Instruction *Loc) -> Instruction * {
- if (FirstInst)
- return FirstInst;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return I->getParent() == Loc->getParent() ? I : nullptr;
- return nullptr;
- };
-
- for (const auto &Check : ExpandedChecks) {
- const PointerBounds &A = Check.first, &B = Check.second;
- // Check if two pointers (A and B) conflict where conflict is computed as:
- // start(A) <= end(B) && start(B) <= end(A)
- unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
- unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
-
- assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
- (AS1 == A.End->getType()->getPointerAddressSpace()) &&
- "Trying to bounds check pointers with different address spaces");
-
- Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
- Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
-
- Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
- Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
- Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
- Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
-
- // [A|B].Start points to the first accessed byte under base [A|B].
- // [A|B].End points to the last accessed byte, plus one.
- // There is no conflict when the intervals are disjoint:
- // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
- //
- // bound0 = (B.Start < A.End)
- // bound1 = (A.Start < B.End)
- // IsConflict = bound0 & bound1
- Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
- FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
- Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
- FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
- Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
- FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
- if (MemoryRuntimeCheck) {
- IsConflict =
- ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
- FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
- }
- MemoryRuntimeCheck = IsConflict;
- }
-
- if (!MemoryRuntimeCheck)
- return std::make_pair(nullptr, nullptr);
-
- // We have to do this trickery because the IRBuilder might fold the check to a
-  // constant expression in which case there is no Instruction anchored in
-  // the block.
- Instruction *Check =
- BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx));
- ChkBuilder.Insert(Check, "memcheck.conflict");
- FirstInst = GetFirstInst(FirstInst, Check, Loc);
- return std::make_pair(FirstInst, Check);
-}
+ // base of a recurrence. This handles the case in which SCEV expansion
+ // converts a pointer type recurrence into a nonrecurrent pointer base
+ // indexed by an integer recurrence.
+
+ // If the GEP base pointer is a vector of pointers, abort.
+ if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+ return false;
+
+ const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+ const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+ if (FromBase == ToBase)
+ return true;
+
+ LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: GEP rewrite bail out "
+ << *FromBase << " != " << *ToBase << "\n");
+
+ return false;
+ }
+ return true;
+}
+
+static bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) {
+ SmallPtrSet<const Instruction *, 8> Visited;
+ SmallVector<const Instruction *, 8> WorkList;
+ Visited.insert(I);
+ WorkList.push_back(I);
+ while (!WorkList.empty()) {
+ const Instruction *Curr = WorkList.pop_back_val();
+ // This use is outside the loop, nothing to do.
+ if (!L->contains(Curr))
+ continue;
+    // Is this a "hard" use which we assume will not be eliminated easily?
+ if (Curr->mayHaveSideEffects())
+ return true;
+ // Otherwise, add all its users to worklist.
+ for (auto U : Curr->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (Visited.insert(UI).second)
+ WorkList.push_back(UI);
+ }
+ }
+ return false;
+}
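The traversal above is the standard visited-set plus LIFO worklist idiom. A minimal standalone sketch of the same shape, using a hypothetical Node type instead of llvm::Instruction (illustrative only, not part of the LLVM sources):

#include <unordered_set>
#include <vector>

// Hypothetical stand-in for llvm::Instruction; "Users" plays the role of the
// def-use edges walked by hasHardUserWithinLoop().
struct Node {
  bool InsideLoop = true;
  bool HasSideEffects = false;
  std::vector<Node *> Users;
};

// Does any transitive user that stays inside the loop have side effects?
bool hasHardUser(Node *Root) {
  std::unordered_set<Node *> Visited{Root};
  std::vector<Node *> WorkList{Root};
  while (!WorkList.empty()) {
    Node *Curr = WorkList.back();
    WorkList.pop_back();
    if (!Curr->InsideLoop)
      continue;                     // Uses outside the loop are irrelevant.
    if (Curr->HasSideEffects)
      return true;                  // A "hard" user that cannot be removed.
    for (Node *U : Curr->Users)
      if (Visited.insert(U).second) // Enqueue each user exactly once.
        WorkList.push_back(U);
  }
  return false;
}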
+
+// Collect information about PHI nodes which can be transformed in
+// rewriteLoopExitValues.
+struct RewritePhi {
+ PHINode *PN; // For which PHI node is this replacement?
+ unsigned Ith; // For which incoming value?
+ const SCEV *ExpansionSCEV; // The SCEV of the incoming value we are rewriting.
+ Instruction *ExpansionPoint; // Where we'd like to expand that SCEV?
+ bool HighCost; // Is this expansion a high-cost?
+
+ Value *Expansion = nullptr;
+ bool ValidRewrite = false;
+
+ RewritePhi(PHINode *P, unsigned I, const SCEV *Val, Instruction *ExpansionPt,
+ bool H)
+ : PN(P), Ith(I), ExpansionSCEV(Val), ExpansionPoint(ExpansionPt),
+ HighCost(H) {}
+};
+
+// Check whether it is possible to delete the loop after rewriting exit
+// value. If it is possible, ignore ReplaceExitValue and do rewriting
+// aggressively.
+static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ // If there is no preheader, the loop will not be deleted.
+ if (!Preheader)
+ return false;
+
+  // The LoopDeletion pass can delete a loop even when ExitingBlocks.size() > 1;
+  // we only handle the single exiting block case here for simplicity.
+  // TODO: If we see a testcase where a loop with multiple exiting blocks can
+  // be deleted after exit value rewriting, we can enhance the logic here.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
+ return false;
+
+ BasicBlock *ExitBlock = ExitBlocks[0];
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // If the Incoming value of P is found in RewritePhiSet, we know it
+    // could be rewritten to use a loop invariant value in the transformation
+ // phase later. Skip it in the loop invariant check below.
+ bool found = false;
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ if (!Phi.ValidRewrite)
+ continue;
+ unsigned i = Phi.Ith;
+ if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
+ found = true;
+ break;
+ }
+ }
+
+ Instruction *I;
+ if (!found && (I = dyn_cast<Instruction>(Incoming)))
+ if (!L->hasLoopInvariantOperands(I))
+ return false;
+
+ ++BI;
+ }
+
+ for (auto *BB : L->blocks())
+ if (llvm::any_of(*BB, [](Instruction &I) {
+ return I.mayHaveSideEffects();
+ }))
+ return false;
+
+ return true;
+}
+
+int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
+ ScalarEvolution *SE,
+ const TargetTransformInfo *TTI,
+ SCEVExpander &Rewriter, DominatorTree *DT,
+ ReplaceExitVal ReplaceExitValue,
+ SmallVector<WeakTrackingVH, 16> &DeadInsts) {
+ // Check a pre-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ SmallVector<RewritePhi, 8> RewritePhiSet;
+ // Find all values that are computed inside the loop, but used outside of it.
+ // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
+ // the exit blocks of the loop to find them.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ // If there are no PHI nodes in this exit block, then no values defined
+ // inside the loop are used on this path, skip it.
+ PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+ if (!PN) continue;
+
+ unsigned NumPreds = PN->getNumIncomingValues();
+
+ // Iterate over all of the PHI nodes.
+ BasicBlock::iterator BBI = ExitBB->begin();
+ while ((PN = dyn_cast<PHINode>(BBI++))) {
+ if (PN->use_empty())
+ continue; // dead use, don't replace it
+
+ if (!SE->isSCEVable(PN->getType()))
+ continue;
+
+ // It's necessary to tell ScalarEvolution about this explicitly so that
+ // it can walk the def-use list and forget all SCEVs, as it may not be
+ // watching the PHI itself. Once the new exit value is in place, there
+ // may not be a def-use connection between the loop and every instruction
+ // which got a SCEVAddRecExpr for that loop.
+ SE->forgetValue(PN);
+
+ // Iterate over all of the values in all the PHI nodes.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ // If the value being merged in is not integer or is not defined
+ // in the loop, skip it.
+ Value *InVal = PN->getIncomingValue(i);
+ if (!isa<Instruction>(InVal))
+ continue;
+
+ // If this pred is for a subloop, not L itself, skip it.
+ if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+ continue; // The Block is in a subloop, skip it.
+
+ // Check that InVal is defined in the loop.
+ Instruction *Inst = cast<Instruction>(InVal);
+ if (!L->contains(Inst))
+ continue;
+
+ // Okay, this instruction has a user outside of the current loop
+ // and varies predictably *inside* the loop. Evaluate the value it
+ // contains when the loop exits, if possible. We prefer to start with
+ // expressions which are true for all exits (so as to maximize
+ // expression reuse by the SCEVExpander), but resort to per-exit
+ // evaluation if that fails.
+ const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE)) {
+ // TODO: This should probably be sunk into SCEV in some way; maybe a
+ // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
+ // most SCEV expressions and other recurrence types (e.g. shift
+ // recurrences). Is there existing code we can reuse?
+ const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i));
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+ if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst)))
+ if (AddRec->getLoop() == L)
+ ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
+ continue;
+ }
+
+ // Computing the value outside of the loop brings no benefit if it is
+        // definitely used inside the loop in a way which cannot be optimized
+ // away. Avoid doing so unless we know we have a value which computes
+ // the ExitValue already. TODO: This should be merged into SCEV
+ // expander to leverage its knowledge of existing expressions.
+ if (ReplaceExitValue != AlwaysRepl && !isa<SCEVConstant>(ExitValue) &&
+ !isa<SCEVUnknown>(ExitValue) && hasHardUserWithinLoop(L, Inst))
+ continue;
+
+ // Check if expansions of this SCEV would count as being high cost.
+ bool HighCost = Rewriter.isHighCostExpansion(
+ ExitValue, L, SCEVCheapExpansionBudget, TTI, Inst);
+
+ // Note that we must not perform expansions until after
+        // we query *all* the costs, because if we perform a temporary expansion
+        // in between, one that we might not intend to keep, said expansion
+        // *may* affect the cost calculation of the next SCEVs we'll query,
+        // and the next SCEV may erroneously get a smaller cost.
+
+ // Collect all the candidate PHINodes to be rewritten.
+ RewritePhiSet.emplace_back(PN, i, ExitValue, Inst, HighCost);
+ }
+ }
+ }
+
+  // Now that we've done preliminary filtering and billed all the SCEVs,
+ // we can perform the last sanity check - the expansion must be valid.
+ for (RewritePhi &Phi : RewritePhiSet) {
+ Phi.Expansion = Rewriter.expandCodeFor(Phi.ExpansionSCEV, Phi.PN->getType(),
+ Phi.ExpansionPoint);
+
+ LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = "
+ << *(Phi.Expansion) << '\n'
+ << " LoopVal = " << *(Phi.ExpansionPoint) << "\n");
+
+    // FIXME: isValidRewrite() is a hack. It should be an assert, eventually.
+ Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion);
+ if (!Phi.ValidRewrite) {
+ DeadInsts.push_back(Phi.Expansion);
+ continue;
+ }
+
+#ifndef NDEBUG
+ // If we reuse an instruction from a loop which is neither L nor one of
+ // its containing loops, we end up breaking LCSSA form for this loop by
+ // creating a new use of its instruction.
+ if (auto *ExitInsn = dyn_cast<Instruction>(Phi.Expansion))
+ if (auto *EVL = LI->getLoopFor(ExitInsn->getParent()))
+ if (EVL != L)
+ assert(EVL->contains(L) && "LCSSA breach detected!");
+#endif
+ }
+
+ // TODO: after isValidRewrite() is an assertion, evaluate whether
+ // it is beneficial to change how we calculate high-cost:
+ // if we have SCEV 'A' which we know we will expand, should we calculate
+ // the cost of other SCEV's after expanding SCEV 'A',
+ // thus potentially giving cost bonus to those other SCEV's?
+
+ bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+ int NumReplaced = 0;
+
+ // Transformation.
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ if (!Phi.ValidRewrite)
+ continue;
+
+ PHINode *PN = Phi.PN;
+ Value *ExitVal = Phi.Expansion;
+
+ // Only do the rewrite when the ExitValue can be expanded cheaply.
+ // If LoopCanBeDel is true, rewrite exit value aggressively.
+ if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ NumReplaced++;
+ Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
+ PN->setIncomingValue(Phi.Ith, ExitVal);
+
+ // If this instruction is dead now, delete it. Don't do it now to avoid
+ // invalidating iterators.
+ if (isInstructionTriviallyDead(Inst, TLI))
+ DeadInsts.push_back(Inst);
+
+ // Replace PN with ExitVal if that is legal and does not break LCSSA.
+ if (PN->getNumIncomingValues() == 1 &&
+ LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
+ PN->replaceAllUsesWith(ExitVal);
+ PN->eraseFromParent();
+ }
+ }
+
+ // The insertion point instruction may have been deleted; clear it out
+ // so that the rewriter doesn't trip over it later.
+ Rewriter.clearInsertPoint();
+ return NumReplaced;
+}
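As a hand-written illustration of what this rewrite buys (simplified to source level, not compiler output): a value that only escapes the loop through an LCSSA PHI is replaced by its closed-form value at the exit, after which the loop itself may become deletable.

// Before: the loop is kept alive only to compute the final induction value,
// which escapes through an LCSSA PHI in the exit block.
unsigned finalIndexLoop(unsigned N) {
  unsigned I = 0;
  for (; I < N; I += 2) {
    // no side effects
  }
  return I;
}

// After exit-value rewriting (conceptually): the SCEV of I evaluated at the
// exit replaces the PHI, and the now side-effect-free loop can be deleted.
// (Overflow at the very top of the unsigned range is ignored here.)
unsigned finalIndexClosedForm(unsigned N) {
  return (N + 1) / 2 * 2; // smallest multiple of 2 that is >= N
}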
+
+/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
+/// \p OrigLoop.
+void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+ Loop *RemainderLoop, uint64_t UF) {
+ assert(UF > 0 && "Zero unrolled factor is not supported");
+ assert(UnrolledLoop != RemainderLoop &&
+ "Unrolled and Remainder loops are expected to distinct");
+
+ // Get number of iterations in the original scalar loop.
+ unsigned OrigLoopInvocationWeight = 0;
+ Optional<unsigned> OrigAverageTripCount =
+ getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
+ if (!OrigAverageTripCount)
+ return;
+
+ // Calculate number of iterations in unrolled loop.
+ unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
+ // Calculate number of iterations for remainder loop.
+ unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
+
+ setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
+ OrigLoopInvocationWeight);
+ setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
+ OrigLoopInvocationWeight);
+}
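A small sketch of the bookkeeping above, using a hypothetical helper rather than an LLVM API: the estimated trip count is split between the unrolled body and the remainder loop, while the invocation weight is carried over unchanged.

#include <cassert>
#include <cstdint>
#include <utility>

// Returns {unrolled trip count, remainder trip count} for an original
// estimated trip count and unroll factor UF, mirroring the code above.
std::pair<unsigned, unsigned> splitTripCount(unsigned OrigTripCount,
                                             std::uint64_t UF) {
  assert(UF > 0 && "Zero unroll factor is not supported");
  unsigned Unrolled = static_cast<unsigned>(OrigTripCount / UF);  // unrolled loop
  unsigned Remainder = static_cast<unsigned>(OrigTripCount % UF); // remainder loop
  return {Unrolled, Remainder};
}

// Example: splitTripCount(10, 4) == {2, 2}; splitTripCount(8, 4) == {2, 0}.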
+
+/// Utility that implements appending of loops onto a worklist.
+/// Loops are added in preorder (analogous to reverse postorder for trees),
+/// and the worklist is processed LIFO.
+template <typename RangeT>
+void llvm::appendReversedLoopsToWorklist(
+ RangeT &&Loops, SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ // We use an internal worklist to build up the preorder traversal without
+ // recursion.
+ SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
+
+ // We walk the initial sequence of loops in reverse because we generally want
+ // to visit defs before uses and the worklist is LIFO.
+ for (Loop *RootL : Loops) {
+ assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
+ assert(PreOrderWorklist.empty() &&
+ "Must start with an empty preorder walk worklist.");
+ PreOrderWorklist.push_back(RootL);
+ do {
+ Loop *L = PreOrderWorklist.pop_back_val();
+ PreOrderWorklist.append(L->begin(), L->end());
+ PreOrderLoops.push_back(L);
+ } while (!PreOrderWorklist.empty());
+
+ Worklist.insert(std::move(PreOrderLoops));
+ PreOrderLoops.clear();
+ }
+}
+
+template <typename RangeT>
+void llvm::appendLoopsToWorklist(RangeT &&Loops,
+ SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ appendReversedLoopsToWorklist(reverse(Loops), Worklist);
+}
+
+template void llvm::appendLoopsToWorklist<ArrayRef<Loop *> &>(
+ ArrayRef<Loop *> &Loops, SmallPriorityWorklist<Loop *, 4> &Worklist);
+
+template void
+llvm::appendLoopsToWorklist<Loop &>(Loop &L,
+ SmallPriorityWorklist<Loop *, 4> &Worklist);
+
+void llvm::appendLoopsToWorklist(LoopInfo &LI,
+ SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ appendReversedLoopsToWorklist(LI, Worklist);
+}
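For readers new to the explicit-stack preorder walk used above, a minimal sketch on a hypothetical Tree type (not llvm::Loop); note that siblings come back out last-first because the stack is LIFO.

#include <vector>

// Hypothetical tree node standing in for a loop nest.
struct Tree {
  std::vector<Tree *> Children;
};

// Iterative preorder walk with an explicit stack, the same shape as the
// PreOrderWorklist loop above. Each node is recorded before its children.
std::vector<Tree *> preorder(Tree *Root) {
  std::vector<Tree *> Order;
  std::vector<Tree *> Stack{Root};
  while (!Stack.empty()) {
    Tree *N = Stack.back();
    Stack.pop_back();
    Order.push_back(N);
    Stack.insert(Stack.end(), N->Children.begin(), N->Children.end());
  }
  return Order;
}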
+
+Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+ LoopInfo *LI, LPPassManager *LPM) {
+ Loop &New = *LI->AllocateLoop();
+ if (PL)
+ PL->addChildLoop(&New);
+ else
+ LI->addTopLevelLoop(&New);
+
+ if (LPM)
+ LPM->addLoop(New);
+
+ // Add all of the blocks in L to the new loop.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ if (LI->getLoopFor(*I) == L)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *I : *L)
+ cloneLoop(I, &New, VM, LI, LPM);
+
+ return &New;
+}
+
+/// IR Values for the lower and upper bounds of a pointer evolution. We
+/// need to use value-handles because SCEV expansion can invalidate previously
+/// expanded values. Thus expansion of a pointer can invalidate the bounds for
+/// a previous one.
+struct PointerBounds {
+ TrackingVH<Value> Start;
+ TrackingVH<Value> End;
+};
+
+/// Expand code for the lower and upper bound of the pointer group \p CG
+/// in \p TheLoop. \return the values for the bounds.
+static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
+ Loop *TheLoop, Instruction *Loc,
+ SCEVExpander &Exp, ScalarEvolution *SE) {
+ // TODO: Add helper to retrieve pointers to CG.
+ Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue;
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ LLVMContext &Ctx = Loc->getContext();
+
+ // Use this type for pointer arithmetic.
+ Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+
+ if (SE->isLoopInvariant(Sc, TheLoop)) {
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
+ << *Ptr << "\n");
+ // Ptr could be in the loop body. If so, expand a new one at the correct
+ // location.
+ Instruction *Inst = dyn_cast<Instruction>(Ptr);
+ Value *NewPtr = (Inst && TheLoop->contains(Inst))
+ ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
+ : Ptr;
+ // We must return a half-open range, which means incrementing Sc.
+ const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
+ Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
+ return {NewPtr, NewPtrPlusOne};
+ } else {
+ Value *Start = nullptr, *End = nullptr;
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
+ Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
+ End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
+ LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
+ << "\n");
+ return {Start, End};
+ }
+}
+
+/// Turns a collection of checks into a collection of expanded upper and
+/// lower bounds for both pointers in the check.
+static SmallVector<std::pair<PointerBounds, PointerBounds>, 4>
+expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
+ Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) {
+ SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
+
+ // Here we're relying on the SCEV Expander's cache to only emit code for the
+ // same bounds once.
+ transform(PointerChecks, std::back_inserter(ChecksWithBounds),
+ [&](const RuntimePointerCheck &Check) {
+ PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE),
+ Second =
+ expandBounds(Check.second, L, Loc, Exp, SE);
+ return std::make_pair(First, Second);
+ });
+
+ return ChecksWithBounds;
+}
+
+std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
+ Instruction *Loc, Loop *TheLoop,
+ const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
+ ScalarEvolution *SE) {
+ // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible.
+ // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible
+ const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp);
+
+ LLVMContext &Ctx = Loc->getContext();
+ Instruction *FirstInst = nullptr;
+ IRBuilder<> ChkBuilder(Loc);
+ // Our instructions might fold to a constant.
+ Value *MemoryRuntimeCheck = nullptr;
+
+ // FIXME: this helper is currently a duplicate of the one in
+ // LoopVectorize.cpp.
+ auto GetFirstInst = [](Instruction *FirstInst, Value *V,
+ Instruction *Loc) -> Instruction * {
+ if (FirstInst)
+ return FirstInst;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() == Loc->getParent() ? I : nullptr;
+ return nullptr;
+ };
+
+ for (const auto &Check : ExpandedChecks) {
+ const PointerBounds &A = Check.first, &B = Check.second;
+ // Check if two pointers (A and B) conflict where conflict is computed as:
+ // start(A) <= end(B) && start(B) <= end(A)
+ unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
+ unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
+
+ assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
+ (AS1 == A.End->getType()->getPointerAddressSpace()) &&
+ "Trying to bounds check pointers with different address spaces");
+
+ Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+ Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+ Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
+
+ // [A|B].Start points to the first accessed byte under base [A|B].
+ // [A|B].End points to the last accessed byte, plus one.
+ // There is no conflict when the intervals are disjoint:
+ // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
+ //
+ // bound0 = (B.Start < A.End)
+ // bound1 = (A.Start < B.End)
+ // IsConflict = bound0 & bound1
+ Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
+ FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
+ Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
+ FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
+ Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+ FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+ if (MemoryRuntimeCheck) {
+ IsConflict =
+ ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+ FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+ }
+ MemoryRuntimeCheck = IsConflict;
+ }
+
+ if (!MemoryRuntimeCheck)
+ return std::make_pair(nullptr, nullptr);
+
+ // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in
+  // the block.
+ Instruction *Check =
+ BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx));
+ ChkBuilder.Insert(Check, "memcheck.conflict");
+ FirstInst = GetFirstInst(FirstInst, Check, Loc);
+ return std::make_pair(FirstInst, Check);
+}
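The reduction built above is the classic half-open interval overlap test. A standalone sketch with plain integers instead of expanded pointer bounds (illustrative only):

#include <cstdint>

// [StartA, EndA) and [StartB, EndB) may conflict iff each range starts before
// the other one ends -- the bound0 & bound1 computation emitted above.
bool mayConflict(std::uint64_t StartA, std::uint64_t EndA,
                 std::uint64_t StartB, std::uint64_t EndB) {
  bool Bound0 = StartA < EndB; // A starts before B ends
  bool Bound1 = StartB < EndA; // B starts before A ends
  return Bound0 && Bound1;     // disjoint ranges fail at least one bound
}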
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp
index 43a9f270c0..599bd1feb2 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp
@@ -1,41 +1,41 @@
-//===- LoopVersioning.cpp - Utility to version a loop ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a utility class to perform loop versioning. The versioned
-// loop speculates that otherwise may-aliasing memory accesses don't overlap and
-// emits checks to prove this.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
+//===- LoopVersioning.cpp - Utility to version a loop ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility class to perform loop versioning. The versioned
+// loop speculates that otherwise may-aliasing memory accesses don't overlap and
+// emits checks to prove this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-using namespace llvm;
-
-static cl::opt<bool>
- AnnotateNoAlias("loop-version-annotate-no-alias", cl::init(true),
- cl::Hidden,
- cl::desc("Add no-alias annotation for instructions that "
- "are disambiguated by memchecks"));
-
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ AnnotateNoAlias("loop-version-annotate-no-alias", cl::init(true),
+ cl::Hidden,
+ cl::desc("Add no-alias annotation for instructions that "
+ "are disambiguated by memchecks"));
+
LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
ArrayRef<RuntimePointerCheck> Checks, Loop *L,
LoopInfo *LI, DominatorTree *DT,
@@ -43,217 +43,217 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
: VersionedLoop(L), NonVersionedLoop(nullptr),
AliasChecks(Checks.begin(), Checks.end()),
Preds(LAI.getPSE().getUnionPredicate()), LAI(LAI), LI(LI), DT(DT),
- SE(SE) {
+ SE(SE) {
assert(L->getUniqueExitBlock() && "No single exit block");
-}
-
-void LoopVersioning::versionLoop(
- const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+}
+
+void LoopVersioning::versionLoop(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
assert(VersionedLoop->isLoopSimplifyForm() &&
"Loop is not in loop-simplify form");
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- Value *SCEVRuntimeCheck;
- Value *RuntimeCheck = nullptr;
-
- // Add the memcheck in the original preheader (this is empty initially).
- BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
- const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop,
- AliasChecks, RtPtrChecking.getSE());
-
- SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
- "scev.check");
- SCEVRuntimeCheck =
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ Value *SCEVRuntimeCheck;
+ Value *RuntimeCheck = nullptr;
+
+ // Add the memcheck in the original preheader (this is empty initially).
+ BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
+ const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop,
+ AliasChecks, RtPtrChecking.getSE());
+
+ SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
+ "scev.check");
+ SCEVRuntimeCheck =
Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator());
- auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
-
- // Discard the SCEV runtime check if it is always true.
- if (CI && CI->isZero())
- SCEVRuntimeCheck = nullptr;
-
- if (MemRuntimeCheck && SCEVRuntimeCheck) {
- RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
- SCEVRuntimeCheck, "lver.safe");
- if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
- I->insertBefore(RuntimeCheckBB->getTerminator());
- } else
- RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
-
- assert(RuntimeCheck && "called even though we don't need "
- "any runtime checks");
-
- // Rename the block to make the IR more readable.
- RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() +
- ".lver.check");
-
- // Create empty preheader for the loop (and after cloning for the
- // non-versioned loop).
- BasicBlock *PH =
- SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI,
- nullptr, VersionedLoop->getHeader()->getName() + ".ph");
-
- // Clone the loop including the preheader.
- //
- // FIXME: This does not currently preserve SimplifyLoop because the exit
- // block is a join between the two loops.
- SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
- NonVersionedLoop =
- cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap,
- ".lver.orig", LI, DT, NonVersionedLoopBlocks);
- remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
-
- // Insert the conditional branch based on the result of the memchecks.
- Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
- BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);
- OrigTerm->eraseFromParent();
-
- // The loops merge in the original exit block. This is now dominated by the
- // memchecking block.
- DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB);
-
- // Adds the necessary PHI nodes for the versioned loops based on the
- // loop-defined values used outside of the loop.
- addPHINodes(DefsUsedOutside);
+ auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
+
+ // Discard the SCEV runtime check if it is always true.
+ if (CI && CI->isZero())
+ SCEVRuntimeCheck = nullptr;
+
+ if (MemRuntimeCheck && SCEVRuntimeCheck) {
+ RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
+ SCEVRuntimeCheck, "lver.safe");
+ if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
+ I->insertBefore(RuntimeCheckBB->getTerminator());
+ } else
+ RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
+
+ assert(RuntimeCheck && "called even though we don't need "
+ "any runtime checks");
+
+ // Rename the block to make the IR more readable.
+ RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() +
+ ".lver.check");
+
+ // Create empty preheader for the loop (and after cloning for the
+ // non-versioned loop).
+ BasicBlock *PH =
+ SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI,
+ nullptr, VersionedLoop->getHeader()->getName() + ".ph");
+
+ // Clone the loop including the preheader.
+ //
+ // FIXME: This does not currently preserve SimplifyLoop because the exit
+ // block is a join between the two loops.
+ SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
+ NonVersionedLoop =
+ cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap,
+ ".lver.orig", LI, DT, NonVersionedLoopBlocks);
+ remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
+
+ // Insert the conditional branch based on the result of the memchecks.
+ Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
+ BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);
+ OrigTerm->eraseFromParent();
+
+ // The loops merge in the original exit block. This is now dominated by the
+ // memchecking block.
+ DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB);
+
+ // Adds the necessary PHI nodes for the versioned loops based on the
+ // loop-defined values used outside of the loop.
+ addPHINodes(DefsUsedOutside);
formDedicatedExitBlocks(NonVersionedLoop, DT, LI, nullptr, true);
formDedicatedExitBlocks(VersionedLoop, DT, LI, nullptr, true);
assert(NonVersionedLoop->isLoopSimplifyForm() &&
VersionedLoop->isLoopSimplifyForm() &&
"The versioned loops should be in simplify form.");
-}
-
-void LoopVersioning::addPHINodes(
- const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
- BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
- assert(PHIBlock && "No single successor to loop exit block");
- PHINode *PN;
-
- // First add a single-operand PHI for each DefsUsedOutside if one does not
-  // exist yet.
- for (auto *Inst : DefsUsedOutside) {
- // See if we have a single-operand PHI with the value defined by the
- // original loop.
- for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- if (PN->getIncomingValue(0) == Inst)
- break;
- }
- // If not create it.
- if (!PN) {
- PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
- &PHIBlock->front());
- SmallVector<User*, 8> UsersToUpdate;
- for (User *U : Inst->users())
- if (!VersionedLoop->contains(cast<Instruction>(U)->getParent()))
- UsersToUpdate.push_back(U);
- for (User *U : UsersToUpdate)
- U->replaceUsesOfWith(Inst, PN);
- PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
- }
- }
-
- // Then for each PHI add the operand for the edge from the cloned loop.
- for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- assert(PN->getNumOperands() == 1 &&
- "Exit block should only have on predecessor");
-
-    // If the definition was cloned, use that; otherwise use the same value.
- Value *ClonedValue = PN->getIncomingValue(0);
- auto Mapped = VMap.find(ClonedValue);
- if (Mapped != VMap.end())
- ClonedValue = Mapped->second;
-
- PN->addIncoming(ClonedValue, NonVersionedLoop->getExitingBlock());
- }
-}
-
-void LoopVersioning::prepareNoAliasMetadata() {
- // We need to turn the no-alias relation between pointer checking groups into
- // no-aliasing annotations between instructions.
- //
- // We accomplish this by mapping each pointer checking group (a set of
- // pointers memchecked together) to an alias scope and then also mapping each
- // group to the list of scopes it can't alias.
-
- const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
- LLVMContext &Context = VersionedLoop->getHeader()->getContext();
-
- // First allocate an aliasing scope for each pointer checking group.
- //
- // While traversing through the checking groups in the loop, also create a
- // reverse map from pointers to the pointer checking group they were assigned
- // to.
- MDBuilder MDB(Context);
- MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
-
- for (const auto &Group : RtPtrChecking->CheckingGroups) {
- GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain);
-
- for (unsigned PtrIdx : Group.Members)
- PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group;
- }
-
- // Go through the checks and for each pointer group, collect the scopes for
- // each non-aliasing pointer group.
- DenseMap<const RuntimeCheckingPtrGroup *, SmallVector<Metadata *, 4>>
- GroupToNonAliasingScopes;
-
- for (const auto &Check : AliasChecks)
- GroupToNonAliasingScopes[Check.first].push_back(GroupToScope[Check.second]);
-
- // Finally, transform the above to actually map to scope list which is what
- // the metadata uses.
-
- for (auto Pair : GroupToNonAliasingScopes)
- GroupToNonAliasingScopeList[Pair.first] = MDNode::get(Context, Pair.second);
-}
-
-void LoopVersioning::annotateLoopWithNoAlias() {
- if (!AnnotateNoAlias)
- return;
-
- // First prepare the maps.
- prepareNoAliasMetadata();
-
- // Add the scope and no-alias metadata to the instructions.
- for (Instruction *I : LAI.getDepChecker().getMemoryInstructions()) {
- annotateInstWithNoAlias(I);
- }
-}
-
-void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
- const Instruction *OrigInst) {
- if (!AnnotateNoAlias)
- return;
-
- LLVMContext &Context = VersionedLoop->getHeader()->getContext();
- const Value *Ptr = isa<LoadInst>(OrigInst)
- ? cast<LoadInst>(OrigInst)->getPointerOperand()
- : cast<StoreInst>(OrigInst)->getPointerOperand();
-
- // Find the group for the pointer and then add the scope metadata.
- auto Group = PtrToGroup.find(Ptr);
- if (Group != PtrToGroup.end()) {
- VersionedInst->setMetadata(
- LLVMContext::MD_alias_scope,
- MDNode::concatenate(
- VersionedInst->getMetadata(LLVMContext::MD_alias_scope),
- MDNode::get(Context, GroupToScope[Group->second])));
-
- // Add the no-alias metadata.
- auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second);
- if (NonAliasingScopeList != GroupToNonAliasingScopeList.end())
- VersionedInst->setMetadata(
- LLVMContext::MD_noalias,
- MDNode::concatenate(
- VersionedInst->getMetadata(LLVMContext::MD_noalias),
- NonAliasingScopeList->second));
- }
-}
-
-namespace {
+}
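In source-level terms, versionLoop() produces the following control-flow shape (a hand-written sketch, not pass output; saxpy and the inlined overlap test are made up for illustration):

#include <cstdint>

void saxpy(float *A, const float *B, float K, unsigned N) {
  // Stand-in for the emitted runtime check ("lver.safe"): do the accessed
  // ranges [A, A+N) and [B, B+N) possibly overlap?
  auto Addr = [](const void *P) { return reinterpret_cast<std::uintptr_t>(P); };
  bool MayConflict = Addr(A) < Addr(B + N) && Addr(B) < Addr(A + N);
  if (!MayConflict) {
    for (unsigned I = 0; I < N; ++I) // versioned loop: ranges proven disjoint,
      A[I] += K * B[I];              // safe to optimize aggressively
  } else {
    for (unsigned I = 0; I < N; ++I) // fall back to the untouched original
      A[I] += K * B[I];
  }
}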
+
+void LoopVersioning::addPHINodes(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+ BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
+ assert(PHIBlock && "No single successor to loop exit block");
+ PHINode *PN;
+
+ // First add a single-operand PHI for each DefsUsedOutside if one does not
+  // exist yet.
+ for (auto *Inst : DefsUsedOutside) {
+ // See if we have a single-operand PHI with the value defined by the
+ // original loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ if (PN->getIncomingValue(0) == Inst)
+ break;
+ }
+ // If not create it.
+ if (!PN) {
+ PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
+ &PHIBlock->front());
+ SmallVector<User*, 8> UsersToUpdate;
+ for (User *U : Inst->users())
+ if (!VersionedLoop->contains(cast<Instruction>(U)->getParent()))
+ UsersToUpdate.push_back(U);
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(Inst, PN);
+ PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
+ }
+ }
+
+ // Then for each PHI add the operand for the edge from the cloned loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ assert(PN->getNumOperands() == 1 &&
+ "Exit block should only have on predecessor");
+
+    // If the definition was cloned, use that; otherwise use the same value.
+ Value *ClonedValue = PN->getIncomingValue(0);
+ auto Mapped = VMap.find(ClonedValue);
+ if (Mapped != VMap.end())
+ ClonedValue = Mapped->second;
+
+ PN->addIncoming(ClonedValue, NonVersionedLoop->getExitingBlock());
+ }
+}
+
+void LoopVersioning::prepareNoAliasMetadata() {
+ // We need to turn the no-alias relation between pointer checking groups into
+ // no-aliasing annotations between instructions.
+ //
+ // We accomplish this by mapping each pointer checking group (a set of
+ // pointers memchecked together) to an alias scope and then also mapping each
+ // group to the list of scopes it can't alias.
+
+ const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+
+ // First allocate an aliasing scope for each pointer checking group.
+ //
+ // While traversing through the checking groups in the loop, also create a
+ // reverse map from pointers to the pointer checking group they were assigned
+ // to.
+ MDBuilder MDB(Context);
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
+
+ for (const auto &Group : RtPtrChecking->CheckingGroups) {
+ GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain);
+
+ for (unsigned PtrIdx : Group.Members)
+ PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group;
+ }
+
+ // Go through the checks and for each pointer group, collect the scopes for
+ // each non-aliasing pointer group.
+ DenseMap<const RuntimeCheckingPtrGroup *, SmallVector<Metadata *, 4>>
+ GroupToNonAliasingScopes;
+
+ for (const auto &Check : AliasChecks)
+ GroupToNonAliasingScopes[Check.first].push_back(GroupToScope[Check.second]);
+
+  // Finally, transform the above to actually map to a scope list, which is
+  // what the metadata uses.
+
+ for (auto Pair : GroupToNonAliasingScopes)
+ GroupToNonAliasingScopeList[Pair.first] = MDNode::get(Context, Pair.second);
+}
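A schematic of the bookkeeping in prepareNoAliasMetadata(), with hypothetical plain types instead of the MDNode machinery: one scope per checking group, and each runtime check contributes the second group's scope to the first group's no-alias list.

#include <map>
#include <utility>
#include <vector>

using Scope = int;          // stands in for an anonymous alias scope MDNode
struct Group { Scope S; };  // one pointer-checking group owns one scope

// Mirrors the GroupToNonAliasingScopes loop above: for every runtime check
// (First vs Second), Second's scope is added to First's no-alias list.
std::map<const Group *, std::vector<Scope>> collectNoAliasScopes(
    const std::vector<std::pair<const Group *, const Group *>> &Checks) {
  std::map<const Group *, std::vector<Scope>> NoAliasScopes;
  for (const auto &Check : Checks)
    NoAliasScopes[Check.first].push_back(Check.second->S);
  return NoAliasScopes;
}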
+
+void LoopVersioning::annotateLoopWithNoAlias() {
+ if (!AnnotateNoAlias)
+ return;
+
+ // First prepare the maps.
+ prepareNoAliasMetadata();
+
+ // Add the scope and no-alias metadata to the instructions.
+ for (Instruction *I : LAI.getDepChecker().getMemoryInstructions()) {
+ annotateInstWithNoAlias(I);
+ }
+}
+
+void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
+ const Instruction *OrigInst) {
+ if (!AnnotateNoAlias)
+ return;
+
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+ const Value *Ptr = isa<LoadInst>(OrigInst)
+ ? cast<LoadInst>(OrigInst)->getPointerOperand()
+ : cast<StoreInst>(OrigInst)->getPointerOperand();
+
+ // Find the group for the pointer and then add the scope metadata.
+ auto Group = PtrToGroup.find(Ptr);
+ if (Group != PtrToGroup.end()) {
+ VersionedInst->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Context, GroupToScope[Group->second])));
+
+ // Add the no-alias metadata.
+ auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second);
+ if (NonAliasingScopeList != GroupToNonAliasingScopeList.end())
+ VersionedInst->setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_noalias),
+ NonAliasingScopeList->second));
+ }
+}
+
+namespace {
bool runImpl(LoopInfo *LI, function_ref<const LoopAccessInfo &(Loop &)> GetLAA,
DominatorTree *DT, ScalarEvolution *SE) {
// Build up a worklist of inner-loops to version. This is necessary as the
@@ -288,59 +288,59 @@ bool runImpl(LoopInfo *LI, function_ref<const LoopAccessInfo &(Loop &)> GetLAA,
return Changed;
}
-/// Also expose this as a pass. Currently this is only used for
-/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
-/// array accesses from the loop.
+/// Also expose this as a pass. Currently this is only used for
+/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
+/// array accesses from the loop.
class LoopVersioningLegacyPass : public FunctionPass {
-public:
+public:
LoopVersioningLegacyPass() : FunctionPass(ID) {
initializeLoopVersioningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto GetLAA = [&](Loop &L) -> const LoopAccessInfo & {
return getAnalysis<LoopAccessLegacyAnalysis>().getInfo(&L);
};
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
return runImpl(LI, GetLAA, DT, SE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-
- static char ID;
-};
-}
-
-#define LVER_OPTION "loop-versioning"
-#define DEBUG_TYPE LVER_OPTION
-
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+
+ static char ID;
+};
+}
+
+#define LVER_OPTION "loop-versioning"
+#define DEBUG_TYPE LVER_OPTION
+
char LoopVersioningLegacyPass::ID;
-static const char LVer_name[] = "Loop Versioning";
-
+static const char LVer_name[] = "Loop Versioning";
+
INITIALIZE_PASS_BEGIN(LoopVersioningLegacyPass, LVER_OPTION, LVer_name, false,
false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLegacyPass, LVER_OPTION, LVer_name, false,
false)
-
-namespace llvm {
+
+namespace llvm {
FunctionPass *createLoopVersioningLegacyPass() {
return new LoopVersioningLegacyPass();
-}
+}
PreservedAnalyses LoopVersioningPass::run(Function &F,
FunctionAnalysisManager &AM) {
@@ -365,5 +365,5 @@ PreservedAnalyses LoopVersioningPass::run(Function &F,
if (runImpl(&LI, GetLAA, &DT, &SE))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
-}
+}
} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp
index 39f55a9ca6..fe0ff5899d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp
@@ -1,97 +1,97 @@
-//===- LowerInvoke.cpp - Eliminate Invoke instructions --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transformation is designed for use by code generators which do not yet
-// support stack unwinding. This pass converts 'invoke' instructions to 'call'
-// instructions, so that any exception-handling 'landingpad' blocks become dead
-// code (which can be removed by running the '-simplifycfg' pass afterwards).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LowerInvoke.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "lowerinvoke"
-
-STATISTIC(NumInvokes, "Number of invokes replaced");
-
-namespace {
- class LowerInvokeLegacyPass : public FunctionPass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
- initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
- };
-}
-
-char LowerInvokeLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke",
- "Lower invoke and unwind, for unwindless code generators",
- false, false)
-
-static bool runImpl(Function &F) {
- bool Changed = false;
- for (BasicBlock &BB : F)
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
+//===- LowerInvoke.cpp - Eliminate Invoke instructions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which do not yet
+// support stack unwinding. This pass converts 'invoke' instructions to 'call'
+// instructions, so that any exception-handling 'landingpad' blocks become dead
+// code (which can be removed by running the '-simplifycfg' pass afterwards).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerInvoke.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lowerinvoke"
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+
+namespace {
+ class LowerInvokeLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
+ initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char LowerInvokeLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke",
+ "Lower invoke and unwind, for unwindless code generators",
+ false, false)
+
+static bool runImpl(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
SmallVector<Value *, 16> CallArgs(II->args());
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
- // Insert a normal call instruction...
- CallInst *NewCall =
- CallInst::Create(II->getFunctionType(), II->getCalledOperand(),
- CallArgs, OpBundles, "", II);
- NewCall->takeName(II);
- NewCall->setCallingConv(II->getCallingConv());
- NewCall->setAttributes(II->getAttributes());
- NewCall->setDebugLoc(II->getDebugLoc());
- II->replaceAllUsesWith(NewCall);
-
- // Insert an unconditional branch to the normal destination.
- BranchInst::Create(II->getNormalDest(), II);
-
- // Remove any PHI node entries from the exception destination.
- II->getUnwindDest()->removePredecessor(&BB);
-
- // Remove the invoke instruction now.
- BB.getInstList().erase(II);
-
- ++NumInvokes;
- Changed = true;
- }
- return Changed;
-}
-
-bool LowerInvokeLegacyPass::runOnFunction(Function &F) {
- return runImpl(F);
-}
-
-namespace llvm {
-char &LowerInvokePassID = LowerInvokeLegacyPass::ID;
-
-// Public Interface To the LowerInvoke pass.
-FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); }
-
-PreservedAnalyses LowerInvokePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- bool Changed = runImpl(F);
- if (!Changed)
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-}
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ // Insert a normal call instruction...
+ CallInst *NewCall =
+ CallInst::Create(II->getFunctionType(), II->getCalledOperand(),
+ CallArgs, OpBundles, "", II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ NewCall->setDebugLoc(II->getDebugLoc());
+ II->replaceAllUsesWith(NewCall);
+
+ // Insert an unconditional branch to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Remove any PHI node entries from the exception destination.
+ II->getUnwindDest()->removePredecessor(&BB);
+
+ // Remove the invoke instruction now.
+ BB.getInstList().erase(II);
+
+ ++NumInvokes;
+ Changed = true;
+ }
+ return Changed;
+}
+
+bool LowerInvokeLegacyPass::runOnFunction(Function &F) {
+ return runImpl(F);
+}
+
+namespace llvm {
+char &LowerInvokePassID = LowerInvokeLegacyPass::ID;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); }
+
+PreservedAnalyses LowerInvokePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = runImpl(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+}
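
The hunk above restores LowerInvoke unchanged in behaviour: each invoke is rewritten into a plain call, an unconditional branch to the normal destination, and the unwind destination loses its PHI entries for this block. As a hedged illustration only (not part of this patch), the new-PM LowerInvokePass restored here could be driven over a module roughly as follows; the helper name lowerAllInvokes and the way the Module is obtained are assumptions:

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/LowerInvoke.h"

// Hypothetical driver: run LowerInvokePass over every function definition in M.
static void lowerAllInvokes(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);      // the pass needs no analyses of its own,
                                         // but the manager must still be populated
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LowerInvokePass());  // invoke -> call + br, as in runImpl() above
  for (llvm::Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);
}
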
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 389c5108cb..616b4e8eb0 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -1,467 +1,467 @@
-//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
- Value *DstAddr, ConstantInt *CopyLen,
- Align SrcAlign, Align DstAlign,
- bool SrcIsVolatile, bool DstIsVolatile,
- const TargetTransformInfo &TTI) {
- // No need to expand zero length copies.
- if (CopyLen->isZero())
- return;
-
- BasicBlock *PreLoopBB = InsertBefore->getParent();
- BasicBlock *PostLoopBB = nullptr;
- Function *ParentFunc = PreLoopBB->getParent();
- LLVMContext &Ctx = PreLoopBB->getContext();
- const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
-
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
- Type *TypeOfCopyLen = CopyLen->getType();
- Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
-
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
- uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
-
- if (LoopEndCount != 0) {
-    // Split the block and create the main copy loop.
- PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
- BasicBlock *LoopBB =
- BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
- PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
-
- IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
-
- // Cast the Src and Dst pointers to pointers to the loop operand type (if
- // needed).
- PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
- PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
- if (SrcAddr->getType() != SrcOpType) {
- SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
- }
- if (DstAddr->getType() != DstOpType) {
- DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
- }
-
- Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
- Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
- LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
- // Loop Body
- Value *SrcGEP =
- LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
- Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
- PartSrcAlign, SrcIsVolatile);
- Value *DstGEP =
- LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
- LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- // Create the loop branch condition.
- Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
- LoopBB, PostLoopBB);
- }
-
- uint64_t BytesCopied = LoopEndCount * LoopOpSize;
- uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
- if (RemainingBytes) {
- IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
- : InsertBefore);
-
- SmallVector<Type *, 5> RemainingOps;
- TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
- SrcAS, DstAS, SrcAlign.value(),
- DstAlign.value());
-
- for (auto OpTy : RemainingOps) {
- Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
- Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
-
-      // Calculate the new index
- unsigned OperandSize = DL.getTypeStoreSize(OpTy);
- uint64_t GepIndex = BytesCopied / OperandSize;
- assert(GepIndex * OperandSize == BytesCopied &&
- "Division should have no Remainder!");
- // Cast source to operand type and load
- PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
- Value *CastedSrc = SrcAddr->getType() == SrcPtrType
- ? SrcAddr
- : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
- Value *SrcGEP = RBuilder.CreateInBoundsGEP(
- OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
- Value *Load =
- RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
-
- // Cast destination to operand type and store.
- PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
- Value *CastedDst = DstAddr->getType() == DstPtrType
- ? DstAddr
- : RBuilder.CreateBitCast(DstAddr, DstPtrType);
- Value *DstGEP = RBuilder.CreateInBoundsGEP(
- OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
- RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- BytesCopied += OperandSize;
- }
- }
- assert(BytesCopied == CopyLen->getZExtValue() &&
- "Bytes copied should match size in the call!");
-}
-
-void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
- Value *SrcAddr, Value *DstAddr,
- Value *CopyLen, Align SrcAlign,
- Align DstAlign, bool SrcIsVolatile,
- bool DstIsVolatile,
- const TargetTransformInfo &TTI) {
- BasicBlock *PreLoopBB = InsertBefore->getParent();
- BasicBlock *PostLoopBB =
- PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
-
- Function *ParentFunc = PreLoopBB->getParent();
- const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
- LLVMContext &Ctx = PreLoopBB->getContext();
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
- Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
-
- IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
-
- PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
- PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
- if (SrcAddr->getType() != SrcOpType) {
- SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
- }
- if (DstAddr->getType() != DstOpType) {
- DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
- }
-
- // Calculate the loop trip count, and remaining bytes to copy after the loop.
- Type *CopyLenType = CopyLen->getType();
- IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
- assert(ILengthType &&
- "expected size argument to memcpy to be an integer type!");
- Type *Int8Type = Type::getInt8Ty(Ctx);
- bool LoopOpIsInt8 = LoopOpType == Int8Type;
- ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
- Value *RuntimeLoopCount = LoopOpIsInt8 ?
- CopyLen :
- PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
- BasicBlock *LoopBB =
- BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
- IRBuilder<> LoopBuilder(LoopBB);
-
- Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
- Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
-
- PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
- LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
-
- Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
- Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign,
- SrcIsVolatile);
- Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
- LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- if (!LoopOpIsInt8) {
-    // Compute the residual byte count and the bytes covered by the main loop.
- Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
- Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
-
- // Loop body for the residual copy.
- BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
- PreLoopBB->getParent(),
- PostLoopBB);
- // Residual loop header.
- BasicBlock *ResHeaderBB = BasicBlock::Create(
- Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
-
-    // Need to update the pre-loop basic block to branch to the correct place:
-    // branch to the main loop if the count is non-zero, branch to the residual
-    // loop if the copy size is smaller than one iteration of the main loop but
-    // non-zero, and finally branch past the residual loop if the memcpy
-    // size is zero.
- ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
- PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
- LoopBB, ResHeaderBB);
- PreLoopBB->getTerminator()->eraseFromParent();
-
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
- ResHeaderBB);
-
- // Determine if we need to branch to the residual loop or bypass it.
- IRBuilder<> RHBuilder(ResHeaderBB);
- RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
- ResLoopBB, PostLoopBB);
-
- // Copy the residual with single byte load/store loop.
- IRBuilder<> ResBuilder(ResLoopBB);
- PHINode *ResidualIndex =
- ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
- ResidualIndex->addIncoming(Zero, ResHeaderBB);
-
- Value *SrcAsInt8 =
- ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
- Value *DstAsInt8 =
- ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
- Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
- Value *SrcGEP =
- ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
- Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign,
- SrcIsVolatile);
- Value *DstGEP =
- ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
- ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- Value *ResNewIndex =
- ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
- ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
-
- // Create the loop branch condition.
- ResBuilder.CreateCondBr(
- ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
- PostLoopBB);
- } else {
- // In this case the loop operand type was a byte, and there is no need for a
- // residual loop to copy the remaining memory after the main loop.
- // We do however need to patch up the control flow by creating the
- // terminators for the preloop block and the memcpy loop.
- ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
- PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
- LoopBB, PostLoopBB);
- PreLoopBB->getTerminator()->eraseFromParent();
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
- PostLoopBB);
- }
-}
-
-// Lower memmove to IR. memmove is required to correctly copy overlapping memory
-// regions; therefore, it has to check the relative positions of the source and
-// destination pointers and choose the copy direction accordingly.
-//
-// The code below is an IR rendition of this C function:
-//
-// void* memmove(void* dst, const void* src, size_t n) {
-// unsigned char* d = dst;
-// const unsigned char* s = src;
-// if (s < d) {
-// // copy backwards
-// while (n--) {
-// d[n] = s[n];
-// }
-// } else {
-// // copy forward
-// for (size_t i = 0; i < n; ++i) {
-// d[i] = s[i];
-// }
-// }
-// return dst;
-// }
-static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, Align SrcAlign,
- Align DstAlign, bool SrcIsVolatile,
- bool DstIsVolatile) {
- Type *TypeOfCopyLen = CopyLen->getType();
- BasicBlock *OrigBB = InsertBefore->getParent();
- Function *F = OrigBB->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
-
-  // Create a comparison of src and dst, based on which we jump to either
- // the forward-copy part of the function (if src >= dst) or the backwards-copy
- // part (if src < dst).
- // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
- // structure. Its block terminators (unconditional branches) are replaced by
- // the appropriate conditional branches when the loop is built.
- ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
- SrcAddr, DstAddr, "compare_src_dst");
- Instruction *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
- &ElseTerm);
-
- // Each part of the function consists of two blocks:
- // copy_backwards: used to skip the loop when n == 0
- // copy_backwards_loop: the actual backwards loop BB
- // copy_forward: used to skip the loop when n == 0
- // copy_forward_loop: the actual forward loop BB
- BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
- CopyBackwardsBB->setName("copy_backwards");
- BasicBlock *CopyForwardBB = ElseTerm->getParent();
- CopyForwardBB->setName("copy_forward");
- BasicBlock *ExitBB = InsertBefore->getParent();
- ExitBB->setName("memmove_done");
-
- unsigned PartSize = DL.getTypeStoreSize(EltTy);
- Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
- Align PartDstAlign(commonAlignment(DstAlign, PartSize));
-
- // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
- // between both backwards and forward copy clauses.
- ICmpInst *CompareN =
- new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
- ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
-
- // Copying backwards.
- BasicBlock *LoopBB =
- BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- Value *IndexPtr = LoopBuilder.CreateSub(
- LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
- Value *Element = LoopBuilder.CreateAlignedLoad(
- EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
- PartSrcAlign, "element");
- LoopBuilder.CreateAlignedStore(
- Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
- PartDstAlign);
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
- ExitBB, LoopBB);
- LoopPhi->addIncoming(IndexPtr, LoopBB);
- LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
- BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
- ThenTerm->eraseFromParent();
-
- // Copying forward.
- BasicBlock *FwdLoopBB =
- BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
- IRBuilder<> FwdLoopBuilder(FwdLoopBB);
- PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
- Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
- Value *FwdElement =
- FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
- Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
- FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
- Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
- FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
- FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
- ExitBB, FwdLoopBB);
- FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
- FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
- BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
- ElseTerm->eraseFromParent();
-}
-
-static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
- Value *CopyLen, Value *SetValue, Align DstAlign,
- bool IsVolatile) {
- Type *TypeOfCopyLen = CopyLen->getType();
- BasicBlock *OrigBB = InsertBefore->getParent();
- Function *F = OrigBB->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
- BasicBlock *NewBB =
- OrigBB->splitBasicBlock(InsertBefore, "split");
- BasicBlock *LoopBB
- = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
-
- IRBuilder<> Builder(OrigBB->getTerminator());
-
- // Cast pointer to the type of value getting stored
- unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
- DstAddr = Builder.CreateBitCast(DstAddr,
- PointerType::get(SetValue->getType(), dstAS));
-
- Builder.CreateCondBr(
- Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
- LoopBB);
- OrigBB->getTerminator()->eraseFromParent();
-
- unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
- Align PartAlign(commonAlignment(DstAlign, PartSize));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
- LoopBuilder.CreateAlignedStore(
- SetValue,
- LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
- PartAlign, IsVolatile);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
- NewBB);
-}
-
-void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
- const TargetTransformInfo &TTI) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
- createMemCpyLoopKnownSize(
- /* InsertBefore */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ CI,
- /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
- /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
- /* TargetTransformInfo */ TTI);
- } else {
- createMemCpyLoopUnknownSize(
- /* InsertBefore */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ Memcpy->getLength(),
- /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
- /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
-        /* TargetTransformInfo */ TTI);
- }
-}
-
-void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
- createMemMoveLoop(/* InsertBefore */ Memmove,
- /* SrcAddr */ Memmove->getRawSource(),
- /* DstAddr */ Memmove->getRawDest(),
- /* CopyLen */ Memmove->getLength(),
- /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
- /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
- /* SrcIsVolatile */ Memmove->isVolatile(),
- /* DstIsVolatile */ Memmove->isVolatile());
-}
-
-void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
- createMemSetLoop(/* InsertBefore */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
-}
+//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, ConstantInt *CopyLen,
+ Align SrcAlign, Align DstAlign,
+ bool SrcIsVolatile, bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length copies.
+ if (CopyLen->isZero())
+ return;
+
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB = nullptr;
+ Function *ParentFunc = PreLoopBB->getParent();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+ const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
+
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *TypeOfCopyLen = CopyLen->getType();
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+
+ if (LoopEndCount != 0) {
+    // Split the block and create the main copy loop.
+ PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
+ PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ // Cast the Src and Dst pointers to pointers to the loop operand type (if
+ // needed).
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
+ // Loop Body
+ Value *SrcGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
+ PartSrcAlign, SrcIsVolatile);
+ Value *DstGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ // Create the loop branch condition.
+ Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
+ LoopBB, PostLoopBB);
+ }
+
+ uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+ uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
+ if (RemainingBytes) {
+ IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
+ : InsertBefore);
+
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAS, DstAS, SrcAlign.value(),
+ DstAlign.value());
+
+ for (auto OpTy : RemainingOps) {
+ Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
+ Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
+
+      // Calculate the new index
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+ uint64_t GepIndex = BytesCopied / OperandSize;
+ assert(GepIndex * OperandSize == BytesCopied &&
+ "Division should have no Remainder!");
+ // Cast source to operand type and load
+ PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
+ Value *CastedSrc = SrcAddr->getType() == SrcPtrType
+ ? SrcAddr
+ : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
+ Value *SrcGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ Value *Load =
+ RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
+
+ // Cast destination to operand type and store.
+ PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
+ Value *CastedDst = DstAddr->getType() == DstPtrType
+ ? DstAddr
+ : RBuilder.CreateBitCast(DstAddr, DstPtrType);
+ Value *DstGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ BytesCopied += OperandSize;
+ }
+ }
+ assert(BytesCopied == CopyLen->getZExtValue() &&
+ "Bytes copied should match size in the call!");
+}
+
+void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB =
+ PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
+
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ // Calculate the loop trip count, and remaining bytes to copy after the loop.
+ Type *CopyLenType = CopyLen->getType();
+ IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
+ assert(ILengthType &&
+ "expected size argument to memcpy to be an integer type!");
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ bool LoopOpIsInt8 = LoopOpType == Int8Type;
+ ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+ Value *RuntimeLoopCount = LoopOpIsInt8 ?
+ CopyLen :
+ PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
+
+ Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign,
+ SrcIsVolatile);
+ Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ if (!LoopOpIsInt8) {
+    // Compute the residual byte count and the bytes covered by the main loop.
+ Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+ Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+
+ // Loop body for the residual copy.
+ BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
+ PreLoopBB->getParent(),
+ PostLoopBB);
+ // Residual loop header.
+ BasicBlock *ResHeaderBB = BasicBlock::Create(
+ Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
+
+    // Need to update the pre-loop basic block to branch to the correct place:
+    // branch to the main loop if the count is non-zero, branch to the residual
+    // loop if the copy size is smaller than one iteration of the main loop but
+    // non-zero, and finally branch past the residual loop if the memcpy
+    // size is zero.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, ResHeaderBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ ResHeaderBB);
+
+ // Determine if we need to branch to the residual loop or bypass it.
+ IRBuilder<> RHBuilder(ResHeaderBB);
+ RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+ ResLoopBB, PostLoopBB);
+
+ // Copy the residual with single byte load/store loop.
+ IRBuilder<> ResBuilder(ResLoopBB);
+ PHINode *ResidualIndex =
+ ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
+ ResidualIndex->addIncoming(Zero, ResHeaderBB);
+
+ Value *SrcAsInt8 =
+ ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
+ Value *DstAsInt8 =
+ ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
+ Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
+ Value *SrcGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
+ Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign,
+ SrcIsVolatile);
+ Value *DstGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
+ ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ Value *ResNewIndex =
+ ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
+ ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
+
+ // Create the loop branch condition.
+ ResBuilder.CreateCondBr(
+ ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+ PostLoopBB);
+ } else {
+ // In this case the loop operand type was a byte, and there is no need for a
+ // residual loop to copy the remaining memory after the main loop.
+ // We do however need to patch up the control flow by creating the
+ // terminators for the preloop block and the memcpy loop.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, PostLoopBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ PostLoopBB);
+ }
+}
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// unsigned char* d = dst;
+// const unsigned char* s = src;
+// if (s < d) {
+// // copy backwards
+// while (n--) {
+// d[n] = s[n];
+// }
+// } else {
+// // copy forward
+// for (size_t i = 0; i < n; ++i) {
+// d[i] = s[i];
+// }
+// }
+// return dst;
+// }
+static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
+
+  // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the backwards-copy
+ // part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
+ SrcAddr, DstAddr, "compare_src_dst");
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
+ &ElseTerm);
+
+ // Each part of the function consists of two blocks:
+ // copy_backwards: used to skip the loop when n == 0
+ // copy_backwards_loop: the actual backwards loop BB
+ // copy_forward: used to skip the loop when n == 0
+ // copy_forward_loop: the actual forward loop BB
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ CopyBackwardsBB->setName("copy_backwards");
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ CopyForwardBB->setName("copy_forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("memmove_done");
+
+ unsigned PartSize = DL.getTypeStoreSize(EltTy);
+ Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
+ Align PartDstAlign(commonAlignment(DstAlign, PartSize));
+
+ // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+ // between both backwards and forward copy clauses.
+ ICmpInst *CompareN =
+ new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+ ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+ // Copying backwards.
+ BasicBlock *LoopBB =
+ BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ Value *IndexPtr = LoopBuilder.CreateSub(
+ LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+ Value *Element = LoopBuilder.CreateAlignedLoad(
+ EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
+ PartSrcAlign, "element");
+ LoopBuilder.CreateAlignedStore(
+ Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
+ PartDstAlign);
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+ ExitBB, LoopBB);
+ LoopPhi->addIncoming(IndexPtr, LoopBB);
+ LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+ BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+ ThenTerm->eraseFromParent();
+
+ // Copying forward.
+ BasicBlock *FwdLoopBB =
+ BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
+ IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+ PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+ Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
+ Value *FwdElement =
+ FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
+ Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
+ FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
+ Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+ FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+ FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+ ExitBB, FwdLoopBB);
+ FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+ FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+ BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+ ElseTerm->eraseFromParent();
+}
+
+static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
+ Value *CopyLen, Value *SetValue, Align DstAlign,
+ bool IsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ BasicBlock *NewBB =
+ OrigBB->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB
+ = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // Cast pointer to the type of value getting stored
+ unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ DstAddr = Builder.CreateBitCast(DstAddr,
+ PointerType::get(SetValue->getType(), dstAS));
+
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
+ unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
+ Align PartAlign(commonAlignment(DstAlign, PartSize));
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+ LoopBuilder.CreateAlignedStore(
+ SetValue,
+ LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+ PartAlign, IsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
+
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
+ const TargetTransformInfo &TTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+ createMemCpyLoopKnownSize(
+ /* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ CI,
+ /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
+ } else {
+ createMemCpyLoopUnknownSize(
+ /* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+        /* TargetTransformInfo */ TTI);
+ }
+}
+
+void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+ createMemMoveLoop(/* InsertBefore */ Memmove,
+ /* SrcAddr */ Memmove->getRawSource(),
+ /* DstAddr */ Memmove->getRawDest(),
+ /* CopyLen */ Memmove->getLength(),
+ /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memmove->isVolatile(),
+ /* DstIsVolatile */ Memmove->isVolatile());
+}
+
+void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
+ createMemSetLoop(/* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* CopyLen */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* Alignment */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile());
+}
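
expandMemCpyAsLoop, expandMemMoveAsLoop and expandMemSetAsLoop above are the entry points a target calls when it wants a memory intrinsic expanded into an explicit load/store loop. A hedged usage sketch for the memcpy case, not taken from this patch: it assumes a TargetTransformInfo for the function is already available (for example via FAM.getResult<TargetIRAnalysis>(F)), and the helper name expandMemCpys is an assumption:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

// Hypothetical helper: expand every llvm.memcpy in F into an explicit loop.
static void expandMemCpys(llvm::Function &F,
                          const llvm::TargetTransformInfo &TTI) {
  llvm::SmallVector<llvm::MemCpyInst *, 8> Worklist;
  for (llvm::BasicBlock &BB : F)
    for (llvm::Instruction &I : BB)
      if (auto *MC = llvm::dyn_cast<llvm::MemCpyInst>(&I))
        Worklist.push_back(MC);        // collect first: the expansion splits blocks
  for (llvm::MemCpyInst *MC : Worklist) {
    llvm::expandMemCpyAsLoop(MC, TTI); // picks the known- or unknown-size lowering above
    MC->eraseFromParent();             // the helper leaves the original intrinsic in place
  }
}

Callers of these helpers generally follow the same collect-then-expand pattern, since the expansion splits the containing basic block and would invalidate a plain instruction iterator.
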
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp
index 20ceb21bee..ec8d7a7074 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp
@@ -1,87 +1,87 @@
-//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LowerSwitch transformation rewrites switch instructions with a sequence
-// of branches, which allows targets to get away with not implementing the
-// switch instruction until it is convenient.
-//
-//===----------------------------------------------------------------------===//
-
+//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/LowerSwitch.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <limits>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "lower-switch"
-
-namespace {
-
- struct IntRange {
- int64_t Low, High;
- };
-
-} // end anonymous namespace
-
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-switch"
+
namespace {
-// Return true iff R is covered by Ranges.
+
+ struct IntRange {
+ int64_t Low, High;
+ };
+
+} // end anonymous namespace
+
+namespace {
+// Return true iff R is covered by Ranges.
bool IsInRanges(const IntRange &R, const std::vector<IntRange> &Ranges) {
- // Note: Ranges must be sorted, non-overlapping and non-adjacent.
-
- // Find the first range whose High field is >= R.High,
- // then check if the Low field is <= R.Low. If so, we
- // have a Range that covers R.
- auto I = llvm::lower_bound(
- Ranges, R, [](IntRange A, IntRange B) { return A.High < B.High; });
- return I != Ranges.end() && I->Low <= R.Low;
-}
-
+ // Note: Ranges must be sorted, non-overlapping and non-adjacent.
+
+ // Find the first range whose High field is >= R.High,
+ // then check if the Low field is <= R.Low. If so, we
+ // have a Range that covers R.
+ auto I = llvm::lower_bound(
+ Ranges, R, [](IntRange A, IntRange B) { return A.High < B.High; });
+ return I != Ranges.end() && I->Low <= R.Low;
+}
+
struct CaseRange {
ConstantInt *Low;
ConstantInt *High;
BasicBlock *BB;
-
+
CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb)
: Low(low), High(high), BB(bb) {}
};
-
+
using CaseVector = std::vector<CaseRange>;
using CaseItr = std::vector<CaseRange>::iterator;
-
+
/// The comparison function for sorting the switch case values in the vector.
/// WARNING: Case ranges should be disjoint!
struct CaseCmp {
@@ -89,66 +89,66 @@ struct CaseCmp {
const ConstantInt *CI1 = cast<const ConstantInt>(C1.Low);
const ConstantInt *CI2 = cast<const ConstantInt>(C2.High);
return CI1->getValue().slt(CI2->getValue());
- }
+ }
};
-
-/// Used for debugging purposes.
-LLVM_ATTRIBUTE_USED
+
+/// Used for debugging purposes.
+LLVM_ATTRIBUTE_USED
raw_ostream &operator<<(raw_ostream &O, const CaseVector &C) {
- O << "[";
-
+ O << "[";
+
for (CaseVector::const_iterator B = C.begin(), E = C.end(); B != E;) {
- O << "[" << B->Low->getValue() << ", " << B->High->getValue() << "]";
- if (++B != E)
- O << ", ";
- }
-
- return O << "]";
-}
-
-/// Update the first occurrence of the "switch statement" BB in the PHI
-/// node with the "new" BB. The other occurrences will:
-///
-/// 1) Be updated by subsequent calls to this function. Switch statements may
-/// have more than one outgoing edge into the same BB if they all have the same
-/// value. When the switch statement is converted these incoming edges are now
-/// coming from multiple BBs.
-/// 2) Be removed if subsequent incoming values now share the same case, i.e.,
-/// multiple outgoing edges are condensed into one. This is necessary to keep the
-/// number of phi values equal to the number of branches to SuccBB.
+ O << "[" << B->Low->getValue() << ", " << B->High->getValue() << "]";
+ if (++B != E)
+ O << ", ";
+ }
+
+ return O << "]";
+}
+
+/// Update the first occurrence of the "switch statement" BB in the PHI
+/// node with the "new" BB. The other occurrences will:
+///
+/// 1) Be updated by subsequent calls to this function. Switch statements may
+/// have more than one outgoing edge into the same BB if they all have the same
+/// value. When the switch statement is converted these incoming edges are now
+/// coming from multiple BBs.
+/// 2) Be removed if subsequent incoming values now share the same case, i.e.,
+/// multiple outgoing edges are condensed into one. This is necessary to keep the
+/// number of phi values equal to the number of branches to SuccBB.
void FixPhis(
BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
const unsigned NumMergedCases = std::numeric_limits<unsigned>::max()) {
- for (BasicBlock::iterator I = SuccBB->begin(),
- IE = SuccBB->getFirstNonPHI()->getIterator();
- I != IE; ++I) {
- PHINode *PN = cast<PHINode>(I);
-
- // Only update the first occurrence.
- unsigned Idx = 0, E = PN->getNumIncomingValues();
- unsigned LocalNumMergedCases = NumMergedCases;
- for (; Idx != E; ++Idx) {
- if (PN->getIncomingBlock(Idx) == OrigBB) {
- PN->setIncomingBlock(Idx, NewBB);
- break;
- }
- }
-
- // Remove additional occurrences coming from condensed cases and keep the
- // number of incoming values equal to the number of branches to SuccBB.
- SmallVector<unsigned, 8> Indices;
- for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx)
- if (PN->getIncomingBlock(Idx) == OrigBB) {
- Indices.push_back(Idx);
- LocalNumMergedCases--;
- }
-    // Remove incoming values in reverse order so that earlier removals do not
-    // invalidate the *successive* indices.
- for (unsigned III : llvm::reverse(Indices))
- PN->removeIncomingValue(III);
- }
-}
-
+ for (BasicBlock::iterator I = SuccBB->begin(),
+ IE = SuccBB->getFirstNonPHI()->getIterator();
+ I != IE; ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // Only update the first occurrence.
+ unsigned Idx = 0, E = PN->getNumIncomingValues();
+ unsigned LocalNumMergedCases = NumMergedCases;
+ for (; Idx != E; ++Idx) {
+ if (PN->getIncomingBlock(Idx) == OrigBB) {
+ PN->setIncomingBlock(Idx, NewBB);
+ break;
+ }
+ }
+
+ // Remove additional occurrences coming from condensed cases and keep the
+ // number of incoming values equal to the number of branches to SuccBB.
+ SmallVector<unsigned, 8> Indices;
+ for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx)
+ if (PN->getIncomingBlock(Idx) == OrigBB) {
+ Indices.push_back(Idx);
+ LocalNumMergedCases--;
+ }
+    // Remove incoming values in reverse order so that earlier removals do not
+    // invalidate the *successive* indices.
+ for (unsigned III : llvm::reverse(Indices))
+ PN->removeIncomingValue(III);
+ }
+}
+
/// Create a new leaf block for the binary lookup tree. It checks if the
/// switch's value == the case's value. If not, then it jumps to the default
/// branch. At this point in the tree, the value can't be another valid case
@@ -213,312 +213,312 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
return NewLeaf;
}
-/// Convert the switch statement into a binary lookup of the case values.
-/// The function recursively builds this tree. LowerBound and UpperBound are
-/// used to keep track of the bounds for Val that have already been checked by
-/// a block emitted by one of the previous calls to switchConvert in the call
-/// stack.
+/// Convert the switch statement into a binary lookup of the case values.
+/// The function recursively builds this tree. LowerBound and UpperBound are
+/// used to keep track of the bounds for Val that have already been checked by
+/// a block emitted by one of the previous calls to switchConvert in the call
+/// stack.
BasicBlock *SwitchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
ConstantInt *UpperBound, Value *Val,
BasicBlock *Predecessor, BasicBlock *OrigBlock,
BasicBlock *Default,
const std::vector<IntRange> &UnreachableRanges) {
- assert(LowerBound && UpperBound && "Bounds must be initialized");
- unsigned Size = End - Begin;
-
- if (Size == 1) {
- // Check if the Case Range is perfectly squeezed in between
- // already checked Upper and Lower bounds. If it is then we can avoid
- // emitting the code that checks if the value actually falls in the range
- // because the bounds already tell us so.
- if (Begin->Low == LowerBound && Begin->High == UpperBound) {
- unsigned NumMergedCases = 0;
- NumMergedCases = UpperBound->getSExtValue() - LowerBound->getSExtValue();
+ assert(LowerBound && UpperBound && "Bounds must be initialized");
+ unsigned Size = End - Begin;
+
+ if (Size == 1) {
+ // Check if the Case Range is perfectly squeezed in between
+ // already checked Upper and Lower bounds. If it is then we can avoid
+ // emitting the code that checks if the value actually falls in the range
+ // because the bounds already tell us so.
+ if (Begin->Low == LowerBound && Begin->High == UpperBound) {
+ unsigned NumMergedCases = 0;
+ NumMergedCases = UpperBound->getSExtValue() - LowerBound->getSExtValue();
FixPhis(Begin->BB, OrigBlock, Predecessor, NumMergedCases);
- return Begin->BB;
- }
+ return Begin->BB;
+ }
return NewLeafBlock(*Begin, Val, LowerBound, UpperBound, OrigBlock,
- Default);
- }
-
- unsigned Mid = Size / 2;
- std::vector<CaseRange> LHS(Begin, Begin + Mid);
- LLVM_DEBUG(dbgs() << "LHS: " << LHS << "\n");
- std::vector<CaseRange> RHS(Begin + Mid, End);
- LLVM_DEBUG(dbgs() << "RHS: " << RHS << "\n");
-
- CaseRange &Pivot = *(Begin + Mid);
- LLVM_DEBUG(dbgs() << "Pivot ==> [" << Pivot.Low->getValue() << ", "
- << Pivot.High->getValue() << "]\n");
-
- // NewLowerBound here should never be the integer minimal value.
- // This is because it is computed from a case range that is never
- // the smallest, so there is always a case range that has at least
- // a smaller value.
- ConstantInt *NewLowerBound = Pivot.Low;
-
- // Because NewLowerBound is never the smallest representable integer
- // it is safe here to subtract one.
- ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
- NewLowerBound->getValue() - 1);
-
- if (!UnreachableRanges.empty()) {
- // Check if the gap between LHS's highest and NewLowerBound is unreachable.
- int64_t GapLow = LHS.back().High->getSExtValue() + 1;
- int64_t GapHigh = NewLowerBound->getSExtValue() - 1;
- IntRange Gap = { GapLow, GapHigh };
- if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges))
- NewUpperBound = LHS.back().High;
- }
-
- LLVM_DEBUG(dbgs() << "LHS Bounds ==> [" << LowerBound->getSExtValue() << ", "
- << NewUpperBound->getSExtValue() << "]\n"
- << "RHS Bounds ==> [" << NewLowerBound->getSExtValue()
- << ", " << UpperBound->getSExtValue() << "]\n");
-
- // Create a new node that checks if the value is < pivot. Go to the
- // left branch if it is and right branch if not.
- Function* F = OrigBlock->getParent();
- BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
-
- ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
- Val, Pivot.Low, "Pivot");
-
+ Default);
+ }
+
+ unsigned Mid = Size / 2;
+ std::vector<CaseRange> LHS(Begin, Begin + Mid);
+ LLVM_DEBUG(dbgs() << "LHS: " << LHS << "\n");
+ std::vector<CaseRange> RHS(Begin + Mid, End);
+ LLVM_DEBUG(dbgs() << "RHS: " << RHS << "\n");
+
+ CaseRange &Pivot = *(Begin + Mid);
+ LLVM_DEBUG(dbgs() << "Pivot ==> [" << Pivot.Low->getValue() << ", "
+ << Pivot.High->getValue() << "]\n");
+
+ // NewLowerBound here should never be the integer minimal value.
+ // This is because it is computed from a case range that is never
+ // the smallest, so there is always a case range that has at least
+ // a smaller value.
+ ConstantInt *NewLowerBound = Pivot.Low;
+
+ // Because NewLowerBound is never the smallest representable integer
+ // it is safe here to subtract one.
+ ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
+ NewLowerBound->getValue() - 1);
+
+ if (!UnreachableRanges.empty()) {
+ // Check if the gap between LHS's highest and NewLowerBound is unreachable.
+ int64_t GapLow = LHS.back().High->getSExtValue() + 1;
+ int64_t GapHigh = NewLowerBound->getSExtValue() - 1;
+ IntRange Gap = { GapLow, GapHigh };
+ if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges))
+ NewUpperBound = LHS.back().High;
+ }
+
+ LLVM_DEBUG(dbgs() << "LHS Bounds ==> [" << LowerBound->getSExtValue() << ", "
+ << NewUpperBound->getSExtValue() << "]\n"
+ << "RHS Bounds ==> [" << NewLowerBound->getSExtValue()
+ << ", " << UpperBound->getSExtValue() << "]\n");
+
+ // Create a new node that checks if the value is < pivot. Go to the
+ // left branch if it is and right branch if not.
+ Function* F = OrigBlock->getParent();
+ BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
+
+ ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
+ Val, Pivot.Low, "Pivot");
+
BasicBlock *LBranch =
SwitchConvert(LHS.begin(), LHS.end(), LowerBound, NewUpperBound, Val,
NewNode, OrigBlock, Default, UnreachableRanges);
BasicBlock *RBranch =
SwitchConvert(RHS.begin(), RHS.end(), NewLowerBound, UpperBound, Val,
NewNode, OrigBlock, Default, UnreachableRanges);
-
- F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode);
- NewNode->getInstList().push_back(Comp);
-
- BranchInst::Create(LBranch, RBranch, Comp, NewNode);
- return NewNode;
-}
-
-/// Transform the simple list of \p SI's cases into a list of CaseRanges, \p Cases.
-/// \post \p Cases will not contain references to \p SI's default BB.
-/// \returns Number of \p SI's cases that do not reference \p SI's default BB.
+
+ F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode);
+ NewNode->getInstList().push_back(Comp);
+
+ BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+ return NewNode;
+}
+
+/// Transform the simple list of \p SI's cases into a list of CaseRanges, \p Cases.
+/// \post \p Cases will not contain references to \p SI's default BB.
+/// \returns Number of \p SI's cases that do not reference \p SI's default BB.
unsigned Clusterify(CaseVector &Cases, SwitchInst *SI) {
- unsigned NumSimpleCases = 0;
-
- // Start with "simple" cases
- for (auto Case : SI->cases()) {
- if (Case.getCaseSuccessor() == SI->getDefaultDest())
- continue;
- Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
- Case.getCaseSuccessor()));
- ++NumSimpleCases;
- }
-
- llvm::sort(Cases, CaseCmp());
-
-  // Merge cases into clusters
- if (Cases.size() >= 2) {
- CaseItr I = Cases.begin();
- for (CaseItr J = std::next(I), E = Cases.end(); J != E; ++J) {
- int64_t nextValue = J->Low->getSExtValue();
- int64_t currentValue = I->High->getSExtValue();
- BasicBlock* nextBB = J->BB;
- BasicBlock* currentBB = I->BB;
-
- // If the two neighboring cases go to the same destination, merge them
- // into a single case.
- assert(nextValue > currentValue && "Cases should be strictly ascending");
- if ((nextValue == currentValue + 1) && (currentBB == nextBB)) {
- I->High = J->High;
- // FIXME: Combine branch weights.
- } else if (++I != J) {
- *I = *J;
- }
- }
- Cases.erase(std::next(I), Cases.end());
- }
-
- return NumSimpleCases;
-}
-
-/// Replace the specified switch instruction with a sequence of chained if-then
-/// insts in a balanced binary search.
+ unsigned NumSimpleCases = 0;
+
+ // Start with "simple" cases
+ for (auto Case : SI->cases()) {
+ if (Case.getCaseSuccessor() == SI->getDefaultDest())
+ continue;
+ Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+ ++NumSimpleCases;
+ }
+
+ llvm::sort(Cases, CaseCmp());
+
+  // Merge cases into clusters
+ if (Cases.size() >= 2) {
+ CaseItr I = Cases.begin();
+ for (CaseItr J = std::next(I), E = Cases.end(); J != E; ++J) {
+ int64_t nextValue = J->Low->getSExtValue();
+ int64_t currentValue = I->High->getSExtValue();
+ BasicBlock* nextBB = J->BB;
+ BasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ assert(nextValue > currentValue && "Cases should be strictly ascending");
+ if ((nextValue == currentValue + 1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ // FIXME: Combine branch weights.
+ } else if (++I != J) {
+ *I = *J;
+ }
+ }
+ Cases.erase(std::next(I), Cases.end());
+ }
+
+ return NumSimpleCases;
+}
+
+/// Replace the specified switch instruction with a sequence of chained if-then
+/// insts in a balanced binary search.
void ProcessSwitchInst(SwitchInst *SI,
SmallPtrSetImpl<BasicBlock *> &DeleteList,
AssumptionCache *AC, LazyValueInfo *LVI) {
- BasicBlock *OrigBlock = SI->getParent();
- Function *F = OrigBlock->getParent();
- Value *Val = SI->getCondition(); // The value we are switching on...
- BasicBlock* Default = SI->getDefaultDest();
-
- // Don't handle unreachable blocks. If there are successors with phis, this
- // would leave them behind with missing predecessors.
- if ((OrigBlock != &F->getEntryBlock() && pred_empty(OrigBlock)) ||
- OrigBlock->getSinglePredecessor() == OrigBlock) {
- DeleteList.insert(OrigBlock);
- return;
- }
-
- // Prepare cases vector.
- CaseVector Cases;
- const unsigned NumSimpleCases = Clusterify(Cases, SI);
- LLVM_DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
- << ". Total non-default cases: " << NumSimpleCases
- << "\nCase clusters: " << Cases << "\n");
-
- // If there is only the default destination, just branch.
- if (Cases.empty()) {
- BranchInst::Create(Default, OrigBlock);
- // Remove all the references from Default's PHIs to OrigBlock, but one.
+ BasicBlock *OrigBlock = SI->getParent();
+ Function *F = OrigBlock->getParent();
+ Value *Val = SI->getCondition(); // The value we are switching on...
+ BasicBlock* Default = SI->getDefaultDest();
+
+ // Don't handle unreachable blocks. If there are successors with phis, this
+ // would leave them behind with missing predecessors.
+ if ((OrigBlock != &F->getEntryBlock() && pred_empty(OrigBlock)) ||
+ OrigBlock->getSinglePredecessor() == OrigBlock) {
+ DeleteList.insert(OrigBlock);
+ return;
+ }
+
+ // Prepare cases vector.
+ CaseVector Cases;
+ const unsigned NumSimpleCases = Clusterify(Cases, SI);
+ LLVM_DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+ << ". Total non-default cases: " << NumSimpleCases
+ << "\nCase clusters: " << Cases << "\n");
+
+ // If there is only the default destination, just branch.
+ if (Cases.empty()) {
+ BranchInst::Create(Default, OrigBlock);
+ // Remove all the references from Default's PHIs to OrigBlock, but one.
FixPhis(Default, OrigBlock, OrigBlock);
- SI->eraseFromParent();
- return;
- }
-
- ConstantInt *LowerBound = nullptr;
- ConstantInt *UpperBound = nullptr;
- bool DefaultIsUnreachableFromSwitch = false;
-
- if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) {
- // Make the bounds tightly fitted around the case value range, because we
- // know that the value passed to the switch must be exactly one of the case
- // values.
- LowerBound = Cases.front().Low;
- UpperBound = Cases.back().High;
- DefaultIsUnreachableFromSwitch = true;
- } else {
-    // Constraining the range of the value being switched over helps eliminate
-    // unreachable BBs and minimize the number of `add` instructions
- // newLeafBlock ends up emitting. Running CorrelatedValuePropagation after
- // LowerSwitch isn't as good, and also much more expensive in terms of
- // compile time for the following reasons:
- // 1. it processes many kinds of instructions, not just switches;
- // 2. even if limited to icmp instructions only, it will have to process
- // roughly C icmp's per switch, where C is the number of cases in the
- // switch, while LowerSwitch only needs to call LVI once per switch.
- const DataLayout &DL = F->getParent()->getDataLayout();
- KnownBits Known = computeKnownBits(Val, DL, /*Depth=*/0, AC, SI);
- // TODO Shouldn't this create a signed range?
- ConstantRange KnownBitsRange =
- ConstantRange::fromKnownBits(Known, /*IsSigned=*/false);
+ SI->eraseFromParent();
+ return;
+ }
+
+ ConstantInt *LowerBound = nullptr;
+ ConstantInt *UpperBound = nullptr;
+ bool DefaultIsUnreachableFromSwitch = false;
+
+ if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) {
+ // Make the bounds tightly fitted around the case value range, because we
+ // know that the value passed to the switch must be exactly one of the case
+ // values.
+ LowerBound = Cases.front().Low;
+ UpperBound = Cases.back().High;
+ DefaultIsUnreachableFromSwitch = true;
+ } else {
+    // Constraining the range of the value being switched over helps eliminate
+    // unreachable BBs and minimize the number of `add` instructions
+ // newLeafBlock ends up emitting. Running CorrelatedValuePropagation after
+ // LowerSwitch isn't as good, and also much more expensive in terms of
+ // compile time for the following reasons:
+ // 1. it processes many kinds of instructions, not just switches;
+ // 2. even if limited to icmp instructions only, it will have to process
+ // roughly C icmp's per switch, where C is the number of cases in the
+ // switch, while LowerSwitch only needs to call LVI once per switch.
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ KnownBits Known = computeKnownBits(Val, DL, /*Depth=*/0, AC, SI);
+ // TODO Shouldn't this create a signed range?
+ ConstantRange KnownBitsRange =
+ ConstantRange::fromKnownBits(Known, /*IsSigned=*/false);
const ConstantRange LVIRange = LVI->getConstantRange(Val, SI);
- ConstantRange ValRange = KnownBitsRange.intersectWith(LVIRange);
- // We delegate removal of unreachable non-default cases to other passes. In
- // the unlikely event that some of them survived, we just conservatively
- // maintain the invariant that all the cases lie between the bounds. This
- // may, however, still render the default case effectively unreachable.
- APInt Low = Cases.front().Low->getValue();
- APInt High = Cases.back().High->getValue();
- APInt Min = APIntOps::smin(ValRange.getSignedMin(), Low);
- APInt Max = APIntOps::smax(ValRange.getSignedMax(), High);
-
- LowerBound = ConstantInt::get(SI->getContext(), Min);
- UpperBound = ConstantInt::get(SI->getContext(), Max);
- DefaultIsUnreachableFromSwitch = (Min + (NumSimpleCases - 1) == Max);
- }
-
- std::vector<IntRange> UnreachableRanges;
-
- if (DefaultIsUnreachableFromSwitch) {
- DenseMap<BasicBlock *, unsigned> Popularity;
- unsigned MaxPop = 0;
- BasicBlock *PopSucc = nullptr;
-
- IntRange R = {std::numeric_limits<int64_t>::min(),
- std::numeric_limits<int64_t>::max()};
- UnreachableRanges.push_back(R);
- for (const auto &I : Cases) {
- int64_t Low = I.Low->getSExtValue();
- int64_t High = I.High->getSExtValue();
-
- IntRange &LastRange = UnreachableRanges.back();
- if (LastRange.Low == Low) {
- // There is nothing left of the previous range.
- UnreachableRanges.pop_back();
- } else {
- // Terminate the previous range.
- assert(Low > LastRange.Low);
- LastRange.High = Low - 1;
- }
- if (High != std::numeric_limits<int64_t>::max()) {
- IntRange R = { High + 1, std::numeric_limits<int64_t>::max() };
- UnreachableRanges.push_back(R);
- }
-
- // Count popularity.
- int64_t N = High - Low + 1;
- unsigned &Pop = Popularity[I.BB];
- if ((Pop += N) > MaxPop) {
- MaxPop = Pop;
- PopSucc = I.BB;
- }
- }
-#ifndef NDEBUG
- /* UnreachableRanges should be sorted and the ranges non-adjacent. */
- for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end();
- I != E; ++I) {
- assert(I->Low <= I->High);
- auto Next = I + 1;
- if (Next != E) {
- assert(Next->Low > I->High);
- }
- }
-#endif
-
- // As the default block in the switch is unreachable, update the PHI nodes
- // (remove all of the references to the default block) to reflect this.
- const unsigned NumDefaultEdges = SI->getNumCases() + 1 - NumSimpleCases;
- for (unsigned I = 0; I < NumDefaultEdges; ++I)
- Default->removePredecessor(OrigBlock);
-
- // Use the most popular block as the new default, reducing the number of
- // cases.
- assert(MaxPop > 0 && PopSucc);
- Default = PopSucc;
+ ConstantRange ValRange = KnownBitsRange.intersectWith(LVIRange);
+ // We delegate removal of unreachable non-default cases to other passes. In
+ // the unlikely event that some of them survived, we just conservatively
+ // maintain the invariant that all the cases lie between the bounds. This
+ // may, however, still render the default case effectively unreachable.
+ APInt Low = Cases.front().Low->getValue();
+ APInt High = Cases.back().High->getValue();
+ APInt Min = APIntOps::smin(ValRange.getSignedMin(), Low);
+ APInt Max = APIntOps::smax(ValRange.getSignedMax(), High);
+
+ LowerBound = ConstantInt::get(SI->getContext(), Min);
+ UpperBound = ConstantInt::get(SI->getContext(), Max);
+ DefaultIsUnreachableFromSwitch = (Min + (NumSimpleCases - 1) == Max);
+ }
+
+ std::vector<IntRange> UnreachableRanges;
+
+ if (DefaultIsUnreachableFromSwitch) {
+ DenseMap<BasicBlock *, unsigned> Popularity;
+ unsigned MaxPop = 0;
+ BasicBlock *PopSucc = nullptr;
+
+ IntRange R = {std::numeric_limits<int64_t>::min(),
+ std::numeric_limits<int64_t>::max()};
+ UnreachableRanges.push_back(R);
+ for (const auto &I : Cases) {
+ int64_t Low = I.Low->getSExtValue();
+ int64_t High = I.High->getSExtValue();
+
+ IntRange &LastRange = UnreachableRanges.back();
+ if (LastRange.Low == Low) {
+ // There is nothing left of the previous range.
+ UnreachableRanges.pop_back();
+ } else {
+ // Terminate the previous range.
+ assert(Low > LastRange.Low);
+ LastRange.High = Low - 1;
+ }
+ if (High != std::numeric_limits<int64_t>::max()) {
+ IntRange R = { High + 1, std::numeric_limits<int64_t>::max() };
+ UnreachableRanges.push_back(R);
+ }
+
+ // Count popularity.
+ int64_t N = High - Low + 1;
+ unsigned &Pop = Popularity[I.BB];
+ if ((Pop += N) > MaxPop) {
+ MaxPop = Pop;
+ PopSucc = I.BB;
+ }
+ }
+#ifndef NDEBUG
+ /* UnreachableRanges should be sorted and the ranges non-adjacent. */
+ for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end();
+ I != E; ++I) {
+ assert(I->Low <= I->High);
+ auto Next = I + 1;
+ if (Next != E) {
+ assert(Next->Low > I->High);
+ }
+ }
+#endif
+
+ // As the default block in the switch is unreachable, update the PHI nodes
+ // (remove all of the references to the default block) to reflect this.
+ const unsigned NumDefaultEdges = SI->getNumCases() + 1 - NumSimpleCases;
+ for (unsigned I = 0; I < NumDefaultEdges; ++I)
+ Default->removePredecessor(OrigBlock);
+
+ // Use the most popular block as the new default, reducing the number of
+ // cases.
+ assert(MaxPop > 0 && PopSucc);
+ Default = PopSucc;
llvm::erase_if(Cases,
[PopSucc](const CaseRange &R) { return R.BB == PopSucc; });
-
- // If there are no cases left, just branch.
- if (Cases.empty()) {
- BranchInst::Create(Default, OrigBlock);
- SI->eraseFromParent();
- // As all the cases have been replaced with a single branch, only keep
- // one entry in the PHI nodes.
- for (unsigned I = 0 ; I < (MaxPop - 1) ; ++I)
- PopSucc->removePredecessor(OrigBlock);
- return;
- }
-
- // If the condition was a PHI node with the switch block as a predecessor
- // removing predecessors may have caused the condition to be erased.
- // Getting the condition value again here protects against that.
- Val = SI->getCondition();
- }
-
- // Create a new, empty default block so that the new hierarchy of
-  // if-then statements goes to this and the PHI nodes are happy.
- BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
- F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
- BranchInst::Create(Default, NewDefault);
-
- BasicBlock *SwitchBlock =
+
+ // If there are no cases left, just branch.
+ if (Cases.empty()) {
+ BranchInst::Create(Default, OrigBlock);
+ SI->eraseFromParent();
+ // As all the cases have been replaced with a single branch, only keep
+ // one entry in the PHI nodes.
+ for (unsigned I = 0 ; I < (MaxPop - 1) ; ++I)
+ PopSucc->removePredecessor(OrigBlock);
+ return;
+ }
+
+ // If the condition was a PHI node with the switch block as a predecessor
+ // removing predecessors may have caused the condition to be erased.
+ // Getting the condition value again here protects against that.
+ Val = SI->getCondition();
+ }
+
+ // Create a new, empty default block so that the new hierarchy of
+  // if-then statements goes to this and the PHI nodes are happy.
+ BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
+ F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
+ BranchInst::Create(Default, NewDefault);
+
+ BasicBlock *SwitchBlock =
SwitchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
- OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
-
- // If there are entries in any PHI nodes for the default edge, make sure
- // to update them as well.
+ OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
+
+ // If there are entries in any PHI nodes for the default edge, make sure
+ // to update them as well.
FixPhis(Default, OrigBlock, NewDefault);
-
- // Branch to our shiny new if-then stuff...
- BranchInst::Create(SwitchBlock, OrigBlock);
-
- // We are now done with the switch instruction, delete it.
- BasicBlock *OldDefault = SI->getDefaultDest();
- OrigBlock->getInstList().erase(SI);
-
- // If the Default block has no more predecessors just add it to DeleteList.
+
+ // Branch to our shiny new if-then stuff...
+ BranchInst::Create(SwitchBlock, OrigBlock);
+
+ // We are now done with the switch instruction, delete it.
+ BasicBlock *OldDefault = SI->getDefaultDest();
+ OrigBlock->getInstList().erase(SI);
+
+ // If the Default block has no more predecessors just add it to DeleteList.
if (pred_empty(OldDefault))
- DeleteList.insert(OldDefault);
-}
+ DeleteList.insert(OldDefault);
+}
bool LowerSwitch(Function &F, LazyValueInfo *LVI, AssumptionCache *AC) {
bool Changed = false;
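
The lowering above works in two stages: Clusterify sorts the non-default cases and folds adjacent values with the same destination into ranges, and SwitchConvert then recursively splits the ranges around a pivot, emitting one signed "less than" comparison per tree node. The standalone C++ sketch below (illustrative types and names only, no LLVM APIs) models that clustering and pivot recursion on plain integers; running it prints the balanced comparison tree the six example cases would lower to.

// Standalone sketch, not LLVM code: models how LowerSwitch clusters sorted
// switch cases and splits them around a pivot to build a balanced if-then
// tree. "CaseRange", "clusterify" and "lowerCases" are illustrative names.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct CaseRange {
  int64_t Low, High; // inclusive case-value range
  int Dest;          // stand-in for the successor basic block
};

// Merge neighbouring cases that are contiguous and share a destination,
// mirroring the Clusterify step above.
static std::vector<CaseRange> clusterify(std::vector<CaseRange> Cases) {
  std::sort(Cases.begin(), Cases.end(),
            [](const CaseRange &A, const CaseRange &B) { return A.Low < B.Low; });
  std::vector<CaseRange> Out;
  for (const CaseRange &C : Cases) {
    if (!Out.empty() && Out.back().High + 1 == C.Low && Out.back().Dest == C.Dest)
      Out.back().High = C.High; // extend the previous cluster
    else
      Out.push_back(C);
  }
  return Out;
}

// Print the comparison tree: pick the middle cluster as the pivot, test
// "value < pivot.Low", and recurse on each half, as SwitchConvert does.
static void lowerCases(const std::vector<CaseRange> &Cases, int Depth = 0) {
  std::string Indent(Depth * 2, ' ');
  if (Cases.size() == 1) {
    std::printf("%sleaf: [%lld, %lld] -> dest %d\n", Indent.c_str(),
                (long long)Cases[0].Low, (long long)Cases[0].High, Cases[0].Dest);
    return;
  }
  size_t Mid = Cases.size() / 2;
  std::printf("%sif (value < %lld)\n", Indent.c_str(), (long long)Cases[Mid].Low);
  lowerCases({Cases.begin(), Cases.begin() + Mid}, Depth + 1);
  std::printf("%selse\n", Indent.c_str());
  lowerCases({Cases.begin() + Mid, Cases.end()}, Depth + 1);
}

int main() {
  std::vector<CaseRange> Cases = {{1, 1, 10}, {2, 2, 10}, {3, 3, 11},
                                  {7, 7, 12}, {8, 8, 12}, {20, 20, 13}};
  lowerCases(clusterify(Cases));
}
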
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp
index 6fd5672f08..5ad7aeb463 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp
@@ -1,116 +1,116 @@
-//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass is a simple pass wrapper around the PromoteMemToReg function call
-// exposed by the Utils library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Mem2Reg.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mem2reg"
-
-STATISTIC(NumPromoted, "Number of alloca's promoted");
-
-static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
- AssumptionCache &AC) {
- std::vector<AllocaInst *> Allocas;
- BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
- bool Changed = false;
-
- while (true) {
- Allocas.clear();
-
- // Find allocas that are safe to promote, by looking at all instructions in
- // the entry node
- for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
- if (isAllocaPromotable(AI))
- Allocas.push_back(AI);
-
- if (Allocas.empty())
- break;
-
- PromoteMemToReg(Allocas, DT, &AC);
- NumPromoted += Allocas.size();
- Changed = true;
- }
- return Changed;
-}
-
-PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- if (!promoteMemoryToRegister(F, DT, AC))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
-
-struct PromoteLegacyPass : public FunctionPass {
- // Pass identification, replacement for typeid
- static char ID;
-
- PromoteLegacyPass() : FunctionPass(ID) {
- initializePromoteLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // runOnFunction - To run this pass, first we calculate the alloca
- // instructions that are safe for promotion, then we promote each one.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- return promoteMemoryToRegister(F, DT, AC);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-char PromoteLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to "
- "Register",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register",
- false, false)
-
-// createPromoteMemoryToRegister - Provide an entry point to create this pass.
-FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
- return new PromoteLegacyPass();
-}
+//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is a simple pass wrapper around the PromoteMemToReg function call
+// exposed by the Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mem2reg"
+
+STATISTIC(NumPromoted, "Number of alloca's promoted");
+
+static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
+ AssumptionCache &AC) {
+ std::vector<AllocaInst *> Allocas;
+ BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
+ bool Changed = false;
+
+ while (true) {
+ Allocas.clear();
+
+ // Find allocas that are safe to promote, by looking at all instructions in
+ // the entry node
+ for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
+ if (isAllocaPromotable(AI))
+ Allocas.push_back(AI);
+
+ if (Allocas.empty())
+ break;
+
+ PromoteMemToReg(Allocas, DT, &AC);
+ NumPromoted += Allocas.size();
+ Changed = true;
+ }
+ return Changed;
+}
+
+PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ if (!promoteMemoryToRegister(F, DT, AC))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+
+struct PromoteLegacyPass : public FunctionPass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ PromoteLegacyPass() : FunctionPass(ID) {
+ initializePromoteLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnFunction - To run this pass, first we calculate the alloca
+ // instructions that are safe for promotion, then we promote each one.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ return promoteMemoryToRegister(F, DT, AC);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+char PromoteLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to "
+ "Register",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register",
+ false, false)
+
+// createPromoteMemoryToRegister - Provide an entry point to create this pass.
+FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
+ return new PromoteLegacyPass();
+}
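
promoteMemoryToRegister above is a fixed-point loop: scan the entry block for allocas that isAllocaPromotable accepts, hand the batch to PromoteMemToReg, and repeat until a scan finds nothing left. A minimal standalone sketch of that loop shape follows; Slot and collectPromotable are made-up stand-ins, not LLVM APIs.

// Standalone sketch of the fixed-point promotion loop: collect candidates,
// "promote" them, and stop once a pass over the slots finds no work.
#include <cstdio>
#include <vector>

struct Slot {
  int Id;
  bool Promotable; // stand-in for isAllocaPromotable(AI)
  bool Promoted;   // stand-in for the alloca having been rewritten to SSA form
};

// Stand-in for the scan over the entry block.
static std::vector<Slot *> collectPromotable(std::vector<Slot> &Slots) {
  std::vector<Slot *> Out;
  for (Slot &S : Slots)
    if (S.Promotable && !S.Promoted)
      Out.push_back(&S);
  return Out;
}

int main() {
  std::vector<Slot> Slots = {{0, true, false}, {1, false, false}, {2, true, false}};
  unsigned NumPromoted = 0; // mirrors the NumPromoted statistic
  bool Changed = false;

  while (true) {
    std::vector<Slot *> Work = collectPromotable(Slots);
    if (Work.empty())
      break; // fixed point: nothing promotable is left
    for (Slot *S : Work)
      S->Promoted = true; // stand-in for PromoteMemToReg(Work, DT, &AC)
    NumPromoted += (unsigned)Work.size();
    Changed = true;
  }
  std::printf("promoted %u slot(s), changed=%d\n", NumPromoted, (int)Changed);
}
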
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp
index 477d0588f6..e350320e75 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp
@@ -1,85 +1,85 @@
-//===- MetaRenamer.cpp - Rename everything with metasyntactic names -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass renames everything with metasyntactic names. The intent is to use
-// this pass after bugpoint reduction to conceal the nature of the original
-// program.
-//
-//===----------------------------------------------------------------------===//
-
+//===- MetaRenamer.cpp - Rename everything with metasyntactic names -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass renames everything with metasyntactic names. The intent is to use
+// this pass after bugpoint reduction to conceal the nature of the original
+// program.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/MetaRenamer.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/TypeFinder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-
-using namespace llvm;
-
-static const char *const metaNames[] = {
- // See http://en.wikipedia.org/wiki/Metasyntactic_variable
- "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
- "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
-};
-
-namespace {
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+
+using namespace llvm;
+
+static const char *const metaNames[] = {
+ // See http://en.wikipedia.org/wiki/Metasyntactic_variable
+ "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
+ "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
+};
+
+namespace {
// This PRNG is from the ISO C spec. It is intentionally simple and
// unsuitable for cryptographic use. We're just looking for enough
// variety to surprise and delight users.
struct PRNG {
unsigned long next;
-
+
void srand(unsigned int seed) { next = seed; }
-
+
int rand() {
next = next * 1103515245 + 12345;
return (unsigned int)(next / 65536) % 32768;
}
};
-
+
struct Renamer {
Renamer(unsigned int seed) { prng.srand(seed); }
-
+
const char *newName() {
return metaNames[prng.rand() % array_lengthof(metaNames)];
}
-
+
PRNG prng;
};
-
+
void MetaRename(Function &F) {
for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI)
if (!AI->getType()->isVoidTy())
AI->setName("arg");
-
+
for (auto &BB : F) {
BB.setName("bb");
-
+
for (auto &I : BB)
if (!I.getType()->isVoidTy())
I.setName("tmp");
}
}
-
+
void MetaRename(Module &M,
function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
// Seed our PRNG with simple additive sum of ModuleID. We're looking to
@@ -88,39 +88,39 @@ void MetaRename(Module &M,
unsigned int randSeed = 0;
for (auto C : M.getModuleIdentifier())
randSeed += C;
-
+
Renamer renamer(randSeed);
-
+
// Rename all aliases
for (auto AI = M.alias_begin(), AE = M.alias_end(); AI != AE; ++AI) {
StringRef Name = AI->getName();
if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
continue;
-
+
AI->setName("alias");
}
-
+
// Rename all global variables
for (auto GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) {
StringRef Name = GI->getName();
if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
continue;
-
+
GI->setName("global");
}
-
+
// Rename all struct types
TypeFinder StructTypes;
StructTypes.run(M, true);
for (StructType *STy : StructTypes) {
if (STy->isLiteral() || STy->getName().empty())
continue;
-
+
SmallString<128> NameStorage;
STy->setName(
(Twine("struct.") + renamer.newName()).toStringRef(NameStorage));
}
-
+
// Rename all functions
for (auto &F : M) {
StringRef Name = F.getName();
@@ -130,29 +130,29 @@ void MetaRename(Module &M,
if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
GetTLI(F).getLibFunc(F, Tmp))
continue;
-
+
// Leave @main alone. The output of -metarenamer might be passed to
// lli for execution and the latter needs a main entry point.
if (Name != "main")
F.setName(renamer.newName());
-
+
MetaRename(F);
}
}
-
+
struct MetaRenamer : public ModulePass {
// Pass identification, replacement for typeid
static char ID;
-
+
MetaRenamer() : ModulePass(ID) {
initializeMetaRenamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
}
-
+
bool runOnModule(Module &M) override {
auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
@@ -161,24 +161,24 @@ struct MetaRenamer : public ModulePass {
return true;
}
};
-
-} // end anonymous namespace
-
-char MetaRenamer::ID = 0;
-
-INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
- "Assign new names to everything", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
- "Assign new names to everything", false, false)
-
-//===----------------------------------------------------------------------===//
-//
-// MetaRenamer - Rename everything with metasyntactic names.
-//
-ModulePass *llvm::createMetaRenamerPass() {
- return new MetaRenamer();
-}
+
+} // end anonymous namespace
+
+char MetaRenamer::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+
+//===----------------------------------------------------------------------===//
+//
+// MetaRenamer - Rename everything with metasyntactic names.
+//
+ModulePass *llvm::createMetaRenamerPass() {
+ return new MetaRenamer();
+}
PreservedAnalyses MetaRenamerPass::run(Module &M, ModuleAnalysisManager &AM) {
FunctionAnalysisManager &FAM =
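
The renaming above is deterministic for a given module: the PRNG is the sample generator from the ISO C standard, seeded with the byte sum of the module identifier, and names are drawn from the fixed metasyntactic table. A standalone sketch of that scheme is below; the module identifier is invented and std::printf stands in for actually renaming IR objects.

// Standalone sketch of the MetaRenamer naming scheme: ISO C sample PRNG,
// additive seed from the module identifier, names from a fixed table.
#include <cstdio>
#include <string>

struct PRNG {
  unsigned long next = 0;
  void srand(unsigned int seed) { next = seed; }
  int rand() {
    next = next * 1103515245 + 12345;
    return (unsigned int)(next / 65536) % 32768;
  }
};

static const char *const MetaNames[] = {
    "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
    "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"};

int main() {
  std::string ModuleId = "example.ll"; // invented; the pass uses M.getModuleIdentifier()
  unsigned Seed = 0;
  for (char C : ModuleId) // same additive seeding as MetaRename(Module &)
    Seed += (unsigned char)C;

  PRNG Prng;
  Prng.srand(Seed);
  constexpr unsigned NumNames = sizeof(MetaNames) / sizeof(MetaNames[0]);
  // Emit a few struct names the way the pass would ("struct." + name).
  for (int I = 0; I < 4; ++I)
    std::printf("struct.%s\n", MetaNames[Prng.rand() % NumNames]);
}
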
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp
index fbd6ddecbb..ef9f18a228 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp
@@ -1,320 +1,320 @@
-//===-- ModuleUtils.cpp - Functions to manipulate Modules -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions performs manipulations on Modules.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "moduleutils"
-
-static void appendToGlobalArray(const char *Array, Module &M, Function *F,
- int Priority, Constant *Data) {
- IRBuilder<> IRB(M.getContext());
- FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false);
-
- // Get the current set of static global constructors and add the new ctor
- // to the list.
- SmallVector<Constant *, 16> CurrentCtors;
- StructType *EltTy = StructType::get(
- IRB.getInt32Ty(), PointerType::getUnqual(FnTy), IRB.getInt8PtrTy());
- if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) {
- if (Constant *Init = GVCtor->getInitializer()) {
- unsigned n = Init->getNumOperands();
- CurrentCtors.reserve(n + 1);
- for (unsigned i = 0; i != n; ++i)
- CurrentCtors.push_back(cast<Constant>(Init->getOperand(i)));
- }
- GVCtor->eraseFromParent();
- }
-
- // Build a 3 field global_ctor entry. We don't take a comdat key.
- Constant *CSVals[3];
- CSVals[0] = IRB.getInt32(Priority);
- CSVals[1] = F;
- CSVals[2] = Data ? ConstantExpr::getPointerCast(Data, IRB.getInt8PtrTy())
- : Constant::getNullValue(IRB.getInt8PtrTy());
- Constant *RuntimeCtorInit =
- ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements()));
-
- CurrentCtors.push_back(RuntimeCtorInit);
-
- // Create a new initializer.
- ArrayType *AT = ArrayType::get(EltTy, CurrentCtors.size());
- Constant *NewInit = ConstantArray::get(AT, CurrentCtors);
-
- // Create the new global variable and replace all uses of
- // the old global variable with the new one.
- (void)new GlobalVariable(M, NewInit->getType(), false,
- GlobalValue::AppendingLinkage, NewInit, Array);
-}
-
-void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority, Constant *Data) {
- appendToGlobalArray("llvm.global_ctors", M, F, Priority, Data);
-}
-
-void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data) {
- appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
-}
-
-static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
- GlobalVariable *GV = M.getGlobalVariable(Name);
- SmallPtrSet<Constant *, 16> InitAsSet;
- SmallVector<Constant *, 16> Init;
- if (GV) {
- auto *CA = cast<ConstantArray>(GV->getInitializer());
- for (auto &Op : CA->operands()) {
- Constant *C = cast_or_null<Constant>(Op);
- if (InitAsSet.insert(C).second)
- Init.push_back(C);
- }
- GV->eraseFromParent();
- }
-
- Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
- for (auto *V : Values) {
- Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
- if (InitAsSet.insert(C).second)
- Init.push_back(C);
- }
-
- if (Init.empty())
- return;
-
- ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
- GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
- ConstantArray::get(ATy, Init), Name);
- GV->setSection("llvm.metadata");
-}
-
-void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
- appendToUsedList(M, "llvm.used", Values);
-}
-
-void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
- appendToUsedList(M, "llvm.compiler.used", Values);
-}
-
-FunctionCallee
-llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
- ArrayRef<Type *> InitArgTypes) {
- assert(!InitName.empty() && "Expected init function name");
- return M.getOrInsertFunction(
- InitName,
- FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
- AttributeList());
-}
-
-Function *llvm::createSanitizerCtor(Module &M, StringRef CtorName) {
- Function *Ctor = Function::Create(
- FunctionType::get(Type::getVoidTy(M.getContext()), false),
- GlobalValue::InternalLinkage, CtorName, &M);
- BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
- ReturnInst::Create(M.getContext(), CtorBB);
- return Ctor;
-}
-
-std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions(
- Module &M, StringRef CtorName, StringRef InitName,
- ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
- StringRef VersionCheckName) {
- assert(!InitName.empty() && "Expected init function name");
- assert(InitArgs.size() == InitArgTypes.size() &&
- "Sanitizer's init function expects different number of arguments");
- FunctionCallee InitFunction =
- declareSanitizerInitFunction(M, InitName, InitArgTypes);
- Function *Ctor = createSanitizerCtor(M, CtorName);
- IRBuilder<> IRB(Ctor->getEntryBlock().getTerminator());
- IRB.CreateCall(InitFunction, InitArgs);
- if (!VersionCheckName.empty()) {
- FunctionCallee VersionCheckFunction = M.getOrInsertFunction(
- VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
- AttributeList());
- IRB.CreateCall(VersionCheckFunction, {});
- }
- return std::make_pair(Ctor, InitFunction);
-}
-
-std::pair<Function *, FunctionCallee>
-llvm::getOrCreateSanitizerCtorAndInitFunctions(
- Module &M, StringRef CtorName, StringRef InitName,
- ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
- function_ref<void(Function *, FunctionCallee)> FunctionsCreatedCallback,
- StringRef VersionCheckName) {
- assert(!CtorName.empty() && "Expected ctor function name");
-
- if (Function *Ctor = M.getFunction(CtorName))
- // FIXME: Sink this logic into the module, similar to the handling of
- // globals. This will make moving to a concurrent model much easier.
- if (Ctor->arg_size() == 0 ||
- Ctor->getReturnType() == Type::getVoidTy(M.getContext()))
- return {Ctor, declareSanitizerInitFunction(M, InitName, InitArgTypes)};
-
- Function *Ctor;
- FunctionCallee InitFunction;
- std::tie(Ctor, InitFunction) = llvm::createSanitizerCtorAndInitFunctions(
- M, CtorName, InitName, InitArgTypes, InitArgs, VersionCheckName);
- FunctionsCreatedCallback(Ctor, InitFunction);
- return std::make_pair(Ctor, InitFunction);
-}
-
-Function *llvm::getOrCreateInitFunction(Module &M, StringRef Name) {
- assert(!Name.empty() && "Expected init function name");
- if (Function *F = M.getFunction(Name)) {
- if (F->arg_size() != 0 ||
- F->getReturnType() != Type::getVoidTy(M.getContext())) {
- std::string Err;
- raw_string_ostream Stream(Err);
- Stream << "Sanitizer interface function defined with wrong type: " << *F;
- report_fatal_error(Err);
- }
- return F;
- }
- Function *F =
- cast<Function>(M.getOrInsertFunction(Name, AttributeList(),
- Type::getVoidTy(M.getContext()))
- .getCallee());
-
- appendToGlobalCtors(M, F, 0);
-
- return F;
-}
-
-void llvm::filterDeadComdatFunctions(
- Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
- // Build a map from the comdat to the number of entries in that comdat we
- // think are dead. If this fully covers the comdat group, then the entire
- // group is dead. If we find another entry in the comdat group though, we'll
- // have to preserve the whole group.
- SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
- for (Function *F : DeadComdatFunctions) {
- Comdat *C = F->getComdat();
- assert(C && "Expected all input GVs to be in a comdat!");
- ComdatEntriesCovered[C] += 1;
- }
-
- auto CheckComdat = [&](Comdat &C) {
- auto CI = ComdatEntriesCovered.find(&C);
- if (CI == ComdatEntriesCovered.end())
- return;
-
- // If this could have been covered by a dead entry, just subtract one to
- // account for it.
- if (CI->second > 0) {
- CI->second -= 1;
- return;
- }
-
- // If we've already accounted for all the entries that were dead, the
- // entire comdat is alive so remove it from the map.
- ComdatEntriesCovered.erase(CI);
- };
-
- auto CheckAllComdats = [&] {
- for (Function &F : M.functions())
- if (Comdat *C = F.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- };
- CheckAllComdats();
-
- if (ComdatEntriesCovered.empty()) {
- DeadComdatFunctions.clear();
- return;
- }
-
- // Remove the entries that were not covering.
- erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
- return ComdatEntriesCovered.find(GV->getComdat()) ==
- ComdatEntriesCovered.end();
- });
-}
-
-std::string llvm::getUniqueModuleId(Module *M) {
- MD5 Md5;
- bool ExportsSymbols = false;
- auto AddGlobal = [&](GlobalValue &GV) {
- if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
- !GV.hasExternalLinkage() || GV.hasComdat())
- return;
- ExportsSymbols = true;
- Md5.update(GV.getName());
- Md5.update(ArrayRef<uint8_t>{0});
- };
-
- for (auto &F : *M)
- AddGlobal(F);
- for (auto &GV : M->globals())
- AddGlobal(GV);
- for (auto &GA : M->aliases())
- AddGlobal(GA);
- for (auto &IF : M->ifuncs())
- AddGlobal(IF);
-
- if (!ExportsSymbols)
- return "";
-
- MD5::MD5Result R;
- Md5.final(R);
-
- SmallString<32> Str;
- MD5::stringifyResult(R, Str);
- return ("$" + Str).str();
-}
-
-void VFABI::setVectorVariantNames(
- CallInst *CI, const SmallVector<std::string, 8> &VariantMappings) {
- if (VariantMappings.empty())
- return;
-
- SmallString<256> Buffer;
- llvm::raw_svector_ostream Out(Buffer);
- for (const std::string &VariantMapping : VariantMappings)
- Out << VariantMapping << ",";
- // Get rid of the trailing ','.
- assert(!Buffer.str().empty() && "Must have at least one char.");
- Buffer.pop_back();
-
- Module *M = CI->getModule();
-#ifndef NDEBUG
- for (const std::string &VariantMapping : VariantMappings) {
- LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n");
- Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M);
- assert(VI.hasValue() && "Cannot add an invalid VFABI name.");
- assert(M->getNamedValue(VI.getValue().VectorName) &&
- "Cannot add variant to attribute: "
- "vector function declaration is missing.");
- }
-#endif
- CI->addAttribute(
- AttributeList::FunctionIndex,
- Attribute::get(M->getContext(), MappingsAttrName, Buffer.str()));
-}
+//===-- ModuleUtils.cpp - Functions to manipulate Modules -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs manipulations on Modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "moduleutils"
+
+static void appendToGlobalArray(const char *Array, Module &M, Function *F,
+ int Priority, Constant *Data) {
+ IRBuilder<> IRB(M.getContext());
+ FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false);
+
+ // Get the current set of static global constructors and add the new ctor
+ // to the list.
+ SmallVector<Constant *, 16> CurrentCtors;
+ StructType *EltTy = StructType::get(
+ IRB.getInt32Ty(), PointerType::getUnqual(FnTy), IRB.getInt8PtrTy());
+ if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) {
+ if (Constant *Init = GVCtor->getInitializer()) {
+ unsigned n = Init->getNumOperands();
+ CurrentCtors.reserve(n + 1);
+ for (unsigned i = 0; i != n; ++i)
+ CurrentCtors.push_back(cast<Constant>(Init->getOperand(i)));
+ }
+ GVCtor->eraseFromParent();
+ }
+
+ // Build a 3 field global_ctor entry. We don't take a comdat key.
+ Constant *CSVals[3];
+ CSVals[0] = IRB.getInt32(Priority);
+ CSVals[1] = F;
+ CSVals[2] = Data ? ConstantExpr::getPointerCast(Data, IRB.getInt8PtrTy())
+ : Constant::getNullValue(IRB.getInt8PtrTy());
+ Constant *RuntimeCtorInit =
+ ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements()));
+
+ CurrentCtors.push_back(RuntimeCtorInit);
+
+ // Create a new initializer.
+ ArrayType *AT = ArrayType::get(EltTy, CurrentCtors.size());
+ Constant *NewInit = ConstantArray::get(AT, CurrentCtors);
+
+ // Create the new global variable and replace all uses of
+ // the old global variable with the new one.
+ (void)new GlobalVariable(M, NewInit->getType(), false,
+ GlobalValue::AppendingLinkage, NewInit, Array);
+}
+
+void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_ctors", M, F, Priority, Data);
+}
+
+void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
+}
+
+static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
+ GlobalVariable *GV = M.getGlobalVariable(Name);
+ SmallPtrSet<Constant *, 16> InitAsSet;
+ SmallVector<Constant *, 16> Init;
+ if (GV) {
+ auto *CA = cast<ConstantArray>(GV->getInitializer());
+ for (auto &Op : CA->operands()) {
+ Constant *C = cast_or_null<Constant>(Op);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+ GV->eraseFromParent();
+ }
+
+ Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
+ for (auto *V : Values) {
+ Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+
+ if (Init.empty())
+ return;
+
+ ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
+ GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, Init), Name);
+ GV->setSection("llvm.metadata");
+}
+
+void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.used", Values);
+}
+
+void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.compiler.used", Values);
+}
+
+FunctionCallee
+llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes) {
+ assert(!InitName.empty() && "Expected init function name");
+ return M.getOrInsertFunction(
+ InitName,
+ FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
+ AttributeList());
+}
+
+Function *llvm::createSanitizerCtor(Module &M, StringRef CtorName) {
+ Function *Ctor = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalValue::InternalLinkage, CtorName, &M);
+ BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
+ ReturnInst::Create(M.getContext(), CtorBB);
+ return Ctor;
+}
+
+std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions(
+ Module &M, StringRef CtorName, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
+ StringRef VersionCheckName) {
+ assert(!InitName.empty() && "Expected init function name");
+ assert(InitArgs.size() == InitArgTypes.size() &&
+ "Sanitizer's init function expects different number of arguments");
+ FunctionCallee InitFunction =
+ declareSanitizerInitFunction(M, InitName, InitArgTypes);
+ Function *Ctor = createSanitizerCtor(M, CtorName);
+ IRBuilder<> IRB(Ctor->getEntryBlock().getTerminator());
+ IRB.CreateCall(InitFunction, InitArgs);
+ if (!VersionCheckName.empty()) {
+ FunctionCallee VersionCheckFunction = M.getOrInsertFunction(
+ VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
+ AttributeList());
+ IRB.CreateCall(VersionCheckFunction, {});
+ }
+ return std::make_pair(Ctor, InitFunction);
+}
+
+std::pair<Function *, FunctionCallee>
+llvm::getOrCreateSanitizerCtorAndInitFunctions(
+ Module &M, StringRef CtorName, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
+ function_ref<void(Function *, FunctionCallee)> FunctionsCreatedCallback,
+ StringRef VersionCheckName) {
+ assert(!CtorName.empty() && "Expected ctor function name");
+
+ if (Function *Ctor = M.getFunction(CtorName))
+ // FIXME: Sink this logic into the module, similar to the handling of
+ // globals. This will make moving to a concurrent model much easier.
+ if (Ctor->arg_size() == 0 ||
+ Ctor->getReturnType() == Type::getVoidTy(M.getContext()))
+ return {Ctor, declareSanitizerInitFunction(M, InitName, InitArgTypes)};
+
+ Function *Ctor;
+ FunctionCallee InitFunction;
+ std::tie(Ctor, InitFunction) = llvm::createSanitizerCtorAndInitFunctions(
+ M, CtorName, InitName, InitArgTypes, InitArgs, VersionCheckName);
+ FunctionsCreatedCallback(Ctor, InitFunction);
+ return std::make_pair(Ctor, InitFunction);
+}
+
+Function *llvm::getOrCreateInitFunction(Module &M, StringRef Name) {
+ assert(!Name.empty() && "Expected init function name");
+ if (Function *F = M.getFunction(Name)) {
+ if (F->arg_size() != 0 ||
+ F->getReturnType() != Type::getVoidTy(M.getContext())) {
+ std::string Err;
+ raw_string_ostream Stream(Err);
+ Stream << "Sanitizer interface function defined with wrong type: " << *F;
+ report_fatal_error(Err);
+ }
+ return F;
+ }
+ Function *F =
+ cast<Function>(M.getOrInsertFunction(Name, AttributeList(),
+ Type::getVoidTy(M.getContext()))
+ .getCallee());
+
+ appendToGlobalCtors(M, F, 0);
+
+ return F;
+}
+
+void llvm::filterDeadComdatFunctions(
+ Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
+ // Build a map from the comdat to the number of entries in that comdat we
+ // think are dead. If this fully covers the comdat group, then the entire
+ // group is dead. If we find another entry in the comdat group though, we'll
+ // have to preserve the whole group.
+ SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
+ for (Function *F : DeadComdatFunctions) {
+ Comdat *C = F->getComdat();
+ assert(C && "Expected all input GVs to be in a comdat!");
+ ComdatEntriesCovered[C] += 1;
+ }
+
+ auto CheckComdat = [&](Comdat &C) {
+ auto CI = ComdatEntriesCovered.find(&C);
+ if (CI == ComdatEntriesCovered.end())
+ return;
+
+ // If this could have been covered by a dead entry, just subtract one to
+ // account for it.
+ if (CI->second > 0) {
+ CI->second -= 1;
+ return;
+ }
+
+ // If we've already accounted for all the entries that were dead, the
+ // entire comdat is alive so remove it from the map.
+ ComdatEntriesCovered.erase(CI);
+ };
+
+ auto CheckAllComdats = [&] {
+ for (Function &F : M.functions())
+ if (Comdat *C = F.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ };
+ CheckAllComdats();
+
+ if (ComdatEntriesCovered.empty()) {
+ DeadComdatFunctions.clear();
+ return;
+ }
+
+ // Remove the entries that were not covering.
+ erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
+ return ComdatEntriesCovered.find(GV->getComdat()) ==
+ ComdatEntriesCovered.end();
+ });
+}
+
+std::string llvm::getUniqueModuleId(Module *M) {
+ MD5 Md5;
+ bool ExportsSymbols = false;
+ auto AddGlobal = [&](GlobalValue &GV) {
+ if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
+ !GV.hasExternalLinkage() || GV.hasComdat())
+ return;
+ ExportsSymbols = true;
+ Md5.update(GV.getName());
+ Md5.update(ArrayRef<uint8_t>{0});
+ };
+
+ for (auto &F : *M)
+ AddGlobal(F);
+ for (auto &GV : M->globals())
+ AddGlobal(GV);
+ for (auto &GA : M->aliases())
+ AddGlobal(GA);
+ for (auto &IF : M->ifuncs())
+ AddGlobal(IF);
+
+ if (!ExportsSymbols)
+ return "";
+
+ MD5::MD5Result R;
+ Md5.final(R);
+
+ SmallString<32> Str;
+ MD5::stringifyResult(R, Str);
+ return ("$" + Str).str();
+}
+
+void VFABI::setVectorVariantNames(
+ CallInst *CI, const SmallVector<std::string, 8> &VariantMappings) {
+ if (VariantMappings.empty())
+ return;
+
+ SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ for (const std::string &VariantMapping : VariantMappings)
+ Out << VariantMapping << ",";
+ // Get rid of the trailing ','.
+ assert(!Buffer.str().empty() && "Must have at least one char.");
+ Buffer.pop_back();
+
+ Module *M = CI->getModule();
+#ifndef NDEBUG
+ for (const std::string &VariantMapping : VariantMappings) {
+ LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n");
+ Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M);
+ assert(VI.hasValue() && "Cannot add an invalid VFABI name.");
+ assert(M->getNamedValue(VI.getValue().VectorName) &&
+ "Cannot add variant to attribute: "
+ "vector function declaration is missing.");
+ }
+#endif
+ CI->addAttribute(
+ AttributeList::FunctionIndex,
+ Attribute::get(M->getContext(), MappingsAttrName, Buffer.str()));
+}
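
Of the helpers above, filterDeadComdatFunctions has the least obvious invariant: a dead function that lives in a comdat may only be dropped if every member of that comdat group is also known to be dead. It counts the dead entries per comdat, lets every comdat member in the module consume one count, and erases any comdat that runs out of counts (it has a live member). The standalone sketch below reproduces that bookkeeping with strings standing in for Comdat and Function; the names are invented.

// Standalone sketch of the comdat-coverage check: only fully-dead comdat
// groups remain in "Covered" at the end.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // comdat name -> all members present in the module
  std::map<std::string, std::vector<std::string>> Comdats = {
      {"C1", {"f1", "f2"}},       // both members dead -> droppable
      {"C2", {"g1", "g2", "g3"}}, // g3 is still alive -> group must be kept
  };
  std::map<std::string, std::string> ComdatOf = {
      {"f1", "C1"}, {"f2", "C1"}, {"g1", "C2"}, {"g2", "C2"}, {"g3", "C2"}};

  std::vector<std::string> DeadComdatFunctions = {"f1", "f2", "g1", "g2"};

  // Step 1: count the dead entries per comdat.
  std::map<std::string, int> Covered;
  for (const std::string &F : DeadComdatFunctions)
    Covered[ComdatOf[F]] += 1;

  // Step 2: every member of every comdat in the module consumes one count; a
  // comdat that runs out of counts has a live member and is erased.
  for (auto &KV : Comdats)
    for (size_t I = 0, N = KV.second.size(); I != N; ++I) {
      auto It = Covered.find(KV.first);
      if (It == Covered.end())
        continue;
      if (It->second > 0)
        It->second -= 1;
      else
        Covered.erase(It);
    }

  // Step 3: keep only dead functions whose whole comdat is covered.
  for (const std::string &F : DeadComdatFunctions)
    if (Covered.count(ComdatOf[F]))
      std::printf("can drop %s\n", F.c_str());
    else
      std::printf("must keep %s (comdat %s has a live member)\n", F.c_str(),
                  ComdatOf[F].c_str());
}
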
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp
index 1b036854fd..7083789267 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp
@@ -1,120 +1,120 @@
-//===- NameAnonGlobals.cpp - ThinLTO Support: Name Unnamed Globals --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements naming anonymous globals to make sure they can be
-// referred to by ThinLTO.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/NameAnonGlobals.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/MD5.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-namespace {
-// Compute a "unique" hash for the module based on the names of the public
-// globals.
-class ModuleHasher {
- Module &TheModule;
- std::string TheHash;
-
-public:
- ModuleHasher(Module &M) : TheModule(M) {}
-
- /// Return the lazily computed hash.
- std::string &get() {
- if (!TheHash.empty())
- // Cache hit :)
- return TheHash;
-
- MD5 Hasher;
- for (auto &F : TheModule) {
- if (F.isDeclaration() || F.hasLocalLinkage() || !F.hasName())
- continue;
- auto Name = F.getName();
- Hasher.update(Name);
- }
- for (auto &GV : TheModule.globals()) {
- if (GV.isDeclaration() || GV.hasLocalLinkage() || !GV.hasName())
- continue;
- auto Name = GV.getName();
- Hasher.update(Name);
- }
-
- // Now return the result.
- MD5::MD5Result Hash;
- Hasher.final(Hash);
- SmallString<32> Result;
- MD5::stringifyResult(Hash, Result);
- TheHash = std::string(Result.str());
- return TheHash;
- }
-};
-} // end anonymous namespace
-
-// Rename all the anon globals in the module
-bool llvm::nameUnamedGlobals(Module &M) {
- bool Changed = false;
- ModuleHasher ModuleHash(M);
- int count = 0;
- auto RenameIfNeed = [&](GlobalValue &GV) {
- if (GV.hasName())
- return;
- GV.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
- Changed = true;
- };
- for (auto &GO : M.global_objects())
- RenameIfNeed(GO);
- for (auto &GA : M.aliases())
- RenameIfNeed(GA);
-
- return Changed;
-}
-
-namespace {
-
-// Legacy pass that provides a name to every anonymous global.
-class NameAnonGlobalLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Name Anon Globals"; }
-
- explicit NameAnonGlobalLegacyPass() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override { return nameUnamedGlobals(M); }
-};
-char NameAnonGlobalLegacyPass::ID = 0;
-
-} // anonymous namespace
-
-PreservedAnalyses NameAnonGlobalPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (!nameUnamedGlobals(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-INITIALIZE_PASS_BEGIN(NameAnonGlobalLegacyPass, "name-anon-globals",
- "Provide a name to nameless globals", false, false)
-INITIALIZE_PASS_END(NameAnonGlobalLegacyPass, "name-anon-globals",
- "Provide a name to nameless globals", false, false)
-
-namespace llvm {
-ModulePass *createNameAnonGlobalPass() {
- return new NameAnonGlobalLegacyPass();
-}
-}
+//===- NameAnonGlobals.cpp - ThinLTO Support: Name Unnamed Globals --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements naming anonymous globals to make sure they can be
+// referred to by ThinLTO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/NameAnonGlobals.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+namespace {
+// Compute a "unique" hash for the module based on the names of the public
+// globals.
+class ModuleHasher {
+ Module &TheModule;
+ std::string TheHash;
+
+public:
+ ModuleHasher(Module &M) : TheModule(M) {}
+
+ /// Return the lazily computed hash.
+ std::string &get() {
+ if (!TheHash.empty())
+ // Cache hit :)
+ return TheHash;
+
+ MD5 Hasher;
+ for (auto &F : TheModule) {
+ if (F.isDeclaration() || F.hasLocalLinkage() || !F.hasName())
+ continue;
+ auto Name = F.getName();
+ Hasher.update(Name);
+ }
+ for (auto &GV : TheModule.globals()) {
+ if (GV.isDeclaration() || GV.hasLocalLinkage() || !GV.hasName())
+ continue;
+ auto Name = GV.getName();
+ Hasher.update(Name);
+ }
+
+ // Now return the result.
+ MD5::MD5Result Hash;
+ Hasher.final(Hash);
+ SmallString<32> Result;
+ MD5::stringifyResult(Hash, Result);
+ TheHash = std::string(Result.str());
+ return TheHash;
+ }
+};
+} // end anonymous namespace
+
+// Rename all the anon globals in the module
+bool llvm::nameUnamedGlobals(Module &M) {
+ bool Changed = false;
+ ModuleHasher ModuleHash(M);
+ int count = 0;
+ auto RenameIfNeed = [&](GlobalValue &GV) {
+ if (GV.hasName())
+ return;
+ GV.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
+ Changed = true;
+ };
+ for (auto &GO : M.global_objects())
+ RenameIfNeed(GO);
+ for (auto &GA : M.aliases())
+ RenameIfNeed(GA);
+
+ return Changed;
+}
+
+namespace {
+
+// Legacy pass that provides a name to every anonymous global.
+class NameAnonGlobalLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Name Anon Globals"; }
+
+ explicit NameAnonGlobalLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override { return nameUnamedGlobals(M); }
+};
+char NameAnonGlobalLegacyPass::ID = 0;
+
+} // anonymous namespace
+
+PreservedAnalyses NameAnonGlobalPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!nameUnamedGlobals(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
+INITIALIZE_PASS_END(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
+
+namespace llvm {
+ModulePass *createNameAnonGlobalPass() {
+ return new NameAnonGlobalLegacyPass();
+}
+}
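
Editorial aside, not part of the diff above: a minimal sketch of how the nameUnamedGlobals utility behaves, assuming LLVM 12 headers are available. The module name "demo" and the unnamed global are hypothetical; only the naming scheme ("anon." + module hash + "." + counter) comes from the source.

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("demo", Ctx);                 // hypothetical module
  auto *I32 = llvm::Type::getInt32Ty(Ctx);
  // An unnamed private global: a candidate for the renaming above.
  new llvm::GlobalVariable(M, I32, /*isConstant=*/true,
                           llvm::GlobalValue::PrivateLinkage,
                           llvm::ConstantInt::get(I32, 0));
  // After this call the global carries a name of the form
  // "anon.<md5-of-public-names>.0", so ThinLTO can refer to it.
  bool Changed = llvm::nameUnamedGlobals(M);
  return Changed ? 0 : 1;
}
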
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp
index 8e2a2ba8de..3312a6f945 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp
@@ -1,376 +1,376 @@
-//===-- PredicateInfo.cpp - PredicateInfo Builder--------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------===//
-//
-// This file implements the PredicateInfo class.
-//
-//===----------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/PredicateInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/IR/AssemblyAnnotationWriter.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Transforms/Utils.h"
-#include <algorithm>
-#define DEBUG_TYPE "predicateinfo"
-using namespace llvm;
-using namespace PatternMatch;
-
-INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
- "PredicateInfo Printer", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
- "PredicateInfo Printer", false, false)
-static cl::opt<bool> VerifyPredicateInfo(
- "verify-predicateinfo", cl::init(false), cl::Hidden,
- cl::desc("Verify PredicateInfo in legacy printer pass."));
-DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
- "Controls which variables are renamed with predicateinfo");
-
+//===-- PredicateInfo.cpp - PredicateInfo Builder--------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the PredicateInfo class.
+//
+//===----------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Transforms/Utils.h"
+#include <algorithm>
+#define DEBUG_TYPE "predicateinfo"
+using namespace llvm;
+using namespace PatternMatch;
+
+INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+static cl::opt<bool> VerifyPredicateInfo(
+ "verify-predicateinfo", cl::init(false), cl::Hidden,
+ cl::desc("Verify PredicateInfo in legacy printer pass."));
+DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
+ "Controls which variables are renamed with predicateinfo");
+
// Maximum number of conditions considered for renaming for each branch/assume.
// This limits renaming of deep and/or chains.
static const unsigned MaxCondsPerBranch = 8;
-namespace {
-// Given a predicate info that is a type of branching terminator, get the
-// branching block.
-const BasicBlock *getBranchBlock(const PredicateBase *PB) {
- assert(isa<PredicateWithEdge>(PB) &&
- "Only branches and switches should have PHIOnly defs that "
- "require branch blocks.");
- return cast<PredicateWithEdge>(PB)->From;
-}
-
-// Given a predicate info that is a type of branching terminator, get the
-// branching terminator.
-static Instruction *getBranchTerminator(const PredicateBase *PB) {
- assert(isa<PredicateWithEdge>(PB) &&
- "Not a predicate info type we know how to get a terminator from.");
- return cast<PredicateWithEdge>(PB)->From->getTerminator();
-}
-
-// Given a predicate info that is a type of branching terminator, get the
-// edge this predicate info represents
-const std::pair<BasicBlock *, BasicBlock *>
-getBlockEdge(const PredicateBase *PB) {
- assert(isa<PredicateWithEdge>(PB) &&
- "Not a predicate info type we know how to get an edge from.");
- const auto *PEdge = cast<PredicateWithEdge>(PB);
- return std::make_pair(PEdge->From, PEdge->To);
-}
-}
-
-namespace llvm {
-enum LocalNum {
- // Operations that must appear first in the block.
- LN_First,
- // Operations that are somewhere in the middle of the block, and are sorted on
- // demand.
- LN_Middle,
- // Operations that must appear last in a block, like successor phi node uses.
- LN_Last
-};
-
-// Associate global and local DFS info with defs and uses, so we can sort them
-// into a global domination ordering.
-struct ValueDFS {
- int DFSIn = 0;
- int DFSOut = 0;
- unsigned int LocalNum = LN_Middle;
- // Only one of Def or Use will be set.
- Value *Def = nullptr;
- Use *U = nullptr;
- // Neither PInfo nor EdgeOnly participate in the ordering
- PredicateBase *PInfo = nullptr;
- bool EdgeOnly = false;
-};
-
-// Perform a strict weak ordering on instructions and arguments.
-static bool valueComesBefore(const Value *A, const Value *B) {
- auto *ArgA = dyn_cast_or_null<Argument>(A);
- auto *ArgB = dyn_cast_or_null<Argument>(B);
- if (ArgA && !ArgB)
- return true;
- if (ArgB && !ArgA)
- return false;
- if (ArgA && ArgB)
- return ArgA->getArgNo() < ArgB->getArgNo();
- return cast<Instruction>(A)->comesBefore(cast<Instruction>(B));
-}
-
-// This compares ValueDFS structures. Doing so allows us to walk the minimum
-// number of instructions necessary to compute our def/use ordering.
-struct ValueDFS_Compare {
- DominatorTree &DT;
- ValueDFS_Compare(DominatorTree &DT) : DT(DT) {}
-
- bool operator()(const ValueDFS &A, const ValueDFS &B) const {
- if (&A == &B)
- return false;
-    // The only case we can't directly compare them is when they are in the same
- // block, and both have localnum == middle. In that case, we have to use
- // comesbefore to see what the real ordering is, because they are in the
- // same basic block.
-
- assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) &&
- "Equal DFS-in numbers imply equal out numbers");
- bool SameBlock = A.DFSIn == B.DFSIn;
-
- // We want to put the def that will get used for a given set of phi uses,
- // before those phi uses.
- // So we sort by edge, then by def.
-    // Note that only phi node uses and defs can come last.
- if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
- return comparePHIRelated(A, B);
-
- bool isADef = A.Def;
- bool isBDef = B.Def;
- if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
- return std::tie(A.DFSIn, A.LocalNum, isADef) <
- std::tie(B.DFSIn, B.LocalNum, isBDef);
- return localComesBefore(A, B);
- }
-
- // For a phi use, or a non-materialized def, return the edge it represents.
- const std::pair<BasicBlock *, BasicBlock *>
- getBlockEdge(const ValueDFS &VD) const {
- if (!VD.Def && VD.U) {
- auto *PHI = cast<PHINode>(VD.U->getUser());
- return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
- }
- // This is really a non-materialized def.
- return ::getBlockEdge(VD.PInfo);
- }
-
- // For two phi related values, return the ordering.
- bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
- BasicBlock *ASrc, *ADest, *BSrc, *BDest;
- std::tie(ASrc, ADest) = getBlockEdge(A);
- std::tie(BSrc, BDest) = getBlockEdge(B);
-
-#ifndef NDEBUG
- // This function should only be used for values in the same BB, check that.
- DomTreeNode *DomASrc = DT.getNode(ASrc);
- DomTreeNode *DomBSrc = DT.getNode(BSrc);
- assert(DomASrc->getDFSNumIn() == (unsigned)A.DFSIn &&
- "DFS numbers for A should match the ones of the source block");
- assert(DomBSrc->getDFSNumIn() == (unsigned)B.DFSIn &&
- "DFS numbers for B should match the ones of the source block");
- assert(A.DFSIn == B.DFSIn && "Values must be in the same block");
-#endif
- (void)ASrc;
- (void)BSrc;
-
- // Use DFS numbers to compare destination blocks, to guarantee a
- // deterministic order.
- DomTreeNode *DomADest = DT.getNode(ADest);
- DomTreeNode *DomBDest = DT.getNode(BDest);
- unsigned AIn = DomADest->getDFSNumIn();
- unsigned BIn = DomBDest->getDFSNumIn();
- bool isADef = A.Def;
- bool isBDef = B.Def;
- assert((!A.Def || !A.U) && (!B.Def || !B.U) &&
- "Def and U cannot be set at the same time");
- // Now sort by edge destination and then defs before uses.
- return std::tie(AIn, isADef) < std::tie(BIn, isBDef);
- }
-
- // Get the definition of an instruction that occurs in the middle of a block.
- Value *getMiddleDef(const ValueDFS &VD) const {
- if (VD.Def)
- return VD.Def;
- // It's possible for the defs and uses to be null. For branches, the local
-    // numbering will say the placed predicateinfos should go first (i.e.,
-    // LN_First), so we won't be in this function. For assumes, we will end
-    // up here, because we need to order the def we will place relative to the
- // assume. So for the purpose of ordering, we pretend the def is right
- // after the assume, because that is where we will insert the info.
- if (!VD.U) {
- assert(VD.PInfo &&
- "No def, no use, and no predicateinfo should not occur");
- assert(isa<PredicateAssume>(VD.PInfo) &&
- "Middle of block should only occur for assumes");
- return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode();
- }
- return nullptr;
- }
-
- // Return either the Def, if it's not null, or the user of the Use, if the def
- // is null.
- const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
- if (Def)
- return cast<Instruction>(Def);
- return cast<Instruction>(U->getUser());
- }
-
- // This performs the necessary local basic block ordering checks to tell
- // whether A comes before B, where both are in the same basic block.
- bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
- auto *ADef = getMiddleDef(A);
- auto *BDef = getMiddleDef(B);
-
- // See if we have real values or uses. If we have real values, we are
- // guaranteed they are instructions or arguments. No matter what, we are
- // guaranteed they are in the same block if they are instructions.
- auto *ArgA = dyn_cast_or_null<Argument>(ADef);
- auto *ArgB = dyn_cast_or_null<Argument>(BDef);
-
- if (ArgA || ArgB)
- return valueComesBefore(ArgA, ArgB);
-
- auto *AInst = getDefOrUser(ADef, A.U);
- auto *BInst = getDefOrUser(BDef, B.U);
- return valueComesBefore(AInst, BInst);
- }
-};
-
-class PredicateInfoBuilder {
- // Used to store information about each value we might rename.
- struct ValueInfo {
- SmallVector<PredicateBase *, 4> Infos;
- };
-
- PredicateInfo &PI;
- Function &F;
- DominatorTree &DT;
- AssumptionCache &AC;
-
- // This stores info about each operand or comparison result we make copies
-  // of. The real ValueInfos start at index 1; index 0 is unused so that we
- // can more easily detect invalid indexing.
- SmallVector<ValueInfo, 32> ValueInfos;
-
- // This gives the index into the ValueInfos array for a given Value. Because
- // 0 is not a valid Value Info index, you can use DenseMap::lookup and tell
- // whether it returned a valid result.
- DenseMap<Value *, unsigned int> ValueInfoNums;
-
- // The set of edges along which we can only handle phi uses, due to critical
- // edges.
- DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
-
- ValueInfo &getOrCreateValueInfo(Value *);
- const ValueInfo &getValueInfo(Value *) const;
-
- void processAssume(IntrinsicInst *, BasicBlock *,
- SmallVectorImpl<Value *> &OpsToRename);
- void processBranch(BranchInst *, BasicBlock *,
- SmallVectorImpl<Value *> &OpsToRename);
- void processSwitch(SwitchInst *, BasicBlock *,
- SmallVectorImpl<Value *> &OpsToRename);
- void renameUses(SmallVectorImpl<Value *> &OpsToRename);
- void addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op,
- PredicateBase *PB);
-
- typedef SmallVectorImpl<ValueDFS> ValueDFSStack;
- void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &);
- Value *materializeStack(unsigned int &, ValueDFSStack &, Value *);
- bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const;
- void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &);
-
-public:
- PredicateInfoBuilder(PredicateInfo &PI, Function &F, DominatorTree &DT,
- AssumptionCache &AC)
- : PI(PI), F(F), DT(DT), AC(AC) {
- // Push an empty operand info so that we can detect 0 as not finding one
- ValueInfos.resize(1);
- }
-
- void buildPredicateInfo();
-};
-
-bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
- const ValueDFS &VDUse) const {
- if (Stack.empty())
- return false;
- // If it's a phi only use, make sure it's for this phi node edge, and that the
- // use is in a phi node. If it's anything else, and the top of the stack is
- // EdgeOnly, we need to pop the stack. We deliberately sort phi uses next to
- // the defs they must go with so that we can know it's time to pop the stack
- // when we hit the end of the phi uses for a given def.
- if (Stack.back().EdgeOnly) {
- if (!VDUse.U)
- return false;
- auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
- if (!PHI)
- return false;
- // Check edge
- BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
- if (EdgePred != getBranchBlock(Stack.back().PInfo))
- return false;
-
- // Use dominates, which knows how to handle edge dominance.
- return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
- }
-
- return (VDUse.DFSIn >= Stack.back().DFSIn &&
- VDUse.DFSOut <= Stack.back().DFSOut);
-}
-
-void PredicateInfoBuilder::popStackUntilDFSScope(ValueDFSStack &Stack,
- const ValueDFS &VD) {
- while (!Stack.empty() && !stackIsInScope(Stack, VD))
- Stack.pop_back();
-}
-
-// Convert the uses of Op into a vector of uses, associating global and local
-// DFS info with each one.
-void PredicateInfoBuilder::convertUsesToDFSOrdered(
- Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
- for (auto &U : Op->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- ValueDFS VD;
- // Put the phi node uses in the incoming block.
- BasicBlock *IBlock;
- if (auto *PN = dyn_cast<PHINode>(I)) {
- IBlock = PN->getIncomingBlock(U);
- // Make phi node users appear last in the incoming block
- // they are from.
- VD.LocalNum = LN_Last;
- } else {
- // If it's not a phi node use, it is somewhere in the middle of the
- // block.
- IBlock = I->getParent();
- VD.LocalNum = LN_Middle;
- }
- DomTreeNode *DomNode = DT.getNode(IBlock);
- // It's possible our use is in an unreachable block. Skip it if so.
- if (!DomNode)
- continue;
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.U = &U;
- DFSOrderedSet.push_back(VD);
- }
- }
-}
-
+namespace {
+// Given a predicate info that is a type of branching terminator, get the
+// branching block.
+const BasicBlock *getBranchBlock(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Only branches and switches should have PHIOnly defs that "
+ "require branch blocks.");
+ return cast<PredicateWithEdge>(PB)->From;
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// branching terminator.
+static Instruction *getBranchTerminator(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get a terminator from.");
+ return cast<PredicateWithEdge>(PB)->From->getTerminator();
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// edge this predicate info represents
+const std::pair<BasicBlock *, BasicBlock *>
+getBlockEdge(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get an edge from.");
+ const auto *PEdge = cast<PredicateWithEdge>(PB);
+ return std::make_pair(PEdge->From, PEdge->To);
+}
+}
+
+namespace llvm {
+enum LocalNum {
+ // Operations that must appear first in the block.
+ LN_First,
+ // Operations that are somewhere in the middle of the block, and are sorted on
+ // demand.
+ LN_Middle,
+ // Operations that must appear last in a block, like successor phi node uses.
+ LN_Last
+};
+
+// Associate global and local DFS info with defs and uses, so we can sort them
+// into a global domination ordering.
+struct ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ unsigned int LocalNum = LN_Middle;
+ // Only one of Def or Use will be set.
+ Value *Def = nullptr;
+ Use *U = nullptr;
+ // Neither PInfo nor EdgeOnly participate in the ordering
+ PredicateBase *PInfo = nullptr;
+ bool EdgeOnly = false;
+};
+
+// Perform a strict weak ordering on instructions and arguments.
+static bool valueComesBefore(const Value *A, const Value *B) {
+ auto *ArgA = dyn_cast_or_null<Argument>(A);
+ auto *ArgB = dyn_cast_or_null<Argument>(B);
+ if (ArgA && !ArgB)
+ return true;
+ if (ArgB && !ArgA)
+ return false;
+ if (ArgA && ArgB)
+ return ArgA->getArgNo() < ArgB->getArgNo();
+ return cast<Instruction>(A)->comesBefore(cast<Instruction>(B));
+}
+
+// This compares ValueDFS structures. Doing so allows us to walk the minimum
+// number of instructions necessary to compute our def/use ordering.
+struct ValueDFS_Compare {
+ DominatorTree &DT;
+ ValueDFS_Compare(DominatorTree &DT) : DT(DT) {}
+
+ bool operator()(const ValueDFS &A, const ValueDFS &B) const {
+ if (&A == &B)
+ return false;
+    // The only case we can't directly compare them is when they are in the same
+ // block, and both have localnum == middle. In that case, we have to use
+ // comesbefore to see what the real ordering is, because they are in the
+ // same basic block.
+
+ assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) &&
+ "Equal DFS-in numbers imply equal out numbers");
+ bool SameBlock = A.DFSIn == B.DFSIn;
+
+ // We want to put the def that will get used for a given set of phi uses,
+ // before those phi uses.
+ // So we sort by edge, then by def.
+    // Note that only phi node uses and defs can come last.
+ if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
+ return comparePHIRelated(A, B);
+
+ bool isADef = A.Def;
+ bool isBDef = B.Def;
+ if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
+ return std::tie(A.DFSIn, A.LocalNum, isADef) <
+ std::tie(B.DFSIn, B.LocalNum, isBDef);
+ return localComesBefore(A, B);
+ }
+
+ // For a phi use, or a non-materialized def, return the edge it represents.
+ const std::pair<BasicBlock *, BasicBlock *>
+ getBlockEdge(const ValueDFS &VD) const {
+ if (!VD.Def && VD.U) {
+ auto *PHI = cast<PHINode>(VD.U->getUser());
+ return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
+ }
+ // This is really a non-materialized def.
+ return ::getBlockEdge(VD.PInfo);
+ }
+
+ // For two phi related values, return the ordering.
+ bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
+ BasicBlock *ASrc, *ADest, *BSrc, *BDest;
+ std::tie(ASrc, ADest) = getBlockEdge(A);
+ std::tie(BSrc, BDest) = getBlockEdge(B);
+
+#ifndef NDEBUG
+ // This function should only be used for values in the same BB, check that.
+ DomTreeNode *DomASrc = DT.getNode(ASrc);
+ DomTreeNode *DomBSrc = DT.getNode(BSrc);
+ assert(DomASrc->getDFSNumIn() == (unsigned)A.DFSIn &&
+ "DFS numbers for A should match the ones of the source block");
+ assert(DomBSrc->getDFSNumIn() == (unsigned)B.DFSIn &&
+ "DFS numbers for B should match the ones of the source block");
+ assert(A.DFSIn == B.DFSIn && "Values must be in the same block");
+#endif
+ (void)ASrc;
+ (void)BSrc;
+
+ // Use DFS numbers to compare destination blocks, to guarantee a
+ // deterministic order.
+ DomTreeNode *DomADest = DT.getNode(ADest);
+ DomTreeNode *DomBDest = DT.getNode(BDest);
+ unsigned AIn = DomADest->getDFSNumIn();
+ unsigned BIn = DomBDest->getDFSNumIn();
+ bool isADef = A.Def;
+ bool isBDef = B.Def;
+ assert((!A.Def || !A.U) && (!B.Def || !B.U) &&
+ "Def and U cannot be set at the same time");
+ // Now sort by edge destination and then defs before uses.
+ return std::tie(AIn, isADef) < std::tie(BIn, isBDef);
+ }
+
+ // Get the definition of an instruction that occurs in the middle of a block.
+ Value *getMiddleDef(const ValueDFS &VD) const {
+ if (VD.Def)
+ return VD.Def;
+ // It's possible for the defs and uses to be null. For branches, the local
+    // numbering will say the placed predicateinfos should go first (i.e.,
+    // LN_First), so we won't be in this function. For assumes, we will end
+    // up here, because we need to order the def we will place relative to the
+ // assume. So for the purpose of ordering, we pretend the def is right
+ // after the assume, because that is where we will insert the info.
+ if (!VD.U) {
+ assert(VD.PInfo &&
+ "No def, no use, and no predicateinfo should not occur");
+ assert(isa<PredicateAssume>(VD.PInfo) &&
+ "Middle of block should only occur for assumes");
+ return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode();
+ }
+ return nullptr;
+ }
+
+ // Return either the Def, if it's not null, or the user of the Use, if the def
+ // is null.
+ const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
+ if (Def)
+ return cast<Instruction>(Def);
+ return cast<Instruction>(U->getUser());
+ }
+
+ // This performs the necessary local basic block ordering checks to tell
+ // whether A comes before B, where both are in the same basic block.
+ bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
+ auto *ADef = getMiddleDef(A);
+ auto *BDef = getMiddleDef(B);
+
+ // See if we have real values or uses. If we have real values, we are
+ // guaranteed they are instructions or arguments. No matter what, we are
+ // guaranteed they are in the same block if they are instructions.
+ auto *ArgA = dyn_cast_or_null<Argument>(ADef);
+ auto *ArgB = dyn_cast_or_null<Argument>(BDef);
+
+ if (ArgA || ArgB)
+ return valueComesBefore(ArgA, ArgB);
+
+ auto *AInst = getDefOrUser(ADef, A.U);
+ auto *BInst = getDefOrUser(BDef, B.U);
+ return valueComesBefore(AInst, BInst);
+ }
+};
+
+class PredicateInfoBuilder {
+ // Used to store information about each value we might rename.
+ struct ValueInfo {
+ SmallVector<PredicateBase *, 4> Infos;
+ };
+
+ PredicateInfo &PI;
+ Function &F;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+
+ // This stores info about each operand or comparison result we make copies
+  // of. The real ValueInfos start at index 1; index 0 is unused so that we
+ // can more easily detect invalid indexing.
+ SmallVector<ValueInfo, 32> ValueInfos;
+
+ // This gives the index into the ValueInfos array for a given Value. Because
+ // 0 is not a valid Value Info index, you can use DenseMap::lookup and tell
+ // whether it returned a valid result.
+ DenseMap<Value *, unsigned int> ValueInfoNums;
+
+ // The set of edges along which we can only handle phi uses, due to critical
+ // edges.
+ DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
+
+ ValueInfo &getOrCreateValueInfo(Value *);
+ const ValueInfo &getValueInfo(Value *) const;
+
+ void processAssume(IntrinsicInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void processBranch(BranchInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void processSwitch(SwitchInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void renameUses(SmallVectorImpl<Value *> &OpsToRename);
+ void addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op,
+ PredicateBase *PB);
+
+ typedef SmallVectorImpl<ValueDFS> ValueDFSStack;
+ void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &);
+ Value *materializeStack(unsigned int &, ValueDFSStack &, Value *);
+ bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const;
+ void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &);
+
+public:
+ PredicateInfoBuilder(PredicateInfo &PI, Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : PI(PI), F(F), DT(DT), AC(AC) {
+ // Push an empty operand info so that we can detect 0 as not finding one
+ ValueInfos.resize(1);
+ }
+
+ void buildPredicateInfo();
+};
+
+bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
+ const ValueDFS &VDUse) const {
+ if (Stack.empty())
+ return false;
+ // If it's a phi only use, make sure it's for this phi node edge, and that the
+ // use is in a phi node. If it's anything else, and the top of the stack is
+ // EdgeOnly, we need to pop the stack. We deliberately sort phi uses next to
+ // the defs they must go with so that we can know it's time to pop the stack
+ // when we hit the end of the phi uses for a given def.
+ if (Stack.back().EdgeOnly) {
+ if (!VDUse.U)
+ return false;
+ auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
+ if (!PHI)
+ return false;
+ // Check edge
+ BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
+ if (EdgePred != getBranchBlock(Stack.back().PInfo))
+ return false;
+
+ // Use dominates, which knows how to handle edge dominance.
+ return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
+ }
+
+ return (VDUse.DFSIn >= Stack.back().DFSIn &&
+ VDUse.DFSOut <= Stack.back().DFSOut);
+}
+
+void PredicateInfoBuilder::popStackUntilDFSScope(ValueDFSStack &Stack,
+ const ValueDFS &VD) {
+ while (!Stack.empty() && !stackIsInScope(Stack, VD))
+ Stack.pop_back();
+}
+
+// Convert the uses of Op into a vector of uses, associating global and local
+// DFS info with each one.
+void PredicateInfoBuilder::convertUsesToDFSOrdered(
+ Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+ for (auto &U : Op->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ ValueDFS VD;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *PN = dyn_cast<PHINode>(I)) {
+ IBlock = PN->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VD.LocalNum = LN_Last;
+ } else {
+ // If it's not a phi node use, it is somewhere in the middle of the
+ // block.
+ IBlock = I->getParent();
+ VD.LocalNum = LN_Middle;
+ }
+ DomTreeNode *DomNode = DT.getNode(IBlock);
+ // It's possible our use is in an unreachable block. Skip it if so.
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.U = &U;
+ DFSOrderedSet.push_back(VD);
+ }
+ }
+}
+
bool shouldRename(Value *V) {
// Only want real values, not constants. Additionally, operands with one use
// are only being used in the comparison, which means they will not be useful
@@ -378,33 +378,33 @@ bool shouldRename(Value *V) {
return (isa<Instruction>(V) || isa<Argument>(V)) && !V->hasOneUse();
}
-// Collect relevant operations from Comparison that we may want to insert copies
-// for.
-void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
- auto *Op0 = Comparison->getOperand(0);
- auto *Op1 = Comparison->getOperand(1);
- if (Op0 == Op1)
- return;
+// Collect relevant operations from Comparison that we may want to insert copies
+// for.
+void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
+ auto *Op0 = Comparison->getOperand(0);
+ auto *Op1 = Comparison->getOperand(1);
+ if (Op0 == Op1)
+ return;
CmpOperands.push_back(Op0);
CmpOperands.push_back(Op1);
-}
-
-// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
-void PredicateInfoBuilder::addInfoFor(SmallVectorImpl<Value *> &OpsToRename,
- Value *Op, PredicateBase *PB) {
- auto &OperandInfo = getOrCreateValueInfo(Op);
- if (OperandInfo.Infos.empty())
- OpsToRename.push_back(Op);
- PI.AllInfos.push_back(PB);
- OperandInfo.Infos.push_back(PB);
-}
-
-// Process an assume instruction and place relevant operations we want to rename
-// into OpsToRename.
-void PredicateInfoBuilder::processAssume(
- IntrinsicInst *II, BasicBlock *AssumeBB,
- SmallVectorImpl<Value *> &OpsToRename) {
+}
+
+// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
+void PredicateInfoBuilder::addInfoFor(SmallVectorImpl<Value *> &OpsToRename,
+ Value *Op, PredicateBase *PB) {
+ auto &OperandInfo = getOrCreateValueInfo(Op);
+ if (OperandInfo.Infos.empty())
+ OpsToRename.push_back(Op);
+ PI.AllInfos.push_back(PB);
+ OperandInfo.Infos.push_back(PB);
+}
+
+// Process an assume instruction and place relevant operations we want to rename
+// into OpsToRename.
+void PredicateInfoBuilder::processAssume(
+ IntrinsicInst *II, BasicBlock *AssumeBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
SmallVector<Value *, 4> Worklist;
SmallPtrSet<Value *, 4> Visited;
Worklist.push_back(II->getOperand(0));
@@ -414,7 +414,7 @@ void PredicateInfoBuilder::processAssume(
continue;
if (Visited.size() > MaxCondsPerBranch)
break;
-
+
Value *Op0, *Op1;
if (match(Cond, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) {
Worklist.push_back(Op1);
@@ -430,19 +430,19 @@ void PredicateInfoBuilder::processAssume(
if (shouldRename(V)) {
auto *PA = new PredicateAssume(V, II, Cond);
addInfoFor(OpsToRename, V, PA);
- }
- }
- }
-}
-
-// Process a block terminating branch, and place relevant operations to be
-// renamed into OpsToRename.
-void PredicateInfoBuilder::processBranch(
- BranchInst *BI, BasicBlock *BranchBB,
- SmallVectorImpl<Value *> &OpsToRename) {
- BasicBlock *FirstBB = BI->getSuccessor(0);
- BasicBlock *SecondBB = BI->getSuccessor(1);
-
+ }
+ }
+ }
+}
+
+// Process a block terminating branch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfoBuilder::processBranch(
+ BranchInst *BI, BasicBlock *BranchBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
+ BasicBlock *FirstBB = BI->getSuccessor(0);
+ BasicBlock *SecondBB = BI->getSuccessor(1);
+
for (BasicBlock *Succ : {FirstBB, SecondBB}) {
bool TakenEdge = Succ == FirstBB;
// Don't try to insert on a self-edge. This is mainly because we will
@@ -456,10 +456,10 @@ void PredicateInfoBuilder::processBranch(
while (!Worklist.empty()) {
Value *Cond = Worklist.pop_back_val();
if (!Visited.insert(Cond).second)
- continue;
+ continue;
if (Visited.size() > MaxCondsPerBranch)
break;
-
+
Value *Op0, *Op1;
if (TakenEdge ? match(Cond, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))
: match(Cond, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) {
@@ -481,324 +481,324 @@ void PredicateInfoBuilder::processBranch(
EdgeUsesOnly.insert({BranchBB, Succ});
}
}
- }
- }
-}
-// Process a block terminating switch, and place relevant operations to be
-// renamed into OpsToRename.
-void PredicateInfoBuilder::processSwitch(
- SwitchInst *SI, BasicBlock *BranchBB,
- SmallVectorImpl<Value *> &OpsToRename) {
- Value *Op = SI->getCondition();
- if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
- return;
-
- // Remember how many outgoing edges there are to every successor.
- SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
- for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
- BasicBlock *TargetBlock = SI->getSuccessor(i);
- ++SwitchEdges[TargetBlock];
- }
-
- // Now propagate info for each case value
- for (auto C : SI->cases()) {
- BasicBlock *TargetBlock = C.getCaseSuccessor();
- if (SwitchEdges.lookup(TargetBlock) == 1) {
- PredicateSwitch *PS = new PredicateSwitch(
- Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
- addInfoFor(OpsToRename, Op, PS);
- if (!TargetBlock->getSinglePredecessor())
- EdgeUsesOnly.insert({BranchBB, TargetBlock});
- }
- }
-}
-
-// Build predicate info for our function
-void PredicateInfoBuilder::buildPredicateInfo() {
- DT.updateDFSNumbers();
- // Collect operands to rename from all conditional branch terminators, as well
- // as assume statements.
- SmallVector<Value *, 8> OpsToRename;
- for (auto DTN : depth_first(DT.getRootNode())) {
- BasicBlock *BranchBB = DTN->getBlock();
- if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
- if (!BI->isConditional())
- continue;
- // Can't insert conditional information if they all go to the same place.
- if (BI->getSuccessor(0) == BI->getSuccessor(1))
- continue;
- processBranch(BI, BranchBB, OpsToRename);
- } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
- processSwitch(SI, BranchBB, OpsToRename);
- }
- }
- for (auto &Assume : AC.assumptions()) {
- if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
- if (DT.isReachableFromEntry(II->getParent()))
- processAssume(II, II->getParent(), OpsToRename);
- }
- // Now rename all our operations.
- renameUses(OpsToRename);
-}
-
-// Create an ssa_copy declaration with custom mangling, because
-// Intrinsic::getDeclaration does not handle overloaded unnamed types properly:
-// all unnamed types get mangled to the same string. We use the pointer
-// to the type as name here, as it guarantees unique names for different
-// types and we remove the declarations when destroying PredicateInfo.
-// It is a workaround for PR38117, because solving it in a fully general way is
-// tricky (FIXME).
-static Function *getCopyDeclaration(Module *M, Type *Ty) {
- std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty);
- return cast<Function>(
- M->getOrInsertFunction(Name,
- getType(M->getContext(), Intrinsic::ssa_copy, Ty))
- .getCallee());
-}
-
-// Given the renaming stack, make all the operands currently on the stack real
-// by inserting them into the IR. Return the last operation's value.
-Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
- ValueDFSStack &RenameStack,
- Value *OrigOp) {
- // Find the first thing we have to materialize
- auto RevIter = RenameStack.rbegin();
- for (; RevIter != RenameStack.rend(); ++RevIter)
- if (RevIter->Def)
- break;
-
- size_t Start = RevIter - RenameStack.rbegin();
- // The maximum number of things we should be trying to materialize at once
-  // right now is 4, depending on whether we had an assume, a branch, and
-  // whether both used and-of conditions.
- for (auto RenameIter = RenameStack.end() - Start;
- RenameIter != RenameStack.end(); ++RenameIter) {
- auto *Op =
- RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
- ValueDFS &Result = *RenameIter;
- auto *ValInfo = Result.PInfo;
- ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
- ? OrigOp
- : (RenameStack.end() - Start - 1)->Def;
- // For edge predicates, we can just place the operand in the block before
- // the terminator. For assume, we have to place it right before the assume
- // to ensure we dominate all of our uses. Always insert right before the
- // relevant instruction (terminator, assume), so that we insert in proper
- // order in the case of multiple predicateinfo in the same block.
- if (isa<PredicateWithEdge>(ValInfo)) {
- IRBuilder<> B(getBranchTerminator(ValInfo));
- Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
- if (IF->users().empty())
- PI.CreatedDeclarations.insert(IF);
- CallInst *PIC =
- B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
- PI.PredicateMap.insert({PIC, ValInfo});
- Result.Def = PIC;
- } else {
- auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
- assert(PAssume &&
- "Should not have gotten here without it being an assume");
- // Insert the predicate directly after the assume. While it also holds
- // directly before it, assume(i1 true) is not a useful fact.
- IRBuilder<> B(PAssume->AssumeInst->getNextNode());
- Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
- if (IF->users().empty())
- PI.CreatedDeclarations.insert(IF);
- CallInst *PIC = B.CreateCall(IF, Op);
- PI.PredicateMap.insert({PIC, ValInfo});
- Result.Def = PIC;
- }
- }
- return RenameStack.back().Def;
-}
-
-// Instead of the standard SSA renaming algorithm, which is O(Number of
-// instructions), and walks the entire dominator tree, we walk only the defs +
-// uses. The standard SSA renaming algorithm does not really rely on the
-// dominator tree except to order the stack push/pops of the renaming stacks, so
-// that defs end up getting pushed before hitting the correct uses. This does
-// not require the dominator tree, only the *order* of the dominator tree. The
-// complete and correct ordering of the defs and uses in the dominator tree is
-// contained in the DFS numbering of the dominator tree. So we sort the defs and
-// uses into the DFS ordering, and then just use the renaming stack as per
-// normal, pushing when we hit a def (which is a predicateinfo instruction),
-// popping when we are out of the dfs scope for that def, and replacing any uses
-// with top of stack if it exists. In order to handle liveness without
-// propagating liveness info, we don't actually insert the predicateinfo
-// instruction def until we see a use that it would dominate. Once we see such
-// a use, we materialize the predicateinfo instruction in the right place and
-// use it.
-//
-// TODO: Use this algorithm to perform fast single-variable renaming in
-// promotememtoreg and memoryssa.
-void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
- ValueDFS_Compare Compare(DT);
- // Compute liveness, and rename in O(uses) per Op.
- for (auto *Op : OpsToRename) {
- LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");
- unsigned Counter = 0;
- SmallVector<ValueDFS, 16> OrderedUses;
- const auto &ValueInfo = getValueInfo(Op);
- // Insert the possible copies into the def/use list.
- // They will become real copies if we find a real use for them, and never
- // created otherwise.
- for (auto &PossibleCopy : ValueInfo.Infos) {
- ValueDFS VD;
- // Determine where we are going to place the copy by the copy type.
-      // The predicate info for branches always comes first; it will get
-      // materialized in the split block at the top of the block.
-      // The predicate info for assumes will be somewhere in the middle;
-      // it will get materialized in front of the assume.
- if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
- VD.LocalNum = LN_Middle;
- DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
- if (!DomNode)
- continue;
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.PInfo = PossibleCopy;
- OrderedUses.push_back(VD);
- } else if (isa<PredicateWithEdge>(PossibleCopy)) {
- // If we can only do phi uses, we treat it like it's in the branch
- // block, and handle it specially. We know that it goes last, and only
-        // dominates phi uses.
- auto BlockEdge = getBlockEdge(PossibleCopy);
- if (EdgeUsesOnly.count(BlockEdge)) {
- VD.LocalNum = LN_Last;
- auto *DomNode = DT.getNode(BlockEdge.first);
- if (DomNode) {
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.PInfo = PossibleCopy;
- VD.EdgeOnly = true;
- OrderedUses.push_back(VD);
- }
- } else {
- // Otherwise, we are in the split block (even though we perform
- // insertion in the branch block).
- // Insert a possible copy at the split block and before the branch.
- VD.LocalNum = LN_First;
- auto *DomNode = DT.getNode(BlockEdge.second);
- if (DomNode) {
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.PInfo = PossibleCopy;
- OrderedUses.push_back(VD);
- }
- }
- }
- }
-
- convertUsesToDFSOrdered(Op, OrderedUses);
- // Here we require a stable sort because we do not bother to try to
- // assign an order to the operands the uses represent. Thus, two
- // uses in the same instruction do not have a strict sort order
- // currently and will be considered equal. We could get rid of the
- // stable sort by creating one if we wanted.
- llvm::stable_sort(OrderedUses, Compare);
- SmallVector<ValueDFS, 8> RenameStack;
-    // For each use, sorted into DFS order, push values and replace uses with
-    // the top of the stack, which will represent the reaching def.
- for (auto &VD : OrderedUses) {
- // We currently do not materialize copy over copy, but we should decide if
- // we want to.
- bool PossibleCopy = VD.PInfo != nullptr;
- if (RenameStack.empty()) {
- LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");
- } else {
- LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
- << RenameStack.back().DFSIn << ","
- << RenameStack.back().DFSOut << ")\n");
- }
-
- LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
- << VD.DFSOut << ")\n");
-
- bool ShouldPush = (VD.Def || PossibleCopy);
- bool OutOfScope = !stackIsInScope(RenameStack, VD);
- if (OutOfScope || ShouldPush) {
- // Sync to our current scope.
- popStackUntilDFSScope(RenameStack, VD);
- if (ShouldPush) {
- RenameStack.push_back(VD);
- }
- }
- // If we get to this point, and the stack is empty we must have a use
- // with no renaming needed, just skip it.
- if (RenameStack.empty())
- continue;
- // Skip values, only want to rename the uses
- if (VD.Def || PossibleCopy)
- continue;
- if (!DebugCounter::shouldExecute(RenameCounter)) {
- LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
- continue;
- }
- ValueDFS &Result = RenameStack.back();
-
- // If the possible copy dominates something, materialize our stack up to
- // this point. This ensures every comparison that affects our operation
- // ends up with predicateinfo.
- if (!Result.Def)
- Result.Def = materializeStack(Counter, RenameStack, Op);
-
- LLVM_DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
- << *VD.U->get() << " in " << *(VD.U->getUser())
- << "\n");
- assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
- "Predicateinfo def should have dominated this use");
- VD.U->set(Result.Def);
- }
- }
-}
-
-PredicateInfoBuilder::ValueInfo &
-PredicateInfoBuilder::getOrCreateValueInfo(Value *Operand) {
- auto OIN = ValueInfoNums.find(Operand);
- if (OIN == ValueInfoNums.end()) {
- // This will grow it
- ValueInfos.resize(ValueInfos.size() + 1);
- // This will use the new size and give us a 0 based number of the info
- auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
- assert(InsertResult.second && "Value info number already existed?");
- return ValueInfos[InsertResult.first->second];
- }
- return ValueInfos[OIN->second];
-}
-
-const PredicateInfoBuilder::ValueInfo &
-PredicateInfoBuilder::getValueInfo(Value *Operand) const {
- auto OINI = ValueInfoNums.lookup(Operand);
- assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
- assert(OINI < ValueInfos.size() &&
- "Value Info Number greater than size of Value Info Table");
- return ValueInfos[OINI];
-}
-
-PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
- AssumptionCache &AC)
- : F(F) {
- PredicateInfoBuilder Builder(*this, F, DT, AC);
- Builder.buildPredicateInfo();
-}
-
-// Remove all declarations we created. The PredicateInfo consumers are
-// responsible for removing the ssa_copy calls created.
-PredicateInfo::~PredicateInfo() {
-  // Collect function pointers in a set first, as SmallSet uses a SmallVector
- // internally and we have to remove the asserting value handles first.
- SmallPtrSet<Function *, 20> FunctionPtrs;
- for (auto &F : CreatedDeclarations)
- FunctionPtrs.insert(&*F);
- CreatedDeclarations.clear();
-
- for (Function *F : FunctionPtrs) {
- assert(F->user_begin() == F->user_end() &&
- "PredicateInfo consumer did not remove all SSA copies.");
- F->eraseFromParent();
- }
-}
-
+ }
+ }
+}
+// Process a block terminating switch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfoBuilder::processSwitch(
+ SwitchInst *SI, BasicBlock *BranchBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
+ Value *Op = SI->getCondition();
+ if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
+ return;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ }
+
+ // Now propagate info for each case value
+ for (auto C : SI->cases()) {
+ BasicBlock *TargetBlock = C.getCaseSuccessor();
+ if (SwitchEdges.lookup(TargetBlock) == 1) {
+ PredicateSwitch *PS = new PredicateSwitch(
+ Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
+ addInfoFor(OpsToRename, Op, PS);
+ if (!TargetBlock->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, TargetBlock});
+ }
+ }
+}
+
+// Build predicate info for our function
+void PredicateInfoBuilder::buildPredicateInfo() {
+ DT.updateDFSNumbers();
+ // Collect operands to rename from all conditional branch terminators, as well
+ // as assume statements.
+ SmallVector<Value *, 8> OpsToRename;
+ for (auto DTN : depth_first(DT.getRootNode())) {
+ BasicBlock *BranchBB = DTN->getBlock();
+ if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
+ if (!BI->isConditional())
+ continue;
+ // Can't insert conditional information if they all go to the same place.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
+ processBranch(BI, BranchBB, OpsToRename);
+ } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
+ processSwitch(SI, BranchBB, OpsToRename);
+ }
+ }
+ for (auto &Assume : AC.assumptions()) {
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
+ if (DT.isReachableFromEntry(II->getParent()))
+ processAssume(II, II->getParent(), OpsToRename);
+ }
+ // Now rename all our operations.
+ renameUses(OpsToRename);
+}
+
+// Create an ssa_copy declaration with custom mangling, because
+// Intrinsic::getDeclaration does not handle overloaded unnamed types properly:
+// all unnamed types get mangled to the same string. We use the pointer
+// to the type as name here, as it guarantees unique names for different
+// types and we remove the declarations when destroying PredicateInfo.
+// It is a workaround for PR38117, because solving it in a fully general way is
+// tricky (FIXME).
+static Function *getCopyDeclaration(Module *M, Type *Ty) {
+ std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty);
+ return cast<Function>(
+ M->getOrInsertFunction(Name,
+ getType(M->getContext(), Intrinsic::ssa_copy, Ty))
+ .getCallee());
+}
+
+// Given the renaming stack, make all the operands currently on the stack real
+// by inserting them into the IR. Return the last operation's value.
+Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
+ ValueDFSStack &RenameStack,
+ Value *OrigOp) {
+ // Find the first thing we have to materialize
+ auto RevIter = RenameStack.rbegin();
+ for (; RevIter != RenameStack.rend(); ++RevIter)
+ if (RevIter->Def)
+ break;
+
+ size_t Start = RevIter - RenameStack.rbegin();
+ // The maximum number of things we should be trying to materialize at once
+  // right now is 4, depending on whether we had an assume, a branch, and
+  // whether both used and-of conditions.
+ for (auto RenameIter = RenameStack.end() - Start;
+ RenameIter != RenameStack.end(); ++RenameIter) {
+ auto *Op =
+ RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
+ ValueDFS &Result = *RenameIter;
+ auto *ValInfo = Result.PInfo;
+ ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
+ ? OrigOp
+ : (RenameStack.end() - Start - 1)->Def;
+ // For edge predicates, we can just place the operand in the block before
+ // the terminator. For assume, we have to place it right before the assume
+ // to ensure we dominate all of our uses. Always insert right before the
+ // relevant instruction (terminator, assume), so that we insert in proper
+ // order in the case of multiple predicateinfo in the same block.
+ if (isa<PredicateWithEdge>(ValInfo)) {
+ IRBuilder<> B(getBranchTerminator(ValInfo));
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->users().empty())
+ PI.CreatedDeclarations.insert(IF);
+ CallInst *PIC =
+ B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
+ PI.PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ } else {
+ auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
+ assert(PAssume &&
+ "Should not have gotten here without it being an assume");
+ // Insert the predicate directly after the assume. While it also holds
+ // directly before it, assume(i1 true) is not a useful fact.
+ IRBuilder<> B(PAssume->AssumeInst->getNextNode());
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->users().empty())
+ PI.CreatedDeclarations.insert(IF);
+ CallInst *PIC = B.CreateCall(IF, Op);
+ PI.PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ }
+ }
+ return RenameStack.back().Def;
+}
+
+// Instead of the standard SSA renaming algorithm, which is O(Number of
+// instructions), and walks the entire dominator tree, we walk only the defs +
+// uses. The standard SSA renaming algorithm does not really rely on the
+// dominator tree except to order the stack push/pops of the renaming stacks, so
+// that defs end up getting pushed before hitting the correct uses. This does
+// not require the dominator tree, only the *order* of the dominator tree. The
+// complete and correct ordering of the defs and uses in the dominator tree is
+// contained in the DFS numbering of the dominator tree. So we sort the defs and
+// uses into the DFS ordering, and then just use the renaming stack as per
+// normal, pushing when we hit a def (which is a predicateinfo instruction),
+// popping when we are out of the dfs scope for that def, and replacing any uses
+// with top of stack if it exists. In order to handle liveness without
+// propagating liveness info, we don't actually insert the predicateinfo
+// instruction def until we see a use that it would dominate. Once we see such
+// a use, we materialize the predicateinfo instruction in the right place and
+// use it.
+//
+// TODO: Use this algorithm to perform fast single-variable renaming in
+// promotememtoreg and memoryssa.
+void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
+ ValueDFS_Compare Compare(DT);
+ // Compute liveness, and rename in O(uses) per Op.
+ for (auto *Op : OpsToRename) {
+ LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");
+ unsigned Counter = 0;
+ SmallVector<ValueDFS, 16> OrderedUses;
+ const auto &ValueInfo = getValueInfo(Op);
+ // Insert the possible copies into the def/use list.
+ // They will become real copies if we find a real use for them, and never
+ // created otherwise.
+ for (auto &PossibleCopy : ValueInfo.Infos) {
+ ValueDFS VD;
+ // Determine where we are going to place the copy by the copy type.
+      // The predicate info for branches always comes first; it will get
+      // materialized in the split block at the top of the block.
+      // The predicate info for assumes will be somewhere in the middle;
+      // it will get materialized in front of the assume.
+ if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
+ VD.LocalNum = LN_Middle;
+ DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ } else if (isa<PredicateWithEdge>(PossibleCopy)) {
+ // If we can only do phi uses, we treat it like it's in the branch
+ // block, and handle it specially. We know that it goes last, and only
+        // dominates phi uses.
+ auto BlockEdge = getBlockEdge(PossibleCopy);
+ if (EdgeUsesOnly.count(BlockEdge)) {
+ VD.LocalNum = LN_Last;
+ auto *DomNode = DT.getNode(BlockEdge.first);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ VD.EdgeOnly = true;
+ OrderedUses.push_back(VD);
+ }
+ } else {
+ // Otherwise, we are in the split block (even though we perform
+ // insertion in the branch block).
+ // Insert a possible copy at the split block and before the branch.
+ VD.LocalNum = LN_First;
+ auto *DomNode = DT.getNode(BlockEdge.second);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ }
+ }
+ }
+ }
+
+ convertUsesToDFSOrdered(Op, OrderedUses);
+ // Here we require a stable sort because we do not bother to try to
+ // assign an order to the operands the uses represent. Thus, two
+ // uses in the same instruction do not have a strict sort order
+ // currently and will be considered equal. We could get rid of the
+ // stable sort by creating one if we wanted.
+ llvm::stable_sort(OrderedUses, Compare);
+ SmallVector<ValueDFS, 8> RenameStack;
+    // For each use, sorted into DFS order, push values and replace uses with
+    // the top of the stack, which will represent the reaching def.
+ for (auto &VD : OrderedUses) {
+ // We currently do not materialize copy over copy, but we should decide if
+ // we want to.
+ bool PossibleCopy = VD.PInfo != nullptr;
+ if (RenameStack.empty()) {
+ LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+ << RenameStack.back().DFSIn << ","
+ << RenameStack.back().DFSOut << ")\n");
+ }
+
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+ << VD.DFSOut << ")\n");
+
+ bool ShouldPush = (VD.Def || PossibleCopy);
+ bool OutOfScope = !stackIsInScope(RenameStack, VD);
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ popStackUntilDFSScope(RenameStack, VD);
+ if (ShouldPush) {
+ RenameStack.push_back(VD);
+ }
+ }
+ // If we get to this point, and the stack is empty we must have a use
+ // with no renaming needed, just skip it.
+ if (RenameStack.empty())
+ continue;
+ // Skip values, only want to rename the uses
+ if (VD.Def || PossibleCopy)
+ continue;
+ if (!DebugCounter::shouldExecute(RenameCounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+ continue;
+ }
+ ValueDFS &Result = RenameStack.back();
+
+ // If the possible copy dominates something, materialize our stack up to
+ // this point. This ensures every comparison that affects our operation
+ // ends up with predicateinfo.
+ if (!Result.Def)
+ Result.Def = materializeStack(Counter, RenameStack, Op);
+
+ LLVM_DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+ << *VD.U->get() << " in " << *(VD.U->getUser())
+ << "\n");
+ assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
+ "Predicateinfo def should have dominated this use");
+ VD.U->set(Result.Def);
+ }
+ }
+}
+
+PredicateInfoBuilder::ValueInfo &
+PredicateInfoBuilder::getOrCreateValueInfo(Value *Operand) {
+ auto OIN = ValueInfoNums.find(Operand);
+ if (OIN == ValueInfoNums.end()) {
+ // This will grow it
+ ValueInfos.resize(ValueInfos.size() + 1);
+ // This will use the new size and give us a 0 based number of the info
+ auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
+ assert(InsertResult.second && "Value info number already existed?");
+ return ValueInfos[InsertResult.first->second];
+ }
+ return ValueInfos[OIN->second];
+}
+
+const PredicateInfoBuilder::ValueInfo &
+PredicateInfoBuilder::getValueInfo(Value *Operand) const {
+ auto OINI = ValueInfoNums.lookup(Operand);
+ assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
+ assert(OINI < ValueInfos.size() &&
+ "Value Info Number greater than size of Value Info Table");
+ return ValueInfos[OINI];
+}
+
+PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : F(F) {
+ PredicateInfoBuilder Builder(*this, F, DT, AC);
+ Builder.buildPredicateInfo();
+}
+
+// Remove all declarations we created. The PredicateInfo consumers are
+// responsible for removing the ssa_copy calls created.
+PredicateInfo::~PredicateInfo() {
+ // Collect function pointers in set first, as SmallSet uses a SmallVector
+ // internally and we have to remove the asserting value handles first.
+ SmallPtrSet<Function *, 20> FunctionPtrs;
+ for (auto &F : CreatedDeclarations)
+ FunctionPtrs.insert(&*F);
+ CreatedDeclarations.clear();
+
+ for (Function *F : FunctionPtrs) {
+ assert(F->user_begin() == F->user_end() &&
+ "PredicateInfo consumer did not remove all SSA copies.");
+ F->eraseFromParent();
+ }
+}
+
Optional<PredicateConstraint> PredicateBase::getConstraint() const {
switch (Type) {
case PT_Assume:
@@ -849,117 +849,117 @@ Optional<PredicateConstraint> PredicateBase::getConstraint() const {
llvm_unreachable("Unknown predicate type");
}
-void PredicateInfo::verifyPredicateInfo() const {}
-
-char PredicateInfoPrinterLegacyPass::ID = 0;
-
-PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
- : FunctionPass(ID) {
- initializePredicateInfoPrinterLegacyPassPass(
- *PassRegistry::getPassRegistry());
-}
-
-void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequiredTransitive<DominatorTreeWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
-}
-
-// Replace ssa_copy calls created by PredicateInfo with their operand.
-static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
- for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
- Instruction *Inst = &*I++;
- const auto *PI = PredInfo.getPredicateInfoFor(Inst);
- auto *II = dyn_cast<IntrinsicInst>(Inst);
- if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
- continue;
-
- Inst->replaceAllUsesWith(II->getOperand(0));
- Inst->eraseFromParent();
- }
-}
-
-bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
- PredInfo->print(dbgs());
- if (VerifyPredicateInfo)
- PredInfo->verifyPredicateInfo();
-
- replaceCreatedSSACopys(*PredInfo, F);
- return false;
-}
-
-PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- OS << "PredicateInfo for function: " << F.getName() << "\n";
- auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
- PredInfo->print(OS);
-
- replaceCreatedSSACopys(*PredInfo, F);
- return PreservedAnalyses::all();
-}
-
-/// An assembly annotator class to print PredicateInfo information in
-/// comments.
-class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
- friend class PredicateInfo;
- const PredicateInfo *PredInfo;
-
-public:
- PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
-
- void emitBasicBlockStartAnnot(const BasicBlock *BB,
- formatted_raw_ostream &OS) override {}
-
- void emitInstructionAnnot(const Instruction *I,
- formatted_raw_ostream &OS) override {
- if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
- OS << "; Has predicate info\n";
- if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
- OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
- << " Comparison:" << *PB->Condition << " Edge: [";
- PB->From->printAsOperand(OS);
- OS << ",";
- PB->To->printAsOperand(OS);
- OS << "]";
- } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
- OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
- << " Switch:" << *PS->Switch << " Edge: [";
- PS->From->printAsOperand(OS);
- OS << ",";
- PS->To->printAsOperand(OS);
- OS << "]";
- } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
- OS << "; assume predicate info {"
- << " Comparison:" << *PA->Condition;
- }
- OS << ", RenamedOp: ";
- PI->RenamedOp->printAsOperand(OS, false);
- OS << " }\n";
- }
- }
-};
-
-void PredicateInfo::print(raw_ostream &OS) const {
- PredicateInfoAnnotatedWriter Writer(this);
- F.print(OS, &Writer);
-}
-
-void PredicateInfo::dump() const {
- PredicateInfoAnnotatedWriter Writer(this);
- F.print(dbgs(), &Writer);
-}
-
-PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- std::make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
-
- return PreservedAnalyses::all();
-}
-}
+void PredicateInfo::verifyPredicateInfo() const {}
+
+char PredicateInfoPrinterLegacyPass::ID = 0;
+
+PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
+ : FunctionPass(ID) {
+ initializePredicateInfoPrinterLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+}
+
+// Replace ssa_copy calls created by PredicateInfo with their operand.
+static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
+ for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
+ Instruction *Inst = &*I++;
+ const auto *PI = PredInfo.getPredicateInfoFor(Inst);
+ auto *II = dyn_cast<IntrinsicInst>(Inst);
+ if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
+ continue;
+
+ Inst->replaceAllUsesWith(II->getOperand(0));
+ Inst->eraseFromParent();
+ }
+}
+
+bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(dbgs());
+ if (VerifyPredicateInfo)
+ PredInfo->verifyPredicateInfo();
+
+ replaceCreatedSSACopys(*PredInfo, F);
+ return false;
+}
+
+PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ OS << "PredicateInfo for function: " << F.getName() << "\n";
+ auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(OS);
+
+ replaceCreatedSSACopys(*PredInfo, F);
+ return PreservedAnalyses::all();
+}
+
+/// An assembly annotator class to print PredicateInfo information in
+/// comments.
+class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+ friend class PredicateInfo;
+ const PredicateInfo *PredInfo;
+
+public:
+ PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
+
+ void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) override {}
+
+ void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) override {
+ if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
+ OS << "; Has predicate info\n";
+ if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
+ OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
+ << " Comparison:" << *PB->Condition << " Edge: [";
+ PB->From->printAsOperand(OS);
+ OS << ",";
+ PB->To->printAsOperand(OS);
+ OS << "]";
+ } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
+ OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
+ << " Switch:" << *PS->Switch << " Edge: [";
+ PS->From->printAsOperand(OS);
+ OS << ",";
+ PS->To->printAsOperand(OS);
+ OS << "]";
+ } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
+ OS << "; assume predicate info {"
+ << " Comparison:" << *PA->Condition;
+ }
+ OS << ", RenamedOp: ";
+ PI->RenamedOp->printAsOperand(OS, false);
+ OS << " }\n";
+ }
+ }
+};
+
+void PredicateInfo::print(raw_ostream &OS) const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(OS, &Writer);
+}
+
+void PredicateInfo::dump() const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(dbgs(), &Writer);
+}
+
+PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ std::make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
+
+ return PreservedAnalyses::all();
+}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index da5405593b..86bbb6a889 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -1,428 +1,428 @@
-//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file promotes memory references to be register references. It promotes
-// alloca instructions which only have loads and stores as uses. An alloca is
-// transformed by using iterated dominator frontiers to place PHI nodes, then
-// traversing the function in depth-first order to rewrite loads and stores as
-// appropriate.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mem2reg"
-
-STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
-STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
-STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
-STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
-
-bool llvm::isAllocaPromotable(const AllocaInst *AI) {
- // Only allow direct and non-volatile loads and stores...
- for (const User *U : AI->users()) {
- if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
- // Note that atomic loads can be transformed; atomic semantics do
- // not have any meaning for a local alloca.
- if (LI->isVolatile())
- return false;
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getOperand(0) == AI)
- return false; // Don't allow a store OF the AI, only INTO the AI.
- // Note that atomic stores can be transformed; atomic semantics do
- // not have any meaning for a local alloca.
- if (SI->isVolatile())
- return false;
- } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file promotes memory references to be register references. It promotes
+// alloca instructions which only have loads and stores as uses. An alloca is
+// transformed by using iterated dominator frontiers to place PHI nodes, then
+// traversing the function in depth-first order to rewrite loads and stores as
+// appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mem2reg"
+
+STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
+STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
+STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
+STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
+
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+ // Only allow direct and non-volatile loads and stores...
+ for (const User *U : AI->users()) {
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ // Note that atomic loads can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (LI->isVolatile())
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == AI)
+ return false; // Don't allow a store OF the AI, only INTO the AI.
+ // Note that atomic stores can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (SI->isVolatile())
+ return false;
+ } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
- return false;
- } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ return false;
+ } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
if (!onlyUsedByLifetimeMarkersOrDroppableInsts(BCI))
- return false;
- } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
- if (!GEPI->hasAllZeroIndices())
- return false;
+ return false;
+ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (!GEPI->hasAllZeroIndices())
+ return false;
if (!onlyUsedByLifetimeMarkersOrDroppableInsts(GEPI))
- return false;
+ return false;
} else if (const AddrSpaceCastInst *ASCI = dyn_cast<AddrSpaceCastInst>(U)) {
if (!onlyUsedByLifetimeMarkers(ASCI))
return false;
- } else {
- return false;
- }
- }
-
- return true;
-}
-
-namespace {
-
-struct AllocaInfo {
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+namespace {
+
+struct AllocaInfo {
using DbgUserVec = SmallVector<DbgVariableIntrinsic *, 1>;
- SmallVector<BasicBlock *, 32> DefiningBlocks;
- SmallVector<BasicBlock *, 32> UsingBlocks;
-
- StoreInst *OnlyStore;
- BasicBlock *OnlyBlock;
- bool OnlyUsedInOneBlock;
-
+ SmallVector<BasicBlock *, 32> DefiningBlocks;
+ SmallVector<BasicBlock *, 32> UsingBlocks;
+
+ StoreInst *OnlyStore;
+ BasicBlock *OnlyBlock;
+ bool OnlyUsedInOneBlock;
+
DbgUserVec DbgUsers;
-
- void clear() {
- DefiningBlocks.clear();
- UsingBlocks.clear();
- OnlyStore = nullptr;
- OnlyBlock = nullptr;
- OnlyUsedInOneBlock = true;
+
+ void clear() {
+ DefiningBlocks.clear();
+ UsingBlocks.clear();
+ OnlyStore = nullptr;
+ OnlyBlock = nullptr;
+ OnlyUsedInOneBlock = true;
DbgUsers.clear();
- }
-
- /// Scan the uses of the specified alloca, filling in the AllocaInfo used
- /// by the rest of the pass to reason about the uses of this alloca.
- void AnalyzeAlloca(AllocaInst *AI) {
- clear();
-
- // As we scan the uses of the alloca instruction, keep track of stores,
- // and decide whether all of the loads and stores to the alloca are within
- // the same basic block.
+ }
+
+ /// Scan the uses of the specified alloca, filling in the AllocaInfo used
+ /// by the rest of the pass to reason about the uses of this alloca.
+ void AnalyzeAlloca(AllocaInst *AI) {
+ clear();
+
+ // As we scan the uses of the alloca instruction, keep track of stores,
+ // and decide whether all of the loads and stores to the alloca are within
+ // the same basic block.
for (User *U : AI->users()) {
Instruction *User = cast<Instruction>(U);
-
- if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
- // Remember the basic blocks which define new values for the alloca
- DefiningBlocks.push_back(SI->getParent());
- OnlyStore = SI;
- } else {
- LoadInst *LI = cast<LoadInst>(User);
- // Otherwise it must be a load instruction, keep track of variable
- // reads.
- UsingBlocks.push_back(LI->getParent());
- }
-
- if (OnlyUsedInOneBlock) {
- if (!OnlyBlock)
- OnlyBlock = User->getParent();
- else if (OnlyBlock != User->getParent())
- OnlyUsedInOneBlock = false;
- }
- }
-
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // Remember the basic blocks which define new values for the alloca
+ DefiningBlocks.push_back(SI->getParent());
+ OnlyStore = SI;
+ } else {
+ LoadInst *LI = cast<LoadInst>(User);
+ // Otherwise it must be a load instruction, keep track of variable
+ // reads.
+ UsingBlocks.push_back(LI->getParent());
+ }
+
+ if (OnlyUsedInOneBlock) {
+ if (!OnlyBlock)
+ OnlyBlock = User->getParent();
+ else if (OnlyBlock != User->getParent())
+ OnlyUsedInOneBlock = false;
+ }
+ }
+
findDbgUsers(DbgUsers, AI);
- }
-};
-
-/// Data package used by RenamePass().
-struct RenamePassData {
- using ValVector = std::vector<Value *>;
- using LocationVector = std::vector<DebugLoc>;
-
- RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V, LocationVector L)
- : BB(B), Pred(P), Values(std::move(V)), Locations(std::move(L)) {}
-
- BasicBlock *BB;
- BasicBlock *Pred;
- ValVector Values;
- LocationVector Locations;
-};
-
-/// This assigns and keeps a per-bb relative ordering of load/store
-/// instructions in the block that directly load or store an alloca.
-///
-/// This functionality is important because it avoids scanning large basic
-/// blocks multiple times when promoting many allocas in the same block.
-class LargeBlockInfo {
- /// For each instruction that we track, keep the index of the
- /// instruction.
- ///
- /// The index starts out as the number of the instruction from the start of
- /// the block.
- DenseMap<const Instruction *, unsigned> InstNumbers;
-
-public:
-
- /// This code only looks at accesses to allocas.
- static bool isInterestingInstruction(const Instruction *I) {
- return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
- (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
- }
-
- /// Get or calculate the index of the specified instruction.
- unsigned getInstructionIndex(const Instruction *I) {
- assert(isInterestingInstruction(I) &&
- "Not a load/store to/from an alloca?");
-
- // If we already have this instruction number, return it.
- DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
- if (It != InstNumbers.end())
- return It->second;
-
- // Scan the whole block to get the instruction. This accumulates
- // information for every interesting instruction in the block, in order to
-    // avoid gratuitous rescans.
- const BasicBlock *BB = I->getParent();
- unsigned InstNo = 0;
- for (const Instruction &BBI : *BB)
- if (isInterestingInstruction(&BBI))
- InstNumbers[&BBI] = InstNo++;
- It = InstNumbers.find(I);
-
- assert(It != InstNumbers.end() && "Didn't insert instruction?");
- return It->second;
- }
-
- void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
-
- void clear() { InstNumbers.clear(); }
-};
-
-struct PromoteMem2Reg {
- /// The alloca instructions being promoted.
- std::vector<AllocaInst *> Allocas;
-
- DominatorTree &DT;
- DIBuilder DIB;
-
- /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
- AssumptionCache *AC;
-
- const SimplifyQuery SQ;
-
- /// Reverse mapping of Allocas.
- DenseMap<AllocaInst *, unsigned> AllocaLookup;
-
- /// The PhiNodes we're adding.
- ///
- /// That map is used to simplify some Phi nodes as we iterate over it, so
- /// it should have deterministic iterators. We could use a MapVector, but
- /// since we already maintain a map from BasicBlock* to a stable numbering
- /// (BBNumbers), the DenseMap is more efficient (also supports removal).
- DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
-
- /// For each PHI node, keep track of which entry in Allocas it corresponds
- /// to.
- DenseMap<PHINode *, unsigned> PhiToAllocaMap;
-
- /// For each alloca, we keep track of the dbg.declare intrinsic that
- /// describes it, if any, so that we can convert it to a dbg.value
- /// intrinsic if the alloca gets promoted.
+ }
+};
+
+/// Data package used by RenamePass().
+struct RenamePassData {
+ using ValVector = std::vector<Value *>;
+ using LocationVector = std::vector<DebugLoc>;
+
+ RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V, LocationVector L)
+ : BB(B), Pred(P), Values(std::move(V)), Locations(std::move(L)) {}
+
+ BasicBlock *BB;
+ BasicBlock *Pred;
+ ValVector Values;
+ LocationVector Locations;
+};
+
+/// This assigns and keeps a per-bb relative ordering of load/store
+/// instructions in the block that directly load or store an alloca.
+///
+/// This functionality is important because it avoids scanning large basic
+/// blocks multiple times when promoting many allocas in the same block.
+class LargeBlockInfo {
+ /// For each instruction that we track, keep the index of the
+ /// instruction.
+ ///
+ /// The index starts out as the number of the instruction from the start of
+ /// the block.
+ DenseMap<const Instruction *, unsigned> InstNumbers;
+
+public:
+
+ /// This code only looks at accesses to allocas.
+ static bool isInterestingInstruction(const Instruction *I) {
+ return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+ (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+ }
+
+ /// Get or calculate the index of the specified instruction.
+ unsigned getInstructionIndex(const Instruction *I) {
+ assert(isInterestingInstruction(I) &&
+ "Not a load/store to/from an alloca?");
+
+ // If we already have this instruction number, return it.
+ DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+ if (It != InstNumbers.end())
+ return It->second;
+
+ // Scan the whole block to get the instruction. This accumulates
+ // information for every interesting instruction in the block, in order to
+    // avoid gratuitous rescans.
+ const BasicBlock *BB = I->getParent();
+ unsigned InstNo = 0;
+ for (const Instruction &BBI : *BB)
+ if (isInterestingInstruction(&BBI))
+ InstNumbers[&BBI] = InstNo++;
+ It = InstNumbers.find(I);
+
+ assert(It != InstNumbers.end() && "Didn't insert instruction?");
+ return It->second;
+ }
+
+ void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
+
+ void clear() { InstNumbers.clear(); }
+};
+
+struct PromoteMem2Reg {
+ /// The alloca instructions being promoted.
+ std::vector<AllocaInst *> Allocas;
+
+ DominatorTree &DT;
+ DIBuilder DIB;
+
+ /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
+ AssumptionCache *AC;
+
+ const SimplifyQuery SQ;
+
+ /// Reverse mapping of Allocas.
+ DenseMap<AllocaInst *, unsigned> AllocaLookup;
+
+ /// The PhiNodes we're adding.
+ ///
+ /// That map is used to simplify some Phi nodes as we iterate over it, so
+ /// it should have deterministic iterators. We could use a MapVector, but
+ /// since we already maintain a map from BasicBlock* to a stable numbering
+ /// (BBNumbers), the DenseMap is more efficient (also supports removal).
+ DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
+
+ /// For each PHI node, keep track of which entry in Allocas it corresponds
+ /// to.
+ DenseMap<PHINode *, unsigned> PhiToAllocaMap;
+
+ /// For each alloca, we keep track of the dbg.declare intrinsic that
+ /// describes it, if any, so that we can convert it to a dbg.value
+ /// intrinsic if the alloca gets promoted.
SmallVector<AllocaInfo::DbgUserVec, 8> AllocaDbgUsers;
-
- /// The set of basic blocks the renamer has already visited.
- SmallPtrSet<BasicBlock *, 16> Visited;
-
-  /// Contains a stable numbering of basic blocks to avoid non-deterministic
- /// behavior.
- DenseMap<BasicBlock *, unsigned> BBNumbers;
-
- /// Lazily compute the number of predecessors a block has.
- DenseMap<const BasicBlock *, unsigned> BBNumPreds;
-
-public:
- PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AssumptionCache *AC)
- : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
- DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
- AC(AC), SQ(DT.getRoot()->getParent()->getParent()->getDataLayout(),
- nullptr, &DT, AC) {}
-
- void run();
-
-private:
- void RemoveFromAllocasList(unsigned &AllocaIdx) {
- Allocas[AllocaIdx] = Allocas.back();
- Allocas.pop_back();
- --AllocaIdx;
- }
-
- unsigned getNumPreds(const BasicBlock *BB) {
- unsigned &NP = BBNumPreds[BB];
- if (NP == 0)
- NP = pred_size(BB) + 1;
- return NP - 1;
- }
-
- void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
- const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
- SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
- void RenamePass(BasicBlock *BB, BasicBlock *Pred,
- RenamePassData::ValVector &IncVals,
- RenamePassData::LocationVector &IncLocs,
- std::vector<RenamePassData> &Worklist);
- bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
-};
-
-} // end anonymous namespace
-
-/// Given a LoadInst LI this adds assume(LI != null) after it.
-static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
- Function *AssumeIntrinsic =
- Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
- ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
- Constant::getNullValue(LI->getType()));
- LoadNotNull->insertAfter(LI);
- CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
- CI->insertAfter(LoadNotNull);
- AC->registerAssumption(CI);
-}
-
+
+ /// The set of basic blocks the renamer has already visited.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+
+  /// Contains a stable numbering of basic blocks to avoid non-deterministic
+ /// behavior.
+ DenseMap<BasicBlock *, unsigned> BBNumbers;
+
+ /// Lazily compute the number of predecessors a block has.
+ DenseMap<const BasicBlock *, unsigned> BBNumPreds;
+
+public:
+ PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+ AssumptionCache *AC)
+ : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
+ DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
+ AC(AC), SQ(DT.getRoot()->getParent()->getParent()->getDataLayout(),
+ nullptr, &DT, AC) {}
+
+ void run();
+
+private:
+ void RemoveFromAllocasList(unsigned &AllocaIdx) {
+ Allocas[AllocaIdx] = Allocas.back();
+ Allocas.pop_back();
+ --AllocaIdx;
+ }
+
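+  // BBNumPreds caches the predecessor count with a +1 bias so that the
+  // DenseMap's default value of 0 can mean "not computed yet", even for
+  // blocks that really have zero predecessors.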
+ unsigned getNumPreds(const BasicBlock *BB) {
+ unsigned &NP = BBNumPreds[BB];
+ if (NP == 0)
+ NP = pred_size(BB) + 1;
+ return NP - 1;
+ }
+
+ void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
+ void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncVals,
+ RenamePassData::LocationVector &IncLocs,
+ std::vector<RenamePassData> &Worklist);
+ bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+};
+
+} // end anonymous namespace
+
+/// Given a LoadInst LI this adds assume(LI != null) after it.
+static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+ Function *AssumeIntrinsic =
+ Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
+ ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
+ Constant::getNullValue(LI->getType()));
+ LoadNotNull->insertAfter(LI);
+ CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
+ CI->insertAfter(LoadNotNull);
+ AC->registerAssumption(CI);
+}
+
static void removeIntrinsicUsers(AllocaInst *AI) {
- // Knowing that this alloca is promotable, we know that it's safe to kill all
- // instructions except for load and store.
-
+ // Knowing that this alloca is promotable, we know that it's safe to kill all
+ // instructions except for load and store.
+
for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) {
Instruction *I = cast<Instruction>(UI->getUser());
Use &U = *UI;
- ++UI;
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- continue;
-
+ ++UI;
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ continue;
+
// Drop the use of AI in droppable instructions.
if (I->isDroppable()) {
I->dropDroppableUse(U);
continue;
}
- if (!I->getType()->isVoidTy()) {
- // The only users of this bitcast/GEP instruction are lifetime intrinsics.
- // Follow the use/def chain to erase them now instead of leaving it for
- // dead code elimination later.
+ if (!I->getType()->isVoidTy()) {
+ // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+ // Follow the use/def chain to erase them now instead of leaving it for
+ // dead code elimination later.
for (auto UUI = I->use_begin(), UUE = I->use_end(); UUI != UUE;) {
Instruction *Inst = cast<Instruction>(UUI->getUser());
Use &UU = *UUI;
- ++UUI;
+ ++UUI;
// Drop the use of I in droppable instructions.
if (Inst->isDroppable()) {
Inst->dropDroppableUse(UU);
continue;
}
- Inst->eraseFromParent();
- }
- }
- I->eraseFromParent();
- }
-}
-
-/// Rewrite as many loads as possible given a single store.
-///
-/// When there is only a single store, we can use the domtree to trivially
-/// replace all of the dominated loads with the stored value. Do so, and return
-/// true if this has successfully promoted the alloca entirely. If this returns
-/// false there were some loads which were not dominated by the single store
-/// and thus must be phi-ed with undef. We fall back to the standard alloca
-/// promotion algorithm in that case.
-static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
- LargeBlockInfo &LBI, const DataLayout &DL,
- DominatorTree &DT, AssumptionCache *AC) {
- StoreInst *OnlyStore = Info.OnlyStore;
- bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
- BasicBlock *StoreBB = OnlyStore->getParent();
- int StoreIndex = -1;
-
- // Clear out UsingBlocks. We will reconstruct it here if needed.
- Info.UsingBlocks.clear();
-
+ Inst->eraseFromParent();
+ }
+ }
+ I->eraseFromParent();
+ }
+}
+
+/// Rewrite as many loads as possible given a single store.
+///
+/// When there is only a single store, we can use the domtree to trivially
+/// replace all of the dominated loads with the stored value. Do so, and return
+/// true if this has successfully promoted the alloca entirely. If this returns
+/// false there were some loads which were not dominated by the single store
+/// and thus must be phi-ed with undef. We fall back to the standard alloca
+/// promotion algorithm in that case.
+static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+ LargeBlockInfo &LBI, const DataLayout &DL,
+ DominatorTree &DT, AssumptionCache *AC) {
+ StoreInst *OnlyStore = Info.OnlyStore;
+ bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+ BasicBlock *StoreBB = OnlyStore->getParent();
+ int StoreIndex = -1;
+
+ // Clear out UsingBlocks. We will reconstruct it here if needed.
+ Info.UsingBlocks.clear();
+
for (User *U : make_early_inc_range(AI->users())) {
Instruction *UserInst = cast<Instruction>(U);
- if (UserInst == OnlyStore)
- continue;
- LoadInst *LI = cast<LoadInst>(UserInst);
-
- // Okay, if we have a load from the alloca, we want to replace it with the
- // only value stored to the alloca. We can do this if the value is
- // dominated by the store. If not, we use the rest of the mem2reg machinery
- // to insert the phi nodes as needed.
- if (!StoringGlobalVal) { // Non-instructions are always dominated.
- if (LI->getParent() == StoreBB) {
- // If we have a use that is in the same block as the store, compare the
- // indices of the two instructions to see which one came first. If the
- // load came before the store, we can't handle it.
- if (StoreIndex == -1)
- StoreIndex = LBI.getInstructionIndex(OnlyStore);
-
- if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
- // Can't handle this load, bail out.
- Info.UsingBlocks.push_back(StoreBB);
- continue;
- }
- } else if (!DT.dominates(StoreBB, LI->getParent())) {
- // If the load and store are in different blocks, use BB dominance to
- // check their relationships. If the store doesn't dom the use, bail
- // out.
- Info.UsingBlocks.push_back(LI->getParent());
- continue;
- }
- }
-
- // Otherwise, we *can* safely rewrite this load.
- Value *ReplVal = OnlyStore->getOperand(0);
- // If the replacement value is the load, this must occur in unreachable
- // code.
- if (ReplVal == LI)
- ReplVal = UndefValue::get(LI->getType());
-
- // If the load was marked as nonnull we don't want to lose
- // that information when we erase this Load. So we preserve
- // it with an assume.
- if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
- !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
- addAssumeNonNull(AC, LI);
-
- LI->replaceAllUsesWith(ReplVal);
- LI->eraseFromParent();
- LBI.deleteValue(LI);
- }
-
- // Finally, after the scan, check to see if the store is all that is left.
- if (!Info.UsingBlocks.empty())
- return false; // If not, we'll have to fall back for the remainder.
-
- // Record debuginfo for the store and remove the declaration's
- // debuginfo.
+ if (UserInst == OnlyStore)
+ continue;
+ LoadInst *LI = cast<LoadInst>(UserInst);
+
+ // Okay, if we have a load from the alloca, we want to replace it with the
+ // only value stored to the alloca. We can do this if the value is
+ // dominated by the store. If not, we use the rest of the mem2reg machinery
+ // to insert the phi nodes as needed.
+ if (!StoringGlobalVal) { // Non-instructions are always dominated.
+ if (LI->getParent() == StoreBB) {
+ // If we have a use that is in the same block as the store, compare the
+ // indices of the two instructions to see which one came first. If the
+ // load came before the store, we can't handle it.
+ if (StoreIndex == -1)
+ StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+ if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+ // Can't handle this load, bail out.
+ Info.UsingBlocks.push_back(StoreBB);
+ continue;
+ }
+ } else if (!DT.dominates(StoreBB, LI->getParent())) {
+ // If the load and store are in different blocks, use BB dominance to
+ // check their relationships. If the store doesn't dom the use, bail
+ // out.
+ Info.UsingBlocks.push_back(LI->getParent());
+ continue;
+ }
+ }
+
+ // Otherwise, we *can* safely rewrite this load.
+ Value *ReplVal = OnlyStore->getOperand(0);
+ // If the replacement value is the load, this must occur in unreachable
+ // code.
+ if (ReplVal == LI)
+ ReplVal = UndefValue::get(LI->getType());
+
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ LI->replaceAllUsesWith(ReplVal);
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+
+ // Finally, after the scan, check to see if the store is all that is left.
+ if (!Info.UsingBlocks.empty())
+ return false; // If not, we'll have to fall back for the remainder.
+
+ // Record debuginfo for the store and remove the declaration's
+ // debuginfo.
for (DbgVariableIntrinsic *DII : Info.DbgUsers) {
if (DII->isAddressOfVariable()) {
DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
@@ -431,594 +431,594 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
} else if (DII->getExpression()->startsWithDeref()) {
DII->eraseFromParent();
}
- }
- // Remove the (now dead) store and alloca.
- Info.OnlyStore->eraseFromParent();
- LBI.deleteValue(Info.OnlyStore);
-
- AI->eraseFromParent();
- return true;
-}
-
-/// Many allocas are only used within a single basic block. If this is the
-/// case, avoid traversing the CFG and inserting a lot of potentially useless
-/// PHI nodes by just performing a single linear pass over the basic block
-/// using the Alloca.
-///
-/// If we cannot promote this alloca (because it is read before it is written),
-/// return false. This is necessary in cases where, due to control flow, the
-/// alloca is undefined only on some control flow paths. e.g. code like
-/// this is correct in LLVM IR:
-/// // A is an alloca with no stores so far
-/// for (...) {
-/// int t = *A;
-/// if (!first_iteration)
-/// use(t);
-/// *A = 42;
-/// }
-static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
- LargeBlockInfo &LBI,
- const DataLayout &DL,
- DominatorTree &DT,
- AssumptionCache *AC) {
- // The trickiest case to handle is when we have large blocks. Because of this,
- // this code is optimized assuming that large blocks happen. This does not
- // significantly pessimize the small block case. This uses LargeBlockInfo to
- // make it efficient to get the index of various operations in the block.
-
- // Walk the use-def list of the alloca, getting the locations of all stores.
- using StoresByIndexTy = SmallVector<std::pair<unsigned, StoreInst *>, 64>;
- StoresByIndexTy StoresByIndex;
-
- for (User *U : AI->users())
- if (StoreInst *SI = dyn_cast<StoreInst>(U))
- StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
-
- // Sort the stores by their index, making it efficient to do a lookup with a
- // binary search.
- llvm::sort(StoresByIndex, less_first());
-
- // Walk all of the loads from this alloca, replacing them with the nearest
- // store above them, if any.
+ }
+ // Remove the (now dead) store and alloca.
+ Info.OnlyStore->eraseFromParent();
+ LBI.deleteValue(Info.OnlyStore);
+
+ AI->eraseFromParent();
+ return true;
+}
+
+/// Many allocas are only used within a single basic block. If this is the
+/// case, avoid traversing the CFG and inserting a lot of potentially useless
+/// PHI nodes by just performing a single linear pass over the basic block
+/// using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// return false. This is necessary in cases where, due to control flow, the
+/// alloca is undefined only on some control flow paths. e.g. code like
+/// this is correct in LLVM IR:
+/// // A is an alloca with no stores so far
+/// for (...) {
+/// int t = *A;
+/// if (!first_iteration)
+/// use(t);
+/// *A = 42;
+/// }
+static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
+ LargeBlockInfo &LBI,
+ const DataLayout &DL,
+ DominatorTree &DT,
+ AssumptionCache *AC) {
+ // The trickiest case to handle is when we have large blocks. Because of this,
+ // this code is optimized assuming that large blocks happen. This does not
+ // significantly pessimize the small block case. This uses LargeBlockInfo to
+ // make it efficient to get the index of various operations in the block.
+
+ // Walk the use-def list of the alloca, getting the locations of all stores.
+ using StoresByIndexTy = SmallVector<std::pair<unsigned, StoreInst *>, 64>;
+ StoresByIndexTy StoresByIndex;
+
+ for (User *U : AI->users())
+ if (StoreInst *SI = dyn_cast<StoreInst>(U))
+ StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+ // Sort the stores by their index, making it efficient to do a lookup with a
+ // binary search.
+ llvm::sort(StoresByIndex, less_first());
+
+ // Walk all of the loads from this alloca, replacing them with the nearest
+ // store above them, if any.
for (User *U : make_early_inc_range(AI->users())) {
LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI)
- continue;
-
- unsigned LoadIdx = LBI.getInstructionIndex(LI);
-
- // Find the nearest store that has a lower index than this load.
- StoresByIndexTy::iterator I = llvm::lower_bound(
- StoresByIndex,
- std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)),
- less_first());
- if (I == StoresByIndex.begin()) {
- if (StoresByIndex.empty())
- // If there are no stores, the load takes the undef value.
- LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
- else
- // There is no store before this load, bail out (load may be affected
- // by the following stores - see main comment).
- return false;
- } else {
- // Otherwise, there was a store before this load, the load takes its value.
- // Note, if the load was marked as nonnull we don't want to lose that
- // information when we erase it. So we preserve it with an assume.
- Value *ReplVal = std::prev(I)->second->getOperand(0);
- if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
- !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
- addAssumeNonNull(AC, LI);
-
- // If the replacement value is the load, this must occur in unreachable
- // code.
- if (ReplVal == LI)
- ReplVal = UndefValue::get(LI->getType());
-
- LI->replaceAllUsesWith(ReplVal);
- }
-
- LI->eraseFromParent();
- LBI.deleteValue(LI);
- }
-
- // Remove the (now dead) stores and alloca.
- while (!AI->use_empty()) {
- StoreInst *SI = cast<StoreInst>(AI->user_back());
- // Record debuginfo for the store before removing it.
+ if (!LI)
+ continue;
+
+ unsigned LoadIdx = LBI.getInstructionIndex(LI);
+
+ // Find the nearest store that has a lower index than this load.
+ StoresByIndexTy::iterator I = llvm::lower_bound(
+ StoresByIndex,
+ std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)),
+ less_first());
+ if (I == StoresByIndex.begin()) {
+ if (StoresByIndex.empty())
+ // If there are no stores, the load takes the undef value.
+ LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+ else
+ // There is no store before this load, bail out (load may be affected
+ // by the following stores - see main comment).
+ return false;
+ } else {
+ // Otherwise, there was a store before this load, the load takes its value.
+ // Note, if the load was marked as nonnull we don't want to lose that
+ // information when we erase it. So we preserve it with an assume.
+ Value *ReplVal = std::prev(I)->second->getOperand(0);
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ // If the replacement value is the load, this must occur in unreachable
+ // code.
+ if (ReplVal == LI)
+ ReplVal = UndefValue::get(LI->getType());
+
+ LI->replaceAllUsesWith(ReplVal);
+ }
+
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+
+ // Remove the (now dead) stores and alloca.
+ while (!AI->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(AI->user_back());
+ // Record debuginfo for the store before removing it.
for (DbgVariableIntrinsic *DII : Info.DbgUsers) {
if (DII->isAddressOfVariable()) {
DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
ConvertDebugDeclareToDebugValue(DII, SI, DIB);
}
- }
- SI->eraseFromParent();
- LBI.deleteValue(SI);
- }
-
- AI->eraseFromParent();
-
- // The alloca's debuginfo can be removed as well.
+ }
+ SI->eraseFromParent();
+ LBI.deleteValue(SI);
+ }
+
+ AI->eraseFromParent();
+
+ // The alloca's debuginfo can be removed as well.
for (DbgVariableIntrinsic *DII : Info.DbgUsers)
if (DII->isAddressOfVariable() || DII->getExpression()->startsWithDeref())
DII->eraseFromParent();
-
- ++NumLocalPromoted;
- return true;
-}
-
-void PromoteMem2Reg::run() {
- Function &F = *DT.getRoot()->getParent();
-
+
+ ++NumLocalPromoted;
+ return true;
+}
+
+void PromoteMem2Reg::run() {
+ Function &F = *DT.getRoot()->getParent();
+
AllocaDbgUsers.resize(Allocas.size());
-
- AllocaInfo Info;
- LargeBlockInfo LBI;
- ForwardIDFCalculator IDF(DT);
-
- for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
- AllocaInst *AI = Allocas[AllocaNum];
-
- assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
- assert(AI->getParent()->getParent() == &F &&
- "All allocas should be in the same function, which is same as DF!");
-
+
+ AllocaInfo Info;
+ LargeBlockInfo LBI;
+ ForwardIDFCalculator IDF(DT);
+
+ for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+ AllocaInst *AI = Allocas[AllocaNum];
+
+ assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
+ assert(AI->getParent()->getParent() == &F &&
+ "All allocas should be in the same function, which is same as DF!");
+
removeIntrinsicUsers(AI);
-
- if (AI->use_empty()) {
- // If there are no uses of the alloca, just delete it now.
- AI->eraseFromParent();
-
- // Remove the alloca from the Allocas list, since it has been processed
- RemoveFromAllocasList(AllocaNum);
- ++NumDeadAlloca;
- continue;
- }
-
- // Calculate the set of read and write-locations for each alloca. This is
- // analogous to finding the 'uses' and 'definitions' of each variable.
- Info.AnalyzeAlloca(AI);
-
- // If there is only a single store to this value, replace any loads of
- // it that are directly dominated by the definition with the value stored.
- if (Info.DefiningBlocks.size() == 1) {
- if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
- // The alloca has been processed, move on.
- RemoveFromAllocasList(AllocaNum);
- ++NumSingleStore;
- continue;
- }
- }
-
- // If the alloca is only read and written in one basic block, just perform a
- // linear sweep over the block to eliminate it.
- if (Info.OnlyUsedInOneBlock &&
- promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
- // The alloca has been processed, move on.
- RemoveFromAllocasList(AllocaNum);
- continue;
- }
-
- // If we haven't computed a numbering for the BB's in the function, do so
- // now.
- if (BBNumbers.empty()) {
- unsigned ID = 0;
- for (auto &BB : F)
- BBNumbers[&BB] = ID++;
- }
-
- // Remember the dbg.declare intrinsic describing this alloca, if any.
+
+ if (AI->use_empty()) {
+ // If there are no uses of the alloca, just delete it now.
+ AI->eraseFromParent();
+
+ // Remove the alloca from the Allocas list, since it has been processed
+ RemoveFromAllocasList(AllocaNum);
+ ++NumDeadAlloca;
+ continue;
+ }
+
+ // Calculate the set of read and write-locations for each alloca. This is
+ // analogous to finding the 'uses' and 'definitions' of each variable.
+ Info.AnalyzeAlloca(AI);
+
+ // If there is only a single store to this value, replace any loads of
+ // it that are directly dominated by the definition with the value stored.
+ if (Info.DefiningBlocks.size() == 1) {
+ if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+ ++NumSingleStore;
+ continue;
+ }
+ }
+
+ // If the alloca is only read and written in one basic block, just perform a
+ // linear sweep over the block to eliminate it.
+ if (Info.OnlyUsedInOneBlock &&
+ promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+ continue;
+ }
+
+ // If we haven't computed a numbering for the BB's in the function, do so
+ // now.
+ if (BBNumbers.empty()) {
+ unsigned ID = 0;
+ for (auto &BB : F)
+ BBNumbers[&BB] = ID++;
+ }
+
+ // Remember the dbg.declare intrinsic describing this alloca, if any.
if (!Info.DbgUsers.empty())
AllocaDbgUsers[AllocaNum] = Info.DbgUsers;
-
- // Keep the reverse mapping of the 'Allocas' array for the rename pass.
- AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
-
- // Unique the set of defining blocks for efficient lookup.
- SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(),
- Info.DefiningBlocks.end());
-
- // Determine which blocks the value is live in. These are blocks which lead
- // to uses.
- SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
- ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
-
- // At this point, we're committed to promoting the alloca using IDF's, and
- // the standard SSA construction algorithm. Determine which blocks need phi
- // nodes and see if we can optimize out some work by avoiding insertion of
- // dead phi nodes.
- IDF.setLiveInBlocks(LiveInBlocks);
- IDF.setDefiningBlocks(DefBlocks);
- SmallVector<BasicBlock *, 32> PHIBlocks;
- IDF.calculate(PHIBlocks);
- llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) {
- return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
- });
-
- unsigned CurrentVersion = 0;
- for (BasicBlock *BB : PHIBlocks)
- QueuePhiNode(BB, AllocaNum, CurrentVersion);
- }
-
- if (Allocas.empty())
- return; // All of the allocas must have been trivial!
-
- LBI.clear();
-
- // Set the incoming values for the basic block to be null values for all of
- // the alloca's. We do this in case there is a load of a value that has not
- // been stored yet. In this case, it will get this null value.
- RenamePassData::ValVector Values(Allocas.size());
- for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
- Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
-
- // When handling debug info, treat all incoming values as if they have unknown
- // locations until proven otherwise.
- RenamePassData::LocationVector Locations(Allocas.size());
-
- // Walks all basic blocks in the function performing the SSA rename algorithm
- // and inserting the phi nodes we marked as necessary
- std::vector<RenamePassData> RenamePassWorkList;
- RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
- std::move(Locations));
- do {
- RenamePassData RPD = std::move(RenamePassWorkList.back());
- RenamePassWorkList.pop_back();
- // RenamePass may add new worklist entries.
- RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);
- } while (!RenamePassWorkList.empty());
-
- // The renamer uses the Visited set to avoid infinite loops. Clear it now.
- Visited.clear();
-
- // Remove the allocas themselves from the function.
- for (Instruction *A : Allocas) {
- // If there are any uses of the alloca instructions left, they must be in
- // unreachable basic blocks that were not processed by walking the dominator
- // tree. Just delete the users now.
- if (!A->use_empty())
- A->replaceAllUsesWith(UndefValue::get(A->getType()));
- A->eraseFromParent();
- }
-
-  // Remove alloca's dbg.declare intrinsics from the function.
+
+ // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+ AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+ // Unique the set of defining blocks for efficient lookup.
+ SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(),
+ Info.DefiningBlocks.end());
+
+ // Determine which blocks the value is live in. These are blocks which lead
+ // to uses.
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+ // At this point, we're committed to promoting the alloca using IDF's, and
+ // the standard SSA construction algorithm. Determine which blocks need phi
+ // nodes and see if we can optimize out some work by avoiding insertion of
+ // dead phi nodes.
+ IDF.setLiveInBlocks(LiveInBlocks);
+ IDF.setDefiningBlocks(DefBlocks);
+ SmallVector<BasicBlock *, 32> PHIBlocks;
+ IDF.calculate(PHIBlocks);
+ llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) {
+ return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+ });
+
+ unsigned CurrentVersion = 0;
+ for (BasicBlock *BB : PHIBlocks)
+ QueuePhiNode(BB, AllocaNum, CurrentVersion);
+ }
+
+ if (Allocas.empty())
+ return; // All of the allocas must have been trivial!
+
+ LBI.clear();
+
+ // Set the incoming values for the basic block to be null values for all of
+ // the alloca's. We do this in case there is a load of a value that has not
+ // been stored yet. In this case, it will get this null value.
+ RenamePassData::ValVector Values(Allocas.size());
+ for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+ Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+ // When handling debug info, treat all incoming values as if they have unknown
+ // locations until proven otherwise.
+ RenamePassData::LocationVector Locations(Allocas.size());
+
+ // Walks all basic blocks in the function performing the SSA rename algorithm
+ // and inserting the phi nodes we marked as necessary
+ std::vector<RenamePassData> RenamePassWorkList;
+ RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
+ std::move(Locations));
+ do {
+ RenamePassData RPD = std::move(RenamePassWorkList.back());
+ RenamePassWorkList.pop_back();
+ // RenamePass may add new worklist entries.
+ RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);
+ } while (!RenamePassWorkList.empty());
+
+ // The renamer uses the Visited set to avoid infinite loops. Clear it now.
+ Visited.clear();
+
+ // Remove the allocas themselves from the function.
+ for (Instruction *A : Allocas) {
+ // If there are any uses of the alloca instructions left, they must be in
+ // unreachable basic blocks that were not processed by walking the dominator
+ // tree. Just delete the users now.
+ if (!A->use_empty())
+ A->replaceAllUsesWith(UndefValue::get(A->getType()));
+ A->eraseFromParent();
+ }
+
+  // Remove alloca's dbg.declare intrinsics from the function.
for (auto &DbgUsers : AllocaDbgUsers) {
for (auto *DII : DbgUsers)
if (DII->isAddressOfVariable() || DII->getExpression()->startsWithDeref())
DII->eraseFromParent();
}
-
- // Loop over all of the PHI nodes and see if there are any that we can get
- // rid of because they merge all of the same incoming values. This can
- // happen due to undef values coming into the PHI nodes. This process is
- // iterative, because eliminating one PHI node can cause others to be removed.
- bool EliminatedAPHI = true;
- while (EliminatedAPHI) {
- EliminatedAPHI = false;
-
- // Iterating over NewPhiNodes is deterministic, so it is safe to try to
- // simplify and RAUW them as we go. If it was not, we could add uses to
- // the values we replace with in a non-deterministic order, thus creating
- // non-deterministic def->use chains.
- for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
- I = NewPhiNodes.begin(),
- E = NewPhiNodes.end();
- I != E;) {
- PHINode *PN = I->second;
-
- // If this PHI node merges one value and/or undefs, get the value.
- if (Value *V = SimplifyInstruction(PN, SQ)) {
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
- NewPhiNodes.erase(I++);
- EliminatedAPHI = true;
- continue;
- }
- ++I;
- }
- }
-
- // At this point, the renamer has added entries to PHI nodes for all reachable
- // code. Unfortunately, there may be unreachable blocks which the renamer
- // hasn't traversed. If this is the case, the PHI nodes may not
- // have incoming values for all predecessors. Loop over all PHI nodes we have
- // created, inserting undef values if they are missing any incoming values.
- for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
- I = NewPhiNodes.begin(),
- E = NewPhiNodes.end();
- I != E; ++I) {
- // We want to do this once per basic block. As such, only process a block
- // when we find the PHI that is the first entry in the block.
- PHINode *SomePHI = I->second;
- BasicBlock *BB = SomePHI->getParent();
- if (&BB->front() != SomePHI)
- continue;
-
-    // Only do work here if the PHI nodes are missing incoming values. We
- // know that all PHI nodes that were inserted in a block will have the same
- // number of incoming values, so we can just check any of them.
- if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
- continue;
-
- // Get the preds for BB.
+
+ // Loop over all of the PHI nodes and see if there are any that we can get
+ // rid of because they merge all of the same incoming values. This can
+ // happen due to undef values coming into the PHI nodes. This process is
+ // iterative, because eliminating one PHI node can cause others to be removed.
+ bool EliminatedAPHI = true;
+ while (EliminatedAPHI) {
+ EliminatedAPHI = false;
+
+ // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+ // simplify and RAUW them as we go. If it was not, we could add uses to
+ // the values we replace with in a non-deterministic order, thus creating
+ // non-deterministic def->use chains.
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+ I = NewPhiNodes.begin(),
+ E = NewPhiNodes.end();
+ I != E;) {
+ PHINode *PN = I->second;
+
+ // If this PHI node merges one value and/or undefs, get the value.
+ if (Value *V = SimplifyInstruction(PN, SQ)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ NewPhiNodes.erase(I++);
+ EliminatedAPHI = true;
+ continue;
+ }
+ ++I;
+ }
+ }
+
+ // At this point, the renamer has added entries to PHI nodes for all reachable
+ // code. Unfortunately, there may be unreachable blocks which the renamer
+ // hasn't traversed. If this is the case, the PHI nodes may not
+ // have incoming values for all predecessors. Loop over all PHI nodes we have
+ // created, inserting undef values if they are missing any incoming values.
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+ I = NewPhiNodes.begin(),
+ E = NewPhiNodes.end();
+ I != E; ++I) {
+ // We want to do this once per basic block. As such, only process a block
+ // when we find the PHI that is the first entry in the block.
+ PHINode *SomePHI = I->second;
+ BasicBlock *BB = SomePHI->getParent();
+ if (&BB->front() != SomePHI)
+ continue;
+
+    // Only do work here if the PHI nodes are missing incoming values. We
+ // know that all PHI nodes that were inserted in a block will have the same
+ // number of incoming values, so we can just check any of them.
+ if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+ continue;
+
+ // Get the preds for BB.
SmallVector<BasicBlock *, 16> Preds(predecessors(BB));
-
- // Ok, now we know that all of the PHI nodes are missing entries for some
- // basic blocks. Start by sorting the incoming predecessors for efficient
- // access.
- auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
- return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
- };
- llvm::sort(Preds, CompareBBNumbers);
-
- // Now we loop through all BB's which have entries in SomePHI and remove
- // them from the Preds list.
- for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
- // Do a log(n) search of the Preds list for the entry we want.
- SmallVectorImpl<BasicBlock *>::iterator EntIt = llvm::lower_bound(
- Preds, SomePHI->getIncomingBlock(i), CompareBBNumbers);
- assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
- "PHI node has entry for a block which is not a predecessor!");
-
- // Remove the entry
- Preds.erase(EntIt);
- }
-
- // At this point, the blocks left in the preds list must have dummy
- // entries inserted into every PHI nodes for the block. Update all the phi
- // nodes in this block that we are inserting (there could be phis before
- // mem2reg runs).
- unsigned NumBadPreds = SomePHI->getNumIncomingValues();
- BasicBlock::iterator BBI = BB->begin();
- while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
- SomePHI->getNumIncomingValues() == NumBadPreds) {
- Value *UndefVal = UndefValue::get(SomePHI->getType());
- for (BasicBlock *Pred : Preds)
- SomePHI->addIncoming(UndefVal, Pred);
- }
- }
-
- NewPhiNodes.clear();
-}
-
-/// Determine which blocks the value is live in.
-///
-/// These are blocks which lead to uses. Knowing this allows us to avoid
-/// inserting PHI nodes into blocks which don't lead to uses (thus, the
-/// inserted phi nodes would be dead).
-void PromoteMem2Reg::ComputeLiveInBlocks(
- AllocaInst *AI, AllocaInfo &Info,
- const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
- SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) {
- // To determine liveness, we must iterate through the predecessors of blocks
- // where the def is live. Blocks are added to the worklist if we need to
- // check their predecessors. Start with all the using blocks.
- SmallVector<BasicBlock *, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
- Info.UsingBlocks.end());
-
- // If any of the using blocks is also a definition block, check to see if the
- // definition occurs before or after the use. If it happens before the use,
- // the value isn't really live-in.
- for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
- BasicBlock *BB = LiveInBlockWorklist[i];
- if (!DefBlocks.count(BB))
- continue;
-
- // Okay, this is a block that both uses and defines the value. If the first
- // reference to the alloca is a def (store), then we know it isn't live-in.
- for (BasicBlock::iterator I = BB->begin();; ++I) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (SI->getOperand(1) != AI)
- continue;
-
- // We found a store to the alloca before a load. The alloca is not
- // actually live-in here.
- LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
- LiveInBlockWorklist.pop_back();
- --i;
- --e;
- break;
- }
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- // Okay, we found a load before a store to the alloca. It is actually
- // live into this block.
- if (LI->getOperand(0) == AI)
- break;
- }
- }
-
- // Now that we have a set of blocks where the phi is live-in, recursively add
- // their predecessors until we find the full region the value is live.
- while (!LiveInBlockWorklist.empty()) {
- BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
-
- // The block really is live in here, insert it into the set. If already in
- // the set, then it has already been processed.
- if (!LiveInBlocks.insert(BB).second)
- continue;
-
- // Since the value is live into BB, it is either defined in a predecessor or
- // live into it to. Add the preds to the worklist unless they are a
- // defining block.
- for (BasicBlock *P : predecessors(BB)) {
- // The value is not live into a predecessor if it defines the value.
- if (DefBlocks.count(P))
- continue;
-
- // Otherwise it is, add to the worklist.
- LiveInBlockWorklist.push_back(P);
- }
- }
-}
-
-/// Queue a phi-node to be added to a basic-block for a specific Alloca.
-///
-/// Returns true if there wasn't already a phi-node for that variable
-bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
- unsigned &Version) {
- // Look up the basic-block in question.
- PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
-
- // If the BB already has a phi node added for the i'th alloca then we're done!
- if (PN)
- return false;
-
- // Create a PhiNode using the dereferenced type... and add the phi-node to the
- // BasicBlock.
- PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),
- Allocas[AllocaNo]->getName() + "." + Twine(Version++),
- &BB->front());
- ++NumPHIInsert;
- PhiToAllocaMap[PN] = AllocaNo;
- return true;
-}
-
-/// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
-/// create a merged location incorporating \p DL, or to set \p DL directly.
-static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
- bool ApplyMergedLoc) {
- if (ApplyMergedLoc)
- PN->applyMergedLocation(PN->getDebugLoc(), DL);
- else
- PN->setDebugLoc(DL);
-}
-
-/// Recursively traverse the CFG of the function, renaming loads and
-/// stores to the allocas which we are promoting.
-///
-/// IncomingVals indicates what value each Alloca contains on exit from the
-/// predecessor block Pred.
-void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
- RenamePassData::ValVector &IncomingVals,
- RenamePassData::LocationVector &IncomingLocs,
- std::vector<RenamePassData> &Worklist) {
-NextIteration:
- // If we are inserting any phi nodes into this BB, they will already be in the
- // block.
- if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
- // If we have PHI nodes to update, compute the number of edges from Pred to
- // BB.
- if (PhiToAllocaMap.count(APN)) {
- // We want to be able to distinguish between PHI nodes being inserted by
- // this invocation of mem2reg from those phi nodes that already existed in
- // the IR before mem2reg was run. We determine that APN is being inserted
- // because it is missing incoming edges. All other PHI nodes being
- // inserted by this pass of mem2reg will have the same number of incoming
- // operands so far. Remember this count.
- unsigned NewPHINumOperands = APN->getNumOperands();
-
+
+ // Ok, now we know that all of the PHI nodes are missing entries for some
+ // basic blocks. Start by sorting the incoming predecessors for efficient
+ // access.
+ auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
+ return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+ };
+ llvm::sort(Preds, CompareBBNumbers);
+
+ // Now we loop through all BB's which have entries in SomePHI and remove
+ // them from the Preds list.
+ for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+ // Do a log(n) search of the Preds list for the entry we want.
+ SmallVectorImpl<BasicBlock *>::iterator EntIt = llvm::lower_bound(
+ Preds, SomePHI->getIncomingBlock(i), CompareBBNumbers);
+ assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
+ "PHI node has entry for a block which is not a predecessor!");
+
+ // Remove the entry
+ Preds.erase(EntIt);
+ }
+
+ // At this point, the blocks left in the preds list must have dummy
+    // entries inserted into every PHI node in the block. Update all the phi
+ // nodes in this block that we are inserting (there could be phis before
+ // mem2reg runs).
+ unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+ BasicBlock::iterator BBI = BB->begin();
+ while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+ SomePHI->getNumIncomingValues() == NumBadPreds) {
+ Value *UndefVal = UndefValue::get(SomePHI->getType());
+ for (BasicBlock *Pred : Preds)
+ SomePHI->addIncoming(UndefVal, Pred);
+ }
+ }
+
+ NewPhiNodes.clear();
+}
+
+/// Determine which blocks the value is live in.
+///
+/// These are blocks which lead to uses. Knowing this allows us to avoid
+/// inserting PHI nodes into blocks which don't lead to uses (thus, the
+/// inserted phi nodes would be dead).
+void PromoteMem2Reg::ComputeLiveInBlocks(
+ AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) {
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
+ Info.UsingBlocks.end());
+
+ // If any of the using blocks is also a definition block, check to see if the
+ // definition occurs before or after the use. If it happens before the use,
+ // the value isn't really live-in.
+ for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
+ BasicBlock *BB = LiveInBlockWorklist[i];
+ if (!DefBlocks.count(BB))
+ continue;
+
+ // Okay, this is a block that both uses and defines the value. If the first
+ // reference to the alloca is a def (store), then we know it isn't live-in.
+ for (BasicBlock::iterator I = BB->begin();; ++I) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) != AI)
+ continue;
+
+ // We found a store to the alloca before a load. The alloca is not
+ // actually live-in here.
+ LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
+ LiveInBlockWorklist.pop_back();
+ --i;
+ --e;
+ break;
+ }
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ // Okay, we found a load before a store to the alloca. It is actually
+ // live into this block.
+ if (LI->getOperand(0) == AI)
+ break;
+ }
+ }
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it to. Add the preds to the worklist unless they are a
+ // defining block.
+ for (BasicBlock *P : predecessors(BB)) {
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// Queue a phi-node to be added to a basic-block for a specific Alloca.
+///
+/// Returns true if there wasn't already a phi-node for that variable
+bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
+ unsigned &Version) {
+ // Look up the basic-block in question.
+ PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
+
+ // If the BB already has a phi node added for the i'th alloca then we're done!
+ if (PN)
+ return false;
+
+ // Create a PhiNode using the dereferenced type... and add the phi-node to the
+ // BasicBlock.
+ PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),
+ Allocas[AllocaNo]->getName() + "." + Twine(Version++),
+ &BB->front());
+ ++NumPHIInsert;
+ PhiToAllocaMap[PN] = AllocaNo;
+ return true;
+}
+
+/// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
+/// create a merged location incorporating \p DL, or to set \p DL directly.
+static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
+ bool ApplyMergedLoc) {
+ if (ApplyMergedLoc)
+ PN->applyMergedLocation(PN->getDebugLoc(), DL);
+ else
+ PN->setDebugLoc(DL);
+}
+
+/// Recursively traverse the CFG of the function, renaming loads and
+/// stores to the allocas which we are promoting.
+///
+/// IncomingVals indicates what value each Alloca contains on exit from the
+/// predecessor block Pred.
+void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncomingVals,
+ RenamePassData::LocationVector &IncomingLocs,
+ std::vector<RenamePassData> &Worklist) {
+NextIteration:
+ // If we are inserting any phi nodes into this BB, they will already be in the
+ // block.
+ if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
+ // If we have PHI nodes to update, compute the number of edges from Pred to
+ // BB.
+ if (PhiToAllocaMap.count(APN)) {
+ // We want to be able to distinguish between PHI nodes being inserted by
+ // this invocation of mem2reg from those phi nodes that already existed in
+ // the IR before mem2reg was run. We determine that APN is being inserted
+ // because it is missing incoming edges. All other PHI nodes being
+ // inserted by this pass of mem2reg will have the same number of incoming
+ // operands so far. Remember this count.
+ unsigned NewPHINumOperands = APN->getNumOperands();
+
unsigned NumEdges = llvm::count(successors(Pred), BB);
- assert(NumEdges && "Must be at least one edge from Pred to BB!");
-
- // Add entries for all the phis.
- BasicBlock::iterator PNI = BB->begin();
- do {
- unsigned AllocaNo = PhiToAllocaMap[APN];
-
- // Update the location of the phi node.
- updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo],
- APN->getNumIncomingValues() > 0);
-
- // Add N incoming values to the PHI node.
- for (unsigned i = 0; i != NumEdges; ++i)
- APN->addIncoming(IncomingVals[AllocaNo], Pred);
-
- // The currently active variable for this block is now the PHI.
- IncomingVals[AllocaNo] = APN;
+ assert(NumEdges && "Must be at least one edge from Pred to BB!");
+
+ // Add entries for all the phis.
+ BasicBlock::iterator PNI = BB->begin();
+ do {
+ unsigned AllocaNo = PhiToAllocaMap[APN];
+
+ // Update the location of the phi node.
+ updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo],
+ APN->getNumIncomingValues() > 0);
+
+ // Add N incoming values to the PHI node.
+ for (unsigned i = 0; i != NumEdges; ++i)
+ APN->addIncoming(IncomingVals[AllocaNo], Pred);
+
+ // The currently active variable for this block is now the PHI.
+ IncomingVals[AllocaNo] = APN;
for (DbgVariableIntrinsic *DII : AllocaDbgUsers[AllocaNo])
if (DII->isAddressOfVariable())
ConvertDebugDeclareToDebugValue(DII, APN, DIB);
-
- // Get the next phi node.
- ++PNI;
- APN = dyn_cast<PHINode>(PNI);
- if (!APN)
- break;
-
- // Verify that it is missing entries. If not, it is not being inserted
- // by this mem2reg invocation so we want to ignore it.
- } while (APN->getNumOperands() == NewPHINumOperands);
- }
- }
-
- // Don't revisit blocks.
- if (!Visited.insert(BB).second)
- return;
-
- for (BasicBlock::iterator II = BB->begin(); !II->isTerminator();) {
- Instruction *I = &*II++; // get the instruction, increment iterator
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
- if (!Src)
- continue;
-
- DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src);
- if (AI == AllocaLookup.end())
- continue;
-
- Value *V = IncomingVals[AI->second];
-
- // If the load was marked as nonnull we don't want to lose
- // that information when we erase this Load. So we preserve
- // it with an assume.
- if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
- !isKnownNonZero(V, SQ.DL, 0, AC, LI, &DT))
- addAssumeNonNull(AC, LI);
-
- // Anything using the load now uses the current value.
- LI->replaceAllUsesWith(V);
- BB->getInstList().erase(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Delete this instruction and mark the name as the current holder of the
- // value
- AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
- if (!Dest)
- continue;
-
- DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
- if (ai == AllocaLookup.end())
- continue;
-
- // what value were we writing?
- unsigned AllocaNo = ai->second;
- IncomingVals[AllocaNo] = SI->getOperand(0);
-
- // Record debuginfo for the store before removing it.
- IncomingLocs[AllocaNo] = SI->getDebugLoc();
+
+ // Get the next phi node.
+ ++PNI;
+ APN = dyn_cast<PHINode>(PNI);
+ if (!APN)
+ break;
+
+ // Verify that it is missing entries. If not, it is not being inserted
+ // by this mem2reg invocation so we want to ignore it.
+ } while (APN->getNumOperands() == NewPHINumOperands);
+ }
+ }
+
+ // Don't revisit blocks.
+ if (!Visited.insert(BB).second)
+ return;
+
+ for (BasicBlock::iterator II = BB->begin(); !II->isTerminator();) {
+ Instruction *I = &*II++; // get the instruction, increment iterator
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
+ if (!Src)
+ continue;
+
+ DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src);
+ if (AI == AllocaLookup.end())
+ continue;
+
+ Value *V = IncomingVals[AI->second];
+
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !isKnownNonZero(V, SQ.DL, 0, AC, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ // Anything using the load now uses the current value.
+ LI->replaceAllUsesWith(V);
+ BB->getInstList().erase(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Delete this instruction and mark the name as the current holder of the
+ // value
+ AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
+ if (!Dest)
+ continue;
+
+ DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
+ if (ai == AllocaLookup.end())
+ continue;
+
+ // what value were we writing?
+ unsigned AllocaNo = ai->second;
+ IncomingVals[AllocaNo] = SI->getOperand(0);
+
+ // Record debuginfo for the store before removing it.
+ IncomingLocs[AllocaNo] = SI->getDebugLoc();
for (DbgVariableIntrinsic *DII : AllocaDbgUsers[ai->second])
if (DII->isAddressOfVariable())
ConvertDebugDeclareToDebugValue(DII, SI, DIB);
- BB->getInstList().erase(SI);
- }
- }
-
- // 'Recurse' to our successors.
- succ_iterator I = succ_begin(BB), E = succ_end(BB);
- if (I == E)
- return;
-
- // Keep track of the successors so we don't visit the same successor twice
- SmallPtrSet<BasicBlock *, 8> VisitedSuccs;
-
- // Handle the first successor without using the worklist.
- VisitedSuccs.insert(*I);
- Pred = BB;
- BB = *I;
- ++I;
-
- for (; I != E; ++I)
- if (VisitedSuccs.insert(*I).second)
- Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);
-
- goto NextIteration;
-}
-
-void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AssumptionCache *AC) {
- // If there is nothing to do, bail out...
- if (Allocas.empty())
- return;
-
- PromoteMem2Reg(Allocas, DT, AC).run();
-}
+ BB->getInstList().erase(SI);
+ }
+ }
+
+ // 'Recurse' to our successors.
+ succ_iterator I = succ_begin(BB), E = succ_end(BB);
+ if (I == E)
+ return;
+
+ // Keep track of the successors so we don't visit the same successor twice
+ SmallPtrSet<BasicBlock *, 8> VisitedSuccs;
+
+ // Handle the first successor without using the worklist.
+ VisitedSuccs.insert(*I);
+ Pred = BB;
+ BB = *I;
+ ++I;
+
+ for (; I != E; ++I)
+ if (VisitedSuccs.insert(*I).second)
+ Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);
+
+ goto NextIteration;
+}
+
+void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+ AssumptionCache *AC) {
+ // If there is nothing to do, bail out...
+ if (Allocas.empty())
+ return;
+
+ PromoteMem2Reg(Allocas, DT, AC).run();
+}
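The PromoteMemToReg entry point above is normally driven the way the mem2reg pass does it: collect the allocas in the entry block that isAllocaPromotable accepts, then hand them to the promoter together with the dominator tree. A minimal sketch of such a driver follows; the helper name promoteEntryBlockAllocas and the way the analyses are obtained are illustrative only, not part of this patch.

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>
using namespace llvm;

// Promote every promotable alloca in F's entry block; returns true if any
// alloca was rewritten into SSA values.
static bool promoteEntryBlockAllocas(Function &F, DominatorTree &DT,
                                     AssumptionCache &AC) {
  std::vector<AllocaInst *> Allocas;
  for (Instruction &I : F.getEntryBlock())
    if (auto *AI = dyn_cast<AllocaInst>(&I))
      if (isAllocaPromotable(AI))      // only direct loads/stores, no escapes
        Allocas.push_back(AI);
  if (Allocas.empty())
    return false;
  PromoteMemToReg(Allocas, DT, &AC);   // the routine defined above
  return true;
}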
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp
index 509027119c..c210d1c460 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp
@@ -1,481 +1,481 @@
-//===- SSAUpdater.cpp - Unstructured SSA Update Tool ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the SSAUpdater class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "ssaupdater"
-
-using AvailableValsTy = DenseMap<BasicBlock *, Value *>;
-
-static AvailableValsTy &getAvailableVals(void *AV) {
- return *static_cast<AvailableValsTy*>(AV);
-}
-
-SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode *> *NewPHI)
- : InsertedPHIs(NewPHI) {}
-
-SSAUpdater::~SSAUpdater() {
- delete static_cast<AvailableValsTy*>(AV);
-}
-
-void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
- if (!AV)
- AV = new AvailableValsTy();
- else
- getAvailableVals(AV).clear();
- ProtoType = Ty;
- ProtoName = std::string(Name);
-}
-
-bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
- return getAvailableVals(AV).count(BB);
-}
-
-Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const {
+//===- SSAUpdater.cpp - Unstructured SSA Update Tool ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdater class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ssaupdater"
+
+using AvailableValsTy = DenseMap<BasicBlock *, Value *>;
+
+static AvailableValsTy &getAvailableVals(void *AV) {
+ return *static_cast<AvailableValsTy*>(AV);
+}
+
+SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode *> *NewPHI)
+ : InsertedPHIs(NewPHI) {}
+
+SSAUpdater::~SSAUpdater() {
+ delete static_cast<AvailableValsTy*>(AV);
+}
+
+void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
+ if (!AV)
+ AV = new AvailableValsTy();
+ else
+ getAvailableVals(AV).clear();
+ ProtoType = Ty;
+ ProtoName = std::string(Name);
+}
+
+bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
+ return getAvailableVals(AV).count(BB);
+}
+
+Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const {
return getAvailableVals(AV).lookup(BB);
-}
-
-void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
- assert(ProtoType && "Need to initialize SSAUpdater");
- assert(ProtoType == V->getType() &&
- "All rewritten values must have the same type");
- getAvailableVals(AV)[BB] = V;
-}
-
-static bool IsEquivalentPHI(PHINode *PHI,
- SmallDenseMap<BasicBlock *, Value *, 8> &ValueMapping) {
- unsigned PHINumValues = PHI->getNumIncomingValues();
- if (PHINumValues != ValueMapping.size())
- return false;
-
- // Scan the phi to see if it matches.
- for (unsigned i = 0, e = PHINumValues; i != e; ++i)
- if (ValueMapping[PHI->getIncomingBlock(i)] !=
- PHI->getIncomingValue(i)) {
- return false;
- }
-
- return true;
-}
-
-Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
- Value *Res = GetValueAtEndOfBlockInternal(BB);
- return Res;
-}
-
-Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
- // If there is no definition of the renamed variable in this block, just use
- // GetValueAtEndOfBlock to do our work.
- if (!HasValueForBlock(BB))
- return GetValueAtEndOfBlock(BB);
-
- // Otherwise, we have the hard case. Get the live-in values for each
- // predecessor.
- SmallVector<std::pair<BasicBlock *, Value *>, 8> PredValues;
- Value *SingularValue = nullptr;
-
- // We can get our predecessor info by walking the pred_iterator list, but it
- // is relatively slow. If we already have PHI nodes in this block, walk one
- // of them to get the predecessor list instead.
- if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
- for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
- Value *PredVal = GetValueAtEndOfBlock(PredBB);
- PredValues.push_back(std::make_pair(PredBB, PredVal));
-
- // Compute SingularValue.
- if (i == 0)
- SingularValue = PredVal;
- else if (PredVal != SingularValue)
- SingularValue = nullptr;
- }
- } else {
- bool isFirstPred = true;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *PredBB = *PI;
- Value *PredVal = GetValueAtEndOfBlock(PredBB);
- PredValues.push_back(std::make_pair(PredBB, PredVal));
-
- // Compute SingularValue.
- if (isFirstPred) {
- SingularValue = PredVal;
- isFirstPred = false;
- } else if (PredVal != SingularValue)
- SingularValue = nullptr;
- }
- }
-
- // If there are no predecessors, just return undef.
- if (PredValues.empty())
- return UndefValue::get(ProtoType);
-
- // Otherwise, if all the merged values are the same, just use it.
- if (SingularValue)
- return SingularValue;
-
- // Otherwise, we do need a PHI: check to see if we already have one available
- // in this block that produces the right value.
- if (isa<PHINode>(BB->begin())) {
- SmallDenseMap<BasicBlock *, Value *, 8> ValueMapping(PredValues.begin(),
- PredValues.end());
- for (PHINode &SomePHI : BB->phis()) {
- if (IsEquivalentPHI(&SomePHI, ValueMapping))
- return &SomePHI;
- }
- }
-
- // Ok, we have no way out, insert a new one now.
- PHINode *InsertedPHI = PHINode::Create(ProtoType, PredValues.size(),
- ProtoName, &BB->front());
-
- // Fill in all the predecessors of the PHI.
- for (const auto &PredValue : PredValues)
- InsertedPHI->addIncoming(PredValue.second, PredValue.first);
-
- // See if the PHI node can be merged to a single value. This can happen in
- // loop cases when we get a PHI of itself and one other value.
- if (Value *V =
- SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) {
- InsertedPHI->eraseFromParent();
- return V;
- }
-
- // Set the DebugLoc of the inserted PHI, if available.
- DebugLoc DL;
- if (const Instruction *I = BB->getFirstNonPHI())
- DL = I->getDebugLoc();
- InsertedPHI->setDebugLoc(DL);
-
- // If the client wants to know about all new instructions, tell it.
- if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
-
- LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
- return InsertedPHI;
-}
-
-void SSAUpdater::RewriteUse(Use &U) {
- Instruction *User = cast<Instruction>(U.getUser());
-
- Value *V;
- if (PHINode *UserPN = dyn_cast<PHINode>(User))
- V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
- else
- V = GetValueInMiddleOfBlock(User->getParent());
-
- U.set(V);
-}
-
-void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
- Instruction *User = cast<Instruction>(U.getUser());
-
- Value *V;
- if (PHINode *UserPN = dyn_cast<PHINode>(User))
- V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
- else
- V = GetValueAtEndOfBlock(User->getParent());
-
- U.set(V);
-}
-
-namespace llvm {
-
-template<>
-class SSAUpdaterTraits<SSAUpdater> {
-public:
- using BlkT = BasicBlock;
- using ValT = Value *;
- using PhiT = PHINode;
- using BlkSucc_iterator = succ_iterator;
-
- static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
- static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
-
- class PHI_iterator {
- private:
- PHINode *PHI;
- unsigned idx;
-
- public:
- explicit PHI_iterator(PHINode *P) // begin iterator
- : PHI(P), idx(0) {}
- PHI_iterator(PHINode *P, bool) // end iterator
- : PHI(P), idx(PHI->getNumIncomingValues()) {}
-
- PHI_iterator &operator++() { ++idx; return *this; }
- bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
- bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
-
- Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
- BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
- };
-
- static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
- static PHI_iterator PHI_end(PhiT *PHI) {
- return PHI_iterator(PHI, true);
- }
-
- /// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds
- /// vector, set Info->NumPreds, and allocate space in Info->Preds.
- static void FindPredecessorBlocks(BasicBlock *BB,
- SmallVectorImpl<BasicBlock *> *Preds) {
- // We can get our predecessor info by walking the pred_iterator list,
- // but it is relatively slow. If we already have PHI nodes in this
- // block, walk one of them to get the predecessor list instead.
+}
+
+void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
+ assert(ProtoType && "Need to initialize SSAUpdater");
+ assert(ProtoType == V->getType() &&
+ "All rewritten values must have the same type");
+ getAvailableVals(AV)[BB] = V;
+}
+
+static bool IsEquivalentPHI(PHINode *PHI,
+ SmallDenseMap<BasicBlock *, Value *, 8> &ValueMapping) {
+ unsigned PHINumValues = PHI->getNumIncomingValues();
+ if (PHINumValues != ValueMapping.size())
+ return false;
+
+ // Scan the phi to see if it matches.
+ for (unsigned i = 0, e = PHINumValues; i != e; ++i)
+ if (ValueMapping[PHI->getIncomingBlock(i)] !=
+ PHI->getIncomingValue(i)) {
+ return false;
+ }
+
+ return true;
+}
+
+Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
+ Value *Res = GetValueAtEndOfBlockInternal(BB);
+ return Res;
+}
+
+Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
+ // If there is no definition of the renamed variable in this block, just use
+ // GetValueAtEndOfBlock to do our work.
+ if (!HasValueForBlock(BB))
+ return GetValueAtEndOfBlock(BB);
+
+ // Otherwise, we have the hard case. Get the live-in values for each
+ // predecessor.
+ SmallVector<std::pair<BasicBlock *, Value *>, 8> PredValues;
+ Value *SingularValue = nullptr;
+
+ // We can get our predecessor info by walking the pred_iterator list, but it
+ // is relatively slow. If we already have PHI nodes in this block, walk one
+ // of them to get the predecessor list instead.
+ if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+ for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
+ Value *PredVal = GetValueAtEndOfBlock(PredBB);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (i == 0)
+ SingularValue = PredVal;
+ else if (PredVal != SingularValue)
+ SingularValue = nullptr;
+ }
+ } else {
+ bool isFirstPred = true;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBB = *PI;
+ Value *PredVal = GetValueAtEndOfBlock(PredBB);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (isFirstPred) {
+ SingularValue = PredVal;
+ isFirstPred = false;
+ } else if (PredVal != SingularValue)
+ SingularValue = nullptr;
+ }
+ }
+
+ // If there are no predecessors, just return undef.
+ if (PredValues.empty())
+ return UndefValue::get(ProtoType);
+
+ // Otherwise, if all the merged values are the same, just use it.
+ if (SingularValue)
+ return SingularValue;
+
+ // Otherwise, we do need a PHI: check to see if we already have one available
+ // in this block that produces the right value.
+ if (isa<PHINode>(BB->begin())) {
+ SmallDenseMap<BasicBlock *, Value *, 8> ValueMapping(PredValues.begin(),
+ PredValues.end());
+ for (PHINode &SomePHI : BB->phis()) {
+ if (IsEquivalentPHI(&SomePHI, ValueMapping))
+ return &SomePHI;
+ }
+ }
+
+ // Ok, we have no way out, insert a new one now.
+ PHINode *InsertedPHI = PHINode::Create(ProtoType, PredValues.size(),
+ ProtoName, &BB->front());
+
+ // Fill in all the predecessors of the PHI.
+ for (const auto &PredValue : PredValues)
+ InsertedPHI->addIncoming(PredValue.second, PredValue.first);
+
+ // See if the PHI node can be merged to a single value. This can happen in
+ // loop cases when we get a PHI of itself and one other value.
+ if (Value *V =
+ SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) {
+ InsertedPHI->eraseFromParent();
+ return V;
+ }
+
+ // Set the DebugLoc of the inserted PHI, if available.
+ DebugLoc DL;
+ if (const Instruction *I = BB->getFirstNonPHI())
+ DL = I->getDebugLoc();
+ InsertedPHI->setDebugLoc(DL);
+
+ // If the client wants to know about all new instructions, tell it.
+ if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+ LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ return InsertedPHI;
+}
+
+void SSAUpdater::RewriteUse(Use &U) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Value *V;
+ if (PHINode *UserPN = dyn_cast<PHINode>(User))
+ V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+ else
+ V = GetValueInMiddleOfBlock(User->getParent());
+
+ U.set(V);
+}
+
+void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Value *V;
+ if (PHINode *UserPN = dyn_cast<PHINode>(User))
+ V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+ else
+ V = GetValueAtEndOfBlock(User->getParent());
+
+ U.set(V);
+}
+
+namespace llvm {
+
+template<>
+class SSAUpdaterTraits<SSAUpdater> {
+public:
+ using BlkT = BasicBlock;
+ using ValT = Value *;
+ using PhiT = PHINode;
+ using BlkSucc_iterator = succ_iterator;
+
+ static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
+ static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
+
+ class PHI_iterator {
+ private:
+ PHINode *PHI;
+ unsigned idx;
+
+ public:
+ explicit PHI_iterator(PHINode *P) // begin iterator
+ : PHI(P), idx(0) {}
+ PHI_iterator(PHINode *P, bool) // end iterator
+ : PHI(P), idx(PHI->getNumIncomingValues()) {}
+
+ PHI_iterator &operator++() { ++idx; return *this; }
+ bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+ bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+
+ Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
+ BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
+ };
+
+ static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+ static PHI_iterator PHI_end(PhiT *PHI) {
+ return PHI_iterator(PHI, true);
+ }
+
+ /// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds
+ /// vector, set Info->NumPreds, and allocate space in Info->Preds.
+ static void FindPredecessorBlocks(BasicBlock *BB,
+ SmallVectorImpl<BasicBlock *> *Preds) {
+ // We can get our predecessor info by walking the pred_iterator list,
+ // but it is relatively slow. If we already have PHI nodes in this
+ // block, walk one of them to get the predecessor list instead.
if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin()))
append_range(*Preds, SomePhi->blocks());
else
append_range(*Preds, predecessors(BB));
- }
-
- /// GetUndefVal - Get an undefined value of the same type as the value
- /// being handled.
- static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) {
- return UndefValue::get(Updater->ProtoType);
- }
-
- /// CreateEmptyPHI - Create a new PHI instruction in the specified block.
- /// Reserve space for the operands but do not fill them in yet.
- static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds,
- SSAUpdater *Updater) {
- PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds,
- Updater->ProtoName, &BB->front());
- return PHI;
- }
-
- /// AddPHIOperand - Add the specified value as an operand of the PHI for
- /// the specified predecessor block.
- static void AddPHIOperand(PHINode *PHI, Value *Val, BasicBlock *Pred) {
- PHI->addIncoming(Val, Pred);
- }
-
- /// ValueIsPHI - Check if a value is a PHI.
- static PHINode *ValueIsPHI(Value *Val, SSAUpdater *Updater) {
- return dyn_cast<PHINode>(Val);
- }
-
- /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
- /// operands, i.e., it was just added.
- static PHINode *ValueIsNewPHI(Value *Val, SSAUpdater *Updater) {
- PHINode *PHI = ValueIsPHI(Val, Updater);
- if (PHI && PHI->getNumIncomingValues() == 0)
- return PHI;
- return nullptr;
- }
-
- /// GetPHIValue - For the specified PHI instruction, return the value
- /// that it defines.
- static Value *GetPHIValue(PHINode *PHI) {
- return PHI;
- }
-};
-
-} // end namespace llvm
-
-/// Check to see if AvailableVals has an entry for the specified BB and if so,
-/// return it. If not, construct SSA form by first calculating the required
-/// placement of PHIs and then inserting new PHIs where needed.
-Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
- AvailableValsTy &AvailableVals = getAvailableVals(AV);
- if (Value *V = AvailableVals[BB])
- return V;
-
- SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
- return Impl.GetValue(BB);
-}
-
-//===----------------------------------------------------------------------===//
-// LoadAndStorePromoter Implementation
-//===----------------------------------------------------------------------===//
-
-LoadAndStorePromoter::
-LoadAndStorePromoter(ArrayRef<const Instruction *> Insts,
- SSAUpdater &S, StringRef BaseName) : SSA(S) {
- if (Insts.empty()) return;
-
- const Value *SomeVal;
- if (const LoadInst *LI = dyn_cast<LoadInst>(Insts[0]))
- SomeVal = LI;
- else
- SomeVal = cast<StoreInst>(Insts[0])->getOperand(0);
-
- if (BaseName.empty())
- BaseName = SomeVal->getName();
- SSA.Initialize(SomeVal->getType(), BaseName);
-}
-
-void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
- // First step: bucket up uses of the alloca by the block they occur in.
- // This is important because we have to handle multiple defs/uses in a block
- // ourselves: SSAUpdater is purely for cross-block references.
- DenseMap<BasicBlock *, TinyPtrVector<Instruction *>> UsesByBlock;
-
- for (Instruction *User : Insts)
- UsesByBlock[User->getParent()].push_back(User);
-
- // Okay, now we can iterate over all the blocks in the function with uses,
- // processing them. Keep track of which loads are loading a live-in value.
- // Walk the uses in the use-list order to be determinstic.
- SmallVector<LoadInst *, 32> LiveInLoads;
- DenseMap<Value *, Value *> ReplacedLoads;
-
- for (Instruction *User : Insts) {
- BasicBlock *BB = User->getParent();
- TinyPtrVector<Instruction *> &BlockUses = UsesByBlock[BB];
-
- // If this block has already been processed, ignore this repeat use.
- if (BlockUses.empty()) continue;
-
- // Okay, this is the first use in the block. If this block just has a
- // single user in it, we can rewrite it trivially.
- if (BlockUses.size() == 1) {
- // If it is a store, it is a trivial def of the value in the block.
- if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
- updateDebugInfo(SI);
- SSA.AddAvailableValue(BB, SI->getOperand(0));
- } else
- // Otherwise it is a load, queue it to rewrite as a live-in load.
- LiveInLoads.push_back(cast<LoadInst>(User));
- BlockUses.clear();
- continue;
- }
-
- // Otherwise, check to see if this block is all loads.
- bool HasStore = false;
- for (Instruction *I : BlockUses) {
- if (isa<StoreInst>(I)) {
- HasStore = true;
- break;
- }
- }
-
- // If so, we can queue them all as live in loads. We don't have an
- // efficient way to tell which on is first in the block and don't want to
- // scan large blocks, so just add all loads as live ins.
- if (!HasStore) {
- for (Instruction *I : BlockUses)
- LiveInLoads.push_back(cast<LoadInst>(I));
- BlockUses.clear();
- continue;
- }
-
- // Otherwise, we have mixed loads and stores (or just a bunch of stores).
- // Since SSAUpdater is purely for cross-block values, we need to determine
- // the order of these instructions in the block. If the first use in the
- // block is a load, then it uses the live in value. The last store defines
- // the live out value. We handle this by doing a linear scan of the block.
- Value *StoredValue = nullptr;
- for (Instruction &I : *BB) {
- if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
- // If this is a load from an unrelated pointer, ignore it.
- if (!isInstInList(L, Insts)) continue;
-
- // If we haven't seen a store yet, this is a live in use, otherwise
- // use the stored value.
- if (StoredValue) {
- replaceLoadWithValue(L, StoredValue);
- L->replaceAllUsesWith(StoredValue);
- ReplacedLoads[L] = StoredValue;
- } else {
- LiveInLoads.push_back(L);
- }
- continue;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- // If this is a store to an unrelated pointer, ignore it.
- if (!isInstInList(SI, Insts)) continue;
- updateDebugInfo(SI);
-
- // Remember that this is the active value in the block.
- StoredValue = SI->getOperand(0);
- }
- }
-
- // The last stored value that happened is the live-out for the block.
- assert(StoredValue && "Already checked that there is a store in block");
- SSA.AddAvailableValue(BB, StoredValue);
- BlockUses.clear();
- }
-
- // Okay, now we rewrite all loads that use live-in values in the loop,
- // inserting PHI nodes as necessary.
- for (LoadInst *ALoad : LiveInLoads) {
- Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
- replaceLoadWithValue(ALoad, NewVal);
-
- // Avoid assertions in unreachable code.
- if (NewVal == ALoad) NewVal = UndefValue::get(NewVal->getType());
- ALoad->replaceAllUsesWith(NewVal);
- ReplacedLoads[ALoad] = NewVal;
- }
-
- // Allow the client to do stuff before we start nuking things.
- doExtraRewritesBeforeFinalDeletion();
-
- // Now that everything is rewritten, delete the old instructions from the
- // function. They should all be dead now.
- for (Instruction *User : Insts) {
- // If this is a load that still has uses, then the load must have been added
- // as a live value in the SSAUpdate data structure for a block (e.g. because
- // the loaded value was stored later). In this case, we need to recursively
- // propagate the updates until we get to the real value.
- if (!User->use_empty()) {
- Value *NewVal = ReplacedLoads[User];
- assert(NewVal && "not a replaced load?");
-
- // Propagate down to the ultimate replacee. The intermediately loads
- // could theoretically already have been deleted, so we don't want to
- // dereference the Value*'s.
- DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
- while (RLI != ReplacedLoads.end()) {
- NewVal = RLI->second;
- RLI = ReplacedLoads.find(NewVal);
- }
-
- replaceLoadWithValue(cast<LoadInst>(User), NewVal);
- User->replaceAllUsesWith(NewVal);
- }
-
- instructionDeleted(User);
- User->eraseFromParent();
- }
-}
-
-bool
-LoadAndStorePromoter::isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction *> &Insts)
- const {
- return is_contained(Insts, I);
-}
+ }
+
+ /// GetUndefVal - Get an undefined value of the same type as the value
+ /// being handled.
+ static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) {
+ return UndefValue::get(Updater->ProtoType);
+ }
+
+ /// CreateEmptyPHI - Create a new PHI instruction in the specified block.
+ /// Reserve space for the operands but do not fill them in yet.
+ static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds,
+ SSAUpdater *Updater) {
+ PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds,
+ Updater->ProtoName, &BB->front());
+ return PHI;
+ }
+
+ /// AddPHIOperand - Add the specified value as an operand of the PHI for
+ /// the specified predecessor block.
+ static void AddPHIOperand(PHINode *PHI, Value *Val, BasicBlock *Pred) {
+ PHI->addIncoming(Val, Pred);
+ }
+
+ /// ValueIsPHI - Check if a value is a PHI.
+ static PHINode *ValueIsPHI(Value *Val, SSAUpdater *Updater) {
+ return dyn_cast<PHINode>(Val);
+ }
+
+ /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+ /// operands, i.e., it was just added.
+ static PHINode *ValueIsNewPHI(Value *Val, SSAUpdater *Updater) {
+ PHINode *PHI = ValueIsPHI(Val, Updater);
+ if (PHI && PHI->getNumIncomingValues() == 0)
+ return PHI;
+ return nullptr;
+ }
+
+ /// GetPHIValue - For the specified PHI instruction, return the value
+ /// that it defines.
+ static Value *GetPHIValue(PHINode *PHI) {
+ return PHI;
+ }
+};
+
+} // end namespace llvm
+
+/// Check to see if AvailableVals has an entry for the specified BB and if so,
+/// return it. If not, construct SSA form by first calculating the required
+/// placement of PHIs and then inserting new PHIs where needed.
+Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
+ AvailableValsTy &AvailableVals = getAvailableVals(AV);
+ if (Value *V = AvailableVals[BB])
+ return V;
+
+ SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+ return Impl.GetValue(BB);
+}
+
+//===----------------------------------------------------------------------===//
+// LoadAndStorePromoter Implementation
+//===----------------------------------------------------------------------===//
+
+LoadAndStorePromoter::
+LoadAndStorePromoter(ArrayRef<const Instruction *> Insts,
+ SSAUpdater &S, StringRef BaseName) : SSA(S) {
+ if (Insts.empty()) return;
+
+ const Value *SomeVal;
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Insts[0]))
+ SomeVal = LI;
+ else
+ SomeVal = cast<StoreInst>(Insts[0])->getOperand(0);
+
+ if (BaseName.empty())
+ BaseName = SomeVal->getName();
+ SSA.Initialize(SomeVal->getType(), BaseName);
+}
+
+void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
+ // First step: bucket up uses of the alloca by the block they occur in.
+ // This is important because we have to handle multiple defs/uses in a block
+ // ourselves: SSAUpdater is purely for cross-block references.
+ DenseMap<BasicBlock *, TinyPtrVector<Instruction *>> UsesByBlock;
+
+ for (Instruction *User : Insts)
+ UsesByBlock[User->getParent()].push_back(User);
+
+ // Okay, now we can iterate over all the blocks in the function with uses,
+ // processing them. Keep track of which loads are loading a live-in value.
+  // Walk the uses in the use-list order to be deterministic.
+ SmallVector<LoadInst *, 32> LiveInLoads;
+ DenseMap<Value *, Value *> ReplacedLoads;
+
+ for (Instruction *User : Insts) {
+ BasicBlock *BB = User->getParent();
+ TinyPtrVector<Instruction *> &BlockUses = UsesByBlock[BB];
+
+ // If this block has already been processed, ignore this repeat use.
+ if (BlockUses.empty()) continue;
+
+ // Okay, this is the first use in the block. If this block just has a
+ // single user in it, we can rewrite it trivially.
+ if (BlockUses.size() == 1) {
+ // If it is a store, it is a trivial def of the value in the block.
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ updateDebugInfo(SI);
+ SSA.AddAvailableValue(BB, SI->getOperand(0));
+ } else
+ // Otherwise it is a load, queue it to rewrite as a live-in load.
+ LiveInLoads.push_back(cast<LoadInst>(User));
+ BlockUses.clear();
+ continue;
+ }
+
+ // Otherwise, check to see if this block is all loads.
+ bool HasStore = false;
+ for (Instruction *I : BlockUses) {
+ if (isa<StoreInst>(I)) {
+ HasStore = true;
+ break;
+ }
+ }
+
+ // If so, we can queue them all as live in loads. We don't have an
+    // efficient way to tell which one is first in the block and don't want to
+ // scan large blocks, so just add all loads as live ins.
+ if (!HasStore) {
+ for (Instruction *I : BlockUses)
+ LiveInLoads.push_back(cast<LoadInst>(I));
+ BlockUses.clear();
+ continue;
+ }
+
+ // Otherwise, we have mixed loads and stores (or just a bunch of stores).
+ // Since SSAUpdater is purely for cross-block values, we need to determine
+ // the order of these instructions in the block. If the first use in the
+ // block is a load, then it uses the live in value. The last store defines
+ // the live out value. We handle this by doing a linear scan of the block.
+ Value *StoredValue = nullptr;
+ for (Instruction &I : *BB) {
+ if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
+ // If this is a load from an unrelated pointer, ignore it.
+ if (!isInstInList(L, Insts)) continue;
+
+ // If we haven't seen a store yet, this is a live in use, otherwise
+ // use the stored value.
+ if (StoredValue) {
+ replaceLoadWithValue(L, StoredValue);
+ L->replaceAllUsesWith(StoredValue);
+ ReplacedLoads[L] = StoredValue;
+ } else {
+ LiveInLoads.push_back(L);
+ }
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ // If this is a store to an unrelated pointer, ignore it.
+ if (!isInstInList(SI, Insts)) continue;
+ updateDebugInfo(SI);
+
+ // Remember that this is the active value in the block.
+ StoredValue = SI->getOperand(0);
+ }
+ }
+
+ // The last stored value that happened is the live-out for the block.
+ assert(StoredValue && "Already checked that there is a store in block");
+ SSA.AddAvailableValue(BB, StoredValue);
+ BlockUses.clear();
+ }
+
+ // Okay, now we rewrite all loads that use live-in values in the loop,
+ // inserting PHI nodes as necessary.
+ for (LoadInst *ALoad : LiveInLoads) {
+ Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
+ replaceLoadWithValue(ALoad, NewVal);
+
+ // Avoid assertions in unreachable code.
+ if (NewVal == ALoad) NewVal = UndefValue::get(NewVal->getType());
+ ALoad->replaceAllUsesWith(NewVal);
+ ReplacedLoads[ALoad] = NewVal;
+ }
+
+ // Allow the client to do stuff before we start nuking things.
+ doExtraRewritesBeforeFinalDeletion();
+
+ // Now that everything is rewritten, delete the old instructions from the
+ // function. They should all be dead now.
+ for (Instruction *User : Insts) {
+ // If this is a load that still has uses, then the load must have been added
+    // as a live value in the SSAUpdater data structure for a block (e.g. because
+ // the loaded value was stored later). In this case, we need to recursively
+ // propagate the updates until we get to the real value.
+ if (!User->use_empty()) {
+ Value *NewVal = ReplacedLoads[User];
+ assert(NewVal && "not a replaced load?");
+
+      // Propagate down to the ultimate replacee. The intermediate loads
+ // could theoretically already have been deleted, so we don't want to
+ // dereference the Value*'s.
+ DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
+ while (RLI != ReplacedLoads.end()) {
+ NewVal = RLI->second;
+ RLI = ReplacedLoads.find(NewVal);
+ }
+
+ replaceLoadWithValue(cast<LoadInst>(User), NewVal);
+ User->replaceAllUsesWith(NewVal);
+ }
+
+ instructionDeleted(User);
+ User->eraseFromParent();
+ }
+}
+
+bool
+LoadAndStorePromoter::isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &Insts)
+ const {
+ return is_contained(Insts, I);
+}
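A compact sketch of how clients use the SSAUpdater interface implemented above: register the value that is live out of its defining block, then let RewriteUse materialize PHIs for the remaining uses. The helper rewriteCrossBlockUses and its parameters are illustrative only.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Use.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

// Rewrite the given uses of Def so the IR stays in SSA form, inserting PHI
// nodes where a use is not dominated by a registered definition.
static void rewriteCrossBlockUses(Instruction *Def, ArrayRef<Use *> Uses) {
  SmallVector<PHINode *, 8> NewPHIs;            // receives any PHIs created
  SSAUpdater SSA(&NewPHIs);
  SSA.Initialize(Def->getType(), Def->getName());
  SSA.AddAvailableValue(Def->getParent(), Def); // live out of its own block
  for (Use *U : Uses)
    SSA.RewriteUse(*U);   // inserts PHIs on demand via GetValueInMiddleOfBlock
}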
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp
index 40aa28c7c2..917d5e0a1e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -1,190 +1,190 @@
-//===- SSAUpdaterBulk.cpp - Unstructured SSA Update Tool ------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the SSAUpdaterBulk class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "ssaupdaterbulk"
-
-/// Helper function for finding a block which should have a value for the given
-/// user. For PHI-nodes this block is the corresponding predecessor, for other
-/// instructions it's their parent block.
-static BasicBlock *getUserBB(Use *U) {
- auto *User = cast<Instruction>(U->getUser());
-
- if (auto *UserPN = dyn_cast<PHINode>(User))
- return UserPN->getIncomingBlock(*U);
- else
- return User->getParent();
-}
-
-/// Add a new variable to the SSA rewriter. This needs to be called before
-/// AddAvailableValue or AddUse calls.
-unsigned SSAUpdaterBulk::AddVariable(StringRef Name, Type *Ty) {
- unsigned Var = Rewrites.size();
- LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": initialized with Ty = "
- << *Ty << ", Name = " << Name << "\n");
- RewriteInfo RI(Name, Ty);
- Rewrites.push_back(RI);
- return Var;
-}
-
-/// Indicate that a rewritten value is available in the specified block with the
-/// specified value.
-void SSAUpdaterBulk::AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V) {
- assert(Var < Rewrites.size() && "Variable not found!");
- LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var
- << ": added new available value" << *V << " in "
- << BB->getName() << "\n");
- Rewrites[Var].Defines[BB] = V;
-}
-
-/// Record a use of the symbolic value. This use will be updated with a
-/// rewritten value when RewriteAllUses is called.
-void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) {
- assert(Var < Rewrites.size() && "Variable not found!");
- LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": added a use" << *U->get()
- << " in " << getUserBB(U)->getName() << "\n");
- Rewrites[Var].Uses.push_back(U);
-}
-
-/// Return true if the SSAUpdater already has a value for the specified variable
-/// in the specified block.
-bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) {
- return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false;
-}
-
- // Compute the value at the given block BB. We should either already know it, or
- // be able to reach it recursively by walking up the dominator tree.
-Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R,
- DominatorTree *DT) {
- if (!R.Defines.count(BB)) {
- if (DT->isReachableFromEntry(BB) && PredCache.get(BB).size()) {
- BasicBlock *IDom = DT->getNode(BB)->getIDom()->getBlock();
- Value *V = computeValueAt(IDom, R, DT);
- R.Defines[BB] = V;
- } else
- R.Defines[BB] = UndefValue::get(R.Ty);
- }
- return R.Defines[BB];
-}
-
-/// Given sets of UsingBlocks and DefBlocks, compute the set of LiveInBlocks.
-/// This is basically a subgraph limited by DefBlocks and UsingBlocks.
-static void
-ComputeLiveInBlocks(const SmallPtrSetImpl<BasicBlock *> &UsingBlocks,
- const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
- SmallPtrSetImpl<BasicBlock *> &LiveInBlocks,
- PredIteratorCache &PredCache) {
- // To determine liveness, we must iterate through the predecessors of blocks
- // where the def is live. Blocks are added to the worklist if we need to
- // check their predecessors. Start with all the using blocks.
- SmallVector<BasicBlock *, 64> LiveInBlockWorklist(UsingBlocks.begin(),
- UsingBlocks.end());
-
- // Now that we have a set of blocks where the phi is live-in, recursively add
- // their predecessors until we find the full region the value is live.
- while (!LiveInBlockWorklist.empty()) {
- BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
-
- // The block really is live in here, insert it into the set. If already in
- // the set, then it has already been processed.
- if (!LiveInBlocks.insert(BB).second)
- continue;
-
- // Since the value is live into BB, it is either defined in a predecessor or
- // live into it too. Add the preds to the worklist unless they are a
- // defining block.
- for (BasicBlock *P : PredCache.get(BB)) {
- // The value is not live into a predecessor if it defines the value.
- if (DefBlocks.count(P))
- continue;
-
- // Otherwise it is, add to the worklist.
- LiveInBlockWorklist.push_back(P);
- }
- }
-}
-
-/// Perform all the necessary updates, including new PHI-nodes insertion and the
-/// requested uses update.
-void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
- SmallVectorImpl<PHINode *> *InsertedPHIs) {
- for (auto &R : Rewrites) {
- // Compute locations for new phi-nodes.
- // For that we need to initialize DefBlocks from definitions in R.Defines,
- // UsingBlocks from uses in R.Uses, then compute LiveInBlocks, and then use
- // this set for computing iterated dominance frontier (IDF).
- // The IDF blocks are the blocks where we need to insert new phi-nodes.
- ForwardIDFCalculator IDF(*DT);
- LLVM_DEBUG(dbgs() << "SSAUpdater: rewriting " << R.Uses.size()
- << " use(s)\n");
-
- SmallPtrSet<BasicBlock *, 2> DefBlocks;
- for (auto &Def : R.Defines)
- DefBlocks.insert(Def.first);
- IDF.setDefiningBlocks(DefBlocks);
-
- SmallPtrSet<BasicBlock *, 2> UsingBlocks;
- for (Use *U : R.Uses)
- UsingBlocks.insert(getUserBB(U));
-
- SmallVector<BasicBlock *, 32> IDFBlocks;
- SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
- ComputeLiveInBlocks(UsingBlocks, DefBlocks, LiveInBlocks, PredCache);
- IDF.resetLiveInBlocks();
- IDF.setLiveInBlocks(LiveInBlocks);
- IDF.calculate(IDFBlocks);
-
- // We've computed IDF, now insert new phi-nodes there.
- SmallVector<PHINode *, 4> InsertedPHIsForVar;
- for (auto *FrontierBB : IDFBlocks) {
- IRBuilder<> B(FrontierBB, FrontierBB->begin());
- PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
- R.Defines[FrontierBB] = PN;
- InsertedPHIsForVar.push_back(PN);
- if (InsertedPHIs)
- InsertedPHIs->push_back(PN);
- }
-
- // Fill in arguments of the inserted PHIs.
- for (auto *PN : InsertedPHIsForVar) {
- BasicBlock *PBB = PN->getParent();
- for (BasicBlock *Pred : PredCache.get(PBB))
- PN->addIncoming(computeValueAt(Pred, R, DT), Pred);
- }
-
- // Rewrite actual uses with the inserted definitions.
- SmallPtrSet<Use *, 4> ProcessedUses;
- for (Use *U : R.Uses) {
- if (!ProcessedUses.insert(U).second)
- continue;
- Value *V = computeValueAt(getUserBB(U), R, DT);
- Value *OldVal = U->get();
- assert(OldVal && "Invalid use!");
- // Notify users of the existing value that it is being replaced.
- if (OldVal != V && OldVal->hasValueHandle())
- ValueHandleBase::ValueIsRAUWd(OldVal, V);
- LLVM_DEBUG(dbgs() << "SSAUpdater: replacing " << *OldVal << " with " << *V
- << "\n");
- U->set(V);
- }
- }
-}
+//===- SSAUpdaterBulk.cpp - Unstructured SSA Update Tool ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdaterBulk class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ssaupdaterbulk"
+
+/// Helper function for finding a block which should have a value for the given
+/// user. For PHI-nodes this block is the corresponding predecessor, for other
+/// instructions it's their parent block.
+static BasicBlock *getUserBB(Use *U) {
+ auto *User = cast<Instruction>(U->getUser());
+
+ if (auto *UserPN = dyn_cast<PHINode>(User))
+ return UserPN->getIncomingBlock(*U);
+ else
+ return User->getParent();
+}
+
+/// Add a new variable to the SSA rewriter. This needs to be called before
+/// AddAvailableValue or AddUse calls.
+unsigned SSAUpdaterBulk::AddVariable(StringRef Name, Type *Ty) {
+ unsigned Var = Rewrites.size();
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": initialized with Ty = "
+ << *Ty << ", Name = " << Name << "\n");
+ RewriteInfo RI(Name, Ty);
+ Rewrites.push_back(RI);
+ return Var;
+}
+
+/// Indicate that a rewritten value is available in the specified block with the
+/// specified value.
+void SSAUpdaterBulk::AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var
+ << ": added new available value" << *V << " in "
+ << BB->getName() << "\n");
+ Rewrites[Var].Defines[BB] = V;
+}
+
+/// Record a use of the symbolic value. This use will be updated with a
+/// rewritten value when RewriteAllUses is called.
+void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": added a use" << *U->get()
+ << " in " << getUserBB(U)->getName() << "\n");
+ Rewrites[Var].Uses.push_back(U);
+}
+
+/// Return true if the SSAUpdater already has a value for the specified variable
+/// in the specified block.
+bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) {
+ return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false;
+}
+
+ // Compute the value at the given block BB. We should either already know it, or
+ // be able to reach it recursively by walking up the dominator tree.
+Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R,
+ DominatorTree *DT) {
+ if (!R.Defines.count(BB)) {
+ if (DT->isReachableFromEntry(BB) && PredCache.get(BB).size()) {
+ BasicBlock *IDom = DT->getNode(BB)->getIDom()->getBlock();
+ Value *V = computeValueAt(IDom, R, DT);
+ R.Defines[BB] = V;
+ } else
+ R.Defines[BB] = UndefValue::get(R.Ty);
+ }
+ return R.Defines[BB];
+}
+
+/// Given sets of UsingBlocks and DefBlocks, compute the set of LiveInBlocks.
+/// This is basically a subgraph limited by DefBlocks and UsingBlocks.
+static void
+ComputeLiveInBlocks(const SmallPtrSetImpl<BasicBlock *> &UsingBlocks,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks,
+ PredIteratorCache &PredCache) {
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(UsingBlocks.begin(),
+ UsingBlocks.end());
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it too. Add the preds to the worklist unless they are a
+ // defining block.
+ for (BasicBlock *P : PredCache.get(BB)) {
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// Perform all the necessary updates, including new PHI-nodes insertion and the
+/// requested uses update.
+void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
+ SmallVectorImpl<PHINode *> *InsertedPHIs) {
+ for (auto &R : Rewrites) {
+ // Compute locations for new phi-nodes.
+ // For that we need to initialize DefBlocks from definitions in R.Defines,
+ // UsingBlocks from uses in R.Uses, then compute LiveInBlocks, and then use
+ // this set for computing iterated dominance frontier (IDF).
+ // The IDF blocks are the blocks where we need to insert new phi-nodes.
+ ForwardIDFCalculator IDF(*DT);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: rewriting " << R.Uses.size()
+ << " use(s)\n");
+
+ SmallPtrSet<BasicBlock *, 2> DefBlocks;
+ for (auto &Def : R.Defines)
+ DefBlocks.insert(Def.first);
+ IDF.setDefiningBlocks(DefBlocks);
+
+ SmallPtrSet<BasicBlock *, 2> UsingBlocks;
+ for (Use *U : R.Uses)
+ UsingBlocks.insert(getUserBB(U));
+
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ ComputeLiveInBlocks(UsingBlocks, DefBlocks, LiveInBlocks, PredCache);
+ IDF.resetLiveInBlocks();
+ IDF.setLiveInBlocks(LiveInBlocks);
+ IDF.calculate(IDFBlocks);
+
+ // We've computed IDF, now insert new phi-nodes there.
+ SmallVector<PHINode *, 4> InsertedPHIsForVar;
+ for (auto *FrontierBB : IDFBlocks) {
+ IRBuilder<> B(FrontierBB, FrontierBB->begin());
+ PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
+ R.Defines[FrontierBB] = PN;
+ InsertedPHIsForVar.push_back(PN);
+ if (InsertedPHIs)
+ InsertedPHIs->push_back(PN);
+ }
+
+ // Fill in arguments of the inserted PHIs.
+ for (auto *PN : InsertedPHIsForVar) {
+ BasicBlock *PBB = PN->getParent();
+ for (BasicBlock *Pred : PredCache.get(PBB))
+ PN->addIncoming(computeValueAt(Pred, R, DT), Pred);
+ }
+
+ // Rewrite actual uses with the inserted definitions.
+ SmallPtrSet<Use *, 4> ProcessedUses;
+ for (Use *U : R.Uses) {
+ if (!ProcessedUses.insert(U).second)
+ continue;
+ Value *V = computeValueAt(getUserBB(U), R, DT);
+ Value *OldVal = U->get();
+ assert(OldVal && "Invalid use!");
+ // Notify users of the existing value that it is being replaced.
+ if (OldVal != V && OldVal->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(OldVal, V);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: replacing " << *OldVal << " with " << *V
+ << "\n");
+ U->set(V);
+ }
+ }
+}
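
For reference, the calling convention this file implements looks roughly like the sketch below. The rewriteInBulk() wrapper and the value/block names are hypothetical; AddVariable, AddAvailableValue, AddUse and RewriteAllUses are the SSAUpdaterBulk methods shown above.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
    using namespace llvm;

    // Rewrite a set of uses of one symbolic variable that has two reaching
    // definitions, letting the updater place any required phi-nodes at the IDF.
    void rewriteInBulk(DominatorTree &DT, BasicBlock *BB1, Value *V1,
                       BasicBlock *BB2, Value *V2, ArrayRef<Use *> Uses) {
      SSAUpdaterBulk Updater;
      unsigned Var = Updater.AddVariable("x", V1->getType());
      Updater.AddAvailableValue(Var, BB1, V1);
      Updater.AddAvailableValue(Var, BB2, V2);
      for (Use *U : Uses)
        Updater.AddUse(Var, U);
      SmallVector<PHINode *, 8> NewPHIs;
      Updater.RewriteAllUses(&DT, &NewPHIs); // phis inserted at the IDF, uses rewritten
    }
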
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp
index acecebe646..a1313c77ed 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp
@@ -1,107 +1,107 @@
-//===- SanitizerStats.cpp - Sanitizer statistics gathering ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implements code generation for sanitizer statistics gathering.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SanitizerStats.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-SanitizerStatReport::SanitizerStatReport(Module *M) : M(M) {
- StatTy = ArrayType::get(Type::getInt8PtrTy(M->getContext()), 2);
- EmptyModuleStatsTy = makeModuleStatsTy();
-
- ModuleStatsGV = new GlobalVariable(*M, EmptyModuleStatsTy, false,
- GlobalValue::InternalLinkage, nullptr);
-}
-
-ArrayType *SanitizerStatReport::makeModuleStatsArrayTy() {
- return ArrayType::get(StatTy, Inits.size());
-}
-
-StructType *SanitizerStatReport::makeModuleStatsTy() {
- return StructType::get(M->getContext(), {Type::getInt8PtrTy(M->getContext()),
- Type::getInt32Ty(M->getContext()),
- makeModuleStatsArrayTy()});
-}
-
-void SanitizerStatReport::create(IRBuilder<> &B, SanitizerStatKind SK) {
- Function *F = B.GetInsertBlock()->getParent();
- Module *M = F->getParent();
- PointerType *Int8PtrTy = B.getInt8PtrTy();
- IntegerType *IntPtrTy = B.getIntPtrTy(M->getDataLayout());
- ArrayType *StatTy = ArrayType::get(Int8PtrTy, 2);
-
- Inits.push_back(ConstantArray::get(
- StatTy,
- {Constant::getNullValue(Int8PtrTy),
- ConstantExpr::getIntToPtr(
- ConstantInt::get(IntPtrTy, uint64_t(SK) << (IntPtrTy->getBitWidth() -
- kSanitizerStatKindBits)),
- Int8PtrTy)}));
-
- FunctionType *StatReportTy =
- FunctionType::get(B.getVoidTy(), Int8PtrTy, false);
- FunctionCallee StatReport =
- M->getOrInsertFunction("__sanitizer_stat_report", StatReportTy);
-
- auto InitAddr = ConstantExpr::getGetElementPtr(
- EmptyModuleStatsTy, ModuleStatsGV,
- ArrayRef<Constant *>{
- ConstantInt::get(IntPtrTy, 0), ConstantInt::get(B.getInt32Ty(), 2),
- ConstantInt::get(IntPtrTy, Inits.size() - 1),
- });
- B.CreateCall(StatReport, ConstantExpr::getBitCast(InitAddr, Int8PtrTy));
-}
-
-void SanitizerStatReport::finish() {
- if (Inits.empty()) {
- ModuleStatsGV->eraseFromParent();
- return;
- }
-
- PointerType *Int8PtrTy = Type::getInt8PtrTy(M->getContext());
- IntegerType *Int32Ty = Type::getInt32Ty(M->getContext());
- Type *VoidTy = Type::getVoidTy(M->getContext());
-
- // Create a new ModuleStatsGV to replace the old one. We can't just set the
- // old one's initializer because its type is different.
- auto NewModuleStatsGV = new GlobalVariable(
- *M, makeModuleStatsTy(), false, GlobalValue::InternalLinkage,
- ConstantStruct::getAnon(
- {Constant::getNullValue(Int8PtrTy),
- ConstantInt::get(Int32Ty, Inits.size()),
- ConstantArray::get(makeModuleStatsArrayTy(), Inits)}));
- ModuleStatsGV->replaceAllUsesWith(
- ConstantExpr::getBitCast(NewModuleStatsGV, ModuleStatsGV->getType()));
- ModuleStatsGV->eraseFromParent();
-
- // Create a global constructor to register NewModuleStatsGV.
- auto F = Function::Create(FunctionType::get(VoidTy, false),
- GlobalValue::InternalLinkage, "", M);
- auto BB = BasicBlock::Create(M->getContext(), "", F);
- IRBuilder<> B(BB);
-
- FunctionType *StatInitTy = FunctionType::get(VoidTy, Int8PtrTy, false);
- FunctionCallee StatInit =
- M->getOrInsertFunction("__sanitizer_stat_init", StatInitTy);
-
- B.CreateCall(StatInit, ConstantExpr::getBitCast(NewModuleStatsGV, Int8PtrTy));
- B.CreateRetVoid();
-
- appendToGlobalCtors(*M, F, 0);
-}
+//===- SanitizerStats.cpp - Sanitizer statistics gathering ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements code generation for sanitizer statistics gathering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SanitizerStats.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+SanitizerStatReport::SanitizerStatReport(Module *M) : M(M) {
+ StatTy = ArrayType::get(Type::getInt8PtrTy(M->getContext()), 2);
+ EmptyModuleStatsTy = makeModuleStatsTy();
+
+ ModuleStatsGV = new GlobalVariable(*M, EmptyModuleStatsTy, false,
+ GlobalValue::InternalLinkage, nullptr);
+}
+
+ArrayType *SanitizerStatReport::makeModuleStatsArrayTy() {
+ return ArrayType::get(StatTy, Inits.size());
+}
+
+StructType *SanitizerStatReport::makeModuleStatsTy() {
+ return StructType::get(M->getContext(), {Type::getInt8PtrTy(M->getContext()),
+ Type::getInt32Ty(M->getContext()),
+ makeModuleStatsArrayTy()});
+}
+
+void SanitizerStatReport::create(IRBuilder<> &B, SanitizerStatKind SK) {
+ Function *F = B.GetInsertBlock()->getParent();
+ Module *M = F->getParent();
+ PointerType *Int8PtrTy = B.getInt8PtrTy();
+ IntegerType *IntPtrTy = B.getIntPtrTy(M->getDataLayout());
+ ArrayType *StatTy = ArrayType::get(Int8PtrTy, 2);
+
+ Inits.push_back(ConstantArray::get(
+ StatTy,
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntPtrTy, uint64_t(SK) << (IntPtrTy->getBitWidth() -
+ kSanitizerStatKindBits)),
+ Int8PtrTy)}));
+
+ FunctionType *StatReportTy =
+ FunctionType::get(B.getVoidTy(), Int8PtrTy, false);
+ FunctionCallee StatReport =
+ M->getOrInsertFunction("__sanitizer_stat_report", StatReportTy);
+
+ auto InitAddr = ConstantExpr::getGetElementPtr(
+ EmptyModuleStatsTy, ModuleStatsGV,
+ ArrayRef<Constant *>{
+ ConstantInt::get(IntPtrTy, 0), ConstantInt::get(B.getInt32Ty(), 2),
+ ConstantInt::get(IntPtrTy, Inits.size() - 1),
+ });
+ B.CreateCall(StatReport, ConstantExpr::getBitCast(InitAddr, Int8PtrTy));
+}
+
+void SanitizerStatReport::finish() {
+ if (Inits.empty()) {
+ ModuleStatsGV->eraseFromParent();
+ return;
+ }
+
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M->getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(M->getContext());
+ Type *VoidTy = Type::getVoidTy(M->getContext());
+
+ // Create a new ModuleStatsGV to replace the old one. We can't just set the
+ // old one's initializer because its type is different.
+ auto NewModuleStatsGV = new GlobalVariable(
+ *M, makeModuleStatsTy(), false, GlobalValue::InternalLinkage,
+ ConstantStruct::getAnon(
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantInt::get(Int32Ty, Inits.size()),
+ ConstantArray::get(makeModuleStatsArrayTy(), Inits)}));
+ ModuleStatsGV->replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewModuleStatsGV, ModuleStatsGV->getType()));
+ ModuleStatsGV->eraseFromParent();
+
+ // Create a global constructor to register NewModuleStatsGV.
+ auto F = Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::InternalLinkage, "", M);
+ auto BB = BasicBlock::Create(M->getContext(), "", F);
+ IRBuilder<> B(BB);
+
+ FunctionType *StatInitTy = FunctionType::get(VoidTy, Int8PtrTy, false);
+ FunctionCallee StatInit =
+ M->getOrInsertFunction("__sanitizer_stat_init", StatInitTy);
+
+ B.CreateCall(StatInit, ConstantExpr::getBitCast(NewModuleStatsGV, Int8PtrTy));
+ B.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
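
The class above is driven by the sanitizer/CFI instrumentation passes; a hedged sketch of that usage pattern follows. The instrumentModule() wrapper and the CFIChecks list are illustrative only, while SanitizerStatReport::create()/finish() and the SanStat_CFI_VCall kind are declared in SanitizerStats.h.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/SanitizerStats.h"
    using namespace llvm;

    // Emit one statistics counter bump before each check, then materialize the
    // per-module stats global and the constructor that registers it.
    static void instrumentModule(Module &M, ArrayRef<Instruction *> CFIChecks) {
      SanitizerStatReport Stats(&M);
      for (Instruction *I : CFIChecks) {
        IRBuilder<> B(I);                    // insert the call right before the check
        Stats.create(B, SanStat_CFI_VCall);  // adds a slot, calls __sanitizer_stat_report
      }
      Stats.finish();                        // builds the global and the __sanitizer_stat_init ctor
    }
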
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 3aedba4753..6dbfb0b61f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1,70 +1,70 @@
-//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the scalar evolution expander,
-// which is used to generate the code corresponding to a given scalar evolution
-// expression.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the scalar evolution expander,
+// which is used to generate the code corresponding to a given scalar evolution
+// expression.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-
-using namespace llvm;
-
-cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
- "scev-cheap-expansion-budget", cl::Hidden, cl::init(4),
- cl::desc("When performing SCEV expansion only if it is cheap to do, this "
- "controls the budget that is considered cheap (default = 4)"));
-
-using namespace PatternMatch;
-
-/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP,
+
+using namespace llvm;
+
+cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
+ "scev-cheap-expansion-budget", cl::Hidden, cl::init(4),
+ cl::desc("When performing SCEV expansion only if it is cheap to do, this "
+ "controls the budget that is considered cheap (default = 4)"));
+
+using namespace PatternMatch;
+
+/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP,
/// reusing an existing cast if a suitable one (= dominating IP) exists, or
-/// creating a new one.
-Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
- Instruction::CastOps Op,
- BasicBlock::iterator IP) {
- // This function must be called with the builder having a valid insertion
- // point. It doesn't need to be the actual IP where the uses of the returned
- // cast will be added, but it must dominate such IP.
- // We use this precondition to produce a cast that will dominate all its
- // uses. In particular, this is crucial for the case where the builder's
- // insertion point *is* the point where we were asked to put the cast.
- // Since we don't know the builder's insertion point is actually
- // where the uses will be added (only that it dominates it), we are
- // not allowed to move it.
- BasicBlock::iterator BIP = Builder.GetInsertPoint();
-
- Instruction *Ret = nullptr;
-
- // Check to see if there is already a cast!
+/// creating a new one.
+Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
+ Instruction::CastOps Op,
+ BasicBlock::iterator IP) {
+ // This function must be called with the builder having a valid insertion
+ // point. It doesn't need to be the actual IP where the uses of the returned
+ // cast will be added, but it must dominate such IP.
+ // We use this precondition to produce a cast that will dominate all its
+ // uses. In particular, this is crucial for the case where the builder's
+ // insertion point *is* the point where we were asked to put the cast.
+ // Since we don't know the builder's insertion point is actually
+ // where the uses will be added (only that it dominates it), we are
+ // not allowed to move it.
+ BasicBlock::iterator BIP = Builder.GetInsertPoint();
+
+ Instruction *Ret = nullptr;
+
+ // Check to see if there is already a cast!
for (User *U : V->users()) {
if (U->getType() != Ty)
continue;
CastInst *CI = dyn_cast<CastInst>(U);
if (!CI || CI->getOpcode() != Op)
continue;
-
+
// Found a suitable cast that is at IP or comes before IP. Use it. Note that
// the cast must also properly dominate the Builder's insertion point.
if (IP->getParent() == CI->getParent() && &*BIP != CI &&
@@ -74,58 +74,58 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
}
}
- // Create a new cast.
+ // Create a new cast.
if (!Ret) {
- Ret = CastInst::Create(Op, V, Ty, V->getName(), &*IP);
+ Ret = CastInst::Create(Op, V, Ty, V->getName(), &*IP);
rememberInstruction(Ret);
}
-
- // We assert at the end of the function since IP might point to an
- // instruction with different dominance properties than a cast
- // (an invoke for example) and not dominate BIP (but the cast does).
- assert(SE.DT.dominates(Ret, &*BIP));
-
- return Ret;
-}
-
+
+ // We assert at the end of the function since IP might point to an
+ // instruction with different dominance properties than a cast
+ // (an invoke for example) and not dominate BIP (but the cast does).
+ assert(SE.DT.dominates(Ret, &*BIP));
+
+ return Ret;
+}
+
BasicBlock::iterator
SCEVExpander::findInsertPointAfter(Instruction *I, Instruction *MustDominate) {
- BasicBlock::iterator IP = ++I->getIterator();
- if (auto *II = dyn_cast<InvokeInst>(I))
- IP = II->getNormalDest()->begin();
-
- while (isa<PHINode>(IP))
- ++IP;
-
- if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) {
- ++IP;
- } else if (isa<CatchSwitchInst>(IP)) {
+ BasicBlock::iterator IP = ++I->getIterator();
+ if (auto *II = dyn_cast<InvokeInst>(I))
+ IP = II->getNormalDest()->begin();
+
+ while (isa<PHINode>(IP))
+ ++IP;
+
+ if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) {
+ ++IP;
+ } else if (isa<CatchSwitchInst>(IP)) {
IP = MustDominate->getParent()->getFirstInsertionPt();
- } else {
- assert(!IP->isEHPad() && "unexpected eh pad!");
- }
-
+ } else {
+ assert(!IP->isEHPad() && "unexpected eh pad!");
+ }
+
// Adjust insert point to be after instructions inserted by the expander, so
// we can re-use already inserted instructions. Avoid skipping past the
// original \p MustDominate, in case it is an inserted instruction.
while (isInsertedInstruction(&*IP) && &*IP != MustDominate)
++IP;
- return IP;
-}
-
-/// InsertNoopCastOfTo - Insert a cast of V to the specified type,
-/// which must be possible with a noop cast, doing what we can to share
-/// the casts.
-Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
- Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false);
- assert((Op == Instruction::BitCast ||
- Op == Instruction::PtrToInt ||
- Op == Instruction::IntToPtr) &&
- "InsertNoopCastOfTo cannot perform non-noop casts!");
- assert(SE.getTypeSizeInBits(V->getType()) == SE.getTypeSizeInBits(Ty) &&
- "InsertNoopCastOfTo cannot change sizes!");
-
+ return IP;
+}
+
+/// InsertNoopCastOfTo - Insert a cast of V to the specified type,
+/// which must be possible with a noop cast, doing what we can to share
+/// the casts.
+Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
+ Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false);
+ assert((Op == Instruction::BitCast ||
+ Op == Instruction::PtrToInt ||
+ Op == Instruction::IntToPtr) &&
+ "InsertNoopCastOfTo cannot perform non-noop casts!");
+ assert(SE.getTypeSizeInBits(V->getType()) == SE.getTypeSizeInBits(Ty) &&
+ "InsertNoopCastOfTo cannot change sizes!");
+
// inttoptr only works for integral pointers. For non-integral pointers, we
// can create a GEP on i8* null with the integral value as index. Note that
// it is safe to use GEP of null instead of inttoptr here, because only
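
Most passes never call ReuseOrCreateCast or InsertNoopCastOfTo directly: they construct an SCEVExpander and call expandCodeFor, and the cast-reuse logic above runs underneath. A hedged sketch under that assumption (the emitTripCount() helper and its names are illustrative; expandCodeFor and getBackedgeTakenCount are the real entry points):

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
    using namespace llvm;

    // Expand the loop's backedge-taken count to IR at the preheader terminator.
    static Value *emitTripCount(Loop *L, ScalarEvolution &SE, const DataLayout &DL) {
      const SCEV *BTC = SE.getBackedgeTakenCount(L);
      BasicBlock *Preheader = L->getLoopPreheader();
      if (isa<SCEVCouldNotCompute>(BTC) || !Preheader)
        return nullptr;
      SCEVExpander Expander(SE, DL, "btc");
      // Any casts needed during expansion are reused or created by the code above.
      return Expander.expandCodeFor(BTC, BTC->getType(), Preheader->getTerminator());
    }
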
@@ -142,1070 +142,1070 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
return Builder.CreateBitCast(GEP, Ty);
}
}
- // Short-circuit unnecessary bitcasts.
- if (Op == Instruction::BitCast) {
- if (V->getType() == Ty)
- return V;
- if (CastInst *CI = dyn_cast<CastInst>(V)) {
- if (CI->getOperand(0)->getType() == Ty)
- return CI->getOperand(0);
- }
- }
- // Short-circuit unnecessary inttoptr<->ptrtoint casts.
- if ((Op == Instruction::PtrToInt || Op == Instruction::IntToPtr) &&
- SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(V->getType())) {
- if (CastInst *CI = dyn_cast<CastInst>(V))
- if ((CI->getOpcode() == Instruction::PtrToInt ||
- CI->getOpcode() == Instruction::IntToPtr) &&
- SE.getTypeSizeInBits(CI->getType()) ==
- SE.getTypeSizeInBits(CI->getOperand(0)->getType()))
- return CI->getOperand(0);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if ((CE->getOpcode() == Instruction::PtrToInt ||
- CE->getOpcode() == Instruction::IntToPtr) &&
- SE.getTypeSizeInBits(CE->getType()) ==
- SE.getTypeSizeInBits(CE->getOperand(0)->getType()))
- return CE->getOperand(0);
- }
-
- // Fold a cast of a constant.
- if (Constant *C = dyn_cast<Constant>(V))
- return ConstantExpr::getCast(Op, C, Ty);
-
- // Cast the argument at the beginning of the entry block, after
- // any bitcasts of other arguments.
- if (Argument *A = dyn_cast<Argument>(V)) {
- BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin();
- while ((isa<BitCastInst>(IP) &&
- isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
- cast<BitCastInst>(IP)->getOperand(0) != A) ||
- isa<DbgInfoIntrinsic>(IP))
- ++IP;
- return ReuseOrCreateCast(A, Ty, Op, IP);
- }
-
- // Cast the instruction immediately after the instruction.
- Instruction *I = cast<Instruction>(V);
+ // Short-circuit unnecessary bitcasts.
+ if (Op == Instruction::BitCast) {
+ if (V->getType() == Ty)
+ return V;
+ if (CastInst *CI = dyn_cast<CastInst>(V)) {
+ if (CI->getOperand(0)->getType() == Ty)
+ return CI->getOperand(0);
+ }
+ }
+ // Short-circuit unnecessary inttoptr<->ptrtoint casts.
+ if ((Op == Instruction::PtrToInt || Op == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(V->getType())) {
+ if (CastInst *CI = dyn_cast<CastInst>(V))
+ if ((CI->getOpcode() == Instruction::PtrToInt ||
+ CI->getOpcode() == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(CI->getType()) ==
+ SE.getTypeSizeInBits(CI->getOperand(0)->getType()))
+ return CI->getOperand(0);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if ((CE->getOpcode() == Instruction::PtrToInt ||
+ CE->getOpcode() == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(CE->getType()) ==
+ SE.getTypeSizeInBits(CE->getOperand(0)->getType()))
+ return CE->getOperand(0);
+ }
+
+ // Fold a cast of a constant.
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getCast(Op, C, Ty);
+
+ // Cast the argument at the beginning of the entry block, after
+ // any bitcasts of other arguments.
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin();
+ while ((isa<BitCastInst>(IP) &&
+ isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
+ cast<BitCastInst>(IP)->getOperand(0) != A) ||
+ isa<DbgInfoIntrinsic>(IP))
+ ++IP;
+ return ReuseOrCreateCast(A, Ty, Op, IP);
+ }
+
+ // Cast the instruction immediately after the instruction.
+ Instruction *I = cast<Instruction>(V);
BasicBlock::iterator IP = findInsertPointAfter(I, &*Builder.GetInsertPoint());
- return ReuseOrCreateCast(I, Ty, Op, IP);
-}
-
-/// InsertBinop - Insert the specified binary operator, doing a small amount
-/// of work to avoid inserting an obviously redundant operation, and hoisting
-/// to an outer loop when the opportunity is there and it is safe.
-Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
- Value *LHS, Value *RHS,
- SCEV::NoWrapFlags Flags, bool IsSafeToHoist) {
- // Fold a binop with constant operands.
- if (Constant *CLHS = dyn_cast<Constant>(LHS))
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- return ConstantExpr::get(Opcode, CLHS, CRHS);
-
- // Do a quick scan to see if we have this binop nearby. If so, reuse it.
- unsigned ScanLimit = 6;
- BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
- // Scanning starts from the last instruction before the insertion point.
- BasicBlock::iterator IP = Builder.GetInsertPoint();
- if (IP != BlockBegin) {
- --IP;
- for (; ScanLimit; --IP, --ScanLimit) {
- // Don't count dbg.value against the ScanLimit, to avoid perturbing the
- // generated code.
- if (isa<DbgInfoIntrinsic>(IP))
- ScanLimit++;
-
- auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) {
- // Ensure that no-wrap flags match.
- if (isa<OverflowingBinaryOperator>(I)) {
- if (I->hasNoSignedWrap() != (Flags & SCEV::FlagNSW))
- return true;
- if (I->hasNoUnsignedWrap() != (Flags & SCEV::FlagNUW))
- return true;
- }
- // Conservatively, do not use any instruction which has any exact
- // flags set.
- if (isa<PossiblyExactOperator>(I) && I->isExact())
- return true;
- return false;
- };
- if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS &&
- IP->getOperand(1) == RHS && !canGenerateIncompatiblePoison(&*IP))
- return &*IP;
- if (IP == BlockBegin) break;
- }
- }
-
- // Save the original insertion point so we can restore it when we're done.
- DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
- SCEVInsertPointGuard Guard(Builder, this);
-
- if (IsSafeToHoist) {
- // Move the insertion point out of as many loops as we can.
- while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
- if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break;
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) break;
-
- // Ok, move up a level.
- Builder.SetInsertPoint(Preheader->getTerminator());
- }
- }
-
- // If we haven't found this binop, insert it.
- Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
- BO->setDebugLoc(Loc);
- if (Flags & SCEV::FlagNUW)
- BO->setHasNoUnsignedWrap();
- if (Flags & SCEV::FlagNSW)
- BO->setHasNoSignedWrap();
-
- return BO;
-}
-
-/// FactorOutConstant - Test if S is divisible by Factor, using signed
-/// division. If so, update S with Factor divided out and return true.
-/// S need not be evenly divisible if a reasonable remainder can be
-/// computed.
-static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder,
- const SCEV *Factor, ScalarEvolution &SE,
- const DataLayout &DL) {
- // Everything is divisible by one.
- if (Factor->isOne())
- return true;
-
- // x/x == 1.
- if (S == Factor) {
- S = SE.getConstant(S->getType(), 1);
- return true;
- }
-
- // For a Constant, check for a multiple of the given factor.
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
- // 0/x == 0.
- if (C->isZero())
- return true;
- // Check for divisibility.
- if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) {
- ConstantInt *CI =
- ConstantInt::get(SE.getContext(), C->getAPInt().sdiv(FC->getAPInt()));
- // If the quotient is zero and the remainder is non-zero, reject
- // the value at this scale. It will be considered for subsequent
- // smaller scales.
- if (!CI->isZero()) {
- const SCEV *Div = SE.getConstant(CI);
- S = Div;
- Remainder = SE.getAddExpr(
- Remainder, SE.getConstant(C->getAPInt().srem(FC->getAPInt())));
- return true;
- }
- }
- }
-
- // In a Mul, check if there is a constant operand which is a multiple
- // of the given factor.
- if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
- // Size is known, check if there is a constant operand which is a multiple
- // of the given factor. If so, we can factor it.
- if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor))
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
- if (!C->getAPInt().srem(FC->getAPInt())) {
+ return ReuseOrCreateCast(I, Ty, Op, IP);
+}
+
+/// InsertBinop - Insert the specified binary operator, doing a small amount
+/// of work to avoid inserting an obviously redundant operation, and hoisting
+/// to an outer loop when the opportunity is there and it is safe.
+Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
+ Value *LHS, Value *RHS,
+ SCEV::NoWrapFlags Flags, bool IsSafeToHoist) {
+ // Fold a binop with constant operands.
+ if (Constant *CLHS = dyn_cast<Constant>(LHS))
+ if (Constant *CRHS = dyn_cast<Constant>(RHS))
+ return ConstantExpr::get(Opcode, CLHS, CRHS);
+
+ // Do a quick scan to see if we have this binop nearby. If so, reuse it.
+ unsigned ScanLimit = 6;
+ BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
+ // Scanning starts from the last instruction before the insertion point.
+ BasicBlock::iterator IP = Builder.GetInsertPoint();
+ if (IP != BlockBegin) {
+ --IP;
+ for (; ScanLimit; --IP, --ScanLimit) {
+ // Don't count dbg.value against the ScanLimit, to avoid perturbing the
+ // generated code.
+ if (isa<DbgInfoIntrinsic>(IP))
+ ScanLimit++;
+
+ auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) {
+ // Ensure that no-wrap flags match.
+ if (isa<OverflowingBinaryOperator>(I)) {
+ if (I->hasNoSignedWrap() != (Flags & SCEV::FlagNSW))
+ return true;
+ if (I->hasNoUnsignedWrap() != (Flags & SCEV::FlagNUW))
+ return true;
+ }
+ // Conservatively, do not use any instruction which has any exact
+ // flags set.
+ if (isa<PossiblyExactOperator>(I) && I->isExact())
+ return true;
+ return false;
+ };
+ if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS &&
+ IP->getOperand(1) == RHS && !canGenerateIncompatiblePoison(&*IP))
+ return &*IP;
+ if (IP == BlockBegin) break;
+ }
+ }
+
+ // Save the original insertion point so we can restore it when we're done.
+ DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ if (IsSafeToHoist) {
+ // Move the insertion point out of as many loops as we can.
+ while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
+ if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break;
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) break;
+
+ // Ok, move up a level.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ }
+ }
+
+ // If we haven't found this binop, insert it.
+ Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
+ BO->setDebugLoc(Loc);
+ if (Flags & SCEV::FlagNUW)
+ BO->setHasNoUnsignedWrap();
+ if (Flags & SCEV::FlagNSW)
+ BO->setHasNoSignedWrap();
+
+ return BO;
+}
+
+/// FactorOutConstant - Test if S is divisible by Factor, using signed
+/// division. If so, update S with Factor divided out and return true.
+/// S need not be evenly divisible if a reasonable remainder can be
+/// computed.
+static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder,
+ const SCEV *Factor, ScalarEvolution &SE,
+ const DataLayout &DL) {
+ // Everything is divisible by one.
+ if (Factor->isOne())
+ return true;
+
+ // x/x == 1.
+ if (S == Factor) {
+ S = SE.getConstant(S->getType(), 1);
+ return true;
+ }
+
+ // For a Constant, check for a multiple of the given factor.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ // 0/x == 0.
+ if (C->isZero())
+ return true;
+ // Check for divisibility.
+ if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) {
+ ConstantInt *CI =
+ ConstantInt::get(SE.getContext(), C->getAPInt().sdiv(FC->getAPInt()));
+ // If the quotient is zero and the remainder is non-zero, reject
+ // the value at this scale. It will be considered for subsequent
+ // smaller scales.
+ if (!CI->isZero()) {
+ const SCEV *Div = SE.getConstant(CI);
+ S = Div;
+ Remainder = SE.getAddExpr(
+ Remainder, SE.getConstant(C->getAPInt().srem(FC->getAPInt())));
+ return true;
+ }
+ }
+ }
+
+ // In a Mul, check if there is a constant operand which is a multiple
+ // of the given factor.
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+ // Size is known, check if there is a constant operand which is a multiple
+ // of the given factor. If so, we can factor it.
+ if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor))
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+ if (!C->getAPInt().srem(FC->getAPInt())) {
SmallVector<const SCEV *, 4> NewMulOps(M->operands());
- NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt()));
- S = SE.getMulExpr(NewMulOps);
- return true;
- }
- }
-
- // In an AddRec, check if both start and step are divisible.
- if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
- const SCEV *Step = A->getStepRecurrence(SE);
- const SCEV *StepRem = SE.getConstant(Step->getType(), 0);
- if (!FactorOutConstant(Step, StepRem, Factor, SE, DL))
- return false;
- if (!StepRem->isZero())
- return false;
- const SCEV *Start = A->getStart();
- if (!FactorOutConstant(Start, Remainder, Factor, SE, DL))
- return false;
- S = SE.getAddRecExpr(Start, Step, A->getLoop(),
- A->getNoWrapFlags(SCEV::FlagNW));
- return true;
- }
-
- return false;
-}
-
-/// SimplifyAddOperands - Sort and simplify a list of add operands. NumAddRecs
-/// is the number of SCEVAddRecExprs present, which are kept at the end of
-/// the list.
-///
-static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops,
- Type *Ty,
- ScalarEvolution &SE) {
- unsigned NumAddRecs = 0;
- for (unsigned i = Ops.size(); i > 0 && isa<SCEVAddRecExpr>(Ops[i-1]); --i)
- ++NumAddRecs;
- // Group Ops into non-addrecs and addrecs.
- SmallVector<const SCEV *, 8> NoAddRecs(Ops.begin(), Ops.end() - NumAddRecs);
- SmallVector<const SCEV *, 8> AddRecs(Ops.end() - NumAddRecs, Ops.end());
- // Let ScalarEvolution sort and simplify the non-addrecs list.
- const SCEV *Sum = NoAddRecs.empty() ?
- SE.getConstant(Ty, 0) :
- SE.getAddExpr(NoAddRecs);
- // If it returned an add, use the operands. Otherwise it simplified
- // the sum into a single value, so just use that.
- Ops.clear();
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Sum))
- Ops.append(Add->op_begin(), Add->op_end());
- else if (!Sum->isZero())
- Ops.push_back(Sum);
- // Then append the addrecs.
- Ops.append(AddRecs.begin(), AddRecs.end());
-}
-
-/// SplitAddRecs - Flatten a list of add operands, moving addrec start values
-/// out to the top level. For example, convert {a + b,+,c} to a, b, {0,+,d}.
-/// This helps expose more opportunities for folding parts of the expressions
-/// into GEP indices.
-///
-static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
- Type *Ty,
- ScalarEvolution &SE) {
- // Find the addrecs.
- SmallVector<const SCEV *, 8> AddRecs;
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i])) {
- const SCEV *Start = A->getStart();
- if (Start->isZero()) break;
- const SCEV *Zero = SE.getConstant(Ty, 0);
- AddRecs.push_back(SE.getAddRecExpr(Zero,
- A->getStepRecurrence(SE),
- A->getLoop(),
- A->getNoWrapFlags(SCEV::FlagNW)));
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) {
- Ops[i] = Zero;
- Ops.append(Add->op_begin(), Add->op_end());
- e += Add->getNumOperands();
- } else {
- Ops[i] = Start;
- }
- }
- if (!AddRecs.empty()) {
- // Add the addrecs onto the end of the list.
- Ops.append(AddRecs.begin(), AddRecs.end());
- // Resort the operand list, moving any constants to the front.
- SimplifyAddOperands(Ops, Ty, SE);
- }
-}
-
-/// expandAddToGEP - Expand an addition expression with a pointer type into
-/// a GEP instead of using ptrtoint+arithmetic+inttoptr. This helps
-/// BasicAliasAnalysis and other passes analyze the result. See the rules
-/// for getelementptr vs. inttoptr in
-/// http://llvm.org/docs/LangRef.html#pointeraliasing
-/// for details.
-///
-/// Design note: The correctness of using getelementptr here depends on
-/// ScalarEvolution not recognizing inttoptr and ptrtoint operators, as
-/// they may introduce pointer arithmetic which may not be safely converted
-/// into getelementptr.
-///
-/// Design note: It might seem desirable for this function to be more
-/// loop-aware. If some of the indices are loop-invariant while others
-/// aren't, it might seem desirable to emit multiple GEPs, keeping the
-/// loop-invariant portions of the overall computation outside the loop.
-/// However, there are a few reasons this is not done here. Hoisting simple
-/// arithmetic is a low-level optimization that often isn't very
-/// important until late in the optimization process. In fact, passes
-/// like InstructionCombining will combine GEPs, even if it means
-/// pushing loop-invariant computation down into loops, so even if the
-/// GEPs were split here, the work would quickly be undone. The
-/// LoopStrengthReduction pass, which is usually run quite late (and
-/// after the last InstructionCombining pass), takes care of hoisting
-/// loop-invariant portions of expressions, after considering what
-/// can be folded using target addressing modes.
-///
-Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
- const SCEV *const *op_end,
- PointerType *PTy,
- Type *Ty,
- Value *V) {
- Type *OriginalElTy = PTy->getElementType();
- Type *ElTy = OriginalElTy;
- SmallVector<Value *, 4> GepIndices;
- SmallVector<const SCEV *, 8> Ops(op_begin, op_end);
- bool AnyNonZeroIndices = false;
-
- // Split AddRecs up into parts as either of the parts may be usable
- // without the other.
- SplitAddRecs(Ops, Ty, SE);
-
- Type *IntIdxTy = DL.getIndexType(PTy);
-
- // Descend down the pointer's type and attempt to convert the other
- // operands into GEP indices, at each level. The first index in a GEP
- // indexes into the array implied by the pointer operand; the rest of
- // the indices index into the element or field type selected by the
- // preceding index.
- for (;;) {
- // If the scale size is not 0, attempt to factor out a scale for
- // array indexing.
- SmallVector<const SCEV *, 8> ScaledOps;
- if (ElTy->isSized()) {
- const SCEV *ElSize = SE.getSizeOfExpr(IntIdxTy, ElTy);
- if (!ElSize->isZero()) {
- SmallVector<const SCEV *, 8> NewOps;
- for (const SCEV *Op : Ops) {
- const SCEV *Remainder = SE.getConstant(Ty, 0);
- if (FactorOutConstant(Op, Remainder, ElSize, SE, DL)) {
- // Op now has ElSize factored out.
- ScaledOps.push_back(Op);
- if (!Remainder->isZero())
- NewOps.push_back(Remainder);
- AnyNonZeroIndices = true;
- } else {
- // The operand was not divisible, so add it to the list of operands
- // we'll scan next iteration.
- NewOps.push_back(Op);
- }
- }
- // If we made any changes, update Ops.
- if (!ScaledOps.empty()) {
- Ops = NewOps;
- SimplifyAddOperands(Ops, Ty, SE);
- }
- }
- }
-
- // Record the scaled array index for this level of the type. If
- // we didn't find any operands that could be factored, tentatively
- // assume that element zero was selected (since the zero offset
- // would obviously be folded away).
+ NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt()));
+ S = SE.getMulExpr(NewMulOps);
+ return true;
+ }
+ }
+
+ // In an AddRec, check if both start and step are divisible.
+ if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
+ const SCEV *Step = A->getStepRecurrence(SE);
+ const SCEV *StepRem = SE.getConstant(Step->getType(), 0);
+ if (!FactorOutConstant(Step, StepRem, Factor, SE, DL))
+ return false;
+ if (!StepRem->isZero())
+ return false;
+ const SCEV *Start = A->getStart();
+ if (!FactorOutConstant(Start, Remainder, Factor, SE, DL))
+ return false;
+ S = SE.getAddRecExpr(Start, Step, A->getLoop(),
+ A->getNoWrapFlags(SCEV::FlagNW));
+ return true;
+ }
+
+ return false;
+}
+
+/// SimplifyAddOperands - Sort and simplify a list of add operands. NumAddRecs
+/// is the number of SCEVAddRecExprs present, which are kept at the end of
+/// the list.
+///
+static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops,
+ Type *Ty,
+ ScalarEvolution &SE) {
+ unsigned NumAddRecs = 0;
+ for (unsigned i = Ops.size(); i > 0 && isa<SCEVAddRecExpr>(Ops[i-1]); --i)
+ ++NumAddRecs;
+ // Group Ops into non-addrecs and addrecs.
+ SmallVector<const SCEV *, 8> NoAddRecs(Ops.begin(), Ops.end() - NumAddRecs);
+ SmallVector<const SCEV *, 8> AddRecs(Ops.end() - NumAddRecs, Ops.end());
+ // Let ScalarEvolution sort and simplify the non-addrecs list.
+ const SCEV *Sum = NoAddRecs.empty() ?
+ SE.getConstant(Ty, 0) :
+ SE.getAddExpr(NoAddRecs);
+ // If it returned an add, use the operands. Otherwise it simplified
+ // the sum into a single value, so just use that.
+ Ops.clear();
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Sum))
+ Ops.append(Add->op_begin(), Add->op_end());
+ else if (!Sum->isZero())
+ Ops.push_back(Sum);
+ // Then append the addrecs.
+ Ops.append(AddRecs.begin(), AddRecs.end());
+}
+
+/// SplitAddRecs - Flatten a list of add operands, moving addrec start values
+/// out to the top level. For example, convert {a + b,+,c} to a, b, {0,+,d}.
+/// This helps expose more opportunities for folding parts of the expressions
+/// into GEP indices.
+///
+static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
+ Type *Ty,
+ ScalarEvolution &SE) {
+ // Find the addrecs.
+ SmallVector<const SCEV *, 8> AddRecs;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i])) {
+ const SCEV *Start = A->getStart();
+ if (Start->isZero()) break;
+ const SCEV *Zero = SE.getConstant(Ty, 0);
+ AddRecs.push_back(SE.getAddRecExpr(Zero,
+ A->getStepRecurrence(SE),
+ A->getLoop(),
+ A->getNoWrapFlags(SCEV::FlagNW)));
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) {
+ Ops[i] = Zero;
+ Ops.append(Add->op_begin(), Add->op_end());
+ e += Add->getNumOperands();
+ } else {
+ Ops[i] = Start;
+ }
+ }
+ if (!AddRecs.empty()) {
+ // Add the addrecs onto the end of the list.
+ Ops.append(AddRecs.begin(), AddRecs.end());
+ // Resort the operand list, moving any constants to the front.
+ SimplifyAddOperands(Ops, Ty, SE);
+ }
+}
+
+/// expandAddToGEP - Expand an addition expression with a pointer type into
+/// a GEP instead of using ptrtoint+arithmetic+inttoptr. This helps
+/// BasicAliasAnalysis and other passes analyze the result. See the rules
+/// for getelementptr vs. inttoptr in
+/// http://llvm.org/docs/LangRef.html#pointeraliasing
+/// for details.
+///
+/// Design note: The correctness of using getelementptr here depends on
+/// ScalarEvolution not recognizing inttoptr and ptrtoint operators, as
+/// they may introduce pointer arithmetic which may not be safely converted
+/// into getelementptr.
+///
+/// Design note: It might seem desirable for this function to be more
+/// loop-aware. If some of the indices are loop-invariant while others
+/// aren't, it might seem desirable to emit multiple GEPs, keeping the
+/// loop-invariant portions of the overall computation outside the loop.
+/// However, there are a few reasons this is not done here. Hoisting simple
+/// arithmetic is a low-level optimization that often isn't very
+/// important until late in the optimization process. In fact, passes
+/// like InstructionCombining will combine GEPs, even if it means
+/// pushing loop-invariant computation down into loops, so even if the
+/// GEPs were split here, the work would quickly be undone. The
+/// LoopStrengthReduction pass, which is usually run quite late (and
+/// after the last InstructionCombining pass), takes care of hoisting
+/// loop-invariant portions of expressions, after considering what
+/// can be folded using target addressing modes.
+///
+Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
+ const SCEV *const *op_end,
+ PointerType *PTy,
+ Type *Ty,
+ Value *V) {
+ Type *OriginalElTy = PTy->getElementType();
+ Type *ElTy = OriginalElTy;
+ SmallVector<Value *, 4> GepIndices;
+ SmallVector<const SCEV *, 8> Ops(op_begin, op_end);
+ bool AnyNonZeroIndices = false;
+
+ // Split AddRecs up into parts as either of the parts may be usable
+ // without the other.
+ SplitAddRecs(Ops, Ty, SE);
+
+ Type *IntIdxTy = DL.getIndexType(PTy);
+
+ // Descend down the pointer's type and attempt to convert the other
+ // operands into GEP indices, at each level. The first index in a GEP
+ // indexes into the array implied by the pointer operand; the rest of
+ // the indices index into the element or field type selected by the
+ // preceding index.
+ for (;;) {
+ // If the scale size is not 0, attempt to factor out a scale for
+ // array indexing.
+ SmallVector<const SCEV *, 8> ScaledOps;
+ if (ElTy->isSized()) {
+ const SCEV *ElSize = SE.getSizeOfExpr(IntIdxTy, ElTy);
+ if (!ElSize->isZero()) {
+ SmallVector<const SCEV *, 8> NewOps;
+ for (const SCEV *Op : Ops) {
+ const SCEV *Remainder = SE.getConstant(Ty, 0);
+ if (FactorOutConstant(Op, Remainder, ElSize, SE, DL)) {
+ // Op now has ElSize factored out.
+ ScaledOps.push_back(Op);
+ if (!Remainder->isZero())
+ NewOps.push_back(Remainder);
+ AnyNonZeroIndices = true;
+ } else {
+ // The operand was not divisible, so add it to the list of operands
+ // we'll scan next iteration.
+ NewOps.push_back(Op);
+ }
+ }
+ // If we made any changes, update Ops.
+ if (!ScaledOps.empty()) {
+ Ops = NewOps;
+ SimplifyAddOperands(Ops, Ty, SE);
+ }
+ }
+ }
+
+ // Record the scaled array index for this level of the type. If
+ // we didn't find any operands that could be factored, tentatively
+ // assume that element zero was selected (since the zero offset
+ // would obviously be folded away).
Value *Scaled =
ScaledOps.empty()
? Constant::getNullValue(Ty)
: expandCodeForImpl(SE.getAddExpr(ScaledOps), Ty, false);
- GepIndices.push_back(Scaled);
-
- // Collect struct field index operands.
- while (StructType *STy = dyn_cast<StructType>(ElTy)) {
- bool FoundFieldNo = false;
- // An empty struct has no fields.
- if (STy->getNumElements() == 0) break;
- // Field offsets are known. See if a constant offset falls within any of
- // the struct fields.
- if (Ops.empty())
- break;
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0]))
- if (SE.getTypeSizeInBits(C->getType()) <= 64) {
- const StructLayout &SL = *DL.getStructLayout(STy);
- uint64_t FullOffset = C->getValue()->getZExtValue();
- if (FullOffset < SL.getSizeInBytes()) {
- unsigned ElIdx = SL.getElementContainingOffset(FullOffset);
- GepIndices.push_back(
- ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx));
- ElTy = STy->getTypeAtIndex(ElIdx);
- Ops[0] =
- SE.getConstant(Ty, FullOffset - SL.getElementOffset(ElIdx));
- AnyNonZeroIndices = true;
- FoundFieldNo = true;
- }
- }
- // If no struct field offsets were found, tentatively assume that
- // field zero was selected (since the zero offset would obviously
- // be folded away).
- if (!FoundFieldNo) {
- ElTy = STy->getTypeAtIndex(0u);
- GepIndices.push_back(
- Constant::getNullValue(Type::getInt32Ty(Ty->getContext())));
- }
- }
-
- if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
- ElTy = ATy->getElementType();
- else
- // FIXME: Handle VectorType.
- // E.g., if ElTy is a scalable vector, then ElSize is not a compile-time
- // constant and therefore cannot be factored out. The generated IR is less
- // ideal, with the base 'V' cast to i8* and an ugly getelementptr over that.
- break;
- }
-
- // If none of the operands were convertible to proper GEP indices, cast
- // the base to i8* and do an ugly getelementptr with that. It's still
- // better than ptrtoint+arithmetic+inttoptr at least.
- if (!AnyNonZeroIndices) {
- // Cast the base to i8*.
- V = InsertNoopCastOfTo(V,
- Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace()));
-
- assert(!isa<Instruction>(V) ||
- SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint()));
-
- // Expand the operands for a plain byte offset.
+ GepIndices.push_back(Scaled);
+
+ // Collect struct field index operands.
+ while (StructType *STy = dyn_cast<StructType>(ElTy)) {
+ bool FoundFieldNo = false;
+ // An empty struct has no fields.
+ if (STy->getNumElements() == 0) break;
+ // Field offsets are known. See if a constant offset falls within any of
+ // the struct fields.
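+      // Illustrative example: assuming a typical data layout where { i32, i64 }
+      // has field offsets 0 and 8, a constant offset of 12 selects field 1 and
+      // leaves a remaining offset of 4 for the levels below.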
+ if (Ops.empty())
+ break;
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0]))
+ if (SE.getTypeSizeInBits(C->getType()) <= 64) {
+ const StructLayout &SL = *DL.getStructLayout(STy);
+ uint64_t FullOffset = C->getValue()->getZExtValue();
+ if (FullOffset < SL.getSizeInBytes()) {
+ unsigned ElIdx = SL.getElementContainingOffset(FullOffset);
+ GepIndices.push_back(
+ ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx));
+ ElTy = STy->getTypeAtIndex(ElIdx);
+ Ops[0] =
+ SE.getConstant(Ty, FullOffset - SL.getElementOffset(ElIdx));
+ AnyNonZeroIndices = true;
+ FoundFieldNo = true;
+ }
+ }
+ // If no struct field offsets were found, tentatively assume that
+ // field zero was selected (since the zero offset would obviously
+ // be folded away).
+ if (!FoundFieldNo) {
+ ElTy = STy->getTypeAtIndex(0u);
+ GepIndices.push_back(
+ Constant::getNullValue(Type::getInt32Ty(Ty->getContext())));
+ }
+ }
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
+ ElTy = ATy->getElementType();
+ else
+ // FIXME: Handle VectorType.
+        // E.g., if ElTy is a scalable vector, then ElSize is not a compile-time
+        // constant and therefore cannot be factored out. The generated IR is less
+        // ideal: base 'V' is cast to i8* and an ugly getelementptr is emitted over that.
+ break;
+ }
+
+ // If none of the operands were convertible to proper GEP indices, cast
+ // the base to i8* and do an ugly getelementptr with that. It's still
+ // better than ptrtoint+arithmetic+inttoptr at least.
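+  // The resulting IR has roughly this shape (value names are illustrative
+  // only):
+  //   %base.i8 = bitcast <ty>* %base to i8*
+  //   %uglygep = getelementptr i8, i8* %base.i8, i64 %byteoffset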
+ if (!AnyNonZeroIndices) {
+ // Cast the base to i8*.
+ V = InsertNoopCastOfTo(V,
+ Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace()));
+
+ assert(!isa<Instruction>(V) ||
+ SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint()));
+
+ // Expand the operands for a plain byte offset.
Value *Idx = expandCodeForImpl(SE.getAddExpr(Ops), Ty, false);
-
- // Fold a GEP with constant operands.
- if (Constant *CLHS = dyn_cast<Constant>(V))
- if (Constant *CRHS = dyn_cast<Constant>(Idx))
- return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ty->getContext()),
- CLHS, CRHS);
-
- // Do a quick scan to see if we have this GEP nearby. If so, reuse it.
- unsigned ScanLimit = 6;
- BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
- // Scanning starts from the last instruction before the insertion point.
- BasicBlock::iterator IP = Builder.GetInsertPoint();
- if (IP != BlockBegin) {
- --IP;
- for (; ScanLimit; --IP, --ScanLimit) {
- // Don't count dbg.value against the ScanLimit, to avoid perturbing the
- // generated code.
- if (isa<DbgInfoIntrinsic>(IP))
- ScanLimit++;
- if (IP->getOpcode() == Instruction::GetElementPtr &&
- IP->getOperand(0) == V && IP->getOperand(1) == Idx)
- return &*IP;
- if (IP == BlockBegin) break;
- }
- }
-
- // Save the original insertion point so we can restore it when we're done.
- SCEVInsertPointGuard Guard(Builder, this);
-
- // Move the insertion point out of as many loops as we can.
- while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
- if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break;
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) break;
-
- // Ok, move up a level.
- Builder.SetInsertPoint(Preheader->getTerminator());
- }
-
- // Emit a GEP.
+
+ // Fold a GEP with constant operands.
+ if (Constant *CLHS = dyn_cast<Constant>(V))
+ if (Constant *CRHS = dyn_cast<Constant>(Idx))
+ return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ty->getContext()),
+ CLHS, CRHS);
+
+ // Do a quick scan to see if we have this GEP nearby. If so, reuse it.
+ unsigned ScanLimit = 6;
+ BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
+ // Scanning starts from the last instruction before the insertion point.
+ BasicBlock::iterator IP = Builder.GetInsertPoint();
+ if (IP != BlockBegin) {
+ --IP;
+ for (; ScanLimit; --IP, --ScanLimit) {
+ // Don't count dbg.value against the ScanLimit, to avoid perturbing the
+ // generated code.
+ if (isa<DbgInfoIntrinsic>(IP))
+ ScanLimit++;
+ if (IP->getOpcode() == Instruction::GetElementPtr &&
+ IP->getOperand(0) == V && IP->getOperand(1) == Idx)
+ return &*IP;
+ if (IP == BlockBegin) break;
+ }
+ }
+
+ // Save the original insertion point so we can restore it when we're done.
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ // Move the insertion point out of as many loops as we can.
+ while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
+ if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break;
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) break;
+
+ // Ok, move up a level.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ }
+
+ // Emit a GEP.
return Builder.CreateGEP(Builder.getInt8Ty(), V, Idx, "uglygep");
- }
-
- {
- SCEVInsertPointGuard Guard(Builder, this);
-
- // Move the insertion point out of as many loops as we can.
- while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
- if (!L->isLoopInvariant(V)) break;
-
- bool AnyIndexNotLoopInvariant = any_of(
- GepIndices, [L](Value *Op) { return !L->isLoopInvariant(Op); });
-
- if (AnyIndexNotLoopInvariant)
- break;
-
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) break;
-
- // Ok, move up a level.
- Builder.SetInsertPoint(Preheader->getTerminator());
- }
-
- // Insert a pretty getelementptr. Note that this GEP is not marked inbounds,
- // because ScalarEvolution may have changed the address arithmetic to
- // compute a value which is beyond the end of the allocated object.
- Value *Casted = V;
- if (V->getType() != PTy)
- Casted = InsertNoopCastOfTo(Casted, PTy);
- Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep");
- Ops.push_back(SE.getUnknown(GEP));
- }
-
- return expand(SE.getAddExpr(Ops));
-}
-
-Value *SCEVExpander::expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty,
- Value *V) {
- const SCEV *const Ops[1] = {Op};
- return expandAddToGEP(Ops, Ops + 1, PTy, Ty, V);
-}
-
-/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for
-/// SCEV expansion. If they are nested, this is the most nested. If they are
-/// neighboring, pick the later.
-static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B,
- DominatorTree &DT) {
- if (!A) return B;
- if (!B) return A;
- if (A->contains(B)) return B;
- if (B->contains(A)) return A;
- if (DT.dominates(A->getHeader(), B->getHeader())) return B;
- if (DT.dominates(B->getHeader(), A->getHeader())) return A;
- return A; // Arbitrarily break the tie.
-}
-
-/// getRelevantLoop - Get the most relevant loop associated with the given
-/// expression, according to PickMostRelevantLoop.
-const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) {
- // Test whether we've already computed the most relevant loop for this SCEV.
- auto Pair = RelevantLoops.insert(std::make_pair(S, nullptr));
- if (!Pair.second)
- return Pair.first->second;
-
- if (isa<SCEVConstant>(S))
- // A constant has no relevant loops.
- return nullptr;
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
- if (const Instruction *I = dyn_cast<Instruction>(U->getValue()))
- return Pair.first->second = SE.LI.getLoopFor(I->getParent());
- // A non-instruction has no relevant loops.
- return nullptr;
- }
- if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) {
- const Loop *L = nullptr;
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
- L = AR->getLoop();
- for (const SCEV *Op : N->operands())
- L = PickMostRelevantLoop(L, getRelevantLoop(Op), SE.DT);
- return RelevantLoops[N] = L;
- }
- if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) {
- const Loop *Result = getRelevantLoop(C->getOperand());
- return RelevantLoops[C] = Result;
- }
- if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
- const Loop *Result = PickMostRelevantLoop(
- getRelevantLoop(D->getLHS()), getRelevantLoop(D->getRHS()), SE.DT);
- return RelevantLoops[D] = Result;
- }
- llvm_unreachable("Unexpected SCEV type!");
-}
-
-namespace {
-
-/// LoopCompare - Compare loops by PickMostRelevantLoop.
-class LoopCompare {
- DominatorTree &DT;
-public:
- explicit LoopCompare(DominatorTree &dt) : DT(dt) {}
-
- bool operator()(std::pair<const Loop *, const SCEV *> LHS,
- std::pair<const Loop *, const SCEV *> RHS) const {
- // Keep pointer operands sorted at the end.
- if (LHS.second->getType()->isPointerTy() !=
- RHS.second->getType()->isPointerTy())
- return LHS.second->getType()->isPointerTy();
-
- // Compare loops with PickMostRelevantLoop.
- if (LHS.first != RHS.first)
- return PickMostRelevantLoop(LHS.first, RHS.first, DT) != LHS.first;
-
- // If one operand is a non-constant negative and the other is not,
- // put the non-constant negative on the right so that a sub can
- // be used instead of a negate and add.
- if (LHS.second->isNonConstantNegative()) {
- if (!RHS.second->isNonConstantNegative())
- return false;
- } else if (RHS.second->isNonConstantNegative())
- return true;
-
- // Otherwise they are equivalent according to this comparison.
- return false;
- }
-};
-
-}
-
-Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
-
- // Collect all the add operands in a loop, along with their associated loops.
- // Iterate in reverse so that constants are emitted last, all else equal, and
- // so that pointer operands are inserted first, which the code below relies on
- // to form more involved GEPs.
- SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
- for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(S->op_end()),
- E(S->op_begin()); I != E; ++I)
- OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
-
- // Sort by loop. Use a stable sort so that constants follow non-constants and
- // pointer operands precede non-pointer operands.
- llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
-
- // Emit instructions to add all the operands. Hoist as much as possible
- // out of loops, and form meaningful getelementptrs where possible.
- Value *Sum = nullptr;
- for (auto I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E;) {
- const Loop *CurLoop = I->first;
- const SCEV *Op = I->second;
- if (!Sum) {
- // This is the first operand. Just expand it.
- Sum = expand(Op);
- ++I;
- } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
- // The running sum expression is a pointer. Try to form a getelementptr
- // at this level with that as the base.
- SmallVector<const SCEV *, 4> NewOps;
- for (; I != E && I->first == CurLoop; ++I) {
-        // If the operand is a SCEVUnknown of a non-instruction value, peek
-        // through it to enable more of it to be folded into the GEP.
- const SCEV *X = I->second;
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(X))
- if (!isa<Instruction>(U->getValue()))
- X = SE.getSCEV(U->getValue());
- NewOps.push_back(X);
- }
- Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
- } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
- // The running sum is an integer, and there's a pointer at this level.
-      // Try to form a getelementptr. If the running sum is an instruction,
-      // wrap it in a SCEVUnknown to avoid re-analyzing it.
- SmallVector<const SCEV *, 4> NewOps;
- NewOps.push_back(isa<Instruction>(Sum) ? SE.getUnknown(Sum) :
- SE.getSCEV(Sum));
- for (++I; I != E && I->first == CurLoop; ++I)
- NewOps.push_back(I->second);
- Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op));
- } else if (Op->isNonConstantNegative()) {
- // Instead of doing a negate and add, just do a subtract.
+ }
+
+ {
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ // Move the insertion point out of as many loops as we can.
+ while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
+ if (!L->isLoopInvariant(V)) break;
+
+ bool AnyIndexNotLoopInvariant = any_of(
+ GepIndices, [L](Value *Op) { return !L->isLoopInvariant(Op); });
+
+ if (AnyIndexNotLoopInvariant)
+ break;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) break;
+
+ // Ok, move up a level.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ }
+
+ // Insert a pretty getelementptr. Note that this GEP is not marked inbounds,
+ // because ScalarEvolution may have changed the address arithmetic to
+ // compute a value which is beyond the end of the allocated object.
+ Value *Casted = V;
+ if (V->getType() != PTy)
+ Casted = InsertNoopCastOfTo(Casted, PTy);
+ Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep");
+ Ops.push_back(SE.getUnknown(GEP));
+ }
+
+ return expand(SE.getAddExpr(Ops));
+}
+
+Value *SCEVExpander::expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty,
+ Value *V) {
+ const SCEV *const Ops[1] = {Op};
+ return expandAddToGEP(Ops, Ops + 1, PTy, Ty, V);
+}
+
+/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for
+/// SCEV expansion. If they are nested, this is the most nested. If they are
+/// neighboring, pick the later.
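+/// For example, if B is nested inside A, B is returned; for two sibling
+/// loops where A's header dominates B's header, B (the later loop) is
+/// returned.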
+static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B,
+ DominatorTree &DT) {
+ if (!A) return B;
+ if (!B) return A;
+ if (A->contains(B)) return B;
+ if (B->contains(A)) return A;
+ if (DT.dominates(A->getHeader(), B->getHeader())) return B;
+ if (DT.dominates(B->getHeader(), A->getHeader())) return A;
+ return A; // Arbitrarily break the tie.
+}
+
+/// getRelevantLoop - Get the most relevant loop associated with the given
+/// expression, according to PickMostRelevantLoop.
+const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) {
+ // Test whether we've already computed the most relevant loop for this SCEV.
+ auto Pair = RelevantLoops.insert(std::make_pair(S, nullptr));
+ if (!Pair.second)
+ return Pair.first->second;
+
+ if (isa<SCEVConstant>(S))
+ // A constant has no relevant loops.
+ return nullptr;
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+ if (const Instruction *I = dyn_cast<Instruction>(U->getValue()))
+ return Pair.first->second = SE.LI.getLoopFor(I->getParent());
+ // A non-instruction has no relevant loops.
+ return nullptr;
+ }
+ if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) {
+ const Loop *L = nullptr;
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+ L = AR->getLoop();
+ for (const SCEV *Op : N->operands())
+ L = PickMostRelevantLoop(L, getRelevantLoop(Op), SE.DT);
+ return RelevantLoops[N] = L;
+ }
+ if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) {
+ const Loop *Result = getRelevantLoop(C->getOperand());
+ return RelevantLoops[C] = Result;
+ }
+ if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ const Loop *Result = PickMostRelevantLoop(
+ getRelevantLoop(D->getLHS()), getRelevantLoop(D->getRHS()), SE.DT);
+ return RelevantLoops[D] = Result;
+ }
+ llvm_unreachable("Unexpected SCEV type!");
+}
+
+namespace {
+
+/// LoopCompare - Compare loops by PickMostRelevantLoop.
+class LoopCompare {
+ DominatorTree &DT;
+public:
+ explicit LoopCompare(DominatorTree &dt) : DT(dt) {}
+
+ bool operator()(std::pair<const Loop *, const SCEV *> LHS,
+ std::pair<const Loop *, const SCEV *> RHS) const {
+ // Keep pointer operands sorted at the end.
+ if (LHS.second->getType()->isPointerTy() !=
+ RHS.second->getType()->isPointerTy())
+ return LHS.second->getType()->isPointerTy();
+
+ // Compare loops with PickMostRelevantLoop.
+ if (LHS.first != RHS.first)
+ return PickMostRelevantLoop(LHS.first, RHS.first, DT) != LHS.first;
+
+ // If one operand is a non-constant negative and the other is not,
+ // put the non-constant negative on the right so that a sub can
+ // be used instead of a negate and add.
+ if (LHS.second->isNonConstantNegative()) {
+ if (!RHS.second->isNonConstantNegative())
+ return false;
+ } else if (RHS.second->isNonConstantNegative())
+ return true;
+
+ // Otherwise they are equivalent according to this comparison.
+ return false;
+ }
+};
+
+}
+
+Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+ // Collect all the add operands in a loop, along with their associated loops.
+ // Iterate in reverse so that constants are emitted last, all else equal, and
+ // so that pointer operands are inserted first, which the code below relies on
+ // to form more involved GEPs.
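+  // For instance, a sum such as (%ptr + %i + 16), where %ptr is a pointer,
+  // can be emitted as a single getelementptr with %i and 16 folded into the
+  // indices when the element type allows it (names are illustrative).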
+ SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
+ for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(S->op_end()),
+ E(S->op_begin()); I != E; ++I)
+ OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+
+ // Sort by loop. Use a stable sort so that constants follow non-constants and
+ // pointer operands precede non-pointer operands.
+ llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
+
+ // Emit instructions to add all the operands. Hoist as much as possible
+ // out of loops, and form meaningful getelementptrs where possible.
+ Value *Sum = nullptr;
+ for (auto I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E;) {
+ const Loop *CurLoop = I->first;
+ const SCEV *Op = I->second;
+ if (!Sum) {
+ // This is the first operand. Just expand it.
+ Sum = expand(Op);
+ ++I;
+ } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
+ // The running sum expression is a pointer. Try to form a getelementptr
+ // at this level with that as the base.
+ SmallVector<const SCEV *, 4> NewOps;
+ for (; I != E && I->first == CurLoop; ++I) {
+        // If the operand is a SCEVUnknown of a non-instruction value, peek
+        // through it to enable more of it to be folded into the GEP.
+ const SCEV *X = I->second;
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(X))
+ if (!isa<Instruction>(U->getValue()))
+ X = SE.getSCEV(U->getValue());
+ NewOps.push_back(X);
+ }
+ Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
+ } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
+ // The running sum is an integer, and there's a pointer at this level.
+      // Try to form a getelementptr. If the running sum is an instruction,
+      // wrap it in a SCEVUnknown to avoid re-analyzing it.
+ SmallVector<const SCEV *, 4> NewOps;
+ NewOps.push_back(isa<Instruction>(Sum) ? SE.getUnknown(Sum) :
+ SE.getSCEV(Sum));
+ for (++I; I != E && I->first == CurLoop; ++I)
+ NewOps.push_back(I->second);
+ Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op));
+ } else if (Op->isNonConstantNegative()) {
+ // Instead of doing a negate and add, just do a subtract.
Value *W = expandCodeForImpl(SE.getNegativeSCEV(Op), Ty, false);
- Sum = InsertNoopCastOfTo(Sum, Ty);
- Sum = InsertBinop(Instruction::Sub, Sum, W, SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ true);
- ++I;
- } else {
- // A simple add.
+ Sum = InsertNoopCastOfTo(Sum, Ty);
+ Sum = InsertBinop(Instruction::Sub, Sum, W, SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ true);
+ ++I;
+ } else {
+ // A simple add.
Value *W = expandCodeForImpl(Op, Ty, false);
- Sum = InsertNoopCastOfTo(Sum, Ty);
- // Canonicalize a constant to the RHS.
- if (isa<Constant>(Sum)) std::swap(Sum, W);
- Sum = InsertBinop(Instruction::Add, Sum, W, S->getNoWrapFlags(),
- /*IsSafeToHoist*/ true);
- ++I;
- }
- }
-
- return Sum;
-}
-
-Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
-
- // Collect all the mul operands in a loop, along with their associated loops.
- // Iterate in reverse so that constants are emitted last, all else equal.
- SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
- for (std::reverse_iterator<SCEVMulExpr::op_iterator> I(S->op_end()),
- E(S->op_begin()); I != E; ++I)
- OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
-
- // Sort by loop. Use a stable sort so that constants follow non-constants.
- llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
-
- // Emit instructions to mul all the operands. Hoist as much as possible
- // out of loops.
- Value *Prod = nullptr;
- auto I = OpsAndLoops.begin();
-
- // Expand the calculation of X pow N in the following manner:
- // Let N = P1 + P2 + ... + PK, where all P are powers of 2. Then:
- // X pow N = (X pow P1) * (X pow P2) * ... * (X pow PK).
- const auto ExpandOpBinPowN = [this, &I, &OpsAndLoops, &Ty]() {
- auto E = I;
- // Calculate how many times the same operand from the same loop is included
- // into this power.
- uint64_t Exponent = 0;
- const uint64_t MaxExponent = UINT64_MAX >> 1;
- // No one sane will ever try to calculate such huge exponents, but if we
- // need this, we stop on UINT64_MAX / 2 because we need to exit the loop
- // below when the power of 2 exceeds our Exponent, and we want it to be
- // 1u << 31 at most to not deal with unsigned overflow.
- while (E != OpsAndLoops.end() && *I == *E && Exponent != MaxExponent) {
- ++Exponent;
- ++E;
- }
- assert(Exponent > 0 && "Trying to calculate a zeroth exponent of operand?");
-
-    // Calculate powers with exponents 1, 2, 4, 8, etc., and multiply the ones
-    // that are needed into the result.
+ Sum = InsertNoopCastOfTo(Sum, Ty);
+ // Canonicalize a constant to the RHS.
+ if (isa<Constant>(Sum)) std::swap(Sum, W);
+ Sum = InsertBinop(Instruction::Add, Sum, W, S->getNoWrapFlags(),
+ /*IsSafeToHoist*/ true);
+ ++I;
+ }
+ }
+
+ return Sum;
+}
+
+Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+ // Collect all the mul operands in a loop, along with their associated loops.
+ // Iterate in reverse so that constants are emitted last, all else equal.
+ SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
+ for (std::reverse_iterator<SCEVMulExpr::op_iterator> I(S->op_end()),
+ E(S->op_begin()); I != E; ++I)
+ OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+
+ // Sort by loop. Use a stable sort so that constants follow non-constants.
+ llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
+
+ // Emit instructions to mul all the operands. Hoist as much as possible
+ // out of loops.
+ Value *Prod = nullptr;
+ auto I = OpsAndLoops.begin();
+
+ // Expand the calculation of X pow N in the following manner:
+ // Let N = P1 + P2 + ... + PK, where all P are powers of 2. Then:
+ // X pow N = (X pow P1) * (X pow P2) * ... * (X pow PK).
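+  // For example, for N = 13 = 8 + 4 + 1 this needs 3 squarings (X^2, X^4,
+  // X^8) plus 2 extra multiplies, instead of 12 multiplies for the naive
+  // expansion.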
+ const auto ExpandOpBinPowN = [this, &I, &OpsAndLoops, &Ty]() {
+ auto E = I;
+ // Calculate how many times the same operand from the same loop is included
+ // into this power.
+ uint64_t Exponent = 0;
+ const uint64_t MaxExponent = UINT64_MAX >> 1;
+ // No one sane will ever try to calculate such huge exponents, but if we
+ // need this, we stop on UINT64_MAX / 2 because we need to exit the loop
+ // below when the power of 2 exceeds our Exponent, and we want it to be
+ // 1u << 31 at most to not deal with unsigned overflow.
+ while (E != OpsAndLoops.end() && *I == *E && Exponent != MaxExponent) {
+ ++Exponent;
+ ++E;
+ }
+ assert(Exponent > 0 && "Trying to calculate a zeroth exponent of operand?");
+
+    // Calculate powers with exponents 1, 2, 4, 8, etc., and multiply the ones
+    // that are needed into the result.
Value *P = expandCodeForImpl(I->second, Ty, false);
- Value *Result = nullptr;
- if (Exponent & 1)
- Result = P;
- for (uint64_t BinExp = 2; BinExp <= Exponent; BinExp <<= 1) {
- P = InsertBinop(Instruction::Mul, P, P, SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ true);
- if (Exponent & BinExp)
- Result = Result ? InsertBinop(Instruction::Mul, Result, P,
- SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ true)
- : P;
- }
-
- I = E;
- assert(Result && "Nothing was expanded?");
- return Result;
- };
-
- while (I != OpsAndLoops.end()) {
- if (!Prod) {
- // This is the first operand. Just expand it.
- Prod = ExpandOpBinPowN();
- } else if (I->second->isAllOnesValue()) {
- // Instead of doing a multiply by negative one, just do a negate.
- Prod = InsertNoopCastOfTo(Prod, Ty);
- Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod,
- SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
- ++I;
- } else {
- // A simple mul.
- Value *W = ExpandOpBinPowN();
- Prod = InsertNoopCastOfTo(Prod, Ty);
- // Canonicalize a constant to the RHS.
- if (isa<Constant>(Prod)) std::swap(Prod, W);
- const APInt *RHS;
- if (match(W, m_Power2(RHS))) {
- // Canonicalize Prod*(1<<C) to Prod<<C.
- assert(!Ty->isVectorTy() && "vector types are not SCEVable");
- auto NWFlags = S->getNoWrapFlags();
-        // Clear the nsw flag if the shl would produce a poison value.
- if (RHS->logBase2() == RHS->getBitWidth() - 1)
- NWFlags = ScalarEvolution::clearFlags(NWFlags, SCEV::FlagNSW);
- Prod = InsertBinop(Instruction::Shl, Prod,
- ConstantInt::get(Ty, RHS->logBase2()), NWFlags,
- /*IsSafeToHoist*/ true);
- } else {
- Prod = InsertBinop(Instruction::Mul, Prod, W, S->getNoWrapFlags(),
- /*IsSafeToHoist*/ true);
- }
- }
- }
-
- return Prod;
-}
-
-Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
-
+ Value *Result = nullptr;
+ if (Exponent & 1)
+ Result = P;
+ for (uint64_t BinExp = 2; BinExp <= Exponent; BinExp <<= 1) {
+ P = InsertBinop(Instruction::Mul, P, P, SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ true);
+ if (Exponent & BinExp)
+ Result = Result ? InsertBinop(Instruction::Mul, Result, P,
+ SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ true)
+ : P;
+ }
+
+ I = E;
+ assert(Result && "Nothing was expanded?");
+ return Result;
+ };
+
+ while (I != OpsAndLoops.end()) {
+ if (!Prod) {
+ // This is the first operand. Just expand it.
+ Prod = ExpandOpBinPowN();
+ } else if (I->second->isAllOnesValue()) {
+ // Instead of doing a multiply by negative one, just do a negate.
+ Prod = InsertNoopCastOfTo(Prod, Ty);
+ Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod,
+ SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
+ ++I;
+ } else {
+ // A simple mul.
+ Value *W = ExpandOpBinPowN();
+ Prod = InsertNoopCastOfTo(Prod, Ty);
+ // Canonicalize a constant to the RHS.
+ if (isa<Constant>(Prod)) std::swap(Prod, W);
+ const APInt *RHS;
+ if (match(W, m_Power2(RHS))) {
+ // Canonicalize Prod*(1<<C) to Prod<<C.
+ assert(!Ty->isVectorTy() && "vector types are not SCEVable");
+ auto NWFlags = S->getNoWrapFlags();
+        // Clear the nsw flag if the shl would produce a poison value.
+ if (RHS->logBase2() == RHS->getBitWidth() - 1)
+ NWFlags = ScalarEvolution::clearFlags(NWFlags, SCEV::FlagNSW);
+ Prod = InsertBinop(Instruction::Shl, Prod,
+ ConstantInt::get(Ty, RHS->logBase2()), NWFlags,
+ /*IsSafeToHoist*/ true);
+ } else {
+ Prod = InsertBinop(Instruction::Mul, Prod, W, S->getNoWrapFlags(),
+ /*IsSafeToHoist*/ true);
+ }
+ }
+ }
+
+ return Prod;
+}
+
+Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
Value *LHS = expandCodeForImpl(S->getLHS(), Ty, false);
- if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) {
- const APInt &RHS = SC->getAPInt();
- if (RHS.isPowerOf2())
- return InsertBinop(Instruction::LShr, LHS,
- ConstantInt::get(Ty, RHS.logBase2()),
- SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
- }
-
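+  // If the divisor is a constant power of two, an unsigned divide is
+  // equivalent to a logical shift right, e.g. x /u 8 == x >> 3.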
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) {
+ const APInt &RHS = SC->getAPInt();
+ if (RHS.isPowerOf2())
+ return InsertBinop(Instruction::LShr, LHS,
+ ConstantInt::get(Ty, RHS.logBase2()),
+ SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
+ }
+
Value *RHS = expandCodeForImpl(S->getRHS(), Ty, false);
- return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS()));
-}
-
-/// Move parts of Base into Rest to leave Base with the minimal
-/// expression that provides a pointer operand suitable for a
-/// GEP expansion.
-static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
- ScalarEvolution &SE) {
- while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
- Base = A->getStart();
- Rest = SE.getAddExpr(Rest,
- SE.getAddRecExpr(SE.getConstant(A->getType(), 0),
- A->getStepRecurrence(SE),
- A->getLoop(),
- A->getNoWrapFlags(SCEV::FlagNW)));
- }
- if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
- Base = A->getOperand(A->getNumOperands()-1);
+ return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS()));
+}
+
+/// Move parts of Base into Rest to leave Base with the minimal
+/// expression that provides a pointer operand suitable for a
+/// GEP expansion.
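+/// For example, given Base = {%p,+,4} and Rest = 0, this produces Base = %p
+/// and Rest = {0,+,4}, leaving %p available as a GEP base pointer (%p is a
+/// placeholder name).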
+static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
+ ScalarEvolution &SE) {
+ while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
+ Base = A->getStart();
+ Rest = SE.getAddExpr(Rest,
+ SE.getAddRecExpr(SE.getConstant(A->getType(), 0),
+ A->getStepRecurrence(SE),
+ A->getLoop(),
+ A->getNoWrapFlags(SCEV::FlagNW)));
+ }
+ if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
+ Base = A->getOperand(A->getNumOperands()-1);
SmallVector<const SCEV *, 8> NewAddOps(A->operands());
- NewAddOps.back() = Rest;
- Rest = SE.getAddExpr(NewAddOps);
- ExposePointerBase(Base, Rest, SE);
- }
-}
-
-/// Determine if this is a well-behaved chain of instructions leading back to
-/// the PHI. If so, it may be reused by expanded expressions.
-bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
- const Loop *L) {
- if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) ||
- (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV)))
- return false;
- // If any of the operands don't dominate the insert position, bail.
- // Addrec operands are always loop-invariant, so this can only happen
- // if there are instructions which haven't been hoisted.
- if (L == IVIncInsertLoop) {
- for (User::op_iterator OI = IncV->op_begin()+1,
- OE = IncV->op_end(); OI != OE; ++OI)
- if (Instruction *OInst = dyn_cast<Instruction>(OI))
- if (!SE.DT.dominates(OInst, IVIncInsertPos))
- return false;
- }
- // Advance to the next instruction.
- IncV = dyn_cast<Instruction>(IncV->getOperand(0));
- if (!IncV)
- return false;
-
- if (IncV->mayHaveSideEffects())
- return false;
-
- if (IncV == PN)
- return true;
-
- return isNormalAddRecExprPHI(PN, IncV, L);
-}
-
-/// getIVIncOperand returns an induction variable increment's induction
-/// variable operand.
-///
-/// If allowScale is set, any type of GEP is allowed as long as the nonIV
-/// operands dominate InsertPos.
-///
-/// If allowScale is not set, ensure that a GEP increment conforms to one of the
-/// simple patterns generated by getAddRecExprPHILiterally and
-/// expandAddToGEP. If the pattern isn't recognized, return NULL.
-Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV,
- Instruction *InsertPos,
- bool allowScale) {
- if (IncV == InsertPos)
- return nullptr;
-
- switch (IncV->getOpcode()) {
- default:
- return nullptr;
- // Check for a simple Add/Sub or GEP of a loop invariant step.
- case Instruction::Add:
- case Instruction::Sub: {
- Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1));
- if (!OInst || SE.DT.dominates(OInst, InsertPos))
- return dyn_cast<Instruction>(IncV->getOperand(0));
- return nullptr;
- }
- case Instruction::BitCast:
- return dyn_cast<Instruction>(IncV->getOperand(0));
- case Instruction::GetElementPtr:
- for (auto I = IncV->op_begin() + 1, E = IncV->op_end(); I != E; ++I) {
- if (isa<Constant>(*I))
- continue;
- if (Instruction *OInst = dyn_cast<Instruction>(*I)) {
- if (!SE.DT.dominates(OInst, InsertPos))
- return nullptr;
- }
- if (allowScale) {
- // allow any kind of GEP as long as it can be hoisted.
- continue;
- }
- // This must be a pointer addition of constants (pretty), which is already
- // handled, or some number of address-size elements (ugly). Ugly geps
- // have 2 operands. i1* is used by the expander to represent an
- // address-size element.
- if (IncV->getNumOperands() != 2)
- return nullptr;
- unsigned AS = cast<PointerType>(IncV->getType())->getAddressSpace();
- if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS)
- && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS))
- return nullptr;
- break;
- }
- return dyn_cast<Instruction>(IncV->getOperand(0));
- }
-}
-
-/// If the insert point of the current builder or any of the builders on the
-/// stack of saved builders has 'I' as its insert point, update it to point to
-/// the instruction after 'I'. This is intended to be used when the instruction
-/// 'I' is being moved. If this fixup is not done and 'I' is moved to a
-/// different block, the inconsistent insert point (with a mismatched
-/// Instruction and Block) can lead to an instruction being inserted in a block
-/// other than its parent.
-void SCEVExpander::fixupInsertPoints(Instruction *I) {
- BasicBlock::iterator It(*I);
- BasicBlock::iterator NewInsertPt = std::next(It);
- if (Builder.GetInsertPoint() == It)
- Builder.SetInsertPoint(&*NewInsertPt);
- for (auto *InsertPtGuard : InsertPointGuards)
- if (InsertPtGuard->GetInsertPoint() == It)
- InsertPtGuard->SetInsertPoint(NewInsertPt);
-}
-
-/// hoistIVInc - Attempt to hoist a simple IV increment above InsertPos to make
-/// it available to other uses in this loop. Recursively hoist any operands,
-/// until we reach a value that dominates InsertPos.
-bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
- if (SE.DT.dominates(IncV, InsertPos))
- return true;
-
- // InsertPos must itself dominate IncV so that IncV's new position satisfies
- // its existing users.
- if (isa<PHINode>(InsertPos) ||
- !SE.DT.dominates(InsertPos->getParent(), IncV->getParent()))
- return false;
-
- if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos))
- return false;
-
- // Check that the chain of IV operands leading back to Phi can be hoisted.
- SmallVector<Instruction*, 4> IVIncs;
- for(;;) {
- Instruction *Oper = getIVIncOperand(IncV, InsertPos, /*allowScale*/true);
- if (!Oper)
- return false;
- // IncV is safe to hoist.
- IVIncs.push_back(IncV);
- IncV = Oper;
- if (SE.DT.dominates(IncV, InsertPos))
- break;
- }
- for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) {
- fixupInsertPoints(*I);
- (*I)->moveBefore(InsertPos);
- }
- return true;
-}
-
-/// Determine if this cyclic phi is in a form that would have been generated by
-/// LSR. We don't care if the phi was actually expanded in this pass, as long
-/// as it is in a low-cost form, for example, no implied multiplication. This
-/// should match any patterns generated by getAddRecExprPHILiterally and
-/// expandAddToGEP.
-bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV,
- const Loop *L) {
- for(Instruction *IVOper = IncV;
- (IVOper = getIVIncOperand(IVOper, L->getLoopPreheader()->getTerminator(),
- /*allowScale=*/false));) {
- if (IVOper == PN)
- return true;
- }
- return false;
-}
-
-/// expandIVInc - Expand an IV increment at Builder's current InsertPos.
-/// Typically this is the LatchBlock terminator or IVIncInsertPos, but we may
-/// need to materialize IV increments elsewhere to handle difficult situations.
-Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
- Type *ExpandTy, Type *IntTy,
- bool useSubtract) {
- Value *IncV;
- // If the PHI is a pointer, use a GEP, otherwise use an add or sub.
- if (ExpandTy->isPointerTy()) {
- PointerType *GEPPtrTy = cast<PointerType>(ExpandTy);
- // If the step isn't constant, don't use an implicitly scaled GEP, because
- // that would require a multiply inside the loop.
- if (!isa<ConstantInt>(StepV))
- GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()),
- GEPPtrTy->getAddressSpace());
- IncV = expandAddToGEP(SE.getSCEV(StepV), GEPPtrTy, IntTy, PN);
+ NewAddOps.back() = Rest;
+ Rest = SE.getAddExpr(NewAddOps);
+ ExposePointerBase(Base, Rest, SE);
+ }
+}
+
+/// Determine if this is a well-behaved chain of instructions leading back to
+/// the PHI. If so, it may be reused by expanded expressions.
+bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
+ const Loop *L) {
+ if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) ||
+ (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV)))
+ return false;
+ // If any of the operands don't dominate the insert position, bail.
+ // Addrec operands are always loop-invariant, so this can only happen
+ // if there are instructions which haven't been hoisted.
+ if (L == IVIncInsertLoop) {
+ for (User::op_iterator OI = IncV->op_begin()+1,
+ OE = IncV->op_end(); OI != OE; ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(OI))
+ if (!SE.DT.dominates(OInst, IVIncInsertPos))
+ return false;
+ }
+ // Advance to the next instruction.
+ IncV = dyn_cast<Instruction>(IncV->getOperand(0));
+ if (!IncV)
+ return false;
+
+ if (IncV->mayHaveSideEffects())
+ return false;
+
+ if (IncV == PN)
+ return true;
+
+ return isNormalAddRecExprPHI(PN, IncV, L);
+}
+
+/// getIVIncOperand returns an induction variable increment's induction
+/// variable operand.
+///
+/// If allowScale is set, any type of GEP is allowed as long as the nonIV
+/// operands dominate InsertPos.
+///
+/// If allowScale is not set, ensure that a GEP increment conforms to one of the
+/// simple patterns generated by getAddRecExprPHILiterally and
+/// expandAddToGEP. If the pattern isn't recognized, return NULL.
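+///
+/// Recognized simple forms include increments such as
+///   %iv.next = add %iv, %step
+///   %iv.next = getelementptr i8, i8* %iv, %step
+/// (value names here are illustrative only).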
+Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV,
+ Instruction *InsertPos,
+ bool allowScale) {
+ if (IncV == InsertPos)
+ return nullptr;
+
+ switch (IncV->getOpcode()) {
+ default:
+ return nullptr;
+ // Check for a simple Add/Sub or GEP of a loop invariant step.
+ case Instruction::Add:
+ case Instruction::Sub: {
+ Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1));
+ if (!OInst || SE.DT.dominates(OInst, InsertPos))
+ return dyn_cast<Instruction>(IncV->getOperand(0));
+ return nullptr;
+ }
+ case Instruction::BitCast:
+ return dyn_cast<Instruction>(IncV->getOperand(0));
+ case Instruction::GetElementPtr:
+ for (auto I = IncV->op_begin() + 1, E = IncV->op_end(); I != E; ++I) {
+ if (isa<Constant>(*I))
+ continue;
+ if (Instruction *OInst = dyn_cast<Instruction>(*I)) {
+ if (!SE.DT.dominates(OInst, InsertPos))
+ return nullptr;
+ }
+ if (allowScale) {
+ // allow any kind of GEP as long as it can be hoisted.
+ continue;
+ }
+ // This must be a pointer addition of constants (pretty), which is already
+ // handled, or some number of address-size elements (ugly). Ugly geps
+ // have 2 operands. i1* is used by the expander to represent an
+ // address-size element.
+ if (IncV->getNumOperands() != 2)
+ return nullptr;
+ unsigned AS = cast<PointerType>(IncV->getType())->getAddressSpace();
+ if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS)
+ && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS))
+ return nullptr;
+ break;
+ }
+ return dyn_cast<Instruction>(IncV->getOperand(0));
+ }
+}
+
+/// If the insert point of the current builder or any of the builders on the
+/// stack of saved builders has 'I' as its insert point, update it to point to
+/// the instruction after 'I'. This is intended to be used when the instruction
+/// 'I' is being moved. If this fixup is not done and 'I' is moved to a
+/// different block, the inconsistent insert point (with a mismatched
+/// Instruction and Block) can lead to an instruction being inserted in a block
+/// other than its parent.
+void SCEVExpander::fixupInsertPoints(Instruction *I) {
+ BasicBlock::iterator It(*I);
+ BasicBlock::iterator NewInsertPt = std::next(It);
+ if (Builder.GetInsertPoint() == It)
+ Builder.SetInsertPoint(&*NewInsertPt);
+ for (auto *InsertPtGuard : InsertPointGuards)
+ if (InsertPtGuard->GetInsertPoint() == It)
+ InsertPtGuard->SetInsertPoint(NewInsertPt);
+}
+
+/// hoistIVInc - Attempt to hoist a simple IV increment above InsertPos to make
+/// it available to other uses in this loop. Recursively hoist any operands,
+/// until we reach a value that dominates InsertPos.
+bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
+ if (SE.DT.dominates(IncV, InsertPos))
+ return true;
+
+ // InsertPos must itself dominate IncV so that IncV's new position satisfies
+ // its existing users.
+ if (isa<PHINode>(InsertPos) ||
+ !SE.DT.dominates(InsertPos->getParent(), IncV->getParent()))
+ return false;
+
+ if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos))
+ return false;
+
+ // Check that the chain of IV operands leading back to Phi can be hoisted.
+ SmallVector<Instruction*, 4> IVIncs;
+ for(;;) {
+ Instruction *Oper = getIVIncOperand(IncV, InsertPos, /*allowScale*/true);
+ if (!Oper)
+ return false;
+ // IncV is safe to hoist.
+ IVIncs.push_back(IncV);
+ IncV = Oper;
+ if (SE.DT.dominates(IncV, InsertPos))
+ break;
+ }
+ for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) {
+ fixupInsertPoints(*I);
+ (*I)->moveBefore(InsertPos);
+ }
+ return true;
+}
+
+/// Determine if this cyclic phi is in a form that would have been generated by
+/// LSR. We don't care if the phi was actually expanded in this pass, as long
+/// as it is in a low-cost form, for example, no implied multiplication. This
+/// should match any patterns generated by getAddRecExprPHILiterally and
+/// expandAddToGEP.
+bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV,
+ const Loop *L) {
+ for(Instruction *IVOper = IncV;
+ (IVOper = getIVIncOperand(IVOper, L->getLoopPreheader()->getTerminator(),
+ /*allowScale=*/false));) {
+ if (IVOper == PN)
+ return true;
+ }
+ return false;
+}
+
+/// expandIVInc - Expand an IV increment at Builder's current InsertPos.
+/// Typically this is the LatchBlock terminator or IVIncInsertPos, but we may
+/// need to materialize IV increments elsewhere to handle difficult situations.
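+///
+/// For an integer IV this emits, e.g., %x.iv.next = add %x.iv, %step (or a
+/// sub when useSubtract is set); for a pointer IV it emits a getelementptr
+/// instead. The value names are illustrative.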
+Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
+ Type *ExpandTy, Type *IntTy,
+ bool useSubtract) {
+ Value *IncV;
+ // If the PHI is a pointer, use a GEP, otherwise use an add or sub.
+ if (ExpandTy->isPointerTy()) {
+ PointerType *GEPPtrTy = cast<PointerType>(ExpandTy);
+ // If the step isn't constant, don't use an implicitly scaled GEP, because
+ // that would require a multiply inside the loop.
+ if (!isa<ConstantInt>(StepV))
+ GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()),
+ GEPPtrTy->getAddressSpace());
+ IncV = expandAddToGEP(SE.getSCEV(StepV), GEPPtrTy, IntTy, PN);
if (IncV->getType() != PN->getType())
- IncV = Builder.CreateBitCast(IncV, PN->getType());
- } else {
- IncV = useSubtract ?
- Builder.CreateSub(PN, StepV, Twine(IVName) + ".iv.next") :
- Builder.CreateAdd(PN, StepV, Twine(IVName) + ".iv.next");
- }
- return IncV;
-}
-
-/// Hoist the addrec instruction chain rooted in the loop phi above the
-/// position. This routine assumes that this is possible (has been checked).
-void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
- Instruction *Pos, PHINode *LoopPhi) {
- do {
- if (DT->dominates(InstToHoist, Pos))
- break;
- // Make sure the increment is where we want it. But don't move it
- // down past a potential existing post-inc user.
- fixupInsertPoints(InstToHoist);
- InstToHoist->moveBefore(Pos);
- Pos = InstToHoist;
- InstToHoist = cast<Instruction>(InstToHoist->getOperand(0));
- } while (InstToHoist != LoopPhi);
-}
-
-/// Check whether we can cheaply express the requested SCEV in terms of
-/// the available PHI SCEV by truncation and/or inversion of the step.
-static bool canBeCheaplyTransformed(ScalarEvolution &SE,
- const SCEVAddRecExpr *Phi,
- const SCEVAddRecExpr *Requested,
- bool &InvertStep) {
- Type *PhiTy = SE.getEffectiveSCEVType(Phi->getType());
- Type *RequestedTy = SE.getEffectiveSCEVType(Requested->getType());
-
- if (RequestedTy->getIntegerBitWidth() > PhiTy->getIntegerBitWidth())
- return false;
-
-  // Try to truncate it if necessary.
- Phi = dyn_cast<SCEVAddRecExpr>(SE.getTruncateOrNoop(Phi, RequestedTy));
- if (!Phi)
- return false;
-
- // Check whether truncation will help.
- if (Phi == Requested) {
- InvertStep = false;
- return true;
- }
-
- // Check whether inverting will help: {R,+,-1} == R - {0,+,1}.
- if (SE.getAddExpr(Requested->getStart(),
- SE.getNegativeSCEV(Requested)) == Phi) {
- InvertStep = true;
- return true;
- }
-
- return false;
-}
-
-static bool IsIncrementNSW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
- if (!isa<IntegerType>(AR->getType()))
- return false;
-
- unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
- Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
- const SCEV *Step = AR->getStepRecurrence(SE);
- const SCEV *OpAfterExtend = SE.getAddExpr(SE.getSignExtendExpr(Step, WideTy),
- SE.getSignExtendExpr(AR, WideTy));
- const SCEV *ExtendAfterOp =
- SE.getSignExtendExpr(SE.getAddExpr(AR, Step), WideTy);
- return ExtendAfterOp == OpAfterExtend;
-}
-
-static bool IsIncrementNUW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
- if (!isa<IntegerType>(AR->getType()))
- return false;
-
- unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
- Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
- const SCEV *Step = AR->getStepRecurrence(SE);
- const SCEV *OpAfterExtend = SE.getAddExpr(SE.getZeroExtendExpr(Step, WideTy),
- SE.getZeroExtendExpr(AR, WideTy));
- const SCEV *ExtendAfterOp =
- SE.getZeroExtendExpr(SE.getAddExpr(AR, Step), WideTy);
- return ExtendAfterOp == OpAfterExtend;
-}
-
-/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand
-/// the base addrec, which is the addrec without any non-loop-dominating
-/// values, and return the PHI.
-PHINode *
-SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
- const Loop *L,
- Type *ExpandTy,
- Type *IntTy,
- Type *&TruncTy,
- bool &InvertStep) {
- assert((!IVIncInsertLoop||IVIncInsertPos) && "Uninitialized insert position");
-
- // Reuse a previously-inserted PHI, if present.
- BasicBlock *LatchBlock = L->getLoopLatch();
- if (LatchBlock) {
- PHINode *AddRecPhiMatch = nullptr;
- Instruction *IncV = nullptr;
- TruncTy = nullptr;
- InvertStep = false;
-
- // Only try partially matching scevs that need truncation and/or
- // step-inversion if we know this loop is outside the current loop.
- bool TryNonMatchingSCEV =
- IVIncInsertLoop &&
- SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader());
-
- for (PHINode &PN : L->getHeader()->phis()) {
- if (!SE.isSCEVable(PN.getType()))
- continue;
-
+ IncV = Builder.CreateBitCast(IncV, PN->getType());
+ } else {
+ IncV = useSubtract ?
+ Builder.CreateSub(PN, StepV, Twine(IVName) + ".iv.next") :
+ Builder.CreateAdd(PN, StepV, Twine(IVName) + ".iv.next");
+ }
+ return IncV;
+}
+
+/// Hoist the addrec instruction chain rooted in the loop phi above the
+/// position. This routine assumes that this is possible (has been checked).
+void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
+ Instruction *Pos, PHINode *LoopPhi) {
+ do {
+ if (DT->dominates(InstToHoist, Pos))
+ break;
+ // Make sure the increment is where we want it. But don't move it
+ // down past a potential existing post-inc user.
+ fixupInsertPoints(InstToHoist);
+ InstToHoist->moveBefore(Pos);
+ Pos = InstToHoist;
+ InstToHoist = cast<Instruction>(InstToHoist->getOperand(0));
+ } while (InstToHoist != LoopPhi);
+}
+
+/// Check whether we can cheaply express the requested SCEV in terms of
+/// the available PHI SCEV by truncation and/or inversion of the step.
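+/// For example, if the loop already has an i64 PHI for {0,+,1} and the
+/// requested expression is the i32 {0,+,1}, a single truncate of the PHI is
+/// enough, so this returns true with InvertStep == false.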
+static bool canBeCheaplyTransformed(ScalarEvolution &SE,
+ const SCEVAddRecExpr *Phi,
+ const SCEVAddRecExpr *Requested,
+ bool &InvertStep) {
+ Type *PhiTy = SE.getEffectiveSCEVType(Phi->getType());
+ Type *RequestedTy = SE.getEffectiveSCEVType(Requested->getType());
+
+ if (RequestedTy->getIntegerBitWidth() > PhiTy->getIntegerBitWidth())
+ return false;
+
+  // Try to truncate it if necessary.
+ Phi = dyn_cast<SCEVAddRecExpr>(SE.getTruncateOrNoop(Phi, RequestedTy));
+ if (!Phi)
+ return false;
+
+ // Check whether truncation will help.
+ if (Phi == Requested) {
+ InvertStep = false;
+ return true;
+ }
+
+ // Check whether inverting will help: {R,+,-1} == R - {0,+,1}.
+ if (SE.getAddExpr(Requested->getStart(),
+ SE.getNegativeSCEV(Requested)) == Phi) {
+ InvertStep = true;
+ return true;
+ }
+
+ return false;
+}
+
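+/// Return true if the increment of this addrec is known not to wrap in the
+/// signed sense: sign-extending after adding the step gives the same result
+/// as adding the sign-extended operands in a type twice as wide.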
+static bool IsIncrementNSW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
+ if (!isa<IntegerType>(AR->getType()))
+ return false;
+
+ unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
+ Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ const SCEV *OpAfterExtend = SE.getAddExpr(SE.getSignExtendExpr(Step, WideTy),
+ SE.getSignExtendExpr(AR, WideTy));
+ const SCEV *ExtendAfterOp =
+ SE.getSignExtendExpr(SE.getAddExpr(AR, Step), WideTy);
+ return ExtendAfterOp == OpAfterExtend;
+}
+
+static bool IsIncrementNUW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
+ if (!isa<IntegerType>(AR->getType()))
+ return false;
+
+ unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
+ Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ const SCEV *OpAfterExtend = SE.getAddExpr(SE.getZeroExtendExpr(Step, WideTy),
+ SE.getZeroExtendExpr(AR, WideTy));
+ const SCEV *ExtendAfterOp =
+ SE.getZeroExtendExpr(SE.getAddExpr(AR, Step), WideTy);
+ return ExtendAfterOp == OpAfterExtend;
+}
+
+/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand
+/// the base addrec, which is the addrec without any non-loop-dominating
+/// values, and return the PHI.
+PHINode *
+SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
+ const Loop *L,
+ Type *ExpandTy,
+ Type *IntTy,
+ Type *&TruncTy,
+ bool &InvertStep) {
+ assert((!IVIncInsertLoop||IVIncInsertPos) && "Uninitialized insert position");
+
+ // Reuse a previously-inserted PHI, if present.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ if (LatchBlock) {
+ PHINode *AddRecPhiMatch = nullptr;
+ Instruction *IncV = nullptr;
+ TruncTy = nullptr;
+ InvertStep = false;
+
+ // Only try partially matching scevs that need truncation and/or
+ // step-inversion if we know this loop is outside the current loop.
+ bool TryNonMatchingSCEV =
+ IVIncInsertLoop &&
+ SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader());
+
+ for (PHINode &PN : L->getHeader()->phis()) {
+ if (!SE.isSCEVable(PN.getType()))
+ continue;
+
      // We should not look for an incomplete PHI. Getting SCEV for an incomplete
// PHI has no meaning at all.
if (!PN.isComplete()) {
@@ -1214,232 +1214,232 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
continue;
}
- const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
- if (!PhiSCEV)
- continue;
-
- bool IsMatchingSCEV = PhiSCEV == Normalized;
- // We only handle truncation and inversion of phi recurrences for the
- // expanded expression if the expanded expression's loop dominates the
- // loop we insert to. Check now, so we can bail out early.
- if (!IsMatchingSCEV && !TryNonMatchingSCEV)
- continue;
-
- // TODO: this possibly can be reworked to avoid this cast at all.
- Instruction *TempIncV =
- dyn_cast<Instruction>(PN.getIncomingValueForBlock(LatchBlock));
- if (!TempIncV)
- continue;
-
- // Check whether we can reuse this PHI node.
- if (LSRMode) {
- if (!isExpandedAddRecExprPHI(&PN, TempIncV, L))
- continue;
- if (L == IVIncInsertLoop && !hoistIVInc(TempIncV, IVIncInsertPos))
- continue;
- } else {
- if (!isNormalAddRecExprPHI(&PN, TempIncV, L))
- continue;
- }
-
- // Stop if we have found an exact match SCEV.
- if (IsMatchingSCEV) {
- IncV = TempIncV;
- TruncTy = nullptr;
- InvertStep = false;
- AddRecPhiMatch = &PN;
- break;
- }
-
- // Try whether the phi can be translated into the requested form
- // (truncated and/or offset by a constant).
- if ((!TruncTy || InvertStep) &&
- canBeCheaplyTransformed(SE, PhiSCEV, Normalized, InvertStep)) {
-        // Record the phi node. But don't stop; we might find an exact match
-        // later.
- AddRecPhiMatch = &PN;
- IncV = TempIncV;
- TruncTy = SE.getEffectiveSCEVType(Normalized->getType());
- }
- }
-
- if (AddRecPhiMatch) {
- // Potentially, move the increment. We have made sure in
- // isExpandedAddRecExprPHI or hoistIVInc that this is possible.
- if (L == IVIncInsertLoop)
- hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch);
-
- // Ok, the add recurrence looks usable.
- // Remember this PHI, even in post-inc mode.
- InsertedValues.insert(AddRecPhiMatch);
- // Remember the increment.
- rememberInstruction(IncV);
+ const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
+ if (!PhiSCEV)
+ continue;
+
+ bool IsMatchingSCEV = PhiSCEV == Normalized;
+ // We only handle truncation and inversion of phi recurrences for the
+ // expanded expression if the expanded expression's loop dominates the
+ // loop we insert to. Check now, so we can bail out early.
+ if (!IsMatchingSCEV && !TryNonMatchingSCEV)
+ continue;
+
+ // TODO: this possibly can be reworked to avoid this cast at all.
+ Instruction *TempIncV =
+ dyn_cast<Instruction>(PN.getIncomingValueForBlock(LatchBlock));
+ if (!TempIncV)
+ continue;
+
+ // Check whether we can reuse this PHI node.
+ if (LSRMode) {
+ if (!isExpandedAddRecExprPHI(&PN, TempIncV, L))
+ continue;
+ if (L == IVIncInsertLoop && !hoistIVInc(TempIncV, IVIncInsertPos))
+ continue;
+ } else {
+ if (!isNormalAddRecExprPHI(&PN, TempIncV, L))
+ continue;
+ }
+
+ // Stop if we have found an exact match SCEV.
+ if (IsMatchingSCEV) {
+ IncV = TempIncV;
+ TruncTy = nullptr;
+ InvertStep = false;
+ AddRecPhiMatch = &PN;
+ break;
+ }
+
+ // Try whether the phi can be translated into the requested form
+ // (truncated and/or offset by a constant).
+ if ((!TruncTy || InvertStep) &&
+ canBeCheaplyTransformed(SE, PhiSCEV, Normalized, InvertStep)) {
+        // Record the phi node. But don't stop; we might find an exact match
+        // later.
+ AddRecPhiMatch = &PN;
+ IncV = TempIncV;
+ TruncTy = SE.getEffectiveSCEVType(Normalized->getType());
+ }
+ }
+
+ if (AddRecPhiMatch) {
+ // Potentially, move the increment. We have made sure in
+ // isExpandedAddRecExprPHI or hoistIVInc that this is possible.
+ if (L == IVIncInsertLoop)
+ hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch);
+
+ // Ok, the add recurrence looks usable.
+ // Remember this PHI, even in post-inc mode.
+ InsertedValues.insert(AddRecPhiMatch);
+ // Remember the increment.
+ rememberInstruction(IncV);
// Those values were not actually inserted but re-used.
ReusedValues.insert(AddRecPhiMatch);
ReusedValues.insert(IncV);
- return AddRecPhiMatch;
- }
- }
-
- // Save the original insertion point so we can restore it when we're done.
- SCEVInsertPointGuard Guard(Builder, this);
-
- // Another AddRec may need to be recursively expanded below. For example, if
- // this AddRec is quadratic, the StepV may itself be an AddRec in this
- // loop. Remove this loop from the PostIncLoops set before expanding such
- // AddRecs. Otherwise, we cannot find a valid position for the step
- // (i.e. StepV can never dominate its loop header). Ideally, we could do
- // SavedIncLoops.swap(PostIncLoops), but we generally have a single element,
- // so it's not worth implementing SmallPtrSet::swap.
- PostIncLoopSet SavedPostIncLoops = PostIncLoops;
- PostIncLoops.clear();
-
- // Expand code for the start value into the loop preheader.
- assert(L->getLoopPreheader() &&
- "Can't expand add recurrences without a loop preheader!");
+ return AddRecPhiMatch;
+ }
+ }
+
+ // Save the original insertion point so we can restore it when we're done.
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ // Another AddRec may need to be recursively expanded below. For example, if
+ // this AddRec is quadratic, the StepV may itself be an AddRec in this
+ // loop. Remove this loop from the PostIncLoops set before expanding such
+ // AddRecs. Otherwise, we cannot find a valid position for the step
+ // (i.e. StepV can never dominate its loop header). Ideally, we could do
+ // SavedIncLoops.swap(PostIncLoops), but we generally have a single element,
+ // so it's not worth implementing SmallPtrSet::swap.
+ PostIncLoopSet SavedPostIncLoops = PostIncLoops;
+ PostIncLoops.clear();
+
+ // Expand code for the start value into the loop preheader.
+ assert(L->getLoopPreheader() &&
+ "Can't expand add recurrences without a loop preheader!");
Value *StartV =
expandCodeForImpl(Normalized->getStart(), ExpandTy,
L->getLoopPreheader()->getTerminator(), false);
-
-  // StartV must have been inserted into L's preheader to dominate the new
- // phi.
- assert(!isa<Instruction>(StartV) ||
- SE.DT.properlyDominates(cast<Instruction>(StartV)->getParent(),
- L->getHeader()));
-
- // Expand code for the step value. Do this before creating the PHI so that PHI
- // reuse code doesn't see an incomplete PHI.
- const SCEV *Step = Normalized->getStepRecurrence(SE);
- // If the stride is negative, insert a sub instead of an add for the increment
- // (unless it's a constant, because subtracts of constants are canonicalized
- // to adds).
- bool useSubtract = !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
- if (useSubtract)
- Step = SE.getNegativeSCEV(Step);
- // Expand the step somewhere that dominates the loop header.
+
+ // StartV must have been inserted into L's preheader to dominate the new
+ // phi.
+ assert(!isa<Instruction>(StartV) ||
+ SE.DT.properlyDominates(cast<Instruction>(StartV)->getParent(),
+ L->getHeader()));
+
+ // Expand code for the step value. Do this before creating the PHI so that PHI
+ // reuse code doesn't see an incomplete PHI.
+ const SCEV *Step = Normalized->getStepRecurrence(SE);
+ // If the stride is negative, insert a sub instead of an add for the increment
+ // (unless it's a constant, because subtracts of constants are canonicalized
+ // to adds).
+ bool useSubtract = !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
+ if (useSubtract)
+ Step = SE.getNegativeSCEV(Step);
+ // Expand the step somewhere that dominates the loop header.
Value *StepV = expandCodeForImpl(
Step, IntTy, &*L->getHeader()->getFirstInsertionPt(), false);
-
- // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if
- // we actually do emit an addition. It does not apply if we emit a
- // subtraction.
- bool IncrementIsNUW = !useSubtract && IsIncrementNUW(SE, Normalized);
- bool IncrementIsNSW = !useSubtract && IsIncrementNSW(SE, Normalized);
-
- // Create the PHI.
- BasicBlock *Header = L->getHeader();
- Builder.SetInsertPoint(Header, Header->begin());
- pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
- PHINode *PN = Builder.CreatePHI(ExpandTy, std::distance(HPB, HPE),
- Twine(IVName) + ".iv");
-
- // Create the step instructions and populate the PHI.
- for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
- BasicBlock *Pred = *HPI;
-
- // Add a start value.
- if (!L->contains(Pred)) {
- PN->addIncoming(StartV, Pred);
- continue;
- }
-
- // Create a step value and add it to the PHI.
- // If IVIncInsertLoop is non-null and equal to the addrec's loop, insert the
- // instructions at IVIncInsertPos.
- Instruction *InsertPos = L == IVIncInsertLoop ?
- IVIncInsertPos : Pred->getTerminator();
- Builder.SetInsertPoint(InsertPos);
- Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
-
- if (isa<OverflowingBinaryOperator>(IncV)) {
- if (IncrementIsNUW)
- cast<BinaryOperator>(IncV)->setHasNoUnsignedWrap();
- if (IncrementIsNSW)
- cast<BinaryOperator>(IncV)->setHasNoSignedWrap();
- }
- PN->addIncoming(IncV, Pred);
- }
-
- // After expanding subexpressions, restore the PostIncLoops set so the caller
- // can ensure that IVIncrement dominates the current uses.
- PostIncLoops = SavedPostIncLoops;
-
- // Remember this PHI, even in post-inc mode.
- InsertedValues.insert(PN);
-
- return PN;
-}
-
-Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
- Type *STy = S->getType();
- Type *IntTy = SE.getEffectiveSCEVType(STy);
- const Loop *L = S->getLoop();
-
- // Determine a normalized form of this expression, which is the expression
- // before any post-inc adjustment is made.
- const SCEVAddRecExpr *Normalized = S;
- if (PostIncLoops.count(L)) {
- PostIncLoopSet Loops;
- Loops.insert(L);
- Normalized = cast<SCEVAddRecExpr>(normalizeForPostIncUse(S, Loops, SE));
- }
-
- // Strip off any non-loop-dominating component from the addrec start.
- const SCEV *Start = Normalized->getStart();
- const SCEV *PostLoopOffset = nullptr;
- if (!SE.properlyDominates(Start, L->getHeader())) {
- PostLoopOffset = Start;
- Start = SE.getConstant(Normalized->getType(), 0);
- Normalized = cast<SCEVAddRecExpr>(
- SE.getAddRecExpr(Start, Normalized->getStepRecurrence(SE),
- Normalized->getLoop(),
- Normalized->getNoWrapFlags(SCEV::FlagNW)));
- }
-
- // Strip off any non-loop-dominating component from the addrec step.
- const SCEV *Step = Normalized->getStepRecurrence(SE);
- const SCEV *PostLoopScale = nullptr;
- if (!SE.dominates(Step, L->getHeader())) {
- PostLoopScale = Step;
- Step = SE.getConstant(Normalized->getType(), 1);
- if (!Start->isZero()) {
- // The normalization below assumes that Start is constant zero, so if
- // it isn't, re-associate Start to PostLoopOffset.
- assert(!PostLoopOffset && "Start not-null but PostLoopOffset set?");
- PostLoopOffset = Start;
- Start = SE.getConstant(Normalized->getType(), 0);
- }
- Normalized =
- cast<SCEVAddRecExpr>(SE.getAddRecExpr(
- Start, Step, Normalized->getLoop(),
- Normalized->getNoWrapFlags(SCEV::FlagNW)));
- }
-
- // Expand the core addrec. If we need post-loop scaling, force it to
- // expand to an integer type to avoid the need for additional casting.
- Type *ExpandTy = PostLoopScale ? IntTy : STy;
- // We can't use a pointer type for the addrec if the pointer type is
- // non-integral.
- Type *AddRecPHIExpandTy =
- DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy;
-
- // In some cases, we decide to reuse an existing phi node but need to truncate
- // it and/or invert the step.
- Type *TruncTy = nullptr;
- bool InvertStep = false;
- PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy,
- IntTy, TruncTy, InvertStep);
-
- // Accommodate post-inc mode, if necessary.
- Value *Result;
- if (!PostIncLoops.count(L))
- Result = PN;
- else {
- // In PostInc mode, use the post-incremented value.
- BasicBlock *LatchBlock = L->getLoopLatch();
- assert(LatchBlock && "PostInc mode requires a unique loop latch!");
- Result = PN->getIncomingValueForBlock(LatchBlock);
-
+
+ // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if
+ // we actually do emit an addition. It does not apply if we emit a
+ // subtraction.
+ bool IncrementIsNUW = !useSubtract && IsIncrementNUW(SE, Normalized);
+ bool IncrementIsNSW = !useSubtract && IsIncrementNSW(SE, Normalized);
+
+ // Create the PHI.
+ BasicBlock *Header = L->getHeader();
+ Builder.SetInsertPoint(Header, Header->begin());
+ pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
+ PHINode *PN = Builder.CreatePHI(ExpandTy, std::distance(HPB, HPE),
+ Twine(IVName) + ".iv");
+
+ // Create the step instructions and populate the PHI.
+ for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
+ BasicBlock *Pred = *HPI;
+
+ // Add a start value.
+ if (!L->contains(Pred)) {
+ PN->addIncoming(StartV, Pred);
+ continue;
+ }
+
+ // Create a step value and add it to the PHI.
+ // If IVIncInsertLoop is non-null and equal to the addrec's loop, insert the
+ // instructions at IVIncInsertPos.
+ Instruction *InsertPos = L == IVIncInsertLoop ?
+ IVIncInsertPos : Pred->getTerminator();
+ Builder.SetInsertPoint(InsertPos);
+ Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
+
+ if (isa<OverflowingBinaryOperator>(IncV)) {
+ if (IncrementIsNUW)
+ cast<BinaryOperator>(IncV)->setHasNoUnsignedWrap();
+ if (IncrementIsNSW)
+ cast<BinaryOperator>(IncV)->setHasNoSignedWrap();
+ }
+ PN->addIncoming(IncV, Pred);
+ }
+
+ // After expanding subexpressions, restore the PostIncLoops set so the caller
+ // can ensure that IVIncrement dominates the current uses.
+ PostIncLoops = SavedPostIncLoops;
+
+ // Remember this PHI, even in post-inc mode.
+ InsertedValues.insert(PN);
+
+ return PN;
+}
+
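As a rough mental model of what getAddRecExprPHILiterally materializes, the phi-plus-increment pair corresponds to the scalar loop below. This is a hedged plain C++ sketch with illustrative names (Start, Step, TripCount), not SCEVExpander or LLVM API code.

    // Pre-increment IV: the phi takes StartV on the preheader edge and the
    // newly created increment on every in-loop predecessor edge.
    long ivPhiModel(long Start, long Step, long TripCount) {
      long IV = Start;                  // phi incoming value from the preheader
      long LastUse = IV;
      for (long N = 0; N < TripCount; ++N) {
        LastUse = IV;                   // in-loop uses see the pre-incremented value
        IV += Step;                     // increment on the back edge; a sub is used
                                        // instead when the step is a non-constant
      }                                 // negative value (the useSubtract path)
      return LastUse;
    }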
+Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
+ Type *STy = S->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(STy);
+ const Loop *L = S->getLoop();
+
+ // Determine a normalized form of this expression, which is the expression
+ // before any post-inc adjustment is made.
+ const SCEVAddRecExpr *Normalized = S;
+ if (PostIncLoops.count(L)) {
+ PostIncLoopSet Loops;
+ Loops.insert(L);
+ Normalized = cast<SCEVAddRecExpr>(normalizeForPostIncUse(S, Loops, SE));
+ }
+
+ // Strip off any non-loop-dominating component from the addrec start.
+ const SCEV *Start = Normalized->getStart();
+ const SCEV *PostLoopOffset = nullptr;
+ if (!SE.properlyDominates(Start, L->getHeader())) {
+ PostLoopOffset = Start;
+ Start = SE.getConstant(Normalized->getType(), 0);
+ Normalized = cast<SCEVAddRecExpr>(
+ SE.getAddRecExpr(Start, Normalized->getStepRecurrence(SE),
+ Normalized->getLoop(),
+ Normalized->getNoWrapFlags(SCEV::FlagNW)));
+ }
+
+ // Strip off any non-loop-dominating component from the addrec step.
+ const SCEV *Step = Normalized->getStepRecurrence(SE);
+ const SCEV *PostLoopScale = nullptr;
+ if (!SE.dominates(Step, L->getHeader())) {
+ PostLoopScale = Step;
+ Step = SE.getConstant(Normalized->getType(), 1);
+ if (!Start->isZero()) {
+ // The normalization below assumes that Start is constant zero, so if
+ // it isn't, re-associate Start to PostLoopOffset.
+ assert(!PostLoopOffset && "Start not-null but PostLoopOffset set?");
+ PostLoopOffset = Start;
+ Start = SE.getConstant(Normalized->getType(), 0);
+ }
+ Normalized =
+ cast<SCEVAddRecExpr>(SE.getAddRecExpr(
+ Start, Step, Normalized->getLoop(),
+ Normalized->getNoWrapFlags(SCEV::FlagNW)));
+ }
+
+ // Expand the core addrec. If we need post-loop scaling, force it to
+ // expand to an integer type to avoid the need for additional casting.
+ Type *ExpandTy = PostLoopScale ? IntTy : STy;
+ // We can't use a pointer type for the addrec if the pointer type is
+ // non-integral.
+ Type *AddRecPHIExpandTy =
+ DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy;
+
+ // In some cases, we decide to reuse an existing phi node but need to truncate
+ // it and/or invert the step.
+ Type *TruncTy = nullptr;
+ bool InvertStep = false;
+ PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy,
+ IntTy, TruncTy, InvertStep);
+
+ // Accommodate post-inc mode, if necessary.
+ Value *Result;
+ if (!PostIncLoops.count(L))
+ Result = PN;
+ else {
+ // In PostInc mode, use the post-incremented value.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "PostInc mode requires a unique loop latch!");
+ Result = PN->getIncomingValueForBlock(LatchBlock);
+
// We might be introducing a new use of the post-inc IV that is not poison
// safe, in which case we should drop poison generating flags. Only keep
// those flags for which SCEV has proven that they always hold.
@@ -1451,361 +1451,361 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
I->setHasNoSignedWrap(false);
}
- // For an expansion to use the postinc form, the client must call
- // expandCodeFor with an InsertPoint that is either outside the PostIncLoop
- // or dominated by IVIncInsertPos.
- if (isa<Instruction>(Result) &&
- !SE.DT.dominates(cast<Instruction>(Result),
- &*Builder.GetInsertPoint())) {
- // The induction variable's postinc expansion does not dominate this use.
- // IVUsers tries to prevent this case, so it is rare. However, it can
- // happen when an IVUser outside the loop is not dominated by the latch
- // block. Adjusting IVIncInsertPos before expansion begins cannot handle
- // all cases. Consider a phi outside whose operand is replaced during
- // expansion with the value of the postinc user. Without fundamentally
- // changing the way postinc users are tracked, the only remedy is
- // inserting an extra IV increment. StepV might fold into PostLoopOffset,
- // but hopefully expandCodeFor handles that.
- bool useSubtract =
- !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
- if (useSubtract)
- Step = SE.getNegativeSCEV(Step);
- Value *StepV;
- {
- // Expand the step somewhere that dominates the loop header.
- SCEVInsertPointGuard Guard(Builder, this);
+ // For an expansion to use the postinc form, the client must call
+ // expandCodeFor with an InsertPoint that is either outside the PostIncLoop
+ // or dominated by IVIncInsertPos.
+ if (isa<Instruction>(Result) &&
+ !SE.DT.dominates(cast<Instruction>(Result),
+ &*Builder.GetInsertPoint())) {
+ // The induction variable's postinc expansion does not dominate this use.
+ // IVUsers tries to prevent this case, so it is rare. However, it can
+ // happen when an IVUser outside the loop is not dominated by the latch
+ // block. Adjusting IVIncInsertPos before expansion begins cannot handle
+ // all cases. Consider a phi outside whose operand is replaced during
+ // expansion with the value of the postinc user. Without fundamentally
+ // changing the way postinc users are tracked, the only remedy is
+ // inserting an extra IV increment. StepV might fold into PostLoopOffset,
+ // but hopefully expandCodeFor handles that.
+ bool useSubtract =
+ !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
+ if (useSubtract)
+ Step = SE.getNegativeSCEV(Step);
+ Value *StepV;
+ {
+ // Expand the step somewhere that dominates the loop header.
+ SCEVInsertPointGuard Guard(Builder, this);
StepV = expandCodeForImpl(
Step, IntTy, &*L->getHeader()->getFirstInsertionPt(), false);
- }
- Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
- }
- }
-
- // We have decided to reuse an induction variable of a dominating loop. Apply
- // truncation and/or inversion of the step.
- if (TruncTy) {
- Type *ResTy = Result->getType();
- // Normalize the result type.
- if (ResTy != SE.getEffectiveSCEVType(ResTy))
- Result = InsertNoopCastOfTo(Result, SE.getEffectiveSCEVType(ResTy));
- // Truncate the result.
+ }
+ Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
+ }
+ }
+
+ // We have decided to reuse an induction variable of a dominating loop. Apply
+ // truncation and/or inversion of the step.
+ if (TruncTy) {
+ Type *ResTy = Result->getType();
+ // Normalize the result type.
+ if (ResTy != SE.getEffectiveSCEVType(ResTy))
+ Result = InsertNoopCastOfTo(Result, SE.getEffectiveSCEVType(ResTy));
+ // Truncate the result.
if (TruncTy != Result->getType())
- Result = Builder.CreateTrunc(Result, TruncTy);
+ Result = Builder.CreateTrunc(Result, TruncTy);
- // Invert the result.
+ // Invert the result.
if (InvertStep)
Result = Builder.CreateSub(
expandCodeForImpl(Normalized->getStart(), TruncTy, false), Result);
- }
-
- // Re-apply any non-loop-dominating scale.
- if (PostLoopScale) {
- assert(S->isAffine() && "Can't linearly scale non-affine recurrences.");
- Result = InsertNoopCastOfTo(Result, IntTy);
- Result = Builder.CreateMul(Result,
+ }
+
+ // Re-apply any non-loop-dominating scale.
+ if (PostLoopScale) {
+ assert(S->isAffine() && "Can't linearly scale non-affine recurrences.");
+ Result = InsertNoopCastOfTo(Result, IntTy);
+ Result = Builder.CreateMul(Result,
expandCodeForImpl(PostLoopScale, IntTy, false));
- }
-
- // Re-apply any non-loop-dominating offset.
- if (PostLoopOffset) {
- if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
- if (Result->getType()->isIntegerTy()) {
+ }
+
+ // Re-apply any non-loop-dominating offset.
+ if (PostLoopOffset) {
+ if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
+ if (Result->getType()->isIntegerTy()) {
Value *Base = expandCodeForImpl(PostLoopOffset, ExpandTy, false);
- Result = expandAddToGEP(SE.getUnknown(Result), PTy, IntTy, Base);
- } else {
- Result = expandAddToGEP(PostLoopOffset, PTy, IntTy, Result);
- }
- } else {
- Result = InsertNoopCastOfTo(Result, IntTy);
+ Result = expandAddToGEP(SE.getUnknown(Result), PTy, IntTy, Base);
+ } else {
+ Result = expandAddToGEP(PostLoopOffset, PTy, IntTy, Result);
+ }
+ } else {
+ Result = InsertNoopCastOfTo(Result, IntTy);
Result = Builder.CreateAdd(
Result, expandCodeForImpl(PostLoopOffset, IntTy, false));
- }
- }
-
- return Result;
-}
-
-Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
- // In canonical mode we compute the addrec as an expression of a canonical IV
- // using evaluateAtIteration and expand the resulting SCEV expression. This
- // way we avoid introducing new IVs to carry on the computation of the addrec
- // throughout the loop.
- //
- // For nested addrecs evaluateAtIteration might need a canonical IV of a
- // type wider than the addrec itself. Emitting a canonical IV of the
- // proper type might produce non-legal types, for example expanding an i64
- // {0,+,2,+,1} addrec would need an i65 canonical IV. To avoid this just fall
- // back to non-canonical mode for nested addrecs.
- if (!CanonicalMode || (S->getNumOperands() > 2))
- return expandAddRecExprLiterally(S);
-
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
- const Loop *L = S->getLoop();
-
- // First check for an existing canonical IV in a suitable type.
- PHINode *CanonicalIV = nullptr;
- if (PHINode *PN = L->getCanonicalInductionVariable())
- if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
- CanonicalIV = PN;
-
- // Rewrite an AddRec in terms of the canonical induction variable, if
- // its type is narrower.
- if (CanonicalIV &&
- SE.getTypeSizeInBits(CanonicalIV->getType()) >
- SE.getTypeSizeInBits(Ty)) {
- SmallVector<const SCEV *, 4> NewOps(S->getNumOperands());
- for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i)
- NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
- Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
- S->getNoWrapFlags(SCEV::FlagNW)));
- BasicBlock::iterator NewInsertPt =
+ }
+ }
+
+ return Result;
+}
+
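The start/step stripping and the PostLoopScale/PostLoopOffset re-application above reduce to simple arithmetic on the recurrence. A minimal sketch, assuming both the start A and the step B of {A,+,B} fail to dominate the loop header (plain C++; the values stand in for expanded SCEVs):

    // Inside the loop only the core recurrence {0,+,1} (the iteration number N)
    // is expanded; the stripped-off parts are re-applied after the fact.
    long literalAddRecModel(long A, long B, long N) {
      long Core = N;            // value of {0,+,1} at iteration N
      long Scaled = Core * B;   // re-apply PostLoopScale
      return Scaled + A;        // re-apply PostLoopOffset
    }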
+Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
+ // In canonical mode we compute the addrec as an expression of a canonical IV
+ // using evaluateAtIteration and expand the resulting SCEV expression. This
+ // way we avoid introducing new IVs to carry on the computation of the addrec
+ // throughout the loop.
+ //
+ // For nested addrecs evaluateAtIteration might need a canonical IV of a
+ // type wider than the addrec itself. Emitting a canonical IV of the
+ // proper type might produce non-legal types, for example expanding an i64
+ // {0,+,2,+,1} addrec would need an i65 canonical IV. To avoid this just fall
+ // back to non-canonical mode for nested addrecs.
+ if (!CanonicalMode || (S->getNumOperands() > 2))
+ return expandAddRecExprLiterally(S);
+
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ const Loop *L = S->getLoop();
+
+ // First check for an existing canonical IV in a suitable type.
+ PHINode *CanonicalIV = nullptr;
+ if (PHINode *PN = L->getCanonicalInductionVariable())
+ if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
+ CanonicalIV = PN;
+
+ // Rewrite an AddRec in terms of the canonical induction variable, if
+ // its type is narrower.
+ if (CanonicalIV &&
+ SE.getTypeSizeInBits(CanonicalIV->getType()) >
+ SE.getTypeSizeInBits(Ty)) {
+ SmallVector<const SCEV *, 4> NewOps(S->getNumOperands());
+ for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i)
+ NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
+ Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
+ S->getNoWrapFlags(SCEV::FlagNW)));
+ BasicBlock::iterator NewInsertPt =
findInsertPointAfter(cast<Instruction>(V), &*Builder.GetInsertPoint());
V = expandCodeForImpl(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr,
&*NewInsertPt, false);
- return V;
- }
-
- // {X,+,F} --> X + {0,+,F}
- if (!S->getStart()->isZero()) {
+ return V;
+ }
+
+ // {X,+,F} --> X + {0,+,F}
+ if (!S->getStart()->isZero()) {
SmallVector<const SCEV *, 4> NewOps(S->operands());
- NewOps[0] = SE.getConstant(Ty, 0);
- const SCEV *Rest = SE.getAddRecExpr(NewOps, L,
- S->getNoWrapFlags(SCEV::FlagNW));
-
- // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
- // comments on expandAddToGEP for details.
- const SCEV *Base = S->getStart();
- // Dig into the expression to find the pointer base for a GEP.
- const SCEV *ExposedRest = Rest;
- ExposePointerBase(Base, ExposedRest, SE);
- // If we found a pointer, expand the AddRec with a GEP.
- if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
- // Make sure the Base isn't something exotic, such as a multiplied
- // or divided pointer value. In those cases, the result type isn't
- // actually a pointer type.
- if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
- Value *StartV = expand(Base);
- assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
- return expandAddToGEP(ExposedRest, PTy, Ty, StartV);
- }
- }
-
- // Just do a normal add. Pre-expand the operands to suppress folding.
- //
- // The LHS and RHS values are factored out of the expand call to make the
- // output independent of the argument evaluation order.
- const SCEV *AddExprLHS = SE.getUnknown(expand(S->getStart()));
- const SCEV *AddExprRHS = SE.getUnknown(expand(Rest));
- return expand(SE.getAddExpr(AddExprLHS, AddExprRHS));
- }
-
- // If we don't yet have a canonical IV, create one.
- if (!CanonicalIV) {
- // Create and insert the PHI node for the induction variable in the
- // specified loop.
- BasicBlock *Header = L->getHeader();
- pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
- CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar",
- &Header->front());
- rememberInstruction(CanonicalIV);
-
- SmallSet<BasicBlock *, 4> PredSeen;
- Constant *One = ConstantInt::get(Ty, 1);
- for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
- BasicBlock *HP = *HPI;
- if (!PredSeen.insert(HP).second) {
- // There must be an incoming value for each predecessor, even the
- // duplicates!
- CanonicalIV->addIncoming(CanonicalIV->getIncomingValueForBlock(HP), HP);
- continue;
- }
-
- if (L->contains(HP)) {
- // Insert a unit add instruction right before the terminator
- // corresponding to the back-edge.
- Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
- "indvar.next",
- HP->getTerminator());
- Add->setDebugLoc(HP->getTerminator()->getDebugLoc());
- rememberInstruction(Add);
- CanonicalIV->addIncoming(Add, HP);
- } else {
- CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP);
- }
- }
- }
-
- // {0,+,1} --> Insert a canonical induction variable into the loop!
- if (S->isAffine() && S->getOperand(1)->isOne()) {
- assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
- "IVs with types different from the canonical IV should "
- "already have been handled!");
- return CanonicalIV;
- }
-
- // {0,+,F} --> {0,+,1} * F
-
- // If this is a simple linear addrec, emit it now as a special case.
- if (S->isAffine()) // {0,+,F} --> i*F
- return
- expand(SE.getTruncateOrNoop(
- SE.getMulExpr(SE.getUnknown(CanonicalIV),
- SE.getNoopOrAnyExtend(S->getOperand(1),
- CanonicalIV->getType())),
- Ty));
-
- // If this is a chain of recurrences, turn it into a closed form, using the
- // folders, then expandCodeFor the closed form. This allows the folders to
- // simplify the expression without having to build a bunch of special code
- // into this folder.
- const SCEV *IH = SE.getUnknown(CanonicalIV); // Get I as a "symbolic" SCEV.
-
- // Promote S up to the canonical IV type, if the cast is foldable.
- const SCEV *NewS = S;
- const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType());
- if (isa<SCEVAddRecExpr>(Ext))
- NewS = Ext;
-
- const SCEV *V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
- //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n";
-
- // Truncate the result down to the original type, if needed.
- const SCEV *T = SE.getTruncateOrNoop(V, Ty);
- return expand(T);
-}
-
+ NewOps[0] = SE.getConstant(Ty, 0);
+ const SCEV *Rest = SE.getAddRecExpr(NewOps, L,
+ S->getNoWrapFlags(SCEV::FlagNW));
+
+ // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
+ // comments on expandAddToGEP for details.
+ const SCEV *Base = S->getStart();
+ // Dig into the expression to find the pointer base for a GEP.
+ const SCEV *ExposedRest = Rest;
+ ExposePointerBase(Base, ExposedRest, SE);
+ // If we found a pointer, expand the AddRec with a GEP.
+ if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
+ // Make sure the Base isn't something exotic, such as a multiplied
+ // or divided pointer value. In those cases, the result type isn't
+ // actually a pointer type.
+ if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
+ Value *StartV = expand(Base);
+ assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
+ return expandAddToGEP(ExposedRest, PTy, Ty, StartV);
+ }
+ }
+
+ // Just do a normal add. Pre-expand the operands to suppress folding.
+ //
+ // The LHS and RHS values are factored out of the expand call to make the
+ // output independent of the argument evaluation order.
+ const SCEV *AddExprLHS = SE.getUnknown(expand(S->getStart()));
+ const SCEV *AddExprRHS = SE.getUnknown(expand(Rest));
+ return expand(SE.getAddExpr(AddExprLHS, AddExprRHS));
+ }
+
+ // If we don't yet have a canonical IV, create one.
+ if (!CanonicalIV) {
+ // Create and insert the PHI node for the induction variable in the
+ // specified loop.
+ BasicBlock *Header = L->getHeader();
+ pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
+ CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar",
+ &Header->front());
+ rememberInstruction(CanonicalIV);
+
+ SmallSet<BasicBlock *, 4> PredSeen;
+ Constant *One = ConstantInt::get(Ty, 1);
+ for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
+ BasicBlock *HP = *HPI;
+ if (!PredSeen.insert(HP).second) {
+ // There must be an incoming value for each predecessor, even the
+ // duplicates!
+ CanonicalIV->addIncoming(CanonicalIV->getIncomingValueForBlock(HP), HP);
+ continue;
+ }
+
+ if (L->contains(HP)) {
+ // Insert a unit add instruction right before the terminator
+ // corresponding to the back-edge.
+ Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
+ "indvar.next",
+ HP->getTerminator());
+ Add->setDebugLoc(HP->getTerminator()->getDebugLoc());
+ rememberInstruction(Add);
+ CanonicalIV->addIncoming(Add, HP);
+ } else {
+ CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP);
+ }
+ }
+ }
+
+ // {0,+,1} --> Insert a canonical induction variable into the loop!
+ if (S->isAffine() && S->getOperand(1)->isOne()) {
+ assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
+ "IVs with types different from the canonical IV should "
+ "already have been handled!");
+ return CanonicalIV;
+ }
+
+ // {0,+,F} --> {0,+,1} * F
+
+ // If this is a simple linear addrec, emit it now as a special case.
+ if (S->isAffine()) // {0,+,F} --> i*F
+ return
+ expand(SE.getTruncateOrNoop(
+ SE.getMulExpr(SE.getUnknown(CanonicalIV),
+ SE.getNoopOrAnyExtend(S->getOperand(1),
+ CanonicalIV->getType())),
+ Ty));
+
+ // If this is a chain of recurrences, turn it into a closed form, using the
+ // folders, then expandCodeFor the closed form. This allows the folders to
+ // simplify the expression without having to build a bunch of special code
+ // into this folder.
+ const SCEV *IH = SE.getUnknown(CanonicalIV); // Get I as a "symbolic" SCEV.
+
+ // Promote S up to the canonical IV type, if the cast is foldable.
+ const SCEV *NewS = S;
+ const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType());
+ if (isa<SCEVAddRecExpr>(Ext))
+ NewS = Ext;
+
+ const SCEV *V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
+ //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n";
+
+ // Truncate the result down to the original type, if needed.
+ const SCEV *T = SE.getTruncateOrNoop(V, Ty);
+ return expand(T);
+}
+
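evaluateAtIteration computes the standard binomial closed form of an add recurrence. visitAddRecExpr above only takes this path for affine recurrences (it falls back to literal expansion when there are more operands), but the quadratic case shows roughly why the comment above mentions needing an i65 canonical IV. A plain C++ sketch of the formulas, not the SCEV folders themselves:

    // Value of an add recurrence at iteration I of its loop:
    //   {0,+,F}      ->  I * F                      (affine case)
    //   {A,+,B,+,C}  ->  A + B*I + C*(I*(I-1)/2)    (quadratic case; e.g.
    //                                                {0,+,2,+,1} is 2*I + I*(I-1)/2)
    long evalAffine(long F, long I) { return I * F; }
    long evalQuadratic(long A, long B, long C, long I) {
      return A + B * I + C * (I * (I - 1) / 2);
    }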
Value *SCEVExpander::visitPtrToIntExpr(const SCEVPtrToIntExpr *S) {
Value *V =
expandCodeForImpl(S->getOperand(), S->getOperand()->getType(), false);
return Builder.CreatePtrToInt(V, S->getType());
}
-Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
+Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeForImpl(
S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()),
false);
return Builder.CreateTrunc(V, Ty);
-}
-
-Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
+}
+
+Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeForImpl(
S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()),
false);
return Builder.CreateZExt(V, Ty);
-}
-
-Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
+}
+
+Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeForImpl(
S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()),
false);
return Builder.CreateSExt(V, Ty);
-}
-
-Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands()-2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+}
+
+Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands()-2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpSGT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
-Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands()-2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+ Value *ICmp = Builder.CreateICmpSGT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
+Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands()-2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpUGT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
-Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands() - 2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+ Value *ICmp = Builder.CreateICmpUGT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
+Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands() - 2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpSLT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
-Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands() - 2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+ Value *ICmp = Builder.CreateICmpSLT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
+Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands() - 2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpULT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
+ Value *ICmp = Builder.CreateICmpULT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
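Each of the four min/max visitors above reduces an n-ary expression to a chain of compare-plus-select instructions. The same fold in plain C++, shown for signed max; the other three differ only in the comparison used (illustrative sketch, not expander code):

    long smaxChainModel(const long *Ops, int NumOps) {
      long Acc = Ops[NumOps - 1];              // start from the last operand
      for (int I = NumOps - 2; I >= 0; --I)
        Acc = (Acc > Ops[I]) ? Acc : Ops[I];   // icmp sgt + select, named "smax"
      return Acc;
    }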
Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty,
Instruction *IP, bool Root) {
- setInsertPoint(IP);
+ setInsertPoint(IP);
Value *V = expandCodeForImpl(SH, Ty, Root);
return V;
-}
-
+}
+
Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
- // Expand the code for this SCEV.
- Value *V = expand(SH);
+ // Expand the code for this SCEV.
+ Value *V = expand(SH);
if (PreserveLCSSA) {
if (auto *Inst = dyn_cast<Instruction>(V)) {
@@ -1835,147 +1835,147 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
}
InsertedExpressions[std::make_pair(SH, &*Builder.GetInsertPoint())] = V;
- if (Ty) {
- assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
- "non-trivial casts should be done with the SCEVs directly!");
- V = InsertNoopCastOfTo(V, Ty);
- }
- return V;
-}
-
-ScalarEvolution::ValueOffsetPair
-SCEVExpander::FindValueInExprValueMap(const SCEV *S,
- const Instruction *InsertPt) {
- SetVector<ScalarEvolution::ValueOffsetPair> *Set = SE.getSCEVValues(S);
- // If the expansion is not in CanonicalMode, and the SCEV contains any
- // sub scAddRecExpr type SCEV, it is required to expand the SCEV literally.
- if (CanonicalMode || !SE.containsAddRecurrence(S)) {
- // If S is scConstant, it may be worse to reuse an existing Value.
- if (S->getSCEVType() != scConstant && Set) {
- // Choose a Value from the set which dominates the insertPt.
- // insertPt should be inside the Value's parent loop so as not to break
- // the LCSSA form.
- for (auto const &VOPair : *Set) {
- Value *V = VOPair.first;
- ConstantInt *Offset = VOPair.second;
- Instruction *EntInst = nullptr;
- if (V && isa<Instruction>(V) && (EntInst = cast<Instruction>(V)) &&
- S->getType() == V->getType() &&
- EntInst->getFunction() == InsertPt->getFunction() &&
- SE.DT.dominates(EntInst, InsertPt) &&
- (SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
- SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)))
- return {V, Offset};
- }
- }
- }
- return {nullptr, nullptr};
-}
-
-// The expansion of SCEV will either reuse a previous Value in ExprValueMap,
-// or expand the SCEV literally. Specifically, if the expansion is in LSRMode,
-// and the SCEV contains any sub scAddRecExpr type SCEV, it will be expanded
-// literally, to prevent LSR's transformed SCEV from being reverted. Otherwise,
-// the expansion will try to reuse Value from ExprValueMap, and only when it
-// fails, expand the SCEV literally.
-Value *SCEVExpander::expand(const SCEV *S) {
- // Compute an insertion point for this SCEV object. Hoist the instructions
- // as far out in the loop nest as possible.
- Instruction *InsertPt = &*Builder.GetInsertPoint();
-
- // We can move the insertion point only if there are no div or rem operations;
- // otherwise we risk moving it past the check for a zero denominator.
- auto SafeToHoist = [](const SCEV *S) {
- return !SCEVExprContains(S, [](const SCEV *S) {
- if (const auto *D = dyn_cast<SCEVUDivExpr>(S)) {
- if (const auto *SC = dyn_cast<SCEVConstant>(D->getRHS()))
- // Division by non-zero constants can be hoisted.
- return SC->getValue()->isZero();
- // All other divisions should not be moved as they may be
- // divisions by zero and should be kept within the
- // conditions of the surrounding loops that guard their
- // execution (see PR35406).
- return true;
- }
- return false;
- });
- };
- if (SafeToHoist(S)) {
- for (Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock());;
- L = L->getParentLoop()) {
- if (SE.isLoopInvariant(S, L)) {
- if (!L) break;
- if (BasicBlock *Preheader = L->getLoopPreheader())
- InsertPt = Preheader->getTerminator();
- else
- // LSR sets the insertion point for AddRec start/step values to the
- // block start to simplify value reuse, even though it's an invalid
- // position. SCEVExpander must correct for this in all cases.
- InsertPt = &*L->getHeader()->getFirstInsertionPt();
- } else {
- // If the SCEV is computable at this level, insert it into the header
- // after the PHIs (and after any other instructions that we've inserted
- // there) so that it is guaranteed to dominate any user inside the loop.
- if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L))
- InsertPt = &*L->getHeader()->getFirstInsertionPt();
-
- while (InsertPt->getIterator() != Builder.GetInsertPoint() &&
- (isInsertedInstruction(InsertPt) ||
+ if (Ty) {
+ assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
+ "non-trivial casts should be done with the SCEVs directly!");
+ V = InsertNoopCastOfTo(V, Ty);
+ }
+ return V;
+}
+
+ScalarEvolution::ValueOffsetPair
+SCEVExpander::FindValueInExprValueMap(const SCEV *S,
+ const Instruction *InsertPt) {
+ SetVector<ScalarEvolution::ValueOffsetPair> *Set = SE.getSCEVValues(S);
+ // If the expansion is not in CanonicalMode, and the SCEV contains any
+ // sub scAddRecExpr type SCEV, it is required to expand the SCEV literally.
+ if (CanonicalMode || !SE.containsAddRecurrence(S)) {
+ // If S is scConstant, it may be worse to reuse an existing Value.
+ if (S->getSCEVType() != scConstant && Set) {
+ // Choose a Value from the set which dominates the insertPt.
+ // insertPt should be inside the Value's parent loop so as not to break
+ // the LCSSA form.
+ for (auto const &VOPair : *Set) {
+ Value *V = VOPair.first;
+ ConstantInt *Offset = VOPair.second;
+ Instruction *EntInst = nullptr;
+ if (V && isa<Instruction>(V) && (EntInst = cast<Instruction>(V)) &&
+ S->getType() == V->getType() &&
+ EntInst->getFunction() == InsertPt->getFunction() &&
+ SE.DT.dominates(EntInst, InsertPt) &&
+ (SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
+ SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)))
+ return {V, Offset};
+ }
+ }
+ }
+ return {nullptr, nullptr};
+}
+
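A hit from this lookup can carry a constant offset: the cached value equals the requested expression plus that offset, and expand() below compensates for it. For the integer case the compensation is a single subtraction (a sketch; the pointer case emits a GEP with a negated index instead):

    long reuseCachedValue(long CachedV, long Offset) {
      // Cached relation: CachedV == S + Offset, hence S == CachedV - Offset.
      return CachedV - Offset;
    }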
+// The expansion of SCEV will either reuse a previous Value in ExprValueMap,
+// or expand the SCEV literally. Specifically, if the expansion is in LSRMode,
+// and the SCEV contains any sub scAddRecExpr type SCEV, it will be expanded
+// literally, to prevent LSR's transformed SCEV from being reverted. Otherwise,
+// the expansion will try to reuse Value from ExprValueMap, and only when it
+// fails, expand the SCEV literally.
+Value *SCEVExpander::expand(const SCEV *S) {
+ // Compute an insertion point for this SCEV object. Hoist the instructions
+ // as far out in the loop nest as possible.
+ Instruction *InsertPt = &*Builder.GetInsertPoint();
+
+ // We can move the insertion point only if there are no div or rem operations;
+ // otherwise we risk moving it past the check for a zero denominator.
+ auto SafeToHoist = [](const SCEV *S) {
+ return !SCEVExprContains(S, [](const SCEV *S) {
+ if (const auto *D = dyn_cast<SCEVUDivExpr>(S)) {
+ if (const auto *SC = dyn_cast<SCEVConstant>(D->getRHS()))
+ // Division by non-zero constants can be hoisted.
+ return SC->getValue()->isZero();
+ // All other divisions should not be moved as they may be
+ // divisions by zero and should be kept within the
+ // conditions of the surrounding loops that guard their
+ // execution (see PR35406).
+ return true;
+ }
+ return false;
+ });
+ };
+ if (SafeToHoist(S)) {
+ for (Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock());;
+ L = L->getParentLoop()) {
+ if (SE.isLoopInvariant(S, L)) {
+ if (!L) break;
+ if (BasicBlock *Preheader = L->getLoopPreheader())
+ InsertPt = Preheader->getTerminator();
+ else
+ // LSR sets the insertion point for AddRec start/step values to the
+ // block start to simplify value reuse, even though it's an invalid
+ // position. SCEVExpander must correct for this in all cases.
+ InsertPt = &*L->getHeader()->getFirstInsertionPt();
+ } else {
+ // If the SCEV is computable at this level, insert it into the header
+ // after the PHIs (and after any other instructions that we've inserted
+ // there) so that it is guaranteed to dominate any user inside the loop.
+ if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L))
+ InsertPt = &*L->getHeader()->getFirstInsertionPt();
+
+ while (InsertPt->getIterator() != Builder.GetInsertPoint() &&
+ (isInsertedInstruction(InsertPt) ||
isa<DbgInfoIntrinsic>(InsertPt))) {
- InsertPt = &*std::next(InsertPt->getIterator());
+ InsertPt = &*std::next(InsertPt->getIterator());
}
- break;
- }
- }
- }
-
- // Check to see if we already expanded this here.
- auto I = InsertedExpressions.find(std::make_pair(S, InsertPt));
- if (I != InsertedExpressions.end())
- return I->second;
-
- SCEVInsertPointGuard Guard(Builder, this);
- Builder.SetInsertPoint(InsertPt);
-
- // Expand the expression into instructions.
- ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, InsertPt);
- Value *V = VO.first;
-
- if (!V)
- V = visit(S);
- else if (VO.second) {
- if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
- Type *Ety = Vty->getPointerElementType();
- int64_t Offset = VO.second->getSExtValue();
- int64_t ESize = SE.getTypeSizeInBits(Ety);
- if ((Offset * 8) % ESize == 0) {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
- V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
- } else {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -Offset);
- unsigned AS = Vty->getAddressSpace();
- V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
- V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
- "uglygep");
- V = Builder.CreateBitCast(V, Vty);
- }
- } else {
- V = Builder.CreateSub(V, VO.second);
- }
- }
- // Remember the expanded value for this SCEV at this location.
- //
- // This is independent of PostIncLoops. The mapped value simply materializes
- // the expression at this insertion point. If the mapped value happened to be
- // a postinc expansion, it could be reused by a non-postinc user, but only if
- // its insertion point was already at the head of the loop.
- InsertedExpressions[std::make_pair(S, InsertPt)] = V;
- return V;
-}
-
-void SCEVExpander::rememberInstruction(Value *I) {
+ break;
+ }
+ }
+ }
+
+ // Check to see if we already expanded this here.
+ auto I = InsertedExpressions.find(std::make_pair(S, InsertPt));
+ if (I != InsertedExpressions.end())
+ return I->second;
+
+ SCEVInsertPointGuard Guard(Builder, this);
+ Builder.SetInsertPoint(InsertPt);
+
+ // Expand the expression into instructions.
+ ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, InsertPt);
+ Value *V = VO.first;
+
+ if (!V)
+ V = visit(S);
+ else if (VO.second) {
+ if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
+ Type *Ety = Vty->getPointerElementType();
+ int64_t Offset = VO.second->getSExtValue();
+ int64_t ESize = SE.getTypeSizeInBits(Ety);
+ if ((Offset * 8) % ESize == 0) {
+ ConstantInt *Idx =
+ ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
+ V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
+ } else {
+ ConstantInt *Idx =
+ ConstantInt::getSigned(VO.second->getType(), -Offset);
+ unsigned AS = Vty->getAddressSpace();
+ V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
+ V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
+ "uglygep");
+ V = Builder.CreateBitCast(V, Vty);
+ }
+ } else {
+ V = Builder.CreateSub(V, VO.second);
+ }
+ }
+ // Remember the expanded value for this SCEV at this location.
+ //
+ // This is independent of PostIncLoops. The mapped value simply materializes
+ // the expression at this insertion point. If the mapped value happened to be
+ // a postinc expansion, it could be reused by a non-postinc user, but only if
+ // its insertion point was already at the head of the loop.
+ InsertedExpressions[std::make_pair(S, InsertPt)] = V;
+ return V;
+}
+
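The SafeToHoist lambda above refuses to hoist divisions whose divisor is not a known non-zero constant, because hoisting could move the division above the guard that keeps it from dividing by zero (see PR35406). A minimal C++ illustration of the hazard, with hypothetical names:

    long guardedDivide(long Num, long Den) {
      long Result = 0;
      if (Den != 0)
        Result = Num / Den;   // safe: execution is guarded by Den != 0
      // Hoisting Num / Den above the guard would evaluate it even when
      // Den == 0, which is exactly what the check prevents.
      return Result;
    }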
+void SCEVExpander::rememberInstruction(Value *I) {
auto DoInsert = [this](Value *V) {
if (!PostIncLoops.empty())
InsertedPostIncValues.insert(V);
@@ -1983,10 +1983,10 @@ void SCEVExpander::rememberInstruction(Value *I) {
InsertedValues.insert(V);
};
DoInsert(I);
-
+
if (!PreserveLCSSA)
return;
-
+
if (auto *Inst = dyn_cast<Instruction>(I)) {
// A new instruction has been added, which might introduce new uses outside
// a defining loop. Fix LCSSA from for each operand of the new instruction,
@@ -1995,190 +1995,190 @@ void SCEVExpander::rememberInstruction(Value *I) {
OpIdx++)
fixupLCSSAFormFor(Inst, OpIdx);
}
-}
-
-/// replaceCongruentIVs - Check for congruent phis in this loop header and
-/// replace them with their most canonical representative. Return the number of
-/// phis eliminated.
-///
-/// This does not depend on any SCEVExpander state but should be used in
-/// the same context that SCEVExpander is used.
-unsigned
-SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts,
- const TargetTransformInfo *TTI) {
- // Find integer phis in order of increasing width.
- SmallVector<PHINode*, 8> Phis;
- for (PHINode &PN : L->getHeader()->phis())
- Phis.push_back(&PN);
-
- if (TTI)
- llvm::sort(Phis, [](Value *LHS, Value *RHS) {
- // Put pointers at the back and make sure pointer < pointer = false.
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return RHS->getType()->isIntegerTy() && !LHS->getType()->isIntegerTy();
+}
+
+/// replaceCongruentIVs - Check for congruent phis in this loop header and
+/// replace them with their most canonical representative. Return the number of
+/// phis eliminated.
+///
+/// This does not depend on any SCEVExpander state but should be used in
+/// the same context that SCEVExpander is used.
+unsigned
+SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts,
+ const TargetTransformInfo *TTI) {
+ // Find integer phis in order of increasing width.
+ SmallVector<PHINode*, 8> Phis;
+ for (PHINode &PN : L->getHeader()->phis())
+ Phis.push_back(&PN);
+
+ if (TTI)
+ llvm::sort(Phis, [](Value *LHS, Value *RHS) {
+ // Put pointers at the back and make sure pointer < pointer = false.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return RHS->getType()->isIntegerTy() && !LHS->getType()->isIntegerTy();
return RHS->getType()->getPrimitiveSizeInBits().getFixedSize() <
LHS->getType()->getPrimitiveSizeInBits().getFixedSize();
- });
-
- unsigned NumElim = 0;
- DenseMap<const SCEV *, PHINode *> ExprToIVMap;
- // Process phis from wide to narrow. Map wide phis to their truncation
- // so narrow phis can reuse them.
- for (PHINode *Phi : Phis) {
- auto SimplifyPHINode = [&](PHINode *PN) -> Value * {
- if (Value *V = SimplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC}))
- return V;
- if (!SE.isSCEVable(PN->getType()))
- return nullptr;
- auto *Const = dyn_cast<SCEVConstant>(SE.getSCEV(PN));
- if (!Const)
- return nullptr;
- return Const->getValue();
- };
-
- // Fold constant phis. They may be congruent to other constant phis and
- // would confuse the logic below that expects proper IVs.
- if (Value *V = SimplifyPHINode(Phi)) {
- if (V->getType() != Phi->getType())
- continue;
- Phi->replaceAllUsesWith(V);
- DeadInsts.emplace_back(Phi);
- ++NumElim;
- DEBUG_WITH_TYPE(DebugType, dbgs()
- << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
- continue;
- }
-
- if (!SE.isSCEVable(Phi->getType()))
- continue;
-
- PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
- if (!OrigPhiRef) {
- OrigPhiRef = Phi;
- if (Phi->getType()->isIntegerTy() && TTI &&
- TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
- // This phi can be freely truncated to the narrowest phi type. Map the
- // truncated expression to it so it will be reused for narrow types.
- const SCEV *TruncExpr =
- SE.getTruncateExpr(SE.getSCEV(Phi), Phis.back()->getType());
- ExprToIVMap[TruncExpr] = Phi;
- }
- continue;
- }
-
- // Replacing a pointer phi with an integer phi or vice-versa doesn't make
- // sense.
- if (OrigPhiRef->getType()->isPointerTy() != Phi->getType()->isPointerTy())
- continue;
-
- if (BasicBlock *LatchBlock = L->getLoopLatch()) {
- Instruction *OrigInc = dyn_cast<Instruction>(
- OrigPhiRef->getIncomingValueForBlock(LatchBlock));
- Instruction *IsomorphicInc =
- dyn_cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
-
- if (OrigInc && IsomorphicInc) {
- // If this phi has the same width but is more canonical, replace the
- // original with it. As part of the "more canonical" determination,
- // respect a prior decision to use an IV chain.
- if (OrigPhiRef->getType() == Phi->getType() &&
- !(ChainedPhis.count(Phi) ||
- isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L)) &&
- (ChainedPhis.count(Phi) ||
- isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) {
- std::swap(OrigPhiRef, Phi);
- std::swap(OrigInc, IsomorphicInc);
- }
- // Replacing the congruent phi is sufficient because acyclic
- // redundancy elimination, CSE/GVN, should handle the
- // rest. However, once SCEV proves that a phi is congruent,
- // it's often the head of an IV user cycle that is isomorphic
- // with the original phi. It's worth eagerly cleaning up the
- // common case of a single IV increment so that DeleteDeadPHIs
- // can remove cycles that had postinc uses.
- const SCEV *TruncExpr =
- SE.getTruncateOrNoop(SE.getSCEV(OrigInc), IsomorphicInc->getType());
- if (OrigInc != IsomorphicInc &&
- TruncExpr == SE.getSCEV(IsomorphicInc) &&
- SE.LI.replacementPreservesLCSSAForm(IsomorphicInc, OrigInc) &&
- hoistIVInc(OrigInc, IsomorphicInc)) {
- DEBUG_WITH_TYPE(DebugType,
- dbgs() << "INDVARS: Eliminated congruent iv.inc: "
- << *IsomorphicInc << '\n');
- Value *NewInc = OrigInc;
- if (OrigInc->getType() != IsomorphicInc->getType()) {
- Instruction *IP = nullptr;
- if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
- IP = &*PN->getParent()->getFirstInsertionPt();
- else
- IP = OrigInc->getNextNode();
-
- IRBuilder<> Builder(IP);
- Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
- NewInc = Builder.CreateTruncOrBitCast(
- OrigInc, IsomorphicInc->getType(), IVName);
- }
- IsomorphicInc->replaceAllUsesWith(NewInc);
- DeadInsts.emplace_back(IsomorphicInc);
- }
- }
- }
- DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Eliminated congruent iv: "
- << *Phi << '\n');
+ });
+
+ unsigned NumElim = 0;
+ DenseMap<const SCEV *, PHINode *> ExprToIVMap;
+ // Process phis from wide to narrow. Map wide phis to their truncation
+ // so narrow phis can reuse them.
+ for (PHINode *Phi : Phis) {
+ auto SimplifyPHINode = [&](PHINode *PN) -> Value * {
+ if (Value *V = SimplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC}))
+ return V;
+ if (!SE.isSCEVable(PN->getType()))
+ return nullptr;
+ auto *Const = dyn_cast<SCEVConstant>(SE.getSCEV(PN));
+ if (!Const)
+ return nullptr;
+ return Const->getValue();
+ };
+
+ // Fold constant phis. They may be congruent to other constant phis and
+ // would confuse the logic below that expects proper IVs.
+ if (Value *V = SimplifyPHINode(Phi)) {
+ if (V->getType() != Phi->getType())
+ continue;
+ Phi->replaceAllUsesWith(V);
+ DeadInsts.emplace_back(Phi);
+ ++NumElim;
+ DEBUG_WITH_TYPE(DebugType, dbgs()
+ << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
+ continue;
+ }
+
+ if (!SE.isSCEVable(Phi->getType()))
+ continue;
+
+ PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
+ if (!OrigPhiRef) {
+ OrigPhiRef = Phi;
+ if (Phi->getType()->isIntegerTy() && TTI &&
+ TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
+ // This phi can be freely truncated to the narrowest phi type. Map the
+ // truncated expression to it so it will be reused for narrow types.
+ const SCEV *TruncExpr =
+ SE.getTruncateExpr(SE.getSCEV(Phi), Phis.back()->getType());
+ ExprToIVMap[TruncExpr] = Phi;
+ }
+ continue;
+ }
+
+ // Replacing a pointer phi with an integer phi or vice-versa doesn't make
+ // sense.
+ if (OrigPhiRef->getType()->isPointerTy() != Phi->getType()->isPointerTy())
+ continue;
+
+ if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+ Instruction *OrigInc = dyn_cast<Instruction>(
+ OrigPhiRef->getIncomingValueForBlock(LatchBlock));
+ Instruction *IsomorphicInc =
+ dyn_cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
+
+ if (OrigInc && IsomorphicInc) {
+ // If this phi has the same width but is more canonical, replace the
+ // original with it. As part of the "more canonical" determination,
+ // respect a prior decision to use an IV chain.
+ if (OrigPhiRef->getType() == Phi->getType() &&
+ !(ChainedPhis.count(Phi) ||
+ isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L)) &&
+ (ChainedPhis.count(Phi) ||
+ isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) {
+ std::swap(OrigPhiRef, Phi);
+ std::swap(OrigInc, IsomorphicInc);
+ }
+ // Replacing the congruent phi is sufficient because acyclic
+ // redundancy elimination, CSE/GVN, should handle the
+ // rest. However, once SCEV proves that a phi is congruent,
+ // it's often the head of an IV user cycle that is isomorphic
+ // with the original phi. It's worth eagerly cleaning up the
+ // common case of a single IV increment so that DeleteDeadPHIs
+ // can remove cycles that had postinc uses.
+ const SCEV *TruncExpr =
+ SE.getTruncateOrNoop(SE.getSCEV(OrigInc), IsomorphicInc->getType());
+ if (OrigInc != IsomorphicInc &&
+ TruncExpr == SE.getSCEV(IsomorphicInc) &&
+ SE.LI.replacementPreservesLCSSAForm(IsomorphicInc, OrigInc) &&
+ hoistIVInc(OrigInc, IsomorphicInc)) {
+ DEBUG_WITH_TYPE(DebugType,
+ dbgs() << "INDVARS: Eliminated congruent iv.inc: "
+ << *IsomorphicInc << '\n');
+ Value *NewInc = OrigInc;
+ if (OrigInc->getType() != IsomorphicInc->getType()) {
+ Instruction *IP = nullptr;
+ if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
+ IP = &*PN->getParent()->getFirstInsertionPt();
+ else
+ IP = OrigInc->getNextNode();
+
+ IRBuilder<> Builder(IP);
+ Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
+ NewInc = Builder.CreateTruncOrBitCast(
+ OrigInc, IsomorphicInc->getType(), IVName);
+ }
+ IsomorphicInc->replaceAllUsesWith(NewInc);
+ DeadInsts.emplace_back(IsomorphicInc);
+ }
+ }
+ }
+ DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Eliminated congruent iv: "
+ << *Phi << '\n');
DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Original iv: "
<< *OrigPhiRef << '\n');
- ++NumElim;
- Value *NewIV = OrigPhiRef;
- if (OrigPhiRef->getType() != Phi->getType()) {
- IRBuilder<> Builder(&*L->getHeader()->getFirstInsertionPt());
- Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
- NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName);
- }
- Phi->replaceAllUsesWith(NewIV);
- DeadInsts.emplace_back(Phi);
- }
- return NumElim;
-}
-
-Optional<ScalarEvolution::ValueOffsetPair>
-SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At,
- Loop *L) {
- using namespace llvm::PatternMatch;
-
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- // Look for suitable value in simple conditions at the loop exits.
- for (BasicBlock *BB : ExitingBlocks) {
- ICmpInst::Predicate Pred;
- Instruction *LHS, *RHS;
-
- if (!match(BB->getTerminator(),
- m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
- m_BasicBlock(), m_BasicBlock())))
- continue;
-
- if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At))
- return ScalarEvolution::ValueOffsetPair(LHS, nullptr);
-
- if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At))
- return ScalarEvolution::ValueOffsetPair(RHS, nullptr);
- }
-
- // Use expand's logic which is used for reusing a previous Value in
- // ExprValueMap.
- ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At);
- if (VO.first)
- return VO;
-
- // There is potential to make this significantly smarter, but this simple
- // heuristic already gets some interesting cases.
-
- // Cannot find a suitable value.
- return None;
-}
-
+ ++NumElim;
+ Value *NewIV = OrigPhiRef;
+ if (OrigPhiRef->getType() != Phi->getType()) {
+ IRBuilder<> Builder(&*L->getHeader()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
+ NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName);
+ }
+ Phi->replaceAllUsesWith(NewIV);
+ DeadInsts.emplace_back(Phi);
+ }
+ return NumElim;
+}
+
+Optional<ScalarEvolution::ValueOffsetPair>
+SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At,
+ Loop *L) {
+ using namespace llvm::PatternMatch;
+
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // Look for suitable value in simple conditions at the loop exits.
+ for (BasicBlock *BB : ExitingBlocks) {
+ ICmpInst::Predicate Pred;
+ Instruction *LHS, *RHS;
+
+ if (!match(BB->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
+ m_BasicBlock(), m_BasicBlock())))
+ continue;
+
+ if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At))
+ return ScalarEvolution::ValueOffsetPair(LHS, nullptr);
+
+ if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At))
+ return ScalarEvolution::ValueOffsetPair(RHS, nullptr);
+ }
+
+ // Use expand's logic which is used for reusing a previous Value in
+ // ExprValueMap.
+ ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At);
+ if (VO.first)
+ return VO;
+
+ // There is potential to make this significantly smarter, but this simple
+ // heuristic already gets some interesting cases.
+
+ // Can not find suitable value.
+ return None;
+}
+
template<typename T> static int costAndCollectOperands(
const SCEVOperand &WorkItem, const TargetTransformInfo &TTI,
TargetTransformInfo::TargetCostKind CostKind,
@@ -2318,33 +2318,33 @@ template<typename T> static int costAndCollectOperands(
return Cost;
}
-bool SCEVExpander::isHighCostExpansionHelper(
+bool SCEVExpander::isHighCostExpansionHelper(
const SCEVOperand &WorkItem, Loop *L, const Instruction &At,
int &BudgetRemaining, const TargetTransformInfo &TTI,
SmallPtrSetImpl<const SCEV *> &Processed,
SmallVectorImpl<SCEVOperand> &Worklist) {
- if (BudgetRemaining < 0)
- return true; // Already run out of budget, give up.
-
+ if (BudgetRemaining < 0)
+ return true; // Already run out of budget, give up.
+
const SCEV *S = WorkItem.S;
- // Was the cost of expansion of this expression already accounted for?
+ // Was the cost of expansion of this expression already accounted for?
if (!isa<SCEVConstant>(S) && !Processed.insert(S).second)
- return false; // We have already accounted for this expression.
-
- // If we can find an existing value for this scev available at the point "At"
- // then consider the expression cheap.
- if (getRelatedExistingExpansion(S, &At, L))
- return false; // Consider the expression to be free.
-
+ return false; // We have already accounted for this expression.
+
+ // If we can find an existing value for this scev available at the point "At"
+ // then consider the expression cheap.
+ if (getRelatedExistingExpansion(S, &At, L))
+ return false; // Consider the expression to be free.
+
TargetTransformInfo::TargetCostKind CostKind =
L->getHeader()->getParent()->hasMinSize()
? TargetTransformInfo::TCK_CodeSize
: TargetTransformInfo::TCK_RecipThroughput;
- switch (S->getSCEVType()) {
+ switch (S->getSCEVType()) {
case scCouldNotCompute:
llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
- case scUnknown:
+ case scUnknown:
// Assume to be zero-cost.
return false;
case scConstant: {
@@ -2356,7 +2356,7 @@ bool SCEVExpander::isHighCostExpansionHelper(
BudgetRemaining -= TTI.getIntImmCostInst(
WorkItem.ParentOpcode, WorkItem.OperandIdx, Imm, Ty, CostKind);
return BudgetRemaining < 0;
- }
+ }
case scTruncate:
case scPtrToInt:
case scZeroExtend:
@@ -2364,27 +2364,27 @@ bool SCEVExpander::isHighCostExpansionHelper(
int Cost =
costAndCollectOperands<SCEVCastExpr>(WorkItem, TTI, CostKind, Worklist);
BudgetRemaining -= Cost;
- return false; // Will answer upon next entry into this function.
- }
+ return false; // Will answer upon next entry into this function.
+ }
case scUDivExpr: {
- // UDivExpr is very likely a UDiv that ScalarEvolution's HowFarToZero or
- // HowManyLessThans produced to compute a precise expression, rather than a
- // UDiv from the user's code. If we can't find a UDiv in the code with some
- // simple searching, we need to account for its cost.
-
- // At the beginning of this function we already tried to find an existing
- // value for plain 'S'. Now try to look up 'S + 1' since it is a common
- // pattern involving division. This is just a simple search heuristic.
- if (getRelatedExistingExpansion(
- SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), &At, L))
- return false; // Consider it to be free.
-
+ // UDivExpr is very likely a UDiv that ScalarEvolution's HowFarToZero or
+ // HowManyLessThans produced to compute a precise expression, rather than a
+ // UDiv from the user's code. If we can't find a UDiv in the code with some
+ // simple searching, we need to account for its cost.
+
+ // At the beginning of this function we already tried to find an existing
+ // value for plain 'S'. Now try to look up 'S + 1' since it is a common
+ // pattern involving division. This is just a simple search heuristic.
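+ // For example, when S is a backedge-taken count of the form (%n /u 4), the
+ // loop guard frequently already materializes (%n /u 4) + 1 as the trip
+ // count, and the lookup below then treats this expansion as free.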
+ if (getRelatedExistingExpansion(
+ SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), &At, L))
+ return false; // Consider it to be free.
+
int Cost =
costAndCollectOperands<SCEVUDivExpr>(WorkItem, TTI, CostKind, Worklist);
- // Need to count the cost of this UDiv.
+ // Need to count the cost of this UDiv.
BudgetRemaining -= Cost;
- return false; // Will answer upon next entry into this function.
- }
+ return false; // Will answer upon next entry into this function.
+ }
case scAddExpr:
case scMulExpr:
case scUMaxExpr:
@@ -2392,14 +2392,14 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scUMinExpr:
case scSMinExpr: {
assert(cast<SCEVNAryExpr>(S)->getNumOperands() > 1 &&
- "Nary expr should have more than 1 operand.");
- // The simple nary expr will require one less op (or pair of ops)
- // than the number of its terms.
+ "Nary expr should have more than 1 operand.");
+ // The simple nary expr will require one less op (or pair of ops)
+ // than the number of its terms.
int Cost =
costAndCollectOperands<SCEVNAryExpr>(WorkItem, TTI, CostKind, Worklist);
BudgetRemaining -= Cost;
return BudgetRemaining < 0;
- }
+ }
case scAddRecExpr: {
assert(cast<SCEVAddRecExpr>(S)->getNumOperands() >= 2 &&
"Polynomial should be at least linear");
@@ -2409,173 +2409,173 @@ bool SCEVExpander::isHighCostExpansionHelper(
}
}
llvm_unreachable("Unknown SCEV kind!");
-}
-
-Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
- Instruction *IP) {
- assert(IP);
- switch (Pred->getKind()) {
- case SCEVPredicate::P_Union:
- return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP);
- case SCEVPredicate::P_Equal:
- return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP);
- case SCEVPredicate::P_Wrap: {
- auto *AddRecPred = cast<SCEVWrapPredicate>(Pred);
- return expandWrapPredicate(AddRecPred, IP);
- }
- }
- llvm_unreachable("Unknown SCEV predicate type");
-}
-
-Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
- Instruction *IP) {
+}
+
+Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
+ Instruction *IP) {
+ assert(IP);
+ switch (Pred->getKind()) {
+ case SCEVPredicate::P_Union:
+ return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP);
+ case SCEVPredicate::P_Equal:
+ return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP);
+ case SCEVPredicate::P_Wrap: {
+ auto *AddRecPred = cast<SCEVWrapPredicate>(Pred);
+ return expandWrapPredicate(AddRecPred, IP);
+ }
+ }
+ llvm_unreachable("Unknown SCEV predicate type");
+}
+
+Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
+ Instruction *IP) {
Value *Expr0 =
expandCodeForImpl(Pred->getLHS(), Pred->getLHS()->getType(), IP, false);
Value *Expr1 =
expandCodeForImpl(Pred->getRHS(), Pred->getRHS()->getType(), IP, false);
-
- Builder.SetInsertPoint(IP);
- auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check");
- return I;
-}
-
-Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
- Instruction *Loc, bool Signed) {
- assert(AR->isAffine() && "Cannot generate RT check for "
- "non-affine expression");
-
- SCEVUnionPredicate Pred;
- const SCEV *ExitCount =
- SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
-
+
+ Builder.SetInsertPoint(IP);
+ auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check");
+ return I;
+}
+
+Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
+ Instruction *Loc, bool Signed) {
+ assert(AR->isAffine() && "Cannot generate RT check for "
+ "non-affine expression");
+
+ SCEVUnionPredicate Pred;
+ const SCEV *ExitCount =
+ SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
+
assert(!isa<SCEVCouldNotCompute>(ExitCount) && "Invalid loop count");
-
- const SCEV *Step = AR->getStepRecurrence(SE);
- const SCEV *Start = AR->getStart();
-
- Type *ARTy = AR->getType();
- unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType());
- unsigned DstBits = SE.getTypeSizeInBits(ARTy);
-
- // The expression {Start,+,Step} has nusw/nssw if
- // Step < 0, Start - |Step| * Backedge <= Start
- // Step >= 0, Start + |Step| * Backedge > Start
- // and |Step| * Backedge doesn't unsigned overflow.
-
- IntegerType *CountTy = IntegerType::get(Loc->getContext(), SrcBits);
- Builder.SetInsertPoint(Loc);
+
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ const SCEV *Start = AR->getStart();
+
+ Type *ARTy = AR->getType();
+ unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType());
+ unsigned DstBits = SE.getTypeSizeInBits(ARTy);
+
+ // The expression {Start,+,Step} has nusw/nssw if
+ // Step < 0, Start - |Step| * Backedge <= Start
+ // Step >= 0, Start + |Step| * Backedge > Start
+ // and |Step| * Backedge doesn't unsigned overflow.
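+ // For example, for an i8 AR with Start = 100, Step = -3 and a backedge
+ // count of 40, |Step| * Backedge = 120 and Start - 120 wraps as unsigned,
+ // so the NUSW (Signed == false) check built below reports overflow.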
+
+ IntegerType *CountTy = IntegerType::get(Loc->getContext(), SrcBits);
+ Builder.SetInsertPoint(Loc);
Value *TripCountVal = expandCodeForImpl(ExitCount, CountTy, Loc, false);
-
- IntegerType *Ty =
- IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
- Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
-
+
+ IntegerType *Ty =
+ IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
+ Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
+
Value *StepValue = expandCodeForImpl(Step, Ty, Loc, false);
Value *NegStepValue =
expandCodeForImpl(SE.getNegativeSCEV(Step), Ty, Loc, false);
Value *StartValue = expandCodeForImpl(Start, ARExpandTy, Loc, false);
-
- ConstantInt *Zero =
- ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
-
- Builder.SetInsertPoint(Loc);
- // Compute |Step|
- Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
- Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
-
- // Get the backedge taken count and truncate or extend it to the AR type.
- Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
- auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
- Intrinsic::umul_with_overflow, Ty);
-
- // Compute |Step| * Backedge
- CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
- Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
- Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
-
- // Compute:
- // Start + |Step| * Backedge < Start
- // Start - |Step| * Backedge > Start
- Value *Add = nullptr, *Sub = nullptr;
- if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) {
- const SCEV *MulS = SE.getSCEV(MulV);
- const SCEV *NegMulS = SE.getNegativeSCEV(MulS);
- Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue),
- ARPtrTy);
- Sub = Builder.CreateBitCast(
- expandAddToGEP(NegMulS, ARPtrTy, Ty, StartValue), ARPtrTy);
- } else {
- Add = Builder.CreateAdd(StartValue, MulV);
- Sub = Builder.CreateSub(StartValue, MulV);
- }
-
- Value *EndCompareGT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
-
- Value *EndCompareLT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
-
- // Select the answer based on the sign of Step.
- Value *EndCheck =
- Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
-
- // If the backedge taken count type is larger than the AR type,
- // check that we don't drop any bits by truncating it. If we are
- // dropping bits, then we have overflow (unless the step is zero).
- if (SE.getTypeSizeInBits(CountTy) > SE.getTypeSizeInBits(Ty)) {
- auto MaxVal = APInt::getMaxValue(DstBits).zext(SrcBits);
- auto *BackedgeCheck =
- Builder.CreateICmp(ICmpInst::ICMP_UGT, TripCountVal,
- ConstantInt::get(Loc->getContext(), MaxVal));
- BackedgeCheck = Builder.CreateAnd(
- BackedgeCheck, Builder.CreateICmp(ICmpInst::ICMP_NE, StepValue, Zero));
-
- EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck);
- }
-
+
+ ConstantInt *Zero =
+ ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
+
+ Builder.SetInsertPoint(Loc);
+ // Compute |Step|
+ Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
+ Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
+
+ // Get the backedge taken count and truncate or extend it to the AR type.
+ Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
+ auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+ Intrinsic::umul_with_overflow, Ty);
+
+ // Compute |Step| * Backedge
+ CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+ Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+ Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+
+ // Compute:
+ // Start + |Step| * Backedge < Start
+ // Start - |Step| * Backedge > Start
+ Value *Add = nullptr, *Sub = nullptr;
+ if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) {
+ const SCEV *MulS = SE.getSCEV(MulV);
+ const SCEV *NegMulS = SE.getNegativeSCEV(MulS);
+ Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue),
+ ARPtrTy);
+ Sub = Builder.CreateBitCast(
+ expandAddToGEP(NegMulS, ARPtrTy, Ty, StartValue), ARPtrTy);
+ } else {
+ Add = Builder.CreateAdd(StartValue, MulV);
+ Sub = Builder.CreateSub(StartValue, MulV);
+ }
+
+ Value *EndCompareGT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+
+ Value *EndCompareLT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
+
+ // Select the answer based on the sign of Step.
+ Value *EndCheck =
+ Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
+
+ // If the backedge taken count type is larger than the AR type,
+ // check that we don't drop any bits by truncating it. If we are
+ // dropping bits, then we have overflow (unless the step is zero).
+ if (SE.getTypeSizeInBits(CountTy) > SE.getTypeSizeInBits(Ty)) {
+ auto MaxVal = APInt::getMaxValue(DstBits).zext(SrcBits);
+ auto *BackedgeCheck =
+ Builder.CreateICmp(ICmpInst::ICMP_UGT, TripCountVal,
+ ConstantInt::get(Loc->getContext(), MaxVal));
+ BackedgeCheck = Builder.CreateAnd(
+ BackedgeCheck, Builder.CreateICmp(ICmpInst::ICMP_NE, StepValue, Zero));
+
+ EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck);
+ }
+
return Builder.CreateOr(EndCheck, OfMul);
-}
-
-Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
- Instruction *IP) {
- const auto *A = cast<SCEVAddRecExpr>(Pred->getExpr());
- Value *NSSWCheck = nullptr, *NUSWCheck = nullptr;
-
- // Add a check for NUSW
- if (Pred->getFlags() & SCEVWrapPredicate::IncrementNUSW)
- NUSWCheck = generateOverflowCheck(A, IP, false);
-
- // Add a check for NSSW
- if (Pred->getFlags() & SCEVWrapPredicate::IncrementNSSW)
- NSSWCheck = generateOverflowCheck(A, IP, true);
-
- if (NUSWCheck && NSSWCheck)
- return Builder.CreateOr(NUSWCheck, NSSWCheck);
-
- if (NUSWCheck)
- return NUSWCheck;
-
- if (NSSWCheck)
- return NSSWCheck;
-
- return ConstantInt::getFalse(IP->getContext());
-}
-
-Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
- Instruction *IP) {
- auto *BoolType = IntegerType::get(IP->getContext(), 1);
- Value *Check = ConstantInt::getNullValue(BoolType);
-
- // Loop over all checks in this set.
- for (auto Pred : Union->getPredicates()) {
- auto *NextCheck = expandCodeForPredicate(Pred, IP);
- Builder.SetInsertPoint(IP);
- Check = Builder.CreateOr(Check, NextCheck);
- }
-
- return Check;
-}
-
+}
+
+Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
+ Instruction *IP) {
+ const auto *A = cast<SCEVAddRecExpr>(Pred->getExpr());
+ Value *NSSWCheck = nullptr, *NUSWCheck = nullptr;
+
+ // Add a check for NUSW
+ if (Pred->getFlags() & SCEVWrapPredicate::IncrementNUSW)
+ NUSWCheck = generateOverflowCheck(A, IP, false);
+
+ // Add a check for NSSW
+ if (Pred->getFlags() & SCEVWrapPredicate::IncrementNSSW)
+ NSSWCheck = generateOverflowCheck(A, IP, true);
+
+ if (NUSWCheck && NSSWCheck)
+ return Builder.CreateOr(NUSWCheck, NSSWCheck);
+
+ if (NUSWCheck)
+ return NUSWCheck;
+
+ if (NSSWCheck)
+ return NSSWCheck;
+
+ return ConstantInt::getFalse(IP->getContext());
+}
+
+Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
+ Instruction *IP) {
+ auto *BoolType = IntegerType::get(IP->getContext(), 1);
+ Value *Check = ConstantInt::getNullValue(BoolType);
+
+ // Loop over all checks in this set.
+ for (auto Pred : Union->getPredicates()) {
+ auto *NextCheck = expandCodeForPredicate(Pred, IP);
+ Builder.SetInsertPoint(IP);
+ Check = Builder.CreateOr(Check, NextCheck);
+ }
+
+ return Check;
+}
+
Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) {
assert(PreserveLCSSA);
SmallVector<Instruction *, 1> ToUpdate;
@@ -2604,83 +2604,83 @@ Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) {
return User->getOperand(OpIdx);
}
-namespace {
-// Search for a SCEV subexpression that is not safe to expand. Any expression
-// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
-// UDiv expressions. We don't know if the UDiv is derived from an IR divide
-// instruction, but the important thing is that we prove the denominator is
-// nonzero before expansion.
-//
-// IVUsers already checks that IV-derived expressions are safe. So this check is
-// only needed when the expression includes some subexpression that is not IV
-// derived.
-//
-// Currently, we only allow division by a nonzero constant here. If this is
-// inadequate, we could easily allow division by SCEVUnknown by using
-// ValueTracking to check isKnownNonZero().
-//
-// We cannot generally expand recurrences unless the step dominates the loop
-// header. The expander handles the special case of affine recurrences by
-// scaling the recurrence outside the loop, but this technique isn't generally
-// applicable. Expanding a nested recurrence outside a loop requires computing
-// binomial coefficients. This could be done, but the recurrence has to be in a
-// perfectly reduced form, which can't be guaranteed.
-struct SCEVFindUnsafe {
- ScalarEvolution &SE;
- bool IsUnsafe;
-
- SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
-
- bool follow(const SCEV *S) {
- if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
- if (!SC || SC->getValue()->isZero()) {
- IsUnsafe = true;
- return false;
- }
- }
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- const SCEV *Step = AR->getStepRecurrence(SE);
- if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) {
- IsUnsafe = true;
- return false;
- }
- }
- return true;
- }
- bool isDone() const { return IsUnsafe; }
-};
-}
-
-namespace llvm {
-bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
- SCEVFindUnsafe Search(SE);
- visitAll(S, Search);
- return !Search.IsUnsafe;
-}
-
-bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint,
- ScalarEvolution &SE) {
- if (!isSafeToExpand(S, SE))
- return false;
- // We have to prove that the expanded site of S dominates InsertionPoint.
- // This is easy when not in the same block, but hard when S is an instruction
- // to be expanded somewhere inside the same block as our insertion point.
- // What we really need here is something analogous to an OrderedBasicBlock,
- // but for the moment, we paper over the problem by handling two common and
- // cheap to check cases.
- if (SE.properlyDominates(S, InsertionPoint->getParent()))
- return true;
- if (SE.dominates(S, InsertionPoint->getParent())) {
- if (InsertionPoint->getParent()->getTerminator() == InsertionPoint)
- return true;
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
- for (const Value *V : InsertionPoint->operand_values())
- if (V == U->getValue())
- return true;
- }
- return false;
-}
+namespace {
+// Search for a SCEV subexpression that is not safe to expand. Any expression
+// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
+// UDiv expressions. We don't know if the UDiv is derived from an IR divide
+// instruction, but the important thing is that we prove the denominator is
+// nonzero before expansion.
+//
+// IVUsers already checks that IV-derived expressions are safe. So this check is
+// only needed when the expression includes some subexpression that is not IV
+// derived.
+//
+// Currently, we only allow division by a nonzero constant here. If this is
+// inadequate, we could easily allow division by SCEVUnknown by using
+// ValueTracking to check isKnownNonZero().
+//
+// We cannot generally expand recurrences unless the step dominates the loop
+// header. The expander handles the special case of affine recurrences by
+// scaling the recurrence outside the loop, but this technique isn't generally
+// applicable. Expanding a nested recurrence outside a loop requires computing
+// binomial coefficients. This could be done, but the recurrence has to be in a
+// perfectly reduced form, which can't be guaranteed.
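+//
+// For example, a SCEVUDivExpr such as (%a /u %b) with a non-constant divisor
+// is rejected below, since the expanded udiv would not be safe to execute
+// speculatively if %b could be zero.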
+struct SCEVFindUnsafe {
+ ScalarEvolution &SE;
+ bool IsUnsafe;
+
+ SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
+
+ bool follow(const SCEV *S) {
+ if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
+ if (!SC || SC->getValue()->isZero()) {
+ IsUnsafe = true;
+ return false;
+ }
+ }
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) {
+ IsUnsafe = true;
+ return false;
+ }
+ }
+ return true;
+ }
+ bool isDone() const { return IsUnsafe; }
+};
+}
+
+namespace llvm {
+bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
+ SCEVFindUnsafe Search(SE);
+ visitAll(S, Search);
+ return !Search.IsUnsafe;
+}
+
+bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint,
+ ScalarEvolution &SE) {
+ if (!isSafeToExpand(S, SE))
+ return false;
+ // We have to prove that the expanded site of S dominates InsertionPoint.
+ // This is easy when not in the same block, but hard when S is an instruction
+ // to be expanded somewhere inside the same block as our insertion point.
+ // What we really need here is something analogous to an OrderedBasicBlock,
+ // but for the moment, we paper over the problem by handling two common and
+ // cheap to check cases.
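+ // For example, if S is a SCEVUnknown wrapping %x and the insertion point
+ // already uses %x as an operand, %x is known to be available there.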
+ if (SE.properlyDominates(S, InsertionPoint->getParent()))
+ return true;
+ if (SE.dominates(S, InsertionPoint->getParent())) {
+ if (InsertionPoint->getParent()->getTerminator() == InsertionPoint)
+ return true;
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
+ for (const Value *V : InsertionPoint->operand_values())
+ if (V == U->getValue())
+ return true;
+ }
+ return false;
+}
SCEVExpanderCleaner::~SCEVExpanderCleaner() {
// Result is used, nothing to remove.
@@ -2716,5 +2716,5 @@ SCEVExpanderCleaner::~SCEVExpanderCleaner() {
I->replaceAllUsesWith(UndefValue::get(I->getType()));
I->eraseFromParent();
}
-}
+}
}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp
index af157e1a4d..de9560df97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1,158 +1,158 @@
-//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Peephole optimize the CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
+//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Peephole optimize the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <climits>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <set>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "simplifycfg"
-
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <set>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "simplifycfg"
+
cl::opt<bool> llvm::RequireAndPreserveDomTree(
"simplifycfg-require-and-preserve-domtree", cl::Hidden, cl::ZeroOrMore,
cl::init(false),
cl::desc("Temorary development switch used to gradually uplift SimplifyCFG "
"into preserving DomTree,"));
-// Chosen as 2 so as to be cheap, but still to have enough power to fold
-// a select, so the "clamp" idiom (of a min followed by a max) will be caught.
-// To catch this, we need to fold a compare and a select, hence '2' being the
-// minimum reasonable default.
-static cl::opt<unsigned> PHINodeFoldingThreshold(
- "phi-node-folding-threshold", cl::Hidden, cl::init(2),
- cl::desc(
- "Control the amount of phi node folding to perform (default = 2)"));
-
-static cl::opt<unsigned> TwoEntryPHINodeFoldingThreshold(
- "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4),
- cl::desc("Control the maximal total instruction cost that we are willing "
- "to speculatively execute to fold a 2-entry PHI node into a "
- "select (default = 4)"));
-
-static cl::opt<bool> DupRet(
- "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
- cl::desc("Duplicate return instructions into unconditional branches"));
-
-static cl::opt<bool>
+// Chosen as 2 so as to be cheap, but still to have enough power to fold
+// a select, so the "clamp" idiom (of a min followed by a max) will be caught.
+// To catch this, we need to fold a compare and a select, hence '2' being the
+// minimum reasonable default.
+static cl::opt<unsigned> PHINodeFoldingThreshold(
+ "phi-node-folding-threshold", cl::Hidden, cl::init(2),
+ cl::desc(
+ "Control the amount of phi node folding to perform (default = 2)"));
+
+static cl::opt<unsigned> TwoEntryPHINodeFoldingThreshold(
+ "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4),
+ cl::desc("Control the maximal total instruction cost that we are willing "
+ "to speculatively execute to fold a 2-entry PHI node into a "
+ "select (default = 4)"));
+
+static cl::opt<bool> DupRet(
+ "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
+ cl::desc("Duplicate return instructions into unconditional branches"));
+
+static cl::opt<bool>
HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true),
cl::desc("Hoist common instructions up to the parent block"));
static cl::opt<bool>
- SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
- cl::desc("Sink common instructions down to the end block"));
-
-static cl::opt<bool> HoistCondStores(
- "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
- cl::desc("Hoist conditional stores if an unconditional store precedes"));
-
-static cl::opt<bool> MergeCondStores(
- "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
- cl::desc("Hoist conditional stores even if an unconditional store does not "
- "precede - hoist multiple conditional stores into a single "
- "predicated store"));
-
-static cl::opt<bool> MergeCondStoresAggressively(
- "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false),
- cl::desc("When merging conditional stores, do so even if the resultant "
- "basic blocks are unlikely to be if-converted as a result"));
-
-static cl::opt<bool> SpeculateOneExpensiveInst(
- "speculate-one-expensive-inst", cl::Hidden, cl::init(true),
- cl::desc("Allow exactly one expensive instruction to be speculatively "
- "executed"));
-
-static cl::opt<unsigned> MaxSpeculationDepth(
- "max-speculation-depth", cl::Hidden, cl::init(10),
- cl::desc("Limit maximum recursion depth when calculating costs of "
- "speculatively executed instructions"));
-
-static cl::opt<int>
-MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10),
- cl::desc("Max size of a block which is still considered "
- "small enough to thread through"));
-
+ SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+ cl::desc("Sink common instructions down to the end block"));
+
+static cl::opt<bool> HoistCondStores(
+ "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
+ cl::desc("Hoist conditional stores if an unconditional store precedes"));
+
+static cl::opt<bool> MergeCondStores(
+ "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
+ cl::desc("Hoist conditional stores even if an unconditional store does not "
+ "precede - hoist multiple conditional stores into a single "
+ "predicated store"));
+
+static cl::opt<bool> MergeCondStoresAggressively(
+ "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false),
+ cl::desc("When merging conditional stores, do so even if the resultant "
+ "basic blocks are unlikely to be if-converted as a result"));
+
+static cl::opt<bool> SpeculateOneExpensiveInst(
+ "speculate-one-expensive-inst", cl::Hidden, cl::init(true),
+ cl::desc("Allow exactly one expensive instruction to be speculatively "
+ "executed"));
+
+static cl::opt<unsigned> MaxSpeculationDepth(
+ "max-speculation-depth", cl::Hidden, cl::init(10),
+ cl::desc("Limit maximum recursion depth when calculating costs of "
+ "speculatively executed instructions"));
+
+static cl::opt<int>
+MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10),
+ cl::desc("Max size of a block which is still considered "
+ "small enough to thread through"));
+
// Two is chosen to allow one negation and a logical combine.
static cl::opt<unsigned>
BranchFoldThreshold("simplifycfg-branch-fold-threshold", cl::Hidden,
@@ -160,15 +160,15 @@ static cl::opt<unsigned>
cl::desc("Maximum cost of combining conditions when "
"folding branches"));
-STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
-STATISTIC(NumLinearMaps,
- "Number of switch instructions turned into linear mapping");
-STATISTIC(NumLookupTables,
- "Number of switch instructions turned into lookup tables");
-STATISTIC(
- NumLookupTablesHoles,
- "Number of switch instructions turned into lookup tables (holes checked)");
-STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
+STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
+STATISTIC(NumLinearMaps,
+ "Number of switch instructions turned into linear mapping");
+STATISTIC(NumLookupTables,
+ "Number of switch instructions turned into lookup tables");
+STATISTIC(
+ NumLookupTablesHoles,
+ "Number of switch instructions turned into lookup tables (holes checked)");
+STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
STATISTIC(NumFoldValueComparisonIntoPredecessors,
"Number of value comparisons folded into predecessor basic blocks");
STATISTIC(NumFoldBranchToCommonDest,
@@ -181,495 +181,495 @@ STATISTIC(NumHoistCommonInstrs,
STATISTIC(NumSinkCommonCode,
"Number of common instruction 'blocks' sunk down to the end block");
STATISTIC(NumSinkCommonInstrs,
- "Number of common instructions sunk down to the end block");
-STATISTIC(NumSpeculations, "Number of speculatively executed instructions");
+ "Number of common instructions sunk down to the end block");
+STATISTIC(NumSpeculations, "Number of speculatively executed instructions");
STATISTIC(NumInvokes,
"Number of invokes with empty resume blocks simplified into calls");
-
-namespace {
-
-// The first field contains the value that the switch produces when a certain
-// case group is selected, and the second field is a vector containing the
-// cases composing the case group.
-using SwitchCaseResultVectorTy =
- SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>;
-
-// The first field contains the phi node that generates a result of the switch
-// and the second field contains the value generated for a certain case in the
-// switch for that PHI.
-using SwitchCaseResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
-
-/// ValueEqualityComparisonCase - Represents a case of a switch.
-struct ValueEqualityComparisonCase {
- ConstantInt *Value;
- BasicBlock *Dest;
-
- ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
- : Value(Value), Dest(Dest) {}
-
- bool operator<(ValueEqualityComparisonCase RHS) const {
- // Comparing pointers is ok as we only rely on the order for uniquing.
- return Value < RHS.Value;
- }
-
- bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
-};
-
-class SimplifyCFGOpt {
- const TargetTransformInfo &TTI;
+
+namespace {
+
+// The first field contains the value that the switch produces when a certain
+// case group is selected, and the second field is a vector containing the
+// cases composing the case group.
+using SwitchCaseResultVectorTy =
+ SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>;
+
+// The first field contains the phi node that generates a result of the switch
+// and the second field contains the value generated for a certain case in the
+// switch for that PHI.
+using SwitchCaseResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
+
+/// ValueEqualityComparisonCase - Represents a case of a switch.
+struct ValueEqualityComparisonCase {
+ ConstantInt *Value;
+ BasicBlock *Dest;
+
+ ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
+ : Value(Value), Dest(Dest) {}
+
+ bool operator<(ValueEqualityComparisonCase RHS) const {
+ // Comparing pointers is ok as we only rely on the order for uniquing.
+ return Value < RHS.Value;
+ }
+
+ bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
+};
+
+class SimplifyCFGOpt {
+ const TargetTransformInfo &TTI;
DomTreeUpdater *DTU;
- const DataLayout &DL;
+ const DataLayout &DL;
ArrayRef<WeakVH> LoopHeaders;
- const SimplifyCFGOptions &Options;
- bool Resimplify;
-
- Value *isValueEqualityComparison(Instruction *TI);
- BasicBlock *GetValueEqualityComparisonCases(
- Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases);
- bool SimplifyEqualityComparisonWithOnlyPredecessor(Instruction *TI,
- BasicBlock *Pred,
- IRBuilder<> &Builder);
+ const SimplifyCFGOptions &Options;
+ bool Resimplify;
+
+ Value *isValueEqualityComparison(Instruction *TI);
+ BasicBlock *GetValueEqualityComparisonCases(
+ Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases);
+ bool SimplifyEqualityComparisonWithOnlyPredecessor(Instruction *TI,
+ BasicBlock *Pred,
+ IRBuilder<> &Builder);
bool PerformValueComparisonIntoPredecessorFolding(Instruction *TI, Value *&CV,
Instruction *PTI,
IRBuilder<> &Builder);
- bool FoldValueComparisonIntoPredecessors(Instruction *TI,
- IRBuilder<> &Builder);
-
- bool simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
- bool simplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
- bool simplifySingleResume(ResumeInst *RI);
- bool simplifyCommonResume(ResumeInst *RI);
- bool simplifyCleanupReturn(CleanupReturnInst *RI);
- bool simplifyUnreachable(UnreachableInst *UI);
- bool simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
- bool simplifyIndirectBr(IndirectBrInst *IBI);
- bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder);
- bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
- bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
- bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder);
-
- bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
- IRBuilder<> &Builder);
-
- bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
- bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
- const TargetTransformInfo &TTI);
- bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
- BasicBlock *TrueBB, BasicBlock *FalseBB,
- uint32_t TrueWeight, uint32_t FalseWeight);
- bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
- const DataLayout &DL);
- bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select);
- bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI);
- bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder);
-
-public:
+ bool FoldValueComparisonIntoPredecessors(Instruction *TI,
+ IRBuilder<> &Builder);
+
+ bool simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
+ bool simplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
+ bool simplifySingleResume(ResumeInst *RI);
+ bool simplifyCommonResume(ResumeInst *RI);
+ bool simplifyCleanupReturn(CleanupReturnInst *RI);
+ bool simplifyUnreachable(UnreachableInst *UI);
+ bool simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
+ bool simplifyIndirectBr(IndirectBrInst *IBI);
+ bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder);
+ bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder);
+
+ bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
+ IRBuilder<> &Builder);
+
+ bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
+ bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const TargetTransformInfo &TTI);
+ bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
+ BasicBlock *TrueBB, BasicBlock *FalseBB,
+ uint32_t TrueWeight, uint32_t FalseWeight);
+ bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
+ const DataLayout &DL);
+ bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select);
+ bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI);
+ bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder);
+
+public:
SimplifyCFGOpt(const TargetTransformInfo &TTI, DomTreeUpdater *DTU,
const DataLayout &DL, ArrayRef<WeakVH> LoopHeaders,
- const SimplifyCFGOptions &Opts)
+ const SimplifyCFGOptions &Opts)
: TTI(TTI), DTU(DTU), DL(DL), LoopHeaders(LoopHeaders), Options(Opts) {
assert((!DTU || !DTU->hasPostDomTree()) &&
"SimplifyCFG is not yet capable of maintaining validity of a "
"PostDomTree, so don't ask for it.");
}
-
+
bool simplifyOnce(BasicBlock *BB);
bool simplifyOnceImpl(BasicBlock *BB);
- bool run(BasicBlock *BB);
-
- // Helper to set Resimplify and return change indication.
- bool requestResimplify() {
- Resimplify = true;
- return true;
- }
-};
-
-} // end anonymous namespace
-
-/// Return true if it is safe to merge these two
-/// terminator instructions together.
-static bool
-SafeToMergeTerminators(Instruction *SI1, Instruction *SI2,
- SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
- if (SI1 == SI2)
- return false; // Can't merge with self!
-
- // It is not safe to merge these two switch instructions if they have a common
- // successor, and if that successor has a PHI node, and if *that* PHI node has
- // conflicting incoming values from the two switch blocks.
- BasicBlock *SI1BB = SI1->getParent();
- BasicBlock *SI2BB = SI2->getParent();
-
- SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
- bool Fail = false;
- for (BasicBlock *Succ : successors(SI2BB))
- if (SI1Succs.count(Succ))
- for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
- PHINode *PN = cast<PHINode>(BBI);
- if (PN->getIncomingValueForBlock(SI1BB) !=
- PN->getIncomingValueForBlock(SI2BB)) {
- if (FailBlocks)
- FailBlocks->insert(Succ);
- Fail = true;
- }
- }
-
- return !Fail;
-}
-
-/// Update PHI nodes in Succ to indicate that there will now be entries in it
-/// from the 'NewPred' block. The values that will be flowing into the PHI nodes
-/// will be the same as those coming in from ExistPred, an existing predecessor
-/// of Succ.
-static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
- BasicBlock *ExistPred,
- MemorySSAUpdater *MSSAU = nullptr) {
- for (PHINode &PN : Succ->phis())
- PN.addIncoming(PN.getIncomingValueForBlock(ExistPred), NewPred);
- if (MSSAU)
- if (auto *MPhi = MSSAU->getMemorySSA()->getMemoryAccess(Succ))
- MPhi->addIncoming(MPhi->getIncomingValueForBlock(ExistPred), NewPred);
-}
-
-/// Compute an abstract "cost" of speculating the given instruction,
-/// which is assumed to be safe to speculate. TCC_Free means cheap,
-/// TCC_Basic means less cheap, and TCC_Expensive means prohibitively
-/// expensive.
-static unsigned ComputeSpeculationCost(const User *I,
- const TargetTransformInfo &TTI) {
- assert(isSafeToSpeculativelyExecute(I) &&
- "Instruction is not safe to speculatively execute!");
- return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
-}
-
-/// If we have a merge point of an "if condition" as accepted above,
-/// return true if the specified value dominates the block. We
-/// don't handle the true generality of domination here, just a special case
-/// which works well enough for us.
-///
-/// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
-/// see if V (which must be an instruction) and its recursive operands
-/// that do not dominate BB have a combined cost lower than CostRemaining and
-/// are non-trapping. If both are true, the instruction is inserted into the
-/// set and true is returned.
-///
-/// The cost for most non-trapping instructions is defined as 1 except for
-/// Select whose cost is 2.
-///
-/// After this function returns, CostRemaining is decreased by the cost of
-/// V plus its non-dominating operands. If that cost is greater than
-/// CostRemaining, false is returned and CostRemaining is undefined.
-static bool DominatesMergePoint(Value *V, BasicBlock *BB,
- SmallPtrSetImpl<Instruction *> &AggressiveInsts,
- int &BudgetRemaining,
- const TargetTransformInfo &TTI,
- unsigned Depth = 0) {
- // It is possible to hit a zero-cost cycle (phi/gep instructions for example),
- // so limit the recursion depth.
- // TODO: While this recursion limit does prevent pathological behavior, it
- // would be better to track visited instructions to avoid cycles.
- if (Depth == MaxSpeculationDepth)
- return false;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) {
- // Non-instructions all dominate instructions, but not all constantexprs
- // can be executed unconditionally.
- if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
- if (C->canTrap())
- return false;
- return true;
- }
- BasicBlock *PBB = I->getParent();
-
- // We don't want to allow weird loops that might have the "if condition" in
- // the bottom of this block.
- if (PBB == BB)
- return false;
-
- // If this instruction is defined in a block that contains an unconditional
- // branch to BB, then it must be in the 'conditional' part of the "if
- // statement". If not, it definitely dominates the region.
- BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
- if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB)
- return true;
-
- // If we have seen this instruction before, don't count it again.
- if (AggressiveInsts.count(I))
- return true;
-
- // Okay, it looks like the instruction IS in the "condition". Check to
- // see if it's a cheap instruction to unconditionally compute, and if it
- // only uses stuff defined outside of the condition. If so, hoist it out.
- if (!isSafeToSpeculativelyExecute(I))
- return false;
-
- BudgetRemaining -= ComputeSpeculationCost(I, TTI);
-
- // Allow exactly one instruction to be speculated regardless of its cost
- // (as long as it is safe to do so).
- // This is intended to flatten the CFG even if the instruction is a division
- // or other expensive operation. The speculation of an expensive instruction
- // is expected to be undone in CodeGenPrepare if the speculation has not
- // enabled further IR optimizations.
- if (BudgetRemaining < 0 &&
- (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0))
- return false;
-
- // Okay, we can only really hoist these out if their operands do
- // not take us over the cost threshold.
- for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
- if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI,
- Depth + 1))
- return false;
- // Okay, it's safe to do this! Remember this instruction.
- AggressiveInsts.insert(I);
- return true;
-}
-
-/// Extract ConstantInt from value, looking through IntToPtr
-/// and PointerNullValue. Return NULL if value is not a constant int.
-static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
- // Normal constant int.
- ConstantInt *CI = dyn_cast<ConstantInt>(V);
- if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
- return CI;
-
- // This is some kind of pointer constant. Turn it into a pointer-sized
- // ConstantInt if possible.
- IntegerType *PtrTy = cast<IntegerType>(DL.getIntPtrType(V->getType()));
-
- // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
- if (isa<ConstantPointerNull>(V))
- return ConstantInt::get(PtrTy, 0);
-
- // IntToPtr const int.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if (CE->getOpcode() == Instruction::IntToPtr)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(0))) {
- // The constant is very likely to have the right type already.
- if (CI->getType() == PtrTy)
- return CI;
- else
- return cast<ConstantInt>(
- ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
- }
- return nullptr;
-}
-
-namespace {
-
-/// Given a chain of or (||) or and (&&) comparisons of a value against a
-/// constant, this will try to recover the information required for a switch
-/// structure.
-/// It will depth-first traverse the chain of comparisons, looking for patterns
-/// like %a == 12 or %a < 4, and combine them to produce a set of integers
-/// representing the different cases for the switch.
-/// Note that if the chain is composed of '||' it will build the set of elements
-/// that match the comparisons (i.e. any of these values satisfies the chain),
-/// while for a chain of '&&' it will build the set of elements that make the
-/// test fail.
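-/// For example, gathering (%a == 2 || %a == 7 || %a == 9) produces
-/// CompValue = %a and Vals = {2, 7, 9}.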
-struct ConstantComparesGatherer {
- const DataLayout &DL;
-
- /// Value found for the switch comparison
- Value *CompValue = nullptr;
-
- /// Extra clause to be checked before the switch
- Value *Extra = nullptr;
-
- /// Set of integers to match in switch
- SmallVector<ConstantInt *, 8> Vals;
-
- /// Number of comparisons matched in the and/or chain
- unsigned UsedICmps = 0;
-
- /// Construct and compute the result for the comparison instruction Cond
- ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) {
- gather(Cond);
- }
-
- ConstantComparesGatherer(const ConstantComparesGatherer &) = delete;
- ConstantComparesGatherer &
- operator=(const ConstantComparesGatherer &) = delete;
-
-private:
- /// Try to set the current value used for the comparison; it succeeds only if
- /// it wasn't set before or if the new value is the same as the old one
- bool setValueOnce(Value *NewVal) {
- if (CompValue && CompValue != NewVal)
- return false;
- CompValue = NewVal;
- return (CompValue != nullptr);
- }
-
- /// Try to match Instruction "I" as a comparison against a constant and
- /// populate the array Vals with the set of values that match (or do not
- /// match depending on isEQ).
- /// Return false on failure. On success, the Value the comparison matched
- /// against is placed in CompValue.
- /// If CompValue is already set, the function is expected to fail if a match
- /// is found but the value compared to is different.
- bool matchInstruction(Instruction *I, bool isEQ) {
- // If this is an icmp against a constant, handle this as one of the cases.
- ICmpInst *ICI;
- ConstantInt *C;
- if (!((ICI = dyn_cast<ICmpInst>(I)) &&
- (C = GetConstantInt(I->getOperand(1), DL)))) {
- return false;
- }
-
- Value *RHSVal;
- const APInt *RHSC;
-
- // Pattern match a special case
- // (x & ~2^z) == y --> x == y || x == y|2^z
- // This undoes a transformation done by instcombine to fuse 2 compares.
- if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
- // It's a little bit hard to see why the following transformations are
- // correct. Here is a CVC3 program to verify them for 64-bit values:
-
- /*
- ONE : BITVECTOR(64) = BVZEROEXTEND(0bin1, 63);
- x : BITVECTOR(64);
- y : BITVECTOR(64);
- z : BITVECTOR(64);
- mask : BITVECTOR(64) = BVSHL(ONE, z);
- QUERY( (y & ~mask = y) =>
- ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
- );
- QUERY( (y | mask = y) =>
- ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
- );
- */
-
- // Please note that each pattern must be a dual implication (<--> or
- // iff). One directional implication can create spurious matches. If the
- // implication is only one-way, an unsatisfiable condition on the left
- // side can imply a satisfiable condition on the right side. Dual
- // implication ensures that satisfiable conditions are transformed to
- // other satisfiable conditions and unsatisfiable conditions are
- // transformed to other unsatisfiable conditions.
-
- // Here is a concrete example of an unsatisfiable condition on the left
- // implying a satisfiable condition on the right:
- //
- // mask = (1 << z)
- // (x & ~mask) == y --> (x == y || x == (y | mask))
- //
- // Substituting y = 3, z = 0 yields:
- // (x & -2) == 3 --> (x == 3 || x == 2)
-
- // Pattern match a special case:
- /*
- QUERY( (y & ~mask = y) =>
- ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
- );
- */
- if (match(ICI->getOperand(0),
- m_And(m_Value(RHSVal), m_APInt(RHSC)))) {
- APInt Mask = ~*RHSC;
- if (Mask.isPowerOf2() && (C->getValue() & ~Mask) == C->getValue()) {
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(RHSVal))
- return false;
-
- Vals.push_back(C);
- Vals.push_back(
- ConstantInt::get(C->getContext(),
- C->getValue() | Mask));
- UsedICmps++;
- return true;
- }
- }
-
- // Pattern match a special case:
- /*
- QUERY( (y | mask = y) =>
- ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
- );
- */
- if (match(ICI->getOperand(0),
- m_Or(m_Value(RHSVal), m_APInt(RHSC)))) {
- APInt Mask = *RHSC;
- if (Mask.isPowerOf2() && (C->getValue() | Mask) == C->getValue()) {
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(RHSVal))
- return false;
-
- Vals.push_back(C);
- Vals.push_back(ConstantInt::get(C->getContext(),
- C->getValue() & ~Mask));
- UsedICmps++;
- return true;
- }
- }
-
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(ICI->getOperand(0)))
- return false;
-
- UsedICmps++;
- Vals.push_back(C);
- return ICI->getOperand(0);
- }
-
- // If we have "x ult 3", for example, then we can add 0,1,2 to the set.
- ConstantRange Span = ConstantRange::makeAllowedICmpRegion(
- ICI->getPredicate(), C->getValue());
-
- // Shift the range if the compare is fed by an add. This is the range
- // compare idiom as emitted by instcombine.
- Value *CandidateVal = I->getOperand(0);
- if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_APInt(RHSC)))) {
- Span = Span.subtract(*RHSC);
- CandidateVal = RHSVal;
- }
-
- // If this is an and/!= check, then we are looking to build the set of
- // values that *don't* pass the and chain, i.e. to turn "x ugt 2" into
- // x != 0 && x != 1 && x != 2.
- if (!isEQ)
- Span = Span.inverse();
-
- // If there are a ton of values, we don't want to make a ginormous switch.
- if (Span.isSizeLargerThan(8) || Span.isEmptySet()) {
- return false;
- }
-
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(CandidateVal))
- return false;
-
- // Add all values from the range to the set
- for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
- Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
-
- UsedICmps++;
- return true;
- }
-
- /// Given a potentially 'or'd or 'and'd together collection of icmp
- /// eq/ne/lt/gt instructions that compare a value against a constant, extract
- /// the value being compared, and stick the list constants into the Vals
- /// vector.
- /// One "Extra" case is allowed to differ from the other.
- void gather(Value *V) {
+ bool run(BasicBlock *BB);
+
+ // Helper to set Resimplify and return change indication.
+ bool requestResimplify() {
+ Resimplify = true;
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+/// Return true if it is safe to merge these two
+/// terminator instructions together.
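+/// If FailBlocks is provided, any common successors whose PHI nodes have
+/// conflicting incoming values from the two blocks are collected into it.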
+static bool
+SafeToMergeTerminators(Instruction *SI1, Instruction *SI2,
+ SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
+ if (SI1 == SI2)
+ return false; // Can't merge with self!
+
+ // It is not safe to merge these two switch instructions if they have a common
+ // successor, and if that successor has a PHI node, and if *that* PHI node has
+ // conflicting incoming values from the two switch blocks.
+ BasicBlock *SI1BB = SI1->getParent();
+ BasicBlock *SI2BB = SI2->getParent();
+
+ SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ bool Fail = false;
+ for (BasicBlock *Succ : successors(SI2BB))
+ if (SI1Succs.count(Succ))
+ for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ if (PN->getIncomingValueForBlock(SI1BB) !=
+ PN->getIncomingValueForBlock(SI2BB)) {
+ if (FailBlocks)
+ FailBlocks->insert(Succ);
+ Fail = true;
+ }
+ }
+
+ return !Fail;
+}
+
+/// Update PHI nodes in Succ to indicate that there will now be entries in it
+/// from the 'NewPred' block. The values that will be flowing into the PHI nodes
+/// will be the same as those coming in from ExistPred, an existing predecessor
+/// of Succ.
+static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
+ BasicBlock *ExistPred,
+ MemorySSAUpdater *MSSAU = nullptr) {
+ for (PHINode &PN : Succ->phis())
+ PN.addIncoming(PN.getIncomingValueForBlock(ExistPred), NewPred);
+ if (MSSAU)
+ if (auto *MPhi = MSSAU->getMemorySSA()->getMemoryAccess(Succ))
+ MPhi->addIncoming(MPhi->getIncomingValueForBlock(ExistPred), NewPred);
+}
+
+/// Compute an abstract "cost" of speculating the given instruction,
+/// which is assumed to be safe to speculate. TCC_Free means cheap,
+/// TCC_Basic means less cheap, and TCC_Expensive means prohibitively
+/// expensive.
+static unsigned ComputeSpeculationCost(const User *I,
+ const TargetTransformInfo &TTI) {
+ assert(isSafeToSpeculativelyExecute(I) &&
+ "Instruction is not safe to speculatively execute!");
+ return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+}
+
+/// If we have a merge point of an "if condition" as accepted above,
+/// return true if the specified value dominates the block. We
+/// don't handle the true generality of domination here, just a special case
+/// which works well enough for us.
+///
+/// If V does not dominate BB, we check to see if V (which must be an
+/// instruction) and its recursive operands that do not dominate BB have a
+/// combined cost lower than BudgetRemaining and are non-trapping. If both are
+/// true, the instruction is inserted into the set and true is returned.
+///
+/// The cost of each instruction is estimated with ComputeSpeculationCost,
+/// i.e. its TTI speculation cost.
+///
+/// After this function returns, BudgetRemaining is decreased by the cost of
+/// V plus its non-dominating operands. If the budget is exhausted, false is
+/// returned and BudgetRemaining is undefined, except that a single expensive
+/// instruction may still be speculated at the top level.
+static bool DominatesMergePoint(Value *V, BasicBlock *BB,
+ SmallPtrSetImpl<Instruction *> &AggressiveInsts,
+ int &BudgetRemaining,
+ const TargetTransformInfo &TTI,
+ unsigned Depth = 0) {
+ // It is possible to hit a zero-cost cycle (phi/gep instructions for example),
+ // so limit the recursion depth.
+ // TODO: While this recursion limit does prevent pathological behavior, it
+ // would be better to track visited instructions to avoid cycles.
+ if (Depth == MaxSpeculationDepth)
+ return false;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ // Non-instructions all dominate instructions, but not all constantexprs
+ // can be executed unconditionally.
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
+ if (C->canTrap())
+ return false;
+ return true;
+ }
+ BasicBlock *PBB = I->getParent();
+
+ // We don't want to allow weird loops that might have the "if condition" in
+ // the bottom of this block.
+ if (PBB == BB)
+ return false;
+
+ // If this instruction is defined in a block that contains an unconditional
+ // branch to BB, then it must be in the 'conditional' part of the "if
+ // statement". If not, it definitely dominates the region.
+ BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
+ if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB)
+ return true;
+
+ // If we have seen this instruction before, don't count it again.
+ if (AggressiveInsts.count(I))
+ return true;
+
+ // Okay, it looks like the instruction IS in the "condition". Check to
+ // see if it's a cheap instruction to unconditionally compute, and if it
+ // only uses stuff defined outside of the condition. If so, hoist it out.
+ if (!isSafeToSpeculativelyExecute(I))
+ return false;
+
+ BudgetRemaining -= ComputeSpeculationCost(I, TTI);
+
+ // Allow exactly one instruction to be speculated regardless of its cost
+ // (as long as it is safe to do so).
+ // This is intended to flatten the CFG even if the instruction is a division
+ // or other expensive operation. The speculation of an expensive instruction
+ // is expected to be undone in CodeGenPrepare if the speculation has not
+ // enabled further IR optimizations.
+ if (BudgetRemaining < 0 &&
+ (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0))
+ return false;
+
+ // Okay, we can only really hoist these out if their operands do
+ // not take us over the cost threshold.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+ if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI,
+ Depth + 1))
+ return false;
+ // Okay, it's safe to do this! Remember this instruction.
+ AggressiveInsts.insert(I);
+ return true;
+}
+
+/// Extract ConstantInt from value, looking through IntToPtr
+/// and PointerNullValue. Return NULL if value is not a constant int.
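+///
+/// For example, with 64-bit pointers, "i8* null" is mapped to "i64 0" and
+/// "inttoptr (i64 42 to i8*)" is mapped to "i64 42".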
+static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
+ // Normal constant int.
+ ConstantInt *CI = dyn_cast<ConstantInt>(V);
+ if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
+ return CI;
+
+ // This is some kind of pointer constant. Turn it into a pointer-sized
+ // ConstantInt if possible.
+ IntegerType *PtrTy = cast<IntegerType>(DL.getIntPtrType(V->getType()));
+
+ // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
+ if (isa<ConstantPointerNull>(V))
+ return ConstantInt::get(PtrTy, 0);
+
+ // IntToPtr const int.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::IntToPtr)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(0))) {
+ // The constant is very likely to have the right type already.
+ if (CI->getType() == PtrTy)
+ return CI;
+ else
+ return cast<ConstantInt>(
+ ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
+ }
+ return nullptr;
+}
+
+namespace {
+
+/// Given a chain of or (||) or and (&&) comparison of a value against a
+/// constant, this will try to recover the information required for a switch
+/// structure.
+/// It will depth-first traverse the chain of comparisons, looking for patterns
+/// like %a == 12 or %a < 4, and combine them to produce a set of integers
+/// representing the different cases for the switch.
+/// Note that if the chain is composed of '||' it will build the set of elements
+/// that match the comparisons (i.e. any of these values satisfies the chain),
+/// while for a chain of '&&' it will build the set of elements that make the
+/// test fail.
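+///
+/// For example, gathering the chain
+///   %c1 = icmp eq i32 %a, 12
+///   %c2 = icmp eq i32 %a, 42
+///   %or = or i1 %c1, %c2
+/// yields CompValue = %a, Vals = {12, 42} and UsedICmps = 2.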
+struct ConstantComparesGatherer {
+ const DataLayout &DL;
+
+ /// Value found for the switch comparison
+ Value *CompValue = nullptr;
+
+ /// Extra clause to be checked before the switch
+ Value *Extra = nullptr;
+
+ /// Set of integers to match in switch
+ SmallVector<ConstantInt *, 8> Vals;
+
+ /// Number of comparisons matched in the and/or chain
+ unsigned UsedICmps = 0;
+
+ /// Construct and compute the result for the comparison instruction Cond
+ ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) {
+ gather(Cond);
+ }
+
+ ConstantComparesGatherer(const ConstantComparesGatherer &) = delete;
+ ConstantComparesGatherer &
+ operator=(const ConstantComparesGatherer &) = delete;
+
+private:
+ /// Try to set the current value used for the comparison; it succeeds only if
+ /// it wasn't set before or if the new value is the same as the old one.
+ bool setValueOnce(Value *NewVal) {
+ if (CompValue && CompValue != NewVal)
+ return false;
+ CompValue = NewVal;
+ return (CompValue != nullptr);
+ }
+
+ /// Try to match Instruction "I" as a comparison against a constant and
+ /// populate the array Vals with the set of values that match (or do not
+ /// match depending on isEQ).
+ /// Return false on failure. On success, the Value the comparison matched
+ /// against is placed in CompValue.
+ /// If CompValue is already set, the function is expected to fail if a match
+ /// is found but the value compared to is different.
+ bool matchInstruction(Instruction *I, bool isEQ) {
+ // If this is an icmp against a constant, handle this as one of the cases.
+ ICmpInst *ICI;
+ ConstantInt *C;
+ if (!((ICI = dyn_cast<ICmpInst>(I)) &&
+ (C = GetConstantInt(I->getOperand(1), DL)))) {
+ return false;
+ }
+
+ Value *RHSVal;
+ const APInt *RHSC;
+
+ // Pattern match a special case
+ // (x & ~2^z) == y --> x == y || x == y|2^z
+ // This undoes a transformation done by instcombine to fuse 2 compares.
+ if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
+ // It's a little bit hard to see why the following transformations are
+ // correct. Here is a CVC3 program to verify them for 64-bit values:
+
+ /*
+ ONE : BITVECTOR(64) = BVZEROEXTEND(0bin1, 63);
+ x : BITVECTOR(64);
+ y : BITVECTOR(64);
+ z : BITVECTOR(64);
+ mask : BITVECTOR(64) = BVSHL(ONE, z);
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
+
+ // Please note that each pattern must be a dual implication (<--> or
+ // iff). One directional implication can create spurious matches. If the
+ // implication is only one-way, an unsatisfiable condition on the left
+ // side can imply a satisfiable condition on the right side. Dual
+ // implication ensures that satisfiable conditions are transformed to
+ // other satisfiable conditions and unsatisfiable conditions are
+ // transformed to other unsatisfiable conditions.
+
+ // Here is a concrete example of an unsatisfiable condition on the left
+ // implying a satisfiable condition on the right:
+ //
+ // mask = (1 << z)
+ // (x & ~mask) == y --> (x == y || x == (y | mask))
+ //
+ // Substituting y = 3, z = 0 yields:
+ // (x & -2) == 3 --> (x == 3 || x == 2)
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ */
+ if (match(ICI->getOperand(0),
+ m_And(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = ~*RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() & ~Mask) == C->getValue()) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(RHSVal))
+ return false;
+
+ Vals.push_back(C);
+ Vals.push_back(
+ ConstantInt::get(C->getContext(),
+ C->getValue() | Mask));
+ UsedICmps++;
+ return true;
+ }
+ }
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
+ if (match(ICI->getOperand(0),
+ m_Or(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = *RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() | Mask) == C->getValue()) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(RHSVal))
+ return false;
+
+ Vals.push_back(C);
+ Vals.push_back(ConstantInt::get(C->getContext(),
+ C->getValue() & ~Mask));
+ UsedICmps++;
+ return true;
+ }
+ }
+
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(ICI->getOperand(0)))
+ return false;
+
+ UsedICmps++;
+ Vals.push_back(C);
+ return ICI->getOperand(0);
+ }
+
+ // If we have "x ult 3", for example, then we can add 0,1,2 to the set.
+ ConstantRange Span = ConstantRange::makeAllowedICmpRegion(
+ ICI->getPredicate(), C->getValue());
+
+ // Shift the range if the compare is fed by an add. This is the range
+ // compare idiom as emitted by instcombine.
+ Value *CandidateVal = I->getOperand(0);
+ if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_APInt(RHSC)))) {
+ Span = Span.subtract(*RHSC);
+ CandidateVal = RHSVal;
+ }
+
+ // If this is an and/!= check, then we are looking to build the set of
+ // values that *don't* pass the and chain, i.e. to turn "x ugt 2" into
+ // x != 0 && x != 1 && x != 2.
+ if (!isEQ)
+ Span = Span.inverse();
+
+ // If there are a ton of values, we don't want to make a ginormous switch.
+ if (Span.isSizeLargerThan(8) || Span.isEmptySet()) {
+ return false;
+ }
+
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(CandidateVal))
+ return false;
+
+ // Add all values from the range to the set
+ for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+ Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+
+ UsedICmps++;
+ return true;
+ }
+
+ /// Given a potentially 'or'd or 'and'd together collection of icmp
+ /// eq/ne/lt/gt instructions that compare a value against a constant, extract
+ /// the value being compared, and stick the list constants into the Vals
+ /// vector.
+ /// One "Extra" case is allowed to differ from the other.
+ void gather(Value *V) {
bool isEQ = match(V, m_LogicalOr(m_Value(), m_Value()));
-
- // Keep a stack (SmallVector for efficiency) for depth-first traversal
- SmallVector<Value *, 8> DFT;
- SmallPtrSet<Value *, 8> Visited;
-
- // Initialize
- Visited.insert(V);
- DFT.push_back(V);
-
- while (!DFT.empty()) {
- V = DFT.pop_back_val();
-
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- // If it is a || (or && depending on isEQ), process the operands.
+
+ // Keep a stack (SmallVector for efficiency) for depth-first traversal
+ SmallVector<Value *, 8> DFT;
+ SmallPtrSet<Value *, 8> Visited;
+
+ // Initialize
+ Visited.insert(V);
+ DFT.push_back(V);
+
+ while (!DFT.empty()) {
+ V = DFT.pop_back_val();
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // If it is a || (or && depending on isEQ), process the operands.
Value *Op0, *Op1;
if (isEQ ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1)))
: match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) {
@@ -678,245 +678,245 @@ private:
if (Visited.insert(Op0).second)
DFT.push_back(Op0);
- continue;
- }
-
- // Try to match the current instruction
- if (matchInstruction(I, isEQ))
- // Match succeed, continue the loop
- continue;
- }
-
- // One element of the sequence of || (or &&) could not be match as a
- // comparison against the same value as the others.
- // We allow only one "Extra" case to be checked before the switch
- if (!Extra) {
- Extra = V;
- continue;
- }
- // Failed to parse a proper sequence, abort now
- CompValue = nullptr;
- break;
- }
- }
-};
-
-} // end anonymous namespace
-
-static void EraseTerminatorAndDCECond(Instruction *TI,
- MemorySSAUpdater *MSSAU = nullptr) {
- Instruction *Cond = nullptr;
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Cond = dyn_cast<Instruction>(SI->getCondition());
- } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional())
- Cond = dyn_cast<Instruction>(BI->getCondition());
- } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
- Cond = dyn_cast<Instruction>(IBI->getAddress());
- }
-
- TI->eraseFromParent();
- if (Cond)
- RecursivelyDeleteTriviallyDeadInstructions(Cond, nullptr, MSSAU);
-}
-
-/// Return true if the specified terminator checks
-/// to see if a value is equal to constant integer value.
-Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
- Value *CV = nullptr;
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- // Do not permit merging of large switch instructions into their
- // predecessors unless there is only one predecessor.
- if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors()))
- CV = SI->getCondition();
- } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
- if (BI->isConditional() && BI->getCondition()->hasOneUse())
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
- if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL))
- CV = ICI->getOperand(0);
- }
-
- // Unwrap any lossless ptrtoint cast.
- if (CV) {
- if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
- Value *Ptr = PTII->getPointerOperand();
- if (PTII->getType() == DL.getIntPtrType(Ptr->getType()))
- CV = Ptr;
- }
- }
- return CV;
-}
-
-/// Given a value comparison instruction,
-/// decode all of the 'cases' that it represents and return the 'default' block.
-BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
- Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Cases.reserve(SI->getNumCases());
- for (auto Case : SI->cases())
- Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
- Case.getCaseSuccessor()));
- return SI->getDefaultDest();
- }
-
- BranchInst *BI = cast<BranchInst>(TI);
- ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
- BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
- Cases.push_back(ValueEqualityComparisonCase(
- GetConstantInt(ICI->getOperand(1), DL), Succ));
- return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
-}
-
-/// Given a vector of bb/value pairs, remove any entries
-/// in the list that match the specified block.
-static void
-EliminateBlockCases(BasicBlock *BB,
- std::vector<ValueEqualityComparisonCase> &Cases) {
+ continue;
+ }
+
+ // Try to match the current instruction
+ if (matchInstruction(I, isEQ))
+ // Match succeeded, continue the loop
+ continue;
+ }
+
+ // One element of the sequence of || (or &&) could not be matched as a
+ // comparison against the same value as the others.
+ // We allow only one "Extra" case to be checked before the switch
+ if (!Extra) {
+ Extra = V;
+ continue;
+ }
+ // Failed to parse a proper sequence, abort now
+ CompValue = nullptr;
+ break;
+ }
+ }
+};
+
+} // end anonymous namespace
+
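+/// Erase the terminator TI and, if its condition (or indirectbr address) has
+/// become trivially dead, recursively delete it as well.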
+static void EraseTerminatorAndDCECond(Instruction *TI,
+ MemorySSAUpdater *MSSAU = nullptr) {
+ Instruction *Cond = nullptr;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cond = dyn_cast<Instruction>(SI->getCondition());
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional())
+ Cond = dyn_cast<Instruction>(BI->getCondition());
+ } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
+ Cond = dyn_cast<Instruction>(IBI->getAddress());
+ }
+
+ TI->eraseFromParent();
+ if (Cond)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, nullptr, MSSAU);
+}
+
+/// If the specified terminator checks whether a value is equal to a constant
+/// integer, return that value; otherwise return null.
+Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
+ Value *CV = nullptr;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ // Do not permit merging of large switch instructions into their
+ // predecessors unless there is only one predecessor.
+ if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors()))
+ CV = SI->getCondition();
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional() && BI->getCondition()->hasOneUse())
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
+ if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL))
+ CV = ICI->getOperand(0);
+ }
+
+ // Unwrap any lossless ptrtoint cast.
+ if (CV) {
+ if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
+ Value *Ptr = PTII->getPointerOperand();
+ if (PTII->getType() == DL.getIntPtrType(Ptr->getType()))
+ CV = Ptr;
+ }
+ }
+ return CV;
+}
+
+/// Given a value comparison instruction,
+/// decode all of the 'cases' that it represents and return the 'default' block.
+BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
+ Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cases.reserve(SI->getNumCases());
+ for (auto Case : SI->cases())
+ Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+ return SI->getDefaultDest();
+ }
+
+ BranchInst *BI = cast<BranchInst>(TI);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
+ Cases.push_back(ValueEqualityComparisonCase(
+ GetConstantInt(ICI->getOperand(1), DL), Succ));
+ return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+}
+
+/// Given a vector of bb/value pairs, remove any entries
+/// in the list that match the specified block.
+static void
+EliminateBlockCases(BasicBlock *BB,
+ std::vector<ValueEqualityComparisonCase> &Cases) {
llvm::erase_value(Cases, BB);
-}
-
-/// Return true if there are any keys in C1 that exist in C2 as well.
-static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
- std::vector<ValueEqualityComparisonCase> &C2) {
- std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
-
- // Make V1 be smaller than V2.
- if (V1->size() > V2->size())
- std::swap(V1, V2);
-
- if (V1->empty())
- return false;
- if (V1->size() == 1) {
- // Just scan V2.
- ConstantInt *TheVal = (*V1)[0].Value;
- for (unsigned i = 0, e = V2->size(); i != e; ++i)
- if (TheVal == (*V2)[i].Value)
- return true;
- }
-
- // Otherwise, just sort both lists and compare element by element.
- array_pod_sort(V1->begin(), V1->end());
- array_pod_sort(V2->begin(), V2->end());
- unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
- while (i1 != e1 && i2 != e2) {
- if ((*V1)[i1].Value == (*V2)[i2].Value)
- return true;
- if ((*V1)[i1].Value < (*V2)[i2].Value)
- ++i1;
- else
- ++i2;
- }
- return false;
-}
-
-// Set branch weights on SwitchInst. This sets the metadata if there is at
-// least one non-zero weight.
-static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights) {
- // Check that there is at least one non-zero weight. Otherwise, pass
- // nullptr to setMetadata which will erase the existing metadata.
- MDNode *N = nullptr;
- if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; }))
- N = MDBuilder(SI->getParent()->getContext()).createBranchWeights(Weights);
- SI->setMetadata(LLVMContext::MD_prof, N);
-}
-
-// Similar to the above, but for branch and select instructions that take
-// exactly 2 weights.
-static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
- uint32_t FalseWeight) {
- assert(isa<BranchInst>(I) || isa<SelectInst>(I));
- // Check that there is at least one non-zero weight. Otherwise, pass
- // nullptr to setMetadata which will erase the existing metadata.
- MDNode *N = nullptr;
- if (TrueWeight || FalseWeight)
- N = MDBuilder(I->getParent()->getContext())
- .createBranchWeights(TrueWeight, FalseWeight);
- I->setMetadata(LLVMContext::MD_prof, N);
-}
-
-/// If TI is known to be a terminator instruction and its block is known to
-/// only have a single predecessor block, check to see if that predecessor is
-/// also a value comparison with the same value, and if that comparison
-/// determines the outcome of this comparison. If so, simplify TI. This does a
-/// very limited form of jump threading.
-bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
- Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
- Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
- if (!PredVal)
- return false; // Not a value comparison in predecessor.
-
- Value *ThisVal = isValueEqualityComparison(TI);
- assert(ThisVal && "This isn't a value comparison!!");
- if (ThisVal != PredVal)
- return false; // Different predicates.
-
- // TODO: Preserve branch weight metadata, similarly to how
- // FoldValueComparisonIntoPredecessors preserves it.
-
- // Find out information about when control will move from Pred to TI's block.
- std::vector<ValueEqualityComparisonCase> PredCases;
- BasicBlock *PredDef =
- GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases);
- EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
-
- // Find information about how control leaves this block.
- std::vector<ValueEqualityComparisonCase> ThisCases;
- BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
- EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
-
- // If TI's block is the default block from Pred's comparison, potentially
- // simplify TI based on this knowledge.
- if (PredDef == TI->getParent()) {
- // If we are here, we know that the value is none of those cases listed in
- // PredCases. If there are any cases in ThisCases that are in PredCases, we
- // can simplify TI.
- if (!ValuesOverlap(PredCases, ThisCases))
- return false;
-
- if (isa<BranchInst>(TI)) {
- // Okay, one of the successors of this condbr is dead. Convert it to a
- // uncond br.
- assert(ThisCases.size() == 1 && "Branch can only have one case!");
- // Insert the new branch.
- Instruction *NI = Builder.CreateBr(ThisDef);
- (void)NI;
-
- // Remove PHI node entries for the dead edge.
+}
+
+/// Return true if there are any keys in C1 that exist in C2 as well.
+static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
+ std::vector<ValueEqualityComparisonCase> &C2) {
+ std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
+
+ // Make V1 be smaller than V2.
+ if (V1->size() > V2->size())
+ std::swap(V1, V2);
+
+ if (V1->empty())
+ return false;
+ if (V1->size() == 1) {
+ // Just scan V2.
+ ConstantInt *TheVal = (*V1)[0].Value;
+ for (unsigned i = 0, e = V2->size(); i != e; ++i)
+ if (TheVal == (*V2)[i].Value)
+ return true;
+ }
+
+ // Otherwise, just sort both lists and compare element by element.
+ array_pod_sort(V1->begin(), V1->end());
+ array_pod_sort(V2->begin(), V2->end());
+ unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
+ while (i1 != e1 && i2 != e2) {
+ if ((*V1)[i1].Value == (*V2)[i2].Value)
+ return true;
+ if ((*V1)[i1].Value < (*V2)[i2].Value)
+ ++i1;
+ else
+ ++i2;
+ }
+ return false;
+}
+
+// Set branch weights on SwitchInst. This sets the metadata if there is at
+// least one non-zero weight.
+static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights) {
+ // Check that there is at least one non-zero weight. Otherwise, pass
+ // nullptr to setMetadata which will erase the existing metadata.
+ MDNode *N = nullptr;
+ if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; }))
+ N = MDBuilder(SI->getParent()->getContext()).createBranchWeights(Weights);
+ SI->setMetadata(LLVMContext::MD_prof, N);
+}
+
+// Similar to the above, but for branch and select instructions that take
+// exactly 2 weights.
+static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
+ uint32_t FalseWeight) {
+ assert(isa<BranchInst>(I) || isa<SelectInst>(I));
+ // Check that there is at least one non-zero weight. Otherwise, pass
+ // nullptr to setMetadata which will erase the existing metadata.
+ MDNode *N = nullptr;
+ if (TrueWeight || FalseWeight)
+ N = MDBuilder(I->getParent()->getContext())
+ .createBranchWeights(TrueWeight, FalseWeight);
+ I->setMetadata(LLVMContext::MD_prof, N);
+}
+
+/// If TI is known to be a terminator instruction and its block is known to
+/// only have a single predecessor block, check to see if that predecessor is
+/// also a value comparison with the same value, and if that comparison
+/// determines the outcome of this comparison. If so, simplify TI. This does a
+/// very limited form of jump threading.
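+///
+/// For example, if Pred ends in "switch i32 %x [... 5 -> other]" and TI's
+/// block is Pred's default destination, %x cannot be 5 here, so a TI of the
+/// form "br (icmp eq i32 %x, 5), %dead, %live" is rewritten to "br %live".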
+bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
+ Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
+ Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
+ if (!PredVal)
+ return false; // Not a value comparison in predecessor.
+
+ Value *ThisVal = isValueEqualityComparison(TI);
+ assert(ThisVal && "This isn't a value comparison!!");
+ if (ThisVal != PredVal)
+ return false; // Different predicates.
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
+ // Find out information about when control will move from Pred to TI's block.
+ std::vector<ValueEqualityComparisonCase> PredCases;
+ BasicBlock *PredDef =
+ GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases);
+ EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
+
+ // Find information about how control leaves this block.
+ std::vector<ValueEqualityComparisonCase> ThisCases;
+ BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
+ EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
+
+ // If TI's block is the default block from Pred's comparison, potentially
+ // simplify TI based on this knowledge.
+ if (PredDef == TI->getParent()) {
+ // If we are here, we know that the value is none of those cases listed in
+ // PredCases. If there are any cases in ThisCases that are in PredCases, we
+ // can simplify TI.
+ if (!ValuesOverlap(PredCases, ThisCases))
+ return false;
+
+ if (isa<BranchInst>(TI)) {
+ // Okay, one of the successors of this condbr is dead. Convert it to an
+ // uncond br.
+ assert(ThisCases.size() == 1 && "Branch can only have one case!");
+ // Insert the new branch.
+ Instruction *NI = Builder.CreateBr(ThisDef);
+ (void)NI;
+
+ // Remove PHI node entries for the dead edge.
ThisCases[0].Dest->removePredecessor(PredDef);
-
- LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
-
- EraseTerminatorAndDCECond(TI);
+
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
+
+ EraseTerminatorAndDCECond(TI);
if (DTU)
DTU->applyUpdates(
{{DominatorTree::Delete, PredDef, ThisCases[0].Dest}});
- return true;
- }
-
- SwitchInstProfUpdateWrapper SI = *cast<SwitchInst>(TI);
- // Okay, TI has cases that are statically dead, prune them away.
- SmallPtrSet<Constant *, 16> DeadCases;
- for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- DeadCases.insert(PredCases[i].Value);
-
- LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI);
-
+ return true;
+ }
+
+ SwitchInstProfUpdateWrapper SI = *cast<SwitchInst>(TI);
+ // Okay, TI has cases that are statically dead, prune them away.
+ SmallPtrSet<Constant *, 16> DeadCases;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ DeadCases.insert(PredCases[i].Value);
+
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI);
+
SmallMapVector<BasicBlock *, int, 8> NumPerSuccessorCases;
- for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
- --i;
+ for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
+ --i;
auto *Successor = i->getCaseSuccessor();
++NumPerSuccessorCases[Successor];
- if (DeadCases.count(i->getCaseValue())) {
+ if (DeadCases.count(i->getCaseValue())) {
Successor->removePredecessor(PredDef);
- SI.removeCase(i);
+ SI.removeCase(i);
--NumPerSuccessorCases[Successor];
- }
- }
+ }
+ }
std::vector<DominatorTree::UpdateType> Updates;
for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
@@ -925,56 +925,56 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
if (DTU)
DTU->applyUpdates(Updates);
- LLVM_DEBUG(dbgs() << "Leaving: " << *TI << "\n");
- return true;
- }
-
- // Otherwise, TI's block must correspond to some matched value. Find out
- // which value (or set of values) this is.
- ConstantInt *TIV = nullptr;
- BasicBlock *TIBB = TI->getParent();
- for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- if (PredCases[i].Dest == TIBB) {
- if (TIV)
- return false; // Cannot handle multiple values coming to this block.
- TIV = PredCases[i].Value;
- }
- assert(TIV && "No edge from pred to succ?");
-
- // Okay, we found the one constant that our value can be if we get into TI's
- // BB. Find out which successor will unconditionally be branched to.
- BasicBlock *TheRealDest = nullptr;
- for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
- if (ThisCases[i].Value == TIV) {
- TheRealDest = ThisCases[i].Dest;
- break;
- }
-
- // If not handled by any explicit cases, it is handled by the default case.
- if (!TheRealDest)
- TheRealDest = ThisDef;
-
+ LLVM_DEBUG(dbgs() << "Leaving: " << *TI << "\n");
+ return true;
+ }
+
+ // Otherwise, TI's block must correspond to some matched value. Find out
+ // which value (or set of values) this is.
+ ConstantInt *TIV = nullptr;
+ BasicBlock *TIBB = TI->getParent();
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].Dest == TIBB) {
+ if (TIV)
+ return false; // Cannot handle multiple values coming to this block.
+ TIV = PredCases[i].Value;
+ }
+ assert(TIV && "No edge from pred to succ?");
+
+ // Okay, we found the one constant that our value can be if we get into TI's
+ // BB. Find out which successor will unconditionally be branched to.
+ BasicBlock *TheRealDest = nullptr;
+ for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
+ if (ThisCases[i].Value == TIV) {
+ TheRealDest = ThisCases[i].Dest;
+ break;
+ }
+
+ // If not handled by any explicit cases, it is handled by the default case.
+ if (!TheRealDest)
+ TheRealDest = ThisDef;
+
SmallSetVector<BasicBlock *, 2> RemovedSuccs;
- // Remove PHI node entries for dead edges.
- BasicBlock *CheckEdge = TheRealDest;
- for (BasicBlock *Succ : successors(TIBB))
+ // Remove PHI node entries for dead edges.
+ BasicBlock *CheckEdge = TheRealDest;
+ for (BasicBlock *Succ : successors(TIBB))
if (Succ != CheckEdge) {
if (Succ != TheRealDest)
RemovedSuccs.insert(Succ);
- Succ->removePredecessor(TIBB);
+ Succ->removePredecessor(TIBB);
} else
- CheckEdge = nullptr;
-
- // Insert the new branch.
- Instruction *NI = Builder.CreateBr(TheRealDest);
- (void)NI;
-
- LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
-
- EraseTerminatorAndDCECond(TI);
+ CheckEdge = nullptr;
+
+ // Insert the new branch.
+ Instruction *NI = Builder.CreateBr(TheRealDest);
+ (void)NI;
+
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
+
+ EraseTerminatorAndDCECond(TI);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.reserve(RemovedSuccs.size());
@@ -982,86 +982,86 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
Updates.push_back({DominatorTree::Delete, TIBB, RemovedSucc});
DTU->applyUpdates(Updates);
}
- return true;
-}
-
-namespace {
-
-/// This class implements a stable ordering of constant
-/// integers that does not depend on their address. This is important for
-/// applications that sort ConstantInt's to ensure uniqueness.
-struct ConstantIntOrdering {
- bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
- return LHS->getValue().ult(RHS->getValue());
- }
-};
-
-} // end anonymous namespace
-
-static int ConstantIntSortPredicate(ConstantInt *const *P1,
- ConstantInt *const *P2) {
- const ConstantInt *LHS = *P1;
- const ConstantInt *RHS = *P2;
- if (LHS == RHS)
- return 0;
- return LHS->getValue().ult(RHS->getValue()) ? 1 : -1;
-}
-
-static inline bool HasBranchWeights(const Instruction *I) {
- MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof);
- if (ProfMD && ProfMD->getOperand(0))
- if (MDString *MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
- return MDS->getString().equals("branch_weights");
-
- return false;
-}
-
-/// Get the weights of a given terminator; the default weight is at the front
-/// of the vector. If TI is a conditional eq, we need to swap the branch-weight
-/// metadata.
-static void GetBranchWeights(Instruction *TI,
- SmallVectorImpl<uint64_t> &Weights) {
- MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
- assert(MD);
- for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
- ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i));
- Weights.push_back(CI->getValue().getZExtValue());
- }
-
- // If TI is a conditional eq, the default case is the false case,
- // and the corresponding branch-weight data is at index 2. We swap the
- // default weight to be the first entry.
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- assert(Weights.size() == 2);
- ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
- if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
- std::swap(Weights.front(), Weights.back());
- }
-}
-
-/// Keep halving the weights until all can fit in uint32_t.
-static void FitWeights(MutableArrayRef<uint64_t> Weights) {
- uint64_t Max = *std::max_element(Weights.begin(), Weights.end());
- if (Max > UINT_MAX) {
- unsigned Offset = 32 - countLeadingZeros(Max);
- for (uint64_t &I : Weights)
- I >>= Offset;
- }
-}
-
+ return true;
+}
+
+namespace {
+
+/// This class implements a stable ordering of constant
+/// integers that does not depend on their address. This is important for
+/// applications that sort ConstantInt's to ensure uniqueness.
+struct ConstantIntOrdering {
+ bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+ return LHS->getValue().ult(RHS->getValue());
+ }
+};
+
+} // end anonymous namespace
+
+static int ConstantIntSortPredicate(ConstantInt *const *P1,
+ ConstantInt *const *P2) {
+ const ConstantInt *LHS = *P1;
+ const ConstantInt *RHS = *P2;
+ if (LHS == RHS)
+ return 0;
+ return LHS->getValue().ult(RHS->getValue()) ? 1 : -1;
+}
+
+static inline bool HasBranchWeights(const Instruction *I) {
+ MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof);
+ if (ProfMD && ProfMD->getOperand(0))
+ if (MDString *MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
+ return MDS->getString().equals("branch_weights");
+
+ return false;
+}
+
+/// Get the weights of a given terminator; the default weight is at the front
+/// of the vector. If TI is a conditional eq, we need to swap the branch-weight
+/// metadata.
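+///
+/// For example, for a conditional branch on "icmp eq" with branch weights
+/// {10, 90}, Weights becomes {90, 10}: the false edge is the default case,
+/// so its weight is moved to the front.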
+static void GetBranchWeights(Instruction *TI,
+ SmallVectorImpl<uint64_t> &Weights) {
+ MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
+ assert(MD);
+ for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i));
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+
+ // If TI is a conditional eq, the default case is the false case,
+ // and the corresponding branch-weight data is at index 2. We swap the
+ // default weight to be the first entry.
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ assert(Weights.size() == 2);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(Weights.front(), Weights.back());
+ }
+}
+
+/// Keep halving the weights until all can fit in uint32_t.
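+/// For example, if the largest weight is 2^40, every weight is shifted right
+/// by 9 bits (equivalent to halving nine times) so the maximum fits in 32 bits.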
+static void FitWeights(MutableArrayRef<uint64_t> Weights) {
+ uint64_t Max = *std::max_element(Weights.begin(), Weights.end());
+ if (Max > UINT_MAX) {
+ unsigned Offset = 32 - countLeadingZeros(Max);
+ for (uint64_t &I : Weights)
+ I >>= Offset;
+ }
+}
+
static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
BasicBlock *BB, BasicBlock *PredBlock, ValueToValueMapTy &VMap) {
Instruction *PTI = PredBlock->getTerminator();
-
+
// If we have bonus instructions, clone them into the predecessor block.
// Note that there may be multiple predecessor blocks, so we cannot move
// bonus instructions to a predecessor block.
for (Instruction &BonusInst : *BB) {
if (isa<DbgInfoIntrinsic>(BonusInst) || BonusInst.isTerminator())
continue;
-
+
Instruction *NewBonusInst = BonusInst.clone();
-
+
if (PTI->getDebugLoc() != NewBonusInst->getDebugLoc()) {
// Unless the instruction has the same !dbg location as the original
// branch, drop it. When we fold the bonus instructions we want to make
@@ -1069,11 +1069,11 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
// dead code caused by folding dead branches.
NewBonusInst->setDebugLoc(DebugLoc());
}
-
+
RemapInstruction(NewBonusInst, VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
VMap[&BonusInst] = NewBonusInst;
-
+
// If we moved a load, we cannot any longer claim any knowledge about
// its potential value. The previous information might have been valid
// only given the branch precondition.
@@ -1081,11 +1081,11 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
// semantics we don't understand. We *can* preserve !annotation, because
// it is tied to the instruction itself, not the value or position.
NewBonusInst->dropUnknownNonDebugMetadata(LLVMContext::MD_annotation);
-
+
PredBlock->getInstList().insert(PTI->getIterator(), NewBonusInst);
NewBonusInst->takeName(&BonusInst);
BonusInst.setName(NewBonusInst->getName() + ".old");
-
+
// Update (liveout) uses of bonus instructions,
// now that the bonus instruction has been cloned into predecessor.
SSAUpdater SSAUpdate;
@@ -1097,26 +1097,26 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
SSAUpdate.RewriteUseAfterInsertions(U);
}
}
-
+
bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
Instruction *TI, Value *&CV, Instruction *PTI, IRBuilder<> &Builder) {
BasicBlock *BB = TI->getParent();
BasicBlock *Pred = PTI->getParent();
-
+
std::vector<DominatorTree::UpdateType> Updates;
-
+
// Figure out which 'cases' to copy from SI to PSI.
std::vector<ValueEqualityComparisonCase> BBCases;
BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
-
+
std::vector<ValueEqualityComparisonCase> PredCases;
BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
-
+
// Based on whether the default edge from PTI goes to BB or not, fill in
// PredCases and PredDefault with the new switch cases we would like to
// build.
SmallMapVector<BasicBlock *, int, 8> NewSuccessors;
-
+
// Update the branch weight metadata along the way
SmallVector<uint64_t, 8> Weights;
bool PredHasWeights = HasBranchWeights(PTI);
@@ -1158,13 +1158,13 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
Weights[0] += Weights[i + 1];
std::swap(Weights[i + 1], Weights.back());
Weights.pop_back();
- }
-
+ }
+
PredCases.pop_back();
--i;
--e;
}
-
+
// Reconstruct the new switch statement we will be building.
if (PredDefault != BBDefault) {
PredDefault->removePredecessor(Pred);
@@ -1180,15 +1180,15 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
if (!PTIHandled.count(BBCases[i].Value) && BBCases[i].Dest != BBDefault) {
PredCases.push_back(BBCases[i]);
++NewSuccessors[BBCases[i].Dest];
- if (SuccHasWeights || PredHasWeights) {
+ if (SuccHasWeights || PredHasWeights) {
// The default weight is at index 0, so weight for the ith case
// should be at index i+1. Scale the cases from successor by
// PredDefaultWeight (Weights[0]).
Weights.push_back(Weights[0] * SuccWeights[i + 1]);
ValidTotalSuccWeight += SuccWeights[i + 1];
- }
+ }
}
-
+
if (SuccHasWeights || PredHasWeights) {
ValidTotalSuccWeight += SuccWeights[0];
// Scale the cases from predecessor by ValidTotalSuccWeight.
@@ -1206,19 +1206,19 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
if (PredCases[i].Dest == BB) {
PTIHandled.insert(PredCases[i].Value);
-
+
if (PredHasWeights || SuccHasWeights) {
WeightsForHandled[PredCases[i].Value] = Weights[i + 1];
std::swap(Weights[i + 1], Weights.back());
Weights.pop_back();
}
-
+
std::swap(PredCases[i], PredCases.back());
PredCases.pop_back();
--i;
--e;
}
-
+
// Okay, now we know which constants were sent to BB from the
// predecessor. Figure out where they will all go now.
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
@@ -1229,8 +1229,8 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
PredCases.push_back(BBCases[i]);
++NewSuccessors[BBCases[i].Dest];
PTIHandled.erase(BBCases[i].Value); // This constant is taken care of
- }
-
+ }
+
// If there are any constants vectored to BB that TI doesn't handle,
// they must go to the default destination of TI.
for (ConstantInt *I : PTIHandled) {
@@ -1240,7 +1240,7 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
++NewSuccessors[BBDefault];
}
}
-
+
// Okay, at this point, we know which new successor Pred will get. Make
// sure we update the number of entries in the PHI nodes for these
// successors.
@@ -1253,24 +1253,24 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
if (!is_contained(successors(Pred), NewSuccessor.first))
Updates.push_back({DominatorTree::Insert, Pred, NewSuccessor.first});
}
-
+
Builder.SetInsertPoint(PTI);
// Convert pointer to int before we switch.
if (CV->getType()->isPointerTy()) {
CV =
Builder.CreatePtrToInt(CV, DL.getIntPtrType(CV->getType()), "magicptr");
}
-
+
// Now that the successors are updated, create the new Switch instruction.
SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault, PredCases.size());
NewSI->setDebugLoc(PTI->getDebugLoc());
for (ValueEqualityComparisonCase &V : PredCases)
NewSI->addCase(V.Value, V.Dest);
-
+
if (PredHasWeights || SuccHasWeights) {
// Halve the weights if any of them cannot fit in an uint32_t
FitWeights(Weights);
-
+
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
setBranchWeights(NewSI, MDWeights);
@@ -1291,15 +1291,15 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
BranchInst::Create(InfLoopBlock, InfLoopBlock);
Updates.push_back({DominatorTree::Insert, InfLoopBlock, InfLoopBlock});
- }
+ }
NewSI->setSuccessor(i, InfLoopBlock);
}
-
+
if (InfLoopBlock)
Updates.push_back({DominatorTree::Insert, Pred, InfLoopBlock});
-
+
Updates.push_back({DominatorTree::Delete, Pred, BB});
-
+
if (DTU)
DTU->applyUpdates(Updates);
@@ -1339,702 +1339,702 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(Instruction *TI,
if (!SplitBlockPredecessors(Succ, TI->getParent(), ".fold.split", DTU))
return false;
}
- }
+ }
PerformValueComparisonIntoPredecessorFolding(TI, CV, PTI, Builder);
Changed = true;
- }
- return Changed;
-}
-
-// If we would need to insert a select that uses the value of this invoke
-// (comments in HoistThenElseCodeToIf explain why we would need to do this), we
-// can't hoist the invoke, as there is nowhere to put the select in this case.
-static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
- Instruction *I1, Instruction *I2) {
- for (BasicBlock *Succ : successors(BB1)) {
- for (const PHINode &PN : Succ->phis()) {
- Value *BB1V = PN.getIncomingValueForBlock(BB1);
- Value *BB2V = PN.getIncomingValueForBlock(BB2);
- if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {
- return false;
- }
- }
- }
- return true;
-}
-
+ }
+ return Changed;
+}
+
+// If we would need to insert a select that uses the value of this invoke
+// (comments in HoistThenElseCodeToIf explain why we would need to do this), we
+// can't hoist the invoke, as there is nowhere to put the select in this case.
+static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
+ Instruction *I1, Instruction *I2) {
+ for (BasicBlock *Succ : successors(BB1)) {
+ for (const PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
+ if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified = false);
-
-/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
-/// in the two blocks up into the branch block. The caller of this function
-/// guarantees that BI's block dominates BB1 and BB2.
-bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
- const TargetTransformInfo &TTI) {
- // This does very trivial matching, with limited scanning, to find identical
- // instructions in the two blocks. In particular, we don't want to get into
- // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
- // such, we currently just scan for obviously identical instructions in an
- // identical order.
- BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
- BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
-
- BasicBlock::iterator BB1_Itr = BB1->begin();
- BasicBlock::iterator BB2_Itr = BB2->begin();
-
- Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;
- // Skip debug info if it is not identical.
- DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
- DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
- if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
- while (isa<DbgInfoIntrinsic>(I1))
- I1 = &*BB1_Itr++;
- while (isa<DbgInfoIntrinsic>(I2))
- I2 = &*BB2_Itr++;
- }
- // FIXME: Can we define a safety predicate for CallBr?
- if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||
- (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) ||
- isa<CallBrInst>(I1))
- return false;
-
- BasicBlock *BIParent = BI->getParent();
-
- bool Changed = false;
+
+/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
+/// in the two blocks up into the branch block. The caller of this function
+/// guarantees that BI's block dominates BB1 and BB2.
+bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
+ const TargetTransformInfo &TTI) {
+ // This does very trivial matching, with limited scanning, to find identical
+ // instructions in the two blocks. In particular, we don't want to get into
+ // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
+ // such, we currently just scan for obviously identical instructions in an
+ // identical order.
+ BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
+ BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
+
+ BasicBlock::iterator BB1_Itr = BB1->begin();
+ BasicBlock::iterator BB2_Itr = BB2->begin();
+
+ Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;
+ // Skip debug info if it is not identical.
+ DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+ DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+ if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = &*BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = &*BB2_Itr++;
+ }
+ // FIXME: Can we define a safety predicate for CallBr?
+ if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||
+ (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) ||
+ isa<CallBrInst>(I1))
+ return false;
+
+ BasicBlock *BIParent = BI->getParent();
+
+ bool Changed = false;
auto _ = make_scope_exit([&]() {
if (Changed)
++NumHoistCommonCode;
});
- do {
- // If we are hoisting the terminator instruction, don't move one (making a
- // broken BB), instead clone it, and remove BI.
- if (I1->isTerminator())
- goto HoistTerminator;
-
- // If we're going to hoist a call, make sure that the two instructions we're
- // commoning/hoisting are both marked with musttail, or neither of them is
- // marked as such. Otherwise, we might end up in a situation where we hoist
- // from a block where the terminator is a `ret` to a block where the terminator
- // is a `br`, and `musttail` calls expect to be followed by a return.
- auto *C1 = dyn_cast<CallInst>(I1);
- auto *C2 = dyn_cast<CallInst>(I2);
- if (C1 && C2)
- if (C1->isMustTailCall() != C2->isMustTailCall())
- return Changed;
-
- if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
- return Changed;
-
- // If either of the two call sites has the nomerge attribute, stop hoisting.
- if (const auto *CB1 = dyn_cast<CallBase>(I1))
- if (CB1->cannotMerge())
- return Changed;
- if (const auto *CB2 = dyn_cast<CallBase>(I2))
- if (CB2->cannotMerge())
- return Changed;
-
- if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
- assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
- // The debug location is an integral part of a debug info intrinsic
- // and can't be separated from it or replaced. Instead of attempting
- // to merge locations, simply hoist both copies of the intrinsic.
- BIParent->getInstList().splice(BI->getIterator(),
- BB1->getInstList(), I1);
- BIParent->getInstList().splice(BI->getIterator(),
- BB2->getInstList(), I2);
- Changed = true;
- } else {
- // For a normal instruction, we just move one to right before the branch,
- // then replace all uses of the other with the first. Finally, we remove
- // the now redundant second instruction.
- BIParent->getInstList().splice(BI->getIterator(),
- BB1->getInstList(), I1);
- if (!I2->use_empty())
- I2->replaceAllUsesWith(I1);
- I1->andIRFlags(I2);
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_fpmath,
- LLVMContext::MD_invariant_load,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group,
- LLVMContext::MD_preserve_access_index};
- combineMetadata(I1, I2, KnownIDs, true);
-
- // I1 and I2 are being combined into a single instruction. Its debug
- // location is the merged locations of the original instructions.
- I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-
- I2->eraseFromParent();
- Changed = true;
- }
+ do {
+ // If we are hoisting the terminator instruction, don't move it (that would
+ // make a broken BB); instead, clone it and remove BI.
+ if (I1->isTerminator())
+ goto HoistTerminator;
+
+ // If we're going to hoist a call, make sure that the two instructions we're
+ // commoning/hoisting are both marked with musttail, or neither of them is
+ // marked as such. Otherwise, we might end up in a situation where we hoist
+ // from a block where the terminator is a `ret` to a block where the terminator
+ // is a `br`, and `musttail` calls expect to be followed by a return.
+ auto *C1 = dyn_cast<CallInst>(I1);
+ auto *C2 = dyn_cast<CallInst>(I2);
+ if (C1 && C2)
+ if (C1->isMustTailCall() != C2->isMustTailCall())
+ return Changed;
+
+ if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
+ return Changed;
+
+ // If either call site has the nomerge attribute, stop hoisting.
+ if (const auto *CB1 = dyn_cast<CallBase>(I1))
+ if (CB1->cannotMerge())
+ return Changed;
+ if (const auto *CB2 = dyn_cast<CallBase>(I2))
+ if (CB2->cannotMerge())
+ return Changed;
+
+ if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
+ assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
+ // The debug location is an integral part of a debug info intrinsic
+ // and can't be separated from it or replaced. Instead of attempting
+ // to merge locations, simply hoist both copies of the intrinsic.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB2->getInstList(), I2);
+ Changed = true;
+ } else {
+ // For a normal instruction, we just move one to right before the branch,
+ // then replace all uses of the other with the first. Finally, we remove
+ // the now redundant second instruction.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ I1->andIRFlags(I2);
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group,
+ LLVMContext::MD_preserve_access_index};
+ combineMetadata(I1, I2, KnownIDs, true);
+
+ // I1 and I2 are being combined into a single instruction. Its debug
+ // location is the merged locations of the original instructions.
+ I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
+ I2->eraseFromParent();
+ Changed = true;
+ }
++NumHoistCommonInstrs;
-
- I1 = &*BB1_Itr++;
- I2 = &*BB2_Itr++;
- // Skip debug info if it is not identical.
- DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
- DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
- if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
- while (isa<DbgInfoIntrinsic>(I1))
- I1 = &*BB1_Itr++;
- while (isa<DbgInfoIntrinsic>(I2))
- I2 = &*BB2_Itr++;
- }
- } while (I1->isIdenticalToWhenDefined(I2));
-
- return true;
-
-HoistTerminator:
- // It may not be possible to hoist an invoke.
- // FIXME: Can we define a safety predicate for CallBr?
- if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
- return Changed;
-
- // TODO: callbr hoisting currently disabled pending further study.
- if (isa<CallBrInst>(I1))
- return Changed;
-
- for (BasicBlock *Succ : successors(BB1)) {
- for (PHINode &PN : Succ->phis()) {
- Value *BB1V = PN.getIncomingValueForBlock(BB1);
- Value *BB2V = PN.getIncomingValueForBlock(BB2);
- if (BB1V == BB2V)
- continue;
-
- // Check for passingValueIsAlwaysUndefined here because we would rather
- // eliminate undefined control flow than convert it to a select.
- if (passingValueIsAlwaysUndefined(BB1V, &PN) ||
- passingValueIsAlwaysUndefined(BB2V, &PN))
- return Changed;
-
- if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
- return Changed;
- if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
- return Changed;
- }
- }
-
- // Okay, it is safe to hoist the terminator.
- Instruction *NT = I1->clone();
- BIParent->getInstList().insert(BI->getIterator(), NT);
- if (!NT->getType()->isVoidTy()) {
- I1->replaceAllUsesWith(NT);
- I2->replaceAllUsesWith(NT);
- NT->takeName(I1);
- }
+
+ I1 = &*BB1_Itr++;
+ I2 = &*BB2_Itr++;
+ // Skip debug info if it is not identical.
+ DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+ DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+ if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = &*BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = &*BB2_Itr++;
+ }
+ } while (I1->isIdenticalToWhenDefined(I2));
+
+ return true;
+
+HoistTerminator:
+ // It may not be possible to hoist an invoke.
+ // FIXME: Can we define a safety predicate for CallBr?
+ if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
+ return Changed;
+
+ // TODO: callbr hoisting currently disabled pending further study.
+ if (isa<CallBrInst>(I1))
+ return Changed;
+
+ for (BasicBlock *Succ : successors(BB1)) {
+ for (PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
+ if (BB1V == BB2V)
+ continue;
+
+ // Check for passingValueIsAlwaysUndefined here because we would rather
+ // eliminate undefined control flow than convert it to a select.
+ if (passingValueIsAlwaysUndefined(BB1V, &PN) ||
+ passingValueIsAlwaysUndefined(BB2V, &PN))
+ return Changed;
+
+ if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
+ return Changed;
+ if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
+ return Changed;
+ }
+ }
+
+ // Okay, it is safe to hoist the terminator.
+ Instruction *NT = I1->clone();
+ BIParent->getInstList().insert(BI->getIterator(), NT);
+ if (!NT->getType()->isVoidTy()) {
+ I1->replaceAllUsesWith(NT);
+ I2->replaceAllUsesWith(NT);
+ NT->takeName(I1);
+ }
Changed = true;
++NumHoistCommonInstrs;
-
- // Ensure terminator gets a debug location, even an unknown one, in case
- // it involves inlinable calls.
- NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-
- // PHIs created below will adopt NT's merged DebugLoc.
- IRBuilder<NoFolder> Builder(NT);
-
- // Hoisting one of the terminators from our successor is a great thing.
- // Unfortunately, the successors of the if/else blocks may have PHI nodes in
- // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI
- // nodes, so we insert a select instruction to compute the final result.
- std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
- for (BasicBlock *Succ : successors(BB1)) {
- for (PHINode &PN : Succ->phis()) {
- Value *BB1V = PN.getIncomingValueForBlock(BB1);
- Value *BB2V = PN.getIncomingValueForBlock(BB2);
- if (BB1V == BB2V)
- continue;
-
- // These values do not agree. Insert a select instruction before NT
- // that determines the right value.
- SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
- if (!SI) {
- // Propagate fast-math-flags from phi node to its replacement select.
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- if (isa<FPMathOperator>(PN))
- Builder.setFastMathFlags(PN.getFastMathFlags());
-
- SI = cast<SelectInst>(
- Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
- BB1V->getName() + "." + BB2V->getName(), BI));
- }
-
- // Make the PHI node use the select for all incoming values for BB1/BB2
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
- if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
- PN.setIncomingValue(i, SI);
- }
- }
-
+
+ // Ensure terminator gets a debug location, even an unknown one, in case
+ // it involves inlinable calls.
+ NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
+ // PHIs created below will adopt NT's merged DebugLoc.
+ IRBuilder<NoFolder> Builder(NT);
+
+ // Hoisting one of the terminators from our successor is a great thing.
+ // Unfortunately, the successors of the if/else blocks may have PHI nodes in
+ // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI
+ // nodes, so we insert a select instruction to compute the final result.
+ std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
+ for (BasicBlock *Succ : successors(BB1)) {
+ for (PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
+ if (BB1V == BB2V)
+ continue;
+
+ // These values do not agree. Insert a select instruction before NT
+ // that determines the right value.
+ SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+ if (!SI) {
+ // Propagate fast-math-flags from phi node to its replacement select.
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ if (isa<FPMathOperator>(PN))
+ Builder.setFastMathFlags(PN.getFastMathFlags());
+
+ SI = cast<SelectInst>(
+ Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
+ BB1V->getName() + "." + BB2V->getName(), BI));
+ }
+
+ // Make the PHI node use the select for all incoming values for BB1/BB2
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
+ PN.setIncomingValue(i, SI);
+ }
+ }
+
SmallVector<DominatorTree::UpdateType, 4> Updates;
- // Update any PHI nodes in our new successors.
+ // Update any PHI nodes in our new successors.
for (BasicBlock *Succ : successors(BB1)) {
- AddPredecessorToBlock(Succ, BIParent, BB1);
+ AddPredecessorToBlock(Succ, BIParent, BB1);
Updates.push_back({DominatorTree::Insert, BIParent, Succ});
}
for (BasicBlock *Succ : successors(BI))
Updates.push_back({DominatorTree::Delete, BIParent, Succ});
-
- EraseTerminatorAndDCECond(BI);
+
+ EraseTerminatorAndDCECond(BI);
if (DTU)
DTU->applyUpdates(Updates);
return Changed;
-}
-
-// Check lifetime markers.
-static bool isLifeTimeMarker(const Instruction *I) {
- if (auto II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- break;
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- return true;
- }
- }
- return false;
-}
-
-// TODO: Refine this. This should avoid cases like turning constant memcpy sizes
-// into variables.
-static bool replacingOperandWithVariableIsCheap(const Instruction *I,
- int OpIdx) {
- return !isa<IntrinsicInst>(I);
-}
-
-// All instructions in Insts belong to different blocks that all unconditionally
-// branch to a common successor. Analyze each instruction and return true if it
-// would be possible to sink them into their successor, creating one common
-// instruction instead. For every value that would be required to be provided by
-// PHI node (because an operand varies in each input block), add to PHIOperands.
-static bool canSinkInstructions(
- ArrayRef<Instruction *> Insts,
- DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
- // Prune out obviously bad instructions to move. Each instruction must have
- // exactly zero or one use, and we check later that use is by a single, common
- // PHI instruction in the successor.
- bool HasUse = !Insts.front()->user_empty();
- for (auto *I : Insts) {
- // These instructions may change or break semantics if moved.
- if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
- I->getType()->isTokenTy())
- return false;
-
+}
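
For orientation, a minimal C++-level sketch of the transform implemented above (a hypothetical example, not code from this patch): identical leading instructions in the two arms are hoisted above the branch, and once the arms are fully commoned the hoisted terminator resolves any disagreeing PHI inputs with a select.

    void foo();  // assumed external; the identical call in both arms

    // Hypothetical input: both arms are identical except for the value
    // they feed into the join-point PHI.
    int before(bool c, int x, int y) {
      int r;
      if (c) { foo(); r = x; }
      else   { foo(); r = y; }
      return r;
    }

    // Rough shape after the transform: the common call is hoisted above
    // the branch, the terminator is cloned in its place, and the PHI
    // disagreement becomes a select.
    int after(bool c, int x, int y) {
      foo();             // hoisted common instruction
      return c ? x : y;  // select inserted for the differing PHI inputs
    }
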
+
+// Check lifetime markers.
+static bool isLifeTimeMarker(const Instruction *I) {
+ if (auto II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ return true;
+ }
+ }
+ return false;
+}
+
+// TODO: Refine this. This should avoid cases like turning constant memcpy sizes
+// into variables.
+static bool replacingOperandWithVariableIsCheap(const Instruction *I,
+ int OpIdx) {
+ return !isa<IntrinsicInst>(I);
+}
+
+// All instructions in Insts belong to different blocks that all unconditionally
+// branch to a common successor. Analyze each instruction and return true if it
+// would be possible to sink them into their successor, creating one common
+// instruction instead. For every value that would be required to be provided by
+// PHI node (because an operand varies in each input block), add to PHIOperands.
+static bool canSinkInstructions(
+ ArrayRef<Instruction *> Insts,
+ DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
+ // Prune out obviously bad instructions to move. Each instruction must have
+ // exactly zero or one use, and we check later that use is by a single, common
+ // PHI instruction in the successor.
+ bool HasUse = !Insts.front()->user_empty();
+ for (auto *I : Insts) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return false;
+
// Do not try to sink an instruction in an infinite loop - it can cause
// this algorithm to infinite loop.
if (I->getParent()->getSingleSuccessor() == I->getParent())
return false;
- // Conservatively return false if I is an inline-asm instruction. Sinking
- // and merging inline-asm instructions can potentially create arguments
- // that cannot satisfy the inline-asm constraints.
- // If the instruction has nomerge attribute, return false.
- if (const auto *C = dyn_cast<CallBase>(I))
- if (C->isInlineAsm() || C->cannotMerge())
- return false;
-
- // Each instruction must have zero or one use.
- if (HasUse && !I->hasOneUse())
- return false;
- if (!HasUse && !I->user_empty())
- return false;
- }
-
- const Instruction *I0 = Insts.front();
- for (auto *I : Insts)
- if (!I->isSameOperationAs(I0))
- return false;
-
- // All instructions in Insts are known to be the same opcode. If they have a
- // use, check that the only user is a PHI or in the same block as the
- // instruction, because if a user is in the same block as an instruction we're
- // contemplating sinking, it must already be determined to be sinkable.
- if (HasUse) {
- auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
- auto *Succ = I0->getParent()->getTerminator()->getSuccessor(0);
- if (!all_of(Insts, [&PNUse,&Succ](const Instruction *I) -> bool {
- auto *U = cast<Instruction>(*I->user_begin());
- return (PNUse &&
- PNUse->getParent() == Succ &&
- PNUse->getIncomingValueForBlock(I->getParent()) == I) ||
- U->getParent() == I->getParent();
- }))
- return false;
- }
-
- // Because SROA can't handle speculating stores of selects, try not to sink
- // loads, stores or lifetime markers of allocas when we'd have to create a
- // PHI for the address operand. Also, because it is likely that loads or
- // stores of allocas will disappear when Mem2Reg/SROA is run, don't sink
- // them.
- // This can cause code churn which can have unintended consequences down
- // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
- // FIXME: This is a workaround for a deficiency in SROA - see
- // https://llvm.org/bugs/show_bug.cgi?id=30188
- if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
- }))
- return false;
- if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(0)->stripPointerCasts());
- }))
- return false;
- if (isLifeTimeMarker(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
- }))
- return false;
-
- for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
- Value *Op = I0->getOperand(OI);
- if (Op->getType()->isTokenTy())
- // Don't touch any operand of token type.
- return false;
-
- auto SameAsI0 = [&I0, OI](const Instruction *I) {
- assert(I->getNumOperands() == I0->getNumOperands());
- return I->getOperand(OI) == I0->getOperand(OI);
- };
- if (!all_of(Insts, SameAsI0)) {
- if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) ||
- !canReplaceOperandWithVariable(I0, OI))
- // We can't create a PHI from this GEP.
- return false;
- // Don't create indirect calls! The called value is the final operand.
- if (isa<CallBase>(I0) && OI == OE - 1) {
- // FIXME: if the call was *already* indirect, we should do this.
- return false;
- }
- for (auto *I : Insts)
- PHIOperands[I].push_back(I->getOperand(OI));
- }
- }
- return true;
-}
-
+ // Conservatively return false if I is an inline-asm instruction. Sinking
+ // and merging inline-asm instructions can potentially create arguments
+ // that cannot satisfy the inline-asm constraints.
+ // If the instruction has nomerge attribute, return false.
+ if (const auto *C = dyn_cast<CallBase>(I))
+ if (C->isInlineAsm() || C->cannotMerge())
+ return false;
+
+ // Each instruction must have zero or one use.
+ if (HasUse && !I->hasOneUse())
+ return false;
+ if (!HasUse && !I->user_empty())
+ return false;
+ }
+
+ const Instruction *I0 = Insts.front();
+ for (auto *I : Insts)
+ if (!I->isSameOperationAs(I0))
+ return false;
+
+ // All instructions in Insts are known to be the same opcode. If they have a
+ // use, check that the only user is a PHI or in the same block as the
+ // instruction, because if a user is in the same block as an instruction we're
+ // contemplating sinking, it must already be determined to be sinkable.
+ if (HasUse) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ auto *Succ = I0->getParent()->getTerminator()->getSuccessor(0);
+ if (!all_of(Insts, [&PNUse,&Succ](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return (PNUse &&
+ PNUse->getParent() == Succ &&
+ PNUse->getIncomingValueForBlock(I->getParent()) == I) ||
+ U->getParent() == I->getParent();
+ }))
+ return false;
+ }
+
+ // Because SROA can't handle speculating stores of selects, try not to sink
+ // loads, stores or lifetime markers of allocas when we'd have to create a
+ // PHI for the address operand. Also, because it is likely that loads or
+ // stores of allocas will disappear when Mem2Reg/SROA is run, don't sink
+ // them.
+ // This can cause code churn which can have unintended consequences down
+ // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
+ // FIXME: This is a workaround for a deficiency in SROA - see
+ // https://llvm.org/bugs/show_bug.cgi?id=30188
+ if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
+ }))
+ return false;
+ if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(0)->stripPointerCasts());
+ }))
+ return false;
+ if (isLifeTimeMarker(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
+ }))
+ return false;
+
+ for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
+ Value *Op = I0->getOperand(OI);
+ if (Op->getType()->isTokenTy())
+ // Don't touch any operand of token type.
+ return false;
+
+ auto SameAsI0 = [&I0, OI](const Instruction *I) {
+ assert(I->getNumOperands() == I0->getNumOperands());
+ return I->getOperand(OI) == I0->getOperand(OI);
+ };
+ if (!all_of(Insts, SameAsI0)) {
+ if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) ||
+ !canReplaceOperandWithVariable(I0, OI))
+ // We can't create a PHI from this GEP.
+ return false;
+ // Don't create indirect calls! The called value is the final operand.
+ if (isa<CallBase>(I0) && OI == OE - 1) {
+ // FIXME: if the call was *already* indirect, we should do this.
+ return false;
+ }
+ for (auto *I : Insts)
+ PHIOperands[I].push_back(I->getOperand(OI));
+ }
+ }
+ return true;
+}
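
A hedged illustration of the use constraint enforced above (hypothetical C++ shapes, not from this patch): a candidate may have at most one use, and that use must either be the common PHI in the shared successor or stay inside the candidate's own block.

    extern int *p, *q;
    extern void consume(int);

    // Sinkable shape: each arm defines a value whose only use is the
    // merge at the join point, so the loads can be commoned behind one
    // PHI in the successor.
    int sinkable(bool c) {
      int x;
      if (c) x = *p;
      else   x = *q;
      return x;   // the single PHI use
    }

    // Not sinkable: the load in the first arm has two uses (the call and
    // the merge), so it fails the zero-or-one-use check above and is not
    // commoned.
    int not_sinkable(bool c) {
      int x;
      if (c) { x = *p; consume(x); }
      else   { x = *q; }
      return x;
    }
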
+
// Assuming canSinkInstructions(Blocks) has returned true, sink the last
-// instruction of every block in Blocks to their common successor, commoning
-// into one instruction.
-static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
- auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
-
+// instruction of every block in Blocks to their common successor, commoning
+// into one instruction.
+static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
+ auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
+
// canSinkInstructions returning true guarantees that every block has at
- // least one non-terminator instruction.
- SmallVector<Instruction*,4> Insts;
- for (auto *BB : Blocks) {
- Instruction *I = BB->getTerminator();
- do {
- I = I->getPrevNode();
- } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
- if (!isa<DbgInfoIntrinsic>(I))
- Insts.push_back(I);
- }
-
- // The only checking we need to do now is that all users of all instructions
+ // least one non-terminator instruction.
+ SmallVector<Instruction*,4> Insts;
+ for (auto *BB : Blocks) {
+ Instruction *I = BB->getTerminator();
+ do {
+ I = I->getPrevNode();
+ } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
+ if (!isa<DbgInfoIntrinsic>(I))
+ Insts.push_back(I);
+ }
+
+ // The only checking we need to do now is that all users of all instructions
// are the same PHI node. canSinkInstructions should have checked this but
// it is slightly over-aggressive - it gets confused by commutative
// instructions so double-check it here.
- Instruction *I0 = Insts.front();
- if (!I0->user_empty()) {
- auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
- if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
- auto *U = cast<Instruction>(*I->user_begin());
- return U == PNUse;
- }))
- return false;
- }
-
+ Instruction *I0 = Insts.front();
+ if (!I0->user_empty()) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return U == PNUse;
+ }))
+ return false;
+ }
+
// We don't need to do any more checking here; canSinkInstructions should
- // have done it all for us.
- SmallVector<Value*, 4> NewOperands;
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ // have done it all for us.
+ SmallVector<Value*, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
// This check is different to that in canSinkInstructions. There, we
- // cared about the global view once simplifycfg (and instcombine) have
- // completed - it takes into account PHIs that become trivially
- // simplifiable. However here we need a more local view; if an operand
- // differs we create a PHI and rely on instcombine to clean up the very
- // small mess we may make.
- bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
- return I->getOperand(O) != I0->getOperand(O);
- });
- if (!NeedPHI) {
- NewOperands.push_back(I0->getOperand(O));
- continue;
- }
-
- // Create a new PHI in the successor block and populate it.
- auto *Op = I0->getOperand(O);
- assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
- auto *PN = PHINode::Create(Op->getType(), Insts.size(),
- Op->getName() + ".sink", &BBEnd->front());
- for (auto *I : Insts)
- PN->addIncoming(I->getOperand(O), I->getParent());
- NewOperands.push_back(PN);
- }
-
- // Arbitrarily use I0 as the new "common" instruction; remap its operands
- // and move it to the start of the successor block.
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
- I0->getOperandUse(O).set(NewOperands[O]);
- I0->moveBefore(&*BBEnd->getFirstInsertionPt());
-
- // Update metadata and IR flags, and merge debug locations.
- for (auto *I : Insts)
- if (I != I0) {
- // The debug location for the "common" instruction is the merged locations
- // of all the commoned instructions. We start with the original location
- // of the "common" instruction and iteratively merge each location in the
- // loop below.
- // This is an N-way merge, which will be inefficient if I0 is a CallInst.
- // However, since an N-way merge for a CallInst is rare, we use the
- // simplified pairwise API instead of a dedicated N-way merge API.
- I0->applyMergedLocation(I0->getDebugLoc(), I->getDebugLoc());
- combineMetadataForCSE(I0, I, true);
- I0->andIRFlags(I);
- }
-
- if (!I0->user_empty()) {
- // canSinkInstructions checked that all instructions were used by
- // one and only one PHI node. Find that now, RAUW it to our common
- // instruction and nuke it.
- auto *PN = cast<PHINode>(*I0->user_begin());
- PN->replaceAllUsesWith(I0);
- PN->eraseFromParent();
- }
-
- // Finally nuke all instructions apart from the common instruction.
- for (auto *I : Insts)
- if (I != I0)
- I->eraseFromParent();
-
- return true;
-}
-
-namespace {
-
- // LockstepReverseIterator - Iterates through instructions
- // in a set of blocks in reverse order from the first non-terminator.
- // For example (assume all blocks have size n):
- // LockstepReverseIterator I([B1, B2, B3]);
- // *I-- = [B1[n], B2[n], B3[n]];
- // *I-- = [B1[n-1], B2[n-1], B3[n-1]];
- // *I-- = [B1[n-2], B2[n-2], B3[n-2]];
- // ...
- class LockstepReverseIterator {
- ArrayRef<BasicBlock*> Blocks;
- SmallVector<Instruction*,4> Insts;
- bool Fail;
-
- public:
- LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) : Blocks(Blocks) {
- reset();
- }
-
- void reset() {
- Fail = false;
- Insts.clear();
- for (auto *BB : Blocks) {
- Instruction *Inst = BB->getTerminator();
- for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
- Inst = Inst->getPrevNode();
- if (!Inst) {
- // Block wasn't big enough.
- Fail = true;
- return;
- }
- Insts.push_back(Inst);
- }
- }
-
- bool isValid() const {
- return !Fail;
- }
-
- void operator--() {
- if (Fail)
- return;
- for (auto *&Inst : Insts) {
- for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
- Inst = Inst->getPrevNode();
- // Already at beginning of block.
- if (!Inst) {
- Fail = true;
- return;
- }
- }
- }
-
- ArrayRef<Instruction*> operator * () const {
- return Insts;
- }
- };
-
-} // end anonymous namespace
-
-/// Check whether BB's predecessors end with unconditional branches. If so,
-/// sink any common code from the predecessors to BB.
-/// We also allow one predecessor to end with a conditional branch (but no
-/// more than one).
+ // cared about the global view once simplifycfg (and instcombine) have
+ // completed - it takes into account PHIs that become trivially
+ // simplifiable. However here we need a more local view; if an operand
+ // differs we create a PHI and rely on instcombine to clean up the very
+ // small mess we may make.
+ bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags, and merge debug locations.
+ for (auto *I : Insts)
+ if (I != I0) {
+ // The debug location for the "common" instruction is the merged locations
+ // of all the commoned instructions. We start with the original location
+ // of the "common" instruction and iteratively merge each location in the
+ // loop below.
+ // This is an N-way merge, which will be inefficient if I0 is a CallInst.
+ // However, since an N-way merge for a CallInst is rare, we use the
+ // simplified pairwise API instead of a dedicated N-way merge API.
+ I0->applyMergedLocation(I0->getDebugLoc(), I->getDebugLoc());
+ combineMetadataForCSE(I0, I, true);
+ I0->andIRFlags(I);
+ }
+
+ if (!I0->user_empty()) {
+ // canSinkInstructions checked that all instructions were used by
+ // one and only one PHI node. Find that now, RAUW it to our common
+ // instruction and nuke it.
+ auto *PN = cast<PHINode>(*I0->user_begin());
+ PN->replaceAllUsesWith(I0);
+ PN->eraseFromParent();
+ }
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ return true;
+}
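
To make the operand handling above concrete, a small hypothetical sketch (not from this patch): when the sunk instructions agree on the operation but differ in one operand, that operand is routed through a new PHI (named "<op>.sink" by the code above) in the common successor.

    extern int *slot;

    // Input: each predecessor ends with a store of a different value to
    // the same location.
    void before(bool c, int a, int b) {
      if (c) *slot = a;
      else   *slot = b;
    }

    // Shape after sinking: a single store remains in the join block and
    // the differing value operand is merged first (a PHI in IR, written
    // here as a conditional expression).
    void after(bool c, int a, int b) {
      int v = c ? a : b;  // stands in for the "<op>.sink" PHI
      *slot = v;          // the commoned store
    }
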
+
+namespace {
+
+ // LockstepReverseIterator - Iterates through instructions
+ // in a set of blocks in reverse order from the first non-terminator.
+ // For example (assume all blocks have size n):
+ // LockstepReverseIterator I([B1, B2, B3]);
+ // *I-- = [B1[n], B2[n], B3[n]];
+ // *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+ // *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+ // ...
+ class LockstepReverseIterator {
+ ArrayRef<BasicBlock*> Blocks;
+ SmallVector<Instruction*,4> Insts;
+ bool Fail;
+
+ public:
+ LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ Insts.clear();
+ for (auto *BB : Blocks) {
+ Instruction *Inst = BB->getTerminator();
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ if (!Inst) {
+ // Block wasn't big enough.
+ Fail = true;
+ return;
+ }
+ Insts.push_back(Inst);
+ }
+ }
+
+ bool isValid() const {
+ return !Fail;
+ }
+
+ void operator--() {
+ if (Fail)
+ return;
+ for (auto *&Inst : Insts) {
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ // Already at beginning of block.
+ if (!Inst) {
+ Fail = true;
+ return;
+ }
+ }
+ }
+
+ ArrayRef<Instruction*> operator * () const {
+ return Insts;
+ }
+ };
+
+} // end anonymous namespace
+
+/// Check whether BB's predecessors end with unconditional branches. If so,
+/// sink any common code from the predecessors to BB.
+/// We also allow one predecessor to end with a conditional branch (but no
+/// more than one).
static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
DomTreeUpdater *DTU) {
- // We support two situations:
- // (1) all incoming arcs are unconditional
- // (2) one incoming arc is conditional
- //
- // (2) is very common in switch defaults and
- // else-if patterns;
- //
- // if (a) f(1);
- // else if (b) f(2);
- //
- // produces:
- //
- // [if]
- // / \
- // [f(1)] [if]
- // | | \
- // | | |
- // | [f(2)]|
- // \ | /
- // [ end ]
- //
- // [end] has two unconditional predecessor arcs and one conditional. The
- // conditional refers to the implicit empty 'else' arc. This conditional
- // arc can also be caused by an empty default block in a switch.
- //
- // In this case, we attempt to sink code from all *unconditional* arcs.
- // If we can sink instructions from these arcs (determined during the scan
- // phase below) we insert a common successor for all unconditional arcs and
- // connect that to [end], to enable sinking:
- //
- // [if]
- // / \
- // [x(1)] [if]
- // | | \
- // | | \
- // | [x(2)] |
- // \ / |
- // [sink.split] |
- // \ /
- // [ end ]
- //
- SmallVector<BasicBlock*,4> UnconditionalPreds;
- Instruction *Cond = nullptr;
- for (auto *B : predecessors(BB)) {
- auto *T = B->getTerminator();
- if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())
- UnconditionalPreds.push_back(B);
- else if ((isa<BranchInst>(T) || isa<SwitchInst>(T)) && !Cond)
- Cond = T;
- else
- return false;
- }
- if (UnconditionalPreds.size() < 2)
- return false;
-
- // We take a two-step approach to tail sinking. First we scan from the end of
- // each block upwards in lockstep. If the n'th instruction from the end of each
- // block can be sunk, those instructions are added to InstructionsToSink and we
- // carry on. If we can sink an instruction but need to PHI-merge some operands
- // (because they're not identical in each instruction) we add these to
- // PHIOperands.
- unsigned ScanIdx = 0;
- SmallPtrSet<Value*,4> InstructionsToSink;
- DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
- LockstepReverseIterator LRI(UnconditionalPreds);
- while (LRI.isValid() &&
- canSinkInstructions(*LRI, PHIOperands)) {
- LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0]
- << "\n");
- InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
- ++ScanIdx;
- --LRI;
- }
-
+ // We support two situations:
+ // (1) all incoming arcs are unconditional
+ // (2) one incoming arc is conditional
+ //
+ // (2) is very common in switch defaults and
+ // else-if patterns;
+ //
+ // if (a) f(1);
+ // else if (b) f(2);
+ //
+ // produces:
+ //
+ // [if]
+ // / \
+ // [f(1)] [if]
+ // | | \
+ // | | |
+ // | [f(2)]|
+ // \ | /
+ // [ end ]
+ //
+ // [end] has two unconditional predecessor arcs and one conditional. The
+ // conditional refers to the implicit empty 'else' arc. This conditional
+ // arc can also be caused by an empty default block in a switch.
+ //
+ // In this case, we attempt to sink code from all *unconditional* arcs.
+ // If we can sink instructions from these arcs (determined during the scan
+ // phase below) we insert a common successor for all unconditional arcs and
+ // connect that to [end], to enable sinking:
+ //
+ // [if]
+ // / \
+ // [x(1)] [if]
+ // | | \
+ // | | \
+ // | [x(2)] |
+ // \ / |
+ // [sink.split] |
+ // \ /
+ // [ end ]
+ //
+ SmallVector<BasicBlock*,4> UnconditionalPreds;
+ Instruction *Cond = nullptr;
+ for (auto *B : predecessors(BB)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())
+ UnconditionalPreds.push_back(B);
+ else if ((isa<BranchInst>(T) || isa<SwitchInst>(T)) && !Cond)
+ Cond = T;
+ else
+ return false;
+ }
+ if (UnconditionalPreds.size() < 2)
+ return false;
+
+ // We take a two-step approach to tail sinking. First we scan from the end of
+ // each block upwards in lockstep. If the n'th instruction from the end of each
+ // block can be sunk, those instructions are added to InstructionsToSink and we
+ // carry on. If we can sink an instruction but need to PHI-merge some operands
+ // (because they're not identical in each instruction) we add these to
+ // PHIOperands.
+ unsigned ScanIdx = 0;
+ SmallPtrSet<Value*,4> InstructionsToSink;
+ DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
+ LockstepReverseIterator LRI(UnconditionalPreds);
+ while (LRI.isValid() &&
+ canSinkInstructions(*LRI, PHIOperands)) {
+ LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0]
+ << "\n");
+ InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
+ ++ScanIdx;
+ --LRI;
+ }
+
// If no instructions can be sunk, early-return.
if (ScanIdx == 0)
return false;
bool Changed = false;
- auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
- unsigned NumPHIdValues = 0;
- for (auto *I : *LRI)
- for (auto *V : PHIOperands[I])
- if (InstructionsToSink.count(V) == 0)
- ++NumPHIdValues;
- LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
- unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
- if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
- NumPHIInsts++;
-
- return NumPHIInsts <= 1;
- };
-
+ auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
+ unsigned NumPHIdValues = 0;
+ for (auto *I : *LRI)
+ for (auto *V : PHIOperands[I])
+ if (InstructionsToSink.count(V) == 0)
+ ++NumPHIdValues;
+ LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
+ if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
+ NumPHIInsts++;
+
+ return NumPHIInsts <= 1;
+ };
+
if (Cond) {
- // Check if we would actually sink anything first! This mutates the CFG and
- // adds an extra block. The goal in doing this is to allow instructions that
- // couldn't be sunk before to be sunk - obviously, speculatable instructions
- // (such as trunc, add) can be sunk and predicated already. So we check that
- // we're going to sink at least one non-speculatable instruction.
- LRI.reset();
- unsigned Idx = 0;
- bool Profitable = false;
- while (ProfitableToSinkInstruction(LRI) && Idx < ScanIdx) {
- if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
- Profitable = true;
- break;
- }
- --LRI;
- ++Idx;
- }
- if (!Profitable)
- return false;
-
- LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
- // We have a conditional edge and we're going to sink some instructions.
- // Insert a new block postdominating all blocks we're going to sink from.
+ // Check if we would actually sink anything first! This mutates the CFG and
+ // adds an extra block. The goal in doing this is to allow instructions that
+ // couldn't be sunk before to be sunk - obviously, speculatable instructions
+ // (such as trunc, add) can be sunk and predicated already. So we check that
+ // we're going to sink at least one non-speculatable instruction.
+ LRI.reset();
+ unsigned Idx = 0;
+ bool Profitable = false;
+ while (ProfitableToSinkInstruction(LRI) && Idx < ScanIdx) {
+ if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
+ Profitable = true;
+ break;
+ }
+ --LRI;
+ ++Idx;
+ }
+ if (!Profitable)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
+ // We have a conditional edge and we're going to sink some instructions.
+ // Insert a new block postdominating all blocks we're going to sink from.
if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split", DTU))
- // Edges couldn't be split.
- return false;
- Changed = true;
- }
-
- // Now that we've analyzed all potential sinking candidates, perform the
- // actual sink. We iteratively sink the last non-terminator of the source
- // blocks into their common successor unless doing so would require too
- // many PHI instructions to be generated (currently only one PHI is allowed
- // per sunk instruction).
- //
- // We can use InstructionsToSink to discount values needing PHI-merging that will
- // actually be sunk in a later iteration. This allows us to be more
- // aggressive in what we sink. This does allow a false positive where we
- // sink presuming a later value will also be sunk, but stop half way through
- // and never actually sink it which means we produce more PHIs than intended.
- // This is unlikely in practice though.
+ // Edges couldn't be split.
+ return false;
+ Changed = true;
+ }
+
+ // Now that we've analyzed all potential sinking candidates, perform the
+ // actual sink. We iteratively sink the last non-terminator of the source
+ // blocks into their common successor unless doing so would require too
+ // many PHI instructions to be generated (currently only one PHI is allowed
+ // per sunk instruction).
+ //
+ // We can use InstructionsToSink to discount values needing PHI-merging that will
+ // actually be sunk in a later iteration. This allows us to be more
+ // aggressive in what we sink. This does allow a false positive where we
+ // sink presuming a later value will also be sunk, but stop half way through
+ // and never actually sink it which means we produce more PHIs than intended.
+ // This is unlikely in practice though.
unsigned SinkIdx = 0;
for (; SinkIdx != ScanIdx; ++SinkIdx) {
- LLVM_DEBUG(dbgs() << "SINK: Sink: "
- << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
- << "\n");
-
- // Because we've sunk every instruction in turn, the current instruction to
- // sink is always at index 0.
- LRI.reset();
- if (!ProfitableToSinkInstruction(LRI)) {
- // Too many PHIs would be created.
- LLVM_DEBUG(
- dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
- break;
- }
-
+ LLVM_DEBUG(dbgs() << "SINK: Sink: "
+ << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
+ << "\n");
+
+ // Because we've sunk every instruction in turn, the current instruction to
+ // sink is always at index 0.
+ LRI.reset();
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Too many PHIs would be created.
+ LLVM_DEBUG(
+ dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ break;
+ }
+
if (!sinkLastInstruction(UnconditionalPreds)) {
LLVM_DEBUG(
dbgs()
@@ -2043,76 +2043,76 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
}
NumSinkCommonInstrs++;
- Changed = true;
- }
+ Changed = true;
+ }
if (SinkIdx != 0)
++NumSinkCommonCode;
- return Changed;
-}
-
-/// Determine if we can hoist a sole store instruction out of a
-/// conditional block.
-///
-/// We are looking for code like the following:
-/// BrBB:
-/// store i32 %add, i32* %arrayidx2
-/// ... // No other stores or function calls (we could be calling a memory
-/// ... // function).
-/// %cmp = icmp ult %x, %y
-/// br i1 %cmp, label %EndBB, label %ThenBB
-/// ThenBB:
-/// store i32 %add5, i32* %arrayidx2
-/// br label EndBB
-/// EndBB:
-/// ...
-/// We are going to transform this into:
-/// BrBB:
-/// store i32 %add, i32* %arrayidx2
-/// ... //
-/// %cmp = icmp ult %x, %y
-/// %add.add5 = select i1 %cmp, i32 %add, %add5
-/// store i32 %add.add5, i32* %arrayidx2
-/// ...
-///
-/// \return The pointer to the value of the previous store if the store can be
-/// hoisted into the predecessor block. 0 otherwise.
-static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
- BasicBlock *StoreBB, BasicBlock *EndBB) {
- StoreInst *StoreToHoist = dyn_cast<StoreInst>(I);
- if (!StoreToHoist)
- return nullptr;
-
- // Volatile or atomic.
- if (!StoreToHoist->isSimple())
- return nullptr;
-
- Value *StorePtr = StoreToHoist->getPointerOperand();
-
- // Look for a store to the same pointer in BrBB.
- unsigned MaxNumInstToLookAt = 9;
+ return Changed;
+}
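
A hedged sketch of the profitability rule used above (hypothetical shapes, not from this patch): each sunk instruction is allowed to introduce at most one new PHI, so candidates whose operands differ in more than one position are skipped.

    // Profitable: only the stored value differs, so sinking the store
    // needs a single PHI for that operand.
    void profitable(bool c, int a, int b, int *p) {
      if (c) *p = a;
      else   *p = b;
    }

    // Not profitable: both the value and the pointer differ, which would
    // require two PHIs for one sunk store and trips the NumPHIInsts <= 1
    // limit checked in ProfitableToSinkInstruction.
    void unprofitable(bool c, int a, int b, int *p, int *q) {
      if (c) *p = a;
      else   *q = b;
    }
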
+
+/// Determine if we can hoist a sole store instruction out of a
+/// conditional block.
+///
+/// We are looking for code like the following:
+/// BrBB:
+/// store i32 %add, i32* %arrayidx2
+/// ... // No other stores or function calls (we could be calling a memory
+/// ... // function).
+/// %cmp = icmp ult %x, %y
+/// br i1 %cmp, label %EndBB, label %ThenBB
+/// ThenBB:
+/// store i32 %add5, i32* %arrayidx2
+/// br label EndBB
+/// EndBB:
+/// ...
+/// We are going to transform this into:
+/// BrBB:
+/// store i32 %add, i32* %arrayidx2
+/// ... //
+/// %cmp = icmp ult %x, %y
+/// %add.add5 = select i1 %cmp, i32 %add, %add5
+/// store i32 %add.add5, i32* %arrayidx2
+/// ...
+///
+/// \return The pointer to the value of the previous store if the store can be
+/// hoisted into the predecessor block. 0 otherwise.
+static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
+ BasicBlock *StoreBB, BasicBlock *EndBB) {
+ StoreInst *StoreToHoist = dyn_cast<StoreInst>(I);
+ if (!StoreToHoist)
+ return nullptr;
+
+ // Volatile or atomic.
+ if (!StoreToHoist->isSimple())
+ return nullptr;
+
+ Value *StorePtr = StoreToHoist->getPointerOperand();
+
+ // Look for a store to the same pointer in BrBB.
+ unsigned MaxNumInstToLookAt = 9;
// Skip pseudo probe intrinsic calls which are not really killing any memory
// accesses.
for (Instruction &CurI : reverse(BrBB->instructionsWithoutDebug(true))) {
- if (!MaxNumInstToLookAt)
- break;
- --MaxNumInstToLookAt;
-
- // Could be calling an instruction that affects memory like free().
- if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI))
- return nullptr;
-
- if (auto *SI = dyn_cast<StoreInst>(&CurI)) {
- // Found the previous store; make sure it stores to the same location.
- if (SI->getPointerOperand() == StorePtr)
- // Found the previous store, return its value operand.
- return SI->getValueOperand();
- return nullptr; // Unknown store.
- }
- }
-
- return nullptr;
-}
-
+ if (!MaxNumInstToLookAt)
+ break;
+ --MaxNumInstToLookAt;
+
+ // Could be calling an instruction that affects memory like free().
+ if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI))
+ return nullptr;
+
+ if (auto *SI = dyn_cast<StoreInst>(&CurI)) {
+ // Found the previous store; make sure it stores to the same location.
+ if (SI->getPointerOperand() == StorePtr)
+ // Found the previous store, return its value operand.
+ return SI->getValueOperand();
+ return nullptr; // Unknown store.
+ }
+ }
+
+ return nullptr;
+}
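
A brief, hedged sketch of what this helper accepts versus rejects (hypothetical names and shapes): the conditional store may only be speculated when the predecessor already has a simple store to the same pointer and nothing in between may write memory.

    extern void may_write_memory();  // hypothetical: a call that may clobber *p

    // Accepted: BrBB already stores to *p and the scan above finds that
    // prior store, so the conditional store can later become an
    // unconditional store of a select.
    void ok(int *p, int add, int add5, bool cmp) {
      *p = add;             // prior store in BrBB
      if (cmp) *p = add5;   // -> *p = cmp ? add5 : add;
    }

    // Rejected: the intervening call may write memory, so the scan
    // returns nullptr and the store stays in the conditional block.
    void blocked(int *p, int add, int add5, bool cmp) {
      *p = add;
      may_write_memory();   // mayHaveSideEffects() and not a store
      if (cmp) *p = add5;
    }
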
+
/// Estimate the cost of the insertion(s) and check that the PHI nodes can be
/// converted to selects.
static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
@@ -2172,86 +2172,86 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
-/// Speculate a conditional basic block flattening the CFG.
-///
-/// Note that this is a very risky transform currently. Speculating
-/// instructions like this is most often not desirable. Instead, there is an MI
-/// pass which can do it with full awareness of the resource constraints.
-/// However, some cases are "obvious" and should be done directly. An example of
-/// this is speculating a single, reasonably cheap instruction.
-///
-/// There is only one distinct advantage to flattening the CFG at the IR level:
-/// it makes very common but simplistic optimizations such as are common in
-/// instcombine and the DAG combiner more powerful by removing CFG edges and
-/// modeling their effects with easier to reason about SSA value graphs.
-///
-///
-/// An illustration of this transform is turning this IR:
-/// \code
-/// BB:
-/// %cmp = icmp ult %x, %y
-/// br i1 %cmp, label %EndBB, label %ThenBB
-/// ThenBB:
-/// %sub = sub %x, %y
-/// br label %EndBB
-/// EndBB:
-/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ]
-/// ...
-/// \endcode
-///
-/// Into this IR:
-/// \code
-/// BB:
-/// %cmp = icmp ult %x, %y
-/// %sub = sub %x, %y
-/// %cond = select i1 %cmp, 0, %sub
-/// ...
-/// \endcode
-///
-/// \returns true if the conditional block is removed.
-bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
- const TargetTransformInfo &TTI) {
- // Be conservative for now. FP select instruction can often be expensive.
- Value *BrCond = BI->getCondition();
- if (isa<FCmpInst>(BrCond))
- return false;
-
- BasicBlock *BB = BI->getParent();
- BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0);
+/// Speculate a conditional basic block flattening the CFG.
+///
+/// Note that this is a very risky transform currently. Speculating
+/// instructions like this is most often not desirable. Instead, there is an MI
+/// pass which can do it with full awareness of the resource constraints.
+/// However, some cases are "obvious" and should be done directly. An example of
+/// this is speculating a single, reasonably cheap instruction.
+///
+/// There is only one distinct advantage to flattening the CFG at the IR level:
+/// it makes very common but simplistic optimizations such as are common in
+/// instcombine and the DAG combiner more powerful by removing CFG edges and
+/// modeling their effects with easier to reason about SSA value graphs.
+///
+///
+/// An illustration of this transform is turning this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// br i1 %cmp, label %EndBB, label %ThenBB
+/// ThenBB:
+/// %sub = sub %x, %y
+/// br label %EndBB
+/// EndBB:
+/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ]
+/// ...
+/// \endcode
+///
+/// Into this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// %sub = sub %x, %y
+/// %cond = select i1 %cmp, 0, %sub
+/// ...
+/// \endcode
+///
+/// \returns true if the conditional block is removed.
+bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const TargetTransformInfo &TTI) {
+ // Be conservative for now. FP select instruction can often be expensive.
+ Value *BrCond = BI->getCondition();
+ if (isa<FCmpInst>(BrCond))
+ return false;
+
+ BasicBlock *BB = BI->getParent();
+ BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0);
int BudgetRemaining =
PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
-
- // If ThenBB is actually on the false edge of the conditional branch, remember
- // to swap the select operands later.
- bool Invert = false;
- if (ThenBB != BI->getSuccessor(0)) {
- assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?");
- Invert = true;
- }
- assert(EndBB == BI->getSuccessor(!Invert) && "No edge to end block");
-
- // Keep a count of how many times instructions are used within ThenBB when
- // they are candidates for sinking into ThenBB. Specifically:
- // - They are defined in BB, and
- // - They have no side effects, and
- // - All of their uses are in ThenBB.
- SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
-
- SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
-
- unsigned SpeculatedInstructions = 0;
- Value *SpeculatedStoreValue = nullptr;
- StoreInst *SpeculatedStore = nullptr;
- for (BasicBlock::iterator BBI = ThenBB->begin(),
- BBE = std::prev(ThenBB->end());
- BBI != BBE; ++BBI) {
- Instruction *I = &*BBI;
- // Skip debug info.
- if (isa<DbgInfoIntrinsic>(I)) {
- SpeculatedDbgIntrinsics.push_back(I);
- continue;
- }
-
+
+ // If ThenBB is actually on the false edge of the conditional branch, remember
+ // to swap the select operands later.
+ bool Invert = false;
+ if (ThenBB != BI->getSuccessor(0)) {
+ assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?");
+ Invert = true;
+ }
+ assert(EndBB == BI->getSuccessor(!Invert) && "No edge to end block");
+
+ // Keep a count of how many times instructions are used within ThenBB when
+ // they are candidates for sinking into ThenBB. Specifically:
+ // - They are defined in BB, and
+ // - They have no side effects, and
+ // - All of their uses are in ThenBB.
+ SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
+
+ SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
+
+ unsigned SpeculatedInstructions = 0;
+ Value *SpeculatedStoreValue = nullptr;
+ StoreInst *SpeculatedStore = nullptr;
+ for (BasicBlock::iterator BBI = ThenBB->begin(),
+ BBE = std::prev(ThenBB->end());
+ BBI != BBE; ++BBI) {
+ Instruction *I = &*BBI;
+ // Skip debug info.
+ if (isa<DbgInfoIntrinsic>(I)) {
+ SpeculatedDbgIntrinsics.push_back(I);
+ continue;
+ }
+
// Skip pseudo probes. The consequence is we lose track of the branch
// probability for ThenBB, which is fine since the optimization here takes
// place regardless of the branch probability.
@@ -2260,51 +2260,51 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
continue;
}
- // Only speculatively execute a single instruction (not counting the
- // terminator) for now.
- ++SpeculatedInstructions;
- if (SpeculatedInstructions > 1)
- return false;
-
- // Don't hoist the instruction if it's unsafe or expensive.
- if (!isSafeToSpeculativelyExecute(I) &&
- !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
- I, BB, ThenBB, EndBB))))
- return false;
- if (!SpeculatedStoreValue &&
- ComputeSpeculationCost(I, TTI) >
- PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
- return false;
-
- // Store the store speculation candidate.
- if (SpeculatedStoreValue)
- SpeculatedStore = cast<StoreInst>(I);
-
- // Do not hoist the instruction if any of its operands are defined but not
- // used in BB. The transformation will prevent the operand from
- // being sunk into the use block.
- for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
- Instruction *OpI = dyn_cast<Instruction>(*i);
- if (!OpI || OpI->getParent() != BB || OpI->mayHaveSideEffects())
- continue; // Not a candidate for sinking.
-
- ++SinkCandidateUseCounts[OpI];
- }
- }
-
- // Consider any sink candidates which are only used in ThenBB as costs for
- // speculation. Note that while we iterate over a DenseMap here, we are summing
- // and so iteration order isn't significant.
- for (SmallDenseMap<Instruction *, unsigned, 4>::iterator
- I = SinkCandidateUseCounts.begin(),
- E = SinkCandidateUseCounts.end();
- I != E; ++I)
- if (I->first->hasNUses(I->second)) {
- ++SpeculatedInstructions;
- if (SpeculatedInstructions > 1)
- return false;
- }
-
+ // Only speculatively execute a single instruction (not counting the
+ // terminator) for now.
+ ++SpeculatedInstructions;
+ if (SpeculatedInstructions > 1)
+ return false;
+
+ // Don't hoist the instruction if it's unsafe or expensive.
+ if (!isSafeToSpeculativelyExecute(I) &&
+ !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
+ I, BB, ThenBB, EndBB))))
+ return false;
+ if (!SpeculatedStoreValue &&
+ ComputeSpeculationCost(I, TTI) >
+ PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
+ return false;
+
+ // Store the store speculation candidate.
+ if (SpeculatedStoreValue)
+ SpeculatedStore = cast<StoreInst>(I);
+
+ // Do not hoist the instruction if any of its operands are defined but not
+ // used in BB. The transformation will prevent the operand from
+ // being sunk into the use block.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
+ Instruction *OpI = dyn_cast<Instruction>(*i);
+ if (!OpI || OpI->getParent() != BB || OpI->mayHaveSideEffects())
+ continue; // Not a candidate for sinking.
+
+ ++SinkCandidateUseCounts[OpI];
+ }
+ }
+
+ // Consider any sink candidates which are only used in ThenBB as costs for
+ // speculation. Note that while we iterate over a DenseMap here, we are summing
+ // and so iteration order isn't significant.
+ for (SmallDenseMap<Instruction *, unsigned, 4>::iterator
+ I = SinkCandidateUseCounts.begin(),
+ E = SinkCandidateUseCounts.end();
+ I != E; ++I)
+ if (I->first->hasNUses(I->second)) {
+ ++SpeculatedInstructions;
+ if (SpeculatedInstructions > 1)
+ return false;
+ }
+
// Check that we can insert the selects and that it's not too expensive to do
// so.
bool Convert = SpeculatedStore != nullptr;
@@ -2312,379 +2312,379 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
SpeculatedInstructions,
BudgetRemaining, TTI);
if (!Convert || BudgetRemaining < 0)
- return false;
-
- // If we get here, we can hoist the instruction and if-convert.
- LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
-
- // Insert a select of the value of the speculated store.
- if (SpeculatedStoreValue) {
- IRBuilder<NoFolder> Builder(BI);
- Value *TrueV = SpeculatedStore->getValueOperand();
- Value *FalseV = SpeculatedStoreValue;
- if (Invert)
- std::swap(TrueV, FalseV);
- Value *S = Builder.CreateSelect(
- BrCond, TrueV, FalseV, "spec.store.select", BI);
- SpeculatedStore->setOperand(0, S);
- SpeculatedStore->applyMergedLocation(BI->getDebugLoc(),
- SpeculatedStore->getDebugLoc());
- }
-
- // Metadata can be dependent on the condition we are hoisting above.
- // Conservatively strip all metadata on the instruction. Drop the debug loc
- // to avoid making it appear as if the condition is a constant, which would
- // be misleading while debugging.
- for (auto &I : *ThenBB) {
- if (!SpeculatedStoreValue || &I != SpeculatedStore)
- I.setDebugLoc(DebugLoc());
- I.dropUnknownNonDebugMetadata();
- }
-
- // Hoist the instructions.
- BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(),
- ThenBB->begin(), std::prev(ThenBB->end()));
-
- // Insert selects and rewrite the PHI operands.
- IRBuilder<NoFolder> Builder(BI);
- for (PHINode &PN : EndBB->phis()) {
- unsigned OrigI = PN.getBasicBlockIndex(BB);
- unsigned ThenI = PN.getBasicBlockIndex(ThenBB);
- Value *OrigV = PN.getIncomingValue(OrigI);
- Value *ThenV = PN.getIncomingValue(ThenI);
-
- // Skip PHIs which are trivial.
- if (OrigV == ThenV)
- continue;
-
- // Create a select whose true value is the speculatively executed value and
- // false value is the pre-existing value. Swap them if the branch
- // destinations were inverted.
- Value *TrueV = ThenV, *FalseV = OrigV;
- if (Invert)
- std::swap(TrueV, FalseV);
- Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, "spec.select", BI);
- PN.setIncomingValue(OrigI, V);
- PN.setIncomingValue(ThenI, V);
- }
-
- // Remove speculated dbg intrinsics.
- // FIXME: Is it possible to do this in a more elegant way? Moving/merging the
- // dbg value for the different flows and inserting it after the select.
- for (Instruction *I : SpeculatedDbgIntrinsics)
- I->eraseFromParent();
-
- ++NumSpeculations;
- return true;
-}
-
-/// Return true if we can thread a branch across this block.
-static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
- int Size = 0;
-
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- if (Size > MaxSmallBlockSize)
- return false; // Don't clone large BB's.
+ return false;
+
+ // If we get here, we can hoist the instruction and if-convert.
+ LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
+
+ // Insert a select of the value of the speculated store.
+ if (SpeculatedStoreValue) {
+ IRBuilder<NoFolder> Builder(BI);
+ Value *TrueV = SpeculatedStore->getValueOperand();
+ Value *FalseV = SpeculatedStoreValue;
+ if (Invert)
+ std::swap(TrueV, FalseV);
+ Value *S = Builder.CreateSelect(
+ BrCond, TrueV, FalseV, "spec.store.select", BI);
+ SpeculatedStore->setOperand(0, S);
+ SpeculatedStore->applyMergedLocation(BI->getDebugLoc(),
+ SpeculatedStore->getDebugLoc());
+ }
+
+ // Metadata can be dependent on the condition we are hoisting above.
+ // Conservatively strip all metadata on the instruction. Drop the debug loc
+ // to avoid making it appear as if the condition is a constant, which would
+ // be misleading while debugging.
+ for (auto &I : *ThenBB) {
+ if (!SpeculatedStoreValue || &I != SpeculatedStore)
+ I.setDebugLoc(DebugLoc());
+ I.dropUnknownNonDebugMetadata();
+ }
+
+ // Hoist the instructions.
+ BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(),
+ ThenBB->begin(), std::prev(ThenBB->end()));
+
+ // Insert selects and rewrite the PHI operands.
+ IRBuilder<NoFolder> Builder(BI);
+ for (PHINode &PN : EndBB->phis()) {
+ unsigned OrigI = PN.getBasicBlockIndex(BB);
+ unsigned ThenI = PN.getBasicBlockIndex(ThenBB);
+ Value *OrigV = PN.getIncomingValue(OrigI);
+ Value *ThenV = PN.getIncomingValue(ThenI);
+
+ // Skip PHIs which are trivial.
+ if (OrigV == ThenV)
+ continue;
+
+ // Create a select whose true value is the speculatively executed value and
+ // false value is the pre-existing value. Swap them if the branch
+ // destinations were inverted.
+ Value *TrueV = ThenV, *FalseV = OrigV;
+ if (Invert)
+ std::swap(TrueV, FalseV);
+ Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, "spec.select", BI);
+ PN.setIncomingValue(OrigI, V);
+ PN.setIncomingValue(ThenI, V);
+ }
+
+ // Remove speculated dbg intrinsics.
+ // FIXME: Is it possible to do this in a more elegant way? Moving/merging the
+ // dbg value for the different flows and inserting it after the select.
+ for (Instruction *I : SpeculatedDbgIntrinsics)
+ I->eraseFromParent();
+
+ ++NumSpeculations;
+ return true;
+}
+
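
At the source level, the speculated-store case handled above amounts to replacing a guarded store with an unconditional store of a select. A minimal C++ sketch (illustrative names only, not code from this patch; it assumes the pointer is already known dereferenceable, as the pass checks):

    // Minimal sketch, assuming *p is known dereferenceable; names are invented.
    #include <cstdio>

    void before(bool c, int v, int *p) {
      if (c)
        *p = v;          // conditional store in the "then" block
    }

    void after(bool c, int v, int *p) {
      *p = c ? v : *p;   // "spec.store.select": the store runs unconditionally
    }

    int main() {
      int x = 1, y = 1;
      before(false, 9, &x);
      after(false, 9, &y);
      std::printf("%d %d\n", x, y);   // both stay 1
      return 0;
    }
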
+/// Return true if we can thread a branch across this block.
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+ int Size = 0;
+
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ if (Size > MaxSmallBlockSize)
+ return false; // Don't clone large BB's.
// Can't fold blocks that contain noduplicate or convergent calls.
if (CallInst *CI = dyn_cast<CallInst>(&I))
if (CI->cannotDuplicate() || CI->isConvergent())
return false;
-    // We will delete PHIs while threading, so PHIs should not be accounted
-    // for in the block's size.
- if (!isa<PHINode>(I))
- ++Size;
-
- // We can only support instructions that do not define values that are
- // live outside of the current basic block.
- for (User *U : I.users()) {
- Instruction *UI = cast<Instruction>(U);
- if (UI->getParent() != BB || isa<PHINode>(UI))
- return false;
- }
-
- // Looks ok, continue checking.
- }
-
- return true;
-}
-
-/// If we have a conditional branch on a PHI node value that is defined in the
-/// same block as the branch and if any PHI entries are constants, thread edges
-/// corresponding to that entry to be branches to their ultimate destination.
+    // We will delete PHIs while threading, so PHIs should not be accounted
+    // for in the block's size.
+ if (!isa<PHINode>(I))
+ ++Size;
+
+ // We can only support instructions that do not define values that are
+ // live outside of the current basic block.
+ for (User *U : I.users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != BB || isa<PHINode>(UI))
+ return false;
+ }
+
+ // Looks ok, continue checking.
+ }
+
+ return true;
+}
+
+/// If we have a conditional branch on a PHI node value that is defined in the
+/// same block as the branch and if any PHI entries are constants, thread edges
+/// corresponding to that entry to be branches to their ultimate destination.
static bool FoldCondBranchOnPHI(BranchInst *BI, DomTreeUpdater *DTU,
const DataLayout &DL, AssumptionCache *AC) {
- BasicBlock *BB = BI->getParent();
- PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
- // NOTE: we currently cannot transform this case if the PHI node is used
- // outside of the block.
- if (!PN || PN->getParent() != BB || !PN->hasOneUse())
- return false;
-
- // Degenerate case of a single entry PHI.
- if (PN->getNumIncomingValues() == 1) {
- FoldSingleEntryPHINodes(PN->getParent());
- return true;
- }
-
- // Now we know that this block has multiple preds and two succs.
- if (!BlockIsSimpleEnoughToThreadThrough(BB))
- return false;
-
- // Okay, this is a simple enough basic block. See if any phi values are
- // constants.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
- if (!CB || !CB->getType()->isIntegerTy(1))
- continue;
-
- // Okay, we now know that all edges from PredBB should be revectored to
- // branch to RealDest.
- BasicBlock *PredBB = PN->getIncomingBlock(i);
- BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
-
- if (RealDest == BB)
- continue; // Skip self loops.
- // Skip if the predecessor's terminator is an indirect branch.
- if (isa<IndirectBrInst>(PredBB->getTerminator()))
- continue;
-
+ BasicBlock *BB = BI->getParent();
+ PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
+ // NOTE: we currently cannot transform this case if the PHI node is used
+ // outside of the block.
+ if (!PN || PN->getParent() != BB || !PN->hasOneUse())
+ return false;
+
+ // Degenerate case of a single entry PHI.
+ if (PN->getNumIncomingValues() == 1) {
+ FoldSingleEntryPHINodes(PN->getParent());
+ return true;
+ }
+
+ // Now we know that this block has multiple preds and two succs.
+ if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ return false;
+
+ // Okay, this is a simple enough basic block. See if any phi values are
+ // constants.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
+ if (!CB || !CB->getType()->isIntegerTy(1))
+ continue;
+
+ // Okay, we now know that all edges from PredBB should be revectored to
+ // branch to RealDest.
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+
+ if (RealDest == BB)
+ continue; // Skip self loops.
+ // Skip if the predecessor's terminator is an indirect branch.
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ continue;
+
SmallVector<DominatorTree::UpdateType, 3> Updates;
- // The dest block might have PHI nodes, other predecessors and other
- // difficult cases. Instead of being smart about this, just insert a new
- // block that jumps to the destination block, effectively splitting
- // the edge we are about to create.
- BasicBlock *EdgeBB =
- BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge",
- RealDest->getParent(), RealDest);
- BranchInst *CritEdgeBranch = BranchInst::Create(RealDest, EdgeBB);
+ // The dest block might have PHI nodes, other predecessors and other
+ // difficult cases. Instead of being smart about this, just insert a new
+ // block that jumps to the destination block, effectively splitting
+ // the edge we are about to create.
+ BasicBlock *EdgeBB =
+ BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge",
+ RealDest->getParent(), RealDest);
+ BranchInst *CritEdgeBranch = BranchInst::Create(RealDest, EdgeBB);
Updates.push_back({DominatorTree::Insert, EdgeBB, RealDest});
- CritEdgeBranch->setDebugLoc(BI->getDebugLoc());
-
- // Update PHI nodes.
- AddPredecessorToBlock(RealDest, EdgeBB, BB);
-
- // BB may have instructions that are being threaded over. Clone these
- // instructions into EdgeBB. We know that there will be no uses of the
- // cloned instructions outside of EdgeBB.
- BasicBlock::iterator InsertPt = EdgeBB->begin();
- DenseMap<Value *, Value *> TranslateMap; // Track translated values.
- for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
- if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
- TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
- continue;
- }
- // Clone the instruction.
- Instruction *N = BBI->clone();
- if (BBI->hasName())
- N->setName(BBI->getName() + ".c");
-
- // Update operands due to translation.
- for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) {
- DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(*i);
- if (PI != TranslateMap.end())
- *i = PI->second;
- }
-
- // Check for trivial simplification.
- if (Value *V = SimplifyInstruction(N, {DL, nullptr, nullptr, AC})) {
- if (!BBI->use_empty())
- TranslateMap[&*BBI] = V;
- if (!N->mayHaveSideEffects()) {
- N->deleteValue(); // Instruction folded away, don't need actual inst
- N = nullptr;
- }
- } else {
- if (!BBI->use_empty())
- TranslateMap[&*BBI] = N;
- }
- if (N) {
- // Insert the new instruction into its new home.
- EdgeBB->getInstList().insert(InsertPt, N);
-
- // Register the new instruction with the assumption cache if necessary.
- if (AC && match(N, m_Intrinsic<Intrinsic::assume>()))
- AC->registerAssumption(cast<IntrinsicInst>(N));
- }
- }
-
- // Loop over all of the edges from PredBB to BB, changing them to branch
- // to EdgeBB instead.
- Instruction *PredBBTI = PredBB->getTerminator();
- for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
- if (PredBBTI->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB);
- PredBBTI->setSuccessor(i, EdgeBB);
- }
-
+ CritEdgeBranch->setDebugLoc(BI->getDebugLoc());
+
+ // Update PHI nodes.
+ AddPredecessorToBlock(RealDest, EdgeBB, BB);
+
+ // BB may have instructions that are being threaded over. Clone these
+ // instructions into EdgeBB. We know that there will be no uses of the
+ // cloned instructions outside of EdgeBB.
+ BasicBlock::iterator InsertPt = EdgeBB->begin();
+ DenseMap<Value *, Value *> TranslateMap; // Track translated values.
+ for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+ TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
+ continue;
+ }
+ // Clone the instruction.
+ Instruction *N = BBI->clone();
+ if (BBI->hasName())
+ N->setName(BBI->getName() + ".c");
+
+ // Update operands due to translation.
+ for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) {
+ DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(*i);
+ if (PI != TranslateMap.end())
+ *i = PI->second;
+ }
+
+ // Check for trivial simplification.
+ if (Value *V = SimplifyInstruction(N, {DL, nullptr, nullptr, AC})) {
+ if (!BBI->use_empty())
+ TranslateMap[&*BBI] = V;
+ if (!N->mayHaveSideEffects()) {
+ N->deleteValue(); // Instruction folded away, don't need actual inst
+ N = nullptr;
+ }
+ } else {
+ if (!BBI->use_empty())
+ TranslateMap[&*BBI] = N;
+ }
+ if (N) {
+ // Insert the new instruction into its new home.
+ EdgeBB->getInstList().insert(InsertPt, N);
+
+ // Register the new instruction with the assumption cache if necessary.
+ if (AC && match(N, m_Intrinsic<Intrinsic::assume>()))
+ AC->registerAssumption(cast<IntrinsicInst>(N));
+ }
+ }
+
+ // Loop over all of the edges from PredBB to BB, changing them to branch
+ // to EdgeBB instead.
+ Instruction *PredBBTI = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
+ if (PredBBTI->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB);
+ PredBBTI->setSuccessor(i, EdgeBB);
+ }
+
Updates.push_back({DominatorTree::Insert, PredBB, EdgeBB});
Updates.push_back({DominatorTree::Delete, PredBB, BB});
if (DTU)
DTU->applyUpdates(Updates);
- // Recurse, simplifying any other constants.
+ // Recurse, simplifying any other constants.
return FoldCondBranchOnPHI(BI, DTU, DL, AC) || true;
- }
-
- return false;
-}
-
-/// Given a BB that starts with the specified two-entry PHI node,
-/// see if we can eliminate it.
-static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
+ }
+
+ return false;
+}
+
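
For orientation only, a hand-written C++ shape of what FoldCondBranchOnPHI exploits (identifiers are invented): along the edge where the phi feeding the branch is the constant true, the test is redundant, so that edge can be rerouted straight to its ultimate destination.

    #include <cstdio>

    static void taken()    { std::puts("taken"); }
    static void nottaken() { std::puts("not taken"); }

    void sketch(bool fromFastPath, bool computed) {
      bool cond = fromFastPath ? true : computed;  // two-entry phi on 'cond'
      if (cond)                                    // branch on that phi
        taken();
      else
        nottaken();
    }

    int main() {
      sketch(true, false);    // fast-path edge: always ends up in taken()
      sketch(false, false);   // other edge: still needs the test
      return 0;
    }
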
+/// Given a BB that starts with the specified two-entry PHI node,
+/// see if we can eliminate it.
+static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
DomTreeUpdater *DTU, const DataLayout &DL) {
- // Ok, this is a two entry PHI node. Check to see if this is a simple "if
- // statement", which has a very simple dominance structure. Basically, we
- // are trying to find the condition that is being branched on, which
- // subsequently causes this merge to happen. We really want control
- // dependence information for this check, but simplifycfg can't keep it up
- // to date, and this catches most of the cases we care about anyway.
- BasicBlock *BB = PN->getParent();
-
- BasicBlock *IfTrue, *IfFalse;
- Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
- if (!IfCond ||
- // Don't bother if the branch will be constant folded trivially.
- isa<ConstantInt>(IfCond))
- return false;
-
- // Okay, we found that we can merge this two-entry phi node into a select.
- // Doing so would require us to fold *all* two entry phi nodes in this block.
- // At some point this becomes non-profitable (particularly if the target
- // doesn't support cmov's). Only do this transformation if there are two or
- // fewer PHI nodes in this block.
- unsigned NumPhis = 0;
- for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
- if (NumPhis > 2)
- return false;
-
- // Loop over the PHI's seeing if we can promote them all to select
- // instructions. While we are at it, keep track of the instructions
- // that need to be moved to the dominating block.
- SmallPtrSet<Instruction *, 4> AggressiveInsts;
- int BudgetRemaining =
- TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
-
+ // Ok, this is a two entry PHI node. Check to see if this is a simple "if
+ // statement", which has a very simple dominance structure. Basically, we
+ // are trying to find the condition that is being branched on, which
+ // subsequently causes this merge to happen. We really want control
+ // dependence information for this check, but simplifycfg can't keep it up
+ // to date, and this catches most of the cases we care about anyway.
+ BasicBlock *BB = PN->getParent();
+
+ BasicBlock *IfTrue, *IfFalse;
+ Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
+ if (!IfCond ||
+ // Don't bother if the branch will be constant folded trivially.
+ isa<ConstantInt>(IfCond))
+ return false;
+
+ // Okay, we found that we can merge this two-entry phi node into a select.
+ // Doing so would require us to fold *all* two entry phi nodes in this block.
+ // At some point this becomes non-profitable (particularly if the target
+ // doesn't support cmov's). Only do this transformation if there are two or
+ // fewer PHI nodes in this block.
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
+ if (NumPhis > 2)
+ return false;
+
+ // Loop over the PHI's seeing if we can promote them all to select
+ // instructions. While we are at it, keep track of the instructions
+ // that need to be moved to the dominating block.
+ SmallPtrSet<Instruction *, 4> AggressiveInsts;
+ int BudgetRemaining =
+ TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+
bool Changed = false;
- for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
- PHINode *PN = cast<PHINode>(II++);
- if (Value *V = SimplifyInstruction(PN, {DL, PN})) {
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
+ for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
+ PHINode *PN = cast<PHINode>(II++);
+ if (Value *V = SimplifyInstruction(PN, {DL, PN})) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
Changed = true;
- continue;
- }
-
- if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts,
- BudgetRemaining, TTI) ||
- !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts,
- BudgetRemaining, TTI))
+ continue;
+ }
+
+ if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts,
+ BudgetRemaining, TTI) ||
+ !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts,
+ BudgetRemaining, TTI))
return Changed;
- }
-
- // If we folded the first phi, PN dangles at this point. Refresh it. If
- // we ran out of PHIs then we simplified them all.
- PN = dyn_cast<PHINode>(BB->begin());
- if (!PN)
- return true;
-
- // Return true if at least one of these is a 'not', and another is either
- // a 'not' too, or a constant.
- auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) {
- if (!match(V0, m_Not(m_Value())))
- std::swap(V0, V1);
- auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant());
- return match(V0, m_Not(m_Value())) && match(V1, Invertible);
- };
-
- // Don't fold i1 branches on PHIs which contain binary operators, unless one
-  // of the incoming values is a 'not' and another one is freely invertible.
- // These can often be turned into switches and other things.
- if (PN->getType()->isIntegerTy(1) &&
- (isa<BinaryOperator>(PN->getIncomingValue(0)) ||
- isa<BinaryOperator>(PN->getIncomingValue(1)) ||
- isa<BinaryOperator>(IfCond)) &&
- !CanHoistNotFromBothValues(PN->getIncomingValue(0),
- PN->getIncomingValue(1)))
+ }
+
+ // If we folded the first phi, PN dangles at this point. Refresh it. If
+ // we ran out of PHIs then we simplified them all.
+ PN = dyn_cast<PHINode>(BB->begin());
+ if (!PN)
+ return true;
+
+ // Return true if at least one of these is a 'not', and another is either
+ // a 'not' too, or a constant.
+ auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) {
+ if (!match(V0, m_Not(m_Value())))
+ std::swap(V0, V1);
+ auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant());
+ return match(V0, m_Not(m_Value())) && match(V1, Invertible);
+ };
+
+ // Don't fold i1 branches on PHIs which contain binary operators, unless one
+  // of the incoming values is a 'not' and another one is freely invertible.
+ // These can often be turned into switches and other things.
+ if (PN->getType()->isIntegerTy(1) &&
+ (isa<BinaryOperator>(PN->getIncomingValue(0)) ||
+ isa<BinaryOperator>(PN->getIncomingValue(1)) ||
+ isa<BinaryOperator>(IfCond)) &&
+ !CanHoistNotFromBothValues(PN->getIncomingValue(0),
+ PN->getIncomingValue(1)))
return Changed;
-
- // If all PHI nodes are promotable, check to make sure that all instructions
- // in the predecessor blocks can be promoted as well. If not, we won't be able
- // to get rid of the control flow, so it's not worth promoting to select
- // instructions.
- BasicBlock *DomBlock = nullptr;
- BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
- BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
- if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) {
- IfBlock1 = nullptr;
- } else {
- DomBlock = *pred_begin(IfBlock1);
- for (BasicBlock::iterator I = IfBlock1->begin(); !I->isTerminator(); ++I)
+
+ // If all PHI nodes are promotable, check to make sure that all instructions
+ // in the predecessor blocks can be promoted as well. If not, we won't be able
+ // to get rid of the control flow, so it's not worth promoting to select
+ // instructions.
+ BasicBlock *DomBlock = nullptr;
+ BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
+ BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
+ if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) {
+ IfBlock1 = nullptr;
+ } else {
+ DomBlock = *pred_begin(IfBlock1);
+ for (BasicBlock::iterator I = IfBlock1->begin(); !I->isTerminator(); ++I)
if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I) &&
!isa<PseudoProbeInst>(I)) {
- // This is not an aggressive instruction that we can promote.
- // Because of this, we won't be able to get rid of the control flow, so
- // the xform is not worth it.
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
return Changed;
- }
- }
-
- if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
- IfBlock2 = nullptr;
- } else {
- DomBlock = *pred_begin(IfBlock2);
- for (BasicBlock::iterator I = IfBlock2->begin(); !I->isTerminator(); ++I)
+ }
+ }
+
+ if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
+ IfBlock2 = nullptr;
+ } else {
+ DomBlock = *pred_begin(IfBlock2);
+ for (BasicBlock::iterator I = IfBlock2->begin(); !I->isTerminator(); ++I)
if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I) &&
!isa<PseudoProbeInst>(I)) {
- // This is not an aggressive instruction that we can promote.
- // Because of this, we won't be able to get rid of the control flow, so
- // the xform is not worth it.
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
return Changed;
- }
- }
- assert(DomBlock && "Failed to find root DomBlock");
-
- LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
- << " T: " << IfTrue->getName()
- << " F: " << IfFalse->getName() << "\n");
-
- // If we can still promote the PHI nodes after this gauntlet of tests,
- // do all of the PHI's now.
- Instruction *InsertPt = DomBlock->getTerminator();
- IRBuilder<NoFolder> Builder(InsertPt);
-
- // Move all 'aggressive' instructions, which are defined in the
- // conditional parts of the if's up to the dominating block.
- if (IfBlock1)
- hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);
- if (IfBlock2)
- hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);
-
- // Propagate fast-math-flags from phi nodes to replacement selects.
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
- if (isa<FPMathOperator>(PN))
- Builder.setFastMathFlags(PN->getFastMathFlags());
-
- // Change the PHI node into a select instruction.
- Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
- Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
-
- Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
- PN->replaceAllUsesWith(Sel);
- Sel->takeName(PN);
- PN->eraseFromParent();
- }
-
- // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
- // has been flattened. Change DomBlock to jump directly to our new block to
- // avoid other simplifycfg's kicking in on the diamond.
- Instruction *OldTI = DomBlock->getTerminator();
- Builder.SetInsertPoint(OldTI);
- Builder.CreateBr(BB);
+ }
+ }
+ assert(DomBlock && "Failed to find root DomBlock");
+
+ LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
+ << " T: " << IfTrue->getName()
+ << " F: " << IfFalse->getName() << "\n");
+
+ // If we can still promote the PHI nodes after this gauntlet of tests,
+ // do all of the PHI's now.
+ Instruction *InsertPt = DomBlock->getTerminator();
+ IRBuilder<NoFolder> Builder(InsertPt);
+
+ // Move all 'aggressive' instructions, which are defined in the
+ // conditional parts of the if's up to the dominating block.
+ if (IfBlock1)
+ hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);
+ if (IfBlock2)
+ hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);
+
+ // Propagate fast-math-flags from phi nodes to replacement selects.
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ if (isa<FPMathOperator>(PN))
+ Builder.setFastMathFlags(PN->getFastMathFlags());
+
+ // Change the PHI node into a select instruction.
+ Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+ Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
+
+ Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
+ PN->replaceAllUsesWith(Sel);
+ Sel->takeName(PN);
+ PN->eraseFromParent();
+ }
+
+ // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
+ // has been flattened. Change DomBlock to jump directly to our new block to
+ // avoid other simplifycfg's kicking in on the diamond.
+ Instruction *OldTI = DomBlock->getTerminator();
+ Builder.SetInsertPoint(OldTI);
+ Builder.CreateBr(BB);
SmallVector<DominatorTree::UpdateType, 3> Updates;
if (DTU) {
@@ -2693,43 +2693,43 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
Updates.push_back({DominatorTree::Delete, DomBlock, Successor});
}
- OldTI->eraseFromParent();
+ OldTI->eraseFromParent();
if (DTU)
DTU->applyUpdates(Updates);
- return true;
-}
-
-/// If we found a conditional branch that goes to two returning blocks,
-/// try to merge them together into one return,
-/// introducing a select if the return values disagree.
-bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
- IRBuilder<> &Builder) {
+ return true;
+}
+
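
A minimal sketch of the diamond that FoldTwoEntryPHINode flattens (invented names, not from this patch): when both arms are cheap and speculatable, their instructions are hoisted into the dominating block and the two-entry phi becomes a select on the if-condition.

    #include <cstdio>

    int before(bool c, int a, int b) {
      int x;
      if (c)
        x = a + 1;       // "then" arm
      else
        x = b + 2;       // "else" arm
      return x;          // two-entry phi at the merge point
    }

    int after(bool c, int a, int b) {
      int t = a + 1;     // hoisted from the "then" arm
      int f = b + 2;     // hoisted from the "else" arm
      return c ? t : f;  // phi replaced by a select on the if-condition
    }

    int main() {
      std::printf("%d %d\n", before(true, 1, 2), after(true, 1, 2));   // 2 2
      return 0;
    }
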
+/// If we found a conditional branch that goes to two returning blocks,
+/// try to merge them together into one return,
+/// introducing a select if the return values disagree.
+bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
+ IRBuilder<> &Builder) {
auto *BB = BI->getParent();
- assert(BI->isConditional() && "Must be a conditional branch");
- BasicBlock *TrueSucc = BI->getSuccessor(0);
- BasicBlock *FalseSucc = BI->getSuccessor(1);
+ assert(BI->isConditional() && "Must be a conditional branch");
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
// NOTE: destinations may match, this could be degenerate uncond branch.
- ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
- ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
-
- // Check to ensure both blocks are empty (just a return) or optionally empty
- // with PHI nodes. If there are other instructions, merging would cause extra
- // computation on one path or the other.
- if (!TrueSucc->getFirstNonPHIOrDbg()->isTerminator())
- return false;
- if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
- return false;
-
- Builder.SetInsertPoint(BI);
- // Okay, we found a branch that is going to two return nodes. If
- // there is no return value for this function, just change the
- // branch into a return.
- if (FalseRet->getNumOperands() == 0) {
+ ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
+ ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
+
+ // Check to ensure both blocks are empty (just a return) or optionally empty
+ // with PHI nodes. If there are other instructions, merging would cause extra
+ // computation on one path or the other.
+ if (!TrueSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+ if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+
+ Builder.SetInsertPoint(BI);
+ // Okay, we found a branch that is going to two return nodes. If
+ // there is no return value for this function, just change the
+ // branch into a return.
+ if (FalseRet->getNumOperands() == 0) {
TrueSucc->removePredecessor(BB);
FalseSucc->removePredecessor(BB);
- Builder.CreateRetVoid();
- EraseTerminatorAndDCECond(BI);
+ Builder.CreateRetVoid();
+ EraseTerminatorAndDCECond(BI);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.push_back({DominatorTree::Delete, BB, TrueSucc});
@@ -2737,62 +2737,62 @@ bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
Updates.push_back({DominatorTree::Delete, BB, FalseSucc});
DTU->applyUpdates(Updates);
}
- return true;
- }
-
- // Otherwise, figure out what the true and false return values are
- // so we can insert a new select instruction.
- Value *TrueValue = TrueRet->getReturnValue();
- Value *FalseValue = FalseRet->getReturnValue();
-
- // Unwrap any PHI nodes in the return blocks.
- if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
- if (TVPN->getParent() == TrueSucc)
+ return true;
+ }
+
+ // Otherwise, figure out what the true and false return values are
+ // so we can insert a new select instruction.
+ Value *TrueValue = TrueRet->getReturnValue();
+ Value *FalseValue = FalseRet->getReturnValue();
+
+ // Unwrap any PHI nodes in the return blocks.
+ if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
+ if (TVPN->getParent() == TrueSucc)
TrueValue = TVPN->getIncomingValueForBlock(BB);
- if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
- if (FVPN->getParent() == FalseSucc)
+ if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
+ if (FVPN->getParent() == FalseSucc)
FalseValue = FVPN->getIncomingValueForBlock(BB);
-
- // In order for this transformation to be safe, we must be able to
- // unconditionally execute both operands to the return. This is
- // normally the case, but we could have a potentially-trapping
- // constant expression that prevents this transformation from being
- // safe.
- if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
- if (TCV->canTrap())
- return false;
- if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
- if (FCV->canTrap())
- return false;
-
- // Okay, we collected all the mapped values and checked them for sanity, and
-  // decided to really do this transformation. First, update the CFG.
+
+ // In order for this transformation to be safe, we must be able to
+ // unconditionally execute both operands to the return. This is
+ // normally the case, but we could have a potentially-trapping
+ // constant expression that prevents this transformation from being
+ // safe.
+ if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
+ if (TCV->canTrap())
+ return false;
+ if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
+ if (FCV->canTrap())
+ return false;
+
+ // Okay, we collected all the mapped values and checked them for sanity, and
+  // decided to really do this transformation. First, update the CFG.
TrueSucc->removePredecessor(BB);
FalseSucc->removePredecessor(BB);
-
- // Insert select instructions where needed.
- Value *BrCond = BI->getCondition();
- if (TrueValue) {
- // Insert a select if the results differ.
- if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
- } else if (isa<UndefValue>(TrueValue)) {
- TrueValue = FalseValue;
- } else {
- TrueValue =
- Builder.CreateSelect(BrCond, TrueValue, FalseValue, "retval", BI);
- }
- }
-
- Value *RI =
- !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
-
- (void)RI;
-
- LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
- << "\n " << *BI << "\nNewRet = " << *RI << "\nTRUEBLOCK: "
- << *TrueSucc << "\nFALSEBLOCK: " << *FalseSucc);
-
- EraseTerminatorAndDCECond(BI);
+
+ // Insert select instructions where needed.
+ Value *BrCond = BI->getCondition();
+ if (TrueValue) {
+ // Insert a select if the results differ.
+ if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
+ } else if (isa<UndefValue>(TrueValue)) {
+ TrueValue = FalseValue;
+ } else {
+ TrueValue =
+ Builder.CreateSelect(BrCond, TrueValue, FalseValue, "retval", BI);
+ }
+ }
+
+ Value *RI =
+ !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+
+ (void)RI;
+
+ LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+ << "\n " << *BI << "\nNewRet = " << *RI << "\nTRUEBLOCK: "
+ << *TrueSucc << "\nFALSEBLOCK: " << *FalseSucc);
+
+ EraseTerminatorAndDCECond(BI);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.push_back({DominatorTree::Delete, BB, TrueSucc});
@@ -2800,33 +2800,33 @@ bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
Updates.push_back({DominatorTree::Delete, BB, FalseSucc});
DTU->applyUpdates(Updates);
}
-
- return true;
-}
-
-/// Return true if either PBI or BI has branch weight available, and store
-/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does
-/// not have branch weight, use 1:1 as its weight.
-static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
- uint64_t &PredTrueWeight,
- uint64_t &PredFalseWeight,
- uint64_t &SuccTrueWeight,
- uint64_t &SuccFalseWeight) {
- bool PredHasWeights =
- PBI->extractProfMetadata(PredTrueWeight, PredFalseWeight);
- bool SuccHasWeights =
- BI->extractProfMetadata(SuccTrueWeight, SuccFalseWeight);
- if (PredHasWeights || SuccHasWeights) {
- if (!PredHasWeights)
- PredTrueWeight = PredFalseWeight = 1;
- if (!SuccHasWeights)
- SuccTrueWeight = SuccFalseWeight = 1;
- return true;
- } else {
- return false;
- }
-}
-
+
+ return true;
+}
+
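
Roughly, and only as an illustration of the shape handled here (names invented), a conditional branch to two return-only blocks collapses into a single return of a select when the returned values differ:

    #include <cstdio>

    int before(bool c, int a, int b) {
      if (c)
        return a;        // TrueSucc: nothing but a return
      return b;          // FalseSucc: nothing but a return
    }

    int after(bool c, int a, int b) {
      return c ? a : b;  // single return of a "retval" select
    }

    int main() {
      std::printf("%d %d\n", before(false, 1, 2), after(false, 1, 2));  // 2 2
      return 0;
    }
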
+/// Return true if either PBI or BI has branch weight available, and store
+/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does
+/// not have branch weight, use 1:1 as its weight.
+static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
+ uint64_t &PredTrueWeight,
+ uint64_t &PredFalseWeight,
+ uint64_t &SuccTrueWeight,
+ uint64_t &SuccFalseWeight) {
+ bool PredHasWeights =
+ PBI->extractProfMetadata(PredTrueWeight, PredFalseWeight);
+ bool SuccHasWeights =
+ BI->extractProfMetadata(SuccTrueWeight, SuccFalseWeight);
+ if (PredHasWeights || SuccHasWeights) {
+ if (!PredHasWeights)
+ PredTrueWeight = PredFalseWeight = 1;
+ if (!SuccHasWeights)
+ SuccTrueWeight = SuccFalseWeight = 1;
+ return true;
+ } else {
+ return false;
+ }
+}
+
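
A small standalone sketch of the 1:1 defaulting rule described above (this is not the LLVM API; std::optional merely stands in for the profile metadata): if only one of the two branches carries weights, the other is treated as 1:1 so the caller can still combine them.

    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <utility>

    using Weights = std::optional<std::pair<uint64_t, uint64_t>>;  // (true, false)

    bool combineWeights(Weights Pred, Weights Succ, uint64_t &PT, uint64_t &PF,
                        uint64_t &ST, uint64_t &SF) {
      if (!Pred && !Succ)
        return false;                 // neither branch has weights
      const std::pair<uint64_t, uint64_t> OneToOne{1, 1};
      std::tie(PT, PF) = Pred.value_or(OneToOne);
      std::tie(ST, SF) = Succ.value_or(OneToOne);
      return true;
    }

    int main() {
      uint64_t PT, PF, ST, SF;
      if (combineWeights(std::pair<uint64_t, uint64_t>{90, 10}, std::nullopt,
                         PT, PF, ST, SF))
        std::printf("%llu:%llu %llu:%llu\n", (unsigned long long)PT,
                    (unsigned long long)PF, (unsigned long long)ST,
                    (unsigned long long)SF);   // 90:10 1:1
      return 0;
    }
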
// Determine if the two branches share a common destination,
// and deduce a glue that we need to use to join branch's conditions
// to arrive at the common destination.
@@ -2967,91 +2967,91 @@ static bool PerformBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
return true;
}
-/// If this basic block is simple enough, and if a predecessor branches to us
-/// and one of our successors, fold the block into the predecessor and use
-/// logical operations to pick the right destination.
+/// If this basic block is simple enough, and if a predecessor branches to us
+/// and one of our successors, fold the block into the predecessor and use
+/// logical operations to pick the right destination.
bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
MemorySSAUpdater *MSSAU,
const TargetTransformInfo *TTI,
- unsigned BonusInstThreshold) {
+ unsigned BonusInstThreshold) {
// If this block ends with an unconditional branch,
// let SpeculativelyExecuteBB() deal with it.
if (!BI->isConditional())
return false;
- BasicBlock *BB = BI->getParent();
-
- const unsigned PredCount = pred_size(BB);
-
- bool Changed = false;
-
+ BasicBlock *BB = BI->getParent();
+
+ const unsigned PredCount = pred_size(BB);
+
+ bool Changed = false;
+
TargetTransformInfo::TargetCostKind CostKind =
BB->getParent()->hasMinSize() ? TargetTransformInfo::TCK_CodeSize
: TargetTransformInfo::TCK_SizeAndLatency;
-
+
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
-
- if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
- Cond->getParent() != BB || !Cond->hasOneUse())
- return Changed;
-
- // Only allow this transformation if computing the condition doesn't involve
- // too many instructions and these involved instructions can be executed
- // unconditionally. We denote all involved instructions except the condition
- // as "bonus instructions", and only allow this transformation when the
- // number of the bonus instructions we'll need to create when cloning into
- // each predecessor does not exceed a certain threshold.
- unsigned NumBonusInsts = 0;
+
+ if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
+ Cond->getParent() != BB || !Cond->hasOneUse())
+ return Changed;
+
+ // Only allow this transformation if computing the condition doesn't involve
+ // too many instructions and these involved instructions can be executed
+ // unconditionally. We denote all involved instructions except the condition
+ // as "bonus instructions", and only allow this transformation when the
+ // number of the bonus instructions we'll need to create when cloning into
+ // each predecessor does not exceed a certain threshold.
+ unsigned NumBonusInsts = 0;
for (Instruction &I : *BB) {
// Don't check the branch condition comparison itself.
if (&I == Cond)
- continue;
+ continue;
// Ignore dbg intrinsics, and the terminator.
if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
continue;
// I must be safe to execute unconditionally.
if (!isSafeToSpeculativelyExecute(&I))
- return Changed;
-
- // Account for the cost of duplicating this instruction into each
- // predecessor.
- NumBonusInsts += PredCount;
- // Early exits once we reach the limit.
- if (NumBonusInsts > BonusInstThreshold)
- return Changed;
- }
-
- // Cond is known to be a compare or binary operator. Check to make sure that
- // neither operand is a potentially-trapping constant expression.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
- if (CE->canTrap())
- return Changed;
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
- if (CE->canTrap())
- return Changed;
-
- // Finally, don't infinitely unroll conditional loops.
+ return Changed;
+
+ // Account for the cost of duplicating this instruction into each
+ // predecessor.
+ NumBonusInsts += PredCount;
+ // Early exits once we reach the limit.
+ if (NumBonusInsts > BonusInstThreshold)
+ return Changed;
+ }
+
+ // Cond is known to be a compare or binary operator. Check to make sure that
+ // neither operand is a potentially-trapping constant expression.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
+ if (CE->canTrap())
+ return Changed;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
+ if (CE->canTrap())
+ return Changed;
+
+ // Finally, don't infinitely unroll conditional loops.
if (is_contained(successors(BB), BB))
- return Changed;
-
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *PredBlock = *PI;
- BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
-
- // Check that we have two conditional branches. If there is a PHI node in
- // the common successor, verify that the same value flows in from both
- // blocks.
+ return Changed;
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBlock = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+
+ // Check that we have two conditional branches. If there is a PHI node in
+ // the common successor, verify that the same value flows in from both
+ // blocks.
if (!PBI || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI))
- continue;
-
- // Determine if the two branches share a common destination.
+ continue;
+
+ // Determine if the two branches share a common destination.
Instruction::BinaryOps Opc;
bool InvertPredCond;
if (auto Recepie = CheckIfCondBranchesShareCommonDestination(BI, PBI))
std::tie(Opc, InvertPredCond) = *Recepie;
else
continue;
-
+
// Check the cost of inserting the necessary logic before performing the
// transformation.
if (TTI) {
@@ -3060,712 +3060,712 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
if (InvertPredCond && (!PBI->getCondition()->hasOneUse() ||
!isa<CmpInst>(PBI->getCondition())))
Cost += TTI->getArithmeticInstrCost(Instruction::Xor, Ty, CostKind);
-
+
if (Cost > BranchFoldThreshold)
- continue;
- }
-
+ continue;
+ }
+
return PerformBranchToCommonDestFolding(BI, PBI, DTU, MSSAU);
- }
- return Changed;
-}
-
-// If there is only one store in BB1 and BB2, return it, otherwise return
-// nullptr.
-static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) {
- StoreInst *S = nullptr;
- for (auto *BB : {BB1, BB2}) {
- if (!BB)
- continue;
- for (auto &I : *BB)
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
- if (S)
- // Multiple stores seen.
- return nullptr;
- else
- S = SI;
- }
- }
- return S;
-}
-
-static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
- Value *AlternativeV = nullptr) {
- // PHI is going to be a PHI node that allows the value V that is defined in
- // BB to be referenced in BB's only successor.
- //
- // If AlternativeV is nullptr, the only value we care about in PHI is V. It
- // doesn't matter to us what the other operand is (it'll never get used). We
- // could just create a new PHI with an undef incoming value, but that could
- // increase register pressure if EarlyCSE/InstCombine can't fold it with some
- // other PHI. So here we directly look for some PHI in BB's successor with V
- // as an incoming operand. If we find one, we use it, else we create a new
- // one.
- //
- // If AlternativeV is not nullptr, we care about both incoming values in PHI.
- // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV]
- // where OtherBB is the single other predecessor of BB's only successor.
- PHINode *PHI = nullptr;
- BasicBlock *Succ = BB->getSingleSuccessor();
-
- for (auto I = Succ->begin(); isa<PHINode>(I); ++I)
- if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) {
- PHI = cast<PHINode>(I);
- if (!AlternativeV)
- break;
-
- assert(Succ->hasNPredecessors(2));
- auto PredI = pred_begin(Succ);
- BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
- if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
- break;
- PHI = nullptr;
- }
- if (PHI)
- return PHI;
-
- // If V is not an instruction defined in BB, just return it.
- if (!AlternativeV &&
- (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB))
- return V;
-
- PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front());
- PHI->addIncoming(V, BB);
- for (BasicBlock *PredBB : predecessors(Succ))
- if (PredBB != BB)
- PHI->addIncoming(
- AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB);
- return PHI;
-}
-
+ }
+ return Changed;
+}
+
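
A rough sketch of "folding into the predecessor" (hand-written, names invented): the predecessor's branch and this block's branch are glued into one condition, and the cheap "bonus" instruction feeding the condition is duplicated so it can execute unconditionally. The non-short-circuit & mirrors the single IR 'and' that replaces the two branches.

    #include <cstdio>

    static void commonDest() { std::puts("common dest"); }
    static void other()      { std::puts("other"); }

    void before(bool a, int x, int n) {
      if (a) {               // predecessor branch (PBI)
        int t = x + 1;       // "bonus" instruction feeding the condition
        if (t > n) {         // this block's branch (BI)
          commonDest();
          return;
        }
      }
      other();
    }

    void after(bool a, int x, int n) {
      int t = x + 1;         // bonus instruction hoisted into the predecessor
      if (a & (t > n)) {     // conditions joined, single branch
        commonDest();
        return;
      }
      other();
    }

    int main() {
      before(true, 5, 3);    // "common dest"
      after(true, 5, 3);     // "common dest"
      return 0;
    }
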
+// If there is only one store in BB1 and BB2, return it, otherwise return
+// nullptr.
+static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) {
+ StoreInst *S = nullptr;
+ for (auto *BB : {BB1, BB2}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (S)
+ // Multiple stores seen.
+ return nullptr;
+ else
+ S = SI;
+ }
+ }
+ return S;
+}
+
+static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
+ Value *AlternativeV = nullptr) {
+ // PHI is going to be a PHI node that allows the value V that is defined in
+ // BB to be referenced in BB's only successor.
+ //
+ // If AlternativeV is nullptr, the only value we care about in PHI is V. It
+ // doesn't matter to us what the other operand is (it'll never get used). We
+ // could just create a new PHI with an undef incoming value, but that could
+ // increase register pressure if EarlyCSE/InstCombine can't fold it with some
+ // other PHI. So here we directly look for some PHI in BB's successor with V
+ // as an incoming operand. If we find one, we use it, else we create a new
+ // one.
+ //
+ // If AlternativeV is not nullptr, we care about both incoming values in PHI.
+ // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV]
+ // where OtherBB is the single other predecessor of BB's only successor.
+ PHINode *PHI = nullptr;
+ BasicBlock *Succ = BB->getSingleSuccessor();
+
+ for (auto I = Succ->begin(); isa<PHINode>(I); ++I)
+ if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) {
+ PHI = cast<PHINode>(I);
+ if (!AlternativeV)
+ break;
+
+ assert(Succ->hasNPredecessors(2));
+ auto PredI = pred_begin(Succ);
+ BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
+ if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
+ break;
+ PHI = nullptr;
+ }
+ if (PHI)
+ return PHI;
+
+ // If V is not an instruction defined in BB, just return it.
+ if (!AlternativeV &&
+ (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB))
+ return V;
+
+ PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front());
+ PHI->addIncoming(V, BB);
+ for (BasicBlock *PredBB : predecessors(Succ))
+ if (PredBB != BB)
+ PHI->addIncoming(
+ AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB);
+ return PHI;
+}
+
static bool mergeConditionalStoreToAddress(
BasicBlock *PTB, BasicBlock *PFB, BasicBlock *QTB, BasicBlock *QFB,
BasicBlock *PostBB, Value *Address, bool InvertPCond, bool InvertQCond,
DomTreeUpdater *DTU, const DataLayout &DL, const TargetTransformInfo &TTI) {
- // For every pointer, there must be exactly two stores, one coming from
- // PTB or PFB, and the other from QTB or QFB. We don't support more than one
- // store (to any address) in PTB,PFB or QTB,QFB.
- // FIXME: We could relax this restriction with a bit more work and performance
- // testing.
- StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB);
- StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB);
- if (!PStore || !QStore)
- return false;
-
- // Now check the stores are compatible.
- if (!QStore->isUnordered() || !PStore->isUnordered())
- return false;
-
- // Check that sinking the store won't cause program behavior changes. Sinking
- // the store out of the Q blocks won't change any behavior as we're sinking
- // from a block to its unconditional successor. But we're moving a store from
- // the P blocks down through the middle block (QBI) and past both QFB and QTB.
- // So we need to check that there are no aliasing loads or stores in
- // QBI, QTB and QFB. We also need to check there are no conflicting memory
- // operations between PStore and the end of its parent block.
- //
- // The ideal way to do this is to query AliasAnalysis, but we don't
- // preserve AA currently so that is dangerous. Be super safe and just
- // check there are no other memory operations at all.
- for (auto &I : *QFB->getSinglePredecessor())
- if (I.mayReadOrWriteMemory())
- return false;
- for (auto &I : *QFB)
- if (&I != QStore && I.mayReadOrWriteMemory())
- return false;
- if (QTB)
- for (auto &I : *QTB)
- if (&I != QStore && I.mayReadOrWriteMemory())
- return false;
- for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end();
- I != E; ++I)
- if (&*I != PStore && I->mayReadOrWriteMemory())
- return false;
-
- // If we're not in aggressive mode, we only optimize if we have some
- // confidence that by optimizing we'll allow P and/or Q to be if-converted.
- auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef<StoreInst *> FreeStores) {
- if (!BB)
- return true;
- // Heuristic: if the block can be if-converted/phi-folded and the
- // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
- // thread this store.
- int BudgetRemaining =
- PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
- for (auto &I : BB->instructionsWithoutDebug()) {
- // Consider terminator instruction to be free.
- if (I.isTerminator())
- continue;
-      // If this is one of the stores that we want to speculate out of this BB,
-      // then don't count its cost; consider it to be free.
- if (auto *S = dyn_cast<StoreInst>(&I))
-        if (llvm::is_contained(FreeStores, S))
- continue;
-      // Else, we have a white-list of instructions that we are okay speculating.
- if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I))
- return false; // Not in white-list - not worthwhile folding.
- // And finally, if this is a non-free instruction that we are okay
- // speculating, ensure that we consider the speculation budget.
- BudgetRemaining -= TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
- if (BudgetRemaining < 0)
- return false; // Eagerly refuse to fold as soon as we're out of budget.
- }
- assert(BudgetRemaining >= 0 &&
- "When we run out of budget we will eagerly return from within the "
- "per-instruction loop.");
- return true;
- };
-
+ // For every pointer, there must be exactly two stores, one coming from
+ // PTB or PFB, and the other from QTB or QFB. We don't support more than one
+ // store (to any address) in PTB,PFB or QTB,QFB.
+ // FIXME: We could relax this restriction with a bit more work and performance
+ // testing.
+ StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB);
+ StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB);
+ if (!PStore || !QStore)
+ return false;
+
+ // Now check the stores are compatible.
+ if (!QStore->isUnordered() || !PStore->isUnordered())
+ return false;
+
+ // Check that sinking the store won't cause program behavior changes. Sinking
+ // the store out of the Q blocks won't change any behavior as we're sinking
+ // from a block to its unconditional successor. But we're moving a store from
+ // the P blocks down through the middle block (QBI) and past both QFB and QTB.
+ // So we need to check that there are no aliasing loads or stores in
+ // QBI, QTB and QFB. We also need to check there are no conflicting memory
+ // operations between PStore and the end of its parent block.
+ //
+ // The ideal way to do this is to query AliasAnalysis, but we don't
+ // preserve AA currently so that is dangerous. Be super safe and just
+ // check there are no other memory operations at all.
+ for (auto &I : *QFB->getSinglePredecessor())
+ if (I.mayReadOrWriteMemory())
+ return false;
+ for (auto &I : *QFB)
+ if (&I != QStore && I.mayReadOrWriteMemory())
+ return false;
+ if (QTB)
+ for (auto &I : *QTB)
+ if (&I != QStore && I.mayReadOrWriteMemory())
+ return false;
+ for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end();
+ I != E; ++I)
+ if (&*I != PStore && I->mayReadOrWriteMemory())
+ return false;
+
+ // If we're not in aggressive mode, we only optimize if we have some
+ // confidence that by optimizing we'll allow P and/or Q to be if-converted.
+ auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef<StoreInst *> FreeStores) {
+ if (!BB)
+ return true;
+ // Heuristic: if the block can be if-converted/phi-folded and the
+ // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
+ // thread this store.
+ int BudgetRemaining =
+ PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+ for (auto &I : BB->instructionsWithoutDebug()) {
+ // Consider terminator instruction to be free.
+ if (I.isTerminator())
+ continue;
+      // If this is one of the stores that we want to speculate out of this BB,
+      // then don't count its cost; consider it to be free.
+ if (auto *S = dyn_cast<StoreInst>(&I))
+        if (llvm::is_contained(FreeStores, S))
+ continue;
+      // Else, we have a white-list of instructions that we are okay speculating.
+ if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I))
+ return false; // Not in white-list - not worthwhile folding.
+ // And finally, if this is a non-free instruction that we are okay
+ // speculating, ensure that we consider the speculation budget.
+ BudgetRemaining -= TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+ if (BudgetRemaining < 0)
+ return false; // Eagerly refuse to fold as soon as we're out of budget.
+ }
+ assert(BudgetRemaining >= 0 &&
+ "When we run out of budget we will eagerly return from within the "
+ "per-instruction loop.");
+ return true;
+ };
+
const std::array<StoreInst *, 2> FreeStores = {PStore, QStore};
- if (!MergeCondStoresAggressively &&
- (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) ||
- !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores)))
- return false;
-
- // If PostBB has more than two predecessors, we need to split it so we can
- // sink the store.
- if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) {
- // We know that QFB's only successor is PostBB. And QFB has a single
- // predecessor. If QTB exists, then its only successor is also PostBB.
- // If QTB does not exist, then QFB's only predecessor has a conditional
- // branch to QFB and PostBB.
- BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor();
+ if (!MergeCondStoresAggressively &&
+ (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) ||
+ !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores)))
+ return false;
+
+ // If PostBB has more than two predecessors, we need to split it so we can
+ // sink the store.
+ if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) {
+ // We know that QFB's only successor is PostBB. And QFB has a single
+ // predecessor. If QTB exists, then its only successor is also PostBB.
+ // If QTB does not exist, then QFB's only predecessor has a conditional
+ // branch to QFB and PostBB.
+ BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor();
BasicBlock *NewBB =
SplitBlockPredecessors(PostBB, {QFB, TruePred}, "condstore.split", DTU);
- if (!NewBB)
- return false;
- PostBB = NewBB;
- }
-
- // OK, we're going to sink the stores to PostBB. The store has to be
- // conditional though, so first create the predicate.
- Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
- ->getCondition();
- Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator())
- ->getCondition();
-
- Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(),
- PStore->getParent());
- Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(),
- QStore->getParent(), PPHI);
-
- IRBuilder<> QB(&*PostBB->getFirstInsertionPt());
-
- Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond);
- Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond);
-
- if (InvertPCond)
- PPred = QB.CreateNot(PPred);
- if (InvertQCond)
- QPred = QB.CreateNot(QPred);
- Value *CombinedPred = QB.CreateOr(PPred, QPred);
-
+ if (!NewBB)
+ return false;
+ PostBB = NewBB;
+ }
+
+ // OK, we're going to sink the stores to PostBB. The store has to be
+ // conditional though, so first create the predicate.
+ Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
+ ->getCondition();
+ Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator())
+ ->getCondition();
+
+ Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(),
+ PStore->getParent());
+ Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(),
+ QStore->getParent(), PPHI);
+
+ IRBuilder<> QB(&*PostBB->getFirstInsertionPt());
+
+ Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond);
+ Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond);
+
+ if (InvertPCond)
+ PPred = QB.CreateNot(PPred);
+ if (InvertQCond)
+ QPred = QB.CreateNot(QPred);
+ Value *CombinedPred = QB.CreateOr(PPred, QPred);
+
auto *T = SplitBlockAndInsertIfThen(CombinedPred, &*QB.GetInsertPoint(),
/*Unreachable=*/false,
/*BranchWeights=*/nullptr, DTU);
- QB.SetInsertPoint(T);
- StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address));
- AAMDNodes AAMD;
- PStore->getAAMetadata(AAMD, /*Merge=*/false);
- PStore->getAAMetadata(AAMD, /*Merge=*/true);
- SI->setAAMetadata(AAMD);
- // Choose the minimum alignment. If we could prove both stores execute, we
-  // could use the biggest one. In this case, though, we only know that one of the
- // stores executes. And we don't know it's safe to take the alignment from a
- // store that doesn't execute.
- SI->setAlignment(std::min(PStore->getAlign(), QStore->getAlign()));
-
- QStore->eraseFromParent();
- PStore->eraseFromParent();
-
- return true;
-}
-
-static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
+ QB.SetInsertPoint(T);
+ StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address));
+ AAMDNodes AAMD;
+ PStore->getAAMetadata(AAMD, /*Merge=*/false);
+ PStore->getAAMetadata(AAMD, /*Merge=*/true);
+ SI->setAAMetadata(AAMD);
+ // Choose the minimum alignment. If we could prove both stores execute, we
+  // could use the biggest one. In this case, though, we only know that one of the
+ // stores executes. And we don't know it's safe to take the alignment from a
+ // store that doesn't execute.
+ SI->setAlignment(std::min(PStore->getAlign(), QStore->getAlign()));
+
+ QStore->eraseFromParent();
+ PStore->eraseFromParent();
+
+ return true;
+}
+
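
A rough source-level picture of the merge performed above (invented names; it assumes both stored values are already safe to compute up front, as the worthwhileness checks require): the two conditional stores to the same address become one store guarded by the OR of the conditions, with the stored value chosen by which path would have written last.

    #include <cstdio>

    void before(bool p, bool q, int a, int b, int *addr) {
      if (p)
        *addr = a;               // store on the P side
      if (q)
        *addr = b;               // store on the Q side (wins if both fire)
    }

    void after(bool p, bool q, int a, int b, int *addr) {
      if (p | q)                 // combined predicate
        *addr = q ? b : a;       // value selected by which path would have stored last
    }

    int main() {
      int x = 0, y = 0;
      before(true, false, 1, 2, &x);
      after(true, false, 1, 2, &y);
      std::printf("%d %d\n", x, y);   // 1 1
      return 0;
    }
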
+static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- // The intention here is to find diamonds or triangles (see below) where each
- // conditional block contains a store to the same address. Both of these
- // stores are conditional, so they can't be unconditionally sunk. But it may
- // be profitable to speculatively sink the stores into one merged store at the
- // end, and predicate the merged store on the union of the two conditions of
- // PBI and QBI.
- //
- // This can reduce the number of stores executed if both of the conditions are
- // true, and can allow the blocks to become small enough to be if-converted.
- // This optimization will also chain, so that ladders of test-and-set
- // sequences can be if-converted away.
- //
- // We only deal with simple diamonds or triangles:
- //
- // PBI or PBI or a combination of the two
- // / \ | \
- // PTB PFB | PFB
- // \ / | /
- // QBI QBI
- // / \ | \
- // QTB QFB | QFB
- // \ / | /
- // PostBB PostBB
- //
- // We model triangles as a type of diamond with a nullptr "true" block.
- // Triangles are canonicalized so that the fallthrough edge is represented by
- // a true condition, as in the diagram above.
- BasicBlock *PTB = PBI->getSuccessor(0);
- BasicBlock *PFB = PBI->getSuccessor(1);
- BasicBlock *QTB = QBI->getSuccessor(0);
- BasicBlock *QFB = QBI->getSuccessor(1);
- BasicBlock *PostBB = QFB->getSingleSuccessor();
-
- // Make sure we have a good guess for PostBB. If QTB's only successor is
- // QFB, then QFB is a better PostBB.
- if (QTB->getSingleSuccessor() == QFB)
- PostBB = QFB;
-
- // If we couldn't find a good PostBB, stop.
- if (!PostBB)
- return false;
-
- bool InvertPCond = false, InvertQCond = false;
- // Canonicalize fallthroughs to the true branches.
- if (PFB == QBI->getParent()) {
- std::swap(PFB, PTB);
- InvertPCond = true;
- }
- if (QFB == PostBB) {
- std::swap(QFB, QTB);
- InvertQCond = true;
- }
-
- // From this point on we can assume PTB or QTB may be fallthroughs but PFB
- // and QFB may not. Model fallthroughs as a nullptr block.
- if (PTB == QBI->getParent())
- PTB = nullptr;
- if (QTB == PostBB)
- QTB = nullptr;
-
- // Legality bailouts. We must have at least the non-fallthrough blocks and
- // the post-dominating block, and the non-fallthroughs must only have one
- // predecessor.
- auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) {
- return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S;
- };
- if (!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
- !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB))
- return false;
- if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
- (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
- return false;
- if (!QBI->getParent()->hasNUses(2))
- return false;
-
- // OK, this is a sequence of two diamonds or triangles.
- // Check if there are stores in PTB or PFB that are repeated in QTB or QFB.
- SmallPtrSet<Value *, 4> PStoreAddresses, QStoreAddresses;
- for (auto *BB : {PTB, PFB}) {
- if (!BB)
- continue;
- for (auto &I : *BB)
- if (StoreInst *SI = dyn_cast<StoreInst>(&I))
- PStoreAddresses.insert(SI->getPointerOperand());
- }
- for (auto *BB : {QTB, QFB}) {
- if (!BB)
- continue;
- for (auto &I : *BB)
- if (StoreInst *SI = dyn_cast<StoreInst>(&I))
- QStoreAddresses.insert(SI->getPointerOperand());
- }
-
- set_intersect(PStoreAddresses, QStoreAddresses);
- // set_intersect mutates PStoreAddresses in place. Rename it here to make it
- // clear what it contains.
- auto &CommonAddresses = PStoreAddresses;
-
- bool Changed = false;
- for (auto *Address : CommonAddresses)
+ const TargetTransformInfo &TTI) {
+ // The intention here is to find diamonds or triangles (see below) where each
+ // conditional block contains a store to the same address. Both of these
+ // stores are conditional, so they can't be unconditionally sunk. But it may
+ // be profitable to speculatively sink the stores into one merged store at the
+ // end, and predicate the merged store on the union of the two conditions of
+ // PBI and QBI.
+ //
+ // This can reduce the number of stores executed if both of the conditions are
+ // true, and can allow the blocks to become small enough to be if-converted.
+ // This optimization will also chain, so that ladders of test-and-set
+ // sequences can be if-converted away.
+ //
+ // We only deal with simple diamonds or triangles:
+ //
+ // PBI or PBI or a combination of the two
+ // / \ | \
+ // PTB PFB | PFB
+ // \ / | /
+ // QBI QBI
+ // / \ | \
+ // QTB QFB | QFB
+ // \ / | /
+ // PostBB PostBB
+ //
+ // We model triangles as a type of diamond with a nullptr "true" block.
+ // Triangles are canonicalized so that the fallthrough edge is represented by
+ // a true condition, as in the diagram above.
+ BasicBlock *PTB = PBI->getSuccessor(0);
+ BasicBlock *PFB = PBI->getSuccessor(1);
+ BasicBlock *QTB = QBI->getSuccessor(0);
+ BasicBlock *QFB = QBI->getSuccessor(1);
+ BasicBlock *PostBB = QFB->getSingleSuccessor();
+
+ // Make sure we have a good guess for PostBB. If QTB's only successor is
+ // QFB, then QFB is a better PostBB.
+ if (QTB->getSingleSuccessor() == QFB)
+ PostBB = QFB;
+
+ // If we couldn't find a good PostBB, stop.
+ if (!PostBB)
+ return false;
+
+ bool InvertPCond = false, InvertQCond = false;
+ // Canonicalize fallthroughs to the true branches.
+ if (PFB == QBI->getParent()) {
+ std::swap(PFB, PTB);
+ InvertPCond = true;
+ }
+ if (QFB == PostBB) {
+ std::swap(QFB, QTB);
+ InvertQCond = true;
+ }
+
+ // From this point on we can assume PTB or QTB may be fallthroughs but PFB
+ // and QFB may not. Model fallthroughs as a nullptr block.
+ if (PTB == QBI->getParent())
+ PTB = nullptr;
+ if (QTB == PostBB)
+ QTB = nullptr;
+
+ // Legality bailouts. We must have at least the non-fallthrough blocks and
+ // the post-dominating block, and the non-fallthroughs must only have one
+ // predecessor.
+ auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) {
+ return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S;
+ };
+ if (!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
+ !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB))
+ return false;
+ if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
+ (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
+ return false;
+ if (!QBI->getParent()->hasNUses(2))
+ return false;
+
+ // OK, this is a sequence of two diamonds or triangles.
+ // Check if there are stores in PTB or PFB that are repeated in QTB or QFB.
+ SmallPtrSet<Value *, 4> PStoreAddresses, QStoreAddresses;
+ for (auto *BB : {PTB, PFB}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ PStoreAddresses.insert(SI->getPointerOperand());
+ }
+ for (auto *BB : {QTB, QFB}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ QStoreAddresses.insert(SI->getPointerOperand());
+ }
+
+ set_intersect(PStoreAddresses, QStoreAddresses);
+ // set_intersect mutates PStoreAddresses in place. Rename it here to make it
+ // clear what it contains.
+ auto &CommonAddresses = PStoreAddresses;
+
+ bool Changed = false;
+ for (auto *Address : CommonAddresses)
Changed |=
mergeConditionalStoreToAddress(PTB, PFB, QTB, QFB, PostBB, Address,
InvertPCond, InvertQCond, DTU, DL, TTI);
- return Changed;
-}
-
-/// If the previous block ended with a widenable branch, determine if reusing
-/// the target block is profitable and legal. This will have the effect of
-/// "widening" PBI, but doesn't require us to reason about hoisting safety.
+ return Changed;
+}
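As the comment above notes, the transform chains: a "ladder" of test-and-set blocks, each conditionally storing the same flag, can be merged rung by rung and eventually if-converted. A hypothetical input of that shape (names and constants invented) looks like:

  define void @ladder(i1 %c0, i1 %c1, i1 %c2, i8* %flag) {
  entry:
    br i1 %c0, label %set0, label %test1
  set0:
    store i8 1, i8* %flag
    br label %test1
  test1:
    br i1 %c1, label %set1, label %test2
  set1:
    store i8 1, i8* %flag
    br label %test2
  test2:
    br i1 %c2, label %set2, label %exit
  set2:
    store i8 1, i8* %flag
    br label %exit
  exit:
    ret void
  }

After repeated application, the three conditional stores collapse, roughly speaking, into a single store of 1 guarded by the OR of the three conditions.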
+
+/// If the previous block ended with a widenable branch, determine if reusing
+/// the target block is profitable and legal. This will have the effect of
+/// "widening" PBI, but doesn't require us to reason about hoisting safety.
static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
DomTreeUpdater *DTU) {
- // TODO: This can be generalized in two important ways:
- // 1) We can allow phi nodes in IfFalseBB and simply reuse all the input
- // values from the PBI edge.
- // 2) We can sink side effecting instructions into BI's fallthrough
-  //    successor provided they don't contribute to the computation of
- // BI's condition.
- Value *CondWB, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (!parseWidenableBranch(PBI, CondWB, WC, IfTrueBB, IfFalseBB) ||
- IfTrueBB != BI->getParent() || !BI->getParent()->getSinglePredecessor())
- return false;
- if (!IfFalseBB->phis().empty())
- return false; // TODO
- // Use lambda to lazily compute expensive condition after cheap ones.
- auto NoSideEffects = [](BasicBlock &BB) {
- return !llvm::any_of(BB, [](const Instruction &I) {
- return I.mayWriteToMemory() || I.mayHaveSideEffects();
- });
- };
- if (BI->getSuccessor(1) != IfFalseBB && // no inf looping
- BI->getSuccessor(1)->getTerminatingDeoptimizeCall() && // profitability
- NoSideEffects(*BI->getParent())) {
+ // TODO: This can be generalized in two important ways:
+ // 1) We can allow phi nodes in IfFalseBB and simply reuse all the input
+ // values from the PBI edge.
+ // 2) We can sink side effecting instructions into BI's fallthrough
+  //    successor provided they don't contribute to the computation of
+ // BI's condition.
+ Value *CondWB, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (!parseWidenableBranch(PBI, CondWB, WC, IfTrueBB, IfFalseBB) ||
+ IfTrueBB != BI->getParent() || !BI->getParent()->getSinglePredecessor())
+ return false;
+ if (!IfFalseBB->phis().empty())
+ return false; // TODO
+ // Use lambda to lazily compute expensive condition after cheap ones.
+ auto NoSideEffects = [](BasicBlock &BB) {
+ return !llvm::any_of(BB, [](const Instruction &I) {
+ return I.mayWriteToMemory() || I.mayHaveSideEffects();
+ });
+ };
+ if (BI->getSuccessor(1) != IfFalseBB && // no inf looping
+ BI->getSuccessor(1)->getTerminatingDeoptimizeCall() && // profitability
+ NoSideEffects(*BI->getParent())) {
auto *OldSuccessor = BI->getSuccessor(1);
OldSuccessor->removePredecessor(BI->getParent());
- BI->setSuccessor(1, IfFalseBB);
+ BI->setSuccessor(1, IfFalseBB);
if (DTU)
DTU->applyUpdates(
{{DominatorTree::Insert, BI->getParent(), IfFalseBB},
{DominatorTree::Delete, BI->getParent(), OldSuccessor}});
- return true;
- }
- if (BI->getSuccessor(0) != IfFalseBB && // no inf looping
- BI->getSuccessor(0)->getTerminatingDeoptimizeCall() && // profitability
- NoSideEffects(*BI->getParent())) {
+ return true;
+ }
+ if (BI->getSuccessor(0) != IfFalseBB && // no inf looping
+ BI->getSuccessor(0)->getTerminatingDeoptimizeCall() && // profitability
+ NoSideEffects(*BI->getParent())) {
auto *OldSuccessor = BI->getSuccessor(0);
OldSuccessor->removePredecessor(BI->getParent());
- BI->setSuccessor(0, IfFalseBB);
+ BI->setSuccessor(0, IfFalseBB);
if (DTU)
DTU->applyUpdates(
{{DominatorTree::Insert, BI->getParent(), IfFalseBB},
{DominatorTree::Delete, BI->getParent(), OldSuccessor}});
- return true;
- }
- return false;
-}
-
-/// If we have a conditional branch as a predecessor of another block,
-/// this function tries to simplify it. We know
-/// that PBI and BI are both conditional branches, and BI is in one of the
-/// successor blocks of PBI - PBI branches to BI.
-static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
+ return true;
+ }
+ return false;
+}
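A concrete (hypothetical) instance of the pattern this handles: PBI is a widenable branch guarding %guarded, BI's failing edge leads to a second deoptimizing block, and %guarded has no side effects. The intrinsic declarations are the standard guard/deoptimize ones; the remaining names are invented for illustration.

  declare i1 @llvm.experimental.widenable.condition()
  declare void @llvm.experimental.deoptimize.isVoid(...)

  define void @widen(i1 %c1, i1 %c2) {
  entry:
    %wc = call i1 @llvm.experimental.widenable.condition()
    %guard = and i1 %c1, %wc
    br i1 %guard, label %guarded, label %deopt    ; PBI (widenable branch)
  guarded:                                        ; BI, no side effects
    br i1 %c2, label %cont, label %deopt2
  cont:
    ret void
  deopt:                                          ; IfFalseBB
    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
    ret void
  deopt2:
    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
    ret void
  }

The transform retargets %guarded's failing edge from %deopt2 to %deopt, effectively widening the original guard to cover %c2 as well; %deopt2 is left without predecessors and gets cleaned up separately.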
+
+/// If we have a conditional branch as a predecessor of another block,
+/// this function tries to simplify it. We know
+/// that PBI and BI are both conditional branches, and BI is in one of the
+/// successor blocks of PBI - PBI branches to BI.
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
DomTreeUpdater *DTU,
- const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- assert(PBI->isConditional() && BI->isConditional());
- BasicBlock *BB = BI->getParent();
-
- // If this block ends with a branch instruction, and if there is a
- // predecessor that ends on a branch of the same condition, make
- // this conditional branch redundant.
- if (PBI->getCondition() == BI->getCondition() &&
- PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
- // Okay, the outcome of this conditional branch is statically
- // knowable. If this block had a single pred, handle specially.
- if (BB->getSinglePredecessor()) {
- // Turn this into a branch on constant.
- bool CondIsTrue = PBI->getSuccessor(0) == BB;
- BI->setCondition(
- ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue));
- return true; // Nuke the branch on constant.
- }
-
- // Otherwise, if there are multiple predecessors, insert a PHI that merges
- // in the constant and simplify the block result. Subsequent passes of
- // simplifycfg will thread the block.
- if (BlockIsSimpleEnoughToThreadThrough(BB)) {
- pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
- PHINode *NewPN = PHINode::Create(
- Type::getInt1Ty(BB->getContext()), std::distance(PB, PE),
- BI->getCondition()->getName() + ".pr", &BB->front());
- // Okay, we're going to insert the PHI node. Since PBI is not the only
- // predecessor, compute the PHI'd conditional value for all of the preds.
- // Any predecessor where the condition is not computable we keep symbolic.
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI &&
- PBI->isConditional() && PBI->getCondition() == BI->getCondition() &&
- PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
- bool CondIsTrue = PBI->getSuccessor(0) == BB;
- NewPN->addIncoming(
- ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue),
- P);
- } else {
- NewPN->addIncoming(BI->getCondition(), P);
- }
- }
-
- BI->setCondition(NewPN);
- return true;
- }
- }
-
- // If the previous block ended with a widenable branch, determine if reusing
- // the target block is profitable and legal. This will have the effect of
-  // "widening" PBI, but doesn't require us to reason about hoisting safety.
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ assert(PBI->isConditional() && BI->isConditional());
+ BasicBlock *BB = BI->getParent();
+
+ // If this block ends with a branch instruction, and if there is a
+ // predecessor that ends on a branch of the same condition, make
+ // this conditional branch redundant.
+ if (PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ // Okay, the outcome of this conditional branch is statically
+ // knowable. If this block had a single pred, handle specially.
+ if (BB->getSinglePredecessor()) {
+ // Turn this into a branch on constant.
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ BI->setCondition(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue));
+ return true; // Nuke the branch on constant.
+ }
+
+ // Otherwise, if there are multiple predecessors, insert a PHI that merges
+ // in the constant and simplify the block result. Subsequent passes of
+ // simplifycfg will thread the block.
+ if (BlockIsSimpleEnoughToThreadThrough(BB)) {
+ pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+ PHINode *NewPN = PHINode::Create(
+ Type::getInt1Ty(BB->getContext()), std::distance(PB, PE),
+ BI->getCondition()->getName() + ".pr", &BB->front());
+ // Okay, we're going to insert the PHI node. Since PBI is not the only
+ // predecessor, compute the PHI'd conditional value for all of the preds.
+ // Any predecessor where the condition is not computable we keep symbolic.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI &&
+ PBI->isConditional() && PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ NewPN->addIncoming(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue),
+ P);
+ } else {
+ NewPN->addIncoming(BI->getCondition(), P);
+ }
+ }
+
+ BI->setCondition(NewPN);
+ return true;
+ }
+ }
+
+ // If the previous block ended with a widenable branch, determine if reusing
+ // the target block is profitable and legal. This will have the effect of
+  // "widening" PBI, but doesn't require us to reason about hoisting safety.
if (tryWidenCondBranchToCondBranch(PBI, BI, DTU))
- return true;
-
- if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
- if (CE->canTrap())
- return false;
-
- // If both branches are conditional and both contain stores to the same
- // address, remove the stores from the conditionals and create a conditional
- // merged store at the end.
+ return true;
+
+ if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+ if (CE->canTrap())
+ return false;
+
+ // If both branches are conditional and both contain stores to the same
+ // address, remove the stores from the conditionals and create a conditional
+ // merged store at the end.
if (MergeCondStores && mergeConditionalStores(PBI, BI, DTU, DL, TTI))
- return true;
-
- // If this is a conditional branch in an empty block, and if any
- // predecessors are a conditional branch to one of our destinations,
- // fold the conditions into logical ops and one cond br.
-
- // Ignore dbg intrinsics.
- if (&*BB->instructionsWithoutDebug().begin() != BI)
- return false;
-
- int PBIOp, BIOp;
- if (PBI->getSuccessor(0) == BI->getSuccessor(0)) {
- PBIOp = 0;
- BIOp = 0;
- } else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) {
- PBIOp = 0;
- BIOp = 1;
- } else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) {
- PBIOp = 1;
- BIOp = 0;
- } else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) {
- PBIOp = 1;
- BIOp = 1;
- } else {
- return false;
- }
-
- // Check to make sure that the other destination of this branch
- // isn't BB itself. If so, this is an infinite loop that will
- // keep getting unwound.
- if (PBI->getSuccessor(PBIOp) == BB)
- return false;
-
- // Do not perform this transformation if it would require
- // insertion of a large number of select instructions. For targets
- // without predication/cmovs, this is a big pessimization.
-
- // Also do not perform this transformation if any phi node in the common
- // destination block can trap when reached by BB or PBB (PR17073). In that
- // case, it would be unsafe to hoist the operation into a select instruction.
-
- BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+ return true;
+
+ // If this is a conditional branch in an empty block, and if any
+ // predecessors are a conditional branch to one of our destinations,
+ // fold the conditions into logical ops and one cond br.
+
+ // Ignore dbg intrinsics.
+ if (&*BB->instructionsWithoutDebug().begin() != BI)
+ return false;
+
+ int PBIOp, BIOp;
+ if (PBI->getSuccessor(0) == BI->getSuccessor(0)) {
+ PBIOp = 0;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) {
+ PBIOp = 0;
+ BIOp = 1;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) {
+ PBIOp = 1;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) {
+ PBIOp = 1;
+ BIOp = 1;
+ } else {
+ return false;
+ }
+
+ // Check to make sure that the other destination of this branch
+ // isn't BB itself. If so, this is an infinite loop that will
+ // keep getting unwound.
+ if (PBI->getSuccessor(PBIOp) == BB)
+ return false;
+
+ // Do not perform this transformation if it would require
+ // insertion of a large number of select instructions. For targets
+ // without predication/cmovs, this is a big pessimization.
+
+ // Also do not perform this transformation if any phi node in the common
+ // destination block can trap when reached by BB or PBB (PR17073). In that
+ // case, it would be unsafe to hoist the operation into a select instruction.
+
+ BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
BasicBlock *RemovedDest = PBI->getSuccessor(PBIOp ^ 1);
- unsigned NumPhis = 0;
- for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II);
- ++II, ++NumPhis) {
- if (NumPhis > 2) // Disable this xform.
- return false;
-
- PHINode *PN = cast<PHINode>(II);
- Value *BIV = PN->getIncomingValueForBlock(BB);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
- if (CE->canTrap())
- return false;
-
- unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
- Value *PBIV = PN->getIncomingValue(PBBIdx);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
- if (CE->canTrap())
- return false;
- }
-
- // Finally, if everything is ok, fold the branches to logical ops.
- BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
-
- LLVM_DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
- << "AND: " << *BI->getParent());
-
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II);
+ ++II, ++NumPhis) {
+ if (NumPhis > 2) // Disable this xform.
+ return false;
+
+ PHINode *PN = cast<PHINode>(II);
+ Value *BIV = PN->getIncomingValueForBlock(BB);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
+ if (CE->canTrap())
+ return false;
+
+ unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN->getIncomingValue(PBBIdx);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
+ if (CE->canTrap())
+ return false;
+ }
+
+ // Finally, if everything is ok, fold the branches to logical ops.
+ BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
+
+ LLVM_DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
+ << "AND: " << *BI->getParent());
+
SmallVector<DominatorTree::UpdateType, 5> Updates;
- // If OtherDest *is* BB, then BB is a basic block with a single conditional
- // branch in it, where one edge (OtherDest) goes back to itself but the other
- // exits. We don't *know* that the program avoids the infinite loop
- // (even though that seems likely). If we do this xform naively, we'll end up
- // recursively unpeeling the loop. Since we know that (after the xform is
-  // done) the block *is* infinite if reached, we just make it an obviously
- // infinite loop with no cond branch.
- if (OtherDest == BB) {
-    // Insert it at the end of the function, because it's either dead code,
-    // or it won't matter if it's hot. :)
- BasicBlock *InfLoopBlock =
- BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
- BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ // If OtherDest *is* BB, then BB is a basic block with a single conditional
+ // branch in it, where one edge (OtherDest) goes back to itself but the other
+ // exits. We don't *know* that the program avoids the infinite loop
+ // (even though that seems likely). If we do this xform naively, we'll end up
+ // recursively unpeeling the loop. Since we know that (after the xform is
+  // done) the block *is* infinite if reached, we just make it an obviously
+ // infinite loop with no cond branch.
+ if (OtherDest == BB) {
+    // Insert it at the end of the function, because it's either dead code,
+    // or it won't matter if it's hot. :)
+ BasicBlock *InfLoopBlock =
+ BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
+ BranchInst::Create(InfLoopBlock, InfLoopBlock);
Updates.push_back({DominatorTree::Insert, InfLoopBlock, InfLoopBlock});
- OtherDest = InfLoopBlock;
- }
-
- LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
-
- // BI may have other predecessors. Because of this, we leave
- // it alone, but modify PBI.
-
- // Make sure we get to CommonDest on True&True directions.
- Value *PBICond = PBI->getCondition();
- IRBuilder<NoFolder> Builder(PBI);
- if (PBIOp)
- PBICond = Builder.CreateNot(PBICond, PBICond->getName() + ".not");
-
- Value *BICond = BI->getCondition();
- if (BIOp)
- BICond = Builder.CreateNot(BICond, BICond->getName() + ".not");
-
- // Merge the conditions.
- Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
-
- // Modify PBI to branch on the new condition to the new dests.
- PBI->setCondition(Cond);
- PBI->setSuccessor(0, CommonDest);
- PBI->setSuccessor(1, OtherDest);
-
+ OtherDest = InfLoopBlock;
+ }
+
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // BI may have other predecessors. Because of this, we leave
+ // it alone, but modify PBI.
+
+ // Make sure we get to CommonDest on True&True directions.
+ Value *PBICond = PBI->getCondition();
+ IRBuilder<NoFolder> Builder(PBI);
+ if (PBIOp)
+ PBICond = Builder.CreateNot(PBICond, PBICond->getName() + ".not");
+
+ Value *BICond = BI->getCondition();
+ if (BIOp)
+ BICond = Builder.CreateNot(BICond, BICond->getName() + ".not");
+
+ // Merge the conditions.
+ Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
+
+ // Modify PBI to branch on the new condition to the new dests.
+ PBI->setCondition(Cond);
+ PBI->setSuccessor(0, CommonDest);
+ PBI->setSuccessor(1, OtherDest);
+
Updates.push_back({DominatorTree::Insert, PBI->getParent(), OtherDest});
Updates.push_back({DominatorTree::Delete, PBI->getParent(), RemovedDest});
if (DTU)
DTU->applyUpdates(Updates);
- // Update branch weight for PBI.
- uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
- uint64_t PredCommon, PredOther, SuccCommon, SuccOther;
- bool HasWeights =
- extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
- SuccTrueWeight, SuccFalseWeight);
- if (HasWeights) {
- PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
- PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
- SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
- SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
- // The weight to CommonDest should be PredCommon * SuccTotal +
- // PredOther * SuccCommon.
- // The weight to OtherDest should be PredOther * SuccOther.
- uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) +
- PredOther * SuccCommon,
- PredOther * SuccOther};
-    // Halve the weights if any of them cannot fit in a uint32_t
- FitWeights(NewWeights);
-
- setBranchWeights(PBI, NewWeights[0], NewWeights[1]);
- }
-
-  // OtherDest may have phi nodes. If so, add entries from PBI's
- // block that are identical to the entries for BI's block.
- AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
-
- // We know that the CommonDest already had an edge from PBI to
- // it. If it has PHIs though, the PHIs may have different
- // entries for BB and PBI's BB. If so, insert a select to make
- // them agree.
- for (PHINode &PN : CommonDest->phis()) {
- Value *BIV = PN.getIncomingValueForBlock(BB);
- unsigned PBBIdx = PN.getBasicBlockIndex(PBI->getParent());
- Value *PBIV = PN.getIncomingValue(PBBIdx);
- if (BIV != PBIV) {
- // Insert a select in PBI to pick the right value.
- SelectInst *NV = cast<SelectInst>(
- Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
- PN.setIncomingValue(PBBIdx, NV);
- // Although the select has the same condition as PBI, the original branch
- // weights for PBI do not apply to the new select because the select's
- // 'logical' edges are incoming edges of the phi that is eliminated, not
- // the outgoing edges of PBI.
- if (HasWeights) {
- uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
- uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
- uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
- uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
- // The weight to PredCommonDest should be PredCommon * SuccTotal.
- // The weight to PredOtherDest should be PredOther * SuccCommon.
- uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
- PredOther * SuccCommon};
-
- FitWeights(NewWeights);
-
- setBranchWeights(NV, NewWeights[0], NewWeights[1]);
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "INTO: " << *PBI->getParent());
- LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
-
- // This basic block is probably dead. We know it has at least
- // one fewer predecessor.
- return true;
-}
-
-// Simplifies a terminator by replacing it with a branch to TrueBB if Cond is
-// true or to FalseBB if Cond is false.
-// Takes care of updating the successors and removing the old terminator.
-// Also makes sure not to introduce new successors by assuming that edges to
-// non-successor TrueBBs and FalseBBs aren't reachable.
-bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
- Value *Cond, BasicBlock *TrueBB,
- BasicBlock *FalseBB,
- uint32_t TrueWeight,
- uint32_t FalseWeight) {
+ // Update branch weight for PBI.
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ uint64_t PredCommon, PredOther, SuccCommon, SuccOther;
+ bool HasWeights =
+ extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
+ SuccTrueWeight, SuccFalseWeight);
+ if (HasWeights) {
+ PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to CommonDest should be PredCommon * SuccTotal +
+ // PredOther * SuccCommon.
+ // The weight to OtherDest should be PredOther * SuccOther.
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) +
+ PredOther * SuccCommon,
+ PredOther * SuccOther};
+    // Halve the weights if any of them cannot fit in a uint32_t
+ FitWeights(NewWeights);
+
+ setBranchWeights(PBI, NewWeights[0], NewWeights[1]);
+ }
+
+  // OtherDest may have phi nodes. If so, add entries from PBI's
+ // block that are identical to the entries for BI's block.
+ AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
+
+ // We know that the CommonDest already had an edge from PBI to
+ // it. If it has PHIs though, the PHIs may have different
+ // entries for BB and PBI's BB. If so, insert a select to make
+ // them agree.
+ for (PHINode &PN : CommonDest->phis()) {
+ Value *BIV = PN.getIncomingValueForBlock(BB);
+ unsigned PBBIdx = PN.getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN.getIncomingValue(PBBIdx);
+ if (BIV != PBIV) {
+ // Insert a select in PBI to pick the right value.
+ SelectInst *NV = cast<SelectInst>(
+ Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
+ PN.setIncomingValue(PBBIdx, NV);
+ // Although the select has the same condition as PBI, the original branch
+ // weights for PBI do not apply to the new select because the select's
+ // 'logical' edges are incoming edges of the phi that is eliminated, not
+ // the outgoing edges of PBI.
+ if (HasWeights) {
+ uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to PredCommonDest should be PredCommon * SuccTotal.
+ // The weight to PredOtherDest should be PredOther * SuccCommon.
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
+ PredOther * SuccCommon};
+
+ FitWeights(NewWeights);
+
+ setBranchWeights(NV, NewWeights[0], NewWeights[1]);
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "INTO: " << *PBI->getParent());
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // This basic block is probably dead. We know it has at least
+ // one fewer predecessor.
+ return true;
+}
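Putting the pieces together, here is a minimal (hypothetical) example of the fold and of the weight formulas in the comments above: %bb contains only the second conditional branch, the two branches share %common, and both carry branch_weights. Names and numbers are invented for illustration.

  define void @fold(i1 %p, i1 %q) {
  entry:
    br i1 %p, label %common, label %bb, !prof !0
  bb:                                     ; only a conditional branch, no phis
    br i1 %q, label %common, label %other, !prof !1
  common:
    ret void
  other:
    ret void
  }
  !0 = !{!"branch_weights", i32 3, i32 1}   ; PredCommon = 3, PredOther = 1
  !1 = !{!"branch_weights", i32 5, i32 7}   ; SuccCommon = 5, SuccOther = 7

After the fold, %entry branches on (%p | %q) straight to %common/%other; the new weights are PredCommon * (SuccCommon + SuccOther) + PredOther * SuccCommon = 3*12 + 1*5 = 41 toward %common and PredOther * SuccOther = 1*7 = 7 toward %other.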
+
+// Simplifies a terminator by replacing it with a branch to TrueBB if Cond is
+// true or to FalseBB if Cond is false.
+// Takes care of updating the successors and removing the old terminator.
+// Also makes sure not to introduce new successors by assuming that edges to
+// non-successor TrueBBs and FalseBBs aren't reachable.
+bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
+ Value *Cond, BasicBlock *TrueBB,
+ BasicBlock *FalseBB,
+ uint32_t TrueWeight,
+ uint32_t FalseWeight) {
auto *BB = OldTerm->getParent();
- // Remove any superfluous successor edges from the CFG.
- // First, figure out which successors to preserve.
- // If TrueBB and FalseBB are equal, only try to preserve one copy of that
- // successor.
- BasicBlock *KeepEdge1 = TrueBB;
- BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;
-
+ // Remove any superfluous successor edges from the CFG.
+ // First, figure out which successors to preserve.
+ // If TrueBB and FalseBB are equal, only try to preserve one copy of that
+ // successor.
+ BasicBlock *KeepEdge1 = TrueBB;
+ BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;
+
SmallSetVector<BasicBlock *, 2> RemovedSuccessors;
- // Then remove the rest.
- for (BasicBlock *Succ : successors(OldTerm)) {
- // Make sure only to keep exactly one copy of each edge.
- if (Succ == KeepEdge1)
- KeepEdge1 = nullptr;
- else if (Succ == KeepEdge2)
- KeepEdge2 = nullptr;
+ // Then remove the rest.
+ for (BasicBlock *Succ : successors(OldTerm)) {
+ // Make sure only to keep exactly one copy of each edge.
+ if (Succ == KeepEdge1)
+ KeepEdge1 = nullptr;
+ else if (Succ == KeepEdge2)
+ KeepEdge2 = nullptr;
else {
Succ->removePredecessor(BB,
- /*KeepOneInputPHIs=*/true);
+ /*KeepOneInputPHIs=*/true);
if (Succ != TrueBB && Succ != FalseBB)
RemovedSuccessors.insert(Succ);
}
- }
-
- IRBuilder<> Builder(OldTerm);
- Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
-
- // Insert an appropriate new terminator.
- if (!KeepEdge1 && !KeepEdge2) {
+ }
+
+ IRBuilder<> Builder(OldTerm);
+ Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
+
+ // Insert an appropriate new terminator.
+ if (!KeepEdge1 && !KeepEdge2) {
if (TrueBB == FalseBB) {
- // We were only looking for one successor, and it was present.
- // Create an unconditional branch to it.
- Builder.CreateBr(TrueBB);
+ // We were only looking for one successor, and it was present.
+ // Create an unconditional branch to it.
+ Builder.CreateBr(TrueBB);
} else {
- // We found both of the successors we were looking for.
- // Create a conditional branch sharing the condition of the select.
- BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
- if (TrueWeight != FalseWeight)
- setBranchWeights(NewBI, TrueWeight, FalseWeight);
- }
- } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
-    // Neither of the selected blocks was a successor, so this
- // terminator must be unreachable.
- new UnreachableInst(OldTerm->getContext(), OldTerm);
- } else {
- // One of the selected values was a successor, but the other wasn't.
- // Insert an unconditional branch to the one that was found;
- // the edge to the one that wasn't must be unreachable.
+ // We found both of the successors we were looking for.
+ // Create a conditional branch sharing the condition of the select.
+ BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ if (TrueWeight != FalseWeight)
+ setBranchWeights(NewBI, TrueWeight, FalseWeight);
+ }
+ } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
+    // Neither of the selected blocks was a successor, so this
+ // terminator must be unreachable.
+ new UnreachableInst(OldTerm->getContext(), OldTerm);
+ } else {
+ // One of the selected values was a successor, but the other wasn't.
+ // Insert an unconditional branch to the one that was found;
+ // the edge to the one that wasn't must be unreachable.
if (!KeepEdge1) {
- // Only TrueBB was found.
- Builder.CreateBr(TrueBB);
+ // Only TrueBB was found.
+ Builder.CreateBr(TrueBB);
} else {
- // Only FalseBB was found.
- Builder.CreateBr(FalseBB);
+ // Only FalseBB was found.
+ Builder.CreateBr(FalseBB);
}
- }
-
- EraseTerminatorAndDCECond(OldTerm);
+ }
+
+ EraseTerminatorAndDCECond(OldTerm);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
@@ -3775,326 +3775,326 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
DTU->applyUpdates(Updates);
}
- return true;
-}
-
-// Replaces
-// (switch (select cond, X, Y)) on constant X, Y
-// with a branch - conditional if X and Y lead to distinct BBs,
-// unconditional otherwise.
-bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI,
- SelectInst *Select) {
- // Check for constant integer values in the select.
- ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue());
- ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue());
- if (!TrueVal || !FalseVal)
- return false;
-
- // Find the relevant condition and destinations.
- Value *Condition = Select->getCondition();
- BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
- BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
-
- // Get weight for TrueBB and FalseBB.
- uint32_t TrueWeight = 0, FalseWeight = 0;
- SmallVector<uint64_t, 8> Weights;
- bool HasWeights = HasBranchWeights(SI);
- if (HasWeights) {
- GetBranchWeights(SI, Weights);
- if (Weights.size() == 1 + SI->getNumCases()) {
- TrueWeight =
- (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
- FalseWeight =
- (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
- }
- }
-
- // Perform the actual simplification.
- return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, TrueWeight,
- FalseWeight);
-}
-
-// Replaces
-// (indirectbr (select cond, blockaddress(@fn, BlockA),
-// blockaddress(@fn, BlockB)))
-// with
-// (br cond, BlockA, BlockB).
-bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI,
- SelectInst *SI) {
- // Check that both operands of the select are block addresses.
- BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
- BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
- if (!TBA || !FBA)
- return false;
-
- // Extract the actual blocks.
- BasicBlock *TrueBB = TBA->getBasicBlock();
- BasicBlock *FalseBB = FBA->getBasicBlock();
-
- // Perform the actual simplification.
- return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
- 0);
-}
-
-/// This is called when we find an icmp instruction
-/// (a seteq/setne with a constant) as the only instruction in a
-/// block that ends with an uncond branch. We are looking for a very specific
-/// pattern that occurs when "A == 1 || A == 2 || A == 3" gets simplified. In
-/// this case, we merge the first two "or's of icmp" into a switch, but then the
-/// default value goes to an uncond block with a seteq in it, and we get something
-/// like:
-///
-/// switch i8 %A, label %DEFAULT [ i8 1, label %end i8 2, label %end ]
-/// DEFAULT:
-/// %tmp = icmp eq i8 %A, 92
-/// br label %end
-/// end:
-/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
-///
-/// We prefer to split the edge to 'end' so that there is a true/false entry to
-/// the PHI, merging the third icmp into the switch.
-bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
- ICmpInst *ICI, IRBuilder<> &Builder) {
- BasicBlock *BB = ICI->getParent();
-
- // If the block has any PHIs in it or the icmp has multiple uses, it is too
- // complex.
- if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse())
- return false;
-
- Value *V = ICI->getOperand(0);
- ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
-
- // The pattern we're looking for is where our only predecessor is a switch on
- // 'V' and this block is the default case for the switch. In this case we can
- // fold the compared value into the switch to simplify things.
- BasicBlock *Pred = BB->getSinglePredecessor();
- if (!Pred || !isa<SwitchInst>(Pred->getTerminator()))
- return false;
-
- SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
- if (SI->getCondition() != V)
- return false;
-
- // If BB is reachable on a non-default case, then we simply know the value of
- // V in this block. Substitute it and constant fold the icmp instruction
- // away.
- if (SI->getDefaultDest() != BB) {
- ConstantInt *VVal = SI->findCaseDest(BB);
- assert(VVal && "Should have a unique destination value");
- ICI->setOperand(0, VVal);
-
- if (Value *V = SimplifyInstruction(ICI, {DL, ICI})) {
- ICI->replaceAllUsesWith(V);
- ICI->eraseFromParent();
- }
- // BB is now empty, so it is likely to simplify away.
- return requestResimplify();
- }
-
- // Ok, the block is reachable from the default dest. If the constant we're
- // comparing exists in one of the other edges, then we can constant fold ICI
- // and zap it.
- if (SI->findCaseValue(Cst) != SI->case_default()) {
- Value *V;
- if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
- V = ConstantInt::getFalse(BB->getContext());
- else
- V = ConstantInt::getTrue(BB->getContext());
-
- ICI->replaceAllUsesWith(V);
- ICI->eraseFromParent();
- // BB is now empty, so it is likely to simplify away.
- return requestResimplify();
- }
-
- // The use of the icmp has to be in the 'end' block, by the only PHI node in
- // the block.
- BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
- PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back());
- if (PHIUse == nullptr || PHIUse != &SuccBlock->front() ||
- isa<PHINode>(++BasicBlock::iterator(PHIUse)))
- return false;
-
- // If the icmp is a SETEQ, then the default dest gets false, the new edge gets
- // true in the PHI.
- Constant *DefaultCst = ConstantInt::getTrue(BB->getContext());
- Constant *NewCst = ConstantInt::getFalse(BB->getContext());
-
- if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
- std::swap(DefaultCst, NewCst);
-
- // Replace ICI (which is used by the PHI for the default value) with true or
- // false depending on if it is EQ or NE.
- ICI->replaceAllUsesWith(DefaultCst);
- ICI->eraseFromParent();
-
+ return true;
+}
+
+// Replaces
+// (switch (select cond, X, Y)) on constant X, Y
+// with a branch - conditional if X and Y lead to distinct BBs,
+// unconditional otherwise.
+bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI,
+ SelectInst *Select) {
+ // Check for constant integer values in the select.
+ ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue());
+ ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue());
+ if (!TrueVal || !FalseVal)
+ return false;
+
+ // Find the relevant condition and destinations.
+ Value *Condition = Select->getCondition();
+ BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
+ BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
+
+ // Get weight for TrueBB and FalseBB.
+ uint32_t TrueWeight = 0, FalseWeight = 0;
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ TrueWeight =
+ (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
+ FalseWeight =
+ (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
+ }
+ }
+
+ // Perform the actual simplification.
+ return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, TrueWeight,
+ FalseWeight);
+}
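For example (hypothetical IR, invented names), a switch whose operand is a select of two constants can only reach the two corresponding cases, so it can be rewritten as a conditional branch:

  define i32 @switch_on_select(i1 %c) {
  entry:
    %x = select i1 %c, i32 1, i32 4
    switch i32 %x, label %default [
      i32 1, label %one
      i32 4, label %four
    ]
  one:
    ret i32 10
  four:
    ret i32 40
  default:
    ret i32 0
  }

The switch becomes 'br i1 %c, label %one, label %four'; %default loses this predecessor, and any branch weights attached to the switch are carried over to the new branch per the code above.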
+
+// Replaces
+// (indirectbr (select cond, blockaddress(@fn, BlockA),
+// blockaddress(@fn, BlockB)))
+// with
+// (br cond, BlockA, BlockB).
+bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI,
+ SelectInst *SI) {
+ // Check that both operands of the select are block addresses.
+ BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
+ BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
+ if (!TBA || !FBA)
+ return false;
+
+ // Extract the actual blocks.
+ BasicBlock *TrueBB = TBA->getBasicBlock();
+ BasicBlock *FalseBB = FBA->getBasicBlock();
+
+ // Perform the actual simplification.
+ return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
+ 0);
+}
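Likewise for indirectbr (hypothetical IR, invented names): when the target is a select of two block addresses, the indirect branch is a conditional branch in disguise.

  define void @ibr_on_select(i1 %c) {
  entry:
    %t = select i1 %c, i8* blockaddress(@ibr_on_select, %a),
                       i8* blockaddress(@ibr_on_select, %b)
    indirectbr i8* %t, [label %a, label %b]
  a:
    ret void
  b:
    ret void
  }

This becomes 'br i1 %c, label %a, label %b', with no branch weights attached since none are available on the indirectbr.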
+
+/// This is called when we find an icmp instruction
+/// (a seteq/setne with a constant) as the only instruction in a
+/// block that ends with an uncond branch. We are looking for a very specific
+/// pattern that occurs when "A == 1 || A == 2 || A == 3" gets simplified. In
+/// this case, we merge the first two "or's of icmp" into a switch, but then the
+/// default value goes to an uncond block with a seteq in it, and we get something
+/// like:
+///
+/// switch i8 %A, label %DEFAULT [ i8 1, label %end i8 2, label %end ]
+/// DEFAULT:
+/// %tmp = icmp eq i8 %A, 92
+/// br label %end
+/// end:
+/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
+///
+/// We prefer to split the edge to 'end' so that there is a true/false entry to
+/// the PHI, merging the third icmp into the switch.
+bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
+ ICmpInst *ICI, IRBuilder<> &Builder) {
+ BasicBlock *BB = ICI->getParent();
+
+ // If the block has any PHIs in it or the icmp has multiple uses, it is too
+ // complex.
+ if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse())
+ return false;
+
+ Value *V = ICI->getOperand(0);
+ ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
+
+ // The pattern we're looking for is where our only predecessor is a switch on
+ // 'V' and this block is the default case for the switch. In this case we can
+ // fold the compared value into the switch to simplify things.
+ BasicBlock *Pred = BB->getSinglePredecessor();
+ if (!Pred || !isa<SwitchInst>(Pred->getTerminator()))
+ return false;
+
+ SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
+ if (SI->getCondition() != V)
+ return false;
+
+ // If BB is reachable on a non-default case, then we simply know the value of
+ // V in this block. Substitute it and constant fold the icmp instruction
+ // away.
+ if (SI->getDefaultDest() != BB) {
+ ConstantInt *VVal = SI->findCaseDest(BB);
+ assert(VVal && "Should have a unique destination value");
+ ICI->setOperand(0, VVal);
+
+ if (Value *V = SimplifyInstruction(ICI, {DL, ICI})) {
+ ICI->replaceAllUsesWith(V);
+ ICI->eraseFromParent();
+ }
+ // BB is now empty, so it is likely to simplify away.
+ return requestResimplify();
+ }
+
+ // Ok, the block is reachable from the default dest. If the constant we're
+ // comparing exists in one of the other edges, then we can constant fold ICI
+ // and zap it.
+ if (SI->findCaseValue(Cst) != SI->case_default()) {
+ Value *V;
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ V = ConstantInt::getFalse(BB->getContext());
+ else
+ V = ConstantInt::getTrue(BB->getContext());
+
+ ICI->replaceAllUsesWith(V);
+ ICI->eraseFromParent();
+ // BB is now empty, so it is likely to simplify away.
+ return requestResimplify();
+ }
+
+ // The use of the icmp has to be in the 'end' block, by the only PHI node in
+ // the block.
+ BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
+ PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back());
+ if (PHIUse == nullptr || PHIUse != &SuccBlock->front() ||
+ isa<PHINode>(++BasicBlock::iterator(PHIUse)))
+ return false;
+
+ // If the icmp is a SETEQ, then the default dest gets false, the new edge gets
+ // true in the PHI.
+ Constant *DefaultCst = ConstantInt::getTrue(BB->getContext());
+ Constant *NewCst = ConstantInt::getFalse(BB->getContext());
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(DefaultCst, NewCst);
+
+ // Replace ICI (which is used by the PHI for the default value) with true or
+ // false depending on if it is EQ or NE.
+ ICI->replaceAllUsesWith(DefaultCst);
+ ICI->eraseFromParent();
+
SmallVector<DominatorTree::UpdateType, 2> Updates;
- // Okay, the switch goes to this block on a default value. Add an edge from
- // the switch to the merge point on the compared value.
- BasicBlock *NewBB =
- BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB);
- {
- SwitchInstProfUpdateWrapper SIW(*SI);
- auto W0 = SIW.getSuccessorWeight(0);
- SwitchInstProfUpdateWrapper::CaseWeightOpt NewW;
- if (W0) {
- NewW = ((uint64_t(*W0) + 1) >> 1);
- SIW.setSuccessorWeight(0, *NewW);
- }
- SIW.addCase(Cst, NewBB, NewW);
+ // Okay, the switch goes to this block on a default value. Add an edge from
+ // the switch to the merge point on the compared value.
+ BasicBlock *NewBB =
+ BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB);
+ {
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ auto W0 = SIW.getSuccessorWeight(0);
+ SwitchInstProfUpdateWrapper::CaseWeightOpt NewW;
+ if (W0) {
+ NewW = ((uint64_t(*W0) + 1) >> 1);
+ SIW.setSuccessorWeight(0, *NewW);
+ }
+ SIW.addCase(Cst, NewBB, NewW);
Updates.push_back({DominatorTree::Insert, Pred, NewBB});
- }
-
- // NewBB branches to the phi block, add the uncond branch and the phi entry.
- Builder.SetInsertPoint(NewBB);
- Builder.SetCurrentDebugLocation(SI->getDebugLoc());
- Builder.CreateBr(SuccBlock);
+ }
+
+ // NewBB branches to the phi block, add the uncond branch and the phi entry.
+ Builder.SetInsertPoint(NewBB);
+ Builder.SetCurrentDebugLocation(SI->getDebugLoc());
+ Builder.CreateBr(SuccBlock);
Updates.push_back({DominatorTree::Insert, NewBB, SuccBlock});
- PHIUse->addIncoming(NewCst, NewBB);
+ PHIUse->addIncoming(NewCst, NewBB);
if (DTU)
DTU->applyUpdates(Updates);
- return true;
-}
-
-/// The specified branch is a conditional branch.
-/// Check to see if it is branching on an or/and chain of icmp instructions, and
-/// fold it into a switch instruction if so.
-bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI,
- IRBuilder<> &Builder,
- const DataLayout &DL) {
- Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
- if (!Cond)
- return false;
-
- // Change br (X == 0 | X == 1), T, F into a switch instruction.
- // If this is a bunch of seteq's or'd together, or if it's a bunch of
- // 'setne's and'ed together, collect them.
-
- // Try to gather values from a chain of and/or to be turned into a switch
- ConstantComparesGatherer ConstantCompare(Cond, DL);
- // Unpack the result
- SmallVectorImpl<ConstantInt *> &Values = ConstantCompare.Vals;
- Value *CompVal = ConstantCompare.CompValue;
- unsigned UsedICmps = ConstantCompare.UsedICmps;
- Value *ExtraCase = ConstantCompare.Extra;
-
- // If we didn't have a multiply compared value, fail.
- if (!CompVal)
- return false;
-
- // Avoid turning single icmps into a switch.
- if (UsedICmps <= 1)
- return false;
-
+ return true;
+}
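Continuing the sketch from the comment above this function (same pseudo-IR style, not a complete module): after the transform, the switch gains a case for 92 that feeds the PHI directly, the icmp disappears, and DEFAULT becomes empty.

  switch i8 %A, label %DEFAULT [
    i8 1, label %end
    i8 2, label %end
    i8 92, label %switch.edge
  ]
  DEFAULT:                       ; now empty, likely simplified away later
    br label %end
  switch.edge:
    br label %end
  end:
    ... = phi i1 [ true, %entry ], [ false, %DEFAULT ],
                 [ true, %entry ], [ true, %switch.edge ]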
+
+/// The specified branch is a conditional branch.
+/// Check to see if it is branching on an or/and chain of icmp instructions, and
+/// fold it into a switch instruction if so.
+bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI,
+ IRBuilder<> &Builder,
+ const DataLayout &DL) {
+ Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ // Change br (X == 0 | X == 1), T, F into a switch instruction.
+ // If this is a bunch of seteq's or'd together, or if it's a bunch of
+ // 'setne's and'ed together, collect them.
+
+ // Try to gather values from a chain of and/or to be turned into a switch
+ ConstantComparesGatherer ConstantCompare(Cond, DL);
+ // Unpack the result
+ SmallVectorImpl<ConstantInt *> &Values = ConstantCompare.Vals;
+ Value *CompVal = ConstantCompare.CompValue;
+ unsigned UsedICmps = ConstantCompare.UsedICmps;
+ Value *ExtraCase = ConstantCompare.Extra;
+
+ // If we didn't have a multiply compared value, fail.
+ if (!CompVal)
+ return false;
+
+ // Avoid turning single icmps into a switch.
+ if (UsedICmps <= 1)
+ return false;
+
bool TrueWhenEqual = match(Cond, m_LogicalOr(m_Value(), m_Value()));
-
- // There might be duplicate constants in the list, which the switch
-  // instruction can't handle; remove them now.
- array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
- Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
-
- // If Extra was used, we require at least two switch values to do the
- // transformation. A switch with one value is just a conditional branch.
- if (ExtraCase && Values.size() < 2)
- return false;
-
- // TODO: Preserve branch weight metadata, similarly to how
- // FoldValueComparisonIntoPredecessors preserves it.
-
- // Figure out which block is which destination.
- BasicBlock *DefaultBB = BI->getSuccessor(1);
- BasicBlock *EdgeBB = BI->getSuccessor(0);
- if (!TrueWhenEqual)
- std::swap(DefaultBB, EdgeBB);
-
- BasicBlock *BB = BI->getParent();
-
- // MSAN does not like undefs as branch condition which can be introduced
- // with "explicit branch".
- if (ExtraCase && BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
- << " cases into SWITCH. BB is:\n"
- << *BB);
-
+
+ // There might be duplicate constants in the list, which the switch
+  // instruction can't handle; remove them now.
+ array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
+ Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
+
+ // If Extra was used, we require at least two switch values to do the
+ // transformation. A switch with one value is just a conditional branch.
+ if (ExtraCase && Values.size() < 2)
+ return false;
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
+ // Figure out which block is which destination.
+ BasicBlock *DefaultBB = BI->getSuccessor(1);
+ BasicBlock *EdgeBB = BI->getSuccessor(0);
+ if (!TrueWhenEqual)
+ std::swap(DefaultBB, EdgeBB);
+
+ BasicBlock *BB = BI->getParent();
+
+ // MSAN does not like undefs as branch condition which can be introduced
+ // with "explicit branch".
+ if (ExtraCase && BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
+ << " cases into SWITCH. BB is:\n"
+ << *BB);
+
SmallVector<DominatorTree::UpdateType, 2> Updates;
- // If there are any extra values that couldn't be folded into the switch
- // then we evaluate them with an explicit branch first. Split the block
- // right before the condbr to handle it.
- if (ExtraCase) {
+ // If there are any extra values that couldn't be folded into the switch
+ // then we evaluate them with an explicit branch first. Split the block
+ // right before the condbr to handle it.
+ if (ExtraCase) {
BasicBlock *NewBB = SplitBlock(BB, BI, DTU, /*LI=*/nullptr,
/*MSSAU=*/nullptr, "switch.early.test");
- // Remove the uncond branch added to the old block.
- Instruction *OldTI = BB->getTerminator();
- Builder.SetInsertPoint(OldTI);
-
- if (TrueWhenEqual)
- Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
- else
- Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
-
- OldTI->eraseFromParent();
-
+ // Remove the uncond branch added to the old block.
+ Instruction *OldTI = BB->getTerminator();
+ Builder.SetInsertPoint(OldTI);
+
+ if (TrueWhenEqual)
+ Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
+ else
+ Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+
+ OldTI->eraseFromParent();
+
Updates.push_back({DominatorTree::Insert, BB, EdgeBB});
- // If there are PHI nodes in EdgeBB, then we need to add a new entry to them
- // for the edge we just added.
- AddPredecessorToBlock(EdgeBB, BB, NewBB);
-
- LLVM_DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
- << "\nEXTRABB = " << *BB);
- BB = NewBB;
- }
-
- Builder.SetInsertPoint(BI);
- // Convert pointer to int before we switch.
- if (CompVal->getType()->isPointerTy()) {
- CompVal = Builder.CreatePtrToInt(
- CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
- }
-
- // Create the new switch instruction now.
- SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
-
- // Add all of the 'cases' to the switch instruction.
- for (unsigned i = 0, e = Values.size(); i != e; ++i)
- New->addCase(Values[i], EdgeBB);
-
- // We added edges from PI to the EdgeBB. As such, if there were any
- // PHI nodes in EdgeBB, they need entries to be added corresponding to
- // the number of edges added.
- for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
- PHINode *PN = cast<PHINode>(BBI);
- Value *InVal = PN->getIncomingValueForBlock(BB);
- for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
- PN->addIncoming(InVal, BB);
- }
-
- // Erase the old branch instruction.
- EraseTerminatorAndDCECond(BI);
+ // If there are PHI nodes in EdgeBB, then we need to add a new entry to them
+ // for the edge we just added.
+ AddPredecessorToBlock(EdgeBB, BB, NewBB);
+
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
+ << "\nEXTRABB = " << *BB);
+ BB = NewBB;
+ }
+
+ Builder.SetInsertPoint(BI);
+ // Convert pointer to int before we switch.
+ if (CompVal->getType()->isPointerTy()) {
+ CompVal = Builder.CreatePtrToInt(
+ CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
+ }
+
+ // Create the new switch instruction now.
+ SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+
+ // Add all of the 'cases' to the switch instruction.
+ for (unsigned i = 0, e = Values.size(); i != e; ++i)
+ New->addCase(Values[i], EdgeBB);
+
+ // We added edges from PI to the EdgeBB. As such, if there were any
+ // PHI nodes in EdgeBB, they need entries to be added corresponding to
+ // the number of edges added.
+ for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ Value *InVal = PN->getIncomingValueForBlock(BB);
+ for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
+ PN->addIncoming(InVal, BB);
+ }
+
+ // Erase the old branch instruction.
+ EraseTerminatorAndDCECond(BI);
if (DTU)
DTU->applyUpdates(Updates);
-
- LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
- return true;
-}
-
-bool SimplifyCFGOpt::simplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
- if (isa<PHINode>(RI->getValue()))
- return simplifyCommonResume(RI);
- else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) &&
- RI->getValue() == RI->getParent()->getFirstNonPHI())
- // The resume must unwind the exception that caused control to branch here.
- return simplifySingleResume(RI);
-
- return false;
-}
-
+
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
+ return true;
+}
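A minimal (hypothetical) before/after for this routine, with all compares against the same value and no leftover "extra" case; names are invented for illustration.

  define void @icmp_chain(i32 %x) {
  entry:
    %c1 = icmp eq i32 %x, 0
    %c2 = icmp eq i32 %x, 1
    %or1 = or i1 %c1, %c2
    %c3 = icmp eq i32 %x, 7
    %cond = or i1 %or1, %c3
    br i1 %cond, label %then, label %else
  then:
    ret void
  else:
    ret void
  }

The branch becomes 'switch i32 %x, label %else [ i32 0, label %then  i32 1, label %then  i32 7, label %then ]'. Had there been one extra non-constant compare, the block would first be split and that compare tested with an explicit branch, as the code above does.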
+
+bool SimplifyCFGOpt::simplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
+ if (isa<PHINode>(RI->getValue()))
+ return simplifyCommonResume(RI);
+ else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) &&
+ RI->getValue() == RI->getParent()->getFirstNonPHI())
+ // The resume must unwind the exception that caused control to branch here.
+ return simplifySingleResume(RI);
+
+ return false;
+}
+
// Check if cleanup block is empty
static bool isCleanupBlockEmpty(iterator_range<BasicBlock::iterator> R) {
for (Instruction &I : R) {
@@ -4116,234 +4116,234 @@ static bool isCleanupBlockEmpty(iterator_range<BasicBlock::iterator> R) {
return true;
}
-// Simplify resume that is shared by several landing pads (phi of landing pad).
-bool SimplifyCFGOpt::simplifyCommonResume(ResumeInst *RI) {
- BasicBlock *BB = RI->getParent();
-
+// Simplify resume that is shared by several landing pads (phi of landing pad).
+bool SimplifyCFGOpt::simplifyCommonResume(ResumeInst *RI) {
+ BasicBlock *BB = RI->getParent();
+
// Check that there are no other instructions except for debug and lifetime
// intrinsics between the phi's and resume instruction.
if (!isCleanupBlockEmpty(
make_range(RI->getParent()->getFirstNonPHI(), BB->getTerminator())))
return false;
-
- SmallSetVector<BasicBlock *, 4> TrivialUnwindBlocks;
- auto *PhiLPInst = cast<PHINode>(RI->getValue());
-
- // Check incoming blocks to see if any of them are trivial.
- for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); Idx != End;
- Idx++) {
- auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx);
- auto *IncomingValue = PhiLPInst->getIncomingValue(Idx);
-
- // If the block has other successors, we can not delete it because
- // it has other dependents.
- if (IncomingBB->getUniqueSuccessor() != BB)
- continue;
-
- auto *LandingPad = dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI());
- // Not the landing pad that caused the control to branch here.
- if (IncomingValue != LandingPad)
- continue;
-
+
+ SmallSetVector<BasicBlock *, 4> TrivialUnwindBlocks;
+ auto *PhiLPInst = cast<PHINode>(RI->getValue());
+
+ // Check incoming blocks to see if any of them are trivial.
+ for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); Idx != End;
+ Idx++) {
+ auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx);
+ auto *IncomingValue = PhiLPInst->getIncomingValue(Idx);
+
+ // If the block has other successors, we can not delete it because
+ // it has other dependents.
+ if (IncomingBB->getUniqueSuccessor() != BB)
+ continue;
+
+ auto *LandingPad = dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI());
+ // Not the landing pad that caused the control to branch here.
+ if (IncomingValue != LandingPad)
+ continue;
+
if (isCleanupBlockEmpty(
make_range(LandingPad->getNextNode(), IncomingBB->getTerminator())))
- TrivialUnwindBlocks.insert(IncomingBB);
- }
-
- // If no trivial unwind blocks, don't do any simplifications.
- if (TrivialUnwindBlocks.empty())
- return false;
-
- // Turn all invokes that unwind here into calls.
- for (auto *TrivialBB : TrivialUnwindBlocks) {
- // Blocks that will be simplified should be removed from the phi node.
- // Note there could be multiple edges to the resume block, and we need
- // to remove them all.
- while (PhiLPInst->getBasicBlockIndex(TrivialBB) != -1)
- BB->removePredecessor(TrivialBB, true);
-
- for (pred_iterator PI = pred_begin(TrivialBB), PE = pred_end(TrivialBB);
- PI != PE;) {
- BasicBlock *Pred = *PI++;
+ TrivialUnwindBlocks.insert(IncomingBB);
+ }
+
+ // If no trivial unwind blocks, don't do any simplifications.
+ if (TrivialUnwindBlocks.empty())
+ return false;
+
+ // Turn all invokes that unwind here into calls.
+ for (auto *TrivialBB : TrivialUnwindBlocks) {
+ // Blocks that will be simplified should be removed from the phi node.
+ // Note there could be multiple edges to the resume block, and we need
+ // to remove them all.
+ while (PhiLPInst->getBasicBlockIndex(TrivialBB) != -1)
+ BB->removePredecessor(TrivialBB, true);
+
+ for (pred_iterator PI = pred_begin(TrivialBB), PE = pred_end(TrivialBB);
+ PI != PE;) {
+ BasicBlock *Pred = *PI++;
removeUnwindEdge(Pred, DTU);
++NumInvokes;
- }
-
- // In each SimplifyCFG run, only the current processed block can be erased.
- // Otherwise, it will break the iteration of SimplifyCFG pass. So instead
- // of erasing TrivialBB, we only remove the branch to the common resume
- // block so that we can later erase the resume block since it has no
- // predecessors.
- TrivialBB->getTerminator()->eraseFromParent();
- new UnreachableInst(RI->getContext(), TrivialBB);
+ }
+
+ // In each SimplifyCFG run, only the current processed block can be erased.
+ // Otherwise, it will break the iteration of SimplifyCFG pass. So instead
+ // of erasing TrivialBB, we only remove the branch to the common resume
+ // block so that we can later erase the resume block since it has no
+ // predecessors.
+ TrivialBB->getTerminator()->eraseFromParent();
+ new UnreachableInst(RI->getContext(), TrivialBB);
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, TrivialBB, BB}});
- }
-
- // Delete the resume block if all its predecessors have been removed.
+ }
+
+ // Delete the resume block if all its predecessors have been removed.
if (pred_empty(BB)) {
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
}
-
- return !TrivialUnwindBlocks.empty();
-}
-
-// Simplify resume that is only used by a single (non-phi) landing pad.
-bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) {
- BasicBlock *BB = RI->getParent();
- auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI());
- assert(RI->getValue() == LPInst &&
- "Resume must unwind the exception that caused control to here");
-
- // Check that there are no other instructions except for debug intrinsics.
+
+ return !TrivialUnwindBlocks.empty();
+}
+
+// Simplify resume that is only used by a single (non-phi) landing pad.
+bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) {
+ BasicBlock *BB = RI->getParent();
+ auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI());
+ assert(RI->getValue() == LPInst &&
+ "Resume must unwind the exception that caused control to here");
+
+ // Check that there are no other instructions except for debug intrinsics.
if (!isCleanupBlockEmpty(
make_range<Instruction *>(LPInst->getNextNode(), RI)))
- return false;
-
- // Turn all invokes that unwind here into calls and delete the basic block.
- for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
- BasicBlock *Pred = *PI++;
+ return false;
+
+ // Turn all invokes that unwind here into calls and delete the basic block.
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+ BasicBlock *Pred = *PI++;
removeUnwindEdge(Pred, DTU);
++NumInvokes;
- }
-
- // The landingpad is now unreachable. Zap it.
+ }
+
+ // The landingpad is now unreachable. Zap it.
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
- return true;
-}
-
+ return true;
+}
+
static bool removeEmptyCleanup(CleanupReturnInst *RI, DomTreeUpdater *DTU) {
- // If this is a trivial cleanup pad that executes no instructions, it can be
- // eliminated. If the cleanup pad continues to the caller, any predecessor
- // that is an EH pad will be updated to continue to the caller and any
- // predecessor that terminates with an invoke instruction will have its invoke
- // instruction converted to a call instruction. If the cleanup pad being
- // simplified does not continue to the caller, each predecessor will be
- // updated to continue to the unwind destination of the cleanup pad being
- // simplified.
- BasicBlock *BB = RI->getParent();
- CleanupPadInst *CPInst = RI->getCleanupPad();
- if (CPInst->getParent() != BB)
- // This isn't an empty cleanup.
- return false;
-
- // We cannot kill the pad if it has multiple uses. This typically arises
- // from unreachable basic blocks.
- if (!CPInst->hasOneUse())
- return false;
-
- // Check that there are no other instructions except for benign intrinsics.
+ // If this is a trivial cleanup pad that executes no instructions, it can be
+ // eliminated. If the cleanup pad continues to the caller, any predecessor
+ // that is an EH pad will be updated to continue to the caller and any
+ // predecessor that terminates with an invoke instruction will have its invoke
+ // instruction converted to a call instruction. If the cleanup pad being
+ // simplified does not continue to the caller, each predecessor will be
+ // updated to continue to the unwind destination of the cleanup pad being
+ // simplified.
+ BasicBlock *BB = RI->getParent();
+ CleanupPadInst *CPInst = RI->getCleanupPad();
+ if (CPInst->getParent() != BB)
+ // This isn't an empty cleanup.
+ return false;
+
+ // We cannot kill the pad if it has multiple uses. This typically arises
+ // from unreachable basic blocks.
+ if (!CPInst->hasOneUse())
+ return false;
+
+ // Check that there are no other instructions except for benign intrinsics.
if (!isCleanupBlockEmpty(
make_range<Instruction *>(CPInst->getNextNode(), RI)))
- return false;
-
- // If the cleanup return we are simplifying unwinds to the caller, this will
- // set UnwindDest to nullptr.
- BasicBlock *UnwindDest = RI->getUnwindDest();
- Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr;
-
- // We're about to remove BB from the control flow. Before we do, sink any
- // PHINodes into the unwind destination. Doing this before changing the
- // control flow avoids some potentially slow checks, since we can currently
- // be certain that UnwindDest and BB have no common predecessors (since they
- // are both EH pads).
- if (UnwindDest) {
- // First, go through the PHI nodes in UnwindDest and update any nodes that
- // reference the block we are removing
- for (BasicBlock::iterator I = UnwindDest->begin(),
- IE = DestEHPad->getIterator();
- I != IE; ++I) {
- PHINode *DestPN = cast<PHINode>(I);
-
- int Idx = DestPN->getBasicBlockIndex(BB);
- // Since BB unwinds to UnwindDest, it has to be in the PHI node.
- assert(Idx != -1);
- // This PHI node has an incoming value that corresponds to a control
- // path through the cleanup pad we are removing. If the incoming
- // value is in the cleanup pad, it must be a PHINode (because we
- // verified above that the block is otherwise empty). Otherwise, the
- // value is either a constant or a value that dominates the cleanup
- // pad being removed.
- //
- // Because BB and UnwindDest are both EH pads, all of their
- // predecessors must unwind to these blocks, and since no instruction
- // can have multiple unwind destinations, there will be no overlap in
- // incoming blocks between SrcPN and DestPN.
- Value *SrcVal = DestPN->getIncomingValue(Idx);
- PHINode *SrcPN = dyn_cast<PHINode>(SrcVal);
-
- // Remove the entry for the block we are deleting.
- DestPN->removeIncomingValue(Idx, false);
-
- if (SrcPN && SrcPN->getParent() == BB) {
- // If the incoming value was a PHI node in the cleanup pad we are
- // removing, we need to merge that PHI node's incoming values into
- // DestPN.
- for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues();
- SrcIdx != SrcE; ++SrcIdx) {
- DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx),
- SrcPN->getIncomingBlock(SrcIdx));
- }
- } else {
- // Otherwise, the incoming value came from above BB and
- // so we can just reuse it. We must associate all of BB's
- // predecessors with this value.
- for (auto *pred : predecessors(BB)) {
- DestPN->addIncoming(SrcVal, pred);
- }
- }
- }
-
- // Sink any remaining PHI nodes directly into UnwindDest.
- Instruction *InsertPt = DestEHPad;
- for (BasicBlock::iterator I = BB->begin(),
- IE = BB->getFirstNonPHI()->getIterator();
- I != IE;) {
- // The iterator must be incremented here because the instructions are
- // being moved to another block.
- PHINode *PN = cast<PHINode>(I++);
- if (PN->use_empty() || !PN->isUsedOutsideOfBlock(BB))
- // If the PHI node has no uses or all of its uses are in this basic
- // block (meaning they are debug or lifetime intrinsics), just leave
- // it. It will be erased when we erase BB below.
- continue;
-
- // Otherwise, sink this PHI node into UnwindDest.
- // Any predecessors to UnwindDest which are not already represented
- // must be back edges which inherit the value from the path through
- // BB. In this case, the PHI value must reference itself.
- for (auto *pred : predecessors(UnwindDest))
- if (pred != BB)
- PN->addIncoming(PN, pred);
- PN->moveBefore(InsertPt);
- }
- }
-
+ return false;
+
+ // If the cleanup return we are simplifying unwinds to the caller, this will
+ // set UnwindDest to nullptr.
+ BasicBlock *UnwindDest = RI->getUnwindDest();
+ Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr;
+
+ // We're about to remove BB from the control flow. Before we do, sink any
+ // PHINodes into the unwind destination. Doing this before changing the
+ // control flow avoids some potentially slow checks, since we can currently
+ // be certain that UnwindDest and BB have no common predecessors (since they
+ // are both EH pads).
+ if (UnwindDest) {
+ // First, go through the PHI nodes in UnwindDest and update any nodes that
+ // reference the block we are removing
+ for (BasicBlock::iterator I = UnwindDest->begin(),
+ IE = DestEHPad->getIterator();
+ I != IE; ++I) {
+ PHINode *DestPN = cast<PHINode>(I);
+
+ int Idx = DestPN->getBasicBlockIndex(BB);
+ // Since BB unwinds to UnwindDest, it has to be in the PHI node.
+ assert(Idx != -1);
+ // This PHI node has an incoming value that corresponds to a control
+ // path through the cleanup pad we are removing. If the incoming
+ // value is in the cleanup pad, it must be a PHINode (because we
+ // verified above that the block is otherwise empty). Otherwise, the
+ // value is either a constant or a value that dominates the cleanup
+ // pad being removed.
+ //
+ // Because BB and UnwindDest are both EH pads, all of their
+ // predecessors must unwind to these blocks, and since no instruction
+ // can have multiple unwind destinations, there will be no overlap in
+ // incoming blocks between SrcPN and DestPN.
+ Value *SrcVal = DestPN->getIncomingValue(Idx);
+ PHINode *SrcPN = dyn_cast<PHINode>(SrcVal);
+
+ // Remove the entry for the block we are deleting.
+ DestPN->removeIncomingValue(Idx, false);
+
+ if (SrcPN && SrcPN->getParent() == BB) {
+ // If the incoming value was a PHI node in the cleanup pad we are
+ // removing, we need to merge that PHI node's incoming values into
+ // DestPN.
+ for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues();
+ SrcIdx != SrcE; ++SrcIdx) {
+ DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx),
+ SrcPN->getIncomingBlock(SrcIdx));
+ }
+ } else {
+ // Otherwise, the incoming value came from above BB and
+ // so we can just reuse it. We must associate all of BB's
+ // predecessors with this value.
+ for (auto *pred : predecessors(BB)) {
+ DestPN->addIncoming(SrcVal, pred);
+ }
+ }
+ }
+
+ // Sink any remaining PHI nodes directly into UnwindDest.
+ Instruction *InsertPt = DestEHPad;
+ for (BasicBlock::iterator I = BB->begin(),
+ IE = BB->getFirstNonPHI()->getIterator();
+ I != IE;) {
+ // The iterator must be incremented here because the instructions are
+ // being moved to another block.
+ PHINode *PN = cast<PHINode>(I++);
+ if (PN->use_empty() || !PN->isUsedOutsideOfBlock(BB))
+ // If the PHI node has no uses or all of its uses are in this basic
+ // block (meaning they are debug or lifetime intrinsics), just leave
+ // it. It will be erased when we erase BB below.
+ continue;
+
+ // Otherwise, sink this PHI node into UnwindDest.
+ // Any predecessors to UnwindDest which are not already represented
+ // must be back edges which inherit the value from the path through
+ // BB. In this case, the PHI value must reference itself.
+ for (auto *pred : predecessors(UnwindDest))
+ if (pred != BB)
+ PN->addIncoming(PN, pred);
+ PN->moveBefore(InsertPt);
+ }
+ }
+
std::vector<DominatorTree::UpdateType> Updates;
- for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
- // The iterator must be updated here because we are removing this pred.
- BasicBlock *PredBB = *PI++;
- if (UnwindDest == nullptr) {
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+ // The iterator must be updated here because we are removing this pred.
+ BasicBlock *PredBB = *PI++;
+ if (UnwindDest == nullptr) {
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
removeUnwindEdge(PredBB, DTU);
++NumInvokes;
- } else {
- Instruction *TI = PredBB->getTerminator();
- TI->replaceUsesOfWith(BB, UnwindDest);
+ } else {
+ Instruction *TI = PredBB->getTerminator();
+ TI->replaceUsesOfWith(BB, UnwindDest);
Updates.push_back({DominatorTree::Insert, PredBB, UnwindDest});
Updates.push_back({DominatorTree::Delete, PredBB, BB});
- }
- }
-
+ }
+ }
+
if (DTU) {
DTU->applyUpdates(Updates);
DTU->deleteBB(BB);
@@ -4351,250 +4351,250 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI, DomTreeUpdater *DTU) {
// The cleanup pad is now unreachable. Zap it.
BB->eraseFromParent();
- return true;
-}
-
-// Try to merge two cleanuppads together.
-static bool mergeCleanupPad(CleanupReturnInst *RI) {
- // Skip any cleanuprets which unwind to caller, there is nothing to merge
- // with.
- BasicBlock *UnwindDest = RI->getUnwindDest();
- if (!UnwindDest)
- return false;
-
- // If this cleanupret isn't the only predecessor of this cleanuppad, it
- // wouldn't be safe to merge without code duplication.
- if (UnwindDest->getSinglePredecessor() != RI->getParent())
- return false;
-
- // Verify that our cleanuppad's unwind destination is another cleanuppad.
- auto *SuccessorCleanupPad = dyn_cast<CleanupPadInst>(&UnwindDest->front());
- if (!SuccessorCleanupPad)
- return false;
-
- CleanupPadInst *PredecessorCleanupPad = RI->getCleanupPad();
- // Replace any uses of the successor cleanuppad with the predecessor pad.
- // The only cleanuppad uses should be this cleanupret, its cleanupret and
- // funclet bundle operands.
- SuccessorCleanupPad->replaceAllUsesWith(PredecessorCleanupPad);
- // Remove the old cleanuppad.
- SuccessorCleanupPad->eraseFromParent();
- // Now, we simply replace the cleanupret with a branch to the unwind
- // destination.
- BranchInst::Create(UnwindDest, RI->getParent());
- RI->eraseFromParent();
-
- return true;
-}
-
-bool SimplifyCFGOpt::simplifyCleanupReturn(CleanupReturnInst *RI) {
- // It is possible to transiently have an undef cleanuppad operand because we
- // have deleted some, but not all, dead blocks.
- // Eventually, this block will be deleted.
- if (isa<UndefValue>(RI->getOperand(0)))
- return false;
-
- if (mergeCleanupPad(RI))
- return true;
-
+ return true;
+}
+
+// Try to merge two cleanuppads together.
+static bool mergeCleanupPad(CleanupReturnInst *RI) {
+ // Skip any cleanuprets which unwind to caller, there is nothing to merge
+ // with.
+ BasicBlock *UnwindDest = RI->getUnwindDest();
+ if (!UnwindDest)
+ return false;
+
+ // If this cleanupret isn't the only predecessor of this cleanuppad, it
+ // wouldn't be safe to merge without code duplication.
+ if (UnwindDest->getSinglePredecessor() != RI->getParent())
+ return false;
+
+ // Verify that our cleanuppad's unwind destination is another cleanuppad.
+ auto *SuccessorCleanupPad = dyn_cast<CleanupPadInst>(&UnwindDest->front());
+ if (!SuccessorCleanupPad)
+ return false;
+
+ CleanupPadInst *PredecessorCleanupPad = RI->getCleanupPad();
+ // Replace any uses of the successor cleanuppad with the predecessor pad.
+ // The only cleanuppad uses should be this cleanupret, its cleanupret and
+ // funclet bundle operands.
+ SuccessorCleanupPad->replaceAllUsesWith(PredecessorCleanupPad);
+ // Remove the old cleanuppad.
+ SuccessorCleanupPad->eraseFromParent();
+ // Now, we simply replace the cleanupret with a branch to the unwind
+ // destination.
+ BranchInst::Create(UnwindDest, RI->getParent());
+ RI->eraseFromParent();
+
+ return true;
+}
+
+bool SimplifyCFGOpt::simplifyCleanupReturn(CleanupReturnInst *RI) {
+ // It is possible to transiently have an undef cleanuppad operand because we
+ // have deleted some, but not all, dead blocks.
+ // Eventually, this block will be deleted.
+ if (isa<UndefValue>(RI->getOperand(0)))
+ return false;
+
+ if (mergeCleanupPad(RI))
+ return true;
+
if (removeEmptyCleanup(RI, DTU))
- return true;
-
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
- BasicBlock *BB = RI->getParent();
- if (!BB->getFirstNonPHIOrDbg()->isTerminator())
- return false;
-
- // Find predecessors that end with branches.
- SmallVector<BasicBlock *, 8> UncondBranchPreds;
- SmallVector<BranchInst *, 8> CondBranchPreds;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *P = *PI;
- Instruction *PTI = P->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
- if (BI->isUnconditional())
- UncondBranchPreds.push_back(P);
- else
- CondBranchPreds.push_back(BI);
- }
- }
-
- // If we found some, do the transformation!
- if (!UncondBranchPreds.empty() && DupRet) {
- while (!UncondBranchPreds.empty()) {
- BasicBlock *Pred = UncondBranchPreds.pop_back_val();
- LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ return true;
+
+ return false;
+}
+
+bool SimplifyCFGOpt::simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
+ BasicBlock *BB = RI->getParent();
+ if (!BB->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+
+ // Find predecessors that end with branches.
+ SmallVector<BasicBlock *, 8> UncondBranchPreds;
+ SmallVector<BranchInst *, 8> CondBranchPreds;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *P = *PI;
+ Instruction *PTI = P->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
+ if (BI->isUnconditional())
+ UncondBranchPreds.push_back(P);
+ else
+ CondBranchPreds.push_back(BI);
+ }
+ }
+
+ // If we found some, do the transformation!
+ if (!UncondBranchPreds.empty() && DupRet) {
+ while (!UncondBranchPreds.empty()) {
+ BasicBlock *Pred = UncondBranchPreds.pop_back_val();
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
(void)FoldReturnIntoUncondBranch(RI, BB, Pred, DTU);
- }
-
- // If we eliminated all predecessors of the block, delete the block now.
- if (pred_empty(BB)) {
- // We know there are no successors, so just nuke the block.
+ }
+
+ // If we eliminated all predecessors of the block, delete the block now.
+ if (pred_empty(BB)) {
+ // We know there are no successors, so just nuke the block.
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
- }
-
- return true;
- }
-
- // Check out all of the conditional branches going to this return
- // instruction. If any of them just select between returns, change the
- // branch itself into a select/return pair.
- while (!CondBranchPreds.empty()) {
- BranchInst *BI = CondBranchPreds.pop_back_val();
-
- // Check to see if the non-BB successor is also a return block.
- if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
- isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
- SimplifyCondBranchToTwoReturns(BI, Builder))
- return true;
- }
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
- BasicBlock *BB = UI->getParent();
-
- bool Changed = false;
-
- // If there are any instructions immediately before the unreachable that can
- // be removed, do so.
- while (UI->getIterator() != BB->begin()) {
- BasicBlock::iterator BBI = UI->getIterator();
- --BBI;
- // Do not delete instructions that can have side effects which might cause
- // the unreachable to not be reachable; specifically, calls and volatile
- // operations may have this effect.
- if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
- break;
-
- if (BBI->mayHaveSideEffects()) {
- if (auto *SI = dyn_cast<StoreInst>(BBI)) {
- if (SI->isVolatile())
- break;
- } else if (auto *LI = dyn_cast<LoadInst>(BBI)) {
- if (LI->isVolatile())
- break;
- } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
- if (RMWI->isVolatile())
- break;
- } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
- if (CXI->isVolatile())
- break;
- } else if (isa<CatchPadInst>(BBI)) {
- // A catchpad may invoke exception object constructors and such, which
- // in some languages can be arbitrary code, so be conservative by
- // default.
- // For CoreCLR, it just involves a type test, so can be removed.
- if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) !=
- EHPersonality::CoreCLR)
- break;
- } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
- !isa<LandingPadInst>(BBI)) {
- break;
- }
- // Note that deleting LandingPad's here is in fact okay, although it
- // involves a bit of subtle reasoning. If this inst is a LandingPad,
- // all the predecessors of this block will be the unwind edges of Invokes,
- // and we can therefore guarantee this block will be erased.
- }
-
- // Delete this instruction (any uses are guaranteed to be dead)
- if (!BBI->use_empty())
- BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
- BBI->eraseFromParent();
- Changed = true;
- }
-
- // If the unreachable instruction is the first in the block, take a gander
- // at all of the predecessors of this instruction, and simplify them.
- if (&BB->front() != UI)
- return Changed;
-
+ }
+
+ return true;
+ }
+
+ // Check out all of the conditional branches going to this return
+ // instruction. If any of them just select between returns, change the
+ // branch itself into a select/return pair.
+ while (!CondBranchPreds.empty()) {
+ BranchInst *BI = CondBranchPreds.pop_back_val();
+
+ // Check to see if the non-BB successor is also a return block.
+ if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
+ isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
+ SimplifyCondBranchToTwoReturns(BI, Builder))
+ return true;
+ }
+ return false;
+}
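simplifyReturn above performs two source-visible rewrites: a return block reached only through unconditional branches is duplicated into those predecessors, and a conditional branch whose two successors both return immediately is collapsed into a select feeding a single return. A hypothetical C++ sketch of the overall effect:

// Before: both arms branch to one shared return block (a phi plus a return).
int beforeReturnFolding(bool c, int a, int b) {
  int r;
  if (c)
    r = a;     // unconditional branch to the common return block
  else
    r = b;     // unconditional branch to the common return block
  return r;    // shared ReturnInst
}

// After: each path returns directly; here the two returns further fold into
// a select (SimplifyCondBranchToTwoReturns).
int afterReturnFolding(bool c, int a, int b) {
  return c ? a : b;
}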
+
+bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
+ BasicBlock *BB = UI->getParent();
+
+ bool Changed = false;
+
+ // If there are any instructions immediately before the unreachable that can
+ // be removed, do so.
+ while (UI->getIterator() != BB->begin()) {
+ BasicBlock::iterator BBI = UI->getIterator();
+ --BBI;
+ // Do not delete instructions that can have side effects which might cause
+ // the unreachable to not be reachable; specifically, calls and volatile
+ // operations may have this effect.
+ if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
+ break;
+
+ if (BBI->mayHaveSideEffects()) {
+ if (auto *SI = dyn_cast<StoreInst>(BBI)) {
+ if (SI->isVolatile())
+ break;
+ } else if (auto *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI->isVolatile())
+ break;
+ } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
+ if (RMWI->isVolatile())
+ break;
+ } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
+ if (CXI->isVolatile())
+ break;
+ } else if (isa<CatchPadInst>(BBI)) {
+ // A catchpad may invoke exception object constructors and such, which
+ // in some languages can be arbitrary code, so be conservative by
+ // default.
+ // For CoreCLR, it just involves a type test, so can be removed.
+ if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) !=
+ EHPersonality::CoreCLR)
+ break;
+ } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
+ !isa<LandingPadInst>(BBI)) {
+ break;
+ }
+ // Note that deleting LandingPad's here is in fact okay, although it
+ // involves a bit of subtle reasoning. If this inst is a LandingPad,
+ // all the predecessors of this block will be the unwind edges of Invokes,
+ // and we can therefore guarantee this block will be erased.
+ }
+
+ // Delete this instruction (any uses are guaranteed to be dead)
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BBI->eraseFromParent();
+ Changed = true;
+ }
+
+ // If the unreachable instruction is the first in the block, take a gander
+ // at all of the predecessors of this instruction, and simplify them.
+ if (&BB->front() != UI)
+ return Changed;
+
std::vector<DominatorTree::UpdateType> Updates;
SmallSetVector<BasicBlock *, 8> Preds(pred_begin(BB), pred_end(BB));
- for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
auto *Predecessor = Preds[i];
Instruction *TI = Predecessor->getTerminator();
- IRBuilder<> Builder(TI);
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ IRBuilder<> Builder(TI);
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
// We could either have a proper unconditional branch,
// or a degenerate conditional branch with matching destinations.
if (all_of(BI->successors(),
[BB](auto *Successor) { return Successor == BB; })) {
- new UnreachableInst(TI->getContext(), TI);
- TI->eraseFromParent();
- Changed = true;
- } else {
+ new UnreachableInst(TI->getContext(), TI);
+ TI->eraseFromParent();
+ Changed = true;
+ } else {
assert(BI->isConditional() && "Can't get here with an uncond branch.");
- Value *Cond = BI->getCondition();
+ Value *Cond = BI->getCondition();
assert(BI->getSuccessor(0) != BI->getSuccessor(1) &&
"The destinations are guaranteed to be different here.");
- if (BI->getSuccessor(0) == BB) {
- Builder.CreateAssumption(Builder.CreateNot(Cond));
- Builder.CreateBr(BI->getSuccessor(1));
- } else {
- assert(BI->getSuccessor(1) == BB && "Incorrect CFG");
- Builder.CreateAssumption(Cond);
- Builder.CreateBr(BI->getSuccessor(0));
- }
- EraseTerminatorAndDCECond(BI);
- Changed = true;
- }
+ if (BI->getSuccessor(0) == BB) {
+ Builder.CreateAssumption(Builder.CreateNot(Cond));
+ Builder.CreateBr(BI->getSuccessor(1));
+ } else {
+ assert(BI->getSuccessor(1) == BB && "Incorrect CFG");
+ Builder.CreateAssumption(Cond);
+ Builder.CreateBr(BI->getSuccessor(0));
+ }
+ EraseTerminatorAndDCECond(BI);
+ Changed = true;
+ }
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- SwitchInstProfUpdateWrapper SU(*SI);
- for (auto i = SU->case_begin(), e = SU->case_end(); i != e;) {
- if (i->getCaseSuccessor() != BB) {
- ++i;
- continue;
- }
- BB->removePredecessor(SU->getParent());
- i = SU.removeCase(i);
- e = SU->case_end();
- Changed = true;
- }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ SwitchInstProfUpdateWrapper SU(*SI);
+ for (auto i = SU->case_begin(), e = SU->case_end(); i != e;) {
+ if (i->getCaseSuccessor() != BB) {
+ ++i;
+ continue;
+ }
+ BB->removePredecessor(SU->getParent());
+ i = SU.removeCase(i);
+ e = SU->case_end();
+ Changed = true;
+ }
// Note that the default destination can't be removed!
if (SI->getDefaultDest() != BB)
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- } else if (auto *II = dyn_cast<InvokeInst>(TI)) {
- if (II->getUnwindDest() == BB) {
+ } else if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ if (II->getUnwindDest() == BB) {
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
removeUnwindEdge(TI->getParent(), DTU);
- Changed = true;
- }
- } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) {
- if (CSI->getUnwindDest() == BB) {
+ Changed = true;
+ }
+ } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) {
+ if (CSI->getUnwindDest() == BB) {
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
removeUnwindEdge(TI->getParent(), DTU);
- Changed = true;
- continue;
- }
-
- for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
- E = CSI->handler_end();
- I != E; ++I) {
- if (*I == BB) {
- CSI->removeHandler(I);
- --I;
- --E;
- Changed = true;
- }
- }
+ Changed = true;
+ continue;
+ }
+
+ for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
+ E = CSI->handler_end();
+ I != E; ++I) {
+ if (*I == BB) {
+ CSI->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- if (CSI->getNumHandlers() == 0) {
- if (CSI->hasUnwindDest()) {
+ if (CSI->getNumHandlers() == 0) {
+ if (CSI->hasUnwindDest()) {
// Redirect all predecessors of the block containing CatchSwitchInst
// to instead branch to the CatchSwitchInst's unwind destination.
for (auto *PredecessorOfPredecessor : predecessors(Predecessor)) {
@@ -4604,66 +4604,66 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
{DominatorTree::Delete, PredecessorOfPredecessor, Predecessor});
}
Predecessor->replaceAllUsesWith(CSI->getUnwindDest());
- } else {
- // Rewrite all preds to unwind to caller (or from invoke to call).
+ } else {
+ // Rewrite all preds to unwind to caller (or from invoke to call).
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
SmallVector<BasicBlock *, 8> EHPreds(predecessors(Predecessor));
- for (BasicBlock *EHPred : EHPreds)
+ for (BasicBlock *EHPred : EHPreds)
removeUnwindEdge(EHPred, DTU);
- }
- // The catchswitch is no longer reachable.
- new UnreachableInst(CSI->getContext(), CSI);
- CSI->eraseFromParent();
- Changed = true;
- }
+ }
+ // The catchswitch is no longer reachable.
+ new UnreachableInst(CSI->getContext(), CSI);
+ CSI->eraseFromParent();
+ Changed = true;
+ }
} else if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
(void)CRI;
assert(CRI->hasUnwindDest() && CRI->getUnwindDest() == BB &&
"Expected to always have an unwind to BB.");
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- new UnreachableInst(TI->getContext(), TI);
- TI->eraseFromParent();
- Changed = true;
- }
- }
-
+ new UnreachableInst(TI->getContext(), TI);
+ TI->eraseFromParent();
+ Changed = true;
+ }
+ }
+
if (DTU)
DTU->applyUpdates(Updates);
- // If this block is now dead, remove it.
- if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) {
- // We know there are no successors, so just nuke the block.
+ // If this block is now dead, remove it.
+ if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) {
+ // We know there are no successors, so just nuke the block.
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
- return true;
- }
-
- return Changed;
-}
-
-static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
- assert(Cases.size() >= 1);
-
- array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
- for (size_t I = 1, E = Cases.size(); I != E; ++I) {
- if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
- return false;
- }
- return true;
-}
-
+ return true;
+ }
+
+ return Changed;
+}
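One of the rewrites above is worth picturing at the source level: a conditional branch whose taken side leads straight to unreachable is replaced by an llvm.assume on the (possibly negated) condition plus an unconditional branch to the surviving successor. A hypothetical illustration, using the Clang/GCC builtin:

int beforeUnreachableSucc(int *p) {
  if (p == nullptr)
    __builtin_unreachable();   // successor block is just 'unreachable'
  return *p;
}

// After the rewrite the IR effectively reads:
int afterUnreachableSucc(int *p) {
  // llvm.assume(p != nullptr)  -- CreateAssumption on the negated condition
  return *p;                    // single remaining successor
}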
+
+static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
+ assert(Cases.size() >= 1);
+
+ array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
+ for (size_t I = 1, E = Cases.size(); I != E; ++I) {
+ if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
+ return false;
+ }
+ return true;
+}
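A standalone sketch of the contiguity test above, written over plain integers instead of ConstantInt and assuming, as the predecessor == successor + 1 comparison implies, that ConstantIntSortPredicate orders the values in descending order:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

static bool casesAreContiguous(std::vector<uint64_t> Cases) {
  // Sort descending, then require each element to be one less than the
  // element before it, e.g. {7, 5, 6} -> 7, 6, 5 -> contiguous.
  std::sort(Cases.begin(), Cases.end(), std::greater<uint64_t>());
  for (size_t I = 1, E = Cases.size(); I != E; ++I)
    if (Cases[I - 1] != Cases[I] + 1)
      return false;
  return true;
}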
+
static void createUnreachableSwitchDefault(SwitchInst *Switch,
DomTreeUpdater *DTU) {
- LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
auto *BB = Switch->getParent();
BasicBlock *NewDefaultBlock = SplitBlockPredecessors(
Switch->getDefaultDest(), Switch->getParent(), "", DTU);
auto *OrigDefaultBlock = Switch->getDefaultDest();
- Switch->setDefaultDest(&*NewDefaultBlock);
+ Switch->setDefaultDest(&*NewDefaultBlock);
if (DTU)
DTU->applyUpdates({{DominatorTree::Insert, BB, &*NewDefaultBlock},
{DominatorTree::Delete, BB, OrigDefaultBlock}});
@@ -4671,200 +4671,200 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
SmallVector<DominatorTree::UpdateType, 2> Updates;
for (auto *Successor : successors(NewDefaultBlock))
Updates.push_back({DominatorTree::Delete, NewDefaultBlock, Successor});
- auto *NewTerminator = NewDefaultBlock->getTerminator();
- new UnreachableInst(Switch->getContext(), NewTerminator);
- EraseTerminatorAndDCECond(NewTerminator);
+ auto *NewTerminator = NewDefaultBlock->getTerminator();
+ new UnreachableInst(Switch->getContext(), NewTerminator);
+ EraseTerminatorAndDCECond(NewTerminator);
if (DTU)
DTU->applyUpdates(Updates);
-}
-
-/// Turn a switch with two reachable destinations into an integer range
-/// comparison and branch.
-bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
- IRBuilder<> &Builder) {
- assert(SI->getNumCases() > 1 && "Degenerate switch?");
-
- bool HasDefault =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
-
+}
+
+/// Turn a switch with two reachable destinations into an integer range
+/// comparison and branch.
+bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
+ IRBuilder<> &Builder) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+ bool HasDefault =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+
auto *BB = SI->getParent();
- // Partition the cases into two sets with different destinations.
- BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
- BasicBlock *DestB = nullptr;
- SmallVector<ConstantInt *, 16> CasesA;
- SmallVector<ConstantInt *, 16> CasesB;
-
- for (auto Case : SI->cases()) {
- BasicBlock *Dest = Case.getCaseSuccessor();
- if (!DestA)
- DestA = Dest;
- if (Dest == DestA) {
- CasesA.push_back(Case.getCaseValue());
- continue;
- }
- if (!DestB)
- DestB = Dest;
- if (Dest == DestB) {
- CasesB.push_back(Case.getCaseValue());
- continue;
- }
- return false; // More than two destinations.
- }
-
- assert(DestA && DestB &&
- "Single-destination switch should have been folded.");
- assert(DestA != DestB);
- assert(DestB != SI->getDefaultDest());
- assert(!CasesB.empty() && "There must be non-default cases.");
- assert(!CasesA.empty() || HasDefault);
-
- // Figure out if one of the sets of cases form a contiguous range.
- SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
- BasicBlock *ContiguousDest = nullptr;
- BasicBlock *OtherDest = nullptr;
- if (!CasesA.empty() && CasesAreContiguous(CasesA)) {
- ContiguousCases = &CasesA;
- ContiguousDest = DestA;
- OtherDest = DestB;
- } else if (CasesAreContiguous(CasesB)) {
- ContiguousCases = &CasesB;
- ContiguousDest = DestB;
- OtherDest = DestA;
- } else
- return false;
-
- // Start building the compare and branch.
-
- Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
- Constant *NumCases =
- ConstantInt::get(Offset->getType(), ContiguousCases->size());
-
- Value *Sub = SI->getCondition();
- if (!Offset->isNullValue())
- Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
-
- Value *Cmp;
- // If NumCases overflowed, then all possible values jump to the successor.
- if (NumCases->isNullValue() && !ContiguousCases->empty())
- Cmp = ConstantInt::getTrue(SI->getContext());
- else
- Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
- BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
-
- // Update weight for the newly-created conditional branch.
- if (HasBranchWeights(SI)) {
- SmallVector<uint64_t, 8> Weights;
- GetBranchWeights(SI, Weights);
- if (Weights.size() == 1 + SI->getNumCases()) {
- uint64_t TrueWeight = 0;
- uint64_t FalseWeight = 0;
- for (size_t I = 0, E = Weights.size(); I != E; ++I) {
- if (SI->getSuccessor(I) == ContiguousDest)
- TrueWeight += Weights[I];
- else
- FalseWeight += Weights[I];
- }
- while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) {
- TrueWeight /= 2;
- FalseWeight /= 2;
- }
- setBranchWeights(NewBI, TrueWeight, FalseWeight);
- }
- }
-
- // Prune obsolete incoming values off the successors' PHI nodes.
- for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = ContiguousCases->size();
- if (ContiguousDest == SI->getDefaultDest())
- ++PreviousEdges;
- for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
- cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
- }
- for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
- if (OtherDest == SI->getDefaultDest())
- ++PreviousEdges;
- for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
- cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
- }
-
- // Clean up the default block - it may have phis or other instructions before
- // the unreachable terminator.
- if (!HasDefault)
+ // Partition the cases into two sets with different destinations.
+ BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
+ BasicBlock *DestB = nullptr;
+ SmallVector<ConstantInt *, 16> CasesA;
+ SmallVector<ConstantInt *, 16> CasesB;
+
+ for (auto Case : SI->cases()) {
+ BasicBlock *Dest = Case.getCaseSuccessor();
+ if (!DestA)
+ DestA = Dest;
+ if (Dest == DestA) {
+ CasesA.push_back(Case.getCaseValue());
+ continue;
+ }
+ if (!DestB)
+ DestB = Dest;
+ if (Dest == DestB) {
+ CasesB.push_back(Case.getCaseValue());
+ continue;
+ }
+ return false; // More than two destinations.
+ }
+
+ assert(DestA && DestB &&
+ "Single-destination switch should have been folded.");
+ assert(DestA != DestB);
+ assert(DestB != SI->getDefaultDest());
+ assert(!CasesB.empty() && "There must be non-default cases.");
+ assert(!CasesA.empty() || HasDefault);
+
+ // Figure out if one of the sets of cases form a contiguous range.
+ SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
+ BasicBlock *ContiguousDest = nullptr;
+ BasicBlock *OtherDest = nullptr;
+ if (!CasesA.empty() && CasesAreContiguous(CasesA)) {
+ ContiguousCases = &CasesA;
+ ContiguousDest = DestA;
+ OtherDest = DestB;
+ } else if (CasesAreContiguous(CasesB)) {
+ ContiguousCases = &CasesB;
+ ContiguousDest = DestB;
+ OtherDest = DestA;
+ } else
+ return false;
+
+ // Start building the compare and branch.
+
+ Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
+ Constant *NumCases =
+ ConstantInt::get(Offset->getType(), ContiguousCases->size());
+
+ Value *Sub = SI->getCondition();
+ if (!Offset->isNullValue())
+ Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+
+ Value *Cmp;
+ // If NumCases overflowed, then all possible values jump to the successor.
+ if (NumCases->isNullValue() && !ContiguousCases->empty())
+ Cmp = ConstantInt::getTrue(SI->getContext());
+ else
+ Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+ BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
+
+ // Update weight for the newly-created conditional branch.
+ if (HasBranchWeights(SI)) {
+ SmallVector<uint64_t, 8> Weights;
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ uint64_t TrueWeight = 0;
+ uint64_t FalseWeight = 0;
+ for (size_t I = 0, E = Weights.size(); I != E; ++I) {
+ if (SI->getSuccessor(I) == ContiguousDest)
+ TrueWeight += Weights[I];
+ else
+ FalseWeight += Weights[I];
+ }
+ while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) {
+ TrueWeight /= 2;
+ FalseWeight /= 2;
+ }
+ setBranchWeights(NewBI, TrueWeight, FalseWeight);
+ }
+ }
+
+ // Prune obsolete incoming values off the successors' PHI nodes.
+ for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
+ unsigned PreviousEdges = ContiguousCases->size();
+ if (ContiguousDest == SI->getDefaultDest())
+ ++PreviousEdges;
+ for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ }
+ for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+ unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+ if (OtherDest == SI->getDefaultDest())
+ ++PreviousEdges;
+ for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ }
+
+ // Clean up the default block - it may have phis or other instructions before
+ // the unreachable terminator.
+ if (!HasDefault)
createUnreachableSwitchDefault(SI, DTU);
-
+
auto *UnreachableDefault = SI->getDefaultDest();
- // Drop the switch.
- SI->eraseFromParent();
-
+ // Drop the switch.
+ SI->eraseFromParent();
+
if (!HasDefault && DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnreachableDefault}});
- return true;
-}
-
-/// Compute masked bits for the condition of a switch
-/// and use it to remove dead cases.
+ return true;
+}
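The transformation above is the classic range-switch idiom: subtract the low bound of the contiguous case range (Offset is its negation) and compare unsigned-less-than against the number of cases. A source-level analogue with hypothetical values:

int beforeRangeSwitch(int x) {
  switch (x) {
  case 3: case 4: case 5: case 6:
    return 1;          // ContiguousDest
  default:
    return 0;          // OtherDest
  }
}

int afterRangeSwitch(int x) {
  // Sub = x + (-3); Cmp = icmp ult Sub, 4; branch on Cmp.
  return ((unsigned)x - 3u) < 4u ? 1 : 0;
}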
+
+/// Compute masked bits for the condition of a switch
+/// and use it to remove dead cases.
static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
AssumptionCache *AC,
- const DataLayout &DL) {
- Value *Cond = SI->getCondition();
- unsigned Bits = Cond->getType()->getIntegerBitWidth();
- KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
-
- // We can also eliminate cases by determining that their values are outside of
- // the limited range of the condition based on how many significant (non-sign)
- // bits are in the condition value.
- unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
- unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
-
- // Gather dead cases.
- SmallVector<ConstantInt *, 8> DeadCases;
+ const DataLayout &DL) {
+ Value *Cond = SI->getCondition();
+ unsigned Bits = Cond->getType()->getIntegerBitWidth();
+ KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
+
+ // We can also eliminate cases by determining that their values are outside of
+ // the limited range of the condition based on how many significant (non-sign)
+ // bits are in the condition value.
+ unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
+ unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
+
+ // Gather dead cases.
+ SmallVector<ConstantInt *, 8> DeadCases;
SmallMapVector<BasicBlock *, int, 8> NumPerSuccessorCases;
- for (auto &Case : SI->cases()) {
+ for (auto &Case : SI->cases()) {
auto *Successor = Case.getCaseSuccessor();
++NumPerSuccessorCases[Successor];
- const APInt &CaseVal = Case.getCaseValue()->getValue();
- if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
- (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
- DeadCases.push_back(Case.getCaseValue());
+ const APInt &CaseVal = Case.getCaseValue()->getValue();
+ if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
+ (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
+ DeadCases.push_back(Case.getCaseValue());
--NumPerSuccessorCases[Successor];
- LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
- << " is dead.\n");
- }
- }
-
- // If we can prove that the cases must cover all possible values, the
- // default destination becomes dead and we can remove it. If we know some
- // of the bits in the value, we can use that to more precisely compute the
- // number of possible unique case values.
- bool HasDefault =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
- const unsigned NumUnknownBits =
- Bits - (Known.Zero | Known.One).countPopulation();
- assert(NumUnknownBits <= Bits);
- if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */ &&
- SI->getNumCases() == (1ULL << NumUnknownBits)) {
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
+ << " is dead.\n");
+ }
+ }
+
+ // If we can prove that the cases must cover all possible values, the
+ // default destination becomes dead and we can remove it. If we know some
+ // of the bits in the value, we can use that to more precisely compute the
+ // number of possible unique case values.
+ bool HasDefault =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+ const unsigned NumUnknownBits =
+ Bits - (Known.Zero | Known.One).countPopulation();
+ assert(NumUnknownBits <= Bits);
+ if (HasDefault && DeadCases.empty() &&
+ NumUnknownBits < 64 /* avoid overflow */ &&
+ SI->getNumCases() == (1ULL << NumUnknownBits)) {
createUnreachableSwitchDefault(SI, DTU);
- return true;
- }
-
- if (DeadCases.empty())
- return false;
-
- SwitchInstProfUpdateWrapper SIW(*SI);
- for (ConstantInt *DeadCase : DeadCases) {
- SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
- assert(CaseI != SI->case_default() &&
- "Case was not found. Probably mistake in DeadCases forming.");
- // Prune unused values from PHI nodes.
- CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
- SIW.removeCase(CaseI);
- }
-
+ return true;
+ }
+
+ if (DeadCases.empty())
+ return false;
+
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ for (ConstantInt *DeadCase : DeadCases) {
+ SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
+ assert(CaseI != SI->case_default() &&
+ "Case was not found. Probably mistake in DeadCases forming.");
+ // Prune unused values from PHI nodes.
+ CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
+ SIW.removeCase(CaseI);
+ }
+
std::vector<DominatorTree::UpdateType> Updates;
for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
if (I.second == 0)
@@ -4872,366 +4872,366 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
if (DTU)
DTU->applyUpdates(Updates);
- return true;
-}
-
-/// If BB would be eligible for simplification by
-/// TryToSimplifyUncondBranchFromEmptyBlock (i.e. it is empty and terminated
-/// by an unconditional branch), look at the phi node for BB in the successor
-/// block and see if the incoming value is equal to CaseValue. If so, return
-/// the phi node, and set PhiIndex to BB's index in the phi node.
-static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
- BasicBlock *BB, int *PhiIndex) {
- if (BB->getFirstNonPHIOrDbg() != BB->getTerminator())
- return nullptr; // BB must be empty to be a candidate for simplification.
- if (!BB->getSinglePredecessor())
- return nullptr; // BB must be dominated by the switch.
-
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Branch || !Branch->isUnconditional())
- return nullptr; // Terminator must be unconditional branch.
-
- BasicBlock *Succ = Branch->getSuccessor(0);
-
- for (PHINode &PHI : Succ->phis()) {
- int Idx = PHI.getBasicBlockIndex(BB);
- assert(Idx >= 0 && "PHI has no entry for predecessor?");
-
- Value *InValue = PHI.getIncomingValue(Idx);
- if (InValue != CaseValue)
- continue;
-
- *PhiIndex = Idx;
- return &PHI;
- }
-
- return nullptr;
-}
-
-/// Try to forward the condition of a switch instruction to a phi node
-/// dominated by the switch, if that would mean that some of the destination
-/// blocks of the switch can be folded away. Return true if a change is made.
-static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
- using ForwardingNodesMap = DenseMap<PHINode *, SmallVector<int, 4>>;
-
- ForwardingNodesMap ForwardingNodes;
- BasicBlock *SwitchBlock = SI->getParent();
- bool Changed = false;
- for (auto &Case : SI->cases()) {
- ConstantInt *CaseValue = Case.getCaseValue();
- BasicBlock *CaseDest = Case.getCaseSuccessor();
-
- // Replace phi operands in successor blocks that are using the constant case
- // value rather than the switch condition variable:
- // switchbb:
- // switch i32 %x, label %default [
- // i32 17, label %succ
- // ...
- // succ:
- // %r = phi i32 ... [ 17, %switchbb ] ...
- // -->
- // %r = phi i32 ... [ %x, %switchbb ] ...
-
- for (PHINode &Phi : CaseDest->phis()) {
- // This only works if there is exactly 1 incoming edge from the switch to
- // a phi. If there is >1, that means multiple cases of the switch map to 1
- // value in the phi, and that phi value is not the switch condition. Thus,
- // this transform would not make sense (the phi would be invalid because
- // a phi can't have different incoming values from the same block).
- int SwitchBBIdx = Phi.getBasicBlockIndex(SwitchBlock);
- if (Phi.getIncomingValue(SwitchBBIdx) == CaseValue &&
- count(Phi.blocks(), SwitchBlock) == 1) {
- Phi.setIncomingValue(SwitchBBIdx, SI->getCondition());
- Changed = true;
- }
- }
-
- // Collect phi nodes that are indirectly using this switch's case constants.
- int PhiIdx;
- if (auto *Phi = FindPHIForConditionForwarding(CaseValue, CaseDest, &PhiIdx))
- ForwardingNodes[Phi].push_back(PhiIdx);
- }
-
- for (auto &ForwardingNode : ForwardingNodes) {
- PHINode *Phi = ForwardingNode.first;
- SmallVectorImpl<int> &Indexes = ForwardingNode.second;
- if (Indexes.size() < 2)
- continue;
-
- for (int Index : Indexes)
- Phi->setIncomingValue(Index, SI->getCondition());
- Changed = true;
- }
-
- return Changed;
-}
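A source-level rendering of the phi rewrite sketched in the comment above: where a case's incoming phi value equals the case constant, the value can be replaced by the switch condition itself, after which the case blocks become interchangeable and can fold away. Hypothetical values:

int beforeForwarding(int x) {
  int r;
  switch (x) {
  case 17: r = 17; break;   // phi operand equals the case value
  case 19: r = 19; break;
  default: r = 0;  break;
  }
  return r;
}

int afterForwarding(int x) {
  int r;
  switch (x) {
  case 17:                  // both cases now simply yield the condition
  case 19: r = x; break;
  default: r = 0;  break;
  }
  return r;
}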
-
-/// Return true if the backend will be able to handle
-/// initializing an array of constants like C.
-static bool ValidLookupTableConstant(Constant *C, const TargetTransformInfo &TTI) {
- if (C->isThreadDependent())
- return false;
- if (C->isDLLImportDependent())
- return false;
-
- if (!isa<ConstantFP>(C) && !isa<ConstantInt>(C) &&
- !isa<ConstantPointerNull>(C) && !isa<GlobalValue>(C) &&
- !isa<UndefValue>(C) && !isa<ConstantExpr>(C))
- return false;
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
- if (!ValidLookupTableConstant(CE->getOperand(0), TTI))
- return false;
- }
-
- if (!TTI.shouldBuildLookupTablesForConstant(C))
- return false;
-
- return true;
-}
-
-/// If V is a Constant, return it. Otherwise, try to look up
-/// its constant value in ConstantPool, returning 0 if it's not there.
-static Constant *
-LookupConstant(Value *V,
- const SmallDenseMap<Value *, Constant *> &ConstantPool) {
- if (Constant *C = dyn_cast<Constant>(V))
- return C;
- return ConstantPool.lookup(V);
-}
-
-/// Try to fold instruction I into a constant. This works for
-/// simple instructions such as binary operations where both operands are
-/// constant or can be replaced by constants from the ConstantPool. Returns the
-/// resulting constant on success, 0 otherwise.
-static Constant *
-ConstantFold(Instruction *I, const DataLayout &DL,
- const SmallDenseMap<Value *, Constant *> &ConstantPool) {
- if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
- Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
- if (!A)
- return nullptr;
- if (A->isAllOnesValue())
- return LookupConstant(Select->getTrueValue(), ConstantPool);
- if (A->isNullValue())
- return LookupConstant(Select->getFalseValue(), ConstantPool);
- return nullptr;
- }
-
- SmallVector<Constant *, 4> COps;
- for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
- if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
- COps.push_back(A);
- else
- return nullptr;
- }
-
- if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
- return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
- COps[1], DL);
- }
-
- return ConstantFoldInstOperands(I, COps, DL);
-}
-
-/// Try to determine the resulting constant values in phi nodes
-/// at the common destination basic block, *CommonDest, for one of the case
- /// destinations CaseDest corresponding to value CaseVal (0 for the default
-/// case), of a switch instruction SI.
-static bool
-GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
- BasicBlock **CommonDest,
- SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res,
- const DataLayout &DL, const TargetTransformInfo &TTI) {
- // The block from which we enter the common destination.
- BasicBlock *Pred = SI->getParent();
-
- // If CaseDest is empty except for some side-effect free instructions through
- // which we can constant-propagate the CaseVal, continue to its successor.
- SmallDenseMap<Value *, Constant *> ConstantPool;
- ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
- for (Instruction &I : CaseDest->instructionsWithoutDebug()) {
- if (I.isTerminator()) {
- // If the terminator is a simple branch, continue to the next block.
- if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator())
- return false;
- Pred = CaseDest;
- CaseDest = I.getSuccessor(0);
- } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
- // Instruction is side-effect free and constant.
-
- // If the instruction has uses outside this block or a phi node slot for
- // the block, it is not safe to bypass the instruction since it would then
- // no longer dominate all its uses.
- for (auto &Use : I.uses()) {
- User *User = Use.getUser();
- if (Instruction *I = dyn_cast<Instruction>(User))
- if (I->getParent() == CaseDest)
- continue;
- if (PHINode *Phi = dyn_cast<PHINode>(User))
- if (Phi->getIncomingBlock(Use) == CaseDest)
- continue;
- return false;
- }
-
- ConstantPool.insert(std::make_pair(&I, C));
- } else {
- break;
- }
- }
-
- // If we did not have a CommonDest before, use the current one.
- if (!*CommonDest)
- *CommonDest = CaseDest;
- // If the destination isn't the common one, abort.
- if (CaseDest != *CommonDest)
- return false;
-
- // Get the values for this case from phi nodes in the destination block.
- for (PHINode &PHI : (*CommonDest)->phis()) {
- int Idx = PHI.getBasicBlockIndex(Pred);
- if (Idx == -1)
- continue;
-
- Constant *ConstVal =
- LookupConstant(PHI.getIncomingValue(Idx), ConstantPool);
- if (!ConstVal)
- return false;
-
- // Be conservative about which kinds of constants we support.
- if (!ValidLookupTableConstant(ConstVal, TTI))
- return false;
-
- Res.push_back(std::make_pair(&PHI, ConstVal));
- }
-
- return Res.size() > 0;
-}
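Shown at the source level with hypothetical values, GetCaseResults pins the switch condition to the case constant, folds the case block's side-effect-free instructions, and records the constant each phi in the common destination would receive:

int caseResultsExample(int x) {
  int r;
  switch (x) {
  case 2:  r = x * 3 + 1; break;  // with x pinned to 2 this folds to 7
  case 5:  r = x - 4;     break;  // with x pinned to 5 this folds to 1
  default: r = 0;         break;
  }
  return r;                       // per-phi results: {2 -> 7, 5 -> 1, default -> 0}
}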
-
-// Helper function used to add CaseVal to the list of cases that generate
-// Result. Returns the updated number of cases that generate this result.
-static uintptr_t MapCaseToResult(ConstantInt *CaseVal,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *Result) {
- for (auto &I : UniqueResults) {
- if (I.first == Result) {
- I.second.push_back(CaseVal);
- return I.second.size();
- }
- }
- UniqueResults.push_back(
- std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
- return 1;
-}
-
-// Helper function that initializes a map containing
-// results for the PHI node of the common destination block for a switch
-// instruction. Returns false if multiple PHI nodes have been found or if
-// there is not a common destination block for the switch.
-static bool
-InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *&DefaultResult, const DataLayout &DL,
- const TargetTransformInfo &TTI,
- uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) {
- for (auto &I : SI->cases()) {
- ConstantInt *CaseVal = I.getCaseValue();
-
- // Resulting value at phi nodes for this case value.
- SwitchCaseResultsTy Results;
- if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results,
- DL, TTI))
- return false;
-
- // Only one value per case is permitted.
- if (Results.size() > 1)
- return false;
-
- // Add the case->result mapping to UniqueResults.
- const uintptr_t NumCasesForResult =
- MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
-
- // Early out if there are too many cases for this result.
- if (NumCasesForResult > MaxCasesPerResult)
- return false;
-
- // Early out if there are too many unique results.
- if (UniqueResults.size() > MaxUniqueResults)
- return false;
-
- // Check the PHI consistency.
- if (!PHI)
- PHI = Results[0].first;
- else if (PHI != Results[0].first)
- return false;
- }
- // Find the default result value.
- SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
- BasicBlock *DefaultDest = SI->getDefaultDest();
- GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults,
- DL, TTI);
- // If the default value is not found, abort unless the default destination
- // is unreachable.
- DefaultResult =
- DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr;
- if ((!DefaultResult &&
- !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
- return false;
-
- return true;
-}
-
-// Helper function that checks if it is possible to transform a switch with only
-// two cases (or two cases + default) that produces a result into a select.
-// Example:
-// switch (a) {
-// case 10: %0 = icmp eq i32 %a, 10
-// return 10; %1 = select i1 %0, i32 10, i32 4
-// case 20: ----> %2 = icmp eq i32 %a, 20
-// return 2; %3 = select i1 %2, i32 2, i32 %1
-// default:
-// return 4;
-// }
-static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
- Constant *DefaultResult, Value *Condition,
- IRBuilder<> &Builder) {
- assert(ResultVector.size() == 2 &&
- "We should have exactly two unique results at this point");
- // If we are selecting between only two cases, transform into a simple
- // select or a two-way select if default is possible.
- if (ResultVector[0].second.size() == 1 &&
- ResultVector[1].second.size() == 1) {
- ConstantInt *const FirstCase = ResultVector[0].second[0];
- ConstantInt *const SecondCase = ResultVector[1].second[0];
-
- bool DefaultCanTrigger = DefaultResult;
- Value *SelectValue = ResultVector[1].first;
- if (DefaultCanTrigger) {
- Value *const ValueCompare =
- Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
- SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
- DefaultResult, "switch.select");
- }
- Value *const ValueCompare =
- Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
- return Builder.CreateSelect(ValueCompare, ResultVector[0].first,
- SelectValue, "switch.select");
- }
-
- return nullptr;
-}
-
-// Helper function to cleanup a switch instruction that has been converted into
-// a select, fixing up PHI nodes and basic blocks.
-static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
- Value *SelectValue,
+ return true;
+}
+
+/// If BB would be eligible for simplification by
+/// TryToSimplifyUncondBranchFromEmptyBlock (i.e. it is empty and terminated
+/// by an unconditional branch), look at the phi node for BB in the successor
+/// block and see if the incoming value is equal to CaseValue. If so, return
+/// the phi node, and set PhiIndex to BB's index in the phi node.
+static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
+ BasicBlock *BB, int *PhiIndex) {
+ if (BB->getFirstNonPHIOrDbg() != BB->getTerminator())
+ return nullptr; // BB must be empty to be a candidate for simplification.
+ if (!BB->getSinglePredecessor())
+ return nullptr; // BB must be dominated by the switch.
+
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Branch || !Branch->isUnconditional())
+ return nullptr; // Terminator must be unconditional branch.
+
+ BasicBlock *Succ = Branch->getSuccessor(0);
+
+ for (PHINode &PHI : Succ->phis()) {
+ int Idx = PHI.getBasicBlockIndex(BB);
+ assert(Idx >= 0 && "PHI has no entry for predecessor?");
+
+ Value *InValue = PHI.getIncomingValue(Idx);
+ if (InValue != CaseValue)
+ continue;
+
+ *PhiIndex = Idx;
+ return &PHI;
+ }
+
+ return nullptr;
+}
+
+/// Try to forward the condition of a switch instruction to a phi node
+/// dominated by the switch, if that would mean that some of the destination
+/// blocks of the switch can be folded away. Return true if a change is made.
+static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
+ using ForwardingNodesMap = DenseMap<PHINode *, SmallVector<int, 4>>;
+
+ ForwardingNodesMap ForwardingNodes;
+ BasicBlock *SwitchBlock = SI->getParent();
+ bool Changed = false;
+ for (auto &Case : SI->cases()) {
+ ConstantInt *CaseValue = Case.getCaseValue();
+ BasicBlock *CaseDest = Case.getCaseSuccessor();
+
+ // Replace phi operands in successor blocks that are using the constant case
+ // value rather than the switch condition variable:
+ // switchbb:
+ // switch i32 %x, label %default [
+ // i32 17, label %succ
+ // ...
+ // succ:
+ // %r = phi i32 ... [ 17, %switchbb ] ...
+ // -->
+ // %r = phi i32 ... [ %x, %switchbb ] ...
+
+ for (PHINode &Phi : CaseDest->phis()) {
+ // This only works if there is exactly 1 incoming edge from the switch to
+ // a phi. If there is >1, that means multiple cases of the switch map to 1
+ // value in the phi, and that phi value is not the switch condition. Thus,
+ // this transform would not make sense (the phi would be invalid because
+ // a phi can't have different incoming values from the same block).
+ int SwitchBBIdx = Phi.getBasicBlockIndex(SwitchBlock);
+ if (Phi.getIncomingValue(SwitchBBIdx) == CaseValue &&
+ count(Phi.blocks(), SwitchBlock) == 1) {
+ Phi.setIncomingValue(SwitchBBIdx, SI->getCondition());
+ Changed = true;
+ }
+ }
+
+ // Collect phi nodes that are indirectly using this switch's case constants.
+ int PhiIdx;
+ if (auto *Phi = FindPHIForConditionForwarding(CaseValue, CaseDest, &PhiIdx))
+ ForwardingNodes[Phi].push_back(PhiIdx);
+ }
+
+ for (auto &ForwardingNode : ForwardingNodes) {
+ PHINode *Phi = ForwardingNode.first;
+ SmallVectorImpl<int> &Indexes = ForwardingNode.second;
+ if (Indexes.size() < 2)
+ continue;
+
+ for (int Index : Indexes)
+ Phi->setIncomingValue(Index, SI->getCondition());
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// Return true if the backend will be able to handle
+/// initializing an array of constants like C.
+static bool ValidLookupTableConstant(Constant *C, const TargetTransformInfo &TTI) {
+ if (C->isThreadDependent())
+ return false;
+ if (C->isDLLImportDependent())
+ return false;
+
+ if (!isa<ConstantFP>(C) && !isa<ConstantInt>(C) &&
+ !isa<ConstantPointerNull>(C) && !isa<GlobalValue>(C) &&
+ !isa<UndefValue>(C) && !isa<ConstantExpr>(C))
+ return false;
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+ if (!ValidLookupTableConstant(CE->getOperand(0), TTI))
+ return false;
+ }
+
+ if (!TTI.shouldBuildLookupTablesForConstant(C))
+ return false;
+
+ return true;
+}
+
+/// If V is a Constant, return it. Otherwise, try to look up
+/// its constant value in ConstantPool, returning 0 if it's not there.
+static Constant *
+LookupConstant(Value *V,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C;
+ return ConstantPool.lookup(V);
+}
+
+/// Try to fold instruction I into a constant. This works for
+/// simple instructions such as binary operations where both operands are
+/// constant or can be replaced by constants from the ConstantPool. Returns the
+/// resulting constant on success, 0 otherwise.
+static Constant *
+ConstantFold(Instruction *I, const DataLayout &DL,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
+ if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
+ Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
+ if (!A)
+ return nullptr;
+ if (A->isAllOnesValue())
+ return LookupConstant(Select->getTrueValue(), ConstantPool);
+ if (A->isNullValue())
+ return LookupConstant(Select->getFalseValue(), ConstantPool);
+ return nullptr;
+ }
+
+ SmallVector<Constant *, 4> COps;
+ for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
+ if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
+ COps.push_back(A);
+ else
+ return nullptr;
+ }
+
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
+ COps[1], DL);
+ }
+
+ return ConstantFoldInstOperands(I, COps, DL);
+}
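A minimal standalone sketch of the operand folding above, in plain C++ with hypothetical names rather than the LLVM API: look each operand up in the constant pool and fold only when every operand is known.

#include <optional>
#include <string>
#include <unordered_map>

// Fold "LHS + RHS" when both operands are already known constants in Pool;
// otherwise report failure, mirroring the nullptr return above.
static std::optional<long> foldAdd(const std::string &LHS, const std::string &RHS,
                                   const std::unordered_map<std::string, long> &Pool) {
  auto L = Pool.find(LHS), R = Pool.find(RHS);
  if (L == Pool.end() || R == Pool.end())
    return std::nullopt; // An operand is not constant; nothing to fold.
  return L->second + R->second;
}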
+
+/// Try to determine the resulting constant values in phi nodes
+/// at the common destination basic block, *CommonDest, for one of the case
+/// destinations CaseDest corresponding to value CaseVal (0 for the default
+/// case), of a switch instruction SI.
+static bool
+GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
+ BasicBlock **CommonDest,
+ SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res,
+ const DataLayout &DL, const TargetTransformInfo &TTI) {
+ // The block from which we enter the common destination.
+ BasicBlock *Pred = SI->getParent();
+
+ // If CaseDest is empty except for some side-effect free instructions through
+ // which we can constant-propagate the CaseVal, continue to its successor.
+ SmallDenseMap<Value *, Constant *> ConstantPool;
+ ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
+ for (Instruction &I : CaseDest->instructionsWithoutDebug()) {
+ if (I.isTerminator()) {
+ // If the terminator is a simple branch, continue to the next block.
+ if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator())
+ return false;
+ Pred = CaseDest;
+ CaseDest = I.getSuccessor(0);
+ } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
+ // Instruction is side-effect free and constant.
+
+ // If the instruction has uses outside this block or a phi node slot for
+ // the block, it is not safe to bypass the instruction since it would then
+ // no longer dominate all its uses.
+ for (auto &Use : I.uses()) {
+ User *User = Use.getUser();
+ if (Instruction *I = dyn_cast<Instruction>(User))
+ if (I->getParent() == CaseDest)
+ continue;
+ if (PHINode *Phi = dyn_cast<PHINode>(User))
+ if (Phi->getIncomingBlock(Use) == CaseDest)
+ continue;
+ return false;
+ }
+
+ ConstantPool.insert(std::make_pair(&I, C));
+ } else {
+ break;
+ }
+ }
+
+ // If we did not have a CommonDest before, use the current one.
+ if (!*CommonDest)
+ *CommonDest = CaseDest;
+ // If the destination isn't the common one, abort.
+ if (CaseDest != *CommonDest)
+ return false;
+
+ // Get the values for this case from phi nodes in the destination block.
+ for (PHINode &PHI : (*CommonDest)->phis()) {
+ int Idx = PHI.getBasicBlockIndex(Pred);
+ if (Idx == -1)
+ continue;
+
+ Constant *ConstVal =
+ LookupConstant(PHI.getIncomingValue(Idx), ConstantPool);
+ if (!ConstVal)
+ return false;
+
+ // Be conservative about which kinds of constants we support.
+ if (!ValidLookupTableConstant(ConstVal, TTI))
+ return false;
+
+ Res.push_back(std::make_pair(&PHI, ConstVal));
+ }
+
+ return Res.size() > 0;
+}
+
+// Helper function used to add CaseVal to the list of cases that generate
+// Result. Returns the updated number of cases that generate this result.
+static uintptr_t MapCaseToResult(ConstantInt *CaseVal,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *Result) {
+ for (auto &I : UniqueResults) {
+ if (I.first == Result) {
+ I.second.push_back(CaseVal);
+ return I.second.size();
+ }
+ }
+ UniqueResults.push_back(
+ std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
+ return 1;
+}
+
+// Helper function that initializes a map containing
+// results for the PHI node of the common destination block for a switch
+// instruction. Returns false if multiple PHI nodes have been found or if
+// there is not a common destination block for the switch.
+static bool
+InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *&DefaultResult, const DataLayout &DL,
+ const TargetTransformInfo &TTI,
+ uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) {
+ for (auto &I : SI->cases()) {
+ ConstantInt *CaseVal = I.getCaseValue();
+
+ // Resulting value at phi nodes for this case value.
+ SwitchCaseResultsTy Results;
+ if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results,
+ DL, TTI))
+ return false;
+
+ // Only one value per case is permitted.
+ if (Results.size() > 1)
+ return false;
+
+ // Add the case->result mapping to UniqueResults.
+ const uintptr_t NumCasesForResult =
+ MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+ // Early out if there are too many cases for this result.
+ if (NumCasesForResult > MaxCasesPerResult)
+ return false;
+
+ // Early out if there are too many unique results.
+ if (UniqueResults.size() > MaxUniqueResults)
+ return false;
+
+ // Check the PHI consistency.
+ if (!PHI)
+ PHI = Results[0].first;
+ else if (PHI != Results[0].first)
+ return false;
+ }
+ // Find the default result value.
+ SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
+ BasicBlock *DefaultDest = SI->getDefaultDest();
+ GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults,
+ DL, TTI);
+ // If the default value is not found, abort unless the default destination
+ // is unreachable.
+ DefaultResult =
+ DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr;
+ if ((!DefaultResult &&
+ !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
+ return false;
+
+ return true;
+}
+
+// Helper function that checks if it is possible to transform a switch with only
+// two cases (or two cases + default) that produces a result into a select.
+// Example:
+// switch (a) {
+// case 10: %0 = icmp eq i32 %a, 10
+// return 10; %1 = select i1 %0, i32 10, i32 4
+// case 20: ----> %2 = icmp eq i32 %a, 20
+// return 2; %3 = select i1 %2, i32 2, i32 %1
+// default:
+// return 4;
+// }
+static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
+ Constant *DefaultResult, Value *Condition,
+ IRBuilder<> &Builder) {
+ assert(ResultVector.size() == 2 &&
+ "We should have exactly two unique results at this point");
+ // If we are selecting between only two cases, transform into a simple
+ // select or a two-way select if default is possible.
+ if (ResultVector[0].second.size() == 1 &&
+ ResultVector[1].second.size() == 1) {
+ ConstantInt *const FirstCase = ResultVector[0].second[0];
+ ConstantInt *const SecondCase = ResultVector[1].second[0];
+
+ bool DefaultCanTrigger = DefaultResult;
+ Value *SelectValue = ResultVector[1].first;
+ if (DefaultCanTrigger) {
+ Value *const ValueCompare =
+ Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
+ SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+ DefaultResult, "switch.select");
+ }
+ Value *const ValueCompare =
+ Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
+ return Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+ SelectValue, "switch.select");
+ }
+
+ return nullptr;
+}
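In C terms, the select chain produced for the commented example above is roughly the following (a sketch, not generated code):

int lowered(int a) {
  // switch (a) { case 10: return 10; case 20: return 2; default: return 4; }
  int inner = (a == 20) ? 2 : 4;  // second case vs. the default result
  return (a == 10) ? 10 : inner;  // first case vs. the inner select
}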
+
+// Helper function to cleanup a switch instruction that has been converted into
+// a select, fixing up PHI nodes and basic blocks.
+static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
+ Value *SelectValue,
IRBuilder<> &Builder,
DomTreeUpdater *DTU) {
std::vector<DominatorTree::UpdateType> Updates;
- BasicBlock *SelectBB = SI->getParent();
+ BasicBlock *SelectBB = SI->getParent();
BasicBlock *DestBB = PHI->getParent();
if (!is_contained(predecessors(DestBB), SelectBB))
@@ -5240,861 +5240,861 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
// Remove the switch.
- while (PHI->getBasicBlockIndex(SelectBB) >= 0)
- PHI->removeIncomingValue(SelectBB);
- PHI->addIncoming(SelectValue, SelectBB);
-
- for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
- BasicBlock *Succ = SI->getSuccessor(i);
-
+ while (PHI->getBasicBlockIndex(SelectBB) >= 0)
+ PHI->removeIncomingValue(SelectBB);
+ PHI->addIncoming(SelectValue, SelectBB);
+
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+
if (Succ == DestBB)
- continue;
- Succ->removePredecessor(SelectBB);
+ continue;
+ Succ->removePredecessor(SelectBB);
Updates.push_back({DominatorTree::Delete, SelectBB, Succ});
- }
- SI->eraseFromParent();
+ }
+ SI->eraseFromParent();
if (DTU)
DTU->applyUpdates(Updates);
-}
-
-/// If the switch is only used to initialize one or more
-/// phi nodes in a common successor block with only two different
-/// constant values, replace the switch with select.
-static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
+}
+
+/// If the switch is only used to initialize one or more
+/// phi nodes in a common successor block with only two different
+/// constant values, replace the switch with select.
+static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- Value *const Cond = SI->getCondition();
- PHINode *PHI = nullptr;
- BasicBlock *CommonDest = nullptr;
- Constant *DefaultResult;
- SwitchCaseResultVectorTy UniqueResults;
- // Collect all the cases that will deliver the same value from the switch.
- if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
- DL, TTI, 2, 1))
- return false;
- // A select chooses between at most two values.
- if (UniqueResults.size() != 2)
- return false;
- assert(PHI != nullptr && "PHI for value select not found");
-
- Builder.SetInsertPoint(SI);
- Value *SelectValue =
- ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder);
- if (SelectValue) {
+ const TargetTransformInfo &TTI) {
+ Value *const Cond = SI->getCondition();
+ PHINode *PHI = nullptr;
+ BasicBlock *CommonDest = nullptr;
+ Constant *DefaultResult;
+ SwitchCaseResultVectorTy UniqueResults;
+ // Collect all the cases that will deliver the same value from the switch.
+ if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
+ DL, TTI, 2, 1))
+ return false;
+ // A select chooses between at most two values.
+ if (UniqueResults.size() != 2)
+ return false;
+ assert(PHI != nullptr && "PHI for value select not found");
+
+ Builder.SetInsertPoint(SI);
+ Value *SelectValue =
+ ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder);
+ if (SelectValue) {
RemoveSwitchAfterSelectConversion(SI, PHI, SelectValue, Builder, DTU);
- return true;
- }
- // The switch couldn't be converted into a select.
- return false;
-}
-
-namespace {
-
-/// This class represents a lookup table that can be used to replace a switch.
-class SwitchLookupTable {
-public:
- /// Create a lookup table to use as a switch replacement with the contents
- /// of Values, using DefaultValue to fill any holes in the table.
- SwitchLookupTable(
- Module &M, uint64_t TableSize, ConstantInt *Offset,
- const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
-
- /// Build instructions with Builder to retrieve the value at
- /// the position given by Index in the lookup table.
- Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
-
- /// Return true if a table with TableSize elements of
- /// type ElementType would fit in a target-legal register.
- static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
- Type *ElementType);
-
-private:
- // Depending on the contents of the table, it can be represented in
- // different ways.
- enum {
- // For tables where each element contains the same value, we just have to
- // store that single value and return it for each lookup.
- SingleValueKind,
-
- // For tables where there is a linear relationship between table index
- // and values. We calculate the result with a simple multiplication
- // and addition instead of a table lookup.
- LinearMapKind,
-
- // For small tables with integer elements, we can pack them into a bitmap
- // that fits into a target-legal register. Values are retrieved by
- // shift and mask operations.
- BitMapKind,
-
- // The table is stored as an array of values. Values are retrieved by load
- // instructions from the table.
- ArrayKind
- } Kind;
-
- // For SingleValueKind, this is the single value.
- Constant *SingleValue = nullptr;
-
- // For BitMapKind, this is the bitmap.
- ConstantInt *BitMap = nullptr;
- IntegerType *BitMapElementTy = nullptr;
-
- // For LinearMapKind, these are the constants used to derive the value.
- ConstantInt *LinearOffset = nullptr;
- ConstantInt *LinearMultiplier = nullptr;
-
- // For ArrayKind, this is the array.
- GlobalVariable *Array = nullptr;
-};
-
-} // end anonymous namespace
-
-SwitchLookupTable::SwitchLookupTable(
- Module &M, uint64_t TableSize, ConstantInt *Offset,
- const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) {
- assert(Values.size() && "Can't build lookup table without values!");
- assert(TableSize >= Values.size() && "Can't fit values in table!");
-
- // If all values in the table are equal, this is that value.
- SingleValue = Values.begin()->second;
-
- Type *ValueType = Values.begin()->second->getType();
-
- // Build up the table contents.
- SmallVector<Constant *, 64> TableContents(TableSize);
- for (size_t I = 0, E = Values.size(); I != E; ++I) {
- ConstantInt *CaseVal = Values[I].first;
- Constant *CaseRes = Values[I].second;
- assert(CaseRes->getType() == ValueType);
-
- uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
- TableContents[Idx] = CaseRes;
-
- if (CaseRes != SingleValue)
- SingleValue = nullptr;
- }
-
- // Fill in any holes in the table with the default result.
- if (Values.size() < TableSize) {
- assert(DefaultValue &&
- "Need a default value to fill the lookup table holes.");
- assert(DefaultValue->getType() == ValueType);
- for (uint64_t I = 0; I < TableSize; ++I) {
- if (!TableContents[I])
- TableContents[I] = DefaultValue;
- }
-
- if (DefaultValue != SingleValue)
- SingleValue = nullptr;
- }
-
- // If each element in the table contains the same value, we only need to store
- // that single value.
- if (SingleValue) {
- Kind = SingleValueKind;
- return;
- }
-
- // Check if we can derive the value with a linear transformation from the
- // table index.
- if (isa<IntegerType>(ValueType)) {
- bool LinearMappingPossible = true;
- APInt PrevVal;
- APInt DistToPrev;
- assert(TableSize >= 2 && "Should be a SingleValue table.");
- // Check if there is the same distance between two consecutive values.
- for (uint64_t I = 0; I < TableSize; ++I) {
- ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
- if (!ConstVal) {
- // This is an undef. We could deal with it, but undefs in lookup tables
- // are very rare. It's probably not worth the additional complexity.
- LinearMappingPossible = false;
- break;
- }
- const APInt &Val = ConstVal->getValue();
- if (I != 0) {
- APInt Dist = Val - PrevVal;
- if (I == 1) {
- DistToPrev = Dist;
- } else if (Dist != DistToPrev) {
- LinearMappingPossible = false;
- break;
- }
- }
- PrevVal = Val;
- }
- if (LinearMappingPossible) {
- LinearOffset = cast<ConstantInt>(TableContents[0]);
- LinearMultiplier = ConstantInt::get(M.getContext(), DistToPrev);
- Kind = LinearMapKind;
- ++NumLinearMaps;
- return;
- }
- }
-
- // If the type is integer and the table fits in a register, build a bitmap.
- if (WouldFitInRegister(DL, TableSize, ValueType)) {
- IntegerType *IT = cast<IntegerType>(ValueType);
- APInt TableInt(TableSize * IT->getBitWidth(), 0);
- for (uint64_t I = TableSize; I > 0; --I) {
- TableInt <<= IT->getBitWidth();
- // Insert values into the bitmap. Undef values are set to zero.
- if (!isa<UndefValue>(TableContents[I - 1])) {
- ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
- TableInt |= Val->getValue().zext(TableInt.getBitWidth());
- }
- }
- BitMap = ConstantInt::get(M.getContext(), TableInt);
- BitMapElementTy = IT;
- Kind = BitMapKind;
- ++NumBitMaps;
- return;
- }
-
- // Store the table in an array.
- ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
- Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
-
- Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true,
- GlobalVariable::PrivateLinkage, Initializer,
- "switch.table." + FuncName);
- Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- // Set the alignment to that of the array elements. We will only be loading one
- // value out of it.
- Array->setAlignment(Align(DL.getPrefTypeAlignment(ValueType)));
- Kind = ArrayKind;
-}
-
-Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
- switch (Kind) {
- case SingleValueKind:
- return SingleValue;
- case LinearMapKind: {
- // Derive the result value from the input value.
- Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
- false, "switch.idx.cast");
- if (!LinearMultiplier->isOne())
- Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
- if (!LinearOffset->isZero())
- Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
- return Result;
- }
- case BitMapKind: {
- // Type of the bitmap (e.g. i59).
- IntegerType *MapTy = BitMap->getType();
-
- // Cast Index to the same type as the bitmap.
- // Note: The Index is <= the number of elements in the table, so
- // truncating it to the width of the bitmask is safe.
- Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
-
- // Multiply the shift amount by the element width.
- ShiftAmt = Builder.CreateMul(
- ShiftAmt, ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
- "switch.shiftamt");
-
- // Shift down.
- Value *DownShifted =
- Builder.CreateLShr(BitMap, ShiftAmt, "switch.downshift");
- // Mask off.
- return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
- }
- case ArrayKind: {
- // Make sure the table index will not overflow when treated as signed.
- IntegerType *IT = cast<IntegerType>(Index->getType());
- uint64_t TableSize =
- Array->getInitializer()->getType()->getArrayNumElements();
- if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
- Index = Builder.CreateZExt(
- Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1),
- "switch.tableidx.zext");
-
- Value *GEPIndices[] = {Builder.getInt32(0), Index};
- Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array,
- GEPIndices, "switch.gep");
- return Builder.CreateLoad(
- cast<ArrayType>(Array->getValueType())->getElementType(), GEP,
- "switch.load");
- }
- }
- llvm_unreachable("Unknown lookup table kind!");
-}
-
-bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
- uint64_t TableSize,
- Type *ElementType) {
- auto *IT = dyn_cast<IntegerType>(ElementType);
- if (!IT)
- return false;
- // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
- // are <= 15, we could try to narrow the type.
-
- // Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
- if (TableSize >= UINT_MAX / IT->getBitWidth())
- return false;
- return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
-}
-
-/// Determine whether a lookup table should be built for this switch, based on
-/// the number of cases, size of the table, and the types of the results.
-static bool
-ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
- const TargetTransformInfo &TTI, const DataLayout &DL,
- const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
- if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
- return false; // TableSize overflowed, or mul below might overflow.
-
- bool AllTablesFitInRegister = true;
- bool HasIllegalType = false;
- for (const auto &I : ResultTypes) {
- Type *Ty = I.second;
-
- // Saturate this flag to true.
- HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
-
- // Saturate this flag to false.
- AllTablesFitInRegister =
- AllTablesFitInRegister &&
- SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty);
-
- // If both flags saturate, we're done. NOTE: This *only* works with
- // saturating flags, and all flags have to saturate first due to the
- // non-deterministic behavior of iterating over a dense map.
- if (HasIllegalType && !AllTablesFitInRegister)
- break;
- }
-
- // If each table would fit in a register, we should build it anyway.
- if (AllTablesFitInRegister)
- return true;
-
- // Don't build a table that doesn't fit in-register if it has illegal types.
- if (HasIllegalType)
- return false;
-
- // The table density should be at least 40%. This is the same criterion as for
- // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
- // FIXME: Find the best cut-off.
- return SI->getNumCases() * 10 >= TableSize * 4;
-}
-
-/// Try to reuse the switch table index compare. Following pattern:
-/// \code
-/// if (idx < tablesize)
-/// r = table[idx]; // table does not contain default_value
-/// else
-/// r = default_value;
-/// if (r != default_value)
-/// ...
-/// \endcode
-/// Is optimized to:
-/// \code
-/// cond = idx < tablesize;
-/// if (cond)
-/// r = table[idx];
-/// else
-/// r = default_value;
-/// if (cond)
-/// ...
-/// \endcode
-/// Jump threading will then eliminate the second if(cond).
-static void reuseTableCompare(
- User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch,
- Constant *DefaultValue,
- const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) {
- ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
- if (!CmpInst)
- return;
-
- // We require that the compare is in the same block as the phi so that jump
- // threading can do its work afterwards.
- if (CmpInst->getParent() != PhiBlock)
- return;
-
- Constant *CmpOp1 = dyn_cast<Constant>(CmpInst->getOperand(1));
- if (!CmpOp1)
- return;
-
- Value *RangeCmp = RangeCheckBranch->getCondition();
- Constant *TrueConst = ConstantInt::getTrue(RangeCmp->getType());
- Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType());
-
- // Check if the compare with the default value is constant true or false.
- Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
- DefaultValue, CmpOp1, true);
- if (DefaultConst != TrueConst && DefaultConst != FalseConst)
- return;
-
- // Check if the compare with the case values is distinct from the default
- // compare result.
- for (auto ValuePair : Values) {
- Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
- ValuePair.second, CmpOp1, true);
- if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst))
- return;
- assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
- "Expect true or false as compare result.");
- }
-
- // Check if the branch instruction dominates the phi node. It's a simple
- // dominance check, but sufficient for our needs.
- // Although this check is invariant in the calling loops, it's better to do it
- // at this late stage. Practically we do it at most once for a switch.
- BasicBlock *BranchBlock = RangeCheckBranch->getParent();
- for (auto PI = pred_begin(PhiBlock), E = pred_end(PhiBlock); PI != E; ++PI) {
- BasicBlock *Pred = *PI;
- if (Pred != BranchBlock && Pred->getUniquePredecessor() != BranchBlock)
- return;
- }
-
- if (DefaultConst == FalseConst) {
- // The compare yields the same result. We can replace it.
- CmpInst->replaceAllUsesWith(RangeCmp);
- ++NumTableCmpReuses;
- } else {
- // The compare yields the same result, just inverted. We can replace it.
- Value *InvertedTableCmp = BinaryOperator::CreateXor(
- RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
- RangeCheckBranch);
- CmpInst->replaceAllUsesWith(InvertedTableCmp);
- ++NumTableCmpReuses;
- }
-}
-
-/// If the switch is only used to initialize one or more phi nodes in a common
-/// successor block with different constant values, replace the switch with
-/// lookup tables.
-static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
+ return true;
+ }
+ // The switch couldn't be converted into a select.
+ return false;
+}
+
+namespace {
+
+/// This class represents a lookup table that can be used to replace a switch.
+class SwitchLookupTable {
+public:
+ /// Create a lookup table to use as a switch replacement with the contents
+ /// of Values, using DefaultValue to fill any holes in the table.
+ SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
+
+ /// Build instructions with Builder to retrieve the value at
+ /// the position given by Index in the lookup table.
+ Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
+
+ /// Return true if a table with TableSize elements of
+ /// type ElementType would fit in a target-legal register.
+ static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
+ Type *ElementType);
+
+private:
+ // Depending on the contents of the table, it can be represented in
+ // different ways.
+ enum {
+ // For tables where each element contains the same value, we just have to
+ // store that single value and return it for each lookup.
+ SingleValueKind,
+
+ // For tables where there is a linear relationship between table index
+ // and values. We calculate the result with a simple multiplication
+ // and addition instead of a table lookup.
+ LinearMapKind,
+
+ // For small tables with integer elements, we can pack them into a bitmap
+ // that fits into a target-legal register. Values are retrieved by
+ // shift and mask operations.
+ BitMapKind,
+
+ // The table is stored as an array of values. Values are retrieved by load
+ // instructions from the table.
+ ArrayKind
+ } Kind;
+
+ // For SingleValueKind, this is the single value.
+ Constant *SingleValue = nullptr;
+
+ // For BitMapKind, this is the bitmap.
+ ConstantInt *BitMap = nullptr;
+ IntegerType *BitMapElementTy = nullptr;
+
+ // For LinearMapKind, these are the constants used to derive the value.
+ ConstantInt *LinearOffset = nullptr;
+ ConstantInt *LinearMultiplier = nullptr;
+
+ // For ArrayKind, this is the array.
+ GlobalVariable *Array = nullptr;
+};
+
+} // end anonymous namespace
+
+SwitchLookupTable::SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) {
+ assert(Values.size() && "Can't build lookup table without values!");
+ assert(TableSize >= Values.size() && "Can't fit values in table!");
+
+ // If all values in the table are equal, this is that value.
+ SingleValue = Values.begin()->second;
+
+ Type *ValueType = Values.begin()->second->getType();
+
+ // Build up the table contents.
+ SmallVector<Constant *, 64> TableContents(TableSize);
+ for (size_t I = 0, E = Values.size(); I != E; ++I) {
+ ConstantInt *CaseVal = Values[I].first;
+ Constant *CaseRes = Values[I].second;
+ assert(CaseRes->getType() == ValueType);
+
+ uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
+ TableContents[Idx] = CaseRes;
+
+ if (CaseRes != SingleValue)
+ SingleValue = nullptr;
+ }
+
+ // Fill in any holes in the table with the default result.
+ if (Values.size() < TableSize) {
+ assert(DefaultValue &&
+ "Need a default value to fill the lookup table holes.");
+ assert(DefaultValue->getType() == ValueType);
+ for (uint64_t I = 0; I < TableSize; ++I) {
+ if (!TableContents[I])
+ TableContents[I] = DefaultValue;
+ }
+
+ if (DefaultValue != SingleValue)
+ SingleValue = nullptr;
+ }
+
+ // If each element in the table contains the same value, we only need to store
+ // that single value.
+ if (SingleValue) {
+ Kind = SingleValueKind;
+ return;
+ }
+
+ // Check if we can derive the value with a linear transformation from the
+ // table index.
+ if (isa<IntegerType>(ValueType)) {
+ bool LinearMappingPossible = true;
+ APInt PrevVal;
+ APInt DistToPrev;
+ assert(TableSize >= 2 && "Should be a SingleValue table.");
+ // Check if there is the same distance between two consecutive values.
+ for (uint64_t I = 0; I < TableSize; ++I) {
+ ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
+ if (!ConstVal) {
+ // This is an undef. We could deal with it, but undefs in lookup tables
+ // are very rare. It's probably not worth the additional complexity.
+ LinearMappingPossible = false;
+ break;
+ }
+ const APInt &Val = ConstVal->getValue();
+ if (I != 0) {
+ APInt Dist = Val - PrevVal;
+ if (I == 1) {
+ DistToPrev = Dist;
+ } else if (Dist != DistToPrev) {
+ LinearMappingPossible = false;
+ break;
+ }
+ }
+ PrevVal = Val;
+ }
+ if (LinearMappingPossible) {
+ LinearOffset = cast<ConstantInt>(TableContents[0]);
+ LinearMultiplier = ConstantInt::get(M.getContext(), DistToPrev);
+ Kind = LinearMapKind;
+ ++NumLinearMaps;
+ return;
+ }
+ }
+
+ // If the type is integer and the table fits in a register, build a bitmap.
+ if (WouldFitInRegister(DL, TableSize, ValueType)) {
+ IntegerType *IT = cast<IntegerType>(ValueType);
+ APInt TableInt(TableSize * IT->getBitWidth(), 0);
+ for (uint64_t I = TableSize; I > 0; --I) {
+ TableInt <<= IT->getBitWidth();
+ // Insert values into the bitmap. Undef values are set to zero.
+ if (!isa<UndefValue>(TableContents[I - 1])) {
+ ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
+ TableInt |= Val->getValue().zext(TableInt.getBitWidth());
+ }
+ }
+ BitMap = ConstantInt::get(M.getContext(), TableInt);
+ BitMapElementTy = IT;
+ Kind = BitMapKind;
+ ++NumBitMaps;
+ return;
+ }
+
+ // Store the table in an array.
+ ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
+ Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
+
+ Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true,
+ GlobalVariable::PrivateLinkage, Initializer,
+ "switch.table." + FuncName);
+ Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ // Set the alignment to that of the array elements. We will only be loading one
+ // value out of it.
+ Array->setAlignment(Align(DL.getPrefTypeAlignment(ValueType)));
+ Kind = ArrayKind;
+}
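The LinearMapKind detection above amounts to checking for a constant stride between consecutive table entries; a minimal sketch on plain 64-bit integers (assumes at least two entries and no holes):

#include <cstdint>
#include <vector>

static bool isLinearMap(const std::vector<int64_t> &Table, int64_t &Offset,
                        int64_t &Stride) {
  Offset = Table[0];
  Stride = Table[1] - Table[0];
  for (size_t I = 2; I < Table.size(); ++I)
    if (Table[I] - Table[I - 1] != Stride)
      return false;        // Distances differ; fall back to another table kind.
  return true;             // Table[I] == Offset + I * Stride for every I.
}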
+
+Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
+ switch (Kind) {
+ case SingleValueKind:
+ return SingleValue;
+ case LinearMapKind: {
+ // Derive the result value from the input value.
+ Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
+ false, "switch.idx.cast");
+ if (!LinearMultiplier->isOne())
+ Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
+ if (!LinearOffset->isZero())
+ Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
+ return Result;
+ }
+ case BitMapKind: {
+ // Type of the bitmap (e.g. i59).
+ IntegerType *MapTy = BitMap->getType();
+
+ // Cast Index to the same type as the bitmap.
+ // Note: The Index is <= the number of elements in the table, so
+ // truncating it to the width of the bitmask is safe.
+ Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
+
+ // Multiply the shift amount by the element width.
+ ShiftAmt = Builder.CreateMul(
+ ShiftAmt, ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
+ "switch.shiftamt");
+
+ // Shift down.
+ Value *DownShifted =
+ Builder.CreateLShr(BitMap, ShiftAmt, "switch.downshift");
+ // Mask off.
+ return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
+ }
+ case ArrayKind: {
+ // Make sure the table index will not overflow when treated as signed.
+ IntegerType *IT = cast<IntegerType>(Index->getType());
+ uint64_t TableSize =
+ Array->getInitializer()->getType()->getArrayNumElements();
+ if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
+ Index = Builder.CreateZExt(
+ Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1),
+ "switch.tableidx.zext");
+
+ Value *GEPIndices[] = {Builder.getInt32(0), Index};
+ Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array,
+ GEPIndices, "switch.gep");
+ return Builder.CreateLoad(
+ cast<ArrayType>(Array->getValueType())->getElementType(), GEP,
+ "switch.load");
+ }
+ }
+ llvm_unreachable("Unknown lookup table kind!");
+}
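For BitMapKind, the lookup above reduces to shift-and-mask arithmetic; a standalone sketch for element widths up to 64 bits (hypothetical helper, not the IRBuilder calls used above):

#include <cstdint>

static uint64_t bitmapLookup(uint64_t Map, uint64_t Index, unsigned ElemBits) {
  uint64_t Mask = ElemBits >= 64 ? ~0ULL : ((1ULL << ElemBits) - 1);
  return (Map >> (Index * ElemBits)) & Mask; // shift down, then mask off one element
}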
+
+bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
+ uint64_t TableSize,
+ Type *ElementType) {
+ auto *IT = dyn_cast<IntegerType>(ElementType);
+ if (!IT)
+ return false;
+ // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
+ // are <= 15, we could try to narrow the type.
+
+ // Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
+ if (TableSize >= UINT_MAX / IT->getBitWidth())
+ return false;
+ return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
+}
+
+/// Determine whether a lookup table should be built for this switch, based on
+/// the number of cases, size of the table, and the types of the results.
+static bool
+ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
+ const TargetTransformInfo &TTI, const DataLayout &DL,
+ const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
+ if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
+ return false; // TableSize overflowed, or mul below might overflow.
+
+ bool AllTablesFitInRegister = true;
+ bool HasIllegalType = false;
+ for (const auto &I : ResultTypes) {
+ Type *Ty = I.second;
+
+ // Saturate this flag to true.
+ HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
+
+ // Saturate this flag to false.
+ AllTablesFitInRegister =
+ AllTablesFitInRegister &&
+ SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty);
+
+ // If both flags saturate, we're done. NOTE: This *only* works with
+ // saturating flags, and all flags have to saturate first due to the
+ // non-deterministic behavior of iterating over a dense map.
+ if (HasIllegalType && !AllTablesFitInRegister)
+ break;
+ }
+
+ // If each table would fit in a register, we should build it anyway.
+ if (AllTablesFitInRegister)
+ return true;
+
+ // Don't build a table that doesn't fit in-register if it has illegal types.
+ if (HasIllegalType)
+ return false;
+
+ // The table density should be at least 40%. This is the same criterion as for
+ // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Find the best cut-off.
+ return SI->getNumCases() * 10 >= TableSize * 4;
+}
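The final check above is the integer form of requiring at least 40% table density; as a sketch:

#include <cstdint>

static bool denseEnough(uint64_t NumCases, uint64_t TableSize) {
  // Equivalent to NumCases / TableSize >= 0.4, kept in integer arithmetic.
  // The caller already bounds TableSize, so the multiplications cannot overflow.
  return NumCases * 10 >= TableSize * 4;
}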
+
+/// Try to reuse the switch table index compare. Following pattern:
+/// \code
+/// if (idx < tablesize)
+/// r = table[idx]; // table does not contain default_value
+/// else
+/// r = default_value;
+/// if (r != default_value)
+/// ...
+/// \endcode
+/// Is optimized to:
+/// \code
+/// cond = idx < tablesize;
+/// if (cond)
+/// r = table[idx];
+/// else
+/// r = default_value;
+/// if (cond)
+/// ...
+/// \endcode
+/// Jump threading will then eliminate the second if(cond).
+static void reuseTableCompare(
+ User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch,
+ Constant *DefaultValue,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) {
+ ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
+ if (!CmpInst)
+ return;
+
+ // We require that the compare is in the same block as the phi so that jump
+ // threading can do its work afterwards.
+ if (CmpInst->getParent() != PhiBlock)
+ return;
+
+ Constant *CmpOp1 = dyn_cast<Constant>(CmpInst->getOperand(1));
+ if (!CmpOp1)
+ return;
+
+ Value *RangeCmp = RangeCheckBranch->getCondition();
+ Constant *TrueConst = ConstantInt::getTrue(RangeCmp->getType());
+ Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType());
+
+ // Check if the compare with the default value is constant true or false.
+ Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+ DefaultValue, CmpOp1, true);
+ if (DefaultConst != TrueConst && DefaultConst != FalseConst)
+ return;
+
+ // Check if the compare with the case values is distinct from the default
+ // compare result.
+ for (auto ValuePair : Values) {
+ Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+ ValuePair.second, CmpOp1, true);
+ if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst))
+ return;
+ assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
+ "Expect true or false as compare result.");
+ }
+
+ // Check if the branch instruction dominates the phi node. It's a simple
+ // dominance check, but sufficient for our needs.
+ // Although this check is invariant in the calling loops, it's better to do it
+ // at this late stage. Practically we do it at most once for a switch.
+ BasicBlock *BranchBlock = RangeCheckBranch->getParent();
+ for (auto PI = pred_begin(PhiBlock), E = pred_end(PhiBlock); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (Pred != BranchBlock && Pred->getUniquePredecessor() != BranchBlock)
+ return;
+ }
+
+ if (DefaultConst == FalseConst) {
+ // The compare yields the same result. We can replace it.
+ CmpInst->replaceAllUsesWith(RangeCmp);
+ ++NumTableCmpReuses;
+ } else {
+ // The compare yields the same result, just inverted. We can replace it.
+ Value *InvertedTableCmp = BinaryOperator::CreateXor(
+ RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
+ RangeCheckBranch);
+ CmpInst->replaceAllUsesWith(InvertedTableCmp);
+ ++NumTableCmpReuses;
+ }
+}
+
+/// If the switch is only used to initialize one or more phi nodes in a common
+/// successor block with different constant values, replace the switch with
+/// lookup tables.
+static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- assert(SI->getNumCases() > 1 && "Degenerate switch?");
-
+ const TargetTransformInfo &TTI) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
BasicBlock *BB = SI->getParent();
Function *Fn = BB->getParent();
- // Only build lookup table when we have a target that supports it or the
- // attribute is not set.
- if (!TTI.shouldBuildLookupTables() ||
- (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true"))
- return false;
-
- // FIXME: If the switch is too sparse for a lookup table, perhaps we could
- // split off a dense part and build a lookup table for that.
-
- // FIXME: This creates arrays of GEPs to constant strings, which means each
- // GEP needs a runtime relocation in PIC code. We should just build one big
- // string and lookup indices into that.
-
- // Ignore switches with less than three cases. Lookup tables will not make
- // them faster, so we don't analyze them.
- if (SI->getNumCases() < 3)
- return false;
-
- // Figure out the corresponding result for each case value and phi node in the
- // common destination, as well as the min and max case values.
- assert(!SI->cases().empty());
- SwitchInst::CaseIt CI = SI->case_begin();
- ConstantInt *MinCaseVal = CI->getCaseValue();
- ConstantInt *MaxCaseVal = CI->getCaseValue();
-
- BasicBlock *CommonDest = nullptr;
-
- using ResultListTy = SmallVector<std::pair<ConstantInt *, Constant *>, 4>;
- SmallDenseMap<PHINode *, ResultListTy> ResultLists;
-
- SmallDenseMap<PHINode *, Constant *> DefaultResults;
- SmallDenseMap<PHINode *, Type *> ResultTypes;
- SmallVector<PHINode *, 4> PHIs;
-
- for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
- ConstantInt *CaseVal = CI->getCaseValue();
- if (CaseVal->getValue().slt(MinCaseVal->getValue()))
- MinCaseVal = CaseVal;
- if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
- MaxCaseVal = CaseVal;
-
- // Resulting value at phi nodes for this case value.
- using ResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
- ResultsTy Results;
- if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
- Results, DL, TTI))
- return false;
-
- // Append the result from this case to the list for each phi.
- for (const auto &I : Results) {
- PHINode *PHI = I.first;
- Constant *Value = I.second;
- if (!ResultLists.count(PHI))
- PHIs.push_back(PHI);
- ResultLists[PHI].push_back(std::make_pair(CaseVal, Value));
- }
- }
-
- // Keep track of the result types.
- for (PHINode *PHI : PHIs) {
- ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
- }
-
- uint64_t NumResults = ResultLists[PHIs[0]].size();
- APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
- uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
- bool TableHasHoles = (NumResults < TableSize);
-
- // If the table has holes, we need a constant result for the default case
- // or a bitmask that fits in a register.
- SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
- bool HasDefaultResults =
- GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
- DefaultResultsList, DL, TTI);
-
- bool NeedMask = (TableHasHoles && !HasDefaultResults);
- if (NeedMask) {
- // As an extra penalty for the validity test we require more cases.
- if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
- return false;
- if (!DL.fitsInLegalInteger(TableSize))
- return false;
- }
-
- for (const auto &I : DefaultResultsList) {
- PHINode *PHI = I.first;
- Constant *Result = I.second;
- DefaultResults[PHI] = Result;
- }
-
- if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes))
- return false;
-
+ // Only build lookup table when we have a target that supports it or the
+ // attribute is not set.
+ if (!TTI.shouldBuildLookupTables() ||
+ (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true"))
+ return false;
+
+ // FIXME: If the switch is too sparse for a lookup table, perhaps we could
+ // split off a dense part and build a lookup table for that.
+
+ // FIXME: This creates arrays of GEPs to constant strings, which means each
+ // GEP needs a runtime relocation in PIC code. We should just build one big
+ // string and lookup indices into that.
+
+ // Ignore switches with less than three cases. Lookup tables will not make
+ // them faster, so we don't analyze them.
+ if (SI->getNumCases() < 3)
+ return false;
+
+ // Figure out the corresponding result for each case value and phi node in the
+ // common destination, as well as the min and max case values.
+ assert(!SI->cases().empty());
+ SwitchInst::CaseIt CI = SI->case_begin();
+ ConstantInt *MinCaseVal = CI->getCaseValue();
+ ConstantInt *MaxCaseVal = CI->getCaseValue();
+
+ BasicBlock *CommonDest = nullptr;
+
+ using ResultListTy = SmallVector<std::pair<ConstantInt *, Constant *>, 4>;
+ SmallDenseMap<PHINode *, ResultListTy> ResultLists;
+
+ SmallDenseMap<PHINode *, Constant *> DefaultResults;
+ SmallDenseMap<PHINode *, Type *> ResultTypes;
+ SmallVector<PHINode *, 4> PHIs;
+
+ for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
+ ConstantInt *CaseVal = CI->getCaseValue();
+ if (CaseVal->getValue().slt(MinCaseVal->getValue()))
+ MinCaseVal = CaseVal;
+ if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
+ MaxCaseVal = CaseVal;
+
+ // Resulting value at phi nodes for this case value.
+ using ResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
+ ResultsTy Results;
+ if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
+ Results, DL, TTI))
+ return false;
+
+ // Append the result from this case to the list for each phi.
+ for (const auto &I : Results) {
+ PHINode *PHI = I.first;
+ Constant *Value = I.second;
+ if (!ResultLists.count(PHI))
+ PHIs.push_back(PHI);
+ ResultLists[PHI].push_back(std::make_pair(CaseVal, Value));
+ }
+ }
+
+ // Keep track of the result types.
+ for (PHINode *PHI : PHIs) {
+ ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
+ }
+
+ uint64_t NumResults = ResultLists[PHIs[0]].size();
+ APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
+ uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
+ bool TableHasHoles = (NumResults < TableSize);
+
+ // If the table has holes, we need a constant result for the default case
+ // or a bitmask that fits in a register.
+ SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
+ bool HasDefaultResults =
+ GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
+ DefaultResultsList, DL, TTI);
+
+ bool NeedMask = (TableHasHoles && !HasDefaultResults);
+ if (NeedMask) {
+ // As an extra penalty for the validity test we require more cases.
+ if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
+ return false;
+ if (!DL.fitsInLegalInteger(TableSize))
+ return false;
+ }
+
+ for (const auto &I : DefaultResultsList) {
+ PHINode *PHI = I.first;
+ Constant *Result = I.second;
+ DefaultResults[PHI] = Result;
+ }
+
+ if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes))
+ return false;
+
std::vector<DominatorTree::UpdateType> Updates;
- // Create the BB that does the lookups.
- Module &Mod = *CommonDest->getParent()->getParent();
- BasicBlock *LookupBB = BasicBlock::Create(
- Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
-
- // Compute the table index value.
- Builder.SetInsertPoint(SI);
- Value *TableIndex;
- if (MinCaseVal->isNullValue())
- TableIndex = SI->getCondition();
- else
- TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
- "switch.tableidx");
-
- // Compute the maximum table size representable by the integer type we are
- // switching upon.
- unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
- uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
- assert(MaxTableSize >= TableSize &&
- "It is impossible for a switch to have more entries than the max "
- "representable value of its input integer type's size.");
-
- // If the default destination is unreachable, or if the lookup table covers
- // all values of the conditional variable, branch directly to the lookup table
- // BB. Otherwise, check that the condition is within the case range.
- const bool DefaultIsReachable =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
- const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
- BranchInst *RangeCheckBranch = nullptr;
-
- if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
- Builder.CreateBr(LookupBB);
+ // Create the BB that does the lookups.
+ Module &Mod = *CommonDest->getParent()->getParent();
+ BasicBlock *LookupBB = BasicBlock::Create(
+ Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
+
+ // Compute the table index value.
+ Builder.SetInsertPoint(SI);
+ Value *TableIndex;
+ if (MinCaseVal->isNullValue())
+ TableIndex = SI->getCondition();
+ else
+ TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
+ "switch.tableidx");
+
+ // Compute the maximum table size representable by the integer type we are
+ // switching upon.
+ unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+ uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
+ assert(MaxTableSize >= TableSize &&
+ "It is impossible for a switch to have more entries than the max "
+ "representable value of its input integer type's size.");
+
+ // If the default destination is unreachable, or if the lookup table covers
+ // all values of the conditional variable, branch directly to the lookup table
+ // BB. Otherwise, check that the condition is within the case range.
+ const bool DefaultIsReachable =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+ const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
+ BranchInst *RangeCheckBranch = nullptr;
+
+ if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
+ Builder.CreateBr(LookupBB);
Updates.push_back({DominatorTree::Insert, BB, LookupBB});
- // Note: We call removePredecessor later since we need to be able to get the
- // PHI value for the default case in case we're using a bit mask.
- } else {
- Value *Cmp = Builder.CreateICmpULT(
- TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize));
- RangeCheckBranch =
- Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+ // Note: We call removePredecessor later since we need to be able to get the
+ // PHI value for the default case in case we're using a bit mask.
+ } else {
+ Value *Cmp = Builder.CreateICmpULT(
+ TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize));
+ RangeCheckBranch =
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
Updates.push_back({DominatorTree::Insert, BB, LookupBB});
- }
-
- // Populate the BB that does the lookups.
- Builder.SetInsertPoint(LookupBB);
-
- if (NeedMask) {
- // Before doing the lookup, we do the hole check. The LookupBB is therefore
- // re-purposed to do the hole check, and we create a new LookupBB.
- BasicBlock *MaskBB = LookupBB;
- MaskBB->setName("switch.hole_check");
- LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup",
- CommonDest->getParent(), CommonDest);
-
- // Make the mask's bitwidth at least 8-bit and a power-of-2 to avoid
- // unnecessary illegal types.
- uint64_t TableSizePowOf2 = NextPowerOf2(std::max(7ULL, TableSize - 1ULL));
- APInt MaskInt(TableSizePowOf2, 0);
- APInt One(TableSizePowOf2, 1);
- // Build bitmask; fill in a 1 bit for every case.
- const ResultListTy &ResultList = ResultLists[PHIs[0]];
- for (size_t I = 0, E = ResultList.size(); I != E; ++I) {
- uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue())
- .getLimitedValue();
- MaskInt |= One << Idx;
- }
- ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt);
-
- // Get the TableIndex'th bit of the bitmask.
- // If this bit is 0 (meaning hole) jump to the default destination,
- // else continue with table lookup.
- IntegerType *MapTy = TableMask->getType();
- Value *MaskIndex =
- Builder.CreateZExtOrTrunc(TableIndex, MapTy, "switch.maskindex");
- Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted");
- Value *LoBit = Builder.CreateTrunc(
- Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit");
- Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
+ }
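// Standalone sketch (plain C++, not the LLVM API) of the index and range-check
// logic above: rebase the condition onto a zero-based table index and decide
// whether the ULT guard is still needed. Helper names are illustrative only.
#include <cassert>
#include <cstdint>

struct IndexLowering {
  uint64_t TableIndex;  // condition rebased so that MinCaseVal maps to 0
  bool NeedsRangeCheck; // false when the table covers every possible input
};

static IndexLowering lowerSwitchIndex(uint64_t Cond, uint64_t MinCase,
                                      uint64_t TableSize, unsigned BitWidth,
                                      bool DefaultIsReachable) {
  uint64_t MaxTableSize = BitWidth > 63 ? UINT64_MAX : (1ULL << BitWidth);
  assert(MaxTableSize >= TableSize && "table larger than the input domain");
  bool Covered = (MaxTableSize == TableSize);
  uint64_t Index = Cond - MinCase; // mirrors CreateSub(Cond, MinCaseVal)
  // A conditional branch on (Index ult TableSize) is emitted only when the
  // default is reachable and the table does not cover the whole domain.
  return {Index, DefaultIsReachable && !Covered};
}

int main() {
  // Cases {5, 9, 13, 17} on an i32 condition: TableSize is 13, the table
  // cannot cover all 2^32 inputs, so the range check stays.
  IndexLowering L = lowerSwitchIndex(/*Cond=*/13, /*MinCase=*/5,
                                     /*TableSize=*/13, /*BitWidth=*/32,
                                     /*DefaultIsReachable=*/true);
  assert(L.TableIndex == 8 && L.NeedsRangeCheck);
  return 0;
}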
+
+ // Populate the BB that does the lookups.
+ Builder.SetInsertPoint(LookupBB);
+
+ if (NeedMask) {
+ // Before doing the lookup, we do the hole check. The LookupBB is therefore
+ // re-purposed to do the hole check, and we create a new LookupBB.
+ BasicBlock *MaskBB = LookupBB;
+ MaskBB->setName("switch.hole_check");
+ LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup",
+ CommonDest->getParent(), CommonDest);
+
+ // Make the mask's bitwidth at least 8-bit and a power-of-2 to avoid
+ // unnecessary illegal types.
+ uint64_t TableSizePowOf2 = NextPowerOf2(std::max(7ULL, TableSize - 1ULL));
+ APInt MaskInt(TableSizePowOf2, 0);
+ APInt One(TableSizePowOf2, 1);
+ // Build bitmask; fill in a 1 bit for every case.
+ const ResultListTy &ResultList = ResultLists[PHIs[0]];
+ for (size_t I = 0, E = ResultList.size(); I != E; ++I) {
+ uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue())
+ .getLimitedValue();
+ MaskInt |= One << Idx;
+ }
+ ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt);
+
+ // Get the TableIndex'th bit of the bitmask.
+ // If this bit is 0 (meaning hole) jump to the default destination,
+ // else continue with table lookup.
+ IntegerType *MapTy = TableMask->getType();
+ Value *MaskIndex =
+ Builder.CreateZExtOrTrunc(TableIndex, MapTy, "switch.maskindex");
+ Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted");
+ Value *LoBit = Builder.CreateTrunc(
+ Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit");
+ Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
Updates.push_back({DominatorTree::Insert, MaskBB, LookupBB});
Updates.push_back({DominatorTree::Insert, MaskBB, SI->getDefaultDest()});
- Builder.SetInsertPoint(LookupBB);
+ Builder.SetInsertPoint(LookupBB);
AddPredecessorToBlock(SI->getDefaultDest(), MaskBB, BB);
- }
-
- if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
- // We cached PHINodes in PHIs. To avoid accessing deleted PHINodes later,
- // do not delete PHINodes here.
+ }
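// Standalone sketch of the hole-check bitmask built above, using a plain
// uint64_t instead of an APInt sized to NextPowerOf2(max(7, TableSize - 1)),
// so it assumes TableSize <= 64. A set bit marks a real case; a cleared bit
// is a hole, and the lookup falls through to the default destination.
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t buildHoleMask(const std::vector<uint64_t> &CaseIndices) {
  uint64_t Mask = 0;
  for (uint64_t Idx : CaseIndices)
    Mask |= (uint64_t)1 << Idx; // fill in a 1 bit for every real case
  return Mask;
}

static bool tableHasEntry(uint64_t Mask, uint64_t TableIndex) {
  // Mirrors the LShr + trunc-to-i1 sequence above: shift the mask right by
  // the table index and test the low bit.
  return (Mask >> TableIndex) & 1;
}

int main() {
  uint64_t Mask = buildHoleMask({0, 2, 3}); // table of size 4 with one hole
  assert(Mask == 0b1101);
  assert(tableHasEntry(Mask, 2));  // real case: go on to the table lookup
  assert(!tableHasEntry(Mask, 1)); // hole: branch to the default destination
  return 0;
}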
+
+ if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
+ // We cached PHINodes in PHIs. To avoid accessing deleted PHINodes later,
+ // do not delete PHINodes here.
SI->getDefaultDest()->removePredecessor(BB,
- /*KeepOneInputPHIs=*/true);
+ /*KeepOneInputPHIs=*/true);
Updates.push_back({DominatorTree::Delete, BB, SI->getDefaultDest()});
- }
-
- bool ReturnedEarly = false;
- for (PHINode *PHI : PHIs) {
- const ResultListTy &ResultList = ResultLists[PHI];
-
- // If using a bitmask, use any value to fill the lookup table holes.
- Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
- StringRef FuncName = Fn->getName();
- SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL,
- FuncName);
-
- Value *Result = Table.BuildLookup(TableIndex, Builder);
-
- // If the result is used to return immediately from the function, we want to
- // do that right here.
- if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->user_begin()) &&
- PHI->user_back() == CommonDest->getFirstNonPHIOrDbg()) {
- Builder.CreateRet(Result);
- ReturnedEarly = true;
- break;
- }
-
- // Do a small peephole optimization: re-use the switch table compare if
- // possible.
- if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
- BasicBlock *PhiBlock = PHI->getParent();
- // Search for compare instructions which use the phi.
- for (auto *User : PHI->users()) {
- reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
- }
- }
-
- PHI->addIncoming(Result, LookupBB);
- }
-
+ }
+
+ bool ReturnedEarly = false;
+ for (PHINode *PHI : PHIs) {
+ const ResultListTy &ResultList = ResultLists[PHI];
+
+ // If using a bitmask, use any value to fill the lookup table holes.
+ Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
+ StringRef FuncName = Fn->getName();
+ SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL,
+ FuncName);
+
+ Value *Result = Table.BuildLookup(TableIndex, Builder);
+
+ // If the result is used to return immediately from the function, we want to
+ // do that right here.
+ if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->user_begin()) &&
+ PHI->user_back() == CommonDest->getFirstNonPHIOrDbg()) {
+ Builder.CreateRet(Result);
+ ReturnedEarly = true;
+ break;
+ }
+
+ // Do a small peephole optimization: re-use the switch table compare if
+ // possible.
+ if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
+ BasicBlock *PhiBlock = PHI->getParent();
+ // Search for compare instructions which use the phi.
+ for (auto *User : PHI->users()) {
+ reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
+ }
+ }
+
+ PHI->addIncoming(Result, LookupBB);
+ }
+
if (!ReturnedEarly) {
- Builder.CreateBr(CommonDest);
+ Builder.CreateBr(CommonDest);
Updates.push_back({DominatorTree::Insert, LookupBB, CommonDest});
}
-
- // Remove the switch.
+
+ // Remove the switch.
SmallSetVector<BasicBlock *, 8> RemovedSuccessors;
- for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
- BasicBlock *Succ = SI->getSuccessor(i);
-
- if (Succ == SI->getDefaultDest())
- continue;
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+
+ if (Succ == SI->getDefaultDest())
+ continue;
Succ->removePredecessor(BB);
RemovedSuccessors.insert(Succ);
- }
- SI->eraseFromParent();
-
+ }
+ SI->eraseFromParent();
+
if (DTU) {
for (BasicBlock *RemovedSuccessor : RemovedSuccessors)
Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
DTU->applyUpdates(Updates);
}
- ++NumLookupTables;
- if (NeedMask)
- ++NumLookupTablesHoles;
- return true;
-}
-
-static bool isSwitchDense(ArrayRef<int64_t> Values) {
- // See also SelectionDAGBuilder::isDense(), which this function was based on.
- uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front();
- uint64_t Range = Diff + 1;
- uint64_t NumCases = Values.size();
- // 40% is the default density for building a jump table in optsize/minsize mode.
- uint64_t MinDensity = 40;
-
- return NumCases * 100 >= Range * MinDensity;
-}
-
-/// Try to transform a switch that has "holes" in it to a contiguous sequence
-/// of cases.
-///
-/// A switch such as: switch(i) {case 5: case 9: case 13: case 17:} can be
-/// range-reduced to: switch ((i-5) / 4) {case 0: case 1: case 2: case 3:}.
-///
-/// This converts a sparse switch into a dense switch which allows better
-/// lowering and could also allow transforming into a lookup table.
-static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
- const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- auto *CondTy = cast<IntegerType>(SI->getCondition()->getType());
- if (CondTy->getIntegerBitWidth() > 64 ||
- !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
- return false;
- // Only bother with this optimization if there are more than 3 switch cases;
- // SDAG will only bother creating jump tables for 4 or more cases.
- if (SI->getNumCases() < 4)
- return false;
-
- // This transform is agnostic to the signedness of the input or case values. We
- // can treat the case values as signed or unsigned. We can optimize more common
- // cases such as a sequence crossing zero {-4,0,4,8} if we interpret case values
- // as signed.
- SmallVector<int64_t,4> Values;
- for (auto &C : SI->cases())
- Values.push_back(C.getCaseValue()->getValue().getSExtValue());
- llvm::sort(Values);
-
- // If the switch is already dense, there's nothing useful to do here.
- if (isSwitchDense(Values))
- return false;
-
- // First, transform the values such that they start at zero and ascend.
- int64_t Base = Values[0];
- for (auto &V : Values)
- V -= (uint64_t)(Base);
-
- // Now we have signed numbers that have been shifted so that, given enough
- // precision, there are no negative values. Since the rest of the transform
- // is bitwise only, we switch now to an unsigned representation.
-
- // This transform can be done speculatively because it is so cheap - it
- // results in a single rotate operation being inserted.
- // FIXME: It's possible that optimizing a switch on powers of two might also
- // be beneficial - flag values are often powers of two and we could use a CLZ
- // as the key function.
-
- // countTrailingZeros(0) returns 64. As Values is guaranteed to have more than
- // one element and LLVM disallows duplicate cases, Shift is guaranteed to be
- // less than 64.
- unsigned Shift = 64;
- for (auto &V : Values)
- Shift = std::min(Shift, countTrailingZeros((uint64_t)V));
- assert(Shift < 64);
- if (Shift > 0)
- for (auto &V : Values)
- V = (int64_t)((uint64_t)V >> Shift);
-
- if (!isSwitchDense(Values))
- // Transform didn't create a dense switch.
- return false;
-
- // The obvious transform is to shift the switch condition right and emit a
-  // check that the condition is actually cleanly divisible by the GCD, i.e.
-  // (C & ((1 << Shift) - 1)) == 0,
- // inserting a new CFG edge to handle the case where it didn't divide cleanly.
- //
- // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the
- // shift and puts the shifted-off bits in the uppermost bits. If any of these
- // are nonzero then the switch condition will be very large and will hit the
- // default case.
-
- auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
- Builder.SetInsertPoint(SI);
- auto *ShiftC = ConstantInt::get(Ty, Shift);
- auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
- auto *LShr = Builder.CreateLShr(Sub, ShiftC);
- auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
- auto *Rot = Builder.CreateOr(LShr, Shl);
- SI->replaceUsesOfWith(SI->getCondition(), Rot);
-
- for (auto Case : SI->cases()) {
- auto *Orig = Case.getCaseValue();
- auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
- Case.setValue(
- cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
- }
- return true;
-}
-
-bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
- BasicBlock *BB = SI->getParent();
-
- if (isValueEqualityComparison(SI)) {
- // If we only have one predecessor, and if it is a branch on this value,
- // see if that predecessor totally determines the outcome of this switch.
- if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
- if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
- return requestResimplify();
-
- Value *Cond = SI->getCondition();
- if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
- if (SimplifySwitchOnSelect(SI, Select))
- return requestResimplify();
-
- // If the block only contains the switch, see if we can fold the block
- // away into any preds.
- if (SI == &*BB->instructionsWithoutDebug().begin())
- if (FoldValueComparisonIntoPredecessors(SI, Builder))
- return requestResimplify();
- }
-
- // Try to transform the switch into an icmp and a branch.
- if (TurnSwitchRangeIntoICmp(SI, Builder))
- return requestResimplify();
-
- // Remove unreachable cases.
+ ++NumLookupTables;
+ if (NeedMask)
+ ++NumLookupTablesHoles;
+ return true;
+}
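// Roughly the shape SwitchToLookupTable gives the code, written as plain C++
// for illustration; the pass itself emits IR and SwitchLookupTable may pick a
// single-value, linear-map, bitmap or array representation for the table.
#include <cassert>

// Before: a switch whose cases all feed the same PHI with constant results.
static int beforeTransform(unsigned X) {
  switch (X) {
  case 2: return 10;
  case 3: return 30;
  case 4: return 20;
  case 5: return 40;
  default: return -1;
  }
}

// After: rebase onto a zero-based index, range-check it, and read the result
// from a constant table (the "switch.lookup" block above does the read).
static int afterTransform(unsigned X) {
  static const int SwitchTable[4] = {10, 30, 20, 40};
  unsigned TableIndex = X - 2; // MinCaseVal == 2
  if (TableIndex < 4)          // the ULT range check against TableSize
    return SwitchTable[TableIndex];
  return -1;                   // default result
}

int main() {
  for (unsigned X = 0; X < 10; ++X)
    assert(beforeTransform(X) == afterTransform(X));
  return 0;
}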
+
+static bool isSwitchDense(ArrayRef<int64_t> Values) {
+ // See also SelectionDAGBuilder::isDense(), which this function was based on.
+ uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front();
+ uint64_t Range = Diff + 1;
+ uint64_t NumCases = Values.size();
+ // 40% is the default density for building a jump table in optsize/minsize mode.
+ uint64_t MinDensity = 40;
+
+ return NumCases * 100 >= Range * MinDensity;
+}
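// Concrete numbers for the density test above: case values {5, 9, 13, 17}
// span a range of 13 but supply only 4 cases, and 4 * 100 < 13 * 40, so the
// switch counts as sparse; once ReduceSwitchRange (below) rewrites them to
// {0, 1, 2, 3} the range is 4 and the same test passes. Standalone sketch,
// not the LLVM helper itself.
#include <cassert>
#include <cstdint>
#include <vector>

static bool isDense(const std::vector<int64_t> &SortedValues) {
  uint64_t Range =
      (uint64_t)SortedValues.back() - (uint64_t)SortedValues.front() + 1;
  return SortedValues.size() * 100 >= Range * 40; // MinDensity == 40%
}

int main() {
  assert(!isDense({5, 9, 13, 17})); // sparse: 400 < 520
  assert(isDense({0, 1, 2, 3}));    // dense after range reduction: 400 >= 160
  return 0;
}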
+
+/// Try to transform a switch that has "holes" in it to a contiguous sequence
+/// of cases.
+///
+/// A switch such as: switch(i) {case 5: case 9: case 13: case 17:} can be
+/// range-reduced to: switch ((i-5) / 4) {case 0: case 1: case 2: case 3:}.
+///
+/// This converts a sparse switch into a dense switch which allows better
+/// lowering and could also allow transforming into a lookup table.
+static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ auto *CondTy = cast<IntegerType>(SI->getCondition()->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+ // Only bother with this optimization if there are more than 3 switch cases;
+ // SDAG will only bother creating jump tables for 4 or more cases.
+ if (SI->getNumCases() < 4)
+ return false;
+
+ // This transform is agnostic to the signedness of the input or case values. We
+ // can treat the case values as signed or unsigned. We can optimize more common
+ // cases such as a sequence crossing zero {-4,0,4,8} if we interpret case values
+ // as signed.
+ SmallVector<int64_t,4> Values;
+ for (auto &C : SI->cases())
+ Values.push_back(C.getCaseValue()->getValue().getSExtValue());
+ llvm::sort(Values);
+
+ // If the switch is already dense, there's nothing useful to do here.
+ if (isSwitchDense(Values))
+ return false;
+
+ // First, transform the values such that they start at zero and ascend.
+ int64_t Base = Values[0];
+ for (auto &V : Values)
+ V -= (uint64_t)(Base);
+
+ // Now we have signed numbers that have been shifted so that, given enough
+ // precision, there are no negative values. Since the rest of the transform
+ // is bitwise only, we switch now to an unsigned representation.
+
+ // This transform can be done speculatively because it is so cheap - it
+ // results in a single rotate operation being inserted.
+ // FIXME: It's possible that optimizing a switch on powers of two might also
+ // be beneficial - flag values are often powers of two and we could use a CLZ
+ // as the key function.
+
+ // countTrailingZeros(0) returns 64. As Values is guaranteed to have more than
+ // one element and LLVM disallows duplicate cases, Shift is guaranteed to be
+ // less than 64.
+ unsigned Shift = 64;
+ for (auto &V : Values)
+ Shift = std::min(Shift, countTrailingZeros((uint64_t)V));
+ assert(Shift < 64);
+ if (Shift > 0)
+ for (auto &V : Values)
+ V = (int64_t)((uint64_t)V >> Shift);
+
+ if (!isSwitchDense(Values))
+ // Transform didn't create a dense switch.
+ return false;
+
+ // The obvious transform is to shift the switch condition right and emit a
+  // check that the condition is actually cleanly divisible by the GCD, i.e.
+  // (C & ((1 << Shift) - 1)) == 0,
+ // inserting a new CFG edge to handle the case where it didn't divide cleanly.
+ //
+ // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the
+ // shift and puts the shifted-off bits in the uppermost bits. If any of these
+ // are nonzero then the switch condition will be very large and will hit the
+ // default case.
+
+ auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
+ Builder.SetInsertPoint(SI);
+ auto *ShiftC = ConstantInt::get(Ty, Shift);
+ auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
+ auto *LShr = Builder.CreateLShr(Sub, ShiftC);
+ auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
+ auto *Rot = Builder.CreateOr(LShr, Shl);
+ SI->replaceUsesOfWith(SI->getCondition(), Rot);
+
+ for (auto Case : SI->cases()) {
+ auto *Orig = Case.getCaseValue();
+ auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
+ Case.setValue(
+ cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
+ }
+ return true;
+}
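// Standalone sketch of the rotate trick above for the cases {5, 9, 13, 17}:
// Base = 5 and Shift = 2, so each case becomes (V - 5) >> 2 and the condition
// is rewritten as a rotate-right of (Cond - 5) by 2. Any input whose low two
// bits are not zero after subtracting Base rotates into a huge value, misses
// every rewritten case and lands in the default destination, which is why no
// explicit divisibility check is needed. Plain C++, fixed 32-bit width,
// assumes 1 <= Shift <= 31 so neither shift below is undefined.
#include <cassert>
#include <cstdint>

static uint32_t rotateCondition(uint32_t Cond, uint32_t Base, unsigned Shift) {
  uint32_t Sub = Cond - Base;
  // Same as CreateOr(CreateLShr(Sub, Shift), CreateShl(Sub, 32 - Shift)).
  return (Sub >> Shift) | (Sub << (32 - Shift));
}

int main() {
  // The original case values map onto the dense set {0, 1, 2, 3}.
  assert(rotateCondition(5, 5, 2) == 0);
  assert(rotateCondition(9, 5, 2) == 1);
  assert(rotateCondition(13, 5, 2) == 2);
  assert(rotateCondition(17, 5, 2) == 3);
  // A value off the 4-stride grid rotates its low bits into the top bits.
  assert(rotateCondition(6, 5, 2) > 3);
  return 0;
}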
+
+bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
+ BasicBlock *BB = SI->getParent();
+
+ if (isValueEqualityComparison(SI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
+ return requestResimplify();
+
+ Value *Cond = SI->getCondition();
+ if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
+ if (SimplifySwitchOnSelect(SI, Select))
+ return requestResimplify();
+
+ // If the block only contains the switch, see if we can fold the block
+ // away into any preds.
+ if (SI == &*BB->instructionsWithoutDebug().begin())
+ if (FoldValueComparisonIntoPredecessors(SI, Builder))
+ return requestResimplify();
+ }
+
+ // Try to transform the switch into an icmp and a branch.
+ if (TurnSwitchRangeIntoICmp(SI, Builder))
+ return requestResimplify();
+
+ // Remove unreachable cases.
if (eliminateDeadSwitchCases(SI, DTU, Options.AC, DL))
- return requestResimplify();
-
+ return requestResimplify();
+
if (switchToSelect(SI, Builder, DTU, DL, TTI))
- return requestResimplify();
-
- if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI))
- return requestResimplify();
-
- // The conversion from switch to lookup tables results in difficult-to-analyze
- // code and makes pruning branches much harder. This is a problem if the
- // switch expression itself can still be restricted as a result of inlining or
- // CVP. Therefore, only apply this transformation during late stages of the
- // optimisation pipeline.
- if (Options.ConvertSwitchToLookupTable &&
+ return requestResimplify();
+
+ if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI))
+ return requestResimplify();
+
+ // The conversion from switch to lookup tables results in difficult-to-analyze
+ // code and makes pruning branches much harder. This is a problem if the
+ // switch expression itself can still be restricted as a result of inlining or
+ // CVP. Therefore, only apply this transformation during late stages of the
+ // optimisation pipeline.
+ if (Options.ConvertSwitchToLookupTable &&
SwitchToLookupTable(SI, Builder, DTU, DL, TTI))
- return requestResimplify();
-
- if (ReduceSwitchRange(SI, Builder, DL, TTI))
- return requestResimplify();
-
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
- BasicBlock *BB = IBI->getParent();
- bool Changed = false;
-
- // Eliminate redundant destinations.
- SmallPtrSet<Value *, 8> Succs;
+ return requestResimplify();
+
+ if (ReduceSwitchRange(SI, Builder, DL, TTI))
+ return requestResimplify();
+
+ return false;
+}
+
+bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
+ BasicBlock *BB = IBI->getParent();
+ bool Changed = false;
+
+ // Eliminate redundant destinations.
+ SmallPtrSet<Value *, 8> Succs;
SmallSetVector<BasicBlock *, 8> RemovedSuccs;
- for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
- BasicBlock *Dest = IBI->getDestination(i);
- if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
+ for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+ BasicBlock *Dest = IBI->getDestination(i);
+ if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
if (!Dest->hasAddressTaken())
RemovedSuccs.insert(Dest);
- Dest->removePredecessor(BB);
- IBI->removeDestination(i);
- --i;
- --e;
- Changed = true;
- }
- }
-
+ Dest->removePredecessor(BB);
+ IBI->removeDestination(i);
+ --i;
+ --e;
+ Changed = true;
+ }
+ }
+
if (DTU) {
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve(RemovedSuccs.size());
@@ -6103,329 +6103,329 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
DTU->applyUpdates(Updates);
}
- if (IBI->getNumDestinations() == 0) {
- // If the indirectbr has no successors, change it to unreachable.
- new UnreachableInst(IBI->getContext(), IBI);
- EraseTerminatorAndDCECond(IBI);
- return true;
- }
-
- if (IBI->getNumDestinations() == 1) {
- // If the indirectbr has one successor, change it to a direct branch.
- BranchInst::Create(IBI->getDestination(0), IBI);
- EraseTerminatorAndDCECond(IBI);
- return true;
- }
-
- if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
- if (SimplifyIndirectBrOnSelect(IBI, SI))
- return requestResimplify();
- }
- return Changed;
-}
-
-/// Given a block with only a single landing pad and an unconditional branch,
-/// try to find another basic block which this one can be merged with. This
-/// handles cases where we have multiple invokes with unique landing pads, but
-/// a shared handler.
-///
-/// We specifically choose to not worry about merging non-empty blocks
-/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
-/// practice, the optimizer produces empty landing pad blocks quite frequently
-/// when dealing with exception dense code. (see: instcombine, gvn, if-else
-/// sinking in this file)
-///
-/// This is primarily a code size optimization. We need to avoid performing
-/// any transform which might inhibit optimization (such as our ability to
-/// specialize a particular handler via tail commoning). We do this by not
-/// merging any blocks which require us to introduce a phi. Since the same
-/// values are flowing through both blocks, we don't lose any ability to
-/// specialize. If anything, we make such specialization more likely.
-///
-/// TODO - This transformation could remove entries from a phi in the target
-/// block when the inputs in the phi are the same for the two blocks being
-/// merged. In some cases, this could result in removal of the PHI entirely.
-static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
+ if (IBI->getNumDestinations() == 0) {
+ // If the indirectbr has no successors, change it to unreachable.
+ new UnreachableInst(IBI->getContext(), IBI);
+ EraseTerminatorAndDCECond(IBI);
+ return true;
+ }
+
+ if (IBI->getNumDestinations() == 1) {
+ // If the indirectbr has one successor, change it to a direct branch.
+ BranchInst::Create(IBI->getDestination(0), IBI);
+ EraseTerminatorAndDCECond(IBI);
+ return true;
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
+ if (SimplifyIndirectBrOnSelect(IBI, SI))
+ return requestResimplify();
+ }
+ return Changed;
+}
+
+/// Given a block with only a single landing pad and an unconditional branch,
+/// try to find another basic block which this one can be merged with. This
+/// handles cases where we have multiple invokes with unique landing pads, but
+/// a shared handler.
+///
+/// We specifically choose to not worry about merging non-empty blocks
+/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
+/// practice, the optimizer produces empty landing pad blocks quite frequently
+/// when dealing with exception dense code. (see: instcombine, gvn, if-else
+/// sinking in this file)
+///
+/// This is primarily a code size optimization. We need to avoid performing
+/// any transform which might inhibit optimization (such as our ability to
+/// specialize a particular handler via tail commoning). We do this by not
+/// merging any blocks which require us to introduce a phi. Since the same
+/// values are flowing through both blocks, we don't lose any ability to
+/// specialize. If anything, we make such specialization more likely.
+///
+/// TODO - This transformation could remove entries from a phi in the target
+/// block when the inputs in the phi are the same for the two blocks being
+/// merged. In some cases, this could result in removal of the PHI entirely.
+static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
BasicBlock *BB, DomTreeUpdater *DTU) {
- auto Succ = BB->getUniqueSuccessor();
- assert(Succ);
- // If there's a phi in the successor block, we'd likely have to introduce
- // a phi into the merged landing pad block.
- if (isa<PHINode>(*Succ->begin()))
- return false;
-
- for (BasicBlock *OtherPred : predecessors(Succ)) {
- if (BB == OtherPred)
- continue;
- BasicBlock::iterator I = OtherPred->begin();
- LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
- if (!LPad2 || !LPad2->isIdenticalTo(LPad))
- continue;
- for (++I; isa<DbgInfoIntrinsic>(I); ++I)
- ;
- BranchInst *BI2 = dyn_cast<BranchInst>(I);
- if (!BI2 || !BI2->isIdenticalTo(BI))
- continue;
-
+ auto Succ = BB->getUniqueSuccessor();
+ assert(Succ);
+ // If there's a phi in the successor block, we'd likely have to introduce
+ // a phi into the merged landing pad block.
+ if (isa<PHINode>(*Succ->begin()))
+ return false;
+
+ for (BasicBlock *OtherPred : predecessors(Succ)) {
+ if (BB == OtherPred)
+ continue;
+ BasicBlock::iterator I = OtherPred->begin();
+ LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
+ if (!LPad2 || !LPad2->isIdenticalTo(LPad))
+ continue;
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+ ;
+ BranchInst *BI2 = dyn_cast<BranchInst>(I);
+ if (!BI2 || !BI2->isIdenticalTo(BI))
+ continue;
+
std::vector<DominatorTree::UpdateType> Updates;
- // We've found an identical block. Update our predecessors to take that
- // path instead and make ourselves dead.
- SmallPtrSet<BasicBlock *, 16> Preds;
- Preds.insert(pred_begin(BB), pred_end(BB));
- for (BasicBlock *Pred : Preds) {
- InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
- assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
- "unexpected successor");
- II->setUnwindDest(OtherPred);
+ // We've found an identical block. Update our predecessors to take that
+ // path instead and make ourselves dead.
+ SmallPtrSet<BasicBlock *, 16> Preds;
+ Preds.insert(pred_begin(BB), pred_end(BB));
+ for (BasicBlock *Pred : Preds) {
+ InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
+ assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
+ "unexpected successor");
+ II->setUnwindDest(OtherPred);
Updates.push_back({DominatorTree::Insert, Pred, OtherPred});
Updates.push_back({DominatorTree::Delete, Pred, BB});
- }
-
- // The debug info in OtherPred doesn't cover the merged control flow that
- // used to go through BB. We need to delete it or update it.
- for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) {
- Instruction &Inst = *I;
- I++;
- if (isa<DbgInfoIntrinsic>(Inst))
- Inst.eraseFromParent();
- }
-
- SmallPtrSet<BasicBlock *, 16> Succs;
- Succs.insert(succ_begin(BB), succ_end(BB));
- for (BasicBlock *Succ : Succs) {
- Succ->removePredecessor(BB);
+ }
+
+ // The debug info in OtherPred doesn't cover the merged control flow that
+ // used to go through BB. We need to delete it or update it.
+ for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) {
+ Instruction &Inst = *I;
+ I++;
+ if (isa<DbgInfoIntrinsic>(Inst))
+ Inst.eraseFromParent();
+ }
+
+ SmallPtrSet<BasicBlock *, 16> Succs;
+ Succs.insert(succ_begin(BB), succ_end(BB));
+ for (BasicBlock *Succ : Succs) {
+ Succ->removePredecessor(BB);
Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
-
- IRBuilder<> Builder(BI);
- Builder.CreateUnreachable();
- BI->eraseFromParent();
+ }
+
+ IRBuilder<> Builder(BI);
+ Builder.CreateUnreachable();
+ BI->eraseFromParent();
if (DTU)
DTU->applyUpdates(Updates);
- return true;
- }
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) {
- return Branch->isUnconditional() ? simplifyUncondBranch(Branch, Builder)
- : simplifyCondBranch(Branch, Builder);
-}
-
-bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
- IRBuilder<> &Builder) {
- BasicBlock *BB = BI->getParent();
- BasicBlock *Succ = BI->getSuccessor(0);
-
- // If the Terminator is the only non-phi instruction, simplify the block.
- // If LoopHeader is provided, check if the block or its successor is a loop
- // header. (This is for early invocations before loop simplify and
- // vectorization to keep canonical loop forms for nested loops. These blocks
- // can be eliminated when the pass is invoked later in the back-end.)
-  // Note that if BB has only one predecessor then we do not introduce a new
- // backedge, so we can eliminate BB.
- bool NeedCanonicalLoop =
- Options.NeedCanonicalLoop &&
+ return true;
+ }
+ return false;
+}
+
+bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) {
+ return Branch->isUnconditional() ? simplifyUncondBranch(Branch, Builder)
+ : simplifyCondBranch(Branch, Builder);
+}
+
+bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
+ IRBuilder<> &Builder) {
+ BasicBlock *BB = BI->getParent();
+ BasicBlock *Succ = BI->getSuccessor(0);
+
+ // If the Terminator is the only non-phi instruction, simplify the block.
+ // If LoopHeader is provided, check if the block or its successor is a loop
+ // header. (This is for early invocations before loop simplify and
+ // vectorization to keep canonical loop forms for nested loops. These blocks
+ // can be eliminated when the pass is invoked later in the back-end.)
+  // Note that if BB has only one predecessor then we do not introduce a new
+ // backedge, so we can eliminate BB.
+ bool NeedCanonicalLoop =
+ Options.NeedCanonicalLoop &&
(!LoopHeaders.empty() && BB->hasNPredecessorsOrMore(2) &&
(is_contained(LoopHeaders, BB) || is_contained(LoopHeaders, Succ)));
- BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
- if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
+ BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
+ if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
!NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB, DTU))
- return true;
-
- // If the only instruction in the block is a seteq/setne comparison against a
- // constant, try to simplify the block.
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
- if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
- for (++I; isa<DbgInfoIntrinsic>(I); ++I)
- ;
- if (I->isTerminator() &&
- tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder))
- return true;
- }
-
- // See if we can merge an empty landing pad block with another which is
- // equivalent.
- if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
- for (++I; isa<DbgInfoIntrinsic>(I); ++I)
- ;
+ return true;
+
+ // If the only instruction in the block is a seteq/setne comparison against a
+ // constant, try to simplify the block.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
+ if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+ ;
+ if (I->isTerminator() &&
+ tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder))
+ return true;
+ }
+
+ // See if we can merge an empty landing pad block with another which is
+ // equivalent.
+ if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+ ;
if (I->isTerminator() && TryToMergeLandingPad(LPad, BI, BB, DTU))
- return true;
- }
-
- // If this basic block is ONLY a compare and a branch, and if a predecessor
- // branches to us and our successor, fold the comparison into the
- // predecessor and use logical operations to update the incoming value
- // for PHI nodes in common successor.
+ return true;
+ }
+
+ // If this basic block is ONLY a compare and a branch, and if a predecessor
+ // branches to us and our successor, fold the comparison into the
+ // predecessor and use logical operations to update the incoming value
+ // for PHI nodes in common successor.
if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI,
Options.BonusInstThreshold))
- return requestResimplify();
- return false;
-}
-
-static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
- BasicBlock *PredPred = nullptr;
- for (auto *P : predecessors(BB)) {
- BasicBlock *PPred = P->getSinglePredecessor();
- if (!PPred || (PredPred && PredPred != PPred))
- return nullptr;
- PredPred = PPred;
- }
- return PredPred;
-}
-
-bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
- BasicBlock *BB = BI->getParent();
- if (!Options.SimplifyCondBranch)
- return false;
-
- // Conditional branch
- if (isValueEqualityComparison(BI)) {
- // If we only have one predecessor, and if it is a branch on this value,
- // see if that predecessor totally determines the outcome of this
- // switch.
- if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
- if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
- return requestResimplify();
-
- // This block must be empty, except for the setcond inst, if it exists.
- // Ignore dbg intrinsics.
- auto I = BB->instructionsWithoutDebug().begin();
- if (&*I == BI) {
- if (FoldValueComparisonIntoPredecessors(BI, Builder))
- return requestResimplify();
- } else if (&*I == cast<Instruction>(BI->getCondition())) {
- ++I;
- if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
- return requestResimplify();
- }
- }
-
- // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
- if (SimplifyBranchOnICmpChain(BI, Builder, DL))
- return true;
-
- // If this basic block has dominating predecessor blocks and the dominating
- // blocks' conditions imply BI's condition, we know the direction of BI.
- Optional<bool> Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL);
- if (Imp) {
- // Turn this into a branch on constant.
- auto *OldCond = BI->getCondition();
- ConstantInt *TorF = *Imp ? ConstantInt::getTrue(BB->getContext())
- : ConstantInt::getFalse(BB->getContext());
- BI->setCondition(TorF);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- return requestResimplify();
- }
-
- // If this basic block is ONLY a compare and a branch, and if a predecessor
- // branches to us and one of our successors, fold the comparison into the
- // predecessor and use logical operations to pick the right destination.
+ return requestResimplify();
+ return false;
+}
+
+static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
+ BasicBlock *PredPred = nullptr;
+ for (auto *P : predecessors(BB)) {
+ BasicBlock *PPred = P->getSinglePredecessor();
+ if (!PPred || (PredPred && PredPred != PPred))
+ return nullptr;
+ PredPred = PPred;
+ }
+ return PredPred;
+}
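// Toy model of the check above: every predecessor of BB must itself have
// exactly one predecessor, and all of those must be the same block, as in
// PredPred -> {Then, Else} -> BB, the diamond that mergeConditionalStores
// looks for. Uses a plain adjacency map instead of LLVM's CFG.
#include <cassert>
#include <map>
#include <string>
#include <vector>

using PredMap = std::map<std::string, std::vector<std::string>>;

static std::string commonGrandPredecessor(const PredMap &Preds,
                                          const std::string &BB) {
  std::string PredPred;
  for (const std::string &P : Preds.at(BB)) {
    const std::vector<std::string> &PP = Preds.at(P);
    if (PP.size() != 1 || (!PredPred.empty() && PredPred != PP[0]))
      return ""; // no single common source
    PredPred = PP[0];
  }
  return PredPred;
}

int main() {
  PredMap Preds = {{"entry", {}},
                   {"then", {"entry"}},
                   {"else", {"entry"}},
                   {"merge", {"then", "else"}}};
  assert(commonGrandPredecessor(Preds, "merge") == "entry");
  return 0;
}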
+
+bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
+ BasicBlock *BB = BI->getParent();
+ if (!Options.SimplifyCondBranch)
+ return false;
+
+ // Conditional branch
+ if (isValueEqualityComparison(BI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this
+ // switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
+ return requestResimplify();
+
+ // This block must be empty, except for the setcond inst, if it exists.
+ // Ignore dbg intrinsics.
+ auto I = BB->instructionsWithoutDebug().begin();
+ if (&*I == BI) {
+ if (FoldValueComparisonIntoPredecessors(BI, Builder))
+ return requestResimplify();
+ } else if (&*I == cast<Instruction>(BI->getCondition())) {
+ ++I;
+ if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
+ return requestResimplify();
+ }
+ }
+
+ // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
+ if (SimplifyBranchOnICmpChain(BI, Builder, DL))
+ return true;
+
+ // If this basic block has dominating predecessor blocks and the dominating
+ // blocks' conditions imply BI's condition, we know the direction of BI.
+ Optional<bool> Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL);
+ if (Imp) {
+ // Turn this into a branch on constant.
+ auto *OldCond = BI->getCondition();
+ ConstantInt *TorF = *Imp ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ BI->setCondition(TorF);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ return requestResimplify();
+ }
+
+ // If this basic block is ONLY a compare and a branch, and if a predecessor
+ // branches to us and one of our successors, fold the comparison into the
+ // predecessor and use logical operations to pick the right destination.
if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI,
Options.BonusInstThreshold))
- return requestResimplify();
-
- // We have a conditional branch to two blocks that are only reachable
- // from BI. We know that the condbr dominates the two blocks, so see if
- // there is any identical code in the "then" and "else" blocks. If so, we
- // can hoist it up to the branching block.
- if (BI->getSuccessor(0)->getSinglePredecessor()) {
- if (BI->getSuccessor(1)->getSinglePredecessor()) {
+ return requestResimplify();
+
+ // We have a conditional branch to two blocks that are only reachable
+ // from BI. We know that the condbr dominates the two blocks, so see if
+ // there is any identical code in the "then" and "else" blocks. If so, we
+ // can hoist it up to the branching block.
+ if (BI->getSuccessor(0)->getSinglePredecessor()) {
+ if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && Options.HoistCommonInsts)
if (HoistThenElseCodeToIf(BI, TTI))
return requestResimplify();
- } else {
- // If Successor #1 has multiple preds, we may be able to conditionally
- // execute Successor #0 if it branches to Successor #1.
- Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
- if (Succ0TI->getNumSuccessors() == 1 &&
- Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
- return requestResimplify();
- }
- } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
- // If Successor #0 has multiple preds, we may be able to conditionally
- // execute Successor #1 if it branches to Successor #0.
- Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
- if (Succ1TI->getNumSuccessors() == 1 &&
- Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
- return requestResimplify();
- }
-
- // If this is a branch on a phi node in the current block, thread control
- // through this block if any PHI node entries are constants.
- if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
- if (PN->getParent() == BI->getParent())
+ } else {
+ // If Successor #1 has multiple preds, we may be able to conditionally
+ // execute Successor #0 if it branches to Successor #1.
+ Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
+ if (Succ0TI->getNumSuccessors() == 1 &&
+ Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
+ return requestResimplify();
+ }
+ } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
+ // If Successor #0 has multiple preds, we may be able to conditionally
+ // execute Successor #1 if it branches to Successor #0.
+ Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
+ if (Succ1TI->getNumSuccessors() == 1 &&
+ Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
+ return requestResimplify();
+ }
+
+ // If this is a branch on a phi node in the current block, thread control
+ // through this block if any PHI node entries are constants.
+ if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
+ if (PN->getParent() == BI->getParent())
if (FoldCondBranchOnPHI(BI, DTU, DL, Options.AC))
- return requestResimplify();
-
- // Scan predecessor blocks for conditional branches.
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
- if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
- if (PBI != BI && PBI->isConditional())
+ return requestResimplify();
+
+ // Scan predecessor blocks for conditional branches.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+ if (PBI != BI && PBI->isConditional())
if (SimplifyCondBranchToCondBranch(PBI, BI, DTU, DL, TTI))
- return requestResimplify();
-
- // Look for diamond patterns.
- if (MergeCondStores)
- if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))
- if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))
- if (PBI != BI && PBI->isConditional())
+ return requestResimplify();
+
+ // Look for diamond patterns.
+ if (MergeCondStores)
+ if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))
+ if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))
+ if (PBI != BI && PBI->isConditional())
if (mergeConditionalStores(PBI, BI, DTU, DL, TTI))
- return requestResimplify();
-
- return false;
-}
-
-/// Check if passing a value to an instruction will cause undefined behavior.
+ return requestResimplify();
+
+ return false;
+}
+
+/// Check if passing a value to an instruction will cause undefined behavior.
static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified) {
- Constant *C = dyn_cast<Constant>(V);
- if (!C)
- return false;
-
- if (I->use_empty())
- return false;
-
- if (C->isNullValue() || isa<UndefValue>(C)) {
- // Only look at the first use, avoid hurting compile time with long uselists
- User *Use = *I->user_begin();
-
- // Now make sure that there are no instructions in between that can alter
- // control flow (eg. calls)
- for (BasicBlock::iterator
- i = ++BasicBlock::iterator(I),
- UI = BasicBlock::iterator(dyn_cast<Instruction>(Use));
- i != UI; ++i)
- if (i == I->getParent()->end() || i->mayHaveSideEffects())
- return false;
-
- // Look through GEPs. A load from a GEP derived from NULL is still undefined
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return false;
+
+ if (I->use_empty())
+ return false;
+
+ if (C->isNullValue() || isa<UndefValue>(C)) {
+ // Only look at the first use, avoid hurting compile time with long uselists
+ User *Use = *I->user_begin();
+
+ // Now make sure that there are no instructions in between that can alter
+ // control flow (eg. calls)
+ for (BasicBlock::iterator
+ i = ++BasicBlock::iterator(I),
+ UI = BasicBlock::iterator(dyn_cast<Instruction>(Use));
+ i != UI; ++i)
+ if (i == I->getParent()->end() || i->mayHaveSideEffects())
+ return false;
+
+ // Look through GEPs. A load from a GEP derived from NULL is still undefined
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
if (GEP->getPointerOperand() == I) {
if (!GEP->isInBounds() || !GEP->hasAllZeroIndices())
PtrValueMayBeModified = true;
return passingValueIsAlwaysUndefined(V, GEP, PtrValueMayBeModified);
}
-
- // Look through bitcasts.
- if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
+
+ // Look through bitcasts.
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
return passingValueIsAlwaysUndefined(V, BC, PtrValueMayBeModified);
-
- // Load from null is undefined.
- if (LoadInst *LI = dyn_cast<LoadInst>(Use))
- if (!LI->isVolatile())
- return !NullPointerIsDefined(LI->getFunction(),
- LI->getPointerAddressSpace());
-
- // Store to null is undefined.
- if (StoreInst *SI = dyn_cast<StoreInst>(Use))
- if (!SI->isVolatile())
- return (!NullPointerIsDefined(SI->getFunction(),
- SI->getPointerAddressSpace())) &&
- SI->getPointerOperand() == I;
-
+
+ // Load from null is undefined.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Use))
+ if (!LI->isVolatile())
+ return !NullPointerIsDefined(LI->getFunction(),
+ LI->getPointerAddressSpace());
+
+ // Store to null is undefined.
+ if (StoreInst *SI = dyn_cast<StoreInst>(Use))
+ if (!SI->isVolatile())
+ return (!NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace())) &&
+ SI->getPointerOperand() == I;
+
if (auto *CB = dyn_cast<CallBase>(Use)) {
if (C->isNullValue() && NullPointerIsDefined(CB->getFunction()))
return false;
@@ -6455,114 +6455,114 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
}
}
}
- }
- return false;
-}
-
-/// If BB has an incoming value that will always trigger undefined behavior
-/// (eg. null pointer dereference), remove the branch leading here.
+ }
+ return false;
+}
+
+/// If BB has an incoming value that will always trigger undefined behavior
+/// (eg. null pointer dereference), remove the branch leading here.
static bool removeUndefIntroducingPredecessor(BasicBlock *BB,
DomTreeUpdater *DTU) {
- for (PHINode &PHI : BB->phis())
- for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
- if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
+ for (PHINode &PHI : BB->phis())
+ for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
+ if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
BasicBlock *Predecessor = PHI.getIncomingBlock(i);
Instruction *T = Predecessor->getTerminator();
- IRBuilder<> Builder(T);
- if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ IRBuilder<> Builder(T);
+ if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
BB->removePredecessor(Predecessor);
- // Turn uncoditional branches into unreachables and remove the dead
- // destination from conditional branches.
- if (BI->isUnconditional())
- Builder.CreateUnreachable();
- else
- Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
- : BI->getSuccessor(0));
- BI->eraseFromParent();
+ // Turn uncoditional branches into unreachables and remove the dead
+ // destination from conditional branches.
+ if (BI->isUnconditional())
+ Builder.CreateUnreachable();
+ else
+ Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
+ : BI->getSuccessor(0));
+ BI->eraseFromParent();
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, Predecessor, BB}});
- return true;
- }
- // TODO: SwitchInst.
- }
-
- return false;
-}
-
+ return true;
+ }
+ // TODO: SwitchInst.
+ }
+
+ return false;
+}
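// The source-level shape these two functions target, roughly: a PHI (modelled
// here by the ternary) that receives a null value from one predecessor and is
// then unconditionally dereferenced. The branch coming from the null-producing
// predecessor can be deleted, since executing it would be undefined behavior.
// The null path below is never taken at run time; it only shows the pattern.
#include <cassert>

static int x = 42;

static int readThrough(bool cond) {
  int *p = cond ? &x : nullptr; // in IR this can appear as a PHI of {&x, null}
  return *p;                    // unconditional load: the null path is UB
}

int main() {
  assert(readThrough(true) == 42);
  return 0;
}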
+
bool SimplifyCFGOpt::simplifyOnceImpl(BasicBlock *BB) {
- bool Changed = false;
-
- assert(BB && BB->getParent() && "Block not embedded in function!");
- assert(BB->getTerminator() && "Degenerate basic block encountered!");
-
- // Remove basic blocks that have no predecessors (except the entry block)...
-  // or that just have themselves as a predecessor. These are unreachable.
- if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
- BB->getSinglePredecessor() == BB) {
- LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB);
+ bool Changed = false;
+
+ assert(BB && BB->getParent() && "Block not embedded in function!");
+ assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+ // Remove basic blocks that have no predecessors (except the entry block)...
+  // or that just have themselves as a predecessor. These are unreachable.
+ if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
+ BB->getSinglePredecessor() == BB) {
+ LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB);
DeleteDeadBlock(BB, DTU);
- return true;
- }
-
- // Check to see if we can constant propagate this terminator instruction
- // away...
+ return true;
+ }
+
+ // Check to see if we can constant propagate this terminator instruction
+ // away...
Changed |= ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true,
/*TLI=*/nullptr, DTU);
-
- // Check for and eliminate duplicate PHI nodes in this block.
- Changed |= EliminateDuplicatePHINodes(BB);
-
- // Check for and remove branches that will always cause undefined behavior.
+
+ // Check for and eliminate duplicate PHI nodes in this block.
+ Changed |= EliminateDuplicatePHINodes(BB);
+
+ // Check for and remove branches that will always cause undefined behavior.
Changed |= removeUndefIntroducingPredecessor(BB, DTU);
-
- // Merge basic blocks into their predecessor if there is only one distinct
- // pred, and if there is only one distinct successor of the predecessor, and
- // if there are no PHI nodes.
+
+ // Merge basic blocks into their predecessor if there is only one distinct
+ // pred, and if there is only one distinct successor of the predecessor, and
+ // if there are no PHI nodes.
if (MergeBlockIntoPredecessor(BB, DTU))
- return true;
-
- if (SinkCommon && Options.SinkCommonInsts)
+ return true;
+
+ if (SinkCommon && Options.SinkCommonInsts)
Changed |= SinkCommonCodeFromPredecessors(BB, DTU);
-
- IRBuilder<> Builder(BB);
-
- if (Options.FoldTwoEntryPHINode) {
- // If there is a trivial two-entry PHI node in this basic block, and we can
- // eliminate it, do so now.
- if (auto *PN = dyn_cast<PHINode>(BB->begin()))
- if (PN->getNumIncomingValues() == 2)
+
+ IRBuilder<> Builder(BB);
+
+ if (Options.FoldTwoEntryPHINode) {
+ // If there is a trivial two-entry PHI node in this basic block, and we can
+ // eliminate it, do so now.
+ if (auto *PN = dyn_cast<PHINode>(BB->begin()))
+ if (PN->getNumIncomingValues() == 2)
Changed |= FoldTwoEntryPHINode(PN, TTI, DTU, DL);
- }
-
- Instruction *Terminator = BB->getTerminator();
- Builder.SetInsertPoint(Terminator);
- switch (Terminator->getOpcode()) {
- case Instruction::Br:
- Changed |= simplifyBranch(cast<BranchInst>(Terminator), Builder);
- break;
- case Instruction::Ret:
- Changed |= simplifyReturn(cast<ReturnInst>(Terminator), Builder);
- break;
- case Instruction::Resume:
- Changed |= simplifyResume(cast<ResumeInst>(Terminator), Builder);
- break;
- case Instruction::CleanupRet:
- Changed |= simplifyCleanupReturn(cast<CleanupReturnInst>(Terminator));
- break;
- case Instruction::Switch:
- Changed |= simplifySwitch(cast<SwitchInst>(Terminator), Builder);
- break;
- case Instruction::Unreachable:
- Changed |= simplifyUnreachable(cast<UnreachableInst>(Terminator));
- break;
- case Instruction::IndirectBr:
- Changed |= simplifyIndirectBr(cast<IndirectBrInst>(Terminator));
- break;
- }
-
- return Changed;
-}
-
+ }
+
+ Instruction *Terminator = BB->getTerminator();
+ Builder.SetInsertPoint(Terminator);
+ switch (Terminator->getOpcode()) {
+ case Instruction::Br:
+ Changed |= simplifyBranch(cast<BranchInst>(Terminator), Builder);
+ break;
+ case Instruction::Ret:
+ Changed |= simplifyReturn(cast<ReturnInst>(Terminator), Builder);
+ break;
+ case Instruction::Resume:
+ Changed |= simplifyResume(cast<ResumeInst>(Terminator), Builder);
+ break;
+ case Instruction::CleanupRet:
+ Changed |= simplifyCleanupReturn(cast<CleanupReturnInst>(Terminator));
+ break;
+ case Instruction::Switch:
+ Changed |= simplifySwitch(cast<SwitchInst>(Terminator), Builder);
+ break;
+ case Instruction::Unreachable:
+ Changed |= simplifyUnreachable(cast<UnreachableInst>(Terminator));
+ break;
+ case Instruction::IndirectBr:
+ Changed |= simplifyIndirectBr(cast<IndirectBrInst>(Terminator));
+ break;
+ }
+
+ return Changed;
+}
+
bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
bool Changed = simplifyOnceImpl(BB);
@@ -6574,30 +6574,30 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
return Changed;
}
-bool SimplifyCFGOpt::run(BasicBlock *BB) {
+bool SimplifyCFGOpt::run(BasicBlock *BB) {
assert((!RequireAndPreserveDomTree ||
(DTU &&
DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full))) &&
"Original domtree is invalid?");
- bool Changed = false;
-
-  // Repeatedly simplify BB as long as resimplification is requested.
- do {
- Resimplify = false;
-
-    // Perform one round of simplification. The Resimplify flag will be set if
- // another iteration is requested.
- Changed |= simplifyOnce(BB);
- } while (Resimplify);
-
- return Changed;
-}
-
-bool llvm::simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
+ bool Changed = false;
+
+  // Repeatedly simplify BB as long as resimplification is requested.
+ do {
+ Resimplify = false;
+
+    // Perform one round of simplification. The Resimplify flag will be set if
+ // another iteration is requested.
+ Changed |= simplifyOnce(BB);
+ } while (Resimplify);
+
+ return Changed;
+}
+
+bool llvm::simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
DomTreeUpdater *DTU, const SimplifyCFGOptions &Options,
ArrayRef<WeakVH> LoopHeaders) {
return SimplifyCFGOpt(TTI, RequireAndPreserveDomTree ? DTU : nullptr,
BB->getModule()->getDataLayout(), LoopHeaders, Options)
- .run(BB);
-}
+ .run(BB);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp
index d81357a967..290c04a7ad 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1,973 +1,973 @@
-//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements induction variable simplification. It does
-// not define any actual pass or policy, but provides a single function to
-// simplify a loop's induction variables based on ScalarEvolution.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "indvars"
-
-STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
-STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
-STATISTIC(NumFoldedUser, "Number of IV users folded into a constant");
-STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
-STATISTIC(
- NumSimplifiedSDiv,
- "Number of IV signed division operations converted to unsigned division");
-STATISTIC(
- NumSimplifiedSRem,
- "Number of IV signed remainder operations converted to unsigned remainder");
-STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
-
-namespace {
- /// This is a utility for simplifying induction variables
- /// based on ScalarEvolution. It is the primary instrument of the
-  /// IndvarSimplify pass, but it may also be directly invoked to clean up after
- /// other loop passes that preserve SCEV.
- class SimplifyIndvar {
- Loop *L;
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- const TargetTransformInfo *TTI;
- SCEVExpander &Rewriter;
- SmallVectorImpl<WeakTrackingVH> &DeadInsts;
-
- bool Changed;
-
- public:
- SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, const TargetTransformInfo *TTI,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &Dead)
- : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter),
- DeadInsts(Dead), Changed(false) {
- assert(LI && "IV simplification requires LoopInfo");
- }
-
- bool hasChanged() const { return Changed; }
-
- /// Iteratively perform simplification on a worklist of users of the
- /// specified induction variable. This is the top-level driver that applies
- /// all simplifications to users of an IV.
- void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr);
-
- Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
-
- bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand);
- bool replaceIVUserWithLoopInvariant(Instruction *UseInst);
-
- bool eliminateOverflowIntrinsic(WithOverflowInst *WO);
- bool eliminateSaturatingIntrinsic(SaturatingInst *SI);
- bool eliminateTrunc(TruncInst *TI);
- bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
- bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand);
- void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
- void simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
- bool IsSigned);
- void replaceRemWithNumerator(BinaryOperator *Rem);
- void replaceRemWithNumeratorOrZero(BinaryOperator *Rem);
- void replaceSRemWithURem(BinaryOperator *Rem);
- bool eliminateSDiv(BinaryOperator *SDiv);
- bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
- bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand);
- };
-}
-
-/// Fold an IV operand into its use. This removes increments of an
-/// aligned IV when used by an instruction that ignores the low bits.
-///
-/// IVOperand is guaranteed SCEVable, but UseInst may not be.
-///
-/// Return the operand of IVOperand for this induction variable if IVOperand can
-/// be folded (in case more folding opportunities have been exposed).
-/// Otherwise return null.
-Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
- Value *IVSrc = nullptr;
- const unsigned OperIdx = 0;
- const SCEV *FoldedExpr = nullptr;
- bool MustDropExactFlag = false;
- switch (UseInst->getOpcode()) {
- default:
- return nullptr;
- case Instruction::UDiv:
- case Instruction::LShr:
- // We're only interested in the case where we know something about
- // the numerator and have a constant denominator.
- if (IVOperand != UseInst->getOperand(OperIdx) ||
- !isa<ConstantInt>(UseInst->getOperand(1)))
- return nullptr;
-
- // Attempt to fold a binary operator with constant operand.
- // e.g. ((I + 1) >> 2) => I >> 2
- if (!isa<BinaryOperator>(IVOperand)
- || !isa<ConstantInt>(IVOperand->getOperand(1)))
- return nullptr;
-
- IVSrc = IVOperand->getOperand(0);
- // IVSrc must be the (SCEVable) IV, since the other operand is const.
- assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand");
-
- ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1));
- if (UseInst->getOpcode() == Instruction::LShr) {
- // Get a constant for the divisor. See createSCEV.
- uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
- if (D->getValue().uge(BitWidth))
- return nullptr;
-
- D = ConstantInt::get(UseInst->getContext(),
- APInt::getOneBitSet(BitWidth, D->getZExtValue()));
- }
- FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
- // We might have 'exact' flag set at this point which will no longer be
- // correct after we make the replacement.
- if (UseInst->isExact() &&
- SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D)))
- MustDropExactFlag = true;
- }
- // We have something that might fold its operand. Compare SCEVs.
- if (!SE->isSCEVable(UseInst->getType()))
- return nullptr;
-
- // Bypass the operand if SCEV can prove it has no effect.
- if (SE->getSCEV(UseInst) != FoldedExpr)
- return nullptr;
-
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
- << " -> " << *UseInst << '\n');
-
- UseInst->setOperand(OperIdx, IVSrc);
- assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
-
- if (MustDropExactFlag)
- UseInst->dropPoisonGeneratingFlags();
-
- ++NumElimOperand;
- Changed = true;
- if (IVOperand->use_empty())
- DeadInsts.emplace_back(IVOperand);
- return IVSrc;
-}
-
-bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
- Value *IVOperand) {
- unsigned IVOperIdx = 0;
- ICmpInst::Predicate Pred = ICmp->getPredicate();
- if (IVOperand != ICmp->getOperand(0)) {
- // Swapped
- assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
- IVOperIdx = 1;
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- // Get the SCEVs for the ICmp operands (in the specific context of the
- // current loop)
- const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
- const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
- const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
-
- auto *PN = dyn_cast<PHINode>(IVOperand);
- if (!PN)
- return false;
+//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements induction variable simplification. It does
+// not define any actual pass or policy, but provides a single function to
+// simplify a loop's induction variables based on ScalarEvolution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
+STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
+STATISTIC(NumFoldedUser, "Number of IV users folded into a constant");
+STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(
+ NumSimplifiedSDiv,
+ "Number of IV signed division operations converted to unsigned division");
+STATISTIC(
+ NumSimplifiedSRem,
+ "Number of IV signed remainder operations converted to unsigned remainder");
+STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
+
+namespace {
+ /// This is a utility for simplifying induction variables
+ /// based on ScalarEvolution. It is the primary instrument of the
+ /// IndvarSimplify pass, but it may also be directly invoked to clean up after
+ /// other loop passes that preserve SCEV.
+ class SimplifyIndvar {
+ Loop *L;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const TargetTransformInfo *TTI;
+ SCEVExpander &Rewriter;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
+
+ bool Changed;
+
+ public:
+ SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &Dead)
+ : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter),
+ DeadInsts(Dead), Changed(false) {
+ assert(LI && "IV simplification requires LoopInfo");
+ }
+
+ bool hasChanged() const { return Changed; }
+
+ /// Iteratively perform simplification on a worklist of users of the
+ /// specified induction variable. This is the top-level driver that applies
+ /// all simplifications to users of an IV.
+ void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr);
+
+ Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
+
+ bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand);
+ bool replaceIVUserWithLoopInvariant(Instruction *UseInst);
+
+ bool eliminateOverflowIntrinsic(WithOverflowInst *WO);
+ bool eliminateSaturatingIntrinsic(SaturatingInst *SI);
+ bool eliminateTrunc(TruncInst *TI);
+ bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
+ bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand);
+ void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
+ void simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
+ bool IsSigned);
+ void replaceRemWithNumerator(BinaryOperator *Rem);
+ void replaceRemWithNumeratorOrZero(BinaryOperator *Rem);
+ void replaceSRemWithURem(BinaryOperator *Rem);
+ bool eliminateSDiv(BinaryOperator *SDiv);
+ bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
+ bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand);
+ };
+}
+
+/// Fold an IV operand into its use. This removes increments of an
+/// aligned IV when used by an instruction that ignores the low bits.
+///
+/// IVOperand is guaranteed SCEVable, but UseInst may not be.
+///
+/// Return the operand of IVOperand for this induction variable if IVOperand can
+/// be folded (in case more folding opportunities have been exposed).
+/// Otherwise return null.
+Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
+ Value *IVSrc = nullptr;
+ const unsigned OperIdx = 0;
+ const SCEV *FoldedExpr = nullptr;
+ bool MustDropExactFlag = false;
+ switch (UseInst->getOpcode()) {
+ default:
+ return nullptr;
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ // We're only interested in the case where we know something about
+ // the numerator and have a constant denominator.
+ if (IVOperand != UseInst->getOperand(OperIdx) ||
+ !isa<ConstantInt>(UseInst->getOperand(1)))
+ return nullptr;
+
+ // Attempt to fold a binary operator with constant operand.
+ // e.g. ((I + 1) >> 2) => I >> 2
+ if (!isa<BinaryOperator>(IVOperand)
+ || !isa<ConstantInt>(IVOperand->getOperand(1)))
+ return nullptr;
+
+ IVSrc = IVOperand->getOperand(0);
+ // IVSrc must be the (SCEVable) IV, since the other operand is const.
+ assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand");
+
+ ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1));
+ if (UseInst->getOpcode() == Instruction::LShr) {
+ // Get a constant for the divisor. See createSCEV.
+ uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
+ if (D->getValue().uge(BitWidth))
+ return nullptr;
+
+ D = ConstantInt::get(UseInst->getContext(),
+ APInt::getOneBitSet(BitWidth, D->getZExtValue()));
+ }
+ FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+ // We might have 'exact' flag set at this point which will no longer be
+ // correct after we make the replacement.
+ if (UseInst->isExact() &&
+ SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D)))
+ MustDropExactFlag = true;
+ }
+ // We have something that might fold its operand. Compare SCEVs.
+ if (!SE->isSCEVable(UseInst->getType()))
+ return nullptr;
+
+ // Bypass the operand if SCEV can prove it has no effect.
+ if (SE->getSCEV(UseInst) != FoldedExpr)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
+ << " -> " << *UseInst << '\n');
+
+ UseInst->setOperand(OperIdx, IVSrc);
+ assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
+
+ if (MustDropExactFlag)
+ UseInst->dropPoisonGeneratingFlags();
+
+ ++NumElimOperand;
+ Changed = true;
+ if (IVOperand->use_empty())
+ DeadInsts.emplace_back(IVOperand);
+ return IVSrc;
+}
+
+bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
+ Value *IVOperand) {
+ unsigned IVOperIdx = 0;
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ if (IVOperand != ICmp->getOperand(0)) {
+ // Swapped
+ assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+ IVOperIdx = 1;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ // Get the SCEVs for the ICmp operands (in the specific context of the
+ // current loop)
+ const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+ const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
+ const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
+
+ auto *PN = dyn_cast<PHINode>(IVOperand);
+ if (!PN)
+ return false;
auto LIP = SE->getLoopInvariantPredicate(Pred, S, X, L);
if (!LIP)
- return false;
+ return false;
ICmpInst::Predicate InvariantPredicate = LIP->Pred;
const SCEV *InvariantLHS = LIP->LHS;
const SCEV *InvariantRHS = LIP->RHS;
-
- // Rewrite the comparison to a loop invariant comparison if it can be done
- // cheaply, where cheaply means "we don't need to emit any new
- // instructions".
-
- SmallDenseMap<const SCEV*, Value*> CheapExpansions;
- CheapExpansions[S] = ICmp->getOperand(IVOperIdx);
- CheapExpansions[X] = ICmp->getOperand(1 - IVOperIdx);
-
- // TODO: Support multiple entry loops? (We currently bail out of these in
- // the IndVarSimplify pass)
- if (auto *BB = L->getLoopPredecessor()) {
- const int Idx = PN->getBasicBlockIndex(BB);
- if (Idx >= 0) {
- Value *Incoming = PN->getIncomingValue(Idx);
- const SCEV *IncomingS = SE->getSCEV(Incoming);
- CheapExpansions[IncomingS] = Incoming;
- }
- }
- Value *NewLHS = CheapExpansions[InvariantLHS];
- Value *NewRHS = CheapExpansions[InvariantRHS];
-
- if (!NewLHS)
- if (auto *ConstLHS = dyn_cast<SCEVConstant>(InvariantLHS))
- NewLHS = ConstLHS->getValue();
- if (!NewRHS)
- if (auto *ConstRHS = dyn_cast<SCEVConstant>(InvariantRHS))
- NewRHS = ConstRHS->getValue();
-
- if (!NewLHS || !NewRHS)
- // We could not find an existing value to replace either LHS or RHS.
- // Generating new instructions has subtler tradeoffs, so avoid doing that
- // for now.
- return false;
-
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
- ICmp->setPredicate(InvariantPredicate);
- ICmp->setOperand(0, NewLHS);
- ICmp->setOperand(1, NewRHS);
- return true;
-}
-
-/// SimplifyIVUsers helper for eliminating useless
-/// comparisons against an induction variable.
-void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
- unsigned IVOperIdx = 0;
- ICmpInst::Predicate Pred = ICmp->getPredicate();
- ICmpInst::Predicate OriginalPred = Pred;
- if (IVOperand != ICmp->getOperand(0)) {
- // Swapped
- assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
- IVOperIdx = 1;
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- // Get the SCEVs for the ICmp operands (in the specific context of the
- // current loop)
- const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
- const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
- const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
-
- // If the condition is always true or always false, replace it with
- // a constant value.
- if (SE->isKnownPredicate(Pred, S, X)) {
- ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
- DeadInsts.emplace_back(ICmp);
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
- } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {
- ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
- DeadInsts.emplace_back(ICmp);
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
- } else if (makeIVComparisonInvariant(ICmp, IVOperand)) {
- // fallthrough to end of function
- } else if (ICmpInst::isSigned(OriginalPred) &&
- SE->isKnownNonNegative(S) && SE->isKnownNonNegative(X)) {
- // If we were unable to do anything above, all we can do is canonicalize
- // the comparison, hoping that this will open the door for other
- // optimizations. If we find that we are comparing two non-negative values,
- // we change the instruction's predicate to its unsigned version. Note that
- // we cannot rely on Pred here unless we check whether we have swapped it.
- assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
- LLVM_DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp
- << '\n');
- ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
- } else
- return;
-
- ++NumElimCmp;
- Changed = true;
-}
-
-bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
- // Get the SCEVs for the SDiv operands.
- auto *N = SE->getSCEV(SDiv->getOperand(0));
- auto *D = SE->getSCEV(SDiv->getOperand(1));
-
- // Simplify unnecessary loops away.
- const Loop *L = LI->getLoopFor(SDiv->getParent());
- N = SE->getSCEVAtScope(N, L);
- D = SE->getSCEVAtScope(D, L);
-
- // Replace sdiv by udiv if both of the operands are non-negative
- if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
- auto *UDiv = BinaryOperator::Create(
- BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
- SDiv->getName() + ".udiv", SDiv);
- UDiv->setIsExact(SDiv->isExact());
- SDiv->replaceAllUsesWith(UDiv);
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
- ++NumSimplifiedSDiv;
- Changed = true;
- DeadInsts.push_back(SDiv);
- return true;
- }
-
- return false;
-}
-
-// i %s n -> i %u n if i >= 0 and n >= 0
-void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
- auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
- auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
- Rem->getName() + ".urem", Rem);
- Rem->replaceAllUsesWith(URem);
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
- ++NumSimplifiedSRem;
- Changed = true;
- DeadInsts.emplace_back(Rem);
-}
-
-// i % n --> i if i is in [0,n).
-void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
- Rem->replaceAllUsesWith(Rem->getOperand(0));
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
- ++NumElimRem;
- Changed = true;
- DeadInsts.emplace_back(Rem);
-}
-
-// (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n).
-void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
- auto *T = Rem->getType();
- auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
- ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, N, D);
- SelectInst *Sel =
- SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
- Rem->replaceAllUsesWith(Sel);
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
- ++NumElimRem;
- Changed = true;
- DeadInsts.emplace_back(Rem);
-}
-
-/// SimplifyIVUsers helper for eliminating useless remainder operations
-/// operating on an induction variable or replacing srem by urem.
-void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
- bool IsSigned) {
- auto *NValue = Rem->getOperand(0);
- auto *DValue = Rem->getOperand(1);
- // We're only interested in the case where we know something about
- // the numerator, unless it is a srem, because we want to replace srem by urem
- // in general.
- bool UsedAsNumerator = IVOperand == NValue;
- if (!UsedAsNumerator && !IsSigned)
- return;
-
- const SCEV *N = SE->getSCEV(NValue);
-
- // Simplify unnecessary loops away.
- const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
- N = SE->getSCEVAtScope(N, ICmpLoop);
-
- bool IsNumeratorNonNegative = !IsSigned || SE->isKnownNonNegative(N);
-
- // Do not proceed if the Numerator may be negative
- if (!IsNumeratorNonNegative)
- return;
-
- const SCEV *D = SE->getSCEV(DValue);
- D = SE->getSCEVAtScope(D, ICmpLoop);
-
- if (UsedAsNumerator) {
- auto LT = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
- if (SE->isKnownPredicate(LT, N, D)) {
- replaceRemWithNumerator(Rem);
- return;
- }
-
- auto *T = Rem->getType();
- const auto *NLessOne = SE->getMinusSCEV(N, SE->getOne(T));
- if (SE->isKnownPredicate(LT, NLessOne, D)) {
- replaceRemWithNumeratorOrZero(Rem);
- return;
- }
- }
-
- // Try to replace SRem with URem, if both N and D are known non-negative.
- // Since we had already check N, we only need to check D now
- // Since we have already checked N, we only need to check D now.
- return;
-
- replaceSRemWithURem(Rem);
-}
-
-static bool willNotOverflow(ScalarEvolution *SE, Instruction::BinaryOps BinOp,
- bool Signed, const SCEV *LHS, const SCEV *RHS) {
- const SCEV *(ScalarEvolution::*Operation)(const SCEV *, const SCEV *,
- SCEV::NoWrapFlags, unsigned);
- switch (BinOp) {
- default:
- llvm_unreachable("Unsupported binary op");
- case Instruction::Add:
- Operation = &ScalarEvolution::getAddExpr;
- break;
- case Instruction::Sub:
- Operation = &ScalarEvolution::getMinusSCEV;
- break;
- case Instruction::Mul:
- Operation = &ScalarEvolution::getMulExpr;
- break;
- }
-
- const SCEV *(ScalarEvolution::*Extension)(const SCEV *, Type *, unsigned) =
- Signed ? &ScalarEvolution::getSignExtendExpr
- : &ScalarEvolution::getZeroExtendExpr;
-
- // Check ext(LHS op RHS) == ext(LHS) op ext(RHS)
- auto *NarrowTy = cast<IntegerType>(LHS->getType());
- auto *WideTy =
- IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
-
- const SCEV *A =
- (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0),
- WideTy, 0);
- const SCEV *B =
- (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0),
- (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0);
- return A == B;
-}
-
-bool SimplifyIndvar::eliminateOverflowIntrinsic(WithOverflowInst *WO) {
- const SCEV *LHS = SE->getSCEV(WO->getLHS());
- const SCEV *RHS = SE->getSCEV(WO->getRHS());
- if (!willNotOverflow(SE, WO->getBinaryOp(), WO->isSigned(), LHS, RHS))
- return false;
-
- // Proved no overflow, nuke the overflow check and, if possible, the overflow
- // intrinsic as well.
-
- BinaryOperator *NewResult = BinaryOperator::Create(
- WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO);
-
- if (WO->isSigned())
- NewResult->setHasNoSignedWrap(true);
- else
- NewResult->setHasNoUnsignedWrap(true);
-
- SmallVector<ExtractValueInst *, 4> ToDelete;
-
- for (auto *U : WO->users()) {
- if (auto *EVI = dyn_cast<ExtractValueInst>(U)) {
- if (EVI->getIndices()[0] == 1)
- EVI->replaceAllUsesWith(ConstantInt::getFalse(WO->getContext()));
- else {
- assert(EVI->getIndices()[0] == 0 && "Only two possibilities!");
- EVI->replaceAllUsesWith(NewResult);
- }
- ToDelete.push_back(EVI);
- }
- }
-
- for (auto *EVI : ToDelete)
- EVI->eraseFromParent();
-
- if (WO->use_empty())
- WO->eraseFromParent();
-
+
+ // Rewrite the comparison to a loop invariant comparison if it can be done
+ // cheaply, where cheaply means "we don't need to emit any new
+ // instructions".
+
+ SmallDenseMap<const SCEV*, Value*> CheapExpansions;
+ CheapExpansions[S] = ICmp->getOperand(IVOperIdx);
+ CheapExpansions[X] = ICmp->getOperand(1 - IVOperIdx);
+
+ // TODO: Support multiple entry loops? (We currently bail out of these in
+ // the IndVarSimplify pass)
+ if (auto *BB = L->getLoopPredecessor()) {
+ const int Idx = PN->getBasicBlockIndex(BB);
+ if (Idx >= 0) {
+ Value *Incoming = PN->getIncomingValue(Idx);
+ const SCEV *IncomingS = SE->getSCEV(Incoming);
+ CheapExpansions[IncomingS] = Incoming;
+ }
+ }
+ Value *NewLHS = CheapExpansions[InvariantLHS];
+ Value *NewRHS = CheapExpansions[InvariantRHS];
+
+ if (!NewLHS)
+ if (auto *ConstLHS = dyn_cast<SCEVConstant>(InvariantLHS))
+ NewLHS = ConstLHS->getValue();
+ if (!NewRHS)
+ if (auto *ConstRHS = dyn_cast<SCEVConstant>(InvariantRHS))
+ NewRHS = ConstRHS->getValue();
+
+ if (!NewLHS || !NewRHS)
+ // We could not find an existing value to replace either LHS or RHS.
+ // Generating new instructions has subtler tradeoffs, so avoid doing that
+ // for now.
+ return false;
+
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
+ ICmp->setPredicate(InvariantPredicate);
+ ICmp->setOperand(0, NewLHS);
+ ICmp->setOperand(1, NewRHS);
+ return true;
+}
+
+/// SimplifyIVUsers helper for eliminating useless
+/// comparisons against an induction variable.
+void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
+ unsigned IVOperIdx = 0;
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ ICmpInst::Predicate OriginalPred = Pred;
+ if (IVOperand != ICmp->getOperand(0)) {
+ // Swapped
+ assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+ IVOperIdx = 1;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ // Get the SCEVs for the ICmp operands (in the specific context of the
+ // current loop)
+ const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+ const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
+ const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
+
+ // If the condition is always true or always false, replace it with
+ // a constant value.
+ if (SE->isKnownPredicate(Pred, S, X)) {
+ ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
+ DeadInsts.emplace_back(ICmp);
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {
+ ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
+ DeadInsts.emplace_back(ICmp);
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ } else if (makeIVComparisonInvariant(ICmp, IVOperand)) {
+ // fallthrough to end of function
+ } else if (ICmpInst::isSigned(OriginalPred) &&
+ SE->isKnownNonNegative(S) && SE->isKnownNonNegative(X)) {
+ // If we were unable to do anything above, all we can do is canonicalize
+ // the comparison, hoping that this will open the door for other
+ // optimizations. If we find that we are comparing two non-negative values,
+ // we change the instruction's predicate to its unsigned version. Note that
+ // we cannot rely on Pred here unless we check whether we have swapped it.
+ assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
+ LLVM_DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp
+ << '\n');
+ ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
+ } else
+ return;
+
+ ++NumElimCmp;
+ Changed = true;
+}
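+
+// Illustrative sketch (hypothetical IR, not from an actual test): if SCEV
+// already knows the IV's range, a comparison such as
+//   %c = icmp ult i32 %iv, 100   ; SCEV proves %iv is always in [0, 10)
+// folds to the constant 'true' and the icmp is queued for deletion.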
+
+bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
+ // Get the SCEVs for the SDiv operands.
+ auto *N = SE->getSCEV(SDiv->getOperand(0));
+ auto *D = SE->getSCEV(SDiv->getOperand(1));
+
+ // Simplify unnecessary loops away.
+ const Loop *L = LI->getLoopFor(SDiv->getParent());
+ N = SE->getSCEVAtScope(N, L);
+ D = SE->getSCEVAtScope(D, L);
+
+ // Replace sdiv by udiv if both of the operands are non-negative
+ if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
+ auto *UDiv = BinaryOperator::Create(
+ BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
+ SDiv->getName() + ".udiv", SDiv);
+ UDiv->setIsExact(SDiv->isExact());
+ SDiv->replaceAllUsesWith(UDiv);
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+ ++NumSimplifiedSDiv;
+ Changed = true;
+ DeadInsts.push_back(SDiv);
+ return true;
+ }
+
+ return false;
+}
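+
+// Illustrative sketch (hypothetical IR): with both operands proven
+// non-negative,
+//   %q = sdiv i32 %iv, %n
+// is rewritten to
+//   %q.udiv = udiv i32 %iv, %n   ; the 'exact' flag is carried over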
+
+// i %s n -> i %u n if i >= 0 and n >= 0
+void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
+ auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
+ auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
+ Rem->getName() + ".urem", Rem);
+ Rem->replaceAllUsesWith(URem);
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
+ ++NumSimplifiedSRem;
+ Changed = true;
+ DeadInsts.emplace_back(Rem);
+}
+
+// i % n --> i if i is in [0,n).
+void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
+ Rem->replaceAllUsesWith(Rem->getOperand(0));
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ ++NumElimRem;
+ Changed = true;
+ DeadInsts.emplace_back(Rem);
+}
+
+// (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n).
+void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
+ auto *T = Rem->getType();
+ auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
+ ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, N, D);
+ SelectInst *Sel =
+ SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
+ Rem->replaceAllUsesWith(Sel);
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ ++NumElimRem;
Changed = true;
- return true;
-}
-
-bool SimplifyIndvar::eliminateSaturatingIntrinsic(SaturatingInst *SI) {
- const SCEV *LHS = SE->getSCEV(SI->getLHS());
- const SCEV *RHS = SE->getSCEV(SI->getRHS());
- if (!willNotOverflow(SE, SI->getBinaryOp(), SI->isSigned(), LHS, RHS))
- return false;
-
- BinaryOperator *BO = BinaryOperator::Create(
- SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI);
- if (SI->isSigned())
- BO->setHasNoSignedWrap();
- else
- BO->setHasNoUnsignedWrap();
-
- SI->replaceAllUsesWith(BO);
- DeadInsts.emplace_back(SI);
- Changed = true;
- return true;
-}
-
-bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) {
- // It is always legal to replace
- // icmp <pred> i32 trunc(iv), n
- // with
- // icmp <pred> i64 sext(trunc(iv)), sext(n), if pred is signed predicate.
- // Or with
- // icmp <pred> i64 zext(trunc(iv)), zext(n), if pred is unsigned predicate.
- // Or with either of these if pred is an equality predicate.
- //
- // If we can prove that iv == sext(trunc(iv)) or iv == zext(trunc(iv)) for
- // every comparison which uses trunc, it means that we can replace each of
- // them with comparison of iv against sext/zext(n). We no longer need trunc
- // after that.
- //
- // TODO: Should we do this if we can widen *some* comparisons, but not all
- // of them? Sometimes it is enough to enable other optimizations, but the
- // trunc instruction will stay in the loop.
- Value *IV = TI->getOperand(0);
- Type *IVTy = IV->getType();
- const SCEV *IVSCEV = SE->getSCEV(IV);
- const SCEV *TISCEV = SE->getSCEV(TI);
-
- // Check if iv == zext(trunc(iv)) and if iv == sext(trunc(iv)). If so, we can
- // get rid of trunc
- bool DoesSExtCollapse = false;
- bool DoesZExtCollapse = false;
- if (IVSCEV == SE->getSignExtendExpr(TISCEV, IVTy))
- DoesSExtCollapse = true;
- if (IVSCEV == SE->getZeroExtendExpr(TISCEV, IVTy))
- DoesZExtCollapse = true;
-
- // If neither the sext nor the zext collapses, it is not profitable to do any
- // transform. Bail.
- if (!DoesSExtCollapse && !DoesZExtCollapse)
- return false;
-
- // Collect users of the trunc that look like comparisons against invariants.
- // Bail if we find something different.
- SmallVector<ICmpInst *, 4> ICmpUsers;
- for (auto *U : TI->users()) {
- // We don't care about users in unreachable blocks.
- if (isa<Instruction>(U) &&
- !DT->isReachableFromEntry(cast<Instruction>(U)->getParent()))
- continue;
- ICmpInst *ICI = dyn_cast<ICmpInst>(U);
- if (!ICI) return false;
- assert(L->contains(ICI->getParent()) && "LCSSA form broken?");
- if (!(ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) &&
- !(ICI->getOperand(1) == TI && L->isLoopInvariant(ICI->getOperand(0))))
- return false;
- // If we cannot get rid of trunc, bail.
- if (ICI->isSigned() && !DoesSExtCollapse)
- return false;
- if (ICI->isUnsigned() && !DoesZExtCollapse)
- return false;
- // For equality, either signed or unsigned works.
- ICmpUsers.push_back(ICI);
- }
-
- auto CanUseZExt = [&](ICmpInst *ICI) {
- // Unsigned comparison can be widened as unsigned.
- if (ICI->isUnsigned())
- return true;
- // Is it profitable to do zext?
- if (!DoesZExtCollapse)
- return false;
- // For equality, we can safely zext both parts.
- if (ICI->isEquality())
- return true;
- // Otherwise we can only use zext when comparing two non-negative or two
- // negative values. But in practice, we will never pass the DoesZExtCollapse
- // check for a negative value, because zext(trunc(x)) is non-negative. So
- // it only makes sense to check for non-negativity here.
- const SCEV *SCEVOP1 = SE->getSCEV(ICI->getOperand(0));
- const SCEV *SCEVOP2 = SE->getSCEV(ICI->getOperand(1));
- return SE->isKnownNonNegative(SCEVOP1) && SE->isKnownNonNegative(SCEVOP2);
- };
- // Replace all comparisons against trunc with comparisons against IV.
- for (auto *ICI : ICmpUsers) {
- bool IsSwapped = L->isLoopInvariant(ICI->getOperand(0));
- auto *Op1 = IsSwapped ? ICI->getOperand(0) : ICI->getOperand(1);
- Instruction *Ext = nullptr;
- // For signed/unsigned predicate, replace the old comparison with comparison
- // of immediate IV against sext/zext of the invariant argument. If we can
- // use either sext or zext (i.e. we are dealing with equality predicate),
- // then prefer zext as a more canonical form.
- // TODO: If we see a signed comparison which can be turned into unsigned,
- // we can do it here for canonicalization purposes.
- ICmpInst::Predicate Pred = ICI->getPredicate();
- if (IsSwapped) Pred = ICmpInst::getSwappedPredicate(Pred);
- if (CanUseZExt(ICI)) {
- assert(DoesZExtCollapse && "Unprofitable zext?");
- Ext = new ZExtInst(Op1, IVTy, "zext", ICI);
- Pred = ICmpInst::getUnsignedPredicate(Pred);
- } else {
- assert(DoesSExtCollapse && "Unprofitable sext?");
- Ext = new SExtInst(Op1, IVTy, "sext", ICI);
- assert(Pred == ICmpInst::getSignedPredicate(Pred) && "Must be signed!");
- }
- bool Changed;
- L->makeLoopInvariant(Ext, Changed);
- (void)Changed;
- ICmpInst *NewICI = new ICmpInst(ICI, Pred, IV, Ext);
- ICI->replaceAllUsesWith(NewICI);
- DeadInsts.emplace_back(ICI);
- }
-
- // Trunc no longer needed.
- TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
- DeadInsts.emplace_back(TI);
- return true;
-}
-
-/// Eliminate an operation that consumes a simple IV and has no observable
-/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
-/// but UseInst may not be.
-bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
- Instruction *IVOperand) {
- if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
- eliminateIVComparison(ICmp, IVOperand);
- return true;
- }
- if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
- bool IsSRem = Bin->getOpcode() == Instruction::SRem;
- if (IsSRem || Bin->getOpcode() == Instruction::URem) {
- simplifyIVRemainder(Bin, IVOperand, IsSRem);
- return true;
- }
-
- if (Bin->getOpcode() == Instruction::SDiv)
- return eliminateSDiv(Bin);
- }
-
- if (auto *WO = dyn_cast<WithOverflowInst>(UseInst))
- if (eliminateOverflowIntrinsic(WO))
- return true;
-
- if (auto *SI = dyn_cast<SaturatingInst>(UseInst))
- if (eliminateSaturatingIntrinsic(SI))
- return true;
-
- if (auto *TI = dyn_cast<TruncInst>(UseInst))
- if (eliminateTrunc(TI))
- return true;
-
- if (eliminateIdentitySCEV(UseInst, IVOperand))
- return true;
-
- return false;
-}
-
-static Instruction *GetLoopInvariantInsertPosition(Loop *L, Instruction *Hint) {
- if (auto *BB = L->getLoopPreheader())
- return BB->getTerminator();
-
- return Hint;
-}
-
-/// Replace the UseInst with a loop invariant expression if it is safe.
-bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
- if (!SE->isSCEVable(I->getType()))
- return false;
-
- // Get the symbolic expression for this instruction.
- const SCEV *S = SE->getSCEV(I);
-
- if (!SE->isLoopInvariant(S, L))
- return false;
-
- // Do not generate something ridiculous even if S is loop invariant.
- if (Rewriter.isHighCostExpansion(S, L, SCEVCheapExpansionBudget, TTI, I))
- return false;
-
- auto *IP = GetLoopInvariantInsertPosition(L, I);
-
- if (!isSafeToExpandAt(S, IP, *SE)) {
- LLVM_DEBUG(dbgs() << "INDVARS: Can not replace IV user: " << *I
- << " with non-speculable loop invariant: " << *S << '\n');
- return false;
- }
-
- auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);
-
- I->replaceAllUsesWith(Invariant);
- LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
- << " with loop invariant: " << *S << '\n');
- ++NumFoldedUser;
- Changed = true;
- DeadInsts.emplace_back(I);
- return true;
-}
-
-/// Eliminate any operation that SCEV can prove is an identity function.
-bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,
- Instruction *IVOperand) {
- if (!SE->isSCEVable(UseInst->getType()) ||
- (UseInst->getType() != IVOperand->getType()) ||
- (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
- return false;
-
- // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the
- // dominator tree, even if X is an operand to Y. For instance, in
- //
- // %iv = phi i32 {0,+,1}
- // br %cond, label %left, label %merge
- //
- // left:
- // %X = add i32 %iv, 0
- // br label %merge
- //
- // merge:
- // %M = phi (%X, %iv)
- //
- // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and
- // %M.replaceAllUsesWith(%X) would be incorrect.
-
- if (isa<PHINode>(UseInst))
- // If UseInst is not a PHI node then we know that IVOperand dominates
- // UseInst directly from the legality of SSA.
- if (!DT || !DT->dominates(IVOperand, UseInst))
- return false;
-
- if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))
- return false;
-
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
-
- UseInst->replaceAllUsesWith(IVOperand);
- ++NumElimIdentity;
- Changed = true;
- DeadInsts.emplace_back(UseInst);
- return true;
-}
-
-/// Annotate BO with nsw / nuw if it provably does not signed-overflow /
-/// unsigned-overflow. Returns true if anything changed, false otherwise.
-bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
- Value *IVOperand) {
- // Fastpath: we don't have any work to do if `BO` is `nuw` and `nsw`.
- if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
- return false;
-
- if (BO->getOpcode() != Instruction::Add &&
- BO->getOpcode() != Instruction::Sub &&
- BO->getOpcode() != Instruction::Mul)
- return false;
-
- const SCEV *LHS = SE->getSCEV(BO->getOperand(0));
- const SCEV *RHS = SE->getSCEV(BO->getOperand(1));
- bool Changed = false;
-
- if (!BO->hasNoUnsignedWrap() &&
- willNotOverflow(SE, BO->getOpcode(), /* Signed */ false, LHS, RHS)) {
- BO->setHasNoUnsignedWrap();
- SE->forgetValue(BO);
- Changed = true;
- }
-
- if (!BO->hasNoSignedWrap() &&
- willNotOverflow(SE, BO->getOpcode(), /* Signed */ true, LHS, RHS)) {
- BO->setHasNoSignedWrap();
- SE->forgetValue(BO);
- Changed = true;
- }
-
- return Changed;
-}
-
-/// Annotate the Shr in (X << IVOperand) >> C as exact using the
-/// information from the IV's range. Returns true if anything changed, false
-/// otherwise.
-bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
- Value *IVOperand) {
- using namespace llvm::PatternMatch;
-
- if (BO->getOpcode() == Instruction::Shl) {
- bool Changed = false;
- ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
- for (auto *U : BO->users()) {
- const APInt *C;
- if (match(U,
- m_AShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C))) ||
- match(U,
- m_LShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C)))) {
- BinaryOperator *Shr = cast<BinaryOperator>(U);
- if (!Shr->isExact() && IVRange.getUnsignedMin().uge(*C)) {
- Shr->setIsExact(true);
- Changed = true;
- }
- }
- }
- return Changed;
- }
-
- return false;
-}
-
-/// Add all uses of Def to the current IV's worklist.
-static void pushIVUsers(
- Instruction *Def, Loop *L,
- SmallPtrSet<Instruction*,16> &Simplified,
- SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
-
- for (User *U : Def->users()) {
- Instruction *UI = cast<Instruction>(U);
-
- // Avoid infinite or exponential worklist processing.
- // Also ensure unique worklist users.
- // If Def is a LoopPhi, it may not be in the Simplified set, so check for
- // self edges first.
- if (UI == Def)
- continue;
-
- // Only change the current Loop, do not change the other parts (e.g. other
- // Loops).
- if (!L->contains(UI))
- continue;
-
- // Do not push the same instruction more than once.
- if (!Simplified.insert(UI).second)
- continue;
-
- SimpleIVUsers.push_back(std::make_pair(UI, Def));
- }
-}
-
-/// Return true if this instruction generates a simple SCEV
-/// expression in terms of that IV.
-///
-/// This is similar to IVUsers' isInteresting() but processes each instruction
-/// non-recursively when the operand is already known to be a simpleIVUser.
-///
-static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
- if (!SE->isSCEVable(I->getType()))
- return false;
-
- // Get the symbolic expression for this instruction.
- const SCEV *S = SE->getSCEV(I);
-
- // Only consider affine recurrences.
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
- if (AR && AR->getLoop() == L)
- return true;
-
- return false;
-}
-
-/// Iteratively perform simplification on a worklist of users
-/// of the specified induction variable. Each successive simplification may push
-/// more users which may themselves be candidates for simplification.
-///
-/// This algorithm does not require IVUsers analysis. Instead, it simplifies
-/// instructions in-place during analysis. Rather than rewriting induction
-/// variables bottom-up from their users, it transforms a chain of IVUsers
-/// top-down, updating the IR only when it encounters a clear optimization
-/// opportunity.
-///
-/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.
-///
-void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
- if (!SE->isSCEVable(CurrIV->getType()))
- return;
-
- // Instructions processed by SimplifyIndvar for CurrIV.
- SmallPtrSet<Instruction*,16> Simplified;
-
- // Use-def pairs of IV users waiting to be processed for CurrIV.
- SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
-
- // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
- // called multiple times for the same LoopPhi. This is the proper thing to
- // do for loop header phis that use each other.
- pushIVUsers(CurrIV, L, Simplified, SimpleIVUsers);
-
- while (!SimpleIVUsers.empty()) {
- std::pair<Instruction*, Instruction*> UseOper =
- SimpleIVUsers.pop_back_val();
- Instruction *UseInst = UseOper.first;
-
- // If a user of the IndVar is trivially dead, we prefer just to mark it dead
- // rather than try to do some complex analysis or transformation (such as
- // widening) based on it.
- // TODO: Propagate TLI and pass it here to handle more cases.
- if (isInstructionTriviallyDead(UseInst, /* TLI */ nullptr)) {
- DeadInsts.emplace_back(UseInst);
- continue;
- }
-
- // Bypass back edges to avoid extra work.
- if (UseInst == CurrIV) continue;
-
- // Try to replace UseInst with a loop invariant before any other
- // simplifications.
- if (replaceIVUserWithLoopInvariant(UseInst))
- continue;
-
- Instruction *IVOperand = UseOper.second;
- for (unsigned N = 0; IVOperand; ++N) {
- assert(N <= Simplified.size() && "runaway iteration");
-
- Value *NewOper = foldIVUser(UseInst, IVOperand);
- if (!NewOper)
- break; // done folding
- IVOperand = dyn_cast<Instruction>(NewOper);
- }
- if (!IVOperand)
- continue;
-
- if (eliminateIVUser(UseInst, IVOperand)) {
- pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
- continue;
- }
-
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
- if ((isa<OverflowingBinaryOperator>(BO) &&
- strengthenOverflowingOperation(BO, IVOperand)) ||
- (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
- // re-queue uses of the now modified binary operator and fall
- // through to the checks that remain.
- pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
- }
- }
-
- CastInst *Cast = dyn_cast<CastInst>(UseInst);
- if (V && Cast) {
- V->visitCast(Cast);
- continue;
- }
- if (isSimpleIVUser(UseInst, L, SE)) {
- pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
- }
- }
-}
-
-namespace llvm {
-
-void IVVisitor::anchor() { }
-
-/// Simplify instructions that use this induction variable
-/// by using ScalarEvolution to analyze the IV's recurrence.
-bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, const TargetTransformInfo *TTI,
- SmallVectorImpl<WeakTrackingVH> &Dead,
- SCEVExpander &Rewriter, IVVisitor *V) {
- SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, TTI,
- Rewriter, Dead);
- SIV.simplifyUsers(CurrIV, V);
- return SIV.hasChanged();
-}
-
-/// Simplify users of induction variables within this
-/// loop. This does not actually change or add IVs.
-bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, const TargetTransformInfo *TTI,
- SmallVectorImpl<WeakTrackingVH> &Dead) {
- SCEVExpander Rewriter(*SE, SE->getDataLayout(), "indvars");
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
- bool Changed = false;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- Changed |=
- simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, TTI, Dead, Rewriter);
- }
- return Changed;
-}
-
-} // namespace llvm
+ DeadInsts.emplace_back(Rem);
+}
+
+/// SimplifyIVUsers helper for eliminating useless remainder operations
+/// operating on an induction variable or replacing srem by urem.
+void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
+ bool IsSigned) {
+ auto *NValue = Rem->getOperand(0);
+ auto *DValue = Rem->getOperand(1);
+ // We're only interested in the case where we know something about
+ // the numerator, unless it is a srem, because we want to replace srem by urem
+ // in general.
+ bool UsedAsNumerator = IVOperand == NValue;
+ if (!UsedAsNumerator && !IsSigned)
+ return;
+
+ const SCEV *N = SE->getSCEV(NValue);
+
+ // Simplify unnecessary loops away.
+ const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
+ N = SE->getSCEVAtScope(N, ICmpLoop);
+
+ bool IsNumeratorNonNegative = !IsSigned || SE->isKnownNonNegative(N);
+
+ // Do not proceed if the Numerator may be negative
+ if (!IsNumeratorNonNegative)
+ return;
+
+ const SCEV *D = SE->getSCEV(DValue);
+ D = SE->getSCEVAtScope(D, ICmpLoop);
+
+ if (UsedAsNumerator) {
+ auto LT = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ if (SE->isKnownPredicate(LT, N, D)) {
+ replaceRemWithNumerator(Rem);
+ return;
+ }
+
+ auto *T = Rem->getType();
+ const auto *NLessOne = SE->getMinusSCEV(N, SE->getOne(T));
+ if (SE->isKnownPredicate(LT, NLessOne, D)) {
+ replaceRemWithNumeratorOrZero(Rem);
+ return;
+ }
+ }
+
+ // Try to replace SRem with URem, if both N and D are known non-negative.
+ // Since we have already checked N, we only need to check D now.
+ if (!IsSigned || !SE->isKnownNonNegative(D))
+ return;
+
+ replaceSRemWithURem(Rem);
+}
+
+static bool willNotOverflow(ScalarEvolution *SE, Instruction::BinaryOps BinOp,
+ bool Signed, const SCEV *LHS, const SCEV *RHS) {
+ const SCEV *(ScalarEvolution::*Operation)(const SCEV *, const SCEV *,
+ SCEV::NoWrapFlags, unsigned);
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unsupported binary op");
+ case Instruction::Add:
+ Operation = &ScalarEvolution::getAddExpr;
+ break;
+ case Instruction::Sub:
+ Operation = &ScalarEvolution::getMinusSCEV;
+ break;
+ case Instruction::Mul:
+ Operation = &ScalarEvolution::getMulExpr;
+ break;
+ }
+
+ const SCEV *(ScalarEvolution::*Extension)(const SCEV *, Type *, unsigned) =
+ Signed ? &ScalarEvolution::getSignExtendExpr
+ : &ScalarEvolution::getZeroExtendExpr;
+
+ // Check ext(LHS op RHS) == ext(LHS) op ext(RHS)
+ auto *NarrowTy = cast<IntegerType>(LHS->getType());
+ auto *WideTy =
+ IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
+
+ const SCEV *A =
+ (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0),
+ WideTy, 0);
+ const SCEV *B =
+ (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0),
+ (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0);
+ return A == B;
+}
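+
+// For example, for an i8 signed add the check above widens to i16 and asks
+// whether sext(a + b) == sext(a) + sext(b) holds as SCEV expressions; if the
+// two widened forms agree, the narrow add cannot overflow.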
+
+bool SimplifyIndvar::eliminateOverflowIntrinsic(WithOverflowInst *WO) {
+ const SCEV *LHS = SE->getSCEV(WO->getLHS());
+ const SCEV *RHS = SE->getSCEV(WO->getRHS());
+ if (!willNotOverflow(SE, WO->getBinaryOp(), WO->isSigned(), LHS, RHS))
+ return false;
+
+ // Proved no overflow, nuke the overflow check and, if possible, the overflow
+ // intrinsic as well.
+
+ BinaryOperator *NewResult = BinaryOperator::Create(
+ WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO);
+
+ if (WO->isSigned())
+ NewResult->setHasNoSignedWrap(true);
+ else
+ NewResult->setHasNoUnsignedWrap(true);
+
+ SmallVector<ExtractValueInst *, 4> ToDelete;
+
+ for (auto *U : WO->users()) {
+ if (auto *EVI = dyn_cast<ExtractValueInst>(U)) {
+ if (EVI->getIndices()[0] == 1)
+ EVI->replaceAllUsesWith(ConstantInt::getFalse(WO->getContext()));
+ else {
+ assert(EVI->getIndices()[0] == 0 && "Only two possibilities!");
+ EVI->replaceAllUsesWith(NewResult);
+ }
+ ToDelete.push_back(EVI);
+ }
+ }
+
+ for (auto *EVI : ToDelete)
+ EVI->eraseFromParent();
+
+ if (WO->use_empty())
+ WO->eraseFromParent();
+
+ Changed = true;
+ return true;
+}
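+
+// Illustrative sketch (hypothetical IR): when the add provably cannot
+// overflow,
+//   %s   = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %iv, i32 1)
+//   %val = extractvalue { i32, i1 } %s, 0
+//   %ovf = extractvalue { i32, i1 } %s, 1
+// collapses to a plain 'add nsw i32 %iv, 1', and uses of %ovf become 'false'.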
+
+bool SimplifyIndvar::eliminateSaturatingIntrinsic(SaturatingInst *SI) {
+ const SCEV *LHS = SE->getSCEV(SI->getLHS());
+ const SCEV *RHS = SE->getSCEV(SI->getRHS());
+ if (!willNotOverflow(SE, SI->getBinaryOp(), SI->isSigned(), LHS, RHS))
+ return false;
+
+ BinaryOperator *BO = BinaryOperator::Create(
+ SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI);
+ if (SI->isSigned())
+ BO->setHasNoSignedWrap();
+ else
+ BO->setHasNoUnsignedWrap();
+
+ SI->replaceAllUsesWith(BO);
+ DeadInsts.emplace_back(SI);
+ Changed = true;
+ return true;
+}
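+
+// Illustrative sketch (hypothetical IR): when saturation provably never
+// triggers,
+//   %r = call i32 @llvm.uadd.sat.i32(i32 %iv, i32 1)
+// is replaced by an ordinary 'add nuw i32 %iv, 1'.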
+
+bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) {
+ // It is always legal to replace
+ // icmp <pred> i32 trunc(iv), n
+ // with
+ // icmp <pred> i64 sext(trunc(iv)), sext(n), if pred is signed predicate.
+ // Or with
+ // icmp <pred> i64 zext(trunc(iv)), zext(n), if pred is unsigned predicate.
+ // Or with either of these if pred is an equality predicate.
+ //
+ // If we can prove that iv == sext(trunc(iv)) or iv == zext(trunc(iv)) for
+ // every comparison which uses trunc, it means that we can replace each of
+ // them with comparison of iv against sext/zext(n). We no longer need trunc
+ // after that.
+ //
+ // TODO: Should we do this if we can widen *some* comparisons, but not all
+ // of them? Sometimes it is enough to enable other optimizations, but the
+ // trunc instruction will stay in the loop.
+ Value *IV = TI->getOperand(0);
+ Type *IVTy = IV->getType();
+ const SCEV *IVSCEV = SE->getSCEV(IV);
+ const SCEV *TISCEV = SE->getSCEV(TI);
+
+ // Check if iv == zext(trunc(iv)) and if iv == sext(trunc(iv)). If so, we can
+ // get rid of trunc
+ bool DoesSExtCollapse = false;
+ bool DoesZExtCollapse = false;
+ if (IVSCEV == SE->getSignExtendExpr(TISCEV, IVTy))
+ DoesSExtCollapse = true;
+ if (IVSCEV == SE->getZeroExtendExpr(TISCEV, IVTy))
+ DoesZExtCollapse = true;
+
+ // If neither the sext nor the zext collapses, it is not profitable to do any
+ // transform. Bail.
+ if (!DoesSExtCollapse && !DoesZExtCollapse)
+ return false;
+
+ // Collect users of the trunc that look like comparisons against invariants.
+ // Bail if we find something different.
+ SmallVector<ICmpInst *, 4> ICmpUsers;
+ for (auto *U : TI->users()) {
+ // We don't care about users in unreachable blocks.
+ if (isa<Instruction>(U) &&
+ !DT->isReachableFromEntry(cast<Instruction>(U)->getParent()))
+ continue;
+ ICmpInst *ICI = dyn_cast<ICmpInst>(U);
+ if (!ICI) return false;
+ assert(L->contains(ICI->getParent()) && "LCSSA form broken?");
+ if (!(ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) &&
+ !(ICI->getOperand(1) == TI && L->isLoopInvariant(ICI->getOperand(0))))
+ return false;
+ // If we cannot get rid of trunc, bail.
+ if (ICI->isSigned() && !DoesSExtCollapse)
+ return false;
+ if (ICI->isUnsigned() && !DoesZExtCollapse)
+ return false;
+ // For equality, either signed or unsigned works.
+ ICmpUsers.push_back(ICI);
+ }
+
+ auto CanUseZExt = [&](ICmpInst *ICI) {
+ // Unsigned comparison can be widened as unsigned.
+ if (ICI->isUnsigned())
+ return true;
+ // Is it profitable to do zext?
+ if (!DoesZExtCollapse)
+ return false;
+ // For equality, we can safely zext both parts.
+ if (ICI->isEquality())
+ return true;
+ // Otherwise we can only use zext when comparing two non-negative or two
+ // negative values. But in practice, we will never pass the DoesZExtCollapse
+ // check for a negative value, because zext(trunc(x)) is non-negative. So
+ // it only makes sense to check for non-negativity here.
+ const SCEV *SCEVOP1 = SE->getSCEV(ICI->getOperand(0));
+ const SCEV *SCEVOP2 = SE->getSCEV(ICI->getOperand(1));
+ return SE->isKnownNonNegative(SCEVOP1) && SE->isKnownNonNegative(SCEVOP2);
+ };
+ // Replace all comparisons against trunc with comparisons against IV.
+ for (auto *ICI : ICmpUsers) {
+ bool IsSwapped = L->isLoopInvariant(ICI->getOperand(0));
+ auto *Op1 = IsSwapped ? ICI->getOperand(0) : ICI->getOperand(1);
+ Instruction *Ext = nullptr;
+ // For signed/unsigned predicate, replace the old comparison with comparison
+ // of immediate IV against sext/zext of the invariant argument. If we can
+ // use either sext or zext (i.e. we are dealing with equality predicate),
+ // then prefer zext as a more canonical form.
+ // TODO: If we see a signed comparison which can be turned into unsigned,
+ // we can do it here for canonicalization purposes.
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (IsSwapped) Pred = ICmpInst::getSwappedPredicate(Pred);
+ if (CanUseZExt(ICI)) {
+ assert(DoesZExtCollapse && "Unprofitable zext?");
+ Ext = new ZExtInst(Op1, IVTy, "zext", ICI);
+ Pred = ICmpInst::getUnsignedPredicate(Pred);
+ } else {
+ assert(DoesSExtCollapse && "Unprofitable sext?");
+ Ext = new SExtInst(Op1, IVTy, "sext", ICI);
+ assert(Pred == ICmpInst::getSignedPredicate(Pred) && "Must be signed!");
+ }
+ bool Changed;
+ L->makeLoopInvariant(Ext, Changed);
+ (void)Changed;
+ ICmpInst *NewICI = new ICmpInst(ICI, Pred, IV, Ext);
+ ICI->replaceAllUsesWith(NewICI);
+ DeadInsts.emplace_back(ICI);
+ }
+
+ // Trunc no longer needed.
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ DeadInsts.emplace_back(TI);
+ return true;
+}
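+
+// Illustrative sketch (hypothetical IR, %n loop-invariant): if SCEV proves
+// %iv == sext(trunc(%iv)), then
+//   %t = trunc i64 %iv to i32
+//   %c = icmp slt i32 %t, %n
+// becomes
+//   %n.ext = sext i32 %n to i64   ; hoisted out of the loop when possible
+//   %c     = icmp slt i64 %iv, %n.ext
+// and the trunc itself goes dead.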
+
+/// Eliminate an operation that consumes a simple IV and has no observable
+/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
+/// but UseInst may not be.
+bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
+ Instruction *IVOperand) {
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ eliminateIVComparison(ICmp, IVOperand);
+ return true;
+ }
+ if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
+ bool IsSRem = Bin->getOpcode() == Instruction::SRem;
+ if (IsSRem || Bin->getOpcode() == Instruction::URem) {
+ simplifyIVRemainder(Bin, IVOperand, IsSRem);
+ return true;
+ }
+
+ if (Bin->getOpcode() == Instruction::SDiv)
+ return eliminateSDiv(Bin);
+ }
+
+ if (auto *WO = dyn_cast<WithOverflowInst>(UseInst))
+ if (eliminateOverflowIntrinsic(WO))
+ return true;
+
+ if (auto *SI = dyn_cast<SaturatingInst>(UseInst))
+ if (eliminateSaturatingIntrinsic(SI))
+ return true;
+
+ if (auto *TI = dyn_cast<TruncInst>(UseInst))
+ if (eliminateTrunc(TI))
+ return true;
+
+ if (eliminateIdentitySCEV(UseInst, IVOperand))
+ return true;
+
+ return false;
+}
+
+static Instruction *GetLoopInvariantInsertPosition(Loop *L, Instruction *Hint) {
+ if (auto *BB = L->getLoopPreheader())
+ return BB->getTerminator();
+
+ return Hint;
+}
+
+/// Replace the UseInst with a loop invariant expression if it is safe.
+bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Get the symbolic expression for this instruction.
+ const SCEV *S = SE->getSCEV(I);
+
+ if (!SE->isLoopInvariant(S, L))
+ return false;
+
+ // Do not generate something ridiculous even if S is loop invariant.
+ if (Rewriter.isHighCostExpansion(S, L, SCEVCheapExpansionBudget, TTI, I))
+ return false;
+
+ auto *IP = GetLoopInvariantInsertPosition(L, I);
+
+ if (!isSafeToExpandAt(S, IP, *SE)) {
+ LLVM_DEBUG(dbgs() << "INDVARS: Can not replace IV user: " << *I
+ << " with non-speculable loop invariant: " << *S << '\n');
+ return false;
+ }
+
+ auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);
+
+ I->replaceAllUsesWith(Invariant);
+ LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
+ << " with loop invariant: " << *S << '\n');
+ ++NumFoldedUser;
+ Changed = true;
+ DeadInsts.emplace_back(I);
+ return true;
+}
+
+/// Eliminate any operation that SCEV can prove is an identity function.
+bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,
+ Instruction *IVOperand) {
+ if (!SE->isSCEVable(UseInst->getType()) ||
+ (UseInst->getType() != IVOperand->getType()) ||
+ (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
+ return false;
+
+ // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the
+ // dominator tree, even if X is an operand to Y. For instance, in
+ //
+ // %iv = phi i32 {0,+,1}
+ // br %cond, label %left, label %merge
+ //
+ // left:
+ // %X = add i32 %iv, 0
+ // br label %merge
+ //
+ // merge:
+ // %M = phi (%X, %iv)
+ //
+ // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and
+ // %M.replaceAllUsesWith(%X) would be incorrect.
+
+ if (isa<PHINode>(UseInst))
+ // If UseInst is not a PHI node then we know that IVOperand dominates
+ // UseInst directly from the legality of SSA.
+ if (!DT || !DT->dominates(IVOperand, UseInst))
+ return false;
+
+ if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
+
+ UseInst->replaceAllUsesWith(IVOperand);
+ ++NumElimIdentity;
+ Changed = true;
+ DeadInsts.emplace_back(UseInst);
+ return true;
+}
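+
+// Illustrative sketch (hypothetical IR): an identity such as
+//   %x = add i32 %iv, 0           ; getSCEV(%x) == getSCEV(%iv)
+// simply has all of its uses replaced with %iv, subject to the dominance and
+// LCSSA checks above.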
+
+/// Annotate BO with nsw / nuw if it provably does not signed-overflow /
+/// unsigned-overflow. Returns true if anything changed, false otherwise.
+bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
+ Value *IVOperand) {
+ // Fastpath: we don't have any work to do if `BO` is `nuw` and `nsw`.
+ if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
+ return false;
+
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Mul)
+ return false;
+
+ const SCEV *LHS = SE->getSCEV(BO->getOperand(0));
+ const SCEV *RHS = SE->getSCEV(BO->getOperand(1));
+ bool Changed = false;
+
+ if (!BO->hasNoUnsignedWrap() &&
+ willNotOverflow(SE, BO->getOpcode(), /* Signed */ false, LHS, RHS)) {
+ BO->setHasNoUnsignedWrap();
+ SE->forgetValue(BO);
+ Changed = true;
+ }
+
+ if (!BO->hasNoSignedWrap() &&
+ willNotOverflow(SE, BO->getOpcode(), /* Signed */ true, LHS, RHS)) {
+ BO->setHasNoSignedWrap();
+ SE->forgetValue(BO);
+ Changed = true;
+ }
+
+ return Changed;
+}
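+
+// Illustrative sketch (hypothetical IR): if SCEV proves the operation can
+// wrap in neither sense,
+//   %inc = add i32 %iv, 1
+// is annotated in place as
+//   %inc = add nuw nsw i32 %iv, 1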
+
+/// Annotate the Shr in (X << IVOperand) >> C as exact using the
+/// information from the IV's range. Returns true if anything changed, false
+/// otherwise.
+bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
+ Value *IVOperand) {
+ using namespace llvm::PatternMatch;
+
+ if (BO->getOpcode() == Instruction::Shl) {
+ bool Changed = false;
+ ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
+ for (auto *U : BO->users()) {
+ const APInt *C;
+ if (match(U,
+ m_AShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C))) ||
+ match(U,
+ m_LShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C)))) {
+ BinaryOperator *Shr = cast<BinaryOperator>(U);
+ if (!Shr->isExact() && IVRange.getUnsignedMin().uge(*C)) {
+ Shr->setIsExact(true);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+ }
+
+ return false;
+}
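A hypothetical example of the shr case: when the IV is never smaller than the right-shift amount, the shift only drops zero bits introduced by the shl, so it can be marked exact.

// For i in [3, 16), '(x << i) >> 3' only shifts out zero bits that the shl
// created, so the IR lshr is marked exact. Assumes y has at least 16 slots.
void shiftDemo(unsigned x, unsigned *y) {
  for (unsigned i = 3; i < 16; ++i)
    y[i] = (x << i) >> 3;
}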
+
+/// Add all uses of Def to the current IV's worklist.
+static void pushIVUsers(
+ Instruction *Def, Loop *L,
+ SmallPtrSet<Instruction*,16> &Simplified,
+ SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
+
+ for (User *U : Def->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ // Avoid infinite or exponential worklist processing.
+ // Also ensure unique worklist users.
+ // If Def is a LoopPhi, it may not be in the Simplified set, so check for
+ // self edges first.
+ if (UI == Def)
+ continue;
+
+ // Only change the current Loop, do not change the other parts (e.g. other
+ // Loops).
+ if (!L->contains(UI))
+ continue;
+
+ // Do not push the same instruction more than once.
+ if (!Simplified.insert(UI).second)
+ continue;
+
+ SimpleIVUsers.push_back(std::make_pair(UI, Def));
+ }
+}
+
+/// Return true if this instruction generates a simple SCEV
+/// expression in terms of that IV.
+///
+/// This is similar to IVUsers' isInteresting() but processes each instruction
+/// non-recursively when the operand is already known to be a simpleIVUser.
+///
+static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Get the symbolic expression for this instruction.
+ const SCEV *S = SE->getSCEV(I);
+
+ // Only consider affine recurrences.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (AR && AR->getLoop() == L)
+ return true;
+
+ return false;
+}
+
+/// Iteratively perform simplification on a worklist of users
+/// of the specified induction variable. Each successive simplification may push
+/// more users which may themselves be candidates for simplification.
+///
+/// This algorithm does not require IVUsers analysis. Instead, it simplifies
+/// instructions in-place during analysis. Rather than rewriting induction
+/// variables bottom-up from their users, it transforms a chain of IVUsers
+/// top-down, updating the IR only when it encounters a clear optimization
+/// opportunity.
+///
+/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.
+///
+void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
+ if (!SE->isSCEVable(CurrIV->getType()))
+ return;
+
+ // Instructions processed by SimplifyIndvar for CurrIV.
+ SmallPtrSet<Instruction*,16> Simplified;
+
+  // Use-def pairs of IV users waiting to be processed for CurrIV.
+ SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
+
+ // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
+ // called multiple times for the same LoopPhi. This is the proper thing to
+ // do for loop header phis that use each other.
+ pushIVUsers(CurrIV, L, Simplified, SimpleIVUsers);
+
+ while (!SimpleIVUsers.empty()) {
+ std::pair<Instruction*, Instruction*> UseOper =
+ SimpleIVUsers.pop_back_val();
+ Instruction *UseInst = UseOper.first;
+
+ // If a user of the IndVar is trivially dead, we prefer just to mark it dead
+ // rather than try to do some complex analysis or transformation (such as
+    // widening) based on it.
+ // TODO: Propagate TLI and pass it here to handle more cases.
+ if (isInstructionTriviallyDead(UseInst, /* TLI */ nullptr)) {
+ DeadInsts.emplace_back(UseInst);
+ continue;
+ }
+
+ // Bypass back edges to avoid extra work.
+ if (UseInst == CurrIV) continue;
+
+ // Try to replace UseInst with a loop invariant before any other
+ // simplifications.
+ if (replaceIVUserWithLoopInvariant(UseInst))
+ continue;
+
+ Instruction *IVOperand = UseOper.second;
+ for (unsigned N = 0; IVOperand; ++N) {
+ assert(N <= Simplified.size() && "runaway iteration");
+
+ Value *NewOper = foldIVUser(UseInst, IVOperand);
+ if (!NewOper)
+ break; // done folding
+ IVOperand = dyn_cast<Instruction>(NewOper);
+ }
+ if (!IVOperand)
+ continue;
+
+ if (eliminateIVUser(UseInst, IVOperand)) {
+ pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
+ continue;
+ }
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
+ if ((isa<OverflowingBinaryOperator>(BO) &&
+ strengthenOverflowingOperation(BO, IVOperand)) ||
+ (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
+ // re-queue uses of the now modified binary operator and fall
+ // through to the checks that remain.
+ pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
+ }
+ }
+
+ CastInst *Cast = dyn_cast<CastInst>(UseInst);
+ if (V && Cast) {
+ V->visitCast(Cast);
+ continue;
+ }
+ if (isSimpleIVUser(UseInst, L, SE)) {
+ pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
+ }
+ }
+}
+
+namespace llvm {
+
+void IVVisitor::anchor() { }
+
+/// Simplify instructions that use this induction variable
+/// by using ScalarEvolution to analyze the IV's recurrence.
+bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SmallVectorImpl<WeakTrackingVH> &Dead,
+ SCEVExpander &Rewriter, IVVisitor *V) {
+ SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, TTI,
+ Rewriter, Dead);
+ SIV.simplifyUsers(CurrIV, V);
+ return SIV.hasChanged();
+}
+
+/// Simplify users of induction variables within this
+/// loop. This does not actually change or add IVs.
+bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SmallVectorImpl<WeakTrackingVH> &Dead) {
+ SCEVExpander Rewriter(*SE, SE->getDataLayout(), "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ bool Changed = false;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ Changed |=
+ simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, TTI, Dead, Rewriter);
+ }
+ return Changed;
+}
+
+} // namespace llvm
//===----------------------------------------------------------------------===//
// Widen Induction Variables - Extend the width of an IV to cover its
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp
index cbe7799239..f9a9dd237b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1,617 +1,617 @@
-//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the library calls simplifier. It does not implement
-// any pass, but can be used by other passes to do simplifications.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-static cl::opt<bool>
- EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
- cl::init(false),
- cl::desc("Enable unsafe double to float "
- "shrinking for math lib calls"));
-
-//===----------------------------------------------------------------------===//
-// Helper Functions
-//===----------------------------------------------------------------------===//
-
-static bool ignoreCallingConv(LibFunc Func) {
- return Func == LibFunc_abs || Func == LibFunc_labs ||
- Func == LibFunc_llabs || Func == LibFunc_strlen;
-}
-
-static bool isCallingConvCCompatible(CallInst *CI) {
- switch(CI->getCallingConv()) {
- default:
- return false;
- case llvm::CallingConv::C:
- return true;
- case llvm::CallingConv::ARM_APCS:
- case llvm::CallingConv::ARM_AAPCS:
- case llvm::CallingConv::ARM_AAPCS_VFP: {
-
- // The iOS ABI diverges from the standard in some cases, so for now don't
- // try to simplify those calls.
- if (Triple(CI->getModule()->getTargetTriple()).isiOS())
- return false;
-
- auto *FuncTy = CI->getFunctionType();
-
- if (!FuncTy->getReturnType()->isPointerTy() &&
- !FuncTy->getReturnType()->isIntegerTy() &&
- !FuncTy->getReturnType()->isVoidTy())
- return false;
-
- for (auto Param : FuncTy->params()) {
- if (!Param->isPointerTy() && !Param->isIntegerTy())
- return false;
- }
- return true;
- }
- }
- return false;
-}
-
-/// Return true if it is only used in equality comparisons with With.
-static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
- for (User *U : V->users()) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (IC->isEquality() && IC->getOperand(1) == With)
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
-static bool callHasFloatingPointArgument(const CallInst *CI) {
- return any_of(CI->operands(), [](const Use &OI) {
- return OI->getType()->isFloatingPointTy();
- });
-}
-
-static bool callHasFP128Argument(const CallInst *CI) {
- return any_of(CI->operands(), [](const Use &OI) {
- return OI->getType()->isFP128Ty();
- });
-}
-
-static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) {
- if (Base < 2 || Base > 36)
- // handle special zero base
- if (Base != 0)
- return nullptr;
-
- char *End;
- std::string nptr = Str.str();
- errno = 0;
- long long int Result = strtoll(nptr.c_str(), &End, Base);
- if (errno)
- return nullptr;
-
- // if we assume all possible target locales are ASCII supersets,
- // then if strtoll successfully parses a number on the host,
- // it will also successfully parse the same way on the target
- if (*End != '\0')
- return nullptr;
-
- if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result))
- return nullptr;
-
- return ConstantInt::get(CI->getType(), Result);
-}
-
-static bool isOnlyUsedInComparisonWithZero(Value *V) {
- for (User *U : V->users()) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
- if (C->isNullValue())
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
-static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len,
- const DataLayout &DL) {
- if (!isOnlyUsedInComparisonWithZero(CI))
- return false;
-
- if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL))
- return false;
-
- if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- return true;
-}
-
-static void annotateDereferenceableBytes(CallInst *CI,
- ArrayRef<unsigned> ArgNos,
- uint64_t DereferenceableBytes) {
- const Function *F = CI->getCaller();
- if (!F)
- return;
- for (unsigned ArgNo : ArgNos) {
- uint64_t DerefBytes = DereferenceableBytes;
- unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
- if (!llvm::NullPointerIsDefined(F, AS) ||
- CI->paramHasAttr(ArgNo, Attribute::NonNull))
- DerefBytes = std::max(CI->getDereferenceableOrNullBytes(
- ArgNo + AttributeList::FirstArgIndex),
- DereferenceableBytes);
-
- if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) <
- DerefBytes) {
- CI->removeParamAttr(ArgNo, Attribute::Dereferenceable);
- if (!llvm::NullPointerIsDefined(F, AS) ||
- CI->paramHasAttr(ArgNo, Attribute::NonNull))
- CI->removeParamAttr(ArgNo, Attribute::DereferenceableOrNull);
- CI->addParamAttr(ArgNo, Attribute::getWithDereferenceableBytes(
- CI->getContext(), DerefBytes));
- }
- }
-}
-
-static void annotateNonNullBasedOnAccess(CallInst *CI,
- ArrayRef<unsigned> ArgNos) {
- Function *F = CI->getCaller();
- if (!F)
- return;
-
- for (unsigned ArgNo : ArgNos) {
- if (CI->paramHasAttr(ArgNo, Attribute::NonNull))
- continue;
- unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
- if (llvm::NullPointerIsDefined(F, AS))
- continue;
-
- CI->addParamAttr(ArgNo, Attribute::NonNull);
- annotateDereferenceableBytes(CI, ArgNo, 1);
- }
-}
-
-static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> ArgNos,
- Value *Size, const DataLayout &DL) {
- if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) {
- annotateNonNullBasedOnAccess(CI, ArgNos);
- annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue());
- } else if (isKnownNonZero(Size, DL)) {
- annotateNonNullBasedOnAccess(CI, ArgNos);
- const APInt *X, *Y;
- uint64_t DerefMin = 1;
- if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) {
- DerefMin = std::min(X->getZExtValue(), Y->getZExtValue());
- annotateDereferenceableBytes(CI, ArgNos, DerefMin);
- }
- }
-}
-
-//===----------------------------------------------------------------------===//
-// String and Memory Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) {
- // Extract some information from the instruction
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- annotateNonNullBasedOnAccess(CI, {0, 1});
-
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
- --Len; // Unbias length.
-
- // Handle the simple, do-nothing case: strcat(x, "") -> x
- if (Len == 0)
- return Dst;
-
- return emitStrLenMemCpy(Src, Dst, Len, B);
-}
-
-Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
- IRBuilderBase &B) {
- // We need to find the end of the destination string. That's where the
- // memory is to be moved to. We just generate a call to strlen.
- Value *DstLen = emitStrLen(Dst, B, DL, TLI);
- if (!DstLen)
- return nullptr;
-
- // Now that we have the destination's length, we must index into the
- // destination's pointer to get the actual memcpy destination (end of
- // the string .. we're concatenating).
- Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr");
-
- // We have enough information to now generate the memcpy call to do the
- // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(
- CpyDst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
- return Dst;
-}
-
-Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) {
- // Extract some information from the instruction.
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- uint64_t Len;
- annotateNonNullBasedOnAccess(CI, 0);
- if (isKnownNonZero(Size, DL))
- annotateNonNullBasedOnAccess(CI, 1);
-
- // We don't do anything if length is not constant.
- ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size);
- if (LengthArg) {
- Len = LengthArg->getZExtValue();
- // strncat(x, c, 0) -> x
- if (!Len)
- return Dst;
- } else {
- return nullptr;
- }
-
- // See if we can get the length of the input string.
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen) {
- annotateDereferenceableBytes(CI, 1, SrcLen);
- --SrcLen; // Unbias length.
- } else {
- return nullptr;
- }
-
- // strncat(x, "", c) -> x
- if (SrcLen == 0)
- return Dst;
-
- // We don't optimize this case.
- if (Len < SrcLen)
- return nullptr;
-
- // strncat(x, s, c) -> strcat(x, s)
- // s is constant so the strcat can be optimized further.
- return emitStrLenMemCpy(Src, Dst, SrcLen, B);
-}
-
-Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- Value *SrcStr = CI->getArgOperand(0);
- annotateNonNullBasedOnAccess(CI, 0);
-
- // If the second operand is non-constant, see if we can compute the length
- // of the input string and turn this into memchr.
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!CharC) {
- uint64_t Len = GetStringLength(SrcStr);
- if (Len)
- annotateDereferenceableBytes(CI, 0, Len);
- else
- return nullptr;
- if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
- return nullptr;
-
- return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
- B, DL, TLI);
- }
-
- // Otherwise, the character is a constant, see if the first argument is
- // a string literal. If so, we can constant fold.
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str)) {
- if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
- if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI))
- return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr");
- return nullptr;
- }
-
- // Compute the offset, make sure to handle the case when we're searching for
- // zero (a weird way to spell strlen).
- size_t I = (0xFF & CharC->getSExtValue()) == 0
- ? Str.size()
- : Str.find(CharC->getSExtValue());
- if (I == StringRef::npos) // Didn't find the char. strchr returns null.
- return Constant::getNullValue(CI->getType());
-
- // strchr(s+n,c) -> gep(s+n+i,c)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr");
-}
-
-Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) {
- Value *SrcStr = CI->getArgOperand(0);
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- annotateNonNullBasedOnAccess(CI, 0);
-
- // Cannot fold anything if we're not looking for a constant.
- if (!CharC)
- return nullptr;
-
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str)) {
- // strrchr(s, 0) -> strchr(s, 0)
- if (CharC->isZero())
- return emitStrChr(SrcStr, '\0', B, TLI);
- return nullptr;
- }
-
- // Compute the offset.
- size_t I = (0xFF & CharC->getSExtValue()) == 0
- ? Str.size()
- : Str.rfind(CharC->getSExtValue());
- if (I == StringRef::npos) // Didn't find the char. Return null.
- return Constant::getNullValue(CI->getType());
-
- // strrchr(s+n,c) -> gep(s+n+i,c)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr");
-}
-
-Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) {
- Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
- if (Str1P == Str2P) // strcmp(x,x) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- StringRef Str1, Str2;
- bool HasStr1 = getConstantStringInfo(Str1P, Str1);
- bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
- // strcmp(x, y) -> cnst (if both x and y are constant strings)
- if (HasStr1 && HasStr2)
- return ConstantInt::get(CI->getType(), Str1.compare(Str2));
-
- if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
- return B.CreateNeg(B.CreateZExt(
- B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
-
- if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
- return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
- CI->getType());
-
- // strcmp(P, "x") -> memcmp(P, "x", 2)
- uint64_t Len1 = GetStringLength(Str1P);
- if (Len1)
- annotateDereferenceableBytes(CI, 0, Len1);
- uint64_t Len2 = GetStringLength(Str2P);
- if (Len2)
- annotateDereferenceableBytes(CI, 1, Len2);
-
- if (Len1 && Len2) {
- return emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- std::min(Len1, Len2)),
- B, DL, TLI);
- }
-
- // strcmp to memcmp
- if (!HasStr1 && HasStr2) {
- if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
- TLI);
- } else if (HasStr1 && !HasStr2) {
- if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
- TLI);
- }
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
- Value *Str1P = CI->getArgOperand(0);
- Value *Str2P = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- if (Str1P == Str2P) // strncmp(x,x,n) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- if (isKnownNonZero(Size, DL))
- annotateNonNullBasedOnAccess(CI, {0, 1});
- // Get the length argument if it is constant.
- uint64_t Length;
- if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
- Length = LengthArg->getZExtValue();
- else
- return nullptr;
-
- if (Length == 0) // strncmp(x,y,0) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
- return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI);
-
- StringRef Str1, Str2;
- bool HasStr1 = getConstantStringInfo(Str1P, Str1);
- bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
- // strncmp(x, y) -> cnst (if both x and y are constant strings)
- if (HasStr1 && HasStr2) {
- StringRef SubStr1 = Str1.substr(0, Length);
- StringRef SubStr2 = Str2.substr(0, Length);
- return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
- }
-
- if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
- return B.CreateNeg(B.CreateZExt(
- B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
-
- if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
- return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
- CI->getType());
-
- uint64_t Len1 = GetStringLength(Str1P);
- if (Len1)
- annotateDereferenceableBytes(CI, 0, Len1);
- uint64_t Len2 = GetStringLength(Str2P);
- if (Len2)
- annotateDereferenceableBytes(CI, 1, Len2);
-
- // strncmp to memcmp
- if (!HasStr1 && HasStr2) {
- Len2 = std::min(Len2, Length);
- if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
- TLI);
- } else if (HasStr1 && !HasStr2) {
- Len1 = std::min(Len1, Length);
- if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
- TLI);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) {
- Value *Src = CI->getArgOperand(0);
- ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen && Size) {
- annotateDereferenceableBytes(CI, 0, SrcLen);
- if (SrcLen <= Size->getZExtValue() + 1)
- return emitStrDup(Src, B, TLI);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
- if (Dst == Src) // strcpy(x,x) -> x
- return Src;
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
-
- // We have enough information to now generate the memcpy call to do the
- // copy for us. Make a memcpy to copy the nul byte with align = 1.
- CallInst *NewCI =
- B.CreateMemCpy(Dst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
- NewCI->setAttributes(CI->getAttributes());
+//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the library calls simplifier. It does not implement
+// any pass, but can be used by other passes to do simplifications.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+static cl::opt<bool>
+ EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable unsafe double to float "
+ "shrinking for math lib calls"));
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+static bool ignoreCallingConv(LibFunc Func) {
+ return Func == LibFunc_abs || Func == LibFunc_labs ||
+ Func == LibFunc_llabs || Func == LibFunc_strlen;
+}
+
+static bool isCallingConvCCompatible(CallInst *CI) {
+ switch(CI->getCallingConv()) {
+ default:
+ return false;
+ case llvm::CallingConv::C:
+ return true;
+ case llvm::CallingConv::ARM_APCS:
+ case llvm::CallingConv::ARM_AAPCS:
+ case llvm::CallingConv::ARM_AAPCS_VFP: {
+
+ // The iOS ABI diverges from the standard in some cases, so for now don't
+ // try to simplify those calls.
+ if (Triple(CI->getModule()->getTargetTriple()).isiOS())
+ return false;
+
+ auto *FuncTy = CI->getFunctionType();
+
+ if (!FuncTy->getReturnType()->isPointerTy() &&
+ !FuncTy->getReturnType()->isIntegerTy() &&
+ !FuncTy->getReturnType()->isVoidTy())
+ return false;
+
+ for (auto Param : FuncTy->params()) {
+ if (!Param->isPointerTy() && !Param->isIntegerTy())
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return true if it is only used in equality comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+ for (User *U : V->users()) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (IC->isEquality() && IC->getOperand(1) == With)
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
+static bool callHasFloatingPointArgument(const CallInst *CI) {
+ return any_of(CI->operands(), [](const Use &OI) {
+ return OI->getType()->isFloatingPointTy();
+ });
+}
+
+static bool callHasFP128Argument(const CallInst *CI) {
+ return any_of(CI->operands(), [](const Use &OI) {
+ return OI->getType()->isFP128Ty();
+ });
+}
+
+static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) {
+ if (Base < 2 || Base > 36)
+ // handle special zero base
+ if (Base != 0)
+ return nullptr;
+
+ char *End;
+ std::string nptr = Str.str();
+ errno = 0;
+ long long int Result = strtoll(nptr.c_str(), &End, Base);
+ if (errno)
+ return nullptr;
+
+ // if we assume all possible target locales are ASCII supersets,
+ // then if strtoll successfully parses a number on the host,
+ // it will also successfully parse the same way on the target
+ if (*End != '\0')
+ return nullptr;
+
+ if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result))
+ return nullptr;
+
+ return ConstantInt::get(CI->getType(), Result);
+}
+
+static bool isOnlyUsedInComparisonWithZero(Value *V) {
+ for (User *U : V->users()) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+ if (C->isNullValue())
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
+static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len,
+ const DataLayout &DL) {
+ if (!isOnlyUsedInComparisonWithZero(CI))
+ return false;
+
+ if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL))
+ return false;
+
+ if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ return true;
+}
+
+static void annotateDereferenceableBytes(CallInst *CI,
+ ArrayRef<unsigned> ArgNos,
+ uint64_t DereferenceableBytes) {
+ const Function *F = CI->getCaller();
+ if (!F)
+ return;
+ for (unsigned ArgNo : ArgNos) {
+ uint64_t DerefBytes = DereferenceableBytes;
+ unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
+ if (!llvm::NullPointerIsDefined(F, AS) ||
+ CI->paramHasAttr(ArgNo, Attribute::NonNull))
+ DerefBytes = std::max(CI->getDereferenceableOrNullBytes(
+ ArgNo + AttributeList::FirstArgIndex),
+ DereferenceableBytes);
+
+ if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) <
+ DerefBytes) {
+ CI->removeParamAttr(ArgNo, Attribute::Dereferenceable);
+ if (!llvm::NullPointerIsDefined(F, AS) ||
+ CI->paramHasAttr(ArgNo, Attribute::NonNull))
+ CI->removeParamAttr(ArgNo, Attribute::DereferenceableOrNull);
+ CI->addParamAttr(ArgNo, Attribute::getWithDereferenceableBytes(
+ CI->getContext(), DerefBytes));
+ }
+ }
+}
+
+static void annotateNonNullBasedOnAccess(CallInst *CI,
+ ArrayRef<unsigned> ArgNos) {
+ Function *F = CI->getCaller();
+ if (!F)
+ return;
+
+ for (unsigned ArgNo : ArgNos) {
+ if (CI->paramHasAttr(ArgNo, Attribute::NonNull))
+ continue;
+ unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
+ if (llvm::NullPointerIsDefined(F, AS))
+ continue;
+
+ CI->addParamAttr(ArgNo, Attribute::NonNull);
+ annotateDereferenceableBytes(CI, ArgNo, 1);
+ }
+}
+
+static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> ArgNos,
+ Value *Size, const DataLayout &DL) {
+ if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) {
+ annotateNonNullBasedOnAccess(CI, ArgNos);
+ annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue());
+ } else if (isKnownNonZero(Size, DL)) {
+ annotateNonNullBasedOnAccess(CI, ArgNos);
+ const APInt *X, *Y;
+ uint64_t DerefMin = 1;
+ if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) {
+ DerefMin = std::min(X->getZExtValue(), Y->getZExtValue());
+ annotateDereferenceableBytes(CI, ArgNos, DerefMin);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// String and Memory Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) {
+ // Extract some information from the instruction
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+ --Len; // Unbias length.
+
+ // Handle the simple, do-nothing case: strcat(x, "") -> x
+ if (Len == 0)
+ return Dst;
+
+ return emitStrLenMemCpy(Src, Dst, Len, B);
+}
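For reference, a sketch of the source-level effect of this fold (hypothetical demo; the rewrite happens on IR):

#include <cstring>
// Assumes buf is nul-terminated and has room for the appended bytes.
void strcatDemo(char *buf) {
  strcat(buf, "");     // folded away: the result is just 'buf'
  strcat(buf, "abc");  // lowered to strlen(buf) plus a 4-byte memcpy of "abc\0"
}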
+
+Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
+ IRBuilderBase &B) {
+ // We need to find the end of the destination string. That's where the
+ // memory is to be moved to. We just generate a call to strlen.
+ Value *DstLen = emitStrLen(Dst, B, DL, TLI);
+ if (!DstLen)
+ return nullptr;
+
+ // Now that we have the destination's length, we must index into the
+ // destination's pointer to get the actual memcpy destination (end of
+ // the string .. we're concatenating).
+ Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr");
+
+ // We have enough information to now generate the memcpy call to do the
+ // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
+ B.CreateMemCpy(
+ CpyDst, Align(1), Src, Align(1),
+ ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) {
+ // Extract some information from the instruction.
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ uint64_t Len;
+ annotateNonNullBasedOnAccess(CI, 0);
+ if (isKnownNonZero(Size, DL))
+ annotateNonNullBasedOnAccess(CI, 1);
+
+ // We don't do anything if length is not constant.
+ ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size);
+ if (LengthArg) {
+ Len = LengthArg->getZExtValue();
+ // strncat(x, c, 0) -> x
+ if (!Len)
+ return Dst;
+ } else {
+ return nullptr;
+ }
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen) {
+ annotateDereferenceableBytes(CI, 1, SrcLen);
+ --SrcLen; // Unbias length.
+ } else {
+ return nullptr;
+ }
+
+ // strncat(x, "", c) -> x
+ if (SrcLen == 0)
+ return Dst;
+
+ // We don't optimize this case.
+ if (Len < SrcLen)
+ return nullptr;
+
+ // strncat(x, s, c) -> strcat(x, s)
+ // s is constant so the strcat can be optimized further.
+ return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+}
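Likewise for strncat, assuming a constant bound and a constant source string (hypothetical demo):

#include <cstring>
// Assumes dst is nul-terminated and has room for the appended bytes.
char *strncatDemo(char *dst, const char *s) {
  strncat(dst, s, 0);            // folded to just 'dst'
  strncat(dst, "", 8);           // folded to just 'dst'
  return strncat(dst, "ab", 8);  // bound >= strlen("ab"), so lowered like strcat
}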
+
+Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ Value *SrcStr = CI->getArgOperand(0);
+ annotateNonNullBasedOnAccess(CI, 0);
+
+ // If the second operand is non-constant, see if we can compute the length
+ // of the input string and turn this into memchr.
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!CharC) {
+ uint64_t Len = GetStringLength(SrcStr);
+ if (Len)
+ annotateDereferenceableBytes(CI, 0, Len);
+ else
+ return nullptr;
+ if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
+ return nullptr;
+
+ return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
+ B, DL, TLI);
+ }
+
+ // Otherwise, the character is a constant, see if the first argument is
+ // a string literal. If so, we can constant fold.
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str)) {
+ if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
+ if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI))
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr");
+ return nullptr;
+ }
+
+ // Compute the offset, make sure to handle the case when we're searching for
+ // zero (a weird way to spell strlen).
+ size_t I = (0xFF & CharC->getSExtValue()) == 0
+ ? Str.size()
+ : Str.find(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. strchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // strchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr");
+}
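The strchr folds, sketched at the source level (hypothetical demo):

#include <cstring>
bool strchrDemo(const char *p, int c) {
  const char *a = strchr("hello", 'l');  // constant-folded to "hello" + 2
  const char *b = strchr(p, 0);          // rewritten as p + strlen(p)
  const char *d = strchr("hello", c);    // rewritten as memchr("hello", c, 6)
  return a && b && d;
}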
+
+Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) {
+ Value *SrcStr = CI->getArgOperand(0);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ annotateNonNullBasedOnAccess(CI, 0);
+
+ // Cannot fold anything if we're not looking for a constant.
+ if (!CharC)
+ return nullptr;
+
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str)) {
+ // strrchr(s, 0) -> strchr(s, 0)
+ if (CharC->isZero())
+ return emitStrChr(SrcStr, '\0', B, TLI);
+ return nullptr;
+ }
+
+ // Compute the offset.
+ size_t I = (0xFF & CharC->getSExtValue()) == 0
+ ? Str.size()
+ : Str.rfind(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. Return null.
+ return Constant::getNullValue(CI->getType());
+
+ // strrchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr");
+}
+
+Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) {
+ Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+ if (Str1P == Str2P) // strcmp(x,x) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strcmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2)
+ return ConstantInt::get(CI->getType(), Str1.compare(Str2));
+
+ if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+ return B.CreateNeg(B.CreateZExt(
+ B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+ return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
+ CI->getType());
+
+ // strcmp(P, "x") -> memcmp(P, "x", 2)
+ uint64_t Len1 = GetStringLength(Str1P);
+ if (Len1)
+ annotateDereferenceableBytes(CI, 0, Len1);
+ uint64_t Len2 = GetStringLength(Str2P);
+ if (Len2)
+ annotateDereferenceableBytes(CI, 1, Len2);
+
+ if (Len1 && Len2) {
+ return emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ std::min(Len1, Len2)),
+ B, DL, TLI);
+ }
+
+ // strcmp to memcmp
+ if (!HasStr1 && HasStr2) {
+ if (canTransformToMemCmp(CI, Str1P, Len2, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
+ TLI);
+ } else if (HasStr1 && !HasStr2) {
+ if (canTransformToMemCmp(CI, Str2P, Len1, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
+ TLI);
+ }
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ return nullptr;
+}
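The strcmp folds, sketched at the source level (hypothetical demo):

#include <cstring>
int strcmpDemo(const char *x) {
  int a = strcmp(x, x);        // folded to 0
  int b = strcmp("", x);       // folded to -(unsigned char)*x
  int c = strcmp(x, "");       // folded to (unsigned char)*x
  int d = strcmp("ab", "ac");  // constant-folded at compile time
  return a + b + c + d;
}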
+
+Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
+ Value *Str1P = CI->getArgOperand(0);
+ Value *Str2P = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ if (Str1P == Str2P) // strncmp(x,x,n) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ if (isKnownNonZero(Size, DL))
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ // Get the length argument if it is constant.
+ uint64_t Length;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
+ Length = LengthArg->getZExtValue();
+ else
+ return nullptr;
+
+ if (Length == 0) // strncmp(x,y,0) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+ return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strncmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2) {
+ StringRef SubStr1 = Str1.substr(0, Length);
+ StringRef SubStr2 = Str2.substr(0, Length);
+ return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+ }
+
+ if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
+ return B.CreateNeg(B.CreateZExt(
+ B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
+ return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
+ CI->getType());
+
+ uint64_t Len1 = GetStringLength(Str1P);
+ if (Len1)
+ annotateDereferenceableBytes(CI, 0, Len1);
+ uint64_t Len2 = GetStringLength(Str2P);
+ if (Len2)
+ annotateDereferenceableBytes(CI, 1, Len2);
+
+ // strncmp to memcmp
+ if (!HasStr1 && HasStr2) {
+ Len2 = std::min(Len2, Length);
+ if (canTransformToMemCmp(CI, Str1P, Len2, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
+ TLI);
+ } else if (HasStr1 && !HasStr2) {
+ Len1 = std::min(Len1, Length);
+ if (canTransformToMemCmp(CI, Str2P, Len1, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
+ TLI);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) {
+ Value *Src = CI->getArgOperand(0);
+ ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen && Size) {
+ annotateDereferenceableBytes(CI, 0, SrcLen);
+ if (SrcLen <= Size->getZExtValue() + 1)
+ return emitStrDup(Src, B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) // strcpy(x,x) -> x
+ return Src;
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Make a memcpy to copy the nul byte with align = 1.
+ CallInst *NewCI =
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return Dst;
-}
-
-Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
- if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
- Value *StrLen = emitStrLen(Src, B, DL, TLI);
- return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
- }
-
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
-
- Type *PT = Callee->getFunctionType()->getParamType(0);
- Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
- Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst,
- ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
-
- // We have enough information to now generate the memcpy call to do the
- // copy for us. Make a memcpy to copy the nul byte with align = 1.
- CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV);
- NewCI->setAttributes(CI->getAttributes());
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
+ return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
+ }
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+
+ Type *PT = Callee->getFunctionType()->getParamType(0);
+ Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
+ Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst,
+ ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Make a memcpy to copy the nul byte with align = 1.
+ CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV);
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return DstEnd;
-}
-
-Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- annotateNonNullBasedOnAccess(CI, 0);
- if (isKnownNonZero(Size, DL))
- annotateNonNullBasedOnAccess(CI, 1);
-
- uint64_t Len;
- if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
- Len = LengthArg->getZExtValue();
- else
- return nullptr;
-
- // strncpy(x, y, 0) -> x
- if (Len == 0)
- return Dst;
-
- // See if we can get the length of the input string.
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen) {
- annotateDereferenceableBytes(CI, 1, SrcLen);
- --SrcLen; // Unbias length.
- } else {
- return nullptr;
- }
-
- if (SrcLen == 0) {
- // strncpy(x, "", y) -> memset(align 1 x, '\0', y)
- CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align(1));
- AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0));
- NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
- CI->getContext(), 0, ArgAttrs));
- return Dst;
- }
-
+ return DstEnd;
+}
+
+Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullBasedOnAccess(CI, 0);
+ if (isKnownNonZero(Size, DL))
+ annotateNonNullBasedOnAccess(CI, 1);
+
+ uint64_t Len;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
+ Len = LengthArg->getZExtValue();
+ else
+ return nullptr;
+
+ // strncpy(x, y, 0) -> x
+ if (Len == 0)
+ return Dst;
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen) {
+ annotateDereferenceableBytes(CI, 1, SrcLen);
+ --SrcLen; // Unbias length.
+ } else {
+ return nullptr;
+ }
+
+ if (SrcLen == 0) {
+ // strncpy(x, "", y) -> memset(align 1 x, '\0', y)
+ CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align(1));
+ AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0));
+ NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
+ CI->getContext(), 0, ArgAttrs));
+ return Dst;
+ }
+
// strncpy(a, "a", 4) - > memcpy(a, "a\0\0\0", 4)
if (Len > SrcLen + 1) {
if (Len <= 128) {
@@ -625,1034 +625,1034 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
return nullptr;
}
}
-
- Type *PT = Callee->getFunctionType()->getParamType(0);
- // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant]
- CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(PT), Len));
- NewCI->setAttributes(CI->getAttributes());
+
+ Type *PT = Callee->getFunctionType()->getParamType(0);
+ // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant]
+ CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
+ ConstantInt::get(DL.getIntPtrType(PT), Len));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return Dst;
-}
-
-Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
- unsigned CharSize) {
- Value *Src = CI->getArgOperand(0);
-
- // Constant folding: strlen("xyz") -> 3
- if (uint64_t Len = GetStringLength(Src, CharSize))
- return ConstantInt::get(CI->getType(), Len - 1);
-
- // If s is a constant pointer pointing to a string literal, we can fold
- // strlen(s + x) to strlen(s) - x, when x is known to be in the range
- // [0, strlen(s)] or the string has a single null terminator '\0' at the end.
- // We only try to simplify strlen when the pointer s points to an array
- // of i8. Otherwise, we would need to scale the offset x before doing the
- // subtraction. This will make the optimization more complex, and it's not
- // very useful because calling strlen for a pointer of other types is
- // very uncommon.
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
- if (!isGEPBasedOnPointerToString(GEP, CharSize))
- return nullptr;
-
- ConstantDataArraySlice Slice;
- if (getConstantDataArrayInfo(GEP->getOperand(0), Slice, CharSize)) {
- uint64_t NullTermIdx;
- if (Slice.Array == nullptr) {
- NullTermIdx = 0;
- } else {
- NullTermIdx = ~((uint64_t)0);
- for (uint64_t I = 0, E = Slice.Length; I < E; ++I) {
- if (Slice.Array->getElementAsInteger(I + Slice.Offset) == 0) {
- NullTermIdx = I;
- break;
- }
- }
- // If the string does not have '\0', leave it to strlen to compute
- // its length.
- if (NullTermIdx == ~((uint64_t)0))
- return nullptr;
- }
-
- Value *Offset = GEP->getOperand(2);
- KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr);
- Known.Zero.flipAllBits();
- uint64_t ArrSize =
- cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
-
- // KnownZero's bits are flipped, so zeros in KnownZero now represent
-      // bits known to be zeros in Offset, and ones in KnownZero represent
- // bits unknown in Offset. Therefore, Offset is known to be in range
- // [0, NullTermIdx] when the flipped KnownZero is non-negative and
- // unsigned-less-than NullTermIdx.
- //
- // If Offset is not provably in the range [0, NullTermIdx], we can still
- // optimize if we can prove that the program has undefined behavior when
- // Offset is outside that range. That is the case when GEP->getOperand(0)
- // is a pointer to an object whose memory extent is NullTermIdx+1.
- if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
- (GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
- NullTermIdx == ArrSize - 1)) {
- Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
- return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
- Offset);
- }
- }
- }
-
- // strlen(x?"foo":"bars") --> x ? 3 : 4
- if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
- uint64_t LenTrue = GetStringLength(SI->getTrueValue(), CharSize);
- uint64_t LenFalse = GetStringLength(SI->getFalseValue(), CharSize);
- if (LenTrue && LenFalse) {
- ORE.emit([&]() {
- return OptimizationRemark("instcombine", "simplify-libcalls", CI)
- << "folded strlen(select) to select of constants";
- });
- return B.CreateSelect(SI->getCondition(),
- ConstantInt::get(CI->getType(), LenTrue - 1),
- ConstantInt::get(CI->getType(), LenFalse - 1));
- }
- }
-
- // strlen(x) != 0 --> *x != 0
- // strlen(x) == 0 --> *x == 0
- if (isOnlyUsedInZeroEqualityComparison(CI))
- return B.CreateZExt(B.CreateLoad(B.getIntNTy(CharSize), Src, "strlenfirst"),
- CI->getType());
-
- return nullptr;
-}
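The strlen folds, sketched at the source level (hypothetical demo):

#include <cstddef>
#include <cstring>
size_t strlenDemo(const char *p, bool cond) {
  size_t a = strlen("xyz");                  // constant-folded to 3
  size_t b = strlen(cond ? "foo" : "bars");  // folded to cond ? 3 : 4
  bool   c = strlen(p) == 0;                 // folded to *p == 0
  return a + b + c;
}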
-
-Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilderBase &B) {
- if (Value *V = optimizeStringLength(CI, B, 8))
- return V;
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilderBase &B) {
- Module &M = *CI->getModule();
- unsigned WCharSize = TLI->getWCharSize(M) * 8;
- // We cannot perform this optimization without wchar_size metadata.
- if (WCharSize == 0)
- return nullptr;
-
- return optimizeStringLength(CI, B, WCharSize);
-}
-
-Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) {
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strpbrk(s, "") -> nullptr
- // strpbrk("", s) -> nullptr
- if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t I = S1.find_first_of(S2);
- if (I == StringRef::npos) // No match.
- return Constant::getNullValue(CI->getType());
-
- return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I),
- "strpbrk");
- }
-
- // strpbrk(s, "a") -> strchr(s, 'a')
- if (HasS2 && S2.size() == 1)
- return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
-
- return nullptr;
-}
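The strpbrk folds, sketched at the source level (hypothetical demo):

#include <cstring>
const char *strpbrkDemo(const char *s) {
  const char *a = strpbrk(s, "");          // folded to a null pointer
  const char *b = strpbrk(s, "a");         // rewritten as strchr(s, 'a')
  const char *c = strpbrk("hello", "lo");  // constant-folded to "hello" + 2
  return a ? a : (b ? b : c);
}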
-
-Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilderBase &B) {
- Value *EndPtr = CI->getArgOperand(1);
- if (isa<ConstantPointerNull>(EndPtr)) {
- // With a null EndPtr, this function won't capture the main argument.
- // It would be readonly too, except that it still may write to errno.
- CI->addParamAttr(0, Attribute::NoCapture);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilderBase &B) {
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strspn(s, "") -> 0
- // strspn("", s) -> 0
- if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_not_of(S2);
- if (Pos == StringRef::npos)
- Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) {
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strcspn("", s) -> 0
- if (HasS1 && S1.empty())
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_of(S2);
- if (Pos == StringRef::npos)
- Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- // strcspn(s, "") -> strlen(s)
- if (HasS2 && S2.empty())
- return emitStrLen(CI->getArgOperand(0), B, DL, TLI);
-
- return nullptr;
-}
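The strspn/strcspn folds from the two helpers above, sketched at the source level (hypothetical demo):

#include <cstddef>
#include <cstring>
size_t spanDemo(const char *s) {
  size_t a = strspn(s, "");           // folded to 0
  size_t b = strcspn(s, "");          // rewritten as strlen(s)
  size_t c = strcspn("abcde", "dx");  // constant-folded to 3
  return a + b + c;
}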
-
-Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) {
- // fold strstr(x, x) -> x.
- if (CI->getArgOperand(0) == CI->getArgOperand(1))
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
- if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
- Value *StrLen = emitStrLen(CI->getArgOperand(1), B, DL, TLI);
- if (!StrLen)
- return nullptr;
- Value *StrNCmp = emitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
- StrLen, B, DL, TLI);
- if (!StrNCmp)
- return nullptr;
- for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) {
- ICmpInst *Old = cast<ICmpInst>(*UI++);
- Value *Cmp =
- B.CreateICmp(Old->getPredicate(), StrNCmp,
- ConstantInt::getNullValue(StrNCmp->getType()), "cmp");
- replaceAllUsesWith(Old, Cmp);
- }
- return CI;
- }
-
- // See if either input string is a constant string.
- StringRef SearchStr, ToFindStr;
- bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
- bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
-
- // fold strstr(x, "") -> x.
- if (HasStr2 && ToFindStr.empty())
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // If both strings are known, constant fold it.
- if (HasStr1 && HasStr2) {
- size_t Offset = SearchStr.find(ToFindStr);
-
- if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
- return Constant::getNullValue(CI->getType());
-
- // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
- Value *Result = castToCStr(CI->getArgOperand(0), B);
- Result =
- B.CreateConstInBoundsGEP1_64(B.getInt8Ty(), Result, Offset, "strstr");
- return B.CreateBitCast(Result, CI->getType());
- }
-
- // fold strstr(x, "y") -> strchr(x, 'y').
- if (HasStr2 && ToFindStr.size() == 1) {
- Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
- return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
- }
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- return nullptr;
-}
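The strstr folds, sketched at the source level (hypothetical demo):

#include <cstring>
const char *strstrDemo(const char *x) {
  const char *a = strstr(x, x);          // folded to x
  const char *b = strstr(x, "");         // folded to x
  const char *c = strstr("abcd", "bc");  // constant-folded to "abcd" + 1
  const char *d = strstr(x, "y");        // rewritten as strchr(x, 'y')
  return a && b && d ? c : nullptr;
}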
-
-Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) {
- if (isKnownNonZero(CI->getOperand(2), DL))
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) {
- Value *SrcStr = CI->getArgOperand(0);
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, 0, Size, DL);
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
-
- // memchr(x, y, 0) -> null
- if (LenC) {
- if (LenC->isZero())
- return Constant::getNullValue(CI->getType());
- } else {
- // From now on we need at least constant length and string.
- return nullptr;
- }
-
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false))
- return nullptr;
-
- // Truncate the string to LenC. If Str is smaller than LenC we will still only
- // scan the string, as reading past the end of it is undefined and we can just
- // return null if we don't find the char.
- Str = Str.substr(0, LenC->getZExtValue());
-
- // If the char is variable but the input str and length are not we can turn
- // this memchr call into a simple bit field test. Of course this only works
- // when the return value is only checked against null.
- //
- // It would be really nice to reuse switch lowering here but we can't change
- // the CFG at this point.
- //
- // memchr("\r\n", C, 2) != nullptr -> (1 << C & ((1 << '\r') | (1 << '\n')))
- // != 0
- // after bounds check.
- if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) {
- unsigned char Max =
- *std::max_element(reinterpret_cast<const unsigned char *>(Str.begin()),
- reinterpret_cast<const unsigned char *>(Str.end()));
-
- // Make sure the bit field we're about to create fits in a register on the
- // target.
- // FIXME: On a 64 bit architecture this prevents us from using the
- // interesting range of alpha ascii chars. We could do better by emitting
- // two bitfields or shifting the range by 64 if no lower chars are used.
- if (!DL.fitsInLegalInteger(Max + 1))
- return nullptr;
-
- // For the bit field use a power-of-2 type with at least 8 bits to avoid
- // creating unnecessary illegal types.
- unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max));
-
- // Now build the bit field.
- APInt Bitfield(Width, 0);
- for (char C : Str)
- Bitfield.setBit((unsigned char)C);
- Value *BitfieldC = B.getInt(Bitfield);
-
- // Adjust width of "C" to the bitfield width, then mask off the high bits.
- Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType());
- C = B.CreateAnd(C, B.getIntN(Width, 0xFF));
-
- // First check that the bit field access is within bounds.
- Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width),
- "memchr.bounds");
-
- // Create code that checks if the given bit is set in the field.
- Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C);
- Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits");
-
- // Finally merge both checks and cast to pointer type. The inttoptr
- // implicitly zexts the i1 to intptr type.
- return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType());
- }
-
- // Check if all arguments are constants. If so, we can constant fold.
- if (!CharC)
- return nullptr;
-
- // Compute the offset.
- size_t I = Str.find(CharC->getSExtValue() & 0xFF);
- if (I == StringRef::npos) // Didn't find the char. memchr returns null.
- return Constant::getNullValue(CI->getType());
-
- // memchr(s+n,c,l) -> gep(s+n+i,c)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr");
-}
-
-static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS,
- uint64_t Len, IRBuilderBase &B,
- const DataLayout &DL) {
- if (Len == 0) // memcmp(s1,s2,0) -> 0
- return Constant::getNullValue(CI->getType());
-
- // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
- if (Len == 1) {
- Value *LHSV =
- B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(LHS, B), "lhsc"),
- CI->getType(), "lhsv");
- Value *RHSV =
- B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(RHS, B), "rhsc"),
- CI->getType(), "rhsv");
- return B.CreateSub(LHSV, RHSV, "chardiff");
- }
-
- // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
- // TODO: The case where both inputs are constants does not need to be limited
- // to legal integers or equality comparison. See block below this.
- if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
- IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
- unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
-
- // First, see if we can fold either argument to a constant.
- Value *LHSV = nullptr;
- if (auto *LHSC = dyn_cast<Constant>(LHS)) {
- LHSC = ConstantExpr::getBitCast(LHSC, IntType->getPointerTo());
- LHSV = ConstantFoldLoadFromConstPtr(LHSC, IntType, DL);
- }
- Value *RHSV = nullptr;
- if (auto *RHSC = dyn_cast<Constant>(RHS)) {
- RHSC = ConstantExpr::getBitCast(RHSC, IntType->getPointerTo());
- RHSV = ConstantFoldLoadFromConstPtr(RHSC, IntType, DL);
- }
-
- // Don't generate unaligned loads. If either source is constant data,
- // alignment doesn't matter for that source because there is no load.
- if ((LHSV || getKnownAlignment(LHS, DL, CI) >= PrefAlignment) &&
- (RHSV || getKnownAlignment(RHS, DL, CI) >= PrefAlignment)) {
- if (!LHSV) {
- Type *LHSPtrTy =
- IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
- LHSV = B.CreateLoad(IntType, B.CreateBitCast(LHS, LHSPtrTy), "lhsv");
- }
- if (!RHSV) {
- Type *RHSPtrTy =
- IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
- RHSV = B.CreateLoad(IntType, B.CreateBitCast(RHS, RHSPtrTy), "rhsv");
- }
- return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
- }
- }
-
- // Constant folding: memcmp(x, y, Len) -> constant (all arguments are const).
- // TODO: This is limited to i8 arrays.
- StringRef LHSStr, RHSStr;
- if (getConstantStringInfo(LHS, LHSStr) &&
- getConstantStringInfo(RHS, RHSStr)) {
- // Make sure we're not reading out-of-bounds memory.
- if (Len > LHSStr.size() || Len > RHSStr.size())
- return nullptr;
- // Fold the memcmp and normalize the result. This way we get consistent
- // results across multiple platforms.
- uint64_t Ret = 0;
- int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
- if (Cmp < 0)
- Ret = -1;
- else if (Cmp > 0)
- Ret = 1;
- return ConstantInt::get(CI->getType(), Ret);
- }
-
- return nullptr;
-}
-
-// Most simplifications for memcmp also apply to bcmp.
-Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
- IRBuilderBase &B) {
- Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
-
- if (LHS == RHS) // memcmp(s,s,x) -> 0
- return Constant::getNullValue(CI->getType());
-
- annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
- // Handle constant lengths.
- ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
- if (!LenC)
- return nullptr;
-
- // memcmp(d,s,0) -> 0
- if (LenC->getZExtValue() == 0)
- return Constant::getNullValue(CI->getType());
-
- if (Value *Res =
- optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL))
- return Res;
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) {
- if (Value *V = optimizeMemCmpBCmpCommon(CI, B))
- return V;
-
- // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0
- // bcmp can be more efficient than memcmp because it only has to know that
- // there is a difference, not how different one is to the other.
- if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) {
- Value *LHS = CI->getArgOperand(0);
- Value *RHS = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- return emitBCmp(LHS, RHS, Size, B, DL, TLI);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilderBase &B) {
- return optimizeMemCmpBCmpCommon(CI, B);
-}
-
-Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
- if (isa<IntrinsicInst>(CI))
- return nullptr;
-
- // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n)
- CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align(1),
- CI->getArgOperand(1), Align(1), Size);
- NewCI->setAttributes(CI->getAttributes());
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
+ unsigned CharSize) {
+ Value *Src = CI->getArgOperand(0);
+
+ // Constant folding: strlen("xyz") -> 3
+ if (uint64_t Len = GetStringLength(Src, CharSize))
+ return ConstantInt::get(CI->getType(), Len - 1);
+
+ // If s is a constant pointer pointing to a string literal, we can fold
+ // strlen(s + x) to strlen(s) - x, when x is known to be in the range
+ // [0, strlen(s)] or the string has a single null terminator '\0' at the end.
+ // We only try to simplify strlen when the pointer s points to an array
+ // of i8. Otherwise, we would need to scale the offset x before doing the
+ // subtraction. This will make the optimization more complex, and it's not
+ // very useful because calling strlen for a pointer of other types is
+ // very uncommon.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
+ if (!isGEPBasedOnPointerToString(GEP, CharSize))
+ return nullptr;
+
+ ConstantDataArraySlice Slice;
+ if (getConstantDataArrayInfo(GEP->getOperand(0), Slice, CharSize)) {
+ uint64_t NullTermIdx;
+ if (Slice.Array == nullptr) {
+ NullTermIdx = 0;
+ } else {
+ NullTermIdx = ~((uint64_t)0);
+ for (uint64_t I = 0, E = Slice.Length; I < E; ++I) {
+ if (Slice.Array->getElementAsInteger(I + Slice.Offset) == 0) {
+ NullTermIdx = I;
+ break;
+ }
+ }
+ // If the string does not have '\0', leave it to strlen to compute
+ // its length.
+ if (NullTermIdx == ~((uint64_t)0))
+ return nullptr;
+ }
+
+ Value *Offset = GEP->getOperand(2);
+ KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr);
+ Known.Zero.flipAllBits();
+ uint64_t ArrSize =
+ cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
+
+ // KnownZero's bits are flipped, so zeros in KnownZero now represent
+      // bits known to be zeros in Offset, and ones in KnownZero represent
+ // bits unknown in Offset. Therefore, Offset is known to be in range
+ // [0, NullTermIdx] when the flipped KnownZero is non-negative and
+ // unsigned-less-than NullTermIdx.
+ //
+ // If Offset is not provably in the range [0, NullTermIdx], we can still
+ // optimize if we can prove that the program has undefined behavior when
+ // Offset is outside that range. That is the case when GEP->getOperand(0)
+ // is a pointer to an object whose memory extent is NullTermIdx+1.
+ if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
+ (GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
+ NullTermIdx == ArrSize - 1)) {
+ Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
+ return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
+ Offset);
+ }
+ }
+ }
+
+ // strlen(x?"foo":"bars") --> x ? 3 : 4
+ if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
+ uint64_t LenTrue = GetStringLength(SI->getTrueValue(), CharSize);
+ uint64_t LenFalse = GetStringLength(SI->getFalseValue(), CharSize);
+ if (LenTrue && LenFalse) {
+ ORE.emit([&]() {
+ return OptimizationRemark("instcombine", "simplify-libcalls", CI)
+ << "folded strlen(select) to select of constants";
+ });
+ return B.CreateSelect(SI->getCondition(),
+ ConstantInt::get(CI->getType(), LenTrue - 1),
+ ConstantInt::get(CI->getType(), LenFalse - 1));
+ }
+ }
+
+ // strlen(x) != 0 --> *x != 0
+ // strlen(x) == 0 --> *x == 0
+ if (isOnlyUsedInZeroEqualityComparison(CI))
+ return B.CreateZExt(B.CreateLoad(B.getIntNTy(CharSize), Src, "strlenfirst"),
+ CI->getType());
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilderBase &B) {
+ if (Value *V = optimizeStringLength(CI, B, 8))
+ return V;
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
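
A minimal source-level sketch of the strlen folds above: constant folding, the select-of-constants case, and the zero-equality rewrite. The strings and the runtime condition are chosen arbitrarily for illustration.

#include <cassert>
#include <cstring>

int main(int argc, char **) {
  // strlen("xyz") -> 3 (constant folding).
  assert(std::strlen("xyz") == 3);

  // strlen(x ? "foo" : "bars") -> x ? 3 : 4 (select of two constants).
  const char *S = argc > 1 ? "foo" : "bars";
  assert(std::strlen(S) == (argc > 1 ? 3u : 4u));

  // strlen(x) == 0 -> *x == 0: only the first byte needs to be inspected.
  assert((std::strlen(S) == 0) == (S[0] == '\0'));
  return 0;
}
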
+
+Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilderBase &B) {
+ Module &M = *CI->getModule();
+ unsigned WCharSize = TLI->getWCharSize(M) * 8;
+ // We cannot perform this optimization without wchar_size metadata.
+ if (WCharSize == 0)
+ return nullptr;
+
+ return optimizeStringLength(CI, B, WCharSize);
+}
+
+Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strpbrk(s, "") -> nullptr
+ // strpbrk("", s) -> nullptr
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t I = S1.find_first_of(S2);
+ if (I == StringRef::npos) // No match.
+ return Constant::getNullValue(CI->getType());
+
+ return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I),
+ "strpbrk");
+ }
+
+ // strpbrk(s, "a") -> strchr(s, 'a')
+ if (HasS2 && S2.size() == 1)
+ return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
+
+ return nullptr;
+}
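
The strpbrk folds above, restated at the C level with made-up strings; the last line shows the single-character case that becomes a strchr call.

#include <cassert>
#include <cstring>

int main() {
  const char *S = "hello";
  assert(std::strpbrk(S, "") == nullptr);               // strpbrk(s, "") -> null
  assert(std::strpbrk("", S) == nullptr);               // strpbrk("", s) -> null
  assert(std::strpbrk(S, "lo") == S + 2);               // constant fold: first of {'l','o'}
  assert(std::strpbrk(S, "e") == std::strchr(S, 'e'));  // strpbrk(s, "a") -> strchr(s, 'a')
  return 0;
}
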
+
+Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilderBase &B) {
+ Value *EndPtr = CI->getArgOperand(1);
+ if (isa<ConstantPointerNull>(EndPtr)) {
+ // With a null EndPtr, this function won't capture the main argument.
+ // It would be readonly too, except that it still may write to errno.
+ CI->addParamAttr(0, Attribute::NoCapture);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilderBase &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strspn(s, "") -> 0
+ // strspn("", s) -> 0
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_not_of(S2);
+ if (Pos == StringRef::npos)
+ Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strcspn("", s) -> 0
+ if (HasS1 && S1.empty())
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_of(S2);
+ if (Pos == StringRef::npos)
+ Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ // strcspn(s, "") -> strlen(s)
+ if (HasS2 && S2.empty())
+ return emitStrLen(CI->getArgOperand(0), B, DL, TLI);
+
+ return nullptr;
+}
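
A small check of the strspn/strcspn folds handled by the previous two functions; the inputs are arbitrary.

#include <cassert>
#include <cstring>

int main() {
  assert(std::strspn("abc", "") == 0);                    // strspn(s, "") -> 0
  assert(std::strspn("", "abc") == 0);                    // strspn("", s) -> 0
  assert(std::strspn("aabbcc", "ab") == 4);               // constant fold
  assert(std::strcspn("", "abc") == 0);                   // strcspn("", s) -> 0
  assert(std::strcspn("xyzab", "ab") == 3);               // constant fold
  assert(std::strcspn("xyz", "") == std::strlen("xyz"));  // strcspn(s, "") -> strlen(s)
  return 0;
}
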
+
+Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) {
+ // fold strstr(x, x) -> x.
+ if (CI->getArgOperand(0) == CI->getArgOperand(1))
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+ if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+ Value *StrLen = emitStrLen(CI->getArgOperand(1), B, DL, TLI);
+ if (!StrLen)
+ return nullptr;
+ Value *StrNCmp = emitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+ StrLen, B, DL, TLI);
+ if (!StrNCmp)
+ return nullptr;
+ for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) {
+ ICmpInst *Old = cast<ICmpInst>(*UI++);
+ Value *Cmp =
+ B.CreateICmp(Old->getPredicate(), StrNCmp,
+ ConstantInt::getNullValue(StrNCmp->getType()), "cmp");
+ replaceAllUsesWith(Old, Cmp);
+ }
+ return CI;
+ }
+
+ // See if either input string is a constant string.
+ StringRef SearchStr, ToFindStr;
+ bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+ bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+ // fold strstr(x, "") -> x.
+ if (HasStr2 && ToFindStr.empty())
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // If both strings are known, constant fold it.
+ if (HasStr1 && HasStr2) {
+ size_t Offset = SearchStr.find(ToFindStr);
+
+ if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+ return Constant::getNullValue(CI->getType());
+
+ // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+ Value *Result = castToCStr(CI->getArgOperand(0), B);
+ Result =
+ B.CreateConstInBoundsGEP1_64(B.getInt8Ty(), Result, Offset, "strstr");
+ return B.CreateBitCast(Result, CI->getType());
+ }
+
+ // fold strstr(x, "y") -> strchr(x, 'y').
+ if (HasStr2 && ToFindStr.size() == 1) {
+ Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
+ return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
+ }
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ return nullptr;
+}
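
The strstr folds above at the source level with arbitrary strings; the strncmp rewrite for equality-only users is not shown because it is not observable from C.

#include <cassert>
#include <cstring>

int main() {
  const char *S = "abcd";
  assert(std::strstr(S, S) == S);                      // strstr(x, x) -> x
  assert(std::strstr(S, "") == S);                     // strstr(x, "") -> x
  assert(std::strstr(S, "bc") == S + 1);               // strstr("abcd", "bc") -> gep(S, 1)
  assert(std::strstr("foo", "bar") == nullptr);        // no match -> null
  assert(std::strstr(S, "c") == std::strchr(S, 'c'));  // strstr(x, "y") -> strchr(x, 'y')
  return 0;
}
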
+
+Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) {
+ if (isKnownNonZero(CI->getOperand(2), DL))
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) {
+ Value *SrcStr = CI->getArgOperand(0);
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, 0, Size, DL);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
+
+ // memchr(x, y, 0) -> null
+ if (LenC) {
+ if (LenC->isZero())
+ return Constant::getNullValue(CI->getType());
+ } else {
+ // From now on we need at least constant length and string.
+ return nullptr;
+ }
+
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false))
+ return nullptr;
+
+ // Truncate the string to LenC. If Str is smaller than LenC we will still only
+ // scan the string, as reading past the end of it is undefined and we can just
+ // return null if we don't find the char.
+ Str = Str.substr(0, LenC->getZExtValue());
+
+ // If the char is variable but the input str and length are not we can turn
+ // this memchr call into a simple bit field test. Of course this only works
+ // when the return value is only checked against null.
+ //
+ // It would be really nice to reuse switch lowering here but we can't change
+ // the CFG at this point.
+ //
+ // memchr("\r\n", C, 2) != nullptr -> (1 << C & ((1 << '\r') | (1 << '\n')))
+ // != 0
+ // after bounds check.
+ if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) {
+ unsigned char Max =
+ *std::max_element(reinterpret_cast<const unsigned char *>(Str.begin()),
+ reinterpret_cast<const unsigned char *>(Str.end()));
+
+ // Make sure the bit field we're about to create fits in a register on the
+ // target.
+ // FIXME: On a 64 bit architecture this prevents us from using the
+ // interesting range of alpha ascii chars. We could do better by emitting
+ // two bitfields or shifting the range by 64 if no lower chars are used.
+ if (!DL.fitsInLegalInteger(Max + 1))
+ return nullptr;
+
+ // For the bit field use a power-of-2 type with at least 8 bits to avoid
+ // creating unnecessary illegal types.
+ unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max));
+
+ // Now build the bit field.
+ APInt Bitfield(Width, 0);
+ for (char C : Str)
+ Bitfield.setBit((unsigned char)C);
+ Value *BitfieldC = B.getInt(Bitfield);
+
+ // Adjust width of "C" to the bitfield width, then mask off the high bits.
+ Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType());
+ C = B.CreateAnd(C, B.getIntN(Width, 0xFF));
+
+ // First check that the bit field access is within bounds.
+ Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width),
+ "memchr.bounds");
+
+ // Create code that checks if the given bit is set in the field.
+ Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C);
+ Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits");
+
+ // Finally merge both checks and cast to pointer type. The inttoptr
+ // implicitly zexts the i1 to intptr type.
+ return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType());
+ }
+
+ // Check if all arguments are constants. If so, we can constant fold.
+ if (!CharC)
+ return nullptr;
+
+ // Compute the offset.
+ size_t I = Str.find(CharC->getSExtValue() & 0xFF);
+ if (I == StringRef::npos) // Didn't find the char. memchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // memchr(s+n,c,l) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr");
+}
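
A sketch of the bit-field rewrite above for memchr("\r\n", c, 2) != nullptr with a variable c. The 64-bit mask and the helper name are illustrative only; the pass sizes the bit field from the string (a 16-bit field in this case) and emits the same bounds check before the shift.

#include <cassert>
#include <cstdint>
#include <cstring>

static bool memchrAsBitTest(unsigned C) {
  const uint64_t Mask = (1ull << '\r') | (1ull << '\n');  // bits 13 and 10
  return C < 64 && ((1ull << C) & Mask) != 0;             // bounds check, then bit test
}

int main() {
  const char *S = "\r\n";
  for (unsigned C = 0; C < 256; ++C)
    assert((std::memchr(S, (int)C, 2) != nullptr) == memchrAsBitTest(C));
  return 0;
}
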
+
+static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS,
+ uint64_t Len, IRBuilderBase &B,
+ const DataLayout &DL) {
+ if (Len == 0) // memcmp(s1,s2,0) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+ if (Len == 1) {
+ Value *LHSV =
+ B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(LHS, B), "lhsc"),
+ CI->getType(), "lhsv");
+ Value *RHSV =
+ B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(RHS, B), "rhsc"),
+ CI->getType(), "rhsv");
+ return B.CreateSub(LHSV, RHSV, "chardiff");
+ }
+
+ // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
+ // TODO: The case where both inputs are constants does not need to be limited
+ // to legal integers or equality comparison. See block below this.
+ if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
+ IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
+ unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
+
+ // First, see if we can fold either argument to a constant.
+ Value *LHSV = nullptr;
+ if (auto *LHSC = dyn_cast<Constant>(LHS)) {
+ LHSC = ConstantExpr::getBitCast(LHSC, IntType->getPointerTo());
+ LHSV = ConstantFoldLoadFromConstPtr(LHSC, IntType, DL);
+ }
+ Value *RHSV = nullptr;
+ if (auto *RHSC = dyn_cast<Constant>(RHS)) {
+ RHSC = ConstantExpr::getBitCast(RHSC, IntType->getPointerTo());
+ RHSV = ConstantFoldLoadFromConstPtr(RHSC, IntType, DL);
+ }
+
+ // Don't generate unaligned loads. If either source is constant data,
+ // alignment doesn't matter for that source because there is no load.
+ if ((LHSV || getKnownAlignment(LHS, DL, CI) >= PrefAlignment) &&
+ (RHSV || getKnownAlignment(RHS, DL, CI) >= PrefAlignment)) {
+ if (!LHSV) {
+ Type *LHSPtrTy =
+ IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
+ LHSV = B.CreateLoad(IntType, B.CreateBitCast(LHS, LHSPtrTy), "lhsv");
+ }
+ if (!RHSV) {
+ Type *RHSPtrTy =
+ IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
+ RHSV = B.CreateLoad(IntType, B.CreateBitCast(RHS, RHSPtrTy), "rhsv");
+ }
+ return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
+ }
+ }
+
+ // Constant folding: memcmp(x, y, Len) -> constant (all arguments are const).
+ // TODO: This is limited to i8 arrays.
+ StringRef LHSStr, RHSStr;
+ if (getConstantStringInfo(LHS, LHSStr) &&
+ getConstantStringInfo(RHS, RHSStr)) {
+ // Make sure we're not reading out-of-bounds memory.
+ if (Len > LHSStr.size() || Len > RHSStr.size())
+ return nullptr;
+ // Fold the memcmp and normalize the result. This way we get consistent
+ // results across multiple platforms.
+ uint64_t Ret = 0;
+ int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ if (Cmp < 0)
+ Ret = -1;
+ else if (Cmp > 0)
+ Ret = 1;
+ return ConstantInt::get(CI->getType(), Ret);
+ }
+
+ return nullptr;
+}
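
The constant-size memcmp cases above, restated with arbitrary operands: length 0 folds to 0, length 1 reduces to a byte difference, an equality-only user becomes a single wide compare, and full constant folding normalizes the result to -1, 0, or 1.

#include <cassert>
#include <cstring>

int main() {
  const char A[] = "abcd", B[] = "abcf";
  assert(std::memcmp(A, B, 0) == 0);                      // memcmp(s1, s2, 0) -> 0
  assert((std::memcmp(A, B, 1) == 0) == (A[0] == B[0]));  // length 1 -> byte difference
  assert((std::memcmp(A, B, 4) == 0) == false);           // equality-only user: wide icmp
  int Cmp = std::memcmp(A, B, 4);
  int Normalized = (Cmp > 0) - (Cmp < 0);                 // what the constant fold materializes
  assert(Normalized == -1);                               // 'd' < 'f'
  return 0;
}
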
+
+// Most simplifications for memcmp also apply to bcmp.
+Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
+ IRBuilderBase &B) {
+ Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+
+ if (LHS == RHS) // memcmp(s,s,x) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
+ // Handle constant lengths.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
+ if (!LenC)
+ return nullptr;
+
+ // memcmp(d,s,0) -> 0
+ if (LenC->getZExtValue() == 0)
+ return Constant::getNullValue(CI->getType());
+
+ if (Value *Res =
+ optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL))
+ return Res;
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) {
+ if (Value *V = optimizeMemCmpBCmpCommon(CI, B))
+ return V;
+
+ // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0
+ // bcmp can be more efficient than memcmp because it only has to know that
+ // there is a difference, not how different one is to the other.
+ if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) {
+ Value *LHS = CI->getArgOperand(0);
+ Value *RHS = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ return emitBCmp(LHS, RHS, Size, B, DL, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilderBase &B) {
+ return optimizeMemCmpBCmpCommon(CI, B);
+}
+
+Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
+ if (isa<IntrinsicInst>(CI))
+ return nullptr;
+
+ // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n)
+ CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(1), Align(1), Size);
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
-}
-
-Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- ConstantInt *N = dyn_cast<ConstantInt>(CI->getArgOperand(3));
- StringRef SrcStr;
- if (CI->use_empty() && Dst == Src)
- return Dst;
- // memccpy(d, s, c, 0) -> nullptr
- if (N) {
- if (N->isNullValue())
- return Constant::getNullValue(CI->getType());
- if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0,
- /*TrimAtNul=*/false) ||
- !StopChar)
- return nullptr;
- } else {
- return nullptr;
- }
-
- // Wrap arg 'c' of type int to char
- size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF);
- if (Pos == StringRef::npos) {
- if (N->getZExtValue() <= SrcStr.size()) {
- B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3));
- return Constant::getNullValue(CI->getType());
- }
- return nullptr;
- }
-
- Value *NewN =
- ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue()));
- // memccpy -> llvm.memcpy
- B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN);
- return Pos + 1 <= N->getZExtValue()
- ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN)
- : Constant::getNullValue(CI->getType());
-}
-
-Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Dst = CI->getArgOperand(0);
- Value *N = CI->getArgOperand(2);
- // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n
- CallInst *NewCI =
- B.CreateMemCpy(Dst, Align(1), CI->getArgOperand(1), Align(1), N);
+ return CI->getArgOperand(0);
+}
+
+Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ ConstantInt *N = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+ StringRef SrcStr;
+ if (CI->use_empty() && Dst == Src)
+ return Dst;
+ // memccpy(d, s, c, 0) -> nullptr
+ if (N) {
+ if (N->isNullValue())
+ return Constant::getNullValue(CI->getType());
+ if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0,
+ /*TrimAtNul=*/false) ||
+ !StopChar)
+ return nullptr;
+ } else {
+ return nullptr;
+ }
+
+ // Wrap arg 'c' of type int to char
+ size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF);
+ if (Pos == StringRef::npos) {
+ if (N->getZExtValue() <= SrcStr.size()) {
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3));
+ return Constant::getNullValue(CI->getType());
+ }
+ return nullptr;
+ }
+
+ Value *NewN =
+ ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue()));
+ // memccpy -> llvm.memcpy
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN);
+ return Pos + 1 <= N->getZExtValue()
+ ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN)
+ : Constant::getNullValue(CI->getType());
+}
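
What the memccpy fold relies on, assuming a POSIX/C23 environment where memccpy is declared: the call copies at most n bytes, stops after the first occurrence of the stop character, and returns either a pointer one past that byte or null. The buffers and stop character below are arbitrary.

#include <cassert>
#include <cstring>  // memccpy is POSIX/C23; assumed to be declared here

int main() {
  const char Src[] = "hello world";
  char Dst[16] = {};
  void *End = memccpy(Dst, Src, ' ', sizeof Dst);
  assert(End == Dst + 6);                      // "hello " copied; result = Dst + Pos + 1
  assert(std::memcmp(Dst, "hello ", 6) == 0);

  char Dst2[4];
  assert(memccpy(Dst2, "abcd", 'z', 4) == nullptr);  // stop char not found -> memcpy + null
  return 0;
}
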
+
+Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Dst = CI->getArgOperand(0);
+ Value *N = CI->getArgOperand(2);
+ // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n
+ CallInst *NewCI =
+ B.CreateMemCpy(Dst, Align(1), CI->getArgOperand(1), Align(1), N);
// Propagate attributes, but memcpy has no return value, so make sure that
// any return attributes are compliant.
// TODO: Attach return value attributes to the 1st operand to preserve them?
- NewCI->setAttributes(CI->getAttributes());
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N);
-}
-
-Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) {
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
- if (isa<IntrinsicInst>(CI))
- return nullptr;
-
- // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n)
- CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align(1),
- CI->getArgOperand(1), Align(1), Size);
- NewCI->setAttributes(CI->getAttributes());
+ return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N);
+}
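
A sketch of what the mempcpy fold produces, written with plain memcpy since mempcpy itself is a glibc/POSIX extension; the helper name and buffer contents are made up.

#include <cassert>
#include <cstring>

static char *mempcpyLike(char *Dst, const char *Src, std::size_t N) {
  std::memcpy(Dst, Src, N);  // llvm.memcpy(align 1 Dst, align 1 Src, N)
  return Dst + N;            // mempcpy's result: one past the last byte written
}

int main() {
  char Buf[8] = {};
  char *End = mempcpyLike(Buf, "abc", 3);
  assert(End == Buf + 3 && std::memcmp(Buf, "abc", 3) == 0);
  return 0;
}
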
+
+Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) {
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
+ if (isa<IntrinsicInst>(CI))
+ return nullptr;
+
+ // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n)
+ CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(1), Align(1), Size);
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
-}
-
-/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
-Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) {
- // This has to be a memset of zeros (bzero).
- auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
- if (!FillValue || FillValue->getZExtValue() != 0)
- return nullptr;
-
- // TODO: We should handle the case where the malloc has more than one use.
- // This is necessary to optimize common patterns such as when the result of
- // the malloc is checked against null or when a memset intrinsic is used in
- // place of a memset library call.
- auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0));
- if (!Malloc || !Malloc->hasOneUse())
- return nullptr;
-
- // Is the inner call really malloc()?
- Function *InnerCallee = Malloc->getCalledFunction();
- if (!InnerCallee)
- return nullptr;
-
- LibFunc Func;
- if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
- Func != LibFunc_malloc)
- return nullptr;
-
- // The memset must cover the same number of bytes that are malloc'd.
- if (Memset->getArgOperand(2) != Malloc->getArgOperand(0))
- return nullptr;
-
- // Replace the malloc with a calloc. We need the data layout to know what the
- // actual size of a 'size_t' parameter is.
- B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator());
- const DataLayout &DL = Malloc->getModule()->getDataLayout();
- IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
- if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
- Malloc->getArgOperand(0),
- Malloc->getAttributes(), B, *TLI)) {
- substituteInParent(Malloc, Calloc);
- return Calloc;
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) {
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, 0, Size, DL);
- if (isa<IntrinsicInst>(CI))
- return nullptr;
-
- if (auto *Calloc = foldMallocMemset(CI, B))
- return Calloc;
-
- // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
- Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1));
- NewCI->setAttributes(CI->getAttributes());
+ return CI->getArgOperand(0);
+}
+
+/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
+Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) {
+ // This has to be a memset of zeros (bzero).
+ auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
+ if (!FillValue || FillValue->getZExtValue() != 0)
+ return nullptr;
+
+ // TODO: We should handle the case where the malloc has more than one use.
+ // This is necessary to optimize common patterns such as when the result of
+ // the malloc is checked against null or when a memset intrinsic is used in
+ // place of a memset library call.
+ auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0));
+ if (!Malloc || !Malloc->hasOneUse())
+ return nullptr;
+
+ // Is the inner call really malloc()?
+ Function *InnerCallee = Malloc->getCalledFunction();
+ if (!InnerCallee)
+ return nullptr;
+
+ LibFunc Func;
+ if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
+ Func != LibFunc_malloc)
+ return nullptr;
+
+ // The memset must cover the same number of bytes that are malloc'd.
+ if (Memset->getArgOperand(2) != Malloc->getArgOperand(0))
+ return nullptr;
+
+ // Replace the malloc with a calloc. We need the data layout to know what the
+ // actual size of a 'size_t' parameter is.
+ B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator());
+ const DataLayout &DL = Malloc->getModule()->getDataLayout();
+ IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
+ if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
+ Malloc->getArgOperand(0),
+ Malloc->getAttributes(), B, *TLI)) {
+ substituteInParent(Malloc, Calloc);
+ return Calloc;
+ }
+
+ return nullptr;
+}
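
The source-level shape of the fold above, with an arbitrary size. The null check is kept only to avoid passing a null pointer to memset; as the TODO in the code notes, the extra use it introduces means this checked pattern itself is not yet folded.

#include <cstdlib>
#include <cstring>

int main() {
  const std::size_t N = 32;
  void *P = std::malloc(N);
  if (P)
    std::memset(P, 0, N);       // memset(malloc(n), 0, n) ...
  void *Q = std::calloc(1, N);  // ... behaves like calloc(1, n)
  std::free(P);
  std::free(Q);
  return 0;
}
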
+
+Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) {
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, 0, Size, DL);
+ if (isa<IntrinsicInst>(CI))
+ return nullptr;
+
+ if (auto *Calloc = foldMallocMemset(CI, B))
+ return Calloc;
+
+ // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
-}
-
-Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
- if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
- return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
-
- return nullptr;
-}
-
-//===----------------------------------------------------------------------===//
-// Math Library Optimizations
-//===----------------------------------------------------------------------===//
-
-// Replace a libcall \p CI with a call to intrinsic \p IID
-static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B,
- Intrinsic::ID IID) {
- // Propagate fast-math flags from the existing call to the new call.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(CI->getFastMathFlags());
-
- Module *M = CI->getModule();
- Value *V = CI->getArgOperand(0);
- Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
- CallInst *NewCall = B.CreateCall(F, V);
- NewCall->takeName(CI);
- return NewCall;
-}
-
-/// Return a variant of Val with float type.
-/// Currently this works in two cases: If Val is an FPExtension of a float
-/// value to something bigger, simply return the operand.
-/// If Val is a ConstantFP but can be converted to a float ConstantFP without
-/// loss of precision do so.
-static Value *valueHasFloatPrecision(Value *Val) {
- if (FPExtInst *Cast = dyn_cast<FPExtInst>(Val)) {
- Value *Op = Cast->getOperand(0);
- if (Op->getType()->isFloatTy())
- return Op;
- }
- if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) {
- APFloat F = Const->getValueAPF();
- bool losesInfo;
- (void)F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
- &losesInfo);
- if (!losesInfo)
- return ConstantFP::get(Const->getContext(), F);
- }
- return nullptr;
-}
-
-/// Shrink double -> float functions.
-static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B,
- bool isBinary, bool isPrecise = false) {
- Function *CalleeFn = CI->getCalledFunction();
- if (!CI->getType()->isDoubleTy() || !CalleeFn)
- return nullptr;
-
- // If not all the uses of the function are converted to float, then bail out.
- // This matters if the precision of the result is more important than the
- // precision of the arguments.
- if (isPrecise)
- for (User *U : CI->users()) {
- FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
- if (!Cast || !Cast->getType()->isFloatTy())
- return nullptr;
- }
-
- // If this is something like 'g((double) float)', convert to 'gf(float)'.
- Value *V[2];
- V[0] = valueHasFloatPrecision(CI->getArgOperand(0));
- V[1] = isBinary ? valueHasFloatPrecision(CI->getArgOperand(1)) : nullptr;
- if (!V[0] || (isBinary && !V[1]))
- return nullptr;
-
- // If call isn't an intrinsic, check that it isn't within a function with the
- // same name as the float version of this call, otherwise the result is an
- // infinite loop. For example, from MinGW-w64:
- //
- // float expf(float val) { return (float) exp((double) val); }
- StringRef CalleeName = CalleeFn->getName();
- bool IsIntrinsic = CalleeFn->isIntrinsic();
- if (!IsIntrinsic) {
- StringRef CallerName = CI->getFunction()->getName();
- if (!CallerName.empty() && CallerName.back() == 'f' &&
- CallerName.size() == (CalleeName.size() + 1) &&
- CallerName.startswith(CalleeName))
- return nullptr;
- }
-
- // Propagate the math semantics from the current function to the new function.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(CI->getFastMathFlags());
-
- // g((double) float) -> (double) gf(float)
- Value *R;
- if (IsIntrinsic) {
- Module *M = CI->getModule();
- Intrinsic::ID IID = CalleeFn->getIntrinsicID();
- Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy());
- R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]);
- } else {
- AttributeList CalleeAttrs = CalleeFn->getAttributes();
- R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs)
- : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs);
- }
- return B.CreateFPExt(R, B.getDoubleTy());
-}
-
-/// Shrink double -> float for unary functions.
-static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B,
- bool isPrecise = false) {
- return optimizeDoubleFP(CI, B, false, isPrecise);
-}
-
-/// Shrink double -> float for binary functions.
-static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B,
- bool isPrecise = false) {
- return optimizeDoubleFP(CI, B, true, isPrecise);
-}
-
-// cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z)))
-Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
- if (!CI->isFast())
- return nullptr;
-
- // Propagate fast-math flags from the existing call to new instructions.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(CI->getFastMathFlags());
-
- Value *Real, *Imag;
- if (CI->getNumArgOperands() == 1) {
- Value *Op = CI->getArgOperand(0);
- assert(Op->getType()->isArrayTy() && "Unexpected signature for cabs!");
- Real = B.CreateExtractValue(Op, 0, "real");
- Imag = B.CreateExtractValue(Op, 1, "imag");
- } else {
- assert(CI->getNumArgOperands() == 2 && "Unexpected signature for cabs!");
- Real = CI->getArgOperand(0);
- Imag = CI->getArgOperand(1);
- }
-
- Value *RealReal = B.CreateFMul(Real, Real);
- Value *ImagImag = B.CreateFMul(Imag, Imag);
-
- Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt,
- CI->getType());
- return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs");
-}
-
-static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
- IRBuilderBase &B) {
- if (!isa<FPMathOperator>(Call))
- return nullptr;
-
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(Call->getFastMathFlags());
-
- // TODO: Can this be shared to also handle LLVM intrinsics?
- Value *X;
- switch (Func) {
- case LibFunc_sin:
- case LibFunc_sinf:
- case LibFunc_sinl:
- case LibFunc_tan:
- case LibFunc_tanf:
- case LibFunc_tanl:
- // sin(-X) --> -sin(X)
- // tan(-X) --> -tan(X)
- if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X)))))
- return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X));
- break;
- case LibFunc_cos:
- case LibFunc_cosf:
- case LibFunc_cosl:
- // cos(-X) --> cos(X)
- if (match(Call->getArgOperand(0), m_FNeg(m_Value(X))))
- return B.CreateCall(Call->getCalledFunction(), X, "cos");
- break;
- default:
- break;
- }
- return nullptr;
-}
-
-static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) {
- // Multiplications calculated using Addition Chains.
- // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
-
- assert(Exp != 0 && "Incorrect exponent 0 not handled");
-
- if (InnerChain[Exp])
- return InnerChain[Exp];
-
- static const unsigned AddChain[33][2] = {
- {0, 0}, // Unused.
- {0, 0}, // Unused (base case = pow1).
- {1, 1}, // Unused (pre-computed).
- {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4},
- {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7},
- {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10},
- {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13},
- {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16},
- };
-
- InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B),
- getPow(InnerChain, AddChain[Exp][1], B));
- return InnerChain[Exp];
-}
-
-// Return a properly extended 32-bit integer if the operation is an itofp.
-static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B) {
- if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) {
- Value *Op = cast<Instruction>(I2F)->getOperand(0);
- // Make sure that the exponent fits inside an int32_t,
- // thus avoiding any range issues that FP has not.
- unsigned BitWidth = Op->getType()->getPrimitiveSizeInBits();
- if (BitWidth < 32 ||
- (BitWidth == 32 && isa<SIToFPInst>(I2F)))
- return isa<SIToFPInst>(I2F) ? B.CreateSExt(Op, B.getInt32Ty())
- : B.CreateZExt(Op, B.getInt32Ty());
- }
-
- return nullptr;
-}
-
-/// Use exp{,2}(x * y) for pow(exp{,2}(x), y);
-/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x);
-/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x).
-Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
- Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
- AttributeList Attrs; // Attributes are only meaningful on the original call
- Module *Mod = Pow->getModule();
- Type *Ty = Pow->getType();
- bool Ignored;
-
- // Evaluate special cases related to a nested function as the base.
-
- // pow(exp(x), y) -> exp(x * y)
- // pow(exp2(x), y) -> exp2(x * y)
- // If exp{,2}() is used only once, it is better to fold two transcendental
- // math functions into one. If used again, exp{,2}() would still have to be
- // called with the original argument, then keep both original transcendental
- // functions. However, this transformation is only safe with fully relaxed
- // math semantics, since, besides rounding differences, it changes overflow
- // and underflow behavior quite dramatically. For example:
- // pow(exp(1000), 0.001) = pow(inf, 0.001) = inf
- // Whereas:
- // exp(1000 * 0.001) = exp(1)
- // TODO: Loosen the requirement for fully relaxed math semantics.
- // TODO: Handle exp10() when more targets have it available.
- CallInst *BaseFn = dyn_cast<CallInst>(Base);
- if (BaseFn && BaseFn->hasOneUse() && BaseFn->isFast() && Pow->isFast()) {
- LibFunc LibFn;
-
- Function *CalleeFn = BaseFn->getCalledFunction();
- if (CalleeFn &&
- TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) {
- StringRef ExpName;
- Intrinsic::ID ID;
- Value *ExpFn;
- LibFunc LibFnFloat, LibFnDouble, LibFnLongDouble;
-
- switch (LibFn) {
- default:
- return nullptr;
- case LibFunc_expf: case LibFunc_exp: case LibFunc_expl:
- ExpName = TLI->getName(LibFunc_exp);
- ID = Intrinsic::exp;
- LibFnFloat = LibFunc_expf;
- LibFnDouble = LibFunc_exp;
- LibFnLongDouble = LibFunc_expl;
- break;
- case LibFunc_exp2f: case LibFunc_exp2: case LibFunc_exp2l:
- ExpName = TLI->getName(LibFunc_exp2);
- ID = Intrinsic::exp2;
- LibFnFloat = LibFunc_exp2f;
- LibFnDouble = LibFunc_exp2;
- LibFnLongDouble = LibFunc_exp2l;
- break;
- }
-
- // Create new exp{,2}() with the product as its argument.
- Value *FMul = B.CreateFMul(BaseFn->getArgOperand(0), Expo, "mul");
- ExpFn = BaseFn->doesNotAccessMemory()
- ? B.CreateCall(Intrinsic::getDeclaration(Mod, ID, Ty),
- FMul, ExpName)
- : emitUnaryFloatFnCall(FMul, TLI, LibFnDouble, LibFnFloat,
- LibFnLongDouble, B,
- BaseFn->getAttributes());
-
- // Since the new exp{,2}() is different from the original one, dead code
- // elimination cannot be trusted to remove it, since it may have side
- // effects (e.g., errno). When the only consumer for the original
- // exp{,2}() is pow(), then it has to be explicitly erased.
- substituteInParent(BaseFn, ExpFn);
- return ExpFn;
- }
- }
-
- // Evaluate special cases related to a constant base.
-
- const APFloat *BaseF;
- if (!match(Pow->getArgOperand(0), m_APFloat(BaseF)))
- return nullptr;
-
- // pow(2.0, itofp(x)) -> ldexp(1.0, x)
- if (match(Base, m_SpecificFP(2.0)) &&
- (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
- hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
- if (Value *ExpoI = getIntToFPVal(Expo, B))
- return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI,
- LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
- B, Attrs);
- }
-
- // pow(2.0 ** n, x) -> exp2(n * x)
- if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) {
- APFloat BaseR = APFloat(1.0);
- BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored);
- BaseR = BaseR / *BaseF;
- bool IsInteger = BaseF->isInteger(), IsReciprocal = BaseR.isInteger();
- const APFloat *NF = IsReciprocal ? &BaseR : BaseF;
- APSInt NI(64, false);
- if ((IsInteger || IsReciprocal) &&
- NF->convertToInteger(NI, APFloat::rmTowardZero, &Ignored) ==
- APFloat::opOK &&
- NI > 1 && NI.isPowerOf2()) {
- double N = NI.logBase2() * (IsReciprocal ? -1.0 : 1.0);
- Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul");
- if (Pow->doesNotAccessMemory())
- return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
- FMul, "exp2");
- else
- return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
- LibFunc_exp2l, B, Attrs);
- }
- }
-
- // pow(10.0, x) -> exp10(x)
- // TODO: There is no exp10() intrinsic yet, but some day there shall be one.
- if (match(Base, m_SpecificFP(10.0)) &&
- hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
- return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
- LibFunc_exp10l, B, Attrs);
-
- // pow(x, y) -> exp2(log2(x) * y)
- if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() &&
- !BaseF->isNegative()) {
- // pow(1, inf) is defined to be 1 but exp2(log2(1) * inf) evaluates to NaN.
- // Luckily optimizePow has already handled the x == 1 case.
- assert(!match(Base, m_FPOne()) &&
- "pow(1.0, y) should have been simplified earlier!");
-
- Value *Log = nullptr;
- if (Ty->isFloatTy())
- Log = ConstantFP::get(Ty, std::log2(BaseF->convertToFloat()));
- else if (Ty->isDoubleTy())
- Log = ConstantFP::get(Ty, std::log2(BaseF->convertToDouble()));
-
- if (Log) {
- Value *FMul = B.CreateFMul(Log, Expo, "mul");
- if (Pow->doesNotAccessMemory())
- return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
- FMul, "exp2");
- else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l))
- return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
- LibFunc_exp2l, B, Attrs);
- }
- }
-
- return nullptr;
-}
-
-static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
- Module *M, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- // If errno is never set, then use the intrinsic for sqrt().
- if (NoErrno) {
- Function *SqrtFn =
- Intrinsic::getDeclaration(M, Intrinsic::sqrt, V->getType());
- return B.CreateCall(SqrtFn, V, "sqrt");
- }
-
- // Otherwise, use the libcall for sqrt().
- if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl))
- // TODO: We also should check that the target can in fact lower the sqrt()
- // libcall. We currently have no way to ask this question, so we ask if
- // the target has a sqrt() libcall, which is not exactly the same.
- return emitUnaryFloatFnCall(V, TLI, LibFunc_sqrt, LibFunc_sqrtf,
- LibFunc_sqrtl, B, Attrs);
-
- return nullptr;
-}
-
-/// Use square root in place of pow(x, +/-0.5).
-Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
- Value *Sqrt, *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
- AttributeList Attrs; // Attributes are only meaningful on the original call
- Module *Mod = Pow->getModule();
- Type *Ty = Pow->getType();
-
- const APFloat *ExpoF;
- if (!match(Expo, m_APFloat(ExpoF)) ||
- (!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)))
- return nullptr;
-
- // Converting pow(X, -0.5) to 1/sqrt(X) may introduce an extra rounding step,
- // so that requires fast-math-flags (afn or reassoc).
- if (ExpoF->isNegative() && (!Pow->hasApproxFunc() && !Pow->hasAllowReassoc()))
- return nullptr;
-
+ return CI->getArgOperand(0);
+}
+
+Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
+ if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
+ return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
+
+ return nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+// Replace a libcall \p CI with a call to intrinsic \p IID
+static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B,
+ Intrinsic::ID IID) {
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ Module *M = CI->getModule();
+ Value *V = CI->getArgOperand(0);
+ Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
+ CallInst *NewCall = B.CreateCall(F, V);
+ NewCall->takeName(CI);
+ return NewCall;
+}
+
+/// Return a variant of Val with float type.
+/// Currently this works in two cases: If Val is an FPExtension of a float
+/// value to something bigger, simply return the operand.
+/// If Val is a ConstantFP but can be converted to a float ConstantFP without
+/// loss of precision do so.
+static Value *valueHasFloatPrecision(Value *Val) {
+ if (FPExtInst *Cast = dyn_cast<FPExtInst>(Val)) {
+ Value *Op = Cast->getOperand(0);
+ if (Op->getType()->isFloatTy())
+ return Op;
+ }
+ if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) {
+ APFloat F = Const->getValueAPF();
+ bool losesInfo;
+ (void)F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ if (!losesInfo)
+ return ConstantFP::get(Const->getContext(), F);
+ }
+ return nullptr;
+}
+
+/// Shrink double -> float functions.
+static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B,
+ bool isBinary, bool isPrecise = false) {
+ Function *CalleeFn = CI->getCalledFunction();
+ if (!CI->getType()->isDoubleTy() || !CalleeFn)
+ return nullptr;
+
+ // If not all the uses of the function are converted to float, then bail out.
+ // This matters if the precision of the result is more important than the
+ // precision of the arguments.
+ if (isPrecise)
+ for (User *U : CI->users()) {
+ FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
+ if (!Cast || !Cast->getType()->isFloatTy())
+ return nullptr;
+ }
+
+ // If this is something like 'g((double) float)', convert to 'gf(float)'.
+ Value *V[2];
+ V[0] = valueHasFloatPrecision(CI->getArgOperand(0));
+ V[1] = isBinary ? valueHasFloatPrecision(CI->getArgOperand(1)) : nullptr;
+ if (!V[0] || (isBinary && !V[1]))
+ return nullptr;
+
+ // If call isn't an intrinsic, check that it isn't within a function with the
+ // same name as the float version of this call, otherwise the result is an
+ // infinite loop. For example, from MinGW-w64:
+ //
+ // float expf(float val) { return (float) exp((double) val); }
+ StringRef CalleeName = CalleeFn->getName();
+ bool IsIntrinsic = CalleeFn->isIntrinsic();
+ if (!IsIntrinsic) {
+ StringRef CallerName = CI->getFunction()->getName();
+ if (!CallerName.empty() && CallerName.back() == 'f' &&
+ CallerName.size() == (CalleeName.size() + 1) &&
+ CallerName.startswith(CalleeName))
+ return nullptr;
+ }
+
+ // Propagate the math semantics from the current function to the new function.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // g((double) float) -> (double) gf(float)
+ Value *R;
+ if (IsIntrinsic) {
+ Module *M = CI->getModule();
+ Intrinsic::ID IID = CalleeFn->getIntrinsicID();
+ Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy());
+ R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]);
+ } else {
+ AttributeList CalleeAttrs = CalleeFn->getAttributes();
+ R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs)
+ : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs);
+ }
+ return B.CreateFPExt(R, B.getDoubleTy());
+}
+
+/// Shrink double -> float for unary functions.
+static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B,
+ bool isPrecise = false) {
+ return optimizeDoubleFP(CI, B, false, isPrecise);
+}
+
+/// Shrink double -> float for binary functions.
+static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B,
+ bool isPrecise = false) {
+ return optimizeDoubleFP(CI, B, true, isPrecise);
+}
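
A sketch of the double-to-float shrinking above: a double call whose argument was widened from float and whose result is truncated back to float becomes a call to the float variant. The function names are illustrative, and the two forms may differ in the last ulp, which is why the transform is gated on the precision checks above.

#include <cmath>

float viaDouble(float X) { return (float)std::exp((double)X); }  // g((double)float), then fptrunc
float shrunk(float X) { return std::exp(X); }                    // float overload, i.e. expf

int main() {
  volatile float Diff = viaDouble(1.5f) - shrunk(1.5f);  // typically 0, at most ~1 ulp
  (void)Diff;
  return 0;
}
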
+
+// cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z)))
+Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
+ if (!CI->isFast())
+ return nullptr;
+
+ // Propagate fast-math flags from the existing call to new instructions.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ Value *Real, *Imag;
+ if (CI->getNumArgOperands() == 1) {
+ Value *Op = CI->getArgOperand(0);
+ assert(Op->getType()->isArrayTy() && "Unexpected signature for cabs!");
+ Real = B.CreateExtractValue(Op, 0, "real");
+ Imag = B.CreateExtractValue(Op, 1, "imag");
+ } else {
+ assert(CI->getNumArgOperands() == 2 && "Unexpected signature for cabs!");
+ Real = CI->getArgOperand(0);
+ Imag = CI->getArgOperand(1);
+ }
+
+ Value *RealReal = B.CreateFMul(Real, Real);
+ Value *ImagImag = B.CreateFMul(Imag, Imag);
+
+ Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt,
+ CI->getType());
+ return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs");
+}
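
The cabs fold at the source level with an arbitrary operand. The plain sqrt form is only legal under fast-math because it can overflow or underflow where a hypot-style cabs would not.

#include <cassert>
#include <cmath>
#include <complex>

int main() {
  std::complex<double> Z(3.0, 4.0);
  double Folded = std::sqrt(Z.real() * Z.real() + Z.imag() * Z.imag());
  assert(std::fabs(Folded - std::abs(Z)) < 1e-12);  // both are 5.0 here
  return 0;
}
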
+
+static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
+ IRBuilderBase &B) {
+ if (!isa<FPMathOperator>(Call))
+ return nullptr;
+
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(Call->getFastMathFlags());
+
+ // TODO: Can this be shared to also handle LLVM intrinsics?
+ Value *X;
+ switch (Func) {
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinl:
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
+ // sin(-X) --> -sin(X)
+ // tan(-X) --> -tan(X)
+ if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X)))))
+ return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X));
+ break;
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosl:
+ // cos(-X) --> cos(X)
+ if (match(Call->getArgOperand(0), m_FNeg(m_Value(X))))
+ return B.CreateCall(Call->getCalledFunction(), X, "cos");
+ break;
+ default:
+ break;
+ }
+ return nullptr;
+}
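
The reflections above, checked numerically for an arbitrary argument; a small tolerance is used since libm results are not required to be bit-identical under negation.

#include <cassert>
#include <cmath>

int main() {
  const double X = 0.75;
  assert(std::fabs(std::sin(-X) + std::sin(X)) < 1e-15);  // sin(-X) --> -sin(X)
  assert(std::fabs(std::tan(-X) + std::tan(X)) < 1e-15);  // tan(-X) --> -tan(X)
  assert(std::fabs(std::cos(-X) - std::cos(X)) < 1e-15);  // cos(-X) -->  cos(X)
  return 0;
}
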
+
+static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) {
+ // Multiplications calculated using Addition Chains.
+ // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
+
+ assert(Exp != 0 && "Incorrect exponent 0 not handled");
+
+ if (InnerChain[Exp])
+ return InnerChain[Exp];
+
+ static const unsigned AddChain[33][2] = {
+ {0, 0}, // Unused.
+ {0, 0}, // Unused (base case = pow1).
+ {1, 1}, // Unused (pre-computed).
+ {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4},
+ {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7},
+ {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10},
+ {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13},
+ {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16},
+ };
+
+ InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B),
+ getPow(InnerChain, AddChain[Exp][1], B));
+ return InnerChain[Exp];
+}
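
A worked instance of the addition-chain table above for an arbitrarily chosen exponent of 13: AddChain[13] = {4, 9}, AddChain[9] = {1, 8}, AddChain[8] = {4, 4}, AddChain[4] = {2, 2}, so x^13 takes five multiplies instead of the twelve a naive repeated multiply would need.

#include <cassert>

static double pow13(double X) {
  double X2 = X * X;    // x^2
  double X4 = X2 * X2;  // x^4  = AddChain[4]  = {2, 2}
  double X8 = X4 * X4;  // x^8  = AddChain[8]  = {4, 4}
  double X9 = X * X8;   // x^9  = AddChain[9]  = {1, 8}
  return X4 * X9;       // x^13 = AddChain[13] = {4, 9}
}

int main() {
  assert(pow13(2.0) == 8192.0);     // 2^13
  assert(pow13(3.0) == 1594323.0);  // 3^13, exact in double
  return 0;
}
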
+
+// Return a properly extended 32-bit integer if the operation is an itofp.
+static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B) {
+ if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) {
+ Value *Op = cast<Instruction>(I2F)->getOperand(0);
+    // Make sure that the exponent fits inside an int32_t,
+    // thus avoiding any range issues that FP does not have.
+ unsigned BitWidth = Op->getType()->getPrimitiveSizeInBits();
+ if (BitWidth < 32 ||
+ (BitWidth == 32 && isa<SIToFPInst>(I2F)))
+ return isa<SIToFPInst>(I2F) ? B.CreateSExt(Op, B.getInt32Ty())
+ : B.CreateZExt(Op, B.getInt32Ty());
+ }
+
+ return nullptr;
+}
+
+/// Use exp{,2}(x * y) for pow(exp{,2}(x), y);
+/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x);
+/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x).
+Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
+ Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ Module *Mod = Pow->getModule();
+ Type *Ty = Pow->getType();
+ bool Ignored;
+
+ // Evaluate special cases related to a nested function as the base.
+
+ // pow(exp(x), y) -> exp(x * y)
+ // pow(exp2(x), y) -> exp2(x * y)
+ // If exp{,2}() is used only once, it is better to fold two transcendental
+ // math functions into one. If it is used again, exp{,2}() still has to be
+ // called with the original argument, so both calls are kept. However,
+ // this transformation is only safe with fully relaxed
+ // math semantics, since, besides rounding differences, it changes overflow
+ // and underflow behavior quite dramatically. For example:
+ // pow(exp(1000), 0.001) = pow(inf, 0.001) = inf
+ // Whereas:
+ // exp(1000 * 0.001) = exp(1)
+ // TODO: Loosen the requirement for fully relaxed math semantics.
+ // TODO: Handle exp10() when more targets have it available.
+ CallInst *BaseFn = dyn_cast<CallInst>(Base);
+ if (BaseFn && BaseFn->hasOneUse() && BaseFn->isFast() && Pow->isFast()) {
+ LibFunc LibFn;
+
+ Function *CalleeFn = BaseFn->getCalledFunction();
+ if (CalleeFn &&
+ TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) {
+ StringRef ExpName;
+ Intrinsic::ID ID;
+ Value *ExpFn;
+ LibFunc LibFnFloat, LibFnDouble, LibFnLongDouble;
+
+ switch (LibFn) {
+ default:
+ return nullptr;
+ case LibFunc_expf: case LibFunc_exp: case LibFunc_expl:
+ ExpName = TLI->getName(LibFunc_exp);
+ ID = Intrinsic::exp;
+ LibFnFloat = LibFunc_expf;
+ LibFnDouble = LibFunc_exp;
+ LibFnLongDouble = LibFunc_expl;
+ break;
+ case LibFunc_exp2f: case LibFunc_exp2: case LibFunc_exp2l:
+ ExpName = TLI->getName(LibFunc_exp2);
+ ID = Intrinsic::exp2;
+ LibFnFloat = LibFunc_exp2f;
+ LibFnDouble = LibFunc_exp2;
+ LibFnLongDouble = LibFunc_exp2l;
+ break;
+ }
+
+ // Create new exp{,2}() with the product as its argument.
+ Value *FMul = B.CreateFMul(BaseFn->getArgOperand(0), Expo, "mul");
+ ExpFn = BaseFn->doesNotAccessMemory()
+ ? B.CreateCall(Intrinsic::getDeclaration(Mod, ID, Ty),
+ FMul, ExpName)
+ : emitUnaryFloatFnCall(FMul, TLI, LibFnDouble, LibFnFloat,
+ LibFnLongDouble, B,
+ BaseFn->getAttributes());
+
+ // Since the new exp{,2}() is different from the original one, dead code
+ // elimination cannot be trusted to remove it, since it may have side
+ // effects (e.g., errno). When the only consumer for the original
+ // exp{,2}() is pow(), then it has to be explicitly erased.
+ substituteInParent(BaseFn, ExpFn);
+ return ExpFn;
+ }
+ }
+
+ // Evaluate special cases related to a constant base.
+
+ const APFloat *BaseF;
+ if (!match(Pow->getArgOperand(0), m_APFloat(BaseF)))
+ return nullptr;
+
+ // pow(2.0, itofp(x)) -> ldexp(1.0, x)
+ if (match(Base, m_SpecificFP(2.0)) &&
+ (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
+ hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
+ if (Value *ExpoI = getIntToFPVal(Expo, B))
+ return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI,
+ LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
+ B, Attrs);
+ }
+
+ // pow(2.0 ** n, x) -> exp2(n * x)
+ if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) {
+ APFloat BaseR = APFloat(1.0);
+ BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored);
+ BaseR = BaseR / *BaseF;
+ bool IsInteger = BaseF->isInteger(), IsReciprocal = BaseR.isInteger();
+ const APFloat *NF = IsReciprocal ? &BaseR : BaseF;
+ APSInt NI(64, false);
+ if ((IsInteger || IsReciprocal) &&
+ NF->convertToInteger(NI, APFloat::rmTowardZero, &Ignored) ==
+ APFloat::opOK &&
+ NI > 1 && NI.isPowerOf2()) {
+ double N = NI.logBase2() * (IsReciprocal ? -1.0 : 1.0);
+ Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul");
+ if (Pow->doesNotAccessMemory())
+ return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
+ FMul, "exp2");
+ else
+ return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
+ LibFunc_exp2l, B, Attrs);
+ }
+ }
+
+ // pow(10.0, x) -> exp10(x)
+ // TODO: There is no exp10() intrinsic yet, but some day there shall be one.
+ if (match(Base, m_SpecificFP(10.0)) &&
+ hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
+ return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
+ LibFunc_exp10l, B, Attrs);
+
+ // pow(x, y) -> exp2(log2(x) * y)
+ if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() &&
+ !BaseF->isNegative()) {
+ // pow(1, inf) is defined to be 1 but exp2(log2(1) * inf) evaluates to NaN.
+ // Luckily optimizePow has already handled the x == 1 case.
+ assert(!match(Base, m_FPOne()) &&
+ "pow(1.0, y) should have been simplified earlier!");
+
+ Value *Log = nullptr;
+ if (Ty->isFloatTy())
+ Log = ConstantFP::get(Ty, std::log2(BaseF->convertToFloat()));
+ else if (Ty->isDoubleTy())
+ Log = ConstantFP::get(Ty, std::log2(BaseF->convertToDouble()));
+
+ if (Log) {
+ Value *FMul = B.CreateFMul(Log, Expo, "mul");
+ if (Pow->doesNotAccessMemory())
+ return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
+ FMul, "exp2");
+ else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l))
+ return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
+ LibFunc_exp2l, B, Attrs);
+ }
+ }
+
+ return nullptr;
+}
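+
+ // For example, when both calls are fully fast and exp() has a single
+ // use, pow(exp(x), y) becomes exp(x * y). With a constant base,
+ // pow(8.0, x) becomes exp2(3.0 * x) since 8 == 2^3, pow(0.125, x)
+ // becomes exp2(-3.0 * x), and pow(10.0, x) becomes exp10(x), each
+ // subject to the target actually providing the library function.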
+
+static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
+ Module *M, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ // If errno is never set, then use the intrinsic for sqrt().
+ if (NoErrno) {
+ Function *SqrtFn =
+ Intrinsic::getDeclaration(M, Intrinsic::sqrt, V->getType());
+ return B.CreateCall(SqrtFn, V, "sqrt");
+ }
+
+ // Otherwise, use the libcall for sqrt().
+ if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl))
+ // TODO: We also should check that the target can in fact lower the sqrt()
+ // libcall. We currently have no way to ask this question, so we ask if
+ // the target has a sqrt() libcall, which is not exactly the same.
+ return emitUnaryFloatFnCall(V, TLI, LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl, B, Attrs);
+
+ return nullptr;
+}
+
+/// Use square root in place of pow(x, +/-0.5).
+Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
+ Value *Sqrt, *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ Module *Mod = Pow->getModule();
+ Type *Ty = Pow->getType();
+
+ const APFloat *ExpoF;
+ if (!match(Expo, m_APFloat(ExpoF)) ||
+ (!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)))
+ return nullptr;
+
+ // Converting pow(X, -0.5) to 1/sqrt(X) may introduce an extra rounding step,
+ // so that requires fast-math-flags (afn or reassoc).
+ if (ExpoF->isNegative() && (!Pow->hasApproxFunc() && !Pow->hasAllowReassoc()))
+ return nullptr;
+
// If we have a pow() library call (accesses memory) and we can't guarantee
// that the base is not an infinity, give up:
// pow(-Inf, 0.5) is optionally required to have a result of +Inf (not setting
@@ -1661,867 +1661,867 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
!isKnownNeverInfinity(Base, TLI))
return nullptr;
- Sqrt = getSqrtCall(Base, Attrs, Pow->doesNotAccessMemory(), Mod, B, TLI);
- if (!Sqrt)
- return nullptr;
-
- // Handle signed zero base by expanding to fabs(sqrt(x)).
- if (!Pow->hasNoSignedZeros()) {
- Function *FAbsFn = Intrinsic::getDeclaration(Mod, Intrinsic::fabs, Ty);
- Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs");
- }
-
- // Handle non-finite base by expanding to
- // (x == -infinity ? +infinity : sqrt(x)).
- if (!Pow->hasNoInfs()) {
- Value *PosInf = ConstantFP::getInfinity(Ty),
- *NegInf = ConstantFP::getInfinity(Ty, true);
- Value *FCmp = B.CreateFCmpOEQ(Base, NegInf, "isinf");
- Sqrt = B.CreateSelect(FCmp, PosInf, Sqrt);
- }
-
- // If the exponent is negative, then get the reciprocal.
- if (ExpoF->isNegative())
- Sqrt = B.CreateFDiv(ConstantFP::get(Ty, 1.0), Sqrt, "reciprocal");
-
- return Sqrt;
-}
-
-static Value *createPowWithIntegerExponent(Value *Base, Value *Expo, Module *M,
- IRBuilderBase &B) {
- Value *Args[] = {Base, Expo};
- Function *F = Intrinsic::getDeclaration(M, Intrinsic::powi, Base->getType());
- return B.CreateCall(F, Args);
-}
-
-Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
- Value *Base = Pow->getArgOperand(0);
- Value *Expo = Pow->getArgOperand(1);
- Function *Callee = Pow->getCalledFunction();
- StringRef Name = Callee->getName();
- Type *Ty = Pow->getType();
- Module *M = Pow->getModule();
- Value *Shrunk = nullptr;
- bool AllowApprox = Pow->hasApproxFunc();
- bool Ignored;
-
- // Propagate the math semantics from the call to any created instructions.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(Pow->getFastMathFlags());
-
- // Shrink pow() to powf() if the arguments are single precision,
- // unless the result is expected to be double precision.
- if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) &&
- hasFloatVersion(Name))
- Shrunk = optimizeBinaryDoubleFP(Pow, B, true);
-
- // Evaluate special cases related to the base.
-
- // pow(1.0, x) -> 1.0
- if (match(Base, m_FPOne()))
- return Base;
-
- if (Value *Exp = replacePowWithExp(Pow, B))
- return Exp;
-
- // Evaluate special cases related to the exponent.
-
- // pow(x, -1.0) -> 1.0 / x
- if (match(Expo, m_SpecificFP(-1.0)))
- return B.CreateFDiv(ConstantFP::get(Ty, 1.0), Base, "reciprocal");
-
- // pow(x, +/-0.0) -> 1.0
- if (match(Expo, m_AnyZeroFP()))
- return ConstantFP::get(Ty, 1.0);
-
- // pow(x, 1.0) -> x
- if (match(Expo, m_FPOne()))
- return Base;
-
- // pow(x, 2.0) -> x * x
- if (match(Expo, m_SpecificFP(2.0)))
- return B.CreateFMul(Base, Base, "square");
-
- if (Value *Sqrt = replacePowWithSqrt(Pow, B))
- return Sqrt;
-
- // pow(x, n) -> x * x * x * ...
- const APFloat *ExpoF;
+ Sqrt = getSqrtCall(Base, Attrs, Pow->doesNotAccessMemory(), Mod, B, TLI);
+ if (!Sqrt)
+ return nullptr;
+
+ // Handle signed zero base by expanding to fabs(sqrt(x)).
+ if (!Pow->hasNoSignedZeros()) {
+ Function *FAbsFn = Intrinsic::getDeclaration(Mod, Intrinsic::fabs, Ty);
+ Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs");
+ }
+
+ // Handle non-finite base by expanding to
+ // (x == -infinity ? +infinity : sqrt(x)).
+ if (!Pow->hasNoInfs()) {
+ Value *PosInf = ConstantFP::getInfinity(Ty),
+ *NegInf = ConstantFP::getInfinity(Ty, true);
+ Value *FCmp = B.CreateFCmpOEQ(Base, NegInf, "isinf");
+ Sqrt = B.CreateSelect(FCmp, PosInf, Sqrt);
+ }
+
+ // If the exponent is negative, then get the reciprocal.
+ if (ExpoF->isNegative())
+ Sqrt = B.CreateFDiv(ConstantFP::get(Ty, 1.0), Sqrt, "reciprocal");
+
+ return Sqrt;
+}
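+
+ // For example, pow(x, 0.5) on a pow() that may set errno with a base not
+ // known to be non-infinite is left alone; otherwise it becomes sqrt(x),
+ // wrapped as fabs(sqrt(x)) without nsz, and further guarded as
+ // (x == -inf ? +inf : fabs(sqrt(x))) without ninf. pow(x, -0.5) takes
+ // the reciprocal of that result.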
+
+static Value *createPowWithIntegerExponent(Value *Base, Value *Expo, Module *M,
+ IRBuilderBase &B) {
+ Value *Args[] = {Base, Expo};
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::powi, Base->getType());
+ return B.CreateCall(F, Args);
+}
+
+Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
+ Value *Base = Pow->getArgOperand(0);
+ Value *Expo = Pow->getArgOperand(1);
+ Function *Callee = Pow->getCalledFunction();
+ StringRef Name = Callee->getName();
+ Type *Ty = Pow->getType();
+ Module *M = Pow->getModule();
+ Value *Shrunk = nullptr;
+ bool AllowApprox = Pow->hasApproxFunc();
+ bool Ignored;
+
+ // Propagate the math semantics from the call to any created instructions.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(Pow->getFastMathFlags());
+
+ // Shrink pow() to powf() if the arguments are single precision,
+ // unless the result is expected to be double precision.
+ if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) &&
+ hasFloatVersion(Name))
+ Shrunk = optimizeBinaryDoubleFP(Pow, B, true);
+
+ // Evaluate special cases related to the base.
+
+ // pow(1.0, x) -> 1.0
+ if (match(Base, m_FPOne()))
+ return Base;
+
+ if (Value *Exp = replacePowWithExp(Pow, B))
+ return Exp;
+
+ // Evaluate special cases related to the exponent.
+
+ // pow(x, -1.0) -> 1.0 / x
+ if (match(Expo, m_SpecificFP(-1.0)))
+ return B.CreateFDiv(ConstantFP::get(Ty, 1.0), Base, "reciprocal");
+
+ // pow(x, +/-0.0) -> 1.0
+ if (match(Expo, m_AnyZeroFP()))
+ return ConstantFP::get(Ty, 1.0);
+
+ // pow(x, 1.0) -> x
+ if (match(Expo, m_FPOne()))
+ return Base;
+
+ // pow(x, 2.0) -> x * x
+ if (match(Expo, m_SpecificFP(2.0)))
+ return B.CreateFMul(Base, Base, "square");
+
+ if (Value *Sqrt = replacePowWithSqrt(Pow, B))
+ return Sqrt;
+
+ // pow(x, n) -> x * x * x * ...
+ const APFloat *ExpoF;
if (AllowApprox && match(Expo, m_APFloat(ExpoF)) &&
!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)) {
- // We limit to a max of 7 multiplications, thus the maximum exponent is 32.
- // If the exponent is an integer+0.5 we generate a call to sqrt and an
- // additional fmul.
- // TODO: This whole transformation should be backend specific (e.g. some
- // backends might prefer libcalls or the limit for the exponent might
- // be different) and it should also consider optimizing for size.
- APFloat LimF(ExpoF->getSemantics(), 33),
- ExpoA(abs(*ExpoF));
- if (ExpoA < LimF) {
- // This transformation applies to integer or integer+0.5 exponents only.
- // For integer+0.5, we create a sqrt(Base) call.
- Value *Sqrt = nullptr;
- if (!ExpoA.isInteger()) {
- APFloat Expo2 = ExpoA;
- // To check if ExpoA is an integer + 0.5, we add it to itself. If there
- // is no floating point exception and the result is an integer, then
- // ExpoA == integer + 0.5
- if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK)
- return nullptr;
-
- if (!Expo2.isInteger())
- return nullptr;
-
- Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(),
- Pow->doesNotAccessMemory(), M, B, TLI);
+ // We limit to a max of 7 multiplications, thus the maximum exponent is 32.
+ // If the exponent is an integer+0.5 we generate a call to sqrt and an
+ // additional fmul.
+ // TODO: This whole transformation should be backend specific (e.g. some
+ // backends might prefer libcalls or the limit for the exponent might
+ // be different) and it should also consider optimizing for size.
+ APFloat LimF(ExpoF->getSemantics(), 33),
+ ExpoA(abs(*ExpoF));
+ if (ExpoA < LimF) {
+ // This transformation applies to integer or integer+0.5 exponents only.
+ // For integer+0.5, we create a sqrt(Base) call.
+ Value *Sqrt = nullptr;
+ if (!ExpoA.isInteger()) {
+ APFloat Expo2 = ExpoA;
+ // To check if ExpoA is an integer + 0.5, we add it to itself. If there
+ // is no floating point exception and the result is an integer, then
+ // ExpoA == integer + 0.5
+ if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK)
+ return nullptr;
+
+ if (!Expo2.isInteger())
+ return nullptr;
+
+ Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(),
+ Pow->doesNotAccessMemory(), M, B, TLI);
if (!Sqrt)
return nullptr;
- }
-
- // We will memoize intermediate products of the Addition Chain.
- Value *InnerChain[33] = {nullptr};
- InnerChain[1] = Base;
- InnerChain[2] = B.CreateFMul(Base, Base, "square");
-
- // We cannot readily convert a non-double type (like float) to a double.
- // So we first convert it to something which could be converted to double.
- ExpoA.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &Ignored);
- Value *FMul = getPow(InnerChain, ExpoA.convertToDouble(), B);
-
- // Expand pow(x, y+0.5) to pow(x, y) * sqrt(x).
- if (Sqrt)
- FMul = B.CreateFMul(FMul, Sqrt);
-
- // If the exponent is negative, then get the reciprocal.
- if (ExpoF->isNegative())
- FMul = B.CreateFDiv(ConstantFP::get(Ty, 1.0), FMul, "reciprocal");
-
- return FMul;
- }
-
- APSInt IntExpo(32, /*isUnsigned=*/false);
- // powf(x, n) -> powi(x, n) if n is a constant signed integer value
- if (ExpoF->isInteger() &&
- ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) ==
- APFloat::opOK) {
- return createPowWithIntegerExponent(
- Base, ConstantInt::get(B.getInt32Ty(), IntExpo), M, B);
- }
- }
-
- // powf(x, itofp(y)) -> powi(x, y)
- if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) {
- if (Value *ExpoI = getIntToFPVal(Expo, B))
- return createPowWithIntegerExponent(Base, ExpoI, M, B);
- }
-
- return Shrunk;
-}
-
-Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- AttributeList Attrs; // Attributes are only meaningful on the original call
- StringRef Name = Callee->getName();
- Value *Ret = nullptr;
- if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) &&
- hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
-
- Type *Ty = CI->getType();
- Value *Op = CI->getArgOperand(0);
-
- // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
- // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
- if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) &&
- hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
- if (Value *Exp = getIntToFPVal(Op, B))
- return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI,
- LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
- B, Attrs);
- }
-
- return Ret;
-}
-
-Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
- // If we can shrink the call to a float function rather than a double
- // function, do that first.
- Function *Callee = CI->getCalledFunction();
- StringRef Name = Callee->getName();
- if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
- if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
- return Ret;
-
- // The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to
- // the intrinsics for improved optimization (for example, vectorization).
- // No-signed-zeros is implied by the definitions of fmax/fmin themselves.
- // From the C standard draft WG14/N1256:
- // "Ideally, fmax would be sensitive to the sign of zero, for example
- // fmax(-0.0, +0.0) would return +0; however, implementation in software
- // might be impractical."
- IRBuilderBase::FastMathFlagGuard Guard(B);
- FastMathFlags FMF = CI->getFastMathFlags();
- FMF.setNoSignedZeros();
- B.setFastMathFlags(FMF);
-
- Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum
- : Intrinsic::maxnum;
- Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType());
- return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) });
-}
-
-Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
- Function *LogFn = Log->getCalledFunction();
- AttributeList Attrs; // Attributes are only meaningful on the original call
- StringRef LogNm = LogFn->getName();
- Intrinsic::ID LogID = LogFn->getIntrinsicID();
- Module *Mod = Log->getModule();
- Type *Ty = Log->getType();
- Value *Ret = nullptr;
-
- if (UnsafeFPShrink && hasFloatVersion(LogNm))
- Ret = optimizeUnaryDoubleFP(Log, B, true);
-
- // The earlier call must also be 'fast' in order to do these transforms.
- CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0));
- if (!Log->isFast() || !Arg || !Arg->isFast() || !Arg->hasOneUse())
- return Ret;
-
- LibFunc LogLb, ExpLb, Exp2Lb, Exp10Lb, PowLb;
-
- // This is only applicable to log(), log2(), log10().
- if (TLI->getLibFunc(LogNm, LogLb))
- switch (LogLb) {
- case LibFunc_logf:
- LogID = Intrinsic::log;
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- break;
- case LibFunc_log:
- LogID = Intrinsic::log;
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- break;
- case LibFunc_logl:
- LogID = Intrinsic::log;
- ExpLb = LibFunc_expl;
- Exp2Lb = LibFunc_exp2l;
- Exp10Lb = LibFunc_exp10l;
- PowLb = LibFunc_powl;
- break;
- case LibFunc_log2f:
- LogID = Intrinsic::log2;
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- break;
- case LibFunc_log2:
- LogID = Intrinsic::log2;
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- break;
- case LibFunc_log2l:
- LogID = Intrinsic::log2;
- ExpLb = LibFunc_expl;
- Exp2Lb = LibFunc_exp2l;
- Exp10Lb = LibFunc_exp10l;
- PowLb = LibFunc_powl;
- break;
- case LibFunc_log10f:
- LogID = Intrinsic::log10;
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- break;
- case LibFunc_log10:
- LogID = Intrinsic::log10;
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- break;
- case LibFunc_log10l:
- LogID = Intrinsic::log10;
- ExpLb = LibFunc_expl;
- Exp2Lb = LibFunc_exp2l;
- Exp10Lb = LibFunc_exp10l;
- PowLb = LibFunc_powl;
- break;
- default:
- return Ret;
- }
- else if (LogID == Intrinsic::log || LogID == Intrinsic::log2 ||
- LogID == Intrinsic::log10) {
- if (Ty->getScalarType()->isFloatTy()) {
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- } else if (Ty->getScalarType()->isDoubleTy()) {
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- } else
- return Ret;
- } else
- return Ret;
-
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(FastMathFlags::getFast());
-
- Intrinsic::ID ArgID = Arg->getIntrinsicID();
- LibFunc ArgLb = NotLibFunc;
- TLI->getLibFunc(*Arg, ArgLb);
-
- // log(pow(x,y)) -> y*log(x)
- if (ArgLb == PowLb || ArgID == Intrinsic::pow) {
- Value *LogX =
- Log->doesNotAccessMemory()
- ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
- Arg->getOperand(0), "log")
- : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs);
- Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul");
- // Since pow() may have side effects, e.g. errno,
- // dead code elimination may not be trusted to remove it.
- substituteInParent(Arg, MulY);
- return MulY;
- }
-
- // log(exp{,2,10}(y)) -> y*log({e,2,10})
- // TODO: There is no exp10() intrinsic yet.
- if (ArgLb == ExpLb || ArgLb == Exp2Lb || ArgLb == Exp10Lb ||
- ArgID == Intrinsic::exp || ArgID == Intrinsic::exp2) {
- Constant *Eul;
- if (ArgLb == ExpLb || ArgID == Intrinsic::exp)
- // FIXME: Add more precise value of e for long double.
- Eul = ConstantFP::get(Log->getType(), numbers::e);
- else if (ArgLb == Exp2Lb || ArgID == Intrinsic::exp2)
- Eul = ConstantFP::get(Log->getType(), 2.0);
- else
- Eul = ConstantFP::get(Log->getType(), 10.0);
- Value *LogE = Log->doesNotAccessMemory()
- ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
- Eul, "log")
- : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs);
- Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul");
- // Since exp() may have side effects, e.g. errno,
- // dead code elimination may not be trusted to remove it.
- substituteInParent(Arg, MulY);
- return MulY;
- }
-
- return Ret;
-}
-
-Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Ret = nullptr;
- // TODO: Once we have a way (other than checking for the existence of the
- // libcall) to tell whether our target can lower @llvm.sqrt, relax the
- // condition below.
- if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
- Callee->getIntrinsicID() == Intrinsic::sqrt))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
-
- if (!CI->isFast())
- return Ret;
-
- Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0));
- if (!I || I->getOpcode() != Instruction::FMul || !I->isFast())
- return Ret;
-
- // We're looking for a repeated factor in a multiplication tree,
- // so we can do this fold: sqrt(x * x) -> fabs(x);
- // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y).
- Value *Op0 = I->getOperand(0);
- Value *Op1 = I->getOperand(1);
- Value *RepeatOp = nullptr;
- Value *OtherOp = nullptr;
- if (Op0 == Op1) {
- // Simple match: the operands of the multiply are identical.
- RepeatOp = Op0;
- } else {
- // Look for a more complicated pattern: one of the operands is itself
- // a multiply, so search for a common factor in that multiply.
- // Note: We don't bother looking any deeper than this first level or for
- // variations of this pattern because instcombine's visitFMUL and/or the
- // reassociation pass should give us this form.
- Value *OtherMul0, *OtherMul1;
- if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
- // Pattern: sqrt((x * y) * z)
- if (OtherMul0 == OtherMul1 && cast<Instruction>(Op0)->isFast()) {
- // Matched: sqrt((x * x) * z)
- RepeatOp = OtherMul0;
- OtherOp = Op1;
- }
- }
- }
- if (!RepeatOp)
- return Ret;
-
- // Fast math flags for any created instructions should match the sqrt
- // and multiply.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(I->getFastMathFlags());
-
- // If we found a repeated factor, hoist it out of the square root and
- // replace it with the fabs of that factor.
- Module *M = Callee->getParent();
- Type *ArgType = I->getType();
- Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
- Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
- if (OtherOp) {
- // If we found a non-repeated factor, we still need to get its square
- // root. We then multiply that by the value that was simplified out
- // of the square root calculation.
- Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType);
- Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt");
- return B.CreateFMul(FabsCall, SqrtCall);
- }
- return FabsCall;
-}
-
-// TODO: Generalize to handle any trig function and its inverse.
-Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Ret = nullptr;
- StringRef Name = Callee->getName();
- if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
-
- Value *Op1 = CI->getArgOperand(0);
- auto *OpC = dyn_cast<CallInst>(Op1);
- if (!OpC)
- return Ret;
-
- // Both calls must be 'fast' in order to remove them.
- if (!CI->isFast() || !OpC->isFast())
- return Ret;
-
- // tan(atan(x)) -> x
- // tanf(atanf(x)) -> x
- // tanl(atanl(x)) -> x
- LibFunc Func;
- Function *F = OpC->getCalledFunction();
- if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
- ((Func == LibFunc_atan && Callee->getName() == "tan") ||
- (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
- (Func == LibFunc_atanl && Callee->getName() == "tanl")))
- Ret = OpC->getArgOperand(0);
- return Ret;
-}
-
-static bool isTrigLibCall(CallInst *CI) {
- // We can only hope to do anything useful if we can ignore things like errno
- // and floating-point exceptions.
- // We already checked the prototype.
- return CI->hasFnAttr(Attribute::NoUnwind) &&
- CI->hasFnAttr(Attribute::ReadNone);
-}
-
-static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
- bool UseFloat, Value *&Sin, Value *&Cos,
- Value *&SinCos) {
- Type *ArgTy = Arg->getType();
- Type *ResTy;
- StringRef Name;
-
- Triple T(OrigCallee->getParent()->getTargetTriple());
- if (UseFloat) {
- Name = "__sincospif_stret";
-
- assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
- // x86_64 can't use {float, float} since that would be returned in both
- // xmm0 and xmm1, which isn't what a real struct would do.
- ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(FixedVectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
- } else {
- Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy);
- }
-
- Module *M = OrigCallee->getParent();
- FunctionCallee Callee =
- M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy);
-
- if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
- // If the argument is an instruction, it must dominate all uses so put our
- // sincos call there.
- B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
- } else {
- // Otherwise (e.g. for a constant) the beginning of the function is as
- // good a place as any.
- BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
- B.SetInsertPoint(&EntryBB, EntryBB.begin());
- }
-
- SinCos = B.CreateCall(Callee, Arg, "sincospi");
-
- if (SinCos->getType()->isStructTy()) {
- Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
- Cos = B.CreateExtractValue(SinCos, 1, "cospi");
- } else {
- Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
- "sinpi");
- Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
- "cospi");
- }
-}
-
-Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
- // Make sure the prototype is as expected, otherwise the rest of the
- // function is probably invalid and likely to abort.
- if (!isTrigLibCall(CI))
- return nullptr;
-
- Value *Arg = CI->getArgOperand(0);
- SmallVector<CallInst *, 1> SinCalls;
- SmallVector<CallInst *, 1> CosCalls;
- SmallVector<CallInst *, 1> SinCosCalls;
-
- bool IsFloat = Arg->getType()->isFloatTy();
-
- // Look for all compatible sinpi, cospi and sincospi calls with the same
- // argument. If there are enough (in some sense) we can make the
- // substitution.
- Function *F = CI->getFunction();
- for (User *U : Arg->users())
- classifyArgUse(U, F, IsFloat, SinCalls, CosCalls, SinCosCalls);
-
- // It's only worthwhile if both sinpi and cospi are actually used.
+ }
+
+ // We will memoize intermediate products of the Addition Chain.
+ Value *InnerChain[33] = {nullptr};
+ InnerChain[1] = Base;
+ InnerChain[2] = B.CreateFMul(Base, Base, "square");
+
+ // We cannot readily convert a non-double type (like float) to a double.
+ // So we first convert it to something which could be converted to double.
+ ExpoA.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &Ignored);
+ Value *FMul = getPow(InnerChain, ExpoA.convertToDouble(), B);
+
+ // Expand pow(x, y+0.5) to pow(x, y) * sqrt(x).
+ if (Sqrt)
+ FMul = B.CreateFMul(FMul, Sqrt);
+
+ // If the exponent is negative, then get the reciprocal.
+ if (ExpoF->isNegative())
+ FMul = B.CreateFDiv(ConstantFP::get(Ty, 1.0), FMul, "reciprocal");
+
+ return FMul;
+ }
+
+ APSInt IntExpo(32, /*isUnsigned=*/false);
+ // powf(x, n) -> powi(x, n) if n is a constant signed integer value
+ if (ExpoF->isInteger() &&
+ ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) ==
+ APFloat::opOK) {
+ return createPowWithIntegerExponent(
+ Base, ConstantInt::get(B.getInt32Ty(), IntExpo), M, B);
+ }
+ }
+
+ // powf(x, itofp(y)) -> powi(x, y)
+ if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) {
+ if (Value *ExpoI = getIntToFPVal(Expo, B))
+ return createPowWithIntegerExponent(Base, ExpoI, M, B);
+ }
+
+ return Shrunk;
+}
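+
+ // For example, pow(x, -1.0) becomes 1.0 / x and pow(x, 2.0) becomes
+ // x * x with no extra fast-math requirement, while under
+ // approximate-function math pow(x, 7.5) becomes (x^7 via the addition
+ // chain) * sqrt(x) and powf(x, sitofp(i)) becomes the powi intrinsic
+ // with a 32-bit exponent.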
+
+Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ StringRef Name = Callee->getName();
+ Value *Ret = nullptr;
+ if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) &&
+ hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ Type *Ty = CI->getType();
+ Value *Op = CI->getArgOperand(0);
+
+ // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
+ // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
+ if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) &&
+ hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
+ if (Value *Exp = getIntToFPVal(Op, B))
+ return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI,
+ LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
+ B, Attrs);
+ }
+
+ return Ret;
+}
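+
+ // For example, exp2(sitofp i16 %n) becomes ldexp(1.0, sext %n to i32)
+ // when the target has ldexp; the exponent is known to fit in an int, and
+ // ldexp(1.0, n) computes 2^n exactly.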
+
+Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
+ // If we can shrink the call to a float function rather than a double
+ // function, do that first.
+ Function *Callee = CI->getCalledFunction();
+ StringRef Name = Callee->getName();
+ if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
+ if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
+ return Ret;
+
+ // The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to
+ // the intrinsics for improved optimization (for example, vectorization).
+ // No-signed-zeros is implied by the definitions of fmax/fmin themselves.
+ // From the C standard draft WG14/N1256:
+ // "Ideally, fmax would be sensitive to the sign of zero, for example
+ // fmax(-0.0, +0.0) would return +0; however, implementation in software
+ // might be impractical."
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ FastMathFlags FMF = CI->getFastMathFlags();
+ FMF.setNoSignedZeros();
+ B.setFastMathFlags(FMF);
+
+ Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum
+ : Intrinsic::maxnum;
+ Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType());
+ return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) });
+}
+
+Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
+ Function *LogFn = Log->getCalledFunction();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ StringRef LogNm = LogFn->getName();
+ Intrinsic::ID LogID = LogFn->getIntrinsicID();
+ Module *Mod = Log->getModule();
+ Type *Ty = Log->getType();
+ Value *Ret = nullptr;
+
+ if (UnsafeFPShrink && hasFloatVersion(LogNm))
+ Ret = optimizeUnaryDoubleFP(Log, B, true);
+
+ // The earlier call must also be 'fast' in order to do these transforms.
+ CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0));
+ if (!Log->isFast() || !Arg || !Arg->isFast() || !Arg->hasOneUse())
+ return Ret;
+
+ LibFunc LogLb, ExpLb, Exp2Lb, Exp10Lb, PowLb;
+
+ // This is only applicable to log(), log2(), log10().
+ if (TLI->getLibFunc(LogNm, LogLb))
+ switch (LogLb) {
+ case LibFunc_logf:
+ LogID = Intrinsic::log;
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ break;
+ case LibFunc_log:
+ LogID = Intrinsic::log;
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ break;
+ case LibFunc_logl:
+ LogID = Intrinsic::log;
+ ExpLb = LibFunc_expl;
+ Exp2Lb = LibFunc_exp2l;
+ Exp10Lb = LibFunc_exp10l;
+ PowLb = LibFunc_powl;
+ break;
+ case LibFunc_log2f:
+ LogID = Intrinsic::log2;
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ break;
+ case LibFunc_log2:
+ LogID = Intrinsic::log2;
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ break;
+ case LibFunc_log2l:
+ LogID = Intrinsic::log2;
+ ExpLb = LibFunc_expl;
+ Exp2Lb = LibFunc_exp2l;
+ Exp10Lb = LibFunc_exp10l;
+ PowLb = LibFunc_powl;
+ break;
+ case LibFunc_log10f:
+ LogID = Intrinsic::log10;
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ break;
+ case LibFunc_log10:
+ LogID = Intrinsic::log10;
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ break;
+ case LibFunc_log10l:
+ LogID = Intrinsic::log10;
+ ExpLb = LibFunc_expl;
+ Exp2Lb = LibFunc_exp2l;
+ Exp10Lb = LibFunc_exp10l;
+ PowLb = LibFunc_powl;
+ break;
+ default:
+ return Ret;
+ }
+ else if (LogID == Intrinsic::log || LogID == Intrinsic::log2 ||
+ LogID == Intrinsic::log10) {
+ if (Ty->getScalarType()->isFloatTy()) {
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ } else if (Ty->getScalarType()->isDoubleTy()) {
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ } else
+ return Ret;
+ } else
+ return Ret;
+
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(FastMathFlags::getFast());
+
+ Intrinsic::ID ArgID = Arg->getIntrinsicID();
+ LibFunc ArgLb = NotLibFunc;
+ TLI->getLibFunc(*Arg, ArgLb);
+
+ // log(pow(x,y)) -> y*log(x)
+ if (ArgLb == PowLb || ArgID == Intrinsic::pow) {
+ Value *LogX =
+ Log->doesNotAccessMemory()
+ ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
+ Arg->getOperand(0), "log")
+ : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs);
+ Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul");
+ // Since pow() may have side effects, e.g. errno,
+ // dead code elimination may not be trusted to remove it.
+ substituteInParent(Arg, MulY);
+ return MulY;
+ }
+
+ // log(exp{,2,10}(y)) -> y*log({e,2,10})
+ // TODO: There is no exp10() intrinsic yet.
+ if (ArgLb == ExpLb || ArgLb == Exp2Lb || ArgLb == Exp10Lb ||
+ ArgID == Intrinsic::exp || ArgID == Intrinsic::exp2) {
+ Constant *Eul;
+ if (ArgLb == ExpLb || ArgID == Intrinsic::exp)
+ // FIXME: Add more precise value of e for long double.
+ Eul = ConstantFP::get(Log->getType(), numbers::e);
+ else if (ArgLb == Exp2Lb || ArgID == Intrinsic::exp2)
+ Eul = ConstantFP::get(Log->getType(), 2.0);
+ else
+ Eul = ConstantFP::get(Log->getType(), 10.0);
+ Value *LogE = Log->doesNotAccessMemory()
+ ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
+ Eul, "log")
+ : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs);
+ Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul");
+ // Since exp() may have side effects, e.g. errno,
+ // dead code elimination may not be trusted to remove it.
+ substituteInParent(Arg, MulY);
+ return MulY;
+ }
+
+ return Ret;
+}
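+
+ // For example, with both calls fast and a single-use argument,
+ // log2(pow(x, y)) becomes y * log2(x) and log(exp2(y)) becomes
+ // y * log(2.0); the original pow()/exp2() is erased explicitly because
+ // a possible errno write keeps it from being trivially dead.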
+
+Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ // TODO: Once we have a way (other than checking for the existence of the
+ // libcall) to tell whether our target can lower @llvm.sqrt, relax the
+ // condition below.
+ if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
+ Callee->getIntrinsicID() == Intrinsic::sqrt))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ if (!CI->isFast())
+ return Ret;
+
+ Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0));
+ if (!I || I->getOpcode() != Instruction::FMul || !I->isFast())
+ return Ret;
+
+ // We're looking for a repeated factor in a multiplication tree,
+ // so we can do this fold: sqrt(x * x) -> fabs(x);
+ // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y).
+ Value *Op0 = I->getOperand(0);
+ Value *Op1 = I->getOperand(1);
+ Value *RepeatOp = nullptr;
+ Value *OtherOp = nullptr;
+ if (Op0 == Op1) {
+ // Simple match: the operands of the multiply are identical.
+ RepeatOp = Op0;
+ } else {
+ // Look for a more complicated pattern: one of the operands is itself
+ // a multiply, so search for a common factor in that multiply.
+ // Note: We don't bother looking any deeper than this first level or for
+ // variations of this pattern because instcombine's visitFMUL and/or the
+ // reassociation pass should give us this form.
+ Value *OtherMul0, *OtherMul1;
+ if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
+ // Pattern: sqrt((x * y) * z)
+ if (OtherMul0 == OtherMul1 && cast<Instruction>(Op0)->isFast()) {
+ // Matched: sqrt((x * x) * z)
+ RepeatOp = OtherMul0;
+ OtherOp = Op1;
+ }
+ }
+ }
+ if (!RepeatOp)
+ return Ret;
+
+ // Fast math flags for any created instructions should match the sqrt
+ // and multiply.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(I->getFastMathFlags());
+
+ // If we found a repeated factor, hoist it out of the square root and
+ // replace it with the fabs of that factor.
+ Module *M = Callee->getParent();
+ Type *ArgType = I->getType();
+ Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
+ Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
+ if (OtherOp) {
+ // If we found a non-repeated factor, we still need to get its square
+ // root. We then multiply that by the value that was simplified out
+ // of the square root calculation.
+ Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType);
+ Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt");
+ return B.CreateFMul(FabsCall, SqrtCall);
+ }
+ return FabsCall;
+}
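+
+ // For example, with fast math on both instructions, sqrt(x * x) becomes
+ // fabs(x) and sqrt((x * x) * y) becomes fabs(x) * sqrt(y); fabs is
+ // required because the square root of a square is the magnitude of x,
+ // not x itself.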
+
+// TODO: Generalize to handle any trig function and its inverse.
+Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ StringRef Name = Callee->getName();
+ if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ Value *Op1 = CI->getArgOperand(0);
+ auto *OpC = dyn_cast<CallInst>(Op1);
+ if (!OpC)
+ return Ret;
+
+ // Both calls must be 'fast' in order to remove them.
+ if (!CI->isFast() || !OpC->isFast())
+ return Ret;
+
+ // tan(atan(x)) -> x
+ // tanf(atanf(x)) -> x
+ // tanl(atanl(x)) -> x
+ LibFunc Func;
+ Function *F = OpC->getCalledFunction();
+ if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
+ ((Func == LibFunc_atan && Callee->getName() == "tan") ||
+ (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
+ (Func == LibFunc_atanl && Callee->getName() == "tanl")))
+ Ret = OpC->getArgOperand(0);
+ return Ret;
+}
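+
+ // For example, when both calls are fast, tan(atan(x)) folds to x; the
+ // float and long double variants fold only when the names pair up
+ // exactly (tanf with atanf, tanl with atanl).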
+
+static bool isTrigLibCall(CallInst *CI) {
+ // We can only hope to do anything useful if we can ignore things like errno
+ // and floating-point exceptions.
+ // We already checked the prototype.
+ return CI->hasFnAttr(Attribute::NoUnwind) &&
+ CI->hasFnAttr(Attribute::ReadNone);
+}
+
+static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
+ bool UseFloat, Value *&Sin, Value *&Cos,
+ Value *&SinCos) {
+ Type *ArgTy = Arg->getType();
+ Type *ResTy;
+ StringRef Name;
+
+ Triple T(OrigCallee->getParent()->getTargetTriple());
+ if (UseFloat) {
+ Name = "__sincospif_stret";
+
+ assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+ // x86_64 can't use {float, float} since that would be returned in both
+ // xmm0 and xmm1, which isn't what a real struct would do.
+ ResTy = T.getArch() == Triple::x86_64
+ ? static_cast<Type *>(FixedVectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
+ } else {
+ Name = "__sincospi_stret";
+ ResTy = StructType::get(ArgTy, ArgTy);
+ }
+
+ Module *M = OrigCallee->getParent();
+ FunctionCallee Callee =
+ M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy);
+
+ if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+ // If the argument is an instruction, it must dominate all uses so put our
+ // sincos call there.
+ B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+ } else {
+ // Otherwise (e.g. for a constant) the beginning of the function is as
+ // good a place as any.
+ BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+ B.SetInsertPoint(&EntryBB, EntryBB.begin());
+ }
+
+ SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+ if (SinCos->getType()->isStructTy()) {
+ Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+ Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+ } else {
+ Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+ "sinpi");
+ Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+ "cospi");
+ }
+}
+
+Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
+ // Make sure the prototype is as expected, otherwise the rest of the
+ // function is probably invalid and likely to abort.
+ if (!isTrigLibCall(CI))
+ return nullptr;
+
+ Value *Arg = CI->getArgOperand(0);
+ SmallVector<CallInst *, 1> SinCalls;
+ SmallVector<CallInst *, 1> CosCalls;
+ SmallVector<CallInst *, 1> SinCosCalls;
+
+ bool IsFloat = Arg->getType()->isFloatTy();
+
+ // Look for all compatible sinpi, cospi and sincospi calls with the same
+ // argument. If there are enough (in some sense) we can make the
+ // substitution.
+ Function *F = CI->getFunction();
+ for (User *U : Arg->users())
+ classifyArgUse(U, F, IsFloat, SinCalls, CosCalls, SinCosCalls);
+
+ // It's only worthwhile if both sinpi and cospi are actually used.
if (SinCalls.empty() || CosCalls.empty())
- return nullptr;
-
- Value *Sin, *Cos, *SinCos;
- insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
-
- auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
- Value *Res) {
- for (CallInst *C : Calls)
- replaceAllUsesWith(C, Res);
- };
-
- replaceTrigInsts(SinCalls, Sin);
- replaceTrigInsts(CosCalls, Cos);
- replaceTrigInsts(SinCosCalls, SinCos);
-
- return nullptr;
-}
-
-void LibCallSimplifier::classifyArgUse(
- Value *Val, Function *F, bool IsFloat,
- SmallVectorImpl<CallInst *> &SinCalls,
- SmallVectorImpl<CallInst *> &CosCalls,
- SmallVectorImpl<CallInst *> &SinCosCalls) {
- CallInst *CI = dyn_cast<CallInst>(Val);
-
+ return nullptr;
+
+ Value *Sin, *Cos, *SinCos;
+ insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
+
+ auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
+ Value *Res) {
+ for (CallInst *C : Calls)
+ replaceAllUsesWith(C, Res);
+ };
+
+ replaceTrigInsts(SinCalls, Sin);
+ replaceTrigInsts(CosCalls, Cos);
+ replaceTrigInsts(SinCosCalls, SinCos);
+
+ return nullptr;
+}
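+
+ // For example, if sinpi(x) and cospi(x) both appear in the same function
+ // with the same argument, one __sincospi_stret(x) call is emitted and
+ // its two results replace all of the sin/cos uses; if only one of the
+ // two is present, nothing is changed.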
+
+void LibCallSimplifier::classifyArgUse(
+ Value *Val, Function *F, bool IsFloat,
+ SmallVectorImpl<CallInst *> &SinCalls,
+ SmallVectorImpl<CallInst *> &CosCalls,
+ SmallVectorImpl<CallInst *> &SinCosCalls) {
+ CallInst *CI = dyn_cast<CallInst>(Val);
+
if (!CI || CI->use_empty())
- return;
-
- // Don't consider calls in other functions.
- if (CI->getFunction() != F)
- return;
-
- Function *Callee = CI->getCalledFunction();
- LibFunc Func;
- if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
- !isTrigLibCall(CI))
- return;
-
- if (IsFloat) {
- if (Func == LibFunc_sinpif)
- SinCalls.push_back(CI);
- else if (Func == LibFunc_cospif)
- CosCalls.push_back(CI);
- else if (Func == LibFunc_sincospif_stret)
- SinCosCalls.push_back(CI);
- } else {
- if (Func == LibFunc_sinpi)
- SinCalls.push_back(CI);
- else if (Func == LibFunc_cospi)
- CosCalls.push_back(CI);
- else if (Func == LibFunc_sincospi_stret)
- SinCosCalls.push_back(CI);
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Integer Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilderBase &B) {
- // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
- Value *Op = CI->getArgOperand(0);
- Type *ArgType = Op->getType();
- Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
- Intrinsic::cttz, ArgType);
- Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz");
- V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
- V = B.CreateIntCast(V, B.getInt32Ty(), false);
-
- Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
- return B.CreateSelect(Cond, V, B.getInt32(0));
-}
-
-Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilderBase &B) {
- // fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false))
- Value *Op = CI->getArgOperand(0);
- Type *ArgType = Op->getType();
- Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
- Intrinsic::ctlz, ArgType);
- Value *V = B.CreateCall(F, {Op, B.getFalse()}, "ctlz");
- V = B.CreateSub(ConstantInt::get(V->getType(), ArgType->getIntegerBitWidth()),
- V);
- return B.CreateIntCast(V, CI->getType(), false);
-}
-
-Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilderBase &B) {
- // abs(x) -> x <s 0 ? -x : x
- // The negation has 'nsw' because abs of INT_MIN is undefined.
- Value *X = CI->getArgOperand(0);
- Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType()));
- Value *NegX = B.CreateNSWNeg(X, "neg");
- return B.CreateSelect(IsNeg, NegX, X);
-}
-
-Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilderBase &B) {
- // isdigit(c) -> (c-'0') <u 10
- Value *Op = CI->getArgOperand(0);
- Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
- Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
- return B.CreateZExt(Op, CI->getType());
-}
-
-Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilderBase &B) {
- // isascii(c) -> c <u 128
- Value *Op = CI->getArgOperand(0);
- Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
- return B.CreateZExt(Op, CI->getType());
-}
-
-Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilderBase &B) {
- // toascii(c) -> c & 0x7f
- return B.CreateAnd(CI->getArgOperand(0),
- ConstantInt::get(CI->getType(), 0x7F));
-}
-
-Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) {
- StringRef Str;
- if (!getConstantStringInfo(CI->getArgOperand(0), Str))
- return nullptr;
-
- return convertStrToNumber(CI, Str, 10);
-}
-
-Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) {
- StringRef Str;
- if (!getConstantStringInfo(CI->getArgOperand(0), Str))
- return nullptr;
-
- if (!isa<ConstantPointerNull>(CI->getArgOperand(1)))
- return nullptr;
-
- if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) {
- return convertStrToNumber(CI, Str, CInt->getSExtValue());
- }
-
- return nullptr;
-}
-
-//===----------------------------------------------------------------------===//
-// Formatting and IO Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
-
-Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilderBase &B,
- int StreamArg) {
- Function *Callee = CI->getCalledFunction();
- // Error reporting calls should be cold, mark them as such.
- // This applies even to non-builtin calls: it is only a hint and applies to
- // functions that the frontend might not understand as builtins.
-
- // This heuristic was suggested in:
- // Improving Static Branch Prediction in a Compiler
- // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
- // Proceedings of PACT'98, Oct. 1998, IEEE
- if (!CI->hasFnAttr(Attribute::Cold) &&
- isReportingError(Callee, CI, StreamArg)) {
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
- }
-
- return nullptr;
-}
-
-static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
- if (!Callee || !Callee->isDeclaration())
- return false;
-
- if (StreamArg < 0)
- return true;
-
- // These functions might be considered cold, but only if their stream
- // argument is stderr.
-
- if (StreamArg >= (int)CI->getNumArgOperands())
- return false;
- LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
- if (!LI)
- return false;
- GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
- if (!GV || !GV->isDeclaration())
- return false;
- return GV->getName() == "stderr";
-}
-
-Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
- return nullptr;
-
- // Empty format string -> noop.
- if (FormatStr.empty()) // Tolerate printf's declared void.
- return CI->use_empty() ? (Value *)CI : ConstantInt::get(CI->getType(), 0);
-
- // Do not do any of the following transformations if the printf return value
- // is used, in general the printf return value is not compatible with either
- // putchar() or puts().
- if (!CI->use_empty())
- return nullptr;
-
- // printf("x") -> putchar('x'), even for "%" and "%%".
- if (FormatStr.size() == 1 || FormatStr == "%%")
- return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
-
- // printf("%s", "a") --> putchar('a')
- if (FormatStr == "%s" && CI->getNumArgOperands() > 1) {
- StringRef ChrStr;
- if (!getConstantStringInfo(CI->getOperand(1), ChrStr))
- return nullptr;
- if (ChrStr.size() != 1)
- return nullptr;
- return emitPutChar(B.getInt32(ChrStr[0]), B, TLI);
- }
-
- // printf("foo\n") --> puts("foo")
- if (FormatStr[FormatStr.size() - 1] == '\n' &&
- FormatStr.find('%') == StringRef::npos) { // No format characters.
- // Create a string literal with no \n on it. We expect the constant merge
- // pass to be run after this pass, to merge duplicate strings.
- FormatStr = FormatStr.drop_back();
- Value *GV = B.CreateGlobalString(FormatStr, "str");
- return emitPutS(GV, B, TLI);
- }
-
- // Optimize specific format strings.
- // printf("%c", chr) --> putchar(chr)
- if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isIntegerTy())
- return emitPutChar(CI->getArgOperand(1), B, TLI);
-
- // printf("%s\n", str) --> puts(str)
- if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isPointerTy())
- return emitPutS(CI->getArgOperand(1), B, TLI);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
-
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (Value *V = optimizePrintFString(CI, B)) {
- return V;
- }
-
- // printf(format, ...) -> iprintf(format, ...) if no floating point
- // arguments.
- if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee IPrintFFn =
- M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(IPrintFFn);
- B.Insert(New);
- return New;
- }
-
- // printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point
- // arguments.
- if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_printf),
- FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SmallPrintFFn);
- B.Insert(New);
- return New;
- }
-
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
- IRBuilderBase &B) {
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
- return nullptr;
-
- // If we just have a format string (nothing else crazy) transform it.
- if (CI->getNumArgOperands() == 2) {
- // Make sure there's no % in the constant array. We could try to handle
- // %% -> % in the future if we cared.
- if (FormatStr.find('%') != StringRef::npos)
- return nullptr; // we found a format specifier, bail out.
-
- // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
- B.CreateMemCpy(
- CI->getArgOperand(0), Align(1), CI->getArgOperand(1), Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1)); // Copy the null byte.
- return ConstantInt::get(CI->getType(), FormatStr.size());
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
- return nullptr;
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
- if (!CI->getArgOperand(2)->getType()->isIntegerTy())
- return nullptr;
- Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
- Value *Ptr = castToCStr(CI->getArgOperand(0), B);
- B.CreateStore(V, Ptr);
- Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
- B.CreateStore(B.getInt8(0), Ptr);
-
- return ConstantInt::get(CI->getType(), 1);
- }
-
- if (FormatStr[1] == 's') {
- // sprintf(dest, "%s", str) -> llvm.memcpy(align 1 dest, align 1 str,
- // strlen(str)+1)
- if (!CI->getArgOperand(2)->getType()->isPointerTy())
- return nullptr;
-
+ return;
+
+ // Don't consider calls in other functions.
+ if (CI->getFunction() != F)
+ return;
+
+ Function *Callee = CI->getCalledFunction();
+ LibFunc Func;
+ if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
+ !isTrigLibCall(CI))
+ return;
+
+ if (IsFloat) {
+ if (Func == LibFunc_sinpif)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc_cospif)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc_sincospif_stret)
+ SinCosCalls.push_back(CI);
+ } else {
+ if (Func == LibFunc_sinpi)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc_cospi)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc_sincospi_stret)
+ SinCosCalls.push_back(CI);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilderBase &B) {
+ // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+ Value *Op = CI->getArgOperand(0);
+ Type *ArgType = Op->getType();
+ Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::cttz, ArgType);
+ Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz");
+ V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+ V = B.CreateIntCast(V, B.getInt32Ty(), false);
+
+ Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+ return B.CreateSelect(Cond, V, B.getInt32(0));
+}
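+
+ // For example, ffs(8) becomes cttz(8) + 1 == 4, and the final select
+ // handles ffs(0) == 0 because cttz is emitted with is_zero_undef set.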
+
+Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilderBase &B) {
+ // fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false))
+ Value *Op = CI->getArgOperand(0);
+ Type *ArgType = Op->getType();
+ Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::ctlz, ArgType);
+ Value *V = B.CreateCall(F, {Op, B.getFalse()}, "ctlz");
+ V = B.CreateSub(ConstantInt::get(V->getType(), ArgType->getIntegerBitWidth()),
+ V);
+ return B.CreateIntCast(V, CI->getType(), false);
+}
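For reference, the ffs/fls lowerings above correspond to the following source-level C++ sketch. It assumes the GCC/Clang builtins __builtin_ctz and __builtin_clz (which Clang maps to llvm.cttz/llvm.ctlz); the explicit zero guards stand in for the selects, since the builtins are undefined at zero. Function names are illustrative only.

    #include <climits>

    int ffs_lowered(int x) {
      // ffs(x) -> x != 0 ? cttz(x) + 1 : 0
      return x != 0 ? __builtin_ctz(static_cast<unsigned>(x)) + 1 : 0;
    }

    int fls_lowered(int x) {
      // fls(x) -> bitwidth(x) - ctlz(x); the IR form relies on ctlz(0) == bitwidth,
      // so fls(0) folds to 0. The guard here only avoids the builtin's UB at zero.
      return x != 0
                 ? (int)(sizeof(int) * CHAR_BIT) - __builtin_clz(static_cast<unsigned>(x))
                 : 0;
    }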
+
+Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilderBase &B) {
+ // abs(x) -> x <s 0 ? -x : x
+ // The negation has 'nsw' because abs of INT_MIN is undefined.
+ Value *X = CI->getArgOperand(0);
+ Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType()));
+ Value *NegX = B.CreateNSWNeg(X, "neg");
+ return B.CreateSelect(IsNeg, NegX, X);
+}
+
+Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilderBase &B) {
+ // isdigit(c) -> (c-'0') <u 10
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+ Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+ return B.CreateZExt(Op, CI->getType());
+}
+
+Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilderBase &B) {
+ // isascii(c) -> c <u 128
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+ return B.CreateZExt(Op, CI->getType());
+}
+
+Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilderBase &B) {
+ // toascii(c) -> c & 0x7f
+ return B.CreateAnd(CI->getArgOperand(0),
+ ConstantInt::get(CI->getType(), 0x7F));
+}
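The four integer folds above have simple source-level equivalents; a minimal sketch in plain C++ (the function names are illustrative):

    int  abs_lowered(int x)     { return x < 0 ? -x : x; }            // -x is UB at INT_MIN, matching the nsw negation
    bool isdigit_lowered(int c) { return (unsigned)(c - '0') < 10u; }  // isdigit(c) -> (c-'0') <u 10
    bool isascii_lowered(int c) { return (unsigned)c < 128u; }         // isascii(c) -> c <u 128
    int  toascii_lowered(int c) { return c & 0x7f; }                   // toascii(c) -> c & 0x7f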
+
+Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ return convertStrToNumber(CI, Str, 10);
+}
+
+Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ if (!isa<ConstantPointerNull>(CI->getArgOperand(1)))
+ return nullptr;
+
+ if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) {
+ return convertStrToNumber(CI, Str, CInt->getSExtValue());
+ }
+
+ return nullptr;
+}
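A usage sketch of a call that optimizeStrtol() can fold: the conditions checked above require a constant string, a null endptr, and a constant base, which convertStrToNumber() then evaluates at compile time (assuming the value fits the result type). The function name below is illustrative.

    #include <cstdlib>

    long forty_two() {
      // Constant string, endptr == nullptr, constant base 10: folds to 42L.
      return strtol("42", nullptr, 10);
    }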
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
+
+Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilderBase &B,
+ int StreamArg) {
+ Function *Callee = CI->getCalledFunction();
+  // Error reporting calls should be cold; mark them as such.
+ // This applies even to non-builtin calls: it is only a hint and applies to
+ // functions that the frontend might not understand as builtins.
+
+ // This heuristic was suggested in:
+ // Improving Static Branch Prediction in a Compiler
+ // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
+ // Proceedings of PACT'98, Oct. 1998, IEEE
+ if (!CI->hasFnAttr(Attribute::Cold) &&
+ isReportingError(Callee, CI, StreamArg)) {
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
+ }
+
+ return nullptr;
+}
+
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
+ if (!Callee || !Callee->isDeclaration())
+ return false;
+
+ if (StreamArg < 0)
+ return true;
+
+ // These functions might be considered cold, but only if their stream
+ // argument is stderr.
+
+ if (StreamArg >= (int)CI->getNumArgOperands())
+ return false;
+ LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
+ if (!LI)
+ return false;
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+ if (!GV || !GV->isDeclaration())
+ return false;
+ return GV->getName() == "stderr";
+}
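For illustration, the caller-side patterns this heuristic marks cold look roughly like the sketch below. fprintf is checked with StreamArg == 0, so its stream operand must be a load of the declared global stderr; perror is dispatched with a negative StreamArg and is treated as always cold. Names and the exact call shapes are illustrative assumptions, not code from the pass.

    #include <cstdio>

    void report_failure(const char *what) {
      // Both calls receive the 'cold' attribute from optimizeErrorReporting():
      fprintf(stderr, "error: %s\n", what);  // stream operand loads the global 'stderr'
      perror(what);                          // no stream argument to inspect
    }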
+
+Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
+ return nullptr;
+
+ // Empty format string -> noop.
+ if (FormatStr.empty()) // Tolerate printf's declared void.
+ return CI->use_empty() ? (Value *)CI : ConstantInt::get(CI->getType(), 0);
+
+ // Do not do any of the following transformations if the printf return value
+ // is used, in general the printf return value is not compatible with either
+ // putchar() or puts().
+ if (!CI->use_empty())
+ return nullptr;
+
+ // printf("x") -> putchar('x'), even for "%" and "%%".
+ if (FormatStr.size() == 1 || FormatStr == "%%")
+ return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
+
+ // printf("%s", "a") --> putchar('a')
+ if (FormatStr == "%s" && CI->getNumArgOperands() > 1) {
+ StringRef ChrStr;
+ if (!getConstantStringInfo(CI->getOperand(1), ChrStr))
+ return nullptr;
+ if (ChrStr.size() != 1)
+ return nullptr;
+ return emitPutChar(B.getInt32(ChrStr[0]), B, TLI);
+ }
+
+ // printf("foo\n") --> puts("foo")
+ if (FormatStr[FormatStr.size() - 1] == '\n' &&
+ FormatStr.find('%') == StringRef::npos) { // No format characters.
+ // Create a string literal with no \n on it. We expect the constant merge
+ // pass to be run after this pass, to merge duplicate strings.
+ FormatStr = FormatStr.drop_back();
+ Value *GV = B.CreateGlobalString(FormatStr, "str");
+ return emitPutS(GV, B, TLI);
+ }
+
+ // Optimize specific format strings.
+ // printf("%c", chr) --> putchar(chr)
+ if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isIntegerTy())
+ return emitPutChar(CI->getArgOperand(1), B, TLI);
+
+ // printf("%s\n", str) --> puts(str)
+ if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isPointerTy())
+ return emitPutS(CI->getArgOperand(1), B, TLI);
+ return nullptr;
+}
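A before/after sketch of the printf rewrites above, assuming every return value is unused (the bail-out checked earlier); function names are illustrative:

    #include <cstdio>

    void before(const char *s, int c) {
      printf("hello\n");  // trailing '\n', no '%'   -> puts("hello")
      printf("%c", c);    // single integer operand  -> putchar(c)
      printf("%s\n", s);  // pointer operand         -> puts(s)
    }

    void after(const char *s, int c) {
      puts("hello");
      putchar(c);
      puts(s);
    }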
+
+Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
+
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizePrintFString(CI, B)) {
+ return V;
+ }
+
+ // printf(format, ...) -> iprintf(format, ...) if no floating point
+ // arguments.
+ if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ FunctionCallee IPrintFFn =
+ M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(IPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ // printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point
+ // arguments.
+ if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ auto SmallPrintFFn =
+ M->getOrInsertFunction(TLI->getName(LibFunc_small_printf),
+ FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SmallPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return nullptr;
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumArgOperands() == 2) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
+
+ // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
+ B.CreateMemCpy(
+ CI->getArgOperand(0), Align(1), CI->getArgOperand(1), Align(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1)); // Copy the null byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return nullptr;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+ return nullptr;
+ Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // sprintf(dest, "%s", str) -> llvm.memcpy(align 1 dest, align 1 str,
+ // strlen(str)+1)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy())
+ return nullptr;
+
if (CI->use_empty())
// sprintf(dest, "%s", str) -> strcpy(dest, str)
return emitStrCpy(CI->getArgOperand(0), CI->getArgOperand(2), B, TLI);
@@ -2546,775 +2546,775 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
if (OptForSize)
return nullptr;
- Value *Len = emitStrLen(CI->getArgOperand(2), B, DL, TLI);
- if (!Len)
- return nullptr;
- Value *IncLen =
- B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
- B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(2),
- Align(1), IncLen);
-
- // The sprintf result is the unincremented number of bytes in the string.
- return B.CreateIntCast(Len, CI->getType(), false);
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (Value *V = optimizeSPrintFString(CI, B)) {
- return V;
- }
-
- // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
- // point arguments.
- if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee SIPrintFFn =
- M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SIPrintFFn);
- B.Insert(New);
- return New;
- }
-
- // sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit
- // floating point arguments.
- if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallSPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf),
- FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SmallSPrintFFn);
- B.Insert(New);
- return New;
- }
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
- IRBuilderBase &B) {
-  // Check for a constant size argument.
- ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!Size)
- return nullptr;
-
- uint64_t N = Size->getZExtValue();
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr))
- return nullptr;
-
- // If we just have a format string (nothing else crazy) transform it.
- if (CI->getNumArgOperands() == 3) {
- // Make sure there's no % in the constant array. We could try to handle
- // %% -> % in the future if we cared.
- if (FormatStr.find('%') != StringRef::npos)
- return nullptr; // we found a format specifier, bail out.
-
- if (N == 0)
- return ConstantInt::get(CI->getType(), FormatStr.size());
- else if (N < FormatStr.size() + 1)
- return nullptr;
-
- // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt,
- // strlen(fmt)+1)
- B.CreateMemCpy(
- CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1)); // Copy the null byte.
- return ConstantInt::get(CI->getType(), FormatStr.size());
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() == 2 && FormatStr[0] == '%' &&
- CI->getNumArgOperands() == 4) {
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- if (N == 0)
- return ConstantInt::get(CI->getType(), 1);
- else if (N == 1)
- return nullptr;
-
- // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
- if (!CI->getArgOperand(3)->getType()->isIntegerTy())
- return nullptr;
- Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char");
- Value *Ptr = castToCStr(CI->getArgOperand(0), B);
- B.CreateStore(V, Ptr);
- Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
- B.CreateStore(B.getInt8(0), Ptr);
-
- return ConstantInt::get(CI->getType(), 1);
- }
-
- if (FormatStr[1] == 's') {
- // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1)
- StringRef Str;
- if (!getConstantStringInfo(CI->getArgOperand(3), Str))
- return nullptr;
-
- if (N == 0)
- return ConstantInt::get(CI->getType(), Str.size());
- else if (N < Str.size() + 1)
- return nullptr;
-
- B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3),
- Align(1), ConstantInt::get(CI->getType(), Str.size() + 1));
-
- // The snprintf result is the unincremented number of bytes in the string.
- return ConstantInt::get(CI->getType(), Str.size());
- }
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) {
- if (Value *V = optimizeSnPrintFString(CI, B)) {
- return V;
- }
-
- if (isKnownNonZero(CI->getOperand(1), DL))
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
- IRBuilderBase &B) {
- optimizeErrorReporting(CI, B, 0);
-
- // All the optimizations depend on the format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
- return nullptr;
-
- // Do not do any of the following transformations if the fprintf return
- // value is used, in general the fprintf return value is not compatible
- // with fwrite(), fputc() or fputs().
- if (!CI->use_empty())
- return nullptr;
-
- // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
- if (CI->getNumArgOperands() == 2) {
- // Could handle %% -> % if we cared.
- if (FormatStr.find('%') != StringRef::npos)
- return nullptr; // We found a format specifier.
-
- return emitFWrite(
- CI->getArgOperand(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
- CI->getArgOperand(0), B, DL, TLI);
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
- return nullptr;
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- // fprintf(F, "%c", chr) --> fputc(chr, F)
- if (!CI->getArgOperand(2)->getType()->isIntegerTy())
- return nullptr;
- return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
- }
-
- if (FormatStr[1] == 's') {
- // fprintf(F, "%s", str) --> fputs(str, F)
- if (!CI->getArgOperand(2)->getType()->isPointerTy())
- return nullptr;
- return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (Value *V = optimizeFPrintFString(CI, B)) {
- return V;
- }
-
- // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
- // floating point arguments.
- if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee FIPrintFFn =
- M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(FIPrintFFn);
- B.Insert(New);
- return New;
- }
-
- // fprintf(stream, format, ...) -> __small_fprintf(stream, format, ...) if no
- // 128-bit floating point arguments.
- if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallFPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf),
- FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SmallFPrintFFn);
- B.Insert(New);
- return New;
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilderBase &B) {
- optimizeErrorReporting(CI, B, 3);
-
- // Get the element size and count.
- ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (SizeC && CountC) {
- uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
-
- // If this is writing zero records, remove the call (it's a noop).
- if (Bytes == 0)
- return ConstantInt::get(CI->getType(), 0);
-
- // If this is writing one byte, turn it into fputc.
-    // This optimisation is only valid if the return value is unused.
- if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
- Value *Char = B.CreateLoad(B.getInt8Ty(),
- castToCStr(CI->getArgOperand(0), B), "char");
- Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
- return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
- }
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) {
- optimizeErrorReporting(CI, B, 1);
-
- // Don't rewrite fputs to fwrite when optimising for size because fwrite
- // requires more arguments and thus extra MOVs are required.
- bool OptForSize = CI->getFunction()->hasOptSize() ||
- llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI,
- PGSOQueryType::IRPass);
- if (OptForSize)
- return nullptr;
-
- // We can't optimize if return value is used.
- if (!CI->use_empty())
- return nullptr;
-
- // fputs(s,F) --> fwrite(s,strlen(s),1,F)
- uint64_t Len = GetStringLength(CI->getArgOperand(0));
- if (!Len)
- return nullptr;
-
- // Known to have no uses (see above).
- return emitFWrite(
- CI->getArgOperand(0),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
- CI->getArgOperand(1), B, DL, TLI);
-}
-
-Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
- annotateNonNullBasedOnAccess(CI, 0);
- if (!CI->use_empty())
- return nullptr;
-
- // Check for a constant string.
- // puts("") -> putchar('\n')
- StringRef Str;
- if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty())
- return emitPutChar(B.getInt32('\n'), B, TLI);
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
- // bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
- return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0),
- Align(1), CI->getArgOperand(2));
-}
-
-bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
- LibFunc Func;
- SmallString<20> FloatFuncName = FuncName;
- FloatFuncName += 'f';
- if (TLI->getLibFunc(FloatFuncName, Func))
- return TLI->has(Func);
- return false;
-}
-
-Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
- IRBuilderBase &Builder) {
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- // Check for string/memory library functions.
- if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
- // Make sure we never change the calling convention.
- assert((ignoreCallingConv(Func) ||
- isCallingConvCCompatible(CI)) &&
- "Optimizing string/memory libcall would change the calling convention");
- switch (Func) {
- case LibFunc_strcat:
- return optimizeStrCat(CI, Builder);
- case LibFunc_strncat:
- return optimizeStrNCat(CI, Builder);
- case LibFunc_strchr:
- return optimizeStrChr(CI, Builder);
- case LibFunc_strrchr:
- return optimizeStrRChr(CI, Builder);
- case LibFunc_strcmp:
- return optimizeStrCmp(CI, Builder);
- case LibFunc_strncmp:
- return optimizeStrNCmp(CI, Builder);
- case LibFunc_strcpy:
- return optimizeStrCpy(CI, Builder);
- case LibFunc_stpcpy:
- return optimizeStpCpy(CI, Builder);
- case LibFunc_strncpy:
- return optimizeStrNCpy(CI, Builder);
- case LibFunc_strlen:
- return optimizeStrLen(CI, Builder);
- case LibFunc_strpbrk:
- return optimizeStrPBrk(CI, Builder);
- case LibFunc_strndup:
- return optimizeStrNDup(CI, Builder);
- case LibFunc_strtol:
- case LibFunc_strtod:
- case LibFunc_strtof:
- case LibFunc_strtoul:
- case LibFunc_strtoll:
- case LibFunc_strtold:
- case LibFunc_strtoull:
- return optimizeStrTo(CI, Builder);
- case LibFunc_strspn:
- return optimizeStrSpn(CI, Builder);
- case LibFunc_strcspn:
- return optimizeStrCSpn(CI, Builder);
- case LibFunc_strstr:
- return optimizeStrStr(CI, Builder);
- case LibFunc_memchr:
- return optimizeMemChr(CI, Builder);
- case LibFunc_memrchr:
- return optimizeMemRChr(CI, Builder);
- case LibFunc_bcmp:
- return optimizeBCmp(CI, Builder);
- case LibFunc_memcmp:
- return optimizeMemCmp(CI, Builder);
- case LibFunc_memcpy:
- return optimizeMemCpy(CI, Builder);
- case LibFunc_memccpy:
- return optimizeMemCCpy(CI, Builder);
- case LibFunc_mempcpy:
- return optimizeMemPCpy(CI, Builder);
- case LibFunc_memmove:
- return optimizeMemMove(CI, Builder);
- case LibFunc_memset:
- return optimizeMemSet(CI, Builder);
- case LibFunc_realloc:
- return optimizeRealloc(CI, Builder);
- case LibFunc_wcslen:
- return optimizeWcslen(CI, Builder);
- case LibFunc_bcopy:
- return optimizeBCopy(CI, Builder);
- default:
- break;
- }
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
- LibFunc Func,
- IRBuilderBase &Builder) {
- // Don't optimize calls that require strict floating point semantics.
- if (CI->isStrictFP())
- return nullptr;
-
- if (Value *V = optimizeTrigReflections(CI, Func, Builder))
- return V;
-
- switch (Func) {
- case LibFunc_sinpif:
- case LibFunc_sinpi:
- case LibFunc_cospif:
- case LibFunc_cospi:
- return optimizeSinCosPi(CI, Builder);
- case LibFunc_powf:
- case LibFunc_pow:
- case LibFunc_powl:
- return optimizePow(CI, Builder);
- case LibFunc_exp2l:
- case LibFunc_exp2:
- case LibFunc_exp2f:
- return optimizeExp2(CI, Builder);
- case LibFunc_fabsf:
- case LibFunc_fabs:
- case LibFunc_fabsl:
- return replaceUnaryCall(CI, Builder, Intrinsic::fabs);
- case LibFunc_sqrtf:
- case LibFunc_sqrt:
- case LibFunc_sqrtl:
- return optimizeSqrt(CI, Builder);
- case LibFunc_logf:
- case LibFunc_log:
- case LibFunc_logl:
- case LibFunc_log10f:
- case LibFunc_log10:
- case LibFunc_log10l:
- case LibFunc_log1pf:
- case LibFunc_log1p:
- case LibFunc_log1pl:
- case LibFunc_log2f:
- case LibFunc_log2:
- case LibFunc_log2l:
- case LibFunc_logbf:
- case LibFunc_logb:
- case LibFunc_logbl:
- return optimizeLog(CI, Builder);
- case LibFunc_tan:
- case LibFunc_tanf:
- case LibFunc_tanl:
- return optimizeTan(CI, Builder);
- case LibFunc_ceil:
- return replaceUnaryCall(CI, Builder, Intrinsic::ceil);
- case LibFunc_floor:
- return replaceUnaryCall(CI, Builder, Intrinsic::floor);
- case LibFunc_round:
- return replaceUnaryCall(CI, Builder, Intrinsic::round);
- case LibFunc_roundeven:
- return replaceUnaryCall(CI, Builder, Intrinsic::roundeven);
- case LibFunc_nearbyint:
- return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
- case LibFunc_rint:
- return replaceUnaryCall(CI, Builder, Intrinsic::rint);
- case LibFunc_trunc:
- return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
- case LibFunc_acos:
- case LibFunc_acosh:
- case LibFunc_asin:
- case LibFunc_asinh:
- case LibFunc_atan:
- case LibFunc_atanh:
- case LibFunc_cbrt:
- case LibFunc_cosh:
- case LibFunc_exp:
- case LibFunc_exp10:
- case LibFunc_expm1:
- case LibFunc_cos:
- case LibFunc_sin:
- case LibFunc_sinh:
- case LibFunc_tanh:
- if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName()))
- return optimizeUnaryDoubleFP(CI, Builder, true);
- return nullptr;
- case LibFunc_copysign:
- if (hasFloatVersion(CI->getCalledFunction()->getName()))
- return optimizeBinaryDoubleFP(CI, Builder);
- return nullptr;
- case LibFunc_fminf:
- case LibFunc_fmin:
- case LibFunc_fminl:
- case LibFunc_fmaxf:
- case LibFunc_fmax:
- case LibFunc_fmaxl:
- return optimizeFMinFMax(CI, Builder);
- case LibFunc_cabs:
- case LibFunc_cabsf:
- case LibFunc_cabsl:
- return optimizeCAbs(CI, Builder);
- default:
- return nullptr;
- }
-}
-
-Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
- // TODO: Split out the code below that operates on FP calls so that
-  // we can allow non-FP calls with the StrictFP attribute to be
- // optimized.
- if (CI->isNoBuiltin())
- return nullptr;
-
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- bool isCallingConvC = isCallingConvCCompatible(CI);
-
- SmallVector<OperandBundleDef, 2> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
-
- IRBuilderBase::OperandBundlesGuard Guard(Builder);
- Builder.setDefaultOperandBundles(OpBundles);
-
- // Command-line parameter overrides instruction attribute.
- // This can't be moved to optimizeFloatingPointLibCall() because it may be
- // used by the intrinsic optimizations.
- if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
- UnsafeFPShrink = EnableUnsafeFPShrink;
- else if (isa<FPMathOperator>(CI) && CI->isFast())
- UnsafeFPShrink = true;
-
- // First, check for intrinsics.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- if (!isCallingConvC)
- return nullptr;
- // The FP intrinsics have corresponding constrained versions so we don't
- // need to check for the StrictFP attribute here.
- switch (II->getIntrinsicID()) {
- case Intrinsic::pow:
- return optimizePow(CI, Builder);
- case Intrinsic::exp2:
- return optimizeExp2(CI, Builder);
- case Intrinsic::log:
- case Intrinsic::log2:
- case Intrinsic::log10:
- return optimizeLog(CI, Builder);
- case Intrinsic::sqrt:
- return optimizeSqrt(CI, Builder);
- // TODO: Use foldMallocMemset() with memset intrinsic.
- case Intrinsic::memset:
- return optimizeMemSet(CI, Builder);
- case Intrinsic::memcpy:
- return optimizeMemCpy(CI, Builder);
- case Intrinsic::memmove:
- return optimizeMemMove(CI, Builder);
- default:
- return nullptr;
- }
- }
-
- // Also try to simplify calls to fortified library functions.
- if (Value *SimplifiedFortifiedCI =
- FortifiedSimplifier.optimizeCall(CI, Builder)) {
- // Try to further simplify the result.
- CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
- if (SimplifiedCI && SimplifiedCI->getCalledFunction()) {
- // Ensure that SimplifiedCI's uses are complete, since some calls have
- // their uses analyzed.
- replaceAllUsesWith(CI, SimplifiedCI);
-
- // Set insertion point to SimplifiedCI to guarantee we reach all uses
- // we might replace later on.
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(SimplifiedCI);
- if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
- // If we were able to further simplify, remove the now redundant call.
- substituteInParent(SimplifiedCI, V);
- return V;
- }
- }
- return SimplifiedFortifiedCI;
- }
-
- // Then check for known library functions.
- if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
- // We never change the calling convention.
- if (!ignoreCallingConv(Func) && !isCallingConvC)
- return nullptr;
- if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
- return V;
- if (Value *V = optimizeFloatingPointLibCall(CI, Func, Builder))
- return V;
- switch (Func) {
- case LibFunc_ffs:
- case LibFunc_ffsl:
- case LibFunc_ffsll:
- return optimizeFFS(CI, Builder);
- case LibFunc_fls:
- case LibFunc_flsl:
- case LibFunc_flsll:
- return optimizeFls(CI, Builder);
- case LibFunc_abs:
- case LibFunc_labs:
- case LibFunc_llabs:
- return optimizeAbs(CI, Builder);
- case LibFunc_isdigit:
- return optimizeIsDigit(CI, Builder);
- case LibFunc_isascii:
- return optimizeIsAscii(CI, Builder);
- case LibFunc_toascii:
- return optimizeToAscii(CI, Builder);
- case LibFunc_atoi:
- case LibFunc_atol:
- case LibFunc_atoll:
- return optimizeAtoi(CI, Builder);
- case LibFunc_strtol:
- case LibFunc_strtoll:
- return optimizeStrtol(CI, Builder);
- case LibFunc_printf:
- return optimizePrintF(CI, Builder);
- case LibFunc_sprintf:
- return optimizeSPrintF(CI, Builder);
- case LibFunc_snprintf:
- return optimizeSnPrintF(CI, Builder);
- case LibFunc_fprintf:
- return optimizeFPrintF(CI, Builder);
- case LibFunc_fwrite:
- return optimizeFWrite(CI, Builder);
- case LibFunc_fputs:
- return optimizeFPuts(CI, Builder);
- case LibFunc_puts:
- return optimizePuts(CI, Builder);
- case LibFunc_perror:
- return optimizeErrorReporting(CI, Builder);
- case LibFunc_vfprintf:
- case LibFunc_fiprintf:
- return optimizeErrorReporting(CI, Builder, 0);
- default:
- return nullptr;
- }
- }
- return nullptr;
-}
-
-LibCallSimplifier::LibCallSimplifier(
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- OptimizationRemarkEmitter &ORE,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- function_ref<void(Instruction *, Value *)> Replacer,
- function_ref<void(Instruction *)> Eraser)
- : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI),
- UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
-
-void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
- // Indirect through the replacer used in this instance.
- Replacer(I, With);
-}
-
-void LibCallSimplifier::eraseFromParent(Instruction *I) {
- Eraser(I);
-}
-
-// TODO:
-// Additional cases that we need to add to this file:
-//
-// cbrt:
-// * cbrt(expN(X)) -> expN(x/3)
-// * cbrt(sqrt(x)) -> pow(x,1/6)
-// * cbrt(cbrt(x)) -> pow(x,1/9)
-//
-// exp, expf, expl:
-// * exp(log(x)) -> x
-//
-// log, logf, logl:
-// * log(exp(x)) -> x
-// * log(exp(y)) -> y*log(e)
-// * log(exp10(y)) -> y*log(10)
-// * log(sqrt(x)) -> 0.5*log(x)
-//
-// pow, powf, powl:
-// * pow(sqrt(x),y) -> pow(x,y*0.5)
-// * pow(pow(x,y),z)-> pow(x,y*z)
-//
-// signbit:
-// * signbit(cnst) -> cnst'
-// * signbit(nncst) -> 0 (if pstv is a non-negative constant)
-//
-// sqrt, sqrtf, sqrtl:
-// * sqrt(expN(x)) -> expN(x*0.5)
-// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
-// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
-//
-
-//===----------------------------------------------------------------------===//
-// Fortified Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-bool
-FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
- unsigned ObjSizeOp,
- Optional<unsigned> SizeOp,
- Optional<unsigned> StrOp,
- Optional<unsigned> FlagOp) {
- // If this function takes a flag argument, the implementation may use it to
- // perform extra checks. Don't fold into the non-checking variant.
- if (FlagOp) {
- ConstantInt *Flag = dyn_cast<ConstantInt>(CI->getArgOperand(*FlagOp));
- if (!Flag || !Flag->isZero())
- return false;
- }
-
- if (SizeOp && CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(*SizeOp))
- return true;
-
- if (ConstantInt *ObjSizeCI =
- dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
- if (ObjSizeCI->isMinusOne())
- return true;
- // If the object size wasn't -1 (unknown), bail out if we were asked to.
- if (OnlyLowerUnknownSize)
- return false;
- if (StrOp) {
- uint64_t Len = GetStringLength(CI->getArgOperand(*StrOp));
- // If the length is 0 we don't know how long it is and so we can't
- // remove the check.
- if (Len)
- annotateDereferenceableBytes(CI, *StrOp, Len);
- else
- return false;
- return ObjSizeCI->getZExtValue() >= Len;
- }
-
- if (SizeOp) {
- if (ConstantInt *SizeCI =
- dyn_cast<ConstantInt>(CI->getArgOperand(*SizeOp)))
- return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue();
- }
- }
- return false;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- CallInst *NewCI =
- B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
- Align(1), CI->getArgOperand(2));
- NewCI->setAttributes(CI->getAttributes());
+ Value *Len = emitStrLen(CI->getArgOperand(2), B, DL, TLI);
+ if (!Len)
+ return nullptr;
+ Value *IncLen =
+ B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(2),
+ Align(1), IncLen);
+
+ // The sprintf result is the unincremented number of bytes in the string.
+ return B.CreateIntCast(Len, CI->getType(), false);
+ }
+ return nullptr;
+}
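A sketch of the "%s" path above when the sprintf result is used, so the strlen-plus-memcpy form (rather than strcpy) is emitted; the function name is illustrative:

    #include <cstring>

    int sprintf_s_lowered(char *dst, const char *src) {
      const std::size_t len = std::strlen(src);
      std::memcpy(dst, src, len + 1);  // copy the terminating NUL as well
      return static_cast<int>(len);    // sprintf returns the length, excluding the NUL
    }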
+
+Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizeSPrintFString(CI, B)) {
+ return V;
+ }
+
+ // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+ // point arguments.
+ if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ FunctionCallee SIPrintFFn =
+ M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ // sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit
+ // floating point arguments.
+ if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ auto SmallSPrintFFn =
+ M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf),
+ FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SmallSPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
+  // Check for a constant size argument.
+ ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!Size)
+ return nullptr;
+
+ uint64_t N = Size->getZExtValue();
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr))
+ return nullptr;
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumArgOperands() == 3) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ else if (N < FormatStr.size() + 1)
+ return nullptr;
+
+ // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt,
+ // strlen(fmt)+1)
+ B.CreateMemCpy(
+ CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1)); // Copy the null byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() == 2 && FormatStr[0] == '%' &&
+ CI->getNumArgOperands() == 4) {
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), 1);
+ else if (N == 1)
+ return nullptr;
+
+ // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(3)->getType()->isIntegerTy())
+ return nullptr;
+ Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char");
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1)
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(3), Str))
+ return nullptr;
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), Str.size());
+ else if (N < Str.size() + 1)
+ return nullptr;
+
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3),
+ Align(1), ConstantInt::get(CI->getType(), Str.size() + 1));
+
+ // The snprintf result is the unincremented number of bytes in the string.
+ return ConstantInt::get(CI->getType(), Str.size());
+ }
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) {
+ if (Value *V = optimizeSnPrintFString(CI, B)) {
+ return V;
+ }
+
+ if (isKnownNonZero(CI->getOperand(1), DL))
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
+ optimizeErrorReporting(CI, B, 0);
+
+ // All the optimizations depend on the format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return nullptr;
+
+ // Do not do any of the following transformations if the fprintf return
+ // value is used, in general the fprintf return value is not compatible
+ // with fwrite(), fputc() or fputs().
+ if (!CI->use_empty())
+ return nullptr;
+
+ // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+ if (CI->getNumArgOperands() == 2) {
+ // Could handle %% -> % if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // We found a format specifier.
+
+ return emitFWrite(
+ CI->getArgOperand(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
+ CI->getArgOperand(0), B, DL, TLI);
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return nullptr;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // fprintf(F, "%c", chr) --> fputc(chr, F)
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+ return nullptr;
+ return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ }
+
+ if (FormatStr[1] == 's') {
+ // fprintf(F, "%s", str) --> fputs(str, F)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy())
+ return nullptr;
+ return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizeFPrintFString(CI, B)) {
+ return V;
+ }
+
+ // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+ // floating point arguments.
+ if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ FunctionCallee FIPrintFFn =
+ M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(FIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ // fprintf(stream, format, ...) -> __small_fprintf(stream, format, ...) if no
+ // 128-bit floating point arguments.
+ if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ auto SmallFPrintFFn =
+ M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf),
+ FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SmallFPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilderBase &B) {
+ optimizeErrorReporting(CI, B, 3);
+
+ // Get the element size and count.
+ ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (SizeC && CountC) {
+ uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
+
+ // If this is writing zero records, remove the call (it's a noop).
+ if (Bytes == 0)
+ return ConstantInt::get(CI->getType(), 0);
+
+ // If this is writing one byte, turn it into fputc.
+    // This optimisation is only valid if the return value is unused.
+ if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+ Value *Char = B.CreateLoad(B.getInt8Ty(),
+ castToCStr(CI->getArgOperand(0), B), "char");
+ Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+ }
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) {
+ optimizeErrorReporting(CI, B, 1);
+
+ // Don't rewrite fputs to fwrite when optimising for size because fwrite
+ // requires more arguments and thus extra MOVs are required.
+ bool OptForSize = CI->getFunction()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ if (OptForSize)
+ return nullptr;
+
+ // We can't optimize if return value is used.
+ if (!CI->use_empty())
+ return nullptr;
+
+ // fputs(s,F) --> fwrite(s,strlen(s),1,F)
+ uint64_t Len = GetStringLength(CI->getArgOperand(0));
+ if (!Len)
+ return nullptr;
+
+ // Known to have no uses (see above).
+ return emitFWrite(
+ CI->getArgOperand(0),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
+ CI->getArgOperand(1), B, DL, TLI);
+}
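Sketches of the two I/O rewrites above, assuming the call results are unused and, for fputs, a string whose length is known; names are illustrative:

    #include <cstdio>

    void write_first_byte(const char *s, FILE *f) {
      // fwrite(s, 1, 1, f) -> fputc(s[0], f)
      fputc(s[0], f);
    }

    void write_banner(FILE *f) {
      // fputs("banner", f) -> fwrite("banner", 6, 1, f), since strlen("banner") == 6
      fwrite("banner", 6, 1, f);
    }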
+
+Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
+ annotateNonNullBasedOnAccess(CI, 0);
+ if (!CI->use_empty())
+ return nullptr;
+
+ // Check for a constant string.
+ // puts("") -> putchar('\n')
+ StringRef Str;
+ if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty())
+ return emitPutChar(B.getInt32('\n'), B, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
+ // bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
+ return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0),
+ Align(1), CI->getArgOperand(2));
+}
+
+bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
+ LibFunc Func;
+ SmallString<20> FloatFuncName = FuncName;
+ FloatFuncName += 'f';
+ if (TLI->getLibFunc(FloatFuncName, Func))
+ return TLI->has(Func);
+ return false;
+}
+
+Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
+ IRBuilderBase &Builder) {
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ // Check for string/memory library functions.
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ // Make sure we never change the calling convention.
+ assert((ignoreCallingConv(Func) ||
+ isCallingConvCCompatible(CI)) &&
+ "Optimizing string/memory libcall would change the calling convention");
+ switch (Func) {
+ case LibFunc_strcat:
+ return optimizeStrCat(CI, Builder);
+ case LibFunc_strncat:
+ return optimizeStrNCat(CI, Builder);
+ case LibFunc_strchr:
+ return optimizeStrChr(CI, Builder);
+ case LibFunc_strrchr:
+ return optimizeStrRChr(CI, Builder);
+ case LibFunc_strcmp:
+ return optimizeStrCmp(CI, Builder);
+ case LibFunc_strncmp:
+ return optimizeStrNCmp(CI, Builder);
+ case LibFunc_strcpy:
+ return optimizeStrCpy(CI, Builder);
+ case LibFunc_stpcpy:
+ return optimizeStpCpy(CI, Builder);
+ case LibFunc_strncpy:
+ return optimizeStrNCpy(CI, Builder);
+ case LibFunc_strlen:
+ return optimizeStrLen(CI, Builder);
+ case LibFunc_strpbrk:
+ return optimizeStrPBrk(CI, Builder);
+ case LibFunc_strndup:
+ return optimizeStrNDup(CI, Builder);
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
+ return optimizeStrTo(CI, Builder);
+ case LibFunc_strspn:
+ return optimizeStrSpn(CI, Builder);
+ case LibFunc_strcspn:
+ return optimizeStrCSpn(CI, Builder);
+ case LibFunc_strstr:
+ return optimizeStrStr(CI, Builder);
+ case LibFunc_memchr:
+ return optimizeMemChr(CI, Builder);
+ case LibFunc_memrchr:
+ return optimizeMemRChr(CI, Builder);
+ case LibFunc_bcmp:
+ return optimizeBCmp(CI, Builder);
+ case LibFunc_memcmp:
+ return optimizeMemCmp(CI, Builder);
+ case LibFunc_memcpy:
+ return optimizeMemCpy(CI, Builder);
+ case LibFunc_memccpy:
+ return optimizeMemCCpy(CI, Builder);
+ case LibFunc_mempcpy:
+ return optimizeMemPCpy(CI, Builder);
+ case LibFunc_memmove:
+ return optimizeMemMove(CI, Builder);
+ case LibFunc_memset:
+ return optimizeMemSet(CI, Builder);
+ case LibFunc_realloc:
+ return optimizeRealloc(CI, Builder);
+ case LibFunc_wcslen:
+ return optimizeWcslen(CI, Builder);
+ case LibFunc_bcopy:
+ return optimizeBCopy(CI, Builder);
+ default:
+ break;
+ }
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
+ LibFunc Func,
+ IRBuilderBase &Builder) {
+ // Don't optimize calls that require strict floating point semantics.
+ if (CI->isStrictFP())
+ return nullptr;
+
+ if (Value *V = optimizeTrigReflections(CI, Func, Builder))
+ return V;
+
+ switch (Func) {
+ case LibFunc_sinpif:
+ case LibFunc_sinpi:
+ case LibFunc_cospif:
+ case LibFunc_cospi:
+ return optimizeSinCosPi(CI, Builder);
+ case LibFunc_powf:
+ case LibFunc_pow:
+ case LibFunc_powl:
+ return optimizePow(CI, Builder);
+ case LibFunc_exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ return optimizeExp2(CI, Builder);
+ case LibFunc_fabsf:
+ case LibFunc_fabs:
+ case LibFunc_fabsl:
+ return replaceUnaryCall(CI, Builder, Intrinsic::fabs);
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtl:
+ return optimizeSqrt(CI, Builder);
+ case LibFunc_logf:
+ case LibFunc_log:
+ case LibFunc_logl:
+ case LibFunc_log10f:
+ case LibFunc_log10:
+ case LibFunc_log10l:
+ case LibFunc_log1pf:
+ case LibFunc_log1p:
+ case LibFunc_log1pl:
+ case LibFunc_log2f:
+ case LibFunc_log2:
+ case LibFunc_log2l:
+ case LibFunc_logbf:
+ case LibFunc_logb:
+ case LibFunc_logbl:
+ return optimizeLog(CI, Builder);
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
+ return optimizeTan(CI, Builder);
+ case LibFunc_ceil:
+ return replaceUnaryCall(CI, Builder, Intrinsic::ceil);
+ case LibFunc_floor:
+ return replaceUnaryCall(CI, Builder, Intrinsic::floor);
+ case LibFunc_round:
+ return replaceUnaryCall(CI, Builder, Intrinsic::round);
+ case LibFunc_roundeven:
+ return replaceUnaryCall(CI, Builder, Intrinsic::roundeven);
+ case LibFunc_nearbyint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
+ case LibFunc_rint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::rint);
+ case LibFunc_trunc:
+ return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
+ case LibFunc_acos:
+ case LibFunc_acosh:
+ case LibFunc_asin:
+ case LibFunc_asinh:
+ case LibFunc_atan:
+ case LibFunc_atanh:
+ case LibFunc_cbrt:
+ case LibFunc_cosh:
+ case LibFunc_exp:
+ case LibFunc_exp10:
+ case LibFunc_expm1:
+ case LibFunc_cos:
+ case LibFunc_sin:
+ case LibFunc_sinh:
+ case LibFunc_tanh:
+ if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName()))
+ return optimizeUnaryDoubleFP(CI, Builder, true);
+ return nullptr;
+ case LibFunc_copysign:
+ if (hasFloatVersion(CI->getCalledFunction()->getName()))
+ return optimizeBinaryDoubleFP(CI, Builder);
+ return nullptr;
+ case LibFunc_fminf:
+ case LibFunc_fmin:
+ case LibFunc_fminl:
+ case LibFunc_fmaxf:
+ case LibFunc_fmax:
+ case LibFunc_fmaxl:
+ return optimizeFMinFMax(CI, Builder);
+ case LibFunc_cabs:
+ case LibFunc_cabsf:
+ case LibFunc_cabsl:
+ return optimizeCAbs(CI, Builder);
+ default:
+ return nullptr;
+ }
+}
+
+Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
+ // TODO: Split out the code below that operates on FP calls so that
+  // we can allow non-FP calls with the StrictFP attribute to be
+ // optimized.
+ if (CI->isNoBuiltin())
+ return nullptr;
+
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ IRBuilderBase::OperandBundlesGuard Guard(Builder);
+ Builder.setDefaultOperandBundles(OpBundles);
+
+ // Command-line parameter overrides instruction attribute.
+ // This can't be moved to optimizeFloatingPointLibCall() because it may be
+ // used by the intrinsic optimizations.
+ if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
+ UnsafeFPShrink = EnableUnsafeFPShrink;
+ else if (isa<FPMathOperator>(CI) && CI->isFast())
+ UnsafeFPShrink = true;
+
+ // First, check for intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (!isCallingConvC)
+ return nullptr;
+ // The FP intrinsics have corresponding constrained versions so we don't
+ // need to check for the StrictFP attribute here.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::pow:
+ return optimizePow(CI, Builder);
+ case Intrinsic::exp2:
+ return optimizeExp2(CI, Builder);
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ return optimizeLog(CI, Builder);
+ case Intrinsic::sqrt:
+ return optimizeSqrt(CI, Builder);
+ // TODO: Use foldMallocMemset() with memset intrinsic.
+ case Intrinsic::memset:
+ return optimizeMemSet(CI, Builder);
+ case Intrinsic::memcpy:
+ return optimizeMemCpy(CI, Builder);
+ case Intrinsic::memmove:
+ return optimizeMemMove(CI, Builder);
+ default:
+ return nullptr;
+ }
+ }
+
+ // Also try to simplify calls to fortified library functions.
+ if (Value *SimplifiedFortifiedCI =
+ FortifiedSimplifier.optimizeCall(CI, Builder)) {
+ // Try to further simplify the result.
+ CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
+ if (SimplifiedCI && SimplifiedCI->getCalledFunction()) {
+ // Ensure that SimplifiedCI's uses are complete, since some calls have
+ // their uses analyzed.
+ replaceAllUsesWith(CI, SimplifiedCI);
+
+ // Set insertion point to SimplifiedCI to guarantee we reach all uses
+ // we might replace later on.
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(SimplifiedCI);
+ if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
+ // If we were able to further simplify, remove the now redundant call.
+ substituteInParent(SimplifiedCI, V);
+ return V;
+ }
+ }
+ return SimplifiedFortifiedCI;
+ }
+
+ // Then check for known library functions.
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ // We never change the calling convention.
+ if (!ignoreCallingConv(Func) && !isCallingConvC)
+ return nullptr;
+ if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
+ return V;
+ if (Value *V = optimizeFloatingPointLibCall(CI, Func, Builder))
+ return V;
+ switch (Func) {
+ case LibFunc_ffs:
+ case LibFunc_ffsl:
+ case LibFunc_ffsll:
+ return optimizeFFS(CI, Builder);
+ case LibFunc_fls:
+ case LibFunc_flsl:
+ case LibFunc_flsll:
+ return optimizeFls(CI, Builder);
+ case LibFunc_abs:
+ case LibFunc_labs:
+ case LibFunc_llabs:
+ return optimizeAbs(CI, Builder);
+ case LibFunc_isdigit:
+ return optimizeIsDigit(CI, Builder);
+ case LibFunc_isascii:
+ return optimizeIsAscii(CI, Builder);
+ case LibFunc_toascii:
+ return optimizeToAscii(CI, Builder);
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atoll:
+ return optimizeAtoi(CI, Builder);
+ case LibFunc_strtol:
+ case LibFunc_strtoll:
+ return optimizeStrtol(CI, Builder);
+ case LibFunc_printf:
+ return optimizePrintF(CI, Builder);
+ case LibFunc_sprintf:
+ return optimizeSPrintF(CI, Builder);
+ case LibFunc_snprintf:
+ return optimizeSnPrintF(CI, Builder);
+ case LibFunc_fprintf:
+ return optimizeFPrintF(CI, Builder);
+ case LibFunc_fwrite:
+ return optimizeFWrite(CI, Builder);
+ case LibFunc_fputs:
+ return optimizeFPuts(CI, Builder);
+ case LibFunc_puts:
+ return optimizePuts(CI, Builder);
+ case LibFunc_perror:
+ return optimizeErrorReporting(CI, Builder);
+ case LibFunc_vfprintf:
+ case LibFunc_fiprintf:
+ return optimizeErrorReporting(CI, Builder, 0);
+ default:
+ return nullptr;
+ }
+ }
+ return nullptr;
+}
+
+LibCallSimplifier::LibCallSimplifier(
+ const DataLayout &DL, const TargetLibraryInfo *TLI,
+ OptimizationRemarkEmitter &ORE,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ function_ref<void(Instruction *, Value *)> Replacer,
+ function_ref<void(Instruction *)> Eraser)
+ : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI),
+ UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
+
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
+ // Indirect through the replacer used in this instance.
+ Replacer(I, With);
+}
+
+void LibCallSimplifier::eraseFromParent(Instruction *I) {
+ Eraser(I);
+}
+
+// TODO:
+// Additional cases that we need to add to this file:
+//
+// cbrt:
+// * cbrt(expN(X)) -> expN(x/3)
+// * cbrt(sqrt(x)) -> pow(x,1/6)
+// * cbrt(cbrt(x)) -> pow(x,1/9)
+//
+// exp, expf, expl:
+// * exp(log(x)) -> x
+//
+// log, logf, logl:
+// * log(exp(x)) -> x
+// * log(exp(y)) -> y*log(e)
+// * log(exp10(y)) -> y*log(10)
+// * log(sqrt(x)) -> 0.5*log(x)
+//
+// pow, powf, powl:
+// * pow(sqrt(x),y) -> pow(x,y*0.5)
+// * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// signbit:
+// * signbit(cnst) -> cnst'
+// * signbit(nncst) -> 0 (if pstv is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+// * sqrt(expN(x)) -> expN(x*0.5)
+// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+
+//===----------------------------------------------------------------------===//
+// Fortified Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+bool
+FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
+ unsigned ObjSizeOp,
+ Optional<unsigned> SizeOp,
+ Optional<unsigned> StrOp,
+ Optional<unsigned> FlagOp) {
+ // If this function takes a flag argument, the implementation may use it to
+ // perform extra checks. Don't fold into the non-checking variant.
+ if (FlagOp) {
+ ConstantInt *Flag = dyn_cast<ConstantInt>(CI->getArgOperand(*FlagOp));
+ if (!Flag || !Flag->isZero())
+ return false;
+ }
+
+ if (SizeOp && CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(*SizeOp))
+ return true;
+
+ if (ConstantInt *ObjSizeCI =
+ dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
+ if (ObjSizeCI->isMinusOne())
+ return true;
+ // If the object size wasn't -1 (unknown), bail out if we were asked to.
+ if (OnlyLowerUnknownSize)
+ return false;
+ if (StrOp) {
+ uint64_t Len = GetStringLength(CI->getArgOperand(*StrOp));
+ // If the length is 0 we don't know how long it is and so we can't
+ // remove the check.
+ if (Len)
+ annotateDereferenceableBytes(CI, *StrOp, Len);
+ else
+ return false;
+ return ObjSizeCI->getZExtValue() >= Len;
+ }
+
+ if (SizeOp) {
+ if (ConstantInt *SizeCI =
+ dyn_cast<ConstantInt>(CI->getArgOperand(*SizeOp)))
+ return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue();
+ }
+ }
+ return false;
+}
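For context, a hedged source-level sketch of a checked call this predicate would accept. It assumes the Clang/GCC builtins __builtin___memcpy_chk and __builtin_object_size; because the destination's object size (64) is a constant at least as large as the constant copy length (16), the checked call can be lowered to a plain memcpy. The function name is illustrative.

    void copy_header(char (&dst)[64], const char *src) {
      // ObjSizeOp constant 64 >= SizeOp constant 16, so the runtime check is dropped.
      __builtin___memcpy_chk(dst, src, 16, __builtin_object_size(dst, 0));
    }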
+
+Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ CallInst *NewCI =
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
+ Align(1), CI->getArgOperand(2));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
- }
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- CallInst *NewCI =
- B.CreateMemMove(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
- Align(1), CI->getArgOperand(2));
- NewCI->setAttributes(CI->getAttributes());
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ CallInst *NewCI =
+ B.CreateMemMove(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
+ Align(1), CI->getArgOperand(2));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
- }
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
- IRBuilderBase &B) {
- // TODO: Try foldMallocMemset() here.
-
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val,
- CI->getArgOperand(2), Align(1));
- NewCI->setAttributes(CI->getAttributes());
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
+ IRBuilderBase &B) {
+ // TODO: Try foldMallocMemset() here.
+
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val,
+ CI->getArgOperand(2), Align(1));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
- }
- return nullptr;
-}
-
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI,
IRBuilderBase &B) {
const DataLayout &DL = CI->getModule()->getDataLayout();
@@ -3331,233 +3331,233 @@ Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI,
return nullptr;
}
-Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
- IRBuilderBase &B,
- LibFunc Func) {
- const DataLayout &DL = CI->getModule()->getDataLayout();
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
- *ObjSize = CI->getArgOperand(2);
-
- // __stpcpy_chk(x,x,...) -> x+strlen(x)
- if (Func == LibFunc_stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
- Value *StrLen = emitStrLen(Src, B, DL, TLI);
- return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
- }
-
- // If a) we don't have any length information, or b) we know this will
- // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
- // st[rp]cpy_chk call which may fail at runtime if the size is too long.
- // TODO: It might be nice to get a maximum length out of the possible
- // string lengths for varying.
- if (isFortifiedCallFoldable(CI, 2, None, 1)) {
- if (Func == LibFunc_strcpy_chk)
- return emitStrCpy(Dst, Src, B, TLI);
- else
- return emitStpCpy(Dst, Src, B, TLI);
- }
-
- if (OnlyLowerUnknownSize)
- return nullptr;
-
-  // Maybe we can still fold __st[rp]cpy_chk to __memcpy_chk.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
-
- Type *SizeTTy = DL.getIntPtrType(CI->getContext());
- Value *LenV = ConstantInt::get(SizeTTy, Len);
- Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
- // If the function was an __stpcpy_chk, and we were able to fold it into
- // a __memcpy_chk, we still need to return the correct end pointer.
- if (Ret && Func == LibFunc_stpcpy_chk)
- return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
- return Ret;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 1, None, 0))
- return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(),
- TLI);
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
- IRBuilderBase &B,
- LibFunc Func) {
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- if (Func == LibFunc_strncpy_chk)
- return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
- else
- return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
- }
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 4, 3))
- return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), CI->getArgOperand(3), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) {
+Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
+ IRBuilderBase &B,
+ LibFunc Func) {
+ const DataLayout &DL = CI->getModule()->getDataLayout();
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
+ *ObjSize = CI->getArgOperand(2);
+
+ // __stpcpy_chk(x,x,...) -> x+strlen(x)
+ if (Func == LibFunc_stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
+ return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
+ }
+
+ // If a) we don't have any length information, or b) we know this will
+ // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
+ // st[rp]cpy_chk call which may fail at runtime if the size is too long.
+ // TODO: It might be nice to get a maximum length out of the possible
+ // string lengths for varying.
+ if (isFortifiedCallFoldable(CI, 2, None, 1)) {
+ if (Func == LibFunc_strcpy_chk)
+ return emitStrCpy(Dst, Src, B, TLI);
+ else
+ return emitStpCpy(Dst, Src, B, TLI);
+ }
+
+ if (OnlyLowerUnknownSize)
+ return nullptr;
+
+  // Maybe we can still fold __st[rp]cpy_chk to __memcpy_chk.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+
+ Type *SizeTTy = DL.getIntPtrType(CI->getContext());
+ Value *LenV = ConstantInt::get(SizeTTy, Len);
+ Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
+ // If the function was an __stpcpy_chk, and we were able to fold it into
+ // a __memcpy_chk, we still need to return the correct end pointer.
+ if (Ret && Func == LibFunc_stpcpy_chk)
+ return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
+ return Ret;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 1, None, 0))
+ return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(),
+ TLI);
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
+ IRBuilderBase &B,
+ LibFunc Func) {
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ if (Func == LibFunc_strncpy_chk)
+ return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+ else
+ return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 4, 3))
+ return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) {
SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 5));
- return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(4), VariadicArgs, B, TLI);
- }
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 2, None, None, 1)) {
+ return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4), VariadicArgs, B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 2, None, None, 1)) {
SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 4));
- return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs,
- B, TLI);
- }
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 2))
- return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3))
- return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3))
- return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3))
- return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 1, None, 2))
- return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(4), CI->getArgOperand(5), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 2, None, None, 1))
- return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
- CI->getArgOperand(4), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI,
- IRBuilderBase &Builder) {
- // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here.
- // Some clang users checked for _chk libcall availability using:
- // __has_builtin(__builtin___memcpy_chk)
- // When compiling with -fno-builtin, this is always true.
- // When passing -ffreestanding/-mkernel, which both imply -fno-builtin, we
- // end up with fortified libcalls, which isn't acceptable in a freestanding
- // environment which only provides their non-fortified counterparts.
- //
- // Until we change clang and/or teach external users to check for availability
- // differently, disregard the "nobuiltin" attribute and TLI::has.
- //
- // PR23093.
-
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- bool isCallingConvC = isCallingConvCCompatible(CI);
-
- SmallVector<OperandBundleDef, 2> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
-
- IRBuilderBase::OperandBundlesGuard Guard(Builder);
- Builder.setDefaultOperandBundles(OpBundles);
-
-  // First, check that this is a known library function and that the prototype
- // is correct.
- if (!TLI->getLibFunc(*Callee, Func))
- return nullptr;
-
- // We never change the calling convention.
- if (!ignoreCallingConv(Func) && !isCallingConvC)
- return nullptr;
-
- switch (Func) {
- case LibFunc_memcpy_chk:
- return optimizeMemCpyChk(CI, Builder);
+ return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs,
+ B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 2))
+ return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3))
+ return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3))
+ return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3))
+ return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 1, None, 2))
+ return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4), CI->getArgOperand(5), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 2, None, None, 1))
+ return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
+ CI->getArgOperand(4), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI,
+ IRBuilderBase &Builder) {
+ // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here.
+ // Some clang users checked for _chk libcall availability using:
+ // __has_builtin(__builtin___memcpy_chk)
+ // When compiling with -fno-builtin, this is always true.
+ // When passing -ffreestanding/-mkernel, which both imply -fno-builtin, we
+ // end up with fortified libcalls, which isn't acceptable in a freestanding
+ // environment which only provides their non-fortified counterparts.
+ //
+ // Until we change clang and/or teach external users to check for availability
+ // differently, disregard the "nobuiltin" attribute and TLI::has.
+ //
+ // PR23093.
+
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ IRBuilderBase::OperandBundlesGuard Guard(Builder);
+ Builder.setDefaultOperandBundles(OpBundles);
+
+  // First, check that this is a known library function and that the prototype
+ // is correct.
+ if (!TLI->getLibFunc(*Callee, Func))
+ return nullptr;
+
+ // We never change the calling convention.
+ if (!ignoreCallingConv(Func) && !isCallingConvC)
+ return nullptr;
+
+ switch (Func) {
+ case LibFunc_memcpy_chk:
+ return optimizeMemCpyChk(CI, Builder);
case LibFunc_mempcpy_chk:
return optimizeMemPCpyChk(CI, Builder);
- case LibFunc_memmove_chk:
- return optimizeMemMoveChk(CI, Builder);
- case LibFunc_memset_chk:
- return optimizeMemSetChk(CI, Builder);
- case LibFunc_stpcpy_chk:
- case LibFunc_strcpy_chk:
- return optimizeStrpCpyChk(CI, Builder, Func);
- case LibFunc_strlen_chk:
- return optimizeStrLenChk(CI, Builder);
- case LibFunc_stpncpy_chk:
- case LibFunc_strncpy_chk:
- return optimizeStrpNCpyChk(CI, Builder, Func);
- case LibFunc_memccpy_chk:
- return optimizeMemCCpyChk(CI, Builder);
- case LibFunc_snprintf_chk:
- return optimizeSNPrintfChk(CI, Builder);
- case LibFunc_sprintf_chk:
- return optimizeSPrintfChk(CI, Builder);
- case LibFunc_strcat_chk:
- return optimizeStrCatChk(CI, Builder);
- case LibFunc_strlcat_chk:
- return optimizeStrLCat(CI, Builder);
- case LibFunc_strncat_chk:
- return optimizeStrNCatChk(CI, Builder);
- case LibFunc_strlcpy_chk:
- return optimizeStrLCpyChk(CI, Builder);
- case LibFunc_vsnprintf_chk:
- return optimizeVSNPrintfChk(CI, Builder);
- case LibFunc_vsprintf_chk:
- return optimizeVSPrintfChk(CI, Builder);
- default:
- break;
- }
- return nullptr;
-}
-
-FortifiedLibCallSimplifier::FortifiedLibCallSimplifier(
- const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize)
- : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {}
+ case LibFunc_memmove_chk:
+ return optimizeMemMoveChk(CI, Builder);
+ case LibFunc_memset_chk:
+ return optimizeMemSetChk(CI, Builder);
+ case LibFunc_stpcpy_chk:
+ case LibFunc_strcpy_chk:
+ return optimizeStrpCpyChk(CI, Builder, Func);
+ case LibFunc_strlen_chk:
+ return optimizeStrLenChk(CI, Builder);
+ case LibFunc_stpncpy_chk:
+ case LibFunc_strncpy_chk:
+ return optimizeStrpNCpyChk(CI, Builder, Func);
+ case LibFunc_memccpy_chk:
+ return optimizeMemCCpyChk(CI, Builder);
+ case LibFunc_snprintf_chk:
+ return optimizeSNPrintfChk(CI, Builder);
+ case LibFunc_sprintf_chk:
+ return optimizeSPrintfChk(CI, Builder);
+ case LibFunc_strcat_chk:
+ return optimizeStrCatChk(CI, Builder);
+ case LibFunc_strlcat_chk:
+ return optimizeStrLCat(CI, Builder);
+ case LibFunc_strncat_chk:
+ return optimizeStrNCatChk(CI, Builder);
+ case LibFunc_strlcpy_chk:
+ return optimizeStrLCpyChk(CI, Builder);
+ case LibFunc_vsnprintf_chk:
+ return optimizeVSNPrintfChk(CI, Builder);
+ case LibFunc_vsprintf_chk:
+ return optimizeVSPrintfChk(CI, Builder);
+ default:
+ break;
+ }
+ return nullptr;
+}
+
+FortifiedLibCallSimplifier::FortifiedLibCallSimplifier(
+ const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize)
+ : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {}
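For readers coming to this file cold, a minimal driver for the class defined above looks roughly as follows; the surrounding instruction-visiting loop and the replace-and-erase step belong to the caller (this mirrors how instruction-combining passes consume it). It is a sketch assuming a TargetLibraryInfo for the current function is already at hand, not code from this patch.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"

using namespace llvm;

// Sketch: try to fold one fortified call site; returns true if it was replaced.
static bool foldFortifiedCall(CallInst *CI, const TargetLibraryInfo *TLI) {
  FortifiedLibCallSimplifier Simplifier(TLI, /*OnlyLowerUnknownSize=*/false);
  IRBuilder<> B(CI); // new instructions are inserted right before CI
  if (Value *With = Simplifier.optimizeCall(CI, B)) {
    CI->replaceAllUsesWith(With); // e.g. __memcpy_chk's uses now see dst
    CI->eraseFromParent();
    return true;
  }
  return false;
}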
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp
index 02abd43851..beeb60698f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp
@@ -1,111 +1,111 @@
-//===-- SizeOpts.cpp - code size optimization related code ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains some shared code size optimization related code.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SizeOpts.h"
-
-using namespace llvm;
-
-cl::opt<bool> EnablePGSO(
- "pgso", cl::Hidden, cl::init(true),
- cl::desc("Enable the profile guided size optimizations. "));
-
-cl::opt<bool> PGSOLargeWorkingSetSizeOnly(
- "pgso-lwss-only", cl::Hidden, cl::init(true),
- cl::desc("Apply the profile guided size optimizations only "
- "if the working set size is large (except for cold code.)"));
-
-cl::opt<bool> PGSOColdCodeOnly(
- "pgso-cold-code-only", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code."));
-
-cl::opt<bool> PGSOColdCodeOnlyForInstrPGO(
- "pgso-cold-code-only-for-instr-pgo", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code under instrumentation PGO."));
-
-cl::opt<bool> PGSOColdCodeOnlyForSamplePGO(
- "pgso-cold-code-only-for-sample-pgo", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code under sample PGO."));
-
-cl::opt<bool> PGSOColdCodeOnlyForPartialSamplePGO(
- "pgso-cold-code-only-for-partial-sample-pgo", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code under partial-profile sample PGO."));
-
-cl::opt<bool> ForcePGSO(
- "force-pgso", cl::Hidden, cl::init(false),
- cl::desc("Force the (profiled-guided) size optimizations. "));
-
-cl::opt<int> PgsoCutoffInstrProf(
- "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::ZeroOrMore,
- cl::desc("The profile guided size optimization profile summary cutoff "
- "for instrumentation profile."));
-
-cl::opt<int> PgsoCutoffSampleProf(
- "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::ZeroOrMore,
- cl::desc("The profile guided size optimization profile summary cutoff "
- "for sample profile."));
-
-namespace {
-struct BasicBlockBFIAdapter {
- static bool isFunctionColdInCallGraph(const Function *F,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo &BFI) {
- return PSI->isFunctionColdInCallGraph(F, BFI);
- }
- static bool isFunctionHotInCallGraphNthPercentile(int CutOff,
- const Function *F,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo &BFI) {
- return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI);
- }
- static bool isFunctionColdInCallGraphNthPercentile(int CutOff,
- const Function *F,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo &BFI) {
- return PSI->isFunctionColdInCallGraphNthPercentile(CutOff, F, BFI);
- }
- static bool isColdBlock(const BasicBlock *BB,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI) {
- return PSI->isColdBlock(BB, BFI);
- }
- static bool isHotBlockNthPercentile(int CutOff,
- const BasicBlock *BB,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI) {
- return PSI->isHotBlockNthPercentile(CutOff, BB, BFI);
- }
- static bool isColdBlockNthPercentile(int CutOff, const BasicBlock *BB,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI) {
- return PSI->isColdBlockNthPercentile(CutOff, BB, BFI);
- }
-};
-} // end anonymous namespace
-
-bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI,
- PGSOQueryType QueryType) {
- return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI,
- QueryType);
-}
-
-bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI,
- PGSOQueryType QueryType) {
- assert(BB);
- return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI,
- QueryType);
-}
+//===-- SizeOpts.cpp - code size optimization related code ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some shared code size optimization related code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SizeOpts.h"
+
+using namespace llvm;
+
+cl::opt<bool> EnablePGSO(
+ "pgso", cl::Hidden, cl::init(true),
+ cl::desc("Enable the profile guided size optimizations. "));
+
+cl::opt<bool> PGSOLargeWorkingSetSizeOnly(
+ "pgso-lwss-only", cl::Hidden, cl::init(true),
+ cl::desc("Apply the profile guided size optimizations only "
+ "if the working set size is large (except for cold code.)"));
+
+cl::opt<bool> PGSOColdCodeOnly(
+ "pgso-cold-code-only", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code."));
+
+cl::opt<bool> PGSOColdCodeOnlyForInstrPGO(
+ "pgso-cold-code-only-for-instr-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under instrumentation PGO."));
+
+cl::opt<bool> PGSOColdCodeOnlyForSamplePGO(
+ "pgso-cold-code-only-for-sample-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under sample PGO."));
+
+cl::opt<bool> PGSOColdCodeOnlyForPartialSamplePGO(
+ "pgso-cold-code-only-for-partial-sample-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under partial-profile sample PGO."));
+
+cl::opt<bool> ForcePGSO(
+ "force-pgso", cl::Hidden, cl::init(false),
+ cl::desc("Force the (profiled-guided) size optimizations. "));
+
+cl::opt<int> PgsoCutoffInstrProf(
+ "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::ZeroOrMore,
+ cl::desc("The profile guided size optimization profile summary cutoff "
+ "for instrumentation profile."));
+
+cl::opt<int> PgsoCutoffSampleProf(
+ "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::ZeroOrMore,
+ cl::desc("The profile guided size optimization profile summary cutoff "
+ "for sample profile."));
+
+namespace {
+struct BasicBlockBFIAdapter {
+ static bool isFunctionColdInCallGraph(const Function *F,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo &BFI) {
+ return PSI->isFunctionColdInCallGraph(F, BFI);
+ }
+ static bool isFunctionHotInCallGraphNthPercentile(int CutOff,
+ const Function *F,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo &BFI) {
+ return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI);
+ }
+ static bool isFunctionColdInCallGraphNthPercentile(int CutOff,
+ const Function *F,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo &BFI) {
+ return PSI->isFunctionColdInCallGraphNthPercentile(CutOff, F, BFI);
+ }
+ static bool isColdBlock(const BasicBlock *BB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ return PSI->isColdBlock(BB, BFI);
+ }
+ static bool isHotBlockNthPercentile(int CutOff,
+ const BasicBlock *BB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ return PSI->isHotBlockNthPercentile(CutOff, BB, BFI);
+ }
+ static bool isColdBlockNthPercentile(int CutOff, const BasicBlock *BB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ return PSI->isColdBlockNthPercentile(CutOff, BB, BFI);
+ }
+};
+} // end anonymous namespace
+
+bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ PGSOQueryType QueryType) {
+ return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI,
+ QueryType);
+}
+
+bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ PGSOQueryType QueryType) {
+ assert(BB);
+ return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI,
+ QueryType);
+}
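The two wrappers above are what transforms call to let profile data veto size-increasing changes. A minimal consumer, assuming ProfileSummaryInfo and a (possibly lazily computed) BlockFrequencyInfo are already available in the pass, might look like the sketch below; the helper name is invented for illustration.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

using namespace llvm;

// Sketch: decide whether a size-increasing transform may run on F.
static bool mayGrowCode(const Function &F, ProfileSummaryInfo *PSI,
                        BlockFrequencyInfo *BFI) {
  // Typical call-site pattern: honour the optsize/minsize attributes and,
  // when a profile summary is available, the PGSO heuristics controlled by
  // the pgso-* flags defined in this file.
  bool OptForSize = F.hasOptSize() ||
                    shouldOptimizeForSize(&F, PSI, BFI, PGSOQueryType::IRPass);
  return !OptForSize;
}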
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp
index eb27914fc7..e2c387cb89 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp
@@ -1,284 +1,284 @@
-//===- SplitModule.cpp - Split a module into partitions -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the function llvm::SplitModule, which splits a module
-// into multiple linkable partitions. It can be used to implement parallel code
-// generation for link-time optimization.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SplitModule.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MD5.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <memory>
-#include <queue>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "split-module"
-
-namespace {
-
-using ClusterMapType = EquivalenceClasses<const GlobalValue *>;
-using ComdatMembersType = DenseMap<const Comdat *, const GlobalValue *>;
-using ClusterIDMapType = DenseMap<const GlobalValue *, unsigned>;
-
-} // end anonymous namespace
-
-static void addNonConstUser(ClusterMapType &GVtoClusterMap,
- const GlobalValue *GV, const User *U) {
- assert((!isa<Constant>(U) || isa<GlobalValue>(U)) && "Bad user");
-
- if (const Instruction *I = dyn_cast<Instruction>(U)) {
- const GlobalValue *F = I->getParent()->getParent();
- GVtoClusterMap.unionSets(GV, F);
- } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) ||
- isa<GlobalVariable>(U)) {
- GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U));
- } else {
- llvm_unreachable("Underimplemented use case");
- }
-}
-
-// Adds all GlobalValue users of V to the same cluster as GV.
-static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap,
- const GlobalValue *GV, const Value *V) {
- for (auto *U : V->users()) {
- SmallVector<const User *, 4> Worklist;
- Worklist.push_back(U);
- while (!Worklist.empty()) {
- const User *UU = Worklist.pop_back_val();
- // For each constant that is not a GV (a pure const) recurse.
- if (isa<Constant>(UU) && !isa<GlobalValue>(UU)) {
- Worklist.append(UU->user_begin(), UU->user_end());
- continue;
- }
- addNonConstUser(GVtoClusterMap, GV, UU);
- }
- }
-}
-
-// Find partitions for the module such that no locals need to be
-// globalized.
-// Try to pack those partitions into N files in a balanced way, since this
-// roughly equals thread balancing for the backend codegen step.
-static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
- unsigned N) {
- // At this point module should have the proper mix of globals and locals.
- // As we attempt to partition this module, we must not change any
- // locals to globals.
- LLVM_DEBUG(dbgs() << "Partition module with (" << M->size()
- << ")functions\n");
- ClusterMapType GVtoClusterMap;
- ComdatMembersType ComdatMembers;
-
- auto recordGVSet = [&GVtoClusterMap, &ComdatMembers](GlobalValue &GV) {
- if (GV.isDeclaration())
- return;
-
- if (!GV.hasName())
- GV.setName("__llvmsplit_unnamed");
-
- // Comdat groups must not be partitioned. For comdat groups that contain
- // locals, record all their members here so we can keep them together.
- // Comdat groups that only contain external globals are already handled by
- // the MD5-based partitioning.
- if (const Comdat *C = GV.getComdat()) {
- auto &Member = ComdatMembers[C];
- if (Member)
- GVtoClusterMap.unionSets(Member, &GV);
- else
- Member = &GV;
- }
-
- // For aliases we should not separate them from their aliasees regardless
- // of linkage.
- if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
- if (const GlobalObject *Base = GIS->getBaseObject())
- GVtoClusterMap.unionSets(&GV, Base);
- }
-
- if (const Function *F = dyn_cast<Function>(&GV)) {
- for (const BasicBlock &BB : *F) {
- BlockAddress *BA = BlockAddress::lookup(&BB);
- if (!BA || !BA->isConstantUsed())
- continue;
- addAllGlobalValueUsers(GVtoClusterMap, F, BA);
- }
- }
-
- if (GV.hasLocalLinkage())
- addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
- };
-
- llvm::for_each(M->functions(), recordGVSet);
- llvm::for_each(M->globals(), recordGVSet);
- llvm::for_each(M->aliases(), recordGVSet);
-
-  // Assign all GVs to merged clusters while balancing the number of objects
-  // in each.
- auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
- const std::pair<unsigned, unsigned> &b) {
- if (a.second || b.second)
- return a.second > b.second;
- else
- return a.first > b.first;
- };
-
- std::priority_queue<std::pair<unsigned, unsigned>,
- std::vector<std::pair<unsigned, unsigned>>,
- decltype(CompareClusters)>
- BalancinQueue(CompareClusters);
- // Pre-populate priority queue with N slot blanks.
- for (unsigned i = 0; i < N; ++i)
- BalancinQueue.push(std::make_pair(i, 0));
-
- using SortType = std::pair<unsigned, ClusterMapType::iterator>;
-
- SmallVector<SortType, 64> Sets;
- SmallPtrSet<const GlobalValue *, 32> Visited;
-
- // To guarantee determinism, we have to sort SCC according to size.
- // When size is the same, use leader's name.
- for (ClusterMapType::iterator I = GVtoClusterMap.begin(),
- E = GVtoClusterMap.end(); I != E; ++I)
- if (I->isLeader())
- Sets.push_back(
- std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
- GVtoClusterMap.member_end()), I));
-
- llvm::sort(Sets, [](const SortType &a, const SortType &b) {
- if (a.first == b.first)
- return a.second->getData()->getName() > b.second->getData()->getName();
- else
- return a.first > b.first;
- });
-
- for (auto &I : Sets) {
- unsigned CurrentClusterID = BalancinQueue.top().first;
- unsigned CurrentClusterSize = BalancinQueue.top().second;
- BalancinQueue.pop();
-
- LLVM_DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size("
- << I.first << ") ----> " << I.second->getData()->getName()
- << "\n");
-
- for (ClusterMapType::member_iterator MI =
- GVtoClusterMap.findLeader(I.second);
- MI != GVtoClusterMap.member_end(); ++MI) {
- if (!Visited.insert(*MI).second)
- continue;
- LLVM_DEBUG(dbgs() << "----> " << (*MI)->getName()
- << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
- Visited.insert(*MI);
- ClusterIDMap[*MI] = CurrentClusterID;
- CurrentClusterSize++;
- }
- // Add this set size to the number of entries in this cluster.
- BalancinQueue.push(std::make_pair(CurrentClusterID, CurrentClusterSize));
- }
-}
-
-static void externalize(GlobalValue *GV) {
- if (GV->hasLocalLinkage()) {
- GV->setLinkage(GlobalValue::ExternalLinkage);
- GV->setVisibility(GlobalValue::HiddenVisibility);
- }
-
- // Unnamed entities must be named consistently between modules. setName will
- // give a distinct name to each such entity.
- if (!GV->hasName())
- GV->setName("__llvmsplit_unnamed");
-}
-
-// Returns whether GV should be in partition (0-based) I of N.
-static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
- if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV))
- if (const GlobalObject *Base = GIS->getBaseObject())
- GV = Base;
-
- StringRef Name;
- if (const Comdat *C = GV->getComdat())
- Name = C->getName();
- else
- Name = GV->getName();
-
- // Partition by MD5 hash. We only need a few bits for evenness as the number
- // of partitions will generally be in the 1-2 figure range; the low 16 bits
- // are enough.
- MD5 H;
- MD5::MD5Result R;
- H.update(Name);
- H.final(R);
- return (R[0] | (R[1] << 8)) % N == I;
-}
-
-void llvm::SplitModule(
- std::unique_ptr<Module> M, unsigned N,
- function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback,
- bool PreserveLocals) {
- if (!PreserveLocals) {
- for (Function &F : *M)
- externalize(&F);
- for (GlobalVariable &GV : M->globals())
- externalize(&GV);
- for (GlobalAlias &GA : M->aliases())
- externalize(&GA);
- for (GlobalIFunc &GIF : M->ifuncs())
- externalize(&GIF);
- }
-
- // This performs splitting without a need for externalization, which might not
- // always be possible.
- ClusterIDMapType ClusterIDMap;
- findPartitions(M.get(), ClusterIDMap, N);
-
- // FIXME: We should be able to reuse M as the last partition instead of
- // cloning it.
- for (unsigned I = 0; I < N; ++I) {
- ValueToValueMapTy VMap;
- std::unique_ptr<Module> MPart(
- CloneModule(*M, VMap, [&](const GlobalValue *GV) {
- if (ClusterIDMap.count(GV))
- return (ClusterIDMap[GV] == I);
- else
- return isInPartition(GV, I, N);
- }));
- if (I != 0)
- MPart->setModuleInlineAsm("");
- ModuleCallback(std::move(MPart));
- }
-}
+//===- SplitModule.cpp - Split a module into partitions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the function llvm::SplitModule, which splits a module
+// into multiple linkable partitions. It can be used to implement parallel code
+// generation for link-time optimization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SplitModule.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <queue>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "split-module"
+
+namespace {
+
+using ClusterMapType = EquivalenceClasses<const GlobalValue *>;
+using ComdatMembersType = DenseMap<const Comdat *, const GlobalValue *>;
+using ClusterIDMapType = DenseMap<const GlobalValue *, unsigned>;
+
+} // end anonymous namespace
+
+static void addNonConstUser(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const User *U) {
+ assert((!isa<Constant>(U) || isa<GlobalValue>(U)) && "Bad user");
+
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ const GlobalValue *F = I->getParent()->getParent();
+ GVtoClusterMap.unionSets(GV, F);
+ } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) ||
+ isa<GlobalVariable>(U)) {
+ GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U));
+ } else {
+ llvm_unreachable("Underimplemented use case");
+ }
+}
+
+// Adds all GlobalValue users of V to the same cluster as GV.
+static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const Value *V) {
+ for (auto *U : V->users()) {
+ SmallVector<const User *, 4> Worklist;
+ Worklist.push_back(U);
+ while (!Worklist.empty()) {
+ const User *UU = Worklist.pop_back_val();
+ // For each constant that is not a GV (a pure const) recurse.
+ if (isa<Constant>(UU) && !isa<GlobalValue>(UU)) {
+ Worklist.append(UU->user_begin(), UU->user_end());
+ continue;
+ }
+ addNonConstUser(GVtoClusterMap, GV, UU);
+ }
+ }
+}
+
+// Find partitions for the module such that no locals need to be
+// globalized.
+// Try to pack those partitions into N files in a balanced way, since this
+// roughly equals thread balancing for the backend codegen step.
+static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
+ unsigned N) {
+ // At this point module should have the proper mix of globals and locals.
+ // As we attempt to partition this module, we must not change any
+ // locals to globals.
+ LLVM_DEBUG(dbgs() << "Partition module with (" << M->size()
+ << ")functions\n");
+ ClusterMapType GVtoClusterMap;
+ ComdatMembersType ComdatMembers;
+
+ auto recordGVSet = [&GVtoClusterMap, &ComdatMembers](GlobalValue &GV) {
+ if (GV.isDeclaration())
+ return;
+
+ if (!GV.hasName())
+ GV.setName("__llvmsplit_unnamed");
+
+ // Comdat groups must not be partitioned. For comdat groups that contain
+ // locals, record all their members here so we can keep them together.
+ // Comdat groups that only contain external globals are already handled by
+ // the MD5-based partitioning.
+ if (const Comdat *C = GV.getComdat()) {
+ auto &Member = ComdatMembers[C];
+ if (Member)
+ GVtoClusterMap.unionSets(Member, &GV);
+ else
+ Member = &GV;
+ }
+
+ // For aliases we should not separate them from their aliasees regardless
+ // of linkage.
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
+ if (const GlobalObject *Base = GIS->getBaseObject())
+ GVtoClusterMap.unionSets(&GV, Base);
+ }
+
+ if (const Function *F = dyn_cast<Function>(&GV)) {
+ for (const BasicBlock &BB : *F) {
+ BlockAddress *BA = BlockAddress::lookup(&BB);
+ if (!BA || !BA->isConstantUsed())
+ continue;
+ addAllGlobalValueUsers(GVtoClusterMap, F, BA);
+ }
+ }
+
+ if (GV.hasLocalLinkage())
+ addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+ };
+
+ llvm::for_each(M->functions(), recordGVSet);
+ llvm::for_each(M->globals(), recordGVSet);
+ llvm::for_each(M->aliases(), recordGVSet);
+
+  // Assign all GVs to merged clusters while balancing the number of objects
+  // in each.
+ auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
+ const std::pair<unsigned, unsigned> &b) {
+ if (a.second || b.second)
+ return a.second > b.second;
+ else
+ return a.first > b.first;
+ };
+
+ std::priority_queue<std::pair<unsigned, unsigned>,
+ std::vector<std::pair<unsigned, unsigned>>,
+ decltype(CompareClusters)>
+ BalancinQueue(CompareClusters);
+ // Pre-populate priority queue with N slot blanks.
+ for (unsigned i = 0; i < N; ++i)
+ BalancinQueue.push(std::make_pair(i, 0));
+
+ using SortType = std::pair<unsigned, ClusterMapType::iterator>;
+
+ SmallVector<SortType, 64> Sets;
+ SmallPtrSet<const GlobalValue *, 32> Visited;
+
+ // To guarantee determinism, we have to sort SCC according to size.
+ // When size is the same, use leader's name.
+ for (ClusterMapType::iterator I = GVtoClusterMap.begin(),
+ E = GVtoClusterMap.end(); I != E; ++I)
+ if (I->isLeader())
+ Sets.push_back(
+ std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
+ GVtoClusterMap.member_end()), I));
+
+ llvm::sort(Sets, [](const SortType &a, const SortType &b) {
+ if (a.first == b.first)
+ return a.second->getData()->getName() > b.second->getData()->getName();
+ else
+ return a.first > b.first;
+ });
+
+ for (auto &I : Sets) {
+ unsigned CurrentClusterID = BalancinQueue.top().first;
+ unsigned CurrentClusterSize = BalancinQueue.top().second;
+ BalancinQueue.pop();
+
+ LLVM_DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size("
+ << I.first << ") ----> " << I.second->getData()->getName()
+ << "\n");
+
+ for (ClusterMapType::member_iterator MI =
+ GVtoClusterMap.findLeader(I.second);
+ MI != GVtoClusterMap.member_end(); ++MI) {
+ if (!Visited.insert(*MI).second)
+ continue;
+ LLVM_DEBUG(dbgs() << "----> " << (*MI)->getName()
+ << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
+ Visited.insert(*MI);
+ ClusterIDMap[*MI] = CurrentClusterID;
+ CurrentClusterSize++;
+ }
+ // Add this set size to the number of entries in this cluster.
+ BalancinQueue.push(std::make_pair(CurrentClusterID, CurrentClusterSize));
+ }
+}
+
+static void externalize(GlobalValue *GV) {
+ if (GV->hasLocalLinkage()) {
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+
+ // Unnamed entities must be named consistently between modules. setName will
+ // give a distinct name to each such entity.
+ if (!GV->hasName())
+ GV->setName("__llvmsplit_unnamed");
+}
+
+// Returns whether GV should be in partition (0-based) I of N.
+static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV))
+ if (const GlobalObject *Base = GIS->getBaseObject())
+ GV = Base;
+
+ StringRef Name;
+ if (const Comdat *C = GV->getComdat())
+ Name = C->getName();
+ else
+ Name = GV->getName();
+
+ // Partition by MD5 hash. We only need a few bits for evenness as the number
+ // of partitions will generally be in the 1-2 figure range; the low 16 bits
+ // are enough.
+ MD5 H;
+ MD5::MD5Result R;
+ H.update(Name);
+ H.final(R);
+ return (R[0] | (R[1] << 8)) % N == I;
+}
+
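The hash step in isInPartition above is what lets independently processed modules agree on a partition for the same (externalized) symbol without any coordination. Restated as a standalone helper (the function name is invented for illustration):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"

using namespace llvm;

// Sketch: the same name maps to the same 0-based bucket in every module,
// which is why externalize() can rename locals and the pieces still link.
static unsigned partitionIndexForName(StringRef Name, unsigned N) {
  MD5 H;
  MD5::MD5Result R;
  H.update(Name);
  H.final(R);
  return (R[0] | (R[1] << 8)) % N; // low 16 bits suffice for small N
}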
+void llvm::SplitModule(
+ std::unique_ptr<Module> M, unsigned N,
+ function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback,
+ bool PreserveLocals) {
+ if (!PreserveLocals) {
+ for (Function &F : *M)
+ externalize(&F);
+ for (GlobalVariable &GV : M->globals())
+ externalize(&GV);
+ for (GlobalAlias &GA : M->aliases())
+ externalize(&GA);
+ for (GlobalIFunc &GIF : M->ifuncs())
+ externalize(&GIF);
+ }
+
+ // This performs splitting without a need for externalization, which might not
+ // always be possible.
+ ClusterIDMapType ClusterIDMap;
+ findPartitions(M.get(), ClusterIDMap, N);
+
+ // FIXME: We should be able to reuse M as the last partition instead of
+ // cloning it.
+ for (unsigned I = 0; I < N; ++I) {
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MPart(
+ CloneModule(*M, VMap, [&](const GlobalValue *GV) {
+ if (ClusterIDMap.count(GV))
+ return (ClusterIDMap[GV] == I);
+ else
+ return isInPartition(GV, I, N);
+ }));
+ if (I != 0)
+ MPart->setModuleInlineAsm("");
+ ModuleCallback(std::move(MPart));
+ }
+}
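A minimal driver for the entry point above, writing each partition to its own bitcode file, can look like the sketch below. The output naming and error handling are invented for illustration; the callback shape and the PreserveLocals flag are as declared in this file.

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include <memory>
#include <string>
#include <system_error>

using namespace llvm;

// Sketch: split M into N linkable pieces named out.0.bc ... out.<N-1>.bc.
static void splitToBitcode(std::unique_ptr<Module> M, unsigned N) {
  unsigned Part = 0;
  SplitModule(
      std::move(M), N,
      [&](std::unique_ptr<Module> MPart) {
        std::error_code EC;
        raw_fd_ostream OS("out." + std::to_string(Part++) + ".bc", EC,
                          sys::fs::OF_None);
        if (EC)
          report_fatal_error("cannot open output file: " + EC.message());
        WriteBitcodeToFile(*MPart, OS);
      },
      /*PreserveLocals=*/false);
}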
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp
index 461edd8755..1fa574f04c 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -1,62 +1,62 @@
-//===- StripGCRelocates.cpp - Remove gc.relocates inserted by RewriteStatePoints===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a little utility pass that removes the gc.relocates inserted by
-// RewriteStatepointsForGC. Note that the generated IR is incorrect,
-// but this is useful as a single pass in itself, for analysis of IR, without
-// the GC.relocates. The statepoint and gc.result intrinsics would still be
-// present.
-//===----------------------------------------------------------------------===//
-
+//===- StripGCRelocates.cpp - Remove gc.relocates inserted by RewriteStatePoints===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that removes the gc.relocates inserted by
+// RewriteStatepointsForGC. Note that the generated IR is incorrect,
+// but this is useful as a single pass in itself, for analysis of IR, without
+// the GC.relocates. The statepoint and gc.result intrinsics would still be
+// present.
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/StripGCRelocates.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
static bool stripGCRelocates(Function &F) {
- // Nothing to do for declarations.
- if (F.isDeclaration())
- return false;
- SmallVector<GCRelocateInst *, 20> GCRelocates;
- // TODO: We currently do not handle gc.relocates that are in landing pads,
- // i.e. not bound to a single statepoint token.
- for (Instruction &I : instructions(F)) {
- if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
- if (isa<GCStatepointInst>(GCR->getOperand(0)))
- GCRelocates.push_back(GCR);
- }
- // All gc.relocates are bound to a single statepoint token. The order of
- // visiting gc.relocates for deletion does not matter.
- for (GCRelocateInst *GCRel : GCRelocates) {
- Value *OrigPtr = GCRel->getDerivedPtr();
- Value *ReplaceGCRel = OrigPtr;
-
-    // All gc_relocates are i8 addrspace(1)* typed; we need a bitcast from i8
-    // addrspace(1)* to the type of the OrigPtr if they are not the same.
- if (GCRel->getType() != OrigPtr->getType())
- ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
-
- // Replace all uses of gc.relocate and delete the gc.relocate
-    // There may be unnecessary bitcasts back to the OrigPtr type; an instcombine
- // pass would clear this up.
- GCRel->replaceAllUsesWith(ReplaceGCRel);
- GCRel->eraseFromParent();
- }
- return !GCRelocates.empty();
-}
-
+ // Nothing to do for declarations.
+ if (F.isDeclaration())
+ return false;
+ SmallVector<GCRelocateInst *, 20> GCRelocates;
+ // TODO: We currently do not handle gc.relocates that are in landing pads,
+ // i.e. not bound to a single statepoint token.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
+ if (isa<GCStatepointInst>(GCR->getOperand(0)))
+ GCRelocates.push_back(GCR);
+ }
+ // All gc.relocates are bound to a single statepoint token. The order of
+ // visiting gc.relocates for deletion does not matter.
+ for (GCRelocateInst *GCRel : GCRelocates) {
+ Value *OrigPtr = GCRel->getDerivedPtr();
+ Value *ReplaceGCRel = OrigPtr;
+
+ // All gc_relocates are i8 addrspace(1)* typed, we need a bitcast from i8
+    // All gc_relocates are i8 addrspace(1)* typed; we need a bitcast from i8
+    // addrspace(1)* to the type of the OrigPtr if they are not the same.
+ ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
+
+ // Replace all uses of gc.relocate and delete the gc.relocate
+    // There may be unnecessary bitcasts back to the OrigPtr type; an instcombine
+ // pass would clear this up.
+ GCRel->replaceAllUsesWith(ReplaceGCRel);
+ GCRel->eraseFromParent();
+ }
+ return !GCRelocates.empty();
+}
+
PreservedAnalyses StripGCRelocates::run(Function &F,
FunctionAnalysisManager &AM) {
if (!stripGCRelocates(F))
@@ -84,5 +84,5 @@ char StripGCRelocatesLegacy::ID = 0;
} // namespace
INITIALIZE_PASS(StripGCRelocatesLegacy, "strip-gc-relocates",
- "Strip gc.relocates inserted through RewriteStatepointsForGC",
- true, false)
+ "Strip gc.relocates inserted through RewriteStatepointsForGC",
+ true, false)
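To run the utility above outside of opt, the new pass manager wrapper can be scheduled directly; the sketch below is standard new-PM boilerplate with nothing specific to this pass beyond adding StripGCRelocates() to the pipeline. (With opt itself, the legacy registration above exposes it under the -strip-gc-relocates name.)

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/StripGCRelocates.h"

using namespace llvm;

// Sketch: strip gc.relocates from every function in M using the new PM.
static void stripAllGCRelocates(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(StripGCRelocates()));
  MPM.run(M, MAM);
}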
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
index 0a45a21e43..10fda4df51 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
@@ -1,48 +1,48 @@
-//===- StripNonLineTableDebugInfo.cpp -- Strip parts of Debug Info --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
+//===- StripNonLineTableDebugInfo.cpp -- Strip parts of Debug Info --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/StripNonLineTableDebugInfo.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
-namespace {
-
-/// This pass strips all debug info that is not related to line tables.
-/// The result will be the same as if the program were compiled with
-/// -gline-tables-only.
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
+namespace {
+
+/// This pass strips all debug info that is not related to line tables.
+/// The result will be the same as if the program were compiled with
+/// -gline-tables-only.
struct StripNonLineTableDebugLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
+ static char ID; // Pass identification, replacement for typeid
StripNonLineTableDebugLegacyPass() : ModulePass(ID) {
initializeStripNonLineTableDebugLegacyPassPass(
*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- bool runOnModule(Module &M) override {
- return llvm::stripNonLineTableDebugInfo(M);
- }
-};
-}
-
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool runOnModule(Module &M) override {
+ return llvm::stripNonLineTableDebugInfo(M);
+ }
+};
+}
+
char StripNonLineTableDebugLegacyPass::ID = 0;
INITIALIZE_PASS(StripNonLineTableDebugLegacyPass,
"strip-nonlinetable-debuginfo",
- "Strip all debug info except linetables", false, false)
-
+ "Strip all debug info except linetables", false, false)
+
ModulePass *llvm::createStripNonLineTableDebugLegacyPass() {
return new StripNonLineTableDebugLegacyPass();
-}
+}
PreservedAnalyses
StripNonLineTableDebugInfoPass::run(Module &M, ModuleAnalysisManager &AM) {
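Since the pass wrappers in this file all forward to a single utility, the -gline-tables-only effect can also be obtained programmatically with one call; a minimal sketch follows (helper name invented for illustration).

#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch: reduce M's debug info to what a line-tables-only build would keep.
// Returns true if anything was actually stripped.
static bool reduceToLineTables(Module &M) {
  return stripNonLineTableDebugInfo(M);
}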
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp
index 17299dfaf5..ec4ea848a5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -1,587 +1,587 @@
-//===- SymbolRewriter.cpp - Symbol Rewriter -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// SymbolRewriter is a LLVM pass which can rewrite symbols transparently within
-// existing code. It is implemented as a compiler pass and is configured via a
-// YAML configuration file.
-//
-// The YAML configuration file format is as follows:
-//
-// RewriteMapFile := RewriteDescriptors
-// RewriteDescriptors := RewriteDescriptor | RewriteDescriptors
-// RewriteDescriptor := RewriteDescriptorType ':' '{' RewriteDescriptorFields '}'
-// RewriteDescriptorFields := RewriteDescriptorField | RewriteDescriptorFields
-// RewriteDescriptorField := FieldIdentifier ':' FieldValue ','
-// RewriteDescriptorType := Identifier
-// FieldIdentifier := Identifier
-// FieldValue := Identifier
-// Identifier := [0-9a-zA-Z]+
-//
-// Currently, the following descriptor types are supported:
-//
-// - function: (function rewriting)
-// + Source (original name of the function)
-// + Target (explicit transformation)
-// + Transform (pattern transformation)
-// + Naked (boolean, whether the function is undecorated)
-// - global variable: (external linkage global variable rewriting)
-// + Source (original name of externally visible variable)
-// + Target (explicit transformation)
-// + Transform (pattern transformation)
-// - global alias: (global alias rewriting)
-// + Source (original name of the aliased name)
-// + Target (explicit transformation)
-// + Transform (pattern transformation)
-//
-// Note that Source and exactly one of [Target, Transform] must be provided.
-//
-// New rewrite descriptors can be created. Adding a new rewrite descriptor
-// involves:
-//
-// a) extending the rewrite descriptor kind enumeration
-// (<anonymous>::RewriteDescriptor::RewriteDescriptorType)
-// b) implementing the new descriptor
-// (c.f. <anonymous>::ExplicitRewriteFunctionDescriptor)
-// c) extending the rewrite map parser
-// (<anonymous>::RewriteMapParser::parseEntry)
-//
-// Specify to rewrite the symbols using the `-rewrite-symbols` option, and
-// specify the map file to use for the rewriting via the `-rewrite-map-file`
-// option.
-//
-//===----------------------------------------------------------------------===//
-
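The grammar above is easier to see with a concrete map. The sketch below shows one explicit function rename in that format, saved to a file and passed via the -rewrite-map-file option mentioned above; the symbol names are hypothetical, and the exact accepted key casing should be confirmed against RewriteMapParser::parseEntry rather than taken from this illustration.

// Illustrative rewrite map (hypothetical symbol names). Saved as, say,
// map.yaml and used as: opt -rewrite-symbols -rewrite-map-file=map.yaml
//
//   function: {
//     source: coffee,
//     target: tea,
//   }
//
// "global variable:" and "global alias:" entries follow the same shape, and a
// "transform:" field replaces "target:" for pattern-based renames.
static const char ExampleRewriteMap[] =
    "function: {\n"
    "  source: coffee,\n"
    "  target: tea,\n"
    "}\n";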
-#include "llvm/Transforms/Utils/SymbolRewriter.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/ilist.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/YAMLParser.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-using namespace SymbolRewriter;
-
-#define DEBUG_TYPE "symbol-rewriter"
-
-static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
- cl::desc("Symbol Rewrite Map"),
- cl::value_desc("filename"),
- cl::Hidden);
-
-static void rewriteComdat(Module &M, GlobalObject *GO,
- const std::string &Source,
- const std::string &Target) {
- if (Comdat *CD = GO->getComdat()) {
- auto &Comdats = M.getComdatSymbolTable();
-
- Comdat *C = M.getOrInsertComdat(Target);
- C->setSelectionKind(CD->getSelectionKind());
- GO->setComdat(C);
-
- Comdats.erase(Comdats.find(Source));
- }
-}
-
-namespace {
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const>
-class ExplicitRewriteDescriptor : public RewriteDescriptor {
-public:
- const std::string Source;
- const std::string Target;
-
- ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked)
- : RewriteDescriptor(DT),
- Source(std::string(Naked ? StringRef("\01" + S.str()) : S)),
- Target(std::string(T)) {}
-
- bool performOnModule(Module &M) override;
-
- static bool classof(const RewriteDescriptor *RD) {
- return RD->getType() == DT;
- }
-};
-
-} // end anonymous namespace
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const>
-bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
- bool Changed = false;
- if (ValueType *S = (M.*Get)(Source)) {
- if (GlobalObject *GO = dyn_cast<GlobalObject>(S))
- rewriteComdat(M, GO, Source, Target);
-
- if (Value *T = (M.*Get)(Target))
- S->setValueName(T->getValueName());
- else
- S->setName(Target);
-
- Changed = true;
- }
- return Changed;
-}
-
-namespace {
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const,
- iterator_range<typename iplist<ValueType>::iterator>
- (Module::*Iterator)()>
-class PatternRewriteDescriptor : public RewriteDescriptor {
-public:
- const std::string Pattern;
- const std::string Transform;
-
- PatternRewriteDescriptor(StringRef P, StringRef T)
- : RewriteDescriptor(DT), Pattern(std::string(P)),
- Transform(std::string(T)) {}
-
- bool performOnModule(Module &M) override;
-
- static bool classof(const RewriteDescriptor *RD) {
- return RD->getType() == DT;
- }
-};
-
-} // end anonymous namespace
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const,
- iterator_range<typename iplist<ValueType>::iterator>
- (Module::*Iterator)()>
-bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>::
-performOnModule(Module &M) {
- bool Changed = false;
- for (auto &C : (M.*Iterator)()) {
- std::string Error;
-
- std::string Name = Regex(Pattern).sub(Transform, C.getName(), &Error);
- if (!Error.empty())
- report_fatal_error("unable to transform " + C.getName() + " in " +
- M.getModuleIdentifier() + ": " + Error);
-
- if (C.getName() == Name)
- continue;
-
- if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
- rewriteComdat(M, GO, std::string(C.getName()), Name);
-
- if (Value *V = (M.*Get)(Name))
- C.setValueName(V->getValueName());
- else
- C.setName(Name);
-
- Changed = true;
- }
- return Changed;
-}
-
-namespace {
-
-/// Represents a rewrite for an explicitly named (function) symbol. Both the
-/// source function name and target function name of the transformation are
-/// explicitly spelt out.
-using ExplicitRewriteFunctionDescriptor =
- ExplicitRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
- &Module::getFunction>;
-
-/// Represents a rewrite for an explicitly named (global variable) symbol. Both
-/// the source variable name and target variable name are spelt out. This
-/// applies only to module level variables.
-using ExplicitRewriteGlobalVariableDescriptor =
- ExplicitRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
- GlobalVariable, &Module::getGlobalVariable>;
-
-/// Represents a rewrite for an explicitly named global alias. Both the source
-/// and target name are explicitly spelt out.
-using ExplicitRewriteNamedAliasDescriptor =
- ExplicitRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
- &Module::getNamedAlias>;
-
-/// Represents a rewrite for functions whose names match a regular expression
-/// pattern. A pattern for the function name and a transformation that
-/// determines the target function name together form the rewrite rule.
-using PatternRewriteFunctionDescriptor =
- PatternRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
- &Module::getFunction, &Module::functions>;
-
-/// Represents a rewrite for a global variable based upon a matching pattern.
-/// Each global variable matching the provided pattern will be transformed as
-/// described in the transformation pattern for the target. Applies only to
-/// module level variables.
-using PatternRewriteGlobalVariableDescriptor =
- PatternRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
- GlobalVariable, &Module::getGlobalVariable,
- &Module::globals>;
-
-/// PatternRewriteNamedAliasDescriptor - represents a rewrite for global
-/// aliases which match a given pattern. The provided transformation will be
-/// applied to each of the matching names.
-using PatternRewriteNamedAliasDescriptor =
- PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
- &Module::getNamedAlias, &Module::aliases>;
-
-} // end anonymous namespace
-
-bool RewriteMapParser::parse(const std::string &MapFile,
- RewriteDescriptorList *DL) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> Mapping =
- MemoryBuffer::getFile(MapFile);
-
- if (!Mapping)
- report_fatal_error("unable to read rewrite map '" + MapFile + "': " +
- Mapping.getError().message());
-
- if (!parse(*Mapping, DL))
- report_fatal_error("unable to parse rewrite map '" + MapFile + "'");
-
- return true;
-}
-
-bool RewriteMapParser::parse(std::unique_ptr<MemoryBuffer> &MapFile,
- RewriteDescriptorList *DL) {
- SourceMgr SM;
- yaml::Stream YS(MapFile->getBuffer(), SM);
-
- for (auto &Document : YS) {
- yaml::MappingNode *DescriptorList;
-
- // ignore empty documents
- if (isa<yaml::NullNode>(Document.getRoot()))
- continue;
-
- DescriptorList = dyn_cast<yaml::MappingNode>(Document.getRoot());
- if (!DescriptorList) {
- YS.printError(Document.getRoot(), "DescriptorList node must be a map");
- return false;
- }
-
- for (auto &Descriptor : *DescriptorList)
- if (!parseEntry(YS, Descriptor, DL))
- return false;
- }
-
- return true;
-}
-
-bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
- RewriteDescriptorList *DL) {
- yaml::ScalarNode *Key;
- yaml::MappingNode *Value;
- SmallString<32> KeyStorage;
- StringRef RewriteType;
-
- Key = dyn_cast<yaml::ScalarNode>(Entry.getKey());
- if (!Key) {
- YS.printError(Entry.getKey(), "rewrite type must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::MappingNode>(Entry.getValue());
- if (!Value) {
- YS.printError(Entry.getValue(), "rewrite descriptor must be a map");
- return false;
- }
-
- RewriteType = Key->getValue(KeyStorage);
- if (RewriteType.equals("function"))
- return parseRewriteFunctionDescriptor(YS, Key, Value, DL);
- else if (RewriteType.equals("global variable"))
- return parseRewriteGlobalVariableDescriptor(YS, Key, Value, DL);
- else if (RewriteType.equals("global alias"))
- return parseRewriteGlobalAliasDescriptor(YS, Key, Value, DL);
-
- YS.printError(Entry.getKey(), "unknown rewrite type");
- return false;
-}
-
-bool RewriteMapParser::
-parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
- yaml::MappingNode *Descriptor,
- RewriteDescriptorList *DL) {
- bool Naked = false;
- std::string Source;
- std::string Target;
- std::string Transform;
-
- for (auto &Field : *Descriptor) {
- yaml::ScalarNode *Key;
- yaml::ScalarNode *Value;
- SmallString<32> KeyStorage;
- SmallString<32> ValueStorage;
- StringRef KeyValue;
-
- Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
- if (!Key) {
- YS.printError(Field.getKey(), "descriptor key must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
- if (!Value) {
- YS.printError(Field.getValue(), "descriptor value must be a scalar");
- return false;
- }
-
- KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
- std::string Error;
-
- Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
- } else if (KeyValue.equals("target")) {
- Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
- Transform = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("naked")) {
- std::string Undecorated;
-
- Undecorated = std::string(Value->getValue(ValueStorage));
- Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1";
- } else {
- YS.printError(Field.getKey(), "unknown key for function");
- return false;
- }
- }
-
- if (Transform.empty() == Target.empty()) {
- YS.printError(Descriptor,
- "exactly one of transform or target must be specified");
- return false;
- }
-
- // TODO see if there is a more elegant solution to selecting the rewrite
- // descriptor type
- if (!Target.empty())
- DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>(
- Source, Target, Naked));
- else
- DL->push_back(
- std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
-
- return true;
-}
-
-bool RewriteMapParser::
-parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
- yaml::MappingNode *Descriptor,
- RewriteDescriptorList *DL) {
- std::string Source;
- std::string Target;
- std::string Transform;
-
- for (auto &Field : *Descriptor) {
- yaml::ScalarNode *Key;
- yaml::ScalarNode *Value;
- SmallString<32> KeyStorage;
- SmallString<32> ValueStorage;
- StringRef KeyValue;
-
- Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
- if (!Key) {
- YS.printError(Field.getKey(), "descriptor Key must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
- if (!Value) {
- YS.printError(Field.getValue(), "descriptor value must be a scalar");
- return false;
- }
-
- KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
- std::string Error;
-
- Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
- } else if (KeyValue.equals("target")) {
- Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
- Transform = std::string(Value->getValue(ValueStorage));
- } else {
- YS.printError(Field.getKey(), "unknown Key for Global Variable");
- return false;
- }
- }
-
- if (Transform.empty() == Target.empty()) {
- YS.printError(Descriptor,
- "exactly one of transform or target must be specified");
- return false;
- }
-
- if (!Target.empty())
- DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
- Source, Target,
- /*Naked*/ false));
- else
- DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
- Source, Transform));
-
- return true;
-}
-
-bool RewriteMapParser::
-parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
- yaml::MappingNode *Descriptor,
- RewriteDescriptorList *DL) {
- std::string Source;
- std::string Target;
- std::string Transform;
-
- for (auto &Field : *Descriptor) {
- yaml::ScalarNode *Key;
- yaml::ScalarNode *Value;
- SmallString<32> KeyStorage;
- SmallString<32> ValueStorage;
- StringRef KeyValue;
-
- Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
- if (!Key) {
- YS.printError(Field.getKey(), "descriptor key must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
- if (!Value) {
- YS.printError(Field.getValue(), "descriptor value must be a scalar");
- return false;
- }
-
- KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
- std::string Error;
-
- Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
- } else if (KeyValue.equals("target")) {
- Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
- Transform = std::string(Value->getValue(ValueStorage));
- } else {
- YS.printError(Field.getKey(), "unknown key for Global Alias");
- return false;
- }
- }
-
- if (Transform.empty() == Target.empty()) {
- YS.printError(Descriptor,
- "exactly one of transform or target must be specified");
- return false;
- }
-
- if (!Target.empty())
- DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>(
- Source, Target,
- /*Naked*/ false));
- else
- DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>(
- Source, Transform));
-
- return true;
-}
-
-namespace {
-
-class RewriteSymbolsLegacyPass : public ModulePass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- RewriteSymbolsLegacyPass();
- RewriteSymbolsLegacyPass(SymbolRewriter::RewriteDescriptorList &DL);
-
- bool runOnModule(Module &M) override;
-
-private:
- RewriteSymbolPass Impl;
-};
-
-} // end anonymous namespace
-
-char RewriteSymbolsLegacyPass::ID = 0;
-
-RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass() : ModulePass(ID) {
- initializeRewriteSymbolsLegacyPassPass(*PassRegistry::getPassRegistry());
-}
-
-RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass(
- SymbolRewriter::RewriteDescriptorList &DL)
- : ModulePass(ID), Impl(DL) {}
-
-bool RewriteSymbolsLegacyPass::runOnModule(Module &M) {
- return Impl.runImpl(M);
-}
-
-PreservedAnalyses RewriteSymbolPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!runImpl(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-bool RewriteSymbolPass::runImpl(Module &M) {
- bool Changed;
-
- Changed = false;
- for (auto &Descriptor : Descriptors)
- Changed |= Descriptor->performOnModule(M);
-
- return Changed;
-}
-
-void RewriteSymbolPass::loadAndParseMapFiles() {
- const std::vector<std::string> MapFiles(RewriteMapFiles);
- SymbolRewriter::RewriteMapParser Parser;
-
- for (const auto &MapFile : MapFiles)
- Parser.parse(MapFile, &Descriptors);
-}
-
-INITIALIZE_PASS(RewriteSymbolsLegacyPass, "rewrite-symbols", "Rewrite Symbols",
- false, false)
-
-ModulePass *llvm::createRewriteSymbolsPass() {
- return new RewriteSymbolsLegacyPass();
-}
-
-ModulePass *
-llvm::createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &DL) {
- return new RewriteSymbolsLegacyPass(DL);
-}
+//===- SymbolRewriter.cpp - Symbol Rewriter -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// SymbolRewriter is an LLVM pass which can rewrite symbols transparently within
+// existing code. It is implemented as a compiler pass and is configured via a
+// YAML configuration file.
+//
+// The YAML configuration file format is as follows:
+//
+// RewriteMapFile := RewriteDescriptors
+// RewriteDescriptors := RewriteDescriptor | RewriteDescriptors
+// RewriteDescriptor := RewriteDescriptorType ':' '{' RewriteDescriptorFields '}'
+// RewriteDescriptorFields := RewriteDescriptorField | RewriteDescriptorFields
+// RewriteDescriptorField := FieldIdentifier ':' FieldValue ','
+// RewriteDescriptorType := Identifier
+// FieldIdentifier := Identifier
+// FieldValue := Identifier
+// Identifier := [0-9a-zA-Z]+
+//
+// Currently, the following descriptor types are supported:
+//
+// - function: (function rewriting)
+// + Source (original name of the function)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+// + Naked (boolean, whether the function is undecorated)
+// - global variable: (external linkage global variable rewriting)
+// + Source (original name of externally visible variable)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+// - global alias: (global alias rewriting)
+// + Source (original name of the aliased name)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+//
+// Note that source and exactly one of [Target, Transform] must be provided
+//
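+// For illustration only (this example is not part of the upstream header; the
+// symbol names are made up), a rewrite map consistent with the grammar and
+// descriptor fields above might look like:
+//
+//   function: {
+//     source: foo,
+//     target: bar,
+//   }
+//   global variable: {
+//     source: counter_(.*),
+//     transform: shared_\1,
+//   }
+//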
+// New rewrite descriptors can be created. Adding a new rewrite descriptor
+// involves:
+//
+// a) extending the rewrite descriptor kind enumeration
+// (<anonymous>::RewriteDescriptor::RewriteDescriptorType)
+// b) implementing the new descriptor
+// (c.f. <anonymous>::ExplicitRewriteFunctionDescriptor)
+// c) extending the rewrite map parser
+// (<anonymous>::RewriteMapParser::parseEntry)
+//
+// Specify to rewrite the symbols using the `-rewrite-symbols` option, and
+// specify the map file to use for the rewriting via the `-rewrite-map-file`
+// option.
+//
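+// For example, a plausible invocation (illustrative; the file names are made
+// up) would be:
+//
+//   opt -rewrite-symbols -rewrite-map-file=rewrite.map -S in.ll -o out.ll
+//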
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLParser.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace SymbolRewriter;
+
+#define DEBUG_TYPE "symbol-rewriter"
+
+static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
+ cl::desc("Symbol Rewrite Map"),
+ cl::value_desc("filename"),
+ cl::Hidden);
+
+static void rewriteComdat(Module &M, GlobalObject *GO,
+ const std::string &Source,
+ const std::string &Target) {
+ if (Comdat *CD = GO->getComdat()) {
+ auto &Comdats = M.getComdatSymbolTable();
+
+ Comdat *C = M.getOrInsertComdat(Target);
+ C->setSelectionKind(CD->getSelectionKind());
+ GO->setComdat(C);
+
+ Comdats.erase(Comdats.find(Source));
+ }
+}
+
+namespace {
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const>
+class ExplicitRewriteDescriptor : public RewriteDescriptor {
+public:
+ const std::string Source;
+ const std::string Target;
+
+ ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked)
+ : RewriteDescriptor(DT),
+ Source(std::string(Naked ? StringRef("\01" + S.str()) : S)),
+ Target(std::string(T)) {}
+
+ bool performOnModule(Module &M) override;
+
+ static bool classof(const RewriteDescriptor *RD) {
+ return RD->getType() == DT;
+ }
+};
+
+} // end anonymous namespace
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const>
+bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
+ bool Changed = false;
+ if (ValueType *S = (M.*Get)(Source)) {
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(S))
+ rewriteComdat(M, GO, Source, Target);
+
+ if (Value *T = (M.*Get)(Target))
+ S->setValueName(T->getValueName());
+ else
+ S->setName(Target);
+
+ Changed = true;
+ }
+ return Changed;
+}
+
+namespace {
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const,
+ iterator_range<typename iplist<ValueType>::iterator>
+ (Module::*Iterator)()>
+class PatternRewriteDescriptor : public RewriteDescriptor {
+public:
+ const std::string Pattern;
+ const std::string Transform;
+
+ PatternRewriteDescriptor(StringRef P, StringRef T)
+ : RewriteDescriptor(DT), Pattern(std::string(P)),
+ Transform(std::string(T)) {}
+
+ bool performOnModule(Module &M) override;
+
+ static bool classof(const RewriteDescriptor *RD) {
+ return RD->getType() == DT;
+ }
+};
+
+} // end anonymous namespace
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const,
+ iterator_range<typename iplist<ValueType>::iterator>
+ (Module::*Iterator)()>
+bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>::
+performOnModule(Module &M) {
+ bool Changed = false;
+ for (auto &C : (M.*Iterator)()) {
+ std::string Error;
+
+ std::string Name = Regex(Pattern).sub(Transform, C.getName(), &Error);
+ if (!Error.empty())
+ report_fatal_error("unable to transform " + C.getName() + " in " +
+ M.getModuleIdentifier() + ": " + Error);
+
+ if (C.getName() == Name)
+ continue;
+
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
+ rewriteComdat(M, GO, std::string(C.getName()), Name);
+
+ if (Value *V = (M.*Get)(Name))
+ C.setValueName(V->getValueName());
+ else
+ C.setName(Name);
+
+ Changed = true;
+ }
+ return Changed;
+}
+
+namespace {
+
+/// Represents a rewrite for an explicitly named (function) symbol. Both the
+/// source function name and target function name of the transformation are
+/// explicitly spelt out.
+using ExplicitRewriteFunctionDescriptor =
+ ExplicitRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
+ &Module::getFunction>;
+
+/// Represents a rewrite for an explicitly named (global variable) symbol. Both
+/// the source variable name and target variable name are spelt out. This
+/// applies only to module level variables.
+using ExplicitRewriteGlobalVariableDescriptor =
+ ExplicitRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+ GlobalVariable, &Module::getGlobalVariable>;
+
+/// Represents a rewrite for an explicitly named global alias. Both the source
+/// and target name are explicitly spelt out.
+using ExplicitRewriteNamedAliasDescriptor =
+ ExplicitRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
+ &Module::getNamedAlias>;
+
+/// Represents a rewrite for functions whose names match a regular expression
+/// pattern. A pattern for the function name and a transformation that
+/// determines the target function name together form the rewrite rule.
+using PatternRewriteFunctionDescriptor =
+ PatternRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
+ &Module::getFunction, &Module::functions>;
+
+/// Represents a rewrite for a global variable based upon a matching pattern.
+/// Each global variable matching the provided pattern will be transformed as
+/// described in the transformation pattern for the target. Applies only to
+/// module level variables.
+using PatternRewriteGlobalVariableDescriptor =
+ PatternRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+ GlobalVariable, &Module::getGlobalVariable,
+ &Module::globals>;
+
+/// PatternRewriteNamedAliasDescriptor - represents a rewrite for global
+/// aliases which match a given pattern. The provided transformation will be
+/// applied to each of the matching names.
+using PatternRewriteNamedAliasDescriptor =
+ PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
+ &Module::getNamedAlias, &Module::aliases>;
+
+} // end anonymous namespace
+
+bool RewriteMapParser::parse(const std::string &MapFile,
+ RewriteDescriptorList *DL) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Mapping =
+ MemoryBuffer::getFile(MapFile);
+
+ if (!Mapping)
+ report_fatal_error("unable to read rewrite map '" + MapFile + "': " +
+ Mapping.getError().message());
+
+ if (!parse(*Mapping, DL))
+ report_fatal_error("unable to parse rewrite map '" + MapFile + "'");
+
+ return true;
+}
+
+bool RewriteMapParser::parse(std::unique_ptr<MemoryBuffer> &MapFile,
+ RewriteDescriptorList *DL) {
+ SourceMgr SM;
+ yaml::Stream YS(MapFile->getBuffer(), SM);
+
+ for (auto &Document : YS) {
+ yaml::MappingNode *DescriptorList;
+
+ // ignore empty documents
+ if (isa<yaml::NullNode>(Document.getRoot()))
+ continue;
+
+ DescriptorList = dyn_cast<yaml::MappingNode>(Document.getRoot());
+ if (!DescriptorList) {
+ YS.printError(Document.getRoot(), "DescriptorList node must be a map");
+ return false;
+ }
+
+ for (auto &Descriptor : *DescriptorList)
+ if (!parseEntry(YS, Descriptor, DL))
+ return false;
+ }
+
+ return true;
+}
+
+bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
+ RewriteDescriptorList *DL) {
+ yaml::ScalarNode *Key;
+ yaml::MappingNode *Value;
+ SmallString<32> KeyStorage;
+ StringRef RewriteType;
+
+ Key = dyn_cast<yaml::ScalarNode>(Entry.getKey());
+ if (!Key) {
+ YS.printError(Entry.getKey(), "rewrite type must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::MappingNode>(Entry.getValue());
+ if (!Value) {
+ YS.printError(Entry.getValue(), "rewrite descriptor must be a map");
+ return false;
+ }
+
+ RewriteType = Key->getValue(KeyStorage);
+ if (RewriteType.equals("function"))
+ return parseRewriteFunctionDescriptor(YS, Key, Value, DL);
+ else if (RewriteType.equals("global variable"))
+ return parseRewriteGlobalVariableDescriptor(YS, Key, Value, DL);
+ else if (RewriteType.equals("global alias"))
+ return parseRewriteGlobalAliasDescriptor(YS, Key, Value, DL);
+
+ YS.printError(Entry.getKey(), "unknown rewrite type");
+ return false;
+}
+
+bool RewriteMapParser::
+parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ bool Naked = false;
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = std::string(Value->getValue(ValueStorage));
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("transform")) {
+ Transform = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("naked")) {
+ std::string Undecorated;
+
+ Undecorated = std::string(Value->getValue(ValueStorage));
+ Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1";
+ } else {
+ YS.printError(Field.getKey(), "unknown key for function");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ // TODO see if there is a more elegant solution to selecting the rewrite
+ // descriptor type
+ if (!Target.empty())
+ DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>(
+ Source, Target, Naked));
+ else
+ DL->push_back(
+ std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
+
+ return true;
+}
+
+bool RewriteMapParser::
+parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor Key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = std::string(Value->getValue(ValueStorage));
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("transform")) {
+ Transform = std::string(Value->getValue(ValueStorage));
+ } else {
+ YS.printError(Field.getKey(), "unknown Key for Global Variable");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ if (!Target.empty())
+ DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
+ else
+ DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
+ Source, Transform));
+
+ return true;
+}
+
+bool RewriteMapParser::
+parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = std::string(Value->getValue(ValueStorage));
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("transform")) {
+ Transform = std::string(Value->getValue(ValueStorage));
+ } else {
+ YS.printError(Field.getKey(), "unknown key for Global Alias");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ if (!Target.empty())
+ DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
+ else
+ DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>(
+ Source, Transform));
+
+ return true;
+}
+
+namespace {
+
+class RewriteSymbolsLegacyPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteSymbolsLegacyPass();
+ RewriteSymbolsLegacyPass(SymbolRewriter::RewriteDescriptorList &DL);
+
+ bool runOnModule(Module &M) override;
+
+private:
+ RewriteSymbolPass Impl;
+};
+
+} // end anonymous namespace
+
+char RewriteSymbolsLegacyPass::ID = 0;
+
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass() : ModulePass(ID) {
+ initializeRewriteSymbolsLegacyPassPass(*PassRegistry::getPassRegistry());
+}
+
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass(
+ SymbolRewriter::RewriteDescriptorList &DL)
+ : ModulePass(ID), Impl(DL) {}
+
+bool RewriteSymbolsLegacyPass::runOnModule(Module &M) {
+ return Impl.runImpl(M);
+}
+
+PreservedAnalyses RewriteSymbolPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!runImpl(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool RewriteSymbolPass::runImpl(Module &M) {
+ bool Changed;
+
+ Changed = false;
+ for (auto &Descriptor : Descriptors)
+ Changed |= Descriptor->performOnModule(M);
+
+ return Changed;
+}
+
+void RewriteSymbolPass::loadAndParseMapFiles() {
+ const std::vector<std::string> MapFiles(RewriteMapFiles);
+ SymbolRewriter::RewriteMapParser Parser;
+
+ for (const auto &MapFile : MapFiles)
+ Parser.parse(MapFile, &Descriptors);
+}
+
+INITIALIZE_PASS(RewriteSymbolsLegacyPass, "rewrite-symbols", "Rewrite Symbols",
+ false, false)
+
+ModulePass *llvm::createRewriteSymbolsPass() {
+ return new RewriteSymbolsLegacyPass();
+}
+
+ModulePass *
+llvm::createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &DL) {
+ return new RewriteSymbolsLegacyPass(DL);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index 34e5b067a2..3631733713 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -1,60 +1,60 @@
-//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
+//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
// This pass is used to ensure that functions have at most one return and one
// unreachable instruction in them.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
char UnifyFunctionExitNodesLegacyPass::ID = 0;
-
+
UnifyFunctionExitNodesLegacyPass::UnifyFunctionExitNodesLegacyPass()
: FunctionPass(ID) {
initializeUnifyFunctionExitNodesLegacyPassPass(
*PassRegistry::getPassRegistry());
-}
-
+}
+
INITIALIZE_PASS(UnifyFunctionExitNodesLegacyPass, "mergereturn",
- "Unify function exit nodes", false, false)
-
-Pass *llvm::createUnifyFunctionExitNodesPass() {
+ "Unify function exit nodes", false, false)
+
+Pass *llvm::createUnifyFunctionExitNodesPass() {
return new UnifyFunctionExitNodesLegacyPass();
-}
-
+}
+
void UnifyFunctionExitNodesLegacyPass::getAnalysisUsage(
AnalysisUsage &AU) const {
- // We preserve the non-critical-edgeness property
- AU.addPreservedID(BreakCriticalEdgesID);
- // This is a cluster of orthogonal Transforms
- AU.addPreservedID(LowerSwitchID);
-}
-
+ // We preserve the non-critical-edgeness property
+ AU.addPreservedID(BreakCriticalEdgesID);
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(LowerSwitchID);
+}
+
namespace {
bool unifyUnreachableBlocks(Function &F) {
std::vector<BasicBlock *> UnreachableBlocks;
- for (BasicBlock &I : F)
+ for (BasicBlock &I : F)
if (isa<UnreachableInst>(I.getTerminator()))
- UnreachableBlocks.push_back(&I);
-
+ UnreachableBlocks.push_back(&I);
+
if (UnreachableBlocks.size() <= 1)
return false;
-
+
BasicBlock *UnreachableBlock =
BasicBlock::Create(F.getContext(), "UnifiedUnreachableBlock", &F);
new UnreachableInst(F.getContext(), UnreachableBlock);
@@ -62,8 +62,8 @@ bool unifyUnreachableBlocks(Function &F) {
for (BasicBlock *BB : UnreachableBlocks) {
BB->getInstList().pop_back(); // Remove the unreachable inst.
BranchInst::Create(UnreachableBlock, BB);
- }
-
+ }
+
return true;
}
@@ -75,39 +75,39 @@ bool unifyReturnBlocks(Function &F) {
ReturningBlocks.push_back(&I);
if (ReturningBlocks.size() <= 1)
- return false;
-
+ return false;
+
// Insert a new basic block into the function, add PHI nodes (if the function
// returns values), and convert all of the return instructions into
// unconditional branches.
- BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
- "UnifiedReturnBlock", &F);
-
- PHINode *PN = nullptr;
- if (F.getReturnType()->isVoidTy()) {
- ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
- } else {
- // If the function doesn't return void... add a PHI node to the block...
- PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
- "UnifiedRetVal");
- NewRetBlock->getInstList().push_back(PN);
- ReturnInst::Create(F.getContext(), PN, NewRetBlock);
- }
-
- // Loop over all of the blocks, replacing the return instruction with an
- // unconditional branch.
- for (BasicBlock *BB : ReturningBlocks) {
- // Add an incoming element to the PHI node for every return instruction that
- // is merging into this new block...
- if (PN)
- PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
-
- BB->getInstList().pop_back(); // Remove the return insn
- BranchInst::Create(NewRetBlock, BB);
- }
-
- return true;
-}
+ BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
+ "UnifiedReturnBlock", &F);
+
+ PHINode *PN = nullptr;
+ if (F.getReturnType()->isVoidTy()) {
+ ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ } else {
+ // If the function doesn't return void... add a PHI node to the block...
+ PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ NewRetBlock->getInstList().push_back(PN);
+ ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ }
+
+ // Loop over all of the blocks, replacing the return instruction with an
+ // unconditional branch.
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Add an incoming element to the PHI node for every return instruction that
+ // is merging into this new block...
+ if (PN)
+ PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+ BB->getInstList().pop_back(); // Remove the return insn
+ BranchInst::Create(NewRetBlock, BB);
+ }
+
+ return true;
+}
} // namespace
// Unify all exit nodes of the CFG by creating a new BasicBlock, and converting
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp
index dc73534be7..0b718ed613 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -1,223 +1,223 @@
-//===- UnifyLoopExits.cpp - Redirect exiting edges to one block -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// For each natural loop with multiple exit blocks, this pass creates a new
-// block N such that all exiting blocks now branch to N, and then control flow
-// is redistributed to all the original exit blocks.
-//
-// Limitation: This assumes that all terminators in the CFG are direct branches
-// (the "br" instruction). The presence of any other control flow
-// such as indirectbr, switch or callbr will cause an assert.
-//
-//===----------------------------------------------------------------------===//
-
+//===- UnifyLoopExits.cpp - Redirect exiting edges to one block -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// For each natural loop with multiple exit blocks, this pass creates a new
+// block N such that all exiting blocks now branch to N, and then control flow
+// is redistributed to all the original exit blocks.
+//
+// Limitation: This assumes that all terminators in the CFG are direct branches
+// (the "br" instruction). The presence of any other control flow
+// such as indirectbr, switch or callbr will cause an assert.
+//
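+// For example (illustrative): a loop with exiting blocks %a and %b that branch
+// to exit blocks %x and %y is rewritten so that %a and %b both branch to a new
+// block (named "loop.exit" below), which then routes control to %x or %y
+// through guard blocks according to which exiting block was taken.
+//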
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-#define DEBUG_TYPE "unify-loop-exits"
-
-using namespace llvm;
-
-namespace {
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "unify-loop-exits"
+
+using namespace llvm;
+
+namespace {
struct UnifyLoopExitsLegacyPass : public FunctionPass {
- static char ID;
+ static char ID;
UnifyLoopExitsLegacyPass() : FunctionPass(ID) {
initializeUnifyLoopExitsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LowerSwitchID);
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreservedID(LowerSwitchID);
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-};
-} // namespace
-
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
char UnifyLoopExitsLegacyPass::ID = 0;
-
+
FunctionPass *llvm::createUnifyLoopExitsPass() {
return new UnifyLoopExitsLegacyPass();
}
-
+
INITIALIZE_PASS_BEGIN(UnifyLoopExitsLegacyPass, "unify-loop-exits",
- "Fixup each natural loop to have a single exit block",
- false /* Only looks at CFG */, false /* Analysis Pass */)
+ "Fixup each natural loop to have a single exit block",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(UnifyLoopExitsLegacyPass, "unify-loop-exits",
- "Fixup each natural loop to have a single exit block",
- false /* Only looks at CFG */, false /* Analysis Pass */)
-
-// The current transform introduces new control flow paths which may break the
-// SSA requirement that every def must dominate all its uses. For example,
-// consider a value D defined inside the loop that is used by some instruction
-// U outside the loop. It follows that D dominates U, since the original
-// program has valid SSA form. After merging the exits, all paths from D to U
-// now flow through the unified exit block. In addition, there may be other
-// paths that do not pass through D, but now reach the unified exit
-// block. Thus, D no longer dominates U.
-//
-// Restore the dominance by creating a phi for each such D at the new unified
-// loop exit. But when doing this, ignore any uses U that are in the new unified
-// loop exit, since those were introduced specially when the block was created.
-//
-// The use of SSAUpdater seems like overkill for this operation. The location
-// for creating the new PHI is well-known, and also the set of incoming blocks
-// to the new PHI.
-static void restoreSSA(const DominatorTree &DT, const Loop *L,
- const SetVector<BasicBlock *> &Incoming,
- BasicBlock *LoopExitBlock) {
- using InstVector = SmallVector<Instruction *, 8>;
+ "Fixup each natural loop to have a single exit block",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+
+// The current transform introduces new control flow paths which may break the
+// SSA requirement that every def must dominate all its uses. For example,
+// consider a value D defined inside the loop that is used by some instruction
+// U outside the loop. It follows that D dominates U, since the original
+// program has valid SSA form. After merging the exits, all paths from D to U
+// now flow through the unified exit block. In addition, there may be other
+// paths that do not pass through D, but now reach the unified exit
+// block. Thus, D no longer dominates U.
+//
+// Restore the dominance by creating a phi for each such D at the new unified
+// loop exit. But when doing this, ignore any uses U that are in the new unified
+// loop exit, since those were introduced specially when the block was created.
+//
+// The use of SSAUpdater seems like overkill for this operation. The location
+// for creating the new PHI is well-known, and also the set of incoming blocks
+// to the new PHI.
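+//
+// For illustration (made-up names): if %v is defined in exiting block %a and
+// used by an instruction %u outside the loop, the unified exit block receives
+//
+//   %v.moved = phi i32 [ %v, %a ], [ undef, %b ]
+//
+// with undef on the edges from exiting blocks that %v does not dominate, and
+// %u is rewritten to use %v.moved instead of %v.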
+static void restoreSSA(const DominatorTree &DT, const Loop *L,
+ const SetVector<BasicBlock *> &Incoming,
+ BasicBlock *LoopExitBlock) {
+ using InstVector = SmallVector<Instruction *, 8>;
using IIMap = MapVector<Instruction *, InstVector>;
- IIMap ExternalUsers;
- for (auto BB : L->blocks()) {
- for (auto &I : *BB) {
- for (auto &U : I.uses()) {
- auto UserInst = cast<Instruction>(U.getUser());
- auto UserBlock = UserInst->getParent();
- if (UserBlock == LoopExitBlock)
- continue;
- if (L->contains(UserBlock))
- continue;
- LLVM_DEBUG(dbgs() << "added ext use for " << I.getName() << "("
- << BB->getName() << ")"
- << ": " << UserInst->getName() << "("
- << UserBlock->getName() << ")"
- << "\n");
- ExternalUsers[&I].push_back(UserInst);
- }
- }
- }
-
- for (auto II : ExternalUsers) {
- // For each Def used outside the loop, create NewPhi in
- // LoopExitBlock. NewPhi receives Def only along exiting blocks that
- // dominate it, while the remaining values are undefined since those paths
- // didn't exist in the original CFG.
- auto Def = II.first;
- LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
- auto NewPhi = PHINode::Create(Def->getType(), Incoming.size(),
- Def->getName() + ".moved",
- LoopExitBlock->getTerminator());
- for (auto In : Incoming) {
- LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
- if (Def->getParent() == In || DT.dominates(Def, In)) {
- LLVM_DEBUG(dbgs() << "dominated\n");
- NewPhi->addIncoming(Def, In);
- } else {
- LLVM_DEBUG(dbgs() << "not dominated\n");
- NewPhi->addIncoming(UndefValue::get(Def->getType()), In);
- }
- }
-
- LLVM_DEBUG(dbgs() << "external users:");
- for (auto U : II.second) {
- LLVM_DEBUG(dbgs() << " " << U->getName());
- U->replaceUsesOfWith(Def, NewPhi);
- }
- LLVM_DEBUG(dbgs() << "\n");
- }
-}
-
-static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
- // To unify the loop exits, we need a list of the exiting blocks as
- // well as exit blocks. The functions for locating these lists both
- // traverse the entire loop body. It is more efficient to first
- // locate the exiting blocks and then examine their successors to
- // locate the exit blocks.
- SetVector<BasicBlock *> ExitingBlocks;
- SetVector<BasicBlock *> Exits;
-
- // We need SetVectors, but the Loop API takes a vector, so we use a temporary.
- SmallVector<BasicBlock *, 8> Temp;
- L->getExitingBlocks(Temp);
- for (auto BB : Temp) {
- ExitingBlocks.insert(BB);
- for (auto S : successors(BB)) {
- auto SL = LI.getLoopFor(S);
- // A successor is not an exit if it is directly or indirectly in the
- // current loop.
- if (SL == L || L->contains(SL))
- continue;
- Exits.insert(S);
- }
- }
-
- LLVM_DEBUG(
- dbgs() << "Found exit blocks:";
- for (auto Exit : Exits) {
- dbgs() << " " << Exit->getName();
- }
- dbgs() << "\n";
-
- dbgs() << "Found exiting blocks:";
- for (auto EB : ExitingBlocks) {
- dbgs() << " " << EB->getName();
- }
- dbgs() << "\n";);
-
- if (Exits.size() <= 1) {
- LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n");
- return false;
- }
-
- SmallVector<BasicBlock *, 8> GuardBlocks;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks,
- Exits, "loop.exit");
-
- restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
-
-#if defined(EXPENSIVE_CHECKS)
- assert(DT.verify(DominatorTree::VerificationLevel::Full));
-#else
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-#endif // EXPENSIVE_CHECKS
- L->verifyLoop();
-
- // The guard blocks were created outside the loop, so they need to become
- // members of the parent loop.
- if (auto ParentLoop = L->getParentLoop()) {
- for (auto G : GuardBlocks) {
- ParentLoop->addBasicBlockToLoop(G, LI);
- }
- ParentLoop->verifyLoop();
- }
-
-#if defined(EXPENSIVE_CHECKS)
- LI.verify(DT);
-#endif // EXPENSIVE_CHECKS
-
- return true;
-}
-
+ IIMap ExternalUsers;
+ for (auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ for (auto &U : I.uses()) {
+ auto UserInst = cast<Instruction>(U.getUser());
+ auto UserBlock = UserInst->getParent();
+ if (UserBlock == LoopExitBlock)
+ continue;
+ if (L->contains(UserBlock))
+ continue;
+ LLVM_DEBUG(dbgs() << "added ext use for " << I.getName() << "("
+ << BB->getName() << ")"
+ << ": " << UserInst->getName() << "("
+ << UserBlock->getName() << ")"
+ << "\n");
+ ExternalUsers[&I].push_back(UserInst);
+ }
+ }
+ }
+
+ for (auto II : ExternalUsers) {
+ // For each Def used outside the loop, create NewPhi in
+ // LoopExitBlock. NewPhi receives Def only along exiting blocks that
+ // dominate it, while the remaining values are undefined since those paths
+ // didn't exist in the original CFG.
+ auto Def = II.first;
+ LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
+ auto NewPhi = PHINode::Create(Def->getType(), Incoming.size(),
+ Def->getName() + ".moved",
+ LoopExitBlock->getTerminator());
+ for (auto In : Incoming) {
+ LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
+ if (Def->getParent() == In || DT.dominates(Def, In)) {
+ LLVM_DEBUG(dbgs() << "dominated\n");
+ NewPhi->addIncoming(Def, In);
+ } else {
+ LLVM_DEBUG(dbgs() << "not dominated\n");
+ NewPhi->addIncoming(UndefValue::get(Def->getType()), In);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "external users:");
+ for (auto U : II.second) {
+ LLVM_DEBUG(dbgs() << " " << U->getName());
+ U->replaceUsesOfWith(Def, NewPhi);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+}
+
+static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
+ // To unify the loop exits, we need a list of the exiting blocks as
+ // well as exit blocks. The functions for locating these lists both
+ // traverse the entire loop body. It is more efficient to first
+ // locate the exiting blocks and then examine their successors to
+ // locate the exit blocks.
+ SetVector<BasicBlock *> ExitingBlocks;
+ SetVector<BasicBlock *> Exits;
+
+ // We need SetVectors, but the Loop API takes a vector, so we use a temporary.
+ SmallVector<BasicBlock *, 8> Temp;
+ L->getExitingBlocks(Temp);
+ for (auto BB : Temp) {
+ ExitingBlocks.insert(BB);
+ for (auto S : successors(BB)) {
+ auto SL = LI.getLoopFor(S);
+ // A successor is not an exit if it is directly or indirectly in the
+ // current loop.
+ if (SL == L || L->contains(SL))
+ continue;
+ Exits.insert(S);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Found exit blocks:";
+ for (auto Exit : Exits) {
+ dbgs() << " " << Exit->getName();
+ }
+ dbgs() << "\n";
+
+ dbgs() << "Found exiting blocks:";
+ for (auto EB : ExitingBlocks) {
+ dbgs() << " " << EB->getName();
+ }
+ dbgs() << "\n";);
+
+ if (Exits.size() <= 1) {
+ LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n");
+ return false;
+ }
+
+ SmallVector<BasicBlock *, 8> GuardBlocks;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks,
+ Exits, "loop.exit");
+
+ restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
+
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full));
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+#endif // EXPENSIVE_CHECKS
+ L->verifyLoop();
+
+ // The guard blocks were created outside the loop, so they need to become
+ // members of the parent loop.
+ if (auto ParentLoop = L->getParentLoop()) {
+ for (auto G : GuardBlocks) {
+ ParentLoop->addBasicBlockToLoop(G, LI);
+ }
+ ParentLoop->verifyLoop();
+ }
+
+#if defined(EXPENSIVE_CHECKS)
+ LI.verify(DT);
+#endif // EXPENSIVE_CHECKS
+
+ return true;
+}
+
static bool runImpl(LoopInfo &LI, DominatorTree &DT) {
-
- bool Changed = false;
- auto Loops = LI.getLoopsInPreorder();
- for (auto L : Loops) {
- LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: "
- << LI.getLoopDepth(L->getHeader()) << ")\n");
- Changed |= unifyLoopExits(DT, LI, L);
- }
- return Changed;
-}
+
+ bool Changed = false;
+ auto Loops = LI.getLoopsInPreorder();
+ for (auto L : Loops) {
+ LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: "
+ << LI.getLoopDepth(L->getHeader()) << ")\n");
+ Changed |= unifyLoopExits(DT, LI, L);
+ }
+ return Changed;
+}
bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) {
LLVM_DEBUG(dbgs() << "===== Unifying loop exits in function " << F.getName()
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
index 3afa0b8c77..c57cec6be6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
@@ -1,48 +1,48 @@
-//===- UniqueInternalLinkageNames.cpp - Unique Internal Linkage Sym Names -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements unique naming of internal linkage symbols with option
-// -funique-internal-linkage-symbols.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/UniqueInternalLinkageNames.h"
-#include "llvm/ADT/SmallString.h"
+//===- UniqueInternalLinkageNames.cpp - Unique Internal Linkage Sym Names -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements unique naming of internal linkage symbols with option
+// -funique-internal-linkage-symbols.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UniqueInternalLinkageNames.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/MD5.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-static bool uniqueifyInternalLinkageNames(Module &M) {
- llvm::MD5 Md5;
- Md5.update(M.getSourceFileName());
- llvm::MD5::MD5Result R;
- Md5.final(R);
- SmallString<32> Str;
- llvm::MD5::stringifyResult(R, Str);
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+static bool uniqueifyInternalLinkageNames(Module &M) {
+ llvm::MD5 Md5;
+ Md5.update(M.getSourceFileName());
+ llvm::MD5::MD5Result R;
+ Md5.final(R);
+ SmallString<32> Str;
+ llvm::MD5::stringifyResult(R, Str);
 // Convert the MD5 hash to decimal. Demangler suffixes can either contain
 // numbers or characters but not both.
APInt IntHash = APInt(128, Str.str(), 16);
// Prepend "__uniq" before the hash for tools like profilers to understand that
// this symbol is of internal linkage type.
std::string ModuleNameHash = (Twine(".__uniq.") + Twine(IntHash.toString(10, false))).str();
- bool Changed = false;
+ bool Changed = false;
MDBuilder MDB(M.getContext());
-
- // Append the module hash to all internal linkage functions.
- for (auto &F : M) {
- if (F.hasInternalLinkage()) {
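+ // For example (illustrative, made-up name): an internal function "log" is
+ // renamed to "log.__uniq.<decimal MD5 of the source file name>".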
- F.setName(F.getName() + ModuleNameHash);
+
+ // Append the module hash to all internal linkage functions.
+ for (auto &F : M) {
+ if (F.hasInternalLinkage()) {
+ F.setName(F.getName() + ModuleNameHash);
F.addFnAttr("sample-profile-suffix-elision-policy", "selected");
// Replace linkage names in the debug metadata.
if (DISubprogram *SP = F.getSubprogram()) {
@@ -55,64 +55,64 @@ static bool uniqueifyInternalLinkageNames(Module &M) {
}
}
}
- Changed = true;
- }
- }
-
- // Append the module hash to all internal linkage globals.
- for (auto &GV : M.globals()) {
- if (GV.hasInternalLinkage()) {
- GV.setName(GV.getName() + ModuleNameHash);
- Changed = true;
- }
- }
- return Changed;
-}
-
-namespace {
-
-// Legacy pass that provides a unique name to every internal linkage symbol.
-class UniqueInternalLinkageNamesLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override {
- return "Unique Internal Linkage Names";
- }
-
- explicit UniqueInternalLinkageNamesLegacyPass() : ModulePass(ID) {
- initializeUniqueInternalLinkageNamesLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- return uniqueifyInternalLinkageNames(M);
- }
-};
-
-char UniqueInternalLinkageNamesLegacyPass::ID = 0;
-} // anonymous namespace
-
-PreservedAnalyses
-UniqueInternalLinkageNamesPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!uniqueifyInternalLinkageNames(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-INITIALIZE_PASS_BEGIN(UniqueInternalLinkageNamesLegacyPass,
- "unique-internal-linkage-names",
- "Uniqueify internal linkage names", false, false)
-INITIALIZE_PASS_END(UniqueInternalLinkageNamesLegacyPass,
- "unique-internal-linkage-names",
- "Uniqueify Internal linkage names", false, false)
-
-namespace llvm {
-ModulePass *createUniqueInternalLinkageNamesPass() {
- return new UniqueInternalLinkageNamesLegacyPass();
-}
-} // namespace llvm
+ Changed = true;
+ }
+ }
+
+ // Append the module hash to all internal linkage globals.
+ for (auto &GV : M.globals()) {
+ if (GV.hasInternalLinkage()) {
+ GV.setName(GV.getName() + ModuleNameHash);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+namespace {
+
+// Legacy pass that appends a module-unique suffix to every internal linkage name.
+class UniqueInternalLinkageNamesLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override {
+ return "Unique Internal Linkage Names";
+ }
+
+ explicit UniqueInternalLinkageNamesLegacyPass() : ModulePass(ID) {
+ initializeUniqueInternalLinkageNamesLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ return uniqueifyInternalLinkageNames(M);
+ }
+};
+
+char UniqueInternalLinkageNamesLegacyPass::ID = 0;
+} // anonymous namespace
+
+PreservedAnalyses
+UniqueInternalLinkageNamesPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!uniqueifyInternalLinkageNames(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(UniqueInternalLinkageNamesLegacyPass,
+ "unique-internal-linkage-names",
+ "Uniqueify internal linkage names", false, false)
+INITIALIZE_PASS_END(UniqueInternalLinkageNamesLegacyPass,
+ "unique-internal-linkage-names",
+ "Uniqueify Internal linkage names", false, false)
+
+namespace llvm {
+ModulePass *createUniqueInternalLinkageNamesPass() {
+ return new UniqueInternalLinkageNamesLegacyPass();
+}
+} // namespace llvm
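For readers skimming the diff above: the suffix is derived solely from the module's source file name (an MD5 hash rendered in decimal, since demangler suffixes may contain digits or letters but not both) and is appended to every internal-linkage function and global. A minimal standalone sketch of that computation, assuming LLVM 12's MD5/APInt APIs and a hypothetical helper name:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"
#include <string>

// Illustrative only: mirrors the suffix computation in the pass above.
static std::string computeUniqSuffix(llvm::StringRef SourceFileName) {
  llvm::MD5 Md5;
  Md5.update(SourceFileName);          // hash only the module's source path
  llvm::MD5::MD5Result R;
  Md5.final(R);
  llvm::SmallString<32> Hex;
  llvm::MD5::stringifyResult(R, Hex);  // 32 hex characters
  llvm::APInt IntHash(128, Hex.str(), 16);
  // Decimal digits only, so demanglers treat the suffix as a plain number.
  return ".__uniq." + IntHash.toString(10, /*Signed=*/false);
}

A static function foo defined in a.cpp is then emitted as foo.__uniq.<decimal hash>, which keeps internal-linkage symbols from different translation units distinct in profiles.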
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp
index b55bfc7d52..73c0532f3f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp
@@ -1,66 +1,66 @@
-//===-- Utils.cpp - TransformUtils Infrastructure -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the common initialization infrastructure for the
-// TransformUtils library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/Utils.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassRegistry.h"
-
-using namespace llvm;
-
-/// initializeTransformUtils - Initialize all passes in the TransformUtils
-/// library.
-void llvm::initializeTransformUtils(PassRegistry &Registry) {
- initializeAddDiscriminatorsLegacyPassPass(Registry);
- initializeAssumeSimplifyPassLegacyPassPass(Registry);
- initializeAssumeBuilderPassLegacyPassPass(Registry);
- initializeBreakCriticalEdgesPass(Registry);
- initializeCanonicalizeAliasesLegacyPassPass(Registry);
- initializeCanonicalizeFreezeInLoopsPass(Registry);
- initializeInstNamerPass(Registry);
- initializeLCSSAWrapperPassPass(Registry);
- initializeLibCallsShrinkWrapLegacyPassPass(Registry);
- initializeLoopSimplifyPass(Registry);
- initializeLowerInvokeLegacyPassPass(Registry);
+//===-- Utils.cpp - TransformUtils Infrastructure -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// TransformUtils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Utils.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// initializeTransformUtils - Initialize all passes in the TransformUtils
+/// library.
+void llvm::initializeTransformUtils(PassRegistry &Registry) {
+ initializeAddDiscriminatorsLegacyPassPass(Registry);
+ initializeAssumeSimplifyPassLegacyPassPass(Registry);
+ initializeAssumeBuilderPassLegacyPassPass(Registry);
+ initializeBreakCriticalEdgesPass(Registry);
+ initializeCanonicalizeAliasesLegacyPassPass(Registry);
+ initializeCanonicalizeFreezeInLoopsPass(Registry);
+ initializeInstNamerPass(Registry);
+ initializeLCSSAWrapperPassPass(Registry);
+ initializeLibCallsShrinkWrapLegacyPassPass(Registry);
+ initializeLoopSimplifyPass(Registry);
+ initializeLowerInvokeLegacyPassPass(Registry);
initializeLowerSwitchLegacyPassPass(Registry);
- initializeNameAnonGlobalLegacyPassPass(Registry);
- initializePromoteLegacyPassPass(Registry);
+ initializeNameAnonGlobalLegacyPassPass(Registry);
+ initializePromoteLegacyPassPass(Registry);
initializeStripNonLineTableDebugLegacyPassPass(Registry);
initializeUnifyFunctionExitNodesLegacyPassPass(Registry);
- initializeMetaRenamerPass(Registry);
+ initializeMetaRenamerPass(Registry);
initializeStripGCRelocatesLegacyPass(Registry);
- initializePredicateInfoPrinterLegacyPassPass(Registry);
- initializeInjectTLIMappingsLegacyPass(Registry);
- initializeFixIrreduciblePass(Registry);
+ initializePredicateInfoPrinterLegacyPassPass(Registry);
+ initializeInjectTLIMappingsLegacyPass(Registry);
+ initializeFixIrreduciblePass(Registry);
initializeUnifyLoopExitsLegacyPassPass(Registry);
- initializeUniqueInternalLinkageNamesLegacyPassPass(Registry);
-}
-
-/// LLVMInitializeTransformUtils - C binding for initializeTransformUtils.
-void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
- initializeTransformUtils(*unwrap(R));
-}
-
-void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerSwitchPass());
-}
-
-void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPromoteMemoryToRegisterPass());
-}
-
-void LLVMAddAddDiscriminatorsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAddDiscriminatorsPass());
-}
+ initializeUniqueInternalLinkageNamesLegacyPassPass(Registry);
+}
+
+/// LLVMInitializeTransformUtils - C binding for initializeTransformUtils.
+void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
+ initializeTransformUtils(*unwrap(R));
+}
+
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSwitchPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddAddDiscriminatorsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAddDiscriminatorsPass());
+}
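The three LLVMAdd* wrappers above are the TransformUtils passes exposed through the stable C API. A short sketch of how an embedding application might drive them, assuming the LLVM 12 C headers and a hypothetical runUtilsPasses helper (error handling omitted):

#include "llvm-c/Core.h"
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/Utils.h"

static void runUtilsPasses(LLVMModuleRef M) {
  // Make sure the TransformUtils passes are registered first.
  LLVMInitializeTransformUtils(LLVMGetGlobalPassRegistry());

  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddLowerSwitchPass(PM);              // lower switch to branches
  LLVMAddPromoteMemoryToRegisterPass(PM);  // mem2reg
  LLVMAddAddDiscriminatorsPass(PM);        // DWARF discriminators
  LLVMRunPassManager(PM, M);
  LLVMDisposePassManager(PM);
}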
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp
index b718ce8b4d..61cd8595a7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp
@@ -1,57 +1,57 @@
-#include "llvm/Transforms/Utils/VNCoercion.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/Support/Debug.h"
-
-#define DEBUG_TYPE "vncoerce"
-
-namespace llvm {
-namespace VNCoercion {
-
-static bool isFirstClassAggregateOrScalableType(Type *Ty) {
- return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
-}
-
-/// Return true if coerceAvailableValueToLoadType will succeed.
-bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
- const DataLayout &DL) {
- Type *StoredTy = StoredVal->getType();
-
- if (StoredTy == LoadTy)
- return true;
-
- // If the loaded/stored value is a first class array/struct, or scalable type,
- // don't try to transform them. We need to be able to bitcast to integer.
- if (isFirstClassAggregateOrScalableType(LoadTy) ||
- isFirstClassAggregateOrScalableType(StoredTy))
- return false;
-
- uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
-
- // The store size must be byte-aligned to support future type casts.
- if (llvm::alignTo(StoreSize, 8) != StoreSize)
- return false;
-
- // The store has to be at least as big as the load.
- if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
- return false;
-
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "vncoerce"
+
+namespace llvm {
+namespace VNCoercion {
+
+static bool isFirstClassAggregateOrScalableType(Type *Ty) {
+ return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
+}
+
+/// Return true if coerceAvailableValueToLoadType will succeed.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+ const DataLayout &DL) {
+ Type *StoredTy = StoredVal->getType();
+
+ if (StoredTy == LoadTy)
+ return true;
+
+ // If the loaded/stored value is a first class array/struct, or scalable type,
+ // don't try to transform them. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy) ||
+ isFirstClassAggregateOrScalableType(StoredTy))
+ return false;
+
+ uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
+
+ // The store size must be byte-aligned to support future type casts.
+ if (llvm::alignTo(StoreSize, 8) != StoreSize)
+ return false;
+
+ // The store has to be at least as big as the load.
+ if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
+ return false;
+
bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
bool LoadNI = DL.isNonIntegralPointerType(LoadTy->getScalarType());
- // Don't coerce non-integral pointers to integers or vice versa.
+ // Don't coerce non-integral pointers to integers or vice versa.
if (StoredNI != LoadNI) {
- // As a special case, allow coercion of memset used to initialize
- // an array w/null. Despite non-integral pointers not generally having a
- // specific bit pattern, we do assume null is zero.
- if (auto *CI = dyn_cast<Constant>(StoredVal))
- return CI->isNullValue();
- return false;
+ // As a special case, allow coercion of memset used to initialize
+ // an array w/null. Despite non-integral pointers not generally having a
+ // specific bit pattern, we do assume null is zero.
+ if (auto *CI = dyn_cast<Constant>(StoredVal))
+ return CI->isNullValue();
+ return false;
} else if (StoredNI && LoadNI &&
StoredTy->getPointerAddressSpace() !=
LoadTy->getPointerAddressSpace()) {
return false;
- }
+ }
// The implementation below uses inttoptr for vectors of unequal size; we
@@ -60,570 +60,570 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedSize())
return false;
- return true;
-}
-
-template <class T, class HelperClass>
-static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
- HelperClass &Helper,
- const DataLayout &DL) {
- assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
- "precondition violation - materialization can't fail");
- if (auto *C = dyn_cast<Constant>(StoredVal))
- StoredVal = ConstantFoldConstant(C, DL);
-
- // If this is already the right type, just return it.
- Type *StoredValTy = StoredVal->getType();
-
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
-
- // If the store and reload are the same size, we can always reuse it.
- if (StoredValSize == LoadedValSize) {
- // Pointer to Pointer -> use bitcast.
- if (StoredValTy->isPtrOrPtrVectorTy() && LoadedTy->isPtrOrPtrVectorTy()) {
- StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
- } else {
- // Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->isPtrOrPtrVectorTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->isPtrOrPtrVectorTy())
- TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
-
- if (StoredValTy != TypeToCastTo)
- StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
-
- // Cast to pointer if the load needs a pointer type.
- if (LoadedTy->isPtrOrPtrVectorTy())
- StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
- StoredVal = ConstantFoldConstant(C, DL);
-
- return StoredVal;
- }
- // If the loaded value is smaller than the available value, then we can
- // extract out a piece from it. If the available value is too small, then we
- // can't do anything.
- assert(StoredValSize >= LoadedValSize &&
- "canCoerceMustAliasedValueToLoad fail");
-
- // Convert source pointers to integers, which can be manipulated.
- if (StoredValTy->isPtrOrPtrVectorTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- // Convert vectors and fp to integer, which can be manipulated.
- if (!StoredValTy->isIntegerTy()) {
- StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
- StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
- }
-
- // If this is a big-endian system, we need to shift the value down to the low
- // bits so that a truncate will work.
- if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
- DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
- StoredVal = Helper.CreateLShr(
- StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
- }
-
- // Truncate the integer to the right size now.
- Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
- StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
-
- if (LoadedTy != NewIntTy) {
- // If the result is a pointer, inttoptr.
- if (LoadedTy->isPtrOrPtrVectorTy())
- StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
- else
- // Otherwise, bitcast.
- StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- StoredVal = ConstantFoldConstant(C, DL);
-
- return StoredVal;
-}
-
-/// If we saw a store of a value to memory, and
-/// then a load from a must-aliased pointer of a different type, try to coerce
-/// the stored value. LoadedTy is the type of the load we want to replace.
-/// IRB is IRBuilder used to insert new instructions.
-///
-/// If we can't do it, return null.
-Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
- IRBuilderBase &IRB,
- const DataLayout &DL) {
- return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
-}
-
-/// This function is called when we have a memdep query of a load that ends up
-/// being a clobbering memory write (store, memset, memcpy, memmove). This
-/// means that the write *may* provide bits used by the load but we can't be
-/// sure because the pointers don't must-alias.
-///
-/// Check this case to see if there is anything more we can do before we give
-/// up. This returns -1 if we have to give up, or a byte number in the stored
-/// value of the piece that feeds the load.
-static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
- Value *WritePtr,
- uint64_t WriteSizeInBits,
- const DataLayout &DL) {
- // If the loaded/stored value is a first class array/struct, or scalable type,
- // don't try to transform them. We need to be able to bitcast to integer.
- if (isFirstClassAggregateOrScalableType(LoadTy))
- return -1;
-
- int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase =
- GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
- if (StoreBase != LoadBase)
- return -1;
-
- // If the load and store are to the exact same address, they should have been
- // a must alias. AA must have gotten confused.
- // FIXME: Study to see if/when this happens. One case is forwarding a memset
- // to a load from the base of the memset.
-
- // If the load and store don't overlap at all, the store doesn't provide
- // anything to the load. In this case, they really don't alias at all, AA
- // must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
-
- if ((WriteSizeInBits & 7) | (LoadSize & 7))
- return -1;
- uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
- LoadSize /= 8;
-
- bool isAAFailure = false;
- if (StoreOffset < LoadOffset)
- isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
- else
- isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
-
- if (isAAFailure)
- return -1;
-
- // If the Load isn't completely contained within the stored bits, we don't
- // have all the bits to feed it. We could do something crazy in the future
- // (issue a smaller load then merge the bits in) but this seems unlikely to be
- // valuable.
- if (StoreOffset > LoadOffset ||
- StoreOffset + StoreSize < LoadOffset + LoadSize)
- return -1;
-
- // Okay, we can do this transformation. Return the number of bytes into the
- // store that the load is.
- return LoadOffset - StoreOffset;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.
-int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
- StoreInst *DepSI, const DataLayout &DL) {
- auto *StoredVal = DepSI->getValueOperand();
-
- // Cannot handle reading from store of first-class aggregate or scalable type.
- if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
- return -1;
-
+ return true;
+}
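In short, coercion is allowed when both types have byte-sized layouts, the store is at least as wide as the load, and non-integral pointers only coerce from a literal null constant; first-class aggregates and scalable vectors are rejected outright. A hypothetical caller-side check, assuming the public VNCoercion.h header from LLVM 12:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

// Illustrative only: ask whether the value stored by SI could be reused to
// satisfy LI if the two accesses are known to must-alias.
static bool storeCanFeedLoad(llvm::StoreInst *SI, llvm::LoadInst *LI,
                             const llvm::DataLayout &DL) {
  return llvm::VNCoercion::canCoerceMustAliasedValueToLoad(
      SI->getValueOperand(), LI->getType(), DL);
}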
+
+template <class T, class HelperClass>
+static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+ "precondition violation - materialization can't fail");
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ StoredVal = ConstantFoldConstant(C, DL);
+
+ // If this is already the right type, just return it.
+ Type *StoredValTy = StoredVal->getType();
+
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
+
+ // If the store and reload are the same size, we can always reuse it.
+ if (StoredValSize == LoadedValSize) {
+ // Pointer to Pointer -> use bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy() && LoadedTy->isPtrOrPtrVectorTy()) {
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ } else {
+ // Convert source pointers to integers, which can be bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ Type *TypeToCastTo = LoadedTy;
+ if (TypeToCastTo->isPtrOrPtrVectorTy())
+ TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+ if (StoredValTy != TypeToCastTo)
+ StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
+
+ // Cast to pointer if the load needs a pointer type.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
+ StoredVal = ConstantFoldConstant(C, DL);
+
+ return StoredVal;
+ }
+ // If the loaded value is smaller than the available value, then we can
+ // extract out a piece from it. If the available value is too small, then we
+ // can't do anything.
+ assert(StoredValSize >= LoadedValSize &&
+ "canCoerceMustAliasedValueToLoad fail");
+
+ // Convert source pointers to integers, which can be manipulated.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ // Convert vectors and fp to integer, which can be manipulated.
+ if (!StoredValTy->isIntegerTy()) {
+ StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
+ StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
+ }
+
+ // If this is a big-endian system, we need to shift the value down to the low
+ // bits so that a truncate will work.
+ if (DL.isBigEndian()) {
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
+ DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
+ StoredVal = Helper.CreateLShr(
+ StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
+ }
+
+ // Truncate the integer to the right size now.
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
+ StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
+
+ if (LoadedTy != NewIntTy) {
+ // If the result is a pointer, inttoptr.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ else
+ // Otherwise, bitcast.
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ StoredVal = ConstantFoldConstant(C, DL);
+
+ return StoredVal;
+}
+
+/// If we saw a store of a value to memory, and
+/// then a load from a must-aliased pointer of a different type, try to coerce
+/// the stored value. LoadedTy is the type of the load we want to replace.
+/// IRB is IRBuilder used to insert new instructions.
+///
+/// If we can't do it, return null.
+Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+ IRBuilderBase &IRB,
+ const DataLayout &DL) {
+ return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering memory write (store, memset, memcpy, memmove). This
+/// means that the write *may* provide bits used by the load but we can't be
+/// sure because the pointers don't must-alias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up. This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+ Value *WritePtr,
+ uint64_t WriteSizeInBits,
+ const DataLayout &DL) {
+ // If the loaded/stored value is a first class array/struct, or scalable type,
+ // don't try to transform them. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy))
+ return -1;
+
+ int64_t StoreOffset = 0, LoadOffset = 0;
+ Value *StoreBase =
+ GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+ Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+ if (StoreBase != LoadBase)
+ return -1;
+
+ // If the load and store are to the exact same address, they should have been
+ // a must alias. AA must have gotten confused.
+ // FIXME: Study to see if/when this happens. One case is forwarding a memset
+ // to a load from the base of the memset.
+
+ // If the load and store don't overlap at all, the store doesn't provide
+ // anything to the load. In this case, they really don't alias at all, AA
+ // must have gotten confused.
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
+
+ if ((WriteSizeInBits & 7) | (LoadSize & 7))
+ return -1;
+ uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
+ LoadSize /= 8;
+
+ bool isAAFailure = false;
+ if (StoreOffset < LoadOffset)
+ isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
+ else
+ isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
+
+ if (isAAFailure)
+ return -1;
+
+ // If the Load isn't completely contained within the stored bits, we don't
+ // have all the bits to feed it. We could do something crazy in the future
+ // (issue a smaller load then merge the bits in) but this seems unlikely to be
+ // valuable.
+ if (StoreOffset > LoadOffset ||
+ StoreOffset + StoreSize < LoadOffset + LoadSize)
+ return -1;
+
+ // Okay, we can do this transformation. Return the number of bytes into the
+ // store that the load is.
+ return LoadOffset - StoreOffset;
+}
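The arithmetic above reduces to two interval checks on (offset, size) pairs measured in bytes from the common base pointer. A plain-integer sketch of the same test (illustrative only; the real code also rejects non-byte-sized accesses first):

#include <cstdint>

// Returns the byte offset of the load inside the store, or -1 if the store
// cannot supply all of the load's bytes.
static int64_t offsetOfLoadInStore(int64_t StoreOffset, uint64_t StoreSize,
                                   int64_t LoadOffset, uint64_t LoadSize) {
  // No overlap at all: alias analysis was overly conservative, give up.
  if (StoreOffset + (int64_t)StoreSize <= LoadOffset ||
      LoadOffset + (int64_t)LoadSize <= StoreOffset)
    return -1;
  // The load must be fully covered by the stored bytes.
  if (StoreOffset > LoadOffset ||
      StoreOffset + (int64_t)StoreSize < LoadOffset + (int64_t)LoadSize)
    return -1;
  return LoadOffset - StoreOffset;
}

For a 16-byte store at offset 0 and a 4-byte load at offset 4 this yields 4; move the load to offset 14 and it yields -1 because two of its bytes fall outside the store.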
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+ StoreInst *DepSI, const DataLayout &DL) {
+ auto *StoredVal = DepSI->getValueOperand();
+
+ // Cannot handle reading from store of first-class aggregate or scalable type.
+ if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
+ return -1;
+
if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
return -1;
-
- Value *StorePtr = DepSI->getPointerOperand();
- uint64_t StoreSize =
- DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
- return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
- DL);
-}
-
-/// Looks at a memory location for a load (specified by MemLocBase, Offs, and
-/// Size) and compares it against a load.
-///
-/// If the specified load could be safely widened to a larger integer load
-/// that is 1) still efficient, 2) safe for the target, and 3) would provide
-/// the specified memory location value, then this function returns the size
-/// in bytes of the load width to use. If not, this returns zero.
-static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase,
- int64_t MemLocOffs,
- unsigned MemLocSize,
- const LoadInst *LI) {
- // We can only extend simple integer loads.
- if (!isa<IntegerType>(LI->getType()) || !LI->isSimple())
- return 0;
-
- // Load widening is hostile to ThreadSanitizer: it may cause false positives
- // or make the reports more cryptic (access sizes are wrong).
- if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
- return 0;
-
- const DataLayout &DL = LI->getModule()->getDataLayout();
-
- // Get the base of this load.
- int64_t LIOffs = 0;
- const Value *LIBase =
- GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, DL);
-
- // If the two pointers are not based on the same pointer, we can't tell that
- // they are related.
- if (LIBase != MemLocBase)
- return 0;
-
- // Okay, the two values are based on the same pointer, but returned as
- // no-alias. This happens when we have things like two byte loads at "P+1"
- // and "P+3". Check to see if increasing the size of the "LI" load up to its
- // alignment (or the largest native integer type) will allow us to load all
- // the bits required by MemLoc.
-
- // If MemLoc is before LI, then no widening of LI will help us out.
- if (MemLocOffs < LIOffs)
- return 0;
-
- // Get the alignment of the load in bytes. We assume that it is safe to load
- // any legal integer up to this size without a problem. For example, if we're
- // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
- // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it
- // to i16.
- unsigned LoadAlign = LI->getAlignment();
-
- int64_t MemLocEnd = MemLocOffs + MemLocSize;
-
- // If no amount of rounding up will let MemLoc fit into LI, then bail out.
- if (LIOffs + LoadAlign < MemLocEnd)
- return 0;
-
- // This is the size of the load to try. Start with the next larger power of
- // two.
- unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
- NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
-
- while (true) {
- // If this load size is bigger than our known alignment or would not fit
- // into a native integer register, then we fail.
- if (NewLoadByteSize > LoadAlign ||
- !DL.fitsInLegalInteger(NewLoadByteSize * 8))
- return 0;
-
- if (LIOffs + NewLoadByteSize > MemLocEnd &&
- (LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeAddress) ||
- LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeHWAddress)))
- // We will be reading past the location accessed by the original program.
- // While this is safe in a regular build, Address Safety analysis tools
- // may start reporting false warnings. So, don't do widening.
- return 0;
-
- // If a load of this width would include all of MemLoc, then we succeed.
- if (LIOffs + NewLoadByteSize >= MemLocEnd)
- return NewLoadByteSize;
-
- NewLoadByteSize <<= 1;
- }
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being clobbered by another load. See if
-/// the other load can feed into the second load.
-int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
- const DataLayout &DL) {
- // Cannot handle reading from a load of a first-class aggregate yet.
- if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
- return -1;
-
+
+ Value *StorePtr = DepSI->getPointerOperand();
+ uint64_t StoreSize =
+ DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
+ DL);
+}
+
+/// Looks at a memory location for a load (specified by MemLocBase, Offs, and
+/// Size) and compares it against a load.
+///
+/// If the specified load could be safely widened to a larger integer load
+/// that is 1) still efficient, 2) safe for the target, and 3) would provide
+/// the specified memory location value, then this function returns the size
+/// in bytes of the load width to use. If not, this returns zero.
+static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase,
+ int64_t MemLocOffs,
+ unsigned MemLocSize,
+ const LoadInst *LI) {
+ // We can only extend simple integer loads.
+ if (!isa<IntegerType>(LI->getType()) || !LI->isSimple())
+ return 0;
+
+ // Load widening is hostile to ThreadSanitizer: it may cause false positives
+ // or make the reports more cryptic (access sizes are wrong).
+ if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
+ return 0;
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ // Get the base of this load.
+ int64_t LIOffs = 0;
+ const Value *LIBase =
+ GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, DL);
+
+ // If the two pointers are not based on the same pointer, we can't tell that
+ // they are related.
+ if (LIBase != MemLocBase)
+ return 0;
+
+ // Okay, the two values are based on the same pointer, but returned as
+ // no-alias. This happens when we have things like two byte loads at "P+1"
+ // and "P+3". Check to see if increasing the size of the "LI" load up to its
+ // alignment (or the largest native integer type) will allow us to load all
+ // the bits required by MemLoc.
+
+ // If MemLoc is before LI, then no widening of LI will help us out.
+ if (MemLocOffs < LIOffs)
+ return 0;
+
+ // Get the alignment of the load in bytes. We assume that it is safe to load
+ // any legal integer up to this size without a problem. For example, if we're
+ // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
+ // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it
+ // to i16.
+ unsigned LoadAlign = LI->getAlignment();
+
+ int64_t MemLocEnd = MemLocOffs + MemLocSize;
+
+ // If no amount of rounding up will let MemLoc fit into LI, then bail out.
+ if (LIOffs + LoadAlign < MemLocEnd)
+ return 0;
+
+ // This is the size of the load to try. Start with the next larger power of
+ // two.
+ unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
+ NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
+
+ while (true) {
+ // If this load size is bigger than our known alignment or would not fit
+ // into a native integer register, then we fail.
+ if (NewLoadByteSize > LoadAlign ||
+ !DL.fitsInLegalInteger(NewLoadByteSize * 8))
+ return 0;
+
+ if (LIOffs + NewLoadByteSize > MemLocEnd &&
+ (LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeAddress) ||
+ LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeHWAddress)))
+ // We will be reading past the location accessed by the original program.
+ // While this is safe in a regular build, Address Safety analysis tools
+ // may start reporting false warnings. So, don't do widening.
+ return 0;
+
+ // If a load of this width would include all of MemLoc, then we succeed.
+ if (LIOffs + NewLoadByteSize >= MemLocEnd)
+ return NewLoadByteSize;
+
+ NewLoadByteSize <<= 1;
+ }
+}
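Stripped of the sanitizer and legal-integer guards, the widening search is a doubling loop bounded by the load's known alignment. A plain-integer sketch under those simplifying assumptions (hypothetical helper, not the LLVM API):

#include <cstdint>
#include "llvm/Support/MathExtras.h" // llvm::NextPowerOf2

static unsigned widenedLoadBytes(int64_t LIOffs, unsigned LoadBytes,
                                 unsigned LoadAlign, int64_t MemLocEnd) {
  unsigned NewSize = llvm::NextPowerOf2(LoadBytes); // e.g. 2-byte load -> try 4
  while (true) {
    if (NewSize > LoadAlign)          // never read past the known alignment
      return 0;
    if (LIOffs + NewSize >= MemLocEnd)
      return NewSize;                 // wide enough to cover MemLoc
    NewSize <<= 1;
  }
}

For a 2-byte load that is 8-byte aligned at offset 0 and a MemLoc ending at offset 4, the first candidate is 4 bytes and it already covers the location.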
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load. See if
+/// the other load can feed into the second load.
+int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
+ const DataLayout &DL) {
+ // Cannot handle reading from a load of a first-class aggregate yet.
+ if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+ return -1;
+
if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
- return -1;
-
- Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
- int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
- if (R != -1)
- return R;
-
- // If we have a load/load clobber and DepLI can be widened to cover this load,
- // then we should widen it!
- int64_t LoadOffs = 0;
- const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
-
- unsigned Size =
- getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
- if (Size == 0)
- return -1;
-
- // Check non-obvious conditions enforced by MDA which we rely on for being
- // able to materialize this potentially available value
- assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
- assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
-
- return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
-}
-
-int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
- MemIntrinsic *MI, const DataLayout &DL) {
- // If the mem operation is a non-constant size, we can't handle it.
- ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
- if (!SizeCst)
- return -1;
- uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
-
- // If this is a memset, we just need to see if the offset is valid within the
- // size of the memset.
- if (MI->getIntrinsicID() == Intrinsic::memset) {
- if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
- auto *CI = dyn_cast<ConstantInt>(cast<MemSetInst>(MI)->getValue());
- if (!CI || !CI->isZero())
- return -1;
- }
- return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
- }
-
- // If we have a memcpy/memmove, the only case we can handle is if this is a
- // copy from constant memory. In that case, we can read directly from the
- // constant memory.
- MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
- Constant *Src = dyn_cast<Constant>(MTI->getSource());
- if (!Src)
- return -1;
-
+ return -1;
+
+ Value *DepPtr = DepLI->getPointerOperand();
+ uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
+ int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+ if (R != -1)
+ return R;
+
+ // If we have a load/load clobber and DepLI can be widened to cover this load,
+ // then we should widen it!
+ int64_t LoadOffs = 0;
+ const Value *LoadBase =
+ GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
+
+ unsigned Size =
+ getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
+ if (Size == 0)
+ return -1;
+
+ // Check non-obvious conditions enforced by MDA which we rely on for being
+ // able to materialize this potentially available value
+ assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+}
+
+int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+ MemIntrinsic *MI, const DataLayout &DL) {
+ // If the mem operation is a non-constant size, we can't handle it.
+ ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+ if (!SizeCst)
+ return -1;
+ uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
+
+ // If this is a memset, we just need to see if the offset is valid within the
+ // size of the memset.
+ if (MI->getIntrinsicID() == Intrinsic::memset) {
+ if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
+ auto *CI = dyn_cast<ConstantInt>(cast<MemSetInst>(MI)->getValue());
+ if (!CI || !CI->isZero())
+ return -1;
+ }
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+ }
+
+ // If we have a memcpy/memmove, the only case we can handle is if this is a
+ // copy from constant memory. In that case, we can read directly from the
+ // constant memory.
+ MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+ Constant *Src = dyn_cast<Constant>(MTI->getSource());
+ if (!Src)
+ return -1;
+
GlobalVariable *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(Src));
- if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
- return -1;
-
- // See if the access is within the bounds of the transfer.
- int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
- if (Offset == -1)
- return Offset;
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- if (Offset) {
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
- Src, OffsetCst);
- }
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
- return Offset;
- return -1;
-}
-
-template <class T, class HelperClass>
-static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
- HelperClass &Helper,
- const DataLayout &DL) {
- LLVMContext &Ctx = SrcVal->getType()->getContext();
-
- // If two pointers are in the same address space, they have the same size,
- // so we don't need to do any truncation, etc. This avoids introducing
- // ptrtoint instructions for pointers that may be non-integral.
- if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
- cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
- cast<PointerType>(LoadTy)->getAddressSpace()) {
- return SrcVal;
- }
-
- uint64_t StoreSize =
- (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
- // Compute which bits of the stored value are being used by the load. Convert
- // to an integer type to start with.
- if (SrcVal->getType()->isPtrOrPtrVectorTy())
- SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
- if (!SrcVal->getType()->isIntegerTy())
- SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
-
- // Shift the bits to the least significant depending on endianness.
- unsigned ShiftAmt;
- if (DL.isLittleEndian())
- ShiftAmt = Offset * 8;
- else
- ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
- if (ShiftAmt)
- SrcVal = Helper.CreateLShr(SrcVal,
- ConstantInt::get(SrcVal->getType(), ShiftAmt));
-
- if (LoadSize != StoreSize)
- SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
- IntegerType::get(Ctx, LoadSize * 8));
- return SrcVal;
-}
-
-/// This function is called when we have a memdep query of a load that ends up
-/// being a clobbering store. This means that the store provides bits used by
-/// the load but the pointers don't must-alias. Check this case to see if
-/// there is anything more we can do before we give up.
-Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL) {
-
- IRBuilder<> Builder(InsertPt);
- SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
- return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
-}
-
-Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
- Type *LoadTy, const DataLayout &DL) {
- ConstantFolder F;
- SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
- return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
-}
-
-/// This function is called when we have a memdep query of a load that ends up
-/// being a clobbering load. This means that the clobbering load *may* provide
-/// bits used by this load, but we can't be sure because the pointers don't
-/// must-alias.
-/// Check this case to see if there is anything more we can do before we give
-/// up.
-Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL) {
- // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
- // widen SrcVal out to a larger load.
- unsigned SrcValStoreSize =
- DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
- if (Offset + LoadSize > SrcValStoreSize) {
- assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
- assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
- // If we have a load/load clobber and DepLI can be widened to cover this
- // load, then we should widen it to the next power-of-2 size that is big enough!
- unsigned NewLoadSize = Offset + LoadSize;
- if (!isPowerOf2_32(NewLoadSize))
- NewLoadSize = NextPowerOf2(NewLoadSize);
-
- Value *PtrVal = SrcVal->getPointerOperand();
- // Insert the new load after the old load. This ensures that subsequent
- // memdep queries will find the new load. We can't easily remove the old
- // load completely because it is already in the value numbering table.
- IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- Type *DestTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
- Type *DestPTy =
- PointerType::get(DestTy, PtrVal->getType()->getPointerAddressSpace());
- Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
- PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
- LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal);
- NewLoad->takeName(SrcVal);
- NewLoad->setAlignment(SrcVal->getAlign());
-
- LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
- // Replace uses of the original load with the wider load. On a big endian
- // system, we need to shift down to get the relevant bits.
- Value *RV = NewLoad;
- if (DL.isBigEndian())
- RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
- RV = Builder.CreateTrunc(RV, SrcVal->getType());
- SrcVal->replaceAllUsesWith(RV);
-
- SrcVal = NewLoad;
- }
-
- return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
-}
-
-Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
- Type *LoadTy, const DataLayout &DL) {
- unsigned SrcValStoreSize =
- DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
- if (Offset + LoadSize > SrcValStoreSize)
- return nullptr;
- return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
-}
-
-template <class T, class HelperClass>
-T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, HelperClass &Helper,
- const DataLayout &DL) {
- LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
-
- // We know that this method is only called when the mem transfer fully
- // provides the bits for the load.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
- // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
- // independently of what the offset is.
- T *Val = cast<T>(MSI->getValue());
- if (LoadSize != 1)
- Val =
- Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
- T *OneElt = Val;
-
- // Splat the value out to the right number of bits.
- for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
- // If we can double the number of bytes set, do it.
- if (NumBytesSet * 2 <= LoadSize) {
- T *ShVal = Helper.CreateShl(
- Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
- Val = Helper.CreateOr(Val, ShVal);
- NumBytesSet <<= 1;
- continue;
- }
-
- // Otherwise insert one byte at a time.
- T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
- Val = Helper.CreateOr(OneElt, ShVal);
- ++NumBytesSet;
- }
-
- return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
- }
-
- // Otherwise, this is a memcpy/memmove from a constant global.
- MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
- Constant *Src = cast<Constant>(MTI->getSource());
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- if (Offset) {
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
- Src, OffsetCst);
- }
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering mem intrinsic.
-Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- const DataLayout &DL) {
- IRBuilder<> Builder(InsertPt);
- return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
- LoadTy, Builder, DL);
-}
-
-Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, const DataLayout &DL) {
- // The only case analyzeLoadFromClobberingMemInst cannot be converted to a
- // constant is when it's a memset of a non-constant.
- if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
- if (!isa<Constant>(MSI->getValue()))
- return nullptr;
- ConstantFolder F;
- return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
- LoadTy, F, DL);
-}
-} // namespace VNCoercion
-} // namespace llvm
+ if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+ return -1;
+
+ // See if the access is within the bounds of the transfer.
+ int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+ if (Offset == -1)
+ return Offset;
+
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ if (Offset) {
+ Src = ConstantExpr::getBitCast(Src,
+ Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
+ Src, OffsetCst);
+ }
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
+ return Offset;
+ return -1;
+}
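A typical GVN-style client runs this analysis first and, only when it returns a non-negative byte offset, asks the materialization helpers further down to build the forwarded value. A hypothetical caller, assuming the LLVM 12 headers:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

// Illustrative only: forward a value from a clobbering memset/memcpy to Load.
static llvm::Value *tryForwardFromMemIntrinsic(llvm::LoadInst *Load,
                                               llvm::MemIntrinsic *MI,
                                               const llvm::DataLayout &DL) {
  using namespace llvm::VNCoercion;
  int Offset = analyzeLoadFromClobberingMemInst(
      Load->getType(), Load->getPointerOperand(), MI, DL);
  if (Offset == -1)
    return nullptr; // the intrinsic does not fully cover the load
  return getMemInstValueForLoad(MI, (unsigned)Offset, Load->getType(),
                                /*InsertPt=*/Load, DL);
}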
+
+template <class T, class HelperClass>
+static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
+ uint64_t StoreSize =
+ (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
+ uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
+ // Compute which bits of the stored value are being used by the load. Convert
+ // to an integer type to start with.
+ if (SrcVal->getType()->isPtrOrPtrVectorTy())
+ SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
+ if (!SrcVal->getType()->isIntegerTy())
+ SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
+
+ // Shift the bits to the least significant depending on endianness.
+ unsigned ShiftAmt;
+ if (DL.isLittleEndian())
+ ShiftAmt = Offset * 8;
+ else
+ ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
+ if (ShiftAmt)
+ SrcVal = Helper.CreateLShr(SrcVal,
+ ConstantInt::get(SrcVal->getType(), ShiftAmt));
+
+ if (LoadSize != StoreSize)
+ SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
+ IntegerType::get(Ctx, LoadSize * 8));
+ return SrcVal;
+}
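Concretely, the helper turns "which bytes of the store does the load read" into a logical shift whose amount depends on endianness: Offset * 8 on little-endian targets and (StoreSize - LoadSize - Offset) * 8 on big-endian ones. The same extraction on a plain 64-bit value (illustrative only, hypothetical helper name):

#include <cstdint>

// Returns the 32 bits that a 4-byte load at byte Offset into an 8-byte store
// would observe, given the store's value as an integer.
static uint32_t bytesAtOffset(uint64_t Stored, unsigned Offset, bool BigEndian) {
  const unsigned StoreSize = 8, LoadSize = 4;
  unsigned ShiftAmt = BigEndian ? (StoreSize - LoadSize - Offset) * 8
                                : Offset * 8;
  return static_cast<uint32_t>(Stored >> ShiftAmt); // the trunc step
}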
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store. This means that the store provides bits used by
+/// the load but the pointers don't must-alias. Check this case to see if
+/// there is anything more we can do before we give up.
+Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+
+ IRBuilder<> Builder(InsertPt);
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
+}
+
+Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ ConstantFolder F;
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering load. This means that the clobbering load *may* provide
+/// bits used by this load, but we can't be sure because the pointers don't
+/// must-alias.
+/// Check this case to see if there is anything more we can do before we give
+/// up.
+Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+ // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+ // widen SrcVal out to a larger load.
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
+ if (Offset + LoadSize > SrcValStoreSize) {
+ assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+ // If we have a load/load clobber and DepLI can be widened to cover this
+ // load, then we should widen it to the next power-of-2 size that is big enough!
+ unsigned NewLoadSize = Offset + LoadSize;
+ if (!isPowerOf2_32(NewLoadSize))
+ NewLoadSize = NextPowerOf2(NewLoadSize);
+
+ Value *PtrVal = SrcVal->getPointerOperand();
+ // Insert the new load after the old load. This ensures that subsequent
+ // memdep queries will find the new load. We can't easily remove the old
+ // load completely because it is already in the value numbering table.
+ IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+ Type *DestTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
+ Type *DestPTy =
+ PointerType::get(DestTy, PtrVal->getType()->getPointerAddressSpace());
+ Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+ PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+ LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal);
+ NewLoad->takeName(SrcVal);
+ NewLoad->setAlignment(SrcVal->getAlign());
+
+ LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+ // Replace uses of the original load with the wider load. On a big endian
+ // system, we need to shift down to get the relevant bits.
+ Value *RV = NewLoad;
+ if (DL.isBigEndian())
+ RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
+ RV = Builder.CreateTrunc(RV, SrcVal->getType());
+ SrcVal->replaceAllUsesWith(RV);
+
+ SrcVal = NewLoad;
+ }
+
+ return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
+ if (Offset + LoadSize > SrcValStoreSize)
+ return nullptr;
+ return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
+}
+
+template <class T, class HelperClass>
+T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = LoadTy->getContext();
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
+
+ // We know that this method is only called when the mem transfer fully
+ // provides the bits for the load.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+ // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+ // independently of what the offset is.
+ T *Val = cast<T>(MSI->getValue());
+ if (LoadSize != 1)
+ Val =
+ Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
+ T *OneElt = Val;
+
+ // Splat the value out to the right number of bits.
+ for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
+ // If we can double the number of bytes set, do it.
+ if (NumBytesSet * 2 <= LoadSize) {
+ T *ShVal = Helper.CreateShl(
+ Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
+ Val = Helper.CreateOr(Val, ShVal);
+ NumBytesSet <<= 1;
+ continue;
+ }
+
+ // Otherwise insert one byte at a time.
+ T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
+ Val = Helper.CreateOr(OneElt, ShVal);
+ ++NumBytesSet;
+ }
+
+ return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
+ }
+
+ // Otherwise, this is a memcpy/memmove from a constant global.
+ MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+ Constant *Src = cast<Constant>(MTI->getSource());
+
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ if (Offset) {
+ Src = ConstantExpr::getBitCast(Src,
+ Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
+ Src, OffsetCst);
+ }
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
+}
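The memset branch builds the splat by repeatedly doubling the populated bytes with a shift-and-or, falling back to appending one byte at a time when doubling would overshoot. The same loop on plain integers, assuming a load of 1 to 8 bytes (illustrative only):

#include <cstdint>

static uint64_t splatByte(uint8_t Byte, unsigned LoadSize) {
  uint64_t Val = Byte, OneElt = Byte;
  for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
    if (NumBytesSet * 2 <= LoadSize) {   // double the populated bytes
      Val |= Val << (NumBytesSet * 8);
      NumBytesSet <<= 1;
      continue;
    }
    Val = OneElt | (Val << 8);           // otherwise append one byte
    ++NumBytesSet;
  }
  return Val;
}

For example, splatByte(0xAB, 4) produces 0xABABABAB, matching what the IR-level loop builds with CreateShl/CreateOr.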
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
+Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, Instruction *InsertPt,
+ const DataLayout &DL) {
+ IRBuilder<> Builder(InsertPt);
+ return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
+ LoadTy, Builder, DL);
+}
+
+Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ // The only case analyzeLoadFromClobberingMemInst cannot be converted to a
+ // constant is when it's a memset of a non-constant.
+ if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
+ if (!isa<Constant>(MSI->getValue()))
+ return nullptr;
+ ConstantFolder F;
+ return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
+ LoadTy, F, DL);
+}
+} // namespace VNCoercion
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp
index 1392ca041c..930e0b7ee0 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp
@@ -1,906 +1,906 @@
-//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the MapValue function, which is shared by various parts of
-// the lib/Transforms/Utils library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include <cassert>
-#include <limits>
-#include <memory>
-#include <utility>
-
-using namespace llvm;
-
-// Out of line method to get vtable etc for class.
-void ValueMapTypeRemapper::anchor() {}
-void ValueMaterializer::anchor() {}
-
-namespace {
-
-/// A basic block used in a BlockAddress whose function body is not yet
-/// materialized.
-struct DelayedBasicBlock {
- BasicBlock *OldBB;
- std::unique_ptr<BasicBlock> TempBB;
-
- DelayedBasicBlock(const BlockAddress &Old)
- : OldBB(Old.getBasicBlock()),
- TempBB(BasicBlock::Create(Old.getContext())) {}
-};
-
-struct WorklistEntry {
- enum EntryKind {
- MapGlobalInit,
- MapAppendingVar,
- MapGlobalIndirectSymbol,
- RemapFunction
- };
- struct GVInitTy {
- GlobalVariable *GV;
- Constant *Init;
- };
- struct AppendingGVTy {
- GlobalVariable *GV;
- Constant *InitPrefix;
- };
- struct GlobalIndirectSymbolTy {
- GlobalIndirectSymbol *GIS;
- Constant *Target;
- };
-
- unsigned Kind : 2;
- unsigned MCID : 29;
- unsigned AppendingGVIsOldCtorDtor : 1;
- unsigned AppendingGVNumNewMembers;
- union {
- GVInitTy GVInit;
- AppendingGVTy AppendingGV;
- GlobalIndirectSymbolTy GlobalIndirectSymbol;
- Function *RemapF;
- } Data;
-};
-
-struct MappingContext {
- ValueToValueMapTy *VM;
- ValueMaterializer *Materializer = nullptr;
-
- /// Construct a MappingContext with a value map and materializer.
- explicit MappingContext(ValueToValueMapTy &VM,
- ValueMaterializer *Materializer = nullptr)
- : VM(&VM), Materializer(Materializer) {}
-};
-
-class Mapper {
- friend class MDNodeMapper;
-
-#ifndef NDEBUG
- DenseSet<GlobalValue *> AlreadyScheduled;
-#endif
-
- RemapFlags Flags;
- ValueMapTypeRemapper *TypeMapper;
- unsigned CurrentMCID = 0;
- SmallVector<MappingContext, 2> MCs;
- SmallVector<WorklistEntry, 4> Worklist;
- SmallVector<DelayedBasicBlock, 1> DelayedBBs;
- SmallVector<Constant *, 16> AppendingInits;
-
-public:
- Mapper(ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer)
- : Flags(Flags), TypeMapper(TypeMapper),
- MCs(1, MappingContext(VM, Materializer)) {}
-
- /// ValueMapper should explicitly call \a flush() before destruction.
- ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); }
-
- bool hasWorkToDo() const { return !Worklist.empty(); }
-
- unsigned
- registerAlternateMappingContext(ValueToValueMapTy &VM,
- ValueMaterializer *Materializer = nullptr) {
- MCs.push_back(MappingContext(VM, Materializer));
- return MCs.size() - 1;
- }
-
- void addFlags(RemapFlags Flags);
-
- void remapGlobalObjectMetadata(GlobalObject &GO);
-
- Value *mapValue(const Value *V);
- void remapInstruction(Instruction *I);
- void remapFunction(Function &F);
-
- Constant *mapConstant(const Constant *C) {
- return cast_or_null<Constant>(mapValue(C));
- }
-
- /// Map metadata.
- ///
- /// Find the mapping for MD. Guarantees that the return will be resolved
- /// (not an MDNode, or MDNode::isResolved() returns true).
- Metadata *mapMetadata(const Metadata *MD);
-
- void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
- unsigned MCID);
- void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers,
- unsigned MCID);
- void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target,
- unsigned MCID);
- void scheduleRemapFunction(Function &F, unsigned MCID);
-
- void flush();
-
-private:
- void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers);
-
- ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; }
- ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; }
-
- Value *mapBlockAddress(const BlockAddress &BA);
-
- /// Map metadata that doesn't require visiting operands.
- Optional<Metadata *> mapSimpleMetadata(const Metadata *MD);
-
- Metadata *mapToMetadata(const Metadata *Key, Metadata *Val);
- Metadata *mapToSelf(const Metadata *MD);
-};
-
-class MDNodeMapper {
- Mapper &M;
-
- /// Data about a node in \a UniquedGraph.
- struct Data {
- bool HasChanged = false;
- unsigned ID = std::numeric_limits<unsigned>::max();
- TempMDNode Placeholder;
- };
-
- /// A graph of uniqued nodes.
- struct UniquedGraph {
- SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties.
- SmallVector<MDNode *, 16> POT; // Post-order traversal.
-
- /// Propagate changed operands through the post-order traversal.
- ///
- /// Iteratively update \a Data::HasChanged for each node based on \a
- /// Data::HasChanged of its operands, until fixed point.
- void propagateChanges();
-
- /// Get a forward reference to a node to use as an operand.
- Metadata &getFwdReference(MDNode &Op);
- };
-
- /// Worklist of distinct nodes whose operands need to be remapped.
- SmallVector<MDNode *, 16> DistinctWorklist;
-
- // Storage for a UniquedGraph.
- SmallDenseMap<const Metadata *, Data, 32> InfoStorage;
- SmallVector<MDNode *, 16> POTStorage;
-
-public:
- MDNodeMapper(Mapper &M) : M(M) {}
-
- /// Map a metadata node (and its transitive operands).
- ///
- /// Map all the (unmapped) nodes in the subgraph under \c N. The iterative
- /// algorithm handles distinct nodes and uniqued node subgraphs using
- /// different strategies.
- ///
- /// Distinct nodes are immediately mapped and added to \a DistinctWorklist
- /// using \a mapDistinctNode(). Their mapping can always be computed
- /// immediately without visiting operands, even if their operands change.
- ///
- /// The mapping for uniqued nodes depends on whether their operands change.
- /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of
- /// a node to calculate uniqued node mappings in bulk. Distinct leaves are
- /// added to \a DistinctWorklist with \a mapDistinctNode().
- ///
- /// After mapping \c N itself, this function remaps the operands of the
- /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c
- /// N has been mapped.
- Metadata *map(const MDNode &N);
-
-private:
- /// Map a top-level uniqued node and the uniqued subgraph underneath it.
- ///
- /// This builds up a post-order traversal of the (unmapped) uniqued subgraph
- /// underneath \c FirstN and calculates the nodes' mapping. Each node uses
- /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its
- /// operands use the identity mapping.
- ///
- /// The algorithm works as follows:
- ///
- /// 1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and
- /// save the post-order traversal in the given \a UniquedGraph, tracking
- /// whether nodes' operands change.
- ///
- /// 2. \a UniquedGraph::propagateChanges(): propagate changed operands
- /// through the \a UniquedGraph until fixed point, following the rule
- /// that if a node changes, any node that references it must also change.
- ///
- /// 3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes
- /// (referencing new operands) where necessary.
- Metadata *mapTopLevelUniquedNode(const MDNode &FirstN);
-
- /// Try to map the operand of an \a MDNode.
- ///
- /// If \c Op is already mapped, return the mapping. If it's not an \a
- /// MDNode, compute and return the mapping. If it's a distinct \a MDNode,
- /// return the result of \a mapDistinctNode().
- ///
- /// \return None if \c Op is an unmapped uniqued \a MDNode.
- /// \post getMappedOp(Op) only returns None if this returns None.
- Optional<Metadata *> tryToMapOperand(const Metadata *Op);
-
- /// Map a distinct node.
- ///
- /// Return the mapping for the distinct node \c N, saving the result in \a
- /// DistinctWorklist for later remapping.
- ///
- /// \pre \c N is not yet mapped.
- /// \pre \c N.isDistinct().
- MDNode *mapDistinctNode(const MDNode &N);
-
- /// Get a previously mapped node.
- Optional<Metadata *> getMappedOp(const Metadata *Op) const;
-
- /// Create a post-order traversal of an unmapped uniqued node subgraph.
- ///
- /// This traverses the metadata graph deeply enough to map \c FirstN. It
- /// uses \a tryToMapOperand() (via \a Mapper::mapSimpleMetadata()), so any
- /// metadata that has already been mapped will not be part of the POT.
- ///
- /// Each node that has a changed operand from outside the graph (e.g., a
- /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata)
- /// is marked with \a Data::HasChanged.
- ///
- /// \return \c true if any nodes in \c G have \a Data::HasChanged.
- /// \post \c G.POT is a post-order traversal ending with \c FirstN.
- /// \post \a Data::HasChanged in \c G.Info indicates whether any node needs
- /// to change because of operands outside the graph.
- bool createPOT(UniquedGraph &G, const MDNode &FirstN);
-
- /// Visit the operands of a uniqued node in the POT.
- ///
- /// Visit the operands in the range from \c I to \c E, returning the first
- /// uniqued node we find that isn't yet in \c G. \c I is always advanced to
- /// where to continue the loop through the operands.
- ///
- /// This sets \c HasChanged if any of the visited operands change.
- MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
- MDNode::op_iterator E, bool &HasChanged);
-
- /// Map all the nodes in the given uniqued graph.
- ///
- /// This visits all the nodes in \c G in post-order, using the identity
- /// mapping or creating a new node depending on \a Data::HasChanged.
- ///
- /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of
- /// their operands outside of \c G.
- /// \pre \a Data::HasChanged is true for a node in \c G iff any of its
- /// operands have changed.
- /// \post \a getMappedOp() returns the mapped node for every node in \c G.
- void mapNodesInPOT(UniquedGraph &G);
-
- /// Remap a node's operands using the given functor.
- ///
- /// Iterate through the operands of \c N and update them in place using \c
- /// mapOperand.
- ///
- /// \pre N.isDistinct() or N.isTemporary().
- template <class OperandMapper>
- void remapOperands(MDNode &N, OperandMapper mapOperand);
-};
-
-} // end anonymous namespace
-
-Value *Mapper::mapValue(const Value *V) {
- ValueToValueMapTy::iterator I = getVM().find(V);
-
- // If the value already exists in the map, use it.
- if (I != getVM().end()) {
- assert(I->second && "Unexpected null mapping");
- return I->second;
- }
-
- // If we have a materializer and it can materialize a value, use that.
- if (auto *Materializer = getMaterializer()) {
- if (Value *NewV = Materializer->materialize(const_cast<Value *>(V))) {
- getVM()[V] = NewV;
- return NewV;
- }
- }
-
- // Global values do not need to be seeded into the VM if they
- // are using the identity mapping.
- if (isa<GlobalValue>(V)) {
- if (Flags & RF_NullMapMissingGlobalValues)
- return nullptr;
- return getVM()[V] = const_cast<Value *>(V);
- }
-
- if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
- // Inline asm may need *type* remapping.
- FunctionType *NewTy = IA->getFunctionType();
- if (TypeMapper) {
- NewTy = cast<FunctionType>(TypeMapper->remapType(NewTy));
-
- if (NewTy != IA->getFunctionType())
- V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
- IA->hasSideEffects(), IA->isAlignStack(),
- IA->getDialect());
- }
-
- return getVM()[V] = const_cast<Value *>(V);
- }
-
- if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) {
- const Metadata *MD = MDV->getMetadata();
-
- if (auto *LAM = dyn_cast<LocalAsMetadata>(MD)) {
- // Look through to grab the local value.
- if (Value *LV = mapValue(LAM->getValue())) {
- if (V == LAM->getValue())
- return const_cast<Value *>(V);
- return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV));
- }
-
- // FIXME: always return nullptr once Verifier::verifyDominatesUse()
- // ensures metadata operands only reference defined SSA values.
- return (Flags & RF_IgnoreMissingLocals)
- ? nullptr
- : MetadataAsValue::get(V->getContext(),
- MDTuple::get(V->getContext(), None));
- }
-
- // If this is a module-level metadata and we know that nothing at the module
- // level is changing, then use an identity mapping.
- if (Flags & RF_NoModuleLevelChanges)
- return getVM()[V] = const_cast<Value *>(V);
-
- // Map the metadata and turn it into a value.
- auto *MappedMD = mapMetadata(MD);
- if (MD == MappedMD)
- return getVM()[V] = const_cast<Value *>(V);
- return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD);
- }
-
- // Okay, this must either be a constant (which may or may not be mappable) or
- // something that is not in the mapping table.
- Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
- if (!C)
- return nullptr;
-
- if (BlockAddress *BA = dyn_cast<BlockAddress>(C))
- return mapBlockAddress(*BA);
-
- auto mapValueOrNull = [this](Value *V) {
- auto Mapped = mapValue(V);
- assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) &&
- "Unexpected null mapping for constant operand without "
- "NullMapMissingGlobalValues flag");
- return Mapped;
- };
-
- // Otherwise, we have some other constant to remap. Start by checking to see
- // if all operands have an identity remapping.
- unsigned OpNo = 0, NumOperands = C->getNumOperands();
- Value *Mapped = nullptr;
- for (; OpNo != NumOperands; ++OpNo) {
- Value *Op = C->getOperand(OpNo);
- Mapped = mapValueOrNull(Op);
- if (!Mapped)
- return nullptr;
- if (Mapped != Op)
- break;
- }
-
- // See if the type mapper wants to remap the type as well.
- Type *NewTy = C->getType();
- if (TypeMapper)
- NewTy = TypeMapper->remapType(NewTy);
-
- // If the result type and all operands match up, then just insert an identity
- // mapping.
- if (OpNo == NumOperands && NewTy == C->getType())
- return getVM()[V] = C;
-
- // Okay, we need to create a new constant. We've already processed some or
- // all of the operands, set them all up now.
- SmallVector<Constant*, 8> Ops;
- Ops.reserve(NumOperands);
- for (unsigned j = 0; j != OpNo; ++j)
- Ops.push_back(cast<Constant>(C->getOperand(j)));
-
- // If one of the operands mismatch, push it and the other mapped operands.
- if (OpNo != NumOperands) {
- Ops.push_back(cast<Constant>(Mapped));
-
- // Map the rest of the operands that aren't processed yet.
- for (++OpNo; OpNo != NumOperands; ++OpNo) {
- Mapped = mapValueOrNull(C->getOperand(OpNo));
- if (!Mapped)
- return nullptr;
- Ops.push_back(cast<Constant>(Mapped));
- }
- }
- Type *NewSrcTy = nullptr;
- if (TypeMapper)
- if (auto *GEPO = dyn_cast<GEPOperator>(C))
- NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType());
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);
- if (isa<ConstantArray>(C))
- return getVM()[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
- if (isa<ConstantStruct>(C))
- return getVM()[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
- if (isa<ConstantVector>(C))
- return getVM()[V] = ConstantVector::get(Ops);
- // If this is a no-operand constant, it must be because the type was remapped.
- if (isa<UndefValue>(C))
- return getVM()[V] = UndefValue::get(NewTy);
- if (isa<ConstantAggregateZero>(C))
- return getVM()[V] = ConstantAggregateZero::get(NewTy);
- assert(isa<ConstantPointerNull>(C));
- return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
-}
-
-Value *Mapper::mapBlockAddress(const BlockAddress &BA) {
- Function *F = cast<Function>(mapValue(BA.getFunction()));
-
- // F may not have materialized its initializer. In that case, create a
- // dummy basic block for now, and replace it once we've materialized all
- // the initializers.
- BasicBlock *BB;
- if (F->empty()) {
- DelayedBBs.push_back(DelayedBasicBlock(BA));
- BB = DelayedBBs.back().TempBB.get();
- } else {
- BB = cast_or_null<BasicBlock>(mapValue(BA.getBasicBlock()));
- }
-
- return getVM()[&BA] = BlockAddress::get(F, BB ? BB : BA.getBasicBlock());
-}
-
-Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) {
- getVM().MD()[Key].reset(Val);
- return Val;
-}
-
-Metadata *Mapper::mapToSelf(const Metadata *MD) {
- return mapToMetadata(MD, const_cast<Metadata *>(MD));
-}
-
-Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
- if (!Op)
- return nullptr;
-
- if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) {
-#ifndef NDEBUG
- if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
- assert((!*MappedOp || M.getVM().count(CMD->getValue()) ||
- M.getVM().getMappedMD(Op)) &&
- "Expected Value to be memoized");
- else
- assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) &&
- "Expected result to be memoized");
-#endif
- return *MappedOp;
- }
-
- const MDNode &N = *cast<MDNode>(Op);
- if (N.isDistinct())
- return mapDistinctNode(N);
- return None;
-}
-
-static Metadata *cloneOrBuildODR(const MDNode &N) {
- auto *CT = dyn_cast<DICompositeType>(&N);
- // If ODR type uniquing is enabled, we would have uniqued composite types
- // with identifiers during bitcode reading, so we can just use CT.
- if (CT && CT->getContext().isODRUniquingDebugTypes() &&
- CT->getIdentifier() != "")
- return const_cast<DICompositeType *>(CT);
- return MDNode::replaceWithDistinct(N.clone());
-}
-
-MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
- assert(N.isDistinct() && "Expected a distinct node");
- assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
- DistinctWorklist.push_back(
- cast<MDNode>((M.Flags & RF_MoveDistinctMDs)
- ? M.mapToSelf(&N)
- : M.mapToMetadata(&N, cloneOrBuildODR(N))));
- return DistinctWorklist.back();
-}
-
-static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD,
- Value *MappedV) {
- if (CMD.getValue() == MappedV)
- return const_cast<ConstantAsMetadata *>(&CMD);
- return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr;
-}
-
-Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const {
- if (!Op)
- return nullptr;
-
- if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op))
- return *MappedOp;
-
- if (isa<MDString>(Op))
- return const_cast<Metadata *>(Op);
-
- if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
- return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue()));
-
- return None;
-}
-
-Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) {
- auto Where = Info.find(&Op);
- assert(Where != Info.end() && "Expected a valid reference");
-
- auto &OpD = Where->second;
- if (!OpD.HasChanged)
- return Op;
-
- // Lazily construct a temporary node.
- if (!OpD.Placeholder)
- OpD.Placeholder = Op.clone();
-
- return *OpD.Placeholder;
-}
-
-template <class OperandMapper>
-void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) {
- assert(!N.isUniqued() && "Expected distinct or temporary nodes");
- for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
- Metadata *Old = N.getOperand(I);
- Metadata *New = mapOperand(Old);
-
- if (Old != New)
- N.replaceOperandWith(I, New);
- }
-}
-
-namespace {
-
-/// An entry in the worklist for the post-order traversal.
-struct POTWorklistEntry {
- MDNode *N; ///< Current node.
- MDNode::op_iterator Op; ///< Current operand of \c N.
-
- /// Keep a flag of whether operands have changed in the worklist to avoid
- /// hitting the map in \a UniquedGraph.
- bool HasChanged = false;
-
- POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {}
-};
-
-} // end anonymous namespace
-
-bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) {
- assert(G.Info.empty() && "Expected a fresh traversal");
- assert(FirstN.isUniqued() && "Expected uniqued node in POT");
-
- // Construct a post-order traversal of the uniqued subgraph under FirstN.
- bool AnyChanges = false;
- SmallVector<POTWorklistEntry, 16> Worklist;
- Worklist.push_back(POTWorklistEntry(const_cast<MDNode &>(FirstN)));
- (void)G.Info[&FirstN];
- while (!Worklist.empty()) {
- // Start or continue the traversal through this node's operands.
- auto &WE = Worklist.back();
- if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) {
- // Push a new node to traverse first.
- Worklist.push_back(POTWorklistEntry(*N));
- continue;
- }
-
- // Push the node onto the POT.
- assert(WE.N->isUniqued() && "Expected only uniqued nodes");
- assert(WE.Op == WE.N->op_end() && "Expected to visit all operands");
- auto &D = G.Info[WE.N];
- AnyChanges |= D.HasChanged = WE.HasChanged;
- D.ID = G.POT.size();
- G.POT.push_back(WE.N);
-
- // Pop the node off the worklist.
- Worklist.pop_back();
- }
- return AnyChanges;
-}
-
-MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
- MDNode::op_iterator E, bool &HasChanged) {
- while (I != E) {
- Metadata *Op = *I++; // Increment even on early return.
- if (Optional<Metadata *> MappedOp = tryToMapOperand(Op)) {
- // Check if the operand changes.
- HasChanged |= Op != *MappedOp;
- continue;
- }
-
- // A uniqued metadata node.
- MDNode &OpN = *cast<MDNode>(Op);
- assert(OpN.isUniqued() &&
- "Only uniqued operands cannot be mapped immediately");
- if (G.Info.insert(std::make_pair(&OpN, Data())).second)
- return &OpN; // This is a new one. Return it.
- }
- return nullptr;
-}
-
-void MDNodeMapper::UniquedGraph::propagateChanges() {
- bool AnyChanges;
- do {
- AnyChanges = false;
- for (MDNode *N : POT) {
- auto &D = Info[N];
- if (D.HasChanged)
- continue;
-
- if (llvm::none_of(N->operands(), [&](const Metadata *Op) {
- auto Where = Info.find(Op);
- return Where != Info.end() && Where->second.HasChanged;
- }))
- continue;
-
- AnyChanges = D.HasChanged = true;
- }
- } while (AnyChanges);
-}
-
-void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
- // Construct uniqued nodes, building forward references as necessary.
- SmallVector<MDNode *, 16> CyclicNodes;
- for (auto *N : G.POT) {
- auto &D = G.Info[N];
- if (!D.HasChanged) {
- // The node hasn't changed.
- M.mapToSelf(N);
- continue;
- }
-
- // Remember whether this node had a placeholder.
- bool HadPlaceholder(D.Placeholder);
-
- // Clone the uniqued node and remap the operands.
- TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone();
- remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
- if (Optional<Metadata *> MappedOp = getMappedOp(Old))
- return *MappedOp;
- (void)D;
- assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
- return &G.getFwdReference(*cast<MDNode>(Old));
- });
-
- auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN));
- M.mapToMetadata(N, NewN);
-
- // Nodes that were referenced out of order in the POT are involved in a
- // uniquing cycle.
- if (HadPlaceholder)
- CyclicNodes.push_back(NewN);
- }
-
- // Resolve cycles.
- for (auto *N : CyclicNodes)
- if (!N->isResolved())
- N->resolveCycles();
-}
-
-Metadata *MDNodeMapper::map(const MDNode &N) {
- assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive");
- assert(!(M.Flags & RF_NoModuleLevelChanges) &&
- "MDNodeMapper::map assumes module-level changes");
-
- // Require resolved nodes whenever metadata might be remapped.
- assert(N.isResolved() && "Unexpected unresolved node");
-
- Metadata *MappedN =
- N.isUniqued() ? mapTopLevelUniquedNode(N) : mapDistinctNode(N);
- while (!DistinctWorklist.empty())
- remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) {
- if (Optional<Metadata *> MappedOp = tryToMapOperand(Old))
- return *MappedOp;
- return mapTopLevelUniquedNode(*cast<MDNode>(Old));
- });
- return MappedN;
-}
-
-Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) {
- assert(FirstN.isUniqued() && "Expected uniqued node");
-
- // Create a post-order traversal of uniqued nodes under FirstN.
- UniquedGraph G;
- if (!createPOT(G, FirstN)) {
- // Return early if no nodes have changed.
- for (const MDNode *N : G.POT)
- M.mapToSelf(N);
- return &const_cast<MDNode &>(FirstN);
- }
-
- // Update graph with all nodes that have changed.
- G.propagateChanges();
-
- // Map all the nodes in the graph.
- mapNodesInPOT(G);
-
- // Return the original node, remapped.
- return *getMappedOp(&FirstN);
-}
-
-Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) {
- // If the value already exists in the map, use it.
- if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD))
- return *NewMD;
-
- if (isa<MDString>(MD))
- return const_cast<Metadata *>(MD);
-
- // This is a module-level metadata. If nothing at the module level is
- // changing, use an identity mapping.
- if ((Flags & RF_NoModuleLevelChanges))
- return const_cast<Metadata *>(MD);
-
- if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) {
- // Don't memoize ConstantAsMetadata. Instead of lasting until the
- // LLVMContext is destroyed, they can be deleted when the GlobalValue they
- // reference is destructed. These aren't super common, so the extra
- // indirection isn't that expensive.
- return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue()));
- }
-
- assert(isa<MDNode>(MD) && "Expected a metadata node");
-
- return None;
-}
-
-Metadata *Mapper::mapMetadata(const Metadata *MD) {
- assert(MD && "Expected valid metadata");
- assert(!isa<LocalAsMetadata>(MD) && "Unexpected local metadata");
-
- if (Optional<Metadata *> NewMD = mapSimpleMetadata(MD))
- return *NewMD;
-
- return MDNodeMapper(*this).map(*cast<MDNode>(MD));
-}
-
-void Mapper::flush() {
- // Flush out the worklist of global values.
- while (!Worklist.empty()) {
- WorklistEntry E = Worklist.pop_back_val();
- CurrentMCID = E.MCID;
- switch (E.Kind) {
- case WorklistEntry::MapGlobalInit:
- E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init));
- remapGlobalObjectMetadata(*E.Data.GVInit.GV);
- break;
- case WorklistEntry::MapAppendingVar: {
- unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers;
+//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MapValue function, which is shared by various parts of
+// the lib/Transforms/Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <limits>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+
+// Out of line method to get vtable etc for class.
+void ValueMapTypeRemapper::anchor() {}
+void ValueMaterializer::anchor() {}
+
+namespace {
+
+/// A basic block used in a BlockAddress whose function body is not yet
+/// materialized.
+struct DelayedBasicBlock {
+ BasicBlock *OldBB;
+ std::unique_ptr<BasicBlock> TempBB;
+
+ DelayedBasicBlock(const BlockAddress &Old)
+ : OldBB(Old.getBasicBlock()),
+ TempBB(BasicBlock::Create(Old.getContext())) {}
+};
+
+struct WorklistEntry {
+ enum EntryKind {
+ MapGlobalInit,
+ MapAppendingVar,
+ MapGlobalIndirectSymbol,
+ RemapFunction
+ };
+ struct GVInitTy {
+ GlobalVariable *GV;
+ Constant *Init;
+ };
+ struct AppendingGVTy {
+ GlobalVariable *GV;
+ Constant *InitPrefix;
+ };
+ struct GlobalIndirectSymbolTy {
+ GlobalIndirectSymbol *GIS;
+ Constant *Target;
+ };
+
+ unsigned Kind : 2;
+ unsigned MCID : 29;
+ unsigned AppendingGVIsOldCtorDtor : 1;
+ unsigned AppendingGVNumNewMembers;
+ union {
+ GVInitTy GVInit;
+ AppendingGVTy AppendingGV;
+ GlobalIndirectSymbolTy GlobalIndirectSymbol;
+ Function *RemapF;
+ } Data;
+};
+
+struct MappingContext {
+ ValueToValueMapTy *VM;
+ ValueMaterializer *Materializer = nullptr;
+
+ /// Construct a MappingContext with a value map and materializer.
+ explicit MappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr)
+ : VM(&VM), Materializer(Materializer) {}
+};
+
+class Mapper {
+ friend class MDNodeMapper;
+
+#ifndef NDEBUG
+ DenseSet<GlobalValue *> AlreadyScheduled;
+#endif
+
+ RemapFlags Flags;
+ ValueMapTypeRemapper *TypeMapper;
+ unsigned CurrentMCID = 0;
+ SmallVector<MappingContext, 2> MCs;
+ SmallVector<WorklistEntry, 4> Worklist;
+ SmallVector<DelayedBasicBlock, 1> DelayedBBs;
+ SmallVector<Constant *, 16> AppendingInits;
+
+public:
+ Mapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer)
+ : Flags(Flags), TypeMapper(TypeMapper),
+ MCs(1, MappingContext(VM, Materializer)) {}
+
+ /// ValueMapper should explicitly call \a flush() before destruction.
+ ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); }
+
+ bool hasWorkToDo() const { return !Worklist.empty(); }
+
+ unsigned
+ registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr) {
+ MCs.push_back(MappingContext(VM, Materializer));
+ return MCs.size() - 1;
+ }
+
+ void addFlags(RemapFlags Flags);
+
+ void remapGlobalObjectMetadata(GlobalObject &GO);
+
+ Value *mapValue(const Value *V);
+ void remapInstruction(Instruction *I);
+ void remapFunction(Function &F);
+
+ Constant *mapConstant(const Constant *C) {
+ return cast_or_null<Constant>(mapValue(C));
+ }
+
+ /// Map metadata.
+ ///
+ /// Find the mapping for MD. Guarantees that the return will be resolved
+ /// (not an MDNode, or MDNode::isResolved() returns true).
+ Metadata *mapMetadata(const Metadata *MD);
+
+ void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID);
+ void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID);
+ void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target,
+ unsigned MCID);
+ void scheduleRemapFunction(Function &F, unsigned MCID);
+
+ void flush();
+
+private:
+ void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers);
+
+ ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; }
+ ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; }
+
+ Value *mapBlockAddress(const BlockAddress &BA);
+
+ /// Map metadata that doesn't require visiting operands.
+ Optional<Metadata *> mapSimpleMetadata(const Metadata *MD);
+
+ Metadata *mapToMetadata(const Metadata *Key, Metadata *Val);
+ Metadata *mapToSelf(const Metadata *MD);
+};
+
+class MDNodeMapper {
+ Mapper &M;
+
+ /// Data about a node in \a UniquedGraph.
+ struct Data {
+ bool HasChanged = false;
+ unsigned ID = std::numeric_limits<unsigned>::max();
+ TempMDNode Placeholder;
+ };
+
+ /// A graph of uniqued nodes.
+ struct UniquedGraph {
+ SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties.
+ SmallVector<MDNode *, 16> POT; // Post-order traversal.
+
+ /// Propagate changed operands through the post-order traversal.
+ ///
+ /// Iteratively update \a Data::HasChanged for each node based on \a
+ /// Data::HasChanged of its operands, until fixed point.
+ void propagateChanges();
+
+ /// Get a forward reference to a node to use as an operand.
+ Metadata &getFwdReference(MDNode &Op);
+ };
+
+ /// Worklist of distinct nodes whose operands need to be remapped.
+ SmallVector<MDNode *, 16> DistinctWorklist;
+
+ // Storage for a UniquedGraph.
+ SmallDenseMap<const Metadata *, Data, 32> InfoStorage;
+ SmallVector<MDNode *, 16> POTStorage;
+
+public:
+ MDNodeMapper(Mapper &M) : M(M) {}
+
+ /// Map a metadata node (and its transitive operands).
+ ///
+ /// Map all the (unmapped) nodes in the subgraph under \c N. The iterative
+ /// algorithm handles distinct nodes and uniqued node subgraphs using
+ /// different strategies.
+ ///
+ /// Distinct nodes are immediately mapped and added to \a DistinctWorklist
+ /// using \a mapDistinctNode(). Their mapping can always be computed
+ /// immediately without visiting operands, even if their operands change.
+ ///
+ /// The mapping for uniqued nodes depends on whether their operands change.
+ /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of
+ /// a node to calculate uniqued node mappings in bulk. Distinct leaves are
+ /// added to \a DistinctWorklist with \a mapDistinctNode().
+ ///
+ /// After mapping \c N itself, this function remaps the operands of the
+ /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c
+ /// N has been mapped.
+ Metadata *map(const MDNode &N);
+
+private:
+ /// Map a top-level uniqued node and the uniqued subgraph underneath it.
+ ///
+ /// This builds up a post-order traversal of the (unmapped) uniqued subgraph
+ /// underneath \c FirstN and calculates the nodes' mapping. Each node uses
+ /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its
+ /// operands use the identity mapping.
+ ///
+ /// The algorithm works as follows:
+ ///
+ /// 1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and
+ /// save the post-order traversal in the given \a UniquedGraph, tracking
+ /// whether nodes' operands change.
+ ///
+ /// 2. \a UniquedGraph::propagateChanges(): propagate changed operands
+ /// through the \a UniquedGraph until fixed point, following the rule
+ /// that if a node changes, any node that references it must also change.
+ ///
+ /// 3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes
+ /// (referencing new operands) where necessary.
+ Metadata *mapTopLevelUniquedNode(const MDNode &FirstN);
+
+ /// Try to map the operand of an \a MDNode.
+ ///
+ /// If \c Op is already mapped, return the mapping. If it's not an \a
+ /// MDNode, compute and return the mapping. If it's a distinct \a MDNode,
+ /// return the result of \a mapDistinctNode().
+ ///
+ /// \return None if \c Op is an unmapped uniqued \a MDNode.
+ /// \post getMappedOp(Op) only returns None if this returns None.
+ Optional<Metadata *> tryToMapOperand(const Metadata *Op);
+
+ /// Map a distinct node.
+ ///
+ /// Return the mapping for the distinct node \c N, saving the result in \a
+ /// DistinctWorklist for later remapping.
+ ///
+ /// \pre \c N is not yet mapped.
+ /// \pre \c N.isDistinct().
+ MDNode *mapDistinctNode(const MDNode &N);
+
+ /// Get a previously mapped node.
+ Optional<Metadata *> getMappedOp(const Metadata *Op) const;
+
+ /// Create a post-order traversal of an unmapped uniqued node subgraph.
+ ///
+ /// This traverses the metadata graph deeply enough to map \c FirstN. It
+ /// uses \a tryToMapOperand() (via \a Mapper::mapSimpleMetadata()), so any
+ /// metadata that has already been mapped will not be part of the POT.
+ ///
+ /// Each node that has a changed operand from outside the graph (e.g., a
+ /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata)
+ /// is marked with \a Data::HasChanged.
+ ///
+ /// \return \c true if any nodes in \c G have \a Data::HasChanged.
+ /// \post \c G.POT is a post-order traversal ending with \c FirstN.
+ /// \post \a Data::HasChanged in \c G.Info indicates whether any node needs
+ /// to change because of operands outside the graph.
+ bool createPOT(UniquedGraph &G, const MDNode &FirstN);
+
+ /// Visit the operands of a uniqued node in the POT.
+ ///
+ /// Visit the operands in the range from \c I to \c E, returning the first
+ /// uniqued node we find that isn't yet in \c G. \c I is always advanced to
+ /// where to continue the loop through the operands.
+ ///
+ /// This sets \c HasChanged if any of the visited operands change.
+ MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged);
+
+ /// Map all the nodes in the given uniqued graph.
+ ///
+ /// This visits all the nodes in \c G in post-order, using the identity
+ /// mapping or creating a new node depending on \a Data::HasChanged.
+ ///
+ /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of
+ /// their operands outside of \c G.
+ /// \pre \a Data::HasChanged is true for a node in \c G iff any of its
+ /// operands have changed.
+ /// \post \a getMappedOp() returns the mapped node for every node in \c G.
+ void mapNodesInPOT(UniquedGraph &G);
+
+ /// Remap a node's operands using the given functor.
+ ///
+ /// Iterate through the operands of \c N and update them in place using \c
+ /// mapOperand.
+ ///
+ /// \pre N.isDistinct() or N.isTemporary().
+ template <class OperandMapper>
+ void remapOperands(MDNode &N, OperandMapper mapOperand);
+};
+
+} // end anonymous namespace
+
+Value *Mapper::mapValue(const Value *V) {
+ ValueToValueMapTy::iterator I = getVM().find(V);
+
+ // If the value already exists in the map, use it.
+ if (I != getVM().end()) {
+ assert(I->second && "Unexpected null mapping");
+ return I->second;
+ }
+
+ // If we have a materializer and it can materialize a value, use that.
+ if (auto *Materializer = getMaterializer()) {
+ if (Value *NewV = Materializer->materialize(const_cast<Value *>(V))) {
+ getVM()[V] = NewV;
+ return NewV;
+ }
+ }
+
+ // Global values do not need to be seeded into the VM if they
+ // are using the identity mapping.
+ if (isa<GlobalValue>(V)) {
+ if (Flags & RF_NullMapMissingGlobalValues)
+ return nullptr;
+ return getVM()[V] = const_cast<Value *>(V);
+ }
+
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+ // Inline asm may need *type* remapping.
+ FunctionType *NewTy = IA->getFunctionType();
+ if (TypeMapper) {
+ NewTy = cast<FunctionType>(TypeMapper->remapType(NewTy));
+
+ if (NewTy != IA->getFunctionType())
+ V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
+ IA->hasSideEffects(), IA->isAlignStack(),
+ IA->getDialect());
+ }
+
+ return getVM()[V] = const_cast<Value *>(V);
+ }
+
+ if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) {
+ const Metadata *MD = MDV->getMetadata();
+
+ if (auto *LAM = dyn_cast<LocalAsMetadata>(MD)) {
+ // Look through to grab the local value.
+ if (Value *LV = mapValue(LAM->getValue())) {
+ if (V == LAM->getValue())
+ return const_cast<Value *>(V);
+ return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV));
+ }
+
+ // FIXME: always return nullptr once Verifier::verifyDominatesUse()
+ // ensures metadata operands only reference defined SSA values.
+ return (Flags & RF_IgnoreMissingLocals)
+ ? nullptr
+ : MetadataAsValue::get(V->getContext(),
+ MDTuple::get(V->getContext(), None));
+ }
+
+ // If this is a module-level metadata and we know that nothing at the module
+ // level is changing, then use an identity mapping.
+ if (Flags & RF_NoModuleLevelChanges)
+ return getVM()[V] = const_cast<Value *>(V);
+
+ // Map the metadata and turn it into a value.
+ auto *MappedMD = mapMetadata(MD);
+ if (MD == MappedMD)
+ return getVM()[V] = const_cast<Value *>(V);
+ return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD);
+ }
+
+ // Okay, this must either be a constant (which may or may not be mappable) or
+ // something that is not in the mapping table.
+ Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
+ if (!C)
+ return nullptr;
+
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(C))
+ return mapBlockAddress(*BA);
+
+ auto mapValueOrNull = [this](Value *V) {
+ auto Mapped = mapValue(V);
+ assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) &&
+ "Unexpected null mapping for constant operand without "
+ "NullMapMissingGlobalValues flag");
+ return Mapped;
+ };
+
+ // Otherwise, we have some other constant to remap. Start by checking to see
+ // if all operands have an identity remapping.
+ unsigned OpNo = 0, NumOperands = C->getNumOperands();
+ Value *Mapped = nullptr;
+ for (; OpNo != NumOperands; ++OpNo) {
+ Value *Op = C->getOperand(OpNo);
+ Mapped = mapValueOrNull(Op);
+ if (!Mapped)
+ return nullptr;
+ if (Mapped != Op)
+ break;
+ }
+
+ // See if the type mapper wants to remap the type as well.
+ Type *NewTy = C->getType();
+ if (TypeMapper)
+ NewTy = TypeMapper->remapType(NewTy);
+
+ // If the result type and all operands match up, then just insert an identity
+ // mapping.
+ if (OpNo == NumOperands && NewTy == C->getType())
+ return getVM()[V] = C;
+
+ // Okay, we need to create a new constant. We've already processed some or
+ // all of the operands, set them all up now.
+ SmallVector<Constant*, 8> Ops;
+ Ops.reserve(NumOperands);
+ for (unsigned j = 0; j != OpNo; ++j)
+ Ops.push_back(cast<Constant>(C->getOperand(j)));
+
+ // If one of the operands mismatch, push it and the other mapped operands.
+ if (OpNo != NumOperands) {
+ Ops.push_back(cast<Constant>(Mapped));
+
+ // Map the rest of the operands that aren't processed yet.
+ for (++OpNo; OpNo != NumOperands; ++OpNo) {
+ Mapped = mapValueOrNull(C->getOperand(OpNo));
+ if (!Mapped)
+ return nullptr;
+ Ops.push_back(cast<Constant>(Mapped));
+ }
+ }
+ Type *NewSrcTy = nullptr;
+ if (TypeMapper)
+ if (auto *GEPO = dyn_cast<GEPOperator>(C))
+ NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType());
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);
+ if (isa<ConstantArray>(C))
+ return getVM()[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
+ if (isa<ConstantStruct>(C))
+ return getVM()[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
+ if (isa<ConstantVector>(C))
+ return getVM()[V] = ConstantVector::get(Ops);
+ // If this is a no-operand constant, it must be because the type was remapped.
+ if (isa<UndefValue>(C))
+ return getVM()[V] = UndefValue::get(NewTy);
+ if (isa<ConstantAggregateZero>(C))
+ return getVM()[V] = ConstantAggregateZero::get(NewTy);
+ assert(isa<ConstantPointerNull>(C));
+ return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
+}
+
+Value *Mapper::mapBlockAddress(const BlockAddress &BA) {
+ Function *F = cast<Function>(mapValue(BA.getFunction()));
+
+ // F may not have materialized its initializer. In that case, create a
+ // dummy basic block for now, and replace it once we've materialized all
+ // the initializers.
+ BasicBlock *BB;
+ if (F->empty()) {
+ DelayedBBs.push_back(DelayedBasicBlock(BA));
+ BB = DelayedBBs.back().TempBB.get();
+ } else {
+ BB = cast_or_null<BasicBlock>(mapValue(BA.getBasicBlock()));
+ }
+
+ return getVM()[&BA] = BlockAddress::get(F, BB ? BB : BA.getBasicBlock());
+}
+
+Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) {
+ getVM().MD()[Key].reset(Val);
+ return Val;
+}
+
+Metadata *Mapper::mapToSelf(const Metadata *MD) {
+ return mapToMetadata(MD, const_cast<Metadata *>(MD));
+}
+
+Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
+ if (!Op)
+ return nullptr;
+
+ if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) {
+#ifndef NDEBUG
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ assert((!*MappedOp || M.getVM().count(CMD->getValue()) ||
+ M.getVM().getMappedMD(Op)) &&
+ "Expected Value to be memoized");
+ else
+ assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) &&
+ "Expected result to be memoized");
+#endif
+ return *MappedOp;
+ }
+
+ const MDNode &N = *cast<MDNode>(Op);
+ if (N.isDistinct())
+ return mapDistinctNode(N);
+ return None;
+}
+
+static Metadata *cloneOrBuildODR(const MDNode &N) {
+ auto *CT = dyn_cast<DICompositeType>(&N);
+ // If ODR type uniquing is enabled, we would have uniqued composite types
+ // with identifiers during bitcode reading, so we can just use CT.
+ if (CT && CT->getContext().isODRUniquingDebugTypes() &&
+ CT->getIdentifier() != "")
+ return const_cast<DICompositeType *>(CT);
+ return MDNode::replaceWithDistinct(N.clone());
+}
+
+MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
+ assert(N.isDistinct() && "Expected a distinct node");
+ assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
+ DistinctWorklist.push_back(
+ cast<MDNode>((M.Flags & RF_MoveDistinctMDs)
+ ? M.mapToSelf(&N)
+ : M.mapToMetadata(&N, cloneOrBuildODR(N))));
+ return DistinctWorklist.back();
+}
+
+static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD,
+ Value *MappedV) {
+ if (CMD.getValue() == MappedV)
+ return const_cast<ConstantAsMetadata *>(&CMD);
+ return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr;
+}
+
+Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const {
+ if (!Op)
+ return nullptr;
+
+ if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op))
+ return *MappedOp;
+
+ if (isa<MDString>(Op))
+ return const_cast<Metadata *>(Op);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue()));
+
+ return None;
+}
+
+Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) {
+ auto Where = Info.find(&Op);
+ assert(Where != Info.end() && "Expected a valid reference");
+
+ auto &OpD = Where->second;
+ if (!OpD.HasChanged)
+ return Op;
+
+ // Lazily construct a temporary node.
+ if (!OpD.Placeholder)
+ OpD.Placeholder = Op.clone();
+
+ return *OpD.Placeholder;
+}
+
+template <class OperandMapper>
+void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) {
+ assert(!N.isUniqued() && "Expected distinct or temporary nodes");
+ for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
+ Metadata *Old = N.getOperand(I);
+ Metadata *New = mapOperand(Old);
+
+ if (Old != New)
+ N.replaceOperandWith(I, New);
+ }
+}
+
+namespace {
+
+/// An entry in the worklist for the post-order traversal.
+struct POTWorklistEntry {
+ MDNode *N; ///< Current node.
+ MDNode::op_iterator Op; ///< Current operand of \c N.
+
+ /// Keep a flag of whether operands have changed in the worklist to avoid
+ /// hitting the map in \a UniquedGraph.
+ bool HasChanged = false;
+
+ POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {}
+};
+
+} // end anonymous namespace
+
+bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) {
+ assert(G.Info.empty() && "Expected a fresh traversal");
+ assert(FirstN.isUniqued() && "Expected uniqued node in POT");
+
+ // Construct a post-order traversal of the uniqued subgraph under FirstN.
+ bool AnyChanges = false;
+ SmallVector<POTWorklistEntry, 16> Worklist;
+ Worklist.push_back(POTWorklistEntry(const_cast<MDNode &>(FirstN)));
+ (void)G.Info[&FirstN];
+ while (!Worklist.empty()) {
+ // Start or continue the traversal through this node's operands.
+ auto &WE = Worklist.back();
+ if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) {
+ // Push a new node to traverse first.
+ Worklist.push_back(POTWorklistEntry(*N));
+ continue;
+ }
+
+ // Push the node onto the POT.
+ assert(WE.N->isUniqued() && "Expected only uniqued nodes");
+ assert(WE.Op == WE.N->op_end() && "Expected to visit all operands");
+ auto &D = G.Info[WE.N];
+ AnyChanges |= D.HasChanged = WE.HasChanged;
+ D.ID = G.POT.size();
+ G.POT.push_back(WE.N);
+
+ // Pop the node off the worklist.
+ Worklist.pop_back();
+ }
+ return AnyChanges;
+}
+
+MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged) {
+ while (I != E) {
+ Metadata *Op = *I++; // Increment even on early return.
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Op)) {
+ // Check if the operand changes.
+ HasChanged |= Op != *MappedOp;
+ continue;
+ }
+
+ // A uniqued metadata node.
+ MDNode &OpN = *cast<MDNode>(Op);
+ assert(OpN.isUniqued() &&
+ "Only uniqued operands cannot be mapped immediately");
+ if (G.Info.insert(std::make_pair(&OpN, Data())).second)
+ return &OpN; // This is a new one. Return it.
+ }
+ return nullptr;
+}
+
+void MDNodeMapper::UniquedGraph::propagateChanges() {
+ bool AnyChanges;
+ do {
+ AnyChanges = false;
+ for (MDNode *N : POT) {
+ auto &D = Info[N];
+ if (D.HasChanged)
+ continue;
+
+ if (llvm::none_of(N->operands(), [&](const Metadata *Op) {
+ auto Where = Info.find(Op);
+ return Where != Info.end() && Where->second.HasChanged;
+ }))
+ continue;
+
+ AnyChanges = D.HasChanged = true;
+ }
+ } while (AnyChanges);
+}
+
+void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
+ // Construct uniqued nodes, building forward references as necessary.
+ SmallVector<MDNode *, 16> CyclicNodes;
+ for (auto *N : G.POT) {
+ auto &D = G.Info[N];
+ if (!D.HasChanged) {
+ // The node hasn't changed.
+ M.mapToSelf(N);
+ continue;
+ }
+
+ // Remember whether this node had a placeholder.
+ bool HadPlaceholder(D.Placeholder);
+
+ // Clone the uniqued node and remap the operands.
+ TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone();
+ remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = getMappedOp(Old))
+ return *MappedOp;
+ (void)D;
+ assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
+ return &G.getFwdReference(*cast<MDNode>(Old));
+ });
+
+ auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN));
+ M.mapToMetadata(N, NewN);
+
+ // Nodes that were referenced out of order in the POT are involved in a
+ // uniquing cycle.
+ if (HadPlaceholder)
+ CyclicNodes.push_back(NewN);
+ }
+
+ // Resolve cycles.
+ for (auto *N : CyclicNodes)
+ if (!N->isResolved())
+ N->resolveCycles();
+}
+
+Metadata *MDNodeMapper::map(const MDNode &N) {
+ assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive");
+ assert(!(M.Flags & RF_NoModuleLevelChanges) &&
+ "MDNodeMapper::map assumes module-level changes");
+
+ // Require resolved nodes whenever metadata might be remapped.
+ assert(N.isResolved() && "Unexpected unresolved node");
+
+ Metadata *MappedN =
+ N.isUniqued() ? mapTopLevelUniquedNode(N) : mapDistinctNode(N);
+ while (!DistinctWorklist.empty())
+ remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Old))
+ return *MappedOp;
+ return mapTopLevelUniquedNode(*cast<MDNode>(Old));
+ });
+ return MappedN;
+}
+
+Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) {
+ assert(FirstN.isUniqued() && "Expected uniqued node");
+
+ // Create a post-order traversal of uniqued nodes under FirstN.
+ UniquedGraph G;
+ if (!createPOT(G, FirstN)) {
+ // Return early if no nodes have changed.
+ for (const MDNode *N : G.POT)
+ M.mapToSelf(N);
+ return &const_cast<MDNode &>(FirstN);
+ }
+
+ // Update graph with all nodes that have changed.
+ G.propagateChanges();
+
+ // Map all the nodes in the graph.
+ mapNodesInPOT(G);
+
+ // Return the original node, remapped.
+ return *getMappedOp(&FirstN);
+}
+
+Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) {
+ // If the value already exists in the map, use it.
+ if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD))
+ return *NewMD;
+
+ if (isa<MDString>(MD))
+ return const_cast<Metadata *>(MD);
+
+ // This is a module-level metadata. If nothing at the module level is
+ // changing, use an identity mapping.
+ if ((Flags & RF_NoModuleLevelChanges))
+ return const_cast<Metadata *>(MD);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) {
+ // Don't memoize ConstantAsMetadata. Instead of lasting until the
+ // LLVMContext is destroyed, they can be deleted when the GlobalValue they
+ // reference is destructed. These aren't super common, so the extra
+ // indirection isn't that expensive.
+ return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue()));
+ }
+
+ assert(isa<MDNode>(MD) && "Expected a metadata node");
+
+ return None;
+}
+
+Metadata *Mapper::mapMetadata(const Metadata *MD) {
+ assert(MD && "Expected valid metadata");
+ assert(!isa<LocalAsMetadata>(MD) && "Unexpected local metadata");
+
+ if (Optional<Metadata *> NewMD = mapSimpleMetadata(MD))
+ return *NewMD;
+
+ return MDNodeMapper(*this).map(*cast<MDNode>(MD));
+}
+
+void Mapper::flush() {
+ // Flush out the worklist of global values.
+ while (!Worklist.empty()) {
+ WorklistEntry E = Worklist.pop_back_val();
+ CurrentMCID = E.MCID;
+ switch (E.Kind) {
+ case WorklistEntry::MapGlobalInit:
+ E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init));
+ remapGlobalObjectMetadata(*E.Data.GVInit.GV);
+ break;
+ case WorklistEntry::MapAppendingVar: {
+ unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers;
       // A mapAppendingVariable call can change AppendingInits if the initializer
       // for the variable depends on another appending global; because of that,
       // the inits need to be extracted and updated before the call.
SmallVector<Constant *, 8> NewInits(
drop_begin(AppendingInits, PrefixSize));
AppendingInits.resize(PrefixSize);
- mapAppendingVariable(*E.Data.AppendingGV.GV,
- E.Data.AppendingGV.InitPrefix,
+ mapAppendingVariable(*E.Data.AppendingGV.GV,
+ E.Data.AppendingGV.InitPrefix,
E.AppendingGVIsOldCtorDtor, makeArrayRef(NewInits));
- break;
- }
- case WorklistEntry::MapGlobalIndirectSymbol:
- E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol(
- mapConstant(E.Data.GlobalIndirectSymbol.Target));
- break;
- case WorklistEntry::RemapFunction:
- remapFunction(*E.Data.RemapF);
- break;
- }
- }
- CurrentMCID = 0;
-
- // Finish logic for block addresses now that all global values have been
- // handled.
- while (!DelayedBBs.empty()) {
- DelayedBasicBlock DBB = DelayedBBs.pop_back_val();
- BasicBlock *BB = cast_or_null<BasicBlock>(mapValue(DBB.OldBB));
- DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB);
- }
-}
-
-void Mapper::remapInstruction(Instruction *I) {
- // Remap operands.
- for (Use &Op : I->operands()) {
- Value *V = mapValue(Op);
- // If we aren't ignoring missing entries, assert that something happened.
- if (V)
- Op = V;
- else
- assert((Flags & RF_IgnoreMissingLocals) &&
- "Referenced value not in value map!");
- }
-
- // Remap phi nodes' incoming blocks.
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *V = mapValue(PN->getIncomingBlock(i));
- // If we aren't ignoring missing entries, assert that something happened.
- if (V)
- PN->setIncomingBlock(i, cast<BasicBlock>(V));
- else
- assert((Flags & RF_IgnoreMissingLocals) &&
- "Referenced block not in value map!");
- }
- }
-
- // Remap attached metadata.
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- I->getAllMetadata(MDs);
- for (const auto &MI : MDs) {
- MDNode *Old = MI.second;
- MDNode *New = cast_or_null<MDNode>(mapMetadata(Old));
- if (New != Old)
- I->setMetadata(MI.first, New);
- }
-
- if (!TypeMapper)
- return;
-
- // If the instruction's type is being remapped, do so now.
- if (auto *CB = dyn_cast<CallBase>(I)) {
- SmallVector<Type *, 3> Tys;
- FunctionType *FTy = CB->getFunctionType();
- Tys.reserve(FTy->getNumParams());
- for (Type *Ty : FTy->params())
- Tys.push_back(TypeMapper->remapType(Ty));
- CB->mutateFunctionType(FunctionType::get(
- TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg()));
-
- LLVMContext &C = CB->getContext();
- AttributeList Attrs = CB->getAttributes();
- for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) {
+ break;
+ }
+ case WorklistEntry::MapGlobalIndirectSymbol:
+ E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol(
+ mapConstant(E.Data.GlobalIndirectSymbol.Target));
+ break;
+ case WorklistEntry::RemapFunction:
+ remapFunction(*E.Data.RemapF);
+ break;
+ }
+ }
+ CurrentMCID = 0;
+
+ // Finish logic for block addresses now that all global values have been
+ // handled.
+ while (!DelayedBBs.empty()) {
+ DelayedBasicBlock DBB = DelayedBBs.pop_back_val();
+ BasicBlock *BB = cast_or_null<BasicBlock>(mapValue(DBB.OldBB));
+ DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB);
+ }
+}
+
+void Mapper::remapInstruction(Instruction *I) {
+ // Remap operands.
+ for (Use &Op : I->operands()) {
+ Value *V = mapValue(Op);
+ // If we aren't ignoring missing entries, assert that something happened.
+ if (V)
+ Op = V;
+ else
+ assert((Flags & RF_IgnoreMissingLocals) &&
+ "Referenced value not in value map!");
+ }
+
+ // Remap phi nodes' incoming blocks.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = mapValue(PN->getIncomingBlock(i));
+ // If we aren't ignoring missing entries, assert that something happened.
+ if (V)
+ PN->setIncomingBlock(i, cast<BasicBlock>(V));
+ else
+ assert((Flags & RF_IgnoreMissingLocals) &&
+ "Referenced block not in value map!");
+ }
+ }
+
+ // Remap attached metadata.
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ I->getAllMetadata(MDs);
+ for (const auto &MI : MDs) {
+ MDNode *Old = MI.second;
+ MDNode *New = cast_or_null<MDNode>(mapMetadata(Old));
+ if (New != Old)
+ I->setMetadata(MI.first, New);
+ }
+
+ if (!TypeMapper)
+ return;
+
+ // If the instruction's type is being remapped, do so now.
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ SmallVector<Type *, 3> Tys;
+ FunctionType *FTy = CB->getFunctionType();
+ Tys.reserve(FTy->getNumParams());
+ for (Type *Ty : FTy->params())
+ Tys.push_back(TypeMapper->remapType(Ty));
+ CB->mutateFunctionType(FunctionType::get(
+ TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg()));
+
+ LLVMContext &C = CB->getContext();
+ AttributeList Attrs = CB->getAttributes();
+ for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) {
for (Attribute::AttrKind TypedAttr :
{Attribute::ByVal, Attribute::StructRet, Attribute::ByRef}) {
if (Type *Ty = Attrs.getAttribute(i, TypedAttr).getValueAsType()) {
@@ -908,234 +908,234 @@ void Mapper::remapInstruction(Instruction *I) {
TypeMapper->remapType(Ty));
break;
}
- }
- }
- CB->setAttributes(Attrs);
- return;
- }
- if (auto *AI = dyn_cast<AllocaInst>(I))
- AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType()));
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- GEP->setSourceElementType(
- TypeMapper->remapType(GEP->getSourceElementType()));
- GEP->setResultElementType(
- TypeMapper->remapType(GEP->getResultElementType()));
- }
- I->mutateType(TypeMapper->remapType(I->getType()));
-}
-
-void Mapper::remapGlobalObjectMetadata(GlobalObject &GO) {
- SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
- GO.getAllMetadata(MDs);
- GO.clearMetadata();
- for (const auto &I : MDs)
- GO.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
-}
-
-void Mapper::remapFunction(Function &F) {
- // Remap the operands.
- for (Use &Op : F.operands())
- if (Op)
- Op = mapValue(Op);
-
- // Remap the metadata attachments.
- remapGlobalObjectMetadata(F);
-
- // Remap the argument types.
- if (TypeMapper)
- for (Argument &A : F.args())
- A.mutateType(TypeMapper->remapType(A.getType()));
-
- // Remap the instructions.
- for (BasicBlock &BB : F)
- for (Instruction &I : BB)
- remapInstruction(&I);
-}
-
-void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers) {
- SmallVector<Constant *, 16> Elements;
- if (InitPrefix) {
- unsigned NumElements =
- cast<ArrayType>(InitPrefix->getType())->getNumElements();
- for (unsigned I = 0; I != NumElements; ++I)
- Elements.push_back(InitPrefix->getAggregateElement(I));
- }
-
- PointerType *VoidPtrTy;
- Type *EltTy;
- if (IsOldCtorDtor) {
- // FIXME: This upgrade is done during linking to support the C API. See
- // also IRLinker::linkAppendingVarProto() in IRMover.cpp.
- VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo();
- auto &ST = *cast<StructType>(NewMembers.front()->getType());
- Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
- EltTy = StructType::get(GV.getContext(), Tys, false);
- }
-
- for (auto *V : NewMembers) {
- Constant *NewV;
- if (IsOldCtorDtor) {
- auto *S = cast<ConstantStruct>(V);
- auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
- auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
- Constant *Null = Constant::getNullValue(VoidPtrTy);
- NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
- } else {
- NewV = cast_or_null<Constant>(mapValue(V));
- }
- Elements.push_back(NewV);
- }
-
- GV.setInitializer(ConstantArray::get(
- cast<ArrayType>(GV.getType()->getElementType()), Elements));
-}
-
-void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
- unsigned MCID) {
- assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::MapGlobalInit;
- WE.MCID = MCID;
- WE.Data.GVInit.GV = &GV;
- WE.Data.GVInit.Init = &Init;
- Worklist.push_back(WE);
-}
-
-void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV,
- Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers,
- unsigned MCID) {
- assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::MapAppendingVar;
- WE.MCID = MCID;
- WE.Data.AppendingGV.GV = &GV;
- WE.Data.AppendingGV.InitPrefix = InitPrefix;
- WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor;
- WE.AppendingGVNumNewMembers = NewMembers.size();
- Worklist.push_back(WE);
- AppendingInits.append(NewMembers.begin(), NewMembers.end());
-}
-
-void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
- Constant &Target, unsigned MCID) {
- assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::MapGlobalIndirectSymbol;
- WE.MCID = MCID;
- WE.Data.GlobalIndirectSymbol.GIS = &GIS;
- WE.Data.GlobalIndirectSymbol.Target = &Target;
- Worklist.push_back(WE);
-}
-
-void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) {
- assert(AlreadyScheduled.insert(&F).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::RemapFunction;
- WE.MCID = MCID;
- WE.Data.RemapF = &F;
- Worklist.push_back(WE);
-}
-
-void Mapper::addFlags(RemapFlags Flags) {
- assert(!hasWorkToDo() && "Expected to have flushed the worklist");
- this->Flags = this->Flags | Flags;
-}
-
-static Mapper *getAsMapper(void *pImpl) {
- return reinterpret_cast<Mapper *>(pImpl);
-}
-
-namespace {
-
-class FlushingMapper {
- Mapper &M;
-
-public:
- explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) {
- assert(!M.hasWorkToDo() && "Expected to be flushed");
- }
-
- ~FlushingMapper() { M.flush(); }
-
- Mapper *operator->() const { return &M; }
-};
-
-} // end anonymous namespace
-
-ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer)
- : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {}
-
-ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); }
-
-unsigned
-ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM,
- ValueMaterializer *Materializer) {
- return getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer);
-}
-
-void ValueMapper::addFlags(RemapFlags Flags) {
- FlushingMapper(pImpl)->addFlags(Flags);
-}
-
-Value *ValueMapper::mapValue(const Value &V) {
- return FlushingMapper(pImpl)->mapValue(&V);
-}
-
-Constant *ValueMapper::mapConstant(const Constant &C) {
- return cast_or_null<Constant>(mapValue(C));
-}
-
-Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
- return FlushingMapper(pImpl)->mapMetadata(&MD);
-}
-
-MDNode *ValueMapper::mapMDNode(const MDNode &N) {
- return cast_or_null<MDNode>(mapMetadata(N));
-}
-
-void ValueMapper::remapInstruction(Instruction &I) {
- FlushingMapper(pImpl)->remapInstruction(&I);
-}
-
-void ValueMapper::remapFunction(Function &F) {
- FlushingMapper(pImpl)->remapFunction(F);
-}
-
-void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
- Constant &Init,
- unsigned MCID) {
- getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
-}
-
-void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
- Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers,
- unsigned MCID) {
- getAsMapper(pImpl)->scheduleMapAppendingVariable(
- GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
-}
-
-void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
- Constant &Target,
- unsigned MCID) {
- getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID);
-}
-
-void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
- getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
-}
+ }
+ }
+ CB->setAttributes(Attrs);
+ return;
+ }
+ if (auto *AI = dyn_cast<AllocaInst>(I))
+ AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType()));
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ GEP->setSourceElementType(
+ TypeMapper->remapType(GEP->getSourceElementType()));
+ GEP->setResultElementType(
+ TypeMapper->remapType(GEP->getResultElementType()));
+ }
+ I->mutateType(TypeMapper->remapType(I->getType()));
+}
+
+void Mapper::remapGlobalObjectMetadata(GlobalObject &GO) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+ GO.getAllMetadata(MDs);
+ GO.clearMetadata();
+ for (const auto &I : MDs)
+ GO.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
+}
+
+void Mapper::remapFunction(Function &F) {
+ // Remap the operands.
+ for (Use &Op : F.operands())
+ if (Op)
+ Op = mapValue(Op);
+
+ // Remap the metadata attachments.
+ remapGlobalObjectMetadata(F);
+
+ // Remap the argument types.
+ if (TypeMapper)
+ for (Argument &A : F.args())
+ A.mutateType(TypeMapper->remapType(A.getType()));
+
+ // Remap the instructions.
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ remapInstruction(&I);
+}
+
+void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers) {
+ SmallVector<Constant *, 16> Elements;
+ if (InitPrefix) {
+ unsigned NumElements =
+ cast<ArrayType>(InitPrefix->getType())->getNumElements();
+ for (unsigned I = 0; I != NumElements; ++I)
+ Elements.push_back(InitPrefix->getAggregateElement(I));
+ }
+
+ PointerType *VoidPtrTy;
+ Type *EltTy;
+ if (IsOldCtorDtor) {
+ // FIXME: This upgrade is done during linking to support the C API. See
+ // also IRLinker::linkAppendingVarProto() in IRMover.cpp.
+ VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo();
+ auto &ST = *cast<StructType>(NewMembers.front()->getType());
+ Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
+ EltTy = StructType::get(GV.getContext(), Tys, false);
+ }
+
+ for (auto *V : NewMembers) {
+ Constant *NewV;
+ if (IsOldCtorDtor) {
+ auto *S = cast<ConstantStruct>(V);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
+ } else {
+ NewV = cast_or_null<Constant>(mapValue(V));
+ }
+ Elements.push_back(NewV);
+ }
+
+ GV.setInitializer(ConstantArray::get(
+ cast<ArrayType>(GV.getType()->getElementType()), Elements));
+}
+
+void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalInit;
+ WE.MCID = MCID;
+ WE.Data.GVInit.GV = &GV;
+ WE.Data.GVInit.Init = &Init;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapAppendingVar;
+ WE.MCID = MCID;
+ WE.Data.AppendingGV.GV = &GV;
+ WE.Data.AppendingGV.InitPrefix = InitPrefix;
+ WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor;
+ WE.AppendingGVNumNewMembers = NewMembers.size();
+ Worklist.push_back(WE);
+ AppendingInits.append(NewMembers.begin(), NewMembers.end());
+}
+
+void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
+ Constant &Target, unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalIndirectSymbol;
+ WE.MCID = MCID;
+ WE.Data.GlobalIndirectSymbol.GIS = &GIS;
+ WE.Data.GlobalIndirectSymbol.Target = &Target;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ assert(AlreadyScheduled.insert(&F).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::RemapFunction;
+ WE.MCID = MCID;
+ WE.Data.RemapF = &F;
+ Worklist.push_back(WE);
+}
+
+void Mapper::addFlags(RemapFlags Flags) {
+ assert(!hasWorkToDo() && "Expected to have flushed the worklist");
+ this->Flags = this->Flags | Flags;
+}
+
+static Mapper *getAsMapper(void *pImpl) {
+ return reinterpret_cast<Mapper *>(pImpl);
+}
+
+namespace {
+
+class FlushingMapper {
+ Mapper &M;
+
+public:
+ explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) {
+ assert(!M.hasWorkToDo() && "Expected to be flushed");
+ }
+
+ ~FlushingMapper() { M.flush(); }
+
+ Mapper *operator->() const { return &M; }
+};
+
+} // end anonymous namespace
+
+ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer)
+ : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {}
+
+ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); }
+
+unsigned
+ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer) {
+ return getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer);
+}
+
+void ValueMapper::addFlags(RemapFlags Flags) {
+ FlushingMapper(pImpl)->addFlags(Flags);
+}
+
+Value *ValueMapper::mapValue(const Value &V) {
+ return FlushingMapper(pImpl)->mapValue(&V);
+}
+
+Constant *ValueMapper::mapConstant(const Constant &C) {
+ return cast_or_null<Constant>(mapValue(C));
+}
+
+Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
+ return FlushingMapper(pImpl)->mapMetadata(&MD);
+}
+
+MDNode *ValueMapper::mapMDNode(const MDNode &N) {
+ return cast_or_null<MDNode>(mapMetadata(N));
+}
+
+void ValueMapper::remapInstruction(Instruction &I) {
+ FlushingMapper(pImpl)->remapInstruction(&I);
+}
+
+void ValueMapper::remapFunction(Function &F) {
+ FlushingMapper(pImpl)->remapFunction(F);
+}
+
+void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
+ Constant &Init,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
+}
+
+void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapAppendingVariable(
+ GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
+}
+
+void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
+ Constant &Target,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID);
+}
+
+void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
+}
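
The ValueMapper entry points above share one idiom worth noting: the public class keeps only an opaque pImpl pointer, and every call that may leave scheduled work behind goes through the small FlushingMapper RAII wrapper, whose destructor flushes the Mapper's worklist before control returns to the caller. Below is a minimal standalone sketch of that flush-on-destruction idiom; the names (WorklistImpl, FlushingHandle, schedule, map) are hypothetical and are not part of the file diffed above.

// Illustrative sketch only -- hypothetical names, not code from ValueMapper.cpp.
// It shows the same idiom as FlushingMapper: a tiny RAII wrapper that forwards
// calls to an implementation object and flushes its pending work on scope exit.
#include <cassert>
#include <iostream>
#include <vector>

class WorklistImpl {
  std::vector<int> Pending;

public:
  void schedule(int Item) { Pending.push_back(Item); }
  bool hasWorkToDo() const { return !Pending.empty(); }
  void flush() {
    for (int Item : Pending)
      std::cout << "processed " << Item << "\n";
    Pending.clear();
  }
  int map(int Value) { return Value + 1; } // stand-in for mapValue()
};

class FlushingHandle {
  WorklistImpl &Impl;

public:
  explicit FlushingHandle(WorklistImpl &I) : Impl(I) {
    assert(!Impl.hasWorkToDo() && "expected to start flushed");
  }
  ~FlushingHandle() { Impl.flush(); } // drain pending work on scope exit
  WorklistImpl *operator->() const { return &Impl; }
};

int main() {
  WorklistImpl Impl;
  // Work scheduled through the temporary handle is flushed as soon as the
  // full expression ends, before the caller continues.
  FlushingHandle(Impl)->schedule(7); // prints "processed 7" when the handle dies
  int Mapped = FlushingHandle(Impl)->map(41);
  std::cout << "mapped to " << Mapped << "\n";
  return 0;
}

Because the wrapper is a temporary, its destructor runs at the end of the full expression, so any work scheduled during the forwarded call is flushed before the result is handed back; this is why the forwarding one-liners in the ValueMapper methods above are sufficient.
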
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ya.make b/contrib/libs/llvm12/lib/Transforms/Utils/ya.make
index f7869c85cd..c07d5d6db6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ya.make
@@ -1,104 +1,104 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Utils
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AMDGPUEmitPrintf.cpp
- ASanStackFrameLayout.cpp
- AddDiscriminators.cpp
- AssumeBundleBuilder.cpp
- BasicBlockUtils.cpp
- BreakCriticalEdges.cpp
- BuildLibCalls.cpp
- BypassSlowDivision.cpp
- CallGraphUpdater.cpp
- CallPromotionUtils.cpp
- CanonicalizeAliases.cpp
- CanonicalizeFreezeInLoops.cpp
- CloneFunction.cpp
- CloneModule.cpp
- CodeExtractor.cpp
- CodeMoverUtils.cpp
- CtorUtils.cpp
- Debugify.cpp
- DemoteRegToStack.cpp
- EntryExitInstrumenter.cpp
- EscapeEnumerator.cpp
- Evaluator.cpp
- FixIrreducible.cpp
- FlattenCFG.cpp
- FunctionComparator.cpp
- FunctionImportUtils.cpp
- GlobalStatus.cpp
- GuardUtils.cpp
- InjectTLIMappings.cpp
- InlineFunction.cpp
- InstructionNamer.cpp
- IntegerDivision.cpp
- LCSSA.cpp
- LibCallsShrinkWrap.cpp
- Local.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AMDGPUEmitPrintf.cpp
+ ASanStackFrameLayout.cpp
+ AddDiscriminators.cpp
+ AssumeBundleBuilder.cpp
+ BasicBlockUtils.cpp
+ BreakCriticalEdges.cpp
+ BuildLibCalls.cpp
+ BypassSlowDivision.cpp
+ CallGraphUpdater.cpp
+ CallPromotionUtils.cpp
+ CanonicalizeAliases.cpp
+ CanonicalizeFreezeInLoops.cpp
+ CloneFunction.cpp
+ CloneModule.cpp
+ CodeExtractor.cpp
+ CodeMoverUtils.cpp
+ CtorUtils.cpp
+ Debugify.cpp
+ DemoteRegToStack.cpp
+ EntryExitInstrumenter.cpp
+ EscapeEnumerator.cpp
+ Evaluator.cpp
+ FixIrreducible.cpp
+ FlattenCFG.cpp
+ FunctionComparator.cpp
+ FunctionImportUtils.cpp
+ GlobalStatus.cpp
+ GuardUtils.cpp
+ InjectTLIMappings.cpp
+ InlineFunction.cpp
+ InstructionNamer.cpp
+ IntegerDivision.cpp
+ LCSSA.cpp
+ LibCallsShrinkWrap.cpp
+ Local.cpp
LoopPeel.cpp
- LoopRotationUtils.cpp
- LoopSimplify.cpp
- LoopUnroll.cpp
- LoopUnrollAndJam.cpp
- LoopUnrollRuntime.cpp
- LoopUtils.cpp
- LoopVersioning.cpp
- LowerInvoke.cpp
- LowerMemIntrinsics.cpp
- LowerSwitch.cpp
+ LoopRotationUtils.cpp
+ LoopSimplify.cpp
+ LoopUnroll.cpp
+ LoopUnrollAndJam.cpp
+ LoopUnrollRuntime.cpp
+ LoopUtils.cpp
+ LoopVersioning.cpp
+ LowerInvoke.cpp
+ LowerMemIntrinsics.cpp
+ LowerSwitch.cpp
MatrixUtils.cpp
- Mem2Reg.cpp
- MetaRenamer.cpp
- ModuleUtils.cpp
- NameAnonGlobals.cpp
- PredicateInfo.cpp
- PromoteMemoryToRegister.cpp
- SSAUpdater.cpp
- SSAUpdaterBulk.cpp
- SanitizerStats.cpp
- ScalarEvolutionExpander.cpp
- SimplifyCFG.cpp
- SimplifyIndVar.cpp
- SimplifyLibCalls.cpp
- SizeOpts.cpp
- SplitModule.cpp
- StripGCRelocates.cpp
- StripNonLineTableDebugInfo.cpp
- SymbolRewriter.cpp
- UnifyFunctionExitNodes.cpp
- UnifyLoopExits.cpp
- UniqueInternalLinkageNames.cpp
- Utils.cpp
- VNCoercion.cpp
- ValueMapper.cpp
-)
-
-END()
+ Mem2Reg.cpp
+ MetaRenamer.cpp
+ ModuleUtils.cpp
+ NameAnonGlobals.cpp
+ PredicateInfo.cpp
+ PromoteMemoryToRegister.cpp
+ SSAUpdater.cpp
+ SSAUpdaterBulk.cpp
+ SanitizerStats.cpp
+ ScalarEvolutionExpander.cpp
+ SimplifyCFG.cpp
+ SimplifyIndVar.cpp
+ SimplifyLibCalls.cpp
+ SizeOpts.cpp
+ SplitModule.cpp
+ StripGCRelocates.cpp
+ StripNonLineTableDebugInfo.cpp
+ SymbolRewriter.cpp
+ UnifyFunctionExitNodes.cpp
+ UnifyLoopExits.cpp
+ UniqueInternalLinkageNames.cpp
+ Utils.cpp
+ VNCoercion.cpp
+ ValueMapper.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7abf30b46c..6ec5590d76 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1,1315 +1,1315 @@
-//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass merges loads/stores to/from sequential memory addresses into vector
-// loads/stores. Although there's nothing GPU-specific in here, this pass is
-// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
-//
-// (For simplicity below we talk about loads only, but everything also applies
-// to stores.)
-//
-// This pass is intended to be run late in the pipeline, after other
-// vectorization opportunities have been exploited. So the assumption here is
-// that immediately following our new vector load we'll need to extract out the
-// individual elements of the load, so we can operate on them individually.
-//
-// On CPUs this transformation is usually not beneficial, because extracting the
-// elements of a vector register is expensive on most architectures. It's
-// usually better just to load each element individually into its own scalar
-// register.
-//
-// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
-// "vector load" loads directly into a series of scalar registers. In effect,
-// extracting the elements of the vector is free. It's therefore always
-// beneficial to vectorize a sequence of loads on these architectures.
-//
-// Vectorizing (perhaps a better name might be "coalescing") loads can have
-// large performance impacts on GPU kernels, and opportunities for vectorizing
-// are common in GPU code. This pass tries very hard to find such
-// opportunities; its runtime is quadratic in the number of loads in a BB.
-//
-// Some CPU architectures, such as ARM, have instructions that load into
-// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
-// could use this pass (with some modifications), but currently it implements
-// its own pass to do something similar to what we do here.
-
-#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdlib>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "load-store-vectorizer"
-
-STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
-STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
-
-// FIXME: Assuming stack alignment of 4 is always good enough
-static const unsigned StackAdjustedAlignment = 4;
-
-namespace {
-
-/// ChainID is an arbitrary token that is allowed to be different only for the
-/// accesses that are guaranteed to be considered non-consecutive by
-/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
-/// together and reducing the number of instructions the main search operates on
-/// at a time, i.e. this is to reduce compile time and nothing else as the main
-/// search has O(n^2) time complexity. The underlying type of ChainID should not
-/// be relied upon.
-using ChainID = const Value *;
-using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<ChainID, InstrList>;
-
-class Vectorizer {
- Function &F;
- AliasAnalysis &AA;
- DominatorTree &DT;
- ScalarEvolution &SE;
- TargetTransformInfo &TTI;
- const DataLayout &DL;
- IRBuilder<> Builder;
-
-public:
- Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
- ScalarEvolution &SE, TargetTransformInfo &TTI)
- : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
- DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
-
- bool run();
-
-private:
- unsigned getPointerAddressSpace(Value *I);
-
- static const unsigned MaxDepth = 3;
-
- bool isConsecutiveAccess(Value *A, Value *B);
- bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth = 0) const;
- bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth) const;
- bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
- unsigned Depth) const;
-
- /// After vectorization, reorder the instructions that I depends on
- /// (the instructions defining its operands), to ensure they dominate I.
- void reorder(Instruction *I);
-
- /// Returns the first and the last instructions in Chain.
- std::pair<BasicBlock::iterator, BasicBlock::iterator>
- getBoundaryInstrs(ArrayRef<Instruction *> Chain);
-
- /// Erases the original instructions after vectorizing.
- void eraseInstructions(ArrayRef<Instruction *> Chain);
-
- /// "Legalize" the vector type that would be produced by combining \p
- /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
- /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
- /// expected to have more than 4 elements.
- std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
- splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
-
- /// Finds the largest prefix of Chain that's vectorizable, checking for
- /// intervening instructions which may affect the memory accessed by the
- /// instructions within Chain.
- ///
- /// The elements of \p Chain must be all loads or all stores and must be in
- /// address order.
- ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
-
- /// Collects load and store instructions to vectorize.
- std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
-
- /// Processes the collected instructions, the \p Map. The values of \p Map
- /// should be all loads or all stores.
- bool vectorizeChains(InstrListMap &Map);
-
- /// Finds the load/stores to consecutive memory addresses and vectorizes them.
- bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
-
- /// Vectorizes the load instructions in Chain.
- bool
- vectorizeLoadChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Vectorizes the store instructions in Chain.
- bool
- vectorizeStoreChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Check if this load/store access is misaligned.
- bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- unsigned Alignment);
-};
-
-class LoadStoreVectorizerLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
- initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- StringRef getPassName() const override {
- return "GPU Load and Store Vectorizer";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-char LoadStoreVectorizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
- "Vectorize load and Store instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
- "Vectorize load and store instructions", false, false)
-
-Pass *llvm::createLoadStoreVectorizerPass() {
- return new LoadStoreVectorizerLegacyPass();
-}
-
-bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
- return false;
-
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- Vectorizer V(F, AA, DT, SE, TTI);
- return V.run();
-}
-
-PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat))
- return PreservedAnalyses::all();
-
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
-
- Vectorizer V(F, AA, DT, SE, TTI);
- bool Changed = V.run();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return Changed ? PA : PreservedAnalyses::all();
-}
-
-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
-// vectors of Instructions.
-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
- SmallVector<Value *, 8> VL(IL.begin(), IL.end());
- propagateMetadata(I, VL);
-}
-
-// Vectorizer Implementation
-bool Vectorizer::run() {
- bool Changed = false;
-
- // Scan the blocks in the function in post order.
- for (BasicBlock *BB : post_order(&F)) {
- InstrListMap LoadRefs, StoreRefs;
- std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
- Changed |= vectorizeChains(LoadRefs);
- Changed |= vectorizeChains(StoreRefs);
- }
-
- return Changed;
-}
-
-unsigned Vectorizer::getPointerAddressSpace(Value *I) {
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- return L->getPointerAddressSpace();
- if (StoreInst *S = dyn_cast<StoreInst>(I))
- return S->getPointerAddressSpace();
- return -1;
-}
-
-// FIXME: Merge with llvm::isConsecutiveAccess
-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
- Value *PtrA = getLoadStorePointerOperand(A);
- Value *PtrB = getLoadStorePointerOperand(B);
- unsigned ASA = getPointerAddressSpace(A);
- unsigned ASB = getPointerAddressSpace(B);
-
- // Check that the address spaces match and that the pointers are valid.
- if (!PtrA || !PtrB || (ASA != ASB))
- return false;
-
- // Make sure that A and B are different pointers of the same size type.
- Type *PtrATy = PtrA->getType()->getPointerElementType();
- Type *PtrBTy = PtrB->getType()->getPointerElementType();
- if (PtrA == PtrB ||
- PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
- DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
- DL.getTypeStoreSize(PtrATy->getScalarType()) !=
- DL.getTypeStoreSize(PtrBTy->getScalarType()))
- return false;
-
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
- APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
-
- return areConsecutivePointers(PtrA, PtrB, Size);
-}
-
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
- APInt PtrDelta, unsigned Depth) const {
- unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
- APInt OffsetA(PtrBitWidth, 0);
- APInt OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
-
- unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
-
- if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
- return false;
-
- // In case we have to shrink the pointer,
- // stripAndAccumulateInBoundsConstantOffsets should properly handle a
- // possible overflow, and the value should fit into the smallest data type
- // used in the cast/gep chain.
- assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
- OffsetB.getMinSignedBits() <= NewPtrBitWidth);
-
- OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
- OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
- PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
-
- APInt OffsetDelta = OffsetB - OffsetA;
-
- // Check if they are based on the same pointer. That makes the offsets
- // sufficient.
- if (PtrA == PtrB)
- return OffsetDelta == PtrDelta;
-
- // Compute the necessary base pointer delta to have the necessary final delta
- // equal to the pointer delta requested.
- APInt BaseDelta = PtrDelta - OffsetDelta;
-
- // Compute the distance with SCEV between the base pointers.
- const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
- const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
- const SCEV *C = SE.getConstant(BaseDelta);
- const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
- if (X == PtrSCEVB)
- return true;
-
- // The above check will not catch the cases where one of the pointers is
- // factorized but the other one is not, such as (C + (S * (A + B))) vs
- // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
- // and getting the simplified difference.
- const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
- if (C == Dist)
- return true;
-
- // Sometimes even this doesn't work, because SCEV can't always see through
- // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
- // things the hard way.
- return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
-}
-
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
- APInt PtrDelta,
- unsigned Depth) const {
- auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
- auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
- if (!GEPA || !GEPB)
- return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
-
- // Look through GEPs after checking they're the same except for the last
- // index.
- if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
- GEPA->getPointerOperand() != GEPB->getPointerOperand())
- return false;
- gep_type_iterator GTIA = gep_type_begin(GEPA);
- gep_type_iterator GTIB = gep_type_begin(GEPB);
- for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
- if (GTIA.getOperand() != GTIB.getOperand())
- return false;
- ++GTIA;
- ++GTIB;
- }
-
- Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
- Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
- if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
- OpA->getType() != OpB->getType())
- return false;
-
- if (PtrDelta.isNegative()) {
- if (PtrDelta.isMinSignedValue())
- return false;
- PtrDelta.negate();
- std::swap(OpA, OpB);
- }
- uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
- if (PtrDelta.urem(Stride) != 0)
- return false;
- unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
- APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
-
- // Only look through a ZExt/SExt.
- if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
- return false;
-
- bool Signed = isa<SExtInst>(OpA);
-
- // At this point A could be a function parameter, i.e. not an instruction
- Value *ValA = OpA->getOperand(0);
- OpB = dyn_cast<Instruction>(OpB->getOperand(0));
- if (!OpB || ValA->getType() != OpB->getType())
- return false;
-
- // Now we need to prove that adding IdxDiff to ValA won't overflow.
- bool Safe = false;
- auto CheckFlags = [](Instruction *I, bool Signed) {
- BinaryOperator *BinOpI = cast<BinaryOperator>(I);
- return (Signed && BinOpI->hasNoSignedWrap()) ||
- (!Signed && BinOpI->hasNoUnsignedWrap());
- };
-
- // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
- // ValA, we're okay.
- if (OpB->getOpcode() == Instruction::Add &&
- isa<ConstantInt>(OpB->getOperand(1)) &&
- IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
- CheckFlags(OpB, Signed))
- Safe = true;
-
- // Second attempt: If both OpA and OpB is an add with NSW/NUW and with
- // the same LHS operand, we can guarantee that the transformation is safe
- // if we can prove that OpA won't overflow when IdxDiff added to the RHS
- // of OpA.
- // For example:
- // %tmp7 = add nsw i32 %tmp2, %v0
- // %tmp8 = sext i32 %tmp7 to i64
- // ...
- // %tmp11 = add nsw i32 %v0, 1
- // %tmp12 = add nsw i32 %tmp2, %tmp11
- // %tmp13 = sext i32 %tmp12 to i64
- //
- // Both %tmp7 and %tmp2 has the nsw flag and the first operand
- // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
- // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the
- // nsw flag.
- OpA = dyn_cast<Instruction>(ValA);
- if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
- OpB->getOpcode() == Instruction::Add &&
- OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) &&
- CheckFlags(OpB, Signed)) {
- Value *RHSA = OpA->getOperand(1);
- Value *RHSB = OpB->getOperand(1);
- Instruction *OpRHSA = dyn_cast<Instruction>(RHSA);
- Instruction *OpRHSB = dyn_cast<Instruction>(RHSB);
- // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
- if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add &&
- CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) {
- int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
- if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal)
- Safe = true;
- }
- // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
- if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add &&
- CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) {
- int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
- if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal)
- Safe = true;
- }
- // Match `x +nsw/nuw (y +nsw/nuw c)` and
- // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
- if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add &&
- OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) &&
- CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) &&
- isa<ConstantInt>(OpRHSB->getOperand(1))) {
- int64_t CstValA =
- cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
- int64_t CstValB =
- cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
- if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) &&
- IdxDiff.getSExtValue() == (CstValB - CstValA))
- Safe = true;
- }
- }
-
- unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
-
- // Third attempt:
- // If all set bits of IdxDiff or any higher order bit other than the sign bit
- // are known to be zero in ValA, we can add Diff to it while guaranteeing no
- // overflow of any sort.
- if (!Safe) {
- OpA = dyn_cast<Instruction>(ValA);
- if (!OpA)
- return false;
- KnownBits Known(BitWidth);
- computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
- if (Signed)
- BitsAllowedToBeSet.clearBit(BitWidth - 1);
- if (BitsAllowedToBeSet.ult(IdxDiff))
- return false;
- }
-
- const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
- const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
- const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
- return X == OffsetSCEVB;
-}
-
-bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
- const APInt &PtrDelta,
- unsigned Depth) const {
- if (Depth++ == MaxDepth)
- return false;
-
- if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
- if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
- return SelectA->getCondition() == SelectB->getCondition() &&
- areConsecutivePointers(SelectA->getTrueValue(),
- SelectB->getTrueValue(), PtrDelta, Depth) &&
- areConsecutivePointers(SelectA->getFalseValue(),
- SelectB->getFalseValue(), PtrDelta, Depth);
- }
- }
- return false;
-}
-
-void Vectorizer::reorder(Instruction *I) {
- SmallPtrSet<Instruction *, 16> InstructionsToMove;
- SmallVector<Instruction *, 16> Worklist;
-
- Worklist.push_back(I);
- while (!Worklist.empty()) {
- Instruction *IW = Worklist.pop_back_val();
- int NumOperands = IW->getNumOperands();
- for (int i = 0; i < NumOperands; i++) {
- Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
- if (!IM || IM->getOpcode() == Instruction::PHI)
- continue;
-
- // If IM is in another BB, no need to move it, because this pass only
- // vectorizes instructions within one BB.
- if (IM->getParent() != I->getParent())
- continue;
-
- if (!IM->comesBefore(I)) {
- InstructionsToMove.insert(IM);
- Worklist.push_back(IM);
- }
- }
- }
-
- // All instructions to move should follow I. Start from I, not from begin().
- for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
- ++BBI) {
- if (!InstructionsToMove.count(&*BBI))
- continue;
- Instruction *IM = &*BBI;
- --BBI;
- IM->removeFromParent();
- IM->insertBefore(I);
- }
-}
-
-std::pair<BasicBlock::iterator, BasicBlock::iterator>
-Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
- Instruction *C0 = Chain[0];
- BasicBlock::iterator FirstInstr = C0->getIterator();
- BasicBlock::iterator LastInstr = C0->getIterator();
-
- BasicBlock *BB = C0->getParent();
- unsigned NumFound = 0;
- for (Instruction &I : *BB) {
- if (!is_contained(Chain, &I))
- continue;
-
- ++NumFound;
- if (NumFound == 1) {
- FirstInstr = I.getIterator();
- }
- if (NumFound == Chain.size()) {
- LastInstr = I.getIterator();
- break;
- }
- }
-
- // Range is [first, last).
- return std::make_pair(FirstInstr, ++LastInstr);
-}
-
-void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
- SmallVector<Instruction *, 16> Instrs;
- for (Instruction *I : Chain) {
- Value *PtrOperand = getLoadStorePointerOperand(I);
- assert(PtrOperand && "Instruction must have a pointer operand.");
- Instrs.push_back(I);
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
- Instrs.push_back(GEP);
- }
-
- // Erase instructions.
- for (Instruction *I : Instrs)
- if (I->use_empty())
- I->eraseFromParent();
-}
-
-std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
-Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
- unsigned ElementSizeBits) {
- unsigned ElementSizeBytes = ElementSizeBits / 8;
- unsigned SizeBytes = ElementSizeBytes * Chain.size();
- unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
- if (NumLeft == Chain.size()) {
- if ((NumLeft & 1) == 0)
- NumLeft /= 2; // Split even in half
- else
- --NumLeft; // Split off last element
- } else if (NumLeft == 0)
- NumLeft = 1;
- return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
-}
-
-ArrayRef<Instruction *>
-Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
- // These are in BB order, unlike Chain, which is in address order.
- SmallVector<Instruction *, 16> MemoryInstrs;
- SmallVector<Instruction *, 16> ChainInstrs;
-
- bool IsLoadChain = isa<LoadInst>(Chain[0]);
- LLVM_DEBUG({
- for (Instruction *I : Chain) {
- if (IsLoadChain)
- assert(isa<LoadInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- else
- assert(isa<StoreInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- }
- });
-
- for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
- if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
- if (!is_contained(Chain, &I))
- MemoryInstrs.push_back(&I);
- else
- ChainInstrs.push_back(&I);
- } else if (isa<IntrinsicInst>(&I) &&
- cast<IntrinsicInst>(&I)->getIntrinsicID() ==
- Intrinsic::sideeffect) {
- // Ignore llvm.sideeffect calls.
+//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass merges loads/stores to/from sequential memory addresses into vector
+// loads/stores. Although there's nothing GPU-specific in here, this pass is
+// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
+//
+// (For simplicity below we talk about loads only, but everything also applies
+// to stores.)
+//
+// This pass is intended to be run late in the pipeline, after other
+// vectorization opportunities have been exploited. So the assumption here is
+// that immediately following our new vector load we'll need to extract out the
+// individual elements of the load, so we can operate on them individually.
+//
+// On CPUs this transformation is usually not beneficial, because extracting the
+// elements of a vector register is expensive on most architectures. It's
+// usually better just to load each element individually into its own scalar
+// register.
+//
+// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
+// "vector load" loads directly into a series of scalar registers. In effect,
+// extracting the elements of the vector is free. It's therefore always
+// beneficial to vectorize a sequence of loads on these architectures.
+//
+// Vectorizing (perhaps a better name might be "coalescing") loads can have
+// large performance impacts on GPU kernels, and opportunities for vectorizing
+// are common in GPU code. This pass tries very hard to find such
+// opportunities; its runtime is quadratic in the number of loads in a BB.
+//
+// Some CPU architectures, such as ARM, have instructions that load into
+// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
+// could use this pass (with some modifications), but currently it implements
+// its own pass to do something similar to what we do here.
+
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-store-vectorizer"
+
+STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
+STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
+
+// FIXME: Assuming stack alignment of 4 is always good enough
+static const unsigned StackAdjustedAlignment = 4;
+
+namespace {
+
+/// ChainID is an arbitrary token that is allowed to be different only for the
+/// accesses that are guaranteed to be considered non-consecutive by
+/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
+/// together and reducing the number of instructions the main search operates on
+/// at a time, i.e. this is to reduce compile time and nothing else as the main
+/// search has O(n^2) time complexity. The underlying type of ChainID should not
+/// be relied upon.
+using ChainID = const Value *;
+using InstrList = SmallVector<Instruction *, 8>;
+using InstrListMap = MapVector<ChainID, InstrList>;
+
+class Vectorizer {
+ Function &F;
+ AliasAnalysis &AA;
+ DominatorTree &DT;
+ ScalarEvolution &SE;
+ TargetTransformInfo &TTI;
+ const DataLayout &DL;
+ IRBuilder<> Builder;
+
+public:
+ Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
+ ScalarEvolution &SE, TargetTransformInfo &TTI)
+ : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
+ DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
+
+ bool run();
+
+private:
+ unsigned getPointerAddressSpace(Value *I);
+
+ static const unsigned MaxDepth = 3;
+
+ bool isConsecutiveAccess(Value *A, Value *B);
+ bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth = 0) const;
+ bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth) const;
+ bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth) const;
+
+ /// After vectorization, reorder the instructions that I depends on
+ /// (the instructions defining its operands), to ensure they dominate I.
+ void reorder(Instruction *I);
+
+ /// Returns the first and the last instructions in Chain.
+ std::pair<BasicBlock::iterator, BasicBlock::iterator>
+ getBoundaryInstrs(ArrayRef<Instruction *> Chain);
+
+ /// Erases the original instructions after vectorizing.
+ void eraseInstructions(ArrayRef<Instruction *> Chain);
+
+ /// "Legalize" the vector type that would be produced by combining \p
+ /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
+ /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
+ /// expected to have more than 4 elements.
+ std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+ splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
+
+ /// Finds the largest prefix of Chain that's vectorizable, checking for
+ /// intervening instructions which may affect the memory accessed by the
+ /// instructions within Chain.
+ ///
+ /// The elements of \p Chain must be all loads or all stores and must be in
+ /// address order.
+ ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
+
+ /// Collects load and store instructions to vectorize.
+ std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
+
+ /// Processes the collected instructions, the \p Map. The values of \p Map
+ /// should be all loads or all stores.
+ bool vectorizeChains(InstrListMap &Map);
+
+ /// Finds the load/stores to consecutive memory addresses and vectorizes them.
+ bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
+
+ /// Vectorizes the load instructions in Chain.
+ bool
+ vectorizeLoadChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Vectorizes the store instructions in Chain.
+ bool
+ vectorizeStoreChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Check if this load/store access is misaligned.
+ bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment);
+};
+
+class LoadStoreVectorizerLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
+ initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "GPU Load and Store Vectorizer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+char LoadStoreVectorizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
+ "Vectorize load and Store instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
+ "Vectorize load and store instructions", false, false)
+
+Pass *llvm::createLoadStoreVectorizerPass() {
+ return new LoadStoreVectorizerLegacyPass();
+}
+
+bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ return V.run();
+}
+
+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return PreservedAnalyses::all();
+
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ bool Changed = V.run();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return Changed ? PA : PreservedAnalyses::all();
+}
+
+// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
+// vectors of Instructions.
+static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
+ SmallVector<Value *, 8> VL(IL.begin(), IL.end());
+ propagateMetadata(I, VL);
+}
+
+// Vectorizer Implementation
+bool Vectorizer::run() {
+ bool Changed = false;
+
+ // Scan the blocks in the function in post order.
+ for (BasicBlock *BB : post_order(&F)) {
+ InstrListMap LoadRefs, StoreRefs;
+ std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
+ Changed |= vectorizeChains(LoadRefs);
+ Changed |= vectorizeChains(StoreRefs);
+ }
+
+ return Changed;
+}
+
+unsigned Vectorizer::getPointerAddressSpace(Value *I) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ return L->getPointerAddressSpace();
+ if (StoreInst *S = dyn_cast<StoreInst>(I))
+ return S->getPointerAddressSpace();
+ return -1;
+}
+
+// FIXME: Merge with llvm::isConsecutiveAccess
+bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
+ Value *PtrA = getLoadStorePointerOperand(A);
+ Value *PtrB = getLoadStorePointerOperand(B);
+ unsigned ASA = getPointerAddressSpace(A);
+ unsigned ASB = getPointerAddressSpace(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ // Make sure that A and B are different pointers of the same size type.
+ Type *PtrATy = PtrA->getType()->getPointerElementType();
+ Type *PtrBTy = PtrB->getType()->getPointerElementType();
+ if (PtrA == PtrB ||
+ PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
+ DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
+ DL.getTypeStoreSize(PtrATy->getScalarType()) !=
+ DL.getTypeStoreSize(PtrBTy->getScalarType()))
+ return false;
+
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
+
+ return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
+ APInt PtrDelta, unsigned Depth) const {
+ unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(PtrBitWidth, 0);
+ APInt OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+ unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+
+ if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+ return false;
+
+  // If the pointer had to be shrunk, stripAndAccumulateInBoundsConstantOffsets
+  // should have handled any possible overflow, and the accumulated offsets
+  // should fit into the smallest data type used in the cast/GEP chain.
+ assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
+ OffsetB.getMinSignedBits() <= NewPtrBitWidth);
+
+ OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
+ OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
+ PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
+
+ APInt OffsetDelta = OffsetB - OffsetA;
+
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == PtrDelta;
+
+  // Compute the base pointer delta needed for the final delta to equal the
+  // requested pointer delta.
+ APInt BaseDelta = PtrDelta - OffsetDelta;
+
+ // Compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+ const SCEV *C = SE.getConstant(BaseDelta);
+ const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
+ if (X == PtrSCEVB)
+ return true;
+
+  // The above check will not catch the cases where one of the pointers is
+  // factorized but the other one is not, such as (C + (S * (A + B))) vs
+  // (AS + BS). Take the minus SCEV instead; that allows the expressions to be
+  // re-combined and the difference to be simplified.
+ const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+ if (C == Dist)
+ return true;
+
+ // Sometimes even this doesn't work, because SCEV can't always see through
+ // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
+ // things the hard way.
+ return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+}
+
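+// Given two GEPs that share a base and all indices except the last one, try to
+// prove that their trailing indices differ by exactly PtrDelta / Stride
+// without overflow, using wrap flags, known bits, and SCEV.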
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+ APInt PtrDelta,
+ unsigned Depth) const {
+ auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+ auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+ if (!GEPA || !GEPB)
+ return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
+
+ // Look through GEPs after checking they're the same except for the last
+ // index.
+ if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+ GEPA->getPointerOperand() != GEPB->getPointerOperand())
+ return false;
+ gep_type_iterator GTIA = gep_type_begin(GEPA);
+ gep_type_iterator GTIB = gep_type_begin(GEPB);
+ for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+ if (GTIA.getOperand() != GTIB.getOperand())
+ return false;
+ ++GTIA;
+ ++GTIB;
+ }
+
+ Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+ Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
+ if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
+ OpA->getType() != OpB->getType())
+ return false;
+
+ if (PtrDelta.isNegative()) {
+ if (PtrDelta.isMinSignedValue())
+ return false;
+ PtrDelta.negate();
+ std::swap(OpA, OpB);
+ }
+ uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ if (PtrDelta.urem(Stride) != 0)
+ return false;
+ unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+ APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
+ // Only look through a ZExt/SExt.
+ if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
+ return false;
+
+ bool Signed = isa<SExtInst>(OpA);
+
+  // At this point A could be a function parameter, i.e. not an instruction.
+ Value *ValA = OpA->getOperand(0);
+ OpB = dyn_cast<Instruction>(OpB->getOperand(0));
+ if (!OpB || ValA->getType() != OpB->getType())
+ return false;
+
+ // Now we need to prove that adding IdxDiff to ValA won't overflow.
+ bool Safe = false;
+ auto CheckFlags = [](Instruction *I, bool Signed) {
+ BinaryOperator *BinOpI = cast<BinaryOperator>(I);
+ return (Signed && BinOpI->hasNoSignedWrap()) ||
+ (!Signed && BinOpI->hasNoUnsignedWrap());
+ };
+
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+ // ValA, we're okay.
+ if (OpB->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(OpB->getOperand(1)) &&
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
+ CheckFlags(OpB, Signed))
+ Safe = true;
+
+  // Second attempt: if both OpA and OpB are adds with NSW/NUW and with the
+  // same LHS operand, we can guarantee that the transformation is safe if we
+  // can prove that OpA won't overflow when IdxDiff is added to the RHS of
+  // OpA.
+ // For example:
+ // %tmp7 = add nsw i32 %tmp2, %v0
+ // %tmp8 = sext i32 %tmp7 to i64
+ // ...
+ // %tmp11 = add nsw i32 %v0, 1
+ // %tmp12 = add nsw i32 %tmp2, %tmp11
+ // %tmp13 = sext i32 %tmp12 to i64
+ //
+  // Both %tmp7 and %tmp12 have the nsw flag, and their first operand
+  // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
+  // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
+  // nsw flag.
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
+ OpB->getOpcode() == Instruction::Add &&
+ OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) &&
+ CheckFlags(OpB, Signed)) {
+ Value *RHSA = OpA->getOperand(1);
+ Value *RHSB = OpB->getOperand(1);
+ Instruction *OpRHSA = dyn_cast<Instruction>(RHSA);
+ Instruction *OpRHSB = dyn_cast<Instruction>(RHSB);
+ // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
+ if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal)
+ Safe = true;
+ }
+    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
+ if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal)
+ Safe = true;
+ }
+ // Match `x +nsw/nuw (y +nsw/nuw c)` and
+ // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
+ if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add &&
+ OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) &&
+ isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstValA =
+ cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ int64_t CstValB =
+ cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) &&
+ IdxDiff.getSExtValue() == (CstValB - CstValA))
+ Safe = true;
+ }
+ }
+
+ unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
+
+  // Third attempt:
+  // If every bit set in IdxDiff, and every higher-order bit other than the
+  // sign bit, is known to be zero in ValA, we can add IdxDiff to ValA while
+  // guaranteeing no overflow of any sort.
+ if (!Safe) {
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!OpA)
+ return false;
+ KnownBits Known(BitWidth);
+ computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
+ APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
+ if (Signed)
+ BitsAllowedToBeSet.clearBit(BitWidth - 1);
+ if (BitsAllowedToBeSet.ult(IdxDiff))
+ return false;
+ }
+
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+ const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+ return X == OffsetSCEVB;
+}
+
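+// If both pointers are selects on the same condition, they are consecutive
+// when both their true values and their false values are consecutive with the
+// same delta. The recursion is limited to MaxDepth.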
+bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ if (Depth++ == MaxDepth)
+ return false;
+
+ if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
+ if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
+ return SelectA->getCondition() == SelectB->getCondition() &&
+ areConsecutivePointers(SelectA->getTrueValue(),
+ SelectB->getTrueValue(), PtrDelta, Depth) &&
+ areConsecutivePointers(SelectA->getFalseValue(),
+ SelectB->getFalseValue(), PtrDelta, Depth);
+ }
+ }
+ return false;
+}
+
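+// Move the in-block instructions that I transitively depends on, but that
+// currently come after I, to just before I (PHI nodes are skipped).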
+void Vectorizer::reorder(Instruction *I) {
+ SmallPtrSet<Instruction *, 16> InstructionsToMove;
+ SmallVector<Instruction *, 16> Worklist;
+
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *IW = Worklist.pop_back_val();
+ int NumOperands = IW->getNumOperands();
+ for (int i = 0; i < NumOperands; i++) {
+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+ if (!IM || IM->getOpcode() == Instruction::PHI)
+ continue;
+
+ // If IM is in another BB, no need to move it, because this pass only
+ // vectorizes instructions within one BB.
+ if (IM->getParent() != I->getParent())
+ continue;
+
+ if (!IM->comesBefore(I)) {
+ InstructionsToMove.insert(IM);
+ Worklist.push_back(IM);
+ }
+ }
+ }
+
+ // All instructions to move should follow I. Start from I, not from begin().
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
+ ++BBI) {
+ if (!InstructionsToMove.count(&*BBI))
+ continue;
+ Instruction *IM = &*BBI;
+ --BBI;
+ IM->removeFromParent();
+ IM->insertBefore(I);
+ }
+}
+
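+// Locate the first and last members of Chain in program order and return
+// iterators spanning them as a half-open range.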
+std::pair<BasicBlock::iterator, BasicBlock::iterator>
+Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
+ Instruction *C0 = Chain[0];
+ BasicBlock::iterator FirstInstr = C0->getIterator();
+ BasicBlock::iterator LastInstr = C0->getIterator();
+
+ BasicBlock *BB = C0->getParent();
+ unsigned NumFound = 0;
+ for (Instruction &I : *BB) {
+ if (!is_contained(Chain, &I))
+ continue;
+
+ ++NumFound;
+ if (NumFound == 1) {
+ FirstInstr = I.getIterator();
+ }
+ if (NumFound == Chain.size()) {
+ LastInstr = I.getIterator();
+ break;
+ }
+ }
+
+ // Range is [first, last).
+ return std::make_pair(FirstInstr, ++LastInstr);
+}
+
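+// Erase each instruction in Chain, and any GEP feeding its pointer operand,
+// once it is left without uses.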
+void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
+ SmallVector<Instruction *, 16> Instrs;
+ for (Instruction *I : Chain) {
+ Value *PtrOperand = getLoadStorePointerOperand(I);
+ assert(PtrOperand && "Instruction must have a pointer operand.");
+ Instrs.push_back(I);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
+ Instrs.push_back(GEP);
+ }
+
+ // Erase instructions.
+ for (Instruction *I : Instrs)
+ if (I->use_empty())
+ I->eraseFromParent();
+}
+
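+// Split Chain into two pieces, preferring a first piece that covers a multiple
+// of 4 bytes; if the whole chain already qualifies, split it in half (even
+// length) or peel off the last element (odd length).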
+std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
+ unsigned ElementSizeBits) {
+ unsigned ElementSizeBytes = ElementSizeBits / 8;
+ unsigned SizeBytes = ElementSizeBytes * Chain.size();
+ unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
+ if (NumLeft == Chain.size()) {
+ if ((NumLeft & 1) == 0)
+ NumLeft /= 2; // Split even in half
+ else
+ --NumLeft; // Split off last element
+ } else if (NumLeft == 0)
+ NumLeft = 1;
+ return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
+}
+
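+// Return the longest prefix of Chain (which is in address order) that can be
+// vectorized without moving any member across an aliasing memory operation or
+// a may-throw instruction in the block.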
+ArrayRef<Instruction *>
+Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
+ // These are in BB order, unlike Chain, which is in address order.
+ SmallVector<Instruction *, 16> MemoryInstrs;
+ SmallVector<Instruction *, 16> ChainInstrs;
+
+ bool IsLoadChain = isa<LoadInst>(Chain[0]);
+ LLVM_DEBUG({
+ for (Instruction *I : Chain) {
+ if (IsLoadChain)
+ assert(isa<LoadInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ else
+ assert(isa<StoreInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ }
+ });
+
+ for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ if (!is_contained(Chain, &I))
+ MemoryInstrs.push_back(&I);
+ else
+ ChainInstrs.push_back(&I);
+ } else if (isa<IntrinsicInst>(&I) &&
+ cast<IntrinsicInst>(&I)->getIntrinsicID() ==
+ Intrinsic::sideeffect) {
+ // Ignore llvm.sideeffect calls.
} else if (isa<IntrinsicInst>(&I) &&
cast<IntrinsicInst>(&I)->getIntrinsicID() ==
Intrinsic::pseudoprobe) {
// Ignore llvm.pseudoprobe calls.
- } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
- << '\n');
- break;
- } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
- << '\n');
- break;
- }
- }
-
- // Loop until we find an instruction in ChainInstrs that we can't vectorize.
- unsigned ChainInstrIdx = 0;
- Instruction *BarrierMemoryInstr = nullptr;
-
- for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
- Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
-
- // If a barrier memory instruction was found, chain instructions that follow
- // will not be added to the valid prefix.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
- break;
-
- // Check (in BB order) if any instruction prevents ChainInstr from being
- // vectorized. Find and store the first such "conflicting" instruction.
- for (Instruction *MemInstr : MemoryInstrs) {
- // If a barrier memory instruction was found, do not check past it.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
- break;
-
- auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
- auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
- if (MemLoad && ChainLoad)
- continue;
-
- // We can ignore the alias if the we have a load store pair and the load
- // is known to be invariant. The load cannot be clobbered by the store.
- auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
- return LI->hasMetadata(LLVMContext::MD_invariant_load);
- };
-
- // We can ignore the alias as long as the load comes before the store,
- // because that means we won't be moving the load past the store to
- // vectorize it (the vectorized load is inserted at the location of the
- // first load in the chain).
- if (isa<StoreInst>(MemInstr) && ChainLoad &&
- (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
- continue;
-
- // Same case, but in reverse.
- if (MemLoad && isa<StoreInst>(ChainInstr) &&
- (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
- continue;
-
- if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
- MemoryLocation::get(ChainInstr))) {
- LLVM_DEBUG({
- dbgs() << "LSV: Found alias:\n"
- " Aliasing instruction and pointer:\n"
- << " " << *MemInstr << '\n'
- << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
- << " Aliased instruction and pointer:\n"
- << " " << *ChainInstr << '\n'
- << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
- });
- // Save this aliasing memory instruction as a barrier, but allow other
- // instructions that precede the barrier to be vectorized with this one.
- BarrierMemoryInstr = MemInstr;
- break;
- }
- }
- // Continue the search only for store chains, since vectorizing stores that
- // precede an aliasing load is valid. Conversely, vectorizing loads is valid
- // up to an aliasing store, but should not pull loads from further down in
- // the basic block.
- if (IsLoadChain && BarrierMemoryInstr) {
- // The BarrierMemoryInstr is a store that precedes ChainInstr.
- assert(BarrierMemoryInstr->comesBefore(ChainInstr));
- break;
- }
- }
-
- // Find the largest prefix of Chain whose elements are all in
- // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
- // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
- // order.)
- SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
- ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
- unsigned ChainIdx = 0;
- for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
- if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
- break;
- }
- return Chain.slice(0, ChainIdx);
-}
-
+ } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
+ LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
+ << '\n');
+ break;
+ } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
+ LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
+ break;
+ }
+ }
+
+ // Loop until we find an instruction in ChainInstrs that we can't vectorize.
+ unsigned ChainInstrIdx = 0;
+ Instruction *BarrierMemoryInstr = nullptr;
+
+ for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
+ Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
+
+ // If a barrier memory instruction was found, chain instructions that follow
+ // will not be added to the valid prefix.
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
+ break;
+
+ // Check (in BB order) if any instruction prevents ChainInstr from being
+ // vectorized. Find and store the first such "conflicting" instruction.
+ for (Instruction *MemInstr : MemoryInstrs) {
+ // If a barrier memory instruction was found, do not check past it.
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
+ break;
+
+ auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
+ auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
+ if (MemLoad && ChainLoad)
+ continue;
+
+      // We can ignore the alias if we have a load/store pair and the load
+      // is known to be invariant: the load cannot be clobbered by the store.
+ auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
+ return LI->hasMetadata(LLVMContext::MD_invariant_load);
+ };
+
+ // We can ignore the alias as long as the load comes before the store,
+ // because that means we won't be moving the load past the store to
+ // vectorize it (the vectorized load is inserted at the location of the
+ // first load in the chain).
+ if (isa<StoreInst>(MemInstr) && ChainLoad &&
+ (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
+ continue;
+
+ // Same case, but in reverse.
+ if (MemLoad && isa<StoreInst>(ChainInstr) &&
+ (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
+ continue;
+
+ if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
+ MemoryLocation::get(ChainInstr))) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Found alias:\n"
+ " Aliasing instruction and pointer:\n"
+ << " " << *MemInstr << '\n'
+ << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
+ << " Aliased instruction and pointer:\n"
+ << " " << *ChainInstr << '\n'
+ << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
+ });
+ // Save this aliasing memory instruction as a barrier, but allow other
+ // instructions that precede the barrier to be vectorized with this one.
+ BarrierMemoryInstr = MemInstr;
+ break;
+ }
+ }
+ // Continue the search only for store chains, since vectorizing stores that
+ // precede an aliasing load is valid. Conversely, vectorizing loads is valid
+ // up to an aliasing store, but should not pull loads from further down in
+ // the basic block.
+ if (IsLoadChain && BarrierMemoryInstr) {
+ // The BarrierMemoryInstr is a store that precedes ChainInstr.
+ assert(BarrierMemoryInstr->comesBefore(ChainInstr));
+ break;
+ }
+ }
+
+ // Find the largest prefix of Chain whose elements are all in
+ // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
+ // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
+ // order.)
+ SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
+ ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
+ unsigned ChainIdx = 0;
+ for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
+ if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
+ break;
+ }
+ return Chain.slice(0, ChainIdx);
+}
+
static ChainID getChainID(const Value *Ptr) {
const Value *ObjPtr = getUnderlyingObject(Ptr);
- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
- // The select's themselves are distinct instructions even if they share the
- // same condition and evaluate to consecutive pointers for true and false
- // values of the condition. Therefore using the select's themselves for
- // grouping instructions would put consecutive accesses into different lists
- // and they won't be even checked for being consecutive, and won't be
- // vectorized.
- return Sel->getCondition();
- }
- return ObjPtr;
-}
-
-std::pair<InstrListMap, InstrListMap>
-Vectorizer::collectInstructions(BasicBlock *BB) {
- InstrListMap LoadRefs;
- InstrListMap StoreRefs;
-
- for (Instruction &I : *BB) {
- if (!I.mayReadOrWriteMemory())
- continue;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeLoad(LI))
- continue;
-
- Type *Ty = LI->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions are currently using an integer type for the vectorized
- // load/store, and does not support casting between the integer type and a
- // vector of pointers (e.g. i64 to <2 x i16*>)
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- Value *Ptr = LI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- // Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
- // Save the load locations.
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+    // The selects themselves are distinct instructions even if they share the
+    // same condition and evaluate to consecutive pointers for the true and
+    // false values of the condition. Therefore, using the selects themselves
+    // for grouping instructions would put consecutive accesses into different
+    // lists; they would never even be checked for being consecutive, and so
+    // would not be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+}
+
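+// Walk BB and bucket the simple, target-legal loads and stores worth
+// considering into per-chain lists, keyed by the underlying object (or by the
+// condition of a select of consecutive pointers).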
+std::pair<InstrListMap, InstrListMap>
+Vectorizer::collectInstructions(BasicBlock *BB) {
+ InstrListMap LoadRefs;
+ InstrListMap StoreRefs;
+
+ for (Instruction &I : *BB) {
+ if (!I.mayReadOrWriteMemory())
+ continue;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeLoad(LI))
+ continue;
+
+ Type *Ty = LI->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
+ continue;
+
+      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+      // functions currently use an integer type for the vectorized load/store
+      // and do not support casting between that integer type and a vector of
+      // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
+
+ Value *Ptr = LI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
+
+ // Make sure all the users of a vector are constant-index extracts.
+ if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save the load locations.
const ChainID ID = getChainID(Ptr);
- LoadRefs[ID].push_back(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeStore(SI))
- continue;
-
- Type *Ty = SI->getValueOperand()->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions are currently using an integer type for the vectorized
- // load/store, and does not support casting between the integer type and a
- // vector of pointers (e.g. i64 to <2 x i16*>)
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- Value *Ptr = SI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
- // Save store location.
+ LoadRefs[ID].push_back(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeStore(SI))
+ continue;
+
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+      // functions currently use an integer type for the vectorized load/store
+      // and do not support casting between that integer type and a vector of
+      // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
+ continue;
+
+ Value *Ptr = SI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
+
+ if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save store location.
const ChainID ID = getChainID(Ptr);
- StoreRefs[ID].push_back(SI);
- }
- }
-
- return {LoadRefs, StoreRefs};
-}
-
-bool Vectorizer::vectorizeChains(InstrListMap &Map) {
- bool Changed = false;
-
- for (const std::pair<ChainID, InstrList> &Chain : Map) {
- unsigned Size = Chain.second.size();
- if (Size < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
-
- // Process the stores in chunks of 64.
- for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
- unsigned Len = std::min<unsigned>(CE - CI, 64);
- ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
- Changed |= vectorizeInstructions(Chunk);
- }
- }
-
- return Changed;
-}
-
-bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
- LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
- << " instructions.\n");
- SmallVector<int, 16> Heads, Tails;
- int ConsecutiveChain[64];
-
- // Do a quadratic search on all of the given loads/stores and find all of the
- // pairs of loads/stores that follow each other.
- for (int i = 0, e = Instrs.size(); i < e; ++i) {
- ConsecutiveChain[i] = -1;
- for (int j = e - 1; j >= 0; --j) {
- if (i == j)
- continue;
-
- if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
- if (ConsecutiveChain[i] != -1) {
- int CurDistance = std::abs(ConsecutiveChain[i] - i);
- int NewDistance = std::abs(ConsecutiveChain[i] - j);
- if (j < i || NewDistance > CurDistance)
- continue; // Should not insert.
- }
-
- Tails.push_back(j);
- Heads.push_back(i);
- ConsecutiveChain[i] = j;
- }
- }
- }
-
- bool Changed = false;
- SmallPtrSet<Instruction *, 16> InstructionsProcessed;
-
- for (int Head : Heads) {
- if (InstructionsProcessed.count(Instrs[Head]))
- continue;
- bool LongerChainExists = false;
- for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
- if (Head == Tails[TIt] &&
- !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
- LongerChainExists = true;
- break;
- }
- if (LongerChainExists)
- continue;
-
- // We found an instr that starts a chain. Now follow the chain and try to
- // vectorize it.
- SmallVector<Instruction *, 16> Operands;
- int I = Head;
- while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
- if (InstructionsProcessed.count(Instrs[I]))
- break;
-
- Operands.push_back(Instrs[I]);
- I = ConsecutiveChain[I];
- }
-
- bool Vectorized = false;
- if (isa<LoadInst>(*Operands.begin()))
- Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
- else
- Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
-
- Changed |= Vectorized;
- }
-
- return Changed;
-}
-
-bool Vectorizer::vectorizeStoreChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- StoreInst *S0 = cast<StoreInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole store.
- Type *StoreTy = nullptr;
- for (Instruction *I : Chain) {
- StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
- if (StoreTy->isIntOrIntVectorTy())
- break;
-
- if (StoreTy->isPtrOrPtrVectorTy()) {
- StoreTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(StoreTy));
- break;
- }
- }
- assert(StoreTy && "Failed to find store type");
-
- unsigned Sz = DL.getTypeSizeInBits(StoreTy);
- unsigned AS = S0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = S0->getAlign();
-
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
-
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
-
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
-
+ StoreRefs[ID].push_back(SI);
+ }
+ }
+
+ return {LoadRefs, StoreRefs};
+}
+
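+// Try to vectorize every chain in Map, processing each chain in chunks of at
+// most 64 instructions.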
+bool Vectorizer::vectorizeChains(InstrListMap &Map) {
+ bool Changed = false;
+
+ for (const std::pair<ChainID, InstrList> &Chain : Map) {
+ unsigned Size = Chain.second.size();
+ if (Size < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+
+    // Process the instructions in chunks of 64.
+ for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
+ unsigned Len = std::min<unsigned>(CE - CI, 64);
+ ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
+ Changed |= vectorizeInstructions(Chunk);
+ }
+ }
+
+ return Changed;
+}
+
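+// Find consecutive-access chains among Instrs with a quadratic search, then
+// hand each maximal chain to vectorizeLoadChain or vectorizeStoreChain.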
+bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
+ LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
+ << " instructions.\n");
+ SmallVector<int, 16> Heads, Tails;
+ int ConsecutiveChain[64];
+
+ // Do a quadratic search on all of the given loads/stores and find all of the
+ // pairs of loads/stores that follow each other.
+ for (int i = 0, e = Instrs.size(); i < e; ++i) {
+ ConsecutiveChain[i] = -1;
+ for (int j = e - 1; j >= 0; --j) {
+ if (i == j)
+ continue;
+
+ if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
+ if (ConsecutiveChain[i] != -1) {
+ int CurDistance = std::abs(ConsecutiveChain[i] - i);
+ int NewDistance = std::abs(ConsecutiveChain[i] - j);
+ if (j < i || NewDistance > CurDistance)
+ continue; // Should not insert.
+ }
+
+ Tails.push_back(j);
+ Heads.push_back(i);
+ ConsecutiveChain[i] = j;
+ }
+ }
+ }
+
+ bool Changed = false;
+ SmallPtrSet<Instruction *, 16> InstructionsProcessed;
+
+ for (int Head : Heads) {
+ if (InstructionsProcessed.count(Instrs[Head]))
+ continue;
+ bool LongerChainExists = false;
+ for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
+ if (Head == Tails[TIt] &&
+ !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
+ LongerChainExists = true;
+ break;
+ }
+ if (LongerChainExists)
+ continue;
+
+ // We found an instr that starts a chain. Now follow the chain and try to
+ // vectorize it.
+ SmallVector<Instruction *, 16> Operands;
+ int I = Head;
+ while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
+ if (InstructionsProcessed.count(Instrs[I]))
+ break;
+
+ Operands.push_back(Instrs[I]);
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = false;
+ if (isa<LoadInst>(*Operands.begin()))
+ Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
+ else
+ Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
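+// Vectorize a chain of consecutive stores: pick a common element type, check
+// legality and alignment (splitting the chain when necessary), then build one
+// wide vector from the scalar operands and emit a single vector store.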
+bool Vectorizer::vectorizeStoreChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ StoreInst *S0 = cast<StoreInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole store.
+ Type *StoreTy = nullptr;
+ for (Instruction *I : Chain) {
+ StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
+ if (StoreTy->isIntOrIntVectorTy())
+ break;
+
+ if (StoreTy->isPtrOrPtrVectorTy()) {
+ StoreTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(StoreTy));
+ break;
+ }
+ }
+ assert(StoreTy && "Failed to find store type");
+
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned AS = S0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ Align Alignment = S0->getAlign();
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+
FixedVectorType *VecTy;
auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
- if (VecStoreTy)
- VecTy = FixedVectorType::get(StoreTy->getScalarType(),
- Chain.size() * VecStoreTy->getNumElements());
- else
- VecTy = FixedVectorType::get(StoreTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- return vectorizeStoreChain(Chain.slice(0, TargetVF),
- InstructionsProcessed) |
- vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
- }
-
- LLVM_DEBUG({
- dbgs() << "LSV: Stores to vectorize:\n";
- for (Instruction *I : Chain)
- dbgs() << " " << *I << "\n";
- });
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the store is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
- if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
- vectorizeStoreChain(Chains.second, InstructionsProcessed);
- }
-
- Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, S0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
- }
-
- if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
- vectorizeStoreChain(Chains.second, InstructionsProcessed);
- }
-
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*Last);
-
- Value *Vec = UndefValue::get(VecTy);
-
- if (VecStoreTy) {
- unsigned VecWidth = VecStoreTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
- unsigned NewIdx = J + I * VecWidth;
- Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
- Builder.getInt32(J));
- if (Extract->getType() != StoreTy->getScalarType())
- Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
- Vec = Insert;
- }
- }
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- Value *Extract = Store->getValueOperand();
- if (Extract->getType() != StoreTy->getScalarType())
- Extract =
- Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
- Vec = Insert;
- }
- }
-
- StoreInst *SI = Builder.CreateAlignedStore(
- Vec,
- Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
- Alignment);
- propagateMetadata(SI, Chain);
-
- eraseInstructions(Chain);
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::vectorizeLoadChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- LoadInst *L0 = cast<LoadInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole load.
- Type *LoadTy = nullptr;
- for (const auto &V : Chain) {
- LoadTy = cast<LoadInst>(V)->getType();
- if (LoadTy->isIntOrIntVectorTy())
- break;
-
- if (LoadTy->isPtrOrPtrVectorTy()) {
- LoadTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(LoadTy));
- break;
- }
- }
- assert(LoadTy && "Can't determine LoadInst type from chain");
-
- unsigned Sz = DL.getTypeSizeInBits(LoadTy);
- unsigned AS = L0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = L0->getAlign();
-
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
-
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
-
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
- VectorType *VecTy;
+ if (VecStoreTy)
+ VecTy = FixedVectorType::get(StoreTy->getScalarType(),
+ Chain.size() * VecStoreTy->getNumElements());
+ else
+ VecTy = FixedVectorType::get(StoreTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeStoreChain(Chain.slice(0, TargetVF),
+ InstructionsProcessed) |
+ vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: Stores to vectorize:\n";
+ for (Instruction *I : Chain)
+ dbgs() << " " << *I << "\n";
+ });
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the store is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
+ if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, S0, nullptr, &DT);
+ if (NewAlign >= Alignment)
+ Alignment = NewAlign;
+ else
+ return false;
+ }
+
+ if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*Last);
+
+ Value *Vec = UndefValue::get(VecTy);
+
+ if (VecStoreTy) {
+ unsigned VecWidth = VecStoreTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
+ unsigned NewIdx = J + I * VecWidth;
+ Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
+ Builder.getInt32(J));
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
+ Vec = Insert;
+ }
+ }
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ Value *Extract = Store->getValueOperand();
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract =
+ Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
+ Vec = Insert;
+ }
+ }
+
+ StoreInst *SI = Builder.CreateAlignedStore(
+ Vec,
+ Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
+ Alignment);
+ propagateMetadata(SI, Chain);
+
+ eraseInstructions(Chain);
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
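+// Vectorize a chain of consecutive loads: emit one wide vector load and
+// replace each scalar load with extracts from it, splitting the chain when
+// legality or alignment checks fail.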
+bool Vectorizer::vectorizeLoadChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ LoadInst *L0 = cast<LoadInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole load.
+ Type *LoadTy = nullptr;
+ for (const auto &V : Chain) {
+ LoadTy = cast<LoadInst>(V)->getType();
+ if (LoadTy->isIntOrIntVectorTy())
+ break;
+
+ if (LoadTy->isPtrOrPtrVectorTy()) {
+ LoadTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(LoadTy));
+ break;
+ }
+ }
+ assert(LoadTy && "Can't determine LoadInst type from chain");
+
+ unsigned Sz = DL.getTypeSizeInBits(LoadTy);
+ unsigned AS = L0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ Align Alignment = L0->getAlign();
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+ VectorType *VecTy;
auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
- if (VecLoadTy)
- VecTy = FixedVectorType::get(LoadTy->getScalarType(),
- Chain.size() * VecLoadTy->getNumElements());
- else
- VecTy = FixedVectorType::get(LoadTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
- vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
- }
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the load is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
- if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
- vectorizeLoadChain(Chains.second, InstructionsProcessed);
- }
-
- Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, L0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
- }
-
- if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
- vectorizeLoadChain(Chains.second, InstructionsProcessed);
- }
-
- LLVM_DEBUG({
- dbgs() << "LSV: Loads to vectorize:\n";
- for (Instruction *I : Chain)
- I->dump();
- });
-
- // getVectorizablePrefix already computed getBoundaryInstrs. The value of
- // Last may have changed since then, but the value of First won't have. If it
- // matters, we could compute getBoundaryInstrs only once and reuse it here.
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*First);
-
- Value *Bitcast =
- Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
- LoadInst *LI =
- Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
- propagateMetadata(LI, Chain);
-
- if (VecLoadTy) {
- SmallVector<Instruction *, 16> InstrsToErase;
-
- unsigned VecWidth = VecLoadTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- for (auto Use : Chain[I]->users()) {
- // All users of vector loads are ExtractElement instructions with
- // constant indices, otherwise we would have bailed before now.
- Instruction *UI = cast<Instruction>(Use);
- unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
- unsigned NewIdx = Idx + I * VecWidth;
- Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
- UI->getName());
- if (V->getType() != UI->getType())
- V = Builder.CreateBitCast(V, UI->getType());
-
- // Replace the old instruction.
- UI->replaceAllUsesWith(V);
- InstrsToErase.push_back(UI);
- }
- }
-
- // Bitcast might not be an Instruction, if the value being loaded is a
- // constant. In that case, no need to reorder anything.
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
-
- for (auto I : InstrsToErase)
- I->eraseFromParent();
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- Value *CV = Chain[I];
- Value *V =
- Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
- if (V->getType() != CV->getType()) {
- V = Builder.CreateBitOrPointerCast(V, CV->getType());
- }
-
- // Replace the old instruction.
- CV->replaceAllUsesWith(V);
- }
-
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
- }
-
- eraseInstructions(Chain);
-
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- unsigned Alignment) {
- if (Alignment % SzInBytes == 0)
- return false;
-
- bool Fast = false;
- bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
- SzInBytes * 8, AddressSpace,
- Alignment, &Fast);
- LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
- << " and fast? " << Fast << "\n";);
- return !Allows || !Fast;
-}
+ if (VecLoadTy)
+ VecTy = FixedVectorType::get(LoadTy->getScalarType(),
+ Chain.size() * VecLoadTy->getNumElements());
+ else
+ VecTy = FixedVectorType::get(LoadTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
+ vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the load is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
+ if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, L0, nullptr, &DT);
+ if (NewAlign >= Alignment)
+ Alignment = NewAlign;
+ else
+ return false;
+ }
+
+ if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: Loads to vectorize:\n";
+ for (Instruction *I : Chain)
+ I->dump();
+ });
+
+ // getVectorizablePrefix already computed getBoundaryInstrs. The value of
+ // Last may have changed since then, but the value of First won't have. If it
+ // matters, we could compute getBoundaryInstrs only once and reuse it here.
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*First);
+
+ Value *Bitcast =
+ Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
+ LoadInst *LI =
+ Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
+ propagateMetadata(LI, Chain);
+
+ if (VecLoadTy) {
+ SmallVector<Instruction *, 16> InstrsToErase;
+
+ unsigned VecWidth = VecLoadTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ for (auto Use : Chain[I]->users()) {
+ // All users of vector loads are ExtractElement instructions with
+ // constant indices, otherwise we would have bailed before now.
+ Instruction *UI = cast<Instruction>(Use);
+ unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
+ unsigned NewIdx = Idx + I * VecWidth;
+ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
+ UI->getName());
+ if (V->getType() != UI->getType())
+ V = Builder.CreateBitCast(V, UI->getType());
+
+ // Replace the old instruction.
+ UI->replaceAllUsesWith(V);
+ InstrsToErase.push_back(UI);
+ }
+ }
+
+    // Bitcast might not be an Instruction if the pointer operand is a
+    // constant. In that case, there is nothing to reorder.
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+
+ for (auto I : InstrsToErase)
+ I->eraseFromParent();
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ Value *CV = Chain[I];
+ Value *V =
+ Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
+ if (V->getType() != CV->getType()) {
+ V = Builder.CreateBitOrPointerCast(V, CV->getType());
+ }
+
+ // Replace the old instruction.
+ CV->replaceAllUsesWith(V);
+ }
+
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+ }
+
+ eraseInstructions(Chain);
+
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
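+// An access is considered misaligned unless its alignment is a multiple of its
+// size, or the target reports that a misaligned access of this size in this
+// address space is both allowed and fast.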
+bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment) {
+ if (Alignment % SzInBytes == 0)
+ return false;
+
+ bool Fast = false;
+ bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
+ SzInBytes * 8, AddressSpace,
+ Alignment, &Fast);
+ LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
+ return !Allows || !Fast;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 184340599b..b8c21a0e1c 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1,1099 +1,1099 @@
-//===- LoopVectorizationLegality.cpp --------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides loop vectorization legality analysis. Original code
-// resided in LoopVectorize.cpp for a long time.
-//
-// At this point, it is implemented as a utility class, not as an analysis
-// pass. It should be easy to create an analysis pass around it if there
-// is a need (but D45420 needs to happen first).
-//
-
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
+//===- LoopVectorizationLegality.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides loop vectorization legality analysis. Original code
+// resided in LoopVectorize.cpp for a long time.
+//
+// At this point, it is implemented as a utility class, not as an analysis
+// pass. It should be easy to create an analysis pass around it if there
+// is a need (but D45420 needs to happen first).
+//
+
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
-extern cl::opt<bool> EnableVPlanPredication;
-
-static cl::opt<bool>
- EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
- cl::desc("Enable if-conversion during vectorization."));
-
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
-
-static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
- "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed."));
-
-static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
- "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed with a "
- "vectorize(enable) pragma"));
-
-/// Maximum vectorization interleave count.
-static const unsigned MaxInterleaveFactor = 16;
-
-namespace llvm {
-
-bool LoopVectorizeHints::Hint::validate(unsigned Val) {
- switch (Kind) {
- case HK_WIDTH:
- return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
- case HK_UNROLL:
- return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
- case HK_FORCE:
- return (Val <= 1);
- case HK_ISVECTORIZED:
- case HK_PREDICATE:
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+extern cl::opt<bool> EnableVPlanPredication;
+
+static cl::opt<bool>
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+namespace llvm {
+
+bool LoopVectorizeHints::Hint::validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ case HK_ISVECTORIZED:
+ case HK_PREDICATE:
case HK_SCALABLE:
- return (Val == 0 || Val == 1);
- }
- return false;
-}
-
-LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
- bool InterleaveOnlyWhenForced,
- OptimizationRemarkEmitter &ORE)
- : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
- Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
- Force("vectorize.enable", FK_Undefined, HK_FORCE),
- IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
+ return (Val == 0 || Val == 1);
+ }
+ return false;
+}
+
+LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
+ bool InterleaveOnlyWhenForced,
+ OptimizationRemarkEmitter &ORE)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
+ Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L),
- ORE(ORE) {
- // Populate values with existing loop metadata.
- getHintsFromMetadata();
-
- // force-vector-interleave overrides DisableInterleaving.
- if (VectorizerParams::isInterleaveForced())
- Interleave.Value = VectorizerParams::VectorizationInterleave;
-
- if (IsVectorized.Value != 1)
- // If the vectorization width and interleaving count are both 1 then
- // consider the loop to have been already vectorized because there's
- // nothing more that we can do.
+ ORE(ORE) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
IsVectorized.Value =
getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1;
- LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
- << "LV: Interleaving disabled by the pass manager\n");
-}
-
-void LoopVectorizeHints::setAlreadyVectorized() {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
-
- MDNode *IsVectorizedMD = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.isvectorized"),
- ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
- MDNode *LoopID = TheLoop->getLoopID();
- MDNode *NewLoopID =
- makePostTransformationMetadata(Context, LoopID,
- {Twine(Prefix(), "vectorize.").str(),
- Twine(Prefix(), "interleave.").str()},
- {IsVectorizedMD});
- TheLoop->setLoopID(NewLoopID);
-
- // Update internal cache.
- IsVectorized.Value = 1;
-}
-
-bool LoopVectorizeHints::allowVectorization(
- Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
- if (getForce() == LoopVectorizeHints::FK_Disabled) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (getIsVectorized() == 1) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
- // FIXME: Add interleave.disable metadata. This will allow
- // vectorize.disable to be used without disabling the pass and errors
- // to differentiate between disabled vectorization and a width of 1.
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
- "AllDisabled", L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: vectorization and interleaving are "
- "explicitly disabled, or the loop has already been "
- "vectorized";
- });
- return false;
- }
-
- return true;
-}
-
-void LoopVectorizeHints::emitRemarkWithHints() const {
- using namespace ore;
-
- ORE.emit([&]() {
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "loop not vectorized: vectorization is explicitly disabled";
- else {
- OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
- TheLoop->getStartLoc(), TheLoop->getHeader());
- R << "loop not vectorized";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=" << NV("Force", true);
- if (Width.Value != 0)
+ LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+}
+
+void LoopVectorizeHints::setAlreadyVectorized() {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+
+ MDNode *IsVectorizedMD = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.isvectorized"),
+ ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
+ MDNode *LoopID = TheLoop->getLoopID();
+ MDNode *NewLoopID =
+ makePostTransformationMetadata(Context, LoopID,
+ {Twine(Prefix(), "vectorize.").str(),
+ Twine(Prefix(), "interleave.").str()},
+ {IsVectorizedMD});
+ TheLoop->setLoopID(NewLoopID);
+
+ // Update internal cache.
+ IsVectorized.Value = 1;
+}
+
+bool LoopVectorizeHints::allowVectorization(
+ Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (getIsVectorized() == 1) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass and errors
+ // to differentiate between disabled vectorization and a width of 1.
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+void LoopVectorizeHints::emitRemarkWithHints() const {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
R << ", Vector Width=" << NV("VectorWidth", getWidth());
- if (Interleave.Value != 0)
- R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
- R << ")";
- }
- return R;
- }
- });
-}
-
-const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
+ }
+ });
+}
+
+const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
if (getWidth() == ElementCount::getFixed(1))
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Disabled)
- return LV_NAME;
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
- return LV_NAME;
- return OptimizationRemarkAnalysis::AlwaysPrint;
-}
-
-void LoopVectorizeHints::getHintsFromMetadata() {
- MDNode *LoopID = TheLoop->getLoopID();
- if (!LoopID)
- return;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- const MDString *S = nullptr;
- SmallVector<Metadata *, 4> Args;
-
- // The expected hint is either a MDString or a MDNode with the first
- // operand a MDString.
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
- if (!MD || MD->getNumOperands() == 0)
- continue;
- S = dyn_cast<MDString>(MD->getOperand(0));
- for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
- Args.push_back(MD->getOperand(i));
- } else {
- S = dyn_cast<MDString>(LoopID->getOperand(i));
- assert(Args.size() == 0 && "too many arguments for MDString");
- }
-
- if (!S)
- continue;
-
- // Check if the hint starts with the loop metadata prefix.
- StringRef Name = S->getString();
- if (Args.size() == 1)
- setHint(Name, Args[0]);
- }
-}
-
-void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
- if (!Name.startswith(Prefix()))
- return;
- Name = Name.substr(Prefix().size(), StringRef::npos);
-
- const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
- if (!C)
- return;
- unsigned Val = C->getZExtValue();
-
+ return LV_NAME;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
+}
+
+void LoopVectorizeHints::getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+ // The expected hint is either a MDString or a MDNode with the first
+ // operand a MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+}
+
+void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C)
+ return;
+ unsigned Val = C->getZExtValue();
+
Hint *Hints[] = {&Width, &Interleave, &Force,
&IsVectorized, &Predicate, &Scalable};
- for (auto H : Hints) {
- if (Name == H->Name) {
- if (H->validate(Val))
- H->Value = Val;
- else
- LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
- break;
- }
- }
-}
-
-bool LoopVectorizationRequirements::doesNotMeet(
- Function *F, Loop *L, const LoopVectorizeHints &Hints) {
- const char *PassName = Hints.vectorizeAnalysisPassName();
- bool Failed = false;
- if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisFPCommute(
- PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
- UnsafeAlgebraInst->getParent())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "floating-point operations";
- });
- Failed = true;
- }
-
- // Test if runtime memcheck thresholds are exceeded.
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
- L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Failed = true;
- }
-
- return Failed;
-}
-
-// Return true if the inner loop \p Lp is uniform with regard to the outer loop
-// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
-// executing the inner loop will execute the same iterations). This check is
-// very constrained for now but it will be relaxed in the future. \p Lp is
-// considered uniform if it meets all the following conditions:
-// 1) it has a canonical IV (starting from 0 and with stride 1),
-// 2) its latch terminator is a conditional branch and,
-// 3) its latch condition is a compare instruction whose operands are the
-// canonical IV and an OuterLp invariant.
-// This check doesn't take into account the uniformity of other conditions not
-// related to the loop latch because they don't affect the loop uniformity.
-//
- // NOTE: We decided to keep all these checks and their associated documentation
-// together so that we can easily have a picture of the current supported loop
-// nests. However, some of the current checks don't depend on \p OuterLp and
-// would be redundantly executed for each \p Lp if we invoked this function for
-// different candidate outer loops. This is not the case for now because we
-// don't currently have the infrastructure to evaluate multiple candidate outer
-// loops and \p OuterLp will be a fixed parameter while we only support explicit
-// outer loop vectorization. It's also very likely that these checks go away
-// before introducing the aforementioned infrastructure. However, if this is not
-// the case, we should move the \p OuterLp independent checks to a separate
-// function that is only executed once for each \p Lp.
-static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
- assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
-
- // If Lp is the outer loop, it's uniform by definition.
- if (Lp == OuterLp)
- return true;
- assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
-
- // 1.
- PHINode *IV = Lp->getCanonicalInductionVariable();
- if (!IV) {
- LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
- return false;
- }
-
- // 2.
- BasicBlock *Latch = Lp->getLoopLatch();
- auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBr || LatchBr->isUnconditional()) {
- LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
- return false;
- }
-
- // 3.
- auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
- if (!LatchCmp) {
- LLVM_DEBUG(
- dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
- return false;
- }
-
- Value *CondOp0 = LatchCmp->getOperand(0);
- Value *CondOp1 = LatchCmp->getOperand(1);
- Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
- if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
- !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
- LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
- return false;
- }
-
- return true;
-}
-
-// Return true if \p Lp and all its nested loops are uniform with regard to \p
-// OuterLp.
-static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
- if (!isUniformLoop(Lp, OuterLp))
- return false;
-
- // Check if nested loops are uniform.
- for (Loop *SubLp : *Lp)
- if (!isUniformLoopNest(SubLp, OuterLp))
- return false;
-
- return true;
-}
-
-/// Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to if
-/// convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
- for (PHINode &Phi : BB->phis()) {
- for (Value *V : Phi.incoming_values())
- if (auto *C = dyn_cast<Constant>(V))
- if (C->canTrap())
- return false;
- }
- return true;
-}
-
-static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
- if (Ty->isPointerTy())
- return DL.getIntPtrType(Ty);
-
- // It is possible that chars or shorts overflow when we ask for the loop's
- // trip count; work around this by changing the type size.
- if (Ty->getScalarSizeInBits() < 32)
- return Type::getInt32Ty(Ty->getContext());
-
- return Ty;
-}
-
-static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
- Ty0 = convertPointerToIntegerType(DL, Ty0);
- Ty1 = convertPointerToIntegerType(DL, Ty1);
- if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
- return Ty0;
- return Ty1;
-}
-
-/// Check that the instruction has outside loop users and is not an
-/// identified reduction variable.
-static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- // Reductions, Inductions and non-header phis are allowed to have exit users. All
- // other instructions must not have external users.
- if (!AllowedExit.count(Inst))
- // Check that all of the users of the instruction are inside the loop.
- for (User *U : Inst->users()) {
- Instruction *UI = cast<Instruction>(U);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(UI)) {
- LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
- return true;
- }
- }
- return false;
-}
-
-int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
- const ValueToValueMap &Strides =
- getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
-
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+}
+
+bool LoopVectorizationRequirements::doesNotMeet(
+ Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *PassName = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+}
+
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+ // NOTE: We decided to keep all these checks and their associated documentation
+// together so that we can easily have a picture of the current supported loop
+// nests. However, some of the current checks don't depend on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+/// Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (PHINode &Phi : BB->phis()) {
+ for (Value *V : Phi.incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+ // It is possible that chars or shorts overflow when we ask for the loop's
+ // trip count; work around this by changing the type size.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ // Reductions, Inductions and non-header phis are allowed to have exit users. All
+ // other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the instruction are inside the loop.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ const ValueToValueMap &Strides =
+ getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+
Function *F = TheLoop->getHeader()->getParent();
bool OptForSize = F->hasOptSize() ||
llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
PGSOQueryType::IRPass);
bool CanAddPredicate = !OptForSize;
- int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
- if (Stride == 1 || Stride == -1)
- return Stride;
- return 0;
-}
-
-bool LoopVectorizationLegality::isUniform(Value *V) {
- return LAI->isUniform(V);
-}
-
-bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
+ return 0;
+}
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
-
- for (BasicBlock *BB : TheLoop->blocks()) {
- // Check whether the BB terminator is a BranchInst. Any other terminator is
- // not supported yet.
- auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br) {
- reportVectorizationFailure("Unsupported basic block terminator",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check whether the BranchInst is a supported one. Only unconditional
- // branches, conditional branches with an outer loop invariant condition or
- // backedges are supported.
- // FIXME: We skip these checks when VPlan predication is enabled as we
- // want to allow divergent branches. This whole check will be removed
- // once VPlan predication is on by default.
- if (!EnableVPlanPredication && Br && Br->isConditional() &&
- !TheLoop->isLoopInvariant(Br->getCondition()) &&
- !LI->isLoopHeader(Br->getSuccessor(0)) &&
- !LI->isLoopHeader(Br->getSuccessor(1))) {
- reportVectorizationFailure("Unsupported conditional branch",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
- }
-
- // Check whether inner loops are uniform. At this point, we only support
- // simple outer-loop scenarios with uniform nested loops.
- if (!isUniformLoopNest(TheLoop /*loop nest*/,
- TheLoop /*context outer loop*/)) {
- reportVectorizationFailure("Outer loop contains divergent loops",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check whether we are able to set up outer loop induction.
- if (!setupOuterLoopInductions()) {
- reportVectorizationFailure("Unsupported outer loop Phi(s)",
- "Unsupported outer loop Phi(s)",
- "UnsupportedPhi", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- return Result;
-}
-
-void LoopVectorizationLegality::addInductionPhi(
- PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- Inductions[Phi] = ID;
-
- // In case this induction also comes with casts that we know we can ignore
- // in the vectorized loop body, record them here. All casts could be recorded
- // here for ignoring, but it suffices to record only the first (as it is the
- // only one that may be used outside the cast sequence).
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (!Casts.empty())
- InductionCastsToIgnore.insert(*Casts.begin());
-
- Type *PhiTy = Phi->getType();
- const DataLayout &DL = Phi->getModule()->getDataLayout();
-
- // Get the widest type.
- if (!PhiTy->isFloatingPointTy()) {
- if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
- else
- WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
- }
-
- // Int inductions are special because we only allow one IV.
- if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
- ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
- isa<Constant>(ID.getStartValue()) &&
- cast<Constant>(ID.getStartValue())->isNullValue()) {
-
- // Use the phi node with the widest type as induction. Use the last
- // one if there are multiple (no good reason for doing this other
- // than it is expedient). We've checked that it begins at zero and
- // steps by one, so this is a canonical induction variable.
- if (!PrimaryInduction || PhiTy == WidestIndTy)
- PrimaryInduction = Phi;
- }
-
- // Both the PHI node itself, and the "post-increment" value feeding
- // back into the PHI node may have external users.
- // We can allow those uses, except if the SCEVs we have for them rely
- // on predicates that only hold within the loop, since allowing the exit
- // currently means re-using this SCEV outside the loop (see PR33706 for more
- // details).
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
- AllowedExit.insert(Phi);
- AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
- }
-
- LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
-}
-
-bool LoopVectorizationLegality::setupOuterLoopInductions() {
- BasicBlock *Header = TheLoop->getHeader();
-
- // Returns true if a given Phi is a supported induction.
- auto isSupportedPhi = [&](PHINode &Phi) -> bool {
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
- ID.getKind() == InductionDescriptor::IK_IntInduction) {
- addInductionPhi(&Phi, ID, AllowedExit);
- return true;
- } else {
- // Bail out for any Phi in the outer loop header that is not a supported
- // induction.
- LLVM_DEBUG(
- dbgs()
- << "LV: Found unsupported PHI for outer loop vectorization.\n");
- return false;
- }
- };
-
- if (llvm::all_of(Header->phis(), isSupportedPhi))
- return true;
- else
- return false;
-}
-
-/// Checks if a function is scalarizable according to the TLI, in
-/// the sense that it should be vectorized and then expanded in
- /// multiple scalar calls. This is represented in the
-/// TLI via mappings that do not specify a vector name, as in the
-/// following example:
-///
-/// const VecDesc VecIntrinsics[] = {
-/// {"llvm.phx.abs.i32", "", 4}
-/// };
-static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
- const StringRef ScalarName = CI.getCalledFunction()->getName();
- bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
- // Check that all known VFs are not associated to a vector
- // function, i.e. the vector name is empty.
- if (Scalarize)
- for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName);
- VF <= WidestVF; VF *= 2) {
- Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
- }
- return Scalarize;
-}
-
-bool LoopVectorizationLegality::canVectorizeInstrs() {
- BasicBlock *Header = TheLoop->getHeader();
-
- // Look for the attribute signaling the absence of NaNs.
- Function &F = *Header->getParent();
- HasFunNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-
- // For each block in the loop.
- for (BasicBlock *BB : TheLoop->blocks()) {
- // Scan the instructions in the block and look for hazards.
- for (Instruction &I : *BB) {
- if (auto *Phi = dyn_cast<PHINode>(&I)) {
- Type *PhiTy = Phi->getType();
- // Check that this PHI type is allowed.
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy()) {
- reportVectorizationFailure("Found a non-int non-pointer PHI",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- return false;
- }
-
- // If this PHINode is not in the header block, then we know that we
- // can convert it to select during if-conversion. No need to check if
- // the PHIs in this block are induction or reduction variables.
- if (BB != Header) {
- // Non-header phi nodes that have outside uses can be vectorized. Add
- // them to the list of allowed exits.
- // Unsafe cyclic dependencies with header phis are identified during
- // legalization for reduction, induction and first order
- // recurrences.
- AllowedExit.insert(&I);
- continue;
- }
-
- // We only allow if-converted PHIs with exactly two incoming values.
- if (Phi->getNumIncomingValues() != 2) {
- reportVectorizationFailure("Found an invalid PHI",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop, Phi);
- return false;
- }
-
- RecurrenceDescriptor RedDes;
- if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT)) {
- if (RedDes.hasUnsafeAlgebra())
- Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
- AllowedExit.insert(RedDes.getLoopExitInstr());
- Reductions[Phi] = RedDes;
- continue;
- }
-
- // TODO: Instead of recording the AllowedExit, it would be good to record the
- // complementary set: NotAllowedExit. These include (but may not be
- // limited to):
- // 1. Reduction phis as they represent the one-before-last value, which
- // is not available when vectorized
- // 2. Induction phis and increment when SCEV predicates cannot be used
- // outside the loop - see addInductionPhi
- // 3. Non-Phis with outside uses when SCEV predicates cannot be used
- // outside the loop - see call to hasOutsideLoopUser in the non-phi
- // handling below
- // 4. FirstOrderRecurrence phis that can possibly be handled by
- // extraction.
- // By recording these, we can then reason about ways to vectorize each
- // of these NotAllowedExit.
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
- addInductionPhi(Phi, ID, AllowedExit);
- if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
- Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
- continue;
- }
-
- if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
- SinkAfter, DT)) {
- AllowedExit.insert(Phi);
- FirstOrderRecurrences.insert(Phi);
- continue;
- }
-
- // As a last resort, coerce the PHI to an AddRec expression
- // and re-try classifying it as an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
- addInductionPhi(Phi, ID, AllowedExit);
- continue;
- }
-
- reportVectorizationFailure("Found an unidentified PHI",
- "value that could not be identified as "
- "reduction is used outside the loop",
- "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
- return false;
- } // end of PHI handling
-
- // We handle calls that:
- // * Are debug info intrinsics.
- // * Have a mapping to an IR intrinsic.
- // * Have a vector version available.
- auto *CI = dyn_cast<CallInst>(&I);
-
- if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
- !isa<DbgInfoIntrinsic>(CI) &&
- !(CI->getCalledFunction() && TLI &&
- (!VFDatabase::getMappings(*CI).empty() ||
- isTLIScalarize(*TLI, *CI)))) {
- // If the call is a recognized math library call, it is likely that
- // we can vectorize it given loosened floating-point constraints.
- LibFunc Func;
- bool IsMathLibCall =
- TLI && CI->getCalledFunction() &&
- CI->getType()->isFloatingPointTy() &&
- TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
- TLI->hasOptimizedCodeGen(Func);
-
- if (IsMathLibCall) {
- // TODO: Ideally, we should not use clang-specific language here,
- // but it's hard to provide meaningful yet generic advice.
- // Also, should this be guarded by allowExtraAnalysis() and/or be part
- // of the returned info from isFunctionVectorizable()?
- reportVectorizationFailure(
- "Found a non-intrinsic callsite",
- "library call cannot be vectorized. "
- "Try compiling with -fno-math-errno, -ffast-math, "
- "or similar flags",
- "CantVectorizeLibcall", ORE, TheLoop, CI);
- } else {
- reportVectorizationFailure("Found a non-intrinsic callsite",
- "call instruction cannot be vectorized",
- "CantVectorizeLibcall", ORE, TheLoop, CI);
- }
- return false;
- }
-
- // Some intrinsics have scalar arguments and should be the same in order for
- // them to be vectorized (i.e. loop invariant).
- if (CI) {
- auto *SE = PSE.getSE();
- Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
- if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
- if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
- reportVectorizationFailure("Found unvectorizable intrinsic",
- "intrinsic instruction cannot be vectorized",
- "CantVectorizeIntrinsic", ORE, TheLoop, CI);
- return false;
- }
- }
- }
-
- // Check that the instruction return type is vectorizable.
- // Also, we can't vectorize extractelement instructions.
- if ((!VectorType::isValidElementType(I.getType()) &&
- !I.getType()->isVoidTy()) ||
- isa<ExtractElementInst>(I)) {
- reportVectorizationFailure("Found unvectorizable type",
- "instruction return type cannot be vectorized",
- "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
- return false;
- }
-
- // Check that the stored type is vectorizable.
- if (auto *ST = dyn_cast<StoreInst>(&I)) {
- Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T)) {
- reportVectorizationFailure("Store instruction cannot be vectorized",
- "store instruction cannot be vectorized",
- "CantVectorizeStore", ORE, TheLoop, ST);
- return false;
- }
-
- // For nontemporal stores, check that a nontemporal vector version is
- // supported on the target.
- if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
- // Arbitrarily try a vector of 2 elements.
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ reportVectorizationFailure("Unsupported basic block terminator",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ // FIXME: We skip these checks when VPlan predication is enabled as we
+ // want to allow divergent branches. This whole check will be removed
+ // once VPlan predication is on by default.
+ if (!EnableVPlanPredication && Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ reportVectorizationFailure("Unsupported conditional branch",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+ // simple outer-loop scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ reportVectorizationFailure("Outer loop contains divergent loops",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether we are able to set up outer loop induction.
+ if (!setupOuterLoopInductions()) {
+ reportVectorizationFailure("Unsupported outer loop Phi(s)",
+ "Unsupported outer loop Phi(s)",
+ "UnsupportedPhi", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+ // here for ignoring, but it suffices to record only the first (as it is the
+ // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop (see PR33706 for more
+ // details).
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
+}
+
+bool LoopVectorizationLegality::setupOuterLoopInductions() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Returns true if a given Phi is a supported induction.
+ auto isSupportedPhi = [&](PHINode &Phi) -> bool {
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
+ ID.getKind() == InductionDescriptor::IK_IntInduction) {
+ addInductionPhi(&Phi, ID, AllowedExit);
+ return true;
+ } else {
+ // Bail out for any Phi in the outer loop header that is not a supported
+ // induction.
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Found unsupported PHI for outer loop vectorization.\n");
+ return false;
+ }
+ };
+
+ if (llvm::all_of(Header->phis(), isSupportedPhi))
+ return true;
+ else
+ return false;
+}
+
+/// Checks if a function is scalarizable according to the TLI, in
+/// the sense that it should be vectorized and then expanded in
+ /// multiple scalar calls. This is represented in the
+/// TLI via mappings that do not specify a vector name, as in the
+/// following example:
+///
+/// const VecDesc VecIntrinsics[] = {
+/// {"llvm.phx.abs.i32", "", 4}
+/// };
+static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
+ const StringRef ScalarName = CI.getCalledFunction()->getName();
+ bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
+ // Check that all known VFs are not associated to a vector
+ // function, i.e. the vector name is empty.
+ if (Scalarize)
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName);
+ VF <= WidestVF; VF *= 2) {
+ Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
+ }
+ return Scalarize;
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for hazards.
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ reportVectorizationFailure("Found a non-int non-pointer PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Non-header phi nodes that have outside uses can be vectorized. Add
+ // them to the list of allowed exits.
+ // Unsafe cyclic dependencies with header phis are identified during
+ // legalization for reduction, induction and first order
+ // recurrences.
+ AllowedExit.insert(&I);
+ continue;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ reportVectorizationFailure("Found an invalid PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop, Phi);
+ return false;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
+ DT)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+ // TODO: Instead of recording the AllowedExit, it would be good to record the
+ // complementary set: NotAllowedExit. These include (but may not be
+ // limited to):
+ // 1. Reduction phis as they represent the one-before-last value, which
+ // is not available when vectorized
+ // 2. Induction phis and increment when SCEV predicates cannot be used
+ // outside the loop - see addInductionPhi
+ // 3. Non-Phis with outside uses when SCEV predicates cannot be used
+ // outside the loop - see call to hasOutsideLoopUser in the non-phi
+ // handling below
+ // 4. FirstOrderRecurrence phis that can possibly be handled by
+ // extraction.
+ // By recording these, we can then reason about ways to vectorize each
+ // of these NotAllowedExit.
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
+ AllowedExit.insert(Phi);
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+ // As a last resort, coerce the PHI to an AddRec expression
+ // and re-try classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ reportVectorizationFailure("Found an unidentified PHI",
+ "value that could not be identified as "
+ "reduction is used outside the loop",
+ "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ (!VFDatabase::getMappings(*CI).empty() ||
+ isTLIScalarize(*TLI, *CI)))) {
+ // If the call is a recognized math library call, it is likely that
+ // we can vectorize it given loosened floating-point constraints.
+ LibFunc Func;
+ bool IsMathLibCall =
+ TLI && CI->getCalledFunction() &&
+ CI->getType()->isFloatingPointTy() &&
+ TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
+ TLI->hasOptimizedCodeGen(Func);
+
+ if (IsMathLibCall) {
+ // TODO: Ideally, we should not use clang-specific language here,
+ // but it's hard to provide meaningful yet generic advice.
+ // Also, should this be guarded by allowExtraAnalysis() and/or be part
+ // of the returned info from isFunctionVectorizable()?
+ reportVectorizationFailure(
+ "Found a non-intrinsic callsite",
+ "library call cannot be vectorized. "
+ "Try compiling with -fno-math-errno, -ffast-math, "
+ "or similar flags",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ } else {
+ reportVectorizationFailure("Found a non-intrinsic callsite",
+ "call instruction cannot be vectorized",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ }
+ return false;
+ }
+
+ // Some intrinsics have scalar arguments and should be the same in order for
+ // them to be vectorized (i.e. loop invariant).
+ if (CI) {
+ auto *SE = PSE.getSE();
+ Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
+ reportVectorizationFailure("Found unvectorizable intrinsic",
+ "intrinsic instruction cannot be vectorized",
+ "CantVectorizeIntrinsic", ORE, TheLoop, CI);
+ return false;
+ }
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ reportVectorizationFailure("Found unvectorizable type",
+ "instruction return type cannot be vectorized",
+ "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ reportVectorizationFailure("Store instruction cannot be vectorized",
+ "store instruction cannot be vectorized",
+ "CantVectorizeStore", ORE, TheLoop, ST);
+ return false;
+ }
+
+ // For nontemporal stores, check that a nontemporal vector version is
+ // supported on the target.
+ if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+ // Arbitrarily try a vector of 2 elements.
auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
- assert(VecTy && "did not find vectorized version of stored type");
- if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
- reportVectorizationFailure(
- "nontemporal store instruction cannot be vectorized",
- "nontemporal store instruction cannot be vectorized",
- "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
- return false;
- }
- }
-
- } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
- if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
- // For nontemporal loads, check that a nontemporal vector version is
- // supported on the target (arbitrarily try a vector of 2 elements).
+ assert(VecTy && "did not find vectorized version of stored type");
+ if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
+ reportVectorizationFailure(
+ "nontemporal store instruction cannot be vectorized",
+ "nontemporal store instruction cannot be vectorized",
+ "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
+ return false;
+ }
+ }
+
+ } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+ // For nontemporal loads, check that a nontemporal vector version is
+ // supported on the target (arbitrarily try a vector of 2 elements).
auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
- assert(VecTy && "did not find vectorized version of load type");
- if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
- reportVectorizationFailure(
- "nontemporal load instruction cannot be vectorized",
- "nontemporal load instruction cannot be vectorized",
- "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
- return false;
- }
- }
-
- // FP instructions can allow unsafe algebra, thus vectorizable by
- // non-IEEE-754 compliant SIMD units.
- // This applies to floating-point math operations and calls, not memory
- // operations, shuffles, or casts, as they don't change precision or
- // semantics.
- } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.isFast()) {
- LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
- Hints->setPotentiallyUnsafe();
- }
-
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
- // We can safely vectorize loops where instructions within the loop are
- // used outside the loop only if the SCEV predicates within the loop are
- // the same as outside the loop. Allowing the exit means reusing the SCEV
- // outside the loop.
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
- AllowedExit.insert(&I);
- continue;
- }
- reportVectorizationFailure("Value cannot be used outside the loop",
- "value cannot be used outside the loop",
- "ValueUsedOutsideLoop", ORE, TheLoop, &I);
- return false;
- }
- } // next instr.
- }
-
- if (!PrimaryInduction) {
- if (Inductions.empty()) {
- reportVectorizationFailure("Did not find one integer induction var",
- "loop induction variable could not be identified",
- "NoInductionVariable", ORE, TheLoop);
- return false;
- } else if (!WidestIndTy) {
- reportVectorizationFailure("Did not find one integer induction var",
- "integer loop induction variable could not be identified",
- "NoIntegerInductionVariable", ORE, TheLoop);
- return false;
- } else {
- LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- }
- }
-
- // For first order recurrences, we use the previous value (incoming value from
- // the latch) to check if it dominates all users of the recurrence. Bail out
- // if we have to sink such an instruction for another recurrence, as the
- // dominance requirement may not hold after sinking.
- BasicBlock *LoopLatch = TheLoop->getLoopLatch();
- if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
- Instruction *V =
- cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
- return SinkAfter.find(V) != SinkAfter.end();
- }))
- return false;
-
- // Now we know the widest induction type, check if our found induction
- // is the same size. If it's not, unset it here and InnerLoopVectorizer
- // will create another.
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
- PrimaryInduction = nullptr;
-
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorizeMemory() {
- LAI = &(*GetLAA)(*TheLoop);
- const OptimizationRemarkAnalysis *LAR = LAI->getReport();
- if (LAR) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
- "loop not vectorized: ", *LAR);
- });
- }
- if (!LAI->canVectorizeMemory())
- return false;
-
- if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
- reportVectorizationFailure("Stores to a uniform address",
- "write to a loop invariant address could not be vectorized",
- "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
- return false;
- }
- Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
- PSE.addPredicate(LAI->getPSE().getUnionPredicate());
-
- return true;
-}
-
-bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
- Value *In0 = const_cast<Value *>(V);
- PHINode *PN = dyn_cast_or_null<PHINode>(In0);
- if (!PN)
- return false;
-
- return Inductions.count(PN);
-}
-
-bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
- auto *Inst = dyn_cast<Instruction>(V);
- return (Inst && InductionCastsToIgnore.count(Inst));
-}
-
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
- return isInductionPhi(V) || isCastedInductionVariable(V);
-}
-
-bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
- return FirstOrderRecurrences.count(Phi);
-}
-
-bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
- return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-}
-
-bool LoopVectorizationLegality::blockCanBePredicated(
+ assert(VecTy && "did not find vectorized version of load type");
+ if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
+ reportVectorizationFailure(
+ "nontemporal load instruction cannot be vectorized",
+ "nontemporal load instruction cannot be vectorized",
+ "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
+ return false;
+ }
+ }
+
+ // FP instructions can allow unsafe algebra, so they can be vectorized by
+ // non-IEEE-754-compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.isFast()) {
+ LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ // We can safely vectorize loops where instructions within the loop are
+ // used outside the loop only if the SCEV predicates within the loop are
+ // the same as outside the loop. Allowing the exit means reusing the SCEV
+ // outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(&I);
+ continue;
+ }
+ reportVectorizationFailure("Value cannot be used outside the loop",
+ "value cannot be used outside the loop",
+ "ValueUsedOutsideLoop", ORE, TheLoop, &I);
+ return false;
+ }
+ } // next instr.
+ }
+
+ if (!PrimaryInduction) {
+ if (Inductions.empty()) {
+ reportVectorizationFailure("Did not find one integer induction var",
+ "loop induction variable could not be identified",
+ "NoInductionVariable", ORE, TheLoop);
+ return false;
+ } else if (!WidestIndTy) {
+ reportVectorizationFailure("Did not find one integer induction var",
+ "integer loop induction variable could not be identified",
+ "NoIntegerInductionVariable", ORE, TheLoop);
+ return false;
+ } else {
+ LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ }
+ }
+
+ // For first order recurrences, we use the previous value (incoming value from
+ // the latch) to check if it dominates all users of the recurrence. Bail out
+ // if we have to sink such an instruction for another recurrence, as the
+ // dominance requirement may not hold after sinking.
+ BasicBlock *LoopLatch = TheLoop->getLoopLatch();
+ if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
+ Instruction *V =
+ cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
+ return SinkAfter.find(V) != SinkAfter.end();
+ }))
+ return false;
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
+
+ return true;
+}
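
The nontemporal checks above only ask whether the target has some nontemporal vector form for the accessed type; a 2-element vector is an arbitrary representative width. A minimal standalone sketch of that probe follows, with TargetQueries as a hypothetical stand-in for the TargetTransformInfo hooks (not the LLVM API) and a placeholder alignment policy.

// Hypothetical stand-in for TTI->isLegalNTStore / TTI->isLegalNTLoad.
struct TargetQueries {
  // Placeholder policy: treat a nontemporal vector access as legal when the
  // scalar alignment covers the whole probed vector.
  bool isLegalNTStore(unsigned NumElts, unsigned ElemBytes, unsigned AlignBytes) const {
    return AlignBytes >= NumElts * ElemBytes;
  }
  bool isLegalNTLoad(unsigned NumElts, unsigned ElemBytes, unsigned AlignBytes) const {
    return AlignBytes >= NumElts * ElemBytes;
  }
};

// Mirrors the probe above: reject the loop if a nontemporal scalar access has
// no nontemporal vector counterpart; 2 elements is the arbitrary probe width.
bool nontemporalAccessVectorizable(const TargetQueries &TQ, bool IsStore,
                                   unsigned ElemBytes, unsigned AlignBytes) {
  const unsigned ProbeElts = 2;
  return IsStore ? TQ.isLegalNTStore(ProbeElts, ElemBytes, AlignBytes)
                 : TQ.isLegalNTLoad(ProbeElts, ElemBytes, AlignBytes);
}
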
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &(*GetLAA)(*TheLoop);
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
+ }
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ reportVectorizationFailure("Stores to a uniform address",
+ "write to a loop invariant address could not be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+
+ return true;
+}
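
canVectorizeMemory above mostly forwards the verdict of LoopAccessInfo: its report (if any) is re-emitted as an optimization remark, a dependence involving a loop-invariant store address is a hard reject, and the runtime pointer checks it collected are added to this pass's requirements. A schematic of that gating, with MemoryAnalysisResult as a hypothetical stand-in for LoopAccessInfo:

#include <string>
#include <vector>

// Hypothetical stand-ins for LoopAccessInfo and the pass's requirements.
struct MemoryAnalysisResult {
  bool CanVectorizeMemory = false;
  bool HasStoreToLoopInvariantAddress = false;
  unsigned NumRuntimePointerChecks = 0;
  std::string Report;   // optional analysis remark text
};

struct VectorizationRequirements {
  unsigned RuntimePointerChecks = 0;
};

// Mirrors the flow above: surface the analysis report, reject on the unsafe
// dependence, otherwise record how many runtime checks the caller must emit.
bool memoryLegalityOk(const MemoryAnalysisResult &LAI,
                      VectorizationRequirements &Req,
                      std::vector<std::string> &Remarks) {
  if (!LAI.Report.empty())
    Remarks.push_back("loop not vectorized: " + LAI.Report);
  if (!LAI.CanVectorizeMemory)
    return false;
  if (LAI.HasStoreToLoopInvariantAddress)
    return false;   // write to a loop-invariant address could not be vectorized
  Req.RuntimePointerChecks += LAI.NumRuntimePointerChecks;
  return true;
}
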
+
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+ Value *In0 = const_cast<Value *>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(
BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
SmallPtrSetImpl<const Instruction *> &MaskedOp,
SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
- for (Instruction &I : *BB) {
- // Check that we don't have a constant expression that can trap as operand.
- for (Value *Operand : I.operands()) {
- if (auto *C = dyn_cast<Constant>(Operand))
- if (C->canTrap())
- return false;
- }
-
- // We can predicate blocks with calls to assume, as long as we drop them in
- // case we flatten the CFG via predication.
- if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
- ConditionalAssumes.insert(&I);
- continue;
- }
-
+ for (Instruction &I : *BB) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
+ if (C->canTrap())
+ return false;
+ }
+
+ // We can predicate blocks with calls to assume, as long as we drop them in
+ // case we flatten the CFG via predication.
+ if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
+ ConditionalAssumes.insert(&I);
+ continue;
+ }
+
// Do not let llvm.experimental.noalias.scope.decl block the vectorization.
// TODO: there might be cases that it should block the vectorization. Let's
// ignore those for now.
if (isa<NoAliasScopeDeclInst>(&I))
continue;
- // We might be able to hoist the load.
- if (I.mayReadFromMemory()) {
- auto *LI = dyn_cast<LoadInst>(&I);
- if (!LI)
- return false;
- if (!SafePtrs.count(LI->getPointerOperand())) {
+ // We might be able to hoist the load.
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
MaskedOp.insert(LI);
- continue;
- }
- }
-
- if (I.mayWriteToMemory()) {
- auto *SI = dyn_cast<StoreInst>(&I);
- if (!SI)
- return false;
- // Predicated store requires some form of masking:
- // 1) masked store HW instruction,
- // 2) emulation via load-blend-store (only if safe and legal to do so,
- // be aware of the race conditions), or
- // 3) element-by-element predicate check and scalar store.
- MaskedOp.insert(SI);
- continue;
- }
- if (I.mayThrow())
- return false;
- }
-
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
- if (!EnableIfConversion) {
- reportVectorizationFailure("If-conversion is disabled",
- "if-conversion is disabled",
- "IfConversionDisabled",
- ORE, TheLoop);
- return false;
- }
-
- assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-
- // A list of pointers which are known to be dereferenceable within scope of
- // the loop body for each iteration of the loop which executes. That is,
- // the memory pointed to can be dereferenced (with the access size implied by
- // the value's type) unconditionally within the loop header without
- // introducing a new fault.
- SmallPtrSet<Value *, 8> SafePointers;
-
- // Collect safe addresses.
- for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockNeedsPredication(BB)) {
- for (Instruction &I : *BB)
- if (auto *Ptr = getLoadStorePointerOperand(&I))
- SafePointers.insert(Ptr);
- continue;
- }
-
- // For a block which requires predication, an address may be safe to access
- // in the loop w/o predication if we can prove dereferenceability facts
- // sufficient to ensure it'll never fault within the loop. For the moment,
- // we restrict this to loads; stores are more complicated due to
- // concurrency restrictions.
- ScalarEvolution &SE = *PSE.getSE();
- for (Instruction &I : *BB) {
- LoadInst *LI = dyn_cast<LoadInst>(&I);
+ continue;
+ }
+ }
+
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ return false;
+ // Predicated store requires some form of masking:
+ // 1) masked store HW instruction,
+ // 2) emulation via load-blend-store (only if safe and legal to do so,
+ // be aware of the race conditions), or
+ // 3) element-by-element predicate check and scalar store.
+ MaskedOp.insert(SI);
+ continue;
+ }
+ if (I.mayThrow())
+ return false;
+ }
+
+ return true;
+}
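
blockCanBePredicated reduces to a per-instruction classification: trapping constant operands reject the block, assumes are remembered so they can be dropped when the CFG is flattened, loads from pointers not proven safe and all stores are recorded as operations that will need a mask, and throwing instructions reject. A compact model of that classification (the types below are illustrative, not LLVM's; the reject cases for non-load reads and non-store writes are omitted for brevity):

// Illustrative per-instruction summary; not an LLVM type.
struct InstSummary {
  bool HasTrappingConstantOperand = false;
  bool IsAssume = false;
  bool IsLoad = false;
  bool LoadPointerKnownSafe = false;
  bool IsStore = false;
  bool MayThrow = false;
};

enum class PredicationAction { Reject, Drop, Mask, Keep };

// Mirrors the decision order above for an instruction in a block that will be
// flattened under a predicate.
PredicationAction classifyForPredication(const InstSummary &I) {
  if (I.HasTrappingConstantOperand)
    return PredicationAction::Reject;
  if (I.IsAssume)
    return PredicationAction::Drop;    // assume is dropped on flattening
  if (I.IsLoad && !I.LoadPointerKnownSafe)
    return PredicationAction::Mask;    // becomes a masked load
  if (I.IsStore)
    return PredicationAction::Mask;    // every predicated store is masked
  if (I.MayThrow)
    return PredicationAction::Reject;
  return PredicationAction::Keep;
}
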
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ reportVectorizationFailure("If-conversion is disabled",
+ "if-conversion is disabled",
+ "IfConversionDisabled",
+ ORE, TheLoop);
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+ // A list of pointers which are known to be dereferenceable within scope of
+ // the loop body for each iteration of the loop which executes. That is,
+ // the memory pointed to can be dereferenced (with the access size implied by
+ // the value's type) unconditionally within the loop header without
+ // introducing a new fault.
+ SmallPtrSet<Value *, 8> SafePointers;
+
+ // Collect safe addresses.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!blockNeedsPredication(BB)) {
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getLoadStorePointerOperand(&I))
+ SafePointers.insert(Ptr);
+ continue;
+ }
+
+ // For a block which requires predication, an address may be safe to access
+ // in the loop w/o predication if we can prove dereferenceability facts
+ // sufficient to ensure it'll never fault within the loop. For the moment,
+ // we restrict this to loads; stores are more complicated due to
+ // concurrency restrictions.
+ ScalarEvolution &SE = *PSE.getSE();
+ for (Instruction &I : *BB) {
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
- isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
- SafePointers.insert(LI->getPointerOperand());
- }
- }
-
- // Collect the blocks that need predication.
- BasicBlock *Header = TheLoop->getHeader();
- for (BasicBlock *BB : TheLoop->blocks()) {
- // We don't support switch statements inside loops.
- if (!isa<BranchInst>(BB->getTerminator())) {
- reportVectorizationFailure("Loop contains a switch statement",
- "loop contains a switch statement",
- "LoopContainsSwitch", ORE, TheLoop,
- BB->getTerminator());
- return false;
- }
-
- // We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB)) {
+ isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
+ SafePointers.insert(LI->getPointerOperand());
+ }
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ reportVectorizationFailure("Loop contains a switch statement",
+ "loop contains a switch statement",
+ "LoopContainsSwitch", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
if (!blockCanBePredicated(BB, SafePointers, MaskedOp,
ConditionalAssumes)) {
- reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select",
- "NoCFGForSelect", ORE, TheLoop,
- BB->getTerminator());
- return false;
- }
- } else if (BB != Header && !canIfConvertPHINodes(BB)) {
- reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select",
- "NoCFGForSelect", ORE, TheLoop,
- BB->getTerminator());
- return false;
- }
- }
-
- // We can if-convert this loop.
- return true;
-}
-
-// Helper function to canVectorizeLoopNestCFG.
-bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
- bool UseVPlanNativePath) {
+ reportVectorizationFailure(
+ "Control flow cannot be substituted for a select",
+ "control flow cannot be substituted for a select",
+ "NoCFGForSelect", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ reportVectorizationFailure(
+ "Control flow cannot be substituted for a select",
+ "control flow cannot be substituted for a select",
+ "NoCFGForSelect", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
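
The remark "control flow cannot be substituted for a select" names the transformation that if-conversion performs when it does succeed: a guarded statement in the loop body is flattened and its result is chosen with a select. In source terms, roughly:

// Before if-conversion: the guarded store gives the loop body two blocks.
void scale_positive(float *a, const float *b, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] > 0.0f)
      a[i] = b[i] * 2.0f;
}

// After if-conversion (conceptually): one straight-line body per iteration,
// with the condition folded into a select. Note that a[i] is now read and
// written unconditionally, which is exactly why predicated stores are
// recorded in MaskedOp above: the real transformation must mask the store
// (or prove the access safe) rather than execute it blindly.
void scale_positive_flat(float *a, const float *b, int n) {
  for (int i = 0; i < n; ++i) {
    float scaled = b[i] * 2.0f;
    a[i] = (b[i] > 0.0f) ? scaled : a[i];
  }
}
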
+
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
+ bool UseVPlanNativePath) {
assert((UseVPlanNativePath || Lp->isInnermost()) &&
- "VPlan-native path is not enabled.");
-
- // TODO: ORE should be improved to show more accurate information when an
- // outer loop can't be vectorized because a nested loop is not understood or
- // legal. Something like: "outer_loop_location: loop not vectorized:
- // (inner_loop_location) loop control flow is not understood by vectorizer".
-
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
-
- // We must have a loop in canonical form. Loops with indirectbr in them cannot
- // be canonicalized.
- if (!Lp->getLoopPreheader()) {
- reportVectorizationFailure("Loop doesn't have a legal pre-header",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single backedge.
- if (Lp->getNumBackEdges() != 1) {
- reportVectorizationFailure("The loop must have a single backedge",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!Lp->getLoopPreheader()) {
+ reportVectorizationFailure("Loop doesn't have a legal pre-header",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (Lp->getNumBackEdges() != 1) {
+ reportVectorizationFailure("The loop must have a single backedge",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
// We currently must have a single "exit block" after the loop. Note that
// multiple "exiting blocks" inside the loop are allowed, provided they all
// reach the single exit block.
@@ -1102,186 +1102,186 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
// update logic in a number of places.
if (!Lp->getUniqueExitBlock()) {
reportVectorizationFailure("The loop must have a unique exit block",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
- return Result;
-}
-
-bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
- Loop *Lp, bool UseVPlanNativePath) {
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Recursively check whether the loop control flow of nested loops is
- // understood.
- for (Loop *SubLp : *Lp)
- if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- return Result;
-}
-
-bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
-
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- // Check whether the loop-related control flow in the loop nest is expected by
- // vectorizer.
- if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We need to have a loop header.
- LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
- << '\n');
-
- // Specific checks for outer loops. We skip the remaining legal checks at this
- // point because they don't support outer loops.
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
+ Loop *Lp, bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
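
Both CFG routines above use the same reporting discipline: when allowExtraAnalysis permits it, a failed check only clears Result and the walk continues so that every rejection reason is reported; otherwise the first failure returns immediately. A minimal sketch of that pattern:

#include <functional>
#include <vector>

// Run a sequence of legality checks. With DoExtraAnalysis the caller gets a
// diagnostic for every failing check; without it the first failure wins.
bool runChecks(const std::vector<std::function<bool()>> &Checks,
               bool DoExtraAnalysis) {
  bool Result = true;
  for (const auto &Check : Checks) {
    if (Check())
      continue;
    if (DoExtraAnalysis)
      Result = false;   // remember the failure, keep diagnosing
    else
      return false;     // fail fast
  }
  return Result;
}
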
+
+bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ // Check whether the loop-related control flow in the loop nest is expected by
+ // vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We need to have a loop header.
+ LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
+
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
if (!TheLoop->isInnermost()) {
- assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
-
- if (!canVectorizeOuterLoop()) {
- reportVectorizationFailure("Unsupported outer loop",
- "unsupported outer loop",
- "UnsupportedOuterLoop",
- ORE, TheLoop);
- // TODO: Implement DoExtraAnalysis when subsequent legal checks support
- // outer loops.
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
- return Result;
- }
-
+ assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ reportVectorizationFailure("Unsupported outer loop",
+ "unsupported outer loop",
+ "UnsupportedOuterLoop",
+ ORE, TheLoop);
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
assert(TheLoop->isInnermost() && "Inner loop expected.");
- // Check if we can if-convert non-single-bb loops.
- unsigned NumBlocks = TheLoop->getNumBlocks();
- if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
- LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check if we can vectorize the instructions and CFG in this loop.
- if (!canVectorizeInstrs()) {
- LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Go over each instruction and look at memory deps.
- if (!canVectorizeMemory()) {
- LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
- << (LAI->getRuntimePointerChecking()->Need
- ? " (with a runtime bound check)"
- : "")
- << "!\n");
-
- unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
- if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
- SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
-
- if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
- reportVectorizationFailure("Too many SCEV checks needed",
- "Too many SCEV assumptions need to be made and checked at runtime",
- "TooManySCEVRunTimeChecks", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Okay! We've done all the tests. If any have failed, return false. Otherwise
- // we can vectorize, and at this point we don't have any other mem analysis
- // which may limit our maximum vectorization factor, so just return true with
- // no restrictions.
- return Result;
-}
-
-bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
-
- LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
-
- SmallPtrSet<const Value *, 8> ReductionLiveOuts;
-
- for (auto &Reduction : getReductionVars())
- ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
-
- // TODO: handle non-reduction outside users when tail is folded by masking.
- for (auto *AE : AllowedExit) {
- // Check that all users of allowed exit values are inside the loop or
- // are the live-out of a reduction.
- if (ReductionLiveOuts.count(AE))
- continue;
- for (User *U : AE->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (TheLoop->contains(UI))
- continue;
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ reportVectorizationFailure("Too many SCEV checks needed",
+ "Too many SCEV assumptions need to be made and checked at runtime",
+ "TooManySCEVRunTimeChecks", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return Result;
+}
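
One detail of the final gate above: the budget for SCEV runtime checks is not fixed; a loop whose vectorization is forced by pragma is allowed the larger PragmaVectorizeSCEVCheckThreshold. A tiny sketch of that selection (the numeric defaults here are assumptions; in LLVM both thresholds are cl::opt command-line options):

// Illustrative defaults only; in LLVM these correspond to the
// VectorizeSCEVCheckThreshold / PragmaVectorizeSCEVCheckThreshold options.
constexpr unsigned DefaultSCEVCheckThreshold = 16;
constexpr unsigned PragmaSCEVCheckThreshold = 128;

// A loop the user explicitly asked to vectorize may pay for more runtime
// SCEV predicate checks before being rejected.
unsigned scevCheckBudget(bool ForcedByPragma) {
  return ForcedByPragma ? PragmaSCEVCheckThreshold : DefaultSCEVCheckThreshold;
}
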
+
+bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
+
+ LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
+
+ SmallPtrSet<const Value *, 8> ReductionLiveOuts;
+
+ for (auto &Reduction : getReductionVars())
+ ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
+
+ // TODO: handle non-reduction outside users when tail is folded by masking.
+ for (auto *AE : AllowedExit) {
+ // Check that all users of allowed exit values are inside the loop or
+ // are the live-out of a reduction.
+ if (ReductionLiveOuts.count(AE))
+ continue;
+ for (User *U : AE->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (TheLoop->contains(UI))
+ continue;
LLVM_DEBUG(
dbgs()
<< "LV: Cannot fold tail by masking, loop has an outside user for "
<< *UI << "\n");
- return false;
- }
- }
-
- // The list of pointers that we can safely read and write to remains empty.
- SmallPtrSet<Value *, 8> SafePointers;
-
+ return false;
+ }
+ }
+
+ // The list of pointers that we can safely read and write to remains empty.
+ SmallPtrSet<Value *, 8> SafePointers;
+
SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;
- // Check and mark all blocks for predication, including those that ordinarily
- // do not need predication such as the header block.
- for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check and mark all blocks for predication, including those that ordinarily
+ // do not need predication such as the header block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
TmpConditionalAssumes)) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
- return false;
- }
- }
-
- LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
TmpConditionalAssumes.end());
- return true;
-}
-
-} // namespace llvm
+ return true;
+}
+
+} // namespace llvm
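
Folding the tail by masking means the vector loop itself absorbs the final partial iteration: rather than branching to a scalar epilogue, the last vector step runs with the lanes past the trip count disabled. In scalar terms the effect is roughly the following (VF = 4 is chosen only for illustration):

// Conceptual model of a VF = 4 loop whose tail is folded by masking: each
// lane is guarded by "i + lane < n", which plays the role of the loop mask,
// so no scalar remainder loop is required.
void add_one_tail_folded(int *a, int n) {
  const int VF = 4;
  for (int i = 0; i < n; i += VF)
    for (int lane = 0; lane < VF; ++lane)
      if (i + lane < n)        // lane mask: lanes past n are disabled
        a[i + lane] += 1;      // masked element update
}
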
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1938b1f0f8..19797e6f78 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -1,278 +1,278 @@
-//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides a LoopVectorizationPlanner class.
-/// LoopVectorizationPlanner drives the vectorization process after having
-/// passed Legality checks.
-/// The planner builds and optimizes the Vectorization Plans, which record the
-/// decisions on how to vectorize the given loop. In particular, they represent
-/// the control-flow of the vectorized version, the replication of instructions
-/// that are to be scalarized, and the interleaved access groups.
-///
-/// Also provides a VPlan-based builder utility analogous to IRBuilder.
-/// It provides an instruction-level API for generating VPInstructions while
-/// abstracting away the Recipe manipulation details.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
-#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
-
-#include "VPlan.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-
-namespace llvm {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class PredicatedScalarEvolution;
+//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a LoopVectorizationPlanner class.
+/// LoopVectorizationPlanner drives the vectorization process after having
+/// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans, which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaved access groups.
+///
+/// Also provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+
+#include "VPlan.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class PredicatedScalarEvolution;
class VPRecipeBuilder;
-
-/// VPlan-based builder utility analogous to IRBuilder.
-class VPBuilder {
- VPBasicBlock *BB = nullptr;
- VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
-
- VPInstruction *createInstruction(unsigned Opcode,
- ArrayRef<VPValue *> Operands) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands);
- if (BB)
- BB->insert(Instr, InsertPt);
- return Instr;
- }
-
- VPInstruction *createInstruction(unsigned Opcode,
- std::initializer_list<VPValue *> Operands) {
- return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
- }
-
-public:
- VPBuilder() {}
-
- /// Clear the insertion point: created instructions will not be inserted into
- /// a block.
- void clearInsertionPoint() {
- BB = nullptr;
- InsertPt = VPBasicBlock::iterator();
- }
-
- VPBasicBlock *getInsertBlock() const { return BB; }
- VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
-
- /// InsertPoint - A saved insertion point.
- class VPInsertPoint {
- VPBasicBlock *Block = nullptr;
- VPBasicBlock::iterator Point;
-
- public:
- /// Creates a new insertion point which doesn't point to anything.
- VPInsertPoint() = default;
-
- /// Creates a new insertion point at the given location.
- VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
- : Block(InsertBlock), Point(InsertPoint) {}
-
- /// Returns true if this insert point is set.
- bool isSet() const { return Block != nullptr; }
-
- VPBasicBlock *getBlock() const { return Block; }
- VPBasicBlock::iterator getPoint() const { return Point; }
- };
-
- /// Sets the current insert point to a previously-saved location.
- void restoreIP(VPInsertPoint IP) {
- if (IP.isSet())
- setInsertPoint(IP.getBlock(), IP.getPoint());
- else
- clearInsertionPoint();
- }
-
- /// This specifies that created VPInstructions should be appended to the end
- /// of the specified block.
- void setInsertPoint(VPBasicBlock *TheBB) {
- assert(TheBB && "Attempting to set a null insert point");
- BB = TheBB;
- InsertPt = BB->end();
- }
-
- /// This specifies that created instructions should be inserted at the
- /// specified point.
- void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
- BB = TheBB;
- InsertPt = IP;
- }
-
- /// Insert and return the specified instruction.
- VPInstruction *insert(VPInstruction *I) const {
- BB->insert(I, InsertPt);
- return I;
- }
-
- /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
- /// its underlying Instruction.
- VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
- Instruction *Inst = nullptr) {
- VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
- NewVPInst->setUnderlyingValue(Inst);
- return NewVPInst;
- }
- VPValue *createNaryOp(unsigned Opcode,
- std::initializer_list<VPValue *> Operands,
- Instruction *Inst = nullptr) {
- return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
- }
-
- VPValue *createNot(VPValue *Operand) {
- return createInstruction(VPInstruction::Not, {Operand});
- }
-
- VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
- }
-
- VPValue *createOr(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
- }
-
+
+/// VPlan-based builder utility analogous to IRBuilder.
+class VPBuilder {
+ VPBasicBlock *BB = nullptr;
+ VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ ArrayRef<VPValue *> Operands) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ if (BB)
+ BB->insert(Instr, InsertPt);
+ return Instr;
+ }
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ }
+
+public:
+ VPBuilder() {}
+
+ /// Clear the insertion point: created instructions will not be inserted into
+ /// a block.
+ void clearInsertionPoint() {
+ BB = nullptr;
+ InsertPt = VPBasicBlock::iterator();
+ }
+
+ VPBasicBlock *getInsertBlock() const { return BB; }
+ VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+
+ /// InsertPoint - A saved insertion point.
+ class VPInsertPoint {
+ VPBasicBlock *Block = nullptr;
+ VPBasicBlock::iterator Point;
+
+ public:
+ /// Creates a new insertion point which doesn't point to anything.
+ VPInsertPoint() = default;
+
+ /// Creates a new insertion point at the given location.
+ VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
+ : Block(InsertBlock), Point(InsertPoint) {}
+
+ /// Returns true if this insert point is set.
+ bool isSet() const { return Block != nullptr; }
+
+ VPBasicBlock *getBlock() const { return Block; }
+ VPBasicBlock::iterator getPoint() const { return Point; }
+ };
+
+ /// Sets the current insert point to a previously-saved location.
+ void restoreIP(VPInsertPoint IP) {
+ if (IP.isSet())
+ setInsertPoint(IP.getBlock(), IP.getPoint());
+ else
+ clearInsertionPoint();
+ }
+
+ /// This specifies that created VPInstructions should be appended to the end
+ /// of the specified block.
+ void setInsertPoint(VPBasicBlock *TheBB) {
+ assert(TheBB && "Attempting to set a null insert point");
+ BB = TheBB;
+ InsertPt = BB->end();
+ }
+
+ /// This specifies that created instructions should be inserted at the
+ /// specified point.
+ void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
+ BB = TheBB;
+ InsertPt = IP;
+ }
+
+ /// Insert and return the specified instruction.
+ VPInstruction *insert(VPInstruction *I) const {
+ BB->insert(I, InsertPt);
+ return I;
+ }
+
+ /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
+ /// its underlying Instruction.
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ NewVPInst->setUnderlyingValue(Inst);
+ return NewVPInst;
+ }
+ VPValue *createNaryOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ }
+
+ VPValue *createNot(VPValue *Operand) {
+ return createInstruction(VPInstruction::Not, {Operand});
+ }
+
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ }
+
+ VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ }
+
VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) {
return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal});
}
- //===--------------------------------------------------------------------===//
- // RAII helpers.
- //===--------------------------------------------------------------------===//
-
- /// RAII object that stores the current insertion point and restores it when
- /// the object is destroyed.
- class InsertPointGuard {
- VPBuilder &Builder;
- VPBasicBlock *Block;
- VPBasicBlock::iterator Point;
-
- public:
- InsertPointGuard(VPBuilder &B)
- : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
-
- InsertPointGuard(const InsertPointGuard &) = delete;
- InsertPointGuard &operator=(const InsertPointGuard &) = delete;
-
- ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
- };
-};
-
-/// TODO: The following VectorizationFactor was pulled out of
-/// LoopVectorizationCostModel class. LV also deals with
-/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
-/// We need to streamline them.
-
-/// Information about vectorization costs
-struct VectorizationFactor {
- // Vector width with best cost
+ //===--------------------------------------------------------------------===//
+ // RAII helpers.
+ //===--------------------------------------------------------------------===//
+
+ /// RAII object that stores the current insertion point and restores it when
+ /// the object is destroyed.
+ class InsertPointGuard {
+ VPBuilder &Builder;
+ VPBasicBlock *Block;
+ VPBasicBlock::iterator Point;
+
+ public:
+ InsertPointGuard(VPBuilder &B)
+ : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
+
+ InsertPointGuard(const InsertPointGuard &) = delete;
+ InsertPointGuard &operator=(const InsertPointGuard &) = delete;
+
+ ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
+ };
+};
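
The builder above mirrors IRBuilder: the caller parks an insertion point in a VPBasicBlock, every create* call appends there, and InsertPointGuard restores the previous point when a scope ends. The standalone toy below shows the same discipline with a vector of strings standing in for a block (names are illustrative, not the VPlan API):

#include <cassert>
#include <string>
#include <vector>

// Toy "block" and builder that imitate the insert-point discipline above.
using Block = std::vector<std::string>;

class ToyBuilder {
  Block *BB = nullptr;

public:
  void setInsertPoint(Block *TheBB) { assert(TheBB); BB = TheBB; }
  Block *getInsertBlock() const { return BB; }
  void clearInsertionPoint() { BB = nullptr; }
  void createOp(const std::string &Op) { assert(BB); BB->push_back(Op); }

  // RAII guard: remembers the current block and restores it on destruction,
  // analogous to VPBuilder::InsertPointGuard.
  class InsertPointGuard {
    ToyBuilder &B;
    Block *Saved;

  public:
    explicit InsertPointGuard(ToyBuilder &B) : B(B), Saved(B.getInsertBlock()) {}
    InsertPointGuard(const InsertPointGuard &) = delete;
    InsertPointGuard &operator=(const InsertPointGuard &) = delete;
    ~InsertPointGuard() {
      if (Saved)
        B.setInsertPoint(Saved);
      else
        B.clearInsertionPoint();
    }
  };
};

// Usage: temporarily emit into a side block, then fall back automatically.
void example() {
  Block Main, Side;
  ToyBuilder B;
  B.setInsertPoint(&Main);
  B.createOp("and");
  {
    ToyBuilder::InsertPointGuard G(B);
    B.setInsertPoint(&Side);
    B.createOp("not");
  }                     // guard restores Main here
  B.createOp("or");     // lands in Main again
}
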
+
+/// TODO: The following VectorizationFactor was pulled out of
+/// LoopVectorizationCostModel class. LV also deals with
+/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// We need to streamline them.
+
+/// Information about vectorization costs
+struct VectorizationFactor {
+ // Vector width with best cost
ElementCount Width;
- // Cost of the loop with that width
- unsigned Cost;
-
- // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ // Cost of the loop with that width
+ unsigned Cost;
+
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
static VectorizationFactor Disabled() {
return {ElementCount::getFixed(1), 0};
}
-
- bool operator==(const VectorizationFactor &rhs) const {
- return Width == rhs.Width && Cost == rhs.Cost;
- }
+
+ bool operator==(const VectorizationFactor &rhs) const {
+ return Width == rhs.Width && Cost == rhs.Cost;
+ }
bool operator!=(const VectorizationFactor &rhs) const {
return !(*this == rhs);
}
-};
-
-/// Planner drives the vectorization process after having passed
-/// Legality checks.
-class LoopVectorizationPlanner {
- /// The loop that we evaluate.
- Loop *OrigLoop;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Target Transform Info.
- const TargetTransformInfo *TTI;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
- /// The profitability analysis.
- LoopVectorizationCostModel &CM;
-
- /// The interleaved access analysis.
- InterleavedAccessInfo &IAI;
-
- PredicatedScalarEvolution &PSE;
-
- SmallVector<VPlanPtr, 4> VPlans;
-
- /// This class is used to enable the VPlan to invoke a method of ILV. This is
- /// needed until the method is refactored out of ILV and becomes reusable.
- struct VPCallbackILV : public VPCallback {
- InnerLoopVectorizer &ILV;
-
- VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
-
- Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
- Value *getOrCreateScalarValue(Value *V,
- const VPIteration &Instance) override;
- };
-
- /// A builder used to construct the current plan.
- VPBuilder Builder;
-
+};
+
+/// Planner drives the vectorization process after having passed
+/// Legality checks.
+class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ /// The interleaved access analysis.
+ InterleavedAccessInfo &IAI;
+
+ PredicatedScalarEvolution &PSE;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+ Value *getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) override;
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
/// The best number of elements of the vector types used in the
/// transformed loop. BestVF = None means that vectorization is
/// disabled.
Optional<ElementCount> BestVF = None;
- unsigned BestUF = 0;
-
-public:
- LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM,
- InterleavedAccessInfo &IAI,
- PredicatedScalarEvolution &PSE)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
- PSE(PSE) {}
-
- /// Plan how to best vectorize, return the best VF and its cost, or None if
- /// vectorization and interleaving should be avoided up front.
+ unsigned BestUF = 0;
+
+public:
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM,
+ InterleavedAccessInfo &IAI,
+ PredicatedScalarEvolution &PSE)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
+ PSE(PSE) {}
+
+ /// Plan how to best vectorize, return the best VF and its cost, or None if
+ /// vectorization and interleaving should be avoided up front.
Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
-
- /// Use the VPlan-native path to plan how to best vectorize, return the best
- /// VF and its cost.
+
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
-
- /// Finalize the best decision and dispose of all other VPlans.
+
+ /// Finalize the best decision and dispose of all other VPlans.
void setBestPlan(ElementCount VF, unsigned UF);
-
- /// Generate the IR code for the body of the vectorized loop according to the
- /// best selected VPlan.
- void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
-
- void printPlans(raw_ostream &O) {
- for (const auto &Plan : VPlans)
- O << *Plan;
- }
-
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
+
/// Look through the existing plans and return true if we have one with all
/// the vectorization factors in question.
bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
@@ -283,39 +283,39 @@ public:
});
}
- /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
- /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
- /// returned value holds for the entire \p Range.
- static bool
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ static bool
getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
- VFRange &Range);
-
-protected:
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
- /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
- /// according to the information gathered by Legal when it checked if it is
- /// legal to vectorize the loop.
+ VFRange &Range);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF);
-
-private:
- /// Build a VPlan according to the information gathered by Legal. \return a
- /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
- /// exclusive, possibly decreasing \p Range.End.
- VPlanPtr buildVPlan(VFRange &Range);
-
- /// Build a VPlan using VPRecipes according to the information gathered by
- /// Legal. This method is only used for the legacy inner loop vectorizer.
- VPlanPtr buildVPlanWithVPRecipes(
+
+private:
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range);
+
+ /// Build a VPlan using VPRecipes according to the information gather by
+ /// Legal. This method is only used for the legacy inner loop vectorizer.
+ VPlanPtr buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const DenseMap<Instruction *, Instruction *> &SinkAfter);
-
- /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
- /// according to the information gathered by Legal when it checked if it is
- /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+ const DenseMap<Instruction *, Instruction *> &SinkAfter);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
/// Adjust the recipes for any inloop reductions. The chain of instructions
@@ -324,8 +324,8 @@ private:
/// reduction chain.
void adjustRecipesForInLoopReductions(VPlanPtr &Plan,
VPRecipeBuilder &RecipeBuilder);
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
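
The public interface above implies the call order used by the driver in LoopVectorize.cpp: plan() chooses a vectorization factor, setBestPlan() commits to a factor and unroll count, and executePlan() emits the vectorized body. The fragment below only sketches that order; it is not self-contained, the surrounding construction of the planner, InnerLoopVectorizer and analyses is assumed, and the unroll count of 1 is purely illustrative.

// Sketch only: assumes LVP, ILV, DT and the user hints were set up earlier,
// as LoopVectorize.cpp does before invoking the planner.
void runPlannerOnce(llvm::LoopVectorizationPlanner &LVP,
                    llvm::InnerLoopVectorizer &ILV, llvm::DominatorTree *DT,
                    llvm::ElementCount UserVF, unsigned UserIC) {
  llvm::Optional<llvm::VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
  // Width 1 means "do not vectorize" (see VectorizationFactor::Disabled()).
  if (!MaybeVF || MaybeVF->Width == llvm::ElementCount::getFixed(1))
    return;
  LVP.setBestPlan(MaybeVF->Width, /*UF=*/1); // unroll count of 1 is illustrative
  LVP.executePlan(ILV, DT);                  // generate IR from the chosen VPlan
}
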
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp
index 60048bab64..b456a97aa4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1,180 +1,180 @@
-//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR.
-// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
-// of instructions in order to estimate the profitability of vectorization.
-//
-// The loop vectorizer combines consecutive loop iterations into a single
-// 'wide' iteration. After this transformation the index is incremented
-// by the SIMD vector width, and not by one.
-//
-// This pass has four parts:
-// 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A unit that checks for the legality
-// of the vectorization.
-// 3. InnerLoopVectorizer - A unit that performs the actual
-// widening of instructions.
-// 4. LoopVectorizationCostModel - A unit that checks for the profitability
-// of vectorization. It decides on the optimal vector width, which
-// can be one, if vectorization is not profitable.
-//
-// There is a development effort going on to migrate loop vectorizer to the
-// VPlan infrastructure and to introduce outer loop vectorization support (see
-// docs/Proposal/VectorizationPlan.rst and
-// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
-// purpose, we temporarily introduced the VPlan-native vectorization path: an
-// alternative vectorization path that is natively implemented on top of the
-// VPlan infrastructure. See EnableVPlanNativePath for enabling.
-//
-//===----------------------------------------------------------------------===//
-//
-// The reduction-variable vectorization is based on the paper:
-// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
-//
-// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
-//
-// The interleaved access vectorization is based on the paper:
-// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
-// Data for SIMD
-//
-// Other ideas/concepts are from:
-// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
-//
-// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
-// Vectorizing Compilers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "LoopVectorizationPlanner.h"
-#include "VPRecipeBuilder.h"
-#include "VPlan.h"
-#include "VPlanHCFGBuilder.h"
-#include "VPlanPredicator.h"
-#include "VPlanTransforms.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
+//
+// The loop vectorizer combines consecutive loop iterations into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has four parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+// of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// The interleaved access vectorization is based on the paper:
+// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
+// Data for SIMD
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
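As a rough illustration of the widening described above, here is a standalone sketch (not LLVM code; the function names and the fixed vector width of 4 are assumptions for the example) of a scalar loop and its conceptually widened counterpart:

#include <cstddef>

// Hypothetical scalar loop: one element per iteration.
void scaleScalar(float *A, std::size_t N, float K) {
  for (std::size_t I = 0; I < N; ++I)
    A[I] *= K;
}

// Conceptually widened form with VF = 4: the induction variable now advances
// by the vector width, and a scalar remainder handles leftover iterations,
// mirroring the epilogue loop the vectorizer emits.
void scaleWidened(float *A, std::size_t N, float K) {
  constexpr std::size_t VF = 4;
  std::size_t I = 0;
  for (; I + VF <= N; I += VF)        // vector body, shown as unrolled scalars
    for (std::size_t Lane = 0; Lane < VF; ++Lane)
      A[I + Lane] *= K;
  for (; I < N; ++I)                  // scalar epilogue / remainder loop
    A[I] *= K;
}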
+
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
+#include "VPlan.h"
+#include "VPlanHCFGBuilder.h"
+#include "VPlanPredicator.h"
+#include "VPlanTransforms.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/InjectTLIMappings.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstdlib>
-#include <functional>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif
-/// @{
-/// Metadata attribute names
+/// @{
+/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
- "llvm.loop.vectorize.followup_vectorized";
+ "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
- "llvm.loop.vectorize.followup_epilogue";
-/// @}
-
-STATISTIC(LoopsVectorized, "Number of loops vectorized");
-STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+ "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
-
+
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
cl::desc("Enable vectorization of epilogue loops."));
@@ -190,14 +190,14 @@ static cl::opt<unsigned> EpilogueVectorizationMinVF(
cl::desc("Only loops with vectorization factor equal to or larger than "
"the specified value are considered for epilogue vectorization."));
-/// Loops with a known constant trip count below this number are vectorized only
-/// if no scalar iteration overheads are incurred.
-static cl::opt<unsigned> TinyTripCountVectorThreshold(
- "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
- cl::desc("Loops with a constant trip count that is smaller than this "
- "value are vectorized only if no scalar iteration overheads "
- "are incurred."));
-
+/// Loops with a known constant trip count below this number are vectorized only
+/// if no scalar iteration overheads are incurred.
+static cl::opt<unsigned> TinyTripCountVectorThreshold(
+ "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
+ cl::desc("Loops with a constant trip count that is smaller than this "
+ "value are vectorized only if no scalar iteration overheads "
+ "are incurred."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -210,7 +210,7 @@ namespace PreferPredicateTy {
PredicateOrDontVectorize
};
} // namespace PreferPredicateTy
-
+
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
"prefer-predicate-over-epilogue",
cl::init(PreferPredicateTy::ScalarEpilogue),
@@ -229,97 +229,97 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
"prefers tail-folding, don't attempt vectorization if "
"tail-folding fails.")));
-static cl::opt<bool> MaximizeBandwidth(
- "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
- cl::desc("Maximize bandwidth when selecting vectorization factor which "
- "will be determined by the smallest type in loop."));
-
-static cl::opt<bool> EnableInterleavedMemAccesses(
- "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
- cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
-
-/// An interleave-group may need masking if it resides in a block that needs
+static cl::opt<bool> MaximizeBandwidth(
+ "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ cl::desc("Maximize bandwidth when selecting vectorization factor which "
+ "will be determined by the smallest type in loop."));
+
+static cl::opt<bool> EnableInterleavedMemAccesses(
+ "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
+
+/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
-static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
- "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
- cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
-
-static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
- "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
-    cl::desc("We don't interleave loops with an estimated constant trip count "
- "below this number"));
-
-static cl::opt<unsigned> ForceTargetNumScalarRegs(
- "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's number of scalar registers."));
-
-static cl::opt<unsigned> ForceTargetNumVectorRegs(
- "force-target-num-vector-regs", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's number of vector registers."));
-
-static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
- "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's max interleave factor for "
- "scalar loops."));
-
-static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
- "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's max interleave factor for "
- "vectorized loops."));
-
-static cl::opt<unsigned> ForceTargetInstructionCost(
- "force-target-instruction-cost", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's expected cost for "
- "an instruction to a single constant value. Mostly "
- "useful for getting consistent testing."));
-
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+ "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
+
+static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
+ "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
+    cl::desc("We don't interleave loops with an estimated constant trip count "
+ "below this number"));
+
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+ "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+ "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of vector registers."));
+
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+ "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "scalar loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+ "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+ "force-target-instruction-cost", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's expected cost for "
+ "an instruction to a single constant value. Mostly "
+ "useful for getting consistent testing."));
+
static cl::opt<bool> ForceTargetSupportsScalableVectors(
"force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
cl::desc(
"Pretend that scalable vectors are supported, even if the target does "
"not support them. This flag should only be used for testing."));
-static cl::opt<unsigned> SmallLoopCost(
- "small-loop-cost", cl::init(20), cl::Hidden,
- cl::desc(
- "The cost of a loop that is considered 'small' by the interleaver."));
-
-static cl::opt<bool> LoopVectorizeWithBlockFrequency(
- "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to access PGO "
- "heuristics minimizing code growth in cold regions and being more "
- "aggressive in hot regions."));
-
-// Runtime interleave loops for load/store throughput.
-static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
- "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
- cl::desc(
- "Enable runtime interleaving until load/store ports are saturated"));
-
+static cl::opt<unsigned> SmallLoopCost(
+ "small-loop-cost", cl::init(20), cl::Hidden,
+ cl::desc(
+ "The cost of a loop that is considered 'small' by the interleaver."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+ "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to access PGO "
+ "heuristics minimizing code growth in cold regions and being more "
+ "aggressive in hot regions."));
+
+// Runtime interleave loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
+ "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable runtime interleaving until load/store ports are saturated"));
+
/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
"interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
cl::desc("Enable interleaving for loops with small iteration counts that "
"contain scalar reductions to expose ILP."));
-/// The number of stores in a loop that are allowed to need predication.
-static cl::opt<unsigned> NumberOfStoresToPredicate(
- "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
- cl::desc("Max number of stores to be predicated behind an if."));
-
-static cl::opt<bool> EnableIndVarRegisterHeur(
- "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
- cl::desc("Count the induction variable only once when interleaving"));
-
-static cl::opt<bool> EnableCondStoresVectorization(
- "enable-cond-stores-vec", cl::init(true), cl::Hidden,
- cl::desc("Enable if predication of stores during vectorization."));
-
-static cl::opt<unsigned> MaxNestedScalarReductionIC(
- "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
- cl::desc("The maximum interleave count to use when interleaving a scalar "
- "reduction in a nested loop."));
-
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+ "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+ cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+ "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+ cl::desc("Count the induction variable only once when interleaving"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+ "enable-cond-stores-vec", cl::init(true), cl::Hidden,
+ cl::desc("Enable if predication of stores during vectorization."));
+
+static cl::opt<unsigned> MaxNestedScalarReductionIC(
+ "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
+ cl::desc("The maximum interleave count to use when interleaving a scalar "
+ "reduction in a nested loop."));
+
static cl::opt<bool>
PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
cl::Hidden,
@@ -331,135 +331,135 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
-cl::opt<bool> EnableVPlanNativePath(
- "enable-vplan-native-path", cl::init(false), cl::Hidden,
- cl::desc("Enable VPlan-native vectorization path with "
- "support for outer loop vectorization."));
-
-// FIXME: Remove this switch once we have divergence analysis. Currently we
-// assume divergent non-backedge branches when this switch is true.
-cl::opt<bool> EnableVPlanPredication(
- "enable-vplan-predication", cl::init(false), cl::Hidden,
- cl::desc("Enable VPlan-native vectorization path predicator with "
- "support for outer loop vectorization."));
-
-// This flag enables the stress testing of the VPlan H-CFG construction in the
-// VPlan-native vectorization path. It must be used in conjunction with
-// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
-// verification of the H-CFGs built.
-static cl::opt<bool> VPlanBuildStressTest(
- "vplan-build-stress-test", cl::init(false), cl::Hidden,
- cl::desc(
- "Build VPlan for every supported loop nest in the function and bail "
- "out right after the build (stress test the VPlan H-CFG construction "
- "in the VPlan-native vectorization path)."));
-
-cl::opt<bool> llvm::EnableLoopInterleaving(
- "interleave-loops", cl::init(true), cl::Hidden,
- cl::desc("Enable loop interleaving in Loop vectorization passes"));
-cl::opt<bool> llvm::EnableLoopVectorization(
- "vectorize-loops", cl::init(true), cl::Hidden,
- cl::desc("Run the Loop vectorization passes"));
-
-/// A helper function that returns the type of loaded or stored value.
-static Type *getMemInstValueType(Value *I) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Expected Load or Store instruction");
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getType();
- return cast<StoreInst>(I)->getValueOperand()->getType();
-}
-
-/// A helper function that returns true if the given type is irregular. The
-/// type is irregular if its allocated size doesn't equal the store size of an
+cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
+// FIXME: Remove this switch once we have divergence analysis. Currently we
+// assume divergent non-backedge branches when this switch is true.
+cl::opt<bool> EnableVPlanPredication(
+ "enable-vplan-predication", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path predicator with "
+ "support for outer loop vectorization."));
+
+// This flag enables the stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
+// verification of the H-CFGs built.
+static cl::opt<bool> VPlanBuildStressTest(
+ "vplan-build-stress-test", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Build VPlan for every supported loop nest in the function and bail "
+ "out right after the build (stress test the VPlan H-CFG construction "
+ "in the VPlan-native vectorization path)."));
+
+cl::opt<bool> llvm::EnableLoopInterleaving(
+ "interleave-loops", cl::init(true), cl::Hidden,
+ cl::desc("Enable loop interleaving in Loop vectorization passes"));
+cl::opt<bool> llvm::EnableLoopVectorization(
+ "vectorize-loops", cl::init(true), cl::Hidden,
+ cl::desc("Run the Loop vectorization passes"));
+
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getType();
+ return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns true if the given type is irregular. The
+/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
// Determine if an array of N elements of type Ty is "bitcast compatible"
// with a <N x Ty> vector.
// This is only true if there is no padding between the array elements.
- return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
-}
-
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-/// we always assume predicated blocks have a 50% chance of executing.
-static unsigned getReciprocalPredBlockProb() { return 2; }
-
-/// A helper function that adds a 'fast' flag to floating-point operations.
-static Value *addFastMathFlag(Value *V) {
- if (isa<FPMathOperator>(V))
- cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
- return V;
-}
-
-static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
- if (isa<FPMathOperator>(V))
- cast<Instruction>(V)->setFastMathFlags(FMF);
- return V;
-}
-
-/// A helper function that returns an integer or floating-point constant with
-/// value C.
-static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
- return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
- : ConstantFP::get(Ty, C);
-}
-
-/// Returns "best known" trip count for the specified loop \p L as defined by
-/// the following procedure:
-/// 1) Returns exact trip count if it is known.
-/// 2) Returns expected trip count according to profile data if any.
-/// 3) Returns upper bound estimate if it is known.
-/// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
- // Check if exact trip count is known.
- if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
- return ExpectedTC;
-
- // Check if there is an expected trip count available from profile data.
- if (LoopVectorizeWithBlockFrequency)
- if (auto EstimatedTC = getLoopEstimatedTripCount(L))
- return EstimatedTC;
-
- // Check if upper bound estimate is known.
- if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
- return ExpectedTC;
-
- return None;
-}
-
-namespace llvm {
-
-/// InnerLoopVectorizer vectorizes loops which contain only one basic
-/// block to a specified vectorization factor (VF).
-/// This class performs the widening of scalars into vectors, or multiple
-/// scalars. This class also implements the following features:
-/// * It inserts an epilogue loop for handling loops that don't have iteration
-/// counts that are known to be a multiple of the vectorization factor.
-/// * It handles the code generation for reduction variables.
-/// * Scalarization (implementation using scalars) of un-vectorizable
-/// instructions.
-/// InnerLoopVectorizer does not perform any vectorization-legality
-/// checks, and relies on the caller to check for the different legality
-/// aspects. The InnerLoopVectorizer relies on the
-/// LoopVectorizationLegality class to provide information about the induction
-/// and reduction variables that were found for a given vectorization factor.
-class InnerLoopVectorizer {
-public:
- InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
- LoopInfo *LI, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
+ return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
+}
+
+/// A helper function that returns the reciprocal of the block probability of
+/// predicated blocks. If we return X, we are assuming the predicated block
+/// will execute once for every X iterations of the loop header.
+///
+/// TODO: We should use actual block probability here, if available. Currently,
+/// we always assume predicated blocks have a 50% chance of executing.
+static unsigned getReciprocalPredBlockProb() { return 2; }
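To make the comment above concrete: if a predicated block is assumed to execute once every getReciprocalPredBlockProb() iterations of the loop header, a per-iteration cost estimate discounts that block's cost by the same factor. A minimal standalone sketch, with a hypothetical helper name and an assumed raw cost, not the actual cost model:

#include <cstdio>

static unsigned reciprocalPredBlockProbSketch() { return 2; } // 50% chance

int main() {
  unsigned BlockCost = 8; // assumed raw cost of the predicated block
  unsigned Discounted = BlockCost / reciprocalPredBlockProbSketch();
  std::printf("per-iteration contribution: %u\n", Discounted); // prints 4
  return 0;
}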
+
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V))
+ cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
+ return V;
+}
+
+static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
+ if (isa<FPMathOperator>(V))
+ cast<Instruction>(V)->setFastMathFlags(FMF);
+ return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
+/// Returns "best known" trip count for the specified loop \p L as defined by
+/// the following procedure:
+/// 1) Returns exact trip count if it is known.
+/// 2) Returns expected trip count according to profile data if any.
+/// 3) Returns upper bound estimate if it is known.
+/// 4) Returns None if all of the above failed.
+static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+ // Check if exact trip count is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
+ return ExpectedTC;
+
+ // Check if there is an expected trip count available from profile data.
+ if (LoopVectorizeWithBlockFrequency)
+ if (auto EstimatedTC = getLoopEstimatedTripCount(L))
+ return EstimatedTC;
+
+ // Check if upper bound estimate is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
+ return ExpectedTC;
+
+ return None;
+}
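The same fallback order (exact trip count, then profile-based estimate, then upper bound, then None) can be sketched without the SCEV and profile machinery; the stand-in parameters below are assumptions for illustration only:

#include <optional>

static std::optional<unsigned>
bestKnownTripCountSketch(unsigned ExactTC, std::optional<unsigned> ProfileTC,
                         unsigned MaxTC) {
  if (ExactTC)        // 0 is treated as "unknown", mirroring SCEV's convention
    return ExactTC;
  if (ProfileTC)      // expected trip count derived from profile data
    return ProfileTC;
  if (MaxTC)          // known upper bound estimate
    return MaxTC;
  return std::nullopt;
}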
+
+namespace llvm {
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found for a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+ InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
- unsigned UnrollFactor, LoopVectorizationLegality *LVL,
+ unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI)
- : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
- AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
- Builder(PSE.getSE()->getContext()),
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+ AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
+ Builder(PSE.getSE()->getContext()),
VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
BFI(BFI), PSI(PSI) {
// Query this against the original loop and save it here because the profile
@@ -468,8 +468,8 @@ public:
OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
}
- virtual ~InnerLoopVectorizer() = default;
-
+ virtual ~InnerLoopVectorizer() = default;
+
/// Create a new empty loop that will contain vectorized instructions later
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
@@ -478,265 +478,265 @@ public:
/// In the case of epilogue vectorization, this function is overriden to
/// handle the more complex control flow around the loops.
virtual BasicBlock *createVectorizedLoopSkeleton();
-
- /// Widen a single instruction within the innermost loop.
+
+ /// Widen a single instruction within the innermost loop.
void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
- VPTransformState &State);
-
- /// Widen a single call instruction within the innermost loop.
+ VPTransformState &State);
+
+ /// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
- VPTransformState &State);
-
- /// Widen a single select instruction within the innermost loop.
+ VPTransformState &State);
+
+ /// Widen a single select instruction within the innermost loop.
void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
- bool InvariantCond, VPTransformState &State);
-
- /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
- void fixVectorizedLoop();
-
- // Return true if any runtime check is added.
- bool areSafetyChecksAdded() { return AddedSafetyChecks; }
-
- /// A type for vectorized values in the new loop. Each value from the
- /// original loop, when vectorized, is represented by UF vector values in the
- /// new unrolled loop, where UF is the unroll factor.
- using VectorParts = SmallVector<Value *, 2>;
-
- /// Vectorize a single GetElementPtrInst based on information gathered and
- /// decisions taken during planning.
+ bool InvariantCond, VPTransformState &State);
+
+ /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
+ void fixVectorizedLoop();
+
+ // Return true if any runtime check is added.
+ bool areSafetyChecksAdded() { return AddedSafetyChecks; }
+
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in the
+ /// new unrolled loop, where UF is the unroll factor.
+ using VectorParts = SmallVector<Value *, 2>;
+
+ /// Vectorize a single GetElementPtrInst based on information gathered and
+ /// decisions taken during planning.
void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
-
- /// Vectorize a single PHINode in a block. This method handles the induction
- /// variable canonicalization. It supports both VF = 1 for unrolled loops and
- /// arbitrary length vectors.
+ SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
+
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
Value *StartV, unsigned UF, ElementCount VF);
-
- /// A helper function to scalarize a single Instruction in the innermost loop.
- /// Generates a sequence of scalar instances for each lane between \p MinLane
- /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
- /// inclusive. Uses the VPValue operands from \p Operands instead of \p
- /// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
- const VPIteration &Instance, bool IfPredicateInstr,
- VPTransformState &State);
-
- /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
- /// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
+
+ /// A helper function to scalarize a single Instruction in the innermost loop.
+ /// Generates a sequence of scalar instances for each lane between \p MinLane
+ /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
+ /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+ /// Instr's operands.
+ void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
+ const VPIteration &Instance, bool IfPredicateInstr,
+ VPTransformState &State);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
void widenIntOrFpInduction(PHINode *IV, Value *Start,
TruncInst *Trunc = nullptr);
-
- /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
- /// vector or scalar value on-demand if one is not yet available. When
- /// vectorizing a loop, we visit the definition of an instruction before its
- /// uses. When visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
- ///
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part. If the value has already been vectorized,
- /// the corresponding vector entry in VectorLoopValueMap is returned. If,
- /// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// a new vector value on-demand by inserting the scalar values into a vector
- /// with an insertelement sequence. If the value has been neither vectorized
- /// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into a vector.
- Value *getOrCreateVectorValue(Value *V, unsigned Part);
-
+
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// a new vector value on-demand by inserting the scalar values into a vector
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
+
void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
}
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll and vector indices \p Instance. If the value has been
- /// vectorized but not scalarized, the necessary extractelement instruction
- /// will be generated.
- Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
-
- /// Construct the vector value of a scalarized value \p V one lane at a time.
- void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
-
- /// Try to vectorize interleaved access group \p Group with the base address
- /// given in \p Addr, optionally masking the vector operations if \p
- /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
- /// values in the vectorized loop.
- void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll and vector indices \p Instance. If the value has been
+ /// vectorized but not scalarized, the necessary extractelement instruction
+ /// will be generated.
+ Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
+
+ /// Construct the vector value of a scalarized value \p V one lane at a time.
+ void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
+
+ /// Try to vectorize interleaved access group \p Group with the base address
+ /// given in \p Addr, optionally masking the vector operations if \p
+ /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
+ /// values in the vectorized loop.
+ void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
ArrayRef<VPValue *> VPDefs,
- VPTransformState &State, VPValue *Addr,
+ VPTransformState &State, VPValue *Addr,
ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask = nullptr);
-
- /// Vectorize Load and Store instructions with the base address given in \p
- /// Addr, optionally masking the vector operations if \p BlockInMask is
- /// non-null. Use \p State to translate given VPValues to IR values in the
- /// vectorized loop.
- void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
+ VPValue *BlockInMask = nullptr);
+
+ /// Vectorize Load and Store instructions with the base address given in \p
+ /// Addr, optionally masking the vector operations if \p BlockInMask is
+ /// non-null. Use \p State to translate given VPValues to IR values in the
+ /// vectorized loop.
+ void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
VPValue *Def, VPValue *Addr,
VPValue *StoredValue, VPValue *BlockInMask);
-
- /// Set the debug location in the builder using the debug location in
- /// the instruction.
- void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
-
- /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
- void fixNonInductionPHIs(void);
-
-protected:
- friend class LoopVectorizationPlanner;
-
- /// A small list of PHINodes.
- using PhiVector = SmallVector<PHINode *, 4>;
-
- /// A type for scalarized values in the new loop. Each value from the
- /// original loop, when scalarized, is represented by UF x VF scalar values
- /// in the new unrolled loop, where UF is the unroll factor and VF is the
- /// vectorization factor.
- using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
-
- /// Set up the values of the IVs correctly when exiting the vector loop.
- void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock);
-
- /// Create a new induction variable inside L.
- PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
- Value *Step, Instruction *DL);
-
- /// Handle all cross-iteration phis in the header.
- void fixCrossIterationPHIs();
-
- /// Fix a first-order recurrence. This is the second phase of vectorizing
- /// this phi node.
- void fixFirstOrderRecurrence(PHINode *Phi);
-
- /// Fix a reduction cross-iteration phi. This is the second phase of
- /// vectorizing this phi node.
- void fixReduction(PHINode *Phi);
-
- /// Clear NSW/NUW flags from reduction instructions if necessary.
- void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
-
+
+ /// Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
+ /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
+ void fixNonInductionPHIs(void);
+
+protected:
+ friend class LoopVectorizationPlanner;
+
+ /// A small list of PHINodes.
+ using PhiVector = SmallVector<PHINode *, 4>;
+
+ /// A type for scalarized values in the new loop. Each value from the
+ /// original loop, when scalarized, is represented by UF x VF scalar values
+ /// in the new unrolled loop, where UF is the unroll factor and VF is the
+ /// vectorization factor.
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
+
+ /// Set up the values of the IVs correctly when exiting the vector loop.
+ void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock);
+
+ /// Create a new induction variable inside L.
+ PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
+ Value *Step, Instruction *DL);
+
+ /// Handle all cross-iteration phis in the header.
+ void fixCrossIterationPHIs();
+
+ /// Fix a first-order recurrence. This is the second phase of vectorizing
+ /// this phi node.
+ void fixFirstOrderRecurrence(PHINode *Phi);
+
+ /// Fix a reduction cross-iteration phi. This is the second phase of
+ /// vectorizing this phi node.
+ void fixReduction(PHINode *Phi);
+
+ /// Clear NSW/NUW flags from reduction instructions if necessary.
+ void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
+
/// Fixup the LCSSA phi nodes in the unique exit block. This simply
/// means we need to add the appropriate incoming value from the middle
/// block as exiting edges from the scalar epilogue loop (if present) are
/// already in place, and we exit the vector loop exclusively to the middle
/// block.
- void fixLCSSAPHIs();
-
- /// Iteratively sink the scalarized operands of a predicated instruction into
- /// the block that was created for it.
- void sinkScalarOperands(Instruction *PredInst);
-
- /// Shrinks vector element sizes to the smallest bitwidth they can be legally
- /// represented as.
- void truncateToMinimalBitwidths();
-
- /// Create a broadcast instruction. This method generates a broadcast
- /// instruction (shuffle) for loop invariant values and for the induction
- /// value. If this is the induction variable then we extend it to N, N+1, ...
-  /// This is needed because each iteration in the loop corresponds to a SIMD
- /// element.
- virtual Value *getBroadcastInstrs(Value *V);
-
- /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
-  /// to each vector element of Val. The sequence starts at StartIdx.
-  /// \p Opcode is relevant for FP induction variables.
- virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd);
-
- /// Compute scalar induction steps. \p ScalarIV is the scalar induction
- /// variable on which to base the steps, \p Step is the size of the step, and
- /// \p EntryVal is the value from the original loop that maps to the steps.
- /// Note that \p EntryVal doesn't have to be an induction variable - it
- /// can also be a truncate instruction.
- void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
- const InductionDescriptor &ID);
-
- /// Create a vector induction phi node based on an existing scalar one. \p
- /// EntryVal is the value from the original loop that maps to the vector phi
- /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
- /// truncate instruction, instead of widening the original IV, we widen a
- /// version of the IV truncated to \p EntryVal's type.
- void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+ void fixLCSSAPHIs();
+
+ /// Iteratively sink the scalarized operands of a predicated instruction into
+ /// the block that was created for it.
+ void sinkScalarOperands(Instruction *PredInst);
+
+ /// Shrinks vector element sizes to the smallest bitwidth they can be legally
+ /// represented as.
+ void truncateToMinimalBitwidths();
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+ /// value. If this is the induction variable then we extend it to N, N+1, ...
+  /// This is needed because each iteration in the loop corresponds to a SIMD
+ /// element.
+ virtual Value *getBroadcastInstrs(Value *V);
+
+ /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
+  /// to each vector element of Val. The sequence starts at StartIdx.
+  /// \p Opcode is relevant for FP induction variables.
+ virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd);
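A standalone sketch of the step-vector idea for the integer case, following the comment above (the fixed VF of 4 and the names are assumptions; the real helper also handles FP inductions via the Opcode parameter):

#include <array>

static std::array<long, 4> stepVectorSketch(long Broadcast, int StartIdx,
                                            long Step) {
  std::array<long, 4> Lanes{};
  for (int I = 0; I < 4; ++I)
    Lanes[I] = Broadcast + (StartIdx + I * Step); // lane I gets StartIdx + I*Step
  return Lanes;
}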
+
+ /// Compute scalar induction steps. \p ScalarIV is the scalar induction
+ /// variable on which to base the steps, \p Step is the size of the step, and
+ /// \p EntryVal is the value from the original loop that maps to the steps.
+ /// Note that \p EntryVal doesn't have to be an induction variable - it
+ /// can also be a truncate instruction.
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
+ const InductionDescriptor &ID);
+
+ /// Create a vector induction phi node based on an existing scalar one. \p
+ /// EntryVal is the value from the original loop that maps to the vector phi
+ /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+ /// truncate instruction, instead of widening the original IV, we widen a
+ /// version of the IV truncated to \p EntryVal's type.
+ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Value *Start,
Instruction *EntryVal);
-
- /// Returns true if an instruction \p I should be scalarized instead of
- /// vectorized for the chosen vectorization factor.
- bool shouldScalarizeInstruction(Instruction *I) const;
-
- /// Returns true if we should generate a scalar version of \p IV.
- bool needsScalarInduction(Instruction *IV) const;
-
- /// If there is a cast involved in the induction variable \p ID, which should
- /// be ignored in the vectorized loop body, this function records the
- /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
- /// cast. We had already proved that the casted Phi is equal to the uncasted
- /// Phi in the vectorized loop (under a runtime guard), and therefore
- /// there is no need to vectorize the cast - the same value can be used in the
- /// vector loop for both the Phi and the cast.
-  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
- /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- ///
- /// \p EntryVal is the value from the original loop that maps to the vector
- /// phi node and is used to distinguish what is the IV currently being
- /// processed - original one (if \p EntryVal is a phi corresponding to the
- /// original IV) or the "newly-created" one based on the proof mentioned above
- /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
- /// latter case \p EntryVal is a TruncInst and we must not record anything for
- /// that IV, but it's error-prone to expect callers of this routine to care
- /// about that, hence this explicit parameter.
- void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
- const Instruction *EntryVal,
- Value *VectorLoopValue,
- unsigned Part,
- unsigned Lane = UINT_MAX);
-
- /// Generate a shuffle sequence that will reverse the vector Vec.
- virtual Value *reverseVector(Value *Vec);
-
- /// Returns (and creates if needed) the original loop trip count.
- Value *getOrCreateTripCount(Loop *NewLoop);
-
- /// Returns (and creates if needed) the trip count of the widened loop.
- Value *getOrCreateVectorTripCount(Loop *NewLoop);
-
- /// Returns a bitcasted value to the requested vector type.
- /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
- Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL);
-
- /// Emit a bypass check to see if the vector trip count is zero, including if
- /// it overflows.
- void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
-
- /// Emit a bypass check to see if all of the SCEV assumptions we've
- /// had to make are correct.
- void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
-
- /// Emit bypass checks to check any memory assumptions we may have made.
- void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
- /// Compute the transformed value of Index at offset StartValue using step
- /// StepValue.
- /// For integer induction, returns StartValue + Index * StepValue.
- /// For pointer induction, returns StartValue[Index * StepValue].
- /// FIXME: The newly created binary instructions should contain nsw/nuw
- /// flags, which can be found from the original scalar operations.
- Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
- const DataLayout &DL,
- const InductionDescriptor &ID) const;
-
+
+ /// Returns true if an instruction \p I should be scalarized instead of
+ /// vectorized for the chosen vectorization factor.
+ bool shouldScalarizeInstruction(Instruction *I) const;
+
+ /// Returns true if we should generate a scalar version of \p IV.
+ bool needsScalarInduction(Instruction *IV) const;
+
+ /// If there is a cast involved in the induction variable \p ID, which should
+ /// be ignored in the vectorized loop body, this function records the
+ /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
+ /// cast. We had already proved that the casted Phi is equal to the uncasted
+ /// Phi in the vectorized loop (under a runtime guard), and therefore
+ /// there is no need to vectorize the cast - the same value can be used in the
+ /// vector loop for both the Phi and the cast.
+  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
+ /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
+ ///
+ /// \p EntryVal is the value from the original loop that maps to the vector
+ /// phi node and is used to distinguish what is the IV currently being
+ /// processed - original one (if \p EntryVal is a phi corresponding to the
+ /// original IV) or the "newly-created" one based on the proof mentioned above
+ /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
+ /// latter case \p EntryVal is a TruncInst and we must not record anything for
+ /// that IV, but it's error-prone to expect callers of this routine to care
+ /// about that, hence this explicit parameter.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ const Instruction *EntryVal,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
+
+ /// Generate a shuffle sequence that will reverse the vector Vec.
+ virtual Value *reverseVector(Value *Vec);
+
+ /// Returns (and creates if needed) the original loop trip count.
+ Value *getOrCreateTripCount(Loop *NewLoop);
+
+ /// Returns (and creates if needed) the trip count of the widened loop.
+ Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+ /// Returns a bitcasted value to the requested vector type.
+ /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
+ Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL);
+
+ /// Emit a bypass check to see if the vector trip count is zero, including if
+ /// it overflows.
+ void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+
+ /// Emit a bypass check to see if all of the SCEV assumptions we've
+ /// had to make are correct.
+ void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+
+ /// Emit bypass checks to check any memory assumptions we may have made.
+ void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
+ /// Compute the transformed value of Index at offset StartValue using step
+ /// StepValue.
+ /// For integer induction, returns StartValue + Index * StepValue.
+ /// For pointer induction, returns StartValue[Index * StepValue].
+ /// FIXME: The newly created binary instructions should contain nsw/nuw
+ /// flags, which can be found from the original scalar operations.
+ Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
+ const DataLayout &DL,
+ const InductionDescriptor &ID) const;
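For the integer-induction case documented above, the transformed index reduces to a single expression; a standalone sketch with illustrative types and name:

static long long transformedIndexSketch(long long StartValue, long long Index,
                                        long long StepValue) {
  // Integer induction: StartValue + Index * StepValue. Pointer inductions
  // instead index into StartValue by Index * StepValue.
  return StartValue + Index * StepValue;
}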
+
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
/// vector loop preheader, middle block and scalar preheader. Also
/// allocate a loop object for the new vector loop and return it.
@@ -759,137 +759,137 @@ protected:
/// the preheader of the completed vector loop.
BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
- /// Add additional metadata to \p To that was not present on \p Orig.
- ///
- /// Currently this is used to add the noalias annotations based on the
- /// inserted memchecks. Use this for instructions that are *cloned* into the
- /// vector loop.
- void addNewMetadata(Instruction *To, const Instruction *Orig);
-
- /// Add metadata from one instruction to another.
- ///
- /// This includes both the original MDs from \p From and additional ones (\see
- /// addNewMetadata). Use this for *newly created* instructions in the vector
- /// loop.
- void addMetadata(Instruction *To, Instruction *From);
-
- /// Similar to the previous function but it adds the metadata to a
- /// vector of instructions.
- void addMetadata(ArrayRef<Value *> To, Instruction *From);
-
+ /// Add additional metadata to \p To that was not present on \p Orig.
+ ///
+ /// Currently this is used to add the noalias annotations based on the
+ /// inserted memchecks. Use this for instructions that are *cloned* into the
+ /// vector loop.
+ void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart(){};
virtual void printDebugTracesAtEnd(){};
- /// The original loop.
- Loop *OrigLoop;
-
- /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
- /// dynamic knowledge to simplify SCEV expressions and converts them to a
- /// more usable form.
- PredicatedScalarEvolution &PSE;
-
- /// Loop Info.
- LoopInfo *LI;
-
- /// Dominator Tree.
- DominatorTree *DT;
-
- /// Alias Analysis.
- AAResults *AA;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Target Transform Info.
- const TargetTransformInfo *TTI;
-
- /// Assumption Cache.
- AssumptionCache *AC;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- /// LoopVersioning. It's only set up (non-null) if memchecks were
- /// used.
- ///
- /// This is currently only used to add no-alias metadata based on the
-  /// memchecks. The actual versioning is performed manually.
- std::unique_ptr<LoopVersioning> LVer;
-
- /// The vectorization SIMD factor to use. Each vector will have this many
- /// vector elements.
+ /// The original loop.
+ Loop *OrigLoop;
+
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+ /// dynamic knowledge to simplify SCEV expressions and converts them to a
+ /// more usable form.
+ PredicatedScalarEvolution &PSE;
+
+ /// Loop Info.
+ LoopInfo *LI;
+
+ /// Dominator Tree.
+ DominatorTree *DT;
+
+ /// Alias Analysis.
+ AAResults *AA;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// Assumption Cache.
+ AssumptionCache *AC;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ /// LoopVersioning. It's only set up (non-null) if memchecks were
+ /// used.
+ ///
+ /// This is currently only used to add no-alias metadata based on the
+ /// memchecks. The actual versioning is performed manually.
+ std::unique_ptr<LoopVersioning> LVer;
+
+ /// The vectorization SIMD factor to use. Each vector will have this many
+ /// vector elements.
ElementCount VF;
-
- /// The vectorization unroll factor to use. Each scalar is vectorized to this
- /// many different vector instructions.
- unsigned UF;
-
- /// The builder that we use.
- IRBuilder<> Builder;
-
- // --- Vectorization state ---
-
- /// The vector-loop preheader.
- BasicBlock *LoopVectorPreHeader;
-
- /// The scalar-loop preheader.
- BasicBlock *LoopScalarPreHeader;
-
- /// Middle Block between the vector and the scalar.
- BasicBlock *LoopMiddleBlock;
-
+
+ /// The vectorization unroll factor to use. Each scalar is vectorized to this
+ /// many different vector instructions.
+ unsigned UF;
+
+ /// The builder that we use.
+ IRBuilder<> Builder;
+
+ // --- Vectorization state ---
+
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
+
+ /// Middle Block between the vector and the scalar.
+ BasicBlock *LoopMiddleBlock;
+
/// The (unique) ExitBlock of the scalar loop. Note that
/// there can be multiple exiting edges reaching this block.
- BasicBlock *LoopExitBlock;
-
- /// The vector loop body.
- BasicBlock *LoopVectorBody;
-
- /// The scalar loop body.
- BasicBlock *LoopScalarBody;
-
- /// A list of all bypass blocks. The first block is the entry of the loop.
- SmallVector<BasicBlock *, 4> LoopBypassBlocks;
-
- /// The new Induction variable which was added to the new block.
- PHINode *Induction = nullptr;
-
- /// The induction variable of the old basic block.
- PHINode *OldInduction = nullptr;
-
- /// Maps values from the original loop to their corresponding values in the
- /// vectorized loop. A key value can map to either vector values, scalar
- /// values or both kinds of values, depending on whether the key was
- /// vectorized and scalarized.
- VectorizerValueMap VectorLoopValueMap;
-
- /// Store instructions that were predicated.
- SmallVector<Instruction *, 4> PredicatedInstructions;
-
- /// Trip count of the original loop.
- Value *TripCount = nullptr;
-
- /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
- Value *VectorTripCount = nullptr;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
- /// The profitability analysis.
- LoopVectorizationCostModel *Cost;
-
- // Record whether runtime checks are added.
- bool AddedSafetyChecks = false;
-
- // Holds the end values for each induction variable. We save the end values
- // so we can later fix up the external users of the induction variables.
- DenseMap<PHINode *, Value *> IVEndValues;
-
- // Vector of original scalar PHIs whose corresponding widened PHIs need to be
- // fixed up at the end of vector code generation.
- SmallVector<PHINode *, 8> OrigPHIsToFix;
+ BasicBlock *LoopExitBlock;
+
+ /// The vector loop body.
+ BasicBlock *LoopVectorBody;
+
+ /// The scalar loop body.
+ BasicBlock *LoopScalarBody;
+
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
+
+ /// The new Induction variable which was added to the new block.
+ PHINode *Induction = nullptr;
+
+ /// The induction variable of the old basic block.
+ PHINode *OldInduction = nullptr;
+
+ /// Maps values from the original loop to their corresponding values in the
+ /// vectorized loop. A key value can map to either vector values, scalar
+ /// values or both kinds of values, depending on whether the key was
+ /// vectorized and scalarized.
+ VectorizerValueMap VectorLoopValueMap;
+
+ /// Store instructions that were predicated.
+ SmallVector<Instruction *, 4> PredicatedInstructions;
+
+ /// Trip count of the original loop.
+ Value *TripCount = nullptr;
+
+ /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+ Value *VectorTripCount = nullptr;
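+ // For illustration: with TripCount = 100, VF = 4 and UF = 2 this is
+ // 100 - 100 % 8 = 96, so the vector loop covers 96 iterations and the
+ // scalar remainder loop handles the last 4.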
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel *Cost;
+
+ // Record whether runtime checks are added.
+ bool AddedSafetyChecks = false;
+
+ // Holds the end values for each induction variable. We save the end values
+ // so we can later fix up the external users of the induction variables.
+ DenseMap<PHINode *, Value *> IVEndValues;
+
+ // Vector of original scalar PHIs whose corresponding widened PHIs need to be
+ // fixed up at the end of vector code generation.
+ SmallVector<PHINode *, 8> OrigPHIsToFix;
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
@@ -898,30 +898,30 @@ protected:
// Whether this loop should be optimized for size based on profile guided size
// optimizations.
bool OptForSizeBasedOnProfile;
-};
-
-class InnerLoopUnroller : public InnerLoopVectorizer {
-public:
- InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
- LoopInfo *LI, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
- LoopVectorizationLegality *LVL,
+};
+
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+ InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
+ LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
BFI, PSI) {}
-
-private:
- Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd) override;
- Value *reverseVector(Value *Vec) override;
-};
-
+
+private:
+ Value *getBroadcastInstrs(Value *V) override;
+ Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd) override;
+ Value *reverseVector(Value *Vec) override;
+};
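+
+// A minimal sketch (hedged; the argument names reuse the constructor
+// declaration above): InnerLoopUnroller is InnerLoopVectorizer driven with a
+// fixed VF of 1, so only the unroll factor produces extra copies.
+//
+//   InnerLoopUnroller Unroller(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+//                              /*UnrollFactor=*/4, LVL, CM, BFI, PSI);
+//   // Each original scalar instruction is emitted four times (interleaved)
+//   // rather than being widened into a vector instruction.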
+
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
@@ -1044,88 +1044,88 @@ protected:
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;
};
-} // end namespace llvm
-
- /// Look for a meaningful debug location on the instruction or its
-/// operands.
-static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
- if (!I)
- return I;
-
- DebugLoc Empty;
- if (I->getDebugLoc() != Empty)
- return I;
-
- for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
- if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
- if (OpInst->getDebugLoc() != Empty)
- return OpInst;
- }
-
- return I;
-}
-
-void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
- const DILocation *DIL = Inst->getDebugLoc();
- if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst)) {
+} // end namespace llvm
+
+ /// Look for a meaningful debug location on the instruction or its
+/// operands.
+static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+ if (!I)
+ return I;
+
+ DebugLoc Empty;
+ if (I->getDebugLoc() != Empty)
+ return I;
+
+ for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
+ if (OpInst->getDebugLoc() != Empty)
+ return OpInst;
+ }
+
+ return I;
+}
+
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+ const DILocation *DIL = Inst->getDebugLoc();
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+ !isa<DbgInfoIntrinsic>(Inst)) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto NewDIL =
DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
- if (NewDIL)
- B.SetCurrentDebugLocation(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- }
- else
- B.SetCurrentDebugLocation(DIL);
- } else
- B.SetCurrentDebugLocation(DebugLoc());
-}
-
-/// Write a record \p DebugMsg about vectorization failure to the debug
-/// output stream. If \p I is passed, it is an instruction that prevents
-/// vectorization.
-#ifndef NDEBUG
-static void debugVectorizationFailure(const StringRef DebugMsg,
- Instruction *I) {
- dbgs() << "LV: Not vectorizing: " << DebugMsg;
- if (I != nullptr)
- dbgs() << " " << *I;
- else
- dbgs() << '.';
- dbgs() << '\n';
-}
-#endif
-
-/// Create an analysis remark that explains why vectorization failed
-///
-/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
-/// RemarkName is the identifier for the remark. If \p I is passed it is an
-/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
- StringRef RemarkName, Loop *TheLoop, Instruction *I) {
- Value *CodeRegion = TheLoop->getHeader();
- DebugLoc DL = TheLoop->getStartLoc();
-
- if (I) {
- CodeRegion = I->getParent();
- // If there is no debug location attached to the instruction, revert to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- }
-
- OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
- R << "loop not vectorized: ";
- return R;
-}
-
+ if (NewDIL)
+ B.SetCurrentDebugLocation(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
+ else
+ B.SetCurrentDebugLocation(DIL);
+ } else
+ B.SetCurrentDebugLocation(DebugLoc());
+}
+
+/// Write a record \p DebugMsg about vectorization failure to the debug
+/// output stream. If \p I is passed, it is an instruction that prevents
+/// vectorization.
+#ifndef NDEBUG
+static void debugVectorizationFailure(const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "LV: Not vectorizing: " << DebugMsg;
+ if (I != nullptr)
+ dbgs() << " " << *I;
+ else
+ dbgs() << '.';
+ dbgs() << '\n';
+}
+#endif
+
+/// Create an analysis remark that explains why vectorization failed
+///
+/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
+/// RemarkName is the identifier for the remark. If \p I is passed it is an
+/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
+/// the location of the remark. \return the remark object that can be
+/// streamed to.
+static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
+ StringRef RemarkName, Loop *TheLoop, Instruction *I) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+ // If there is no debug location attached to the instruction, revert to
+ // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
+
/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
assert(isa<ConstantInt>(Step) && "Expected an integer step");
@@ -1135,427 +1135,427 @@ static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
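
// Worked example (a minimal sketch assuming the semantics described above;
// the values are hypothetical): with Step = 2 and a fixed VF of 8,
// createStepForVF returns the constant 16; with a scalable VF of 8 it returns
// 16 scaled by the runtime vscale, materialized through B.CreateVScale().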
-namespace llvm {
-
-void reportVectorizationFailure(const StringRef DebugMsg,
- const StringRef OREMsg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
- LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
- LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
- ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
- ORETag, TheLoop, I) << OREMsg);
-}
-
-} // end namespace llvm
-
-#ifndef NDEBUG
-/// \return string containing a file name and a line # for the given loop.
-static std::string getDebugLocString(const Loop *L) {
- std::string Result;
- if (L) {
- raw_string_ostream OS(Result);
- if (const DebugLoc LoopDbgLoc = L->getStartLoc())
- LoopDbgLoc.print(OS);
- else
- // Just print the module name.
- OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
- OS.flush();
- }
- return Result;
-}
-#endif
-
-void InnerLoopVectorizer::addNewMetadata(Instruction *To,
- const Instruction *Orig) {
- // If the loop was versioned with memchecks, add the corresponding no-alias
- // metadata.
- if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
- LVer->annotateInstWithNoAlias(To, Orig);
-}
-
-void InnerLoopVectorizer::addMetadata(Instruction *To,
- Instruction *From) {
- propagateMetadata(To, From);
- addNewMetadata(To, From);
-}
-
-void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
- Instruction *From) {
- for (Value *V : To) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- addMetadata(I, From);
- }
-}
-
-namespace llvm {
-
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
-// lowered.
-enum ScalarEpilogueLowering {
-
- // The default: allowing scalar epilogues.
- CM_ScalarEpilogueAllowed,
-
- // Vectorization with OptForSize: don't allow epilogues.
- CM_ScalarEpilogueNotAllowedOptSize,
-
- // A special case of vectorization with OptForSize: loops with a very small
- // trip count are considered for vectorization under OptForSize, thereby
- // making sure the cost of their loop body is dominant, free of runtime
- // guards and scalar iteration overheads.
- CM_ScalarEpilogueNotAllowedLowTripLoop,
-
- // Loop hint predicate indicating an epilogue is undesired.
+namespace llvm {
+
+void reportVectorizationFailure(const StringRef DebugMsg,
+ const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
+ LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+ LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
+ ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
+ ORETag, TheLoop, I) << OREMsg);
+}
+
+} // end namespace llvm
+
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+ std::string Result;
+ if (L) {
+ raw_string_ostream OS(Result);
+ if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+ LoopDbgLoc.print(OS);
+ else
+ // Just print the module name.
+ OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+ OS.flush();
+ }
+ return Result;
+}
+#endif
+
+void InnerLoopVectorizer::addNewMetadata(Instruction *To,
+ const Instruction *Orig) {
+ // If the loop was versioned with memchecks, add the corresponding no-alias
+ // metadata.
+ if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ LVer->annotateInstWithNoAlias(To, Orig);
+}
+
+void InnerLoopVectorizer::addMetadata(Instruction *To,
+ Instruction *From) {
+ propagateMetadata(To, From);
+ addNewMetadata(To, From);
+}
+
+void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
+ Instruction *From) {
+ for (Value *V : To) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ addMetadata(I, From);
+ }
+}
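+
+// A minimal usage sketch (NewLoad/OldLoad are hypothetical names; the helpers
+// are the ones defined above): an instruction cloned into the vector body gets
+// the original metadata plus the memcheck-based noalias scopes.
+//
+//   Instruction *NewLoad = OldLoad->clone();
+//   Builder.Insert(NewLoad);
+//   addMetadata(NewLoad, OldLoad); // propagateMetadata + addNewMetadata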
+
+namespace llvm {
+
+// Loop vectorization cost-model hints how the scalar epilogue loop should be
+// lowered.
+enum ScalarEpilogueLowering {
+
+ // The default: allowing scalar epilogues.
+ CM_ScalarEpilogueAllowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ CM_ScalarEpilogueNotAllowedOptSize,
+
+ // A special case of vectorization with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ CM_ScalarEpilogueNotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
CM_ScalarEpilogueNotNeededUsePredicate,
// Directive indicating we must either tail fold or not vectorize
CM_ScalarEpilogueNotAllowedUsePredicate
-};
-
-/// LoopVectorizationCostModel - estimates the expected speedups due to
-/// vectorization.
- /// In many cases vectorization is not profitable. This can happen for
-/// a number of reasons. In this class we mainly attempt to predict the
-/// expected speedup/slowdowns due to the supported instruction set. We use the
-/// TargetTransformInfo to query the different backends for the cost of
-/// different operations.
-class LoopVectorizationCostModel {
-public:
- LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
- const TargetTransformInfo &TTI,
- const TargetLibraryInfo *TLI, DemandedBits *DB,
- AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints,
- InterleavedAccessInfo &IAI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
- TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {}
-
- /// \return An upper bound for the vectorization factor, or None if
- /// vectorization and interleaving should be avoided up front.
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+ /// In many cases vectorization is not profitable. This can happen for
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+ LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
+ PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI, DemandedBits *DB,
+ AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, const Function *F,
+ const LoopVectorizeHints *Hints,
+ InterleavedAccessInfo &IAI)
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
+ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+ Hints(Hints), InterleaveInfo(IAI) {}
+
+ /// \return An upper bound for the vectorization factor, or None if
+ /// vectorization and interleaving should be avoided up front.
Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
-
- /// \return True if runtime checks are required for vectorization, and false
- /// otherwise.
- bool runtimeChecksRequired();
-
- /// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
- /// then this vectorization factor will be selected if vectorization is
- /// possible.
+
+ /// \return True if runtime checks are required for vectorization, and false
+ /// otherwise.
+ bool runtimeChecksRequired();
+
+ /// \return The most profitable vectorization factor and the cost of that VF.
+ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
+ /// then this vectorization factor will be selected if vectorization is
+ /// possible.
VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF,
const LoopVectorizationPlanner &LVP);
-
- /// Setup cost-based decisions for user vectorization factor.
+
+ /// Setup cost-based decisions for user vectorization factor.
void selectUserVectorizationFactor(ElementCount UserVF) {
- collectUniformsAndScalars(UserVF);
- collectInstsToScalarize(UserVF);
- }
-
- /// \return The size (in bits) of the smallest and widest types in the code
- /// that needs to be vectorized. We ignore values that remain scalar such as
- /// 64 bit loop indices.
- std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
-
- /// \return The desired interleave count.
- /// If interleave count has been specified by metadata it will be returned.
- /// Otherwise, the interleave count is computed and returned. VF and LoopCost
- /// are the selected vectorization factor and the cost of the selected VF.
+ collectUniformsAndScalars(UserVF);
+ collectInstsToScalarize(UserVF);
+ }
+
+ /// \return The size (in bits) of the smallest and widest types in the code
+ /// that needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+
+ /// \return The desired interleave count.
+ /// If interleave count has been specified by metadata it will be returned.
+ /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+ /// are the selected vectorization factor and the cost of the selected VF.
unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
-
- /// A memory access instruction may be vectorized in more than one way.
- /// The form of the instruction after vectorization depends on cost.
- /// This function makes cost-based decisions for Load/Store instructions
- /// and collects them in a map. This decision map is used for building
- /// the lists of loop-uniform and loop-scalar instructions.
- /// The calculated cost is saved with the widening decision in order to
- /// avoid redundant calculations.
+
+ /// A memory access instruction may be vectorized in more than one way.
+ /// The form of the instruction after vectorization depends on cost.
+ /// This function makes cost-based decisions for Load/Store instructions
+ /// and collects them in a map. This decision map is used for building
+ /// the lists of loop-uniform and loop-scalar instructions.
+ /// The calculated cost is saved with the widening decision in order to
+ /// avoid redundant calculations.
void setCostBasedWideningDecision(ElementCount VF);
-
- /// A struct that represents some properties of the register usage
- /// of a loop.
- struct RegisterUsage {
- /// Holds the number of loop invariant values that are used in the loop.
- /// The key is ClassID of target-provided register class.
- SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
- /// Holds the maximum number of concurrent live intervals in the loop.
- /// The key is ClassID of target-provided register class.
- SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
- };
-
- /// \return Returns information about the register usages of the loop for the
- /// given vectorization factors.
+
+ /// A struct that represents some properties of the register usage
+ /// of a loop.
+ struct RegisterUsage {
+ /// Holds the number of loop invariant values that are used in the loop.
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
+ /// Holds the maximum number of concurrent live intervals in the loop.
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
+ };
+
+ /// \return Returns information about the register usages of the loop for the
+ /// given vectorization factors.
SmallVector<RegisterUsage, 8>
calculateRegisterUsage(ArrayRef<ElementCount> VFs);
-
- /// Collect values we want to ignore in the cost model.
- void collectValuesToIgnore();
-
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
+
/// Split reductions into those that happen in the loop, and those that happen
/// outside. In-loop reductions are collected into InLoopReductionChains.
void collectInLoopReductions();
- /// \returns The smallest bitwidth each instruction can be represented with.
- /// The vector equivalents of these instructions should be truncated to this
- /// type.
- const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
- return MinBWs;
- }
-
- /// \returns True if it is more profitable to scalarize instruction \p I for
- /// vectorization factor \p VF.
+ /// \returns The smallest bitwidth each instruction can be represented with.
+ /// The vector equivalents of these instructions should be truncated to this
+ /// type.
+ const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
+ return MinBWs;
+ }
+
+ /// \returns True if it is more profitable to scalarize instruction \p I for
+ /// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
assert(VF.isVector() &&
"Profitable to scalarize relevant only for VF > 1.");
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
- auto Scalars = InstsToScalarize.find(VF);
- assert(Scalars != InstsToScalarize.end() &&
- "VF not yet analyzed for scalarization profitability");
- return Scalars->second.find(I) != Scalars->second.end();
- }
-
- /// Returns true if \p I is known to be uniform after vectorization.
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto Scalars = InstsToScalarize.find(VF);
+ assert(Scalars != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return Scalars->second.find(I) != Scalars->second.end();
+ }
+
+ /// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
if (VF.isScalar())
- return true;
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
- auto UniformsPerVF = Uniforms.find(VF);
- assert(UniformsPerVF != Uniforms.end() &&
- "VF not yet analyzed for uniformity");
- return UniformsPerVF->second.count(I);
- }
-
- /// Returns true if \p I is known to be scalar after vectorization.
+ return true;
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto UniformsPerVF = Uniforms.find(VF);
+ assert(UniformsPerVF != Uniforms.end() &&
+ "VF not yet analyzed for uniformity");
+ return UniformsPerVF->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
if (VF.isScalar())
- return true;
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
- auto ScalarsPerVF = Scalars.find(VF);
- assert(ScalarsPerVF != Scalars.end() &&
- "Scalar values are not calculated for VF");
- return ScalarsPerVF->second.count(I);
- }
-
- /// \returns True if instruction \p I can be truncated to a smaller bitwidth
- /// for vectorization factor \p VF.
+ return true;
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto ScalarsPerVF = Scalars.find(VF);
+ assert(ScalarsPerVF != Scalars.end() &&
+ "Scalar values are not calculated for VF");
+ return ScalarsPerVF->second.count(I);
+ }
+
+ /// \returns True if instruction \p I can be truncated to a smaller bitwidth
+ /// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
- !isProfitableToScalarize(I, VF) &&
- !isScalarAfterVectorization(I, VF);
- }
-
- /// Decision that was taken during cost calculation for memory instruction.
- enum InstWidening {
- CM_Unknown,
- CM_Widen, // For consecutive accesses with stride +1.
- CM_Widen_Reverse, // For consecutive accesses with stride -1.
- CM_Interleave,
- CM_GatherScatter,
- CM_Scalarize
- };
-
- /// Save vectorization decision \p W and \p Cost taken by the cost model for
- /// instruction \p I and vector width \p VF.
+ !isProfitableToScalarize(I, VF) &&
+ !isScalarAfterVectorization(I, VF);
+ }
+
+ /// Decision that was taken during cost calculation for memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen, // For consecutive accesses with stride +1.
+ CM_Widen_Reverse, // For consecutive accesses with stride -1.
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
- }
-
- /// Save vectorization decision \p W and \p Cost taken by the cost model for
- /// interleaving group \p Grp and vector width \p VF.
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
ElementCount VF, InstWidening W,
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
- /// Broadcast this decision to all instructions inside the group.
- /// But the cost will be assigned to one instruction only.
- for (unsigned i = 0; i < Grp->getFactor(); ++i) {
- if (auto *I = Grp->getMember(i)) {
- if (Grp->getInsertPos() == I)
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
- else
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
- }
- }
- }
-
- /// Return the cost model decision for the given instruction \p I and vector
- /// width \p VF. Return CM_Unknown if this instruction did not pass
- /// through the cost modeling.
+ /// Broadcast this decision to all instructions inside the group.
+ /// But the cost will be assigned to one instruction only.
+ for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+ if (auto *I = Grp->getMember(i)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ else
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ }
+ }
+ }
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF to be a vector VF");
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return CM_GatherScatter;
-
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return CM_GatherScatter;
+
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
- auto Itr = WideningDecisions.find(InstOnVF);
- if (Itr == WideningDecisions.end())
- return CM_Unknown;
- return Itr->second.first;
- }
-
- /// Return the vectorization cost for the given instruction \p I and vector
- /// width \p VF.
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF >=2");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
- assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
- "The cost is not calculated");
- return WideningDecisions[InstOnVF].second;
- }
-
- /// Return True if instruction \p I is an optimizable truncate whose operand
- /// is an induction variable. Such a truncate will be removed by adding a new
- /// induction variable with the destination type.
+ assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
+ "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
- // If the instruction is not a truncate, return false.
- auto *Trunc = dyn_cast<TruncInst>(I);
- if (!Trunc)
- return false;
-
- // Get the source and destination types of the truncate.
- Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
- Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
-
- // If the truncate is free for the given types, return false. Replacing a
- // free truncate with an induction variable would add an induction variable
- // update instruction to each iteration of the loop. We exclude from this
- // check the primary induction variable since it will need an update
- // instruction regardless.
- Value *Op = Trunc->getOperand(0);
- if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
- return false;
-
- // If the truncated value is not an induction variable, return false.
- return Legal->isInductionPhi(Op);
- }
-
- /// Collects the instructions to scalarize for each predicated instruction in
- /// the loop.
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionPhi(Op);
+ }
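+
+ // For illustration (hypothetical IR): %t = trunc i64 %iv to i32, where %iv
+ // is a secondary induction PHI and the i64 -> i32 truncate is not free on
+ // the target, is optimizable here; the truncate is later replaced by a new
+ // i32 induction variable.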
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
void collectInstsToScalarize(ElementCount VF);
-
- /// Collect Uniform and Scalar values for the given \p VF.
- /// The sets depend on CM decision for Load/Store instructions
- /// that may be vectorized as interleave, gather-scatter or scalarized.
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
void collectUniformsAndScalars(ElementCount VF) {
- // Do the analysis once.
+ // Do the analysis once.
if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
- return;
- setCostBasedWideningDecision(VF);
- collectLoopUniforms(VF);
- collectLoopScalars(VF);
- }
-
- /// Returns true if the target machine supports masked store operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
- return Legal->isConsecutivePtr(Ptr) &&
- TTI.isLegalMaskedStore(DataType, Alignment);
- }
-
- /// Returns true if the target machine supports masked load operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
- return Legal->isConsecutivePtr(Ptr) &&
- TTI.isLegalMaskedLoad(DataType, Alignment);
- }
-
- /// Returns true if the target machine supports masked scatter operation
- /// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
- return TTI.isLegalMaskedScatter(DataType, Alignment);
- }
-
- /// Returns true if the target machine supports masked gather operation
- /// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType, Align Alignment) {
- return TTI.isLegalMaskedGather(DataType, Alignment);
- }
-
- /// Returns true if the target machine can represent \p V as a masked gather
- /// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
- bool LI = isa<LoadInst>(V);
- bool SI = isa<StoreInst>(V);
- if (!LI && !SI)
- return false;
- auto *Ty = getMemInstValueType(V);
- Align Align = getLoadStoreAlignment(V);
- return (LI && isLegalMaskedGather(Ty, Align)) ||
- (SI && isLegalMaskedScatter(Ty, Align));
- }
-
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- /// If a non-zero VF has been calculated, we check if I will be scalarized
- /// with predication for that VF.
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedStore(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedLoad(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
+ return TTI.isLegalMaskedScatter(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType, Align Alignment) {
+ return TTI.isLegalMaskedGather(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getMemInstValueType(V);
+ Align Align = getLoadStoreAlignment(V);
+ return (LI && isLegalMaskedGather(Ty, Align)) ||
+ (SI && isLegalMaskedScatter(Ty, Align));
+ }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ /// If a non-zero VF has been calculated, we check if I will be scalarized
+ /// with predication for that VF.
bool isScalarWithPredication(Instruction *I,
ElementCount VF = ElementCount::getFixed(1));
-
- // Returns true if \p I is an instruction that will be predicated either
- // through scalar predication or masked load/store or masked gather/scatter.
- // Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
- return false;
- // Loads and stores that need some form of masked operation are predicated
- // instructions.
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- return Legal->isMaskRequired(I);
- return isScalarWithPredication(I);
- }
-
- /// Returns true if \p I is a memory instruction with consecutive memory
- /// access that can be widened.
+
+ // Returns true if \p I is an instruction that will be predicated either
+ // through scalar predication or masked load/store or masked gather/scatter.
+ // Superset of instructions that return true for isScalarWithPredication.
+ bool isPredicatedInst(Instruction *I) {
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ // Loads and stores that need some form of masked operation are predicated
+ // instructions.
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ return Legal->isMaskRequired(I);
+ return isScalarWithPredication(I);
+ }
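+
+ // For example: a store in a block that needs predication and for which
+ // Legal->isMaskRequired() holds is predicated, as is a udiv that
+ // isScalarWithPredication() reports because it might divide by zero.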
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
bool
memoryInstructionCanBeWidened(Instruction *I,
ElementCount VF = ElementCount::getFixed(1));
-
- /// Returns true if \p I is a memory instruction in an interleaved-group
- /// of memory accesses that can be vectorized with wide vector loads/stores
- /// and shuffles.
+
+ /// Returns true if \p I is a memory instruction in an interleaved-group
+ /// of memory accesses that can be vectorized with wide vector loads/stores
+ /// and shuffles.
bool
interleavedAccessCanBeWidened(Instruction *I,
ElementCount VF = ElementCount::getFixed(1));
-
- /// Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
- return InterleaveInfo.isInterleaved(Instr);
- }
-
- /// Get the interleaved access group that \p Instr belongs to.
- const InterleaveGroup<Instruction> *
- getInterleavedAccessGroup(Instruction *Instr) {
- return InterleaveInfo.getInterleaveGroup(Instr);
- }
-
+
+ /// Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup<Instruction> *
+ getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
- bool requiresScalarEpilogue() const {
+ bool requiresScalarEpilogue() const {
if (!isScalarEpilogueAllowed())
return false;
// If we might exit from anywhere but the latch, must run the exiting
@@ -1563,21 +1563,21 @@ public:
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
return true;
return InterleaveInfo.requiresScalarEpilogue();
- }
-
- /// Returns true if a scalar epilogue is not allowed due to optsize or a
- /// loop hint annotation.
- bool isScalarEpilogueAllowed() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
- }
-
- /// Returns true if all loop blocks should be masked to fold tail loop.
- bool foldTailByMasking() const { return FoldTailByMasking; }
-
- bool blockNeedsPredication(BasicBlock *BB) {
- return foldTailByMasking() || Legal->blockNeedsPredication(BB);
- }
-
+ }
+
+ /// Returns true if a scalar epilogue is not allowed due to optsize or a
+ /// loop hint annotation.
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+ }
+
+ /// Returns true if all loop blocks should be masked to fold tail loop.
+ bool foldTailByMasking() const { return FoldTailByMasking; }
+
+ bool blockNeedsPredication(BasicBlock *BB) {
+ return foldTailByMasking() || Legal->blockNeedsPredication(BB);
+ }
+
/// A SmallMapVector to store the InLoop reduction op chains, mapping phi
/// nodes to the chain of instructions representing the reductions. Uses a
/// MapVector to ensure deterministic iteration order.
@@ -1594,143 +1594,143 @@ public:
return InLoopReductionChains.count(Phi);
}
- /// Estimate cost of an intrinsic call instruction CI if it were vectorized
- /// with factor VF. Return the cost of the instruction, including
- /// scalarization overhead if it's needed.
+ /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+ /// with factor VF. Return the cost of the instruction, including
+ /// scalarization overhead if it's needed.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
-
- /// Estimate cost of a call instruction CI if it were vectorized with factor
- /// VF. Return the cost of the instruction, including scalarization overhead
- /// if it's needed. The flag NeedToScalarize shows if the call needs to be
- /// scalarized, i.e. either a vector version isn't available or it is too
- /// expensive.
+
+ /// Estimate cost of a call instruction CI if it were vectorized with factor
+ /// VF. Return the cost of the instruction, including scalarization overhead
+ /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+ /// scalarized, i.e. either a vector version isn't available or it is too
+ /// expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize);
-
- /// Invalidates decisions already taken by the cost model.
- void invalidateCostModelingDecisions() {
- WideningDecisions.clear();
- Uniforms.clear();
- Scalars.clear();
- }
-
-private:
- unsigned NumPredStores = 0;
-
- /// \return An upper bound for the vectorization factor, a power-of-2 larger
- /// than zero. One is returned if vectorization should best be avoided due
- /// to cost.
+
+ /// Invalidates decisions already taken by the cost model.
+ void invalidateCostModelingDecisions() {
+ WideningDecisions.clear();
+ Uniforms.clear();
+ Scalars.clear();
+ }
+
+private:
+ unsigned NumPredStores = 0;
+
+ /// \return An upper bound for the vectorization factor, a power-of-2 larger
+ /// than zero. One is returned if vectorization should best be avoided due
+ /// to cost.
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF);
-
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
+
+ /// The vectorization cost is a combination of the cost itself and a boolean
+ /// indicating whether any of the contributing operations will actually
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
- /// Returns the expected execution cost. The unit of the cost does
- /// not matter because we use the 'cost' units to compare different
- /// vector widths. The cost that is returned is *not* normalized by
- /// the factor width.
+
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
VectorizationCostTy expectedCost(ElementCount VF);
-
- /// Returns the execution time cost of an instruction for a given vector
- /// width. Vector width of one means scalar.
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost-computation logic from getInstructionCost which provides
- /// the vector type as an output parameter.
+
+ /// The cost-computation logic from getInstructionCost which provides
+ /// the vector type as an output parameter.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy);
-
+
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
Type *VectorTy,
TTI::TargetCostKind CostKind);
- /// Calculate vectorization cost of memory instruction \p I.
+ /// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for scalarized memory instruction.
+
+ /// The cost computation for scalarized memory instruction.
InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for interleaving group of memory instructions.
+
+ /// The cost computation for interleaving group of memory instructions.
InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for Gather/Scatter instruction.
+
+ /// The cost computation for Gather/Scatter instruction.
InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for widening instruction \p I with consecutive
- /// memory access.
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
-
- /// The cost calculation for Load/Store instruction \p I with uniform pointer -
- /// Load: scalar load + broadcast.
- /// Store: scalar store + (loop invariant value stored? 0 : extract of last
- /// element)
+
+ /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+ /// Load: scalar load + broadcast.
+ /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+ /// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
-
- /// Estimate the overhead of scalarizing an instruction. This is a
- /// convenience wrapper for the type-based getScalarizationOverhead API.
+
+ /// Estimate the overhead of scalarizing an instruction. This is a
+ /// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
-
- /// Returns whether the instruction is a load or store and will be emitted
- /// as a vector operation.
- bool isConsecutiveLoadOrStore(Instruction *I);
-
- /// Returns true if an artificially high cost for emulated masked memrefs
- /// should be used.
- bool useEmulatedMaskMemRefHack(Instruction *I);
-
- /// Map of scalar integer values to the smallest bitwidth they can be legally
- /// represented as. The vector equivalents of these values should be truncated
- /// to this type.
- MapVector<Instruction *, uint64_t> MinBWs;
-
- /// A type representing the costs for instructions if they were to be
- /// scalarized rather than vectorized. The entries are Instruction-Cost
- /// pairs.
+
+ /// Returns whether the instruction is a load or store and will be emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I);
+
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction *, uint64_t> MinBWs;
+
+ /// A type representing the costs for instructions if they were to be
+ /// scalarized rather than vectorized. The entries are Instruction-Cost
+ /// pairs.
using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
-
- /// A set containing all BasicBlocks that are known to be present after
- /// vectorization as a predicated block.
- SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
-
- /// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
- /// aliasing/dependence checks fail, or to handle the tail/remainder
- /// iterations when the trip count is unknown or doesn't divide by the VF,
- /// or as a peel-loop to handle gaps in interleave-groups.
- /// Under optsize and when the trip count is very small we don't allow any
- /// iterations to execute in the scalar loop.
- ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-
- /// All blocks of loop are to be masked to fold tail of scalar iterations.
- bool FoldTailByMasking = false;
-
- /// A map holding scalar costs for different vectorization factors. The
- /// presence of a cost for an instruction in the mapping indicates that the
- /// instruction will be scalarized when vectorizing with the associated
- /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+
+ /// A set containing all BasicBlocks that are known to be present after
+ /// vectorization as a predicated block.
+ SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
+ /// Records whether it is allowed to have the original scalar loop execute at
+ /// least once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+ /// iterations when the trip count is unknown or doesn't divide by the VF,
+ /// or as a peel-loop to handle gaps in interleave-groups.
+ /// Under optsize and when the trip count is very small we don't allow any
+ /// iterations to execute in the scalar loop.
+ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+
+ /// All blocks of loop are to be masked to fold tail of scalar iterations.
+ bool FoldTailByMasking = false;
+
+ /// A map holding scalar costs for different vectorization factors. The
+ /// presence of a cost for an instruction in the mapping indicates that the
+ /// instruction will be scalarized when vectorizing with the associated
+ /// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
-
- /// Holds the instructions known to be uniform after vectorization.
- /// The data is collected per VF.
+
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
-
- /// Holds the instructions known to be scalar after vectorization.
- /// The data is collected per VF.
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
-
- /// Holds the instructions (address computations) that are forced to be
- /// scalarized.
+
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
-
+
/// PHINodes of the reductions that should be expanded in-loop along with
/// their associated chains of reduction operations, in program order from top
/// (PHI) to bottom
@@ -1742,64 +1742,64 @@ private:
/// without having to loop through InLoopReductionChains.
DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
- /// Returns the expected difference in cost from scalarizing the expression
- /// feeding a predicated instruction \p PredInst. The instructions to
- /// scalarize and their scalar costs are collected in \p ScalarCosts. A
- /// non-negative return value implies the expression will be scalarized.
- /// Currently, only single-use chains are considered for scalarization.
- int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+ /// Returns the expected difference in cost from scalarizing the expression
+ /// feeding a predicated instruction \p PredInst. The instructions to
+ /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+ /// non-negative return value implies the expression will be scalarized.
+ /// Currently, only single-use chains are considered for scalarization.
+ int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
ElementCount VF);
-
- /// Collect the instructions that are uniform after vectorization. An
- /// instruction is uniform if we represent it with a single scalar value in
- /// the vectorized loop corresponding to each vector iteration. Examples of
- /// uniform instructions include pointer operands of consecutive or
- /// interleaved memory accesses. Note that although uniformity implies an
- /// instruction will be scalar, the reverse is not true. In general, a
- /// scalarized instruction will be represented by VF scalar values in the
- /// vectorized loop, each corresponding to an iteration of the original
- /// scalar loop.
+
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
void collectLoopUniforms(ElementCount VF);
-
- /// Collect the instructions that are scalar after vectorization. An
- /// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
void collectLoopScalars(ElementCount VF);
-
- /// Keeps cost model vectorization decision and cost for instructions.
- /// Right now it is used for memory instructions only.
+
+ /// Keeps cost model vectorization decision and cost for instructions.
+ /// Right now it is used for memory instructions only.
using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
std::pair<InstWidening, InstructionCost>>;
-
- DecisionList WideningDecisions;
-
- /// Returns true if \p V is expected to be vectorized and it needs to be
- /// extracted.
+
+ DecisionList WideningDecisions;
+
+ /// Returns true if \p V is expected to be vectorized and it needs to be
+ /// extracted.
bool needsExtract(Value *V, ElementCount VF) const {
- Instruction *I = dyn_cast<Instruction>(V);
+ Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
TheLoop->isLoopInvariant(I))
- return false;
-
- // Assume we can vectorize V (and hence we need extraction) if the
- // scalars are not computed yet. This can happen, because it is called
- // via getScalarizationOverhead from setCostBasedWideningDecision, before
- // the scalars are collected. That should be a safe assumption in most
- // cases, because we check if the operands have vectorizable types
- // beforehand in LoopVectorizationLegality.
- return Scalars.find(VF) == Scalars.end() ||
- !isScalarAfterVectorization(I, VF);
- };
-
- /// Returns a range containing only operands needing to be extracted.
- SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+ return false;
+
+ // Assume we can vectorize V (and hence we need extraction) if the
+ // scalars are not computed yet. This can happen, because it is called
+ // via getScalarizationOverhead from setCostBasedWideningDecision, before
+ // the scalars are collected. That should be a safe assumption in most
+ // cases, because we check if the operands have vectorizable types
+ // beforehand in LoopVectorizationLegality.
+ return Scalars.find(VF) == Scalars.end() ||
+ !isScalarAfterVectorization(I, VF);
+ };
+
+ /// Returns a range containing only operands needing to be extracted.
+ SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
ElementCount VF) {
- return SmallVector<Value *, 4>(make_filter_range(
- Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
- }
-
+ return SmallVector<Value *, 4>(make_filter_range(
+ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+ }
+
/// Determines if we have the infrastructure to vectorize loop \p L and its
/// epilogue, assuming the main loop is vectorized by \p VF.
bool isCandidateForEpilogueVectorization(const Loop &L,
@@ -1810,539 +1810,539 @@ private:
/// \p VF is the vectorization factor chosen for the original loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
-public:
- /// The loop that we evaluate.
- Loop *TheLoop;
-
- /// Predicated scalar evolution analysis.
- PredicatedScalarEvolution &PSE;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Vectorization legality.
- LoopVectorizationLegality *Legal;
-
- /// Vector target information.
- const TargetTransformInfo &TTI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Demanded bits analysis.
- DemandedBits *DB;
-
- /// Assumption cache.
- AssumptionCache *AC;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- const Function *TheFunction;
-
- /// Loop Vectorize Hint.
- const LoopVectorizeHints *Hints;
-
- /// The interleave access information contains groups of interleaved accesses
- /// with the same stride and close to each other.
- InterleavedAccessInfo &InterleaveInfo;
-
- /// Values to ignore in the cost model.
- SmallPtrSet<const Value *, 16> ValuesToIgnore;
-
- /// Values to ignore in the cost model when VF > 1.
- SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+public:
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+
+ /// Vector target information.
+ const TargetTransformInfo &TTI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Demanded bits analysis.
+ DemandedBits *DB;
+
+ /// Assumption cache.
+ AssumptionCache *AC;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ const Function *TheFunction;
+
+ /// Loop Vectorize Hint.
+ const LoopVectorizeHints *Hints;
+
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo &InterleaveInfo;
+
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
/// Profitable vector factors.
SmallVector<VectorizationFactor, 8> ProfitableVFs;
-};
-
-} // end namespace llvm
-
-// Return true if \p OuterLp is an outer loop annotated with hints for explicit
-// vectorization. The loop needs to be annotated with #pragma omp simd
-// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
-// vector length information is not provided, vectorization is not considered
-// explicit. Interleave hints are not allowed either. These limitations will be
-// relaxed in the future.
-// Please, note that we are currently forced to abuse the pragma 'clang
-// vectorize' semantics. This pragma provides *auto-vectorization hints*
-// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
-// provides *explicit vectorization hints* (LV can bypass legal checks and
-// assume that vectorization is legal). However, both hints are implemented
-// using the same metadata (llvm.loop.vectorize, processed by
-// LoopVectorizeHints). This will be fixed in the future when the native IR
-// representation for pragma 'omp simd' is introduced.
-static bool isExplicitVecOuterLoop(Loop *OuterLp,
- OptimizationRemarkEmitter *ORE) {
+};
+
+} // end namespace llvm
+
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please, note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
assert(!OuterLp->isInnermost() && "This is not an outer loop");
- LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
-
- // Only outer loops with an explicit vectorization hint are supported.
- // Unannotated outer loops are ignored.
- if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
- return false;
-
- Function *Fn = OuterLp->getHeader()->getParent();
- if (!Hints.allowVectorization(Fn, OuterLp,
- true /*VectorizeOnlyWhenForced*/)) {
- LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
- return false;
- }
-
- if (Hints.getInterleave() > 1) {
- // TODO: Interleave support is future work.
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
- "outer loops.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
- return true;
-}
-
-static void collectSupportedLoops(Loop &L, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE,
- SmallVectorImpl<Loop *> &V) {
- // Collect inner loops and outer loops without irreducible control flow. For
- // now, only collect outer loops that have explicit vectorization hints. If we
- // are stress testing the VPlan H-CFG construction, we collect the outermost
- // loop of every loop nest.
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
+
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp,
+ true /*VectorizeOnlyWhenForced*/)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
+ }
+
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ return true;
+}
+
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints. If we
+ // are stress testing the VPlan H-CFG construction, we collect the outermost
+ // loop of every loop nest.
if (L.isInnermost() || VPlanBuildStressTest ||
- (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
- LoopBlocksRPO RPOT(&L);
- RPOT.perform(LI);
- if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
- V.push_back(&L);
- // TODO: Collect inner loops inside marked outer loops in case
- // vectorization fails for the outer loop. Do not invoke
- // 'containsIrreducibleCFG' again for inner loops when the outer loop is
- // already known to be reducible. We can use an inherited attribute for
- // that.
- return;
- }
- }
- for (Loop *InnerL : L)
- collectSupportedLoops(*InnerL, LI, ORE, V);
-}
-
-namespace {
-
-/// The LoopVectorize Pass.
-struct LoopVectorize : public FunctionPass {
- /// Pass identification, replacement for typeid
- static char ID;
-
- LoopVectorizePass Impl;
-
- explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
- bool VectorizeOnlyWhenForced = false)
- : FunctionPass(ID),
- Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
- initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
-
- return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA, *ORE, PSI).MadeAnyChange;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
-
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
-// LoopVectorizationCostModel and LoopVectorizationPlanner.
-//===----------------------------------------------------------------------===//
-
-Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // We need to place the broadcast of invariant variables outside the loop,
- // but only if it's proven safe to do so. Else, broadcast will be inside
- // vector loop body.
- Instruction *Instr = dyn_cast<Instruction>(V);
- bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
- (!Instr ||
- DT->dominates(Instr->getParent(), LoopVectorPreHeader));
- // Place the code for broadcasting invariant variables in the new preheader.
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (SafeToHoist)
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
-
- // Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
-
- return Shuf;
-}
-
-void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+ (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
+ V.push_back(&L);
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
+ }
+ for (Loop *InnerL : L)
+ collectSupportedLoops(*InnerL, LI, ORE, V);
+}
+
+namespace {
+
+/// The LoopVectorize Pass.
+struct LoopVectorize : public FunctionPass {
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ LoopVectorizePass Impl;
+
+ explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
+ bool VectorizeOnlyWhenForced = false)
+ : FunctionPass(ID),
+ Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
+ initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
+ GetLAA, *ORE, PSI).MadeAnyChange;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
+
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
+ // vectorization. Until this is addressed, mark these analyses as preserved
+ // only for non-VPlan-native path.
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
+//===----------------------------------------------------------------------===//
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
+ // We need to place the broadcast of invariant variables outside the loop,
+ // but only if it's proven safe to do so. Else, broadcast will be inside
+ // vector loop body.
+ Instruction *Instr = dyn_cast<Instruction>(V);
+ bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
+ (!Instr ||
+ DT->dominates(Instr->getParent(), LoopVectorPreHeader));
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (SafeToHoist)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+}
+
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
Instruction *EntryVal) {
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // Construct the initial value of the vector IV in the vector loop preheader
- auto CurrIP = Builder.saveIP();
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- if (isa<TruncInst>(EntryVal)) {
- assert(Start->getType()->isIntegerTy() &&
- "Truncation requires an integer type");
- auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = Builder.CreateTrunc(Step, TruncType);
- Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
- }
- Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
- Value *SteppedStart =
- getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
-
- // We create vector phi nodes for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (Step->getType()->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = II.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Multiply the vectorization factor by the step using integer or
- // floating-point arithmetic as appropriate.
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+ Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SteppedStart =
+ getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = II.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
Value *ConstVF =
getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
- Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
-
- // Create a vector splat to use in the induction update.
- //
- // FIXME: If the step is non-constant, we create the vector splat with
- // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
- // handle a constant vector splat.
+ Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *SplatVF = isa<Constant>(Mul)
? ConstantVector::getSplat(VF, cast<Constant>(Mul))
: Builder.CreateVectorSplat(VF, Mul);
- Builder.restoreIP(CurrIP);
-
- // We may need to add the step a number of times, depending on the unroll
- // factor. The last of those goes into the PHI.
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
- &*LoopVectorBody->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
- Instruction *LastInduction = VecInd;
- for (unsigned Part = 0; Part < UF; ++Part) {
- VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
-
- if (isa<TruncInst>(EntryVal))
- addMetadata(LastInduction, EntryVal);
- recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
-
- LastInduction = cast<Instruction>(addFastMathFlag(
- Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
- }
-
- // Move the last step to the end of the latch block. This ensures consistent
- // placement of all induction updates.
- auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
- auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
- auto *ICmp = cast<Instruction>(Br->getCondition());
- LastInduction->moveBefore(ICmp);
- LastInduction->setName("vec.ind.next");
-
- VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
- VecInd->addIncoming(LastInduction, LoopVectorLatch);
-}
-
-bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
- return Cost->isScalarAfterVectorization(I, VF) ||
- Cost->isProfitableToScalarize(I, VF);
-}
-
-bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
- if (shouldScalarizeInstruction(IV))
- return true;
- auto isScalarInst = [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
- };
- return llvm::any_of(IV->users(), isScalarInst);
-}
-
-void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopVal, unsigned Part, unsigned Lane) {
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // This induction variable is not the phi from the original loop but the
- // newly-created IV based on the proof that casted Phi is equal to the
- // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
- // re-uses the same InductionDescriptor that original IV uses but we don't
- // have to do any recording in this case - that is done when original IV is
- // processed.
- if (isa<TruncInst>(EntryVal))
- return;
-
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (Casts.empty())
- return;
- // Only the first Cast instruction in the Casts vector is of interest.
- // The rest of the Casts (if exist) have no uses outside the
- // induction update chain itself.
- Instruction *CastInst = *Casts.begin();
- if (Lane < UINT_MAX)
- VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
- else
- VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
-}
-
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*LoopVectorBody->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+
+ if (isa<TruncInst>(EntryVal))
+ addMetadata(LastInduction, EntryVal);
+ recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
+
+ LastInduction = cast<Instruction>(addFastMathFlag(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ }
+
+ // Move the last step to the end of the latch block. This ensures consistent
+ // placement of all induction updates.
+ auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+ auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
+ auto *ICmp = cast<Instruction>(Br->getCondition());
+ LastInduction->moveBefore(ICmp);
+ LastInduction->setName("vec.ind.next");
+
+ VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+ VecInd->addIncoming(LastInduction, LoopVectorLatch);
+}
+
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+ return Cost->isScalarAfterVectorization(I, VF) ||
+ Cost->isProfitableToScalarize(I, VF);
+}
+
+bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
+ if (shouldScalarizeInstruction(IV))
+ return true;
+ auto isScalarInst = [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
+ };
+ return llvm::any_of(IV->users(), isScalarInst);
+}
+
+void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
+ const InductionDescriptor &ID, const Instruction *EntryVal,
+ Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // This induction variable is not the phi from the original loop but the
+ // newly-created IV based on the proof that casted Phi is equal to the
+ // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
+ // re-uses the same InductionDescriptor that original IV uses but we don't
+ // have to do any recording in this case - that is done when original IV is
+ // processed.
+ if (isa<TruncInst>(EntryVal))
+ return;
+
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (Casts.empty())
+ return;
+ // Only the first Cast instruction in the Casts vector is of interest.
+ // The rest of the Casts (if exist) have no uses outside the
+ // induction update chain itself.
+ Instruction *CastInst = *Casts.begin();
+ if (Lane < UINT_MAX)
+ VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
+ else
+ VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
+}
+
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
TruncInst *Trunc) {
- assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
- "Primary induction variable must have an integer type");
-
- auto II = Legal->getInductionVars().find(IV);
- assert(II != Legal->getInductionVars().end() && "IV is not an induction");
-
- auto ID = II->second;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
-
- // The value from the original loop to which we are mapping the new induction
- // variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
-
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-
- // Generate code for the induction step. Note that induction steps are
- // required to be loop-invariant
- auto CreateStepValue = [&](const SCEV *Step) -> Value * {
- assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
- "Induction step should be loop invariant");
- if (PSE.getSE()->isSCEVable(IV->getType())) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- return Exp.expandCodeFor(Step, Step->getType(),
- LoopVectorPreHeader->getTerminator());
- }
- return cast<SCEVUnknown>(Step)->getValue();
- };
-
- // The scalar value to broadcast. This is derived from the canonical
- // induction variable. If a truncation type is given, truncate the canonical
- // induction variable and step. Otherwise, derive these values from the
- // induction descriptor.
- auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = Induction;
- if (IV != OldInduction) {
- ScalarIV = IV->getType()->isIntegerTy()
- ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
- : Builder.CreateCast(Instruction::SIToFP, Induction,
- IV->getType());
- ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
- ScalarIV->setName("offset.idx");
- }
- if (Trunc) {
- auto *TruncType = cast<IntegerType>(Trunc->getType());
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
- Step = Builder.CreateTrunc(Step, TruncType);
- }
- return ScalarIV;
- };
-
- // Create the vector values from the scalar IV, in the absence of creating a
- // vector IV.
- auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
- Value *Broadcasted = getBroadcastInstrs(ScalarIV);
- for (unsigned Part = 0; Part < UF; ++Part) {
+ assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+ "Primary induction variable must have an integer type");
+
+ auto II = Legal->getInductionVars().find(IV);
+ assert(II != Legal->getInductionVars().end() && "IV is not an induction");
+
+ auto ID = II->second;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+
+ // Generate code for the induction step. Note that induction steps are
+ // required to be loop-invariant
+ auto CreateStepValue = [&](const SCEV *Step) -> Value * {
+ assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
+ "Induction step should be loop invariant");
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ return Exp.expandCodeFor(Step, Step->getType(),
+ LoopVectorPreHeader->getTerminator());
+ }
+ return cast<SCEVUnknown>(Step)->getValue();
+ };
+
+ // The scalar value to broadcast. This is derived from the canonical
+ // induction variable. If a truncation type is given, truncate the canonical
+ // induction variable and step. Otherwise, derive these values from the
+ // induction descriptor.
+ auto CreateScalarIV = [&](Value *&Step) -> Value * {
+ Value *ScalarIV = Induction;
+ if (IV != OldInduction) {
+ ScalarIV = IV->getType()->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+ : Builder.CreateCast(Instruction::SIToFP, Induction,
+ IV->getType());
+ ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
+ ScalarIV->setName("offset.idx");
+ }
+ if (Trunc) {
+ auto *TruncType = cast<IntegerType>(Trunc->getType());
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+ Step = Builder.CreateTrunc(Step, TruncType);
+ }
+ return ScalarIV;
+ };
+
+ // Create the vector values from the scalar IV, in the absence of creating a
+ // vector IV.
+ auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
+ Value *Broadcasted = getBroadcastInstrs(ScalarIV);
+ for (unsigned Part = 0; Part < UF; ++Part) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Value *EntryPart =
+ Value *EntryPart =
getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
ID.getInductionOpcode());
- VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
- if (Trunc)
- addMetadata(EntryPart, Trunc);
- recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
- }
- };
-
- // Now do the actual transformations, and start with creating the step value.
- Value *Step = CreateStepValue(ID.getStep());
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ if (Trunc)
+ addMetadata(EntryPart, Trunc);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
+ }
+ };
+
+ // Now do the actual transformations, and start with creating the step value.
+ Value *Step = CreateStepValue(ID.getStep());
if (VF.isZero() || VF.isScalar()) {
- Value *ScalarIV = CreateScalarIV(Step);
- CreateSplatIV(ScalarIV, Step);
- return;
- }
-
- // Determine if we want a scalar version of the induction variable. This is
- // true if the induction variable itself is not widened, or if it has at
- // least one user in the loop that is not widened.
- auto NeedsScalarIV = needsScalarInduction(EntryVal);
- if (!NeedsScalarIV) {
+ Value *ScalarIV = CreateScalarIV(Step);
+ CreateSplatIV(ScalarIV, Step);
+ return;
+ }
+
+ // Determine if we want a scalar version of the induction variable. This is
+ // true if the induction variable itself is not widened, or if it has at
+ // least one user in the loop that is not widened.
+ auto NeedsScalarIV = needsScalarInduction(EntryVal);
+ if (!NeedsScalarIV) {
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
- return;
- }
-
- // Try to create a new independent vector induction variable. If we can't
- // create the phi node, we will splat the scalar induction variable in each
- // loop iteration.
- if (!shouldScalarizeInstruction(EntryVal)) {
+ return;
+ }
+
+ // Try to create a new independent vector induction variable. If we can't
+ // create the phi node, we will splat the scalar induction variable in each
+ // loop iteration.
+ if (!shouldScalarizeInstruction(EntryVal)) {
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
- Value *ScalarIV = CreateScalarIV(Step);
- // Create scalar steps that can be used by instructions we will later
- // scalarize. Note that the addition of the scalar steps will not increase
- // the number of instructions in the loop in the common case prior to
- // InstCombine. We will be trading one vector extract for each scalar step.
- buildScalarSteps(ScalarIV, Step, EntryVal, ID);
- return;
- }
-
- // All IV users are scalar instructions, so only emit a scalar IV, not a
- // vectorised IV. Except when we tail-fold, then the splat IV feeds the
- // predicate used by the masked loads/stores.
- Value *ScalarIV = CreateScalarIV(Step);
- if (!Cost->isScalarEpilogueAllowed())
- CreateSplatIV(ScalarIV, Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID);
-}
-
-Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps BinOp) {
- // Create and check the types.
+ Value *ScalarIV = CreateScalarIV(Step);
+ // Create scalar steps that can be used by instructions we will later
+ // scalarize. Note that the addition of the scalar steps will not increase
+ // the number of instructions in the loop in the common case prior to
+ // InstCombine. We will be trading one vector extract for each scalar step.
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+ return;
+ }
+
+ // All IV users are scalar instructions, so only emit a scalar IV, not a
+ // vectorised IV. Except when we tail-fold, then the splat IV feeds the
+ // predicate used by the masked loads/stores.
+ Value *ScalarIV = CreateScalarIV(Step);
+ if (!Cost->isScalarEpilogueAllowed())
+ CreateSplatIV(ScalarIV, Step);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+}
+
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // Create and check the types.
auto *ValVTy = cast<FixedVectorType>(Val->getType());
- int VLen = ValVTy->getNumElements();
-
- Type *STy = Val->getType()->getScalarType();
- assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
- "Induction Step must be an integer or FP");
- assert(Step->getType() == STy && "Step has wrong type");
-
- SmallVector<Constant *, 8> Indices;
-
- if (STy->isIntegerTy()) {
- // Create a vector of consecutive numbers from zero to VF.
- for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantInt::get(STy, StartIdx + i));
-
- // Add the consecutive indices to the vector value.
- Constant *Cv = ConstantVector::get(Indices);
- assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
- Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- Step = Builder.CreateMul(Cv, Step);
- return Builder.CreateAdd(Val, Step, "induction");
- }
-
- // Floating point induction.
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
- "Binary Opcode should be specified for FP induction");
- // Create a vector of consecutive numbers from zero to VF.
- for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
-
- // Add the consecutive indices to the vector value.
- Constant *Cv = ConstantVector::get(Indices);
-
- Step = Builder.CreateVectorSplat(VLen, Step);
-
- // Floating point operations had to be 'fast' to enable the induction.
- FastMathFlags Flags;
- Flags.setFast();
-
- Value *MulOp = Builder.CreateFMul(Cv, Step);
- if (isa<Instruction>(MulOp))
- // Have to check, MulOp may be a constant
- cast<Instruction>(MulOp)->setFastMathFlags(Flags);
-
- Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
- if (isa<Instruction>(BOp))
- cast<Instruction>(BOp)->setFastMathFlags(Flags);
- return BOp;
-}
-
-void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Instruction *EntryVal,
- const InductionDescriptor &ID) {
- // We shouldn't have to build scalar steps if we aren't vectorizing.
+ int VLen = ValVTy->getNumElements();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ if (STy->isIntegerTy()) {
+ // Create a vector of consecutive numbers from zero to VF.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(STy, StartIdx + i));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+ assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ Step = Builder.CreateMul(Cv, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ // Create a vector of consecutive numbers from zero to VF.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setFast();
+
+ Value *MulOp = Builder.CreateFMul(Cv, Step);
+ if (isa<Instruction>(MulOp))
+ // Have to check, MulOp may be a constant
+ cast<Instruction>(MulOp)->setFastMathFlags(Flags);
+
+ Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+ return BOp;
+}
+
+void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
+ Instruction *EntryVal,
+ const InductionDescriptor &ID) {
+ // We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF.isVector() && "VF should be greater than one");
- // Get the value type and ensure it and the step have the same integer type.
- Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- assert(ScalarIVTy == Step->getType() &&
- "Val and Step should have the same type");
-
- // We build scalar steps for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (ScalarIVTy->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = ID.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If EntryVal is uniform, we only need to generate the first
- // lane. Otherwise, we generate all VF values.
- unsigned Lanes =
+ // Get the value type and ensure it and the step have the same integer type.
+ Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
+ assert(ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same type");
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (ScalarIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If EntryVal is uniform, we only need to generate the first
+ // lane. Otherwise, we generate all VF values.
+ unsigned Lanes =
Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
? 1
: VF.getKnownMinValue();
assert((!VF.isScalable() || Lanes == 1) &&
"Should never scalarize a scalable vector");
- // Compute the scalar steps and save the results in VectorLoopValueMap.
- for (unsigned Part = 0; Part < UF; ++Part) {
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ // Compute the scalar steps and save the results in VectorLoopValueMap.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy->getScalarSizeInBits());
Value *StartIdx =
@@ -2356,685 +2356,685 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable");
- auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
- auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
- VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
- }
- }
-}
-
-Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
- assert(V != Induction && "The new induction variable should not be used.");
- assert(!V->getType()->isVectorTy() && "Can't widen a vector");
- assert(!V->getType()->isVoidTy() && "Type does not produce a value");
-
- // If we have a stride that is replaced by one, do it here. Defer this for
- // the VPlan-native path until we start running Legal checks in that path.
- if (!EnableVPlanNativePath && Legal->hasStride(V))
- V = ConstantInt::get(V->getType(), 1);
-
- // If we have a vector mapped to this value, return it.
- if (VectorLoopValueMap.hasVectorValue(V, Part))
- return VectorLoopValueMap.getVectorValue(V, Part);
-
- // If the value has not been vectorized, check if it has been scalarized
- // instead. If it has been scalarized, and we actually need the value in
- // vector form, we will construct the vector values on demand.
- if (VectorLoopValueMap.hasAnyScalarValue(V)) {
- Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
-
- // If we've scalarized a value, that value should be an instruction.
- auto *I = cast<Instruction>(V);
-
- // If we aren't vectorizing, we can just copy the scalar map values over to
- // the vector map.
+ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+ auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
+ VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
+ }
+ }
+}
+
+Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ // If we have a stride that is replaced by one, do it here. Defer this for
+ // the VPlan-native path until we start running Legal checks in that path.
+ if (!EnableVPlanNativePath && Legal->hasStride(V))
+ V = ConstantInt::get(V->getType(), 1);
+
+ // If we have a vector mapped to this value, return it.
+ if (VectorLoopValueMap.hasVectorValue(V, Part))
+ return VectorLoopValueMap.getVectorValue(V, Part);
+
+ // If the value has not been vectorized, check if it has been scalarized
+ // instead. If it has been scalarized, and we actually need the value in
+ // vector form, we will construct the vector values on demand.
+ if (VectorLoopValueMap.hasAnyScalarValue(V)) {
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
+
+ // If we've scalarized a value, that value should be an instruction.
+ auto *I = cast<Instruction>(V);
+
+ // If we aren't vectorizing, we can just copy the scalar map values over to
+ // the vector map.
if (VF.isScalar()) {
- VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
- return ScalarValue;
- }
-
- // Get the last scalar instruction we generated for V and Part. If the value
- // is known to be uniform after vectorization, this corresponds to lane zero
- // of the Part unroll iteration. Otherwise, the last instruction is the one
- // we created for the last vector lane of the Part unroll iteration.
+ VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
+ return ScalarValue;
+ }
+
+ // Get the last scalar instruction we generated for V and Part. If the value
+ // is known to be uniform after vectorization, this corresponds to lane zero
+ // of the Part unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the Part unroll iteration.
unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
? 0
: VF.getKnownMinValue() - 1;
assert((!VF.isScalable() || LastLane == 0) &&
"Scalable vectorization can't lead to any scalarized values.");
- auto *LastInst = cast<Instruction>(
- VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
-
- // Set the insert point after the last scalarized instruction. This ensures
- // the insertelement sequence will directly follow the scalar definitions.
- auto OldIP = Builder.saveIP();
- auto NewIP = std::next(BasicBlock::iterator(LastInst));
- Builder.SetInsertPoint(&*NewIP);
-
- // However, if we are vectorizing, we need to construct the vector values.
- // If the value is known to be uniform after vectorization, we can just
- // broadcast the scalar value corresponding to lane zero for each unroll
- // iteration. Otherwise, we construct the vector values using insertelement
- // instructions. Since the resulting vectors are stored in
- // VectorLoopValueMap, we will only generate the insertelements once.
- Value *VectorValue = nullptr;
- if (Cost->isUniformAfterVectorization(I, VF)) {
- VectorValue = getBroadcastInstrs(ScalarValue);
- VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
- } else {
+ auto *LastInst = cast<Instruction>(
+ VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
+
+ // Set the insert point after the last scalarized instruction. This ensures
+ // the insertelement sequence will directly follow the scalar definitions.
+ auto OldIP = Builder.saveIP();
+ auto NewIP = std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using insertelement
+ // instructions. Since the resulting vectors are stored in
+ // VectorLoopValueMap, we will only generate the insertelements once.
+ Value *VectorValue = nullptr;
+ if (Cost->isUniformAfterVectorization(I, VF)) {
+ VectorValue = getBroadcastInstrs(ScalarValue);
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
+ } else {
// Initialize packing with insertelements to start from poison.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
VectorLoopValueMap.setVectorValue(V, Part, Poison);
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- packScalarIntoVectorValue(V, {Part, Lane});
- VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
- }
- Builder.restoreIP(OldIP);
- return VectorValue;
- }
-
- // If this scalar is unknown, assume that it is a constant or that it is
- // loop invariant. Broadcast V and save the value for future uses.
- Value *B = getBroadcastInstrs(V);
- VectorLoopValueMap.setVectorValue(V, Part, B);
- return B;
-}
-
-Value *
-InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
- const VPIteration &Instance) {
- // If the value is not an instruction contained in the loop, it should
- // already be scalar.
- if (OrigLoop->isLoopInvariant(V))
- return V;
-
- assert(Instance.Lane > 0
- ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
- : true && "Uniform values only have lane zero");
-
- // If the value from the original loop has not been vectorized, it is
- // represented by UF x VF scalar values in the new loop. Return the requested
- // scalar value.
- if (VectorLoopValueMap.hasScalarValue(V, Instance))
- return VectorLoopValueMap.getScalarValue(V, Instance);
-
- // If the value has not been scalarized, get its entry in VectorLoopValueMap
- // for the given unroll part. If this entry is not a vector type (i.e., the
- // vectorization factor is one), there is no need to generate an
- // extractelement instruction.
- auto *U = getOrCreateVectorValue(V, Instance.Part);
- if (!U->getType()->isVectorTy()) {
+ packScalarIntoVectorValue(V, {Part, Lane});
+ VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
+ }
+ Builder.restoreIP(OldIP);
+ return VectorValue;
+ }
+
+ // If this scalar is unknown, assume that it is a constant or that it is
+ // loop invariant. Broadcast V and save the value for future uses.
+ Value *B = getBroadcastInstrs(V);
+ VectorLoopValueMap.setVectorValue(V, Part, B);
+ return B;
+}
+
+Value *
+InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) {
+ // If the value is not an instruction contained in the loop, it should
+ // already be scalar.
+ if (OrigLoop->isLoopInvariant(V))
+ return V;
+
+ assert(Instance.Lane > 0
+ ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
+
+ // If the value from the original loop has not been vectorized, it is
+ // represented by UF x VF scalar values in the new loop. Return the requested
+ // scalar value.
+ if (VectorLoopValueMap.hasScalarValue(V, Instance))
+ return VectorLoopValueMap.getScalarValue(V, Instance);
+
+ // If the value has not been scalarized, get its entry in VectorLoopValueMap
+ // for the given unroll part. If this entry is not a vector type (i.e., the
+ // vectorization factor is one), there is no need to generate an
+ // extractelement instruction.
+ auto *U = getOrCreateVectorValue(V, Instance.Part);
+ if (!U->getType()->isVectorTy()) {
assert(VF.isScalar() && "Value not scalarized has non-vector type");
- return U;
- }
-
- // Otherwise, the value from the original loop has been vectorized and is
- // represented by UF vector values. Extract and return the requested scalar
- // value from the appropriate vector lane.
- return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
-}
-
-void InnerLoopVectorizer::packScalarIntoVectorValue(
- Value *V, const VPIteration &Instance) {
- assert(V != Induction && "The new induction variable should not be used.");
- assert(!V->getType()->isVectorTy() && "Can't pack a vector");
- assert(!V->getType()->isVoidTy() && "Type does not produce a value");
-
- Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
- Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
- VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
- Builder.getInt32(Instance.Lane));
- VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
-}
-
-Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
- assert(Vec->getType()->isVectorTy() && "Invalid type");
+ return U;
+ }
+
+ // Otherwise, the value from the original loop has been vectorized and is
+ // represented by UF vector values. Extract and return the requested scalar
+ // value from the appropriate vector lane.
+ return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+}
+
+void InnerLoopVectorizer::packScalarIntoVectorValue(
+ Value *V, const VPIteration &Instance) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't pack a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
+ Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
+ Builder.getInt32(Instance.Lane));
+ VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
+}
+
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+ assert(Vec->getType()->isVectorTy() && "Invalid type");
assert(!VF.isScalable() && "Cannot reverse scalable vectors");
- SmallVector<int, 8> ShuffleMask;
+ SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
-
+
return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
-}
-
-// Return whether we allow using masked interleave-groups (for dealing with
-// strided loads/stores that reside in predicated blocks, or for dealing
-// with gaps).
-static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
- return EnableMaskedInterleavedMemAccesses;
-
- return TTI.enableMaskedInterleavedAccessVectorization();
-}
-
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// R = Pic[i]; // Member of index 0
-// G = Pic[i+1]; // Member of index 1
-// B = Pic[i+2]; // Member of index 2
-// ... // do something to R, G, B
-// }
-// To:
-// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
+}
+
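
reverseVector above reduces to building the shuffle mask {VF-1, VF-2, ..., 0}. A minimal standalone sketch of that mask computation (plain C++ rather than the IRBuilder calls; the helper name is invented):

#include <cstdio>
#include <vector>

// Illustrative sketch, not the LLVM helper: the mask that reverses a
// VF-wide vector is simply {VF-1, VF-2, ..., 0}.
std::vector<int> buildReverseMask(unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    Mask.push_back(static_cast<int>(VF - i - 1));
  return Mask;
}

int main() {
  for (int Idx : buildReverseMask(8))
    std::printf("%d ", Idx);            // prints: 7 6 5 4 3 2 1 0
  std::printf("\n");
  return 0;
}
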
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+ return EnableMaskedInterleavedMemAccesses;
+
+ return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
+// Try to vectorize the interleave group that \p Instr belongs to.
+//
+// E.g. Translate following interleaved load group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// R = Pic[i]; // Member of index 0
+// G = Pic[i+1]; // Member of index 1
+// B = Pic[i+2]; // Member of index 2
+// ... // do something to R, G, B
+// }
+// To:
+// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// ... do something to R, G, B
-// Pic[i] = R; // Member of index 0
-// Pic[i+1] = G; // Member of index 1
-// Pic[i+2] = B; // Member of index 2
-// }
-// To:
-// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
+//
+// Or translate following interleaved store group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// ... do something to R, G, B
+// Pic[i] = R; // Member of index 0
+// Pic[i+1] = G; // Member of index 1
+// Pic[i+2] = B; // Member of index 2
+// }
+// To:
+// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
-// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(
+// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
+// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
+void InnerLoopVectorizer::vectorizeInterleaveGroup(
const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask) {
- Instruction *Instr = Group->getInsertPos();
- const DataLayout &DL = Instr->getModule()->getDataLayout();
-
- // Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = getMemInstValueType(Instr);
- unsigned InterleaveFactor = Group->getFactor();
+ Instruction *Instr = Group->getInsertPos();
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getMemInstValueType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
-
- // Prepare for the new pointers.
- SmallVector<Value *, 2> AddrParts;
- unsigned Index = Group->getIndex(Instr);
-
- // TODO: extend the masked interleaved-group support to reversed access.
- assert((!BlockInMask || !Group->isReverse()) &&
- "Reversed masked interleave-group not supported.");
-
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform. For
- // uniform instructions, we're only required to generate a value for the
- // first vector lane in each unroll iteration.
+
+ // Prepare for the new pointers.
+ SmallVector<Value *, 2> AddrParts;
+ unsigned Index = Group->getIndex(Instr);
+
+ // TODO: extend the masked interleaved-group support to reversed access.
+ assert((!BlockInMask || !Group->isReverse()) &&
+ "Reversed masked interleave-group not supported.");
+
+ // If the group is reverse, adjust the index to refer to the last vector lane
+ // instead of the first. We adjust the index from the first vector lane,
+ // rather than directly getting the pointer for lane VF - 1, because the
+ // pointer operand of the interleaved access is supposed to be uniform. For
+ // uniform instructions, we're only required to generate a value for the
+ // first vector lane in each unroll iteration.
assert(!VF.isScalable() &&
"scalable vector reverse operation is not implemented");
- if (Group->isReverse())
+ if (Group->isReverse())
Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
-
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *AddrPart = State.get(Addr, {Part, 0});
- setDebugLocFromInst(Builder, AddrPart);
-
- // Notice current instruction could be any index. Need to adjust the address
- // to the member of index 0.
- //
- // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
- // b = A[i]; // Member of index 0
-    // Current pointer points to A[i+1], adjust it to A[i].
- //
- // E.g. A[i+1] = a; // Member of index 1
- // A[i] = b; // Member of index 0
- // A[i+2] = c; // Member of index 2 (Current instruction)
-    // Current pointer points to A[i+2], adjust it to A[i].
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
- InBounds = gep->isInBounds();
- AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
- cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
-
- // Cast to the vector pointer type.
- unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
- Type *PtrTy = VecTy->getPointerTo(AddressSpace);
- AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
- }
-
- setDebugLocFromInst(Builder, Instr);
+
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *AddrPart = State.get(Addr, {Part, 0});
+ setDebugLocFromInst(Builder, AddrPart);
+
+ // Notice current instruction could be any index. Need to adjust the address
+ // to the member of index 0.
+ //
+ // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
+ // b = A[i]; // Member of index 0
+    // Current pointer points to A[i+1], adjust it to A[i].
+ //
+ // E.g. A[i+1] = a; // Member of index 1
+ // A[i] = b; // Member of index 0
+ // A[i+2] = c; // Member of index 2 (Current instruction)
+    // Current pointer points to A[i+2], adjust it to A[i].
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+ AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
+ cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
+
+ // Cast to the vector pointer type.
+ unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
+ Type *PtrTy = VecTy->getPointerTo(AddressSpace);
+ AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
+ }
+
+ setDebugLocFromInst(Builder, Instr);
Value *PoisonVec = PoisonValue::get(VecTy);
-
- Value *MaskForGaps = nullptr;
- if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+
+ Value *MaskForGaps = nullptr;
+ if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert(MaskForGaps && "Mask for Gaps is required but it is null");
- }
-
- // Vectorize the interleaved load group.
- if (isa<LoadInst>(Instr)) {
- // For each unroll part, create a wide load for the group.
- SmallVector<Value *, 2> NewLoads;
- for (unsigned Part = 0; Part < UF; Part++) {
- Instruction *NewLoad;
- if (BlockInMask || MaskForGaps) {
- assert(useMaskedInterleavedAccesses(*TTI) &&
- "masked interleaved groups are not allowed.");
- Value *GroupMask = MaskForGaps;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ assert(MaskForGaps && "Mask for Gaps is required but it is null");
+ }
+
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+ // For each unroll part, create a wide load for the group.
+ SmallVector<Value *, 2> NewLoads;
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Instruction *NewLoad;
+ if (BlockInMask || MaskForGaps) {
+ assert(useMaskedInterleavedAccesses(*TTI) &&
+ "masked interleaved groups are not allowed.");
+ Value *GroupMask = MaskForGaps;
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Value *ShuffledMask = Builder.CreateShuffleVector(
+ Value *ShuffledMask = Builder.CreateShuffleVector(
BlockInMaskPart,
createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
"interleaved.mask");
- GroupMask = MaskForGaps
- ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
- MaskForGaps)
- : ShuffledMask;
- }
- NewLoad =
- Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
+ GroupMask = MaskForGaps
+ ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+ MaskForGaps)
+ : ShuffledMask;
+ }
+ NewLoad =
+ Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
GroupMask, PoisonVec, "wide.masked.vec");
- }
- else
- NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
- Group->getAlign(), "wide.vec");
- Group->addMetadata(NewLoad);
- NewLoads.push_back(NewLoad);
- }
-
- // For each member in the group, shuffle out the appropriate data from the
- // wide loads.
+ }
+ else
+ NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
+ Group->getAlign(), "wide.vec");
+ Group->addMetadata(NewLoad);
+ NewLoads.push_back(NewLoad);
+ }
+
+ // For each member in the group, shuffle out the appropriate data from the
+ // wide loads.
unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
-
- // Skip the gaps in the group.
- if (!Member)
- continue;
-
+ for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto StrideMask =
createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *StridedVec = Builder.CreateShuffleVector(
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *StridedVec = Builder.CreateShuffleVector(
NewLoads[Part], StrideMask, "strided.vec");
-
-    // If this member has a different type, cast the result type.
- if (Member->getType() != ScalarTy) {
+
+      // If this member has a different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = reverseVector(StridedVec);
-
+ StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
+ }
+
+ if (Group->isReverse())
+ StridedVec = reverseVector(StridedVec);
+
State.set(VPDefs[J], Member, StridedVec, Part);
- }
+ }
++J;
- }
- return;
- }
-
- // The sub vector type for current instruction.
+ }
+ return;
+ }
+
+ // The sub vector type for current instruction.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
auto *SubVT = VectorType::get(ScalarTy, VF);
-
- // Vectorize the interleaved store group.
- for (unsigned Part = 0; Part < UF; Part++) {
- // Collect the stored vector from each member.
- SmallVector<Value *, 4> StoredVecs;
- for (unsigned i = 0; i < InterleaveFactor; i++) {
- // Interleaved store group doesn't allow a gap, so each index has a member
+
+ // Vectorize the interleaved store group.
+ for (unsigned Part = 0; Part < UF; Part++) {
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ // Interleaved store group doesn't allow a gap, so each index has a member
assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
-
+
Value *StoredVec = State.get(StoredValues[i], Part);
- if (Group->isReverse())
- StoredVec = reverseVector(StoredVec);
-
-    // If this member has a different type, cast it to a unified type.
-
- if (StoredVec->getType() != SubVT)
- StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
-
- StoredVecs.push_back(StoredVec);
- }
-
- // Concatenate all vectors into a wide vector.
- Value *WideVec = concatenateVectors(Builder, StoredVecs);
-
- // Interleave the elements in the wide vector.
+ if (Group->isReverse())
+ StoredVec = reverseVector(StoredVec);
+
+      // If this member has a different type, cast it to a unified type.
+
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ }
+
+ // Concatenate all vectors into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, StoredVecs);
+
+ // Interleave the elements in the wide vector.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Value *IVec = Builder.CreateShuffleVector(
+ Value *IVec = Builder.CreateShuffleVector(
WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
- "interleaved.vec");
-
- Instruction *NewStoreInstr;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
+ "interleaved.vec");
+
+ Instruction *NewStoreInstr;
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
BlockInMaskPart,
createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- NewStoreInstr = Builder.CreateMaskedStore(
- IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
- }
- else
- NewStoreInstr =
- Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
-
- Group->addMetadata(NewStoreInstr);
- }
-}
-
+ "interleaved.mask");
+ NewStoreInstr = Builder.CreateMaskedStore(
+ IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
+ }
+ else
+ NewStoreInstr =
+ Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
+
+ Group->addMetadata(NewStoreInstr);
+ }
+}
+
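
The interleaved-access lowering above is driven by three shuffle masks: a strided mask that extracts member I from the wide load, a replicated mask that repeats each block-mask lane once per member, and an interleave mask that merges the per-member vectors before the wide store. The standalone sketch below (not part of the patch; the real helpers are LLVM's createStrideMask, createReplicatedMask and createInterleaveMask) reproduces that arithmetic for factor = 3 and VF = 4, matching the R,G,B example in the comment:

#include <cstdio>
#include <vector>

// Illustrative reimplementations of the mask arithmetic, not the LLVM helpers.

// Lanes {Start, Start+Stride, ...}: extracts member `Start` from the wide vector.
std::vector<int> strideMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> M;
  for (unsigned i = 0; i < VF; ++i)
    M.push_back(static_cast<int>(Start + i * Stride));
  return M;
}

// Repeats each of the VF block-mask lanes Factor times, e.g. {0,0,0,1,1,1,...}.
std::vector<int> replicatedMask(unsigned Factor, unsigned VF) {
  std::vector<int> M;
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      M.push_back(static_cast<int>(i));
  return M;
}

// Interleaves NumVecs concatenated VF-wide vectors: {0, VF, 2*VF, 1, VF+1, ...}.
std::vector<int> interleaveMask(unsigned VF, unsigned NumVecs) {
  std::vector<int> M;
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < NumVecs; ++j)
      M.push_back(static_cast<int>(j * VF + i));
  return M;
}

static void print(const char *Name, const std::vector<int> &M) {
  std::printf("%s:", Name);
  for (int Idx : M)
    std::printf(" %d", Idx);
  std::printf("\n");
}

int main() {
  const unsigned Factor = 3, VF = 4;
  print("R members  ", strideMask(0, Factor, VF));    // 0 3 6 9
  print("G members  ", strideMask(1, Factor, VF));    // 1 4 7 10
  print("replicated ", replicatedMask(Factor, VF));   // 0 0 0 1 1 1 2 2 2 3 3 3
  print("interleaved", interleaveMask(VF, Factor));   // 0 4 8 1 5 9 2 6 10 3 7 11
  return 0;
}

The interleaved mask is exactly the <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> pattern shown in the store example above.
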
void InnerLoopVectorizer::vectorizeMemoryInstruction(
Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
VPValue *StoredValue, VPValue *BlockInMask) {
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(Instr);
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
-
- assert((LI || SI) && "Invalid Load/Store instruction");
- assert((!SI || StoredValue) && "No stored value provided for widened store");
- assert((!LI || !StoredValue) && "Stored value provided for widened load");
-
- LoopVectorizationCostModel::InstWidening Decision =
- Cost->getWideningDecision(Instr, VF);
- assert((Decision == LoopVectorizationCostModel::CM_Widen ||
- Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
- Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
- "CM decision is not to widen the memory instruction");
-
- Type *ScalarDataTy = getMemInstValueType(Instr);
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+ assert((!SI || StoredValue) && "No stored value provided for widened store");
+ assert((!LI || !StoredValue) && "Stored value provided for widened load");
+
+ LoopVectorizationCostModel::InstWidening Decision =
+ Cost->getWideningDecision(Instr, VF);
+ assert((Decision == LoopVectorizationCostModel::CM_Widen ||
+ Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
+ Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
+ "CM decision is not to widen the memory instruction");
+
+ Type *ScalarDataTy = getMemInstValueType(Instr);
auto *DataTy = VectorType::get(ScalarDataTy, VF);
- const Align Alignment = getLoadStoreAlignment(Instr);
-
- // Determine if the pointer operand of the access is either consecutive or
- // reverse consecutive.
- bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
- bool ConsecutiveStride =
- Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
- bool CreateGatherScatter =
- (Decision == LoopVectorizationCostModel::CM_GatherScatter);
-
- // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
- // gather/scatter. Otherwise Decision should have been to Scalarize.
- assert((ConsecutiveStride || CreateGatherScatter) &&
- "The instruction should be scalarized");
- (void)ConsecutiveStride;
-
- VectorParts BlockInMaskParts(UF);
- bool isMaskRequired = BlockInMask;
- if (isMaskRequired)
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockInMaskParts[Part] = State.get(BlockInMask, Part);
-
- const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
- // Calculate the pointer for the specific unroll-part.
- GetElementPtrInst *PartPtr = nullptr;
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = gep->isInBounds();
-
- if (Reverse) {
+ const Align Alignment = getLoadStoreAlignment(Instr);
+
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
+ bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
+ bool ConsecutiveStride =
+ Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
+ bool CreateGatherScatter =
+ (Decision == LoopVectorizationCostModel::CM_GatherScatter);
+
+ // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
+ // gather/scatter. Otherwise Decision should have been to Scalarize.
+ assert((ConsecutiveStride || CreateGatherScatter) &&
+ "The instruction should be scalarized");
+ (void)ConsecutiveStride;
+
+ VectorParts BlockInMaskParts(UF);
+ bool isMaskRequired = BlockInMask;
+ if (isMaskRequired)
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockInMaskParts[Part] = State.get(BlockInMask, Part);
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ if (Reverse) {
assert(!VF.isScalable() &&
"Reversing vectors is not yet supported for scalable vectors.");
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
- PartPtr->setIsInBounds(InBounds);
+ PartPtr->setIsInBounds(InBounds);
PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
- PartPtr->setIsInBounds(InBounds);
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
- } else {
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
+ } else {
Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
- PartPtr = cast<GetElementPtrInst>(
+ PartPtr = cast<GetElementPtrInst>(
Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
- PartPtr->setIsInBounds(InBounds);
- }
-
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- };
-
- // Handle Stores:
- if (SI) {
- setDebugLocFromInst(Builder, SI);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Instruction *NewSI = nullptr;
- Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
- MaskPart);
- } else {
- if (Reverse) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = reverseVector(StoredVal);
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
- if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
- BlockInMaskParts[Part]);
- else
- NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
- }
- addMetadata(NewSI, SI);
- }
- return;
- }
-
- // Handle loads.
- assert(LI && "Must have a load instruction");
- setDebugLocFromInst(Builder, LI);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *NewLI;
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
- nullptr, "wide.masked.gather");
- addMetadata(NewLI, LI);
- } else {
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
- if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(
+ PartPtr->setIsInBounds(InBounds);
+ }
+
+ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
+
+ // Handle Stores:
+ if (SI) {
+ setDebugLocFromInst(Builder, SI);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *NewSI = nullptr;
+ Value *StoredVal = State.get(StoredValue, Part);
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(Addr, Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
+ } else {
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = reverseVector(StoredVal);
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+ }
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
+ if (isMaskRequired)
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+ BlockInMaskParts[Part]);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+ }
+ addMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ setDebugLocFromInst(Builder, LI);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *NewLI;
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(Addr, Part);
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
+ nullptr, "wide.masked.gather");
+ addMetadata(NewLI, LI);
+ } else {
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
+ if (isMaskRequired)
+ NewLI = Builder.CreateMaskedLoad(
VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
- "wide.masked.load");
- else
- NewLI =
- Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
- // Add metadata to the load, but setVectorValue to the reverse shuffle.
- addMetadata(NewLI, LI);
- if (Reverse)
- NewLI = reverseVector(NewLI);
- }
+ "wide.masked.load");
+ else
+ NewLI =
+ Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
+
+ // Add metadata to the load, but setVectorValue to the reverse shuffle.
+ addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = reverseVector(NewLI);
+ }
State.set(Def, Instr, NewLI, Part);
- }
-}
-
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
- const VPIteration &Instance,
- bool IfPredicateInstr,
- VPTransformState &State) {
- assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
-
+ }
+}
+
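
CreateVecPtr above encodes the per-part address arithmetic for consecutive accesses: part Part of a forward access starts at element Part*VF, while a reversed access first steps back by Part*VF elements and then by VF-1 more, so the wide load still reads upward in memory before the result is reversed. A standalone sketch of that offset calculation (plain C++, fixed VF and UF assumed, names invented):

#include <cstdio>

// Illustrative sketch, not LLVM code: the first scalar element (relative to
// the scalar pointer) covered by unroll part `Part` of a consecutive access.
long partStartElement(unsigned Part, unsigned VF, bool Reverse) {
  long P = static_cast<long>(Part), W = static_cast<long>(VF);
  if (!Reverse)
    return P * W;              // GEP Ptr, Part * VF
  // Reverse: GEP Ptr, -Part * VF followed by GEP ..., 1 - VF.
  return -P * W + 1 - W;
}

int main() {
  const unsigned VF = 4, UF = 2;
  for (unsigned Part = 0; Part < UF; ++Part)
    std::printf("forward part %u starts at %ld, reverse part %u starts at %ld\n",
                Part, partStartElement(Part, VF, false),
                Part, partStartElement(Part, VF, true));
  // forward: 0 and 4; reverse: -3 and -7 (each wide access then covers 4
  // elements upward and the loaded/stored value is reversed).
  return 0;
}
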
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
+ const VPIteration &Instance,
+ bool IfPredicateInstr,
+ VPTransformState &State) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+
// llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
// the first lane and part.
if (isa<NoAliasScopeDeclInst>(Instr))
if (Instance.Lane != 0 || Instance.Part != 0)
return;
- setDebugLocFromInst(Builder, Instr);
-
-  // Does this instruction return a value?
- bool IsVoidRetTy = Instr->getType()->isVoidTy();
-
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
-
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+ setDebugLocFromInst(Builder, Instr);
+
+  // Does this instruction return a value?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
auto InputInstance = Instance;
if (!Operand || !OrigLoop->contains(Operand) ||
(Cost->isUniformAfterVectorization(Operand, State.VF)))
InputInstance.Lane = 0;
auto *NewOp = State.get(User.getOperand(op), InputInstance);
- Cloned->setOperand(op, NewOp);
- }
- addNewMetadata(Cloned, Instr);
-
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
-
+ Cloned->setOperand(op, NewOp);
+ }
+ addNewMetadata(Cloned, Instr);
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
// representing scalar values in VPTransformState. Add the cloned scalar to
// the scalar map entry.
- VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
-
-  // If we just cloned a new assumption, add it to the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
-
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(Cloned);
-}
-
-PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
- Value *End, Value *Step,
- Instruction *DL) {
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- // As we're just creating this loop, it's possible no latch exists
- // yet. If so, use the header as this will be a single block loop.
- if (!Latch)
- Latch = Header;
-
- IRBuilder<> Builder(&*Header->getFirstInsertionPt());
- Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
- setDebugLocFromInst(Builder, OldInst);
- auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
-
- Builder.SetInsertPoint(Latch->getTerminator());
- setDebugLocFromInst(Builder, OldInst);
-
- // Create i+1 and fill the PHINode.
- Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
- Induction->addIncoming(Start, L->getLoopPreheader());
- Induction->addIncoming(Next, Latch);
- // Create the compare.
- Value *ICmp = Builder.CreateICmpEQ(Next, End);
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
+
+  // If we just cloned a new assumption, add it to the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+
+ // End if-block.
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(Cloned);
+}
+
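
The operand-rewiring loop above has one subtlety: a value that is uniform after vectorization only ever gets a lane-0 copy, so a (Part, Lane) request for such an operand is redirected to lane 0 of the same part. A toy sketch of that lookup rule (standalone C++, invented names, illustrative only):

#include <cstdio>
#include <map>
#include <utility>

// Illustrative sketch, not LLVM code: scalar copies of one original value,
// keyed by (Part, Lane).
using ScalarMap = std::map<std::pair<unsigned, unsigned>, int>;

int getScalar(const ScalarMap &Map, unsigned Part, unsigned Lane,
              bool UniformAfterVectorization) {
  if (UniformAfterVectorization)
    Lane = 0;                          // uniform values only materialize lane 0
  return Map.at({Part, Lane});
}

int main() {
  ScalarMap UniformPtr;                // e.g. a loop-invariant base pointer
  UniformPtr[{0, 0}] = 100;            // only lane 0 of each part exists

  ScalarMap PerLaneVal;                // e.g. a non-uniform arithmetic result
  PerLaneVal[{0, 0}] = 1;
  PerLaneVal[{0, 1}] = 2;

  std::printf("%d %d\n",
              getScalar(UniformPtr, 0, 3, /*Uniform=*/true),   // redirected to lane 0 -> 100
              getScalar(PerLaneVal, 0, 1, /*Uniform=*/false)); // 2
  return 0;
}
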
+PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
+ Value *End, Value *Step,
+ Instruction *DL) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ // As we're just creating this loop, it's possible no latch exists
+ // yet. If so, use the header as this will be a single block loop.
+ if (!Latch)
+ Latch = Header;
+
+ IRBuilder<> Builder(&*Header->getFirstInsertionPt());
+ Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
+ setDebugLocFromInst(Builder, OldInst);
+ auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
+
+ Builder.SetInsertPoint(Latch->getTerminator());
+ setDebugLocFromInst(Builder, OldInst);
+
+ // Create i+1 and fill the PHINode.
+ Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(Start, L->getLoopPreheader());
+ Induction->addIncoming(Next, Latch);
+ // Create the compare.
+ Value *ICmp = Builder.CreateICmpEQ(Next, End);
Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
-
- // Now we have two terminators. Remove the old one from the block.
- Latch->getTerminator()->eraseFromParent();
-
- return Induction;
-}
-
-Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
- if (TripCount)
- return TripCount;
-
- assert(L && "Create Trip Count for null loop.");
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
- // Find the loop boundaries.
- ScalarEvolution *SE = PSE.getSE();
- const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+
+ // Now we have two terminators. Remove the old one from the block.
+ Latch->getTerminator()->eraseFromParent();
+
+ return Induction;
+}
+
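
createInductionVariable emits the canonical counting loop that drives the vector body: an index phi starting at Start, an index.next = index + Step computed in the latch, and a compare against End that exits the loop. In scalar C++ terms the generated control flow is roughly the shape below (a sketch only, assuming Start, End and Step are plain integers and End is hit exactly, as the trip-count logic guarantees):

#include <cstdio>

int main() {
  // Illustrative sketch, not LLVM code; e.g. Step = VF * UF = 4.
  const long Start = 0, End = 16, Step = 4;

  long Index = Start;                    // the "index" phi
  do {
    std::printf("vector iteration at index %ld\n", Index);
    long IndexNext = Index + Step;       // "index.next"
    if (IndexNext == End)                // latch compare, exits the loop
      break;
    Index = IndexNext;                   // back-edge value of the phi
  } while (true);
  return 0;
}
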
+Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+ if (TripCount)
+ return TripCount;
+
+ assert(L && "Create Trip Count for null loop.");
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ // Find the loop boundaries.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
- "Invalid loop count");
-
- Type *IdxTy = Legal->getWidestInductionType();
- assert(IdxTy && "No type for induction");
-
- // The exit count might have the type of i64 while the phi is i32. This can
- // happen if we have an induction variable that is sign extended before the
- // compare. The only way that we get a backedge taken count is that the
- // induction variable was signed and as such will not overflow. In such a case
- // truncation is legal.
- if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
- IdxTy->getPrimitiveSizeInBits())
- BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
- BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
-
- // Get the total trip count from the count by adding 1.
- const SCEV *ExitCount = SE->getAddExpr(
- BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
-
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
- // Expand the trip count and place the new instructions in the preheader.
- // Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, DL, "induction");
-
- // Count holds the overall loop count (N).
- TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- L->getLoopPreheader()->getTerminator());
-
- if (TripCount->getType()->isPointerTy())
- TripCount =
- CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
- L->getLoopPreheader()->getTerminator());
-
- return TripCount;
-}
-
-Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
- if (VectorTripCount)
- return VectorTripCount;
-
- Value *TC = getOrCreateTripCount(L);
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
-
- Type *Ty = TC->getType();
+ "Invalid loop count");
+
+ Type *IdxTy = Legal->getWidestInductionType();
+ assert(IdxTy && "No type for induction");
+
+ // The exit count might have the type of i64 while the phi is i32. This can
+ // happen if we have an induction variable that is sign extended before the
+ // compare. The only way that we get a backedge taken count is that the
+ // induction variable was signed and as such will not overflow. In such a case
+ // truncation is legal.
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, DL, "induction");
+
+ // Count holds the overall loop count (N).
+ TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ L->getLoopPreheader()->getTerminator());
+
+ if (TripCount->getType()->isPointerTy())
+ TripCount =
+ CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
+ L->getLoopPreheader()->getTerminator());
+
+ return TripCount;
+}
+
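
getOrCreateTripCount derives the trip count as backedge-taken-count + 1, after narrowing the backedge-taken count to the widest induction type when SCEV reports it in a wider type. A small numeric sketch of that derivation (plain integers, no SCEV; the values are invented):

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative sketch, not LLVM code. Backedge-taken count as SCEV might
  // report it, in a wider type (i64) than the induction variable (i32):
  // the loop took the backedge 99 times.
  uint64_t BackedgeTakenCount64 = 99;

  // Truncation is legal here because the sign-extended induction cannot
  // overflow, so the count fits the narrower induction type.
  uint32_t BackedgeTakenCount = static_cast<uint32_t>(BackedgeTakenCount64);

  // The trip count is one more than the number of taken backedges.
  uint32_t TripCount = BackedgeTakenCount + 1;

  std::printf("trip count N = %u\n", TripCount);   // 100
  return 0;
}
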
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+ if (VectorTripCount)
+ return VectorTripCount;
+
+ Value *TC = getOrCreateTripCount(L);
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+
+ Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
-
- // If the tail is to be folded by masking, round the number of iterations N
- // up to a multiple of Step instead of rounding down. This is done by first
- // adding Step-1 and then rounding down. Note that it's ok if this addition
- // overflows: the vector induction variable will eventually wrap to zero given
- // that it starts at zero and its Step is a power of two; the loop will then
- // exit, with the last early-exit vector comparison also producing all-true.
- if (Cost->foldTailByMasking()) {
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
- "VF*UF must be a power of 2 when folding tail by masking");
+ "VF*UF must be a power of 2 when folding tail by masking");
assert(!VF.isScalable() &&
"Tail folding not yet supported for scalable vectors");
TC = Builder.CreateAdd(
TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
- }
-
- // Now we need to generate the expression for the part of the loop that the
- // vectorized body will execute. This is equal to N - (N % Step) if scalar
- // iterations are not required for correctness, or N - Step, otherwise. Step
- // is equal to the vectorization factor (number of SIMD elements) times the
- // unroll factor (number of SIMD instructions).
- Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
+ }
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
// There are two cases where we need to ensure (at least) the last iteration
// runs in the scalar remainder loop. Thus, if the step evenly divides
- // the trip count, we set the remainder to be equal to the step. If the step
- // does not evenly divide the trip count, no adjustment is necessary since
- // there will already be scalar iterations. Note that the minimum iterations
+ // the trip count, we set the remainder to be equal to the step. If the step
+ // does not evenly divide the trip count, no adjustment is necessary since
+ // there will already be scalar iterations. Note that the minimum iterations
// check ensures that N >= Step. The cases are:
// 1) If there is a non-reversed interleaved group that may speculatively
// access memory out-of-bounds.
@@ -3042,178 +3042,178 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// the loop contains multiple exiting blocks, or a single exiting block
// which is not the latch.
if (VF.isVector() && Cost->requiresScalarEpilogue()) {
- auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
- R = Builder.CreateSelect(IsZero, Step, R);
- }
-
- VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
- return VectorTripCount;
-}
-
-Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL) {
- // Verify that V is a vector type with same number of elements as DstVTy.
+ auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+ R = Builder.CreateSelect(IsZero, Step, R);
+ }
+
+ VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+ return VectorTripCount;
+}
+
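
The vector trip count computed above boils down to a few lines of integer arithmetic: optionally round N up to a multiple of Step = VF*UF when the tail is folded by masking, take the remainder N % Step, and bump a zero remainder up to a full Step when a scalar epilogue is required so the scalar loop always runs at least once. A standalone sketch of exactly that arithmetic (fixed-width VF assumed, function name invented):

#include <cassert>
#include <cstdio>

// Illustrative sketch of the "n.vec" computation, not LLVM code.
unsigned vectorTripCount(unsigned N, unsigned VF, unsigned UF,
                         bool FoldTailByMasking, bool RequiresScalarEpilogue) {
  unsigned Step = VF * UF;
  if (FoldTailByMasking) {
    assert((Step & (Step - 1)) == 0 && "VF*UF must be a power of 2");
    N += Step - 1;                       // "n.rnd.up": round up; wrap is benign
  }
  unsigned R = N % Step;                 // "n.mod.vf"
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                            // keep at least one scalar iteration
  return N - R;                          // "n.vec"
}

int main() {
  // 100 iterations, VF=4, UF=2: 96 vector iterations plus a scalar remainder.
  std::printf("%u\n", vectorTripCount(100, 4, 2, false, false));  // 96
  // Evenly divisible trip count but a scalar epilogue is required: 96 -> 88.
  std::printf("%u\n", vectorTripCount(96, 4, 2, false, true));    // 88
  // Tail folded by masking: everything runs in the (rounded-up) vector loop.
  std::printf("%u\n", vectorTripCount(100, 4, 2, true, false));   // 104
  return 0;
}

The rounded-up case relies on the power-of-two Step noted in the assertion above, which is also why overflow of the "n.rnd.up" addition is harmless.
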
+Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL) {
+ // Verify that V is a vector type with same number of elements as DstVTy.
auto *DstFVTy = cast<FixedVectorType>(DstVTy);
unsigned VF = DstFVTy->getNumElements();
auto *SrcVecTy = cast<FixedVectorType>(V->getType());
- assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
- Type *SrcElemTy = SrcVecTy->getElementType();
+ assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+ Type *SrcElemTy = SrcVecTy->getElementType();
Type *DstElemTy = DstFVTy->getElementType();
- assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
- "Vector elements must have same size");
-
- // Do a direct cast if element types are castable.
- if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+ assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+ "Vector elements must have same size");
+
+ // Do a direct cast if element types are castable.
+ if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
return Builder.CreateBitOrPointerCast(V, DstFVTy);
- }
- // V cannot be directly casted to desired vector type.
- // May happen when V is a floating point vector but DstVTy is a vector of
- // pointers or vice-versa. Handle this using a two-step bitcast using an
- // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
- assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
- "Only one type should be a pointer type");
- assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
- "Only one type should be a floating point type");
- Type *IntTy =
- IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- auto *VecIntTy = FixedVectorType::get(IntTy, VF);
- Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+ }
+ // V cannot be directly casted to desired vector type.
+ // May happen when V is a floating point vector but DstVTy is a vector of
+ // pointers or vice-versa. Handle this using a two-step bitcast using an
+ // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
+ assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
+ "Only one type should be a pointer type");
+ assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
+ "Only one type should be a floating point type");
+ Type *IntTy =
+ IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+ auto *VecIntTy = FixedVectorType::get(IntTy, VF);
+ Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
-}
-
-void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
- BasicBlock *Bypass) {
- Value *Count = getOrCreateTripCount(L);
- // Reuse existing vector loop preheader for TC checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- IRBuilder<> Builder(TCCheckBlock->getTerminator());
-
- // Generate code to check if the loop's trip count is less than VF * UF, or
- // equal to it in case a scalar epilogue is required; this implies that the
- // vector trip count is zero. This check also covers the case where adding one
- // to the backedge-taken count overflowed leading to an incorrect trip count
- // of zero. In this case we will also jump to the scalar loop.
- auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
-
- // If tail is to be folded, vector loop takes care of all iterations.
- Value *CheckMinIters = Builder.getFalse();
+}
+
+void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(L);
+ // Reuse existing vector loop preheader for TC checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
+
+ // If tail is to be folded, vector loop takes care of all iterations.
+ Value *CheckMinIters = Builder.getFalse();
if (!Cost->foldTailByMasking()) {
Value *Step =
createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
}
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
- "vector.ph");
-
- assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
- DT->getNode(Bypass)->getIDom()) &&
- "TC check is expected to dominate Bypass");
-
- // Update dominator for Bypass & LoopExit.
- DT->changeImmediateDominator(Bypass, TCCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
-
- ReplaceInstWithInst(
- TCCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
- LoopBypassBlocks.push_back(TCCheckBlock);
-}
-
-void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
- // Reuse existing vector loop preheader for SCEV checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
-
-  // Generate the code to check the SCEV assumptions that we made.
- // We want the new basic block to start at the first instruction in a
- // sequence of instructions that form a check.
- SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
- "scev.check");
- Value *SCEVCheck = Exp.expandCodeForPredicate(
- &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
-
- if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
- if (C->isZero())
- return;
-
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
+ "vector.ph");
+
+ assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+ DT->getNode(Bypass)->getIDom()) &&
+ "TC check is expected to dominate Bypass");
+
+ // Update dominator for Bypass & LoopExit.
+ DT->changeImmediateDominator(Bypass, TCCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+ ReplaceInstWithInst(
+ TCCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ LoopBypassBlocks.push_back(TCCheckBlock);
+}
+
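
The bypass check above compares the scalar trip count against Step = VF*UF: with ULT the vector loop is skipped only when fewer than Step iterations remain, while ULE is used when a scalar epilogue is required so at least one scalar iteration is always left over; with tail folding the check is dropped entirely. A tiny sketch of that predicate selection (plain C++ rather than the IRBuilder calls, names invented):

#include <cstdio>

// Illustrative sketch, not LLVM code: should the scalar (bypass) loop be taken?
bool takeScalarLoop(unsigned long Count, unsigned VF, unsigned UF,
                    bool RequiresScalarEpilogue, bool FoldTailByMasking) {
  if (FoldTailByMasking)
    return false;                         // vector loop handles all iterations
  unsigned long Step = static_cast<unsigned long>(VF) * UF;
  return RequiresScalarEpilogue ? Count <= Step   // ICMP_ULE
                                : Count < Step;   // ICMP_ULT
}

int main() {
  std::printf("%d\n", takeScalarLoop(8, 4, 2, false, false)); // 0: exactly VF*UF is enough
  std::printf("%d\n", takeScalarLoop(8, 4, 2, true, false));  // 1: epilogue needs a spare iteration
  std::printf("%d\n", takeScalarLoop(3, 4, 2, false, true));  // 0: tail folding, no bypass
  return 0;
}
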
+void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+ // Reuse existing vector loop preheader for SCEV checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
+
+  // Generate the code to check the SCEV assumptions that we made.
+ // We want the new basic block to start at the first instruction in a
+ // sequence of instructions that form a check.
+ SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
+ "scev.check");
+ Value *SCEVCheck = Exp.expandCodeForPredicate(
+ &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
+
+ if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+ if (C->isZero())
+ return;
+
assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
(OptForSizeBasedOnProfile &&
Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
- "Cannot SCEV check stride or overflow when optimizing for size");
-
- SCEVCheckBlock->setName("vector.scevcheck");
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
- nullptr, "vector.ph");
-
- // Update dominator only if this is first RT check.
- if (LoopBypassBlocks.empty()) {
- DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
- }
-
- ReplaceInstWithInst(
- SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
- LoopBypassBlocks.push_back(SCEVCheckBlock);
- AddedSafetyChecks = true;
-}
-
-void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
- // VPlan-native path does not do any analysis for runtime checks currently.
- if (EnableVPlanNativePath)
- return;
-
- // Reuse existing vector loop preheader for runtime memory checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const MemCheckBlock = L->getLoopPreheader();
-
-  // Generate the code that checks at runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- auto *LAI = Legal->getLAI();
- const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
- if (!RtPtrChecking.Need)
- return;
-
+ "Cannot SCEV check stride or overflow when optimizing for size");
+
+ SCEVCheckBlock->setName("vector.scevcheck");
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
+ nullptr, "vector.ph");
+
+ // Update dominator only if this is first RT check.
+ if (LoopBypassBlocks.empty()) {
+ DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
+ }
+
+ ReplaceInstWithInst(
+ SCEVCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
+ LoopBypassBlocks.push_back(SCEVCheckBlock);
+ AddedSafetyChecks = true;
+}
+
+void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+ // VPlan-native path does not do any analysis for runtime checks currently.
+ if (EnableVPlanNativePath)
+ return;
+
+ // Reuse existing vector loop preheader for runtime memory checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const MemCheckBlock = L->getLoopPreheader();
+
+  // Generate the code that checks at runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ auto *LAI = Legal->getLAI();
+ const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
+ if (!RtPtrChecking.Need)
+ return;
+
if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
- assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
- "Cannot emit memory checks when optimizing for size, unless forced "
- "to vectorize.");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
- L->getStartLoc(), L->getHeader())
- << "Code-size may be reduced by not forcing "
- "vectorization, or by source-code modifications "
- "eliminating the need for runtime checks "
- "(e.g., adding 'restrict').";
- });
- }
-
- MemCheckBlock->setName("vector.memcheck");
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
- "vector.ph");
-
+ assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ "Cannot emit memory checks when optimizing for size, unless forced "
+ "to vectorize.");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
+ L->getStartLoc(), L->getHeader())
+ << "Code-size may be reduced by not forcing "
+ "vectorization, or by source-code modifications "
+ "eliminating the need for runtime checks "
+ "(e.g., adding 'restrict').";
+ });
+ }
+
+ MemCheckBlock->setName("vector.memcheck");
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
+ "vector.ph");
+
auto *CondBranch = cast<BranchInst>(
Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
LoopBypassBlocks.push_back(MemCheckBlock);
AddedSafetyChecks = true;
- // Update dominator only if this is first RT check.
- if (LoopBypassBlocks.empty()) {
- DT->changeImmediateDominator(Bypass, MemCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
- }
-
+ // Update dominator only if this is first RT check.
+ if (LoopBypassBlocks.empty()) {
+ DT->changeImmediateDominator(Bypass, MemCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
+ }
+
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
@@ -3222,128 +3222,128 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
"claimed checks are required");
CondBranch->setCondition(MemRuntimeCheck);
-
- // We currently don't use LoopVersioning for the actual loop cloning but we
- // still use it to add the noalias metadata.
+
+ // We currently don't use LoopVersioning for the actual loop cloning but we
+ // still use it to add the noalias metadata.
LVer = std::make_unique<LoopVersioning>(
*Legal->getLAI(),
Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
DT, PSE.getSE());
- LVer->prepareNoAliasMetadata();
-}
-
-Value *InnerLoopVectorizer::emitTransformedIndex(
- IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
- const InductionDescriptor &ID) const {
-
- SCEVExpander Exp(*SE, DL, "induction");
- auto Step = ID.getStep();
- auto StartValue = ID.getStartValue();
- assert(Index->getType() == Step->getType() &&
- "Index type does not match StepValue type");
-
- // Note: the IR at this point is broken. We cannot use SE to create any new
- // SCEV and then expand it, hoping that SCEV's simplification will give us
-  // more optimal code. Unfortunately, attempting to do so on invalid IR may
- // lead to various SCEV crashes. So all we can do is to use builder and rely
- // on InstCombine for future simplifications. Here we handle some trivial
- // cases only.
- auto CreateAdd = [&B](Value *X, Value *Y) {
- assert(X->getType() == Y->getType() && "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isZero())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isZero())
- return X;
- return B.CreateAdd(X, Y);
- };
-
- auto CreateMul = [&B](Value *X, Value *Y) {
- assert(X->getType() == Y->getType() && "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isOne())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isOne())
- return X;
- return B.CreateMul(X, Y);
- };
-
- // Get a suitable insert point for SCEV expansion. For blocks in the vector
- // loop, choose the end of the vector loop header (=LoopVectorBody), because
- // the DomTree is not kept up-to-date for additional blocks generated in the
- // vector loop. By using the header as insertion point, we guarantee that the
- // expanded instructions dominate all their uses.
- auto GetInsertPoint = [this, &B]() {
- BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
- if (InsertBB != LoopVectorBody &&
- LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
- return LoopVectorBody->getTerminator();
- return &*B.GetInsertPoint();
- };
- switch (ID.getKind()) {
- case InductionDescriptor::IK_IntInduction: {
- assert(Index->getType() == StartValue->getType() &&
- "Index type does not match StartValue type");
- if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
- return B.CreateSub(StartValue, Index);
- auto *Offset = CreateMul(
- Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
- return CreateAdd(StartValue, Offset);
- }
- case InductionDescriptor::IK_PtrInduction: {
- assert(isa<SCEVConstant>(Step) &&
- "Expected constant step for pointer induction");
- return B.CreateGEP(
- StartValue->getType()->getPointerElementType(), StartValue,
- CreateMul(Index,
- Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
- }
- case InductionDescriptor::IK_FpInduction: {
- assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
- auto InductionBinOp = ID.getInductionBinOp();
- assert(InductionBinOp &&
- (InductionBinOp->getOpcode() == Instruction::FAdd ||
- InductionBinOp->getOpcode() == Instruction::FSub) &&
- "Original bin op should be defined for FP induction");
-
- Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
-
- // Floating point operations had to be 'fast' to enable the induction.
- FastMathFlags Flags;
- Flags.setFast();
-
- Value *MulExp = B.CreateFMul(StepValue, Index);
- if (isa<Instruction>(MulExp))
- // We have to check, the MulExp may be a constant.
- cast<Instruction>(MulExp)->setFastMathFlags(Flags);
-
- Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
- "induction");
- if (isa<Instruction>(BOp))
- cast<Instruction>(BOp)->setFastMathFlags(Flags);
-
- return BOp;
- }
- case InductionDescriptor::IK_NoInduction:
- return nullptr;
- }
- llvm_unreachable("invalid enum");
-}
-
+ LVer->prepareNoAliasMetadata();
+}
+
+Value *InnerLoopVectorizer::emitTransformedIndex(
+ IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
+ const InductionDescriptor &ID) const {
+
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto Step = ID.getStep();
+ auto StartValue = ID.getStartValue();
+ assert(Index->getType() == Step->getType() &&
+ "Index type does not match StepValue type");
+
+ // Note: the IR at this point is broken. We cannot use SE to create any new
+ // SCEV and then expand it, hoping that SCEV's simplification will give us
+  // more optimal code. Unfortunately, attempting to do so on invalid IR may
+ // lead to various SCEV crashes. So all we can do is to use builder and rely
+ // on InstCombine for future simplifications. Here we handle some trivial
+ // cases only.
+ auto CreateAdd = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isZero())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isZero())
+ return X;
+ return B.CreateAdd(X, Y);
+ };
+
+ auto CreateMul = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isOne())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isOne())
+ return X;
+ return B.CreateMul(X, Y);
+ };
+
+ // Get a suitable insert point for SCEV expansion. For blocks in the vector
+ // loop, choose the end of the vector loop header (=LoopVectorBody), because
+ // the DomTree is not kept up-to-date for additional blocks generated in the
+ // vector loop. By using the header as insertion point, we guarantee that the
+ // expanded instructions dominate all their uses.
+ auto GetInsertPoint = [this, &B]() {
+ BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
+ if (InsertBB != LoopVectorBody &&
+ LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
+ return LoopVectorBody->getTerminator();
+ return &*B.GetInsertPoint();
+ };
+ switch (ID.getKind()) {
+ case InductionDescriptor::IK_IntInduction: {
+ assert(Index->getType() == StartValue->getType() &&
+ "Index type does not match StartValue type");
+ if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ auto *Offset = CreateMul(
+ Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
+ return CreateAdd(StartValue, Offset);
+ }
+ case InductionDescriptor::IK_PtrInduction: {
+ assert(isa<SCEVConstant>(Step) &&
+ "Expected constant step for pointer induction");
+ return B.CreateGEP(
+ StartValue->getType()->getPointerElementType(), StartValue,
+ CreateMul(Index,
+ Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
+ }
+ case InductionDescriptor::IK_FpInduction: {
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ auto InductionBinOp = ID.getInductionBinOp();
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setFast();
+
+ Value *MulExp = B.CreateFMul(StepValue, Index);
+ if (isa<Instruction>(MulExp))
+      // We have to check because MulExp may be a constant.
+ cast<Instruction>(MulExp)->setFastMathFlags(Flags);
+
+ Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
+ "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+
+ return BOp;
+ }
+ case InductionDescriptor::IK_NoInduction:
+ return nullptr;
+ }
+ llvm_unreachable("invalid enum");
+}
+
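A minimal scalar sketch of the mapping emitTransformedIndex builds for each induction kind, assuming a scalar Index and a constant Step; the helper names below are illustrative only, not LLVM APIs.

#include <cstdint>

// Integer induction: Start + Index * Step (a step of -1 folds to Start - Index).
int64_t transformedIntIndex(int64_t Start, int64_t Index, int64_t Step) {
  if (Step == -1)
    return Start - Index;
  return Start + Index * Step;
}

// Pointer induction: advance the start pointer by Index * Step elements,
// which is what the CreateGEP call above expresses in IR.
template <typename T>
T *transformedPtrIndex(T *Start, int64_t Index, int64_t Step) {
  return Start + Index * Step;
}

// FP induction: Start fadd/fsub (Step * Index), mirroring the original binop.
double transformedFpIndex(double Start, double Index, double Step, bool IsFSub) {
  double Mul = Step * Index;
  return IsFSub ? Start - Mul : Start + Mul;
}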
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
- LoopScalarBody = OrigLoop->getHeader();
- LoopVectorPreHeader = OrigLoop->getLoopPreheader();
+ LoopScalarBody = OrigLoop->getHeader();
+ LoopVectorPreHeader = OrigLoop->getLoopPreheader();
LoopExitBlock = OrigLoop->getUniqueExitBlock();
- assert(LoopExitBlock && "Must have an exit block");
- assert(LoopVectorPreHeader && "Invalid loop structure");
-
- LoopMiddleBlock =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+ assert(LoopExitBlock && "Must have an exit block");
+ assert(LoopVectorPreHeader && "Invalid loop structure");
+
+ LoopMiddleBlock =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
LI, nullptr, Twine(Prefix) + "middle.block");
- LoopScalarPreHeader =
- SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
+ LoopScalarPreHeader =
+ SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
// Set up branch from middle block to the exit and scalar preheader blocks.
@@ -3355,31 +3355,31 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
- // We intentionally don't let SplitBlock to update LoopInfo since
- // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
- // LoopVectorBody is explicitly added to the correct place few lines later.
- LoopVectorBody =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+  // We intentionally don't let SplitBlock update LoopInfo, since
+  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
+  // LoopVectorBody is explicitly added to the correct place a few lines later.
+ LoopVectorBody =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
nullptr, nullptr, Twine(Prefix) + "vector.body");
-
- // Update dominator for loop exit.
- DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
-
- // Create and register the new vector loop.
- Loop *Lp = LI->AllocateLoop();
- Loop *ParentLoop = OrigLoop->getParentLoop();
-
- // Insert the new loop into the loop nest and register the new basic blocks
- // before calling any utilities such as SCEV that require valid LoopInfo.
- if (ParentLoop) {
- ParentLoop->addChildLoop(Lp);
- } else {
- LI->addTopLevelLoop(Lp);
- }
- Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
+
+ // Update dominator for loop exit.
+ DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+ // Create and register the new vector loop.
+ Loop *Lp = LI->AllocateLoop();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks
+ // before calling any utilities such as SCEV that require valid LoopInfo.
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
+ } else {
+ LI->addTopLevelLoop(Lp);
+ }
+ Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
return Lp;
}
-
+
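The block structure created here can be read as the following scalar model, a sketch assuming a fixed chunk size VFxUF > 0; the function and the element-wise work are placeholders, not the vectorizer's output.

#include <cstddef>

void skeletonModel(float *A, const float *B, size_t N, size_t VFxUF) {
  size_t I = 0;
  // vector.body: runs while a full VF*UF chunk of iterations remains.
  for (; I + VFxUF <= N; I += VFxUF)
    for (size_t J = 0; J < VFxUF; ++J) // stands in for one wide instruction
      A[I + J] = B[I + J] * 2.0f;
  // middle.block: decides whether the scalar remainder must run at all.
  // scalar loop: the original loop finishes the leftover iterations.
  for (; I < N; ++I)
    A[I] = B[I] * 2.0f;
}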
void InnerLoopVectorizer::createInductionResumeValues(
Loop *L, Value *VectorTripCount,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -3387,37 +3387,37 @@ void InnerLoopVectorizer::createInductionResumeValues(
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
- // We are going to resume the execution of the scalar loop.
- // Go over all of the induction variables that we found and fix the
- // PHIs that are left in the scalar version of the loop.
- // The starting values of PHI nodes depend on the counter of the last
- // iteration in the vectorized loop.
- // If we come from a bypass edge then we need to start from the original
- // start value.
- for (auto &InductionEntry : Legal->getInductionVars()) {
- PHINode *OrigPhi = InductionEntry.first;
- InductionDescriptor II = InductionEntry.second;
-
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
- Value *&EndValue = IVEndValues[OrigPhi];
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+ for (auto &InductionEntry : Legal->getInductionVars()) {
+ PHINode *OrigPhi = InductionEntry.first;
+ InductionDescriptor II = InductionEntry.second;
+
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal =
+ PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+ LoopScalarPreHeader->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ Value *&EndValue = IVEndValues[OrigPhi];
Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
- if (OrigPhi == OldInduction) {
- // We know what the end value is.
+ if (OrigPhi == OldInduction) {
+ // We know what the end value is.
EndValue = VectorTripCount;
- } else {
+ } else {
IRBuilder<> B(L->getLoopPreheader()->getTerminator());
- Type *StepType = II.getStep()->getType();
- Instruction::CastOps CastOp =
+ Type *StepType = II.getStep()->getType();
+ Instruction::CastOps CastOp =
CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
- const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
- EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
- EndValue->setName("ind.end");
+ const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
+ EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
@@ -3430,84 +3430,84 @@ void InnerLoopVectorizer::createInductionResumeValues(
emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
EndValueFromAdditionalBypass->setName("ind.end");
}
- }
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
-
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : LoopBypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ }
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+
+ // Fix the scalar body counter (PHI node).
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ for (BasicBlock *BB : LoopBypassBlocks)
+ BCResumeVal->addIncoming(II.getStartValue(), BB);
if (AdditionalBypass.first)
BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
EndValueFromAdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
- }
+ OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ }
}
-
+
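The bc.resume.val PHIs built above merge two values. A minimal sketch for a simple integer induction, assuming the end value is Start + VectorTripCount * Step; the names below are hypothetical.

#include <cstdint>

int64_t resumeValue(bool CameFromVectorLoop, int64_t Start, int64_t Step,
                    int64_t VectorTripCount) {
  // Corresponds to EndValue = emitTransformedIndex(B, CRD, ...) above.
  int64_t EndValue = Start + VectorTripCount * Step;
  // Corresponds to the bc.resume.val PHI: the end value when arriving from
  // the middle block, the original start value along any bypass edge.
  return CameFromVectorLoop ? EndValue : Start;
}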
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
MDNode *OrigLoopID) {
assert(L && "Expected valid loop.");
-
+
// The trip counts should be cached by now.
Value *Count = getOrCreateTripCount(L);
Value *VectorTripCount = getOrCreateVectorTripCount(L);
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
- // Add a check in the middle block to see if we have completed
- // all of the iterations in the first vector loop.
- // If (N - N%VF) == N, then we *don't* need to run the remainder.
- // If tail is to be folded, we know we don't need to run the remainder.
- if (!Cost->foldTailByMasking()) {
+ // Add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop.
+ // If (N - N%VF) == N, then we *don't* need to run the remainder.
+ // If tail is to be folded, we know we don't need to run the remainder.
+ if (!Cost->foldTailByMasking()) {
Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
Count, VectorTripCount, "cmp.n",
LoopMiddleBlock->getTerminator());
-
+
// Here we use the same DebugLoc as the scalar loop latch terminator instead
- // of the corresponding compare because they may have ended up with
- // different line numbers and we want to avoid awkward line stepping while
- // debugging. Eg. if the compare has got a line number inside the loop.
+ // of the corresponding compare because they may have ended up with
+ // different line numbers and we want to avoid awkward line stepping while
+    // debugging, e.g. if the compare has a line number inside the loop.
CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
- }
-
- // Get ready to start creating new instructions into the vectorized body.
+ }
+
+ // Get ready to start creating new instructions into the vectorized body.
assert(LoopVectorPreHeader == L->getLoopPreheader() &&
- "Inconsistent vector loop preheader");
- Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
-
- Optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
- if (VectorizedLoopID.hasValue()) {
+ "Inconsistent vector loop preheader");
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+
+ Optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+ if (VectorizedLoopID.hasValue()) {
L->setLoopID(VectorizedLoopID.getValue());
-
- // Do not setAlreadyVectorized if loop attributes have been defined
- // explicitly.
- return LoopVectorPreHeader;
- }
-
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
+
+ // Do not setAlreadyVectorized if loop attributes have been defined
+ // explicitly.
+ return LoopVectorPreHeader;
+ }
+
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
L->setLoopID(LID);
-
+
LoopVectorizeHints Hints(L, true, *ORE);
- Hints.setAlreadyVectorized();
-
-#ifdef EXPENSIVE_CHECKS
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
- LI->verify(*DT);
-#endif
-
- return LoopVectorPreHeader;
-}
-
+ Hints.setAlreadyVectorized();
+
+#ifdef EXPENSIVE_CHECKS
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+ LI->verify(*DT);
+#endif
+
+ return LoopVectorPreHeader;
+}
+
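The cmp.n check added in the middle block reduces to the arithmetic below, shown as a sketch with plain integers standing in for the trip-count values and assuming VFxUF > 0.

#include <cstdint>

bool canSkipRemainder(uint64_t Count, uint64_t VFxUF) {
  uint64_t VectorTripCount = Count - Count % VFxUF; // N - N % VF
  return VectorTripCount == Count;                  // the "cmp.n" condition
}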
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
@@ -3593,376 +3593,376 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
return completeLoopSkeleton(Lp, OrigLoopID);
}
-// Fix up external users of the induction variable. At this point, we are
-// in LCSSA form, with all external PHIs that use the IV having one input value,
-// coming from the remainder loop. We need those PHIs to also have a correct
-// value for the IV when arriving directly from the middle block.
-void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
- const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock) {
- // There are two kinds of external IV usages - those that use the value
- // computed in the last iteration (the PHI) and those that use the penultimate
- // value (the value that feeds into the phi from the loop latch).
- // We allow both, but they, obviously, have different values.
-
+// Fix up external users of the induction variable. At this point, we are
+// in LCSSA form, with all external PHIs that use the IV having one input value,
+// coming from the remainder loop. We need those PHIs to also have a correct
+// value for the IV when arriving directly from the middle block.
+void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
+ const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock) {
+ // There are two kinds of external IV usages - those that use the value
+ // computed in the last iteration (the PHI) and those that use the penultimate
+ // value (the value that feeds into the phi from the loop latch).
+ // We allow both, but they, obviously, have different values.
+
assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
-
- DenseMap<Value *, Value *> MissingVals;
-
- // An external user of the last iteration's value should see the value that
- // the remainder loop uses to initialize its own IV.
- Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
- for (User *U : PostInc->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (!OrigLoop->contains(UI)) {
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
- MissingVals[UI] = EndValue;
- }
- }
-
- // An external user of the penultimate value need to see EndValue - Step.
- // The simplest way to get this is to recompute it from the constituent SCEVs,
- // that is Start + (Step * (CRD - 1)).
- for (User *U : OrigPhi->users()) {
- auto *UI = cast<Instruction>(U);
- if (!OrigLoop->contains(UI)) {
- const DataLayout &DL =
- OrigLoop->getHeader()->getModule()->getDataLayout();
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
-
- IRBuilder<> B(MiddleBlock->getTerminator());
- Value *CountMinusOne = B.CreateSub(
- CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
- Value *CMO =
- !II.getStep()->getType()->isIntegerTy()
- ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
- II.getStep()->getType())
- : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
- CMO->setName("cast.cmo");
- Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
- Escape->setName("ind.escape");
- MissingVals[UI] = Escape;
- }
- }
-
- for (auto &I : MissingVals) {
- PHINode *PHI = cast<PHINode>(I.first);
- // One corner case we have to handle is two IVs "chasing" each-other,
- // that is %IV2 = phi [...], [ %IV1, %latch ]
- // In this case, if IV1 has an external use, we need to avoid adding both
- // "last value of IV1" and "penultimate value of IV2". So, verify that we
- // don't already have an incoming value for the middle block.
- if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
- PHI->addIncoming(I.second, MiddleBlock);
- }
-}
-
-namespace {
-
-struct CSEDenseMapInfo {
- static bool canHandle(const Instruction *I) {
- return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
- }
-
- static inline Instruction *getEmptyKey() {
- return DenseMapInfo<Instruction *>::getEmptyKey();
- }
-
- static inline Instruction *getTombstoneKey() {
- return DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(const Instruction *I) {
- assert(canHandle(I) && "Unknown instruction!");
- return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
- I->value_op_end()));
- }
-
- static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
- if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
- LHS == getTombstoneKey() || RHS == getTombstoneKey())
- return LHS == RHS;
- return LHS->isIdenticalTo(RHS);
- }
-};
-
-} // end anonymous namespace
-
-///Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
- // Perform simple cse.
- SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = &*I++;
-
- if (!CSEDenseMapInfo::canHandle(In))
- continue;
-
- // Check if we can replace this instruction with any of the
- // visited instructions.
- if (Instruction *V = CSEMap.lookup(In)) {
- In->replaceAllUsesWith(V);
- In->eraseFromParent();
- continue;
- }
-
- CSEMap[In] = In;
- }
-}
-
+
+ DenseMap<Value *, Value *> MissingVals;
+
+ // An external user of the last iteration's value should see the value that
+ // the remainder loop uses to initialize its own IV.
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+ for (User *U : PostInc->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+ MissingVals[UI] = EndValue;
+ }
+ }
+
+  // An external user of the penultimate value needs to see EndValue - Step.
+ // The simplest way to get this is to recompute it from the constituent SCEVs,
+ // that is Start + (Step * (CRD - 1)).
+ for (User *U : OrigPhi->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ const DataLayout &DL =
+ OrigLoop->getHeader()->getModule()->getDataLayout();
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+ IRBuilder<> B(MiddleBlock->getTerminator());
+ Value *CountMinusOne = B.CreateSub(
+ CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+ Value *CMO =
+ !II.getStep()->getType()->isIntegerTy()
+ ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
+ II.getStep()->getType())
+ : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
+ CMO->setName("cast.cmo");
+ Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
+ Escape->setName("ind.escape");
+ MissingVals[UI] = Escape;
+ }
+ }
+
+ for (auto &I : MissingVals) {
+ PHINode *PHI = cast<PHINode>(I.first);
+    // One corner case we have to handle is two IVs "chasing" each other,
+ // that is %IV2 = phi [...], [ %IV1, %latch ]
+ // In this case, if IV1 has an external use, we need to avoid adding both
+ // "last value of IV1" and "penultimate value of IV2". So, verify that we
+ // don't already have an incoming value for the middle block.
+ if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+ PHI->addIncoming(I.second, MiddleBlock);
+ }
+}
+
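A small numeric sketch of the two external IV values fixed up above, assuming an integer induction Start + k * Step and a rounded-down trip count CRD; the struct and function are hypothetical.

#include <cstdint>

struct ExternalIVValues {
  int64_t LastValue;        // seen by users of the post-increment value
  int64_t PenultimateValue; // seen by users of the PHI itself
};

ExternalIVValues externalIVValues(int64_t Start, int64_t Step, int64_t CRD) {
  // LastValue matches EndValue; PenultimateValue is Start + (CRD - 1) * Step.
  return {Start + CRD * Step, Start + (CRD - 1) * Step};
}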
+namespace {
+
+struct CSEDenseMapInfo {
+ static bool canHandle(const Instruction *I) {
+ return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+ }
+
+ static inline Instruction *getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline Instruction *getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(const Instruction *I) {
+ assert(canHandle(I) && "Unknown instruction!");
+ return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+ I->value_op_end()));
+ }
+
+ static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
+ if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+ LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+};
+
+} // end anonymous namespace
+
+/// Perform CSE of induction variable instructions.
+static void cse(BasicBlock *BB) {
+ // Perform simple cse.
+ SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = &*I++;
+
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+
+ CSEMap[In] = In;
+ }
+}
+
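The CSE above hashes an instruction's opcode and operands and replaces later duplicates with the first occurrence. A self-contained sketch of the same idea, using strings as stand-ins for instructions.

#include <string>
#include <unordered_map>
#include <vector>

std::vector<std::string> simpleCSE(const std::vector<std::string> &Insts) {
  std::unordered_map<std::string, std::string> Seen; // mirrors CSEMap
  std::vector<std::string> Out;
  for (const std::string &In : Insts) {
    if (Seen.count(In))
      continue; // in IR this would be replaceAllUsesWith + eraseFromParent
    Seen.emplace(In, In);
    Out.push_back(In);
  }
  return Out;
}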
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Function *F = CI->getCalledFunction();
- Type *ScalarRetTy = CI->getType();
- SmallVector<Type *, 4> Tys, ScalarTys;
- for (auto &ArgOp : CI->arg_operands())
- ScalarTys.push_back(ArgOp->getType());
-
- // Estimate cost of scalarized vector call. The source operands are assumed
- // to be vectors, so we need to extract individual elements from there,
- // execute VF scalar calls, and then gather the result into the vector return
- // value.
+ Function *F = CI->getCalledFunction();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ for (auto &ArgOp : CI->arg_operands())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Estimate cost of scalarized vector call. The source operands are assumed
+ // to be vectors, so we need to extract individual elements from there,
+ // execute VF scalar calls, and then gather the result into the vector return
+ // value.
InstructionCost ScalarCallCost =
TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
if (VF.isScalar())
- return ScalarCallCost;
-
- // Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
- for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
-
- // Compute costs of unpacking argument values for the scalar calls and
- // packing the return values to a vector.
+ return ScalarCallCost;
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
-
+
InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
- // If we can't emit a vector call for this function, then the currently found
- // cost is the cost we need to return.
- NeedToScalarize = true;
+
+ // If we can't emit a vector call for this function, then the currently found
+ // cost is the cost we need to return.
+ NeedToScalarize = true;
VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
- if (!TLI || CI->isNoBuiltin() || !VecFunc)
- return Cost;
-
- // If the corresponding vector cost is cheaper, return its cost.
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!TLI || CI->isNoBuiltin() || !VecFunc)
+ return Cost;
+
+ // If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =
TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
- if (VectorCallCost < Cost) {
- NeedToScalarize = false;
+ if (VectorCallCost < Cost) {
+ NeedToScalarize = false;
Cost = VectorCallCost;
- }
- return Cost;
-}
-
+ }
+ return Cost;
+}
+
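The cost comparison in getVectorCallCost amounts to the arithmetic below, a sketch that assumes the individual costs are already known as plain integers.

#include <cstdint>

uint64_t chooseCallCost(uint64_t ScalarCallCost, uint64_t VF,
                        uint64_t ScalarizationCost, bool HasVectorVariant,
                        uint64_t VectorCallCost, bool &NeedToScalarize) {
  // Scalarized cost: VF scalar calls plus extract/insert overhead.
  uint64_t Cost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  // A vector library call is used only if it is strictly cheaper.
  if (HasVectorVariant && VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}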
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- assert(ID && "Expected intrinsic call!");
-
- IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
- return TTI.getIntrinsicInstrCost(CostAttrs,
- TargetTransformInfo::TCK_RecipThroughput);
-}
-
-static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
-}
-
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
-void InnerLoopVectorizer::truncateToMinimalBitwidths() {
- // For every instruction `I` in MinBWs, truncate the operands, create a
- // truncated version of `I` and reextend its result. InstCombine runs
- // later and will remove any ext/trunc pairs.
- SmallPtrSet<Value *, 4> Erased;
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from VectorLoopValueMap indicates that it
- // wasn't vectorized.
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = getOrCreateVectorValue(KV.first, Part);
- if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
- continue;
- Type *OriginalTy = I->getType();
- Type *ScalarTruncatedTy =
- IntegerType::get(OriginalTy->getContext(), KV.second);
- auto *TruncatedTy = FixedVectorType::get(
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Expected intrinsic call!");
+
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
+ return TTI.getIntrinsicInstrCost(CostAttrs,
+ TargetTransformInfo::TCK_RecipThroughput);
+}
+
+static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
+ return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
+}
+
+static Type *largestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
+ return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
+}
+
+void InnerLoopVectorizer::truncateToMinimalBitwidths() {
+ // For every instruction `I` in MinBWs, truncate the operands, create a
+ // truncated version of `I` and reextend its result. InstCombine runs
+ // later and will remove any ext/trunc pairs.
+ SmallPtrSet<Value *, 4> Erased;
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
+ continue;
+ Type *OriginalTy = I->getType();
+ Type *ScalarTruncatedTy =
+ IntegerType::get(OriginalTy->getContext(), KV.second);
+ auto *TruncatedTy = FixedVectorType::get(
ScalarTruncatedTy,
cast<FixedVectorType>(OriginalTy)->getNumElements());
- if (TruncatedTy == OriginalTy)
- continue;
-
- IRBuilder<> B(cast<Instruction>(I));
- auto ShrinkOperand = [&](Value *V) -> Value * {
- if (auto *ZI = dyn_cast<ZExtInst>(V))
- if (ZI->getSrcTy() == TruncatedTy)
- return ZI->getOperand(0);
- return B.CreateZExtOrTrunc(V, TruncatedTy);
- };
-
- // The actual instruction modification depends on the instruction type,
- // unfortunately.
- Value *NewI = nullptr;
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
- ShrinkOperand(BO->getOperand(1)));
-
- // Any wrapping introduced by shrinking this operation shouldn't be
- // considered undefined behavior. So, we can't unconditionally copy
- // arithmetic wrapping flags to NewI.
- cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
- } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
- NewI =
- B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
- ShrinkOperand(CI->getOperand(1)));
- } else if (auto *SI = dyn_cast<SelectInst>(I)) {
- NewI = B.CreateSelect(SI->getCondition(),
- ShrinkOperand(SI->getTrueValue()),
- ShrinkOperand(SI->getFalseValue()));
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- switch (CI->getOpcode()) {
- default:
- llvm_unreachable("Unhandled cast!");
- case Instruction::Trunc:
- NewI = ShrinkOperand(CI->getOperand(0));
- break;
- case Instruction::SExt:
- NewI = B.CreateSExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- case Instruction::ZExt:
- NewI = B.CreateZExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- }
- } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ if (TruncatedTy == OriginalTy)
+ continue;
+
+ IRBuilder<> B(cast<Instruction>(I));
+ auto ShrinkOperand = [&](Value *V) -> Value * {
+ if (auto *ZI = dyn_cast<ZExtInst>(V))
+ if (ZI->getSrcTy() == TruncatedTy)
+ return ZI->getOperand(0);
+ return B.CreateZExtOrTrunc(V, TruncatedTy);
+ };
+
+ // The actual instruction modification depends on the instruction type,
+ // unfortunately.
+ Value *NewI = nullptr;
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
+ ShrinkOperand(BO->getOperand(1)));
+
+ // Any wrapping introduced by shrinking this operation shouldn't be
+ // considered undefined behavior. So, we can't unconditionally copy
+ // arithmetic wrapping flags to NewI.
+ cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
+ } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
+ NewI =
+ B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
+ ShrinkOperand(CI->getOperand(1)));
+ } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+ NewI = B.CreateSelect(SI->getCondition(),
+ ShrinkOperand(SI->getTrueValue()),
+ ShrinkOperand(SI->getFalseValue()));
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ switch (CI->getOpcode()) {
+ default:
+ llvm_unreachable("Unhandled cast!");
+ case Instruction::Trunc:
+ NewI = ShrinkOperand(CI->getOperand(0));
+ break;
+ case Instruction::SExt:
+ NewI = B.CreateSExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ case Instruction::ZExt:
+ NewI = B.CreateZExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ }
+ } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
->getNumElements();
- auto *O0 = B.CreateZExtOrTrunc(
- SI->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements0));
+ auto *O0 = B.CreateZExtOrTrunc(
+ SI->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements0));
auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
->getNumElements();
- auto *O1 = B.CreateZExtOrTrunc(
- SI->getOperand(1),
- FixedVectorType::get(ScalarTruncatedTy, Elements1));
-
- NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
- } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
- // Don't do anything with the operands, just extend the result.
- continue;
- } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ auto *O1 = B.CreateZExtOrTrunc(
+ SI->getOperand(1),
+ FixedVectorType::get(ScalarTruncatedTy, Elements1));
+
+ NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
+ } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
+ // Don't do anything with the operands, just extend the result.
+ continue;
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
->getNumElements();
- auto *O0 = B.CreateZExtOrTrunc(
- IE->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements));
- auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
- NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
- } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ auto *O0 = B.CreateZExtOrTrunc(
+ IE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
+ auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
+ NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
->getNumElements();
- auto *O0 = B.CreateZExtOrTrunc(
- EE->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements));
- NewI = B.CreateExtractElement(O0, EE->getOperand(2));
- } else {
- // If we don't know what to do, be conservative and don't do anything.
- continue;
- }
-
- // Lastly, extend the result.
- NewI->takeName(cast<Instruction>(I));
- Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
- I->replaceAllUsesWith(Res);
- cast<Instruction>(I)->eraseFromParent();
- Erased.insert(I);
- VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
- }
- }
-
- // We'll have created a bunch of ZExts that are now parentless. Clean up.
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from VectorLoopValueMap indicates that it
- // wasn't vectorized.
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = getOrCreateVectorValue(KV.first, Part);
- ZExtInst *Inst = dyn_cast<ZExtInst>(I);
- if (Inst && Inst->use_empty()) {
- Value *NewI = Inst->getOperand(0);
- Inst->eraseFromParent();
- VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
- }
- }
- }
-}
-
-void InnerLoopVectorizer::fixVectorizedLoop() {
- // Insert truncates and extends for any truncated instructions as hints to
- // InstCombine.
+ auto *O0 = B.CreateZExtOrTrunc(
+ EE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
+ NewI = B.CreateExtractElement(O0, EE->getOperand(2));
+ } else {
+ // If we don't know what to do, be conservative and don't do anything.
+ continue;
+ }
+
+ // Lastly, extend the result.
+ NewI->takeName(cast<Instruction>(I));
+ Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
+ I->replaceAllUsesWith(Res);
+ cast<Instruction>(I)->eraseFromParent();
+ Erased.insert(I);
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
+ }
+ }
+
+ // We'll have created a bunch of ZExts that are now parentless. Clean up.
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ ZExtInst *Inst = dyn_cast<ZExtInst>(I);
+ if (Inst && Inst->use_empty()) {
+ Value *NewI = Inst->getOperand(0);
+ Inst->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
+ }
+ }
+ }
+}
+
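A scalar illustration of the minimal-bitwidth rewrite performed above, assuming the cost model has proven that only the low 8 bits of the result matter: the operation is done in the narrow type and the result is zero-extended back.

#include <cstdint>

uint32_t addInMinimalBitwidth(uint32_t A, uint32_t B) {
  uint8_t NarrowA = static_cast<uint8_t>(A);                   // trunc
  uint8_t NarrowB = static_cast<uint8_t>(B);                   // trunc
  uint8_t NarrowSum = static_cast<uint8_t>(NarrowA + NarrowB); // narrow op
  return static_cast<uint32_t>(NarrowSum);                     // zext back
}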
+void InnerLoopVectorizer::fixVectorizedLoop() {
+ // Insert truncates and extends for any truncated instructions as hints to
+ // InstCombine.
if (VF.isVector())
- truncateToMinimalBitwidths();
-
- // Fix widened non-induction PHIs by setting up the PHI operands.
- if (OrigPHIsToFix.size()) {
- assert(EnableVPlanNativePath &&
- "Unexpected non-induction PHIs for fixup in non VPlan-native path");
- fixNonInductionPHIs();
- }
-
- // At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in the loop. These PHI
- // nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences.
- fixCrossIterationPHIs();
-
- // Forget the original basic block.
- PSE.getSE()->forgetLoop(OrigLoop);
-
- // Fix-up external users of the induction variables.
- for (auto &Entry : Legal->getInductionVars())
- fixupIVUsers(Entry.first, Entry.second,
- getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
- IVEndValues[Entry.first], LoopMiddleBlock);
-
- fixLCSSAPHIs();
- for (Instruction *PI : PredicatedInstructions)
- sinkScalarOperands(&*PI);
-
- // Remove redundant induction instructions.
- cse(LoopVectorBody);
-
- // Set/update profile weights for the vector and remainder loops as original
- // loop iterations are now distributed among them. Note that original loop
- // represented by LoopScalarBody becomes remainder loop after vectorization.
- //
- // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
- // end up getting slightly roughened result but that should be OK since
- // profile is not inherently precise anyway. Note also possible bypass of
- // vector code caused by legality checks is ignored, assigning all the weight
- // to the vector loop, optimistically.
+ truncateToMinimalBitwidths();
+
+ // Fix widened non-induction PHIs by setting up the PHI operands.
+ if (OrigPHIsToFix.size()) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected non-induction PHIs for fixup in non VPlan-native path");
+ fixNonInductionPHIs();
+ }
+
+ // At this point every instruction in the original loop is widened to a
+ // vector form. Now we need to fix the recurrences in the loop. These PHI
+ // nodes are currently empty because we did not want to introduce cycles.
+ // This is the second stage of vectorizing recurrences.
+ fixCrossIterationPHIs();
+
+ // Forget the original basic block.
+ PSE.getSE()->forgetLoop(OrigLoop);
+
+ // Fix-up external users of the induction variables.
+ for (auto &Entry : Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs();
+ for (Instruction *PI : PredicatedInstructions)
+ sinkScalarOperands(&*PI);
+
+ // Remove redundant induction instructions.
+ cse(LoopVectorBody);
+
+ // Set/update profile weights for the vector and remainder loops as original
+ // loop iterations are now distributed among them. Note that original loop
+ // represented by LoopScalarBody becomes remainder loop after vectorization.
+ //
+  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+  // end up getting a slightly roughened result, but that should be OK since
+  // the profile is not inherently precise anyway. Note also that a possible
+  // bypass of the vector code caused by legality checks is ignored,
+  // optimistically assigning all the weight to the vector loop.
//
// For scalable vectorization we can't know at compile time how many iterations
// of the loop are handled in one vector iteration, so instead assume a pessimistic
@@ -3970,199 +3970,199 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
setProfileInfoAfterUnrolling(
LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
-}
-
-void InnerLoopVectorizer::fixCrossIterationPHIs() {
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #2: We now need to fix the recurrences by adding incoming edges to
- // the currently empty PHI nodes. At this point every instruction in the
- // original loop is widened to a vector form so we can use them to construct
- // the incoming edges.
- for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
- // Handle first-order recurrences and reductions that need to be fixed.
- if (Legal->isFirstOrderRecurrence(&Phi))
- fixFirstOrderRecurrence(&Phi);
- else if (Legal->isReductionVariable(&Phi))
- fixReduction(&Phi);
- }
-}
-
-void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop.
- //
- // for (int i = 0; i < n; ++i)
- // b[i] = a[i] - a[i - 1];
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, ...
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // temporary value for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // x = v2(3)
- // br scalar.ph
- //
- // scalar.ph:
- // s_init = phi [x, middle.block], [a[-1], otherwise]
- // br scalar.body
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop.
-
- // Get the original loop preheader and single loop latch.
- auto *Preheader = OrigLoop->getLoopPreheader();
- auto *Latch = OrigLoop->getLoopLatch();
-
- // Get the initial and previous values of the scalar recurrence.
- auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
- auto *Previous = Phi->getIncomingValueForBlock(Latch);
-
- // Create a vector from the initial value.
- auto *VectorInit = ScalarInit;
+}
+
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #2: We now need to fix the recurrences by adding incoming edges to
+ // the currently empty PHI nodes. At this point every instruction in the
+ // original loop is widened to a vector form so we can use them to construct
+ // the incoming edges.
+ for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
+ // Handle first-order recurrences and reductions that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(&Phi))
+ fixFirstOrderRecurrence(&Phi);
+ else if (Legal->isReductionVariable(&Phi))
+ fixReduction(&Phi);
+ }
+}
+
+void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop.
+ //
+ // for (int i = 0; i < n; ++i)
+ // b[i] = a[i] - a[i - 1];
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, ...
+ //
+  // In this example, s1 is a recurrence because its value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // temporary value for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // x = v2(3)
+ // br scalar.ph
+ //
+ // scalar.ph:
+ // s_init = phi [x, middle.block], [a[-1], otherwise]
+ // br scalar.body
+ //
+ // After execution completes the vector loop, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop.
+
+ // Get the original loop preheader and single loop latch.
+ auto *Preheader = OrigLoop->getLoopPreheader();
+ auto *Latch = OrigLoop->getLoopLatch();
+
+ // Get the initial and previous values of the scalar recurrence.
+ auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
+ auto *Previous = Phi->getIncomingValueForBlock(Latch);
+
+ // Create a vector from the initial value.
+ auto *VectorInit = ScalarInit;
if (VF.isVector()) {
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- VectorInit = Builder.CreateInsertElement(
+ VectorInit = Builder.CreateInsertElement(
PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
- }
-
- // We constructed a temporary phi node in the first phase of vectorization.
- // This phi node will eventually be deleted.
- Builder.SetInsertPoint(
- cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
-
- // Create a phi node for the new recurrence. The current value will either be
- // the initial value inserted into a vector or loop-varying vector value.
- auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
- VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
-
- // Get the vectorized previous value of the last part UF - 1. It appears last
- // among all unrolled iterations, due to the order of their construction.
- Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
-
- // Find and set the insertion point after the previous value if it is an
- // instruction.
- BasicBlock::iterator InsertPt;
- // Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- // FIXME: Loop invariant values do not form recurrences. We should deal with
- // them earlier.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
- InsertPt = LoopVectorBody->getFirstInsertionPt();
- else {
- Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
- if (isa<PHINode>(PreviousLastPart))
- // If the previous value is a phi node, we should insert after all the phi
- // nodes in the block containing the PHI to avoid breaking basic block
- // verification. Note that the basic block may be different to
- // LoopVectorBody, in case we predicate the loop.
- InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
- else
- InsertPt = ++PreviousInst->getIterator();
- }
- Builder.SetInsertPoint(&*InsertPt);
-
- // We will construct a vector for the recurrence by combining the values for
- // the current and previous iterations. This is the required shuffle mask.
+ }
+
+ // We constructed a temporary phi node in the first phase of vectorization.
+ // This phi node will eventually be deleted.
+ Builder.SetInsertPoint(
+ cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
+
+ // Create a phi node for the new recurrence. The current value will either be
+ // the initial value inserted into a vector or loop-varying vector value.
+ auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
+ VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
+
+ // Get the vectorized previous value of the last part UF - 1. It appears last
+ // among all unrolled iterations, due to the order of their construction.
+ Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
+
+ // Find and set the insertion point after the previous value if it is an
+ // instruction.
+ BasicBlock::iterator InsertPt;
+ // Note that the previous value may have been constant-folded so it is not
+ // guaranteed to be an instruction in the vector loop.
+ // FIXME: Loop invariant values do not form recurrences. We should deal with
+ // them earlier.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
+ InsertPt = LoopVectorBody->getFirstInsertionPt();
+ else {
+ Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
+ if (isa<PHINode>(PreviousLastPart))
+ // If the previous value is a phi node, we should insert after all the phi
+ // nodes in the block containing the PHI to avoid breaking basic block
+ // verification. Note that the basic block may be different to
+ // LoopVectorBody, in case we predicate the loop.
+ InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
+ else
+ InsertPt = ++PreviousInst->getIterator();
+ }
+ Builder.SetInsertPoint(&*InsertPt);
+
+ // We will construct a vector for the recurrence by combining the values for
+ // the current and previous iterations. This is the required shuffle mask.
assert(!VF.isScalable());
SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
ShuffleMask[0] = VF.getKnownMinValue() - 1;
for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
-
- // The vector from which to take the initial value for the current iteration
- // (actual or unrolled). Initially, this is the vector phi node.
- Value *Incoming = VecPhi;
-
- // Shuffle the current and previous vector and update the vector parts.
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
- Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
+
+ // The vector from which to take the initial value for the current iteration
+ // (actual or unrolled). Initially, this is the vector phi node.
+ Value *Incoming = VecPhi;
+
+ // Shuffle the current and previous vector and update the vector parts.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
+ Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
auto *Shuffle =
VF.isVector()
? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
: Incoming;
- PhiPart->replaceAllUsesWith(Shuffle);
- cast<Instruction>(PhiPart)->eraseFromParent();
- VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
- Incoming = PreviousPart;
- }
-
- // Fix the latch value of the new recurrence in the vector loop.
- VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
-
- // Extract the last vector element in the middle block. This will be the
- // initial value for the recurrence when jumping to the scalar loop.
- auto *ExtractForScalar = Incoming;
+ PhiPart->replaceAllUsesWith(Shuffle);
+ cast<Instruction>(PhiPart)->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
+ Incoming = PreviousPart;
+ }
+
+ // Fix the latch value of the new recurrence in the vector loop.
+ VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+
+ // Extract the last vector element in the middle block. This will be the
+ // initial value for the recurrence when jumping to the scalar loop.
+ auto *ExtractForScalar = Incoming;
if (VF.isVector()) {
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- ExtractForScalar = Builder.CreateExtractElement(
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ ExtractForScalar = Builder.CreateExtractElement(
ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
"vector.recur.extract");
- }
- // Extract the second last element in the middle block if the
- // Phi is used outside the loop. We need to extract the phi itself
- // and not the last element (the phi update in the current iteration). This
- // will be the value when jumping to the exit block from the LoopMiddleBlock,
- // when the scalar loop is not run at all.
- Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ }
+ // Extract the second last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the LoopMiddleBlock,
+ // when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
if (VF.isVector())
- ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
"vector.recur.extract.for.phi");
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
- // `Incoming`. This is analogous to the vectorized case above: extracting the
- // second last element when VF > 1.
- else if (UF > 1)
- ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
-
- // Fix the initial value of the original recurrence in the scalar loop.
- Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
- auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
- for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
- Start->addIncoming(Incoming, BB);
- }
-
- Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
- Phi->setName("scalar.recur");
-
- // Finally, fix users of the recurrence outside the loop. The users will need
- // either the last value of the scalar recurrence or the last value of the
- // vector recurrence we extracted in the middle block. Since the loop is in
- // LCSSA form, we just need to find all the phi nodes for the original scalar
- // recurrence in the exit block, and then add an edge for the middle block.
+  // When the loop is unrolled without vectorizing, initialize
+  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
+  // value of `Incoming`. This is analogous to the vectorized case above:
+  // extracting the second-to-last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
+
+ // Fix the initial value of the original recurrence in the scalar loop.
+ Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
+ for (auto *BB : predecessors(LoopScalarPreHeader)) {
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
+ Start->addIncoming(Incoming, BB);
+ }
+
+ Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
+ Phi->setName("scalar.recur");
+
+ // Finally, fix users of the recurrence outside the loop. The users will need
+ // either the last value of the scalar recurrence or the last value of the
+ // vector recurrence we extracted in the middle block. Since the loop is in
+ // LCSSA form, we just need to find all the phi nodes for the original scalar
+ // recurrence in the exit block, and then add an edge for the middle block.
// Note that LCSSA does not imply single entry when the original scalar loop
// had multiple exiting edges (as we always run the last iteration in the
// scalar epilogue); in that case, the exiting path through middle will be
@@ -4170,67 +4170,67 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (any_of(LCSSAPhi.incoming_values(),
[Phi](Value *V) { return V == Phi; }))
- LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
-}
-
-void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
-  // Get its reduction variable descriptor.
- assert(Legal->isReductionVariable(Phi) &&
- "Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
-
+ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+}
+
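For orientation, here is a minimal standalone sketch (plain C++, not the LLVM API) of the lane choices described above, assuming VF = 4 and a toy array standing in for the widened recurrence update `Incoming`:

    #include <cstdio>

    int main() {
      constexpr int VF = 4;
      int incoming[VF] = {10, 20, 30, 40}; // hypothetical recurrence updates for one vector iteration

      // The scalar epilogue resumes the recurrence from the last lane, while a
      // user of the phi outside the loop needs the value the phi held in the
      // final scalar iteration, i.e. the second-to-last lane.
      int extractForScalar = incoming[VF - 1];             // cf. "vector.recur.extract"
      int extractForPhiUsedOutsideLoop = incoming[VF - 2]; // cf. "vector.recur.extract.for.phi"
      std::printf("%d %d\n", extractForScalar, extractForPhiUsedOutsideLoop); // 40 30
      return 0;
    }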
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+  // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
+
RecurKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- setDebugLocFromInst(Builder, ReductionStartValue);
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ setDebugLocFromInst(Builder, ReductionStartValue);
bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
-
- // This is the vector-clone of the value that leaves the loop.
- Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
-
- // Wrap flags are in general invalid after vectorization, clear them.
- clearReductionWrapFlags(RdxDesc);
-
- // Fix the vector-loop phi.
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
- Value *Val = getOrCreateVectorValue(LoopVal, Part);
- cast<PHINode>(VecRdxPhi)
- ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
- }
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
-
- setDebugLocFromInst(Builder, LoopExitInst);
-
- // If tail is folded by masking, the vector value to leave the loop should be
- // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
+
+ // This is the vector-clone of the value that leaves the loop.
+ Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
+
+ // Wrap flags are in general invalid after vectorization, clear them.
+ clearReductionWrapFlags(RdxDesc);
+
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
+ Value *Val = getOrCreateVectorValue(LoopVal, Part);
+ cast<PHINode>(VecRdxPhi)
+ ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+ setDebugLocFromInst(Builder, LoopExitInst);
+
+ // If tail is folded by masking, the vector value to leave the loop should be
+ // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
// be predicated, and does not need to be handled here.
if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecLoopExitInst =
- VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
- Value *Sel = nullptr;
- for (User *U : VecLoopExitInst->users()) {
- if (isa<SelectInst>(U)) {
- assert(!Sel && "Reduction exit feeding two selects");
- Sel = U;
- } else
- assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
- }
- assert(Sel && "Reduction exit feeds no select");
- VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecLoopExitInst =
+ VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Sel = nullptr;
+ for (User *U : VecLoopExitInst->users()) {
+ if (isa<SelectInst>(U)) {
+ assert(!Sel && "Reduction exit feeding two selects");
+ Sel = U;
+ } else
+ assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
+ }
+ assert(Sel && "Reduction exit feeds no select");
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
// If the target can create a predicated operator for the reduction at no
// extra cost in the loop (for example a predicated vadd), it can be
@@ -4246,140 +4246,140 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
VecRdxPhi->setIncomingValueForBlock(
LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
}
- }
- }
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
+ }
+ }
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
assert(!VF.isScalable() && "scalable vectors not yet supported.");
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(
- LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
- VectorParts RdxParts(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
- Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[Part]->user_begin();
- UI != RdxParts[Part]->user_end();)
- if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
- RdxParts[Part] = Extnd;
- } else {
- ++UI;
- }
- }
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
- }
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
+ Builder.SetInsertPoint(
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[Part]->user_begin();
+ UI != RdxParts[Part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ RdxParts[Part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
+ }
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-
- // The middle block terminator has already been assigned a DebugLoc here (the
- // OrigLoop's single latch terminator). We want the whole middle block to
- // appear to execute on this line because: (a) it is all compiler generated,
- // (b) these instructions are always executed after evaluating the latch
- // conditional branch, and (c) other passes may add new predecessors which
- // terminate on this line. This is the easiest way to ensure we don't
- // accidentally cause an extra step back into the loop while debugging.
- setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
- for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- ReducedPartRdx = addFastMathFlag(
- Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
- ReducedPartRdx, "bin.rdx"),
- RdxDesc.getFastMathFlags());
- else
+
+ // The middle block terminator has already been assigned a DebugLoc here (the
+ // OrigLoop's single latch terminator). We want the whole middle block to
+ // appear to execute on this line because: (a) it is all compiler generated,
+ // (b) these instructions are always executed after evaluating the latch
+ // conditional branch, and (c) other passes may add new predecessors which
+ // terminate on this line. This is the easiest way to ensure we don't
+ // accidentally cause an extra step back into the loop while debugging.
+ setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
+ for (unsigned Part = 1; Part < UF; ++Part) {
+ Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
+ ReducedPartRdx, "bin.rdx"),
+ RdxDesc.getFastMathFlags());
+ else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- }
-
+ }
+
// Create the reduction after the loop. Note that inloop reductions create the
// target reduction in the loop using a Reduction recipe.
if (VF.isVector() && !IsInLoopReductionPhi) {
- ReducedPartRdx =
+ ReducedPartRdx =
createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (Phi->getType() != RdxDesc.getRecurrenceType())
- ReducedPartRdx =
- RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
- : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
- }
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
-
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ }
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+
// We know that the loop is in LCSSA form. We need to update the PHI nodes
// in the exit blocks. See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (any_of(LCSSAPhi.incoming_values(),
[LoopExitInst](Value *V) { return V == LoopExitInst; }))
- LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
+ LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-}
-
-void InnerLoopVectorizer::clearReductionWrapFlags(
- RecurrenceDescriptor &RdxDesc) {
+ int IncomingEdgeBlockIdx =
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
+
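The per-part combining followed by a horizontal reduction can be pictured with ordinary arrays. A sketch under the assumptions UF = 2, VF = 4, an integer add reduction, and made-up partial sums:

    #include <array>
    #include <cstdio>
    #include <numeric>

    int main() {
      constexpr int UF = 2, VF = 4;
      // One partial accumulator vector per unroll part.
      std::array<std::array<int, VF>, UF> parts = {{{1, 2, 3, 4}, {5, 6, 7, 8}}};

      // Step 1: fold the UF parts into a single vector (the "bin.rdx" chain).
      std::array<int, VF> rdx = parts[0];
      for (int part = 1; part < UF; ++part)
        for (int lane = 0; lane < VF; ++lane)
          rdx[lane] += parts[part][lane];

      // Step 2: horizontally reduce the surviving vector, which is what the
      // target reduction performs after the loop.
      int result = std::accumulate(rdx.begin(), rdx.end(), 0);
      std::printf("%d\n", result); // 36
      return 0;
    }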
+void InnerLoopVectorizer::clearReductionWrapFlags(
+ RecurrenceDescriptor &RdxDesc) {
RecurKind RK = RdxDesc.getRecurrenceKind();
if (RK != RecurKind::Add && RK != RecurKind::Mul)
- return;
-
- Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
- assert(LoopExitInstr && "null loop exit instruction");
- SmallVector<Instruction *, 8> Worklist;
- SmallPtrSet<Instruction *, 8> Visited;
- Worklist.push_back(LoopExitInstr);
- Visited.insert(LoopExitInstr);
-
- while (!Worklist.empty()) {
- Instruction *Cur = Worklist.pop_back_val();
- if (isa<OverflowingBinaryOperator>(Cur))
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = getOrCreateVectorValue(Cur, Part);
- cast<Instruction>(V)->dropPoisonGeneratingFlags();
- }
-
- for (User *U : Cur->users()) {
- Instruction *UI = cast<Instruction>(U);
- if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
- Visited.insert(UI).second)
- Worklist.push_back(UI);
- }
- }
-}
-
-void InnerLoopVectorizer::fixLCSSAPHIs() {
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ return;
+
+ Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
+ assert(LoopExitInstr && "null loop exit instruction");
+ SmallVector<Instruction *, 8> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(LoopExitInstr);
+ Visited.insert(LoopExitInstr);
+
+ while (!Worklist.empty()) {
+ Instruction *Cur = Worklist.pop_back_val();
+ if (isa<OverflowingBinaryOperator>(Cur))
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = getOrCreateVectorValue(Cur, Part);
+ cast<Instruction>(V)->dropPoisonGeneratingFlags();
+ }
+
+ for (User *U : Cur->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
+ Visited.insert(UI).second)
+ Worklist.push_back(UI);
+ }
+ }
+}
+
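The traversal above is a standard worklist walk over the def-use graph. A simplified sketch with a toy node type; unlike the real code it skips the opcode and loop-membership checks and clears the flag on every reachable node:

    #include <cstdio>
    #include <set>
    #include <vector>

    struct Node {
      int id;
      bool hasWrapFlags = true; // stands in for nsw/nuw
      std::vector<Node *> users;
    };

    void clearWrapFlags(Node *loopExit) {
      std::vector<Node *> worklist{loopExit};
      std::set<Node *> visited{loopExit};
      while (!worklist.empty()) {
        Node *cur = worklist.back();
        worklist.pop_back();
        cur->hasWrapFlags = false; // analogous to dropPoisonGeneratingFlags()
        for (Node *user : cur->users)
          if (visited.insert(user).second)
            worklist.push_back(user);
      }
    }

    int main() {
      Node a{0}, b{1}, c{2};
      a.users = {&b};
      b.users = {&c};
      clearWrapFlags(&a);
      std::printf("%d %d %d\n", a.hasWrapFlags, b.hasWrapFlags, c.hasWrapFlags); // 0 0 0
      return 0;
    }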
+void InnerLoopVectorizer::fixLCSSAPHIs() {
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
// Some phis were already hand updated by the reduction and recurrence
// code above, leave them alone.
@@ -4401,206 +4401,206 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
Value *lastIncomingValue =
getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
- }
-}
-
-void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
- // The basic block and loop containing the predicated instruction.
- auto *PredBB = PredInst->getParent();
- auto *VectorLoop = LI->getLoopFor(PredBB);
-
- // Initialize a worklist with the operands of the predicated instruction.
- SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
-
- // Holds instructions that we need to analyze again. An instruction may be
- // reanalyzed if we don't yet know if we can sink it or not.
- SmallVector<Instruction *, 8> InstsToReanalyze;
-
- // Returns true if a given use occurs in the predicated block. Phi nodes use
- // their operands in their corresponding predecessor blocks.
- auto isBlockOfUsePredicated = [&](Use &U) -> bool {
- auto *I = cast<Instruction>(U.getUser());
- BasicBlock *BB = I->getParent();
- if (auto *Phi = dyn_cast<PHINode>(I))
- BB = Phi->getIncomingBlock(
- PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
- return BB == PredBB;
- };
-
- // Iteratively sink the scalarized operands of the predicated instruction
-  // into the block we created for it. When an instruction is sunk, its
- // operands are then added to the worklist. The algorithm ends after one pass
- // through the worklist doesn't sink a single instruction.
- bool Changed;
- do {
- // Add the instructions that need to be reanalyzed to the worklist, and
- // reset the changed indicator.
- Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
- InstsToReanalyze.clear();
- Changed = false;
-
- while (!Worklist.empty()) {
- auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
-
- // We can't sink an instruction if it is a phi node, is already in the
- // predicated block, is not in the loop, or may have side effects.
- if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
- !VectorLoop->contains(I) || I->mayHaveSideEffects())
- continue;
-
- // It's legal to sink the instruction if all its uses occur in the
- // predicated block. Otherwise, there's nothing to do yet, and we may
- // need to reanalyze the instruction.
- if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
- InstsToReanalyze.push_back(I);
- continue;
- }
-
- // Move the instruction to the beginning of the predicated block, and add
-    // its operands to the worklist.
- I->moveBefore(&*PredBB->getFirstInsertionPt());
- Worklist.insert(I->op_begin(), I->op_end());
-
- // The sinking may have enabled other instructions to be sunk, so we will
- // need to iterate.
- Changed = true;
- }
- } while (Changed);
-}
-
-void InnerLoopVectorizer::fixNonInductionPHIs() {
- for (PHINode *OrigPhi : OrigPHIsToFix) {
- PHINode *NewPhi =
- cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
- unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
-
- SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
- predecessors(OrigPhi->getParent()));
- SmallVector<BasicBlock *, 2> VectorBBPredecessors(
- predecessors(NewPhi->getParent()));
- assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
- "Scalar and Vector BB should have the same number of predecessors");
-
- // The insertion point in Builder may be invalidated by the time we get
- // here. Force the Builder insertion point to something valid so that we do
- // not run into issues during insertion point restore in
- // getOrCreateVectorValue calls below.
- Builder.SetInsertPoint(NewPhi);
-
- // The predecessor order is preserved and we can rely on mapping between
- // scalar and vector block predecessors.
- for (unsigned i = 0; i < NumIncomingValues; ++i) {
- BasicBlock *NewPredBB = VectorBBPredecessors[i];
-
- // When looking up the new scalar/vector values to fix up, use incoming
- // values from original phi.
- Value *ScIncV =
- OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
-
- // Scalar incoming value may need a broadcast
- Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
- NewPhi->addIncoming(NewIncV, NewPredBB);
- }
- }
-}
-
+ }
+}
+
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
+ // The basic block and loop containing the predicated instruction.
+ auto *PredBB = PredInst->getParent();
+ auto *VectorLoop = LI->getLoopFor(PredBB);
+
+ // Initialize a worklist with the operands of the predicated instruction.
+ SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+
+ // Holds instructions that we need to analyze again. An instruction may be
+ // reanalyzed if we don't yet know if we can sink it or not.
+ SmallVector<Instruction *, 8> InstsToReanalyze;
+
+ // Returns true if a given use occurs in the predicated block. Phi nodes use
+ // their operands in their corresponding predecessor blocks.
+ auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+ auto *I = cast<Instruction>(U.getUser());
+ BasicBlock *BB = I->getParent();
+ if (auto *Phi = dyn_cast<PHINode>(I))
+ BB = Phi->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ return BB == PredBB;
+ };
+
+ // Iteratively sink the scalarized operands of the predicated instruction
+  // into the block we created for it. When an instruction is sunk, its
+ // operands are then added to the worklist. The algorithm ends after one pass
+ // through the worklist doesn't sink a single instruction.
+ bool Changed;
+ do {
+ // Add the instructions that need to be reanalyzed to the worklist, and
+ // reset the changed indicator.
+ Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
+ InstsToReanalyze.clear();
+ Changed = false;
+
+ while (!Worklist.empty()) {
+ auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
+
+ // We can't sink an instruction if it is a phi node, is already in the
+ // predicated block, is not in the loop, or may have side effects.
+ if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
+ !VectorLoop->contains(I) || I->mayHaveSideEffects())
+ continue;
+
+ // It's legal to sink the instruction if all its uses occur in the
+ // predicated block. Otherwise, there's nothing to do yet, and we may
+ // need to reanalyze the instruction.
+ if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
+ InstsToReanalyze.push_back(I);
+ continue;
+ }
+
+ // Move the instruction to the beginning of the predicated block, and add
+    // its operands to the worklist.
+ I->moveBefore(&*PredBB->getFirstInsertionPt());
+ Worklist.insert(I->op_begin(), I->op_end());
+
+ // The sinking may have enabled other instructions to be sunk, so we will
+ // need to iterate.
+ Changed = true;
+ }
+ } while (Changed);
+}
+
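The do/while structure above is a fixed-point iteration: anything that cannot be sunk yet is retried after other items move. A toy version with integers standing in for instructions and an invented "may sink once its predecessor has sunk" rule in place of the real "all uses are in the predicated block" test:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> pending = {3, 1, 2}; // hypothetical instruction ids
      std::vector<int> sunk;
      auto canSinkNow = [&](int v) {
        return v == 1 || std::count(sunk.begin(), sunk.end(), v - 1) > 0;
      };
      bool changed;
      do {
        changed = false;
        for (auto it = pending.begin(); it != pending.end();) {
          if (canSinkNow(*it)) {
            sunk.push_back(*it);
            it = pending.erase(it);
            changed = true; // sinking one item may enable others
          } else {
            ++it;
          }
        }
      } while (changed);
      for (int v : sunk)
        std::printf("%d ", v); // 1 2 3
      return 0;
    }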
+void InnerLoopVectorizer::fixNonInductionPHIs() {
+ for (PHINode *OrigPhi : OrigPHIsToFix) {
+ PHINode *NewPhi =
+ cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
+ unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
+
+ SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
+ predecessors(OrigPhi->getParent()));
+ SmallVector<BasicBlock *, 2> VectorBBPredecessors(
+ predecessors(NewPhi->getParent()));
+ assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
+ "Scalar and Vector BB should have the same number of predecessors");
+
+ // The insertion point in Builder may be invalidated by the time we get
+ // here. Force the Builder insertion point to something valid so that we do
+ // not run into issues during insertion point restore in
+ // getOrCreateVectorValue calls below.
+ Builder.SetInsertPoint(NewPhi);
+
+ // The predecessor order is preserved and we can rely on mapping between
+ // scalar and vector block predecessors.
+ for (unsigned i = 0; i < NumIncomingValues; ++i) {
+ BasicBlock *NewPredBB = VectorBBPredecessors[i];
+
+ // When looking up the new scalar/vector values to fix up, use incoming
+ // values from original phi.
+ Value *ScIncV =
+ OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
+
+ // Scalar incoming value may need a broadcast
+ Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+ NewPhi->addIncoming(NewIncV, NewPredBB);
+ }
+ }
+}
+
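The fix-up relies on one property: the scalar and vector blocks enumerate their predecessors in the same order, so incoming value i of the original phi can be paired with vector predecessor i. A small illustration with invented block names and values (the real code works with BasicBlock and Value pointers):

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int N = 2;
      std::array<const char *, N> scalarPreds = {"entry", "latch"};
      std::array<const char *, N> vectorPreds = {"vec.entry", "vec.latch"};
      std::array<int, N> incoming = {0, 7}; // incoming values of the original phi
      for (int i = 0; i < N; ++i)
        std::printf("new phi gets %d from %s (scalar pred %s)\n", incoming[i],
                    vectorPreds[i], scalarPreds[i]);
      return 0;
    }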
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
VPUser &Operands, unsigned UF,
ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant,
- VPTransformState &State) {
- // Construct a vector GEP by widening the operands of the scalar GEP as
- // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
- // results in a vector of pointers when at least one operand of the GEP
- // is vector-typed. Thus, to keep the representation compact, we only use
- // vector-typed operands for loop-varying values.
-
+ SmallBitVector &IsIndexLoopInvariant,
+ VPTransformState &State) {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+
if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
- // If we are vectorizing, but the GEP has only loop-invariant operands,
- // the GEP we build (by only using vector-typed operands for
- // loop-varying values) would be a scalar pointer. Thus, to ensure we
- // produce a vector of pointers, we need to either arbitrarily pick an
- // operand to broadcast, or broadcast a clone of the original GEP.
- // Here, we broadcast a clone of the original.
- //
- // TODO: If at some point we decide to scalarize instructions having
- // loop-invariant operands, this special case will no longer be
- // required. We would add the scalarization decision to
- // collectLoopScalars() and teach getVectorValue() to broadcast
- // the lane-zero scalar value.
- auto *Clone = Builder.Insert(GEP->clone());
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
State.set(VPDef, GEP, EntryPart, Part);
- addMetadata(EntryPart, GEP);
- }
- } else {
- // If the GEP has at least one loop-varying operand, we are sure to
- // produce a vector of pointers. But if we are only unrolling, we want
- // to produce a scalar GEP for each unroll part. Thus, the GEP we
- // produce with the code below will be scalar (if VF == 1) or vector
- // (otherwise). Note that for the unroll-only case, we still maintain
- // values in the vector mapping with initVector, as we do for other
- // instructions.
- for (unsigned Part = 0; Part < UF; ++Part) {
- // The pointer operand of the new GEP. If it's loop-invariant, we
- // won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
- : State.get(Operands.getOperand(0), Part);
-
- // Collect all the indices for the new GEP. If any index is
- // loop-invariant, we won't broadcast it.
- SmallVector<Value *, 4> Indices;
- for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
- VPValue *Operand = Operands.getOperand(I);
- if (IsIndexLoopInvariant[I - 1])
- Indices.push_back(State.get(Operand, {0, 0}));
- else
- Indices.push_back(State.get(Operand, Part));
- }
-
- // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
- // but it should be a vector, otherwise.
- auto *NewGEP =
- GEP->isInBounds()
- ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
- Indices)
- : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+ addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
+ : State.get(Operands.getOperand(0), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
+ VPValue *Operand = Operands.getOperand(I);
+ if (IsIndexLoopInvariant[I - 1])
+ Indices.push_back(State.get(Operand, {0, 0}));
+ else
+ Indices.push_back(State.get(Operand, Part));
+ }
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP =
+ GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+ Indices)
+ : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
- "NewGEP is not a pointer vector");
+ "NewGEP is not a pointer vector");
State.set(VPDef, GEP, NewGEP, Part);
- addMetadata(NewGEP, GEP);
- }
- }
-}
-
+ addMetadata(NewGEP, GEP);
+ }
+ }
+}
+
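The two branches above boil down to "splat one scalar address" versus "compute one address per lane". A sketch with plain pointers, assuming VF = 4 and made-up indices:

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int VF = 4;
      int data[16] = {};
      int *base = data;

      // All operands loop-invariant: compute one scalar address and splat it
      // across the lanes (the "broadcast a clone of the original GEP" branch).
      std::array<int *, VF> splat;
      splat.fill(base + 5);

      // A loop-varying index: one address per lane, i.e. a vector of pointers
      // (the general branch, which widens only the varying operands).
      std::array<int, VF> idx = {0, 1, 2, 3};
      std::array<int *, VF> lanes;
      for (int l = 0; l < VF; ++l)
        lanes[l] = base + idx[l];

      std::printf("%td %td\n", splat[0] - data, lanes[3] - data); // 5 3
      return 0;
    }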
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
RecurrenceDescriptor *RdxDesc,
Value *StartV, unsigned UF,
ElementCount VF) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- PHINode *P = cast<PHINode>(PN);
- if (EnableVPlanNativePath) {
- // Currently we enter here in the VPlan-native path for non-induction
- // PHIs where all control flow is uniform. We simply widen these PHIs.
- // Create a vector phi with no operands - the vector phi operands will be
- // set at the end of vector code generation.
- Type *VecTy =
+ PHINode *P = cast<PHINode>(PN);
+ if (EnableVPlanNativePath) {
+ // Currently we enter here in the VPlan-native path for non-induction
+ // PHIs where all control flow is uniform. We simply widen these PHIs.
+ // Create a vector phi with no operands - the vector phi operands will be
+ // set at the end of vector code generation.
+ Type *VecTy =
(VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
- Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
- VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
- OrigPHIsToFix.push_back(P);
-
- return;
- }
-
- assert(PN->getParent() == OrigLoop->getHeader() &&
- "Non-header phis should have been handled elsewhere");
-
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #1: We create a new vector PHI node with no incoming edges. We'll use
- // this value when we vectorize all of the instructions that use the PHI.
+ Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
+ VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
+ OrigPHIsToFix.push_back(P);
+
+ return;
+ }
+
+ assert(PN->getParent() == OrigLoop->getHeader() &&
+ "Non-header phis should have been handled elsewhere");
+
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
Value *Iden = nullptr;
bool ScalarPHI =
@@ -4637,44 +4637,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
}
}
- for (unsigned Part = 0; Part < UF; ++Part) {
- // This is phase one of vectorizing PHIs.
- Value *EntryPart = PHINode::Create(
- VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
- VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // This is phase one of vectorizing PHIs.
+ Value *EntryPart = PHINode::Create(
+ VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
+ VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
if (StartV) {
// Make sure to add the reduction start value only to the
// first unroll part.
Value *StartVal = (Part == 0) ? StartV : Iden;
cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
}
- }
- return;
- }
-
+ }
+ return;
+ }
+
assert(!Legal->isReductionVariable(P) &&
"reductions should be handled above");
- setDebugLocFromInst(Builder, P);
-
- // This PHINode must be an induction variable.
- // Make sure that we know about it.
- assert(Legal->getInductionVars().count(P) && "Not an induction variable");
-
- InductionDescriptor II = Legal->getInductionVars().lookup(P);
- const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- switch (II.getKind()) {
- case InductionDescriptor::IK_NoInduction:
- llvm_unreachable("Unknown induction");
- case InductionDescriptor::IK_IntInduction:
- case InductionDescriptor::IK_FpInduction:
- llvm_unreachable("Integer/fp induction is handled elsewhere.");
- case InductionDescriptor::IK_PtrInduction: {
- // Handle the pointer induction variable case.
- assert(P->getType()->isPointerTy() && "Unexpected type.");
+ setDebugLocFromInst(Builder, P);
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars().count(P) && "Not an induction variable");
+
+ InductionDescriptor II = Legal->getInductionVars().lookup(P);
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ switch (II.getKind()) {
+ case InductionDescriptor::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case InductionDescriptor::IK_IntInduction:
+ case InductionDescriptor::IK_FpInduction:
+ llvm_unreachable("Integer/fp induction is handled elsewhere.");
+ case InductionDescriptor::IK_PtrInduction: {
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
if (Cost->isScalarAfterVectorization(P, VF)) {
// This is the normalized GEP that starts counting at zero.
@@ -4695,9 +4695,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
SclrGep->setName("next.gep");
VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
}
- }
+ }
return;
- }
+ }
assert(isa<SCEVConstant>(II.getStep()) &&
"Induction step not a SCEV constant!");
Type *PhiType = II.getStep()->getType();
@@ -4743,271 +4743,271 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
"vector.gep"));
VectorLoopValueMap.setVectorValue(P, Part, GEP);
}
- }
- }
-}
-
-/// A helper function for checking whether an integer division-related
-/// instruction may divide by zero (in which case it must be predicated if
-/// executed conditionally in the scalar code).
-/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
-/// Non-zero divisors that are non compile-time constants will not be
-/// converted into multiplication, so we will still end up scalarizing
-/// the division, but can do so w/o predication.
-static bool mayDivideByZero(Instruction &I) {
- assert((I.getOpcode() == Instruction::UDiv ||
- I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::URem ||
- I.getOpcode() == Instruction::SRem) &&
- "Unexpected instruction");
- Value *Divisor = I.getOperand(1);
- auto *CInt = dyn_cast<ConstantInt>(Divisor);
- return !CInt || CInt->isZero();
-}
-
+ }
+ }
+}
+
+/// A helper function for checking whether an integer division-related
+/// instruction may divide by zero (in which case it must be predicated if
+/// executed conditionally in the scalar code).
+/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
+/// Non-zero divisors that are non compile-time constants will not be
+/// converted into multiplication, so we will still end up scalarizing
+/// the division, but can do so w/o predication.
+static bool mayDivideByZero(Instruction &I) {
+ assert((I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::URem ||
+ I.getOpcode() == Instruction::SRem) &&
+ "Unexpected instruction");
+ Value *Divisor = I.getOperand(1);
+ auto *CInt = dyn_cast<ConstantInt>(Divisor);
+ return !CInt || CInt->isZero();
+}
+
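The same decision in miniature: a divisor is safe only when it is a compile-time constant known to be non-zero; anything else (an unknown value, or the constant 0 itself) must be treated as possibly dividing by zero. A sketch using std::optional in place of a ConstantInt lookup:

    #include <cstdio>
    #include <optional>

    bool mayDivideByZero(std::optional<int> constantDivisor) {
      return !constantDivisor || *constantDivisor == 0;
    }

    int main() {
      std::printf("%d %d %d\n",
                  mayDivideByZero(std::nullopt), // unknown divisor -> 1
                  mayDivideByZero(0),            // constant zero   -> 1
                  mayDivideByZero(7));           // constant non-0  -> 0
      return 0;
    }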
void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
VPUser &User,
- VPTransformState &State) {
- switch (I.getOpcode()) {
- case Instruction::Call:
- case Instruction::Br:
- case Instruction::PHI:
- case Instruction::GetElementPtr:
- case Instruction::Select:
- llvm_unreachable("This instruction is handled by a different recipe.");
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::FNeg:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen unops and binops.
- setDebugLocFromInst(Builder, &I);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 2> Ops;
- for (VPValue *VPOp : User.operands())
- Ops.push_back(State.get(VPOp, Part));
-
- Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
- if (auto *VecOp = dyn_cast<Instruction>(V))
- VecOp->copyIRFlags(&I);
-
- // Use this vector value for all users of the original instruction.
+ VPTransformState &State) {
+ switch (I.getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Br:
+ case Instruction::PHI:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ llvm_unreachable("This instruction is handled by a different recipe.");
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::FNeg:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen unops and binops.
+ setDebugLocFromInst(Builder, &I);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 2> Ops;
+ for (VPValue *VPOp : User.operands())
+ Ops.push_back(State.get(VPOp, Part));
+
+ Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+ if (auto *VecOp = dyn_cast<Instruction>(V))
+ VecOp->copyIRFlags(&I);
+
+ // Use this vector value for all users of the original instruction.
State.set(Def, &I, V, Part);
- addMetadata(V, &I);
- }
-
- break;
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = cast<CmpInst>(&I);
- setDebugLocFromInst(Builder, Cmp);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *B = State.get(User.getOperand(1), Part);
- Value *C = nullptr;
- if (FCmp) {
- // Propagate fast math flags.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(Cmp->getFastMathFlags());
- C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
- }
+ addMetadata(V, &I);
+ }
+
+ break;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = State.get(User.getOperand(0), Part);
+ Value *B = State.get(User.getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ }
State.set(Def, &I, C, Part);
- addMetadata(C, &I);
- }
-
- break;
- }
-
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = cast<CastInst>(&I);
- setDebugLocFromInst(Builder, CI);
-
- /// Vectorize casts.
- Type *DestTy =
+ addMetadata(C, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ /// Vectorize casts.
+ Type *DestTy =
(VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = State.get(User.getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
State.set(Def, &I, Cast, Part);
- addMetadata(Cast, &I);
- }
- break;
- }
- default:
- // This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
- llvm_unreachable("Unhandled instruction!");
- } // end of switch.
-}
-
+ addMetadata(Cast, &I);
+ }
+ break;
+ }
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
+}
+
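"Just widen unops and binops" means the scalar opcode is re-emitted once per unroll part on whole vectors instead of once per scalar iteration. A sketch with UF = 2, VF = 4, and an integer add on invented operand values:

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int UF = 2, VF = 4;
      std::array<std::array<int, VF>, UF> a = {{{1, 2, 3, 4}, {5, 6, 7, 8}}};
      std::array<std::array<int, VF>, UF> b = {{{10, 10, 10, 10}, {20, 20, 20, 20}}};
      std::array<std::array<int, VF>, UF> sum;

      for (int part = 0; part < UF; ++part)
        for (int lane = 0; lane < VF; ++lane)
          sum[part][lane] = a[part][lane] + b[part][lane];

      std::printf("%d %d\n", sum[0][0], sum[1][3]); // 11 28
      return 0;
    }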
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
VPUser &ArgOperands,
- VPTransformState &State) {
- assert(!isa<DbgInfoIntrinsic>(I) &&
- "DbgInfoIntrinsic should have been dropped during VPlan construction");
- setDebugLocFromInst(Builder, &I);
-
- Module *M = I.getParent()->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
-
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
+ VPTransformState &State) {
+ assert(!isa<DbgInfoIntrinsic>(I) &&
+ "DbgInfoIntrinsic should have been dropped during VPlan construction");
+ setDebugLocFromInst(Builder, &I);
+
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
+
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // The flag shows whether we use Intrinsic or a usual Call for vectorized
+ // version of the instruction.
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool NeedToScalarize = false;
InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- assert((UseVectorIntrinsic || !NeedToScalarize) &&
- "Instruction should be scalarized elsewhere.");
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
assert(IntrinsicCost.isValid() && CallCost.isValid() &&
"Cannot have invalid costs while widening");
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (auto &I : enumerate(ArgOperands.operands())) {
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- Value *Arg;
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
- Arg = State.get(I.value(), Part);
- else
- Arg = State.get(I.value(), {0, 0});
- Args.push_back(Arg);
- }
-
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (auto &I : enumerate(ArgOperands.operands())) {
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ Value *Arg;
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
+ Arg = State.get(I.value(), Part);
+ else
+ Arg = State.get(I.value(), {0, 0});
+ Args.push_back(Arg);
+ }
+
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
if (VF.isVector()) {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
}
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- assert(VectorF && "Can't retrieve vector intrinsic.");
- } else {
- // Use vector version of the function call.
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ assert(VectorF && "Can't retrieve vector intrinsic.");
+ } else {
+ // Use vector version of the function call.
const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
-#ifndef NDEBUG
- assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
- "Can't create vector function.");
-#endif
- VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
- }
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
-
- if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(CI);
-
+#ifndef NDEBUG
+ assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
+ "Can't create vector function.");
+#endif
+ VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ }
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
+
State.set(Def, &I, V, Part);
- addMetadata(V, &I);
- }
-}
-
+ addMetadata(V, &I);
+ }
+}
+
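The intrinsic-versus-library-call decision is, at its core, a cost comparison. In miniature, with invented cost numbers and a hypothetical intrinsic id:

    #include <cstdio>

    int main() {
      unsigned id = 42;                     // hypothetical intrinsic id; 0 would mean "none"
      int callCost = 12, intrinsicCost = 8; // invented costs
      bool useVectorIntrinsic = id != 0 && intrinsicCost <= callCost;
      std::printf("use intrinsic: %d\n", useVectorIntrinsic); // 1
      return 0;
    }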
void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
- VPUser &Operands,
- bool InvariantCond,
- VPTransformState &State) {
- setDebugLocFromInst(Builder, &I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- auto *InvarCond =
- InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *Cond =
- InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
- Value *Op0 = State.get(Operands.getOperand(1), Part);
- Value *Op1 = State.get(Operands.getOperand(2), Part);
- Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
+ VPUser &Operands,
+ bool InvariantCond,
+ VPTransformState &State) {
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ auto *InvarCond =
+ InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *Cond =
+ InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
+ Value *Op0 = State.get(Operands.getOperand(1), Part);
+ Value *Op1 = State.get(Operands.getOperand(2), Part);
+ Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
State.set(VPDef, &I, Sel, Part);
- addMetadata(Sel, &I);
- }
-}
-
+ addMetadata(Sel, &I);
+ }
+}
+
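The invariant-condition special case above can be pictured as "one scalar bool picks an entire vector operand", versus the general case where a per-lane mask selects element-wise. A sketch assuming VF = 4 and toy operand values:

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int VF = 4;
      std::array<int, VF> x = {1, 2, 3, 4}, y = {9, 8, 7, 6}, out;

      // Invariant condition: lane 0 of the widened condition decides for all lanes.
      bool invarCond = true;
      out = invarCond ? x : y;

      // Varying condition: a per-lane mask selects element-wise.
      std::array<bool, VF> mask = {true, false, true, false};
      for (int l = 0; l < VF; ++l)
        out[l] = mask[l] ? x[l] : y[l];

      std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 1 8 3 6
      return 0;
    }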
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
- // We should not collect Scalars more than once per VF. Right now, this
- // function is called from collectUniformsAndScalars(), which already does
- // this check. Collecting Scalars for VF=1 does not make any sense.
+ // We should not collect Scalars more than once per VF. Right now, this
+ // function is called from collectUniformsAndScalars(), which already does
+ // this check. Collecting Scalars for VF=1 does not make any sense.
assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
- "This function should not be visited twice for the same VF");
-
- SmallSetVector<Instruction *, 8> Worklist;
-
- // These sets are used to seed the analysis with pointers used by memory
- // accesses that will remain scalar.
- SmallSetVector<Instruction *, 8> ScalarPtrs;
- SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+ "This function should not be visited twice for the same VF");
+
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // These sets are used to seed the analysis with pointers used by memory
+ // accesses that will remain scalar.
+ SmallSetVector<Instruction *, 8> ScalarPtrs;
+ SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
auto *Latch = TheLoop->getLoopLatch();
-
- // A helper that returns true if the use of Ptr by MemAccess will be scalar.
- // The pointer operands of loads and stores will be scalar as long as the
- // memory access is not a gather or scatter operation. The value operand of a
- // store will remain scalar if the store is scalarized.
- auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
- InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
- assert(WideningDecision != CM_Unknown &&
- "Widening decision should be ready at this moment");
- if (auto *Store = dyn_cast<StoreInst>(MemAccess))
- if (Ptr == Store->getValueOperand())
- return WideningDecision == CM_Scalarize;
- assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
- "Ptr is neither a value or pointer operand");
- return WideningDecision != CM_GatherScatter;
- };
-
- // A helper that returns true if the given value is a bitcast or
- // getelementptr instruction contained in the loop.
- auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
- return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
- isa<GetElementPtrInst>(V)) &&
- !TheLoop->isLoopInvariant(V);
- };
-
+
+ // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+ // The pointer operands of loads and stores will be scalar as long as the
+ // memory access is not a gather or scatter operation. The value operand of a
+ // store will remain scalar if the store is scalarized.
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+ InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+ if (Ptr == Store->getValueOperand())
+ return WideningDecision == CM_Scalarize;
+ assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
+ "Ptr is neither a value or pointer operand");
+ return WideningDecision != CM_GatherScatter;
+ };
+
+ // A helper that returns true if the given value is a bitcast or
+ // getelementptr instruction contained in the loop.
+ auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+ return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+ isa<GetElementPtrInst>(V)) &&
+ !TheLoop->isLoopInvariant(V);
+ };
+
auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
if (!isa<PHINode>(Ptr) ||
!Legal->getInductionVars().count(cast<PHINode>(Ptr)))
@@ -5023,7 +5023,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// inserted into Worklist. If the use will be a scalar use, and the
// pointer is only used by memory accesses, we place the pointer in
// ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
- auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+ auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
if (isScalarPtrInduction(MemAccess, Ptr)) {
Worklist.insert(cast<Instruction>(Ptr));
Instruction *Update = cast<Instruction>(
@@ -5035,286 +5035,286 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
<< "\n");
return;
}
- // We only care about bitcast and getelementptr instructions contained in
- // the loop.
- if (!isLoopVaryingBitCastOrGEP(Ptr))
- return;
-
- // If the pointer has already been identified as scalar (e.g., if it was
- // also identified as uniform), there's nothing to do.
- auto *I = cast<Instruction>(Ptr);
- if (Worklist.count(I))
- return;
-
- // If the use of the pointer will be a scalar use, and all users of the
- // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
- // place the pointer in PossibleNonScalarPtrs.
- if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
- return isa<LoadInst>(U) || isa<StoreInst>(U);
- }))
- ScalarPtrs.insert(I);
- else
- PossibleNonScalarPtrs.insert(I);
- };
-
- // We seed the scalars analysis with three classes of instructions: (1)
+ // We only care about bitcast and getelementptr instructions contained in
+ // the loop.
+ if (!isLoopVaryingBitCastOrGEP(Ptr))
+ return;
+
+ // If the pointer has already been identified as scalar (e.g., if it was
+ // also identified as uniform), there's nothing to do.
+ auto *I = cast<Instruction>(Ptr);
+ if (Worklist.count(I))
+ return;
+
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
+ }))
+ ScalarPtrs.insert(I);
+ else
+ PossibleNonScalarPtrs.insert(I);
+ };
+
+ // We seed the scalars analysis with three classes of instructions: (1)
// instructions marked uniform-after-vectorization and (2) bitcast,
// getelementptr and (pointer) phi instructions used by memory accesses
// requiring a scalar use.
- //
- // (1) Add to the worklist all instructions that have been identified as
- // uniform-after-vectorization.
- Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
-
- // (2) Add to the worklist all bitcast and getelementptr instructions used by
- // memory accesses requiring a scalar use. The pointer operands of loads and
-  // stores will be scalar as long as the memory access is not a gather or
- // scatter operation. The value operand of a store will remain scalar if the
- // store is scalarized.
- for (auto *BB : TheLoop->blocks())
- for (auto &I : *BB) {
- if (auto *Load = dyn_cast<LoadInst>(&I)) {
- evaluatePtrUse(Load, Load->getPointerOperand());
- } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
- evaluatePtrUse(Store, Store->getPointerOperand());
- evaluatePtrUse(Store, Store->getValueOperand());
- }
- }
- for (auto *I : ScalarPtrs)
- if (!PossibleNonScalarPtrs.count(I)) {
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
- Worklist.insert(I);
- }
-
- // Insert the forced scalars.
- // FIXME: Currently widenPHIInstruction() often creates a dead vector
- // induction variable when the PHI user is scalarized.
- auto ForcedScalar = ForcedScalars.find(VF);
- if (ForcedScalar != ForcedScalars.end())
- for (auto *I : ForcedScalar->second)
- Worklist.insert(I);
-
- // Expand the worklist by looking through any bitcasts and getelementptr
- // instructions we've already identified as scalar. This is similar to the
- // expansion step in collectLoopUniforms(); however, here we're only
- // expanding to include additional bitcasts and getelementptr instructions.
- unsigned Idx = 0;
- while (Idx != Worklist.size()) {
- Instruction *Dst = Worklist[Idx++];
- if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
- continue;
- auto *Src = cast<Instruction>(Dst->getOperand(0));
- if (llvm::all_of(Src->users(), [&](User *U) -> bool {
- auto *J = cast<Instruction>(U);
- return !TheLoop->contains(J) || Worklist.count(J) ||
- ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
- isScalarUse(J, Src));
- })) {
- Worklist.insert(Src);
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
- }
- }
-
- // An induction variable will remain scalar if all users of the induction
- // variable and induction variable update remain scalar.
- for (auto &Induction : Legal->getInductionVars()) {
- auto *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
- // If tail-folding is applied, the primary induction variable will be used
- // to feed a vector compare.
- if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
- continue;
-
- // Determine if all users of the induction variable are scalar after
- // vectorization.
- auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
- });
- if (!ScalarInd)
- continue;
-
- // Determine if all users of the induction variable update instruction are
- // scalar after vectorization.
- auto ScalarIndUpdate =
- llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
- });
- if (!ScalarIndUpdate)
- continue;
-
- // The induction variable and its update instruction will remain scalar.
- Worklist.insert(Ind);
- Worklist.insert(IndUpdate);
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
- << "\n");
- }
-
- Scalars[VF].insert(Worklist.begin(), Worklist.end());
-}
-
+ //
+ // (1) Add to the worklist all instructions that have been identified as
+ // uniform-after-vectorization.
+ Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+ // (2) Add to the worklist all bitcast and getelementptr instructions used by
+ // memory accesses requiring a scalar use. The pointer operands of loads and
+ // stores will be scalar as long as the memory access is not a gather or
+ // scatter operation. The value operand of a store will remain scalar if the
+ // store is scalarized.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ evaluatePtrUse(Load, Load->getPointerOperand());
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ evaluatePtrUse(Store, Store->getPointerOperand());
+ evaluatePtrUse(Store, Store->getValueOperand());
+ }
+ }
+ for (auto *I : ScalarPtrs)
+ if (!PossibleNonScalarPtrs.count(I)) {
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ Worklist.insert(I);
+ }
+
+ // Insert the forced scalars.
+ // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // induction variable when the PHI user is scalarized.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (ForcedScalar != ForcedScalars.end())
+ for (auto *I : ForcedScalar->second)
+ Worklist.insert(I);
+
+ // Expand the worklist by looking through any bitcasts and getelementptr
+ // instructions we've already identified as scalar. This is similar to the
+ // expansion step in collectLoopUniforms(); however, here we're only
+ // expanding to include additional bitcasts and getelementptr instructions.
+ unsigned Idx = 0;
+ while (Idx != Worklist.size()) {
+ Instruction *Dst = Worklist[Idx++];
+ if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+ continue;
+ auto *Src = cast<Instruction>(Dst->getOperand(0));
+ if (llvm::all_of(Src->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+ isScalarUse(J, Src));
+ })) {
+ Worklist.insert(Src);
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ }
+ }
+
+ // An induction variable will remain scalar if all users of the induction
+ // variable and induction variable update remain scalar.
+ for (auto &Induction : Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // If tail-folding is applied, the primary induction variable will be used
+ // to feed a vector compare.
+ if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
+ continue;
+
+ // Determine if all users of the induction variable are scalar after
+ // vectorization.
+ auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // scalar after vectorization.
+ auto ScalarIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain scalar.
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
+ }
+
+ Scalars[VF].insert(Worklist.begin(), Worklist.end());
+}
+
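
The function above is a fixed-point worklist computation: seed a set, then repeatedly admit an operand once every one of its users is already acceptable. A minimal standalone sketch of that pattern, with a placeholder Node type and operandsOf/usersOf/isAcceptableUser callbacks standing in for Instruction operands, use lists and the scalar-use test (all assumptions, not LLVM API):

#include <cstddef>
#include <functional>
#include <set>
#include <vector>

using Node = int; // placeholder for Instruction *

// Generic worklist expansion: an operand is admitted once every one of its
// users is either already in the set or accepted by the extra filter.
std::set<Node> expandWorklist(
    const std::vector<Node> &Seeds,
    const std::function<std::vector<Node>(Node)> &operandsOf,
    const std::function<std::vector<Node>(Node)> &usersOf,
    const std::function<bool(Node)> &isAcceptableUser) {
  std::set<Node> InSet(Seeds.begin(), Seeds.end());
  std::vector<Node> Worklist(Seeds.begin(), Seeds.end());
  for (std::size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    for (Node Src : operandsOf(Worklist[Idx])) {
      if (InSet.count(Src))
        continue;
      bool AllUsersOk = true;
      for (Node U : usersOf(Src))
        if (!InSet.count(U) && !isAcceptableUser(U)) {
          AllUsersOk = false;
          break;
        }
      if (AllUsersOk) {
        InSet.insert(Src);
        Worklist.push_back(Src);
      }
    }
  }
  return InSet;
}
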
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
ElementCount VF) {
- if (!blockNeedsPredication(I->getParent()))
- return false;
- switch(I->getOpcode()) {
- default:
- break;
- case Instruction::Load:
- case Instruction::Store: {
- if (!Legal->isMaskRequired(I))
- return false;
- auto *Ptr = getLoadStorePointerOperand(I);
- auto *Ty = getMemInstValueType(I);
- // We have already decided how to vectorize this instruction, get that
- // result.
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ auto *Ptr = getLoadStorePointerOperand(I);
+ auto *Ty = getMemInstValueType(I);
+ // We have already decided how to vectorize this instruction, get that
+ // result.
if (VF.isVector()) {
- InstWidening WideningDecision = getWideningDecision(I, VF);
- assert(WideningDecision != CM_Unknown &&
- "Widening decision should be ready at this moment");
- return WideningDecision == CM_Scalarize;
- }
- const Align Alignment = getLoadStoreAlignment(I);
- return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
- isLegalMaskedGather(Ty, Alignment))
- : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
- isLegalMaskedScatter(Ty, Alignment));
- }
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- return mayDivideByZero(*I);
- }
- return false;
-}
-
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ return WideningDecision == CM_Scalarize;
+ }
+ const Align Alignment = getLoadStoreAlignment(I);
+ return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
+ isLegalMaskedGather(Ty, Alignment))
+ : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
+ isLegalMaskedScatter(Ty, Alignment));
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return mayDivideByZero(*I);
+ }
+ return false;
+}
+
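
The UDiv/SDiv/SRem/URem cases above are scalarized under predication because executing a division speculatively for masked-off lanes could trap. An illustrative source loop of that shape (not taken from the patch):

// The divide is guarded by the branch: vectorizing it unconditionally could
// divide by zero on lanes where b[i] == 0, so under predication the divide is
// scalarized and predicated instead.
void guardedDiv(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      a[i] = a[i] / b[i];
}
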
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
Instruction *I, ElementCount VF) {
- assert(isAccessInterleaved(I) && "Expecting interleaved access.");
- assert(getWideningDecision(I, VF) == CM_Unknown &&
- "Decision should not be set yet.");
- auto *Group = getInterleavedAccessGroup(I);
- assert(Group && "Must have a group.");
-
- // If the instruction's allocated size doesn't equal its type size, it
- // requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
- auto *ScalarTy = getMemInstValueType(I);
+ assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+ assert(getWideningDecision(I, VF) == CM_Unknown &&
+ "Decision should not be set yet.");
+ auto *Group = getInterleavedAccessGroup(I);
+ assert(Group && "Must have a group.");
+
+ // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = getMemInstValueType(I);
if (hasIrregularType(ScalarTy, DL))
- return false;
-
- // Check if masking is required.
- // A Group may need masking for one of two reasons: it resides in a block that
- // needs predication, or it was decided to use masking to deal with gaps.
- bool PredicatedAccessRequiresMasking =
- Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
- bool AccessWithGapsRequiresMasking =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
- if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
- return true;
-
- // If masked interleaving is required, we expect that the user/target had
- // enabled it, because otherwise it either wouldn't have been created or
- // it should have been invalidated by the CostModel.
- assert(useMaskedInterleavedAccesses(TTI) &&
- "Masked interleave-groups for predicated accesses are not enabled.");
-
- auto *Ty = getMemInstValueType(I);
- const Align Alignment = getLoadStoreAlignment(I);
- return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
- : TTI.isLegalMaskedStore(Ty, Alignment);
-}
-
+ return false;
+
+ // Check if masking is required.
+ // A Group may need masking for one of two reasons: it resides in a block that
+ // needs predication, or it was decided to use masking to deal with gaps.
+ bool PredicatedAccessRequiresMasking =
+ Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+ bool AccessWithGapsRequiresMasking =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+ if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
+ return true;
+
+ // If masked interleaving is required, we expect that the user/target had
+ // enabled it, because otherwise it either wouldn't have been created or
+ // it should have been invalidated by the CostModel.
+ assert(useMaskedInterleavedAccesses(TTI) &&
+ "Masked interleave-groups for predicated accesses are not enabled.");
+
+ auto *Ty = getMemInstValueType(I);
+ const Align Alignment = getLoadStoreAlignment(I);
+ return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
+ : TTI.isLegalMaskedStore(Ty, Alignment);
+}
+
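
The masking decision above boils down to two independent reasons plus a final legality query. A condensed restatement as a free-standing predicate, with every parameter standing in for the corresponding Legal/Group/TTI query (the irregular-type check is omitted for brevity):

// An interleave group can be widened when it needs no mask at all, or when
// masked interleaved accesses are enabled and the masked form is legal.
bool canWidenInterleaveGroup(bool BlockNeedsPredication, bool MaskRequired,
                             bool GroupNeedsScalarEpilogue,
                             bool ScalarEpilogueAllowed,
                             bool MaskedInterleaveEnabled, bool LegalMaskedOp) {
  bool PredicatedAccessRequiresMasking = BlockNeedsPredication && MaskRequired;
  bool AccessWithGapsRequiresMasking =
      GroupNeedsScalarEpilogue && !ScalarEpilogueAllowed;
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;
  return MaskedInterleaveEnabled && LegalMaskedOp;
}
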
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
Instruction *I, ElementCount VF) {
- // Get and ensure we have a valid memory instruction.
- LoadInst *LI = dyn_cast<LoadInst>(I);
- StoreInst *SI = dyn_cast<StoreInst>(I);
- assert((LI || SI) && "Invalid memory instruction");
-
- auto *Ptr = getLoadStorePointerOperand(I);
-
- // In order to be widened, the pointer should be consecutive, first of all.
- if (!Legal->isConsecutivePtr(Ptr))
- return false;
-
- // If the instruction is a store located in a predicated block, it will be
- // scalarized.
- if (isScalarWithPredication(I))
- return false;
-
- // If the instruction's allocated size doesn't equal its type size, it
- // requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
- auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ // Get and ensure we have a valid memory instruction.
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ assert((LI || SI) && "Invalid memory instruction");
+
+ auto *Ptr = getLoadStorePointerOperand(I);
+
+ // In order to be widened, the pointer should be consecutive, first of all.
+ if (!Legal->isConsecutivePtr(Ptr))
+ return false;
+
+ // If the instruction is a store located in a predicated block, it will be
+ // scalarized.
+ if (isScalarWithPredication(I))
+ return false;
+
+ // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL))
- return false;
-
- return true;
-}
-
+ return false;
+
+ return true;
+}
+
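
The allocated-size-versus-type-size test above is a padding check. A hedged standalone sketch: assuming the usual DataLayout rounding to the ABI alignment, an i32 needs no padding while an odd-sized integer such as i20 is allocated in a wider slot, so packing VF copies tightly into a vector would not match VF scalar allocations:

#include <cstdint>
#include <iostream>

// A type whose in-memory allocation is rounded up beyond its nominal bit
// width cannot be widened as a tightly packed vector (assumed rounding rule).
bool requiresPadding(uint64_t TypeSizeInBits, uint64_t AbiAlignBytes) {
  uint64_t SlotBits = 8 * AbiAlignBytes;
  uint64_t AllocBits = (TypeSizeInBits + SlotBits - 1) / SlotBits * SlotBits;
  return AllocBits != TypeSizeInBits;
}

int main() {
  std::cout << requiresPadding(32, 4) << '\n'; // i32: 0, can be widened
  std::cout << requiresPadding(20, 4) << '\n'; // i20: 1, will be scalarized
}
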
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
- // We should not collect Uniforms more than once per VF. Right now,
- // this function is called from collectUniformsAndScalars(), which
- // already does this check. Collecting Uniforms for VF=1 does not make any
- // sense.
-
+ // We should not collect Uniforms more than once per VF. Right now,
+ // this function is called from collectUniformsAndScalars(), which
+ // already does this check. Collecting Uniforms for VF=1 does not make any
+ // sense.
+
assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
- "This function should not be visited twice for the same VF");
-
- // Visit the list of Uniforms. If we do not find any uniform value, we will
- // not analyze again. Uniforms.count(VF) will return 1.
- Uniforms[VF].clear();
-
- // We now know that the loop is vectorizable!
- // Collect instructions inside the loop that will remain uniform after
- // vectorization.
-
- // Global values, params and instructions outside of the current loop are out of
- // scope.
- auto isOutOfScope = [&](Value *V) -> bool {
- Instruction *I = dyn_cast<Instruction>(V);
- return (!I || !TheLoop->contains(I));
- };
-
- SetVector<Instruction *> Worklist;
- BasicBlock *Latch = TheLoop->getLoopLatch();
-
- // Instructions that are scalar with predication must not be considered
- // uniform after vectorization, because that would create an erroneous
- // replicating region where only a single instance out of VF should be formed.
- // TODO: optimize such rare cases if found important, see PR40816.
- auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+ "This function should not be visited twice for the same VF");
+
+ // Visit the list of Uniforms. If we do not find any uniform value, we will
+ // not analyze again. Uniforms.count(VF) will return 1.
+ Uniforms[VF].clear();
+
+ // We now know that the loop is vectorizable!
+ // Collect instructions inside the loop that will remain uniform after
+ // vectorization.
+
+ // Global values, params and instructions outside of the current loop are out of
+ // scope.
+ auto isOutOfScope = [&](Value *V) -> bool {
+ Instruction *I = dyn_cast<Instruction>(V);
+ return (!I || !TheLoop->contains(I));
+ };
+
+ SetVector<Instruction *> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+
+ // Instructions that are scalar with predication must not be considered
+ // uniform after vectorization, because that would create an erroneous
+ // replicating region where only a single instance out of VF should be formed.
+ // TODO: optimize such rare cases if found important, see PR40816.
+ auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
if (isOutOfScope(I)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I, VF)) {
- LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
- << *I << "\n");
- return;
- }
- LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
- Worklist.insert(I);
- };
-
- // Start with the conditional branch. If the branch condition is an
- // instruction contained in the loop that is only used by the branch, it is
- // uniform.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
- addToWorklistIfAllowed(Cmp);
-
+ if (isScalarWithPredication(I, VF)) {
+ LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
+ << *I << "\n");
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
+ Worklist.insert(I);
+ };
+
+ // Start with the conditional branch. If the branch condition is an
+ // instruction contained in the loop that is only used by the branch, it is
+ // uniform.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ addToWorklistIfAllowed(Cmp);
+
auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
- InstWidening WideningDecision = getWideningDecision(I, VF);
- assert(WideningDecision != CM_Unknown &&
- "Widening decision should be ready at this moment");
-
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+
// A uniform memory op is itself uniform. We exclude uniform stores
// here as they demand the last lane, not the first one.
if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
@@ -5322,10 +5322,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
return true;
}
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
- };
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave);
+ };
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5343,24 +5343,24 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// Scan the loop for instructions which are either a) known to have only
// lane 0 demanded or b) are uses which demand only lane 0 of their operand.
- for (auto *BB : TheLoop->blocks())
- for (auto &I : *BB) {
- // If there's no pointer operand, there's nothing to do.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ // If there's no pointer operand, there's nothing to do.
auto *Ptr = getLoadStorePointerOperand(&I);
- if (!Ptr)
- continue;
-
+ if (!Ptr)
+ continue;
+
// A uniform memory op is itself uniform. We exclude uniform stores
// here as they demand the last lane, not the first one.
if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
addToWorklistIfAllowed(&I);
-
+
if (isUniformDecision(&I, VF)) {
assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
HasUniformUse.insert(Ptr);
}
- }
-
+ }
+
// Add to the worklist any operands which have *only* uniform (e.g. lane 0
// demanding) users. Since loops are assumed to be in LCSSA form, this
// disallows uses outside the loop as well.
@@ -5375,156 +5375,156 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (UsersAreMemAccesses)
addToWorklistIfAllowed(I);
}
-
- // Expand Worklist in topological order: whenever a new instruction
- // is added, its users should already be inside Worklist. This ensures
- // a uniform instruction will only be used by uniform instructions.
- unsigned idx = 0;
- while (idx != Worklist.size()) {
- Instruction *I = Worklist[idx++];
-
- for (auto OV : I->operand_values()) {
- // isOutOfScope operands cannot be uniform instructions.
- if (isOutOfScope(OV))
- continue;
- // First order recurrence Phi's should typically be considered
- // non-uniform.
- auto *OP = dyn_cast<PHINode>(OV);
- if (OP && Legal->isFirstOrderRecurrence(OP))
- continue;
- // If all the users of the operand are uniform, then add the
- // operand into the uniform worklist.
- auto *OI = cast<Instruction>(OV);
- if (llvm::all_of(OI->users(), [&](User *U) -> bool {
- auto *J = cast<Instruction>(U);
+
+ // Expand Worklist in topological order: whenever a new instruction
+ // is added, its users should already be inside Worklist. This ensures
+ // a uniform instruction will only be used by uniform instructions.
+ unsigned idx = 0;
+ while (idx != Worklist.size()) {
+ Instruction *I = Worklist[idx++];
+
+ for (auto OV : I->operand_values()) {
+ // isOutOfScope operands cannot be uniform instructions.
+ if (isOutOfScope(OV))
+ continue;
+ // First order recurrence Phi's should typically be considered
+ // non-uniform.
+ auto *OP = dyn_cast<PHINode>(OV);
+ if (OP && Legal->isFirstOrderRecurrence(OP))
+ continue;
+ // If all the users of the operand are uniform, then add the
+ // operand into the uniform worklist.
+ auto *OI = cast<Instruction>(OV);
+ if (llvm::all_of(OI->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
- }))
- addToWorklistIfAllowed(OI);
- }
- }
-
- // For an instruction to be added into Worklist above, all its users inside
- // the loop should also be in Worklist. However, this condition cannot be
- // true for phi nodes that form a cyclic dependence. We must process phi
- // nodes separately. An induction variable will remain uniform if all users
- // of the induction variable and induction variable update remain uniform.
- // The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : Legal->getInductionVars()) {
- auto *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
- // Determine if all users of the induction variable are uniform after
- // vectorization.
- auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
- isVectorizedMemAccessUse(I, Ind);
- });
- if (!UniformInd)
- continue;
-
- // Determine if all users of the induction variable update instruction are
- // uniform after vectorization.
- auto UniformIndUpdate =
- llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
- isVectorizedMemAccessUse(I, IndUpdate);
- });
- if (!UniformIndUpdate)
- continue;
-
- // The induction variable and its update instruction will remain uniform.
- addToWorklistIfAllowed(Ind);
- addToWorklistIfAllowed(IndUpdate);
- }
-
- Uniforms[VF].insert(Worklist.begin(), Worklist.end());
-}
-
-bool LoopVectorizationCostModel::runtimeChecksRequired() {
- LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
-
- if (Legal->getRuntimePointerChecking()->Need) {
- reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
- "runtime pointer checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz",
- "CantVersionLoopWithOptForSize", ORE, TheLoop);
- return true;
- }
-
- if (!PSE.getUnionPredicate().getPredicates().empty()) {
- reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
- "runtime SCEV checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz",
- "CantVersionLoopWithOptForSize", ORE, TheLoop);
- return true;
- }
-
- // FIXME: Avoid specializing for stride==1 instead of bailing out.
- if (!Legal->getLAI()->getSymbolicStrides().empty()) {
- reportVectorizationFailure("Runtime stride check for small trip count",
- "runtime stride == 1 checks needed. Enable vectorization of "
- "this loop without such check by compiling with -Os/-Oz",
- "CantVersionLoopWithOptForSize", ORE, TheLoop);
- return true;
- }
-
- return false;
-}
-
+ }))
+ addToWorklistIfAllowed(OI);
+ }
+ }
+
+ // For an instruction to be added into Worklist above, all its users inside
+ // the loop should also be in Worklist. However, this condition cannot be
+ // true for phi nodes that form a cyclic dependence. We must process phi
+ // nodes separately. An induction variable will remain uniform if all users
+ // of the induction variable and induction variable update remain uniform.
+ // The code below handles both pointer and non-pointer induction variables.
+ for (auto &Induction : Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // Determine if all users of the induction variable are uniform after
+ // vectorization.
+ auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, Ind);
+ });
+ if (!UniformInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // uniform after vectorization.
+ auto UniformIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, IndUpdate);
+ });
+ if (!UniformIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain uniform.
+ addToWorklistIfAllowed(Ind);
+ addToWorklistIfAllowed(IndUpdate);
+ }
+
+ Uniforms[VF].insert(Worklist.begin(), Worklist.end());
+}
+
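
The induction-variable handling above is special because the PHI and its update form a two-node cycle, so neither can pass the generic all-users-already-uniform rule alone. A standalone sketch of the pairwise check (users outside the loop are ignored here for brevity; the integer ids and sets are illustrative stand-ins):

#include <set>
#include <vector>

// Ind and IndUpdate stay uniform only if, ignoring each other, every one of
// their users is already known to be uniform.
bool inductionPairStaysUniform(int Ind, int IndUpdate,
                               const std::vector<int> &IndUsers,
                               const std::vector<int> &IndUpdateUsers,
                               const std::set<int> &Uniform) {
  auto allOk = [&](const std::vector<int> &Users, int Other) {
    for (int U : Users)
      if (U != Other && !Uniform.count(U))
        return false;
    return true;
  };
  return allOk(IndUsers, IndUpdate) && allOk(IndUpdateUsers, Ind);
}
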
+bool LoopVectorizationCostModel::runtimeChecksRequired() {
+ LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
+
+ if (Legal->getRuntimePointerChecking()->Need) {
+ reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
+ "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ if (!PSE.getUnionPredicate().getPredicates().empty()) {
+ reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
+ "runtime SCEV checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ // FIXME: Avoid specializing for stride==1 instead of bailing out.
+ if (!Legal->getLAI()->getSymbolicStrides().empty()) {
+ reportVectorizationFailure("Runtime stride check for small trip count",
+ "runtime stride == 1 checks needed. Enable vectorization of "
+ "this loop without such check by compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ return false;
+}
+
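
All three diagnostics above point at the same escape hatch when building with -Os/-Oz: opting the loop in explicitly. A minimal example of that opt-in (the function itself is illustrative):

// Under -Os/-Oz a loop that needs runtime pointer checks is normally rejected;
// the pragma re-enables vectorization for this specific loop.
void saxpy(float *a, const float *b, float k, int n) {
#pragma clang loop vectorize(enable)
  for (int i = 0; i < n; ++i)
    a[i] += k * b[i];
}
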
Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
- if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
-    // TODO: It may be useful to do this since it's still likely to be dynamically
- // uniform if the target can skip.
- reportVectorizationFailure(
- "Not inserting runtime ptr check for divergent target",
- "runtime pointer checks needed. Not enabled for divergent target",
- "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
- return None;
- }
-
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
- if (TC == 1) {
- reportVectorizationFailure("Single iteration (non) loop",
- "loop trip count is one, irrelevant for vectorization",
- "SingleIterationLoop", ORE, TheLoop);
- return None;
- }
-
- switch (ScalarEpilogueStatus) {
- case CM_ScalarEpilogueAllowed:
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+    // TODO: It may be useful to do this since it's still likely to be dynamically
+ // uniform if the target can skip.
+ reportVectorizationFailure(
+ "Not inserting runtime ptr check for divergent target",
+ "runtime pointer checks needed. Not enabled for divergent target",
+ "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
+ return None;
+ }
+
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+ if (TC == 1) {
+ reportVectorizationFailure("Single iteration (non) loop",
+ "loop trip count is one, irrelevant for vectorization",
+ "SingleIterationLoop", ORE, TheLoop);
+ return None;
+ }
+
+ switch (ScalarEpilogueStatus) {
+ case CM_ScalarEpilogueAllowed:
return computeFeasibleMaxVF(TC, UserVF);
case CM_ScalarEpilogueNotAllowedUsePredicate:
LLVM_FALLTHROUGH;
- case CM_ScalarEpilogueNotNeededUsePredicate:
- LLVM_DEBUG(
- dbgs() << "LV: vector predicate hint/switch found.\n"
- << "LV: Not allowing scalar epilogue, creating predicated "
- << "vector loop.\n");
- break;
- case CM_ScalarEpilogueNotAllowedLowTripLoop:
- // fallthrough as a special case of OptForSize
- case CM_ScalarEpilogueNotAllowedOptSize:
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
- LLVM_DEBUG(
- dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
- else
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
- << "count.\n");
-
- // Bail if runtime checks are required, which are not good when optimising
- // for size.
- if (runtimeChecksRequired())
- return None;
-
- break;
- }
-
+ case CM_ScalarEpilogueNotNeededUsePredicate:
+ LLVM_DEBUG(
+ dbgs() << "LV: vector predicate hint/switch found.\n"
+ << "LV: Not allowing scalar epilogue, creating predicated "
+ << "vector loop.\n");
+ break;
+ case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ // fallthrough as a special case of OptForSize
+ case CM_ScalarEpilogueNotAllowedOptSize:
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ LLVM_DEBUG(
+ dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ else
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
+ << "count.\n");
+
+ // Bail if runtime checks are required, which are not good when optimising
+ // for size.
+ if (runtimeChecksRequired())
+ return None;
+
+ break;
+ }
+
// The only loops we can vectorize without a scalar epilogue, are loops with
// a bottom-test and a single exiting block. We'd have to handle the fact
// that not every instruction executes on the last iteration. This will
@@ -5541,18 +5541,18 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
- // Now try the tail folding
-
- // Invalidate interleave groups that require an epilogue if we can't mask
- // the interleave-group.
- if (!useMaskedInterleavedAccesses(TTI)) {
- assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
- "No decisions should have been taken at this point");
- // Note: There is no need to invalidate any cost modeling decisions here, as
-    // none were taken so far.
- InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
- }
-
+ // Now try the tail folding
+
+ // Invalidate interleave groups that require an epilogue if we can't mask
+ // the interleave-group.
+ if (!useMaskedInterleavedAccesses(TTI)) {
+ assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
+ "No decisions should have been taken at this point");
+ // Note: There is no need to invalidate any cost modeling decisions here, as
+    // none were taken so far.
+ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+ }
+
ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
assert(!MaxVF.isScalable() &&
"Scalable vectors do not yet support tail folding");
@@ -5569,20 +5569,20 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
const SCEV *Rem = SE->getURemExpr(
ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
if (Rem->isZero()) {
- // Accept MaxVF if we do not have a tail.
- LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
- return MaxVF;
- }
-
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- if (Legal->prepareToFoldTailByMasking()) {
- FoldTailByMasking = true;
- return MaxVF;
- }
-
+ // Accept MaxVF if we do not have a tail.
+ LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ return MaxVF;
+ }
+
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ if (Legal->prepareToFoldTailByMasking()) {
+ FoldTailByMasking = true;
+ return MaxVF;
+ }
+
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
@@ -5597,23 +5597,23 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
- if (TC == 0) {
- reportVectorizationFailure(
- "Unable to calculate the loop count due to complex control flow",
- "unable to calculate the loop count due to complex control flow",
- "UnknownLoopCountComplexCFG", ORE, TheLoop);
- return None;
- }
-
- reportVectorizationFailure(
- "Cannot optimize for size and vectorize at the same time.",
- "cannot optimize for size and vectorize at the same time. "
- "Enable vectorization of this loop with '#pragma clang loop "
- "vectorize(enable)' when compiling with -Os/-Oz",
- "NoTailLoopWithOptForSize", ORE, TheLoop);
- return None;
-}
-
+ if (TC == 0) {
+ reportVectorizationFailure(
+ "Unable to calculate the loop count due to complex control flow",
+ "unable to calculate the loop count due to complex control flow",
+ "UnknownLoopCountComplexCFG", ORE, TheLoop);
+ return None;
+ }
+
+ reportVectorizationFailure(
+ "Cannot optimize for size and vectorize at the same time.",
+ "cannot optimize for size and vectorize at the same time. "
+ "Enable vectorization of this loop with '#pragma clang loop "
+ "vectorize(enable)' when compiling with -Os/-Oz",
+ "NoTailLoopWithOptForSize", ORE, TheLoop);
+ return None;
+}
+
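
The tail check above takes the trip count modulo MaxVF * IC. A small worked sketch with assumed numbers (MaxVF = 8, IC = 2): a trip count of 1024 leaves no remainder and needs no tail, while 1000 leaves 8 iterations that must be handled by tail folding or a scalar epilogue:

#include <cstdint>
#include <iostream>

// Remainder of the trip count modulo MaxVF * IC decides whether a tail is
// left over after the vectorized iterations.
uint64_t tailIterations(uint64_t TripCount, uint64_t MaxVF, uint64_t IC) {
  return TripCount % (MaxVF * IC);
}

int main() {
  std::cout << tailIterations(1024, 8, 2) << '\n'; // 0 -> no tail remains
  std::cout << tailIterations(1000, 8, 2) << '\n'; // 8 -> fold tail or epilogue
}
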
ElementCount
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF) {
@@ -5641,24 +5641,24 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
Legal->isSafeForAnyVectorWidth())
return UserVF;
- MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
- unsigned SmallestType, WidestType;
- std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
- unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-
- // Get the maximum safe dependence distance in bits computed by LAA.
- // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
- // the memory access that is most restrictive (involved in the smallest
- // dependence distance).
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+ unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+
+ // Get the maximum safe dependence distance in bits computed by LAA.
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+ // the memory access that is most restrictive (involved in the smallest
+ // dependence distance).
unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
-
+
// If the user vectorization factor is legally unsafe, clamp it to a safe
// value. Otherwise, return as is.
if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
unsigned MaxSafeElements =
PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
-
+
if (UserVF.isScalable()) {
Optional<unsigned> MaxVScale = TTI.getMaxVScale();
@@ -5707,71 +5707,71 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
- // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
- // Note that both WidestRegister and WidestType may not be powers of 2.
- unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
-
- LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
- << " / " << WidestType << " bits.\n");
- LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
- << WidestRegister << " bits.\n");
-
+ // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
+ // Note that both WidestRegister and WidestType may not be powers of 2.
+ unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
+
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+ << WidestRegister << " bits.\n");
+
assert(MaxVectorSize <= WidestRegister &&
"Did not expect to pack so many elements"
" into one vector!");
- if (MaxVectorSize == 0) {
- LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
- MaxVectorSize = 1;
+ if (MaxVectorSize == 0) {
+ LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ MaxVectorSize = 1;
return ElementCount::getFixed(MaxVectorSize);
- } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
- isPowerOf2_32(ConstTripCount)) {
- // We need to clamp the VF to be the ConstTripCount. There is no point in
- // choosing a higher viable VF as done in the loop below.
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
- << ConstTripCount << "\n");
- MaxVectorSize = ConstTripCount;
+ } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
+ isPowerOf2_32(ConstTripCount)) {
+ // We need to clamp the VF to be the ConstTripCount. There is no point in
+ // choosing a higher viable VF as done in the loop below.
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
+ MaxVectorSize = ConstTripCount;
return ElementCount::getFixed(MaxVectorSize);
- }
-
- unsigned MaxVF = MaxVectorSize;
- if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
- (MaximizeBandwidth && isScalarEpilogueAllowed())) {
- // Collect all viable vectorization factors larger than the default MaxVF
- // (i.e. MaxVectorSize).
+ }
+
+ unsigned MaxVF = MaxVectorSize;
+ if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
+ (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ // Collect all viable vectorization factors larger than the default MaxVF
+ // (i.e. MaxVectorSize).
SmallVector<ElementCount, 8> VFs;
- unsigned NewMaxVectorSize = WidestRegister / SmallestType;
- for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
+ unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+ for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
VFs.push_back(ElementCount::getFixed(VS));
-
- // For each VF calculate its register usage.
- auto RUs = calculateRegisterUsage(VFs);
-
- // Select the largest VF which doesn't require more registers than existing
- // ones.
- for (int i = RUs.size() - 1; i >= 0; --i) {
- bool Selected = true;
- for (auto& pair : RUs[i].MaxLocalUsers) {
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
- if (pair.second > TargetNumRegisters)
- Selected = false;
- }
- if (Selected) {
+
+ // For each VF calculate its register usage.
+ auto RUs = calculateRegisterUsage(VFs);
+
+ // Select the largest VF which doesn't require more registers than existing
+ // ones.
+ for (int i = RUs.size() - 1; i >= 0; --i) {
+ bool Selected = true;
+ for (auto& pair : RUs[i].MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ if (pair.second > TargetNumRegisters)
+ Selected = false;
+ }
+ if (Selected) {
MaxVF = VFs[i].getKnownMinValue();
- break;
- }
- }
- if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
- if (MaxVF < MinVF) {
- LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
- << ") with target's minimum: " << MinVF << '\n');
- MaxVF = MinVF;
- }
- }
- }
+ break;
+ }
+ }
+ if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
+ if (MaxVF < MinVF) {
+ LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
+ << ") with target's minimum: " << MinVF << '\n');
+ MaxVF = MinVF;
+ }
+ }
+ }
return ElementCount::getFixed(MaxVF);
-}
-
-VectorizationFactor
+}
+
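
The clamping above is straightforward integer arithmetic. A worked sketch under assumed inputs (256-bit widest register, 32-bit widest element type, constant trip count 4): the default bound is PowerOf2Floor(256 / 32) = 8, and the power-of-two trip count 4 clamps it down to 4:

#include <cstdint>
#include <iostream>

uint64_t powerOf2Floor(uint64_t V) {
  if (V == 0)
    return 0;
  uint64_t P = 1;
  while (P * 2 <= V)
    P *= 2;
  return P;
}

bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }

// Same shape as the clamping above: start from register width over the widest
// element type, then clamp to a small power-of-two constant trip count.
uint64_t feasibleMaxVF(uint64_t WidestRegisterBits, uint64_t WidestTypeBits,
                       uint64_t ConstTripCount) {
  uint64_t MaxVectorSize = powerOf2Floor(WidestRegisterBits / WidestTypeBits);
  if (MaxVectorSize == 0)
    return 1;
  if (ConstTripCount && ConstTripCount < MaxVectorSize &&
      isPowerOf2(ConstTripCount))
    return ConstTripCount;
  return MaxVectorSize;
}

int main() { std::cout << feasibleMaxVF(256, 32, 4) << '\n'; } // prints 4
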
+VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
// FIXME: This can be fixed for scalable vectors later, because at this stage
// the LoopVectorizer will only consider vectorizing a loop with scalable
@@ -5782,33 +5782,33 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
- unsigned Width = 1;
+ unsigned Width = 1;
const float ScalarCost = *ExpectedCost.getValue();
float Cost = ScalarCost;
-
- bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+
+ bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && MaxVF.isVector()) {
- // Ignore scalar width, because the user explicitly wants vectorization.
- // Initialize cost to max so that VF = 2 is, at least, chosen during cost
- // evaluation.
- Cost = std::numeric_limits<float>::max();
- }
-
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+ // evaluation.
+ Cost = std::numeric_limits<float>::max();
+ }
+
for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
- // Notice that the vector loop needs to be executed fewer times, so
- // we need to divide the cost of the vector loop by the width of
- // the vector elements.
+ // Notice that the vector loop needs to be executed fewer times, so
+ // we need to divide the cost of the vector loop by the width of
+ // the vector elements.
VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
float VectorCost = *C.first.getValue() / (float)i;
- LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (int)VectorCost << ".\n");
- if (!C.second && !ForceVectorization) {
- LLVM_DEBUG(
- dbgs() << "LV: Not considering vector loop of width " << i
- << " because it will not generate any vector instructions.\n");
- continue;
- }
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
+ if (!C.second && !ForceVectorization) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Not considering vector loop of width " << i
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
// If profitable add it to ProfitableVF list.
if (VectorCost < ScalarCost) {
@@ -5816,29 +5816,29 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
{ElementCount::getFixed(i), (unsigned)VectorCost}));
}
- if (VectorCost < Cost) {
- Cost = VectorCost;
- Width = i;
- }
- }
-
- if (!EnableCondStoresVectorization && NumPredStores) {
- reportVectorizationFailure("There are conditional stores.",
- "store that is conditionally executed prevents vectorization",
- "ConditionalStore", ORE, TheLoop);
- Width = 1;
- Cost = ScalarCost;
- }
-
- LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
- << "LV: Vectorization seems to be not beneficial, "
- << "but was forced by a user.\n");
- LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
+
+ if (!EnableCondStoresVectorization && NumPredStores) {
+ reportVectorizationFailure("There are conditional stores.",
+ "store that is conditionally executed prevents vectorization",
+ "ConditionalStore", ORE, TheLoop);
+ Width = 1;
+ Cost = ScalarCost;
+ }
+
+ LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization seems to be not beneficial, "
+ << "but was forced by a user.\n");
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
VectorizationFactor Factor = {ElementCount::getFixed(Width),
(unsigned)(Width * Cost)};
- return Factor;
-}
-
+ return Factor;
+}
+
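
Width selection above minimizes cost per scalar lane. A sketch with assumed costs: scalar cost 10, cost 14 at VF = 2 (7 per lane), cost 16 at VF = 4 (4 per lane), so VF = 4 wins:

#include <cstddef>
#include <iostream>
#include <vector>

// Costs[i] is the assumed total loop-body cost at VF = 1 << i (Costs[0] is
// the scalar cost); pick the width with the lowest cost per scalar lane.
unsigned selectWidth(const std::vector<float> &Costs) {
  unsigned Width = 1, VF = 1;
  float Best = Costs[0];
  for (std::size_t i = 1; i < Costs.size(); ++i) {
    VF *= 2;
    float PerLane = Costs[i] / VF;
    if (PerLane < Best) {
      Best = PerLane;
      Width = VF;
    }
  }
  return Width;
}

int main() {
  std::cout << selectWidth({10.0f, 14.0f, 16.0f}) << '\n'; // prints 4
}
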
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
const Loop &L, ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
@@ -5959,163 +5959,163 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
-std::pair<unsigned, unsigned>
-LoopVectorizationCostModel::getSmallestAndWidestTypes() {
- unsigned MinWidth = -1U;
- unsigned MaxWidth = 8;
- const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-
- // For each block.
- for (BasicBlock *BB : TheLoop->blocks()) {
- // For each instruction in the loop.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- Type *T = I.getType();
-
- // Skip ignored values.
- if (ValuesToIgnore.count(&I))
- continue;
-
- // Only examine Loads, Stores and PHINodes.
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
- continue;
-
- // Examine PHI nodes that are reduction variables. Update the type to
- // account for the recurrence type.
- if (auto *PN = dyn_cast<PHINode>(&I)) {
- if (!Legal->isReductionVariable(PN))
- continue;
- RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+ unsigned MinWidth = -1U;
+ unsigned MaxWidth = 8;
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the loop.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ Type *T = I.getType();
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&I))
+ continue;
+
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
+ continue;
+
+ // Examine PHI nodes that are reduction variables. Update the type to
+ // account for the recurrence type.
+ if (auto *PN = dyn_cast<PHINode>(&I)) {
+ if (!Legal->isReductionVariable(PN))
+ continue;
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
if (PreferInLoopReductions ||
TTI.preferInLoopReduction(RdxDesc.getOpcode(),
RdxDesc.getRecurrenceType(),
TargetTransformInfo::ReductionFlags()))
continue;
- T = RdxDesc.getRecurrenceType();
- }
-
- // Examine the stored values.
- if (auto *ST = dyn_cast<StoreInst>(&I))
- T = ST->getValueOperand()->getType();
-
- // Ignore loaded pointer types and stored pointer types that are not
- // vectorizable.
- //
- // FIXME: The check here attempts to predict whether a load or store will
- // be vectorized. We only know this for certain after a VF has
- // been selected. Here, we assume that if an access can be
- // vectorized, it will be. We should also look at extending this
- // optimization to non-pointer types.
- //
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
- continue;
-
- MinWidth = std::min(MinWidth,
- (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
- MaxWidth = std::max(MaxWidth,
- (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
- }
- }
-
- return {MinWidth, MaxWidth};
-}
-
+ T = RdxDesc.getRecurrenceType();
+ }
+
+ // Examine the stored values.
+ if (auto *ST = dyn_cast<StoreInst>(&I))
+ T = ST->getValueOperand()->getType();
+
+ // Ignore loaded pointer types and stored pointer types that are not
+ // vectorizable.
+ //
+ // FIXME: The check here attempts to predict whether a load or store will
+ // be vectorized. We only know this for certain after a VF has
+ // been selected. Here, we assume that if an access can be
+ // vectorized, it will be. We should also look at extending this
+ // optimization to non-pointer types.
+ //
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+ !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
+ continue;
+
+ MinWidth = std::min(MinWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ }
+ }
+
+ return {MinWidth, MaxWidth};
+}
+
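
As a quick worked example of what the scan above yields: a loop that loads i8 elements and stores i32 results has {SmallestType, WidestType} = {8, 32}; the widest type bounds the default MaxVF, while the smallest type is what the maximize-bandwidth path divides by instead. The source shape (illustrative only) is:

// The i8 loads and i32 stores give {MinWidth, MaxWidth} = {8, 32} for the
// scan above.
void widen(const unsigned char *Src, unsigned int *Dst, int N) {
  for (int I = 0; I < N; ++I)
    Dst[I] = Src[I] * 3u;
}
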
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
- unsigned LoopCost) {
- // -- The interleave heuristics --
- // We interleave the loop in order to expose ILP and reduce the loop overhead.
- // There are many micro-architectural considerations that we can't predict
- // at this level. For example, frontend pressure (on decode or fetch) due to
- // code size, or the number and capabilities of the execution ports.
- //
- // We use the following heuristics to select the interleave count:
- // 1. If the code has reductions, then we interleave to break the cross
- // iteration dependency.
- // 2. If the loop is really small, then we interleave to reduce the loop
- // overhead.
- // 3. We don't interleave if we think that we will spill registers to memory
- // due to the increased register pressure.
-
- if (!isScalarEpilogueAllowed())
- return 1;
-
- // We used the distance for the interleave count.
- if (Legal->getMaxSafeDepDistBytes() != -1U)
- return 1;
-
+ unsigned LoopCost) {
+ // -- The interleave heuristics --
+ // We interleave the loop in order to expose ILP and reduce the loop overhead.
+ // There are many micro-architectural considerations that we can't predict
+ // at this level. For example, frontend pressure (on decode or fetch) due to
+ // code size, or the number and capabilities of the execution ports.
+ //
+ // We use the following heuristics to select the interleave count:
+ // 1. If the code has reductions, then we interleave to break the cross
+ // iteration dependency.
+ // 2. If the loop is really small, then we interleave to reduce the loop
+ // overhead.
+ // 3. We don't interleave if we think that we will spill registers to memory
+ // due to the increased register pressure.
+
+ if (!isScalarEpilogueAllowed())
+ return 1;
+
+ // We used the distance for the interleave count.
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ return 1;
+
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
- // Do not interleave loops with a relatively small known or estimated trip
+ // Do not interleave loops with a relatively small known or estimated trip
// count. But we will interleave when InterleaveSmallLoopScalarReduction is
// enabled, and the code has scalar reductions(HasReductions && VF = 1),
// because with the above conditions interleaving can expose ILP and break
// cross iteration dependences for reductions.
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
!(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
- return 1;
-
- RegisterUsage R = calculateRegisterUsage({VF})[0];
- // We divide by these constants so assume that we have at least one
- // instruction that uses at least one register.
- for (auto& pair : R.MaxLocalUsers) {
- pair.second = std::max(pair.second, 1U);
- }
-
- // We calculate the interleave count using the following formula.
- // Subtract the number of loop invariants from the number of available
- // registers. These registers are used by all of the interleaved instances.
- // Next, divide the remaining registers by the number of registers that is
- // required by the loop, in order to estimate how many parallel instances
- // fit without causing spills. All of this is rounded down if necessary to be
- // a power of two. We want power of two interleave count to simplify any
- // addressing operations or alignment considerations.
- // We also want power of two interleave counts to ensure that the induction
- // variable of the vector loop wraps to zero, when tail is folded by masking;
- // this currently happens when OptForSize, in which case IC is set to 1 above.
- unsigned IC = UINT_MAX;
-
- for (auto& pair : R.MaxLocalUsers) {
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
- LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
- << " registers of "
- << TTI.getRegisterClassName(pair.first) << " register class\n");
+ return 1;
+
+ RegisterUsage R = calculateRegisterUsage({VF})[0];
+ // We divide by these constants so assume that we have at least one
+ // instruction that uses at least one register.
+ for (auto& pair : R.MaxLocalUsers) {
+ pair.second = std::max(pair.second, 1U);
+ }
+
+ // We calculate the interleave count using the following formula.
+ // Subtract the number of loop invariants from the number of available
+ // registers. These registers are used by all of the interleaved instances.
+ // Next, divide the remaining registers by the number of registers that is
+ // required by the loop, in order to estimate how many parallel instances
+ // fit without causing spills. All of this is rounded down if necessary to be
+ // a power of two. We want power of two interleave count to simplify any
+ // addressing operations or alignment considerations.
+ // We also want power of two interleave counts to ensure that the induction
+ // variable of the vector loop wraps to zero, when tail is folded by masking;
+ // this currently happens when OptForSize, in which case IC is set to 1 above.
+ unsigned IC = UINT_MAX;
+
+ for (auto& pair : R.MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers of "
+ << TTI.getRegisterClassName(pair.first) << " register class\n");
if (VF.isScalar()) {
- if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumScalarRegs;
- } else {
- if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumVectorRegs;
- }
- unsigned MaxLocalUsers = pair.second;
- unsigned LoopInvariantRegs = 0;
- if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
- LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
-
- unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
- // Don't count the induction variable as interleaved.
- if (EnableIndVarRegisterHeur) {
- TmpIC =
- PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
- std::max(1U, (MaxLocalUsers - 1)));
- }
-
- IC = std::min(IC, TmpIC);
- }
-
- // Clamp the interleave ranges to reasonable counts.
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
+ unsigned MaxLocalUsers = pair.second;
+ unsigned LoopInvariantRegs = 0;
+ if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
+ LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
+
+ unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+ // Don't count the induction variable as interleaved.
+ if (EnableIndVarRegisterHeur) {
+ TmpIC =
+ PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+ std::max(1U, (MaxLocalUsers - 1)));
+ }
+
+ IC = std::min(IC, TmpIC);
+ }
+
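
The loop above applies IC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers) per register class and keeps the minimum. A standalone re-derivation under assumed numbers (16 registers, 2 loop-invariant, 5 local users):

#include <algorithm>
#include <iostream>

// Per-register-class interleave count from register pressure; the concrete
// numbers below are assumptions for illustration.
unsigned interleaveForClass(unsigned TargetNumRegisters,
                            unsigned LoopInvariantRegs,
                            unsigned MaxLocalUsers) {
  MaxLocalUsers = std::max(MaxLocalUsers, 1U);
  unsigned Avail = (TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers;
  unsigned IC = 1;
  while (IC * 2 <= Avail) // power-of-two floor
    IC *= 2;
  return IC;
}

int main() {
  // (16 - 2) / 5 = 2, and PowerOf2Floor(2) = 2, so two interleaved copies fit.
  std::cout << interleaveForClass(16, 2, 5) << '\n';
}
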
+ // Clamp the interleave ranges to reasonable counts.
unsigned MaxInterleaveCount =
TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
-
- // Check if the user has overridden the max.
+
+ // Check if the user has overridden the max.
if (VF.isScalar()) {
- if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
- } else {
- if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
- }
-
- // If trip count is known or estimated compile time constant, limit the
+ if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
+ } else {
+ if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
+ }
+
+ // If trip count is known or estimated compile time constant, limit the
// interleave count to be less than the trip count divided by VF, provided it
// is at least 1.
//
@@ -6125,24 +6125,24 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// similar benefit as for fixed-width vectors. For now, we choose to leave
// the InterleaveCount as if vscale is '1', although if some information about
// the vector is known (e.g. min vector size), we can make a better decision.
- if (BestKnownTC) {
+ if (BestKnownTC) {
MaxInterleaveCount =
std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
// Make sure MaxInterleaveCount is greater than 0.
MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
- }
-
+ }
+
assert(MaxInterleaveCount > 0 &&
"Maximum interleave count must be greater than 0");
-
- // Clamp the calculated IC to be between 1 and the max interleave count
- // that the target and trip count allow.
- if (IC > MaxInterleaveCount)
- IC = MaxInterleaveCount;
+
+  // Clamp the calculated IC to be between 1 and the max interleave count
+  // that the target and trip count allow.
+ if (IC > MaxInterleaveCount)
+ IC = MaxInterleaveCount;
else
// Make sure IC is greater than 0.
IC = std::max(1u, IC);
-
+
assert(IC > 0 && "Interleave count must be greater than 0.");
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -6154,57 +6154,57 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
assert(LoopCost && "Non-zero loop cost expected");
- // Interleave if we vectorized this loop and there is a reduction that could
- // benefit from interleaving.
+ // Interleave if we vectorized this loop and there is a reduction that could
+ // benefit from interleaving.
if (VF.isVector() && HasReductions) {
- LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
- return IC;
- }
-
- // Note that if we've already vectorized the loop we will have done the
- // runtime check and so interleaving won't require further checks.
- bool InterleavingRequiresRuntimePointerCheck =
+ LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ return IC;
+ }
+
+ // Note that if we've already vectorized the loop we will have done the
+ // runtime check and so interleaving won't require further checks.
+ bool InterleavingRequiresRuntimePointerCheck =
(VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
-
- // We want to interleave small loops in order to reduce the loop overhead and
- // potentially expose ILP opportunities.
+
+ // We want to interleave small loops in order to reduce the loop overhead and
+ // potentially expose ILP opportunities.
LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
<< "LV: IC is " << IC << '\n'
<< "LV: VF is " << VF << '\n');
const bool AggressivelyInterleaveReductions =
TTI.enableAggressiveInterleaving(HasReductions);
- if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
- // We assume that the cost overhead is 1 and we use the cost model
- // to estimate the cost of the loop and interleave until the cost of the
- // loop overhead is about 5% of the cost of the loop.
- unsigned SmallIC =
- std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
-
- // Interleave until store/load ports (estimated by max interleave count) are
- // saturated.
- unsigned NumStores = Legal->getNumStores();
- unsigned NumLoads = Legal->getNumLoads();
- unsigned StoresIC = IC / (NumStores ? NumStores : 1);
- unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
-
- // If we have a scalar reduction (vector reductions are already dealt with
- // by this point), we can increase the critical path length if the loop
- // we're interleaving is inside another loop. Limit, by default to 2, so the
- // critical path only gets increased by one reduction operation.
+ if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and interleave until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ unsigned SmallIC =
+ std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
+ // Interleave until store/load ports (estimated by max interleave count) are
+ // saturated.
+ unsigned NumStores = Legal->getNumStores();
+ unsigned NumLoads = Legal->getNumLoads();
+ unsigned StoresIC = IC / (NumStores ? NumStores : 1);
+ unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
+
+ // If we have a scalar reduction (vector reductions are already dealt with
+ // by this point), we can increase the critical path length if the loop
+ // we're interleaving is inside another loop. Limit, by default to 2, so the
+ // critical path only gets increased by one reduction operation.
if (HasReductions && TheLoop->getLoopDepth() > 1) {
- unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
- SmallIC = std::min(SmallIC, F);
- StoresIC = std::min(StoresIC, F);
- LoadsIC = std::min(LoadsIC, F);
- }
-
- if (EnableLoadStoreRuntimeInterleave &&
- std::max(StoresIC, LoadsIC) > SmallIC) {
- LLVM_DEBUG(
- dbgs() << "LV: Interleaving to saturate store or load ports.\n");
- return std::max(StoresIC, LoadsIC);
- }
-
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
+ SmallIC = std::min(SmallIC, F);
+ StoresIC = std::min(StoresIC, F);
+ LoadsIC = std::min(LoadsIC, F);
+ }
+
+ if (EnableLoadStoreRuntimeInterleave &&
+ std::max(StoresIC, LoadsIC) > SmallIC) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ return std::max(StoresIC, LoadsIC);
+ }
+
// If there are scalar reductions and TTI has enabled aggressive
// interleaving for reductions, we will interleave to expose ILP.
if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
@@ -6217,611 +6217,611 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
return SmallIC;
}
- }
-
- // Interleave if this is a large loop (small loops are already dealt with by
- // this point) that could benefit from interleaving.
+ }
+
+ // Interleave if this is a large loop (small loops are already dealt with by
+ // this point) that could benefit from interleaving.
if (AggressivelyInterleaveReductions) {
- LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
- return IC;
- }
-
- LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
- return 1;
-}
-
-SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ return IC;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ return 1;
+}
+
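The selection above bounds the interleave count, per register class, by register pressure: the class's register budget minus registers held by loop-invariant values (and optionally the induction variable), divided by the peak number of simultaneously live in-loop values, rounded down to a power of two, and later clamped by the target's maximum interleave factor and the trip count. A minimal standalone sketch of that per-class bound, using illustrative names and standard C++ only (nothing here is LLVM API):

#include <algorithm>

// Round down to a power of two (0 maps to 0), mirroring PowerOf2Floor.
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P <= X / 2)
    P *= 2;
  return X == 0 ? 0 : P;
}

// TargetNumRegisters: registers in this class; LoopInvariantRegs: values live
// across the whole loop; MaxLocalUsers: peak in-loop live values (from the
// register-usage analysis below); DiscountIndVar: do not count the induction
// variable as interleaved. Assumes the invariant count fits in the budget,
// as the pass does.
unsigned registerPressureIC(unsigned TargetNumRegisters,
                            unsigned LoopInvariantRegs,
                            unsigned MaxLocalUsers, bool DiscountIndVar) {
  MaxLocalUsers = std::max(1U, MaxLocalUsers);
  unsigned IC =
      powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
  if (DiscountIndVar)
    IC = powerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                       std::max(1U, MaxLocalUsers - 1));
  return std::max(1U, IC);
}

For example, with an assumed budget of 32 registers, 2 loop-invariant values and a peak of 7 live values, the bound is PowerOf2Floor(30 / 7) = 4, which the code above then clamps against TTI.getMaxInterleaveFactor and the known or estimated trip count.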
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
- // This function calculates the register usage by measuring the highest number
- // of values that are alive at a single location. Obviously, this is a very
-  // rough estimation. We scan the loop in topological order and
- // assign a number to each instruction. We use RPO to ensure that defs are
- // met before their users. We assume that each instruction that has in-loop
- // users starts an interval. We record every time that an in-loop value is
- // used, so we have a list of the first and last occurrences of each
- // instruction. Next, we transpose this data structure into a multi map that
- // holds the list of intervals that *end* at a specific location. This multi
- // map allows us to perform a linear search. We scan the instructions linearly
- // and record each time that a new interval starts, by placing it in a set.
- // If we find this value in the multi-map then we remove it from the set.
- // The max register usage is the maximum size of the set.
- // We also search for instructions that are defined outside the loop, but are
- // used inside the loop. We need this number separately from the max-interval
- // usage number because when we unroll, loop-invariant values do not take
-  // more registers.
- LoopBlocksDFS DFS(TheLoop);
- DFS.perform(LI);
-
- RegisterUsage RU;
-
- // Each 'key' in the map opens a new interval. The values
- // of the map are the index of the 'last seen' usage of the
- // instruction that is the key.
- using IntervalMap = DenseMap<Instruction *, unsigned>;
-
- // Maps instruction to its index.
- SmallVector<Instruction *, 64> IdxToInstr;
- // Marks the end of each interval.
- IntervalMap EndPoint;
- // Saves the list of instruction indices that are used in the loop.
- SmallPtrSet<Instruction *, 8> Ends;
- // Saves the list of values that are used in the loop but are
- // defined outside the loop, such as arguments and constants.
- SmallPtrSet<Value *, 8> LoopInvariants;
-
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- IdxToInstr.push_back(&I);
-
- // Save the end location of each USE.
- for (Value *U : I.operands()) {
- auto *Instr = dyn_cast<Instruction>(U);
-
- // Ignore non-instruction values such as arguments, constants, etc.
- if (!Instr)
- continue;
-
- // If this instruction is outside the loop then record it and continue.
- if (!TheLoop->contains(Instr)) {
- LoopInvariants.insert(Instr);
- continue;
- }
-
- // Overwrite previous end points.
- EndPoint[Instr] = IdxToInstr.size();
- Ends.insert(Instr);
- }
- }
- }
-
- // Saves the list of intervals that end with the index in 'key'.
- using InstrList = SmallVector<Instruction *, 2>;
- DenseMap<unsigned, InstrList> TransposeEnds;
-
- // Transpose the EndPoints to a list of values that end at each index.
- for (auto &Interval : EndPoint)
- TransposeEnds[Interval.second].push_back(Interval.first);
-
- SmallPtrSet<Instruction *, 8> OpenIntervals;
- SmallVector<RegisterUsage, 8> RUs(VFs.size());
- SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
-
- LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
-
- // A lambda that gets the register usage for the given type and VF.
+ // This function calculates the register usage by measuring the highest number
+ // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in topological order and
+ // assign a number to each instruction. We use RPO to ensure that defs are
+ // met before their users. We assume that each instruction that has in-loop
+ // users starts an interval. We record every time that an in-loop value is
+ // used, so we have a list of the first and last occurrences of each
+ // instruction. Next, we transpose this data structure into a multi map that
+ // holds the list of intervals that *end* at a specific location. This multi
+ // map allows us to perform a linear search. We scan the instructions linearly
+ // and record each time that a new interval starts, by placing it in a set.
+ // If we find this value in the multi-map then we remove it from the set.
+ // The max register usage is the maximum size of the set.
+ // We also search for instructions that are defined outside the loop, but are
+ // used inside the loop. We need this number separately from the max-interval
+ // usage number because when we unroll, loop-invariant values do not take
+  // more registers.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+
+ RegisterUsage RU;
+
+ // Each 'key' in the map opens a new interval. The values
+ // of the map are the index of the 'last seen' usage of the
+ // instruction that is the key.
+ using IntervalMap = DenseMap<Instruction *, unsigned>;
+
+ // Maps instruction to its index.
+ SmallVector<Instruction *, 64> IdxToInstr;
+ // Marks the end of each interval.
+ IntervalMap EndPoint;
+ // Saves the list of instruction indices that are used in the loop.
+ SmallPtrSet<Instruction *, 8> Ends;
+ // Saves the list of values that are used in the loop but are
+ // defined outside the loop, such as arguments and constants.
+ SmallPtrSet<Value *, 8> LoopInvariants;
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ IdxToInstr.push_back(&I);
+
+ // Save the end location of each USE.
+ for (Value *U : I.operands()) {
+ auto *Instr = dyn_cast<Instruction>(U);
+
+ // Ignore non-instruction values such as arguments, constants, etc.
+ if (!Instr)
+ continue;
+
+ // If this instruction is outside the loop then record it and continue.
+ if (!TheLoop->contains(Instr)) {
+ LoopInvariants.insert(Instr);
+ continue;
+ }
+
+ // Overwrite previous end points.
+ EndPoint[Instr] = IdxToInstr.size();
+ Ends.insert(Instr);
+ }
+ }
+ }
+
+ // Saves the list of intervals that end with the index in 'key'.
+ using InstrList = SmallVector<Instruction *, 2>;
+ DenseMap<unsigned, InstrList> TransposeEnds;
+
+ // Transpose the EndPoints to a list of values that end at each index.
+ for (auto &Interval : EndPoint)
+ TransposeEnds[Interval.second].push_back(Interval.first);
+
+ SmallPtrSet<Instruction *, 8> OpenIntervals;
+ SmallVector<RegisterUsage, 8> RUs(VFs.size());
+ SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
+
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+ // A lambda that gets the register usage for the given type and VF.
const auto &TTICapture = TTI;
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
- return 0U;
+ return 0U;
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
- };
-
- for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
- Instruction *I = IdxToInstr[i];
-
- // Remove all of the instructions that end at this location.
- InstrList &List = TransposeEnds[i];
- for (Instruction *ToRemove : List)
- OpenIntervals.erase(ToRemove);
-
- // Ignore instructions that are never used within the loop.
- if (!Ends.count(I))
- continue;
-
- // Skip ignored values.
- if (ValuesToIgnore.count(I))
- continue;
-
- // For each VF find the maximum usage of registers.
- for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
- // Count the number of live intervals.
- SmallMapVector<unsigned, unsigned, 4> RegUsage;
-
+ };
+
+ for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
+ Instruction *I = IdxToInstr[i];
+
+ // Remove all of the instructions that end at this location.
+ InstrList &List = TransposeEnds[i];
+ for (Instruction *ToRemove : List)
+ OpenIntervals.erase(ToRemove);
+
+ // Ignore instructions that are never used within the loop.
+ if (!Ends.count(I))
+ continue;
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(I))
+ continue;
+
+ // For each VF find the maximum usage of registers.
+ for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ // Count the number of live intervals.
+ SmallMapVector<unsigned, unsigned, 4> RegUsage;
+
if (VFs[j].isScalar()) {
- for (auto Inst : OpenIntervals) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
- }
- } else {
- collectUniformsAndScalars(VFs[j]);
- for (auto Inst : OpenIntervals) {
- // Skip ignored values for VF > 1.
- if (VecValuesToIgnore.count(Inst))
- continue;
- if (isScalarAfterVectorization(Inst, VFs[j])) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
- } else {
- unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
- else
- RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
- }
- }
- }
-
- for (auto& pair : RegUsage) {
- if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
- MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
- else
- MaxUsages[j][pair.first] = pair.second;
- }
- }
-
- LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
- << OpenIntervals.size() << '\n');
-
- // Add the current instruction to the list of open intervals.
- OpenIntervals.insert(I);
- }
-
- for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
- SmallMapVector<unsigned, unsigned, 4> Invariant;
-
- for (auto Inst : LoopInvariants) {
+ for (auto Inst : OpenIntervals) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ }
+ } else {
+ collectUniformsAndScalars(VFs[j]);
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.count(Inst))
+ continue;
+ if (isScalarAfterVectorization(Inst, VFs[j])) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ } else {
+ unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
+ else
+ RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ }
+ }
+
+ for (auto& pair : RegUsage) {
+ if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
+ MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
+ else
+ MaxUsages[j][pair.first] = pair.second;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
+
+ // Add the current instruction to the list of open intervals.
+ OpenIntervals.insert(I);
+ }
+
+ for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ SmallMapVector<unsigned, unsigned, 4> Invariant;
+
+ for (auto Inst : LoopInvariants) {
unsigned Usage =
VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
unsigned ClassID =
TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
- if (Invariant.find(ClassID) == Invariant.end())
- Invariant[ClassID] = Usage;
- else
- Invariant[ClassID] += Usage;
- }
-
- LLVM_DEBUG({
- dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
- dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
- << " item\n";
- for (const auto &pair : MaxUsages[i]) {
- dbgs() << "LV(REG): RegisterClass: "
- << TTI.getRegisterClassName(pair.first) << ", " << pair.second
- << " registers\n";
- }
- dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
- << " item\n";
- for (const auto &pair : Invariant) {
- dbgs() << "LV(REG): RegisterClass: "
- << TTI.getRegisterClassName(pair.first) << ", " << pair.second
- << " registers\n";
- }
- });
-
- RU.LoopInvariantRegs = Invariant;
- RU.MaxLocalUsers = MaxUsages[i];
- RUs[i] = RU;
- }
-
- return RUs;
-}
-
-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
- // TODO: Cost model for emulated masked load/store is completely
- // broken. This hack guides the cost model to use an artificially
- // high enough value to practically disable vectorization with such
- // operations, except where previously deployed legality hack allowed
- // using very low cost values. This is to avoid regressions coming simply
- // from moving "masked load/store" check from legality to cost model.
- // Masked Load/Gather emulation was previously never allowed.
- // Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
- return isa<LoadInst>(I) ||
- (isa<StoreInst>(I) &&
- NumPredStores > NumberOfStoresToPredicate);
-}
-
+ if (Invariant.find(ClassID) == Invariant.end())
+ Invariant[ClassID] = Usage;
+ else
+ Invariant[ClassID] += Usage;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
+ dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
+ << " item\n";
+ for (const auto &pair : MaxUsages[i]) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
+ << " item\n";
+ for (const auto &pair : Invariant) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ });
+
+ RU.LoopInvariantRegs = Invariant;
+ RU.MaxLocalUsers = MaxUsages[i];
+ RUs[i] = RU;
+ }
+
+ return RUs;
+}
+
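calculateRegisterUsage above is a live-interval sweep: instructions are numbered in reverse post order, each value's interval runs from its definition to its last in-loop use, the end points are transposed into per-index lists, and the peak size of the set of open intervals is the pressure estimate (loop-invariant inputs are tracked separately). A toy, self-contained version of that sweep, with plain integers standing in for instructions and none of the per-class TTI bookkeeping:

#include <algorithm>
#include <map>
#include <set>
#include <vector>

// Defs[i] is the value defined at position i (in RPO); Uses[i] lists the
// values read at position i. Returns the peak number of open intervals.
unsigned maxOpenIntervals(const std::vector<int> &Defs,
                          const std::vector<std::vector<int>> &Uses) {
  // Index of the last use of every value; later uses overwrite earlier ones.
  std::map<int, unsigned> EndPoint;
  for (unsigned i = 0; i < Uses.size(); ++i)
    for (int V : Uses[i])
      EndPoint[V] = i;

  // Transpose: for each index, the values whose interval ends there.
  std::map<unsigned, std::vector<int>> EndsAt;
  for (const auto &KV : EndPoint)
    EndsAt[KV.second].push_back(KV.first);

  std::set<int> Open;
  unsigned Max = 0;
  for (unsigned i = 0; i < Defs.size(); ++i) {
    for (int V : EndsAt[i])            // close intervals ending here
      Open.erase(V);
    if (!EndPoint.count(Defs[i]))      // value never used in-loop: ignore it
      continue;
    Max = std::max<unsigned>(Max, Open.size());
    Open.insert(Defs[i]);              // this definition opens a new interval
  }
  return Max;
}

In the pass itself the open set is additionally split per TTI register class, and vector values are weighted by GetRegUsage, but the interval bookkeeping follows this shape.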
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+ // TODO: Cost model for emulated masked load/store is completely
+ // broken. This hack guides the cost model to use an artificially
+ // high enough value to practically disable vectorization with such
+ // operations, except where previously deployed legality hack allowed
+ // using very low cost values. This is to avoid regressions coming simply
+ // from moving "masked load/store" check from legality to cost model.
+ // Masked Load/Gather emulation was previously never allowed.
+ // Limited number of Masked Store/Scatter emulation was allowed.
+ assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
+ return isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ NumPredStores > NumberOfStoresToPredicate);
+}
+
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
- // If we aren't vectorizing the loop, or if we've already collected the
- // instructions to scalarize, there's nothing to do. Collection may already
- // have occurred if we have a user-selected VF and are now computing the
- // expected cost for interleaving.
+ // If we aren't vectorizing the loop, or if we've already collected the
+ // instructions to scalarize, there's nothing to do. Collection may already
+ // have occurred if we have a user-selected VF and are now computing the
+ // expected cost for interleaving.
if (VF.isScalar() || VF.isZero() ||
InstsToScalarize.find(VF) != InstsToScalarize.end())
- return;
-
-  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
- // not profitable to scalarize any instructions, the presence of VF in the
- // map will indicate that we've analyzed it already.
- ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
-
- // Find all the instructions that are scalar with predication in the loop and
- // determine if it would be better to not if-convert the blocks they are in.
- // If so, we also record the instructions to scalarize.
- for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockNeedsPredication(BB))
- continue;
- for (Instruction &I : *BB)
- if (isScalarWithPredication(&I)) {
- ScalarCostsTy ScalarCosts;
- // Do not apply discount logic if hacked cost is needed
- // for emulated masked memrefs.
- if (!useEmulatedMaskMemRefHack(&I) &&
- computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
- ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
- // Remember that BB will remain after vectorization.
- PredicatedBBsAfterVectorization.insert(BB);
- }
- }
-}
-
-int LoopVectorizationCostModel::computePredInstDiscount(
+ return;
+
+  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+ // not profitable to scalarize any instructions, the presence of VF in the
+ // map will indicate that we've analyzed it already.
+ ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+ // Find all the instructions that are scalar with predication in the loop and
+ // determine if it would be better to not if-convert the blocks they are in.
+ // If so, we also record the instructions to scalarize.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!blockNeedsPredication(BB))
+ continue;
+ for (Instruction &I : *BB)
+ if (isScalarWithPredication(&I)) {
+ ScalarCostsTy ScalarCosts;
+ // Do not apply discount logic if hacked cost is needed
+ // for emulated masked memrefs.
+ if (!useEmulatedMaskMemRefHack(&I) &&
+ computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+ // Remember that BB will remain after vectorization.
+ PredicatedBBsAfterVectorization.insert(BB);
+ }
+ }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
- assert(!isUniformAfterVectorization(PredInst, VF) &&
- "Instruction marked uniform-after-vectorization will be predicated");
-
- // Initialize the discount to zero, meaning that the scalar version and the
- // vector version cost the same.
+ assert(!isUniformAfterVectorization(PredInst, VF) &&
+ "Instruction marked uniform-after-vectorization will be predicated");
+
+ // Initialize the discount to zero, meaning that the scalar version and the
+ // vector version cost the same.
InstructionCost Discount = 0;
-
- // Holds instructions to analyze. The instructions we visit are mapped in
- // ScalarCosts. Those instructions are the ones that would be scalarized if
- // we find that the scalar version costs less.
- SmallVector<Instruction *, 8> Worklist;
-
- // Returns true if the given instruction can be scalarized.
- auto canBeScalarized = [&](Instruction *I) -> bool {
- // We only attempt to scalarize instructions forming a single-use chain
- // from the original predicated block that would otherwise be vectorized.
- // Although not strictly necessary, we give up on instructions we know will
- // already be scalar to avoid traversing chains that are unlikely to be
- // beneficial.
- if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
- isScalarAfterVectorization(I, VF))
- return false;
-
- // If the instruction is scalar with predication, it will be analyzed
- // separately. We ignore it within the context of PredInst.
- if (isScalarWithPredication(I))
- return false;
-
- // If any of the instruction's operands are uniform after vectorization,
- // the instruction cannot be scalarized. This prevents, for example, a
- // masked load from being scalarized.
- //
- // We assume we will only emit a value for lane zero of an instruction
- // marked uniform after vectorization, rather than VF identical values.
- // Thus, if we scalarize an instruction that uses a uniform, we would
- // create uses of values corresponding to the lanes we aren't emitting code
- // for. This behavior can be changed by allowing getScalarValue to clone
- // the lane zero values for uniforms rather than asserting.
- for (Use &U : I->operands())
- if (auto *J = dyn_cast<Instruction>(U.get()))
- if (isUniformAfterVectorization(J, VF))
- return false;
-
- // Otherwise, we can scalarize the instruction.
- return true;
- };
-
- // Compute the expected cost discount from scalarizing the entire expression
- // feeding the predicated instruction. We currently only consider expressions
- // that are single-use instruction chains.
- Worklist.push_back(PredInst);
- while (!Worklist.empty()) {
- Instruction *I = Worklist.pop_back_val();
-
- // If we've already analyzed the instruction, there's nothing to do.
- if (ScalarCosts.find(I) != ScalarCosts.end())
- continue;
-
- // Compute the cost of the vector instruction. Note that this cost already
- // includes the scalarization overhead of the predicated instruction.
+
+ // Holds instructions to analyze. The instructions we visit are mapped in
+ // ScalarCosts. Those instructions are the ones that would be scalarized if
+ // we find that the scalar version costs less.
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Returns true if the given instruction can be scalarized.
+ auto canBeScalarized = [&](Instruction *I) -> bool {
+ // We only attempt to scalarize instructions forming a single-use chain
+ // from the original predicated block that would otherwise be vectorized.
+ // Although not strictly necessary, we give up on instructions we know will
+ // already be scalar to avoid traversing chains that are unlikely to be
+ // beneficial.
+ if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+ isScalarAfterVectorization(I, VF))
+ return false;
+
+ // If the instruction is scalar with predication, it will be analyzed
+ // separately. We ignore it within the context of PredInst.
+ if (isScalarWithPredication(I))
+ return false;
+
+ // If any of the instruction's operands are uniform after vectorization,
+ // the instruction cannot be scalarized. This prevents, for example, a
+ // masked load from being scalarized.
+ //
+ // We assume we will only emit a value for lane zero of an instruction
+ // marked uniform after vectorization, rather than VF identical values.
+ // Thus, if we scalarize an instruction that uses a uniform, we would
+ // create uses of values corresponding to the lanes we aren't emitting code
+ // for. This behavior can be changed by allowing getScalarValue to clone
+ // the lane zero values for uniforms rather than asserting.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (isUniformAfterVectorization(J, VF))
+ return false;
+
+ // Otherwise, we can scalarize the instruction.
+ return true;
+ };
+
+ // Compute the expected cost discount from scalarizing the entire expression
+ // feeding the predicated instruction. We currently only consider expressions
+ // that are single-use instruction chains.
+ Worklist.push_back(PredInst);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+
+ // If we've already analyzed the instruction, there's nothing to do.
+ if (ScalarCosts.find(I) != ScalarCosts.end())
+ continue;
+
+ // Compute the cost of the vector instruction. Note that this cost already
+ // includes the scalarization overhead of the predicated instruction.
InstructionCost VectorCost = getInstructionCost(I, VF).first;
-
- // Compute the cost of the scalarized instruction. This cost is the cost of
- // the instruction as if it wasn't if-converted and instead remained in the
- // predicated block. We will scale this cost by block probability after
- // computing the scalarization overhead.
+
+ // Compute the cost of the scalarized instruction. This cost is the cost of
+ // the instruction as if it wasn't if-converted and instead remained in the
+ // predicated block. We will scale this cost by block probability after
+ // computing the scalarization overhead.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
InstructionCost ScalarCost =
VF.getKnownMinValue() *
getInstructionCost(I, ElementCount::getFixed(1)).first;
-
- // Compute the scalarization overhead of needed insertelement instructions
- // and phi nodes.
- if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
- ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(I->getType(), VF)),
+
+ // Compute the scalarization overhead of needed insertelement instructions
+ // and phi nodes.
+ if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ ScalarCost += TTI.getScalarizationOverhead(
+ cast<VectorType>(ToVectorTy(I->getType(), VF)),
APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
assert(!VF.isScalable() && "scalable vectors not yet supported.");
ScalarCost +=
VF.getKnownMinValue() *
TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
- }
-
- // Compute the scalarization overhead of needed extractelement
- // instructions. For each of the instruction's operands, if the operand can
- // be scalarized, add it to the worklist; otherwise, account for the
- // overhead.
- for (Use &U : I->operands())
- if (auto *J = dyn_cast<Instruction>(U.get())) {
- assert(VectorType::isValidElementType(J->getType()) &&
- "Instruction has non-scalar type");
- if (canBeScalarized(J))
- Worklist.push_back(J);
+ }
+
+ // Compute the scalarization overhead of needed extractelement
+ // instructions. For each of the instruction's operands, if the operand can
+ // be scalarized, add it to the worklist; otherwise, account for the
+ // overhead.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get())) {
+ assert(VectorType::isValidElementType(J->getType()) &&
+ "Instruction has non-scalar type");
+ if (canBeScalarized(J))
+ Worklist.push_back(J);
else if (needsExtract(J, VF)) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(J->getType(), VF)),
+ ScalarCost += TTI.getScalarizationOverhead(
+ cast<VectorType>(ToVectorTy(J->getType(), VF)),
APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
}
- }
-
- // Scale the total scalar cost by block probability.
- ScalarCost /= getReciprocalPredBlockProb();
-
- // Compute the discount. A non-negative discount means the vector version
- // of the instruction costs more, and scalarizing would be beneficial.
- Discount += VectorCost - ScalarCost;
- ScalarCosts[I] = ScalarCost;
- }
-
+ }
+
+ // Scale the total scalar cost by block probability.
+ ScalarCost /= getReciprocalPredBlockProb();
+
+ // Compute the discount. A non-negative discount means the vector version
+ // of the instruction costs more, and scalarizing would be beneficial.
+ Discount += VectorCost - ScalarCost;
+ ScalarCosts[I] = ScalarCost;
+ }
+
return *Discount.getValue();
-}
-
-LoopVectorizationCostModel::VectorizationCostTy
+}
+
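computePredInstDiscount above weighs the if-converted vector form of a predicated chain against a scalarized, predicated alternative: the scalar side pays VF copies of the scalar instruction plus insert/extract and phi overhead, and is then divided by getReciprocalPredBlockProb() (the reciprocal of the predicated block's execution probability, assumed to be 2 here). A toy calculation with made-up costs, only to show the arithmetic:

#include <cstdio>

int main() {
  unsigned VF = 4;
  int VectorCost = 12;           // cost of the predicated vector form (assumed)
  int ScalarInstrCost = 2;       // cost of one scalar copy (assumed)
  int ScalarizationOverhead = 6; // inserts/extracts and phis (assumed)
  int ReciprocalPredBlockProb = 2;

  int ScalarCost = (int(VF) * ScalarInstrCost + ScalarizationOverhead) /
                   ReciprocalPredBlockProb;   // (4*2 + 6) / 2 = 7
  int Discount = VectorCost - ScalarCost;     // 12 - 7 = 5
  std::printf("discount = %d -> scalarize: %s\n", Discount,
              Discount >= 0 ? "yes" : "no");
  return 0;
}

A non-negative total over the whole single-use chain means the scalarized version is the cheaper one, so its per-instruction costs are recorded in ScalarCosts and later consulted by isProfitableToScalarize.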
+LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(ElementCount VF) {
- VectorizationCostTy Cost;
-
- // For each block.
- for (BasicBlock *BB : TheLoop->blocks()) {
- VectorizationCostTy BlockCost;
-
- // For each instruction in the old loop.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- // Skip ignored values.
+ VectorizationCostTy Cost;
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ VectorizationCostTy BlockCost;
+
+ // For each instruction in the old loop.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ // Skip ignored values.
if (ValuesToIgnore.count(&I) ||
(VF.isVector() && VecValuesToIgnore.count(&I)))
- continue;
-
- VectorizationCostTy C = getInstructionCost(&I, VF);
-
- // Check if we should override the cost.
- if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ continue;
+
+ VectorizationCostTy C = getInstructionCost(&I, VF);
+
+ // Check if we should override the cost.
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0)
C.first = InstructionCost(ForceTargetInstructionCost);
-
- BlockCost.first += C.first;
- BlockCost.second |= C.second;
- LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
- << " for VF " << VF << " For instruction: " << I
- << '\n');
- }
-
- // If we are vectorizing a predicated block, it will have been
- // if-converted. This means that the block's instructions (aside from
- // stores and instructions that may divide by zero) will now be
- // unconditionally executed. For the scalar case, we may not always execute
+
+ BlockCost.first += C.first;
+ BlockCost.second |= C.second;
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
+ << " for VF " << VF << " For instruction: " << I
+ << '\n');
+ }
+
+ // If we are vectorizing a predicated block, it will have been
+ // if-converted. This means that the block's instructions (aside from
+ // stores and instructions that may divide by zero) will now be
+ // unconditionally executed. For the scalar case, we may not always execute
// the predicated block, if it is an if-else block. Thus, scale the block's
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost.first /= getReciprocalPredBlockProb();
-
- Cost.first += BlockCost.first;
- Cost.second |= BlockCost.second;
- }
-
- return Cost;
-}
-
-/// Gets Address Access SCEV after verifying that the access pattern
-/// is loop invariant except the induction variable dependence.
-///
-/// This SCEV can be sent to the Target in order to estimate the address
-/// calculation cost.
-static const SCEV *getAddressAccessSCEV(
- Value *Ptr,
- LoopVectorizationLegality *Legal,
- PredicatedScalarEvolution &PSE,
- const Loop *TheLoop) {
-
- auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- if (!Gep)
- return nullptr;
-
- // We are looking for a gep with all loop invariant indices except for one
- // which should be an induction variable.
- auto SE = PSE.getSE();
- unsigned NumOperands = Gep->getNumOperands();
- for (unsigned i = 1; i < NumOperands; ++i) {
- Value *Opd = Gep->getOperand(i);
- if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
- !Legal->isInductionVariable(Opd))
- return nullptr;
- }
-
-  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
- return PSE.getSCEV(Ptr);
-}
-
-static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
- return Legal->hasStride(I->getOperand(0)) ||
- Legal->hasStride(I->getOperand(1));
-}
-
+ BlockCost.first /= getReciprocalPredBlockProb();
+
+ Cost.first += BlockCost.first;
+ Cost.second |= BlockCost.second;
+ }
+
+ return Cost;
+}
+
+/// Gets Address Access SCEV after verifying that the access pattern
+/// is loop invariant except the induction variable dependence.
+///
+/// This SCEV can be sent to the Target in order to estimate the address
+/// calculation cost.
+static const SCEV *getAddressAccessSCEV(
+ Value *Ptr,
+ LoopVectorizationLegality *Legal,
+ PredicatedScalarEvolution &PSE,
+ const Loop *TheLoop) {
+
+ auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!Gep)
+ return nullptr;
+
+ // We are looking for a gep with all loop invariant indices except for one
+ // which should be an induction variable.
+ auto SE = PSE.getSE();
+ unsigned NumOperands = Gep->getNumOperands();
+ for (unsigned i = 1; i < NumOperands; ++i) {
+ Value *Opd = Gep->getOperand(i);
+ if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+ !Legal->isInductionVariable(Opd))
+ return nullptr;
+ }
+
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
+ return PSE.getSCEV(Ptr);
+}
+
+static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
+ return Legal->hasStride(I->getOperand(0)) ||
+ Legal->hasStride(I->getOperand(1));
+}
+
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) {
assert(VF.isVector() &&
"Scalarization cost of instruction implies vectorization.");
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Type *ValTy = getMemInstValueType(I);
- auto SE = PSE.getSE();
-
- unsigned AS = getLoadStoreAddressSpace(I);
- Value *Ptr = getLoadStorePointerOperand(I);
- Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
- // Figure out whether the access is strided and get the stride value
- // if it's known in compile time
- const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
-
- // Get the cost of the scalar memory instruction and address computation.
+ Type *ValTy = getMemInstValueType(I);
+ auto SE = PSE.getSE();
+
+ unsigned AS = getLoadStoreAddressSpace(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+ // Figure out whether the access is strided and get the stride value
+ // if it's known in compile time
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
InstructionCost Cost =
VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
-
- // Don't pass *I here, since it is scalar but will actually be part of a
- // vectorized loop where the user of it is a vectorized instruction.
- const Align Alignment = getLoadStoreAlignment(I);
+
+ // Don't pass *I here, since it is scalar but will actually be part of a
+ // vectorized loop where the user of it is a vectorized instruction.
+ const Align Alignment = getLoadStoreAlignment(I);
Cost += VF.getKnownMinValue() *
TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
AS, TTI::TCK_RecipThroughput);
-
- // Get the overhead of the extractelement and insertelement instructions
- // we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF);
-
- // If we have a predicated store, it may not be executed for each vector
- // lane. Scale the cost by the probability of executing the predicated
- // block.
- if (isPredicatedInst(I)) {
- Cost /= getReciprocalPredBlockProb();
-
- if (useEmulatedMaskMemRefHack(I))
- // Artificially setting to a high enough value to practically disable
- // vectorization with such operations.
- Cost = 3000000;
- }
-
- return Cost;
-}
-
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (isPredicatedInst(I)) {
+ Cost /= getReciprocalPredBlockProb();
+
+ if (useEmulatedMaskMemRefHack(I))
+ // Artificially setting to a high enough value to practically disable
+ // vectorization with such operations.
+ Cost = 3000000;
+ }
+
+ return Cost;
+}
+
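In outline, the scalarization cost above is VF scalar address computations plus VF scalar memory operations plus the insert/extract overhead of moving lanes in and out of vectors; a predicated access is scaled down by the block probability, unless the emulated-mask hack pins the cost high enough to rule the option out. A rough standalone rendering with placeholder inputs (real values come from TargetTransformInfo):

unsigned memScalarizationCost(unsigned VF, unsigned AddrCost,
                              unsigned ScalarMemOpCost,
                              unsigned ScalarizationOverhead, bool Predicated,
                              bool EmulatedMaskedAccess) {
  unsigned Cost = VF * AddrCost            // per-lane address computation
                  + VF * ScalarMemOpCost   // per-lane scalar load/store
                  + ScalarizationOverhead; // inserts/extracts around the lanes
  if (Predicated) {
    Cost /= 2; // reciprocal of the predicated-block probability (assumed 2)
    if (EmulatedMaskedAccess)
      Cost = 3000000; // artificially high: effectively forbids this choice
  }
  return Cost;
}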
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- Value *Ptr = getLoadStorePointerOperand(I);
- unsigned AS = getLoadStoreAddressSpace(I);
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
- "Stride should be 1 or -1 for consecutive memory access");
- const Align Alignment = getLoadStoreAlignment(I);
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ Value *Ptr = getLoadStorePointerOperand(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Stride should be 1 or -1 for consecutive memory access");
+ const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
- if (Legal->isMaskRequired(I))
- Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
- CostKind);
- else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
- CostKind, I);
-
- bool Reverse = ConsecutiveStride < 0;
- if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- return Cost;
-}
-
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind, I);
+
+ bool Reverse = ConsecutiveStride < 0;
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
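A consecutive (stride 1 or -1) access is costed above as one wide memory operation, masked if a mask is required, plus a reverse shuffle when the stride is negative. The same shape with placeholder costs:

unsigned consecutiveMemOpCost(bool MaskRequired, int ConsecutiveStride,
                              unsigned WideMemOpCost, unsigned MaskedMemOpCost,
                              unsigned ReverseShuffleCost) {
  unsigned Cost = MaskRequired ? MaskedMemOpCost : WideMemOpCost;
  if (ConsecutiveStride < 0)   // reverse access: lanes come back in reverse
    Cost += ReverseShuffleCost;
  return Cost;
}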
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I));
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- const Align Alignment = getLoadStoreAlignment(I);
- unsigned AS = getLoadStoreAddressSpace(I);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- if (isa<LoadInst>(I)) {
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
- CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
- }
- StoreInst *SI = cast<StoreInst>(I);
-
- bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
- CostKind) +
- (isLoopInvariantStoreValue
- ? 0
- : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ if (isa<LoadInst>(I)) {
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
+ CostKind) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ }
+ StoreInst *SI = cast<StoreInst>(I);
+
+ bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
+ CostKind) +
+ (isLoopInvariantStoreValue
+ ? 0
+ : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
VF.getKnownMinValue() - 1));
-}
-
+}
+
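A uniform memory operation (loop-invariant address) is costed above as a single scalar access: a load pays one scalar load plus a broadcast to fill the lanes; a store pays one scalar store plus, when the stored value is not loop-invariant, one extract of the last lane. In outline, with placeholder costs:

unsigned uniformMemOpCost(bool IsLoad, bool StoreValueIsInvariant,
                          unsigned AddrCost, unsigned ScalarOpCost,
                          unsigned BroadcastCost, unsigned ExtractCost) {
  if (IsLoad)
    return AddrCost + ScalarOpCost + BroadcastCost; // load once, splat lanes
  return AddrCost + ScalarOpCost +
         (StoreValueIsInvariant ? 0u : ExtractCost); // extract last lane value
}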
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- const Align Alignment = getLoadStoreAlignment(I);
- const Value *Ptr = getLoadStorePointerOperand(I);
-
- return TTI.getAddressComputationCost(VectorTy) +
- TTI.getGatherScatterOpCost(
- I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
- TargetTransformInfo::TCK_RecipThroughput, I);
-}
-
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getGatherScatterOpCost(
+ I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
+ TargetTransformInfo::TCK_RecipThroughput, I);
+}
+
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- unsigned AS = getLoadStoreAddressSpace(I);
-
- auto Group = getInterleavedAccessGroup(I);
- assert(Group && "Fail to get an interleaved access group.");
-
- unsigned InterleaveFactor = Group->getFactor();
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ unsigned AS = getLoadStoreAddressSpace(I);
+
+ auto Group = getInterleavedAccessGroup(I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ unsigned InterleaveFactor = Group->getFactor();
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
-
- // Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it doesn't allow gaps.
- SmallVector<unsigned, 4> Indices;
- if (isa<LoadInst>(I)) {
- for (unsigned i = 0; i < InterleaveFactor; i++)
- if (Group->getMember(i))
- Indices.push_back(i);
- }
-
- // Calculate the cost of the whole interleaved group.
- bool UseMaskForGaps =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+
+ // Holds the indices of existing members in an interleaved load group.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
+ SmallVector<unsigned, 4> Indices;
+ if (isa<LoadInst>(I)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ bool UseMaskForGaps =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
- I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
- AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
-
- if (Group->isReverse()) {
- // TODO: Add support for reversed masked interleaved access.
- assert(!Legal->isMaskRequired(I) &&
- "Reverse masked interleaved access not supported.");
- Cost += Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- }
- return Cost;
-}
-
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
+ AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
+
+ if (Group->isReverse()) {
+ // TODO: Add support for reversed masked interleaved access.
+ assert(!Legal->isMaskRequired(I) &&
+ "Reverse masked interleaved access not supported.");
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ }
+ return Cost;
+}
+
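An interleave group is costed above as one wide access of VF * Factor elements; for loads only the indices of the members actually present are passed to TTI, gaps may force a mask, and a reversed group additionally pays one shuffle per member (masked reversed groups are rejected by the assert). Reduced to its shape with placeholder costs:

unsigned interleaveGroupCost(unsigned WideInterleavedOpCost,
                             unsigned NumMembers, bool Reverse,
                             unsigned ReverseShuffleCost) {
  unsigned Cost = WideInterleavedOpCost; // one operation covers the group
  if (Reverse)
    Cost += NumMembers * ReverseShuffleCost; // undo the reversal per member
  return Cost;
}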
InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
// Early exit for no inloop reductions
@@ -6935,270 +6935,270 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
ElementCount VF) {
- // Calculate scalar cost only. Vectorization cost should be ready at this
- // moment.
+ // Calculate scalar cost only. Vectorization cost should be ready at this
+ // moment.
if (VF.isScalar()) {
- Type *ValTy = getMemInstValueType(I);
- const Align Alignment = getLoadStoreAlignment(I);
- unsigned AS = getLoadStoreAddressSpace(I);
-
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
- TTI::TCK_RecipThroughput, I);
- }
- return getWideningCost(I, VF);
-}
-
-LoopVectorizationCostModel::VectorizationCostTy
+ Type *ValTy = getMemInstValueType(I);
+ const Align Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
+ TTI::TCK_RecipThroughput, I);
+ }
+ return getWideningCost(I, VF);
+}
+
+LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ElementCount VF) {
- // If we know that this instruction will remain uniform, check the cost of
- // the scalar version.
- if (isUniformAfterVectorization(I, VF))
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
VF = ElementCount::getFixed(1);
-
+
if (VF.isVector() && isProfitableToScalarize(I, VF))
- return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
- // Forced scalars do not have any scalarization overhead.
- auto ForcedScalar = ForcedScalars.find(VF);
+ return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
+ // Forced scalars do not have any scalarization overhead.
+ auto ForcedScalar = ForcedScalars.find(VF);
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
- auto InstSet = ForcedScalar->second;
- if (InstSet.count(I))
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.count(I))
return VectorizationCostTy(
(getInstructionCost(I, ElementCount::getFixed(1)).first *
VF.getKnownMinValue()),
false);
- }
-
- Type *VectorTy;
+ }
+
+ Type *VectorTy;
InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
- bool TypeNotScalarized =
+
+ bool TypeNotScalarized =
VF.isVector() && VectorTy->isVectorTy() &&
TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
- return VectorizationCostTy(C, TypeNotScalarized);
-}
-
+ return VectorizationCostTy(C, TypeNotScalarized);
+}
+
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
ElementCount VF) {
-
+
assert(!VF.isScalable() &&
"cannot compute scalarization overhead for scalable vectorization");
if (VF.isScalar())
- return 0;
-
+ return 0;
+
InstructionCost Cost = 0;
- Type *RetTy = ToVectorTy(I->getType(), VF);
- if (!RetTy->isVoidTy() &&
- (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
- Cost += TTI.getScalarizationOverhead(
+ Type *RetTy = ToVectorTy(I->getType(), VF);
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+ Cost += TTI.getScalarizationOverhead(
cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
true, false);
-
- // Some targets keep addresses scalar.
- if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
- return Cost;
-
- // Some targets support efficient element stores.
- if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
- return Cost;
-
- // Collect operands to consider.
- CallInst *CI = dyn_cast<CallInst>(I);
- Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
-
- // Skip operands that do not require extraction/scalarization and do not incur
- // any overhead.
- return Cost + TTI.getOperandsScalarizationOverhead(
+
+ // Some targets keep addresses scalar.
+ if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+ return Cost;
+
+ // Some targets support efficient element stores.
+ if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+ return Cost;
+
+ // Collect operands to consider.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+ // Skip operands that do not require extraction/scalarization and do not incur
+ // any overhead.
+ return Cost + TTI.getOperandsScalarizationOverhead(
filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
-}
-
+}
+
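The scalarization overhead above charges for rebuilding a vector from the VF scalar results (skipped when the target handles element loads/stores efficiently or keeps addresses scalar) and for extracting each vector operand the scalar copies read. In outline, with assumed per-element costs:

unsigned scalarizationOverhead(unsigned VF, bool HasVectorResult,
                               unsigned NumExtractedOperands,
                               unsigned InsertCostPerElt,
                               unsigned ExtractCostPerElt) {
  unsigned Cost = 0;
  if (HasVectorResult)
    Cost += VF * InsertCostPerElt;                       // insert result lanes
  Cost += NumExtractedOperands * VF * ExtractCostPerElt; // extract operand lanes
  return Cost;
}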
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (VF.isScalar())
- return;
- NumPredStores = 0;
- for (BasicBlock *BB : TheLoop->blocks()) {
- // For each instruction in the old loop.
- for (Instruction &I : *BB) {
- Value *Ptr = getLoadStorePointerOperand(&I);
- if (!Ptr)
- continue;
-
- // TODO: We should generate better code and update the cost model for
- // predicated uniform stores. Today they are treated as any other
- // predicated store (see added test cases in
- // invariant-store-vectorization.ll).
- if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
- NumPredStores++;
-
+ return;
+ NumPredStores = 0;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ if (!Ptr)
+ continue;
+
+ // TODO: We should generate better code and update the cost model for
+ // predicated uniform stores. Today they are treated as any other
+ // predicated store (see added test cases in
+ // invariant-store-vectorization.ll).
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ NumPredStores++;
+
if (Legal->isUniformMemOp(I)) {
- // TODO: Avoid replicating loads and stores instead of
- // relying on instcombine to remove them.
- // Load: Scalar load + broadcast
- // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
+ // TODO: Avoid replicating loads and stores instead of
+ // relying on instcombine to remove them.
+ // Load: Scalar load + broadcast
+ // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
InstructionCost Cost = getUniformMemOpCost(&I, VF);
- setWideningDecision(&I, VF, CM_Scalarize, Cost);
- continue;
- }
-
- // We assume that widening is the best solution when possible.
- if (memoryInstructionCanBeWidened(&I, VF)) {
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ continue;
+ }
+
+ // We assume that widening is the best solution when possible.
+ if (memoryInstructionCanBeWidened(&I, VF)) {
InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
- int ConsecutiveStride =
- Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
- assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
- "Expected consecutive stride.");
- InstWidening Decision =
- ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
- setWideningDecision(&I, VF, Decision, Cost);
- continue;
- }
-
- // Choose between Interleaving, Gather/Scatter or Scalarization.
+ int ConsecutiveStride =
+ Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Expected consecutive stride.");
+ InstWidening Decision =
+ ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ setWideningDecision(&I, VF, Decision, Cost);
+ continue;
+ }
+
+ // Choose between Interleaving, Gather/Scatter or Scalarization.
InstructionCost InterleaveCost = std::numeric_limits<int>::max();
- unsigned NumAccesses = 1;
- if (isAccessInterleaved(&I)) {
- auto Group = getInterleavedAccessGroup(&I);
- assert(Group && "Fail to get an interleaved access group.");
-
- // Make one decision for the whole group.
- if (getWideningDecision(&I, VF) != CM_Unknown)
- continue;
-
- NumAccesses = Group->getNumMembers();
- if (interleavedAccessCanBeWidened(&I, VF))
- InterleaveCost = getInterleaveGroupCost(&I, VF);
- }
-
+ unsigned NumAccesses = 1;
+ if (isAccessInterleaved(&I)) {
+ auto Group = getInterleavedAccessGroup(&I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Make one decision for the whole group.
+ if (getWideningDecision(&I, VF) != CM_Unknown)
+ continue;
+
+ NumAccesses = Group->getNumMembers();
+ if (interleavedAccessCanBeWidened(&I, VF))
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
+ }
+
InstructionCost GatherScatterCost =
- isLegalGatherOrScatter(&I)
- ? getGatherScatterCost(&I, VF) * NumAccesses
+ isLegalGatherOrScatter(&I)
+ ? getGatherScatterCost(&I, VF) * NumAccesses
: std::numeric_limits<int>::max();
-
+
InstructionCost ScalarizationCost =
- getMemInstScalarizationCost(&I, VF) * NumAccesses;
-
- // Choose better solution for the current VF,
- // write down this decision and use it during vectorization.
+ getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+ // Choose better solution for the current VF,
+ // write down this decision and use it during vectorization.
InstructionCost Cost;
- InstWidening Decision;
- if (InterleaveCost <= GatherScatterCost &&
- InterleaveCost < ScalarizationCost) {
- Decision = CM_Interleave;
- Cost = InterleaveCost;
- } else if (GatherScatterCost < ScalarizationCost) {
- Decision = CM_GatherScatter;
- Cost = GatherScatterCost;
- } else {
- Decision = CM_Scalarize;
- Cost = ScalarizationCost;
- }
- // If the instruction belongs to an interleave group, the whole group
- // receives the same decision. The whole group receives the cost, but
- // the cost will actually be assigned to one instruction.
- if (auto Group = getInterleavedAccessGroup(&I))
- setWideningDecision(Group, VF, Decision, Cost);
- else
- setWideningDecision(&I, VF, Decision, Cost);
- }
- }
-
- // Make sure that any load of address and any other address computation
- // remains scalar unless there is gather/scatter support. This avoids
- // inevitable extracts into address registers, and also has the benefit of
- // activating LSR more, since that pass can't optimize vectorized
- // addresses.
- if (TTI.prefersVectorizedAddressing())
- return;
-
- // Start with all scalar pointer uses.
- SmallPtrSet<Instruction *, 8> AddrDefs;
- for (BasicBlock *BB : TheLoop->blocks())
- for (Instruction &I : *BB) {
- Instruction *PtrDef =
- dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
- if (PtrDef && TheLoop->contains(PtrDef) &&
- getWideningDecision(&I, VF) != CM_GatherScatter)
- AddrDefs.insert(PtrDef);
- }
-
- // Add all instructions used to generate the addresses.
- SmallVector<Instruction *, 4> Worklist;
+ InstWidening Decision;
+ if (InterleaveCost <= GatherScatterCost &&
+ InterleaveCost < ScalarizationCost) {
+ Decision = CM_Interleave;
+ Cost = InterleaveCost;
+ } else if (GatherScatterCost < ScalarizationCost) {
+ Decision = CM_GatherScatter;
+ Cost = GatherScatterCost;
+ } else {
+ Decision = CM_Scalarize;
+ Cost = ScalarizationCost;
+ }
+ // If the instruction belongs to an interleave group, the whole group
+ // receives the same decision. The whole group receives the cost, but
+ // the cost will actually be assigned to one instruction.
+ if (auto Group = getInterleavedAccessGroup(&I))
+ setWideningDecision(Group, VF, Decision, Cost);
+ else
+ setWideningDecision(&I, VF, Decision, Cost);
+ }
+ }
+
+ // Make sure that any load of address and any other address computation
+ // remains scalar unless there is gather/scatter support. This avoids
+ // inevitable extracts into address registers, and also has the benefit of
+ // activating LSR more, since that pass can't optimize vectorized
+ // addresses.
+ if (TTI.prefersVectorizedAddressing())
+ return;
+
+ // Start with all scalar pointer uses.
+ SmallPtrSet<Instruction *, 8> AddrDefs;
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ Instruction *PtrDef =
+ dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
+ if (PtrDef && TheLoop->contains(PtrDef) &&
+ getWideningDecision(&I, VF) != CM_GatherScatter)
+ AddrDefs.insert(PtrDef);
+ }
+
+ // Add all instructions used to generate the addresses.
+ SmallVector<Instruction *, 4> Worklist;
append_range(Worklist, AddrDefs);
- while (!Worklist.empty()) {
- Instruction *I = Worklist.pop_back_val();
- for (auto &Op : I->operands())
- if (auto *InstOp = dyn_cast<Instruction>(Op))
- if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
- AddrDefs.insert(InstOp).second)
- Worklist.push_back(InstOp);
- }
-
- for (auto *I : AddrDefs) {
- if (isa<LoadInst>(I)) {
- // Setting the desired widening decision should ideally be handled
- // by cost functions, but since this involves the task of finding out
- // if the loaded register is involved in an address computation, it is
- // instead changed here when we know this is the case.
- InstWidening Decision = getWideningDecision(I, VF);
- if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
- // Scalarize a widened load of address.
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (auto &Op : I->operands())
+ if (auto *InstOp = dyn_cast<Instruction>(Op))
+ if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ AddrDefs.insert(InstOp).second)
+ Worklist.push_back(InstOp);
+ }
+
+ for (auto *I : AddrDefs) {
+ if (isa<LoadInst>(I)) {
+ // Setting the desired widening decision should ideally be handled
+ // by cost functions, but since this involves the task of finding out
+ // if the loaded register is involved in an address computation, it is
+ // instead changed here when we know this is the case.
+ InstWidening Decision = getWideningDecision(I, VF);
+ if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
+ // Scalarize a widened load of address.
setWideningDecision(
I, VF, CM_Scalarize,
(VF.getKnownMinValue() *
getMemoryInstructionCost(I, ElementCount::getFixed(1))));
- else if (auto Group = getInterleavedAccessGroup(I)) {
- // Scalarize an interleave group of address loads.
- for (unsigned I = 0; I < Group->getFactor(); ++I) {
- if (Instruction *Member = Group->getMember(I))
+ else if (auto Group = getInterleavedAccessGroup(I)) {
+ // Scalarize an interleave group of address loads.
+ for (unsigned I = 0; I < Group->getFactor(); ++I) {
+ if (Instruction *Member = Group->getMember(I))
setWideningDecision(
Member, VF, CM_Scalarize,
(VF.getKnownMinValue() *
getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
- }
- }
- } else
- // Make sure I gets scalarized and a cost estimate without
- // scalarization overhead.
- ForcedScalars[VF].insert(I);
- }
-}
-
+ }
+ }
+ } else
+ // Make sure I gets scalarized and a cost estimate without
+ // scalarization overhead.
+ ForcedScalars[VF].insert(I);
+ }
+}
+
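A minimal standalone sketch of the three-way comparison implemented above, with illustrative names only (Widening, pickMemoryWidening are not the LLVM API): interleaving wins a tie against gather/scatter, gather/scatter must be strictly cheaper than scalarization, and scalarization is the fallback.

#include <cstdint>

enum class Widening { Interleave, GatherScatter, Scalarize };

struct Choice {
  Widening Decision;
  uint64_t Cost;
};

// Mirrors the comparison above: ties between interleave and gather/scatter
// go to interleave; scalarization is chosen only when nothing beats it.
Choice pickMemoryWidening(uint64_t InterleaveCost, uint64_t GatherScatterCost,
                          uint64_t ScalarizationCost) {
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return {Widening::Interleave, InterleaveCost};
  if (GatherScatterCost < ScalarizationCost)
    return {Widening::GatherScatter, GatherScatterCost};
  return {Widening::Scalarize, ScalarizationCost};
}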
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy) {
- Type *RetTy = I->getType();
- if (canTruncateToMinimalBitwidth(I, VF))
- RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
- VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
- auto SE = PSE.getSE();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- // TODO: We need to estimate the cost of intrinsic calls.
- switch (I->getOpcode()) {
- case Instruction::GetElementPtr:
- // We mark this instruction as zero-cost because the cost of GEPs in
- // vectorized code depends on whether the corresponding memory instruction
- // is scalarized or not. Therefore, we handle GEPs with the memory
- // instruction cost.
- return 0;
- case Instruction::Br: {
- // In cases of scalarized and predicated instructions, there will be VF
- // predicated blocks in the vectorized loop. Each branch around these
- // blocks also requires an extract of its vector compare i1 element.
- bool ScalarPredicatedBB = false;
- BranchInst *BI = cast<BranchInst>(I);
+ Type *RetTy = I->getType();
+ if (canTruncateToMinimalBitwidth(I, VF))
+ RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+ VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
+ auto SE = PSE.getSE();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // TODO: We need to estimate the cost of intrinsic calls.
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because the cost of GEPs in
+ // vectorized code depends on whether the corresponding memory instruction
+ // is scalarized or not. Therefore, we handle GEPs with the memory
+ // instruction cost.
+ return 0;
+ case Instruction::Br: {
+ // In cases of scalarized and predicated instructions, there will be VF
+ // predicated blocks in the vectorized loop. Each branch around these
+ // blocks also requires an extract of its vector compare i1 element.
+ bool ScalarPredicatedBB = false;
+ BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
- (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
- ScalarPredicatedBB = true;
-
- if (ScalarPredicatedBB) {
- // Return cost for branches around scalarized and predicated blocks.
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ ScalarPredicatedBB = true;
+
+ if (ScalarPredicatedBB) {
+ // Return cost for branches around scalarized and predicated blocks.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- auto *Vec_i1Ty =
+ auto *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (TTI.getScalarizationOverhead(
Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
@@ -7206,86 +7206,86 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
(TTI.getCFInstrCost(Instruction::Br, CostKind) *
VF.getKnownMinValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
- // The back-edge branch will remain, as will all scalar branches.
- return TTI.getCFInstrCost(Instruction::Br, CostKind);
- else
- // This branch will be eliminated by if-conversion.
- return 0;
- // Note: We currently assume zero cost for an unconditional branch inside
- // a predicated block since it will become a fall-through, although we
- // may decide in the future to call TTI for all branches.
- }
- case Instruction::PHI: {
- auto *Phi = cast<PHINode>(I);
-
- // First-order recurrences are replaced by vector shuffles inside the loop.
- // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
+ // The back-edge branch will remain, as will all scalar branches.
+ return TTI.getCFInstrCost(Instruction::Br, CostKind);
+ else
+ // This branch will be eliminated by if-conversion.
+ return 0;
+ // Note: We currently assume zero cost for an unconditional branch inside
+ // a predicated block since it will become a fall-through, although we
+ // may decide in the future to call TTI for all branches.
+ }
+ case Instruction::PHI: {
+ auto *Phi = cast<PHINode>(I);
+
+ // First-order recurrences are replaced by vector shuffles inside the loop.
+ // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
return TTI.getShuffleCost(
TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
-
- // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
- // converted into select instructions. We require N - 1 selects per phi
- // node, where N is the number of incoming values.
+
+ // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
+ // converted into select instructions. We require N - 1 selects per phi
+ // node, where N is the number of incoming values.
if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
- return (Phi->getNumIncomingValues() - 1) *
- TTI.getCmpSelInstrCost(
- Instruction::Select, ToVectorTy(Phi->getType(), VF),
- ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
+ return (Phi->getNumIncomingValues() - 1) *
+ TTI.getCmpSelInstrCost(
+ Instruction::Select, ToVectorTy(Phi->getType(), VF),
+ ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
CmpInst::BAD_ICMP_PREDICATE, CostKind);
-
- return TTI.getCFInstrCost(Instruction::PHI, CostKind);
- }
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- // If we have a predicated instruction, it may not be executed for each
- // vector lane. Get the scalarization cost and scale this amount by the
- // probability of executing the predicated block. If the instruction is not
- // predicated, we fall through to the next case.
+
+ return TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // If we have a predicated instruction, it may not be executed for each
+ // vector lane. Get the scalarization cost and scale this amount by the
+ // probability of executing the predicated block. If the instruction is not
+ // predicated, we fall through to the next case.
if (VF.isVector() && isScalarWithPredication(I)) {
InstructionCost Cost = 0;
-
- // These instructions have a non-void type, so account for the phi nodes
- // that we will create. This cost is likely to be zero. The phi node
- // cost, if any, should be scaled by the block probability because it
- // models a copy at the end of each predicated block.
+
+ // These instructions have a non-void type, so account for the phi nodes
+ // that we will create. This cost is likely to be zero. The phi node
+ // cost, if any, should be scaled by the block probability because it
+ // models a copy at the end of each predicated block.
Cost += VF.getKnownMinValue() *
TTI.getCFInstrCost(Instruction::PHI, CostKind);
-
- // The cost of the non-predicated instruction.
+
+ // The cost of the non-predicated instruction.
Cost += VF.getKnownMinValue() *
TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
-
- // The cost of insertelement and extractelement instructions needed for
- // scalarization.
- Cost += getScalarizationOverhead(I, VF);
-
- // Scale the cost by the probability of executing the predicated blocks.
- // This assumes the predicated block for each vector lane is equally
- // likely.
- return Cost / getReciprocalPredBlockProb();
- }
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Since we will replace the stride by 1, the multiplication should go away.
- if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
- return 0;
+
+ // The cost of insertelement and extractelement instructions needed for
+ // scalarization.
+ Cost += getScalarizationOverhead(I, VF);
+
+ // Scale the cost by the probability of executing the predicated blocks.
+ // This assumes the predicated block for each vector lane is equally
+ // likely.
+ return Cost / getReciprocalPredBlockProb();
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Since we will replace the stride by 1, the multiplication should go away.
+ if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
+ return 0;
// Detect reduction patterns
InstructionCost RedCost;
@@ -7293,77 +7293,77 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
.isValid())
return RedCost;
- // Certain instructions can be cheaper to vectorize if they have a constant
- // second vector operand. One example of this is shifts on x86.
- Value *Op2 = I->getOperand(1);
- TargetTransformInfo::OperandValueProperties Op2VP;
- TargetTransformInfo::OperandValueKind Op2VK =
- TTI.getOperandInfo(Op2, Op2VP);
- if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
- Op2VK = TargetTransformInfo::OK_UniformValue;
-
- SmallVector<const Value *, 4> Operands(I->operand_values());
+ // Certain instructions can be cheaper to vectorize if they have a constant
+ // second vector operand. One example of this is shifts on x86.
+ Value *Op2 = I->getOperand(1);
+ TargetTransformInfo::OperandValueProperties Op2VP;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TTI.getOperandInfo(Op2, Op2VP);
+ if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ Op2VK = TargetTransformInfo::OK_UniformValue;
+
+ SmallVector<const Value *, 4> Operands(I->operand_values());
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
- }
- case Instruction::FNeg: {
+ return N * TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+ }
+ case Instruction::FNeg: {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
- I->getOperand(0), I);
- }
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
- bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
- Type *CondTy = SI->getCondition()->getType();
- if (!ScalarCond)
+ return N * TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
+ I->getOperand(0), I);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- Type *ValTy = I->getOperand(0)->getType();
- Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
- if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
- ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
- VectorTy = ToVectorTy(ValTy, VF);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
+ VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
- }
- case Instruction::Store:
- case Instruction::Load: {
+ }
+ case Instruction::Store:
+ case Instruction::Load: {
ElementCount Width = VF;
if (Width.isVector()) {
- InstWidening Decision = getWideningDecision(I, Width);
- assert(Decision != CM_Unknown &&
- "CM decision should be taken at this point");
- if (Decision == CM_Scalarize)
+ InstWidening Decision = getWideningDecision(I, Width);
+ assert(Decision != CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == CM_Scalarize)
Width = ElementCount::getFixed(1);
- }
- VectorTy = ToVectorTy(getMemInstValueType(I), Width);
- return getMemoryInstructionCost(I, VF);
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
+ }
+ VectorTy = ToVectorTy(getMemInstValueType(I), Width);
+ return getMemoryInstructionCost(I, VF);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
// Computes the CastContextHint from a Load/Store instruction.
auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -7405,128 +7405,128 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
CCH = ComputeCCH(Load);
}
- // We optimize the truncation of induction variables having constant
- // integer steps. The cost of these truncations is the same as the scalar
- // operation.
- if (isOptimizableIVTruncate(I, VF)) {
- auto *Trunc = cast<TruncInst>(I);
- return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+ // We optimize the truncation of induction variables having constant
+ // integer steps. The cost of these truncations is the same as the scalar
+ // operation.
+ if (isOptimizableIVTruncate(I, VF)) {
+ auto *Trunc = cast<TruncInst>(I);
+ return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
Trunc->getSrcTy(), CCH, CostKind, Trunc);
- }
-
+ }
+
// Detect reduction patterns
InstructionCost RedCost;
if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
.isValid())
return RedCost;
- Type *SrcScalarTy = I->getOperand(0)->getType();
- Type *SrcVecTy =
- VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
- if (canTruncateToMinimalBitwidth(I, VF)) {
- // This cast is going to be shrunk. This may remove the cast or it might
- // turn it into a slightly different cast. For example, if MinBW == 16,
- // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
- //
- // Calculate the modified src and dest types.
- Type *MinVecTy = VectorTy;
+ Type *SrcScalarTy = I->getOperand(0)->getType();
+ Type *SrcVecTy =
+ VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+ if (canTruncateToMinimalBitwidth(I, VF)) {
+ // This cast is going to be shrunk. This may remove the cast or it might
+ // turn it into a slightly different cast. For example, if MinBW == 16,
+ // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
+ //
+ // Calculate the modified src and dest types.
+ Type *MinVecTy = VectorTy;
if (Opcode == Instruction::Trunc) {
- SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
} else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- }
- }
-
+ SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ }
+ }
+
assert(!VF.isScalable() && "VF is assumed to be non scalable");
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N *
TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
- }
- case Instruction::Call: {
- bool NeedToScalarize;
- CallInst *CI = cast<CallInst>(I);
+ }
+ case Instruction::Call: {
+ bool NeedToScalarize;
+ CallInst *CI = cast<CallInst>(I);
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);
}
- return CallCost;
- }
+ return CallCost;
+ }
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
- default:
- // The cost of executing VF copies of the scalar instruction. This opcode
- // is unknown. Assume that it is the same as 'mul'.
+ default:
+ // The cost of executing VF copies of the scalar instruction. This opcode
+ // is unknown. Assume that it is the same as 'mul'.
return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
Instruction::Mul, VectorTy, CostKind) +
- getScalarizationOverhead(I, VF);
- } // end of switch.
-}
-
-char LoopVectorize::ID = 0;
-
-static const char lv_name[] = "Loop Vectorization";
-
-INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
-
-namespace llvm {
-
-Pass *createLoopVectorizePass() { return new LoopVectorize(); }
-
-Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
- bool VectorizeOnlyWhenForced) {
- return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
-}
-
-} // end namespace llvm
-
-bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
- // Check if the pointer operand of a load or store instruction is
- // consecutive.
- if (auto *Ptr = getLoadStorePointerOperand(Inst))
- return Legal->isConsecutivePtr(Ptr);
- return false;
-}
-
-void LoopVectorizationCostModel::collectValuesToIgnore() {
- // Ignore ephemeral values.
- CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
-
- // Ignore type-promoting instructions we identified during reduction
- // detection.
- for (auto &Reduction : Legal->getReductionVars()) {
- RecurrenceDescriptor &RedDes = Reduction.second;
+ getScalarizationOverhead(I, VF);
+ } // end of switch.
+}
+
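The predicated div/rem costing in the switch above can be summarized by the following standalone sketch; the names are hypothetical, and the real cost model queries TTI for each component and uses its own fixed estimate for the block probability.

#include <cstdint>

// Cost of a predicated, scalarized div/rem: per-lane phi copies plus the
// per-lane arithmetic plus insert/extract traffic, scaled down by the
// probability that the predicated block runs (given as a reciprocal).
uint64_t predicatedScalarCost(uint64_t VF, uint64_t PhiCost, uint64_t ArithCost,
                              uint64_t ScalarizationOverhead,
                              uint64_t ReciprocalPredBlockProb) {
  uint64_t Cost = VF * PhiCost + VF * ArithCost + ScalarizationOverhead;
  return Cost / ReciprocalPredBlockProb;
}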
+char LoopVectorize::ID = 0;
+
+static const char lv_name[] = "Loop Vectorization";
+
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+
+Pass *createLoopVectorizePass() { return new LoopVectorize(); }
+
+Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
+ bool VectorizeOnlyWhenForced) {
+ return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
+}
+
+} // end namespace llvm
+
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+ // Check if the pointer operand of a load or store instruction is
+ // consecutive.
+ if (auto *Ptr = getLoadStorePointerOperand(Inst))
+ return Legal->isConsecutivePtr(Ptr);
+ return false;
+}
+
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+ // Ignore ephemeral values.
+ CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+ // Ignore type-promoting instructions we identified during reduction
+ // detection.
+ for (auto &Reduction : Legal->getReductionVars()) {
+ RecurrenceDescriptor &RedDes = Reduction.second;
const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
- VecValuesToIgnore.insert(Casts.begin(), Casts.end());
- }
- // Ignore type-casting instructions we identified during induction
- // detection.
- for (auto &Induction : Legal->getInductionVars()) {
- InductionDescriptor &IndDes = Induction.second;
- const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
- VecValuesToIgnore.insert(Casts.begin(), Casts.end());
- }
-}
-
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+ // Ignore type-casting instructions we identified during induction
+ // detection.
+ for (auto &Induction : Legal->getInductionVars()) {
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+}
+
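The cast sets pulled from the reduction and induction descriptors above are simply merged into one ignore set so their cost is not counted separately; a sketch with standard containers follows (Instr is a stand-in type, not the LLVM class).

#include <unordered_set>
#include <vector>

struct Instr; // stand-in for an IR instruction

// Merge every recorded cast chain into a single set of values whose cost the
// vectorizer should not charge for on its own.
void collectCastsToIgnore(const std::vector<std::vector<const Instr *>> &CastChains,
                          std::unordered_set<const Instr *> &VecValuesToIgnore) {
  for (const auto &Casts : CastChains)
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}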
void LoopVectorizationCostModel::collectInLoopReductions() {
for (auto &Reduction : Legal->getReductionVars()) {
PHINode *Phi = Reduction.first;
@@ -7564,82 +7564,82 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
}
}
-// TODO: we could return a pair of values that specify the max VF and
-// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
-// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
-// doesn't have a cost model that can choose which plan to execute if
-// more than one is generated.
-static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
- LoopVectorizationCostModel &CM) {
- unsigned WidestType;
- std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
- return WidestVectorRegBits / WidestType;
-}
-
-VectorizationFactor
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
+static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
+ LoopVectorizationCostModel &CM) {
+ unsigned WidestType;
+ std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+ return WidestVectorRegBits / WidestType;
+}
+
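The VF computation above reduces to dividing the widest vector register by the widest scalar type used in the loop; a trivial, illustrative-only sketch:

#include <cstdint>

// Number of lanes = widest vector register / widest scalar element type,
// e.g. a 256-bit register over 32-bit elements gives VF = 8.
uint64_t vplanVF(uint64_t WidestVectorRegBits, uint64_t WidestTypeBits) {
  return WidestVectorRegBits / WidestTypeBits;
}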
+VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
assert(!UserVF.isScalable() && "scalable vectors not yet supported");
ElementCount VF = UserVF;
- // Outer loop handling: They may require CFG and instruction level
- // transformations before even evaluating whether vectorization is profitable.
- // Since we cannot modify the incoming IR, we need to build VPlan upfront in
- // the vectorization pipeline.
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
if (!OrigLoop->isInnermost()) {
- // If the user doesn't provide a vectorization factor, determine a
- // reasonable one.
+ // If the user doesn't provide a vectorization factor, determine a
+ // reasonable one.
if (UserVF.isZero()) {
VF = ElementCount::getFixed(
determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
- LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
-
- // Make sure we have a VF > 1 for stress testing.
+ LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+ // Make sure we have a VF > 1 for stress testing.
if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
- LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
- << "overriding computed VF.\n");
+ LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
+ << "overriding computed VF.\n");
VF = ElementCount::getFixed(4);
- }
- }
- assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ }
+ }
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
"VF needs to be a power of two");
LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
<< "VF " << VF << " to build VPlans.\n");
- buildVPlans(VF, VF);
-
- // For VPlan build stress testing, we bail out after VPlan construction.
- if (VPlanBuildStressTest)
- return VectorizationFactor::Disabled();
-
+ buildVPlans(VF, VF);
+
+ // For VPlan build stress testing, we bail out after VPlan construction.
+ if (VPlanBuildStressTest)
+ return VectorizationFactor::Disabled();
+
return {VF, 0 /*Cost*/};
- }
-
- LLVM_DEBUG(
- dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
- "VPlan-native path.\n");
- return VectorizationFactor::Disabled();
-}
-
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return VectorizationFactor::Disabled();
+}
+
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
- if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
- return None;
-
- // Invalidate interleave groups if all blocks of loop will be predicated.
- if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
- !useMaskedInterleavedAccesses(*TTI)) {
- LLVM_DEBUG(
- dbgs()
- << "LV: Invalidate all interleaved groups due to fold-tail by masking "
- "which requires masked-interleaved support.\n");
- if (CM.InterleaveInfo.invalidateGroups())
- // Invalidating interleave groups also requires invalidating all decisions
- // based on them, which includes widening decisions and uniform and scalar
- // values.
- CM.invalidateCostModelingDecisions();
- }
-
+ if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
+ return None;
+
+ // Invalidate interleave groups if all blocks of loop will be predicated.
+ if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+ !useMaskedInterleavedAccesses(*TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+ "which requires masked-interleaved support.\n");
+ if (CM.InterleaveInfo.invalidateGroups())
+ // Invalidating interleave groups also requires invalidating all decisions
+ // based on them, which includes widening decisions and uniform and scalar
+ // values.
+ CM.invalidateCostModelingDecisions();
+ }
+
ElementCount MaxVF = MaybeMaxVF.getValue();
assert(MaxVF.isNonZero() && "MaxVF is zero.");
@@ -7654,59 +7654,59 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
<< " VF " << VF << ".\n");
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
"VF needs to be a power of two");
- // Collect the instructions (and their associated costs) that will be more
- // profitable to scalarize.
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
CM.selectUserVectorizationFactor(VF);
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(VF, VF);
- LLVM_DEBUG(printPlans(dbgs()));
+ LLVM_DEBUG(printPlans(dbgs()));
return {{VF, 0}};
- }
-
+ }
+
assert(!MaxVF.isScalable() &&
"Scalable vectors not yet supported beyond this point");
-
+
for (ElementCount VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
- // Collect Uniform and Scalar instructions after vectorization with VF.
- CM.collectUniformsAndScalars(VF);
-
- // Collect the instructions (and their associated costs) that will be more
- // profitable to scalarize.
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ CM.collectUniformsAndScalars(VF);
+
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
if (VF.isVector())
- CM.collectInstsToScalarize(VF);
- }
-
+ CM.collectInstsToScalarize(VF);
+ }
+
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
- LLVM_DEBUG(printPlans(dbgs()));
+ LLVM_DEBUG(printPlans(dbgs()));
if (MaxVF.isScalar())
- return VectorizationFactor::Disabled();
-
- // Select the optimal vectorization factor.
- return CM.selectVectorizationFactor(MaxVF);
-}
-
+ return VectorizationFactor::Disabled();
+
+ // Select the optimal vectorization factor.
+ return CM.selectVectorizationFactor(MaxVF);
+}
+
void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
- LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
- << '\n');
- BestVF = VF;
- BestUF = UF;
-
- erase_if(VPlans, [VF](const VPlanPtr &Plan) {
- return !Plan->hasVF(VF);
- });
- assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
-}
-
-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
- DominatorTree *DT) {
- // Perform the actual loop transformation.
-
- // 1. Create a new empty loop. Unlink the old loop and connect the new one.
- VPCallbackILV CallbackILV(ILV);
-
+ LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
+ << '\n');
+ BestVF = VF;
+ BestUF = UF;
+
+ erase_if(VPlans, [VF](const VPlanPtr &Plan) {
+ return !Plan->hasVF(VF);
+ });
+ assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
+}
+
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+ DominatorTree *DT) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ VPCallbackILV CallbackILV(ILV);
+
assert(BestVF.hasValue() && "Vectorization Factor is missing");
VPTransformState State{*BestVF,
@@ -7718,34 +7718,34 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
ILV.VectorLoopValueMap,
&ILV,
CallbackILV};
- State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- State.TripCount = ILV.getOrCreateTripCount(nullptr);
- State.CanonicalIV = ILV.Induction;
-
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+ State.TripCount = ILV.getOrCreateTripCount(nullptr);
+ State.CanonicalIV = ILV.Induction;
+
ILV.printDebugTracesAtStart();
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that goes
- // into the code below should also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
-
- // 2. Copy and widen instructions from the old loop into the new loop.
- assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
- VPlans.front()->execute(&State);
-
- // 3. Fix the vectorized code: take care of header phi's, live-outs,
- // predication, updating analyses.
- ILV.fixVectorizedLoop();
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+ assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+ VPlans.front()->execute(&State);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
ILV.printDebugTracesAtEnd();
-}
-
-void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
// We create new control-flow for the vectorized loop, so the original exit
// conditions will be dead after vectorization if they are only used by the
// terminator
@@ -7755,7 +7755,7 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
if (!Cmp || !Cmp->hasOneUse())
continue;
-
+
// TODO: we should introduce a getUniqueExitingBlocks on Loop
if (!DeadInstructions.insert(Cmp).second)
continue;
@@ -7768,93 +7768,93 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
}
}
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
auto *Latch = OrigLoop->getLoopLatch();
- for (auto &Induction : Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ for (auto &Induction : Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
// If the tail is to be folded by masking, the primary induction variable,
// if exists, isn't dead: it will be used for masking. Don't kill it.
if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
continue;
- if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
-
- // We also record as "Dead" the type-casting instructions we had identified
- // during induction analysis. We don't need any handling for them in the
- // vectorized loop because we have proven that, under a proper runtime
- // test guarding the vectorized loop, the value of the phi, and the casted
- // value of the phi, are the same. The last instruction in this casting chain
- // will get its scalar/vector/widened def from the scalar/vector/widened def
- // of the respective phi node. Any other casts in the induction def-use chain
- // have no other uses outside the phi update chain, and will be ignored.
- InductionDescriptor &IndDes = Induction.second;
- const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
- DeadInstructions.insert(Casts.begin(), Casts.end());
- }
-}
-
-Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
-
-Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-
-Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps BinOp) {
- // When unrolling and the VF is 1, we only need to add a simple scalar.
- Type *Ty = Val->getType();
- assert(!Ty->isVectorTy() && "Val must be a scalar");
-
- if (Ty->isFloatingPointTy()) {
- Constant *C = ConstantFP::get(Ty, (double)StartIdx);
-
- // Floating point operations had to be 'fast' to enable the unrolling.
- Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
- return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
- }
- Constant *C = ConstantInt::get(Ty, StartIdx);
- return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
-}
-
-static void AddRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const auto *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(i));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
-}
-
+ if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+
+ // We also record as "Dead" the type-casting instructions we had identified
+ // during induction analysis. We don't need any handling for them in the
+ // vectorized loop because we have proven that, under a proper runtime
+ // test guarding the vectorized loop, the value of the phi, and the casted
+ // value of the phi, are the same. The last instruction in this casting chain
+ // will get its scalar/vector/widened def from the scalar/vector/widened def
+ // of the respective phi node. Any other casts in the induction def-use chain
+ // have no other uses outside the phi update chain, and will be ignored.
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ DeadInstructions.insert(Casts.begin(), Casts.end());
+ }
+}
+
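The "induction update is dead" test above boils down to an all-users check; here is a standalone sketch under the assumption that deadness of the other users is already known (Instr is a stand-in type, not the LLVM class).

#include <algorithm>
#include <unordered_set>
#include <vector>

struct Instr; // stand-in for an IR instruction

// The update feeding the induction phi can be dropped when every user is
// either the phi itself or an instruction already known to be dead.
bool ivUpdateIsDead(const Instr *IndPhi, const std::vector<const Instr *> &Users,
                    const std::unordered_set<const Instr *> &Dead) {
  return std::all_of(Users.begin(), Users.end(), [&](const Instr *U) {
    return U == IndPhi || Dead.count(U) != 0;
  });
}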
+Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
+
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // When unrolling and the VF is 1, we only need to add a simple scalar.
+ Type *Ty = Val->getType();
+ assert(!Ty->isVectorTy() && "Val must be a scalar");
+
+ if (Ty->isFloatingPointTy()) {
+ Constant *C = ConstantFP::get(Ty, (double)StartIdx);
+
+ // Floating point operations had to be 'fast' to enable the unrolling.
+ Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
+ return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
+ }
+ Constant *C = ConstantInt::get(Ty, StartIdx);
+ return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
+}
+
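For the scalar (VF == 1) unrolling case above, the step is just Val + StartIdx * Step; a sketch taking addition as the binary op (the real code also supports other ops and relies on fast-math for the FP case):

// Floating-point inductions: Val + StartIdx * Step, fast-math assumed.
double scalarFPStep(double Val, int StartIdx, double Step) {
  return Val + static_cast<double>(StartIdx) * Step;
}

// Integer inductions: the same formula in integer arithmetic.
long long scalarIntStep(long long Val, int StartIdx, long long Step) {
  return Val + static_cast<long long>(StartIdx) * Step;
}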
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//
@@ -8126,55 +8126,55 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-bool LoopVectorizationPlanner::getDecisionAndClampRange(
+bool LoopVectorizationPlanner::getDecisionAndClampRange(
const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
assert(!Range.isEmpty() && "Trying to test an empty VF range.");
- bool PredicateAtRangeStart = Predicate(Range.Start);
-
+ bool PredicateAtRangeStart = Predicate(Range.Start);
+
for (ElementCount TmpVF = Range.Start * 2;
ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
- if (Predicate(TmpVF) != PredicateAtRangeStart) {
- Range.End = TmpVF;
- break;
- }
-
- return PredicateAtRangeStart;
-}
-
-/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
-/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
-/// of VF's starting at a given VF and extending it as much as possible. Each
-/// vectorization decision can potentially shorten this sub-range during
-/// buildVPlan().
+ if (Predicate(TmpVF) != PredicateAtRangeStart) {
+ Range.End = TmpVF;
+ break;
+ }
+
+ return PredicateAtRangeStart;
+}
+
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
ElementCount MaxVF) {
auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
VFRange SubRange = {VF, MaxVFPlusOne};
- VPlans.push_back(buildVPlan(SubRange));
- VF = SubRange.End;
- }
-}
-
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
- VPlanPtr &Plan) {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
- if (ECEntryIt != EdgeMaskCache.end())
- return ECEntryIt->second;
-
- VPValue *SrcMask = createBlockInMask(Src, Plan);
-
- // The terminator has to be a branch inst!
- BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
- assert(BI && "Unexpected terminator found");
-
- if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
- return EdgeMaskCache[Edge] = SrcMask;
-
+ VPlans.push_back(buildVPlan(SubRange));
+ VF = SubRange.End;
+ }
+}
+
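A standalone sketch of the range-splitting scheme above, with illustrative names: a sub-range [Start, End) of power-of-two VFs is kept only while a per-VF predicate keeps giving the same answer; the first VF that flips the answer becomes the exclusive end of the sub-range, and the outer loop resumes there with the next plan. The predicate stands in for the per-recipe decisions that clamp the range in the real code.

#include <functional>
#include <vector>

struct VFRange {
  unsigned Start; // inclusive, a power of two
  unsigned End;   // exclusive
};

// Clamp Range.End at the first VF whose predicate answer differs from Start's.
bool decisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                           VFRange &Range) {
  bool AtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != AtStart) {
      Range.End = VF;
      break;
    }
  return AtStart;
}

// Walk [MinVF, MaxVF], producing one sub-range per maximal run of VFs on
// which the predicate is uniform.
std::vector<VFRange> buildPlanRanges(unsigned MinVF, unsigned MaxVF,
                                     const std::function<bool(unsigned)> &Predicate) {
  std::vector<VFRange> Plans;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    decisionAndClampRange(Predicate, SubRange);
    Plans.push_back(SubRange);
    VF = SubRange.End; // resume where the previous sub-range was clamped
  }
  return Plans;
}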
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
+ VPlanPtr &Plan) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ VPValue *SrcMask = createBlockInMask(Src, Plan);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
+ return EdgeMaskCache[Edge] = SrcMask;
+
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
// adding uses of an otherwise potentially dead instruction.
@@ -8182,11 +8182,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
- assert(EdgeMask && "No Edge Mask found for condition");
-
- if (BI->getSuccessor(0) != Dst)
- EdgeMask = Builder.createNot(EdgeMask);
-
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMask = Builder.createNot(EdgeMask);
+
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
// The condition is 'SrcMask && EdgeMask', which is equivalent to
// 'select i1 SrcMask, i1 EdgeMask, i1 false'.
@@ -8196,44 +8196,44 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
ConstantInt::getFalse(BI->getCondition()->getType()));
EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
}
-
- return EdgeMaskCache[Edge] = EdgeMask;
-}
-
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
-
- // All-one mask is modelled as no-mask following the convention for masked
- // load/store/gather/scatter. Initialize BlockMask to no-mask.
- VPValue *BlockMask = nullptr;
-
- if (OrigLoop->getHeader() == BB) {
- if (!CM.blockNeedsPredication(BB))
- return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
-
+
+ return EdgeMaskCache[Edge] = EdgeMask;
+}
+
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+
+ if (OrigLoop->getHeader() == BB) {
+ if (!CM.blockNeedsPredication(BB))
+ return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
// Create the block in mask as the first non-phi instruction in the block.
VPBuilder::InsertPointGuard Guard(Builder);
auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC.
- // Start by constructing the desired canonical IV.
- VPValue *IV = nullptr;
- if (Legal->getPrimaryInduction())
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC.
+ // Start by constructing the desired canonical IV.
+ VPValue *IV = nullptr;
+ if (Legal->getPrimaryInduction())
IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
- else {
- auto IVRecipe = new VPWidenCanonicalIVRecipe();
+ else {
+ auto IVRecipe = new VPWidenCanonicalIVRecipe();
Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
- IV = IVRecipe->getVPValue();
- }
- VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- bool TailFolded = !CM.isScalarEpilogueAllowed();
+ IV = IVRecipe->getVPValue();
+ }
+ VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+ bool TailFolded = !CM.isScalarEpilogueAllowed();
if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
// While ActiveLaneMask is a binary op that consumes the loop tripcount
@@ -8242,320 +8242,320 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// happen.
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
} else {
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
}
- return BlockMaskCache[BB] = BlockMask;
- }
-
- // This is the block mask. We OR all incoming edges.
- for (auto *Predecessor : predecessors(BB)) {
- VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
- if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
- return BlockMaskCache[BB] = EdgeMask;
-
- if (!BlockMask) { // BlockMask has its initialized nullptr value.
- BlockMask = EdgeMask;
- continue;
- }
-
- BlockMask = Builder.createOr(BlockMask, EdgeMask);
- }
-
- return BlockMaskCache[BB] = BlockMask;
-}
-
+ return BlockMaskCache[BB] = BlockMask;
+ }
+
+ // This is the block mask. We OR all incoming edges.
+ for (auto *Predecessor : predecessors(BB)) {
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+ if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
+ return BlockMaskCache[BB] = EdgeMask;
+
+ if (!BlockMask) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask);
+ }
+
+ return BlockMaskCache[BB] = BlockMask;
+}
+
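An illustrative-only sketch of the mask construction above, with plain bools standing in for per-lane mask values: a block's mask is the OR of its incoming edge masks, an edge mask is the source block's mask AND'ed with the (possibly negated) branch condition, and "no mask" (all-ones) is modelled as an absent value. The header's real mask (IV <= BTC or an active-lane-mask) is omitted; it stays all-one here and cuts the recursion.

#include <map>
#include <optional>
#include <utility>
#include <vector>

struct Block {
  std::vector<std::pair<const Block *, bool>> Preds; // (predecessor, reached on true?)
  bool Cond = true; // stand-in for the predecessor's per-lane branch condition
};

using MaskCache = std::map<const Block *, std::optional<bool>>;

std::optional<bool> blockInMask(const Block *BB, const Block *Header, MaskCache &Cache);

// Edge mask: source mask AND'ed with the branch condition (negated for the
// false successor); an absent source mask means all-one.
std::optional<bool> edgeMask(const Block *Src, bool TakenOnTrue, const Block *Header,
                             MaskCache &Cache) {
  std::optional<bool> SrcMask = blockInMask(Src, Header, Cache);
  bool Edge = TakenOnTrue ? Src->Cond : !Src->Cond;
  if (!SrcMask)
    return Edge;            // select(SrcMask, Edge, false) with an all-one SrcMask
  return *SrcMask && Edge;
}

// Block mask: cached, all-one for the header here, otherwise the OR of all
// incoming edge masks.
std::optional<bool> blockInMask(const Block *BB, const Block *Header, MaskCache &Cache) {
  auto It = Cache.find(BB);
  if (It != Cache.end())
    return It->second;
  std::optional<bool> Mask; // nullopt models the all-one ("no") mask
  if (BB != Header) {
    Mask = false;
    for (const auto &P : BB->Preds) {
      std::optional<bool> E = edgeMask(P.first, P.second, Header, Cache);
      if (!E) {             // one all-one edge makes the whole block all-one
        Mask.reset();
        break;
      }
      Mask = *Mask || *E;
    }
  }
  return Cache[BB] = Mask;
}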
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
VPlanPtr &Plan) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Must be called with either a load or store");
-
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Must be called with either a load or store");
+
auto willWiden = [&](ElementCount VF) -> bool {
if (VF.isScalar())
- return false;
- LoopVectorizationCostModel::InstWidening Decision =
- CM.getWideningDecision(I, VF);
- assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
- "CM decision should be taken at this point.");
- if (Decision == LoopVectorizationCostModel::CM_Interleave)
- return true;
- if (CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF))
- return false;
- return Decision != LoopVectorizationCostModel::CM_Scalarize;
- };
-
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return nullptr;
-
- VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), Plan);
-
- VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
- if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
-
- StoreInst *Store = cast<StoreInst>(I);
- VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
- return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
-}
-
-VPWidenIntOrFpInductionRecipe *
+ return false;
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point.");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
+ return true;
+ if (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF))
+ return false;
+ return Decision != LoopVectorizationCostModel::CM_Scalarize;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
+ if (LoadInst *Load = dyn_cast<LoadInst>(I))
+ return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
+
+ StoreInst *Store = cast<StoreInst>(I);
+ VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
+ return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
+}
+
+VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
- // Check if this is an integer or fp induction. If so, build the recipe that
- // produces its scalar and vector values.
- InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
II.getKind() == InductionDescriptor::IK_FpInduction) {
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
return new VPWidenIntOrFpInductionRecipe(Phi, Start);
}
-
- return nullptr;
-}
-
-VPWidenIntOrFpInductionRecipe *
+
+ return nullptr;
+}
+
+VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
VPlan &Plan) const {
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
-
- // Determine whether \p K is a truncation based on an induction variable that
- // can be optimized.
- auto isOptimizableIVTruncate =
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+
+ // Determine whether \p K is a truncation based on an induction variable that
+ // can be optimized.
+ auto isOptimizableIVTruncate =
[&](Instruction *K) -> std::function<bool(ElementCount)> {
return [=](ElementCount VF) -> bool {
return CM.isOptimizableIVTruncate(K, VF);
};
- };
-
- if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ };
+
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
isOptimizableIVTruncate(I), Range)) {
InductionDescriptor II =
Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
+ return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
Start, I);
}
- return nullptr;
-}
-
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
- // We know that all PHIs in non-header blocks are converted into selects, so
- // we don't have to worry about the insertion order and we can just use the
- // builder. At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- SmallVector<VPValue *, 2> Operands;
- unsigned NumIncoming = Phi->getNumIncomingValues();
- for (unsigned In = 0; In < NumIncoming; In++) {
- VPValue *EdgeMask =
- createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
- assert((EdgeMask || NumIncoming == 1) &&
- "Multiple predecessors with one having a full mask");
- Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
- if (EdgeMask)
- Operands.push_back(EdgeMask);
- }
- return new VPBlendRecipe(Phi, Operands);
-}
-
-VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
- VPlan &Plan) const {
-
- bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ return nullptr;
+}
+
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
+ // We know that all PHIs in non-header blocks are converted into selects, so
+ // we don't have to worry about the insertion order and we can just use the
+ // builder. At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ SmallVector<VPValue *, 2> Operands;
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VPValue *EdgeMask =
+ createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
+ assert((EdgeMask || NumIncoming == 1) &&
+ "Multiple predecessors with one having a full mask");
+ Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
+ if (EdgeMask)
+ Operands.push_back(EdgeMask);
+ }
+ return new VPBlendRecipe(Phi, Operands);
+}
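
// The operands of the VPBlendRecipe built by tryToBlend are laid out as
// (incoming value, edge mask) pairs in incoming-block order, e.g.
// {In0, Mask0, In1, Mask1, ...}; a mask is only omitted when the phi has a single
// incoming edge whose mask is all-one. VPBlendRecipe::execute (further below) folds
// this operand list back into a chain of selects.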
+
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const {
+
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
},
- Range);
-
- if (IsPredicated)
- return nullptr;
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ Range);
+
+ if (IsPredicated)
+ return nullptr;
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
ID == Intrinsic::pseudoprobe ||
ID == Intrinsic::experimental_noalias_scope_decl))
- return nullptr;
-
+ return nullptr;
+
auto willWiden = [&](ElementCount VF) -> bool {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- // The following case may be scalarized depending on the VF.
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we use Intrinsic or a usual Call for vectorized
+ // version of the instruction.
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool NeedToScalarize = false;
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert(IntrinsicCost.isValid() && CallCost.isValid() &&
"Cannot have invalid costs while widening");
- return UseVectorIntrinsic || !NeedToScalarize;
- };
-
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return nullptr;
-
- return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
-}
-
-bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
- assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
- !isa<StoreInst>(I) && "Instruction should have been handled earlier");
- // Instruction should be widened, unless it is scalar after vectorization,
- // scalarization is profitable or it is predicated.
+ return UseVectorIntrinsic || !NeedToScalarize;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
+}
+
+bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
+ assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
+ !isa<StoreInst>(I) && "Instruction should have been handled earlier");
+ // Instruction should be widened, unless it is scalar after vectorization,
+ // scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
- return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF) ||
- CM.isScalarWithPredication(I, VF);
- };
- return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
- Range);
-}
-
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
- auto IsVectorizableOpcode = [](unsigned Opcode) {
- switch (Opcode) {
- case Instruction::Add:
- case Instruction::And:
- case Instruction::AShr:
- case Instruction::BitCast:
- case Instruction::FAdd:
- case Instruction::FCmp:
- case Instruction::FDiv:
- case Instruction::FMul:
- case Instruction::FNeg:
- case Instruction::FPExt:
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::FPTrunc:
- case Instruction::FRem:
- case Instruction::FSub:
- case Instruction::ICmp:
- case Instruction::IntToPtr:
- case Instruction::LShr:
- case Instruction::Mul:
- case Instruction::Or:
- case Instruction::PtrToInt:
- case Instruction::SDiv:
- case Instruction::Select:
- case Instruction::SExt:
- case Instruction::Shl:
- case Instruction::SIToFP:
- case Instruction::SRem:
- case Instruction::Sub:
- case Instruction::Trunc:
-  // instruction is uniform, in which case generate only the first lane for each
- case Instruction::UIToFP:
- case Instruction::URem:
- case Instruction::Xor:
- case Instruction::ZExt:
- return true;
- }
- return false;
- };
-
- if (!IsVectorizableOpcode(I->getOpcode()))
- return nullptr;
-
- // Success: widen this instruction.
- return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
-}
-
-VPBasicBlock *VPRecipeBuilder::handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
- VPlanPtr &Plan) {
- bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
+ return CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
+ };
+ return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
+ Range);
+}
+
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
+ auto IsVectorizableOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::AShr:
+ case Instruction::BitCast:
+ case Instruction::FAdd:
+ case Instruction::FCmp:
+ case Instruction::FDiv:
+ case Instruction::FMul:
+ case Instruction::FNeg:
+ case Instruction::FPExt:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::FPTrunc:
+ case Instruction::FRem:
+ case Instruction::FSub:
+ case Instruction::ICmp:
+ case Instruction::IntToPtr:
+ case Instruction::LShr:
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::PtrToInt:
+ case Instruction::SDiv:
+ case Instruction::Select:
+ case Instruction::SExt:
+ case Instruction::Shl:
+ case Instruction::SIToFP:
+ case Instruction::SRem:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::UDiv:
+ case Instruction::UIToFP:
+ case Instruction::URem:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ return true;
+ }
+ return false;
+ };
+
+ if (!IsVectorizableOpcode(I->getOpcode()))
+ return nullptr;
+
+ // Success: widen this instruction.
+ return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
+}
+
+VPBasicBlock *VPRecipeBuilder::handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan) {
+ bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
- Range);
-
- bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ Range);
+
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
Range);
-
- auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
- IsUniform, IsPredicated);
- setRecipe(I, Recipe);
+
+ auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
+ IsUniform, IsPredicated);
+ setRecipe(I, Recipe);
Plan->addVPValue(I, Recipe);
-
- // Find if I uses a predicated instruction. If so, it will use its scalar
- // value. Avoid hoisting the insert-element which packs the scalar value into
- // a vector value, as that happens iff all users use the vector value.
- for (auto &Op : I->operands())
- if (auto *PredInst = dyn_cast<Instruction>(Op))
- if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
- PredInst2Recipe[PredInst]->setAlsoPack(false);
-
- // Finalize the recipe for Instr, first if it is not predicated.
- if (!IsPredicated) {
- LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
- VPBB->appendRecipe(Recipe);
- return VPBB;
- }
- LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
- assert(VPBB->getSuccessors().empty() &&
- "VPBB has successors when handling predicated replication.");
- // Record predicated instructions for above packing optimizations.
- PredInst2Recipe[I] = Recipe;
- VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
- VPBlockUtils::insertBlockAfter(Region, VPBB);
- auto *RegSucc = new VPBasicBlock();
- VPBlockUtils::insertBlockAfter(RegSucc, Region);
- return RegSucc;
-}
-
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
- VPRecipeBase *PredRecipe,
- VPlanPtr &Plan) {
- // Instructions marked for predication are replicated and placed under an
- // if-then construct to prevent side-effects.
-
- // Generate recipes to compute the block mask for this region.
- VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
-
- // Build the triangular if-then region.
- std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
- assert(Instr->getParent() && "Predicated instruction not in any basic block");
- auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
- auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+
+ // Find if I uses a predicated instruction. If so, it will use its scalar
+ // value. Avoid hoisting the insert-element which packs the scalar value into
+ // a vector value, as that happens iff all users use the vector value.
+ for (auto &Op : I->operands())
+ if (auto *PredInst = dyn_cast<Instruction>(Op))
+ if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
+ PredInst2Recipe[PredInst]->setAlsoPack(false);
+
+ // Finalize the recipe for Instr, first if it is not predicated.
+ if (!IsPredicated) {
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ VPBB->appendRecipe(Recipe);
+ return VPBB;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ assert(VPBB->getSuccessors().empty() &&
+ "VPBB has successors when handling predicated replication.");
+ // Record predicated instructions for above packing optimizations.
+ PredInst2Recipe[I] = Recipe;
+ VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockUtils::insertBlockAfter(Region, VPBB);
+ auto *RegSucc = new VPBasicBlock();
+ VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ return RegSucc;
+}
+
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
+ // Instructions marked for predication are replicated and placed under an
+ // if-then construct to prevent side-effects.
+
+ // Generate recipes to compute the block mask for this region.
+ VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
+
+ // Build the triangular if-then region.
+ std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+ assert(Instr->getParent() && "Predicated instruction not in any basic block");
+ auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
+ auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
auto *PHIRecipe = Instr->getType()->isVoidTy()
? nullptr
: new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
- auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
- auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
-
- // Note: first set Entry as region entry and then connect successors starting
- // from it in order, to propagate the "parent" of each VPBasicBlock.
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
- VPBlockUtils::connectBlocks(Pred, Exit);
-
- return Region;
-}
-
-VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
- VFRange &Range,
- VPlanPtr &Plan) {
- // First, check for specific widening recipes that deal with calls, memory
- // operations, inductions and Phi nodes.
- if (auto *CI = dyn_cast<CallInst>(Instr))
- return tryToWidenCall(CI, Range, *Plan);
-
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return tryToWidenMemory(Instr, Range, Plan);
-
- VPRecipeBase *Recipe;
- if (auto Phi = dyn_cast<PHINode>(Instr)) {
- if (Phi->getParent() != OrigLoop->getHeader())
- return tryToBlend(Phi, Plan);
+ auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+
+ // Note: first set Entry as region entry and then connect successors starting
+ // from it in order, to propagate the "parent" of each VPBasicBlock.
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exit);
+
+ return Region;
+}
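
// The region built by createReplicateRegion is a triangle: <name>.entry holds the
// BranchOnMask recipe and branches either into <name>.if, which holds the replicated
// (predicated) recipe, or straight to <name>.continue, which holds the PredInstPHI
// recipe merging the two paths whenever the instruction produces a value. Roughly:
//
//        pred.<opcode>.entry
//          /               \
//   pred.<opcode>.if        |
//          \               /
//        pred.<opcode>.continue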
+
+VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
+ VFRange &Range,
+ VPlanPtr &Plan) {
+ // First, check for specific widening recipes that deal with calls, memory
+ // operations, inductions and Phi nodes.
+ if (auto *CI = dyn_cast<CallInst>(Instr))
+ return tryToWidenCall(CI, Range, *Plan);
+
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return tryToWidenMemory(Instr, Range, Plan);
+
+ VPRecipeBase *Recipe;
+ if (auto Phi = dyn_cast<PHINode>(Instr)) {
+ if (Phi->getParent() != OrigLoop->getHeader())
+ return tryToBlend(Phi, Plan);
if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
- return Recipe;
+ return Recipe;
if (Legal->isReductionVariable(Phi)) {
RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
@@ -8564,93 +8564,93 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
}
- return new VPWidenPHIRecipe(Phi);
- }
-
+ return new VPWidenPHIRecipe(Phi);
+ }
+
if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
cast<TruncInst>(Instr), Range, *Plan)))
- return Recipe;
-
- if (!shouldWiden(Instr, Range))
- return nullptr;
-
- if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
- return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
- OrigLoop);
-
- if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- bool InvariantCond =
- PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
- return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
- InvariantCond);
- }
-
- return tryToWiden(Instr, *Plan);
-}
-
+ return Recipe;
+
+ if (!shouldWiden(Instr, Range))
+ return nullptr;
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
+ return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
+ OrigLoop);
+
+ if (auto *SI = dyn_cast<SelectInst>(Instr)) {
+ bool InvariantCond =
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
+ return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
+ InvariantCond);
+ }
+
+ return tryToWiden(Instr, *Plan);
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Add assume instructions we need to drop to DeadInstructions, to prevent
- // them from being added to the VPlan.
-  // TODO: We only need to drop assumes in blocks that get flattened. If the
- // control flow is preserved, we should keep them.
- auto &ConditionalAssumes = Legal->getConditionalAssumes();
- DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
-
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- // Dead instructions do not need sinking. Remove them from SinkAfter.
- for (Instruction *I : DeadInstructions)
- SinkAfter.erase(I);
-
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Add assume instructions we need to drop to DeadInstructions, to prevent
+ // them from being added to the VPlan.
+  // TODO: We only need to drop assumes in blocks that get flattened. If the
+ // control flow is preserved, we should keep them.
+ auto &ConditionalAssumes = Legal->getConditionalAssumes();
+ DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
+
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ // Dead instructions do not need sinking. Remove them from SinkAfter.
+ for (Instruction *I : DeadInstructions)
+ SinkAfter.erase(I);
+
auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
VFRange SubRange = {VF, MaxVFPlusOne};
VPlans.push_back(
buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
- VF = SubRange.End;
- }
-}
-
-VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+ VF = SubRange.End;
+ }
+}
+
+VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const DenseMap<Instruction *, Instruction *> &SinkAfter) {
-
- // Hold a mapping from predicated instructions to their recipes, in order to
- // fix their AlsoPack behavior if a user is determined to replicate and use a
- // scalar instead of vector value.
- DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
-
- SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
-
- VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
-
- // ---------------------------------------------------------------------------
- // Pre-construction: record ingredients whose recipes we'll need to further
- // process after constructing the initial VPlan.
- // ---------------------------------------------------------------------------
-
- // Mark instructions we'll need to sink later and their targets as
- // ingredients whose recipe we'll need to record.
- for (auto &Entry : SinkAfter) {
- RecipeBuilder.recordRecipeOf(Entry.first);
- RecipeBuilder.recordRecipeOf(Entry.second);
- }
+ const DenseMap<Instruction *, Instruction *> &SinkAfter) {
+
+ // Hold a mapping from predicated instructions to their recipes, in order to
+ // fix their AlsoPack behavior if a user is determined to replicate and use a
+ // scalar instead of vector value.
+ DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+
+ SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
+
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
+
+ // ---------------------------------------------------------------------------
+ // Pre-construction: record ingredients whose recipes we'll need to further
+ // process after constructing the initial VPlan.
+ // ---------------------------------------------------------------------------
+
+ // Mark instructions we'll need to sink later and their targets as
+ // ingredients whose recipe we'll need to record.
+ for (auto &Entry : SinkAfter) {
+ RecipeBuilder.recordRecipeOf(Entry.first);
+ RecipeBuilder.recordRecipeOf(Entry.second);
+ }
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
-
+
RecipeBuilder.recordRecipeOf(Phi);
for (auto &R : ReductionOperations) {
RecipeBuilder.recordRecipeOf(R);
@@ -8661,100 +8661,100 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- // For each interleave group which is relevant for this (possibly trimmed)
- // Range, add it to the set of groups to be later applied to the VPlan and add
- // placeholders for its members' Recipes which we'll be replacing with a
- // single VPInterleaveRecipe.
- for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
+ // For each interleave group which is relevant for this (possibly trimmed)
+ // Range, add it to the set of groups to be later applied to the VPlan and add
+ // placeholders for its members' Recipes which we'll be replacing with a
+ // single VPInterleaveRecipe.
+ for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
auto applyIG = [IG, this](ElementCount VF) -> bool {
return (VF.isVector() && // Query is illegal for VF == 1
- CM.getWideningDecision(IG->getInsertPos(), VF) ==
- LoopVectorizationCostModel::CM_Interleave);
- };
- if (!getDecisionAndClampRange(applyIG, Range))
- continue;
- InterleaveGroups.insert(IG);
- for (unsigned i = 0; i < IG->getFactor(); i++)
- if (Instruction *Member = IG->getMember(i))
- RecipeBuilder.recordRecipeOf(Member);
- };
-
- // ---------------------------------------------------------------------------
- // Build initial VPlan: Scan the body of the loop in a topological order to
- // visit each basic block after having visited its predecessor basic blocks.
- // ---------------------------------------------------------------------------
-
- // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
- auto Plan = std::make_unique<VPlan>();
- VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
- Plan->setEntry(VPBB);
-
- // Scan the body of the loop in a topological order to visit each basic block
- // after having visited its predecessor basic blocks.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- // Relevant instructions from basic block BB will be grouped into VPRecipe
- // ingredients and fill a new VPBasicBlock.
- unsigned VPBBsForBB = 0;
- auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
- VPBB = FirstVPBBForBB;
- Builder.setInsertPoint(VPBB);
-
- // Introduce each ingredient into VPlan.
-    // TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- Instruction *Instr = &I;
-
- // First filter out irrelevant instructions, to ensure no recipes are
- // built for them.
- if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
- continue;
-
- if (auto Recipe =
- RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
+ CM.getWideningDecision(IG->getInsertPos(), VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ };
+ if (!getDecisionAndClampRange(applyIG, Range))
+ continue;
+ InterleaveGroups.insert(IG);
+ for (unsigned i = 0; i < IG->getFactor(); i++)
+ if (Instruction *Member = IG->getMember(i))
+ RecipeBuilder.recordRecipeOf(Member);
+ };
+
+ // ---------------------------------------------------------------------------
+ // Build initial VPlan: Scan the body of the loop in a topological order to
+ // visit each basic block after having visited its predecessor basic blocks.
+ // ---------------------------------------------------------------------------
+
+ // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ auto Plan = std::make_unique<VPlan>();
+ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
+ Plan->setEntry(VPBB);
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ // Relevant instructions from basic block BB will be grouped into VPRecipe
+ // ingredients and fill a new VPBasicBlock.
+ unsigned VPBBsForBB = 0;
+ auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
+ VPBB = FirstVPBBForBB;
+ Builder.setInsertPoint(VPBB);
+
+ // Introduce each ingredient into VPlan.
+    // TODO: Model and preserve debug intrinsics in VPlan.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ Instruction *Instr = &I;
+
+ // First filter out irrelevant instructions, to ensure no recipes are
+ // built for them.
+ if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
+ continue;
+
+ if (auto Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
for (auto *Def : Recipe->definedValues()) {
auto *UV = Def->getUnderlyingValue();
Plan->addVPValue(UV, Def);
}
- RecipeBuilder.setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Otherwise, if all widening options failed, Instruction is to be
- // replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
- Instr, Range, VPBB, PredInst2Recipe, Plan);
- if (NextVPBB != VPBB) {
- VPBB = NextVPBB;
- VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
- : "");
- }
- }
- }
-
- // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
- // may also be empty, such as the last one VPBB, reflecting original
- // basic-blocks with no recipes.
- VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
- assert(PreEntry->empty() && "Expecting empty pre-entry block.");
- VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
- VPBlockUtils::disconnectBlocks(PreEntry, Entry);
- delete PreEntry;
-
- // ---------------------------------------------------------------------------
- // Transform initial VPlan: Apply previously taken decisions, in order, to
- // bring the VPlan to its final state.
- // ---------------------------------------------------------------------------
-
- // Apply Sink-After legal constraints.
- for (auto &Entry : SinkAfter) {
- VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
- VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
+ RecipeBuilder.setRecipe(Instr, Recipe);
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
+ // Otherwise, if all widening options failed, Instruction is to be
+ // replicated. This may create a successor for VPBB.
+ VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
+ Instr, Range, VPBB, PredInst2Recipe, Plan);
+ if (NextVPBB != VPBB) {
+ VPBB = NextVPBB;
+ VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
+ : "");
+ }
+ }
+ }
+
+ // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
+ // may also be empty, such as the last one VPBB, reflecting original
+ // basic-blocks with no recipes.
+ VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+ assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+ VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+ VPBlockUtils::disconnectBlocks(PreEntry, Entry);
+ delete PreEntry;
+
+ // ---------------------------------------------------------------------------
+ // Transform initial VPlan: Apply previously taken decisions, in order, to
+ // bring the VPlan to its final state.
+ // ---------------------------------------------------------------------------
+
+ // Apply Sink-After legal constraints.
+ for (auto &Entry : SinkAfter) {
+ VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
+ VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
// If the target is in a replication region, make sure to move Sink to the
// block after it, not into the replication region itself.
if (auto *Region =
@@ -8767,26 +8767,26 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
continue;
}
}
- Sink->moveAfter(Target);
- }
-
- // Interleave memory: for each Interleave Group we marked earlier as relevant
- // for this VPlan, replace the Recipes widening its memory instructions with a
- // single VPInterleaveRecipe at its insertion point.
- for (auto IG : InterleaveGroups) {
- auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
- RecipeBuilder.getRecipe(IG->getInsertPos()));
+ Sink->moveAfter(Target);
+ }
+
+ // Interleave memory: for each Interleave Group we marked earlier as relevant
+ // for this VPlan, replace the Recipes widening its memory instructions with a
+ // single VPInterleaveRecipe at its insertion point.
+ for (auto IG : InterleaveGroups) {
+ auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
+ RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
-
+
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
Recipe->getMask());
VPIG->insertBefore(Recipe);
unsigned J = 0;
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *Member = IG->getMember(i)) {
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *Member = IG->getMember(i)) {
if (!Member->getType()->isVoidTy()) {
VPValue *OriginalV = Plan->getVPValue(Member);
Plan->removeVPValueFor(Member);
@@ -8794,78 +8794,78 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
J++;
}
- RecipeBuilder.getRecipe(Member)->eraseFromParent();
- }
- }
-
+ RecipeBuilder.getRecipe(Member)->eraseFromParent();
+ }
+ }
+
// Adjust the recipes for any inloop reductions.
if (Range.Start.isVector())
adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
- // Finally, if tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the end of the latch.
+ // Finally, if tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the end of the latch.
if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
- Builder.setInsertPoint(VPBB);
- auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
- for (auto &Reduction : Legal->getReductionVars()) {
+ Builder.setInsertPoint(VPBB);
+ auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ for (auto &Reduction : Legal->getReductionVars()) {
if (CM.isInLoopReduction(Reduction.first))
continue;
VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
- Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
- }
- }
-
- std::string PlanName;
- raw_string_ostream RSO(PlanName);
+ Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+ }
+ }
+
+ std::string PlanName;
+ raw_string_ostream RSO(PlanName);
ElementCount VF = Range.Start;
- Plan->addVF(VF);
- RSO << "Initial VPlan for VF={" << VF;
+ Plan->addVF(VF);
+ RSO << "Initial VPlan for VF={" << VF;
for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
- Plan->addVF(VF);
- RSO << "," << VF;
- }
- RSO << "},UF>=1";
- RSO.flush();
- Plan->setName(PlanName);
-
- return Plan;
-}
-
-VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
- // Outer loop handling: They may require CFG and instruction level
- // transformations before even evaluating whether vectorization is profitable.
- // Since we cannot modify the incoming IR, we need to build VPlan upfront in
- // the vectorization pipeline.
+ Plan->addVF(VF);
+ RSO << "," << VF;
+ }
+ RSO << "},UF>=1";
+ RSO.flush();
+ Plan->setName(PlanName);
+
+ return Plan;
+}
+
+VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
assert(!OrigLoop->isInnermost());
- assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
-
- // Create new empty VPlan
- auto Plan = std::make_unique<VPlan>();
-
- // Build hierarchical CFG
- VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
- HCFGBuilder.buildHierarchicalCFG();
-
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = std::make_unique<VPlan>();
+
+ // Build hierarchical CFG
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
+ HCFGBuilder.buildHierarchicalCFG();
+
for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
VF *= 2)
- Plan->addVF(VF);
-
- if (EnableVPlanPredication) {
- VPlanPredicator VPP(*Plan);
- VPP.predicate();
-
- // Avoid running transformation to recipes until masked code generation in
- // VPlan-native path is in place.
- return Plan;
- }
-
- SmallPtrSet<Instruction *, 1> DeadInstructions;
- VPlanTransforms::VPInstructionsToVPRecipes(
- OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
- return Plan;
-}
-
+ Plan->addVF(VF);
+
+ if (EnableVPlanPredication) {
+ VPlanPredicator VPP(*Plan);
+ VPP.predicate();
+
+ // Avoid running transformation to recipes until masked code generation in
+ // VPlan-native path is in place.
+ return Plan;
+ }
+
+ SmallPtrSet<Instruction *, 1> DeadInstructions;
+ VPlanTransforms::VPInstructionsToVPRecipes(
+ OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
+ return Plan;
+}
+
// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi need to be converted to
// reductions, with one operand being vector and the other being the scalar
@@ -8927,109 +8927,109 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
}
}
-Value* LoopVectorizationPlanner::VPCallbackILV::
-getOrCreateVectorValues(Value *V, unsigned Part) {
- return ILV.getOrCreateVectorValue(V, Part);
-}
-
-Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
- Value *V, const VPIteration &Instance) {
- return ILV.getOrCreateScalarValue(V, Instance);
-}
-
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
- IG->getInsertPos()->printAsOperand(O, false);
- O << ", ";
- getAddr()->printAsOperand(O, SlotTracker);
- VPValue *Mask = getMask();
- if (Mask) {
- O << ", ";
- Mask->printAsOperand(O, SlotTracker);
- }
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i))
- O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
-}
-
-void VPWidenCallRecipe::execute(VPTransformState &State) {
+Value* LoopVectorizationPlanner::VPCallbackILV::
+getOrCreateVectorValues(Value *V, unsigned Part) {
+ return ILV.getOrCreateVectorValue(V, Part);
+}
+
+Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
+ Value *V, const VPIteration &Instance) {
+ return ILV.getOrCreateScalarValue(V, Instance);
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << ", ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ VPValue *Mask = getMask();
+ if (Mask) {
+ O << ", ";
+ Mask->printAsOperand(O, SlotTracker);
+ }
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *I = IG->getMember(i))
+ O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
+}
+
+void VPWidenCallRecipe::execute(VPTransformState &State) {
State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
*this, State);
-}
-
-void VPWidenSelectRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenSelectRecipe::execute(VPTransformState &State) {
State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
this, *this, InvariantCond, State);
-}
-
-void VPWidenRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenRecipe::execute(VPTransformState &State) {
State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
-}
-
-void VPWidenGEPRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenGEPRecipe::execute(VPTransformState &State) {
State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
*this, State.UF, State.VF, IsPtrLoopInvariant,
- IsIndexLoopInvariant, State);
-}
-
-void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Int or FP induction being replicated.");
+ IsIndexLoopInvariant, State);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Int or FP induction being replicated.");
State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
Trunc);
-}
-
-void VPWidenPHIRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
Value *StartV =
getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF);
-}
-
-void VPBlendRecipe::execute(VPTransformState &State) {
- State.ILV->setDebugLocFromInst(State.Builder, Phi);
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // SELECT(Mask1, In1,
- // In0)))
- // Note that Mask0 is never used: lanes for which no path reaches this phi and
- // are essentially undef are taken from In0.
- InnerLoopVectorizer::VectorParts Entry(State.UF);
- for (unsigned In = 0; In < NumIncoming; ++In) {
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- Value *In0 = State.get(getIncomingValue(In), Part);
- if (In == 0)
- Entry[Part] = In0; // Initialize with the first incoming value.
- else {
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Value *Cond = State.get(getMask(In), Part);
- Entry[Part] =
- State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
- }
- }
- }
- for (unsigned Part = 0; Part < State.UF; ++Part)
- State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
-}
-
-void VPInterleaveRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Interleave group being replicated.");
+}
+
+void VPBlendRecipe::execute(VPTransformState &State) {
+ State.ILV->setDebugLocFromInst(State.Builder, Phi);
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // SELECT(Mask1, In1,
+ // In0)))
+ // Note that Mask0 is never used: lanes for which no path reaches this phi and
+ // are essentially undef are taken from In0.
+ InnerLoopVectorizer::VectorParts Entry(State.UF);
+ for (unsigned In = 0; In < NumIncoming; ++In) {
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ Value *In0 = State.get(getIncomingValue(In), Part);
+ if (In == 0)
+ Entry[Part] = In0; // Initialize with the first incoming value.
+ else {
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Value *Cond = State.get(getMask(In), Part);
+ Entry[Part] =
+ State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+ }
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+}
+
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Interleave group being replicated.");
State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
getStoredValues(), getMask());
-}
-
+}
+
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
for (unsigned Part = 0; Part < State.UF; ++Part) {
@@ -9062,116 +9062,116 @@ void VPReductionRecipe::execute(VPTransformState &State) {
}
}
-void VPReplicateRecipe::execute(VPTransformState &State) {
- if (State.Instance) { // Generate a single instance.
+void VPReplicateRecipe::execute(VPTransformState &State) {
+ if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
*State.Instance, IsPredicated, State);
- // Insert scalar instance packing it into a vector.
+ // Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
// If we're constructing lane 0, initialize to start from poison.
- if (State.Instance->Lane == 0) {
+ if (State.Instance->Lane == 0) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
Value *Poison = PoisonValue::get(
VectorType::get(getUnderlyingValue()->getType(), State.VF));
State.ValueMap.setVectorValue(getUnderlyingInstr(),
State.Instance->Part, Poison);
- }
+ }
State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
*State.Instance);
- }
- return;
- }
-
- // Generate scalar instances for all VF lanes of all UF parts, unless the
- // instruction is uniform inwhich case generate only the first lane for each
- // of the UF parts.
+ }
+ return;
+ }
+
+ // Generate scalar instances for all VF lanes of all UF parts, unless the
+  // instruction is uniform, in which case generate only the first lane for each
+ // of the UF parts.
unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
assert((!State.VF.isScalable() || IsUniform) &&
"Can't scalarize a scalable vector");
- for (unsigned Part = 0; Part < State.UF; ++Part)
- for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
- IsPredicated, State);
-}
-
-void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
- assert(State.Instance && "Branch on Mask works only on single instance.");
-
- unsigned Part = State.Instance->Part;
- unsigned Lane = State.Instance->Lane;
-
- Value *ConditionBit = nullptr;
- VPValue *BlockInMask = getMask();
- if (BlockInMask) {
- ConditionBit = State.get(BlockInMask, Part);
- if (ConditionBit->getType()->isVectorTy())
- ConditionBit = State.Builder.CreateExtractElement(
- ConditionBit, State.Builder.getInt32(Lane));
- } else // Block in mask is all-one.
- ConditionBit = State.Builder.getTrue();
-
- // Replace the temporary unreachable terminator with a new conditional branch,
- // whose two destinations will be set later when they are created.
- auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
- assert(isa<UnreachableInst>(CurrentTerminator) &&
- "Expected to replace unreachable terminator with conditional branch.");
- auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
- CondBr->setSuccessor(0, nullptr);
- ReplaceInstWithInst(CurrentTerminator, CondBr);
-}
-
-void VPPredInstPHIRecipe::execute(VPTransformState &State) {
- assert(State.Instance && "Predicated instruction PHI works per instance.");
+ IsPredicated, State);
+}
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Branch on Mask works only on single instance.");
+
+ unsigned Part = State.Instance->Part;
+ unsigned Lane = State.Instance->Lane;
+
+ Value *ConditionBit = nullptr;
+ VPValue *BlockInMask = getMask();
+ if (BlockInMask) {
+ ConditionBit = State.get(BlockInMask, Part);
+ if (ConditionBit->getType()->isVectorTy())
+ ConditionBit = State.Builder.CreateExtractElement(
+ ConditionBit, State.Builder.getInt32(Lane));
+ } else // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
+
+ // Replace the temporary unreachable terminator with a new conditional branch,
+ // whose two destinations will be set later when they are created.
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional branch.");
+ auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Predicated instruction PHI works per instance.");
Instruction *ScalarPredInst =
cast<Instruction>(State.get(getOperand(0), *State.Instance));
- BasicBlock *PredicatedBB = ScalarPredInst->getParent();
- BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
- assert(PredicatingBB && "Predicated block has no single predecessor.");
-
- // By current pack/unpack logic we need to generate only a single phi node: if
- // a vector value for the predicated instruction exists at this point it means
- // the instruction has vector users only, and a phi for the vector value is
- // needed. In this case the recipe of the predicated instruction is marked to
- // also do that packing, thereby "hoisting" the insert-element sequence.
- // Otherwise, a phi node for the scalar value is needed.
- unsigned Part = State.Instance->Part;
+ BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+ BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+ assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+ // By current pack/unpack logic we need to generate only a single phi node: if
+ // a vector value for the predicated instruction exists at this point it means
+ // the instruction has vector users only, and a phi for the vector value is
+ // needed. In this case the recipe of the predicated instruction is marked to
+ // also do that packing, thereby "hoisting" the insert-element sequence.
+ // Otherwise, a phi node for the scalar value is needed.
+ unsigned Part = State.Instance->Part;
Instruction *PredInst =
cast<Instruction>(getOperand(0)->getUnderlyingValue());
- if (State.ValueMap.hasVectorValue(PredInst, Part)) {
- Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
- InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
- PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
- VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
- VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
- State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
- } else {
- Type *PredInstType = PredInst->getType();
- PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+ if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+ Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+ InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+ PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+ VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+ VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+ State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+ } else {
+ Type *PredInstType = PredInst->getType();
+ PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB);
- Phi->addIncoming(ScalarPredInst, PredicatedBB);
- State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
- }
-}
-
-void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
+ Phi->addIncoming(ScalarPredInst, PredicatedBB);
+ State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+ }
+}
+
+void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
StoredValue ? nullptr : getVPValue(),
getAddr(), StoredValue, getMask());
-}
-
-// Determine how to lower the scalar epilogue, which depends on 1) optimising
-// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
-// predication, and 4) a TTI hook that analyses whether the loop is suitable
-// for predication.
-static ScalarEpilogueLowering getScalarEpilogueLowering(
- Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- LoopVectorizationLegality &LVL) {
- // 1) OptSize takes precedence over all other options, i.e. if this is set,
- // don't look at hints or options, and don't request a scalar epilogue.
+}
+
+// Determine how to lower the scalar epilogue, which depends on 1) optimising
+// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
+// predication, and 4) a TTI hook that analyses whether the loop is suitable
+// for predication.
+static ScalarEpilogueLowering getScalarEpilogueLowering(
+ Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ LoopVectorizationLegality &LVL) {
+ // 1) OptSize takes precedence over all other options, i.e. if this is set,
+ // don't look at hints or options, and don't request a scalar epilogue.
// (For PGSO, as shouldOptimizeForSize isn't currently accessible from
// LoopAccessInfo (due to code dependency and not being able to reliably get
// PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
@@ -9181,8 +9181,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
PGSOQueryType::IRPass) &&
Hints.getForce() != LoopVectorizeHints::FK_Enabled))
- return CM_ScalarEpilogueNotAllowedOptSize;
-
+ return CM_ScalarEpilogueNotAllowedOptSize;
+
// 2) If set, obey the directives
if (PreferPredicateOverEpilogue.getNumOccurrences()) {
switch (PreferPredicateOverEpilogue) {
@@ -9194,356 +9194,356 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
return CM_ScalarEpilogueNotAllowedUsePredicate;
};
}
-
+
// 3) If set, obey the hints
switch (Hints.getPredicate()) {
case LoopVectorizeHints::FK_Enabled:
return CM_ScalarEpilogueNotNeededUsePredicate;
case LoopVectorizeHints::FK_Disabled:
- return CM_ScalarEpilogueAllowed;
+ return CM_ScalarEpilogueAllowed;
};
-
+
// 4) if the TTI hook indicates this is profitable, request predication.
if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
LVL.getLAI()))
- return CM_ScalarEpilogueNotNeededUsePredicate;
-
- return CM_ScalarEpilogueAllowed;
-}
-
+ return CM_ScalarEpilogueNotNeededUsePredicate;
+
+ return CM_ScalarEpilogueAllowed;
+}
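// A minimal illustration (not part of this patch, hypothetical helper name):
// step 3 above is what a source-level predication hint typically feeds.
// Assuming the usual clang pragma lowering, the loop below carries an
// FK_Enabled predicate hint, so getScalarEpilogueLowering() would return
// CM_ScalarEpilogueNotNeededUsePredicate unless OptSize (step 1) or the
// PreferPredicateOverEpilogue option (step 2) decides first.
void saxpy(float *X, const float *Y, float A, int N) {
#pragma clang loop vectorize_predicate(enable)
  for (int I = 0; I < N; ++I)
    X[I] = A * X[I] + Y[I];
}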
+
void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
unsigned Part) {
set(Def, V, Part);
ILV->setVectorValue(IRDef, Part, V);
}
-// Process the loop in the VPlan-native vectorization path. This path builds
-// VPlan upfront in the vectorization pipeline, which allows applying
-// VPlan-to-VPlan transformations from the very beginning without modifying the
-// input LLVM IR.
-static bool processLoopInVPlanNativePath(
- Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
- LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
-
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows applying
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
+
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
- LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
- return false;
- }
- assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
- Function *F = L->getHeader()->getParent();
- InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
-
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
-
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI);
- // Use the planner for outer loop vectorization.
- // TODO: CM is not used at this point inside the planner. Turn CM into an
- // optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
-
- // Get user vectorization factor.
+ LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
+ return false;
+ }
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
+
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
+
+ // Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
-
- // Plan how to best vectorize, return the best VF and its cost.
- const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
-
- // If we are stress testing VPlan builds, do not attempt to generate vector
- // code. Masked vector code generation support will follow soon.
- // Also, do not attempt to vectorize if no vector code will be produced.
- if (VPlanBuildStressTest || EnableVPlanPredication ||
- VectorizationFactor::Disabled() == VF)
- return false;
-
- LVP.setBestPlan(VF.Width, 1);
-
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
+
+ // If we are stress testing VPlan builds, do not attempt to generate vector
+ // code. Masked vector code generation support will follow soon.
+ // Also, do not attempt to vectorize if no vector code will be produced.
+ if (VPlanBuildStressTest || EnableVPlanPredication ||
+ VectorizationFactor::Disabled() == VF)
+ return false;
+
+ LVP.setBestPlan(VF.Width, 1);
+
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
&CM, BFI, PSI);
- LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
- << L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(LB, DT);
-
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
-
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
- return true;
-}
-
-LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
- : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
- !EnableLoopInterleaving),
- VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
- !EnableLoopVectorization) {}
-
-bool LoopVectorizePass::processLoop(Loop *L) {
+ LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
+ << L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.executePlan(LB, DT);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+ return true;
+}
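// A minimal illustration (not part of this patch, hypothetical helper name) of
// the kind of loop nest this path exists for. Assuming the usual clang pragma
// lowering, the explicit vectorize(enable) hint on the *outer* loop, together
// with the EnableVPlanNativePath option, is what lets
// processLoopInVPlanNativePath() see a non-innermost loop at all.
void scaleRows(float **A, int Rows, int Cols) {
#pragma clang loop vectorize(enable)
  for (int I = 0; I < Rows; ++I)
    for (int J = 0; J < Cols; ++J)
      A[I][J] *= 2.0f;
}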
+
+LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
+ : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
+ !EnableLoopInterleaving),
+ VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
+ !EnableLoopVectorization) {}
+
+bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
- "VPlan-native path is not enabled. Only process inner loops.");
-
-#ifndef NDEBUG
- const std::string DebugLocStr = getDebugLocString(L);
-#endif /* NDEBUG */
-
- LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
- << DebugLocStr << "\n");
-
- LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
-
- LLVM_DEBUG(
- dbgs() << "LV: Loop hints:"
- << " force="
- << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
- ? "disabled"
- : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
- ? "enabled"
- : "?"))
- << " width=" << Hints.getWidth()
- << " unroll=" << Hints.getInterleave() << "\n");
-
- // Function containing loop
- Function *F = L->getHeader()->getParent();
-
- // Looking at the diagnostic output is the only way to determine if a loop
- // was vectorized (other than looking at the IR or machine code), so it
- // is important to generate an optimization remark for each loop. Most of
- // these messages are generated as OptimizationRemarkAnalysis. Remarks
- // generated as OptimizationRemark and OptimizationRemarkMissed are
- // less verbose reporting vectorized loops and unvectorized loops that may
- // benefit from vectorization, respectively.
-
- if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
- LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
- return false;
- }
-
- PredicatedScalarEvolution PSE(*SE, *L);
-
- // Check if it is legal to vectorize the loop.
- LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
+ "VPlan-native path is not enabled. Only process inner loops.");
+
+#ifndef NDEBUG
+ const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
+
+ LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
+
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated as OptimizationRemarkAnalysis. Remarks
+ // generated as OptimizationRemark and OptimizationRemarkMissed are
+ // less verbose reporting vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
+ if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ return false;
+ }
+
+ PredicatedScalarEvolution PSE(*SE, *L);
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC, BFI, PSI);
- if (!LVL.canVectorize(EnableVPlanNativePath)) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
- // Check the function attributes and profiles to find out if this function
- // should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
-
- // Entrance to the VPlan-native vectorization path. Outer loops are processed
- // here. They may require CFG and instruction level transformations before
- // even evaluating whether vectorization is profitable. Since we cannot modify
- // the incoming IR, we need to build VPlan upfront in the vectorization
- // pipeline.
+ if (!LVL.canVectorize(EnableVPlanNativePath)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ // Check the function attributes and profiles to find out if this function
+ // should be optimized for size.
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
+
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
if (!L->isInnermost())
- return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
- ORE, BFI, PSI, Hints);
-
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, BFI, PSI, Hints);
+
assert(L->isInnermost() && "Inner loop expected.");
-
- // Check the loop for a trip count threshold: vectorize loops with a tiny trip
- // count by optimizing for size, to minimize overheads.
- auto ExpectedTC = getSmallBestKnownTC(*SE, L);
- if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
- LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is worth vectorizing only if no scalar "
- << "iteration overheads are incurred.");
- if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
- LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
- else {
- LLVM_DEBUG(dbgs() << "\n");
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
- }
- }
-
- // Check the function attributes to see if implicit floats are allowed.
-  // FIXME: This check doesn't seem like it can be correct -- what if the loop is
- // an integer loop and the vector instructions selected are purely integer
- // vector instructions?
- if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- reportVectorizationFailure(
- "Can't vectorize when the NoImplicitFloat attribute is used",
- "loop not vectorized due to NoImplicitFloat attribute",
- "NoImplicitFloat", ORE, L);
- Hints.emitRemarkWithHints();
- return false;
- }
-
- // Check if the target supports potentially unsafe FP vectorization.
- // FIXME: Add a check for the type of safety issue (denormal, signaling)
- // for the target we're vectorizing for, to make sure none of the
- // additional fp-math flags can help.
- if (Hints.isPotentiallyUnsafe() &&
- TTI->isFPVectorizationPotentiallyUnsafe()) {
- reportVectorizationFailure(
- "Potentially unsafe FP op prevents vectorization",
- "loop not vectorized due to unsafe FP support.",
- "UnsafeFP", ORE, L);
- Hints.emitRemarkWithHints();
- return false;
- }
-
- bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
- InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
-
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = EnableInterleavedMemAccesses;
-
- // Analyze interleaved memory accesses.
- if (UseInterleaved) {
- IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
- }
-
- // Use the cost model.
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
- F, &Hints, IAI);
- CM.collectValuesToIgnore();
-
- // Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
-
- // Get user vectorization factor and interleave count.
+
+ // Check the loop for a trip count threshold: vectorize loops with a tiny trip
+ // count by optimizing for size, to minimize overheads.
+ auto ExpectedTC = getSmallBestKnownTC(*SE, L);
+ if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
+ LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
+ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+ LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ else {
+ LLVM_DEBUG(dbgs() << "\n");
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ }
+ }
+
+ // Check the function attributes to see if implicit floats are allowed.
+  // FIXME: This check doesn't seem like it can be correct -- what if the loop is
+ // an integer loop and the vector instructions selected are purely integer
+ // vector instructions?
+ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ reportVectorizationFailure(
+ "Can't vectorize when the NoImplicitFloat attribute is used",
+ "loop not vectorized due to NoImplicitFloat attribute",
+ "NoImplicitFloat", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ // Check if the target supports potentially unsafe FP vectorization.
+ // FIXME: Add a check for the type of safety issue (denormal, signaling)
+ // for the target we're vectorizing for, to make sure none of the
+ // additional fp-math flags can help.
+ if (Hints.isPotentiallyUnsafe() &&
+ TTI->isFPVectorizationPotentiallyUnsafe()) {
+ reportVectorizationFailure(
+ "Potentially unsafe FP op prevents vectorization",
+ "loop not vectorized due to unsafe FP support.",
+ "UnsafeFP", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved) {
+ IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+ }
+
+ // Use the cost model.
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
+ F, &Hints, IAI);
+ CM.collectValuesToIgnore();
+
+ // Use the planner for vectorization.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
+
+ // Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
- unsigned UserIC = Hints.getInterleave();
-
- // Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
-
- VectorizationFactor VF = VectorizationFactor::Disabled();
- unsigned IC = 1;
-
- if (MaybeVF) {
- VF = *MaybeVF;
- // Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
- }
-
- // Identify the diagnostic messages that should be produced.
- std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
- bool VectorizeLoop = true, InterleaveLoop = true;
- if (Requirements.doesNotMeet(F, L, Hints)) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
+ unsigned UserIC = Hints.getInterleave();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+
+ VectorizationFactor VF = VectorizationFactor::Disabled();
+ unsigned IC = 1;
+
+ if (MaybeVF) {
+ VF = *MaybeVF;
+ // Select the interleave count.
+ IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+ }
+
+ // Identify the diagnostic messages that should be produced.
+ std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
+ bool VectorizeLoop = true, InterleaveLoop = true;
+ if (Requirements.doesNotMeet(F, L, Hints)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
if (VF.Width.isScalar()) {
- LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- VecDiagMsg = std::make_pair(
- "VectorizationNotBeneficial",
- "the cost-model indicates that vectorization is not beneficial");
- VectorizeLoop = false;
- }
-
- if (!MaybeVF && UserIC > 1) {
- // Tell the user interleaving was avoided up-front, despite being explicitly
- // requested.
- LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
- "interleaving should be avoided up front\n");
- IntDiagMsg = std::make_pair(
- "InterleavingAvoided",
- "Ignoring UserIC, because interleaving was avoided up front");
- InterleaveLoop = false;
- } else if (IC == 1 && UserIC <= 1) {
- // Tell the user interleaving is not beneficial.
- LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
- IntDiagMsg = std::make_pair(
- "InterleavingNotBeneficial",
- "the cost-model indicates that interleaving is not beneficial");
- InterleaveLoop = false;
- if (UserIC == 1) {
- IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
- IntDiagMsg.second +=
- " and is explicitly disabled or interleave count is set to 1";
- }
- } else if (IC > 1 && UserIC == 1) {
-    // Tell the user interleaving is beneficial, but it is explicitly disabled.
- LLVM_DEBUG(
- dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
- IntDiagMsg = std::make_pair(
- "InterleavingBeneficialButDisabled",
- "the cost-model indicates that interleaving is beneficial "
- "but is explicitly disabled or interleave count is set to 1");
- InterleaveLoop = false;
- }
-
- // Override IC if user provided an interleave count.
- IC = UserIC > 0 ? UserIC : IC;
-
- // Emit diagnostic messages, if any.
- const char *VAPassName = Hints.vectorizeAnalysisPassName();
- if (!VectorizeLoop && !InterleaveLoop) {
-    // Do not vectorize or interleave the loop.
- ORE->emit([&]() {
- return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second;
- });
- ORE->emit([&]() {
- return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second;
- });
- return false;
- } else if (!VectorizeLoop && InterleaveLoop) {
- LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second;
- });
- } else if (VectorizeLoop && !InterleaveLoop) {
- LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second;
- });
- } else if (VectorizeLoop && InterleaveLoop) {
- LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
- LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- }
-
- LVP.setBestPlan(VF.Width, IC);
-
- using namespace ore;
- bool DisableRuntimeUnroll = false;
- MDNode *OrigLoopID = L->getLoopID();
-
- if (!VectorizeLoop) {
- assert(IC > 1 && "interleave count should not be 1 or 0");
- // If we decided that it is not legal to vectorize the loop, then
- // interleave it.
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ VecDiagMsg = std::make_pair(
+ "VectorizationNotBeneficial",
+ "the cost-model indicates that vectorization is not beneficial");
+ VectorizeLoop = false;
+ }
+
+ if (!MaybeVF && UserIC > 1) {
+ // Tell the user interleaving was avoided up-front, despite being explicitly
+ // requested.
+ LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
+ "interleaving should be avoided up front\n");
+ IntDiagMsg = std::make_pair(
+ "InterleavingAvoided",
+ "Ignoring UserIC, because interleaving was avoided up front");
+ InterleaveLoop = false;
+ } else if (IC == 1 && UserIC <= 1) {
+ // Tell the user interleaving is not beneficial.
+ LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ IntDiagMsg = std::make_pair(
+ "InterleavingNotBeneficial",
+ "the cost-model indicates that interleaving is not beneficial");
+ InterleaveLoop = false;
+ if (UserIC == 1) {
+ IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
+ IntDiagMsg.second +=
+ " and is explicitly disabled or interleave count is set to 1";
+ }
+ } else if (IC > 1 && UserIC == 1) {
+    // Tell the user interleaving is beneficial, but it is explicitly disabled.
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
+ IntDiagMsg = std::make_pair(
+ "InterleavingBeneficialButDisabled",
+ "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1");
+ InterleaveLoop = false;
+ }
+
+ // Override IC if user provided an interleave count.
+ IC = UserIC > 0 ? UserIC : IC;
+
+ // Emit diagnostic messages, if any.
+ const char *VAPassName = Hints.vectorizeAnalysisPassName();
+ if (!VectorizeLoop && !InterleaveLoop) {
+    // Do not vectorize or interleave the loop.
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
+ return false;
+ } else if (!VectorizeLoop && InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ } else if (VectorizeLoop && !InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
+ } else if (VectorizeLoop && InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ }
+
+ LVP.setBestPlan(VF.Width, IC);
+
+ using namespace ore;
+ bool DisableRuntimeUnroll = false;
+ MDNode *OrigLoopID = L->getLoopID();
+
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+ // If we decided that it is not legal to vectorize the loop, then
+ // interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
BFI, PSI);
- LVP.executePlan(Unroller, DT);
-
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")";
- });
- } else {
- // If we decided that it is *legal* to vectorize the loop, then do it.
-
+ LVP.executePlan(Unroller, DT);
+
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
+ } else {
+ // If we decided that it is *legal* to vectorize the loop, then do it.
+
// Consider vectorizing the epilogue too if it's profitable.
VectorizationFactor EpilogueVF =
CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
if (EpilogueVF.Width.isVector()) {
-
+
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
@@ -9584,142 +9584,142 @@ bool LoopVectorizePass::processLoop(Loop *L) {
DisableRuntimeUnroll = true;
}
- // Report the vectorization decision.
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
- L->getHeader())
- << "vectorized loop (vectorization width: "
- << NV("VectorizationFactor", VF.Width)
- << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
- });
- }
-
- Optional<MDNode *> RemainderLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupEpilogue});
- if (RemainderLoopID.hasValue()) {
- L->setLoopID(RemainderLoopID.getValue());
- } else {
- if (DisableRuntimeUnroll)
- AddRuntimeUnrollDisableMetaData(L);
-
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
- }
-
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
- return true;
-}
-
-LoopVectorizeResult LoopVectorizePass::runImpl(
- Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
- DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
- DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
- OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
- SE = &SE_;
- LI = &LI_;
- TTI = &TTI_;
- DT = &DT_;
- BFI = &BFI_;
- TLI = TLI_;
- AA = &AA_;
- AC = &AC_;
- GetLAA = &GetLAA_;
- DB = &DB_;
- ORE = &ORE_;
- PSI = PSI_;
-
- // Don't attempt if
- // 1. the target claims to have no vector registers, and
- // 2. interleaving won't help ILP.
- //
- // The second condition is necessary because, even if the target has no
- // vector registers, loop vectorization may still enable scalar
- // interleaving.
- if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
- TTI->getMaxInterleaveFactor(1) < 2)
- return LoopVectorizeResult(false, false);
-
- bool Changed = false, CFGChanged = false;
-
- // The vectorizer requires loops to be in simplified form.
- // Since simplification may add new inner loops, it has to run before the
- // legality and profitability checks. This means running the loop vectorizer
-  // will simplify all loops, regardless of whether anything ends up being
- // vectorized.
- for (auto &L : *LI)
- Changed |= CFGChanged |=
- simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as
- // the act of vectorizing or partially unrolling a loop creates new loops
- // and can invalidate iterators across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *L : *LI)
- collectSupportedLoops(*L, LI, ORE, Worklist);
-
- LoopsAnalyzed += Worklist.size();
-
- // Now walk the identified inner loops.
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
-
- // For the inner loops we actually process, form LCSSA to simplify the
- // transform.
- Changed |= formLCSSARecursively(*L, *DT, LI, SE);
-
- Changed |= CFGChanged |= processLoop(L);
- }
-
- // Process each loop nest in the function.
- return LoopVectorizeResult(Changed, CFGChanged);
-}
-
-PreservedAnalyses LoopVectorizePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
-
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & {
+ // Report the vectorization decision.
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+ });
+ }
+
+ Optional<MDNode *> RemainderLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupEpilogue});
+ if (RemainderLoopID.hasValue()) {
+ L->setLoopID(RemainderLoopID.getValue());
+ } else {
+ if (DisableRuntimeUnroll)
+ AddRuntimeUnrollDisableMetaData(L);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+ }
+
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+ return true;
+}
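// A minimal illustration (not part of this patch, hypothetical helper name) of
// where the UserVF/UserIC values read above via Hints.getWidth() and
// Hints.getInterleave() typically come from; assuming the usual clang pragma
// lowering, this loop explicitly requests VF = 8 and IC = 2.
void scale(float *X, float A, int N) {
#pragma clang loop vectorize_width(8) interleave_count(2)
  for (int I = 0; I < N; ++I)
    X[I] *= A;
}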
+
+LoopVectorizeResult LoopVectorizePass::runImpl(
+ Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
+ DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+ DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+ OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
+ SE = &SE_;
+ LI = &LI_;
+ TTI = &TTI_;
+ DT = &DT_;
+ BFI = &BFI_;
+ TLI = TLI_;
+ AA = &AA_;
+ AC = &AC_;
+ GetLAA = &GetLAA_;
+ DB = &DB_;
+ ORE = &ORE_;
+ PSI = PSI_;
+
+ // Don't attempt if
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+ TTI->getMaxInterleaveFactor(1) < 2)
+ return LoopVectorizeResult(false, false);
+
+ bool Changed = false, CFGChanged = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+  // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |= CFGChanged |=
+ simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+
+ // Build up a worklist of inner-loops to vectorize. This is necessary as
+ // the act of vectorizing or partially unrolling a loop creates new loops
+ // and can invalidate iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *L : *LI)
+ collectSupportedLoops(*L, LI, ORE, Worklist);
+
+ LoopsAnalyzed += Worklist.size();
+
+ // Now walk the identified inner loops.
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+
+ // For the inner loops we actually process, form LCSSA to simplify the
+ // transform.
+ Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= CFGChanged |= processLoop(L);
+ }
+
+ // Process each loop nest in the function.
+ return LoopVectorizeResult(Changed, CFGChanged);
+}
+
+PreservedAnalyses LoopVectorizePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
+ : nullptr;
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
TLI, TTI, nullptr, MSSA};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- };
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- LoopVectorizeResult Result =
- runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
- if (!Result.MadeAnyChange)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
-
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
- }
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- if (!Result.MadeCFGChange)
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ LoopVectorizeResult Result =
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
+ if (!Result.MadeAnyChange)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
+ // vectorization. Until this is addressed, mark these analyses as preserved
+ // only for non-VPlan-native path.
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ }
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ if (!Result.MadeCFGChange)
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
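// A minimal sketch (not part of this patch; addLoopVectorizer is a
// hypothetical helper) of how this new-PM entry point is typically reached:
// roughly what 'opt -passes=loop-vectorize' sets up for each function,
// assuming the standard LLVM 12 headers named below.
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

void addLoopVectorizer(llvm::FunctionPassManager &FPM) {
  FPM.addPass(llvm::LoopVectorizePass(llvm::LoopVectorizeOptions()));
}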
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 94741c5c33..0b63019791 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1,243 +1,243 @@
-//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
-// stores that can be put together into vector-stores. Next, it attempts to
-// construct a vectorizable tree using the use-def chains. If a profitable tree
-// was found, the SLP vectorizer performs vectorization on the tree.
-//
-// The pass is inspired by the work described in the paper:
-// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct a vectorizable tree using the use-def chains. If a profitable tree
+// was found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
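// A minimal illustration (not part of this patch, hypothetical helper name) of
// the consecutive-store pattern described above: subject to the cost model
// (see -slp-threshold below), SLP can typically rewrite these four scalar
// stores as a single <4 x float> store.
void pack4(float *Out, float A, float B, float C, float D) {
  Out[0] = A;
  Out[1] = B;
  Out[2] = C;
  Out[3] = D;
}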
+
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DOTGraphTraits.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GraphWriter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/InjectTLIMappings.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Vectorize.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <set>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-using namespace slpvectorizer;
-
-#define SV_NAME "slp-vectorizer"
-#define DEBUG_TYPE "SLP"
-
-STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
-
-cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
-
-static cl::opt<int>
- SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
- cl::desc("Only vectorize if you gain more than this "
- "number "));
-
-static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
- cl::desc("Attempt to vectorize horizontal reductions"));
-
-static cl::opt<bool> ShouldStartVectorizeHorAtStore(
- "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
- cl::desc(
- "Attempt to vectorize horizontal reductions feeding into a store"));
-
-static cl::opt<int>
-MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
- cl::desc("Attempt to vectorize for this register size in bits"));
-
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+using namespace slpvectorizer;
+
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE "SLP"
+
+STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+
+cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
+
+static cl::opt<int>
+ SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Only vectorize if you gain more than this "
+ "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+ cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+ "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Attempt to vectorize horizontal reductions feeding into a store"));
+
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
-static cl::opt<int>
-MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
- cl::desc("Maximum depth of the lookup for consecutive stores."));
-
-/// Limits the size of scheduling regions in a block.
-/// It avoids long compile times for _very_ large blocks where vector
-/// instructions are spread over a wide range.
-/// This limit is way higher than needed by real-world functions.
-static cl::opt<int>
-ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
- cl::desc("Limit the size of the SLP scheduling region per block"));
-
-static cl::opt<int> MinVectorRegSizeOption(
- "slp-min-reg-size", cl::init(128), cl::Hidden,
- cl::desc("Attempt to vectorize for this register size in bits"));
-
-static cl::opt<unsigned> RecursionMaxDepth(
- "slp-recursion-max-depth", cl::init(12), cl::Hidden,
- cl::desc("Limit the recursion depth when building a vectorizable tree"));
-
-static cl::opt<unsigned> MinTreeSize(
- "slp-min-tree-size", cl::init(3), cl::Hidden,
- cl::desc("Only vectorize small trees if they are fully vectorizable"));
-
-// The maximum depth that the look-ahead score heuristic will explore.
-// The higher this value, the higher the compilation time overhead.
-static cl::opt<int> LookAheadMaxDepth(
- "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
- cl::desc("The maximum look-ahead depth for operand reordering scores"));
-
-// The Look-ahead heuristic goes through the users of the bundle to calculate
-// the users' cost in getExternalUsesCost(). To avoid a compilation-time
-// increase, we limit the number of users visited to this value.
-static cl::opt<unsigned> LookAheadUsersBudget(
- "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
- cl::desc("The maximum number of users to visit while visiting the "
- "predecessors. This prevents compilation time increase."));
-
-static cl::opt<bool>
- ViewSLPTree("view-slp-tree", cl::Hidden,
- cl::desc("Display the SLP trees with Graphviz"));
-
-// Limit the number of alias checks. The limit is chosen so that
-// it has no negative effect on the llvm benchmarks.
-static const unsigned AliasedCheckLimit = 10;
-
-// Another limit for the alias checks: The maximum distance between load/store
-// instructions where alias checks are done.
-// This limit is useful for very large basic blocks.
-static const unsigned MaxMemDepDistance = 160;
-
-/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
-/// regions to be handled.
-static const int MinScheduleRegionSize = 16;
-
-/// Predicate for the element types that the SLP vectorizer supports.
-///
-/// The most important thing to filter here are types which are invalid in LLVM
-/// vectors. We also filter target specific types which have absolutely no
-/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
-/// avoids spending time checking the cost model and realizing that they will
-/// be inevitably scalarized.
-static bool isValidElementType(Type *Ty) {
- return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
- !Ty->isPPC_FP128Ty();
-}
-
-/// \returns true if all of the instructions in \p VL are in the same block or
-/// false otherwise.
-static bool allSameBlock(ArrayRef<Value *> VL) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
- return false;
- BasicBlock *BB = I0->getParent();
+static cl::opt<int>
+MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
+ cl::desc("Maximum depth of the lookup for consecutive stores."));
+
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+ cl::desc("Limit the size of the SLP scheduling region per block"));
+
+static cl::opt<int> MinVectorRegSizeOption(
+ "slp-min-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
+static cl::opt<unsigned> RecursionMaxDepth(
+ "slp-recursion-max-depth", cl::init(12), cl::Hidden,
+ cl::desc("Limit the recursion depth when building a vectorizable tree"));
+
+static cl::opt<unsigned> MinTreeSize(
+ "slp-min-tree-size", cl::init(3), cl::Hidden,
+ cl::desc("Only vectorize small trees if they are fully vectorizable"));
+
+// The maximum depth that the look-ahead score heuristic will explore.
+// The higher this value, the higher the compilation time overhead.
+static cl::opt<int> LookAheadMaxDepth(
+ "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
+ cl::desc("The maximum look-ahead depth for operand reordering scores"));
+
+// The Look-ahead heuristic goes through the users of the bundle to calculate
+// the users' cost in getExternalUsesCost(). To avoid a compilation-time
+// increase, we limit the number of users visited to this value.
+static cl::opt<unsigned> LookAheadUsersBudget(
+ "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
+ cl::desc("The maximum number of users to visit while visiting the "
+ "predecessors. This prevents compilation time increase."));
+
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
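// A minimal illustration (not part of this patch, hypothetical helper name) of
// the horizontal-reduction shape that the slp-vectorize-hor option above
// enables by default: the chain of scalar adds can typically become a vector
// load followed by a reduction.
float hadd4(const float *X) {
  return X[0] + X[1] + X[2] + X[3];
}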
+
+// Limit the number of alias checks. The limit is chosen so that
+// it has no negative effect on the llvm benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
+
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
+/// Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important thing to filter here are types which are invalid in LLVM
+/// vectors. We also filter target specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
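// A minimal illustration (not part of this patch, hypothetical helper name):
// on typical x86-64 targets 'long double' lowers to x86_fp80, so
// isValidElementType() rejects it and the add below stays scalar without ever
// querying the cost model.
long double sum2(const long double *P) { return P[0] + P[1]; }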
+
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return false;
+ BasicBlock *BB = I0->getParent();
for (int I = 1, E = VL.size(); I < E; I++) {
auto *II = dyn_cast<Instruction>(VL[I]);
if (!II)
- return false;
-
+ return false;
+
if (BB != II->getParent())
- return false;
- }
- return true;
-}
-
-/// \returns True if all of the values in \p VL are constants (but not
-/// globals/constant expressions).
-static bool allConstant(ArrayRef<Value *> VL) {
- // Constant expressions and globals can't be vectorized like normal integer/FP
- // constants.
- for (Value *i : VL)
- if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
- return false;
- return true;
-}
-
-/// \returns True if all of the values in \p VL are identical.
-static bool isSplat(ArrayRef<Value *> VL) {
- for (unsigned i = 1, e = VL.size(); i < e; ++i)
- if (VL[i] != VL[0])
- return false;
- return true;
-}
-
+ return false;
+ }
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are constants (but not
+/// globals/constant expressions).
+static bool allConstant(ArrayRef<Value *> VL) {
+ // Constant expressions and globals can't be vectorized like normal integer/FP
+ // constants.
+ for (Value *i : VL)
+ if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
+ return false;
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+ for (unsigned i = 1, e = VL.size(); i < e; ++i)
+ if (VL[i] != VL[0])
+ return false;
+ return true;
+}
+
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
-static bool isCommutative(Instruction *I) {
+static bool isCommutative(Instruction *I) {
if (auto *Cmp = dyn_cast<CmpInst>(I))
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
@@ -246,289 +246,289 @@ static bool isCommutative(Instruction *I) {
// we need to confirm that the caller code correctly handles Intrinsics
// for example (does not have 2 operands).
return false;
-}
-
-/// Checks if the vector of instructions can be represented as a shuffle, like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %x0x0 = mul i8 %x0, %x0
-/// %x3x3 = mul i8 %x3, %x3
-/// %y1y1 = mul i8 %y1, %y1
-/// %y2y2 = mul i8 %y2, %y2
+}
+
+/// Checks if the vector of instructions can be represented as a shuffle, like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %x0x0 = mul i8 %x0, %x0
+/// %x3x3 = mul i8 %x3, %x3
+/// %y1y1 = mul i8 %y1, %y1
+/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
-/// ret <4 x i8> %ins4
-/// can be transformed into:
-/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
-/// i32 6>
-/// %2 = mul <4 x i8> %1, %1
-/// ret <4 x i8> %2
-/// We convert this initially to something like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+/// ret <4 x i8> %ins4
+/// can be transformed into:
+/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
+/// i32 6>
+/// %2 = mul <4 x i8> %1, %1
+/// ret <4 x i8> %2
+/// We convert this initially to something like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
-/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
-/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
-/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
-/// %5 = mul <4 x i8> %4, %4
-/// %6 = extractelement <4 x i8> %5, i32 0
+/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
+/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
+/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
+/// %5 = mul <4 x i8> %4, %4
+/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
-/// %7 = extractelement <4 x i8> %5, i32 1
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
-/// %8 = extractelement <4 x i8> %5, i32 2
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
-/// %9 = extractelement <4 x i8> %5, i32 3
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
-/// ret <4 x i8> %ins4
-/// InstCombiner transforms this into a shuffle and vector mul
-/// TODO: Can we split off and reuse the shuffle mask detection from
-/// TargetTransformInfo::getInstructionThroughput?
-static Optional<TargetTransformInfo::ShuffleKind>
-isShuffle(ArrayRef<Value *> VL) {
- auto *EI0 = cast<ExtractElementInst>(VL[0]);
+/// %7 = extractelement <4 x i8> %5, i32 1
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
+/// %8 = extractelement <4 x i8> %5, i32 2
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
+/// %9 = extractelement <4 x i8> %5, i32 3
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
+/// ret <4 x i8> %ins4
+/// InstCombiner transforms this into a shuffle and vector mul
+/// TODO: Can we split off and reuse the shuffle mask detection from
+/// TargetTransformInfo::getInstructionThroughput?
+static Optional<TargetTransformInfo::ShuffleKind>
+isShuffle(ArrayRef<Value *> VL) {
+ auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size =
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
- Value *Vec1 = nullptr;
- Value *Vec2 = nullptr;
- enum ShuffleMode { Unknown, Select, Permute };
- ShuffleMode CommonShuffleMode = Unknown;
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- auto *EI = cast<ExtractElementInst>(VL[I]);
- auto *Vec = EI->getVectorOperand();
- // All vector operands must have the same number of vector elements.
+ Value *Vec1 = nullptr;
+ Value *Vec2 = nullptr;
+ enum ShuffleMode { Unknown, Select, Permute };
+ ShuffleMode CommonShuffleMode = Unknown;
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<ExtractElementInst>(VL[I]);
+ auto *Vec = EI->getVectorOperand();
+ // All vector operands must have the same number of vector elements.
if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
- return None;
- auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
- if (!Idx)
- return None;
- // Undefined behavior if Idx is negative or >= Size.
- if (Idx->getValue().uge(Size))
- continue;
- unsigned IntIdx = Idx->getValue().getZExtValue();
+ return None;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return None;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
// We can extractelement from undef or poison vector.
- if (isa<UndefValue>(Vec))
- continue;
- // For correct shuffling we have to have at most 2 different vector operands
- // in all extractelement instructions.
- if (!Vec1 || Vec1 == Vec)
- Vec1 = Vec;
- else if (!Vec2 || Vec2 == Vec)
- Vec2 = Vec;
- else
- return None;
- if (CommonShuffleMode == Permute)
- continue;
- // If the extract index is not the same as the operation number, it is a
- // permutation.
- if (IntIdx != I) {
- CommonShuffleMode = Permute;
- continue;
- }
- CommonShuffleMode = Select;
- }
- // If we're not crossing lanes in different vectors, consider it as blending.
- if (CommonShuffleMode == Select && Vec2)
- return TargetTransformInfo::SK_Select;
- // If Vec2 was never used, we have a permutation of a single vector, otherwise
- // we have permutation of 2 vectors.
- return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
- : TargetTransformInfo::SK_PermuteSingleSrc;
-}
-
-namespace {
-
-/// Main data required for vectorization of instructions.
-struct InstructionsState {
- /// The very first instruction in the list with the main opcode.
- Value *OpValue = nullptr;
-
- /// The main/alternate instruction.
- Instruction *MainOp = nullptr;
- Instruction *AltOp = nullptr;
-
- /// The main/alternate opcodes for the list of instructions.
- unsigned getOpcode() const {
- return MainOp ? MainOp->getOpcode() : 0;
- }
-
- unsigned getAltOpcode() const {
- return AltOp ? AltOp->getOpcode() : 0;
- }
-
- /// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
-
- bool isOpcodeOrAlt(Instruction *I) const {
- unsigned CheckedOpcode = I->getOpcode();
- return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
- }
-
- InstructionsState() = delete;
- InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
- : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
-};
-
-} // end anonymous namespace
-
-/// Chooses the correct key for scheduling data. If \p Op has the same (or
-/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
-/// OpValue.
-static Value *isOneOf(const InstructionsState &S, Value *Op) {
- auto *I = dyn_cast<Instruction>(Op);
- if (I && S.isOpcodeOrAlt(I))
- return Op;
- return S.OpValue;
-}
-
-/// \returns true if \p Opcode is allowed as part of the main/alternate
-/// instruction for SLP vectorization.
-///
-/// Example of unsupported opcode is SDIV that can potentially cause UB if the
-/// "shuffled out" lane would result in division by zero.
-static bool isValidForAlternation(unsigned Opcode) {
- if (Instruction::isIntDivRem(Opcode))
- return false;
-
- return true;
-}
-
-/// \returns an analysis of the Instructions in \p VL, described by an
-/// InstructionsState: the opcode with which we suppose the whole list
-/// could be vectorized, even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
- unsigned BaseIndex = 0) {
- // Make sure these are all Instructions.
- if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
- return InstructionsState(VL[BaseIndex], nullptr, nullptr);
-
- bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
- bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
- unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
- unsigned AltOpcode = Opcode;
- unsigned AltIndex = BaseIndex;
-
- // Check for one alternate opcode from another BinaryOperator.
- // TODO - generalize to support all operators (types, calls etc.).
- for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
- if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
- if (InstOpcode == Opcode || InstOpcode == AltOpcode)
- continue;
- if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
- isValidForAlternation(Opcode)) {
- AltOpcode = InstOpcode;
- AltIndex = Cnt;
- continue;
- }
- } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
- Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
- Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
- if (Ty0 == Ty1) {
- if (InstOpcode == Opcode || InstOpcode == AltOpcode)
- continue;
- if (Opcode == AltOpcode) {
- assert(isValidForAlternation(Opcode) &&
- isValidForAlternation(InstOpcode) &&
- "Cast isn't safe for alternation, logic needs to be updated!");
- AltOpcode = InstOpcode;
- AltIndex = Cnt;
- continue;
- }
- }
- } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
- continue;
- return InstructionsState(VL[BaseIndex], nullptr, nullptr);
- }
-
- return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
- cast<Instruction>(VL[AltIndex]));
-}
-
-/// \returns true if all of the values in \p VL have the same type or false
-/// otherwise.
-static bool allSameType(ArrayRef<Value *> VL) {
- Type *Ty = VL[0]->getType();
- for (int i = 1, e = VL.size(); i < e; i++)
- if (VL[i]->getType() != Ty)
- return false;
-
- return true;
-}
-
-/// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static Optional<unsigned> getExtractIndex(Instruction *E) {
- unsigned Opcode = E->getOpcode();
- assert((Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue) &&
- "Expected extractelement or extractvalue instruction.");
- if (Opcode == Instruction::ExtractElement) {
- auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
- if (!CI)
- return None;
- return CI->getZExtValue();
- }
- ExtractValueInst *EI = cast<ExtractValueInst>(E);
- if (EI->getNumIndices() != 1)
- return None;
- return *EI->idx_begin();
-}
-
-/// \returns True if in-tree use also needs extract. This refers to
-/// possible scalar operand in vectorized instruction.
-static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
- TargetLibraryInfo *TLI) {
- unsigned Opcode = UserInst->getOpcode();
- switch (Opcode) {
- case Instruction::Load: {
- LoadInst *LI = cast<LoadInst>(UserInst);
- return (LI->getPointerOperand() == Scalar);
- }
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(UserInst);
- return (SI->getPointerOperand() == Scalar);
- }
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(UserInst);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
- if (hasVectorInstrinsicScalarOpd(ID, i))
- return (CI->getArgOperand(i) == Scalar);
- }
- LLVM_FALLTHROUGH;
- }
- default:
- return false;
- }
-}
-
-/// \returns the AA location that is being accessed by the instruction.
+ if (isa<UndefValue>(Vec))
+ continue;
+ // For correct shuffling we have to have at most 2 different vector operands
+ // in all extractelement instructions.
+ if (!Vec1 || Vec1 == Vec)
+ Vec1 = Vec;
+ else if (!Vec2 || Vec2 == Vec)
+ Vec2 = Vec;
+ else
+ return None;
+ if (CommonShuffleMode == Permute)
+ continue;
+ // If the extract index is not the same as the operation number, it is a
+ // permutation.
+ if (IntIdx != I) {
+ CommonShuffleMode = Permute;
+ continue;
+ }
+ CommonShuffleMode = Select;
+ }
+ // If we're not crossing lanes in different vectors, consider it as blending.
+ if (CommonShuffleMode == Select && Vec2)
+ return TargetTransformInfo::SK_Select;
+ // If Vec2 was never used, we have a permutation of a single vector, otherwise
+ // we have permutation of 2 vectors.
+ return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+}
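
For illustration only, here is a minimal standalone sketch of the kind of bundle this classifier accepts, written against the LLVM 12 C++ API. The wrapper function, the module name, and "foo" are hypothetical and not part of this file; isShuffle itself is file-static, so the call is shown purely to indicate the expected result.

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Hypothetical driver: recreate the extracts from the doc comment above
// (%x lanes 0 and 3, %y lanes 1 and 2) and classify them.
void classifyExample(LLVMContext &Ctx) {
  Module M("isshuffle-sketch", Ctx);
  auto *VecTy = FixedVectorType::get(Type::getInt8Ty(Ctx), 4);
  auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), {VecTy, VecTy}, false);
  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, "foo", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *X = F->getArg(0), *Y = F->getArg(1);
  SmallVector<Value *, 4> VL = {
      B.CreateExtractElement(X, B.getInt32(0)),  // lane 0 <- %x[0]
      B.CreateExtractElement(X, B.getInt32(3)),  // lane 1 <- %x[3] (crossed)
      B.CreateExtractElement(Y, B.getInt32(1)),  // lane 2 <- %y[1] (crossed)
      B.CreateExtractElement(Y, B.getInt32(2))}; // lane 3 <- %y[2] (crossed)
  // Two source vectors and lanes that do not match their positions, so the
  // classification above would be TargetTransformInfo::SK_PermuteTwoSrc.
  Optional<TargetTransformInfo::ShuffleKind> Kind = isShuffle(VL);
  (void)Kind;
  B.CreateRetVoid();
}
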
+
+namespace {
+
+/// Main data required for vectorization of instructions.
+struct InstructionsState {
+ /// The very first instruction in the list with the main opcode.
+ Value *OpValue = nullptr;
+
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
+
+ /// Some of the instructions in the list have alternate opcodes.
+ bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+ }
+
+ InstructionsState() = delete;
+ InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+ : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+};
+
+} // end anonymous namespace
+
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.OpValue;
+}
+
+/// \returns true if \p Opcode is allowed as part of the main/alternate
+/// instruction for SLP vectorization.
+///
+/// Example of unsupported opcode is SDIV that can potentially cause UB if the
+/// "shuffled out" lane would result in division by zero.
+static bool isValidForAlternation(unsigned Opcode) {
+ if (Instruction::isIntDivRem(Opcode))
+ return false;
+
+ return true;
+}
+
+/// \returns an analysis of the Instructions in \p VL, described by an
+/// InstructionsState: the opcode with which we suppose the whole list
+/// could be vectorized, even if its structure is diverse.
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0) {
+ // Make sure these are all Instructions.
+ if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+
+ bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
+ bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+ unsigned AltOpcode = Opcode;
+ unsigned AltIndex = BaseIndex;
+
+ // Check for one alternate opcode from another BinaryOperator.
+ // TODO - generalize to support all operators (types, calls etc.).
+ for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+ unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
+ isValidForAlternation(Opcode)) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
+ Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
+ Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+ "Cast isn't safe for alternation, logic needs to be updated!");
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ }
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ }
+
+ return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+ cast<Instruction>(VL[AltIndex]));
+}
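
As a companion sketch (again illustrative only; "bar", the wrapper, and the module name are hypothetical, and getSameOpcode is file-static), an alternating add/sub bundle and the state one would expect it to yield:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Hypothetical driver: build the bundle { add, sub, add, sub } over two i32
// arguments and inspect the resulting InstructionsState.
void altOpcodeExample(LLVMContext &Ctx) {
  Module M("same-opcode-sketch", Ctx);
  auto *I32 = Type::getInt32Ty(Ctx);
  auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), {I32, I32}, false);
  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, "bar", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *A = F->getArg(0), *C = F->getArg(1);
  SmallVector<Value *, 4> VL = {B.CreateAdd(A, C), B.CreateSub(A, C),
                                B.CreateAdd(A, C), B.CreateSub(A, C)};
  InstructionsState S = getSameOpcode(VL);
  // Expected: S.getOpcode() == Instruction::Add, S.getAltOpcode() ==
  // Instruction::Sub, and S.isAltShuffle() is true, i.e. the bundle can be
  // vectorized as an add and a sub that are blended with a shufflevector.
  (void)S;
  B.CreateRetVoid();
}
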
+
+/// \returns true if all of the values in \p VL have the same type or false
+/// otherwise.
+static bool allSameType(ArrayRef<Value *> VL) {
+ Type *Ty = VL[0]->getType();
+ for (int i = 1, e = VL.size(); i < e; i++)
+ if (VL[i]->getType() != Ty)
+ return false;
+
+ return true;
+}
+
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
+ if (Opcode == Instruction::ExtractElement) {
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return None;
+ return CI->getZExtValue();
+ }
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return None;
+ return *EI->idx_begin();
+}
+
+/// \returns True if in-tree use also needs extract. This refers to
+/// possible scalar operand in vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ if (hasVectorInstrinsicScalarOpd(ID, i))
+ return (CI->getArgOperand(i) == Scalar);
+ }
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return MemoryLocation::get(SI);
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return MemoryLocation::get(LI);
- return MemoryLocation();
-}
-
-/// \returns True if the instruction is not a volatile or atomic load/store.
-static bool isSimple(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isSimple();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isSimple();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return !MI->isVolatile();
- return true;
-}
-
-namespace llvm {
-
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return MemoryLocation::get(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return MemoryLocation::get(LI);
+ return MemoryLocation();
+}
+
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return true;
+}
+
+namespace llvm {
+
static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
@@ -538,98 +538,98 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
Mask[Indices[I]] = I;
}
-namespace slpvectorizer {
-
-/// Bottom Up SLP Vectorizer.
-class BoUpSLP {
- struct TreeEntry;
- struct ScheduleData;
-
-public:
- using ValueList = SmallVector<Value *, 8>;
- using InstrList = SmallVector<Instruction *, 16>;
- using ValueSet = SmallPtrSet<Value *, 16>;
- using StoreList = SmallVector<StoreInst *, 8>;
- using ExtraValueToDebugLocsMap =
- MapVector<Value *, SmallVector<Instruction *, 2>>;
+namespace slpvectorizer {
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+ struct TreeEntry;
+ struct ScheduleData;
+
+public:
+ using ValueList = SmallVector<Value *, 8>;
+ using InstrList = SmallVector<Instruction *, 16>;
+ using ValueSet = SmallPtrSet<Value *, 16>;
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using ExtraValueToDebugLocsMap =
+ MapVector<Value *, SmallVector<Instruction *, 2>>;
using OrdersType = SmallVector<unsigned, 4>;
-
- BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
- DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL, OptimizationRemarkEmitter *ORE)
- : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
- DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
- CodeMetrics::collectEphemeralValues(F, AC, EphValues);
- // Use the vector register size specified by the target unless overridden
- // by a command-line option.
- // TODO: It would be better to limit the vectorization factor based on
- // data type rather than just register size. For example, x86 AVX has
- // 256-bit registers, but it does not support integer operations
- // at that width (that requires AVX2).
- if (MaxVectorRegSizeOption.getNumOccurrences())
- MaxVecRegSize = MaxVectorRegSizeOption;
- else
- MaxVecRegSize = TTI->getRegisterBitWidth(true);
-
- if (MinVectorRegSizeOption.getNumOccurrences())
- MinVecRegSize = MinVectorRegSizeOption;
- else
- MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
- }
-
- /// Vectorize the tree that starts with the elements in \p VL.
- /// Returns the vectorized root.
- Value *vectorizeTree();
-
- /// Vectorize the tree but with the list of externally used values \p
-  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
- /// generated extractvalue instructions.
- Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
-
- /// \returns the cost incurred by unwanted spills and fills, caused by
- /// holding live values over call sites.
+ DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
+ : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
+ DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+ }
+
+ /// Vectorize the tree that starts with the elements in \p VL.
+ /// Returns the vectorized root.
+ Value *vectorizeTree();
+
+ /// Vectorize the tree but with the list of externally used values \p
+  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+ /// generated extractvalue instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
+
+ /// \returns the cost incurred by unwanted spills and fills, caused by
+ /// holding live values over call sites.
InstructionCost getSpillCost() const;
-
- /// \returns the vectorization cost of the subtree that starts at \p VL.
- /// A negative number means that this is profitable.
+
+ /// \returns the vectorization cost of the subtree that starts at \p VL.
+ /// A negative number means that this is profitable.
InstructionCost getTreeCost();
-
- /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
- /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
- void buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst = None);
-
- /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
- /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
-  /// into account (and updating it, if required) the list of externally used
- /// values stored in \p ExternallyUsedValues.
- void buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst = None);
-
- /// Clear the internal data structures that are created by 'buildTree'.
- void deleteTree() {
- VectorizableTree.clear();
- ScalarToTreeEntry.clear();
- MustGather.clear();
- ExternalUses.clear();
- NumOpsWantToKeepOrder.clear();
- NumOpsWantToKeepOriginalOrder = 0;
- for (auto &Iter : BlocksSchedules) {
- BlockScheduling *BS = Iter.second.get();
- BS->clear();
- }
- MinBWs.clear();
- }
-
- unsigned getTreeSize() const { return VectorizableTree.size(); }
-
- /// Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence();
-
- /// \returns The best order of instructions for vectorization.
- Optional<ArrayRef<unsigned>> bestOrder() const {
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
+ void buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
+  /// into account (and updating it, if required) the list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Clear the internal data structures that are created by 'buildTree'.
+ void deleteTree() {
+ VectorizableTree.clear();
+ ScalarToTreeEntry.clear();
+ MustGather.clear();
+ ExternalUses.clear();
+ NumOpsWantToKeepOrder.clear();
+ NumOpsWantToKeepOriginalOrder = 0;
+ for (auto &Iter : BlocksSchedules) {
+ BlockScheduling *BS = Iter.second.get();
+ BS->clear();
+ }
+ MinBWs.clear();
+ }
+
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
+ /// Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns The best order of instructions for vectorization.
+ Optional<ArrayRef<unsigned>> bestOrder() const {
assert(llvm::all_of(
NumOpsWantToKeepOrder,
[this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
@@ -638,19 +638,19 @@ public:
}) &&
"All orders must have the same size as number of instructions in "
"tree node.");
- auto I = std::max_element(
- NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
- [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
- const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
- return D1.second < D2.second;
- });
- if (I == NumOpsWantToKeepOrder.end() ||
- I->getSecond() <= NumOpsWantToKeepOriginalOrder)
- return None;
-
- return makeArrayRef(I->getFirst());
- }
-
+ auto I = std::max_element(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+ [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+ return D1.second < D2.second;
+ });
+ if (I == NumOpsWantToKeepOrder.end() ||
+ I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+ return None;
+
+ return makeArrayRef(I->getFirst());
+ }
+
/// Builds the correct order for root instructions.
/// If some leaves have the same instructions to be vectorized, we may
/// incorrectly evaluate the best order for the root node (it is built for the
@@ -726,267 +726,267 @@ public:
"All indices must be initialized");
}
- /// \return The vector element size in bits to use when vectorizing the
- /// expression tree ending at \p V. If V is a store, the size is the width of
- /// the stored value. Otherwise, the size is the width of the largest loaded
- /// value reaching V. This method is used by the vectorizer to calculate
- /// vectorization factors.
- unsigned getVectorElementSize(Value *V);
-
- /// Compute the minimum type sizes required to represent the entries in a
- /// vectorizable tree.
- void computeMinimumValueSizes();
-
- // \returns maximum vector register size as set by TTI or overridden by cl::opt.
- unsigned getMaxVecRegSize() const {
- return MaxVecRegSize;
- }
-
- // \returns minimum vector register size as set by cl::opt.
- unsigned getMinVecRegSize() const {
- return MinVecRegSize;
- }
-
+ /// \return The vector element size in bits to use when vectorizing the
+ /// expression tree ending at \p V. If V is a store, the size is the width of
+ /// the stored value. Otherwise, the size is the width of the largest loaded
+ /// value reaching V. This method is used by the vectorizer to calculate
+ /// vectorization factors.
+ unsigned getVectorElementSize(Value *V);
+
+ /// Compute the minimum type sizes required to represent the entries in a
+ /// vectorizable tree.
+ void computeMinimumValueSizes();
+
+ // \returns maximum vector register size as set by TTI or overridden by cl::opt.
+ unsigned getMaxVecRegSize() const {
+ return MaxVecRegSize;
+ }
+
+ // \returns minimum vector register size as set by cl::opt.
+ unsigned getMinVecRegSize() const {
+ return MinVecRegSize;
+ }
+
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
return MaxVF ? MaxVF : UINT_MAX;
}
- /// Check if homogeneous aggregate is isomorphic to some VectorType.
- /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
- /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
- /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
- ///
- /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
- unsigned canMapToVector(Type *T, const DataLayout &DL) const;
-
- /// \returns True if the VectorizableTree is both tiny and not fully
- /// vectorizable. We do not vectorize such trees.
- bool isTreeTinyAndNotFullyVectorizable() const;
-
- /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
- /// can be load combined in the backend. Load combining may not be allowed in
- /// the IR optimizer, so we do not want to alter the pattern. For example,
- /// partially transforming a scalar bswap() pattern into vector code is
- /// effectively impossible for the backend to undo.
- /// TODO: If load combining is allowed in the IR optimizer, this analysis
- /// may not be necessary.
+ /// Check if homogeneous aggregate is isomorphic to some VectorType.
+ /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
+ /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
+ /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
+ /// \returns True if the VectorizableTree is both tiny and not fully
+ /// vectorizable. We do not vectorize such trees.
+ bool isTreeTinyAndNotFullyVectorizable() const;
+
+ /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
-
- /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
- /// can be load combined in the backend. Load combining may not be allowed in
- /// the IR optimizer, so we do not want to alter the pattern. For example,
- /// partially transforming a scalar bswap() pattern into vector code is
- /// effectively impossible for the backend to undo.
- /// TODO: If load combining is allowed in the IR optimizer, this analysis
- /// may not be necessary.
- bool isLoadCombineCandidate() const;
-
- OptimizationRemarkEmitter *getORE() { return ORE; }
-
- /// This structure holds any data we need about the edges being traversed
- /// during buildTree_rec(). We keep track of:
- /// (i) the user TreeEntry index, and
- /// (ii) the index of the edge.
- struct EdgeInfo {
- EdgeInfo() = default;
- EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
- : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
- /// The user TreeEntry.
- TreeEntry *UserTE = nullptr;
- /// The operand index of the use.
- unsigned EdgeIdx = UINT_MAX;
-#ifndef NDEBUG
- friend inline raw_ostream &operator<<(raw_ostream &OS,
- const BoUpSLP::EdgeInfo &EI) {
- EI.dump(OS);
- return OS;
- }
- /// Debug print.
- void dump(raw_ostream &OS) const {
- OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
- << " EdgeIdx:" << EdgeIdx << "}";
- }
- LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
-#endif
- };
-
- /// A helper data structure to hold the operands of a vector of instructions.
- /// This supports a fixed vector length for all operand vectors.
- class VLOperands {
- /// For each operand we need (i) the value, and (ii) the opcode that it
- /// would be attached to if the expression was in a left-linearized form.
- /// This is required to avoid illegal operand reordering.
- /// For example:
- /// \verbatim
- /// 0 Op1
- /// |/
- /// Op1 Op2 Linearized + Op2
- /// \ / ----------> |/
- /// - -
- ///
- /// Op1 - Op2 (0 + Op1) - Op2
- /// \endverbatim
- ///
- /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
- ///
- /// Another way to think of this is to track all the operations across the
- /// path from the operand all the way to the root of the tree and to
- /// calculate the operation that corresponds to this path. For example, the
- /// path from Op2 to the root crosses the RHS of the '-', therefore the
- /// corresponding operation is a '-' (which matches the one in the
- /// linearized tree, as shown above).
- ///
- /// For lack of a better term, we refer to this operation as Accumulated
- /// Path Operation (APO).
- struct OperandData {
- OperandData() = default;
- OperandData(Value *V, bool APO, bool IsUsed)
- : V(V), APO(APO), IsUsed(IsUsed) {}
- /// The operand value.
- Value *V = nullptr;
- /// TreeEntries only allow a single opcode, or an alternate sequence of
-      /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
- /// APO. It is set to 'true' if 'V' is attached to an inverse operation
- /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
- /// (e.g., Add/Mul)
- bool APO = false;
- /// Helper data for the reordering function.
- bool IsUsed = false;
- };
-
- /// During operand reordering, we are trying to select the operand at lane
- /// that matches best with the operand at the neighboring lane. Our
- /// selection is based on the type of value we are looking for. For example,
- /// if the neighboring lane has a load, we need to look for a load that is
- /// accessing a consecutive address. These strategies are summarized in the
- /// 'ReorderingMode' enumerator.
- enum class ReorderingMode {
- Load, ///< Matching loads to consecutive memory addresses
- Opcode, ///< Matching instructions based on opcode (same or alternate)
- Constant, ///< Matching constants
- Splat, ///< Matching the same instruction multiple times (broadcast)
- Failed, ///< We failed to create a vectorizable group
- };
-
- using OperandDataVec = SmallVector<OperandData, 2>;
-
- /// A vector of operand vectors.
- SmallVector<OperandDataVec, 4> OpsVec;
-
- const DataLayout &DL;
- ScalarEvolution &SE;
- const BoUpSLP &R;
-
- /// \returns the operand data at \p OpIdx and \p Lane.
- OperandData &getData(unsigned OpIdx, unsigned Lane) {
- return OpsVec[OpIdx][Lane];
- }
-
- /// \returns the operand data at \p OpIdx and \p Lane. Const version.
- const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
- return OpsVec[OpIdx][Lane];
- }
-
- /// Clears the used flag for all entries.
- void clearUsed() {
- for (unsigned OpIdx = 0, NumOperands = getNumOperands();
- OpIdx != NumOperands; ++OpIdx)
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane)
- OpsVec[OpIdx][Lane].IsUsed = false;
- }
-
- /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
- void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
- std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
- }
-
- // The hard-coded scores listed here are not very important. When computing
- // the scores of matching one sub-tree with another, we are basically
- // counting the number of values that are matching. So even if all scores
- // are set to 1, we would still get a decent matching result.
- // However, sometimes we have to break ties. For example we may have to
- // choose between matching loads vs matching opcodes. This is what these
- // scores are helping us with: they provide the order of preference.
-
- /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
- static const int ScoreConsecutiveLoads = 3;
- /// ExtractElementInst from same vector and consecutive indexes.
- static const int ScoreConsecutiveExtracts = 3;
- /// Constants.
- static const int ScoreConstants = 2;
- /// Instructions with the same opcode.
- static const int ScoreSameOpcode = 2;
-    /// Instructions with alt opcodes (e.g., add + sub).
- static const int ScoreAltOpcodes = 1;
- /// Identical instructions (a.k.a. splat or broadcast).
- static const int ScoreSplat = 1;
- /// Matching with an undef is preferable to failing.
- static const int ScoreUndef = 1;
- /// Score for failing to find a decent match.
- static const int ScoreFail = 0;
-    /// User external to the vectorized code.
- static const int ExternalUseCost = 1;
- /// The user is internal but in a different lane.
- static const int UserInDiffLaneCost = ExternalUseCost;
-
- /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
- static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
- ScalarEvolution &SE) {
- auto *LI1 = dyn_cast<LoadInst>(V1);
- auto *LI2 = dyn_cast<LoadInst>(V2);
- if (LI1 && LI2)
- return isConsecutiveAccess(LI1, LI2, DL, SE)
- ? VLOperands::ScoreConsecutiveLoads
- : VLOperands::ScoreFail;
-
- auto *C1 = dyn_cast<Constant>(V1);
- auto *C2 = dyn_cast<Constant>(V2);
- if (C1 && C2)
- return VLOperands::ScoreConstants;
-
-      // Extracts from consecutive indexes of the same vector score better, as
-      // the extracts could be optimized away.
- Value *EV;
- ConstantInt *Ex1Idx, *Ex2Idx;
- if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
- match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
- Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
- return VLOperands::ScoreConsecutiveExtracts;
-
- auto *I1 = dyn_cast<Instruction>(V1);
- auto *I2 = dyn_cast<Instruction>(V2);
- if (I1 && I2) {
- if (I1 == I2)
- return VLOperands::ScoreSplat;
- InstructionsState S = getSameOpcode({I1, I2});
- // Note: Only consider instructions with <= 2 operands to avoid
- // complexity explosion.
- if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
- return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
- : VLOperands::ScoreSameOpcode;
- }
-
- if (isa<UndefValue>(V2))
- return VLOperands::ScoreUndef;
-
- return VLOperands::ScoreFail;
- }
-
- /// Holds the values and their lane that are taking part in the look-ahead
- /// score calculation. This is used in the external uses cost calculation.
- SmallDenseMap<Value *, int> InLookAheadValues;
-
-    /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
- /// either external to the vectorized code, or require shuffling.
- int getExternalUsesCost(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- int Cost = 0;
- std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
- for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
- Value *V = Values[Idx].first;
+
+ /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ bool isLoadCombineCandidate() const;
+
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
+ /// This structure holds any data we need about the edges being traversed
+ /// during buildTree_rec(). We keep track of:
+ /// (i) the user TreeEntry index, and
+ /// (ii) the index of the edge.
+ struct EdgeInfo {
+ EdgeInfo() = default;
+ EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
+ : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
+ /// The user TreeEntry.
+ TreeEntry *UserTE = nullptr;
+ /// The operand index of the use.
+ unsigned EdgeIdx = UINT_MAX;
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const BoUpSLP::EdgeInfo &EI) {
+ EI.dump(OS);
+ return OS;
+ }
+ /// Debug print.
+ void dump(raw_ostream &OS) const {
+ OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
+ << " EdgeIdx:" << EdgeIdx << "}";
+ }
+ LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
+#endif
+ };
+
+ /// A helper data structure to hold the operands of a vector of instructions.
+ /// This supports a fixed vector length for all operand vectors.
+ class VLOperands {
+ /// For each operand we need (i) the value, and (ii) the opcode that it
+ /// would be attached to if the expression was in a left-linearized form.
+ /// This is required to avoid illegal operand reordering.
+ /// For example:
+ /// \verbatim
+ /// 0 Op1
+ /// |/
+ /// Op1 Op2 Linearized + Op2
+ /// \ / ----------> |/
+ /// - -
+ ///
+ /// Op1 - Op2 (0 + Op1) - Op2
+ /// \endverbatim
+ ///
+ /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+ ///
+ /// Another way to think of this is to track all the operations across the
+ /// path from the operand all the way to the root of the tree and to
+ /// calculate the operation that corresponds to this path. For example, the
+ /// path from Op2 to the root crosses the RHS of the '-', therefore the
+ /// corresponding operation is a '-' (which matches the one in the
+ /// linearized tree, as shown above).
+ ///
+ /// For lack of a better term, we refer to this operation as Accumulated
+ /// Path Operation (APO).
+ struct OperandData {
+ OperandData() = default;
+ OperandData(Value *V, bool APO, bool IsUsed)
+ : V(V), APO(APO), IsUsed(IsUsed) {}
+ /// The operand value.
+ Value *V = nullptr;
+ /// TreeEntries only allow a single opcode, or an alternate sequence of
+      /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
+ /// APO. It is set to 'true' if 'V' is attached to an inverse operation
+ /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
+ /// (e.g., Add/Mul)
+ bool APO = false;
+ /// Helper data for the reordering function.
+ bool IsUsed = false;
+ };
+
+ /// During operand reordering, we are trying to select the operand at lane
+ /// that matches best with the operand at the neighboring lane. Our
+ /// selection is based on the type of value we are looking for. For example,
+ /// if the neighboring lane has a load, we need to look for a load that is
+ /// accessing a consecutive address. These strategies are summarized in the
+ /// 'ReorderingMode' enumerator.
+ enum class ReorderingMode {
+ Load, ///< Matching loads to consecutive memory addresses
+ Opcode, ///< Matching instructions based on opcode (same or alternate)
+ Constant, ///< Matching constants
+ Splat, ///< Matching the same instruction multiple times (broadcast)
+ Failed, ///< We failed to create a vectorizable group
+ };
+
+ using OperandDataVec = SmallVector<OperandData, 2>;
+
+ /// A vector of operand vectors.
+ SmallVector<OperandDataVec, 4> OpsVec;
+
+ const DataLayout &DL;
+ ScalarEvolution &SE;
+ const BoUpSLP &R;
+
+ /// \returns the operand data at \p OpIdx and \p Lane.
+ OperandData &getData(unsigned OpIdx, unsigned Lane) {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+ const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// Clears the used flag for all entries.
+ void clearUsed() {
+ for (unsigned OpIdx = 0, NumOperands = getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane)
+ OpsVec[OpIdx][Lane].IsUsed = false;
+ }
+
+ /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
+ void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+ std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+ }
+
+ // The hard-coded scores listed here are not very important. When computing
+ // the scores of matching one sub-tree with another, we are basically
+ // counting the number of values that are matching. So even if all scores
+ // are set to 1, we would still get a decent matching result.
+ // However, sometimes we have to break ties. For example we may have to
+ // choose between matching loads vs matching opcodes. This is what these
+ // scores are helping us with: they provide the order of preference.
+
+ /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
+ static const int ScoreConsecutiveLoads = 3;
+ /// ExtractElementInst from same vector and consecutive indexes.
+ static const int ScoreConsecutiveExtracts = 3;
+ /// Constants.
+ static const int ScoreConstants = 2;
+ /// Instructions with the same opcode.
+ static const int ScoreSameOpcode = 2;
+    /// Instructions with alt opcodes (e.g., add + sub).
+ static const int ScoreAltOpcodes = 1;
+ /// Identical instructions (a.k.a. splat or broadcast).
+ static const int ScoreSplat = 1;
+ /// Matching with an undef is preferable to failing.
+ static const int ScoreUndef = 1;
+ /// Score for failing to find a decent match.
+ static const int ScoreFail = 0;
+    /// User external to the vectorized code.
+ static const int ExternalUseCost = 1;
+ /// The user is internal but in a different lane.
+ static const int UserInDiffLaneCost = ExternalUseCost;
+
+ /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
+ static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
+ ScalarEvolution &SE) {
+ auto *LI1 = dyn_cast<LoadInst>(V1);
+ auto *LI2 = dyn_cast<LoadInst>(V2);
+ if (LI1 && LI2)
+ return isConsecutiveAccess(LI1, LI2, DL, SE)
+ ? VLOperands::ScoreConsecutiveLoads
+ : VLOperands::ScoreFail;
+
+ auto *C1 = dyn_cast<Constant>(V1);
+ auto *C2 = dyn_cast<Constant>(V2);
+ if (C1 && C2)
+ return VLOperands::ScoreConstants;
+
+      // Extracts from consecutive indexes of the same vector score better, as
+      // the extracts could be optimized away.
+ Value *EV;
+ ConstantInt *Ex1Idx, *Ex2Idx;
+ if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
+ match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
+ Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
+ return VLOperands::ScoreConsecutiveExtracts;
+
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (I1 && I2) {
+ if (I1 == I2)
+ return VLOperands::ScoreSplat;
+ InstructionsState S = getSameOpcode({I1, I2});
+ // Note: Only consider instructions with <= 2 operands to avoid
+ // complexity explosion.
+ if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
+ return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
+ : VLOperands::ScoreSameOpcode;
+ }
+
+ if (isa<UndefValue>(V2))
+ return VLOperands::ScoreUndef;
+
+ return VLOperands::ScoreFail;
+ }
+
+ /// Holds the values and their lane that are taking part in the look-ahead
+ /// score calculation. This is used in the external uses cost calculation.
+ SmallDenseMap<Value *, int> InLookAheadValues;
+
+    /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
+ /// either external to the vectorized code, or require shuffling.
+ int getExternalUsesCost(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS) {
+ int Cost = 0;
+ std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
+ for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
+ Value *V = Values[Idx].first;
if (isa<Constant>(V)) {
// Since this is a function pass, it doesn't make semantic sense to
// walk the users of a subclass of Constant. The users could be in
@@ -995,776 +995,776 @@ public:
continue;
}
- // Calculate the absolute lane, using the minimum relative lane of LHS
- // and RHS as base and Idx as the offset.
- int Ln = std::min(LHS.second, RHS.second) + Idx;
- assert(Ln >= 0 && "Bad lane calculation");
- unsigned UsersBudget = LookAheadUsersBudget;
- for (User *U : V->users()) {
- if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
- // The user is in the VectorizableTree. Check if we need to insert.
- auto It = llvm::find(UserTE->Scalars, U);
- assert(It != UserTE->Scalars.end() && "U is in UserTE");
- int UserLn = std::distance(UserTE->Scalars.begin(), It);
- assert(UserLn >= 0 && "Bad lane");
- if (UserLn != Ln)
- Cost += UserInDiffLaneCost;
- } else {
- // Check if the user is in the look-ahead code.
- auto It2 = InLookAheadValues.find(U);
- if (It2 != InLookAheadValues.end()) {
- // The user is in the look-ahead code. Check the lane.
- if (It2->second != Ln)
- Cost += UserInDiffLaneCost;
- } else {
- // The user is neither in SLP tree nor in the look-ahead code.
- Cost += ExternalUseCost;
- }
- }
- // Limit the number of visited uses to cap compilation time.
- if (--UsersBudget == 0)
- break;
- }
- }
- return Cost;
- }
-
- /// Go through the operands of \p LHS and \p RHS recursively until \p
-    /// MaxLevel, and return the cumulative score. For example:
- /// \verbatim
- /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
- /// \ / \ / \ / \ /
- /// + + + +
- /// G1 G2 G3 G4
- /// \endverbatim
- /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
- /// each level recursively, accumulating the score. It starts from matching
- /// the additions at level 0, then moves on to the loads (level 1). The
- /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
- /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
- /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
- /// Please note that the order of the operands does not matter, as we
- /// evaluate the score of all profitable combinations of operands. In
- /// other words the score of G1 and G4 is the same as G1 and G2. This
- /// heuristic is based on ideas described in:
- /// Look-ahead SLP: Auto-vectorization in the presence of commutative
- /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
- /// Luís F. W. Góes
- int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS, int CurrLevel,
- int MaxLevel) {
-
- Value *V1 = LHS.first;
- Value *V2 = RHS.first;
- // Get the shallow score of V1 and V2.
- int ShallowScoreAtThisLevel =
- std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
- getExternalUsesCost(LHS, RHS));
- int Lane1 = LHS.second;
- int Lane2 = RHS.second;
-
- // If reached MaxLevel,
- // or if V1 and V2 are not instructions,
- // or if they are SPLAT,
- // or if they are not consecutive, early return the current cost.
- auto *I1 = dyn_cast<Instruction>(V1);
- auto *I2 = dyn_cast<Instruction>(V2);
- if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
- ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
- (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
- return ShallowScoreAtThisLevel;
- assert(I1 && I2 && "Should have early exited.");
-
- // Keep track of in-tree values for determining the external-use cost.
- InLookAheadValues[V1] = Lane1;
- InLookAheadValues[V2] = Lane2;
-
- // Contains the I2 operand indexes that got matched with I1 operands.
- SmallSet<unsigned, 4> Op2Used;
-
-      // Recursion towards the operands of I1 and I2. We are trying all possible
- // operand pairs, and keeping track of the best score.
- for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
- OpIdx1 != NumOperands1; ++OpIdx1) {
- // Try to pair op1I with the best operand of I2.
- int MaxTmpScore = 0;
- unsigned MaxOpIdx2 = 0;
- bool FoundBest = false;
- // If I2 is commutative try all combinations.
- unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
- unsigned ToIdx = isCommutative(I2)
- ? I2->getNumOperands()
- : std::min(I2->getNumOperands(), OpIdx1 + 1);
- assert(FromIdx <= ToIdx && "Bad index");
- for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
- // Skip operands already paired with OpIdx1.
- if (Op2Used.count(OpIdx2))
- continue;
- // Recursively calculate the cost at each level
- int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
- {I2->getOperand(OpIdx2), Lane2},
- CurrLevel + 1, MaxLevel);
- // Look for the best score.
- if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
- MaxTmpScore = TmpScore;
- MaxOpIdx2 = OpIdx2;
- FoundBest = true;
- }
- }
- if (FoundBest) {
- // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
- Op2Used.insert(MaxOpIdx2);
- ShallowScoreAtThisLevel += MaxTmpScore;
- }
- }
- return ShallowScoreAtThisLevel;
- }
-
- /// \Returns the look-ahead score, which tells us how much the sub-trees
-    /// rooted at \p LHS and \p RHS match: the more they match, the higher the
- /// score. This helps break ties in an informed way when we cannot decide on
- /// the order of the operands by just considering the immediate
- /// predecessors.
- int getLookAheadScore(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- InLookAheadValues.clear();
- return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
- }
-
- // Search all operands in Ops[*][Lane] for the one that matches best
-    // Ops[OpIdx][LastLane] and return its operand index.
- // If no good match can be found, return None.
- Optional<unsigned>
- getBestOperand(unsigned OpIdx, int Lane, int LastLane,
- ArrayRef<ReorderingMode> ReorderingModes) {
- unsigned NumOperands = getNumOperands();
-
- // The operand of the previous lane at OpIdx.
- Value *OpLastLane = getData(OpIdx, LastLane).V;
-
- // Our strategy mode for OpIdx.
- ReorderingMode RMode = ReorderingModes[OpIdx];
-
- // The linearized opcode of the operand at OpIdx, Lane.
- bool OpIdxAPO = getData(OpIdx, Lane).APO;
-
- // The best operand index and its score.
- // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
- // are using the score to differentiate between the two.
- struct BestOpData {
- Optional<unsigned> Idx = None;
- unsigned Score = 0;
- } BestOp;
-
- // Iterate through all unused operands and look for the best.
- for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
- // Get the operand at Idx and Lane.
- OperandData &OpData = getData(Idx, Lane);
- Value *Op = OpData.V;
- bool OpAPO = OpData.APO;
-
- // Skip already selected operands.
- if (OpData.IsUsed)
- continue;
-
- // Skip if we are trying to move the operand to a position with a
- // different opcode in the linearized tree form. This would break the
- // semantics.
- if (OpAPO != OpIdxAPO)
- continue;
-
- // Look for an operand that matches the current mode.
- switch (RMode) {
- case ReorderingMode::Load:
- case ReorderingMode::Constant:
- case ReorderingMode::Opcode: {
- bool LeftToRight = Lane > LastLane;
- Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
- Value *OpRight = (LeftToRight) ? Op : OpLastLane;
- unsigned Score =
- getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
- if (Score > BestOp.Score) {
- BestOp.Idx = Idx;
- BestOp.Score = Score;
- }
- break;
- }
- case ReorderingMode::Splat:
- if (Op == OpLastLane)
- BestOp.Idx = Idx;
- break;
- case ReorderingMode::Failed:
- return None;
- }
- }
-
- if (BestOp.Idx) {
- getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
- return BestOp.Idx;
- }
- // If we could not find a good match return None.
- return None;
- }
-
- /// Helper for reorderOperandVecs. \Returns the lane that we should start
- /// reordering from. This is the one which has the least number of operands
- /// that can freely move about.
- unsigned getBestLaneToStartReordering() const {
- unsigned BestLane = 0;
- unsigned Min = UINT_MAX;
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane) {
- unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
- if (NumFreeOps < Min) {
- Min = NumFreeOps;
- BestLane = Lane;
- }
- }
- return BestLane;
- }
-
- /// \Returns the maximum number of operands that are allowed to be reordered
- /// for \p Lane. This is used as a heuristic for selecting the first lane to
- /// start operand reordering.
- unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
- unsigned CntTrue = 0;
- unsigned NumOperands = getNumOperands();
- // Operands with the same APO can be reordered. We therefore need to count
- // how many of them we have for each APO, like this: Cnt[APO] = x.
- // Since we only have two APOs, namely true and false, we can avoid using
- // a map. Instead we can simply count the number of operands that
- // correspond to one of them (in this case the 'true' APO), and calculate
- // the other by subtracting it from the total number of operands.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
- if (getData(OpIdx, Lane).APO)
- ++CntTrue;
- unsigned CntFalse = NumOperands - CntTrue;
- return std::max(CntTrue, CntFalse);
- }
-
- /// Go through the instructions in VL and append their operands.
- void appendOperandsOfVL(ArrayRef<Value *> VL) {
- assert(!VL.empty() && "Bad VL");
- assert((empty() || VL.size() == getNumLanes()) &&
- "Expected same number of lanes");
- assert(isa<Instruction>(VL[0]) && "Expected instruction");
- unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
- OpsVec.resize(NumOperands);
- unsigned NumLanes = VL.size();
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
- OpsVec[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
- // Our tree has just 3 nodes: the root and two operands.
- // It is therefore trivial to get the APO. We only need to check the
- // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
- // RHS operand. The LHS operand of both add and sub is never attached
-          // to an inverse operation in the linearized form, therefore its APO
- // is false. The RHS is true only if VL[Lane] is an inverse operation.
-
- // Since operand reordering is performed on groups of commutative
- // operations or alternating sequences (e.g., +, -), we can safely
- // tell the inverse operations by checking commutativity.
- bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
- bool APO = (OpIdx == 0) ? false : IsInverseOperation;
- OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
- APO, false};
- }
- }
- }
-
- /// \returns the number of operands.
- unsigned getNumOperands() const { return OpsVec.size(); }
-
- /// \returns the number of lanes.
- unsigned getNumLanes() const { return OpsVec[0].size(); }
-
- /// \returns the operand value at \p OpIdx and \p Lane.
- Value *getValue(unsigned OpIdx, unsigned Lane) const {
- return getData(OpIdx, Lane).V;
- }
-
- /// \returns true if the data structure is empty.
- bool empty() const { return OpsVec.empty(); }
-
- /// Clears the data.
- void clear() { OpsVec.clear(); }
-
- /// \Returns true if there are enough operands identical to \p Op to fill
- /// the whole vector.
- /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
- bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
- bool OpAPO = getData(OpIdx, Lane).APO;
- for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
- if (Ln == Lane)
- continue;
- // This is set to true if we found a candidate for broadcast at Lane.
- bool FoundCandidate = false;
- for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
- OperandData &Data = getData(OpI, Ln);
- if (Data.APO != OpAPO || Data.IsUsed)
- continue;
- if (Data.V == Op) {
- FoundCandidate = true;
- Data.IsUsed = true;
- break;
- }
- }
- if (!FoundCandidate)
- return false;
- }
- return true;
- }
-
- public:
- /// Initialize with all the operands of the instruction vector \p RootVL.
- VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
- ScalarEvolution &SE, const BoUpSLP &R)
- : DL(DL), SE(SE), R(R) {
- // Append all the operands of RootVL.
- appendOperandsOfVL(RootVL);
- }
-
- /// \Returns a value vector with the operands across all lanes for the
-    /// operand at \p OpIdx.
- ValueList getVL(unsigned OpIdx) const {
- ValueList OpVL(OpsVec[OpIdx].size());
- assert(OpsVec[OpIdx].size() == getNumLanes() &&
- "Expected same num of lanes across all operands");
- for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
- OpVL[Lane] = OpsVec[OpIdx][Lane].V;
- return OpVL;
- }
-
- // Performs operand reordering for 2 or more operands.
- // The original operands are in OrigOps[OpIdx][Lane].
- // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
- void reorder() {
- unsigned NumOperands = getNumOperands();
- unsigned NumLanes = getNumLanes();
- // Each operand has its own mode. We are using this mode to help us select
- // the instructions for each lane, so that they match best with the ones
- // we have selected so far.
- SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
-
- // This is a greedy single-pass algorithm. We are going over each lane
- // once and deciding on the best order right away with no back-tracking.
- // However, in order to increase its effectiveness, we start with the lane
- // that has operands that can move the least. For example, given the
- // following lanes:
- // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
- // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
- // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
- // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
- // we will start at Lane 1, since the operands of the subtraction cannot
- // be reordered. Then we will visit the rest of the lanes in a circular
- // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
-
- // Find the first lane that we will start our search from.
- unsigned FirstLane = getBestLaneToStartReordering();
-
- // Initialize the modes.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
- Value *OpLane0 = getValue(OpIdx, FirstLane);
- // Keep track if we have instructions with all the same opcode on one
- // side.
- if (isa<LoadInst>(OpLane0))
- ReorderingModes[OpIdx] = ReorderingMode::Load;
- else if (isa<Instruction>(OpLane0)) {
- // Check if OpLane0 should be broadcast.
- if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
- ReorderingModes[OpIdx] = ReorderingMode::Splat;
- else
- ReorderingModes[OpIdx] = ReorderingMode::Opcode;
- }
- else if (isa<Constant>(OpLane0))
- ReorderingModes[OpIdx] = ReorderingMode::Constant;
- else if (isa<Argument>(OpLane0))
- // Our best hope is a Splat. It may save some cost in some cases.
- ReorderingModes[OpIdx] = ReorderingMode::Splat;
- else
- // NOTE: This should be unreachable.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
- }
-
- // If the initial strategy fails for any of the operand indexes, then we
- // perform reordering again in a second pass. This helps avoid assigning
- // high priority to the failed strategy, and should improve reordering for
- // the non-failed operand indexes.
- for (int Pass = 0; Pass != 2; ++Pass) {
- // Skip the second pass if the first pass did not fail.
- bool StrategyFailed = false;
- // Mark all operand data as free to use.
- clearUsed();
- // We keep the original operand order for the FirstLane, so reorder the
- // rest of the lanes. We are visiting the nodes in a circular fashion,
- // using FirstLane as the center point and increasing the radius
- // distance.
- for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
- // Visit the lane on the right and then the lane on the left.
- for (int Direction : {+1, -1}) {
- int Lane = FirstLane + Direction * Distance;
- if (Lane < 0 || Lane >= (int)NumLanes)
- continue;
- int LastLane = Lane - Direction;
- assert(LastLane >= 0 && LastLane < (int)NumLanes &&
- "Out of bounds");
- // Look for a good match for each operand.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
- // Search for the operand that matches SortedOps[OpIdx][Lane-1].
- Optional<unsigned> BestIdx =
- getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
- // By not selecting a value, we allow the operands that follow to
- // select a better matching value. We will get a non-null value in
- // the next run of getBestOperand().
- if (BestIdx) {
- // Swap the current operand with the one returned by
- // getBestOperand().
- swap(OpIdx, BestIdx.getValue(), Lane);
- } else {
- // We failed to find a best operand, set mode to 'Failed'.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
- // Enable the second pass.
- StrategyFailed = true;
- }
- }
- }
- }
- // Skip second pass if the strategy did not fail.
- if (!StrategyFailed)
- break;
- }
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
- switch (RMode) {
- case ReorderingMode::Load:
- return "Load";
- case ReorderingMode::Opcode:
- return "Opcode";
- case ReorderingMode::Constant:
- return "Constant";
- case ReorderingMode::Splat:
- return "Splat";
- case ReorderingMode::Failed:
- return "Failed";
- }
- llvm_unreachable("Unimplemented Reordering Type");
- }
-
- LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
- raw_ostream &OS) {
- return OS << getModeStr(RMode);
- }
-
- /// Debug print.
- LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
- printMode(RMode, dbgs());
- }
-
- friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
- return printMode(RMode, OS);
- }
-
- LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
- const unsigned Indent = 2;
- unsigned Cnt = 0;
- for (const OperandDataVec &OpDataVec : OpsVec) {
- OS << "Operand " << Cnt++ << "\n";
- for (const OperandData &OpData : OpDataVec) {
- OS.indent(Indent) << "{";
- if (Value *V = OpData.V)
- OS << *V;
- else
- OS << "null";
- OS << ", APO:" << OpData.APO << "}\n";
- }
- OS << "\n";
- }
- return OS;
- }
-
- /// Debug print.
- LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
-#endif
- };
-
- /// Checks if the instruction is marked for deletion.
- bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
-
-  /// Marks the operands of the given values for later deletion by replacing
-  /// them with Undefs.
- void eraseInstructions(ArrayRef<Value *> AV);
-
- ~BoUpSLP();
-
-private:
-  /// Checks if all users of \p I are part of the vectorization tree.
- bool areAllUsersVectorized(Instruction *I) const;
-
- /// \returns the cost of the vectorizable entry.
+ // Calculate the absolute lane, using the minimum relative lane of LHS
+ // and RHS as base and Idx as the offset.
+ int Ln = std::min(LHS.second, RHS.second) + Idx;
+ assert(Ln >= 0 && "Bad lane calculation");
+ unsigned UsersBudget = LookAheadUsersBudget;
+ for (User *U : V->users()) {
+ if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
+ // The user is in the VectorizableTree. Check if we need to insert.
+ auto It = llvm::find(UserTE->Scalars, U);
+ assert(It != UserTE->Scalars.end() && "U is in UserTE");
+ int UserLn = std::distance(UserTE->Scalars.begin(), It);
+ assert(UserLn >= 0 && "Bad lane");
+ if (UserLn != Ln)
+ Cost += UserInDiffLaneCost;
+ } else {
+ // Check if the user is in the look-ahead code.
+ auto It2 = InLookAheadValues.find(U);
+ if (It2 != InLookAheadValues.end()) {
+ // The user is in the look-ahead code. Check the lane.
+ if (It2->second != Ln)
+ Cost += UserInDiffLaneCost;
+ } else {
+ // The user is neither in SLP tree nor in the look-ahead code.
+ Cost += ExternalUseCost;
+ }
+ }
+ // Limit the number of visited uses to cap compilation time.
+ if (--UsersBudget == 0)
+ break;
+ }
+ }
+ return Cost;
+ }
+
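For illustration only, a minimal standalone sketch of the cost terms used above; ToyUser and externalUsesCost are hypothetical names, and the simplified bookkeeping stands in for the real SLP tree and look-ahead sets:

  #include <vector>

  // Simplified stand-ins for the real SLP bookkeeping.
  struct ToyUser { bool InTree; bool InLookAhead; int Lane; };

  static constexpr int UserInDiffLaneCost = 1;
  static constexpr int ExternalUseCost = 1;
  static constexpr unsigned LookAheadUsersBudget = 2;

  // Charge a cost for each user of a value that cannot stay in lane Ln.
  int externalUsesCost(const std::vector<ToyUser> &Users, int Ln) {
    int Cost = 0;
    unsigned Budget = LookAheadUsersBudget;
    for (const ToyUser &U : Users) {
      if (U.InTree || U.InLookAhead) {
        if (U.Lane != Ln)
          Cost += UserInDiffLaneCost; // user sits in a different lane
      } else {
        Cost += ExternalUseCost;      // user is outside tree and look-ahead
      }
      if (--Budget == 0)
        break;                        // cap compile time, as above
    }
    return Cost;
  }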
+ /// Go through the operands of \p LHS and \p RHS recursively until \p
+    /// MaxLevel, and return the cumulative score. For example:
+ /// \verbatim
+ /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
+ /// \ / \ / \ / \ /
+ /// + + + +
+ /// G1 G2 G3 G4
+ /// \endverbatim
+ /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
+ /// each level recursively, accumulating the score. It starts from matching
+ /// the additions at level 0, then moves on to the loads (level 1). The
+ /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
+ /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
+ /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+ /// Please note that the order of the operands does not matter, as we
+ /// evaluate the score of all profitable combinations of operands. In
+    /// other words, the score of G1 and G4 is the same as that of G1 and G2. This
+ /// heuristic is based on ideas described in:
+ /// Look-ahead SLP: Auto-vectorization in the presence of commutative
+ /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+ /// Luís F. W. Góes
+ int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS, int CurrLevel,
+ int MaxLevel) {
+
+ Value *V1 = LHS.first;
+ Value *V2 = RHS.first;
+ // Get the shallow score of V1 and V2.
+ int ShallowScoreAtThisLevel =
+ std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
+ getExternalUsesCost(LHS, RHS));
+ int Lane1 = LHS.second;
+ int Lane2 = RHS.second;
+
+ // If reached MaxLevel,
+ // or if V1 and V2 are not instructions,
+ // or if they are SPLAT,
+ // or if they are not consecutive, early return the current cost.
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
+ ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+ (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+ return ShallowScoreAtThisLevel;
+ assert(I1 && I2 && "Should have early exited.");
+
+ // Keep track of in-tree values for determining the external-use cost.
+ InLookAheadValues[V1] = Lane1;
+ InLookAheadValues[V2] = Lane2;
+
+ // Contains the I2 operand indexes that got matched with I1 operands.
+ SmallSet<unsigned, 4> Op2Used;
+
+      // Recursion towards the operands of I1 and I2. We are trying all possible
+ // operand pairs, and keeping track of the best score.
+ for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
+ OpIdx1 != NumOperands1; ++OpIdx1) {
+        // Try to pair I1's operand at OpIdx1 with the best operand of I2.
+ int MaxTmpScore = 0;
+ unsigned MaxOpIdx2 = 0;
+ bool FoundBest = false;
+ // If I2 is commutative try all combinations.
+ unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
+ unsigned ToIdx = isCommutative(I2)
+ ? I2->getNumOperands()
+ : std::min(I2->getNumOperands(), OpIdx1 + 1);
+ assert(FromIdx <= ToIdx && "Bad index");
+ for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
+ // Skip operands already paired with OpIdx1.
+ if (Op2Used.count(OpIdx2))
+ continue;
+ // Recursively calculate the cost at each level
+ int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
+ {I2->getOperand(OpIdx2), Lane2},
+ CurrLevel + 1, MaxLevel);
+ // Look for the best score.
+ if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+ MaxTmpScore = TmpScore;
+ MaxOpIdx2 = OpIdx2;
+ FoundBest = true;
+ }
+ }
+ if (FoundBest) {
+ // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
+ Op2Used.insert(MaxOpIdx2);
+ ShallowScoreAtThisLevel += MaxTmpScore;
+ }
+ }
+ return ShallowScoreAtThisLevel;
+ }
+
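The recursion above can be sketched on a toy expression tree. Everything below (ToyNode, shallowScore, scoreAtLevel, the score constants) is hypothetical and only mimics the shape of getScoreAtLevelRec(), not its real scoring:

  #include <algorithm>
  #include <cstdlib>

  // Toy node: either a leaf holding an array index, or an op with two children.
  struct ToyNode {
    bool IsLeaf = false;
    int Index = 0;                       // leaf: position in some array
    const ToyNode *L = nullptr, *R = nullptr;
  };

  enum { ScoreFail = 0, ScoreConsecutive = 3, ScoreSameOp = 1 };

  // Shallow score: consecutive leaves score high, matching ops score low.
  static int shallowScore(const ToyNode *A, const ToyNode *B) {
    if (A->IsLeaf && B->IsLeaf)
      return std::abs(A->Index - B->Index) == 1 ? ScoreConsecutive : ScoreFail;
    if (!A->IsLeaf && !B->IsLeaf)
      return ScoreSameOp;
    return ScoreFail;
  }

  // Recurse into operands up to MaxLevel, pairing children and accumulating
  // the best score, in the spirit of getScoreAtLevelRec().
  static int scoreAtLevel(const ToyNode *A, const ToyNode *B, int Level,
                          int MaxLevel) {
    int Score = shallowScore(A, B);
    if (Level == MaxLevel || A->IsLeaf || B->IsLeaf || Score == ScoreFail)
      return Score;
    // The toy op is commutative, so try both child pairings and keep the best.
    int Straight = scoreAtLevel(A->L, B->L, Level + 1, MaxLevel) +
                   scoreAtLevel(A->R, B->R, Level + 1, MaxLevel);
    int Swapped  = scoreAtLevel(A->L, B->R, Level + 1, MaxLevel) +
                   scoreAtLevel(A->R, B->L, Level + 1, MaxLevel);
    return Score + std::max(Straight, Swapped);
  }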
+ /// \Returns the look-ahead score, which tells us how much the sub-trees
+    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
+ /// score. This helps break ties in an informed way when we cannot decide on
+ /// the order of the operands by just considering the immediate
+ /// predecessors.
+ int getLookAheadScore(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS) {
+ InLookAheadValues.clear();
+ return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+ }
+
+    // Search all operands in Ops[*][Lane] for the one that best matches
+    // Ops[OpIdx][LastLane] and return its operand index.
+ // If no good match can be found, return None.
+ Optional<unsigned>
+ getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes) {
+ unsigned NumOperands = getNumOperands();
+
+ // The operand of the previous lane at OpIdx.
+ Value *OpLastLane = getData(OpIdx, LastLane).V;
+
+ // Our strategy mode for OpIdx.
+ ReorderingMode RMode = ReorderingModes[OpIdx];
+
+ // The linearized opcode of the operand at OpIdx, Lane.
+ bool OpIdxAPO = getData(OpIdx, Lane).APO;
+
+ // The best operand index and its score.
+ // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
+ // are using the score to differentiate between the two.
+ struct BestOpData {
+ Optional<unsigned> Idx = None;
+ unsigned Score = 0;
+ } BestOp;
+
+ // Iterate through all unused operands and look for the best.
+ for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
+ // Get the operand at Idx and Lane.
+ OperandData &OpData = getData(Idx, Lane);
+ Value *Op = OpData.V;
+ bool OpAPO = OpData.APO;
+
+ // Skip already selected operands.
+ if (OpData.IsUsed)
+ continue;
+
+ // Skip if we are trying to move the operand to a position with a
+ // different opcode in the linearized tree form. This would break the
+ // semantics.
+ if (OpAPO != OpIdxAPO)
+ continue;
+
+ // Look for an operand that matches the current mode.
+ switch (RMode) {
+ case ReorderingMode::Load:
+ case ReorderingMode::Constant:
+ case ReorderingMode::Opcode: {
+ bool LeftToRight = Lane > LastLane;
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
+ unsigned Score =
+ getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
+ if (Score > BestOp.Score) {
+ BestOp.Idx = Idx;
+ BestOp.Score = Score;
+ }
+ break;
+ }
+ case ReorderingMode::Splat:
+ if (Op == OpLastLane)
+ BestOp.Idx = Idx;
+ break;
+ case ReorderingMode::Failed:
+ return None;
+ }
+ }
+
+ if (BestOp.Idx) {
+ getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
+ return BestOp.Idx;
+ }
+ // If we could not find a good match return None.
+ return None;
+ }
+
+    /// Helper for reorder(). \Returns the lane that we should start
+ /// reordering from. This is the one which has the least number of operands
+ /// that can freely move about.
+ unsigned getBestLaneToStartReordering() const {
+ unsigned BestLane = 0;
+ unsigned Min = UINT_MAX;
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane) {
+ unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
+ if (NumFreeOps < Min) {
+ Min = NumFreeOps;
+ BestLane = Lane;
+ }
+ }
+ return BestLane;
+ }
+
+ /// \Returns the maximum number of operands that are allowed to be reordered
+ /// for \p Lane. This is used as a heuristic for selecting the first lane to
+ /// start operand reordering.
+ unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+ unsigned CntTrue = 0;
+ unsigned NumOperands = getNumOperands();
+ // Operands with the same APO can be reordered. We therefore need to count
+ // how many of them we have for each APO, like this: Cnt[APO] = x.
+ // Since we only have two APOs, namely true and false, we can avoid using
+ // a map. Instead we can simply count the number of operands that
+ // correspond to one of them (in this case the 'true' APO), and calculate
+ // the other by subtracting it from the total number of operands.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
+ if (getData(OpIdx, Lane).APO)
+ ++CntTrue;
+ unsigned CntFalse = NumOperands - CntTrue;
+ return std::max(CntTrue, CntFalse);
+ }
+
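A standalone restatement of the counting trick, assuming a plain std::vector<bool> of APO flags per lane; the helper name is made up and this is not the class's real data layout:

  #include <algorithm>
  #include <vector>

  // For one lane, APO[OpIdx] mirrors OperandData::APO. Operands that share an
  // APO value may swap places, so the larger of the two groups is the number
  // of freely movable operands.
  unsigned maxReorderableOperands(const std::vector<bool> &APO) {
    unsigned CntTrue = static_cast<unsigned>(
        std::count(APO.begin(), APO.end(), true));
    unsigned CntFalse = static_cast<unsigned>(APO.size()) - CntTrue;
    return std::max(CntTrue, CntFalse);
  }
  // e.g. an 'add' lane {false, false} -> 2, a 'sub' lane {false, true} -> 1.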
+ /// Go through the instructions in VL and append their operands.
+ void appendOperandsOfVL(ArrayRef<Value *> VL) {
+ assert(!VL.empty() && "Bad VL");
+ assert((empty() || VL.size() == getNumLanes()) &&
+ "Expected same number of lanes");
+ assert(isa<Instruction>(VL[0]) && "Expected instruction");
+ unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+ OpsVec.resize(NumOperands);
+ unsigned NumLanes = VL.size();
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ OpsVec[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
+ // Our tree has just 3 nodes: the root and two operands.
+ // It is therefore trivial to get the APO. We only need to check the
+ // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
+ // RHS operand. The LHS operand of both add and sub is never attached
+        // to an inverse operation in the linearized form, therefore its APO
+ // is false. The RHS is true only if VL[Lane] is an inverse operation.
+
+ // Since operand reordering is performed on groups of commutative
+ // operations or alternating sequences (e.g., +, -), we can safely
+ // tell the inverse operations by checking commutativity.
+ bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+ bool APO = (OpIdx == 0) ? false : IsInverseOperation;
+ OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
+ APO, false};
+ }
+ }
+ }
+
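The APO rule on its own fits in a few lines; a hedged sketch that uses a bare bool for commutativity instead of LLVM's isCommutative():

  // Linearized form: '+' is commutative, '-' is its inverse. The LHS (OpIdx 0)
  // is never attached to an inverse operation, so its APO is always false; the
  // RHS of a non-commutative op gets APO = true.
  bool operandAPO(bool OpIsCommutative, unsigned OpIdx) {
    bool IsInverseOperation = !OpIsCommutative;
    return OpIdx == 0 ? false : IsInverseOperation;
  }
  // operandAPO(/*'+'*/ true, 1)  == false
  // operandAPO(/*'-'*/ false, 1) == true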
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return OpsVec.size(); }
+
+ /// \returns the number of lanes.
+ unsigned getNumLanes() const { return OpsVec[0].size(); }
+
+ /// \returns the operand value at \p OpIdx and \p Lane.
+ Value *getValue(unsigned OpIdx, unsigned Lane) const {
+ return getData(OpIdx, Lane).V;
+ }
+
+ /// \returns true if the data structure is empty.
+ bool empty() const { return OpsVec.empty(); }
+
+ /// Clears the data.
+ void clear() { OpsVec.clear(); }
+
+ /// \Returns true if there are enough operands identical to \p Op to fill
+ /// the whole vector.
+    /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
+ bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ bool OpAPO = getData(OpIdx, Lane).APO;
+ for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
+ if (Ln == Lane)
+ continue;
+ // This is set to true if we found a candidate for broadcast at Lane.
+ bool FoundCandidate = false;
+ for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
+ OperandData &Data = getData(OpI, Ln);
+ if (Data.APO != OpAPO || Data.IsUsed)
+ continue;
+ if (Data.V == Op) {
+ FoundCandidate = true;
+ Data.IsUsed = true;
+ break;
+ }
+ }
+ if (!FoundCandidate)
+ return false;
+ }
+ return true;
+ }
+
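A simplified, library-free version of the same broadcast check; ToyOperand and shouldBroadcastToy are made-up names, and plain ints stand in for Value pointers:

  #include <vector>

  struct ToyOperand { int V; bool APO; bool IsUsed; };

  // True if every other lane has an unused operand equal to Op with the same
  // APO, i.e. Op could fill the whole vector as a splat.
  bool shouldBroadcastToy(std::vector<std::vector<ToyOperand>> &Ops, // [OpIdx][Lane]
                          int Op, bool OpAPO, unsigned Lane) {
    unsigned NumLanes = Ops.empty() ? 0 : Ops[0].size();
    for (unsigned Ln = 0; Ln != NumLanes; ++Ln) {
      if (Ln == Lane)
        continue;
      bool Found = false;
      for (auto &PerOpIdx : Ops) {
        ToyOperand &Data = PerOpIdx[Ln];
        if (Data.APO != OpAPO || Data.IsUsed || Data.V != Op)
          continue;
        Data.IsUsed = true; // reserve it, exactly like the real code
        Found = true;
        break;
      }
      if (!Found)
        return false;
    }
    return true;
  }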
+ public:
+ /// Initialize with all the operands of the instruction vector \p RootVL.
+ VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
+ ScalarEvolution &SE, const BoUpSLP &R)
+ : DL(DL), SE(SE), R(R) {
+ // Append all the operands of RootVL.
+ appendOperandsOfVL(RootVL);
+ }
+
+ /// \Returns a value vector with the operands across all lanes for the
+    /// operand at \p OpIdx.
+ ValueList getVL(unsigned OpIdx) const {
+ ValueList OpVL(OpsVec[OpIdx].size());
+ assert(OpsVec[OpIdx].size() == getNumLanes() &&
+ "Expected same num of lanes across all operands");
+ for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
+ OpVL[Lane] = OpsVec[OpIdx][Lane].V;
+ return OpVL;
+ }
+
+ // Performs operand reordering for 2 or more operands.
+ // The original operands are in OrigOps[OpIdx][Lane].
+ // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
+ void reorder() {
+ unsigned NumOperands = getNumOperands();
+ unsigned NumLanes = getNumLanes();
+ // Each operand has its own mode. We are using this mode to help us select
+ // the instructions for each lane, so that they match best with the ones
+ // we have selected so far.
+ SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
+
+ // This is a greedy single-pass algorithm. We are going over each lane
+ // once and deciding on the best order right away with no back-tracking.
+ // However, in order to increase its effectiveness, we start with the lane
+ // that has operands that can move the least. For example, given the
+ // following lanes:
+ // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
+ // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
+ // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
+ // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
+ // we will start at Lane 1, since the operands of the subtraction cannot
+ // be reordered. Then we will visit the rest of the lanes in a circular
+ // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
+
+ // Find the first lane that we will start our search from.
+ unsigned FirstLane = getBestLaneToStartReordering();
+
+ // Initialize the modes.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ Value *OpLane0 = getValue(OpIdx, FirstLane);
+ // Keep track if we have instructions with all the same opcode on one
+ // side.
+ if (isa<LoadInst>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Load;
+ else if (isa<Instruction>(OpLane0)) {
+ // Check if OpLane0 should be broadcast.
+ if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ ReorderingModes[OpIdx] = ReorderingMode::Opcode;
+ }
+ else if (isa<Constant>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Constant;
+ else if (isa<Argument>(OpLane0))
+ // Our best hope is a Splat. It may save some cost in some cases.
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ // NOTE: This should be unreachable.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ }
+
+ // If the initial strategy fails for any of the operand indexes, then we
+ // perform reordering again in a second pass. This helps avoid assigning
+ // high priority to the failed strategy, and should improve reordering for
+ // the non-failed operand indexes.
+ for (int Pass = 0; Pass != 2; ++Pass) {
+ // Skip the second pass if the first pass did not fail.
+ bool StrategyFailed = false;
+ // Mark all operand data as free to use.
+ clearUsed();
+ // We keep the original operand order for the FirstLane, so reorder the
+ // rest of the lanes. We are visiting the nodes in a circular fashion,
+ // using FirstLane as the center point and increasing the radius
+ // distance.
+ for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
+ // Visit the lane on the right and then the lane on the left.
+ for (int Direction : {+1, -1}) {
+ int Lane = FirstLane + Direction * Distance;
+ if (Lane < 0 || Lane >= (int)NumLanes)
+ continue;
+ int LastLane = Lane - Direction;
+ assert(LastLane >= 0 && LastLane < (int)NumLanes &&
+ "Out of bounds");
+ // Look for a good match for each operand.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ // Search for the operand that matches SortedOps[OpIdx][Lane-1].
+ Optional<unsigned> BestIdx =
+ getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
+ // By not selecting a value, we allow the operands that follow to
+ // select a better matching value. We will get a non-null value in
+ // the next run of getBestOperand().
+ if (BestIdx) {
+ // Swap the current operand with the one returned by
+ // getBestOperand().
+ swap(OpIdx, BestIdx.getValue(), Lane);
+ } else {
+ // We failed to find a best operand, set mode to 'Failed'.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ // Enable the second pass.
+ StrategyFailed = true;
+ }
+ }
+ }
+ }
+ // Skip second pass if the strategy did not fail.
+ if (!StrategyFailed)
+ break;
+ }
+ }
+
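The circular visiting order is easiest to see in isolation; a small hypothetical helper that only prints the order the Distance/Direction loops above would follow:

  #include <cstdio>
  #include <initializer_list>

  // Starting from FirstLane, visit lanes at increasing distance, right side
  // first, mirroring the Distance/Direction loops in reorder().
  void printVisitOrder(int FirstLane, int NumLanes) {
    std::printf("start: %d\n", FirstLane);
    for (int Distance = 1; Distance != NumLanes; ++Distance)
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= NumLanes)
          continue;
        std::printf("visit: %d (prev %d)\n", Lane, Lane - Direction);
      }
  }
  // printVisitOrder(1, 4) visits lanes 2, 0, 3, matching the example above.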
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
+ switch (RMode) {
+ case ReorderingMode::Load:
+ return "Load";
+ case ReorderingMode::Opcode:
+ return "Opcode";
+ case ReorderingMode::Constant:
+ return "Constant";
+ case ReorderingMode::Splat:
+ return "Splat";
+ case ReorderingMode::Failed:
+ return "Failed";
+ }
+ llvm_unreachable("Unimplemented Reordering Type");
+ }
+
+ LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
+ raw_ostream &OS) {
+ return OS << getModeStr(RMode);
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
+ printMode(RMode, dbgs());
+ }
+
+ friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
+ return printMode(RMode, OS);
+ }
+
+ LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
+ const unsigned Indent = 2;
+ unsigned Cnt = 0;
+ for (const OperandDataVec &OpDataVec : OpsVec) {
+ OS << "Operand " << Cnt++ << "\n";
+ for (const OperandData &OpData : OpDataVec) {
+ OS.indent(Indent) << "{";
+ if (Value *V = OpData.V)
+ OS << *V;
+ else
+ OS << "null";
+ OS << ", APO:" << OpData.APO << "}\n";
+ }
+ OS << "\n";
+ }
+ return OS;
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
+ };
+
+ /// Checks if the instruction is marked for deletion.
+ bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
+
+  /// Marks the operands of the given values for later deletion by replacing
+  /// them with Undefs.
+ void eraseInstructions(ArrayRef<Value *> AV);
+
+ ~BoUpSLP();
+
+private:
+  /// Checks if all users of \p I are part of the vectorization tree.
+ bool areAllUsersVectorized(Instruction *I) const;
+
+ /// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(TreeEntry *E);
-
- /// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
- const EdgeInfo &EI);
-
- /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
- /// be vectorized to use the original vector (or aggregate "bitcast" to a
- /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
- /// returns false, setting \p CurrentOrder to either an empty vector or a
-  /// non-identity permutation that allows reusing extract instructions.
- bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const;
-
- /// Vectorize a single entry in the tree.
- Value *vectorizeTree(TreeEntry *E);
-
- /// Vectorize a single entry in the tree, starting in \p VL.
- Value *vectorizeTree(ArrayRef<Value *> VL);
-
- /// \returns the scalarization cost for this type. Scalarization in this
- /// context means the creation of vectors from a group of scalars.
+
+ /// This is the recursive part of buildTree.
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
+ const EdgeInfo &EI);
+
+ /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
+ /// returns false, setting \p CurrentOrder to either an empty vector or a
+  /// non-identity permutation that allows reusing extract instructions.
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const;
+
+ /// Vectorize a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
+
+ /// Vectorize a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
+
+ /// \returns the scalarization cost for this type. Scalarization in this
+ /// context means the creation of vectors from a group of scalars.
InstructionCost
getGatherCost(FixedVectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const;
-
- /// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to extract the values from the
- /// roots. This method calculates the cost of extracting the values.
+
+ /// \returns the scalarization cost for this list of values. Assuming that
+ /// this subtree gets vectorized, we may need to extract the values from the
+ /// roots. This method calculates the cost of extracting the values.
InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
-
- /// Set the Builder insert point to one after the last instruction in
- /// the bundle
- void setInsertPointAfterBundle(TreeEntry *E);
-
- /// \returns a vector from a collection of scalars in \p VL.
+
+ /// Set the Builder insert point to one after the last instruction in
+ /// the bundle
+ void setInsertPointAfterBundle(TreeEntry *E);
+
+ /// \returns a vector from a collection of scalars in \p VL.
Value *gather(ArrayRef<Value *> VL);
-
- /// \returns whether the VectorizableTree is fully vectorizable and will
-  /// be beneficial even if the tree height is tiny.
- bool isFullyVectorizableTinyTree() const;
-
- /// Reorder commutative or alt operands to get better probability of
- /// generating vectorized code.
- static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const DataLayout &DL,
- ScalarEvolution &SE,
- const BoUpSLP &R);
- struct TreeEntry {
- using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
- TreeEntry(VecTreeTy &Container) : Container(Container) {}
-
- /// \returns true if the scalars in VL are equal to this entry.
- bool isSame(ArrayRef<Value *> VL) const {
- if (VL.size() == Scalars.size())
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
- return VL.size() == ReuseShuffleIndices.size() &&
- std::equal(
- VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
- [this](Value *V, int Idx) { return V == Scalars[Idx]; });
- }
-
- /// A vector of scalars.
- ValueList Scalars;
-
- /// The Scalars are vectorized into this value. It is initialized to Null.
- Value *VectorizedValue = nullptr;
-
+
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+  /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree() const;
+
+ /// Reorder commutative or alt operands to get better probability of
+ /// generating vectorized code.
+ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ const DataLayout &DL,
+ ScalarEvolution &SE,
+ const BoUpSLP &R);
+ struct TreeEntry {
+ using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
+ TreeEntry(VecTreeTy &Container) : Container(Container) {}
+
+ /// \returns true if the scalars in VL are equal to this entry.
+ bool isSame(ArrayRef<Value *> VL) const {
+ if (VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == ReuseShuffleIndices.size() &&
+ std::equal(
+ VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
+ [this](Value *V, int Idx) { return V == Scalars[Idx]; });
+ }
+
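As a side note, the reuse-shuffle comparison above boils down to the following check; isSameToy is a hypothetical helper operating on toy int scalars:

  #include <algorithm>
  #include <cstddef>
  #include <vector>

  // VL matches the entry if it is either the scalar list itself or the scalar
  // list replayed through ReuseShuffleIndices.
  bool isSameToy(const std::vector<int> &Scalars,
                 const std::vector<int> &ReuseShuffleIndices,
                 const std::vector<int> &VL) {
    if (VL.size() == Scalars.size())
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    if (VL.size() != ReuseShuffleIndices.size())
      return false;
    for (std::size_t I = 0; I != VL.size(); ++I)
      if (VL[I] != Scalars[ReuseShuffleIndices[I]])
        return false;
    return true;
  }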
+ /// A vector of scalars.
+ ValueList Scalars;
+
+ /// The Scalars are vectorized into this value. It is initialized to Null.
+ Value *VectorizedValue = nullptr;
+
/// Do we need to gather this sequence or vectorize it
/// (either with vector instruction or with scatter/gather
/// intrinsics for store/load)?
enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
- EntryState State;
-
- /// Does this sequence require some shuffling?
- SmallVector<int, 4> ReuseShuffleIndices;
-
- /// Does this entry require reordering?
+ EntryState State;
+
+ /// Does this sequence require some shuffling?
+ SmallVector<int, 4> ReuseShuffleIndices;
+
+ /// Does this entry require reordering?
SmallVector<unsigned, 4> ReorderIndices;
-
- /// Points back to the VectorizableTree.
- ///
-    /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
- /// to be a pointer and needs to be able to initialize the child iterator.
- /// Thus we need a reference back to the container to translate the indices
- /// to entries.
- VecTreeTy &Container;
-
- /// The TreeEntry index containing the user of this entry. We can actually
- /// have multiple users so the data structure is not truly a tree.
- SmallVector<EdgeInfo, 1> UserTreeIndices;
-
- /// The index of this treeEntry in VectorizableTree.
- int Idx = -1;
-
- private:
- /// The operands of each instruction in each lane Operands[op_index][lane].
- /// Note: This helps avoid the replication of the code that performs the
- /// reordering of operands during buildTree_rec() and vectorizeTree().
- SmallVector<ValueList, 2> Operands;
-
- /// The main/alternate instruction.
- Instruction *MainOp = nullptr;
- Instruction *AltOp = nullptr;
-
- public:
- /// Set this bundle's \p OpIdx'th operand to \p OpVL.
- void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
- if (Operands.size() < OpIdx + 1)
- Operands.resize(OpIdx + 1);
- assert(Operands[OpIdx].size() == 0 && "Already resized?");
- Operands[OpIdx].resize(Scalars.size());
- for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
- Operands[OpIdx][Lane] = OpVL[Lane];
- }
-
- /// Set the operands of this bundle in their original order.
- void setOperandsInOrder() {
- assert(Operands.empty() && "Already initialized?");
- auto *I0 = cast<Instruction>(Scalars[0]);
- Operands.resize(I0->getNumOperands());
- unsigned NumLanes = Scalars.size();
- for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
- OpIdx != NumOperands; ++OpIdx) {
- Operands[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- auto *I = cast<Instruction>(Scalars[Lane]);
- assert(I->getNumOperands() == NumOperands &&
- "Expected same number of operands");
- Operands[OpIdx][Lane] = I->getOperand(OpIdx);
- }
- }
- }
-
- /// \returns the \p OpIdx operand of this TreeEntry.
- ValueList &getOperand(unsigned OpIdx) {
- assert(OpIdx < Operands.size() && "Off bounds");
- return Operands[OpIdx];
- }
-
- /// \returns the number of operands.
- unsigned getNumOperands() const { return Operands.size(); }
-
- /// \return the single \p OpIdx operand.
- Value *getSingleOperand(unsigned OpIdx) const {
- assert(OpIdx < Operands.size() && "Off bounds");
- assert(!Operands[OpIdx].empty() && "No operand available");
- return Operands[OpIdx][0];
- }
-
- /// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const {
- return getOpcode() != getAltOpcode();
- }
-
- bool isOpcodeOrAlt(Instruction *I) const {
- unsigned CheckedOpcode = I->getOpcode();
- return (getOpcode() == CheckedOpcode ||
- getAltOpcode() == CheckedOpcode);
- }
-
- /// Chooses the correct key for scheduling data. If \p Op has the same (or
- /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
- /// \p OpValue.
- Value *isOneOf(Value *Op) const {
- auto *I = dyn_cast<Instruction>(Op);
- if (I && isOpcodeOrAlt(I))
- return Op;
- return MainOp;
- }
-
- void setOperations(const InstructionsState &S) {
- MainOp = S.MainOp;
- AltOp = S.AltOp;
- }
-
- Instruction *getMainOp() const {
- return MainOp;
- }
-
- Instruction *getAltOp() const {
- return AltOp;
- }
-
- /// The main/alternate opcodes for the list of instructions.
- unsigned getOpcode() const {
- return MainOp ? MainOp->getOpcode() : 0;
- }
-
- unsigned getAltOpcode() const {
- return AltOp ? AltOp->getOpcode() : 0;
- }
-
- /// Update operations state of this entry if reorder occurred.
- bool updateStateIfReorder() {
- if (ReorderIndices.empty())
- return false;
- InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
- setOperations(S);
- return true;
- }
-
-#ifndef NDEBUG
- /// Debug printer.
- LLVM_DUMP_METHOD void dump() const {
- dbgs() << Idx << ".\n";
- for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
- dbgs() << "Operand " << OpI << ":\n";
- for (const Value *V : Operands[OpI])
- dbgs().indent(2) << *V << "\n";
- }
- dbgs() << "Scalars: \n";
- for (Value *V : Scalars)
- dbgs().indent(2) << *V << "\n";
- dbgs() << "State: ";
- switch (State) {
- case Vectorize:
- dbgs() << "Vectorize\n";
- break;
+
+ /// Points back to the VectorizableTree.
+ ///
+    /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ VecTreeTy &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+ /// have multiple users so the data structure is not truly a tree.
+ SmallVector<EdgeInfo, 1> UserTreeIndices;
+
+ /// The index of this treeEntry in VectorizableTree.
+ int Idx = -1;
+
+ private:
+ /// The operands of each instruction in each lane Operands[op_index][lane].
+ /// Note: This helps avoid the replication of the code that performs the
+ /// reordering of operands during buildTree_rec() and vectorizeTree().
+ SmallVector<ValueList, 2> Operands;
+
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ public:
+ /// Set this bundle's \p OpIdx'th operand to \p OpVL.
+ void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
+ if (Operands.size() < OpIdx + 1)
+ Operands.resize(OpIdx + 1);
+ assert(Operands[OpIdx].size() == 0 && "Already resized?");
+ Operands[OpIdx].resize(Scalars.size());
+ for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
+ Operands[OpIdx][Lane] = OpVL[Lane];
+ }
+
+ /// Set the operands of this bundle in their original order.
+ void setOperandsInOrder() {
+ assert(Operands.empty() && "Already initialized?");
+ auto *I0 = cast<Instruction>(Scalars[0]);
+ Operands.resize(I0->getNumOperands());
+ unsigned NumLanes = Scalars.size();
+ for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx) {
+ Operands[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ auto *I = cast<Instruction>(Scalars[Lane]);
+ assert(I->getNumOperands() == NumOperands &&
+ "Expected same number of operands");
+ Operands[OpIdx][Lane] = I->getOperand(OpIdx);
+ }
+ }
+ }
+
+ /// \returns the \p OpIdx operand of this TreeEntry.
+ ValueList &getOperand(unsigned OpIdx) {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ return Operands[OpIdx];
+ }
+
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return Operands.size(); }
+
+ /// \return the single \p OpIdx operand.
+ Value *getSingleOperand(unsigned OpIdx) const {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ assert(!Operands[OpIdx].empty() && "No operand available");
+ return Operands[OpIdx][0];
+ }
+
+ /// Some of the instructions in the list have alternate opcodes.
+ bool isAltShuffle() const {
+ return getOpcode() != getAltOpcode();
+ }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return (getOpcode() == CheckedOpcode ||
+ getAltOpcode() == CheckedOpcode);
+ }
+
+ /// Chooses the correct key for scheduling data. If \p Op has the same (or
+ /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
+ /// \p OpValue.
+ Value *isOneOf(Value *Op) const {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && isOpcodeOrAlt(I))
+ return Op;
+ return MainOp;
+ }
+
+ void setOperations(const InstructionsState &S) {
+ MainOp = S.MainOp;
+ AltOp = S.AltOp;
+ }
+
+ Instruction *getMainOp() const {
+ return MainOp;
+ }
+
+ Instruction *getAltOp() const {
+ return AltOp;
+ }
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
+
+ /// Update operations state of this entry if reorder occurred.
+ bool updateStateIfReorder() {
+ if (ReorderIndices.empty())
+ return false;
+ InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
+ setOperations(S);
+ return true;
+ }
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dump() const {
+ dbgs() << Idx << ".\n";
+ for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
+ dbgs() << "Operand " << OpI << ":\n";
+ for (const Value *V : Operands[OpI])
+ dbgs().indent(2) << *V << "\n";
+ }
+ dbgs() << "Scalars: \n";
+ for (Value *V : Scalars)
+ dbgs().indent(2) << *V << "\n";
+ dbgs() << "State: ";
+ switch (State) {
+ case Vectorize:
+ dbgs() << "Vectorize\n";
+ break;
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
- case NeedToGather:
- dbgs() << "NeedToGather\n";
- break;
- }
- dbgs() << "MainOp: ";
- if (MainOp)
- dbgs() << *MainOp << "\n";
- else
- dbgs() << "NULL\n";
- dbgs() << "AltOp: ";
- if (AltOp)
- dbgs() << *AltOp << "\n";
- else
- dbgs() << "NULL\n";
- dbgs() << "VectorizedValue: ";
- if (VectorizedValue)
- dbgs() << *VectorizedValue << "\n";
- else
- dbgs() << "NULL\n";
- dbgs() << "ReuseShuffleIndices: ";
- if (ReuseShuffleIndices.empty())
+ case NeedToGather:
+ dbgs() << "NeedToGather\n";
+ break;
+ }
+ dbgs() << "MainOp: ";
+ if (MainOp)
+ dbgs() << *MainOp << "\n";
+ else
+ dbgs() << "NULL\n";
+ dbgs() << "AltOp: ";
+ if (AltOp)
+ dbgs() << *AltOp << "\n";
+ else
+ dbgs() << "NULL\n";
+ dbgs() << "VectorizedValue: ";
+ if (VectorizedValue)
+ dbgs() << *VectorizedValue << "\n";
+ else
+ dbgs() << "NULL\n";
+ dbgs() << "ReuseShuffleIndices: ";
+ if (ReuseShuffleIndices.empty())
dbgs() << "Empty";
- else
- for (unsigned ReuseIdx : ReuseShuffleIndices)
- dbgs() << ReuseIdx << ", ";
- dbgs() << "\n";
- dbgs() << "ReorderIndices: ";
- for (unsigned ReorderIdx : ReorderIndices)
- dbgs() << ReorderIdx << ", ";
- dbgs() << "\n";
- dbgs() << "UserTreeIndices: ";
- for (const auto &EInfo : UserTreeIndices)
- dbgs() << EInfo << ", ";
- dbgs() << "\n";
- }
-#endif
- };
-
+ else
+ for (unsigned ReuseIdx : ReuseShuffleIndices)
+ dbgs() << ReuseIdx << ", ";
+ dbgs() << "\n";
+ dbgs() << "ReorderIndices: ";
+ for (unsigned ReorderIdx : ReorderIndices)
+ dbgs() << ReorderIdx << ", ";
+ dbgs() << "\n";
+ dbgs() << "UserTreeIndices: ";
+ for (const auto &EInfo : UserTreeIndices)
+ dbgs() << EInfo << ", ";
+ dbgs() << "\n";
+ }
+#endif
+ };
+
#ifndef NDEBUG
void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost,
InstructionCost VecCost,
@@ -1779,12 +1779,12 @@ private:
}
#endif
- /// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
- const InstructionsState &S,
- const EdgeInfo &UserTreeIdx,
- ArrayRef<unsigned> ReuseShuffleIndices = None,
- ArrayRef<unsigned> ReorderIndices = None) {
+ /// Create a new VectorizableTree entry.
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
+ const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
TreeEntry::EntryState EntryState =
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
@@ -1801,1097 +1801,1097 @@ private:
assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
(Bundle && EntryState != TreeEntry::NeedToGather)) &&
"Need to vectorize gather entry?");
- VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
- TreeEntry *Last = VectorizableTree.back().get();
- Last->Idx = VectorizableTree.size() - 1;
- Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+ VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
+ TreeEntry *Last = VectorizableTree.back().get();
+ Last->Idx = VectorizableTree.size() - 1;
+ Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->State = EntryState;
- Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
- ReuseShuffleIndices.end());
+ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
+ ReuseShuffleIndices.end());
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
- Last->setOperations(S);
+ Last->setOperations(S);
if (Last->State != TreeEntry::NeedToGather) {
for (Value *V : VL) {
assert(!getTreeEntry(V) && "Scalar already in tree!");
ScalarToTreeEntry[V] = Last;
- }
- // Update the scheduler bundle to point to this TreeEntry.
- unsigned Lane = 0;
- for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
- BundleMember = BundleMember->NextInBundle) {
- BundleMember->TE = Last;
- BundleMember->Lane = Lane;
- ++Lane;
- }
- assert((!Bundle.getValue() || Lane == VL.size()) &&
- "Bundle and VL out of sync");
- } else {
- MustGather.insert(VL.begin(), VL.end());
- }
-
- if (UserTreeIdx.UserTE)
- Last->UserTreeIndices.push_back(UserTreeIdx);
-
- return Last;
- }
-
- /// -- Vectorization State --
- /// Holds all of the tree entries.
- TreeEntry::VecTreeTy VectorizableTree;
-
-#ifndef NDEBUG
- /// Debug printer.
- LLVM_DUMP_METHOD void dumpVectorizableTree() const {
- for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
- VectorizableTree[Id]->dump();
- dbgs() << "\n";
- }
- }
-#endif
-
+ }
+ // Update the scheduler bundle to point to this TreeEntry.
+ unsigned Lane = 0;
+ for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ BundleMember->TE = Last;
+ BundleMember->Lane = Lane;
+ ++Lane;
+ }
+ assert((!Bundle.getValue() || Lane == VL.size()) &&
+ "Bundle and VL out of sync");
+ } else {
+ MustGather.insert(VL.begin(), VL.end());
+ }
+
+ if (UserTreeIdx.UserTE)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+
+ return Last;
+ }
+
+ /// -- Vectorization State --
+ /// Holds all of the tree entries.
+ TreeEntry::VecTreeTy VectorizableTree;
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dumpVectorizableTree() const {
+ for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
+ VectorizableTree[Id]->dump();
+ dbgs() << "\n";
+ }
+ }
+#endif
+
TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
-
- const TreeEntry *getTreeEntry(Value *V) const {
+
+ const TreeEntry *getTreeEntry(Value *V) const {
return ScalarToTreeEntry.lookup(V);
- }
-
- /// Maps a specific scalar to its tree entry.
- SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
-
- /// Maps a value to the proposed vectorizable size.
- SmallDenseMap<Value *, unsigned> InstrElementSize;
-
- /// A list of scalars that we found that we need to keep as scalars.
- ValueSet MustGather;
-
- /// This POD struct describes one external user in the vectorized tree.
- struct ExternalUser {
- ExternalUser(Value *S, llvm::User *U, int L)
- : Scalar(S), User(U), Lane(L) {}
-
- // Which scalar in our function.
- Value *Scalar;
-
- // Which user that uses the scalar.
- llvm::User *User;
-
- // Which lane does the scalar belong to.
- int Lane;
- };
- using UserList = SmallVector<ExternalUser, 16>;
-
- /// Checks if two instructions may access the same memory.
- ///
- /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
- /// is invariant in the calling loop.
- bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
- Instruction *Inst2) {
- // First check if the result is already in the cache.
- AliasCacheKey key = std::make_pair(Inst1, Inst2);
- Optional<bool> &result = AliasCache[key];
- if (result.hasValue()) {
- return result.getValue();
- }
- MemoryLocation Loc2 = getLocation(Inst2, AA);
- bool aliased = true;
- if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
- // Do the alias check.
- aliased = AA->alias(Loc1, Loc2);
- }
- // Store the result in the cache.
- result = aliased;
- return aliased;
- }
-
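The caching in isAliased() is plain memoization keyed on the ordered instruction pair; a rough sketch with std::map standing in for DenseMap and a dummy predicate standing in for AA->alias():

  #include <map>
  #include <utility>

  struct ToyInst { int Id; };

  // Memoize a pairwise query so repeated (A, B) lookups skip the expensive
  // alias check, keyed on the ordered pointer pair like AliasCacheKey.
  class ToyAliasCache {
    std::map<std::pair<const ToyInst *, const ToyInst *>, bool> Cache;

    static bool expensiveAliasCheck(const ToyInst *A, const ToyInst *B) {
      return A->Id == B->Id; // stand-in for the real alias query
    }

  public:
    bool isAliased(const ToyInst *A, const ToyInst *B) {
      auto Key = std::make_pair(A, B);
      auto It = Cache.find(Key);
      if (It != Cache.end())
        return It->second;          // cache hit
      bool Aliased = expensiveAliasCheck(A, B);
      Cache[Key] = Aliased;         // store the result for the next query
      return Aliased;
    }
  };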
- using AliasCacheKey = std::pair<Instruction *, Instruction *>;
-
- /// Cache for alias results.
- /// TODO: consider moving this to the AliasAnalysis itself.
- DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
-
- /// Removes an instruction from its block and eventually deletes it.
- /// It's like Instruction::eraseFromParent() except that the actual deletion
- /// is delayed until BoUpSLP is destructed.
- /// This is required to ensure that there are no incorrect collisions in the
- /// AliasCache, which can happen if a new instruction is allocated at the
- /// same address as a previously deleted instruction.
- void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
- auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
- It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
- }
-
- /// Temporary store for deleted instructions. Instructions will be deleted
- /// eventually when the BoUpSLP is destructed.
- DenseMap<Instruction *, bool> DeletedInstructions;
-
- /// A list of values that need to extracted out of the tree.
- /// This list holds pairs of (Internal Scalar : External User). External User
- /// can be nullptr, it means that this Internal Scalar will be used later,
- /// after vectorization.
- UserList ExternalUses;
-
- /// Values used only by @llvm.assume calls.
- SmallPtrSet<const Value *, 32> EphValues;
-
- /// Holds all of the instructions that we gathered.
- SetVector<Instruction *> GatherSeq;
-
- /// A list of blocks that we are going to CSE.
- SetVector<BasicBlock *> CSEBlocks;
-
- /// Contains all scheduling relevant data for an instruction.
- /// A ScheduleData either represents a single instruction or a member of an
- /// instruction bundle (= a group of instructions which is combined into a
- /// vector instruction).
- struct ScheduleData {
- // The initial value for the dependency counters. It means that the
- // dependencies are not calculated yet.
- enum { InvalidDeps = -1 };
-
- ScheduleData() = default;
-
- void init(int BlockSchedulingRegionID, Value *OpVal) {
- FirstInBundle = this;
- NextInBundle = nullptr;
- NextLoadStore = nullptr;
- IsScheduled = false;
- SchedulingRegionID = BlockSchedulingRegionID;
- UnscheduledDepsInBundle = UnscheduledDeps;
- clearDependencies();
- OpValue = OpVal;
- TE = nullptr;
- Lane = -1;
- }
-
- /// Returns true if the dependency information has been calculated.
- bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
-
- /// Returns true for single instructions and for bundle representatives
- /// (= the head of a bundle).
- bool isSchedulingEntity() const { return FirstInBundle == this; }
-
- /// Returns true if it represents an instruction bundle and not only a
- /// single instruction.
- bool isPartOfBundle() const {
- return NextInBundle != nullptr || FirstInBundle != this;
- }
-
- /// Returns true if it is ready for scheduling, i.e. it has no more
- /// unscheduled depending instructions/bundles.
- bool isReady() const {
- assert(isSchedulingEntity() &&
- "can't consider non-scheduling entity for ready list");
- return UnscheduledDepsInBundle == 0 && !IsScheduled;
- }
-
- /// Modifies the number of unscheduled dependencies, also updating it for
- /// the whole bundle.
- int incrementUnscheduledDeps(int Incr) {
- UnscheduledDeps += Incr;
- return FirstInBundle->UnscheduledDepsInBundle += Incr;
- }
-
- /// Sets the number of unscheduled dependencies to the number of
- /// dependencies.
- void resetUnscheduledDeps() {
- incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
- }
-
- /// Clears all dependency information.
- void clearDependencies() {
- Dependencies = InvalidDeps;
- resetUnscheduledDeps();
- MemoryDependencies.clear();
- }
-
- void dump(raw_ostream &os) const {
- if (!isSchedulingEntity()) {
- os << "/ " << *Inst;
- } else if (NextInBundle) {
- os << '[' << *Inst;
- ScheduleData *SD = NextInBundle;
- while (SD) {
- os << ';' << *SD->Inst;
- SD = SD->NextInBundle;
- }
- os << ']';
- } else {
- os << *Inst;
- }
- }
-
- Instruction *Inst = nullptr;
-
- /// Points to the head in an instruction bundle (and always to this for
- /// single instructions).
- ScheduleData *FirstInBundle = nullptr;
-
- /// Single linked list of all instructions in a bundle. Null if it is a
- /// single instruction.
- ScheduleData *NextInBundle = nullptr;
-
- /// Single linked list of all memory instructions (e.g. load, store, call)
- /// in the block - until the end of the scheduling region.
- ScheduleData *NextLoadStore = nullptr;
-
- /// The dependent memory instructions.
- /// This list is derived on demand in calculateDependencies().
- SmallVector<ScheduleData *, 4> MemoryDependencies;
-
- /// This ScheduleData is in the current scheduling region if this matches
- /// the current SchedulingRegionID of BlockScheduling.
- int SchedulingRegionID = 0;
-
- /// Used for getting a "good" final ordering of instructions.
- int SchedulingPriority = 0;
-
-    /// The number of dependencies. Consists of the number of users of the
- /// instruction plus the number of dependent memory instructions (if any).
- /// This value is calculated on demand.
- /// If InvalidDeps, the number of dependencies is not calculated yet.
- int Dependencies = InvalidDeps;
-
- /// The number of dependencies minus the number of dependencies of scheduled
- /// instructions. As soon as this is zero, the instruction/bundle gets ready
- /// for scheduling.
- /// Note that this is negative as long as Dependencies is not calculated.
- int UnscheduledDeps = InvalidDeps;
-
- /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
- /// single instructions.
- int UnscheduledDepsInBundle = InvalidDeps;
-
- /// True if this instruction is scheduled (or considered as scheduled in the
- /// dry-run).
- bool IsScheduled = false;
-
- /// Opcode of the current instruction in the schedule data.
- Value *OpValue = nullptr;
-
- /// The TreeEntry that this instruction corresponds to.
- TreeEntry *TE = nullptr;
-
- /// The lane of this node in the TreeEntry.
- int Lane = -1;
- };
-
-#ifndef NDEBUG
- friend inline raw_ostream &operator<<(raw_ostream &os,
- const BoUpSLP::ScheduleData &SD) {
- SD.dump(os);
- return os;
- }
-#endif
-
- friend struct GraphTraits<BoUpSLP *>;
- friend struct DOTGraphTraits<BoUpSLP *>;
-
- /// Contains all scheduling data for a basic block.
- struct BlockScheduling {
- BlockScheduling(BasicBlock *BB)
- : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
-
- void clear() {
- ReadyInsts.clear();
- ScheduleStart = nullptr;
- ScheduleEnd = nullptr;
- FirstLoadStoreInRegion = nullptr;
- LastLoadStoreInRegion = nullptr;
-
- // Reduce the maximum schedule region size by the size of the
- // previous scheduling run.
- ScheduleRegionSizeLimit -= ScheduleRegionSize;
- if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
- ScheduleRegionSizeLimit = MinScheduleRegionSize;
- ScheduleRegionSize = 0;
-
- // Make a new scheduling region, i.e. all existing ScheduleData is not
- // in the new region yet.
- ++SchedulingRegionID;
- }
-
- ScheduleData *getScheduleData(Value *V) {
- ScheduleData *SD = ScheduleDataMap[V];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
- return SD;
- return nullptr;
- }
-
- ScheduleData *getScheduleData(Value *V, Value *Key) {
- if (V == Key)
- return getScheduleData(V);
- auto I = ExtraScheduleDataMap.find(V);
- if (I != ExtraScheduleDataMap.end()) {
- ScheduleData *SD = I->second[Key];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
- return SD;
- }
- return nullptr;
- }
-
- bool isInSchedulingRegion(ScheduleData *SD) const {
- return SD->SchedulingRegionID == SchedulingRegionID;
- }
-
- /// Marks an instruction as scheduled and puts all dependent ready
- /// instructions into the ready-list.
- template <typename ReadyListType>
- void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
- SD->IsScheduled = true;
- LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
-
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
- if (BundleMember->Inst != BundleMember->OpValue) {
- BundleMember = BundleMember->NextInBundle;
- continue;
- }
- // Handle the def-use chain dependencies.
-
- // Decrement the unscheduled counter and insert to ready list if ready.
- auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
- doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
- if (OpDef && OpDef->hasValidDependencies() &&
- OpDef->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after
- // decrementing, so we can put the dependent instruction
- // into the ready list.
- ScheduleData *DepBundle = OpDef->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- LLVM_DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
- }
- });
- };
-
- // If BundleMember is a vector bundle, its operands may have been
-          // reordered during buildTree(). We therefore need to get its operands
- // through the TreeEntry.
- if (TreeEntry *TE = BundleMember->TE) {
- int Lane = BundleMember->Lane;
- assert(Lane >= 0 && "Lane not set");
-
- // Since vectorization tree is being built recursively this assertion
- // ensures that the tree entry has all operands set before reaching
- // this code. Couple of exceptions known at the moment are extracts
-          // this code. A couple of known exceptions at the moment are extracts,
-          // whose second (immediate) operand is not added. Since
-          // immediates do not affect scheduler behavior, this is considered
- auto *In = TE->getMainOp();
- assert(In &&
- (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
- In->getNumOperands() == TE->getNumOperands()) &&
- "Missed TreeEntry operands?");
- (void)In; // fake use to avoid build failure when assertions disabled
-
- for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
- OpIdx != NumOperands; ++OpIdx)
- if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
- DecrUnsched(I);
- } else {
- // If BundleMember is a stand-alone instruction, no operand reordering
- // has taken place, so we directly access its operands.
- for (Use &U : BundleMember->Inst->operands())
- if (auto *I = dyn_cast<Instruction>(U.get()))
- DecrUnsched(I);
- }
- // Handle the memory dependencies.
- for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
- if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after decrementing,
- // so we can put the dependent instruction into the ready list.
- ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- LLVM_DEBUG(dbgs()
- << "SLP: gets ready (mem): " << *DepBundle << "\n");
- }
- }
- BundleMember = BundleMember->NextInBundle;
- }
- }
-
- void doForAllOpcodes(Value *V,
- function_ref<void(ScheduleData *SD)> Action) {
- if (ScheduleData *SD = getScheduleData(V))
- Action(SD);
- auto I = ExtraScheduleDataMap.find(V);
- if (I != ExtraScheduleDataMap.end())
- for (auto &P : I->second)
- if (P.second->SchedulingRegionID == SchedulingRegionID)
- Action(P.second);
- }
-
-    /// Put all instructions that are ready for scheduling into the ReadyList.
- template <typename ReadyListType>
- void initialFillReadyList(ReadyListType &ReadyList) {
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- doForAllOpcodes(I, [&](ScheduleData *SD) {
- if (SD->isSchedulingEntity() && SD->isReady()) {
- ReadyList.insert(SD);
- LLVM_DEBUG(dbgs()
- << "SLP: initially in ready list: " << *I << "\n");
- }
- });
- }
- }
-
- /// Checks if a bundle of instructions can be scheduled, i.e. has no
-    /// cyclic dependencies. This is only a dry-run; no instructions are
- /// actually moved at this stage.
- /// \returns the scheduling bundle. The returned Optional value is non-None
- /// if \p VL is allowed to be scheduled.
- Optional<ScheduleData *>
- tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S);
-
- /// Un-bundles a group of instructions.
- void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
-
- /// Allocates schedule data chunk.
- ScheduleData *allocateScheduleDataChunks();
-
- /// Extends the scheduling region so that V is inside the region.
- /// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V, const InstructionsState &S);
-
- /// Initialize the ScheduleData structures for new instructions in the
- /// scheduling region.
- void initScheduleData(Instruction *FromI, Instruction *ToI,
- ScheduleData *PrevLoadStore,
- ScheduleData *NextLoadStore);
-
- /// Updates the dependency information of a bundle and of all instructions/
- /// bundles which depend on the original bundle.
- void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
- BoUpSLP *SLP);
-
-    /// Sets all instructions in the scheduling region to un-scheduled.
- void resetSchedule();
-
- BasicBlock *BB;
-
- /// Simple memory allocation for ScheduleData.
- std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
-
- /// The size of a ScheduleData array in ScheduleDataChunks.
- int ChunkSize;
-
- /// The allocator position in the current chunk, which is the last entry
- /// of ScheduleDataChunks.
- int ChunkPos;
-
- /// Attaches ScheduleData to Instruction.
- /// Note that the mapping survives during all vectorization iterations, i.e.
- /// ScheduleData structures are recycled.
- DenseMap<Value *, ScheduleData *> ScheduleDataMap;
-
- /// Attaches ScheduleData to Instruction with the leading key.
- DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
- ExtraScheduleDataMap;
-
- struct ReadyList : SmallVector<ScheduleData *, 8> {
- void insert(ScheduleData *SD) { push_back(SD); }
- };
-
- /// The ready-list for scheduling (only used for the dry-run).
- ReadyList ReadyInsts;
-
- /// The first instruction of the scheduling region.
- Instruction *ScheduleStart = nullptr;
-
- /// The first instruction _after_ the scheduling region.
- Instruction *ScheduleEnd = nullptr;
-
- /// The first memory accessing instruction in the scheduling region
- /// (can be null).
- ScheduleData *FirstLoadStoreInRegion = nullptr;
-
- /// The last memory accessing instruction in the scheduling region
- /// (can be null).
- ScheduleData *LastLoadStoreInRegion = nullptr;
-
- /// The current size of the scheduling region.
- int ScheduleRegionSize = 0;
-
- /// The maximum size allowed for the scheduling region.
- int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
-
- /// The ID of the scheduling region. For a new vectorization iteration this
-    /// is incremented, which "removes" all ScheduleData from the region.
- // Make sure that the initial SchedulingRegionID is greater than the
- // initial SchedulingRegionID in ScheduleData (which is 0).
- int SchedulingRegionID = 1;
- };
-
- /// Attaches the BlockScheduling structures to basic blocks.
- MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
-
- /// Performs the "real" scheduling. Done before vectorization is actually
- /// performed in a basic block.
- void scheduleBlock(BlockScheduling *BS);
-
- /// List of users to ignore during scheduling and that don't need extracting.
- ArrayRef<Value *> UserIgnoreList;
-
- /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
- /// sorted SmallVectors of unsigned.
- struct OrdersTypeDenseMapInfo {
- static OrdersType getEmptyKey() {
- OrdersType V;
- V.push_back(~1U);
- return V;
- }
-
- static OrdersType getTombstoneKey() {
- OrdersType V;
- V.push_back(~2U);
- return V;
- }
-
- static unsigned getHashValue(const OrdersType &V) {
- return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
- }
-
- static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
- return LHS == RHS;
- }
- };
-
- /// Contains orders of operations along with the number of bundles that have
- /// operations in this order. It stores only those orders that require
-  /// reordering; if reordering is not required, it is counted using \a
- /// NumOpsWantToKeepOriginalOrder.
- DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
- /// Number of bundles that do not require reordering.
- unsigned NumOpsWantToKeepOriginalOrder = 0;
-
- // Analysis and block reference.
- Function *F;
- ScalarEvolution *SE;
- TargetTransformInfo *TTI;
- TargetLibraryInfo *TLI;
+ }
+
+ /// Maps a specific scalar to its tree entry.
+ SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+
+ /// Maps a value to the proposed vectorizable size.
+ SmallDenseMap<Value *, unsigned> InstrElementSize;
+
+ /// A list of scalars that we found that we need to keep as scalars.
+ ValueSet MustGather;
+
+ /// This POD struct describes one external user in the vectorized tree.
+ struct ExternalUser {
+ ExternalUser(Value *S, llvm::User *U, int L)
+ : Scalar(S), User(U), Lane(L) {}
+
+ // Which scalar in our function.
+ Value *Scalar;
+
+ // Which user that uses the scalar.
+ llvm::User *User;
+
+ // Which lane does the scalar belong to.
+ int Lane;
+ };
+ using UserList = SmallVector<ExternalUser, 16>;
+
+ /// Checks if two instructions may access the same memory.
+ ///
+ /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+ /// is invariant in the calling loop.
+ bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
+ Instruction *Inst2) {
+ // First check if the result is already in the cache.
+ AliasCacheKey key = std::make_pair(Inst1, Inst2);
+ Optional<bool> &result = AliasCache[key];
+ if (result.hasValue()) {
+ return result.getValue();
+ }
+ MemoryLocation Loc2 = getLocation(Inst2, AA);
+ bool aliased = true;
+ if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
+ // Do the alias check.
+ aliased = AA->alias(Loc1, Loc2);
+ }
+ // Store the result in the cache.
+ result = aliased;
+ return aliased;
+ }
+
+ using AliasCacheKey = std::pair<Instruction *, Instruction *>;
+
+ /// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
+ DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
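// Illustrative sketch (not from the source): the caching pattern above, an
// Optional<bool> slot keyed by the instruction pair and filled lazily, shown as a
// self-contained C++17 analogue. All identifiers here are hypothetical, std::map
// and std::optional stand in for LLVM's DenseMap and Optional, and the expensive
// alias query is stubbed out.
#include <map>
#include <optional>
#include <utility>

struct PairQueryCache {
  using Key = std::pair<const void *, const void *>;
  std::map<Key, std::optional<bool>> Cache;

  // Stand-in for the expensive query (AA->alias(Loc1, Loc2) in the real code).
  bool expensiveQuery(const void *A, const void *B) const { return A == B; }

  bool query(const void *A, const void *B) {
    Key K{A, B};
    // operator[] default-constructs an empty optional on first use, mirroring how
    // AliasCache[key] yields an unset Optional<bool> above.
    std::optional<bool> &Slot = Cache[K];
    if (Slot.has_value())
      return *Slot;              // cache hit: no recomputation
    bool Result = expensiveQuery(A, B);
    Slot = Result;               // memoize the answer for later calls
    return Result;
  }
};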
+
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
+ auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
+ It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
+ }
+
+ /// Temporary store for deleted instructions. Instructions will be deleted
+ /// eventually when the BoUpSLP is destructed.
+ DenseMap<Instruction *, bool> DeletedInstructions;
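// Illustrative sketch (hypothetical names, std containers, nodes assumed to be
// heap-allocated): the deferred-deletion idea behind eraseInstruction() and
// DeletedInstructions. Nodes are only recorded for deletion, so their addresses
// stay unique until the owner is destroyed, which is what keeps pointer-keyed
// caches such as AliasCache free of accidental key reuse.
#include <unordered_map>

struct Node { int Id = 0; };

class DeferredDeleter {
  // Node -> "replace its operands with undef before deleting".
  std::unordered_map<Node *, bool> Pending;

public:
  void markForDeletion(Node *N, bool ReplaceOps = false) {
    auto It = Pending.emplace(N, ReplaceOps).first;
    // If the node was already marked, keep the flag true only when every request
    // asked for it (mirrors the logical-and update above).
    It->second = It->second && ReplaceOps;
  }

  ~DeferredDeleter() {
    for (auto &P : Pending)
      delete P.first; // the actual deletion happens only here, at the very end
  }
};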
+
+ /// A list of values that need to extracted out of the tree.
+ /// This list holds pairs of (Internal Scalar : External User). External User
+ /// can be nullptr, it means that this Internal Scalar will be used later,
+ /// after vectorization.
+ UserList ExternalUses;
+
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
+
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> GatherSeq;
+
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> CSEBlocks;
+
+ /// Contains all scheduling relevant data for an instruction.
+ /// A ScheduleData either represents a single instruction or a member of an
+ /// instruction bundle (= a group of instructions which is combined into a
+ /// vector instruction).
+ struct ScheduleData {
+ // The initial value for the dependency counters. It means that the
+ // dependencies are not calculated yet.
+ enum { InvalidDeps = -1 };
+
+ ScheduleData() = default;
+
+ void init(int BlockSchedulingRegionID, Value *OpVal) {
+ FirstInBundle = this;
+ NextInBundle = nullptr;
+ NextLoadStore = nullptr;
+ IsScheduled = false;
+ SchedulingRegionID = BlockSchedulingRegionID;
+ UnscheduledDepsInBundle = UnscheduledDeps;
+ clearDependencies();
+ OpValue = OpVal;
+ TE = nullptr;
+ Lane = -1;
+ }
+
+ /// Returns true if the dependency information has been calculated.
+ bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
+
+ /// Returns true for single instructions and for bundle representatives
+ /// (= the head of a bundle).
+ bool isSchedulingEntity() const { return FirstInBundle == this; }
+
+ /// Returns true if it represents an instruction bundle and not only a
+ /// single instruction.
+ bool isPartOfBundle() const {
+ return NextInBundle != nullptr || FirstInBundle != this;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled depending instructions/bundles.
+ bool isReady() const {
+ assert(isSchedulingEntity() &&
+ "can't consider non-scheduling entity for ready list");
+ return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ }
+
+ /// Modifies the number of unscheduled dependencies, also updating it for
+ /// the whole bundle.
+ int incrementUnscheduledDeps(int Incr) {
+ UnscheduledDeps += Incr;
+ return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() {
+ incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
+ MemoryDependencies.clear();
+ }
+
+ void dump(raw_ostream &os) const {
+ if (!isSchedulingEntity()) {
+ os << "/ " << *Inst;
+ } else if (NextInBundle) {
+ os << '[' << *Inst;
+ ScheduleData *SD = NextInBundle;
+ while (SD) {
+ os << ';' << *SD->Inst;
+ SD = SD->NextInBundle;
+ }
+ os << ']';
+ } else {
+ os << *Inst;
+ }
+ }
+
+ Instruction *Inst = nullptr;
+
+ /// Points to the head in an instruction bundle (and always to this for
+ /// single instructions).
+ ScheduleData *FirstInBundle = nullptr;
+
+    /// Singly linked list of all instructions in a bundle. Null if it is a
+ /// single instruction.
+ ScheduleData *NextInBundle = nullptr;
+
+    /// Singly linked list of all memory instructions (e.g. load, store, call)
+ /// in the block - until the end of the scheduling region.
+ ScheduleData *NextLoadStore = nullptr;
+
+ /// The dependent memory instructions.
+ /// This list is derived on demand in calculateDependencies().
+ SmallVector<ScheduleData *, 4> MemoryDependencies;
+
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID = 0;
+
+ /// Used for getting a "good" final ordering of instructions.
+ int SchedulingPriority = 0;
+
+    /// The number of dependencies. Consists of the number of users of the
+ /// instruction plus the number of dependent memory instructions (if any).
+ /// This value is calculated on demand.
+ /// If InvalidDeps, the number of dependencies is not calculated yet.
+ int Dependencies = InvalidDeps;
+
+ /// The number of dependencies minus the number of dependencies of scheduled
+ /// instructions. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps = InvalidDeps;
+
+ /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
+ /// single instructions.
+ int UnscheduledDepsInBundle = InvalidDeps;
+
+ /// True if this instruction is scheduled (or considered as scheduled in the
+ /// dry-run).
+ bool IsScheduled = false;
+
+ /// Opcode of the current instruction in the schedule data.
+ Value *OpValue = nullptr;
+
+ /// The TreeEntry that this instruction corresponds to.
+ TreeEntry *TE = nullptr;
+
+ /// The lane of this node in the TreeEntry.
+ int Lane = -1;
+ };
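// Illustrative toy model (standalone, hypothetical names): the counter protocol of
// ScheduleData. UnscheduledDeps tracks dependencies that are not yet scheduled, the
// bundle head aggregates the sum, and a node becomes "ready" exactly when the
// aggregated counter drops to zero.
#include <cassert>

struct ToySD {
  static constexpr int InvalidDeps = -1;
  ToySD *FirstInBundle = this;          // single instruction: its own bundle head
  int Dependencies = InvalidDeps;
  int UnscheduledDeps = InvalidDeps;
  int UnscheduledDepsInBundle = InvalidDeps;
  bool IsScheduled = false;

  int incrementUnscheduledDeps(int Incr) {
    UnscheduledDeps += Incr;
    return FirstInBundle->UnscheduledDepsInBundle += Incr;
  }
  bool isReady() const { return UnscheduledDepsInBundle == 0 && !IsScheduled; }
};

int main() {
  ToySD SD;
  SD.Dependencies = 2;                  // e.g. two producers feed this instruction
  SD.UnscheduledDeps = SD.Dependencies;
  SD.UnscheduledDepsInBundle = SD.UnscheduledDeps;
  SD.incrementUnscheduledDeps(-1);      // first producer got scheduled
  assert(!SD.isReady());
  SD.incrementUnscheduledDeps(-1);      // second producer got scheduled
  assert(SD.isReady());                 // the node can now enter the ready list
  return 0;
}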
+
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+ }
+#endif
+
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
+
+ /// Contains all scheduling data for a basic block.
+ struct BlockScheduling {
+ BlockScheduling(BasicBlock *BB)
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
+
+ void clear() {
+ ReadyInsts.clear();
+ ScheduleStart = nullptr;
+ ScheduleEnd = nullptr;
+ FirstLoadStoreInRegion = nullptr;
+ LastLoadStoreInRegion = nullptr;
+
+ // Reduce the maximum schedule region size by the size of the
+ // previous scheduling run.
+ ScheduleRegionSizeLimit -= ScheduleRegionSize;
+ if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+ ScheduleRegionSizeLimit = MinScheduleRegionSize;
+ ScheduleRegionSize = 0;
+
+ // Make a new scheduling region, i.e. all existing ScheduleData is not
+ // in the new region yet.
+ ++SchedulingRegionID;
+ }
+
+ ScheduleData *getScheduleData(Value *V) {
+ ScheduleData *SD = ScheduleDataMap[V];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ return nullptr;
+ }
+
+ ScheduleData *getScheduleData(Value *V, Value *Key) {
+ if (V == Key)
+ return getScheduleData(V);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end()) {
+ ScheduleData *SD = I->second[Key];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ }
+ return nullptr;
+ }
+
+ bool isInSchedulingRegion(ScheduleData *SD) const {
+ return SD->SchedulingRegionID == SchedulingRegionID;
+ }
+
+ /// Marks an instruction as scheduled and puts all dependent ready
+ /// instructions into the ready-list.
+ template <typename ReadyListType>
+ void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+ SD->IsScheduled = true;
+ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ if (BundleMember->Inst != BundleMember->OpValue) {
+ BundleMember = BundleMember->NextInBundle;
+ continue;
+ }
+ // Handle the def-use chain dependencies.
+
+ // Decrement the unscheduled counter and insert to ready list if ready.
+ auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
+ doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after
+ // decrementing, so we can put the dependent instruction
+ // into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ });
+ };
+
+ // If BundleMember is a vector bundle, its operands may have been
+          // reordered during buildTree(). We therefore need to get its operands
+ // through the TreeEntry.
+ if (TreeEntry *TE = BundleMember->TE) {
+ int Lane = BundleMember->Lane;
+ assert(Lane >= 0 && "Lane not set");
+
+            // Since the vectorization tree is built recursively, this assertion
+            // ensures that the tree entry has all operands set before reaching
+            // this code. A couple of known exceptions are extracts, whose second
+            // (immediate) operand is not added. Since immediates do not affect
+            // scheduler behavior, this is considered okay.
+ auto *In = TE->getMainOp();
+ assert(In &&
+ (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
+ In->getNumOperands() == TE->getNumOperands()) &&
+ "Missed TreeEntry operands?");
+ (void)In; // fake use to avoid build failure when assertions disabled
+
+ for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
+ DecrUnsched(I);
+ } else {
+ // If BundleMember is a stand-alone instruction, no operand reordering
+ // has taken place, so we directly access its operands.
+ for (Use &U : BundleMember->Inst->operands())
+ if (auto *I = dyn_cast<Instruction>(U.get()))
+ DecrUnsched(I);
+ }
+ // Handle the memory dependencies.
+ for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
+ if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
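// Illustrative sketch (hypothetical names): the essence of schedule() above. For
// every node that gets scheduled, each dependent node's unscheduled counter is
// decremented, and a dependent whose counter reaches zero moves to the ready list;
// the real code does this separately for def-use and for memory dependencies.
#include <vector>

struct ToyNode {
  int UnscheduledDeps = 0;
  std::vector<ToyNode *> Dependents;    // nodes that wait on this one
  bool IsScheduled = false;
};

void scheduleNode(ToyNode *N, std::vector<ToyNode *> &ReadyList) {
  N->IsScheduled = true;
  for (ToyNode *Dep : N->Dependents)
    if (--Dep->UnscheduledDeps == 0)    // last outstanding dependency satisfied
      ReadyList.push_back(Dep);         // the dependent becomes schedulable
}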
+
+ void doForAllOpcodes(Value *V,
+ function_ref<void(ScheduleData *SD)> Action) {
+ if (ScheduleData *SD = getScheduleData(V))
+ Action(SD);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end())
+ for (auto &P : I->second)
+ if (P.second->SchedulingRegionID == SchedulingRegionID)
+ Action(P.second);
+ }
+
+    /// Put all instructions that are ready for scheduling into the ReadyList.
+ template <typename ReadyListType>
+ void initialFillReadyList(ReadyListType &ReadyList) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *I << "\n");
+ }
+ });
+ }
+ }
+
+ /// Checks if a bundle of instructions can be scheduled, i.e. has no
+    /// cyclic dependencies. This is only a dry-run; no instructions are
+ /// actually moved at this stage.
+ /// \returns the scheduling bundle. The returned Optional value is non-None
+ /// if \p VL is allowed to be scheduled.
+ Optional<ScheduleData *>
+ tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
+
+ /// Un-bundles a group of instructions.
+ void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+
+ /// Allocates schedule data chunk.
+ ScheduleData *allocateScheduleDataChunks();
+
+ /// Extends the scheduling region so that V is inside the region.
+ /// \returns true if the region size is within the limit.
+ bool extendSchedulingRegion(Value *V, const InstructionsState &S);
+
+ /// Initialize the ScheduleData structures for new instructions in the
+ /// scheduling region.
+ void initScheduleData(Instruction *FromI, Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore);
+
+ /// Updates the dependency information of a bundle and of all instructions/
+ /// bundles which depend on the original bundle.
+ void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+ BoUpSLP *SLP);
+
+ /// Sets all instruction in the scheduling region to un-scheduled.
+    /// Sets all instructions in the scheduling region to un-scheduled.
+
+ BasicBlock *BB;
+
+ /// Simple memory allocation for ScheduleData.
+ std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+
+ /// The size of a ScheduleData array in ScheduleDataChunks.
+ int ChunkSize;
+
+ /// The allocator position in the current chunk, which is the last entry
+ /// of ScheduleDataChunks.
+ int ChunkPos;
+
+ /// Attaches ScheduleData to Instruction.
+ /// Note that the mapping survives during all vectorization iterations, i.e.
+ /// ScheduleData structures are recycled.
+ DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+ /// Attaches ScheduleData to Instruction with the leading key.
+ DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+ ExtraScheduleDataMap;
+
+ struct ReadyList : SmallVector<ScheduleData *, 8> {
+ void insert(ScheduleData *SD) { push_back(SD); }
+ };
+
+ /// The ready-list for scheduling (only used for the dry-run).
+ ReadyList ReadyInsts;
+
+ /// The first instruction of the scheduling region.
+ Instruction *ScheduleStart = nullptr;
+
+ /// The first instruction _after_ the scheduling region.
+ Instruction *ScheduleEnd = nullptr;
+
+ /// The first memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *FirstLoadStoreInRegion = nullptr;
+
+ /// The last memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *LastLoadStoreInRegion = nullptr;
+
+ /// The current size of the scheduling region.
+ int ScheduleRegionSize = 0;
+
+ /// The maximum size allowed for the scheduling region.
+ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
+
+ /// The ID of the scheduling region. For a new vectorization iteration this
+    /// is incremented, which "removes" all ScheduleData from the region.
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ int SchedulingRegionID = 1;
+ };
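// Illustrative sketch (standalone, hypothetical names): the SchedulingRegionID trick
// used by clear() above. Instead of wiping per-instruction data, the region ID is
// bumped, and stale entries are filtered out on lookup by comparing IDs.
#include <unordered_map>

struct ToyData { int RegionID = 0; };

struct ToyRegion {
  // Starts at 1 so that a default-constructed ToyData (RegionID == 0) is never part
  // of the current region, mirroring the comment on SchedulingRegionID above.
  int CurrentRegionID = 1;
  std::unordered_map<int, ToyData> Map; // the key stands in for an instruction

  ToyData &getOrCreate(int Key) {
    ToyData &D = Map[Key];
    D.RegionID = CurrentRegionID;       // claim the entry for the current region
    return D;
  }
  ToyData *lookup(int Key) {
    ToyData &D = Map[Key];
    return D.RegionID == CurrentRegionID ? &D : nullptr; // stale entries ignored
  }
  void startNewRegion() { ++CurrentRegionID; } // "clears" every entry in O(1)
};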
+
+ /// Attaches the BlockScheduling structures to basic blocks.
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+ /// Performs the "real" scheduling. Done before vectorization is actually
+ /// performed in a basic block.
+ void scheduleBlock(BlockScheduling *BS);
+
+ /// List of users to ignore during scheduling and that don't need extracting.
+ ArrayRef<Value *> UserIgnoreList;
+
+ /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+ /// sorted SmallVectors of unsigned.
+ struct OrdersTypeDenseMapInfo {
+ static OrdersType getEmptyKey() {
+ OrdersType V;
+ V.push_back(~1U);
+ return V;
+ }
+
+ static OrdersType getTombstoneKey() {
+ OrdersType V;
+ V.push_back(~2U);
+ return V;
+ }
+
+ static unsigned getHashValue(const OrdersType &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+ return LHS == RHS;
+ }
+ };
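// Illustrative note (not from the source): LLVM's DenseMap keeps keys inline, so a
// custom key type must supply two reserved values, an "empty" marker for unused
// buckets and a "tombstone" marker for erased ones, plus a hash and an equality
// predicate; the four static members above provide exactly that. The one-element
// vectors {~1U} and {~2U} are safe sentinels because a real order only contains
// small lane indices, so neither value can collide with a genuine key.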
+
+ /// Contains orders of operations along with the number of bundles that have
+ /// operations in this order. It stores only those orders that require
+  /// reordering; if reordering is not required, it is counted using \a
+ /// NumOpsWantToKeepOriginalOrder.
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+ /// Number of bundles that do not require reordering.
+ unsigned NumOpsWantToKeepOriginalOrder = 0;
+
+ // Analysis and block reference.
+ Function *F;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ TargetLibraryInfo *TLI;
AAResults *AA;
- LoopInfo *LI;
- DominatorTree *DT;
- AssumptionCache *AC;
- DemandedBits *DB;
- const DataLayout *DL;
- OptimizationRemarkEmitter *ORE;
-
- unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
- unsigned MinVecRegSize; // Set by cl::opt (default: 128).
-
- /// Instruction builder to construct the vectorized tree.
- IRBuilder<> Builder;
-
- /// A map of scalar integer values to the smallest bit width with which they
- /// can legally be represented. The values map to (width, signed) pairs,
- /// where "width" indicates the minimum bit width and "signed" is True if the
- /// value must be signed-extended, rather than zero-extended, back to its
- /// original width.
- MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
-};
-
-} // end namespace slpvectorizer
-
-template <> struct GraphTraits<BoUpSLP *> {
- using TreeEntry = BoUpSLP::TreeEntry;
-
- /// NodeRef has to be a pointer per the GraphWriter.
- using NodeRef = TreeEntry *;
-
- using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
-
- /// Add the VectorizableTree to the index iterator to be able to return
- /// TreeEntry pointers.
- struct ChildIteratorType
- : public iterator_adaptor_base<
- ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
- ContainerTy &VectorizableTree;
-
- ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
- ContainerTy &VT)
- : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
-
- NodeRef operator*() { return I->UserTE; }
- };
-
- static NodeRef getEntryNode(BoUpSLP &R) {
- return R.VectorizableTree[0].get();
- }
-
- static ChildIteratorType child_begin(NodeRef N) {
- return {N->UserTreeIndices.begin(), N->Container};
- }
-
- static ChildIteratorType child_end(NodeRef N) {
- return {N->UserTreeIndices.end(), N->Container};
- }
-
- /// For the node iterator we just need to turn the TreeEntry iterator into a
- /// TreeEntry* iterator so that it dereferences to NodeRef.
- class nodes_iterator {
- using ItTy = ContainerTy::iterator;
- ItTy It;
-
- public:
- nodes_iterator(const ItTy &It2) : It(It2) {}
- NodeRef operator*() { return It->get(); }
- nodes_iterator operator++() {
- ++It;
- return *this;
- }
- bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
- };
-
- static nodes_iterator nodes_begin(BoUpSLP *R) {
- return nodes_iterator(R->VectorizableTree.begin());
- }
-
- static nodes_iterator nodes_end(BoUpSLP *R) {
- return nodes_iterator(R->VectorizableTree.end());
- }
-
- static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
-};
-
-template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
- using TreeEntry = BoUpSLP::TreeEntry;
-
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
-
- std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
- std::string Str;
- raw_string_ostream OS(Str);
- if (isSplat(Entry->Scalars)) {
- OS << "<splat> " << *Entry->Scalars[0];
- return Str;
- }
- for (auto V : Entry->Scalars) {
- OS << *V;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ AssumptionCache *AC;
+ DemandedBits *DB;
+ const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+ unsigned MinVecRegSize; // Set by cl::opt (default: 128).
+
+ /// Instruction builder to construct the vectorized tree.
+ IRBuilder<> Builder;
+
+ /// A map of scalar integer values to the smallest bit width with which they
+ /// can legally be represented. The values map to (width, signed) pairs,
+ /// where "width" indicates the minimum bit width and "signed" is True if the
+ /// value must be signed-extended, rather than zero-extended, back to its
+ /// original width.
+ MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+};
+
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ using TreeEntry = BoUpSLP::TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ using NodeRef = TreeEntry *;
+
+ using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
+
+ /// Add the VectorizableTree to the index iterator to be able to return
+ /// TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<
+ ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
+ ContainerTy &VectorizableTree;
+
+ ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
+ ContainerTy &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return I->UserTE; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) {
+ return R.VectorizableTree[0].get();
+ }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ class nodes_iterator {
+ using ItTy = ContainerTy::iterator;
+ ItTy It;
+
+ public:
+ nodes_iterator(const ItTy &It2) : It(It2) {}
+ NodeRef operator*() { return It->get(); }
+ nodes_iterator operator++() {
+ ++It;
+ return *this;
+ }
+ bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
+ };
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
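// Illustrative note (hedged, not from the source): specializing GraphTraits for
// BoUpSLP* is what lets generic LLVM graph utilities walk the vectorization tree
// without knowing its layout. A depth-first visit could be written roughly as
//
//   for (BoUpSLP::TreeEntry *TE : llvm::depth_first(&R)) // R is a BoUpSLP
//     visit(TE);
//
// with llvm::depth_first (llvm/ADT/DepthFirstIterator.h) resolving getEntryNode(),
// child_begin() and child_end() through this specialization; the DOTGraphTraits
// specialization below plays the same role for the generic DOT graph printer.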
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ using TreeEntry = BoUpSLP::TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
return EU.Scalar == V;
}))
- OS << " <extract>";
- OS << "\n";
- }
- return Str;
- }
-
- static std::string getNodeAttributes(const TreeEntry *Entry,
- const BoUpSLP *) {
- if (Entry->State == TreeEntry::NeedToGather)
- return "color=red";
- return "";
- }
-};
-
-} // end namespace llvm
-
-BoUpSLP::~BoUpSLP() {
- for (const auto &Pair : DeletedInstructions) {
-    // Replace operands of ignored instructions with Undefs in case they were
- // marked for deletion.
- if (Pair.getSecond()) {
- Value *Undef = UndefValue::get(Pair.getFirst()->getType());
- Pair.getFirst()->replaceAllUsesWith(Undef);
- }
- Pair.getFirst()->dropAllReferences();
- }
- for (const auto &Pair : DeletedInstructions) {
- assert(Pair.getFirst()->use_empty() &&
- "trying to erase instruction with users.");
- Pair.getFirst()->eraseFromParent();
- }
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->State == TreeEntry::NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
+
+} // end namespace llvm
+
+BoUpSLP::~BoUpSLP() {
+ for (const auto &Pair : DeletedInstructions) {
+    // Replace operands of ignored instructions with Undefs in case they were
+ // marked for deletion.
+ if (Pair.getSecond()) {
+ Value *Undef = UndefValue::get(Pair.getFirst()->getType());
+ Pair.getFirst()->replaceAllUsesWith(Undef);
+ }
+ Pair.getFirst()->dropAllReferences();
+ }
+ for (const auto &Pair : DeletedInstructions) {
+ assert(Pair.getFirst()->use_empty() &&
+ "trying to erase instruction with users.");
+ Pair.getFirst()->eraseFromParent();
+ }
#ifdef EXPENSIVE_CHECKS
// If we could guarantee that this call is not extremely slow, we could
// remove the ifdef limitation (see PR47712).
- assert(!verifyFunction(*F, &dbgs()));
+ assert(!verifyFunction(*F, &dbgs()));
#endif
-}
-
-void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
- for (auto *V : AV) {
- if (auto *I = dyn_cast<Instruction>(V))
+}
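// Illustrative note (not from the source): the destructor deletes in two passes on
// purpose. Instructions queued for deletion may still use one another, so the first
// loop only drops operand references (after optionally replacing uses with undef);
// once no queued instruction holds a use of another, the second loop can erase them
// all with the use_empty() assertion holding.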
+
+void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
+ for (auto *V : AV) {
+ if (auto *I = dyn_cast<Instruction>(V))
eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
- };
-}
-
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst) {
- ExtraValueToDebugLocsMap ExternallyUsedValues;
- buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
-}
-
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst) {
- deleteTree();
- UserIgnoreList = UserIgnoreLst;
- if (!allSameType(Roots))
- return;
- buildTree_rec(Roots, 0, EdgeInfo());
-
- // Collect the values that we need to extract from the tree.
- for (auto &TEPtr : VectorizableTree) {
- TreeEntry *Entry = TEPtr.get();
-
- // No need to handle users of gathered values.
- if (Entry->State == TreeEntry::NeedToGather)
- continue;
-
- // For each lane:
- for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
- Value *Scalar = Entry->Scalars[Lane];
- int FoundLane = Lane;
- if (!Entry->ReuseShuffleIndices.empty()) {
- FoundLane =
- std::distance(Entry->ReuseShuffleIndices.begin(),
- llvm::find(Entry->ReuseShuffleIndices, FoundLane));
- }
-
- // Check if the scalar is externally used as an extra arg.
- auto ExtI = ExternallyUsedValues.find(Scalar);
- if (ExtI != ExternallyUsedValues.end()) {
- LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
- << Lane << " from " << *Scalar << ".\n");
- ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
- }
- for (User *U : Scalar->users()) {
- LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
-
- Instruction *UserInst = dyn_cast<Instruction>(U);
- if (!UserInst)
- continue;
-
- // Skip in-tree scalars that become vectors
- if (TreeEntry *UseEntry = getTreeEntry(U)) {
- Value *UseScalar = UseEntry->Scalars[0];
- // Some in-tree scalars will remain as scalar in vectorized
-            // Some in-tree scalars will remain as scalars in vectorized
- // be used.
- if (UseScalar != U ||
- !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
- LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
- << ".\n");
- assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
- continue;
- }
- }
-
- // Ignore users in the user ignore list.
- if (is_contained(UserIgnoreList, UserInst))
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
- << Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
- }
- }
- }
-}
-
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) {
- assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
-
- InstructionsState S = getSameOpcode(VL);
- if (Depth == RecursionMaxDepth) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // Don't handle vectors.
- if (S.OpValue->getType()->isVectorTy()) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
- if (SI->getValueOperand()->getType()->isVectorTy()) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // We now know that this is a vector of instructions of the same type from
- // the same block.
-
- // Don't vectorize ephemeral values.
- for (Value *V : VL) {
- if (EphValues.count(V)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is ephemeral.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
-
- // Check if this is a duplicate of another entry.
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
- if (!E->isSame(VL)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- // Record the reuse of the tree node. FIXME, currently this is only used to
- // properly draw the graph rather than for the actual vectorization.
- E->UserTreeIndices.push_back(UserTreeIdx);
- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
- << ".\n");
- return;
- }
-
- // Check that none of the instructions in the bundle are already in the tree.
- for (Value *V : VL) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- continue;
- if (getTreeEntry(I)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is already in tree.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
-
- // If any of the scalars is marked as a value that needs to stay scalar, then
- // we need to gather the scalars.
- // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
- for (Value *V : VL) {
- if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
-
- // Check that all of the users of the scalars that we want to vectorize are
- // schedulable.
- auto *VL0 = cast<Instruction>(S.OpValue);
- BasicBlock *BB = VL0->getParent();
-
- if (!DT->isReachableFromEntry(BB)) {
- // Don't go into unreachable blocks. They may contain instructions with
- // dependency cycles which confuse the final scheduling.
- LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // Check that every instruction appears once in this bundle.
- SmallVector<unsigned, 4> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
- DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second)
- UniqueValues.emplace_back(V);
- }
- size_t NumUniqueScalarValues = UniqueValues.size();
- if (NumUniqueScalarValues == VL.size()) {
- ReuseShuffleIndicies.clear();
- } else {
- LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (NumUniqueScalarValues <= 1 ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
- LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- VL = UniqueValues;
- }
-
- auto &BSRef = BlocksSchedules[BB];
- if (!BSRef)
- BSRef = std::make_unique<BlockScheduling>(BB);
-
- BlockScheduling &BS = *BSRef.get();
-
- Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
- if (!Bundle) {
- LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
- assert((!BS.getScheduleData(VL0) ||
- !BS.getScheduleData(VL0)->isPartOfBundle()) &&
- "tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
-
- unsigned ShuffleOrOp = S.isAltShuffle() ?
- (unsigned) Instruction::ShuffleVector : S.getOpcode();
- switch (ShuffleOrOp) {
- case Instruction::PHI: {
- auto *PH = cast<PHINode>(VL0);
-
- // Check for terminator values (e.g. invoke).
+ };
+}
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
+ deleteTree();
+ UserIgnoreList = UserIgnoreLst;
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, EdgeInfo());
+
+ // Collect the values that we need to extract from the tree.
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
+
+ // No need to handle users of gathered values.
+ if (Entry->State == TreeEntry::NeedToGather)
+ continue;
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+ int FoundLane = Lane;
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(Entry->ReuseShuffleIndices.begin(),
+ llvm::find(Entry->ReuseShuffleIndices, FoundLane));
+ }
+
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
+ }
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+
+ Instruction *UserInst = dyn_cast<Instruction>(U);
+ if (!UserInst)
+ continue;
+
+ // Skip in-tree scalars that become vectors
+ if (TreeEntry *UseEntry = getTreeEntry(U)) {
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+            // Some in-tree scalars will remain as scalars in vectorized
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
+ // Ignore users in the user ignore list.
+ if (is_contained(UserIgnoreList, UserInst))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+ }
+ }
+ }
+}
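// Illustrative note (not from the source): the loop above records every scalar that
// is still needed outside the tree. A scalar with a user that is not part of any
// vectorized bundle (or that is listed in ExternallyUsedValues) gets an ExternalUser
// entry together with its lane, so that an extractelement from the corresponding
// vector lane can be emitted later; in-tree users and users on the ignore list are
// skipped because they will be rewritten or removed anyway.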
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ const EdgeInfo &UserTreeIdx) {
+ assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+
+ InstructionsState S = getSameOpcode(VL);
+ if (Depth == RecursionMaxDepth) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // Don't handle vectors.
+ if (S.OpValue->getType()->isVectorTy()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
+ if (SI->getValueOperand()->getType()->isVectorTy()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // If all of the operands are identical or constant we have a simple solution.
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // We now know that this is a vector of instructions of the same type from
+ // the same block.
+
+ // Don't vectorize ephemeral values.
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is ephemeral.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check if this is a duplicate of another entry.
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (!E->isSame(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ // Record the reuse of the tree node. FIXME, currently this is only used to
+ // properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
+ return;
+ }
+
+ // Check that none of the instructions in the bundle are already in the tree.
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (getTreeEntry(I)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is already in tree.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // If any of the scalars is marked as a value that needs to stay scalar, then
+ // we need to gather the scalars.
+ // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
+ for (Value *V : VL) {
+ if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check that all of the users of the scalars that we want to vectorize are
+ // schedulable.
+ auto *VL0 = cast<Instruction>(S.OpValue);
+ BasicBlock *BB = VL0->getParent();
+
+ if (!DT->isReachableFromEntry(BB)) {
+ // Don't go into unreachable blocks. They may contain instructions with
+ // dependency cycles which confuse the final scheduling.
+ LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ if (NumUniqueScalarValues == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (NumUniqueScalarValues <= 1 ||
+ !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ VL = UniqueValues;
+ }
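// Illustrative sketch (standalone, hypothetical names, std containers): the
// de-duplication above. Every value is mapped to the position of its first
// occurrence, which later becomes the mask of a reuse shuffle, and only the unique
// values are kept for the bundle.
#include <unordered_map>
#include <vector>

struct DedupResult {
  std::vector<int> UniqueVals;       // one entry per distinct value
  std::vector<unsigned> ReuseMask;   // original position -> index into UniqueVals
};

DedupResult dedup(const std::vector<int> &VL) {
  DedupResult R;
  std::unordered_map<int, unsigned> FirstPos;
  for (int V : VL) {
    auto Ins = FirstPos.try_emplace(V, (unsigned)R.UniqueVals.size());
    R.ReuseMask.push_back(Ins.first->second);
    if (Ins.second)
      R.UniqueVals.push_back(V);
  }
  return R;
}
// dedup({7, 9, 7, 5}) yields UniqueVals = {7, 9, 5} and ReuseMask = {0, 1, 0, 2}.
// The real code additionally requires the unique count to be a power of two greater
// than one; otherwise the bundle is gathered instead of shuffled.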
+
+ auto &BSRef = BlocksSchedules[BB];
+ if (!BSRef)
+ BSRef = std::make_unique<BlockScheduling>(BB);
+
+ BlockScheduling &BS = *BSRef.get();
+
+ Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+ if (!Bundle) {
+ LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ assert((!BS.getScheduleData(VL0) ||
+ !BS.getScheduleData(VL0)->isPartOfBundle()) &&
+ "tryScheduleBundle should cancelScheduling on failure");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ auto *PH = cast<PHINode>(VL0);
+
+ // Check for terminator values (e.g. invoke).
for (Value *V : VL)
for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
- Instruction *Term = dyn_cast<Instruction>(
+ Instruction *Term = dyn_cast<Instruction>(
cast<PHINode>(V)->getIncomingValueForBlock(
PH->getIncomingBlock(I)));
- if (Term && Term->isTerminator()) {
- LLVM_DEBUG(dbgs()
- << "SLP: Need to swizzle PHINodes (terminator use).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- TreeEntry *TE =
- newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
-
- // Keeps the reordered operands to avoid code duplication.
- SmallVector<ValueList, 2> OperandsVec;
+ if (Term && Term->isTerminator()) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: Need to swizzle PHINodes (terminator use).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+ // Keeps the reordered operands to avoid code duplication.
+ SmallVector<ValueList, 2> OperandsVec;
for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
- ValueList Operands;
- // Prepare the operand vector.
+ ValueList Operands;
+ // Prepare the operand vector.
for (Value *V : VL)
Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
PH->getIncomingBlock(I)));
TE->setOperand(I, Operands);
- OperandsVec.push_back(Operands);
- }
- for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
- buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
- return;
- }
- case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- OrdersType CurrentOrder;
- bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
- if (Reuse) {
- LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
- ++NumOpsWantToKeepOriginalOrder;
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
- }
- if (!CurrentOrder.empty()) {
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- // Insert new order with initial value 0, if it does not exist,
- // otherwise return the iterator to the existing one.
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ OperandsVec.push_back(Operands);
+ }
+ for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
+ buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
+ return;
+ }
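// Illustrative note (not from the source): for a bundle of PHIs the operands are
// grouped per incoming block rather than per operand position. For example, with
//   a = phi [x1, BB1], [y1, BB2]   and   b = phi [x2, BB1], [y2, BB2]
// the recursion continues on {x1, x2} (the BB1 operands) and on {y1, y2} (the BB2
// operands), so each child bundle again consists of values flowing in from a single
// predecessor block.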
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+ if (Reuse) {
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
+ return;
+ }
+ if (!CurrentOrder.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
findRootOrder(CurrentOrder);
++NumOpsWantToKeepOrder[CurrentOrder];
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
- }
- LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- BS.cancelScheduling(VL, VL0);
- return;
- }
- case Instruction::Load: {
- // Check that a vectorized load would load the same memory as a scalar
- // load. For example, we don't want to vectorize loads that are smaller
- // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
- // treats loading/storing it as an i8 struct. If we vectorize loads/stores
- // from such a struct, we read/write packed bits disagreeing with the
- // unvectorized version.
- Type *ScalarTy = VL0->getType();
-
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- return;
- }
-
- // Make sure all loads in the bundle are simple - we can't vectorize
- // atomic or volatile loads.
- SmallVector<Value *, 4> PointerOps(VL.size());
- auto POIter = PointerOps.begin();
- for (Value *V : VL) {
- auto *L = cast<LoadInst>(V);
- if (!L->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
- return;
- }
- *POIter = L->getPointerOperand();
- ++POIter;
- }
-
- OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
- if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- const SCEV *Scev0 = SE->getSCEV(Ptr0);
- const SCEV *ScevN = SE->getSCEV(PtrN);
- const auto *Diff =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
- uint64_t Size = DL->getTypeAllocSize(ScalarTy);
- // Check that the sorted loads are consecutive.
- if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
- if (CurrentOrder.empty()) {
-            // Original loads are consecutive and do not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- } else {
- // Need to reorder.
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
+ return;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ Type *ScalarTy = VL0->getType();
+
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ return;
+ }
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ auto POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
+ if (!L->isSimple()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ return;
+ }
+ *POIter = L->getPointerOperand();
+ ++POIter;
+ }
+
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted loads are consecutive.
+ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+          // Original loads are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ } else {
+ // Need to reorder.
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
+ TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
findRootOrder(CurrentOrder);
++NumOpsWantToKeepOrder[CurrentOrder];
- }
- return;
- }
+ }
+ return;
+ }
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
@@ -2899,209 +2899,209 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
return;
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- Type *SrcTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
- if (Ty != SrcTy || !isValidElementType(Ty)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering casts with different src types.\n");
- return;
- }
- }
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
-
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Check that all of the compares have the same predicate.
- CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
- Type *ComparedTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- CmpInst *Cmp = cast<CmpInst>(V);
- if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
- Cmp->getOperand(0)->getType() != ComparedTy) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering cmp with different predicate.\n");
- return;
- }
- }
-
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
-
- ValueList Left, Right;
- if (cast<CmpInst>(VL0)->isCommutative()) {
- // Commutative predicate - collect + sort operands of the instructions
- // so that each side is more likely to have the same opcode.
- assert(P0 == SwapP0 && "Commutative Predicate mismatch");
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
- } else {
- // Collect operands - commute if it uses the swapped predicate.
- for (Value *V : VL) {
- auto *Cmp = cast<CmpInst>(V);
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
- if (Cmp->getPredicate() != P0)
- std::swap(LHS, RHS);
- Left.push_back(LHS);
- Right.push_back(RHS);
- }
- }
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
- case Instruction::Select:
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
-
- // Sort operands of the instructions so that each side is more likely to
- // have the same opcode.
- if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
-
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
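// Illustrative sketch only, not part of the LLVM sources: the consecutiveness
// test used in the Load case above (and again in the Store case below),
// restated over plain byte offsets instead of SCEVs. It mirrors the span
// check as written - only last-minus-first is examined, on offsets that are
// assumed to be already sorted - and the helper name is invented here.
#include <cstdint>
#include <vector>

static bool spansOneContiguousRun(const std::vector<uint64_t> &SortedOffsets,
                                  uint64_t ScalarSize) {
  if (SortedOffsets.size() < 2)
    return true;
  // Last minus first must equal (N - 1) * element size, e.g. offsets
  // {0, 4, 8, 12} with ScalarSize = 4 pass, while {0, 4, 8, 20} do not.
  return SortedOffsets.back() - SortedOffsets.front() ==
         (SortedOffsets.size() - 1) * ScalarSize;
}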
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering casts with different src types.\n");
+ return;
+ }
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
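// Illustrative sketch, not LLVM code: the cast case above (and several cases
// below) rebuild child bundles column-wise, i.e. operand position I of every
// scalar in VL becomes child bundle number I, which is what the per-operand
// buildTree_rec() calls recurse into. ToyInst and the helper are stand-ins.
#include <vector>

struct ToyInst {
  std::vector<int> Operands; // stand-in for an instruction's scalar operands
};

static std::vector<std::vector<int>>
collectOperandColumns(const std::vector<ToyInst> &Bundle, unsigned NumOperands) {
  std::vector<std::vector<int>> Columns(NumOperands);
  for (unsigned I = 0; I < NumOperands; ++I)
    for (const ToyInst &V : Bundle)
      Columns[I].push_back(V.Operands[I]); // column I feeds child bundle I
  return Columns;
}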
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
+ Type *ComparedTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ CmpInst *Cmp = cast<CmpInst>(V);
+ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering cmp with different predicate.\n");
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+
+ ValueList Left, Right;
+ if (cast<CmpInst>(VL0)->isCommutative()) {
+ // Commutative predicate - collect + sort operands of the instructions
+ // so that each side is more likely to have the same opcode.
+ assert(P0 == SwapP0 && "Commutative Predicate mismatch");
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ } else {
+ // Collect operands - commute if it uses the swapped predicate.
+ for (Value *V : VL) {
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ if (Cmp->getPredicate() != P0)
+ std::swap(LHS, RHS);
+ Left.push_back(LHS);
+ Right.push_back(RHS);
+ }
+ }
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
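// Illustrative sketch, not LLVM code: in the non-commutative branch above,
// any lane written with the swapped predicate has its operands commuted so
// the whole bundle is phrased with the main predicate. ToyCmp and the helper
// are stand-ins for CmpInst and the collection loop.
#include <utility>
#include <vector>

struct ToyCmp {
  int Pred;     // stand-in for CmpInst::Predicate
  int LHS, RHS; // stand-ins for the two compare operands
};

static void splitCmpOperands(const std::vector<ToyCmp> &Bundle, int MainPred,
                             std::vector<int> &Left, std::vector<int> &Right) {
  for (const ToyCmp &C : Bundle) {
    int L = C.LHS, R = C.RHS;
    if (C.Pred != MainPred) // this lane uses the swapped predicate: commute
      std::swap(L, R);
    Left.push_back(L);
    Right.push_back(R);
  }
}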
+ case Instruction::Select:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
+
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (Value *V : VL) {
+ if (cast<Instruction>(V)->getNumOperands() != 2) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (different types).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ Type *Ty1 = VL0->getOperand(1)->getType();
+ for (Value *V : VL) {
+ auto Op = cast<Instruction>(V)->getOperand(1);
+ if (!isa<ConstantInt>(Op) ||
+ (Op->getType() != Ty1 &&
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace()))) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
for (Value *V : VL)
Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::GetElementPtr: {
- // We don't combine GEPs with complicated (nested) indexing.
- for (Value *V : VL) {
- if (cast<Instruction>(V)->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We can't combine several GEPs into one vector if they operate on
- // different types.
- Type *Ty0 = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
- if (Ty0 != CurTy) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (different types).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We don't combine GEPs with non-constant indexes.
- Type *Ty1 = VL0->getOperand(1)->getType();
- for (Value *V : VL) {
- auto Op = cast<Instruction>(V)->getOperand(1);
- if (!isa<ConstantInt>(Op) ||
- (Op->getType() != Ty1 &&
- Op->getType()->getScalarSizeInBits() >
- DL->getIndexSizeInBits(
- V->getType()->getPointerAddressSpace()))) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (non-constant indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = 2; i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::Store: {
- // Check if the stores are consecutive or if we need to swizzle them.
- llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or if we need to swizzle them.
+ llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
// Avoid types that are padded when being allocated as scalars, while
// being packed together in a vector (such as i1).
if (DL->getTypeSizeInBits(ScalarTy) !=
@@ -3112,511 +3112,511 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
return;
}
- // Make sure all stores in the bundle are simple - we can't vectorize
- // atomic or volatile stores.
- SmallVector<Value *, 4> PointerOps(VL.size());
- ValueList Operands(VL.size());
- auto POIter = PointerOps.begin();
- auto OIter = Operands.begin();
- for (Value *V : VL) {
- auto *SI = cast<StoreInst>(V);
- if (!SI->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
- return;
- }
- *POIter = SI->getPointerOperand();
- *OIter = SI->getValueOperand();
- ++POIter;
- ++OIter;
- }
-
- OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
- if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- const SCEV *Scev0 = SE->getSCEV(Ptr0);
- const SCEV *ScevN = SE->getSCEV(PtrN);
- const auto *Diff =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
- uint64_t Size = DL->getTypeAllocSize(ScalarTy);
- // Check that the sorted pointer operands are consecutive.
- if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
- if (CurrentOrder.empty()) {
-          // Original stores are consecutive and do not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
- } else {
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ // Make sure all stores in the bundle are simple - we can't vectorize
+ // atomic or volatile stores.
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ ValueList Operands(VL.size());
+ auto POIter = PointerOps.begin();
+ auto OIter = Operands.begin();
+ for (Value *V : VL) {
+ auto *SI = cast<StoreInst>(V);
+ if (!SI->isSimple()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+ return;
+ }
+ *POIter = SI->getPointerOperand();
+ *OIter = SI->getValueOperand();
+ ++POIter;
+ ++OIter;
+ }
+
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted pointer operands are consecutive.
+ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+          // Original stores are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ } else {
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
findRootOrder(CurrentOrder);
++NumOpsWantToKeepOrder[CurrentOrder];
- }
- return;
- }
- }
-
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
- return;
- }
- case Instruction::Call: {
- // Check if the calls are all to the same vectorizable intrinsic or
- // library function.
- CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- VFShape Shape = VFShape::get(
+ }
+ return;
+ }
+ }
+
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ VFShape Shape = VFShape::get(
*CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
- false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
- if (!VecFunc && !isTriviallyVectorizable(ID)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
- return;
- }
- Function *F = CI->getCalledFunction();
- unsigned NumArgs = CI->getNumArgOperands();
- SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
- for (unsigned j = 0; j != NumArgs; ++j)
- if (hasVectorInstrinsicScalarOpd(ID, j))
- ScalarArgs[j] = CI->getArgOperand(j);
- for (Value *V : VL) {
- CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != F ||
- getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
- (VecFunc &&
- VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
- !CI->hasIdenticalOperandBundleSchema(*CI2)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
- << "\n");
- return;
- }
-      // Some intrinsics have scalar arguments, and those arguments must be
-      // identical across the bundle for the calls to be vectorized.
- for (unsigned j = 0; j != NumArgs; ++j) {
- if (hasVectorInstrinsicScalarOpd(ID, j)) {
- Value *A1J = CI2->getArgOperand(j);
- if (ScalarArgs[j] != A1J) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument " << ScalarArgs[j] << "!=" << A1J
- << "\n");
- return;
- }
- }
- }
- // Verify that the bundle operands are identical between the two calls.
- if (CI->hasOperandBundles() &&
- !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
- CI->op_begin() + CI->getBundleOperandsEndIndex(),
- CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
- << *CI << "!=" << *V << '\n');
- return;
- }
- }
-
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL) {
- auto *CI2 = cast<CallInst>(V);
- Operands.push_back(CI2->getArgOperand(i));
- }
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::ShuffleVector: {
-      // If this is not an alternating sequence of opcodes (like add-sub),
-      // do not vectorize this instruction.
- if (!S.isAltShuffle()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
- return;
- }
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
-
- // Reorder operands if reordering would enable vectorization.
- if (isa<BinaryOperator>(VL0)) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
-
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- default:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
- return;
- }
-}
-
-unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
- unsigned N = 1;
- Type *EltTy = T;
-
- while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
- isa<VectorType>(EltTy)) {
- if (auto *ST = dyn_cast<StructType>(EltTy)) {
- // Check that struct is homogeneous.
- for (const auto *Ty : ST->elements())
- if (Ty != *ST->element_begin())
- return 0;
- N *= ST->getNumElements();
- EltTy = *ST->element_begin();
- } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
- N *= AT->getNumElements();
- EltTy = AT->getElementType();
- } else {
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return;
+ }
+ Function *F = CI->getCalledFunction();
+ unsigned NumArgs = CI->getNumArgOperands();
+ SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
+ for (unsigned j = 0; j != NumArgs; ++j)
+ if (hasVectorInstrinsicScalarOpd(ID, j))
+ ScalarArgs[j] = CI->getArgOperand(j);
+ for (Value *V : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(V);
+ if (!CI2 || CI2->getCalledFunction() != F ||
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
+ << "\n");
+ return;
+ }
+      // Some intrinsics have scalar arguments, and those arguments must be
+      // identical across the bundle for the calls to be vectorized.
+ for (unsigned j = 0; j != NumArgs; ++j) {
+ if (hasVectorInstrinsicScalarOpd(ID, j)) {
+ Value *A1J = CI2->getArgOperand(j);
+ if (ScalarArgs[j] != A1J) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << ScalarArgs[j] << "!=" << A1J
+ << "\n");
+ return;
+ }
+ }
+ }
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
+ << *CI << "!=" << *V << '\n');
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL) {
+ auto *CI2 = cast<CallInst>(V);
+ Operands.push_back(CI2->getArgOperand(i));
+ }
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
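// Illustrative sketch, not LLVM code: the per-argument check above requires
// that argument positions which stay scalar in the vector form carry the
// identical value in every call of the bundle; other positions may differ
// lane by lane. Container types and names here are stand-ins.
#include <set>
#include <vector>

static bool scalarArgsMatchAcrossBundle(
    const std::vector<std::vector<int>> &CallArgs,  // one arg list per call
    const std::set<unsigned> &ScalarArgPositions) { // positions kept scalar
  if (CallArgs.empty())
    return true;
  for (unsigned Pos : ScalarArgPositions)
    for (const std::vector<int> &Args : CallArgs)
      if (Args.at(Pos) != CallArgs.front().at(Pos))
        return false; // mismatching scalar argument: the bundle is gathered
  return true;
}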
+ case Instruction::ShuffleVector: {
+      // If this is not an alternating sequence of opcodes (like add-sub),
+      // do not vectorize this instruction.
+ if (!S.isAltShuffle()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+ // Reorder operands if reordering would enable vectorization.
+ if (isa<BinaryOperator>(VL0)) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ default:
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return;
+ }
+}
+
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N = 1;
+ Type *EltTy = T;
+
+ while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
+ isa<VectorType>(EltTy)) {
+ if (auto *ST = dyn_cast<StructType>(EltTy)) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != *ST->element_begin())
+ return 0;
+ N *= ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
+ N *= AT->getNumElements();
+ EltTy = AT->getElementType();
+ } else {
auto *VT = cast<FixedVectorType>(EltTy);
- N *= VT->getNumElements();
- EltTy = VT->getElementType();
- }
- }
-
- if (!isValidElementType(EltTy))
- return 0;
- uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
- if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
- return 0;
- return N;
-}
-
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const {
- Instruction *E0 = cast<Instruction>(OpValue);
- assert(E0->getOpcode() == Instruction::ExtractElement ||
- E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
- // Check if all of the extracts come from the same vector and from the
- // correct offset.
- Value *Vec = E0->getOperand(0);
-
- CurrentOrder.clear();
-
- // We have to extract from a vector/aggregate with the same number of elements.
- unsigned NElts;
- if (E0->getOpcode() == Instruction::ExtractValue) {
- const DataLayout &DL = E0->getModule()->getDataLayout();
- NElts = canMapToVector(Vec->getType(), DL);
- if (!NElts)
- return false;
- // Check if load can be rewritten as load of vector.
- LoadInst *LI = dyn_cast<LoadInst>(Vec);
- if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
- return false;
- } else {
+ N *= VT->getNumElements();
+ EltTy = VT->getElementType();
+ }
+ }
+
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ return N;
+}
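// Worked example with illustrative numbers, not LLVM code: a homogeneous
// [4 x i32] aggregate flattens to N = 4 lanes of i32, i.e. a 4 * 32 = 128-bit
// candidate vector; canMapToVector() then accepts it only if that size lies
// within [MinVecRegSize, MaxVecRegSize] and equals the store size of the
// original aggregate type.
static_assert(4 * 32 == 128, "flattened [4 x i32] occupies 128 bits");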
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const {
+ Instruction *E0 = cast<Instruction>(OpValue);
+ assert(E0->getOpcode() == Instruction::ExtractElement ||
+ E0->getOpcode() == Instruction::ExtractValue);
+ assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *Vec = E0->getOperand(0);
+
+ CurrentOrder.clear();
+
+ // We have to extract from a vector/aggregate with the same number of elements.
+ unsigned NElts;
+ if (E0->getOpcode() == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
- }
-
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- bool ShouldKeepOrder = true;
- unsigned E = VL.size();
- // Assign to all items the initial value E + 1 so we can check if the extract
- // instruction index was used already.
- // Also, later we can check that all the indices are used and we have a
- // consecutive access in the extract instructions, by checking that no
- // element of CurrentOrder still has value E + 1.
- CurrentOrder.assign(E, E + 1);
- unsigned I = 0;
- for (; I < E; ++I) {
- auto *Inst = cast<Instruction>(VL[I]);
- if (Inst->getOperand(0) != Vec)
- break;
- Optional<unsigned> Idx = getExtractIndex(Inst);
- if (!Idx)
- break;
- const unsigned ExtIdx = *Idx;
- if (ExtIdx != I) {
- if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
- break;
- ShouldKeepOrder = false;
- CurrentOrder[ExtIdx] = I;
- } else {
- if (CurrentOrder[I] != E + 1)
- break;
- CurrentOrder[I] = I;
- }
- }
- if (I < E) {
- CurrentOrder.clear();
- return false;
- }
-
- return ShouldKeepOrder;
-}
-
-bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ bool ShouldKeepOrder = true;
+ unsigned E = VL.size();
+ // Assign to all items the initial value E + 1 so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E + 1.
+ CurrentOrder.assign(E, E + 1);
+ unsigned I = 0;
+ for (; I < E; ++I) {
+ auto *Inst = cast<Instruction>(VL[I]);
+ if (Inst->getOperand(0) != Vec)
+ break;
+ Optional<unsigned> Idx = getExtractIndex(Inst);
+ if (!Idx)
+ break;
+ const unsigned ExtIdx = *Idx;
+ if (ExtIdx != I) {
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ break;
+ ShouldKeepOrder = false;
+ CurrentOrder[ExtIdx] = I;
+ } else {
+ if (CurrentOrder[I] != E + 1)
+ break;
+ CurrentOrder[I] = I;
+ }
+ }
+ if (I < E) {
+ CurrentOrder.clear();
+ return false;
+ }
+
+ return ShouldKeepOrder;
+}
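// Illustrative sketch, not LLVM code: the index bookkeeping above, restated
// over a plain vector of extract indices. E + 1 is the "lane not claimed yet"
// sentinel; a repeated or out-of-range index clears Order and returns false,
// otherwise Order records, for each source lane, the bundle position that
// extracts it, and the return value says whether that order is the identity.
#include <vector>

static bool extractsKeepOrder(const std::vector<unsigned> &ExtIndices,
                              std::vector<unsigned> &Order) {
  const unsigned E = static_cast<unsigned>(ExtIndices.size());
  Order.assign(E, E + 1);
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    const unsigned ExtIdx = ExtIndices[I];
    if (ExtIdx >= E || Order[ExtIdx] != E + 1) {
      Order.clear(); // duplicate or out-of-range index: cannot reuse the source
      return false;
    }
    if (ExtIdx != I)
      ShouldKeepOrder = false;
    Order[ExtIdx] = I;
  }
  return ShouldKeepOrder;
}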
+
+bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0;
- });
-}
-
+ return ScalarToTreeEntry.count(U) > 0;
+ });
+}
+
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // Calculate the cost of the scalar and vector calls.
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
auto IntrinsicCost =
- TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
-
+ TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+
auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
VecTy->getNumElements())),
false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
auto LibCost = IntrinsicCost;
- if (!CI->isNoBuiltin() && VecFunc) {
- // Calculate the cost of the vector library call.
- SmallVector<Type *, 4> VecTys;
- for (Use &Arg : CI->args())
- VecTys.push_back(
- FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
-
- // If the corresponding vector call is cheaper, return its cost.
- LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
- TTI::TCK_RecipThroughput);
- }
- return {IntrinsicCost, LibCost};
-}
-
+ if (!CI->isNoBuiltin() && VecFunc) {
+ // Calculate the cost of the vector library call.
+ SmallVector<Type *, 4> VecTys;
+ for (Use &Arg : CI->args())
+ VecTys.push_back(
+ FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+
+ // If the corresponding vector call is cheaper, return its cost.
+ LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
+ TTI::TCK_RecipThroughput);
+ }
+ return {IntrinsicCost, LibCost};
+}
+
InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
- ArrayRef<Value*> VL = E->Scalars;
-
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- // If we have computed a smaller type for the expression, update VecTy so
- // that the costs will be accurate.
- if (MinBWs.count(VL[0]))
- VecTy = FixedVectorType::get(
- IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
-
- unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ ArrayRef<Value*> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = FixedVectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+
+ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
InstructionCost ReuseShuffleCost = 0;
- if (NeedToShuffleReuses) {
- ReuseShuffleCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- if (E->State == TreeEntry::NeedToGather) {
- if (allConstant(VL))
- return 0;
- if (isSplat(VL)) {
- return ReuseShuffleCost +
- TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
- }
- if (E->getOpcode() == Instruction::ExtractElement &&
- allSameType(VL) && allSameBlock(VL)) {
- Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
- if (ShuffleKind.hasValue()) {
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ if (E->State == TreeEntry::NeedToGather) {
+ if (allConstant(VL))
+ return 0;
+ if (isSplat(VL)) {
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ }
+ if (E->getOpcode() == Instruction::ExtractElement &&
+ allSameType(VL) && allSameBlock(VL)) {
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+ if (ShuffleKind.hasValue()) {
InstructionCost Cost =
TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
- for (auto *V : VL) {
-        // If all users of the instruction are going to be vectorized and this
-        // instruction itself is not going to be vectorized, consider this
-        // instruction dead and remove its cost from the final cost of the
-        // vectorized tree.
- if (areAllUsersVectorized(cast<Instruction>(V)) &&
- !ScalarToTreeEntry.count(V)) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- IO->getZExtValue());
- }
- }
- return ReuseShuffleCost + Cost;
- }
- }
- return ReuseShuffleCost + getGatherCost(VL);
- }
+ for (auto *V : VL) {
+        // If all users of the instruction are going to be vectorized and this
+        // instruction itself is not going to be vectorized, consider this
+        // instruction dead and remove its cost from the final cost of the
+        // vectorized tree.
+ if (areAllUsersVectorized(cast<Instruction>(V)) &&
+ !ScalarToTreeEntry.count(V)) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ IO->getZExtValue());
+ }
+ }
+ return ReuseShuffleCost + Cost;
+ }
+ }
+ return ReuseShuffleCost + getGatherCost(VL);
+ }
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
- assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
- Instruction *VL0 = E->getMainOp();
- unsigned ShuffleOrOp =
- E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
- switch (ShuffleOrOp) {
- case Instruction::PHI:
- return 0;
-
- case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- if (NeedToShuffleReuses) {
- unsigned Idx = 0;
- for (unsigned I : E->ReuseShuffleIndices) {
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(VL[I])->getIndexOperand());
- Idx = IO->getZExtValue();
- ReuseShuffleCost -= TTI->getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Idx);
- } else {
- ReuseShuffleCost -= TTI->getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Idx);
- ++Idx;
- }
- }
- Idx = ReuseShuffleNumbers;
- for (Value *V : VL) {
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Idx = IO->getZExtValue();
- } else {
- --Idx;
- }
- ReuseShuffleCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
- }
- }
+ assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = E->getMainOp();
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ switch (ShuffleOrOp) {
+ case Instruction::PHI:
+ return 0;
+
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(VL[I])->getIndexOperand());
+ Idx = IO->getZExtValue();
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ } else {
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
+ }
+ }
+ Idx = ReuseShuffleNumbers;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Idx = IO->getZExtValue();
+ } else {
+ --Idx;
+ }
+ ReuseShuffleCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
InstructionCost DeadCost = ReuseShuffleCost;
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- DeadCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
Instruction *EI = cast<Instruction>(VL[I]);
-      // If all users are going to be vectorized, the instruction can be
-      // considered dead.
-      // The same holds if it has only one user: it will be vectorized for sure.
+        // If all users are going to be vectorized, the instruction can be
+        // considered dead.
+        // The same holds if it has only one user: it will be vectorized for sure.
if (areAllUsersVectorized(EI)) {
- // Take credit for instruction that will become dead.
+ // Take credit for instruction that will become dead.
if (EI->hasOneUse()) {
Instruction *Ext = EI->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- DeadCost -= TTI->getExtractWithExtendCost(
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
Ext->getOpcode(), Ext->getType(), VecTy, I);
- // Add back the cost of s|zext which is subtracted separately.
- DeadCost += TTI->getCastInstrCost(
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), EI->getType(),
TTI::getCastContextHint(Ext), CostKind, Ext);
- continue;
- }
- }
- DeadCost -=
+ continue;
+ }
+ }
+ DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
- }
- }
- return DeadCost;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- Type *SrcTy = VL0->getOperand(0)->getType();
+ }
+ }
+ return DeadCost;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
InstructionCost ScalarEltCost =
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
TTI::getCastContextHint(VL0), CostKind, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
-
- // Calculate the cost of this instruction.
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+
+ // Calculate the cost of this instruction.
InstructionCost ScalarCost = VL.size() * ScalarEltCost;
-
- auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
+
+ auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
InstructionCost VecCost = 0;
- // Check if the values are candidates to demote.
- if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost =
ReuseShuffleCost +
TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
TTI::getCastContextHint(VL0), CostKind, VL0);
- }
+ }
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return VecCost - ScalarCost;
- }
- case Instruction::FCmp:
- case Instruction::ICmp:
- case Instruction::Select: {
- // Calculate the cost of this instruction.
+ return VecCost - ScalarCost;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select: {
+ // Calculate the cost of this instruction.
InstructionCost ScalarEltCost =
TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
- auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
// Check if all entries in VL are either compares or selects with compares
@@ -3656,103 +3656,103 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
VecCost = std::min(VecCost, IntrinsicCost);
}
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Certain instructions can be cheaper to vectorize if they have a
- // constant second vector operand.
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
- TargetTransformInfo::OperandValueProperties Op1VP =
- TargetTransformInfo::OP_None;
- TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_PowerOf2;
-
- // If all operands are exactly the same ConstantInt then set the
- // operand kind to OK_UniformConstantValue.
- // If instead not all operands are constants, then set the operand kind
- // to OK_AnyValue. If all operands are constants but not the same,
- // then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt0 = nullptr;
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- const Instruction *I = cast<Instruction>(VL[i]);
- unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
- ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
- if (!CInt) {
- Op2VK = TargetTransformInfo::OK_AnyValue;
- Op2VP = TargetTransformInfo::OP_None;
- break;
- }
- if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
- !CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_None;
- if (i == 0) {
- CInt0 = CInt;
- continue;
- }
- if (CInt0 != CInt)
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
- }
-
- SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_PowerOf2;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt0 = nullptr;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
+ if (!CInt) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ Op2VP = TargetTransformInfo::OP_None;
+ break;
+ }
+ if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
+ !CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_None;
+ if (i == 0) {
+ CInt0 = CInt;
+ continue;
+ }
+ if (CInt0 != CInt)
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ }
+
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
InstructionCost ScalarEltCost =
TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost =
TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- case Instruction::GetElementPtr: {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
-
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost = TTI->getArithmeticInstrCost(
Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- case Instruction::Load: {
- // Cost of wide load - cost of scalar loads.
- Align alignment = cast<LoadInst>(VL0)->getAlign();
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::Load: {
+ // Cost of wide load - cost of scalar loads.
+ Align alignment = cast<LoadInst>(VL0)->getAlign();
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecLdCost;
if (E->State == TreeEntry::Vectorize) {
@@ -3764,220 +3764,220 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
/*VariableMask=*/false, alignment, CostKind, VL0);
}
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- VecLdCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecLdCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
- return ReuseShuffleCost + VecLdCost - ScalarLdCost;
- }
- case Instruction::Store: {
- // We know that we can merge the stores. Calculate the cost.
- bool IsReorder = !E->ReorderIndices.empty();
- auto *SI =
- cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
- Align Alignment = SI->getAlign();
+ return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+ }
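// Worked example with made-up unit costs, illustrative only: for a bundle of
// four scalar loads at cost 1 each, ScalarLdCost = 4. If the 4-wide vector
// load costs 1 and neither a reuse shuffle nor a reorder shuffle is needed,
// the entry contributes 0 + 1 - 4 = -3, i.e. vectorizing this bundle is
// expected to save three cost units.
static_assert(0 + 1 - 4 == -3, "reuse + vector - scalar cost for the example");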
+ case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
+ bool IsReorder = !E->ReorderIndices.empty();
+ auto *SI =
+ cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
+ Align Alignment = SI->getAlign();
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
- if (NeedToShuffleReuses)
- ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ if (NeedToShuffleReuses)
+ ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecStCost = TTI->getMemoryOpCost(
Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
- if (IsReorder) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- VecStCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
+ if (IsReorder) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecStCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
- return ReuseShuffleCost + VecStCost - ScalarStCost;
- }
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // Calculate the cost of the scalar and vector calls.
+ return ReuseShuffleCost + VecStCost - ScalarStCost;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
-
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
InstructionCost VecCallCost =
std::min(VecCallCosts.first, VecCallCosts.second);
-
- LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
- << " (" << VecCallCost << "-" << ScalarCallCost << ")"
- << " for " << *CI << "\n");
-
- return ReuseShuffleCost + VecCallCost - ScalarCallCost;
- }
- case Instruction::ShuffleVector: {
- assert(E->isAltShuffle() &&
- ((Instruction::isBinaryOp(E->getOpcode()) &&
- Instruction::isBinaryOp(E->getAltOpcode())) ||
- (Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
- "Invalid Shuffle Vector Operand");
+
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
+
+ return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+ }
+ case Instruction::ShuffleVector: {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
InstructionCost ScalarCost = 0;
- if (NeedToShuffleReuses) {
- for (unsigned Idx : E->ReuseShuffleIndices) {
- Instruction *I = cast<Instruction>(VL[Idx]);
- ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
- }
- for (Value *V : VL) {
- Instruction *I = cast<Instruction>(V);
- ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
- }
- }
- for (Value *V : VL) {
- Instruction *I = cast<Instruction>(V);
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- ScalarCost += TTI->getInstructionCost(I, CostKind);
- }
- // VecCost is equal to sum of the cost of creating 2 vectors
- // and the cost of creating shuffle.
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
+ }
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(I, CostKind);
+ }
+ // VecCost is equal to sum of the cost of creating 2 vectors
+ // and the cost of creating shuffle.
InstructionCost VecCost = 0;
- if (Instruction::isBinaryOp(E->getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
- VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
- CostKind);
- } else {
- Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
- Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
- auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
- auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+ VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
+ CostKind);
+ } else {
+ Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
+ Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
+ auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+ auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
TTI::CastContextHint::None, CostKind);
- VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
TTI::CastContextHint::None, CostKind);
- }
- VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ }
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- default:
- llvm_unreachable("Unknown instruction");
- }
-}
-
-bool BoUpSLP::isFullyVectorizableTinyTree() const {
- LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
-                    << VectorizableTree.size() << " is fully vectorizable.\n");
-
- // We only handle trees of heights 1 and 2.
- if (VectorizableTree.size() == 1 &&
- VectorizableTree[0]->State == TreeEntry::Vectorize)
- return true;
-
- if (VectorizableTree.size() != 2)
- return false;
-
- // Handle splat and all-constants stores.
- if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
- (allConstant(VectorizableTree[1]->Scalars) ||
- isSplat(VectorizableTree[1]->Scalars)))
- return true;
-
- // Gathering cost would be too much for tiny trees.
- if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
- VectorizableTree[1]->State == TreeEntry::NeedToGather)
- return false;
-
- return true;
-}
-
-static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
- TargetTransformInfo *TTI) {
- // Look past the root to find a source value. Arbitrarily follow the
- // path through operand 0 of any 'or'. Also, peek through optional
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ default:
+ llvm_unreachable("Unknown instruction");
+ }
+}
+
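Throughout getEntryCost above, bundles whose lanes repeat (a non-empty ReuseShuffleIndices) credit the scalar cost of the duplicated lanes back against the shuffle correction. Below is a minimal, self-contained sketch of that bookkeeping only, using plain integers in place of InstructionCost; the names are illustrative and are not the LLVM API.

#include <cassert>
#include <cstdio>

// Stand-in for the adjustment used in several cases above:
//   ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
// applied when numUniqueScalars values are widened to reuseShuffleNumbers
// lanes by repeating elements.
static long reuseShuffleAdjustment(long reuseShuffleNumbers,
                                   long numUniqueScalars,
                                   long scalarEltCost) {
  return -(reuseShuffleNumbers - numUniqueScalars) * scalarEltCost;
}

int main() {
  // 4 unique scalars feeding an 8-lane vector, scalar cost 1 per element:
  // the 4 duplicated lanes are credited back against the shuffle cost.
  long adjustment = reuseShuffleAdjustment(8, 4, 1);
  assert(adjustment == -4);
  std::printf("reuse shuffle adjustment = %ld\n", adjustment);
  return 0;
}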
+bool BoUpSLP::isFullyVectorizableTinyTree() const {
+ LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+                    << VectorizableTree.size() << " is fully vectorizable.\n");
+
+ // We only handle trees of heights 1 and 2.
+ if (VectorizableTree.size() == 1 &&
+ VectorizableTree[0]->State == TreeEntry::Vectorize)
+ return true;
+
+ if (VectorizableTree.size() != 2)
+ return false;
+
+ // Handle splat and all-constants stores.
+ if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
+ (allConstant(VectorizableTree[1]->Scalars) ||
+ isSplat(VectorizableTree[1]->Scalars)))
+ return true;
+
+ // Gathering cost would be too much for tiny trees.
+ if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
+ VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ return false;
+
+ return true;
+}
+
+static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
+ TargetTransformInfo *TTI) {
+ // Look past the root to find a source value. Arbitrarily follow the
+ // path through operand 0 of any 'or'. Also, peek through optional
// shift-left-by-multiple-of-8-bits.
- Value *ZextLoad = Root;
+ Value *ZextLoad = Root;
const APInt *ShAmtC;
- while (!isa<ConstantExpr>(ZextLoad) &&
- (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ while (!isa<ConstantExpr>(ZextLoad) &&
+ (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
(match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
ShAmtC->urem(8) == 0)))
- ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
-
- // Check if the input is an extended load of the required or/shift expression.
- Value *LoadPtr;
- if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
- return false;
-
- // Require that the total load bit width is a legal integer type.
- // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
- // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
- Type *SrcTy = LoadPtr->getType()->getPointerElementType();
- unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
- if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
- return false;
-
- // Everything matched - assume that we can fold the whole sequence using
- // load combining.
- LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
- << *(cast<Instruction>(Root)) << "\n");
-
- return true;
-}
-
+ ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
+
+ // Check if the input is an extended load of the required or/shift expression.
+ Value *LoadPtr;
+ if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ return false;
+
+ // Require that the total load bit width is a legal integer type.
+ // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+ // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+ Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+ unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
+ if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
+ return false;
+
+ // Everything matched - assume that we can fold the whole sequence using
+ // load combining.
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
+ << *(cast<Instruction>(Root)) << "\n");
+
+ return true;
+}
+
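isLoadCombineCandidateImpl above peeks through 'or' and byte-aligned 'shl' nodes, requires the chain to end in a zext(load), and then checks that the combined bit width is a legal integer. The stand-alone sketch below models that walk on a toy expression tree; the Node type and the fixed 8/16/32/64-bit legality rule are assumptions made for illustration, not the pass's TTI query.

#include <cstdio>

// Toy expression node; the real code walks llvm::Value chains via PatternMatch.
struct Node {
  enum Kind { Or, Shl, ZExtLoad, Other } kind;
  Node *op0 = nullptr;   // the operand the walk follows (operand 0)
  unsigned shiftAmt = 0; // for Shl
  unsigned loadBits = 0; // for ZExtLoad: width of the loaded element
};

static bool isLoadCombineCandidate(const Node *root, unsigned numElts) {
  const Node *n = root;
  // Peek through 'or' and shift-left-by-a-multiple-of-8 nodes.
  while (n && (n->kind == Node::Or ||
               (n->kind == Node::Shl && n->shiftAmt % 8 == 0)))
    n = n->op0;
  // The walk must have moved and must end in zext(load).
  if (n == root || !n || n->kind != Node::ZExtLoad)
    return false;
  // Stand-in for TTI->isTypeLegal on the combined integer width.
  unsigned totalBits = n->loadBits * numElts;
  return totalBits == 8 || totalBits == 16 || totalBits == 32 ||
         totalBits == 64;
}

int main() {
  Node load{Node::ZExtLoad, nullptr, 0, 8}; // zext(load i8)
  Node shl{Node::Shl, &load, 8, 0};         // shl ..., 8
  Node orNode{Node::Or, &shl, 0, 0};        // or ..., ...
  std::printf("<8 x i8> candidate: %d\n", isLoadCombineCandidate(&orNode, 8));
  std::printf("<16 x i8> candidate: %d\n", isLoadCombineCandidate(&orNode, 16));
  return 0;
}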
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
if (RdxKind != RecurKind::Or)
- return false;
-
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- Value *FirstReduced = VectorizableTree[0]->Scalars[0];
- return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
-}
-
-bool BoUpSLP::isLoadCombineCandidate() const {
- // Peek through a final sequence of stores and check if all operations are
- // likely to be load-combined.
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- for (Value *Scalar : VectorizableTree[0]->Scalars) {
- Value *X;
- if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
- !isLoadCombineCandidateImpl(X, NumElts, TTI))
- return false;
- }
- return true;
-}
-
-bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
- // We can vectorize the tree if its size is greater than or equal to the
- // minimum size specified by the MinTreeSize command line option.
- if (VectorizableTree.size() >= MinTreeSize)
- return false;
-
- // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
- // can vectorize it if we can prove it fully vectorizable.
- if (isFullyVectorizableTinyTree())
- return false;
-
- assert(VectorizableTree.empty()
- ? ExternalUses.empty()
- : true && "We shouldn't have any external users");
-
- // Otherwise, we can't vectorize the tree. It is both tiny and not fully
- // vectorizable.
- return true;
-}
-
+ return false;
+
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+ return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+}
+
+bool BoUpSLP::isLoadCombineCandidate() const {
+ // Peek through a final sequence of stores and check if all operations are
+ // likely to be load-combined.
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ for (Value *Scalar : VectorizableTree[0]->Scalars) {
+ Value *X;
+ if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
+ !isLoadCombineCandidateImpl(X, NumElts, TTI))
+ return false;
+ }
+ return true;
+}
+
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
+ // We can vectorize the tree if its size is greater than or equal to the
+ // minimum size specified by the MinTreeSize command line option.
+ if (VectorizableTree.size() >= MinTreeSize)
+ return false;
+
+ // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
+ // can vectorize it if we can prove it fully vectorizable.
+ if (isFullyVectorizableTinyTree())
+ return false;
+
+ assert(VectorizableTree.empty()
+ ? ExternalUses.empty()
+ : true && "We shouldn't have any external users");
+
+ // Otherwise, we can't vectorize the tree. It is both tiny and not fully
+ // vectorizable.
+ return true;
+}
+
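Taken together, the two predicates above say: trees at or above MinTreeSize are always attempted, while smaller trees must be provably profitable (a single vectorizable entry, a vectorizable root fed by a splat or all-constant operand, or two entries with no gathers). A condensed, LLVM-free model of that decision follows; the TinyTree record and its flag are simplifications for illustration only.

#include <cstdio>
#include <vector>

enum class State { Vectorize, NeedToGather };

// Reduced view of the tree: one state per entry plus a flag telling whether
// the second entry is a splat or all-constant operand of the first.
struct TinyTree {
  std::vector<State> entries;
  bool secondIsSplatOrAllConstant = false;
};

static bool fullyVectorizableTiny(const TinyTree &t) {
  if (t.entries.size() == 1 && t.entries[0] == State::Vectorize)
    return true;
  if (t.entries.size() != 2)
    return false;
  if (t.entries[0] == State::Vectorize && t.secondIsSplatOrAllConstant)
    return true;
  return t.entries[0] != State::NeedToGather &&
         t.entries[1] != State::NeedToGather;
}

static bool tinyAndNotFullyVectorizable(const TinyTree &t,
                                        unsigned minTreeSize) {
  if (t.entries.size() >= minTreeSize)
    return false;                    // large enough: always worth trying
  return !fullyVectorizableTiny(t);  // tiny: must be provably profitable
}

int main() {
  TinyTree splatStore{{State::Vectorize, State::NeedToGather}, true};
  TinyTree gatherPair{{State::Vectorize, State::NeedToGather}, false};
  std::printf("splat store rejected: %d\n",
              tinyAndNotFullyVectorizable(splatStore, 3)); // 0: kept
  std::printf("gather pair rejected: %d\n",
              tinyAndNotFullyVectorizable(gatherPair, 3)); // 1: rejected
  return 0;
}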
InstructionCost BoUpSLP::getSpillCost() const {
- // Walk from the bottom of the tree to the top, tracking which values are
- // live. When we see a call instruction that is not part of our tree,
- // query TTI to see if there is a cost to keeping values live over it
- // (for example, if spills and fills are required).
- unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
+ // Walk from the bottom of the tree to the top, tracking which values are
+ // live. When we see a call instruction that is not part of our tree,
+ // query TTI to see if there is a cost to keeping values live over it
+ // (for example, if spills and fills are required).
+ unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
InstructionCost Cost = 0;
-
- SmallPtrSet<Instruction*, 4> LiveValues;
- Instruction *PrevInst = nullptr;
-
+
+ SmallPtrSet<Instruction*, 4> LiveValues;
+ Instruction *PrevInst = nullptr;
+
// The entries in VectorizableTree are not necessarily ordered by their
// position in basic blocks. Collect them and order them by dominance so later
// instructions are guaranteed to be visited first. For instructions in
@@ -3985,273 +3985,273 @@ InstructionCost BoUpSLP::getSpillCost() const {
// their order does not matter, as long as all instructions in a basic block
// are grouped together. Using dominance ensures a deterministic order.
SmallVector<Instruction *, 16> OrderedScalars;
- for (const auto &TEPtr : VectorizableTree) {
- Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
- if (!Inst)
- continue;
+ for (const auto &TEPtr : VectorizableTree) {
+ Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
+ if (!Inst)
+ continue;
OrderedScalars.push_back(Inst);
}
llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
return DT->dominates(B, A);
});
-
+
for (Instruction *Inst : OrderedScalars) {
- if (!PrevInst) {
- PrevInst = Inst;
- continue;
- }
-
- // Update LiveValues.
- LiveValues.erase(PrevInst);
- for (auto &J : PrevInst->operands()) {
- if (isa<Instruction>(&*J) && getTreeEntry(&*J))
- LiveValues.insert(cast<Instruction>(&*J));
- }
-
- LLVM_DEBUG({
- dbgs() << "SLP: #LV: " << LiveValues.size();
- for (auto *X : LiveValues)
- dbgs() << " " << X->getName();
- dbgs() << ", Looking at ";
- Inst->dump();
- });
-
- // Now find the sequence of instructions between PrevInst and Inst.
- unsigned NumCalls = 0;
- BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
- PrevInstIt =
- PrevInst->getIterator().getReverse();
- while (InstIt != PrevInstIt) {
- if (PrevInstIt == PrevInst->getParent()->rend()) {
- PrevInstIt = Inst->getParent()->rbegin();
- continue;
- }
-
- // Debug information does not impact spill cost.
- if ((isa<CallInst>(&*PrevInstIt) &&
- !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
- &*PrevInstIt != PrevInst)
- NumCalls++;
-
- ++PrevInstIt;
- }
-
- if (NumCalls) {
- SmallVector<Type*, 4> V;
- for (auto *II : LiveValues)
- V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
- }
-
- PrevInst = Inst;
- }
-
- return Cost;
-}
-
+ if (!PrevInst) {
+ PrevInst = Inst;
+ continue;
+ }
+
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "SLP: #LV: " << LiveValues.size();
+ for (auto *X : LiveValues)
+ dbgs() << " " << X->getName();
+ dbgs() << ", Looking at ";
+ Inst->dump();
+ });
+
+ // Now find the sequence of instructions between PrevInst and Inst.
+ unsigned NumCalls = 0;
+ BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
+ PrevInstIt =
+ PrevInst->getIterator().getReverse();
+ while (InstIt != PrevInstIt) {
+ if (PrevInstIt == PrevInst->getParent()->rend()) {
+ PrevInstIt = Inst->getParent()->rbegin();
+ continue;
+ }
+
+ // Debug information does not impact spill cost.
+ if ((isa<CallInst>(&*PrevInstIt) &&
+ !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ &*PrevInstIt != PrevInst)
+ NumCalls++;
+
+ ++PrevInstIt;
+ }
+
+ if (NumCalls) {
+ SmallVector<Type*, 4> V;
+ for (auto *II : LiveValues)
+ V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
+ Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
+ }
+
+ PrevInst = Inst;
+ }
+
+ return Cost;
+}
+
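getSpillCost above walks the tree scalars bottom-up and charges a cost whenever a non-debug call sits between two consecutive tree instructions while tree values are live across it. The sketch below models that walk over a single basic block; charging one unit per live value per call is a simplification of TTI->getCostOfKeepingLiveOverCall, and the Inst record is purely illustrative.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Inst {
  std::string name;
  bool isCall = false;
  std::vector<const Inst *> operands; // operands that are also tree scalars
};

// Walk the block bottom-up (mirroring the dominance-ordered walk above) and
// charge one unit per live tree value for every call between tree members.
static long spillCost(const std::vector<const Inst *> &block,
                      const std::set<const Inst *> &tree) {
  long cost = 0;
  std::set<const Inst *> live;  // tree values live at the current point
  bool seenTreeInst = false;
  for (auto it = block.rbegin(); it != block.rend(); ++it) {
    const Inst *inst = *it;
    if (tree.count(inst)) {
      live.erase(inst);                       // defined here, dead above this
      for (const Inst *op : inst->operands)
        if (tree.count(op))
          live.insert(op);                    // operands stay live above
      seenTreeInst = true;
    } else if (seenTreeInst && inst->isCall) {
      cost += static_cast<long>(live.size()); // values kept live over a call
    }
  }
  return cost;
}

int main() {
  Inst a{"a"}, callInst{"call", true}, b{"b"};
  b.operands = {&a};
  std::vector<const Inst *> block = {&a, &callInst, &b};
  std::printf("spill cost = %ld\n", spillCost(block, {&a, &b})); // prints 1
  return 0;
}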
InstructionCost BoUpSLP::getTreeCost() {
InstructionCost Cost = 0;
- LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
- << VectorizableTree.size() << ".\n");
-
- unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
- for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree[I].get();
-
- // We create duplicate tree entries for gather sequences that have multiple
- // uses. However, we should not compute the cost of duplicate sequences.
- // For example, if we have a build vector (i.e., insertelement sequence)
- // that is used by more than one vector instruction, we only need to
- // compute the cost of the insertelement instructions once. The redundant
- // instructions will be eliminated by CSE.
- //
- // We should consider not creating duplicate tree entries for gather
- // sequences, and instead add additional edges to the tree representing
- // their uses. Since such an approach results in fewer total entries,
- // existing heuristics based on tree size may yield different results.
- //
- if (TE.State == TreeEntry::NeedToGather &&
- std::any_of(std::next(VectorizableTree.begin(), I + 1),
- VectorizableTree.end(),
- [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
- return EntryPtr->State == TreeEntry::NeedToGather &&
- EntryPtr->isSame(TE.Scalars);
- }))
- continue;
-
+ LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+ << VectorizableTree.size() << ".\n");
+
+ unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = *VectorizableTree[I].get();
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.State == TreeEntry::NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(),
+ [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+ return EntryPtr->State == TreeEntry::NeedToGather &&
+ EntryPtr->isSame(TE.Scalars);
+ }))
+ continue;
+
InstructionCost C = getEntryCost(&TE);
Cost += C;
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for bundle that starts with " << *TE.Scalars[0]
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for bundle that starts with " << *TE.Scalars[0]
<< ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
- }
-
- SmallPtrSet<Value *, 16> ExtractCostCalculated;
+ }
+
+ SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
- for (ExternalUser &EU : ExternalUses) {
- // We only add extract cost once for the same scalar.
- if (!ExtractCostCalculated.insert(EU.Scalar).second)
- continue;
-
- // Uses by ephemeral values are free (because the ephemeral value will be
- // removed prior to code generation, and so the extraction will be
- // removed as well).
- if (EphValues.count(EU.User))
- continue;
-
- // If we plan to rewrite the tree in a smaller type, we will need to sign
- // extend the extracted value back to the original type. Here, we account
- // for the extract and the added cost of the sign extend if needed.
- auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto Extend =
- MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
- VecTy = FixedVectorType::get(MinTy, BundleWidth);
- ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
- VecTy, EU.Lane);
- } else {
- ExtractCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
- }
- }
-
+ for (ExternalUser &EU : ExternalUses) {
+ // We only add extract cost once for the same scalar.
+ if (!ExtractCostCalculated.insert(EU.Scalar).second)
+ continue;
+
+ // Uses by ephemeral values are free (because the ephemeral value will be
+ // removed prior to code generation, and so the extraction will be
+ // removed as well).
+ if (EphValues.count(EU.User))
+ continue;
+
+ // If we plan to rewrite the tree in a smaller type, we will need to sign
+ // extend the extracted value back to the original type. Here, we account
+ // for the extract and the added cost of the sign extend if needed.
+ auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto Extend =
+ MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+ VecTy = FixedVectorType::get(MinTy, BundleWidth);
+ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+ VecTy, EU.Lane);
+ } else {
+ ExtractCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+ }
+ }
+
InstructionCost SpillCost = getSpillCost();
- Cost += SpillCost + ExtractCost;
-
+ Cost += SpillCost + ExtractCost;
+
#ifndef NDEBUG
SmallString<256> Str;
- {
+ {
raw_svector_ostream OS(Str);
- OS << "SLP: Spill Cost = " << SpillCost << ".\n"
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
- << "SLP: Total Cost = " << Cost << ".\n";
- }
- LLVM_DEBUG(dbgs() << Str);
- if (ViewSLPTree)
- ViewGraph(this, "SLP" + F->getName(), false, Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ LLVM_DEBUG(dbgs() << Str);
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
-
- return Cost;
-}
-
+
+ return Cost;
+}
+
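getTreeCost above sums the per-entry costs while costing duplicate gather sequences only once, then adds the extract and spill terms. A toy aggregation with plain integers in place of InstructionCost; the string key stands in for the isSame(TE.Scalars) comparison and is an assumption of this sketch.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Entry {
  bool isGather;
  std::string scalarsKey; // stands in for "another gather isSame(TE.Scalars)"
  long cost;
};

// Sum per-entry costs, costing duplicate gather sequences only once (CSE
// removes the redundant ones), then add the extract and spill terms.
static long treeCost(const std::vector<Entry> &entries, long extractCost,
                     long spillCost) {
  long cost = 0;
  std::set<std::string> seenGathers;
  for (const Entry &e : entries) {
    if (e.isGather && !seenGathers.insert(e.scalarsKey).second)
      continue; // duplicate gather: already costed
    cost += e.cost;
  }
  return cost + extractCost + spillCost;
}

int main() {
  std::vector<Entry> entries = {
      {false, "", 4}, {true, "g0", 3}, {true, "g0", 3}};
  // 4 + 3 (second "g0" gather skipped) + extract 2 + spill 1 = 10
  std::printf("tree cost = %ld\n", treeCost(entries, 2, 1));
  return 0;
}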
InstructionCost
BoUpSLP::getGatherCost(FixedVectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
- unsigned NumElts = Ty->getNumElements();
- APInt DemandedElts = APInt::getNullValue(NumElts);
+ unsigned NumElts = Ty->getNumElements();
+ APInt DemandedElts = APInt::getNullValue(NumElts);
for (unsigned I = 0; I < NumElts; ++I)
if (!ShuffledIndices.count(I))
DemandedElts.setBit(I);
InstructionCost Cost =
TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
- if (!ShuffledIndices.empty())
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
- return Cost;
-}
-
+ if (!ShuffledIndices.empty())
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
+ return Cost;
+}
+
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
- // Find the type of the operands in VL.
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- // Find the cost of inserting/extracting values from the vector.
- // Check if the same elements are inserted several times and count them as
- // shuffle candidates.
- DenseSet<unsigned> ShuffledElements;
- DenseSet<Value *> UniqueElements;
-  // Iterate in reverse order to consider insert elements with the highest cost.
- for (unsigned I = VL.size(); I > 0; --I) {
- unsigned Idx = I - 1;
- if (!UniqueElements.insert(VL[Idx]).second)
- ShuffledElements.insert(Idx);
- }
- return getGatherCost(VecTy, ShuffledElements);
-}
-
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const DataLayout &DL,
- ScalarEvolution &SE,
- const BoUpSLP &R) {
- if (VL.empty())
- return;
- VLOperands Ops(VL, DL, SE, R);
- // Reorder the operands in place.
- Ops.reorder();
- Left = Ops.getVL(0);
- Right = Ops.getVL(1);
-}
-
-void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
- // Get the basic block this bundle is in. All instructions in the bundle
- // should be in this block.
- auto *Front = E->getMainOp();
- auto *BB = Front->getParent();
+ // Find the type of the operands in VL.
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ // Find the cost of inserting/extracting values from the vector.
+ // Check if the same elements are inserted several times and count them as
+ // shuffle candidates.
+ DenseSet<unsigned> ShuffledElements;
+ DenseSet<Value *> UniqueElements;
+  // Iterate in reverse order to consider insert elements with the highest cost.
+ for (unsigned I = VL.size(); I > 0; --I) {
+ unsigned Idx = I - 1;
+ if (!UniqueElements.insert(VL[Idx]).second)
+ ShuffledElements.insert(Idx);
+ }
+ return getGatherCost(VecTy, ShuffledElements);
+}
+
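The reverse walk in getGatherCost above keeps the later occurrence of each repeated scalar (the more expensive insertelement lane) and marks earlier repeats as shuffle candidates. Below is a small stand-alone model of just that index bookkeeping, with strings standing in for Values and the actual cost query omitted.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Walk the bundle in reverse so the later (higher, more expensive) lane of
// each repeated value is kept and earlier repeats become shuffle candidates.
static std::set<unsigned> shuffledElements(const std::vector<std::string> &vl) {
  std::set<unsigned> shuffled;
  std::set<std::string> unique;
  for (auto i = vl.size(); i > 0; --i) {
    unsigned idx = static_cast<unsigned>(i - 1);
    if (!unique.insert(vl[idx]).second)
      shuffled.insert(idx); // value reappears later: fetch it via a shuffle
  }
  return shuffled;
}

int main() {
  for (unsigned idx : shuffledElements({"a", "b", "a", "c"}))
    std::printf("shuffled lane %u\n", idx); // prints: shuffled lane 0
  return 0;
}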
+// Perform operand reordering on the instructions in VL and return the reordered
+// operands in Left and Right.
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ const DataLayout &DL,
+ ScalarEvolution &SE,
+ const BoUpSLP &R) {
+ if (VL.empty())
+ return;
+ VLOperands Ops(VL, DL, SE, R);
+ // Reorder the operands in place.
+ Ops.reorder();
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
+}
+
+void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
+ // Get the basic block this bundle is in. All instructions in the bundle
+ // should be in this block.
+ auto *Front = E->getMainOp();
+ auto *BB = Front->getParent();
assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
auto *I = cast<Instruction>(V);
return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
}));
-
- // The last instruction in the bundle in program order.
- Instruction *LastInst = nullptr;
-
- // Find the last instruction. The common case should be that BB has been
- // scheduled, and the last instruction is VL.back(). So we start with
- // VL.back() and iterate over schedule data until we reach the end of the
- // bundle. The end of the bundle is marked by null ScheduleData.
- if (BlocksSchedules.count(BB)) {
- auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
- if (Bundle && Bundle->isPartOfBundle())
- for (; Bundle; Bundle = Bundle->NextInBundle)
- if (Bundle->OpValue == Bundle->Inst)
- LastInst = Bundle->Inst;
- }
-
- // LastInst can still be null at this point if there's either not an entry
- // for BB in BlocksSchedules or there's no ScheduleData available for
- // VL.back(). This can be the case if buildTree_rec aborts for various
- // reasons (e.g., the maximum recursion depth is reached, the maximum region
- // size is reached, etc.). ScheduleData is initialized in the scheduling
- // "dry-run".
- //
- // If this happens, we can still find the last instruction by brute force. We
- // iterate forwards from Front (inclusive) until we either see all
- // instructions in the bundle or reach the end of the block. If Front is the
- // last instruction in program order, LastInst will be set to Front, and we
- // will visit all the remaining instructions in the block.
- //
- // One of the reasons we exit early from buildTree_rec is to place an upper
- // bound on compile-time. Thus, taking an additional compile-time hit here is
- // not ideal. However, this should be exceedingly rare since it requires that
- // we both exit early from buildTree_rec and that the bundle be out-of-order
- // (causing us to iterate all the way to the end of the block).
- if (!LastInst) {
- SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
- for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
- LastInst = &I;
- if (Bundle.empty())
- break;
- }
- }
- assert(LastInst && "Failed to find last instruction in bundle");
-
- // Set the insertion point after the last instruction in the bundle. Set the
- // debug location to Front.
- Builder.SetInsertPoint(BB, ++LastInst->getIterator());
- Builder.SetCurrentDebugLocation(Front->getDebugLoc());
-}
-
+
+ // The last instruction in the bundle in program order.
+ Instruction *LastInst = nullptr;
+
+ // Find the last instruction. The common case should be that BB has been
+ // scheduled, and the last instruction is VL.back(). So we start with
+ // VL.back() and iterate over schedule data until we reach the end of the
+ // bundle. The end of the bundle is marked by null ScheduleData.
+ if (BlocksSchedules.count(BB)) {
+ auto *Bundle =
+ BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+ if (Bundle && Bundle->isPartOfBundle())
+ for (; Bundle; Bundle = Bundle->NextInBundle)
+ if (Bundle->OpValue == Bundle->Inst)
+ LastInst = Bundle->Inst;
+ }
+
+ // LastInst can still be null at this point if there's either not an entry
+ // for BB in BlocksSchedules or there's no ScheduleData available for
+ // VL.back(). This can be the case if buildTree_rec aborts for various
+ // reasons (e.g., the maximum recursion depth is reached, the maximum region
+ // size is reached, etc.). ScheduleData is initialized in the scheduling
+ // "dry-run".
+ //
+ // If this happens, we can still find the last instruction by brute force. We
+ // iterate forwards from Front (inclusive) until we either see all
+ // instructions in the bundle or reach the end of the block. If Front is the
+ // last instruction in program order, LastInst will be set to Front, and we
+ // will visit all the remaining instructions in the block.
+ //
+ // One of the reasons we exit early from buildTree_rec is to place an upper
+ // bound on compile-time. Thus, taking an additional compile-time hit here is
+ // not ideal. However, this should be exceedingly rare since it requires that
+ // we both exit early from buildTree_rec and that the bundle be out-of-order
+ // (causing us to iterate all the way to the end of the block).
+ if (!LastInst) {
+ SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
+ for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
+ if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
+ LastInst = &I;
+ if (Bundle.empty())
+ break;
+ }
+ }
+ assert(LastInst && "Failed to find last instruction in bundle");
+
+ // Set the insertion point after the last instruction in the bundle. Set the
+ // debug location to Front.
+ Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+ Builder.SetCurrentDebugLocation(Front->getDebugLoc());
+}
+
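When no schedule data is available, setInsertPointAfterBundle above falls back to a forward scan from the first bundle member, remembering the last member it sees and inserting right after it. The following LLVM-free sketch shows only that fallback on a block of named instructions; the isOpcodeOrAlt filter is omitted for brevity.

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Scan forward from the first bundle member and remember the last bundle
// member seen; the vectorized bundle is inserted right after it.
static std::size_t lastBundleIndex(const std::vector<std::string> &block,
                                   std::set<std::string> bundle,
                                   std::size_t frontIndex) {
  std::size_t last = frontIndex;
  for (std::size_t i = frontIndex; i < block.size() && !bundle.empty(); ++i)
    if (bundle.erase(block[i]))
      last = i; // latest bundle member encountered so far
  return last;  // insertion point goes after block[last]
}

int main() {
  std::vector<std::string> block = {"x", "a", "y", "b", "z"};
  std::size_t last = lastBundleIndex(block, {"a", "b"}, 1);
  assert(last == 3);
  std::printf("insert after block[%zu] = %s\n", last, block[last].c_str());
  return 0;
}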
Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
Value *Val0 =
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
@@ -4274,337 +4274,337 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
if (!Entry->ReuseShuffleIndices.empty()) {
FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
find(Entry->ReuseShuffleIndices, FoundLane));
- }
+ }
ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
- }
- }
-
- return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- InstructionsState S = getSameOpcode(VL);
- if (S.getOpcode()) {
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- if (E->isSame(VL)) {
- Value *V = vectorizeTree(E);
- if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
- // We need to get the vectorized value but without shuffle.
- if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
- V = SV->getOperand(0);
- } else {
- // Reshuffle to get only unique values.
- SmallVector<int, 4> UniqueIdxs;
- SmallSet<int, 4> UsedIdxs;
- for (int Idx : E->ReuseShuffleIndices)
- if (UsedIdxs.insert(Idx).second)
- UniqueIdxs.emplace_back(Idx);
+ }
+ }
+
+ return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+ InstructionsState S = getSameOpcode(VL);
+ if (S.getOpcode()) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ if (E->isSame(VL)) {
+ Value *V = vectorizeTree(E);
+ if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
+ // We need to get the vectorized value but without shuffle.
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ V = SV->getOperand(0);
+ } else {
+ // Reshuffle to get only unique values.
+ SmallVector<int, 4> UniqueIdxs;
+ SmallSet<int, 4> UsedIdxs;
+ for (int Idx : E->ReuseShuffleIndices)
+ if (UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Idx);
V = Builder.CreateShuffleVector(V, UniqueIdxs);
- }
- }
- return V;
- }
- }
- }
-
- // Check that every instruction appears once in this bundle.
- SmallVector<int, 4> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
- if (VL.size() > 2) {
- DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second || isa<Constant>(V))
- UniqueValues.emplace_back(V);
- }
-    // Do not shuffle a single element or if the number of unique values is
-    // not a power of 2.
- if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
- !llvm::isPowerOf2_32(UniqueValues.size()))
- ReuseShuffleIndicies.clear();
- else
- VL = UniqueValues;
- }
-
+ }
+ }
+ return V;
+ }
+ }
+ }
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<int, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ if (VL.size() > 2) {
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second || isa<Constant>(V))
+ UniqueValues.emplace_back(V);
+ }
+    // Do not shuffle a single element or if the number of unique values is
+    // not a power of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
+ !llvm::isPowerOf2_32(UniqueValues.size()))
+ ReuseShuffleIndicies.clear();
+ else
+ VL = UniqueValues;
+ }
+
Value *Vec = gather(VL);
- if (!ReuseShuffleIndicies.empty()) {
+ if (!ReuseShuffleIndicies.empty()) {
Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- }
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
- IRBuilder<>::InsertPointGuard Guard(Builder);
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
- return E->VectorizedValue;
- }
-
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- if (E->State == TreeEntry::NeedToGather) {
- setInsertPointAfterBundle(E);
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ return E->VectorizedValue;
+ }
+
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ if (E->State == TreeEntry::NeedToGather) {
+ setInsertPointAfterBundle(E);
Value *Vec = gather(E->Scalars);
- if (NeedToShuffleReuses) {
+ if (NeedToShuffleReuses) {
Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle");
if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- }
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = Vec;
return Vec;
- }
-
+ }
+
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
- unsigned ShuffleOrOp =
- E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
Instruction *VL0 = E->getMainOp();
Type *ScalarTy = VL0->getType();
if (auto *Store = dyn_cast<StoreInst>(VL0))
ScalarTy = Store->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
- switch (ShuffleOrOp) {
- case Instruction::PHI: {
- auto *PH = cast<PHINode>(VL0);
- Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
- Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- Value *V = NewPhi;
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ auto *PH = cast<PHINode>(VL0);
+ Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ Value *V = NewPhi;
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
-
- // PHINodes may have multiple entries from the same block. We want to
- // visit every block once.
- SmallPtrSet<BasicBlock*, 4> VisitedBBs;
-
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
- ValueList Operands;
- BasicBlock *IBB = PH->getIncomingBlock(i);
-
- if (!VisitedBBs.insert(IBB).second) {
- NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
- continue;
- }
-
- Builder.SetInsertPoint(IBB->getTerminator());
- Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeTree(E->getOperand(i));
- NewPhi->addIncoming(Vec, IBB);
- }
-
- assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
- "Invalid number of incoming values");
- return V;
- }
-
- case Instruction::ExtractElement: {
- Value *V = E->getSingleOperand(0);
- if (!E->ReorderIndices.empty()) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
- Builder.SetInsertPoint(VL0);
+ E->VectorizedValue = V;
+
+ // PHINodes may have multiple entries from the same block. We want to
+ // visit every block once.
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ BasicBlock *IBB = PH->getIncomingBlock(i);
+
+ if (!VisitedBBs.insert(IBB).second) {
+ NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+ continue;
+ }
+
+ Builder.SetInsertPoint(IBB->getTerminator());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ Value *Vec = vectorizeTree(E->getOperand(i));
+ NewPhi->addIncoming(Vec, IBB);
+ }
+
+ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+ "Invalid number of incoming values");
+ return V;
+ }
+
+ case Instruction::ExtractElement: {
+ Value *V = E->getSingleOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- if (E->ReorderIndices.empty())
- Builder.SetInsertPoint(VL0);
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = V;
- return V;
- }
- case Instruction::ExtractValue: {
+ }
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ExtractValue: {
auto *LI = cast<LoadInst>(E->getSingleOperand(0));
- Builder.SetInsertPoint(LI);
+ Builder.SetInsertPoint(LI);
auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
- Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
- LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
- Value *NewV = propagateMetadata(V, E->Scalars);
- if (!E->ReorderIndices.empty()) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices,
"shuffle");
- }
- E->VectorizedValue = NewV;
- return NewV;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- setInsertPointAfterBundle(E);
-
- Value *InVec = vectorizeTree(E->getOperand(0));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- auto *CI = cast<CastInst>(VL0);
- Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ }
+ E->VectorizedValue = NewV;
+ return NewV;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ setInsertPointAfterBundle(E);
+
+ Value *InVec = vectorizeTree(E->getOperand(0));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ auto *CI = cast<CastInst>(VL0);
+ Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::FCmp:
- case Instruction::ICmp: {
- setInsertPointAfterBundle(E);
-
- Value *L = vectorizeTree(E->getOperand(0));
- Value *R = vectorizeTree(E->getOperand(1));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Value *V = Builder.CreateCmp(P0, L, R);
- propagateIRFlags(V, E->Scalars, VL0);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp: {
+ setInsertPointAfterBundle(E);
+
+ Value *L = vectorizeTree(E->getOperand(0));
+ Value *R = vectorizeTree(E->getOperand(1));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Value *V = Builder.CreateCmp(P0, L, R);
+ propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::Select: {
- setInsertPointAfterBundle(E);
-
- Value *Cond = vectorizeTree(E->getOperand(0));
- Value *True = vectorizeTree(E->getOperand(1));
- Value *False = vectorizeTree(E->getOperand(2));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V = Builder.CreateSelect(Cond, True, False);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Select: {
+ setInsertPointAfterBundle(E);
+
+ Value *Cond = vectorizeTree(E->getOperand(0));
+ Value *True = vectorizeTree(E->getOperand(1));
+ Value *False = vectorizeTree(E->getOperand(2));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateSelect(Cond, True, False);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::FNeg: {
- setInsertPointAfterBundle(E);
-
- Value *Op = vectorizeTree(E->getOperand(0));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V = Builder.CreateUnOp(
- static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
- propagateIRFlags(V, E->Scalars, VL0);
- if (auto *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
-
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FNeg: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op = vectorizeTree(E->getOperand(0));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateUnOp(
+ static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- setInsertPointAfterBundle(E);
-
- Value *LHS = vectorizeTree(E->getOperand(0));
- Value *RHS = vectorizeTree(E->getOperand(1));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
- RHS);
- propagateIRFlags(V, E->Scalars, VL0);
- if (auto *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
-
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ setInsertPointAfterBundle(E);
+
+ Value *LHS = vectorizeTree(E->getOperand(0));
+ Value *RHS = vectorizeTree(E->getOperand(1));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
+ RHS);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- case Instruction::Load: {
- // Loads are inserted at the head of the tree because we don't want to
- // sink them all the way down past store instructions.
- bool IsReorder = E->updateStateIfReorder();
- if (IsReorder)
- VL0 = E->getMainOp();
- setInsertPointAfterBundle(E);
-
- LoadInst *LI = cast<LoadInst>(VL0);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ bool IsReorder = E->updateStateIfReorder();
+ if (IsReorder)
+ VL0 = E->getMainOp();
+ setInsertPointAfterBundle(E);
+
+ LoadInst *LI = cast<LoadInst>(VL0);
Instruction *NewLI;
- unsigned AS = LI->getPointerAddressSpace();
+ unsigned AS = LI->getPointerAddressSpace();
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
-
+
Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
-
+
// The pointer operand uses an in-tree scalar so we add the new BitCast
// to ExternalUses list to make sure that an extract will be generated
// in the future.
if (getTreeEntry(PO))
ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
-
+
NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
@@ -4618,922 +4618,922 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = propagateMetadata(NewLI, E->Scalars);
- if (IsReorder) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
+ if (IsReorder) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::Store: {
- bool IsReorder = !E->ReorderIndices.empty();
- auto *SI = cast<StoreInst>(
- IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
- unsigned AS = SI->getPointerAddressSpace();
-
- setInsertPointAfterBundle(E);
-
- Value *VecValue = vectorizeTree(E->getOperand(0));
- if (IsReorder) {
- SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
- E->ReorderIndices.end());
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Store: {
+ bool IsReorder = !E->ReorderIndices.empty();
+ auto *SI = cast<StoreInst>(
+ IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
+ unsigned AS = SI->getPointerAddressSpace();
+
+ setInsertPointAfterBundle(E);
+
+ Value *VecValue = vectorizeTree(E->getOperand(0));
+ if (IsReorder) {
+ SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");
- }
- Value *ScalarPtr = SI->getPointerOperand();
- Value *VecPtr = Builder.CreateBitCast(
- ScalarPtr, VecValue->getType()->getPointerTo(AS));
- StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
- SI->getAlign());
-
- // The pointer operand uses an in-tree scalar, so add the new BitCast to
- // ExternalUses to make sure that an extract will be generated in the
- // future.
- if (getTreeEntry(ScalarPtr))
- ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
-
- Value *V = propagateMetadata(ST, E->Scalars);
+ }
+ Value *ScalarPtr = SI->getPointerOperand();
+ Value *VecPtr = Builder.CreateBitCast(
+ ScalarPtr, VecValue->getType()->getPointerTo(AS));
+ StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
+ SI->getAlign());
+
+ // The pointer operand uses an in-tree scalar, so add the new BitCast to
+ // ExternalUses to make sure that an extract will be generated in the
+ // future.
+ if (getTreeEntry(ScalarPtr))
+ ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+
+ Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E);
-
- Value *Op0 = vectorizeTree(E->getOperand(0));
-
- std::vector<Value *> OpVecs;
- for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
- ++j) {
- ValueList &VL = E->getOperand(j);
- // Need to cast all elements to the same type before vectorization to
-        // avoid a crash.
- Type *VL0Ty = VL0->getOperand(j)->getType();
- Type *Ty = llvm::all_of(
- VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
- ? VL0Ty
- : DL->getIndexType(cast<GetElementPtrInst>(VL0)
- ->getPointerOperandType()
- ->getScalarType());
- for (Value *&V : VL) {
- auto *CI = cast<ConstantInt>(V);
- V = ConstantExpr::getIntegerCast(CI, Ty,
- CI->getValue().isSignBitSet());
- }
- Value *OpVec = vectorizeTree(VL);
- OpVecs.push_back(OpVec);
- }
-
- Value *V = Builder.CreateGEP(
- cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
- if (Instruction *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
-
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op0 = vectorizeTree(E->getOperand(0));
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ ValueList &VL = E->getOperand(j);
+ // Need to cast all elements to the same type before vectorization to
+        // avoid a crash.
+ Type *VL0Ty = VL0->getOperand(j)->getType();
+ Type *Ty = llvm::all_of(
+ VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
+ ? VL0Ty
+ : DL->getIndexType(cast<GetElementPtrInst>(VL0)
+ ->getPointerOperandType()
+ ->getScalarType());
+ for (Value *&V : VL) {
+ auto *CI = cast<ConstantInt>(V);
+ V = ConstantExpr::getIntegerCast(CI, Ty,
+ CI->getValue().isSignBitSet());
+ }
+ Value *OpVec = vectorizeTree(VL);
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E);
-
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
- if (Function *FI = CI->getCalledFunction())
- IID = FI->getIntrinsicID();
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
- bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
- VecCallCosts.first <= VecCallCosts.second;
-
- Value *ScalarArg = nullptr;
- std::vector<Value *> OpVecs;
- for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
- ValueList OpVL;
- // Some intrinsics have scalar arguments. This argument should not be
- // vectorized.
- if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
- CallInst *CEI = cast<CallInst>(VL0);
- ScalarArg = CEI->getArgOperand(j);
- OpVecs.push_back(CEI->getArgOperand(j));
- continue;
- }
-
- Value *OpVec = vectorizeTree(E->getOperand(j));
- LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
- OpVecs.push_back(OpVec);
- }
-
- Function *CF;
- if (!UseIntrinsic) {
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ setInsertPointAfterBundle(E);
+
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (Function *FI = CI->getCalledFunction())
+ IID = FI->getIntrinsicID();
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
+ VecCallCosts.first <= VecCallCosts.second;
+
+ Value *ScalarArg = nullptr;
+ std::vector<Value *> OpVecs;
+ for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ ValueList OpVL;
+ // Some intrinsics have scalar arguments. This argument should not be
+ // vectorized.
+ if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
+ CallInst *CEI = cast<CallInst>(VL0);
+ ScalarArg = CEI->getArgOperand(j);
+ OpVecs.push_back(CEI->getArgOperand(j));
+ continue;
+ }
+
+ Value *OpVec = vectorizeTree(E->getOperand(j));
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ OpVecs.push_back(OpVec);
+ }
+
+ Function *CF;
+ if (!UseIntrinsic) {
VFShape Shape =
VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
VecTy->getNumElements())),
false /*HasGlobalPred*/);
- CF = VFDatabase(*CI).getVectorizedFunction(Shape);
- } else {
- Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
- CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
- }
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
-
- // The scalar argument uses an in-tree scalar so we add the new vectorized
- // call to ExternalUses list to make sure that an extract will be
- // generated in the future.
- if (ScalarArg && getTreeEntry(ScalarArg))
- ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
-
- propagateIRFlags(V, E->Scalars, VL0);
+ CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ } else {
+ Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+
+ // The scalar argument uses an in-tree scalar so we add the new vectorized
+      // call to the ExternalUses list to make sure that an extract will be
+ // generated in the future.
+ if (ScalarArg && getTreeEntry(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
+ propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::ShuffleVector: {
- assert(E->isAltShuffle() &&
- ((Instruction::isBinaryOp(E->getOpcode()) &&
- Instruction::isBinaryOp(E->getAltOpcode())) ||
- (Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
- "Invalid Shuffle Vector Operand");
-
- Value *LHS = nullptr, *RHS = nullptr;
- if (Instruction::isBinaryOp(E->getOpcode())) {
- setInsertPointAfterBundle(E);
- LHS = vectorizeTree(E->getOperand(0));
- RHS = vectorizeTree(E->getOperand(1));
- } else {
- setInsertPointAfterBundle(E);
- LHS = vectorizeTree(E->getOperand(0));
- }
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V0, *V1;
- if (Instruction::isBinaryOp(E->getOpcode())) {
- V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
- V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
- } else {
- V0 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
- V1 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
- }
-
- // Create shuffle to take alternate operations from the vector.
- // Also, gather up main and alt scalar ops to propagate IR flags to
- // each vector operation.
- ValueList OpScalars, AltScalars;
- unsigned e = E->Scalars.size();
- SmallVector<int, 8> Mask(e);
- for (unsigned i = 0; i < e; ++i) {
- auto *OpInst = cast<Instruction>(E->Scalars[i]);
- assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
- if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[i] = e + i;
- AltScalars.push_back(E->Scalars[i]);
- } else {
- Mask[i] = i;
- OpScalars.push_back(E->Scalars[i]);
- }
- }
-
- propagateIRFlags(V0, OpScalars);
- propagateIRFlags(V1, AltScalars);
-
- Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
- if (Instruction *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::ShuffleVector: {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
+
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ setInsertPointAfterBundle(E);
+ LHS = vectorizeTree(E->getOperand(0));
+ RHS = vectorizeTree(E->getOperand(1));
+ } else {
+ setInsertPointAfterBundle(E);
+ LHS = vectorizeTree(E->getOperand(0));
+ }
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+ } else {
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
+ }
+
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up main and alt scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OpScalars, AltScalars;
+ unsigned e = E->Scalars.size();
+ SmallVector<int, 8> Mask(e);
+ for (unsigned i = 0; i < e; ++i) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == E->getAltOpcode()) {
+ Mask[i] = e + i;
+ AltScalars.push_back(E->Scalars[i]);
+ } else {
+ Mask[i] = i;
+ OpScalars.push_back(E->Scalars[i]);
+ }
+ }
+
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- default:
- llvm_unreachable("unknown inst");
- }
- return nullptr;
-}
-
-Value *BoUpSLP::vectorizeTree() {
- ExtraValueToDebugLocsMap ExternallyUsedValues;
- return vectorizeTree(ExternallyUsedValues);
-}
-
-Value *
-BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
- // All blocks must be scheduled before any instructions are inserted.
- for (auto &BSIter : BlocksSchedules) {
- scheduleBlock(BSIter.second.get());
- }
-
- Builder.SetInsertPoint(&F->getEntryBlock().front());
- auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
-
- // If the vectorized tree can be rewritten in a smaller type, we truncate the
- // vectorized root. InstCombine will then rewrite the entire expression. We
- // sign extend the extracted values below.
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- if (auto *I = dyn_cast<Instruction>(VectorRoot))
- Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
- auto BundleWidth = VectorizableTree[0]->Scalars.size();
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
- auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
- VectorizableTree[0]->VectorizedValue = Trunc;
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
- << " values .\n");
-
- // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
- // specified by ScalarType.
- auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
- if (!MinBWs.count(ScalarRoot))
- return Ex;
- if (MinBWs[ScalarRoot].second)
- return Builder.CreateSExt(Ex, ScalarType);
- return Builder.CreateZExt(Ex, ScalarType);
- };
-
- // Extract all of the elements with the external uses.
- for (const auto &ExternalUse : ExternalUses) {
- Value *Scalar = ExternalUse.Scalar;
- llvm::User *User = ExternalUse.User;
-
- // Skip users that we already RAUW. This happens when one instruction
- // has multiple uses of the same value.
- if (User && !is_contained(Scalar->users(), User))
- continue;
- TreeEntry *E = getTreeEntry(Scalar);
- assert(E && "Invalid scalar");
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ default:
+ llvm_unreachable("unknown inst");
+ }
+ return nullptr;
+}
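
Illustrative sketch (not part of this commit): in the ShuffleVector case above, the mask selects lane i from V0 when scalar i uses the main opcode and lane e + i from V1 when it uses the alternate opcode. A standalone toy model of that mask construction, with made-up lane data:

#include <cstdio>
#include <vector>

int main() {
  // Toy stand-in for E->Scalars: true means the lane uses the alternate opcode.
  std::vector<bool> IsAltOp = {false, true, false, true};
  unsigned e = IsAltOp.size();
  std::vector<int> Mask(e);
  for (unsigned i = 0; i < e; ++i)
    // Lane i comes from V0 (mask value i) for the main opcode and from V1
    // (mask value e + i) for the alternate opcode.
    Mask[i] = IsAltOp[i] ? static_cast<int>(e + i) : static_cast<int>(i);
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 5 2 7
  std::printf("\n");
  return 0;
}
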
+
+Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
+ // All blocks must be scheduled before any instructions are inserted.
+ for (auto &BSIter : BlocksSchedules) {
+ scheduleBlock(BSIter.second.get());
+ }
+
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+
+ // If the vectorized tree can be rewritten in a smaller type, we truncate the
+ // vectorized root. InstCombine will then rewrite the entire expression. We
+ // sign extend the extracted values below.
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ if (auto *I = dyn_cast<Instruction>(VectorRoot))
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ auto BundleWidth = VectorizableTree[0]->Scalars.size();
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
+ auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
+ VectorizableTree[0]->VectorizedValue = Trunc;
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+ << " values .\n");
+
+ // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
+ // specified by ScalarType.
+ auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
+ if (!MinBWs.count(ScalarRoot))
+ return Ex;
+ if (MinBWs[ScalarRoot].second)
+ return Builder.CreateSExt(Ex, ScalarType);
+ return Builder.CreateZExt(Ex, ScalarType);
+ };
+
+ // Extract all of the elements with the external uses.
+ for (const auto &ExternalUse : ExternalUses) {
+ Value *Scalar = ExternalUse.Scalar;
+ llvm::User *User = ExternalUse.User;
+
+ // Skip users that we already RAUW. This happens when one instruction
+ // has multiple uses of the same value.
+ if (User && !is_contained(Scalar->users(), User))
+ continue;
+ TreeEntry *E = getTreeEntry(Scalar);
+ assert(E && "Invalid scalar");
assert(E->State != TreeEntry::NeedToGather &&
"Extracting from a gather list");
-
- Value *Vec = E->VectorizedValue;
- assert(Vec && "Can't find vectorizable value");
-
- Value *Lane = Builder.getInt32(ExternalUse.Lane);
-    // If User == nullptr, the Scalar is used as an extra argument. Generate
-    // an ExtractElement instruction and update the record for this scalar in
- // ExternallyUsedValues.
- if (!User) {
- assert(ExternallyUsedValues.count(Scalar) &&
- "Scalar with nullptr as an external user must be registered in "
- "ExternallyUsedValues map");
- if (auto *VecI = dyn_cast<Instruction>(Vec)) {
- Builder.SetInsertPoint(VecI->getParent(),
- std::next(VecI->getIterator()));
- } else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
- }
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
- auto &Locs = ExternallyUsedValues[Scalar];
- ExternallyUsedValues.insert({Ex, Locs});
- ExternallyUsedValues.erase(Scalar);
- // Required to update internally referenced instructions.
- Scalar->replaceAllUsesWith(Ex);
- continue;
- }
-
- // Generate extracts for out-of-tree users.
- // Find the insertion point for the extractelement lane.
- if (auto *VecI = dyn_cast<Instruction>(Vec)) {
- if (PHINode *PH = dyn_cast<PHINode>(User)) {
- for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
- if (PH->getIncomingValue(i) == Scalar) {
- Instruction *IncomingTerminator =
- PH->getIncomingBlock(i)->getTerminator();
- if (isa<CatchSwitchInst>(IncomingTerminator)) {
- Builder.SetInsertPoint(VecI->getParent(),
- std::next(VecI->getIterator()));
- } else {
- Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
- }
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(PH->getIncomingBlock(i));
- PH->setOperand(i, Ex);
- }
- }
- } else {
- Builder.SetInsertPoint(cast<Instruction>(User));
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(cast<Instruction>(User)->getParent());
- User->replaceUsesOfWith(Scalar, Ex);
- }
- } else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(&F->getEntryBlock());
- User->replaceUsesOfWith(Scalar, Ex);
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
- }
-
- // For each vectorized value:
- for (auto &TEPtr : VectorizableTree) {
- TreeEntry *Entry = TEPtr.get();
-
- // No need to handle users of gathered values.
- if (Entry->State == TreeEntry::NeedToGather)
- continue;
-
- assert(Entry->VectorizedValue && "Can't find vectorizable value");
-
- // For each lane:
- for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
- Value *Scalar = Entry->Scalars[Lane];
-
-#ifndef NDEBUG
- Type *Ty = Scalar->getType();
- if (!Ty->isVoidTy()) {
- for (User *U : Scalar->users()) {
- LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
- // It is legal to delete users in the ignorelist.
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
- "Deleting out-of-tree value");
- }
- }
-#endif
- LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
- eraseInstruction(cast<Instruction>(Scalar));
- }
- }
-
- Builder.ClearInsertionPoint();
- InstrElementSize.clear();
-
- return VectorizableTree[0]->VectorizedValue;
-}
-
-void BoUpSLP::optimizeGatherSequence() {
- LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
- << " gather sequences instructions.\n");
- // LICM InsertElementInst sequences.
- for (Instruction *I : GatherSeq) {
- if (isDeleted(I))
- continue;
-
- // Check if this block is inside a loop.
- Loop *L = LI->getLoopFor(I->getParent());
- if (!L)
- continue;
-
- // Check if it has a preheader.
- BasicBlock *PreHeader = L->getLoopPreheader();
- if (!PreHeader)
- continue;
-
-    // If the vector, or the element that we insert into it, is an
-    // instruction defined in this basic block, then we can't hoist
-    // this instruction.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (Op0 && L->contains(Op0))
- continue;
- if (Op1 && L->contains(Op1))
- continue;
-
- // We can hoist this instruction. Move it to the pre-header.
- I->moveBefore(PreHeader->getTerminator());
- }
-
- // Make a list of all reachable blocks in our CSE queue.
- SmallVector<const DomTreeNode *, 8> CSEWorkList;
- CSEWorkList.reserve(CSEBlocks.size());
- for (BasicBlock *BB : CSEBlocks)
- if (DomTreeNode *N = DT->getNode(BB)) {
- assert(DT->isReachableFromEntry(N));
- CSEWorkList.push_back(N);
- }
-
- // Sort blocks by domination. This ensures we visit a block after all blocks
- // dominating it are visited.
- llvm::stable_sort(CSEWorkList,
- [this](const DomTreeNode *A, const DomTreeNode *B) {
- return DT->properlyDominates(A, B);
- });
-
- // Perform O(N^2) search over the gather sequences and merge identical
- // instructions. TODO: We can further optimize this scan if we split the
- // instructions into different buckets based on the insert lane.
- SmallVector<Instruction *, 16> Visited;
- for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+
+ Value *Vec = E->VectorizedValue;
+ assert(Vec && "Can't find vectorizable value");
+
+ Value *Lane = Builder.getInt32(ExternalUse.Lane);
+    // If User == nullptr, the Scalar is used as an extra argument. Generate
+    // an ExtractElement instruction and update the record for this scalar in
+ // ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ // Required to update internally referenced instructions.
+ Scalar->replaceAllUsesWith(Ex);
+ continue;
+ }
+
+ // Generate extracts for out-of-tree users.
+ // Find the insertion point for the extractelement lane.
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ if (PHINode *PH = dyn_cast<PHINode>(User)) {
+ for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
+ if (PH->getIncomingValue(i) == Scalar) {
+ Instruction *IncomingTerminator =
+ PH->getIncomingBlock(i)->getTerminator();
+ if (isa<CatchSwitchInst>(IncomingTerminator)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(PH->getIncomingBlock(i));
+ PH->setOperand(i, Ex);
+ }
+ }
+ } else {
+ Builder.SetInsertPoint(cast<Instruction>(User));
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(&F->getEntryBlock());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ }
+
+ // For each vectorized value:
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
+
+ // No need to handle users of gathered values.
+ if (Entry->State == TreeEntry::NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+
+#ifndef NDEBUG
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+ // It is legal to delete users in the ignorelist.
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+ "Deleting out-of-tree value");
+ }
+ }
+#endif
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ eraseInstruction(cast<Instruction>(Scalar));
+ }
+ }
+
+ Builder.ClearInsertionPoint();
+ InstrElementSize.clear();
+
+ return VectorizableTree[0]->VectorizedValue;
+}
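
Illustrative sketch (not part of this commit): the extend lambda above widens each extracted scalar back to its original type, using sext when MinBWs recorded that the sign bit may be set and zext otherwise. A toy scalar model of that choice; the 8-bit and 32-bit widths are made up:

#include <cstdint>
#include <cstdio>

// 'SignBitNeeded' plays the role of MinBWs[ScalarRoot].second: the value was
// computed in a narrow width and must be widened back without losing its sign.
static int32_t extendBack(uint8_t Narrow, bool SignBitNeeded) {
  if (SignBitNeeded)
    return static_cast<int32_t>(static_cast<int8_t>(Narrow)); // sext
  return static_cast<int32_t>(Narrow);                        // zext
}

int main() {
  std::printf("%d %d\n", extendBack(0xF0, true), extendBack(0xF0, false));
  // prints: -16 240
  return 0;
}
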
+
+void BoUpSLP::optimizeGatherSequence() {
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
+ // LICM InsertElementInst sequences.
+ for (Instruction *I : GatherSeq) {
+ if (isDeleted(I))
+ continue;
+
+ // Check if this block is inside a loop.
+ Loop *L = LI->getLoopFor(I->getParent());
+ if (!L)
+ continue;
+
+ // Check if it has a preheader.
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ if (!PreHeader)
+ continue;
+
+    // If the vector, or the element that we insert into it, is an
+    // instruction defined in this basic block, then we can't hoist
+    // this instruction.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (Op0 && L->contains(Op0))
+ continue;
+ if (Op1 && L->contains(Op1))
+ continue;
+
+ // We can hoist this instruction. Move it to the pre-header.
+ I->moveBefore(PreHeader->getTerminator());
+ }
+
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ llvm::stable_sort(CSEWorkList,
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
+
+ // Perform O(N^2) search over the gather sequences and merge identical
+ // instructions. TODO: We can further optimize this scan if we split the
+ // instructions into different buckets based on the insert lane.
+ SmallVector<Instruction *, 16> Visited;
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
assert(*I &&
(I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
- "Worklist not sorted properly!");
- BasicBlock *BB = (*I)->getBlock();
- // For all instructions in blocks containing gather sequences:
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
- Instruction *In = &*it++;
- if (isDeleted(In))
- continue;
- if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
- continue;
-
- // Check if we can replace this instruction with any of the
- // visited instructions.
- for (Instruction *v : Visited) {
- if (In->isIdenticalTo(v) &&
- DT->dominates(v->getParent(), In->getParent())) {
- In->replaceAllUsesWith(v);
- eraseInstruction(In);
- In = nullptr;
- break;
- }
- }
- if (In) {
- assert(!is_contained(Visited, In));
- Visited.push_back(In);
- }
- }
- }
- CSEBlocks.clear();
- GatherSeq.clear();
-}
-
-// Groups the instructions into a bundle (which is then a single scheduling
-// entity) and schedules instructions until the bundle becomes ready.
-Optional<BoUpSLP::ScheduleData *>
-BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S) {
- if (isa<PHINode>(S.OpValue))
- return nullptr;
-
- // Initialize the instruction bundle.
- Instruction *OldScheduleEnd = ScheduleEnd;
- ScheduleData *PrevInBundle = nullptr;
- ScheduleData *Bundle = nullptr;
- bool ReSchedule = false;
- LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
-
- // Make sure that the scheduling region contains all
- // instructions of the bundle.
- for (Value *V : VL) {
- if (!extendSchedulingRegion(V, S))
- return None;
- }
-
- for (Value *V : VL) {
- ScheduleData *BundleMember = getScheduleData(V);
- assert(BundleMember &&
- "no ScheduleData for bundle member (maybe not in same basic block)");
- if (BundleMember->IsScheduled) {
-      // A bundle member was scheduled as a single instruction before and now
- // needs to be scheduled as part of the bundle. We just get rid of the
- // existing schedule.
- LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
- ReSchedule = true;
- }
- assert(BundleMember->isSchedulingEntity() &&
- "bundle member already part of other bundle");
- if (PrevInBundle) {
- PrevInBundle->NextInBundle = BundleMember;
- } else {
- Bundle = BundleMember;
- }
- BundleMember->UnscheduledDepsInBundle = 0;
- Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
-
-    // Group the instructions into a bundle.
- BundleMember->FirstInBundle = Bundle;
- PrevInBundle = BundleMember;
- }
- if (ScheduleEnd != OldScheduleEnd) {
- // The scheduling region got new instructions at the lower end (or it is a
- // new region for the first bundle). This makes it necessary to
- // recalculate all dependencies.
- // It is seldom that this needs to be done a second time after adding the
- // initial bundle to the region.
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- doForAllOpcodes(I, [](ScheduleData *SD) {
- SD->clearDependencies();
- });
- }
- ReSchedule = true;
- }
- if (ReSchedule) {
- resetSchedule();
- initialFillReadyList(ReadyInsts);
- }
- assert(Bundle && "Failed to find schedule bundle");
-
- LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
- << BB->getName() << "\n");
-
- calculateDependencies(Bundle, true, SLP);
-
- // Now try to schedule the new bundle. As soon as the bundle is "ready" it
- // means that there are no cyclic dependencies and we can schedule it.
-  // Note that it's important that we don't "schedule" the bundle yet (see
- // cancelScheduling).
- while (!Bundle->isReady() && !ReadyInsts.empty()) {
-
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = &*it++;
+ if (isDeleted(In))
+ continue;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ for (Instruction *v : Visited) {
+ if (In->isIdenticalTo(v) &&
+ DT->dominates(v->getParent(), In->getParent())) {
+ In->replaceAllUsesWith(v);
+ eraseInstruction(In);
+ In = nullptr;
+ break;
+ }
+ }
+ if (In) {
+ assert(!is_contained(Visited, In));
+ Visited.push_back(In);
+ }
+ }
+ }
+ CSEBlocks.clear();
+ GatherSeq.clear();
+}
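
Illustrative sketch (not part of this commit): optimizeGatherSequence above replaces an insert/extract with a previously seen identical instruction only when the earlier one dominates it, visiting blocks in dominance order. A toy model where "instructions" are strings and dominance is approximated by a precomputed depth:

#include <cstdio>
#include <string>
#include <vector>

struct ToyInst {
  std::string Text; // stands in for isIdenticalTo()
  int DomDepth;     // stands in for the position in the dominator tree
};

int main() {
  // Already sorted so that dominating blocks come first, as in the CSE loop.
  std::vector<ToyInst> Insts = {
      {"insertelement a", 0}, {"insertelement b", 1}, {"insertelement a", 2}};
  std::vector<const ToyInst *> Visited;
  for (const ToyInst &In : Insts) {
    const ToyInst *Repl = nullptr;
    for (const ToyInst *V : Visited)
      if (V->Text == In.Text && V->DomDepth <= In.DomDepth) {
        Repl = V; // an identical, dominating instruction already exists
        break;
      }
    if (Repl)
      std::printf("CSE: reuse '%s' from depth %d\n", Repl->Text.c_str(),
                  Repl->DomDepth);
    else
      Visited.push_back(&In);
  }
  return 0;
}
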
+
+// Groups the instructions into a bundle (which is then a single scheduling
+// entity) and schedules instructions until the bundle becomes ready.
+Optional<BoUpSLP::ScheduleData *>
+BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S) {
+ if (isa<PHINode>(S.OpValue))
+ return nullptr;
+
+ // Initialize the instruction bundle.
+ Instruction *OldScheduleEnd = ScheduleEnd;
+ ScheduleData *PrevInBundle = nullptr;
+ ScheduleData *Bundle = nullptr;
+ bool ReSchedule = false;
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
+
+ // Make sure that the scheduling region contains all
+ // instructions of the bundle.
+ for (Value *V : VL) {
+ if (!extendSchedulingRegion(V, S))
+ return None;
+ }
+
+ for (Value *V : VL) {
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member (maybe not in same basic block)");
+ if (BundleMember->IsScheduled) {
+      // A bundle member was scheduled as a single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
+ }
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+    // Group the instructions into a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
+ if (ScheduleEnd != OldScheduleEnd) {
+ // The scheduling region got new instructions at the lower end (or it is a
+ // new region for the first bundle). This makes it necessary to
+ // recalculate all dependencies.
+ // It is seldom that this needs to be done a second time after adding the
+ // initial bundle to the region.
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [](ScheduleData *SD) {
+ SD->clearDependencies();
+ });
+ }
+ ReSchedule = true;
+ }
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+ assert(Bundle && "Failed to find schedule bundle");
+
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
+
+ calculateDependencies(Bundle, true, SLP);
+
+ // Now try to schedule the new bundle. As soon as the bundle is "ready" it
+ // means that there are no cyclic dependencies and we can schedule it.
+  // Note that it's important that we don't "schedule" the bundle yet (see
+ // cancelScheduling).
+ while (!Bundle->isReady() && !ReadyInsts.empty()) {
+
ScheduleData *pickedSD = ReadyInsts.pop_back_val();
-
- if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
- schedule(pickedSD, ReadyInsts);
- }
- }
- if (!Bundle->isReady()) {
- cancelScheduling(VL, S.OpValue);
- return None;
- }
- return Bundle;
-}
-
-void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
- Value *OpValue) {
- if (isa<PHINode>(OpValue))
- return;
-
- ScheduleData *Bundle = getScheduleData(OpValue);
- LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
- assert(!Bundle->IsScheduled &&
- "Can't cancel bundle which is already scheduled");
- assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
- "tried to unbundle something which is not a bundle");
-
- // Un-bundle: make single instructions out of the bundle.
- ScheduleData *BundleMember = Bundle;
- while (BundleMember) {
- assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
- BundleMember->FirstInBundle = BundleMember;
- ScheduleData *Next = BundleMember->NextInBundle;
- BundleMember->NextInBundle = nullptr;
- BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
- if (BundleMember->UnscheduledDepsInBundle == 0) {
- ReadyInsts.insert(BundleMember);
- }
- BundleMember = Next;
- }
-}
-
-BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
- // Allocate a new ScheduleData for the instruction.
- if (ChunkPos >= ChunkSize) {
- ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
- ChunkPos = 0;
- }
- return &(ScheduleDataChunks.back()[ChunkPos++]);
-}
-
-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
- const InstructionsState &S) {
- if (getScheduleData(V, isOneOf(S, V)))
- return true;
- Instruction *I = dyn_cast<Instruction>(V);
- assert(I && "bundle member must be an instruction");
- assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
- auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
- ScheduleData *ISD = getScheduleData(I);
- if (!ISD)
- return false;
- assert(isInSchedulingRegion(ISD) &&
- "ScheduleData not in scheduling region");
- ScheduleData *SD = allocateScheduleDataChunks();
- SD->Inst = I;
- SD->init(SchedulingRegionID, S.OpValue);
- ExtraScheduleDataMap[I][S.OpValue] = SD;
- return true;
- };
- if (CheckSheduleForI(I))
- return true;
- if (!ScheduleStart) {
- // It's the first instruction in the new region.
- initScheduleData(I, I->getNextNode(), nullptr, nullptr);
- ScheduleStart = I;
- ScheduleEnd = I->getNextNode();
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- assert(ScheduleEnd && "tried to vectorize a terminator?");
- LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
- return true;
- }
- // Search up and down at the same time, because we don't know if the new
- // instruction is above or below the existing scheduling region.
- BasicBlock::reverse_iterator UpIter =
- ++ScheduleStart->getIterator().getReverse();
- BasicBlock::reverse_iterator UpperEnd = BB->rend();
- BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
- BasicBlock::iterator LowerEnd = BB->end();
- while (true) {
- if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
- LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
- return false;
- }
-
- if (UpIter != UpperEnd) {
- if (&*UpIter == I) {
- initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
- ScheduleStart = I;
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
- << "\n");
- return true;
- }
- ++UpIter;
- }
- if (DownIter != LowerEnd) {
- if (&*DownIter == I) {
- initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
- nullptr);
- ScheduleEnd = I->getNextNode();
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- assert(ScheduleEnd && "tried to vectorize a terminator?");
- LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
- << "\n");
- return true;
- }
- ++DownIter;
- }
- assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
- "instruction not found in block");
- }
- return true;
-}
-
-void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
- Instruction *ToI,
- ScheduleData *PrevLoadStore,
- ScheduleData *NextLoadStore) {
- ScheduleData *CurrentLoadStore = PrevLoadStore;
- for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
- ScheduleData *SD = ScheduleDataMap[I];
- if (!SD) {
- SD = allocateScheduleDataChunks();
- ScheduleDataMap[I] = SD;
- SD->Inst = I;
- }
- assert(!isInSchedulingRegion(SD) &&
- "new ScheduleData already in scheduling region");
- SD->init(SchedulingRegionID, I);
-
- if (I->mayReadOrWriteMemory() &&
- (!isa<IntrinsicInst>(I) ||
+
+ if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
+ schedule(pickedSD, ReadyInsts);
+ }
+ }
+ if (!Bundle->isReady()) {
+ cancelScheduling(VL, S.OpValue);
+ return None;
+ }
+ return Bundle;
+}
+
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
+ Value *OpValue) {
+ if (isa<PHINode>(OpValue))
+ return;
+
+ ScheduleData *Bundle = getScheduleData(OpValue);
+ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ assert(!Bundle->IsScheduled &&
+ "Can't cancel bundle which is already scheduled");
+ assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ "tried to unbundle something which is not a bundle");
+
+ // Un-bundle: make single instructions out of the bundle.
+ ScheduleData *BundleMember = Bundle;
+ while (BundleMember) {
+ assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
+ BundleMember->FirstInBundle = BundleMember;
+ ScheduleData *Next = BundleMember->NextInBundle;
+ BundleMember->NextInBundle = nullptr;
+ BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
+ if (BundleMember->UnscheduledDepsInBundle == 0) {
+ ReadyInsts.insert(BundleMember);
+ }
+ BundleMember = Next;
+ }
+}
+
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ return &(ScheduleDataChunks.back()[ChunkPos++]);
+}
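
Illustrative sketch (not part of this commit): allocateScheduleDataChunks above hands out ScheduleData slots from fixed-size arrays so that addresses stay stable while avoiding one heap allocation per instruction. A minimal chunked-arena model with a made-up Node type:

#include <cstddef>
#include <memory>
#include <vector>

struct Node { int Payload = 0; };

class ChunkArena {
  static constexpr std::size_t ChunkSize = 256;
  std::vector<std::unique_ptr<Node[]>> Chunks;
  std::size_t Pos = ChunkSize; // forces a new chunk on the first allocation

public:
  Node *allocate() {
    if (Pos >= ChunkSize) {
      Chunks.push_back(std::make_unique<Node[]>(ChunkSize));
      Pos = 0;
    }
    // Pointers into earlier chunks remain valid; only the vector of
    // unique_ptrs may reallocate, never the chunks themselves.
    return &Chunks.back()[Pos++];
  }
};

int main() {
  ChunkArena Arena;
  Node *A = Arena.allocate();
  Node *B = Arena.allocate();
  A->Payload = 1;
  B->Payload = 2;
  return A->Payload + B->Payload == 3 ? 0 : 1;
}
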
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+ const InstructionsState &S) {
+ if (getScheduleData(V, isOneOf(S, V)))
+ return true;
+ Instruction *I = dyn_cast<Instruction>(V);
+ assert(I && "bundle member must be an instruction");
+ assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
+ ScheduleData *ISD = getScheduleData(I);
+ if (!ISD)
+ return false;
+ assert(isInSchedulingRegion(ISD) &&
+ "ScheduleData not in scheduling region");
+ ScheduleData *SD = allocateScheduleDataChunks();
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, S.OpValue);
+ ExtraScheduleDataMap[I][S.OpValue] = SD;
+ return true;
+ };
+ if (CheckSheduleForI(I))
+ return true;
+ if (!ScheduleStart) {
+ // It's the first instruction in the new region.
+ initScheduleData(I, I->getNextNode(), nullptr, nullptr);
+ ScheduleStart = I;
+ ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ assert(ScheduleEnd && "tried to vectorize a terminator?");
+ LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ return true;
+ }
+ // Search up and down at the same time, because we don't know if the new
+ // instruction is above or below the existing scheduling region.
+ BasicBlock::reverse_iterator UpIter =
+ ++ScheduleStart->getIterator().getReverse();
+ BasicBlock::reverse_iterator UpperEnd = BB->rend();
+ BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
+ BasicBlock::iterator LowerEnd = BB->end();
+ while (true) {
+ if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+ LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ return false;
+ }
+
+ if (UpIter != UpperEnd) {
+ if (&*UpIter == I) {
+ initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+ ScheduleStart = I;
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
+ return true;
+ }
+ ++UpIter;
+ }
+ if (DownIter != LowerEnd) {
+ if (&*DownIter == I) {
+ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+ nullptr);
+ ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ assert(ScheduleEnd && "tried to vectorize a terminator?");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
+ << "\n");
+ return true;
+ }
+ ++DownIter;
+ }
+ assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
+ "instruction not found in block");
+ }
+ return true;
+}
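
Illustrative sketch (not part of this commit): extendSchedulingRegion above searches upward and downward from the current region at the same time, grows the region toward whichever side the new instruction is found on, and gives up once a size limit is hit. A toy model over integer indices standing in for instructions, with made-up sizes:

#include <cstdio>

int main() {
  // Hypothetical block of 100 "instructions"; the current region is [40, 60).
  int RegionStart = 40, RegionEnd = 60;
  const int Target = 72;            // new instruction that must be included
  const int RegionSizeLimit = 50;

  int Up = RegionStart - 1, Down = RegionEnd, Size = RegionEnd - RegionStart;
  bool Found = false;
  while (Up >= 0 || Down < 100) {
    if (++Size > RegionSizeLimit) {
      std::printf("region size limit exceeded\n");
      return 1;
    }
    if (Up >= 0) {                  // walk one step upward
      if (Up == Target) { RegionStart = Target; Found = true; break; }
      --Up;
    }
    if (Down < 100) {               // walk one step downward
      if (Down == Target) { RegionEnd = Target + 1; Found = true; break; }
      ++Down;
    }
  }
  if (Found)
    std::printf("region is now [%d, %d)\n", RegionStart, RegionEnd);
  return 0;
}
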
+
+void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
+ Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore) {
+ ScheduleData *CurrentLoadStore = PrevLoadStore;
+ for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
+ ScheduleData *SD = ScheduleDataMap[I];
+ if (!SD) {
+ SD = allocateScheduleDataChunks();
+ ScheduleDataMap[I] = SD;
+ SD->Inst = I;
+ }
+ assert(!isInSchedulingRegion(SD) &&
+ "new ScheduleData already in scheduling region");
+ SD->init(SchedulingRegionID, I);
+
+ if (I->mayReadOrWriteMemory() &&
+ (!isa<IntrinsicInst>(I) ||
(cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
cast<IntrinsicInst>(I)->getIntrinsicID() !=
Intrinsic::pseudoprobe))) {
- // Update the linked list of memory accessing instructions.
- if (CurrentLoadStore) {
- CurrentLoadStore->NextLoadStore = SD;
- } else {
- FirstLoadStoreInRegion = SD;
- }
- CurrentLoadStore = SD;
- }
- }
- if (NextLoadStore) {
- if (CurrentLoadStore)
- CurrentLoadStore->NextLoadStore = NextLoadStore;
- } else {
- LastLoadStoreInRegion = CurrentLoadStore;
- }
-}
-
-void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
- bool InsertInReadyList,
- BoUpSLP *SLP) {
- assert(SD->isSchedulingEntity());
-
- SmallVector<ScheduleData *, 10> WorkList;
- WorkList.push_back(SD);
-
- while (!WorkList.empty()) {
+ // Update the linked list of memory accessing instructions.
+ if (CurrentLoadStore) {
+ CurrentLoadStore->NextLoadStore = SD;
+ } else {
+ FirstLoadStoreInRegion = SD;
+ }
+ CurrentLoadStore = SD;
+ }
+ }
+ if (NextLoadStore) {
+ if (CurrentLoadStore)
+ CurrentLoadStore->NextLoadStore = NextLoadStore;
+ } else {
+ LastLoadStoreInRegion = CurrentLoadStore;
+ }
+}
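
Illustrative sketch (not part of this commit): initScheduleData above threads all memory-accessing instructions of the region into a NextLoadStore chain so that the dependence walk can skip everything else. A toy model of building that chain:

#include <cstdio>
#include <vector>

struct ToySD {
  const char *Name;
  bool TouchesMemory;
  ToySD *NextLoadStore = nullptr;
};

int main() {
  std::vector<ToySD> Region = {
      {"load a", true}, {"add", false}, {"store b", true}, {"mul", false}};
  ToySD *Current = nullptr, *First = nullptr;
  for (ToySD &SD : Region) {
    if (!SD.TouchesMemory)
      continue;
    if (Current)
      Current->NextLoadStore = &SD; // append to the chain
    else
      First = &SD;                  // remember the first memory access
    Current = &SD;
  }
  for (ToySD *SD = First; SD; SD = SD->NextLoadStore)
    std::printf("%s\n", SD->Name);  // prints: load a, store b
  return 0;
}
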
+
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+ bool InsertInReadyList,
+ BoUpSLP *SLP) {
+ assert(SD->isSchedulingEntity());
+
+ SmallVector<ScheduleData *, 10> WorkList;
+ WorkList.push_back(SD);
+
+ while (!WorkList.empty()) {
ScheduleData *SD = WorkList.pop_back_val();
-
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
- assert(isInSchedulingRegion(BundleMember));
- if (!BundleMember->hasValidDependencies()) {
-
- LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
- << "\n");
- BundleMember->Dependencies = 0;
- BundleMember->resetUnscheduledDeps();
-
- // Handle def-use chain dependencies.
- if (BundleMember->OpValue != BundleMember->Inst) {
- ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
- }
- } else {
- for (User *U : BundleMember->Inst->users()) {
- if (isa<Instruction>(U)) {
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
- }
- } else {
- // I'm not sure if this can ever happen. But we need to be safe.
-              // This keeps the instruction/bundle from ever being scheduled
-              // and eventually disables vectorization.
- BundleMember->Dependencies++;
- BundleMember->incrementUnscheduledDeps(1);
- }
- }
- }
-
- // Handle the memory dependencies.
- ScheduleData *DepDest = BundleMember->NextLoadStore;
- if (DepDest) {
- Instruction *SrcInst = BundleMember->Inst;
- MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
- bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
- unsigned numAliased = 0;
- unsigned DistToSrc = 1;
-
- while (DepDest) {
- assert(isInSchedulingRegion(DepDest));
-
- // We have two limits to reduce the complexity:
- // 1) AliasedCheckLimit: It's a small limit to reduce calls to
- // SLP->isAliased (which is the expensive part in this loop).
- // 2) MaxMemDepDistance: It's for very large blocks and it aborts
- // the whole loop (even if the loop is fast, it's quadratic).
- // It's important for the loop break condition (see below) to
- // check this limit even between two read-only instructions.
- if (DistToSrc >= MaxMemDepDistance ||
- ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
- (numAliased >= AliasedCheckLimit ||
- SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
-
- // We increment the counter only if the locations are aliased
- // (instead of counting all alias checks). This gives a better
- // balance between reduced runtime and accurate dependencies.
- numAliased++;
-
- DepDest->MemoryDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled) {
- BundleMember->incrementUnscheduledDeps(1);
- }
- if (!DestBundle->hasValidDependencies()) {
- WorkList.push_back(DestBundle);
- }
- }
- DepDest = DepDest->NextLoadStore;
-
-            // Example explaining the loop break condition: Let's assume our
- // starting instruction is i0 and MaxMemDepDistance = 3.
- //
- // +--------v--v--v
- // i0,i1,i2,i3,i4,i5,i6,i7,i8
- // +--------^--^--^
- //
-            // MaxMemDepDistance lets us stop alias-checking at i3 and we add
-            // dependencies from i0 to i3, i4, ... (even if they are not aliased).
- // Previously we already added dependencies from i3 to i6,i7,i8
- // (because of MaxMemDepDistance). As we added a dependency from
- // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
- // and we can abort this loop at i6.
- if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
- DistToSrc++;
- }
- }
- }
- BundleMember = BundleMember->NextInBundle;
- }
- if (InsertInReadyList && SD->isReady()) {
- ReadyInsts.push_back(SD);
- LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
- << "\n");
- }
- }
-}
-
-void BoUpSLP::BlockScheduling::resetSchedule() {
- assert(ScheduleStart &&
- "tried to reset schedule on block which has not been scheduled");
- for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- doForAllOpcodes(I, [&](ScheduleData *SD) {
- assert(isInSchedulingRegion(SD) &&
- "ScheduleData not in scheduling region");
- SD->IsScheduled = false;
- SD->resetUnscheduledDeps();
- });
- }
- ReadyInsts.clear();
-}
-
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
- if (!BS->ScheduleStart)
- return;
-
- LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
-
- BS->resetSchedule();
-
- // For the real scheduling we use a more sophisticated ready-list: it is
- // sorted by the original instruction location. This lets the final schedule
- // be as close as possible to the original instruction order.
- struct ScheduleDataCompare {
- bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
- return SD2->SchedulingPriority < SD1->SchedulingPriority;
- }
- };
- std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
-
- // Ensure that all dependency data is updated and fill the ready-list with
- // initial instructions.
- int Idx = 0;
- int NumToSchedule = 0;
- for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
- I = I->getNextNode()) {
- BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
- assert(SD->isPartOfBundle() ==
- (getTreeEntry(SD->Inst) != nullptr) &&
- "scheduler and vectorizer bundle mismatch");
- SD->FirstInBundle->SchedulingPriority = Idx++;
- if (SD->isSchedulingEntity()) {
- BS->calculateDependencies(SD, false, this);
- NumToSchedule++;
- }
- });
- }
- BS->initialFillReadyList(ReadyInsts);
-
- Instruction *LastScheduledInst = BS->ScheduleEnd;
-
- // Do the "real" scheduling.
- while (!ReadyInsts.empty()) {
- ScheduleData *picked = *ReadyInsts.begin();
- ReadyInsts.erase(ReadyInsts.begin());
-
- // Move the scheduled instruction(s) to their dedicated places, if not
- // there yet.
- ScheduleData *BundleMember = picked;
- while (BundleMember) {
- Instruction *pickedInst = BundleMember->Inst;
- if (LastScheduledInst->getNextNode() != pickedInst) {
- BS->BB->getInstList().remove(pickedInst);
- BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
- pickedInst);
- }
- LastScheduledInst = pickedInst;
- BundleMember = BundleMember->NextInBundle;
- }
-
- BS->schedule(picked, ReadyInsts);
- NumToSchedule--;
- }
- assert(NumToSchedule == 0 && "could not schedule all instructions");
-
- // Avoid duplicate scheduling of the block.
- BS->ScheduleStart = nullptr;
-}
-
-unsigned BoUpSLP::getVectorElementSize(Value *V) {
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ assert(isInSchedulingRegion(BundleMember));
+ if (!BundleMember->hasValidDependencies()) {
+
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ if (BundleMember->OpValue != BundleMember->Inst) {
+ ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ // I'm not sure if this can ever happen. But we need to be safe.
+              // This keeps the instruction/bundle from ever being scheduled
+              // and eventually disables vectorization.
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ }
+ }
+
+ // Handle the memory dependencies.
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (DepDest) {
+ Instruction *SrcInst = BundleMember->Inst;
+ MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
+
+ while (DepDest) {
+ assert(isInSchedulingRegion(DepDest));
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ DepDest = DepDest->NextLoadStore;
+
+            // Example explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+            // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+            // dependencies from i0 to i3, i4, ... (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
+ }
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ if (InsertInReadyList && SD->isReady()) {
+ ReadyInsts.push_back(SD);
+ LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+ << "\n");
+ }
+ }
+}
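
Illustrative sketch (not part of this commit): the dependence walk above caps its cost with AliasedCheckLimit (stop querying alias analysis after a few aliasing hits and conservatively record dependencies), MaxMemDepDistance (beyond this distance, record dependencies without checking), and 2 * MaxMemDepDistance (abort the walk; transitivity covers the rest). A toy model with a fabricated alias oracle and made-up limits, where every access is treated as potentially writing:

#include <cstdio>

// Fabricated alias oracle: here, every third access aliases the source.
static bool mayAlias(unsigned SrcIdx, unsigned DstIdx) {
  return (DstIdx - SrcIdx) % 3 == 0;
}

int main() {
  const unsigned AliasedCheckLimit = 2;
  const unsigned MaxMemDepDistance = 8;
  unsigned NumAliased = 0, DistToSrc = 1, DepsAdded = 0;

  for (unsigned Dst = 1; Dst < 64; ++Dst) {
    bool AddDep = DistToSrc >= MaxMemDepDistance ||
                  NumAliased >= AliasedCheckLimit || mayAlias(0, Dst);
    if (AddDep) {
      ++NumAliased; // counted only when a dependency is actually recorded
      ++DepsAdded;
    }
    if (DistToSrc >= 2 * MaxMemDepDistance)
      break;        // transitive deps already cover everything farther away
    ++DistToSrc;
  }
  std::printf("recorded %u dependencies\n", DepsAdded); // prints: recorded 12 dependencies
  return 0;
}
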
+
+void BoUpSLP::BlockScheduling::resetSchedule() {
+ assert(ScheduleStart &&
+ "tried to reset schedule on block which has not been scheduled");
+ for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ assert(isInSchedulingRegion(SD) &&
+ "ScheduleData not in scheduling region");
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ });
+ }
+ ReadyInsts.clear();
+}
+
+void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+ if (!BS->ScheduleStart)
+ return;
+
+ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+
+ BS->resetSchedule();
+
+ // For the real scheduling we use a more sophisticated ready-list: it is
+ // sorted by the original instruction location. This lets the final schedule
+ // be as close as possible to the original instruction order.
+ struct ScheduleDataCompare {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
+ return SD2->SchedulingPriority < SD1->SchedulingPriority;
+ }
+ };
+ std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+
+ // Ensure that all dependency data is updated and fill the ready-list with
+ // initial instructions.
+ int Idx = 0;
+ int NumToSchedule = 0;
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+ I = I->getNextNode()) {
+ BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+ assert(SD->isPartOfBundle() ==
+ (getTreeEntry(SD->Inst) != nullptr) &&
+ "scheduler and vectorizer bundle mismatch");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ });
+ }
+ BS->initialFillReadyList(ReadyInsts);
+
+ Instruction *LastScheduledInst = BS->ScheduleEnd;
+
+ // Do the "real" scheduling.
+ while (!ReadyInsts.empty()) {
+ ScheduleData *picked = *ReadyInsts.begin();
+ ReadyInsts.erase(ReadyInsts.begin());
+
+ // Move the scheduled instruction(s) to their dedicated places, if not
+ // there yet.
+ ScheduleData *BundleMember = picked;
+ while (BundleMember) {
+ Instruction *pickedInst = BundleMember->Inst;
+ if (LastScheduledInst->getNextNode() != pickedInst) {
+ BS->BB->getInstList().remove(pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+ pickedInst);
+ }
+ LastScheduledInst = pickedInst;
+ BundleMember = BundleMember->NextInBundle;
+ }
+
+ BS->schedule(picked, ReadyInsts);
+ NumToSchedule--;
+ }
+ assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Avoid duplicate scheduling of the block.
+ BS->ScheduleStart = nullptr;
+}
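
Illustrative sketch (not part of this commit): scheduleBlock above keeps the ready list in a std::set whose comparator orders bundles by SchedulingPriority, so *begin() is always the bundle the scheduler picks next. A toy model with the same comparator shape and made-up priorities:

#include <cstdio>
#include <set>

struct ToyBundle {
  int SchedulingPriority; // source-order index of the bundle head
  const char *Name;
};

// Same shape as ScheduleDataCompare above: *begin() is the bundle with the
// highest SchedulingPriority value.
struct CompareByPriority {
  bool operator()(const ToyBundle *A, const ToyBundle *B) const {
    return B->SchedulingPriority < A->SchedulingPriority;
  }
};

int main() {
  ToyBundle B0{0, "load bundle"}, B1{1, "add bundle"}, B2{2, "store bundle"};
  std::set<ToyBundle *, CompareByPriority> Ready = {&B0, &B1, &B2};
  while (!Ready.empty()) {
    const ToyBundle *Picked = *Ready.begin();
    Ready.erase(Ready.begin());
    std::printf("pick %s\n", Picked->Name); // store, then add, then load
  }
  return 0;
}
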
+
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value (or value
// truncated just before storing) without traversing the expression tree.
// This is the common case.
@@ -5543,891 +5543,891 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
else
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
}
-
- auto E = InstrElementSize.find(V);
- if (E != InstrElementSize.end())
- return E->second;
-
- // If V is not a store, we can traverse the expression tree to find loads
- // that feed it. The type of the loaded value may indicate a more suitable
- // width than V's type. We want to base the vector element size on the width
- // of memory operations where possible.
- SmallVector<Instruction *, 16> Worklist;
- SmallPtrSet<Instruction *, 16> Visited;
- if (auto *I = dyn_cast<Instruction>(V)) {
- Worklist.push_back(I);
- Visited.insert(I);
- }
-
- // Traverse the expression tree in bottom-up order looking for loads. If we
- // encounter an instruction we don't yet handle, we give up.
- auto MaxWidth = 0u;
- auto FoundUnknownInst = false;
- while (!Worklist.empty() && !FoundUnknownInst) {
- auto *I = Worklist.pop_back_val();
-
- // We should only be looking at scalar instructions here. If the current
- // instruction has a vector type, give up.
- auto *Ty = I->getType();
- if (isa<VectorType>(Ty))
- FoundUnknownInst = true;
-
- // If the current instruction is a load, update MaxWidth to reflect the
- // width of the loaded value.
- else if (isa<LoadInst>(I))
- MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
-
- // Otherwise, we need to visit the operands of the instruction. We only
- // handle the interesting cases from buildTree here. If an operand is an
- // instruction we haven't yet visited, we add it to the worklist.
- else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
- isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
- for (Use &U : I->operands())
- if (auto *J = dyn_cast<Instruction>(U.get()))
- if (Visited.insert(J).second)
- Worklist.push_back(J);
- }
-
- // If we don't yet handle the instruction, give up.
- else
- FoundUnknownInst = true;
- }
-
- int Width = MaxWidth;
- // If we didn't encounter a memory access in the expression tree, or if we
- // gave up for some reason, just return the width of V. Otherwise, return the
- // maximum width we found.
- if (!MaxWidth || FoundUnknownInst)
- Width = DL->getTypeSizeInBits(V->getType());
-
- for (Instruction *I : Visited)
- InstrElementSize[I] = Width;
-
- return Width;
-}
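
Illustrative sketch (not part of this commit): getVectorElementSize above walks the expression bottom-up through a visited set, records the widest load type it reaches, and falls back to the width of V's own type if it meets an instruction it does not handle. A toy dataflow model with made-up node kinds and widths:

#include <algorithm>
#include <cstdio>
#include <unordered_set>
#include <vector>

struct ToyNode {
  enum Kind { Load, Arith, Unknown } K;
  unsigned BitWidth;                  // meaningful for Load nodes only
  std::vector<ToyNode *> Operands;
};

int main() {
  ToyNode L8{ToyNode::Load, 8, {}}, L16{ToyNode::Load, 16, {}};
  ToyNode Add{ToyNode::Arith, 0, {&L8, &L16}};
  ToyNode Root{ToyNode::Arith, 0, {&Add}};
  const unsigned FallbackWidth = 32;  // width of Root's own type

  std::vector<ToyNode *> Worklist{&Root};
  std::unordered_set<ToyNode *> Visited{&Root};
  unsigned MaxWidth = 0;
  bool FoundUnknown = false;
  while (!Worklist.empty() && !FoundUnknown) {
    ToyNode *N = Worklist.back();
    Worklist.pop_back();
    if (N->K == ToyNode::Load)
      MaxWidth = std::max(MaxWidth, N->BitWidth);
    else if (N->K == ToyNode::Arith)
      for (ToyNode *Op : N->Operands)
        if (Visited.insert(Op).second)
          Worklist.push_back(Op);
    else
      FoundUnknown = true;
  }
  unsigned Width = (!MaxWidth || FoundUnknown) ? FallbackWidth : MaxWidth;
  std::printf("element size: %u bits\n", Width); // prints: element size: 16 bits
  return 0;
}
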
-
-// Determine if a value V in a vectorizable expression Expr can be demoted to a
-// smaller type with a truncation. We collect the values that will be demoted
-// in ToDemote and additional roots that require further investigation in Roots.
-static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
- SmallVectorImpl<Value *> &ToDemote,
- SmallVectorImpl<Value *> &Roots) {
- // We can always demote constants.
- if (isa<Constant>(V)) {
- ToDemote.push_back(V);
- return true;
- }
-
- // If the value is not an instruction in the expression with only one use, it
- // cannot be demoted.
- auto *I = dyn_cast<Instruction>(V);
- if (!I || !I->hasOneUse() || !Expr.count(I))
- return false;
-
- switch (I->getOpcode()) {
-
- // We can always demote truncations and extensions. Since truncations can
- // seed additional demotion, we save the truncated value.
- case Instruction::Trunc:
- Roots.push_back(I->getOperand(0));
- break;
- case Instruction::ZExt:
- case Instruction::SExt:
- break;
-
- // We can demote certain binary operations if we can demote both of their
- // operands.
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
- return false;
- break;
-
- // We can demote selects if we can demote their true and false values.
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
- return false;
- break;
- }
-
- // We can demote phis if we can demote all their incoming operands. Note that
- // we don't need to worry about cycles since we ensure single use above.
- case Instruction::PHI: {
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
- return false;
- break;
- }
-
- // Otherwise, conservatively give up.
- default:
- return false;
- }
-
- // Record the value that we can demote.
- ToDemote.push_back(V);
- return true;
-}
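
Illustrative sketch (not part of this commit): collectValuesToDemote above accepts constants, requires everything else to be a single-use instruction inside the expression, and recurses into the operands of the handful of opcodes it understands. A toy recursion that keeps only the single-use and opcode checks; the in-expression check and the Roots bookkeeping are omitted for brevity:

#include <cstdio>
#include <vector>

struct ToyVal {
  enum Kind { Constant, Trunc, Add, Other } K;
  int NumUses;
  std::vector<ToyVal *> Operands;
};

static bool canDemote(ToyVal *V, std::vector<ToyVal *> &ToDemote) {
  if (V->K == ToyVal::Constant) { // constants can always be demoted
    ToDemote.push_back(V);
    return true;
  }
  if (V->NumUses != 1)            // multi-use values cannot be rewritten
    return false;
  switch (V->K) {
  case ToyVal::Trunc:             // truncation seeds further demotion
    break;
  case ToyVal::Add:               // binops need both operands demotable
    for (ToyVal *Op : V->Operands)
      if (!canDemote(Op, ToDemote))
        return false;
    break;
  default:
    return false;                 // conservatively give up
  }
  ToDemote.push_back(V);
  return true;
}

int main() {
  ToyVal C{ToyVal::Constant, 1, {}}, T{ToyVal::Trunc, 1, {}};
  ToyVal A{ToyVal::Add, 1, {&C, &T}};
  std::vector<ToyVal *> ToDemote;
  std::printf("%s\n", canDemote(&A, ToDemote) ? "demotable" : "not demotable");
  return 0;
}
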
-
-void BoUpSLP::computeMinimumValueSizes() {
- // If there are no external uses, the expression tree must be rooted by a
- // store. We can't demote in-memory values, so there is nothing to do here.
- if (ExternalUses.empty())
- return;
-
- // We only attempt to truncate integer expressions.
- auto &TreeRoot = VectorizableTree[0]->Scalars;
- auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
- if (!TreeRootIT)
- return;
-
- // If the expression is not rooted by a store, these roots should have
- // external uses. We will rely on InstCombine to rewrite the expression in
- // the narrower type. However, InstCombine only rewrites single-use values.
- // This means that if a tree entry other than a root is used externally, it
- // must have multiple uses and InstCombine will not rewrite it. The code
- // below ensures that only the roots are used externally.
- SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
- for (auto &EU : ExternalUses)
- if (!Expr.erase(EU.Scalar))
- return;
- if (!Expr.empty())
- return;
-
- // Collect the scalar values of the vectorizable expression. We will use this
- // context to determine which values can be demoted. If we see a truncation,
- // we mark it as seeding another demotion.
- for (auto &EntryPtr : VectorizableTree)
- Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
-
- // Ensure the roots of the vectorizable tree don't form a cycle. They must
- // have a single external user that is not in the vectorizable tree.
- for (auto *Root : TreeRoot)
- if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
- return;
-
- // Conservatively determine if we can actually truncate the roots of the
- // expression. Collect the values that can be demoted in ToDemote and
- // additional roots that require investigating in Roots.
- SmallVector<Value *, 32> ToDemote;
- SmallVector<Value *, 4> Roots;
- for (auto *Root : TreeRoot)
- if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
- return;
-
- // The maximum bit width required to represent all the values that can be
- // demoted without loss of precision. It would be safe to truncate the roots
- // of the expression to this width.
- auto MaxBitWidth = 8u;
-
- // We first check if all the bits of the roots are demanded. If they're not,
- // we can truncate the roots to this narrower type.
- for (auto *Root : TreeRoot) {
- auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
- MaxBitWidth = std::max<unsigned>(
- Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
- }
-
- // True if the roots can be zero-extended back to their original type, rather
- // than sign-extended. We know that if the leading bits are not demanded, we
- // can safely zero-extend. So we initialize IsKnownPositive to True.
- bool IsKnownPositive = true;
-
- // If all the bits of the roots are demanded, we can try a little harder to
- // compute a narrower type. This can happen, for example, if the roots are
- // getelementptr indices. InstCombine promotes these indices to the pointer
- // width. Thus, all their bits are technically demanded even though the
- // address computation might be vectorized in a smaller type.
- //
- // We start by looking at each entry that can be demoted. We compute the
- // maximum bit width required to store the scalar by using ValueTracking to
- // compute the number of high-order bits we can truncate.
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
- llvm::all_of(TreeRoot, [](Value *R) {
- assert(R->hasOneUse() && "Root should have only one use!");
- return isa<GetElementPtrInst>(R->user_back());
- })) {
- MaxBitWidth = 8u;
-
- // Determine if the sign bit of all the roots is known to be zero. If not,
- // IsKnownPositive is set to False.
- IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
- KnownBits Known = computeKnownBits(R, *DL);
- return Known.isNonNegative();
- });
-
- // Determine the maximum number of bits required to store the scalar
- // values.
- for (auto *Scalar : ToDemote) {
- auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
- auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
- MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
- }
-
- // If we can't prove that the sign bit is zero, we must add one to the
- // maximum bit width to account for the unknown sign bit. This preserves
- // the existing sign bit so we can safely sign-extend the root back to the
- // original type. Otherwise, if we know the sign bit is zero, we will
- // zero-extend the root instead.
- //
- // FIXME: This is somewhat suboptimal, as there will be cases where adding
- // one to the maximum bit width will yield a larger-than-necessary
- // type. In general, we need to add an extra bit only if we can't
- // prove that the upper bit of the original type is equal to the
- // upper bit of the proposed smaller type. If these two bits are the
- // same (either zero or one) we know that sign-extending from the
- // smaller type will result in the same value. Here, since we can't
- // yet prove this, we are just making the proposed smaller type
- // larger to ensure correctness.
- if (!IsKnownPositive)
- ++MaxBitWidth;
- }
-
- // Round MaxBitWidth up to the next power-of-two.
- if (!isPowerOf2_64(MaxBitWidth))
- MaxBitWidth = NextPowerOf2(MaxBitWidth);
-
- // If the maximum bit width we compute is less than the width of the roots'
- // type, we can proceed with the narrowing. Otherwise, do nothing.
- if (MaxBitWidth >= TreeRootIT->getBitWidth())
- return;
-
- // If we can truncate the root, we must collect additional values that might
- // be demoted as a result. That is, those seeded by truncations we will
- // modify.
- while (!Roots.empty())
- collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
-
- // Finally, map the values we can demote to the maximum bit width we computed.
- for (auto *Scalar : ToDemote)
- MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
-}
-
-namespace {
-
-/// The SLPVectorizer Pass.
-struct SLPVectorizer : public FunctionPass {
- SLPVectorizerPass Impl;
-
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit SLPVectorizer() : FunctionPass(ID) {
- initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override {
- return false;
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
- auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
- auto *AA = &AM.getResult<AAManager>(F);
- auto *LI = &AM.getResult<LoopAnalysis>(F);
- auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- auto *AC = &AM.getResult<AssumptionAnalysis>(F);
- auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
- auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
- TargetTransformInfo *TTI_,
+
+ auto E = InstrElementSize.find(V);
+ if (E != InstrElementSize.end())
+ return E->second;
+
+ // If V is not a store, we can traverse the expression tree to find loads
+ // that feed it. The type of the loaded value may indicate a more suitable
+ // width than V's type. We want to base the vector element size on the width
+ // of memory operations where possible.
+ SmallVector<Instruction *, 16> Worklist;
+ SmallPtrSet<Instruction *, 16> Visited;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Worklist.push_back(I);
+ Visited.insert(I);
+ }
+
+ // Traverse the expression tree in bottom-up order looking for loads. If we
+ // encounter an instruction we don't yet handle, we give up.
+ auto MaxWidth = 0u;
+ auto FoundUnknownInst = false;
+ while (!Worklist.empty() && !FoundUnknownInst) {
+ auto *I = Worklist.pop_back_val();
+
+ // We should only be looking at scalar instructions here. If the current
+ // instruction has a vector type, give up.
+ auto *Ty = I->getType();
+ if (isa<VectorType>(Ty))
+ FoundUnknownInst = true;
+
+ // If the current instruction is a load, update MaxWidth to reflect the
+ // width of the loaded value.
+ else if (isa<LoadInst>(I))
+ MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+
+ // Otherwise, we need to visit the operands of the instruction. We only
+ // handle the interesting cases from buildTree here. If an operand is an
+ // instruction we haven't yet visited, we add it to the worklist.
+ else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (Visited.insert(J).second)
+ Worklist.push_back(J);
+ }
+
+ // If we don't yet handle the instruction, give up.
+ else
+ FoundUnknownInst = true;
+ }
+
+ int Width = MaxWidth;
+ // If we didn't encounter a memory access in the expression tree, or if we
+ // gave up for some reason, just return the width of V. Otherwise, return the
+ // maximum width we found.
+ if (!MaxWidth || FoundUnknownInst)
+ Width = DL->getTypeSizeInBits(V->getType());
+
+ for (Instruction *I : Visited)
+ InstrElementSize[I] = Width;
+
+ return Width;
+}
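
The routine above sizes vector elements by the widest load feeding the expression rather than by V's own type, falling back to V's type when the walk meets something it does not understand. A minimal standalone sketch of the same worklist walk over a hypothetical Node type (illustrative only; not LLVM's IR classes):

    #include <algorithm>
    #include <unordered_set>
    #include <vector>

    struct Node {
      enum Kind { Load, Arithmetic, Other } K;
      unsigned Bits;                 // width in bits of the value this node produces
      std::vector<Node *> Operands;  // nodes feeding this one
    };

    // Bottom-up walk: remember the widest load seen; if an unknown node is hit,
    // fall back to the root's own width, mirroring the "give up" path above.
    unsigned elementSizeInBits(Node *Root) {
      std::vector<Node *> Worklist{Root};
      std::unordered_set<Node *> Visited{Root};
      unsigned MaxWidth = 0;
      bool FoundUnknown = false;
      while (!Worklist.empty() && !FoundUnknown) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        if (N->K == Node::Load) {
          MaxWidth = std::max(MaxWidth, N->Bits);
        } else if (N->K == Node::Arithmetic) {
          for (Node *Op : N->Operands)
            if (Visited.insert(Op).second)
              Worklist.push_back(Op);
        } else {
          FoundUnknown = true;
        }
      }
      return (MaxWidth == 0 || FoundUnknown) ? Root->Bits : MaxWidth;
    }

In the real pass the fallback width and the cached results come from the DataLayout and the InstrElementSize map shown above.
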
+
+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+ SmallVectorImpl<Value *> &ToDemote,
+ SmallVectorImpl<Value *> &Roots) {
+ // We can always demote constants.
+ if (isa<Constant>(V)) {
+ ToDemote.push_back(V);
+ return true;
+ }
+
+ // If the value is not an instruction in the expression with only one use, it
+ // cannot be demoted.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || !I->hasOneUse() || !Expr.count(I))
+ return false;
+
+ switch (I->getOpcode()) {
+
+ // We can always demote truncations and extensions. Since truncations can
+ // seed additional demotion, we save the truncated value.
+ case Instruction::Trunc:
+ Roots.push_back(I->getOperand(0));
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ break;
+
+ // We can demote certain binary operations if we can demote both of their
+ // operands.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+ return false;
+ break;
+
+ // We can demote selects if we can demote their true and false values.
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // We can demote phis if we can demote all their incoming operands. Note that
+ // we don't need to worry about cycles since we ensure single use above.
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // Otherwise, conservatively give up.
+ default:
+ return false;
+ }
+
+ // Record the value that we can demote.
+ ToDemote.push_back(V);
+ return true;
+}
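
As a toy illustration of the recursion (hypothetical node type, not LLVM's classes): for a single-use add whose operands are two zexts, all three values end up in ToDemote, while a trunc instead seeds Roots with its operand for a later pass.

    #include <vector>

    struct Expr {
      enum Kind { Constant, ZExt, SExt, Trunc, Add, Other } K;
      bool SingleUse = true;
      std::vector<Expr *> Operands;  // Trunc/ZExt/SExt use Operands[0]; Add uses both
    };

    // Same accept/reject shape as collectValuesToDemote above, over the toy type.
    bool collectToy(Expr *E, std::vector<Expr *> &ToDemote,
                    std::vector<Expr *> &Roots) {
      if (E->K == Expr::Constant) {
        ToDemote.push_back(E);
        return true;
      }
      if (!E->SingleUse)
        return false;
      switch (E->K) {
      case Expr::Trunc:
        Roots.push_back(E->Operands[0]);  // truncations seed further demotion
        break;
      case Expr::ZExt:
      case Expr::SExt:
        break;
      case Expr::Add:
        for (Expr *Op : E->Operands)
          if (!collectToy(Op, ToDemote, Roots))
            return false;
        break;
      default:
        return false;  // unknown operation: give up conservatively
      }
      ToDemote.push_back(E);
      return true;
    }
    // For add(zext(x), zext(y)) this yields ToDemote = {zext(x), zext(y), add}.
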
+
+void BoUpSLP::computeMinimumValueSizes() {
+ // If there are no external uses, the expression tree must be rooted by a
+ // store. We can't demote in-memory values, so there is nothing to do here.
+ if (ExternalUses.empty())
+ return;
+
+ // We only attempt to truncate integer expressions.
+ auto &TreeRoot = VectorizableTree[0]->Scalars;
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT)
+ return;
+
+ // If the expression is not rooted by a store, these roots should have
+ // external uses. We will rely on InstCombine to rewrite the expression in
+ // the narrower type. However, InstCombine only rewrites single-use values.
+ // This means that if a tree entry other than a root is used externally, it
+ // must have multiple uses and InstCombine will not rewrite it. The code
+ // below ensures that only the roots are used externally.
+ SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
+ for (auto &EU : ExternalUses)
+ if (!Expr.erase(EU.Scalar))
+ return;
+ if (!Expr.empty())
+ return;
+
+ // Collect the scalar values of the vectorizable expression. We will use this
+ // context to determine which values can be demoted. If we see a truncation,
+ // we mark it as seeding another demotion.
+ for (auto &EntryPtr : VectorizableTree)
+ Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
+
+ // Ensure the roots of the vectorizable tree don't form a cycle. They must
+ // have a single external user that is not in the vectorizable tree.
+ for (auto *Root : TreeRoot)
+ if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
+ return;
+
+ // Conservatively determine if we can actually truncate the roots of the
+ // expression. Collect the values that can be demoted in ToDemote and
+ // additional roots that require investigating in Roots.
+ SmallVector<Value *, 32> ToDemote;
+ SmallVector<Value *, 4> Roots;
+ for (auto *Root : TreeRoot)
+ if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ return;
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 8u;
+
+ // We first check if all the bits of the roots are demanded. If they're not,
+ // we can truncate the roots to this narrower type.
+ for (auto *Root : TreeRoot) {
+ auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ MaxBitWidth = std::max<unsigned>(
+ Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ }
+
+ // True if the roots can be zero-extended back to their original type, rather
+ // than sign-extended. We know that if the leading bits are not demanded, we
+ // can safely zero-extend. So we initialize IsKnownPositive to True.
+ bool IsKnownPositive = true;
+
+ // If all the bits of the roots are demanded, we can try a little harder to
+ // compute a narrower type. This can happen, for example, if the roots are
+ // getelementptr indices. InstCombine promotes these indices to the pointer
+ // width. Thus, all their bits are technically demanded even though the
+ // address computation might be vectorized in a smaller type.
+ //
+ // We start by looking at each entry that can be demoted. We compute the
+ // maximum bit width required to store the scalar by using ValueTracking to
+ // compute the number of high-order bits we can truncate.
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ llvm::all_of(TreeRoot, [](Value *R) {
+ assert(R->hasOneUse() && "Root should have only one use!");
+ return isa<GetElementPtrInst>(R->user_back());
+ })) {
+ MaxBitWidth = 8u;
+
+ // Determine if the sign bit of all the roots is known to be zero. If not,
+ // IsKnownPositive is set to False.
+ IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
+ });
+
+ // Determine the maximum number of bits required to store the scalar
+ // values.
+ for (auto *Scalar : ToDemote) {
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
+ auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
+ MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
+ }
+
+ // If we can't prove that the sign bit is zero, we must add one to the
+ // maximum bit width to account for the unknown sign bit. This preserves
+ // the existing sign bit so we can safely sign-extend the root back to the
+ // original type. Otherwise, if we know the sign bit is zero, we will
+ // zero-extend the root instead.
+ //
+ // FIXME: This is somewhat suboptimal, as there will be cases where adding
+ // one to the maximum bit width will yield a larger-than-necessary
+ // type. In general, we need to add an extra bit only if we can't
+ // prove that the upper bit of the original type is equal to the
+ // upper bit of the proposed smaller type. If these two bits are the
+ // same (either zero or one) we know that sign-extending from the
+ // smaller type will result in the same value. Here, since we can't
+ // yet prove this, we are just making the proposed smaller type
+ // larger to ensure correctness.
+ if (!IsKnownPositive)
+ ++MaxBitWidth;
+ }
+
+ // Round MaxBitWidth up to the next power-of-two.
+ if (!isPowerOf2_64(MaxBitWidth))
+ MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+ // If the maximum bit width we compute is less than the width of the roots'
+ // type, we can proceed with the narrowing. Otherwise, do nothing.
+ if (MaxBitWidth >= TreeRootIT->getBitWidth())
+ return;
+
+ // If we can truncate the root, we must collect additional values that might
+ // be demoted as a result. That is, those seeded by truncations we will
+ // modify.
+ while (!Roots.empty())
+ collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+ // Finally, map the values we can demote to the maximum bit width we computed.
+ for (auto *Scalar : ToDemote)
+ MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
+}
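
A numeric walk-through of the width computation above, using hypothetical values (the real inputs come from DemandedBits and ValueTracking): the roots are i32 GEP indices, ValueTracking reports 22 known sign bits, and the sign bit cannot be proven zero, so the tree is demoted to i16 and sign-extended back.

    #include <algorithm>
    #include <cassert>

    // Plays the role of the llvm::NextPowerOf2 rounding in the code above.
    static unsigned roundUpToPowerOfTwo(unsigned N) {
      unsigned P = 1;
      while (P < N)
        P <<= 1;
      return P;
    }

    int main() {
      const unsigned RootBits = 32;  // the roots are i32
      unsigned MaxBitWidth = 8;      // floor used by the pass

      // Hypothetical ValueTracking result: 22 known sign bits, so only
      // 32 - 22 = 10 bits carry information.
      const unsigned NumSignBits = 22;
      MaxBitWidth = std::max(MaxBitWidth, RootBits - NumSignBits);  // 10

      // The sign bit is not provably zero, so keep one extra bit; sign-extending
      // the narrowed value then reproduces the original.
      const bool IsKnownPositive = false;
      if (!IsKnownPositive)
        ++MaxBitWidth;  // 11

      MaxBitWidth = roundUpToPowerOfTwo(MaxBitWidth);  // 16
      assert(MaxBitWidth < RootBits);  // 16 < 32: demotion to i16 is worthwhile
      return 0;
    }
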
+
+namespace {
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+ SLPVectorizerPass Impl;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+ auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
+ TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AAResults *AA_,
- LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_,
- OptimizationRemarkEmitter *ORE_) {
- if (!RunSLPVectorization)
- return false;
- SE = SE_;
- TTI = TTI_;
- TLI = TLI_;
- AA = AA_;
- LI = LI_;
- DT = DT_;
- AC = AC_;
- DB = DB_;
- DL = &F.getParent()->getDataLayout();
-
- Stores.clear();
- GEPs.clear();
- bool Changed = false;
-
- // If the target claims to have no vector registers don't attempt
- // vectorization.
- if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
- return false;
-
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat))
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
-
- // Use the bottom-up SLP vectorizer to construct chains that start with
- // store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
-
- // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
- // delete instructions.
-
- // Scan the blocks in the function in post order.
- for (auto BB : post_order(&F.getEntryBlock())) {
- collectSeedInstructions(BB);
-
- // Vectorize trees that end at stores.
- if (!Stores.empty()) {
- LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
- << " underlying objects.\n");
- Changed |= vectorizeStoreChains(R);
- }
-
- // Vectorize trees that end at reductions.
- Changed |= vectorizeChainsInBlock(BB, R);
-
- // Vectorize the index computations of getelementptr instructions. This
- // is primarily intended to catch gather-like idioms ending at
- // non-consecutive loads.
- if (!GEPs.empty()) {
- LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
- << " underlying objects.\n");
- Changed |= vectorizeGEPIndices(BB, R);
- }
- }
-
- if (Changed) {
- R.optimizeGatherSequence();
- LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- }
- return Changed;
-}
-
-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
- unsigned Idx) {
- LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
- << "\n");
- const unsigned Sz = R.getVectorElementSize(Chain[0]);
- const unsigned MinVF = R.getMinVecRegSize() / Sz;
- unsigned VF = Chain.size();
-
- if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
- << "\n");
-
- R.buildTree(Chain);
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- // TODO: Handle orders of size less than number of elements in the vector.
- if (Order && Order->size() == Chain.size()) {
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
- llvm::transform(*Order, ReorderedOps.begin(),
- [Chain](const unsigned Idx) { return Chain[Idx]; });
- R.buildTree(ReorderedOps);
- }
- if (R.isTreeTinyAndNotFullyVectorizable())
- return false;
- if (R.isLoadCombineCandidate())
- return false;
-
- R.computeMinimumValueSizes();
-
+ LoopInfo *LI_, DominatorTree *DT_,
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
+ if (!RunSLPVectorization)
+ return false;
+ SE = SE_;
+ TTI = TTI_;
+ TLI = TLI_;
+ AA = AA_;
+ LI = LI_;
+ DT = DT_;
+ AC = AC_;
+ DB = DB_;
+ DL = &F.getParent()->getDataLayout();
+
+ Stores.clear();
+ GEPs.clear();
+ bool Changed = false;
+
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
+ return false;
+
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+
+ // Use the bottom-up SLP vectorizer to construct chains that start with
+ // store instructions.
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
+
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
+
+ // Scan the blocks in the function in post order.
+ for (auto BB : post_order(&F.getEntryBlock())) {
+ collectSeedInstructions(BB);
+
+ // Vectorize trees that end at stores.
+ if (!Stores.empty()) {
+ LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeStoreChains(R);
+ }
+
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
+
+ // Vectorize the index computations of getelementptr instructions. This
+ // is primarily intended to catch gather-like idioms ending at
+ // non-consecutive loads.
+ if (!GEPs.empty()) {
+ LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeGEPIndices(BB, R);
+ }
+ }
+
+ if (Changed) {
+ R.optimizeGatherSequence();
+ LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+ unsigned Idx) {
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
+ << "\n");
+ const unsigned Sz = R.getVectorElementSize(Chain[0]);
+ const unsigned MinVF = R.getMinVecRegSize() / Sz;
+ unsigned VF = Chain.size();
+
+ if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
+ << "\n");
+
+ R.buildTree(Chain);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+ // TODO: Handle orders of size less than number of elements in the vector.
+ if (Order && Order->size() == Chain.size()) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [Chain](const unsigned Idx) { return Chain[Idx]; });
+ R.buildTree(ReorderedOps);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ return false;
+ if (R.isLoadCombineCandidate())
+ return false;
+
+ R.computeMinimumValueSizes();
+
InstructionCost Cost = R.getTreeCost();
-
+
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n");
- if (Cost < -SLPCostThreshold) {
+ if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
-
- using namespace ore;
-
- R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
- cast<StoreInst>(Chain[0]))
- << "Stores SLP vectorized with cost " << NV("Cost", Cost)
- << " and with tree size "
- << NV("TreeSize", R.getTreeSize()));
-
- R.vectorizeTree();
- return true;
- }
-
- return false;
-}
-
-bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
- BoUpSLP &R) {
- // We may run into multiple chains that merge into a single chain. We mark the
- // stores that we vectorized so that we don't visit the same store twice.
- BoUpSLP::ValueSet VectorizedStores;
- bool Changed = false;
-
- int E = Stores.size();
- SmallBitVector Tails(E, false);
- SmallVector<int, 16> ConsecutiveChain(E, E + 1);
- int MaxIter = MaxStoreLookup.getValue();
- int IterCnt;
- auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
- &ConsecutiveChain](int K, int Idx) {
- if (IterCnt >= MaxIter)
- return true;
- ++IterCnt;
- if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
- return false;
-
- Tails.set(Idx);
- ConsecutiveChain[K] = Idx;
- return true;
- };
- // Do a quadratic search on all of the given stores in reverse order and find
- // all of the pairs of stores that follow each other.
- for (int Idx = E - 1; Idx >= 0; --Idx) {
- // If a store has multiple consecutive store candidates, search according
- // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
- // This is because pairing with the immediately succeeding or preceding
- // candidate usually creates the best chance to find an SLP vectorization
- // opportunity.
- const int MaxLookDepth = std::max(E - Idx, Idx + 1);
- IterCnt = 0;
- for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
- if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
- (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
- break;
- }
-
- // For stores that start but don't end a link in the chain:
- for (int Cnt = E; Cnt > 0; --Cnt) {
- int I = Cnt - 1;
- if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
- continue;
- // We found a store instr that starts a chain. Now follow the chain and try
- // to vectorize it.
- BoUpSLP::ValueList Operands;
- // Collect the chain into a list.
- while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
- Operands.push_back(Stores[I]);
- // Move to the next value in the chain.
- I = ConsecutiveChain[I];
- }
-
- // Skip chains whose element size does not evenly divide the vector register size.
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
+
+ using namespace ore;
+
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[0]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+ return true;
+ }
+
+ return false;
+}
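
A quick numeric check of the guard at the top of this routine, with hypothetical sizes: a 128-bit minimum vector register and 32-bit elements give MinVF = 4, so a two-store chain is rejected while a four-store chain is analyzed.

    #include <cassert>

    static bool isPow2(unsigned X) { return X != 0 && (X & (X - 1)) == 0; }

    int main() {
      const unsigned MinVecRegSizeBits = 128;  // assumed target minimum vector width
      const unsigned ElementSizeBits = 32;     // width from getVectorElementSize
      const unsigned MinVF = MinVecRegSizeBits / ElementSizeBits;  // 4

      auto worthAnalyzing = [&](unsigned VF) {
        return isPow2(ElementSizeBits) && isPow2(VF) && VF >= 2 && VF >= MinVF;
      };

      assert(!worthAnalyzing(2));  // too short to fill the 128-bit register
      assert(worthAnalyzing(4));   // exactly one vector register
      return 0;
    }
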
+
+bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ BoUpSLP &R) {
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ BoUpSLP::ValueSet VectorizedStores;
+ bool Changed = false;
+
+ int E = Stores.size();
+ SmallBitVector Tails(E, false);
+ SmallVector<int, 16> ConsecutiveChain(E, E + 1);
+ int MaxIter = MaxStoreLookup.getValue();
+ int IterCnt;
+ auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
+ &ConsecutiveChain](int K, int Idx) {
+ if (IterCnt >= MaxIter)
+ return true;
+ ++IterCnt;
+ if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+ return false;
+
+ Tails.set(Idx);
+ ConsecutiveChain[K] = Idx;
+ return true;
+ };
+ // Do a quadratic search on all of the given stores in reverse order and find
+ // all of the pairs of stores that follow each other.
+ for (int Idx = E - 1; Idx >= 0; --Idx) {
+ // If a store has multiple consecutive store candidates, search according
+ // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance to find an SLP vectorization
+ // opportunity.
+ const int MaxLookDepth = std::max(E - Idx, Idx + 1);
+ IterCnt = 0;
+ for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
+ if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
+ (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
+ break;
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (int Cnt = E; Cnt > 0; --Cnt) {
+ int I = Cnt - 1;
+ if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
+ continue;
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ BoUpSLP::ValueList Operands;
+ // Collect the chain into a list.
+ while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
+ Operands.push_back(Stores[I]);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ // Skip chains whose element size does not evenly divide the vector register size.
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Operands[0]);
- if (MaxVecRegSize % EltSize != 0)
- continue;
-
- unsigned MaxElts = MaxVecRegSize / EltSize;
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
- unsigned StartIdx = 0;
- for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- vectorizeStoreChain(Slice, R, Cnt)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
- }
- ++Cnt;
- }
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
- break;
- }
- }
-
- return Changed;
-}
-
-void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
- // Initialize the collections. We will make a single pass over the block.
- Stores.clear();
- GEPs.clear();
-
- // Visit the store and getelementptr instructions in BB and organize them in
- // Stores and GEPs according to the underlying objects of their pointer
- // operands.
- for (Instruction &I : *BB) {
- // Ignore store instructions that are volatile or have a pointer operand
- // that doesn't point to a scalar type.
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isSimple())
- continue;
- if (!isValidElementType(SI->getValueOperand()->getType()))
- continue;
+ if (MaxVecRegSize % EltSize != 0)
+ continue;
+
+ unsigned MaxElts = MaxVecRegSize / EltSize;
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ unsigned StartIdx = 0;
+ for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+ ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
+ if (!VectorizedStores.count(Slice.front()) &&
+ !VectorizedStores.count(Slice.back()) &&
+ vectorizeStoreChain(Slice, R, Cnt)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ Changed = true;
+ // If we vectorized initial block, no need to try to vectorize it
+ // again.
+ if (Cnt == StartIdx)
+ StartIdx += Size;
+ Cnt += Size;
+ continue;
+ }
+ ++Cnt;
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= Operands.size())
+ break;
+ }
+ }
+
+ return Changed;
+}
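
The pairing search above visits candidates in the order Idx-1, Idx+1, Idx-2, Idx+2, ... so that the closest stores are tried first. A small sketch of just that probe order (toy predicate; the real loop also enforces the MaxIter budget and the consecutive-access check):

    #include <algorithm>
    #include <cstdio>
    #include <functional>

    // Visit candidate indices around Idx in the order Idx-1, Idx+1, Idx-2, ...
    // stopping at the first candidate the predicate accepts.
    static void probeNeighbors(int Idx, int Count,
                               const std::function<bool(int)> &Probe) {
      const int MaxLookDepth = std::max(Count - Idx, Idx + 1);
      for (int Offset = 1; Offset < MaxLookDepth; ++Offset) {
        if (Idx - Offset >= 0 && Probe(Idx - Offset))
          return;
        if (Idx + Offset < Count && Probe(Idx + Offset))
          return;
      }
    }

    int main() {
      // Prints "3 5 2 6 1 7 0" for Idx = 4 in an 8-element range.
      probeNeighbors(4, 8, [](int Candidate) {
        std::printf("%d ", Candidate);
        return false;  // never accept, so the full order is visible
      });
      std::printf("\n");
      return 0;
    }
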
+
+void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
+ // Initialize the collections. We will make a single pass over the block.
+ Stores.clear();
+ GEPs.clear();
+
+ // Visit the store and getelementptr instructions in BB and organize them in
+ // Stores and GEPs according to the underlying objects of their pointer
+ // operands.
+ for (Instruction &I : *BB) {
+ // Ignore store instructions that are volatile or have a pointer operand
+ // that doesn't point to a scalar type.
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+ if (!isValidElementType(SI->getValueOperand()->getType()))
+ continue;
Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
- }
-
- // Ignore getelementptr instructions that have more than one index, a
- // constant index, or a pointer operand that doesn't point to a scalar
- // type.
- else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- auto Idx = GEP->idx_begin()->get();
- if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
- continue;
- if (!isValidElementType(Idx->getType()))
- continue;
- if (GEP->getType()->isVectorTy())
- continue;
- GEPs[GEP->getPointerOperand()].push_back(GEP);
- }
- }
-}
-
-bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
- if (!A || !B)
- return false;
- Value *VL[] = {A, B};
- return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
-}
-
-bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool AllowReorder,
- ArrayRef<Value *> InsertUses) {
- if (VL.size() < 2)
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
- << VL.size() << ".\n");
-
- // Check that all of the parts are instructions of the same type;
- // we permit an alternate opcode via InstructionsState.
- InstructionsState S = getSameOpcode(VL);
- if (!S.getOpcode())
- return false;
-
- Instruction *I0 = cast<Instruction>(S.OpValue);
- // Make sure invalid types (including vector type) are rejected before
- // determining vectorization factor for scalar instructions.
- for (Value *V : VL) {
- Type *Ty = V->getType();
- if (!isValidElementType(Ty)) {
- // NOTE: the following will give the user an internal LLVM type name, which
- // may not be useful.
- R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
- Ty->print(rso);
- return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
- << "Cannot SLP vectorize list: type "
- << rso.str() + " is unsupported by vectorizer";
- });
- return false;
- }
- }
-
- unsigned Sz = R.getVectorElementSize(I0);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ }
+
+ // Ignore getelementptr instructions that have more than one index, a
+ // constant index, or a pointer operand that doesn't point to a scalar
+ // type.
+ else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ auto Idx = GEP->idx_begin()->get();
+ if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ continue;
+ if (!isValidElementType(Idx->getType()))
+ continue;
+ if (GEP->getType()->isVectorTy())
+ continue;
+ GEPs[GEP->getPointerOperand()].push_back(GEP);
+ }
+ }
+}
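
The collection above buckets seed stores by the underlying object of their pointer operand, so that later chain formation only compares stores into the same base object. A minimal sketch of that bucketing with a stand-in for getUnderlyingObject:

    #include <string>
    #include <unordered_map>
    #include <vector>

    struct ToyStore {
      std::string Base;  // stands in for getUnderlyingObject(pointer operand)
      int Offset;        // byte offset within the object
    };

    using SeedMap = std::unordered_map<std::string, std::vector<const ToyStore *>>;

    // One pass over the "block"; volatile or oddly typed stores would be
    // skipped here, as in the routine above.
    SeedMap collectSeeds(const std::vector<ToyStore> &Block) {
      SeedMap Stores;
      for (const ToyStore &S : Block)
        Stores[S.Base].push_back(&S);
      return Stores;
    }
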
+
+bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+ if (!A || !B)
+ return false;
+ Value *VL[] = {A, B};
+ return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
+}
+
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ bool AllowReorder,
+ ArrayRef<Value *> InsertUses) {
+ if (VL.size() < 2)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+ << VL.size() << ".\n");
+
+ // Check that all of the parts are instructions of the same type;
+ // we permit an alternate opcode via InstructionsState.
+ InstructionsState S = getSameOpcode(VL);
+ if (!S.getOpcode())
+ return false;
+
+ Instruction *I0 = cast<Instruction>(S.OpValue);
+ // Make sure invalid types (including vector type) are rejected before
+ // determining vectorization factor for scalar instructions.
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
+ if (!isValidElementType(Ty)) {
+ // NOTE: the following will give the user an internal LLVM type name, which
+ // may not be useful.
+ R.getORE()->emit([&]() {
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
+ });
+ return false;
+ }
+ }
+
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
- if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
- }
-
- bool Changed = false;
- bool CandidateFound = false;
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
+
+ bool Changed = false;
+ bool CandidateFound = false;
InstructionCost MinCost = SLPCostThreshold.getValue();
-
- bool CompensateUseCost =
- !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
- return V && isa<InsertElementInst>(V);
- });
- assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
- "Each scalar expected to have an associated InsertElement user.");
-
- unsigned NextInst = 0, MaxInst = VL.size();
- for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
- // No actual vectorization should happen if the number of parts is the same
- // as the provided vectorization factor (i.e. the scalar type is used for
- // vector code during codegen).
- auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
- if (TTI->getNumberOfParts(VecTy) == VF)
- continue;
- for (unsigned I = NextInst; I < MaxInst; ++I) {
- unsigned OpsWidth = 0;
-
- if (I + VF > MaxInst)
- OpsWidth = MaxInst - I;
- else
- OpsWidth = VF;
-
- if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
- break;
-
- ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
- // Check that a previous iteration of this loop did not delete the Value.
- if (llvm::any_of(Ops, [&R](Value *V) {
- auto *I = dyn_cast<Instruction>(V);
- return I && R.isDeleted(I);
- }))
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
-
- R.buildTree(Ops);
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- // TODO: check if we can allow reordering for more cases.
- if (AllowReorder && Order) {
- // TODO: reorder tree nodes without tree rebuilding.
- // Conceptually, there is nothing actually preventing us from trying to
- // reorder a larger list. In fact, we do exactly this when vectorizing
- // reductions. However, at this point, we only expect to get here when
- // there are exactly two operations.
- assert(Ops.size() == 2);
- Value *ReorderedOps[] = {Ops[1], Ops[0]};
- R.buildTree(ReorderedOps, None);
- }
- if (R.isTreeTinyAndNotFullyVectorizable())
- continue;
-
- R.computeMinimumValueSizes();
+
+ bool CompensateUseCost =
+ !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+ return V && isa<InsertElementInst>(V);
+ });
+ assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+ "Each scalar expected to have an associated InsertElement user.");
+
+ unsigned NextInst = 0, MaxInst = VL.size();
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
+ // No actual vectorization should happen if the number of parts is the same
+ // as the provided vectorization factor (i.e. the scalar type is used for
+ // vector code during codegen).
+ auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
+ if (TTI->getNumberOfParts(VecTy) == VF)
+ continue;
+ for (unsigned I = NextInst; I < MaxInst; ++I) {
+ unsigned OpsWidth = 0;
+
+ if (I + VF > MaxInst)
+ OpsWidth = MaxInst - I;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
+
+ ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (llvm::any_of(Ops, [&R](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && R.isDeleted(I);
+ }))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+
+ R.buildTree(Ops);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+ // TODO: check if we can allow reordering for more cases.
+ if (AllowReorder && Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ // Conceptually, there is nothing actually preventing us from trying to
+ // reorder a larger list. In fact, we do exactly this when vectorizing
+ // reductions. However, at this point, we only expect to get here when
+ // there are exactly two operations.
+ assert(Ops.size() == 2);
+ Value *ReorderedOps[] = {Ops[1], Ops[0]};
+ R.buildTree(ReorderedOps, None);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
+ R.computeMinimumValueSizes();
InstructionCost Cost = R.getTreeCost();
- CandidateFound = true;
- if (CompensateUseCost) {
- // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
- // rather than sum of single inserts as the latter may overestimate
- // cost. This work should imply improving cost estimation for extracts
- // that are added for external (to the vectorization tree) users, i.e. that
- // part should also switch to the same interface.
- // For example, the following case is projected code after SLP:
- // %4 = extractelement <4 x i64> %3, i32 0
+ CandidateFound = true;
+ if (CompensateUseCost) {
+ // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
+ // rather than sum of single inserts as the latter may overestimate
+ // cost. This work should imply improving cost estimation for extracts
+ // that are added for external (to the vectorization tree) users, i.e. that
+ // part should also switch to the same interface.
+ // For example, the following case is projected code after SLP:
+ // %4 = extractelement <4 x i64> %3, i32 0
// %v0 = insertelement <4 x i64> poison, i64 %4, i32 0
- // %5 = extractelement <4 x i64> %3, i32 1
- // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
- // %6 = extractelement <4 x i64> %3, i32 2
- // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
- // %7 = extractelement <4 x i64> %3, i32 3
- // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
- //
- // The extracts here are added by SLP to feed the users (the inserts) of
- // the original scalars and contribute to "ExtractCost" during cost
- // evaluation. The inserts in turn form a sequence that builds an
- // aggregate, which is detected by the findBuildAggregate routine.
- // SLP assumes that such a sequence will be optimized away later (by
- // InstCombine), so it tries to compensate ExtractCost with the cost of
- // the insert sequence.
- // The current per-element cost calculation is not quite accurate and
- // tends to bias the decision toward vectorization.
- // Switching to the TTI interface might help a bit.
- // Alternative solution could be pattern-match to detect a no-op or
- // shuffle.
+ // %5 = extractelement <4 x i64> %3, i32 1
+ // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+ // %6 = extractelement <4 x i64> %3, i32 2
+ // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+ // %7 = extractelement <4 x i64> %3, i32 3
+ // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+ //
+ // The extracts here are added by SLP to feed the users (the inserts) of
+ // the original scalars and contribute to "ExtractCost" during cost
+ // evaluation. The inserts in turn form a sequence that builds an
+ // aggregate, which is detected by the findBuildAggregate routine.
+ // SLP assumes that such a sequence will be optimized away later (by
+ // InstCombine), so it tries to compensate ExtractCost with the cost of
+ // the insert sequence.
+ // The current per-element cost calculation is not quite accurate and
+ // tends to bias the decision toward vectorization.
+ // Switching to the TTI interface might help a bit.
+ // Alternative solution could be pattern-match to detect a no-op or
+ // shuffle.
InstructionCost UserCost = 0;
- for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
- auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
- UserCost += TTI->getVectorInstrCost(
- Instruction::InsertElement, IE->getType(), CI->getZExtValue());
- }
- LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
- << ".\n");
- Cost -= UserCost;
- }
-
- MinCost = std::min(MinCost, Cost);
-
- if (Cost < -SLPCostThreshold) {
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
- R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
- cast<Instruction>(Ops[0]))
- << "SLP vectorized with cost " << ore::NV("Cost", Cost)
- << " and with tree size "
- << ore::NV("TreeSize", R.getTreeSize()));
-
- R.vectorizeTree();
- // Move to the next bundle.
- I += VF - 1;
- NextInst = I + 1;
- Changed = true;
- }
- }
- }
-
- if (!Changed && CandidateFound) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
- << "List vectorization was possible but not beneficial with cost "
- << ore::NV("Cost", MinCost) << " >= "
- << ore::NV("Treshold", -SLPCostThreshold);
- });
- } else if (!Changed) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
- << "Cannot SLP vectorize list: vectorization was impossible"
- << " with available vectorization factors";
- });
- }
- return Changed;
-}
-
-bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
- if (!I)
- return false;
-
- if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
- return false;
-
- Value *P = I->getParent();
-
- // Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
- return false;
-
- // Try to vectorize the pair of operands.
- if (tryToVectorizePair(Op0, Op1, R))
- return true;
-
- auto *A = dyn_cast<BinaryOperator>(Op0);
- auto *B = dyn_cast<BinaryOperator>(Op1);
- // Try to skip B.
- if (B && B->hasOneUse()) {
- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
- return true;
- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
- return true;
- }
-
- // Try to skip A.
- if (A && A->hasOneUse()) {
- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
- return true;
- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
- return true;
- }
- return false;
-}
-
-namespace {
-
-/// Model horizontal reductions.
-///
+ for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+ auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+ UserCost += TTI->getVectorInstrCost(
+ Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+ << ".\n");
+ Cost -= UserCost;
+ }
+
+ MinCost = std::min(MinCost, Cost);
+
+ if (Cost < -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+ // Move to the next bundle.
+ I += VF - 1;
+ NextInst = I + 1;
+ Changed = true;
+ }
+ }
+ }
+
+ if (!Changed && CandidateFound) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
+ });
+ } else if (!Changed) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
+ });
+ }
+ return Changed;
+}
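
To make the insert-cost compensation above concrete, a toy calculation with hypothetical numbers (the real per-lane costs come from TTI::getVectorInstrCost): the tree cost is charged for the extracts feeding external insertelement users, so the cost of that insert sequence is subtracted before the threshold comparison.

    #include <cassert>
    #include <vector>

    int main() {
      // Pretend the tree cost came out slightly positive because of the extracts
      // added for the external insertelement users.
      int TreeCost = 2;
      const int SLPCostThreshold = 0;  // vectorize when cost < -threshold

      // Hypothetical per-lane cost of each insertelement fed by the tree.
      const std::vector<int> InsertLaneCost = {1, 1, 1, 1};
      int UserCost = 0;
      for (int C : InsertLaneCost)
        UserCost += C;

      TreeCost -= UserCost;  // 2 - 4 = -2
      assert(TreeCost < -SLPCostThreshold && "compensation tips the decision");
      return 0;
    }
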
+
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ return false;
+
+ Value *P = I->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
+ // Try to vectorize the pair of operands.
+ if (tryToVectorizePair(Op0, Op1, R))
+ return true;
+
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
+ // Try to skip B.
+ if (B && B->hasOneUse()) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
+ return true;
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
+ return true;
+ }
+
+ // Try to skip A.
+ if (A && A->hasOneUse()) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
+ return true;
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
+ return true;
+ }
+ return false;
+}
+
+namespace {
+
+/// Model horizontal reductions.
+///
/// A horizontal reduction is a tree of reduction instructions that has values
/// that can be put into a vector as its leaves. For example:
-///
-/// mul mul mul mul
-/// \ / \ /
-/// + +
-/// \ /
-/// +
+///
+/// mul mul mul mul
+/// \ / \ /
+/// + +
+/// \ /
+/// +
/// This tree has "mul" as its leaf values and "+" as its reduction
/// instructions. A reduction can feed into a store or a binary operation
-/// feeding a phi.
-/// ...
-/// \ /
-/// +
-/// |
-/// phi +=
-///
-/// Or:
-/// ...
-/// \ /
-/// +
-/// |
-/// *p =
-///
-class HorizontalReduction {
- using ReductionOpsType = SmallVector<Value *, 16>;
- using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
+/// feeding a phi.
+/// ...
+/// \ /
+/// +
+/// |
+/// phi +=
+///
+/// Or:
+/// ...
+/// \ /
+/// +
+/// |
+/// *p =
+///
+class HorizontalReduction {
+ using ReductionOpsType = SmallVector<Value *, 16>;
+ using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
- SmallVector<Value *, 32> ReducedVals;
- // Use map vector to make stable output.
- MapVector<Instruction *, Value *> ExtraArgs;
+ SmallVector<Value *, 32> ReducedVals;
+ // Use map vector to make stable output.
+ MapVector<Instruction *, Value *> ExtraArgs;
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
-
+
/// Checks if instruction is associative and can be vectorized.
static bool isVectorizable(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
return false;
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
return true;
-
+
if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
// FP min/max are associative except for NaN and -0.0. We do not
// have to rule out -0.0 here because the intrinsic semantics do not
// specify a fixed result for it.
return I->getFastMathFlags().noNaNs();
- }
-
+ }
+
return I->isAssociative();
}
-
+
/// Checks if the ParentStackElem.first should be marked as a reduction
/// operation with an extra argument or as extra argument itself.
void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
@@ -6446,9 +6446,9 @@ class HorizontalReduction {
// We ran into something like:
// ParentStackElem.first += ... + ExtraArg + ...
ExtraArgs[ParentStackElem.first] = ExtraArg;
- }
+ }
}
-
+
/// Creates reduction operation with the current opcode.
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name) {
@@ -6467,28 +6467,28 @@ class HorizontalReduction {
return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
case RecurKind::FMin:
return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
-
+
case RecurKind::SMax: {
Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
case RecurKind::SMin: {
Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
case RecurKind::UMax: {
Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
case RecurKind::UMin: {
Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
default:
llvm_unreachable("Unknown reduction operation.");
- }
+ }
}
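
The integer min/max cases above are emitted as a compare followed by a select. Their semantics, written out in plain C++ for reference (a sketch of the resulting values only, not of the IRBuilder calls):

    #include <cassert>
    #include <cstdint>

    // select(cmp, LHS, RHS) shapes produced above, for 32-bit integers.
    static int32_t smax(int32_t A, int32_t B) { return A > B ? A : B; }
    static int32_t smin(int32_t A, int32_t B) { return A < B ? A : B; }
    static uint32_t umax(uint32_t A, uint32_t B) { return A > B ? A : B; }
    static uint32_t umin(uint32_t A, uint32_t B) { return A < B ? A : B; }

    int main() {
      assert(smax(-1, 1) == 1);
      assert(smin(-1, 1) == -1);
      assert(umax(0xFFFFFFFFu, 1u) == 0xFFFFFFFFu);  // unsigned order differs from signed
      assert(umin(0xFFFFFFFFu, 1u) == 1u);
      return 0;
    }
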
-
+
/// Creates reduction operation with the current opcode with the IR flags
/// from \p ReductionOps.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
@@ -6500,7 +6500,7 @@ class HorizontalReduction {
propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
propagateIRFlags(Op, ReductionOps[1]);
return Op;
- }
+ }
propagateIRFlags(Op, ReductionOps[0]);
return Op;
}
@@ -6513,12 +6513,12 @@ class HorizontalReduction {
if (auto *Sel = dyn_cast<SelectInst>(Op)) {
propagateIRFlags(Sel->getCondition(),
cast<SelectInst>(I)->getCondition());
- }
- }
+ }
+ }
propagateIRFlags(Op, I);
return Op;
}
-
+
static RecurKind getRdxKind(Instruction *I) {
assert(I && "Expected instruction for reduction matching");
TargetTransformInfo::ReductionFlags RdxFlags;
@@ -6536,12 +6536,12 @@ class HorizontalReduction {
return RecurKind::FAdd;
if (match(I, m_FMul(m_Value(), m_Value())))
return RecurKind::FMul;
-
+
if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
return RecurKind::FMax;
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
return RecurKind::FMin;
-
+
if (match(I, m_SMax(m_Value(), m_Value())))
return RecurKind::SMax;
if (match(I, m_SMin(m_Value(), m_Value())))
@@ -6550,7 +6550,7 @@ class HorizontalReduction {
return RecurKind::UMax;
if (match(I, m_UMin(m_Value(), m_Value())))
return RecurKind::UMin;
-
+
if (auto *Select = dyn_cast<SelectInst>(I)) {
// Try harder: look for min/max pattern based on instructions producing
// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
@@ -6566,11 +6566,11 @@ class HorizontalReduction {
CmpInst::Predicate Pred;
Instruction *L1;
Instruction *L2;
-
+
Value *LHS = Select->getTrueValue();
Value *RHS = Select->getFalseValue();
Value *Cond = Select->getCondition();
-
+
// TODO: Support inverse predicates.
if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
if (!isa<ExtractElementInst>(RHS) ||
@@ -6587,8 +6587,8 @@ class HorizontalReduction {
!L1->isIdenticalTo(cast<Instruction>(LHS)) ||
!L2->isIdenticalTo(cast<Instruction>(RHS)))
return RecurKind::None;
- }
-
+ }
+
TargetTransformInfo::ReductionFlags RdxFlags;
switch (Pred) {
default:
@@ -6605,16 +6605,16 @@ class HorizontalReduction {
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
return RecurKind::UMin;
- }
- }
+ }
+ }
return RecurKind::None;
}
-
+
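Because the integer min/max kinds are matched through the compare-plus-select idiom handled above, a hypothetical scalar source that produces such icmp/select pairs could look like this sketch (illustrative only, not part of the patch):

// Each conditional below typically lowers to an icmp sgt + select pair,
// which the matcher above maps to RecurKind::SMax.
int max4(const int *a) {
  int m = a[0];
  m = a[1] > m ? a[1] : m;
  m = a[2] > m ? a[2] : m;
  m = a[3] > m ? a[3] : m;
  return m;
}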
/// Return true if this operation is a cmp+select idiom.
static bool isCmpSel(RecurKind Kind) {
return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind);
}
-
+
/// Get the index of the first operand.
static unsigned getFirstOperandIndex(RecurKind Kind) {
// We allow calling this before 'Kind' is set, so handle that specially.
@@ -6622,12 +6622,12 @@ class HorizontalReduction {
return 0;
return isCmpSel(Kind) ? 1 : 0;
}
-
+
/// Total number of operands in the reduction operation.
static unsigned getNumberOfOperands(RecurKind Kind) {
return isCmpSel(Kind) ? 3 : 2;
}
-
+
/// Checks if the instruction is in basic block \p BB.
/// For a min/max reduction check that both compare and select are in \p BB.
static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB,
@@ -6635,10 +6635,10 @@ class HorizontalReduction {
if (IsRedOp && isCmpSel(Kind)) {
auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
return I->getParent() == BB && Cmp && Cmp->getParent() == BB;
- }
+ }
return I->getParent() == BB;
- }
-
+ }
+
/// Expected number of uses for reduction operations/reduced values.
static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I,
bool IsReductionOp) {
@@ -6648,11 +6648,11 @@ class HorizontalReduction {
return I->hasNUses(2) &&
(!IsReductionOp ||
cast<SelectInst>(I)->getCondition()->hasOneUse());
-
+
// Arithmetic reduction operation must be used once only.
return I->hasOneUse();
}
-
+
/// Initializes the list of reduction operations.
void initReductionOps(RecurKind Kind) {
if (isCmpSel(Kind))
@@ -6660,7 +6660,7 @@ class HorizontalReduction {
else
ReductionOps.assign(1, ReductionOpsType());
}
-
+
/// Add all reduction operations for the reduction instruction \p I.
void addReductionOps(RecurKind Kind, Instruction *I) {
assert(Kind != RecurKind::None && "Expected reduction operation.");
@@ -6669,9 +6669,9 @@ class HorizontalReduction {
ReductionOps[1].emplace_back(I);
} else {
ReductionOps[0].emplace_back(I);
- }
- }
-
+ }
+ }
+
static Value *getLHS(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
return nullptr;
@@ -6683,90 +6683,90 @@ class HorizontalReduction {
return I->getOperand(getFirstOperandIndex(Kind) + 1);
}
-public:
- HorizontalReduction() = default;
-
- /// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
- assert((!Phi || is_contained(Phi->operands(), B)) &&
+public:
+ HorizontalReduction() = default;
+
+ /// Try to find a reduction tree.
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
+ assert((!Phi || is_contained(Phi->operands(), B)) &&
"Phi needs to use the binary operator");
-
+
RdxKind = getRdxKind(B);
-
- // We could have a initial reductions that is not an add.
- // r *= v1 + v2 + v3 + v4
- // In such a case start looking for a tree rooted in the first '+'.
- if (Phi) {
+
+    // We could have an initial reduction that is not an add.
+    // r *= v1 + v2 + v3 + v4
+    // In such a case, start looking for a tree rooted in the first '+'.

+ if (Phi) {
if (getLHS(RdxKind, B) == Phi) {
- Phi = nullptr;
+ Phi = nullptr;
B = dyn_cast<Instruction>(getRHS(RdxKind, B));
if (!B)
return false;
RdxKind = getRdxKind(B);
} else if (getRHS(RdxKind, B) == Phi) {
- Phi = nullptr;
+ Phi = nullptr;
B = dyn_cast<Instruction>(getLHS(RdxKind, B));
if (!B)
return false;
RdxKind = getRdxKind(B);
- }
- }
-
+ }
+ }
+
if (!isVectorizable(RdxKind, B))
- return false;
-
+ return false;
+
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
- Type *Ty = B->getType();
+ Type *Ty = B->getType();
if (!isValidElementType(Ty) || Ty->isPointerTy())
- return false;
-
- ReductionRoot = B;
-
+ return false;
+
+ ReductionRoot = B;
+
// The opcode for leaf values that we perform a reduction on.
// For example: load(x) + load(y) + load(z) + fptoui(w)
// The leaf opcode for 'w' does not match, so we don't include it as a
// potential candidate for the reduction.
unsigned LeafOpcode = 0;
- // Post order traverse the reduction tree starting at B. We only handle true
- // trees containing only binary operators.
- SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
+ // Post order traverse the reduction tree starting at B. We only handle true
+ // trees containing only binary operators.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind)));
initReductionOps(RdxKind);
- while (!Stack.empty()) {
- Instruction *TreeN = Stack.back().first;
+ while (!Stack.empty()) {
+ Instruction *TreeN = Stack.back().first;
unsigned EdgeToVisit = Stack.back().second++;
const RecurKind TreeRdxKind = getRdxKind(TreeN);
bool IsReducedValue = TreeRdxKind != RdxKind;
-
+
// Postorder visit.
if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) {
- if (IsReducedValue)
- ReducedVals.push_back(TreeN);
- else {
- auto I = ExtraArgs.find(TreeN);
- if (I != ExtraArgs.end() && !I->second) {
- // Check if TreeN is an extra argument of its parent operation.
- if (Stack.size() <= 1) {
- // TreeN can't be an extra argument as it is a root reduction
- // operation.
- return false;
- }
- // Yes, TreeN is an extra argument, do not add it to a list of
- // reduction operations.
- // Stack[Stack.size() - 2] always points to the parent operation.
- markExtraArg(Stack[Stack.size() - 2], TreeN);
- ExtraArgs.erase(TreeN);
- } else
+ if (IsReducedValue)
+ ReducedVals.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
addReductionOps(RdxKind, TreeN);
- }
- // Retract.
- Stack.pop_back();
- continue;
- }
-
- // Visit left or right.
+ }
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit left or right.
Value *EdgeVal = TreeN->getOperand(EdgeToVisit);
auto *I = dyn_cast<Instruction>(EdgeVal);
if (!I) {
@@ -6791,31 +6791,31 @@ public:
if (IsRdxInst) {
// We need to be able to reassociate the reduction operations.
if (!isVectorizable(EdgeRdxKind, I)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- }
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
} else if (!LeafOpcode) {
LeafOpcode = I->getOpcode();
- }
+ }
Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind)));
continue;
- }
+ }
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
- }
- return true;
- }
-
+ }
+ return true;
+ }
+
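As a sketch of the extra-argument handling referenced above (illustrative only; the function is hypothetical): a leaf that does not match the other reduced values is recorded in ExtraArgs instead of being vectorized, and is folded back into the result later.

// The loads of a[0..3] become the reduced values; the parameter 'x' does not
// match their opcode, so matchAssociativeReduction records it as an extra
// argument of its parent '+' and tryToReduce re-attaches it as "op.extra".
int sum_plus_extra(const int *a, int x) {
  return a[0] + a[1] + x + a[2] + a[3];
}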
/// Attempt to vectorize the tree found by matchAssociativeReduction.
- bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < 4)
- return false;
-
+ // vectors and rely on the backend to split them to legal sizes.
+ unsigned NumReducedVals = ReducedVals.size();
+ if (NumReducedVals < 4)
+ return false;
+
// Intersect the fast-math-flags from all reduction operations.
FastMathFlags RdxFMF;
RdxFMF.set();
@@ -6825,33 +6825,33 @@ public:
RdxFMF &= FPMO->getFastMathFlags();
}
}
-
- IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
+
+ IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
Builder.setFastMathFlags(RdxFMF);
-
- BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
- // to use it.
+ // to use it.
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
- assert(Pair.first && "DebugLoc must be set.");
- ExternallyUsedValues[Pair.second].push_back(Pair.first);
- }
-
- // The compare instruction of a min/max is the insertion point for new
- // instructions and may be replaced with a new compare instruction.
- auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
- assert(isa<SelectInst>(RdxRootInst) &&
- "Expected min/max reduction to have select root instruction");
- Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
- assert(isa<Instruction>(ScalarCond) &&
- "Expected min/max reduction to have compare condition");
- return cast<Instruction>(ScalarCond);
- };
-
- // The reduction root is used as the insertion point for new instructions,
- // so set it as externally used to prevent it from being deleted.
- ExternallyUsedValues[ReductionRoot];
- SmallVector<Value *, 16> IgnoreList;
+ assert(Pair.first && "DebugLoc must be set.");
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ }
+
+ // The compare instruction of a min/max is the insertion point for new
+ // instructions and may be replaced with a new compare instruction.
+ auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+ assert(isa<SelectInst>(RdxRootInst) &&
+ "Expected min/max reduction to have select root instruction");
+ Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
+ assert(isa<Instruction>(ScalarCond) &&
+ "Expected min/max reduction to have compare condition");
+ return cast<Instruction>(ScalarCond);
+ };
+
+ // The reduction root is used as the insertion point for new instructions,
+ // so set it as externally used to prevent it from being deleted.
+ ExternallyUsedValues[ReductionRoot];
+ SmallVector<Value *, 16> IgnoreList;
for (ReductionOpsType &RdxOp : ReductionOps)
IgnoreList.append(RdxOp.begin(), RdxOp.end());
@@ -6886,28 +6886,28 @@ public:
Value *VectorizedTree = nullptr;
unsigned i = 0;
- while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+ V.buildTree(VL, ExternallyUsedValues, IgnoreList);
+ Optional<ArrayRef<unsigned>> Order = V.bestOrder();
if (Order) {
assert(Order->size() == VL.size() &&
"Order size must be the same as number of vectorized "
"instructions.");
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(VL.size());
- llvm::transform(*Order, ReorderedOps.begin(),
- [VL](const unsigned Idx) { return VL[Idx]; });
- V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
- }
- if (V.isTreeTinyAndNotFullyVectorizable())
- break;
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(VL.size());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
+ V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
+ }
+ if (V.isTreeTinyAndNotFullyVectorizable())
+ break;
if (V.isLoadCombineReductionCandidate(RdxKind))
- break;
-
- V.computeMinimumValueSizes();
-
- // Estimate cost.
+ break;
+
+ V.computeMinimumValueSizes();
+
+ // Estimate cost.
InstructionCost TreeCost = V.getTreeCost();
InstructionCost ReductionCost =
getReductionCost(TTI, ReducedVals[i], ReduxWidth);
@@ -6916,7 +6916,7 @@ public:
LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
return false;
}
- if (Cost >= -SLPCostThreshold) {
+ if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
cast<Instruction>(VL[0]))
@@ -6926,91 +6926,91 @@ public:
<< ore::NV("Threshold", -SLPCostThreshold);
});
break;
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
- << Cost << ". (HorRdx)\n");
- V.getORE()->emit([&]() {
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
+ V.getORE()->emit([&]() {
return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
cast<Instruction>(VL[0]))
<< "Vectorized horizontal reduction with cost "
<< ore::NV("Cost", Cost) << " and with tree size "
<< ore::NV("TreeSize", V.getTreeSize());
- });
-
- // Vectorize a tree.
- DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
-
+ });
+
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+
// Emit a reduction. If the root is a select (min/max idiom), the insert
- // point is the compare condition of that select.
- Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ // point is the compare condition of that select.
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
if (isCmpSel(RdxKind))
- Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
- else
- Builder.SetInsertPoint(RdxRootInst);
-
- Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
+ else
+ Builder.SetInsertPoint(RdxRootInst);
+
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (!VectorizedTree) {
// Initialize the final value in the reduction.
VectorizedTree = ReducedSubTree;
} else {
// Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(Loc);
+ Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
ReducedSubTree, "op.rdx", ReductionOps);
}
- i += ReduxWidth;
- ReduxWidth = PowerOf2Floor(NumReducedVals - i);
- }
-
- if (VectorizedTree) {
- // Finish the reduction.
- for (; i < NumReducedVals; ++i) {
- auto *I = cast<Instruction>(ReducedVals[i]);
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
+ }
+
+ if (VectorizedTree) {
+ // Finish the reduction.
+ for (; i < NumReducedVals; ++i) {
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
VectorizedTree =
createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
- }
- for (auto &Pair : ExternallyUsedValues) {
- // Add each externally used value to the final reduction.
- for (auto *I : Pair.second) {
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
Pair.first, "op.extra", I);
- }
- }
-
- // Update users. For a min/max reduction that ends with a compare and
- // select, we also have to RAUW for the compare instruction feeding the
- // reduction root. That's because the original compare may have extra uses
- // besides the final select of the reduction.
+ }
+ }
+
+ // Update users. For a min/max reduction that ends with a compare and
+ // select, we also have to RAUW for the compare instruction feeding the
+ // reduction root. That's because the original compare may have extra uses
+ // besides the final select of the reduction.
if (isCmpSel(RdxKind)) {
- if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
- Instruction *ScalarCmp =
- getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
- ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
- }
- }
- ReductionRoot->replaceAllUsesWith(VectorizedTree);
-
- // Mark all scalar reduction ops for deletion, they are replaced by the
- // vector reductions.
- V.eraseInstructions(IgnoreList);
- }
- return VectorizedTree != nullptr;
- }
-
+ if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
+ Instruction *ScalarCmp =
+ getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
+ ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
+ }
+ }
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
+
+ // Mark all scalar reduction ops for deletion, they are replaced by the
+ // vector reductions.
+ V.eraseInstructions(IgnoreList);
+ }
+ return VectorizedTree != nullptr;
+ }
+
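The power-of-two width stepping used by tryToReduce can be illustrated with a small standalone sketch (hypothetical code that only mirrors the loop bounds above; it does not call LLVM, and the initial width is assumed here to be the largest power of two not exceeding the number of reduced values):

#include <cstdio>

// Largest power of two <= N, as llvm::PowerOf2Floor would return (0 for 0).
static unsigned powerOf2Floor(unsigned N) {
  if (N == 0)
    return 0;
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

int main() {
  // With 7 reduced values: one width-4 vector step is emitted, the width then
  // drops to 2, which the loop rejects, and 3 values remain for the scalar tail.
  unsigned NumReducedVals = 7;
  unsigned i = 0;
  unsigned ReduxWidth = powerOf2Floor(NumReducedVals);
  while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
    std::printf("vector step of width %u at offset %u\n", ReduxWidth, i);
    i += ReduxWidth;
    ReduxWidth = powerOf2Floor(NumReducedVals - i);
  }
  std::printf("%u value(s) left for the scalar tail\n", NumReducedVals - i);
  return 0;
}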
unsigned numReductionValues() const { return ReducedVals.size(); }
-
-private:
- /// Calculate the cost of a reduction.
+
+private:
+ /// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
Value *FirstReducedVal,
unsigned ReduxWidth) {
- Type *ScalarTy = FirstReducedVal->getType();
+ Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost, ScalarCost;
switch (RdxKind) {
@@ -7025,7 +7025,7 @@ private:
VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
/*IsPairwiseForm=*/false);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
- break;
+ break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
@@ -7037,8 +7037,8 @@ private:
TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy));
- break;
- }
+ break;
+ }
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
@@ -7051,36 +7051,36 @@ private:
/*IsPairwiseForm=*/false, IsUnsigned);
ScalarCost =
TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
- break;
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ break;
}
default:
- llvm_unreachable("Expected arithmetic or min/max reduction operation");
- }
-
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
+
// Scalar cost is repeated for N-1 elements.
ScalarCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
- << " for reduction that starts with " << *FirstReducedVal
+ << " for reduction that starts with " << *FirstReducedVal
<< " (It is a splitting reduction)\n");
return VectorCost - ScalarCost;
- }
-
- /// Emit a horizontal reduction of the vectorized value.
- Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, const TargetTransformInfo *TTI) {
- assert(VectorizedValue && "Need to have a vectorized tree node");
- assert(isPowerOf2_32(ReduxWidth) &&
- "We only handle power-of-two reductions for now");
-
+ }
+
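A rough numeric illustration of the comparison returned above (the costs are invented for the sketch and do not come from any real TTI cost model):

#include <cstdio>

int main() {
  // Hypothetical numbers: a width-8 vector reduction assumed to cost 5,
  // against the scalar chain it replaces (ReduxWidth - 1 scalar ops of cost 1).
  int ReduxWidth = 8;
  int VectorCost = 5;
  int ScalarCost = 1 * (ReduxWidth - 1);
  // A negative difference means the vector form is modelled as cheaper.
  std::printf("VectorCost - ScalarCost = %d\n", VectorCost - ScalarCost);
  return 0;
}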
+ /// Emit a horizontal reduction of the vectorized value.
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, const TargetTransformInfo *TTI) {
+ assert(VectorizedValue && "Need to have a vectorized tree node");
+ assert(isPowerOf2_32(ReduxWidth) &&
+ "We only handle power-of-two reductions for now");
+
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
ReductionOps.back());
}
};
-
+
} // end anonymous namespace
-
+
static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
return cast<FixedVectorType>(IE->getType())->getNumElements();
@@ -7105,10 +7105,10 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
return AggregateSize;
} else {
return None;
- }
+ }
} while (true);
}
-
+
static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
unsigned OperandOffset) {
unsigned OperandIndex = OperandOffset;
@@ -7120,8 +7120,8 @@ static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
return OperandIndex;
}
return None;
- }
-
+ }
+
auto *IV = cast<InsertValueInst>(InsertInst);
Type *CurrentType = IV->getType();
for (unsigned int Index : IV->indices()) {
@@ -7138,7 +7138,7 @@ static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
}
return OperandIndex;
}
-
+
static bool findBuildAggregate_rec(Instruction *LastInsertInst,
TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
@@ -7169,28 +7169,28 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst,
return false;
}
-/// Recognize construction of vectors like
+/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> poison, float %s0, i32 0
-/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
-/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
-/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
-/// starting from the last insertelement or insertvalue instruction.
-///
+/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
+/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
+/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+/// starting from the last insertelement or insertvalue instruction.
+///
/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
-/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
-/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
-///
-/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
-///
-/// \return true if it matches.
+/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
+/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
+///
+/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
+///
+/// \return true if it matches.
static bool findBuildAggregate(Instruction *LastInsertInst,
TargetTransformInfo *TTI,
- SmallVectorImpl<Value *> &BuildVectorOpds,
- SmallVectorImpl<Value *> &InsertElts) {
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ SmallVectorImpl<Value *> &InsertElts) {
- assert((isa<InsertElementInst>(LastInsertInst) ||
- isa<InsertValueInst>(LastInsertInst)) &&
- "Expected insertelement or insertvalue instruction!");
+ assert((isa<InsertElementInst>(LastInsertInst) ||
+ isa<InsertValueInst>(LastInsertInst)) &&
+ "Expected insertelement or insertvalue instruction!");
assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
"Expected empty result vectors!");
@@ -7210,63 +7210,63 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
}
return false;
-}
-
-static bool PhiTypeSorterFunc(Value *V, Value *V2) {
- return V->getType() < V2->getType();
-}
-
-/// Try and get a reduction value from a phi node.
-///
-/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
-/// if they come from either \p ParentBB or a containing loop latch.
-///
-/// \returns A candidate reduction value if possible, or \code nullptr \endcode
-/// if not possible.
-static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
- BasicBlock *ParentBB, LoopInfo *LI) {
- // There are situations where the reduction value is not dominated by the
- // reduction phi. Vectorizing such cases has been reported to cause
- // miscompiles. See PR25787.
- auto DominatedReduxValue = [&](Value *R) {
- return isa<Instruction>(R) &&
- DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
- };
-
- Value *Rdx = nullptr;
-
- // Return the incoming value if it comes from the same BB as the phi node.
- if (P->getIncomingBlock(0) == ParentBB) {
- Rdx = P->getIncomingValue(0);
- } else if (P->getIncomingBlock(1) == ParentBB) {
- Rdx = P->getIncomingValue(1);
- }
-
- if (Rdx && DominatedReduxValue(Rdx))
- return Rdx;
-
- // Otherwise, check whether we have a loop latch to look at.
- Loop *BBL = LI->getLoopFor(ParentBB);
- if (!BBL)
- return nullptr;
- BasicBlock *BBLatch = BBL->getLoopLatch();
- if (!BBLatch)
- return nullptr;
-
- // There is a loop latch, return the incoming value if it comes from
- // that. This reduction pattern occasionally turns up.
- if (P->getIncomingBlock(0) == BBLatch) {
- Rdx = P->getIncomingValue(0);
- } else if (P->getIncomingBlock(1) == BBLatch) {
- Rdx = P->getIncomingValue(1);
- }
-
- if (Rdx && DominatedReduxValue(Rdx))
- return Rdx;
-
- return nullptr;
-}
-
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+ return V->getType() < V2->getType();
+}
+
+/// Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
+ // There are situations where the reduction value is not dominated by the
+ // reduction phi. Vectorizing such cases has been reported to cause
+ // miscompiles. See PR25787.
+ auto DominatedReduxValue = [&](Value *R) {
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
+ };
+
+ Value *Rdx = nullptr;
+
+ // Return the incoming value if it comes from the same BB as the phi node.
+ if (P->getIncomingBlock(0) == ParentBB) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == ParentBB) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ // Otherwise, check whether we have a loop latch to look at.
+ Loop *BBL = LI->getLoopFor(ParentBB);
+ if (!BBL)
+ return nullptr;
+ BasicBlock *BBLatch = BBL->getLoopLatch();
+ if (!BBLatch)
+ return nullptr;
+
+ // There is a loop latch, return the incoming value if it comes from
+ // that. This reduction pattern occasionally turns up.
+ if (P->getIncomingBlock(0) == BBLatch) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == BBLatch) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ return nullptr;
+}
+
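A sketch of the loop shape this helper is aimed at (illustrative only; the function is hypothetical). The accumulator becomes a phi in the loop header, and its incoming value from the loop latch is the add that getReductionValue returns as the candidate.

// 'sum' turns into a phi node; the incoming value coming from the loop latch
// is the '+' chain, which is what getReductionValue hands back for further
// horizontal-reduction matching.
float sumBlocks(const float *a, int n) {
  float sum = 0.0f;
  for (int i = 0; i + 4 <= n; i += 4)
    sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
  return sum;
}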
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
return true;
@@ -7277,263 +7277,263 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return false;
}
-/// Attempt to reduce a horizontal reduction.
-/// If it is legal to match a horizontal reduction feeding the phi node \a P
-/// with reduction operators \a Root (or one of its operands) in a basic block
-/// \a BB, then check if it can be done. If horizontal reduction is not found
-/// and root instruction is a binary operation, vectorization of the operands is
-/// attempted.
-/// \returns true if a horizontal reduction was matched and reduced or operands
-/// of one of the binary instruction were vectorized.
-/// \returns false if a horizontal reduction was not matched (or not possible)
-/// or no vectorization of any binary operation feeding \a Root instruction was
-/// performed.
-static bool tryToVectorizeHorReductionOrInstOperands(
- PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
- const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
- if (!ShouldVectorizeHor)
- return false;
-
- if (!Root)
- return false;
-
- if (Root->getParent() != BB || isa<PHINode>(Root))
- return false;
- // Start analysis starting from Root instruction. If horizontal reduction is
- // found, try to vectorize it. If it is not a horizontal reduction or
- // vectorization is not possible or not effective, and currently analyzed
- // instruction is a binary operation, try to vectorize the operands, using
- // pre-order DFS traversal order. If the operands were not vectorized, repeat
- // the same procedure considering each operand as a possible root of the
- // horizontal reduction.
- // Interrupt the process if the Root instruction itself was vectorized or all
- // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
- SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
- SmallPtrSet<Value *, 8> VisitedInstrs;
- bool Res = false;
- while (!Stack.empty()) {
- Instruction *Inst;
- unsigned Level;
- std::tie(Inst, Level) = Stack.pop_back_val();
+/// Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding the phi node \a P
+/// with reduction operators \a Root (or one of its operands) in a basic block
+/// \a BB, then check if it can be done. If horizontal reduction is not found
+/// and root instruction is a binary operation, vectorization of the operands is
+/// attempted.
+/// \returns true if a horizontal reduction was matched and reduced, or operands
+/// of one of the binary instructions were vectorized.
+/// \returns false if a horizontal reduction was not matched (or not possible)
+/// or no vectorization of any binary operation feeding \a Root instruction was
+/// performed.
+static bool tryToVectorizeHorReductionOrInstOperands(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
+ if (!ShouldVectorizeHor)
+ return false;
+
+ if (!Root)
+ return false;
+
+ if (Root->getParent() != BB || isa<PHINode>(Root))
+ return false;
+  // Start the analysis from the Root instruction. If horizontal reduction is
+ // found, try to vectorize it. If it is not a horizontal reduction or
+ // vectorization is not possible or not effective, and currently analyzed
+ // instruction is a binary operation, try to vectorize the operands, using
+ // pre-order DFS traversal order. If the operands were not vectorized, repeat
+ // the same procedure considering each operand as a possible root of the
+ // horizontal reduction.
+ // Interrupt the process if the Root instruction itself was vectorized or all
+  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
+ SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
+ SmallPtrSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Instruction *Inst;
+ unsigned Level;
+ std::tie(Inst, Level) = Stack.pop_back_val();
Value *B0, *B1;
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst)) {
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
- }
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, Inst)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
if (P && IsBinop) {
Inst = dyn_cast<Instruction>(B0);
- if (Inst == P)
+ if (Inst == P)
Inst = dyn_cast<Instruction>(B1);
- if (!Inst) {
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
- }
- }
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- if (Vectorize(Inst, R)) {
- Res = true;
- continue;
- }
-
- // Try to vectorize operands.
- // Continue analysis for the instruction from the same basic block only to
- // save compile time.
- if (++Level < RecursionMaxDepth)
- for (auto *Op : Inst->operand_values())
- if (VisitedInstrs.insert(Op).second)
- if (auto *I = dyn_cast<Instruction>(Op))
- if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
- Stack.emplace_back(I, Level);
- }
- return Res;
-}
-
-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
- BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI) {
+ if (!Inst) {
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (Vectorize(Inst, R)) {
+ Res = true;
+ continue;
+ }
+
+ // Try to vectorize operands.
+ // Continue analysis for the instruction from the same basic block only to
+ // save compile time.
+ if (++Level < RecursionMaxDepth)
+ for (auto *Op : Inst->operand_values())
+ if (VisitedInstrs.insert(Op).second)
+ if (auto *I = dyn_cast<Instruction>(Op))
+ if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
+ Stack.emplace_back(I, Level);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
auto *I = dyn_cast_or_null<Instruction>(V);
- if (!I)
- return false;
-
- if (!isa<BinaryOperator>(I))
- P = nullptr;
- // Try to match and vectorize a horizontal reduction.
- auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
- return tryToVectorize(I, R);
- };
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
- ExtraVectorization);
-}
-
-bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
- BasicBlock *BB, BoUpSLP &R) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (!R.canMapToVector(IVI->getType(), DL))
- return false;
-
- SmallVector<Value *, 16> BuildVectorOpds;
- SmallVector<Value *, 16> BuildVectorInsts;
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
+ return tryToVectorize(I, R);
+ };
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+ ExtraVectorization);
+}
+
+bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
+ BasicBlock *BB, BoUpSLP &R) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (!R.canMapToVector(IVI->getType(), DL))
+ return false;
+
+ SmallVector<Value *, 16> BuildVectorOpds;
+ SmallVector<Value *, 16> BuildVectorInsts;
if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
- // Aggregate value is unlikely to be processed in vector register, we need to
- // extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
- BuildVectorInsts);
-}
-
-bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
- BasicBlock *BB, BoUpSLP &R) {
- SmallVector<Value *, 16> BuildVectorInsts;
- SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- (llvm::all_of(BuildVectorOpds,
- [](Value *V) { return isa<ExtractElementInst>(V); }) &&
- isShuffle(BuildVectorOpds)))
- return false;
-
- // Vectorize starting with the build vector operands ignoring the BuildVector
- // instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
- BuildVectorInsts);
-}
-
-bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
- BoUpSLP &R) {
- if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
- return true;
-
- bool OpsChanged = false;
- for (int Idx = 0; Idx < 2; ++Idx) {
- OpsChanged |=
- vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
- }
- return OpsChanged;
-}
-
-bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
- bool OpsChanged = false;
- for (auto *I : reverse(Instructions)) {
- if (R.isDeleted(I))
- continue;
- if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
- OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
- else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
- OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (auto *CI = dyn_cast<CmpInst>(I))
- OpsChanged |= vectorizeCmpInst(CI, BB, R);
- }
- Instructions.clear();
- return OpsChanged;
-}
-
-bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
- bool Changed = false;
- SmallVector<Value *, 4> Incoming;
- SmallPtrSet<Value *, 16> VisitedInstrs;
-
- bool HaveVectorizedPhiNodes = true;
- while (HaveVectorizedPhiNodes) {
- HaveVectorizedPhiNodes = false;
-
- // Collect the incoming values from the PHIs.
- Incoming.clear();
- for (Instruction &I : *BB) {
- PHINode *P = dyn_cast<PHINode>(&I);
- if (!P)
- break;
-
- if (!VisitedInstrs.count(P) && !R.isDeleted(P))
- Incoming.push_back(P);
- }
-
- // Sort by type.
- llvm::stable_sort(Incoming, PhiTypeSorterFunc);
-
- // Try to vectorize elements base on their type.
- for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
- E = Incoming.end();
- IncIt != E;) {
-
- // Look for the next elements with the same type.
- SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
- while (SameTypeIt != E &&
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+  // An aggregate value is unlikely to be processed in a vector register; we need
+  // to extract scalars into scalar registers, so NeedExtraction is set true.
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
+}
+
+bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
+ BasicBlock *BB, BoUpSLP &R) {
+ SmallVector<Value *, 16> BuildVectorInsts;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
+ return false;
+
+ // Vectorize starting with the build vector operands ignoring the BuildVector
+ // instructions for the purpose of scheduling and user extraction.
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
+}
+
+bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
+ BoUpSLP &R) {
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
+ return true;
+
+ bool OpsChanged = false;
+ for (int Idx = 0; Idx < 2; ++Idx) {
+ OpsChanged |=
+ vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
+ }
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+ SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+ bool OpsChanged = false;
+ for (auto *I : reverse(Instructions)) {
+ if (R.isDeleted(I))
+ continue;
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+ else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+ else if (auto *CI = dyn_cast<CmpInst>(I))
+ OpsChanged |= vectorizeCmpInst(CI, BB, R);
+ }
+ Instructions.clear();
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ SmallVector<Value *, 4> Incoming;
+ SmallPtrSet<Value *, 16> VisitedInstrs;
+
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (Instruction &I : *BB) {
+ PHINode *P = dyn_cast<PHINode>(&I);
+ if (!P)
+ break;
+
+ if (!VisitedInstrs.count(P) && !R.isDeleted(P))
+ Incoming.push_back(P);
+ }
+
+ // Sort by type.
+ llvm::stable_sort(Incoming, PhiTypeSorterFunc);
+
+    // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
(*SameTypeIt)->getType() == (*IncIt)->getType()) {
- VisitedInstrs.insert(*SameTypeIt);
- ++SameTypeIt;
- }
-
- // Try to vectorize them.
- unsigned NumElts = (SameTypeIt - IncIt);
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
- << NumElts << ")\n");
- // The order in which the phi nodes appear in the program does not matter.
- // So allow tryToVectorizeList to reorder them if it is beneficial. This
- // is done when there are exactly two elements since tryToVectorizeList
- // asserts that there are only two values when AllowReorder is true.
- bool AllowReorder = NumElts == 2;
- if (NumElts > 1 &&
- tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
- // Success start over because instructions might have been changed.
- HaveVectorizedPhiNodes = true;
- Changed = true;
- break;
- }
-
- // Start over at the next instruction of a different type (or the end).
- IncIt = SameTypeIt;
- }
- }
-
- VisitedInstrs.clear();
-
- SmallVector<Instruction *, 8> PostProcessInstructions;
- SmallDenseSet<Instruction *, 4> KeyNodes;
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+ << NumElts << ")\n");
+ // The order in which the phi nodes appear in the program does not matter.
+ // So allow tryToVectorizeList to reorder them if it is beneficial. This
+ // is done when there are exactly two elements since tryToVectorizeList
+ // asserts that there are only two values when AllowReorder is true.
+ bool AllowReorder = NumElts == 2;
+ if (NumElts > 1 &&
+ tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
+        // Success, start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ break;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ }
+
+ VisitedInstrs.clear();
+
+ SmallVector<Instruction *, 8> PostProcessInstructions;
+ SmallDenseSet<Instruction *, 4> KeyNodes;
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions with scalable type. The num of elements is unknown at
// compile-time for scalable type.
if (isa<ScalableVectorType>(it->getType()))
continue;
- // Skip instructions marked for the deletion.
- if (R.isDeleted(&*it))
- continue;
- // We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(&*it).second) {
+ // Skip instructions marked for the deletion.
+ if (R.isDeleted(&*it))
+ continue;
+ // We may go through BB multiple times so skip the one we have checked.
+ if (!VisitedInstrs.insert(&*it).second) {
if (it->use_empty() && KeyNodes.contains(&*it) &&
- vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- }
- continue;
- }
-
- if (isa<DbgInfoIntrinsic>(it))
- continue;
-
- // Try to vectorize reductions that use PHINodes.
- if (PHINode *P = dyn_cast<PHINode>(it)) {
- // Check that the PHI is a reduction PHI.
+ vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
+ // Try to vectorize reductions that use PHINodes.
+ if (PHINode *P = dyn_cast<PHINode>(it)) {
+ // Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
@@ -7554,169 +7554,169 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (BB == P->getIncomingBlock(I) ||
!DT->isReachableFromEntry(P->getIncomingBlock(I)))
continue;
-
+
Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
P->getIncomingBlock(I), R, TTI);
- }
- continue;
- }
-
- // Ran into an instruction without users, like terminator, or function call
- // with ignored return value, store. Ignore unused instructions (basing on
- // instruction type, except for CallInst and InvokeInst).
- if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
- isa<InvokeInst>(it))) {
- KeyNodes.insert(&*it);
- bool OpsChanged = false;
- if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
- for (auto *V : it->operand_values()) {
- // Try to match and vectorize a horizontal reduction.
- OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
- }
- }
- // Start vectorization of post-process list of instructions from the
- // top-tree instructions to try to vectorize as many instructions as
- // possible.
- OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
- if (OpsChanged) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
- }
-
- if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
- isa<InsertValueInst>(it))
- PostProcessInstructions.push_back(&*it);
- }
-
- return Changed;
-}
-
-bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
- auto Changed = false;
- for (auto &Entry : GEPs) {
- // If the getelementptr list has fewer than two elements, there's nothing
- // to do.
- if (Entry.second.size() < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
- << Entry.second.size() << ".\n");
-
- // Process the GEP list in chunks suitable for the target's supported
- // vector size. If a vector register can't hold 1 element, we are done. We
- // are trying to vectorize the index computations, so the maximum number of
- // elements is based on the size of the index expression, rather than the
- // size of the GEP itself (the target's pointer size).
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
- if (MaxVecRegSize < EltSize)
- continue;
-
- unsigned MaxElts = MaxVecRegSize / EltSize;
- for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
- auto Len = std::min<unsigned>(BE - BI, MaxElts);
+ }
+ continue;
+ }
+
+    // Ran into an instruction without users, such as a terminator, a store, or a
+    // function call with an ignored return value. Ignore unused instructions
+    // (based on instruction type, except for CallInst and InvokeInst).
+ if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
+ isa<InvokeInst>(it))) {
+ KeyNodes.insert(&*it);
+ bool OpsChanged = false;
+ if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+ for (auto *V : it->operand_values()) {
+ // Try to match and vectorize a horizontal reduction.
+ OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ }
+ }
+ // Start vectorization of post-process list of instructions from the
+ // top-tree instructions to try to vectorize as many instructions as
+ // possible.
+ OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+ if (OpsChanged) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
+ if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
+ isa<InsertValueInst>(it))
+ PostProcessInstructions.push_back(&*it);
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+ auto Changed = false;
+ for (auto &Entry : GEPs) {
+ // If the getelementptr list has fewer than two elements, there's nothing
+ // to do.
+ if (Entry.second.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
+
+ // Process the GEP list in chunks suitable for the target's supported
+ // vector size. If a vector register can't hold 1 element, we are done. We
+ // are trying to vectorize the index computations, so the maximum number of
+ // elements is based on the size of the index expression, rather than the
+ // size of the GEP itself (the target's pointer size).
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
+ if (MaxVecRegSize < EltSize)
+ continue;
+
+ unsigned MaxElts = MaxVecRegSize / EltSize;
+ for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
+ auto Len = std::min<unsigned>(BE - BI, MaxElts);
ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
-
- // Initialize a set a candidate getelementptrs. Note that we use a
- // SetVector here to preserve program order. If the index computations
- // are vectorizable and begin with loads, we want to minimize the chance
- // of having to reorder them later.
- SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
-
- // Some of the candidates may have already been vectorized after we
- // initially collected them. If so, they are marked as deleted, so remove
- // them from the set of candidates.
- Candidates.remove_if(
- [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
-
- // Remove from the set of candidates all pairs of getelementptrs with
- // constant differences. Such getelementptrs are likely not good
- // candidates for vectorization in a bottom-up phase since one can be
- // computed from the other. We also ensure all candidate getelementptr
- // indices are unique.
- for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
- auto *GEPI = GEPList[I];
- if (!Candidates.count(GEPI))
- continue;
- auto *SCEVI = SE->getSCEV(GEPList[I]);
- for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
- auto *GEPJ = GEPList[J];
- auto *SCEVJ = SE->getSCEV(GEPList[J]);
- if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
- Candidates.remove(GEPI);
- Candidates.remove(GEPJ);
- } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
- Candidates.remove(GEPJ);
- }
- }
- }
-
- // We break out of the above computation as soon as we know there are
- // fewer than two candidates remaining.
- if (Candidates.size() < 2)
- continue;
-
- // Add the single, non-constant index of each candidate to the bundle. We
- // ensured the indices met these constraints when we originally collected
- // the getelementptrs.
- SmallVector<Value *, 16> Bundle(Candidates.size());
- auto BundleIndex = 0u;
- for (auto *V : Candidates) {
- auto *GEP = cast<GetElementPtrInst>(V);
- auto *GEPIdx = GEP->idx_begin()->get();
- assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
- Bundle[BundleIndex++] = GEPIdx;
- }
-
- // Try and vectorize the indices. We are currently only interested in
- // gather-like cases of the form:
- //
- // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
- //
- // where the loads of "a", the loads of "b", and the subtractions can be
- // performed in parallel. It's likely that detecting this pattern in a
- // bottom-up phase will be simpler and less costly than building a
- // full-blown top-down phase beginning at the consecutive loads.
- Changed |= tryToVectorizeList(Bundle, R);
- }
- }
- return Changed;
-}
-
-bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
- bool Changed = false;
- // Attempt to sort and vectorize each of the store-groups.
- for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
- ++it) {
- if (it->second.size() < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
- << it->second.size() << ".\n");
-
- Changed |= vectorizeStores(it->second, R);
- }
- return Changed;
-}
-
-char SLPVectorizer::ID = 0;
-
-static const char lv_name[] = "SLP Vectorizer";
-
-INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
-
-Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
+
+      // Initialize a set of candidate getelementptrs. Note that we use a
+ // SetVector here to preserve program order. If the index computations
+ // are vectorizable and begin with loads, we want to minimize the chance
+ // of having to reorder them later.
+ SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+ // Some of the candidates may have already been vectorized after we
+ // initially collected them. If so, they are marked as deleted, so remove
+ // them from the set of candidates.
+ Candidates.remove_if(
+ [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
+
+ // Remove from the set of candidates all pairs of getelementptrs with
+ // constant differences. Such getelementptrs are likely not good
+ // candidates for vectorization in a bottom-up phase since one can be
+ // computed from the other. We also ensure all candidate getelementptr
+ // indices are unique.
+ for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+ auto *GEPI = GEPList[I];
+ if (!Candidates.count(GEPI))
+ continue;
+ auto *SCEVI = SE->getSCEV(GEPList[I]);
+ for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
+ auto *GEPJ = GEPList[J];
+ auto *SCEVJ = SE->getSCEV(GEPList[J]);
+ if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+ Candidates.remove(GEPI);
+ Candidates.remove(GEPJ);
+ } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
+ Candidates.remove(GEPJ);
+ }
+ }
+ }
+
+ // We break out of the above computation as soon as we know there are
+ // fewer than two candidates remaining.
+ if (Candidates.size() < 2)
+ continue;
+
+ // Add the single, non-constant index of each candidate to the bundle. We
+ // ensured the indices met these constraints when we originally collected
+ // the getelementptrs.
+ SmallVector<Value *, 16> Bundle(Candidates.size());
+ auto BundleIndex = 0u;
+ for (auto *V : Candidates) {
+ auto *GEP = cast<GetElementPtrInst>(V);
+ auto *GEPIdx = GEP->idx_begin()->get();
+ assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
+ Bundle[BundleIndex++] = GEPIdx;
+ }
+
+ // Try and vectorize the indices. We are currently only interested in
+ // gather-like cases of the form:
+ //
+ // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+ //
+ // where the loads of "a", the loads of "b", and the subtractions can be
+ // performed in parallel. It's likely that detecting this pattern in a
+ // bottom-up phase will be simpler and less costly than building a
+ // full-blown top-down phase beginning at the consecutive loads.
+ Changed |= tryToVectorizeList(Bundle, R);
+ }
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
+ bool Changed = false;
+ // Attempt to sort and vectorize each of the store-groups.
+ for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
+ ++it) {
+ if (it->second.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
+
+ Changed |= vectorizeStores(it->second, R);
+ }
+ return Changed;
+}
+
+char SLPVectorizer::ID = 0;
+
+static const char lv_name[] = "SLP Vectorizer";
+
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
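
The comment above describes the gather-like pattern this bottom-up GEP-index vectorization targets. Below is a minimal standalone C++ sketch of that source pattern; it is not part of the LLVM tree, and the arrays g, a, and b are hypothetical illustration data. The loads of a, the loads of b, and the subtractions form the bundle SLP can vectorize, while the g[...] accesses themselves remain a gather.

// Minimal standalone sketch (not LLVM code) of the gather-like pattern the
// comment above describes: the loads of a[], the loads of b[], and the
// subtractions feeding the g[...] indices are independent, so a bottom-up
// SLP pass can bundle them even though the g[...] loads themselves gather.
#include <cstdio>

// All array names here are hypothetical illustration data, not LLVM APIs.
static int g[64], a[4], b[4];

int gatherSum() {
  // The index math (a[i] - b[i]) is the vectorizable bundle; g[...] gathers.
  return g[a[0] - b[0]] + g[a[1] - b[1]] + g[a[2] - b[2]] + g[a[3] - b[3]];
}

int main() {
  for (int i = 0; i < 64; ++i) g[i] = i;
  for (int i = 0; i < 4; ++i) { a[i] = i + 8; b[i] = i; }
  std::printf("%d\n", gatherSum()); // prints 32 (each index is 8, g[8] == 8)
  return 0;
}
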
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h
index eebb58be8b..8737016760 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -1,161 +1,161 @@
-//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
-
-#include "LoopVectorizationPlanner.h"
-#include "VPlan.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/IRBuilder.h"
-
-namespace llvm {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class TargetLibraryInfo;
-
-/// Helper class to create VPRecipes from IR instructions.
-class VPRecipeBuilder {
- /// The loop that we evaluate.
- Loop *OrigLoop;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
-  /// The profitability analysis.
- LoopVectorizationCostModel &CM;
-
- PredicatedScalarEvolution &PSE;
-
- VPBuilder &Builder;
-
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR. Note that
- /// if-conversion currently takes place during VPlan-construction, so these
- /// caches are only used at that stage.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
- using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
-
- // VPlan-VPlan transformations support: Hold a mapping from ingredients to
- // their recipe. To save on memory, only do so for selected ingredients,
- // marked by having a nullptr entry in this map.
- DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
-
- /// Check if \p I can be widened at the start of \p Range and possibly
- /// decrease the range such that the returned value holds for the entire \p
- /// Range. The function should not be called for memory instructions or calls.
- bool shouldWiden(Instruction *I, VFRange &Range) const;
-
-  /// Check if the load or store instruction \p I should be widened for \p
- /// Range.Start and potentially masked. Such instructions are handled by a
- /// recipe that takes an additional VPInstruction for the mask.
+//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class TargetLibraryInfo;
+
+/// Helper class to create VPRecipes from IR instructions.
+class VPRecipeBuilder {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ PredicatedScalarEvolution &PSE;
+
+ VPBuilder &Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+ // VPlan-VPlan transformations support: Hold a mapping from ingredients to
+ // their recipe. To save on memory, only do so for selected ingredients,
+ // marked by having a nullptr entry in this map.
+ DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
+
+ /// Check if \p I can be widened at the start of \p Range and possibly
+ /// decrease the range such that the returned value holds for the entire \p
+ /// Range. The function should not be called for memory instructions or calls.
+ bool shouldWiden(Instruction *I, VFRange &Range) const;
+
+  /// Check if the load or store instruction \p I should be widened for \p
+ /// Range.Start and potentially masked. Such instructions are handled by a
+ /// recipe that takes an additional VPInstruction for the mask.
VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range,
VPlanPtr &Plan);
-
-  /// Check if an induction recipe should be constructed for \p I. If so build and
- /// return it. If not, return null.
+
+  /// Check if an induction recipe should be constructed for \p I. If so build and
+ /// return it. If not, return null.
VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi,
VPlan &Plan) const;
-
- /// Optimize the special case where the operand of \p I is a constant integer
- /// induction variable.
- VPWidenIntOrFpInductionRecipe *
+
+ /// Optimize the special case where the operand of \p I is a constant integer
+ /// induction variable.
+ VPWidenIntOrFpInductionRecipe *
tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
VPlan &Plan) const;
-
- /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
- /// a sequence of select instructions as the vectorizer currently performs
- /// full if-conversion.
- VPBlendRecipe *tryToBlend(PHINode *Phi, VPlanPtr &Plan);
-
- /// Handle call instructions. If \p CI can be widened for \p Range.Start,
- /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
- /// decision from \p Range.Start to \p Range.End.
- VPWidenCallRecipe *tryToWidenCall(CallInst *CI, VFRange &Range,
- VPlan &Plan) const;
-
- /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
- /// if it can. The function should only be called if the cost-model indicates
- /// that widening should be performed.
- VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const;
-
-public:
- VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM,
- PredicatedScalarEvolution &PSE, VPBuilder &Builder)
- : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
- Builder(Builder) {}
-
-  /// Check if a recipe can be created for \p I within the given VF \p Range.
- /// If a recipe can be created, return it. Otherwise return nullptr.
- VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, VFRange &Range,
- VPlanPtr &Plan);
-
- /// Set the recipe created for given ingredient. This operation is a no-op for
- /// ingredients that were not marked using a nullptr entry in the map.
- void setRecipe(Instruction *I, VPRecipeBase *R) {
- if (!Ingredient2Recipe.count(I))
- return;
- assert(Ingredient2Recipe[I] == nullptr &&
- "Recipe already set for ingredient");
- Ingredient2Recipe[I] = R;
- }
-
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
-
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
-
- /// Mark given ingredient for recording its recipe once one is created for
- /// it.
- void recordRecipeOf(Instruction *I) {
- assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) &&
- "Recipe already set for ingredient");
- Ingredient2Recipe[I] = nullptr;
- }
-
- /// Return the recipe created for given ingredient.
- VPRecipeBase *getRecipe(Instruction *I) {
- assert(Ingredient2Recipe.count(I) &&
-           "Recording this ingredient's recipe was not requested");
- assert(Ingredient2Recipe[I] != nullptr &&
- "Ingredient doesn't have a recipe");
- return Ingredient2Recipe[I];
- }
-
- /// Create a replicating region for instruction \p I that requires
- /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
- VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
- VPlanPtr &Plan);
-
-  /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
- /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
- /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
- /// Region. Update the packing decision of predicated instructions if they
- /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
- /// \p Range.Start to \p Range.End.
- VPBasicBlock *handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
- VPlanPtr &Plan);
-};
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(PHINode *Phi, VPlanPtr &Plan);
+
+ /// Handle call instructions. If \p CI can be widened for \p Range.Start,
+ /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
+ /// decision from \p Range.Start to \p Range.End.
+ VPWidenCallRecipe *tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const;
+
+ /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
+ /// if it can. The function should only be called if the cost-model indicates
+ /// that widening should be performed.
+ VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const;
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM,
+ PredicatedScalarEvolution &PSE, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
+ Builder(Builder) {}
+
+  /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, return it. Otherwise return nullptr.
+ VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan);
+
+ /// Set the recipe created for given ingredient. This operation is a no-op for
+ /// ingredients that were not marked using a nullptr entry in the map.
+ void setRecipe(Instruction *I, VPRecipeBase *R) {
+ if (!Ingredient2Recipe.count(I))
+ return;
+ assert(Ingredient2Recipe[I] == nullptr &&
+ "Recipe already set for ingredient");
+ Ingredient2Recipe[I] = R;
+ }
+
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Mark given ingredient for recording its recipe once one is created for
+ /// it.
+ void recordRecipeOf(Instruction *I) {
+ assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) &&
+ "Recipe already set for ingredient");
+ Ingredient2Recipe[I] = nullptr;
+ }
+
+ /// Return the recipe created for given ingredient.
+ VPRecipeBase *getRecipe(Instruction *I) {
+ assert(Ingredient2Recipe.count(I) &&
+           "Recording this ingredient's recipe was not requested");
+ assert(Ingredient2Recipe[I] != nullptr &&
+ "Ingredient doesn't have a recipe");
+ return Ingredient2Recipe[I];
+ }
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+  /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
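
The Ingredient2Recipe comments above describe a map in which a nullptr entry marks an ingredient whose recipe should be recorded once created, and setRecipe is a no-op for anything not marked. The following is a simplified standalone mock of that sentinel idiom, assuming hypothetical names (RecipeTracker, Recipe); it is not the LLVM API, only an illustration of the recordRecipeOf / setRecipe / getRecipe contract sketched in the header.

// Simplified standalone mock (hypothetical names, not the LLVM API) of the
// nullptr-sentinel idiom described above: only ingredients explicitly marked
// via recordRecipeOf() ever get their recipe remembered by setRecipe().
#include <cassert>
#include <cstdio>
#include <unordered_map>

struct Recipe { int Id; };

class RecipeTracker {
  std::unordered_map<int, Recipe *> Ingredient2Recipe;

public:
  // Mark an ingredient: a nullptr entry means "record its recipe once created".
  void recordRecipeOf(int Ingredient) { Ingredient2Recipe[Ingredient] = nullptr; }

  // No-op for unmarked ingredients, mirroring the header's setRecipe().
  void setRecipe(int Ingredient, Recipe *R) {
    auto It = Ingredient2Recipe.find(Ingredient);
    if (It == Ingredient2Recipe.end())
      return;
    assert(It->second == nullptr && "Recipe already set for ingredient");
    It->second = R;
  }

  Recipe *getRecipe(int Ingredient) {
    auto It = Ingredient2Recipe.find(Ingredient);
    assert(It != Ingredient2Recipe.end() && It->second && "No recipe recorded");
    return It->second;
  }
};

int main() {
  RecipeTracker T;
  Recipe R{42};
  T.recordRecipeOf(7);  // 7 is marked; 13 is not.
  T.setRecipe(7, &R);
  T.setRecipe(13, &R);  // Silently ignored: 13 was never marked.
  std::printf("recipe for 7: %d\n", T.getRecipe(7)->Id);
  return 0;
}
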
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp
index eaacde6f66..b26399e0ae 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp
@@ -1,63 +1,63 @@
-//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This is the LLVM vectorization plan. It represents a candidate for
-/// vectorization, allowing one to plan and optimize how to vectorize a given loop
-/// before generating LLVM-IR.
-/// The vectorizer uses vectorization plans to estimate the costs of potential
-/// candidates and, if profitable, to execute the desired plan, generating vector
-/// LLVM-IR code.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlan.h"
-#include "VPlanDominatorTree.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
+//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is the LLVM vectorization plan. It represents a candidate for
+/// vectorization, allowing one to plan and optimize how to vectorize a given loop
+/// before generating LLVM-IR.
+/// The vectorizer uses vectorization plans to estimate the costs of potential
+/// candidates and, if profitable, to execute the desired plan, generating vector
+/// LLVM-IR code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GenericDomTreeConstruction.h"
-#include "llvm/Support/GraphWriter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <iterator>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-extern cl::opt<bool> EnableVPlanNativePath;
-
-#define DEBUG_TYPE "vplan"
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
- const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
- VPSlotTracker SlotTracker(
- (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
- V.print(OS, SlotTracker);
- return OS;
-}
-
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTreeConstruction.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+extern cl::opt<bool> EnableVPlanNativePath;
+
+#define DEBUG_TYPE "vplan"
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
+ const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
+ VPSlotTracker SlotTracker(
+ (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
+ V.print(OS, SlotTracker);
+ return OS;
+}
+
VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
: SubclassID(SC), UnderlyingVal(UV), Def(Def) {
if (Def)
@@ -70,13 +70,13 @@ VPValue::~VPValue() {
Def->removeDefinedValue(this);
}
-void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
+void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
R->print(OS, "", SlotTracker);
- else
- printAsOperand(OS, SlotTracker);
-}
-
+ else
+ printAsOperand(OS, SlotTracker);
+}
+
void VPValue::dump() const {
const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
VPSlotTracker SlotTracker(
@@ -121,91 +121,91 @@ VPUser *VPRecipeBase::toVPUser() {
return nullptr;
}
-// Get the top-most entry block of \p Start. This is the entry block of the
-// containing VPlan. This function is templated to support both const and non-const blocks
-template <typename T> static T *getPlanEntry(T *Start) {
- T *Next = Start;
- T *Current = Start;
- while ((Next = Next->getParent()))
- Current = Next;
-
- SmallSetVector<T *, 8> WorkList;
- WorkList.insert(Current);
-
- for (unsigned i = 0; i < WorkList.size(); i++) {
- T *Current = WorkList[i];
- if (Current->getNumPredecessors() == 0)
- return Current;
- auto &Predecessors = Current->getPredecessors();
- WorkList.insert(Predecessors.begin(), Predecessors.end());
- }
-
- llvm_unreachable("VPlan without any entry node without predecessors");
-}
-
-VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
-
-const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
-
-/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
-const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
- const VPBlockBase *Block = this;
- while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getEntry();
- return cast<VPBasicBlock>(Block);
-}
-
-VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
- VPBlockBase *Block = this;
- while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getEntry();
- return cast<VPBasicBlock>(Block);
-}
-
-void VPBlockBase::setPlan(VPlan *ParentPlan) {
- assert(ParentPlan->getEntry() == this &&
- "Can only set plan on its entry block.");
- Plan = ParentPlan;
-}
-
-/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
-const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
- const VPBlockBase *Block = this;
- while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getExit();
- return cast<VPBasicBlock>(Block);
-}
-
-VPBasicBlock *VPBlockBase::getExitBasicBlock() {
- VPBlockBase *Block = this;
- while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getExit();
- return cast<VPBasicBlock>(Block);
-}
-
-VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
- if (!Successors.empty() || !Parent)
- return this;
- assert(Parent->getExit() == this &&
- "Block w/o successors not the exit of its parent.");
- return Parent->getEnclosingBlockWithSuccessors();
-}
-
-VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
- if (!Predecessors.empty() || !Parent)
- return this;
- assert(Parent->getEntry() == this &&
- "Block w/o predecessors not the entry of its parent.");
- return Parent->getEnclosingBlockWithPredecessors();
-}
-
-void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
+// Get the top-most entry block of \p Start. This is the entry block of the
+// containing VPlan. This function is templated to support both const and non-const blocks
+template <typename T> static T *getPlanEntry(T *Start) {
+ T *Next = Start;
+ T *Current = Start;
+ while ((Next = Next->getParent()))
+ Current = Next;
+
+ SmallSetVector<T *, 8> WorkList;
+ WorkList.insert(Current);
+
+ for (unsigned i = 0; i < WorkList.size(); i++) {
+ T *Current = WorkList[i];
+ if (Current->getNumPredecessors() == 0)
+ return Current;
+ auto &Predecessors = Current->getPredecessors();
+ WorkList.insert(Predecessors.begin(), Predecessors.end());
+ }
+
+ llvm_unreachable("VPlan without any entry node without predecessors");
+}
+
+VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
+
+const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
+
+/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+void VPBlockBase::setPlan(VPlan *ParentPlan) {
+ assert(ParentPlan->getEntry() == this &&
+ "Can only set plan on its entry block.");
+ Plan = ParentPlan;
+}
+
+/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getExitBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
+ if (!Successors.empty() || !Parent)
+ return this;
+ assert(Parent->getExit() == this &&
+ "Block w/o successors not the exit of its parent.");
+ return Parent->getEnclosingBlockWithSuccessors();
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
+ if (!Predecessors.empty() || !Parent)
+ return this;
+ assert(Parent->getEntry() == this &&
+ "Block w/o predecessors not the entry of its parent.");
+ return Parent->getEnclosingBlockWithPredecessors();
+}
+
+void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry));
-
- for (VPBlockBase *Block : Blocks)
- delete Block;
-}
-
+
+ for (VPBlockBase *Block : Blocks)
+ delete Block;
+}
+
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
while (It != end() && (isa<VPWidenPHIRecipe>(&*It) ||
@@ -237,123 +237,123 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
}
-BasicBlock *
-VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
- // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
-  // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
- BasicBlock *PrevBB = CFG.PrevBB;
- BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
- PrevBB->getParent(), CFG.LastBB);
- LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
-
- // Hook up the new basic block to its predecessors.
- for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
- VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
- auto &PredVPSuccessors = PredVPBB->getSuccessors();
- BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
-
- // In outer loop vectorization scenario, the predecessor BBlock may not yet
-    // be visited (backedge). Mark the VPBasicBlock for fixup at the end of
- // vectorization. We do not encounter this case in inner loop vectorization
- // as we start out by building a loop skeleton with the vector loop header
- // and latch blocks. As a result, we never enter this function for the
- // header block in the non VPlan-native path.
- if (!PredBB) {
- assert(EnableVPlanNativePath &&
- "Unexpected null predecessor in non VPlan-native path");
- CFG.VPBBsToFix.push_back(PredVPBB);
- continue;
- }
-
- assert(PredBB && "Predecessor basic-block not found building successor.");
- auto *PredBBTerminator = PredBB->getTerminator();
- LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
- if (isa<UnreachableInst>(PredBBTerminator)) {
- assert(PredVPSuccessors.size() == 1 &&
- "Predecessor ending w/o branch must have single successor.");
- PredBBTerminator->eraseFromParent();
- BranchInst::Create(NewBB, PredBB);
- } else {
- assert(PredVPSuccessors.size() == 2 &&
- "Predecessor ending with branch must have two successors.");
- unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
- assert(!PredBBTerminator->getSuccessor(idx) &&
- "Trying to reset an existing successor block.");
- PredBBTerminator->setSuccessor(idx, NewBB);
- }
- }
- return NewBB;
-}
-
-void VPBasicBlock::execute(VPTransformState *State) {
- bool Replica = State->Instance &&
- !(State->Instance->Part == 0 && State->Instance->Lane == 0);
- VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
- VPBlockBase *SingleHPred = nullptr;
- BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
-
- // 1. Create an IR basic block, or reuse the last one if possible.
- // The last IR basic block is reused, as an optimization, in three cases:
- // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
- // B. when the current VPBB has a single (hierarchical) predecessor which
- // is PrevVPBB and the latter has a single (hierarchical) successor; and
- // C. when the current VPBB is an entry of a region replica - where PrevVPBB
- // is the exit of this region from a previous instance, or the predecessor
- // of this region.
- if (PrevVPBB && /* A */
- !((SingleHPred = getSingleHierarchicalPredecessor()) &&
- SingleHPred->getExitBasicBlock() == PrevVPBB &&
- PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
- !(Replica && getPredecessors().empty())) { /* C */
- NewBB = createEmptyBasicBlock(State->CFG);
- State->Builder.SetInsertPoint(NewBB);
- // Temporarily terminate with unreachable until CFG is rewired.
- UnreachableInst *Terminator = State->Builder.CreateUnreachable();
- State->Builder.SetInsertPoint(Terminator);
-    // Register NewBB in its loop. In innermost loops it's the same for all BBs.
- Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
- L->addBasicBlockToLoop(NewBB, *State->LI);
- State->CFG.PrevBB = NewBB;
- }
-
- // 2. Fill the IR basic block with IR instructions.
- LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
- << " in BB:" << NewBB->getName() << '\n');
-
- State->CFG.VPBB2IRBB[this] = NewBB;
- State->CFG.PrevVPBB = this;
-
- for (VPRecipeBase &Recipe : Recipes)
- Recipe.execute(*State);
-
- VPValue *CBV;
- if (EnableVPlanNativePath && (CBV = getCondBit())) {
- Value *IRCBV = CBV->getUnderlyingValue();
- assert(IRCBV && "Unexpected null underlying value for condition bit");
-
- // Condition bit value in a VPBasicBlock is used as the branch selector. In
- // the VPlan-native path case, since all branches are uniform we generate a
- // branch instruction using the condition value from vector lane 0 and dummy
- // successors. The successors are fixed later when the successor blocks are
- // visited.
- Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
- NewCond = State->Builder.CreateExtractElement(NewCond,
- State->Builder.getInt32(0));
-
- // Replace the temporary unreachable terminator with the new conditional
- // branch.
- auto *CurrentTerminator = NewBB->getTerminator();
- assert(isa<UnreachableInst>(CurrentTerminator) &&
- "Expected to replace unreachable terminator with conditional "
- "branch.");
- auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
- CondBr->setSuccessor(0, nullptr);
- ReplaceInstWithInst(CurrentTerminator, CondBr);
- }
-
- LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
-}
-
+BasicBlock *
+VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
+ // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
+  // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
+ BasicBlock *PrevBB = CFG.PrevBB;
+ BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
+ PrevBB->getParent(), CFG.LastBB);
+ LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+
+ // Hook up the new basic block to its predecessors.
+ for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
+ VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
+ auto &PredVPSuccessors = PredVPBB->getSuccessors();
+ BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+
+ // In outer loop vectorization scenario, the predecessor BBlock may not yet
+    // be visited (backedge). Mark the VPBasicBlock for fixup at the end of
+ // vectorization. We do not encounter this case in inner loop vectorization
+ // as we start out by building a loop skeleton with the vector loop header
+ // and latch blocks. As a result, we never enter this function for the
+ // header block in the non VPlan-native path.
+ if (!PredBB) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected null predecessor in non VPlan-native path");
+ CFG.VPBBsToFix.push_back(PredVPBB);
+ continue;
+ }
+
+ assert(PredBB && "Predecessor basic-block not found building successor.");
+ auto *PredBBTerminator = PredBB->getTerminator();
+ LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ if (isa<UnreachableInst>(PredBBTerminator)) {
+ assert(PredVPSuccessors.size() == 1 &&
+ "Predecessor ending w/o branch must have single successor.");
+ PredBBTerminator->eraseFromParent();
+ BranchInst::Create(NewBB, PredBB);
+ } else {
+ assert(PredVPSuccessors.size() == 2 &&
+ "Predecessor ending with branch must have two successors.");
+ unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ assert(!PredBBTerminator->getSuccessor(idx) &&
+ "Trying to reset an existing successor block.");
+ PredBBTerminator->setSuccessor(idx, NewBB);
+ }
+ }
+ return NewBB;
+}
+
+void VPBasicBlock::execute(VPTransformState *State) {
+ bool Replica = State->Instance &&
+ !(State->Instance->Part == 0 && State->Instance->Lane == 0);
+ VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
+ VPBlockBase *SingleHPred = nullptr;
+ BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
+
+ // 1. Create an IR basic block, or reuse the last one if possible.
+ // The last IR basic block is reused, as an optimization, in three cases:
+ // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
+ // B. when the current VPBB has a single (hierarchical) predecessor which
+ // is PrevVPBB and the latter has a single (hierarchical) successor; and
+ // C. when the current VPBB is an entry of a region replica - where PrevVPBB
+ // is the exit of this region from a previous instance, or the predecessor
+ // of this region.
+ if (PrevVPBB && /* A */
+ !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+ SingleHPred->getExitBasicBlock() == PrevVPBB &&
+ PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
+ !(Replica && getPredecessors().empty())) { /* C */
+ NewBB = createEmptyBasicBlock(State->CFG);
+ State->Builder.SetInsertPoint(NewBB);
+ // Temporarily terminate with unreachable until CFG is rewired.
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+    // Register NewBB in its loop. In innermost loops it's the same for all BBs.
+ Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
+ L->addBasicBlockToLoop(NewBB, *State->LI);
+ State->CFG.PrevBB = NewBB;
+ }
+
+ // 2. Fill the IR basic block with IR instructions.
+ LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << NewBB->getName() << '\n');
+
+ State->CFG.VPBB2IRBB[this] = NewBB;
+ State->CFG.PrevVPBB = this;
+
+ for (VPRecipeBase &Recipe : Recipes)
+ Recipe.execute(*State);
+
+ VPValue *CBV;
+ if (EnableVPlanNativePath && (CBV = getCondBit())) {
+ Value *IRCBV = CBV->getUnderlyingValue();
+ assert(IRCBV && "Unexpected null underlying value for condition bit");
+
+ // Condition bit value in a VPBasicBlock is used as the branch selector. In
+ // the VPlan-native path case, since all branches are uniform we generate a
+ // branch instruction using the condition value from vector lane 0 and dummy
+ // successors. The successors are fixed later when the successor blocks are
+ // visited.
+ Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
+ NewCond = State->Builder.CreateExtractElement(NewCond,
+ State->Builder.getInt32(0));
+
+ // Replace the temporary unreachable terminator with the new conditional
+ // branch.
+ auto *CurrentTerminator = NewBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional "
+ "branch.");
+ auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+}
+
void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
for (VPRecipeBase &R : Recipes) {
for (auto *Def : R.definedValues())
@@ -372,87 +372,87 @@ void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
Block->dropAllReferences(NewValue);
}
-void VPRegionBlock::execute(VPTransformState *State) {
- ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
-
- if (!isReplicator()) {
- // Visit the VPBlocks connected to "this", starting from it.
- for (VPBlockBase *Block : RPOT) {
- if (EnableVPlanNativePath) {
- // The inner loop vectorization path does not represent loop preheader
- // and exit blocks as part of the VPlan. In the VPlan-native path, skip
- // vectorizing loop preheader block. In future, we may replace this
- // check with the check for loop preheader.
- if (Block->getNumPredecessors() == 0)
- continue;
-
- // Skip vectorizing loop exit block. In future, we may replace this
- // check with the check for loop exit.
- if (Block->getNumSuccessors() == 0)
- continue;
- }
-
- LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
- Block->execute(State);
- }
- return;
- }
-
- assert(!State->Instance && "Replicating a Region with non-null instance.");
-
- // Enter replicating mode.
- State->Instance = {0, 0};
-
- for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
- State->Instance->Part = Part;
+void VPRegionBlock::execute(VPTransformState *State) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
+
+ if (!isReplicator()) {
+ // Visit the VPBlocks connected to "this", starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ if (EnableVPlanNativePath) {
+ // The inner loop vectorization path does not represent loop preheader
+ // and exit blocks as part of the VPlan. In the VPlan-native path, skip
+ // vectorizing loop preheader block. In future, we may replace this
+ // check with the check for loop preheader.
+ if (Block->getNumPredecessors() == 0)
+ continue;
+
+ // Skip vectorizing loop exit block. In future, we may replace this
+ // check with the check for loop exit.
+ if (Block->getNumSuccessors() == 0)
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ return;
+ }
+
+ assert(!State->Instance && "Replicating a Region with non-null instance.");
+
+ // Enter replicating mode.
+ State->Instance = {0, 0};
+
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
+ State->Instance->Part = Part;
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
++Lane) {
- State->Instance->Lane = Lane;
- // Visit the VPBlocks connected to \p this, starting from it.
- for (VPBlockBase *Block : RPOT) {
- LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
- Block->execute(State);
- }
- }
- }
-
- // Exit replicating mode.
- State->Instance.reset();
-}
-
-void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
- assert(!Parent && "Recipe already in some VPBasicBlock");
- assert(InsertPos->getParent() &&
- "Insertion position not in any VPBasicBlock");
- Parent = InsertPos->getParent();
- Parent->getRecipeList().insert(InsertPos->getIterator(), this);
-}
-
-void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
- assert(!Parent && "Recipe already in some VPBasicBlock");
- assert(InsertPos->getParent() &&
- "Insertion position not in any VPBasicBlock");
- Parent = InsertPos->getParent();
- Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
-}
-
-void VPRecipeBase::removeFromParent() {
- assert(getParent() && "Recipe not in any VPBasicBlock");
- getParent()->getRecipeList().remove(getIterator());
- Parent = nullptr;
-}
-
-iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
- assert(getParent() && "Recipe not in any VPBasicBlock");
- return getParent()->getRecipeList().erase(getIterator());
-}
-
-void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
- removeFromParent();
- insertAfter(InsertPos);
-}
-
+ State->Instance->Lane = Lane;
+ // Visit the VPBlocks connected to \p this, starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ }
+ }
+
+ // Exit replicating mode.
+ State->Instance.reset();
+}
+
+void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
+ assert(!Parent && "Recipe already in some VPBasicBlock");
+ assert(InsertPos->getParent() &&
+ "Insertion position not in any VPBasicBlock");
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+}
+
+void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
+ assert(!Parent && "Recipe already in some VPBasicBlock");
+ assert(InsertPos->getParent() &&
+ "Insertion position not in any VPBasicBlock");
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
+}
+
+void VPRecipeBase::removeFromParent() {
+ assert(getParent() && "Recipe not in any VPBasicBlock");
+ getParent()->getRecipeList().remove(getIterator());
+ Parent = nullptr;
+}
+
+iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
+ assert(getParent() && "Recipe not in any VPBasicBlock");
+ return getParent()->getRecipeList().erase(getIterator());
+}
+
+void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
+ removeFromParent();
+ insertAfter(InsertPos);
+}
+
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
iplist<VPRecipeBase>::iterator I) {
assert(I == BB.end() || I->getParent() == &BB);
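
VPRegionBlock::execute above re-executes a replicating region once per (unroll part, vector lane) instance, while a non-replicating region is traversed a single time with widened recipes. The sketch below is a standalone paraphrase of that control structure under hypothetical names (executeRegion, Instance); it is not LLVM code and ignores scalable vectorization factors.

// Standalone sketch (hypothetical types, not the LLVM API) of the replication
// scheme in VPRegionBlock::execute: a replicating region is re-executed once
// per (unroll part, vector lane) instance, while a non-replicating region is
// executed a single time over all of its blocks.
#include <cstdio>
#include <functional>

struct Instance { unsigned Part, Lane; };

void executeRegion(bool IsReplicator, unsigned UF, unsigned VF,
                   const std::function<void(const Instance *)> &RunBlocks) {
  if (!IsReplicator) {
    RunBlocks(nullptr); // One widened execution, no per-lane instance.
    return;
  }
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      Instance I{Part, Lane};
      RunBlocks(&I); // Scalar re-execution of the region for this instance.
    }
}

int main() {
  executeRegion(true, /*UF=*/2, /*VF=*/4, [](const Instance *I) {
    std::printf("replica part=%u lane=%u\n", I->Part, I->Lane);
  });
  return 0;
}
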
@@ -461,395 +461,395 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
BB.getRecipeList().insert(I, this);
}
-void VPInstruction::generateInstruction(VPTransformState &State,
- unsigned Part) {
- IRBuilder<> &Builder = State.Builder;
-
- if (Instruction::isBinaryOp(getOpcode())) {
- Value *A = State.get(getOperand(0), Part);
- Value *B = State.get(getOperand(1), Part);
- Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
- State.set(this, V, Part);
- return;
- }
-
- switch (getOpcode()) {
- case VPInstruction::Not: {
- Value *A = State.get(getOperand(0), Part);
- Value *V = Builder.CreateNot(A);
- State.set(this, V, Part);
- break;
- }
- case VPInstruction::ICmpULE: {
- Value *IV = State.get(getOperand(0), Part);
- Value *TC = State.get(getOperand(1), Part);
- Value *V = Builder.CreateICmpULE(IV, TC);
- State.set(this, V, Part);
- break;
- }
- case Instruction::Select: {
- Value *Cond = State.get(getOperand(0), Part);
- Value *Op1 = State.get(getOperand(1), Part);
- Value *Op2 = State.get(getOperand(2), Part);
- Value *V = Builder.CreateSelect(Cond, Op1, Op2);
- State.set(this, V, Part);
- break;
- }
- case VPInstruction::ActiveLaneMask: {
- // Get first lane of vector induction variable.
- Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
+void VPInstruction::generateInstruction(VPTransformState &State,
+ unsigned Part) {
+ IRBuilder<> &Builder = State.Builder;
+
+ if (Instruction::isBinaryOp(getOpcode())) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
+ State.set(this, V, Part);
+ return;
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not: {
+ Value *A = State.get(getOperand(0), Part);
+ Value *V = Builder.CreateNot(A);
+ State.set(this, V, Part);
+ break;
+ }
+ case VPInstruction::ICmpULE: {
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateICmpULE(IV, TC);
+ State.set(this, V, Part);
+ break;
+ }
+ case Instruction::Select: {
+ Value *Cond = State.get(getOperand(0), Part);
+ Value *Op1 = State.get(getOperand(1), Part);
+ Value *Op2 = State.get(getOperand(2), Part);
+ Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+ State.set(this, V, Part);
+ break;
+ }
+ case VPInstruction::ActiveLaneMask: {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
// Get the original loop tripcount.
Value *ScalarTC = State.TripCount;
-
- auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+
+ auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
- Instruction *Call = Builder.CreateIntrinsic(
+ Instruction *Call = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
- State.set(this, Call, Part);
- break;
- }
- default:
- llvm_unreachable("Unsupported opcode for instruction");
- }
-}
-
-void VPInstruction::execute(VPTransformState &State) {
- assert(!State.Instance && "VPInstruction executing an Instance");
- for (unsigned Part = 0; Part < State.UF; ++Part)
- generateInstruction(State, Part);
-}
-
+ State.set(this, Call, Part);
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported opcode for instruction");
+ }
+}
+
+void VPInstruction::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPInstruction executing an Instance");
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ generateInstruction(State, Part);
+}
+
void VPInstruction::dump() const {
VPSlotTracker SlotTracker(getParent()->getPlan());
print(dbgs(), "", SlotTracker);
}
-void VPInstruction::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+void VPInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "EMIT ";
-
- if (hasResult()) {
- printAsOperand(O, SlotTracker);
- O << " = ";
- }
-
- switch (getOpcode()) {
- case VPInstruction::Not:
- O << "not";
- break;
- case VPInstruction::ICmpULE:
- O << "icmp ule";
- break;
- case VPInstruction::SLPLoad:
- O << "combined load";
- break;
- case VPInstruction::SLPStore:
- O << "combined store";
- break;
- case VPInstruction::ActiveLaneMask:
- O << "active lane mask";
- break;
-
- default:
- O << Instruction::getOpcodeName(getOpcode());
- }
-
- for (const VPValue *Operand : operands()) {
- O << " ";
- Operand->printAsOperand(O, SlotTracker);
- }
-}
-
-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
-void VPlan::execute(VPTransformState *State) {
- // -1. Check if the backedge taken count is needed, and if so build it.
- if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
- Value *TC = State->TripCount;
- IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
- auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
- "trip.count.minus.1");
- auto VF = State->VF;
- Value *VTCMO =
+
+ if (hasResult()) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not:
+ O << "not";
+ break;
+ case VPInstruction::ICmpULE:
+ O << "icmp ule";
+ break;
+ case VPInstruction::SLPLoad:
+ O << "combined load";
+ break;
+ case VPInstruction::SLPStore:
+ O << "combined store";
+ break;
+ case VPInstruction::ActiveLaneMask:
+ O << "active lane mask";
+ break;
+
+ default:
+ O << Instruction::getOpcodeName(getOpcode());
+ }
+
+ for (const VPValue *Operand : operands()) {
+ O << " ";
+ Operand->printAsOperand(O, SlotTracker);
+ }
+}
+
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
+ // -1. Check if the backedge taken count is needed, and if so build it.
+ if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
+ Value *TC = State->TripCount;
+ IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
+ auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+ "trip.count.minus.1");
+ auto VF = State->VF;
+ Value *VTCMO =
VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
- for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
- State->set(BackedgeTakenCount, VTCMO, Part);
- }
-
- // 0. Set the reverse mapping from VPValues to Values for code generation.
- for (auto &Entry : Value2VPValue)
- State->VPValue2Value[Entry.second] = Entry.first;
-
- BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB;
- BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor();
- assert(VectorHeaderBB && "Loop preheader does not have a single successor.");
-
- // 1. Make room to generate basic-blocks inside loop body if needed.
- BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock(
- VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch");
- Loop *L = State->LI->getLoopFor(VectorHeaderBB);
- L->addBasicBlockToLoop(VectorLatchBB, *State->LI);
- // Remove the edge between Header and Latch to allow other connections.
- // Temporarily terminate with unreachable until CFG is rewired.
- // Note: this asserts the generated code's assumption that
- // getFirstInsertionPt() can be dereferenced into an Instruction.
- VectorHeaderBB->getTerminator()->eraseFromParent();
- State->Builder.SetInsertPoint(VectorHeaderBB);
- UnreachableInst *Terminator = State->Builder.CreateUnreachable();
- State->Builder.SetInsertPoint(Terminator);
-
- // 2. Generate code in loop body.
- State->CFG.PrevVPBB = nullptr;
- State->CFG.PrevBB = VectorHeaderBB;
- State->CFG.LastBB = VectorLatchBB;
-
- for (VPBlockBase *Block : depth_first(Entry))
- Block->execute(State);
-
-  // Set up branch terminator successors for VPBBs in VPBBsToFix based on
- // VPBB's successors.
- for (auto VPBB : State->CFG.VPBBsToFix) {
- assert(EnableVPlanNativePath &&
- "Unexpected VPBBsToFix in non VPlan-native path");
- BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
- assert(BB && "Unexpected null basic block for VPBB");
-
- unsigned Idx = 0;
- auto *BBTerminator = BB->getTerminator();
-
- for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
- VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
- BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
- ++Idx;
- }
- }
-
- // 3. Merge the temporary latch created with the last basic-block filled.
- BasicBlock *LastBB = State->CFG.PrevBB;
- // Connect LastBB to VectorLatchBB to facilitate their merge.
- assert((EnableVPlanNativePath ||
- isa<UnreachableInst>(LastBB->getTerminator())) &&
- "Expected InnerLoop VPlan CFG to terminate with unreachable");
- assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
- "Expected VPlan CFG to terminate with branch in NativePath");
- LastBB->getTerminator()->eraseFromParent();
- BranchInst::Create(VectorLatchBB, LastBB);
-
- // Merge LastBB with Latch.
- bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
- (void)Merged;
- assert(Merged && "Could not merge last basic block with latch.");
- VectorLatchBB = LastBB;
-
- // We do not attempt to preserve DT for outer loop vectorization currently.
- if (!EnableVPlanNativePath)
- updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
- L->getExitBlock());
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD
-void VPlan::dump() const { dbgs() << *this << '\n'; }
-#endif
-
-void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
- BasicBlock *LoopLatchBB,
- BasicBlock *LoopExitBB) {
- BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor();
- assert(LoopHeaderBB && "Loop preheader does not have a single successor.");
- // The vector body may be more than a single basic-block by this point.
- // Update the dominator tree information inside the vector body by propagating
- // it from header to latch, expecting only triangular control-flow, if any.
- BasicBlock *PostDomSucc = nullptr;
- for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
- // Get the list of successors of this block.
- std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
- assert(Succs.size() <= 2 &&
- "Basic block in vector loop has more than 2 successors.");
- PostDomSucc = Succs[0];
- if (Succs.size() == 1) {
- assert(PostDomSucc->getSinglePredecessor() &&
- "PostDom successor has more than one predecessor.");
- DT->addNewBlock(PostDomSucc, BB);
- continue;
- }
- BasicBlock *InterimSucc = Succs[1];
- if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
- PostDomSucc = Succs[1];
- InterimSucc = Succs[0];
- }
- assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
- "One successor of a basic block does not lead to the other.");
- assert(InterimSucc->getSinglePredecessor() &&
- "Interim successor has more than one predecessor.");
- assert(PostDomSucc->hasNPredecessors(2) &&
- "PostDom successor has more than two predecessors.");
- DT->addNewBlock(InterimSucc, BB);
- DT->addNewBlock(PostDomSucc, BB);
- }
- // Latch block is a new dominator for the loop exit.
- DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-}
-
-const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
- return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
- Twine(getOrCreateBID(Block));
-}
-
-const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
- const std::string &Name = Block->getName();
- if (!Name.empty())
- return Name;
- return "VPB" + Twine(getOrCreateBID(Block));
-}
-
-void VPlanPrinter::dump() {
- Depth = 1;
- bumpIndent(0);
- OS << "digraph VPlan {\n";
- OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
- if (!Plan.getName().empty())
- OS << "\\n" << DOT::EscapeString(Plan.getName());
- if (Plan.BackedgeTakenCount) {
- OS << ", where:\\n";
- Plan.BackedgeTakenCount->print(OS, SlotTracker);
- OS << " := BackedgeTakenCount";
- }
- OS << "\"]\n";
- OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
- OS << "edge [fontname=Courier, fontsize=30]\n";
- OS << "compound=true\n";
-
- for (const VPBlockBase *Block : depth_first(Plan.getEntry()))
- dumpBlock(Block);
-
- OS << "}\n";
-}
-
-void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
- if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
- dumpBasicBlock(BasicBlock);
- else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- dumpRegion(Region);
- else
- llvm_unreachable("Unsupported kind of VPBlock.");
-}
-
-void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
- bool Hidden, const Twine &Label) {
- // Due to "dot" we print an edge between two regions as an edge between the
-  // exit basic block and the entry basic block of the respective regions.
- const VPBlockBase *Tail = From->getExitBasicBlock();
- const VPBlockBase *Head = To->getEntryBasicBlock();
- OS << Indent << getUID(Tail) << " -> " << getUID(Head);
- OS << " [ label=\"" << Label << '\"';
- if (Tail != From)
- OS << " ltail=" << getUID(From);
- if (Head != To)
- OS << " lhead=" << getUID(To);
- if (Hidden)
- OS << "; splines=none";
- OS << "]\n";
-}
-
-void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
- auto &Successors = Block->getSuccessors();
- if (Successors.size() == 1)
- drawEdge(Block, Successors.front(), false, "");
- else if (Successors.size() == 2) {
- drawEdge(Block, Successors.front(), false, "T");
- drawEdge(Block, Successors.back(), false, "F");
- } else {
- unsigned SuccessorNumber = 0;
- for (auto *Successor : Successors)
- drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
- }
-}
-
-void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
- OS << Indent << getUID(BasicBlock) << " [label =\n";
- bumpIndent(1);
- OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\"";
- bumpIndent(1);
-
- // Dump the block predicate.
- const VPValue *Pred = BasicBlock->getPredicate();
- if (Pred) {
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
+ State->set(BackedgeTakenCount, VTCMO, Part);
+ }
+
+ // 0. Set the reverse mapping from VPValues to Values for code generation.
+ for (auto &Entry : Value2VPValue)
+ State->VPValue2Value[Entry.second] = Entry.first;
+
+ BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB;
+ BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor();
+ assert(VectorHeaderBB && "Loop preheader does not have a single successor.");
+
+ // 1. Make room to generate basic-blocks inside loop body if needed.
+ BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock(
+ VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch");
+ Loop *L = State->LI->getLoopFor(VectorHeaderBB);
+ L->addBasicBlockToLoop(VectorLatchBB, *State->LI);
+ // Remove the edge between Header and Latch to allow other connections.
+ // Temporarily terminate with unreachable until CFG is rewired.
+ // Note: this asserts the generated code's assumption that
+ // getFirstInsertionPt() can be dereferenced into an Instruction.
+ VectorHeaderBB->getTerminator()->eraseFromParent();
+ State->Builder.SetInsertPoint(VectorHeaderBB);
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+
+ // 2. Generate code in loop body.
+ State->CFG.PrevVPBB = nullptr;
+ State->CFG.PrevBB = VectorHeaderBB;
+ State->CFG.LastBB = VectorLatchBB;
+
+ for (VPBlockBase *Block : depth_first(Entry))
+ Block->execute(State);
+
+ // Set up branch terminator successors for VPBBs in VPBBsToFix based on
+ // VPBB's successors.
+ for (auto VPBB : State->CFG.VPBBsToFix) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected VPBBsToFix in non VPlan-native path");
+ BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
+ assert(BB && "Unexpected null basic block for VPBB");
+
+ unsigned Idx = 0;
+ auto *BBTerminator = BB->getTerminator();
+
+ for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
+ VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
+ BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
+ ++Idx;
+ }
+ }
+
+ // 3. Merge the temporary latch created with the last basic-block filled.
+ BasicBlock *LastBB = State->CFG.PrevBB;
+ // Connect LastBB to VectorLatchBB to facilitate their merge.
+ assert((EnableVPlanNativePath ||
+ isa<UnreachableInst>(LastBB->getTerminator())) &&
+ "Expected InnerLoop VPlan CFG to terminate with unreachable");
+ assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
+ "Expected VPlan CFG to terminate with branch in NativePath");
+ LastBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(VectorLatchBB, LastBB);
+
+ // Merge LastBB with Latch.
+ bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
+ (void)Merged;
+ assert(Merged && "Could not merge last basic block with latch.");
+ VectorLatchBB = LastBB;
+
+ // We do not attempt to preserve DT for outer loop vectorization currently.
+ if (!EnableVPlanNativePath)
+ updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
+ L->getExitBlock());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void VPlan::dump() const { dbgs() << *this << '\n'; }
+#endif
+
+void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopLatchBB,
+ BasicBlock *LoopExitBB) {
+ BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor();
+ assert(LoopHeaderBB && "Loop preheader does not have a single successor.");
+ // The vector body may be more than a single basic-block by this point.
+ // Update the dominator tree information inside the vector body by propagating
+ // it from header to latch, expecting only triangular control-flow, if any.
+ BasicBlock *PostDomSucc = nullptr;
+ for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
+ // Get the list of successors of this block.
+ std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+ assert(Succs.size() <= 2 &&
+ "Basic block in vector loop has more than 2 successors.");
+ PostDomSucc = Succs[0];
+ if (Succs.size() == 1) {
+ assert(PostDomSucc->getSinglePredecessor() &&
+ "PostDom successor has more than one predecessor.");
+ DT->addNewBlock(PostDomSucc, BB);
+ continue;
+ }
+ BasicBlock *InterimSucc = Succs[1];
+ if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
+ PostDomSucc = Succs[1];
+ InterimSucc = Succs[0];
+ }
+ assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
+ "One successor of a basic block does not lead to the other.");
+ assert(InterimSucc->getSinglePredecessor() &&
+ "Interim successor has more than one predecessor.");
+ assert(PostDomSucc->hasNPredecessors(2) &&
+ "PostDom successor has more than two predecessors.");
+ DT->addNewBlock(InterimSucc, BB);
+ DT->addNewBlock(PostDomSucc, BB);
+ }
+ // Latch block is a new dominator for the loop exit.
+ DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+}
+
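
The dominator update above only has to handle the shapes the vector body can contain at this point: straight-line blocks and simple triangles (a block branching to an interim block and a post-dominating block, with the interim block falling through to the latter). A minimal standalone sketch of that same idom-propagation walk, using toy block types instead of the LLVM API (all names here are hypothetical):

#include <cassert>
#include <map>
#include <utility>
#include <vector>

struct Block {
  const char *Name;
  std::vector<Block *> Succs;
};

int main() {
  // Header -> {Interim, PostDom}, Interim -> PostDom, PostDom -> Latch.
  Block Latch{"latch", {}};
  Block PostDom{"postdom", {&Latch}};
  Block Interim{"interim", {&PostDom}};
  Block Header{"header", {&Interim, &PostDom}};

  std::map<Block *, Block *> IDom; // block -> immediate dominator
  for (Block *BB = &Header; BB != &Latch;) {
    assert(BB->Succs.size() <= 2 && "only straight lines or triangles expected");
    Block *Post = BB->Succs[0];
    if (BB->Succs.size() == 1) { // straight line: the single successor is idom'd by BB
      IDom[Post] = BB;
      BB = Post;
      continue;
    }
    Block *Mid = BB->Succs[1];
    if (Post->Succs.size() == 1 && Post->Succs[0] == Mid)
      std::swap(Post, Mid); // normalize the triangle so that Mid flows into Post
    IDom[Mid] = BB;         // both triangle successors are idom'd by the branching block
    IDom[Post] = BB;
    BB = Post;
  }
  assert(IDom[&Interim] == &Header && IDom[&PostDom] == &Header);
  assert(IDom[&Latch] == &PostDom);
  return 0;
}
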
+const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
+ return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
+ Twine(getOrCreateBID(Block));
+}
+
+const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
+ const std::string &Name = Block->getName();
+ if (!Name.empty())
+ return Name;
+ return "VPB" + Twine(getOrCreateBID(Block));
+}
+
+void VPlanPrinter::dump() {
+ Depth = 1;
+ bumpIndent(0);
+ OS << "digraph VPlan {\n";
+ OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
+ if (!Plan.getName().empty())
+ OS << "\\n" << DOT::EscapeString(Plan.getName());
+ if (Plan.BackedgeTakenCount) {
+ OS << ", where:\\n";
+ Plan.BackedgeTakenCount->print(OS, SlotTracker);
+ OS << " := BackedgeTakenCount";
+ }
+ OS << "\"]\n";
+ OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
+ OS << "edge [fontname=Courier, fontsize=30]\n";
+ OS << "compound=true\n";
+
+ for (const VPBlockBase *Block : depth_first(Plan.getEntry()))
+ dumpBlock(Block);
+
+ OS << "}\n";
+}
+
+void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
+ if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
+ dumpBasicBlock(BasicBlock);
+ else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ dumpRegion(Region);
+ else
+ llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
+void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
+ bool Hidden, const Twine &Label) {
+ // Due to "dot" we print an edge between two regions as an edge between the
+ // exit basic block and the entry basic block of the respective regions.
+ const VPBlockBase *Tail = From->getExitBasicBlock();
+ const VPBlockBase *Head = To->getEntryBasicBlock();
+ OS << Indent << getUID(Tail) << " -> " << getUID(Head);
+ OS << " [ label=\"" << Label << '\"';
+ if (Tail != From)
+ OS << " ltail=" << getUID(From);
+ if (Head != To)
+ OS << " lhead=" << getUID(To);
+ if (Hidden)
+ OS << "; splines=none";
+ OS << "]\n";
+}
+
+void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
+ auto &Successors = Block->getSuccessors();
+ if (Successors.size() == 1)
+ drawEdge(Block, Successors.front(), false, "");
+ else if (Successors.size() == 2) {
+ drawEdge(Block, Successors.front(), false, "T");
+ drawEdge(Block, Successors.back(), false, "F");
+ } else {
+ unsigned SuccessorNumber = 0;
+ for (auto *Successor : Successors)
+ drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
+ }
+}
+
+void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
+ OS << Indent << getUID(BasicBlock) << " [label =\n";
+ bumpIndent(1);
+ OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\"";
+ bumpIndent(1);
+
+ // Dump the block predicate.
+ const VPValue *Pred = BasicBlock->getPredicate();
+ if (Pred) {
OS << " +\n" << Indent << " \"BlockPredicate: \"";
- if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
- PredI->printAsOperand(OS, SlotTracker);
- OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
- << ")\\l\"";
- } else
- Pred->printAsOperand(OS, SlotTracker);
- }
-
- for (const VPRecipeBase &Recipe : *BasicBlock) {
+ if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
+ PredI->printAsOperand(OS, SlotTracker);
+ OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
+ << ")\\l\"";
+ } else
+ Pred->printAsOperand(OS, SlotTracker);
+ }
+
+ for (const VPRecipeBase &Recipe : *BasicBlock) {
OS << " +\n" << Indent << "\"";
- Recipe.print(OS, Indent, SlotTracker);
- OS << "\\l\"";
- }
-
- // Dump the condition bit.
- const VPValue *CBV = BasicBlock->getCondBit();
- if (CBV) {
- OS << " +\n" << Indent << " \"CondBit: ";
- if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
- CBI->printAsOperand(OS, SlotTracker);
- OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
- } else {
- CBV->printAsOperand(OS, SlotTracker);
- OS << "\"";
- }
- }
-
- bumpIndent(-2);
- OS << "\n" << Indent << "]\n";
- dumpEdges(BasicBlock);
-}
-
-void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
- OS << Indent << "subgraph " << getUID(Region) << " {\n";
- bumpIndent(1);
- OS << Indent << "fontname=Courier\n"
- << Indent << "label=\""
- << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
- << DOT::EscapeString(Region->getName()) << "\"\n";
- // Dump the blocks of the region.
- assert(Region->getEntry() && "Region contains no inner blocks.");
- for (const VPBlockBase *Block : depth_first(Region->getEntry()))
- dumpBlock(Block);
- bumpIndent(-1);
- OS << Indent << "}\n";
- dumpEdges(Region);
-}
-
+ Recipe.print(OS, Indent, SlotTracker);
+ OS << "\\l\"";
+ }
+
+ // Dump the condition bit.
+ const VPValue *CBV = BasicBlock->getCondBit();
+ if (CBV) {
+ OS << " +\n" << Indent << " \"CondBit: ";
+ if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
+ CBI->printAsOperand(OS, SlotTracker);
+ OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
+ } else {
+ CBV->printAsOperand(OS, SlotTracker);
+ OS << "\"";
+ }
+ }
+
+ bumpIndent(-2);
+ OS << "\n" << Indent << "]\n";
+ dumpEdges(BasicBlock);
+}
+
+void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
+ OS << Indent << "subgraph " << getUID(Region) << " {\n";
+ bumpIndent(1);
+ OS << Indent << "fontname=Courier\n"
+ << Indent << "label=\""
+ << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
+ << DOT::EscapeString(Region->getName()) << "\"\n";
+ // Dump the blocks of the region.
+ assert(Region->getEntry() && "Region contains no inner blocks.");
+ for (const VPBlockBase *Block : depth_first(Region->getEntry()))
+ dumpBlock(Block);
+ bumpIndent(-1);
+ OS << Indent << "}\n";
+ dumpEdges(Region);
+}
+
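
For reference, the printer methods above emit plain "dot" text: one rectangular node per basic block, region clusters, and "T"/"F" labels on two-way edges. A standalone sketch of the general output shape for a block with two successors (block names and labels here are made up, not taken from the code above, and the layout is simplified):

#include <iostream>

int main() {
  std::cout << "digraph VPlan {\n";
  std::cout << "node [shape=rect, fontname=Courier, fontsize=30]\n";
  std::cout << "edge [fontname=Courier, fontsize=30]\n";
  std::cout << "compound=true\n";
  std::cout << " N0 [label =\n  \"for.body:\\n\"\n ]\n";
  std::cout << " N1 [label =\n  \"if.then:\\n\"\n ]\n";
  std::cout << " N2 [label =\n  \"if.else:\\n\"\n ]\n";
  // A block with exactly two successors gets "T"/"F" edge labels.
  std::cout << " N0 -> N1 [ label=\"T\"]\n";
  std::cout << " N0 -> N2 [ label=\"F\"]\n";
  std::cout << "}\n";
  return 0;
}
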
void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) {
- std::string IngredientString;
- raw_string_ostream RSO(IngredientString);
- if (auto *Inst = dyn_cast<Instruction>(V)) {
- if (!Inst->getType()->isVoidTy()) {
- Inst->printAsOperand(RSO, false);
- RSO << " = ";
- }
- RSO << Inst->getOpcodeName() << " ";
- unsigned E = Inst->getNumOperands();
- if (E > 0) {
- Inst->getOperand(0)->printAsOperand(RSO, false);
- for (unsigned I = 1; I < E; ++I)
- Inst->getOperand(I)->printAsOperand(RSO << ", ", false);
- }
- } else // !Inst
- V->printAsOperand(RSO, false);
- RSO.flush();
- O << DOT::EscapeString(IngredientString);
-}
-
-void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ std::string IngredientString;
+ raw_string_ostream RSO(IngredientString);
+ if (auto *Inst = dyn_cast<Instruction>(V)) {
+ if (!Inst->getType()->isVoidTy()) {
+ Inst->printAsOperand(RSO, false);
+ RSO << " = ";
+ }
+ RSO << Inst->getOpcodeName() << " ";
+ unsigned E = Inst->getNumOperands();
+ if (E > 0) {
+ Inst->getOperand(0)->printAsOperand(RSO, false);
+ for (unsigned I = 1; I < E; ++I)
+ Inst->getOperand(I)->printAsOperand(RSO << ", ", false);
+ }
+ } else // !Inst
+ V->printAsOperand(RSO, false);
+ RSO.flush();
+ O << DOT::EscapeString(IngredientString);
+}
+
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-CALL ";
auto *CI = cast<CallInst>(getUnderlyingInstr());
@@ -863,10 +863,10 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
O << "call @" << CI->getCalledFunction()->getName() << "(";
printOperands(O, SlotTracker);
O << ")";
-}
-
-void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-SELECT ";
printAsOperand(O, SlotTracker);
O << " = select ";
@@ -876,66 +876,66 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
O << (InvariantCond ? " (condition is loop invariant)" : "");
-}
-
-void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-INDUCTION";
- if (Trunc) {
- O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc);
- } else
- O << " " << VPlanIngredient(IV);
-}
-
-void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ if (Trunc) {
+ O << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc);
+ } else
+ O << " " << VPlanIngredient(IV);
+}
+
+void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-GEP ";
- O << (IsPtrLoopInvariant ? "Inv" : "Var");
- size_t IndicesNumber = IsIndexLoopInvariant.size();
- for (size_t I = 0; I < IndicesNumber; ++I)
- O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
+ O << (IsPtrLoopInvariant ? "Inv" : "Var");
+ size_t IndicesNumber = IsIndexLoopInvariant.size();
+ for (size_t I = 0; I < IndicesNumber; ++I)
+ O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
O << " ";
printAsOperand(O, SlotTracker);
O << " = getelementptr ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-PHI " << VPlanIngredient(Phi);
-}
-
-void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "BLEND ";
- Phi->printAsOperand(O, false);
- O << " =";
- if (getNumIncomingValues() == 1) {
- // Not a User of any mask: not really blending, this is a
- // single-predecessor phi.
- O << " ";
- getIncomingValue(0)->printAsOperand(O, SlotTracker);
- } else {
- for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
- O << " ";
- getIncomingValue(I)->printAsOperand(O, SlotTracker);
- O << "/";
- getMask(I)->printAsOperand(O, SlotTracker);
- }
- }
-}
-
+ Phi->printAsOperand(O, false);
+ O << " =";
+ if (getNumIncomingValues() == 1) {
+ // Not a User of any mask: not really blending, this is a
+ // single-predecessor phi.
+ O << " ";
+ getIncomingValue(0)->printAsOperand(O, SlotTracker);
+ } else {
+ for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
+ O << " ";
+ getIncomingValue(I)->printAsOperand(O, SlotTracker);
+ O << "/";
+ getMask(I)->printAsOperand(O, SlotTracker);
+ }
+ }
+}
+
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << "REDUCE ";
@@ -952,8 +952,8 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << ")";
}
-void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << (IsUniform ? "CLONE " : "REPLICATE ");
if (!getUnderlyingInstr()->getType()->isVoidTy()) {
@@ -963,182 +963,182 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
printOperands(O, SlotTracker);
- if (AlsoPack)
- O << " (S->V)";
-}
-
-void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ if (AlsoPack)
+ O << " (S->V)";
+}
+
+void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "PHI-PREDICATED-INSTRUCTION ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN ";
if (!isStore()) {
getVPValue()->printAsOperand(O, SlotTracker);
O << " = ";
- }
+ }
O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
- Value *CanonicalIV = State.CanonicalIV;
- Type *STy = CanonicalIV->getType();
- IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+}
+
+void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
+ Value *CanonicalIV = State.CanonicalIV;
+ Type *STy = CanonicalIV->getType();
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
ElementCount VF = State.VF;
  assert(!VF.isScalable() && "the code following assumes non-scalable ECs");
Value *VStart = VF.isScalar()
- ? CanonicalIV
+ ? CanonicalIV
: Builder.CreateVectorSplat(VF.getKnownMinValue(),
CanonicalIV, "broadcast");
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
- SmallVector<Constant *, 8> Indices;
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+ SmallVector<Constant *, 8> Indices;
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
Indices.push_back(
ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));
- // If VF == 1, there is only one iteration in the loop above, thus the
- // element pushed back into Indices is ConstantInt::get(STy, Part)
+ // If VF == 1, there is only one iteration in the loop above, thus the
+ // element pushed back into Indices is ConstantInt::get(STy, Part)
Constant *VStep =
VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
- // Add the consecutive indices to the vector value.
- Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
- State.set(getVPValue(), CanonicalVectorIV, Part);
- }
-}
-
-void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ // Add the consecutive indices to the vector value.
+ Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
+ State.set(getVPValue(), CanonicalVectorIV, Part);
+ }
+}
+
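
The loop above builds, for each unroll part, the constant step vector <Part*VF+0, ..., Part*VF+VF-1> and adds it to the broadcast canonical IV. A standalone sketch of the resulting lane values for an assumed VF = 4 and UF = 2, using plain integers in place of IR values:

#include <cstdio>
#include <vector>

int main() {
  const unsigned VF = 4, UF = 2;
  const long CanonicalIV = 8; // example scalar IV value at the start of this vector iteration
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::vector<long> VecIV;
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      VecIV.push_back(CanonicalIV + Part * VF + Lane); // VStart + VStep
    std::printf("vec.iv part %u:", Part);
    for (long V : VecIV)
      std::printf(" %ld", V);
    std::printf("\n"); // part 0: 8 9 10 11, part 1: 12 13 14 15
  }
  return 0;
}
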
+void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "EMIT ";
- getVPValue()->printAsOperand(O, SlotTracker);
- O << " = WIDEN-CANONICAL-INDUCTION";
-}
-
-template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
-
-void VPValue::replaceAllUsesWith(VPValue *New) {
+ getVPValue()->printAsOperand(O, SlotTracker);
+ O << " = WIDEN-CANONICAL-INDUCTION";
+}
+
+template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
+
+void VPValue::replaceAllUsesWith(VPValue *New) {
for (unsigned J = 0; J < getNumUsers();) {
VPUser *User = Users[J];
unsigned NumUsers = getNumUsers();
- for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
- if (User->getOperand(I) == this)
- User->setOperand(I, New);
+ for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+ if (User->getOperand(I) == this)
+ User->setOperand(I, New);
// If a user got removed after updating the current user, the next user to
// update will be moved to the current position, so we only need to
// increment the index if the number of users did not change.
if (NumUsers == getNumUsers())
J++;
}
-}
-
-void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
- if (const Value *UV = getUnderlyingValue()) {
- OS << "ir<";
- UV->printAsOperand(OS, false);
- OS << ">";
- return;
- }
-
- unsigned Slot = Tracker.getSlot(this);
- if (Slot == unsigned(-1))
- OS << "<badref>";
- else
- OS << "vp<%" << Tracker.getSlot(this) << ">";
-}
-
+}
+
+void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
+ if (const Value *UV = getUnderlyingValue()) {
+ OS << "ir<";
+ UV->printAsOperand(OS, false);
+ OS << ">";
+ return;
+ }
+
+ unsigned Slot = Tracker.getSlot(this);
+ if (Slot == unsigned(-1))
+ OS << "<badref>";
+ else
+ OS << "vp<%" << Tracker.getSlot(this) << ">";
+}
+
void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
});
}
-void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
- Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI) {
- ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
- for (VPBlockBase *Base : RPOT) {
- visitBlock(Base, Old2New, IAI);
- }
-}
-
-void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI) {
- if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
- for (VPRecipeBase &VPI : *VPBB) {
- assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
- auto *VPInst = cast<VPInstruction>(&VPI);
- auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
- auto *IG = IAI.getInterleaveGroup(Inst);
- if (!IG)
- continue;
-
- auto NewIGIter = Old2New.find(IG);
- if (NewIGIter == Old2New.end())
- Old2New[IG] = new InterleaveGroup<VPInstruction>(
- IG->getFactor(), IG->isReverse(), IG->getAlign());
-
- if (Inst == IG->getInsertPos())
- Old2New[IG]->setInsertPos(VPInst);
-
- InterleaveGroupMap[VPInst] = Old2New[IG];
- InterleaveGroupMap[VPInst]->insertMember(
- VPInst, IG->getIndex(Inst),
- Align(IG->isReverse() ? (-1) * int(IG->getFactor())
- : IG->getFactor()));
- }
- } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- visitRegion(Region, Old2New, IAI);
- else
- llvm_unreachable("Unsupported kind of VPBlock.");
-}
-
-VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
- InterleavedAccessInfo &IAI) {
- Old2NewTy Old2New;
- visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
-}
-
-void VPSlotTracker::assignSlot(const VPValue *V) {
- assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
- Slots[V] = NextSlot++;
-}
-
-void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) {
- if (auto *Region = dyn_cast<VPRegionBlock>(VPBB))
- assignSlots(Region);
- else
- assignSlots(cast<VPBasicBlock>(VPBB));
-}
-
-void VPSlotTracker::assignSlots(const VPRegionBlock *Region) {
- ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry());
- for (const VPBlockBase *Block : RPOT)
- assignSlots(Block);
-}
-
-void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
- for (const VPRecipeBase &Recipe : *VPBB) {
+void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
+ Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
+ for (VPBlockBase *Base : RPOT) {
+ visitBlock(Base, Old2New, IAI);
+ }
+}
+
+void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI) {
+ if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
+ for (VPRecipeBase &VPI : *VPBB) {
+ assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
+ auto *VPInst = cast<VPInstruction>(&VPI);
+ auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ auto *IG = IAI.getInterleaveGroup(Inst);
+ if (!IG)
+ continue;
+
+ auto NewIGIter = Old2New.find(IG);
+ if (NewIGIter == Old2New.end())
+ Old2New[IG] = new InterleaveGroup<VPInstruction>(
+ IG->getFactor(), IG->isReverse(), IG->getAlign());
+
+ if (Inst == IG->getInsertPos())
+ Old2New[IG]->setInsertPos(VPInst);
+
+ InterleaveGroupMap[VPInst] = Old2New[IG];
+ InterleaveGroupMap[VPInst]->insertMember(
+ VPInst, IG->getIndex(Inst),
+ Align(IG->isReverse() ? (-1) * int(IG->getFactor())
+ : IG->getFactor()));
+ }
+ } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ visitRegion(Region, Old2New, IAI);
+ else
+ llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
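
visitBlock() above clones each IR interleave group lazily, the first time one of its members is seen, and re-inserts every member under its VPInstruction counterpart. A standalone sketch of that memoized clone-and-remap pattern, using toy group and member types rather than the LLVM InterleaveGroup API (all names hypothetical):

#include <cassert>
#include <initializer_list>
#include <map>
#include <vector>

struct OldMember { int Group; };            // stands in for an IR Instruction
struct NewMember { const OldMember *Old; }; // stands in for a VPInstruction
struct CloneGroup { std::vector<const NewMember *> Members; };

int main() {
  OldMember O0{0}, O1{0}, O2{1}; // two members in old group 0, one in old group 1
  NewMember N0{&O0}, N1{&O1}, N2{&O2};

  std::map<int, CloneGroup *> Old2New;              // old group id -> cloned group
  std::map<const NewMember *, CloneGroup *> GroupOf;

  for (const NewMember *N : {&N0, &N1, &N2}) {
    int OldGroup = N->Old->Group;
    if (!Old2New.count(OldGroup))        // clone the group on first use only
      Old2New[OldGroup] = new CloneGroup();
    GroupOf[N] = Old2New[OldGroup];
    Old2New[OldGroup]->Members.push_back(N); // re-insert the member under its new identity
  }

  assert(GroupOf[&N0] == GroupOf[&N1] && GroupOf[&N0] != GroupOf[&N2]);
  for (auto &KV : Old2New)
    delete KV.second;
  return 0;
}
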
+VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
+ InterleavedAccessInfo &IAI) {
+ Old2NewTy Old2New;
+ visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
+}
+
+void VPSlotTracker::assignSlot(const VPValue *V) {
+ assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
+ Slots[V] = NextSlot++;
+}
+
+void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) {
+ if (auto *Region = dyn_cast<VPRegionBlock>(VPBB))
+ assignSlots(Region);
+ else
+ assignSlots(cast<VPBasicBlock>(VPBB));
+}
+
+void VPSlotTracker::assignSlots(const VPRegionBlock *Region) {
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
+
+void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
+ for (const VPRecipeBase &Recipe : *VPBB) {
for (VPValue *Def : Recipe.definedValues())
assignSlot(Def);
- }
-}
-
-void VPSlotTracker::assignSlots(const VPlan &Plan) {
-
- for (const VPValue *V : Plan.VPExternalDefs)
- assignSlot(V);
-
- for (const VPValue *V : Plan.VPCBVs)
- assignSlot(V);
-
- if (Plan.BackedgeTakenCount)
- assignSlot(Plan.BackedgeTakenCount);
-
- ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry());
- for (const VPBlockBase *Block : RPOT)
- assignSlots(Block);
-}
+ }
+}
+
+void VPSlotTracker::assignSlots(const VPlan &Plan) {
+
+ for (const VPValue *V : Plan.VPExternalDefs)
+ assignSlot(V);
+
+ for (const VPValue *V : Plan.VPCBVs)
+ assignSlot(V);
+
+ if (Plan.BackedgeTakenCount)
+ assignSlot(Plan.BackedgeTakenCount);
+
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
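
VPSlotTracker only matters for printing: VPValues with no underlying IR value receive a running slot number and print as vp<%N>, while values backed by IR print as ir<...>, matching printAsOperand() above. A small standalone sketch of that numbering scheme, with hypothetical toy types in place of VPValue and the tracker:

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ToyVPValue {
  std::string UnderlyingIRName; // empty if the value is VPlan-internal
};

int main() {
  ToyVPValue Ext{"%n"}, Def0{""}, Def1{""};
  std::vector<ToyVPValue *> PlanOrder{&Ext, &Def0, &Def1};

  std::map<ToyVPValue *, unsigned> Slots;
  unsigned NextSlot = 0;
  for (ToyVPValue *V : PlanOrder) // assignSlots: a single pass in plan order
    Slots[V] = NextSlot++;

  for (ToyVPValue *V : PlanOrder) {
    if (!V->UnderlyingIRName.empty())
      std::cout << "ir<" << V->UnderlyingIRName << ">\n"; // backed by IR
    else
      std::cout << "vp<%" << Slots[V] << ">\n";           // VPlan-internal
  }
  return 0;
}
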
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h
index 1fd54e42f2..2cce127cd4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h
@@ -1,73 +1,73 @@
-//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file contains the declarations of the Vectorization Plan base classes:
-/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
-/// VPBlockBase, together implementing a Hierarchical CFG;
-/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
-/// treated as proper graphs for generic algorithms;
-/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
-/// within VPBasicBlocks;
-/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
-/// instruction;
-/// 5. The VPlan class holding a candidate for vectorization;
-/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
-/// These are documented in docs/VectorizationPlan.rst.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
-
-#include "VPlanLoopInfo.h"
-#include "VPlanValue.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/ilist.h"
-#include "llvm/ADT/ilist_node.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/IRBuilder.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <map>
-#include <string>
-
-namespace llvm {
-
-class BasicBlock;
-class DominatorTree;
-class InnerLoopVectorizer;
-class LoopInfo;
-class raw_ostream;
+//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the declarations of the Vectorization Plan base classes:
+/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
+/// VPBlockBase, together implementing a Hierarchical CFG;
+/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
+/// treated as proper graphs for generic algorithms;
+/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
+/// within VPBasicBlocks;
+/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
+/// instruction;
+/// 5. The VPlan class holding a candidate for vectorization;
+/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
+/// These are documented in docs/VectorizationPlan.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+
+#include "VPlanLoopInfo.h"
+#include "VPlanValue.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <map>
+#include <string>
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class InnerLoopVectorizer;
+class LoopInfo;
+class raw_ostream;
class RecurrenceDescriptor;
-class Value;
-class VPBasicBlock;
-class VPRegionBlock;
-class VPlan;
-class VPlanSlp;
-
-/// A range of powers-of-2 vectorization factors with fixed start and
-/// adjustable end. The range includes start and excludes end, e.g.,:
-/// [1, 9) = {1, 2, 4, 8}
-struct VFRange {
- // A power of 2.
+class Value;
+class VPBasicBlock;
+class VPRegionBlock;
+class VPlan;
+class VPlanSlp;
+
+/// A range of powers-of-2 vectorization factors with fixed start and
+/// adjustable end. The range includes start and excludes end, e.g.,:
+/// [1, 9) = {1, 2, 4, 8}
+struct VFRange {
+ // A power of 2.
const ElementCount Start;
-
- // Need not be a power of 2. If End <= Start, the range is empty.
+
+ // Need not be a power of 2. If End <= Start, the range is empty.
ElementCount End;
bool isEmpty() const {
@@ -81,221 +81,221 @@ struct VFRange {
assert(isPowerOf2_32(Start.getKnownMinValue()) &&
"Expected Start to be a power of 2");
}
-};
-
-using VPlanPtr = std::unique_ptr<VPlan>;
-
-/// In what follows, the term "input IR" refers to code that is fed into the
-/// vectorizer whereas the term "output IR" refers to code that is generated by
-/// the vectorizer.
-
-/// VPIteration represents a single point in the iteration space of the output
-/// (vectorized and/or unrolled) IR loop.
-struct VPIteration {
- /// in [0..UF)
- unsigned Part;
-
- /// in [0..VF)
- unsigned Lane;
-};
-
-/// This is a helper struct for maintaining vectorization state. It's used for
-/// mapping values from the original loop to their corresponding values in
-/// the new loop. Two mappings are maintained: one for vectorized values and
-/// one for scalarized values. Vectorized values are represented with UF
-/// vector values in the new loop, and scalarized values are represented with
-/// UF x VF scalar values in the new loop. UF and VF are the unroll and
-/// vectorization factors, respectively.
-///
-/// Entries can be added to either map with setVectorValue and setScalarValue,
-/// which assert that an entry was not already added before. If an entry is to
-/// replace an existing one, call resetVectorValue and resetScalarValue. This is
-/// currently needed to modify the mapped values during "fix-up" operations that
-/// occur once the first phase of widening is complete. These operations include
-/// type truncation and the second phase of recurrence widening.
-///
-/// Entries from either map can be retrieved using the getVectorValue and
-/// getScalarValue functions, which assert that the desired value exists.
-struct VectorizerValueMap {
- friend struct VPTransformState;
-
-private:
- /// The unroll factor. Each entry in the vector map contains UF vector values.
- unsigned UF;
-
- /// The vectorization factor. Each entry in the scalar map contains UF x VF
- /// scalar values.
+};
+
+using VPlanPtr = std::unique_ptr<VPlan>;
+
+/// In what follows, the term "input IR" refers to code that is fed into the
+/// vectorizer whereas the term "output IR" refers to code that is generated by
+/// the vectorizer.
+
+/// VPIteration represents a single point in the iteration space of the output
+/// (vectorized and/or unrolled) IR loop.
+struct VPIteration {
+ /// in [0..UF)
+ unsigned Part;
+
+ /// in [0..VF)
+ unsigned Lane;
+};
+
+/// This is a helper struct for maintaining vectorization state. It's used for
+/// mapping values from the original loop to their corresponding values in
+/// the new loop. Two mappings are maintained: one for vectorized values and
+/// one for scalarized values. Vectorized values are represented with UF
+/// vector values in the new loop, and scalarized values are represented with
+/// UF x VF scalar values in the new loop. UF and VF are the unroll and
+/// vectorization factors, respectively.
+///
+/// Entries can be added to either map with setVectorValue and setScalarValue,
+/// which assert that an entry was not already added before. If an entry is to
+/// replace an existing one, call resetVectorValue and resetScalarValue. This is
+/// currently needed to modify the mapped values during "fix-up" operations that
+/// occur once the first phase of widening is complete. These operations include
+/// type truncation and the second phase of recurrence widening.
+///
+/// Entries from either map can be retrieved using the getVectorValue and
+/// getScalarValue functions, which assert that the desired value exists.
+struct VectorizerValueMap {
+ friend struct VPTransformState;
+
+private:
+ /// The unroll factor. Each entry in the vector map contains UF vector values.
+ unsigned UF;
+
+ /// The vectorization factor. Each entry in the scalar map contains UF x VF
+ /// scalar values.
ElementCount VF;
-
- /// The vector and scalar map storage. We use std::map and not DenseMap
- /// because insertions to DenseMap invalidate its iterators.
- using VectorParts = SmallVector<Value *, 2>;
- using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
- std::map<Value *, VectorParts> VectorMapStorage;
- std::map<Value *, ScalarParts> ScalarMapStorage;
-
-public:
- /// Construct an empty map with the given unroll and vectorization factors.
+
+ /// The vector and scalar map storage. We use std::map and not DenseMap
+ /// because insertions to DenseMap invalidate its iterators.
+ using VectorParts = SmallVector<Value *, 2>;
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
+ std::map<Value *, VectorParts> VectorMapStorage;
+ std::map<Value *, ScalarParts> ScalarMapStorage;
+
+public:
+ /// Construct an empty map with the given unroll and vectorization factors.
VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {}
-
- /// \return True if the map has any vector entry for \p Key.
- bool hasAnyVectorValue(Value *Key) const {
- return VectorMapStorage.count(Key);
- }
-
- /// \return True if the map has a vector entry for \p Key and \p Part.
- bool hasVectorValue(Value *Key, unsigned Part) const {
- assert(Part < UF && "Queried Vector Part is too large.");
- if (!hasAnyVectorValue(Key))
- return false;
- const VectorParts &Entry = VectorMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
- return Entry[Part] != nullptr;
- }
-
- /// \return True if the map has any scalar entry for \p Key.
- bool hasAnyScalarValue(Value *Key) const {
- return ScalarMapStorage.count(Key);
- }
-
- /// \return True if the map has a scalar entry for \p Key and \p Instance.
- bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
- assert(Instance.Part < UF && "Queried Scalar Part is too large.");
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
+ }
+
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a scalar entry for \p Key and \p Instance.
+ bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
+ assert(Instance.Part < UF && "Queried Scalar Part is too large.");
assert(Instance.Lane < VF.getKnownMinValue() &&
"Queried Scalar Lane is too large.");
- if (!hasAnyScalarValue(Key))
- return false;
- const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
- "ScalarParts has wrong dimensions.");
- return Entry[Instance.Part][Instance.Lane] != nullptr;
- }
-
- /// Retrieve the existing vector value that corresponds to \p Key and
- /// \p Part.
- Value *getVectorValue(Value *Key, unsigned Part) {
- assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
- return VectorMapStorage[Key][Part];
- }
-
- /// Retrieve the existing scalar value that corresponds to \p Key and
- /// \p Instance.
- Value *getScalarValue(Value *Key, const VPIteration &Instance) {
- assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
- return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
- }
-
- /// Set a vector value associated with \p Key and \p Part. Assumes such a
- /// value is not already set. If it is, use resetVectorValue() instead.
- void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
- if (!VectorMapStorage.count(Key)) {
- VectorParts Entry(UF);
- VectorMapStorage[Key] = Entry;
- }
- VectorMapStorage[Key][Part] = Vector;
- }
-
- /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
- /// value is not already set.
- void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
- assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
- if (!ScalarMapStorage.count(Key)) {
- ScalarParts Entry(UF);
- // TODO: Consider storing uniform values only per-part, as they occupy
- // lane 0 only, keeping the other VF-1 redundant entries null.
- for (unsigned Part = 0; Part < UF; ++Part)
+ "ScalarParts has wrong dimensions.");
+ return Entry[Instance.Part][Instance.Lane] != nullptr;
+ }
+
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
+ }
+
+ /// Retrieve the existing scalar value that corresponds to \p Key and
+ /// \p Instance.
+ Value *getScalarValue(Value *Key, const VPIteration &Instance) {
+ assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+ }
+
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
+ /// value is not already set.
+ void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
+ assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part].resize(VF.getKnownMinValue(), nullptr);
- ScalarMapStorage[Key] = Entry;
- }
- ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
- }
-
- /// Reset the vector value associated with \p Key for the given \p Part.
- /// This function can be used to update values that have already been
- /// vectorized. This is the case for "fix-up" operations including type
- /// truncation and the second phase of recurrence vectorization.
- void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(hasVectorValue(Key, Part) && "Vector value not set for part");
- VectorMapStorage[Key][Part] = Vector;
- }
-
- /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
- /// This function can be used to update values that have already been
- /// scalarized. This is the case for "fix-up" operations including scalar phi
- /// nodes for scalarized and predicated instructions.
- void resetScalarValue(Value *Key, const VPIteration &Instance,
- Value *Scalar) {
- assert(hasScalarValue(Key, Instance) &&
- "Scalar value not set for part and lane");
- ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
- }
-};
-
-/// This class is used to enable the VPlan to invoke a method of ILV. This is
-/// needed until the method is refactored out of ILV and becomes reusable.
-struct VPCallback {
- virtual ~VPCallback() {}
- virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
- virtual Value *getOrCreateScalarValue(Value *V,
- const VPIteration &Instance) = 0;
-};
-
-/// VPTransformState holds information passed down when "executing" a VPlan,
-/// needed for generating the output IR.
-struct VPTransformState {
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
+ /// This function can be used to update values that have already been
+ /// scalarized. This is the case for "fix-up" operations including scalar phi
+ /// nodes for scalarized and predicated instructions.
+ void resetScalarValue(Value *Key, const VPIteration &Instance,
+ Value *Scalar) {
+ assert(hasScalarValue(Key, Instance) &&
+ "Scalar value not set for part and lane");
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+};
+
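
As a concrete reading of the comment above: per original Value, the vector map holds UF entries (one per unroll part) and the scalar map holds UF x VF entries (one per part and lane). A standalone sketch of that layout, with illustrative sizes and strings standing in for IR values:

#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  const unsigned UF = 2, VF = 4;
  using VectorParts = std::vector<std::string>;              // UF entries
  using ScalarParts = std::vector<std::vector<std::string>>; // UF x VF entries

  std::map<std::string, VectorParts> VectorMap;
  std::map<std::string, ScalarParts> ScalarMap;

  // setVectorValue("%a", Part, ...): one widened value per unroll part.
  VectorMap["%a"] = VectorParts(UF);
  VectorMap["%a"][0] = "%a.vec.part0";
  VectorMap["%a"][1] = "%a.vec.part1";

  // setScalarValue("%a", {Part, Lane}, ...): one scalar value per (part, lane).
  ScalarMap["%a"] = ScalarParts(UF, std::vector<std::string>(VF));
  ScalarMap["%a"][1][2] = "%a.part1.lane2";

  assert(VectorMap["%a"].size() == UF);
  assert(ScalarMap["%a"].size() == UF && ScalarMap["%a"][0].size() == VF);
  assert(ScalarMap["%a"][1][2] == "%a.part1.lane2");
  return 0;
}
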
+/// This class is used to enable the VPlan to invoke a method of ILV. This is
+/// needed until the method is refactored out of ILV and becomes reusable.
+struct VPCallback {
+ virtual ~VPCallback() {}
+ virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
+ virtual Value *getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) = 0;
+};
+
+/// VPTransformState holds information passed down when "executing" a VPlan,
+/// needed for generating the output IR.
+struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI,
DominatorTree *DT, IRBuilder<> &Builder,
VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV,
VPCallback &Callback)
: VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT),
Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
-
- /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
+
+ /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
- unsigned UF;
-
- /// Hold the indices to generate specific scalar instructions. Null indicates
- /// that all instances are to be generated, using either scalar or vector
- /// instructions.
- Optional<VPIteration> Instance;
-
- struct DataState {
- /// A type for vectorized values in the new loop. Each value from the
- /// original loop, when vectorized, is represented by UF vector values in
- /// the new unrolled loop, where UF is the unroll factor.
- typedef SmallVector<Value *, 2> PerPartValuesTy;
-
- DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
+ unsigned UF;
+
+ /// Hold the indices to generate specific scalar instructions. Null indicates
+ /// that all instances are to be generated, using either scalar or vector
+ /// instructions.
+ Optional<VPIteration> Instance;
+
+ struct DataState {
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in
+ /// the new unrolled loop, where UF is the unroll factor.
+ typedef SmallVector<Value *, 2> PerPartValuesTy;
+
+ DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
- } Data;
-
- /// Get the generated Value for a given VPValue and a given Part. Note that
- /// as some Defs are still created by ILV and managed in its ValueMap, this
- /// method will delegate the call to ILV in such cases in order to provide
- /// callers a consistent API.
- /// \see set.
- Value *get(VPValue *Def, unsigned Part) {
- // If Values have been set for this Def return the one relevant for \p Part.
- if (Data.PerPartOutput.count(Def))
- return Data.PerPartOutput[Def][Part];
- // Def is managed by ILV: bring the Values from ValueMap.
- return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
- }
-
- /// Get the generated Value for a given VPValue and given Part and Lane.
+ } Data;
+
+ /// Get the generated Value for a given VPValue and a given Part. Note that
+ /// as some Defs are still created by ILV and managed in its ValueMap, this
+ /// method will delegate the call to ILV in such cases in order to provide
+ /// callers a consistent API.
+ /// \see set.
+ Value *get(VPValue *Def, unsigned Part) {
+ // If Values have been set for this Def return the one relevant for \p Part.
+ if (Data.PerPartOutput.count(Def))
+ return Data.PerPartOutput[Def][Part];
+ // Def is managed by ILV: bring the Values from ValueMap.
+ return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
+ }
+
+ /// Get the generated Value for a given VPValue and given Part and Lane.
Value *get(VPValue *Def, const VPIteration &Instance);
-
+
bool hasVectorValue(VPValue *Def, unsigned Part) {
auto I = Data.PerPartOutput.find(Def);
return I != Data.PerPartOutput.end() && Part < I->second.size() &&
I->second[Part];
- }
-
+ }
+
bool hasScalarValue(VPValue *Def, VPIteration Instance) {
auto I = Data.PerPartScalars.find(Def);
if (I == Data.PerPartScalars.end())
@@ -305,16 +305,16 @@ struct VPTransformState {
I->second[Instance.Part][Instance.Lane];
}
- /// Set the generated Value for a given VPValue and a given Part.
- void set(VPValue *Def, Value *V, unsigned Part) {
- if (!Data.PerPartOutput.count(Def)) {
- DataState::PerPartValuesTy Entry(UF);
- Data.PerPartOutput[Def] = Entry;
- }
- Data.PerPartOutput[Def][Part] = V;
- }
+ /// Set the generated Value for a given VPValue and a given Part.
+ void set(VPValue *Def, Value *V, unsigned Part) {
+ if (!Data.PerPartOutput.count(Def)) {
+ DataState::PerPartValuesTy Entry(UF);
+ Data.PerPartOutput[Def] = Entry;
+ }
+ Data.PerPartOutput[Def][Part] = V;
+ }
void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part);
-
+
void set(VPValue *Def, Value *V, const VPIteration &Instance) {
auto Iter = Data.PerPartScalars.insert({Def, {}});
auto &PerPartVec = Iter.first->second;
@@ -326,364 +326,364 @@ struct VPTransformState {
Scalars[Instance.Lane] = V;
}
- /// Hold state information used when constructing the CFG of the output IR,
- /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
- struct CFGState {
- /// The previous VPBasicBlock visited. Initially set to null.
- VPBasicBlock *PrevVPBB = nullptr;
-
- /// The previous IR BasicBlock created or used. Initially set to the new
- /// header BasicBlock.
- BasicBlock *PrevBB = nullptr;
-
- /// The last IR BasicBlock in the output IR. Set to the new latch
- /// BasicBlock, used for placing the newly created BasicBlocks.
- BasicBlock *LastBB = nullptr;
-
- /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
- /// of replication, maps the BasicBlock of the last replica created.
- SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
-
- /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
- /// up at the end of vector code generation.
- SmallVector<VPBasicBlock *, 8> VPBBsToFix;
-
- CFGState() = default;
- } CFG;
-
+ /// Hold state information used when constructing the CFG of the output IR,
+ /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
+ struct CFGState {
+ /// The previous VPBasicBlock visited. Initially set to null.
+ VPBasicBlock *PrevVPBB = nullptr;
+
+ /// The previous IR BasicBlock created or used. Initially set to the new
+ /// header BasicBlock.
+ BasicBlock *PrevBB = nullptr;
+
+ /// The last IR BasicBlock in the output IR. Set to the new latch
+ /// BasicBlock, used for placing the newly created BasicBlocks.
+ BasicBlock *LastBB = nullptr;
+
+ /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
+ /// of replication, maps the BasicBlock of the last replica created.
+ SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
+
+ /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
+ /// up at the end of vector code generation.
+ SmallVector<VPBasicBlock *, 8> VPBBsToFix;
+
+ CFGState() = default;
+ } CFG;
+
/// Hold a pointer to the original loop.
Loop *OrigLoop;
- /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
- LoopInfo *LI;
-
- /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
- DominatorTree *DT;
-
- /// Hold a reference to the IRBuilder used to generate output IR code.
- IRBuilder<> &Builder;
-
- /// Hold a reference to the Value state information used when generating the
- /// Values of the output IR.
- VectorizerValueMap &ValueMap;
-
- /// Hold a reference to a mapping between VPValues in VPlan and original
- /// Values they correspond to.
- VPValue2ValueTy VPValue2Value;
-
- /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
- Value *CanonicalIV = nullptr;
-
- /// Hold the trip count of the scalar loop.
- Value *TripCount = nullptr;
-
- /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
- InnerLoopVectorizer *ILV;
-
- VPCallback &Callback;
-};
-
-/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
-/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
-class VPBlockBase {
- friend class VPBlockUtils;
-
- const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
-
- /// An optional name for the block.
- std::string Name;
-
- /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
- /// it is a topmost VPBlockBase.
- VPRegionBlock *Parent = nullptr;
-
- /// List of predecessor blocks.
- SmallVector<VPBlockBase *, 1> Predecessors;
-
- /// List of successor blocks.
- SmallVector<VPBlockBase *, 1> Successors;
-
- /// Successor selector, null for zero or single successor blocks.
- VPValue *CondBit = nullptr;
-
- /// Current block predicate - null if the block does not need a predicate.
- VPValue *Predicate = nullptr;
-
- /// VPlan containing the block. Can only be set on the entry block of the
- /// plan.
- VPlan *Plan = nullptr;
-
- /// Add \p Successor as the last successor to this block.
- void appendSuccessor(VPBlockBase *Successor) {
- assert(Successor && "Cannot add nullptr successor!");
- Successors.push_back(Successor);
- }
-
- /// Add \p Predecessor as the last predecessor to this block.
- void appendPredecessor(VPBlockBase *Predecessor) {
- assert(Predecessor && "Cannot add nullptr predecessor!");
- Predecessors.push_back(Predecessor);
- }
-
- /// Remove \p Predecessor from the predecessors of this block.
- void removePredecessor(VPBlockBase *Predecessor) {
+ /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
+ LoopInfo *LI;
+
+ /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
+ DominatorTree *DT;
+
+ /// Hold a reference to the IRBuilder used to generate output IR code.
+ IRBuilder<> &Builder;
+
+ /// Hold a reference to the Value state information used when generating the
+ /// Values of the output IR.
+ VectorizerValueMap &ValueMap;
+
+ /// Hold a reference to a mapping between VPValues in VPlan and original
+ /// Values they correspond to.
+ VPValue2ValueTy VPValue2Value;
+
+ /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
+ Value *CanonicalIV = nullptr;
+
+ /// Hold the trip count of the scalar loop.
+ Value *TripCount = nullptr;
+
+ /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+ InnerLoopVectorizer *ILV;
+
+ VPCallback &Callback;
+};
+
+/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
+/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
+class VPBlockBase {
+ friend class VPBlockUtils;
+
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// An optional name for the block.
+ std::string Name;
+
+ /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
+ /// it is a topmost VPBlockBase.
+ VPRegionBlock *Parent = nullptr;
+
+ /// List of predecessor blocks.
+ SmallVector<VPBlockBase *, 1> Predecessors;
+
+ /// List of successor blocks.
+ SmallVector<VPBlockBase *, 1> Successors;
+
+ /// Successor selector, null for zero or single successor blocks.
+ VPValue *CondBit = nullptr;
+
+ /// Current block predicate - null if the block does not need a predicate.
+ VPValue *Predicate = nullptr;
+
+ /// VPlan containing the block. Can only be set on the entry block of the
+ /// plan.
+ VPlan *Plan = nullptr;
+
+ /// Add \p Successor as the last successor to this block.
+ void appendSuccessor(VPBlockBase *Successor) {
+ assert(Successor && "Cannot add nullptr successor!");
+ Successors.push_back(Successor);
+ }
+
+ /// Add \p Predecessor as the last predecessor to this block.
+ void appendPredecessor(VPBlockBase *Predecessor) {
+ assert(Predecessor && "Cannot add nullptr predecessor!");
+ Predecessors.push_back(Predecessor);
+ }
+
+ /// Remove \p Predecessor from the predecessors of this block.
+ void removePredecessor(VPBlockBase *Predecessor) {
auto Pos = find(Predecessors, Predecessor);
- assert(Pos && "Predecessor does not exist");
- Predecessors.erase(Pos);
- }
-
- /// Remove \p Successor from the successors of this block.
- void removeSuccessor(VPBlockBase *Successor) {
+ assert(Pos && "Predecessor does not exist");
+ Predecessors.erase(Pos);
+ }
+
+ /// Remove \p Successor from the successors of this block.
+ void removeSuccessor(VPBlockBase *Successor) {
auto Pos = find(Successors, Successor);
- assert(Pos && "Successor does not exist");
- Successors.erase(Pos);
- }
-
-protected:
- VPBlockBase(const unsigned char SC, const std::string &N)
- : SubclassID(SC), Name(N) {}
-
-public:
- /// An enumeration for keeping track of the concrete subclasses of VPBlockBase
- /// that are actually instantiated. Values of this enumeration are kept in the
- /// SubclassID field of the VPBlockBase objects. They are used for concrete
- /// type identification.
- using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
-
- using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
-
- virtual ~VPBlockBase() = default;
-
- const std::string &getName() const { return Name; }
-
- void setName(const Twine &newName) { Name = newName.str(); }
-
- /// \return an ID for the concrete type of this object.
- /// This is used to implement the classof checks. This should not be used
- /// for any other purpose, as the values may change as LLVM evolves.
- unsigned getVPBlockID() const { return SubclassID; }
-
- VPRegionBlock *getParent() { return Parent; }
- const VPRegionBlock *getParent() const { return Parent; }
-
- /// \return A pointer to the plan containing the current block.
- VPlan *getPlan();
- const VPlan *getPlan() const;
-
- /// Sets the pointer of the plan containing the block. The block must be the
- /// entry block into the VPlan.
- void setPlan(VPlan *ParentPlan);
-
- void setParent(VPRegionBlock *P) { Parent = P; }
-
- /// \return the VPBasicBlock that is the entry of this VPBlockBase,
- /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
- /// VPBlockBase is a VPBasicBlock, it is returned.
- const VPBasicBlock *getEntryBasicBlock() const;
- VPBasicBlock *getEntryBasicBlock();
-
- /// \return the VPBasicBlock that is the exit of this VPBlockBase,
- /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
- /// VPBlockBase is a VPBasicBlock, it is returned.
- const VPBasicBlock *getExitBasicBlock() const;
- VPBasicBlock *getExitBasicBlock();
-
- const VPBlocksTy &getSuccessors() const { return Successors; }
- VPBlocksTy &getSuccessors() { return Successors; }
-
- const VPBlocksTy &getPredecessors() const { return Predecessors; }
- VPBlocksTy &getPredecessors() { return Predecessors; }
-
- /// \return the successor of this VPBlockBase if it has a single successor.
- /// Otherwise return a null pointer.
- VPBlockBase *getSingleSuccessor() const {
- return (Successors.size() == 1 ? *Successors.begin() : nullptr);
- }
-
- /// \return the predecessor of this VPBlockBase if it has a single
- /// predecessor. Otherwise return a null pointer.
- VPBlockBase *getSinglePredecessor() const {
- return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
- }
-
- size_t getNumSuccessors() const { return Successors.size(); }
- size_t getNumPredecessors() const { return Predecessors.size(); }
-
- /// An Enclosing Block of a block B is any block containing B, including B
- /// itself. \return the closest enclosing block starting from "this", which
- /// has successors. \return the root enclosing block if all enclosing blocks
- /// have no successors.
- VPBlockBase *getEnclosingBlockWithSuccessors();
-
- /// \return the closest enclosing block starting from "this", which has
- /// predecessors. \return the root enclosing block if all enclosing blocks
- /// have no predecessors.
- VPBlockBase *getEnclosingBlockWithPredecessors();
-
- /// \return the successors either attached directly to this VPBlockBase or, if
- /// this VPBlockBase is the exit block of a VPRegionBlock and has no
- /// successors of its own, search recursively for the first enclosing
- /// VPRegionBlock that has successors and return them. If no such
- /// VPRegionBlock exists, return the (empty) successors of the topmost
- /// VPBlockBase reached.
- const VPBlocksTy &getHierarchicalSuccessors() {
- return getEnclosingBlockWithSuccessors()->getSuccessors();
- }
-
- /// \return the hierarchical successor of this VPBlockBase if it has a single
- /// hierarchical successor. Otherwise return a null pointer.
- VPBlockBase *getSingleHierarchicalSuccessor() {
- return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
- }
-
- /// \return the predecessors either attached directly to this VPBlockBase or,
- /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
- /// predecessors of its own, search recursively for the first enclosing
- /// VPRegionBlock that has predecessors and return them. If no such
- /// VPRegionBlock exists, return the (empty) predecessors of the topmost
- /// VPBlockBase reached.
- const VPBlocksTy &getHierarchicalPredecessors() {
- return getEnclosingBlockWithPredecessors()->getPredecessors();
- }
-
- /// \return the hierarchical predecessor of this VPBlockBase if it has a
- /// single hierarchical predecessor. Otherwise return a null pointer.
- VPBlockBase *getSingleHierarchicalPredecessor() {
- return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
- }
-
- /// \return the condition bit selecting the successor.
- VPValue *getCondBit() { return CondBit; }
-
- const VPValue *getCondBit() const { return CondBit; }
-
- void setCondBit(VPValue *CV) { CondBit = CV; }
-
- VPValue *getPredicate() { return Predicate; }
-
- const VPValue *getPredicate() const { return Predicate; }
-
- void setPredicate(VPValue *Pred) { Predicate = Pred; }
-
- /// Set a given VPBlockBase \p Successor as the single successor of this
- /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
- /// This VPBlockBase must have no successors.
- void setOneSuccessor(VPBlockBase *Successor) {
- assert(Successors.empty() && "Setting one successor when others exist.");
- appendSuccessor(Successor);
- }
-
- /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
- /// successors of this VPBlockBase. \p Condition is set as the successor
- /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
- /// IfFalse. This VPBlockBase must have no successors.
- void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
- VPValue *Condition) {
- assert(Successors.empty() && "Setting two successors when others exist.");
- assert(Condition && "Setting two successors without condition!");
- CondBit = Condition;
- appendSuccessor(IfTrue);
- appendSuccessor(IfFalse);
- }
-
- /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
- /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
- /// as successor of any VPBasicBlock in \p NewPreds.
- void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
- assert(Predecessors.empty() && "Block predecessors already set.");
- for (auto *Pred : NewPreds)
- appendPredecessor(Pred);
- }
-
- /// Remove all the predecessors of this block.
- void clearPredecessors() { Predecessors.clear(); }
-
- /// Remove all the successors of this block and set its condition bit to null.
- void clearSuccessors() {
- Successors.clear();
- CondBit = nullptr;
- }
-
- /// The method which generates the output IR that corresponds to this
- /// VPBlockBase, thereby "executing" the VPlan.
- virtual void execute(struct VPTransformState *State) = 0;
-
- /// Delete all blocks reachable from a given VPBlockBase, inclusive.
- static void deleteCFG(VPBlockBase *Entry);
-
- void printAsOperand(raw_ostream &OS, bool PrintType) const {
- OS << getName();
- }
-
- void print(raw_ostream &OS) const {
- // TODO: Only printing VPBB name for now since we only have dot printing
- // support for VPInstructions/Recipes.
- printAsOperand(OS, false);
- }
-
- /// Return true if it is legal to hoist instructions into this block.
- bool isLegalToHoistInto() {
- // There are currently no constraints that prevent an instruction from being
- // hoisted into a VPBlockBase.
- return true;
- }
+ assert(Pos && "Successor does not exist");
+ Successors.erase(Pos);
+ }
+
+protected:
+ VPBlockBase(const unsigned char SC, const std::string &N)
+ : SubclassID(SC), Name(N) {}
+
+public:
+ /// An enumeration for keeping track of the concrete subclasses of VPBlockBase
+ /// that are actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPBlockBase objects. They are used for concrete
+ /// type identification.
+ using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
+
+ using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
+
+ virtual ~VPBlockBase() = default;
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPBlockID() const { return SubclassID; }
+
+ VPRegionBlock *getParent() { return Parent; }
+ const VPRegionBlock *getParent() const { return Parent; }
+
+ /// \return A pointer to the plan containing the current block.
+ VPlan *getPlan();
+ const VPlan *getPlan() const;
+
+ /// Sets the pointer of the plan containing the block. The block must be the
+ /// entry block into the VPlan.
+ void setPlan(VPlan *ParentPlan);
+
+ void setParent(VPRegionBlock *P) { Parent = P; }
+
+ /// \return the VPBasicBlock that is the entry of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getEntryBasicBlock() const;
+ VPBasicBlock *getEntryBasicBlock();
+
+ /// \return the VPBasicBlock that is the exit of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getExitBasicBlock() const;
+ VPBasicBlock *getExitBasicBlock();
+
+ const VPBlocksTy &getSuccessors() const { return Successors; }
+ VPBlocksTy &getSuccessors() { return Successors; }
+
+ const VPBlocksTy &getPredecessors() const { return Predecessors; }
+ VPBlocksTy &getPredecessors() { return Predecessors; }
+
+ /// \return the successor of this VPBlockBase if it has a single successor.
+ /// Otherwise return a null pointer.
+ VPBlockBase *getSingleSuccessor() const {
+ return (Successors.size() == 1 ? *Successors.begin() : nullptr);
+ }
+
+ /// \return the predecessor of this VPBlockBase if it has a single
+ /// predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSinglePredecessor() const {
+ return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
+ }
+
+ size_t getNumSuccessors() const { return Successors.size(); }
+ size_t getNumPredecessors() const { return Predecessors.size(); }
+
+ /// An Enclosing Block of a block B is any block containing B, including B
+ /// itself. \return the closest enclosing block starting from "this", which
+ /// has successors. \return the root enclosing block if all enclosing blocks
+ /// have no successors.
+ VPBlockBase *getEnclosingBlockWithSuccessors();
+
+ /// \return the closest enclosing block starting from "this", which has
+ /// predecessors. \return the root enclosing block if all enclosing blocks
+ /// have no predecessors.
+ VPBlockBase *getEnclosingBlockWithPredecessors();
+
+ /// \return the successors either attached directly to this VPBlockBase or, if
+ /// this VPBlockBase is the exit block of a VPRegionBlock and has no
+ /// successors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has successors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) successors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalSuccessors() {
+ return getEnclosingBlockWithSuccessors()->getSuccessors();
+ }
+
+ /// \return the hierarchical successor of this VPBlockBase if it has a single
+ /// hierarchical successor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalSuccessor() {
+ return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
+ }
+
+ /// \return the predecessors either attached directly to this VPBlockBase or,
+ /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
+ /// predecessors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has predecessors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) predecessors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalPredecessors() {
+ return getEnclosingBlockWithPredecessors()->getPredecessors();
+ }
+
+ /// \return the hierarchical predecessor of this VPBlockBase if it has a
+ /// single hierarchical predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalPredecessor() {
+ return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
+ }
+
+ /// \return the condition bit selecting the successor.
+ VPValue *getCondBit() { return CondBit; }
+
+ const VPValue *getCondBit() const { return CondBit; }
+
+ void setCondBit(VPValue *CV) { CondBit = CV; }
+
+ VPValue *getPredicate() { return Predicate; }
+
+ const VPValue *getPredicate() const { return Predicate; }
+
+ void setPredicate(VPValue *Pred) { Predicate = Pred; }
+
+ /// Set a given VPBlockBase \p Successor as the single successor of this
+ /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
+ /// This VPBlockBase must have no successors.
+ void setOneSuccessor(VPBlockBase *Successor) {
+ assert(Successors.empty() && "Setting one successor when others exist.");
+ appendSuccessor(Successor);
+ }
+
+ /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors of this VPBlockBase. \p Condition is set as the successor
+ /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
+ /// IfFalse. This VPBlockBase must have no successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition) {
+ assert(Successors.empty() && "Setting two successors when others exist.");
+ assert(Condition && "Setting two successors without condition!");
+ CondBit = Condition;
+ appendSuccessor(IfTrue);
+ appendSuccessor(IfFalse);
+ }
+
+ /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
+ /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
+ /// as successor of any VPBasicBlock in \p NewPreds.
+ void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
+ assert(Predecessors.empty() && "Block predecessors already set.");
+ for (auto *Pred : NewPreds)
+ appendPredecessor(Pred);
+ }
+
+ /// Remove all the predecessors of this block.
+ void clearPredecessors() { Predecessors.clear(); }
+
+ /// Remove all the successors of this block and set its condition bit to null.
+ void clearSuccessors() {
+ Successors.clear();
+ CondBit = nullptr;
+ }
+
+ /// The method which generates the output IR that corresponds to this
+ /// VPBlockBase, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState *State) = 0;
+
+ /// Delete all blocks reachable from a given VPBlockBase, inclusive.
+ static void deleteCFG(VPBlockBase *Entry);
+
+ void printAsOperand(raw_ostream &OS, bool PrintType) const {
+ OS << getName();
+ }
+
+ void print(raw_ostream &OS) const {
+ // TODO: Only printing VPBB name for now since we only have dot printing
+ // support for VPInstructions/Recipes.
+ printAsOperand(OS, false);
+ }
+
+ /// Return true if it is legal to hoist instructions into this block.
+ bool isLegalToHoistInto() {
+ // There are currently no constraints that prevent an instruction from being
+ // hoisted into a VPBlockBase.
+ return true;
+ }
/// Replace all operands of VPUsers in the block with \p NewValue and also
/// replaces all uses of VPValues defined in the block with NewValue.
virtual void dropAllReferences(VPValue *NewValue) = 0;
-};
-
-/// VPRecipeBase is a base class modeling a sequence of one or more output IR
+};
+
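// Editor's sketch, not part of this header: wiring a small diamond CFG with
// the VPBlockBase API declared above. Assumes the full VPlan.h declarations
// (VPBasicBlock is defined further down) are visible, e.g. from a .cpp that
// includes this header. Block names are placeholders and 'Cond' stands for a
// VPValue the caller already owns.
static void buildDiamondSketch(VPValue *Cond) {
  VPBasicBlock *Entry = new VPBasicBlock("entry");
  VPBasicBlock *Then = new VPBasicBlock("then");
  VPBasicBlock *Else = new VPBasicBlock("else");
  VPBasicBlock *Exit = new VPBasicBlock("exit");

  // Successor edges only; these calls do not update predecessor lists.
  Entry->setTwoSuccessors(Then, Else, Cond); // Cond becomes the CondBit.
  Then->setOneSuccessor(Exit);
  Else->setOneSuccessor(Exit);

  // Predecessor edges are maintained separately.
  Then->setPredecessors({Entry});
  Else->setPredecessors({Entry});
  Exit->setPredecessors({Then, Else});

  // Delete every block reachable from Entry, inclusive.
  VPBlockBase::deleteCFG(Entry);
}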
+/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
/// VPRecipeBases that also inherit from VPValue must make sure to inherit from
/// VPRecipeBase before VPValue.
class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
public VPDef {
- friend VPBasicBlock;
- friend class VPBlockUtils;
-
-
- /// Each VPRecipe belongs to a single VPBasicBlock.
- VPBasicBlock *Parent = nullptr;
-
-public:
+ friend VPBasicBlock;
+ friend class VPBlockUtils;
+
+
+ /// Each VPRecipe belongs to a single VPBasicBlock.
+ VPBasicBlock *Parent = nullptr;
+
+public:
VPRecipeBase(const unsigned char SC) : VPDef(SC) {}
- virtual ~VPRecipeBase() = default;
-
- /// \return the VPBasicBlock which this VPRecipe belongs to.
- VPBasicBlock *getParent() { return Parent; }
- const VPBasicBlock *getParent() const { return Parent; }
-
- /// The method which generates the output IR instructions that correspond to
- /// this VPRecipe, thereby "executing" the VPlan.
- virtual void execute(struct VPTransformState &State) = 0;
-
- /// Insert an unlinked recipe into a basic block immediately before
- /// the specified recipe.
- void insertBefore(VPRecipeBase *InsertPos);
-
- /// Insert an unlinked Recipe into a basic block immediately after
- /// the specified Recipe.
- void insertAfter(VPRecipeBase *InsertPos);
-
- /// Unlink this recipe from its current VPBasicBlock and insert it into
- /// the VPBasicBlock that MovePos lives in, right after MovePos.
- void moveAfter(VPRecipeBase *MovePos);
-
+ virtual ~VPRecipeBase() = default;
+
+ /// \return the VPBasicBlock which this VPRecipe belongs to.
+ VPBasicBlock *getParent() { return Parent; }
+ const VPBasicBlock *getParent() const { return Parent; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRecipe, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState &State) = 0;
+
+ /// Insert an unlinked recipe into a basic block immediately before
+ /// the specified recipe.
+ void insertBefore(VPRecipeBase *InsertPos);
+
+ /// Insert an unlinked Recipe into a basic block immediately after
+ /// the specified Recipe.
+ void insertAfter(VPRecipeBase *InsertPos);
+
+ /// Unlink this recipe from its current VPBasicBlock and insert it into
+ /// the VPBasicBlock that MovePos lives in, right after MovePos.
+ void moveAfter(VPRecipeBase *MovePos);
+
/// Unlink this recipe and insert into BB before I.
///
/// \pre I is a valid iterator into BB.
void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);
- /// This method unlinks 'this' from the containing basic block, but does not
- /// delete it.
- void removeFromParent();
-
- /// This method unlinks 'this' from the containing basic block and deletes it.
- ///
- /// \returns an iterator pointing to the element after the erased one
- iplist<VPRecipeBase>::iterator eraseFromParent();
+ /// This method unlinks 'this' from the containing basic block, but does not
+ /// delete it.
+ void removeFromParent();
+
+ /// This method unlinks 'this' from the containing basic block and deletes it.
+ ///
+ /// \returns an iterator pointing to the element after the erased one
+ iplist<VPRecipeBase>::iterator eraseFromParent();
/// Returns a pointer to a VPUser, if the recipe inherits from VPUser or
/// nullptr otherwise.
@@ -703,8 +703,8 @@ public:
// All VPDefs are also VPRecipeBases.
return true;
}
-};
-
+};
+
inline bool VPUser::classof(const VPDef *Def) {
return Def->getVPDefID() == VPRecipeBase::VPInstructionSC ||
Def->getVPDefID() == VPRecipeBase::VPWidenSC ||
@@ -719,39 +719,39 @@ inline bool VPUser::classof(const VPDef *Def) {
Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
}
-/// This is a concrete Recipe that models a single VPlan-level instruction.
-/// While, as with any Recipe, it may generate a sequence of IR instructions when
-/// executed, these instructions would always form a single-def expression as
-/// the VPInstruction is also a single def-use vertex.
+/// This is a concrete Recipe that models a single VPlan-level instruction.
+/// While, as with any Recipe, it may generate a sequence of IR instructions when
+/// executed, these instructions would always form a single-def expression as
+/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPRecipeBase, public VPUser, public VPValue {
- friend class VPlanSlp;
-
-public:
- /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
- enum {
- Not = Instruction::OtherOpsEnd + 1,
- ICmpULE,
- SLPLoad,
- SLPStore,
- ActiveLaneMask,
- };
-
-private:
- typedef unsigned char OpcodeTy;
- OpcodeTy Opcode;
-
- /// Utility method serving execute(): generates a single instance of the
- /// modeled instruction.
- void generateInstruction(VPTransformState &State, unsigned Part);
-
-protected:
- void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
-
-public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
+ friend class VPlanSlp;
+
+public:
+ /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
+ enum {
+ Not = Instruction::OtherOpsEnd + 1,
+ ICmpULE,
+ SLPLoad,
+ SLPStore,
+ ActiveLaneMask,
+ };
+
+private:
+ typedef unsigned char OpcodeTy;
+ OpcodeTy Opcode;
+
+ /// Utility method serving execute(): generates a single instance of the
+ /// modeled instruction.
+ void generateInstruction(VPTransformState &State, unsigned Part);
+
+protected:
+ void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
+
+public:
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
: VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands),
VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {}
-
+
VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands)
: VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}),
VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {
@@ -759,195 +759,195 @@ public:
addOperand(I->getVPValue());
}
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
- : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPValue *V) {
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVInstructionSC;
- }
-
- VPInstruction *clone() const {
- SmallVector<VPValue *, 2> Operands(operands());
- return new VPInstruction(Opcode, Operands);
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ }
+
+ VPInstruction *clone() const {
+ SmallVector<VPValue *, 2> Operands(operands());
+ return new VPInstruction(Opcode, Operands);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *R) {
return R->getVPDefID() == VPRecipeBase::VPInstructionSC;
- }
-
- unsigned getOpcode() const { return Opcode; }
-
- /// Generate the instruction.
- /// TODO: We currently execute only per-part unless a specific instance is
- /// provided.
- void execute(VPTransformState &State) override;
-
+ }
+
+ unsigned getOpcode() const { return Opcode; }
+
+ /// Generate the instruction.
+ /// TODO: We currently execute only per-part unless a specific instance is
+ /// provided.
+ void execute(VPTransformState &State) override;
+
/// Print the VPInstruction to \p O.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+
/// Print the VPInstruction to dbgs() (for debugging).
void dump() const;
-
- /// Return true if this instruction may modify memory.
- bool mayWriteToMemory() const {
- // TODO: we can use attributes of the called function to rule out memory
- // modifications.
- return Opcode == Instruction::Store || Opcode == Instruction::Call ||
- Opcode == Instruction::Invoke || Opcode == SLPStore;
- }
-
- bool hasResult() const {
- // CallInst may or may not have a result, depending on the called function.
- // Conservatively assume calls have results for now.
- switch (getOpcode()) {
- case Instruction::Ret:
- case Instruction::Br:
- case Instruction::Store:
- case Instruction::Switch:
- case Instruction::IndirectBr:
- case Instruction::Resume:
- case Instruction::CatchRet:
- case Instruction::Unreachable:
- case Instruction::Fence:
- case Instruction::AtomicRMW:
- return false;
- default:
- return true;
- }
- }
-};
-
-/// VPWidenRecipe is a recipe for producing a copy of vector type for its
-/// ingredient. This recipe covers most of the traditional vectorization cases
-/// where each ingredient transforms into a vectorized version of itself.
+
+ /// Return true if this instruction may modify memory.
+ bool mayWriteToMemory() const {
+ // TODO: we can use attributes of the called function to rule out memory
+ // modifications.
+ return Opcode == Instruction::Store || Opcode == Instruction::Call ||
+ Opcode == Instruction::Invoke || Opcode == SLPStore;
+ }
+
+ bool hasResult() const {
+ // CallInst may or may not have a result, depending on the called function.
+ // Conservatively assume calls have results for now.
+ switch (getOpcode()) {
+ case Instruction::Ret:
+ case Instruction::Br:
+ case Instruction::Store:
+ case Instruction::Switch:
+ case Instruction::IndirectBr:
+ case Instruction::Resume:
+ case Instruction::CatchRet:
+ case Instruction::Unreachable:
+ case Instruction::Fence:
+ case Instruction::AtomicRMW:
+ return false;
+ default:
+ return true;
+ }
+ }
+};
+
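// Editor's sketch, not part of this header: creating a VPInstruction with one
// of the VPlan-specific opcodes above and appending it to a VPBasicBlock.
// 'IV' and 'BTC' are placeholders for VPValues a real plan already owns
// (e.g. an induction value and a backedge-taken count); assumes VPlan.h is
// fully visible, as from a .cpp file that includes it.
static VPInstruction *emitActiveLaneMaskSketch(VPBasicBlock *VPBB, VPValue *IV,
                                               VPValue *BTC) {
  auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, {IV, BTC});
  VPBB->appendRecipe(ALM); // The block now owns the recipe.
  assert(ALM->hasResult() && "ActiveLaneMask defines a value");
  return ALM;              // Usable as a VPValue operand by later recipes.
}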
+/// VPWidenRecipe is a recipe for producing a copy of vector type for its
+/// ingredient. This recipe covers most of the traditional vectorization cases
+/// where each ingredient transforms into a vectorized version of itself.
class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser {
-public:
- template <typename IterT>
- VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
+public:
+ template <typename IterT>
+ VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
: VPRecipeBase(VPRecipeBase::VPWidenSC),
VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {}
-
- ~VPWidenRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+
+ ~VPWidenRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenSC;
- }
+ }
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVWidenSC;
}
-
- /// Produce widened copies of all Ingredients.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for widening Call instructions.
+
+ /// Produce widened copies of all Ingredients.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening Call instructions.
class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue {
-
-public:
- template <typename IterT>
- VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+
+public:
+ template <typename IterT>
+ VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
: VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments),
VPValue(VPValue::VPVWidenCallSC, &I, this) {}
-
- ~VPWidenCallRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+
+ ~VPWidenCallRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenCallSC;
- }
-
- /// Produce a widened version of the call instruction.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for widening select instructions.
+ }
+
+ /// Produce a widened version of the call instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening select instructions.
class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue {
-
- /// Is the condition of the select loop invariant?
- bool InvariantCond;
-
-public:
- template <typename IterT>
- VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
- bool InvariantCond)
+
+ /// Is the condition of the select loop invariant?
+ bool InvariantCond;
+
+public:
+ template <typename IterT>
+ VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
+ bool InvariantCond)
: VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands),
VPValue(VPValue::VPVWidenSelectSC, &I, this),
- InvariantCond(InvariantCond) {}
-
- ~VPWidenSelectRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ InvariantCond(InvariantCond) {}
+
+ ~VPWidenSelectRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC;
- }
-
- /// Produce a widened version of the select instruction.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for handling GEP instructions.
+ }
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeBase,
public VPUser,
public VPValue {
- bool IsPtrLoopInvariant;
- SmallBitVector IsIndexLoopInvariant;
-
-public:
- template <typename IterT>
+ bool IsPtrLoopInvariant;
+ SmallBitVector IsIndexLoopInvariant;
+
+public:
+ template <typename IterT>
VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
: VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
VPValue(VPWidenGEPSC, GEP, this),
IsIndexLoopInvariant(GEP->getNumIndices(), false) {}
template <typename IterT>
- VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
- Loop *OrigLoop)
+ VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
+ Loop *OrigLoop)
: VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
VPValue(VPValue::VPVWidenGEPSC, GEP, this),
- IsIndexLoopInvariant(GEP->getNumIndices(), false) {
- IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
- for (auto Index : enumerate(GEP->indices()))
- IsIndexLoopInvariant[Index.index()] =
- OrigLoop->isLoopInvariant(Index.value().get());
- }
- ~VPWidenGEPRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ IsIndexLoopInvariant(GEP->getNumIndices(), false) {
+ IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
+ for (auto Index : enumerate(GEP->indices()))
+ IsIndexLoopInvariant[Index.index()] =
+ OrigLoop->isLoopInvariant(Index.value().get());
+ }
+ ~VPWidenGEPRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC;
- }
-
- /// Generate the gep nodes.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for handling phi nodes of integer and floating-point inductions,
-/// producing their vector and scalar values.
+ }
+
+ /// Generate the gep nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for handling phi nodes of integer and floating-point inductions,
+/// producing their vector and scalar values.
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser {
- PHINode *IV;
- TruncInst *Trunc;
-
-public:
+ PHINode *IV;
+ TruncInst *Trunc;
+
+public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
TruncInst *Trunc = nullptr)
: VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV),
@@ -957,35 +957,35 @@ public:
else
new VPValue(IV, this);
}
- ~VPWidenIntOrFpInductionRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ ~VPWidenIntOrFpInductionRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
- }
-
- /// Generate the vectorized and scalarized versions of the phi node as
- /// needed by their users.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ }
+
+ /// Generate the vectorized and scalarized versions of the phi node as
+ /// needed by their users.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }
-};
-
-/// A recipe for handling all phi nodes except for integer and FP inductions.
+};
+
+/// A recipe for handling all phi nodes except for integer and FP inductions.
/// For reduction PHIs, RdxDesc must point to the corresponding recurrence
/// descriptor and the start value is the first operand of the recipe.
class VPWidenPHIRecipe : public VPRecipeBase, public VPUser {
- PHINode *Phi;
-
+ PHINode *Phi;
+
/// Descriptor for a reduction PHI.
RecurrenceDescriptor *RdxDesc = nullptr;
-public:
+public:
/// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start)
@@ -998,78 +998,78 @@ public:
VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {
new VPValue(Phi, this);
}
- ~VPWidenPHIRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ ~VPWidenPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenPHISC;
- }
-
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
/// Returns the start value of the phi, if it is a reduction.
VPValue *getStartValue() {
return getNumOperands() == 0 ? nullptr : getOperand(0);
}
-};
-
-/// A recipe for vectorizing a phi-node as a sequence of mask-based select
-/// instructions.
+};
+
+/// A recipe for vectorizing a phi-node as a sequence of mask-based select
+/// instructions.
class VPBlendRecipe : public VPRecipeBase, public VPUser {
- PHINode *Phi;
-
+ PHINode *Phi;
+
public:
- /// The blend operation is a User of the incoming values and of their
- /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
- /// might be incoming with a full mask for which there is no VPValue.
- VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
+ /// The blend operation is a User of the incoming values and of their
+ /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
+ /// might be incoming with a full mask for which there is no VPValue.
+ VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
: VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) {
new VPValue(Phi, this);
- assert(Operands.size() > 0 &&
- ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
- "Expected either a single incoming value or a positive even number "
- "of operands");
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ assert(Operands.size() > 0 &&
+ ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
+ "Expected either a single incoming value or a positive even number "
+ "of operands");
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPBlendSC;
- }
-
- /// Return the number of incoming values, taking into account that a single
- /// incoming value has no mask.
+ }
+
+ /// Return the number of incoming values, taking into account that a single
+ /// incoming value has no mask.
unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
-
- /// Return incoming value number \p Idx.
+
+ /// Return incoming value number \p Idx.
VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); }
-
- /// Return mask number \p Idx.
+
+ /// Return mask number \p Idx.
VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); }
-
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
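// Editor's sketch, not part of this header: the operand layout of a
// VPBlendRecipe is [I0, M0, I1, M1, ...], so incoming value i is operand 2*i
// and its mask is operand 2*i+1. All arguments are placeholders owned by the
// caller.
static VPBlendRecipe *makeBlendSketch(PHINode *Phi, VPValue *I0, VPValue *M0,
                                      VPValue *I1, VPValue *M1) {
  VPValue *Ops[] = {I0, M0, I1, M1};
  auto *Blend = new VPBlendRecipe(Phi, Ops);
  assert(Blend->getNumIncomingValues() == 2 && "two (value, mask) pairs");
  assert(Blend->getIncomingValue(1) == I1 && Blend->getMask(1) == M1);
  return Blend;
}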
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
/// or stores into one wide load/store and shuffles. The first operand of a
/// VPInterleave recipe is the address, followed by the stored values, followed
/// by an optional mask.
class VPInterleaveRecipe : public VPRecipeBase, public VPUser {
- const InterleaveGroup<Instruction> *IG;
-
+ const InterleaveGroup<Instruction> *IG;
+
bool HasMask = false;
-public:
- VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+public:
+ VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
ArrayRef<VPValue *> StoredValues, VPValue *Mask)
: VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) {
for (unsigned i = 0; i < IG->getFactor(); ++i)
@@ -1085,26 +1085,26 @@ public:
HasMask = true;
addOperand(Mask);
}
- }
- ~VPInterleaveRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ }
+ ~VPInterleaveRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPInterleaveSC;
- }
-
- /// Return the address accessed by this recipe.
- VPValue *getAddr() const {
+ }
+
+ /// Return the address accessed by this recipe.
+ VPValue *getAddr() const {
return getOperand(0); // Address is the 1st, mandatory operand.
- }
-
- /// Return the mask used by this recipe. Note that a full mask is represented
- /// by a nullptr.
- VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
+ }
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
+ // Mask is optional and therefore the last, currently 2nd operand.
return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
- }
-
+ }
+
/// Return the VPValues stored by this interleave group. If it is a load
/// interleave group, return an empty ArrayRef.
ArrayRef<VPValue *> getStoredValues() const {
@@ -1114,16 +1114,16 @@ public:
.slice(1, getNumOperands() - (HasMask ? 2 : 1));
}
- /// Generate the wide load or store, and shuffles.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-
- const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
-};
-
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+
+ const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+};
+
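// Editor's sketch, not part of this header: a VPInterleaveRecipe's operands
// are laid out as [Addr, StoredValue0, ..., optional Mask]. Building the
// InterleaveGroup itself is out of scope; every argument is assumed to be
// owned by the caller, and Mask may be nullptr for an unmasked group.
static VPInterleaveRecipe *
makeInterleaveSketch(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
                     ArrayRef<VPValue *> StoredValues, VPValue *Mask) {
  auto *IGRec = new VPInterleaveRecipe(IG, Addr, StoredValues, Mask);
  assert(IGRec->getAddr() == Addr && "address is always operand 0");
  assert((Mask != nullptr) == (IGRec->getMask() != nullptr) &&
         "getMask() returns nullptr exactly when the mask is absent (full)");
  return IGRec;
}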
/// A recipe to represent inloop reduction operations, performing a reduction on
/// a vector operand into a scalar value, and adding the result to a chain.
/// The Operands are {ChainOp, VecOp, [Condition]}.
@@ -1174,976 +1174,976 @@ public:
}
};
-/// VPReplicateRecipe replicates a given instruction producing multiple scalar
-/// copies of the original scalar type, one per lane, instead of producing a
-/// single copy of widened type for all lanes. If the instruction is known to be
-/// uniform only one copy, per lane zero, will be generated.
+/// VPReplicateRecipe replicates a given instruction producing multiple scalar
+/// copies of the original scalar type, one per lane, instead of producing a
+/// single copy of widened type for all lanes. If the instruction is known to be
+/// uniform only one copy, per lane zero, will be generated.
class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue {
- /// Indicator if only a single replica per lane is needed.
- bool IsUniform;
-
- /// Indicator if the replicas are also predicated.
- bool IsPredicated;
-
- /// Indicator if the scalar values should also be packed into a vector.
- bool AlsoPack;
-
-public:
- template <typename IterT>
- VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
- bool IsUniform, bool IsPredicated = false)
+ /// Indicator if only a single replica per lane is needed.
+ bool IsUniform;
+
+ /// Indicator if the replicas are also predicated.
+ bool IsPredicated;
+
+ /// Indicator if the scalar values should also be packed into a vector.
+ bool AlsoPack;
+
+public:
+ template <typename IterT>
+ VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
+ bool IsUniform, bool IsPredicated = false)
: VPRecipeBase(VPReplicateSC), VPUser(Operands),
VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform),
IsPredicated(IsPredicated) {
- // Retain the previous behavior of predicateInstructions(), where an
- // insert-element of a predicated instruction got hoisted into the
- // predicated basic block iff it was its only user. This is achieved by
- // having predicated instructions also pack their values into a vector by
- // default unless they have a replicated user which uses their scalar value.
- AlsoPack = IsPredicated && !I->use_empty();
- }
-
- ~VPReplicateRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ // Retain the previous behavior of predicateInstructions(), where an
+ // insert-element of a predicated instruction got hoisted into the
+ // predicated basic block iff it was its only user. This is achieved by
+ // having predicated instructions also pack their values into a vector by
+ // default unless they have a replicated user which uses their scalar value.
+ AlsoPack = IsPredicated && !I->use_empty();
+ }
+
+ ~VPReplicateRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPReplicateSC;
- }
-
+ }
+
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVReplicateSC;
}
- /// Generate replicas of the desired Ingredient. Replicas will be generated
- /// for all parts and lanes unless a specific part and lane are specified in
- /// the \p State.
- void execute(VPTransformState &State) override;
-
- void setAlsoPack(bool Pack) { AlsoPack = Pack; }
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ /// Generate replicas of the desired Ingredient. Replicas will be generated
+ /// for all parts and lanes unless a specific part and lane are specified in
+ /// the \p State.
+ void execute(VPTransformState &State) override;
+
+ void setAlsoPack(bool Pack) { AlsoPack = Pack; }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
bool isUniform() const { return IsUniform; }
-};
-
-/// A recipe for generating conditional branches on the bits of a mask.
+};
+
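// Editor's sketch, not part of this header: building a uniform replicate
// recipe. With IsUniform = true only a lane-0 copy is generated; a predicated
// recipe whose ingredient has users additionally packs its scalars into a
// vector (AlsoPack) unless setAlsoPack(false) is called later. 'I' and its
// operand range are placeholders owned by the caller.
template <typename IterT>
static VPReplicateRecipe *makeUniformReplicateSketch(Instruction *I,
                                                     iterator_range<IterT> Ops) {
  auto *Rep = new VPReplicateRecipe(I, Ops, /*IsUniform=*/true);
  assert(Rep->isUniform() && "only lane 0 will be generated");
  return Rep;
}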
+/// A recipe for generating conditional branches on the bits of a mask.
class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser {
-public:
- VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
- if (BlockInMask) // nullptr means all-one mask.
+public:
+ VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
+ if (BlockInMask) // nullptr means all-one mask.
addOperand(BlockInMask);
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC;
- }
-
- /// Generate the extraction of the appropriate bit from the block mask and the
- /// conditional branch.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override {
- O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
- if (VPValue *Mask = getMask())
+ }
+
+ /// Generate the extraction of the appropriate bit from the block mask and the
+ /// conditional branch.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override {
+ O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
+ if (VPValue *Mask = getMask())
Mask->printAsOperand(O, SlotTracker);
- else
- O << " All-One";
- O << "\\l\"";
- }
-
- /// Return the mask used by this recipe. Note that a full mask is represented
- /// by a nullptr.
- VPValue *getMask() const {
+ else
+ O << " All-One";
+ O << "\\l\"";
+ }
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
- // Mask is optional.
+ // Mask is optional.
return getNumOperands() == 1 ? getOperand(0) : nullptr;
- }
-};
-
-/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
-/// control converges back from a Branch-on-Mask. The phi nodes are needed in
-/// order to merge values that are set under such a branch and feed their uses.
-/// The phi nodes can be scalar or vector depending on the users of the value.
-/// This recipe works in concert with VPBranchOnMaskRecipe.
+ }
+};
+
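// Editor's sketch, not part of this header: a null BlockInMask means the
// branch is taken under an all-ones mask, and getMask() reports that by
// returning nullptr. 'Mask' is a placeholder that may legitimately be null.
static void branchOnMaskSketch(VPBasicBlock *VPBB, VPValue *Mask) {
  auto *BOM = new VPBranchOnMaskRecipe(Mask); // Mask == nullptr is allowed.
  assert((Mask != nullptr) == (BOM->getMask() != nullptr));
  VPBB->appendRecipe(BOM);
}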
+/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a Branch-on-Mask. The phi nodes are needed in
+/// order to merge values that are set under such a branch and feed their uses.
+/// The phi nodes can be scalar or vector depending on the users of the value.
+/// This recipe works in concert with VPBranchOnMaskRecipe.
class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser {
-
-public:
- /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi
- /// nodes after merging back from a Branch-on-Mask.
+
+public:
+ /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi
+ /// nodes after merging back from a Branch-on-Mask.
VPPredInstPHIRecipe(VPValue *PredV)
: VPRecipeBase(VPPredInstPHISC), VPUser(PredV) {
new VPValue(PredV->getUnderlyingValue(), this);
}
- ~VPPredInstPHIRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ ~VPPredInstPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC;
- }
-
- /// Generates phi nodes for live-outs as needed to retain SSA form.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A Recipe for widening load/store operations.
-/// The recipe uses the following VPValues:
-/// - For load: Address, optional mask
-/// - For store: Address, stored value, optional mask
-/// TODO: We currently execute only per-part unless a specific instance is
-/// provided.
+ }
+
+ /// Generates phi nodes for live-outs as needed to retain SSA form.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A Recipe for widening load/store operations.
+/// The recipe uses the following VPValues:
+/// - For load: Address, optional mask
+/// - For store: Address, stored value, optional mask
+/// TODO: We currently execute only per-part unless a specific instance is
+/// provided.
class VPWidenMemoryInstructionRecipe : public VPRecipeBase,
public VPUser {
Instruction &Ingredient;
-
- void setMask(VPValue *Mask) {
- if (!Mask)
- return;
+
+ void setMask(VPValue *Mask) {
+ if (!Mask)
+ return;
addOperand(Mask);
- }
-
- bool isMasked() const {
+ }
+
+ bool isMasked() const {
return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
- }
-
-public:
- VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
+ }
+
+public:
+ VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
: VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}),
Ingredient(Load) {
new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
- setMask(Mask);
- }
-
- VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
- VPValue *StoredValue, VPValue *Mask)
+ setMask(Mask);
+ }
+
+ VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
+ VPValue *StoredValue, VPValue *Mask)
: VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}),
Ingredient(Store) {
- setMask(Mask);
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ setMask(Mask);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
- }
-
- /// Return the address accessed by this recipe.
- VPValue *getAddr() const {
+ }
+
+ /// Return the address accessed by this recipe.
+ VPValue *getAddr() const {
return getOperand(0); // Address is the 1st, mandatory operand.
- }
-
- /// Return the mask used by this recipe. Note that a full mask is represented
- /// by a nullptr.
- VPValue *getMask() const {
- // Mask is optional and therefore the last operand.
+ }
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
+ // Mask is optional and therefore the last operand.
return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
- }
-
+ }
+
/// Returns true if this recipe is a store.
bool isStore() const { return isa<StoreInst>(Ingredient); }
- /// Return the value stored by this recipe.
- VPValue *getStoredValue() const {
+ /// Return the value stored by this recipe.
+ VPValue *getStoredValue() const {
assert(isStore() && "Stored value only available for store instructions");
return getOperand(1); // Stored value is the 2nd, mandatory operand.
- }
-
- /// Generate the wide load/store.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A Recipe for widening the canonical induction variable of the vector loop.
-class VPWidenCanonicalIVRecipe : public VPRecipeBase {
+ }
+
+ /// Generate the wide load/store.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
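// Editor's sketch, not part of this header: the two operand layouts of a
// VPWidenMemoryInstructionRecipe are {Addr, Mask?} for a load and
// {Addr, StoredValue, Mask?} for a store; getMask() is nullptr when the
// access is unmasked. The IR instruction and VPValues are placeholders.
static VPWidenMemoryInstructionRecipe *
widenStoreSketch(StoreInst &Store, VPValue *Addr, VPValue *StoredValue,
                 VPValue *Mask) {
  auto *Rec = new VPWidenMemoryInstructionRecipe(Store, Addr, StoredValue, Mask);
  assert(Rec->isStore() && Rec->getAddr() == Addr &&
         Rec->getStoredValue() == StoredValue);
  return Rec;
}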
+/// A Recipe for widening the canonical induction variable of the vector loop.
+class VPWidenCanonicalIVRecipe : public VPRecipeBase {
public:
VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {
new VPValue(nullptr, this);
}
-
- ~VPWidenCanonicalIVRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+
+ ~VPWidenCanonicalIVRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
- }
-
- /// Generate a canonical vector induction variable of the vector loop, with
- /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
- /// step = <VF*UF, VF*UF, ..., VF*UF>.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
-/// holds a sequence of zero or more VPRecipes, each representing a sequence of
-/// output IR instructions.
-class VPBasicBlock : public VPBlockBase {
-public:
- using RecipeListTy = iplist<VPRecipeBase>;
-
-private:
- /// The VPRecipes held in the order of output instructions to generate.
- RecipeListTy Recipes;
-
-public:
- VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
- : VPBlockBase(VPBasicBlockSC, Name.str()) {
- if (Recipe)
- appendRecipe(Recipe);
- }
-
- ~VPBasicBlock() override { Recipes.clear(); }
-
- /// Instruction iterators...
- using iterator = RecipeListTy::iterator;
- using const_iterator = RecipeListTy::const_iterator;
- using reverse_iterator = RecipeListTy::reverse_iterator;
- using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
-
- //===--------------------------------------------------------------------===//
- /// Recipe iterator methods
- ///
- inline iterator begin() { return Recipes.begin(); }
- inline const_iterator begin() const { return Recipes.begin(); }
- inline iterator end() { return Recipes.end(); }
- inline const_iterator end() const { return Recipes.end(); }
-
- inline reverse_iterator rbegin() { return Recipes.rbegin(); }
- inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
- inline reverse_iterator rend() { return Recipes.rend(); }
- inline const_reverse_iterator rend() const { return Recipes.rend(); }
-
- inline size_t size() const { return Recipes.size(); }
- inline bool empty() const { return Recipes.empty(); }
- inline const VPRecipeBase &front() const { return Recipes.front(); }
- inline VPRecipeBase &front() { return Recipes.front(); }
- inline const VPRecipeBase &back() const { return Recipes.back(); }
- inline VPRecipeBase &back() { return Recipes.back(); }
-
- /// Returns a reference to the list of recipes.
- RecipeListTy &getRecipeList() { return Recipes; }
-
- /// Returns a pointer to a member of the recipe list.
- static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
- return &VPBasicBlock::Recipes;
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPBlockBase *V) {
- return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
- }
-
- void insert(VPRecipeBase *Recipe, iterator InsertPt) {
- assert(Recipe && "No recipe to append.");
- assert(!Recipe->Parent && "Recipe already in VPlan");
- Recipe->Parent = this;
- Recipes.insert(InsertPt, Recipe);
- }
-
- /// Augment the existing recipes of a VPBasicBlock with an additional
- /// \p Recipe as the last recipe.
- void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
-
- /// The method which generates the output IR instructions that correspond to
- /// this VPBasicBlock, thereby "executing" the VPlan.
- void execute(struct VPTransformState *State) override;
-
+ }
+
+ /// Generate a canonical vector induction variable of the vector loop, with
+ /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
+ /// step = <VF*UF, VF*UF, ..., VF*UF>.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
+/// holds a sequence of zero or more VPRecipes, each representing a sequence of
+/// output IR instructions.
+class VPBasicBlock : public VPBlockBase {
+public:
+ using RecipeListTy = iplist<VPRecipeBase>;
+
+private:
+ /// The VPRecipes held in the order of output instructions to generate.
+ RecipeListTy Recipes;
+
+public:
+ VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
+ : VPBlockBase(VPBasicBlockSC, Name.str()) {
+ if (Recipe)
+ appendRecipe(Recipe);
+ }
+
+ ~VPBasicBlock() override { Recipes.clear(); }
+
+ /// Instruction iterators...
+ using iterator = RecipeListTy::iterator;
+ using const_iterator = RecipeListTy::const_iterator;
+ using reverse_iterator = RecipeListTy::reverse_iterator;
+ using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
+
+ //===--------------------------------------------------------------------===//
+ /// Recipe iterator methods
+ ///
+ inline iterator begin() { return Recipes.begin(); }
+ inline const_iterator begin() const { return Recipes.begin(); }
+ inline iterator end() { return Recipes.end(); }
+ inline const_iterator end() const { return Recipes.end(); }
+
+ inline reverse_iterator rbegin() { return Recipes.rbegin(); }
+ inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
+ inline reverse_iterator rend() { return Recipes.rend(); }
+ inline const_reverse_iterator rend() const { return Recipes.rend(); }
+
+ inline size_t size() const { return Recipes.size(); }
+ inline bool empty() const { return Recipes.empty(); }
+ inline const VPRecipeBase &front() const { return Recipes.front(); }
+ inline VPRecipeBase &front() { return Recipes.front(); }
+ inline const VPRecipeBase &back() const { return Recipes.back(); }
+ inline VPRecipeBase &back() { return Recipes.back(); }
+
+ /// Returns a reference to the list of recipes.
+ RecipeListTy &getRecipeList() { return Recipes; }
+
+ /// Returns a pointer to a member of the recipe list.
+ static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
+ return &VPBasicBlock::Recipes;
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
+ }
+
+ void insert(VPRecipeBase *Recipe, iterator InsertPt) {
+ assert(Recipe && "No recipe to append.");
+ assert(!Recipe->Parent && "Recipe already in VPlan");
+ Recipe->Parent = this;
+ Recipes.insert(InsertPt, Recipe);
+ }
+
+ /// Augment the existing recipes of a VPBasicBlock with an additional
+ /// \p Recipe as the last recipe.
+ void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPBasicBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+
/// Return the position of the first non-phi node recipe in the block.
iterator getFirstNonPhi();
void dropAllReferences(VPValue *NewValue) override;
-private:
- /// Create an IR BasicBlock to hold the output instructions generated by this
- /// VPBasicBlock, and return it. Update the CFGState accordingly.
- BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
-};
-
-/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
-/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
-/// A VPRegionBlock may indicate that its contents are to be replicated several
-/// times. This is designed to support predicated scalarization, in which a
-/// scalar if-then code structure needs to be generated VF * UF times. Having
-/// this replication indicator helps to keep a single model for multiple
-/// candidate VF's. The actual replication takes place only once the desired VF
-/// and UF have been determined.
-class VPRegionBlock : public VPBlockBase {
- /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
- VPBlockBase *Entry;
-
- /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
- VPBlockBase *Exit;
-
- /// An indicator whether this region is to generate multiple replicated
- /// instances of output IR corresponding to its VPBlockBases.
- bool IsReplicator;
-
-public:
- VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
- const std::string &Name = "", bool IsReplicator = false)
- : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
- IsReplicator(IsReplicator) {
- assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
- assert(Exit->getSuccessors().empty() && "Exit block has successors.");
- Entry->setParent(this);
- Exit->setParent(this);
- }
- VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
- : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
- IsReplicator(IsReplicator) {}
-
- ~VPRegionBlock() override {
+private:
+ /// Create an IR BasicBlock to hold the output instructions generated by this
+ /// VPBasicBlock, and return it. Update the CFGState accordingly.
+ BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
+};
+
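// Usage sketch: a minimal walk over a VPBasicBlock's recipes, relying only on
// the iterator interface declared above. The helper name countRecipes is
// hypothetical; equivalently, VPBasicBlock::size() returns the same value.
static unsigned countRecipes(const VPBasicBlock &VPBB) {
  unsigned NumRecipes = 0;
  // Range-based iteration visits recipes in output-IR order.
  for (const VPRecipeBase &R : VPBB) {
    (void)R;
    ++NumRecipes;
  }
  return NumRecipes;
}
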
+/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
+/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
+/// A VPRegionBlock may indicate that its contents are to be replicated several
+/// times. This is designed to support predicated scalarization, in which a
+/// scalar if-then code structure needs to be generated VF * UF times. Having
+/// this replication indicator helps to keep a single model for multiple
+/// candidate VF's. The actual replication takes place only once the desired VF
+/// and UF have been determined.
+class VPRegionBlock : public VPBlockBase {
+ /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Entry;
+
+ /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Exit;
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool IsReplicator;
+
+public:
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
+ const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
+ IsReplicator(IsReplicator) {
+ assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+ assert(Exit->getSuccessors().empty() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exit->setParent(this);
+ }
+ VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
+ IsReplicator(IsReplicator) {}
+
+ ~VPRegionBlock() override {
if (Entry) {
VPValue DummyValue;
Entry->dropAllReferences(&DummyValue);
- deleteCFG(Entry);
+ deleteCFG(Entry);
}
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPBlockBase *V) {
- return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
- }
-
- const VPBlockBase *getEntry() const { return Entry; }
- VPBlockBase *getEntry() { return Entry; }
-
- /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
- /// EntryBlock must have no predecessors.
- void setEntry(VPBlockBase *EntryBlock) {
- assert(EntryBlock->getPredecessors().empty() &&
- "Entry block cannot have predecessors.");
- Entry = EntryBlock;
- EntryBlock->setParent(this);
- }
-
- // FIXME: DominatorTreeBase is doing 'A->getParent()->front()'. 'front' is a
- // specific interface of llvm::Function, instead of using
- // GraphTraits::getEntryNode. We should add a new template parameter to
- // DominatorTreeBase representing the Graph type.
- VPBlockBase &front() const { return *Entry; }
-
- const VPBlockBase *getExit() const { return Exit; }
- VPBlockBase *getExit() { return Exit; }
-
- /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
- /// ExitBlock must have no successors.
- void setExit(VPBlockBase *ExitBlock) {
- assert(ExitBlock->getSuccessors().empty() &&
- "Exit block cannot have successors.");
- Exit = ExitBlock;
- ExitBlock->setParent(this);
- }
-
- /// An indicator whether this region is to generate multiple replicated
- /// instances of output IR corresponding to its VPBlockBases.
- bool isReplicator() const { return IsReplicator; }
-
- /// The method which generates the output IR instructions that correspond to
- /// this VPRegionBlock, thereby "executing" the VPlan.
- void execute(struct VPTransformState *State) override;
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
+ }
+
+ const VPBlockBase *getEntry() const { return Entry; }
+ VPBlockBase *getEntry() { return Entry; }
+
+ /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+ /// EntryBlock must have no predecessors.
+ void setEntry(VPBlockBase *EntryBlock) {
+ assert(EntryBlock->getPredecessors().empty() &&
+ "Entry block cannot have predecessors.");
+ Entry = EntryBlock;
+ EntryBlock->setParent(this);
+ }
+
+ // FIXME: DominatorTreeBase is doing 'A->getParent()->front()'. 'front' is a
+ // specific interface of llvm::Function, instead of using
+ // GraphTraits::getEntryNode. We should add a new template parameter to
+ // DominatorTreeBase representing the Graph type.
+ VPBlockBase &front() const { return *Entry; }
+
+ const VPBlockBase *getExit() const { return Exit; }
+ VPBlockBase *getExit() { return Exit; }
+
+ /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
+ /// ExitBlock must have no successors.
+ void setExit(VPBlockBase *ExitBlock) {
+ assert(ExitBlock->getSuccessors().empty() &&
+ "Exit block cannot have successors.");
+ Exit = ExitBlock;
+ ExitBlock->setParent(this);
+ }
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool isReplicator() const { return IsReplicator; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRegionBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
void dropAllReferences(VPValue *NewValue) override;
-};
-
-//===----------------------------------------------------------------------===//
-// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs //
-//===----------------------------------------------------------------------===//
-
-// The following set of template specializations implement GraphTraits to treat
-// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note
-// that VPBlockBase traits don't recurse into VPRegionBlocks, i.e., if the
-// VPBlockBase is a VPRegionBlock, this specialization provides access to its
-// successors/predecessors but not to the blocks inside the region.
-
-template <> struct GraphTraits<VPBlockBase *> {
- using NodeRef = VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
-
- static NodeRef getEntryNode(NodeRef N) { return N; }
-
- static inline ChildIteratorType child_begin(NodeRef N) {
- return N->getSuccessors().begin();
- }
-
- static inline ChildIteratorType child_end(NodeRef N) {
- return N->getSuccessors().end();
- }
-};
-
-template <> struct GraphTraits<const VPBlockBase *> {
- using NodeRef = const VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
-
- static NodeRef getEntryNode(NodeRef N) { return N; }
-
- static inline ChildIteratorType child_begin(NodeRef N) {
- return N->getSuccessors().begin();
- }
-
- static inline ChildIteratorType child_end(NodeRef N) {
- return N->getSuccessors().end();
- }
-};
-
-// Inverse order specialization for VPBlockBases. Predecessors are used instead
-// of successors for the inverse traversal.
-template <> struct GraphTraits<Inverse<VPBlockBase *>> {
- using NodeRef = VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
-
- static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; }
-
- static inline ChildIteratorType child_begin(NodeRef N) {
- return N->getPredecessors().begin();
- }
-
- static inline ChildIteratorType child_end(NodeRef N) {
- return N->getPredecessors().end();
- }
-};
-
-// The following set of template specializations implement GraphTraits to
-// treat VPRegionBlock as a graph and recurse inside its nodes. It's important
-// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases
-// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so
-// there won't be automatic recursion into other VPBlockBases that turn out to be
-// VPRegionBlocks.
-
-template <>
-struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> {
- using GraphRef = VPRegionBlock *;
- using nodes_iterator = df_iterator<NodeRef>;
-
- static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
-
- static nodes_iterator nodes_begin(GraphRef N) {
- return nodes_iterator::begin(N->getEntry());
- }
-
- static nodes_iterator nodes_end(GraphRef N) {
- // df_iterator::end() returns an empty iterator so the node used doesn't
- // matter.
- return nodes_iterator::end(N);
- }
-};
-
-template <>
-struct GraphTraits<const VPRegionBlock *>
- : public GraphTraits<const VPBlockBase *> {
- using GraphRef = const VPRegionBlock *;
- using nodes_iterator = df_iterator<NodeRef>;
-
- static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
-
- static nodes_iterator nodes_begin(GraphRef N) {
- return nodes_iterator::begin(N->getEntry());
- }
-
- static nodes_iterator nodes_end(GraphRef N) {
- // df_iterator::end() returns an empty iterator so the node used doesn't
- // matter.
- return nodes_iterator::end(N);
- }
-};
-
-template <>
-struct GraphTraits<Inverse<VPRegionBlock *>>
- : public GraphTraits<Inverse<VPBlockBase *>> {
- using GraphRef = VPRegionBlock *;
- using nodes_iterator = df_iterator<NodeRef>;
-
- static NodeRef getEntryNode(Inverse<GraphRef> N) {
- return N.Graph->getExit();
- }
-
- static nodes_iterator nodes_begin(GraphRef N) {
- return nodes_iterator::begin(N->getExit());
- }
-
- static nodes_iterator nodes_end(GraphRef N) {
- // df_iterator::end() returns an empty iterator so the node used doesn't
- // matter.
- return nodes_iterator::end(N);
- }
-};
-
-/// VPlan models a candidate for vectorization, encoding various decisions taken
-/// to produce efficient output IR, including which branches, basic-blocks and
-/// output IR instructions to generate, and their cost. VPlan holds a
-/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
-/// VPBlock.
-class VPlan {
- friend class VPlanPrinter;
- friend class VPSlotTracker;
-
- /// Hold the single entry to the Hierarchical CFG of the VPlan.
- VPBlockBase *Entry;
-
- /// Holds the VFs applicable to this VPlan.
+};
+
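// Usage sketch (hypothetical helper): building a replicating region from two
// fresh blocks. VPBlockUtils::connectBlocks, declared further below in this
// header, wires the entry to the exit; the constructor then adopts both blocks
// as children of the new region.
static VPRegionBlock *makeReplicateRegion() {
  VPBasicBlock *Entry = new VPBasicBlock("pred.store.entry");
  VPBasicBlock *Exit = new VPBasicBlock("pred.store.exit");
  // Both blocks still have a null parent, so the parent-matching assert holds.
  VPBlockUtils::connectBlocks(Entry, Exit);
  return new VPRegionBlock(Entry, Exit, "pred.store", /*IsReplicator=*/true);
}
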
+//===----------------------------------------------------------------------===//
+// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs //
+//===----------------------------------------------------------------------===//
+
+// The following set of template specializations implement GraphTraits to treat
+// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note
+// that VPBlockBase traits don't recurse into VPRegionBlocks, i.e., if the
+// VPBlockBase is a VPRegionBlock, this specialization provides access to its
+// successors/predecessors but not to the blocks inside the region.
+
+template <> struct GraphTraits<VPBlockBase *> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+template <> struct GraphTraits<const VPBlockBase *> {
+ using NodeRef = const VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+// Inverse order specialization for VPBlockBases. Predecessors are used instead
+// of successors for the inverse traversal.
+template <> struct GraphTraits<Inverse<VPBlockBase *>> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getPredecessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getPredecessors().end();
+ }
+};
+
+// The following set of template specializations implement GraphTraits to
+// treat VPRegionBlock as a graph and recurse inside its nodes. It's important
+// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases
+// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so
+// there won't be automatic recursion into other VPBlockBases that turn out to be
+// VPRegionBlocks.
+
+template <>
+struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> {
+ using GraphRef = VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getEntry());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
+template <>
+struct GraphTraits<const VPRegionBlock *>
+ : public GraphTraits<const VPBlockBase *> {
+ using GraphRef = const VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getEntry());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
+template <>
+struct GraphTraits<Inverse<VPRegionBlock *>>
+ : public GraphTraits<Inverse<VPBlockBase *>> {
+ using GraphRef = VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(Inverse<GraphRef> N) {
+ return N.Graph->getExit();
+ }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getExit());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
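// Usage sketch: with the GraphTraits specializations above, the generic
// llvm/ADT graph iterators apply directly to VPBlockBases. The hypothetical
// helper below counts the blocks reachable from a region's entry; nested
// regions count as single nodes because these traits do not recurse into them.
static unsigned countShallowBlocks(const VPRegionBlock *Region) {
  unsigned NumBlocks = 0;
  for (const VPBlockBase *Block : depth_first(Region->getEntry())) {
    (void)Block;
    ++NumBlocks;
  }
  return NumBlocks;
}
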
+/// VPlan models a candidate for vectorization, encoding various decisions taken
+/// to produce efficient output IR, including which branches, basic-blocks and
+/// output IR instructions to generate, and their cost. VPlan holds a
+/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
+/// VPBlock.
+class VPlan {
+ friend class VPlanPrinter;
+ friend class VPSlotTracker;
+
+ /// Hold the single entry to the Hierarchical CFG of the VPlan.
+ VPBlockBase *Entry;
+
+ /// Holds the VFs applicable to this VPlan.
SmallSetVector<ElementCount, 2> VFs;
-
- /// Holds the name of the VPlan, for printing.
- std::string Name;
-
- /// Holds all the external definitions created for this VPlan.
- // TODO: Introduce a specific representation for external definitions in
- // VPlan. External definitions must be immutable and hold a pointer to their
- // underlying IR that will be used to implement their structural comparison
- // (operators '==' and '<').
- SmallPtrSet<VPValue *, 16> VPExternalDefs;
-
- /// Represents the backedge taken count of the original loop, for folding
- /// the tail.
- VPValue *BackedgeTakenCount = nullptr;
-
- /// Holds a mapping between Values and their corresponding VPValue inside
- /// VPlan.
- Value2VPValueTy Value2VPValue;
-
+
+ /// Holds the name of the VPlan, for printing.
+ std::string Name;
+
+ /// Holds all the external definitions created for this VPlan.
+ // TODO: Introduce a specific representation for external definitions in
+ // VPlan. External definitions must be immutable and hold a pointer to their
+ // underlying IR that will be used to implement their structural comparison
+ // (operators '==' and '<').
+ SmallPtrSet<VPValue *, 16> VPExternalDefs;
+
+ /// Represents the backedge taken count of the original loop, for folding
+ /// the tail.
+ VPValue *BackedgeTakenCount = nullptr;
+
+ /// Holds a mapping between Values and their corresponding VPValue inside
+ /// VPlan.
+ Value2VPValueTy Value2VPValue;
+
/// Contains all VPValues that been allocated by addVPValue directly and need
/// to be free when the plan's destructor is called.
SmallVector<VPValue *, 16> VPValuesToFree;
- /// Holds the VPLoopInfo analysis for this VPlan.
- VPLoopInfo VPLInfo;
-
- /// Holds the condition bit values built during VPInstruction to VPRecipe transformation.
- SmallVector<VPValue *, 4> VPCBVs;
-
-public:
- VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
- if (Entry)
- Entry->setPlan(this);
- }
-
- ~VPlan() {
+ /// Holds the VPLoopInfo analysis for this VPlan.
+ VPLoopInfo VPLInfo;
+
+ /// Holds the condition bit values built during VPInstruction to VPRecipe transformation.
+ SmallVector<VPValue *, 4> VPCBVs;
+
+public:
+ VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
+ if (Entry)
+ Entry->setPlan(this);
+ }
+
+ ~VPlan() {
if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : depth_first(Entry))
Block->dropAllReferences(&DummyValue);
- VPBlockBase::deleteCFG(Entry);
+ VPBlockBase::deleteCFG(Entry);
}
for (VPValue *VPV : VPValuesToFree)
delete VPV;
- if (BackedgeTakenCount)
- delete BackedgeTakenCount;
- for (VPValue *Def : VPExternalDefs)
- delete Def;
- for (VPValue *CBV : VPCBVs)
- delete CBV;
- }
-
- /// Generate the IR code for this VPlan.
- void execute(struct VPTransformState *State);
-
- VPBlockBase *getEntry() { return Entry; }
- const VPBlockBase *getEntry() const { return Entry; }
-
- VPBlockBase *setEntry(VPBlockBase *Block) {
- Entry = Block;
- Block->setPlan(this);
- return Entry;
- }
-
- /// The backedge taken count of the original loop.
- VPValue *getOrCreateBackedgeTakenCount() {
- if (!BackedgeTakenCount)
- BackedgeTakenCount = new VPValue();
- return BackedgeTakenCount;
- }
-
+ if (BackedgeTakenCount)
+ delete BackedgeTakenCount;
+ for (VPValue *Def : VPExternalDefs)
+ delete Def;
+ for (VPValue *CBV : VPCBVs)
+ delete CBV;
+ }
+
+ /// Generate the IR code for this VPlan.
+ void execute(struct VPTransformState *State);
+
+ VPBlockBase *getEntry() { return Entry; }
+ const VPBlockBase *getEntry() const { return Entry; }
+
+ VPBlockBase *setEntry(VPBlockBase *Block) {
+ Entry = Block;
+ Block->setPlan(this);
+ return Entry;
+ }
+
+ /// The backedge taken count of the original loop.
+ VPValue *getOrCreateBackedgeTakenCount() {
+ if (!BackedgeTakenCount)
+ BackedgeTakenCount = new VPValue();
+ return BackedgeTakenCount;
+ }
+
void addVF(ElementCount VF) { VFs.insert(VF); }
-
+
bool hasVF(ElementCount VF) { return VFs.count(VF); }
-
- const std::string &getName() const { return Name; }
-
- void setName(const Twine &newName) { Name = newName.str(); }
-
- /// Add \p VPVal to the pool of external definitions if it's not already
- /// in the pool.
- void addExternalDef(VPValue *VPVal) {
- VPExternalDefs.insert(VPVal);
- }
-
- /// Add \p CBV to the vector of condition bit values.
- void addCBV(VPValue *CBV) {
- VPCBVs.push_back(CBV);
- }
-
- void addVPValue(Value *V) {
- assert(V && "Trying to add a null Value to VPlan");
- assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// Add \p VPVal to the pool of external definitions if it's not already
+ /// in the pool.
+ void addExternalDef(VPValue *VPVal) {
+ VPExternalDefs.insert(VPVal);
+ }
+
+ /// Add \p CBV to the vector of condition bit values.
+ void addCBV(VPValue *CBV) {
+ VPCBVs.push_back(CBV);
+ }
+
+ void addVPValue(Value *V) {
+ assert(V && "Trying to add a null Value to VPlan");
+ assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
VPValue *VPV = new VPValue(V);
Value2VPValue[V] = VPV;
VPValuesToFree.push_back(VPV);
- }
-
+ }
+
void addVPValue(Value *V, VPValue *VPV) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
Value2VPValue[V] = VPV;
}
- VPValue *getVPValue(Value *V) {
- assert(V && "Trying to get the VPValue of a null Value");
- assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
- return Value2VPValue[V];
- }
-
- VPValue *getOrAddVPValue(Value *V) {
- assert(V && "Trying to get or add the VPValue of a null Value");
- if (!Value2VPValue.count(V))
- addVPValue(V);
- return getVPValue(V);
- }
-
+ VPValue *getVPValue(Value *V) {
+ assert(V && "Trying to get the VPValue of a null Value");
+ assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
+ return Value2VPValue[V];
+ }
+
+ VPValue *getOrAddVPValue(Value *V) {
+ assert(V && "Trying to get or add the VPValue of a null Value");
+ if (!Value2VPValue.count(V))
+ addVPValue(V);
+ return getVPValue(V);
+ }
+
void removeVPValueFor(Value *V) { Value2VPValue.erase(V); }
- /// Return the VPLoopInfo analysis for this VPlan.
- VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
- const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
-
- /// Dump the plan to stderr (for debugging).
- void dump() const;
-
- /// Returns a range mapping the values in the range \p Operands to their
- /// corresponding VPValues.
- iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
- mapToVPValues(User::op_range Operands) {
- std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
- return getOrAddVPValue(Op);
- };
- return map_range(Operands, Fn);
- }
-
-private:
- /// Add to the given dominator tree the header block and every new basic block
- /// that was created between it and the latch block, inclusive.
- static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
- BasicBlock *LoopPreHeaderBB,
- BasicBlock *LoopExitBB);
-};
-
-/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
-/// indented and follows the dot format.
-class VPlanPrinter {
- friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan);
- friend inline raw_ostream &operator<<(raw_ostream &OS,
- const struct VPlanIngredient &I);
-
-private:
- raw_ostream &OS;
- const VPlan &Plan;
- unsigned Depth = 0;
- unsigned TabWidth = 2;
- std::string Indent;
- unsigned BID = 0;
- SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
-
- VPSlotTracker SlotTracker;
-
- VPlanPrinter(raw_ostream &O, const VPlan &P)
- : OS(O), Plan(P), SlotTracker(&P) {}
-
- /// Handle indentation.
- void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
-
- /// Print a given \p Block of the Plan.
- void dumpBlock(const VPBlockBase *Block);
-
- /// Print the information related to the CFG edges going out of a given
- /// \p Block, followed by printing the successor blocks themselves.
- void dumpEdges(const VPBlockBase *Block);
-
- /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
- /// its successor blocks.
- void dumpBasicBlock(const VPBasicBlock *BasicBlock);
-
- /// Print a given \p Region of the Plan.
- void dumpRegion(const VPRegionBlock *Region);
-
- unsigned getOrCreateBID(const VPBlockBase *Block) {
- return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
- }
-
- const Twine getOrCreateName(const VPBlockBase *Block);
-
- const Twine getUID(const VPBlockBase *Block);
-
- /// Print the information related to a CFG edge between two VPBlockBases.
- void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
- const Twine &Label);
-
- void dump();
-
+ /// Return the VPLoopInfo analysis for this VPlan.
+ VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
+ const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
+
+ /// Dump the plan to stderr (for debugging).
+ void dump() const;
+
+ /// Returns a range mapping the values in the range \p Operands to their
+ /// corresponding VPValues.
+ iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
+ mapToVPValues(User::op_range Operands) {
+ std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
+ return getOrAddVPValue(Op);
+ };
+ return map_range(Operands, Fn);
+ }
+
+private:
+ /// Add to the given dominator tree the header block and every new basic block
+ /// that was created between it and the latch block, inclusive.
+ static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
+ BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopExitBB);
+};
+
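// Usage sketch (all names hypothetical): a VPlan owns its hierarchical CFG and
// any VPValues created through addVPValue/getOrAddVPValue, so a caller only
// needs to manage the lifetime of the plan object itself.
static std::unique_ptr<VPlan> makeTrivialPlan(Value *LiveIn) {
  auto Plan = std::make_unique<VPlan>(new VPBasicBlock("vector.body"));
  Plan->setName("TrivialPlan");
  Plan->addVF(ElementCount::getFixed(4)); // Record one candidate VF.
  // Wrap an incoming IR value; the plan frees the resulting VPValue.
  (void)Plan->getOrAddVPValue(LiveIn);
  return Plan;
}
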
+/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
+/// indented and follows the dot format.
+class VPlanPrinter {
+ friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan);
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const struct VPlanIngredient &I);
+
+private:
+ raw_ostream &OS;
+ const VPlan &Plan;
+ unsigned Depth = 0;
+ unsigned TabWidth = 2;
+ std::string Indent;
+ unsigned BID = 0;
+ SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
+
+ VPSlotTracker SlotTracker;
+
+ VPlanPrinter(raw_ostream &O, const VPlan &P)
+ : OS(O), Plan(P), SlotTracker(&P) {}
+
+ /// Handle indentation.
+ void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
+
+ /// Print a given \p Block of the Plan.
+ void dumpBlock(const VPBlockBase *Block);
+
+ /// Print the information related to the CFG edges going out of a given
+ /// \p Block, followed by printing the successor blocks themselves.
+ void dumpEdges(const VPBlockBase *Block);
+
+ /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
+ /// its successor blocks.
+ void dumpBasicBlock(const VPBasicBlock *BasicBlock);
+
+ /// Print a given \p Region of the Plan.
+ void dumpRegion(const VPRegionBlock *Region);
+
+ unsigned getOrCreateBID(const VPBlockBase *Block) {
+ return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
+ }
+
+ const Twine getOrCreateName(const VPBlockBase *Block);
+
+ const Twine getUID(const VPBlockBase *Block);
+
+ /// Print the information related to a CFG edge between two VPBlockBases.
+ void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
+ const Twine &Label);
+
+ void dump();
+
static void printAsIngredient(raw_ostream &O, const Value *V);
-};
-
-struct VPlanIngredient {
+};
+
+struct VPlanIngredient {
const Value *V;
-
+
VPlanIngredient(const Value *V) : V(V) {}
-};
-
-inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
- VPlanPrinter::printAsIngredient(OS, I.V);
- return OS;
-}
-
-inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
- VPlanPrinter Printer(OS, Plan);
- Printer.dump();
- return OS;
-}
-
-//===----------------------------------------------------------------------===//
-// VPlan Utilities
-//===----------------------------------------------------------------------===//
-
-/// Class that provides utilities for VPBlockBases in VPlan.
-class VPBlockUtils {
-public:
- VPBlockUtils() = delete;
-
- /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
- /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
- /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
- /// has more than one successor, its conditional bit is propagated to \p
- /// NewBlock. \p NewBlock must have neither successors nor predecessors.
- static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
- assert(NewBlock->getSuccessors().empty() &&
- "Can't insert new block with successors.");
- // TODO: move successors from BlockPtr to NewBlock when this functionality
- // is necessary. For now, setOneSuccessor will assert if BlockPtr
- // already has successors.
- BlockPtr->setOneSuccessor(NewBlock);
- NewBlock->setPredecessors({BlockPtr});
- NewBlock->setParent(BlockPtr->getParent());
- }
-
- /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
- /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
- /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
- /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
- /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
- /// must have neither successors nor predecessors.
- static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
- VPValue *Condition, VPBlockBase *BlockPtr) {
- assert(IfTrue->getSuccessors().empty() &&
- "Can't insert IfTrue with successors.");
- assert(IfFalse->getSuccessors().empty() &&
- "Can't insert IfFalse with successors.");
- BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
- IfTrue->setPredecessors({BlockPtr});
- IfFalse->setPredecessors({BlockPtr});
- IfTrue->setParent(BlockPtr->getParent());
- IfFalse->setParent(BlockPtr->getParent());
- }
-
- /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
- /// the successors of \p From and \p From to the predecessors of \p To. Both
- /// VPBlockBases must have the same parent, which can be null. Both
- /// VPBlockBases can be already connected to other VPBlockBases.
- static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
- assert((From->getParent() == To->getParent()) &&
- "Can't connect two block with different parents");
- assert(From->getNumSuccessors() < 2 &&
- "Blocks can't have more than two successors.");
- From->appendSuccessor(To);
- To->appendPredecessor(From);
- }
-
- /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
- /// from the successors of \p From and \p From from the predecessors of \p To.
- static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
- assert(To && "Successor to disconnect is null.");
- From->removeSuccessor(To);
- To->removePredecessor(From);
- }
-
- /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
- static bool isBackEdge(const VPBlockBase *FromBlock,
- const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
- assert(FromBlock->getParent() == ToBlock->getParent() &&
- FromBlock->getParent() && "Must be in same region");
- const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock);
- const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock);
- if (!FromLoop || !ToLoop || FromLoop != ToLoop)
- return false;
-
- // A back-edge is a branch from the loop latch to its header.
- return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader();
- }
-
- /// Returns true if \p Block is a loop latch
- static bool blockIsLoopLatch(const VPBlockBase *Block,
- const VPLoopInfo *VPLInfo) {
- if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block))
- return ParentVPL->isLoopLatch(Block);
-
- return false;
- }
-
- /// Count and return the number of successors of \p PredBlock excluding any
- /// backedges.
- static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock,
- VPLoopInfo *VPLI) {
- unsigned Count = 0;
- for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) {
- if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI))
- Count++;
- }
- return Count;
- }
-};
-
-class VPInterleavedAccessInfo {
- DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
- InterleaveGroupMap;
-
- /// Type for mapping of instruction based interleave groups to VPInstruction
- /// interleave groups
- using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
- InterleaveGroup<VPInstruction> *>;
-
- /// Recursively traverse \p Region and populate VPlan based interleave groups
- /// based on \p IAI.
- void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI);
- /// Recursively traverse \p Block and populate VPlan based interleave groups
- /// based on \p IAI.
- void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI);
-
-public:
- VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
-
- ~VPInterleavedAccessInfo() {
- SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
- // Avoid releasing a pointer twice.
- for (auto &I : InterleaveGroupMap)
- DelSet.insert(I.second);
- for (auto *Ptr : DelSet)
- delete Ptr;
- }
-
- /// Get the interleave group that \p Instr belongs to.
- ///
- /// \returns nullptr if \p Instr doesn't belong to such a group.
- InterleaveGroup<VPInstruction> *
- getInterleaveGroup(VPInstruction *Instr) const {
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
+ VPlanPrinter::printAsIngredient(OS, I.V);
+ return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
+ VPlanPrinter Printer(OS, Plan);
+ Printer.dump();
+ return OS;
+}
+
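// Usage sketch: with the stream operators above, rendering a candidate plan in
// dot form is a one-liner (the helper name is hypothetical; VPlan::dump()
// provides similar output on stderr for debugging).
static void printPlanInDotFormat(raw_ostream &OS, const VPlan &Plan) {
  OS << Plan; // Delegates to VPlanPrinter, which walks the hierarchical CFG.
}
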
+//===----------------------------------------------------------------------===//
+// VPlan Utilities
+//===----------------------------------------------------------------------===//
+
+/// Class that provides utilities for VPBlockBases in VPlan.
+class VPBlockUtils {
+public:
+ VPBlockUtils() = delete;
+
+ /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
+ /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
+ /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
+ /// has more than one successor, its conditional bit is propagated to \p
+ /// NewBlock. \p NewBlock must have neither successors nor predecessors.
+ static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
+ assert(NewBlock->getSuccessors().empty() &&
+ "Can't insert new block with successors.");
+ // TODO: move successors from BlockPtr to NewBlock when this functionality
+ // is necessary. For now, setOneSuccessor will assert if BlockPtr
+ // already has successors.
+ BlockPtr->setOneSuccessor(NewBlock);
+ NewBlock->setPredecessors({BlockPtr});
+ NewBlock->setParent(BlockPtr->getParent());
+ }
+
+ /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
+ /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
+ /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
+ /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
+ /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
+ /// must have neither successors nor predecessors.
+ static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition, VPBlockBase *BlockPtr) {
+ assert(IfTrue->getSuccessors().empty() &&
+ "Can't insert IfTrue with successors.");
+ assert(IfFalse->getSuccessors().empty() &&
+ "Can't insert IfFalse with successors.");
+ BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
+ IfTrue->setPredecessors({BlockPtr});
+ IfFalse->setPredecessors({BlockPtr});
+ IfTrue->setParent(BlockPtr->getParent());
+ IfFalse->setParent(BlockPtr->getParent());
+ }
+
+ /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
+ /// the successors of \p From and \p From to the predecessors of \p To. Both
+ /// VPBlockBases must have the same parent, which can be null. Both
+ /// VPBlockBases can be already connected to other VPBlockBases.
+ static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert((From->getParent() == To->getParent()) &&
+ "Can't connect two block with different parents");
+ assert(From->getNumSuccessors() < 2 &&
+ "Blocks can't have more than two successors.");
+ From->appendSuccessor(To);
+ To->appendPredecessor(From);
+ }
+
+ /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
+ /// from the successors of \p From and \p From from the predecessors of \p To.
+ static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert(To && "Successor to disconnect is null.");
+ From->removeSuccessor(To);
+ To->removePredecessor(From);
+ }
+
+ /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
+ static bool isBackEdge(const VPBlockBase *FromBlock,
+ const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
+ assert(FromBlock->getParent() == ToBlock->getParent() &&
+ FromBlock->getParent() && "Must be in same region");
+ const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock);
+ const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock);
+ if (!FromLoop || !ToLoop || FromLoop != ToLoop)
+ return false;
+
+ // A back-edge is a branch from the loop latch to its header.
+ return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader();
+ }
+
+ /// Returns true if \p Block is a loop latch
+ static bool blockIsLoopLatch(const VPBlockBase *Block,
+ const VPLoopInfo *VPLInfo) {
+ if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block))
+ return ParentVPL->isLoopLatch(Block);
+
+ return false;
+ }
+
+ /// Count and return the number of successors of \p PredBlock excluding any
+ /// backedges.
+ static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock,
+ VPLoopInfo *VPLI) {
+ unsigned Count = 0;
+ for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) {
+ if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI))
+ Count++;
+ }
+ return Count;
+ }
+};
+
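// Usage sketch (hypothetical helper): stitching a two-way split followed by a
// join using the utilities above. All blocks are assumed to be newly created
// and disconnected, so the parent-matching asserts in connectBlocks hold once
// the merge block has been adopted into the same parent region.
static void buildDiamond(VPBlockBase *BlockPtr, VPValue *Condition,
                         VPBasicBlock *IfTrue, VPBasicBlock *IfFalse,
                         VPBasicBlock *Merge) {
  // Gives BlockPtr two successors selected by Condition and re-parents them.
  VPBlockUtils::insertTwoBlocksAfter(IfTrue, IfFalse, Condition, BlockPtr);
  Merge->setParent(BlockPtr->getParent());
  VPBlockUtils::connectBlocks(IfTrue, Merge);
  VPBlockUtils::connectBlocks(IfFalse, Merge);
}
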
+class VPInterleavedAccessInfo {
+ DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
+ InterleaveGroupMap;
+
+ /// Type for mapping of instruction based interleave groups to VPInstruction
+ /// interleave groups
+ using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
+ InterleaveGroup<VPInstruction> *>;
+
+ /// Recursively traverse \p Region and populate VPlan based interleave groups
+ /// based on \p IAI.
+ void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI);
+ /// Recursively traverse \p Block and populate VPlan based interleave groups
+ /// based on \p IAI.
+ void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI);
+
+public:
+ VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
+
+ ~VPInterleavedAccessInfo() {
+ SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
+ // Avoid releasing a pointer twice.
+ for (auto &I : InterleaveGroupMap)
+ DelSet.insert(I.second);
+ for (auto *Ptr : DelSet)
+ delete Ptr;
+ }
+
+ /// Get the interleave group that \p Instr belongs to.
+ ///
+ /// \returns nullptr if \p Instr doesn't belong to such a group.
+ InterleaveGroup<VPInstruction> *
+ getInterleaveGroup(VPInstruction *Instr) const {
return InterleaveGroupMap.lookup(Instr);
- }
-};
-
-/// Class that maps (parts of) an existing VPlan to trees of combined
-/// VPInstructions.
-class VPlanSlp {
- enum class OpMode { Failed, Load, Opcode };
-
- /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
- /// DenseMap keys.
- struct BundleDenseMapInfo {
- static SmallVector<VPValue *, 4> getEmptyKey() {
- return {reinterpret_cast<VPValue *>(-1)};
- }
-
- static SmallVector<VPValue *, 4> getTombstoneKey() {
- return {reinterpret_cast<VPValue *>(-2)};
- }
-
- static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
- return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
- }
-
- static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
- const SmallVector<VPValue *, 4> &RHS) {
- return LHS == RHS;
- }
- };
-
- /// Mapping of values in the original VPlan to a combined VPInstruction.
- DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
- BundleToCombined;
-
- VPInterleavedAccessInfo &IAI;
-
- /// Basic block to operate on. For now, only instructions in a single BB are
- /// considered.
- const VPBasicBlock &BB;
-
- /// Indicates whether we managed to combine all visited instructions or not.
- bool CompletelySLP = true;
-
- /// Width of the widest combined bundle in bits.
- unsigned WidestBundleBits = 0;
-
- using MultiNodeOpTy =
- typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
-
- // Input operand bundles for the current multi node. Each multi node operand
- // bundle contains values not matching the multi node's opcode. They will
- // be reordered in reorderMultiNodeOps, once we have completed building a
- // multi node.
- SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
-
- /// Indicates whether we are building a multi node currently.
- bool MultiNodeActive = false;
-
- /// Check if we can vectorize Operands together.
- bool areVectorizable(ArrayRef<VPValue *> Operands) const;
-
- /// Add combined instruction \p New for the bundle \p Operands.
- void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
-
- /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
- VPInstruction *markFailed();
-
- /// Reorder operands in the multi node to maximize sequential memory access
- /// and commutative operations.
- SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
-
- /// Choose the best candidate to use for the lane after \p Last. The set of
- /// candidates to choose from are values with an opcode matching \p Last's
- /// or loads consecutive to \p Last.
- std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
- SmallPtrSetImpl<VPValue *> &Candidates,
- VPInterleavedAccessInfo &IAI);
-
- /// Print bundle \p Values to dbgs().
- void dumpBundle(ArrayRef<VPValue *> Values);
-
-public:
- VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
-
+ }
+};
+
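// Usage sketch: a thin query over the mapping maintained above; the helper
// name is hypothetical.
static bool isInterleaved(const VPInterleavedAccessInfo &VPIAI,
                          VPInstruction *VPInst) {
  // getInterleaveGroup returns nullptr for instructions outside any group.
  return VPIAI.getInterleaveGroup(VPInst) != nullptr;
}
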
+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPlanSlp {
+ enum class OpMode { Failed, Load, Opcode };
+
+ /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
+ /// DenseMap keys.
+ struct BundleDenseMapInfo {
+ static SmallVector<VPValue *, 4> getEmptyKey() {
+ return {reinterpret_cast<VPValue *>(-1)};
+ }
+
+ static SmallVector<VPValue *, 4> getTombstoneKey() {
+ return {reinterpret_cast<VPValue *>(-2)};
+ }
+
+ static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+ const SmallVector<VPValue *, 4> &RHS) {
+ return LHS == RHS;
+ }
+ };
+
+ /// Mapping of values in the original VPlan to a combined VPInstruction.
+ DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+ BundleToCombined;
+
+ VPInterleavedAccessInfo &IAI;
+
+ /// Basic block to operate on. For now, only instructions in a single BB are
+ /// considered.
+ const VPBasicBlock &BB;
+
+ /// Indicates whether we managed to combine all visited instructions or not.
+ bool CompletelySLP = true;
+
+ /// Width of the widest combined bundle in bits.
+ unsigned WidestBundleBits = 0;
+
+ using MultiNodeOpTy =
+ typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+ // Input operand bundles for the current multi node. Each multi node operand
+ // bundle contains values not matching the multi node's opcode. They will
+ // be reordered in reorderMultiNodeOps, once we have completed building a
+ // multi node.
+ SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+ /// Indicates whether we are building a multi node currently.
+ bool MultiNodeActive = false;
+
+ /// Check if we can vectorize Operands together.
+ bool areVectorizable(ArrayRef<VPValue *> Operands) const;
+
+ /// Add combined instruction \p New for the bundle \p Operands.
+ void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
+
+ /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
+ VPInstruction *markFailed();
+
+ /// Reorder operands in the multi node to maximize sequential memory access
+ /// and commutative operations.
+ SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+ /// Choose the best candidate to use for the lane after \p Last. The set of
+ /// candidates to choose from are values with an opcode matching \p Last's
+ /// or loads consecutive to \p Last.
+ std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
+ SmallPtrSetImpl<VPValue *> &Candidates,
+ VPInterleavedAccessInfo &IAI);
+
+ /// Print bundle \p Values to dbgs().
+ void dumpBundle(ArrayRef<VPValue *> Values);
+
+public:
+ VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
+
~VPlanSlp() = default;
-
- /// Tries to build an SLP tree rooted at \p Operands and returns a
- /// VPInstruction combining \p Operands, if they can be combined.
- VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
-
- /// Return the width of the widest combined bundle in bits.
- unsigned getWidestBundleBits() const { return WidestBundleBits; }
-
- /// Return true if all visited instructions can be combined.
- bool isCompletelySLP() const { return CompletelySLP; }
-};
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+
+ /// Tries to build an SLP tree rooted at \p Operands and returns a
+ /// VPInstruction combining \p Operands, if they can be combined.
+ VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
+
+ /// Return the width of the widest combined bundle in bits.
+ unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+ /// Return true if all visited instructions can be combined.
+ bool isCompletelySLP() const { return CompletelySLP; }
+};
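
// Usage sketch (hypothetical helper): driving VPlanSlp over a seed bundle
// taken from a single VPBasicBlock. buildGraph returns the combined
// VPInstruction, and may return a failure marker treated here as "no SLP tree".
static VPInstruction *trySLPBundle(VPInterleavedAccessInfo &IAI,
                                   VPBasicBlock &BB,
                                   ArrayRef<VPValue *> SeedBundle) {
  VPlanSlp Slp(IAI, BB);
  VPInstruction *Combined = Slp.buildGraph(SeedBundle);
  if (!Combined || !Slp.isCompletelySLP())
    return nullptr;
  return Combined;
}
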
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 2087e620f7..a42ebc9ee9 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -1,41 +1,41 @@
-//===-- VPlanDominatorTree.h ------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements dominator tree analysis for a single level of a VPlan's
-/// H-CFG.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
-
-#include "VPlan.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/IR/Dominators.h"
-
-namespace llvm {
-
-/// Template specialization of the standard LLVM dominator tree utility for
-/// VPBlockBases.
-using VPDominatorTree = DomTreeBase<VPBlockBase>;
-
-using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
-
-/// Template specializations of GraphTraits for VPDomTreeNode.
-template <>
-struct GraphTraits<VPDomTreeNode *>
- : public DomTreeGraphTraitsBase<VPDomTreeNode,
- VPDomTreeNode::const_iterator> {};
-
-template <>
-struct GraphTraits<const VPDomTreeNode *>
- : public DomTreeGraphTraitsBase<const VPDomTreeNode,
- VPDomTreeNode::const_iterator> {};
-} // namespace llvm
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+//===-- VPlanDominatorTree.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements dominator tree analysis for a single level of a VPlan's
+/// H-CFG.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+
+#include "VPlan.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/IR/Dominators.h"
+
+namespace llvm {
+
+/// Template specialization of the standard LLVM dominator tree utility for
+/// VPBlockBases.
+using VPDominatorTree = DomTreeBase<VPBlockBase>;
+
+using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
+
+/// Template specializations of GraphTraits for VPDomTreeNode.
+template <>
+struct GraphTraits<VPDomTreeNode *>
+ : public DomTreeGraphTraitsBase<VPDomTreeNode,
+ VPDomTreeNode::const_iterator> {};
+
+template <>
+struct GraphTraits<const VPDomTreeNode *>
+ : public DomTreeGraphTraitsBase<const VPDomTreeNode,
+ VPDomTreeNode::const_iterator> {};
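
// Usage sketch (hypothetical helper): dominance between two sibling
// VPBlockBases of the same region, assuming the generic dominator-tree
// construction templates are available in the translation unit.
static bool regionDominates(VPRegionBlock *Region, VPBlockBase *A,
                            VPBlockBase *B) {
  VPDominatorTree VPDT;
  VPDT.recalculate(*Region); // Builds the tree for one level of the H-CFG.
  return VPDT.dominates(A, B);
}
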
+} // namespace llvm
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index f54b8958ae..df96f67288 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -1,354 +1,354 @@
-//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the construction of a VPlan-based Hierarchical CFG
-/// (H-CFG) for an incoming IR. This construction comprises the following
-/// components and steps:
-//
-/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
-/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
-/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
-/// in the plain CFG.
-/// NOTE: At this point, there is a direct correspondence between all the
-/// VPBasicBlocks created for the initial plain CFG and the incoming
-/// BasicBlocks. However, this might change in the future.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanHCFGBuilder.h"
-#include "LoopVectorizationPlanner.h"
-#include "llvm/Analysis/LoopIterator.h"
-
-#define DEBUG_TYPE "loop-vectorize"
-
-using namespace llvm;
-
-namespace {
-// Class that is used to build the plain CFG for the incoming IR.
-class PlainCFGBuilder {
-private:
- // The outermost loop of the input loop nest considered for vectorization.
- Loop *TheLoop;
-
- // Loop Info analysis.
- LoopInfo *LI;
-
- // Vectorization plan that we are working on.
- VPlan &Plan;
-
- // Output Top Region.
- VPRegionBlock *TopRegion = nullptr;
-
- // Builder of the VPlan instruction-level representation.
- VPBuilder VPIRBuilder;
-
- // NOTE: The following maps are intentionally destroyed after the plain CFG
- // construction because subsequent VPlan-to-VPlan transformation may
- // invalidate them.
- // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
- DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
- // Map incoming Value definitions to their newly-created VPValues.
- DenseMap<Value *, VPValue *> IRDef2VPValue;
-
- // Hold phi nodes that need to be fixed once the plain CFG has been built.
- SmallVector<PHINode *, 8> PhisToFix;
-
- // Utility functions.
- void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
- void fixPhiNodes();
- VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
-#ifndef NDEBUG
- bool isExternalDef(Value *Val);
-#endif
- VPValue *getOrCreateVPOperand(Value *IRVal);
- void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
-
-public:
- PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
- : TheLoop(Lp), LI(LI), Plan(P) {}
-
- // Build the plain CFG and return its Top Region.
- VPRegionBlock *buildPlainCFG();
-};
-} // anonymous namespace
-
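// Usage sketch (hypothetical free function): a caller drives the builder by
// constructing it over the outermost loop and installing the resulting Top
// Region as the VPlan entry.
static VPRegionBlock *buildPlainCFGFor(Loop *Lp, LoopInfo *LI, VPlan &Plan) {
  PlainCFGBuilder PCFGBuilder(Lp, LI, Plan);
  VPRegionBlock *TopRegion = PCFGBuilder.buildPlainCFG();
  Plan.setEntry(TopRegion);
  return TopRegion;
}
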
-// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
-// must have no predecessors.
-void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
- SmallVector<VPBlockBase *, 8> VPBBPreds;
- // Collect VPBB predecessors.
- for (BasicBlock *Pred : predecessors(BB))
- VPBBPreds.push_back(getOrCreateVPBB(Pred));
-
- VPBB->setPredecessors(VPBBPreds);
-}
-
-// Add operands to VPInstructions representing phi nodes from the input IR.
-void PlainCFGBuilder::fixPhiNodes() {
- for (auto *Phi : PhisToFix) {
- assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
- VPValue *VPVal = IRDef2VPValue[Phi];
- assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
- auto *VPPhi = cast<VPInstruction>(VPVal);
- assert(VPPhi->getNumOperands() == 0 &&
- "Expected VPInstruction with no operands.");
-
- for (Value *Op : Phi->operands())
- VPPhi->addOperand(getOrCreateVPOperand(Op));
- }
-}
-
-// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
-// existing one if it was already created.
-VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
- auto BlockIt = BB2VPBB.find(BB);
- if (BlockIt != BB2VPBB.end())
- // Retrieve existing VPBB.
- return BlockIt->second;
-
- // Create new VPBB.
- LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
- VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
- BB2VPBB[BB] = VPBB;
- VPBB->setParent(TopRegion);
- return VPBB;
-}
-
-#ifndef NDEBUG
-// Return true if \p Val is considered an external definition. An external
-// definition is either:
-// 1. A Value that is not an Instruction. This will be refined in the future.
-// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
-// i.e., is not part of: a) the loop nest, b) outermost loop PH, and c)
-// outermost loop exits.
-bool PlainCFGBuilder::isExternalDef(Value *Val) {
- // All the Values that are not Instructions are considered external
- // definitions for now.
- Instruction *Inst = dyn_cast<Instruction>(Val);
- if (!Inst)
- return true;
-
- BasicBlock *InstParent = Inst->getParent();
- assert(InstParent && "Expected instruction parent.");
-
- // Check whether Instruction definition is in loop PH.
- BasicBlock *PH = TheLoop->getLoopPreheader();
- assert(PH && "Expected loop pre-header.");
-
- if (InstParent == PH)
- // Instruction definition is in outermost loop PH.
- return false;
-
- // Check whether Instruction definition is in the loop exit.
- BasicBlock *Exit = TheLoop->getUniqueExitBlock();
- assert(Exit && "Expected loop with single exit.");
- if (InstParent == Exit) {
- // Instruction definition is in outermost loop exit.
- return false;
- }
-
- // Check whether Instruction definition is in loop body.
- return !TheLoop->contains(Inst);
-}
-#endif
-
-// Create a new VPValue or retrieve an existing one for the Instruction's
-// operand \p IRVal. This function must only be used to create/retrieve VPValues
-// for *Instruction's operands* and not to create regular VPInstructions. For
-// the latter, please, look at 'createVPInstructionsForVPBB'.
-VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
- auto VPValIt = IRDef2VPValue.find(IRVal);
- if (VPValIt != IRDef2VPValue.end())
- // Operand has an associated VPInstruction or VPValue that was previously
- // created.
- return VPValIt->second;
-
- // Operand doesn't have a previously created VPInstruction/VPValue. This
- // means that operand is:
- // A) a definition external to VPlan,
- // B) any other Value without specific representation in VPlan.
- // For now, we use VPValue to represent A and B and classify both as external
- // definitions. We may introduce specific VPValue subclasses for them in the
- // future.
- assert(isExternalDef(IRVal) && "Expected external definition as operand.");
-
- // A and B: Create VPValue and add it to the pool of external definitions and
- // to the Value->VPValue map.
- VPValue *NewVPVal = new VPValue(IRVal);
- Plan.addExternalDef(NewVPVal);
- IRDef2VPValue[IRVal] = NewVPVal;
- return NewVPVal;
-}
-
-// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
-// counterpart. This function must be invoked in RPO so that the operands of a
-// VPInstruction in \p BB have been visited before (except for Phi nodes).
-void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
- BasicBlock *BB) {
- VPIRBuilder.setInsertPoint(VPBB);
- for (Instruction &InstRef : *BB) {
- Instruction *Inst = &InstRef;
-
- // There shouldn't be any VPValue for Inst at this point. Otherwise, we
- // visited Inst when we shouldn't, breaking the RPO traversal order.
- assert(!IRDef2VPValue.count(Inst) &&
- "Instruction shouldn't have been visited.");
-
- if (auto *Br = dyn_cast<BranchInst>(Inst)) {
- // Branch instruction is not explicitly represented in VPlan but we need
- // to represent its condition bit when it's conditional.
- if (Br->isConditional())
- getOrCreateVPOperand(Br->getCondition());
-
- // Skip the rest of the Instruction processing for Branch instructions.
- continue;
- }
-
- VPInstruction *NewVPInst;
- if (auto *Phi = dyn_cast<PHINode>(Inst)) {
- // Phi node operands may not have been visited at this point. We create
- // an empty VPInstruction that we will fix once the whole plain CFG has
- // been built.
- NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
- Inst->getOpcode(), {} /*No operands*/, Inst));
- PhisToFix.push_back(Phi);
- } else {
- // Translate LLVM-IR operands into VPValue operands and set them in the
- // new VPInstruction.
- SmallVector<VPValue *, 4> VPOperands;
- for (Value *Op : Inst->operands())
- VPOperands.push_back(getOrCreateVPOperand(Op));
-
- // Build VPInstruction for any arbitrary Instruction without specific
- // representation in VPlan.
- NewVPInst = cast<VPInstruction>(
- VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
- }
-
- IRDef2VPValue[Inst] = NewVPInst;
- }
-}
-
-// Main interface to build the plain CFG.
-VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
- // 1. Create the Top Region. It will be the parent of all VPBBs.
- TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
-
- // 2. Scan the body of the loop in a topological order to visit each basic
- // block after having visited its predecessor basic blocks. Create a VPBB for
- // each BB and link it to its successor and predecessor VPBBs. Note that
- // predecessors must be set in the same order as they are in the incoming IR.
- // Otherwise, there might be problems with existing phi nodes and algorithms
- // based on predecessor traversal.
-
- // Loop PH needs to be explicitly visited since it's not taken into account by
- // LoopBlocksDFS.
- BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
- assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
- "Unexpected loop preheader");
- VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
- createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
- // Create empty VPBB for Loop H so that we can link PH->H.
- VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
- // Preheader's predecessors will be set during the loop RPO traversal below.
- PreheaderVPBB->setOneSuccessor(HeaderVPBB);
-
- LoopBlocksRPO RPO(TheLoop);
- RPO.perform(LI);
-
- for (BasicBlock *BB : RPO) {
- // Create or retrieve the VPBasicBlock for this BB and create its
- // VPInstructions.
- VPBasicBlock *VPBB = getOrCreateVPBB(BB);
- createVPInstructionsForVPBB(VPBB, BB);
-
- // Set VPBB successors. We create empty VPBBs for successors if they don't
- // exist already. Recipes will be created when the successor is visited
- // during the RPO traversal.
- Instruction *TI = BB->getTerminator();
- assert(TI && "Terminator expected.");
- unsigned NumSuccs = TI->getNumSuccessors();
-
- if (NumSuccs == 1) {
- VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
- assert(SuccVPBB && "VPBB Successor not found.");
- VPBB->setOneSuccessor(SuccVPBB);
- } else if (NumSuccs == 2) {
- VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
- assert(SuccVPBB0 && "Successor 0 not found.");
- VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
- assert(SuccVPBB1 && "Successor 1 not found.");
-
- // Get VPBB's condition bit.
- assert(isa<BranchInst>(TI) && "Unsupported terminator!");
- auto *Br = cast<BranchInst>(TI);
- Value *BrCond = Br->getCondition();
- // Look up the branch condition to get the corresponding VPValue
- // representing the condition bit in VPlan (which may be in another VPBB).
- assert(IRDef2VPValue.count(BrCond) &&
- "Missing condition bit in IRDef2VPValue!");
- VPValue *VPCondBit = IRDef2VPValue[BrCond];
-
- // Link successors using condition bit.
- VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
- } else
- llvm_unreachable("Number of successors not supported.");
-
- // Set VPBB predecessors in the same order as they are in the incoming BB.
- setVPBBPredsFromBB(VPBB, BB);
- }
-
- // 3. Process outermost loop exit. We created an empty VPBB for the loop
- // single exit BB during the RPO traversal of the loop body but Instructions
- // weren't visited because it's not part of the loop.
- BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
- assert(LoopExitBB && "Loops with multiple exits are not supported.");
- VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
- createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
- // Loop exit was already set as successor of the loop exiting BB.
- // We only set its predecessor VPBB now.
- setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
-
- // 4. The whole CFG has been built at this point so all the input Values must
- // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
- // VPlan operands.
- fixPhiNodes();
-
- // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
- // Top Region entry and exit.
- TopRegion->setEntry(PreheaderVPBB);
- TopRegion->setExit(LoopExitVPBB);
- return TopRegion;
-}
-
-VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() {
- PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
- return PCFGBuilder.buildPlainCFG();
-}
-
-// Public interface to build a H-CFG.
-void VPlanHCFGBuilder::buildHierarchicalCFG() {
- // Build Top Region enclosing the plain CFG and set it as VPlan entry.
- VPRegionBlock *TopRegion = buildPlainCFG();
- Plan.setEntry(TopRegion);
- LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
-
- Verifier.verifyHierarchicalCFG(TopRegion);
-
- // Compute plain CFG dom tree for VPLInfo.
- VPDomTree.recalculate(*TopRegion);
- LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n";
- VPDomTree.print(dbgs()));
-
- // Compute VPLInfo and keep it in Plan.
- VPLoopInfo &VPLInfo = Plan.getVPLoopInfo();
- VPLInfo.analyze(VPDomTree);
- LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n";
- VPLInfo.print(dbgs()));
-}
+//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the construction of a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR. This construction comprises the following
+/// components and steps:
+//
+/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
+/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
+/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
+/// in the plain CFG.
+/// NOTE: At this point, there is a direct correspondence between all the
+/// VPBasicBlocks created for the initial plain CFG and the incoming
+/// BasicBlocks. However, this might change in the future.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "llvm/Analysis/LoopIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+namespace {
+// Class that is used to build the plain CFG for the incoming IR.
+class PlainCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // Vectorization plan that we are working on.
+ VPlan &Plan;
+
+ // Output Top Region.
+ VPRegionBlock *TopRegion = nullptr;
+
+ // Builder of the VPlan instruction-level representation.
+ VPBuilder VPIRBuilder;
+
+ // NOTE: The following maps are intentionally destroyed after the plain CFG
+ // construction because subsequent VPlan-to-VPlan transformation may
+ // invalidate them.
+ // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
+ DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+ // Map incoming Value definitions to their newly-created VPValues.
+ DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+ // Hold phi nodes that need to be fixed once the plain CFG has been built.
+ SmallVector<PHINode *, 8> PhisToFix;
+
+ // Utility functions.
+ void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void fixPhiNodes();
+ VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+#ifndef NDEBUG
+ bool isExternalDef(Value *Val);
+#endif
+ VPValue *getOrCreateVPOperand(Value *IRVal);
+ void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ // Build the plain CFG and return its Top Region.
+ VPRegionBlock *buildPlainCFG();
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+ SmallVector<VPBlockBase *, 8> VPBBPreds;
+ // Collect VPBB predecessors.
+ for (BasicBlock *Pred : predecessors(BB))
+ VPBBPreds.push_back(getOrCreateVPBB(Pred));
+
+ VPBB->setPredecessors(VPBBPreds);
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixPhiNodes() {
+ for (auto *Phi : PhisToFix) {
+ assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+ VPValue *VPVal = IRDef2VPValue[Phi];
+ assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
+ auto *VPPhi = cast<VPInstruction>(VPVal);
+ assert(VPPhi->getNumOperands() == 0 &&
+ "Expected VPInstruction with no operands.");
+
+ for (Value *Op : Phi->operands())
+ VPPhi->addOperand(getOrCreateVPOperand(Op));
+ }
+}
+
+// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
+// existing one if it was already created.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+ auto BlockIt = BB2VPBB.find(BB);
+ if (BlockIt != BB2VPBB.end())
+ // Retrieve existing VPBB.
+ return BlockIt->second;
+
+ // Create new VPBB.
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ BB2VPBB[BB] = VPBB;
+ VPBB->setParent(TopRegion);
+ return VPBB;
+}
+
+#ifndef NDEBUG
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) outermost loop PH, and c)
+// outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+ // All the Values that are not Instructions are considered external
+ // definitions for now.
+ Instruction *Inst = dyn_cast<Instruction>(Val);
+ if (!Inst)
+ return true;
+
+ BasicBlock *InstParent = Inst->getParent();
+ assert(InstParent && "Expected instruction parent.");
+
+ // Check whether Instruction definition is in loop PH.
+ BasicBlock *PH = TheLoop->getLoopPreheader();
+ assert(PH && "Expected loop pre-header.");
+
+ if (InstParent == PH)
+ // Instruction definition is in outermost loop PH.
+ return false;
+
+ // Check whether Instruction definition is in the loop exit.
+ BasicBlock *Exit = TheLoop->getUniqueExitBlock();
+ assert(Exit && "Expected loop with single exit.");
+ if (InstParent == Exit) {
+ // Instruction definition is in outermost loop exit.
+ return false;
+ }
+
+ // Check whether Instruction definition is in loop body.
+ return !TheLoop->contains(Inst);
+}
+#endif
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstructions. For
+// the latter, please, look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+ auto VPValIt = IRDef2VPValue.find(IRVal);
+ if (VPValIt != IRDef2VPValue.end())
+ // Operand has an associated VPInstruction or VPValue that was previously
+ // created.
+ return VPValIt->second;
+
+ // Operand doesn't have a previously created VPInstruction/VPValue. This
+ // means that operand is:
+ // A) a definition external to VPlan,
+ // B) any other Value without specific representation in VPlan.
+ // For now, we use VPValue to represent A and B and classify both as external
+ // definitions. We may introduce specific VPValue subclasses for them in the
+ // future.
+ assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+ // A and B: Create VPValue and add it to the pool of external definitions and
+ // to the Value->VPValue map.
+ VPValue *NewVPVal = new VPValue(IRVal);
+ Plan.addExternalDef(NewVPVal);
+ IRDef2VPValue[IRVal] = NewVPVal;
+ return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ VPIRBuilder.setInsertPoint(VPBB);
+ for (Instruction &InstRef : *BB) {
+ Instruction *Inst = &InstRef;
+
+ // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+ // visited Inst when we shouldn't, breaking the RPO traversal order.
+ assert(!IRDef2VPValue.count(Inst) &&
+ "Instruction shouldn't have been visited.");
+
+ if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+ // Branch instruction is not explicitly represented in VPlan but we need
+ // to represent its condition bit when it's conditional.
+ if (Br->isConditional())
+ getOrCreateVPOperand(Br->getCondition());
+
+ // Skip the rest of the Instruction processing for Branch instructions.
+ continue;
+ }
+
+ VPInstruction *NewVPInst;
+ if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+ // Phi node operands may not have been visited at this point. We create
+ // an empty VPInstruction that we will fix once the whole plain CFG has
+ // been built.
+ NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+ Inst->getOpcode(), {} /*No operands*/, Inst));
+ PhisToFix.push_back(Phi);
+ } else {
+ // Translate LLVM-IR operands into VPValue operands and set them in the
+ // new VPInstruction.
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : Inst->operands())
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+
+ // Build VPInstruction for any arbitrary Instruction without specific
+ // representation in VPlan.
+ NewVPInst = cast<VPInstruction>(
+ VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+ }
+
+ IRDef2VPValue[Inst] = NewVPInst;
+ }
+}
+
+// Main interface to build the plain CFG.
+VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Create the Top Region. It will be the parent of all VPBBs.
+ TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
+
+ // 2. Scan the body of the loop in a topological order to visit each basic
+ // block after having visited its predecessor basic blocks. Create a VPBB for
+ // each BB and link it to its successor and predecessor VPBBs. Note that
+ // predecessors must be set in the same order as they are in the incoming IR.
+ // Otherwise, there might be problems with existing phi nodes and algorithms
+ // based on predecessor traversal.
+
+ // Loop PH needs to be explicitly visited since it's not taken into account by
+ // LoopBlocksDFS.
+ BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
+ assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
+ createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
+ // Create empty VPBB for Loop H so that we can link PH->H.
+ VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
+ // Preheader's predecessors will be set during the loop RPO traversal below.
+ PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+
+ LoopBlocksRPO RPO(TheLoop);
+ RPO.perform(LI);
+
+ for (BasicBlock *BB : RPO) {
+ // Create or retrieve the VPBasicBlock for this BB and create its
+ // VPInstructions.
+ VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ createVPInstructionsForVPBB(VPBB, BB);
+
+ // Set VPBB successors. We create empty VPBBs for successors if they don't
+ // exist already. Recipes will be created when the successor is visited
+ // during the RPO traversal.
+ Instruction *TI = BB->getTerminator();
+ assert(TI && "Terminator expected.");
+ unsigned NumSuccs = TI->getNumSuccessors();
+
+ if (NumSuccs == 1) {
+ VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB && "VPBB Successor not found.");
+ VPBB->setOneSuccessor(SuccVPBB);
+ } else if (NumSuccs == 2) {
+ VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB0 && "Successor 0 not found.");
+ VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
+ assert(SuccVPBB1 && "Successor 1 not found.");
+
+ // Get VPBB's condition bit.
+ assert(isa<BranchInst>(TI) && "Unsupported terminator!");
+ auto *Br = cast<BranchInst>(TI);
+ Value *BrCond = Br->getCondition();
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.count(BrCond) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPValue *VPCondBit = IRDef2VPValue[BrCond];
+
+ // Link successors using condition bit.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ } else
+ llvm_unreachable("Number of successors not supported.");
+
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ setVPBBPredsFromBB(VPBB, BB);
+ }
+
+ // 3. Process outermost loop exit. We created an empty VPBB for the loop
+ // single exit BB during the RPO traversal of the loop body but Instructions
+ // weren't visited because it's not part of the loop.
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
+ createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
+ // Loop exit was already set as successor of the loop exiting BB.
+ // We only set its predecessor VPBB now.
+ setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+
+ // 4. The whole CFG has been built at this point so all the input Values must
+ // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
+ // VPlan operands.
+ fixPhiNodes();
+
+ // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
+ // Top Region entry and exit.
+ TopRegion->setEntry(PreheaderVPBB);
+ TopRegion->setExit(LoopExitVPBB);
+ return TopRegion;
+}
+
+VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() {
+ PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
+ return PCFGBuilder.buildPlainCFG();
+}
+
+// Public interface to build a H-CFG.
+void VPlanHCFGBuilder::buildHierarchicalCFG() {
+ // Build Top Region enclosing the plain CFG and set it as VPlan entry.
+ VPRegionBlock *TopRegion = buildPlainCFG();
+ Plan.setEntry(TopRegion);
+ LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
+
+ Verifier.verifyHierarchicalCFG(TopRegion);
+
+ // Compute plain CFG dom tree for VPLInfo.
+ VPDomTree.recalculate(*TopRegion);
+ LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n";
+ VPDomTree.print(dbgs()));
+
+ // Compute VPLInfo and keep it in Plan.
+ VPLoopInfo &VPLInfo = Plan.getVPLoopInfo();
+ VPLInfo.analyze(VPDomTree);
+ LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n";
+ VPLInfo.print(dbgs()));
+}
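For orientation, here is a minimal sketch of how this builder is typically driven from the VPlan-native planning path. The surrounding setup (the outermost loop OrigLoop, its LoopInfo LI, and ownership of the VPlan) is assumed for illustration and is not part of the change shown above.

  // Sketch only: OrigLoop and LI are assumed to be the loop selected for
  // vectorization and its LoopInfo analysis.
  auto Plan = std::make_unique<VPlan>();
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  // Builds the plain CFG, wraps it in the Top Region, sets it as the VPlan
  // entry, and computes VPDomTree/VPLoopInfo as implemented above.
  HCFGBuilder.buildHierarchicalCFG();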
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index ba611ede14..238ee7e634 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -1,71 +1,71 @@
-//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the VPlanHCFGBuilder class which contains the public
-/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
-/// (H-CFG) for an incoming IR.
-///
-/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
-/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
-/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
-/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
-/// other than the Top Region will have a parent VPRegionBlock and allows us
-/// to easily add more nodes before/after the main vector loop (such as the
-/// reduction epilogue).
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
-
-#include "VPlan.h"
-#include "VPlanDominatorTree.h"
-#include "VPlanVerifier.h"
-
-namespace llvm {
-
-class Loop;
-class VPlanTestBase;
-
-/// Main class to build the VPlan H-CFG for an incoming IR.
-class VPlanHCFGBuilder {
- friend VPlanTestBase;
-
-private:
- // The outermost loop of the input loop nest considered for vectorization.
- Loop *TheLoop;
-
- // Loop Info analysis.
- LoopInfo *LI;
-
- // The VPlan that will contain the H-CFG we are building.
- VPlan &Plan;
-
- // VPlan verifier utility.
- VPlanVerifier Verifier;
-
- // Dominator analysis for VPlan plain CFG to be used in the
- // construction of the H-CFG. This analysis is no longer valid once regions
- // are introduced.
- VPDominatorTree VPDomTree;
-
- /// Build plain CFG for TheLoop. Return a new VPRegionBlock (TopRegion)
- /// enclosing the plain CFG.
- VPRegionBlock *buildPlainCFG();
-
-public:
- VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
- : TheLoop(Lp), LI(LI), Plan(P) {}
-
- /// Build H-CFG for TheLoop and update Plan accordingly.
- void buildHierarchicalCFG();
-};
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanHCFGBuilder class which contains the public
+/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR.
+///
+/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
+/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
+/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
+/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
+/// other than the Top Region will have a parent VPRegionBlock and allows us
+/// to easily add more nodes before/after the main vector loop (such as the
+/// reduction epilogue).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+#include "VPlanVerifier.h"
+
+namespace llvm {
+
+class Loop;
+class VPlanTestBase;
+
+/// Main class to build the VPlan H-CFG for an incoming IR.
+class VPlanHCFGBuilder {
+ friend VPlanTestBase;
+
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // The VPlan that will contain the H-CFG we are building.
+ VPlan &Plan;
+
+ // VPlan verifier utility.
+ VPlanVerifier Verifier;
+
+ // Dominator analysis for VPlan plain CFG to be used in the
+ // construction of the H-CFG. This analysis is no longer valid once regions
+ // are introduced.
+ VPDominatorTree VPDomTree;
+
+ /// Build plain CFG for TheLoop. Return a new VPRegionBlock (TopRegion)
+ /// enclosing the plain CFG.
+ VPRegionBlock *buildPlainCFG();
+
+public:
+ VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ /// Build H-CFG for TheLoop and update Plan accordingly.
+ void buildHierarchicalCFG();
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h
index 4b9933630f..5208f2d58e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h
@@ -1,44 +1,44 @@
-//===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a
-/// specialization of LoopInfoBase for VPBlockBase. VPLoop is a specialization
-/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further
-/// information can be found in VectorizationPlanner.rst.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
-
-#include "llvm/Analysis/LoopInfoImpl.h"
-
-namespace llvm {
-class VPBlockBase;
-
-/// Hold analysis information for every loop detected by VPLoopInfo. It is an
-/// instantiation of LoopBase.
-class VPLoop : public LoopBase<VPBlockBase, VPLoop> {
-private:
- friend class LoopInfoBase<VPBlockBase, VPLoop>;
- explicit VPLoop(VPBlockBase *VPB) : LoopBase<VPBlockBase, VPLoop>(VPB) {}
-};
-
-/// VPLoopInfo provides analysis of natural loops for VPBlockBase-based
-/// Hierarchical CFG. It is a specialization of LoopInfoBase class.
-// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which
-// is the same as the incoming IR CFG. If it's more efficient than running the
-// whole loop detection algorithm, we may want to create a mechanism to
-// translate LoopInfo into VPLoopInfo. However, that would require significant
-// changes in LoopInfoBase class.
-typedef LoopInfoBase<VPBlockBase, VPLoop> VPLoopInfo;
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+//===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a
+/// specialization of LoopInfoBase for VPBlockBase. VPLoop is a specialization
+/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further
+/// information can be found in VectorizationPlanner.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+
+#include "llvm/Analysis/LoopInfoImpl.h"
+
+namespace llvm {
+class VPBlockBase;
+
+/// Hold analysis information for every loop detected by VPLoopInfo. It is an
+/// instantiation of LoopBase.
+class VPLoop : public LoopBase<VPBlockBase, VPLoop> {
+private:
+ friend class LoopInfoBase<VPBlockBase, VPLoop>;
+ explicit VPLoop(VPBlockBase *VPB) : LoopBase<VPBlockBase, VPLoop>(VPB) {}
+};
+
+/// VPLoopInfo provides analysis of natural loops for VPBlockBase-based
+/// Hierarchical CFG. It is a specialization of LoopInfoBase class.
+// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which
+// is the same as the incoming IR CFG. If it's more efficient than running the
+// whole loop detection algorithm, we may want to create a mechanism to
+// translate LoopInfo into VPLoopInfo. However, that would require significant
+// changes in LoopInfoBase class.
+typedef LoopInfoBase<VPBlockBase, VPLoop> VPLoopInfo;
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
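Since VPLoopInfo is a plain LoopInfoBase instantiation, the usual LoopInfoBase/LoopBase queries apply to VPBlockBases once the analysis has been populated (as buildHierarchicalCFG does via VPLInfo.analyze(VPDomTree)). A hedged sketch; Plan and Block are assumed to exist in the caller:

  // Sketch only: Block is some VPBlockBase of the plain CFG.
  VPLoopInfo &VPLI = Plan.getVPLoopInfo();
  if (VPLoop *L = VPLI.getLoopFor(Block)) {
    bool IsHeader = VPLI.isLoopHeader(Block); // LoopInfoBase query
    unsigned Depth = L->getLoopDepth();       // LoopBase query
    (void)IsHeader; (void)Depth;
  }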
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 4151d85df2..ac3b3505dc 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -1,248 +1,248 @@
-//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the VPlanPredicator class which contains the public
-/// interfaces to predicate and linearize the VPlan region.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanPredicator.h"
-#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define DEBUG_TYPE "VPlanPredicator"
-
-using namespace llvm;
-
-// Generate VPInstructions at the beginning of CurrBB that calculate the
-// predicate being propagated from PredBB to CurrBB depending on the edge type
-// between them. For example if:
-// i. PredBB is controlled by predicate %BP, and
-// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition
-// bit value %CBV, then this function will generate the following two
-// VPInstructions at the start of CurrBB:
-// %IntermediateVal = not %CBV
-// %FinalVal = and %BP %IntermediateVal
-// It returns %FinalVal.
-VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
- VPBasicBlock *CurrBB) {
- VPValue *CBV = PredBB->getCondBit();
-
- // Set the intermediate value - this is either 'CBV', or 'not CBV'
- // depending on the edge type.
- EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB);
- VPValue *IntermediateVal = nullptr;
- switch (ET) {
- case EdgeType::TRUE_EDGE:
- // CurrBB is the true successor of PredBB - nothing to do here.
- IntermediateVal = CBV;
- break;
-
- case EdgeType::FALSE_EDGE:
- // CurrBB is the False successor of PredBB - compute not of CBV.
- IntermediateVal = Builder.createNot(CBV);
- break;
- }
-
- // Now AND intermediate value with PredBB's block predicate if it has one.
- VPValue *BP = PredBB->getPredicate();
- if (BP)
- return Builder.createAnd(BP, IntermediateVal);
- else
- return IntermediateVal;
-}
-
-// Generate a tree of ORs for all IncomingPredicates in WorkList.
-// Note: This function destroys the original Worklist.
-//
-// P1 P2 P3 P4 P5
-// \ / \ / /
-// OR1 OR2 /
-// \ | /
-// \ +/-+
-// \ / |
-// OR3 |
-// \ |
-// OR4 <- Returns this
-// |
-//
-// The algorithm uses a worklist of predicates as its main data structure.
-// We pop a pair of values from the front (e.g. P1 and P2), generate an OR
-// (in this example OR1), and push it back. In this example the worklist
-// contains {P3, P4, P5, OR1}.
-// The process iterates until we have only one element in the Worklist (OR4).
-// The last element is the root predicate which is returned.
-VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
- if (Worklist.empty())
- return nullptr;
-
- // The worklist initially contains all the leaf nodes. Initialize the tree
- // using them.
- while (Worklist.size() >= 2) {
- // Pop a pair of values from the front.
- VPValue *LHS = Worklist.front();
- Worklist.pop_front();
- VPValue *RHS = Worklist.front();
- Worklist.pop_front();
-
- // Create an OR of these values.
- VPValue *Or = Builder.createOr(LHS, RHS);
-
- // Push OR to the back of the worklist.
- Worklist.push_back(Or);
- }
-
- assert(Worklist.size() == 1 && "Expected 1 item in worklist");
-
- // The root is the last node in the worklist.
- VPValue *Root = Worklist.front();
-
- // This root needs to replace the existing block predicate. This is done in
- // the caller function.
- return Root;
-}
-
-// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE
-VPlanPredicator::EdgeType
-VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock,
- VPBlockBase *ToBlock) {
- unsigned Count = 0;
- for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) {
- if (SuccBlock == ToBlock) {
- assert(Count < 2 && "Switch not supported currently");
- return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE;
- }
- Count++;
- }
-
- llvm_unreachable("Broken getEdgeTypeBetween");
-}
-
-// Generate all predicates needed for CurrBlock by going through its immediate
-// predecessor blocks.
-void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock,
- VPRegionBlock *Region) {
- // Blocks that dominate region exit inherit the predicate from the region.
- // Return after setting the predicate.
- if (VPDomTree.dominates(CurrBlock, Region->getExit())) {
- VPValue *RegionBP = Region->getPredicate();
- CurrBlock->setPredicate(RegionBP);
- return;
- }
-
- // Collect all incoming predicates in a worklist.
- std::list<VPValue *> IncomingPredicates;
-
- // Set the builder's insertion point to the top of the current BB
- VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock());
- Builder.setInsertPoint(CurrBB, CurrBB->begin());
-
- // For each predecessor, generate the VPInstructions required for
- // computing 'BP AND (not) CBV' at the top of CurrBB.
- // Collect the outcome of this calculation for all predecessors
- // into IncomingPredicates.
- for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) {
- // Skip back-edges
- if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI))
- continue;
-
- VPValue *IncomingPredicate = nullptr;
- unsigned NumPredSuccsNoBE =
- VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI);
-
- // If there is an unconditional branch to the currBB, then we don't create
- // edge predicates. We use the predecessor's block predicate instead.
- if (NumPredSuccsNoBE == 1)
- IncomingPredicate = PredBlock->getPredicate();
- else if (NumPredSuccsNoBE == 2) {
- // Emit recipes into CurrBlock if required
- assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits");
- IncomingPredicate =
- getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB);
- } else
- llvm_unreachable("FIXME: switch statement ?");
-
- if (IncomingPredicate)
- IncomingPredicates.push_back(IncomingPredicate);
- }
-
- // Logically OR all incoming predicates by building the Predicate Tree.
- VPValue *Predicate = genPredicateTree(IncomingPredicates);
-
- // Now update the block's predicate with the new one.
- CurrBlock->setPredicate(Predicate);
-}
-
-// Generate all predicates needed for Region.
-void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) {
- VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry());
- ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock);
-
- // Generate edge predicates and append them to the block predicate. RPO is
- // necessary since the predecessor blocks' block predicate needs to be set
- // before the current block's block predicate can be computed.
+//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the VPlanPredicator class which contains the public
+/// interfaces to predicate and linearize the VPlan region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanPredicator.h"
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "VPlanPredicator"
+
+using namespace llvm;
+
+// Generate VPInstructions at the beginning of CurrBB that calculate the
+// predicate being propagated from PredBB to CurrBB depending on the edge type
+// between them. For example if:
+// i. PredBB is controlled by predicate %BP, and
+// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition
+// bit value %CBV, then this function will generate the following two
+// VPInstructions at the start of CurrBB:
+// %IntermediateVal = not %CBV
+// %FinalVal = and %BP %IntermediateVal
+// It returns %FinalVal.
+VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
+ VPBasicBlock *CurrBB) {
+ VPValue *CBV = PredBB->getCondBit();
+
+ // Set the intermediate value - this is either 'CBV', or 'not CBV'
+ // depending on the edge type.
+ EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB);
+ VPValue *IntermediateVal = nullptr;
+ switch (ET) {
+ case EdgeType::TRUE_EDGE:
+ // CurrBB is the true successor of PredBB - nothing to do here.
+ IntermediateVal = CBV;
+ break;
+
+ case EdgeType::FALSE_EDGE:
+ // CurrBB is the False successor of PredBB - compute not of CBV.
+ IntermediateVal = Builder.createNot(CBV);
+ break;
+ }
+
+ // Now AND intermediate value with PredBB's block predicate if it has one.
+ VPValue *BP = PredBB->getPredicate();
+ if (BP)
+ return Builder.createAnd(BP, IntermediateVal);
+ else
+ return IntermediateVal;
+}
+
+// Generate a tree of ORs for all IncomingPredicates in WorkList.
+// Note: This function destroys the original Worklist.
+//
+// P1 P2 P3 P4 P5
+// \ / \ / /
+// OR1 OR2 /
+// \ | /
+// \ +/-+
+// \ / |
+// OR3 |
+// \ |
+// OR4 <- Returns this
+// |
+//
+// The algorithm uses a worklist of predicates as its main data structure.
+// We pop a pair of values from the front (e.g. P1 and P2), generate an OR
+// (in this example OR1), and push it back. In this example the worklist
+// contains {P3, P4, P5, OR1}.
+// The process iterates until we have only one element in the Worklist (OR4).
+// The last element is the root predicate which is returned.
+VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
+ if (Worklist.empty())
+ return nullptr;
+
+ // The worklist initially contains all the leaf nodes. Initialize the tree
+ // using them.
+ while (Worklist.size() >= 2) {
+ // Pop a pair of values from the front.
+ VPValue *LHS = Worklist.front();
+ Worklist.pop_front();
+ VPValue *RHS = Worklist.front();
+ Worklist.pop_front();
+
+ // Create an OR of these values.
+ VPValue *Or = Builder.createOr(LHS, RHS);
+
+ // Push OR to the back of the worklist.
+ Worklist.push_back(Or);
+ }
+
+ assert(Worklist.size() == 1 && "Expected 1 item in worklist");
+
+ // The root is the last node in the worklist.
+ VPValue *Root = Worklist.front();
+
+ // This root needs to replace the existing block predicate. This is done in
+ // the caller function.
+ return Root;
+}
+
+// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE
+VPlanPredicator::EdgeType
+VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock,
+ VPBlockBase *ToBlock) {
+ unsigned Count = 0;
+ for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) {
+ if (SuccBlock == ToBlock) {
+ assert(Count < 2 && "Switch not supported currently");
+ return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE;
+ }
+ Count++;
+ }
+
+ llvm_unreachable("Broken getEdgeTypeBetween");
+}
+
+// Generate all predicates needed for CurrBlock by going through its immediate
+// predecessor blocks.
+void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock,
+ VPRegionBlock *Region) {
+ // Blocks that dominate region exit inherit the predicate from the region.
+ // Return after setting the predicate.
+ if (VPDomTree.dominates(CurrBlock, Region->getExit())) {
+ VPValue *RegionBP = Region->getPredicate();
+ CurrBlock->setPredicate(RegionBP);
+ return;
+ }
+
+ // Collect all incoming predicates in a worklist.
+ std::list<VPValue *> IncomingPredicates;
+
+ // Set the builder's insertion point to the top of the current BB
+ VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock());
+ Builder.setInsertPoint(CurrBB, CurrBB->begin());
+
+ // For each predecessor, generate the VPInstructions required for
+ // computing 'BP AND (not) CBV' at the top of CurrBB.
+ // Collect the outcome of this calculation for all predecessors
+ // into IncomingPredicates.
+ for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) {
+ // Skip back-edges
+ if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI))
+ continue;
+
+ VPValue *IncomingPredicate = nullptr;
+ unsigned NumPredSuccsNoBE =
+ VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI);
+
+ // If there is an unconditional branch to the currBB, then we don't create
+ // edge predicates. We use the predecessor's block predicate instead.
+ if (NumPredSuccsNoBE == 1)
+ IncomingPredicate = PredBlock->getPredicate();
+ else if (NumPredSuccsNoBE == 2) {
+ // Emit recipes into CurrBlock if required
+ assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits");
+ IncomingPredicate =
+ getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB);
+ } else
+ llvm_unreachable("FIXME: switch statement ?");
+
+ if (IncomingPredicate)
+ IncomingPredicates.push_back(IncomingPredicate);
+ }
+
+ // Logically OR all incoming predicates by building the Predicate Tree.
+ VPValue *Predicate = genPredicateTree(IncomingPredicates);
+
+ // Now update the block's predicate with the new one.
+ CurrBlock->setPredicate(Predicate);
+}
+
+// Generate all predicates needed for Region.
+void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) {
+ VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock);
+
+ // Generate edge predicates and append them to the block predicate. RPO is
+ // necessary since the predecessor blocks' block predicate needs to be set
+ // before the current block's block predicate can be computed.
for (VPBlockBase *Block : RPOT) {
- // TODO: Handle nested regions once we start generating the same.
- assert(!isa<VPRegionBlock>(Block) && "Nested region not expected");
- createOrPropagatePredicates(Block, Region);
- }
-}
-
-// Linearize the CFG within Region.
-// TODO: Predication and linearization need RPOT for every region.
-// This traversal is expensive. Since predication is not adding new
-// blocks, we should be able to compute RPOT once in predication and
-// reuse it here. This becomes even more important once we have nested
-// regions.
-void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
- ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
- VPBlockBase *PrevBlock = nullptr;
-
+ // TODO: Handle nested regions once we start generating the same.
+ assert(!isa<VPRegionBlock>(Block) && "Nested region not expected");
+ createOrPropagatePredicates(Block, Region);
+ }
+}
+
+// Linearize the CFG within Region.
+// TODO: Predication and linearization need RPOT for every region.
+// This traversal is expensive. Since predication is not adding new
+// blocks, we should be able to compute RPOT once in predication and
+// reuse it here. This becomes even more important once we have nested
+// regions.
+void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
+ VPBlockBase *PrevBlock = nullptr;
+
for (VPBlockBase *CurrBlock : RPOT) {
- // TODO: Handle nested regions once we start generating the same.
- assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected");
-
- // Linearize control flow by adding an unconditional edge between PrevBlock
- // and CurrBlock skipping loop headers and latches to keep intact loop
- // header predecessors and loop latch successors.
- if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) &&
- !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) {
-
- LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->"
- << CurrBlock->getName() << "\n");
-
- PrevBlock->clearSuccessors();
- CurrBlock->clearPredecessors();
- VPBlockUtils::connectBlocks(PrevBlock, CurrBlock);
- }
-
- PrevBlock = CurrBlock;
- }
-}
-
-// Entry point. The driver function for the predicator.
-void VPlanPredicator::predicate(void) {
- // Predicate the blocks within Region.
- predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
-
- // Linearize the blocks within Region.
- linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
-}
-
-VPlanPredicator::VPlanPredicator(VPlan &Plan)
- : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
- // FIXME: Predicator is currently computing the dominator information for the
- // top region. Once we start storing dominator information in a VPRegionBlock,
- // we can avoid this recalculation.
- VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry())));
-}
+ // TODO: Handle nested regions once we start generating the same.
+ assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected");
+
+ // Linearize control flow by adding an unconditional edge between PrevBlock
+ // and CurrBlock skipping loop headers and latches to keep intact loop
+ // header predecessors and loop latch successors.
+ if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) &&
+ !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) {
+
+ LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->"
+ << CurrBlock->getName() << "\n");
+
+ PrevBlock->clearSuccessors();
+ CurrBlock->clearPredecessors();
+ VPBlockUtils::connectBlocks(PrevBlock, CurrBlock);
+ }
+
+ PrevBlock = CurrBlock;
+ }
+}
+
+// Entry point. The driver function for the predicator.
+void VPlanPredicator::predicate(void) {
+ // Predicate the blocks within Region.
+ predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
+
+ // Linearize the blocks within Region.
+ linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
+}
+
+VPlanPredicator::VPlanPredicator(VPlan &Plan)
+ : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
+ // FIXME: Predicator is currently computing the dominator information for the
+ // top region. Once we start storing dominator information in a VPRegionBlock,
+ // we can avoid this recalculation.
+ VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry())));
+}
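To make the pairing order of genPredicateTree concrete, here is a small standalone illustration of the same worklist scheme using strings in place of VPValues and the OR recipes; it is purely a reading aid for the code above, not part of the change.

  #include <list>
  #include <string>
  std::string foldPredicates() {
    // Worklist starts as {P1, P2, P3, P4, P5}; "|" stands in for createOr.
    std::list<std::string> WL = {"P1", "P2", "P3", "P4", "P5"};
    while (WL.size() >= 2) {
      std::string LHS = WL.front(); WL.pop_front();
      std::string RHS = WL.front(); WL.pop_front();
      WL.push_back("(" + LHS + " | " + RHS + ")");
    }
    // Returns "((P3 | P4) | (P5 | (P1 | P2)))": OR1=(P1|P2), OR2=(P3|P4),
    // OR3=(P5|OR1), OR4=(OR2|OR3), i.e. OR4 in the diagram above.
    return WL.front();
  }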
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h
index 5dac70d090..692afd2978 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h
@@ -1,74 +1,74 @@
-//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the VPlanPredicator class which contains the public
-/// interfaces to predicate and linearize the VPlan region.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
-
-#include "LoopVectorizationPlanner.h"
-#include "VPlan.h"
-#include "VPlanDominatorTree.h"
-
-namespace llvm {
-
-class VPlanPredicator {
-private:
- enum class EdgeType {
- TRUE_EDGE,
- FALSE_EDGE,
- };
-
- // VPlan being predicated.
- VPlan &Plan;
-
- // VPLoopInfo for Plan's HCFG.
- VPLoopInfo *VPLI;
-
- // Dominator tree for Plan's HCFG.
- VPDominatorTree VPDomTree;
-
- // VPlan builder used to generate VPInstructions for block predicates.
- VPBuilder Builder;
-
- /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
- /// \p ToBlock is either the unconditional successor or the conditional true
- /// successor of \p FromBlock and FALSE_EDGE otherwise.
- EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock);
-
- /// Create and return VPValue corresponding to the predicate for the edge from
- /// \p PredBB to \p CurrentBlock.
- VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB);
-
- /// Generate and return the result of ORing all the predicate VPValues in \p
- /// Worklist.
- VPValue *genPredicateTree(std::list<VPValue *> &Worklist);
-
- /// Create or propagate predicate for \p CurrBlock in region \p Region using
- /// predicate(s) of its predecessor(s)
- void createOrPropagatePredicates(VPBlockBase *CurrBlock,
- VPRegionBlock *Region);
-
- /// Predicate the CFG within \p Region.
- void predicateRegionRec(VPRegionBlock *Region);
-
- /// Linearize the CFG within \p Region.
- void linearizeRegionRec(VPRegionBlock *Region);
-
-public:
- VPlanPredicator(VPlan &Plan);
-
- /// Predicate Plan's HCFG.
- void predicate(void);
-};
-} // end namespace llvm
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanPredicator class which contains the public
+/// interfaces to predicate and linearize the VPlan region.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+
+namespace llvm {
+
+class VPlanPredicator {
+private:
+ enum class EdgeType {
+ TRUE_EDGE,
+ FALSE_EDGE,
+ };
+
+ // VPlan being predicated.
+ VPlan &Plan;
+
+ // VPLoopInfo for Plan's HCFG.
+ VPLoopInfo *VPLI;
+
+ // Dominator tree for Plan's HCFG.
+ VPDominatorTree VPDomTree;
+
+ // VPlan builder used to generate VPInstructions for block predicates.
+ VPBuilder Builder;
+
+ /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
+ /// \p ToBlock is either the unconditional successor or the conditional true
+ /// successor of \p FromBlock and FALSE_EDGE otherwise.
+ EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock);
+
+ /// Create and return VPValue corresponding to the predicate for the edge from
+ /// \p PredBB to \p CurrentBlock.
+ VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB);
+
+ /// Generate and return the result of ORing all the predicate VPValues in \p
+ /// Worklist.
+ VPValue *genPredicateTree(std::list<VPValue *> &Worklist);
+
+ /// Create or propagate predicate for \p CurrBlock in region \p Region using
+ /// predicate(s) of its predecessor(s)
+ void createOrPropagatePredicates(VPBlockBase *CurrBlock,
+ VPRegionBlock *Region);
+
+ /// Predicate the CFG within \p Region.
+ void predicateRegionRec(VPRegionBlock *Region);
+
+ /// Linearize the CFG within \p Region.
+ void linearizeRegionRec(VPRegionBlock *Region);
+
+public:
+ VPlanPredicator(VPlan &Plan);
+
+ /// Predicate Plan's HCFG.
+ void predicate(void);
+};
+} // end namespace llvm
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
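For context on the header above: the doc comment on genPredicateTree only states that it ORs all
predicate VPValues in the worklist. Below is a minimal standalone sketch of that folding, assuming
a generic predicate type PredT and a MakeOr callback; both are illustrative placeholders, not the
VPlan API and not part of this diff.

#include <functional>
#include <list>

// Fold a worklist of predicates into a single OR of all entries by repeatedly
// combining the two front elements and appending the combined result, until
// only one predicate remains.
template <typename PredT>
PredT genPredicateTreeSketch(std::list<PredT> Worklist,
                             const std::function<PredT(PredT, PredT)> &MakeOr) {
  if (Worklist.empty())
    return PredT{};
  while (Worklist.size() > 1) {
    PredT LHS = Worklist.front();
    Worklist.pop_front();
    PredT RHS = Worklist.front();
    Worklist.pop_front();
    Worklist.push_back(MakeOr(LHS, RHS));
  }
  return Worklist.front();
}

// Example: genPredicateTreeSketch<bool>({P0, P1, P2}, [](bool A, bool B) { return A || B; })
// evaluates to P0 || P1 || P2.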
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp
index b2a5a7688d..6f21bf4429 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -1,473 +1,473 @@
-//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// This file implements SLP analysis based on VPlan. The analysis is based on
-/// the ideas described in
-///
-/// Look-ahead SLP: auto-vectorization in the presence of commutative
-/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
-/// Luís F. W. Góes
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GraphWriter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <iterator>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "vplan-slp"
-
-// Number of levels to look ahead when re-ordering multi node operands.
-static unsigned LookaheadMaxDepth = 5;
-
-VPInstruction *VPlanSlp::markFailed() {
- // FIXME: Currently this is used to signal we hit instructions we cannot
- // trivially SLP'ize.
- CompletelySLP = false;
- return nullptr;
-}
-
-void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
- if (all_of(Operands, [](VPValue *V) {
- return cast<VPInstruction>(V)->getUnderlyingInstr();
- })) {
- unsigned BundleSize = 0;
- for (VPValue *V : Operands) {
- Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
- assert(!T->isVectorTy() && "Only scalar types supported for now");
- BundleSize += T->getScalarSizeInBits();
- }
- WidestBundleBits = std::max(WidestBundleBits, BundleSize);
- }
-
- auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
- assert(Res.second &&
- "Already created a combined instruction for the operand bundle");
- (void)Res;
-}
-
-bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
- // Currently we only support VPInstructions.
- if (!all_of(Operands, [](VPValue *Op) {
- return Op && isa<VPInstruction>(Op) &&
- cast<VPInstruction>(Op)->getUnderlyingInstr();
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
- return false;
- }
-
- // Check if opcodes and type width agree for all instructions in the bundle.
- // FIXME: Differing widths/opcodes can be handled by inserting additional
- // instructions.
- // FIXME: Deal with non-primitive types.
- const Instruction *OriginalInstr =
- cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
- unsigned Opcode = OriginalInstr->getOpcode();
- unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
- if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
- const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
- return I->getOpcode() == Opcode &&
- I->getType()->getPrimitiveSizeInBits() == Width;
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
- return false;
- }
-
- // For now, all operands must be defined in the same BB.
- if (any_of(Operands, [this](VPValue *Op) {
- return cast<VPInstruction>(Op)->getParent() != &this->BB;
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
- return false;
- }
-
- if (any_of(Operands,
- [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
- LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
- return false;
- }
-
- // For loads, check that there are no instructions writing to memory in
- // between them.
- // TODO: we only have to forbid instructions writing to memory that could
- // interfere with any of the loads in the bundle
- if (Opcode == Instruction::Load) {
- unsigned LoadsSeen = 0;
- VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
- for (auto &I : *Parent) {
- auto *VPI = cast<VPInstruction>(&I);
- if (VPI->getOpcode() == Instruction::Load &&
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+/// Look-ahead SLP: auto-vectorization in the presence of commutative
+/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+/// Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Number of levels to look ahead when re-ordering multi node operands.
+static unsigned LookaheadMaxDepth = 5;
+
+VPInstruction *VPlanSlp::markFailed() {
+ // FIXME: Currently this is used to signal we hit instructions we cannot
+ // trivially SLP'ize.
+ CompletelySLP = false;
+ return nullptr;
+}
+
+void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
+ if (all_of(Operands, [](VPValue *V) {
+ return cast<VPInstruction>(V)->getUnderlyingInstr();
+ })) {
+ unsigned BundleSize = 0;
+ for (VPValue *V : Operands) {
+ Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+ assert(!T->isVectorTy() && "Only scalar types supported for now");
+ BundleSize += T->getScalarSizeInBits();
+ }
+ WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+ }
+
+ auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
+ assert(Res.second &&
+ "Already created a combined instruction for the operand bundle");
+ (void)Res;
+}
+
+bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
+ // Currently we only support VPInstructions.
+ if (!all_of(Operands, [](VPValue *Op) {
+ return Op && isa<VPInstruction>(Op) &&
+ cast<VPInstruction>(Op)->getUnderlyingInstr();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+ return false;
+ }
+
+ // Check if opcodes and type width agree for all instructions in the bundle.
+ // FIXME: Differing widths/opcodes can be handled by inserting additional
+ // instructions.
+ // FIXME: Deal with non-primitive types.
+ const Instruction *OriginalInstr =
+ cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
+ unsigned Opcode = OriginalInstr->getOpcode();
+ unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+ if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+ const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
+ return I->getOpcode() == Opcode &&
+ I->getType()->getPrimitiveSizeInBits() == Width;
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
+ return false;
+ }
+
+ // For now, all operands must be defined in the same BB.
+ if (any_of(Operands, [this](VPValue *Op) {
+ return cast<VPInstruction>(Op)->getParent() != &this->BB;
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
+ return false;
+ }
+
+ if (any_of(Operands,
+ [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+ return false;
+ }
+
+ // For loads, check that there are no instructions writing to memory in
+ // between them.
+ // TODO: we only have to forbid instructions writing to memory that could
+ // interfere with any of the loads in the bundle
+ if (Opcode == Instruction::Load) {
+ unsigned LoadsSeen = 0;
+ VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+ for (auto &I : *Parent) {
+ auto *VPI = cast<VPInstruction>(&I);
+ if (VPI->getOpcode() == Instruction::Load &&
llvm::is_contained(Operands, VPI))
- LoadsSeen++;
-
- if (LoadsSeen == Operands.size())
- break;
- if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
- LLVM_DEBUG(
- dbgs() << "VPSLP: instruction modifying memory between loads\n");
- return false;
- }
- }
-
- if (!all_of(Operands, [](VPValue *Op) {
- return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
- ->isSimple();
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
- return false;
- }
- }
-
- if (Opcode == Instruction::Store)
- if (!all_of(Operands, [](VPValue *Op) {
- return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
- ->isSimple();
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
- return false;
- }
-
- return true;
-}
-
-static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
- unsigned OperandIndex) {
- SmallVector<VPValue *, 4> Operands;
- for (VPValue *V : Values) {
+ LoadsSeen++;
+
+ if (LoadsSeen == Operands.size())
+ break;
+ if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
+ LLVM_DEBUG(
+ dbgs() << "VPSLP: instruction modifying memory between loads\n");
+ return false;
+ }
+ }
+
+ if (!all_of(Operands, [](VPValue *Op) {
+ return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+ ->isSimple();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
+ return false;
+ }
+ }
+
+ if (Opcode == Instruction::Store)
+ if (!all_of(Operands, [](VPValue *Op) {
+ return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+ ->isSimple();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
+ unsigned OperandIndex) {
+ SmallVector<VPValue *, 4> Operands;
+ for (VPValue *V : Values) {
// Currently we only support VPInstructions.
auto *U = cast<VPInstruction>(V);
- Operands.push_back(U->getOperand(OperandIndex));
- }
- return Operands;
-}
-
-static bool areCommutative(ArrayRef<VPValue *> Values) {
- return Instruction::isCommutative(
- cast<VPInstruction>(Values[0])->getOpcode());
-}
-
-static SmallVector<SmallVector<VPValue *, 4>, 4>
-getOperands(ArrayRef<VPValue *> Values) {
- SmallVector<SmallVector<VPValue *, 4>, 4> Result;
- auto *VPI = cast<VPInstruction>(Values[0]);
-
- switch (VPI->getOpcode()) {
- case Instruction::Load:
- llvm_unreachable("Loads terminate a tree, no need to get operands");
- case Instruction::Store:
- Result.push_back(getOperands(Values, 0));
- break;
- default:
- for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
- Result.push_back(getOperands(Values, I));
- break;
- }
-
- return Result;
-}
-
-/// Returns the common opcode of Values, or None if they do not all agree.
-static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
- unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
- if (any_of(Values, [Opcode](VPValue *V) {
- return cast<VPInstruction>(V)->getOpcode() != Opcode;
- }))
- return None;
- return {Opcode};
-}
-
-/// Returns true if A and B are loads or stores accessing consecutive memory,
-/// or are any other instructions with identical opcodes.
-static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
- VPInterleavedAccessInfo &IAI) {
- if (A->getOpcode() != B->getOpcode())
- return false;
-
- if (A->getOpcode() != Instruction::Load &&
- A->getOpcode() != Instruction::Store)
- return true;
- auto *GA = IAI.getInterleaveGroup(A);
- auto *GB = IAI.getInterleaveGroup(B);
-
- return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
-}
-
-/// Implements getLAScore from Listing 7 in the paper.
-/// Traverses and compares operands of V1 and V2 to MaxLevel.
-static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
- VPInterleavedAccessInfo &IAI) {
+ Operands.push_back(U->getOperand(OperandIndex));
+ }
+ return Operands;
+}
+
+static bool areCommutative(ArrayRef<VPValue *> Values) {
+ return Instruction::isCommutative(
+ cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(ArrayRef<VPValue *> Values) {
+ SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+ auto *VPI = cast<VPInstruction>(Values[0]);
+
+ switch (VPI->getOpcode()) {
+ case Instruction::Load:
+ llvm_unreachable("Loads terminate a tree, no need to get operands");
+ case Instruction::Store:
+ Result.push_back(getOperands(Values, 0));
+ break;
+ default:
+ for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+ Result.push_back(getOperands(Values, I));
+ break;
+ }
+
+ return Result;
+}
+
+/// Returns the common opcode of Values, or None if they do not all agree.
+static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
+ unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+ if (any_of(Values, [Opcode](VPValue *V) {
+ return cast<VPInstruction>(V)->getOpcode() != Opcode;
+ }))
+ return None;
+ return {Opcode};
+}
+
+/// Returns true if A and B are loads or stores accessing consecutive memory,
+/// or are any other instructions with identical opcodes.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+ VPInterleavedAccessInfo &IAI) {
+ if (A->getOpcode() != B->getOpcode())
+ return false;
+
+ if (A->getOpcode() != Instruction::Load &&
+ A->getOpcode() != Instruction::Store)
+ return true;
+ auto *GA = IAI.getInterleaveGroup(A);
+ auto *GB = IAI.getInterleaveGroup(B);
+
+ return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+ VPInterleavedAccessInfo &IAI) {
auto *I1 = dyn_cast<VPInstruction>(V1);
auto *I2 = dyn_cast<VPInstruction>(V2);
// Currently we only support VPInstructions.
if (!I1 || !I2)
- return 0;
-
- if (MaxLevel == 0)
+ return 0;
+
+ if (MaxLevel == 0)
return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
-
- unsigned Score = 0;
+
+ unsigned Score = 0;
for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
Score +=
getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
- return Score;
-}
-
-std::pair<VPlanSlp::OpMode, VPValue *>
-VPlanSlp::getBest(OpMode Mode, VPValue *Last,
- SmallPtrSetImpl<VPValue *> &Candidates,
- VPInterleavedAccessInfo &IAI) {
- assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
- "Currently we only handle load and commutative opcodes");
- LLVM_DEBUG(dbgs() << " getBest\n");
-
- SmallVector<VPValue *, 4> BestCandidates;
- LLVM_DEBUG(dbgs() << " Candidates for "
- << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
- for (auto *Candidate : Candidates) {
- auto *LastI = cast<VPInstruction>(Last);
- auto *CandidateI = cast<VPInstruction>(Candidate);
- if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
- LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
- << " ");
- BestCandidates.push_back(Candidate);
- }
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- if (BestCandidates.empty())
- return {OpMode::Failed, nullptr};
-
- if (BestCandidates.size() == 1)
- return {Mode, BestCandidates[0]};
-
- VPValue *Best = nullptr;
- unsigned BestScore = 0;
- for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
- unsigned PrevScore = ~0u;
- bool AllSame = true;
-
- // FIXME: Avoid visiting the same operands multiple times.
- for (auto *Candidate : BestCandidates) {
- unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
- if (PrevScore == ~0u)
- PrevScore = Score;
- if (PrevScore != Score)
- AllSame = false;
- PrevScore = Score;
-
- if (Score > BestScore) {
- BestScore = Score;
- Best = Candidate;
- }
- }
- if (!AllSame)
- break;
- }
- LLVM_DEBUG(dbgs() << "Found best "
- << *cast<VPInstruction>(Best)->getUnderlyingInstr()
- << "\n");
- Candidates.erase(Best);
-
- return {Mode, Best};
-}
-
-SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
- SmallVector<MultiNodeOpTy, 4> FinalOrder;
- SmallVector<OpMode, 4> Mode;
- FinalOrder.reserve(MultiNodeOps.size());
- Mode.reserve(MultiNodeOps.size());
-
- LLVM_DEBUG(dbgs() << "Reordering multinode\n");
-
- for (auto &Operands : MultiNodeOps) {
- FinalOrder.push_back({Operands.first, {Operands.second[0]}});
- if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
- Instruction::Load)
- Mode.push_back(OpMode::Load);
- else
- Mode.push_back(OpMode::Opcode);
- }
-
- for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
- LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
- SmallPtrSet<VPValue *, 4> Candidates;
- LLVM_DEBUG(dbgs() << " Candidates ");
- for (auto Ops : MultiNodeOps) {
- LLVM_DEBUG(
- dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
- << " ");
- Candidates.insert(Ops.second[Lane]);
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
- LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
- if (Mode[Op] == OpMode::Failed)
- continue;
-
- VPValue *Last = FinalOrder[Op].second[Lane - 1];
- std::pair<OpMode, VPValue *> Res =
- getBest(Mode[Op], Last, Candidates, IAI);
- if (Res.second)
- FinalOrder[Op].second.push_back(Res.second);
- else
- // TODO: handle this case
- FinalOrder[Op].second.push_back(markFailed());
- }
- }
-
- return FinalOrder;
-}
-
-void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
- dbgs() << " Ops: ";
- for (auto Op : Values) {
- if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
- if (auto *Instr = VPInstr->getUnderlyingInstr()) {
- dbgs() << *Instr << " | ";
- continue;
- }
- dbgs() << " nullptr | ";
- }
- dbgs() << "\n";
-}
-
-VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
- assert(!Values.empty() && "Need some operands!");
-
- // If we already visited this instruction bundle, re-use the existing node
- auto I = BundleToCombined.find(to_vector<4>(Values));
- if (I != BundleToCombined.end()) {
-#ifndef NDEBUG
- // Check that the resulting graph is a tree. If we re-use a node, this means
- // its values have multiple users. We only allow this if all users of each
- // value are the same instruction.
- for (auto *V : Values) {
- auto UI = V->user_begin();
- auto *FirstUser = *UI++;
- while (UI != V->user_end()) {
- assert(*UI == FirstUser && "Currently we only support SLP trees.");
- UI++;
- }
- }
-#endif
- return I->second;
- }
-
- // Dump inputs
- LLVM_DEBUG({
- dbgs() << "buildGraph: ";
- dumpBundle(Values);
- });
-
- if (!areVectorizable(Values))
- return markFailed();
-
- assert(getOpcode(Values) && "Opcodes for all values must match");
- unsigned ValuesOpcode = getOpcode(Values).getValue();
-
- SmallVector<VPValue *, 4> CombinedOperands;
- if (areCommutative(Values)) {
- bool MultiNodeRoot = !MultiNodeActive;
- MultiNodeActive = true;
- for (auto &Operands : getOperands(Values)) {
- LLVM_DEBUG({
- dbgs() << " Visiting Commutative";
- dumpBundle(Operands);
- });
-
- auto OperandsOpcode = getOpcode(Operands);
- if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
- LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
- CombinedOperands.push_back(buildGraph(Operands));
- } else {
- LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
- // Create a dummy VPInstruction, which we will replace later with the
- // re-ordered operand.
- VPInstruction *Op = new VPInstruction(0, {});
- CombinedOperands.push_back(Op);
- MultiNodeOps.emplace_back(Op, Operands);
- }
- }
-
- if (MultiNodeRoot) {
- LLVM_DEBUG(dbgs() << "Reorder \n");
- MultiNodeActive = false;
-
- auto FinalOrder = reorderMultiNodeOps();
-
- MultiNodeOps.clear();
- for (auto &Ops : FinalOrder) {
- VPInstruction *NewOp = buildGraph(Ops.second);
- Ops.first->replaceAllUsesWith(NewOp);
- for (unsigned i = 0; i < CombinedOperands.size(); i++)
- if (CombinedOperands[i] == Ops.first)
- CombinedOperands[i] = NewOp;
- delete Ops.first;
- Ops.first = NewOp;
- }
- LLVM_DEBUG(dbgs() << "Found final order\n");
- }
- } else {
- LLVM_DEBUG(dbgs() << " NonCommuntative\n");
- if (ValuesOpcode == Instruction::Load)
- for (VPValue *V : Values)
- CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
- else
- for (auto &Operands : getOperands(Values))
- CombinedOperands.push_back(buildGraph(Operands));
- }
-
- unsigned Opcode;
- switch (ValuesOpcode) {
- case Instruction::Load:
- Opcode = VPInstruction::SLPLoad;
- break;
- case Instruction::Store:
- Opcode = VPInstruction::SLPStore;
- break;
- default:
- Opcode = ValuesOpcode;
- break;
- }
-
- if (!CompletelySLP)
- return markFailed();
-
- assert(CombinedOperands.size() > 0 && "Need at least one operand");
- auto *VPI = new VPInstruction(Opcode, CombinedOperands);
- VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
-
+ return Score;
+}
+
+std::pair<VPlanSlp::OpMode, VPValue *>
+VPlanSlp::getBest(OpMode Mode, VPValue *Last,
+ SmallPtrSetImpl<VPValue *> &Candidates,
+ VPInterleavedAccessInfo &IAI) {
+ assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
+ "Currently we only handle load and commutative opcodes");
+ LLVM_DEBUG(dbgs() << " getBest\n");
+
+ SmallVector<VPValue *, 4> BestCandidates;
+ LLVM_DEBUG(dbgs() << " Candidates for "
+ << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+ for (auto *Candidate : Candidates) {
+ auto *LastI = cast<VPInstruction>(Last);
+ auto *CandidateI = cast<VPInstruction>(Candidate);
+ if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+ LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+ << " ");
+ BestCandidates.push_back(Candidate);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ if (BestCandidates.empty())
+ return {OpMode::Failed, nullptr};
+
+ if (BestCandidates.size() == 1)
+ return {Mode, BestCandidates[0]};
+
+ VPValue *Best = nullptr;
+ unsigned BestScore = 0;
+ for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
+ unsigned PrevScore = ~0u;
+ bool AllSame = true;
+
+ // FIXME: Avoid visiting the same operands multiple times.
+ for (auto *Candidate : BestCandidates) {
+ unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+ if (PrevScore == ~0u)
+ PrevScore = Score;
+ if (PrevScore != Score)
+ AllSame = false;
+ PrevScore = Score;
+
+ if (Score > BestScore) {
+ BestScore = Score;
+ Best = Candidate;
+ }
+ }
+ if (!AllSame)
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "Found best "
+ << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+ << "\n");
+ Candidates.erase(Best);
+
+ return {Mode, Best};
+}
+
+SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
+ SmallVector<MultiNodeOpTy, 4> FinalOrder;
+ SmallVector<OpMode, 4> Mode;
+ FinalOrder.reserve(MultiNodeOps.size());
+ Mode.reserve(MultiNodeOps.size());
+
+ LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+ for (auto &Operands : MultiNodeOps) {
+ FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+ if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+ Instruction::Load)
+ Mode.push_back(OpMode::Load);
+ else
+ Mode.push_back(OpMode::Opcode);
+ }
+
+ for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+ LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
+ SmallPtrSet<VPValue *, 4> Candidates;
+ LLVM_DEBUG(dbgs() << " Candidates ");
+ for (auto Ops : MultiNodeOps) {
+ LLVM_DEBUG(
+ dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+ << " ");
+ Candidates.insert(Ops.second[Lane]);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+ LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
+ if (Mode[Op] == OpMode::Failed)
+ continue;
+
+ VPValue *Last = FinalOrder[Op].second[Lane - 1];
+ std::pair<OpMode, VPValue *> Res =
+ getBest(Mode[Op], Last, Candidates, IAI);
+ if (Res.second)
+ FinalOrder[Op].second.push_back(Res.second);
+ else
+ // TODO: handle this case
+ FinalOrder[Op].second.push_back(markFailed());
+ }
+ }
+
+ return FinalOrder;
+}
+
+void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
+ dbgs() << " Ops: ";
+ for (auto Op : Values) {
+ if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
+ if (auto *Instr = VPInstr->getUnderlyingInstr()) {
+ dbgs() << *Instr << " | ";
+ continue;
+ }
+ dbgs() << " nullptr | ";
+ }
+ dbgs() << "\n";
+}
+
+VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
+ assert(!Values.empty() && "Need some operands!");
+
+ // If we already visited this instruction bundle, re-use the existing node
+ auto I = BundleToCombined.find(to_vector<4>(Values));
+ if (I != BundleToCombined.end()) {
+#ifndef NDEBUG
+ // Check that the resulting graph is a tree. If we re-use a node, this means
+ // its values have multiple users. We only allow this if all users of each
+ // value are the same instruction.
+ for (auto *V : Values) {
+ auto UI = V->user_begin();
+ auto *FirstUser = *UI++;
+ while (UI != V->user_end()) {
+ assert(*UI == FirstUser && "Currently we only support SLP trees.");
+ UI++;
+ }
+ }
+#endif
+ return I->second;
+ }
+
+ // Dump inputs
+ LLVM_DEBUG({
+ dbgs() << "buildGraph: ";
+ dumpBundle(Values);
+ });
+
+ if (!areVectorizable(Values))
+ return markFailed();
+
+ assert(getOpcode(Values) && "Opcodes for all values must match");
+ unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+ SmallVector<VPValue *, 4> CombinedOperands;
+ if (areCommutative(Values)) {
+ bool MultiNodeRoot = !MultiNodeActive;
+ MultiNodeActive = true;
+ for (auto &Operands : getOperands(Values)) {
+ LLVM_DEBUG({
+ dbgs() << " Visiting Commutative";
+ dumpBundle(Operands);
+ });
+
+ auto OperandsOpcode = getOpcode(Operands);
+ if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+ LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
+ CombinedOperands.push_back(buildGraph(Operands));
+ } else {
+ LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
+ // Create a dummy VPInstruction, which we will replace later with the
+ // re-ordered operand.
+ VPInstruction *Op = new VPInstruction(0, {});
+ CombinedOperands.push_back(Op);
+ MultiNodeOps.emplace_back(Op, Operands);
+ }
+ }
+
+ if (MultiNodeRoot) {
+ LLVM_DEBUG(dbgs() << "Reorder \n");
+ MultiNodeActive = false;
+
+ auto FinalOrder = reorderMultiNodeOps();
+
+ MultiNodeOps.clear();
+ for (auto &Ops : FinalOrder) {
+ VPInstruction *NewOp = buildGraph(Ops.second);
+ Ops.first->replaceAllUsesWith(NewOp);
+ for (unsigned i = 0; i < CombinedOperands.size(); i++)
+ if (CombinedOperands[i] == Ops.first)
+ CombinedOperands[i] = NewOp;
+ delete Ops.first;
+ Ops.first = NewOp;
+ }
+ LLVM_DEBUG(dbgs() << "Found final order\n");
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " NonCommuntative\n");
+ if (ValuesOpcode == Instruction::Load)
+ for (VPValue *V : Values)
+ CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+ else
+ for (auto &Operands : getOperands(Values))
+ CombinedOperands.push_back(buildGraph(Operands));
+ }
+
+ unsigned Opcode;
+ switch (ValuesOpcode) {
+ case Instruction::Load:
+ Opcode = VPInstruction::SLPLoad;
+ break;
+ case Instruction::Store:
+ Opcode = VPInstruction::SLPStore;
+ break;
+ default:
+ Opcode = ValuesOpcode;
+ break;
+ }
+
+ if (!CompletelySLP)
+ return markFailed();
+
+ assert(CombinedOperands.size() > 0 && "Need at least one operand");
+ auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+ VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
<< *cast<VPInstruction>(Values[0]) << "\n");
- addCombined(Values, VPI);
- return VPI;
-}
+ addCombined(Values, VPI);
+ return VPI;
+}
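The recursive look-ahead scoring in getLAScore above is easier to see on a toy node type. Here is a
hedged sketch with LANode standing in for VPInstruction; the struct and the opcode-equality base
case are simplifying assumptions made only for illustration.

#include <vector>

// Toy node: an opcode plus operand links, standing in for VPInstruction.
struct LANode {
  unsigned Opcode;
  std::vector<const LANode *> Ops;
};

// Score 1 when two nodes match at the bottom of the look-ahead window;
// otherwise recurse over every operand pair and accumulate the scores,
// mirroring the shape of getLAScore above.
static unsigned lookAheadScore(const LANode *A, const LANode *B,
                               unsigned MaxLevel) {
  if (!A || !B)
    return 0;
  if (MaxLevel == 0)
    return A->Opcode == B->Opcode ? 1u : 0u;
  unsigned Score = 0;
  for (const LANode *OpA : A->Ops)
    for (const LANode *OpB : B->Ops)
      Score += lookAheadScore(OpA, OpB, MaxLevel - 1);
  return Score;
}

getBest then prefers the candidate with the highest such score, probing one level deeper
(up to LookaheadMaxDepth) while the candidates remain tied.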
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 640ca7160b..1a54603faf 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1,91 +1,91 @@
-//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements a set of utility VPlan to VPlan transformations.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanTransforms.h"
-#include "llvm/ADT/PostOrderIterator.h"
-
-using namespace llvm;
-
-void VPlanTransforms::VPInstructionsToVPRecipes(
- Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
- auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
- ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
-
- // Condition bit VPValues get deleted during transformation to VPRecipes.
- // Create new VPValues and save away as condition bits. These will be deleted
- // after finalizing the vector IR basic blocks.
- for (VPBlockBase *Base : RPOT) {
- VPBasicBlock *VPBB = Base->getEntryBasicBlock();
- if (auto *CondBit = VPBB->getCondBit()) {
- auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
- VPBB->setCondBit(NCondBit);
- Plan->addCBV(NCondBit);
- }
- }
- for (VPBlockBase *Base : RPOT) {
- // Do not widen instructions in pre-header and exit blocks.
- if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
- continue;
-
- VPBasicBlock *VPBB = Base->getEntryBasicBlock();
- // Introduce each ingredient into VPlan.
- for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
- VPRecipeBase *Ingredient = &*I++;
- // Can only handle VPInstructions.
- VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
- Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
- if (DeadInstructions.count(Inst)) {
+//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a set of utility VPlan to VPlan transformations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanTransforms.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+void VPlanTransforms::VPInstructionsToVPRecipes(
+ Loop *OrigLoop, VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList &Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
+ auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+
+ // Condition bit VPValues get deleted during transformation to VPRecipes.
+ // Create new VPValues and save them away as condition bits. These will be deleted
+ // after finalizing the vector IR basic blocks.
+ for (VPBlockBase *Base : RPOT) {
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ if (auto *CondBit = VPBB->getCondBit()) {
+ auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
+ VPBB->setCondBit(NCondBit);
+ Plan->addCBV(NCondBit);
+ }
+ }
+ for (VPBlockBase *Base : RPOT) {
+ // Do not widen instructions in pre-header and exit blocks.
+ if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
+ continue;
+
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ // Introduce each ingredient into VPlan.
+ for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
+ VPRecipeBase *Ingredient = &*I++;
+ // Can only handle VPInstructions.
+ VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
+ Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ if (DeadInstructions.count(Inst)) {
VPValue DummyValue;
VPInst->replaceAllUsesWith(&DummyValue);
- Ingredient->eraseFromParent();
- continue;
- }
-
- VPRecipeBase *NewRecipe = nullptr;
- // Create VPWidenMemoryInstructionRecipe for loads and stores.
- if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
- NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- nullptr /*Mask*/);
- else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
- NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
- else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
- InductionDescriptor II = Inductions.lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction) {
+ Ingredient->eraseFromParent();
+ continue;
+ }
+
+ VPRecipeBase *NewRecipe = nullptr;
+ // Create VPWidenMemoryInstructionRecipe for loads and stores.
+ if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(
+ *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ nullptr /*Mask*/);
+ else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(
+ *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
+ else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+ InductionDescriptor II = Inductions.lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start);
- } else
- NewRecipe = new VPWidenPHIRecipe(Phi);
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
- NewRecipe = new VPWidenGEPRecipe(
- GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
- } else
- NewRecipe =
- new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
-
- NewRecipe->insertBefore(Ingredient);
+ } else
+ NewRecipe = new VPWidenPHIRecipe(Phi);
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+ NewRecipe = new VPWidenGEPRecipe(
+ GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+ } else
+ NewRecipe =
+ new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
+
+ NewRecipe->insertBefore(Ingredient);
if (NewRecipe->getNumDefinedValues() == 1)
VPInst->replaceAllUsesWith(NewRecipe->getVPValue());
else
assert(NewRecipe->getNumDefinedValues() == 0 &&
"Only recpies with zero or one defined values expected");
- Ingredient->eraseFromParent();
- }
- }
-}
+ Ingredient->eraseFromParent();
+ }
+ }
+}
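To summarize the dispatch performed by VPInstructionsToVPRecipes above, the following sketch
classifies an ingredient's IR instruction into the kind of widening recipe that replaces it.
RecipeKind is an illustrative enum, not part of the transform; the real code constructs the
VPWiden*Recipe objects directly.

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

// Recipe kinds in the order the transform above checks for them.
enum class RecipeKind { Memory, IntOrFpInduction, Phi, GEP, Widen };

static RecipeKind classifyIngredient(const llvm::Instruction &I,
                                     bool IsIntOrFpInduction) {
  using namespace llvm;
  if (isa<LoadInst>(I) || isa<StoreInst>(I))
    return RecipeKind::Memory;                 // VPWidenMemoryInstructionRecipe
  if (isa<PHINode>(I))
    return IsIntOrFpInduction
               ? RecipeKind::IntOrFpInduction  // VPWidenIntOrFpInductionRecipe
               : RecipeKind::Phi;              // VPWidenPHIRecipe
  if (isa<GetElementPtrInst>(I))
    return RecipeKind::GEP;                    // VPWidenGEPRecipe
  return RecipeKind::Widen;                    // VPWidenRecipe
}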
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h
index 15cc7d355f..4b20e8b4e3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -1,33 +1,33 @@
-//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides utility VPlan to VPlan transformations.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
-
-#include "VPlan.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
-
-namespace llvm {
-
-struct VPlanTransforms {
- /// Replaces the VPInstructions in \p Plan with corresponding
- /// widen recipes.
- static void VPInstructionsToVPRecipes(
- Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
+//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility VPlan to VPlan transformations.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
+
+#include "VPlan.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+
+namespace llvm {
+
+struct VPlanTransforms {
+ /// Replaces the VPInstructions in \p Plan with corresponding
+ /// widen recipes.
+ static void VPInstructionsToVPRecipes(
+ Loop *OrigLoop, VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList &Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
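A hypothetical call site for the interface declared above; the arguments are assumed to be supplied
by the loop-vectorizer driver and are only forwarded here, so nothing beyond the declared signature
is relied upon.

#include "VPlanTransforms.h"

// Replace the plain VPInstructions in Plan with widening recipes in place.
static void lowerToRecipes(
    llvm::Loop *OrigLoop, llvm::VPlanPtr &Plan,
    llvm::LoopVectorizationLegality::InductionList &Inductions,
    llvm::SmallPtrSetImpl<llvm::Instruction *> &DeadInstructions) {
  llvm::VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, Inductions,
                                                   DeadInstructions);
}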
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h
index dbf04d3707..ed572ca366 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h
@@ -1,93 +1,93 @@
-//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the declarations of the entities induced by Vectorization
-/// Plans, e.g. the instructions the VPlan intends to generate if executed.
-/// VPlan models the following entities:
+//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations of the entities induced by Vectorization
+/// Plans, e.g. the instructions the VPlan intends to generate if executed.
+/// VPlan models the following entities:
/// VPValue VPUser VPDef
///    |      |
///   VPInstruction
-/// These are documented in docs/VectorizationPlan.rst.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
-
-#include "llvm/ADT/DenseMap.h"
+/// These are documented in docs/VectorizationPlan.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/ADT/iterator_range.h"
-
-namespace llvm {
-
-// Forward declarations.
-class raw_ostream;
-class Value;
+#include "llvm/ADT/iterator_range.h"
+
+namespace llvm {
+
+// Forward declarations.
+class raw_ostream;
+class Value;
class VPDef;
-class VPSlotTracker;
-class VPUser;
+class VPSlotTracker;
+class VPUser;
class VPRecipeBase;
class VPWidenMemoryInstructionRecipe;
-
-// This is the base class of the VPlan Def/Use graph, used for modeling the data
-// flow into, within and out of the VPlan. VPValues can stand for live-ins
-// coming from the input IR, instructions which VPlan will generate if executed
-// and live-outs which the VPlan will need to fix accordingly.
-class VPValue {
- friend class VPBuilder;
+
+// This is the base class of the VPlan Def/Use graph, used for modeling the data
+// flow into, within and out of the VPlan. VPValues can stand for live-ins
+// coming from the input IR, instructions which VPlan will generate if executed
+// and live-outs which the VPlan will need to fix accordingly.
+class VPValue {
+ friend class VPBuilder;
friend class VPDef;
friend class VPInstruction;
- friend struct VPlanTransforms;
- friend class VPBasicBlock;
- friend class VPInterleavedAccessInfo;
- friend class VPSlotTracker;
+ friend struct VPlanTransforms;
+ friend class VPBasicBlock;
+ friend class VPInterleavedAccessInfo;
+ friend class VPSlotTracker;
friend class VPRecipeBase;
friend class VPWidenMemoryInstructionRecipe;
-
- const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
-
- SmallVector<VPUser *, 1> Users;
-
-protected:
- // Hold the underlying Value, if any, attached to this VPValue.
- Value *UnderlyingVal;
-
+
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ SmallVector<VPUser *, 1> Users;
+
+protected:
+ // Hold the underlying Value, if any, attached to this VPValue.
+ Value *UnderlyingVal;
+
/// Pointer to the VPDef that defines this VPValue. If it is nullptr, the
/// VPValue is not defined by any recipe modeled in VPlan.
VPDef *Def;
-
+
VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr);
- // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
- // the front-end and back-end of VPlan so that the middle-end is as
- // independent as possible of the underlying IR. We grant access to the
- // underlying IR using friendship. In that way, we should be able to use VPlan
- // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
- // back-end and analysis information for the new IR.
-
- // Set \p Val as the underlying Value of this VPValue.
- void setUnderlyingValue(Value *Val) {
- assert(!UnderlyingVal && "Underlying Value is already set.");
- UnderlyingVal = Val;
- }
-
-public:
+ // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
+ // the front-end and back-end of VPlan so that the middle-end is as
+ // independent as possible of the underlying IR. We grant access to the
+ // underlying IR using friendship. In that way, we should be able to use VPlan
+ // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
+ // back-end and analysis information for the new IR.
+
+ // Set \p Val as the underlying Value of this VPValue.
+ void setUnderlyingValue(Value *Val) {
+ assert(!UnderlyingVal && "Underlying Value is already set.");
+ UnderlyingVal = Val;
+ }
+
+public:
/// Return the underlying Value attached to this VPValue.
Value *getUnderlyingValue() { return UnderlyingVal; }
const Value *getUnderlyingValue() const { return UnderlyingVal; }
- /// An enumeration for keeping track of the concrete subclass of VPValue that
- /// are actually instantiated. Values of this enumeration are kept in the
- /// SubclassID field of the VPValue objects. They are used for concrete
- /// type identification.
+ /// An enumeration for keeping track of the concrete subclass of VPValue that
+ /// are actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPValue objects. They are used for concrete
+ /// type identification.
enum {
VPValueSC,
VPVInstructionSC,
@@ -99,28 +99,28 @@ public:
VPVWidenGEPSC,
VPVWidenSelectSC,
};
-
+
VPValue(Value *UV = nullptr, VPDef *Def = nullptr)
: VPValue(VPValueSC, UV, Def) {}
- VPValue(const VPValue &) = delete;
- VPValue &operator=(const VPValue &) = delete;
-
+ VPValue(const VPValue &) = delete;
+ VPValue &operator=(const VPValue &) = delete;
+
virtual ~VPValue();
- /// \return an ID for the concrete type of this object.
- /// This is used to implement the classof checks. This should not be used
- /// for any other purpose, as the values may change as LLVM evolves.
- unsigned getVPValueID() const { return SubclassID; }
-
- void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const;
- void print(raw_ostream &OS, VPSlotTracker &Tracker) const;
-
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPValueID() const { return SubclassID; }
+
+ void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const;
+ void print(raw_ostream &OS, VPSlotTracker &Tracker) const;
+
/// Dump the value to stderr (for debugging).
void dump() const;
- unsigned getNumUsers() const { return Users.size(); }
- void addUser(VPUser &User) { Users.push_back(&User); }
-
+ unsigned getNumUsers() const { return Users.size(); }
+ void addUser(VPUser &User) { Users.push_back(&User); }
+
/// Remove a single \p User from the list of users.
void removeUser(VPUser &User) {
bool Found = false;
@@ -137,33 +137,33 @@ public:
});
}
- typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
- typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
- typedef iterator_range<user_iterator> user_range;
- typedef iterator_range<const_user_iterator> const_user_range;
-
- user_iterator user_begin() { return Users.begin(); }
- const_user_iterator user_begin() const { return Users.begin(); }
- user_iterator user_end() { return Users.end(); }
- const_user_iterator user_end() const { return Users.end(); }
- user_range users() { return user_range(user_begin(), user_end()); }
- const_user_range users() const {
- return const_user_range(user_begin(), user_end());
- }
-
- /// Returns true if the value has more than one unique user.
- bool hasMoreThanOneUniqueUser() {
- if (getNumUsers() == 0)
- return false;
-
- // Check if all users match the first user.
- auto Current = std::next(user_begin());
- while (Current != user_end() && *user_begin() == *Current)
- Current++;
- return Current != user_end();
- }
-
- void replaceAllUsesWith(VPValue *New);
+ typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
+ typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
+ typedef iterator_range<user_iterator> user_range;
+ typedef iterator_range<const_user_iterator> const_user_range;
+
+ user_iterator user_begin() { return Users.begin(); }
+ const_user_iterator user_begin() const { return Users.begin(); }
+ user_iterator user_end() { return Users.end(); }
+ const_user_iterator user_end() const { return Users.end(); }
+ user_range users() { return user_range(user_begin(), user_end()); }
+ const_user_range users() const {
+ return const_user_range(user_begin(), user_end());
+ }
+
+ /// Returns true if the value has more than one unique user.
+ bool hasMoreThanOneUniqueUser() {
+ if (getNumUsers() == 0)
+ return false;
+
+ // Check if all users match the first user.
+ auto Current = std::next(user_begin());
+ while (Current != user_end() && *user_begin() == *Current)
+ Current++;
+ return Current != user_end();
+ }
+
+ void replaceAllUsesWith(VPValue *New);
VPDef *getDef() { return Def; }
@@ -175,77 +175,77 @@ public:
"VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
return getUnderlyingValue();
}
-};
-
-typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
-typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
-
-raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
-
-/// This class augments VPValue with operands which provide the inverse def-use
-/// edges from VPValue's users to their defs.
+};
+
+typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
+typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
+
+raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
+
+/// This class augments VPValue with operands which provide the inverse def-use
+/// edges from VPValue's users to their defs.
class VPUser {
- SmallVector<VPValue *, 2> Operands;
-
-protected:
+ SmallVector<VPValue *, 2> Operands;
+
+protected:
/// Print the operands to \p O.
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const;
public:
VPUser() {}
VPUser(ArrayRef<VPValue *> Operands) {
- for (VPValue *Operand : Operands)
- addOperand(Operand);
- }
-
- VPUser(std::initializer_list<VPValue *> Operands)
- : VPUser(ArrayRef<VPValue *>(Operands)) {}
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
+ VPUser(std::initializer_list<VPValue *> Operands)
+ : VPUser(ArrayRef<VPValue *>(Operands)) {}
template <typename IterT> VPUser(iterator_range<IterT> Operands) {
- for (VPValue *Operand : Operands)
- addOperand(Operand);
- }
-
- VPUser(const VPUser &) = delete;
- VPUser &operator=(const VPUser &) = delete;
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
+ VPUser(const VPUser &) = delete;
+ VPUser &operator=(const VPUser &) = delete;
virtual ~VPUser() {
for (VPValue *Op : operands())
Op->removeUser(*this);
- }
-
- void addOperand(VPValue *Operand) {
- Operands.push_back(Operand);
- Operand->addUser(*this);
- }
-
- unsigned getNumOperands() const { return Operands.size(); }
- inline VPValue *getOperand(unsigned N) const {
- assert(N < Operands.size() && "Operand index out of bounds");
- return Operands[N];
- }
-
+ }
+
+ void addOperand(VPValue *Operand) {
+ Operands.push_back(Operand);
+ Operand->addUser(*this);
+ }
+
+ unsigned getNumOperands() const { return Operands.size(); }
+ inline VPValue *getOperand(unsigned N) const {
+ assert(N < Operands.size() && "Operand index out of bounds");
+ return Operands[N];
+ }
+
void setOperand(unsigned I, VPValue *New) {
Operands[I]->removeUser(*this);
Operands[I] = New;
New->addUser(*this);
}
-
- typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
- typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
- typedef iterator_range<operand_iterator> operand_range;
- typedef iterator_range<const_operand_iterator> const_operand_range;
-
- operand_iterator op_begin() { return Operands.begin(); }
- const_operand_iterator op_begin() const { return Operands.begin(); }
- operand_iterator op_end() { return Operands.end(); }
- const_operand_iterator op_end() const { return Operands.end(); }
- operand_range operands() { return operand_range(op_begin(), op_end()); }
- const_operand_range operands() const {
- return const_operand_range(op_begin(), op_end());
- }
+
+ typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
+ typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
+ typedef iterator_range<operand_iterator> operand_range;
+ typedef iterator_range<const_operand_iterator> const_operand_range;
+
+ operand_iterator op_begin() { return Operands.begin(); }
+ const_operand_iterator op_begin() const { return Operands.begin(); }
+ operand_iterator op_end() { return Operands.end(); }
+ const_operand_iterator op_end() const { return Operands.end(); }
+ operand_range operands() { return operand_range(op_begin(), op_end()); }
+ const_operand_range operands() const {
+ return const_operand_range(op_begin(), op_end());
+ }
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *Recipe);
-};
+};
/// This class augments a recipe with a set of VPValues defined by the recipe.
/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns
@@ -346,38 +346,38 @@ public:
VPSlotTracker &SlotTracker) const = 0;
};
-class VPlan;
-class VPBasicBlock;
-class VPRegionBlock;
-
-/// This class can be used to assign consecutive numbers to all VPValues in a
-/// VPlan and allows querying the numbering for printing, similar to the
-/// ModuleSlotTracker for IR values.
-class VPSlotTracker {
- DenseMap<const VPValue *, unsigned> Slots;
- unsigned NextSlot = 0;
-
- void assignSlots(const VPBlockBase *VPBB);
- void assignSlots(const VPRegionBlock *Region);
- void assignSlots(const VPBasicBlock *VPBB);
- void assignSlot(const VPValue *V);
-
- void assignSlots(const VPlan &Plan);
-
-public:
+class VPlan;
+class VPBasicBlock;
+class VPRegionBlock;
+
+/// This class can be used to assign consecutive numbers to all VPValues in a
+/// VPlan and allows querying the numbering for printing, similar to the
+/// ModuleSlotTracker for IR values.
+class VPSlotTracker {
+ DenseMap<const VPValue *, unsigned> Slots;
+ unsigned NextSlot = 0;
+
+ void assignSlots(const VPBlockBase *VPBB);
+ void assignSlots(const VPRegionBlock *Region);
+ void assignSlots(const VPBasicBlock *VPBB);
+ void assignSlot(const VPValue *V);
+
+ void assignSlots(const VPlan &Plan);
+
+public:
VPSlotTracker(const VPlan *Plan = nullptr) {
- if (Plan)
- assignSlots(*Plan);
- }
-
- unsigned getSlot(const VPValue *V) const {
- auto I = Slots.find(V);
- if (I == Slots.end())
- return -1;
- return I->second;
- }
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+ if (Plan)
+ assignSlots(*Plan);
+ }
+
+ unsigned getSlot(const VPValue *V) const {
+ auto I = Slots.find(V);
+ if (I == Slots.end())
+ return -1;
+ return I->second;
+ }
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
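The check documented in VPValue::hasMoreThanOneUniqueUser above can be restated over a plain
container. A standalone sketch, not using the VPlan classes:

#include <vector>

// A value has more than one *unique* user exactly when some entry in its user
// list differs from the first one; repeated uses by the same user do not count.
template <typename UserT>
bool hasMoreThanOneUniqueUserSketch(const std::vector<UserT *> &Users) {
  if (Users.empty())
    return false;
  for (UserT *U : Users)
    if (U != Users.front())
      return true;
  return false;
}

A value used three times by the same VPUser therefore still counts as having a single unique user,
which is what allows VPlanSlp::buildGraph to re-use a combined node for repeated bundles.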
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7a602fb146..6eec8d14de 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -1,130 +1,130 @@
-//===-- VPlanVerifier.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the class VPlanVerifier, which contains utility functions
-/// to check the consistency and invariants of a VPlan.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanVerifier.h"
-#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Support/CommandLine.h"
-
-#define DEBUG_TYPE "loop-vectorize"
-
-using namespace llvm;
-
-static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
- cl::Hidden,
- cl::desc("Verify VPlan H-CFG."));
-
-#ifndef NDEBUG
-/// Utility function that checks whether \p VPBlockVec has duplicate
-/// VPBlockBases.
-static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
- SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
- for (const auto *Block : VPBlockVec) {
- if (VPBlockSet.count(Block))
- return true;
- VPBlockSet.insert(Block);
- }
- return false;
-}
-#endif
-
-/// Helper function that verifies the CFG invariants of the VPBlockBases within
-/// \p Region. Checks in this function are generic for VPBlockBases. They are
-/// not specific for VPBasicBlocks or VPRegionBlocks.
-static void verifyBlocksInRegion(const VPRegionBlock *Region) {
- for (const VPBlockBase *VPB :
- make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
- df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
- // Check block's parent.
- assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
-
- // Check block's condition bit.
- if (VPB->getNumSuccessors() > 1)
- assert(VPB->getCondBit() && "Missing condition bit!");
- else
- assert(!VPB->getCondBit() && "Unexpected condition bit!");
-
- // Check block's successors.
- const auto &Successors = VPB->getSuccessors();
- // There must be only one instance of a successor in block's successor list.
- // TODO: This won't work for switch statements.
- assert(!hasDuplicates(Successors) &&
- "Multiple instances of the same successor.");
-
- for (const VPBlockBase *Succ : Successors) {
- // There must be a bi-directional link between block and successor.
- const auto &SuccPreds = Succ->getPredecessors();
+//===-- VPlanVerifier.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class VPlanVerifier, which contains utility functions
+/// to check the consistency and invariants of a VPlan.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanVerifier.h"
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/CommandLine.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
+ cl::Hidden,
+ cl::desc("Verify VPlan H-CFG."));
+
+#ifndef NDEBUG
+/// Utility function that checks whether \p VPBlockVec has duplicate
+/// VPBlockBases.
+static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
+ SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
+ for (const auto *Block : VPBlockVec) {
+ if (VPBlockSet.count(Block))
+ return true;
+ VPBlockSet.insert(Block);
+ }
+ return false;
+}
+#endif
+
+/// Helper function that verifies the CFG invariants of the VPBlockBases within
+/// \p Region. Checks in this function are generic for VPBlockBases. They are
+/// not specific for VPBasicBlocks or VPRegionBlocks.
+static void verifyBlocksInRegion(const VPRegionBlock *Region) {
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ // Check block's parent.
+ assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+
+ // Check block's condition bit.
+ if (VPB->getNumSuccessors() > 1)
+ assert(VPB->getCondBit() && "Missing condition bit!");
+ else
+ assert(!VPB->getCondBit() && "Unexpected condition bit!");
+
+ // Check block's successors.
+ const auto &Successors = VPB->getSuccessors();
+ // There must be only one instance of a successor in block's successor list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Successors) &&
+ "Multiple instances of the same successor.");
+
+ for (const VPBlockBase *Succ : Successors) {
+ // There must be a bi-directional link between block and successor.
+ const auto &SuccPreds = Succ->getPredecessors();
assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link.");
- (void)SuccPreds;
- }
-
- // Check block's predecessors.
- const auto &Predecessors = VPB->getPredecessors();
- // There must be only one instance of a predecessor in block's predecessor
- // list.
- // TODO: This won't work for switch statements.
- assert(!hasDuplicates(Predecessors) &&
- "Multiple instances of the same predecessor.");
-
- for (const VPBlockBase *Pred : Predecessors) {
- // Block and predecessor must be inside the same region.
- assert(Pred->getParent() == VPB->getParent() &&
- "Predecessor is not in the same region.");
-
- // There must be a bi-directional link between block and predecessor.
- const auto &PredSuccs = Pred->getSuccessors();
+ (void)SuccPreds;
+ }
+
+ // Check block's predecessors.
+ const auto &Predecessors = VPB->getPredecessors();
+ // There must be only one instance of a predecessor in block's predecessor
+ // list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Predecessors) &&
+ "Multiple instances of the same predecessor.");
+
+ for (const VPBlockBase *Pred : Predecessors) {
+ // Block and predecessor must be inside the same region.
+ assert(Pred->getParent() == VPB->getParent() &&
+ "Predecessor is not in the same region.");
+
+ // There must be a bi-directional link between block and predecessor.
+ const auto &PredSuccs = Pred->getSuccessors();
assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link.");
- (void)PredSuccs;
- }
- }
-}
-
-/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
-/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
-static void verifyRegion(const VPRegionBlock *Region) {
- const VPBlockBase *Entry = Region->getEntry();
- const VPBlockBase *Exit = Region->getExit();
-
- // Entry and Exit shouldn't have any predecessor/successor, respectively.
- assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
- assert(!Exit->getNumSuccessors() && "Region exit has successors.");
- (void)Entry;
- (void)Exit;
-
- verifyBlocksInRegion(Region);
-}
-
-/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
-/// VPBlockBases. Recurse inside nested VPRegionBlocks.
-static void verifyRegionRec(const VPRegionBlock *Region) {
- verifyRegion(Region);
-
- // Recurse inside nested regions.
- for (const VPBlockBase *VPB :
- make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
- df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
- if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
- verifyRegionRec(SubRegion);
- }
-}
-
-void VPlanVerifier::verifyHierarchicalCFG(
- const VPRegionBlock *TopRegion) const {
- if (!EnableHCFGVerifier)
- return;
-
- LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
- assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
- verifyRegionRec(TopRegion);
-}
+ (void)PredSuccs;
+ }
+ }
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+static void verifyRegion(const VPRegionBlock *Region) {
+ const VPBlockBase *Entry = Region->getEntry();
+ const VPBlockBase *Exit = Region->getExit();
+
+ // Entry and Exit shouldn't have any predecessor/successor, respectively.
+ assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
+ assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+ (void)Entry;
+ (void)Exit;
+
+ verifyBlocksInRegion(Region);
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Recurse inside nested VPRegionBlocks.
+static void verifyRegionRec(const VPRegionBlock *Region) {
+ verifyRegion(Region);
+
+ // Recurse inside nested regions.
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
+ verifyRegionRec(SubRegion);
+ }
+}
+
+void VPlanVerifier::verifyHierarchicalCFG(
+ const VPRegionBlock *TopRegion) const {
+ if (!EnableHCFGVerifier)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
+ assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
+ verifyRegionRec(TopRegion);
+}
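
The verifier restored above asserts, block by block, that successor/predecessor lists contain no duplicates and that every edge is recorded on both of its endpoints. A rough standalone sketch of those two checks on a toy adjacency-list block type follows; the Block struct is a made-up stand-in for VPBlockBase, not LLVM code.

// Sketch of the "no duplicate successors/predecessors" and "bi-directional
// link" invariants checked by VPlanVerifier, on a toy block type.
#include <algorithm>
#include <cassert>
#include <unordered_set>
#include <vector>

struct Block {
  std::vector<Block *> Succs;
  std::vector<Block *> Preds;
};

static bool hasDuplicates(const std::vector<Block *> &Vec) {
  std::unordered_set<const Block *> Seen;
  for (const Block *B : Vec)
    if (!Seen.insert(B).second)
      return true;
  return false;
}

static void verifyBlock(const Block *B) {
  assert(!hasDuplicates(B->Succs) && "Multiple instances of the same successor.");
  assert(!hasDuplicates(B->Preds) && "Multiple instances of the same predecessor.");
  // Every successor must list B as a predecessor, and vice versa.
  for (const Block *Succ : B->Succs)
    assert(std::find(Succ->Preds.begin(), Succ->Preds.end(), B) !=
               Succ->Preds.end() && "Missing predecessor link.");
  for (const Block *Pred : B->Preds)
    assert(std::find(Pred->Succs.begin(), Pred->Succs.end(), B) !=
               Pred->Succs.end() && "Missing successor link.");
}

int main() {
  Block A, B;
  A.Succs.push_back(&B);
  B.Preds.push_back(&A); // bi-directional link is in place
  verifyBlock(&A);
  verifyBlock(&B);
  return 0;
}
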
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h
index 75a92a8d12..8e8de44164 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -1,41 +1,41 @@
-//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file declares the class VPlanVerifier, which contains utility functions
-/// to check the consistency of a VPlan. This includes the following kinds of
-/// invariants:
-///
-/// 1. Region/Block invariants:
-/// - Region's entry/exit block must have no predecessors/successors,
-/// respectively.
-/// - Block's parent must be the region immediately containing the block.
-/// - Linked blocks must have a bi-directional link (successor/predecessor).
-/// - All predecessors/successors of a block must belong to the same region.
-/// - Blocks must have no duplicated successor/predecessor.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
-
-namespace llvm {
-class VPRegionBlock;
-
-/// Struct with utility functions that can be used to check the consistency and
-/// invariants of a VPlan, including the components of its H-CFG.
-struct VPlanVerifier {
- /// Verify the invariants of the H-CFG starting from \p TopRegion. The
- /// verification process comprises the following steps:
- /// 1. Region/Block verification: Check the Region/Block verification
- /// invariants for every region in the H-CFG.
- void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
-};
-} // namespace llvm
-
-#endif //LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the class VPlanVerifier, which contains utility functions
+/// to check the consistency of a VPlan. This includes the following kinds of
+/// invariants:
+///
+/// 1. Region/Block invariants:
+/// - Region's entry/exit block must have no predecessors/successors,
+/// respectively.
+/// - Block's parent must be the region immediately containing the block.
+/// - Linked blocks must have a bi-directional link (successor/predecessor).
+/// - All predecessors/successors of a block must belong to the same region.
+/// - Blocks must have no duplicated successor/predecessor.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+
+namespace llvm {
+class VPRegionBlock;
+
+/// Struct with utility functions that can be used to check the consistency and
+/// invariants of a VPlan, including the components of its H-CFG.
+struct VPlanVerifier {
+ /// Verify the invariants of the H-CFG starting from \p TopRegion. The
+ /// verification process comprises the following steps:
+ /// 1. Region/Block verification: Check the Region/Block verification
+ /// invariants for every region in the H-CFG.
+ void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+};
+} // namespace llvm
+
+#endif //LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp
index 815b5eadbd..787f146bdd 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1,96 +1,96 @@
-//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass optimizes scalar/vector interactions using target cost models. The
-// transforms implemented here may not fit in traditional loop-based or SLP
-// vectorization passes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize/VectorCombine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
+//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes scalar/vector interactions using target cost models. The
+// transforms implemented here may not fit in traditional loop-based or SLP
+// vectorization passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "vector-combine"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecLoad, "Number of vector loads formed");
-STATISTIC(NumVecCmp, "Number of vector compares formed");
-STATISTIC(NumVecBO, "Number of vector binops formed");
-STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
-STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
-STATISTIC(NumScalarBO, "Number of scalar binops formed");
-STATISTIC(NumScalarCmp, "Number of scalar compares formed");
-
-static cl::opt<bool> DisableVectorCombine(
- "disable-vector-combine", cl::init(false), cl::Hidden,
- cl::desc("Disable all vector combine transforms"));
-
-static cl::opt<bool> DisableBinopExtractShuffle(
- "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
- cl::desc("Disable binop extract to shuffle transforms"));
-
-static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
-
-namespace {
-class VectorCombine {
-public:
- VectorCombine(Function &F, const TargetTransformInfo &TTI,
- const DominatorTree &DT)
- : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
-
- bool run();
-
-private:
- Function &F;
- IRBuilder<> Builder;
- const TargetTransformInfo &TTI;
- const DominatorTree &DT;
-
+STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
+STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
+STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
+STATISTIC(NumScalarBO, "Number of scalar binops formed");
+STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+
+static cl::opt<bool> DisableVectorCombine(
+ "disable-vector-combine", cl::init(false), cl::Hidden,
+ cl::desc("Disable all vector combine transforms"));
+
+static cl::opt<bool> DisableBinopExtractShuffle(
+ "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
+ cl::desc("Disable binop extract to shuffle transforms"));
+
+static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
+
+namespace {
+class VectorCombine {
+public:
+ VectorCombine(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT)
+ : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
+
+ bool run();
+
+private:
+ Function &F;
+ IRBuilder<> Builder;
+ const TargetTransformInfo &TTI;
+ const DominatorTree &DT;
+
bool vectorizeLoadInsert(Instruction &I);
- ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1,
- unsigned PreferredExtractIndex) const;
- bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- unsigned Opcode,
- ExtractElementInst *&ConvertToShuffle,
- unsigned PreferredExtractIndex);
- void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- Instruction &I);
- void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- Instruction &I);
- bool foldExtractExtract(Instruction &I);
- bool foldBitcastShuf(Instruction &I);
- bool scalarizeBinopOrCmp(Instruction &I);
- bool foldExtractedCmps(Instruction &I);
-};
-} // namespace
-
-static void replaceValue(Value &Old, Value &New) {
- Old.replaceAllUsesWith(&New);
- New.takeName(&Old);
-}
-
+ ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex) const;
+ bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex);
+ void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ bool foldExtractExtract(Instruction &I);
+ bool foldBitcastShuf(Instruction &I);
+ bool scalarizeBinopOrCmp(Instruction &I);
+ bool foldExtractedCmps(Instruction &I);
+};
+} // namespace
+
+static void replaceValue(Value &Old, Value &New) {
+ Old.replaceAllUsesWith(&New);
+ New.takeName(&Old);
+}
+
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
@@ -223,628 +223,628 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
return true;
}
-/// Determine which, if any, of the inputs should be replaced by a shuffle
-/// followed by extract from a different index.
-ExtractElementInst *VectorCombine::getShuffleExtract(
- ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- unsigned PreferredExtractIndex = InvalidIndex) const {
- assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
- isa<ConstantInt>(Ext1->getIndexOperand()) &&
- "Expected constant extract indexes");
-
- unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
- unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
-
- // If the extract indexes are identical, no shuffle is needed.
- if (Index0 == Index1)
- return nullptr;
-
- Type *VecTy = Ext0->getVectorOperand()->getType();
- assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
+/// Determine which, if any, of the inputs should be replaced by a shuffle
+/// followed by extract from a different index.
+ExtractElementInst *VectorCombine::getShuffleExtract(
+ ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex = InvalidIndex) const {
+ assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
+ isa<ConstantInt>(Ext1->getIndexOperand()) &&
+ "Expected constant extract indexes");
+
+ unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
+ unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+
+ // If the extract indexes are identical, no shuffle is needed.
+ if (Index0 == Index1)
+ return nullptr;
+
+ Type *VecTy = Ext0->getVectorOperand()->getType();
+ assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
InstructionCost Cost0 =
TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
InstructionCost Cost1 =
TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
-
+
// If both costs are invalid no shuffle is needed
if (!Cost0.isValid() && !Cost1.isValid())
return nullptr;
- // We are extracting from 2 different indexes, so one operand must be shuffled
- // before performing a vector operation and/or extract. The more expensive
- // extract will be replaced by a shuffle.
- if (Cost0 > Cost1)
- return Ext0;
- if (Cost1 > Cost0)
- return Ext1;
-
- // If the costs are equal and there is a preferred extract index, shuffle the
- // opposite operand.
- if (PreferredExtractIndex == Index0)
- return Ext1;
- if (PreferredExtractIndex == Index1)
- return Ext0;
-
- // Otherwise, replace the extract with the higher index.
- return Index0 > Index1 ? Ext0 : Ext1;
-}
-
-/// Compare the relative costs of 2 extracts followed by scalar operation vs.
-/// vector operation(s) followed by extract. Return true if the existing
-/// instructions are cheaper than a vector alternative. Otherwise, return false
-/// and if one of the extracts should be transformed to a shufflevector, set
-/// \p ConvertToShuffle to that extract instruction.
-bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1,
- unsigned Opcode,
- ExtractElementInst *&ConvertToShuffle,
- unsigned PreferredExtractIndex) {
- assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
- isa<ConstantInt>(Ext1->getOperand(1)) &&
- "Expected constant extract indexes");
- Type *ScalarTy = Ext0->getType();
- auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
+ // We are extracting from 2 different indexes, so one operand must be shuffled
+ // before performing a vector operation and/or extract. The more expensive
+ // extract will be replaced by a shuffle.
+ if (Cost0 > Cost1)
+ return Ext0;
+ if (Cost1 > Cost0)
+ return Ext1;
+
+ // If the costs are equal and there is a preferred extract index, shuffle the
+ // opposite operand.
+ if (PreferredExtractIndex == Index0)
+ return Ext1;
+ if (PreferredExtractIndex == Index1)
+ return Ext0;
+
+ // Otherwise, replace the extract with the higher index.
+ return Index0 > Index1 ? Ext0 : Ext1;
+}
+
+/// Compare the relative costs of 2 extracts followed by scalar operation vs.
+/// vector operation(s) followed by extract. Return true if the existing
+/// instructions are cheaper than a vector alternative. Otherwise, return false
+/// and if one of the extracts should be transformed to a shufflevector, set
+/// \p ConvertToShuffle to that extract instruction.
+bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex) {
+ assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
+ isa<ConstantInt>(Ext1->getOperand(1)) &&
+ "Expected constant extract indexes");
+ Type *ScalarTy = Ext0->getType();
+ auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
InstructionCost ScalarOpCost, VectorOpCost;
-
- // Get cost estimates for scalar and vector versions of the operation.
- bool IsBinOp = Instruction::isBinaryOp(Opcode);
- if (IsBinOp) {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
- } else {
- assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
- "Expected a compare");
- ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
- VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
- CmpInst::makeCmpResultType(VecTy));
- }
-
- // Get cost estimates for the extract elements. These costs will factor into
- // both sequences.
- unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
- unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
-
+
+ // Get cost estimates for scalar and vector versions of the operation.
+ bool IsBinOp = Instruction::isBinaryOp(Opcode);
+ if (IsBinOp) {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ } else {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Expected a compare");
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
+ CmpInst::makeCmpResultType(VecTy));
+ }
+
+ // Get cost estimates for the extract elements. These costs will factor into
+ // both sequences.
+ unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
+ unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+
InstructionCost Extract0Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
InstructionCost Extract1Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
-
- // A more expensive extract will always be replaced by a splat shuffle.
- // For example, if Ext0 is more expensive:
- // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
- // extelt (opcode (splat V0, Ext0), V1), Ext1
- // TODO: Evaluate whether that always results in lowest cost. Alternatively,
- // check the cost of creating a broadcast shuffle and shuffling both
- // operands to element 0.
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+
+ // A more expensive extract will always be replaced by a splat shuffle.
+ // For example, if Ext0 is more expensive:
+ // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
+ // extelt (opcode (splat V0, Ext0), V1), Ext1
+ // TODO: Evaluate whether that always results in lowest cost. Alternatively,
+ // check the cost of creating a broadcast shuffle and shuffling both
+ // operands to element 0.
InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
-
- // Extra uses of the extracts mean that we include those costs in the
- // vector total because those instructions will not be eliminated.
+
+ // Extra uses of the extracts mean that we include those costs in the
+ // vector total because those instructions will not be eliminated.
InstructionCost OldCost, NewCost;
- if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
- // Handle a special case. If the 2 extracts are identical, adjust the
- // formulas to account for that. The extra use charge allows for either the
- // CSE'd pattern or an unoptimized form with identical values:
- // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
- bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
- : !Ext0->hasOneUse() || !Ext1->hasOneUse();
- OldCost = CheapExtractCost + ScalarOpCost;
- NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
- } else {
- // Handle the general case. Each extract is actually a different value:
- // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
- OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
- NewCost = VectorOpCost + CheapExtractCost +
- !Ext0->hasOneUse() * Extract0Cost +
- !Ext1->hasOneUse() * Extract1Cost;
- }
-
- ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
- if (ConvertToShuffle) {
- if (IsBinOp && DisableBinopExtractShuffle)
- return true;
-
- // If we are extracting from 2 different indexes, then one operand must be
- // shuffled before performing the vector operation. The shuffle mask is
- // undefined except for 1 lane that is being translated to the remaining
- // extraction lane. Therefore, it is a splat shuffle. Ex:
- // ShufMask = { undef, undef, 0, undef }
- // TODO: The cost model has an option for a "broadcast" shuffle
- // (splat-from-element-0), but no option for a more general splat.
- NewCost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
-
- // Aggressively form a vector op if the cost is equal because the transform
- // may enable further optimization.
- // Codegen can reverse this transform (scalarize) if it was not profitable.
- return OldCost < NewCost;
-}
-
-/// Create a shuffle that translates (shifts) 1 element from the input vector
-/// to a new element location.
-static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
- unsigned NewIndex, IRBuilder<> &Builder) {
- // The shuffle mask is undefined except for 1 lane that is being translated
- // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
- // ShufMask = { 2, undef, undef, undef }
- auto *VecTy = cast<FixedVectorType>(Vec->getType());
- SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
- ShufMask[NewIndex] = OldIndex;
+ if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+ // Handle a special case. If the 2 extracts are identical, adjust the
+ // formulas to account for that. The extra use charge allows for either the
+ // CSE'd pattern or an unoptimized form with identical values:
+ // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
+ bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
+ : !Ext0->hasOneUse() || !Ext1->hasOneUse();
+ OldCost = CheapExtractCost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
+ } else {
+ // Handle the general case. Each extract is actually a different value:
+ // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
+ OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost +
+ !Ext0->hasOneUse() * Extract0Cost +
+ !Ext1->hasOneUse() * Extract1Cost;
+ }
+
+ ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
+ if (ConvertToShuffle) {
+ if (IsBinOp && DisableBinopExtractShuffle)
+ return true;
+
+ // If we are extracting from 2 different indexes, then one operand must be
+ // shuffled before performing the vector operation. The shuffle mask is
+ // undefined except for 1 lane that is being translated to the remaining
+ // extraction lane. Therefore, it is a splat shuffle. Ex:
+ // ShufMask = { undef, undef, 0, undef }
+ // TODO: The cost model has an option for a "broadcast" shuffle
+ // (splat-from-element-0), but no option for a more general splat.
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+
+ // Aggressively form a vector op if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
+ return OldCost < NewCost;
+}
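
isExtractExtractCheap, restored above, boils down to comparing the cost of two extracts plus one scalar op against one vector op plus the cheaper extract, with extracts that have extra uses also charged to the new sequence. A simplified arithmetic sketch of that decision follows; the integer costs are invented stand-ins for the TargetTransformInfo queries, not real cost-model values.

// Simplified version of the cost comparison in isExtractExtractCheap.
#include <algorithm>
#include <cstdio>

struct Costs {
  int ScalarOp, VectorOp; // scalar vs. vector flavor of the same opcode
  int Extract0, Extract1; // cost of each extractelement
};

// Returns true if keeping the scalar form (two extracts + scalar op) is
// cheaper, i.e. the transform to "vector op + extract" should be skipped.
static bool extractExtractIsCheap(const Costs &C, bool Ext0HasOneUse,
                                  bool Ext1HasOneUse) {
  int CheapExtract = std::min(C.Extract0, C.Extract1);
  int OldCost = C.Extract0 + C.Extract1 + C.ScalarOp;
  // Extracts with extra uses survive the transform, so their cost is also
  // charged to the new sequence.
  int NewCost = C.VectorOp + CheapExtract +
                (Ext0HasOneUse ? 0 : C.Extract0) +
                (Ext1HasOneUse ? 0 : C.Extract1);
  // Ties go to the vector form: it may enable further optimization, and
  // codegen can scalarize it back if it turns out unprofitable.
  return OldCost < NewCost;
}

int main() {
  Costs C{/*ScalarOp=*/1, /*VectorOp=*/1, /*Extract0=*/2, /*Extract1=*/2};
  std::printf("%d\n", extractExtractIsCheap(C, true, true)); // 0: vectorize
  return 0;
}
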
+
+/// Create a shuffle that translates (shifts) 1 element from the input vector
+/// to a new element location.
+static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
+ unsigned NewIndex, IRBuilder<> &Builder) {
+ // The shuffle mask is undefined except for 1 lane that is being translated
+ // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
+ // ShufMask = { 2, undef, undef, undef }
+ auto *VecTy = cast<FixedVectorType>(Vec->getType());
+ SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
+ ShufMask[NewIndex] = OldIndex;
return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
-}
-
-/// Given an extract element instruction with constant index operand, shuffle
-/// the source vector (shift the scalar element) to a NewIndex for extraction.
-/// Return null if the input can be constant folded, so that we are not creating
-/// unnecessary instructions.
-static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
- unsigned NewIndex,
- IRBuilder<> &Builder) {
- // If the extract can be constant-folded, this code is unsimplified. Defer
- // to other passes to handle that.
- Value *X = ExtElt->getVectorOperand();
- Value *C = ExtElt->getIndexOperand();
- assert(isa<ConstantInt>(C) && "Expected a constant index operand");
- if (isa<Constant>(X))
- return nullptr;
-
- Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
- NewIndex, Builder);
- return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
-}
-
-/// Try to reduce extract element costs by converting scalar compares to vector
-/// compares followed by extract.
-/// cmp (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1, Instruction &I) {
- assert(isa<CmpInst>(&I) && "Expected a compare");
- assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
- cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
- "Expected matching constant extract indexes");
-
- // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
- ++NumVecCmp;
- CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
- Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
- Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
- Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
- replaceValue(I, *NewExt);
-}
-
-/// Try to reduce extract element costs by converting scalar binops to vector
-/// binops followed by extract.
-/// bo (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1, Instruction &I) {
- assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
- assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
- cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
- "Expected matching constant extract indexes");
-
- // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
- ++NumVecBO;
- Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
- Value *VecBO =
- Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
-
- // All IR flags are safe to back-propagate because any potential poison
- // created in unused vector elements is discarded by the extract.
- if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
- VecBOInst->copyIRFlags(&I);
-
- Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
- replaceValue(I, *NewExt);
-}
-
-/// Match an instruction with extracted vector operands.
-bool VectorCombine::foldExtractExtract(Instruction &I) {
- // It is not safe to transform things like div, urem, etc. because we may
- // create undefined behavior when executing those on unknown vector elements.
- if (!isSafeToSpeculativelyExecute(&I))
- return false;
-
- Instruction *I0, *I1;
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
- if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
- !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
- return false;
-
- Value *V0, *V1;
- uint64_t C0, C1;
- if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
- !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
- V0->getType() != V1->getType())
- return false;
-
- // If the scalar value 'I' is going to be re-inserted into a vector, then try
- // to create an extract to that same element. The extract/insert can be
- // reduced to a "select shuffle".
- // TODO: If we add a larger pattern match that starts from an insert, this
- // probably becomes unnecessary.
- auto *Ext0 = cast<ExtractElementInst>(I0);
- auto *Ext1 = cast<ExtractElementInst>(I1);
- uint64_t InsertIndex = InvalidIndex;
- if (I.hasOneUse())
- match(I.user_back(),
- m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
-
- ExtractElementInst *ExtractToChange;
- if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
- InsertIndex))
- return false;
-
- if (ExtractToChange) {
- unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
- ExtractElementInst *NewExtract =
- translateExtract(ExtractToChange, CheapExtractIdx, Builder);
- if (!NewExtract)
- return false;
- if (ExtractToChange == Ext0)
- Ext0 = NewExtract;
- else
- Ext1 = NewExtract;
- }
-
- if (Pred != CmpInst::BAD_ICMP_PREDICATE)
- foldExtExtCmp(Ext0, Ext1, I);
- else
- foldExtExtBinop(Ext0, Ext1, I);
-
- return true;
-}
-
-/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
-/// destination type followed by shuffle. This can enable further transforms by
-/// moving bitcasts or shuffles together.
-bool VectorCombine::foldBitcastShuf(Instruction &I) {
- Value *V;
- ArrayRef<int> Mask;
- if (!match(&I, m_BitCast(
- m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
- return false;
-
+}
+
+/// Given an extract element instruction with constant index operand, shuffle
+/// the source vector (shift the scalar element) to a NewIndex for extraction.
+/// Return null if the input can be constant folded, so that we are not creating
+/// unnecessary instructions.
+static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
+ unsigned NewIndex,
+ IRBuilder<> &Builder) {
+ // If the extract can be constant-folded, this code is unsimplified. Defer
+ // to other passes to handle that.
+ Value *X = ExtElt->getVectorOperand();
+ Value *C = ExtElt->getIndexOperand();
+ assert(isa<ConstantInt>(C) && "Expected a constant index operand");
+ if (isa<Constant>(X))
+ return nullptr;
+
+ Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
+ NewIndex, Builder);
+ return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
+}
+
+/// Try to reduce extract element costs by converting scalar compares to vector
+/// compares followed by extract.
+/// cmp (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<CmpInst>(&I) && "Expected a compare");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
+ ++NumVecCmp;
+ CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
+ Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+/// bo (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
+ ++NumVecBO;
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecBO =
+ Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
+
+ // All IR flags are safe to back-propagate because any potential poison
+ // created in unused vector elements is discarded by the extract.
+ if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
+ VecBOInst->copyIRFlags(&I);
+
+ Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Match an instruction with extracted vector operands.
+bool VectorCombine::foldExtractExtract(Instruction &I) {
+ // It is not safe to transform things like div, urem, etc. because we may
+ // create undefined behavior when executing those on unknown vector elements.
+ if (!isSafeToSpeculativelyExecute(&I))
+ return false;
+
+ Instruction *I0, *I1;
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
+ !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
+ return false;
+
+ Value *V0, *V1;
+ uint64_t C0, C1;
+ if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
+ !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
+ V0->getType() != V1->getType())
+ return false;
+
+ // If the scalar value 'I' is going to be re-inserted into a vector, then try
+ // to create an extract to that same element. The extract/insert can be
+ // reduced to a "select shuffle".
+ // TODO: If we add a larger pattern match that starts from an insert, this
+ // probably becomes unnecessary.
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ uint64_t InsertIndex = InvalidIndex;
+ if (I.hasOneUse())
+ match(I.user_back(),
+ m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
+
+ ExtractElementInst *ExtractToChange;
+ if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
+ InsertIndex))
+ return false;
+
+ if (ExtractToChange) {
+ unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
+ ExtractElementInst *NewExtract =
+ translateExtract(ExtractToChange, CheapExtractIdx, Builder);
+ if (!NewExtract)
+ return false;
+ if (ExtractToChange == Ext0)
+ Ext0 = NewExtract;
+ else
+ Ext1 = NewExtract;
+ }
+
+ if (Pred != CmpInst::BAD_ICMP_PREDICATE)
+ foldExtExtCmp(Ext0, Ext1, I);
+ else
+ foldExtExtBinop(Ext0, Ext1, I);
+
+ return true;
+}
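
The fold above rewrites "opcode (extelt V0, C), (extelt V1, C)" into "extelt (opcode V0, V1), C". It is sound because extracting a lane and then applying the scalar operation equals applying the operation lane-wise and extracting the same lane; a small demonstration of that identity on plain arrays (illustrative only) follows.

// Demonstrates, on plain arrays, the identity exploited by foldExtractExtract:
//   add (extract V0, C), (extract V1, C) == extract (add V0, V1), C
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> V0{1, 2, 3, 4}, V1{10, 20, 30, 40};
  unsigned C = 2; // constant extract index

  // Scalar form: extract both lanes, then operate.
  int Scalar = V0[C] + V1[C];

  // Vector form: operate lane-wise, then extract the same lane.
  std::array<int, 4> Sum{};
  for (unsigned I = 0; I != 4; ++I)
    Sum[I] = V0[I] + V1[I];
  int FromVector = Sum[C];

  assert(Scalar == FromVector); // 33 either way
  return 0;
}
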
+
+/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
+/// destination type followed by shuffle. This can enable further transforms by
+/// moving bitcasts or shuffles together.
+bool VectorCombine::foldBitcastShuf(Instruction &I) {
+ Value *V;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_BitCast(
+ m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
+ return false;
+
// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
// scalable type is unknown; Second, we cannot reason if the narrowed shuffle
// mask for scalable type is a splat or not.
// 2) Disallow non-vector casts and length-changing shuffles.
- // TODO: We could allow any shuffle.
+ // TODO: We could allow any shuffle.
auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
- return false;
-
- // The new shuffle must not cost more than the old shuffle. The bitcast is
- // moved ahead of the shuffle, so assume that it has the same cost as before.
+ return false;
+
+ // The new shuffle must not cost more than the old shuffle. The bitcast is
+ // moved ahead of the shuffle, so assume that it has the same cost as before.
InstructionCost DestCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy);
InstructionCost SrcCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy);
if (DestCost > SrcCost || !DestCost.isValid())
- return false;
-
- unsigned DestNumElts = DestTy->getNumElements();
- unsigned SrcNumElts = SrcTy->getNumElements();
- SmallVector<int, 16> NewMask;
- if (SrcNumElts <= DestNumElts) {
- // The bitcast is from wide to narrow/equal elements. The shuffle mask can
- // always be expanded to the equivalent form choosing narrower elements.
- assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
- unsigned ScaleFactor = DestNumElts / SrcNumElts;
- narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
- } else {
- // The bitcast is from narrow elements to wide elements. The shuffle mask
- // must choose consecutive elements to allow casting first.
- assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
- unsigned ScaleFactor = SrcNumElts / DestNumElts;
- if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
- return false;
- }
- // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
- ++NumShufOfBitcast;
- Value *CastV = Builder.CreateBitCast(V, DestTy);
+ return false;
+
+ unsigned DestNumElts = DestTy->getNumElements();
+ unsigned SrcNumElts = SrcTy->getNumElements();
+ SmallVector<int, 16> NewMask;
+ if (SrcNumElts <= DestNumElts) {
+ // The bitcast is from wide to narrow/equal elements. The shuffle mask can
+ // always be expanded to the equivalent form choosing narrower elements.
+ assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = DestNumElts / SrcNumElts;
+ narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
+ } else {
+ // The bitcast is from narrow elements to wide elements. The shuffle mask
+ // must choose consecutive elements to allow casting first.
+ assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = SrcNumElts / DestNumElts;
+ if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
+ return false;
+ }
+ // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+ ++NumShufOfBitcast;
+ Value *CastV = Builder.CreateBitCast(V, DestTy);
Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
- replaceValue(I, *Shuf);
- return true;
-}
-
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
-bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
- Value *Ins0, *Ins1;
- if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
- !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
- return false;
-
- // Do not convert the vector condition of a vector select into a scalar
- // condition. That may cause problems for codegen because of differences in
- // boolean formats and register-file transfers.
- // TODO: Can we account for that in the cost model?
- bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
- if (IsCmp)
- for (User *U : I.users())
- if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
- return false;
-
- // Match against one or both scalar values being inserted into constant
- // vectors:
- // vec_op VecC0, (inselt VecC1, V1, Index)
- // vec_op (inselt VecC0, V0, Index), VecC1
- // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
- // TODO: Deal with mismatched index constants and variable indexes?
- Constant *VecC0 = nullptr, *VecC1 = nullptr;
- Value *V0 = nullptr, *V1 = nullptr;
- uint64_t Index0 = 0, Index1 = 0;
- if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
- m_ConstantInt(Index0))) &&
- !match(Ins0, m_Constant(VecC0)))
- return false;
- if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
- m_ConstantInt(Index1))) &&
- !match(Ins1, m_Constant(VecC1)))
- return false;
-
- bool IsConst0 = !V0;
- bool IsConst1 = !V1;
- if (IsConst0 && IsConst1)
- return false;
- if (!IsConst0 && !IsConst1 && Index0 != Index1)
- return false;
-
- // Bail for single insertion if it is a load.
- // TODO: Handle this once getVectorInstrCost can cost for load/stores.
- auto *I0 = dyn_cast_or_null<Instruction>(V0);
- auto *I1 = dyn_cast_or_null<Instruction>(V1);
- if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
- (IsConst1 && I0 && I0->mayReadFromMemory()))
- return false;
-
- uint64_t Index = IsConst0 ? Index1 : Index0;
- Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
- Type *VecTy = I.getType();
- assert(VecTy->isVectorTy() &&
- (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
- (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
- ScalarTy->isPointerTy()) &&
- "Unexpected types for insert element into binop or cmp");
-
- unsigned Opcode = I.getOpcode();
+ replaceValue(I, *Shuf);
+ return true;
+}
+
+/// Match a vector binop or compare instruction with at least one inserted
+/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ Value *Ins0, *Ins1;
+ if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
+ return false;
+
+ // Do not convert the vector condition of a vector select into a scalar
+ // condition. That may cause problems for codegen because of differences in
+ // boolean formats and register-file transfers.
+ // TODO: Can we account for that in the cost model?
+ bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
+ if (IsCmp)
+ for (User *U : I.users())
+ if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
+ return false;
+
+ // Match against one or both scalar values being inserted into constant
+ // vectors:
+ // vec_op VecC0, (inselt VecC1, V1, Index)
+ // vec_op (inselt VecC0, V0, Index), VecC1
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
+ // TODO: Deal with mismatched index constants and variable indexes?
+ Constant *VecC0 = nullptr, *VecC1 = nullptr;
+ Value *V0 = nullptr, *V1 = nullptr;
+ uint64_t Index0 = 0, Index1 = 0;
+ if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
+ m_ConstantInt(Index0))) &&
+ !match(Ins0, m_Constant(VecC0)))
+ return false;
+ if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
+ m_ConstantInt(Index1))) &&
+ !match(Ins1, m_Constant(VecC1)))
+ return false;
+
+ bool IsConst0 = !V0;
+ bool IsConst1 = !V1;
+ if (IsConst0 && IsConst1)
+ return false;
+ if (!IsConst0 && !IsConst1 && Index0 != Index1)
+ return false;
+
+ // Bail for single insertion if it is a load.
+ // TODO: Handle this once getVectorInstrCost can cost for load/stores.
+ auto *I0 = dyn_cast_or_null<Instruction>(V0);
+ auto *I1 = dyn_cast_or_null<Instruction>(V1);
+ if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
+ (IsConst1 && I0 && I0->mayReadFromMemory()))
+ return false;
+
+ uint64_t Index = IsConst0 ? Index1 : Index0;
+ Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
+ Type *VecTy = I.getType();
+ assert(VecTy->isVectorTy() &&
+ (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
+ (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
+ ScalarTy->isPointerTy()) &&
+ "Unexpected types for insert element into binop or cmp");
+
+ unsigned Opcode = I.getOpcode();
InstructionCost ScalarOpCost, VectorOpCost;
- if (IsCmp) {
- ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
- } else {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
- }
-
- // Get cost estimate for the insert element. This cost will factor into
- // both sequences.
+ if (IsCmp) {
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
+ } else {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ }
+
+ // Get cost estimate for the insert element. This cost will factor into
+ // both sequences.
InstructionCost InsertCost =
- TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
InstructionCost OldCost =
(IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
InstructionCost NewCost = ScalarOpCost + InsertCost +
(IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
(IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
-
- // We want to scalarize unless the vector variant actually has lower cost.
+
+ // We want to scalarize unless the vector variant actually has lower cost.
if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
- // inselt NewVecC, (scalar_op V0, V1), Index
- if (IsCmp)
- ++NumScalarCmp;
- else
- ++NumScalarBO;
-
- // For constant cases, extract the scalar element, this should constant fold.
- if (IsConst0)
- V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
- if (IsConst1)
- V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
-
- Value *Scalar =
- IsCmp ? Builder.CreateCmp(Pred, V0, V1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
-
- Scalar->setName(I.getName() + ".scalar");
-
- // All IR flags are safe to back-propagate. There is no potential for extra
- // poison to be created by the scalar instruction.
- if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
- ScalarInst->copyIRFlags(&I);
-
- // Fold the vector constants in the original vectors into a new base vector.
- Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
- : ConstantExpr::get(Opcode, VecC0, VecC1);
- Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
- replaceValue(I, *Insert);
- return true;
-}
-
-/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
-/// a vector into vector operations followed by extract. Note: The SLP pass
-/// may miss this pattern because of implementation problems.
-bool VectorCombine::foldExtractedCmps(Instruction &I) {
- // We are looking for a scalar binop of booleans.
- // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
- if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
- return false;
-
- // The compare predicates should match, and each compare should have a
- // constant operand.
- // TODO: Relax the one-use constraints.
- Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
- Instruction *I0, *I1;
- Constant *C0, *C1;
- CmpInst::Predicate P0, P1;
- if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
- !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
- P0 != P1)
- return false;
-
- // The compare operands must be extracts of the same vector with constant
- // extract indexes.
- // TODO: Relax the one-use constraints.
- Value *X;
- uint64_t Index0, Index1;
- if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
- !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
- return false;
-
- auto *Ext0 = cast<ExtractElementInst>(I0);
- auto *Ext1 = cast<ExtractElementInst>(I1);
- ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
- if (!ConvertToShuf)
- return false;
-
- // The original scalar pattern is:
- // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
- CmpInst::Predicate Pred = P0;
- unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
- : Instruction::ICmp;
- auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
- if (!VecTy)
- return false;
-
+ return false;
+
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
+ // inselt NewVecC, (scalar_op V0, V1), Index
+ if (IsCmp)
+ ++NumScalarCmp;
+ else
+ ++NumScalarBO;
+
+ // For constant cases, extract the scalar element, this should constant fold.
+ if (IsConst0)
+ V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
+ if (IsConst1)
+ V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
+
+ Value *Scalar =
+ IsCmp ? Builder.CreateCmp(Pred, V0, V1)
+ : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+
+ Scalar->setName(I.getName() + ".scalar");
+
+ // All IR flags are safe to back-propagate. There is no potential for extra
+ // poison to be created by the scalar instruction.
+ if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
+ ScalarInst->copyIRFlags(&I);
+
+ // Fold the vector constants in the original vectors into a new base vector.
+ Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
+ : ConstantExpr::get(Opcode, VecC0, VecC1);
+ Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
+ replaceValue(I, *Insert);
+ return true;
+}
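
scalarizeBinopOrCmp, restored above, replaces a vector binop/cmp whose operands are scalars inserted into constant vectors with the scalar operation plus a single insertelement into the constant-folded base vector. The equivalence it relies on, demonstrated on plain arrays (illustrative only, not the pass's code), is sketched below.

// Demonstrates the rewrite done by scalarizeBinopOrCmp on plain arrays:
//   add (insert VecC0, V0, Index), VecC1
//     == insert (add VecC0, VecC1), (V0 + VecC1[Index]), Index
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> VecC0{1, 2, 3, 4}, VecC1{10, 20, 30, 40};
  int V0 = 100;       // the inserted scalar
  unsigned Index = 1; // constant insert index

  // Original form: materialize the insert, then do the vector add.
  std::array<int, 4> Ins0 = VecC0;
  Ins0[Index] = V0;
  std::array<int, 4> Old{};
  for (unsigned I = 0; I != 4; ++I)
    Old[I] = Ins0[I] + VecC1[I];

  // Scalarized form: constant-fold the vector add of VecC0 and VecC1, then
  // insert the single recomputed lane V0 + VecC1[Index].
  std::array<int, 4> New{};
  for (unsigned I = 0; I != 4; ++I)
    New[I] = VecC0[I] + VecC1[I];
  New[Index] = V0 + VecC1[Index];

  assert(Old == New);
  return 0;
}
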
+
+/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
+/// a vector into vector operations followed by extract. Note: The SLP pass
+/// may miss this pattern because of implementation problems.
+bool VectorCombine::foldExtractedCmps(Instruction &I) {
+ // We are looking for a scalar binop of booleans.
+ // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
+ if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
+ return false;
+
+ // The compare predicates should match, and each compare should have a
+ // constant operand.
+ // TODO: Relax the one-use constraints.
+ Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
+ Instruction *I0, *I1;
+ Constant *C0, *C1;
+ CmpInst::Predicate P0, P1;
+ if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
+ !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
+ P0 != P1)
+ return false;
+
+ // The compare operands must be extracts of the same vector with constant
+ // extract indexes.
+ // TODO: Relax the one-use constraints.
+ Value *X;
+ uint64_t Index0, Index1;
+ if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
+ !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
+ return false;
+
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
+ if (!ConvertToShuf)
+ return false;
+
+ // The original scalar pattern is:
+ // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
+ CmpInst::Predicate Pred = P0;
+ unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
+ : Instruction::ICmp;
+ auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!VecTy)
+ return false;
+
InstructionCost OldCost =
TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
- OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
- OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
- OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
-
- // The proposed vector pattern is:
- // vcmp = cmp Pred X, VecC
- // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
- int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
- int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
- auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
+ OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+ OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+
+ // The proposed vector pattern is:
+ // vcmp = cmp Pred X, VecC
+ // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
+ int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
+ int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
+ auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
- NewCost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
- NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
- NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
-
- // Aggressively form vector ops if the cost is equal because the transform
- // may enable further optimization.
- // Codegen can reverse this transform (scalarize) if it was not profitable.
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
+ NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
+ NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+
+ // Aggressively form vector ops if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // Create a vector constant from the 2 scalar constants.
- SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
- UndefValue::get(VecTy->getElementType()));
- CmpC[Index0] = C0;
- CmpC[Index1] = C1;
- Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
-
- Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
- Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
- VCmp, Shuf);
- Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
- replaceValue(I, *NewExt);
- ++NumVecCmpBO;
- return true;
-}
-
-/// This is the entry point for all transforms. Pass manager differences are
-/// handled in the callers of this function.
-bool VectorCombine::run() {
- if (DisableVectorCombine)
- return false;
-
+ return false;
+
+ // Create a vector constant from the 2 scalar constants.
+ SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
+ UndefValue::get(VecTy->getElementType()));
+ CmpC[Index0] = C0;
+ CmpC[Index1] = C1;
+ Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
+
+ Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
+ Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
+ VCmp, Shuf);
+ Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
+ replaceValue(I, *NewExt);
+ ++NumVecCmpBO;
+ return true;
+}
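
An illustrative sketch of this fold, again as comments; the vector type, predicate, indices, and constants are hypothetical, and which extract index is kept depends on the getShuffleExtract cost query:

    // Before: a scalar 'and' of two compares of extracted lanes of %x.
    //   %e0 = extractelement <4 x i32> %x, i32 0
    //   %e3 = extractelement <4 x i32> %x, i32 3
    //   %c0 = icmp sgt i32 %e0, 42
    //   %c3 = icmp sgt i32 %e3, 7
    //   %r  = and i1 %c0, %c3
    // After (roughly): one vector compare against a constant built from C0/C1,
    // a shuffle that shifts the expensive lane onto the cheap one, a <4 x i1>
    // 'and', and a single extract of the cheap lane.
    //   %vcmp = icmp sgt <4 x i32> %x, <i32 42, i32 undef, i32 undef, i32 7>
    //   %shuf = shufflevector <4 x i1> %vcmp, <4 x i1> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
    //   %vand = and <4 x i1> %vcmp, %shuf
    //   %r    = extractelement <4 x i1> %vand, i32 0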
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+bool VectorCombine::run() {
+ if (DisableVectorCombine)
+ return false;
+
// Don't attempt vectorization if the target does not support vectors.
if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
return false;
- bool MadeChange = false;
- for (BasicBlock &BB : F) {
- // Ignore unreachable basic blocks.
- if (!DT.isReachableFromEntry(&BB))
- continue;
- // Do not delete instructions under here and invalidate the iterator.
- // Walk the block forwards to enable simple iterative chains of transforms.
- // TODO: It could be more efficient to remove dead instructions
- // iteratively in this loop rather than waiting until the end.
- for (Instruction &I : BB) {
- if (isa<DbgInfoIntrinsic>(I))
- continue;
- Builder.SetInsertPoint(&I);
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ // Do not delete instructions under here and invalidate the iterator.
+ // Walk the block forwards to enable simple iterative chains of transforms.
+ // TODO: It could be more efficient to remove dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : BB) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ Builder.SetInsertPoint(&I);
MadeChange |= vectorizeLoadInsert(I);
- MadeChange |= foldExtractExtract(I);
- MadeChange |= foldBitcastShuf(I);
- MadeChange |= scalarizeBinopOrCmp(I);
- MadeChange |= foldExtractedCmps(I);
- }
- }
-
- // We're done with transforms, so remove dead instructions.
- if (MadeChange)
- for (BasicBlock &BB : F)
- SimplifyInstructionsInBlock(&BB);
-
- return MadeChange;
-}
-
-// Pass manager boilerplate below here.
-
-namespace {
-class VectorCombineLegacyPass : public FunctionPass {
-public:
- static char ID;
- VectorCombineLegacyPass() : FunctionPass(ID) {
- initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- VectorCombine Combiner(F, TTI, DT);
- return Combiner.run();
- }
-};
-} // namespace
-
-char VectorCombineLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
- "Optimize scalar/vector ops", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
- "Optimize scalar/vector ops", false, false)
-Pass *llvm::createVectorCombinePass() {
- return new VectorCombineLegacyPass();
-}
-
-PreservedAnalyses VectorCombinePass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- VectorCombine Combiner(F, TTI, DT);
- if (!Combiner.run())
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- PA.preserve<AAManager>();
- PA.preserve<BasicAA>();
- return PA;
-}
+ MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldBitcastShuf(I);
+ MadeChange |= scalarizeBinopOrCmp(I);
+ MadeChange |= foldExtractedCmps(I);
+ }
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+// Pass manager boilerplate below here.
+
+namespace {
+class VectorCombineLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ VectorCombineLegacyPass() : FunctionPass(ID) {
+ initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ VectorCombine Combiner(F, TTI, DT);
+ return Combiner.run();
+ }
+};
+} // namespace
+
+char VectorCombineLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
+ "Optimize scalar/vector ops", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
+ "Optimize scalar/vector ops", false, false)
+Pass *llvm::createVectorCombinePass() {
+ return new VectorCombineLegacyPass();
+}
+
+PreservedAnalyses VectorCombinePass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ VectorCombine Combiner(F, TTI, DT);
+ if (!Combiner.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ return PA;
+}
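
For reference, a minimal sketch of invoking this pass from C++ through the new pass manager; it assumes `F` is a Function and `FAM` is a FunctionAnalysisManager already populated with the analyses queried in run() above (TargetIRAnalysis, DominatorTreeAnalysis):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Vectorize/VectorCombine.h"

    // Sketch only: returns true if VectorCombine reported a change to F.
    static bool runVectorCombineOnce(llvm::Function &F,
                                     llvm::FunctionAnalysisManager &FAM) {
      llvm::FunctionPassManager FPM;
      FPM.addPass(llvm::VectorCombinePass());
      return !FPM.run(F, FAM).areAllPreserved();
    }

From the command line, the legacy wrapper registered above is exposed as -vector-combine; under the new pass manager the same transform should be reachable as -passes=vector-combine, though that registration is not part of this diff.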
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp
index 7d3314b0d2..0296a995ad 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,42 +1,42 @@
-//===-- Vectorize.cpp -----------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
-// implements several vectorization transformations over the LLVM intermediate
-// representation, including the C bindings for that library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/Vectorize.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-/// Initialize all passes linked into the Vectorization library.
-void llvm::initializeVectorization(PassRegistry &Registry) {
- initializeLoopVectorizePass(Registry);
- initializeSLPVectorizerPass(Registry);
- initializeLoadStoreVectorizerLegacyPassPass(Registry);
- initializeVectorCombineLegacyPassPass(Registry);
-}
-
-void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
- initializeVectorization(*unwrap(R));
-}
-
-void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopVectorizePass());
-}
-
-void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSLPVectorizerPass());
-}
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+/// Initialize all passes linked into the Vectorization library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+ initializeLoopVectorizePass(Registry);
+ initializeSLPVectorizerPass(Registry);
+ initializeLoadStoreVectorizerLegacyPassPass(Registry);
+ initializeVectorCombineLegacyPassPass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+ initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopVectorizePass());
+}
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSLPVectorizerPass());
+}
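
A minimal usage sketch for the C bindings above; `M` is a hypothetical LLVMModuleRef, and the legacy pass manager C API from llvm-c/Core.h is assumed:

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Vectorize.h"

    // Sketch: schedule the vectorizers on a module via the legacy C API.
    static void vectorizeModule(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddLoopVectorizePass(PM); // declared in llvm-c/Transforms/Vectorize.h
      LLVMAddSLPVectorizePass(PM);
      LLVMRunPassManager(PM, M);    // runs all scheduled passes over M
      LLVMDisposePassManager(PM);
    }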
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make
index df7cc36ebe..a68c667bde 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make
@@ -1,46 +1,46 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Vectorize
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- LoadStoreVectorizer.cpp
- LoopVectorizationLegality.cpp
- LoopVectorize.cpp
- SLPVectorizer.cpp
- VPlan.cpp
- VPlanHCFGBuilder.cpp
- VPlanPredicator.cpp
- VPlanSLP.cpp
- VPlanTransforms.cpp
- VPlanVerifier.cpp
- VectorCombine.cpp
- Vectorize.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ LoadStoreVectorizer.cpp
+ LoopVectorizationLegality.cpp
+ LoopVectorize.cpp
+ SLPVectorizer.cpp
+ VPlan.cpp
+ VPlanHCFGBuilder.cpp
+ VPlanPredicator.cpp
+ VPlanSLP.cpp
+ VPlanTransforms.cpp
+ VPlanVerifier.cpp
+ VectorCombine.cpp
+ Vectorize.cpp
+)
+
+END()
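
For context, a consumer of this library inside the same build system would declare the dependency with the PEERDIR convention used above; the path below mirrors this module and is shown only as an illustration:

    PEERDIR(
        contrib/libs/llvm12/lib/Transforms/Vectorize
    )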